1444 files changed, 761356 insertions, 0 deletions
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysis.cpp b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp
new file mode 100644
index 000000000000..210b80ab63ef
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -0,0 +1,557 @@
+//===- AliasAnalysis.cpp - Generic Alias Analysis Interface Implementation -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the generic AliasAnalysis interface which is used as the
+// common interface used by all clients and implementations of alias analysis.
+//
+// This file also implements the default version of the AliasAnalysis interface
+// that is to be used when no other implementation is specified.  This does some
+// simple tests that detect obvious cases: two different global pointers cannot
+// alias, a global cannot alias a malloc, two different mallocs cannot alias,
+// etc.
+//
+// This alias analysis implementation really isn't very good for anything, but
+// it is very fast, and makes a nice clean default implementation.  Because it
+// handles lots of little corner cases, other, more complex, alias analysis
+// implementations may choose to rely on this pass to resolve these simple and
+// easy cases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+using namespace llvm;
+
+// Register the AliasAnalysis interface, providing a nice name to refer to.
+INITIALIZE_ANALYSIS_GROUP(AliasAnalysis, "Alias Analysis", NoAA)
+char AliasAnalysis::ID = 0;
+
+//===----------------------------------------------------------------------===//
+// Default chaining methods
+//===----------------------------------------------------------------------===//
+
+AliasAnalysis::AliasResult
+AliasAnalysis::alias(const Location &LocA, const Location &LocB) {
+  assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+  return AA->alias(LocA, LocB);
+}
+
+bool AliasAnalysis::pointsToConstantMemory(const Location &Loc,
+                                           bool OrLocal) {
+  assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+  return AA->pointsToConstantMemory(Loc, OrLocal);
+}
+
+void AliasAnalysis::deleteValue(Value *V) {
+  assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+  AA->deleteValue(V);
+}
+
+void AliasAnalysis::copyValue(Value *From, Value *To) {
+  assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+  AA->copyValue(From, To);
+}
+
+void AliasAnalysis::addEscapingUse(Use &U) {
+  assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+  AA->addEscapingUse(U);
+}
+
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(ImmutableCallSite CS,
+                             const Location &Loc) {
+  assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+
+  ModRefBehavior MRB = getModRefBehavior(CS);
+  if (MRB == DoesNotAccessMemory)
+    return NoModRef;
+
+  ModRefResult Mask = ModRef;
+  if (onlyReadsMemory(MRB))
+    Mask = Ref;
+
+  if (onlyAccessesArgPointees(MRB)) {
+    bool doesAlias = false;
+    if (doesAccessArgPointees(MRB)) {
+      MDNode *CSTag = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa);
+      for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+           AI != AE; ++AI) {
+        const Value *Arg = *AI;
+        if (!Arg->getType()->isPointerTy())
+          continue;
+        Location CSLoc(Arg, UnknownSize, CSTag);
+        if (!isNoAlias(CSLoc, Loc)) {
+          doesAlias = true;
+          break;
+        }
+      }
+    }
+    if (!doesAlias)
+      return NoModRef;
+  }
+
+  // If Loc is a constant memory location, the call definitely could not
+  // modify the memory location.
+  if ((Mask & Mod) && pointsToConstantMemory(Loc))
+    Mask = ModRefResult(Mask & ~Mod);
+
+  // If this is the end of the chain, don't forward.
+  if (!AA) return Mask;
+
+  // Otherwise, fall back to the next AA in the chain. But we can merge
+  // in any mask we've managed to compute.
+  return ModRefResult(AA->getModRefInfo(CS, Loc) & Mask);
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(ImmutableCallSite CS1, ImmutableCallSite CS2) {
+  assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+
+  // If CS1 or CS2 are readnone, they don't interact.
+  ModRefBehavior CS1B = getModRefBehavior(CS1);
+  if (CS1B == DoesNotAccessMemory) return NoModRef;
+
+  ModRefBehavior CS2B = getModRefBehavior(CS2);
+  if (CS2B == DoesNotAccessMemory) return NoModRef;
+
+  // If they both only read from memory, there is no dependence.
+  if (onlyReadsMemory(CS1B) && onlyReadsMemory(CS2B))
+    return NoModRef;
+
+  AliasAnalysis::ModRefResult Mask = ModRef;
+
+  // If CS1 only reads memory, the only dependence on CS2 can be
+  // from CS1 reading memory written by CS2.
+  if (onlyReadsMemory(CS1B))
+    Mask = ModRefResult(Mask & Ref);
+
+  // If CS2 only access memory through arguments, accumulate the mod/ref
+  // information from CS1's references to the memory referenced by
+  // CS2's arguments.
+  if (onlyAccessesArgPointees(CS2B)) {
+    AliasAnalysis::ModRefResult R = NoModRef;
+    if (doesAccessArgPointees(CS2B)) {
+      MDNode *CS2Tag = CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa);
+      for (ImmutableCallSite::arg_iterator
+           I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) {
+        const Value *Arg = *I;
+        if (!Arg->getType()->isPointerTy())
+          continue;
+        Location CS2Loc(Arg, UnknownSize, CS2Tag);
+        R = ModRefResult((R | getModRefInfo(CS1, CS2Loc)) & Mask);
+        if (R == Mask)
+          break;
+      }
+    }
+    return R;
+  }
+
+  // If CS1 only accesses memory through arguments, check if CS2 references
+  // any of the memory referenced by CS1's arguments. If not, return NoModRef.
+  if (onlyAccessesArgPointees(CS1B)) {
+    AliasAnalysis::ModRefResult R = NoModRef;
+    if (doesAccessArgPointees(CS1B)) {
+      MDNode *CS1Tag = CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa);
+      for (ImmutableCallSite::arg_iterator
+           I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) {
+        const Value *Arg = *I;
+        if (!Arg->getType()->isPointerTy())
+          continue;
+        Location CS1Loc(Arg, UnknownSize, CS1Tag);
+        if (getModRefInfo(CS2, CS1Loc) != NoModRef) {
+          R = Mask;
+          break;
+        }
+      }
+    }
+    if (R == NoModRef)
+      return R;
+  }
+
+  // If this is the end of the chain, don't forward.
+  if (!AA) return Mask;
+
+  // Otherwise, fall back to the next AA in the chain. But we can merge
+  // in any mask we've managed to compute.
+  return ModRefResult(AA->getModRefInfo(CS1, CS2) & Mask);
+}
+
+AliasAnalysis::ModRefBehavior
+AliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
+  assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+
+  ModRefBehavior Min = UnknownModRefBehavior;
+
+  // Call back into the alias analysis with the other form of getModRefBehavior
+  // to see if it can give a better response.
+  if (const Function *F = CS.getCalledFunction())
+    Min = getModRefBehavior(F);
+
+  // If this is the end of the chain, don't forward.
+  if (!AA) return Min;
+
+  // Otherwise, fall back to the next AA in the chain. But we can merge
+  // in any result we've managed to compute.
+  return ModRefBehavior(AA->getModRefBehavior(CS) & Min);
+}
+
+AliasAnalysis::ModRefBehavior
+AliasAnalysis::getModRefBehavior(const Function *F) {
+  assert(AA && "AA didn't call InitializeAliasAnalysis in its run method!");
+  return AA->getModRefBehavior(F);
+}
+
+//===----------------------------------------------------------------------===//
+// AliasAnalysis non-virtual helper method implementation
+//===----------------------------------------------------------------------===//
+
+AliasAnalysis::Location AliasAnalysis::getLocation(const LoadInst *LI) {
+  return Location(LI->getPointerOperand(),
+                  getTypeStoreSize(LI->getType()),
+                  LI->getMetadata(LLVMContext::MD_tbaa));
+}
+
+AliasAnalysis::Location AliasAnalysis::getLocation(const StoreInst *SI) {
+  return Location(SI->getPointerOperand(),
+                  getTypeStoreSize(SI->getValueOperand()->getType()),
+                  SI->getMetadata(LLVMContext::MD_tbaa));
+}
+
+AliasAnalysis::Location AliasAnalysis::getLocation(const VAArgInst *VI) {
+  return Location(VI->getPointerOperand(),
+                  UnknownSize,
+                  VI->getMetadata(LLVMContext::MD_tbaa));
+}
+
+AliasAnalysis::Location
+AliasAnalysis::getLocation(const AtomicCmpXchgInst *CXI) {
+  return Location(CXI->getPointerOperand(),
+                  getTypeStoreSize(CXI->getCompareOperand()->getType()),
+                  CXI->getMetadata(LLVMContext::MD_tbaa));
+}
+
+AliasAnalysis::Location
+AliasAnalysis::getLocation(const AtomicRMWInst *RMWI) {
+  return Location(RMWI->getPointerOperand(),
+                  getTypeStoreSize(RMWI->getValOperand()->getType()),
+                  RMWI->getMetadata(LLVMContext::MD_tbaa));
+}
+
+AliasAnalysis::Location 
+AliasAnalysis::getLocationForSource(const MemTransferInst *MTI) {
+  uint64_t Size = UnknownSize;
+  if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength()))
+    Size = C->getValue().getZExtValue();
+
+  // memcpy/memmove can have TBAA tags. For memcpy, they apply
+  // to both the source and the destination.
+  MDNode *TBAATag = MTI->getMetadata(LLVMContext::MD_tbaa);
+
+  return Location(MTI->getRawSource(), Size, TBAATag);
+}
+
+AliasAnalysis::Location 
+AliasAnalysis::getLocationForDest(const MemIntrinsic *MTI) {
+  uint64_t Size = UnknownSize;
+  if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength()))
+    Size = C->getValue().getZExtValue();
+
+  // memcpy/memmove can have TBAA tags. For memcpy, they apply
+  // to both the source and the destination.
+  MDNode *TBAATag = MTI->getMetadata(LLVMContext::MD_tbaa);
+  
+  return Location(MTI->getRawDest(), Size, TBAATag);
+}
+
+
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(const LoadInst *L, const Location &Loc) {
+  // Be conservative in the face of volatile/atomic.
+  if (!L->isUnordered())
+    return ModRef;
+
+  // If the load address doesn't alias the given address, it doesn't read
+  // or write the specified memory.
+  if (!alias(getLocation(L), Loc))
+    return NoModRef;
+
+  // Otherwise, a load just reads.
+  return Ref;
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(const StoreInst *S, const Location &Loc) {
+  // Be conservative in the face of volatile/atomic.
+  if (!S->isUnordered())
+    return ModRef;
+
+  // If the store address cannot alias the pointer in question, then the
+  // specified memory cannot be modified by the store.
+  if (!alias(getLocation(S), Loc))
+    return NoModRef;
+
+  // If the pointer is a pointer to constant memory, then it could not have been
+  // modified by this store.
+  if (pointsToConstantMemory(Loc))
+    return NoModRef;
+
+  // Otherwise, a store just writes.
+  return Mod;
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(const VAArgInst *V, const Location &Loc) {
+  // If the va_arg address cannot alias the pointer in question, then the
+  // specified memory cannot be accessed by the va_arg.
+  if (!alias(getLocation(V), Loc))
+    return NoModRef;
+
+  // If the pointer is a pointer to constant memory, then it could not have been
+  // modified by this va_arg.
+  if (pointsToConstantMemory(Loc))
+    return NoModRef;
+
+  // Otherwise, a va_arg reads and writes.
+  return ModRef;
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(const AtomicCmpXchgInst *CX, const Location &Loc) {
+  // Acquire/Release cmpxchg has properties that matter for arbitrary addresses.
+  if (CX->getOrdering() > Monotonic)
+    return ModRef;
+
+  // If the cmpxchg address does not alias the location, it does not access it.
+  if (!alias(getLocation(CX), Loc))
+    return NoModRef;
+
+  return ModRef;
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(const AtomicRMWInst *RMW, const Location &Loc) {
+  // Acquire/Release atomicrmw has properties that matter for arbitrary addresses.
+  if (RMW->getOrdering() > Monotonic)
+    return ModRef;
+
+  // If the atomicrmw address does not alias the location, it does not access it.
+  if (!alias(getLocation(RMW), Loc))
+    return NoModRef;
+
+  return ModRef;
+}
+
+namespace {
+  // Conservatively return true. Return false, if there is a single path
+  // starting from "From" and the path does not reach "To".
+  static bool hasPath(const BasicBlock *From, const BasicBlock *To) {
+    const unsigned MaxCheck = 5;
+    const BasicBlock *Current = From;
+    for (unsigned I = 0; I < MaxCheck; I++) {
+      unsigned NumSuccs = Current->getTerminator()->getNumSuccessors();
+      if (NumSuccs > 1)
+        return true;
+      if (NumSuccs == 0)
+        return false;
+      Current = Current->getTerminator()->getSuccessor(0);
+      if (Current == To)
+        return true;
+    }
+    return true;
+  }
+
+  /// Only find pointer captures which happen before the given instruction. Uses
+  /// the dominator tree to determine whether one instruction is before another.
+  /// Only support the case where the Value is defined in the same basic block
+  /// as the given instruction and the use.
+  struct CapturesBefore : public CaptureTracker {
+    CapturesBefore(const Instruction *I, DominatorTree *DT)
+      : BeforeHere(I), DT(DT), Captured(false) {}
+
+    void tooManyUses() { Captured = true; }
+
+    bool shouldExplore(Use *U) {
+      Instruction *I = cast<Instruction>(U->getUser());
+      BasicBlock *BB = I->getParent();
+      // We explore this usage only if the usage can reach "BeforeHere".
+      // If use is not reachable from entry, there is no need to explore.
+      if (BeforeHere != I && !DT->isReachableFromEntry(BB))
+        return false;
+      // If the value is defined in the same basic block as use and BeforeHere,
+      // there is no need to explore the use if BeforeHere dominates use.
+      // Check whether there is a path from I to BeforeHere.
+      if (BeforeHere != I && DT->dominates(BeforeHere, I) &&
+          !hasPath(BB, BeforeHere->getParent()))
+        return false;
+      return true;
+    }
+
+    bool captured(Use *U) {
+      Instruction *I = cast<Instruction>(U->getUser());
+      BasicBlock *BB = I->getParent();
+      // Same logic as in shouldExplore.
+      if (BeforeHere != I && !DT->isReachableFromEntry(BB))
+        return false;
+      if (BeforeHere != I && DT->dominates(BeforeHere, I) &&
+          !hasPath(BB, BeforeHere->getParent()))
+        return false;
+      Captured = true;
+      return true;
+    }
+
+    const Instruction *BeforeHere;
+    DominatorTree *DT;
+
+    bool Captured;
+  };
+}
+
+// FIXME: this is really just shoring-up a deficiency in alias analysis.
+// BasicAA isn't willing to spend linear time determining whether an alloca
+// was captured before or after this particular call, while we are. However,
+// with a smarter AA in place, this test is just wasting compile time.
+AliasAnalysis::ModRefResult
+AliasAnalysis::callCapturesBefore(const Instruction *I,
+                                  const AliasAnalysis::Location &MemLoc,
+                                  DominatorTree *DT) {
+  if (!DT || !TD) return AliasAnalysis::ModRef;
+
+  const Value *Object = GetUnderlyingObject(MemLoc.Ptr, TD);
+  if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object) ||
+      isa<Constant>(Object))
+    return AliasAnalysis::ModRef;
+
+  ImmutableCallSite CS(I);
+  if (!CS.getInstruction() || CS.getInstruction() == Object)
+    return AliasAnalysis::ModRef;
+
+  CapturesBefore CB(I, DT);
+  llvm::PointerMayBeCaptured(Object, &CB);
+  if (CB.Captured)
+    return AliasAnalysis::ModRef;
+
+  unsigned ArgNo = 0;
+  for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
+       CI != CE; ++CI, ++ArgNo) {
+    // Only look at the no-capture or byval pointer arguments.  If this
+    // pointer were passed to arguments that were neither of these, then it
+    // couldn't be no-capture.
+    if (!(*CI)->getType()->isPointerTy() ||
+        (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo)))
+      continue;
+
+    // If this is a no-capture pointer argument, see if we can tell that it
+    // is impossible to alias the pointer we're checking.  If not, we have to
+    // assume that the call could touch the pointer, even though it doesn't
+    // escape.
+    if (!isNoAlias(AliasAnalysis::Location(*CI),
+                   AliasAnalysis::Location(Object))) {
+      return AliasAnalysis::ModRef;
+    }
+  }
+  return AliasAnalysis::NoModRef;
+}
+
+// AliasAnalysis destructor: DO NOT move this to the header file for
+// AliasAnalysis or else clients of the AliasAnalysis class may not depend on
+// the AliasAnalysis.o file in the current .a file, causing alias analysis
+// support to not be included in the tool correctly!
+//
+AliasAnalysis::~AliasAnalysis() {}
+
+/// InitializeAliasAnalysis - Subclasses must call this method to initialize the
+/// AliasAnalysis interface before any other methods are called.
+///
+void AliasAnalysis::InitializeAliasAnalysis(Pass *P) {
+  TD = P->getAnalysisIfAvailable<DataLayout>();
+  TLI = P->getAnalysisIfAvailable<TargetLibraryInfo>();
+  AA = &P->getAnalysis<AliasAnalysis>();
+}
+
+// getAnalysisUsage - All alias analysis implementations should invoke this
+// directly (using AliasAnalysis::getAnalysisUsage(AU)).
+void AliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AliasAnalysis>();         // All AA's chain
+}
+
+/// getTypeStoreSize - Return the DataLayout store size for the given type,
+/// if known, or a conservative value otherwise.
+///
+uint64_t AliasAnalysis::getTypeStoreSize(Type *Ty) {
+  return TD ? TD->getTypeStoreSize(Ty) : UnknownSize;
+}
+
+/// canBasicBlockModify - Return true if it is possible for execution of the
+/// specified basic block to modify the value pointed to by Ptr.
+///
+bool AliasAnalysis::canBasicBlockModify(const BasicBlock &BB,
+                                        const Location &Loc) {
+  return canInstructionRangeModify(BB.front(), BB.back(), Loc);
+}
+
+/// canInstructionRangeModify - Return true if it is possible for the execution
+/// of the specified instructions to modify the value pointed to by Ptr.  The
+/// instructions to consider are all of the instructions in the range of [I1,I2]
+/// INCLUSIVE.  I1 and I2 must be in the same basic block.
+///
+bool AliasAnalysis::canInstructionRangeModify(const Instruction &I1,
+                                              const Instruction &I2,
+                                              const Location &Loc) {
+  assert(I1.getParent() == I2.getParent() &&
+         "Instructions not in same basic block!");
+  BasicBlock::const_iterator I = &I1;
+  BasicBlock::const_iterator E = &I2;
+  ++E;  // Convert from inclusive to exclusive range.
+
+  for (; I != E; ++I) // Check every instruction in range
+    if (getModRefInfo(I, Loc) & Mod)
+      return true;
+  return false;
+}
+
+/// isNoAliasCall - Return true if this pointer is returned by a noalias
+/// function.
+bool llvm::isNoAliasCall(const Value *V) {
+  if (isa<CallInst>(V) || isa<InvokeInst>(V))
+    return ImmutableCallSite(cast<Instruction>(V))
+      .paramHasAttr(0, Attribute::NoAlias);
+  return false;
+}
+
+/// isIdentifiedObject - Return true if this pointer refers to a distinct and
+/// identifiable object.  This returns true for:
+///    Global Variables and Functions (but not Global Aliases)
+///    Allocas and Mallocs
+///    ByVal and NoAlias Arguments
+///    NoAlias returns
+///
+bool llvm::isIdentifiedObject(const Value *V) {
+  if (isa<AllocaInst>(V))
+    return true;
+  if (isa<GlobalValue>(V) && !isa<GlobalAlias>(V))
+    return true;
+  if (isNoAliasCall(V))
+    return true;
+  if (const Argument *A = dyn_cast<Argument>(V))
+    return A->hasNoAliasAttr() || A->hasByValAttr();
+  return false;
+}
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysisCounter.cpp b/contrib/llvm/lib/Analysis/AliasAnalysisCounter.cpp
new file mode 100644
index 000000000000..9f4a47c77e03
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/AliasAnalysisCounter.cpp
@@ -0,0 +1,171 @@
+//===- AliasAnalysisCounter.cpp - Alias Analysis Query Counter ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass which can be used to count how many alias queries
+// are being made and how the alias analysis implementation being used responds.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static cl::opt<bool>
+PrintAll("count-aa-print-all-queries", cl::ReallyHidden, cl::init(true));
+static cl::opt<bool>
+PrintAllFailures("count-aa-print-all-failed-queries", cl::ReallyHidden);
+
+namespace {
+  class AliasAnalysisCounter : public ModulePass, public AliasAnalysis {
+    unsigned No, May, Partial, Must;
+    unsigned NoMR, JustRef, JustMod, MR;
+    Module *M;
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    AliasAnalysisCounter() : ModulePass(ID) {
+      initializeAliasAnalysisCounterPass(*PassRegistry::getPassRegistry());
+      No = May = Partial = Must = 0;
+      NoMR = JustRef = JustMod = MR = 0;
+    }
+
+    void printLine(const char *Desc, unsigned Val, unsigned Sum) {
+      errs() <<  "  " << Val << " " << Desc << " responses ("
+             << Val*100/Sum << "%)\n";
+    }
+    ~AliasAnalysisCounter() {
+      unsigned AASum = No+May+Partial+Must;
+      unsigned MRSum = NoMR+JustRef+JustMod+MR;
+      if (AASum + MRSum) { // Print a report if any counted queries occurred...
+        errs() << "\n===== Alias Analysis Counter Report =====\n"
+               << "  Analysis counted:\n"
+               << "  " << AASum << " Total Alias Queries Performed\n";
+        if (AASum) {
+          printLine("no alias",     No, AASum);
+          printLine("may alias",   May, AASum);
+          printLine("partial alias", Partial, AASum);
+          printLine("must alias", Must, AASum);
+          errs() << "  Alias Analysis Counter Summary: " << No*100/AASum << "%/"
+                 << May*100/AASum << "%/"
+                 << Partial*100/AASum << "%/"
+                 << Must*100/AASum<<"%\n\n";
+        }
+
+        errs() << "  " << MRSum    << " Total Mod/Ref Queries Performed\n";
+        if (MRSum) {
+          printLine("no mod/ref",    NoMR, MRSum);
+          printLine("ref",        JustRef, MRSum);
+          printLine("mod",        JustMod, MRSum);
+          printLine("mod/ref",         MR, MRSum);
+          errs() << "  Mod/Ref Analysis Counter Summary: " <<NoMR*100/MRSum
+                 << "%/" << JustRef*100/MRSum << "%/" << JustMod*100/MRSum
+                 << "%/" << MR*100/MRSum <<"%\n\n";
+        }
+      }
+    }
+
+    bool runOnModule(Module &M) {
+      this->M = &M;
+      InitializeAliasAnalysis(this);
+      return false;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AliasAnalysis::getAnalysisUsage(AU);
+      AU.addRequired<AliasAnalysis>();
+      AU.setPreservesAll();
+    }
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+      if (PI == &AliasAnalysis::ID)
+        return (AliasAnalysis*)this;
+      return this;
+    }
+    
+    // FIXME: We could count these too...
+    bool pointsToConstantMemory(const Location &Loc, bool OrLocal) {
+      return getAnalysis<AliasAnalysis>().pointsToConstantMemory(Loc, OrLocal);
+    }
+
+    // Forwarding functions: just delegate to a real AA implementation, counting
+    // the number of responses...
+    AliasResult alias(const Location &LocA, const Location &LocB);
+
+    ModRefResult getModRefInfo(ImmutableCallSite CS,
+                               const Location &Loc);
+    ModRefResult getModRefInfo(ImmutableCallSite CS1,
+                               ImmutableCallSite CS2) {
+      return AliasAnalysis::getModRefInfo(CS1,CS2);
+    }
+  };
+}
+
+char AliasAnalysisCounter::ID = 0;
+INITIALIZE_AG_PASS(AliasAnalysisCounter, AliasAnalysis, "count-aa",
+                   "Count Alias Analysis Query Responses", false, true, false)
+
+ModulePass *llvm::createAliasAnalysisCounterPass() {
+  return new AliasAnalysisCounter();
+}
+
+AliasAnalysis::AliasResult
+AliasAnalysisCounter::alias(const Location &LocA, const Location &LocB) {
+  AliasResult R = getAnalysis<AliasAnalysis>().alias(LocA, LocB);
+
+  const char *AliasString = 0;
+  switch (R) {
+  case NoAlias:   No++;   AliasString = "No alias"; break;
+  case MayAlias:  May++;  AliasString = "May alias"; break;
+  case PartialAlias: Partial++; AliasString = "Partial alias"; break;
+  case MustAlias: Must++; AliasString = "Must alias"; break;
+  }
+
+  if (PrintAll || (PrintAllFailures && R == MayAlias)) {
+    errs() << AliasString << ":\t";
+    errs() << "[" << LocA.Size << "B] ";
+    WriteAsOperand(errs(), LocA.Ptr, true, M);
+    errs() << ", ";
+    errs() << "[" << LocB.Size << "B] ";
+    WriteAsOperand(errs(), LocB.Ptr, true, M);
+    errs() << "\n";
+  }
+
+  return R;
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysisCounter::getModRefInfo(ImmutableCallSite CS,
+                                    const Location &Loc) {
+  ModRefResult R = getAnalysis<AliasAnalysis>().getModRefInfo(CS, Loc);
+
+  const char *MRString = 0;
+  switch (R) {
+  case NoModRef: NoMR++;     MRString = "NoModRef"; break;
+  case Ref:      JustRef++;  MRString = "JustRef"; break;
+  case Mod:      JustMod++;  MRString = "JustMod"; break;
+  case ModRef:   MR++;       MRString = "ModRef"; break;
+  }
+
+  if (PrintAll || (PrintAllFailures && R == ModRef)) {
+    errs() << MRString << ":  Ptr: ";
+    errs() << "[" << Loc.Size << "B] ";
+    WriteAsOperand(errs(), Loc.Ptr, true, M);
+    errs() << "\t<->" << *CS.getInstruction() << '\n';
+  }
+  return R;
+}
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
new file mode 100644
index 000000000000..a571463dfe12
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -0,0 +1,372 @@
+//===- AliasAnalysisEvaluator.cpp - Alias Analysis Accuracy Evaluator -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple N^2 alias analysis accuracy evaluator.
+// Basically, for each function in the program, it simply queries to see how the
+// alias analysis implementation answers alias queries between each pair of
+// pointers in the function.
+//
+// This is inspired and adapted from code by: Naveen Neelakantam, Francesco
+// Spadini, and Wojciech Stryjewski.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static cl::opt<bool> PrintAll("print-all-alias-modref-info", cl::ReallyHidden);
+
+static cl::opt<bool> PrintNoAlias("print-no-aliases", cl::ReallyHidden);
+static cl::opt<bool> PrintMayAlias("print-may-aliases", cl::ReallyHidden);
+static cl::opt<bool> PrintPartialAlias("print-partial-aliases", cl::ReallyHidden);
+static cl::opt<bool> PrintMustAlias("print-must-aliases", cl::ReallyHidden);
+
+static cl::opt<bool> PrintNoModRef("print-no-modref", cl::ReallyHidden);
+static cl::opt<bool> PrintMod("print-mod", cl::ReallyHidden);
+static cl::opt<bool> PrintRef("print-ref", cl::ReallyHidden);
+static cl::opt<bool> PrintModRef("print-modref", cl::ReallyHidden);
+
+static cl::opt<bool> EvalTBAA("evaluate-tbaa", cl::ReallyHidden);
+
+namespace {
+  class AAEval : public FunctionPass {
+    unsigned NoAlias, MayAlias, PartialAlias, MustAlias;
+    unsigned NoModRef, Mod, Ref, ModRef;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    AAEval() : FunctionPass(ID) {
+      initializeAAEvalPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<AliasAnalysis>();
+      AU.setPreservesAll();
+    }
+
+    bool doInitialization(Module &M) {
+      NoAlias = MayAlias = PartialAlias = MustAlias = 0;
+      NoModRef = Mod = Ref = ModRef = 0;
+
+      if (PrintAll) {
+        PrintNoAlias = PrintMayAlias = true;
+        PrintPartialAlias = PrintMustAlias = true;
+        PrintNoModRef = PrintMod = PrintRef = PrintModRef = true;
+      }
+      return false;
+    }
+
+    bool runOnFunction(Function &F);
+    bool doFinalization(Module &M);
+  };
+}
+
+char AAEval::ID = 0;
+INITIALIZE_PASS_BEGIN(AAEval, "aa-eval",
+                "Exhaustive Alias Analysis Precision Evaluator", false, true)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(AAEval, "aa-eval",
+                "Exhaustive Alias Analysis Precision Evaluator", false, true)
+
+FunctionPass *llvm::createAAEvalPass() { return new AAEval(); }
+
+static void PrintResults(const char *Msg, bool P, const Value *V1,
+                         const Value *V2, const Module *M) {
+  if (P) {
+    std::string o1, o2;
+    {
+      raw_string_ostream os1(o1), os2(o2);
+      WriteAsOperand(os1, V1, true, M);
+      WriteAsOperand(os2, V2, true, M);
+    }
+    
+    if (o2 < o1)
+      std::swap(o1, o2);
+    errs() << "  " << Msg << ":\t"
+           << o1 << ", "
+           << o2 << "\n";
+  }
+}
+
+static inline void
+PrintModRefResults(const char *Msg, bool P, Instruction *I, Value *Ptr,
+                   Module *M) {
+  if (P) {
+    errs() << "  " << Msg << ":  Ptr: ";
+    WriteAsOperand(errs(), Ptr, true, M);
+    errs() << "\t<->" << *I << '\n';
+  }
+}
+
+static inline void
+PrintModRefResults(const char *Msg, bool P, CallSite CSA, CallSite CSB,
+                   Module *M) {
+  if (P) {
+    errs() << "  " << Msg << ": " << *CSA.getInstruction()
+           << " <-> " << *CSB.getInstruction() << '\n';
+  }
+}
+
+static inline void
+PrintLoadStoreResults(const char *Msg, bool P, const Value *V1,
+                      const Value *V2, const Module *M) {
+  if (P) {
+    errs() << "  " << Msg << ": " << *V1
+           << " <-> " << *V2 << '\n';
+  }
+}
+
+static inline bool isInterestingPointer(Value *V) {
+  return V->getType()->isPointerTy()
+      && !isa<ConstantPointerNull>(V);
+}
+
+bool AAEval::runOnFunction(Function &F) {
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  SetVector<Value *> Pointers;
+  SetVector<CallSite> CallSites;
+  SetVector<Value *> Loads;
+  SetVector<Value *> Stores;
+
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+    if (I->getType()->isPointerTy())    // Add all pointer arguments.
+      Pointers.insert(I);
+
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+    if (I->getType()->isPointerTy()) // Add all pointer instructions.
+      Pointers.insert(&*I);
+    if (EvalTBAA && isa<LoadInst>(&*I))
+      Loads.insert(&*I);
+    if (EvalTBAA && isa<StoreInst>(&*I))
+      Stores.insert(&*I);
+    Instruction &Inst = *I;
+    if (CallSite CS = cast<Value>(&Inst)) {
+      Value *Callee = CS.getCalledValue();
+      // Skip actual functions for direct function calls.
+      if (!isa<Function>(Callee) && isInterestingPointer(Callee))
+        Pointers.insert(Callee);
+      // Consider formals.
+      for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+           AI != AE; ++AI)
+        if (isInterestingPointer(*AI))
+          Pointers.insert(*AI);
+      CallSites.insert(CS);
+    } else {
+      // Consider all operands.
+      for (Instruction::op_iterator OI = Inst.op_begin(), OE = Inst.op_end();
+           OI != OE; ++OI)
+        if (isInterestingPointer(*OI))
+          Pointers.insert(*OI);
+    }
+  }
+
+  if (PrintNoAlias || PrintMayAlias || PrintPartialAlias || PrintMustAlias ||
+      PrintNoModRef || PrintMod || PrintRef || PrintModRef)
+    errs() << "Function: " << F.getName() << ": " << Pointers.size()
+           << " pointers, " << CallSites.size() << " call sites\n";
+
+  // iterate over the worklist, and run the full (n^2)/2 disambiguations
+  for (SetVector<Value *>::iterator I1 = Pointers.begin(), E = Pointers.end();
+       I1 != E; ++I1) {
+    uint64_t I1Size = AliasAnalysis::UnknownSize;
+    Type *I1ElTy = cast<PointerType>((*I1)->getType())->getElementType();
+    if (I1ElTy->isSized()) I1Size = AA.getTypeStoreSize(I1ElTy);
+
+    for (SetVector<Value *>::iterator I2 = Pointers.begin(); I2 != I1; ++I2) {
+      uint64_t I2Size = AliasAnalysis::UnknownSize;
+      Type *I2ElTy =cast<PointerType>((*I2)->getType())->getElementType();
+      if (I2ElTy->isSized()) I2Size = AA.getTypeStoreSize(I2ElTy);
+
+      switch (AA.alias(*I1, I1Size, *I2, I2Size)) {
+      case AliasAnalysis::NoAlias:
+        PrintResults("NoAlias", PrintNoAlias, *I1, *I2, F.getParent());
+        ++NoAlias; break;
+      case AliasAnalysis::MayAlias:
+        PrintResults("MayAlias", PrintMayAlias, *I1, *I2, F.getParent());
+        ++MayAlias; break;
+      case AliasAnalysis::PartialAlias:
+        PrintResults("PartialAlias", PrintPartialAlias, *I1, *I2,
+                     F.getParent());
+        ++PartialAlias; break;
+      case AliasAnalysis::MustAlias:
+        PrintResults("MustAlias", PrintMustAlias, *I1, *I2, F.getParent());
+        ++MustAlias; break;
+      }
+    }
+  }
+
+  if (EvalTBAA) {
+    // iterate over all pairs of load, store
+    for (SetVector<Value *>::iterator I1 = Loads.begin(), E = Loads.end();
+         I1 != E; ++I1) {
+      for (SetVector<Value *>::iterator I2 = Stores.begin(), E2 = Stores.end();
+           I2 != E2; ++I2) {
+        switch (AA.alias(AA.getLocation(cast<LoadInst>(*I1)),
+                         AA.getLocation(cast<StoreInst>(*I2)))) {
+        case AliasAnalysis::NoAlias:
+          PrintLoadStoreResults("NoAlias", PrintNoAlias, *I1, *I2,
+                                F.getParent());
+          ++NoAlias; break;
+        case AliasAnalysis::MayAlias:
+          PrintLoadStoreResults("MayAlias", PrintMayAlias, *I1, *I2,
+                                F.getParent());
+          ++MayAlias; break;
+        case AliasAnalysis::PartialAlias:
+          PrintLoadStoreResults("PartialAlias", PrintPartialAlias, *I1, *I2,
+                                F.getParent());
+          ++PartialAlias; break;
+        case AliasAnalysis::MustAlias:
+          PrintLoadStoreResults("MustAlias", PrintMustAlias, *I1, *I2,
+                                F.getParent());
+          ++MustAlias; break;
+        }
+      }
+    }
+
+    // iterate over all pairs of store, store
+    for (SetVector<Value *>::iterator I1 = Stores.begin(), E = Stores.end();
+         I1 != E; ++I1) {
+      for (SetVector<Value *>::iterator I2 = Stores.begin(); I2 != I1; ++I2) {
+        switch (AA.alias(AA.getLocation(cast<StoreInst>(*I1)),
+                         AA.getLocation(cast<StoreInst>(*I2)))) {
+        case AliasAnalysis::NoAlias:
+          PrintLoadStoreResults("NoAlias", PrintNoAlias, *I1, *I2,
+                                F.getParent());
+          ++NoAlias; break;
+        case AliasAnalysis::MayAlias:
+          PrintLoadStoreResults("MayAlias", PrintMayAlias, *I1, *I2,
+                                F.getParent());
+          ++MayAlias; break;
+        case AliasAnalysis::PartialAlias:
+          PrintLoadStoreResults("PartialAlias", PrintPartialAlias, *I1, *I2,
+                                F.getParent());
+          ++PartialAlias; break;
+        case AliasAnalysis::MustAlias:
+          PrintLoadStoreResults("MustAlias", PrintMustAlias, *I1, *I2,
+                                F.getParent());
+          ++MustAlias; break;
+        }
+      }
+    }
+  }
+
+  // Mod/ref alias analysis: compare all pairs of calls and values
+  for (SetVector<CallSite>::iterator C = CallSites.begin(),
+         Ce = CallSites.end(); C != Ce; ++C) {
+    Instruction *I = C->getInstruction();
+
+    for (SetVector<Value *>::iterator V = Pointers.begin(), Ve = Pointers.end();
+         V != Ve; ++V) {
+      uint64_t Size = AliasAnalysis::UnknownSize;
+      Type *ElTy = cast<PointerType>((*V)->getType())->getElementType();
+      if (ElTy->isSized()) Size = AA.getTypeStoreSize(ElTy);
+
+      switch (AA.getModRefInfo(*C, *V, Size)) {
+      case AliasAnalysis::NoModRef:
+        PrintModRefResults("NoModRef", PrintNoModRef, I, *V, F.getParent());
+        ++NoModRef; break;
+      case AliasAnalysis::Mod:
+        PrintModRefResults("Just Mod", PrintMod, I, *V, F.getParent());
+        ++Mod; break;
+      case AliasAnalysis::Ref:
+        PrintModRefResults("Just Ref", PrintRef, I, *V, F.getParent());
+        ++Ref; break;
+      case AliasAnalysis::ModRef:
+        PrintModRefResults("Both ModRef", PrintModRef, I, *V, F.getParent());
+        ++ModRef; break;
+      }
+    }
+  }
+
+  // Mod/ref alias analysis: compare all pairs of calls
+  for (SetVector<CallSite>::iterator C = CallSites.begin(),
+         Ce = CallSites.end(); C != Ce; ++C) {
+    for (SetVector<CallSite>::iterator D = CallSites.begin(); D != Ce; ++D) {
+      if (D == C)
+        continue;
+      switch (AA.getModRefInfo(*C, *D)) {
+      case AliasAnalysis::NoModRef:
+        PrintModRefResults("NoModRef", PrintNoModRef, *C, *D, F.getParent());
+        ++NoModRef; break;
+      case AliasAnalysis::Mod:
+        PrintModRefResults("Just Mod", PrintMod, *C, *D, F.getParent());
+        ++Mod; break;
+      case AliasAnalysis::Ref:
+        PrintModRefResults("Just Ref", PrintRef, *C, *D, F.getParent());
+        ++Ref; break;
+      case AliasAnalysis::ModRef:
+        PrintModRefResults("Both ModRef", PrintModRef, *C, *D, F.getParent());
+        ++ModRef; break;
+      }
+    }
+  }
+
+  return false;
+}
+
+static void PrintPercent(unsigned Num, unsigned Sum) {
+  errs() << "(" << Num*100ULL/Sum << "."
+         << ((Num*1000ULL/Sum) % 10) << "%)\n";
+}
+
+bool AAEval::doFinalization(Module &M) {
+  unsigned AliasSum = NoAlias + MayAlias + PartialAlias + MustAlias;
+  errs() << "===== Alias Analysis Evaluator Report =====\n";
+  if (AliasSum == 0) {
+    errs() << "  Alias Analysis Evaluator Summary: No pointers!\n";
+  } else {
+    errs() << "  " << AliasSum << " Total Alias Queries Performed\n";
+    errs() << "  " << NoAlias << " no alias responses ";
+    PrintPercent(NoAlias, AliasSum);
+    errs() << "  " << MayAlias << " may alias responses ";
+    PrintPercent(MayAlias, AliasSum);
+    errs() << "  " << PartialAlias << " partial alias responses ";
+    PrintPercent(PartialAlias, AliasSum);
+    errs() << "  " << MustAlias << " must alias responses ";
+    PrintPercent(MustAlias, AliasSum);
+    errs() << "  Alias Analysis Evaluator Pointer Alias Summary: "
+           << NoAlias*100/AliasSum  << "%/" << MayAlias*100/AliasSum << "%/"
+           << PartialAlias*100/AliasSum << "%/"
+           << MustAlias*100/AliasSum << "%\n";
+  }
+
+  // Display the summary for mod/ref analysis
+  unsigned ModRefSum = NoModRef + Mod + Ref + ModRef;
+  if (ModRefSum == 0) {
+    errs() << "  Alias Analysis Mod/Ref Evaluator Summary: no mod/ref!\n";
+  } else {
+    errs() << "  " << ModRefSum << " Total ModRef Queries Performed\n";
+    errs() << "  " << NoModRef << " no mod/ref responses ";
+    PrintPercent(NoModRef, ModRefSum);
+    errs() << "  " << Mod << " mod responses ";
+    PrintPercent(Mod, ModRefSum);
+    errs() << "  " << Ref << " ref responses ";
+    PrintPercent(Ref, ModRefSum);
+    errs() << "  " << ModRef << " mod & ref responses ";
+    PrintPercent(ModRef, ModRefSum);
+    errs() << "  Alias Analysis Evaluator Mod/Ref Summary: "
+           << NoModRef*100/ModRefSum  << "%/" << Mod*100/ModRefSum << "%/"
+           << Ref*100/ModRefSum << "%/" << ModRef*100/ModRefSum << "%\n";
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Analysis/AliasDebugger.cpp b/contrib/llvm/lib/Analysis/AliasDebugger.cpp
new file mode 100644
index 000000000000..f6178e36f0a9
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/AliasDebugger.cpp
@@ -0,0 +1,138 @@
+//===- AliasDebugger.cpp - Simple Alias Analysis Use Checker --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This simple pass checks alias analysis users to ensure that if they
+// create a new value, they do not query AA without informing it of the value.
+// It acts as a shim over any other AA pass you want.
+//
+// Yes keeping track of every value in the program is expensive, but this is 
+// a debugging pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include <set>
+using namespace llvm;
+
+namespace {
+  
+  class AliasDebugger : public ModulePass, public AliasAnalysis {
+
+    //What we do is simple.  Keep track of every value the AA could
+    //know about, and verify that queries are one of those.
+    //A query to a value that didn't exist when the AA was created
+    //means someone forgot to update the AA when creating new values
+
+    std::set<const Value*> Vals;
+    
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    AliasDebugger() : ModulePass(ID) {
+      initializeAliasDebuggerPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnModule(Module &M) {
+      InitializeAliasAnalysis(this);                 // set up super class
+
+      for(Module::global_iterator I = M.global_begin(),
+            E = M.global_end(); I != E; ++I) {
+        Vals.insert(&*I);
+        for (User::const_op_iterator OI = I->op_begin(),
+             OE = I->op_end(); OI != OE; ++OI)
+          Vals.insert(*OI);
+      }
+
+      for(Module::iterator I = M.begin(),
+            E = M.end(); I != E; ++I){
+        Vals.insert(&*I);
+        if(!I->isDeclaration()) {
+          for (Function::arg_iterator AI = I->arg_begin(), AE = I->arg_end();
+               AI != AE; ++AI) 
+            Vals.insert(&*AI);     
+          for (Function::const_iterator FI = I->begin(), FE = I->end();
+               FI != FE; ++FI) 
+            for (BasicBlock::const_iterator BI = FI->begin(), BE = FI->end();
+                 BI != BE; ++BI) {
+              Vals.insert(&*BI);
+              for (User::const_op_iterator OI = BI->op_begin(),
+                   OE = BI->op_end(); OI != OE; ++OI)
+                Vals.insert(*OI);
+            }
+        }
+        
+      }
+      return false;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AliasAnalysis::getAnalysisUsage(AU);
+      AU.setPreservesAll();                         // Does not transform code
+    }
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+      if (PI == &AliasAnalysis::ID)
+        return (AliasAnalysis*)this;
+      return this;
+    }
+    
+    //------------------------------------------------
+    // Implement the AliasAnalysis API
+    //
+    AliasResult alias(const Location &LocA, const Location &LocB) {
+      assert(Vals.find(LocA.Ptr) != Vals.end() &&
+             "Never seen value in AA before");
+      assert(Vals.find(LocB.Ptr) != Vals.end() &&
+             "Never seen value in AA before");
+      return AliasAnalysis::alias(LocA, LocB);
+    }
+
+    ModRefResult getModRefInfo(ImmutableCallSite CS,
+                               const Location &Loc) {
+      assert(Vals.find(Loc.Ptr) != Vals.end() && "Never seen value in AA before");
+      return AliasAnalysis::getModRefInfo(CS, Loc);
+    }
+
+    ModRefResult getModRefInfo(ImmutableCallSite CS1,
+                               ImmutableCallSite CS2) {
+      return AliasAnalysis::getModRefInfo(CS1,CS2);
+    }
+    
+    bool pointsToConstantMemory(const Location &Loc, bool OrLocal) {
+      assert(Vals.find(Loc.Ptr) != Vals.end() && "Never seen value in AA before");
+      return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+    }
+
+    virtual void deleteValue(Value *V) {
+      assert(Vals.find(V) != Vals.end() && "Never seen value in AA before");
+      AliasAnalysis::deleteValue(V);
+    }
+    virtual void copyValue(Value *From, Value *To) {
+      Vals.insert(To);
+      AliasAnalysis::copyValue(From, To);
+    }
+
+  };
+}
+
+char AliasDebugger::ID = 0;
+INITIALIZE_AG_PASS(AliasDebugger, AliasAnalysis, "debug-aa",
+                   "AA use debugger", false, true, false)
+
+Pass *llvm::createAliasDebugger() { return new AliasDebugger(); }
+
diff --git a/contrib/llvm/lib/Analysis/AliasSetTracker.cpp b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp
new file mode 100644
index 000000000000..591052671d6e
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp
@@ -0,0 +1,655 @@
+//===- AliasSetTracker.cpp - Alias Sets Tracker implementation-------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AliasSetTracker and AliasSet classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// mergeSetIn - Merge the specified alias set into this alias set.
+///
+void AliasSet::mergeSetIn(AliasSet &AS, AliasSetTracker &AST) {
+  assert(!AS.Forward && "Alias set is already forwarding!");
+  assert(!Forward && "This set is a forwarding set!!");
+
+  // Update the alias and access types of this set...
+  AccessTy |= AS.AccessTy;
+  AliasTy  |= AS.AliasTy;
+  Volatile |= AS.Volatile;
+
+  if (AliasTy == MustAlias) {
+    // Check that these two merged sets really are must aliases.  Since both
+    // used to be must-alias sets, we can just check any pointer from each set
+    // for aliasing.
+    AliasAnalysis &AA = AST.getAliasAnalysis();
+    PointerRec *L = getSomePointer();
+    PointerRec *R = AS.getSomePointer();
+
+    // If the pointers are not a must-alias pair, this set becomes a may alias.
+    if (AA.alias(AliasAnalysis::Location(L->getValue(),
+                                         L->getSize(),
+                                         L->getTBAAInfo()),
+                 AliasAnalysis::Location(R->getValue(),
+                                         R->getSize(),
+                                         R->getTBAAInfo()))
+        != AliasAnalysis::MustAlias)
+      AliasTy = MayAlias;
+  }
+
+  if (UnknownInsts.empty()) {            // Merge call sites...
+    if (!AS.UnknownInsts.empty())
+      std::swap(UnknownInsts, AS.UnknownInsts);
+  } else if (!AS.UnknownInsts.empty()) {
+    UnknownInsts.insert(UnknownInsts.end(), AS.UnknownInsts.begin(), AS.UnknownInsts.end());
+    AS.UnknownInsts.clear();
+  }
+
+  AS.Forward = this;  // Forward across AS now...
+  addRef();           // AS is now pointing to us...
+
+  // Merge the list of constituent pointers...
+  if (AS.PtrList) {
+    *PtrListEnd = AS.PtrList;
+    AS.PtrList->setPrevInList(PtrListEnd);
+    PtrListEnd = AS.PtrListEnd;
+
+    AS.PtrList = 0;
+    AS.PtrListEnd = &AS.PtrList;
+    assert(*AS.PtrListEnd == 0 && "End of list is not null?");
+  }
+}
+
+void AliasSetTracker::removeAliasSet(AliasSet *AS) {
+  if (AliasSet *Fwd = AS->Forward) {
+    Fwd->dropRef(*this);
+    AS->Forward = 0;
+  }
+  AliasSets.erase(AS);
+}
+
+void AliasSet::removeFromTracker(AliasSetTracker &AST) {
+  assert(RefCount == 0 && "Cannot remove non-dead alias set from tracker!");
+  AST.removeAliasSet(this);
+}
+
+void AliasSet::addPointer(AliasSetTracker &AST, PointerRec &Entry,
+                          uint64_t Size, const MDNode *TBAAInfo,
+                          bool KnownMustAlias) {
+  assert(!Entry.hasAliasSet() && "Entry already in set!");
+
+  // Check to see if we have to downgrade to _may_ alias.
+  if (isMustAlias() && !KnownMustAlias)
+    if (PointerRec *P = getSomePointer()) {
+      AliasAnalysis &AA = AST.getAliasAnalysis();
+      AliasAnalysis::AliasResult Result =
+        AA.alias(AliasAnalysis::Location(P->getValue(), P->getSize(),
+                                         P->getTBAAInfo()),
+                 AliasAnalysis::Location(Entry.getValue(), Size, TBAAInfo));
+      if (Result != AliasAnalysis::MustAlias)
+        AliasTy = MayAlias;
+      else                  // First entry of must alias must have maximum size!
+        P->updateSizeAndTBAAInfo(Size, TBAAInfo);
+      assert(Result != AliasAnalysis::NoAlias && "Cannot be part of must set!");
+    }
+
+  Entry.setAliasSet(this);
+  Entry.updateSizeAndTBAAInfo(Size, TBAAInfo);
+
+  // Add it to the end of the list...
+  assert(*PtrListEnd == 0 && "End of list is not null?");
+  *PtrListEnd = &Entry;
+  PtrListEnd = Entry.setPrevInList(PtrListEnd);
+  assert(*PtrListEnd == 0 && "End of list is not null?");
+  addRef();               // Entry points to alias set.
+}
+
+void AliasSet::addUnknownInst(Instruction *I, AliasAnalysis &AA) {
+  UnknownInsts.push_back(I);
+
+  if (!I->mayWriteToMemory()) {
+    AliasTy = MayAlias;
+    AccessTy |= Refs;
+    return;
+  }
+
+  // FIXME: This should use mod/ref information to make this not suck so bad
+  AliasTy = MayAlias;
+  AccessTy = ModRef;
+}
+
+/// aliasesPointer - Return true if the specified pointer "may" (or must)
+/// alias one of the members in the set.
+///
+bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size,
+                              const MDNode *TBAAInfo,
+                              AliasAnalysis &AA) const {
+  if (AliasTy == MustAlias) {
+    assert(UnknownInsts.empty() && "Illegal must alias set!");
+
+    // If this is a set of MustAliases, only check to see if the pointer aliases
+    // SOME value in the set.
+    PointerRec *SomePtr = getSomePointer();
+    assert(SomePtr && "Empty must-alias set??");
+    return AA.alias(AliasAnalysis::Location(SomePtr->getValue(),
+                                            SomePtr->getSize(),
+                                            SomePtr->getTBAAInfo()),
+                    AliasAnalysis::Location(Ptr, Size, TBAAInfo));
+  }
+
+  // If this is a may-alias set, we have to check all of the pointers in the set
+  // to be sure it doesn't alias the set...
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    if (AA.alias(AliasAnalysis::Location(Ptr, Size, TBAAInfo),
+                 AliasAnalysis::Location(I.getPointer(), I.getSize(),
+                                         I.getTBAAInfo())))
+      return true;
+
+  // Check the unknown instructions...
+  if (!UnknownInsts.empty()) {
+    for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i)
+      if (AA.getModRefInfo(UnknownInsts[i],
+                           AliasAnalysis::Location(Ptr, Size, TBAAInfo)) !=
+            AliasAnalysis::NoModRef)
+        return true;
+  }
+
+  return false;
+}
+
+bool AliasSet::aliasesUnknownInst(Instruction *Inst, AliasAnalysis &AA) const {
+  if (!Inst->mayReadOrWriteMemory())
+    return false;
+
+  for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
+    CallSite C1 = getUnknownInst(i), C2 = Inst;
+    if (!C1 || !C2 ||
+        AA.getModRefInfo(C1, C2) != AliasAnalysis::NoModRef ||
+        AA.getModRefInfo(C2, C1) != AliasAnalysis::NoModRef)
+      return true;
+  }
+
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    if (AA.getModRefInfo(Inst, AliasAnalysis::Location(I.getPointer(),
+                                                       I.getSize(),
+                                                       I.getTBAAInfo())) !=
+           AliasAnalysis::NoModRef)
+      return true;
+
+  return false;
+}
+
+void AliasSetTracker::clear() {
+  // Delete all the PointerRec entries.
+  for (PointerMapType::iterator I = PointerMap.begin(), E = PointerMap.end();
+       I != E; ++I)
+    I->second->eraseFromList();
+  
+  PointerMap.clear();
+  
+  // The alias sets should all be clear now.
+  AliasSets.clear();
+}
+
+
+/// findAliasSetForPointer - Given a pointer, find the one alias set to put the
+/// instruction referring to the pointer into.  If there are multiple alias sets
+/// that may alias the pointer, merge them together and return the unified set.
+///
+AliasSet *AliasSetTracker::findAliasSetForPointer(const Value *Ptr,
+                                                  uint64_t Size,
+                                                  const MDNode *TBAAInfo) {
+  AliasSet *FoundSet = 0;
+  for (iterator I = begin(), E = end(); I != E; ++I) {
+    if (I->Forward || !I->aliasesPointer(Ptr, Size, TBAAInfo, AA)) continue;
+    
+    if (FoundSet == 0) {  // If this is the first alias set ptr can go into.
+      FoundSet = I;       // Remember it.
+    } else {              // Otherwise, we must merge the sets.
+      FoundSet->mergeSetIn(*I, *this);     // Merge in contents.
+    }
+  }
+
+  return FoundSet;
+}
+
+/// containsPointer - Return true if the specified location is represented by
+/// this alias set, false otherwise.  This does not modify the AST object or
+/// alias sets.
+bool AliasSetTracker::containsPointer(Value *Ptr, uint64_t Size,
+                                      const MDNode *TBAAInfo) const {
+  for (const_iterator I = begin(), E = end(); I != E; ++I)
+    if (!I->Forward && I->aliasesPointer(Ptr, Size, TBAAInfo, AA))
+      return true;
+  return false;
+}
+
+
+
+AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) {
+  AliasSet *FoundSet = 0;
+  for (iterator I = begin(), E = end(); I != E; ++I) {
+    if (I->Forward || !I->aliasesUnknownInst(Inst, AA))
+      continue;
+    
+    if (FoundSet == 0)        // If this is the first alias set ptr can go into.
+      FoundSet = I;           // Remember it.
+    else if (!I->Forward)     // Otherwise, we must merge the sets.
+      FoundSet->mergeSetIn(*I, *this);     // Merge in contents.
+  }
+  return FoundSet;
+}
+
+
+
+
+/// getAliasSetForPointer - Return the alias set that the specified pointer
+/// lives in.
+AliasSet &AliasSetTracker::getAliasSetForPointer(Value *Pointer, uint64_t Size,
+                                                 const MDNode *TBAAInfo,
+                                                 bool *New) {
+  AliasSet::PointerRec &Entry = getEntryFor(Pointer);
+
+  // Check to see if the pointer is already known.
+  if (Entry.hasAliasSet()) {
+    Entry.updateSizeAndTBAAInfo(Size, TBAAInfo);
+    // Return the set!
+    return *Entry.getAliasSet(*this)->getForwardedTarget(*this);
+  }
+  
+  if (AliasSet *AS = findAliasSetForPointer(Pointer, Size, TBAAInfo)) {
+    // Add it to the alias set it aliases.
+    AS->addPointer(*this, Entry, Size, TBAAInfo);
+    return *AS;
+  }
+  
+  if (New) *New = true;
+  // Otherwise create a new alias set to hold the loaded pointer.
+  AliasSets.push_back(new AliasSet());
+  AliasSets.back().addPointer(*this, Entry, Size, TBAAInfo);
+  return AliasSets.back();
+}
+
+bool AliasSetTracker::add(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo) {
+  bool NewPtr;
+  addPointer(Ptr, Size, TBAAInfo, AliasSet::NoModRef, NewPtr);
+  return NewPtr;
+}
+
+
+bool AliasSetTracker::add(LoadInst *LI) {
+  if (LI->getOrdering() > Monotonic) return addUnknown(LI);
+  AliasSet::AccessType ATy = AliasSet::Refs;
+  if (!LI->isUnordered()) ATy = AliasSet::ModRef;
+  bool NewPtr;
+  AliasSet &AS = addPointer(LI->getOperand(0),
+                            AA.getTypeStoreSize(LI->getType()),
+                            LI->getMetadata(LLVMContext::MD_tbaa),
+                            ATy, NewPtr);
+  if (LI->isVolatile()) AS.setVolatile();
+  return NewPtr;
+}
+
+bool AliasSetTracker::add(StoreInst *SI) {
+  if (SI->getOrdering() > Monotonic) return addUnknown(SI);
+  AliasSet::AccessType ATy = AliasSet::Mods;
+  if (!SI->isUnordered()) ATy = AliasSet::ModRef;
+  bool NewPtr;
+  Value *Val = SI->getOperand(0);
+  AliasSet &AS = addPointer(SI->getOperand(1),
+                            AA.getTypeStoreSize(Val->getType()),
+                            SI->getMetadata(LLVMContext::MD_tbaa),
+                            ATy, NewPtr);
+  if (SI->isVolatile()) AS.setVolatile();
+  return NewPtr;
+}
+
+bool AliasSetTracker::add(VAArgInst *VAAI) {
+  bool NewPtr;
+  addPointer(VAAI->getOperand(0), AliasAnalysis::UnknownSize, 
+             VAAI->getMetadata(LLVMContext::MD_tbaa),
+             AliasSet::ModRef, NewPtr);
+  return NewPtr;
+}
+
+
+bool AliasSetTracker::addUnknown(Instruction *Inst) {
+  if (isa<DbgInfoIntrinsic>(Inst)) 
+    return true; // Ignore DbgInfo Intrinsics.
+  if (!Inst->mayReadOrWriteMemory())
+    return true; // doesn't alias anything
+
+  AliasSet *AS = findAliasSetForUnknownInst(Inst);
+  if (AS) {
+    AS->addUnknownInst(Inst, AA);
+    return false;
+  }
+  AliasSets.push_back(new AliasSet());
+  AS = &AliasSets.back();
+  AS->addUnknownInst(Inst, AA);
+  return true;
+}
+
+bool AliasSetTracker::add(Instruction *I) {
+  // Dispatch to one of the other add methods.
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return add(LI);
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return add(SI);
+  if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
+    return add(VAAI);
+  return addUnknown(I);
+}
+
+void AliasSetTracker::add(BasicBlock &BB) {
+  for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
+    add(I);
+}
+
+void AliasSetTracker::add(const AliasSetTracker &AST) {
+  assert(&AA == &AST.AA &&
+         "Merging AliasSetTracker objects with different Alias Analyses!");
+
+  // Loop over all of the alias sets in AST, adding the pointers contained
+  // therein into the current alias sets.  This can cause alias sets to be
+  // merged together in the current AST.
+  for (const_iterator I = AST.begin(), E = AST.end(); I != E; ++I) {
+    if (I->Forward) continue;   // Ignore forwarding alias sets
+    
+    AliasSet &AS = const_cast<AliasSet&>(*I);
+
+    // If there are any call sites in the alias set, add them to this AST.
+    for (unsigned i = 0, e = AS.UnknownInsts.size(); i != e; ++i)
+      add(AS.UnknownInsts[i]);
+
+    // Loop over all of the pointers in this alias set.
+    bool X;
+    for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
+      AliasSet &NewAS = addPointer(ASI.getPointer(), ASI.getSize(),
+                                   ASI.getTBAAInfo(),
+                                   (AliasSet::AccessType)AS.AccessTy, X);
+      if (AS.isVolatile()) NewAS.setVolatile();
+    }
+  }
+}
+
+/// remove - Remove the specified (potentially non-empty) alias set from the
+/// tracker.
+void AliasSetTracker::remove(AliasSet &AS) {
+  // Drop all call sites.
+  AS.UnknownInsts.clear();
+  
+  // Clear the alias set.
+  unsigned NumRefs = 0;
+  while (!AS.empty()) {
+    AliasSet::PointerRec *P = AS.PtrList;
+
+    Value *ValToRemove = P->getValue();
+    
+    // Unlink and delete entry from the list of values.
+    P->eraseFromList();
+    
+    // Remember how many references need to be dropped.
+    ++NumRefs;
+
+    // Finally, remove the entry.
+    PointerMap.erase(ValToRemove);
+  }
+  
+  // Stop using the alias set, removing it.
+  AS.RefCount -= NumRefs;
+  if (AS.RefCount == 0)
+    AS.removeFromTracker(*this);
+}
+
+bool
+AliasSetTracker::remove(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo) {
+  AliasSet *AS = findAliasSetForPointer(Ptr, Size, TBAAInfo);
+  if (!AS) return false;
+  remove(*AS);
+  return true;
+}
+
+bool AliasSetTracker::remove(LoadInst *LI) {
+  uint64_t Size = AA.getTypeStoreSize(LI->getType());
+  const MDNode *TBAAInfo = LI->getMetadata(LLVMContext::MD_tbaa);
+  AliasSet *AS = findAliasSetForPointer(LI->getOperand(0), Size, TBAAInfo);
+  if (!AS) return false;
+  remove(*AS);
+  return true;
+}
+
+bool AliasSetTracker::remove(StoreInst *SI) {
+  uint64_t Size = AA.getTypeStoreSize(SI->getOperand(0)->getType());
+  const MDNode *TBAAInfo = SI->getMetadata(LLVMContext::MD_tbaa);
+  AliasSet *AS = findAliasSetForPointer(SI->getOperand(1), Size, TBAAInfo);
+  if (!AS) return false;
+  remove(*AS);
+  return true;
+}
+
+bool AliasSetTracker::remove(VAArgInst *VAAI) {
+  AliasSet *AS = findAliasSetForPointer(VAAI->getOperand(0),
+                                        AliasAnalysis::UnknownSize,
+                                        VAAI->getMetadata(LLVMContext::MD_tbaa));
+  if (!AS) return false;
+  remove(*AS);
+  return true;
+}
+
+bool AliasSetTracker::removeUnknown(Instruction *I) {
+  if (!I->mayReadOrWriteMemory())
+    return false; // doesn't alias anything
+
+  AliasSet *AS = findAliasSetForUnknownInst(I);
+  if (!AS) return false;
+  remove(*AS);
+  return true;
+}
+
+bool AliasSetTracker::remove(Instruction *I) {
+  // Dispatch to one of the other remove methods...
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return remove(LI);
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return remove(SI);
+  if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
+    return remove(VAAI);
+  return removeUnknown(I);
+}
+
+
+// deleteValue method - This method is used to remove a pointer value from the
+// AliasSetTracker entirely.  It should be used when an instruction is deleted
+// from the program to update the AST.  If you don't use this, you would have
+// dangling pointers to deleted instructions.
+//
+void AliasSetTracker::deleteValue(Value *PtrVal) {
+  // Notify the alias analysis implementation that this value is gone.
+  AA.deleteValue(PtrVal);
+
+  // If this is a call instruction, remove the callsite from the appropriate
+  // AliasSet (if present).
+  if (Instruction *Inst = dyn_cast<Instruction>(PtrVal)) {
+    if (Inst->mayReadOrWriteMemory()) {
+      // Scan all the alias sets to see if this call site is contained.
+      for (iterator I = begin(), E = end(); I != E; ++I) {
+        if (I->Forward) continue;
+        
+        I->removeUnknownInst(Inst);
+      }
+    }
+  }
+
+  // First, look up the PointerRec for this pointer.
+  PointerMapType::iterator I = PointerMap.find_as(PtrVal);
+  if (I == PointerMap.end()) return;  // Noop
+
+  // If we found one, remove the pointer from the alias set it is in.
+  AliasSet::PointerRec *PtrValEnt = I->second;
+  AliasSet *AS = PtrValEnt->getAliasSet(*this);
+
+  // Unlink and delete from the list of values.
+  PtrValEnt->eraseFromList();
+  
+  // Stop using the alias set.
+  AS->dropRef(*this);
+  
+  PointerMap.erase(I);
+}
+
+// copyValue - This method should be used whenever a preexisting value in the
+// program is copied or cloned, introducing a new value.  Note that it is ok for
+// clients that use this method to introduce the same value multiple times: if
+// the tracker already knows about a value, it will ignore the request.
+//
+void AliasSetTracker::copyValue(Value *From, Value *To) {
+  // Notify the alias analysis implementation that this value is copied.
+  AA.copyValue(From, To);
+
+  // First, look up the PointerRec for this pointer.
+  PointerMapType::iterator I = PointerMap.find_as(From);
+  if (I == PointerMap.end())
+    return;  // Noop
+  assert(I->second->hasAliasSet() && "Dead entry?");
+
+  AliasSet::PointerRec &Entry = getEntryFor(To);
+  if (Entry.hasAliasSet()) return;    // Already in the tracker!
+
+  // Add it to the alias set it aliases...
+  I = PointerMap.find_as(From);
+  AliasSet *AS = I->second->getAliasSet(*this);
+  AS->addPointer(*this, Entry, I->second->getSize(),
+                 I->second->getTBAAInfo(),
+                 true);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//               AliasSet/AliasSetTracker Printing Support
+//===----------------------------------------------------------------------===//
+
+void AliasSet::print(raw_ostream &OS) const {
+  OS << "  AliasSet[" << (const void*)this << ", " << RefCount << "] ";
+  OS << (AliasTy == MustAlias ? "must" : "may") << " alias, ";
+  switch (AccessTy) {
+  case NoModRef: OS << "No access "; break;
+  case Refs    : OS << "Ref       "; break;
+  case Mods    : OS << "Mod       "; break;
+  case ModRef  : OS << "Mod/Ref   "; break;
+  default: llvm_unreachable("Bad value for AccessTy!");
+  }
+  if (isVolatile()) OS << "[volatile] ";
+  if (Forward)
+    OS << " forwarding to " << (void*)Forward;
+
+
+  if (!empty()) {
+    OS << "Pointers: ";
+    for (iterator I = begin(), E = end(); I != E; ++I) {
+      if (I != begin()) OS << ", ";
+      WriteAsOperand(OS << "(", I.getPointer());
+      OS << ", " << I.getSize() << ")";
+    }
+  }
+  if (!UnknownInsts.empty()) {
+    OS << "\n    " << UnknownInsts.size() << " Unknown instructions: ";
+    for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
+      if (i) OS << ", ";
+      WriteAsOperand(OS, UnknownInsts[i]);
+    }
+  }
+  OS << "\n";
+}
+
+void AliasSetTracker::print(raw_ostream &OS) const {
+  OS << "Alias Set Tracker: " << AliasSets.size() << " alias sets for "
+     << PointerMap.size() << " pointer values.\n";
+  for (const_iterator I = begin(), E = end(); I != E; ++I)
+    I->print(OS);
+  OS << "\n";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void AliasSet::dump() const { print(dbgs()); }
+void AliasSetTracker::dump() const { print(dbgs()); }
+#endif
+
+//===----------------------------------------------------------------------===//
+//                     ASTCallbackVH Class Implementation
+//===----------------------------------------------------------------------===//
+
+void AliasSetTracker::ASTCallbackVH::deleted() {
+  assert(AST && "ASTCallbackVH called with a null AliasSetTracker!");
+  AST->deleteValue(getValPtr());
+  // this now dangles!
+}
+
+void AliasSetTracker::ASTCallbackVH::allUsesReplacedWith(Value *V) {
+  AST->copyValue(getValPtr(), V);
+}
+
+AliasSetTracker::ASTCallbackVH::ASTCallbackVH(Value *V, AliasSetTracker *ast)
+  : CallbackVH(V), AST(ast) {}
+
+AliasSetTracker::ASTCallbackVH &
+AliasSetTracker::ASTCallbackVH::operator=(Value *V) {
+  return *this = ASTCallbackVH(V, AST);
+}
+
+//===----------------------------------------------------------------------===//
+//                            AliasSetPrinter Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+  class AliasSetPrinter : public FunctionPass {
+    AliasSetTracker *Tracker;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    AliasSetPrinter() : FunctionPass(ID) {
+      initializeAliasSetPrinterPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequired<AliasAnalysis>();
+    }
+
+    virtual bool runOnFunction(Function &F) {
+      Tracker = new AliasSetTracker(getAnalysis<AliasAnalysis>());
+
+      for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+        Tracker->add(&*I);
+      Tracker->print(errs());
+      delete Tracker;
+      return false;
+    }
+  };
+}
+
+char AliasSetPrinter::ID = 0;
+INITIALIZE_PASS_BEGIN(AliasSetPrinter, "print-alias-sets",
+                "Alias Set Printer", false, true)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(AliasSetPrinter, "print-alias-sets",
+                "Alias Set Printer", false, true)
diff --git a/contrib/llvm/lib/Analysis/Analysis.cpp b/contrib/llvm/lib/Analysis/Analysis.cpp
new file mode 100644
index 000000000000..66e416cd140c
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/Analysis.cpp
@@ -0,0 +1,107 @@
+//===-- Analysis.cpp ------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Analysis.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include <cstring>
+
+using namespace llvm;
+
+/// initializeAnalysis - Initialize all passes linked into the Analysis library.
+void llvm::initializeAnalysis(PassRegistry &Registry) {
+  initializeAliasAnalysisAnalysisGroup(Registry);
+  initializeAliasAnalysisCounterPass(Registry);
+  initializeAAEvalPass(Registry);
+  initializeAliasDebuggerPass(Registry);
+  initializeAliasSetPrinterPass(Registry);
+  initializeNoAAPass(Registry);
+  initializeBasicAliasAnalysisPass(Registry);
+  initializeBlockFrequencyInfoPass(Registry);
+  initializeBranchProbabilityInfoPass(Registry);
+  initializeCostModelAnalysisPass(Registry);
+  initializeCFGViewerPass(Registry);
+  initializeCFGPrinterPass(Registry);
+  initializeCFGOnlyViewerPass(Registry);
+  initializeCFGOnlyPrinterPass(Registry);
+  initializeDependenceAnalysisPass(Registry);
+  initializeDominanceFrontierPass(Registry);
+  initializeDomViewerPass(Registry);
+  initializeDomPrinterPass(Registry);
+  initializeDomOnlyViewerPass(Registry);
+  initializePostDomViewerPass(Registry);
+  initializeDomOnlyPrinterPass(Registry);
+  initializePostDomPrinterPass(Registry);
+  initializePostDomOnlyViewerPass(Registry);
+  initializePostDomOnlyPrinterPass(Registry);
+  initializeIVUsersPass(Registry);
+  initializeInstCountPass(Registry);
+  initializeIntervalPartitionPass(Registry);
+  initializeLazyValueInfoPass(Registry);
+  initializeLibCallAliasAnalysisPass(Registry);
+  initializeLintPass(Registry);
+  initializeLoopInfoPass(Registry);
+  initializeMemDepPrinterPass(Registry);
+  initializeMemoryDependenceAnalysisPass(Registry);
+  initializeModuleDebugInfoPrinterPass(Registry);
+  initializePostDominatorTreePass(Registry);
+  initializeProfileEstimatorPassPass(Registry);
+  initializeNoProfileInfoPass(Registry);
+  initializeNoPathProfileInfoPass(Registry);
+  initializeProfileInfoAnalysisGroup(Registry);
+  initializePathProfileInfoAnalysisGroup(Registry);
+  initializeLoaderPassPass(Registry);
+  initializePathProfileLoaderPassPass(Registry);
+  initializeProfileVerifierPassPass(Registry);
+  initializePathProfileVerifierPass(Registry);
+  initializeProfileMetadataLoaderPassPass(Registry);
+  initializeRegionInfoPass(Registry);
+  initializeRegionViewerPass(Registry);
+  initializeRegionPrinterPass(Registry);
+  initializeRegionOnlyViewerPass(Registry);
+  initializeRegionOnlyPrinterPass(Registry);
+  initializeScalarEvolutionPass(Registry);
+  initializeScalarEvolutionAliasAnalysisPass(Registry);
+  initializeTargetTransformInfoAnalysisGroup(Registry);
+  initializeTypeBasedAliasAnalysisPass(Registry);
+}
+
+void LLVMInitializeAnalysis(LLVMPassRegistryRef R) {
+  initializeAnalysis(*unwrap(R));
+}
+
+LLVMBool LLVMVerifyModule(LLVMModuleRef M, LLVMVerifierFailureAction Action,
+                          char **OutMessages) {
+  std::string Messages;
+
+  LLVMBool Result = verifyModule(*unwrap(M),
+                            static_cast<VerifierFailureAction>(Action),
+                            OutMessages? &Messages : 0);
+
+  if (OutMessages)
+    *OutMessages = strdup(Messages.c_str());
+
+  return Result;
+}
+
+LLVMBool LLVMVerifyFunction(LLVMValueRef Fn, LLVMVerifierFailureAction Action) {
+  return verifyFunction(*unwrap<Function>(Fn),
+                        static_cast<VerifierFailureAction>(Action));
+}
+
+void LLVMViewFunctionCFG(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  F->viewCFG();
+}
+
+void LLVMViewFunctionCFGOnly(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  F->viewCFGOnly();
+}
diff --git a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp
new file mode 100644
index 000000000000..ae6da1af0c4f
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -0,0 +1,1270 @@
+//===- BasicAliasAnalysis.cpp - Stateless Alias Analysis Impl -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the primary stateless implementation of the
+// Alias Analysis interface that implements identities (two different
+// globals cannot alias, etc), but does no stateful analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Useful predicates
+//===----------------------------------------------------------------------===//
+
+/// isNonEscapingLocalObject - Return true if the pointer is to a function-local
+/// object that never escapes from the function.
+static bool isNonEscapingLocalObject(const Value *V) {
+  // If this is a local allocation, check to see if it escapes.
+  if (isa<AllocaInst>(V) || isNoAliasCall(V))
+    // Set StoreCaptures to True so that we can assume in our callers that the
+    // pointer is not the result of a load instruction. Currently
+    // PointerMayBeCaptured doesn't have any special analysis for the
+    // StoreCaptures=false case; if it did, our callers could be refined to be
+    // more precise.
+    return !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true);
+
+  // If this is an argument that corresponds to a byval or noalias argument,
+  // then it has not escaped before entering the function.  Check if it escapes
+  // inside the function.
+  if (const Argument *A = dyn_cast<Argument>(V))
+    if (A->hasByValAttr() || A->hasNoAliasAttr())
+      // Note even if the argument is marked nocapture we still need to check
+      // for copies made inside the function. The nocapture attribute only
+      // specifies that there are no copies made that outlive the function.
+      return !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true);
+
+  return false;
+}
+
+/// isEscapeSource - Return true if the pointer is one which would have
+/// been considered an escape by isNonEscapingLocalObject.
+static bool isEscapeSource(const Value *V) {
+  if (isa<CallInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V))
+    return true;
+
+  // The load case works because isNonEscapingLocalObject considers all
+  // stores to be escapes (it passes true for the StoreCaptures argument
+  // to PointerMayBeCaptured).
+  if (isa<LoadInst>(V))
+    return true;
+
+  return false;
+}
+
+/// getObjectSize - Return the size of the object specified by V, or
+/// UnknownSize if unknown.
+static uint64_t getObjectSize(const Value *V, const DataLayout &TD,
+                              const TargetLibraryInfo &TLI,
+                              bool RoundToAlign = false) {
+  uint64_t Size;
+  if (getUnderlyingObjectSize(V, Size, &TD, &TLI, RoundToAlign))
+    return Size;
+  return AliasAnalysis::UnknownSize;
+}
+
+/// isObjectSmallerThan - Return true if we can prove that the object specified
+/// by V is smaller than Size.
+static bool isObjectSmallerThan(const Value *V, uint64_t Size,
+                                const DataLayout &TD,
+                                const TargetLibraryInfo &TLI) {
+  // This function needs to use the aligned object size because we allow
+  // reads a bit past the end given sufficient alignment.
+  uint64_t ObjectSize = getObjectSize(V, TD, TLI, /*RoundToAlign*/true);
+  
+  return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize < Size;
+}
+
+/// isObjectSize - Return true if we can prove that the object specified
+/// by V has size Size.
+static bool isObjectSize(const Value *V, uint64_t Size,
+                         const DataLayout &TD, const TargetLibraryInfo &TLI) {
+  uint64_t ObjectSize = getObjectSize(V, TD, TLI);
+  return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize == Size;
+}
+
+//===----------------------------------------------------------------------===//
+// GetElementPtr Instruction Decomposition and Analysis
+//===----------------------------------------------------------------------===//
+
+namespace {
+  enum ExtensionKind {
+    EK_NotExtended,
+    EK_SignExt,
+    EK_ZeroExt
+  };
+  
+  struct VariableGEPIndex {
+    const Value *V;
+    ExtensionKind Extension;
+    int64_t Scale;
+
+    bool operator==(const VariableGEPIndex &Other) const {
+      return V == Other.V && Extension == Other.Extension &&
+        Scale == Other.Scale;
+    }
+
+    bool operator!=(const VariableGEPIndex &Other) const {
+      return !operator==(Other);
+    }
+  };
+}
+
+
+/// GetLinearExpression - Analyze the specified value as a linear expression:
+/// "A*V + B", where A and B are constant integers.  Return the scale and offset
+/// values as APInts and return V as a Value*, and return whether we looked
+/// through any sign or zero extends.  The incoming Value is known to have
+/// IntegerType and it may already be sign or zero extended.
+///
+/// Note that this looks through extends, so the high bits may not be
+/// represented in the result.
+static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
+                                  ExtensionKind &Extension,
+                                  const DataLayout &TD, unsigned Depth) {
+  assert(V->getType()->isIntegerTy() && "Not an integer value");
+
+  // Limit our recursion depth.
+  if (Depth == 6) {
+    Scale = 1;
+    Offset = 0;
+    return V;
+  }
+  
+  if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
+    if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
+      switch (BOp->getOpcode()) {
+      default: break;
+      case Instruction::Or:
+        // X|C == X+C if all the bits in C are unset in X.  Otherwise we can't
+        // analyze it.
+        if (!MaskedValueIsZero(BOp->getOperand(0), RHSC->getValue(), &TD))
+          break;
+        // FALL THROUGH.
+      case Instruction::Add:
+        V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension,
+                                TD, Depth+1);
+        Offset += RHSC->getValue();
+        return V;
+      case Instruction::Mul:
+        V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension,
+                                TD, Depth+1);
+        Offset *= RHSC->getValue();
+        Scale *= RHSC->getValue();
+        return V;
+      case Instruction::Shl:
+        V = GetLinearExpression(BOp->getOperand(0), Scale, Offset, Extension,
+                                TD, Depth+1);
+        Offset <<= RHSC->getValue().getLimitedValue();
+        Scale <<= RHSC->getValue().getLimitedValue();
+        return V;
+      }
+    }
+  }
+  
+  // Since GEP indices are sign extended anyway, we don't care about the high
+  // bits of a sign or zero extended value - just scales and offsets.  The
+  // extensions have to be consistent though.
+  if ((isa<SExtInst>(V) && Extension != EK_ZeroExt) ||
+      (isa<ZExtInst>(V) && Extension != EK_SignExt)) {
+    Value *CastOp = cast<CastInst>(V)->getOperand(0);
+    unsigned OldWidth = Scale.getBitWidth();
+    unsigned SmallWidth = CastOp->getType()->getPrimitiveSizeInBits();
+    Scale = Scale.trunc(SmallWidth);
+    Offset = Offset.trunc(SmallWidth);
+    Extension = isa<SExtInst>(V) ? EK_SignExt : EK_ZeroExt;
+
+    Value *Result = GetLinearExpression(CastOp, Scale, Offset, Extension,
+                                        TD, Depth+1);
+    Scale = Scale.zext(OldWidth);
+    Offset = Offset.zext(OldWidth);
+    
+    return Result;
+  }
+  
+  Scale = 1;
+  Offset = 0;
+  return V;
+}
+
+/// DecomposeGEPExpression - If V is a symbolic pointer expression, decompose it
+/// into a base pointer with a constant offset and a number of scaled symbolic
+/// offsets.
+///
+/// The scaled symbolic offsets (represented by pairs of a Value* and a scale in
+/// the VarIndices vector) are Value*'s that are known to be scaled by the
+/// specified amount, but which may have other unrepresented high bits. As such,
+/// the gep cannot necessarily be reconstructed from its decomposed form.
+///
+/// When DataLayout is around, this function is capable of analyzing everything
+/// that GetUnderlyingObject can look through.  When not, it just looks
+/// through pointer casts.
+///
+static const Value *
+DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
+                       SmallVectorImpl<VariableGEPIndex> &VarIndices,
+                       const DataLayout *TD) {
+  // Limit recursion depth to limit compile time in crazy cases.
+  unsigned MaxLookup = 6;
+  
+  BaseOffs = 0;
+  do {
+    // See if this is a bitcast or GEP.
+    const Operator *Op = dyn_cast<Operator>(V);
+    if (Op == 0) {
+      // The only non-operator case we can handle are GlobalAliases.
+      if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+        if (!GA->mayBeOverridden()) {
+          V = GA->getAliasee();
+          continue;
+        }
+      }
+      return V;
+    }
+    
+    if (Op->getOpcode() == Instruction::BitCast) {
+      V = Op->getOperand(0);
+      continue;
+    }
+
+    const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op);
+    if (GEPOp == 0) {
+      // If it's not a GEP, hand it off to SimplifyInstruction to see if it
+      // can come up with something. This matches what GetUnderlyingObject does.
+      if (const Instruction *I = dyn_cast<Instruction>(V))
+        // TODO: Get a DominatorTree and use it here.
+        if (const Value *Simplified =
+              SimplifyInstruction(const_cast<Instruction *>(I), TD)) {
+          V = Simplified;
+          continue;
+        }
+    
+      return V;
+    }
+    
+    // Don't attempt to analyze GEPs over unsized objects.
+    if (!cast<PointerType>(GEPOp->getOperand(0)->getType())
+        ->getElementType()->isSized())
+      return V;
+    
+    // If we are lacking DataLayout information, we can't compute the offets of
+    // elements computed by GEPs.  However, we can handle bitcast equivalent
+    // GEPs.
+    if (TD == 0) {
+      if (!GEPOp->hasAllZeroIndices())
+        return V;
+      V = GEPOp->getOperand(0);
+      continue;
+    }
+    
+    // Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
+    gep_type_iterator GTI = gep_type_begin(GEPOp);
+    for (User::const_op_iterator I = GEPOp->op_begin()+1,
+         E = GEPOp->op_end(); I != E; ++I) {
+      Value *Index = *I;
+      // Compute the (potentially symbolic) offset in bytes for this index.
+      if (StructType *STy = dyn_cast<StructType>(*GTI++)) {
+        // For a struct, add the member offset.
+        unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
+        if (FieldNo == 0) continue;
+        
+        BaseOffs += TD->getStructLayout(STy)->getElementOffset(FieldNo);
+        continue;
+      }
+      
+      // For an array/pointer, add the element offset, explicitly scaled.
+      if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) {
+        if (CIdx->isZero()) continue;
+        BaseOffs += TD->getTypeAllocSize(*GTI)*CIdx->getSExtValue();
+        continue;
+      }
+      
+      uint64_t Scale = TD->getTypeAllocSize(*GTI);
+      ExtensionKind Extension = EK_NotExtended;
+      
+      // If the integer type is smaller than the pointer size, it is implicitly
+      // sign extended to pointer size.
+      unsigned Width = cast<IntegerType>(Index->getType())->getBitWidth();
+      if (TD->getPointerSizeInBits() > Width)
+        Extension = EK_SignExt;
+      
+      // Use GetLinearExpression to decompose the index into a C1*V+C2 form.
+      APInt IndexScale(Width, 0), IndexOffset(Width, 0);
+      Index = GetLinearExpression(Index, IndexScale, IndexOffset, Extension,
+                                  *TD, 0);
+      
+      // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
+      // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
+      BaseOffs += IndexOffset.getSExtValue()*Scale;
+      Scale *= IndexScale.getSExtValue();
+      
+      
+      // If we already had an occurrence of this index variable, merge this
+      // scale into it.  For example, we want to handle:
+      //   A[x][x] -> x*16 + x*4 -> x*20
+      // This also ensures that 'x' only appears in the index list once.
+      for (unsigned i = 0, e = VarIndices.size(); i != e; ++i) {
+        if (VarIndices[i].V == Index &&
+            VarIndices[i].Extension == Extension) {
+          Scale += VarIndices[i].Scale;
+          VarIndices.erase(VarIndices.begin()+i);
+          break;
+        }
+      }
+      
+      // Make sure that we have a scale that makes sense for this target's
+      // pointer size.
+      if (unsigned ShiftBits = 64-TD->getPointerSizeInBits()) {
+        Scale <<= ShiftBits;
+        Scale = (int64_t)Scale >> ShiftBits;
+      }
+      
+      if (Scale) {
+        VariableGEPIndex Entry = {Index, Extension,
+                                  static_cast<int64_t>(Scale)};
+        VarIndices.push_back(Entry);
+      }
+    }
+    
+    // Analyze the base pointer next.
+    V = GEPOp->getOperand(0);
+  } while (--MaxLookup);
+  
+  // If the chain of expressions is too deep, just return early.
+  return V;
+}
+
+/// GetIndexDifference - Dest and Src are the variable indices from two
+/// decomposed GetElementPtr instructions GEP1 and GEP2 which have common base
+/// pointers.  Subtract the GEP2 indices from GEP1 to find the symbolic
+/// difference between the two pointers. 
+static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
+                               const SmallVectorImpl<VariableGEPIndex> &Src) {
+  if (Src.empty()) return;
+
+  for (unsigned i = 0, e = Src.size(); i != e; ++i) {
+    const Value *V = Src[i].V;
+    ExtensionKind Extension = Src[i].Extension;
+    int64_t Scale = Src[i].Scale;
+    
+    // Find V in Dest.  This is N^2, but pointer indices almost never have more
+    // than a few variable indexes.
+    for (unsigned j = 0, e = Dest.size(); j != e; ++j) {
+      if (Dest[j].V != V || Dest[j].Extension != Extension) continue;
+      
+      // If we found it, subtract off Scale V's from the entry in Dest.  If it
+      // goes to zero, remove the entry.
+      if (Dest[j].Scale != Scale)
+        Dest[j].Scale -= Scale;
+      else
+        Dest.erase(Dest.begin()+j);
+      Scale = 0;
+      break;
+    }
+    
+    // If we didn't consume this entry, add it to the end of the Dest list.
+    if (Scale) {
+      VariableGEPIndex Entry = { V, Extension, -Scale };
+      Dest.push_back(Entry);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// BasicAliasAnalysis Pass
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+static const Function *getParent(const Value *V) {
+  if (const Instruction *inst = dyn_cast<Instruction>(V))
+    return inst->getParent()->getParent();
+
+  if (const Argument *arg = dyn_cast<Argument>(V))
+    return arg->getParent();
+
+  return NULL;
+}
+
+static bool notDifferentParent(const Value *O1, const Value *O2) {
+
+  const Function *F1 = getParent(O1);
+  const Function *F2 = getParent(O2);
+
+  return !F1 || !F2 || F1 == F2;
+}
+#endif
+
+namespace {
+  /// BasicAliasAnalysis - This is the primary alias analysis implementation.
+  struct BasicAliasAnalysis : public ImmutablePass, public AliasAnalysis {
+    static char ID; // Class identification, replacement for typeinfo
+    BasicAliasAnalysis() : ImmutablePass(ID) {
+      initializeBasicAliasAnalysisPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void initializePass() {
+      InitializeAliasAnalysis(this);
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetLibraryInfo>();
+    }
+
+    virtual AliasResult alias(const Location &LocA,
+                              const Location &LocB) {
+      assert(AliasCache.empty() && "AliasCache must be cleared after use!");
+      assert(notDifferentParent(LocA.Ptr, LocB.Ptr) &&
+             "BasicAliasAnalysis doesn't support interprocedural queries.");
+      AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.TBAATag,
+                                     LocB.Ptr, LocB.Size, LocB.TBAATag);
+      // AliasCache rarely has more than 1 or 2 elements, always use
+      // shrink_and_clear so it quickly returns to the inline capacity of the
+      // SmallDenseMap if it ever grows larger.
+      // FIXME: This should really be shrink_to_inline_capacity_and_clear().
+      AliasCache.shrink_and_clear();
+      return Alias;
+    }
+
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS,
+                                       const Location &Loc);
+
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS1,
+                                       ImmutableCallSite CS2) {
+      // The AliasAnalysis base class has some smarts, lets use them.
+      return AliasAnalysis::getModRefInfo(CS1, CS2);
+    }
+
+    /// pointsToConstantMemory - Chase pointers until we find a (constant
+    /// global) or not.
+    virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal);
+
+    /// getModRefBehavior - Return the behavior when calling the given
+    /// call site.
+    virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS);
+
+    /// getModRefBehavior - Return the behavior when calling the given function.
+    /// For use when the call site is not known.
+    virtual ModRefBehavior getModRefBehavior(const Function *F);
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(const void *ID) {
+      if (ID == &AliasAnalysis::ID)
+        return (AliasAnalysis*)this;
+      return this;
+    }
+    
+  private:
+    // AliasCache - Track alias queries to guard against recursion.
+    typedef std::pair<Location, Location> LocPair;
+    typedef SmallDenseMap<LocPair, AliasResult, 8> AliasCacheTy;
+    AliasCacheTy AliasCache;
+
+    // Visited - Track instructions visited by pointsToConstantMemory.
+    SmallPtrSet<const Value*, 16> Visited;
+
+    // aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP
+    // instruction against another.
+    AliasResult aliasGEP(const GEPOperator *V1, uint64_t V1Size,
+                         const MDNode *V1TBAAInfo,
+                         const Value *V2, uint64_t V2Size,
+                         const MDNode *V2TBAAInfo,
+                         const Value *UnderlyingV1, const Value *UnderlyingV2);
+
+    // aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI
+    // instruction against another.
+    AliasResult aliasPHI(const PHINode *PN, uint64_t PNSize,
+                         const MDNode *PNTBAAInfo,
+                         const Value *V2, uint64_t V2Size,
+                         const MDNode *V2TBAAInfo);
+
+    /// aliasSelect - Disambiguate a Select instruction against another value.
+    AliasResult aliasSelect(const SelectInst *SI, uint64_t SISize,
+                            const MDNode *SITBAAInfo,
+                            const Value *V2, uint64_t V2Size,
+                            const MDNode *V2TBAAInfo);
+
+    AliasResult aliasCheck(const Value *V1, uint64_t V1Size,
+                           const MDNode *V1TBAATag,
+                           const Value *V2, uint64_t V2Size,
+                           const MDNode *V2TBAATag);
+  };
+}  // End of anonymous namespace
+
+// Register this pass...
+char BasicAliasAnalysis::ID = 0;
+INITIALIZE_AG_PASS_BEGIN(BasicAliasAnalysis, AliasAnalysis, "basicaa",
+                   "Basic Alias Analysis (stateless AA impl)",
+                   false, true, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_AG_PASS_END(BasicAliasAnalysis, AliasAnalysis, "basicaa",
+                   "Basic Alias Analysis (stateless AA impl)",
+                   false, true, false)
+
+
+ImmutablePass *llvm::createBasicAliasAnalysisPass() {
+  return new BasicAliasAnalysis();
+}
+
+/// pointsToConstantMemory - Returns whether the given pointer value
+/// points to memory that is local to the function, with global constants being
+/// considered local to all functions.
+bool
+BasicAliasAnalysis::pointsToConstantMemory(const Location &Loc, bool OrLocal) {
+  assert(Visited.empty() && "Visited must be cleared after use!");
+
+  unsigned MaxLookup = 8;
+  SmallVector<const Value *, 16> Worklist;
+  Worklist.push_back(Loc.Ptr);
+  do {
+    const Value *V = GetUnderlyingObject(Worklist.pop_back_val(), TD);
+    if (!Visited.insert(V)) {
+      Visited.clear();
+      return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+    }
+
+    // An alloca instruction defines local memory.
+    if (OrLocal && isa<AllocaInst>(V))
+      continue;
+
+    // A global constant counts as local memory for our purposes.
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
+      // Note: this doesn't require GV to be "ODR" because it isn't legal for a
+      // global to be marked constant in some modules and non-constant in
+      // others.  GV may even be a declaration, not a definition.
+      if (!GV->isConstant()) {
+        Visited.clear();
+        return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+      }
+      continue;
+    }
+
+    // If both select values point to local memory, then so does the select.
+    if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
+      Worklist.push_back(SI->getTrueValue());
+      Worklist.push_back(SI->getFalseValue());
+      continue;
+    }
+
+    // If all values incoming to a phi node point to local memory, then so does
+    // the phi.
+    if (const PHINode *PN = dyn_cast<PHINode>(V)) {
+      // Don't bother inspecting phi nodes with many operands.
+      if (PN->getNumIncomingValues() > MaxLookup) {
+        Visited.clear();
+        return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+      }
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        Worklist.push_back(PN->getIncomingValue(i));
+      continue;
+    }
+
+    // Otherwise be conservative.
+    Visited.clear();
+    return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+
+  } while (!Worklist.empty() && --MaxLookup);
+
+  Visited.clear();
+  return Worklist.empty();
+}
+
+/// getModRefBehavior - Return the behavior when calling the given call site.
+AliasAnalysis::ModRefBehavior
+BasicAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
+  if (CS.doesNotAccessMemory())
+    // Can't do better than this.
+    return DoesNotAccessMemory;
+
+  ModRefBehavior Min = UnknownModRefBehavior;
+
+  // If the callsite knows it only reads memory, don't return worse
+  // than that.
+  if (CS.onlyReadsMemory())
+    Min = OnlyReadsMemory;
+
+  // The AliasAnalysis base class has some smarts, lets use them.
+  return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min);
+}
+
+/// getModRefBehavior - Return the behavior when calling the given function.
+/// For use when the call site is not known.
+AliasAnalysis::ModRefBehavior
+BasicAliasAnalysis::getModRefBehavior(const Function *F) {
+  // If the function declares it doesn't access memory, we can't do better.
+  if (F->doesNotAccessMemory())
+    return DoesNotAccessMemory;
+
+  // For intrinsics, we can check the table.
+  if (unsigned iid = F->getIntrinsicID()) {
+#define GET_INTRINSIC_MODREF_BEHAVIOR
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_INTRINSIC_MODREF_BEHAVIOR
+  }
+
+  ModRefBehavior Min = UnknownModRefBehavior;
+
+  // If the function declares it only reads memory, go with that.
+  if (F->onlyReadsMemory())
+    Min = OnlyReadsMemory;
+
+  // Otherwise be conservative.
+  return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min);
+}
+
+/// getModRefInfo - Check to see if the specified callsite can clobber the
+/// specified memory object.  Since we only look at local properties of this
+/// function, we really can't say much about this query.  We do, however, use
+/// simple "address taken" analysis on local objects.
+AliasAnalysis::ModRefResult
+BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
+                                  const Location &Loc) {
+  assert(notDifferentParent(CS.getInstruction(), Loc.Ptr) &&
+         "AliasAnalysis query involving multiple functions!");
+
+  const Value *Object = GetUnderlyingObject(Loc.Ptr, TD);
+  
+  // If this is a tail call and Loc.Ptr points to a stack location, we know that
+  // the tail call cannot access or modify the local stack.
+  // We cannot exclude byval arguments here; these belong to the caller of
+  // the current function not to the current function, and a tail callee
+  // may reference them.
+  if (isa<AllocaInst>(Object))
+    if (const CallInst *CI = dyn_cast<CallInst>(CS.getInstruction()))
+      if (CI->isTailCall())
+        return NoModRef;
+  
+  // If the pointer is to a locally allocated object that does not escape,
+  // then the call can not mod/ref the pointer unless the call takes the pointer
+  // as an argument, and itself doesn't capture it.
+  if (!isa<Constant>(Object) && CS.getInstruction() != Object &&
+      isNonEscapingLocalObject(Object)) {
+    bool PassedAsArg = false;
+    unsigned ArgNo = 0;
+    for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
+         CI != CE; ++CI, ++ArgNo) {
+      // Only look at the no-capture or byval pointer arguments.  If this
+      // pointer were passed to arguments that were neither of these, then it
+      // couldn't be no-capture.
+      if (!(*CI)->getType()->isPointerTy() ||
+          (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo)))
+        continue;
+      
+      // If this is a no-capture pointer argument, see if we can tell that it
+      // is impossible to alias the pointer we're checking.  If not, we have to
+      // assume that the call could touch the pointer, even though it doesn't
+      // escape.
+      if (!isNoAlias(Location(*CI), Location(Object))) {
+        PassedAsArg = true;
+        break;
+      }
+    }
+    
+    if (!PassedAsArg)
+      return NoModRef;
+  }
+
+  const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
+  ModRefResult Min = ModRef;
+
+  // Finally, handle specific knowledge of intrinsics.
+  const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction());
+  if (II != 0)
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::memcpy:
+    case Intrinsic::memmove: {
+      uint64_t Len = UnknownSize;
+      if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2)))
+        Len = LenCI->getZExtValue();
+      Value *Dest = II->getArgOperand(0);
+      Value *Src = II->getArgOperand(1);
+      // If it can't overlap the source dest, then it doesn't modref the loc.
+      if (isNoAlias(Location(Dest, Len), Loc)) {
+        if (isNoAlias(Location(Src, Len), Loc))
+          return NoModRef;
+        // If it can't overlap the dest, then worst case it reads the loc.
+        Min = Ref;
+      } else if (isNoAlias(Location(Src, Len), Loc)) {
+        // If it can't overlap the source, then worst case it mutates the loc.
+        Min = Mod;
+      }
+      break;
+    }
+    case Intrinsic::memset:
+      // Since memset is 'accesses arguments' only, the AliasAnalysis base class
+      // will handle it for the variable length case.
+      if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+        uint64_t Len = LenCI->getZExtValue();
+        Value *Dest = II->getArgOperand(0);
+        if (isNoAlias(Location(Dest, Len), Loc))
+          return NoModRef;
+      }
+      // We know that memset doesn't load anything.
+      Min = Mod;
+      break;
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::invariant_start: {
+      uint64_t PtrSize =
+        cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+      if (isNoAlias(Location(II->getArgOperand(1),
+                             PtrSize,
+                             II->getMetadata(LLVMContext::MD_tbaa)),
+                    Loc))
+        return NoModRef;
+      break;
+    }
+    case Intrinsic::invariant_end: {
+      uint64_t PtrSize =
+        cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
+      if (isNoAlias(Location(II->getArgOperand(2),
+                             PtrSize,
+                             II->getMetadata(LLVMContext::MD_tbaa)),
+                    Loc))
+        return NoModRef;
+      break;
+    }
+    case Intrinsic::arm_neon_vld1: {
+      // LLVM's vld1 and vst1 intrinsics currently only support a single
+      // vector register.
+      uint64_t Size =
+        TD ? TD->getTypeStoreSize(II->getType()) : UnknownSize;
+      if (isNoAlias(Location(II->getArgOperand(0), Size,
+                             II->getMetadata(LLVMContext::MD_tbaa)),
+                    Loc))
+        return NoModRef;
+      break;
+    }
+    case Intrinsic::arm_neon_vst1: {
+      uint64_t Size =
+        TD ? TD->getTypeStoreSize(II->getArgOperand(1)->getType()) : UnknownSize;
+      if (isNoAlias(Location(II->getArgOperand(0), Size,
+                             II->getMetadata(LLVMContext::MD_tbaa)),
+                    Loc))
+        return NoModRef;
+      break;
+    }
+    }
+
+  // We can bound the aliasing properties of memset_pattern16 just as we can
+  // for memcpy/memset.  This is particularly important because the 
+  // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
+  // whenever possible.
+  else if (TLI.has(LibFunc::memset_pattern16) &&
+           CS.getCalledFunction() &&
+           CS.getCalledFunction()->getName() == "memset_pattern16") {
+    const Function *MS = CS.getCalledFunction();
+    FunctionType *MemsetType = MS->getFunctionType();
+    if (!MemsetType->isVarArg() && MemsetType->getNumParams() == 3 &&
+        isa<PointerType>(MemsetType->getParamType(0)) &&
+        isa<PointerType>(MemsetType->getParamType(1)) &&
+        isa<IntegerType>(MemsetType->getParamType(2))) {
+      uint64_t Len = UnknownSize;
+      if (const ConstantInt *LenCI = dyn_cast<ConstantInt>(CS.getArgument(2)))
+        Len = LenCI->getZExtValue();
+      const Value *Dest = CS.getArgument(0);
+      const Value *Src = CS.getArgument(1);
+      // If it can't overlap the source dest, then it doesn't modref the loc.
+      if (isNoAlias(Location(Dest, Len), Loc)) {
+        // Always reads 16 bytes of the source.
+        if (isNoAlias(Location(Src, 16), Loc))
+          return NoModRef;
+        // If it can't overlap the dest, then worst case it reads the loc.
+        Min = Ref;
+      // Always reads 16 bytes of the source.
+      } else if (isNoAlias(Location(Src, 16), Loc)) {
+        // If it can't overlap the source, then worst case it mutates the loc.
+        Min = Mod;
+      }
+    }
+  }
+
+  // The AliasAnalysis base class has some smarts, lets use them.
+  return ModRefResult(AliasAnalysis::getModRefInfo(CS, Loc) & Min);
+}
+
+static bool areVarIndicesEqual(SmallVector<VariableGEPIndex, 4> &Indices1,
+                               SmallVector<VariableGEPIndex, 4> &Indices2) {
+  unsigned Size1 = Indices1.size();
+  unsigned Size2 = Indices2.size();
+
+  if (Size1 != Size2)
+    return false;
+
+  for (unsigned I = 0; I != Size1; ++I)
+    if (Indices1[I] != Indices2[I])
+      return false;
+
+  return true;
+}
+
+/// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction
+/// against another pointer.  We know that V1 is a GEP, but we don't know
+/// anything about V2.  UnderlyingV1 is GetUnderlyingObject(GEP1, TD),
+/// UnderlyingV2 is the same for V2.
+///
+AliasAnalysis::AliasResult
+BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
+                             const MDNode *V1TBAAInfo,
+                             const Value *V2, uint64_t V2Size,
+                             const MDNode *V2TBAAInfo,
+                             const Value *UnderlyingV1,
+                             const Value *UnderlyingV2) {
+  int64_t GEP1BaseOffset;
+  SmallVector<VariableGEPIndex, 4> GEP1VariableIndices;
+
+  // If we have two gep instructions with must-alias or not-alias'ing base
+  // pointers, figure out if the indexes to the GEP tell us anything about the
+  // derived pointer.
+  if (const GEPOperator *GEP2 = dyn_cast<GEPOperator>(V2)) {
+    // Do the base pointers alias?
+    AliasResult BaseAlias = aliasCheck(UnderlyingV1, UnknownSize, 0,
+                                       UnderlyingV2, UnknownSize, 0);
+
+    // Check for geps of non-aliasing underlying pointers where the offsets are
+    // identical.
+    if ((BaseAlias == MayAlias) && V1Size == V2Size) {
+      // Do the base pointers alias assuming type and size.
+      AliasResult PreciseBaseAlias = aliasCheck(UnderlyingV1, V1Size,
+                                                V1TBAAInfo, UnderlyingV2,
+                                                V2Size, V2TBAAInfo);
+      if (PreciseBaseAlias == NoAlias) {
+        // See if the computed offset from the common pointer tells us about the
+        // relation of the resulting pointer.
+        int64_t GEP2BaseOffset;
+        SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
+        const Value *GEP2BasePtr =
+          DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD);
+        const Value *GEP1BasePtr =
+          DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
+        // DecomposeGEPExpression and GetUnderlyingObject should return the
+        // same result except when DecomposeGEPExpression has no DataLayout.
+        if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
+          assert(TD == 0 &&
+             "DecomposeGEPExpression and GetUnderlyingObject disagree!");
+          return MayAlias;
+        }
+        // Same offsets.
+        if (GEP1BaseOffset == GEP2BaseOffset &&
+            areVarIndicesEqual(GEP1VariableIndices, GEP2VariableIndices))
+          return NoAlias;
+        GEP1VariableIndices.clear();
+      }
+    }
+    
+    // If we get a No or May, then return it immediately, no amount of analysis
+    // will improve this situation.
+    if (BaseAlias != MustAlias) return BaseAlias;
+    
+    // Otherwise, we have a MustAlias.  Since the base pointers alias each other
+    // exactly, see if the computed offset from the common pointer tells us
+    // about the relation of the resulting pointer.
+    const Value *GEP1BasePtr =
+      DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
+    
+    int64_t GEP2BaseOffset;
+    SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
+    const Value *GEP2BasePtr =
+      DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD);
+    
+    // DecomposeGEPExpression and GetUnderlyingObject should return the
+    // same result except when DecomposeGEPExpression has no DataLayout.
+    if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
+      assert(TD == 0 &&
+             "DecomposeGEPExpression and GetUnderlyingObject disagree!");
+      return MayAlias;
+    }
+    
+    // Subtract the GEP2 pointer from the GEP1 pointer to find out their
+    // symbolic difference.
+    GEP1BaseOffset -= GEP2BaseOffset;
+    GetIndexDifference(GEP1VariableIndices, GEP2VariableIndices);
+    
+  } else {
+    // Check to see if these two pointers are related by the getelementptr
+    // instruction.  If one pointer is a GEP with a non-zero index of the other
+    // pointer, we know they cannot alias.
+
+    // If both accesses are unknown size, we can't do anything useful here.
+    if (V1Size == UnknownSize && V2Size == UnknownSize)
+      return MayAlias;
+
+    AliasResult R = aliasCheck(UnderlyingV1, UnknownSize, 0,
+                               V2, V2Size, V2TBAAInfo);
+    if (R != MustAlias)
+      // If V2 may alias GEP base pointer, conservatively returns MayAlias.
+      // If V2 is known not to alias GEP base pointer, then the two values
+      // cannot alias per GEP semantics: "A pointer value formed from a
+      // getelementptr instruction is associated with the addresses associated
+      // with the first operand of the getelementptr".
+      return R;
+
+    const Value *GEP1BasePtr =
+      DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
+    
+    // DecomposeGEPExpression and GetUnderlyingObject should return the
+    // same result except when DecomposeGEPExpression has no DataLayout.
+    if (GEP1BasePtr != UnderlyingV1) {
+      assert(TD == 0 &&
+             "DecomposeGEPExpression and GetUnderlyingObject disagree!");
+      return MayAlias;
+    }
+  }
+  
+  // In the two GEP Case, if there is no difference in the offsets of the
+  // computed pointers, the resultant pointers are a must alias.  This
+  // hapens when we have two lexically identical GEP's (for example).
+  //
+  // In the other case, if we have getelementptr <ptr>, 0, 0, 0, 0, ... and V2
+  // must aliases the GEP, the end result is a must alias also.
+  if (GEP1BaseOffset == 0 && GEP1VariableIndices.empty())
+    return MustAlias;
+
+  // If there is a constant difference between the pointers, but the difference
+  // is less than the size of the associated memory object, then we know
+  // that the objects are partially overlapping.  If the difference is
+  // greater, we know they do not overlap.
+  if (GEP1BaseOffset != 0 && GEP1VariableIndices.empty()) {
+    if (GEP1BaseOffset >= 0) {
+      if (V2Size != UnknownSize) {
+        if ((uint64_t)GEP1BaseOffset < V2Size)
+          return PartialAlias;
+        return NoAlias;
+      }
+    } else {
+      if (V1Size != UnknownSize) {
+        if (-(uint64_t)GEP1BaseOffset < V1Size)
+          return PartialAlias;
+        return NoAlias;
+      }
+    }
+  }
+
+  // Try to distinguish something like &A[i][1] against &A[42][0].
+  // Grab the least significant bit set in any of the scales.
+  if (!GEP1VariableIndices.empty()) {
+    uint64_t Modulo = 0;
+    for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i)
+      Modulo |= (uint64_t)GEP1VariableIndices[i].Scale;
+    Modulo = Modulo ^ (Modulo & (Modulo - 1));
+
+    // We can compute the difference between the two addresses
+    // mod Modulo. Check whether that difference guarantees that the
+    // two locations do not alias.
+    uint64_t ModOffset = (uint64_t)GEP1BaseOffset & (Modulo - 1);
+    if (V1Size != UnknownSize && V2Size != UnknownSize &&
+        ModOffset >= V2Size && V1Size <= Modulo - ModOffset)
+      return NoAlias;
+  }
+
+  // Statically, we can see that the base objects are the same, but the
+  // pointers have dynamic offsets which we can't resolve. And none of our
+  // little tricks above worked.
+  //
+  // TODO: Returning PartialAlias instead of MayAlias is a mild hack; the
+  // practical effect of this is protecting TBAA in the case of dynamic
+  // indices into arrays of unions or malloc'd memory.
+  return PartialAlias;
+}
+
+static AliasAnalysis::AliasResult
+MergeAliasResults(AliasAnalysis::AliasResult A, AliasAnalysis::AliasResult B) {
+  // If the results agree, take it.
+  if (A == B)
+    return A;
+  // A mix of PartialAlias and MustAlias is PartialAlias.
+  if ((A == AliasAnalysis::PartialAlias && B == AliasAnalysis::MustAlias) ||
+      (B == AliasAnalysis::PartialAlias && A == AliasAnalysis::MustAlias))
+    return AliasAnalysis::PartialAlias;
+  // Otherwise, we don't know anything.
+  return AliasAnalysis::MayAlias;
+}
+
+/// aliasSelect - Provide a bunch of ad-hoc rules to disambiguate a Select
+/// instruction against another.
+AliasAnalysis::AliasResult
+BasicAliasAnalysis::aliasSelect(const SelectInst *SI, uint64_t SISize,
+                                const MDNode *SITBAAInfo,
+                                const Value *V2, uint64_t V2Size,
+                                const MDNode *V2TBAAInfo) {
+  // If the values are Selects with the same condition, we can do a more precise
+  // check: just check for aliases between the values on corresponding arms.
+  if (const SelectInst *SI2 = dyn_cast<SelectInst>(V2))
+    if (SI->getCondition() == SI2->getCondition()) {
+      AliasResult Alias =
+        aliasCheck(SI->getTrueValue(), SISize, SITBAAInfo,
+                   SI2->getTrueValue(), V2Size, V2TBAAInfo);
+      if (Alias == MayAlias)
+        return MayAlias;
+      AliasResult ThisAlias =
+        aliasCheck(SI->getFalseValue(), SISize, SITBAAInfo,
+                   SI2->getFalseValue(), V2Size, V2TBAAInfo);
+      return MergeAliasResults(ThisAlias, Alias);
+    }
+
+  // If both arms of the Select node NoAlias or MustAlias V2, then returns
+  // NoAlias / MustAlias. Otherwise, returns MayAlias.
+  AliasResult Alias =
+    aliasCheck(V2, V2Size, V2TBAAInfo, SI->getTrueValue(), SISize, SITBAAInfo);
+  if (Alias == MayAlias)
+    return MayAlias;
+
+  AliasResult ThisAlias =
+    aliasCheck(V2, V2Size, V2TBAAInfo, SI->getFalseValue(), SISize, SITBAAInfo);
+  return MergeAliasResults(ThisAlias, Alias);
+}
+
+// aliasPHI - Provide a bunch of ad-hoc rules to disambiguate a PHI instruction
+// against another.
+AliasAnalysis::AliasResult
+BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
+                             const MDNode *PNTBAAInfo,
+                             const Value *V2, uint64_t V2Size,
+                             const MDNode *V2TBAAInfo) {
+  // If the values are PHIs in the same block, we can do a more precise
+  // as well as efficient check: just check for aliases between the values
+  // on corresponding edges.
+  if (const PHINode *PN2 = dyn_cast<PHINode>(V2))
+    if (PN2->getParent() == PN->getParent()) {
+      LocPair Locs(Location(PN, PNSize, PNTBAAInfo),
+                   Location(V2, V2Size, V2TBAAInfo));
+      if (PN > V2)
+        std::swap(Locs.first, Locs.second);
+      // Analyse the PHIs' inputs under the assumption that the PHIs are
+      // NoAlias.
+      // If the PHIs are May/MustAlias there must be (recursively) an input
+      // operand from outside the PHIs' cycle that is MayAlias/MustAlias or
+      // there must be an operation on the PHIs within the PHIs' value cycle
+      // that causes a MayAlias.
+      // Pretend the phis do not alias.
+      AliasResult Alias = NoAlias;
+      assert(AliasCache.count(Locs) &&
+             "There must exist an entry for the phi node");
+      AliasResult OrigAliasResult = AliasCache[Locs];
+      AliasCache[Locs] = NoAlias;
+
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        AliasResult ThisAlias =
+          aliasCheck(PN->getIncomingValue(i), PNSize, PNTBAAInfo,
+                     PN2->getIncomingValueForBlock(PN->getIncomingBlock(i)),
+                     V2Size, V2TBAAInfo);
+        Alias = MergeAliasResults(ThisAlias, Alias);
+        if (Alias == MayAlias)
+          break;
+      }
+
+      // Reset if speculation failed.
+      if (Alias != NoAlias)
+        AliasCache[Locs] = OrigAliasResult;
+
+      return Alias;
+    }
+
+  SmallPtrSet<Value*, 4> UniqueSrc;
+  SmallVector<Value*, 4> V1Srcs;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    Value *PV1 = PN->getIncomingValue(i);
+    if (isa<PHINode>(PV1))
+      // If any of the source itself is a PHI, return MayAlias conservatively
+      // to avoid compile time explosion. The worst possible case is if both
+      // sides are PHI nodes. In which case, this is O(m x n) time where 'm'
+      // and 'n' are the number of PHI sources.
+      return MayAlias;
+    if (UniqueSrc.insert(PV1))
+      V1Srcs.push_back(PV1);
+  }
+
+  AliasResult Alias = aliasCheck(V2, V2Size, V2TBAAInfo,
+                                 V1Srcs[0], PNSize, PNTBAAInfo);
+  // Early exit if the check of the first PHI source against V2 is MayAlias.
+  // Other results are not possible.
+  if (Alias == MayAlias)
+    return MayAlias;
+
+  // If all sources of the PHI node NoAlias or MustAlias V2, then returns
+  // NoAlias / MustAlias. Otherwise, returns MayAlias.
+  for (unsigned i = 1, e = V1Srcs.size(); i != e; ++i) {
+    Value *V = V1Srcs[i];
+
+    AliasResult ThisAlias = aliasCheck(V2, V2Size, V2TBAAInfo,
+                                       V, PNSize, PNTBAAInfo);
+    Alias = MergeAliasResults(ThisAlias, Alias);
+    if (Alias == MayAlias)
+      break;
+  }
+
+  return Alias;
+}
+
+// aliasCheck - Provide a bunch of ad-hoc rules to disambiguate in common cases,
+// such as array references.
+//
+AliasAnalysis::AliasResult
+BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
+                               const MDNode *V1TBAAInfo,
+                               const Value *V2, uint64_t V2Size,
+                               const MDNode *V2TBAAInfo) {
+  // If either of the memory references is empty, it doesn't matter what the
+  // pointer values are.
+  if (V1Size == 0 || V2Size == 0)
+    return NoAlias;
+
+  // Strip off any casts if they exist.
+  V1 = V1->stripPointerCasts();
+  V2 = V2->stripPointerCasts();
+
+  // Are we checking for alias of the same value?
+  if (V1 == V2) return MustAlias;
+
+  if (!V1->getType()->isPointerTy() || !V2->getType()->isPointerTy())
+    return NoAlias;  // Scalars cannot alias each other
+
+  // Figure out what objects these things are pointing to if we can.
+  const Value *O1 = GetUnderlyingObject(V1, TD);
+  const Value *O2 = GetUnderlyingObject(V2, TD);
+
+  // Null values in the default address space don't point to any object, so they
+  // don't alias any other pointer.
+  if (const ConstantPointerNull *CPN = dyn_cast<ConstantPointerNull>(O1))
+    if (CPN->getType()->getAddressSpace() == 0)
+      return NoAlias;
+  if (const ConstantPointerNull *CPN = dyn_cast<ConstantPointerNull>(O2))
+    if (CPN->getType()->getAddressSpace() == 0)
+      return NoAlias;
+
+  if (O1 != O2) {
+    // If V1/V2 point to two different objects we know that we have no alias.
+    if (isIdentifiedObject(O1) && isIdentifiedObject(O2))
+      return NoAlias;
+
+    // Constant pointers can't alias with non-const isIdentifiedObject objects.
+    if ((isa<Constant>(O1) && isIdentifiedObject(O2) && !isa<Constant>(O2)) ||
+        (isa<Constant>(O2) && isIdentifiedObject(O1) && !isa<Constant>(O1)))
+      return NoAlias;
+
+    // Arguments can't alias with local allocations or noalias calls
+    // in the same function.
+    if (((isa<Argument>(O1) && (isa<AllocaInst>(O2) || isNoAliasCall(O2))) ||
+         (isa<Argument>(O2) && (isa<AllocaInst>(O1) || isNoAliasCall(O1)))))
+      return NoAlias;
+
+    // Most objects can't alias null.
+    if ((isa<ConstantPointerNull>(O2) && isKnownNonNull(O1)) ||
+        (isa<ConstantPointerNull>(O1) && isKnownNonNull(O2)))
+      return NoAlias;
+  
+    // If one pointer is the result of a call/invoke or load and the other is a
+    // non-escaping local object within the same function, then we know the
+    // object couldn't escape to a point where the call could return it.
+    //
+    // Note that if the pointers are in different functions, there are a
+    // variety of complications. A call with a nocapture argument may still
+    // temporary store the nocapture argument's value in a temporary memory
+    // location if that memory location doesn't escape. Or it may pass a
+    // nocapture value to other functions as long as they don't capture it.
+    if (isEscapeSource(O1) && isNonEscapingLocalObject(O2))
+      return NoAlias;
+    if (isEscapeSource(O2) && isNonEscapingLocalObject(O1))
+      return NoAlias;
+  }
+
+  // If the size of one access is larger than the entire object on the other
+  // side, then we know such behavior is undefined and can assume no alias.
+  if (TD)
+    if ((V1Size != UnknownSize && isObjectSmallerThan(O2, V1Size, *TD, *TLI)) ||
+        (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD, *TLI)))
+      return NoAlias;
+  
+  // Check the cache before climbing up use-def chains. This also terminates
+  // otherwise infinitely recursive queries.
+  LocPair Locs(Location(V1, V1Size, V1TBAAInfo),
+               Location(V2, V2Size, V2TBAAInfo));
+  if (V1 > V2)
+    std::swap(Locs.first, Locs.second);
+  std::pair<AliasCacheTy::iterator, bool> Pair =
+    AliasCache.insert(std::make_pair(Locs, MayAlias));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  // FIXME: This isn't aggressively handling alias(GEP, PHI) for example: if the
+  // GEP can't simplify, we don't even look at the PHI cases.
+  if (!isa<GEPOperator>(V1) && isa<GEPOperator>(V2)) {
+    std::swap(V1, V2);
+    std::swap(V1Size, V2Size);
+    std::swap(O1, O2);
+    std::swap(V1TBAAInfo, V2TBAAInfo);
+  }
+  if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) {
+    AliasResult Result = aliasGEP(GV1, V1Size, V1TBAAInfo, V2, V2Size, V2TBAAInfo, O1, O2);
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
+  }
+
+  if (isa<PHINode>(V2) && !isa<PHINode>(V1)) {
+    std::swap(V1, V2);
+    std::swap(V1Size, V2Size);
+    std::swap(V1TBAAInfo, V2TBAAInfo);
+  }
+  if (const PHINode *PN = dyn_cast<PHINode>(V1)) {
+    AliasResult Result = aliasPHI(PN, V1Size, V1TBAAInfo,
+                                  V2, V2Size, V2TBAAInfo);
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
+  }
+
+  if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) {
+    std::swap(V1, V2);
+    std::swap(V1Size, V2Size);
+    std::swap(V1TBAAInfo, V2TBAAInfo);
+  }
+  if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) {
+    AliasResult Result = aliasSelect(S1, V1Size, V1TBAAInfo,
+                                     V2, V2Size, V2TBAAInfo);
+    if (Result != MayAlias) return AliasCache[Locs] = Result;
+  }
+
+  // If both pointers are pointing into the same object and one of them
+  // accesses is accessing the entire object, then the accesses must
+  // overlap in some way.
+  if (TD && O1 == O2)
+    if ((V1Size != UnknownSize && isObjectSize(O1, V1Size, *TD, *TLI)) ||
+        (V2Size != UnknownSize && isObjectSize(O2, V2Size, *TD, *TLI)))
+      return AliasCache[Locs] = PartialAlias;
+
+  AliasResult Result =
+    AliasAnalysis::alias(Location(V1, V1Size, V1TBAAInfo),
+                         Location(V2, V2Size, V2TBAAInfo));
+  return AliasCache[Locs] = Result;
+}
diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp
new file mode 100644
index 000000000000..100e5c8ae7dd
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp
@@ -0,0 +1,63 @@
+//=======-------- BlockFrequencyInfo.cpp - Block Frequency Analysis -------=======//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loops should be simplified before this analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
+                      true, true)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
+INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
+                    true, true)
+
+char BlockFrequencyInfo::ID = 0;
+
+
+BlockFrequencyInfo::BlockFrequencyInfo() : FunctionPass(ID) {
+  initializeBlockFrequencyInfoPass(*PassRegistry::getPassRegistry());
+  BFI = new BlockFrequencyImpl<BasicBlock, Function, BranchProbabilityInfo>();
+}
+
+BlockFrequencyInfo::~BlockFrequencyInfo() {
+  delete BFI;
+}
+
+void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<BranchProbabilityInfo>();
+  AU.setPreservesAll();
+}
+
+bool BlockFrequencyInfo::runOnFunction(Function &F) {
+  BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
+  BFI->doFunction(&F, &BPI);
+  return false;
+}
+
+void BlockFrequencyInfo::print(raw_ostream &O, const Module *) const {
+  if (BFI) BFI->print(O);
+}
+
+/// getblockFreq - Return block frequency. Return 0 if we don't have the
+/// information. Please note that initial frequency is equal to 1024. It means
+/// that we should not rely on the value itself, but only on the comparison to
+/// the other block frequencies. We do this to avoid using of floating points.
+///
+BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const {
+  return BFI->getBlockFreq(BB);
+}
diff --git a/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp
new file mode 100644
index 000000000000..6c5885601fa3
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -0,0 +1,558 @@
+//===-- BranchProbabilityInfo.cpp - Branch Probability Analysis -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loops should be simplified before this analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(BranchProbabilityInfo, "branch-prob",
+                      "Branch Probability Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(BranchProbabilityInfo, "branch-prob",
+                    "Branch Probability Analysis", false, true)
+
+char BranchProbabilityInfo::ID = 0;
+
+// Weights are for internal use only. They are used by heuristics to help to
+// estimate edges' probability. Example:
+//
+// Using "Loop Branch Heuristics" we predict weights of edges for the
+// block BB2.
+//         ...
+//          |
+//          V
+//         BB1<-+
+//          |   |
+//          |   | (Weight = 124)
+//          V   |
+//         BB2--+
+//          |
+//          | (Weight = 4)
+//          V
+//         BB3
+//
+// Probability of the edge BB2->BB1 = 124 / (124 + 4) = 0.96875
+// Probability of the edge BB2->BB3 = 4 / (124 + 4) = 0.03125
+static const uint32_t LBH_TAKEN_WEIGHT = 124;
+static const uint32_t LBH_NONTAKEN_WEIGHT = 4;
+
+/// \brief Unreachable-terminating branch taken weight.
+///
+/// This is the weight for a branch being taken to a block that terminates
+/// (eventually) in unreachable. These are predicted as unlikely as possible.
+static const uint32_t UR_TAKEN_WEIGHT = 1;
+
+/// \brief Unreachable-terminating branch not-taken weight.
+///
+/// This is the weight for a branch not being taken toward a block that
+/// terminates (eventually) in unreachable. Such a branch is essentially never
+/// taken. Set the weight to an absurdly high value so that nested loops don't
+/// easily subsume it.
+static const uint32_t UR_NONTAKEN_WEIGHT = 1024*1024 - 1;
+
+static const uint32_t PH_TAKEN_WEIGHT = 20;
+static const uint32_t PH_NONTAKEN_WEIGHT = 12;
+
+static const uint32_t ZH_TAKEN_WEIGHT = 20;
+static const uint32_t ZH_NONTAKEN_WEIGHT = 12;
+
+static const uint32_t FPH_TAKEN_WEIGHT = 20;
+static const uint32_t FPH_NONTAKEN_WEIGHT = 12;
+
+/// \brief Invoke-terminating normal branch taken weight
+///
+/// This is the weight for branching to the normal destination of an invoke
+/// instruction. We expect this to happen most of the time. Set the weight to an
+/// absurdly high value so that nested loops subsume it.
+static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1;
+
+/// \brief Invoke-terminating normal branch not-taken weight.
+///
+/// This is the weight for branching to the unwind destination of an invoke
+/// instruction. This is essentially never taken.
+static const uint32_t IH_NONTAKEN_WEIGHT = 1;
+
+// Standard weight value. Used when none of the heuristics set weight for
+// the edge.
+static const uint32_t NORMAL_WEIGHT = 16;
+
+// Minimum weight of an edge. Please note, that weight is NEVER 0.
+static const uint32_t MIN_WEIGHT = 1;
+
+static uint32_t getMaxWeightFor(BasicBlock *BB) {
+  return UINT32_MAX / BB->getTerminator()->getNumSuccessors();
+}
+
+
+/// \brief Calculate edge weights for successors lead to unreachable.
+///
+/// Predict that a successor which leads necessarily to an
+/// unreachable-terminated block as extremely unlikely.
+bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) {
+  TerminatorInst *TI = BB->getTerminator();
+  if (TI->getNumSuccessors() == 0) {
+    if (isa<UnreachableInst>(TI))
+      PostDominatedByUnreachable.insert(BB);
+    return false;
+  }
+
+  SmallVector<unsigned, 4> UnreachableEdges;
+  SmallVector<unsigned, 4> ReachableEdges;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    if (PostDominatedByUnreachable.count(*I))
+      UnreachableEdges.push_back(I.getSuccessorIndex());
+    else
+      ReachableEdges.push_back(I.getSuccessorIndex());
+  }
+
+  // If all successors are in the set of blocks post-dominated by unreachable,
+  // this block is too.
+  if (UnreachableEdges.size() == TI->getNumSuccessors())
+    PostDominatedByUnreachable.insert(BB);
+
+  // Skip probabilities if this block has a single successor or if all were
+  // reachable.
+  if (TI->getNumSuccessors() == 1 || UnreachableEdges.empty())
+    return false;
+
+  uint32_t UnreachableWeight =
+    std::max(UR_TAKEN_WEIGHT / (unsigned)UnreachableEdges.size(), MIN_WEIGHT);
+  for (SmallVector<unsigned, 4>::iterator I = UnreachableEdges.begin(),
+                                          E = UnreachableEdges.end();
+       I != E; ++I)
+    setEdgeWeight(BB, *I, UnreachableWeight);
+
+  if (ReachableEdges.empty())
+    return true;
+  uint32_t ReachableWeight =
+    std::max(UR_NONTAKEN_WEIGHT / (unsigned)ReachableEdges.size(),
+             NORMAL_WEIGHT);
+  for (SmallVector<unsigned, 4>::iterator I = ReachableEdges.begin(),
+                                          E = ReachableEdges.end();
+       I != E; ++I)
+    setEdgeWeight(BB, *I, ReachableWeight);
+
+  return true;
+}
+
+// Propagate existing explicit probabilities from either profile data or
+// 'expect' intrinsic processing.
+bool BranchProbabilityInfo::calcMetadataWeights(BasicBlock *BB) {
+  TerminatorInst *TI = BB->getTerminator();
+  if (TI->getNumSuccessors() == 1)
+    return false;
+  if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
+    return false;
+
+  MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
+  if (!WeightsNode)
+    return false;
+
+  // Ensure there are weights for all of the successors. Note that the first
+  // operand to the metadata node is a name, not a weight.
+  if (WeightsNode->getNumOperands() != TI->getNumSuccessors() + 1)
+    return false;
+
+  // Build up the final weights that will be used in a temporary buffer, but
+  // don't add them until all weihts are present. Each weight value is clamped
+  // to [1, getMaxWeightFor(BB)].
+  uint32_t WeightLimit = getMaxWeightFor(BB);
+  SmallVector<uint32_t, 2> Weights;
+  Weights.reserve(TI->getNumSuccessors());
+  for (unsigned i = 1, e = WeightsNode->getNumOperands(); i != e; ++i) {
+    ConstantInt *Weight = dyn_cast<ConstantInt>(WeightsNode->getOperand(i));
+    if (!Weight)
+      return false;
+    Weights.push_back(
+      std::max<uint32_t>(1, Weight->getLimitedValue(WeightLimit)));
+  }
+  assert(Weights.size() == TI->getNumSuccessors() && "Checked above");
+  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+    setEdgeWeight(BB, i, Weights[i]);
+
+  return true;
+}
+
+// Calculate Edge Weights using "Pointer Heuristics". Predict a comparsion
+// between two pointer or pointer and NULL will fail.
+bool BranchProbabilityInfo::calcPointerHeuristics(BasicBlock *BB) {
+  BranchInst * BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+
+  Value *Cond = BI->getCondition();
+  ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+  if (!CI || !CI->isEquality())
+    return false;
+
+  Value *LHS = CI->getOperand(0);
+
+  if (!LHS->getType()->isPointerTy())
+    return false;
+
+  assert(CI->getOperand(1)->getType()->isPointerTy());
+
+  // p != 0   ->   isProb = true
+  // p == 0   ->   isProb = false
+  // p != q   ->   isProb = true
+  // p == q   ->   isProb = false;
+  unsigned TakenIdx = 0, NonTakenIdx = 1;
+  bool isProb = CI->getPredicate() == ICmpInst::ICMP_NE;
+  if (!isProb)
+    std::swap(TakenIdx, NonTakenIdx);
+
+  setEdgeWeight(BB, TakenIdx, PH_TAKEN_WEIGHT);
+  setEdgeWeight(BB, NonTakenIdx, PH_NONTAKEN_WEIGHT);
+  return true;
+}
+
+// Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges
+// as taken, exiting edges as not-taken.
+bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) {
+  Loop *L = LI->getLoopFor(BB);
+  if (!L)
+    return false;
+
+  SmallVector<unsigned, 8> BackEdges;
+  SmallVector<unsigned, 8> ExitingEdges;
+  SmallVector<unsigned, 8> InEdges; // Edges from header to the loop.
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    if (!L->contains(*I))
+      ExitingEdges.push_back(I.getSuccessorIndex());
+    else if (L->getHeader() == *I)
+      BackEdges.push_back(I.getSuccessorIndex());
+    else
+      InEdges.push_back(I.getSuccessorIndex());
+  }
+
+  if (uint32_t numBackEdges = BackEdges.size()) {
+    uint32_t backWeight = LBH_TAKEN_WEIGHT / numBackEdges;
+    if (backWeight < NORMAL_WEIGHT)
+      backWeight = NORMAL_WEIGHT;
+
+    for (SmallVector<unsigned, 8>::iterator EI = BackEdges.begin(),
+         EE = BackEdges.end(); EI != EE; ++EI) {
+      setEdgeWeight(BB, *EI, backWeight);
+    }
+  }
+
+  if (uint32_t numInEdges = InEdges.size()) {
+    uint32_t inWeight = LBH_TAKEN_WEIGHT / numInEdges;
+    if (inWeight < NORMAL_WEIGHT)
+      inWeight = NORMAL_WEIGHT;
+
+    for (SmallVector<unsigned, 8>::iterator EI = InEdges.begin(),
+         EE = InEdges.end(); EI != EE; ++EI) {
+      setEdgeWeight(BB, *EI, inWeight);
+    }
+  }
+
+  if (uint32_t numExitingEdges = ExitingEdges.size()) {
+    uint32_t exitWeight = LBH_NONTAKEN_WEIGHT / numExitingEdges;
+    if (exitWeight < MIN_WEIGHT)
+      exitWeight = MIN_WEIGHT;
+
+    for (SmallVector<unsigned, 8>::iterator EI = ExitingEdges.begin(),
+         EE = ExitingEdges.end(); EI != EE; ++EI) {
+      setEdgeWeight(BB, *EI, exitWeight);
+    }
+  }
+
+  return true;
+}
+
+bool BranchProbabilityInfo::calcZeroHeuristics(BasicBlock *BB) {
+  BranchInst * BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+
+  Value *Cond = BI->getCondition();
+  ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+  if (!CI)
+    return false;
+
+  Value *RHS = CI->getOperand(1);
+  ConstantInt *CV = dyn_cast<ConstantInt>(RHS);
+  if (!CV)
+    return false;
+
+  bool isProb;
+  if (CV->isZero()) {
+    switch (CI->getPredicate()) {
+    case CmpInst::ICMP_EQ:
+      // X == 0   ->  Unlikely
+      isProb = false;
+      break;
+    case CmpInst::ICMP_NE:
+      // X != 0   ->  Likely
+      isProb = true;
+      break;
+    case CmpInst::ICMP_SLT:
+      // X < 0   ->  Unlikely
+      isProb = false;
+      break;
+    case CmpInst::ICMP_SGT:
+      // X > 0   ->  Likely
+      isProb = true;
+      break;
+    default:
+      return false;
+    }
+  } else if (CV->isOne() && CI->getPredicate() == CmpInst::ICMP_SLT) {
+    // InstCombine canonicalizes X <= 0 into X < 1.
+    // X <= 0   ->  Unlikely
+    isProb = false;
+  } else if (CV->isAllOnesValue() && CI->getPredicate() == CmpInst::ICMP_SGT) {
+    // InstCombine canonicalizes X >= 0 into X > -1.
+    // X >= 0   ->  Likely
+    isProb = true;
+  } else {
+    return false;
+  }
+
+  unsigned TakenIdx = 0, NonTakenIdx = 1;
+
+  if (!isProb)
+    std::swap(TakenIdx, NonTakenIdx);
+
+  setEdgeWeight(BB, TakenIdx, ZH_TAKEN_WEIGHT);
+  setEdgeWeight(BB, NonTakenIdx, ZH_NONTAKEN_WEIGHT);
+
+  return true;
+}
+
+bool BranchProbabilityInfo::calcFloatingPointHeuristics(BasicBlock *BB) {
+  BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+
+  Value *Cond = BI->getCondition();
+  FCmpInst *FCmp = dyn_cast<FCmpInst>(Cond);
+  if (!FCmp)
+    return false;
+
+  bool isProb;
+  if (FCmp->isEquality()) {
+    // f1 == f2 -> Unlikely
+    // f1 != f2 -> Likely
+    isProb = !FCmp->isTrueWhenEqual();
+  } else if (FCmp->getPredicate() == FCmpInst::FCMP_ORD) {
+    // !isnan -> Likely
+    isProb = true;
+  } else if (FCmp->getPredicate() == FCmpInst::FCMP_UNO) {
+    // isnan -> Unlikely
+    isProb = false;
+  } else {
+    return false;
+  }
+
+  unsigned TakenIdx = 0, NonTakenIdx = 1;
+
+  if (!isProb)
+    std::swap(TakenIdx, NonTakenIdx);
+
+  setEdgeWeight(BB, TakenIdx, FPH_TAKEN_WEIGHT);
+  setEdgeWeight(BB, NonTakenIdx, FPH_NONTAKEN_WEIGHT);
+
+  return true;
+}
+
+bool BranchProbabilityInfo::calcInvokeHeuristics(BasicBlock *BB) {
+  InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator());
+  if (!II)
+    return false;
+
+  setEdgeWeight(BB, 0/*Index for Normal*/, IH_TAKEN_WEIGHT);
+  setEdgeWeight(BB, 1/*Index for Unwind*/, IH_NONTAKEN_WEIGHT);
+  return true;
+}
+
+void BranchProbabilityInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<LoopInfo>();
+  AU.setPreservesAll();
+}
+
+bool BranchProbabilityInfo::runOnFunction(Function &F) {
+  LastF = &F; // Store the last function we ran on for printing.
+  LI = &getAnalysis<LoopInfo>();
+  assert(PostDominatedByUnreachable.empty());
+
+  // Walk the basic blocks in post-order so that we can build up state about
+  // the successors of a block iteratively.
+  for (po_iterator<BasicBlock *> I = po_begin(&F.getEntryBlock()),
+                                 E = po_end(&F.getEntryBlock());
+       I != E; ++I) {
+    DEBUG(dbgs() << "Computing probabilities for " << I->getName() << "\n");
+    if (calcUnreachableHeuristics(*I))
+      continue;
+    if (calcMetadataWeights(*I))
+      continue;
+    if (calcLoopBranchHeuristics(*I))
+      continue;
+    if (calcPointerHeuristics(*I))
+      continue;
+    if (calcZeroHeuristics(*I))
+      continue;
+    if (calcFloatingPointHeuristics(*I))
+      continue;
+    calcInvokeHeuristics(*I);
+  }
+
+  PostDominatedByUnreachable.clear();
+  return false;
+}
+
+void BranchProbabilityInfo::print(raw_ostream &OS, const Module *) const {
+  OS << "---- Branch Probabilities ----\n";
+  // We print the probabilities from the last function the analysis ran over,
+  // or the function it is currently running over.
+  assert(LastF && "Cannot print prior to running over a function");
+  for (Function::const_iterator BI = LastF->begin(), BE = LastF->end();
+       BI != BE; ++BI) {
+    for (succ_const_iterator SI = succ_begin(BI), SE = succ_end(BI);
+         SI != SE; ++SI) {
+      printEdgeProbability(OS << "  ", BI, *SI);
+    }
+  }
+}
+
+uint32_t BranchProbabilityInfo::getSumForBlock(const BasicBlock *BB) const {
+  uint32_t Sum = 0;
+
+  for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    uint32_t Weight = getEdgeWeight(BB, I.getSuccessorIndex());
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+  }
+
+  return Sum;
+}
+
+bool BranchProbabilityInfo::
+isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const {
+  // Hot probability is at least 4/5 = 80%
+  // FIXME: Compare against a static "hot" BranchProbability.
+  return getEdgeProbability(Src, Dst) > BranchProbability(4, 5);
+}
+
+BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const {
+  uint32_t Sum = 0;
+  uint32_t MaxWeight = 0;
+  BasicBlock *MaxSucc = 0;
+
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    BasicBlock *Succ = *I;
+    uint32_t Weight = getEdgeWeight(BB, Succ);
+    uint32_t PrevSum = Sum;
+
+    Sum += Weight;
+    assert(Sum > PrevSum); (void) PrevSum;
+
+    if (Weight > MaxWeight) {
+      MaxWeight = Weight;
+      MaxSucc = Succ;
+    }
+  }
+
+  // Hot probability is at least 4/5 = 80%
+  if (BranchProbability(MaxWeight, Sum) > BranchProbability(4, 5))
+    return MaxSucc;
+
+  return 0;
+}
+
+/// Get the raw edge weight for the edge. If can't find it, return
+/// DEFAULT_WEIGHT value. Here an edge is specified using PredBlock and an index
+/// to the successors.
+uint32_t BranchProbabilityInfo::
+getEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors) const {
+  DenseMap<Edge, uint32_t>::const_iterator I =
+      Weights.find(std::make_pair(Src, IndexInSuccessors));
+
+  if (I != Weights.end())
+    return I->second;
+
+  return DEFAULT_WEIGHT;
+}
+
+/// Get the raw edge weight calculated for the block pair. This returns the sum
+/// of all raw edge weights from Src to Dst.
+uint32_t BranchProbabilityInfo::
+getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const {
+  uint32_t Weight = 0;
+  DenseMap<Edge, uint32_t>::const_iterator MapI;
+  for (succ_const_iterator I = succ_begin(Src), E = succ_end(Src); I != E; ++I)
+    if (*I == Dst) {
+      MapI = Weights.find(std::make_pair(Src, I.getSuccessorIndex()));
+      if (MapI != Weights.end())
+        Weight += MapI->second;
+    }
+  return (Weight == 0) ? DEFAULT_WEIGHT : Weight;
+}
+
+/// Set the edge weight for a given edge specified by PredBlock and an index
+/// to the successors.
+void BranchProbabilityInfo::
+setEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors,
+              uint32_t Weight) {
+  Weights[std::make_pair(Src, IndexInSuccessors)] = Weight;
+  DEBUG(dbgs() << "set edge " << Src->getName() << " -> "
+               << IndexInSuccessors << " successor weight to "
+               << Weight << "\n");
+}
+
+/// Get an edge's probability, relative to other out-edges from Src.
+BranchProbability BranchProbabilityInfo::
+getEdgeProbability(const BasicBlock *Src, unsigned IndexInSuccessors) const {
+  uint32_t N = getEdgeWeight(Src, IndexInSuccessors);
+  uint32_t D = getSumForBlock(Src);
+
+  return BranchProbability(N, D);
+}
+
+/// Get the probability of going from Src to Dst. It returns the sum of all
+/// probabilities for edges from Src to Dst.
+BranchProbability BranchProbabilityInfo::
+getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const {
+
+  uint32_t N = getEdgeWeight(Src, Dst);
+  uint32_t D = getSumForBlock(Src);
+
+  return BranchProbability(N, D);
+}
+
+raw_ostream &
+BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS,
+                                            const BasicBlock *Src,
+                                            const BasicBlock *Dst) const {
+
+  const BranchProbability Prob = getEdgeProbability(Src, Dst);
+  OS << "edge " << Src->getName() << " -> " << Dst->getName()
+     << " probability is " << Prob
+     << (isEdgeHot(Src, Dst) ? " [HOT edge]\n" : "\n");
+
+  return OS;
+}
diff --git a/contrib/llvm/lib/Analysis/CFGPrinter.cpp b/contrib/llvm/lib/Analysis/CFGPrinter.cpp
new file mode 100644
index 000000000000..9b6879a42ed4
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/CFGPrinter.cpp
@@ -0,0 +1,164 @@
+//===- CFGPrinter.cpp - DOT printer for the control flow graph ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a '-dot-cfg' analysis pass, which emits the
+// cfg.<fnname>.dot file for each function in the program, with a graph of the
+// CFG for that function.
+//
+// The other main feature of this file is that it implements the
+// Function::viewCFG method, which is useful for debugging passes which operate
+// on the CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+namespace {
+  struct CFGViewer : public FunctionPass {
+    static char ID; // Pass identifcation, replacement for typeid
+    CFGViewer() : FunctionPass(ID) {
+      initializeCFGOnlyViewerPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F) {
+      F.viewCFG();
+      return false;
+    }
+
+    void print(raw_ostream &OS, const Module* = 0) const {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+}
+
+char CFGViewer::ID = 0;
+INITIALIZE_PASS(CFGViewer, "view-cfg", "View CFG of function", false, true)
+
+namespace {
+  struct CFGOnlyViewer : public FunctionPass {
+    static char ID; // Pass identifcation, replacement for typeid
+    CFGOnlyViewer() : FunctionPass(ID) {
+      initializeCFGOnlyViewerPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F) {
+      F.viewCFGOnly();
+      return false;
+    }
+
+    void print(raw_ostream &OS, const Module* = 0) const {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+}
+
+char CFGOnlyViewer::ID = 0;
+INITIALIZE_PASS(CFGOnlyViewer, "view-cfg-only",
+                "View CFG of function (with no function bodies)", false, true)
+
+namespace {
+  struct CFGPrinter : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    CFGPrinter() : FunctionPass(ID) {
+      initializeCFGPrinterPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F) {
+      std::string Filename = "cfg." + F.getName().str() + ".dot";
+      errs() << "Writing '" << Filename << "'...";
+      
+      std::string ErrorInfo;
+      raw_fd_ostream File(Filename.c_str(), ErrorInfo);
+
+      if (ErrorInfo.empty())
+        WriteGraph(File, (const Function*)&F);
+      else
+        errs() << "  error opening file for writing!";
+      errs() << "\n";
+      return false;
+    }
+
+    void print(raw_ostream &OS, const Module* = 0) const {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+}
+
+char CFGPrinter::ID = 0;
+INITIALIZE_PASS(CFGPrinter, "dot-cfg", "Print CFG of function to 'dot' file", 
+                false, true)
+
+namespace {
+  struct CFGOnlyPrinter : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    CFGOnlyPrinter() : FunctionPass(ID) {
+      initializeCFGOnlyPrinterPass(*PassRegistry::getPassRegistry());
+    }
+    
+    virtual bool runOnFunction(Function &F) {
+      std::string Filename = "cfg." + F.getName().str() + ".dot";
+      errs() << "Writing '" << Filename << "'...";
+
+      std::string ErrorInfo;
+      raw_fd_ostream File(Filename.c_str(), ErrorInfo);
+      
+      if (ErrorInfo.empty())
+        WriteGraph(File, (const Function*)&F, true);
+      else
+        errs() << "  error opening file for writing!";
+      errs() << "\n";
+      return false;
+    }
+    void print(raw_ostream &OS, const Module* = 0) const {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+}
+
+char CFGOnlyPrinter::ID = 0;
+INITIALIZE_PASS(CFGOnlyPrinter, "dot-cfg-only",
+   "Print CFG of function to 'dot' file (with no function bodies)",
+   false, true)
+
+/// viewCFG - This function is meant for use from the debugger.  You can just
+/// say 'call F->viewCFG()' and a ghostview window should pop up from the
+/// program, displaying the CFG of the current function.  This depends on there
+/// being a 'dot' and 'gv' program in your path.
+///
+void Function::viewCFG() const {
+  ViewGraph(this, "cfg" + getName());
+}
+
+/// viewCFGOnly - This function is meant for use from the debugger.  It works
+/// just like viewCFG, but it does not include the contents of basic blocks
+/// into the nodes, just the label.  If you are only interested in the CFG t
+/// his can make the graph smaller.
+///
+void Function::viewCFGOnly() const {
+  ViewGraph(this, "cfg" + getName(), true);
+}
+
+FunctionPass *llvm::createCFGPrinterPass () {
+  return new CFGPrinter();
+}
+
+FunctionPass *llvm::createCFGOnlyPrinterPass () {
+  return new CFGOnlyPrinter();
+}
+
diff --git a/contrib/llvm/lib/Analysis/CaptureTracking.cpp b/contrib/llvm/lib/Analysis/CaptureTracking.cpp
new file mode 100644
index 000000000000..a7292706dfa8
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/CaptureTracking.cpp
@@ -0,0 +1,180 @@
+//===--- CaptureTracking.cpp - Determine whether a pointer is captured ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains routines that help determine which pointers are captured.
+// A pointer value is captured if the function makes a copy of any part of the
+// pointer that outlives the call.  Not being captured means, more or less, that
+// the pointer is only dereferenced and not stored in a global.  Returning part
+// of the pointer as the function return value may or may not count as capturing
+// the pointer, depending on the context.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CallSite.h"
+
+using namespace llvm;
+
+CaptureTracker::~CaptureTracker() {}
+
+bool CaptureTracker::shouldExplore(Use *U) { return true; }
+
+namespace {
+  struct SimpleCaptureTracker : public CaptureTracker {
+    explicit SimpleCaptureTracker(bool ReturnCaptures)
+      : ReturnCaptures(ReturnCaptures), Captured(false) {}
+
+    void tooManyUses() { Captured = true; }
+
+    bool captured(Use *U) {
+      if (isa<ReturnInst>(U->getUser()) && !ReturnCaptures)
+        return false;
+
+      Captured = true;
+      return true;
+    }
+
+    bool ReturnCaptures;
+
+    bool Captured;
+  };
+}
+
+/// PointerMayBeCaptured - Return true if this pointer value may be captured
+/// by the enclosing function (which is required to exist).  This routine can
+/// be expensive, so consider caching the results.  The boolean ReturnCaptures
+/// specifies whether returning the value (or part of it) from the function
+/// counts as capturing it or not.  The boolean StoreCaptures specified whether
+/// storing the value (or part of it) into memory anywhere automatically
+/// counts as capturing it or not.
+bool llvm::PointerMayBeCaptured(const Value *V,
+                                bool ReturnCaptures, bool StoreCaptures) {
+  assert(!isa<GlobalValue>(V) &&
+         "It doesn't make sense to ask whether a global is captured.");
+
+  // TODO: If StoreCaptures is not true, we could do Fancy analysis
+  // to determine whether this store is not actually an escape point.
+  // In that case, BasicAliasAnalysis should be updated as well to
+  // take advantage of this.
+  (void)StoreCaptures;
+
+  SimpleCaptureTracker SCT(ReturnCaptures);
+  PointerMayBeCaptured(V, &SCT);
+  return SCT.Captured;
+}
+
+/// TODO: Write a new FunctionPass AliasAnalysis so that it can keep
+/// a cache. Then we can move the code from BasicAliasAnalysis into
+/// that path, and remove this threshold.
+static int const Threshold = 20;
+
+void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
+  assert(V->getType()->isPointerTy() && "Capture is for pointers only!");
+  SmallVector<Use*, Threshold> Worklist;
+  SmallSet<Use*, Threshold> Visited;
+  int Count = 0;
+
+  for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
+       UI != UE; ++UI) {
+    // If there are lots of uses, conservatively say that the value
+    // is captured to avoid taking too much compile time.
+    if (Count++ >= Threshold)
+      return Tracker->tooManyUses();
+
+    Use *U = &UI.getUse();
+    if (!Tracker->shouldExplore(U)) continue;
+    Visited.insert(U);
+    Worklist.push_back(U);
+  }
+
+  while (!Worklist.empty()) {
+    Use *U = Worklist.pop_back_val();
+    Instruction *I = cast<Instruction>(U->getUser());
+    V = U->get();
+
+    switch (I->getOpcode()) {
+    case Instruction::Call:
+    case Instruction::Invoke: {
+      CallSite CS(I);
+      // Not captured if the callee is readonly, doesn't return a copy through
+      // its return value and doesn't unwind (a readonly function can leak bits
+      // by throwing an exception or not depending on the input value).
+      if (CS.onlyReadsMemory() && CS.doesNotThrow() && I->getType()->isVoidTy())
+        break;
+
+      // Not captured if only passed via 'nocapture' arguments.  Note that
+      // calling a function pointer does not in itself cause the pointer to
+      // be captured.  This is a subtle point considering that (for example)
+      // the callee might return its own address.  It is analogous to saying
+      // that loading a value from a pointer does not cause the pointer to be
+      // captured, even though the loaded value might be the pointer itself
+      // (think of self-referential objects).
+      CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end();
+      for (CallSite::arg_iterator A = B; A != E; ++A)
+        if (A->get() == V && !CS.doesNotCapture(A - B))
+          // The parameter is not marked 'nocapture' - captured.
+          if (Tracker->captured(U))
+            return;
+      break;
+    }
+    case Instruction::Load:
+      // Loading from a pointer does not cause it to be captured.
+      break;
+    case Instruction::VAArg:
+      // "va-arg" from a pointer does not cause it to be captured.
+      break;
+    case Instruction::Store:
+      if (V == I->getOperand(0))
+        // Stored the pointer - conservatively assume it may be captured.
+        if (Tracker->captured(U))
+          return;
+      // Storing to the pointee does not cause the pointer to be captured.
+      break;
+    case Instruction::BitCast:
+    case Instruction::GetElementPtr:
+    case Instruction::PHI:
+    case Instruction::Select:
+      // The original value is not captured via this if the new value isn't.
+      for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end();
+           UI != UE; ++UI) {
+        Use *U = &UI.getUse();
+        if (Visited.insert(U))
+          if (Tracker->shouldExplore(U))
+            Worklist.push_back(U);
+      }
+      break;
+    case Instruction::ICmp:
+      // Don't count comparisons of a no-alias return value against null as
+      // captures. This allows us to ignore comparisons of malloc results
+      // with null, for example.
+      if (isNoAliasCall(V->stripPointerCasts()))
+        if (ConstantPointerNull *CPN =
+              dyn_cast<ConstantPointerNull>(I->getOperand(1)))
+          if (CPN->getType()->getAddressSpace() == 0)
+            break;
+      // Otherwise, be conservative. There are crazy ways to capture pointers
+      // using comparisons.
+      if (Tracker->captured(U))
+        return;
+      break;
+    default:
+      // Something else - be conservative and say it is captured.
+      if (Tracker->captured(U))
+        return;
+      break;
+    }
+  }
+
+  // All uses examined.
+}
diff --git a/contrib/llvm/lib/Analysis/CodeMetrics.cpp b/contrib/llvm/lib/Analysis/CodeMetrics.cpp
new file mode 100644
index 000000000000..8cda01a24c0d
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/CodeMetrics.cpp
@@ -0,0 +1,96 @@
+//===- CodeMetrics.cpp - Code cost measurements ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements code cost measurement utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CallSite.h"
+
+using namespace llvm;
+
+/// analyzeBasicBlock - Fill in the current structure with information gleaned
+/// from the specified block.
+void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
+                                    const TargetTransformInfo &TTI) {
+  ++NumBlocks;
+  unsigned NumInstsBeforeThisBB = NumInsts;
+  for (BasicBlock::const_iterator II = BB->begin(), E = BB->end();
+       II != E; ++II) {
+    // Special handling for calls.
+    if (isa<CallInst>(II) || isa<InvokeInst>(II)) {
+      ImmutableCallSite CS(cast<Instruction>(II));
+
+      if (const Function *F = CS.getCalledFunction()) {
+        // If a function is both internal and has a single use, then it is
+        // extremely likely to get inlined in the future (it was probably
+        // exposed by an interleaved devirtualization pass).
+        if (!CS.isNoInline() && F->hasInternalLinkage() && F->hasOneUse())
+          ++NumInlineCandidates;
+
+        // If this call is to function itself, then the function is recursive.
+        // Inlining it into other functions is a bad idea, because this is
+        // basically just a form of loop peeling, and our metrics aren't useful
+        // for that case.
+        if (F == BB->getParent())
+          isRecursive = true;
+
+        if (TTI.isLoweredToCall(F))
+          ++NumCalls;
+      } else {
+        // We don't want inline asm to count as a call - that would prevent loop
+        // unrolling. The argument setup cost is still real, though.
+        if (!isa<InlineAsm>(CS.getCalledValue()))
+          ++NumCalls;
+      }
+    }
+
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+      if (!AI->isStaticAlloca())
+        this->usesDynamicAlloca = true;
+    }
+
+    if (isa<ExtractElementInst>(II) || II->getType()->isVectorTy())
+      ++NumVectorInsts;
+
+    if (const CallInst *CI = dyn_cast<CallInst>(II))
+      if (CI->hasFnAttr(Attribute::NoDuplicate))
+        notDuplicatable = true;
+
+    if (const InvokeInst *InvI = dyn_cast<InvokeInst>(II))
+      if (InvI->hasFnAttr(Attribute::NoDuplicate))
+        notDuplicatable = true;
+
+    NumInsts += TTI.getUserCost(&*II);
+  }
+
+  if (isa<ReturnInst>(BB->getTerminator()))
+    ++NumRets;
+
+  // We never want to inline functions that contain an indirectbr.  This is
+  // incorrect because all the blockaddress's (in static global initializers
+  // for example) would be referring to the original function, and this indirect
+  // jump would jump from the inlined copy of the function into the original
+  // function which is extremely undefined behavior.
+  // FIXME: This logic isn't really right; we can safely inline functions
+  // with indirectbr's as long as no other function or global references the
+  // blockaddress of a block within the current function.  And as a QOI issue,
+  // if someone is using a blockaddress without an indirectbr, and that
+  // reference somehow ends up in another function or global, we probably
+  // don't want to inline this function.
+  notDuplicatable |= isa<IndirectBrInst>(BB->getTerminator());
+
+  // Remember NumInsts for this BB.
+  NumBBInsts[BB] = NumInsts - NumInstsBeforeThisBB;
+}
diff --git a/contrib/llvm/lib/Analysis/ConstantFolding.cpp b/contrib/llvm/lib/Analysis/ConstantFolding.cpp
new file mode 100644
index 000000000000..09d7608c51da
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ConstantFolding.cpp
@@ -0,0 +1,1578 @@
+//===-- ConstantFolding.cpp - Fold instructions into constants ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines routines for folding instructions into constants.
+//
+// Also, to supplement the basic IR ConstantExpr simplifications,
+// this file defines some additional folding routines that can make use of
+// DataLayout information. These functions cannot go in IR due to library
+// dependency issues.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FEnv.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include <cerrno>
+#include <cmath>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Constant Folding internal helper functions
+//===----------------------------------------------------------------------===//
+
+/// FoldBitCast - Constant fold bitcast, symbolically evaluating it with
+/// DataLayout.  This always returns a non-null constant, but it may be a
+/// ConstantExpr if unfoldable.
+static Constant *FoldBitCast(Constant *C, Type *DestTy,
+                             const DataLayout &TD) {
+  // Catch the obvious splat cases.
+  if (C->isNullValue() && !DestTy->isX86_MMXTy())
+    return Constant::getNullValue(DestTy);
+  if (C->isAllOnesValue() && !DestTy->isX86_MMXTy())
+    return Constant::getAllOnesValue(DestTy);
+
+  // Handle a vector->integer cast.
+  if (IntegerType *IT = dyn_cast<IntegerType>(DestTy)) {
+    VectorType *VTy = dyn_cast<VectorType>(C->getType());
+    if (VTy == 0)
+      return ConstantExpr::getBitCast(C, DestTy);
+
+    unsigned NumSrcElts = VTy->getNumElements();
+    Type *SrcEltTy = VTy->getElementType();
+
+    // If the vector is a vector of floating point, convert it to vector of int
+    // to simplify things.
+    if (SrcEltTy->isFloatingPointTy()) {
+      unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
+      Type *SrcIVTy =
+        VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElts);
+      // Ask IR to do the conversion now that #elts line up.
+      C = ConstantExpr::getBitCast(C, SrcIVTy);
+    }
+
+    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(C);
+    if (CDV == 0)
+      return ConstantExpr::getBitCast(C, DestTy);
+
+    // Now that we know that the input value is a vector of integers, just shift
+    // and insert them into our result.
+    unsigned BitShift = TD.getTypeAllocSizeInBits(SrcEltTy);
+    APInt Result(IT->getBitWidth(), 0);
+    for (unsigned i = 0; i != NumSrcElts; ++i) {
+      Result <<= BitShift;
+      if (TD.isLittleEndian())
+        Result |= CDV->getElementAsInteger(NumSrcElts-i-1);
+      else
+        Result |= CDV->getElementAsInteger(i);
+    }
+
+    return ConstantInt::get(IT, Result);
+  }
+
+  // The code below only handles casts to vectors currently.
+  VectorType *DestVTy = dyn_cast<VectorType>(DestTy);
+  if (DestVTy == 0)
+    return ConstantExpr::getBitCast(C, DestTy);
+
+  // If this is a scalar -> vector cast, convert the input into a <1 x scalar>
+  // vector so the code below can handle it uniformly.
+  if (isa<ConstantFP>(C) || isa<ConstantInt>(C)) {
+    Constant *Ops = C; // don't take the address of C!
+    return FoldBitCast(ConstantVector::get(Ops), DestTy, TD);
+  }
+
+  // If this is a bitcast from constant vector -> vector, fold it.
+  if (!isa<ConstantDataVector>(C) && !isa<ConstantVector>(C))
+    return ConstantExpr::getBitCast(C, DestTy);
+
+  // If the element types match, IR can fold it.
+  unsigned NumDstElt = DestVTy->getNumElements();
+  unsigned NumSrcElt = C->getType()->getVectorNumElements();
+  if (NumDstElt == NumSrcElt)
+    return ConstantExpr::getBitCast(C, DestTy);
+
+  Type *SrcEltTy = C->getType()->getVectorElementType();
+  Type *DstEltTy = DestVTy->getElementType();
+
+  // Otherwise, we're changing the number of elements in a vector, which
+  // requires endianness information to do the right thing.  For example,
+  //    bitcast (<2 x i64> <i64 0, i64 1> to <4 x i32>)
+  // folds to (little endian):
+  //    <4 x i32> <i32 0, i32 0, i32 1, i32 0>
+  // and to (big endian):
+  //    <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+
+  // First thing is first.  We only want to think about integer here, so if
+  // we have something in FP form, recast it as integer.
+  if (DstEltTy->isFloatingPointTy()) {
+    // Fold to an vector of integers with same size as our FP type.
+    unsigned FPWidth = DstEltTy->getPrimitiveSizeInBits();
+    Type *DestIVTy =
+      VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumDstElt);
+    // Recursively handle this integer conversion, if possible.
+    C = FoldBitCast(C, DestIVTy, TD);
+
+    // Finally, IR can handle this now that #elts line up.
+    return ConstantExpr::getBitCast(C, DestTy);
+  }
+
+  // Okay, we know the destination is integer, if the input is FP, convert
+  // it to integer first.
+  if (SrcEltTy->isFloatingPointTy()) {
+    unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
+    Type *SrcIVTy =
+      VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElt);
+    // Ask IR to do the conversion now that #elts line up.
+    C = ConstantExpr::getBitCast(C, SrcIVTy);
+    // If IR wasn't able to fold it, bail out.
+    if (!isa<ConstantVector>(C) &&  // FIXME: Remove ConstantVector.
+        !isa<ConstantDataVector>(C))
+      return C;
+  }
+
+  // Now we know that the input and output vectors are both integer vectors
+  // of the same size, and that their #elements is not the same.  Do the
+  // conversion here, which depends on whether the input or output has
+  // more elements.
+  bool isLittleEndian = TD.isLittleEndian();
+
+  SmallVector<Constant*, 32> Result;
+  if (NumDstElt < NumSrcElt) {
+    // Handle: bitcast (<4 x i32> <i32 0, i32 1, i32 2, i32 3> to <2 x i64>)
+    Constant *Zero = Constant::getNullValue(DstEltTy);
+    unsigned Ratio = NumSrcElt/NumDstElt;
+    unsigned SrcBitSize = SrcEltTy->getPrimitiveSizeInBits();
+    unsigned SrcElt = 0;
+    for (unsigned i = 0; i != NumDstElt; ++i) {
+      // Build each element of the result.
+      Constant *Elt = Zero;
+      unsigned ShiftAmt = isLittleEndian ? 0 : SrcBitSize*(Ratio-1);
+      for (unsigned j = 0; j != Ratio; ++j) {
+        Constant *Src =dyn_cast<ConstantInt>(C->getAggregateElement(SrcElt++));
+        if (!Src)  // Reject constantexpr elements.
+          return ConstantExpr::getBitCast(C, DestTy);
+
+        // Zero extend the element to the right size.
+        Src = ConstantExpr::getZExt(Src, Elt->getType());
+
+        // Shift it to the right place, depending on endianness.
+        Src = ConstantExpr::getShl(Src,
+                                   ConstantInt::get(Src->getType(), ShiftAmt));
+        ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize;
+
+        // Mix it in.
+        Elt = ConstantExpr::getOr(Elt, Src);
+      }
+      Result.push_back(Elt);
+    }
+    return ConstantVector::get(Result);
+  }
+
+  // Handle: bitcast (<2 x i64> <i64 0, i64 1> to <4 x i32>)
+  unsigned Ratio = NumDstElt/NumSrcElt;
+  unsigned DstBitSize = DstEltTy->getPrimitiveSizeInBits();
+
+  // Loop over each source value, expanding into multiple results.
+  for (unsigned i = 0; i != NumSrcElt; ++i) {
+    Constant *Src = dyn_cast<ConstantInt>(C->getAggregateElement(i));
+    if (!Src)  // Reject constantexpr elements.
+      return ConstantExpr::getBitCast(C, DestTy);
+
+    unsigned ShiftAmt = isLittleEndian ? 0 : DstBitSize*(Ratio-1);
+    for (unsigned j = 0; j != Ratio; ++j) {
+      // Shift the piece of the value into the right place, depending on
+      // endianness.
+      Constant *Elt = ConstantExpr::getLShr(Src,
+                                  ConstantInt::get(Src->getType(), ShiftAmt));
+      ShiftAmt += isLittleEndian ? DstBitSize : -DstBitSize;
+
+      // Truncate and remember this piece.
+      Result.push_back(ConstantExpr::getTrunc(Elt, DstEltTy));
+    }
+  }
+
+  return ConstantVector::get(Result);
+}
+
+
+/// IsConstantOffsetFromGlobal - If this constant is actually a constant offset
+/// from a global, return the global and the constant.  Because of
+/// constantexprs, this function is recursive.
+static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
+                                       APInt &Offset, const DataLayout &TD) {
+  // Trivial case, constant is the global.
+  if ((GV = dyn_cast<GlobalValue>(C))) {
+    Offset.clearAllBits();
+    return true;
+  }
+
+  // Otherwise, if this isn't a constant expr, bail out.
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+  if (!CE) return false;
+
+  // Look through ptr->int and ptr->ptr casts.
+  if (CE->getOpcode() == Instruction::PtrToInt ||
+      CE->getOpcode() == Instruction::BitCast)
+    return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD);
+
+  // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
+    // If the base isn't a global+constant, we aren't either.
+    if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD))
+      return false;
+
+    // Otherwise, add any offset that our operands provide.
+    return GEP->accumulateConstantOffset(TD, Offset);
+  }
+
+  return false;
+}
+
+/// ReadDataFromGlobal - Recursive helper to read bits out of global.  C is the
+/// constant being copied out of. ByteOffset is an offset into C.  CurPtr is the
+/// pointer to copy results into and BytesLeft is the number of bytes left in
+/// the CurPtr buffer.  TD is the target data.
+static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
+                               unsigned char *CurPtr, unsigned BytesLeft,
+                               const DataLayout &TD) {
+  assert(ByteOffset <= TD.getTypeAllocSize(C->getType()) &&
+         "Out of range access");
+
+  // If this element is zero or undefined, we can just return since *CurPtr is
+  // zero initialized.
+  if (isa<ConstantAggregateZero>(C) || isa<UndefValue>(C))
+    return true;
+
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+    if (CI->getBitWidth() > 64 ||
+        (CI->getBitWidth() & 7) != 0)
+      return false;
+
+    uint64_t Val = CI->getZExtValue();
+    unsigned IntBytes = unsigned(CI->getBitWidth()/8);
+
+    for (unsigned i = 0; i != BytesLeft && ByteOffset != IntBytes; ++i) {
+      int n = ByteOffset;
+      if (!TD.isLittleEndian())
+        n = IntBytes - n - 1;
+      CurPtr[i] = (unsigned char)(Val >> (n * 8));
+      ++ByteOffset;
+    }
+    return true;
+  }
+
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+    if (CFP->getType()->isDoubleTy()) {
+      C = FoldBitCast(C, Type::getInt64Ty(C->getContext()), TD);
+      return ReadDataFromGlobal(C, ByteOffset, CurPtr, BytesLeft, TD);
+    }
+    if (CFP->getType()->isFloatTy()){
+      C = FoldBitCast(C, Type::getInt32Ty(C->getContext()), TD);
+      return ReadDataFromGlobal(C, ByteOffset, CurPtr, BytesLeft, TD);
+    }
+    if (CFP->getType()->isHalfTy()){
+      C = FoldBitCast(C, Type::getInt16Ty(C->getContext()), TD);
+      return ReadDataFromGlobal(C, ByteOffset, CurPtr, BytesLeft, TD);
+    }
+    return false;
+  }
+
+  if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) {
+    const StructLayout *SL = TD.getStructLayout(CS->getType());
+    unsigned Index = SL->getElementContainingOffset(ByteOffset);
+    uint64_t CurEltOffset = SL->getElementOffset(Index);
+    ByteOffset -= CurEltOffset;
+
+    while (1) {
+      // If the element access is to the element itself and not to tail padding,
+      // read the bytes from the element.
+      uint64_t EltSize = TD.getTypeAllocSize(CS->getOperand(Index)->getType());
+
+      if (ByteOffset < EltSize &&
+          !ReadDataFromGlobal(CS->getOperand(Index), ByteOffset, CurPtr,
+                              BytesLeft, TD))
+        return false;
+
+      ++Index;
+
+      // Check to see if we read from the last struct element, if so we're done.
+      if (Index == CS->getType()->getNumElements())
+        return true;
+
+      // If we read all of the bytes we needed from this element we're done.
+      uint64_t NextEltOffset = SL->getElementOffset(Index);
+
+      if (BytesLeft <= NextEltOffset-CurEltOffset-ByteOffset)
+        return true;
+
+      // Move to the next element of the struct.
+      CurPtr += NextEltOffset-CurEltOffset-ByteOffset;
+      BytesLeft -= NextEltOffset-CurEltOffset-ByteOffset;
+      ByteOffset = 0;
+      CurEltOffset = NextEltOffset;
+    }
+    // not reached.
+  }
+
+  if (isa<ConstantArray>(C) || isa<ConstantVector>(C) ||
+      isa<ConstantDataSequential>(C)) {
+    Type *EltTy = cast<SequentialType>(C->getType())->getElementType();
+    uint64_t EltSize = TD.getTypeAllocSize(EltTy);
+    uint64_t Index = ByteOffset / EltSize;
+    uint64_t Offset = ByteOffset - Index * EltSize;
+    uint64_t NumElts;
+    if (ArrayType *AT = dyn_cast<ArrayType>(C->getType()))
+      NumElts = AT->getNumElements();
+    else
+      NumElts = cast<VectorType>(C->getType())->getNumElements();
+
+    for (; Index != NumElts; ++Index) {
+      if (!ReadDataFromGlobal(C->getAggregateElement(Index), Offset, CurPtr,
+                              BytesLeft, TD))
+        return false;
+
+      uint64_t BytesWritten = EltSize - Offset;
+      assert(BytesWritten <= EltSize && "Not indexing into this element?");
+      if (BytesWritten >= BytesLeft)
+        return true;
+
+      Offset = 0;
+      BytesLeft -= BytesWritten;
+      CurPtr += BytesWritten;
+    }
+    return true;
+  }
+
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+    if (CE->getOpcode() == Instruction::IntToPtr &&
+        CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getContext()))
+      return ReadDataFromGlobal(CE->getOperand(0), ByteOffset, CurPtr,
+                                BytesLeft, TD);
+  }
+
+  // Otherwise, unknown initializer type.
+  return false;
+}
+
+static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
+                                                 const DataLayout &TD) {
+  Type *LoadTy = cast<PointerType>(C->getType())->getElementType();
+  IntegerType *IntType = dyn_cast<IntegerType>(LoadTy);
+
+  // If this isn't an integer load we can't fold it directly.
+  if (!IntType) {
+    // If this is a float/double load, we can try folding it as an int32/64 load
+    // and then bitcast the result.  This can be useful for union cases.  Note
+    // that address spaces don't matter here since we're not going to result in
+    // an actual new load.
+    Type *MapTy;
+    if (LoadTy->isHalfTy())
+      MapTy = Type::getInt16PtrTy(C->getContext());
+    else if (LoadTy->isFloatTy())
+      MapTy = Type::getInt32PtrTy(C->getContext());
+    else if (LoadTy->isDoubleTy())
+      MapTy = Type::getInt64PtrTy(C->getContext());
+    else if (LoadTy->isVectorTy()) {
+      MapTy = IntegerType::get(C->getContext(),
+                               TD.getTypeAllocSizeInBits(LoadTy));
+      MapTy = PointerType::getUnqual(MapTy);
+    } else
+      return 0;
+
+    C = FoldBitCast(C, MapTy, TD);
+    if (Constant *Res = FoldReinterpretLoadFromConstPtr(C, TD))
+      return FoldBitCast(Res, LoadTy, TD);
+    return 0;
+  }
+
+  unsigned BytesLoaded = (IntType->getBitWidth() + 7) / 8;
+  if (BytesLoaded > 32 || BytesLoaded == 0) return 0;
+
+  GlobalValue *GVal;
+  APInt Offset(TD.getPointerSizeInBits(), 0);
+  if (!IsConstantOffsetFromGlobal(C, GVal, Offset, TD))
+    return 0;
+
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(GVal);
+  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+      !GV->getInitializer()->getType()->isSized())
+    return 0;
+
+  // If we're loading off the beginning of the global, some bytes may be valid,
+  // but we don't try to handle this.
+  if (Offset.isNegative()) return 0;
+
+  // If we're not accessing anything in this constant, the result is undefined.
+  if (Offset.getZExtValue() >=
+      TD.getTypeAllocSize(GV->getInitializer()->getType()))
+    return UndefValue::get(IntType);
+
+  unsigned char RawBytes[32] = {0};
+  if (!ReadDataFromGlobal(GV->getInitializer(), Offset.getZExtValue(), RawBytes,
+                          BytesLoaded, TD))
+    return 0;
+
+  APInt ResultVal = APInt(IntType->getBitWidth(), 0);
+  if (TD.isLittleEndian()) {
+    ResultVal = RawBytes[BytesLoaded - 1];
+    for (unsigned i = 1; i != BytesLoaded; ++i) {
+      ResultVal <<= 8;
+      ResultVal |= RawBytes[BytesLoaded-1-i];
+    }
+  } else {
+    ResultVal = RawBytes[0];
+    for (unsigned i = 1; i != BytesLoaded; ++i) {
+      ResultVal <<= 8;
+      ResultVal |= RawBytes[i];
+    }
+  }
+
+  return ConstantInt::get(IntType->getContext(), ResultVal);
+}
+
+/// ConstantFoldLoadFromConstPtr - Return the value that a load from C would
+/// produce if it is constant and determinable.  If this is not determinable,
+/// return null.
+Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
+                                             const DataLayout *TD) {
+  // First, try the easy cases:
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+    if (GV->isConstant() && GV->hasDefinitiveInitializer())
+      return GV->getInitializer();
+
+  // If the loaded value isn't a constant expr, we can't handle it.
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+  if (!CE) return 0;
+
+  if (CE->getOpcode() == Instruction::GetElementPtr) {
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
+      if (GV->isConstant() && GV->hasDefinitiveInitializer())
+        if (Constant *V =
+             ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
+          return V;
+  }
+
+  // Instead of loading constant c string, use corresponding integer value
+  // directly if string length is small enough.
+  StringRef Str;
+  if (TD && getConstantStringInfo(CE, Str) && !Str.empty()) {
+    unsigned StrLen = Str.size();
+    Type *Ty = cast<PointerType>(CE->getType())->getElementType();
+    unsigned NumBits = Ty->getPrimitiveSizeInBits();
+    // Replace load with immediate integer if the result is an integer or fp
+    // value.
+    if ((NumBits >> 3) == StrLen + 1 && (NumBits & 7) == 0 &&
+        (isa<IntegerType>(Ty) || Ty->isFloatingPointTy())) {
+      APInt StrVal(NumBits, 0);
+      APInt SingleChar(NumBits, 0);
+      if (TD->isLittleEndian()) {
+        for (signed i = StrLen-1; i >= 0; i--) {
+          SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+          StrVal = (StrVal << 8) | SingleChar;
+        }
+      } else {
+        for (unsigned i = 0; i < StrLen; i++) {
+          SingleChar = (uint64_t) Str[i] & UCHAR_MAX;
+          StrVal = (StrVal << 8) | SingleChar;
+        }
+        // Append NULL at the end.
+        SingleChar = 0;
+        StrVal = (StrVal << 8) | SingleChar;
+      }
+
+      Constant *Res = ConstantInt::get(CE->getContext(), StrVal);
+      if (Ty->isFloatingPointTy())
+        Res = ConstantExpr::getBitCast(Res, Ty);
+      return Res;
+    }
+  }
+
+  // If this load comes from anywhere in a constant global, and if the global
+  // is all undef or zero, we know what it loads.
+  if (GlobalVariable *GV =
+        dyn_cast<GlobalVariable>(GetUnderlyingObject(CE, TD))) {
+    if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+      Type *ResTy = cast<PointerType>(C->getType())->getElementType();
+      if (GV->getInitializer()->isNullValue())
+        return Constant::getNullValue(ResTy);
+      if (isa<UndefValue>(GV->getInitializer()))
+        return UndefValue::get(ResTy);
+    }
+  }
+
+  // Try hard to fold loads from bitcasted strange and non-type-safe things.
+  if (TD)
+    return FoldReinterpretLoadFromConstPtr(CE, *TD);
+  return 0;
+}
+
+static Constant *ConstantFoldLoadInst(const LoadInst *LI, const DataLayout *TD){
+  if (LI->isVolatile()) return 0;
+
+  if (Constant *C = dyn_cast<Constant>(LI->getOperand(0)))
+    return ConstantFoldLoadFromConstPtr(C, TD);
+
+  return 0;
+}
+
+/// SymbolicallyEvaluateBinop - One of Op0/Op1 is a constant expression.
+/// Attempt to symbolically evaluate the result of a binary operator merging
+/// these together.  If target data info is available, it is provided as DL,
+/// otherwise DL is null.
+static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
+                                           Constant *Op1, const DataLayout *DL){
+  // SROA
+
+  // Fold (and 0xffffffff00000000, (shl x, 32)) -> shl.
+  // Fold (lshr (or X, Y), 32) -> (lshr [X/Y], 32) if one doesn't contribute
+  // bits.
+
+
+  if (Opc == Instruction::And && DL) {
+    unsigned BitWidth = DL->getTypeSizeInBits(Op0->getType());
+    APInt KnownZero0(BitWidth, 0), KnownOne0(BitWidth, 0);
+    APInt KnownZero1(BitWidth, 0), KnownOne1(BitWidth, 0);
+    ComputeMaskedBits(Op0, KnownZero0, KnownOne0, DL);
+    ComputeMaskedBits(Op1, KnownZero1, KnownOne1, DL);
+    if ((KnownOne1 | KnownZero0).isAllOnesValue()) {
+      // All the bits of Op0 that the 'and' could be masking are already zero.
+      return Op0;
+    }
+    if ((KnownOne0 | KnownZero1).isAllOnesValue()) {
+      // All the bits of Op1 that the 'and' could be masking are already zero.
+      return Op1;
+    }
+
+    APInt KnownZero = KnownZero0 | KnownZero1;
+    APInt KnownOne = KnownOne0 & KnownOne1;
+    if ((KnownZero | KnownOne).isAllOnesValue()) {
+      return ConstantInt::get(Op0->getType(), KnownOne);
+    }
+  }
+
+  // If the constant expr is something like &A[123] - &A[4].f, fold this into a
+  // constant.  This happens frequently when iterating over a global array.
+  if (Opc == Instruction::Sub && DL) {
+    GlobalValue *GV1, *GV2;
+    unsigned PtrSize = DL->getPointerSizeInBits();
+    unsigned OpSize = DL->getTypeSizeInBits(Op0->getType());
+    APInt Offs1(PtrSize, 0), Offs2(PtrSize, 0);
+
+    if (IsConstantOffsetFromGlobal(Op0, GV1, Offs1, *DL))
+      if (IsConstantOffsetFromGlobal(Op1, GV2, Offs2, *DL) &&
+          GV1 == GV2) {
+        // (&GV+C1) - (&GV+C2) -> C1-C2, pointer arithmetic cannot overflow.
+        // PtrToInt may change the bitwidth so we have convert to the right size
+        // first.
+        return ConstantInt::get(Op0->getType(), Offs1.zextOrTrunc(OpSize) -
+                                                Offs2.zextOrTrunc(OpSize));
+      }
+  }
+
+  return 0;
+}
+
+/// CastGEPIndices - If array indices are not pointer-sized integers,
+/// explicitly cast them so that they aren't implicitly casted by the
+/// getelementptr.
+static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
+                                Type *ResultTy, const DataLayout *TD,
+                                const TargetLibraryInfo *TLI) {
+  if (!TD) return 0;
+  Type *IntPtrTy = TD->getIntPtrType(ResultTy->getContext());
+
+  bool Any = false;
+  SmallVector<Constant*, 32> NewIdxs;
+  for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
+    if ((i == 1 ||
+         !isa<StructType>(GetElementPtrInst::getIndexedType(Ops[0]->getType(),
+                                                        Ops.slice(1, i-1)))) &&
+        Ops[i]->getType() != IntPtrTy) {
+      Any = true;
+      NewIdxs.push_back(ConstantExpr::getCast(CastInst::getCastOpcode(Ops[i],
+                                                                      true,
+                                                                      IntPtrTy,
+                                                                      true),
+                                              Ops[i], IntPtrTy));
+    } else
+      NewIdxs.push_back(Ops[i]);
+  }
+  if (!Any) return 0;
+
+  Constant *C =
+    ConstantExpr::getGetElementPtr(Ops[0], NewIdxs);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+    if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
+      C = Folded;
+  return C;
+}
+
+/// Strip the pointer casts, but preserve the address space information.
+static Constant* StripPtrCastKeepAS(Constant* Ptr) {
+  assert(Ptr->getType()->isPointerTy() && "Not a pointer type");
+  PointerType *OldPtrTy = cast<PointerType>(Ptr->getType());
+  Ptr = cast<Constant>(Ptr->stripPointerCasts());
+  PointerType *NewPtrTy = cast<PointerType>(Ptr->getType());
+
+  // Preserve the address space number of the pointer.
+  if (NewPtrTy->getAddressSpace() != OldPtrTy->getAddressSpace()) {
+    NewPtrTy = NewPtrTy->getElementType()->getPointerTo(
+      OldPtrTy->getAddressSpace());
+    Ptr = ConstantExpr::getBitCast(Ptr, NewPtrTy);
+  }
+  return Ptr;
+}
+
+/// SymbolicallyEvaluateGEP - If we can symbolically evaluate the specified GEP
+/// constant expression, do so.
+static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
+                                         Type *ResultTy, const DataLayout *TD,
+                                         const TargetLibraryInfo *TLI) {
+  Constant *Ptr = Ops[0];
+  if (!TD || !cast<PointerType>(Ptr->getType())->getElementType()->isSized() ||
+      !Ptr->getType()->isPointerTy())
+    return 0;
+
+  Type *IntPtrTy = TD->getIntPtrType(Ptr->getContext());
+
+  // If this is a constant expr gep that is effectively computing an
+  // "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12'
+  for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+    if (!isa<ConstantInt>(Ops[i])) {
+
+      // If this is "gep i8* Ptr, (sub 0, V)", fold this as:
+      // "inttoptr (sub (ptrtoint Ptr), V)"
+      if (Ops.size() == 2 &&
+          cast<PointerType>(ResultTy)->getElementType()->isIntegerTy(8)) {
+        ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[1]);
+        assert((CE == 0 || CE->getType() == IntPtrTy) &&
+               "CastGEPIndices didn't canonicalize index types!");
+        if (CE && CE->getOpcode() == Instruction::Sub &&
+            CE->getOperand(0)->isNullValue()) {
+          Constant *Res = ConstantExpr::getPtrToInt(Ptr, CE->getType());
+          Res = ConstantExpr::getSub(Res, CE->getOperand(1));
+          Res = ConstantExpr::getIntToPtr(Res, ResultTy);
+          if (ConstantExpr *ResCE = dyn_cast<ConstantExpr>(Res))
+            Res = ConstantFoldConstantExpression(ResCE, TD, TLI);
+          return Res;
+        }
+      }
+      return 0;
+    }
+
+  unsigned BitWidth = TD->getTypeSizeInBits(IntPtrTy);
+  APInt Offset =
+    APInt(BitWidth, TD->getIndexedOffset(Ptr->getType(),
+                                         makeArrayRef((Value *const*)
+                                                        Ops.data() + 1,
+                                                      Ops.size() - 1)));
+  Ptr = StripPtrCastKeepAS(Ptr);
+
+  // If this is a GEP of a GEP, fold it all into a single GEP.
+  while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+    SmallVector<Value *, 4> NestedOps(GEP->op_begin()+1, GEP->op_end());
+
+    // Do not try the incorporate the sub-GEP if some index is not a number.
+    bool AllConstantInt = true;
+    for (unsigned i = 0, e = NestedOps.size(); i != e; ++i)
+      if (!isa<ConstantInt>(NestedOps[i])) {
+        AllConstantInt = false;
+        break;
+      }
+    if (!AllConstantInt)
+      break;
+
+    Ptr = cast<Constant>(GEP->getOperand(0));
+    Offset += APInt(BitWidth,
+                    TD->getIndexedOffset(Ptr->getType(), NestedOps));
+    Ptr = StripPtrCastKeepAS(Ptr);
+  }
+
+  // If the base value for this address is a literal integer value, fold the
+  // getelementptr to the resulting integer value casted to the pointer type.
+  APInt BasePtr(BitWidth, 0);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+    if (CE->getOpcode() == Instruction::IntToPtr)
+      if (ConstantInt *Base = dyn_cast<ConstantInt>(CE->getOperand(0)))
+        BasePtr = Base->getValue().zextOrTrunc(BitWidth);
+  if (Ptr->isNullValue() || BasePtr != 0) {
+    Constant *C = ConstantInt::get(Ptr->getContext(), Offset+BasePtr);
+    return ConstantExpr::getIntToPtr(C, ResultTy);
+  }
+
+  // Otherwise form a regular getelementptr. Recompute the indices so that
+  // we eliminate over-indexing of the notional static type array bounds.
+  // This makes it easy to determine if the getelementptr is "inbounds".
+  // Also, this helps GlobalOpt do SROA on GlobalVariables.
+  Type *Ty = Ptr->getType();
+  assert(Ty->isPointerTy() && "Forming regular GEP of non-pointer type");
+  SmallVector<Constant*, 32> NewIdxs;
+  do {
+    if (SequentialType *ATy = dyn_cast<SequentialType>(Ty)) {
+      if (ATy->isPointerTy()) {
+        // The only pointer indexing we'll do is on the first index of the GEP.
+        if (!NewIdxs.empty())
+          break;
+
+        // Only handle pointers to sized types, not pointers to functions.
+        if (!ATy->getElementType()->isSized())
+          return 0;
+      }
+
+      // Determine which element of the array the offset points into.
+      APInt ElemSize(BitWidth, TD->getTypeAllocSize(ATy->getElementType()));
+      IntegerType *IntPtrTy = TD->getIntPtrType(Ty->getContext());
+      if (ElemSize == 0)
+        // The element size is 0. This may be [0 x Ty]*, so just use a zero
+        // index for this level and proceed to the next level to see if it can
+        // accommodate the offset.
+        NewIdxs.push_back(ConstantInt::get(IntPtrTy, 0));
+      else {
+        // The element size is non-zero divide the offset by the element
+        // size (rounding down), to compute the index at this level.
+        APInt NewIdx = Offset.udiv(ElemSize);
+        Offset -= NewIdx * ElemSize;
+        NewIdxs.push_back(ConstantInt::get(IntPtrTy, NewIdx));
+      }
+      Ty = ATy->getElementType();
+    } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+      // If we end up with an offset that isn't valid for this struct type, we
+      // can't re-form this GEP in a regular form, so bail out. The pointer
+      // operand likely went through casts that are necessary to make the GEP
+      // sensible.
+      const StructLayout &SL = *TD->getStructLayout(STy);
+      if (Offset.uge(SL.getSizeInBytes()))
+        break;
+
+      // Determine which field of the struct the offset points into. The
+      // getZExtValue is fine as we've already ensured that the offset is
+      // within the range representable by the StructLayout API.
+      unsigned ElIdx = SL.getElementContainingOffset(Offset.getZExtValue());
+      NewIdxs.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
+                                         ElIdx));
+      Offset -= APInt(BitWidth, SL.getElementOffset(ElIdx));
+      Ty = STy->getTypeAtIndex(ElIdx);
+    } else {
+      // We've reached some non-indexable type.
+      break;
+    }
+  } while (Ty != cast<PointerType>(ResultTy)->getElementType());
+
+  // If we haven't used up the entire offset by descending the static
+  // type, then the offset is pointing into the middle of an indivisible
+  // member, so we can't simplify it.
+  if (Offset != 0)
+    return 0;
+
+  // Create a GEP.
+  Constant *C =
+    ConstantExpr::getGetElementPtr(Ptr, NewIdxs);
+  assert(cast<PointerType>(C->getType())->getElementType() == Ty &&
+         "Computed GetElementPtr has unexpected type!");
+
+  // If we ended up indexing a member with a type that doesn't match
+  // the type of what the original indices indexed, add a cast.
+  if (Ty != cast<PointerType>(ResultTy)->getElementType())
+    C = FoldBitCast(C, ResultTy, *TD);
+
+  return C;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Constant Folding public APIs
+//===----------------------------------------------------------------------===//
+
+/// ConstantFoldInstruction - Try to constant fold the specified instruction.
+/// If successful, the constant result is returned, if not, null is returned.
+/// Note that this fails if not all of the operands are constant.  Otherwise,
+/// this function can only fail when attempting to fold instructions like loads
+/// and stores, which have no constant expression form.
+Constant *llvm::ConstantFoldInstruction(Instruction *I,
+                                        const DataLayout *TD,
+                                        const TargetLibraryInfo *TLI) {
+  // Handle PHI nodes quickly here...
+  if (PHINode *PN = dyn_cast<PHINode>(I)) {
+    Constant *CommonValue = 0;
+
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      Value *Incoming = PN->getIncomingValue(i);
+      // If the incoming value is undef then skip it.  Note that while we could
+      // skip the value if it is equal to the phi node itself we choose not to
+      // because that would break the rule that constant folding only applies if
+      // all operands are constants.
+      if (isa<UndefValue>(Incoming))
+        continue;
+      // If the incoming value is not a constant, then give up.
+      Constant *C = dyn_cast<Constant>(Incoming);
+      if (!C)
+        return 0;
+      // Fold the PHI's operands.
+      if (ConstantExpr *NewC = dyn_cast<ConstantExpr>(C))
+        C = ConstantFoldConstantExpression(NewC, TD, TLI);
+      // If the incoming value is a different constant to
+      // the one we saw previously, then give up.
+      if (CommonValue && C != CommonValue)
+        return 0;
+      CommonValue = C;
+    }
+
+
+    // If we reach here, all incoming values are the same constant or undef.
+    return CommonValue ? CommonValue : UndefValue::get(PN->getType());
+  }
+
+  // Scan the operand list, checking to see if they are all constants, if so,
+  // hand off to ConstantFoldInstOperands.
+  SmallVector<Constant*, 8> Ops;
+  for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
+    Constant *Op = dyn_cast<Constant>(*i);
+    if (!Op)
+      return 0;  // All operands not constant!
+
+    // Fold the Instruction's operands.
+    if (ConstantExpr *NewCE = dyn_cast<ConstantExpr>(Op))
+      Op = ConstantFoldConstantExpression(NewCE, TD, TLI);
+
+    Ops.push_back(Op);
+  }
+
+  if (const CmpInst *CI = dyn_cast<CmpInst>(I))
+    return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1],
+                                           TD, TLI);
+
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+    return ConstantFoldLoadInst(LI, TD);
+
+  if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(I))
+    return ConstantExpr::getInsertValue(
+                                cast<Constant>(IVI->getAggregateOperand()),
+                                cast<Constant>(IVI->getInsertedValueOperand()),
+                                IVI->getIndices());
+
+  if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I))
+    return ConstantExpr::getExtractValue(
+                                    cast<Constant>(EVI->getAggregateOperand()),
+                                    EVI->getIndices());
+
+  return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD, TLI);
+}
+
+/// ConstantFoldConstantExpression - Attempt to fold the constant expression
+/// using the specified DataLayout.  If successful, the constant result is
+/// result is returned, if not, null is returned.
+Constant *llvm::ConstantFoldConstantExpression(const ConstantExpr *CE,
+                                               const DataLayout *TD,
+                                               const TargetLibraryInfo *TLI) {
+  SmallVector<Constant*, 8> Ops;
+  for (User::const_op_iterator i = CE->op_begin(), e = CE->op_end();
+       i != e; ++i) {
+    Constant *NewC = cast<Constant>(*i);
+    // Recursively fold the ConstantExpr's operands.
+    if (ConstantExpr *NewCE = dyn_cast<ConstantExpr>(NewC))
+      NewC = ConstantFoldConstantExpression(NewCE, TD, TLI);
+    Ops.push_back(NewC);
+  }
+
+  if (CE->isCompare())
+    return ConstantFoldCompareInstOperands(CE->getPredicate(), Ops[0], Ops[1],
+                                           TD, TLI);
+  return ConstantFoldInstOperands(CE->getOpcode(), CE->getType(), Ops, TD, TLI);
+}
+
+/// ConstantFoldInstOperands - Attempt to constant fold an instruction with the
+/// specified opcode and operands.  If successful, the constant result is
+/// returned, if not, null is returned.  Note that this function can fail when
+/// attempting to fold instructions like loads and stores, which have no
+/// constant expression form.
+///
+/// TODO: This function neither utilizes nor preserves nsw/nuw/inbounds/etc
+/// information, due to only being passed an opcode and operands. Constant
+/// folding using this function strips this information.
+///
+Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
+                                         ArrayRef<Constant *> Ops,
+                                         const DataLayout *TD,
+                                         const TargetLibraryInfo *TLI) {
+  // Handle easy binops first.
+  if (Instruction::isBinaryOp(Opcode)) {
+    if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1]))
+      if (Constant *C = SymbolicallyEvaluateBinop(Opcode, Ops[0], Ops[1], TD))
+        return C;
+
+    return ConstantExpr::get(Opcode, Ops[0], Ops[1]);
+  }
+
+  switch (Opcode) {
+  default: return 0;
+  case Instruction::ICmp:
+  case Instruction::FCmp: llvm_unreachable("Invalid for compares");
+  case Instruction::Call:
+    if (Function *F = dyn_cast<Function>(Ops.back()))
+      if (canConstantFoldCallTo(F))
+        return ConstantFoldCall(F, Ops.slice(0, Ops.size() - 1), TLI);
+    return 0;
+  case Instruction::PtrToInt:
+    // If the input is a inttoptr, eliminate the pair.  This requires knowing
+    // the width of a pointer, so it can't be done in ConstantExpr::getCast.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) {
+      if (TD && CE->getOpcode() == Instruction::IntToPtr) {
+        Constant *Input = CE->getOperand(0);
+        unsigned InWidth = Input->getType()->getScalarSizeInBits();
+        if (TD->getPointerSizeInBits() < InWidth) {
+          Constant *Mask =
+            ConstantInt::get(CE->getContext(), APInt::getLowBitsSet(InWidth,
+                                                  TD->getPointerSizeInBits()));
+          Input = ConstantExpr::getAnd(Input, Mask);
+        }
+        // Do a zext or trunc to get to the dest size.
+        return ConstantExpr::getIntegerCast(Input, DestTy, false);
+      }
+    }
+    return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
+  case Instruction::IntToPtr:
+    // If the input is a ptrtoint, turn the pair into a ptr to ptr bitcast if
+    // the int size is >= the ptr size.  This requires knowing the width of a
+    // pointer, so it can't be done in ConstantExpr::getCast.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0]))
+      if (TD &&
+          TD->getPointerSizeInBits() <= CE->getType()->getScalarSizeInBits() &&
+          CE->getOpcode() == Instruction::PtrToInt)
+        return FoldBitCast(CE->getOperand(0), DestTy, *TD);
+
+    return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+      return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
+  case Instruction::BitCast:
+    if (TD)
+      return FoldBitCast(Ops[0], DestTy, *TD);
+    return ConstantExpr::getBitCast(Ops[0], DestTy);
+  case Instruction::Select:
+    return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
+  case Instruction::ExtractElement:
+    return ConstantExpr::getExtractElement(Ops[0], Ops[1]);
+  case Instruction::InsertElement:
+    return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]);
+  case Instruction::ShuffleVector:
+    return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
+  case Instruction::GetElementPtr:
+    if (Constant *C = CastGEPIndices(Ops, DestTy, TD, TLI))
+      return C;
+    if (Constant *C = SymbolicallyEvaluateGEP(Ops, DestTy, TD, TLI))
+      return C;
+
+    return ConstantExpr::getGetElementPtr(Ops[0], Ops.slice(1));
+  }
+}
+
+/// ConstantFoldCompareInstOperands - Attempt to constant fold a compare
+/// instruction (icmp/fcmp) with the specified operands.  If it fails, it
+/// returns a constant expression of the specified operands.
+///
+Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
+                                                Constant *Ops0, Constant *Ops1,
+                                                const DataLayout *TD,
+                                                const TargetLibraryInfo *TLI) {
+  // fold: icmp (inttoptr x), null         -> icmp x, 0
+  // fold: icmp (ptrtoint x), 0            -> icmp x, null
+  // fold: icmp (inttoptr x), (inttoptr y) -> icmp trunc/zext x, trunc/zext y
+  // fold: icmp (ptrtoint x), (ptrtoint y) -> icmp x, y
+  //
+  // ConstantExpr::getCompare cannot do this, because it doesn't have TD
+  // around to know if bit truncation is happening.
+  if (ConstantExpr *CE0 = dyn_cast<ConstantExpr>(Ops0)) {
+    if (TD && Ops1->isNullValue()) {
+      Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
+      if (CE0->getOpcode() == Instruction::IntToPtr) {
+        // Convert the integer value to the right size to ensure we get the
+        // proper extension or truncation.
+        Constant *C = ConstantExpr::getIntegerCast(CE0->getOperand(0),
+                                                   IntPtrTy, false);
+        Constant *Null = Constant::getNullValue(C->getType());
+        return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
+      }
+
+      // Only do this transformation if the int is intptrty in size, otherwise
+      // there is a truncation or extension that we aren't modeling.
+      if (CE0->getOpcode() == Instruction::PtrToInt &&
+          CE0->getType() == IntPtrTy) {
+        Constant *C = CE0->getOperand(0);
+        Constant *Null = Constant::getNullValue(C->getType());
+        return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
+      }
+    }
+
+    if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(Ops1)) {
+      if (TD && CE0->getOpcode() == CE1->getOpcode()) {
+        Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
+
+        if (CE0->getOpcode() == Instruction::IntToPtr) {
+          // Convert the integer value to the right size to ensure we get the
+          // proper extension or truncation.
+          Constant *C0 = ConstantExpr::getIntegerCast(CE0->getOperand(0),
+                                                      IntPtrTy, false);
+          Constant *C1 = ConstantExpr::getIntegerCast(CE1->getOperand(0),
+                                                      IntPtrTy, false);
+          return ConstantFoldCompareInstOperands(Predicate, C0, C1, TD, TLI);
+        }
+
+        // Only do this transformation if the int is intptrty in size, otherwise
+        // there is a truncation or extension that we aren't modeling.
+        if ((CE0->getOpcode() == Instruction::PtrToInt &&
+             CE0->getType() == IntPtrTy &&
+             CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()))
+          return ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(0),
+                                                 CE1->getOperand(0), TD, TLI);
+      }
+    }
+
+    // icmp eq (or x, y), 0 -> (icmp eq x, 0) & (icmp eq y, 0)
+    // icmp ne (or x, y), 0 -> (icmp ne x, 0) | (icmp ne y, 0)
+    if ((Predicate == ICmpInst::ICMP_EQ || Predicate == ICmpInst::ICMP_NE) &&
+        CE0->getOpcode() == Instruction::Or && Ops1->isNullValue()) {
+      Constant *LHS =
+        ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(0), Ops1,
+                                        TD, TLI);
+      Constant *RHS =
+        ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(1), Ops1,
+                                        TD, TLI);
+      unsigned OpC =
+        Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+      Constant *Ops[] = { LHS, RHS };
+      return ConstantFoldInstOperands(OpC, LHS->getType(), Ops, TD, TLI);
+    }
+  }
+
+  return ConstantExpr::getCompare(Predicate, Ops0, Ops1);
+}
+
+
+/// ConstantFoldLoadThroughGEPConstantExpr - Given a constant and a
+/// getelementptr constantexpr, return the constant value being addressed by the
+/// constant expression, or null if something is funny and we can't decide.
+Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
+                                                       ConstantExpr *CE) {
+  if (!CE->getOperand(1)->isNullValue())
+    return 0;  // Do not allow stepping over the value!
+
+  // Loop over all of the operands, tracking down which value we are
+  // addressing.
+  for (unsigned i = 2, e = CE->getNumOperands(); i != e; ++i) {
+    C = C->getAggregateElement(CE->getOperand(i));
+    if (C == 0) return 0;
+  }
+  return C;
+}
+
+/// ConstantFoldLoadThroughGEPIndices - Given a constant and getelementptr
+/// indices (with an *implied* zero pointer index that is not in the list),
+/// return the constant value being addressed by a virtual load, or null if
+/// something is funny and we can't decide.
+Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
+                                                  ArrayRef<Constant*> Indices) {
+  // Loop over all of the operands, tracking down which value we are
+  // addressing.
+  for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
+    C = C->getAggregateElement(Indices[i]);
+    if (C == 0) return 0;
+  }
+  return C;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Constant Folding for Calls
+//
+
+/// canConstantFoldCallTo - Return true if its even possible to fold a call to
+/// the specified function.
+bool
+llvm::canConstantFoldCallTo(const Function *F) {
+  switch (F->getIntrinsicID()) {
+  case Intrinsic::fabs:
+  case Intrinsic::log:
+  case Intrinsic::log2:
+  case Intrinsic::log10:
+  case Intrinsic::exp:
+  case Intrinsic::exp2:
+  case Intrinsic::floor:
+  case Intrinsic::sqrt:
+  case Intrinsic::pow:
+  case Intrinsic::powi:
+  case Intrinsic::bswap:
+  case Intrinsic::ctpop:
+  case Intrinsic::ctlz:
+  case Intrinsic::cttz:
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+  case Intrinsic::convert_from_fp16:
+  case Intrinsic::convert_to_fp16:
+  case Intrinsic::x86_sse_cvtss2si:
+  case Intrinsic::x86_sse_cvtss2si64:
+  case Intrinsic::x86_sse_cvttss2si:
+  case Intrinsic::x86_sse_cvttss2si64:
+  case Intrinsic::x86_sse2_cvtsd2si:
+  case Intrinsic::x86_sse2_cvtsd2si64:
+  case Intrinsic::x86_sse2_cvttsd2si:
+  case Intrinsic::x86_sse2_cvttsd2si64:
+    return true;
+  default:
+    return false;
+  case 0: break;
+  }
+
+  if (!F->hasName()) return false;
+  StringRef Name = F->getName();
+
+  // In these cases, the check of the length is required.  We don't want to
+  // return true for a name like "cos\0blah" which strcmp would return equal to
+  // "cos", but has length 8.
+  switch (Name[0]) {
+  default: return false;
+  case 'a':
+    return Name == "acos" || Name == "asin" || Name == "atan" || Name =="atan2";
+  case 'c':
+    return Name == "cos" || Name == "ceil" || Name == "cosf" || Name == "cosh";
+  case 'e':
+    return Name == "exp" || Name == "exp2";
+  case 'f':
+    return Name == "fabs" || Name == "fmod" || Name == "floor";
+  case 'l':
+    return Name == "log" || Name == "log10";
+  case 'p':
+    return Name == "pow";
+  case 's':
+    return Name == "sin" || Name == "sinh" || Name == "sqrt" ||
+      Name == "sinf" || Name == "sqrtf";
+  case 't':
+    return Name == "tan" || Name == "tanh";
+  }
+}
+
+static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
+                                Type *Ty) {
+  sys::llvm_fenv_clearexcept();
+  V = NativeFP(V);
+  if (sys::llvm_fenv_testexcept()) {
+    sys::llvm_fenv_clearexcept();
+    return 0;
+  }
+
+  if (Ty->isHalfTy()) {
+    APFloat APF(V);
+    bool unused;
+    APF.convert(APFloat::IEEEhalf, APFloat::rmNearestTiesToEven, &unused);
+    return ConstantFP::get(Ty->getContext(), APF);
+  }
+  if (Ty->isFloatTy())
+    return ConstantFP::get(Ty->getContext(), APFloat((float)V));
+  if (Ty->isDoubleTy())
+    return ConstantFP::get(Ty->getContext(), APFloat(V));
+  llvm_unreachable("Can only constant fold half/float/double");
+}
+
+static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
+                                      double V, double W, Type *Ty) {
+  sys::llvm_fenv_clearexcept();
+  V = NativeFP(V, W);
+  if (sys::llvm_fenv_testexcept()) {
+    sys::llvm_fenv_clearexcept();
+    return 0;
+  }
+
+  if (Ty->isHalfTy()) {
+    APFloat APF(V);
+    bool unused;
+    APF.convert(APFloat::IEEEhalf, APFloat::rmNearestTiesToEven, &unused);
+    return ConstantFP::get(Ty->getContext(), APF);
+  }
+  if (Ty->isFloatTy())
+    return ConstantFP::get(Ty->getContext(), APFloat((float)V));
+  if (Ty->isDoubleTy())
+    return ConstantFP::get(Ty->getContext(), APFloat(V));
+  llvm_unreachable("Can only constant fold half/float/double");
+}
+
+/// ConstantFoldConvertToInt - Attempt to an SSE floating point to integer
+/// conversion of a constant floating point. If roundTowardZero is false, the
+/// default IEEE rounding is used (toward nearest, ties to even). This matches
+/// the behavior of the non-truncating SSE instructions in the default rounding
+/// mode. The desired integer type Ty is used to select how many bits are
+/// available for the result. Returns null if the conversion cannot be
+/// performed, otherwise returns the Constant value resulting from the
+/// conversion.
+static Constant *ConstantFoldConvertToInt(const APFloat &Val,
+                                          bool roundTowardZero, Type *Ty) {
+  // All of these conversion intrinsics form an integer of at most 64bits.
+  unsigned ResultWidth = cast<IntegerType>(Ty)->getBitWidth();
+  assert(ResultWidth <= 64 &&
+         "Can only constant fold conversions to 64 and 32 bit ints");
+
+  uint64_t UIntVal;
+  bool isExact = false;
+  APFloat::roundingMode mode = roundTowardZero? APFloat::rmTowardZero
+                                              : APFloat::rmNearestTiesToEven;
+  APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
+                                                  /*isSigned=*/true, mode,
+                                                  &isExact);
+  if (status != APFloat::opOK && status != APFloat::opInexact)
+    return 0;
+  return ConstantInt::get(Ty, UIntVal, /*isSigned=*/true);
+}
+
+/// ConstantFoldCall - Attempt to constant fold a call to the specified function
+/// with the specified arguments, returning null if unsuccessful.
+Constant *
+llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
+                       const TargetLibraryInfo *TLI) {
+  if (!F->hasName()) return 0;
+  StringRef Name = F->getName();
+
+  Type *Ty = F->getReturnType();
+  if (Operands.size() == 1) {
+    if (ConstantFP *Op = dyn_cast<ConstantFP>(Operands[0])) {
+      if (F->getIntrinsicID() == Intrinsic::convert_to_fp16) {
+        APFloat Val(Op->getValueAPF());
+
+        bool lost = false;
+        Val.convert(APFloat::IEEEhalf, APFloat::rmNearestTiesToEven, &lost);
+
+        return ConstantInt::get(F->getContext(), Val.bitcastToAPInt());
+      }
+      if (!TLI)
+        return 0;
+
+      if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy())
+        return 0;
+
+      /// We only fold functions with finite arguments. Folding NaN and inf is
+      /// likely to be aborted with an exception anyway, and some host libms
+      /// have known errors raising exceptions.
+      if (Op->getValueAPF().isNaN() || Op->getValueAPF().isInfinity())
+        return 0;
+
+      /// Currently APFloat versions of these functions do not exist, so we use
+      /// the host native double versions.  Float versions are not called
+      /// directly but for all these it is true (float)(f((double)arg)) ==
+      /// f(arg).  Long double not supported yet.
+      double V;
+      if (Ty->isFloatTy())
+        V = Op->getValueAPF().convertToFloat();
+      else if (Ty->isDoubleTy())
+        V = Op->getValueAPF().convertToDouble();
+      else {
+        bool unused;
+        APFloat APF = Op->getValueAPF();
+        APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &unused);
+        V = APF.convertToDouble();
+      }
+
+      switch (F->getIntrinsicID()) {
+        default: break;
+        case Intrinsic::fabs:
+          return ConstantFoldFP(fabs, V, Ty);
+#if HAVE_LOG2
+        case Intrinsic::log2:
+          return ConstantFoldFP(log2, V, Ty);
+#endif
+#if HAVE_LOG
+        case Intrinsic::log:
+          return ConstantFoldFP(log, V, Ty);
+#endif
+#if HAVE_LOG10
+        case Intrinsic::log10:
+          return ConstantFoldFP(log10, V, Ty);
+#endif
+#if HAVE_EXP
+        case Intrinsic::exp:
+          return ConstantFoldFP(exp, V, Ty);
+#endif
+#if HAVE_EXP2
+        case Intrinsic::exp2:
+          return ConstantFoldFP(exp2, V, Ty);
+#endif
+        case Intrinsic::floor:
+          return ConstantFoldFP(floor, V, Ty);
+      }
+
+      switch (Name[0]) {
+      case 'a':
+        if (Name == "acos" && TLI->has(LibFunc::acos))
+          return ConstantFoldFP(acos, V, Ty);
+        else if (Name == "asin" && TLI->has(LibFunc::asin))
+          return ConstantFoldFP(asin, V, Ty);
+        else if (Name == "atan" && TLI->has(LibFunc::atan))
+          return ConstantFoldFP(atan, V, Ty);
+        break;
+      case 'c':
+        if (Name == "ceil" && TLI->has(LibFunc::ceil))
+          return ConstantFoldFP(ceil, V, Ty);
+        else if (Name == "cos" && TLI->has(LibFunc::cos))
+          return ConstantFoldFP(cos, V, Ty);
+        else if (Name == "cosh" && TLI->has(LibFunc::cosh))
+          return ConstantFoldFP(cosh, V, Ty);
+        else if (Name == "cosf" && TLI->has(LibFunc::cosf))
+          return ConstantFoldFP(cos, V, Ty);
+        break;
+      case 'e':
+        if (Name == "exp" && TLI->has(LibFunc::exp))
+          return ConstantFoldFP(exp, V, Ty);
+
+        if (Name == "exp2" && TLI->has(LibFunc::exp2)) {
+          // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
+          // C99 library.
+          return ConstantFoldBinaryFP(pow, 2.0, V, Ty);
+        }
+        break;
+      case 'f':
+        if (Name == "fabs" && TLI->has(LibFunc::fabs))
+          return ConstantFoldFP(fabs, V, Ty);
+        else if (Name == "floor" && TLI->has(LibFunc::floor))
+          return ConstantFoldFP(floor, V, Ty);
+        break;
+      case 'l':
+        if (Name == "log" && V > 0 && TLI->has(LibFunc::log))
+          return ConstantFoldFP(log, V, Ty);
+        else if (Name == "log10" && V > 0 && TLI->has(LibFunc::log10))
+          return ConstantFoldFP(log10, V, Ty);
+        else if (F->getIntrinsicID() == Intrinsic::sqrt &&
+                 (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())) {
+          if (V >= -0.0)
+            return ConstantFoldFP(sqrt, V, Ty);
+          else // Undefined
+            return Constant::getNullValue(Ty);
+        }
+        break;
+      case 's':
+        if (Name == "sin" && TLI->has(LibFunc::sin))
+          return ConstantFoldFP(sin, V, Ty);
+        else if (Name == "sinh" && TLI->has(LibFunc::sinh))
+          return ConstantFoldFP(sinh, V, Ty);
+        else if (Name == "sqrt" && V >= 0 && TLI->has(LibFunc::sqrt))
+          return ConstantFoldFP(sqrt, V, Ty);
+        else if (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc::sqrtf))
+          return ConstantFoldFP(sqrt, V, Ty);
+        else if (Name == "sinf" && TLI->has(LibFunc::sinf))
+          return ConstantFoldFP(sin, V, Ty);
+        break;
+      case 't':
+        if (Name == "tan" && TLI->has(LibFunc::tan))
+          return ConstantFoldFP(tan, V, Ty);
+        else if (Name == "tanh" && TLI->has(LibFunc::tanh))
+          return ConstantFoldFP(tanh, V, Ty);
+        break;
+      default:
+        break;
+      }
+      return 0;
+    }
+
+    if (ConstantInt *Op = dyn_cast<ConstantInt>(Operands[0])) {
+      switch (F->getIntrinsicID()) {
+      case Intrinsic::bswap:
+        return ConstantInt::get(F->getContext(), Op->getValue().byteSwap());
+      case Intrinsic::ctpop:
+        return ConstantInt::get(Ty, Op->getValue().countPopulation());
+      case Intrinsic::convert_from_fp16: {
+        APFloat Val(APFloat::IEEEhalf, Op->getValue());
+
+        bool lost = false;
+        APFloat::opStatus status =
+          Val.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &lost);
+
+        // Conversion is always precise.
+        (void)status;
+        assert(status == APFloat::opOK && !lost &&
+               "Precision lost during fp16 constfolding");
+
+        return ConstantFP::get(F->getContext(), Val);
+      }
+      default:
+        return 0;
+      }
+    }
+
+    // Support ConstantVector in case we have an Undef in the top.
+    if (isa<ConstantVector>(Operands[0]) ||
+        isa<ConstantDataVector>(Operands[0])) {
+      Constant *Op = cast<Constant>(Operands[0]);
+      switch (F->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::x86_sse_cvtss2si:
+      case Intrinsic::x86_sse_cvtss2si64:
+      case Intrinsic::x86_sse2_cvtsd2si:
+      case Intrinsic::x86_sse2_cvtsd2si64:
+        if (ConstantFP *FPOp =
+              dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+          return ConstantFoldConvertToInt(FPOp->getValueAPF(),
+                                          /*roundTowardZero=*/false, Ty);
+      case Intrinsic::x86_sse_cvttss2si:
+      case Intrinsic::x86_sse_cvttss2si64:
+      case Intrinsic::x86_sse2_cvttsd2si:
+      case Intrinsic::x86_sse2_cvttsd2si64:
+        if (ConstantFP *FPOp =
+              dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
+          return ConstantFoldConvertToInt(FPOp->getValueAPF(),
+                                          /*roundTowardZero=*/true, Ty);
+      }
+    }
+
+    if (isa<UndefValue>(Operands[0])) {
+      if (F->getIntrinsicID() == Intrinsic::bswap)
+        return Operands[0];
+      return 0;
+    }
+
+    return 0;
+  }
+
+  if (Operands.size() == 2) {
+    if (ConstantFP *Op1 = dyn_cast<ConstantFP>(Operands[0])) {
+      if (!Ty->isHalfTy() && !Ty->isFloatTy() && !Ty->isDoubleTy())
+        return 0;
+      double Op1V;
+      if (Ty->isFloatTy())
+        Op1V = Op1->getValueAPF().convertToFloat();
+      else if (Ty->isDoubleTy())
+        Op1V = Op1->getValueAPF().convertToDouble();
+      else {
+        bool unused;
+        APFloat APF = Op1->getValueAPF();
+        APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &unused);
+        Op1V = APF.convertToDouble();
+      }
+
+      if (ConstantFP *Op2 = dyn_cast<ConstantFP>(Operands[1])) {
+        if (Op2->getType() != Op1->getType())
+          return 0;
+
+        double Op2V;
+        if (Ty->isFloatTy())
+          Op2V = Op2->getValueAPF().convertToFloat();
+        else if (Ty->isDoubleTy())
+          Op2V = Op2->getValueAPF().convertToDouble();
+        else {
+          bool unused;
+          APFloat APF = Op2->getValueAPF();
+          APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &unused);
+          Op2V = APF.convertToDouble();
+        }
+
+        if (F->getIntrinsicID() == Intrinsic::pow) {
+          return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
+        }
+        if (!TLI)
+          return 0;
+        if (Name == "pow" && TLI->has(LibFunc::pow))
+          return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
+        if (Name == "fmod" && TLI->has(LibFunc::fmod))
+          return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty);
+        if (Name == "atan2" && TLI->has(LibFunc::atan2))
+          return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
+      } else if (ConstantInt *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
+        if (F->getIntrinsicID() == Intrinsic::powi && Ty->isHalfTy())
+          return ConstantFP::get(F->getContext(),
+                                 APFloat((float)std::pow((float)Op1V,
+                                                 (int)Op2C->getZExtValue())));
+        if (F->getIntrinsicID() == Intrinsic::powi && Ty->isFloatTy())
+          return ConstantFP::get(F->getContext(),
+                                 APFloat((float)std::pow((float)Op1V,
+                                                 (int)Op2C->getZExtValue())));
+        if (F->getIntrinsicID() == Intrinsic::powi && Ty->isDoubleTy())
+          return ConstantFP::get(F->getContext(),
+                                 APFloat((double)std::pow((double)Op1V,
+                                                   (int)Op2C->getZExtValue())));
+      }
+      return 0;
+    }
+
+    if (ConstantInt *Op1 = dyn_cast<ConstantInt>(Operands[0])) {
+      if (ConstantInt *Op2 = dyn_cast<ConstantInt>(Operands[1])) {
+        switch (F->getIntrinsicID()) {
+        default: break;
+        case Intrinsic::sadd_with_overflow:
+        case Intrinsic::uadd_with_overflow:
+        case Intrinsic::ssub_with_overflow:
+        case Intrinsic::usub_with_overflow:
+        case Intrinsic::smul_with_overflow:
+        case Intrinsic::umul_with_overflow: {
+          APInt Res;
+          bool Overflow;
+          switch (F->getIntrinsicID()) {
+          default: llvm_unreachable("Invalid case");
+          case Intrinsic::sadd_with_overflow:
+            Res = Op1->getValue().sadd_ov(Op2->getValue(), Overflow);
+            break;
+          case Intrinsic::uadd_with_overflow:
+            Res = Op1->getValue().uadd_ov(Op2->getValue(), Overflow);
+            break;
+          case Intrinsic::ssub_with_overflow:
+            Res = Op1->getValue().ssub_ov(Op2->getValue(), Overflow);
+            break;
+          case Intrinsic::usub_with_overflow:
+            Res = Op1->getValue().usub_ov(Op2->getValue(), Overflow);
+            break;
+          case Intrinsic::smul_with_overflow:
+            Res = Op1->getValue().smul_ov(Op2->getValue(), Overflow);
+            break;
+          case Intrinsic::umul_with_overflow:
+            Res = Op1->getValue().umul_ov(Op2->getValue(), Overflow);
+            break;
+          }
+          Constant *Ops[] = {
+            ConstantInt::get(F->getContext(), Res),
+            ConstantInt::get(Type::getInt1Ty(F->getContext()), Overflow)
+          };
+          return ConstantStruct::get(cast<StructType>(F->getReturnType()), Ops);
+        }
+        case Intrinsic::cttz:
+          if (Op2->isOne() && Op1->isZero()) // cttz(0, 1) is undef.
+            return UndefValue::get(Ty);
+          return ConstantInt::get(Ty, Op1->getValue().countTrailingZeros());
+        case Intrinsic::ctlz:
+          if (Op2->isOne() && Op1->isZero()) // ctlz(0, 1) is undef.
+            return UndefValue::get(Ty);
+          return ConstantInt::get(Ty, Op1->getValue().countLeadingZeros());
+        }
+      }
+
+      return 0;
+    }
+    return 0;
+  }
+  return 0;
+}
diff --git a/contrib/llvm/lib/Analysis/CostModel.cpp b/contrib/llvm/lib/Analysis/CostModel.cpp
new file mode 100644
index 000000000000..98a7780ad9a6
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/CostModel.cpp
@@ -0,0 +1,247 @@
+//===- CostModel.cpp ------ Cost Model Analysis ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the cost model analysis. It provides a very basic cost
+// estimation for LLVM-IR. This analysis uses the services of the codegen
+// to approximate the cost of any IR instruction when lowered to machine
+// instructions. The cost results are unit-less and the cost number represents
+// the throughput of the machine assuming that all loads hit the cache, all
+// branches are predicted, etc. The cost numbers can be added in order to
+// compare two or more transformation alternatives.
+//
+//===----------------------------------------------------------------------===//
+
+#define CM_NAME "cost-model"
+#define DEBUG_TYPE CM_NAME
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+  class CostModelAnalysis : public FunctionPass {
+
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    CostModelAnalysis() : FunctionPass(ID), F(0), TTI(0) {
+      initializeCostModelAnalysisPass(
+        *PassRegistry::getPassRegistry());
+    }
+
+    /// Returns the expected cost of the instruction.
+    /// Returns -1 if the cost is unknown.
+    /// Note, this method does not cache the cost calculation and it
+    /// can be expensive in some cases.
+    unsigned getInstructionCost(const Instruction *I) const;
+
+  private:
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool runOnFunction(Function &F);
+    virtual void print(raw_ostream &OS, const Module*) const;
+
+    /// The function that we analyze.
+    Function *F;
+    /// Target information.
+    const TargetTransformInfo *TTI;
+  };
+}  // End of anonymous namespace
+
+// Register this pass.
+char CostModelAnalysis::ID = 0;
+static const char cm_name[] = "Cost Model Analysis";
+INITIALIZE_PASS_BEGIN(CostModelAnalysis, CM_NAME, cm_name, false, true)
+INITIALIZE_PASS_END  (CostModelAnalysis, CM_NAME, cm_name, false, true)
+
+FunctionPass *llvm::createCostModelAnalysisPass() {
+  return new CostModelAnalysis();
+}
+
+void
+CostModelAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+}
+
+bool
+CostModelAnalysis::runOnFunction(Function &F) {
+ this->F = &F;
+ TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+
+ return false;
+}
+
+static bool isReverseVectorMask(SmallVector<int, 16> &Mask) {
+  for (unsigned i = 0, MaskSize = Mask.size(); i < MaskSize; ++i)
+    if (Mask[i] > 0 && Mask[i] != (int)(MaskSize - 1 - i))
+      return false;
+  return true;
+}
+
+static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
+  TargetTransformInfo::OperandValueKind OpInfo =
+    TargetTransformInfo::OK_AnyValue;
+
+  // Check for a splat of a constant.
+  ConstantDataVector *CDV = 0;
+  if ((CDV = dyn_cast<ConstantDataVector>(V)))
+    if (CDV->getSplatValue() != NULL)
+      OpInfo = TargetTransformInfo::OK_UniformConstantValue;
+  ConstantVector *CV = 0;
+  if ((CV = dyn_cast<ConstantVector>(V)))
+    if (CV->getSplatValue() != NULL)
+      OpInfo = TargetTransformInfo::OK_UniformConstantValue;
+
+  return OpInfo;
+}
+
+unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
+  if (!TTI)
+    return -1;
+
+  switch (I->getOpcode()) {
+  case Instruction::GetElementPtr:{
+    Type *ValTy = I->getOperand(0)->getType()->getPointerElementType();
+    return TTI->getAddressComputationCost(ValTy);
+  }
+
+  case Instruction::Ret:
+  case Instruction::PHI:
+  case Instruction::Br: {
+    return TTI->getCFInstrCost(I->getOpcode());
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+      getOperandInfo(I->getOperand(0));
+    TargetTransformInfo::OperandValueKind Op2VK =
+      getOperandInfo(I->getOperand(1));
+    return TTI->getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK,
+                                       Op2VK);
+  }
+  case Instruction::Select: {
+    const SelectInst *SI = cast<SelectInst>(I);
+    Type *CondTy = SI->getCondition()->getType();
+    return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *ValTy = I->getOperand(0)->getType();
+    return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy);
+  }
+  case Instruction::Store: {
+    const StoreInst *SI = cast<StoreInst>(I);
+    Type *ValTy = SI->getValueOperand()->getType();
+    return TTI->getMemoryOpCost(I->getOpcode(), ValTy,
+                                 SI->getAlignment(),
+                                 SI->getPointerAddressSpace());
+  }
+  case Instruction::Load: {
+    const LoadInst *LI = cast<LoadInst>(I);
+    return TTI->getMemoryOpCost(I->getOpcode(), I->getType(),
+                                 LI->getAlignment(),
+                                 LI->getPointerAddressSpace());
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = I->getOperand(0)->getType();
+    return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy);
+  }
+  case Instruction::ExtractElement: {
+    const ExtractElementInst * EEI = cast<ExtractElementInst>(I);
+    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
+    unsigned Idx = -1;
+    if (CI)
+      Idx = CI->getZExtValue();
+    return TTI->getVectorInstrCost(I->getOpcode(),
+                                   EEI->getOperand(0)->getType(), Idx);
+  }
+  case Instruction::InsertElement: {
+      const InsertElementInst * IE = cast<InsertElementInst>(I);
+      ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
+      unsigned Idx = -1;
+      if (CI)
+        Idx = CI->getZExtValue();
+      return TTI->getVectorInstrCost(I->getOpcode(),
+                                     IE->getType(), Idx);
+    }
+  case Instruction::ShuffleVector: {
+    const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
+    Type *VecTypOp0 = Shuffle->getOperand(0)->getType();
+    unsigned NumVecElems = VecTypOp0->getVectorNumElements();
+    SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
+
+    if (NumVecElems == Mask.size() && isReverseVectorMask(Mask))
+      return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0, 0,
+                                 0);
+    return -1;
+  }
+  case Instruction::Call:
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      SmallVector<Type*, 4> Tys;
+      for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
+        Tys.push_back(II->getArgOperand(J)->getType());
+
+      return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
+                                        Tys);
+    }
+    return -1;
+  default:
+    // We don't have any information on this instruction.
+    return -1;
+  }
+}
+
+void CostModelAnalysis::print(raw_ostream &OS, const Module*) const {
+  if (!F)
+    return;
+
+  for (Function::iterator B = F->begin(), BE = F->end(); B != BE; ++B) {
+    for (BasicBlock::iterator it = B->begin(), e = B->end(); it != e; ++it) {
+      Instruction *Inst = it;
+      unsigned Cost = getInstructionCost(Inst);
+      if (Cost != (unsigned)-1)
+        OS << "Cost Model: Found an estimated cost of " << Cost;
+      else
+        OS << "Cost Model: Unknown cost";
+
+      OS << " for instruction: "<< *Inst << "\n";
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp
new file mode 100644
index 000000000000..cbc71bd6e739
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -0,0 +1,3838 @@
+//===-- DependenceAnalysis.cpp - DA Implementation --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// DependenceAnalysis is an LLVM pass that analyses dependences between memory
+// accesses. Currently, it is an (incomplete) implementation of the approach
+// described in
+//
+//            Practical Dependence Testing
+//            Goff, Kennedy, Tseng
+//            PLDI 1991
+//
+// There's a single entry point that analyzes the dependence between a pair
+// of memory references in a function, returning either NULL, for no dependence,
+// or a more-or-less detailed description of the dependence between them.
+//
+// Currently, the implementation cannot propagate constraints between
+// coupled RDIV subscripts and lacks a multi-subscript MIV test.
+// Both of these are conservative weaknesses;
+// that is, not a source of correctness problems.
+//
+// The implementation depends on the GEP instruction to
+// differentiate subscripts. Since Clang linearizes subscripts
+// for most arrays, we give up some precision (though the existing MIV tests
+// will help). We trust that the GEP instruction will eventually be extended.
+// In the meantime, we should explore Maslov's ideas about delinearization.
+//
+// We should pay some careful attention to the possibility of integer overflow
+// in the implementation of the various tests. This could happen with Add,
+// Subtract, or Multiply, with both APInt's and SCEV's.
+//
+// Some non-linear subscript pairs can be handled by the GCD test
+// (and perhaps other tests).
+// Should explore how often these things occur.
+//
+// Finally, it seems like certain test cases expose weaknesses in the SCEV
+// simplification, especially in the handling of sign and zero extensions.
+// It could be useful to spend time exploring these.
+//
+// Please note that this is work in progress and the interface is subject to
+// change.
+//
+//===----------------------------------------------------------------------===//
+//                                                                            //
+//                   In memory of Ken Kennedy, 1945 - 2007                    //
+//                                                                            //
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "da"
+
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// statistics
+
+STATISTIC(TotalArrayPairs, "Array pairs tested");
+STATISTIC(SeparableSubscriptPairs, "Separable subscript pairs");
+STATISTIC(CoupledSubscriptPairs, "Coupled subscript pairs");
+STATISTIC(NonlinearSubscriptPairs, "Nonlinear subscript pairs");
+STATISTIC(ZIVapplications, "ZIV applications");
+STATISTIC(ZIVindependence, "ZIV independence");
+STATISTIC(StrongSIVapplications, "Strong SIV applications");
+STATISTIC(StrongSIVsuccesses, "Strong SIV successes");
+STATISTIC(StrongSIVindependence, "Strong SIV independence");
+STATISTIC(WeakCrossingSIVapplications, "Weak-Crossing SIV applications");
+STATISTIC(WeakCrossingSIVsuccesses, "Weak-Crossing SIV successes");
+STATISTIC(WeakCrossingSIVindependence, "Weak-Crossing SIV independence");
+STATISTIC(ExactSIVapplications, "Exact SIV applications");
+STATISTIC(ExactSIVsuccesses, "Exact SIV successes");
+STATISTIC(ExactSIVindependence, "Exact SIV independence");
+STATISTIC(WeakZeroSIVapplications, "Weak-Zero SIV applications");
+STATISTIC(WeakZeroSIVsuccesses, "Weak-Zero SIV successes");
+STATISTIC(WeakZeroSIVindependence, "Weak-Zero SIV independence");
+STATISTIC(ExactRDIVapplications, "Exact RDIV applications");
+STATISTIC(ExactRDIVindependence, "Exact RDIV independence");
+STATISTIC(SymbolicRDIVapplications, "Symbolic RDIV applications");
+STATISTIC(SymbolicRDIVindependence, "Symbolic RDIV independence");
+STATISTIC(DeltaApplications, "Delta applications");
+STATISTIC(DeltaSuccesses, "Delta successes");
+STATISTIC(DeltaIndependence, "Delta independence");
+STATISTIC(DeltaPropagations, "Delta propagations");
+STATISTIC(GCDapplications, "GCD applications");
+STATISTIC(GCDsuccesses, "GCD successes");
+STATISTIC(GCDindependence, "GCD independence");
+STATISTIC(BanerjeeApplications, "Banerjee applications");
+STATISTIC(BanerjeeIndependence, "Banerjee independence");
+STATISTIC(BanerjeeSuccesses, "Banerjee successes");
+
+//===----------------------------------------------------------------------===//
+// basics
+
+INITIALIZE_PASS_BEGIN(DependenceAnalysis, "da",
+                      "Dependence Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(DependenceAnalysis, "da",
+                    "Dependence Analysis", true, true)
+
+char DependenceAnalysis::ID = 0;
+
+
+FunctionPass *llvm::createDependenceAnalysisPass() {
+  return new DependenceAnalysis();
+}
+
+
+bool DependenceAnalysis::runOnFunction(Function &F) {
+  this->F = &F;
+  AA = &getAnalysis<AliasAnalysis>();
+  SE = &getAnalysis<ScalarEvolution>();
+  LI = &getAnalysis<LoopInfo>();
+  return false;
+}
+
+
+void DependenceAnalysis::releaseMemory() {
+}
+
+
+void DependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequiredTransitive<AliasAnalysis>();
+  AU.addRequiredTransitive<ScalarEvolution>();
+  AU.addRequiredTransitive<LoopInfo>();
+}
+
+
+// Used to test the dependence analyzer.
+// Looks through the function, noting loads and stores.
+// Calls depends() on every possible pair and prints out the result.
+// Ignores all other instructions.
+static
+void dumpExampleDependence(raw_ostream &OS, Function *F,
+                           DependenceAnalysis *DA) {
+  for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F);
+       SrcI != SrcE; ++SrcI) {
+    if (isa<StoreInst>(*SrcI) || isa<LoadInst>(*SrcI)) {
+      for (inst_iterator DstI = SrcI, DstE = inst_end(F);
+           DstI != DstE; ++DstI) {
+        if (isa<StoreInst>(*DstI) || isa<LoadInst>(*DstI)) {
+          OS << "da analyze - ";
+          if (Dependence *D = DA->depends(&*SrcI, &*DstI, true)) {
+            D->dump(OS);
+            for (unsigned Level = 1; Level <= D->getLevels(); Level++) {
+              if (D->isSplitable(Level)) {
+                OS << "da analyze - split level = " << Level;
+                OS << ", iteration = " << *DA->getSplitIteration(D, Level);
+                OS << "!\n";
+              }
+            }
+            delete D;
+          }
+          else
+            OS << "none!\n";
+        }
+      }
+    }
+  }
+}
+
+
+void DependenceAnalysis::print(raw_ostream &OS, const Module*) const {
+  dumpExampleDependence(OS, F, const_cast<DependenceAnalysis *>(this));
+}
+
+//===----------------------------------------------------------------------===//
+// Dependence methods
+
+// Returns true if this is an input dependence.
+bool Dependence::isInput() const {
+  return Src->mayReadFromMemory() && Dst->mayReadFromMemory();
+}
+
+
+// Returns true if this is an output dependence.
+bool Dependence::isOutput() const {
+  return Src->mayWriteToMemory() && Dst->mayWriteToMemory();
+}
+
+
+// Returns true if this is an flow (aka true)  dependence.
+bool Dependence::isFlow() const {
+  return Src->mayWriteToMemory() && Dst->mayReadFromMemory();
+}
+
+
+// Returns true if this is an anti dependence.
+bool Dependence::isAnti() const {
+  return Src->mayReadFromMemory() && Dst->mayWriteToMemory();
+}
+
+
+// Returns true if a particular level is scalar; that is,
+// if no subscript in the source or destination mention the induction
+// variable associated with the loop at this level.
+// Leave this out of line, so it will serve as a virtual method anchor
+bool Dependence::isScalar(unsigned level) const {
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FullDependence methods
+
+FullDependence::FullDependence(Instruction *Source,
+                               Instruction *Destination,
+                               bool PossiblyLoopIndependent,
+                               unsigned CommonLevels) :
+  Dependence(Source, Destination),
+  Levels(CommonLevels),
+  LoopIndependent(PossiblyLoopIndependent) {
+  Consistent = true;
+  DV = CommonLevels ? new DVEntry[CommonLevels] : NULL;
+}
+
+// The rest are simple getters that hide the implementation.
+
+// getDirection - Returns the direction associated with a particular level.
+unsigned FullDependence::getDirection(unsigned Level) const {
+  assert(0 < Level && Level <= Levels && "Level out of range");
+  return DV[Level - 1].Direction;
+}
+
+
+// Returns the distance (or NULL) associated with a particular level.
+const SCEV *FullDependence::getDistance(unsigned Level) const {
+  assert(0 < Level && Level <= Levels && "Level out of range");
+  return DV[Level - 1].Distance;
+}
+
+
+// Returns true if a particular level is scalar; that is,
+// if no subscript in the source or destination mention the induction
+// variable associated with the loop at this level.
+bool FullDependence::isScalar(unsigned Level) const {
+  assert(0 < Level && Level <= Levels && "Level out of range");
+  return DV[Level - 1].Scalar;
+}
+
+
+// Returns true if peeling the first iteration from this loop
+// will break this dependence.
+bool FullDependence::isPeelFirst(unsigned Level) const {
+  assert(0 < Level && Level <= Levels && "Level out of range");
+  return DV[Level - 1].PeelFirst;
+}
+
+
+// Returns true if peeling the last iteration from this loop
+// will break this dependence.
+bool FullDependence::isPeelLast(unsigned Level) const {
+  assert(0 < Level && Level <= Levels && "Level out of range");
+  return DV[Level - 1].PeelLast;
+}
+
+
+// Returns true if splitting this loop will break the dependence.
+bool FullDependence::isSplitable(unsigned Level) const {
+  assert(0 < Level && Level <= Levels && "Level out of range");
+  return DV[Level - 1].Splitable;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DependenceAnalysis::Constraint methods
+
+// If constraint is a point <X, Y>, returns X.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getX() const {
+  assert(Kind == Point && "Kind should be Point");
+  return A;
+}
+
+
+// If constraint is a point <X, Y>, returns Y.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getY() const {
+  assert(Kind == Point && "Kind should be Point");
+  return B;
+}
+
+
+// If constraint is a line AX + BY = C, returns A.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getA() const {
+  assert((Kind == Line || Kind == Distance) &&
+         "Kind should be Line (or Distance)");
+  return A;
+}
+
+
+// If constraint is a line AX + BY = C, returns B.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getB() const {
+  assert((Kind == Line || Kind == Distance) &&
+         "Kind should be Line (or Distance)");
+  return B;
+}
+
+
+// If constraint is a line AX + BY = C, returns C.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getC() const {
+  assert((Kind == Line || Kind == Distance) &&
+         "Kind should be Line (or Distance)");
+  return C;
+}
+
+
+// If constraint is a distance, returns D.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getD() const {
+  assert(Kind == Distance && "Kind should be Distance");
+  return SE->getNegativeSCEV(C);
+}
+
+
+// Returns the loop associated with this constraint.
+const Loop *DependenceAnalysis::Constraint::getAssociatedLoop() const {
+  assert((Kind == Distance || Kind == Line || Kind == Point) &&
+         "Kind should be Distance, Line, or Point");
+  return AssociatedLoop;
+}
+
+
+void DependenceAnalysis::Constraint::setPoint(const SCEV *X,
+                                              const SCEV *Y,
+                                              const Loop *CurLoop) {
+  Kind = Point;
+  A = X;
+  B = Y;
+  AssociatedLoop = CurLoop;
+}
+
+
+void DependenceAnalysis::Constraint::setLine(const SCEV *AA,
+                                             const SCEV *BB,
+                                             const SCEV *CC,
+                                             const Loop *CurLoop) {
+  Kind = Line;
+  A = AA;
+  B = BB;
+  C = CC;
+  AssociatedLoop = CurLoop;
+}
+
+
+void DependenceAnalysis::Constraint::setDistance(const SCEV *D,
+                                                 const Loop *CurLoop) {
+  Kind = Distance;
+  A = SE->getConstant(D->getType(), 1);
+  B = SE->getNegativeSCEV(A);
+  C = SE->getNegativeSCEV(D);
+  AssociatedLoop = CurLoop;
+}
+
+
+void DependenceAnalysis::Constraint::setEmpty() {
+  Kind = Empty;
+}
+
+
+void DependenceAnalysis::Constraint::setAny(ScalarEvolution *NewSE) {
+  SE = NewSE;
+  Kind = Any;
+}
+
+
+// For debugging purposes. Dumps the constraint out to OS.
+void DependenceAnalysis::Constraint::dump(raw_ostream &OS) const {
+  if (isEmpty())
+    OS << " Empty\n";
+  else if (isAny())
+    OS << " Any\n";
+  else if (isPoint())
+    OS << " Point is <" << *getX() << ", " << *getY() << ">\n";
+  else if (isDistance())
+    OS << " Distance is " << *getD() <<
+      " (" << *getA() << "*X + " << *getB() << "*Y = " << *getC() << ")\n";
+  else if (isLine())
+    OS << " Line is " << *getA() << "*X + " <<
+      *getB() << "*Y = " << *getC() << "\n";
+  else
+    llvm_unreachable("unknown constraint type in Constraint::dump");
+}
+
+
+// Updates X with the intersection
+// of the Constraints X and Y. Returns true if X has changed.
+// Corresponds to Figure 4 from the paper
+//
+//            Practical Dependence Testing
+//            Goff, Kennedy, Tseng
+//            PLDI 1991
+bool DependenceAnalysis::intersectConstraints(Constraint *X,
+                                              const Constraint *Y) {
+  ++DeltaApplications;
+  DEBUG(dbgs() << "\tintersect constraints\n");
+  DEBUG(dbgs() << "\t    X ="; X->dump(dbgs()));
+  DEBUG(dbgs() << "\t    Y ="; Y->dump(dbgs()));
+  assert(!Y->isPoint() && "Y must not be a Point");
+  if (X->isAny()) {
+    if (Y->isAny())
+      return false;
+    *X = *Y;
+    return true;
+  }
+  if (X->isEmpty())
+    return false;
+  if (Y->isEmpty()) {
+    X->setEmpty();
+    return true;
+  }
+
+  if (X->isDistance() && Y->isDistance()) {
+    DEBUG(dbgs() << "\t    intersect 2 distances\n");
+    if (isKnownPredicate(CmpInst::ICMP_EQ, X->getD(), Y->getD()))
+      return false;
+    if (isKnownPredicate(CmpInst::ICMP_NE, X->getD(), Y->getD())) {
+      X->setEmpty();
+      ++DeltaSuccesses;
+      return true;
+    }
+    // Hmmm, interesting situation.
+    // I guess if either is constant, keep it and ignore the other.
+    if (isa<SCEVConstant>(Y->getD())) {
+      *X = *Y;
+      return true;
+    }
+    return false;
+  }
+
+  // At this point, the pseudo-code in Figure 4 of the paper
+  // checks if (X->isPoint() && Y->isPoint()).
+  // This case can't occur in our implementation,
+  // since a Point can only arise as the result of intersecting
+  // two Line constraints, and the right-hand value, Y, is never
+  // the result of an intersection.
+  assert(!(X->isPoint() && Y->isPoint()) &&
+         "We shouldn't ever see X->isPoint() && Y->isPoint()");
+
+  if (X->isLine() && Y->isLine()) {
+    DEBUG(dbgs() << "\t    intersect 2 lines\n");
+    const SCEV *Prod1 = SE->getMulExpr(X->getA(), Y->getB());
+    const SCEV *Prod2 = SE->getMulExpr(X->getB(), Y->getA());
+    if (isKnownPredicate(CmpInst::ICMP_EQ, Prod1, Prod2)) {
+      // slopes are equal, so lines are parallel
+      DEBUG(dbgs() << "\t\tsame slope\n");
+      Prod1 = SE->getMulExpr(X->getC(), Y->getB());
+      Prod2 = SE->getMulExpr(X->getB(), Y->getC());
+      if (isKnownPredicate(CmpInst::ICMP_EQ, Prod1, Prod2))
+        return false;
+      if (isKnownPredicate(CmpInst::ICMP_NE, Prod1, Prod2)) {
+        X->setEmpty();
+        ++DeltaSuccesses;
+        return true;
+      }
+      return false;
+    }
+    if (isKnownPredicate(CmpInst::ICMP_NE, Prod1, Prod2)) {
+      // slopes differ, so lines intersect
+      DEBUG(dbgs() << "\t\tdifferent slopes\n");
+      const SCEV *C1B2 = SE->getMulExpr(X->getC(), Y->getB());
+      const SCEV *C1A2 = SE->getMulExpr(X->getC(), Y->getA());
+      const SCEV *C2B1 = SE->getMulExpr(Y->getC(), X->getB());
+      const SCEV *C2A1 = SE->getMulExpr(Y->getC(), X->getA());
+      const SCEV *A1B2 = SE->getMulExpr(X->getA(), Y->getB());
+      const SCEV *A2B1 = SE->getMulExpr(Y->getA(), X->getB());
+      const SCEVConstant *C1A2_C2A1 =
+        dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1));
+      const SCEVConstant *C1B2_C2B1 =
+        dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1));
+      const SCEVConstant *A1B2_A2B1 =
+        dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1));
+      const SCEVConstant *A2B1_A1B2 =
+        dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2));
+      if (!C1B2_C2B1 || !C1A2_C2A1 ||
+          !A1B2_A2B1 || !A2B1_A1B2)
+        return false;
+      APInt Xtop = C1B2_C2B1->getValue()->getValue();
+      APInt Xbot = A1B2_A2B1->getValue()->getValue();
+      APInt Ytop = C1A2_C2A1->getValue()->getValue();
+      APInt Ybot = A2B1_A1B2->getValue()->getValue();
+      DEBUG(dbgs() << "\t\tXtop = " << Xtop << "\n");
+      DEBUG(dbgs() << "\t\tXbot = " << Xbot << "\n");
+      DEBUG(dbgs() << "\t\tYtop = " << Ytop << "\n");
+      DEBUG(dbgs() << "\t\tYbot = " << Ybot << "\n");
+      APInt Xq = Xtop; // these need to be initialized, even
+      APInt Xr = Xtop; // though they're just going to be overwritten
+      APInt::sdivrem(Xtop, Xbot, Xq, Xr);
+      APInt Yq = Ytop;
+      APInt Yr = Ytop;;
+      APInt::sdivrem(Ytop, Ybot, Yq, Yr);
+      if (Xr != 0 || Yr != 0) {
+        X->setEmpty();
+        ++DeltaSuccesses;
+        return true;
+      }
+      DEBUG(dbgs() << "\t\tX = " << Xq << ", Y = " << Yq << "\n");
+      if (Xq.slt(0) || Yq.slt(0)) {
+        X->setEmpty();
+        ++DeltaSuccesses;
+        return true;
+      }
+      if (const SCEVConstant *CUB =
+          collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) {
+        APInt UpperBound = CUB->getValue()->getValue();
+        DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n");
+        if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) {
+          X->setEmpty();
+          ++DeltaSuccesses;
+          return true;
+        }
+      }
+      X->setPoint(SE->getConstant(Xq),
+                  SE->getConstant(Yq),
+                  X->getAssociatedLoop());
+      ++DeltaSuccesses;
+      return true;
+    }
+    return false;
+  }
+
+  // if (X->isLine() && Y->isPoint()) This case can't occur.
+  assert(!(X->isLine() && Y->isPoint()) && "This case should never occur");
+
+  if (X->isPoint() && Y->isLine()) {
+    DEBUG(dbgs() << "\t    intersect Point and Line\n");
+    const SCEV *A1X1 = SE->getMulExpr(Y->getA(), X->getX());
+    const SCEV *B1Y1 = SE->getMulExpr(Y->getB(), X->getY());
+    const SCEV *Sum = SE->getAddExpr(A1X1, B1Y1);
+    if (isKnownPredicate(CmpInst::ICMP_EQ, Sum, Y->getC()))
+      return false;
+    if (isKnownPredicate(CmpInst::ICMP_NE, Sum, Y->getC())) {
+      X->setEmpty();
+      ++DeltaSuccesses;
+      return true;
+    }
+    return false;
+  }
+
+  llvm_unreachable("shouldn't reach the end of Constraint intersection");
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DependenceAnalysis methods
+
+// For debugging purposes. Dumps a dependence to OS.
+void Dependence::dump(raw_ostream &OS) const {
+  bool Splitable = false;
+  if (isConfused())
+    OS << "confused";
+  else {
+    if (isConsistent())
+      OS << "consistent ";
+    if (isFlow())
+      OS << "flow";
+    else if (isOutput())
+      OS << "output";
+    else if (isAnti())
+      OS << "anti";
+    else if (isInput())
+      OS << "input";
+    unsigned Levels = getLevels();
+    OS << " [";
+    for (unsigned II = 1; II <= Levels; ++II) {
+      if (isSplitable(II))
+        Splitable = true;
+      if (isPeelFirst(II))
+        OS << 'p';
+      const SCEV *Distance = getDistance(II);
+      if (Distance)
+        OS << *Distance;
+      else if (isScalar(II))
+        OS << "S";
+      else {
+        unsigned Direction = getDirection(II);
+        if (Direction == DVEntry::ALL)
+          OS << "*";
+        else {
+          if (Direction & DVEntry::LT)
+            OS << "<";
+          if (Direction & DVEntry::EQ)
+            OS << "=";
+          if (Direction & DVEntry::GT)
+            OS << ">";
+        }
+      }
+      if (isPeelLast(II))
+        OS << 'p';
+      if (II < Levels)
+        OS << " ";
+    }
+    if (isLoopIndependent())
+      OS << "|<";
+    OS << "]";
+    if (Splitable)
+      OS << " splitable";
+  }
+  OS << "!\n";
+}
+
+
+
+static
+AliasAnalysis::AliasResult underlyingObjectsAlias(AliasAnalysis *AA,
+                                                  const Value *A,
+                                                  const Value *B) {
+  const Value *AObj = GetUnderlyingObject(A);
+  const Value *BObj = GetUnderlyingObject(B);
+  return AA->alias(AObj, AA->getTypeStoreSize(AObj->getType()),
+                   BObj, AA->getTypeStoreSize(BObj->getType()));
+}
+
+
+// Returns true if the load or store can be analyzed. Atomic and volatile
+// operations have properties which this analysis does not understand.
+static
+bool isLoadOrStore(const Instruction *I) {
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isUnordered();
+  else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isUnordered();
+  return false;
+}
+
+
+static
+Value *getPointerOperand(Instruction *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  llvm_unreachable("Value is not load or store instruction");
+  return 0;
+}
+
+
+// Examines the loop nesting of the Src and Dst
+// instructions and establishes their shared loops. Sets the variables
+// CommonLevels, SrcLevels, and MaxLevels.
+// The source and destination instructions needn't be contained in the same
+// loop. The routine establishNestingLevels finds the level of most deeply
+// nested loop that contains them both, CommonLevels. An instruction that's
+// not contained in a loop is at level = 0. MaxLevels is equal to the level
+// of the source plus the level of the destination, minus CommonLevels.
+// This lets us allocate vectors MaxLevels in length, with room for every
+// distinct loop referenced in both the source and destination subscripts.
+// The variable SrcLevels is the nesting depth of the source instruction.
+// It's used to help calculate distinct loops referenced by the destination.
+// Here's the map from loops to levels:
+//            0 - unused
+//            1 - outermost common loop
+//          ... - other common loops
+// CommonLevels - innermost common loop
+//          ... - loops containing Src but not Dst
+//    SrcLevels - innermost loop containing Src but not Dst
+//          ... - loops containing Dst but not Src
+//    MaxLevels - innermost loops containing Dst but not Src
+// Consider the follow code fragment:
+//   for (a = ...) {
+//     for (b = ...) {
+//       for (c = ...) {
+//         for (d = ...) {
+//           A[] = ...;
+//         }
+//       }
+//       for (e = ...) {
+//         for (f = ...) {
+//           for (g = ...) {
+//             ... = A[];
+//           }
+//         }
+//       }
+//     }
+//   }
+// If we're looking at the possibility of a dependence between the store
+// to A (the Src) and the load from A (the Dst), we'll note that they
+// have 2 loops in common, so CommonLevels will equal 2 and the direction
+// vector for Result will have 2 entries. SrcLevels = 4 and MaxLevels = 7.
+// A map from loop names to loop numbers would look like
+//     a - 1
+//     b - 2 = CommonLevels
+//     c - 3
+//     d - 4 = SrcLevels
+//     e - 5
+//     f - 6
+//     g - 7 = MaxLevels
+void DependenceAnalysis::establishNestingLevels(const Instruction *Src,
+                                                const Instruction *Dst) {
+  const BasicBlock *SrcBlock = Src->getParent();
+  const BasicBlock *DstBlock = Dst->getParent();
+  unsigned SrcLevel = LI->getLoopDepth(SrcBlock);
+  unsigned DstLevel = LI->getLoopDepth(DstBlock);
+  const Loop *SrcLoop = LI->getLoopFor(SrcBlock);
+  const Loop *DstLoop = LI->getLoopFor(DstBlock);
+  SrcLevels = SrcLevel;
+  MaxLevels = SrcLevel + DstLevel;
+  while (SrcLevel > DstLevel) {
+    SrcLoop = SrcLoop->getParentLoop();
+    SrcLevel--;
+  }
+  while (DstLevel > SrcLevel) {
+    DstLoop = DstLoop->getParentLoop();
+    DstLevel--;
+  }
+  while (SrcLoop != DstLoop) {
+    SrcLoop = SrcLoop->getParentLoop();
+    DstLoop = DstLoop->getParentLoop();
+    SrcLevel--;
+  }
+  CommonLevels = SrcLevel;
+  MaxLevels -= CommonLevels;
+}
+
+
+// Given one of the loops containing the source, return
+// its level index in our numbering scheme.
+unsigned DependenceAnalysis::mapSrcLoop(const Loop *SrcLoop) const {
+  return SrcLoop->getLoopDepth();
+}
+
+
+// Given one of the loops containing the destination,
+// return its level index in our numbering scheme.
+unsigned DependenceAnalysis::mapDstLoop(const Loop *DstLoop) const {
+  unsigned D = DstLoop->getLoopDepth();
+  if (D > CommonLevels)
+    return D - CommonLevels + SrcLevels;
+  else
+    return D;
+}
+
+
+// Returns true if Expression is loop invariant in LoopNest.
+bool DependenceAnalysis::isLoopInvariant(const SCEV *Expression,
+                                         const Loop *LoopNest) const {
+  if (!LoopNest)
+    return true;
+  return SE->isLoopInvariant(Expression, LoopNest) &&
+    isLoopInvariant(Expression, LoopNest->getParentLoop());
+}
+
+
+
+// Finds the set of loops from the LoopNest that
+// have a level <= CommonLevels and are referred to by the SCEV Expression.
+void DependenceAnalysis::collectCommonLoops(const SCEV *Expression,
+                                            const Loop *LoopNest,
+                                            SmallBitVector &Loops) const {
+  while (LoopNest) {
+    unsigned Level = LoopNest->getLoopDepth();
+    if (Level <= CommonLevels && !SE->isLoopInvariant(Expression, LoopNest))
+      Loops.set(Level);
+    LoopNest = LoopNest->getParentLoop();
+  }
+}
+
+
+// removeMatchingExtensions - Examines a subscript pair.
+// If the source and destination are identically sign (or zero)
+// extended, it strips off the extension in an effect to simplify
+// the actual analysis.
+void DependenceAnalysis::removeMatchingExtensions(Subscript *Pair) {
+  const SCEV *Src = Pair->Src;
+  const SCEV *Dst = Pair->Dst;
+  if ((isa<SCEVZeroExtendExpr>(Src) && isa<SCEVZeroExtendExpr>(Dst)) ||
+      (isa<SCEVSignExtendExpr>(Src) && isa<SCEVSignExtendExpr>(Dst))) {
+    const SCEVCastExpr *SrcCast = cast<SCEVCastExpr>(Src);
+    const SCEVCastExpr *DstCast = cast<SCEVCastExpr>(Dst);
+    if (SrcCast->getType() == DstCast->getType()) {
+      Pair->Src = SrcCast->getOperand();
+      Pair->Dst = DstCast->getOperand();
+    }
+  }
+}
+
+
+// Examine the scev and return true iff it's linear.
+// Collect any loops mentioned in the set of "Loops".
+bool DependenceAnalysis::checkSrcSubscript(const SCEV *Src,
+                                           const Loop *LoopNest,
+                                           SmallBitVector &Loops) {
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Src);
+  if (!AddRec)
+    return isLoopInvariant(Src, LoopNest);
+  const SCEV *Start = AddRec->getStart();
+  const SCEV *Step = AddRec->getStepRecurrence(*SE);
+  if (!isLoopInvariant(Step, LoopNest))
+    return false;
+  Loops.set(mapSrcLoop(AddRec->getLoop()));
+  return checkSrcSubscript(Start, LoopNest, Loops);
+}
+
+
+
+// Examine the scev and return true iff it's linear.
+// Collect any loops mentioned in the set of "Loops".
+bool DependenceAnalysis::checkDstSubscript(const SCEV *Dst,
+                                           const Loop *LoopNest,
+                                           SmallBitVector &Loops) {
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Dst);
+  if (!AddRec)
+    return isLoopInvariant(Dst, LoopNest);
+  const SCEV *Start = AddRec->getStart();
+  const SCEV *Step = AddRec->getStepRecurrence(*SE);
+  if (!isLoopInvariant(Step, LoopNest))
+    return false;
+  Loops.set(mapDstLoop(AddRec->getLoop()));
+  return checkDstSubscript(Start, LoopNest, Loops);
+}
+
+
+// Examines the subscript pair (the Src and Dst SCEVs)
+// and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear.
+// Collects the associated loops in a set.
+DependenceAnalysis::Subscript::ClassificationKind
+DependenceAnalysis::classifyPair(const SCEV *Src, const Loop *SrcLoopNest,
+                                 const SCEV *Dst, const Loop *DstLoopNest,
+                                 SmallBitVector &Loops) {
+  SmallBitVector SrcLoops(MaxLevels + 1);
+  SmallBitVector DstLoops(MaxLevels + 1);
+  if (!checkSrcSubscript(Src, SrcLoopNest, SrcLoops))
+    return Subscript::NonLinear;
+  if (!checkDstSubscript(Dst, DstLoopNest, DstLoops))
+    return Subscript::NonLinear;
+  Loops = SrcLoops;
+  Loops |= DstLoops;
+  unsigned N = Loops.count();
+  if (N == 0)
+    return Subscript::ZIV;
+  if (N == 1)
+    return Subscript::SIV;
+  if (N == 2 && (SrcLoops.count() == 0 ||
+                 DstLoops.count() == 0 ||
+                 (SrcLoops.count() == 1 && DstLoops.count() == 1)))
+    return Subscript::RDIV;
+  return Subscript::MIV;
+}
+
+
+// A wrapper around SCEV::isKnownPredicate.
+// Looks for cases where we're interested in comparing for equality.
+// If both X and Y have been identically sign or zero extended,
+// it strips off the (confusing) extensions before invoking
+// SCEV::isKnownPredicate. Perhaps, someday, the ScalarEvolution package
+// will be similarly updated.
+//
+// If SCEV::isKnownPredicate can't prove the predicate,
+// we try simple subtraction, which seems to help in some cases
+// involving symbolics.
+bool DependenceAnalysis::isKnownPredicate(ICmpInst::Predicate Pred,
+                                          const SCEV *X,
+                                          const SCEV *Y) const {
+  if (Pred == CmpInst::ICMP_EQ ||
+      Pred == CmpInst::ICMP_NE) {
+    if ((isa<SCEVSignExtendExpr>(X) &&
+         isa<SCEVSignExtendExpr>(Y)) ||
+        (isa<SCEVZeroExtendExpr>(X) &&
+         isa<SCEVZeroExtendExpr>(Y))) {
+      const SCEVCastExpr *CX = cast<SCEVCastExpr>(X);
+      const SCEVCastExpr *CY = cast<SCEVCastExpr>(Y);
+      const SCEV *Xop = CX->getOperand();
+      const SCEV *Yop = CY->getOperand();
+      if (Xop->getType() == Yop->getType()) {
+        X = Xop;
+        Y = Yop;
+      }
+    }
+  }
+  if (SE->isKnownPredicate(Pred, X, Y))
+    return true;
+  // If SE->isKnownPredicate can't prove the condition,
+  // we try the brute-force approach of subtracting
+  // and testing the difference.
+  // By testing with SE->isKnownPredicate first, we avoid
+  // the possibility of overflow when the arguments are constants.
+  const SCEV *Delta = SE->getMinusSCEV(X, Y);
+  switch (Pred) {
+  case CmpInst::ICMP_EQ:
+    return Delta->isZero();
+  case CmpInst::ICMP_NE:
+    return SE->isKnownNonZero(Delta);
+  case CmpInst::ICMP_SGE:
+    return SE->isKnownNonNegative(Delta);
+  case CmpInst::ICMP_SLE:
+    return SE->isKnownNonPositive(Delta);
+  case CmpInst::ICMP_SGT:
+    return SE->isKnownPositive(Delta);
+  case CmpInst::ICMP_SLT:
+    return SE->isKnownNegative(Delta);
+  default:
+    llvm_unreachable("unexpected predicate in isKnownPredicate");
+  }
+}
+
+
+// All subscripts are all the same type.
+// Loop bound may be smaller (e.g., a char).
+// Should zero extend loop bound, since it's always >= 0.
+// This routine collects upper bound and extends if needed.
+// Return null if no bound available.
+const SCEV *DependenceAnalysis::collectUpperBound(const Loop *L,
+                                                  Type *T) const {
+  if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+    const SCEV *UB = SE->getBackedgeTakenCount(L);
+    return SE->getNoopOrZeroExtend(UB, T);
+  }
+  return NULL;
+}
+
+
+// Calls collectUpperBound(), then attempts to cast it to SCEVConstant.
+// If the cast fails, returns NULL.
+const SCEVConstant *DependenceAnalysis::collectConstantUpperBound(const Loop *L,
+                                                                  Type *T
+                                                                  ) const {
+  if (const SCEV *UB = collectUpperBound(L, T))
+    return dyn_cast<SCEVConstant>(UB);
+  return NULL;
+}
+
+
+// testZIV -
+// When we have a pair of subscripts of the form [c1] and [c2],
+// where c1 and c2 are both loop invariant, we attack it using
+// the ZIV test. Basically, we test by comparing the two values,
+// but there are actually three possible results:
+// 1) the values are equal, so there's a dependence
+// 2) the values are different, so there's no dependence
+// 3) the values might be equal, so we have to assume a dependence.
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::testZIV(const SCEV *Src,
+                                 const SCEV *Dst,
+                                 FullDependence &Result) const {
+  DEBUG(dbgs() << "    src = " << *Src << "\n");
+  DEBUG(dbgs() << "    dst = " << *Dst << "\n");
+  ++ZIVapplications;
+  if (isKnownPredicate(CmpInst::ICMP_EQ, Src, Dst)) {
+    DEBUG(dbgs() << "    provably dependent\n");
+    return false; // provably dependent
+  }
+  if (isKnownPredicate(CmpInst::ICMP_NE, Src, Dst)) {
+    DEBUG(dbgs() << "    provably independent\n");
+    ++ZIVindependence;
+    return true; // provably independent
+  }
+  DEBUG(dbgs() << "    possibly dependent\n");
+  Result.Consistent = false;
+  return false; // possibly dependent
+}
+
+
+// strongSIVtest -
+// From the paper, Practical Dependence Testing, Section 4.2.1
+//
+// When we have a pair of subscripts of the form [c1 + a*i] and [c2 + a*i],
+// where i is an induction variable, c1 and c2 are loop invariant,
+//  and a is a constant, we can solve it exactly using the Strong SIV test.
+//
+// Can prove independence. Failing that, can compute distance (and direction).
+// In the presence of symbolic terms, we can sometimes make progress.
+//
+// If there's a dependence,
+//
+//    c1 + a*i = c2 + a*i'
+//
+// The dependence distance is
+//
+//    d = i' - i = (c1 - c2)/a
+//
+// A dependence only exists if d is an integer and abs(d) <= U, where U is the
+// loop's upper bound. If a dependence exists, the dependence direction is
+// defined as
+//
+//                { < if d > 0
+//    direction = { = if d = 0
+//                { > if d < 0
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::strongSIVtest(const SCEV *Coeff,
+                                       const SCEV *SrcConst,
+                                       const SCEV *DstConst,
+                                       const Loop *CurLoop,
+                                       unsigned Level,
+                                       FullDependence &Result,
+                                       Constraint &NewConstraint) const {
+  DEBUG(dbgs() << "\tStrong SIV test\n");
+  DEBUG(dbgs() << "\t    Coeff = " << *Coeff);
+  DEBUG(dbgs() << ", " << *Coeff->getType() << "\n");
+  DEBUG(dbgs() << "\t    SrcConst = " << *SrcConst);
+  DEBUG(dbgs() << ", " << *SrcConst->getType() << "\n");
+  DEBUG(dbgs() << "\t    DstConst = " << *DstConst);
+  DEBUG(dbgs() << ", " << *DstConst->getType() << "\n");
+  ++StrongSIVapplications;
+  assert(0 < Level && Level <= CommonLevels && "level out of range");
+  Level--;
+
+  const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst);
+  DEBUG(dbgs() << "\t    Delta = " << *Delta);
+  DEBUG(dbgs() << ", " << *Delta->getType() << "\n");
+
+  // check that |Delta| < iteration count
+  if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+    DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound);
+    DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n");
+    const SCEV *AbsDelta =
+      SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta);
+    const SCEV *AbsCoeff =
+      SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff);
+    const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff);
+    if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) {
+      // Distance greater than trip count - no dependence
+      ++StrongSIVindependence;
+      ++StrongSIVsuccesses;
+      return true;
+    }
+  }
+
+  // Can we compute distance?
+  if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) {
+    APInt ConstDelta = cast<SCEVConstant>(Delta)->getValue()->getValue();
+    APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getValue()->getValue();
+    APInt Distance  = ConstDelta; // these need to be initialized
+    APInt Remainder = ConstDelta;
+    APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder);
+    DEBUG(dbgs() << "\t    Distance = " << Distance << "\n");
+    DEBUG(dbgs() << "\t    Remainder = " << Remainder << "\n");
+    // Make sure Coeff divides Delta exactly
+    if (Remainder != 0) {
+      // Coeff doesn't divide Distance, no dependence
+      ++StrongSIVindependence;
+      ++StrongSIVsuccesses;
+      return true;
+    }
+    Result.DV[Level].Distance = SE->getConstant(Distance);
+    NewConstraint.setDistance(SE->getConstant(Distance), CurLoop);
+    if (Distance.sgt(0))
+      Result.DV[Level].Direction &= Dependence::DVEntry::LT;
+    else if (Distance.slt(0))
+      Result.DV[Level].Direction &= Dependence::DVEntry::GT;
+    else
+      Result.DV[Level].Direction &= Dependence::DVEntry::EQ;
+    ++StrongSIVsuccesses;
+  }
+  else if (Delta->isZero()) {
+    // since 0/X == 0
+    Result.DV[Level].Distance = Delta;
+    NewConstraint.setDistance(Delta, CurLoop);
+    Result.DV[Level].Direction &= Dependence::DVEntry::EQ;
+    ++StrongSIVsuccesses;
+  }
+  else {
+    if (Coeff->isOne()) {
+      DEBUG(dbgs() << "\t    Distance = " << *Delta << "\n");
+      Result.DV[Level].Distance = Delta; // since X/1 == X
+      NewConstraint.setDistance(Delta, CurLoop);
+    }
+    else {
+      Result.Consistent = false;
+      NewConstraint.setLine(Coeff,
+                            SE->getNegativeSCEV(Coeff),
+                            SE->getNegativeSCEV(Delta), CurLoop);
+    }
+
+    // maybe we can get a useful direction
+    bool DeltaMaybeZero     = !SE->isKnownNonZero(Delta);
+    bool DeltaMaybePositive = !SE->isKnownNonPositive(Delta);
+    bool DeltaMaybeNegative = !SE->isKnownNonNegative(Delta);
+    bool CoeffMaybePositive = !SE->isKnownNonPositive(Coeff);
+    bool CoeffMaybeNegative = !SE->isKnownNonNegative(Coeff);
+    // The double negatives above are confusing.
+    // It helps to read !SE->isKnownNonZero(Delta)
+    // as "Delta might be Zero"
+    unsigned NewDirection = Dependence::DVEntry::NONE;
+    if ((DeltaMaybePositive && CoeffMaybePositive) ||
+        (DeltaMaybeNegative && CoeffMaybeNegative))
+      NewDirection = Dependence::DVEntry::LT;
+    if (DeltaMaybeZero)
+      NewDirection |= Dependence::DVEntry::EQ;
+    if ((DeltaMaybeNegative && CoeffMaybePositive) ||
+        (DeltaMaybePositive && CoeffMaybeNegative))
+      NewDirection |= Dependence::DVEntry::GT;
+    if (NewDirection < Result.DV[Level].Direction)
+      ++StrongSIVsuccesses;
+    Result.DV[Level].Direction &= NewDirection;
+  }
+  return false;
+}
+
+
+// weakCrossingSIVtest -
+// From the paper, Practical Dependence Testing, Section 4.2.2
+//
+// When we have a pair of subscripts of the form [c1 + a*i] and [c2 - a*i],
+// where i is an induction variable, c1 and c2 are loop invariant,
+// and a is a constant, we can solve it exactly using the
+// Weak-Crossing SIV test.
+//
+// Given c1 + a*i = c2 - a*i', we can look for the intersection of
+// the two lines, where i = i', yielding
+//
+//    c1 + a*i = c2 - a*i
+//    2a*i = c2 - c1
+//    i = (c2 - c1)/2a
+//
+// If i < 0, there is no dependence.
+// If i > upperbound, there is no dependence.
+// If i = 0 (i.e., if c1 = c2), there's a dependence with distance = 0.
+// If i = upperbound, there's a dependence with distance = 0.
+// If i is integral, there's a dependence (all directions).
+// If the non-integer part = 1/2, there's a dependence (<> directions).
+// Otherwise, there's no dependence.
+//
+// Can prove independence. Failing that,
+// can sometimes refine the directions.
+// Can determine iteration for splitting.
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::weakCrossingSIVtest(const SCEV *Coeff,
+                                             const SCEV *SrcConst,
+                                             const SCEV *DstConst,
+                                             const Loop *CurLoop,
+                                             unsigned Level,
+                                             FullDependence &Result,
+                                             Constraint &NewConstraint,
+                                             const SCEV *&SplitIter) const {
+  DEBUG(dbgs() << "\tWeak-Crossing SIV test\n");
+  DEBUG(dbgs() << "\t    Coeff = " << *Coeff << "\n");
+  DEBUG(dbgs() << "\t    SrcConst = " << *SrcConst << "\n");
+  DEBUG(dbgs() << "\t    DstConst = " << *DstConst << "\n");
+  ++WeakCrossingSIVapplications;
+  assert(0 < Level && Level <= CommonLevels && "Level out of range");
+  Level--;
+  Result.Consistent = false;
+  const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+  DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
+  NewConstraint.setLine(Coeff, Coeff, Delta, CurLoop);
+  if (Delta->isZero()) {
+    Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::LT);
+    Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::GT);
+    ++WeakCrossingSIVsuccesses;
+    if (!Result.DV[Level].Direction) {
+      ++WeakCrossingSIVindependence;
+      return true;
+    }
+    Result.DV[Level].Distance = Delta; // = 0
+    return false;
+  }
+  const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(Coeff);
+  if (!ConstCoeff)
+    return false;
+
+  Result.DV[Level].Splitable = true;
+  if (SE->isKnownNegative(ConstCoeff)) {
+    ConstCoeff = dyn_cast<SCEVConstant>(SE->getNegativeSCEV(ConstCoeff));
+    assert(ConstCoeff &&
+           "dynamic cast of negative of ConstCoeff should yield constant");
+    Delta = SE->getNegativeSCEV(Delta);
+  }
+  assert(SE->isKnownPositive(ConstCoeff) && "ConstCoeff should be positive");
+
+  // compute SplitIter for use by DependenceAnalysis::getSplitIteration()
+  SplitIter =
+    SE->getUDivExpr(SE->getSMaxExpr(SE->getConstant(Delta->getType(), 0),
+                                    Delta),
+                    SE->getMulExpr(SE->getConstant(Delta->getType(), 2),
+                                   ConstCoeff));
+  DEBUG(dbgs() << "\t    Split iter = " << *SplitIter << "\n");
+
+  const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
+  if (!ConstDelta)
+    return false;
+
+  // We're certain that ConstCoeff > 0; therefore,
+  // if Delta < 0, then no dependence.
+  DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
+  DEBUG(dbgs() << "\t    ConstCoeff = " << *ConstCoeff << "\n");
+  if (SE->isKnownNegative(Delta)) {
+    // No dependence, Delta < 0
+    ++WeakCrossingSIVindependence;
+    ++WeakCrossingSIVsuccesses;
+    return true;
+  }
+
+  // We're certain that Delta > 0 and ConstCoeff > 0.
+  // Check Delta/(2*ConstCoeff) against upper loop bound
+  if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+    DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound << "\n");
+    const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2);
+    const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound),
+                                    ConstantTwo);
+    DEBUG(dbgs() << "\t    ML = " << *ML << "\n");
+    if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, ML)) {
+      // Delta too big, no dependence
+      ++WeakCrossingSIVindependence;
+      ++WeakCrossingSIVsuccesses;
+      return true;
+    }
+    if (isKnownPredicate(CmpInst::ICMP_EQ, Delta, ML)) {
+      // i = i' = UB
+      Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::LT);
+      Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::GT);
+      ++WeakCrossingSIVsuccesses;
+      if (!Result.DV[Level].Direction) {
+        ++WeakCrossingSIVindependence;
+        return true;
+      }
+      Result.DV[Level].Splitable = false;
+      Result.DV[Level].Distance = SE->getConstant(Delta->getType(), 0);
+      return false;
+    }
+  }
+
+  // check that Coeff divides Delta
+  APInt APDelta = ConstDelta->getValue()->getValue();
+  APInt APCoeff = ConstCoeff->getValue()->getValue();
+  APInt Distance = APDelta; // these need to be initialzed
+  APInt Remainder = APDelta;
+  APInt::sdivrem(APDelta, APCoeff, Distance, Remainder);
+  DEBUG(dbgs() << "\t    Remainder = " << Remainder << "\n");
+  if (Remainder != 0) {
+    // Coeff doesn't divide Delta, no dependence
+    ++WeakCrossingSIVindependence;
+    ++WeakCrossingSIVsuccesses;
+    return true;
+  }
+  DEBUG(dbgs() << "\t    Distance = " << Distance << "\n");
+
+  // if 2*Coeff doesn't divide Delta, then the equal direction isn't possible
+  APInt Two = APInt(Distance.getBitWidth(), 2, true);
+  Remainder = Distance.srem(Two);
+  DEBUG(dbgs() << "\t    Remainder = " << Remainder << "\n");
+  if (Remainder != 0) {
+    // Equal direction isn't possible
+    Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::EQ);
+    ++WeakCrossingSIVsuccesses;
+  }
+  return false;
+}
+
+
+// Kirch's algorithm, from
+//
+//        Optimizing Supercompilers for Supercomputers
+//        Michael Wolfe
+//        MIT Press, 1989
+//
+// Program 2.1, page 29.
+// Computes the GCD of AM and BM.
+// Also finds a solution to the equation ax - by = gdc(a, b).
+// Returns true iff the gcd divides Delta.
+static
+bool findGCD(unsigned Bits, APInt AM, APInt BM, APInt Delta,
+             APInt &G, APInt &X, APInt &Y) {
+  APInt A0(Bits, 1, true), A1(Bits, 0, true);
+  APInt B0(Bits, 0, true), B1(Bits, 1, true);
+  APInt G0 = AM.abs();
+  APInt G1 = BM.abs();
+  APInt Q = G0; // these need to be initialized
+  APInt R = G0;
+  APInt::sdivrem(G0, G1, Q, R);
+  while (R != 0) {
+    APInt A2 = A0 - Q*A1; A0 = A1; A1 = A2;
+    APInt B2 = B0 - Q*B1; B0 = B1; B1 = B2;
+    G0 = G1; G1 = R;
+    APInt::sdivrem(G0, G1, Q, R);
+  }
+  G = G1;
+  DEBUG(dbgs() << "\t    GCD = " << G << "\n");
+  X = AM.slt(0) ? -A1 : A1;
+  Y = BM.slt(0) ? B1 : -B1;
+
+  // make sure gcd divides Delta
+  R = Delta.srem(G);
+  if (R != 0)
+    return true; // gcd doesn't divide Delta, no dependence
+  Q = Delta.sdiv(G);
+  X *= Q;
+  Y *= Q;
+  return false;
+}
+
+
+static
+APInt floorOfQuotient(APInt A, APInt B) {
+  APInt Q = A; // these need to be initialized
+  APInt R = A;
+  APInt::sdivrem(A, B, Q, R);
+  if (R == 0)
+    return Q;
+  if ((A.sgt(0) && B.sgt(0)) ||
+      (A.slt(0) && B.slt(0)))
+    return Q;
+  else
+    return Q - 1;
+}
+
+
+static
+APInt ceilingOfQuotient(APInt A, APInt B) {
+  APInt Q = A; // these need to be initialized
+  APInt R = A;
+  APInt::sdivrem(A, B, Q, R);
+  if (R == 0)
+    return Q;
+  if ((A.sgt(0) && B.sgt(0)) ||
+      (A.slt(0) && B.slt(0)))
+    return Q + 1;
+  else
+    return Q;
+}
+
+
+static
+APInt maxAPInt(APInt A, APInt B) {
+  return A.sgt(B) ? A : B;
+}
+
+
+static
+APInt minAPInt(APInt A, APInt B) {
+  return A.slt(B) ? A : B;
+}
+
+
+// exactSIVtest -
+// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*i],
+// where i is an induction variable, c1 and c2 are loop invariant, and a1
+// and a2 are constant, we can solve it exactly using an algorithm developed
+// by Banerjee and Wolfe. See Section 2.5.3 in
+//
+//        Optimizing Supercompilers for Supercomputers
+//        Michael Wolfe
+//        MIT Press, 1989
+//
+// It's slower than the specialized tests (strong SIV, weak-zero SIV, etc),
+// so use them if possible. They're also a bit better with symbolics and,
+// in the case of the strong SIV test, can compute Distances.
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff,
+                                      const SCEV *DstCoeff,
+                                      const SCEV *SrcConst,
+                                      const SCEV *DstConst,
+                                      const Loop *CurLoop,
+                                      unsigned Level,
+                                      FullDependence &Result,
+                                      Constraint &NewConstraint) const {
+  DEBUG(dbgs() << "\tExact SIV test\n");
+  DEBUG(dbgs() << "\t    SrcCoeff = " << *SrcCoeff << " = AM\n");
+  DEBUG(dbgs() << "\t    DstCoeff = " << *DstCoeff << " = BM\n");
+  DEBUG(dbgs() << "\t    SrcConst = " << *SrcConst << "\n");
+  DEBUG(dbgs() << "\t    DstConst = " << *DstConst << "\n");
+  ++ExactSIVapplications;
+  assert(0 < Level && Level <= CommonLevels && "Level out of range");
+  Level--;
+  Result.Consistent = false;
+  const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+  DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
+  NewConstraint.setLine(SrcCoeff, SE->getNegativeSCEV(DstCoeff),
+                        Delta, CurLoop);
+  const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
+  const SCEVConstant *ConstSrcCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
+  const SCEVConstant *ConstDstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
+  if (!ConstDelta || !ConstSrcCoeff || !ConstDstCoeff)
+    return false;
+
+  // find gcd
+  APInt G, X, Y;
+  APInt AM = ConstSrcCoeff->getValue()->getValue();
+  APInt BM = ConstDstCoeff->getValue()->getValue();
+  unsigned Bits = AM.getBitWidth();
+  if (findGCD(Bits, AM, BM, ConstDelta->getValue()->getValue(), G, X, Y)) {
+    // gcd doesn't divide Delta, no dependence
+    ++ExactSIVindependence;
+    ++ExactSIVsuccesses;
+    return true;
+  }
+
+  DEBUG(dbgs() << "\t    X = " << X << ", Y = " << Y << "\n");
+
+  // since SCEV construction normalizes, LM = 0
+  APInt UM(Bits, 1, true);
+  bool UMvalid = false;
+  // UM is perhaps unavailable, let's check
+  if (const SCEVConstant *CUB =
+      collectConstantUpperBound(CurLoop, Delta->getType())) {
+    UM = CUB->getValue()->getValue();
+    DEBUG(dbgs() << "\t    UM = " << UM << "\n");
+    UMvalid = true;
+  }
+
+  APInt TU(APInt::getSignedMaxValue(Bits));
+  APInt TL(APInt::getSignedMinValue(Bits));
+
+  // test(BM/G, LM-X) and test(-BM/G, X-UM)
+  APInt TMUL = BM.sdiv(G);
+  if (TMUL.sgt(0)) {
+    TL = maxAPInt(TL, ceilingOfQuotient(-X, TMUL));
+    DEBUG(dbgs() << "\t    TL = " << TL << "\n");
+    if (UMvalid) {
+      TU = minAPInt(TU, floorOfQuotient(UM - X, TMUL));
+      DEBUG(dbgs() << "\t    TU = " << TU << "\n");
+    }
+  }
+  else {
+    TU = minAPInt(TU, floorOfQuotient(-X, TMUL));
+    DEBUG(dbgs() << "\t    TU = " << TU << "\n");
+    if (UMvalid) {
+      TL = maxAPInt(TL, ceilingOfQuotient(UM - X, TMUL));
+      DEBUG(dbgs() << "\t    TL = " << TL << "\n");
+    }
+  }
+
+  // test(AM/G, LM-Y) and test(-AM/G, Y-UM)
+  TMUL = AM.sdiv(G);
+  if (TMUL.sgt(0)) {
+    TL = maxAPInt(TL, ceilingOfQuotient(-Y, TMUL));
+    DEBUG(dbgs() << "\t    TL = " << TL << "\n");
+    if (UMvalid) {
+      TU = minAPInt(TU, floorOfQuotient(UM - Y, TMUL));
+      DEBUG(dbgs() << "\t    TU = " << TU << "\n");
+    }
+  }
+  else {
+    TU = minAPInt(TU, floorOfQuotient(-Y, TMUL));
+    DEBUG(dbgs() << "\t    TU = " << TU << "\n");
+    if (UMvalid) {
+      TL = maxAPInt(TL, ceilingOfQuotient(UM - Y, TMUL));
+      DEBUG(dbgs() << "\t    TL = " << TL << "\n");
+    }
+  }
+  if (TL.sgt(TU)) {
+    ++ExactSIVindependence;
+    ++ExactSIVsuccesses;
+    return true;
+  }
+
+  // explore directions
+  unsigned NewDirection = Dependence::DVEntry::NONE;
+
+  // less than
+  APInt SaveTU(TU); // save these
+  APInt SaveTL(TL);
+  DEBUG(dbgs() << "\t    exploring LT direction\n");
+  TMUL = AM - BM;
+  if (TMUL.sgt(0)) {
+    TL = maxAPInt(TL, ceilingOfQuotient(X - Y + 1, TMUL));
+    DEBUG(dbgs() << "\t\t    TL = " << TL << "\n");
+  }
+  else {
+    TU = minAPInt(TU, floorOfQuotient(X - Y + 1, TMUL));
+    DEBUG(dbgs() << "\t\t    TU = " << TU << "\n");
+  }
+  if (TL.sle(TU)) {
+    NewDirection |= Dependence::DVEntry::LT;
+    ++ExactSIVsuccesses;
+  }
+
+  // equal
+  TU = SaveTU; // restore
+  TL = SaveTL;
+  DEBUG(dbgs() << "\t    exploring EQ direction\n");
+  if (TMUL.sgt(0)) {
+    TL = maxAPInt(TL, ceilingOfQuotient(X - Y, TMUL));
+    DEBUG(dbgs() << "\t\t    TL = " << TL << "\n");
+  }
+  else {
+    TU = minAPInt(TU, floorOfQuotient(X - Y, TMUL));
+    DEBUG(dbgs() << "\t\t    TU = " << TU << "\n");
+  }
+  TMUL = BM - AM;
+  if (TMUL.sgt(0)) {
+    TL = maxAPInt(TL, ceilingOfQuotient(Y - X, TMUL));
+    DEBUG(dbgs() << "\t\t    TL = " << TL << "\n");
+  }
+  else {
+    TU = minAPInt(TU, floorOfQuotient(Y - X, TMUL));
+    DEBUG(dbgs() << "\t\t    TU = " << TU << "\n");
+  }
+  if (TL.sle(TU)) {
+    NewDirection |= Dependence::DVEntry::EQ;
+    ++ExactSIVsuccesses;
+  }
+
+  // greater than
+  TU = SaveTU; // restore
+  TL = SaveTL;
+  DEBUG(dbgs() << "\t    exploring GT direction\n");
+  if (TMUL.sgt(0)) {
+    TL = maxAPInt(TL, ceilingOfQuotient(Y - X + 1, TMUL));
+    DEBUG(dbgs() << "\t\t    TL = " << TL << "\n");
+  }
+  else {
+    TU = minAPInt(TU, floorOfQuotient(Y - X + 1, TMUL));
+    DEBUG(dbgs() << "\t\t    TU = " << TU << "\n");
+  }
+  if (TL.sle(TU)) {
+    NewDirection |= Dependence::DVEntry::GT;
+    ++ExactSIVsuccesses;
+  }
+
+  // finished
+  Result.DV[Level].Direction &= NewDirection;
+  if (Result.DV[Level].Direction == Dependence::DVEntry::NONE)
+    ++ExactSIVindependence;
+  return Result.DV[Level].Direction == Dependence::DVEntry::NONE;
+}
+
+
+
+// Return true if the divisor evenly divides the dividend.
+static
+bool isRemainderZero(const SCEVConstant *Dividend,
+                     const SCEVConstant *Divisor) {
+  APInt ConstDividend = Dividend->getValue()->getValue();
+  APInt ConstDivisor = Divisor->getValue()->getValue();
+  return ConstDividend.srem(ConstDivisor) == 0;
+}
+
+
+// weakZeroSrcSIVtest -
+// From the paper, Practical Dependence Testing, Section 4.2.2
+//
+// When we have a pair of subscripts of the form [c1] and [c2 + a*i],
+// where i is an induction variable, c1 and c2 are loop invariant,
+// and a is a constant, we can solve it exactly using the
+// Weak-Zero SIV test.
+//
+// Given
+//
+//    c1 = c2 + a*i
+//
+// we get
+//
+//    (c1 - c2)/a = i
+//
+// If i is not an integer, there's no dependence.
+// If i < 0 or > UB, there's no dependence.
+// If i = 0, the direction is <= and peeling the
+// 1st iteration will break the dependence.
+// If i = UB, the direction is >= and peeling the
+// last iteration will break the dependence.
+// Otherwise, the direction is *.
+//
+// Can prove independence. Failing that, we can sometimes refine
+// the directions. Can sometimes show that first or last
+// iteration carries all the dependences (so worth peeling).
+//
+// (see also weakZeroDstSIVtest)
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::weakZeroSrcSIVtest(const SCEV *DstCoeff,
+                                            const SCEV *SrcConst,
+                                            const SCEV *DstConst,
+                                            const Loop *CurLoop,
+                                            unsigned Level,
+                                            FullDependence &Result,
+                                            Constraint &NewConstraint) const {
+  // For the WeakSIV test, it's possible the loop isn't common to
+  // the Src and Dst loops. If it isn't, then there's no need to
+  // record a direction.
+  DEBUG(dbgs() << "\tWeak-Zero (src) SIV test\n");
+  DEBUG(dbgs() << "\t    DstCoeff = " << *DstCoeff << "\n");
+  DEBUG(dbgs() << "\t    SrcConst = " << *SrcConst << "\n");
+  DEBUG(dbgs() << "\t    DstConst = " << *DstConst << "\n");
+  ++WeakZeroSIVapplications;
+  assert(0 < Level && Level <= MaxLevels && "Level out of range");
+  Level--;
+  Result.Consistent = false;
+  const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst);
+  NewConstraint.setLine(SE->getConstant(Delta->getType(), 0),
+                        DstCoeff, Delta, CurLoop);
+  DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
+  if (isKnownPredicate(CmpInst::ICMP_EQ, SrcConst, DstConst)) {
+    if (Level < CommonLevels) {
+      Result.DV[Level].Direction &= Dependence::DVEntry::LE;
+      Result.DV[Level].PeelFirst = true;
+      ++WeakZeroSIVsuccesses;
+    }
+    return false; // dependences caused by first iteration
+  }
+  const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
+  if (!ConstCoeff)
+    return false;
+  const SCEV *AbsCoeff =
+    SE->isKnownNegative(ConstCoeff) ?
+    SE->getNegativeSCEV(ConstCoeff) : ConstCoeff;
+  const SCEV *NewDelta =
+    SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
+
+  // check that Delta/SrcCoeff < iteration count
+  // really check NewDelta < count*AbsCoeff
+  if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+    DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound << "\n");
+    const SCEV *Product = SE->getMulExpr(AbsCoeff, UpperBound);
+    if (isKnownPredicate(CmpInst::ICMP_SGT, NewDelta, Product)) {
+      ++WeakZeroSIVindependence;
+      ++WeakZeroSIVsuccesses;
+      return true;
+    }
+    if (isKnownPredicate(CmpInst::ICMP_EQ, NewDelta, Product)) {
+      // dependences caused by last iteration
+      if (Level < CommonLevels) {
+        Result.DV[Level].Direction &= Dependence::DVEntry::GE;
+        Result.DV[Level].PeelLast = true;
+        ++WeakZeroSIVsuccesses;
+      }
+      return false;
+    }
+  }
+
+  // check that Delta/SrcCoeff >= 0
+  // really check that NewDelta >= 0
+  if (SE->isKnownNegative(NewDelta)) {
+    // No dependence, newDelta < 0
+    ++WeakZeroSIVindependence;
+    ++WeakZeroSIVsuccesses;
+    return true;
+  }
+
+  // if SrcCoeff doesn't divide Delta, then no dependence
+  if (isa<SCEVConstant>(Delta) &&
+      !isRemainderZero(cast<SCEVConstant>(Delta), ConstCoeff)) {
+    ++WeakZeroSIVindependence;
+    ++WeakZeroSIVsuccesses;
+    return true;
+  }
+  return false;
+}
+
+
+// weakZeroDstSIVtest -
+// From the paper, Practical Dependence Testing, Section 4.2.2
+//
+// When we have a pair of subscripts of the form [c1 + a*i] and [c2],
+// where i is an induction variable, c1 and c2 are loop invariant,
+// and a is a constant, we can solve it exactly using the
+// Weak-Zero SIV test.
+//
+// Given
+//
+//    c1 + a*i = c2
+//
+// we get
+//
+//    i = (c2 - c1)/a
+//
+// If i is not an integer, there's no dependence.
+// If i < 0 or > UB, there's no dependence.
+// If i = 0, the direction is <= and peeling the
+// 1st iteration will break the dependence.
+// If i = UB, the direction is >= and peeling the
+// last iteration will break the dependence.
+// Otherwise, the direction is *.
+//
+// Can prove independence. Failing that, we can sometimes refine
+// the directions. Can sometimes show that first or last
+// iteration carries all the dependences (so worth peeling).
+//
+// (see also weakZeroSrcSIVtest)
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::weakZeroDstSIVtest(const SCEV *SrcCoeff,
+                                            const SCEV *SrcConst,
+                                            const SCEV *DstConst,
+                                            const Loop *CurLoop,
+                                            unsigned Level,
+                                            FullDependence &Result,
+                                            Constraint &NewConstraint) const {
+  // For the WeakSIV test, it's possible the loop isn't common to the
+  // Src and Dst loops. If it isn't, then there's no need to record a direction.
+  DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n");
+  DEBUG(dbgs() << "\t    SrcCoeff = " << *SrcCoeff << "\n");
+  DEBUG(dbgs() << "\t    SrcConst = " << *SrcConst << "\n");
+  DEBUG(dbgs() << "\t    DstConst = " << *DstConst << "\n");
+  ++WeakZeroSIVapplications;
+  assert(0 < Level && Level <= SrcLevels && "Level out of range");
+  Level--;
+  Result.Consistent = false;
+  const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+  NewConstraint.setLine(SrcCoeff, SE->getConstant(Delta->getType(), 0),
+                        Delta, CurLoop);
+  DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
+  if (isKnownPredicate(CmpInst::ICMP_EQ, DstConst, SrcConst)) {
+    if (Level < CommonLevels) {
+      Result.DV[Level].Direction &= Dependence::DVEntry::LE;
+      Result.DV[Level].PeelFirst = true;
+      ++WeakZeroSIVsuccesses;
+    }
+    return false; // dependences caused by first iteration
+  }
+  const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
+  if (!ConstCoeff)
+    return false;
+  const SCEV *AbsCoeff =
+    SE->isKnownNegative(ConstCoeff) ?
+    SE->getNegativeSCEV(ConstCoeff) : ConstCoeff;
+  const SCEV *NewDelta =
+    SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
+
+  // check that Delta/SrcCoeff < iteration count
+  // really check NewDelta < count*AbsCoeff
+  if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+    DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound << "\n");
+    const SCEV *Product = SE->getMulExpr(AbsCoeff, UpperBound);
+    if (isKnownPredicate(CmpInst::ICMP_SGT, NewDelta, Product)) {
+      ++WeakZeroSIVindependence;
+      ++WeakZeroSIVsuccesses;
+      return true;
+    }
+    if (isKnownPredicate(CmpInst::ICMP_EQ, NewDelta, Product)) {
+      // dependences caused by last iteration
+      if (Level < CommonLevels) {
+        Result.DV[Level].Direction &= Dependence::DVEntry::GE;
+        Result.DV[Level].PeelLast = true;
+        ++WeakZeroSIVsuccesses;
+      }
+      return false;
+    }
+  }
+
+  // check that Delta/SrcCoeff >= 0
+  // really check that NewDelta >= 0
+  if (SE->isKnownNegative(NewDelta)) {
+    // No dependence, newDelta < 0
+    ++WeakZeroSIVindependence;
+    ++WeakZeroSIVsuccesses;
+    return true;
+  }
+
+  // if SrcCoeff doesn't divide Delta, then no dependence
+  if (isa<SCEVConstant>(Delta) &&
+      !isRemainderZero(cast<SCEVConstant>(Delta), ConstCoeff)) {
+    ++WeakZeroSIVindependence;
+    ++WeakZeroSIVsuccesses;
+    return true;
+  }
+  return false;
+}
+
+
+// exactRDIVtest - Tests the RDIV subscript pair for dependence.
+// Things of the form [c1 + a*i] and [c2 + b*j],
+// where i and j are induction variable, c1 and c2 are loop invariant,
+// and a and b are constants.
+// Returns true if any possible dependence is disproved.
+// Marks the result as inconsistent.
+// Works in some cases that symbolicRDIVtest doesn't, and vice versa.
+bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff,
+                                       const SCEV *DstCoeff,
+                                       const SCEV *SrcConst,
+                                       const SCEV *DstConst,
+                                       const Loop *SrcLoop,
+                                       const Loop *DstLoop,
+                                       FullDependence &Result) const {
+  DEBUG(dbgs() << "\tExact RDIV test\n");
+  DEBUG(dbgs() << "\t    SrcCoeff = " << *SrcCoeff << " = AM\n");
+  DEBUG(dbgs() << "\t    DstCoeff = " << *DstCoeff << " = BM\n");
+  DEBUG(dbgs() << "\t    SrcConst = " << *SrcConst << "\n");
+  DEBUG(dbgs() << "\t    DstConst = " << *DstConst << "\n");
+  ++ExactRDIVapplications;
+  Result.Consistent = false;
+  const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+  DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
+  const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
+  const SCEVConstant *ConstSrcCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
+  const SCEVConstant *ConstDstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
+  if (!ConstDelta || !ConstSrcCoeff || !ConstDstCoeff)
+    return false;
+
+  // find gcd
+  APInt G, X, Y;
+  APInt AM = ConstSrcCoeff->getValue()->getValue();
+  APInt BM = ConstDstCoeff->getValue()->getValue();
+  unsigned Bits = AM.getBitWidth();
+  if (findGCD(Bits, AM, BM, ConstDelta->getValue()->getValue(), G, X, Y)) {
+    // gcd doesn't divide Delta, no dependence
+    ++ExactRDIVindependence;
+    return true;
+  }
+
+  DEBUG(dbgs() << "\t    X = " << X << ", Y = " << Y << "\n");
+
+  // since SCEV construction seems to normalize, LM = 0
+  APInt SrcUM(Bits, 1, true);
+  bool SrcUMvalid = false;
+  // SrcUM is perhaps unavailable, let's check
+  if (const SCEVConstant *UpperBound =
+      collectConstantUpperBound(SrcLoop, Delta->getType())) {
+    SrcUM = UpperBound->getValue()->getValue();
+    DEBUG(dbgs() << "\t    SrcUM = " << SrcUM << "\n");
+    SrcUMvalid = true;
+  }
+
+  APInt DstUM(Bits, 1, true);
+  bool DstUMvalid = false;
+  // UM is perhaps unavailable, let's check
+  if (const SCEVConstant *UpperBound =
+      collectConstantUpperBound(DstLoop, Delta->getType())) {
+    DstUM = UpperBound->getValue()->getValue();
+    DEBUG(dbgs() << "\t    DstUM = " << DstUM << "\n");
+    DstUMvalid = true;
+  }
+
+  APInt TU(APInt::getSignedMaxValue(Bits));
+  APInt TL(APInt::getSignedMinValue(Bits));
+
+  // test(BM/G, LM-X) and test(-BM/G, X-UM)
+  APInt TMUL = BM.sdiv(G);
+  if (TMUL.sgt(0)) {
+    TL = maxAPInt(TL, ceilingOfQuotient(-X, TMUL));
+    DEBUG(dbgs() << "\t    TL = " << TL << "\n");
+    if (SrcUMvalid) {
+      TU = minAPInt(TU, floorOfQuotient(SrcUM - X, TMUL));
+      DEBUG(dbgs() << "\t    TU = " << TU << "\n");
+    }
+  }
+  else {
+    TU = minAPInt(TU, floorOfQuotient(-X, TMUL));
+    DEBUG(dbgs() << "\t    TU = " << TU << "\n");
+    if (SrcUMvalid) {
+      TL = maxAPInt(TL, ceilingOfQuotient(SrcUM - X, TMUL));
+      DEBUG(dbgs() << "\t    TL = " << TL << "\n");
+    }
+  }
+
+  // test(AM/G, LM-Y) and test(-AM/G, Y-UM)
+  TMUL = AM.sdiv(G);
+  if (TMUL.sgt(0)) {
+    TL = maxAPInt(TL, ceilingOfQuotient(-Y, TMUL));
+    DEBUG(dbgs() << "\t    TL = " << TL << "\n");
+    if (DstUMvalid) {
+      TU = minAPInt(TU, floorOfQuotient(DstUM - Y, TMUL));
+      DEBUG(dbgs() << "\t    TU = " << TU << "\n");
+    }
+  }
+  else {
+    TU = minAPInt(TU, floorOfQuotient(-Y, TMUL));
+    DEBUG(dbgs() << "\t    TU = " << TU << "\n");
+    if (DstUMvalid) {
+      TL = maxAPInt(TL, ceilingOfQuotient(DstUM - Y, TMUL));
+      DEBUG(dbgs() << "\t    TL = " << TL << "\n");
+    }
+  }
+  if (TL.sgt(TU))
+    ++ExactRDIVindependence;
+  return TL.sgt(TU);
+}
+
+
+// symbolicRDIVtest -
+// In Section 4.5 of the Practical Dependence Testing paper,the authors
+// introduce a special case of Banerjee's Inequalities (also called the
+// Extreme-Value Test) that can handle some of the SIV and RDIV cases,
+// particularly cases with symbolics. Since it's only able to disprove
+// dependence (not compute distances or directions), we'll use it as a
+// fall back for the other tests.
+//
+// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j]
+// where i and j are induction variables and c1 and c2 are loop invariants,
+// we can use the symbolic tests to disprove some dependences, serving as a
+// backup for the RDIV test. Note that i and j can be the same variable,
+// letting this test serve as a backup for the various SIV tests.
+//
+// For a dependence to exist, c1 + a1*i must equal c2 + a2*j for some
+//  0 <= i <= N1 and some 0 <= j <= N2, where N1 and N2 are the (normalized)
+// loop bounds for the i and j loops, respectively. So, ...
+//
+// c1 + a1*i = c2 + a2*j
+// a1*i - a2*j = c2 - c1
+//
+// To test for a dependence, we compute c2 - c1 and make sure it's in the
+// range of the maximum and minimum possible values of a1*i - a2*j.
+// Considering the signs of a1 and a2, we have 4 possible cases:
+//
+// 1) If a1 >= 0 and a2 >= 0, then
+//        a1*0 - a2*N2 <= c2 - c1 <= a1*N1 - a2*0
+//              -a2*N2 <= c2 - c1 <= a1*N1
+//
+// 2) If a1 >= 0 and a2 <= 0, then
+//        a1*0 - a2*0 <= c2 - c1 <= a1*N1 - a2*N2
+//                  0 <= c2 - c1 <= a1*N1 - a2*N2
+//
+// 3) If a1 <= 0 and a2 >= 0, then
+//        a1*N1 - a2*N2 <= c2 - c1 <= a1*0 - a2*0
+//        a1*N1 - a2*N2 <= c2 - c1 <= 0
+//
+// 4) If a1 <= 0 and a2 <= 0, then
+//        a1*N1 - a2*0  <= c2 - c1 <= a1*0 - a2*N2
+//        a1*N1         <= c2 - c1 <=       -a2*N2
+//
+// return true if dependence disproved
+bool DependenceAnalysis::symbolicRDIVtest(const SCEV *A1,
+                                          const SCEV *A2,
+                                          const SCEV *C1,
+                                          const SCEV *C2,
+                                          const Loop *Loop1,
+                                          const Loop *Loop2) const {
+  ++SymbolicRDIVapplications;
+  DEBUG(dbgs() << "\ttry symbolic RDIV test\n");
+  DEBUG(dbgs() << "\t    A1 = " << *A1);
+  DEBUG(dbgs() << ", type = " << *A1->getType() << "\n");
+  DEBUG(dbgs() << "\t    A2 = " << *A2 << "\n");
+  DEBUG(dbgs() << "\t    C1 = " << *C1 << "\n");
+  DEBUG(dbgs() << "\t    C2 = " << *C2 << "\n");
+  const SCEV *N1 = collectUpperBound(Loop1, A1->getType());
+  const SCEV *N2 = collectUpperBound(Loop2, A1->getType());
+  DEBUG(if (N1) dbgs() << "\t    N1 = " << *N1 << "\n");
+  DEBUG(if (N2) dbgs() << "\t    N2 = " << *N2 << "\n");
+  const SCEV *C2_C1 = SE->getMinusSCEV(C2, C1);
+  const SCEV *C1_C2 = SE->getMinusSCEV(C1, C2);
+  DEBUG(dbgs() << "\t    C2 - C1 = " << *C2_C1 << "\n");
+  DEBUG(dbgs() << "\t    C1 - C2 = " << *C1_C2 << "\n");
+  if (SE->isKnownNonNegative(A1)) {
+    if (SE->isKnownNonNegative(A2)) {
+      // A1 >= 0 && A2 >= 0
+      if (N1) {
+        // make sure that c2 - c1 <= a1*N1
+        const SCEV *A1N1 = SE->getMulExpr(A1, N1);
+        DEBUG(dbgs() << "\t    A1*N1 = " << *A1N1 << "\n");
+        if (isKnownPredicate(CmpInst::ICMP_SGT, C2_C1, A1N1)) {
+          ++SymbolicRDIVindependence;
+          return true;
+        }
+      }
+      if (N2) {
+        // make sure that -a2*N2 <= c2 - c1, or a2*N2 >= c1 - c2
+        const SCEV *A2N2 = SE->getMulExpr(A2, N2);
+        DEBUG(dbgs() << "\t    A2*N2 = " << *A2N2 << "\n");
+        if (isKnownPredicate(CmpInst::ICMP_SLT, A2N2, C1_C2)) {
+          ++SymbolicRDIVindependence;
+          return true;
+        }
+      }
+    }
+    else if (SE->isKnownNonPositive(A2)) {
+      // a1 >= 0 && a2 <= 0
+      if (N1 && N2) {
+        // make sure that c2 - c1 <= a1*N1 - a2*N2
+        const SCEV *A1N1 = SE->getMulExpr(A1, N1);
+        const SCEV *A2N2 = SE->getMulExpr(A2, N2);
+        const SCEV *A1N1_A2N2 = SE->getMinusSCEV(A1N1, A2N2);
+        DEBUG(dbgs() << "\t    A1*N1 - A2*N2 = " << *A1N1_A2N2 << "\n");
+        if (isKnownPredicate(CmpInst::ICMP_SGT, C2_C1, A1N1_A2N2)) {
+          ++SymbolicRDIVindependence;
+          return true;
+        }
+      }
+      // make sure that 0 <= c2 - c1
+      if (SE->isKnownNegative(C2_C1)) {
+        ++SymbolicRDIVindependence;
+        return true;
+      }
+    }
+  }
+  else if (SE->isKnownNonPositive(A1)) {
+    if (SE->isKnownNonNegative(A2)) {
+      // a1 <= 0 && a2 >= 0
+      if (N1 && N2) {
+        // make sure that a1*N1 - a2*N2 <= c2 - c1
+        const SCEV *A1N1 = SE->getMulExpr(A1, N1);
+        const SCEV *A2N2 = SE->getMulExpr(A2, N2);
+        const SCEV *A1N1_A2N2 = SE->getMinusSCEV(A1N1, A2N2);
+        DEBUG(dbgs() << "\t    A1*N1 - A2*N2 = " << *A1N1_A2N2 << "\n");
+        if (isKnownPredicate(CmpInst::ICMP_SGT, A1N1_A2N2, C2_C1)) {
+          ++SymbolicRDIVindependence;
+          return true;
+        }
+      }
+      // make sure that c2 - c1 <= 0
+      if (SE->isKnownPositive(C2_C1)) {
+        ++SymbolicRDIVindependence;
+        return true;
+      }
+    }
+    else if (SE->isKnownNonPositive(A2)) {
+      // a1 <= 0 && a2 <= 0
+      if (N1) {
+        // make sure that a1*N1 <= c2 - c1
+        const SCEV *A1N1 = SE->getMulExpr(A1, N1);
+        DEBUG(dbgs() << "\t    A1*N1 = " << *A1N1 << "\n");
+        if (isKnownPredicate(CmpInst::ICMP_SGT, A1N1, C2_C1)) {
+          ++SymbolicRDIVindependence;
+          return true;
+        }
+      }
+      if (N2) {
+        // make sure that c2 - c1 <= -a2*N2, or c1 - c2 >= a2*N2
+        const SCEV *A2N2 = SE->getMulExpr(A2, N2);
+        DEBUG(dbgs() << "\t    A2*N2 = " << *A2N2 << "\n");
+        if (isKnownPredicate(CmpInst::ICMP_SLT, C1_C2, A2N2)) {
+          ++SymbolicRDIVindependence;
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+
+// testSIV -
+// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 - a2*i]
+// where i is an induction variable, c1 and c2 are loop invariant, and a1 and
+// a2 are constant, we attack it with an SIV test. While they can all be
+// solved with the Exact SIV test, it's worthwhile to use simpler tests when
+// they apply; they're cheaper and sometimes more precise.
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::testSIV(const SCEV *Src,
+                                 const SCEV *Dst,
+                                 unsigned &Level,
+                                 FullDependence &Result,
+                                 Constraint &NewConstraint,
+                                 const SCEV *&SplitIter) const {
+  DEBUG(dbgs() << "    src = " << *Src << "\n");
+  DEBUG(dbgs() << "    dst = " << *Dst << "\n");
+  const SCEVAddRecExpr *SrcAddRec = dyn_cast<SCEVAddRecExpr>(Src);
+  const SCEVAddRecExpr *DstAddRec = dyn_cast<SCEVAddRecExpr>(Dst);
+  if (SrcAddRec && DstAddRec) {
+    const SCEV *SrcConst = SrcAddRec->getStart();
+    const SCEV *DstConst = DstAddRec->getStart();
+    const SCEV *SrcCoeff = SrcAddRec->getStepRecurrence(*SE);
+    const SCEV *DstCoeff = DstAddRec->getStepRecurrence(*SE);
+    const Loop *CurLoop = SrcAddRec->getLoop();
+    assert(CurLoop == DstAddRec->getLoop() &&
+           "both loops in SIV should be same");
+    Level = mapSrcLoop(CurLoop);
+    bool disproven;
+    if (SrcCoeff == DstCoeff)
+      disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
+                                Level, Result, NewConstraint);
+    else if (SrcCoeff == SE->getNegativeSCEV(DstCoeff))
+      disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
+                                      Level, Result, NewConstraint, SplitIter);
+    else
+      disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop,
+                               Level, Result, NewConstraint);
+    return disproven ||
+      gcdMIVtest(Src, Dst, Result) ||
+      symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, CurLoop);
+  }
+  if (SrcAddRec) {
+    const SCEV *SrcConst = SrcAddRec->getStart();
+    const SCEV *SrcCoeff = SrcAddRec->getStepRecurrence(*SE);
+    const SCEV *DstConst = Dst;
+    const Loop *CurLoop = SrcAddRec->getLoop();
+    Level = mapSrcLoop(CurLoop);
+    return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
+                              Level, Result, NewConstraint) ||
+      gcdMIVtest(Src, Dst, Result);
+  }
+  if (DstAddRec) {
+    const SCEV *DstConst = DstAddRec->getStart();
+    const SCEV *DstCoeff = DstAddRec->getStepRecurrence(*SE);
+    const SCEV *SrcConst = Src;
+    const Loop *CurLoop = DstAddRec->getLoop();
+    Level = mapDstLoop(CurLoop);
+    return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst,
+                              CurLoop, Level, Result, NewConstraint) ||
+      gcdMIVtest(Src, Dst, Result);
+  }
+  llvm_unreachable("SIV test expected at least one AddRec");
+  return false;
+}
+
+
+// testRDIV -
+// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j]
+// where i and j are induction variables, c1 and c2 are loop invariant,
+// and a1 and a2 are constant, we can solve it exactly with an easy adaptation
+// of the Exact SIV test, the Restricted Double Index Variable (RDIV) test.
+// It doesn't make sense to talk about distance or direction in this case,
+// so there's no point in making special versions of the Strong SIV test or
+// the Weak-crossing SIV test.
+//
+// With minor algebra, this test can also be used for things like
+// [c1 + a1*i + a2*j][c2].
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::testRDIV(const SCEV *Src,
+                                  const SCEV *Dst,
+                                  FullDependence &Result) const {
+  // we have 3 possible situations here:
+  //   1) [a*i + b] and [c*j + d]
+  //   2) [a*i + c*j + b] and [d]
+  //   3) [b] and [a*i + c*j + d]
+  // We need to find what we've got and get organized
+
+  const SCEV *SrcConst, *DstConst;
+  const SCEV *SrcCoeff, *DstCoeff;
+  const Loop *SrcLoop, *DstLoop;
+
+  DEBUG(dbgs() << "    src = " << *Src << "\n");
+  DEBUG(dbgs() << "    dst = " << *Dst << "\n");
+  const SCEVAddRecExpr *SrcAddRec = dyn_cast<SCEVAddRecExpr>(Src);
+  const SCEVAddRecExpr *DstAddRec = dyn_cast<SCEVAddRecExpr>(Dst);
+  if (SrcAddRec && DstAddRec) {
+    SrcConst = SrcAddRec->getStart();
+    SrcCoeff = SrcAddRec->getStepRecurrence(*SE);
+    SrcLoop = SrcAddRec->getLoop();
+    DstConst = DstAddRec->getStart();
+    DstCoeff = DstAddRec->getStepRecurrence(*SE);
+    DstLoop = DstAddRec->getLoop();
+  }
+  else if (SrcAddRec) {
+    if (const SCEVAddRecExpr *tmpAddRec =
+        dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) {
+      SrcConst = tmpAddRec->getStart();
+      SrcCoeff = tmpAddRec->getStepRecurrence(*SE);
+      SrcLoop = tmpAddRec->getLoop();
+      DstConst = Dst;
+      DstCoeff = SE->getNegativeSCEV(SrcAddRec->getStepRecurrence(*SE));
+      DstLoop = SrcAddRec->getLoop();
+    }
+    else
+      llvm_unreachable("RDIV reached by surprising SCEVs");
+  }
+  else if (DstAddRec) {
+    if (const SCEVAddRecExpr *tmpAddRec =
+        dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) {
+      DstConst = tmpAddRec->getStart();
+      DstCoeff = tmpAddRec->getStepRecurrence(*SE);
+      DstLoop = tmpAddRec->getLoop();
+      SrcConst = Src;
+      SrcCoeff = SE->getNegativeSCEV(DstAddRec->getStepRecurrence(*SE));
+      SrcLoop = DstAddRec->getLoop();
+    }
+    else
+      llvm_unreachable("RDIV reached by surprising SCEVs");
+  }
+  else
+    llvm_unreachable("RDIV expected at least one AddRec");
+  return exactRDIVtest(SrcCoeff, DstCoeff,
+                       SrcConst, DstConst,
+                       SrcLoop, DstLoop,
+                       Result) ||
+    gcdMIVtest(Src, Dst, Result) ||
+    symbolicRDIVtest(SrcCoeff, DstCoeff,
+                     SrcConst, DstConst,
+                     SrcLoop, DstLoop);
+}
+
+
+// Tests the single-subscript MIV pair (Src and Dst) for dependence.
+// Return true if dependence disproved.
+// Can sometimes refine direction vectors.
+bool DependenceAnalysis::testMIV(const SCEV *Src,
+                                 const SCEV *Dst,
+                                 const SmallBitVector &Loops,
+                                 FullDependence &Result) const {
+  DEBUG(dbgs() << "    src = " << *Src << "\n");
+  DEBUG(dbgs() << "    dst = " << *Dst << "\n");
+  Result.Consistent = false;
+  return gcdMIVtest(Src, Dst, Result) ||
+    banerjeeMIVtest(Src, Dst, Loops, Result);
+}
+
+
+// Given a product, e.g., 10*X*Y, returns the first constant operand,
+// in this case 10. If there is no constant part, returns NULL.
+static
+const SCEVConstant *getConstantPart(const SCEVMulExpr *Product) {
+  for (unsigned Op = 0, Ops = Product->getNumOperands(); Op < Ops; Op++) {
+    if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Product->getOperand(Op)))
+      return Constant;
+  }
+  return NULL;
+}
+
+
+//===----------------------------------------------------------------------===//
+// gcdMIVtest -
+// Tests an MIV subscript pair for dependence.
+// Returns true if any possible dependence is disproved.
+// Marks the result as inconsistent.
+// Can sometimes disprove the equal direction for 1 or more loops,
+// as discussed in Michael Wolfe's book,
+// High Performance Compilers for Parallel Computing, page 235.
+//
+// We spend some effort (code!) to handle cases like
+// [10*i + 5*N*j + 15*M + 6], where i and j are induction variables,
+// but M and N are just loop-invariant variables.
+// This should help us handle linearized subscripts;
+// also makes this test a useful backup to the various SIV tests.
+//
+// It occurs to me that the presence of loop-invariant variables
+// changes the nature of the test from "greatest common divisor"
+// to "a common divisor".
+bool DependenceAnalysis::gcdMIVtest(const SCEV *Src,
+                                    const SCEV *Dst,
+                                    FullDependence &Result) const {
+  DEBUG(dbgs() << "starting gcd\n");
+  ++GCDapplications;
+  unsigned BitWidth = SE->getTypeSizeInBits(Src->getType());
+  APInt RunningGCD = APInt::getNullValue(BitWidth);
+
+  // Examine Src coefficients.
+  // Compute running GCD and record source constant.
+  // Because we're looking for the constant at the end of the chain,
+  // we can't quit the loop just because the GCD == 1.
+  const SCEV *Coefficients = Src;
+  while (const SCEVAddRecExpr *AddRec =
+         dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+    const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
+    const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Coeff);
+    if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
+      // If the coefficient is the product of a constant and other stuff,
+      // we can use the constant in the GCD computation.
+      Constant = getConstantPart(Product);
+    if (!Constant)
+      return false;
+    APInt ConstCoeff = Constant->getValue()->getValue();
+    RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+    Coefficients = AddRec->getStart();
+  }
+  const SCEV *SrcConst = Coefficients;
+
+  // Examine Dst coefficients.
+  // Compute running GCD and record destination constant.
+  // Because we're looking for the constant at the end of the chain,
+  // we can't quit the loop just because the GCD == 1.
+  Coefficients = Dst;
+  while (const SCEVAddRecExpr *AddRec =
+         dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+    const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
+    const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Coeff);
+    if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
+      // If the coefficient is the product of a constant and other stuff,
+      // we can use the constant in the GCD computation.
+      Constant = getConstantPart(Product);
+    if (!Constant)
+      return false;
+    APInt ConstCoeff = Constant->getValue()->getValue();
+    RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+    Coefficients = AddRec->getStart();
+  }
+  const SCEV *DstConst = Coefficients;
+
+  APInt ExtraGCD = APInt::getNullValue(BitWidth);
+  const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+  DEBUG(dbgs() << "    Delta = " << *Delta << "\n");
+  const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Delta);
+  if (const SCEVAddExpr *Sum = dyn_cast<SCEVAddExpr>(Delta)) {
+    // If Delta is a sum of products, we may be able to make further progress.
+    for (unsigned Op = 0, Ops = Sum->getNumOperands(); Op < Ops; Op++) {
+      const SCEV *Operand = Sum->getOperand(Op);
+      if (isa<SCEVConstant>(Operand)) {
+        assert(!Constant && "Surprised to find multiple constants");
+        Constant = cast<SCEVConstant>(Operand);
+      }
+      else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) {
+        // Search for constant operand to participate in GCD;
+        // If none found; return false.
+        const SCEVConstant *ConstOp = getConstantPart(Product);
+        if (!ConstOp)
+          return false;
+        APInt ConstOpValue = ConstOp->getValue()->getValue();
+        ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD,
+                                                   ConstOpValue.abs());
+      }
+      else
+        return false;
+    }
+  }
+  if (!Constant)
+    return false;
+  APInt ConstDelta = cast<SCEVConstant>(Constant)->getValue()->getValue();
+  DEBUG(dbgs() << "    ConstDelta = " << ConstDelta << "\n");
+  if (ConstDelta == 0)
+    return false;
+  RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ExtraGCD);
+  DEBUG(dbgs() << "    RunningGCD = " << RunningGCD << "\n");
+  APInt Remainder = ConstDelta.srem(RunningGCD);
+  if (Remainder != 0) {
+    ++GCDindependence;
+    return true;
+  }
+
+  // Try to disprove equal directions.
+  // For example, given a subscript pair [3*i + 2*j] and [i' + 2*j' - 1],
+  // the code above can't disprove the dependence because the GCD = 1.
+  // So we consider what happen if i = i' and what happens if j = j'.
+  // If i = i', we can simplify the subscript to [2*i + 2*j] and [2*j' - 1],
+  // which is infeasible, so we can disallow the = direction for the i level.
+  // Setting j = j' doesn't help matters, so we end up with a direction vector
+  // of [<>, *]
+  //
+  // Given A[5*i + 10*j*M + 9*M*N] and A[15*i + 20*j*M - 21*N*M + 5],
+  // we need to remember that the constant part is 5 and the RunningGCD should
+  // be initialized to ExtraGCD = 30.
+  DEBUG(dbgs() << "    ExtraGCD = " << ExtraGCD << '\n');
+
+  bool Improved = false;
+  Coefficients = Src;
+  while (const SCEVAddRecExpr *AddRec =
+         dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+    Coefficients = AddRec->getStart();
+    const Loop *CurLoop = AddRec->getLoop();
+    RunningGCD = ExtraGCD;
+    const SCEV *SrcCoeff = AddRec->getStepRecurrence(*SE);
+    const SCEV *DstCoeff = SE->getMinusSCEV(SrcCoeff, SrcCoeff);
+    const SCEV *Inner = Src;
+    while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) {
+      AddRec = cast<SCEVAddRecExpr>(Inner);
+      const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
+      if (CurLoop == AddRec->getLoop())
+        ; // SrcCoeff == Coeff
+      else {
+        if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
+          // If the coefficient is the product of a constant and other stuff,
+          // we can use the constant in the GCD computation.
+          Constant = getConstantPart(Product);
+        else
+          Constant = cast<SCEVConstant>(Coeff);
+        APInt ConstCoeff = Constant->getValue()->getValue();
+        RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+      }
+      Inner = AddRec->getStart();
+    }
+    Inner = Dst;
+    while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) {
+      AddRec = cast<SCEVAddRecExpr>(Inner);
+      const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
+      if (CurLoop == AddRec->getLoop())
+        DstCoeff = Coeff;
+      else {
+        if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
+          // If the coefficient is the product of a constant and other stuff,
+          // we can use the constant in the GCD computation.
+          Constant = getConstantPart(Product);
+        else
+          Constant = cast<SCEVConstant>(Coeff);
+        APInt ConstCoeff = Constant->getValue()->getValue();
+        RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+      }
+      Inner = AddRec->getStart();
+    }
+    Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff);
+    if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Delta))
+      // If the coefficient is the product of a constant and other stuff,
+      // we can use the constant in the GCD computation.
+      Constant = getConstantPart(Product);
+    else if (isa<SCEVConstant>(Delta))
+      Constant = cast<SCEVConstant>(Delta);
+    else {
+      // The difference of the two coefficients might not be a product
+      // or constant, in which case we give up on this direction.
+      continue;
+    }
+    APInt ConstCoeff = Constant->getValue()->getValue();
+    RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+    DEBUG(dbgs() << "\tRunningGCD = " << RunningGCD << "\n");
+    if (RunningGCD != 0) {
+      Remainder = ConstDelta.srem(RunningGCD);
+      DEBUG(dbgs() << "\tRemainder = " << Remainder << "\n");
+      if (Remainder != 0) {
+        unsigned Level = mapSrcLoop(CurLoop);
+        Result.DV[Level - 1].Direction &= unsigned(~Dependence::DVEntry::EQ);
+        Improved = true;
+      }
+    }
+  }
+  if (Improved)
+    ++GCDsuccesses;
+  DEBUG(dbgs() << "all done\n");
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// banerjeeMIVtest -
+// Use Banerjee's Inequalities to test an MIV subscript pair.
+// (Wolfe, in the race-car book, calls this the Extreme Value Test.)
+// Generally follows the discussion in Section 2.5.2 of
+//
+//    Optimizing Supercompilers for Supercomputers
+//    Michael Wolfe
+//
+// The inequalities given on page 25 are simplified in that loops are
+// normalized so that the lower bound is always 0 and the stride is always 1.
+// For example, Wolfe gives
+//
+//     LB^<_k = (A^-_k - B_k)^- (U_k - L_k - N_k) + (A_k - B_k)L_k - B_k N_k
+//
+// where A_k is the coefficient of the kth index in the source subscript,
+// B_k is the coefficient of the kth index in the destination subscript,
+// U_k is the upper bound of the kth index, L_k is the lower bound of the Kth
+// index, and N_k is the stride of the kth index. Since all loops are normalized
+// by the SCEV package, N_k = 1 and L_k = 0, allowing us to simplify the
+// equation to
+//
+//     LB^<_k = (A^-_k - B_k)^- (U_k - 0 - 1) + (A_k - B_k)0 - B_k 1
+//            = (A^-_k - B_k)^- (U_k - 1)  - B_k
+//
+// Similar simplifications are possible for the other equations.
+//
+// When we can't determine the number of iterations for a loop,
+// we use NULL as an indicator for the worst case, infinity.
+// When computing the upper bound, NULL denotes +inf;
+// for the lower bound, NULL denotes -inf.
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::banerjeeMIVtest(const SCEV *Src,
+                                         const SCEV *Dst,
+                                         const SmallBitVector &Loops,
+                                         FullDependence &Result) const {
+  DEBUG(dbgs() << "starting Banerjee\n");
+  ++BanerjeeApplications;
+  DEBUG(dbgs() << "    Src = " << *Src << '\n');
+  const SCEV *A0;
+  CoefficientInfo *A = collectCoeffInfo(Src, true, A0);
+  DEBUG(dbgs() << "    Dst = " << *Dst << '\n');
+  const SCEV *B0;
+  CoefficientInfo *B = collectCoeffInfo(Dst, false, B0);
+  BoundInfo *Bound = new BoundInfo[MaxLevels + 1];
+  const SCEV *Delta = SE->getMinusSCEV(B0, A0);
+  DEBUG(dbgs() << "\tDelta = " << *Delta << '\n');
+
+  // Compute bounds for all the * directions.
+  DEBUG(dbgs() << "\tBounds[*]\n");
+  for (unsigned K = 1; K <= MaxLevels; ++K) {
+    Bound[K].Iterations = A[K].Iterations ? A[K].Iterations : B[K].Iterations;
+    Bound[K].Direction = Dependence::DVEntry::ALL;
+    Bound[K].DirSet = Dependence::DVEntry::NONE;
+    findBoundsALL(A, B, Bound, K);
+#ifndef NDEBUG
+    DEBUG(dbgs() << "\t    " << K << '\t');
+    if (Bound[K].Lower[Dependence::DVEntry::ALL])
+      DEBUG(dbgs() << *Bound[K].Lower[Dependence::DVEntry::ALL] << '\t');
+    else
+      DEBUG(dbgs() << "-inf\t");
+    if (Bound[K].Upper[Dependence::DVEntry::ALL])
+      DEBUG(dbgs() << *Bound[K].Upper[Dependence::DVEntry::ALL] << '\n');
+    else
+      DEBUG(dbgs() << "+inf\n");
+#endif
+  }
+
+  // Test the *, *, *, ... case.
+  bool Disproved = false;
+  if (testBounds(Dependence::DVEntry::ALL, 0, Bound, Delta)) {
+    // Explore the direction vector hierarchy.
+    unsigned DepthExpanded = 0;
+    unsigned NewDeps = exploreDirections(1, A, B, Bound,
+                                         Loops, DepthExpanded, Delta);
+    if (NewDeps > 0) {
+      bool Improved = false;
+      for (unsigned K = 1; K <= CommonLevels; ++K) {
+        if (Loops[K]) {
+          unsigned Old = Result.DV[K - 1].Direction;
+          Result.DV[K - 1].Direction = Old & Bound[K].DirSet;
+          Improved |= Old != Result.DV[K - 1].Direction;
+          if (!Result.DV[K - 1].Direction) {
+            Improved = false;
+            Disproved = true;
+            break;
+          }
+        }
+      }
+      if (Improved)
+        ++BanerjeeSuccesses;
+    }
+    else {
+      ++BanerjeeIndependence;
+      Disproved = true;
+    }
+  }
+  else {
+    ++BanerjeeIndependence;
+    Disproved = true;
+  }
+  delete [] Bound;
+  delete [] A;
+  delete [] B;
+  return Disproved;
+}
+
+
+// Hierarchically expands the direction vector
+// search space, combining the directions of discovered dependences
+// in the DirSet field of Bound. Returns the number of distinct
+// dependences discovered. If the dependence is disproved,
+// it will return 0.
+unsigned DependenceAnalysis::exploreDirections(unsigned Level,
+                                               CoefficientInfo *A,
+                                               CoefficientInfo *B,
+                                               BoundInfo *Bound,
+                                               const SmallBitVector &Loops,
+                                               unsigned &DepthExpanded,
+                                               const SCEV *Delta) const {
+  if (Level > CommonLevels) {
+    // record result
+    DEBUG(dbgs() << "\t[");
+    for (unsigned K = 1; K <= CommonLevels; ++K) {
+      if (Loops[K]) {
+        Bound[K].DirSet |= Bound[K].Direction;
+#ifndef NDEBUG
+        switch (Bound[K].Direction) {
+        case Dependence::DVEntry::LT:
+          DEBUG(dbgs() << " <");
+          break;
+        case Dependence::DVEntry::EQ:
+          DEBUG(dbgs() << " =");
+          break;
+        case Dependence::DVEntry::GT:
+          DEBUG(dbgs() << " >");
+          break;
+        case Dependence::DVEntry::ALL:
+          DEBUG(dbgs() << " *");
+          break;
+        default:
+          llvm_unreachable("unexpected Bound[K].Direction");
+        }
+#endif
+      }
+    }
+    DEBUG(dbgs() << " ]\n");
+    return 1;
+  }
+  if (Loops[Level]) {
+    if (Level > DepthExpanded) {
+      DepthExpanded = Level;
+      // compute bounds for <, =, > at current level
+      findBoundsLT(A, B, Bound, Level);
+      findBoundsGT(A, B, Bound, Level);
+      findBoundsEQ(A, B, Bound, Level);
+#ifndef NDEBUG
+      DEBUG(dbgs() << "\tBound for level = " << Level << '\n');
+      DEBUG(dbgs() << "\t    <\t");
+      if (Bound[Level].Lower[Dependence::DVEntry::LT])
+        DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::LT] << '\t');
+      else
+        DEBUG(dbgs() << "-inf\t");
+      if (Bound[Level].Upper[Dependence::DVEntry::LT])
+        DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::LT] << '\n');
+      else
+        DEBUG(dbgs() << "+inf\n");
+      DEBUG(dbgs() << "\t    =\t");
+      if (Bound[Level].Lower[Dependence::DVEntry::EQ])
+        DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::EQ] << '\t');
+      else
+        DEBUG(dbgs() << "-inf\t");
+      if (Bound[Level].Upper[Dependence::DVEntry::EQ])
+        DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::EQ] << '\n');
+      else
+        DEBUG(dbgs() << "+inf\n");
+      DEBUG(dbgs() << "\t    >\t");
+      if (Bound[Level].Lower[Dependence::DVEntry::GT])
+        DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::GT] << '\t');
+      else
+        DEBUG(dbgs() << "-inf\t");
+      if (Bound[Level].Upper[Dependence::DVEntry::GT])
+        DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::GT] << '\n');
+      else
+        DEBUG(dbgs() << "+inf\n");
+#endif
+    }
+
+    unsigned NewDeps = 0;
+
+    // test bounds for <, *, *, ...
+    if (testBounds(Dependence::DVEntry::LT, Level, Bound, Delta))
+      NewDeps += exploreDirections(Level + 1, A, B, Bound,
+                                   Loops, DepthExpanded, Delta);
+
+    // Test bounds for =, *, *, ...
+    if (testBounds(Dependence::DVEntry::EQ, Level, Bound, Delta))
+      NewDeps += exploreDirections(Level + 1, A, B, Bound,
+                                   Loops, DepthExpanded, Delta);
+
+    // test bounds for >, *, *, ...
+    if (testBounds(Dependence::DVEntry::GT, Level, Bound, Delta))
+      NewDeps += exploreDirections(Level + 1, A, B, Bound,
+                                   Loops, DepthExpanded, Delta);
+
+    Bound[Level].Direction = Dependence::DVEntry::ALL;
+    return NewDeps;
+  }
+  else
+    return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, Delta);
+}
+
+
+// Returns true iff the current bounds are plausible.
+bool DependenceAnalysis::testBounds(unsigned char DirKind,
+                                    unsigned Level,
+                                    BoundInfo *Bound,
+                                    const SCEV *Delta) const {
+  Bound[Level].Direction = DirKind;
+  if (const SCEV *LowerBound = getLowerBound(Bound))
+    if (isKnownPredicate(CmpInst::ICMP_SGT, LowerBound, Delta))
+      return false;
+  if (const SCEV *UpperBound = getUpperBound(Bound))
+    if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, UpperBound))
+      return false;
+  return true;
+}
+
+
+// Computes the upper and lower bounds for level K
+// using the * direction. Records them in Bound.
+// Wolfe gives the equations
+//
+//    LB^*_k = (A^-_k - B^+_k)(U_k - L_k) + (A_k - B_k)L_k
+//    UB^*_k = (A^+_k - B^-_k)(U_k - L_k) + (A_k - B_k)L_k
+//
+// Since we normalize loops, we can simplify these equations to
+//
+//    LB^*_k = (A^-_k - B^+_k)U_k
+//    UB^*_k = (A^+_k - B^-_k)U_k
+//
+// We must be careful to handle the case where the upper bound is unknown.
+// Note that the lower bound is always <= 0
+// and the upper bound is always >= 0.
+void DependenceAnalysis::findBoundsALL(CoefficientInfo *A,
+                                       CoefficientInfo *B,
+                                       BoundInfo *Bound,
+                                       unsigned K) const {
+  Bound[K].Lower[Dependence::DVEntry::ALL] = NULL; // Default value = -infinity.
+  Bound[K].Upper[Dependence::DVEntry::ALL] = NULL; // Default value = +infinity.
+  if (Bound[K].Iterations) {
+    Bound[K].Lower[Dependence::DVEntry::ALL] =
+      SE->getMulExpr(SE->getMinusSCEV(A[K].NegPart, B[K].PosPart),
+                     Bound[K].Iterations);
+    Bound[K].Upper[Dependence::DVEntry::ALL] =
+      SE->getMulExpr(SE->getMinusSCEV(A[K].PosPart, B[K].NegPart),
+                     Bound[K].Iterations);
+  }
+  else {
+    // If the difference is 0, we won't need to know the number of iterations.
+    if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart))
+      Bound[K].Lower[Dependence::DVEntry::ALL] =
+        SE->getConstant(A[K].Coeff->getType(), 0);
+    if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].PosPart, B[K].NegPart))
+      Bound[K].Upper[Dependence::DVEntry::ALL] =
+        SE->getConstant(A[K].Coeff->getType(), 0);
+  }
+}
+
+
+// Computes the upper and lower bounds for level K
+// using the = direction. Records them in Bound.
+// Wolfe gives the equations
+//
+//    LB^=_k = (A_k - B_k)^- (U_k - L_k) + (A_k - B_k)L_k
+//    UB^=_k = (A_k - B_k)^+ (U_k - L_k) + (A_k - B_k)L_k
+//
+// Since we normalize loops, we can simplify these equations to
+//
+//    LB^=_k = (A_k - B_k)^- U_k
+//    UB^=_k = (A_k - B_k)^+ U_k
+//
+// We must be careful to handle the case where the upper bound is unknown.
+// Note that the lower bound is always <= 0
+// and the upper bound is always >= 0.
+void DependenceAnalysis::findBoundsEQ(CoefficientInfo *A,
+                                      CoefficientInfo *B,
+                                      BoundInfo *Bound,
+                                      unsigned K) const {
+  Bound[K].Lower[Dependence::DVEntry::EQ] = NULL; // Default value = -infinity.
+  Bound[K].Upper[Dependence::DVEntry::EQ] = NULL; // Default value = +infinity.
+  if (Bound[K].Iterations) {
+    const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff);
+    const SCEV *NegativePart = getNegativePart(Delta);
+    Bound[K].Lower[Dependence::DVEntry::EQ] =
+      SE->getMulExpr(NegativePart, Bound[K].Iterations);
+    const SCEV *PositivePart = getPositivePart(Delta);
+    Bound[K].Upper[Dependence::DVEntry::EQ] =
+      SE->getMulExpr(PositivePart, Bound[K].Iterations);
+  }
+  else {
+    // If the positive/negative part of the difference is 0,
+    // we won't need to know the number of iterations.
+    const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff);
+    const SCEV *NegativePart = getNegativePart(Delta);
+    if (NegativePart->isZero())
+      Bound[K].Lower[Dependence::DVEntry::EQ] = NegativePart; // Zero
+    const SCEV *PositivePart = getPositivePart(Delta);
+    if (PositivePart->isZero())
+      Bound[K].Upper[Dependence::DVEntry::EQ] = PositivePart; // Zero
+  }
+}
+
+
+// Computes the upper and lower bounds for level K
+// using the < direction. Records them in Bound.
+// Wolfe gives the equations
+//
+//    LB^<_k = (A^-_k - B_k)^- (U_k - L_k - N_k) + (A_k - B_k)L_k - B_k N_k
+//    UB^<_k = (A^+_k - B_k)^+ (U_k - L_k - N_k) + (A_k - B_k)L_k - B_k N_k
+//
+// Since we normalize loops, we can simplify these equations to
+//
+//    LB^<_k = (A^-_k - B_k)^- (U_k - 1) - B_k
+//    UB^<_k = (A^+_k - B_k)^+ (U_k - 1) - B_k
+//
+// We must be careful to handle the case where the upper bound is unknown.
+void DependenceAnalysis::findBoundsLT(CoefficientInfo *A,
+                                      CoefficientInfo *B,
+                                      BoundInfo *Bound,
+                                      unsigned K) const {
+  Bound[K].Lower[Dependence::DVEntry::LT] = NULL; // Default value = -infinity.
+  Bound[K].Upper[Dependence::DVEntry::LT] = NULL; // Default value = +infinity.
+  if (Bound[K].Iterations) {
+    const SCEV *Iter_1 =
+      SE->getMinusSCEV(Bound[K].Iterations,
+                       SE->getConstant(Bound[K].Iterations->getType(), 1));
+    const SCEV *NegPart =
+      getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
+    Bound[K].Lower[Dependence::DVEntry::LT] =
+      SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff);
+    const SCEV *PosPart =
+      getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
+    Bound[K].Upper[Dependence::DVEntry::LT] =
+      SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff);
+  }
+  else {
+    // If the positive/negative part of the difference is 0,
+    // we won't need to know the number of iterations.
+    const SCEV *NegPart =
+      getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
+    if (NegPart->isZero())
+      Bound[K].Lower[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff);
+    const SCEV *PosPart =
+      getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
+    if (PosPart->isZero())
+      Bound[K].Upper[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff);
+  }
+}
+
+
+// Computes the upper and lower bounds for level K
+// using the > direction. Records them in Bound.
+// Wolfe gives the equations
+//
+//    LB^>_k = (A_k - B^+_k)^- (U_k - L_k - N_k) + (A_k - B_k)L_k + A_k N_k
+//    UB^>_k = (A_k - B^-_k)^+ (U_k - L_k - N_k) + (A_k - B_k)L_k + A_k N_k
+//
+// Since we normalize loops, we can simplify these equations to
+//
+//    LB^>_k = (A_k - B^+_k)^- (U_k - 1) + A_k
+//    UB^>_k = (A_k - B^-_k)^+ (U_k - 1) + A_k
+//
+// We must be careful to handle the case where the upper bound is unknown.
+void DependenceAnalysis::findBoundsGT(CoefficientInfo *A,
+                                      CoefficientInfo *B,
+                                      BoundInfo *Bound,
+                                      unsigned K) const {
+  Bound[K].Lower[Dependence::DVEntry::GT] = NULL; // Default value = -infinity.
+  Bound[K].Upper[Dependence::DVEntry::GT] = NULL; // Default value = +infinity.
+  if (Bound[K].Iterations) {
+    const SCEV *Iter_1 =
+      SE->getMinusSCEV(Bound[K].Iterations,
+                       SE->getConstant(Bound[K].Iterations->getType(), 1));
+    const SCEV *NegPart =
+      getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
+    Bound[K].Lower[Dependence::DVEntry::GT] =
+      SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff);
+    const SCEV *PosPart =
+      getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
+    Bound[K].Upper[Dependence::DVEntry::GT] =
+      SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff);
+  }
+  else {
+    // If the positive/negative part of the difference is 0,
+    // we won't need to know the number of iterations.
+    const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
+    if (NegPart->isZero())
+      Bound[K].Lower[Dependence::DVEntry::GT] = A[K].Coeff;
+    const SCEV *PosPart = getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
+    if (PosPart->isZero())
+      Bound[K].Upper[Dependence::DVEntry::GT] = A[K].Coeff;
+  }
+}
+
+
+// X^+ = max(X, 0)
+const SCEV *DependenceAnalysis::getPositivePart(const SCEV *X) const {
+  return SE->getSMaxExpr(X, SE->getConstant(X->getType(), 0));
+}
+
+
+// X^- = min(X, 0)
+const SCEV *DependenceAnalysis::getNegativePart(const SCEV *X) const {
+  return SE->getSMinExpr(X, SE->getConstant(X->getType(), 0));
+}
+
+
+// Walks through the subscript,
+// collecting each coefficient, the associated loop bounds,
+// and recording its positive and negative parts for later use.
+DependenceAnalysis::CoefficientInfo *
+DependenceAnalysis::collectCoeffInfo(const SCEV *Subscript,
+                                     bool SrcFlag,
+                                     const SCEV *&Constant) const {
+  const SCEV *Zero = SE->getConstant(Subscript->getType(), 0);
+  CoefficientInfo *CI = new CoefficientInfo[MaxLevels + 1];
+  for (unsigned K = 1; K <= MaxLevels; ++K) {
+    CI[K].Coeff = Zero;
+    CI[K].PosPart = Zero;
+    CI[K].NegPart = Zero;
+    CI[K].Iterations = NULL;
+  }
+  while (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Subscript)) {
+    const Loop *L = AddRec->getLoop();
+    unsigned K = SrcFlag ? mapSrcLoop(L) : mapDstLoop(L);
+    CI[K].Coeff = AddRec->getStepRecurrence(*SE);
+    CI[K].PosPart = getPositivePart(CI[K].Coeff);
+    CI[K].NegPart = getNegativePart(CI[K].Coeff);
+    CI[K].Iterations = collectUpperBound(L, Subscript->getType());
+    Subscript = AddRec->getStart();
+  }
+  Constant = Subscript;
+#ifndef NDEBUG
+  DEBUG(dbgs() << "\tCoefficient Info\n");
+  for (unsigned K = 1; K <= MaxLevels; ++K) {
+    DEBUG(dbgs() << "\t    " << K << "\t" << *CI[K].Coeff);
+    DEBUG(dbgs() << "\tPos Part = ");
+    DEBUG(dbgs() << *CI[K].PosPart);
+    DEBUG(dbgs() << "\tNeg Part = ");
+    DEBUG(dbgs() << *CI[K].NegPart);
+    DEBUG(dbgs() << "\tUpper Bound = ");
+    if (CI[K].Iterations)
+      DEBUG(dbgs() << *CI[K].Iterations);
+    else
+      DEBUG(dbgs() << "+inf");
+    DEBUG(dbgs() << '\n');
+  }
+  DEBUG(dbgs() << "\t    Constant = " << *Subscript << '\n');
+#endif
+  return CI;
+}
+
+
+// Looks through all the bounds info and
+// computes the lower bound given the current direction settings
+// at each level. If the lower bound for any level is -inf,
+// the result is -inf.
+const SCEV *DependenceAnalysis::getLowerBound(BoundInfo *Bound) const {
+  const SCEV *Sum = Bound[1].Lower[Bound[1].Direction];
+  for (unsigned K = 2; Sum && K <= MaxLevels; ++K) {
+    if (Bound[K].Lower[Bound[K].Direction])
+      Sum = SE->getAddExpr(Sum, Bound[K].Lower[Bound[K].Direction]);
+    else
+      Sum = NULL;
+  }
+  return Sum;
+}
+
+
+// Looks through all the bounds info and
+// computes the upper bound given the current direction settings
+// at each level. If the upper bound at any level is +inf,
+// the result is +inf.
+const SCEV *DependenceAnalysis::getUpperBound(BoundInfo *Bound) const {
+  const SCEV *Sum = Bound[1].Upper[Bound[1].Direction];
+  for (unsigned K = 2; Sum && K <= MaxLevels; ++K) {
+    if (Bound[K].Upper[Bound[K].Direction])
+      Sum = SE->getAddExpr(Sum, Bound[K].Upper[Bound[K].Direction]);
+    else
+      Sum = NULL;
+  }
+  return Sum;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Constraint manipulation for Delta test.
+
+// Given a linear SCEV,
+// return the coefficient (the step)
+// corresponding to the specified loop.
+// If there isn't one, return 0.
+// For example, given a*i + b*j + c*k, zeroing the coefficient
+// corresponding to the j loop would yield b.
+const SCEV *DependenceAnalysis::findCoefficient(const SCEV *Expr,
+                                                const Loop *TargetLoop)  const {
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
+  if (!AddRec)
+    return SE->getConstant(Expr->getType(), 0);
+  if (AddRec->getLoop() == TargetLoop)
+    return AddRec->getStepRecurrence(*SE);
+  return findCoefficient(AddRec->getStart(), TargetLoop);
+}
+
+
+// Given a linear SCEV,
+// return the SCEV given by zeroing out the coefficient
+// corresponding to the specified loop.
+// For example, given a*i + b*j + c*k, zeroing the coefficient
+// corresponding to the j loop would yield a*i + c*k.
+const SCEV *DependenceAnalysis::zeroCoefficient(const SCEV *Expr,
+                                                const Loop *TargetLoop)  const {
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
+  if (!AddRec)
+    return Expr; // ignore
+  if (AddRec->getLoop() == TargetLoop)
+    return AddRec->getStart();
+  return SE->getAddRecExpr(zeroCoefficient(AddRec->getStart(), TargetLoop),
+                           AddRec->getStepRecurrence(*SE),
+                           AddRec->getLoop(),
+                           AddRec->getNoWrapFlags());
+}
+
+
+// Given a linear SCEV Expr,
+// return the SCEV given by adding some Value to the
+// coefficient corresponding to the specified TargetLoop.
+// For example, given a*i + b*j + c*k, adding 1 to the coefficient
+// corresponding to the j loop would yield a*i + (b+1)*j + c*k.
+const SCEV *DependenceAnalysis::addToCoefficient(const SCEV *Expr,
+                                                 const Loop *TargetLoop,
+                                                 const SCEV *Value)  const {
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
+  if (!AddRec) // create a new addRec
+    return SE->getAddRecExpr(Expr,
+                             Value,
+                             TargetLoop,
+                             SCEV::FlagAnyWrap); // Worst case, with no info.
+  if (AddRec->getLoop() == TargetLoop) {
+    const SCEV *Sum = SE->getAddExpr(AddRec->getStepRecurrence(*SE), Value);
+    if (Sum->isZero())
+      return AddRec->getStart();
+    return SE->getAddRecExpr(AddRec->getStart(),
+                             Sum,
+                             AddRec->getLoop(),
+                             AddRec->getNoWrapFlags());
+  }
+  return SE->getAddRecExpr(addToCoefficient(AddRec->getStart(),
+                                            TargetLoop, Value),
+                           AddRec->getStepRecurrence(*SE),
+                           AddRec->getLoop(),
+                           AddRec->getNoWrapFlags());
+}
+
+
+// Review the constraints, looking for opportunities
+// to simplify a subscript pair (Src and Dst).
+// Return true if some simplification occurs.
+// If the simplification isn't exact (that is, if it is conservative
+// in terms of dependence), set consistent to false.
+// Corresponds to Figure 5 from the paper
+//
+//            Practical Dependence Testing
+//            Goff, Kennedy, Tseng
+//            PLDI 1991
+bool DependenceAnalysis::propagate(const SCEV *&Src,
+                                   const SCEV *&Dst,
+                                   SmallBitVector &Loops,
+                                   SmallVector<Constraint, 4> &Constraints,
+                                   bool &Consistent) {
+  bool Result = false;
+  for (int LI = Loops.find_first(); LI >= 0; LI = Loops.find_next(LI)) {
+    DEBUG(dbgs() << "\t    Constraint[" << LI << "] is");
+    DEBUG(Constraints[LI].dump(dbgs()));
+    if (Constraints[LI].isDistance())
+      Result |= propagateDistance(Src, Dst, Constraints[LI], Consistent);
+    else if (Constraints[LI].isLine())
+      Result |= propagateLine(Src, Dst, Constraints[LI], Consistent);
+    else if (Constraints[LI].isPoint())
+      Result |= propagatePoint(Src, Dst, Constraints[LI]);
+  }
+  return Result;
+}
+
+
+// Attempt to propagate a distance
+// constraint into a subscript pair (Src and Dst).
+// Return true if some simplification occurs.
+// If the simplification isn't exact (that is, if it is conservative
+// in terms of dependence), set consistent to false.
+bool DependenceAnalysis::propagateDistance(const SCEV *&Src,
+                                           const SCEV *&Dst,
+                                           Constraint &CurConstraint,
+                                           bool &Consistent) {
+  const Loop *CurLoop = CurConstraint.getAssociatedLoop();
+  DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
+  const SCEV *A_K = findCoefficient(Src, CurLoop);
+  if (A_K->isZero())
+    return false;
+  const SCEV *DA_K = SE->getMulExpr(A_K, CurConstraint.getD());
+  Src = SE->getMinusSCEV(Src, DA_K);
+  Src = zeroCoefficient(Src, CurLoop);
+  DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
+  DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
+  Dst = addToCoefficient(Dst, CurLoop, SE->getNegativeSCEV(A_K));
+  DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
+  if (!findCoefficient(Dst, CurLoop)->isZero())
+    Consistent = false;
+  return true;
+}
+
+
+// Attempt to propagate a line
+// constraint into a subscript pair (Src and Dst).
+// Return true if some simplification occurs.
+// If the simplification isn't exact (that is, if it is conservative
+// in terms of dependence), set consistent to false.
+bool DependenceAnalysis::propagateLine(const SCEV *&Src,
+                                       const SCEV *&Dst,
+                                       Constraint &CurConstraint,
+                                       bool &Consistent) {
+  const Loop *CurLoop = CurConstraint.getAssociatedLoop();
+  const SCEV *A = CurConstraint.getA();
+  const SCEV *B = CurConstraint.getB();
+  const SCEV *C = CurConstraint.getC();
+  DEBUG(dbgs() << "\t\tA = " << *A << ", B = " << *B << ", C = " << *C << "\n");
+  DEBUG(dbgs() << "\t\tSrc = " << *Src << "\n");
+  DEBUG(dbgs() << "\t\tDst = " << *Dst << "\n");
+  if (A->isZero()) {
+    const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B);
+    const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
+    if (!Bconst || !Cconst) return false;
+    APInt Beta = Bconst->getValue()->getValue();
+    APInt Charlie = Cconst->getValue()->getValue();
+    APInt CdivB = Charlie.sdiv(Beta);
+    assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B");
+    const SCEV *AP_K = findCoefficient(Dst, CurLoop);
+    //    Src = SE->getAddExpr(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
+    Src = SE->getMinusSCEV(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
+    Dst = zeroCoefficient(Dst, CurLoop);
+    if (!findCoefficient(Src, CurLoop)->isZero())
+      Consistent = false;
+  }
+  else if (B->isZero()) {
+    const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A);
+    const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
+    if (!Aconst || !Cconst) return false;
+    APInt Alpha = Aconst->getValue()->getValue();
+    APInt Charlie = Cconst->getValue()->getValue();
+    APInt CdivA = Charlie.sdiv(Alpha);
+    assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A");
+    const SCEV *A_K = findCoefficient(Src, CurLoop);
+    Src = SE->getAddExpr(Src, SE->getMulExpr(A_K, SE->getConstant(CdivA)));
+    Src = zeroCoefficient(Src, CurLoop);
+    if (!findCoefficient(Dst, CurLoop)->isZero())
+      Consistent = false;
+  }
+  else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) {
+    const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A);
+    const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
+    if (!Aconst || !Cconst) return false;
+    APInt Alpha = Aconst->getValue()->getValue();
+    APInt Charlie = Cconst->getValue()->getValue();
+    APInt CdivA = Charlie.sdiv(Alpha);
+    assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A");
+    const SCEV *A_K = findCoefficient(Src, CurLoop);
+    Src = SE->getAddExpr(Src, SE->getMulExpr(A_K, SE->getConstant(CdivA)));
+    Src = zeroCoefficient(Src, CurLoop);
+    Dst = addToCoefficient(Dst, CurLoop, A_K);
+    if (!findCoefficient(Dst, CurLoop)->isZero())
+      Consistent = false;
+  }
+  else {
+    // paper is incorrect here, or perhaps just misleading
+    const SCEV *A_K = findCoefficient(Src, CurLoop);
+    Src = SE->getMulExpr(Src, A);
+    Dst = SE->getMulExpr(Dst, A);
+    Src = SE->getAddExpr(Src, SE->getMulExpr(A_K, C));
+    Src = zeroCoefficient(Src, CurLoop);
+    Dst = addToCoefficient(Dst, CurLoop, SE->getMulExpr(A_K, B));
+    if (!findCoefficient(Dst, CurLoop)->isZero())
+      Consistent = false;
+  }
+  DEBUG(dbgs() << "\t\tnew Src = " << *Src << "\n");
+  DEBUG(dbgs() << "\t\tnew Dst = " << *Dst << "\n");
+  return true;
+}
+
+
+// Attempt to propagate a point
+// constraint into a subscript pair (Src and Dst).
+// Return true if some simplification occurs.
+bool DependenceAnalysis::propagatePoint(const SCEV *&Src,
+                                        const SCEV *&Dst,
+                                        Constraint &CurConstraint) {
+  const Loop *CurLoop = CurConstraint.getAssociatedLoop();
+  const SCEV *A_K = findCoefficient(Src, CurLoop);
+  const SCEV *AP_K = findCoefficient(Dst, CurLoop);
+  const SCEV *XA_K = SE->getMulExpr(A_K, CurConstraint.getX());
+  const SCEV *YAP_K = SE->getMulExpr(AP_K, CurConstraint.getY());
+  DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
+  Src = SE->getAddExpr(Src, SE->getMinusSCEV(XA_K, YAP_K));
+  Src = zeroCoefficient(Src, CurLoop);
+  DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
+  DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
+  Dst = zeroCoefficient(Dst, CurLoop);
+  DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
+  return true;
+}
+
+
+// Update direction vector entry based on the current constraint.
+void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level,
+                                         const Constraint &CurConstraint
+                                         ) const {
+  DEBUG(dbgs() << "\tUpdate direction, constraint =");
+  DEBUG(CurConstraint.dump(dbgs()));
+  if (CurConstraint.isAny())
+    ; // use defaults
+  else if (CurConstraint.isDistance()) {
+    // this one is consistent, the others aren't
+    Level.Scalar = false;
+    Level.Distance = CurConstraint.getD();
+    unsigned NewDirection = Dependence::DVEntry::NONE;
+    if (!SE->isKnownNonZero(Level.Distance)) // if may be zero
+      NewDirection = Dependence::DVEntry::EQ;
+    if (!SE->isKnownNonPositive(Level.Distance)) // if may be positive
+      NewDirection |= Dependence::DVEntry::LT;
+    if (!SE->isKnownNonNegative(Level.Distance)) // if may be negative
+      NewDirection |= Dependence::DVEntry::GT;
+    Level.Direction &= NewDirection;
+  }
+  else if (CurConstraint.isLine()) {
+    Level.Scalar = false;
+    Level.Distance = NULL;
+    // direction should be accurate
+  }
+  else if (CurConstraint.isPoint()) {
+    Level.Scalar = false;
+    Level.Distance = NULL;
+    unsigned NewDirection = Dependence::DVEntry::NONE;
+    if (!isKnownPredicate(CmpInst::ICMP_NE,
+                          CurConstraint.getY(),
+                          CurConstraint.getX()))
+      // if X may be = Y
+      NewDirection |= Dependence::DVEntry::EQ;
+    if (!isKnownPredicate(CmpInst::ICMP_SLE,
+                          CurConstraint.getY(),
+                          CurConstraint.getX()))
+      // if Y may be > X
+      NewDirection |= Dependence::DVEntry::LT;
+    if (!isKnownPredicate(CmpInst::ICMP_SGE,
+                          CurConstraint.getY(),
+                          CurConstraint.getX()))
+      // if Y may be < X
+      NewDirection |= Dependence::DVEntry::GT;
+    Level.Direction &= NewDirection;
+  }
+  else
+    llvm_unreachable("constraint has unexpected kind");
+}
+
+
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+// For debugging purposes, dump a small bit vector to dbgs().
+static void dumpSmallBitVector(SmallBitVector &BV) {
+  dbgs() << "{";
+  for (int VI = BV.find_first(); VI >= 0; VI = BV.find_next(VI)) {
+    dbgs() << VI;
+    if (BV.find_next(VI) >= 0)
+      dbgs() << ' ';
+  }
+  dbgs() << "}\n";
+}
+#endif
+
+
+// depends -
+// Returns NULL if there is no dependence.
+// Otherwise, return a Dependence with as many details as possible.
+// Corresponds to Section 3.1 in the paper
+//
+//            Practical Dependence Testing
+//            Goff, Kennedy, Tseng
+//            PLDI 1991
+//
+// Care is required to keep the routine below, getSplitIteration(),
+// up to date with respect to this routine.
+Dependence *DependenceAnalysis::depends(Instruction *Src,
+                                        Instruction *Dst,
+                                        bool PossiblyLoopIndependent) {
+  if (Src == Dst)
+    PossiblyLoopIndependent = false;
+
+  if ((!Src->mayReadFromMemory() && !Src->mayWriteToMemory()) ||
+      (!Dst->mayReadFromMemory() && !Dst->mayWriteToMemory()))
+    // if both instructions don't reference memory, there's no dependence
+    return NULL;
+
+  if (!isLoadOrStore(Src) || !isLoadOrStore(Dst)) {
+    // can only analyze simple loads and stores, i.e., no calls, invokes, etc.
+    DEBUG(dbgs() << "can only handle simple loads and stores\n");
+    return new Dependence(Src, Dst);
+  }
+
+  Value *SrcPtr = getPointerOperand(Src);
+  Value *DstPtr = getPointerOperand(Dst);
+
+  switch (underlyingObjectsAlias(AA, DstPtr, SrcPtr)) {
+  case AliasAnalysis::MayAlias:
+  case AliasAnalysis::PartialAlias:
+    // cannot analyse objects if we don't understand their aliasing.
+    DEBUG(dbgs() << "can't analyze may or partial alias\n");
+    return new Dependence(Src, Dst);
+  case AliasAnalysis::NoAlias:
+    // If the objects noalias, they are distinct, accesses are independent.
+    DEBUG(dbgs() << "no alias\n");
+    return NULL;
+  case AliasAnalysis::MustAlias:
+    break; // The underlying objects alias; test accesses for dependence.
+  }
+
+  // establish loop nesting levels
+  establishNestingLevels(Src, Dst);
+  DEBUG(dbgs() << "    common nesting levels = " << CommonLevels << "\n");
+  DEBUG(dbgs() << "    maximum nesting levels = " << MaxLevels << "\n");
+
+  FullDependence Result(Src, Dst, PossiblyLoopIndependent, CommonLevels);
+  ++TotalArrayPairs;
+
+  // See if there are GEPs we can use.
+  bool UsefulGEP = false;
+  GEPOperator *SrcGEP = dyn_cast<GEPOperator>(SrcPtr);
+  GEPOperator *DstGEP = dyn_cast<GEPOperator>(DstPtr);
+  if (SrcGEP && DstGEP &&
+      SrcGEP->getPointerOperandType() == DstGEP->getPointerOperandType()) {
+    const SCEV *SrcPtrSCEV = SE->getSCEV(SrcGEP->getPointerOperand());
+    const SCEV *DstPtrSCEV = SE->getSCEV(DstGEP->getPointerOperand());
+    DEBUG(dbgs() << "    SrcPtrSCEV = " << *SrcPtrSCEV << "\n");
+    DEBUG(dbgs() << "    DstPtrSCEV = " << *DstPtrSCEV << "\n");
+
+    UsefulGEP =
+      isLoopInvariant(SrcPtrSCEV, LI->getLoopFor(Src->getParent())) &&
+      isLoopInvariant(DstPtrSCEV, LI->getLoopFor(Dst->getParent()));
+  }
+  unsigned Pairs = UsefulGEP ? SrcGEP->idx_end() - SrcGEP->idx_begin() : 1;
+  SmallVector<Subscript, 4> Pair(Pairs);
+  if (UsefulGEP) {
+    DEBUG(dbgs() << "    using GEPs\n");
+    unsigned P = 0;
+    for (GEPOperator::const_op_iterator SrcIdx = SrcGEP->idx_begin(),
+           SrcEnd = SrcGEP->idx_end(),
+           DstIdx = DstGEP->idx_begin();
+         SrcIdx != SrcEnd;
+         ++SrcIdx, ++DstIdx, ++P) {
+      Pair[P].Src = SE->getSCEV(*SrcIdx);
+      Pair[P].Dst = SE->getSCEV(*DstIdx);
+    }
+  }
+  else {
+    DEBUG(dbgs() << "    ignoring GEPs\n");
+    const SCEV *SrcSCEV = SE->getSCEV(SrcPtr);
+    const SCEV *DstSCEV = SE->getSCEV(DstPtr);
+    DEBUG(dbgs() << "    SrcSCEV = " << *SrcSCEV << "\n");
+    DEBUG(dbgs() << "    DstSCEV = " << *DstSCEV << "\n");
+    Pair[0].Src = SrcSCEV;
+    Pair[0].Dst = DstSCEV;
+  }
+
+  for (unsigned P = 0; P < Pairs; ++P) {
+    Pair[P].Loops.resize(MaxLevels + 1);
+    Pair[P].GroupLoops.resize(MaxLevels + 1);
+    Pair[P].Group.resize(Pairs);
+    removeMatchingExtensions(&Pair[P]);
+    Pair[P].Classification =
+      classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()),
+                   Pair[P].Dst, LI->getLoopFor(Dst->getParent()),
+                   Pair[P].Loops);
+    Pair[P].GroupLoops = Pair[P].Loops;
+    Pair[P].Group.set(P);
+    DEBUG(dbgs() << "    subscript " << P << "\n");
+    DEBUG(dbgs() << "\tsrc = " << *Pair[P].Src << "\n");
+    DEBUG(dbgs() << "\tdst = " << *Pair[P].Dst << "\n");
+    DEBUG(dbgs() << "\tclass = " << Pair[P].Classification << "\n");
+    DEBUG(dbgs() << "\tloops = ");
+    DEBUG(dumpSmallBitVector(Pair[P].Loops));
+  }
+
+  SmallBitVector Separable(Pairs);
+  SmallBitVector Coupled(Pairs);
+
+  // Partition subscripts into separable and minimally-coupled groups
+  // Algorithm in paper is algorithmically better;
+  // this may be faster in practice. Check someday.
+  //
+  // Here's an example of how it works. Consider this code:
+  //
+  //   for (i = ...) {
+  //     for (j = ...) {
+  //       for (k = ...) {
+  //         for (l = ...) {
+  //           for (m = ...) {
+  //             A[i][j][k][m] = ...;
+  //             ... = A[0][j][l][i + j];
+  //           }
+  //         }
+  //       }
+  //     }
+  //   }
+  //
+  // There are 4 subscripts here:
+  //    0 [i] and [0]
+  //    1 [j] and [j]
+  //    2 [k] and [l]
+  //    3 [m] and [i + j]
+  //
+  // We've already classified each subscript pair as ZIV, SIV, etc.,
+  // and collected all the loops mentioned by pair P in Pair[P].Loops.
+  // In addition, we've initialized Pair[P].GroupLoops to Pair[P].Loops
+  // and set Pair[P].Group = {P}.
+  //
+  //      Src Dst    Classification Loops  GroupLoops Group
+  //    0 [i] [0]         SIV       {1}      {1}        {0}
+  //    1 [j] [j]         SIV       {2}      {2}        {1}
+  //    2 [k] [l]         RDIV      {3,4}    {3,4}      {2}
+  //    3 [m] [i + j]     MIV       {1,2,5}  {1,2,5}    {3}
+  //
+  // For each subscript SI 0 .. 3, we consider each remaining subscript, SJ.
+  // So, 0 is compared against 1, 2, and 3; 1 is compared against 2 and 3, etc.
+  //
+  // We begin by comparing 0 and 1. The intersection of the GroupLoops is empty.
+  // Next, 0 and 2. Again, the intersection of their GroupLoops is empty.
+  // Next 0 and 3. The intersection of their GroupLoop = {1}, not empty,
+  // so Pair[3].Group = {0,3} and Done = false (that is, 0 will not be added
+  // to either Separable or Coupled).
+  //
+  // Next, we consider 1 and 2. The intersection of the GroupLoops is empty.
+  // Next, 1 and 3. The intersectionof their GroupLoops = {2}, not empty,
+  // so Pair[3].Group = {0, 1, 3} and Done = false.
+  //
+  // Next, we compare 2 against 3. The intersection of the GroupLoops is empty.
+  // Since Done remains true, we add 2 to the set of Separable pairs.
+  //
+  // Finally, we consider 3. There's nothing to compare it with,
+  // so Done remains true and we add it to the Coupled set.
+  // Pair[3].Group = {0, 1, 3} and GroupLoops = {1, 2, 5}.
+  //
+  // In the end, we've got 1 separable subscript and 1 coupled group.
+  for (unsigned SI = 0; SI < Pairs; ++SI) {
+    if (Pair[SI].Classification == Subscript::NonLinear) {
+      // ignore these, but collect loops for later
+      ++NonlinearSubscriptPairs;
+      collectCommonLoops(Pair[SI].Src,
+                         LI->getLoopFor(Src->getParent()),
+                         Pair[SI].Loops);
+      collectCommonLoops(Pair[SI].Dst,
+                         LI->getLoopFor(Dst->getParent()),
+                         Pair[SI].Loops);
+      Result.Consistent = false;
+    }
+    else if (Pair[SI].Classification == Subscript::ZIV) {
+      // always separable
+      Separable.set(SI);
+    }
+    else {
+      // SIV, RDIV, or MIV, so check for coupled group
+      bool Done = true;
+      for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) {
+        SmallBitVector Intersection = Pair[SI].GroupLoops;
+        Intersection &= Pair[SJ].GroupLoops;
+        if (Intersection.any()) {
+          // accumulate set of all the loops in group
+          Pair[SJ].GroupLoops |= Pair[SI].GroupLoops;
+          // accumulate set of all subscripts in group
+          Pair[SJ].Group |= Pair[SI].Group;
+          Done = false;
+        }
+      }
+      if (Done) {
+        if (Pair[SI].Group.count() == 1) {
+          Separable.set(SI);
+          ++SeparableSubscriptPairs;
+        }
+        else {
+          Coupled.set(SI);
+          ++CoupledSubscriptPairs;
+        }
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "    Separable = ");
+  DEBUG(dumpSmallBitVector(Separable));
+  DEBUG(dbgs() << "    Coupled = ");
+  DEBUG(dumpSmallBitVector(Coupled));
+
+  Constraint NewConstraint;
+  NewConstraint.setAny(SE);
+
+  // test separable subscripts
+  for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) {
+    DEBUG(dbgs() << "testing subscript " << SI);
+    switch (Pair[SI].Classification) {
+    case Subscript::ZIV:
+      DEBUG(dbgs() << ", ZIV\n");
+      if (testZIV(Pair[SI].Src, Pair[SI].Dst, Result))
+        return NULL;
+      break;
+    case Subscript::SIV: {
+      DEBUG(dbgs() << ", SIV\n");
+      unsigned Level;
+      const SCEV *SplitIter = NULL;
+      if (testSIV(Pair[SI].Src, Pair[SI].Dst, Level,
+                  Result, NewConstraint, SplitIter))
+        return NULL;
+      break;
+    }
+    case Subscript::RDIV:
+      DEBUG(dbgs() << ", RDIV\n");
+      if (testRDIV(Pair[SI].Src, Pair[SI].Dst, Result))
+        return NULL;
+      break;
+    case Subscript::MIV:
+      DEBUG(dbgs() << ", MIV\n");
+      if (testMIV(Pair[SI].Src, Pair[SI].Dst, Pair[SI].Loops, Result))
+        return NULL;
+      break;
+    default:
+      llvm_unreachable("subscript has unexpected classification");
+    }
+  }
+
+  if (Coupled.count()) {
+    // test coupled subscript groups
+    DEBUG(dbgs() << "starting on coupled subscripts\n");
+    DEBUG(dbgs() << "MaxLevels + 1 = " << MaxLevels + 1 << "\n");
+    SmallVector<Constraint, 4> Constraints(MaxLevels + 1);
+    for (unsigned II = 0; II <= MaxLevels; ++II)
+      Constraints[II].setAny(SE);
+    for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) {
+      DEBUG(dbgs() << "testing subscript group " << SI << " { ");
+      SmallBitVector Group(Pair[SI].Group);
+      SmallBitVector Sivs(Pairs);
+      SmallBitVector Mivs(Pairs);
+      SmallBitVector ConstrainedLevels(MaxLevels + 1);
+      for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) {
+        DEBUG(dbgs() << SJ << " ");
+        if (Pair[SJ].Classification == Subscript::SIV)
+          Sivs.set(SJ);
+        else
+          Mivs.set(SJ);
+      }
+      DEBUG(dbgs() << "}\n");
+      while (Sivs.any()) {
+        bool Changed = false;
+        for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) {
+          DEBUG(dbgs() << "testing subscript " << SJ << ", SIV\n");
+          // SJ is an SIV subscript that's part of the current coupled group
+          unsigned Level;
+          const SCEV *SplitIter = NULL;
+          DEBUG(dbgs() << "SIV\n");
+          if (testSIV(Pair[SJ].Src, Pair[SJ].Dst, Level,
+                      Result, NewConstraint, SplitIter))
+            return NULL;
+          ConstrainedLevels.set(Level);
+          if (intersectConstraints(&Constraints[Level], &NewConstraint)) {
+            if (Constraints[Level].isEmpty()) {
+              ++DeltaIndependence;
+              return NULL;
+            }
+            Changed = true;
+          }
+          Sivs.reset(SJ);
+        }
+        if (Changed) {
+          // propagate, possibly creating new SIVs and ZIVs
+          DEBUG(dbgs() << "    propagating\n");
+          DEBUG(dbgs() << "\tMivs = ");
+          DEBUG(dumpSmallBitVector(Mivs));
+          for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+            // SJ is an MIV subscript that's part of the current coupled group
+            DEBUG(dbgs() << "\tSJ = " << SJ << "\n");
+            if (propagate(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops,
+                          Constraints, Result.Consistent)) {
+              DEBUG(dbgs() << "\t    Changed\n");
+              ++DeltaPropagations;
+              Pair[SJ].Classification =
+                classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()),
+                             Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()),
+                             Pair[SJ].Loops);
+              switch (Pair[SJ].Classification) {
+              case Subscript::ZIV:
+                DEBUG(dbgs() << "ZIV\n");
+                if (testZIV(Pair[SJ].Src, Pair[SJ].Dst, Result))
+                  return NULL;
+                Mivs.reset(SJ);
+                break;
+              case Subscript::SIV:
+                Sivs.set(SJ);
+                Mivs.reset(SJ);
+                break;
+              case Subscript::RDIV:
+              case Subscript::MIV:
+                break;
+              default:
+                llvm_unreachable("bad subscript classification");
+              }
+            }
+          }
+        }
+      }
+
+      // test & propagate remaining RDIVs
+      for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+        if (Pair[SJ].Classification == Subscript::RDIV) {
+          DEBUG(dbgs() << "RDIV test\n");
+          if (testRDIV(Pair[SJ].Src, Pair[SJ].Dst, Result))
+            return NULL;
+          // I don't yet understand how to propagate RDIV results
+          Mivs.reset(SJ);
+        }
+      }
+
+      // test remaining MIVs
+      // This code is temporary.
+      // Better to somehow test all remaining subscripts simultaneously.
+      for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+        if (Pair[SJ].Classification == Subscript::MIV) {
+          DEBUG(dbgs() << "MIV test\n");
+          if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result))
+            return NULL;
+        }
+        else
+          llvm_unreachable("expected only MIV subscripts at this point");
+      }
+
+      // update Result.DV from constraint vector
+      DEBUG(dbgs() << "    updating\n");
+      for (int SJ = ConstrainedLevels.find_first();
+           SJ >= 0; SJ = ConstrainedLevels.find_next(SJ)) {
+        updateDirection(Result.DV[SJ - 1], Constraints[SJ]);
+        if (Result.DV[SJ - 1].Direction == Dependence::DVEntry::NONE)
+          return NULL;
+      }
+    }
+  }
+
+  // Make sure the Scalar flags are set correctly.
+  SmallBitVector CompleteLoops(MaxLevels + 1);
+  for (unsigned SI = 0; SI < Pairs; ++SI)
+    CompleteLoops |= Pair[SI].Loops;
+  for (unsigned II = 1; II <= CommonLevels; ++II)
+    if (CompleteLoops[II])
+      Result.DV[II - 1].Scalar = false;
+
+  if (PossiblyLoopIndependent) {
+    // Make sure the LoopIndependent flag is set correctly.
+    // All directions must include equal, otherwise no
+    // loop-independent dependence is possible.
+    for (unsigned II = 1; II <= CommonLevels; ++II) {
+      if (!(Result.getDirection(II) & Dependence::DVEntry::EQ)) {
+        Result.LoopIndependent = false;
+        break;
+      }
+    }
+  }
+  else {
+    // On the other hand, if all directions are equal and there's no
+    // loop-independent dependence possible, then no dependence exists.
+    bool AllEqual = true;
+    for (unsigned II = 1; II <= CommonLevels; ++II) {
+      if (Result.getDirection(II) != Dependence::DVEntry::EQ) {
+        AllEqual = false;
+        break;
+      }
+    }
+    if (AllEqual)
+      return NULL;
+  }
+
+  FullDependence *Final = new FullDependence(Result);
+  Result.DV = NULL;
+  return Final;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// getSplitIteration -
+// Rather than spend rarely-used space recording the splitting iteration
+// during the Weak-Crossing SIV test, we re-compute it on demand.
+// The re-computation is basically a repeat of the entire dependence test,
+// though simplified since we know that the dependence exists.
+// It's tedious, since we must go through all propagations, etc.
+//
+// Care is required to keep this code up to date with respect to the routine
+// above, depends().
+//
+// Generally, the dependence analyzer will be used to build
+// a dependence graph for a function (basically a map from instructions
+// to dependences). Looking for cycles in the graph shows us loops
+// that cannot be trivially vectorized/parallelized.
+//
+// We can try to improve the situation by examining all the dependences
+// that make up the cycle, looking for ones we can break.
+// Sometimes, peeling the first or last iteration of a loop will break
+// dependences, and we've got flags for those possibilities.
+// Sometimes, splitting a loop at some other iteration will do the trick,
+// and we've got a flag for that case. Rather than waste the space to
+// record the exact iteration (since we rarely know), we provide
+// a method that calculates the iteration. It's a drag that it must work
+// from scratch, but wonderful in that it's possible.
+//
+// Here's an example:
+//
+//    for (i = 0; i < 10; i++)
+//        A[i] = ...
+//        ... = A[11 - i]
+//
+// There's a loop-carried flow dependence from the store to the load,
+// found by the weak-crossing SIV test. The dependence will have a flag,
+// indicating that the dependence can be broken by splitting the loop.
+// Calling getSplitIteration will return 5.
+// Splitting the loop breaks the dependence, like so:
+//
+//    for (i = 0; i <= 5; i++)
+//        A[i] = ...
+//        ... = A[11 - i]
+//    for (i = 6; i < 10; i++)
+//        A[i] = ...
+//        ... = A[11 - i]
+//
+// breaks the dependence and allows us to vectorize/parallelize
+// both loops.
+const  SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep,
+                                                   unsigned SplitLevel) {
+  assert(Dep && "expected a pointer to a Dependence");
+  assert(Dep->isSplitable(SplitLevel) &&
+         "Dep should be splitable at SplitLevel");
+  Instruction *Src = Dep->getSrc();
+  Instruction *Dst = Dep->getDst();
+  assert(Src->mayReadFromMemory() || Src->mayWriteToMemory());
+  assert(Dst->mayReadFromMemory() || Dst->mayWriteToMemory());
+  assert(isLoadOrStore(Src));
+  assert(isLoadOrStore(Dst));
+  Value *SrcPtr = getPointerOperand(Src);
+  Value *DstPtr = getPointerOperand(Dst);
+  assert(underlyingObjectsAlias(AA, DstPtr, SrcPtr) ==
+         AliasAnalysis::MustAlias);
+
+  // establish loop nesting levels
+  establishNestingLevels(Src, Dst);
+
+  FullDependence Result(Src, Dst, false, CommonLevels);
+
+  // See if there are GEPs we can use.
+  bool UsefulGEP = false;
+  GEPOperator *SrcGEP = dyn_cast<GEPOperator>(SrcPtr);
+  GEPOperator *DstGEP = dyn_cast<GEPOperator>(DstPtr);
+  if (SrcGEP && DstGEP &&
+      SrcGEP->getPointerOperandType() == DstGEP->getPointerOperandType()) {
+    const SCEV *SrcPtrSCEV = SE->getSCEV(SrcGEP->getPointerOperand());
+    const SCEV *DstPtrSCEV = SE->getSCEV(DstGEP->getPointerOperand());
+    UsefulGEP =
+      isLoopInvariant(SrcPtrSCEV, LI->getLoopFor(Src->getParent())) &&
+      isLoopInvariant(DstPtrSCEV, LI->getLoopFor(Dst->getParent()));
+  }
+  unsigned Pairs = UsefulGEP ? SrcGEP->idx_end() - SrcGEP->idx_begin() : 1;
+  SmallVector<Subscript, 4> Pair(Pairs);
+  if (UsefulGEP) {
+    unsigned P = 0;
+    for (GEPOperator::const_op_iterator SrcIdx = SrcGEP->idx_begin(),
+           SrcEnd = SrcGEP->idx_end(),
+           DstIdx = DstGEP->idx_begin();
+         SrcIdx != SrcEnd;
+         ++SrcIdx, ++DstIdx, ++P) {
+      Pair[P].Src = SE->getSCEV(*SrcIdx);
+      Pair[P].Dst = SE->getSCEV(*DstIdx);
+    }
+  }
+  else {
+    const SCEV *SrcSCEV = SE->getSCEV(SrcPtr);
+    const SCEV *DstSCEV = SE->getSCEV(DstPtr);
+    Pair[0].Src = SrcSCEV;
+    Pair[0].Dst = DstSCEV;
+  }
+
+  for (unsigned P = 0; P < Pairs; ++P) {
+    Pair[P].Loops.resize(MaxLevels + 1);
+    Pair[P].GroupLoops.resize(MaxLevels + 1);
+    Pair[P].Group.resize(Pairs);
+    removeMatchingExtensions(&Pair[P]);
+    Pair[P].Classification =
+      classifyPair(Pair[P].Src, LI->getLoopFor(Src->getParent()),
+                   Pair[P].Dst, LI->getLoopFor(Dst->getParent()),
+                   Pair[P].Loops);
+    Pair[P].GroupLoops = Pair[P].Loops;
+    Pair[P].Group.set(P);
+  }
+
+  SmallBitVector Separable(Pairs);
+  SmallBitVector Coupled(Pairs);
+
+  // partition subscripts into separable and minimally-coupled groups
+  for (unsigned SI = 0; SI < Pairs; ++SI) {
+    if (Pair[SI].Classification == Subscript::NonLinear) {
+      // ignore these, but collect loops for later
+      collectCommonLoops(Pair[SI].Src,
+                         LI->getLoopFor(Src->getParent()),
+                         Pair[SI].Loops);
+      collectCommonLoops(Pair[SI].Dst,
+                         LI->getLoopFor(Dst->getParent()),
+                         Pair[SI].Loops);
+      Result.Consistent = false;
+    }
+    else if (Pair[SI].Classification == Subscript::ZIV)
+      Separable.set(SI);
+    else {
+      // SIV, RDIV, or MIV, so check for coupled group
+      bool Done = true;
+      for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) {
+        SmallBitVector Intersection = Pair[SI].GroupLoops;
+        Intersection &= Pair[SJ].GroupLoops;
+        if (Intersection.any()) {
+          // accumulate set of all the loops in group
+          Pair[SJ].GroupLoops |= Pair[SI].GroupLoops;
+          // accumulate set of all subscripts in group
+          Pair[SJ].Group |= Pair[SI].Group;
+          Done = false;
+        }
+      }
+      if (Done) {
+        if (Pair[SI].Group.count() == 1)
+          Separable.set(SI);
+        else
+          Coupled.set(SI);
+      }
+    }
+  }
+
+  Constraint NewConstraint;
+  NewConstraint.setAny(SE);
+
+  // test separable subscripts
+  for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) {
+    switch (Pair[SI].Classification) {
+    case Subscript::SIV: {
+      unsigned Level;
+      const SCEV *SplitIter = NULL;
+      (void) testSIV(Pair[SI].Src, Pair[SI].Dst, Level,
+                     Result, NewConstraint, SplitIter);
+      if (Level == SplitLevel) {
+        assert(SplitIter != NULL);
+        return SplitIter;
+      }
+      break;
+    }
+    case Subscript::ZIV:
+    case Subscript::RDIV:
+    case Subscript::MIV:
+      break;
+    default:
+      llvm_unreachable("subscript has unexpected classification");
+    }
+  }
+
+  if (Coupled.count()) {
+    // test coupled subscript groups
+    SmallVector<Constraint, 4> Constraints(MaxLevels + 1);
+    for (unsigned II = 0; II <= MaxLevels; ++II)
+      Constraints[II].setAny(SE);
+    for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) {
+      SmallBitVector Group(Pair[SI].Group);
+      SmallBitVector Sivs(Pairs);
+      SmallBitVector Mivs(Pairs);
+      SmallBitVector ConstrainedLevels(MaxLevels + 1);
+      for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) {
+        if (Pair[SJ].Classification == Subscript::SIV)
+          Sivs.set(SJ);
+        else
+          Mivs.set(SJ);
+      }
+      while (Sivs.any()) {
+        bool Changed = false;
+        for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) {
+          // SJ is an SIV subscript that's part of the current coupled group
+          unsigned Level;
+          const SCEV *SplitIter = NULL;
+          (void) testSIV(Pair[SJ].Src, Pair[SJ].Dst, Level,
+                         Result, NewConstraint, SplitIter);
+          if (Level == SplitLevel && SplitIter)
+            return SplitIter;
+          ConstrainedLevels.set(Level);
+          if (intersectConstraints(&Constraints[Level], &NewConstraint))
+            Changed = true;
+          Sivs.reset(SJ);
+        }
+        if (Changed) {
+          // propagate, possibly creating new SIVs and ZIVs
+          for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+            // SJ is an MIV subscript that's part of the current coupled group
+            if (propagate(Pair[SJ].Src, Pair[SJ].Dst,
+                          Pair[SJ].Loops, Constraints, Result.Consistent)) {
+              Pair[SJ].Classification =
+                classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()),
+                             Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()),
+                             Pair[SJ].Loops);
+              switch (Pair[SJ].Classification) {
+              case Subscript::ZIV:
+                Mivs.reset(SJ);
+                break;
+              case Subscript::SIV:
+                Sivs.set(SJ);
+                Mivs.reset(SJ);
+                break;
+              case Subscript::RDIV:
+              case Subscript::MIV:
+                break;
+              default:
+                llvm_unreachable("bad subscript classification");
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  llvm_unreachable("somehow reached end of routine");
+  return NULL;
+}
diff --git a/contrib/llvm/lib/Analysis/DomPrinter.cpp b/contrib/llvm/lib/Analysis/DomPrinter.cpp
new file mode 100644
index 000000000000..cde431459d50
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/DomPrinter.cpp
@@ -0,0 +1,232 @@
+//===- DomPrinter.cpp - DOT printer for the dominance trees    ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines '-dot-dom' and '-dot-postdom' analysis passes, which emit
+// a dom.<fnname>.dot or postdom.<fnname>.dot file for each function in the
+// program, with a graph of the dominance/postdominance tree of that
+// function.
+//
+// There are also passes available to directly call dotty ('-view-dom' or
+// '-view-postdom'). By appending '-only' like '-dot-dom-only' only the
+// names of the bbs are printed, but the content is hidden.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/DomPrinter.h"
+#include "llvm/Analysis/DOTGraphTraitsPass.h"
+#include "llvm/Analysis/PostDominators.h"
+
+using namespace llvm;
+
+namespace llvm {
+template<>
+struct DOTGraphTraits<DomTreeNode*> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits (bool isSimple=false)
+    : DefaultDOTGraphTraits(isSimple) {}
+
+  std::string getNodeLabel(DomTreeNode *Node, DomTreeNode *Graph) {
+
+    BasicBlock *BB = Node->getBlock();
+
+    if (!BB)
+      return "Post dominance root node";
+
+
+    if (isSimple())
+      return DOTGraphTraits<const Function*>
+        ::getSimpleNodeLabel(BB, BB->getParent());
+    else
+      return DOTGraphTraits<const Function*>
+        ::getCompleteNodeLabel(BB, BB->getParent());
+  }
+};
+
+template<>
+struct DOTGraphTraits<DominatorTree*> : public DOTGraphTraits<DomTreeNode*> {
+
+  DOTGraphTraits (bool isSimple=false)
+    : DOTGraphTraits<DomTreeNode*>(isSimple) {}
+
+  static std::string getGraphName(DominatorTree *DT) {
+    return "Dominator tree";
+  }
+
+  std::string getNodeLabel(DomTreeNode *Node, DominatorTree *G) {
+    return DOTGraphTraits<DomTreeNode*>::getNodeLabel(Node, G->getRootNode());
+  }
+};
+
+template<>
+struct DOTGraphTraits<PostDominatorTree*>
+  : public DOTGraphTraits<DomTreeNode*> {
+
+  DOTGraphTraits (bool isSimple=false)
+    : DOTGraphTraits<DomTreeNode*>(isSimple) {}
+
+  static std::string getGraphName(PostDominatorTree *DT) {
+    return "Post dominator tree";
+  }
+
+  std::string getNodeLabel(DomTreeNode *Node, PostDominatorTree *G ) {
+    return DOTGraphTraits<DomTreeNode*>::getNodeLabel(Node, G->getRootNode());
+  }
+};
+}
+
+namespace {
+struct DomViewer
+  : public DOTGraphTraitsViewer<DominatorTree, false> {
+  static char ID;
+  DomViewer() : DOTGraphTraitsViewer<DominatorTree, false>("dom", ID){
+    initializeDomViewerPass(*PassRegistry::getPassRegistry());
+  }
+};
+
+struct DomOnlyViewer
+  : public DOTGraphTraitsViewer<DominatorTree, true> {
+  static char ID;
+  DomOnlyViewer() : DOTGraphTraitsViewer<DominatorTree, true>("domonly", ID){
+    initializeDomOnlyViewerPass(*PassRegistry::getPassRegistry());
+  }
+};
+
+struct PostDomViewer
+  : public DOTGraphTraitsViewer<PostDominatorTree, false> {
+  static char ID;
+  PostDomViewer() :
+    DOTGraphTraitsViewer<PostDominatorTree, false>("postdom", ID){
+      initializePostDomViewerPass(*PassRegistry::getPassRegistry());
+    }
+};
+
+struct PostDomOnlyViewer
+  : public DOTGraphTraitsViewer<PostDominatorTree, true> {
+  static char ID;
+  PostDomOnlyViewer() :
+    DOTGraphTraitsViewer<PostDominatorTree, true>("postdomonly", ID){
+      initializePostDomOnlyViewerPass(*PassRegistry::getPassRegistry());
+    }
+};
+} // end anonymous namespace
+
+char DomViewer::ID = 0;
+INITIALIZE_PASS(DomViewer, "view-dom",
+                "View dominance tree of function", false, false)
+
+char DomOnlyViewer::ID = 0;
+INITIALIZE_PASS(DomOnlyViewer, "view-dom-only",
+                "View dominance tree of function (with no function bodies)",
+                false, false)
+
+char PostDomViewer::ID = 0;
+INITIALIZE_PASS(PostDomViewer, "view-postdom",
+                "View postdominance tree of function", false, false)
+
+char PostDomOnlyViewer::ID = 0;
+INITIALIZE_PASS(PostDomOnlyViewer, "view-postdom-only",
+                "View postdominance tree of function "
+                "(with no function bodies)",
+                false, false)
+
+namespace {
+struct DomPrinter
+  : public DOTGraphTraitsPrinter<DominatorTree, false> {
+  static char ID;
+  DomPrinter() : DOTGraphTraitsPrinter<DominatorTree, false>("dom", ID) {
+    initializeDomPrinterPass(*PassRegistry::getPassRegistry());
+  }
+};
+
+struct DomOnlyPrinter
+  : public DOTGraphTraitsPrinter<DominatorTree, true> {
+  static char ID;
+  DomOnlyPrinter() : DOTGraphTraitsPrinter<DominatorTree, true>("domonly", ID) {
+    initializeDomOnlyPrinterPass(*PassRegistry::getPassRegistry());
+  }
+};
+
+struct PostDomPrinter
+  : public DOTGraphTraitsPrinter<PostDominatorTree, false> {
+  static char ID;
+  PostDomPrinter() :
+    DOTGraphTraitsPrinter<PostDominatorTree, false>("postdom", ID) {
+      initializePostDomPrinterPass(*PassRegistry::getPassRegistry());
+    }
+};
+
+struct PostDomOnlyPrinter
+  : public DOTGraphTraitsPrinter<PostDominatorTree, true> {
+  static char ID;
+  PostDomOnlyPrinter() :
+    DOTGraphTraitsPrinter<PostDominatorTree, true>("postdomonly", ID) {
+      initializePostDomOnlyPrinterPass(*PassRegistry::getPassRegistry());
+    }
+};
+} // end anonymous namespace
+
+
+
+char DomPrinter::ID = 0;
+INITIALIZE_PASS(DomPrinter, "dot-dom",
+                "Print dominance tree of function to 'dot' file",
+                false, false)
+
+char DomOnlyPrinter::ID = 0;
+INITIALIZE_PASS(DomOnlyPrinter, "dot-dom-only",
+                "Print dominance tree of function to 'dot' file "
+                "(with no function bodies)",
+                false, false)
+
+char PostDomPrinter::ID = 0;
+INITIALIZE_PASS(PostDomPrinter, "dot-postdom",
+                "Print postdominance tree of function to 'dot' file",
+                false, false)
+
+char PostDomOnlyPrinter::ID = 0;
+INITIALIZE_PASS(PostDomOnlyPrinter, "dot-postdom-only",
+                "Print postdominance tree of function to 'dot' file "
+                "(with no function bodies)",
+                false, false)
+
+// Create methods available outside of this file, to use them
+// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
+// the link time optimization.
+
+FunctionPass *llvm::createDomPrinterPass() {
+  return new DomPrinter();
+}
+
+FunctionPass *llvm::createDomOnlyPrinterPass() {
+  return new DomOnlyPrinter();
+}
+
+FunctionPass *llvm::createDomViewerPass() {
+  return new DomViewer();
+}
+
+FunctionPass *llvm::createDomOnlyViewerPass() {
+  return new DomOnlyViewer();
+}
+
+FunctionPass *llvm::createPostDomPrinterPass() {
+  return new PostDomPrinter();
+}
+
+FunctionPass *llvm::createPostDomOnlyPrinterPass() {
+  return new PostDomOnlyPrinter();
+}
+
+FunctionPass *llvm::createPostDomViewerPass() {
+  return new PostDomViewer();
+}
+
+FunctionPass *llvm::createPostDomOnlyViewerPass() {
+  return new PostDomOnlyViewer();
+}
diff --git a/contrib/llvm/lib/Analysis/DominanceFrontier.cpp b/contrib/llvm/lib/Analysis/DominanceFrontier.cpp
new file mode 100644
index 000000000000..7e4a89f1bd57
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/DominanceFrontier.cpp
@@ -0,0 +1,141 @@
+//===- DominanceFrontier.cpp - Dominance Frontier Calculation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+char DominanceFrontier::ID = 0;
+INITIALIZE_PASS_BEGIN(DominanceFrontier, "domfrontier",
+                "Dominance Frontier Construction", true, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(DominanceFrontier, "domfrontier",
+                "Dominance Frontier Construction", true, true)
+
+namespace {
+  class DFCalculateWorkObject {
+  public:
+    DFCalculateWorkObject(BasicBlock *B, BasicBlock *P, 
+                          const DomTreeNode *N,
+                          const DomTreeNode *PN)
+    : currentBB(B), parentBB(P), Node(N), parentNode(PN) {}
+    BasicBlock *currentBB;
+    BasicBlock *parentBB;
+    const DomTreeNode *Node;
+    const DomTreeNode *parentNode;
+  };
+}
+
+void DominanceFrontier::anchor() { }
+
+const DominanceFrontier::DomSetType &
+DominanceFrontier::calculate(const DominatorTree &DT,
+                             const DomTreeNode *Node) {
+  BasicBlock *BB = Node->getBlock();
+  DomSetType *Result = NULL;
+
+  std::vector<DFCalculateWorkObject> workList;
+  SmallPtrSet<BasicBlock *, 32> visited;
+
+  workList.push_back(DFCalculateWorkObject(BB, NULL, Node, NULL));
+  do {
+    DFCalculateWorkObject *currentW = &workList.back();
+    assert (currentW && "Missing work object.");
+
+    BasicBlock *currentBB = currentW->currentBB;
+    BasicBlock *parentBB = currentW->parentBB;
+    const DomTreeNode *currentNode = currentW->Node;
+    const DomTreeNode *parentNode = currentW->parentNode;
+    assert (currentBB && "Invalid work object. Missing current Basic Block");
+    assert (currentNode && "Invalid work object. Missing current Node");
+    DomSetType &S = Frontiers[currentBB];
+
+    // Visit each block only once.
+    if (visited.count(currentBB) == 0) {
+      visited.insert(currentBB);
+
+      // Loop over CFG successors to calculate DFlocal[currentNode]
+      for (succ_iterator SI = succ_begin(currentBB), SE = succ_end(currentBB);
+           SI != SE; ++SI) {
+        // Does Node immediately dominate this successor?
+        if (DT[*SI]->getIDom() != currentNode)
+          S.insert(*SI);
+      }
+    }
+
+    // At this point, S is DFlocal.  Now we union in DFup's of our children...
+    // Loop through and visit the nodes that Node immediately dominates (Node's
+    // children in the IDomTree)
+    bool visitChild = false;
+    for (DomTreeNode::const_iterator NI = currentNode->begin(), 
+           NE = currentNode->end(); NI != NE; ++NI) {
+      DomTreeNode *IDominee = *NI;
+      BasicBlock *childBB = IDominee->getBlock();
+      if (visited.count(childBB) == 0) {
+        workList.push_back(DFCalculateWorkObject(childBB, currentBB,
+                                                 IDominee, currentNode));
+        visitChild = true;
+      }
+    }
+
+    // If all children are visited or there is any child then pop this block
+    // from the workList.
+    if (!visitChild) {
+
+      if (!parentBB) {
+        Result = &S;
+        break;
+      }
+
+      DomSetType::const_iterator CDFI = S.begin(), CDFE = S.end();
+      DomSetType &parentSet = Frontiers[parentBB];
+      for (; CDFI != CDFE; ++CDFI) {
+        if (!DT.properlyDominates(parentNode, DT[*CDFI]))
+          parentSet.insert(*CDFI);
+      }
+      workList.pop_back();
+    }
+
+  } while (!workList.empty());
+
+  return *Result;
+}
+
+void DominanceFrontierBase::print(raw_ostream &OS, const Module* ) const {
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    OS << "  DomFrontier for BB ";
+    if (I->first)
+      WriteAsOperand(OS, I->first, false);
+    else
+      OS << " <<exit node>>";
+    OS << " is:\t";
+    
+    const std::set<BasicBlock*> &BBs = I->second;
+    
+    for (std::set<BasicBlock*>::const_iterator I = BBs.begin(), E = BBs.end();
+         I != E; ++I) {
+      OS << ' ';
+      if (*I)
+        WriteAsOperand(OS, *I, false);
+      else
+        OS << "<<exit node>>";
+    }
+    OS << "\n";
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void DominanceFrontierBase::dump() const {
+  print(dbgs());
+}
+#endif
+
diff --git a/contrib/llvm/lib/Analysis/IPA/CallGraph.cpp b/contrib/llvm/lib/Analysis/IPA/CallGraph.cpp
new file mode 100644
index 000000000000..7620fd9842cc
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/IPA/CallGraph.cpp
@@ -0,0 +1,338 @@
+//===- CallGraph.cpp - Build a Module's call graph ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CallGraph class and provides the BasicCallGraph
+// default implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// BasicCallGraph class definition
+//
+class BasicCallGraph : public ModulePass, public CallGraph {
+  // Root is root of the call graph, or the external node if a 'main' function
+  // couldn't be found.
+  //
+  CallGraphNode *Root;
+
+  // ExternalCallingNode - This node has edges to all external functions and
+  // those internal functions that have their address taken.
+  CallGraphNode *ExternalCallingNode;
+
+  // CallsExternalNode - This node has edges to it from all functions making
+  // indirect calls or calling an external function.
+  CallGraphNode *CallsExternalNode;
+
+public:
+  static char ID; // Class identification, replacement for typeinfo
+  BasicCallGraph() : ModulePass(ID), Root(0), 
+    ExternalCallingNode(0), CallsExternalNode(0) {
+      initializeBasicCallGraphPass(*PassRegistry::getPassRegistry());
+    }
+
+  // runOnModule - Compute the call graph for the specified module.
+  virtual bool runOnModule(Module &M) {
+    CallGraph::initialize(M);
+    
+    ExternalCallingNode = getOrInsertFunction(0);
+    CallsExternalNode = new CallGraphNode(0);
+    Root = 0;
+  
+    // Add every function to the call graph.
+    for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+      addToCallGraph(I);
+  
+    // If we didn't find a main function, use the external call graph node
+    if (Root == 0) Root = ExternalCallingNode;
+    
+    return false;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+  }
+
+  virtual void print(raw_ostream &OS, const Module *) const {
+    OS << "CallGraph Root is: ";
+    if (Function *F = getRoot()->getFunction())
+      OS << F->getName() << "\n";
+    else {
+      OS << "<<null function: 0x" << getRoot() << ">>\n";
+    }
+    
+    CallGraph::print(OS, 0);
+  }
+
+  virtual void releaseMemory() {
+    destroy();
+  }
+  
+  /// getAdjustedAnalysisPointer - This method is used when a pass implements
+  /// an analysis interface through multiple inheritance.  If needed, it should
+  /// override this to adjust the this pointer as needed for the specified pass
+  /// info.
+  virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+    if (PI == &CallGraph::ID)
+      return (CallGraph*)this;
+    return this;
+  }
+  
+  CallGraphNode* getExternalCallingNode() const { return ExternalCallingNode; }
+  CallGraphNode* getCallsExternalNode()   const { return CallsExternalNode; }
+
+  // getRoot - Return the root of the call graph, which is either main, or if
+  // main cannot be found, the external node.
+  //
+  CallGraphNode *getRoot()             { return Root; }
+  const CallGraphNode *getRoot() const { return Root; }
+
+private:
+  //===---------------------------------------------------------------------
+  // Implementation of CallGraph construction
+  //
+
+  // addToCallGraph - Add a function to the call graph, and link the node to all
+  // of the functions that it calls.
+  //
+  void addToCallGraph(Function *F) {
+    CallGraphNode *Node = getOrInsertFunction(F);
+
+    // If this function has external linkage, anything could call it.
+    if (!F->hasLocalLinkage()) {
+      ExternalCallingNode->addCalledFunction(CallSite(), Node);
+
+      // Found the entry point?
+      if (F->getName() == "main") {
+        if (Root)    // Found multiple external mains?  Don't pick one.
+          Root = ExternalCallingNode;
+        else
+          Root = Node;          // Found a main, keep track of it!
+      }
+    }
+
+    // If this function has its address taken, anything could call it.
+    if (F->hasAddressTaken())
+      ExternalCallingNode->addCalledFunction(CallSite(), Node);
+
+    // If this function is not defined in this translation unit, it could call
+    // anything.
+    if (F->isDeclaration() && !F->isIntrinsic())
+      Node->addCalledFunction(CallSite(), CallsExternalNode);
+
+    // Look for calls by this function.
+    for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB)
+      for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
+           II != IE; ++II) {
+        CallSite CS(cast<Value>(II));
+        if (CS) {
+          const Function *Callee = CS.getCalledFunction();
+          if (!Callee)
+            // Indirect calls of intrinsics are not allowed so no need to check.
+            Node->addCalledFunction(CS, CallsExternalNode);
+          else if (!Callee->isIntrinsic())
+            Node->addCalledFunction(CS, getOrInsertFunction(Callee));
+        }
+      }
+  }
+
+  //
+  // destroy - Release memory for the call graph
+  virtual void destroy() {
+    /// CallsExternalNode is not in the function map, delete it explicitly.
+    if (CallsExternalNode) {
+      CallsExternalNode->allReferencesDropped();
+      delete CallsExternalNode;
+      CallsExternalNode = 0;
+    }
+    CallGraph::destroy();
+  }
+};
+
+} //End anonymous namespace
+
+INITIALIZE_ANALYSIS_GROUP(CallGraph, "Call Graph", BasicCallGraph)
+INITIALIZE_AG_PASS(BasicCallGraph, CallGraph, "basiccg",
+                   "Basic CallGraph Construction", false, true, true)
+
+char CallGraph::ID = 0;
+char BasicCallGraph::ID = 0;
+
+void CallGraph::initialize(Module &M) {
+  Mod = &M;
+}
+
+void CallGraph::destroy() {
+  if (FunctionMap.empty()) return;
+  
+  // Reset all node's use counts to zero before deleting them to prevent an
+  // assertion from firing.
+#ifndef NDEBUG
+  for (FunctionMapTy::iterator I = FunctionMap.begin(), E = FunctionMap.end();
+       I != E; ++I)
+    I->second->allReferencesDropped();
+#endif
+  
+  for (FunctionMapTy::iterator I = FunctionMap.begin(), E = FunctionMap.end();
+      I != E; ++I)
+    delete I->second;
+  FunctionMap.clear();
+}
+
+void CallGraph::print(raw_ostream &OS, Module*) const {
+  for (CallGraph::const_iterator I = begin(), E = end(); I != E; ++I)
+    I->second->print(OS);
+}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void CallGraph::dump() const {
+  print(dbgs(), 0);
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// Implementations of public modification methods
+//
+
+// removeFunctionFromModule - Unlink the function from this module, returning
+// it.  Because this removes the function from the module, the call graph node
+// is destroyed.  This is only valid if the function does not call any other
+// functions (ie, there are no edges in it's CGN).  The easiest way to do this
+// is to dropAllReferences before calling this.
+//
+Function *CallGraph::removeFunctionFromModule(CallGraphNode *CGN) {
+  assert(CGN->empty() && "Cannot remove function from call "
+         "graph if it references other functions!");
+  Function *F = CGN->getFunction(); // Get the function for the call graph node
+  delete CGN;                       // Delete the call graph node for this func
+  FunctionMap.erase(F);             // Remove the call graph node from the map
+
+  Mod->getFunctionList().remove(F);
+  return F;
+}
+
+/// spliceFunction - Replace the function represented by this node by another.
+/// This does not rescan the body of the function, so it is suitable when
+/// splicing the body of the old function to the new while also updating all
+/// callers from old to new.
+///
+void CallGraph::spliceFunction(const Function *From, const Function *To) {
+  assert(FunctionMap.count(From) && "No CallGraphNode for function!");
+  assert(!FunctionMap.count(To) &&
+         "Pointing CallGraphNode at a function that already exists");
+  FunctionMapTy::iterator I = FunctionMap.find(From);
+  I->second->F = const_cast<Function*>(To);
+  FunctionMap[To] = I->second;
+  FunctionMap.erase(I);
+}
+
+// getOrInsertFunction - This method is identical to calling operator[], but
+// it will insert a new CallGraphNode for the specified function if one does
+// not already exist.
+CallGraphNode *CallGraph::getOrInsertFunction(const Function *F) {
+  CallGraphNode *&CGN = FunctionMap[F];
+  if (CGN) return CGN;
+  
+  assert((!F || F->getParent() == Mod) && "Function not in current module!");
+  return CGN = new CallGraphNode(const_cast<Function*>(F));
+}
+
+void CallGraphNode::print(raw_ostream &OS) const {
+  if (Function *F = getFunction())
+    OS << "Call graph node for function: '" << F->getName() << "'";
+  else
+    OS << "Call graph node <<null function>>";
+  
+  OS << "<<" << this << ">>  #uses=" << getNumReferences() << '\n';
+
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    OS << "  CS<" << I->first << "> calls ";
+    if (Function *FI = I->second->getFunction())
+      OS << "function '" << FI->getName() <<"'\n";
+    else
+      OS << "external node\n";
+  }
+  OS << '\n';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void CallGraphNode::dump() const { print(dbgs()); }
+#endif
+
+/// removeCallEdgeFor - This method removes the edge in the node for the
+/// specified call site.  Note that this method takes linear time, so it
+/// should be used sparingly.
+void CallGraphNode::removeCallEdgeFor(CallSite CS) {
+  for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) {
+    assert(I != CalledFunctions.end() && "Cannot find callsite to remove!");
+    if (I->first == CS.getInstruction()) {
+      I->second->DropRef();
+      *I = CalledFunctions.back();
+      CalledFunctions.pop_back();
+      return;
+    }
+  }
+}
+
+// removeAnyCallEdgeTo - This method removes any call edges from this node to
+// the specified callee function.  This takes more time to execute than
+// removeCallEdgeTo, so it should not be used unless necessary.
+void CallGraphNode::removeAnyCallEdgeTo(CallGraphNode *Callee) {
+  for (unsigned i = 0, e = CalledFunctions.size(); i != e; ++i)
+    if (CalledFunctions[i].second == Callee) {
+      Callee->DropRef();
+      CalledFunctions[i] = CalledFunctions.back();
+      CalledFunctions.pop_back();
+      --i; --e;
+    }
+}
+
+/// removeOneAbstractEdgeTo - Remove one edge associated with a null callsite
+/// from this node to the specified callee function.
+void CallGraphNode::removeOneAbstractEdgeTo(CallGraphNode *Callee) {
+  for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) {
+    assert(I != CalledFunctions.end() && "Cannot find callee to remove!");
+    CallRecord &CR = *I;
+    if (CR.second == Callee && CR.first == 0) {
+      Callee->DropRef();
+      *I = CalledFunctions.back();
+      CalledFunctions.pop_back();
+      return;
+    }
+  }
+}
+
+/// replaceCallEdge - This method replaces the edge in the node for the
+/// specified call site with a new one.  Note that this method takes linear
+/// time, so it should be used sparingly.
+void CallGraphNode::replaceCallEdge(CallSite CS,
+                                    CallSite NewCS, CallGraphNode *NewNode){
+  for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) {
+    assert(I != CalledFunctions.end() && "Cannot find callsite to remove!");
+    if (I->first == CS.getInstruction()) {
+      I->second->DropRef();
+      I->first = NewCS.getInstruction();
+      I->second = NewNode;
+      NewNode->AddRef();
+      return;
+    }
+  }
+}
+
+// Enuse that users of CallGraph.h also link with this file
+DEFINING_FILE_FOR(CallGraph)
diff --git a/contrib/llvm/lib/Analysis/IPA/CallGraphSCCPass.cpp b/contrib/llvm/lib/Analysis/IPA/CallGraphSCCPass.cpp
new file mode 100644
index 000000000000..a0d788f34a3c
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -0,0 +1,614 @@
+//===- CallGraphSCCPass.cpp - Pass that operates BU on call graph ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CallGraphSCCPass class, which is used for passes
+// which are implemented as bottom-up traversals on the call graph.  Because
+// there may be cycles in the call graph, passes of this type operate on the
+// call-graph in SCC order: that is, they process function bottom-up, except for
+// recursive functions, which they process all at once.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "cgscc-passmgr"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/PassManagers.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static cl::opt<unsigned> 
+MaxIterations("max-cg-scc-iterations", cl::ReallyHidden, cl::init(4));
+
+STATISTIC(MaxSCCIterations, "Maximum CGSCCPassMgr iterations on one SCC");
+
+//===----------------------------------------------------------------------===//
+// CGPassManager
+//
+/// CGPassManager manages FPPassManagers and CallGraphSCCPasses.
+
+namespace {
+
+class CGPassManager : public ModulePass, public PMDataManager {
+public:
+  static char ID;
+  explicit CGPassManager() 
+    : ModulePass(ID), PMDataManager() { }
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool runOnModule(Module &M);
+
+  using ModulePass::doInitialization;
+  using ModulePass::doFinalization;
+
+  bool doInitialization(CallGraph &CG);
+  bool doFinalization(CallGraph &CG);
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    // CGPassManager walks SCC and it needs CallGraph.
+    Info.addRequired<CallGraph>();
+    Info.setPreservesAll();
+  }
+
+  virtual const char *getPassName() const {
+    return "CallGraph Pass Manager";
+  }
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+
+  // Print passes managed by this manager
+  void dumpPassStructure(unsigned Offset) {
+    errs().indent(Offset*2) << "Call Graph SCC Pass Manager\n";
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      Pass *P = getContainedPass(Index);
+      P->dumpPassStructure(Offset + 1);
+      dumpLastUses(P, Offset+1);
+    }
+  }
+
+  Pass *getContainedPass(unsigned N) {
+    assert(N < PassVector.size() && "Pass number out of range!");
+    return static_cast<Pass *>(PassVector[N]);
+  }
+
+  virtual PassManagerType getPassManagerType() const { 
+    return PMT_CallGraphPassManager; 
+  }
+  
+private:
+  bool RunAllPassesOnSCC(CallGraphSCC &CurSCC, CallGraph &CG,
+                         bool &DevirtualizedCall);
+  
+  bool RunPassOnSCC(Pass *P, CallGraphSCC &CurSCC,
+                    CallGraph &CG, bool &CallGraphUpToDate,
+                    bool &DevirtualizedCall);
+  bool RefreshCallGraph(CallGraphSCC &CurSCC, CallGraph &CG,
+                        bool IsCheckingMode);
+};
+
+} // end anonymous namespace.
+
+char CGPassManager::ID = 0;
+
+
+bool CGPassManager::RunPassOnSCC(Pass *P, CallGraphSCC &CurSCC,
+                                 CallGraph &CG, bool &CallGraphUpToDate,
+                                 bool &DevirtualizedCall) {
+  bool Changed = false;
+  PMDataManager *PM = P->getAsPMDataManager();
+
+  if (PM == 0) {
+    CallGraphSCCPass *CGSP = (CallGraphSCCPass*)P;
+    if (!CallGraphUpToDate) {
+      DevirtualizedCall |= RefreshCallGraph(CurSCC, CG, false);
+      CallGraphUpToDate = true;
+    }
+
+    {
+      TimeRegion PassTimer(getPassTimer(CGSP));
+      Changed = CGSP->runOnSCC(CurSCC);
+    }
+    
+    // After the CGSCCPass is done, when assertions are enabled, use
+    // RefreshCallGraph to verify that the callgraph was correctly updated.
+#ifndef NDEBUG
+    if (Changed)
+      RefreshCallGraph(CurSCC, CG, true);
+#endif
+    
+    return Changed;
+  }
+  
+  
+  assert(PM->getPassManagerType() == PMT_FunctionPassManager &&
+         "Invalid CGPassManager member");
+  FPPassManager *FPP = (FPPassManager*)P;
+  
+  // Run pass P on all functions in the current SCC.
+  for (CallGraphSCC::iterator I = CurSCC.begin(), E = CurSCC.end();
+       I != E; ++I) {
+    if (Function *F = (*I)->getFunction()) {
+      dumpPassInfo(P, EXECUTION_MSG, ON_FUNCTION_MSG, F->getName());
+      TimeRegion PassTimer(getPassTimer(FPP));
+      Changed |= FPP->runOnFunction(*F);
+    }
+  }
+  
+  // The function pass(es) modified the IR, they may have clobbered the
+  // callgraph.
+  if (Changed && CallGraphUpToDate) {
+    DEBUG(dbgs() << "CGSCCPASSMGR: Pass Dirtied SCC: "
+                 << P->getPassName() << '\n');
+    CallGraphUpToDate = false;
+  }
+  return Changed;
+}
+
+
+/// RefreshCallGraph - Scan the functions in the specified CFG and resync the
+/// callgraph with the call sites found in it.  This is used after
+/// FunctionPasses have potentially munged the callgraph, and can be used after
+/// CallGraphSCC passes to verify that they correctly updated the callgraph.
+///
+/// This function returns true if it devirtualized an existing function call,
+/// meaning it turned an indirect call into a direct call.  This happens when
+/// a function pass like GVN optimizes away stuff feeding the indirect call.
+/// This never happens in checking mode.
+///
+bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC,
+                                     CallGraph &CG, bool CheckingMode) {
+  DenseMap<Value*, CallGraphNode*> CallSites;
+  
+  DEBUG(dbgs() << "CGSCCPASSMGR: Refreshing SCC with " << CurSCC.size()
+               << " nodes:\n";
+        for (CallGraphSCC::iterator I = CurSCC.begin(), E = CurSCC.end();
+             I != E; ++I)
+          (*I)->dump();
+        );
+
+  bool MadeChange = false;
+  bool DevirtualizedCall = false;
+  
+  // Scan all functions in the SCC.
+  unsigned FunctionNo = 0;
+  for (CallGraphSCC::iterator SCCIdx = CurSCC.begin(), E = CurSCC.end();
+       SCCIdx != E; ++SCCIdx, ++FunctionNo) {
+    CallGraphNode *CGN = *SCCIdx;
+    Function *F = CGN->getFunction();
+    if (F == 0 || F->isDeclaration()) continue;
+    
+    // Walk the function body looking for call sites.  Sync up the call sites in
+    // CGN with those actually in the function.
+
+    // Keep track of the number of direct and indirect calls that were
+    // invalidated and removed.
+    unsigned NumDirectRemoved = 0, NumIndirectRemoved = 0;
+    
+    // Get the set of call sites currently in the function.
+    for (CallGraphNode::iterator I = CGN->begin(), E = CGN->end(); I != E; ) {
+      // If this call site is null, then the function pass deleted the call
+      // entirely and the WeakVH nulled it out.  
+      if (I->first == 0 ||
+          // If we've already seen this call site, then the FunctionPass RAUW'd
+          // one call with another, which resulted in two "uses" in the edge
+          // list of the same call.
+          CallSites.count(I->first) ||
+
+          // If the call edge is not from a call or invoke, then the function
+          // pass RAUW'd a call with another value.  This can happen when
+          // constant folding happens of well known functions etc.
+          !CallSite(I->first)) {
+        assert(!CheckingMode &&
+               "CallGraphSCCPass did not update the CallGraph correctly!");
+        
+        // If this was an indirect call site, count it.
+        if (I->second->getFunction() == 0)
+          ++NumIndirectRemoved;
+        else 
+          ++NumDirectRemoved;
+        
+        // Just remove the edge from the set of callees, keep track of whether
+        // I points to the last element of the vector.
+        bool WasLast = I + 1 == E;
+        CGN->removeCallEdge(I);
+        
+        // If I pointed to the last element of the vector, we have to bail out:
+        // iterator checking rejects comparisons of the resultant pointer with
+        // end.
+        if (WasLast)
+          break;
+        E = CGN->end();
+        continue;
+      }
+      
+      assert(!CallSites.count(I->first) &&
+             "Call site occurs in node multiple times");
+      CallSites.insert(std::make_pair(I->first, I->second));
+      ++I;
+    }
+    
+    // Loop over all of the instructions in the function, getting the callsites.
+    // Keep track of the number of direct/indirect calls added.
+    unsigned NumDirectAdded = 0, NumIndirectAdded = 0;
+    
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+        CallSite CS(cast<Value>(I));
+        if (!CS) continue;
+        Function *Callee = CS.getCalledFunction();
+        if (Callee && Callee->isIntrinsic()) continue;
+        
+        // If this call site already existed in the callgraph, just verify it
+        // matches up to expectations and remove it from CallSites.
+        DenseMap<Value*, CallGraphNode*>::iterator ExistingIt =
+          CallSites.find(CS.getInstruction());
+        if (ExistingIt != CallSites.end()) {
+          CallGraphNode *ExistingNode = ExistingIt->second;
+
+          // Remove from CallSites since we have now seen it.
+          CallSites.erase(ExistingIt);
+          
+          // Verify that the callee is right.
+          if (ExistingNode->getFunction() == CS.getCalledFunction())
+            continue;
+          
+          // If we are in checking mode, we are not allowed to actually mutate
+          // the callgraph.  If this is a case where we can infer that the
+          // callgraph is less precise than it could be (e.g. an indirect call
+          // site could be turned direct), don't reject it in checking mode, and
+          // don't tweak it to be more precise.
+          if (CheckingMode && CS.getCalledFunction() &&
+              ExistingNode->getFunction() == 0)
+            continue;
+          
+          assert(!CheckingMode &&
+                 "CallGraphSCCPass did not update the CallGraph correctly!");
+          
+          // If not, we either went from a direct call to indirect, indirect to
+          // direct, or direct to different direct.
+          CallGraphNode *CalleeNode;
+          if (Function *Callee = CS.getCalledFunction()) {
+            CalleeNode = CG.getOrInsertFunction(Callee);
+            // Keep track of whether we turned an indirect call into a direct
+            // one.
+            if (ExistingNode->getFunction() == 0) {
+              DevirtualizedCall = true;
+              DEBUG(dbgs() << "  CGSCCPASSMGR: Devirtualized call to '"
+                           << Callee->getName() << "'\n");
+            }
+          } else {
+            CalleeNode = CG.getCallsExternalNode();
+          }
+
+          // Update the edge target in CGN.
+          CGN->replaceCallEdge(CS, CS, CalleeNode);
+          MadeChange = true;
+          continue;
+        }
+        
+        assert(!CheckingMode &&
+               "CallGraphSCCPass did not update the CallGraph correctly!");
+
+        // If the call site didn't exist in the CGN yet, add it.
+        CallGraphNode *CalleeNode;
+        if (Function *Callee = CS.getCalledFunction()) {
+          CalleeNode = CG.getOrInsertFunction(Callee);
+          ++NumDirectAdded;
+        } else {
+          CalleeNode = CG.getCallsExternalNode();
+          ++NumIndirectAdded;
+        }
+        
+        CGN->addCalledFunction(CS, CalleeNode);
+        MadeChange = true;
+      }
+    
+    // We scanned the old callgraph node, removing invalidated call sites and
+    // then added back newly found call sites.  One thing that can happen is
+    // that an old indirect call site was deleted and replaced with a new direct
+    // call.  In this case, we have devirtualized a call, and CGSCCPM would like
+    // to iteratively optimize the new code.  Unfortunately, we don't really
+    // have a great way to detect when this happens.  As an approximation, we
+    // just look at whether the number of indirect calls is reduced and the
+    // number of direct calls is increased.  There are tons of ways to fool this
+    // (e.g. DCE'ing an indirect call and duplicating an unrelated block with a
+    // direct call) but this is close enough.
+    if (NumIndirectRemoved > NumIndirectAdded &&
+        NumDirectRemoved < NumDirectAdded)
+      DevirtualizedCall = true;
+    
+    // After scanning this function, if we still have entries in callsites, then
+    // they are dangling pointers.  WeakVH should save us for this, so abort if
+    // this happens.
+    assert(CallSites.empty() && "Dangling pointers found in call sites map");
+    
+    // Periodically do an explicit clear to remove tombstones when processing
+    // large scc's.
+    if ((FunctionNo & 15) == 15)
+      CallSites.clear();
+  }
+
+  DEBUG(if (MadeChange) {
+          dbgs() << "CGSCCPASSMGR: Refreshed SCC is now:\n";
+          for (CallGraphSCC::iterator I = CurSCC.begin(), E = CurSCC.end();
+            I != E; ++I)
+              (*I)->dump();
+          if (DevirtualizedCall)
+            dbgs() << "CGSCCPASSMGR: Refresh devirtualized a call!\n";
+
+         } else {
+           dbgs() << "CGSCCPASSMGR: SCC Refresh didn't change call graph.\n";
+         }
+        );
+  (void)MadeChange;
+
+  return DevirtualizedCall;
+}
+
+/// RunAllPassesOnSCC -  Execute the body of the entire pass manager on the
+/// specified SCC.  This keeps track of whether a function pass devirtualizes
+/// any calls and returns it in DevirtualizedCall.
+bool CGPassManager::RunAllPassesOnSCC(CallGraphSCC &CurSCC, CallGraph &CG,
+                                      bool &DevirtualizedCall) {
+  bool Changed = false;
+  
+  // CallGraphUpToDate - Keep track of whether the callgraph is known to be
+  // up-to-date or not.  The CGSSC pass manager runs two types of passes:
+  // CallGraphSCC Passes and other random function passes.  Because other
+  // random function passes are not CallGraph aware, they may clobber the
+  // call graph by introducing new calls or deleting other ones.  This flag
+  // is set to false when we run a function pass so that we know to clean up
+  // the callgraph when we need to run a CGSCCPass again.
+  bool CallGraphUpToDate = true;
+
+  // Run all passes on current SCC.
+  for (unsigned PassNo = 0, e = getNumContainedPasses();
+       PassNo != e; ++PassNo) {
+    Pass *P = getContainedPass(PassNo);
+    
+    // If we're in -debug-pass=Executions mode, construct the SCC node list,
+    // otherwise avoid constructing this string as it is expensive.
+    if (isPassDebuggingExecutionsOrMore()) {
+      std::string Functions;
+  #ifndef NDEBUG
+      raw_string_ostream OS(Functions);
+      for (CallGraphSCC::iterator I = CurSCC.begin(), E = CurSCC.end();
+           I != E; ++I) {
+        if (I != CurSCC.begin()) OS << ", ";
+        (*I)->print(OS);
+      }
+      OS.flush();
+  #endif
+      dumpPassInfo(P, EXECUTION_MSG, ON_CG_MSG, Functions);
+    }
+    dumpRequiredSet(P);
+    
+    initializeAnalysisImpl(P);
+    
+    // Actually run this pass on the current SCC.
+    Changed |= RunPassOnSCC(P, CurSCC, CG,
+                            CallGraphUpToDate, DevirtualizedCall);
+    
+    if (Changed)
+      dumpPassInfo(P, MODIFICATION_MSG, ON_CG_MSG, "");
+    dumpPreservedSet(P);
+    
+    verifyPreservedAnalysis(P);      
+    removeNotPreservedAnalysis(P);
+    recordAvailableAnalysis(P);
+    removeDeadPasses(P, "", ON_CG_MSG);
+  }
+  
+  // If the callgraph was left out of date (because the last pass run was a
+  // functionpass), refresh it before we move on to the next SCC.
+  if (!CallGraphUpToDate)
+    DevirtualizedCall |= RefreshCallGraph(CurSCC, CG, false);
+  return Changed;
+}
+
+/// run - Execute all of the passes scheduled for execution.  Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool CGPassManager::runOnModule(Module &M) {
+  CallGraph &CG = getAnalysis<CallGraph>();
+  bool Changed = doInitialization(CG);
+  
+  // Walk the callgraph in bottom-up SCC order.
+  scc_iterator<CallGraph*> CGI = scc_begin(&CG);
+
+  CallGraphSCC CurSCC(&CGI);
+  while (!CGI.isAtEnd()) {
+    // Copy the current SCC and increment past it so that the pass can hack
+    // on the SCC if it wants to without invalidating our iterator.
+    std::vector<CallGraphNode*> &NodeVec = *CGI;
+    CurSCC.initialize(&NodeVec[0], &NodeVec[0]+NodeVec.size());
+    ++CGI;
+    
+    // At the top level, we run all the passes in this pass manager on the
+    // functions in this SCC.  However, we support iterative compilation in the
+    // case where a function pass devirtualizes a call to a function.  For
+    // example, it is very common for a function pass (often GVN or instcombine)
+    // to eliminate the addressing that feeds into a call.  With that improved
+    // information, we would like the call to be an inline candidate, infer
+    // mod-ref information etc.
+    //
+    // Because of this, we allow iteration up to a specified iteration count.
+    // This only happens in the case of a devirtualized call, so we only burn
+    // compile time in the case that we're making progress.  We also have a hard
+    // iteration count limit in case there is crazy code.
+    unsigned Iteration = 0;
+    bool DevirtualizedCall = false;
+    do {
+      DEBUG(if (Iteration)
+              dbgs() << "  SCCPASSMGR: Re-visiting SCC, iteration #"
+                     << Iteration << '\n');
+      DevirtualizedCall = false;
+      Changed |= RunAllPassesOnSCC(CurSCC, CG, DevirtualizedCall);
+    } while (Iteration++ < MaxIterations && DevirtualizedCall);
+    
+    if (DevirtualizedCall)
+      DEBUG(dbgs() << "  CGSCCPASSMGR: Stopped iteration after " << Iteration
+                   << " times, due to -max-cg-scc-iterations\n");
+    
+    if (Iteration > MaxSCCIterations)
+      MaxSCCIterations = Iteration;
+    
+  }
+  Changed |= doFinalization(CG);
+  return Changed;
+}
+
+
+/// Initialize CG
+bool CGPassManager::doInitialization(CallGraph &CG) {
+  bool Changed = false;
+  for (unsigned i = 0, e = getNumContainedPasses(); i != e; ++i) {  
+    if (PMDataManager *PM = getContainedPass(i)->getAsPMDataManager()) {
+      assert(PM->getPassManagerType() == PMT_FunctionPassManager &&
+             "Invalid CGPassManager member");
+      Changed |= ((FPPassManager*)PM)->doInitialization(CG.getModule());
+    } else {
+      Changed |= ((CallGraphSCCPass*)getContainedPass(i))->doInitialization(CG);
+    }
+  }
+  return Changed;
+}
+
+/// Finalize CG
+bool CGPassManager::doFinalization(CallGraph &CG) {
+  bool Changed = false;
+  for (unsigned i = 0, e = getNumContainedPasses(); i != e; ++i) {  
+    if (PMDataManager *PM = getContainedPass(i)->getAsPMDataManager()) {
+      assert(PM->getPassManagerType() == PMT_FunctionPassManager &&
+             "Invalid CGPassManager member");
+      Changed |= ((FPPassManager*)PM)->doFinalization(CG.getModule());
+    } else {
+      Changed |= ((CallGraphSCCPass*)getContainedPass(i))->doFinalization(CG);
+    }
+  }
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// CallGraphSCC Implementation
+//===----------------------------------------------------------------------===//
+
+/// ReplaceNode - This informs the SCC and the pass manager that the specified
+/// Old node has been deleted, and New is to be used in its place.
+void CallGraphSCC::ReplaceNode(CallGraphNode *Old, CallGraphNode *New) {
+  assert(Old != New && "Should not replace node with self");
+  for (unsigned i = 0; ; ++i) {
+    assert(i != Nodes.size() && "Node not in SCC");
+    if (Nodes[i] != Old) continue;
+    Nodes[i] = New;
+    break;
+  }
+  
+  // Update the active scc_iterator so that it doesn't contain dangling
+  // pointers to the old CallGraphNode.
+  scc_iterator<CallGraph*> *CGI = (scc_iterator<CallGraph*>*)Context;
+  CGI->ReplaceNode(Old, New);
+}
+
+
+//===----------------------------------------------------------------------===//
+// CallGraphSCCPass Implementation
+//===----------------------------------------------------------------------===//
+
+/// Assign pass manager to manage this pass.
+void CallGraphSCCPass::assignPassManager(PMStack &PMS,
+                                         PassManagerType PreferredType) {
+  // Find CGPassManager 
+  while (!PMS.empty() &&
+         PMS.top()->getPassManagerType() > PMT_CallGraphPassManager)
+    PMS.pop();
+
+  assert(!PMS.empty() && "Unable to handle Call Graph Pass");
+  CGPassManager *CGP;
+  
+  if (PMS.top()->getPassManagerType() == PMT_CallGraphPassManager)
+    CGP = (CGPassManager*)PMS.top();
+  else {
+    // Create new Call Graph SCC Pass Manager if it does not exist. 
+    assert(!PMS.empty() && "Unable to create Call Graph Pass Manager");
+    PMDataManager *PMD = PMS.top();
+
+    // [1] Create new Call Graph Pass Manager
+    CGP = new CGPassManager();
+
+    // [2] Set up new manager's top level manager
+    PMTopLevelManager *TPM = PMD->getTopLevelManager();
+    TPM->addIndirectPassManager(CGP);
+
+    // [3] Assign manager to manage this new manager. This may create
+    // and push new managers into PMS
+    Pass *P = CGP;
+    TPM->schedulePass(P);
+
+    // [4] Push new manager into PMS
+    PMS.push(CGP);
+  }
+
+  CGP->add(this);
+}
+
+/// getAnalysisUsage - For this class, we declare that we require and preserve
+/// the call graph.  If the derived class implements this method, it should
+/// always explicitly call the implementation here.
+void CallGraphSCCPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<CallGraph>();
+  AU.addPreserved<CallGraph>();
+}
+
+
+//===----------------------------------------------------------------------===//
+// PrintCallGraphPass Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// PrintCallGraphPass - Print a Module corresponding to a call graph.
+  ///
+  class PrintCallGraphPass : public CallGraphSCCPass {
+    std::string Banner;
+    raw_ostream &Out;       // raw_ostream to print on.
+    
+  public:
+    static char ID;
+    PrintCallGraphPass(const std::string &B, raw_ostream &o)
+      : CallGraphSCCPass(ID), Banner(B), Out(o) {}
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+    
+    bool runOnSCC(CallGraphSCC &SCC) {
+      Out << Banner;
+      for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
+        (*I)->getFunction()->print(Out);
+      return false;
+    }
+  };
+  
+} // end anonymous namespace.
+
+char PrintCallGraphPass::ID = 0;
+
+Pass *CallGraphSCCPass::createPrinterPass(raw_ostream &O,
+                                          const std::string &Banner) const {
+  return new PrintCallGraphPass(Banner, O);
+}
+
diff --git a/contrib/llvm/lib/Analysis/IPA/CallPrinter.cpp b/contrib/llvm/lib/Analysis/IPA/CallPrinter.cpp
new file mode 100644
index 000000000000..306ae7a4dbfb
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/IPA/CallPrinter.cpp
@@ -0,0 +1,87 @@
+//===- CallPrinter.cpp - DOT printer for call graph -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines '-dot-callgraph', which emit a callgraph.<fnname>.dot
+// containing the call graph of a module.
+//
+// There is also a pass available to directly call dotty ('-view-callgraph').
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallPrinter.h"
+#include "llvm/Analysis/DOTGraphTraitsPass.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+template<>
+struct DOTGraphTraits<CallGraph*> : public DefaultDOTGraphTraits {
+  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+  static std::string getGraphName(CallGraph *Graph) {
+    return "Call graph";
+  }
+
+  std::string getNodeLabel(CallGraphNode *Node, CallGraph *Graph) {
+    if (Function *Func = Node->getFunction())
+      return Func->getName();
+
+    return "external node";
+  }
+};
+
+} // end llvm namespace
+
+namespace {
+
+struct CallGraphViewer
+  : public DOTGraphTraitsModuleViewer<CallGraph, true> {
+  static char ID;
+
+  CallGraphViewer()
+    : DOTGraphTraitsModuleViewer<CallGraph, true>("callgraph", ID) {
+    initializeCallGraphViewerPass(*PassRegistry::getPassRegistry());
+  }
+};
+
+struct CallGraphPrinter
+  : public DOTGraphTraitsModulePrinter<CallGraph, true> {
+  static char ID;
+
+  CallGraphPrinter()
+    : DOTGraphTraitsModulePrinter<CallGraph, true>("callgraph", ID) {
+      initializeCallGraphPrinterPass(*PassRegistry::getPassRegistry());
+  }
+};
+
+} // end anonymous namespace
+
+char CallGraphViewer::ID = 0;
+INITIALIZE_PASS(CallGraphViewer, "view-callgraph",
+                "View call graph",
+                false, false)
+
+char CallGraphPrinter::ID = 0;
+INITIALIZE_PASS(CallGraphPrinter, "dot-callgraph",
+                "Print call graph to 'dot' file",
+                false, false)
+
+// Create methods available outside of this file, to use them
+// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
+// the link time optimization.
+
+ModulePass *llvm::createCallGraphViewerPass() {
+  return new CallGraphViewer();
+}
+
+ModulePass *llvm::createCallGraphPrinterPass() {
+  return new CallGraphPrinter();
+}
diff --git a/contrib/llvm/lib/Analysis/IPA/FindUsedTypes.cpp b/contrib/llvm/lib/Analysis/IPA/FindUsedTypes.cpp
new file mode 100644
index 000000000000..1c4f17d3819a
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/IPA/FindUsedTypes.cpp
@@ -0,0 +1,101 @@
+//===- FindUsedTypes.cpp - Find all Types used by a module ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to seek out all of the types in use by the program.  Note
+// that this analysis explicitly does not include types only used by the symbol
+// table.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+char FindUsedTypes::ID = 0;
+INITIALIZE_PASS(FindUsedTypes, "print-used-types",
+                "Find Used Types", false, true)
+
+// IncorporateType - Incorporate one type and all of its subtypes into the
+// collection of used types.
+//
+void FindUsedTypes::IncorporateType(Type *Ty) {
+  // If ty doesn't already exist in the used types map, add it now, otherwise
+  // return.
+  if (!UsedTypes.insert(Ty)) return;  // Already contain Ty.
+
+  // Make sure to add any types this type references now.
+  //
+  for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+       I != E; ++I)
+    IncorporateType(*I);
+}
+
+void FindUsedTypes::IncorporateValue(const Value *V) {
+  IncorporateType(V->getType());
+
+  // If this is a constant, it could be using other types...
+  if (const Constant *C = dyn_cast<Constant>(V)) {
+    if (!isa<GlobalValue>(C))
+      for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
+           OI != OE; ++OI)
+        IncorporateValue(*OI);
+  }
+}
+
+
+// run - This incorporates all types used by the specified module
+//
+bool FindUsedTypes::runOnModule(Module &m) {
+  UsedTypes.clear();  // reset if run multiple times...
+
+  // Loop over global variables, incorporating their types
+  for (Module::const_global_iterator I = m.global_begin(), E = m.global_end();
+       I != E; ++I) {
+    IncorporateType(I->getType());
+    if (I->hasInitializer())
+      IncorporateValue(I->getInitializer());
+  }
+
+  for (Module::iterator MI = m.begin(), ME = m.end(); MI != ME; ++MI) {
+    IncorporateType(MI->getType());
+    const Function &F = *MI;
+
+    // Loop over all of the instructions in the function, adding their return
+    // type as well as the types of their operands.
+    //
+    for (const_inst_iterator II = inst_begin(F), IE = inst_end(F);
+         II != IE; ++II) {
+      const Instruction &I = *II;
+
+      IncorporateType(I.getType());  // Incorporate the type of the instruction
+      for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end();
+           OI != OE; ++OI)
+        IncorporateValue(*OI);  // Insert inst operand types as well
+    }
+  }
+
+  return false;
+}
+
+// Print the types found in the module.  If the optional Module parameter is
+// passed in, then the types are printed symbolically if possible, using the
+// symbol table from the module.
+//
+void FindUsedTypes::print(raw_ostream &OS, const Module *M) const {
+  OS << "Types in use by this module:\n";
+  for (SetVector<Type *>::const_iterator I = UsedTypes.begin(),
+       E = UsedTypes.end(); I != E; ++I) {
+    OS << "   " << **I << '\n';
+  }
+}
diff --git a/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp b/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp
new file mode 100644
index 000000000000..92d0d2318e0d
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -0,0 +1,607 @@
+//===- GlobalsModRef.cpp - Simple Mod/Ref Analysis for Globals ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This simple pass provides alias and mod/ref information for global values
+// that do not have their address taken, and keeps track of whether functions
+// read or write memory (are "pure").  For this simple (but very common) case,
+// we can provide pretty accurate and useful information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "globalsmodref-aa"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InstIterator.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumNonAddrTakenGlobalVars,
+          "Number of global vars without address taken");
+STATISTIC(NumNonAddrTakenFunctions,"Number of functions without address taken");
+STATISTIC(NumNoMemFunctions, "Number of functions that do not access memory");
+STATISTIC(NumReadMemFunctions, "Number of functions that only read memory");
+STATISTIC(NumIndirectGlobalVars, "Number of indirect global objects");
+
+namespace {
+  /// FunctionRecord - One instance of this structure is stored for every
+  /// function in the program.  Later, the entries for these functions are
+  /// removed if the function is found to call an external function (in which
+  /// case we know nothing about it.
+  struct FunctionRecord {
+    /// GlobalInfo - Maintain mod/ref info for all of the globals without
+    /// addresses taken that are read or written (transitively) by this
+    /// function.
+    std::map<const GlobalValue*, unsigned> GlobalInfo;
+
+    /// MayReadAnyGlobal - May read global variables, but it is not known which.
+    bool MayReadAnyGlobal;
+
+    unsigned getInfoForGlobal(const GlobalValue *GV) const {
+      unsigned Effect = MayReadAnyGlobal ? AliasAnalysis::Ref : 0;
+      std::map<const GlobalValue*, unsigned>::const_iterator I =
+        GlobalInfo.find(GV);
+      if (I != GlobalInfo.end())
+        Effect |= I->second;
+      return Effect;
+    }
+
+    /// FunctionEffect - Capture whether or not this function reads or writes to
+    /// ANY memory.  If not, we can do a lot of aggressive analysis on it.
+    unsigned FunctionEffect;
+
+    FunctionRecord() : MayReadAnyGlobal (false), FunctionEffect(0) {}
+  };
+
+  /// GlobalsModRef - The actual analysis pass.
+  class GlobalsModRef : public ModulePass, public AliasAnalysis {
+    /// NonAddressTakenGlobals - The globals that do not have their addresses
+    /// taken.
+    std::set<const GlobalValue*> NonAddressTakenGlobals;
+
+    /// IndirectGlobals - The memory pointed to by this global is known to be
+    /// 'owned' by the global.
+    std::set<const GlobalValue*> IndirectGlobals;
+
+    /// AllocsForIndirectGlobals - If an instruction allocates memory for an
+    /// indirect global, this map indicates which one.
+    std::map<const Value*, const GlobalValue*> AllocsForIndirectGlobals;
+
+    /// FunctionInfo - For each function, keep track of what globals are
+    /// modified or read.
+    std::map<const Function*, FunctionRecord> FunctionInfo;
+
+  public:
+    static char ID;
+    GlobalsModRef() : ModulePass(ID) {
+      initializeGlobalsModRefPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnModule(Module &M) {
+      InitializeAliasAnalysis(this);                 // set up super class
+      AnalyzeGlobals(M);                          // find non-addr taken globals
+      AnalyzeCallGraph(getAnalysis<CallGraph>(), M); // Propagate on CG
+      return false;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AliasAnalysis::getAnalysisUsage(AU);
+      AU.addRequired<CallGraph>();
+      AU.setPreservesAll();                         // Does not transform code
+    }
+
+    //------------------------------------------------
+    // Implement the AliasAnalysis API
+    //
+    AliasResult alias(const Location &LocA, const Location &LocB);
+    ModRefResult getModRefInfo(ImmutableCallSite CS,
+                               const Location &Loc);
+    ModRefResult getModRefInfo(ImmutableCallSite CS1,
+                               ImmutableCallSite CS2) {
+      return AliasAnalysis::getModRefInfo(CS1, CS2);
+    }
+
+    /// getModRefBehavior - Return the behavior of the specified function if
+    /// called from the specified call site.  The call site may be null in which
+    /// case the most generic behavior of this function should be returned.
+    ModRefBehavior getModRefBehavior(const Function *F) {
+      ModRefBehavior Min = UnknownModRefBehavior;
+
+      if (FunctionRecord *FR = getFunctionInfo(F)) {
+        if (FR->FunctionEffect == 0)
+          Min = DoesNotAccessMemory;
+        else if ((FR->FunctionEffect & Mod) == 0)
+          Min = OnlyReadsMemory;
+      }
+
+      return ModRefBehavior(AliasAnalysis::getModRefBehavior(F) & Min);
+    }
+    
+    /// getModRefBehavior - Return the behavior of the specified function if
+    /// called from the specified call site.  The call site may be null in which
+    /// case the most generic behavior of this function should be returned.
+    ModRefBehavior getModRefBehavior(ImmutableCallSite CS) {
+      ModRefBehavior Min = UnknownModRefBehavior;
+
+      if (const Function* F = CS.getCalledFunction())
+        if (FunctionRecord *FR = getFunctionInfo(F)) {
+          if (FR->FunctionEffect == 0)
+            Min = DoesNotAccessMemory;
+          else if ((FR->FunctionEffect & Mod) == 0)
+            Min = OnlyReadsMemory;
+        }
+
+      return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min);
+    }
+
+    virtual void deleteValue(Value *V);
+    virtual void copyValue(Value *From, Value *To);
+    virtual void addEscapingUse(Use &U);
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+      if (PI == &AliasAnalysis::ID)
+        return (AliasAnalysis*)this;
+      return this;
+    }
+    
+  private:
+    /// getFunctionInfo - Return the function info for the function, or null if
+    /// we don't have anything useful to say about it.
+    FunctionRecord *getFunctionInfo(const Function *F) {
+      std::map<const Function*, FunctionRecord>::iterator I =
+        FunctionInfo.find(F);
+      if (I != FunctionInfo.end())
+        return &I->second;
+      return 0;
+    }
+
+    void AnalyzeGlobals(Module &M);
+    void AnalyzeCallGraph(CallGraph &CG, Module &M);
+    bool AnalyzeUsesOfPointer(Value *V, std::vector<Function*> &Readers,
+                              std::vector<Function*> &Writers,
+                              GlobalValue *OkayStoreDest = 0);
+    bool AnalyzeIndirectGlobalMemory(GlobalValue *GV);
+  };
+}
+
+char GlobalsModRef::ID = 0;
+INITIALIZE_AG_PASS_BEGIN(GlobalsModRef, AliasAnalysis,
+                "globalsmodref-aa", "Simple mod/ref analysis for globals",    
+                false, true, false)
+INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_AG_PASS_END(GlobalsModRef, AliasAnalysis,
+                "globalsmodref-aa", "Simple mod/ref analysis for globals",    
+                false, true, false)
+
+Pass *llvm::createGlobalsModRefPass() { return new GlobalsModRef(); }
+
+/// AnalyzeGlobals - Scan through the users of all of the internal
+/// GlobalValue's in the program.  If none of them have their "address taken"
+/// (really, their address passed to something nontrivial), record this fact,
+/// and record the functions that they are used directly in.
+void GlobalsModRef::AnalyzeGlobals(Module &M) {
+  std::vector<Function*> Readers, Writers;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (I->hasLocalLinkage()) {
+      if (!AnalyzeUsesOfPointer(I, Readers, Writers)) {
+        // Remember that we are tracking this global.
+        NonAddressTakenGlobals.insert(I);
+        ++NumNonAddrTakenFunctions;
+      }
+      Readers.clear(); Writers.clear();
+    }
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    if (I->hasLocalLinkage()) {
+      if (!AnalyzeUsesOfPointer(I, Readers, Writers)) {
+        // Remember that we are tracking this global, and the mod/ref fns
+        NonAddressTakenGlobals.insert(I);
+
+        for (unsigned i = 0, e = Readers.size(); i != e; ++i)
+          FunctionInfo[Readers[i]].GlobalInfo[I] |= Ref;
+
+        if (!I->isConstant())  // No need to keep track of writers to constants
+          for (unsigned i = 0, e = Writers.size(); i != e; ++i)
+            FunctionInfo[Writers[i]].GlobalInfo[I] |= Mod;
+        ++NumNonAddrTakenGlobalVars;
+
+        // If this global holds a pointer type, see if it is an indirect global.
+        if (I->getType()->getElementType()->isPointerTy() &&
+            AnalyzeIndirectGlobalMemory(I))
+          ++NumIndirectGlobalVars;
+      }
+      Readers.clear(); Writers.clear();
+    }
+}
+
+/// AnalyzeUsesOfPointer - Look at all of the users of the specified pointer.
+/// If this is used by anything complex (i.e., the address escapes), return
+/// true.  Also, while we are at it, keep track of those functions that read and
+/// write to the value.
+///
+/// If OkayStoreDest is non-null, stores into this global are allowed.
+bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V,
+                                         std::vector<Function*> &Readers,
+                                         std::vector<Function*> &Writers,
+                                         GlobalValue *OkayStoreDest) {
+  if (!V->getType()->isPointerTy()) return true;
+
+  for (Value::use_iterator UI = V->use_begin(), E=V->use_end(); UI != E; ++UI) {
+    User *U = *UI;
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      Readers.push_back(LI->getParent()->getParent());
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      if (V == SI->getOperand(1)) {
+        Writers.push_back(SI->getParent()->getParent());
+      } else if (SI->getOperand(1) != OkayStoreDest) {
+        return true;  // Storing the pointer
+      }
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      if (AnalyzeUsesOfPointer(GEP, Readers, Writers)) return true;
+    } else if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      if (AnalyzeUsesOfPointer(BCI, Readers, Writers, OkayStoreDest))
+        return true;
+    } else if (isFreeCall(U, TLI)) {
+      Writers.push_back(cast<Instruction>(U)->getParent()->getParent());
+    } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
+      // Make sure that this is just the function being called, not that it is
+      // passing into the function.
+      for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+        if (CI->getArgOperand(i) == V) return true;
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) {
+      // Make sure that this is just the function being called, not that it is
+      // passing into the function.
+      for (unsigned i = 0, e = II->getNumArgOperands(); i != e; ++i)
+        if (II->getArgOperand(i) == V) return true;
+    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      if (CE->getOpcode() == Instruction::GetElementPtr ||
+          CE->getOpcode() == Instruction::BitCast) {
+        if (AnalyzeUsesOfPointer(CE, Readers, Writers))
+          return true;
+      } else {
+        return true;
+      }
+    } else if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) {
+      if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+        return true;  // Allow comparison against null.
+    } else {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// AnalyzeIndirectGlobalMemory - We found an non-address-taken global variable
+/// which holds a pointer type.  See if the global always points to non-aliased
+/// heap memory: that is, all initializers of the globals are allocations, and
+/// those allocations have no use other than initialization of the global.
+/// Further, all loads out of GV must directly use the memory, not store the
+/// pointer somewhere.  If this is true, we consider the memory pointed to by
+/// GV to be owned by GV and can disambiguate other pointers from it.
+bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
+  // Keep track of values related to the allocation of the memory, f.e. the
+  // value produced by the malloc call and any casts.
+  std::vector<Value*> AllocRelatedValues;
+
+  // Walk the user list of the global.  If we find anything other than a direct
+  // load or store, bail out.
+  for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I){
+    User *U = *I;
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      // The pointer loaded from the global can only be used in simple ways:
+      // we allow addressing of it and loading storing to it.  We do *not* allow
+      // storing the loaded pointer somewhere else or passing to a function.
+      std::vector<Function*> ReadersWriters;
+      if (AnalyzeUsesOfPointer(LI, ReadersWriters, ReadersWriters))
+        return false;  // Loaded pointer escapes.
+      // TODO: Could try some IP mod/ref of the loaded pointer.
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // Storing the global itself.
+      if (SI->getOperand(0) == GV) return false;
+
+      // If storing the null pointer, ignore it.
+      if (isa<ConstantPointerNull>(SI->getOperand(0)))
+        continue;
+
+      // Check the value being stored.
+      Value *Ptr = GetUnderlyingObject(SI->getOperand(0));
+
+      if (!isAllocLikeFn(Ptr, TLI))
+        return false;  // Too hard to analyze.
+
+      // Analyze all uses of the allocation.  If any of them are used in a
+      // non-simple way (e.g. stored to another global) bail out.
+      std::vector<Function*> ReadersWriters;
+      if (AnalyzeUsesOfPointer(Ptr, ReadersWriters, ReadersWriters, GV))
+        return false;  // Loaded pointer escapes.
+
+      // Remember that this allocation is related to the indirect global.
+      AllocRelatedValues.push_back(Ptr);
+    } else {
+      // Something complex, bail out.
+      return false;
+    }
+  }
+
+  // Okay, this is an indirect global.  Remember all of the allocations for
+  // this global in AllocsForIndirectGlobals.
+  while (!AllocRelatedValues.empty()) {
+    AllocsForIndirectGlobals[AllocRelatedValues.back()] = GV;
+    AllocRelatedValues.pop_back();
+  }
+  IndirectGlobals.insert(GV);
+  return true;
+}
+
+/// AnalyzeCallGraph - At this point, we know the functions where globals are
+/// immediately stored to and read from.  Propagate this information up the call
+/// graph to all callers and compute the mod/ref info for all memory for each
+/// function.
+void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
+  // We do a bottom-up SCC traversal of the call graph.  In other words, we
+  // visit all callees before callers (leaf-first).
+  for (scc_iterator<CallGraph*> I = scc_begin(&CG), E = scc_end(&CG); I != E;
+       ++I) {
+    std::vector<CallGraphNode *> &SCC = *I;
+    assert(!SCC.empty() && "SCC with no functions?");
+
+    if (!SCC[0]->getFunction()) {
+      // Calls externally - can't say anything useful.  Remove any existing
+      // function records (may have been created when scanning globals).
+      for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+        FunctionInfo.erase(SCC[i]->getFunction());
+      continue;
+    }
+
+    FunctionRecord &FR = FunctionInfo[SCC[0]->getFunction()];
+
+    bool KnowNothing = false;
+    unsigned FunctionEffect = 0;
+
+    // Collect the mod/ref properties due to called functions.  We only compute
+    // one mod-ref set.
+    for (unsigned i = 0, e = SCC.size(); i != e && !KnowNothing; ++i) {
+      Function *F = SCC[i]->getFunction();
+      if (!F) {
+        KnowNothing = true;
+        break;
+      }
+
+      if (F->isDeclaration()) {
+        // Try to get mod/ref behaviour from function attributes.
+        if (F->doesNotAccessMemory()) {
+          // Can't do better than that!
+        } else if (F->onlyReadsMemory()) {
+          FunctionEffect |= Ref;
+          if (!F->isIntrinsic())
+            // This function might call back into the module and read a global -
+            // consider every global as possibly being read by this function.
+            FR.MayReadAnyGlobal = true;
+        } else {
+          FunctionEffect |= ModRef;
+          // Can't say anything useful unless it's an intrinsic - they don't
+          // read or write global variables of the kind considered here.
+          KnowNothing = !F->isIntrinsic();
+        }
+        continue;
+      }
+
+      for (CallGraphNode::iterator CI = SCC[i]->begin(), E = SCC[i]->end();
+           CI != E && !KnowNothing; ++CI)
+        if (Function *Callee = CI->second->getFunction()) {
+          if (FunctionRecord *CalleeFR = getFunctionInfo(Callee)) {
+            // Propagate function effect up.
+            FunctionEffect |= CalleeFR->FunctionEffect;
+
+            // Incorporate callee's effects on globals into our info.
+            for (std::map<const GlobalValue*, unsigned>::iterator GI =
+                   CalleeFR->GlobalInfo.begin(), E = CalleeFR->GlobalInfo.end();
+                 GI != E; ++GI)
+              FR.GlobalInfo[GI->first] |= GI->second;
+            FR.MayReadAnyGlobal |= CalleeFR->MayReadAnyGlobal;
+          } else {
+            // Can't say anything about it.  However, if it is inside our SCC,
+            // then nothing needs to be done.
+            CallGraphNode *CalleeNode = CG[Callee];
+            if (std::find(SCC.begin(), SCC.end(), CalleeNode) == SCC.end())
+              KnowNothing = true;
+          }
+        } else {
+          KnowNothing = true;
+        }
+    }
+
+    // If we can't say anything useful about this SCC, remove all SCC functions
+    // from the FunctionInfo map.
+    if (KnowNothing) {
+      for (unsigned i = 0, e = SCC.size(); i != e; ++i)
+        FunctionInfo.erase(SCC[i]->getFunction());
+      continue;
+    }
+
+    // Scan the function bodies for explicit loads or stores.
+    for (unsigned i = 0, e = SCC.size(); i != e && FunctionEffect != ModRef;++i)
+      for (inst_iterator II = inst_begin(SCC[i]->getFunction()),
+             E = inst_end(SCC[i]->getFunction());
+           II != E && FunctionEffect != ModRef; ++II)
+        if (LoadInst *LI = dyn_cast<LoadInst>(&*II)) {
+          FunctionEffect |= Ref;
+          if (LI->isVolatile())
+            // Volatile loads may have side-effects, so mark them as writing
+            // memory (for example, a flag inside the processor).
+            FunctionEffect |= Mod;
+        } else if (StoreInst *SI = dyn_cast<StoreInst>(&*II)) {
+          FunctionEffect |= Mod;
+          if (SI->isVolatile())
+            // Treat volatile stores as reading memory somewhere.
+            FunctionEffect |= Ref;
+        } else if (isAllocationFn(&*II, TLI) || isFreeCall(&*II, TLI)) {
+          FunctionEffect |= ModRef;
+        } else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) {
+          // The callgraph doesn't include intrinsic calls.
+          Function *Callee = Intrinsic->getCalledFunction();
+          ModRefBehavior Behaviour = AliasAnalysis::getModRefBehavior(Callee);
+          FunctionEffect |= (Behaviour & ModRef);
+        }
+
+    if ((FunctionEffect & Mod) == 0)
+      ++NumReadMemFunctions;
+    if (FunctionEffect == 0)
+      ++NumNoMemFunctions;
+    FR.FunctionEffect = FunctionEffect;
+
+    // Finally, now that we know the full effect on this SCC, clone the
+    // information to each function in the SCC.
+    for (unsigned i = 1, e = SCC.size(); i != e; ++i)
+      FunctionInfo[SCC[i]->getFunction()] = FR;
+  }
+}
+
+
+
+/// alias - If one of the pointers is to a global that we are tracking, and the
+/// other is some random pointer, we know there cannot be an alias, because the
+/// address of the global isn't taken.
+AliasAnalysis::AliasResult
+GlobalsModRef::alias(const Location &LocA,
+                     const Location &LocB) {
+  // Get the base object these pointers point to.
+  const Value *UV1 = GetUnderlyingObject(LocA.Ptr);
+  const Value *UV2 = GetUnderlyingObject(LocB.Ptr);
+
+  // If either of the underlying values is a global, they may be non-addr-taken
+  // globals, which we can answer queries about.
+  const GlobalValue *GV1 = dyn_cast<GlobalValue>(UV1);
+  const GlobalValue *GV2 = dyn_cast<GlobalValue>(UV2);
+  if (GV1 || GV2) {
+    // If the global's address is taken, pretend we don't know it's a pointer to
+    // the global.
+    if (GV1 && !NonAddressTakenGlobals.count(GV1)) GV1 = 0;
+    if (GV2 && !NonAddressTakenGlobals.count(GV2)) GV2 = 0;
+
+    // If the two pointers are derived from two different non-addr-taken
+    // globals, or if one is and the other isn't, we know these can't alias.
+    if ((GV1 || GV2) && GV1 != GV2)
+      return NoAlias;
+
+    // Otherwise if they are both derived from the same addr-taken global, we
+    // can't know the two accesses don't overlap.
+  }
+
+  // These pointers may be based on the memory owned by an indirect global.  If
+  // so, we may be able to handle this.  First check to see if the base pointer
+  // is a direct load from an indirect global.
+  GV1 = GV2 = 0;
+  if (const LoadInst *LI = dyn_cast<LoadInst>(UV1))
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0)))
+      if (IndirectGlobals.count(GV))
+        GV1 = GV;
+  if (const LoadInst *LI = dyn_cast<LoadInst>(UV2))
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getOperand(0)))
+      if (IndirectGlobals.count(GV))
+        GV2 = GV;
+
+  // These pointers may also be from an allocation for the indirect global.  If
+  // so, also handle them.
+  if (AllocsForIndirectGlobals.count(UV1))
+    GV1 = AllocsForIndirectGlobals[UV1];
+  if (AllocsForIndirectGlobals.count(UV2))
+    GV2 = AllocsForIndirectGlobals[UV2];
+
+  // Now that we know whether the two pointers are related to indirect globals,
+  // use this to disambiguate the pointers.  If either pointer is based on an
+  // indirect global and if they are not both based on the same indirect global,
+  // they cannot alias.
+  if ((GV1 || GV2) && GV1 != GV2)
+    return NoAlias;
+
+  return AliasAnalysis::alias(LocA, LocB);
+}
+
+AliasAnalysis::ModRefResult
+GlobalsModRef::getModRefInfo(ImmutableCallSite CS,
+                             const Location &Loc) {
+  unsigned Known = ModRef;
+
+  // If we are asking for mod/ref info of a direct call with a pointer to a
+  // global we are tracking, return information if we have it.
+  if (const GlobalValue *GV =
+        dyn_cast<GlobalValue>(GetUnderlyingObject(Loc.Ptr)))
+    if (GV->hasLocalLinkage())
+      if (const Function *F = CS.getCalledFunction())
+        if (NonAddressTakenGlobals.count(GV))
+          if (const FunctionRecord *FR = getFunctionInfo(F))
+            Known = FR->getInfoForGlobal(GV);
+
+  if (Known == NoModRef)
+    return NoModRef; // No need to query other mod/ref analyses
+  return ModRefResult(Known & AliasAnalysis::getModRefInfo(CS, Loc));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Methods to update the analysis as a result of the client transformation.
+//
+void GlobalsModRef::deleteValue(Value *V) {
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    if (NonAddressTakenGlobals.erase(GV)) {
+      // This global might be an indirect global.  If so, remove it and remove
+      // any AllocRelatedValues for it.
+      if (IndirectGlobals.erase(GV)) {
+        // Remove any entries in AllocsForIndirectGlobals for this global.
+        for (std::map<const Value*, const GlobalValue*>::iterator
+             I = AllocsForIndirectGlobals.begin(),
+             E = AllocsForIndirectGlobals.end(); I != E; ) {
+          if (I->second == GV) {
+            AllocsForIndirectGlobals.erase(I++);
+          } else {
+            ++I;
+          }
+        }
+      }
+    }
+  }
+
+  // Otherwise, if this is an allocation related to an indirect global, remove
+  // it.
+  AllocsForIndirectGlobals.erase(V);
+
+  AliasAnalysis::deleteValue(V);
+}
+
+void GlobalsModRef::copyValue(Value *From, Value *To) {
+  AliasAnalysis::copyValue(From, To);
+}
+
+void GlobalsModRef::addEscapingUse(Use &U) {
+  // For the purposes of this analysis, it is conservatively correct to treat
+  // a newly escaping value equivalently to a deleted one.  We could perhaps
+  // be more precise by processing the new use and attempting to update our
+  // saved analysis results to accommodate it.
+  deleteValue(U);
+  
+  AliasAnalysis::addEscapingUse(U);
+}
diff --git a/contrib/llvm/lib/Analysis/IPA/IPA.cpp b/contrib/llvm/lib/Analysis/IPA/IPA.cpp
new file mode 100644
index 000000000000..aa5164e9e79b
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/IPA/IPA.cpp
@@ -0,0 +1,31 @@
+//===-- IPA.cpp -----------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common initialization routines for the IPA library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm-c/Initialization.h"
+
+using namespace llvm;
+
+/// initializeIPA - Initialize all passes linked into the IPA library.
+void llvm::initializeIPA(PassRegistry &Registry) {
+  initializeBasicCallGraphPass(Registry);
+  initializeCallGraphAnalysisGroup(Registry);
+  initializeCallGraphPrinterPass(Registry);
+  initializeCallGraphViewerPass(Registry);
+  initializeFindUsedTypesPass(Registry);
+  initializeGlobalsModRefPass(Registry);
+}
+
+void LLVMInitializeIPA(LLVMPassRegistryRef R) {
+  initializeIPA(*unwrap(R));
+}
diff --git a/contrib/llvm/lib/Analysis/IPA/InlineCost.cpp b/contrib/llvm/lib/Analysis/IPA/InlineCost.cpp
new file mode 100644
index 000000000000..35c45e61808b
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/IPA/InlineCost.cpp
@@ -0,0 +1,1239 @@
+//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements inline cost analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline-cost"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
+
+namespace {
+
+class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
+  typedef InstVisitor<CallAnalyzer, bool> Base;
+  friend class InstVisitor<CallAnalyzer, bool>;
+
+  // DataLayout if available, or null.
+  const DataLayout *const TD;
+
+  /// The TargetTransformInfo available for this compilation.
+  const TargetTransformInfo &TTI;
+
+  // The called function.
+  Function &F;
+
+  int Threshold;
+  int Cost;
+
+  bool IsCallerRecursive;
+  bool IsRecursiveCall;
+  bool ExposesReturnsTwice;
+  bool HasDynamicAlloca;
+  bool ContainsNoDuplicateCall;
+
+  /// Number of bytes allocated statically by the callee.
+  uint64_t AllocatedSize;
+  unsigned NumInstructions, NumVectorInstructions;
+  int FiftyPercentVectorBonus, TenPercentVectorBonus;
+  int VectorBonus;
+
+  // While we walk the potentially-inlined instructions, we build up and
+  // maintain a mapping of simplified values specific to this callsite. The
+  // idea is to propagate any special information we have about arguments to
+  // this call through the inlinable section of the function, and account for
+  // likely simplifications post-inlining. The most important aspect we track
+  // is CFG altering simplifications -- when we prove a basic block dead, that
+  // can cause dramatic shifts in the cost of inlining a function.
+  DenseMap<Value *, Constant *> SimplifiedValues;
+
+  // Keep track of the values which map back (through function arguments) to
+  // allocas on the caller stack which could be simplified through SROA.
+  DenseMap<Value *, Value *> SROAArgValues;
+
+  // The mapping of caller Alloca values to their accumulated cost savings. If
+  // we have to disable SROA for one of the allocas, this tells us how much
+  // cost must be added.
+  DenseMap<Value *, int> SROAArgCosts;
+
+  // Keep track of values which map to a pointer base and constant offset.
+  DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;
+
+  // Custom simplification helper routines.
+  bool isAllocaDerivedArg(Value *V);
+  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
+                            DenseMap<Value *, int>::iterator &CostIt);
+  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
+  void disableSROA(Value *V);
+  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+                          int InstructionCost);
+  bool handleSROACandidate(bool IsSROAValid,
+                           DenseMap<Value *, int>::iterator CostIt,
+                           int InstructionCost);
+  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
+  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
+  bool simplifyCallSite(Function *F, CallSite CS);
+  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
+
+  // Custom analysis routines.
+  bool analyzeBlock(BasicBlock *BB);
+
+  // Disable several entry points to the visitor so we don't accidentally use
+  // them by declaring but not defining them here.
+  void visit(Module *);     void visit(Module &);
+  void visit(Function *);   void visit(Function &);
+  void visit(BasicBlock *); void visit(BasicBlock &);
+
+  // Provide base case for our instruction visit.
+  bool visitInstruction(Instruction &I);
+
+  // Our visit overrides.
+  bool visitAlloca(AllocaInst &I);
+  bool visitPHI(PHINode &I);
+  bool visitGetElementPtr(GetElementPtrInst &I);
+  bool visitBitCast(BitCastInst &I);
+  bool visitPtrToInt(PtrToIntInst &I);
+  bool visitIntToPtr(IntToPtrInst &I);
+  bool visitCastInst(CastInst &I);
+  bool visitUnaryInstruction(UnaryInstruction &I);
+  bool visitICmp(ICmpInst &I);
+  bool visitSub(BinaryOperator &I);
+  bool visitBinaryOperator(BinaryOperator &I);
+  bool visitLoad(LoadInst &I);
+  bool visitStore(StoreInst &I);
+  bool visitExtractValue(ExtractValueInst &I);
+  bool visitInsertValue(InsertValueInst &I);
+  bool visitCallSite(CallSite CS);
+
+public:
+  CallAnalyzer(const DataLayout *TD, const TargetTransformInfo &TTI,
+               Function &Callee, int Threshold)
+      : TD(TD), TTI(TTI), F(Callee), Threshold(Threshold), Cost(0),
+        IsCallerRecursive(false), IsRecursiveCall(false),
+        ExposesReturnsTwice(false), HasDynamicAlloca(false),
+        ContainsNoDuplicateCall(false), AllocatedSize(0), NumInstructions(0),
+        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
+        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
+        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
+        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
+        SROACostSavings(0), SROACostSavingsLost(0) {}
+
+  bool analyzeCall(CallSite CS);
+
+  int getThreshold() { return Threshold; }
+  int getCost() { return Cost; }
+
+  // Keep a bunch of stats about the cost savings found so we can print them
+  // out when debugging.
+  unsigned NumConstantArgs;
+  unsigned NumConstantOffsetPtrArgs;
+  unsigned NumAllocaArgs;
+  unsigned NumConstantPtrCmps;
+  unsigned NumConstantPtrDiffs;
+  unsigned NumInstructionsSimplified;
+  unsigned SROACostSavings;
+  unsigned SROACostSavingsLost;
+
+  void dump();
+};
+
+} // namespace
+
+/// \brief Test whether the given value is an Alloca-derived function argument.
+bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
+  return SROAArgValues.count(V);
+}
+
+/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
+/// Returns false if V does not map to a SROA-candidate.
+bool CallAnalyzer::lookupSROAArgAndCost(
+    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
+  if (SROAArgValues.empty() || SROAArgCosts.empty())
+    return false;
+
+  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
+  if (ArgIt == SROAArgValues.end())
+    return false;
+
+  Arg = ArgIt->second;
+  CostIt = SROAArgCosts.find(Arg);
+  return CostIt != SROAArgCosts.end();
+}
+
+/// \brief Disable SROA for the candidate marked by this cost iterator.
+///
+/// This marks the candidate as no longer viable for SROA, and adds the cost
+/// savings associated with it back into the inline cost measurement.
+void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
+  // If we're no longer able to perform SROA we need to undo its cost savings
+  // and prevent subsequent analysis.
+  Cost += CostIt->second;
+  SROACostSavings -= CostIt->second;
+  SROACostSavingsLost += CostIt->second;
+  SROAArgCosts.erase(CostIt);
+}
+
+/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
+void CallAnalyzer::disableSROA(Value *V) {
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
+    disableSROA(CostIt);
+}
+
+/// \brief Accumulate the given cost for a particular SROA candidate.
+void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
+                                      int InstructionCost) {
+  CostIt->second += InstructionCost;
+  SROACostSavings += InstructionCost;
+}
+
+/// \brief Helper for the common pattern of handling a SROA candidate.
+/// Either accumulates the cost savings if the SROA remains valid, or disables
+/// SROA for the candidate.
+bool CallAnalyzer::handleSROACandidate(bool IsSROAValid,
+                                       DenseMap<Value *, int>::iterator CostIt,
+                                       int InstructionCost) {
+  if (IsSROAValid) {
+    accumulateSROACost(CostIt, InstructionCost);
+    return true;
+  }
+
+  disableSROA(CostIt);
+  return false;
+}
+
+/// \brief Check whether a GEP's indices are all constant.
+///
+/// Respects any simplified values known during the analysis of this callsite.
+bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
+  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
+      return false;
+
+  return true;
+}
+
+/// \brief Accumulate a constant GEP offset into an APInt if possible.
+///
+/// Returns false if unable to compute the offset for any reason. Respects any
+/// simplified values known during the analysis of this callsite.
+bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
+  if (!TD)
+    return false;
+
+  unsigned IntPtrWidth = TD->getPointerSizeInBits();
+  assert(IntPtrWidth == Offset.getBitWidth());
+
+  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
+       GTI != GTE; ++GTI) {
+    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+    if (!OpC)
+      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
+        OpC = dyn_cast<ConstantInt>(SimpleOp);
+    if (!OpC)
+      return false;
+    if (OpC->isZero()) continue;
+
+    // Handle a struct index, which adds its field offset to the pointer.
+    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+      unsigned ElementIdx = OpC->getZExtValue();
+      const StructLayout *SL = TD->getStructLayout(STy);
+      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
+      continue;
+    }
+
+    APInt TypeSize(IntPtrWidth, TD->getTypeAllocSize(GTI.getIndexedType()));
+    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
+  }
+  return true;
+}
+
+bool CallAnalyzer::visitAlloca(AllocaInst &I) {
+  // FIXME: Check whether inlining will turn a dynamic alloca into a static
+  // alloca, and handle that case.
+
+  // Accumulate the allocated size.
+  if (I.isStaticAlloca()) {
+    Type *Ty = I.getAllocatedType();
+    AllocatedSize += (TD ? TD->getTypeAllocSize(Ty) :
+                      Ty->getPrimitiveSizeInBits());
+  }
+
+  // We will happily inline static alloca instructions.
+  if (I.isStaticAlloca())
+    return Base::visitAlloca(I);
+
+  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
+  // a variety of reasons, and so we would like to not inline them into
+  // functions which don't currently have a dynamic alloca. This simply
+  // disables inlining altogether in the presence of a dynamic alloca.
+  HasDynamicAlloca = true;
+  return false;
+}
+
+bool CallAnalyzer::visitPHI(PHINode &I) {
+  // FIXME: We should potentially be tracking values through phi nodes,
+  // especially when they collapse to a single value due to deleted CFG edges
+  // during inlining.
+
+  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
+  // though we don't want to propagate it's bonuses. The idea is to disable
+  // SROA if it *might* be used in an inappropriate manner.
+
+  // Phi nodes are always zero-cost.
+  return true;
+}
+
+bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
+                                            SROAArg, CostIt);
+
+  // Try to fold GEPs of constant-offset call site argument pointers. This
+  // requires target data and inbounds GEPs.
+  if (TD && I.isInBounds()) {
+    // Check if we have a base + offset for the pointer.
+    Value *Ptr = I.getPointerOperand();
+    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
+    if (BaseAndOffset.first) {
+      // Check if the offset of this GEP is constant, and if so accumulate it
+      // into Offset.
+      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
+        // Non-constant GEPs aren't folded, and disable SROA.
+        if (SROACandidate)
+          disableSROA(CostIt);
+        return false;
+      }
+
+      // Add the result as a new mapping to Base + Offset.
+      ConstantOffsetPtrs[&I] = BaseAndOffset;
+
+      // Also handle SROA candidates here, we already know that the GEP is
+      // all-constant indexed.
+      if (SROACandidate)
+        SROAArgValues[&I] = SROAArg;
+
+      return true;
+    }
+  }
+
+  if (isGEPOffsetConstant(I)) {
+    if (SROACandidate)
+      SROAArgValues[&I] = SROAArg;
+
+    // Constant GEPs are modeled as free.
+    return true;
+  }
+
+  // Variable GEPs will require math and will disable SROA.
+  if (SROACandidate)
+    disableSROA(CostIt);
+  return false;
+}
+
+bool CallAnalyzer::visitBitCast(BitCastInst &I) {
+  // Propagate constants through bitcasts.
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
+    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
+      SimplifiedValues[&I] = C;
+      return true;
+    }
+
+  // Track base/offsets through casts
+  std::pair<Value *, APInt> BaseAndOffset
+    = ConstantOffsetPtrs.lookup(I.getOperand(0));
+  // Casts don't change the offset, just wrap it up.
+  if (BaseAndOffset.first)
+    ConstantOffsetPtrs[&I] = BaseAndOffset;
+
+  // Also look for SROA candidates here.
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
+    SROAArgValues[&I] = SROAArg;
+
+  // Bitcasts are always zero cost.
+  return true;
+}
+
+bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
+  // Propagate constants through ptrtoint.
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
+    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
+      SimplifiedValues[&I] = C;
+      return true;
+    }
+
+  // Track base/offset pairs when converted to a plain integer provided the
+  // integer is large enough to represent the pointer.
+  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
+  if (TD && IntegerSize >= TD->getPointerSizeInBits()) {
+    std::pair<Value *, APInt> BaseAndOffset
+      = ConstantOffsetPtrs.lookup(I.getOperand(0));
+    if (BaseAndOffset.first)
+      ConstantOffsetPtrs[&I] = BaseAndOffset;
+  }
+
+  // This is really weird. Technically, ptrtoint will disable SROA. However,
+  // unless that ptrtoint is *used* somewhere in the live basic blocks after
+  // inlining, it will be nuked, and SROA should proceed. All of the uses which
+  // would block SROA would also block SROA if applied directly to a pointer,
+  // and so we can just add the integer in here. The only places where SROA is
+  // preserved either cannot fire on an integer, or won't in-and-of themselves
+  // disable SROA (ext) w/o some later use that we would see and disable.
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
+    SROAArgValues[&I] = SROAArg;
+
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
+}
+
+bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
+  // Propagate constants through ptrtoint.
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
+    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
+      SimplifiedValues[&I] = C;
+      return true;
+    }
+
+  // Track base/offset pairs when round-tripped through a pointer without
+  // modifications provided the integer is not too large.
+  Value *Op = I.getOperand(0);
+  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
+  if (TD && IntegerSize <= TD->getPointerSizeInBits()) {
+    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
+    if (BaseAndOffset.first)
+      ConstantOffsetPtrs[&I] = BaseAndOffset;
+  }
+
+  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
+    SROAArgValues[&I] = SROAArg;
+
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
+}
+
+bool CallAnalyzer::visitCastInst(CastInst &I) {
+  // Propagate constants through ptrtoint.
+  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
+  if (!COp)
+    COp = SimplifiedValues.lookup(I.getOperand(0));
+  if (COp)
+    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
+      SimplifiedValues[&I] = C;
+      return true;
+    }
+
+  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
+  disableSROA(I.getOperand(0));
+
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
+}
+
+bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
+  Value *Operand = I.getOperand(0);
+  Constant *COp = dyn_cast<Constant>(Operand);
+  if (!COp)
+    COp = SimplifiedValues.lookup(Operand);
+  if (COp)
+    if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
+                                               COp, TD)) {
+      SimplifiedValues[&I] = C;
+      return true;
+    }
+
+  // Disable any SROA on the argument to arbitrary unary operators.
+  disableSROA(Operand);
+
+  return false;
+}
+
+bool CallAnalyzer::visitICmp(ICmpInst &I) {
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  // First try to handle simplified comparisons.
+  if (!isa<Constant>(LHS))
+    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+      LHS = SimpleLHS;
+  if (!isa<Constant>(RHS))
+    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+      RHS = SimpleRHS;
+  if (Constant *CLHS = dyn_cast<Constant>(LHS))
+    if (Constant *CRHS = dyn_cast<Constant>(RHS))
+      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
+        SimplifiedValues[&I] = C;
+        return true;
+      }
+
+  // Otherwise look for a comparison between constant offset pointers with
+  // a common base.
+  Value *LHSBase, *RHSBase;
+  APInt LHSOffset, RHSOffset;
+  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
+  if (LHSBase) {
+    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
+    if (RHSBase && LHSBase == RHSBase) {
+      // We have common bases, fold the icmp to a constant based on the
+      // offsets.
+      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
+      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
+      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
+        SimplifiedValues[&I] = C;
+        ++NumConstantPtrCmps;
+        return true;
+      }
+    }
+  }
+
+  // If the comparison is an equality comparison with null, we can simplify it
+  // for any alloca-derived argument.
+  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)))
+    if (isAllocaDerivedArg(I.getOperand(0))) {
+      // We can actually predict the result of comparisons between an
+      // alloca-derived value and null. Note that this fires regardless of
+      // SROA firing.
+      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
+      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
+                                        : ConstantInt::getFalse(I.getType());
+      return true;
+    }
+
+  // Finally check for SROA candidates in comparisons.
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+    if (isa<ConstantPointerNull>(I.getOperand(1))) {
+      accumulateSROACost(CostIt, InlineConstants::InstrCost);
+      return true;
+    }
+
+    disableSROA(CostIt);
+  }
+
+  return false;
+}
+
+bool CallAnalyzer::visitSub(BinaryOperator &I) {
+  // Try to handle a special case: we can fold computing the difference of two
+  // constant-related pointers.
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  Value *LHSBase, *RHSBase;
+  APInt LHSOffset, RHSOffset;
+  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
+  if (LHSBase) {
+    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
+    if (RHSBase && LHSBase == RHSBase) {
+      // We have common bases, fold the subtract to a constant based on the
+      // offsets.
+      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
+      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
+      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
+        SimplifiedValues[&I] = C;
+        ++NumConstantPtrDiffs;
+        return true;
+      }
+    }
+  }
+
+  // Otherwise, fall back to the generic logic for simplifying and handling
+  // instructions.
+  return Base::visitSub(I);
+}
+
+bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  if (!isa<Constant>(LHS))
+    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
+      LHS = SimpleLHS;
+  if (!isa<Constant>(RHS))
+    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
+      RHS = SimpleRHS;
+  Value *SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, TD);
+  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
+    SimplifiedValues[&I] = C;
+    return true;
+  }
+
+  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
+  disableSROA(LHS);
+  disableSROA(RHS);
+
+  return false;
+}
+
+bool CallAnalyzer::visitLoad(LoadInst &I) {
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+    if (I.isSimple()) {
+      accumulateSROACost(CostIt, InlineConstants::InstrCost);
+      return true;
+    }
+
+    disableSROA(CostIt);
+  }
+
+  return false;
+}
+
+bool CallAnalyzer::visitStore(StoreInst &I) {
+  Value *SROAArg;
+  DenseMap<Value *, int>::iterator CostIt;
+  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
+    if (I.isSimple()) {
+      accumulateSROACost(CostIt, InlineConstants::InstrCost);
+      return true;
+    }
+
+    disableSROA(CostIt);
+  }
+
+  return false;
+}
+
+bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
+  // Constant folding for extract value is trivial.
+  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
+  if (!C)
+    C = SimplifiedValues.lookup(I.getAggregateOperand());
+  if (C) {
+    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
+    return true;
+  }
+
+  // SROA can look through these but give them a cost.
+  return false;
+}
+
+bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
+  // Constant folding for insert value is trivial.
+  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
+  if (!AggC)
+    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
+  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
+  if (!InsertedC)
+    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
+  if (AggC && InsertedC) {
+    SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
+                                                        I.getIndices());
+    return true;
+  }
+
+  // SROA can look through these but give them a cost.
+  return false;
+}
+
+/// \brief Try to simplify a call site.
+///
+/// Takes a concrete function and callsite and tries to actually simplify it by
+/// analyzing the arguments and call itself with instsimplify. Returns true if
+/// it has simplified the callsite to some other entity (a constant), making it
+/// free.
+bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
+  // FIXME: Using the instsimplify logic directly for this is inefficient
+  // because we have to continually rebuild the argument list even when no
+  // simplifications can be performed. Until that is fixed with remapping
+  // inside of instsimplify, directly constant fold calls here.
+  if (!canConstantFoldCallTo(F))
+    return false;
+
+  // Try to re-map the arguments to constants.
+  SmallVector<Constant *, 4> ConstantArgs;
+  ConstantArgs.reserve(CS.arg_size());
+  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+       I != E; ++I) {
+    Constant *C = dyn_cast<Constant>(*I);
+    if (!C)
+      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
+    if (!C)
+      return false; // This argument doesn't map to a constant.
+
+    ConstantArgs.push_back(C);
+  }
+  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
+    SimplifiedValues[CS.getInstruction()] = C;
+    return true;
+  }
+
+  return false;
+}
+
+bool CallAnalyzer::visitCallSite(CallSite CS) {
+  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
+      !F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::ReturnsTwice)) {
+    // This aborts the entire analysis.
+    ExposesReturnsTwice = true;
+    return false;
+  }
+  if (CS.isCall() &&
+      cast<CallInst>(CS.getInstruction())->hasFnAttr(Attribute::NoDuplicate))
+    ContainsNoDuplicateCall = true;
+
+  if (Function *F = CS.getCalledFunction()) {
+    // When we have a concrete function, first try to simplify it directly.
+    if (simplifyCallSite(F, CS))
+      return true;
+
+    // Next check if it is an intrinsic we know about.
+    // FIXME: Lift this into part of the InstVisitor.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+      switch (II->getIntrinsicID()) {
+      default:
+        return Base::visitCallSite(CS);
+
+      case Intrinsic::memset:
+      case Intrinsic::memcpy:
+      case Intrinsic::memmove:
+        // SROA can usually chew through these intrinsics, but they aren't free.
+        return false;
+      }
+    }
+
+    if (F == CS.getInstruction()->getParent()->getParent()) {
+      // This flag will fully abort the analysis, so don't bother with anything
+      // else.
+      IsRecursiveCall = true;
+      return false;
+    }
+
+    if (TTI.isLoweredToCall(F)) {
+      // We account for the average 1 instruction per call argument setup
+      // here.
+      Cost += CS.arg_size() * InlineConstants::InstrCost;
+
+      // Everything other than inline ASM will also have a significant cost
+      // merely from making the call.
+      if (!isa<InlineAsm>(CS.getCalledValue()))
+        Cost += InlineConstants::CallPenalty;
+    }
+
+    return Base::visitCallSite(CS);
+  }
+
+  // Otherwise we're in a very special case -- an indirect function call. See
+  // if we can be particularly clever about this.
+  Value *Callee = CS.getCalledValue();
+
+  // First, pay the price of the argument setup. We account for the average
+  // 1 instruction per call argument setup here.
+  Cost += CS.arg_size() * InlineConstants::InstrCost;
+
+  // Next, check if this happens to be an indirect function call to a known
+  // function in this inline context. If not, we've done all we can.
+  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
+  if (!F)
+    return Base::visitCallSite(CS);
+
+  // If we have a constant that we are calling as a function, we can peer
+  // through it and see the function target. This happens not infrequently
+  // during devirtualization and so we want to give it a hefty bonus for
+  // inlining, but cap that bonus in the event that inlining wouldn't pan
+  // out. Pretend to inline the function, with a custom threshold.
+  CallAnalyzer CA(TD, TTI, *F, InlineConstants::IndirectCallThreshold);
+  if (CA.analyzeCall(CS)) {
+    // We were able to inline the indirect call! Subtract the cost from the
+    // bonus we want to apply, but don't go below zero.
+    Cost -= std::max(0, InlineConstants::IndirectCallThreshold - CA.getCost());
+  }
+
+  return Base::visitCallSite(CS);
+}
+
+bool CallAnalyzer::visitInstruction(Instruction &I) {
+  // Some instructions are free. All of the free intrinsics can also be
+  // handled by SROA, etc.
+  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
+    return true;
+
+  // We found something we don't understand or can't handle. Mark any SROA-able
+  // values in the operand list as no longer viable.
+  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
+    disableSROA(*OI);
+
+  return false;
+}
+
+
+/// \brief Analyze a basic block for its contribution to the inline cost.
+///
+/// This method walks the analyzer over every instruction in the given basic
+/// block and accounts for their cost during inlining at this callsite. It
+/// aborts early if the threshold has been exceeded or an impossible to inline
+/// construct has been detected. It returns false if inlining is no longer
+/// viable, and true if inlining remains viable.
+bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
+  for (BasicBlock::iterator I = BB->begin(), E = llvm::prior(BB->end());
+       I != E; ++I) {
+    ++NumInstructions;
+    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
+      ++NumVectorInstructions;
+
+    // If the instruction simplified to a constant, there is no cost to this
+    // instruction. Visit the instructions using our InstVisitor to account for
+    // all of the per-instruction logic. The visit tree returns true if we
+    // consumed the instruction in any way, and false if the instruction's base
+    // cost should count against inlining.
+    if (Base::visit(I))
+      ++NumInstructionsSimplified;
+    else
+      Cost += InlineConstants::InstrCost;
+
+    // If the visit this instruction detected an uninlinable pattern, abort.
+    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+      return false;
+
+    // If the caller is a recursive function then we don't want to inline
+    // functions which allocate a lot of stack space because it would increase
+    // the caller stack usage dramatically.
+    if (IsCallerRecursive &&
+        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
+      return false;
+
+    if (NumVectorInstructions > NumInstructions/2)
+      VectorBonus = FiftyPercentVectorBonus;
+    else if (NumVectorInstructions > NumInstructions/10)
+      VectorBonus = TenPercentVectorBonus;
+    else
+      VectorBonus = 0;
+
+    // Check if we've past the threshold so we don't spin in huge basic
+    // blocks that will never inline.
+    if (Cost > (Threshold + VectorBonus))
+      return false;
+  }
+
+  return true;
+}
+
+/// \brief Compute the base pointer and cumulative constant offsets for V.
+///
+/// This strips all constant offsets off of V, leaving it the base pointer, and
+/// accumulates the total constant offset applied in the returned constant. It
+/// returns 0 if V is not a pointer, and returns the constant '0' if there are
+/// no constant offsets applied.
+ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
+  if (!TD || !V->getType()->isPointerTy())
+    return 0;
+
+  unsigned IntPtrWidth = TD->getPointerSizeInBits();
+  APInt Offset = APInt::getNullValue(IntPtrWidth);
+
+  // Even though we don't look through PHI nodes, we could be called on an
+  // instruction in an unreachable block, which may be on a cycle.
+  SmallPtrSet<Value *, 4> Visited;
+  Visited.insert(V);
+  do {
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
+        return 0;
+      V = GEP->getPointerOperand();
+    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+      V = cast<Operator>(V)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+      if (GA->mayBeOverridden())
+        break;
+      V = GA->getAliasee();
+    } else {
+      break;
+    }
+    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
+  } while (Visited.insert(V));
+
+  Type *IntPtrTy = TD->getIntPtrType(V->getContext());
+  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
+}
+
+/// \brief Analyze a call site for potential inlining.
+///
+/// Returns true if inlining this call is viable, and false if it is not
+/// viable. It computes the cost and adjusts the threshold based on numerous
+/// factors and heuristics. If this method returns false but the computed cost
+/// is below the computed threshold, then inlining was forcibly disabled by
+/// some artifact of the routine.
+bool CallAnalyzer::analyzeCall(CallSite CS) {
+  ++NumCallsAnalyzed;
+
+  // Track whether the post-inlining function would have more than one basic
+  // block. A single basic block is often intended for inlining. Balloon the
+  // threshold by 50% until we pass the single-BB phase.
+  bool SingleBB = true;
+  int SingleBBBonus = Threshold / 2;
+  Threshold += SingleBBBonus;
+
+  // Perform some tweaks to the cost and threshold based on the direct
+  // callsite information.
+
+  // We want to more aggressively inline vector-dense kernels, so up the
+  // threshold, and we'll lower it if the % of vector instructions gets too
+  // low.
+  assert(NumInstructions == 0);
+  assert(NumVectorInstructions == 0);
+  FiftyPercentVectorBonus = Threshold;
+  TenPercentVectorBonus = Threshold / 2;
+
+  // Give out bonuses per argument, as the instructions setting them up will
+  // be gone after inlining.
+  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
+    if (TD && CS.isByValArgument(I)) {
+      // We approximate the number of loads and stores needed by dividing the
+      // size of the byval type by the target's pointer size.
+      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+      unsigned TypeSize = TD->getTypeSizeInBits(PTy->getElementType());
+      unsigned PointerSize = TD->getPointerSizeInBits();
+      // Ceiling division.
+      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+
+      // If it generates more than 8 stores it is likely to be expanded as an
+      // inline memcpy so we take that as an upper bound. Otherwise we assume
+      // one load and one store per word copied.
+      // FIXME: The maxStoresPerMemcpy setting from the target should be used
+      // here instead of a magic number of 8, but it's not available via
+      // DataLayout.
+      NumStores = std::min(NumStores, 8U);
+
+      Cost -= 2 * NumStores * InlineConstants::InstrCost;
+    } else {
+      // For non-byval arguments subtract off one instruction per call
+      // argument.
+      Cost -= InlineConstants::InstrCost;
+    }
+  }
+
+  // If there is only one call of the function, and it has internal linkage,
+  // the cost of inlining it drops dramatically.
+  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
+    &F == CS.getCalledFunction();
+  if (OnlyOneCallAndLocalLinkage)
+    Cost += InlineConstants::LastCallToStaticBonus;
+
+  // If the instruction after the call, or if the normal destination of the
+  // invoke is an unreachable instruction, the function is noreturn. As such,
+  // there is little point in inlining this unless there is literally zero
+  // cost.
+  Instruction *Instr = CS.getInstruction();
+  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
+    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
+      Threshold = 1;
+  } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
+    Threshold = 1;
+
+  // If this function uses the coldcc calling convention, prefer not to inline
+  // it.
+  if (F.getCallingConv() == CallingConv::Cold)
+    Cost += InlineConstants::ColdccPenalty;
+
+  // Check if we're done. This can happen due to bonuses and penalties.
+  if (Cost > Threshold)
+    return false;
+
+  if (F.empty())
+    return true;
+
+  Function *Caller = CS.getInstruction()->getParent()->getParent();
+  // Check if the caller function is recursive itself.
+  for (Value::use_iterator U = Caller->use_begin(), E = Caller->use_end();
+       U != E; ++U) {
+    CallSite Site(cast<Value>(*U));
+    if (!Site)
+      continue;
+    Instruction *I = Site.getInstruction();
+    if (I->getParent()->getParent() == Caller) {
+      IsCallerRecursive = true;
+      break;
+    }
+  }
+
+  // Track whether we've seen a return instruction. The first return
+  // instruction is free, as at least one will usually disappear in inlining.
+  bool HasReturn = false;
+
+  // Populate our simplified values by mapping from function arguments to call
+  // arguments with known important simplifications.
+  CallSite::arg_iterator CAI = CS.arg_begin();
+  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
+       FAI != FAE; ++FAI, ++CAI) {
+    assert(CAI != CS.arg_end());
+    if (Constant *C = dyn_cast<Constant>(CAI))
+      SimplifiedValues[FAI] = C;
+
+    Value *PtrArg = *CAI;
+    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
+      ConstantOffsetPtrs[FAI] = std::make_pair(PtrArg, C->getValue());
+
+      // We can SROA any pointer arguments derived from alloca instructions.
+      if (isa<AllocaInst>(PtrArg)) {
+        SROAArgValues[FAI] = PtrArg;
+        SROAArgCosts[PtrArg] = 0;
+      }
+    }
+  }
+  NumConstantArgs = SimplifiedValues.size();
+  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
+  NumAllocaArgs = SROAArgValues.size();
+
+  // The worklist of live basic blocks in the callee *after* inlining. We avoid
+  // adding basic blocks of the callee which can be proven to be dead for this
+  // particular call site in order to get more accurate cost estimates. This
+  // requires a somewhat heavyweight iteration pattern: we need to walk the
+  // basic blocks in a breadth-first order as we insert live successors. To
+  // accomplish this, prioritizing for small iterations because we exit after
+  // crossing our threshold, we use a small-size optimized SetVector.
+  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
+                                  SmallPtrSet<BasicBlock *, 16> > BBSetVector;
+  BBSetVector BBWorklist;
+  BBWorklist.insert(&F.getEntryBlock());
+  // Note that we *must not* cache the size, this loop grows the worklist.
+  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+    // Bail out the moment we cross the threshold. This means we'll under-count
+    // the cost, but only when undercounting doesn't matter.
+    if (Cost > (Threshold + VectorBonus))
+      break;
+
+    BasicBlock *BB = BBWorklist[Idx];
+    if (BB->empty())
+      continue;
+
+    // Handle the terminator cost here where we can track returns and other
+    // function-wide constructs.
+    TerminatorInst *TI = BB->getTerminator();
+
+    // We never want to inline functions that contain an indirectbr.  This is
+    // incorrect because all the blockaddress's (in static global initializers
+    // for example) would be referring to the original function, and this
+    // indirect jump would jump from the inlined copy of the function into the 
+    // original function which is extremely undefined behavior.
+    // FIXME: This logic isn't really right; we can safely inline functions
+    // with indirectbr's as long as no other function or global references the
+    // blockaddress of a block within the current function.  And as a QOI issue,
+    // if someone is using a blockaddress without an indirectbr, and that
+    // reference somehow ends up in another function or global, we probably
+    // don't want to inline this function.
+    if (isa<IndirectBrInst>(TI))
+      return false;
+
+    if (!HasReturn && isa<ReturnInst>(TI))
+      HasReturn = true;
+    else
+      Cost += InlineConstants::InstrCost;
+
+    // Analyze the cost of this block. If we blow through the threshold, this
+    // returns false, and we can bail on out.
+    if (!analyzeBlock(BB)) {
+      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+        return false;
+
+      // If the caller is a recursive function then we don't want to inline
+      // functions which allocate a lot of stack space because it would increase
+      // the caller stack usage dramatically.
+      if (IsCallerRecursive &&
+          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
+        return false;
+
+      break;
+    }
+
+    // Add in the live successors by first checking whether we have terminator
+    // that may be simplified based on the values simplified by this call.
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (BI->isConditional()) {
+        Value *Cond = BI->getCondition();
+        if (ConstantInt *SimpleCond
+              = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
+          continue;
+        }
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      Value *Cond = SI->getCondition();
+      if (ConstantInt *SimpleCond
+            = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
+        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
+        continue;
+      }
+    }
+
+    // If we're unable to select a particular successor, just count all of
+    // them.
+    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
+         ++TIdx)
+      BBWorklist.insert(TI->getSuccessor(TIdx));
+
+    // If we had any successors at this point, than post-inlining is likely to
+    // have them as well. Note that we assume any basic blocks which existed
+    // due to branches or switches which folded above will also fold after
+    // inlining.
+    if (SingleBB && TI->getNumSuccessors() > 1) {
+      // Take off the bonus we applied to the threshold.
+      Threshold -= SingleBBBonus;
+      SingleBB = false;
+    }
+  }
+
+  // If this is a noduplicate call, we can still inline as long as 
+  // inlining this would cause the removal of the caller (so the instruction
+  // is not actually duplicated, just moved).
+  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
+    return false;
+
+  Threshold += VectorBonus;
+
+  return Cost < Threshold;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// \brief Dump stats about this call's analysis.
+void CallAnalyzer::dump() {
+#define DEBUG_PRINT_STAT(x) llvm::dbgs() << "      " #x ": " << x << "\n"
+  DEBUG_PRINT_STAT(NumConstantArgs);
+  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
+  DEBUG_PRINT_STAT(NumAllocaArgs);
+  DEBUG_PRINT_STAT(NumConstantPtrCmps);
+  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
+  DEBUG_PRINT_STAT(NumInstructionsSimplified);
+  DEBUG_PRINT_STAT(SROACostSavings);
+  DEBUG_PRINT_STAT(SROACostSavingsLost);
+  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
+#undef DEBUG_PRINT_STAT
+}
+#endif
+
+INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
+                      true, true)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis",
+                    true, true)
+
+char InlineCostAnalysis::ID = 0;
+
+InlineCostAnalysis::InlineCostAnalysis() : CallGraphSCCPass(ID), TD(0) {}
+
+InlineCostAnalysis::~InlineCostAnalysis() {}
+
+void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<TargetTransformInfo>();
+  CallGraphSCCPass::getAnalysisUsage(AU);
+}
+
+bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) {
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TTI = &getAnalysis<TargetTransformInfo>();
+  return false;
+}
+
+InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
+  return getInlineCost(CS, CS.getCalledFunction(), Threshold);
+}
+
+InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
+                                             int Threshold) {
+  // Cannot inline indirect calls.
+  if (!Callee)
+    return llvm::InlineCost::getNever();
+
+  // Calls to functions with always-inline attributes should be inlined
+  // whenever possible.
+  if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                           Attribute::AlwaysInline)) {
+    if (isInlineViable(*Callee))
+      return llvm::InlineCost::getAlways();
+    return llvm::InlineCost::getNever();
+  }
+
+  // Don't inline functions which can be redefined at link-time to mean
+  // something else.  Don't inline functions marked noinline or call sites
+  // marked noinline.
+  if (Callee->mayBeOverridden() ||
+      Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                           Attribute::NoInline) ||
+      CS.isNoInline())
+    return llvm::InlineCost::getNever();
+
+  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
+        << "...\n");
+
+  CallAnalyzer CA(TD, *TTI, *Callee, Threshold);
+  bool ShouldInline = CA.analyzeCall(CS);
+
+  DEBUG(CA.dump());
+
+  // Check if there was a reason to force inlining or no inlining.
+  if (!ShouldInline && CA.getCost() < CA.getThreshold())
+    return InlineCost::getNever();
+  if (ShouldInline && CA.getCost() >= CA.getThreshold())
+    return InlineCost::getAlways();
+
+  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
+}
+
+bool InlineCostAnalysis::isInlineViable(Function &F) {
+  bool ReturnsTwice =
+    F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                   Attribute::ReturnsTwice);
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    // Disallow inlining of functions which contain an indirect branch.
+    if (isa<IndirectBrInst>(BI->getTerminator()))
+      return false;
+
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+         ++II) {
+      CallSite CS(II);
+      if (!CS)
+        continue;
+
+      // Disallow recursive calls.
+      if (&F == CS.getCalledFunction())
+        return false;
+
+      // Disallow calls which expose returns-twice to a function not previously
+      // attributed as such.
+      if (!ReturnsTwice && CS.isCall() &&
+          cast<CallInst>(CS.getInstruction())->canReturnTwice())
+        return false;
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Analysis/IVUsers.cpp b/contrib/llvm/lib/Analysis/IVUsers.cpp
new file mode 100644
index 000000000000..b33e2cb9999e
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/IVUsers.cpp
@@ -0,0 +1,335 @@
+//===- IVUsers.cpp - Induction Variable Users -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bookkeeping for "interesting" users of expressions
+// computed from induction variables.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "iv-users"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+char IVUsers::ID = 0;
+INITIALIZE_PASS_BEGIN(IVUsers, "iv-users",
+                      "Induction Variable Users", false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(IVUsers, "iv-users",
+                      "Induction Variable Users", false, true)
+
+Pass *llvm::createIVUsersPass() {
+  return new IVUsers();
+}
+
+/// isInteresting - Test whether the given expression is "interesting" when
+/// used by the given expression, within the context of analyzing the
+/// given loop.
+static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L,
+                          ScalarEvolution *SE, LoopInfo *LI) {
+  // An addrec is interesting if it's affine or if it has an interesting start.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    // Keep things simple. Don't touch loop-variant strides unless they're
+    // only used outside the loop and we can simplify them.
+    if (AR->getLoop() == L)
+      return AR->isAffine() ||
+             (!L->contains(I) &&
+              SE->getSCEVAtScope(AR, LI->getLoopFor(I->getParent())) != AR);
+    // Otherwise recurse to see if the start value is interesting, and that
+    // the step value is not interesting, since we don't yet know how to
+    // do effective SCEV expansions for addrecs with interesting steps.
+    return isInteresting(AR->getStart(), I, L, SE, LI) &&
+          !isInteresting(AR->getStepRecurrence(*SE), I, L, SE, LI);
+  }
+
+  // An add is interesting if exactly one of its operands is interesting.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    bool AnyInterestingYet = false;
+    for (SCEVAddExpr::op_iterator OI = Add->op_begin(), OE = Add->op_end();
+         OI != OE; ++OI)
+      if (isInteresting(*OI, I, L, SE, LI)) {
+        if (AnyInterestingYet)
+          return false;
+        AnyInterestingYet = true;
+      }
+    return AnyInterestingYet;
+  }
+
+  // Nothing else is interesting here.
+  return false;
+}
+
+/// Return true if all loop headers that dominate this block are in simplified
+/// form.
+static bool isSimplifiedLoopNest(BasicBlock *BB, const DominatorTree *DT,
+                                 const LoopInfo *LI,
+                                 SmallPtrSet<Loop*,16> &SimpleLoopNests) {
+  Loop *NearestLoop = 0;
+  for (DomTreeNode *Rung = DT->getNode(BB);
+       Rung; Rung = Rung->getIDom()) {
+    BasicBlock *DomBB = Rung->getBlock();
+    Loop *DomLoop = LI->getLoopFor(DomBB);
+    if (DomLoop && DomLoop->getHeader() == DomBB) {
+      // If the domtree walk reaches a loop with no preheader, return false.
+      if (!DomLoop->isLoopSimplifyForm())
+        return false;
+      // If we have already checked this loop nest, stop checking.
+      if (SimpleLoopNests.count(DomLoop))
+        break;
+      // If we have not already checked this loop nest, remember the loop
+      // header nearest to BB. The nearest loop may not contain BB.
+      if (!NearestLoop)
+        NearestLoop = DomLoop;
+    }
+  }
+  if (NearestLoop)
+    SimpleLoopNests.insert(NearestLoop);
+  return true;
+}
+
+/// AddUsersImpl - Inspect the specified instruction.  If it is a
+/// reducible SCEV, recursively add its users to the IVUsesByStride set and
+/// return true.  Otherwise, return false.
+bool IVUsers::AddUsersImpl(Instruction *I,
+                           SmallPtrSet<Loop*,16> &SimpleLoopNests) {
+  // Add this IV user to the Processed set before returning false to ensure that
+  // all IV users are members of the set. See IVUsers::isIVUserOrOperand.
+  if (!Processed.insert(I))
+    return true;    // Instruction already handled.
+
+  if (!SE->isSCEVable(I->getType()))
+    return false;   // Void and FP expressions cannot be reduced.
+
+  // IVUsers is used by LSR which assumes that all SCEV expressions are safe to
+  // pass to SCEVExpander. Expressions are not safe to expand if they represent
+  // operations that are not safe to speculate, namely integer division.
+  if (!isa<PHINode>(I) && !isSafeToSpeculativelyExecute(I, TD))
+    return false;
+
+  // LSR is not APInt clean, do not touch integers bigger than 64-bits.
+  // Also avoid creating IVs of non-native types. For example, we don't want a
+  // 64-bit IV in 32-bit code just because the loop has one 64-bit cast.
+  uint64_t Width = SE->getTypeSizeInBits(I->getType());
+  if (Width > 64 || (TD && !TD->isLegalInteger(Width)))
+    return false;
+
+  // Get the symbolic expression for this instruction.
+  const SCEV *ISE = SE->getSCEV(I);
+
+  // If we've come to an uninteresting expression, stop the traversal and
+  // call this a user.
+  if (!isInteresting(ISE, I, L, SE, LI))
+    return false;
+
+  SmallPtrSet<Instruction *, 4> UniqueUsers;
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    if (!UniqueUsers.insert(User))
+      continue;
+
+    // Do not infinitely recurse on PHI nodes.
+    if (isa<PHINode>(User) && Processed.count(User))
+      continue;
+
+    // Only consider IVUsers that are dominated by simplified loop
+    // headers. Otherwise, SCEVExpander will crash.
+    BasicBlock *UseBB = User->getParent();
+    // A phi's use is live out of its predecessor block.
+    if (PHINode *PHI = dyn_cast<PHINode>(User)) {
+      unsigned OperandNo = UI.getOperandNo();
+      unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
+      UseBB = PHI->getIncomingBlock(ValNo);
+    }
+    if (!isSimplifiedLoopNest(UseBB, DT, LI, SimpleLoopNests))
+      return false;
+
+    // Descend recursively, but not into PHI nodes outside the current loop.
+    // It's important to see the entire expression outside the loop to get
+    // choices that depend on addressing mode use right, although we won't
+    // consider references outside the loop in all cases.
+    // If User is already in Processed, we don't want to recurse into it again,
+    // but do want to record a second reference in the same instruction.
+    bool AddUserToIVUsers = false;
+    if (LI->getLoopFor(User->getParent()) != L) {
+      if (isa<PHINode>(User) || Processed.count(User) ||
+          !AddUsersImpl(User, SimpleLoopNests)) {
+        DEBUG(dbgs() << "FOUND USER in other loop: " << *User << '\n'
+                     << "   OF SCEV: " << *ISE << '\n');
+        AddUserToIVUsers = true;
+      }
+    } else if (Processed.count(User) || !AddUsersImpl(User, SimpleLoopNests)) {
+      DEBUG(dbgs() << "FOUND USER: " << *User << '\n'
+                   << "   OF SCEV: " << *ISE << '\n');
+      AddUserToIVUsers = true;
+    }
+
+    if (AddUserToIVUsers) {
+      // Okay, we found a user that we cannot reduce.
+      IVUses.push_back(new IVStrideUse(this, User, I));
+      IVStrideUse &NewUse = IVUses.back();
+      // Autodetect the post-inc loop set, populating NewUse.PostIncLoops.
+      // The regular return value here is discarded; instead of recording
+      // it, we just recompute it when we need it.
+      ISE = TransformForPostIncUse(NormalizeAutodetect,
+                                   ISE, User, I,
+                                   NewUse.PostIncLoops,
+                                   *SE, *DT);
+      DEBUG(if (SE->getSCEV(I) != ISE)
+              dbgs() << "   NORMALIZED TO: " << *ISE << '\n');
+    }
+  }
+  return true;
+}
+
+bool IVUsers::AddUsersIfInteresting(Instruction *I) {
+  // SCEVExpander can only handle users that are dominated by simplified loop
+  // entries. Keep track of all loops that are only dominated by other simple
+  // loops so we don't traverse the domtree for each user.
+  SmallPtrSet<Loop*,16> SimpleLoopNests;
+
+  return AddUsersImpl(I, SimpleLoopNests);
+}
+
+IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) {
+  IVUses.push_back(new IVStrideUse(this, User, Operand));
+  return IVUses.back();
+}
+
+IVUsers::IVUsers()
+    : LoopPass(ID) {
+  initializeIVUsersPass(*PassRegistry::getPassRegistry());
+}
+
+void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<LoopInfo>();
+  AU.addRequired<DominatorTree>();
+  AU.addRequired<ScalarEvolution>();
+  AU.setPreservesAll();
+}
+
+bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
+
+  L = l;
+  LI = &getAnalysis<LoopInfo>();
+  DT = &getAnalysis<DominatorTree>();
+  SE = &getAnalysis<ScalarEvolution>();
+  TD = getAnalysisIfAvailable<DataLayout>();
+
+  // Find all uses of induction variables in this loop, and categorize
+  // them by stride.  Start by finding all of the PHI nodes in the header for
+  // this loop.  If they are induction variables, inspect their uses.
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
+    (void)AddUsersIfInteresting(I);
+
+  return false;
+}
+
+void IVUsers::print(raw_ostream &OS, const Module *M) const {
+  OS << "IV Users for loop ";
+  WriteAsOperand(OS, L->getHeader(), false);
+  if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+    OS << " with backedge-taken count "
+       << *SE->getBackedgeTakenCount(L);
+  }
+  OS << ":\n";
+
+  for (ilist<IVStrideUse>::const_iterator UI = IVUses.begin(),
+       E = IVUses.end(); UI != E; ++UI) {
+    OS << "  ";
+    WriteAsOperand(OS, UI->getOperandValToReplace(), false);
+    OS << " = " << *getReplacementExpr(*UI);
+    for (PostIncLoopSet::const_iterator
+         I = UI->PostIncLoops.begin(),
+         E = UI->PostIncLoops.end(); I != E; ++I) {
+      OS << " (post-inc with loop ";
+      WriteAsOperand(OS, (*I)->getHeader(), false);
+      OS << ")";
+    }
+    OS << " in  ";
+    UI->getUser()->print(OS);
+    OS << '\n';
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void IVUsers::dump() const {
+  print(dbgs());
+}
+#endif
+
+void IVUsers::releaseMemory() {
+  Processed.clear();
+  IVUses.clear();
+}
+
+/// getReplacementExpr - Return a SCEV expression which computes the
+/// value of the OperandValToReplace.
+const SCEV *IVUsers::getReplacementExpr(const IVStrideUse &IU) const {
+  return SE->getSCEV(IU.getOperandValToReplace());
+}
+
+/// getExpr - Return the expression for the use.
+const SCEV *IVUsers::getExpr(const IVStrideUse &IU) const {
+  return
+    TransformForPostIncUse(Normalize, getReplacementExpr(IU),
+                           IU.getUser(), IU.getOperandValToReplace(),
+                           const_cast<PostIncLoopSet &>(IU.getPostIncLoops()),
+                           *SE, *DT);
+}
+
+static const SCEVAddRecExpr *findAddRecForLoop(const SCEV *S, const Loop *L) {
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    if (AR->getLoop() == L)
+      return AR;
+    return findAddRecForLoop(AR->getStart(), L);
+  }
+
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+         I != E; ++I)
+      if (const SCEVAddRecExpr *AR = findAddRecForLoop(*I, L))
+        return AR;
+    return 0;
+  }
+
+  return 0;
+}
+
+const SCEV *IVUsers::getStride(const IVStrideUse &IU, const Loop *L) const {
+  if (const SCEVAddRecExpr *AR = findAddRecForLoop(getExpr(IU), L))
+    return AR->getStepRecurrence(*SE);
+  return 0;
+}
+
+void IVStrideUse::transformToPostInc(const Loop *L) {
+  PostIncLoops.insert(L);
+}
+
+void IVStrideUse::deleted() {
+  // Remove this user from the list.
+  Parent->Processed.erase(this->getUser());
+  Parent->IVUses.erase(this);
+  // this now dangles!
+}
diff --git a/contrib/llvm/lib/Analysis/InstCount.cpp b/contrib/llvm/lib/Analysis/InstCount.cpp
new file mode 100644
index 000000000000..75a49eb90a88
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/InstCount.cpp
@@ -0,0 +1,87 @@
+//===-- InstCount.cpp - Collects the count of all instructions ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass collects the count of all instructions and reports them
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instcount"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(TotalInsts , "Number of instructions (of all types)");
+STATISTIC(TotalBlocks, "Number of basic blocks");
+STATISTIC(TotalFuncs , "Number of non-external functions");
+STATISTIC(TotalMemInst, "Number of memory instructions");
+
+#define HANDLE_INST(N, OPCODE, CLASS) \
+  STATISTIC(Num ## OPCODE ## Inst, "Number of " #OPCODE " insts");
+
+#include "llvm/IR/Instruction.def"
+
+
+namespace {
+  class InstCount : public FunctionPass, public InstVisitor<InstCount> {
+    friend class InstVisitor<InstCount>;
+
+    void visitFunction  (Function &F) { ++TotalFuncs; }
+    void visitBasicBlock(BasicBlock &BB) { ++TotalBlocks; }
+
+#define HANDLE_INST(N, OPCODE, CLASS) \
+    void visit##OPCODE(CLASS &) { ++Num##OPCODE##Inst; ++TotalInsts; }
+
+#include "llvm/IR/Instruction.def"
+
+    void visitInstruction(Instruction &I) {
+      errs() << "Instruction Count does not know about " << I;
+      llvm_unreachable(0);
+    }
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    InstCount() : FunctionPass(ID) {
+      initializeInstCountPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+    virtual void print(raw_ostream &O, const Module *M) const {}
+
+  };
+}
+
+char InstCount::ID = 0;
+INITIALIZE_PASS(InstCount, "instcount",
+                "Counts the various types of Instructions", false, true)
+
+FunctionPass *llvm::createInstCountPass() { return new InstCount(); }
+
+// InstCount::run - This is the main Analysis entry point for a
+// function.
+//
+bool InstCount::runOnFunction(Function &F) {
+  unsigned StartMemInsts =
+    NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
+    NumInvokeInst + NumAllocaInst;
+  visit(F);
+  unsigned EndMemInsts =
+    NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
+    NumInvokeInst + NumAllocaInst;
+  TotalMemInst += EndMemInsts-StartMemInsts;
+  return false;
+}
diff --git a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp
new file mode 100644
index 000000000000..4a3c74e9db35
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -0,0 +1,3215 @@
+//===- InstructionSimplify.cpp - Fold instruction operands ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements routines for folding instructions into simpler forms
+// that do not require creating new instructions.  This does constant folding
+// ("add i32 1, 1" -> "2") but can also handle non-constant operands, either
+// returning a constant ("and i32 %x, 0" -> "0") or an already existing value
+// ("and i32 %x, %x" -> "%x").  All operands are assumed to have already been
+// simplified: This is usually true and assuming it simplifies the logic (if
+// they have not been simplified then results are correct but maybe suboptimal).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instsimplify"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Support/ValueHandle.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+enum { RecursionLimit = 3 };
+
+STATISTIC(NumExpand,  "Number of expansions");
+STATISTIC(NumFactor , "Number of factorizations");
+STATISTIC(NumReassoc, "Number of reassociations");
+
+struct Query {
+  const DataLayout *TD;
+  const TargetLibraryInfo *TLI;
+  const DominatorTree *DT;
+
+  Query(const DataLayout *td, const TargetLibraryInfo *tli,
+        const DominatorTree *dt) : TD(td), TLI(tli), DT(dt) {}
+};
+
+static Value *SimplifyAndInst(Value *, Value *, const Query &, unsigned);
+static Value *SimplifyBinOp(unsigned, Value *, Value *, const Query &,
+                            unsigned);
+static Value *SimplifyCmpInst(unsigned, Value *, Value *, const Query &,
+                              unsigned);
+static Value *SimplifyOrInst(Value *, Value *, const Query &, unsigned);
+static Value *SimplifyXorInst(Value *, Value *, const Query &, unsigned);
+static Value *SimplifyTruncInst(Value *, Type *, const Query &, unsigned);
+
+/// getFalse - For a boolean type, or a vector of boolean type, return false, or
+/// a vector with every element false, as appropriate for the type.
+static Constant *getFalse(Type *Ty) {
+  assert(Ty->getScalarType()->isIntegerTy(1) &&
+         "Expected i1 type or a vector of i1!");
+  return Constant::getNullValue(Ty);
+}
+
+/// getTrue - For a boolean type, or a vector of boolean type, return true, or
+/// a vector with every element true, as appropriate for the type.
+static Constant *getTrue(Type *Ty) {
+  assert(Ty->getScalarType()->isIntegerTy(1) &&
+         "Expected i1 type or a vector of i1!");
+  return Constant::getAllOnesValue(Ty);
+}
+
+/// isSameCompare - Is V equivalent to the comparison "LHS Pred RHS"?
+static bool isSameCompare(Value *V, CmpInst::Predicate Pred, Value *LHS,
+                          Value *RHS) {
+  CmpInst *Cmp = dyn_cast<CmpInst>(V);
+  if (!Cmp)
+    return false;
+  CmpInst::Predicate CPred = Cmp->getPredicate();
+  Value *CLHS = Cmp->getOperand(0), *CRHS = Cmp->getOperand(1);
+  if (CPred == Pred && CLHS == LHS && CRHS == RHS)
+    return true;
+  return CPred == CmpInst::getSwappedPredicate(Pred) && CLHS == RHS &&
+    CRHS == LHS;
+}
+
+/// ValueDominatesPHI - Does the given value dominate the specified phi node?
+static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    // Arguments and constants dominate all instructions.
+    return true;
+
+  // If we are processing instructions (and/or basic blocks) that have not been
+  // fully added to a function, the parent nodes may still be null. Simply
+  // return the conservative answer in these cases.
+  if (!I->getParent() || !P->getParent() || !I->getParent()->getParent())
+    return false;
+
+  // If we have a DominatorTree then do a precise test.
+  if (DT) {
+    if (!DT->isReachableFromEntry(P->getParent()))
+      return true;
+    if (!DT->isReachableFromEntry(I->getParent()))
+      return false;
+    return DT->dominates(I, P);
+  }
+
+  // Otherwise, if the instruction is in the entry block, and is not an invoke,
+  // then it obviously dominates all phi nodes.
+  if (I->getParent() == &I->getParent()->getParent()->getEntryBlock() &&
+      !isa<InvokeInst>(I))
+    return true;
+
+  return false;
+}
+
+/// ExpandBinOp - Simplify "A op (B op' C)" by distributing op over op', turning
+/// it into "(A op B) op' (A op C)".  Here "op" is given by Opcode and "op'" is
+/// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS.
+/// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)".
+/// Returns the simplified value, or null if no simplification was performed.
+static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS,
+                          unsigned OpcToExpand, const Query &Q,
+                          unsigned MaxRecurse) {
+  Instruction::BinaryOps OpcodeToExpand = (Instruction::BinaryOps)OpcToExpand;
+  // Recursion is always used, so bail out at once if we already hit the limit.
+  if (!MaxRecurse--)
+    return 0;
+
+  // Check whether the expression has the form "(A op' B) op C".
+  if (BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS))
+    if (Op0->getOpcode() == OpcodeToExpand) {
+      // It does!  Try turning it into "(A op C) op' (B op C)".
+      Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
+      // Do "A op C" and "B op C" both simplify?
+      if (Value *L = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse))
+        if (Value *R = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) {
+          // They do! Return "L op' R" if it simplifies or is already available.
+          // If "L op' R" equals "A op' B" then "L op' R" is just the LHS.
+          if ((L == A && R == B) || (Instruction::isCommutative(OpcodeToExpand)
+                                     && L == B && R == A)) {
+            ++NumExpand;
+            return LHS;
+          }
+          // Otherwise return "L op' R" if it simplifies.
+          if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) {
+            ++NumExpand;
+            return V;
+          }
+        }
+    }
+
+  // Check whether the expression has the form "A op (B op' C)".
+  if (BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS))
+    if (Op1->getOpcode() == OpcodeToExpand) {
+      // It does!  Try turning it into "(A op B) op' (A op C)".
+      Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
+      // Do "A op B" and "A op C" both simplify?
+      if (Value *L = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse))
+        if (Value *R = SimplifyBinOp(Opcode, A, C, Q, MaxRecurse)) {
+          // They do! Return "L op' R" if it simplifies or is already available.
+          // If "L op' R" equals "B op' C" then "L op' R" is just the RHS.
+          if ((L == B && R == C) || (Instruction::isCommutative(OpcodeToExpand)
+                                     && L == C && R == B)) {
+            ++NumExpand;
+            return RHS;
+          }
+          // Otherwise return "L op' R" if it simplifies.
+          if (Value *V = SimplifyBinOp(OpcodeToExpand, L, R, Q, MaxRecurse)) {
+            ++NumExpand;
+            return V;
+          }
+        }
+    }
+
+  return 0;
+}
+
+/// FactorizeBinOp - Simplify "LHS Opcode RHS" by factorizing out a common term
+/// using the operation OpCodeToExtract.  For example, when Opcode is Add and
+/// OpCodeToExtract is Mul then this tries to turn "(A*B)+(A*C)" into "A*(B+C)".
+/// Returns the simplified value, or null if no simplification was performed.
+static Value *FactorizeBinOp(unsigned Opcode, Value *LHS, Value *RHS,
+                             unsigned OpcToExtract, const Query &Q,
+                             unsigned MaxRecurse) {
+  Instruction::BinaryOps OpcodeToExtract = (Instruction::BinaryOps)OpcToExtract;
+  // Recursion is always used, so bail out at once if we already hit the limit.
+  if (!MaxRecurse--)
+    return 0;
+
+  BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
+  BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
+
+  if (!Op0 || Op0->getOpcode() != OpcodeToExtract ||
+      !Op1 || Op1->getOpcode() != OpcodeToExtract)
+    return 0;
+
+  // The expression has the form "(A op' B) op (C op' D)".
+  Value *A = Op0->getOperand(0), *B = Op0->getOperand(1);
+  Value *C = Op1->getOperand(0), *D = Op1->getOperand(1);
+
+  // Use left distributivity, i.e. "X op' (Y op Z) = (X op' Y) op (X op' Z)".
+  // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
+  // commutative case, "(A op' B) op (C op' A)"?
+  if (A == C || (Instruction::isCommutative(OpcodeToExtract) && A == D)) {
+    Value *DD = A == C ? D : C;
+    // Form "A op' (B op DD)" if it simplifies completely.
+    // Does "B op DD" simplify?
+    if (Value *V = SimplifyBinOp(Opcode, B, DD, Q, MaxRecurse)) {
+      // It does!  Return "A op' V" if it simplifies or is already available.
+      // If V equals B then "A op' V" is just the LHS.  If V equals DD then
+      // "A op' V" is just the RHS.
+      if (V == B || V == DD) {
+        ++NumFactor;
+        return V == B ? LHS : RHS;
+      }
+      // Otherwise return "A op' V" if it simplifies.
+      if (Value *W = SimplifyBinOp(OpcodeToExtract, A, V, Q, MaxRecurse)) {
+        ++NumFactor;
+        return W;
+      }
+    }
+  }
+
+  // Use right distributivity, i.e. "(X op Y) op' Z = (X op' Z) op (Y op' Z)".
+  // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
+  // commutative case, "(A op' B) op (B op' D)"?
+  if (B == D || (Instruction::isCommutative(OpcodeToExtract) && B == C)) {
+    Value *CC = B == D ? C : D;
+    // Form "(A op CC) op' B" if it simplifies completely..
+    // Does "A op CC" simplify?
+    if (Value *V = SimplifyBinOp(Opcode, A, CC, Q, MaxRecurse)) {
+      // It does!  Return "V op' B" if it simplifies or is already available.
+      // If V equals A then "V op' B" is just the LHS.  If V equals CC then
+      // "V op' B" is just the RHS.
+      if (V == A || V == CC) {
+        ++NumFactor;
+        return V == A ? LHS : RHS;
+      }
+      // Otherwise return "V op' B" if it simplifies.
+      if (Value *W = SimplifyBinOp(OpcodeToExtract, V, B, Q, MaxRecurse)) {
+        ++NumFactor;
+        return W;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/// SimplifyAssociativeBinOp - Generic simplifications for associative binary
+/// operations.  Returns the simpler value, or null if none was found.
+static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS,
+                                       const Query &Q, unsigned MaxRecurse) {
+  Instruction::BinaryOps Opcode = (Instruction::BinaryOps)Opc;
+  assert(Instruction::isAssociative(Opcode) && "Not an associative operation!");
+
+  // Recursion is always used, so bail out at once if we already hit the limit.
+  if (!MaxRecurse--)
+    return 0;
+
+  BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
+  BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
+
+  // Transform: "(A op B) op C" ==> "A op (B op C)" if it simplifies completely.
+  if (Op0 && Op0->getOpcode() == Opcode) {
+    Value *A = Op0->getOperand(0);
+    Value *B = Op0->getOperand(1);
+    Value *C = RHS;
+
+    // Does "B op C" simplify?
+    if (Value *V = SimplifyBinOp(Opcode, B, C, Q, MaxRecurse)) {
+      // It does!  Return "A op V" if it simplifies or is already available.
+      // If V equals B then "A op V" is just the LHS.
+      if (V == B) return LHS;
+      // Otherwise return "A op V" if it simplifies.
+      if (Value *W = SimplifyBinOp(Opcode, A, V, Q, MaxRecurse)) {
+        ++NumReassoc;
+        return W;
+      }
+    }
+  }
+
+  // Transform: "A op (B op C)" ==> "(A op B) op C" if it simplifies completely.
+  if (Op1 && Op1->getOpcode() == Opcode) {
+    Value *A = LHS;
+    Value *B = Op1->getOperand(0);
+    Value *C = Op1->getOperand(1);
+
+    // Does "A op B" simplify?
+    if (Value *V = SimplifyBinOp(Opcode, A, B, Q, MaxRecurse)) {
+      // It does!  Return "V op C" if it simplifies or is already available.
+      // If V equals B then "V op C" is just the RHS.
+      if (V == B) return RHS;
+      // Otherwise return "V op C" if it simplifies.
+      if (Value *W = SimplifyBinOp(Opcode, V, C, Q, MaxRecurse)) {
+        ++NumReassoc;
+        return W;
+      }
+    }
+  }
+
+  // The remaining transforms require commutativity as well as associativity.
+  if (!Instruction::isCommutative(Opcode))
+    return 0;
+
+  // Transform: "(A op B) op C" ==> "(C op A) op B" if it simplifies completely.
+  if (Op0 && Op0->getOpcode() == Opcode) {
+    Value *A = Op0->getOperand(0);
+    Value *B = Op0->getOperand(1);
+    Value *C = RHS;
+
+    // Does "C op A" simplify?
+    if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) {
+      // It does!  Return "V op B" if it simplifies or is already available.
+      // If V equals A then "V op B" is just the LHS.
+      if (V == A) return LHS;
+      // Otherwise return "V op B" if it simplifies.
+      if (Value *W = SimplifyBinOp(Opcode, V, B, Q, MaxRecurse)) {
+        ++NumReassoc;
+        return W;
+      }
+    }
+  }
+
+  // Transform: "A op (B op C)" ==> "B op (C op A)" if it simplifies completely.
+  if (Op1 && Op1->getOpcode() == Opcode) {
+    Value *A = LHS;
+    Value *B = Op1->getOperand(0);
+    Value *C = Op1->getOperand(1);
+
+    // Does "C op A" simplify?
+    if (Value *V = SimplifyBinOp(Opcode, C, A, Q, MaxRecurse)) {
+      // It does!  Return "B op V" if it simplifies or is already available.
+      // If V equals C then "B op V" is just the RHS.
+      if (V == C) return RHS;
+      // Otherwise return "B op V" if it simplifies.
+      if (Value *W = SimplifyBinOp(Opcode, B, V, Q, MaxRecurse)) {
+        ++NumReassoc;
+        return W;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/// ThreadBinOpOverSelect - In the case of a binary operation with a select
+/// instruction as an operand, try to simplify the binop by seeing whether
+/// evaluating it on both branches of the select results in the same value.
+/// Returns the common value if so, otherwise returns null.
+static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS,
+                                    const Query &Q, unsigned MaxRecurse) {
+  // Recursion is always used, so bail out at once if we already hit the limit.
+  if (!MaxRecurse--)
+    return 0;
+
+  SelectInst *SI;
+  if (isa<SelectInst>(LHS)) {
+    SI = cast<SelectInst>(LHS);
+  } else {
+    assert(isa<SelectInst>(RHS) && "No select instruction operand!");
+    SI = cast<SelectInst>(RHS);
+  }
+
+  // Evaluate the BinOp on the true and false branches of the select.
+  Value *TV;
+  Value *FV;
+  if (SI == LHS) {
+    TV = SimplifyBinOp(Opcode, SI->getTrueValue(), RHS, Q, MaxRecurse);
+    FV = SimplifyBinOp(Opcode, SI->getFalseValue(), RHS, Q, MaxRecurse);
+  } else {
+    TV = SimplifyBinOp(Opcode, LHS, SI->getTrueValue(), Q, MaxRecurse);
+    FV = SimplifyBinOp(Opcode, LHS, SI->getFalseValue(), Q, MaxRecurse);
+  }
+
+  // If they simplified to the same value, then return the common value.
+  // If they both failed to simplify then return null.
+  if (TV == FV)
+    return TV;
+
+  // If one branch simplified to undef, return the other one.
+  if (TV && isa<UndefValue>(TV))
+    return FV;
+  if (FV && isa<UndefValue>(FV))
+    return TV;
+
+  // If applying the operation did not change the true and false select values,
+  // then the result of the binop is the select itself.
+  if (TV == SI->getTrueValue() && FV == SI->getFalseValue())
+    return SI;
+
+  // If one branch simplified and the other did not, and the simplified
+  // value is equal to the unsimplified one, return the simplified value.
+  // For example, select (cond, X, X & Z) & Z -> X & Z.
+  if ((FV && !TV) || (TV && !FV)) {
+    // Check that the simplified value has the form "X op Y" where "op" is the
+    // same as the original operation.
+    Instruction *Simplified = dyn_cast<Instruction>(FV ? FV : TV);
+    if (Simplified && Simplified->getOpcode() == Opcode) {
+      // The value that didn't simplify is "UnsimplifiedLHS op UnsimplifiedRHS".
+      // We already know that "op" is the same as for the simplified value.  See
+      // if the operands match too.  If so, return the simplified value.
+      Value *UnsimplifiedBranch = FV ? SI->getTrueValue() : SI->getFalseValue();
+      Value *UnsimplifiedLHS = SI == LHS ? UnsimplifiedBranch : LHS;
+      Value *UnsimplifiedRHS = SI == LHS ? RHS : UnsimplifiedBranch;
+      if (Simplified->getOperand(0) == UnsimplifiedLHS &&
+          Simplified->getOperand(1) == UnsimplifiedRHS)
+        return Simplified;
+      if (Simplified->isCommutative() &&
+          Simplified->getOperand(1) == UnsimplifiedLHS &&
+          Simplified->getOperand(0) == UnsimplifiedRHS)
+        return Simplified;
+    }
+  }
+
+  return 0;
+}
+
+/// ThreadCmpOverSelect - In the case of a comparison with a select instruction,
+/// try to simplify the comparison by seeing whether both branches of the select
+/// result in the same value.  Returns the common value if so, otherwise returns
+/// null.
+static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS,
+                                  Value *RHS, const Query &Q,
+                                  unsigned MaxRecurse) {
+  // Recursion is always used, so bail out at once if we already hit the limit.
+  if (!MaxRecurse--)
+    return 0;
+
+  // Make sure the select is on the LHS.
+  if (!isa<SelectInst>(LHS)) {
+    std::swap(LHS, RHS);
+    Pred = CmpInst::getSwappedPredicate(Pred);
+  }
+  assert(isa<SelectInst>(LHS) && "Not comparing with a select instruction!");
+  SelectInst *SI = cast<SelectInst>(LHS);
+  Value *Cond = SI->getCondition();
+  Value *TV = SI->getTrueValue();
+  Value *FV = SI->getFalseValue();
+
+  // Now that we have "cmp select(Cond, TV, FV), RHS", analyse it.
+  // Does "cmp TV, RHS" simplify?
+  Value *TCmp = SimplifyCmpInst(Pred, TV, RHS, Q, MaxRecurse);
+  if (TCmp == Cond) {
+    // It not only simplified, it simplified to the select condition.  Replace
+    // it with 'true'.
+    TCmp = getTrue(Cond->getType());
+  } else if (!TCmp) {
+    // It didn't simplify.  However if "cmp TV, RHS" is equal to the select
+    // condition then we can replace it with 'true'.  Otherwise give up.
+    if (!isSameCompare(Cond, Pred, TV, RHS))
+      return 0;
+    TCmp = getTrue(Cond->getType());
+  }
+
+  // Does "cmp FV, RHS" simplify?
+  Value *FCmp = SimplifyCmpInst(Pred, FV, RHS, Q, MaxRecurse);
+  if (FCmp == Cond) {
+    // It not only simplified, it simplified to the select condition.  Replace
+    // it with 'false'.
+    FCmp = getFalse(Cond->getType());
+  } else if (!FCmp) {
+    // It didn't simplify.  However if "cmp FV, RHS" is equal to the select
+    // condition then we can replace it with 'false'.  Otherwise give up.
+    if (!isSameCompare(Cond, Pred, FV, RHS))
+      return 0;
+    FCmp = getFalse(Cond->getType());
+  }
+
+  // If both sides simplified to the same value, then use it as the result of
+  // the original comparison.
+  if (TCmp == FCmp)
+    return TCmp;
+
+  // The remaining cases only make sense if the select condition has the same
+  // type as the result of the comparison, so bail out if this is not so.
+  if (Cond->getType()->isVectorTy() != RHS->getType()->isVectorTy())
+    return 0;
+  // If the false value simplified to false, then the result of the compare
+  // is equal to "Cond && TCmp".  This also catches the case when the false
+  // value simplified to false and the true value to true, returning "Cond".
+  if (match(FCmp, m_Zero()))
+    if (Value *V = SimplifyAndInst(Cond, TCmp, Q, MaxRecurse))
+      return V;
+  // If the true value simplified to true, then the result of the compare
+  // is equal to "Cond || FCmp".
+  if (match(TCmp, m_One()))
+    if (Value *V = SimplifyOrInst(Cond, FCmp, Q, MaxRecurse))
+      return V;
+  // Finally, if the false value simplified to true and the true value to
+  // false, then the result of the compare is equal to "!Cond".
+  if (match(FCmp, m_One()) && match(TCmp, m_Zero()))
+    if (Value *V =
+        SimplifyXorInst(Cond, Constant::getAllOnesValue(Cond->getType()),
+                        Q, MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+/// ThreadBinOpOverPHI - In the case of a binary operation with an operand that
+/// is a PHI instruction, try to simplify the binop by seeing whether evaluating
+/// it on the incoming phi values yields the same result for every value.  If so
+/// returns the common value, otherwise returns null.
+static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS,
+                                 const Query &Q, unsigned MaxRecurse) {
+  // Recursion is always used, so bail out at once if we already hit the limit.
+  if (!MaxRecurse--)
+    return 0;
+
+  PHINode *PI;
+  if (isa<PHINode>(LHS)) {
+    PI = cast<PHINode>(LHS);
+    // Bail out if RHS and the phi may be mutually interdependent due to a loop.
+    if (!ValueDominatesPHI(RHS, PI, Q.DT))
+      return 0;
+  } else {
+    assert(isa<PHINode>(RHS) && "No PHI instruction operand!");
+    PI = cast<PHINode>(RHS);
+    // Bail out if LHS and the phi may be mutually interdependent due to a loop.
+    if (!ValueDominatesPHI(LHS, PI, Q.DT))
+      return 0;
+  }
+
+  // Evaluate the BinOp on the incoming phi values.
+  Value *CommonValue = 0;
+  for (unsigned i = 0, e = PI->getNumIncomingValues(); i != e; ++i) {
+    Value *Incoming = PI->getIncomingValue(i);
+    // If the incoming value is the phi node itself, it can safely be skipped.
+    if (Incoming == PI) continue;
+    Value *V = PI == LHS ?
+      SimplifyBinOp(Opcode, Incoming, RHS, Q, MaxRecurse) :
+      SimplifyBinOp(Opcode, LHS, Incoming, Q, MaxRecurse);
+    // If the operation failed to simplify, or simplified to a different value
+    // to previously, then give up.
+    if (!V || (CommonValue && V != CommonValue))
+      return 0;
+    CommonValue = V;
+  }
+
+  return CommonValue;
+}
+
+/// ThreadCmpOverPHI - In the case of a comparison with a PHI instruction, try
+/// try to simplify the comparison by seeing whether comparing with all of the
+/// incoming phi values yields the same result every time.  If so returns the
+/// common result, otherwise returns null.
+static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
+                               const Query &Q, unsigned MaxRecurse) {
+  // Recursion is always used, so bail out at once if we already hit the limit.
+  if (!MaxRecurse--)
+    return 0;
+
+  // Make sure the phi is on the LHS.
+  if (!isa<PHINode>(LHS)) {
+    std::swap(LHS, RHS);
+    Pred = CmpInst::getSwappedPredicate(Pred);
+  }
+  assert(isa<PHINode>(LHS) && "Not comparing with a phi instruction!");
+  PHINode *PI = cast<PHINode>(LHS);
+
+  // Bail out if RHS and the phi may be mutually interdependent due to a loop.
+  if (!ValueDominatesPHI(RHS, PI, Q.DT))
+    return 0;
+
+  // Evaluate the BinOp on the incoming phi values.
+  Value *CommonValue = 0;
+  for (unsigned i = 0, e = PI->getNumIncomingValues(); i != e; ++i) {
+    Value *Incoming = PI->getIncomingValue(i);
+    // If the incoming value is the phi node itself, it can safely be skipped.
+    if (Incoming == PI) continue;
+    Value *V = SimplifyCmpInst(Pred, Incoming, RHS, Q, MaxRecurse);
+    // If the operation failed to simplify, or simplified to a different value
+    // to previously, then give up.
+    if (!V || (CommonValue && V != CommonValue))
+      return 0;
+    CommonValue = V;
+  }
+
+  return CommonValue;
+}
+
+/// SimplifyAddInst - Given operands for an Add, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
+                              const Query &Q, unsigned MaxRecurse) {
+  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
+    if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { CLHS, CRHS };
+      return ConstantFoldInstOperands(Instruction::Add, CLHS->getType(), Ops,
+                                      Q.TD, Q.TLI);
+    }
+
+    // Canonicalize the constant to the RHS.
+    std::swap(Op0, Op1);
+  }
+
+  // X + undef -> undef
+  if (match(Op1, m_Undef()))
+    return Op1;
+
+  // X + 0 -> X
+  if (match(Op1, m_Zero()))
+    return Op0;
+
+  // X + (Y - X) -> Y
+  // (Y - X) + X -> Y
+  // Eg: X + -X -> 0
+  Value *Y = 0;
+  if (match(Op1, m_Sub(m_Value(Y), m_Specific(Op0))) ||
+      match(Op0, m_Sub(m_Value(Y), m_Specific(Op1))))
+    return Y;
+
+  // X + ~X -> -1   since   ~X = -X-1
+  if (match(Op0, m_Not(m_Specific(Op1))) ||
+      match(Op1, m_Not(m_Specific(Op0))))
+    return Constant::getAllOnesValue(Op0->getType());
+
+  /// i1 add -> xor.
+  if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+    if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
+      return V;
+
+  // Try some generic simplifications for associative operations.
+  if (Value *V = SimplifyAssociativeBinOp(Instruction::Add, Op0, Op1, Q,
+                                          MaxRecurse))
+    return V;
+
+  // Mul distributes over Add.  Try some generic simplifications based on this.
+  if (Value *V = FactorizeBinOp(Instruction::Add, Op0, Op1, Instruction::Mul,
+                                Q, MaxRecurse))
+    return V;
+
+  // Threading Add over selects and phi nodes is pointless, so don't bother.
+  // Threading over the select in "A + select(cond, B, C)" means evaluating
+  // "A+B" and "A+C" and seeing if they are equal; but they are equal if and
+  // only if B and C are equal.  If B and C are equal then (since we assume
+  // that operands have already been simplified) "select(cond, B, C)" should
+  // have been simplified to the common value of B and C already.  Analysing
+  // "A+B" and "A+C" thus gains nothing, but costs compile time.  Similarly
+  // for threading over phi nodes.
+
+  return 0;
+}
+
+Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
+                             const DataLayout *TD, const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query (TD, TLI, DT),
+                           RecursionLimit);
+}
+
+/// \brief Compute the base pointer and cumulative constant offsets for V.
+///
+/// This strips all constant offsets off of V, leaving it the base pointer, and
+/// accumulates the total constant offset applied in the returned constant. It
+/// returns 0 if V is not a pointer, and returns the constant '0' if there are
+/// no constant offsets applied.
+///
+/// This is very similar to GetPointerBaseWithConstantOffset except it doesn't
+/// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc.
+/// folding.
+static Constant *stripAndComputeConstantOffsets(const DataLayout *TD,
+                                                Value *&V) {
+  assert(V->getType()->getScalarType()->isPointerTy());
+
+  // Without DataLayout, just be conservative for now. Theoretically, more could
+  // be done in this case.
+  if (!TD)
+    return ConstantInt::get(IntegerType::get(V->getContext(), 64), 0);
+
+  unsigned IntPtrWidth = TD->getPointerSizeInBits();
+  APInt Offset = APInt::getNullValue(IntPtrWidth);
+
+  // Even though we don't look through PHI nodes, we could be called on an
+  // instruction in an unreachable block, which may be on a cycle.
+  SmallPtrSet<Value *, 4> Visited;
+  Visited.insert(V);
+  do {
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      if (!GEP->isInBounds() || !GEP->accumulateConstantOffset(*TD, Offset))
+        break;
+      V = GEP->getPointerOperand();
+    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+      V = cast<Operator>(V)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+      if (GA->mayBeOverridden())
+        break;
+      V = GA->getAliasee();
+    } else {
+      break;
+    }
+    assert(V->getType()->getScalarType()->isPointerTy() &&
+           "Unexpected operand type!");
+  } while (Visited.insert(V));
+
+  Type *IntPtrTy = TD->getIntPtrType(V->getContext());
+  Constant *OffsetIntPtr = ConstantInt::get(IntPtrTy, Offset);
+  if (V->getType()->isVectorTy())
+    return ConstantVector::getSplat(V->getType()->getVectorNumElements(),
+                                    OffsetIntPtr);
+  return OffsetIntPtr;
+}
+
+/// \brief Compute the constant difference between two pointer values.
+/// If the difference is not a constant, returns zero.
+static Constant *computePointerDifference(const DataLayout *TD,
+                                          Value *LHS, Value *RHS) {
+  Constant *LHSOffset = stripAndComputeConstantOffsets(TD, LHS);
+  Constant *RHSOffset = stripAndComputeConstantOffsets(TD, RHS);
+
+  // If LHS and RHS are not related via constant offsets to the same base
+  // value, there is nothing we can do here.
+  if (LHS != RHS)
+    return 0;
+
+  // Otherwise, the difference of LHS - RHS can be computed as:
+  //    LHS - RHS
+  //  = (LHSOffset + Base) - (RHSOffset + Base)
+  //  = LHSOffset - RHSOffset
+  return ConstantExpr::getSub(LHSOffset, RHSOffset);
+}
+
+/// SimplifySubInst - Given operands for a Sub, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
+                              const Query &Q, unsigned MaxRecurse) {
+  if (Constant *CLHS = dyn_cast<Constant>(Op0))
+    if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { CLHS, CRHS };
+      return ConstantFoldInstOperands(Instruction::Sub, CLHS->getType(),
+                                      Ops, Q.TD, Q.TLI);
+    }
+
+  // X - undef -> undef
+  // undef - X -> undef
+  if (match(Op0, m_Undef()) || match(Op1, m_Undef()))
+    return UndefValue::get(Op0->getType());
+
+  // X - 0 -> X
+  if (match(Op1, m_Zero()))
+    return Op0;
+
+  // X - X -> 0
+  if (Op0 == Op1)
+    return Constant::getNullValue(Op0->getType());
+
+  // (X*2) - X -> X
+  // (X<<1) - X -> X
+  Value *X = 0;
+  if (match(Op0, m_Mul(m_Specific(Op1), m_ConstantInt<2>())) ||
+      match(Op0, m_Shl(m_Specific(Op1), m_One())))
+    return Op1;
+
+  // (X + Y) - Z -> X + (Y - Z) or Y + (X - Z) if everything simplifies.
+  // For example, (X + Y) - Y -> X; (Y + X) - Y -> X
+  Value *Y = 0, *Z = Op1;
+  if (MaxRecurse && match(Op0, m_Add(m_Value(X), m_Value(Y)))) { // (X + Y) - Z
+    // See if "V === Y - Z" simplifies.
+    if (Value *V = SimplifyBinOp(Instruction::Sub, Y, Z, Q, MaxRecurse-1))
+      // It does!  Now see if "X + V" simplifies.
+      if (Value *W = SimplifyBinOp(Instruction::Add, X, V, Q, MaxRecurse-1)) {
+        // It does, we successfully reassociated!
+        ++NumReassoc;
+        return W;
+      }
+    // See if "V === X - Z" simplifies.
+    if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1))
+      // It does!  Now see if "Y + V" simplifies.
+      if (Value *W = SimplifyBinOp(Instruction::Add, Y, V, Q, MaxRecurse-1)) {
+        // It does, we successfully reassociated!
+        ++NumReassoc;
+        return W;
+      }
+  }
+
+  // X - (Y + Z) -> (X - Y) - Z or (X - Z) - Y if everything simplifies.
+  // For example, X - (X + 1) -> -1
+  X = Op0;
+  if (MaxRecurse && match(Op1, m_Add(m_Value(Y), m_Value(Z)))) { // X - (Y + Z)
+    // See if "V === X - Y" simplifies.
+    if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1))
+      // It does!  Now see if "V - Z" simplifies.
+      if (Value *W = SimplifyBinOp(Instruction::Sub, V, Z, Q, MaxRecurse-1)) {
+        // It does, we successfully reassociated!
+        ++NumReassoc;
+        return W;
+      }
+    // See if "V === X - Z" simplifies.
+    if (Value *V = SimplifyBinOp(Instruction::Sub, X, Z, Q, MaxRecurse-1))
+      // It does!  Now see if "V - Y" simplifies.
+      if (Value *W = SimplifyBinOp(Instruction::Sub, V, Y, Q, MaxRecurse-1)) {
+        // It does, we successfully reassociated!
+        ++NumReassoc;
+        return W;
+      }
+  }
+
+  // Z - (X - Y) -> (Z - X) + Y if everything simplifies.
+  // For example, X - (X - Y) -> Y.
+  Z = Op0;
+  if (MaxRecurse && match(Op1, m_Sub(m_Value(X), m_Value(Y)))) // Z - (X - Y)
+    // See if "V === Z - X" simplifies.
+    if (Value *V = SimplifyBinOp(Instruction::Sub, Z, X, Q, MaxRecurse-1))
+      // It does!  Now see if "V + Y" simplifies.
+      if (Value *W = SimplifyBinOp(Instruction::Add, V, Y, Q, MaxRecurse-1)) {
+        // It does, we successfully reassociated!
+        ++NumReassoc;
+        return W;
+      }
+
+  // trunc(X) - trunc(Y) -> trunc(X - Y) if everything simplifies.
+  if (MaxRecurse && match(Op0, m_Trunc(m_Value(X))) &&
+      match(Op1, m_Trunc(m_Value(Y))))
+    if (X->getType() == Y->getType())
+      // See if "V === X - Y" simplifies.
+      if (Value *V = SimplifyBinOp(Instruction::Sub, X, Y, Q, MaxRecurse-1))
+        // It does!  Now see if "trunc V" simplifies.
+        if (Value *W = SimplifyTruncInst(V, Op0->getType(), Q, MaxRecurse-1))
+          // It does, return the simplified "trunc V".
+          return W;
+
+  // Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...).
+  if (match(Op0, m_PtrToInt(m_Value(X))) &&
+      match(Op1, m_PtrToInt(m_Value(Y))))
+    if (Constant *Result = computePointerDifference(Q.TD, X, Y))
+      return ConstantExpr::getIntegerCast(Result, Op0->getType(), true);
+
+  // Mul distributes over Sub.  Try some generic simplifications based on this.
+  if (Value *V = FactorizeBinOp(Instruction::Sub, Op0, Op1, Instruction::Mul,
+                                Q, MaxRecurse))
+    return V;
+
+  // i1 sub -> xor.
+  if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+    if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
+      return V;
+
+  // Threading Sub over selects and phi nodes is pointless, so don't bother.
+  // Threading over the select in "A - select(cond, B, C)" means evaluating
+  // "A-B" and "A-C" and seeing if they are equal; but they are equal if and
+  // only if B and C are equal.  If B and C are equal then (since we assume
+  // that operands have already been simplified) "select(cond, B, C)" should
+  // have been simplified to the common value of B and C already.  Analysing
+  // "A-B" and "A-C" thus gains nothing, but costs compile time.  Similarly
+  // for threading over phi nodes.
+
+  return 0;
+}
+
+Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
+                             const DataLayout *TD, const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Query (TD, TLI, DT),
+                           RecursionLimit);
+}
+
+/// Given operands for an FAdd, see if we can fold the result.  If not, this
+/// returns null.
+static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+                              const Query &Q, unsigned MaxRecurse) {
+  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
+    if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { CLHS, CRHS };
+      return ConstantFoldInstOperands(Instruction::FAdd, CLHS->getType(),
+                                      Ops, Q.TD, Q.TLI);
+    }
+
+    // Canonicalize the constant to the RHS.
+    std::swap(Op0, Op1);
+  }
+
+  // fadd X, -0 ==> X
+  if (match(Op1, m_NegZero()))
+    return Op0;
+
+  // fadd X, 0 ==> X, when we know X is not -0
+  if (match(Op1, m_Zero()) &&
+      (FMF.noSignedZeros() || CannotBeNegativeZero(Op0)))
+    return Op0;
+
+  // fadd [nnan ninf] X, (fsub [nnan ninf] 0, X) ==> 0
+  //   where nnan and ninf have to occur at least once somewhere in this
+  //   expression
+  Value *SubOp = 0;
+  if (match(Op1, m_FSub(m_AnyZero(), m_Specific(Op0))))
+    SubOp = Op1;
+  else if (match(Op0, m_FSub(m_AnyZero(), m_Specific(Op1))))
+    SubOp = Op0;
+  if (SubOp) {
+    Instruction *FSub = cast<Instruction>(SubOp);
+    if ((FMF.noNaNs() || FSub->hasNoNaNs()) &&
+        (FMF.noInfs() || FSub->hasNoInfs()))
+      return Constant::getNullValue(Op0->getType());
+  }
+
+  return 0;
+}
+
+/// Given operands for an FSub, see if we can fold the result.  If not, this
+/// returns null.
+static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+                              const Query &Q, unsigned MaxRecurse) {
+  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
+    if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { CLHS, CRHS };
+      return ConstantFoldInstOperands(Instruction::FSub, CLHS->getType(),
+                                      Ops, Q.TD, Q.TLI);
+    }
+  }
+
+  // fsub X, 0 ==> X
+  if (match(Op1, m_Zero()))
+    return Op0;
+
+  // fsub X, -0 ==> X, when we know X is not -0
+  if (match(Op1, m_NegZero()) &&
+      (FMF.noSignedZeros() || CannotBeNegativeZero(Op0)))
+    return Op0;
+
+  // fsub 0, (fsub -0.0, X) ==> X
+  Value *X;
+  if (match(Op0, m_AnyZero())) {
+    if (match(Op1, m_FSub(m_NegZero(), m_Value(X))))
+      return X;
+    if (FMF.noSignedZeros() && match(Op1, m_FSub(m_AnyZero(), m_Value(X))))
+      return X;
+  }
+
+  // fsub nnan ninf x, x ==> 0.0
+  if (FMF.noNaNs() && FMF.noInfs() && Op0 == Op1)
+    return Constant::getNullValue(Op0->getType());
+
+  return 0;
+}
+
+/// Given the operands for an FMul, see if we can fold the result
+static Value *SimplifyFMulInst(Value *Op0, Value *Op1,
+                               FastMathFlags FMF,
+                               const Query &Q,
+                               unsigned MaxRecurse) {
+ if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
+    if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { CLHS, CRHS };
+      return ConstantFoldInstOperands(Instruction::FMul, CLHS->getType(),
+                                      Ops, Q.TD, Q.TLI);
+    }
+
+    // Canonicalize the constant to the RHS.
+    std::swap(Op0, Op1);
+ }
+
+ // fmul X, 1.0 ==> X
+ if (match(Op1, m_FPOne()))
+   return Op0;
+
+ // fmul nnan nsz X, 0 ==> 0
+ if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero()))
+   return Op1;
+
+ return 0;
+}
+
+/// SimplifyMulInst - Given operands for a Mul, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
+                              unsigned MaxRecurse) {
+  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
+    if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { CLHS, CRHS };
+      return ConstantFoldInstOperands(Instruction::Mul, CLHS->getType(),
+                                      Ops, Q.TD, Q.TLI);
+    }
+
+    // Canonicalize the constant to the RHS.
+    std::swap(Op0, Op1);
+  }
+
+  // X * undef -> 0
+  if (match(Op1, m_Undef()))
+    return Constant::getNullValue(Op0->getType());
+
+  // X * 0 -> 0
+  if (match(Op1, m_Zero()))
+    return Op1;
+
+  // X * 1 -> X
+  if (match(Op1, m_One()))
+    return Op0;
+
+  // (X / Y) * Y -> X if the division is exact.
+  Value *X = 0;
+  if (match(Op0, m_Exact(m_IDiv(m_Value(X), m_Specific(Op1)))) || // (X / Y) * Y
+      match(Op1, m_Exact(m_IDiv(m_Value(X), m_Specific(Op0)))))   // Y * (X / Y)
+    return X;
+
+  // i1 mul -> and.
+  if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+    if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1))
+      return V;
+
+  // Try some generic simplifications for associative operations.
+  if (Value *V = SimplifyAssociativeBinOp(Instruction::Mul, Op0, Op1, Q,
+                                          MaxRecurse))
+    return V;
+
+  // Mul distributes over Add.  Try some generic simplifications based on this.
+  if (Value *V = ExpandBinOp(Instruction::Mul, Op0, Op1, Instruction::Add,
+                             Q, MaxRecurse))
+    return V;
+
+  // If the operation is with the result of a select instruction, check whether
+  // operating on either branch of the select always yields the same value.
+  if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
+    if (Value *V = ThreadBinOpOverSelect(Instruction::Mul, Op0, Op1, Q,
+                                         MaxRecurse))
+      return V;
+
+  // If the operation is with the result of a phi instruction, check whether
+  // operating on all incoming values of the phi always yields the same value.
+  if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
+    if (Value *V = ThreadBinOpOverPHI(Instruction::Mul, Op0, Op1, Q,
+                                      MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+                             const DataLayout *TD, const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifyFAddInst(Op0, Op1, FMF, Query (TD, TLI, DT), RecursionLimit);
+}
+
+Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+                             const DataLayout *TD, const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifyFSubInst(Op0, Op1, FMF, Query (TD, TLI, DT), RecursionLimit);
+}
+
+Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1,
+                              FastMathFlags FMF,
+                              const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifyFMulInst(Op0, Op1, FMF, Query (TD, TLI, DT), RecursionLimit);
+}
+
+Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                             const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifyMulInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+/// SimplifyDiv - Given operands for an SDiv or UDiv, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
+                          const Query &Q, unsigned MaxRecurse) {
+  if (Constant *C0 = dyn_cast<Constant>(Op0)) {
+    if (Constant *C1 = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { C0, C1 };
+      return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, Q.TD, Q.TLI);
+    }
+  }
+
+  bool isSigned = Opcode == Instruction::SDiv;
+
+  // X / undef -> undef
+  if (match(Op1, m_Undef()))
+    return Op1;
+
+  // undef / X -> 0
+  if (match(Op0, m_Undef()))
+    return Constant::getNullValue(Op0->getType());
+
+  // 0 / X -> 0, we don't need to preserve faults!
+  if (match(Op0, m_Zero()))
+    return Op0;
+
+  // X / 1 -> X
+  if (match(Op1, m_One()))
+    return Op0;
+
+  if (Op0->getType()->isIntegerTy(1))
+    // It can't be division by zero, hence it must be division by one.
+    return Op0;
+
+  // X / X -> 1
+  if (Op0 == Op1)
+    return ConstantInt::get(Op0->getType(), 1);
+
+  // (X * Y) / Y -> X if the multiplication does not overflow.
+  Value *X = 0, *Y = 0;
+  if (match(Op0, m_Mul(m_Value(X), m_Value(Y))) && (X == Op1 || Y == Op1)) {
+    if (Y != Op1) std::swap(X, Y); // Ensure expression is (X * Y) / Y, Y = Op1
+    OverflowingBinaryOperator *Mul = cast<OverflowingBinaryOperator>(Op0);
+    // If the Mul knows it does not overflow, then we are good to go.
+    if ((isSigned && Mul->hasNoSignedWrap()) ||
+        (!isSigned && Mul->hasNoUnsignedWrap()))
+      return X;
+    // If X has the form X = A / Y then X * Y cannot overflow.
+    if (BinaryOperator *Div = dyn_cast<BinaryOperator>(X))
+      if (Div->getOpcode() == Opcode && Div->getOperand(1) == Y)
+        return X;
+  }
+
+  // (X rem Y) / Y -> 0
+  if ((isSigned && match(Op0, m_SRem(m_Value(), m_Specific(Op1)))) ||
+      (!isSigned && match(Op0, m_URem(m_Value(), m_Specific(Op1)))))
+    return Constant::getNullValue(Op0->getType());
+
+  // If the operation is with the result of a select instruction, check whether
+  // operating on either branch of the select always yields the same value.
+  if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
+    if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse))
+      return V;
+
+  // If the operation is with the result of a phi instruction, check whether
+  // operating on all incoming values of the phi always yields the same value.
+  if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
+    if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+/// SimplifySDivInst - Given operands for an SDiv, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q,
+                               unsigned MaxRecurse) {
+  if (Value *V = SimplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse))
+    return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifySDivInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+/// SimplifyUDivInst - Given operands for a UDiv, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q,
+                               unsigned MaxRecurse) {
+  if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse))
+    return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifyUDivInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+static Value *SimplifyFDivInst(Value *Op0, Value *Op1, const Query &Q,
+                               unsigned) {
+  // undef / X -> undef    (the undef could be a snan).
+  if (match(Op0, m_Undef()))
+    return Op0;
+
+  // X / undef -> undef
+  if (match(Op1, m_Undef()))
+    return Op1;
+
+  return 0;
+}
+
+Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifyFDivInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+/// SimplifyRem - Given operands for an SRem or URem, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
+                          const Query &Q, unsigned MaxRecurse) {
+  if (Constant *C0 = dyn_cast<Constant>(Op0)) {
+    if (Constant *C1 = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { C0, C1 };
+      return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, Q.TD, Q.TLI);
+    }
+  }
+
+  // X % undef -> undef
+  if (match(Op1, m_Undef()))
+    return Op1;
+
+  // undef % X -> 0
+  if (match(Op0, m_Undef()))
+    return Constant::getNullValue(Op0->getType());
+
+  // 0 % X -> 0, we don't need to preserve faults!
+  if (match(Op0, m_Zero()))
+    return Op0;
+
+  // X % 0 -> undef, we don't need to preserve faults!
+  if (match(Op1, m_Zero()))
+    return UndefValue::get(Op0->getType());
+
+  // X % 1 -> 0
+  if (match(Op1, m_One()))
+    return Constant::getNullValue(Op0->getType());
+
+  if (Op0->getType()->isIntegerTy(1))
+    // It can't be remainder by zero, hence it must be remainder by one.
+    return Constant::getNullValue(Op0->getType());
+
+  // X % X -> 0
+  if (Op0 == Op1)
+    return Constant::getNullValue(Op0->getType());
+
+  // If the operation is with the result of a select instruction, check whether
+  // operating on either branch of the select always yields the same value.
+  if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
+    if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse))
+      return V;
+
+  // If the operation is with the result of a phi instruction, check whether
+  // operating on all incoming values of the phi always yields the same value.
+  if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
+    if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+/// SimplifySRemInst - Given operands for an SRem, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q,
+                               unsigned MaxRecurse) {
+  if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse))
+    return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifySRemInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+/// SimplifyURemInst - Given operands for a URem, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q,
+                               unsigned MaxRecurse) {
+  if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse))
+    return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifyURemInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const Query &,
+                               unsigned) {
+  // undef % X -> undef    (the undef could be a snan).
+  if (match(Op0, m_Undef()))
+    return Op0;
+
+  // X % undef -> undef
+  if (match(Op1, m_Undef()))
+    return Op1;
+
+  return 0;
+}
+
+Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifyFRemInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+/// SimplifyShift - Given operands for an Shl, LShr or AShr, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
+                            const Query &Q, unsigned MaxRecurse) {
+  if (Constant *C0 = dyn_cast<Constant>(Op0)) {
+    if (Constant *C1 = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { C0, C1 };
+      return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, Q.TD, Q.TLI);
+    }
+  }
+
+  // 0 shift by X -> 0
+  if (match(Op0, m_Zero()))
+    return Op0;
+
+  // X shift by 0 -> X
+  if (match(Op1, m_Zero()))
+    return Op0;
+
+  // X shift by undef -> undef because it may shift by the bitwidth.
+  if (match(Op1, m_Undef()))
+    return Op1;
+
+  // Shifting by the bitwidth or more is undefined.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1))
+    if (CI->getValue().getLimitedValue() >=
+        Op0->getType()->getScalarSizeInBits())
+      return UndefValue::get(Op0->getType());
+
+  // If the operation is with the result of a select instruction, check whether
+  // operating on either branch of the select always yields the same value.
+  if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
+    if (Value *V = ThreadBinOpOverSelect(Opcode, Op0, Op1, Q, MaxRecurse))
+      return V;
+
+  // If the operation is with the result of a phi instruction, check whether
+  // operating on all incoming values of the phi always yields the same value.
+  if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
+    if (Value *V = ThreadBinOpOverPHI(Opcode, Op0, Op1, Q, MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+/// SimplifyShlInst - Given operands for an Shl, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
+                              const Query &Q, unsigned MaxRecurse) {
+  if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse))
+    return V;
+
+  // undef << X -> 0
+  if (match(Op0, m_Undef()))
+    return Constant::getNullValue(Op0->getType());
+
+  // (X >> A) << A -> X
+  Value *X;
+  if (match(Op0, m_Exact(m_Shr(m_Value(X), m_Specific(Op1)))))
+    return X;
+  return 0;
+}
+
+Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
+                             const DataLayout *TD, const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Query (TD, TLI, DT),
+                           RecursionLimit);
+}
+
+/// SimplifyLShrInst - Given operands for an LShr, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
+                               const Query &Q, unsigned MaxRecurse) {
+  if (Value *V = SimplifyShift(Instruction::LShr, Op0, Op1, Q, MaxRecurse))
+    return V;
+
+  // undef >>l X -> 0
+  if (match(Op0, m_Undef()))
+    return Constant::getNullValue(Op0->getType());
+
+  // (X << A) >> A -> X
+  Value *X;
+  if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1))) &&
+      cast<OverflowingBinaryOperator>(Op0)->hasNoUnsignedWrap())
+    return X;
+
+  return 0;
+}
+
+Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
+                              const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifyLShrInst(Op0, Op1, isExact, Query (TD, TLI, DT),
+                            RecursionLimit);
+}
+
+/// SimplifyAShrInst - Given operands for an AShr, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
+                               const Query &Q, unsigned MaxRecurse) {
+  if (Value *V = SimplifyShift(Instruction::AShr, Op0, Op1, Q, MaxRecurse))
+    return V;
+
+  // all ones >>a X -> all ones
+  if (match(Op0, m_AllOnes()))
+    return Op0;
+
+  // undef >>a X -> all ones
+  if (match(Op0, m_Undef()))
+    return Constant::getAllOnesValue(Op0->getType());
+
+  // (X << A) >> A -> X
+  Value *X;
+  if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1))) &&
+      cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap())
+    return X;
+
+  return 0;
+}
+
+Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
+                              const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifyAShrInst(Op0, Op1, isExact, Query (TD, TLI, DT),
+                            RecursionLimit);
+}
+
+/// SimplifyAndInst - Given operands for an And, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
+                              unsigned MaxRecurse) {
+  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
+    if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { CLHS, CRHS };
+      return ConstantFoldInstOperands(Instruction::And, CLHS->getType(),
+                                      Ops, Q.TD, Q.TLI);
+    }
+
+    // Canonicalize the constant to the RHS.
+    std::swap(Op0, Op1);
+  }
+
+  // X & undef -> 0
+  if (match(Op1, m_Undef()))
+    return Constant::getNullValue(Op0->getType());
+
+  // X & X = X
+  if (Op0 == Op1)
+    return Op0;
+
+  // X & 0 = 0
+  if (match(Op1, m_Zero()))
+    return Op1;
+
+  // X & -1 = X
+  if (match(Op1, m_AllOnes()))
+    return Op0;
+
+  // A & ~A  =  ~A & A  =  0
+  if (match(Op0, m_Not(m_Specific(Op1))) ||
+      match(Op1, m_Not(m_Specific(Op0))))
+    return Constant::getNullValue(Op0->getType());
+
+  // (A | ?) & A = A
+  Value *A = 0, *B = 0;
+  if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+      (A == Op1 || B == Op1))
+    return Op1;
+
+  // A & (A | ?) = A
+  if (match(Op1, m_Or(m_Value(A), m_Value(B))) &&
+      (A == Op0 || B == Op0))
+    return Op0;
+
+  // A & (-A) = A if A is a power of two or zero.
+  if (match(Op0, m_Neg(m_Specific(Op1))) ||
+      match(Op1, m_Neg(m_Specific(Op0)))) {
+    if (isKnownToBeAPowerOfTwo(Op0, /*OrZero*/true))
+      return Op0;
+    if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true))
+      return Op1;
+  }
+
+  // Try some generic simplifications for associative operations.
+  if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q,
+                                          MaxRecurse))
+    return V;
+
+  // And distributes over Or.  Try some generic simplifications based on this.
+  if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Or,
+                             Q, MaxRecurse))
+    return V;
+
+  // And distributes over Xor.  Try some generic simplifications based on this.
+  if (Value *V = ExpandBinOp(Instruction::And, Op0, Op1, Instruction::Xor,
+                             Q, MaxRecurse))
+    return V;
+
+  // Or distributes over And.  Try some generic simplifications based on this.
+  if (Value *V = FactorizeBinOp(Instruction::And, Op0, Op1, Instruction::Or,
+                                Q, MaxRecurse))
+    return V;
+
+  // If the operation is with the result of a select instruction, check whether
+  // operating on either branch of the select always yields the same value.
+  if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
+    if (Value *V = ThreadBinOpOverSelect(Instruction::And, Op0, Op1, Q,
+                                         MaxRecurse))
+      return V;
+
+  // If the operation is with the result of a phi instruction, check whether
+  // operating on all incoming values of the phi always yields the same value.
+  if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
+    if (Value *V = ThreadBinOpOverPHI(Instruction::And, Op0, Op1, Q,
+                                      MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                             const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifyAndInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+/// SimplifyOrInst - Given operands for an Or, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
+                             unsigned MaxRecurse) {
+  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
+    if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { CLHS, CRHS };
+      return ConstantFoldInstOperands(Instruction::Or, CLHS->getType(),
+                                      Ops, Q.TD, Q.TLI);
+    }
+
+    // Canonicalize the constant to the RHS.
+    std::swap(Op0, Op1);
+  }
+
+  // X | undef -> -1
+  if (match(Op1, m_Undef()))
+    return Constant::getAllOnesValue(Op0->getType());
+
+  // X | X = X
+  if (Op0 == Op1)
+    return Op0;
+
+  // X | 0 = X
+  if (match(Op1, m_Zero()))
+    return Op0;
+
+  // X | -1 = -1
+  if (match(Op1, m_AllOnes()))
+    return Op1;
+
+  // A | ~A  =  ~A | A  =  -1
+  if (match(Op0, m_Not(m_Specific(Op1))) ||
+      match(Op1, m_Not(m_Specific(Op0))))
+    return Constant::getAllOnesValue(Op0->getType());
+
+  // (A & ?) | A = A
+  Value *A = 0, *B = 0;
+  if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+      (A == Op1 || B == Op1))
+    return Op1;
+
+  // A | (A & ?) = A
+  if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
+      (A == Op0 || B == Op0))
+    return Op0;
+
+  // ~(A & ?) | A = -1
+  if (match(Op0, m_Not(m_And(m_Value(A), m_Value(B)))) &&
+      (A == Op1 || B == Op1))
+    return Constant::getAllOnesValue(Op1->getType());
+
+  // A | ~(A & ?) = -1
+  if (match(Op1, m_Not(m_And(m_Value(A), m_Value(B)))) &&
+      (A == Op0 || B == Op0))
+    return Constant::getAllOnesValue(Op0->getType());
+
+  // Try some generic simplifications for associative operations.
+  if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q,
+                                          MaxRecurse))
+    return V;
+
+  // Or distributes over And.  Try some generic simplifications based on this.
+  if (Value *V = ExpandBinOp(Instruction::Or, Op0, Op1, Instruction::And, Q,
+                             MaxRecurse))
+    return V;
+
+  // And distributes over Or.  Try some generic simplifications based on this.
+  if (Value *V = FactorizeBinOp(Instruction::Or, Op0, Op1, Instruction::And,
+                                Q, MaxRecurse))
+    return V;
+
+  // If the operation is with the result of a select instruction, check whether
+  // operating on either branch of the select always yields the same value.
+  if (isa<SelectInst>(Op0) || isa<SelectInst>(Op1))
+    if (Value *V = ThreadBinOpOverSelect(Instruction::Or, Op0, Op1, Q,
+                                         MaxRecurse))
+      return V;
+
+  // If the operation is with the result of a phi instruction, check whether
+  // operating on all incoming values of the phi always yields the same value.
+  if (isa<PHINode>(Op0) || isa<PHINode>(Op1))
+    if (Value *V = ThreadBinOpOverPHI(Instruction::Or, Op0, Op1, Q, MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                            const TargetLibraryInfo *TLI,
+                            const DominatorTree *DT) {
+  return ::SimplifyOrInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+/// SimplifyXorInst - Given operands for a Xor, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
+                              unsigned MaxRecurse) {
+  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
+    if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
+      Constant *Ops[] = { CLHS, CRHS };
+      return ConstantFoldInstOperands(Instruction::Xor, CLHS->getType(),
+                                      Ops, Q.TD, Q.TLI);
+    }
+
+    // Canonicalize the constant to the RHS.
+    std::swap(Op0, Op1);
+  }
+
+  // A ^ undef -> undef
+  if (match(Op1, m_Undef()))
+    return Op1;
+
+  // A ^ 0 = A
+  if (match(Op1, m_Zero()))
+    return Op0;
+
+  // A ^ A = 0
+  if (Op0 == Op1)
+    return Constant::getNullValue(Op0->getType());
+
+  // A ^ ~A  =  ~A ^ A  =  -1
+  if (match(Op0, m_Not(m_Specific(Op1))) ||
+      match(Op1, m_Not(m_Specific(Op0))))
+    return Constant::getAllOnesValue(Op0->getType());
+
+  // Try some generic simplifications for associative operations.
+  if (Value *V = SimplifyAssociativeBinOp(Instruction::Xor, Op0, Op1, Q,
+                                          MaxRecurse))
+    return V;
+
+  // And distributes over Xor.  Try some generic simplifications based on this.
+  if (Value *V = FactorizeBinOp(Instruction::Xor, Op0, Op1, Instruction::And,
+                                Q, MaxRecurse))
+    return V;
+
+  // Threading Xor over selects and phi nodes is pointless, so don't bother.
+  // Threading over the select in "A ^ select(cond, B, C)" means evaluating
+  // "A^B" and "A^C" and seeing if they are equal; but they are equal if and
+  // only if B and C are equal.  If B and C are equal then (since we assume
+  // that operands have already been simplified) "select(cond, B, C)" should
+  // have been simplified to the common value of B and C already.  Analysing
+  // "A^B" and "A^C" thus gains nothing, but costs compile time.  Similarly
+  // for threading over phi nodes.
+
+  return 0;
+}
+
+Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const DataLayout *TD,
+                             const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifyXorInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
+}
+
+static Type *GetCompareTy(Value *Op) {
+  return CmpInst::makeCmpResultType(Op->getType());
+}
+
+/// ExtractEquivalentCondition - Rummage around inside V looking for something
+/// equivalent to the comparison "LHS Pred RHS".  Return such a value if found,
+/// otherwise return null.  Helper function for analyzing max/min idioms.
+static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred,
+                                         Value *LHS, Value *RHS) {
+  SelectInst *SI = dyn_cast<SelectInst>(V);
+  if (!SI)
+    return 0;
+  CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+  if (!Cmp)
+    return 0;
+  Value *CmpLHS = Cmp->getOperand(0), *CmpRHS = Cmp->getOperand(1);
+  if (Pred == Cmp->getPredicate() && LHS == CmpLHS && RHS == CmpRHS)
+    return Cmp;
+  if (Pred == CmpInst::getSwappedPredicate(Cmp->getPredicate()) &&
+      LHS == CmpRHS && RHS == CmpLHS)
+    return Cmp;
+  return 0;
+}
+
+// A significant optimization not implemented here is assuming that alloca
+// addresses are not equal to incoming argument values. They don't *alias*,
+// as we say, but that doesn't mean they aren't equal, so we take a
+// conservative approach.
+//
+// This is inspired in part by C++11 5.10p1:
+//   "Two pointers of the same type compare equal if and only if they are both
+//    null, both point to the same function, or both represent the same
+//    address."
+//
+// This is pretty permissive.
+//
+// It's also partly due to C11 6.5.9p6:
+//   "Two pointers compare equal if and only if both are null pointers, both are
+//    pointers to the same object (including a pointer to an object and a
+//    subobject at its beginning) or function, both are pointers to one past the
+//    last element of the same array object, or one is a pointer to one past the
+//    end of one array object and the other is a pointer to the start of a
+//    different array object that happens to immediately follow the ﬁrst array
+//    object in the address space.)
+//
+// C11's version is more restrictive, however there's no reason why an argument
+// couldn't be a one-past-the-end value for a stack object in the caller and be
+// equal to the beginning of a stack object in the callee.
+//
+// If the C and C++ standards are ever made sufficiently restrictive in this
+// area, it may be possible to update LLVM's semantics accordingly and reinstate
+// this optimization.
+static Constant *computePointerICmp(const DataLayout *TD,
+                                    const TargetLibraryInfo *TLI,
+                                    CmpInst::Predicate Pred,
+                                    Value *LHS, Value *RHS) {
+  // First, skip past any trivial no-ops.
+  LHS = LHS->stripPointerCasts();
+  RHS = RHS->stripPointerCasts();
+
+  // A non-null pointer is not equal to a null pointer.
+  if (llvm::isKnownNonNull(LHS) && isa<ConstantPointerNull>(RHS) &&
+      (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE))
+    return ConstantInt::get(GetCompareTy(LHS),
+                            !CmpInst::isTrueWhenEqual(Pred));
+
+  // We can only fold certain predicates on pointer comparisons.
+  switch (Pred) {
+  default:
+    return 0;
+
+    // Equality comaprisons are easy to fold.
+  case CmpInst::ICMP_EQ:
+  case CmpInst::ICMP_NE:
+    break;
+
+    // We can only handle unsigned relational comparisons because 'inbounds' on
+    // a GEP only protects against unsigned wrapping.
+  case CmpInst::ICMP_UGT:
+  case CmpInst::ICMP_UGE:
+  case CmpInst::ICMP_ULT:
+  case CmpInst::ICMP_ULE:
+    // However, we have to switch them to their signed variants to handle
+    // negative indices from the base pointer.
+    Pred = ICmpInst::getSignedPredicate(Pred);
+    break;
+  }
+
+  // Strip off any constant offsets so that we can reason about them.
+  // It's tempting to use getUnderlyingObject or even just stripInBoundsOffsets
+  // here and compare base addresses like AliasAnalysis does, however there are
+  // numerous hazards. AliasAnalysis and its utilities rely on special rules
+  // governing loads and stores which don't apply to icmps. Also, AliasAnalysis
+  // doesn't need to guarantee pointer inequality when it says NoAlias.
+  Constant *LHSOffset = stripAndComputeConstantOffsets(TD, LHS);
+  Constant *RHSOffset = stripAndComputeConstantOffsets(TD, RHS);
+
+  // If LHS and RHS are related via constant offsets to the same base
+  // value, we can replace it with an icmp which just compares the offsets.
+  if (LHS == RHS)
+    return ConstantExpr::getICmp(Pred, LHSOffset, RHSOffset);
+
+  // Various optimizations for (in)equality comparisons.
+  if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) {
+    // Different non-empty allocations that exist at the same time have
+    // different addresses (if the program can tell). Global variables always
+    // exist, so they always exist during the lifetime of each other and all
+    // allocas. Two different allocas usually have different addresses...
+    //
+    // However, if there's an @llvm.stackrestore dynamically in between two
+    // allocas, they may have the same address. It's tempting to reduce the
+    // scope of the problem by only looking at *static* allocas here. That would
+    // cover the majority of allocas while significantly reducing the likelihood
+    // of having an @llvm.stackrestore pop up in the middle. However, it's not
+    // actually impossible for an @llvm.stackrestore to pop up in the middle of
+    // an entry block. Also, if we have a block that's not attached to a
+    // function, we can't tell if it's "static" under the current definition.
+    // Theoretically, this problem could be fixed by creating a new kind of
+    // instruction kind specifically for static allocas. Such a new instruction
+    // could be required to be at the top of the entry block, thus preventing it
+    // from being subject to a @llvm.stackrestore. Instcombine could even
+    // convert regular allocas into these special allocas. It'd be nifty.
+    // However, until then, this problem remains open.
+    //
+    // So, we'll assume that two non-empty allocas have different addresses
+    // for now.
+    //
+    // With all that, if the offsets are within the bounds of their allocations
+    // (and not one-past-the-end! so we can't use inbounds!), and their
+    // allocations aren't the same, the pointers are not equal.
+    //
+    // Note that it's not necessary to check for LHS being a global variable
+    // address, due to canonicalization and constant folding.
+    if (isa<AllocaInst>(LHS) &&
+        (isa<AllocaInst>(RHS) || isa<GlobalVariable>(RHS))) {
+      ConstantInt *LHSOffsetCI = dyn_cast<ConstantInt>(LHSOffset);
+      ConstantInt *RHSOffsetCI = dyn_cast<ConstantInt>(RHSOffset);
+      uint64_t LHSSize, RHSSize;
+      if (LHSOffsetCI && RHSOffsetCI &&
+          getObjectSize(LHS, LHSSize, TD, TLI) &&
+          getObjectSize(RHS, RHSSize, TD, TLI)) {
+        const APInt &LHSOffsetValue = LHSOffsetCI->getValue();
+        const APInt &RHSOffsetValue = RHSOffsetCI->getValue();
+        if (!LHSOffsetValue.isNegative() &&
+            !RHSOffsetValue.isNegative() &&
+            LHSOffsetValue.ult(LHSSize) &&
+            RHSOffsetValue.ult(RHSSize)) {
+          return ConstantInt::get(GetCompareTy(LHS),
+                                  !CmpInst::isTrueWhenEqual(Pred));
+        }
+      }
+
+      // Repeat the above check but this time without depending on DataLayout
+      // or being able to compute a precise size.
+      if (!cast<PointerType>(LHS->getType())->isEmptyTy() &&
+          !cast<PointerType>(RHS->getType())->isEmptyTy() &&
+          LHSOffset->isNullValue() &&
+          RHSOffset->isNullValue())
+        return ConstantInt::get(GetCompareTy(LHS),
+                                !CmpInst::isTrueWhenEqual(Pred));
+    }
+  }
+
+  // Otherwise, fail.
+  return 0;
+}
+
+/// SimplifyICmpInst - Given operands for an ICmpInst, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
+                               const Query &Q, unsigned MaxRecurse) {
+  CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate;
+  assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!");
+
+  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
+    if (Constant *CRHS = dyn_cast<Constant>(RHS))
+      return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.TD, Q.TLI);
+
+    // If we have a constant, make sure it is on the RHS.
+    std::swap(LHS, RHS);
+    Pred = CmpInst::getSwappedPredicate(Pred);
+  }
+
+  Type *ITy = GetCompareTy(LHS); // The return type.
+  Type *OpTy = LHS->getType();   // The operand type.
+
+  // icmp X, X -> true/false
+  // X icmp undef -> true/false.  For example, icmp ugt %X, undef -> false
+  // because X could be 0.
+  if (LHS == RHS || isa<UndefValue>(RHS))
+    return ConstantInt::get(ITy, CmpInst::isTrueWhenEqual(Pred));
+
+  // Special case logic when the operands have i1 type.
+  if (OpTy->getScalarType()->isIntegerTy(1)) {
+    switch (Pred) {
+    default: break;
+    case ICmpInst::ICMP_EQ:
+      // X == 1 -> X
+      if (match(RHS, m_One()))
+        return LHS;
+      break;
+    case ICmpInst::ICMP_NE:
+      // X != 0 -> X
+      if (match(RHS, m_Zero()))
+        return LHS;
+      break;
+    case ICmpInst::ICMP_UGT:
+      // X >u 0 -> X
+      if (match(RHS, m_Zero()))
+        return LHS;
+      break;
+    case ICmpInst::ICMP_UGE:
+      // X >=u 1 -> X
+      if (match(RHS, m_One()))
+        return LHS;
+      break;
+    case ICmpInst::ICMP_SLT:
+      // X <s 0 -> X
+      if (match(RHS, m_Zero()))
+        return LHS;
+      break;
+    case ICmpInst::ICMP_SLE:
+      // X <=s -1 -> X
+      if (match(RHS, m_One()))
+        return LHS;
+      break;
+    }
+  }
+
+  // If we are comparing with zero then try hard since this is a common case.
+  if (match(RHS, m_Zero())) {
+    bool LHSKnownNonNegative, LHSKnownNegative;
+    switch (Pred) {
+    default: llvm_unreachable("Unknown ICmp predicate!");
+    case ICmpInst::ICMP_ULT:
+      return getFalse(ITy);
+    case ICmpInst::ICMP_UGE:
+      return getTrue(ITy);
+    case ICmpInst::ICMP_EQ:
+    case ICmpInst::ICMP_ULE:
+      if (isKnownNonZero(LHS, Q.TD))
+        return getFalse(ITy);
+      break;
+    case ICmpInst::ICMP_NE:
+    case ICmpInst::ICMP_UGT:
+      if (isKnownNonZero(LHS, Q.TD))
+        return getTrue(ITy);
+      break;
+    case ICmpInst::ICMP_SLT:
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.TD);
+      if (LHSKnownNegative)
+        return getTrue(ITy);
+      if (LHSKnownNonNegative)
+        return getFalse(ITy);
+      break;
+    case ICmpInst::ICMP_SLE:
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.TD);
+      if (LHSKnownNegative)
+        return getTrue(ITy);
+      if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.TD))
+        return getFalse(ITy);
+      break;
+    case ICmpInst::ICMP_SGE:
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.TD);
+      if (LHSKnownNegative)
+        return getFalse(ITy);
+      if (LHSKnownNonNegative)
+        return getTrue(ITy);
+      break;
+    case ICmpInst::ICMP_SGT:
+      ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.TD);
+      if (LHSKnownNegative)
+        return getFalse(ITy);
+      if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.TD))
+        return getTrue(ITy);
+      break;
+    }
+  }
+
+  // See if we are doing a comparison with a constant integer.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+    // Rule out tautological comparisons (eg., ult 0 or uge 0).
+    ConstantRange RHS_CR = ICmpInst::makeConstantRange(Pred, CI->getValue());
+    if (RHS_CR.isEmptySet())
+      return ConstantInt::getFalse(CI->getContext());
+    if (RHS_CR.isFullSet())
+      return ConstantInt::getTrue(CI->getContext());
+
+    // Many binary operators with constant RHS have easy to compute constant
+    // range.  Use them to check whether the comparison is a tautology.
+    uint32_t Width = CI->getBitWidth();
+    APInt Lower = APInt(Width, 0);
+    APInt Upper = APInt(Width, 0);
+    ConstantInt *CI2;
+    if (match(LHS, m_URem(m_Value(), m_ConstantInt(CI2)))) {
+      // 'urem x, CI2' produces [0, CI2).
+      Upper = CI2->getValue();
+    } else if (match(LHS, m_SRem(m_Value(), m_ConstantInt(CI2)))) {
+      // 'srem x, CI2' produces (-|CI2|, |CI2|).
+      Upper = CI2->getValue().abs();
+      Lower = (-Upper) + 1;
+    } else if (match(LHS, m_UDiv(m_ConstantInt(CI2), m_Value()))) {
+      // 'udiv CI2, x' produces [0, CI2].
+      Upper = CI2->getValue() + 1;
+    } else if (match(LHS, m_UDiv(m_Value(), m_ConstantInt(CI2)))) {
+      // 'udiv x, CI2' produces [0, UINT_MAX / CI2].
+      APInt NegOne = APInt::getAllOnesValue(Width);
+      if (!CI2->isZero())
+        Upper = NegOne.udiv(CI2->getValue()) + 1;
+    } else if (match(LHS, m_SDiv(m_Value(), m_ConstantInt(CI2)))) {
+      // 'sdiv x, CI2' produces [INT_MIN / CI2, INT_MAX / CI2].
+      APInt IntMin = APInt::getSignedMinValue(Width);
+      APInt IntMax = APInt::getSignedMaxValue(Width);
+      APInt Val = CI2->getValue().abs();
+      if (!Val.isMinValue()) {
+        Lower = IntMin.sdiv(Val);
+        Upper = IntMax.sdiv(Val) + 1;
+      }
+    } else if (match(LHS, m_LShr(m_Value(), m_ConstantInt(CI2)))) {
+      // 'lshr x, CI2' produces [0, UINT_MAX >> CI2].
+      APInt NegOne = APInt::getAllOnesValue(Width);
+      if (CI2->getValue().ult(Width))
+        Upper = NegOne.lshr(CI2->getValue()) + 1;
+    } else if (match(LHS, m_AShr(m_Value(), m_ConstantInt(CI2)))) {
+      // 'ashr x, CI2' produces [INT_MIN >> CI2, INT_MAX >> CI2].
+      APInt IntMin = APInt::getSignedMinValue(Width);
+      APInt IntMax = APInt::getSignedMaxValue(Width);
+      if (CI2->getValue().ult(Width)) {
+        Lower = IntMin.ashr(CI2->getValue());
+        Upper = IntMax.ashr(CI2->getValue()) + 1;
+      }
+    } else if (match(LHS, m_Or(m_Value(), m_ConstantInt(CI2)))) {
+      // 'or x, CI2' produces [CI2, UINT_MAX].
+      Lower = CI2->getValue();
+    } else if (match(LHS, m_And(m_Value(), m_ConstantInt(CI2)))) {
+      // 'and x, CI2' produces [0, CI2].
+      Upper = CI2->getValue() + 1;
+    }
+    if (Lower != Upper) {
+      ConstantRange LHS_CR = ConstantRange(Lower, Upper);
+      if (RHS_CR.contains(LHS_CR))
+        return ConstantInt::getTrue(RHS->getContext());
+      if (RHS_CR.inverse().contains(LHS_CR))
+        return ConstantInt::getFalse(RHS->getContext());
+    }
+  }
+
+  // Compare of cast, for example (zext X) != 0 -> X != 0
+  if (isa<CastInst>(LHS) && (isa<Constant>(RHS) || isa<CastInst>(RHS))) {
+    Instruction *LI = cast<CastInst>(LHS);
+    Value *SrcOp = LI->getOperand(0);
+    Type *SrcTy = SrcOp->getType();
+    Type *DstTy = LI->getType();
+
+    // Turn icmp (ptrtoint x), (ptrtoint/constant) into a compare of the input
+    // if the integer type is the same size as the pointer type.
+    if (MaxRecurse && Q.TD && isa<PtrToIntInst>(LI) &&
+        Q.TD->getPointerSizeInBits() == DstTy->getPrimitiveSizeInBits()) {
+      if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
+        // Transfer the cast to the constant.
+        if (Value *V = SimplifyICmpInst(Pred, SrcOp,
+                                        ConstantExpr::getIntToPtr(RHSC, SrcTy),
+                                        Q, MaxRecurse-1))
+          return V;
+      } else if (PtrToIntInst *RI = dyn_cast<PtrToIntInst>(RHS)) {
+        if (RI->getOperand(0)->getType() == SrcTy)
+          // Compare without the cast.
+          if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0),
+                                          Q, MaxRecurse-1))
+            return V;
+      }
+    }
+
+    if (isa<ZExtInst>(LHS)) {
+      // Turn icmp (zext X), (zext Y) into a compare of X and Y if they have the
+      // same type.
+      if (ZExtInst *RI = dyn_cast<ZExtInst>(RHS)) {
+        if (MaxRecurse && SrcTy == RI->getOperand(0)->getType())
+          // Compare X and Y.  Note that signed predicates become unsigned.
+          if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred),
+                                          SrcOp, RI->getOperand(0), Q,
+                                          MaxRecurse-1))
+            return V;
+      }
+      // Turn icmp (zext X), Cst into a compare of X and Cst if Cst is extended
+      // too.  If not, then try to deduce the result of the comparison.
+      else if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+        // Compute the constant that would happen if we truncated to SrcTy then
+        // reextended to DstTy.
+        Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy);
+        Constant *RExt = ConstantExpr::getCast(CastInst::ZExt, Trunc, DstTy);
+
+        // If the re-extended constant didn't change then this is effectively
+        // also a case of comparing two zero-extended values.
+        if (RExt == CI && MaxRecurse)
+          if (Value *V = SimplifyICmpInst(ICmpInst::getUnsignedPredicate(Pred),
+                                        SrcOp, Trunc, Q, MaxRecurse-1))
+            return V;
+
+        // Otherwise the upper bits of LHS are zero while RHS has a non-zero bit
+        // there.  Use this to work out the result of the comparison.
+        if (RExt != CI) {
+          switch (Pred) {
+          default: llvm_unreachable("Unknown ICmp predicate!");
+          // LHS <u RHS.
+          case ICmpInst::ICMP_EQ:
+          case ICmpInst::ICMP_UGT:
+          case ICmpInst::ICMP_UGE:
+            return ConstantInt::getFalse(CI->getContext());
+
+          case ICmpInst::ICMP_NE:
+          case ICmpInst::ICMP_ULT:
+          case ICmpInst::ICMP_ULE:
+            return ConstantInt::getTrue(CI->getContext());
+
+          // LHS is non-negative.  If RHS is negative then LHS >s LHS.  If RHS
+          // is non-negative then LHS <s RHS.
+          case ICmpInst::ICMP_SGT:
+          case ICmpInst::ICMP_SGE:
+            return CI->getValue().isNegative() ?
+              ConstantInt::getTrue(CI->getContext()) :
+              ConstantInt::getFalse(CI->getContext());
+
+          case ICmpInst::ICMP_SLT:
+          case ICmpInst::ICMP_SLE:
+            return CI->getValue().isNegative() ?
+              ConstantInt::getFalse(CI->getContext()) :
+              ConstantInt::getTrue(CI->getContext());
+          }
+        }
+      }
+    }
+
+    if (isa<SExtInst>(LHS)) {
+      // Turn icmp (sext X), (sext Y) into a compare of X and Y if they have the
+      // same type.
+      if (SExtInst *RI = dyn_cast<SExtInst>(RHS)) {
+        if (MaxRecurse && SrcTy == RI->getOperand(0)->getType())
+          // Compare X and Y.  Note that the predicate does not change.
+          if (Value *V = SimplifyICmpInst(Pred, SrcOp, RI->getOperand(0),
+                                          Q, MaxRecurse-1))
+            return V;
+      }
+      // Turn icmp (sext X), Cst into a compare of X and Cst if Cst is extended
+      // too.  If not, then try to deduce the result of the comparison.
+      else if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+        // Compute the constant that would happen if we truncated to SrcTy then
+        // reextended to DstTy.
+        Constant *Trunc = ConstantExpr::getTrunc(CI, SrcTy);
+        Constant *RExt = ConstantExpr::getCast(CastInst::SExt, Trunc, DstTy);
+
+        // If the re-extended constant didn't change then this is effectively
+        // also a case of comparing two sign-extended values.
+        if (RExt == CI && MaxRecurse)
+          if (Value *V = SimplifyICmpInst(Pred, SrcOp, Trunc, Q, MaxRecurse-1))
+            return V;
+
+        // Otherwise the upper bits of LHS are all equal, while RHS has varying
+        // bits there.  Use this to work out the result of the comparison.
+        if (RExt != CI) {
+          switch (Pred) {
+          default: llvm_unreachable("Unknown ICmp predicate!");
+          case ICmpInst::ICMP_EQ:
+            return ConstantInt::getFalse(CI->getContext());
+          case ICmpInst::ICMP_NE:
+            return ConstantInt::getTrue(CI->getContext());
+
+          // If RHS is non-negative then LHS <s RHS.  If RHS is negative then
+          // LHS >s RHS.
+          case ICmpInst::ICMP_SGT:
+          case ICmpInst::ICMP_SGE:
+            return CI->getValue().isNegative() ?
+              ConstantInt::getTrue(CI->getContext()) :
+              ConstantInt::getFalse(CI->getContext());
+          case ICmpInst::ICMP_SLT:
+          case ICmpInst::ICMP_SLE:
+            return CI->getValue().isNegative() ?
+              ConstantInt::getFalse(CI->getContext()) :
+              ConstantInt::getTrue(CI->getContext());
+
+          // If LHS is non-negative then LHS <u RHS.  If LHS is negative then
+          // LHS >u RHS.
+          case ICmpInst::ICMP_UGT:
+          case ICmpInst::ICMP_UGE:
+            // Comparison is true iff the LHS <s 0.
+            if (MaxRecurse)
+              if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SLT, SrcOp,
+                                              Constant::getNullValue(SrcTy),
+                                              Q, MaxRecurse-1))
+                return V;
+            break;
+          case ICmpInst::ICMP_ULT:
+          case ICmpInst::ICMP_ULE:
+            // Comparison is true iff the LHS >=s 0.
+            if (MaxRecurse)
+              if (Value *V = SimplifyICmpInst(ICmpInst::ICMP_SGE, SrcOp,
+                                              Constant::getNullValue(SrcTy),
+                                              Q, MaxRecurse-1))
+                return V;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Special logic for binary operators.
+  BinaryOperator *LBO = dyn_cast<BinaryOperator>(LHS);
+  BinaryOperator *RBO = dyn_cast<BinaryOperator>(RHS);
+  if (MaxRecurse && (LBO || RBO)) {
+    // Analyze the case when either LHS or RHS is an add instruction.
+    Value *A = 0, *B = 0, *C = 0, *D = 0;
+    // LHS = A + B (or A and B are null); RHS = C + D (or C and D are null).
+    bool NoLHSWrapProblem = false, NoRHSWrapProblem = false;
+    if (LBO && LBO->getOpcode() == Instruction::Add) {
+      A = LBO->getOperand(0); B = LBO->getOperand(1);
+      NoLHSWrapProblem = ICmpInst::isEquality(Pred) ||
+        (CmpInst::isUnsigned(Pred) && LBO->hasNoUnsignedWrap()) ||
+        (CmpInst::isSigned(Pred) && LBO->hasNoSignedWrap());
+    }
+    if (RBO && RBO->getOpcode() == Instruction::Add) {
+      C = RBO->getOperand(0); D = RBO->getOperand(1);
+      NoRHSWrapProblem = ICmpInst::isEquality(Pred) ||
+        (CmpInst::isUnsigned(Pred) && RBO->hasNoUnsignedWrap()) ||
+        (CmpInst::isSigned(Pred) && RBO->hasNoSignedWrap());
+    }
+
+    // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
+    if ((A == RHS || B == RHS) && NoLHSWrapProblem)
+      if (Value *V = SimplifyICmpInst(Pred, A == RHS ? B : A,
+                                      Constant::getNullValue(RHS->getType()),
+                                      Q, MaxRecurse-1))
+        return V;
+
+    // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow.
+    if ((C == LHS || D == LHS) && NoRHSWrapProblem)
+      if (Value *V = SimplifyICmpInst(Pred,
+                                      Constant::getNullValue(LHS->getType()),
+                                      C == LHS ? D : C, Q, MaxRecurse-1))
+        return V;
+
+    // icmp (X+Y), (X+Z) -> icmp Y,Z for equalities or if there is no overflow.
+    if (A && C && (A == C || A == D || B == C || B == D) &&
+        NoLHSWrapProblem && NoRHSWrapProblem) {
+      // Determine Y and Z in the form icmp (X+Y), (X+Z).
+      Value *Y, *Z;
+      if (A == C) {
+        // C + B == C + D  ->  B == D
+        Y = B;
+        Z = D;
+      } else if (A == D) {
+        // D + B == C + D  ->  B == C
+        Y = B;
+        Z = C;
+      } else if (B == C) {
+        // A + C == C + D  ->  A == D
+        Y = A;
+        Z = D;
+      } else {
+        assert(B == D);
+        // A + D == C + D  ->  A == C
+        Y = A;
+        Z = C;
+      }
+      if (Value *V = SimplifyICmpInst(Pred, Y, Z, Q, MaxRecurse-1))
+        return V;
+    }
+  }
+
+  if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) {
+    bool KnownNonNegative, KnownNegative;
+    switch (Pred) {
+    default:
+      break;
+    case ICmpInst::ICMP_SGT:
+    case ICmpInst::ICMP_SGE:
+      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.TD);
+      if (!KnownNonNegative)
+        break;
+      // fall-through
+    case ICmpInst::ICMP_EQ:
+    case ICmpInst::ICMP_UGT:
+    case ICmpInst::ICMP_UGE:
+      return getFalse(ITy);
+    case ICmpInst::ICMP_SLT:
+    case ICmpInst::ICMP_SLE:
+      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.TD);
+      if (!KnownNonNegative)
+        break;
+      // fall-through
+    case ICmpInst::ICMP_NE:
+    case ICmpInst::ICMP_ULT:
+    case ICmpInst::ICMP_ULE:
+      return getTrue(ITy);
+    }
+  }
+  if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
+    bool KnownNonNegative, KnownNegative;
+    switch (Pred) {
+    default:
+      break;
+    case ICmpInst::ICMP_SGT:
+    case ICmpInst::ICMP_SGE:
+      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.TD);
+      if (!KnownNonNegative)
+        break;
+      // fall-through
+    case ICmpInst::ICMP_NE:
+    case ICmpInst::ICMP_UGT:
+    case ICmpInst::ICMP_UGE:
+      return getTrue(ITy);
+    case ICmpInst::ICMP_SLT:
+    case ICmpInst::ICMP_SLE:
+      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.TD);
+      if (!KnownNonNegative)
+        break;
+      // fall-through
+    case ICmpInst::ICMP_EQ:
+    case ICmpInst::ICMP_ULT:
+    case ICmpInst::ICMP_ULE:
+      return getFalse(ITy);
+    }
+  }
+
+  // x udiv y <=u x.
+  if (LBO && match(LBO, m_UDiv(m_Specific(RHS), m_Value()))) {
+    // icmp pred (X /u Y), X
+    if (Pred == ICmpInst::ICMP_UGT)
+      return getFalse(ITy);
+    if (Pred == ICmpInst::ICMP_ULE)
+      return getTrue(ITy);
+  }
+
+  if (MaxRecurse && LBO && RBO && LBO->getOpcode() == RBO->getOpcode() &&
+      LBO->getOperand(1) == RBO->getOperand(1)) {
+    switch (LBO->getOpcode()) {
+    default: break;
+    case Instruction::UDiv:
+    case Instruction::LShr:
+      if (ICmpInst::isSigned(Pred))
+        break;
+      // fall-through
+    case Instruction::SDiv:
+    case Instruction::AShr:
+      if (!LBO->isExact() || !RBO->isExact())
+        break;
+      if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+                                      RBO->getOperand(0), Q, MaxRecurse-1))
+        return V;
+      break;
+    case Instruction::Shl: {
+      bool NUW = LBO->hasNoUnsignedWrap() && RBO->hasNoUnsignedWrap();
+      bool NSW = LBO->hasNoSignedWrap() && RBO->hasNoSignedWrap();
+      if (!NUW && !NSW)
+        break;
+      if (!NSW && ICmpInst::isSigned(Pred))
+        break;
+      if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+                                      RBO->getOperand(0), Q, MaxRecurse-1))
+        return V;
+      break;
+    }
+    }
+  }
+
+  // Simplify comparisons involving max/min.
+  Value *A, *B;
+  CmpInst::Predicate P = CmpInst::BAD_ICMP_PREDICATE;
+  CmpInst::Predicate EqP; // Chosen so that "A == max/min(A,B)" iff "A EqP B".
+
+  // Signed variants on "max(a,b)>=a -> true".
+  if (match(LHS, m_SMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // smax(A, B) pred A.
+    EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B".
+    // We analyze this as smax(A, B) pred A.
+    P = Pred;
+  } else if (match(RHS, m_SMax(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred smax(A, B).
+    EqP = CmpInst::ICMP_SGE; // "A == smax(A, B)" iff "A sge B".
+    // We analyze this as smax(A, B) swapped-pred A.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) &&
+             (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // smin(A, B) pred A.
+    EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B".
+    // We analyze this as smax(-A, -B) swapped-pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(RHS, m_SMin(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred smin(A, B).
+    EqP = CmpInst::ICMP_SLE; // "A == smin(A, B)" iff "A sle B".
+    // We analyze this as smax(-A, -B) pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = Pred;
+  }
+  if (P != CmpInst::BAD_ICMP_PREDICATE) {
+    // Cases correspond to "max(A, B) p A".
+    switch (P) {
+    default:
+      break;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::ICMP_SLE:
+      // Equivalent to "A EqP B".  This may be the same as the condition tested
+      // in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B))
+        return V;
+      // Otherwise, see if "A EqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse-1))
+          return V;
+      break;
+    case CmpInst::ICMP_NE:
+    case CmpInst::ICMP_SGT: {
+      CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP);
+      // Equivalent to "A InvEqP B".  This may be the same as the condition
+      // tested in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B))
+        return V;
+      // Otherwise, see if "A InvEqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse-1))
+          return V;
+      break;
+    }
+    case CmpInst::ICMP_SGE:
+      // Always true.
+      return getTrue(ITy);
+    case CmpInst::ICMP_SLT:
+      // Always false.
+      return getFalse(ITy);
+    }
+  }
+
+  // Unsigned variants on "max(a,b)>=a -> true".
+  P = CmpInst::BAD_ICMP_PREDICATE;
+  if (match(LHS, m_UMax(m_Value(A), m_Value(B))) && (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // umax(A, B) pred A.
+    EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B".
+    // We analyze this as umax(A, B) pred A.
+    P = Pred;
+  } else if (match(RHS, m_UMax(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred umax(A, B).
+    EqP = CmpInst::ICMP_UGE; // "A == umax(A, B)" iff "A uge B".
+    // We analyze this as umax(A, B) swapped-pred A.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) &&
+             (A == RHS || B == RHS)) {
+    if (A != RHS) std::swap(A, B); // umin(A, B) pred A.
+    EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B".
+    // We analyze this as umax(-A, -B) swapped-pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = CmpInst::getSwappedPredicate(Pred);
+  } else if (match(RHS, m_UMin(m_Value(A), m_Value(B))) &&
+             (A == LHS || B == LHS)) {
+    if (A != LHS) std::swap(A, B); // A pred umin(A, B).
+    EqP = CmpInst::ICMP_ULE; // "A == umin(A, B)" iff "A ule B".
+    // We analyze this as umax(-A, -B) pred -A.
+    // Note that we do not need to actually form -A or -B thanks to EqP.
+    P = Pred;
+  }
+  if (P != CmpInst::BAD_ICMP_PREDICATE) {
+    // Cases correspond to "max(A, B) p A".
+    switch (P) {
+    default:
+      break;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::ICMP_ULE:
+      // Equivalent to "A EqP B".  This may be the same as the condition tested
+      // in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, EqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, EqP, A, B))
+        return V;
+      // Otherwise, see if "A EqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(EqP, A, B, Q, MaxRecurse-1))
+          return V;
+      break;
+    case CmpInst::ICMP_NE:
+    case CmpInst::ICMP_UGT: {
+      CmpInst::Predicate InvEqP = CmpInst::getInversePredicate(EqP);
+      // Equivalent to "A InvEqP B".  This may be the same as the condition
+      // tested in the max/min; if so, we can just return that.
+      if (Value *V = ExtractEquivalentCondition(LHS, InvEqP, A, B))
+        return V;
+      if (Value *V = ExtractEquivalentCondition(RHS, InvEqP, A, B))
+        return V;
+      // Otherwise, see if "A InvEqP B" simplifies.
+      if (MaxRecurse)
+        if (Value *V = SimplifyICmpInst(InvEqP, A, B, Q, MaxRecurse-1))
+          return V;
+      break;
+    }
+    case CmpInst::ICMP_UGE:
+      // Always true.
+      return getTrue(ITy);
+    case CmpInst::ICMP_ULT:
+      // Always false.
+      return getFalse(ITy);
+    }
+  }
+
+  // Variants on "max(x,y) >= min(x,z)".
+  Value *C, *D;
+  if (match(LHS, m_SMax(m_Value(A), m_Value(B))) &&
+      match(RHS, m_SMin(m_Value(C), m_Value(D))) &&
+      (A == C || A == D || B == C || B == D)) {
+    // max(x, ?) pred min(x, ?).
+    if (Pred == CmpInst::ICMP_SGE)
+      // Always true.
+      return getTrue(ITy);
+    if (Pred == CmpInst::ICMP_SLT)
+      // Always false.
+      return getFalse(ITy);
+  } else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) &&
+             match(RHS, m_SMax(m_Value(C), m_Value(D))) &&
+             (A == C || A == D || B == C || B == D)) {
+    // min(x, ?) pred max(x, ?).
+    if (Pred == CmpInst::ICMP_SLE)
+      // Always true.
+      return getTrue(ITy);
+    if (Pred == CmpInst::ICMP_SGT)
+      // Always false.
+      return getFalse(ITy);
+  } else if (match(LHS, m_UMax(m_Value(A), m_Value(B))) &&
+             match(RHS, m_UMin(m_Value(C), m_Value(D))) &&
+             (A == C || A == D || B == C || B == D)) {
+    // max(x, ?) pred min(x, ?).
+    if (Pred == CmpInst::ICMP_UGE)
+      // Always true.
+      return getTrue(ITy);
+    if (Pred == CmpInst::ICMP_ULT)
+      // Always false.
+      return getFalse(ITy);
+  } else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) &&
+             match(RHS, m_UMax(m_Value(C), m_Value(D))) &&
+             (A == C || A == D || B == C || B == D)) {
+    // min(x, ?) pred max(x, ?).
+    if (Pred == CmpInst::ICMP_ULE)
+      // Always true.
+      return getTrue(ITy);
+    if (Pred == CmpInst::ICMP_UGT)
+      // Always false.
+      return getFalse(ITy);
+  }
+
+  // Simplify comparisons of related pointers using a powerful, recursive
+  // GEP-walk when we have target data available..
+  if (LHS->getType()->isPointerTy())
+    if (Constant *C = computePointerICmp(Q.TD, Q.TLI, Pred, LHS, RHS))
+      return C;
+
+  if (GetElementPtrInst *GLHS = dyn_cast<GetElementPtrInst>(LHS)) {
+    if (GEPOperator *GRHS = dyn_cast<GEPOperator>(RHS)) {
+      if (GLHS->getPointerOperand() == GRHS->getPointerOperand() &&
+          GLHS->hasAllConstantIndices() && GRHS->hasAllConstantIndices() &&
+          (ICmpInst::isEquality(Pred) ||
+           (GLHS->isInBounds() && GRHS->isInBounds() &&
+            Pred == ICmpInst::getSignedPredicate(Pred)))) {
+        // The bases are equal and the indices are constant.  Build a constant
+        // expression GEP with the same indices and a null base pointer to see
+        // what constant folding can make out of it.
+        Constant *Null = Constant::getNullValue(GLHS->getPointerOperandType());
+        SmallVector<Value *, 4> IndicesLHS(GLHS->idx_begin(), GLHS->idx_end());
+        Constant *NewLHS = ConstantExpr::getGetElementPtr(Null, IndicesLHS);
+
+        SmallVector<Value *, 4> IndicesRHS(GRHS->idx_begin(), GRHS->idx_end());
+        Constant *NewRHS = ConstantExpr::getGetElementPtr(Null, IndicesRHS);
+        return ConstantExpr::getICmp(Pred, NewLHS, NewRHS);
+      }
+    }
+  }
+
+  // If the comparison is with the result of a select instruction, check whether
+  // comparing with either branch of the select always yields the same value.
+  if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
+    if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse))
+      return V;
+
+  // If the comparison is with the result of a phi instruction, check whether
+  // doing the compare with each incoming phi value yields a common result.
+  if (isa<PHINode>(LHS) || isa<PHINode>(RHS))
+    if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
+                              const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifyICmpInst(Predicate, LHS, RHS, Query (TD, TLI, DT),
+                            RecursionLimit);
+}
+
+/// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
+                               const Query &Q, unsigned MaxRecurse) {
+  CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate;
+  assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!");
+
+  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
+    if (Constant *CRHS = dyn_cast<Constant>(RHS))
+      return ConstantFoldCompareInstOperands(Pred, CLHS, CRHS, Q.TD, Q.TLI);
+
+    // If we have a constant, make sure it is on the RHS.
+    std::swap(LHS, RHS);
+    Pred = CmpInst::getSwappedPredicate(Pred);
+  }
+
+  // Fold trivial predicates.
+  if (Pred == FCmpInst::FCMP_FALSE)
+    return ConstantInt::get(GetCompareTy(LHS), 0);
+  if (Pred == FCmpInst::FCMP_TRUE)
+    return ConstantInt::get(GetCompareTy(LHS), 1);
+
+  if (isa<UndefValue>(RHS))                  // fcmp pred X, undef -> undef
+    return UndefValue::get(GetCompareTy(LHS));
+
+  // fcmp x,x -> true/false.  Not all compares are foldable.
+  if (LHS == RHS) {
+    if (CmpInst::isTrueWhenEqual(Pred))
+      return ConstantInt::get(GetCompareTy(LHS), 1);
+    if (CmpInst::isFalseWhenEqual(Pred))
+      return ConstantInt::get(GetCompareTy(LHS), 0);
+  }
+
+  // Handle fcmp with constant RHS
+  if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
+    // If the constant is a nan, see if we can fold the comparison based on it.
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
+      if (CFP->getValueAPF().isNaN()) {
+        if (FCmpInst::isOrdered(Pred))   // True "if ordered and foo"
+          return ConstantInt::getFalse(CFP->getContext());
+        assert(FCmpInst::isUnordered(Pred) &&
+               "Comparison must be either ordered or unordered!");
+        // True if unordered.
+        return ConstantInt::getTrue(CFP->getContext());
+      }
+      // Check whether the constant is an infinity.
+      if (CFP->getValueAPF().isInfinity()) {
+        if (CFP->getValueAPF().isNegative()) {
+          switch (Pred) {
+          case FCmpInst::FCMP_OLT:
+            // No value is ordered and less than negative infinity.
+            return ConstantInt::getFalse(CFP->getContext());
+          case FCmpInst::FCMP_UGE:
+            // All values are unordered with or at least negative infinity.
+            return ConstantInt::getTrue(CFP->getContext());
+          default:
+            break;
+          }
+        } else {
+          switch (Pred) {
+          case FCmpInst::FCMP_OGT:
+            // No value is ordered and greater than infinity.
+            return ConstantInt::getFalse(CFP->getContext());
+          case FCmpInst::FCMP_ULE:
+            // All values are unordered with and at most infinity.
+            return ConstantInt::getTrue(CFP->getContext());
+          default:
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // If the comparison is with the result of a select instruction, check whether
+  // comparing with either branch of the select always yields the same value.
+  if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
+    if (Value *V = ThreadCmpOverSelect(Pred, LHS, RHS, Q, MaxRecurse))
+      return V;
+
+  // If the comparison is with the result of a phi instruction, check whether
+  // doing the compare with each incoming phi value yields a common result.
+  if (isa<PHINode>(LHS) || isa<PHINode>(RHS))
+    if (Value *V = ThreadCmpOverPHI(Pred, LHS, RHS, Q, MaxRecurse))
+      return V;
+
+  return 0;
+}
+
+Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
+                              const DataLayout *TD,
+                              const TargetLibraryInfo *TLI,
+                              const DominatorTree *DT) {
+  return ::SimplifyFCmpInst(Predicate, LHS, RHS, Query (TD, TLI, DT),
+                            RecursionLimit);
+}
+
+/// SimplifySelectInst - Given operands for a SelectInst, see if we can fold
+/// the result.  If not, this returns null.
+static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal,
+                                 Value *FalseVal, const Query &Q,
+                                 unsigned MaxRecurse) {
+  // select true, X, Y  -> X
+  // select false, X, Y -> Y
+  if (ConstantInt *CB = dyn_cast<ConstantInt>(CondVal))
+    return CB->getZExtValue() ? TrueVal : FalseVal;
+
+  // select C, X, X -> X
+  if (TrueVal == FalseVal)
+    return TrueVal;
+
+  if (isa<UndefValue>(CondVal)) {  // select undef, X, Y -> X or Y
+    if (isa<Constant>(TrueVal))
+      return TrueVal;
+    return FalseVal;
+  }
+  if (isa<UndefValue>(TrueVal))   // select C, undef, X -> X
+    return FalseVal;
+  if (isa<UndefValue>(FalseVal))   // select C, X, undef -> X
+    return TrueVal;
+
+  return 0;
+}
+
+Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
+                                const DataLayout *TD,
+                                const TargetLibraryInfo *TLI,
+                                const DominatorTree *DT) {
+  return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Query (TD, TLI, DT),
+                              RecursionLimit);
+}
+
+/// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyGEPInst(ArrayRef<Value *> Ops, const Query &Q, unsigned) {
+  // The type of the GEP pointer operand.
+  PointerType *PtrTy = dyn_cast<PointerType>(Ops[0]->getType());
+  // The GEP pointer operand is not a pointer, it's a vector of pointers.
+  if (!PtrTy)
+    return 0;
+
+  // getelementptr P -> P.
+  if (Ops.size() == 1)
+    return Ops[0];
+
+  if (isa<UndefValue>(Ops[0])) {
+    // Compute the (pointer) type returned by the GEP instruction.
+    Type *LastType = GetElementPtrInst::getIndexedType(PtrTy, Ops.slice(1));
+    Type *GEPTy = PointerType::get(LastType, PtrTy->getAddressSpace());
+    return UndefValue::get(GEPTy);
+  }
+
+  if (Ops.size() == 2) {
+    // getelementptr P, 0 -> P.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(Ops[1]))
+      if (C->isZero())
+        return Ops[0];
+    // getelementptr P, N -> P if P points to a type of zero size.
+    if (Q.TD) {
+      Type *Ty = PtrTy->getElementType();
+      if (Ty->isSized() && Q.TD->getTypeAllocSize(Ty) == 0)
+        return Ops[0];
+    }
+  }
+
+  // Check to see if this is constant foldable.
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    if (!isa<Constant>(Ops[i]))
+      return 0;
+
+  return ConstantExpr::getGetElementPtr(cast<Constant>(Ops[0]), Ops.slice(1));
+}
+
+Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout *TD,
+                             const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifyGEPInst(Ops, Query (TD, TLI, DT), RecursionLimit);
+}
+
+/// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we
+/// can fold the result.  If not, this returns null.
+static Value *SimplifyInsertValueInst(Value *Agg, Value *Val,
+                                      ArrayRef<unsigned> Idxs, const Query &Q,
+                                      unsigned) {
+  if (Constant *CAgg = dyn_cast<Constant>(Agg))
+    if (Constant *CVal = dyn_cast<Constant>(Val))
+      return ConstantFoldInsertValueInstruction(CAgg, CVal, Idxs);
+
+  // insertvalue x, undef, n -> x
+  if (match(Val, m_Undef()))
+    return Agg;
+
+  // insertvalue x, (extractvalue y, n), n
+  if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Val))
+    if (EV->getAggregateOperand()->getType() == Agg->getType() &&
+        EV->getIndices() == Idxs) {
+      // insertvalue undef, (extractvalue y, n), n -> y
+      if (match(Agg, m_Undef()))
+        return EV->getAggregateOperand();
+
+      // insertvalue y, (extractvalue y, n), n -> y
+      if (Agg == EV->getAggregateOperand())
+        return Agg;
+    }
+
+  return 0;
+}
+
+Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val,
+                                     ArrayRef<unsigned> Idxs,
+                                     const DataLayout *TD,
+                                     const TargetLibraryInfo *TLI,
+                                     const DominatorTree *DT) {
+  return ::SimplifyInsertValueInst(Agg, Val, Idxs, Query (TD, TLI, DT),
+                                   RecursionLimit);
+}
+
+/// SimplifyPHINode - See if we can fold the given phi.  If not, returns null.
+static Value *SimplifyPHINode(PHINode *PN, const Query &Q) {
+  // If all of the PHI's incoming values are the same then replace the PHI node
+  // with the common value.
+  Value *CommonValue = 0;
+  bool HasUndefInput = false;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    Value *Incoming = PN->getIncomingValue(i);
+    // If the incoming value is the phi node itself, it can safely be skipped.
+    if (Incoming == PN) continue;
+    if (isa<UndefValue>(Incoming)) {
+      // Remember that we saw an undef value, but otherwise ignore them.
+      HasUndefInput = true;
+      continue;
+    }
+    if (CommonValue && Incoming != CommonValue)
+      return 0;  // Not the same, bail out.
+    CommonValue = Incoming;
+  }
+
+  // If CommonValue is null then all of the incoming values were either undef or
+  // equal to the phi node itself.
+  if (!CommonValue)
+    return UndefValue::get(PN->getType());
+
+  // If we have a PHI node like phi(X, undef, X), where X is defined by some
+  // instruction, we cannot return X as the result of the PHI node unless it
+  // dominates the PHI block.
+  if (HasUndefInput)
+    return ValueDominatesPHI(CommonValue, PN, Q.DT) ? CommonValue : 0;
+
+  return CommonValue;
+}
+
+static Value *SimplifyTruncInst(Value *Op, Type *Ty, const Query &Q, unsigned) {
+  if (Constant *C = dyn_cast<Constant>(Op))
+    return ConstantFoldInstOperands(Instruction::Trunc, Ty, C, Q.TD, Q.TLI);
+
+  return 0;
+}
+
+Value *llvm::SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *TD,
+                               const TargetLibraryInfo *TLI,
+                               const DominatorTree *DT) {
+  return ::SimplifyTruncInst(Op, Ty, Query (TD, TLI, DT), RecursionLimit);
+}
+
+//=== Helper functions for higher up the class hierarchy.
+
+/// SimplifyBinOp - Given operands for a BinaryOperator, see if we can
+/// fold the result.  If not, this returns null.
+static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
+                            const Query &Q, unsigned MaxRecurse) {
+  switch (Opcode) {
+  case Instruction::Add:
+    return SimplifyAddInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
+                           Q, MaxRecurse);
+  case Instruction::FAdd:
+    return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+
+  case Instruction::Sub:
+    return SimplifySubInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
+                           Q, MaxRecurse);
+  case Instruction::FSub:
+    return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+
+  case Instruction::Mul:  return SimplifyMulInst (LHS, RHS, Q, MaxRecurse);
+  case Instruction::FMul:
+    return SimplifyFMulInst (LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+  case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::SRem: return SimplifySRemInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::URem: return SimplifyURemInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::FRem: return SimplifyFRemInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::Shl:
+    return SimplifyShlInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
+                           Q, MaxRecurse);
+  case Instruction::LShr:
+    return SimplifyLShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse);
+  case Instruction::AShr:
+    return SimplifyAShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse);
+  case Instruction::And: return SimplifyAndInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::Or:  return SimplifyOrInst (LHS, RHS, Q, MaxRecurse);
+  case Instruction::Xor: return SimplifyXorInst(LHS, RHS, Q, MaxRecurse);
+  default:
+    if (Constant *CLHS = dyn_cast<Constant>(LHS))
+      if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
+        Constant *COps[] = {CLHS, CRHS};
+        return ConstantFoldInstOperands(Opcode, LHS->getType(), COps, Q.TD,
+                                        Q.TLI);
+      }
+
+    // If the operation is associative, try some generic simplifications.
+    if (Instruction::isAssociative(Opcode))
+      if (Value *V = SimplifyAssociativeBinOp(Opcode, LHS, RHS, Q, MaxRecurse))
+        return V;
+
+    // If the operation is with the result of a select instruction check whether
+    // operating on either branch of the select always yields the same value.
+    if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
+      if (Value *V = ThreadBinOpOverSelect(Opcode, LHS, RHS, Q, MaxRecurse))
+        return V;
+
+    // If the operation is with the result of a phi instruction, check whether
+    // operating on all incoming values of the phi always yields the same value.
+    if (isa<PHINode>(LHS) || isa<PHINode>(RHS))
+      if (Value *V = ThreadBinOpOverPHI(Opcode, LHS, RHS, Q, MaxRecurse))
+        return V;
+
+    return 0;
+  }
+}
+
+Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
+                           const DataLayout *TD, const TargetLibraryInfo *TLI,
+                           const DominatorTree *DT) {
+  return ::SimplifyBinOp(Opcode, LHS, RHS, Query (TD, TLI, DT), RecursionLimit);
+}
+
+/// SimplifyCmpInst - Given operands for a CmpInst, see if we can
+/// fold the result.
+static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
+                              const Query &Q, unsigned MaxRecurse) {
+  if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate))
+    return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse);
+  return SimplifyFCmpInst(Predicate, LHS, RHS, Q, MaxRecurse);
+}
+
+Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
+                             const DataLayout *TD, const TargetLibraryInfo *TLI,
+                             const DominatorTree *DT) {
+  return ::SimplifyCmpInst(Predicate, LHS, RHS, Query (TD, TLI, DT),
+                           RecursionLimit);
+}
+
+static bool IsIdempotent(Intrinsic::ID ID) {
+  switch (ID) {
+  default: return false;
+
+  // Unary idempotent: f(f(x)) = f(x)
+  case Intrinsic::fabs:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint:
+    return true;
+  }
+}
+
+template <typename IterTy>
+static Value *SimplifyIntrinsic(Intrinsic::ID IID, IterTy ArgBegin, IterTy ArgEnd,
+                                const Query &Q, unsigned MaxRecurse) {
+  // Perform idempotent optimizations
+  if (!IsIdempotent(IID))
+    return 0;
+
+  // Unary Ops
+  if (std::distance(ArgBegin, ArgEnd) == 1)
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*ArgBegin))
+      if (II->getIntrinsicID() == IID)
+        return II;
+
+  return 0;
+}
+
+template <typename IterTy>
+static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd,
+                           const Query &Q, unsigned MaxRecurse) {
+  Type *Ty = V->getType();
+  if (PointerType *PTy = dyn_cast<PointerType>(Ty))
+    Ty = PTy->getElementType();
+  FunctionType *FTy = cast<FunctionType>(Ty);
+
+  // call undef -> undef
+  if (isa<UndefValue>(V))
+    return UndefValue::get(FTy->getReturnType());
+
+  Function *F = dyn_cast<Function>(V);
+  if (!F)
+    return 0;
+
+  if (unsigned IID = F->getIntrinsicID())
+    if (Value *Ret =
+        SimplifyIntrinsic((Intrinsic::ID) IID, ArgBegin, ArgEnd, Q, MaxRecurse))
+      return Ret;
+
+  if (!canConstantFoldCallTo(F))
+    return 0;
+
+  SmallVector<Constant *, 4> ConstantArgs;
+  ConstantArgs.reserve(ArgEnd - ArgBegin);
+  for (IterTy I = ArgBegin, E = ArgEnd; I != E; ++I) {
+    Constant *C = dyn_cast<Constant>(*I);
+    if (!C)
+      return 0;
+    ConstantArgs.push_back(C);
+  }
+
+  return ConstantFoldCall(F, ConstantArgs, Q.TLI);
+}
+
+Value *llvm::SimplifyCall(Value *V, User::op_iterator ArgBegin,
+                          User::op_iterator ArgEnd, const DataLayout *TD,
+                          const TargetLibraryInfo *TLI,
+                          const DominatorTree *DT) {
+  return ::SimplifyCall(V, ArgBegin, ArgEnd, Query(TD, TLI, DT),
+                        RecursionLimit);
+}
+
+Value *llvm::SimplifyCall(Value *V, ArrayRef<Value *> Args,
+                          const DataLayout *TD, const TargetLibraryInfo *TLI,
+                          const DominatorTree *DT) {
+  return ::SimplifyCall(V, Args.begin(), Args.end(), Query(TD, TLI, DT),
+                        RecursionLimit);
+}
+
+/// SimplifyInstruction - See if we can compute a simplified version of this
+/// instruction.  If not, this returns null.
+Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout *TD,
+                                 const TargetLibraryInfo *TLI,
+                                 const DominatorTree *DT) {
+  Value *Result;
+
+  switch (I->getOpcode()) {
+  default:
+    Result = ConstantFoldInstruction(I, TD, TLI);
+    break;
+  case Instruction::FAdd:
+    Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1),
+                              I->getFastMathFlags(), TD, TLI, DT);
+    break;
+  case Instruction::Add:
+    Result = SimplifyAddInst(I->getOperand(0), I->getOperand(1),
+                             cast<BinaryOperator>(I)->hasNoSignedWrap(),
+                             cast<BinaryOperator>(I)->hasNoUnsignedWrap(),
+                             TD, TLI, DT);
+    break;
+  case Instruction::FSub:
+    Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1),
+                              I->getFastMathFlags(), TD, TLI, DT);
+    break;
+  case Instruction::Sub:
+    Result = SimplifySubInst(I->getOperand(0), I->getOperand(1),
+                             cast<BinaryOperator>(I)->hasNoSignedWrap(),
+                             cast<BinaryOperator>(I)->hasNoUnsignedWrap(),
+                             TD, TLI, DT);
+    break;
+  case Instruction::FMul:
+    Result = SimplifyFMulInst(I->getOperand(0), I->getOperand(1),
+                              I->getFastMathFlags(), TD, TLI, DT);
+    break;
+  case Instruction::Mul:
+    Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::SDiv:
+    Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::UDiv:
+    Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::FDiv:
+    Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::SRem:
+    Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::URem:
+    Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::FRem:
+    Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::Shl:
+    Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1),
+                             cast<BinaryOperator>(I)->hasNoSignedWrap(),
+                             cast<BinaryOperator>(I)->hasNoUnsignedWrap(),
+                             TD, TLI, DT);
+    break;
+  case Instruction::LShr:
+    Result = SimplifyLShrInst(I->getOperand(0), I->getOperand(1),
+                              cast<BinaryOperator>(I)->isExact(),
+                              TD, TLI, DT);
+    break;
+  case Instruction::AShr:
+    Result = SimplifyAShrInst(I->getOperand(0), I->getOperand(1),
+                              cast<BinaryOperator>(I)->isExact(),
+                              TD, TLI, DT);
+    break;
+  case Instruction::And:
+    Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::Or:
+    Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::Xor:
+    Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::ICmp:
+    Result = SimplifyICmpInst(cast<ICmpInst>(I)->getPredicate(),
+                              I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::FCmp:
+    Result = SimplifyFCmpInst(cast<FCmpInst>(I)->getPredicate(),
+                              I->getOperand(0), I->getOperand(1), TD, TLI, DT);
+    break;
+  case Instruction::Select:
+    Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1),
+                                I->getOperand(2), TD, TLI, DT);
+    break;
+  case Instruction::GetElementPtr: {
+    SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
+    Result = SimplifyGEPInst(Ops, TD, TLI, DT);
+    break;
+  }
+  case Instruction::InsertValue: {
+    InsertValueInst *IV = cast<InsertValueInst>(I);
+    Result = SimplifyInsertValueInst(IV->getAggregateOperand(),
+                                     IV->getInsertedValueOperand(),
+                                     IV->getIndices(), TD, TLI, DT);
+    break;
+  }
+  case Instruction::PHI:
+    Result = SimplifyPHINode(cast<PHINode>(I), Query (TD, TLI, DT));
+    break;
+  case Instruction::Call: {
+    CallSite CS(cast<CallInst>(I));
+    Result = SimplifyCall(CS.getCalledValue(), CS.arg_begin(), CS.arg_end(),
+                          TD, TLI, DT);
+    break;
+  }
+  case Instruction::Trunc:
+    Result = SimplifyTruncInst(I->getOperand(0), I->getType(), TD, TLI, DT);
+    break;
+  }
+
+  /// If called on unreachable code, the above logic may report that the
+  /// instruction simplified to itself.  Make life easier for users by
+  /// detecting that case here, returning a safe value instead.
+  return Result == I ? UndefValue::get(I->getType()) : Result;
+}
+
+/// \brief Implementation of recursive simplification through an instructions
+/// uses.
+///
+/// This is the common implementation of the recursive simplification routines.
+/// If we have a pre-simplified value in 'SimpleV', that is forcibly used to
+/// replace the instruction 'I'. Otherwise, we simply add 'I' to the list of
+/// instructions to process and attempt to simplify it using
+/// InstructionSimplify.
+///
+/// This routine returns 'true' only when *it* simplifies something. The passed
+/// in simplified value does not count toward this.
+static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
+                                              const DataLayout *TD,
+                                              const TargetLibraryInfo *TLI,
+                                              const DominatorTree *DT) {
+  bool Simplified = false;
+  SmallSetVector<Instruction *, 8> Worklist;
+
+  // If we have an explicit value to collapse to, do that round of the
+  // simplification loop by hand initially.
+  if (SimpleV) {
+    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
+         ++UI)
+      if (*UI != I)
+        Worklist.insert(cast<Instruction>(*UI));
+
+    // Replace the instruction with its simplified value.
+    I->replaceAllUsesWith(SimpleV);
+
+    // Gracefully handle edge cases where the instruction is not wired into any
+    // parent block.
+    if (I->getParent())
+      I->eraseFromParent();
+  } else {
+    Worklist.insert(I);
+  }
+
+  // Note that we must test the size on each iteration, the worklist can grow.
+  for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+    I = Worklist[Idx];
+
+    // See if this instruction simplifies.
+    SimpleV = SimplifyInstruction(I, TD, TLI, DT);
+    if (!SimpleV)
+      continue;
+
+    Simplified = true;
+
+    // Stash away all the uses of the old instruction so we can check them for
+    // recursive simplifications after a RAUW. This is cheaper than checking all
+    // uses of To on the recursive step in most cases.
+    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
+         ++UI)
+      Worklist.insert(cast<Instruction>(*UI));
+
+    // Replace the instruction with its simplified value.
+    I->replaceAllUsesWith(SimpleV);
+
+    // Gracefully handle edge cases where the instruction is not wired into any
+    // parent block.
+    if (I->getParent())
+      I->eraseFromParent();
+  }
+  return Simplified;
+}
+
+bool llvm::recursivelySimplifyInstruction(Instruction *I,
+                                          const DataLayout *TD,
+                                          const TargetLibraryInfo *TLI,
+                                          const DominatorTree *DT) {
+  return replaceAndRecursivelySimplifyImpl(I, 0, TD, TLI, DT);
+}
+
+bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV,
+                                         const DataLayout *TD,
+                                         const TargetLibraryInfo *TLI,
+                                         const DominatorTree *DT) {
+  assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!");
+  assert(SimpleV && "Must provide a simplified value.");
+  return replaceAndRecursivelySimplifyImpl(I, SimpleV, TD, TLI, DT);
+}
diff --git a/contrib/llvm/lib/Analysis/Interval.cpp b/contrib/llvm/lib/Analysis/Interval.cpp
new file mode 100644
index 000000000000..26a0322407ec
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/Interval.cpp
@@ -0,0 +1,58 @@
+//===- Interval.cpp - Interval class code ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definition of the Interval class, which represents a
+// partition of a control flow graph of some kind.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Interval.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Interval Implementation
+//===----------------------------------------------------------------------===//
+
+// isLoop - Find out if there is a back edge in this interval...
+//
+bool Interval::isLoop() const {
+  // There is a loop in this interval iff one of the predecessors of the header
+  // node lives in the interval.
+  for (::pred_iterator I = ::pred_begin(HeaderNode), E = ::pred_end(HeaderNode);
+       I != E; ++I)
+    if (contains(*I))
+      return true;
+  return false;
+}
+
+
+void Interval::print(raw_ostream &OS) const {
+  OS << "-------------------------------------------------------------\n"
+       << "Interval Contents:\n";
+
+  // Print out all of the basic blocks in the interval...
+  for (std::vector<BasicBlock*>::const_iterator I = Nodes.begin(),
+         E = Nodes.end(); I != E; ++I)
+    OS << **I << "\n";
+
+  OS << "Interval Predecessors:\n";
+  for (std::vector<BasicBlock*>::const_iterator I = Predecessors.begin(),
+         E = Predecessors.end(); I != E; ++I)
+    OS << **I << "\n";
+
+  OS << "Interval Successors:\n";
+  for (std::vector<BasicBlock*>::const_iterator I = Successors.begin(),
+         E = Successors.end(); I != E; ++I)
+    OS << **I << "\n";
+}
diff --git a/contrib/llvm/lib/Analysis/IntervalPartition.cpp b/contrib/llvm/lib/Analysis/IntervalPartition.cpp
new file mode 100644
index 000000000000..2e259b147b8b
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/IntervalPartition.cpp
@@ -0,0 +1,114 @@
+//===- IntervalPartition.cpp - Interval Partition module code -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definition of the IntervalPartition class, which
+// calculates and represent the interval partition of a function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/IntervalIterator.h"
+using namespace llvm;
+
+char IntervalPartition::ID = 0;
+INITIALIZE_PASS(IntervalPartition, "intervals",
+                "Interval Partition Construction", true, true)
+
+//===----------------------------------------------------------------------===//
+// IntervalPartition Implementation
+//===----------------------------------------------------------------------===//
+
+// releaseMemory - Reset state back to before function was analyzed
+void IntervalPartition::releaseMemory() {
+  for (unsigned i = 0, e = Intervals.size(); i != e; ++i)
+    delete Intervals[i];
+  IntervalMap.clear();
+  Intervals.clear();
+  RootInterval = 0;
+}
+
+void IntervalPartition::print(raw_ostream &O, const Module*) const {
+  for(unsigned i = 0, e = Intervals.size(); i != e; ++i)
+    Intervals[i]->print(O);
+}
+
+// addIntervalToPartition - Add an interval to the internal list of intervals,
+// and then add mappings from all of the basic blocks in the interval to the
+// interval itself (in the IntervalMap).
+//
+void IntervalPartition::addIntervalToPartition(Interval *I) {
+  Intervals.push_back(I);
+
+  // Add mappings for all of the basic blocks in I to the IntervalPartition
+  for (Interval::node_iterator It = I->Nodes.begin(), End = I->Nodes.end();
+       It != End; ++It)
+    IntervalMap.insert(std::make_pair(*It, I));
+}
+
+// updatePredecessors - Interval generation only sets the successor fields of
+// the interval data structures.  After interval generation is complete,
+// run through all of the intervals and propagate successor info as
+// predecessor info.
+//
+void IntervalPartition::updatePredecessors(Interval *Int) {
+  BasicBlock *Header = Int->getHeaderNode();
+  for (Interval::succ_iterator I = Int->Successors.begin(),
+         E = Int->Successors.end(); I != E; ++I)
+    getBlockInterval(*I)->Predecessors.push_back(Header);
+}
+
+// IntervalPartition ctor - Build the first level interval partition for the
+// specified function...
+//
+bool IntervalPartition::runOnFunction(Function &F) {
+  // Pass false to intervals_begin because we take ownership of it's memory
+  function_interval_iterator I = intervals_begin(&F, false);
+  assert(I != intervals_end(&F) && "No intervals in function!?!?!");
+
+  addIntervalToPartition(RootInterval = *I);
+
+  ++I;  // After the first one...
+
+  // Add the rest of the intervals to the partition.
+  for (function_interval_iterator E = intervals_end(&F); I != E; ++I)
+    addIntervalToPartition(*I);
+
+  // Now that we know all of the successor information, propagate this to the
+  // predecessors for each block.
+  for (unsigned i = 0, e = Intervals.size(); i != e; ++i)
+    updatePredecessors(Intervals[i]);
+  return false;
+}
+
+
+// IntervalPartition ctor - Build a reduced interval partition from an
+// existing interval graph.  This takes an additional boolean parameter to
+// distinguish it from a copy constructor.  Always pass in false for now.
+//
+IntervalPartition::IntervalPartition(IntervalPartition &IP, bool)
+  : FunctionPass(ID) {
+  assert(IP.getRootInterval() && "Cannot operate on empty IntervalPartitions!");
+
+  // Pass false to intervals_begin because we take ownership of it's memory
+  interval_part_interval_iterator I = intervals_begin(IP, false);
+  assert(I != intervals_end(IP) && "No intervals in interval partition!?!?!");
+
+  addIntervalToPartition(RootInterval = *I);
+
+  ++I;  // After the first one...
+
+  // Add the rest of the intervals to the partition.
+  for (interval_part_interval_iterator E = intervals_end(IP); I != E; ++I)
+    addIntervalToPartition(*I);
+
+  // Now that we know all of the successor information, propagate this to the
+  // predecessors for each block.
+  for (unsigned i = 0, e = Intervals.size(); i != e; ++i)
+    updatePredecessors(Intervals[i]);
+}
+
diff --git a/contrib/llvm/lib/Analysis/LazyValueInfo.cpp b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp
new file mode 100644
index 000000000000..66b5e852c02f
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -0,0 +1,1143 @@
+//===- LazyValueInfo.cpp - Value constraint analysis ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface for lazy computation of value constraint
+// information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lazy-value-info"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include <map>
+#include <stack>
+using namespace llvm;
+using namespace PatternMatch;
+
+char LazyValueInfo::ID = 0;
+INITIALIZE_PASS_BEGIN(LazyValueInfo, "lazy-value-info",
+                "Lazy Value Information Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(LazyValueInfo, "lazy-value-info",
+                "Lazy Value Information Analysis", false, true)
+
+namespace llvm {
+  FunctionPass *createLazyValueInfoPass() { return new LazyValueInfo(); }
+}
+
+
+//===----------------------------------------------------------------------===//
+//                               LVILatticeVal
+//===----------------------------------------------------------------------===//
+
+/// LVILatticeVal - This is the information tracked by LazyValueInfo for each
+/// value.
+///
+/// FIXME: This is basically just for bringup, this can be made a lot more rich
+/// in the future.
+///
+namespace {
+class LVILatticeVal {
+  enum LatticeValueTy {
+    /// undefined - This Value has no known value yet.
+    undefined,
+    
+    /// constant - This Value has a specific constant value.
+    constant,
+    /// notconstant - This Value is known to not have the specified value.
+    notconstant,
+
+    /// constantrange - The Value falls within this range.
+    constantrange,
+
+    /// overdefined - This value is not known to be constant, and we know that
+    /// it has a value.
+    overdefined
+  };
+  
+  /// Val: This stores the current lattice value along with the Constant* for
+  /// the constant if this is a 'constant' or 'notconstant' value.
+  LatticeValueTy Tag;
+  Constant *Val;
+  ConstantRange Range;
+  
+public:
+  LVILatticeVal() : Tag(undefined), Val(0), Range(1, true) {}
+
+  static LVILatticeVal get(Constant *C) {
+    LVILatticeVal Res;
+    if (!isa<UndefValue>(C))
+      Res.markConstant(C);
+    return Res;
+  }
+  static LVILatticeVal getNot(Constant *C) {
+    LVILatticeVal Res;
+    if (!isa<UndefValue>(C))
+      Res.markNotConstant(C);
+    return Res;
+  }
+  static LVILatticeVal getRange(ConstantRange CR) {
+    LVILatticeVal Res;
+    Res.markConstantRange(CR);
+    return Res;
+  }
+  
+  bool isUndefined() const     { return Tag == undefined; }
+  bool isConstant() const      { return Tag == constant; }
+  bool isNotConstant() const   { return Tag == notconstant; }
+  bool isConstantRange() const { return Tag == constantrange; }
+  bool isOverdefined() const   { return Tag == overdefined; }
+  
+  Constant *getConstant() const {
+    assert(isConstant() && "Cannot get the constant of a non-constant!");
+    return Val;
+  }
+  
+  Constant *getNotConstant() const {
+    assert(isNotConstant() && "Cannot get the constant of a non-notconstant!");
+    return Val;
+  }
+  
+  ConstantRange getConstantRange() const {
+    assert(isConstantRange() &&
+           "Cannot get the constant-range of a non-constant-range!");
+    return Range;
+  }
+  
+  /// markOverdefined - Return true if this is a change in status.
+  bool markOverdefined() {
+    if (isOverdefined())
+      return false;
+    Tag = overdefined;
+    return true;
+  }
+
+  /// markConstant - Return true if this is a change in status.
+  bool markConstant(Constant *V) {
+    assert(V && "Marking constant with NULL");
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
+      return markConstantRange(ConstantRange(CI->getValue()));
+    if (isa<UndefValue>(V))
+      return false;
+
+    assert((!isConstant() || getConstant() == V) &&
+           "Marking constant with different value");
+    assert(isUndefined());
+    Tag = constant;
+    Val = V;
+    return true;
+  }
+  
+  /// markNotConstant - Return true if this is a change in status.
+  bool markNotConstant(Constant *V) {
+    assert(V && "Marking constant with NULL");
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
+      return markConstantRange(ConstantRange(CI->getValue()+1, CI->getValue()));
+    if (isa<UndefValue>(V))
+      return false;
+
+    assert((!isConstant() || getConstant() != V) &&
+           "Marking constant !constant with same value");
+    assert((!isNotConstant() || getNotConstant() == V) &&
+           "Marking !constant with different value");
+    assert(isUndefined() || isConstant());
+    Tag = notconstant;
+    Val = V;
+    return true;
+  }
+  
+  /// markConstantRange - Return true if this is a change in status.
+  bool markConstantRange(const ConstantRange NewR) {
+    if (isConstantRange()) {
+      if (NewR.isEmptySet())
+        return markOverdefined();
+      
+      bool changed = Range != NewR;
+      Range = NewR;
+      return changed;
+    }
+    
+    assert(isUndefined());
+    if (NewR.isEmptySet())
+      return markOverdefined();
+    
+    Tag = constantrange;
+    Range = NewR;
+    return true;
+  }
+  
+  /// mergeIn - Merge the specified lattice value into this one, updating this
+  /// one and returning true if anything changed.
+  bool mergeIn(const LVILatticeVal &RHS) {
+    if (RHS.isUndefined() || isOverdefined()) return false;
+    if (RHS.isOverdefined()) return markOverdefined();
+
+    if (isUndefined()) {
+      Tag = RHS.Tag;
+      Val = RHS.Val;
+      Range = RHS.Range;
+      return true;
+    }
+
+    if (isConstant()) {
+      if (RHS.isConstant()) {
+        if (Val == RHS.Val)
+          return false;
+        return markOverdefined();
+      }
+
+      if (RHS.isNotConstant()) {
+        if (Val == RHS.Val)
+          return markOverdefined();
+
+        // Unless we can prove that the two Constants are different, we must
+        // move to overdefined.
+        // FIXME: use DataLayout/TargetLibraryInfo for smarter constant folding.
+        if (ConstantInt *Res = dyn_cast<ConstantInt>(
+                ConstantFoldCompareInstOperands(CmpInst::ICMP_NE,
+                                                getConstant(),
+                                                RHS.getNotConstant())))
+          if (Res->isOne())
+            return markNotConstant(RHS.getNotConstant());
+
+        return markOverdefined();
+      }
+
+      // RHS is a ConstantRange, LHS is a non-integer Constant.
+
+      // FIXME: consider the case where RHS is a range [1, 0) and LHS is
+      // a function. The correct result is to pick up RHS.
+
+      return markOverdefined();
+    }
+
+    if (isNotConstant()) {
+      if (RHS.isConstant()) {
+        if (Val == RHS.Val)
+          return markOverdefined();
+
+        // Unless we can prove that the two Constants are different, we must
+        // move to overdefined.
+        // FIXME: use DataLayout/TargetLibraryInfo for smarter constant folding.
+        if (ConstantInt *Res = dyn_cast<ConstantInt>(
+                ConstantFoldCompareInstOperands(CmpInst::ICMP_NE,
+                                                getNotConstant(),
+                                                RHS.getConstant())))
+          if (Res->isOne())
+            return false;
+
+        return markOverdefined();
+      }
+
+      if (RHS.isNotConstant()) {
+        if (Val == RHS.Val)
+          return false;
+        return markOverdefined();
+      }
+
+      return markOverdefined();
+    }
+
+    assert(isConstantRange() && "New LVILattice type?");
+    if (!RHS.isConstantRange())
+      return markOverdefined();
+
+    ConstantRange NewR = Range.unionWith(RHS.getConstantRange());
+    if (NewR.isFullSet())
+      return markOverdefined();
+    return markConstantRange(NewR);
+  }
+};
+  
+} // end anonymous namespace.
+
+namespace llvm {
+raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val)
+    LLVM_ATTRIBUTE_USED;
+raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) {
+  if (Val.isUndefined())
+    return OS << "undefined";
+  if (Val.isOverdefined())
+    return OS << "overdefined";
+
+  if (Val.isNotConstant())
+    return OS << "notconstant<" << *Val.getNotConstant() << '>';
+  else if (Val.isConstantRange())
+    return OS << "constantrange<" << Val.getConstantRange().getLower() << ", "
+              << Val.getConstantRange().getUpper() << '>';
+  return OS << "constant<" << *Val.getConstant() << '>';
+}
+}
+
+//===----------------------------------------------------------------------===//
+//                          LazyValueInfoCache Decl
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// LVIValueHandle - A callback value handle updates the cache when
+  /// values are erased.
+  class LazyValueInfoCache;
+  struct LVIValueHandle : public CallbackVH {
+    LazyValueInfoCache *Parent;
+      
+    LVIValueHandle(Value *V, LazyValueInfoCache *P)
+      : CallbackVH(V), Parent(P) { }
+      
+    void deleted();
+    void allUsesReplacedWith(Value *V) {
+      deleted();
+    }
+  };
+}
+
+namespace { 
+  /// LazyValueInfoCache - This is the cache kept by LazyValueInfo which
+  /// maintains information about queries across the clients' queries.
+  class LazyValueInfoCache {
+    /// ValueCacheEntryTy - This is all of the cached block information for
+    /// exactly one Value*.  The entries are sorted by the BasicBlock* of the
+    /// entries, allowing us to do a lookup with a binary search.
+    typedef std::map<AssertingVH<BasicBlock>, LVILatticeVal> ValueCacheEntryTy;
+
+    /// ValueCache - This is all of the cached information for all values,
+    /// mapped from Value* to key information.
+    std::map<LVIValueHandle, ValueCacheEntryTy> ValueCache;
+    
+    /// OverDefinedCache - This tracks, on a per-block basis, the set of 
+    /// values that are over-defined at the end of that block.  This is required
+    /// for cache updating.
+    typedef std::pair<AssertingVH<BasicBlock>, Value*> OverDefinedPairTy;
+    DenseSet<OverDefinedPairTy> OverDefinedCache;
+
+    /// SeenBlocks - Keep track of all blocks that we have ever seen, so we
+    /// don't spend time removing unused blocks from our caches.
+    DenseSet<AssertingVH<BasicBlock> > SeenBlocks;
+
+    /// BlockValueStack - This stack holds the state of the value solver
+    /// during a query.  It basically emulates the callstack of the naive
+    /// recursive value lookup process.
+    std::stack<std::pair<BasicBlock*, Value*> > BlockValueStack;
+    
+    friend struct LVIValueHandle;
+    
+    /// OverDefinedCacheUpdater - A helper object that ensures that the
+    /// OverDefinedCache is updated whenever solveBlockValue returns.
+    struct OverDefinedCacheUpdater {
+      LazyValueInfoCache *Parent;
+      Value *Val;
+      BasicBlock *BB;
+      LVILatticeVal &BBLV;
+      
+      OverDefinedCacheUpdater(Value *V, BasicBlock *B, LVILatticeVal &LV,
+                       LazyValueInfoCache *P)
+        : Parent(P), Val(V), BB(B), BBLV(LV) { }
+      
+      bool markResult(bool changed) { 
+        if (changed && BBLV.isOverdefined())
+          Parent->OverDefinedCache.insert(std::make_pair(BB, Val));
+        return changed;
+      }
+    };
+    
+
+
+    LVILatticeVal getBlockValue(Value *Val, BasicBlock *BB);
+    bool getEdgeValue(Value *V, BasicBlock *F, BasicBlock *T,
+                      LVILatticeVal &Result);
+    bool hasBlockValue(Value *Val, BasicBlock *BB);
+
+    // These methods process one work item and may add more. A false value
+    // returned means that the work item was not completely processed and must
+    // be revisited after going through the new items.
+    bool solveBlockValue(Value *Val, BasicBlock *BB);
+    bool solveBlockValueNonLocal(LVILatticeVal &BBLV,
+                                 Value *Val, BasicBlock *BB);
+    bool solveBlockValuePHINode(LVILatticeVal &BBLV,
+                                PHINode *PN, BasicBlock *BB);
+    bool solveBlockValueConstantRange(LVILatticeVal &BBLV,
+                                      Instruction *BBI, BasicBlock *BB);
+
+    void solve();
+    
+    ValueCacheEntryTy &lookup(Value *V) {
+      return ValueCache[LVIValueHandle(V, this)];
+    }
+
+  public:
+    /// getValueInBlock - This is the query interface to determine the lattice
+    /// value for the specified Value* at the end of the specified block.
+    LVILatticeVal getValueInBlock(Value *V, BasicBlock *BB);
+
+    /// getValueOnEdge - This is the query interface to determine the lattice
+    /// value for the specified Value* that is true on the specified edge.
+    LVILatticeVal getValueOnEdge(Value *V, BasicBlock *FromBB,BasicBlock *ToBB);
+    
+    /// threadEdge - This is the update interface to inform the cache that an
+    /// edge from PredBB to OldSucc has been threaded to be from PredBB to
+    /// NewSucc.
+    void threadEdge(BasicBlock *PredBB,BasicBlock *OldSucc,BasicBlock *NewSucc);
+    
+    /// eraseBlock - This is part of the update interface to inform the cache
+    /// that a block has been deleted.
+    void eraseBlock(BasicBlock *BB);
+    
+    /// clear - Empty the cache.
+    void clear() {
+      SeenBlocks.clear();
+      ValueCache.clear();
+      OverDefinedCache.clear();
+    }
+  };
+} // end anonymous namespace
+
+void LVIValueHandle::deleted() {
+  typedef std::pair<AssertingVH<BasicBlock>, Value*> OverDefinedPairTy;
+  
+  SmallVector<OverDefinedPairTy, 4> ToErase;
+  for (DenseSet<OverDefinedPairTy>::iterator 
+       I = Parent->OverDefinedCache.begin(),
+       E = Parent->OverDefinedCache.end();
+       I != E; ++I) {
+    if (I->second == getValPtr())
+      ToErase.push_back(*I);
+  }
+  
+  for (SmallVector<OverDefinedPairTy, 4>::iterator I = ToErase.begin(),
+       E = ToErase.end(); I != E; ++I)
+    Parent->OverDefinedCache.erase(*I);
+  
+  // This erasure deallocates *this, so it MUST happen after we're done
+  // using any and all members of *this.
+  Parent->ValueCache.erase(*this);
+}
+
+void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
+  // Shortcut if we have never seen this block.
+  DenseSet<AssertingVH<BasicBlock> >::iterator I = SeenBlocks.find(BB);
+  if (I == SeenBlocks.end())
+    return;
+  SeenBlocks.erase(I);
+
+  SmallVector<OverDefinedPairTy, 4> ToErase;
+  for (DenseSet<OverDefinedPairTy>::iterator  I = OverDefinedCache.begin(),
+       E = OverDefinedCache.end(); I != E; ++I) {
+    if (I->first == BB)
+      ToErase.push_back(*I);
+  }
+  
+  for (SmallVector<OverDefinedPairTy, 4>::iterator I = ToErase.begin(),
+       E = ToErase.end(); I != E; ++I)
+    OverDefinedCache.erase(*I);
+
+  for (std::map<LVIValueHandle, ValueCacheEntryTy>::iterator
+       I = ValueCache.begin(), E = ValueCache.end(); I != E; ++I)
+    I->second.erase(BB);
+}
+
+void LazyValueInfoCache::solve() {
+  while (!BlockValueStack.empty()) {
+    std::pair<BasicBlock*, Value*> &e = BlockValueStack.top();
+    if (solveBlockValue(e.second, e.first)) {
+      assert(BlockValueStack.top() == e);
+      BlockValueStack.pop();
+    }
+  }
+}
+
+bool LazyValueInfoCache::hasBlockValue(Value *Val, BasicBlock *BB) {
+  // If already a constant, there is nothing to compute.
+  if (isa<Constant>(Val))
+    return true;
+
+  LVIValueHandle ValHandle(Val, this);
+  std::map<LVIValueHandle, ValueCacheEntryTy>::iterator I =
+    ValueCache.find(ValHandle);
+  if (I == ValueCache.end()) return false;
+  return I->second.count(BB);
+}
+
+LVILatticeVal LazyValueInfoCache::getBlockValue(Value *Val, BasicBlock *BB) {
+  // If already a constant, there is nothing to compute.
+  if (Constant *VC = dyn_cast<Constant>(Val))
+    return LVILatticeVal::get(VC);
+
+  SeenBlocks.insert(BB);
+  return lookup(Val)[BB];
+}
+
+bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) {
+  if (isa<Constant>(Val))
+    return true;
+
+  ValueCacheEntryTy &Cache = lookup(Val);
+  SeenBlocks.insert(BB);
+  LVILatticeVal &BBLV = Cache[BB];
+  
+  // OverDefinedCacheUpdater is a helper object that will update
+  // the OverDefinedCache for us when this method exits.  Make sure to
+  // call markResult on it as we exist, passing a bool to indicate if the
+  // cache needs updating, i.e. if we have solve a new value or not.
+  OverDefinedCacheUpdater ODCacheUpdater(Val, BB, BBLV, this);
+
+  // If we've already computed this block's value, return it.
+  if (!BBLV.isUndefined()) {
+    DEBUG(dbgs() << "  reuse BB '" << BB->getName() << "' val=" << BBLV <<'\n');
+    
+    // Since we're reusing a cached value here, we don't need to update the 
+    // OverDefinedCahce.  The cache will have been properly updated 
+    // whenever the cached value was inserted.
+    ODCacheUpdater.markResult(false);
+    return true;
+  }
+
+  // Otherwise, this is the first time we're seeing this block.  Reset the
+  // lattice value to overdefined, so that cycles will terminate and be
+  // conservatively correct.
+  BBLV.markOverdefined();
+  
+  Instruction *BBI = dyn_cast<Instruction>(Val);
+  if (BBI == 0 || BBI->getParent() != BB) {
+    return ODCacheUpdater.markResult(solveBlockValueNonLocal(BBLV, Val, BB));
+  }
+
+  if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
+    return ODCacheUpdater.markResult(solveBlockValuePHINode(BBLV, PN, BB));
+  }
+
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(BBI)) {
+    BBLV = LVILatticeVal::getNot(ConstantPointerNull::get(AI->getType()));
+    return ODCacheUpdater.markResult(true);
+  }
+
+  // We can only analyze the definitions of certain classes of instructions
+  // (integral binops and casts at the moment), so bail if this isn't one.
+  LVILatticeVal Result;
+  if ((!isa<BinaryOperator>(BBI) && !isa<CastInst>(BBI)) ||
+     !BBI->getType()->isIntegerTy()) {
+    DEBUG(dbgs() << " compute BB '" << BB->getName()
+                 << "' - overdefined because inst def found.\n");
+    BBLV.markOverdefined();
+    return ODCacheUpdater.markResult(true);
+  }
+
+  // FIXME: We're currently limited to binops with a constant RHS.  This should
+  // be improved.
+  BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI);
+  if (BO && !isa<ConstantInt>(BO->getOperand(1))) { 
+    DEBUG(dbgs() << " compute BB '" << BB->getName()
+                 << "' - overdefined because inst def found.\n");
+
+    BBLV.markOverdefined();
+    return ODCacheUpdater.markResult(true);
+  }
+
+  return ODCacheUpdater.markResult(solveBlockValueConstantRange(BBLV, BBI, BB));
+}
+
+static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) {
+  if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+    return L->getPointerAddressSpace() == 0 &&
+        GetUnderlyingObject(L->getPointerOperand()) == Ptr;
+  }
+  if (StoreInst *S = dyn_cast<StoreInst>(I)) {
+    return S->getPointerAddressSpace() == 0 &&
+        GetUnderlyingObject(S->getPointerOperand()) == Ptr;
+  }
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
+    if (MI->isVolatile()) return false;
+
+    // FIXME: check whether it has a valuerange that excludes zero?
+    ConstantInt *Len = dyn_cast<ConstantInt>(MI->getLength());
+    if (!Len || Len->isZero()) return false;
+
+    if (MI->getDestAddressSpace() == 0)
+      if (GetUnderlyingObject(MI->getRawDest()) == Ptr)
+        return true;
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
+      if (MTI->getSourceAddressSpace() == 0)
+        if (GetUnderlyingObject(MTI->getRawSource()) == Ptr)
+          return true;
+  }
+  return false;
+}
+
+bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV,
+                                                 Value *Val, BasicBlock *BB) {
+  LVILatticeVal Result;  // Start Undefined.
+
+  // If this is a pointer, and there's a load from that pointer in this BB,
+  // then we know that the pointer can't be NULL.
+  bool NotNull = false;
+  if (Val->getType()->isPointerTy()) {
+    if (isKnownNonNull(Val)) {
+      NotNull = true;
+    } else {
+      Value *UnderlyingVal = GetUnderlyingObject(Val);
+      // If 'GetUnderlyingObject' didn't converge, skip it. It won't converge
+      // inside InstructionDereferencesPointer either.
+      if (UnderlyingVal == GetUnderlyingObject(UnderlyingVal, NULL, 1)) {
+        for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+             BI != BE; ++BI) {
+          if (InstructionDereferencesPointer(BI, UnderlyingVal)) {
+            NotNull = true;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // If this is the entry block, we must be asking about an argument.  The
+  // value is overdefined.
+  if (BB == &BB->getParent()->getEntryBlock()) {
+    assert(isa<Argument>(Val) && "Unknown live-in to the entry block");
+    if (NotNull) {
+      PointerType *PTy = cast<PointerType>(Val->getType());
+      Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy));
+    } else {
+      Result.markOverdefined();
+    }
+    BBLV = Result;
+    return true;
+  }
+
+  // Loop over all of our predecessors, merging what we know from them into
+  // result.
+  bool EdgesMissing = false;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    LVILatticeVal EdgeResult;
+    EdgesMissing |= !getEdgeValue(Val, *PI, BB, EdgeResult);
+    if (EdgesMissing)
+      continue;
+
+    Result.mergeIn(EdgeResult);
+
+    // If we hit overdefined, exit early.  The BlockVals entry is already set
+    // to overdefined.
+    if (Result.isOverdefined()) {
+      DEBUG(dbgs() << " compute BB '" << BB->getName()
+            << "' - overdefined because of pred.\n");
+      // If we previously determined that this is a pointer that can't be null
+      // then return that rather than giving up entirely.
+      if (NotNull) {
+        PointerType *PTy = cast<PointerType>(Val->getType());
+        Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy));
+      }
+      
+      BBLV = Result;
+      return true;
+    }
+  }
+  if (EdgesMissing)
+    return false;
+
+  // Return the merged value, which is more precise than 'overdefined'.
+  assert(!Result.isOverdefined());
+  BBLV = Result;
+  return true;
+}
+  
+bool LazyValueInfoCache::solveBlockValuePHINode(LVILatticeVal &BBLV,
+                                                PHINode *PN, BasicBlock *BB) {
+  LVILatticeVal Result;  // Start Undefined.
+
+  // Loop over all of our predecessors, merging what we know from them into
+  // result.
+  bool EdgesMissing = false;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *PhiBB = PN->getIncomingBlock(i);
+    Value *PhiVal = PN->getIncomingValue(i);
+    LVILatticeVal EdgeResult;
+    EdgesMissing |= !getEdgeValue(PhiVal, PhiBB, BB, EdgeResult);
+    if (EdgesMissing)
+      continue;
+
+    Result.mergeIn(EdgeResult);
+
+    // If we hit overdefined, exit early.  The BlockVals entry is already set
+    // to overdefined.
+    if (Result.isOverdefined()) {
+      DEBUG(dbgs() << " compute BB '" << BB->getName()
+            << "' - overdefined because of pred.\n");
+      
+      BBLV = Result;
+      return true;
+    }
+  }
+  if (EdgesMissing)
+    return false;
+
+  // Return the merged value, which is more precise than 'overdefined'.
+  assert(!Result.isOverdefined() && "Possible PHI in entry block?");
+  BBLV = Result;
+  return true;
+}
+
+bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV,
+                                                      Instruction *BBI,
+                                                      BasicBlock *BB) {
+  // Figure out the range of the LHS.  If that fails, bail.
+  if (!hasBlockValue(BBI->getOperand(0), BB)) {
+    BlockValueStack.push(std::make_pair(BB, BBI->getOperand(0)));
+    return false;
+  }
+
+  LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
+  if (!LHSVal.isConstantRange()) {
+    BBLV.markOverdefined();
+    return true;
+  }
+  
+  ConstantRange LHSRange = LHSVal.getConstantRange();
+  ConstantRange RHSRange(1);
+  IntegerType *ResultTy = cast<IntegerType>(BBI->getType());
+  if (isa<BinaryOperator>(BBI)) {
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(BBI->getOperand(1))) {
+      RHSRange = ConstantRange(RHS->getValue());
+    } else {
+      BBLV.markOverdefined();
+      return true;
+    }
+  }
+
+  // NOTE: We're currently limited by the set of operations that ConstantRange
+  // can evaluate symbolically.  Enhancing that set will allows us to analyze
+  // more definitions.
+  LVILatticeVal Result;
+  switch (BBI->getOpcode()) {
+  case Instruction::Add:
+    Result.markConstantRange(LHSRange.add(RHSRange));
+    break;
+  case Instruction::Sub:
+    Result.markConstantRange(LHSRange.sub(RHSRange));
+    break;
+  case Instruction::Mul:
+    Result.markConstantRange(LHSRange.multiply(RHSRange));
+    break;
+  case Instruction::UDiv:
+    Result.markConstantRange(LHSRange.udiv(RHSRange));
+    break;
+  case Instruction::Shl:
+    Result.markConstantRange(LHSRange.shl(RHSRange));
+    break;
+  case Instruction::LShr:
+    Result.markConstantRange(LHSRange.lshr(RHSRange));
+    break;
+  case Instruction::Trunc:
+    Result.markConstantRange(LHSRange.truncate(ResultTy->getBitWidth()));
+    break;
+  case Instruction::SExt:
+    Result.markConstantRange(LHSRange.signExtend(ResultTy->getBitWidth()));
+    break;
+  case Instruction::ZExt:
+    Result.markConstantRange(LHSRange.zeroExtend(ResultTy->getBitWidth()));
+    break;
+  case Instruction::BitCast:
+    Result.markConstantRange(LHSRange);
+    break;
+  case Instruction::And:
+    Result.markConstantRange(LHSRange.binaryAnd(RHSRange));
+    break;
+  case Instruction::Or:
+    Result.markConstantRange(LHSRange.binaryOr(RHSRange));
+    break;
+  
+  // Unhandled instructions are overdefined.
+  default:
+    DEBUG(dbgs() << " compute BB '" << BB->getName()
+                 << "' - overdefined because inst def found.\n");
+    Result.markOverdefined();
+    break;
+  }
+  
+  BBLV = Result;
+  return true;
+}
+
+/// \brief Compute the value of Val on the edge BBFrom -> BBTo. Returns false if
+/// Val is not constrained on the edge.
+static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
+                              BasicBlock *BBTo, LVILatticeVal &Result) {
+  // TODO: Handle more complex conditionals.  If (v == 0 || v2 < 1) is false, we
+  // know that v != 0.
+  if (BranchInst *BI = dyn_cast<BranchInst>(BBFrom->getTerminator())) {
+    // If this is a conditional branch and only one successor goes to BBTo, then
+    // we maybe able to infer something from the condition. 
+    if (BI->isConditional() &&
+        BI->getSuccessor(0) != BI->getSuccessor(1)) {
+      bool isTrueDest = BI->getSuccessor(0) == BBTo;
+      assert(BI->getSuccessor(!isTrueDest) == BBTo &&
+             "BBTo isn't a successor of BBFrom");
+      
+      // If V is the condition of the branch itself, then we know exactly what
+      // it is.
+      if (BI->getCondition() == Val) {
+        Result = LVILatticeVal::get(ConstantInt::get(
+                              Type::getInt1Ty(Val->getContext()), isTrueDest));
+        return true;
+      }
+      
+      // If the condition of the branch is an equality comparison, we may be
+      // able to infer the value.
+      ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition());
+      if (ICI && isa<Constant>(ICI->getOperand(1))) {
+        if (ICI->isEquality() && ICI->getOperand(0) == Val) {
+          // We know that V has the RHS constant if this is a true SETEQ or
+          // false SETNE. 
+          if (isTrueDest == (ICI->getPredicate() == ICmpInst::ICMP_EQ))
+            Result = LVILatticeVal::get(cast<Constant>(ICI->getOperand(1)));
+          else
+            Result = LVILatticeVal::getNot(cast<Constant>(ICI->getOperand(1)));
+          return true;
+        }
+
+        // Recognize the range checking idiom that InstCombine produces.
+        // (X-C1) u< C2 --> [C1, C1+C2)
+        ConstantInt *NegOffset = 0;
+        if (ICI->getPredicate() == ICmpInst::ICMP_ULT)
+          match(ICI->getOperand(0), m_Add(m_Specific(Val),
+                                          m_ConstantInt(NegOffset)));
+
+        ConstantInt *CI = dyn_cast<ConstantInt>(ICI->getOperand(1));
+        if (CI && (ICI->getOperand(0) == Val || NegOffset)) {
+          // Calculate the range of values that would satisfy the comparison.
+          ConstantRange CmpRange(CI->getValue());
+          ConstantRange TrueValues =
+            ConstantRange::makeICmpRegion(ICI->getPredicate(), CmpRange);
+
+          if (NegOffset) // Apply the offset from above.
+            TrueValues = TrueValues.subtract(NegOffset->getValue());
+
+          // If we're interested in the false dest, invert the condition.
+          if (!isTrueDest) TrueValues = TrueValues.inverse();
+
+          Result = LVILatticeVal::getRange(TrueValues);
+          return true;
+        }
+      }
+    }
+  }
+
+  // If the edge was formed by a switch on the value, then we may know exactly
+  // what it is.
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(BBFrom->getTerminator())) {
+    if (SI->getCondition() != Val)
+      return false;
+
+    bool DefaultCase = SI->getDefaultDest() == BBTo;
+    unsigned BitWidth = Val->getType()->getIntegerBitWidth();
+    ConstantRange EdgesVals(BitWidth, DefaultCase/*isFullSet*/);
+
+    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+         i != e; ++i) {
+      ConstantRange EdgeVal(i.getCaseValue()->getValue());
+      if (DefaultCase) {
+        // It is possible that the default destination is the destination of
+        // some cases. There is no need to perform difference for those cases.
+        if (i.getCaseSuccessor() != BBTo)
+          EdgesVals = EdgesVals.difference(EdgeVal);
+      } else if (i.getCaseSuccessor() == BBTo)
+        EdgesVals = EdgesVals.unionWith(EdgeVal);
+    }
+    Result = LVILatticeVal::getRange(EdgesVals);
+    return true;
+  }
+  return false;
+}
+
+/// \brief Compute the value of Val on the edge BBFrom -> BBTo, or the value at
+/// the basic block if the edge does not constraint Val.
+bool LazyValueInfoCache::getEdgeValue(Value *Val, BasicBlock *BBFrom,
+                                      BasicBlock *BBTo, LVILatticeVal &Result) {
+  // If already a constant, there is nothing to compute.
+  if (Constant *VC = dyn_cast<Constant>(Val)) {
+    Result = LVILatticeVal::get(VC);
+    return true;
+  }
+
+  if (getEdgeValueLocal(Val, BBFrom, BBTo, Result)) {
+    if (!Result.isConstantRange() ||
+      Result.getConstantRange().getSingleElement())
+      return true;
+
+    // FIXME: this check should be moved to the beginning of the function when
+    // LVI better supports recursive values. Even for the single value case, we
+    // can intersect to detect dead code (an empty range).
+    if (!hasBlockValue(Val, BBFrom)) {
+      BlockValueStack.push(std::make_pair(BBFrom, Val));
+      return false;
+    }
+
+    // Try to intersect ranges of the BB and the constraint on the edge.
+    LVILatticeVal InBlock = getBlockValue(Val, BBFrom);
+    if (!InBlock.isConstantRange())
+      return true;
+
+    ConstantRange Range =
+      Result.getConstantRange().intersectWith(InBlock.getConstantRange());
+    Result = LVILatticeVal::getRange(Range);
+    return true;
+  }
+
+  if (!hasBlockValue(Val, BBFrom)) {
+    BlockValueStack.push(std::make_pair(BBFrom, Val));
+    return false;
+  }
+
+  // if we couldn't compute the value on the edge, use the value from the BB
+  Result = getBlockValue(Val, BBFrom);
+  return true;
+}
+
+LVILatticeVal LazyValueInfoCache::getValueInBlock(Value *V, BasicBlock *BB) {
+  DEBUG(dbgs() << "LVI Getting block end value " << *V << " at '"
+        << BB->getName() << "'\n");
+  
+  BlockValueStack.push(std::make_pair(BB, V));
+  solve();
+  LVILatticeVal Result = getBlockValue(V, BB);
+
+  DEBUG(dbgs() << "  Result = " << Result << "\n");
+  return Result;
+}
+
+LVILatticeVal LazyValueInfoCache::
+getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB) {
+  DEBUG(dbgs() << "LVI Getting edge value " << *V << " from '"
+        << FromBB->getName() << "' to '" << ToBB->getName() << "'\n");
+  
+  LVILatticeVal Result;
+  if (!getEdgeValue(V, FromBB, ToBB, Result)) {
+    solve();
+    bool WasFastQuery = getEdgeValue(V, FromBB, ToBB, Result);
+    (void)WasFastQuery;
+    assert(WasFastQuery && "More work to do after problem solved?");
+  }
+
+  DEBUG(dbgs() << "  Result = " << Result << "\n");
+  return Result;
+}
+
+void LazyValueInfoCache::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
+                                    BasicBlock *NewSucc) {
+  // When an edge in the graph has been threaded, values that we could not 
+  // determine a value for before (i.e. were marked overdefined) may be possible
+  // to solve now.  We do NOT try to proactively update these values.  Instead,
+  // we clear their entries from the cache, and allow lazy updating to recompute
+  // them when needed.
+  
+  // The updating process is fairly simple: we need to dropped cached info
+  // for all values that were marked overdefined in OldSucc, and for those same
+  // values in any successor of OldSucc (except NewSucc) in which they were
+  // also marked overdefined.
+  std::vector<BasicBlock*> worklist;
+  worklist.push_back(OldSucc);
+  
+  DenseSet<Value*> ClearSet;
+  for (DenseSet<OverDefinedPairTy>::iterator I = OverDefinedCache.begin(),
+       E = OverDefinedCache.end(); I != E; ++I) {
+    if (I->first == OldSucc)
+      ClearSet.insert(I->second);
+  }
+  
+  // Use a worklist to perform a depth-first search of OldSucc's successors.
+  // NOTE: We do not need a visited list since any blocks we have already
+  // visited will have had their overdefined markers cleared already, and we
+  // thus won't loop to their successors.
+  while (!worklist.empty()) {
+    BasicBlock *ToUpdate = worklist.back();
+    worklist.pop_back();
+    
+    // Skip blocks only accessible through NewSucc.
+    if (ToUpdate == NewSucc) continue;
+    
+    bool changed = false;
+    for (DenseSet<Value*>::iterator I = ClearSet.begin(), E = ClearSet.end();
+         I != E; ++I) {
+      // If a value was marked overdefined in OldSucc, and is here too...
+      DenseSet<OverDefinedPairTy>::iterator OI =
+        OverDefinedCache.find(std::make_pair(ToUpdate, *I));
+      if (OI == OverDefinedCache.end()) continue;
+
+      // Remove it from the caches.
+      ValueCacheEntryTy &Entry = ValueCache[LVIValueHandle(*I, this)];
+      ValueCacheEntryTy::iterator CI = Entry.find(ToUpdate);
+
+      assert(CI != Entry.end() && "Couldn't find entry to update?");
+      Entry.erase(CI);
+      OverDefinedCache.erase(OI);
+
+      // If we removed anything, then we potentially need to update 
+      // blocks successors too.
+      changed = true;
+    }
+
+    if (!changed) continue;
+    
+    worklist.insert(worklist.end(), succ_begin(ToUpdate), succ_end(ToUpdate));
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                            LazyValueInfo Impl
+//===----------------------------------------------------------------------===//
+
+/// getCache - This lazily constructs the LazyValueInfoCache.
+static LazyValueInfoCache &getCache(void *&PImpl) {
+  if (!PImpl)
+    PImpl = new LazyValueInfoCache();
+  return *static_cast<LazyValueInfoCache*>(PImpl);
+}
+
+bool LazyValueInfo::runOnFunction(Function &F) {
+  if (PImpl)
+    getCache(PImpl).clear();
+
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+
+  // Fully lazy.
+  return false;
+}
+
+void LazyValueInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<TargetLibraryInfo>();
+}
+
+void LazyValueInfo::releaseMemory() {
+  // If the cache was allocated, free it.
+  if (PImpl) {
+    delete &getCache(PImpl);
+    PImpl = 0;
+  }
+}
+
+Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB) {
+  LVILatticeVal Result = getCache(PImpl).getValueInBlock(V, BB);
+  
+  if (Result.isConstant())
+    return Result.getConstant();
+  if (Result.isConstantRange()) {
+    ConstantRange CR = Result.getConstantRange();
+    if (const APInt *SingleVal = CR.getSingleElement())
+      return ConstantInt::get(V->getContext(), *SingleVal);
+  }
+  return 0;
+}
+
+/// getConstantOnEdge - Determine whether the specified value is known to be a
+/// constant on the specified edge.  Return null if not.
+Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB,
+                                           BasicBlock *ToBB) {
+  LVILatticeVal Result = getCache(PImpl).getValueOnEdge(V, FromBB, ToBB);
+  
+  if (Result.isConstant())
+    return Result.getConstant();
+  if (Result.isConstantRange()) {
+    ConstantRange CR = Result.getConstantRange();
+    if (const APInt *SingleVal = CR.getSingleElement())
+      return ConstantInt::get(V->getContext(), *SingleVal);
+  }
+  return 0;
+}
+
+/// getPredicateOnEdge - Determine whether the specified value comparison
+/// with a constant is known to be true or false on the specified CFG edge.
+/// Pred is a CmpInst predicate.
+LazyValueInfo::Tristate
+LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C,
+                                  BasicBlock *FromBB, BasicBlock *ToBB) {
+  LVILatticeVal Result = getCache(PImpl).getValueOnEdge(V, FromBB, ToBB);
+  
+  // If we know the value is a constant, evaluate the conditional.
+  Constant *Res = 0;
+  if (Result.isConstant()) {
+    Res = ConstantFoldCompareInstOperands(Pred, Result.getConstant(), C, TD,
+                                          TLI);
+    if (ConstantInt *ResCI = dyn_cast<ConstantInt>(Res))
+      return ResCI->isZero() ? False : True;
+    return Unknown;
+  }
+  
+  if (Result.isConstantRange()) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(C);
+    if (!CI) return Unknown;
+    
+    ConstantRange CR = Result.getConstantRange();
+    if (Pred == ICmpInst::ICMP_EQ) {
+      if (!CR.contains(CI->getValue()))
+        return False;
+      
+      if (CR.isSingleElement() && CR.contains(CI->getValue()))
+        return True;
+    } else if (Pred == ICmpInst::ICMP_NE) {
+      if (!CR.contains(CI->getValue()))
+        return True;
+      
+      if (CR.isSingleElement() && CR.contains(CI->getValue()))
+        return False;
+    }
+    
+    // Handle more complex predicates.
+    ConstantRange TrueValues =
+        ICmpInst::makeConstantRange((ICmpInst::Predicate)Pred, CI->getValue());
+    if (TrueValues.contains(CR))
+      return True;
+    if (TrueValues.inverse().contains(CR))
+      return False;
+    return Unknown;
+  }
+  
+  if (Result.isNotConstant()) {
+    // If this is an equality comparison, we can try to fold it knowing that
+    // "V != C1".
+    if (Pred == ICmpInst::ICMP_EQ) {
+      // !C1 == C -> false iff C1 == C.
+      Res = ConstantFoldCompareInstOperands(ICmpInst::ICMP_NE,
+                                            Result.getNotConstant(), C, TD,
+                                            TLI);
+      if (Res->isNullValue())
+        return False;
+    } else if (Pred == ICmpInst::ICMP_NE) {
+      // !C1 != C -> true iff C1 == C.
+      Res = ConstantFoldCompareInstOperands(ICmpInst::ICMP_NE,
+                                            Result.getNotConstant(), C, TD,
+                                            TLI);
+      if (Res->isNullValue())
+        return True;
+    }
+    return Unknown;
+  }
+  
+  return Unknown;
+}
+
+void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
+                               BasicBlock *NewSucc) {
+  if (PImpl) getCache(PImpl).threadEdge(PredBB, OldSucc, NewSucc);
+}
+
+void LazyValueInfo::eraseBlock(BasicBlock *BB) {
+  if (PImpl) getCache(PImpl).eraseBlock(BB);
+}
diff --git a/contrib/llvm/lib/Analysis/LibCallAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/LibCallAliasAnalysis.cpp
new file mode 100644
index 000000000000..fefa51660f92
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/LibCallAliasAnalysis.cpp
@@ -0,0 +1,137 @@
+//===- LibCallAliasAnalysis.cpp - Implement AliasAnalysis for libcalls ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LibCallAliasAnalysis class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LibCallAliasAnalysis.h"
+#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+  
+// Register this pass...
+char LibCallAliasAnalysis::ID = 0;
+INITIALIZE_AG_PASS(LibCallAliasAnalysis, AliasAnalysis, "libcall-aa",
+                   "LibCall Alias Analysis", false, true, false)
+
+FunctionPass *llvm::createLibCallAliasAnalysisPass(LibCallInfo *LCI) {
+  return new LibCallAliasAnalysis(LCI);
+}
+
+LibCallAliasAnalysis::~LibCallAliasAnalysis() {
+  delete LCI;
+}
+
+void LibCallAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AliasAnalysis::getAnalysisUsage(AU);
+  AU.setPreservesAll();                         // Does not transform code
+}
+
+
+
+/// AnalyzeLibCallDetails - Given a call to a function with the specified
+/// LibCallFunctionInfo, see if we can improve the mod/ref footprint of the call
+/// vs the specified pointer/size.
+AliasAnalysis::ModRefResult
+LibCallAliasAnalysis::AnalyzeLibCallDetails(const LibCallFunctionInfo *FI,
+                                            ImmutableCallSite CS,
+                                            const Location &Loc) {
+  // If we have a function, check to see what kind of mod/ref effects it
+  // has.  Start by including any info globally known about the function.
+  AliasAnalysis::ModRefResult MRInfo = FI->UniversalBehavior;
+  if (MRInfo == NoModRef) return MRInfo;
+  
+  // If that didn't tell us that the function is 'readnone', check to see
+  // if we have detailed info and if 'P' is any of the locations we know
+  // about.
+  const LibCallFunctionInfo::LocationMRInfo *Details = FI->LocationDetails;
+  if (Details == 0)
+    return MRInfo;
+  
+  // If the details array is of the 'DoesNot' kind, we only know something if
+  // the pointer is a match for one of the locations in 'Details'.  If we find a
+  // match, we can prove some interactions cannot happen.
+  // 
+  if (FI->DetailsType == LibCallFunctionInfo::DoesNot) {
+    // Find out if the pointer refers to a known location.
+    for (unsigned i = 0; Details[i].LocationID != ~0U; ++i) {
+      const LibCallLocationInfo &LocInfo =
+      LCI->getLocationInfo(Details[i].LocationID);
+      LibCallLocationInfo::LocResult Res = LocInfo.isLocation(CS, Loc);
+      if (Res != LibCallLocationInfo::Yes) continue;
+      
+      // If we find a match against a location that we 'do not' interact with,
+      // learn this info into MRInfo.
+      return ModRefResult(MRInfo & ~Details[i].MRInfo);
+    }
+    return MRInfo;
+  }
+  
+  // If the details are of the 'DoesOnly' sort, we know something if the pointer
+  // is a match for one of the locations in 'Details'.  Also, if we can prove
+  // that the pointers is *not* one of the locations in 'Details', we know that
+  // the call is NoModRef.
+  assert(FI->DetailsType == LibCallFunctionInfo::DoesOnly);
+  
+  // Find out if the pointer refers to a known location.
+  bool NoneMatch = true;
+  for (unsigned i = 0; Details[i].LocationID != ~0U; ++i) {
+    const LibCallLocationInfo &LocInfo =
+    LCI->getLocationInfo(Details[i].LocationID);
+    LibCallLocationInfo::LocResult Res = LocInfo.isLocation(CS, Loc);
+    if (Res == LibCallLocationInfo::No) continue;
+    
+    // If we don't know if this pointer points to the location, then we have to
+    // assume it might alias in some case.
+    if (Res == LibCallLocationInfo::Unknown) {
+      NoneMatch = false;
+      continue;
+    }
+    
+    // If we know that this pointer definitely is pointing into the location,
+    // merge in this information.
+    return ModRefResult(MRInfo & Details[i].MRInfo);
+  }
+  
+  // If we found that the pointer is guaranteed to not match any of the
+  // locations in our 'DoesOnly' rule, then we know that the pointer must point
+  // to some other location.  Since the libcall doesn't mod/ref any other
+  // locations, return NoModRef.
+  if (NoneMatch)
+    return NoModRef;
+  
+  // Otherwise, return any other info gained so far.
+  return MRInfo;
+}
+
+// getModRefInfo - Check to see if the specified callsite can clobber the
+// specified memory object.
+//
+AliasAnalysis::ModRefResult
+LibCallAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
+                                    const Location &Loc) {
+  ModRefResult MRInfo = ModRef;
+  
+  // If this is a direct call to a function that LCI knows about, get the
+  // information about the runtime function.
+  if (LCI) {
+    if (const Function *F = CS.getCalledFunction()) {
+      if (const LibCallFunctionInfo *FI = LCI->getFunctionInfo(F)) {
+        MRInfo = ModRefResult(MRInfo & AnalyzeLibCallDetails(FI, CS, Loc));
+        if (MRInfo == NoModRef) return NoModRef;
+      }
+    }
+  }
+  
+  // The AliasAnalysis base class has some smarts, lets use them.
+  return (ModRefResult)(MRInfo | AliasAnalysis::getModRefInfo(CS, Loc));
+}
diff --git a/contrib/llvm/lib/Analysis/LibCallSemantics.cpp b/contrib/llvm/lib/Analysis/LibCallSemantics.cpp
new file mode 100644
index 000000000000..0592ccb26c12
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/LibCallSemantics.cpp
@@ -0,0 +1,63 @@
+//===- LibCallSemantics.cpp - Describe library semantics ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements interfaces that can be used to describe language
+// specific runtime library interfaces (e.g. libc, libm, etc) to LLVM
+// optimizers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Function.h"
+using namespace llvm;
+
+/// getMap - This impl pointer in ~LibCallInfo is actually a StringMap.  This
+/// helper does the cast.
+static StringMap<const LibCallFunctionInfo*> *getMap(void *Ptr) {
+  return static_cast<StringMap<const LibCallFunctionInfo*> *>(Ptr);
+}
+
+LibCallInfo::~LibCallInfo() {
+  delete getMap(Impl);
+}
+
+const LibCallLocationInfo &LibCallInfo::getLocationInfo(unsigned LocID) const {
+  // Get location info on the first call.
+  if (NumLocations == 0)
+    NumLocations = getLocationInfo(Locations);
+  
+  assert(LocID < NumLocations && "Invalid location ID!");
+  return Locations[LocID];
+}
+
+
+/// getFunctionInfo - Return the LibCallFunctionInfo object corresponding to
+/// the specified function if we have it.  If not, return null.
+const LibCallFunctionInfo *
+LibCallInfo::getFunctionInfo(const Function *F) const {
+  StringMap<const LibCallFunctionInfo*> *Map = getMap(Impl);
+  
+  /// If this is the first time we are querying for this info, lazily construct
+  /// the StringMap to index it.
+  if (Map == 0) {
+    Impl = Map = new StringMap<const LibCallFunctionInfo*>();
+    
+    const LibCallFunctionInfo *Array = getFunctionInfoArray();
+    if (Array == 0) return 0;
+    
+    // We now have the array of entries.  Populate the StringMap.
+    for (unsigned i = 0; Array[i].Name; ++i)
+      (*Map)[Array[i].Name] = Array+i;
+  }
+  
+  // Look up this function in the string map.
+  return Map->lookup(F->getName());
+}
+
diff --git a/contrib/llvm/lib/Analysis/Lint.cpp b/contrib/llvm/lib/Analysis/Lint.cpp
new file mode 100644
index 000000000000..9393508a9e67
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/Lint.cpp
@@ -0,0 +1,692 @@
+//===-- Lint.cpp - Check for common errors in LLVM IR ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass statically checks for common and easily-identified constructs
+// which produce undefined or likely unintended behavior in LLVM IR.
+//
+// It is not a guarantee of correctness, in two ways. First, it isn't
+// comprehensive. There are checks which could be done statically which are
+// not yet implemented. Some of these are indicated by TODO comments, but
+// those aren't comprehensive either. Second, many conditions cannot be
+// checked statically. This pass does no dynamic instrumentation, so it
+// can't check for all possible problems.
+// 
+// Another limitation is that it assumes all code will be executed. A store
+// through a null pointer in a basic block which is never reached is harmless,
+// but this pass will warn about it anyway. This is the main reason why most
+// of these checks live here instead of in the Verifier pass.
+//
+// Optimization passes may make conditions that this pass checks for more or
+// less obvious. If an optimization pass appears to be introducing a warning,
+// it may be that the optimization pass is merely exposing an existing
+// condition in the code.
+// 
+// This code may be run before instcombine. In many cases, instcombine checks
+// for the same kinds of things and turns instructions with undefined behavior
+// into unreachable (or equivalent). Because of this, this pass makes some
+// effort to look through bitcasts and so on.
+// 
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Lint.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+using namespace llvm;
+
+namespace {
+  namespace MemRef {
+    static unsigned Read     = 1;
+    static unsigned Write    = 2;
+    static unsigned Callee   = 4;
+    static unsigned Branchee = 8;
+  }
+
+  class Lint : public FunctionPass, public InstVisitor<Lint> {
+    friend class InstVisitor<Lint>;
+
+    void visitFunction(Function &F);
+
+    void visitCallSite(CallSite CS);
+    void visitMemoryReference(Instruction &I, Value *Ptr,
+                              uint64_t Size, unsigned Align,
+                              Type *Ty, unsigned Flags);
+
+    void visitCallInst(CallInst &I);
+    void visitInvokeInst(InvokeInst &I);
+    void visitReturnInst(ReturnInst &I);
+    void visitLoadInst(LoadInst &I);
+    void visitStoreInst(StoreInst &I);
+    void visitXor(BinaryOperator &I);
+    void visitSub(BinaryOperator &I);
+    void visitLShr(BinaryOperator &I);
+    void visitAShr(BinaryOperator &I);
+    void visitShl(BinaryOperator &I);
+    void visitSDiv(BinaryOperator &I);
+    void visitUDiv(BinaryOperator &I);
+    void visitSRem(BinaryOperator &I);
+    void visitURem(BinaryOperator &I);
+    void visitAllocaInst(AllocaInst &I);
+    void visitVAArgInst(VAArgInst &I);
+    void visitIndirectBrInst(IndirectBrInst &I);
+    void visitExtractElementInst(ExtractElementInst &I);
+    void visitInsertElementInst(InsertElementInst &I);
+    void visitUnreachableInst(UnreachableInst &I);
+
+    Value *findValue(Value *V, bool OffsetOk) const;
+    Value *findValueImpl(Value *V, bool OffsetOk,
+                         SmallPtrSet<Value *, 4> &Visited) const;
+
+  public:
+    Module *Mod;
+    AliasAnalysis *AA;
+    DominatorTree *DT;
+    DataLayout *TD;
+    TargetLibraryInfo *TLI;
+
+    std::string Messages;
+    raw_string_ostream MessagesStr;
+
+    static char ID; // Pass identification, replacement for typeid
+    Lint() : FunctionPass(ID), MessagesStr(Messages) {
+      initializeLintPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<DominatorTree>();
+    }
+    virtual void print(raw_ostream &O, const Module *M) const {}
+
+    void WriteValue(const Value *V) {
+      if (!V) return;
+      if (isa<Instruction>(V)) {
+        MessagesStr << *V << '\n';
+      } else {
+        WriteAsOperand(MessagesStr, V, true, Mod);
+        MessagesStr << '\n';
+      }
+    }
+
+    // CheckFailed - A check failed, so print out the condition and the message
+    // that failed.  This provides a nice place to put a breakpoint if you want
+    // to see why something is not correct.
+    void CheckFailed(const Twine &Message,
+                     const Value *V1 = 0, const Value *V2 = 0,
+                     const Value *V3 = 0, const Value *V4 = 0) {
+      MessagesStr << Message.str() << "\n";
+      WriteValue(V1);
+      WriteValue(V2);
+      WriteValue(V3);
+      WriteValue(V4);
+    }
+  };
+}
+
+char Lint::ID = 0;
+INITIALIZE_PASS_BEGIN(Lint, "lint", "Statically lint-checks LLVM IR",
+                      false, true)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(Lint, "lint", "Statically lint-checks LLVM IR",
+                    false, true)
+
+// Assert - We know that cond should be true, if not print an error message.
+#define Assert(C, M) \
+    do { if (!(C)) { CheckFailed(M); return; } } while (0)
+#define Assert1(C, M, V1) \
+    do { if (!(C)) { CheckFailed(M, V1); return; } } while (0)
+#define Assert2(C, M, V1, V2) \
+    do { if (!(C)) { CheckFailed(M, V1, V2); return; } } while (0)
+#define Assert3(C, M, V1, V2, V3) \
+    do { if (!(C)) { CheckFailed(M, V1, V2, V3); return; } } while (0)
+#define Assert4(C, M, V1, V2, V3, V4) \
+    do { if (!(C)) { CheckFailed(M, V1, V2, V3, V4); return; } } while (0)
+
+// Lint::run - This is the main Analysis entry point for a
+// function.
+//
+bool Lint::runOnFunction(Function &F) {
+  Mod = F.getParent();
+  AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+  visit(F);
+  dbgs() << MessagesStr.str();
+  Messages.clear();
+  return false;
+}
+
+void Lint::visitFunction(Function &F) {
+  // This isn't undefined behavior, it's just a little unusual, and it's a
+  // fairly common mistake to neglect to name a function.
+  Assert1(F.hasName() || F.hasLocalLinkage(),
+          "Unusual: Unnamed function with non-local linkage", &F);
+
+  // TODO: Check for irreducible control flow.
+}
+
+void Lint::visitCallSite(CallSite CS) {
+  Instruction &I = *CS.getInstruction();
+  Value *Callee = CS.getCalledValue();
+
+  visitMemoryReference(I, Callee, AliasAnalysis::UnknownSize,
+                       0, 0, MemRef::Callee);
+
+  if (Function *F = dyn_cast<Function>(findValue(Callee, /*OffsetOk=*/false))) {
+    Assert1(CS.getCallingConv() == F->getCallingConv(),
+            "Undefined behavior: Caller and callee calling convention differ",
+            &I);
+
+    FunctionType *FT = F->getFunctionType();
+    unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
+
+    Assert1(FT->isVarArg() ?
+              FT->getNumParams() <= NumActualArgs :
+              FT->getNumParams() == NumActualArgs,
+            "Undefined behavior: Call argument count mismatches callee "
+            "argument count", &I);
+
+    Assert1(FT->getReturnType() == I.getType(),
+            "Undefined behavior: Call return type mismatches "
+            "callee return type", &I);
+
+    // Check argument types (in case the callee was casted) and attributes.
+    // TODO: Verify that caller and callee attributes are compatible.
+    Function::arg_iterator PI = F->arg_begin(), PE = F->arg_end();
+    CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+    for (; AI != AE; ++AI) {
+      Value *Actual = *AI;
+      if (PI != PE) {
+        Argument *Formal = PI++;
+        Assert1(Formal->getType() == Actual->getType(),
+                "Undefined behavior: Call argument type mismatches "
+                "callee parameter type", &I);
+
+        // Check that noalias arguments don't alias other arguments. This is
+        // not fully precise because we don't know the sizes of the dereferenced
+        // memory regions.
+        if (Formal->hasNoAliasAttr() && Actual->getType()->isPointerTy())
+          for (CallSite::arg_iterator BI = CS.arg_begin(); BI != AE; ++BI)
+            if (AI != BI && (*BI)->getType()->isPointerTy()) {
+              AliasAnalysis::AliasResult Result = AA->alias(*AI, *BI);
+              Assert1(Result != AliasAnalysis::MustAlias &&
+                      Result != AliasAnalysis::PartialAlias,
+                      "Unusual: noalias argument aliases another argument", &I);
+            }
+
+        // Check that an sret argument points to valid memory.
+        if (Formal->hasStructRetAttr() && Actual->getType()->isPointerTy()) {
+          Type *Ty =
+            cast<PointerType>(Formal->getType())->getElementType();
+          visitMemoryReference(I, Actual, AA->getTypeStoreSize(Ty),
+                               TD ? TD->getABITypeAlignment(Ty) : 0,
+                               Ty, MemRef::Read | MemRef::Write);
+        }
+      }
+    }
+  }
+
+  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->isTailCall())
+    for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+         AI != AE; ++AI) {
+      Value *Obj = findValue(*AI, /*OffsetOk=*/true);
+      Assert1(!isa<AllocaInst>(Obj),
+              "Undefined behavior: Call with \"tail\" keyword references "
+              "alloca", &I);
+    }
+
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
+    switch (II->getIntrinsicID()) {
+    default: break;
+
+    // TODO: Check more intrinsics
+
+    case Intrinsic::memcpy: {
+      MemCpyInst *MCI = cast<MemCpyInst>(&I);
+      // TODO: If the size is known, use it.
+      visitMemoryReference(I, MCI->getDest(), AliasAnalysis::UnknownSize,
+                           MCI->getAlignment(), 0,
+                           MemRef::Write);
+      visitMemoryReference(I, MCI->getSource(), AliasAnalysis::UnknownSize,
+                           MCI->getAlignment(), 0,
+                           MemRef::Read);
+
+      // Check that the memcpy arguments don't overlap. The AliasAnalysis API
+      // isn't expressive enough for what we really want to do. Known partial
+      // overlap is not distinguished from the case where nothing is known.
+      uint64_t Size = 0;
+      if (const ConstantInt *Len =
+            dyn_cast<ConstantInt>(findValue(MCI->getLength(),
+                                            /*OffsetOk=*/false)))
+        if (Len->getValue().isIntN(32))
+          Size = Len->getValue().getZExtValue();
+      Assert1(AA->alias(MCI->getSource(), Size, MCI->getDest(), Size) !=
+              AliasAnalysis::MustAlias,
+              "Undefined behavior: memcpy source and destination overlap", &I);
+      break;
+    }
+    case Intrinsic::memmove: {
+      MemMoveInst *MMI = cast<MemMoveInst>(&I);
+      // TODO: If the size is known, use it.
+      visitMemoryReference(I, MMI->getDest(), AliasAnalysis::UnknownSize,
+                           MMI->getAlignment(), 0,
+                           MemRef::Write);
+      visitMemoryReference(I, MMI->getSource(), AliasAnalysis::UnknownSize,
+                           MMI->getAlignment(), 0,
+                           MemRef::Read);
+      break;
+    }
+    case Intrinsic::memset: {
+      MemSetInst *MSI = cast<MemSetInst>(&I);
+      // TODO: If the size is known, use it.
+      visitMemoryReference(I, MSI->getDest(), AliasAnalysis::UnknownSize,
+                           MSI->getAlignment(), 0,
+                           MemRef::Write);
+      break;
+    }
+
+    case Intrinsic::vastart:
+      Assert1(I.getParent()->getParent()->isVarArg(),
+              "Undefined behavior: va_start called in a non-varargs function",
+              &I);
+
+      visitMemoryReference(I, CS.getArgument(0), AliasAnalysis::UnknownSize,
+                           0, 0, MemRef::Read | MemRef::Write);
+      break;
+    case Intrinsic::vacopy:
+      visitMemoryReference(I, CS.getArgument(0), AliasAnalysis::UnknownSize,
+                           0, 0, MemRef::Write);
+      visitMemoryReference(I, CS.getArgument(1), AliasAnalysis::UnknownSize,
+                           0, 0, MemRef::Read);
+      break;
+    case Intrinsic::vaend:
+      visitMemoryReference(I, CS.getArgument(0), AliasAnalysis::UnknownSize,
+                           0, 0, MemRef::Read | MemRef::Write);
+      break;
+
+    case Intrinsic::stackrestore:
+      // Stackrestore doesn't read or write memory, but it sets the
+      // stack pointer, which the compiler may read from or write to
+      // at any time, so check it for both readability and writeability.
+      visitMemoryReference(I, CS.getArgument(0), AliasAnalysis::UnknownSize,
+                           0, 0, MemRef::Read | MemRef::Write);
+      break;
+    }
+}
+
+void Lint::visitCallInst(CallInst &I) {
+  return visitCallSite(&I);
+}
+
+void Lint::visitInvokeInst(InvokeInst &I) {
+  return visitCallSite(&I);
+}
+
+void Lint::visitReturnInst(ReturnInst &I) {
+  Function *F = I.getParent()->getParent();
+  Assert1(!F->doesNotReturn(),
+          "Unusual: Return statement in function with noreturn attribute",
+          &I);
+
+  if (Value *V = I.getReturnValue()) {
+    Value *Obj = findValue(V, /*OffsetOk=*/true);
+    Assert1(!isa<AllocaInst>(Obj),
+            "Unusual: Returning alloca value", &I);
+  }
+}
+
+// TODO: Check that the reference is in bounds.
+// TODO: Check readnone/readonly function attributes.
+void Lint::visitMemoryReference(Instruction &I,
+                                Value *Ptr, uint64_t Size, unsigned Align,
+                                Type *Ty, unsigned Flags) {
+  // If no memory is being referenced, it doesn't matter if the pointer
+  // is valid.
+  if (Size == 0)
+    return;
+
+  Value *UnderlyingObject = findValue(Ptr, /*OffsetOk=*/true);
+  Assert1(!isa<ConstantPointerNull>(UnderlyingObject),
+          "Undefined behavior: Null pointer dereference", &I);
+  Assert1(!isa<UndefValue>(UnderlyingObject),
+          "Undefined behavior: Undef pointer dereference", &I);
+  Assert1(!isa<ConstantInt>(UnderlyingObject) ||
+          !cast<ConstantInt>(UnderlyingObject)->isAllOnesValue(),
+          "Unusual: All-ones pointer dereference", &I);
+  Assert1(!isa<ConstantInt>(UnderlyingObject) ||
+          !cast<ConstantInt>(UnderlyingObject)->isOne(),
+          "Unusual: Address one pointer dereference", &I);
+
+  if (Flags & MemRef::Write) {
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(UnderlyingObject))
+      Assert1(!GV->isConstant(),
+              "Undefined behavior: Write to read-only memory", &I);
+    Assert1(!isa<Function>(UnderlyingObject) &&
+            !isa<BlockAddress>(UnderlyingObject),
+            "Undefined behavior: Write to text section", &I);
+  }
+  if (Flags & MemRef::Read) {
+    Assert1(!isa<Function>(UnderlyingObject),
+            "Unusual: Load from function body", &I);
+    Assert1(!isa<BlockAddress>(UnderlyingObject),
+            "Undefined behavior: Load from block address", &I);
+  }
+  if (Flags & MemRef::Callee) {
+    Assert1(!isa<BlockAddress>(UnderlyingObject),
+            "Undefined behavior: Call to block address", &I);
+  }
+  if (Flags & MemRef::Branchee) {
+    Assert1(!isa<Constant>(UnderlyingObject) ||
+            isa<BlockAddress>(UnderlyingObject),
+            "Undefined behavior: Branch to non-blockaddress", &I);
+  }
+
+  // Check for buffer overflows and misalignment.
+  // Only handles memory references that read/write something simple like an
+  // alloca instruction or a global variable.
+  int64_t Offset = 0;
+  if (Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, TD)) {
+    // OK, so the access is to a constant offset from Ptr.  Check that Ptr is
+    // something we can handle and if so extract the size of this base object
+    // along with its alignment.
+    uint64_t BaseSize = AliasAnalysis::UnknownSize;
+    unsigned BaseAlign = 0;
+
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(Base)) {
+      Type *ATy = AI->getAllocatedType();
+      if (TD && !AI->isArrayAllocation() && ATy->isSized())
+        BaseSize = TD->getTypeAllocSize(ATy);
+      BaseAlign = AI->getAlignment();
+      if (TD && BaseAlign == 0 && ATy->isSized())
+        BaseAlign = TD->getABITypeAlignment(ATy);
+    } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
+      // If the global may be defined differently in another compilation unit
+      // then don't warn about funky memory accesses.
+      if (GV->hasDefinitiveInitializer()) {
+        Type *GTy = GV->getType()->getElementType();
+        if (TD && GTy->isSized())
+          BaseSize = TD->getTypeAllocSize(GTy);
+        BaseAlign = GV->getAlignment();
+        if (TD && BaseAlign == 0 && GTy->isSized())
+          BaseAlign = TD->getABITypeAlignment(GTy);
+      }
+    }
+
+    // Accesses from before the start or after the end of the object are not
+    // defined.
+    Assert1(Size == AliasAnalysis::UnknownSize ||
+            BaseSize == AliasAnalysis::UnknownSize ||
+            (Offset >= 0 && Offset + Size <= BaseSize),
+            "Undefined behavior: Buffer overflow", &I);
+
+    // Accesses that say that the memory is more aligned than it is are not
+    // defined.
+    if (TD && Align == 0 && Ty && Ty->isSized())
+      Align = TD->getABITypeAlignment(Ty);
+    Assert1(!BaseAlign || Align <= MinAlign(BaseAlign, Offset),
+            "Undefined behavior: Memory reference address is misaligned", &I);
+  }
+}
+
+void Lint::visitLoadInst(LoadInst &I) {
+  visitMemoryReference(I, I.getPointerOperand(),
+                       AA->getTypeStoreSize(I.getType()), I.getAlignment(),
+                       I.getType(), MemRef::Read);
+}
+
+void Lint::visitStoreInst(StoreInst &I) {
+  visitMemoryReference(I, I.getPointerOperand(),
+                       AA->getTypeStoreSize(I.getOperand(0)->getType()),
+                       I.getAlignment(),
+                       I.getOperand(0)->getType(), MemRef::Write);
+}
+
+void Lint::visitXor(BinaryOperator &I) {
+  Assert1(!isa<UndefValue>(I.getOperand(0)) ||
+          !isa<UndefValue>(I.getOperand(1)),
+          "Undefined result: xor(undef, undef)", &I);
+}
+
+void Lint::visitSub(BinaryOperator &I) {
+  Assert1(!isa<UndefValue>(I.getOperand(0)) ||
+          !isa<UndefValue>(I.getOperand(1)),
+          "Undefined result: sub(undef, undef)", &I);
+}
+
+void Lint::visitLShr(BinaryOperator &I) {
+  if (ConstantInt *CI =
+        dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false)))
+    Assert1(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()),
+            "Undefined result: Shift count out of range", &I);
+}
+
+void Lint::visitAShr(BinaryOperator &I) {
+  if (ConstantInt *CI =
+        dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false)))
+    Assert1(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()),
+            "Undefined result: Shift count out of range", &I);
+}
+
+void Lint::visitShl(BinaryOperator &I) {
+  if (ConstantInt *CI =
+        dyn_cast<ConstantInt>(findValue(I.getOperand(1), /*OffsetOk=*/false)))
+    Assert1(CI->getValue().ult(cast<IntegerType>(I.getType())->getBitWidth()),
+            "Undefined result: Shift count out of range", &I);
+}
+
+static bool isZero(Value *V, DataLayout *TD) {
+  // Assume undef could be zero.
+  if (isa<UndefValue>(V)) return true;
+
+  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+  ComputeMaskedBits(V, KnownZero, KnownOne, TD);
+  return KnownZero.isAllOnesValue();
+}
+
+void Lint::visitSDiv(BinaryOperator &I) {
+  Assert1(!isZero(I.getOperand(1), TD),
+          "Undefined behavior: Division by zero", &I);
+}
+
+void Lint::visitUDiv(BinaryOperator &I) {
+  Assert1(!isZero(I.getOperand(1), TD),
+          "Undefined behavior: Division by zero", &I);
+}
+
+void Lint::visitSRem(BinaryOperator &I) {
+  Assert1(!isZero(I.getOperand(1), TD),
+          "Undefined behavior: Division by zero", &I);
+}
+
+void Lint::visitURem(BinaryOperator &I) {
+  Assert1(!isZero(I.getOperand(1), TD),
+          "Undefined behavior: Division by zero", &I);
+}
+
+void Lint::visitAllocaInst(AllocaInst &I) {
+  if (isa<ConstantInt>(I.getArraySize()))
+    // This isn't undefined behavior, it's just an obvious pessimization.
+    Assert1(&I.getParent()->getParent()->getEntryBlock() == I.getParent(),
+            "Pessimization: Static alloca outside of entry block", &I);
+
+  // TODO: Check for an unusual size (MSB set?)
+}
+
+void Lint::visitVAArgInst(VAArgInst &I) {
+  visitMemoryReference(I, I.getOperand(0), AliasAnalysis::UnknownSize, 0, 0,
+                       MemRef::Read | MemRef::Write);
+}
+
+void Lint::visitIndirectBrInst(IndirectBrInst &I) {
+  visitMemoryReference(I, I.getAddress(), AliasAnalysis::UnknownSize, 0, 0,
+                       MemRef::Branchee);
+
+  Assert1(I.getNumDestinations() != 0,
+          "Undefined behavior: indirectbr with no destinations", &I);
+}
+
+void Lint::visitExtractElementInst(ExtractElementInst &I) {
+  if (ConstantInt *CI =
+        dyn_cast<ConstantInt>(findValue(I.getIndexOperand(),
+                                        /*OffsetOk=*/false)))
+    Assert1(CI->getValue().ult(I.getVectorOperandType()->getNumElements()),
+            "Undefined result: extractelement index out of range", &I);
+}
+
+void Lint::visitInsertElementInst(InsertElementInst &I) {
+  if (ConstantInt *CI =
+        dyn_cast<ConstantInt>(findValue(I.getOperand(2),
+                                        /*OffsetOk=*/false)))
+    Assert1(CI->getValue().ult(I.getType()->getNumElements()),
+            "Undefined result: insertelement index out of range", &I);
+}
+
+void Lint::visitUnreachableInst(UnreachableInst &I) {
+  // This isn't undefined behavior, it's merely suspicious.
+  Assert1(&I == I.getParent()->begin() ||
+          prior(BasicBlock::iterator(&I))->mayHaveSideEffects(),
+          "Unusual: unreachable immediately preceded by instruction without "
+          "side effects", &I);
+}
+
+/// findValue - Look through bitcasts and simple memory reference patterns
+/// to identify an equivalent, but more informative, value.  If OffsetOk
+/// is true, look through getelementptrs with non-zero offsets too.
+///
+/// Most analysis passes don't require this logic, because instcombine
+/// will simplify most of these kinds of things away. But it's a goal of
+/// this Lint pass to be useful even on non-optimized IR.
+Value *Lint::findValue(Value *V, bool OffsetOk) const {
+  SmallPtrSet<Value *, 4> Visited;
+  return findValueImpl(V, OffsetOk, Visited);
+}
+
+/// findValueImpl - Implementation helper for findValue.
+Value *Lint::findValueImpl(Value *V, bool OffsetOk,
+                           SmallPtrSet<Value *, 4> &Visited) const {
+  // Detect self-referential values.
+  if (!Visited.insert(V))
+    return UndefValue::get(V->getType());
+
+  // TODO: Look through sext or zext cast, when the result is known to
+  // be interpreted as signed or unsigned, respectively.
+  // TODO: Look through eliminable cast pairs.
+  // TODO: Look through calls with unique return values.
+  // TODO: Look through vector insert/extract/shuffle.
+  V = OffsetOk ? GetUnderlyingObject(V, TD) : V->stripPointerCasts();
+  if (LoadInst *L = dyn_cast<LoadInst>(V)) {
+    BasicBlock::iterator BBI = L;
+    BasicBlock *BB = L->getParent();
+    SmallPtrSet<BasicBlock *, 4> VisitedBlocks;
+    for (;;) {
+      if (!VisitedBlocks.insert(BB)) break;
+      if (Value *U = FindAvailableLoadedValue(L->getPointerOperand(),
+                                              BB, BBI, 6, AA))
+        return findValueImpl(U, OffsetOk, Visited);
+      if (BBI != BB->begin()) break;
+      BB = BB->getUniquePredecessor();
+      if (!BB) break;
+      BBI = BB->end();
+    }
+  } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    if (Value *W = PN->hasConstantValue())
+      if (W != V)
+        return findValueImpl(W, OffsetOk, Visited);
+  } else if (CastInst *CI = dyn_cast<CastInst>(V)) {
+    if (CI->isNoopCast(TD ? TD->getIntPtrType(V->getContext()) :
+                            Type::getInt64Ty(V->getContext())))
+      return findValueImpl(CI->getOperand(0), OffsetOk, Visited);
+  } else if (ExtractValueInst *Ex = dyn_cast<ExtractValueInst>(V)) {
+    if (Value *W = FindInsertedValue(Ex->getAggregateOperand(),
+                                     Ex->getIndices()))
+      if (W != V)
+        return findValueImpl(W, OffsetOk, Visited);
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    // Same as above, but for ConstantExpr instead of Instruction.
+    if (Instruction::isCast(CE->getOpcode())) {
+      if (CastInst::isNoopCast(Instruction::CastOps(CE->getOpcode()),
+                               CE->getOperand(0)->getType(),
+                               CE->getType(),
+                               TD ? TD->getIntPtrType(V->getContext()) :
+                                    Type::getInt64Ty(V->getContext())))
+        return findValueImpl(CE->getOperand(0), OffsetOk, Visited);
+    } else if (CE->getOpcode() == Instruction::ExtractValue) {
+      ArrayRef<unsigned> Indices = CE->getIndices();
+      if (Value *W = FindInsertedValue(CE->getOperand(0), Indices))
+        if (W != V)
+          return findValueImpl(W, OffsetOk, Visited);
+    }
+  }
+
+  // As a last resort, try SimplifyInstruction or constant folding.
+  if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+    if (Value *W = SimplifyInstruction(Inst, TD, TLI, DT))
+      return findValueImpl(W, OffsetOk, Visited);
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    if (Value *W = ConstantFoldConstantExpression(CE, TD, TLI))
+      if (W != V)
+        return findValueImpl(W, OffsetOk, Visited);
+  }
+
+  return V;
+}
+
+//===----------------------------------------------------------------------===//
+//  Implement the public interfaces to this file...
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createLintPass() {
+  return new Lint();
+}
+
+/// lintFunction - Check a function for errors, printing messages on stderr.
+///
+void llvm::lintFunction(const Function &f) {
+  Function &F = const_cast<Function&>(f);
+  assert(!F.isDeclaration() && "Cannot lint external functions");
+
+  FunctionPassManager FPM(F.getParent());
+  Lint *V = new Lint();
+  FPM.add(V);
+  FPM.run(F);
+}
+
+/// lintModule - Check a module for errors, printing messages on stderr.
+///
+void llvm::lintModule(const Module &M) {
+  PassManager PM;
+  Lint *V = new Lint();
+  PM.add(V);
+  PM.run(const_cast<Module&>(M));
+}
diff --git a/contrib/llvm/lib/Analysis/Loads.cpp b/contrib/llvm/lib/Analysis/Loads.cpp
new file mode 100644
index 000000000000..0902a39a9f81
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/Loads.cpp
@@ -0,0 +1,221 @@
+//===- Loads.cpp - Local load analysis ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines simple local analyses for load instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+using namespace llvm;
+
+/// AreEquivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+///   %t0 = getelementptr \@a, 0, 3
+///   store i32 0, i32* %t0
+///   %t1 = getelementptr \@a, 0, 3
+///   %t2 = load i32* %t1
+///
+static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
+  // Test if the values are trivially equivalent.
+  if (A == B) return true;
+
+  // Test if the values come from identical arithmetic instructions.
+  // Use isIdenticalToWhenDefined instead of isIdenticalTo because
+  // this function is only used when one address use dominates the
+  // other, which means that they'll always either have the same
+  // value or one of them will have an undefined value.
+  if (isa<BinaryOperator>(A) || isa<CastInst>(A) ||
+      isa<PHINode>(A) || isa<GetElementPtrInst>(A))
+    if (const Instruction *BI = dyn_cast<Instruction>(B))
+      if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
+        return true;
+
+  // Otherwise they may not be equivalent.
+  return false;
+}
+
+/// isSafeToLoadUnconditionally - Return true if we know that executing a load
+/// from this value cannot trap.  If it is not obviously safe to load from the
+/// specified pointer, we do a quick local scan of the basic block containing
+/// ScanFrom, to determine if the address is already accessed.
+bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
+                                       unsigned Align, const DataLayout *TD) {
+  int64_t ByteOffset = 0;
+  Value *Base = V;
+  Base = GetPointerBaseWithConstantOffset(V, ByteOffset, TD);
+
+  if (ByteOffset < 0) // out of bounds
+    return false;
+
+  Type *BaseType = 0;
+  unsigned BaseAlign = 0;
+  if (const AllocaInst *AI = dyn_cast<AllocaInst>(Base)) {
+    // An alloca is safe to load from as load as it is suitably aligned.
+    BaseType = AI->getAllocatedType();
+    BaseAlign = AI->getAlignment();
+  } else if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
+    // Global variables are safe to load from but their size cannot be
+    // guaranteed if they are overridden.
+    if (!GV->mayBeOverridden()) {
+      BaseType = GV->getType()->getElementType();
+      BaseAlign = GV->getAlignment();
+    }
+  }
+
+  if (BaseType && BaseType->isSized()) {
+    if (TD && BaseAlign == 0)
+      BaseAlign = TD->getPrefTypeAlignment(BaseType);
+
+    if (Align <= BaseAlign) {
+      if (!TD)
+        return true; // Loading directly from an alloca or global is OK.
+
+      // Check if the load is within the bounds of the underlying object.
+      PointerType *AddrTy = cast<PointerType>(V->getType());
+      uint64_t LoadSize = TD->getTypeStoreSize(AddrTy->getElementType());
+      if (ByteOffset + LoadSize <= TD->getTypeAllocSize(BaseType) &&
+          (Align == 0 || (ByteOffset % Align) == 0))
+        return true;
+    }
+  }
+
+  // Otherwise, be a little bit aggressive by scanning the local block where we
+  // want to check to see if the pointer is already being loaded or stored
+  // from/to.  If so, the previous load or store would have already trapped,
+  // so there is no harm doing an extra load (also, CSE will later eliminate
+  // the load entirely).
+  BasicBlock::iterator BBI = ScanFrom, E = ScanFrom->getParent()->begin();
+
+  while (BBI != E) {
+    --BBI;
+
+    // If we see a free or a call which may write to memory (i.e. which might do
+    // a free) the pointer could be marked invalid.
+    if (isa<CallInst>(BBI) && BBI->mayWriteToMemory() &&
+        !isa<DbgInfoIntrinsic>(BBI))
+      return false;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+      if (AreEquivalentAddressValues(LI->getOperand(0), V)) return true;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+      if (AreEquivalentAddressValues(SI->getOperand(1), V)) return true;
+    }
+  }
+  return false;
+}
+
+/// FindAvailableLoadedValue - Scan the ScanBB block backwards (starting at the
+/// instruction before ScanFrom) checking to see if we have the value at the
+/// memory address *Ptr locally available within a small number of instructions.
+/// If the value is available, return it.
+///
+/// If not, return the iterator for the last validated instruction that the 
+/// value would be live through.  If we scanned the entire block and didn't find
+/// something that invalidates *Ptr or provides it, ScanFrom would be left at
+/// begin() and this returns null.  ScanFrom could also be left 
+///
+/// MaxInstsToScan specifies the maximum instructions to scan in the block.  If
+/// it is set to 0, it will scan the whole block. You can also optionally
+/// specify an alias analysis implementation, which makes this more precise.
+///
+/// If TBAATag is non-null and a load or store is found, the TBAA tag from the
+/// load or store is recorded there.  If there is no TBAA tag or if no access
+/// is found, it is left unmodified.
+Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
+                                      BasicBlock::iterator &ScanFrom,
+                                      unsigned MaxInstsToScan,
+                                      AliasAnalysis *AA,
+                                      MDNode **TBAATag) {
+  if (MaxInstsToScan == 0) MaxInstsToScan = ~0U;
+
+  // If we're using alias analysis to disambiguate get the size of *Ptr.
+  uint64_t AccessSize = 0;
+  if (AA) {
+    Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
+    AccessSize = AA->getTypeStoreSize(AccessTy);
+  }
+  
+  while (ScanFrom != ScanBB->begin()) {
+    // We must ignore debug info directives when counting (otherwise they
+    // would affect codegen).
+    Instruction *Inst = --ScanFrom;
+    if (isa<DbgInfoIntrinsic>(Inst))
+      continue;
+
+    // Restore ScanFrom to expected value in case next test succeeds
+    ScanFrom++;
+   
+    // Don't scan huge blocks.
+    if (MaxInstsToScan-- == 0) return 0;
+    
+    --ScanFrom;
+    // If this is a load of Ptr, the loaded value is available.
+    // (This is true even if the load is volatile or atomic, although
+    // those cases are unlikely.)
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+      if (AreEquivalentAddressValues(LI->getOperand(0), Ptr)) {
+        if (TBAATag) *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa);
+        return LI;
+      }
+    
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      // If this is a store through Ptr, the value is available!
+      // (This is true even if the store is volatile or atomic, although
+      // those cases are unlikely.)
+      if (AreEquivalentAddressValues(SI->getOperand(1), Ptr)) {
+        if (TBAATag) *TBAATag = SI->getMetadata(LLVMContext::MD_tbaa);
+        return SI->getOperand(0);
+      }
+      
+      // If Ptr is an alloca and this is a store to a different alloca, ignore
+      // the store.  This is a trivial form of alias analysis that is important
+      // for reg2mem'd code.
+      if ((isa<AllocaInst>(Ptr) || isa<GlobalVariable>(Ptr)) &&
+          (isa<AllocaInst>(SI->getOperand(1)) ||
+           isa<GlobalVariable>(SI->getOperand(1))))
+        continue;
+      
+      // If we have alias analysis and it says the store won't modify the loaded
+      // value, ignore the store.
+      if (AA &&
+          (AA->getModRefInfo(SI, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+        continue;
+      
+      // Otherwise the store that may or may not alias the pointer, bail out.
+      ++ScanFrom;
+      return 0;
+    }
+    
+    // If this is some other instruction that may clobber Ptr, bail out.
+    if (Inst->mayWriteToMemory()) {
+      // If alias analysis claims that it really won't modify the load,
+      // ignore it.
+      if (AA &&
+          (AA->getModRefInfo(Inst, Ptr, AccessSize) & AliasAnalysis::Mod) == 0)
+        continue;
+      
+      // May modify the pointer, bail out.
+      ++ScanFrom;
+      return 0;
+    }
+  }
+  
+  // Got to the start of the block, we didn't find it, but are done for this
+  // block.
+  return 0;
+}
diff --git a/contrib/llvm/lib/Analysis/LoopInfo.cpp b/contrib/llvm/lib/Analysis/LoopInfo.cpp
new file mode 100644
index 000000000000..f1ad6506e4ba
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/LoopInfo.cpp
@@ -0,0 +1,687 @@
+//===- LoopInfo.cpp - Natural Loop Calculator -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the LoopInfo class that is used to identify natural loops
+// and determine the loop depth of various nodes of the CFG.  Note that the
+// loops identified may actually be several natural loops that share the same
+// header node... not just a single natural loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfoImpl.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+using namespace llvm;
+
+// Explicitly instantiate methods in LoopInfoImpl.h for IR-level Loops.
+template class llvm::LoopBase<BasicBlock, Loop>;
+template class llvm::LoopInfoBase<BasicBlock, Loop>;
+
+// Always verify loopinfo if expensive checking is enabled.
+#ifdef XDEBUG
+static bool VerifyLoopInfo = true;
+#else
+static bool VerifyLoopInfo = false;
+#endif
+static cl::opt<bool,true>
+VerifyLoopInfoX("verify-loop-info", cl::location(VerifyLoopInfo),
+                cl::desc("Verify loop info (time consuming)"));
+
+char LoopInfo::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInfo, "loops", "Natural Loop Information", true, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(LoopInfo, "loops", "Natural Loop Information", true, true)
+
+//===----------------------------------------------------------------------===//
+// Loop implementation
+//
+
+/// isLoopInvariant - Return true if the specified value is loop invariant
+///
+bool Loop::isLoopInvariant(Value *V) const {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return !contains(I);
+  return true;  // All non-instructions are loop invariant
+}
+
+/// hasLoopInvariantOperands - Return true if all the operands of the
+/// specified instruction are loop invariant.
+bool Loop::hasLoopInvariantOperands(Instruction *I) const {
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (!isLoopInvariant(I->getOperand(i)))
+      return false;
+
+  return true;
+}
+
+/// makeLoopInvariant - If the given value is an instruciton inside of the
+/// loop and it can be hoisted, do so to make it trivially loop-invariant.
+/// Return true if the value after any hoisting is loop invariant. This
+/// function can be used as a slightly more aggressive replacement for
+/// isLoopInvariant.
+///
+/// If InsertPt is specified, it is the point to hoist instructions to.
+/// If null, the terminator of the loop preheader is used.
+///
+bool Loop::makeLoopInvariant(Value *V, bool &Changed,
+                             Instruction *InsertPt) const {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return makeLoopInvariant(I, Changed, InsertPt);
+  return true;  // All non-instructions are loop-invariant.
+}
+
+/// makeLoopInvariant - If the given instruction is inside of the
+/// loop and it can be hoisted, do so to make it trivially loop-invariant.
+/// Return true if the instruction after any hoisting is loop invariant. This
+/// function can be used as a slightly more aggressive replacement for
+/// isLoopInvariant.
+///
+/// If InsertPt is specified, it is the point to hoist instructions to.
+/// If null, the terminator of the loop preheader is used.
+///
+bool Loop::makeLoopInvariant(Instruction *I, bool &Changed,
+                             Instruction *InsertPt) const {
+  // Test if the value is already loop-invariant.
+  if (isLoopInvariant(I))
+    return true;
+  if (!isSafeToSpeculativelyExecute(I))
+    return false;
+  if (I->mayReadFromMemory())
+    return false;
+  // The landingpad instruction is immobile.
+  if (isa<LandingPadInst>(I))
+    return false;
+  // Determine the insertion point, unless one was given.
+  if (!InsertPt) {
+    BasicBlock *Preheader = getLoopPreheader();
+    // Without a preheader, hoisting is not feasible.
+    if (!Preheader)
+      return false;
+    InsertPt = Preheader->getTerminator();
+  }
+  // Don't hoist instructions with loop-variant operands.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (!makeLoopInvariant(I->getOperand(i), Changed, InsertPt))
+      return false;
+
+  // Hoist.
+  I->moveBefore(InsertPt);
+  Changed = true;
+  return true;
+}
+
+/// getCanonicalInductionVariable - Check to see if the loop has a canonical
+/// induction variable: an integer recurrence that starts at 0 and increments
+/// by one each time through the loop.  If so, return the phi node that
+/// corresponds to it.
+///
+/// The IndVarSimplify pass transforms loops to have a canonical induction
+/// variable.
+///
+PHINode *Loop::getCanonicalInductionVariable() const {
+  BasicBlock *H = getHeader();
+
+  BasicBlock *Incoming = 0, *Backedge = 0;
+  pred_iterator PI = pred_begin(H);
+  assert(PI != pred_end(H) &&
+         "Loop must have at least one backedge!");
+  Backedge = *PI++;
+  if (PI == pred_end(H)) return 0;  // dead loop
+  Incoming = *PI++;
+  if (PI != pred_end(H)) return 0;  // multiple backedges?
+
+  if (contains(Incoming)) {
+    if (contains(Backedge))
+      return 0;
+    std::swap(Incoming, Backedge);
+  } else if (!contains(Backedge))
+    return 0;
+
+  // Loop over all of the PHI nodes, looking for a canonical indvar.
+  for (BasicBlock::iterator I = H->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    if (ConstantInt *CI =
+        dyn_cast<ConstantInt>(PN->getIncomingValueForBlock(Incoming)))
+      if (CI->isNullValue())
+        if (Instruction *Inc =
+            dyn_cast<Instruction>(PN->getIncomingValueForBlock(Backedge)))
+          if (Inc->getOpcode() == Instruction::Add &&
+                Inc->getOperand(0) == PN)
+            if (ConstantInt *CI = dyn_cast<ConstantInt>(Inc->getOperand(1)))
+              if (CI->equalsInt(1))
+                return PN;
+  }
+  return 0;
+}
+
+/// isLCSSAForm - Return true if the Loop is in LCSSA form
+bool Loop::isLCSSAForm(DominatorTree &DT) const {
+  // Sort the blocks vector so that we can use binary search to do quick
+  // lookups.
+  SmallPtrSet<BasicBlock*, 16> LoopBBs(block_begin(), block_end());
+
+  for (block_iterator BI = block_begin(), E = block_end(); BI != E; ++BI) {
+    BasicBlock *BB = *BI;
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;++I)
+      for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
+           ++UI) {
+        User *U = *UI;
+        BasicBlock *UserBB = cast<Instruction>(U)->getParent();
+        if (PHINode *P = dyn_cast<PHINode>(U))
+          UserBB = P->getIncomingBlock(UI);
+
+        // Check the current block, as a fast-path, before checking whether
+        // the use is anywhere in the loop.  Most values are used in the same
+        // block they are defined in.  Also, blocks not reachable from the
+        // entry are special; uses in them don't need to go through PHIs.
+        if (UserBB != BB &&
+            !LoopBBs.count(UserBB) &&
+            DT.isReachableFromEntry(UserBB))
+          return false;
+      }
+  }
+
+  return true;
+}
+
+/// isLoopSimplifyForm - Return true if the Loop is in the form that
+/// the LoopSimplify form transforms loops to, which is sometimes called
+/// normal form.
+bool Loop::isLoopSimplifyForm() const {
+  // Normal-form loops have a preheader, a single backedge, and all of their
+  // exits have all their predecessors inside the loop.
+  return getLoopPreheader() && getLoopLatch() && hasDedicatedExits();
+}
+
+/// isSafeToClone - Return true if the loop body is safe to clone in practice.
+/// Routines that reform the loop CFG and split edges often fail on indirectbr.
+bool Loop::isSafeToClone() const {
+  // Return false if any loop blocks contain indirectbrs, or there are any calls
+  // to noduplicate functions.
+  for (Loop::block_iterator I = block_begin(), E = block_end(); I != E; ++I) {
+    if (isa<IndirectBrInst>((*I)->getTerminator())) {
+      return false;
+    } else if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) {
+      if (II->hasFnAttr(Attribute::NoDuplicate))
+        return false;
+    }
+
+    for (BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end(); BI != BE; ++BI) {
+      if (const CallInst *CI = dyn_cast<CallInst>(BI)) {
+        if (CI->hasFnAttr(Attribute::NoDuplicate))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool Loop::isAnnotatedParallel() const {
+
+  BasicBlock *latch = getLoopLatch();
+  if (latch == NULL)
+    return false;
+
+  MDNode *desiredLoopIdMetadata =
+    latch->getTerminator()->getMetadata("llvm.loop.parallel");
+
+  if (!desiredLoopIdMetadata)
+      return false;
+
+  // The loop branch contains the parallel loop metadata. In order to ensure
+  // that any parallel-loop-unaware optimization pass hasn't added loop-carried
+  // dependencies (thus converted the loop back to a sequential loop), check
+  // that all the memory instructions in the loop contain parallelism metadata
+  // that point to the same unique "loop id metadata" the loop branch does.
+  for (block_iterator BB = block_begin(), BE = block_end(); BB != BE; ++BB) {
+    for (BasicBlock::iterator II = (*BB)->begin(), EE = (*BB)->end();
+         II != EE; II++) {
+
+      if (!II->mayReadOrWriteMemory())
+        continue;
+
+      if (!II->getMetadata("llvm.mem.parallel_loop_access"))
+        return false;
+
+      // The memory instruction can refer to the loop identifier metadata
+      // directly or indirectly through another list metadata (in case of
+      // nested parallel loops). The loop identifier metadata refers to
+      // itself so we can check both cases with the same routine.
+      MDNode *loopIdMD =
+          dyn_cast<MDNode>(II->getMetadata("llvm.mem.parallel_loop_access"));
+      bool loopIdMDFound = false;
+      for (unsigned i = 0, e = loopIdMD->getNumOperands(); i < e; ++i) {
+        if (loopIdMD->getOperand(i) == desiredLoopIdMetadata) {
+          loopIdMDFound = true;
+          break;
+        }
+      }
+
+      if (!loopIdMDFound)
+        return false;
+    }
+  }
+  return true;
+}
+
+
+/// hasDedicatedExits - Return true if no exit block for the loop
+/// has a predecessor that is outside the loop.
+bool Loop::hasDedicatedExits() const {
+  // Sort the blocks vector so that we can use binary search to do quick
+  // lookups.
+  SmallPtrSet<BasicBlock *, 16> LoopBBs(block_begin(), block_end());
+  // Each predecessor of each exit block of a normal loop is contained
+  // within the loop.
+  SmallVector<BasicBlock *, 4> ExitBlocks;
+  getExitBlocks(ExitBlocks);
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    for (pred_iterator PI = pred_begin(ExitBlocks[i]),
+         PE = pred_end(ExitBlocks[i]); PI != PE; ++PI)
+      if (!LoopBBs.count(*PI))
+        return false;
+  // All the requirements are met.
+  return true;
+}
+
+/// getUniqueExitBlocks - Return all unique successor blocks of this loop.
+/// These are the blocks _outside of the current loop_ which are branched to.
+/// This assumes that loop exits are in canonical form.
+///
+void
+Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
+  assert(hasDedicatedExits() &&
+         "getUniqueExitBlocks assumes the loop has canonical form exits!");
+
+  // Sort the blocks vector so that we can use binary search to do quick
+  // lookups.
+  SmallVector<BasicBlock *, 128> LoopBBs(block_begin(), block_end());
+  std::sort(LoopBBs.begin(), LoopBBs.end());
+
+  SmallVector<BasicBlock *, 32> switchExitBlocks;
+
+  for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI) {
+
+    BasicBlock *current = *BI;
+    switchExitBlocks.clear();
+
+    for (succ_iterator I = succ_begin(*BI), E = succ_end(*BI); I != E; ++I) {
+      // If block is inside the loop then it is not a exit block.
+      if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
+        continue;
+
+      pred_iterator PI = pred_begin(*I);
+      BasicBlock *firstPred = *PI;
+
+      // If current basic block is this exit block's first predecessor
+      // then only insert exit block in to the output ExitBlocks vector.
+      // This ensures that same exit block is not inserted twice into
+      // ExitBlocks vector.
+      if (current != firstPred)
+        continue;
+
+      // If a terminator has more then two successors, for example SwitchInst,
+      // then it is possible that there are multiple edges from current block
+      // to one exit block.
+      if (std::distance(succ_begin(current), succ_end(current)) <= 2) {
+        ExitBlocks.push_back(*I);
+        continue;
+      }
+
+      // In case of multiple edges from current block to exit block, collect
+      // only one edge in ExitBlocks. Use switchExitBlocks to keep track of
+      // duplicate edges.
+      if (std::find(switchExitBlocks.begin(), switchExitBlocks.end(), *I)
+          == switchExitBlocks.end()) {
+        switchExitBlocks.push_back(*I);
+        ExitBlocks.push_back(*I);
+      }
+    }
+  }
+}
+
+/// getUniqueExitBlock - If getUniqueExitBlocks would return exactly one
+/// block, return that block. Otherwise return null.
+BasicBlock *Loop::getUniqueExitBlock() const {
+  SmallVector<BasicBlock *, 8> UniqueExitBlocks;
+  getUniqueExitBlocks(UniqueExitBlocks);
+  if (UniqueExitBlocks.size() == 1)
+    return UniqueExitBlocks[0];
+  return 0;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Loop::dump() const {
+  print(dbgs());
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// UnloopUpdater implementation
+//
+
+namespace {
+/// Find the new parent loop for all blocks within the "unloop" whose last
+/// backedges has just been removed.
+class UnloopUpdater {
+  Loop *Unloop;
+  LoopInfo *LI;
+
+  LoopBlocksDFS DFS;
+
+  // Map unloop's immediate subloops to their nearest reachable parents. Nested
+  // loops within these subloops will not change parents. However, an immediate
+  // subloop's new parent will be the nearest loop reachable from either its own
+  // exits *or* any of its nested loop's exits.
+  DenseMap<Loop*, Loop*> SubloopParents;
+
+  // Flag the presence of an irreducible backedge whose destination is a block
+  // directly contained by the original unloop.
+  bool FoundIB;
+
+public:
+  UnloopUpdater(Loop *UL, LoopInfo *LInfo) :
+    Unloop(UL), LI(LInfo), DFS(UL), FoundIB(false) {}
+
+  void updateBlockParents();
+
+  void removeBlocksFromAncestors();
+
+  void updateSubloopParents();
+
+protected:
+  Loop *getNearestLoop(BasicBlock *BB, Loop *BBLoop);
+};
+} // end anonymous namespace
+
+/// updateBlockParents - Update the parent loop for all blocks that are directly
+/// contained within the original "unloop".
+void UnloopUpdater::updateBlockParents() {
+  if (Unloop->getNumBlocks()) {
+    // Perform a post order CFG traversal of all blocks within this loop,
+    // propagating the nearest loop from sucessors to predecessors.
+    LoopBlocksTraversal Traversal(DFS, LI);
+    for (LoopBlocksTraversal::POTIterator POI = Traversal.begin(),
+           POE = Traversal.end(); POI != POE; ++POI) {
+
+      Loop *L = LI->getLoopFor(*POI);
+      Loop *NL = getNearestLoop(*POI, L);
+
+      if (NL != L) {
+        // For reducible loops, NL is now an ancestor of Unloop.
+        assert((NL != Unloop && (!NL || NL->contains(Unloop))) &&
+               "uninitialized successor");
+        LI->changeLoopFor(*POI, NL);
+      }
+      else {
+        // Or the current block is part of a subloop, in which case its parent
+        // is unchanged.
+        assert((FoundIB || Unloop->contains(L)) && "uninitialized successor");
+      }
+    }
+  }
+  // Each irreducible loop within the unloop induces a round of iteration using
+  // the DFS result cached by Traversal.
+  bool Changed = FoundIB;
+  for (unsigned NIters = 0; Changed; ++NIters) {
+    assert(NIters < Unloop->getNumBlocks() && "runaway iterative algorithm");
+
+    // Iterate over the postorder list of blocks, propagating the nearest loop
+    // from successors to predecessors as before.
+    Changed = false;
+    for (LoopBlocksDFS::POIterator POI = DFS.beginPostorder(),
+           POE = DFS.endPostorder(); POI != POE; ++POI) {
+
+      Loop *L = LI->getLoopFor(*POI);
+      Loop *NL = getNearestLoop(*POI, L);
+      if (NL != L) {
+        assert(NL != Unloop && (!NL || NL->contains(Unloop)) &&
+               "uninitialized successor");
+        LI->changeLoopFor(*POI, NL);
+        Changed = true;
+      }
+    }
+  }
+}
+
+/// removeBlocksFromAncestors - Remove unloop's blocks from all ancestors below
+/// their new parents.
+void UnloopUpdater::removeBlocksFromAncestors() {
+  // Remove all unloop's blocks (including those in nested subloops) from
+  // ancestors below the new parent loop.
+  for (Loop::block_iterator BI = Unloop->block_begin(),
+         BE = Unloop->block_end(); BI != BE; ++BI) {
+    Loop *OuterParent = LI->getLoopFor(*BI);
+    if (Unloop->contains(OuterParent)) {
+      while (OuterParent->getParentLoop() != Unloop)
+        OuterParent = OuterParent->getParentLoop();
+      OuterParent = SubloopParents[OuterParent];
+    }
+    // Remove blocks from former Ancestors except Unloop itself which will be
+    // deleted.
+    for (Loop *OldParent = Unloop->getParentLoop(); OldParent != OuterParent;
+         OldParent = OldParent->getParentLoop()) {
+      assert(OldParent && "new loop is not an ancestor of the original");
+      OldParent->removeBlockFromLoop(*BI);
+    }
+  }
+}
+
+/// updateSubloopParents - Update the parent loop for all subloops directly
+/// nested within unloop.
+void UnloopUpdater::updateSubloopParents() {
+  while (!Unloop->empty()) {
+    Loop *Subloop = *llvm::prior(Unloop->end());
+    Unloop->removeChildLoop(llvm::prior(Unloop->end()));
+
+    assert(SubloopParents.count(Subloop) && "DFS failed to visit subloop");
+    if (Loop *Parent = SubloopParents[Subloop])
+      Parent->addChildLoop(Subloop);
+    else
+      LI->addTopLevelLoop(Subloop);
+  }
+}
+
+/// getNearestLoop - Return the nearest parent loop among this block's
+/// successors. If a successor is a subloop header, consider its parent to be
+/// the nearest parent of the subloop's exits.
+///
+/// For subloop blocks, simply update SubloopParents and return NULL.
+Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
+
+  // Initially for blocks directly contained by Unloop, NearLoop == Unloop and
+  // is considered uninitialized.
+  Loop *NearLoop = BBLoop;
+
+  Loop *Subloop = 0;
+  if (NearLoop != Unloop && Unloop->contains(NearLoop)) {
+    Subloop = NearLoop;
+    // Find the subloop ancestor that is directly contained within Unloop.
+    while (Subloop->getParentLoop() != Unloop) {
+      Subloop = Subloop->getParentLoop();
+      assert(Subloop && "subloop is not an ancestor of the original loop");
+    }
+    // Get the current nearest parent of the Subloop exits, initially Unloop.
+    NearLoop =
+      SubloopParents.insert(std::make_pair(Subloop, Unloop)).first->second;
+  }
+
+  succ_iterator I = succ_begin(BB), E = succ_end(BB);
+  if (I == E) {
+    assert(!Subloop && "subloop blocks must have a successor");
+    NearLoop = 0; // unloop blocks may now exit the function.
+  }
+  for (; I != E; ++I) {
+    if (*I == BB)
+      continue; // self loops are uninteresting
+
+    Loop *L = LI->getLoopFor(*I);
+    if (L == Unloop) {
+      // This successor has not been processed. This path must lead to an
+      // irreducible backedge.
+      assert((FoundIB || !DFS.hasPostorder(*I)) && "should have seen IB");
+      FoundIB = true;
+    }
+    if (L != Unloop && Unloop->contains(L)) {
+      // Successor is in a subloop.
+      if (Subloop)
+        continue; // Branching within subloops. Ignore it.
+
+      // BB branches from the original into a subloop header.
+      assert(L->getParentLoop() == Unloop && "cannot skip into nested loops");
+
+      // Get the current nearest parent of the Subloop's exits.
+      L = SubloopParents[L];
+      // L could be Unloop if the only exit was an irreducible backedge.
+    }
+    if (L == Unloop) {
+      continue;
+    }
+    // Handle critical edges from Unloop into a sibling loop.
+    if (L && !L->contains(Unloop)) {
+      L = L->getParentLoop();
+    }
+    // Remember the nearest parent loop among successors or subloop exits.
+    if (NearLoop == Unloop || !NearLoop || NearLoop->contains(L))
+      NearLoop = L;
+  }
+  if (Subloop) {
+    SubloopParents[Subloop] = NearLoop;
+    return BBLoop;
+  }
+  return NearLoop;
+}
+
+//===----------------------------------------------------------------------===//
+// LoopInfo implementation
+//
+bool LoopInfo::runOnFunction(Function &) {
+  releaseMemory();
+  LI.Analyze(getAnalysis<DominatorTree>().getBase());
+  return false;
+}
+
+/// updateUnloop - The last backedge has been removed from a loop--now the
+/// "unloop". Find a new parent for the blocks contained within unloop and
+/// update the loop tree. We don't necessarily have valid dominators at this
+/// point, but LoopInfo is still valid except for the removal of this loop.
+///
+/// Note that Unloop may now be an empty loop. Calling Loop::getHeader without
+/// checking first is illegal.
+void LoopInfo::updateUnloop(Loop *Unloop) {
+
+  // First handle the special case of no parent loop to simplify the algorithm.
+  if (!Unloop->getParentLoop()) {
+    // Since BBLoop had no parent, Unloop blocks are no longer in a loop.
+    for (Loop::block_iterator I = Unloop->block_begin(),
+         E = Unloop->block_end(); I != E; ++I) {
+
+      // Don't reparent blocks in subloops.
+      if (getLoopFor(*I) != Unloop)
+        continue;
+
+      // Blocks no longer have a parent but are still referenced by Unloop until
+      // the Unloop object is deleted.
+      LI.changeLoopFor(*I, 0);
+    }
+
+    // Remove the loop from the top-level LoopInfo object.
+    for (LoopInfo::iterator I = LI.begin();; ++I) {
+      assert(I != LI.end() && "Couldn't find loop");
+      if (*I == Unloop) {
+        LI.removeLoop(I);
+        break;
+      }
+    }
+
+    // Move all of the subloops to the top-level.
+    while (!Unloop->empty())
+      LI.addTopLevelLoop(Unloop->removeChildLoop(llvm::prior(Unloop->end())));
+
+    return;
+  }
+
+  // Update the parent loop for all blocks within the loop. Blocks within
+  // subloops will not change parents.
+  UnloopUpdater Updater(Unloop, this);
+  Updater.updateBlockParents();
+
+  // Remove blocks from former ancestor loops.
+  Updater.removeBlocksFromAncestors();
+
+  // Add direct subloops as children in their new parent loop.
+  Updater.updateSubloopParents();
+
+  // Remove unloop from its parent loop.
+  Loop *ParentLoop = Unloop->getParentLoop();
+  for (Loop::iterator I = ParentLoop->begin();; ++I) {
+    assert(I != ParentLoop->end() && "Couldn't find loop");
+    if (*I == Unloop) {
+      ParentLoop->removeChildLoop(I);
+      break;
+    }
+  }
+}
+
+void LoopInfo::verifyAnalysis() const {
+  // LoopInfo is a FunctionPass, but verifying every loop in the function
+  // each time verifyAnalysis is called is very expensive. The
+  // -verify-loop-info option can enable this. In order to perform some
+  // checking by default, LoopPass has been taught to call verifyLoop
+  // manually during loop pass sequences.
+
+  if (!VerifyLoopInfo) return;
+
+  DenseSet<const Loop*> Loops;
+  for (iterator I = begin(), E = end(); I != E; ++I) {
+    assert(!(*I)->getParentLoop() && "Top-level loop has a parent!");
+    (*I)->verifyLoopNest(&Loops);
+  }
+
+  // Verify that blocks are mapped to valid loops.
+  for (DenseMap<BasicBlock*, Loop*>::const_iterator I = LI.BBMap.begin(),
+         E = LI.BBMap.end(); I != E; ++I) {
+    assert(Loops.count(I->second) && "orphaned loop");
+    assert(I->second->contains(I->first) && "orphaned block");
+  }
+}
+
+void LoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<DominatorTree>();
+}
+
+void LoopInfo::print(raw_ostream &OS, const Module*) const {
+  LI.print(OS);
+}
+
+//===----------------------------------------------------------------------===//
+// LoopBlocksDFS implementation
+//
+
+/// Traverse the loop blocks and store the DFS result.
+/// Useful for clients that just want the final DFS result and don't need to
+/// visit blocks during the initial traversal.
+void LoopBlocksDFS::perform(LoopInfo *LI) {
+  LoopBlocksTraversal Traversal(*this, LI);
+  for (LoopBlocksTraversal::POTIterator POI = Traversal.begin(),
+         POE = Traversal.end(); POI != POE; ++POI) ;
+}
diff --git a/contrib/llvm/lib/Analysis/LoopPass.cpp b/contrib/llvm/lib/Analysis/LoopPass.cpp
new file mode 100644
index 000000000000..1540112fe10c
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/LoopPass.cpp
@@ -0,0 +1,363 @@
+//===- LoopPass.cpp - Loop Pass and Loop Pass Manager ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements LoopPass and LPPassManager. All loop optimization
+// and transformation passes are derived from LoopPass. LPPassManager is
+// responsible for managing LoopPasses.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Timer.h"
+using namespace llvm;
+
+namespace {
+
+/// PrintLoopPass - Print a Function corresponding to a Loop.
+///
+class PrintLoopPass : public LoopPass {
+private:
+  std::string Banner;
+  raw_ostream &Out;       // raw_ostream to print on.
+
+public:
+  static char ID;
+  PrintLoopPass(const std::string &B, raw_ostream &o)
+      : LoopPass(ID), Banner(B), Out(o) {}
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+  }
+
+  bool runOnLoop(Loop *L, LPPassManager &) {
+    Out << Banner;
+    for (Loop::block_iterator b = L->block_begin(), be = L->block_end();
+         b != be;
+         ++b) {
+      (*b)->print(Out);
+    }
+    return false;
+  }
+};
+
+char PrintLoopPass::ID = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// LPPassManager
+//
+
+char LPPassManager::ID = 0;
+
+LPPassManager::LPPassManager()
+  : FunctionPass(ID), PMDataManager() {
+  skipThisLoop = false;
+  redoThisLoop = false;
+  LI = NULL;
+  CurrentLoop = NULL;
+}
+
+/// Delete loop from the loop queue and loop hierarchy (LoopInfo).
+void LPPassManager::deleteLoopFromQueue(Loop *L) {
+
+  LI->updateUnloop(L);
+
+  // If L is current loop then skip rest of the passes and let
+  // runOnFunction remove L from LQ. Otherwise, remove L from LQ now
+  // and continue applying other passes on CurrentLoop.
+  if (CurrentLoop == L)
+    skipThisLoop = true;
+
+  delete L;
+
+  if (skipThisLoop)
+    return;
+
+  for (std::deque<Loop *>::iterator I = LQ.begin(),
+         E = LQ.end(); I != E; ++I) {
+    if (*I == L) {
+      LQ.erase(I);
+      break;
+    }
+  }
+}
+
+// Inset loop into loop nest (LoopInfo) and loop queue (LQ).
+void LPPassManager::insertLoop(Loop *L, Loop *ParentLoop) {
+
+  assert (CurrentLoop != L && "Cannot insert CurrentLoop");
+
+  // Insert into loop nest
+  if (ParentLoop)
+    ParentLoop->addChildLoop(L);
+  else
+    LI->addTopLevelLoop(L);
+
+  insertLoopIntoQueue(L);
+}
+
+void LPPassManager::insertLoopIntoQueue(Loop *L) {
+  // Insert L into loop queue
+  if (L == CurrentLoop)
+    redoLoop(L);
+  else if (!L->getParentLoop())
+    // This is top level loop.
+    LQ.push_front(L);
+  else {
+    // Insert L after the parent loop.
+    for (std::deque<Loop *>::iterator I = LQ.begin(),
+           E = LQ.end(); I != E; ++I) {
+      if (*I == L->getParentLoop()) {
+        // deque does not support insert after.
+        ++I;
+        LQ.insert(I, 1, L);
+        break;
+      }
+    }
+  }
+}
+
+// Reoptimize this loop. LPPassManager will re-insert this loop into the
+// queue. This allows LoopPass to change loop nest for the loop. This
+// utility may send LPPassManager into infinite loops so use caution.
+void LPPassManager::redoLoop(Loop *L) {
+  assert (CurrentLoop == L && "Can redo only CurrentLoop");
+  redoThisLoop = true;
+}
+
+/// cloneBasicBlockSimpleAnalysis - Invoke cloneBasicBlockAnalysis hook for
+/// all loop passes.
+void LPPassManager::cloneBasicBlockSimpleAnalysis(BasicBlock *From,
+                                                  BasicBlock *To, Loop *L) {
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    LoopPass *LP = getContainedPass(Index);
+    LP->cloneBasicBlockAnalysis(From, To, L);
+  }
+}
+
+/// deleteSimpleAnalysisValue - Invoke deleteAnalysisValue hook for all passes.
+void LPPassManager::deleteSimpleAnalysisValue(Value *V, Loop *L) {
+  if (BasicBlock *BB = dyn_cast<BasicBlock>(V)) {
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;
+         ++BI) {
+      Instruction &I = *BI;
+      deleteSimpleAnalysisValue(&I, L);
+    }
+  }
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    LoopPass *LP = getContainedPass(Index);
+    LP->deleteAnalysisValue(V, L);
+  }
+}
+
+
+// Recurse through all subloops and all loops  into LQ.
+static void addLoopIntoQueue(Loop *L, std::deque<Loop *> &LQ) {
+  LQ.push_back(L);
+  for (Loop::reverse_iterator I = L->rbegin(), E = L->rend(); I != E; ++I)
+    addLoopIntoQueue(*I, LQ);
+}
+
+/// Pass Manager itself does not invalidate any analysis info.
+void LPPassManager::getAnalysisUsage(AnalysisUsage &Info) const {
+  // LPPassManager needs LoopInfo. In the long term LoopInfo class will
+  // become part of LPPassManager.
+  Info.addRequired<LoopInfo>();
+  Info.setPreservesAll();
+}
+
+/// run - Execute all of the passes scheduled for execution.  Keep track of
+/// whether any of the passes modifies the function, and if so, return true.
+bool LPPassManager::runOnFunction(Function &F) {
+  LI = &getAnalysis<LoopInfo>();
+  bool Changed = false;
+
+  // Collect inherited analysis from Module level pass manager.
+  populateInheritedAnalysis(TPM->activeStack);
+
+  // Populate the loop queue in reverse program order. There is no clear need to
+  // process sibling loops in either forward or reverse order. There may be some
+  // advantage in deleting uses in a later loop before optimizing the
+  // definitions in an earlier loop. If we find a clear reason to process in
+  // forward order, then a forward variant of LoopPassManager should be created.
+  for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I)
+    addLoopIntoQueue(*I, LQ);
+
+  if (LQ.empty()) // No loops, skip calling finalizers
+    return false;
+
+  // Initialization
+  for (std::deque<Loop *>::const_iterator I = LQ.begin(), E = LQ.end();
+       I != E; ++I) {
+    Loop *L = *I;
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      LoopPass *P = getContainedPass(Index);
+      Changed |= P->doInitialization(L, *this);
+    }
+  }
+
+  // Walk Loops
+  while (!LQ.empty()) {
+
+    CurrentLoop  = LQ.back();
+    skipThisLoop = false;
+    redoThisLoop = false;
+
+    // Run all passes on the current Loop.
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      LoopPass *P = getContainedPass(Index);
+
+      dumpPassInfo(P, EXECUTION_MSG, ON_LOOP_MSG,
+                   CurrentLoop->getHeader()->getName());
+      dumpRequiredSet(P);
+
+      initializeAnalysisImpl(P);
+
+      {
+        PassManagerPrettyStackEntry X(P, *CurrentLoop->getHeader());
+        TimeRegion PassTimer(getPassTimer(P));
+
+        Changed |= P->runOnLoop(CurrentLoop, *this);
+      }
+
+      if (Changed)
+        dumpPassInfo(P, MODIFICATION_MSG, ON_LOOP_MSG,
+                     skipThisLoop ? "<deleted>" :
+                                    CurrentLoop->getHeader()->getName());
+      dumpPreservedSet(P);
+
+      if (!skipThisLoop) {
+        // Manually check that this loop is still healthy. This is done
+        // instead of relying on LoopInfo::verifyLoop since LoopInfo
+        // is a function pass and it's really expensive to verify every
+        // loop in the function every time. That level of checking can be
+        // enabled with the -verify-loop-info option.
+        {
+          TimeRegion PassTimer(getPassTimer(LI));
+          CurrentLoop->verifyLoop();
+        }
+
+        // Then call the regular verifyAnalysis functions.
+        verifyPreservedAnalysis(P);
+      }
+
+      removeNotPreservedAnalysis(P);
+      recordAvailableAnalysis(P);
+      removeDeadPasses(P,
+                       skipThisLoop ? "<deleted>" :
+                                      CurrentLoop->getHeader()->getName(),
+                       ON_LOOP_MSG);
+
+      if (skipThisLoop)
+        // Do not run other passes on this loop.
+        break;
+    }
+
+    // If the loop was deleted, release all the loop passes. This frees up
+    // some memory, and avoids trouble with the pass manager trying to call
+    // verifyAnalysis on them.
+    if (skipThisLoop)
+      for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+        Pass *P = getContainedPass(Index);
+        freePass(P, "<deleted>", ON_LOOP_MSG);
+      }
+
+    // Pop the loop from queue after running all passes.
+    LQ.pop_back();
+
+    if (redoThisLoop)
+      LQ.push_back(CurrentLoop);
+  }
+
+  // Finalization
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    LoopPass *P = getContainedPass(Index);
+    Changed |= P->doFinalization();
+  }
+
+  return Changed;
+}
+
+/// Print passes managed by this manager
+void LPPassManager::dumpPassStructure(unsigned Offset) {
+  errs().indent(Offset*2) << "Loop Pass Manager\n";
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    Pass *P = getContainedPass(Index);
+    P->dumpPassStructure(Offset + 1);
+    dumpLastUses(P, Offset+1);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// LoopPass
+
+Pass *LoopPass::createPrinterPass(raw_ostream &O,
+                                  const std::string &Banner) const {
+  return new PrintLoopPass(Banner, O);
+}
+
+// Check if this pass is suitable for the current LPPassManager, if
+// available. This pass P is not suitable for a LPPassManager if P
+// is not preserving higher level analysis info used by other
+// LPPassManager passes. In such case, pop LPPassManager from the
+// stack. This will force assignPassManager() to create new
+// LPPassManger as expected.
+void LoopPass::preparePassManager(PMStack &PMS) {
+
+  // Find LPPassManager
+  while (!PMS.empty() &&
+         PMS.top()->getPassManagerType() > PMT_LoopPassManager)
+    PMS.pop();
+
+  // If this pass is destroying high level information that is used
+  // by other passes that are managed by LPM then do not insert
+  // this pass in current LPM. Use new LPPassManager.
+  if (PMS.top()->getPassManagerType() == PMT_LoopPassManager &&
+      !PMS.top()->preserveHigherLevelAnalysis(this))
+    PMS.pop();
+}
+
+/// Assign pass manager to manage this pass.
+void LoopPass::assignPassManager(PMStack &PMS,
+                                 PassManagerType PreferredType) {
+  // Find LPPassManager
+  while (!PMS.empty() &&
+         PMS.top()->getPassManagerType() > PMT_LoopPassManager)
+    PMS.pop();
+
+  LPPassManager *LPPM;
+  if (PMS.top()->getPassManagerType() == PMT_LoopPassManager)
+    LPPM = (LPPassManager*)PMS.top();
+  else {
+    // Create new Loop Pass Manager if it does not exist.
+    assert (!PMS.empty() && "Unable to create Loop Pass Manager");
+    PMDataManager *PMD = PMS.top();
+
+    // [1] Create new Loop Pass Manager
+    LPPM = new LPPassManager();
+    LPPM->populateInheritedAnalysis(PMS);
+
+    // [2] Set up new manager's top level manager
+    PMTopLevelManager *TPM = PMD->getTopLevelManager();
+    TPM->addIndirectPassManager(LPPM);
+
+    // [3] Assign manager to manage this new manager. This may create
+    // and push new managers into PMS
+    Pass *P = LPPM->getAsPass();
+    TPM->schedulePass(P);
+
+    // [4] Push new manager into PMS
+    PMS.push(LPPM);
+  }
+
+  LPPM->add(this);
+}
diff --git a/contrib/llvm/lib/Analysis/MemDepPrinter.cpp b/contrib/llvm/lib/Analysis/MemDepPrinter.cpp
new file mode 100644
index 000000000000..d26aaf1b9048
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/MemDepPrinter.cpp
@@ -0,0 +1,192 @@
+//===- MemDepPrinter.cpp - Printer for MemoryDependenceAnalysis -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+  struct MemDepPrinter : public FunctionPass {
+    const Function *F;
+
+    enum DepType {
+      Clobber = 0,
+      Def,
+      NonFuncLocal,
+      Unknown
+    };
+
+    static const char *const DepTypeStr[];
+
+    typedef PointerIntPair<const Instruction *, 2, DepType> InstTypePair;
+    typedef std::pair<InstTypePair, const BasicBlock *> Dep;
+    typedef SmallSetVector<Dep, 4> DepSet;
+    typedef DenseMap<const Instruction *, DepSet> DepSetMap;
+    DepSetMap Deps;
+
+    static char ID; // Pass identifcation, replacement for typeid
+    MemDepPrinter() : FunctionPass(ID) {
+      initializeMemDepPrinterPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    void print(raw_ostream &OS, const Module * = 0) const;
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredTransitive<AliasAnalysis>();
+      AU.addRequiredTransitive<MemoryDependenceAnalysis>();
+      AU.setPreservesAll();
+    }
+
+    virtual void releaseMemory() {
+      Deps.clear();
+      F = 0;
+    }
+
+  private:
+    static InstTypePair getInstTypePair(MemDepResult dep) {
+      if (dep.isClobber())
+        return InstTypePair(dep.getInst(), Clobber);
+      if (dep.isDef())
+        return InstTypePair(dep.getInst(), Def);
+      if (dep.isNonFuncLocal())
+        return InstTypePair(dep.getInst(), NonFuncLocal);
+      assert(dep.isUnknown() && "unexptected dependence type");
+      return InstTypePair(dep.getInst(), Unknown);
+    }
+    static InstTypePair getInstTypePair(const Instruction* inst, DepType type) {
+      return InstTypePair(inst, type);
+    }
+  };
+}
+
+char MemDepPrinter::ID = 0;
+INITIALIZE_PASS_BEGIN(MemDepPrinter, "print-memdeps",
+                      "Print MemDeps of function", false, true)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_END(MemDepPrinter, "print-memdeps",
+                      "Print MemDeps of function", false, true)
+
+FunctionPass *llvm::createMemDepPrinter() {
+  return new MemDepPrinter();
+}
+
+const char *const MemDepPrinter::DepTypeStr[]
+  = {"Clobber", "Def", "NonFuncLocal", "Unknown"};
+
+bool MemDepPrinter::runOnFunction(Function &F) {
+  this->F = &F;
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  MemoryDependenceAnalysis &MDA = getAnalysis<MemoryDependenceAnalysis>();
+
+  // All this code uses non-const interfaces because MemDep is not
+  // const-friendly, though nothing is actually modified.
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+    Instruction *Inst = &*I;
+
+    if (!Inst->mayReadFromMemory() && !Inst->mayWriteToMemory())
+      continue;
+
+    MemDepResult Res = MDA.getDependency(Inst);
+    if (!Res.isNonLocal()) {
+      Deps[Inst].insert(std::make_pair(getInstTypePair(Res),
+                                       static_cast<BasicBlock *>(0)));
+    } else if (CallSite CS = cast<Value>(Inst)) {
+      const MemoryDependenceAnalysis::NonLocalDepInfo &NLDI =
+        MDA.getNonLocalCallDependency(CS);
+
+      DepSet &InstDeps = Deps[Inst];
+      for (MemoryDependenceAnalysis::NonLocalDepInfo::const_iterator
+           I = NLDI.begin(), E = NLDI.end(); I != E; ++I) {
+        const MemDepResult &Res = I->getResult();
+        InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB()));
+      }
+    } else {
+      SmallVector<NonLocalDepResult, 4> NLDI;
+      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        if (!LI->isUnordered()) {
+          // FIXME: Handle atomic/volatile loads.
+          Deps[Inst].insert(std::make_pair(getInstTypePair(0, Unknown),
+                                           static_cast<BasicBlock *>(0)));
+          continue;
+        }
+        AliasAnalysis::Location Loc = AA.getLocation(LI);
+        MDA.getNonLocalPointerDependency(Loc, true, LI->getParent(), NLDI);
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        if (!SI->isUnordered()) {
+          // FIXME: Handle atomic/volatile stores.
+          Deps[Inst].insert(std::make_pair(getInstTypePair(0, Unknown),
+                                           static_cast<BasicBlock *>(0)));
+          continue;
+        }
+        AliasAnalysis::Location Loc = AA.getLocation(SI);
+        MDA.getNonLocalPointerDependency(Loc, false, SI->getParent(), NLDI);
+      } else if (VAArgInst *VI = dyn_cast<VAArgInst>(Inst)) {
+        AliasAnalysis::Location Loc = AA.getLocation(VI);
+        MDA.getNonLocalPointerDependency(Loc, false, VI->getParent(), NLDI);
+      } else {
+        llvm_unreachable("Unknown memory instruction!");
+      }
+
+      DepSet &InstDeps = Deps[Inst];
+      for (SmallVectorImpl<NonLocalDepResult>::const_iterator
+           I = NLDI.begin(), E = NLDI.end(); I != E; ++I) {
+        const MemDepResult &Res = I->getResult();
+        InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB()));
+      }
+    }
+  }
+
+  return false;
+}
+
+void MemDepPrinter::print(raw_ostream &OS, const Module *M) const {
+  for (const_inst_iterator I = inst_begin(*F), E = inst_end(*F); I != E; ++I) {
+    const Instruction *Inst = &*I;
+
+    DepSetMap::const_iterator DI = Deps.find(Inst);
+    if (DI == Deps.end())
+      continue;
+
+    const DepSet &InstDeps = DI->second;
+
+    for (DepSet::const_iterator I = InstDeps.begin(), E = InstDeps.end();
+         I != E; ++I) {
+      const Instruction *DepInst = I->first.getPointer();
+      DepType type = I->first.getInt();
+      const BasicBlock *DepBB = I->second;
+
+      OS << "    ";
+      OS << DepTypeStr[type];
+      if (DepBB) {
+        OS << " in block ";
+        WriteAsOperand(OS, DepBB, /*PrintType=*/false, M);
+      }
+      if (DepInst) {
+        OS << " from: ";
+        DepInst->print(OS);
+      }
+      OS << "\n";
+    }
+
+    Inst->print(OS);
+    OS << "\n\n";
+  }
+}
diff --git a/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp
new file mode 100644
index 000000000000..d490d5419f75
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -0,0 +1,819 @@
+//===------ MemoryBuiltins.cpp - Identify calls to memory builtins --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions identifies calls to builtin functions that allocate
+// or free memory.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "memory-builtins"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+enum AllocType {
+  MallocLike         = 1<<0, // allocates
+  CallocLike         = 1<<1, // allocates + bzero
+  ReallocLike        = 1<<2, // reallocates
+  StrDupLike         = 1<<3,
+  AllocLike          = MallocLike | CallocLike | StrDupLike,
+  AnyAlloc           = MallocLike | CallocLike | ReallocLike | StrDupLike
+};
+
+struct AllocFnsTy {
+  LibFunc::Func Func;
+  AllocType AllocTy;
+  unsigned char NumParams;
+  // First and Second size parameters (or -1 if unused)
+  signed char FstParam, SndParam;
+};
+
+// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to
+// know which functions are nounwind, noalias, nocapture parameters, etc.
+static const AllocFnsTy AllocationFnData[] = {
+  {LibFunc::malloc,              MallocLike,  1, 0,  -1},
+  {LibFunc::valloc,              MallocLike,  1, 0,  -1},
+  {LibFunc::Znwj,                MallocLike,  1, 0,  -1}, // new(unsigned int)
+  {LibFunc::ZnwjRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new(unsigned int, nothrow)
+  {LibFunc::Znwm,                MallocLike,  1, 0,  -1}, // new(unsigned long)
+  {LibFunc::ZnwmRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new(unsigned long, nothrow)
+  {LibFunc::Znaj,                MallocLike,  1, 0,  -1}, // new[](unsigned int)
+  {LibFunc::ZnajRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new[](unsigned int, nothrow)
+  {LibFunc::Znam,                MallocLike,  1, 0,  -1}, // new[](unsigned long)
+  {LibFunc::ZnamRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new[](unsigned long, nothrow)
+  {LibFunc::posix_memalign,      MallocLike,  3, 2,  -1},
+  {LibFunc::calloc,              CallocLike,  2, 0,   1},
+  {LibFunc::realloc,             ReallocLike, 2, 1,  -1},
+  {LibFunc::reallocf,            ReallocLike, 2, 1,  -1},
+  {LibFunc::strdup,              StrDupLike,  1, -1, -1},
+  {LibFunc::strndup,             StrDupLike,  2, 1,  -1}
+};
+
+
+static Function *getCalledFunction(const Value *V, bool LookThroughBitCast) {
+  if (LookThroughBitCast)
+    V = V->stripPointerCasts();
+
+  CallSite CS(const_cast<Value*>(V));
+  if (!CS.getInstruction())
+    return 0;
+
+  Function *Callee = CS.getCalledFunction();
+  if (!Callee || !Callee->isDeclaration())
+    return 0;
+  return Callee;
+}
+
+/// \brief Returns the allocation data for the given value if it is a call to a
+/// known allocation function, and NULL otherwise.
+static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
+                                           const TargetLibraryInfo *TLI,
+                                           bool LookThroughBitCast = false) {
+  // Skip intrinsics
+  if (isa<IntrinsicInst>(V))
+    return 0;
+
+  Function *Callee = getCalledFunction(V, LookThroughBitCast);
+  if (!Callee)
+    return 0;
+
+  // Make sure that the function is available.
+  StringRef FnName = Callee->getName();
+  LibFunc::Func TLIFn;
+  if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
+    return 0;
+
+  unsigned i = 0;
+  bool found = false;
+  for ( ; i < array_lengthof(AllocationFnData); ++i) {
+    if (AllocationFnData[i].Func == TLIFn) {
+      found = true;
+      break;
+    }
+  }
+  if (!found)
+    return 0;
+
+  const AllocFnsTy *FnData = &AllocationFnData[i];
+  if ((FnData->AllocTy & AllocTy) == 0)
+    return 0;
+
+  // Check function prototype.
+  int FstParam = FnData->FstParam;
+  int SndParam = FnData->SndParam;
+  FunctionType *FTy = Callee->getFunctionType();
+
+  if (FTy->getReturnType() == Type::getInt8PtrTy(FTy->getContext()) &&
+      FTy->getNumParams() == FnData->NumParams &&
+      (FstParam < 0 ||
+       (FTy->getParamType(FstParam)->isIntegerTy(32) ||
+        FTy->getParamType(FstParam)->isIntegerTy(64))) &&
+      (SndParam < 0 ||
+       FTy->getParamType(SndParam)->isIntegerTy(32) ||
+       FTy->getParamType(SndParam)->isIntegerTy(64)))
+    return FnData;
+  return 0;
+}
+
+static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
+  ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V);
+  return CS && CS.hasFnAttr(Attribute::NoAlias);
+}
+
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
+/// like).
+bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
+                          bool LookThroughBitCast) {
+  return getAllocationData(V, AnyAlloc, TLI, LookThroughBitCast);
+}
+
+/// \brief Tests if a value is a call or invoke to a function that returns a
+/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
+bool llvm::isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
+                       bool LookThroughBitCast) {
+  // it's safe to consider realloc as noalias since accessing the original
+  // pointer is undefined behavior
+  return isAllocationFn(V, TLI, LookThroughBitCast) ||
+         hasNoAliasAttr(V, LookThroughBitCast);
+}
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates uninitialized memory (such as malloc).
+bool llvm::isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+                          bool LookThroughBitCast) {
+  return getAllocationData(V, MallocLike, TLI, LookThroughBitCast);
+}
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates zero-filled memory (such as calloc).
+bool llvm::isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+                          bool LookThroughBitCast) {
+  return getAllocationData(V, CallocLike, TLI, LookThroughBitCast);
+}
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates memory (either malloc, calloc, or strdup like).
+bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+                         bool LookThroughBitCast) {
+  return getAllocationData(V, AllocLike, TLI, LookThroughBitCast);
+}
+
+/// \brief Tests if a value is a call or invoke to a library function that
+/// reallocates memory (such as realloc).
+bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+                           bool LookThroughBitCast) {
+  return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast);
+}
+
+/// extractMallocCall - Returns the corresponding CallInst if the instruction
+/// is a malloc call.  Since CallInst::CreateMalloc() only creates calls, we
+/// ignore InvokeInst here.
+const CallInst *llvm::extractMallocCall(const Value *I,
+                                        const TargetLibraryInfo *TLI) {
+  return isMallocLikeFn(I, TLI) ? dyn_cast<CallInst>(I) : 0;
+}
+
+static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
+                               const TargetLibraryInfo *TLI,
+                               bool LookThroughSExt = false) {
+  if (!CI)
+    return 0;
+
+  // The size of the malloc's result type must be known to determine array size.
+  Type *T = getMallocAllocatedType(CI, TLI);
+  if (!T || !T->isSized() || !TD)
+    return 0;
+
+  unsigned ElementSize = TD->getTypeAllocSize(T);
+  if (StructType *ST = dyn_cast<StructType>(T))
+    ElementSize = TD->getStructLayout(ST)->getSizeInBytes();
+
+  // If malloc call's arg can be determined to be a multiple of ElementSize,
+  // return the multiple.  Otherwise, return NULL.
+  Value *MallocArg = CI->getArgOperand(0);
+  Value *Multiple = 0;
+  if (ComputeMultiple(MallocArg, ElementSize, Multiple,
+                      LookThroughSExt))
+    return Multiple;
+
+  return 0;
+}
+
+/// isArrayMalloc - Returns the corresponding CallInst if the instruction
+/// is a call to malloc whose array size can be determined and the array size
+/// is not constant 1.  Otherwise, return NULL.
+const CallInst *llvm::isArrayMalloc(const Value *I,
+                                    const DataLayout *TD,
+                                    const TargetLibraryInfo *TLI) {
+  const CallInst *CI = extractMallocCall(I, TLI);
+  Value *ArraySize = computeArraySize(CI, TD, TLI);
+
+  if (ConstantInt *ConstSize = dyn_cast_or_null<ConstantInt>(ArraySize))
+    if (ConstSize->isOne())
+      return CI;
+
+  // CI is a non-array malloc or we can't figure out that it is an array malloc.
+  return 0;
+}
+
+/// getMallocType - Returns the PointerType resulting from the malloc call.
+/// The PointerType depends on the number of bitcast uses of the malloc call:
+///   0: PointerType is the calls' return type.
+///   1: PointerType is the bitcast's result type.
+///  >1: Unique PointerType cannot be determined, return NULL.
+PointerType *llvm::getMallocType(const CallInst *CI,
+                                 const TargetLibraryInfo *TLI) {
+  assert(isMallocLikeFn(CI, TLI) && "getMallocType and not malloc call");
+
+  PointerType *MallocType = 0;
+  unsigned NumOfBitCastUses = 0;
+
+  // Determine if CallInst has a bitcast use.
+  for (Value::const_use_iterator UI = CI->use_begin(), E = CI->use_end();
+       UI != E; )
+    if (const BitCastInst *BCI = dyn_cast<BitCastInst>(*UI++)) {
+      MallocType = cast<PointerType>(BCI->getDestTy());
+      NumOfBitCastUses++;
+    }
+
+  // Malloc call has 1 bitcast use, so type is the bitcast's destination type.
+  if (NumOfBitCastUses == 1)
+    return MallocType;
+
+  // Malloc call was not bitcast, so type is the malloc function's return type.
+  if (NumOfBitCastUses == 0)
+    return cast<PointerType>(CI->getType());
+
+  // Type could not be determined.
+  return 0;
+}
+
+/// getMallocAllocatedType - Returns the Type allocated by malloc call.
+/// The Type depends on the number of bitcast uses of the malloc call:
+///   0: PointerType is the malloc calls' return type.
+///   1: PointerType is the bitcast's result type.
+///  >1: Unique PointerType cannot be determined, return NULL.
+Type *llvm::getMallocAllocatedType(const CallInst *CI,
+                                   const TargetLibraryInfo *TLI) {
+  PointerType *PT = getMallocType(CI, TLI);
+  return PT ? PT->getElementType() : 0;
+}
+
+/// getMallocArraySize - Returns the array size of a malloc call.  If the
+/// argument passed to malloc is a multiple of the size of the malloced type,
+/// then return that multiple.  For non-array mallocs, the multiple is
+/// constant 1.  Otherwise, return NULL for mallocs whose array size cannot be
+/// determined.
+Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout *TD,
+                                const TargetLibraryInfo *TLI,
+                                bool LookThroughSExt) {
+  assert(isMallocLikeFn(CI, TLI) && "getMallocArraySize and not malloc call");
+  return computeArraySize(CI, TD, TLI, LookThroughSExt);
+}
+
+
+/// extractCallocCall - Returns the corresponding CallInst if the instruction
+/// is a calloc call.
+const CallInst *llvm::extractCallocCall(const Value *I,
+                                        const TargetLibraryInfo *TLI) {
+  return isCallocLikeFn(I, TLI) ? cast<CallInst>(I) : 0;
+}
+
+
+/// isFreeCall - Returns non-null if the value is a call to the builtin free()
+const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
+  const CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI || isa<IntrinsicInst>(CI))
+    return 0;
+  Function *Callee = CI->getCalledFunction();
+  if (Callee == 0 || !Callee->isDeclaration())
+    return 0;
+
+  StringRef FnName = Callee->getName();
+  LibFunc::Func TLIFn;
+  if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
+    return 0;
+
+  if (TLIFn != LibFunc::free &&
+      TLIFn != LibFunc::ZdlPv && // operator delete(void*)
+      TLIFn != LibFunc::ZdaPv)   // operator delete[](void*)
+    return 0;
+
+  // Check free prototype.
+  // FIXME: workaround for PR5130, this will be obsolete when a nobuiltin
+  // attribute will exist.
+  FunctionType *FTy = Callee->getFunctionType();
+  if (!FTy->getReturnType()->isVoidTy())
+    return 0;
+  if (FTy->getNumParams() != 1)
+    return 0;
+  if (FTy->getParamType(0) != Type::getInt8PtrTy(Callee->getContext()))
+    return 0;
+
+  return CI;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//  Utility functions to compute size of objects.
+//
+
+
+/// \brief Compute the size of the object pointed by Ptr. Returns true and the
+/// object size in Size if successful, and false otherwise.
+/// If RoundToAlign is true, then Size is rounded up to the aligment of allocas,
+/// byval arguments, and global variables.
+bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *TD,
+                         const TargetLibraryInfo *TLI, bool RoundToAlign) {
+  if (!TD)
+    return false;
+
+  ObjectSizeOffsetVisitor Visitor(TD, TLI, Ptr->getContext(), RoundToAlign);
+  SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
+  if (!Visitor.bothKnown(Data))
+    return false;
+
+  APInt ObjSize = Data.first, Offset = Data.second;
+  // check for overflow
+  if (Offset.slt(0) || ObjSize.ult(Offset))
+    Size = 0;
+  else
+    Size = (ObjSize - Offset).getZExtValue();
+  return true;
+}
+
+/// \brief Compute the size of the underlying object pointed by Ptr. Returns
+/// true and the object size in Size if successful, and false otherwise.
+/// If RoundToAlign is true, then Size is rounded up to the aligment of allocas,
+/// byval arguments, and global variables.
+bool llvm::getUnderlyingObjectSize(const Value *Ptr, uint64_t &Size,
+                                   const DataLayout *TD,
+                                   const TargetLibraryInfo *TLI,
+                                   bool RoundToAlign) {
+  if (!TD)
+    return false;
+
+  ObjectSizeOffsetVisitor Visitor(TD, TLI, Ptr->getContext(), RoundToAlign);
+  SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
+  if (!Visitor.knownSize(Data))
+    return false;
+
+  Size = Data.first.getZExtValue();
+  return true;
+}
+
+
+STATISTIC(ObjectVisitorArgument,
+          "Number of arguments with unsolved size and offset");
+STATISTIC(ObjectVisitorLoad,
+          "Number of load instructions with unsolved size and offset");
+
+
+APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
+  if (RoundToAlign && Align)
+    return APInt(IntTyBits, RoundUpToAlignment(Size.getZExtValue(), Align));
+  return Size;
+}
+
+ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *TD,
+                                                 const TargetLibraryInfo *TLI,
+                                                 LLVMContext &Context,
+                                                 bool RoundToAlign)
+: TD(TD), TLI(TLI), RoundToAlign(RoundToAlign) {
+  IntegerType *IntTy = TD->getIntPtrType(Context);
+  IntTyBits = IntTy->getBitWidth();
+  Zero = APInt::getNullValue(IntTyBits);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) {
+  V = V->stripPointerCasts();
+
+  if (isa<Instruction>(V) || isa<GEPOperator>(V)) {
+    // Return cached value or insert unknown in cache if size of V was not
+    // computed yet in order to avoid recursions in PHis.
+    std::pair<CacheMapTy::iterator, bool> CacheVal =
+      CacheMap.insert(std::make_pair(V, unknown()));
+    if (!CacheVal.second)
+      return CacheVal.first->second;
+
+    SizeOffsetType Result;
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
+      Result = visitGEPOperator(*GEP);
+    else
+      Result = visit(cast<Instruction>(*V));
+    return CacheMap[V] = Result;
+  }
+
+  if (Argument *A = dyn_cast<Argument>(V))
+    return visitArgument(*A);
+  if (ConstantPointerNull *P = dyn_cast<ConstantPointerNull>(V))
+    return visitConstantPointerNull(*P);
+  if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
+    return visitGlobalAlias(*GA);
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return visitGlobalVariable(*GV);
+  if (UndefValue *UV = dyn_cast<UndefValue>(V))
+    return visitUndefValue(*UV);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    if (CE->getOpcode() == Instruction::IntToPtr)
+      return unknown(); // clueless
+  }
+
+  DEBUG(dbgs() << "ObjectSizeOffsetVisitor::compute() unhandled value: " << *V
+        << '\n');
+  return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
+  if (!I.getAllocatedType()->isSized())
+    return unknown();
+
+  APInt Size(IntTyBits, TD->getTypeAllocSize(I.getAllocatedType()));
+  if (!I.isArrayAllocation())
+    return std::make_pair(align(Size, I.getAlignment()), Zero);
+
+  Value *ArraySize = I.getArraySize();
+  if (const ConstantInt *C = dyn_cast<ConstantInt>(ArraySize)) {
+    Size *= C->getValue().zextOrSelf(IntTyBits);
+    return std::make_pair(align(Size, I.getAlignment()), Zero);
+  }
+  return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
+  // no interprocedural analysis is done at the moment
+  if (!A.hasByValAttr()) {
+    ++ObjectVisitorArgument;
+    return unknown();
+  }
+  PointerType *PT = cast<PointerType>(A.getType());
+  APInt Size(IntTyBits, TD->getTypeAllocSize(PT->getElementType()));
+  return std::make_pair(align(Size, A.getParamAlignment()), Zero);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
+  const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc,
+                                               TLI);
+  if (!FnData)
+    return unknown();
+
+  // handle strdup-like functions separately
+  if (FnData->AllocTy == StrDupLike) {
+    APInt Size(IntTyBits, GetStringLength(CS.getArgument(0)));
+    if (!Size)
+      return unknown();
+
+    // strndup limits strlen
+    if (FnData->FstParam > 0) {
+      ConstantInt *Arg= dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam));
+      if (!Arg)
+        return unknown();
+
+      APInt MaxSize = Arg->getValue().zextOrSelf(IntTyBits);
+      if (Size.ugt(MaxSize))
+        Size = MaxSize + 1;
+    }
+    return std::make_pair(Size, Zero);
+  }
+
+  ConstantInt *Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam));
+  if (!Arg)
+    return unknown();
+
+  APInt Size = Arg->getValue().zextOrSelf(IntTyBits);
+  // size determined by just 1 parameter
+  if (FnData->SndParam < 0)
+    return std::make_pair(Size, Zero);
+
+  Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->SndParam));
+  if (!Arg)
+    return unknown();
+
+  Size *= Arg->getValue().zextOrSelf(IntTyBits);
+  return std::make_pair(Size, Zero);
+
+  // TODO: handle more standard functions (+ wchar cousins):
+  // - strdup / strndup
+  // - strcpy / strncpy
+  // - strcat / strncat
+  // - memcpy / memmove
+  // - strcat / strncat
+  // - memset
+}
+
+SizeOffsetType
+ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull&) {
+  return std::make_pair(Zero, Zero);
+}
+
+SizeOffsetType
+ObjectSizeOffsetVisitor::visitExtractElementInst(ExtractElementInst&) {
+  return unknown();
+}
+
+SizeOffsetType
+ObjectSizeOffsetVisitor::visitExtractValueInst(ExtractValueInst&) {
+  // Easy cases were already folded by previous passes.
+  return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) {
+  SizeOffsetType PtrData = compute(GEP.getPointerOperand());
+  APInt Offset(IntTyBits, 0);
+  if (!bothKnown(PtrData) || !GEP.accumulateConstantOffset(*TD, Offset))
+    return unknown();
+
+  return std::make_pair(PtrData.first, PtrData.second + Offset);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalAlias(GlobalAlias &GA) {
+  if (GA.mayBeOverridden())
+    return unknown();
+  return compute(GA.getAliasee());
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalVariable(GlobalVariable &GV){
+  if (!GV.hasDefinitiveInitializer())
+    return unknown();
+
+  APInt Size(IntTyBits, TD->getTypeAllocSize(GV.getType()->getElementType()));
+  return std::make_pair(align(Size, GV.getAlignment()), Zero);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitIntToPtrInst(IntToPtrInst&) {
+  // clueless
+  return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitLoadInst(LoadInst&) {
+  ++ObjectVisitorLoad;
+  return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode &PHI) {
+  if (PHI.getNumIncomingValues() == 0)
+    return unknown();
+
+  SizeOffsetType Ret = compute(PHI.getIncomingValue(0));
+  if (!bothKnown(Ret))
+    return unknown();
+
+  // Verify that all PHI incoming pointers have the same size and offset.
+  for (unsigned i = 1, e = PHI.getNumIncomingValues(); i != e; ++i) {
+    SizeOffsetType EdgeData = compute(PHI.getIncomingValue(i));
+    if (!bothKnown(EdgeData) || EdgeData != Ret)
+      return unknown();
+  }
+  return Ret;
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) {
+  SizeOffsetType TrueSide  = compute(I.getTrueValue());
+  SizeOffsetType FalseSide = compute(I.getFalseValue());
+  if (bothKnown(TrueSide) && bothKnown(FalseSide) && TrueSide == FalseSide)
+    return TrueSide;
+  return unknown();
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitUndefValue(UndefValue&) {
+  return std::make_pair(Zero, Zero);
+}
+
+SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) {
+  DEBUG(dbgs() << "ObjectSizeOffsetVisitor unknown instruction:" << I << '\n');
+  return unknown();
+}
+
+
+ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const DataLayout *TD,
+                                                   const TargetLibraryInfo *TLI,
+                                                     LLVMContext &Context)
+: TD(TD), TLI(TLI), Context(Context), Builder(Context, TargetFolder(TD)) {
+  IntTy = TD->getIntPtrType(Context);
+  Zero = ConstantInt::get(IntTy, 0);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
+  SizeOffsetEvalType Result = compute_(V);
+
+  if (!bothKnown(Result)) {
+    // erase everything that was computed in this iteration from the cache, so
+    // that no dangling references are left behind. We could be a bit smarter if
+    // we kept a dependency graph. It's probably not worth the complexity.
+    for (PtrSetTy::iterator I=SeenVals.begin(), E=SeenVals.end(); I != E; ++I) {
+      CacheMapTy::iterator CacheIt = CacheMap.find(*I);
+      // non-computable results can be safely cached
+      if (CacheIt != CacheMap.end() && anyKnown(CacheIt->second))
+        CacheMap.erase(CacheIt);
+    }
+  }
+
+  SeenVals.clear();
+  return Result;
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
+  ObjectSizeOffsetVisitor Visitor(TD, TLI, Context);
+  SizeOffsetType Const = Visitor.compute(V);
+  if (Visitor.bothKnown(Const))
+    return std::make_pair(ConstantInt::get(Context, Const.first),
+                          ConstantInt::get(Context, Const.second));
+
+  V = V->stripPointerCasts();
+
+  // check cache
+  CacheMapTy::iterator CacheIt = CacheMap.find(V);
+  if (CacheIt != CacheMap.end())
+    return CacheIt->second;
+
+  // always generate code immediately before the instruction being
+  // processed, so that the generated code dominates the same BBs
+  Instruction *PrevInsertPoint = Builder.GetInsertPoint();
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    Builder.SetInsertPoint(I);
+
+  // record the pointers that were handled in this run, so that they can be
+  // cleaned later if something fails
+  SeenVals.insert(V);
+
+  // now compute the size and offset
+  SizeOffsetEvalType Result;
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+    Result = visitGEPOperator(*GEP);
+  } else if (Instruction *I = dyn_cast<Instruction>(V)) {
+    Result = visit(*I);
+  } else if (isa<Argument>(V) ||
+             (isa<ConstantExpr>(V) &&
+              cast<ConstantExpr>(V)->getOpcode() == Instruction::IntToPtr) ||
+             isa<GlobalAlias>(V) ||
+             isa<GlobalVariable>(V)) {
+    // ignore values where we cannot do more than what ObjectSizeVisitor can
+    Result = unknown();
+  } else {
+    DEBUG(dbgs() << "ObjectSizeOffsetEvaluator::compute() unhandled value: "
+          << *V << '\n');
+    Result = unknown();
+  }
+
+  if (PrevInsertPoint)
+    Builder.SetInsertPoint(PrevInsertPoint);
+
+  // Don't reuse CacheIt since it may be invalid at this point.
+  CacheMap[V] = Result;
+  return Result;
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
+  if (!I.getAllocatedType()->isSized())
+    return unknown();
+
+  // must be a VLA
+  assert(I.isArrayAllocation());
+  Value *ArraySize = I.getArraySize();
+  Value *Size = ConstantInt::get(ArraySize->getType(),
+                                 TD->getTypeAllocSize(I.getAllocatedType()));
+  Size = Builder.CreateMul(Size, ArraySize);
+  return std::make_pair(Size, Zero);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallSite(CallSite CS) {
+  const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc,
+                                               TLI);
+  if (!FnData)
+    return unknown();
+
+  // handle strdup-like functions separately
+  if (FnData->AllocTy == StrDupLike) {
+    // TODO
+    return unknown();
+  }
+
+  Value *FirstArg = CS.getArgument(FnData->FstParam);
+  FirstArg = Builder.CreateZExt(FirstArg, IntTy);
+  if (FnData->SndParam < 0)
+    return std::make_pair(FirstArg, Zero);
+
+  Value *SecondArg = CS.getArgument(FnData->SndParam);
+  SecondArg = Builder.CreateZExt(SecondArg, IntTy);
+  Value *Size = Builder.CreateMul(FirstArg, SecondArg);
+  return std::make_pair(Size, Zero);
+
+  // TODO: handle more standard functions (+ wchar cousins):
+  // - strdup / strndup
+  // - strcpy / strncpy
+  // - strcat / strncat
+  // - memcpy / memmove
+  // - strcat / strncat
+  // - memset
+}
+
+SizeOffsetEvalType
+ObjectSizeOffsetEvaluator::visitExtractElementInst(ExtractElementInst&) {
+  return unknown();
+}
+
+SizeOffsetEvalType
+ObjectSizeOffsetEvaluator::visitExtractValueInst(ExtractValueInst&) {
+  return unknown();
+}
+
+SizeOffsetEvalType
+ObjectSizeOffsetEvaluator::visitGEPOperator(GEPOperator &GEP) {
+  SizeOffsetEvalType PtrData = compute_(GEP.getPointerOperand());
+  if (!bothKnown(PtrData))
+    return unknown();
+
+  Value *Offset = EmitGEPOffset(&Builder, *TD, &GEP, /*NoAssumptions=*/true);
+  Offset = Builder.CreateAdd(PtrData.second, Offset);
+  return std::make_pair(PtrData.first, Offset);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitIntToPtrInst(IntToPtrInst&) {
+  // clueless
+  return unknown();
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitLoadInst(LoadInst&) {
+  return unknown();
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitPHINode(PHINode &PHI) {
+  // create 2 PHIs: one for size and another for offset
+  PHINode *SizePHI   = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues());
+  PHINode *OffsetPHI = Builder.CreatePHI(IntTy, PHI.getNumIncomingValues());
+
+  // insert right away in the cache to handle recursive PHIs
+  CacheMap[&PHI] = std::make_pair(SizePHI, OffsetPHI);
+
+  // compute offset/size for each PHI incoming pointer
+  for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i) {
+    Builder.SetInsertPoint(PHI.getIncomingBlock(i)->getFirstInsertionPt());
+    SizeOffsetEvalType EdgeData = compute_(PHI.getIncomingValue(i));
+
+    if (!bothKnown(EdgeData)) {
+      OffsetPHI->replaceAllUsesWith(UndefValue::get(IntTy));
+      OffsetPHI->eraseFromParent();
+      SizePHI->replaceAllUsesWith(UndefValue::get(IntTy));
+      SizePHI->eraseFromParent();
+      return unknown();
+    }
+    SizePHI->addIncoming(EdgeData.first, PHI.getIncomingBlock(i));
+    OffsetPHI->addIncoming(EdgeData.second, PHI.getIncomingBlock(i));
+  }
+
+  Value *Size = SizePHI, *Offset = OffsetPHI, *Tmp;
+  if ((Tmp = SizePHI->hasConstantValue())) {
+    Size = Tmp;
+    SizePHI->replaceAllUsesWith(Size);
+    SizePHI->eraseFromParent();
+  }
+  if ((Tmp = OffsetPHI->hasConstantValue())) {
+    Offset = Tmp;
+    OffsetPHI->replaceAllUsesWith(Offset);
+    OffsetPHI->eraseFromParent();
+  }
+  return std::make_pair(Size, Offset);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitSelectInst(SelectInst &I) {
+  SizeOffsetEvalType TrueSide  = compute_(I.getTrueValue());
+  SizeOffsetEvalType FalseSide = compute_(I.getFalseValue());
+
+  if (!bothKnown(TrueSide) || !bothKnown(FalseSide))
+    return unknown();
+  if (TrueSide == FalseSide)
+    return TrueSide;
+
+  Value *Size = Builder.CreateSelect(I.getCondition(), TrueSide.first,
+                                     FalseSide.first);
+  Value *Offset = Builder.CreateSelect(I.getCondition(), TrueSide.second,
+                                       FalseSide.second);
+  return std::make_pair(Size, Offset);
+}
+
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitInstruction(Instruction &I) {
+  DEBUG(dbgs() << "ObjectSizeOffsetEvaluator unknown instruction:" << I <<'\n');
+  return unknown();
+}
diff --git a/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
new file mode 100644
index 000000000000..2240e9de33eb
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -0,0 +1,1525 @@
+//===- MemoryDependenceAnalysis.cpp - Mem Deps Implementation  --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an analysis that determines, for a given memory
+// operation, what preceding memory operations it depends on.  It builds on
+// alias analysis information, and tries to provide a lazy, caching interface to
+// a common kind of alias information query.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "memdep"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/PredIteratorCache.h"
+using namespace llvm;
+
+STATISTIC(NumCacheNonLocal, "Number of fully cached non-local responses");
+STATISTIC(NumCacheDirtyNonLocal, "Number of dirty cached non-local responses");
+STATISTIC(NumUncacheNonLocal, "Number of uncached non-local responses");
+
+STATISTIC(NumCacheNonLocalPtr,
+          "Number of fully cached non-local ptr responses");
+STATISTIC(NumCacheDirtyNonLocalPtr,
+          "Number of cached, but dirty, non-local ptr responses");
+STATISTIC(NumUncacheNonLocalPtr,
+          "Number of uncached non-local ptr responses");
+STATISTIC(NumCacheCompleteNonLocalPtr,
+          "Number of block queries that were completely cached");
+
+// Limit for the number of instructions to scan in a block.
+// FIXME: Figure out what a sane value is for this.
+//        (500 is relatively insane.)
+static const int BlockScanLimit = 500;
+
+char MemoryDependenceAnalysis::ID = 0;
+
+// Register this pass...
+INITIALIZE_PASS_BEGIN(MemoryDependenceAnalysis, "memdep",
+                "Memory Dependence Analysis", false, true)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(MemoryDependenceAnalysis, "memdep",
+                      "Memory Dependence Analysis", false, true)
+
+MemoryDependenceAnalysis::MemoryDependenceAnalysis()
+: FunctionPass(ID), PredCache(0) {
+  initializeMemoryDependenceAnalysisPass(*PassRegistry::getPassRegistry());
+}
+MemoryDependenceAnalysis::~MemoryDependenceAnalysis() {
+}
+
+/// Clean up memory in between runs
+void MemoryDependenceAnalysis::releaseMemory() {
+  LocalDeps.clear();
+  NonLocalDeps.clear();
+  NonLocalPointerDeps.clear();
+  ReverseLocalDeps.clear();
+  ReverseNonLocalDeps.clear();
+  ReverseNonLocalPtrDeps.clear();
+  PredCache->clear();
+}
+
+
+
+/// getAnalysisUsage - Does not modify anything.  It uses Alias Analysis.
+///
+void MemoryDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequiredTransitive<AliasAnalysis>();
+}
+
+bool MemoryDependenceAnalysis::runOnFunction(Function &) {
+  AA = &getAnalysis<AliasAnalysis>();
+  TD = getAnalysisIfAvailable<DataLayout>();
+  DT = getAnalysisIfAvailable<DominatorTree>();
+  if (PredCache == 0)
+    PredCache.reset(new PredIteratorCache());
+  return false;
+}
+
+/// RemoveFromReverseMap - This is a helper function that removes Val from
+/// 'Inst's set in ReverseMap.  If the set becomes empty, remove Inst's entry.
+template <typename KeyTy>
+static void RemoveFromReverseMap(DenseMap<Instruction*,
+                                 SmallPtrSet<KeyTy, 4> > &ReverseMap,
+                                 Instruction *Inst, KeyTy Val) {
+  typename DenseMap<Instruction*, SmallPtrSet<KeyTy, 4> >::iterator
+  InstIt = ReverseMap.find(Inst);
+  assert(InstIt != ReverseMap.end() && "Reverse map out of sync?");
+  bool Found = InstIt->second.erase(Val);
+  assert(Found && "Invalid reverse map!"); (void)Found;
+  if (InstIt->second.empty())
+    ReverseMap.erase(InstIt);
+}
+
+/// GetLocation - If the given instruction references a specific memory
+/// location, fill in Loc with the details, otherwise set Loc.Ptr to null.
+/// Return a ModRefInfo value describing the general behavior of the
+/// instruction.
+static
+AliasAnalysis::ModRefResult GetLocation(const Instruction *Inst,
+                                        AliasAnalysis::Location &Loc,
+                                        AliasAnalysis *AA) {
+  if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+    if (LI->isUnordered()) {
+      Loc = AA->getLocation(LI);
+      return AliasAnalysis::Ref;
+    }
+    if (LI->getOrdering() == Monotonic) {
+      Loc = AA->getLocation(LI);
+      return AliasAnalysis::ModRef;
+    }
+    Loc = AliasAnalysis::Location();
+    return AliasAnalysis::ModRef;
+  }
+
+  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    if (SI->isUnordered()) {
+      Loc = AA->getLocation(SI);
+      return AliasAnalysis::Mod;
+    }
+    if (SI->getOrdering() == Monotonic) {
+      Loc = AA->getLocation(SI);
+      return AliasAnalysis::ModRef;
+    }
+    Loc = AliasAnalysis::Location();
+    return AliasAnalysis::ModRef;
+  }
+
+  if (const VAArgInst *V = dyn_cast<VAArgInst>(Inst)) {
+    Loc = AA->getLocation(V);
+    return AliasAnalysis::ModRef;
+  }
+
+  if (const CallInst *CI = isFreeCall(Inst, AA->getTargetLibraryInfo())) {
+    // calls to free() deallocate the entire structure
+    Loc = AliasAnalysis::Location(CI->getArgOperand(0));
+    return AliasAnalysis::Mod;
+  }
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::invariant_start:
+      Loc = AliasAnalysis::Location(II->getArgOperand(1),
+                                    cast<ConstantInt>(II->getArgOperand(0))
+                                      ->getZExtValue(),
+                                    II->getMetadata(LLVMContext::MD_tbaa));
+      // These intrinsics don't really modify the memory, but returning Mod
+      // will allow them to be handled conservatively.
+      return AliasAnalysis::Mod;
+    case Intrinsic::invariant_end:
+      Loc = AliasAnalysis::Location(II->getArgOperand(2),
+                                    cast<ConstantInt>(II->getArgOperand(1))
+                                      ->getZExtValue(),
+                                    II->getMetadata(LLVMContext::MD_tbaa));
+      // These intrinsics don't really modify the memory, but returning Mod
+      // will allow them to be handled conservatively.
+      return AliasAnalysis::Mod;
+    default:
+      break;
+    }
+
+  // Otherwise, just do the coarse-grained thing that always works.
+  if (Inst->mayWriteToMemory())
+    return AliasAnalysis::ModRef;
+  if (Inst->mayReadFromMemory())
+    return AliasAnalysis::Ref;
+  return AliasAnalysis::NoModRef;
+}
+
+/// getCallSiteDependencyFrom - Private helper for finding the local
+/// dependencies of a call site.
+MemDepResult MemoryDependenceAnalysis::
+getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall,
+                          BasicBlock::iterator ScanIt, BasicBlock *BB) {
+  unsigned Limit = BlockScanLimit;
+
+  // Walk backwards through the block, looking for dependencies
+  while (ScanIt != BB->begin()) {
+    // Limit the amount of scanning we do so we don't end up with quadratic
+    // running time on extreme testcases.
+    --Limit;
+    if (!Limit)
+      return MemDepResult::getUnknown();
+
+    Instruction *Inst = --ScanIt;
+
+    // If this inst is a memory op, get the pointer it accessed
+    AliasAnalysis::Location Loc;
+    AliasAnalysis::ModRefResult MR = GetLocation(Inst, Loc, AA);
+    if (Loc.Ptr) {
+      // A simple instruction.
+      if (AA->getModRefInfo(CS, Loc) != AliasAnalysis::NoModRef)
+        return MemDepResult::getClobber(Inst);
+      continue;
+    }
+
+    if (CallSite InstCS = cast<Value>(Inst)) {
+      // Debug intrinsics don't cause dependences.
+      if (isa<DbgInfoIntrinsic>(Inst)) continue;
+      // If these two calls do not interfere, look past it.
+      switch (AA->getModRefInfo(CS, InstCS)) {
+      case AliasAnalysis::NoModRef:
+        // If the two calls are the same, return InstCS as a Def, so that
+        // CS can be found redundant and eliminated.
+        if (isReadOnlyCall && !(MR & AliasAnalysis::Mod) &&
+            CS.getInstruction()->isIdenticalToWhenDefined(Inst))
+          return MemDepResult::getDef(Inst);
+
+        // Otherwise if the two calls don't interact (e.g. InstCS is readnone)
+        // keep scanning.
+        continue;
+      default:
+        return MemDepResult::getClobber(Inst);
+      }
+    }
+
+    // If we could not obtain a pointer for the instruction and the instruction
+    // touches memory then assume that this is a dependency.
+    if (MR != AliasAnalysis::NoModRef)
+      return MemDepResult::getClobber(Inst);
+  }
+
+  // No dependence found.  If this is the entry block of the function, it is
+  // unknown, otherwise it is non-local.
+  if (BB != &BB->getParent()->getEntryBlock())
+    return MemDepResult::getNonLocal();
+  return MemDepResult::getNonFuncLocal();
+}
+
+/// isLoadLoadClobberIfExtendedToFullWidth - Return true if LI is a load that
+/// would fully overlap MemLoc if done as a wider legal integer load.
+///
+/// MemLocBase, MemLocOffset are lazily computed here the first time the
+/// base/offs of memloc is needed.
+static bool
+isLoadLoadClobberIfExtendedToFullWidth(const AliasAnalysis::Location &MemLoc,
+                                       const Value *&MemLocBase,
+                                       int64_t &MemLocOffs,
+                                       const LoadInst *LI,
+                                       const DataLayout *TD) {
+  // If we have no target data, we can't do this.
+  if (TD == 0) return false;
+
+  // If we haven't already computed the base/offset of MemLoc, do so now.
+  if (MemLocBase == 0)
+    MemLocBase = GetPointerBaseWithConstantOffset(MemLoc.Ptr, MemLocOffs, TD);
+
+  unsigned Size = MemoryDependenceAnalysis::
+    getLoadLoadClobberFullWidthSize(MemLocBase, MemLocOffs, MemLoc.Size,
+                                    LI, *TD);
+  return Size != 0;
+}
+
+/// getLoadLoadClobberFullWidthSize - This is a little bit of analysis that
+/// looks at a memory location for a load (specified by MemLocBase, Offs,
+/// and Size) and compares it against a load.  If the specified load could
+/// be safely widened to a larger integer load that is 1) still efficient,
+/// 2) safe for the target, and 3) would provide the specified memory
+/// location value, then this function returns the size in bytes of the
+/// load width to use.  If not, this returns zero.
+unsigned MemoryDependenceAnalysis::
+getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs,
+                                unsigned MemLocSize, const LoadInst *LI,
+                                const DataLayout &TD) {
+  // We can only extend simple integer loads.
+  if (!isa<IntegerType>(LI->getType()) || !LI->isSimple()) return 0;
+
+  // Load widening is hostile to ThreadSanitizer: it may cause false positives
+  // or make the reports more cryptic (access sizes are wrong).
+  if (LI->getParent()->getParent()->getAttributes().
+      hasAttribute(AttributeSet::FunctionIndex, Attribute::SanitizeThread))
+    return 0;
+
+  // Get the base of this load.
+  int64_t LIOffs = 0;
+  const Value *LIBase =
+    GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, &TD);
+
+  // If the two pointers are not based on the same pointer, we can't tell that
+  // they are related.
+  if (LIBase != MemLocBase) return 0;
+
+  // Okay, the two values are based on the same pointer, but returned as
+  // no-alias.  This happens when we have things like two byte loads at "P+1"
+  // and "P+3".  Check to see if increasing the size of the "LI" load up to its
+  // alignment (or the largest native integer type) will allow us to load all
+  // the bits required by MemLoc.
+
+  // If MemLoc is before LI, then no widening of LI will help us out.
+  if (MemLocOffs < LIOffs) return 0;
+
+  // Get the alignment of the load in bytes.  We assume that it is safe to load
+  // any legal integer up to this size without a problem.  For example, if we're
+  // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can
+  // widen it up to an i32 load.  If it is known 2-byte aligned, we can widen it
+  // to i16.
+  unsigned LoadAlign = LI->getAlignment();
+
+  int64_t MemLocEnd = MemLocOffs+MemLocSize;
+
+  // If no amount of rounding up will let MemLoc fit into LI, then bail out.
+  if (LIOffs+LoadAlign < MemLocEnd) return 0;
+
+  // This is the size of the load to try.  Start with the next larger power of
+  // two.
+  unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits()/8U;
+  NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
+
+  while (1) {
+    // If this load size is bigger than our known alignment or would not fit
+    // into a native integer register, then we fail.
+    if (NewLoadByteSize > LoadAlign ||
+        !TD.fitsInLegalInteger(NewLoadByteSize*8))
+      return 0;
+
+    if (LIOffs+NewLoadByteSize > MemLocEnd &&
+        LI->getParent()->getParent()->getAttributes().
+          hasAttribute(AttributeSet::FunctionIndex, Attribute::SanitizeAddress))
+      // We will be reading past the location accessed by the original program.
+      // While this is safe in a regular build, Address Safety analysis tools
+      // may start reporting false warnings. So, don't do widening.
+      return 0;
+
+    // If a load of this width would include all of MemLoc, then we succeed.
+    if (LIOffs+NewLoadByteSize >= MemLocEnd)
+      return NewLoadByteSize;
+
+    NewLoadByteSize <<= 1;
+  }
+}
+
+/// getPointerDependencyFrom - Return the instruction on which a memory
+/// location depends.  If isLoad is true, this routine ignores may-aliases with
+/// read-only operations.  If isLoad is false, this routine ignores may-aliases
+/// with reads from read-only locations.  If possible, pass the query
+/// instruction as well; this function may take advantage of the metadata
+/// annotated to the query instruction to refine the result.
+MemDepResult MemoryDependenceAnalysis::
+getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
+                         BasicBlock::iterator ScanIt, BasicBlock *BB,
+                         Instruction *QueryInst) {
+
+  const Value *MemLocBase = 0;
+  int64_t MemLocOffset = 0;
+  unsigned Limit = BlockScanLimit;
+  bool isInvariantLoad = false;
+  if (isLoad && QueryInst) {
+    LoadInst *LI = dyn_cast<LoadInst>(QueryInst);
+    if (LI && LI->getMetadata(LLVMContext::MD_invariant_load) != 0)
+      isInvariantLoad = true;
+  }
+
+  // Walk backwards through the basic block, looking for dependencies.
+  while (ScanIt != BB->begin()) {
+    // Limit the amount of scanning we do so we don't end up with quadratic
+    // running time on extreme testcases.
+    --Limit;
+    if (!Limit)
+      return MemDepResult::getUnknown();
+
+    Instruction *Inst = --ScanIt;
+
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+      // Debug intrinsics don't (and can't) cause dependences.
+      if (isa<DbgInfoIntrinsic>(II)) continue;
+
+      // If we reach a lifetime begin or end marker, then the query ends here
+      // because the value is undefined.
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
+        // FIXME: This only considers queries directly on the invariant-tagged
+        // pointer, not on query pointers that are indexed off of them.  It'd
+        // be nice to handle that at some point (the right approach is to use
+        // GetPointerBaseWithConstantOffset).
+        if (AA->isMustAlias(AliasAnalysis::Location(II->getArgOperand(1)),
+                            MemLoc))
+          return MemDepResult::getDef(II);
+        continue;
+      }
+    }
+
+    // Values depend on loads if the pointers are must aliased.  This means that
+    // a load depends on another must aliased load from the same value.
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+      // Atomic loads have complications involved.
+      // FIXME: This is overly conservative.
+      if (!LI->isUnordered())
+        return MemDepResult::getClobber(LI);
+
+      AliasAnalysis::Location LoadLoc = AA->getLocation(LI);
+
+      // If we found a pointer, check if it could be the same as our pointer.
+      AliasAnalysis::AliasResult R = AA->alias(LoadLoc, MemLoc);
+
+      if (isLoad) {
+        if (R == AliasAnalysis::NoAlias) {
+          // If this is an over-aligned integer load (for example,
+          // "load i8* %P, align 4") see if it would obviously overlap with the
+          // queried location if widened to a larger load (e.g. if the queried
+          // location is 1 byte at P+1).  If so, return it as a load/load
+          // clobber result, allowing the client to decide to widen the load if
+          // it wants to.
+          if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType()))
+            if (LI->getAlignment()*8 > ITy->getPrimitiveSizeInBits() &&
+                isLoadLoadClobberIfExtendedToFullWidth(MemLoc, MemLocBase,
+                                                       MemLocOffset, LI, TD))
+              return MemDepResult::getClobber(Inst);
+
+          continue;
+        }
+
+        // Must aliased loads are defs of each other.
+        if (R == AliasAnalysis::MustAlias)
+          return MemDepResult::getDef(Inst);
+
+#if 0 // FIXME: Temporarily disabled. GVN is cleverly rewriting loads
+      // in terms of clobbering loads, but since it does this by looking
+      // at the clobbering load directly, it doesn't know about any
+      // phi translation that may have happened along the way.
+
+        // If we have a partial alias, then return this as a clobber for the
+        // client to handle.
+        if (R == AliasAnalysis::PartialAlias)
+          return MemDepResult::getClobber(Inst);
+#endif
+
+        // Random may-alias loads don't depend on each other without a
+        // dependence.
+        continue;
+      }
+
+      // Stores don't depend on other no-aliased accesses.
+      if (R == AliasAnalysis::NoAlias)
+        continue;
+
+      // Stores don't alias loads from read-only memory.
+      if (AA->pointsToConstantMemory(LoadLoc))
+        continue;
+
+      // Stores depend on may/must aliased loads.
+      return MemDepResult::getDef(Inst);
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      // Atomic stores have complications involved.
+      // FIXME: This is overly conservative.
+      if (!SI->isUnordered())
+        return MemDepResult::getClobber(SI);
+
+      // If alias analysis can tell that this store is guaranteed to not modify
+      // the query pointer, ignore it.  Use getModRefInfo to handle cases where
+      // the query pointer points to constant memory etc.
+      if (AA->getModRefInfo(SI, MemLoc) == AliasAnalysis::NoModRef)
+        continue;
+
+      // Ok, this store might clobber the query pointer.  Check to see if it is
+      // a must alias: in this case, we want to return this as a def.
+      AliasAnalysis::Location StoreLoc = AA->getLocation(SI);
+
+      // If we found a pointer, check if it could be the same as our pointer.
+      AliasAnalysis::AliasResult R = AA->alias(StoreLoc, MemLoc);
+
+      if (R == AliasAnalysis::NoAlias)
+        continue;
+      if (R == AliasAnalysis::MustAlias)
+        return MemDepResult::getDef(Inst);
+      if (isInvariantLoad)
+       continue;
+      return MemDepResult::getClobber(Inst);
+    }
+
+    // If this is an allocation, and if we know that the accessed pointer is to
+    // the allocation, return Def.  This means that there is no dependence and
+    // the access can be optimized based on that.  For example, a load could
+    // turn into undef.
+    // Note: Only determine this to be a malloc if Inst is the malloc call, not
+    // a subsequent bitcast of the malloc call result.  There can be stores to
+    // the malloced memory between the malloc call and its bitcast uses, and we
+    // need to continue scanning until the malloc call.
+    const TargetLibraryInfo *TLI = AA->getTargetLibraryInfo();
+    if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst, TLI)) {
+      const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, TD);
+
+      if (AccessPtr == Inst || AA->isMustAlias(Inst, AccessPtr))
+        return MemDepResult::getDef(Inst);
+      // Be conservative if the accessed pointer may alias the allocation.
+      if (AA->alias(Inst, AccessPtr) != AliasAnalysis::NoAlias)
+        return MemDepResult::getClobber(Inst);
+      // If the allocation is not aliased and does not read memory (like
+      // strdup), it is safe to ignore.
+      if (isa<AllocaInst>(Inst) ||
+          isMallocLikeFn(Inst, TLI) || isCallocLikeFn(Inst, TLI))
+        continue;
+    }
+
+    // See if this instruction (e.g. a call or vaarg) mod/ref's the pointer.
+    AliasAnalysis::ModRefResult MR = AA->getModRefInfo(Inst, MemLoc);
+    // If necessary, perform additional analysis.
+    if (MR == AliasAnalysis::ModRef)
+      MR = AA->callCapturesBefore(Inst, MemLoc, DT);
+    switch (MR) {
+    case AliasAnalysis::NoModRef:
+      // If the call has no effect on the queried pointer, just ignore it.
+      continue;
+    case AliasAnalysis::Mod:
+      return MemDepResult::getClobber(Inst);
+    case AliasAnalysis::Ref:
+      // If the call is known to never store to the pointer, and if this is a
+      // load query, we can safely ignore it (scan past it).
+      if (isLoad)
+        continue;
+    default:
+      // Otherwise, there is a potential dependence.  Return a clobber.
+      return MemDepResult::getClobber(Inst);
+    }
+  }
+
+  // No dependence found.  If this is the entry block of the function, it is
+  // unknown, otherwise it is non-local.
+  if (BB != &BB->getParent()->getEntryBlock())
+    return MemDepResult::getNonLocal();
+  return MemDepResult::getNonFuncLocal();
+}
+
+/// getDependency - Return the instruction on which a memory operation
+/// depends.
+MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
+  Instruction *ScanPos = QueryInst;
+
+  // Check for a cached result
+  MemDepResult &LocalCache = LocalDeps[QueryInst];
+
+  // If the cached entry is non-dirty, just return it.  Note that this depends
+  // on MemDepResult's default constructing to 'dirty'.
+  if (!LocalCache.isDirty())
+    return LocalCache;
+
+  // Otherwise, if we have a dirty entry, we know we can start the scan at that
+  // instruction, which may save us some work.
+  if (Instruction *Inst = LocalCache.getInst()) {
+    ScanPos = Inst;
+
+    RemoveFromReverseMap(ReverseLocalDeps, Inst, QueryInst);
+  }
+
+  BasicBlock *QueryParent = QueryInst->getParent();
+
+  // Do the scan.
+  if (BasicBlock::iterator(QueryInst) == QueryParent->begin()) {
+    // No dependence found.  If this is the entry block of the function, it is
+    // unknown, otherwise it is non-local.
+    if (QueryParent != &QueryParent->getParent()->getEntryBlock())
+      LocalCache = MemDepResult::getNonLocal();
+    else
+      LocalCache = MemDepResult::getNonFuncLocal();
+  } else {
+    AliasAnalysis::Location MemLoc;
+    AliasAnalysis::ModRefResult MR = GetLocation(QueryInst, MemLoc, AA);
+    if (MemLoc.Ptr) {
+      // If we can do a pointer scan, make it happen.
+      bool isLoad = !(MR & AliasAnalysis::Mod);
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(QueryInst))
+        isLoad |= II->getIntrinsicID() == Intrinsic::lifetime_start;
+
+      LocalCache = getPointerDependencyFrom(MemLoc, isLoad, ScanPos,
+                                            QueryParent, QueryInst);
+    } else if (isa<CallInst>(QueryInst) || isa<InvokeInst>(QueryInst)) {
+      CallSite QueryCS(QueryInst);
+      bool isReadOnly = AA->onlyReadsMemory(QueryCS);
+      LocalCache = getCallSiteDependencyFrom(QueryCS, isReadOnly, ScanPos,
+                                             QueryParent);
+    } else
+      // Non-memory instruction.
+      LocalCache = MemDepResult::getUnknown();
+  }
+
+  // Remember the result!
+  if (Instruction *I = LocalCache.getInst())
+    ReverseLocalDeps[I].insert(QueryInst);
+
+  return LocalCache;
+}
+
+#ifndef NDEBUG
+/// AssertSorted - This method is used when -debug is specified to verify that
+/// cache arrays are properly kept sorted.
+static void AssertSorted(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
+                         int Count = -1) {
+  if (Count == -1) Count = Cache.size();
+  if (Count == 0) return;
+
+  for (unsigned i = 1; i != unsigned(Count); ++i)
+    assert(!(Cache[i] < Cache[i-1]) && "Cache isn't sorted!");
+}
+#endif
+
+/// getNonLocalCallDependency - Perform a full dependency query for the
+/// specified call, returning the set of blocks that the value is
+/// potentially live across.  The returned set of results will include a
+/// "NonLocal" result for all blocks where the value is live across.
+///
+/// This method assumes the instruction returns a "NonLocal" dependency
+/// within its own block.
+///
+/// This returns a reference to an internal data structure that may be
+/// invalidated on the next non-local query or when an instruction is
+/// removed.  Clients must copy this data if they want it around longer than
+/// that.
+const MemoryDependenceAnalysis::NonLocalDepInfo &
+MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
+  assert(getDependency(QueryCS.getInstruction()).isNonLocal() &&
+ "getNonLocalCallDependency should only be used on calls with non-local deps!");
+  PerInstNLInfo &CacheP = NonLocalDeps[QueryCS.getInstruction()];
+  NonLocalDepInfo &Cache = CacheP.first;
+
+  /// DirtyBlocks - This is the set of blocks that need to be recomputed.  In
+  /// the cached case, this can happen due to instructions being deleted etc. In
+  /// the uncached case, this starts out as the set of predecessors we care
+  /// about.
+  SmallVector<BasicBlock*, 32> DirtyBlocks;
+
+  if (!Cache.empty()) {
+    // Okay, we have a cache entry.  If we know it is not dirty, just return it
+    // with no computation.
+    if (!CacheP.second) {
+      ++NumCacheNonLocal;
+      return Cache;
+    }
+
+    // If we already have a partially computed set of results, scan them to
+    // determine what is dirty, seeding our initial DirtyBlocks worklist.
+    for (NonLocalDepInfo::iterator I = Cache.begin(), E = Cache.end();
+       I != E; ++I)
+      if (I->getResult().isDirty())
+        DirtyBlocks.push_back(I->getBB());
+
+    // Sort the cache so that we can do fast binary search lookups below.
+    std::sort(Cache.begin(), Cache.end());
+
+    ++NumCacheDirtyNonLocal;
+    //cerr << "CACHED CASE: " << DirtyBlocks.size() << " dirty: "
+    //     << Cache.size() << " cached: " << *QueryInst;
+  } else {
+    // Seed DirtyBlocks with each of the preds of QueryInst's block.
+    BasicBlock *QueryBB = QueryCS.getInstruction()->getParent();
+    for (BasicBlock **PI = PredCache->GetPreds(QueryBB); *PI; ++PI)
+      DirtyBlocks.push_back(*PI);
+    ++NumUncacheNonLocal;
+  }
+
+  // isReadonlyCall - If this is a read-only call, we can be more aggressive.
+  bool isReadonlyCall = AA->onlyReadsMemory(QueryCS);
+
+  SmallPtrSet<BasicBlock*, 64> Visited;
+
+  unsigned NumSortedEntries = Cache.size();
+  DEBUG(AssertSorted(Cache));
+
+  // Iterate while we still have blocks to update.
+  while (!DirtyBlocks.empty()) {
+    BasicBlock *DirtyBB = DirtyBlocks.back();
+    DirtyBlocks.pop_back();
+
+    // Already processed this block?
+    if (!Visited.insert(DirtyBB))
+      continue;
+
+    // Do a binary search to see if we already have an entry for this block in
+    // the cache set.  If so, find it.
+    DEBUG(AssertSorted(Cache, NumSortedEntries));
+    NonLocalDepInfo::iterator Entry =
+      std::upper_bound(Cache.begin(), Cache.begin()+NumSortedEntries,
+                       NonLocalDepEntry(DirtyBB));
+    if (Entry != Cache.begin() && prior(Entry)->getBB() == DirtyBB)
+      --Entry;
+
+    NonLocalDepEntry *ExistingResult = 0;
+    if (Entry != Cache.begin()+NumSortedEntries &&
+        Entry->getBB() == DirtyBB) {
+      // If we already have an entry, and if it isn't already dirty, the block
+      // is done.
+      if (!Entry->getResult().isDirty())
+        continue;
+
+      // Otherwise, remember this slot so we can update the value.
+      ExistingResult = &*Entry;
+    }
+
+    // If the dirty entry has a pointer, start scanning from it so we don't have
+    // to rescan the entire block.
+    BasicBlock::iterator ScanPos = DirtyBB->end();
+    if (ExistingResult) {
+      if (Instruction *Inst = ExistingResult->getResult().getInst()) {
+        ScanPos = Inst;
+        // We're removing QueryInst's use of Inst.
+        RemoveFromReverseMap(ReverseNonLocalDeps, Inst,
+                             QueryCS.getInstruction());
+      }
+    }
+
+    // Find out if this block has a local dependency for QueryInst.
+    MemDepResult Dep;
+
+    if (ScanPos != DirtyBB->begin()) {
+      Dep = getCallSiteDependencyFrom(QueryCS, isReadonlyCall,ScanPos, DirtyBB);
+    } else if (DirtyBB != &DirtyBB->getParent()->getEntryBlock()) {
+      // No dependence found.  If this is the entry block of the function, it is
+      // a clobber, otherwise it is unknown.
+      Dep = MemDepResult::getNonLocal();
+    } else {
+      Dep = MemDepResult::getNonFuncLocal();
+    }
+
+    // If we had a dirty entry for the block, update it.  Otherwise, just add
+    // a new entry.
+    if (ExistingResult)
+      ExistingResult->setResult(Dep);
+    else
+      Cache.push_back(NonLocalDepEntry(DirtyBB, Dep));
+
+    // If the block has a dependency (i.e. it isn't completely transparent to
+    // the value), remember the association!
+    if (!Dep.isNonLocal()) {
+      // Keep the ReverseNonLocalDeps map up to date so we can efficiently
+      // update this when we remove instructions.
+      if (Instruction *Inst = Dep.getInst())
+        ReverseNonLocalDeps[Inst].insert(QueryCS.getInstruction());
+    } else {
+
+      // If the block *is* completely transparent to the load, we need to check
+      // the predecessors of this block.  Add them to our worklist.
+      for (BasicBlock **PI = PredCache->GetPreds(DirtyBB); *PI; ++PI)
+        DirtyBlocks.push_back(*PI);
+    }
+  }
+
+  return Cache;
+}
+
+/// getNonLocalPointerDependency - Perform a full dependency query for an
+/// access to the specified (non-volatile) memory location, returning the
+/// set of instructions that either define or clobber the value.
+///
+/// This method assumes the pointer has a "NonLocal" dependency within its
+/// own block.
+///
+void MemoryDependenceAnalysis::
+getNonLocalPointerDependency(const AliasAnalysis::Location &Loc, bool isLoad,
+                             BasicBlock *FromBB,
+                             SmallVectorImpl<NonLocalDepResult> &Result) {
+  assert(Loc.Ptr->getType()->isPointerTy() &&
+         "Can't get pointer deps of a non-pointer!");
+  Result.clear();
+
+  PHITransAddr Address(const_cast<Value *>(Loc.Ptr), TD);
+
+  // This is the set of blocks we've inspected, and the pointer we consider in
+  // each block.  Because of critical edges, we currently bail out if querying
+  // a block with multiple different pointers.  This can happen during PHI
+  // translation.
+  DenseMap<BasicBlock*, Value*> Visited;
+  if (!getNonLocalPointerDepFromBB(Address, Loc, isLoad, FromBB,
+                                   Result, Visited, true))
+    return;
+  Result.clear();
+  Result.push_back(NonLocalDepResult(FromBB,
+                                     MemDepResult::getUnknown(),
+                                     const_cast<Value *>(Loc.Ptr)));
+}
+
+/// GetNonLocalInfoForBlock - Compute the memdep value for BB with
+/// Pointer/PointeeSize using either cached information in Cache or by doing a
+/// lookup (which may use dirty cache info if available).  If we do a lookup,
+/// add the result to the cache.
+MemDepResult MemoryDependenceAnalysis::
+GetNonLocalInfoForBlock(const AliasAnalysis::Location &Loc,
+                        bool isLoad, BasicBlock *BB,
+                        NonLocalDepInfo *Cache, unsigned NumSortedEntries) {
+
+  // Do a binary search to see if we already have an entry for this block in
+  // the cache set.  If so, find it.
+  NonLocalDepInfo::iterator Entry =
+    std::upper_bound(Cache->begin(), Cache->begin()+NumSortedEntries,
+                     NonLocalDepEntry(BB));
+  if (Entry != Cache->begin() && (Entry-1)->getBB() == BB)
+    --Entry;
+
+  NonLocalDepEntry *ExistingResult = 0;
+  if (Entry != Cache->begin()+NumSortedEntries && Entry->getBB() == BB)
+    ExistingResult = &*Entry;
+
+  // If we have a cached entry, and it is non-dirty, use it as the value for
+  // this dependency.
+  if (ExistingResult && !ExistingResult->getResult().isDirty()) {
+    ++NumCacheNonLocalPtr;
+    return ExistingResult->getResult();
+  }
+
+  // Otherwise, we have to scan for the value.  If we have a dirty cache
+  // entry, start scanning from its position, otherwise we scan from the end
+  // of the block.
+  BasicBlock::iterator ScanPos = BB->end();
+  if (ExistingResult && ExistingResult->getResult().getInst()) {
+    assert(ExistingResult->getResult().getInst()->getParent() == BB &&
+           "Instruction invalidated?");
+    ++NumCacheDirtyNonLocalPtr;
+    ScanPos = ExistingResult->getResult().getInst();
+
+    // Eliminating the dirty entry from 'Cache', so update the reverse info.
+    ValueIsLoadPair CacheKey(Loc.Ptr, isLoad);
+    RemoveFromReverseMap(ReverseNonLocalPtrDeps, ScanPos, CacheKey);
+  } else {
+    ++NumUncacheNonLocalPtr;
+  }
+
+  // Scan the block for the dependency.
+  MemDepResult Dep = getPointerDependencyFrom(Loc, isLoad, ScanPos, BB);
+
+  // If we had a dirty entry for the block, update it.  Otherwise, just add
+  // a new entry.
+  if (ExistingResult)
+    ExistingResult->setResult(Dep);
+  else
+    Cache->push_back(NonLocalDepEntry(BB, Dep));
+
+  // If the block has a dependency (i.e. it isn't completely transparent to
+  // the value), remember the reverse association because we just added it
+  // to Cache!
+  if (!Dep.isDef() && !Dep.isClobber())
+    return Dep;
+
+  // Keep the ReverseNonLocalPtrDeps map up to date so we can efficiently
+  // update MemDep when we remove instructions.
+  Instruction *Inst = Dep.getInst();
+  assert(Inst && "Didn't depend on anything?");
+  ValueIsLoadPair CacheKey(Loc.Ptr, isLoad);
+  ReverseNonLocalPtrDeps[Inst].insert(CacheKey);
+  return Dep;
+}
+
+/// SortNonLocalDepInfoCache - Sort the a NonLocalDepInfo cache, given a certain
+/// number of elements in the array that are already properly ordered.  This is
+/// optimized for the case when only a few entries are added.
+static void
+SortNonLocalDepInfoCache(MemoryDependenceAnalysis::NonLocalDepInfo &Cache,
+                         unsigned NumSortedEntries) {
+  switch (Cache.size() - NumSortedEntries) {
+  case 0:
+    // done, no new entries.
+    break;
+  case 2: {
+    // Two new entries, insert the last one into place.
+    NonLocalDepEntry Val = Cache.back();
+    Cache.pop_back();
+    MemoryDependenceAnalysis::NonLocalDepInfo::iterator Entry =
+      std::upper_bound(Cache.begin(), Cache.end()-1, Val);
+    Cache.insert(Entry, Val);
+    // FALL THROUGH.
+  }
+  case 1:
+    // One new entry, Just insert the new value at the appropriate position.
+    if (Cache.size() != 1) {
+      NonLocalDepEntry Val = Cache.back();
+      Cache.pop_back();
+      MemoryDependenceAnalysis::NonLocalDepInfo::iterator Entry =
+        std::upper_bound(Cache.begin(), Cache.end(), Val);
+      Cache.insert(Entry, Val);
+    }
+    break;
+  default:
+    // Added many values, do a full scale sort.
+    std::sort(Cache.begin(), Cache.end());
+    break;
+  }
+}
+
+/// getNonLocalPointerDepFromBB - Perform a dependency query based on
+/// pointer/pointeesize starting at the end of StartBB.  Add any clobber/def
+/// results to the results vector and keep track of which blocks are visited in
+/// 'Visited'.
+///
+/// This has special behavior for the first block queries (when SkipFirstBlock
+/// is true).  In this special case, it ignores the contents of the specified
+/// block and starts returning dependence info for its predecessors.
+///
+/// This function returns false on success, or true to indicate that it could
+/// not compute dependence information for some reason.  This should be treated
+/// as a clobber dependence on the first instruction in the predecessor block.
+bool MemoryDependenceAnalysis::
+getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
+                            const AliasAnalysis::Location &Loc,
+                            bool isLoad, BasicBlock *StartBB,
+                            SmallVectorImpl<NonLocalDepResult> &Result,
+                            DenseMap<BasicBlock*, Value*> &Visited,
+                            bool SkipFirstBlock) {
+
+  // Look up the cached info for Pointer.
+  ValueIsLoadPair CacheKey(Pointer.getAddr(), isLoad);
+
+  // Set up a temporary NLPI value. If the map doesn't yet have an entry for
+  // CacheKey, this value will be inserted as the associated value. Otherwise,
+  // it'll be ignored, and we'll have to check to see if the cached size and
+  // tbaa tag are consistent with the current query.
+  NonLocalPointerInfo InitialNLPI;
+  InitialNLPI.Size = Loc.Size;
+  InitialNLPI.TBAATag = Loc.TBAATag;
+
+  // Get the NLPI for CacheKey, inserting one into the map if it doesn't
+  // already have one.
+  std::pair<CachedNonLocalPointerInfo::iterator, bool> Pair =
+    NonLocalPointerDeps.insert(std::make_pair(CacheKey, InitialNLPI));
+  NonLocalPointerInfo *CacheInfo = &Pair.first->second;
+
+  // If we already have a cache entry for this CacheKey, we may need to do some
+  // work to reconcile the cache entry and the current query.
+  if (!Pair.second) {
+    if (CacheInfo->Size < Loc.Size) {
+      // The query's Size is greater than the cached one. Throw out the
+      // cached data and proceed with the query at the greater size.
+      CacheInfo->Pair = BBSkipFirstBlockPair();
+      CacheInfo->Size = Loc.Size;
+      for (NonLocalDepInfo::iterator DI = CacheInfo->NonLocalDeps.begin(),
+           DE = CacheInfo->NonLocalDeps.end(); DI != DE; ++DI)
+        if (Instruction *Inst = DI->getResult().getInst())
+          RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
+      CacheInfo->NonLocalDeps.clear();
+    } else if (CacheInfo->Size > Loc.Size) {
+      // This query's Size is less than the cached one. Conservatively restart
+      // the query using the greater size.
+      return getNonLocalPointerDepFromBB(Pointer,
+                                         Loc.getWithNewSize(CacheInfo->Size),
+                                         isLoad, StartBB, Result, Visited,
+                                         SkipFirstBlock);
+    }
+
+    // If the query's TBAATag is inconsistent with the cached one,
+    // conservatively throw out the cached data and restart the query with
+    // no tag if needed.
+    if (CacheInfo->TBAATag != Loc.TBAATag) {
+      if (CacheInfo->TBAATag) {
+        CacheInfo->Pair = BBSkipFirstBlockPair();
+        CacheInfo->TBAATag = 0;
+        for (NonLocalDepInfo::iterator DI = CacheInfo->NonLocalDeps.begin(),
+             DE = CacheInfo->NonLocalDeps.end(); DI != DE; ++DI)
+          if (Instruction *Inst = DI->getResult().getInst())
+            RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
+        CacheInfo->NonLocalDeps.clear();
+      }
+      if (Loc.TBAATag)
+        return getNonLocalPointerDepFromBB(Pointer, Loc.getWithoutTBAATag(),
+                                           isLoad, StartBB, Result, Visited,
+                                           SkipFirstBlock);
+    }
+  }
+
+  NonLocalDepInfo *Cache = &CacheInfo->NonLocalDeps;
+
+  // If we have valid cached information for exactly the block we are
+  // investigating, just return it with no recomputation.
+  if (CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
+    // We have a fully cached result for this query then we can just return the
+    // cached results and populate the visited set.  However, we have to verify
+    // that we don't already have conflicting results for these blocks.  Check
+    // to ensure that if a block in the results set is in the visited set that
+    // it was for the same pointer query.
+    if (!Visited.empty()) {
+      for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
+           I != E; ++I) {
+        DenseMap<BasicBlock*, Value*>::iterator VI = Visited.find(I->getBB());
+        if (VI == Visited.end() || VI->second == Pointer.getAddr())
+          continue;
+
+        // We have a pointer mismatch in a block.  Just return clobber, saying
+        // that something was clobbered in this result.  We could also do a
+        // non-fully cached query, but there is little point in doing this.
+        return true;
+      }
+    }
+
+    Value *Addr = Pointer.getAddr();
+    for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
+         I != E; ++I) {
+      Visited.insert(std::make_pair(I->getBB(), Addr));
+      if (!I->getResult().isNonLocal() && DT->isReachableFromEntry(I->getBB()))
+        Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(), Addr));
+    }
+    ++NumCacheCompleteNonLocalPtr;
+    return false;
+  }
+
+  // Otherwise, either this is a new block, a block with an invalid cache
+  // pointer or one that we're about to invalidate by putting more info into it
+  // than its valid cache info.  If empty, the result will be valid cache info,
+  // otherwise it isn't.
+  if (Cache->empty())
+    CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
+  else
+    CacheInfo->Pair = BBSkipFirstBlockPair();
+
+  SmallVector<BasicBlock*, 32> Worklist;
+  Worklist.push_back(StartBB);
+
+  // PredList used inside loop.
+  SmallVector<std::pair<BasicBlock*, PHITransAddr>, 16> PredList;
+
+  // Keep track of the entries that we know are sorted.  Previously cached
+  // entries will all be sorted.  The entries we add we only sort on demand (we
+  // don't insert every element into its sorted position).  We know that we
+  // won't get any reuse from currently inserted values, because we don't
+  // revisit blocks after we insert info for them.
+  unsigned NumSortedEntries = Cache->size();
+  DEBUG(AssertSorted(*Cache));
+
+  while (!Worklist.empty()) {
+    BasicBlock *BB = Worklist.pop_back_val();
+
+    // Skip the first block if we have it.
+    if (!SkipFirstBlock) {
+      // Analyze the dependency of *Pointer in FromBB.  See if we already have
+      // been here.
+      assert(Visited.count(BB) && "Should check 'visited' before adding to WL");
+
+      // Get the dependency info for Pointer in BB.  If we have cached
+      // information, we will use it, otherwise we compute it.
+      DEBUG(AssertSorted(*Cache, NumSortedEntries));
+      MemDepResult Dep = GetNonLocalInfoForBlock(Loc, isLoad, BB, Cache,
+                                                 NumSortedEntries);
+
+      // If we got a Def or Clobber, add this to the list of results.
+      if (!Dep.isNonLocal() && DT->isReachableFromEntry(BB)) {
+        Result.push_back(NonLocalDepResult(BB, Dep, Pointer.getAddr()));
+        continue;
+      }
+    }
+
+    // If 'Pointer' is an instruction defined in this block, then we need to do
+    // phi translation to change it into a value live in the predecessor block.
+    // If not, we just add the predecessors to the worklist and scan them with
+    // the same Pointer.
+    if (!Pointer.NeedsPHITranslationFromBlock(BB)) {
+      SkipFirstBlock = false;
+      SmallVector<BasicBlock*, 16> NewBlocks;
+      for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
+        // Verify that we haven't looked at this block yet.
+        std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
+          InsertRes = Visited.insert(std::make_pair(*PI, Pointer.getAddr()));
+        if (InsertRes.second) {
+          // First time we've looked at *PI.
+          NewBlocks.push_back(*PI);
+          continue;
+        }
+
+        // If we have seen this block before, but it was with a different
+        // pointer then we have a phi translation failure and we have to treat
+        // this as a clobber.
+        if (InsertRes.first->second != Pointer.getAddr()) {
+          // Make sure to clean up the Visited map before continuing on to
+          // PredTranslationFailure.
+          for (unsigned i = 0; i < NewBlocks.size(); i++)
+            Visited.erase(NewBlocks[i]);
+          goto PredTranslationFailure;
+        }
+      }
+      Worklist.append(NewBlocks.begin(), NewBlocks.end());
+      continue;
+    }
+
+    // We do need to do phi translation, if we know ahead of time we can't phi
+    // translate this value, don't even try.
+    if (!Pointer.IsPotentiallyPHITranslatable())
+      goto PredTranslationFailure;
+
+    // We may have added values to the cache list before this PHI translation.
+    // If so, we haven't done anything to ensure that the cache remains sorted.
+    // Sort it now (if needed) so that recursive invocations of
+    // getNonLocalPointerDepFromBB and other routines that could reuse the cache
+    // value will only see properly sorted cache arrays.
+    if (Cache && NumSortedEntries != Cache->size()) {
+      SortNonLocalDepInfoCache(*Cache, NumSortedEntries);
+      NumSortedEntries = Cache->size();
+    }
+    Cache = 0;
+
+    PredList.clear();
+    for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
+      BasicBlock *Pred = *PI;
+      PredList.push_back(std::make_pair(Pred, Pointer));
+
+      // Get the PHI translated pointer in this predecessor.  This can fail if
+      // not translatable, in which case the getAddr() returns null.
+      PHITransAddr &PredPointer = PredList.back().second;
+      PredPointer.PHITranslateValue(BB, Pred, 0);
+
+      Value *PredPtrVal = PredPointer.getAddr();
+
+      // Check to see if we have already visited this pred block with another
+      // pointer.  If so, we can't do this lookup.  This failure can occur
+      // with PHI translation when a critical edge exists and the PHI node in
+      // the successor translates to a pointer value different than the
+      // pointer the block was first analyzed with.
+      std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
+        InsertRes = Visited.insert(std::make_pair(Pred, PredPtrVal));
+
+      if (!InsertRes.second) {
+        // We found the pred; take it off the list of preds to visit.
+        PredList.pop_back();
+
+        // If the predecessor was visited with PredPtr, then we already did
+        // the analysis and can ignore it.
+        if (InsertRes.first->second == PredPtrVal)
+          continue;
+
+        // Otherwise, the block was previously analyzed with a different
+        // pointer.  We can't represent the result of this case, so we just
+        // treat this as a phi translation failure.
+
+        // Make sure to clean up the Visited map before continuing on to
+        // PredTranslationFailure.
+        for (unsigned i = 0, n = PredList.size(); i < n; ++i)
+          Visited.erase(PredList[i].first);
+
+        goto PredTranslationFailure;
+      }
+    }
+
+    // Actually process results here; this need to be a separate loop to avoid
+    // calling getNonLocalPointerDepFromBB for blocks we don't want to return
+    // any results for.  (getNonLocalPointerDepFromBB will modify our
+    // datastructures in ways the code after the PredTranslationFailure label
+    // doesn't expect.)
+    for (unsigned i = 0, n = PredList.size(); i < n; ++i) {
+      BasicBlock *Pred = PredList[i].first;
+      PHITransAddr &PredPointer = PredList[i].second;
+      Value *PredPtrVal = PredPointer.getAddr();
+
+      bool CanTranslate = true;
+      // If PHI translation was unable to find an available pointer in this
+      // predecessor, then we have to assume that the pointer is clobbered in
+      // that predecessor.  We can still do PRE of the load, which would insert
+      // a computation of the pointer in this predecessor.
+      if (PredPtrVal == 0)
+        CanTranslate = false;
+
+      // FIXME: it is entirely possible that PHI translating will end up with
+      // the same value.  Consider PHI translating something like:
+      // X = phi [x, bb1], [y, bb2].  PHI translating for bb1 doesn't *need*
+      // to recurse here, pedantically speaking.
+
+      // If getNonLocalPointerDepFromBB fails here, that means the cached
+      // result conflicted with the Visited list; we have to conservatively
+      // assume it is unknown, but this also does not block PRE of the load.
+      if (!CanTranslate ||
+          getNonLocalPointerDepFromBB(PredPointer,
+                                      Loc.getWithNewPtr(PredPtrVal),
+                                      isLoad, Pred,
+                                      Result, Visited)) {
+        // Add the entry to the Result list.
+        NonLocalDepResult Entry(Pred, MemDepResult::getUnknown(), PredPtrVal);
+        Result.push_back(Entry);
+
+        // Since we had a phi translation failure, the cache for CacheKey won't
+        // include all of the entries that we need to immediately satisfy future
+        // queries.  Mark this in NonLocalPointerDeps by setting the
+        // BBSkipFirstBlockPair pointer to null.  This requires reuse of the
+        // cached value to do more work but not miss the phi trans failure.
+        NonLocalPointerInfo &NLPI = NonLocalPointerDeps[CacheKey];
+        NLPI.Pair = BBSkipFirstBlockPair();
+        continue;
+      }
+    }
+
+    // Refresh the CacheInfo/Cache pointer so that it isn't invalidated.
+    CacheInfo = &NonLocalPointerDeps[CacheKey];
+    Cache = &CacheInfo->NonLocalDeps;
+    NumSortedEntries = Cache->size();
+
+    // Since we did phi translation, the "Cache" set won't contain all of the
+    // results for the query.  This is ok (we can still use it to accelerate
+    // specific block queries) but we can't do the fastpath "return all
+    // results from the set"  Clear out the indicator for this.
+    CacheInfo->Pair = BBSkipFirstBlockPair();
+    SkipFirstBlock = false;
+    continue;
+
+  PredTranslationFailure:
+    // The following code is "failure"; we can't produce a sane translation
+    // for the given block.  It assumes that we haven't modified any of
+    // our datastructures while processing the current block.
+
+    if (Cache == 0) {
+      // Refresh the CacheInfo/Cache pointer if it got invalidated.
+      CacheInfo = &NonLocalPointerDeps[CacheKey];
+      Cache = &CacheInfo->NonLocalDeps;
+      NumSortedEntries = Cache->size();
+    }
+
+    // Since we failed phi translation, the "Cache" set won't contain all of the
+    // results for the query.  This is ok (we can still use it to accelerate
+    // specific block queries) but we can't do the fastpath "return all
+    // results from the set".  Clear out the indicator for this.
+    CacheInfo->Pair = BBSkipFirstBlockPair();
+
+    // If *nothing* works, mark the pointer as unknown.
+    //
+    // If this is the magic first block, return this as a clobber of the whole
+    // incoming value.  Since we can't phi translate to one of the predecessors,
+    // we have to bail out.
+    if (SkipFirstBlock)
+      return true;
+
+    for (NonLocalDepInfo::reverse_iterator I = Cache->rbegin(); ; ++I) {
+      assert(I != Cache->rend() && "Didn't find current block??");
+      if (I->getBB() != BB)
+        continue;
+
+      assert(I->getResult().isNonLocal() &&
+             "Should only be here with transparent block");
+      I->setResult(MemDepResult::getUnknown());
+      Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(),
+                                         Pointer.getAddr()));
+      break;
+    }
+  }
+
+  // Okay, we're done now.  If we added new values to the cache, re-sort it.
+  SortNonLocalDepInfoCache(*Cache, NumSortedEntries);
+  DEBUG(AssertSorted(*Cache));
+  return false;
+}
+
+/// RemoveCachedNonLocalPointerDependencies - If P exists in
+/// CachedNonLocalPointerInfo, remove it.
+void MemoryDependenceAnalysis::
+RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair P) {
+  CachedNonLocalPointerInfo::iterator It =
+    NonLocalPointerDeps.find(P);
+  if (It == NonLocalPointerDeps.end()) return;
+
+  // Remove all of the entries in the BB->val map.  This involves removing
+  // instructions from the reverse map.
+  NonLocalDepInfo &PInfo = It->second.NonLocalDeps;
+
+  for (unsigned i = 0, e = PInfo.size(); i != e; ++i) {
+    Instruction *Target = PInfo[i].getResult().getInst();
+    if (Target == 0) continue;  // Ignore non-local dep results.
+    assert(Target->getParent() == PInfo[i].getBB());
+
+    // Eliminating the dirty entry from 'Cache', so update the reverse info.
+    RemoveFromReverseMap(ReverseNonLocalPtrDeps, Target, P);
+  }
+
+  // Remove P from NonLocalPointerDeps (which deletes NonLocalDepInfo).
+  NonLocalPointerDeps.erase(It);
+}
+
+
+/// invalidateCachedPointerInfo - This method is used to invalidate cached
+/// information about the specified pointer, because it may be too
+/// conservative in memdep.  This is an optional call that can be used when
+/// the client detects an equivalence between the pointer and some other
+/// value and replaces the other value with ptr. This can make Ptr available
+/// in more places that cached info does not necessarily keep.
+void MemoryDependenceAnalysis::invalidateCachedPointerInfo(Value *Ptr) {
+  // If Ptr isn't really a pointer, just ignore it.
+  if (!Ptr->getType()->isPointerTy()) return;
+  // Flush store info for the pointer.
+  RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(Ptr, false));
+  // Flush load info for the pointer.
+  RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(Ptr, true));
+}
+
+/// invalidateCachedPredecessors - Clear the PredIteratorCache info.
+/// This needs to be done when the CFG changes, e.g., due to splitting
+/// critical edges.
+void MemoryDependenceAnalysis::invalidateCachedPredecessors() {
+  PredCache->clear();
+}
+
+/// removeInstruction - Remove an instruction from the dependence analysis,
+/// updating the dependence of instructions that previously depended on it.
+/// This method attempts to keep the cache coherent using the reverse map.
+void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
+  // Walk through the Non-local dependencies, removing this one as the value
+  // for any cached queries.
+  NonLocalDepMapType::iterator NLDI = NonLocalDeps.find(RemInst);
+  if (NLDI != NonLocalDeps.end()) {
+    NonLocalDepInfo &BlockMap = NLDI->second.first;
+    for (NonLocalDepInfo::iterator DI = BlockMap.begin(), DE = BlockMap.end();
+         DI != DE; ++DI)
+      if (Instruction *Inst = DI->getResult().getInst())
+        RemoveFromReverseMap(ReverseNonLocalDeps, Inst, RemInst);
+    NonLocalDeps.erase(NLDI);
+  }
+
+  // If we have a cached local dependence query for this instruction, remove it.
+  //
+  LocalDepMapType::iterator LocalDepEntry = LocalDeps.find(RemInst);
+  if (LocalDepEntry != LocalDeps.end()) {
+    // Remove us from DepInst's reverse set now that the local dep info is gone.
+    if (Instruction *Inst = LocalDepEntry->second.getInst())
+      RemoveFromReverseMap(ReverseLocalDeps, Inst, RemInst);
+
+    // Remove this local dependency info.
+    LocalDeps.erase(LocalDepEntry);
+  }
+
+  // If we have any cached pointer dependencies on this instruction, remove
+  // them.  If the instruction has non-pointer type, then it can't be a pointer
+  // base.
+
+  // Remove it from both the load info and the store info.  The instruction
+  // can't be in either of these maps if it is non-pointer.
+  if (RemInst->getType()->isPointerTy()) {
+    RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(RemInst, false));
+    RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(RemInst, true));
+  }
+
+  // Loop over all of the things that depend on the instruction we're removing.
+  //
+  SmallVector<std::pair<Instruction*, Instruction*>, 8> ReverseDepsToAdd;
+
+  // If we find RemInst as a clobber or Def in any of the maps for other values,
+  // we need to replace its entry with a dirty version of the instruction after
+  // it.  If RemInst is a terminator, we use a null dirty value.
+  //
+  // Using a dirty version of the instruction after RemInst saves having to scan
+  // the entire block to get to this point.
+  MemDepResult NewDirtyVal;
+  if (!RemInst->isTerminator())
+    NewDirtyVal = MemDepResult::getDirty(++BasicBlock::iterator(RemInst));
+
+  ReverseDepMapType::iterator ReverseDepIt = ReverseLocalDeps.find(RemInst);
+  if (ReverseDepIt != ReverseLocalDeps.end()) {
+    SmallPtrSet<Instruction*, 4> &ReverseDeps = ReverseDepIt->second;
+    // RemInst can't be the terminator if it has local stuff depending on it.
+    assert(!ReverseDeps.empty() && !isa<TerminatorInst>(RemInst) &&
+           "Nothing can locally depend on a terminator");
+
+    for (SmallPtrSet<Instruction*, 4>::iterator I = ReverseDeps.begin(),
+         E = ReverseDeps.end(); I != E; ++I) {
+      Instruction *InstDependingOnRemInst = *I;
+      assert(InstDependingOnRemInst != RemInst &&
+             "Already removed our local dep info");
+
+      LocalDeps[InstDependingOnRemInst] = NewDirtyVal;
+
+      // Make sure to remember that new things depend on NewDepInst.
+      assert(NewDirtyVal.getInst() && "There is no way something else can have "
+             "a local dep on this if it is a terminator!");
+      ReverseDepsToAdd.push_back(std::make_pair(NewDirtyVal.getInst(),
+                                                InstDependingOnRemInst));
+    }
+
+    ReverseLocalDeps.erase(ReverseDepIt);
+
+    // Add new reverse deps after scanning the set, to avoid invalidating the
+    // 'ReverseDeps' reference.
+    while (!ReverseDepsToAdd.empty()) {
+      ReverseLocalDeps[ReverseDepsToAdd.back().first]
+        .insert(ReverseDepsToAdd.back().second);
+      ReverseDepsToAdd.pop_back();
+    }
+  }
+
+  ReverseDepIt = ReverseNonLocalDeps.find(RemInst);
+  if (ReverseDepIt != ReverseNonLocalDeps.end()) {
+    SmallPtrSet<Instruction*, 4> &Set = ReverseDepIt->second;
+    for (SmallPtrSet<Instruction*, 4>::iterator I = Set.begin(), E = Set.end();
+         I != E; ++I) {
+      assert(*I != RemInst && "Already removed NonLocalDep info for RemInst");
+
+      PerInstNLInfo &INLD = NonLocalDeps[*I];
+      // The information is now dirty!
+      INLD.second = true;
+
+      for (NonLocalDepInfo::iterator DI = INLD.first.begin(),
+           DE = INLD.first.end(); DI != DE; ++DI) {
+        if (DI->getResult().getInst() != RemInst) continue;
+
+        // Convert to a dirty entry for the subsequent instruction.
+        DI->setResult(NewDirtyVal);
+
+        if (Instruction *NextI = NewDirtyVal.getInst())
+          ReverseDepsToAdd.push_back(std::make_pair(NextI, *I));
+      }
+    }
+
+    ReverseNonLocalDeps.erase(ReverseDepIt);
+
+    // Add new reverse deps after scanning the set, to avoid invalidating 'Set'
+    while (!ReverseDepsToAdd.empty()) {
+      ReverseNonLocalDeps[ReverseDepsToAdd.back().first]
+        .insert(ReverseDepsToAdd.back().second);
+      ReverseDepsToAdd.pop_back();
+    }
+  }
+
+  // If the instruction is in ReverseNonLocalPtrDeps then it appears as a
+  // value in the NonLocalPointerDeps info.
+  ReverseNonLocalPtrDepTy::iterator ReversePtrDepIt =
+    ReverseNonLocalPtrDeps.find(RemInst);
+  if (ReversePtrDepIt != ReverseNonLocalPtrDeps.end()) {
+    SmallPtrSet<ValueIsLoadPair, 4> &Set = ReversePtrDepIt->second;
+    SmallVector<std::pair<Instruction*, ValueIsLoadPair>,8> ReversePtrDepsToAdd;
+
+    for (SmallPtrSet<ValueIsLoadPair, 4>::iterator I = Set.begin(),
+         E = Set.end(); I != E; ++I) {
+      ValueIsLoadPair P = *I;
+      assert(P.getPointer() != RemInst &&
+             "Already removed NonLocalPointerDeps info for RemInst");
+
+      NonLocalDepInfo &NLPDI = NonLocalPointerDeps[P].NonLocalDeps;
+
+      // The cache is not valid for any specific block anymore.
+      NonLocalPointerDeps[P].Pair = BBSkipFirstBlockPair();
+
+      // Update any entries for RemInst to use the instruction after it.
+      for (NonLocalDepInfo::iterator DI = NLPDI.begin(), DE = NLPDI.end();
+           DI != DE; ++DI) {
+        if (DI->getResult().getInst() != RemInst) continue;
+
+        // Convert to a dirty entry for the subsequent instruction.
+        DI->setResult(NewDirtyVal);
+
+        if (Instruction *NewDirtyInst = NewDirtyVal.getInst())
+          ReversePtrDepsToAdd.push_back(std::make_pair(NewDirtyInst, P));
+      }
+
+      // Re-sort the NonLocalDepInfo.  Changing the dirty entry to its
+      // subsequent value may invalidate the sortedness.
+      std::sort(NLPDI.begin(), NLPDI.end());
+    }
+
+    ReverseNonLocalPtrDeps.erase(ReversePtrDepIt);
+
+    while (!ReversePtrDepsToAdd.empty()) {
+      ReverseNonLocalPtrDeps[ReversePtrDepsToAdd.back().first]
+        .insert(ReversePtrDepsToAdd.back().second);
+      ReversePtrDepsToAdd.pop_back();
+    }
+  }
+
+
+  assert(!NonLocalDeps.count(RemInst) && "RemInst got reinserted?");
+  AA->deleteValue(RemInst);
+  DEBUG(verifyRemoved(RemInst));
+}
+/// verifyRemoved - Verify that the specified instruction does not occur
+/// in our internal data structures.
+void MemoryDependenceAnalysis::verifyRemoved(Instruction *D) const {
+  for (LocalDepMapType::const_iterator I = LocalDeps.begin(),
+       E = LocalDeps.end(); I != E; ++I) {
+    assert(I->first != D && "Inst occurs in data structures");
+    assert(I->second.getInst() != D &&
+           "Inst occurs in data structures");
+  }
+
+  for (CachedNonLocalPointerInfo::const_iterator I =NonLocalPointerDeps.begin(),
+       E = NonLocalPointerDeps.end(); I != E; ++I) {
+    assert(I->first.getPointer() != D && "Inst occurs in NLPD map key");
+    const NonLocalDepInfo &Val = I->second.NonLocalDeps;
+    for (NonLocalDepInfo::const_iterator II = Val.begin(), E = Val.end();
+         II != E; ++II)
+      assert(II->getResult().getInst() != D && "Inst occurs as NLPD value");
+  }
+
+  for (NonLocalDepMapType::const_iterator I = NonLocalDeps.begin(),
+       E = NonLocalDeps.end(); I != E; ++I) {
+    assert(I->first != D && "Inst occurs in data structures");
+    const PerInstNLInfo &INLD = I->second;
+    for (NonLocalDepInfo::const_iterator II = INLD.first.begin(),
+         EE = INLD.first.end(); II  != EE; ++II)
+      assert(II->getResult().getInst() != D && "Inst occurs in data structures");
+  }
+
+  for (ReverseDepMapType::const_iterator I = ReverseLocalDeps.begin(),
+       E = ReverseLocalDeps.end(); I != E; ++I) {
+    assert(I->first != D && "Inst occurs in data structures");
+    for (SmallPtrSet<Instruction*, 4>::const_iterator II = I->second.begin(),
+         EE = I->second.end(); II != EE; ++II)
+      assert(*II != D && "Inst occurs in data structures");
+  }
+
+  for (ReverseDepMapType::const_iterator I = ReverseNonLocalDeps.begin(),
+       E = ReverseNonLocalDeps.end();
+       I != E; ++I) {
+    assert(I->first != D && "Inst occurs in data structures");
+    for (SmallPtrSet<Instruction*, 4>::const_iterator II = I->second.begin(),
+         EE = I->second.end(); II != EE; ++II)
+      assert(*II != D && "Inst occurs in data structures");
+  }
+
+  for (ReverseNonLocalPtrDepTy::const_iterator
+       I = ReverseNonLocalPtrDeps.begin(),
+       E = ReverseNonLocalPtrDeps.end(); I != E; ++I) {
+    assert(I->first != D && "Inst occurs in rev NLPD map");
+
+    for (SmallPtrSet<ValueIsLoadPair, 4>::const_iterator II = I->second.begin(),
+         E = I->second.end(); II != E; ++II)
+      assert(*II != ValueIsLoadPair(D, false) &&
+             *II != ValueIsLoadPair(D, true) &&
+             "Inst occurs in ReverseNonLocalPtrDeps map");
+  }
+
+}
diff --git a/contrib/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp b/contrib/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
new file mode 100644
index 000000000000..03415375263a
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
@@ -0,0 +1,87 @@
+//===-- ModuleDebugInfoPrinter.cpp - Prints module debug info metadata ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass decodes the debug info metadata in a module and prints in a
+// (sufficiently-prepared-) human-readable form.
+//
+// For example, run this pass from opt along with the -analyze option, and
+// it'll print to standard output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+  class ModuleDebugInfoPrinter : public ModulePass {
+    DebugInfoFinder Finder;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    ModuleDebugInfoPrinter() : ModulePass(ID) {
+      initializeModuleDebugInfoPrinterPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+    virtual void print(raw_ostream &O, const Module *M) const;
+  };
+}
+
+char ModuleDebugInfoPrinter::ID = 0;
+INITIALIZE_PASS(ModuleDebugInfoPrinter, "module-debuginfo",
+                "Decodes module-level debug info", false, true)
+
+ModulePass *llvm::createModuleDebugInfoPrinterPass() {
+  return new ModuleDebugInfoPrinter();
+}
+
+bool ModuleDebugInfoPrinter::runOnModule(Module &M) {
+  Finder.processModule(M);
+  return false;
+}
+
+void ModuleDebugInfoPrinter::print(raw_ostream &O, const Module *M) const {
+  for (DebugInfoFinder::iterator I = Finder.compile_unit_begin(),
+       E = Finder.compile_unit_end(); I != E; ++I) {
+    O << "Compile Unit: ";
+    DICompileUnit(*I).print(O);
+    O << '\n';
+  }
+
+  for (DebugInfoFinder::iterator I = Finder.subprogram_begin(),
+       E = Finder.subprogram_end(); I != E; ++I) {
+    O << "Subprogram: ";
+    DISubprogram(*I).print(O);
+    O << '\n';
+  }
+
+  for (DebugInfoFinder::iterator I = Finder.global_variable_begin(),
+       E = Finder.global_variable_end(); I != E; ++I) {
+    O << "GlobalVariable: ";
+    DIGlobalVariable(*I).print(O);
+    O << '\n';
+  }
+
+  for (DebugInfoFinder::iterator I = Finder.type_begin(),
+       E = Finder.type_end(); I != E; ++I) {
+    O << "Type: ";
+    DIType(*I).print(O);
+    O << '\n';
+  }
+}
diff --git a/contrib/llvm/lib/Analysis/NoAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/NoAliasAnalysis.cpp
new file mode 100644
index 000000000000..907e9621baed
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/NoAliasAnalysis.cpp
@@ -0,0 +1,88 @@
+//===- NoAliasAnalysis.cpp - Minimal Alias Analysis Impl ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the default implementation of the Alias Analysis interface
+// that simply returns "I don't know" for all queries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+namespace {
+  /// NoAA - This class implements the -no-aa pass, which always returns "I
+  /// don't know" for alias queries.  NoAA is unlike other alias analysis
+  /// implementations, in that it does not chain to a previous analysis.  As
+  /// such it doesn't follow many of the rules that other alias analyses must.
+  ///
+  struct NoAA : public ImmutablePass, public AliasAnalysis {
+    static char ID; // Class identification, replacement for typeinfo
+    NoAA() : ImmutablePass(ID) {
+      initializeNoAAPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    }
+
+    virtual void initializePass() {
+      // Note: NoAA does not call InitializeAliasAnalysis because it's
+      // special and does not support chaining.
+      TD = getAnalysisIfAvailable<DataLayout>();
+    }
+
+    virtual AliasResult alias(const Location &LocA, const Location &LocB) {
+      return MayAlias;
+    }
+
+    virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS) {
+      return UnknownModRefBehavior;
+    }
+    virtual ModRefBehavior getModRefBehavior(const Function *F) {
+      return UnknownModRefBehavior;
+    }
+
+    virtual bool pointsToConstantMemory(const Location &Loc,
+                                        bool OrLocal) {
+      return false;
+    }
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS,
+                                       const Location &Loc) {
+      return ModRef;
+    }
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS1,
+                                       ImmutableCallSite CS2) {
+      return ModRef;
+    }
+
+    virtual void deleteValue(Value *V) {}
+    virtual void copyValue(Value *From, Value *To) {}
+    virtual void addEscapingUse(Use &U) {}
+    
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(const void *ID) {
+      if (ID == &AliasAnalysis::ID)
+        return (AliasAnalysis*)this;
+      return this;
+    }
+  };
+}  // End of anonymous namespace
+
+// Register this pass...
+char NoAA::ID = 0;
+INITIALIZE_AG_PASS(NoAA, AliasAnalysis, "no-aa",
+                   "No Alias Analysis (always returns 'may' alias)",
+                   true, true, true)
+
+ImmutablePass *llvm::createNoAAPass() { return new NoAA(); }
diff --git a/contrib/llvm/lib/Analysis/PHITransAddr.cpp b/contrib/llvm/lib/Analysis/PHITransAddr.cpp
new file mode 100644
index 000000000000..e6af0663feaa
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/PHITransAddr.cpp
@@ -0,0 +1,443 @@
+//===- PHITransAddr.cpp - PHI Translation for Addresses -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PHITransAddr class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static bool CanPHITrans(Instruction *Inst) {
+  if (isa<PHINode>(Inst) ||
+      isa<GetElementPtrInst>(Inst))
+    return true;
+
+  if (isa<CastInst>(Inst) &&
+      isSafeToSpeculativelyExecute(Inst))
+    return true;
+
+  if (Inst->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Inst->getOperand(1)))
+    return true;
+
+  //   cerr << "MEMDEP: Could not PHI translate: " << *Pointer;
+  //   if (isa<BitCastInst>(PtrInst) || isa<GetElementPtrInst>(PtrInst))
+  //     cerr << "OP:\t\t\t\t" << *PtrInst->getOperand(0);
+  return false;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void PHITransAddr::dump() const {
+  if (Addr == 0) {
+    dbgs() << "PHITransAddr: null\n";
+    return;
+  }
+  dbgs() << "PHITransAddr: " << *Addr << "\n";
+  for (unsigned i = 0, e = InstInputs.size(); i != e; ++i)
+    dbgs() << "  Input #" << i << " is " << *InstInputs[i] << "\n";
+}
+#endif
+
+
+static bool VerifySubExpr(Value *Expr,
+                          SmallVectorImpl<Instruction*> &InstInputs) {
+  // If this is a non-instruction value, there is nothing to do.
+  Instruction *I = dyn_cast<Instruction>(Expr);
+  if (I == 0) return true;
+
+  // If it's an instruction, it is either in Tmp or its operands recursively
+  // are.
+  SmallVectorImpl<Instruction*>::iterator Entry =
+    std::find(InstInputs.begin(), InstInputs.end(), I);
+  if (Entry != InstInputs.end()) {
+    InstInputs.erase(Entry);
+    return true;
+  }
+
+  // If it isn't in the InstInputs list it is a subexpr incorporated into the
+  // address.  Sanity check that it is phi translatable.
+  if (!CanPHITrans(I)) {
+    errs() << "Non phi translatable instruction found in PHITransAddr:\n";
+    errs() << *I << '\n';
+    llvm_unreachable("Either something is missing from InstInputs or "
+                     "CanPHITrans is wrong.");
+  }
+
+  // Validate the operands of the instruction.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (!VerifySubExpr(I->getOperand(i), InstInputs))
+      return false;
+
+  return true;
+}
+
+/// Verify - Check internal consistency of this data structure.  If the
+/// structure is valid, it returns true.  If invalid, it prints errors and
+/// returns false.
+bool PHITransAddr::Verify() const {
+  if (Addr == 0) return true;
+
+  SmallVector<Instruction*, 8> Tmp(InstInputs.begin(), InstInputs.end());
+
+  if (!VerifySubExpr(Addr, Tmp))
+    return false;
+
+  if (!Tmp.empty()) {
+    errs() << "PHITransAddr contains extra instructions:\n";
+    for (unsigned i = 0, e = InstInputs.size(); i != e; ++i)
+      errs() << "  InstInput #" << i << " is " << *InstInputs[i] << "\n";
+    llvm_unreachable("This is unexpected.");
+  }
+
+  // a-ok.
+  return true;
+}
+
+
+/// IsPotentiallyPHITranslatable - If this needs PHI translation, return true
+/// if we have some hope of doing it.  This should be used as a filter to
+/// avoid calling PHITranslateValue in hopeless situations.
+bool PHITransAddr::IsPotentiallyPHITranslatable() const {
+  // If the input value is not an instruction, or if it is not defined in CurBB,
+  // then we don't need to phi translate it.
+  Instruction *Inst = dyn_cast<Instruction>(Addr);
+  return Inst == 0 || CanPHITrans(Inst);
+}
+
+
+static void RemoveInstInputs(Value *V,
+                             SmallVectorImpl<Instruction*> &InstInputs) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return;
+
+  // If the instruction is in the InstInputs list, remove it.
+  SmallVectorImpl<Instruction*>::iterator Entry =
+    std::find(InstInputs.begin(), InstInputs.end(), I);
+  if (Entry != InstInputs.end()) {
+    InstInputs.erase(Entry);
+    return;
+  }
+
+  assert(!isa<PHINode>(I) && "Error, removing something that isn't an input");
+
+  // Otherwise, it must have instruction inputs itself.  Zap them recursively.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(i)))
+      RemoveInstInputs(Op, InstInputs);
+  }
+}
+
+Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
+                                         BasicBlock *PredBB,
+                                         const DominatorTree *DT) {
+  // If this is a non-instruction value, it can't require PHI translation.
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  if (Inst == 0) return V;
+
+  // Determine whether 'Inst' is an input to our PHI translatable expression.
+  bool isInput = std::count(InstInputs.begin(), InstInputs.end(), Inst);
+
+  // Handle inputs instructions if needed.
+  if (isInput) {
+    if (Inst->getParent() != CurBB) {
+      // If it is an input defined in a different block, then it remains an
+      // input.
+      return Inst;
+    }
+
+    // If 'Inst' is defined in this block and is an input that needs to be phi
+    // translated, we need to incorporate the value into the expression or fail.
+
+    // In either case, the instruction itself isn't an input any longer.
+    InstInputs.erase(std::find(InstInputs.begin(), InstInputs.end(), Inst));
+
+    // If this is a PHI, go ahead and translate it.
+    if (PHINode *PN = dyn_cast<PHINode>(Inst))
+      return AddAsInput(PN->getIncomingValueForBlock(PredBB));
+
+    // If this is a non-phi value, and it is analyzable, we can incorporate it
+    // into the expression by making all instruction operands be inputs.
+    if (!CanPHITrans(Inst))
+      return 0;
+
+    // All instruction operands are now inputs (and of course, they may also be
+    // defined in this block, so they may need to be phi translated themselves.
+    for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
+      if (Instruction *Op = dyn_cast<Instruction>(Inst->getOperand(i)))
+        InstInputs.push_back(Op);
+  }
+
+  // Ok, it must be an intermediate result (either because it started that way
+  // or because we just incorporated it into the expression).  See if its
+  // operands need to be phi translated, and if so, reconstruct it.
+
+  if (CastInst *Cast = dyn_cast<CastInst>(Inst)) {
+    if (!isSafeToSpeculativelyExecute(Cast)) return 0;
+    Value *PHIIn = PHITranslateSubExpr(Cast->getOperand(0), CurBB, PredBB, DT);
+    if (PHIIn == 0) return 0;
+    if (PHIIn == Cast->getOperand(0))
+      return Cast;
+
+    // Find an available version of this cast.
+
+    // Constants are trivial to find.
+    if (Constant *C = dyn_cast<Constant>(PHIIn))
+      return AddAsInput(ConstantExpr::getCast(Cast->getOpcode(),
+                                              C, Cast->getType()));
+
+    // Otherwise we have to see if a casted version of the incoming pointer
+    // is available.  If so, we can use it, otherwise we have to fail.
+    for (Value::use_iterator UI = PHIIn->use_begin(), E = PHIIn->use_end();
+         UI != E; ++UI) {
+      if (CastInst *CastI = dyn_cast<CastInst>(*UI))
+        if (CastI->getOpcode() == Cast->getOpcode() &&
+            CastI->getType() == Cast->getType() &&
+            (!DT || DT->dominates(CastI->getParent(), PredBB)))
+          return CastI;
+    }
+    return 0;
+  }
+
+  // Handle getelementptr with at least one PHI translatable operand.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+    SmallVector<Value*, 8> GEPOps;
+    bool AnyChanged = false;
+    for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) {
+      Value *GEPOp = PHITranslateSubExpr(GEP->getOperand(i), CurBB, PredBB, DT);
+      if (GEPOp == 0) return 0;
+
+      AnyChanged |= GEPOp != GEP->getOperand(i);
+      GEPOps.push_back(GEPOp);
+    }
+
+    if (!AnyChanged)
+      return GEP;
+
+    // Simplify the GEP to handle 'gep x, 0' -> x etc.
+    if (Value *V = SimplifyGEPInst(GEPOps, TD, TLI, DT)) {
+      for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
+        RemoveInstInputs(GEPOps[i], InstInputs);
+
+      return AddAsInput(V);
+    }
+
+    // Scan to see if we have this GEP available.
+    Value *APHIOp = GEPOps[0];
+    for (Value::use_iterator UI = APHIOp->use_begin(), E = APHIOp->use_end();
+         UI != E; ++UI) {
+      if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(*UI))
+        if (GEPI->getType() == GEP->getType() &&
+            GEPI->getNumOperands() == GEPOps.size() &&
+            GEPI->getParent()->getParent() == CurBB->getParent() &&
+            (!DT || DT->dominates(GEPI->getParent(), PredBB))) {
+          bool Mismatch = false;
+          for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
+            if (GEPI->getOperand(i) != GEPOps[i]) {
+              Mismatch = true;
+              break;
+            }
+          if (!Mismatch)
+            return GEPI;
+        }
+    }
+    return 0;
+  }
+
+  // Handle add with a constant RHS.
+  if (Inst->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Inst->getOperand(1))) {
+    // PHI translate the LHS.
+    Constant *RHS = cast<ConstantInt>(Inst->getOperand(1));
+    bool isNSW = cast<BinaryOperator>(Inst)->hasNoSignedWrap();
+    bool isNUW = cast<BinaryOperator>(Inst)->hasNoUnsignedWrap();
+
+    Value *LHS = PHITranslateSubExpr(Inst->getOperand(0), CurBB, PredBB, DT);
+    if (LHS == 0) return 0;
+
+    // If the PHI translated LHS is an add of a constant, fold the immediates.
+    if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(LHS))
+      if (BOp->getOpcode() == Instruction::Add)
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
+          LHS = BOp->getOperand(0);
+          RHS = ConstantExpr::getAdd(RHS, CI);
+          isNSW = isNUW = false;
+
+          // If the old 'LHS' was an input, add the new 'LHS' as an input.
+          if (std::count(InstInputs.begin(), InstInputs.end(), BOp)) {
+            RemoveInstInputs(BOp, InstInputs);
+            AddAsInput(LHS);
+          }
+        }
+
+    // See if the add simplifies away.
+    if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, TD, TLI, DT)) {
+      // If we simplified the operands, the LHS is no longer an input, but Res
+      // is.
+      RemoveInstInputs(LHS, InstInputs);
+      return AddAsInput(Res);
+    }
+
+    // If we didn't modify the add, just return it.
+    if (LHS == Inst->getOperand(0) && RHS == Inst->getOperand(1))
+      return Inst;
+
+    // Otherwise, see if we have this add available somewhere.
+    for (Value::use_iterator UI = LHS->use_begin(), E = LHS->use_end();
+         UI != E; ++UI) {
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(*UI))
+        if (BO->getOpcode() == Instruction::Add &&
+            BO->getOperand(0) == LHS && BO->getOperand(1) == RHS &&
+            BO->getParent()->getParent() == CurBB->getParent() &&
+            (!DT || DT->dominates(BO->getParent(), PredBB)))
+          return BO;
+    }
+
+    return 0;
+  }
+
+  // Otherwise, we failed.
+  return 0;
+}
+
+
+/// PHITranslateValue - PHI translate the current address up the CFG from
+/// CurBB to Pred, updating our state to reflect any needed changes.  If the
+/// dominator tree DT is non-null, the translated value must dominate
+/// PredBB.  This returns true on failure and sets Addr to null.
+bool PHITransAddr::PHITranslateValue(BasicBlock *CurBB, BasicBlock *PredBB,
+                                     const DominatorTree *DT) {
+  assert(Verify() && "Invalid PHITransAddr!");
+  Addr = PHITranslateSubExpr(Addr, CurBB, PredBB, DT);
+  assert(Verify() && "Invalid PHITransAddr!");
+
+  if (DT) {
+    // Make sure the value is live in the predecessor.
+    if (Instruction *Inst = dyn_cast_or_null<Instruction>(Addr))
+      if (!DT->dominates(Inst->getParent(), PredBB))
+        Addr = 0;
+  }
+
+  return Addr == 0;
+}
+
+/// PHITranslateWithInsertion - PHI translate this value into the specified
+/// predecessor block, inserting a computation of the value if it is
+/// unavailable.
+///
+/// All newly created instructions are added to the NewInsts list.  This
+/// returns null on failure.
+///
+Value *PHITransAddr::
+PHITranslateWithInsertion(BasicBlock *CurBB, BasicBlock *PredBB,
+                          const DominatorTree &DT,
+                          SmallVectorImpl<Instruction*> &NewInsts) {
+  unsigned NISize = NewInsts.size();
+
+  // Attempt to PHI translate with insertion.
+  Addr = InsertPHITranslatedSubExpr(Addr, CurBB, PredBB, DT, NewInsts);
+
+  // If successful, return the new value.
+  if (Addr) return Addr;
+
+  // If not, destroy any intermediate instructions inserted.
+  while (NewInsts.size() != NISize)
+    NewInsts.pop_back_val()->eraseFromParent();
+  return 0;
+}
+
+
+/// InsertPHITranslatedPointer - Insert a computation of the PHI translated
+/// version of 'V' for the edge PredBB->CurBB into the end of the PredBB
+/// block.  All newly created instructions are added to the NewInsts list.
+/// This returns null on failure.
+///
+Value *PHITransAddr::
+InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
+                           BasicBlock *PredBB, const DominatorTree &DT,
+                           SmallVectorImpl<Instruction*> &NewInsts) {
+  // See if we have a version of this value already available and dominating
+  // PredBB.  If so, there is no need to insert a new instance of it.
+  PHITransAddr Tmp(InVal, TD);
+  if (!Tmp.PHITranslateValue(CurBB, PredBB, &DT))
+    return Tmp.getAddr();
+
+  // If we don't have an available version of this value, it must be an
+  // instruction.
+  Instruction *Inst = cast<Instruction>(InVal);
+
+  // Handle cast of PHI translatable value.
+  if (CastInst *Cast = dyn_cast<CastInst>(Inst)) {
+    if (!isSafeToSpeculativelyExecute(Cast)) return 0;
+    Value *OpVal = InsertPHITranslatedSubExpr(Cast->getOperand(0),
+                                              CurBB, PredBB, DT, NewInsts);
+    if (OpVal == 0) return 0;
+
+    // Otherwise insert a cast at the end of PredBB.
+    CastInst *New = CastInst::Create(Cast->getOpcode(),
+                                     OpVal, InVal->getType(),
+                                     InVal->getName()+".phi.trans.insert",
+                                     PredBB->getTerminator());
+    NewInsts.push_back(New);
+    return New;
+  }
+
+  // Handle getelementptr with at least one PHI operand.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+    SmallVector<Value*, 8> GEPOps;
+    BasicBlock *CurBB = GEP->getParent();
+    for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i) {
+      Value *OpVal = InsertPHITranslatedSubExpr(GEP->getOperand(i),
+                                                CurBB, PredBB, DT, NewInsts);
+      if (OpVal == 0) return 0;
+      GEPOps.push_back(OpVal);
+    }
+
+    GetElementPtrInst *Result =
+      GetElementPtrInst::Create(GEPOps[0], makeArrayRef(GEPOps).slice(1),
+                                InVal->getName()+".phi.trans.insert",
+                                PredBB->getTerminator());
+    Result->setIsInBounds(GEP->isInBounds());
+    NewInsts.push_back(Result);
+    return Result;
+  }
+
+#if 0
+  // FIXME: This code works, but it is unclear that we actually want to insert
+  // a big chain of computation in order to make a value available in a block.
+  // This needs to be evaluated carefully to consider its cost trade offs.
+
+  // Handle add with a constant RHS.
+  if (Inst->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Inst->getOperand(1))) {
+    // PHI translate the LHS.
+    Value *OpVal = InsertPHITranslatedSubExpr(Inst->getOperand(0),
+                                              CurBB, PredBB, DT, NewInsts);
+    if (OpVal == 0) return 0;
+
+    BinaryOperator *Res = BinaryOperator::CreateAdd(OpVal, Inst->getOperand(1),
+                                           InVal->getName()+".phi.trans.insert",
+                                                    PredBB->getTerminator());
+    Res->setHasNoSignedWrap(cast<BinaryOperator>(Inst)->hasNoSignedWrap());
+    Res->setHasNoUnsignedWrap(cast<BinaryOperator>(Inst)->hasNoUnsignedWrap());
+    NewInsts.push_back(Res);
+    return Res;
+  }
+#endif
+
+  return 0;
+}
diff --git a/contrib/llvm/lib/Analysis/PathNumbering.cpp b/contrib/llvm/lib/Analysis/PathNumbering.cpp
new file mode 100644
index 000000000000..30d213b77576
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/PathNumbering.cpp
@@ -0,0 +1,521 @@
+//===- PathNumbering.cpp --------------------------------------*- C++ -*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Ball-Larus path numbers uniquely identify paths through a directed acyclic
+// graph (DAG) [Ball96].  For a CFG backedges are removed and replaced by phony
+// edges to obtain a DAG, and thus the unique path numbers [Ball96].
+//
+// The purpose of this analysis is to enumerate the edges in a CFG in order
+// to obtain paths from path numbers in a convenient manner.  As described in
+// [Ball96] edges can be enumerated such that given a path number by following
+// the CFG and updating the path number, the path is obtained.
+//
+// [Ball96]
+//  T. Ball and J. R. Larus. "Efficient Path Profiling."
+//  International Symposium on Microarchitecture, pages 46-57, 1996.
+//  http://portal.acm.org/citation.cfm?id=243857
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "ball-larus-numbering"
+
+#include "llvm/Analysis/PathNumbering.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <queue>
+#include <sstream>
+#include <stack>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+
+// Are we enabling early termination
+static cl::opt<bool> ProcessEarlyTermination(
+  "path-profile-early-termination", cl::Hidden,
+  cl::desc("In path profiling, insert extra instrumentation to account for "
+           "unexpected function termination."));
+
+// Returns the basic block for the BallLarusNode
+BasicBlock* BallLarusNode::getBlock() {
+  return(_basicBlock);
+}
+
+// Returns the number of paths to the exit starting at the node.
+unsigned BallLarusNode::getNumberPaths() {
+  return(_numberPaths);
+}
+
+// Sets the number of paths to the exit starting at the node.
+void BallLarusNode::setNumberPaths(unsigned numberPaths) {
+  _numberPaths = numberPaths;
+}
+
+// Gets the NodeColor used in graph algorithms.
+BallLarusNode::NodeColor BallLarusNode::getColor() {
+  return(_color);
+}
+
+// Sets the NodeColor used in graph algorithms.
+void BallLarusNode::setColor(BallLarusNode::NodeColor color) {
+  _color = color;
+}
+
+// Returns an iterator over predecessor edges. Includes phony and
+// backedges.
+BLEdgeIterator BallLarusNode::predBegin() {
+  return(_predEdges.begin());
+}
+
+// Returns the end sentinel for the predecessor iterator.
+BLEdgeIterator BallLarusNode::predEnd() {
+  return(_predEdges.end());
+}
+
+// Returns the number of predecessor edges.  Includes phony and
+// backedges.
+unsigned BallLarusNode::getNumberPredEdges() {
+  return(_predEdges.size());
+}
+
+// Returns an iterator over successor edges. Includes phony and
+// backedges.
+BLEdgeIterator BallLarusNode::succBegin() {
+  return(_succEdges.begin());
+}
+
+// Returns the end sentinel for the successor iterator.
+BLEdgeIterator BallLarusNode::succEnd() {
+  return(_succEdges.end());
+}
+
+// Returns the number of successor edges.  Includes phony and
+// backedges.
+unsigned BallLarusNode::getNumberSuccEdges() {
+  return(_succEdges.size());
+}
+
+// Add an edge to the predecessor list.
+void BallLarusNode::addPredEdge(BallLarusEdge* edge) {
+  _predEdges.push_back(edge);
+}
+
+// Remove an edge from the predecessor list.
+void BallLarusNode::removePredEdge(BallLarusEdge* edge) {
+  removeEdge(_predEdges, edge);
+}
+
+// Add an edge to the successor list.
+void BallLarusNode::addSuccEdge(BallLarusEdge* edge) {
+  _succEdges.push_back(edge);
+}
+
+// Remove an edge from the successor list.
+void BallLarusNode::removeSuccEdge(BallLarusEdge* edge) {
+  removeEdge(_succEdges, edge);
+}
+
+// Returns the name of the BasicBlock being represented.  If BasicBlock
+// is null then returns "<null>".  If BasicBlock has no name, then
+// "<unnamed>" is returned.  Intended for use with debug output.
+std::string BallLarusNode::getName() {
+  std::stringstream name;
+
+  if(getBlock() != NULL) {
+    if(getBlock()->hasName()) {
+      std::string tempName(getBlock()->getName());
+      name << tempName.c_str() << " (" << _uid << ")";
+    } else
+      name << "<unnamed> (" << _uid << ")";
+  } else
+    name << "<null> (" << _uid << ")";
+
+  return name.str();
+}
+
+// Removes an edge from an edgeVector.  Used by removePredEdge and
+// removeSuccEdge.
+void BallLarusNode::removeEdge(BLEdgeVector& v, BallLarusEdge* e) {
+  // TODO: Avoid linear scan by using a set instead
+  for(BLEdgeIterator i = v.begin(),
+        end = v.end();
+      i != end;
+      ++i) {
+    if((*i) == e) {
+      v.erase(i);
+      break;
+    }
+  }
+}
+
+// Returns the source node of this edge.
+BallLarusNode* BallLarusEdge::getSource() const {
+  return(_source);
+}
+
+// Returns the target node of this edge.
+BallLarusNode* BallLarusEdge::getTarget() const {
+  return(_target);
+}
+
+// Sets the type of the edge.
+BallLarusEdge::EdgeType BallLarusEdge::getType() const {
+  return _edgeType;
+}
+
+// Gets the type of the edge.
+void BallLarusEdge::setType(EdgeType type) {
+  _edgeType = type;
+}
+
+// Returns the weight of this edge.  Used to decode path numbers to sequences
+// of basic blocks.
+unsigned BallLarusEdge::getWeight() {
+  return(_weight);
+}
+
+// Sets the weight of the edge.  Used during path numbering.
+void BallLarusEdge::setWeight(unsigned weight) {
+  _weight = weight;
+}
+
+// Gets the phony edge originating at the root.
+BallLarusEdge* BallLarusEdge::getPhonyRoot() {
+  return _phonyRoot;
+}
+
+// Sets the phony edge originating at the root.
+void BallLarusEdge::setPhonyRoot(BallLarusEdge* phonyRoot) {
+  _phonyRoot = phonyRoot;
+}
+
+// Gets the phony edge terminating at the exit.
+BallLarusEdge* BallLarusEdge::getPhonyExit() {
+  return _phonyExit;
+}
+
+// Sets the phony edge terminating at the exit.
+void BallLarusEdge::setPhonyExit(BallLarusEdge* phonyExit) {
+  _phonyExit = phonyExit;
+}
+
+// Gets the associated real edge if this is a phony edge.
+BallLarusEdge* BallLarusEdge::getRealEdge() {
+  return _realEdge;
+}
+
+// Sets the associated real edge if this is a phony edge.
+void BallLarusEdge::setRealEdge(BallLarusEdge* realEdge) {
+  _realEdge = realEdge;
+}
+
+// Returns the duplicate number of the edge.
+unsigned BallLarusEdge::getDuplicateNumber() {
+  return(_duplicateNumber);
+}
+
+// Initialization that requires virtual functions which are not fully
+// functional in the constructor.
+void BallLarusDag::init() {
+  BLBlockNodeMap inDag;
+  std::stack<BallLarusNode*> dfsStack;
+
+  _root = addNode(&(_function.getEntryBlock()));
+  _exit = addNode(NULL);
+
+  // start search from root
+  dfsStack.push(getRoot());
+
+  // dfs to add each bb into the dag
+  while(dfsStack.size())
+    buildNode(inDag, dfsStack);
+
+  // put in the final edge
+  addEdge(getExit(),getRoot(),0);
+}
+
+// Frees all memory associated with the DAG.
+BallLarusDag::~BallLarusDag() {
+  for(BLEdgeIterator edge = _edges.begin(), end = _edges.end(); edge != end;
+      ++edge)
+    delete (*edge);
+
+  for(BLNodeIterator node = _nodes.begin(), end = _nodes.end(); node != end;
+      ++node)
+    delete (*node);
+}
+
+// Calculate the path numbers by assigning edge increments as prescribed
+// in Ball-Larus path profiling.
+void BallLarusDag::calculatePathNumbers() {
+  BallLarusNode* node;
+  std::queue<BallLarusNode*> bfsQueue;
+  bfsQueue.push(getExit());
+
+  while(bfsQueue.size() > 0) {
+    node = bfsQueue.front();
+
+    DEBUG(dbgs() << "calculatePathNumbers on " << node->getName() << "\n");
+
+    bfsQueue.pop();
+    unsigned prevPathNumber = node->getNumberPaths();
+    calculatePathNumbersFrom(node);
+
+    // Check for DAG splitting
+    if( node->getNumberPaths() > 100000000 && node != getRoot() ) {
+      // Add new phony edge from the split-node to the DAG's exit
+      BallLarusEdge* exitEdge = addEdge(node, getExit(), 0);
+      exitEdge->setType(BallLarusEdge::SPLITEDGE_PHONY);
+
+      // Counters to handle the possibility of a multi-graph
+      BasicBlock* oldTarget = 0;
+      unsigned duplicateNumber = 0;
+
+      // Iterate through each successor edge, adding phony edges
+      for( BLEdgeIterator succ = node->succBegin(), end = node->succEnd();
+           succ != end; oldTarget = (*succ)->getTarget()->getBlock(), succ++ ) {
+
+        if( (*succ)->getType() == BallLarusEdge::NORMAL ) {
+          // is this edge a duplicate?
+          if( oldTarget != (*succ)->getTarget()->getBlock() )
+            duplicateNumber = 0;
+
+          // create the new phony edge: root -> succ
+          BallLarusEdge* rootEdge =
+            addEdge(getRoot(), (*succ)->getTarget(), duplicateNumber++);
+          rootEdge->setType(BallLarusEdge::SPLITEDGE_PHONY);
+          rootEdge->setRealEdge(*succ);
+
+          // split on this edge and reference it's exit/root phony edges
+          (*succ)->setType(BallLarusEdge::SPLITEDGE);
+          (*succ)->setPhonyRoot(rootEdge);
+          (*succ)->setPhonyExit(exitEdge);
+          (*succ)->setWeight(0);
+        }
+      }
+
+      calculatePathNumbersFrom(node);
+    }
+
+    DEBUG(dbgs() << "prev, new number paths " << prevPathNumber << ", "
+          << node->getNumberPaths() << ".\n");
+
+    if(prevPathNumber == 0 && node->getNumberPaths() != 0) {
+      DEBUG(dbgs() << "node ready : " << node->getName() << "\n");
+      for(BLEdgeIterator pred = node->predBegin(), end = node->predEnd();
+          pred != end; pred++) {
+        if( (*pred)->getType() == BallLarusEdge::BACKEDGE ||
+            (*pred)->getType() == BallLarusEdge::SPLITEDGE )
+          continue;
+
+        BallLarusNode* nextNode = (*pred)->getSource();
+        // not yet visited?
+        if(nextNode->getNumberPaths() == 0)
+          bfsQueue.push(nextNode);
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "\tNumber of paths: " << getRoot()->getNumberPaths() << "\n");
+}
+
+// Returns the number of paths for the Dag.
+unsigned BallLarusDag::getNumberOfPaths() {
+  return(getRoot()->getNumberPaths());
+}
+
+// Returns the root (i.e. entry) node for the DAG.
+BallLarusNode* BallLarusDag::getRoot() {
+  return _root;
+}
+
+// Returns the exit node for the DAG.
+BallLarusNode* BallLarusDag::getExit() {
+  return _exit;
+}
+
+// Returns the function for the DAG.
+Function& BallLarusDag::getFunction() {
+  return(_function);
+}
+
+// Clears the node colors.
+void BallLarusDag::clearColors(BallLarusNode::NodeColor color) {
+  for (BLNodeIterator nodeIt = _nodes.begin(); nodeIt != _nodes.end(); nodeIt++)
+    (*nodeIt)->setColor(color);
+}
+
+// Processes one node and its imediate edges for building the DAG.
+void BallLarusDag::buildNode(BLBlockNodeMap& inDag, BLNodeStack& dfsStack) {
+  BallLarusNode* currentNode = dfsStack.top();
+  BasicBlock* currentBlock = currentNode->getBlock();
+
+  if(currentNode->getColor() != BallLarusNode::WHITE) {
+    // we have already visited this node
+    dfsStack.pop();
+    currentNode->setColor(BallLarusNode::BLACK);
+  } else {
+    // are there any external procedure calls?
+    if( ProcessEarlyTermination ) {
+      for( BasicBlock::iterator bbCurrent = currentNode->getBlock()->begin(),
+             bbEnd = currentNode->getBlock()->end(); bbCurrent != bbEnd;
+           bbCurrent++ ) {
+        Instruction& instr = *bbCurrent;
+        if( instr.getOpcode() == Instruction::Call ) {
+          BallLarusEdge* callEdge = addEdge(currentNode, getExit(), 0);
+          callEdge->setType(BallLarusEdge::CALLEDGE_PHONY);
+          break;
+        }
+      }
+    }
+
+    TerminatorInst* terminator = currentNode->getBlock()->getTerminator();
+    if(isa<ReturnInst>(terminator) || isa<UnreachableInst>(terminator) ||
+       isa<ResumeInst>(terminator))
+      addEdge(currentNode, getExit(),0);
+
+    currentNode->setColor(BallLarusNode::GRAY);
+    inDag[currentBlock] = currentNode;
+
+    BasicBlock* oldSuccessor = 0;
+    unsigned duplicateNumber = 0;
+
+    // iterate through this node's successors
+    for(succ_iterator successor = succ_begin(currentBlock),
+          succEnd = succ_end(currentBlock); successor != succEnd;
+        oldSuccessor = *successor, ++successor ) {
+      BasicBlock* succBB = *successor;
+
+      // is this edge a duplicate?
+      if (oldSuccessor == succBB)
+        duplicateNumber++;
+      else
+        duplicateNumber = 0;
+
+      buildEdge(inDag, dfsStack, currentNode, succBB, duplicateNumber);
+    }
+  }
+}
+
+// Process an edge in the CFG for DAG building.
+void BallLarusDag::buildEdge(BLBlockNodeMap& inDag, std::stack<BallLarusNode*>&
+                             dfsStack, BallLarusNode* currentNode,
+                             BasicBlock* succBB, unsigned duplicateCount) {
+  BallLarusNode* succNode = inDag[succBB];
+
+  if(succNode && succNode->getColor() == BallLarusNode::BLACK) {
+    // visited node and forward edge
+    addEdge(currentNode, succNode, duplicateCount);
+  } else if(succNode && succNode->getColor() == BallLarusNode::GRAY) {
+    // visited node and back edge
+    DEBUG(dbgs() << "Backedge detected.\n");
+    addBackedge(currentNode, succNode, duplicateCount);
+  } else {
+    BallLarusNode* childNode;
+    // not visited node and forward edge
+    if(succNode) // an unvisited node that is child of a gray node
+      childNode = succNode;
+    else { // an unvisited node that is a child of a an unvisted node
+      childNode = addNode(succBB);
+      inDag[succBB] = childNode;
+    }
+    addEdge(currentNode, childNode, duplicateCount);
+    dfsStack.push(childNode);
+  }
+}
+
+// The weight on each edge is the increment required along any path that
+// contains that edge.
+void BallLarusDag::calculatePathNumbersFrom(BallLarusNode* node) {
+  if(node == getExit())
+    // The Exit node must be base case
+    node->setNumberPaths(1);
+  else {
+    unsigned sumPaths = 0;
+    BallLarusNode* succNode;
+
+    for(BLEdgeIterator succ = node->succBegin(), end = node->succEnd();
+        succ != end; succ++) {
+      if( (*succ)->getType() == BallLarusEdge::BACKEDGE ||
+          (*succ)->getType() == BallLarusEdge::SPLITEDGE )
+        continue;
+
+      (*succ)->setWeight(sumPaths);
+      succNode = (*succ)->getTarget();
+
+      if( !succNode->getNumberPaths() )
+        return;
+      sumPaths += succNode->getNumberPaths();
+    }
+
+    node->setNumberPaths(sumPaths);
+  }
+}
+
+// Allows subclasses to determine which type of Node is created.
+// Override this method to produce subclasses of BallLarusNode if
+// necessary. The destructor of BallLarusDag will call free on each
+// pointer created.
+BallLarusNode* BallLarusDag::createNode(BasicBlock* BB) {
+  return( new BallLarusNode(BB) );
+}
+
+// Allows subclasses to determine which type of Edge is created.
+// Override this method to produce subclasses of BallLarusEdge if
+// necessary. The destructor of BallLarusDag will call free on each
+// pointer created.
+BallLarusEdge* BallLarusDag::createEdge(BallLarusNode* source,
+                                        BallLarusNode* target,
+                                        unsigned duplicateCount) {
+  return( new BallLarusEdge(source, target, duplicateCount) );
+}
+
+// Proxy to node's constructor.  Updates the DAG state.
+BallLarusNode* BallLarusDag::addNode(BasicBlock* BB) {
+  BallLarusNode* newNode = createNode(BB);
+  _nodes.push_back(newNode);
+  return( newNode );
+}
+
+// Proxy to edge's constructor. Updates the DAG state.
+BallLarusEdge* BallLarusDag::addEdge(BallLarusNode* source,
+                                     BallLarusNode* target,
+                                     unsigned duplicateCount) {
+  BallLarusEdge* newEdge = createEdge(source, target, duplicateCount);
+  _edges.push_back(newEdge);
+  source->addSuccEdge(newEdge);
+  target->addPredEdge(newEdge);
+  return(newEdge);
+}
+
+// Adds a backedge with its phony edges. Updates the DAG state.
+void BallLarusDag::addBackedge(BallLarusNode* source, BallLarusNode* target,
+                               unsigned duplicateCount) {
+  BallLarusEdge* childEdge = addEdge(source, target, duplicateCount);
+  childEdge->setType(BallLarusEdge::BACKEDGE);
+
+  childEdge->setPhonyRoot(addEdge(getRoot(), target,0));
+  childEdge->setPhonyExit(addEdge(source, getExit(),0));
+
+  childEdge->getPhonyRoot()->setRealEdge(childEdge);
+  childEdge->getPhonyRoot()->setType(BallLarusEdge::BACKEDGE_PHONY);
+
+  childEdge->getPhonyExit()->setRealEdge(childEdge);
+  childEdge->getPhonyExit()->setType(BallLarusEdge::BACKEDGE_PHONY);
+  _backEdges.push_back(childEdge);
+}
diff --git a/contrib/llvm/lib/Analysis/PathProfileInfo.cpp b/contrib/llvm/lib/Analysis/PathProfileInfo.cpp
new file mode 100644
index 000000000000..bc53221d3176
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/PathProfileInfo.cpp
@@ -0,0 +1,433 @@
+//===- PathProfileInfo.cpp ------------------------------------*- C++ -*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface used by optimizers to load path profiles,
+// and provides a loader pass which reads a path profile file.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "path-profile-info"
+
+#include "llvm/Analysis/PathProfileInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ProfileInfoTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdio>
+
+using namespace llvm;
+
+// command line option for loading path profiles
+static cl::opt<std::string>
+PathProfileInfoFilename("path-profile-loader-file", cl::init("llvmprof.out"),
+  cl::value_desc("filename"),
+  cl::desc("Path profile file loaded by -path-profile-loader"), cl::Hidden);
+
+namespace {
+  class PathProfileLoaderPass : public ModulePass, public PathProfileInfo {
+  public:
+    PathProfileLoaderPass() : ModulePass(ID) { }
+    ~PathProfileLoaderPass();
+
+    // this pass doesn't change anything (only loads information)
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+
+    // the full name of the loader pass
+    virtual const char* getPassName() const {
+      return "Path Profiling Information Loader";
+    }
+
+    // required since this pass implements multiple inheritance
+                virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+      if (PI == &PathProfileInfo::ID)
+        return (PathProfileInfo*)this;
+      return this;
+    }
+
+    // entry point to run the pass
+    bool runOnModule(Module &M);
+
+    // pass identification
+    static char ID;
+
+  private:
+    // make a reference table to refer to function by number
+    void buildFunctionRefs(Module &M);
+
+    // process argument info of a program from the input file
+    void handleArgumentInfo();
+
+    // process path number information from the input file
+    void handlePathInfo();
+
+    // array of references to the functions in the module
+    std::vector<Function*> _functions;
+
+    // path profile file handle
+    FILE* _file;
+
+    // path profile file name
+    std::string _filename;
+  };
+}
+
+// register PathLoader
+char PathProfileLoaderPass::ID = 0;
+
+INITIALIZE_ANALYSIS_GROUP(PathProfileInfo, "Path Profile Information",
+                          NoPathProfileInfo)
+INITIALIZE_AG_PASS(PathProfileLoaderPass, PathProfileInfo,
+                   "path-profile-loader",
+                   "Load path profile information from file",
+                   false, true, false)
+
+char &llvm::PathProfileLoaderPassID = PathProfileLoaderPass::ID;
+
+// link PathLoader as a pass, and make it available as an optimisation
+ModulePass *llvm::createPathProfileLoaderPass() {
+  return new PathProfileLoaderPass;
+}
+
+// ----------------------------------------------------------------------------
+// PathEdge implementation
+//
+ProfilePathEdge::ProfilePathEdge (BasicBlock* source, BasicBlock* target,
+                                  unsigned duplicateNumber)
+  : _source(source), _target(target), _duplicateNumber(duplicateNumber) {}
+
+// ----------------------------------------------------------------------------
+// Path implementation
+//
+
+ProfilePath::ProfilePath (unsigned int number, unsigned int count,
+                          double countStdDev,   PathProfileInfo* ppi)
+  : _number(number) , _count(count), _countStdDev(countStdDev), _ppi(ppi) {}
+
+double ProfilePath::getFrequency() const {
+  return 100 * double(_count) /
+    double(_ppi->_functionPathCounts[_ppi->_currentFunction]);
+}
+
+static BallLarusEdge* getNextEdge (BallLarusNode* node,
+                                   unsigned int pathNumber) {
+  BallLarusEdge* best = 0;
+
+  for( BLEdgeIterator next = node->succBegin(),
+         end = node->succEnd(); next != end; next++ ) {
+    if( (*next)->getType() != BallLarusEdge::BACKEDGE && // no backedges
+        (*next)->getType() != BallLarusEdge::SPLITEDGE && // no split edges
+        (*next)->getWeight() <= pathNumber && // weight must be <= pathNumber
+        (!best || (best->getWeight() < (*next)->getWeight())) ) // best one?
+      best = *next;
+  }
+
+  return best;
+}
+
+ProfilePathEdgeVector* ProfilePath::getPathEdges() const {
+  BallLarusNode* currentNode = _ppi->_currentDag->getRoot ();
+  unsigned int increment = _number;
+  ProfilePathEdgeVector* pev = new ProfilePathEdgeVector;
+
+  while (currentNode != _ppi->_currentDag->getExit()) {
+    BallLarusEdge* next = getNextEdge(currentNode, increment);
+
+    increment -= next->getWeight();
+
+    if( next->getType() != BallLarusEdge::BACKEDGE_PHONY &&
+        next->getType() != BallLarusEdge::SPLITEDGE_PHONY &&
+        next->getTarget() != _ppi->_currentDag->getExit() )
+      pev->push_back(ProfilePathEdge(
+                       next->getSource()->getBlock(),
+                       next->getTarget()->getBlock(),
+                       next->getDuplicateNumber()));
+
+    if( next->getType() == BallLarusEdge::BACKEDGE_PHONY &&
+        next->getTarget() == _ppi->_currentDag->getExit() )
+      pev->push_back(ProfilePathEdge(
+                       next->getRealEdge()->getSource()->getBlock(),
+                       next->getRealEdge()->getTarget()->getBlock(),
+                       next->getDuplicateNumber()));
+
+    if( next->getType() == BallLarusEdge::SPLITEDGE_PHONY &&
+        next->getSource() == _ppi->_currentDag->getRoot() )
+      pev->push_back(ProfilePathEdge(
+                       next->getRealEdge()->getSource()->getBlock(),
+                       next->getRealEdge()->getTarget()->getBlock(),
+                       next->getDuplicateNumber()));
+
+    // set the new node
+    currentNode = next->getTarget();
+  }
+
+  return pev;
+}
+
+ProfilePathBlockVector* ProfilePath::getPathBlocks() const {
+  BallLarusNode* currentNode = _ppi->_currentDag->getRoot ();
+  unsigned int increment = _number;
+  ProfilePathBlockVector* pbv = new ProfilePathBlockVector;
+
+  while (currentNode != _ppi->_currentDag->getExit()) {
+    BallLarusEdge* next = getNextEdge(currentNode, increment);
+    increment -= next->getWeight();
+
+    // add block to the block list if it is a real edge
+    if( next->getType() == BallLarusEdge::NORMAL)
+      pbv->push_back (currentNode->getBlock());
+    // make the back edge the last edge since we are at the end
+    else if( next->getTarget() == _ppi->_currentDag->getExit() ) {
+      pbv->push_back (currentNode->getBlock());
+      pbv->push_back (next->getRealEdge()->getTarget()->getBlock());
+    }
+
+    // set the new node
+    currentNode = next->getTarget();
+  }
+
+  return pbv;
+}
+
+BasicBlock* ProfilePath::getFirstBlockInPath() const {
+  BallLarusNode* root = _ppi->_currentDag->getRoot();
+  BallLarusEdge* edge = getNextEdge(root, _number);
+
+  if( edge && (edge->getType() == BallLarusEdge::BACKEDGE_PHONY ||
+               edge->getType() == BallLarusEdge::SPLITEDGE_PHONY) )
+    return edge->getTarget()->getBlock();
+
+  return root->getBlock();
+}
+
+// ----------------------------------------------------------------------------
+// PathProfileInfo implementation
+//
+
+// Pass identification
+char llvm::PathProfileInfo::ID = 0;
+
+PathProfileInfo::PathProfileInfo () : _currentDag(0) , _currentFunction(0) {
+}
+
+PathProfileInfo::~PathProfileInfo() {
+  if (_currentDag)
+    delete _currentDag;
+}
+
+// set the function for which paths are currently begin processed
+void PathProfileInfo::setCurrentFunction(Function* F) {
+  // Make sure it exists
+  if (!F) return;
+
+  if (_currentDag)
+    delete _currentDag;
+
+  _currentFunction = F;
+  _currentDag = new BallLarusDag(*F);
+  _currentDag->init();
+  _currentDag->calculatePathNumbers();
+}
+
+// get the function for which paths are currently being processed
+Function* PathProfileInfo::getCurrentFunction() const {
+  return _currentFunction;
+}
+
+// get the entry block of the function
+BasicBlock* PathProfileInfo::getCurrentFunctionEntry() {
+  return _currentDag->getRoot()->getBlock();
+}
+
+// return the path based on its number
+ProfilePath* PathProfileInfo::getPath(unsigned int number) {
+  return _functionPaths[_currentFunction][number];
+}
+
+// return the number of paths which a function may potentially execute
+unsigned int PathProfileInfo::getPotentialPathCount() {
+  return _currentDag ? _currentDag->getNumberOfPaths() : 0;
+}
+
+// return an iterator for the beginning of a functions executed paths
+ProfilePathIterator PathProfileInfo::pathBegin() {
+  return _functionPaths[_currentFunction].begin();
+}
+
+// return an iterator for the end of a functions executed paths
+ProfilePathIterator PathProfileInfo::pathEnd() {
+  return _functionPaths[_currentFunction].end();
+}
+
+// returns the total number of paths run in the function
+unsigned int PathProfileInfo::pathsRun() {
+  return _currentFunction ? _functionPaths[_currentFunction].size() : 0;
+}
+
+// ----------------------------------------------------------------------------
+// PathLoader implementation
+//
+
+// remove all generated paths
+PathProfileLoaderPass::~PathProfileLoaderPass() {
+  for( FunctionPathIterator funcNext = _functionPaths.begin(),
+         funcEnd = _functionPaths.end(); funcNext != funcEnd; funcNext++)
+    for( ProfilePathIterator pathNext = funcNext->second.begin(),
+           pathEnd = funcNext->second.end(); pathNext != pathEnd; pathNext++)
+      delete pathNext->second;
+}
+
+// entry point of the pass; this loads and parses a file
+bool PathProfileLoaderPass::runOnModule(Module &M) {
+  // get the filename and setup the module's function references
+  _filename = PathProfileInfoFilename;
+  buildFunctionRefs (M);
+
+  if (!(_file = fopen(_filename.c_str(), "rb"))) {
+    errs () << "error: input '" << _filename << "' file does not exist.\n";
+    return false;
+  }
+
+  ProfilingType profType;
+
+  while( fread(&profType, sizeof(ProfilingType), 1, _file) ) {
+    switch (profType) {
+    case ArgumentInfo:
+      handleArgumentInfo ();
+      break;
+    case PathInfo:
+      handlePathInfo ();
+      break;
+    default:
+      errs () << "error: bad path profiling file syntax, " << profType << "\n";
+      fclose (_file);
+      return false;
+    }
+  }
+
+  fclose (_file);
+
+  return true;
+}
+
+// create a reference table for functions defined in the path profile file
+void PathProfileLoaderPass::buildFunctionRefs (Module &M) {
+  _functions.push_back(0); // make the 0 index a null pointer
+
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) {
+    if (F->isDeclaration())
+      continue;
+    _functions.push_back(F);
+  }
+}
+
+// handle command like argument infor in the output file
+void PathProfileLoaderPass::handleArgumentInfo() {
+  // get the argument list's length
+  unsigned savedArgsLength;
+  if( fread(&savedArgsLength, sizeof(unsigned), 1, _file) != 1 ) {
+    errs() << "warning: argument info header/data mismatch\n";
+    return;
+  }
+
+  // allocate a buffer, and get the arguments
+  char* args = new char[savedArgsLength+1];
+  if( fread(args, 1, savedArgsLength, _file) != savedArgsLength )
+    errs() << "warning: argument info header/data mismatch\n";
+
+  args[savedArgsLength] = '\0';
+  argList = std::string(args);
+  delete [] args; // cleanup dynamic string
+
+  // byte alignment
+  if (savedArgsLength & 3)
+    fseek(_file, 4-(savedArgsLength&3), SEEK_CUR);
+}
+
+// Handle path profile information in the output file
+void PathProfileLoaderPass::handlePathInfo () {
+  // get the number of functions in this profile
+  unsigned functionCount;
+  if( fread(&functionCount, sizeof(functionCount), 1, _file) != 1 ) {
+    errs() << "warning: path info header/data mismatch\n";
+    return;
+  }
+
+  // gather path information for each function
+  for (unsigned i = 0; i < functionCount; i++) {
+    PathProfileHeader pathHeader;
+    if( fread(&pathHeader, sizeof(pathHeader), 1, _file) != 1 ) {
+      errs() << "warning: bad header for path function info\n";
+      break;
+    }
+
+    Function* f = _functions[pathHeader.fnNumber];
+
+    // dynamically allocate a table to store path numbers
+    PathProfileTableEntry* pathTable =
+      new PathProfileTableEntry[pathHeader.numEntries];
+
+    if( fread(pathTable, sizeof(PathProfileTableEntry),
+              pathHeader.numEntries, _file) != pathHeader.numEntries) {
+      delete [] pathTable;
+      errs() << "warning: path function info header/data mismatch\n";
+      return;
+    }
+
+    // Build a new path for the current function
+    unsigned int totalPaths = 0;
+    for (unsigned int j = 0; j < pathHeader.numEntries; j++) {
+      totalPaths += pathTable[j].pathCounter;
+      _functionPaths[f][pathTable[j].pathNumber]
+        = new ProfilePath(pathTable[j].pathNumber, pathTable[j].pathCounter,
+                          0, this);
+    }
+
+    _functionPathCounts[f] = totalPaths;
+
+    delete [] pathTable;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  NoProfile PathProfileInfo implementation
+//
+
+namespace {
+  struct NoPathProfileInfo : public ImmutablePass, public PathProfileInfo {
+    static char ID; // Class identification, replacement for typeinfo
+    NoPathProfileInfo() : ImmutablePass(ID) {
+      initializeNoPathProfileInfoPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+      if (PI == &PathProfileInfo::ID)
+        return (PathProfileInfo*)this;
+      return this;
+    }
+
+    virtual const char *getPassName() const {
+      return "NoPathProfileInfo";
+    }
+  };
+}  // End of anonymous namespace
+
+char NoPathProfileInfo::ID = 0;
+// Register this pass...
+INITIALIZE_AG_PASS(NoPathProfileInfo, PathProfileInfo, "no-path-profile",
+                   "No Path Profile Information", false, true, true)
+
+ImmutablePass *llvm::createNoPathProfileInfoPass() { return new NoPathProfileInfo(); }
diff --git a/contrib/llvm/lib/Analysis/PathProfileVerifier.cpp b/contrib/llvm/lib/Analysis/PathProfileVerifier.cpp
new file mode 100644
index 000000000000..48d7d05d788f
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/PathProfileVerifier.cpp
@@ -0,0 +1,206 @@
+//===- PathProfileVerifier.cpp --------------------------------*- C++ -*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This verifier derives an edge profile file from current path profile
+// information
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "path-profile-verifier"
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/PathProfileInfo.h"
+#include "llvm/Analysis/ProfileInfoTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <stdio.h>
+
+using namespace llvm;
+
+namespace {
+  class PathProfileVerifier : public ModulePass {
+  private:
+    bool runOnModule(Module &M);
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    PathProfileVerifier() : ModulePass(ID) {
+      initializePathProfileVerifierPass(*PassRegistry::getPassRegistry());
+    }
+
+
+    virtual const char *getPassName() const {
+      return "Path Profiler Verifier";
+    }
+
+    // The verifier requires the path profile and edge profile.
+    virtual void getAnalysisUsage(AnalysisUsage& AU) const;
+  };
+}
+
+static cl::opt<std::string>
+EdgeProfileFilename("path-profile-verifier-file",
+  cl::init("edgefrompath.llvmprof.out"),
+  cl::value_desc("filename"),
+  cl::desc("Edge profile file generated by -path-profile-verifier"),
+  cl::Hidden);
+
+char PathProfileVerifier::ID = 0;
+INITIALIZE_PASS(PathProfileVerifier, "path-profile-verifier",
+                "Compare the path profile derived edge profile against the "
+                "edge profile.", true, true)
+
+ModulePass *llvm::createPathProfileVerifierPass() {
+  return new PathProfileVerifier();
+}
+
+// The verifier requires the path profile and edge profile.
+void PathProfileVerifier::getAnalysisUsage(AnalysisUsage& AU) const {
+  AU.addRequired<PathProfileInfo>();
+  AU.addPreserved<PathProfileInfo>();
+}
+
+typedef std::map<unsigned, unsigned> DuplicateToIndexMap;
+typedef std::map<BasicBlock*,DuplicateToIndexMap> BlockToDuplicateMap;
+typedef std::map<BasicBlock*,BlockToDuplicateMap> NestedBlockToIndexMap;
+
+// the verifier iterates through each path to gather the total
+// number of edge frequencies
+bool PathProfileVerifier::runOnModule (Module &M) {
+  PathProfileInfo& pathProfileInfo = getAnalysis<PathProfileInfo>();
+
+  // setup a data structure to map path edges which index an
+  // array of edge counters
+  NestedBlockToIndexMap arrayMap;
+  unsigned i = 0;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration()) continue;
+
+    arrayMap[(BasicBlock*)0][F->begin()][0] = i++;
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      TerminatorInst *TI = BB->getTerminator();
+
+      unsigned duplicate = 0;
+      BasicBlock* prev = 0;
+      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e;
+           prev = TI->getSuccessor(s), ++s) {
+        if (prev == TI->getSuccessor(s))
+          duplicate++;
+        else duplicate = 0;
+
+        arrayMap[BB][TI->getSuccessor(s)][duplicate] = i++;
+      }
+    }
+  }
+
+  std::vector<unsigned> edgeArray(i);
+
+  // iterate through each path and increment the edge counters as needed
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration()) continue;
+
+    pathProfileInfo.setCurrentFunction(F);
+
+    DEBUG(dbgs() << "function '" << F->getName() << "' ran "
+          << pathProfileInfo.pathsRun()
+          << "/" << pathProfileInfo.getPotentialPathCount()
+          << " potential paths\n");
+
+    for( ProfilePathIterator nextPath = pathProfileInfo.pathBegin(),
+           endPath = pathProfileInfo.pathEnd();
+         nextPath != endPath; nextPath++ ) {
+      ProfilePath* currentPath = nextPath->second;
+
+      ProfilePathEdgeVector* pev = currentPath->getPathEdges();
+      DEBUG(dbgs () << "path #" << currentPath->getNumber() << ": "
+            << currentPath->getCount() << "\n");
+      // setup the entry edge (normally path profiling doesn't care about this)
+      if (currentPath->getFirstBlockInPath() == &F->getEntryBlock())
+        edgeArray[arrayMap[(BasicBlock*)0][currentPath->getFirstBlockInPath()][0]]
+          += currentPath->getCount();
+
+      for( ProfilePathEdgeIterator nextEdge = pev->begin(),
+             endEdge = pev->end(); nextEdge != endEdge; nextEdge++ ) {
+        if (nextEdge != pev->begin())
+          DEBUG(dbgs() << " :: ");
+
+        BasicBlock* source = nextEdge->getSource();
+        BasicBlock* target = nextEdge->getTarget();
+        unsigned duplicateNumber = nextEdge->getDuplicateNumber();
+        DEBUG(dbgs() << source->getName() << " --{" << duplicateNumber
+                     << "}--> " << target->getName());
+
+        // Ensure all the referenced edges exist
+        // TODO: make this a separate function
+        if( !arrayMap.count(source) ) {
+          errs() << "  error [" << F->getName() << "()]: source '"
+                 << source->getName()
+                 << "' does not exist in the array map.\n";
+        } else if( !arrayMap[source].count(target) ) {
+          errs() << "  error [" << F->getName() << "()]: target '"
+                 << target->getName()
+                 << "' does not exist in the array map.\n";
+        } else if( !arrayMap[source][target].count(duplicateNumber) ) {
+          errs() << "  error [" << F->getName() << "()]: edge "
+                 << source->getName() << " -> " << target->getName()
+                 << " duplicate number " << duplicateNumber
+                 << " does not exist in the array map.\n";
+        } else {
+          edgeArray[arrayMap[source][target][duplicateNumber]]
+            += currentPath->getCount();
+        }
+      }
+
+      DEBUG(errs() << "\n");
+
+      delete pev;
+    }
+  }
+
+  std::string errorInfo;
+  std::string filename = EdgeProfileFilename;
+
+  // Open a handle to the file
+  FILE* edgeFile = fopen(filename.c_str(),"wb");
+
+  if (!edgeFile) {
+    errs() << "error: unable to open file '" << filename << "' for output.\n";
+    return false;
+  }
+
+  errs() << "Generating edge profile '" << filename << "' ...\n";
+
+  // write argument info
+  unsigned type = ArgumentInfo;
+  unsigned num = pathProfileInfo.argList.size();
+  int zeros = 0;
+
+  fwrite(&type,sizeof(unsigned),1,edgeFile);
+  fwrite(&num,sizeof(unsigned),1,edgeFile);
+  fwrite(pathProfileInfo.argList.c_str(),1,num,edgeFile);
+  if (num&3)
+    fwrite(&zeros, 1, 4-(num&3), edgeFile);
+
+  type = EdgeInfo;
+  num = edgeArray.size();
+  fwrite(&type,sizeof(unsigned),1,edgeFile);
+  fwrite(&num,sizeof(unsigned),1,edgeFile);
+
+  // write each edge to the file
+  for( std::vector<unsigned>::iterator s = edgeArray.begin(),
+         e = edgeArray.end(); s != e; s++)
+    fwrite(&*s, sizeof (unsigned), 1, edgeFile);
+
+  fclose (edgeFile);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Analysis/PostDominators.cpp b/contrib/llvm/lib/Analysis/PostDominators.cpp
new file mode 100644
index 000000000000..96804a01edc6
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/PostDominators.cpp
@@ -0,0 +1,51 @@
+//===- PostDominators.cpp - Post-Dominator Calculation --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the post-dominator construction algorithms.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "postdomtree"
+
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  PostDominatorTree Implementation
+//===----------------------------------------------------------------------===//
+
+char PostDominatorTree::ID = 0;
+INITIALIZE_PASS(PostDominatorTree, "postdomtree",
+                "Post-Dominator Tree Construction", true, true)
+
+bool PostDominatorTree::runOnFunction(Function &F) {
+  DT->recalculate(F);
+  return false;
+}
+
+PostDominatorTree::~PostDominatorTree() {
+  delete DT;
+}
+
+void PostDominatorTree::print(raw_ostream &OS, const Module *) const {
+  DT->print(OS);
+}
+
+
+FunctionPass* llvm::createPostDomTree() {
+  return new PostDominatorTree();
+}
+
diff --git a/contrib/llvm/lib/Analysis/ProfileDataLoader.cpp b/contrib/llvm/lib/Analysis/ProfileDataLoader.cpp
new file mode 100644
index 000000000000..d7f444b4b6d7
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ProfileDataLoader.cpp
@@ -0,0 +1,155 @@
+//===- ProfileDataLoader.cpp - Load profile information from disk ---------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ProfileDataLoader class is used to load raw profiling data from the dump
+// file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ProfileDataLoader.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Analysis/ProfileDataTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <cstdio>
+#include <cstdlib>
+using namespace llvm;
+
+raw_ostream &llvm::operator<<(raw_ostream &O, std::pair<const BasicBlock *,
+                                                        const BasicBlock *> E) {
+  O << "(";
+
+  if (E.first)
+    O << E.first->getName();
+  else
+    O << "0";
+
+  O << ",";
+
+  if (E.second)
+    O << E.second->getName();
+  else
+    O << "0";
+
+  return O << ")";
+}
+
+/// AddCounts - Add 'A' and 'B', accounting for the fact that the value of one
+/// (or both) may not be defined.
+static unsigned AddCounts(unsigned A, unsigned B) {
+  // If either value is undefined, use the other.
+  // Undefined + undefined = undefined.
+  if (A == ProfileDataLoader::Uncounted) return B;
+  if (B == ProfileDataLoader::Uncounted) return A;
+
+  return A + B;
+}
+
+/// ReadProfilingData - Load 'NumEntries' items of type 'T' from file 'F'
+template <typename T>
+static void ReadProfilingData(const char *ToolName, FILE *F,
+                              T *Data, size_t NumEntries) {
+  // Read in the block of data...
+  if (fread(Data, sizeof(T), NumEntries, F) != NumEntries)
+    report_fatal_error(Twine(ToolName) + ": Profiling data truncated");
+}
+
+/// ReadProfilingNumEntries - Read how many entries are in this profiling data
+/// packet.
+static unsigned ReadProfilingNumEntries(const char *ToolName, FILE *F,
+                                        bool ShouldByteSwap) {
+  unsigned Entry;
+  ReadProfilingData<unsigned>(ToolName, F, &Entry, 1);
+  return ShouldByteSwap ? ByteSwap_32(Entry) : Entry;
+}
+
+/// ReadProfilingBlock - Read the number of entries in the next profiling data
+/// packet and then accumulate the entries into 'Data'.
+static void ReadProfilingBlock(const char *ToolName, FILE *F,
+                               bool ShouldByteSwap,
+                               SmallVector<unsigned, 32> &Data) {
+  // Read the number of entries...
+  unsigned NumEntries = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
+
+  // Read in the data.
+  SmallVector<unsigned, 8> TempSpace(NumEntries);
+  ReadProfilingData<unsigned>(ToolName, F, TempSpace.data(), NumEntries);
+
+  // Make sure we have enough space ...
+  if (Data.size() < NumEntries)
+    Data.resize(NumEntries, ProfileDataLoader::Uncounted);
+
+  // Accumulate the data we just read into the existing data.
+  for (unsigned i = 0; i < NumEntries; ++i) {
+    unsigned Entry = ShouldByteSwap ? ByteSwap_32(TempSpace[i]) : TempSpace[i];
+    Data[i] = AddCounts(Entry, Data[i]);
+  }
+}
+
+/// ReadProfilingArgBlock - Read the command line arguments that the progam was
+/// run with when the current profiling data packet(s) were generated.
+static void ReadProfilingArgBlock(const char *ToolName, FILE *F,
+                                  bool ShouldByteSwap,
+                                  SmallVector<std::string, 1> &CommandLines) {
+  // Read the number of bytes ...
+  unsigned ArgLength = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
+
+  // Read in the arguments (if there are any to read).  Round up the length to
+  // the nearest 4-byte multiple.
+  SmallVector<char, 8> Args(ArgLength+4);
+  if (ArgLength)
+    ReadProfilingData<char>(ToolName, F, Args.data(), (ArgLength+3) & ~3);
+
+  // Store the arguments.
+  CommandLines.push_back(std::string(&Args[0], &Args[ArgLength]));
+}
+
+const unsigned ProfileDataLoader::Uncounted = ~0U;
+
+/// ProfileDataLoader ctor - Read the specified profiling data file, reporting
+/// a fatal error if the file is invalid or broken.
+ProfileDataLoader::ProfileDataLoader(const char *ToolName,
+                                     const std::string &Filename)
+  : Filename(Filename) {
+  FILE *F = fopen(Filename.c_str(), "rb");
+  if (F == 0)
+    report_fatal_error(Twine(ToolName) + ": Error opening '" +
+                       Filename + "': ");
+
+  // Keep reading packets until we run out of them.
+  unsigned PacketType;
+  while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
+    // If the low eight bits of the packet are zero, we must be dealing with an
+    // endianness mismatch.  Byteswap all words read from the profiling
+    // information.  This can happen when the compiler host and target have
+    // different endianness.
+    bool ShouldByteSwap = (char)PacketType == 0;
+    PacketType = ShouldByteSwap ? ByteSwap_32(PacketType) : PacketType;
+
+    switch (PacketType) {
+      case ArgumentInfo:
+        ReadProfilingArgBlock(ToolName, F, ShouldByteSwap, CommandLines);
+        break;
+
+      case EdgeInfo:
+        ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
+        break;
+
+      default:
+        report_fatal_error(std::string(ToolName)
+                           + ": Unknown profiling packet type");
+        break;
+    }
+  }
+
+  fclose(F);
+}
diff --git a/contrib/llvm/lib/Analysis/ProfileDataLoaderPass.cpp b/contrib/llvm/lib/Analysis/ProfileDataLoaderPass.cpp
new file mode 100644
index 000000000000..2ee0093a8f57
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ProfileDataLoaderPass.cpp
@@ -0,0 +1,188 @@
+//===- ProfileDataLoaderPass.cpp - Set branch weight metadata from prof ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loads profiling data from a dump file and sets branch weight
+// metadata.
+//
+// TODO: Replace all "profile-metadata-loader" strings with "profile-loader"
+// once ProfileInfo etc. has been removed.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "profile-metadata-loader"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileDataLoader.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumEdgesRead, "The # of edges read.");
+STATISTIC(NumTermsAnnotated, "The # of terminator instructions annotated.");
+
+static cl::opt<std::string>
+ProfileMetadataFilename("profile-file", cl::init("llvmprof.out"),
+                  cl::value_desc("filename"),
+                  cl::desc("Profile file loaded by -profile-metadata-loader"));
+
+namespace {
+  /// This pass loads profiling data from a dump file and sets branch weight
+  /// metadata.
+  class ProfileMetadataLoaderPass : public ModulePass {
+    std::string Filename;
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    explicit ProfileMetadataLoaderPass(const std::string &filename = "")
+        : ModulePass(ID), Filename(filename) {
+      initializeProfileMetadataLoaderPassPass(*PassRegistry::getPassRegistry());
+      if (filename.empty()) Filename = ProfileMetadataFilename;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+
+    virtual const char *getPassName() const {
+      return "Profile loader";
+    }
+
+    virtual void readEdge(unsigned, ProfileData&, ProfileData::Edge,
+                          ArrayRef<unsigned>);
+    virtual unsigned matchEdges(Module&, ProfileData&, ArrayRef<unsigned>);
+    virtual void setBranchWeightMetadata(Module&, ProfileData&);
+
+    virtual bool runOnModule(Module &M);
+  };
+}  // End of anonymous namespace
+
+char ProfileMetadataLoaderPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ProfileMetadataLoaderPass, "profile-metadata-loader",
+              "Load profile information from llvmprof.out", false, true)
+INITIALIZE_PASS_END(ProfileMetadataLoaderPass, "profile-metadata-loader",
+              "Load profile information from llvmprof.out", false, true)
+
+char &llvm::ProfileMetadataLoaderPassID = ProfileMetadataLoaderPass::ID;
+
+/// createProfileMetadataLoaderPass - This function returns a Pass that loads
+/// the profiling information for the module from the specified filename,
+/// making it available to the optimizers.
+ModulePass *llvm::createProfileMetadataLoaderPass() { 
+    return new ProfileMetadataLoaderPass();
+}
+ModulePass *llvm::createProfileMetadataLoaderPass(const std::string &Filename) {
+  return new ProfileMetadataLoaderPass(Filename);
+}
+
+/// readEdge - Take the value from a profile counter and assign it to an edge.
+void ProfileMetadataLoaderPass::readEdge(unsigned ReadCount,
+                                         ProfileData &PB, ProfileData::Edge e,
+                                         ArrayRef<unsigned> Counters) {
+  if (ReadCount >= Counters.size()) return;
+
+  unsigned weight = Counters[ReadCount];
+  assert(weight != ProfileDataLoader::Uncounted);
+  PB.addEdgeWeight(e, weight);
+
+  DEBUG(dbgs() << "-- Read Edge Counter for " << e
+               << " (# "<< (ReadCount) << "): "
+               << PB.getEdgeWeight(e) << "\n");
+}
+
+/// matchEdges - Link every profile counter with an edge.
+unsigned ProfileMetadataLoaderPass::matchEdges(Module &M, ProfileData &PB,
+                                               ArrayRef<unsigned> Counters) {
+  if (Counters.size() == 0) return 0;
+
+  unsigned ReadCount = 0;
+
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration()) continue;
+    DEBUG(dbgs() << "Loading edges in '" << F->getName() << "'\n");
+    readEdge(ReadCount++, PB, PB.getEdge(0, &F->getEntryBlock()), Counters);
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      TerminatorInst *TI = BB->getTerminator();
+      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
+        readEdge(ReadCount++, PB, PB.getEdge(BB,TI->getSuccessor(s)),
+                 Counters);
+      }
+    }
+  }
+
+  return ReadCount;
+}
+
+/// setBranchWeightMetadata - Translate the counter values associated with each
+/// edge into branch weights for each conditional branch (a branch with 2 or
+/// more desinations).
+void ProfileMetadataLoaderPass::setBranchWeightMetadata(Module &M,
+                                                        ProfileData &PB) {
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration()) continue;
+    DEBUG(dbgs() << "Setting branch metadata in '" << F->getName() << "'\n");
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      TerminatorInst *TI = BB->getTerminator();
+      unsigned NumSuccessors = TI->getNumSuccessors();
+
+      // If there is only one successor then we can not set a branch
+      // probability as the target is certain.
+      if (NumSuccessors < 2) continue;
+
+      // Load the weights of all edges leading from this terminator.
+      DEBUG(dbgs() << "-- Terminator with " << NumSuccessors
+                   << " successors:\n");
+      SmallVector<uint32_t, 4> Weights(NumSuccessors);
+      for (unsigned s = 0 ; s < NumSuccessors ; ++s) {
+          ProfileData::Edge edge = PB.getEdge(BB, TI->getSuccessor(s));
+          Weights[s] = (uint32_t)PB.getEdgeWeight(edge);
+          DEBUG(dbgs() << "---- Edge '" << edge << "' has weight "
+                       << Weights[s] << "\n");
+      }
+
+      // Set branch weight metadata.  This will set branch probabilities of
+      // 100%/0% if that is true of the dynamic execution.
+      // BranchProbabilityInfo can account for this when it loads this metadata
+      // (it gives the unexectuted branch a weight of 1 for the purposes of
+      // probability calculations).
+      MDBuilder MDB(TI->getContext());
+      MDNode *Node = MDB.createBranchWeights(Weights);
+      TI->setMetadata(LLVMContext::MD_prof, Node);
+      NumTermsAnnotated++;
+    }
+  }
+}
+
+bool ProfileMetadataLoaderPass::runOnModule(Module &M) {
+  ProfileDataLoader PDL("profile-data-loader", Filename);
+  ProfileData PB;
+
+  ArrayRef<unsigned> Counters = PDL.getRawEdgeCounts();
+
+  unsigned ReadCount = matchEdges(M, PB, Counters);
+
+  if (ReadCount != Counters.size()) {
+    errs() << "WARNING: profile information is inconsistent with "
+           << "the current program!\n";
+  }
+  NumEdgesRead = ReadCount;
+
+  setBranchWeightMetadata(M, PB);
+
+  return ReadCount > 0;
+}
diff --git a/contrib/llvm/lib/Analysis/ProfileEstimatorPass.cpp b/contrib/llvm/lib/Analysis/ProfileEstimatorPass.cpp
new file mode 100644
index 000000000000..b284b995ac78
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ProfileEstimatorPass.cpp
@@ -0,0 +1,426 @@
+//===- ProfileEstimatorPass.cpp - LLVM Pass to estimate profile info ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a concrete implementation of profiling information that
+// estimates the profiling information in a very crude and unimaginative way.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "profile-estimator"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static cl::opt<double>
+LoopWeight(
+    "profile-estimator-loop-weight", cl::init(10),
+    cl::value_desc("loop-weight"),
+    cl::desc("Number of loop executions used for profile-estimator")
+);
+
+namespace {
+  class ProfileEstimatorPass : public FunctionPass, public ProfileInfo {
+    double ExecCount;
+    LoopInfo *LI;
+    std::set<BasicBlock*>  BBToVisit;
+    std::map<Loop*,double> LoopExitWeights;
+    std::map<Edge,double>  MinimalWeight;
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    explicit ProfileEstimatorPass(const double execcount = 0)
+        : FunctionPass(ID), ExecCount(execcount) {
+      initializeProfileEstimatorPassPass(*PassRegistry::getPassRegistry());
+      if (execcount == 0) ExecCount = LoopWeight;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequired<LoopInfo>();
+    }
+
+    virtual const char *getPassName() const {
+      return "Profiling information estimator";
+    }
+
+    /// run - Estimate the profile information from the specified file.
+    virtual bool runOnFunction(Function &F);
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+      if (PI == &ProfileInfo::ID)
+        return (ProfileInfo*)this;
+      return this;
+    }
+    
+    virtual void recurseBasicBlock(BasicBlock *BB);
+
+    void inline printEdgeWeight(Edge);
+  };
+}  // End of anonymous namespace
+
+char ProfileEstimatorPass::ID = 0;
+INITIALIZE_AG_PASS_BEGIN(ProfileEstimatorPass, ProfileInfo, "profile-estimator",
+                "Estimate profiling information", false, true, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_AG_PASS_END(ProfileEstimatorPass, ProfileInfo, "profile-estimator",
+                "Estimate profiling information", false, true, false)
+
+namespace llvm {
+  char &ProfileEstimatorPassID = ProfileEstimatorPass::ID;
+
+  FunctionPass *createProfileEstimatorPass() {
+    return new ProfileEstimatorPass();
+  }
+
+  /// createProfileEstimatorPass - This function returns a Pass that estimates
+  /// profiling information using the given loop execution count.
+  Pass *createProfileEstimatorPass(const unsigned execcount) {
+    return new ProfileEstimatorPass(execcount);
+  }
+}
+
+static double ignoreMissing(double w) {
+  if (w == ProfileInfo::MissingValue) return 0;
+  return w;
+}
+
+static void inline printEdgeError(ProfileInfo::Edge e, const char *M) {
+  DEBUG(dbgs() << "-- Edge " << e << " is not calculated, " << M << "\n");
+}
+
+void inline ProfileEstimatorPass::printEdgeWeight(Edge E) {
+  DEBUG(dbgs() << "-- Weight of Edge " << E << ":"
+               << format("%20.20g", getEdgeWeight(E)) << "\n");
+}
+
+// recurseBasicBlock() - This calculates the ProfileInfo estimation for a
+// single block and then recurses into the successors.
+// The algorithm preserves the flow condition, meaning that the sum of the
+// weight of the incoming edges must be equal the block weight which must in
+// turn be equal to the sume of the weights of the outgoing edges.
+// Since the flow of an block is deterimined from the current state of the
+// flow, once an edge has a flow assigned this flow is never changed again,
+// otherwise it would be possible to violate the flow condition in another
+// block.
+void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
+
+  // Break the recursion if this BasicBlock was already visited.
+  if (BBToVisit.find(BB) == BBToVisit.end()) return;
+
+  // Read the LoopInfo for this block.
+  bool  BBisHeader = LI->isLoopHeader(BB);
+  Loop* BBLoop     = LI->getLoopFor(BB);
+
+  // To get the block weight, read all incoming edges.
+  double BBWeight = 0;
+  std::set<BasicBlock*> ProcessedPreds;
+  for ( pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+        bbi != bbe; ++bbi ) {
+    // If this block was not considered already, add weight.
+    Edge edge = getEdge(*bbi,BB);
+    double w = getEdgeWeight(edge);
+    if (ProcessedPreds.insert(*bbi).second) {
+      BBWeight += ignoreMissing(w);
+    }
+    // If this block is a loop header and the predecessor is contained in this
+    // loop, thus the edge is a backedge, continue and do not check if the
+    // value is valid.
+    if (BBisHeader && BBLoop->contains(*bbi)) {
+      printEdgeError(edge, "but is backedge, continuing");
+      continue;
+    }
+    // If the edges value is missing (and this is no loop header, and this is
+    // no backedge) return, this block is currently non estimatable.
+    if (w == MissingValue) {
+      printEdgeError(edge, "returning");
+      return;
+    }
+  }
+  if (getExecutionCount(BB) != MissingValue) {
+    BBWeight = getExecutionCount(BB);
+  }
+
+  // Fetch all necessary information for current block.
+  SmallVector<Edge, 8> ExitEdges;
+  SmallVector<Edge, 8> Edges;
+  if (BBLoop) {
+    BBLoop->getExitEdges(ExitEdges);
+  }
+
+  // If this is a loop header, consider the following:
+  // Exactly the flow that is entering this block, must exit this block too. So
+  // do the following: 
+  // *) get all the exit edges, read the flow that is already leaving this
+  // loop, remember the edges that do not have any flow on them right now.
+  // (The edges that have already flow on them are most likely exiting edges of
+  // other loops, do not touch those flows because the previously caclulated
+  // loopheaders would not be exact anymore.)
+  // *) In case there is not a single exiting edge left, create one at the loop
+  // latch to prevent the flow from building up in the loop.
+  // *) Take the flow that is not leaving the loop already and distribute it on
+  // the remaining exiting edges.
+  // (This ensures that all flow that enters the loop also leaves it.)
+  // *) Increase the flow into the loop by increasing the weight of this block.
+  // There is at least one incoming backedge that will bring us this flow later
+  // on. (So that the flow condition in this node is valid again.)
+  if (BBisHeader) {
+    double incoming = BBWeight;
+    // Subtract the flow leaving the loop.
+    std::set<Edge> ProcessedExits;
+    for (SmallVector<Edge, 8>::iterator ei = ExitEdges.begin(),
+         ee = ExitEdges.end(); ei != ee; ++ei) {
+      if (ProcessedExits.insert(*ei).second) {
+        double w = getEdgeWeight(*ei);
+        if (w == MissingValue) {
+          Edges.push_back(*ei);
+          // Check if there is a necessary minimal weight, if yes, subtract it 
+          // from weight.
+          if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
+            incoming -= MinimalWeight[*ei];
+            DEBUG(dbgs() << "Reserving " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
+          }
+        } else {
+          incoming -= w;
+        }
+      }
+    }
+    // If no exit edges, create one:
+    if (Edges.size() == 0) {
+      BasicBlock *Latch = BBLoop->getLoopLatch();
+      if (Latch) {
+        Edge edge = getEdge(Latch,0);
+        EdgeInformation[BB->getParent()][edge] = BBWeight;
+        printEdgeWeight(edge);
+        edge = getEdge(Latch, BB);
+        EdgeInformation[BB->getParent()][edge] = BBWeight * ExecCount;
+        printEdgeWeight(edge);
+      }
+    }
+
+    // Distribute remaining weight to the exting edges. To prevent fractions
+    // from building up and provoking precision problems the weight which is to
+    // be distributed is split and the rounded, the last edge gets a somewhat
+    // bigger value, but we are close enough for an estimation.
+    double fraction = floor(incoming/Edges.size());
+    for (SmallVector<Edge, 8>::iterator ei = Edges.begin(), ee = Edges.end();
+         ei != ee; ++ei) {
+      double w = 0;
+      if (ei != (ee-1)) {
+        w = fraction;
+        incoming -= fraction;
+      } else {
+        w = incoming;
+      }
+      EdgeInformation[BB->getParent()][*ei] += w;
+      // Read necessary minimal weight.
+      if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
+        EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
+        DEBUG(dbgs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
+      }
+      printEdgeWeight(*ei);
+      
+      // Add minimal weight to paths to all exit edges, this is used to ensure
+      // that enough flow is reaching this edges.
+      Path p;
+      const BasicBlock *Dest = GetPath(BB, (*ei).first, p, GetPathToDest);
+      while (Dest != BB) {
+        const BasicBlock *Parent = p.find(Dest)->second;
+        Edge e = getEdge(Parent, Dest);
+        if (MinimalWeight.find(e) == MinimalWeight.end()) {
+          MinimalWeight[e] = 0;
+        }
+        MinimalWeight[e] += w;
+        DEBUG(dbgs() << "Minimal Weight for " << e << ": " << format("%.20g",MinimalWeight[e]) << "\n");
+        Dest = Parent;
+      }
+    }
+    // Increase flow into the loop.
+    BBWeight *= (ExecCount+1);
+  }
+
+  BlockInformation[BB->getParent()][BB] = BBWeight;
+  // Up until now we considered only the loop exiting edges, now we have a
+  // definite block weight and must distribute this onto the outgoing edges.
+  // Since there may be already flow attached to some of the edges, read this
+  // flow first and remember the edges that have still now flow attached.
+  Edges.clear();
+  std::set<BasicBlock*> ProcessedSuccs;
+
+  succ_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
+  // Also check for (BB,0) edges that may already contain some flow. (But only
+  // in case there are no successors.)
+  if (bbi == bbe) {
+    Edge edge = getEdge(BB,0);
+    EdgeInformation[BB->getParent()][edge] = BBWeight;
+    printEdgeWeight(edge);
+  }
+  for ( ; bbi != bbe; ++bbi ) {
+    if (ProcessedSuccs.insert(*bbi).second) {
+      Edge edge = getEdge(BB,*bbi);
+      double w = getEdgeWeight(edge);
+      if (w != MissingValue) {
+        BBWeight -= getEdgeWeight(edge);
+      } else {
+        Edges.push_back(edge);
+        // If minimal weight is necessary, reserve weight by subtracting weight
+        // from block weight, this is readded later on.
+        if (MinimalWeight.find(edge) != MinimalWeight.end()) {
+          BBWeight -= MinimalWeight[edge];
+          DEBUG(dbgs() << "Reserving " << format("%.20g",MinimalWeight[edge]) << " at " << edge << "\n");
+        }
+      }
+    }
+  }
+
+  double fraction = Edges.size() ? floor(BBWeight/Edges.size()) : 0.0;
+  // Finally we know what flow is still not leaving the block, distribute this
+  // flow onto the empty edges.
+  for (SmallVector<Edge, 8>::iterator ei = Edges.begin(), ee = Edges.end();
+       ei != ee; ++ei) {
+    if (ei != (ee-1)) {
+      EdgeInformation[BB->getParent()][*ei] += fraction;
+      BBWeight -= fraction;
+    } else {
+      EdgeInformation[BB->getParent()][*ei] += BBWeight;
+    }
+    // Readd minial necessary weight.
+    if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
+      EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
+      DEBUG(dbgs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
+    }
+    printEdgeWeight(*ei);
+  }
+
+  // This block is visited, mark this before the recursion.
+  BBToVisit.erase(BB);
+
+  // Recurse into successors.
+  for (succ_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
+       bbi != bbe; ++bbi) {
+    recurseBasicBlock(*bbi);
+  }
+}
+
+bool ProfileEstimatorPass::runOnFunction(Function &F) {
+  if (F.isDeclaration()) return false;
+
+  // Fetch LoopInfo and clear ProfileInfo for this function.
+  LI = &getAnalysis<LoopInfo>();
+  FunctionInformation.erase(&F);
+  BlockInformation[&F].clear();
+  EdgeInformation[&F].clear();
+  BBToVisit.clear();
+
+  // Mark all blocks as to visit.
+  for (Function::iterator bi = F.begin(), be = F.end(); bi != be; ++bi)
+    BBToVisit.insert(bi);
+
+  // Clear Minimal Edges.
+  MinimalWeight.clear();
+
+  DEBUG(dbgs() << "Working on function " << F.getName() << "\n");
+
+  // Since the entry block is the first one and has no predecessors, the edge
+  // (0,entry) is inserted with the starting weight of 1.
+  BasicBlock *entry = &F.getEntryBlock();
+  BlockInformation[&F][entry] = pow(2.0, 32.0);
+  Edge edge = getEdge(0,entry);
+  EdgeInformation[&F][edge] = BlockInformation[&F][entry];
+  printEdgeWeight(edge);
+
+  // Since recurseBasicBlock() maybe returns with a block which was not fully
+  // estimated, use recurseBasicBlock() until everything is calculated.
+  bool cleanup = false;
+  recurseBasicBlock(entry);
+  while (BBToVisit.size() > 0 && !cleanup) {
+    // Remember number of open blocks, this is later used to check if progress
+    // was made.
+    unsigned size = BBToVisit.size();
+
+    // Try to calculate all blocks in turn.
+    for (std::set<BasicBlock*>::iterator bi = BBToVisit.begin(),
+         be = BBToVisit.end(); bi != be; ++bi) {
+      recurseBasicBlock(*bi);
+      // If at least one block was finished, break because iterator may be
+      // invalid.
+      if (BBToVisit.size() < size) break;
+    }
+
+    // If there was not a single block resolved, make some assumptions.
+    if (BBToVisit.size() == size) {
+      bool found = false;
+      for (std::set<BasicBlock*>::iterator BBI = BBToVisit.begin(), BBE = BBToVisit.end(); 
+           (BBI != BBE) && (!found); ++BBI) {
+        BasicBlock *BB = *BBI;
+        // Try each predecessor if it can be assumend.
+        for (pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+             (bbi != bbe) && (!found); ++bbi) {
+          Edge e = getEdge(*bbi,BB);
+          double w = getEdgeWeight(e);
+          // Check that edge from predecessor is still free.
+          if (w == MissingValue) {
+            // Check if there is a circle from this block to predecessor.
+            Path P;
+            const BasicBlock *Dest = GetPath(BB, *bbi, P, GetPathToDest);
+            if (Dest != *bbi) {
+              // If there is no circle, just set edge weight to 0
+              EdgeInformation[&F][e] = 0;
+              DEBUG(dbgs() << "Assuming edge weight: ");
+              printEdgeWeight(e);
+              found = true;
+            }
+          }
+        }
+      }
+      if (!found) {
+        cleanup = true;
+        DEBUG(dbgs() << "No assumption possible in Fuction "<<F.getName()<<", setting all to zero\n");
+      }
+    }
+  }
+  // In case there was no safe way to assume edges, set as a last measure, 
+  // set _everything_ to zero.
+  if (cleanup) {
+    FunctionInformation[&F] = 0;
+    BlockInformation[&F].clear();
+    EdgeInformation[&F].clear();
+    for (Function::const_iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+      const BasicBlock *BB = &(*FI);
+      BlockInformation[&F][BB] = 0;
+      const_pred_iterator predi = pred_begin(BB), prede = pred_end(BB);
+      if (predi == prede) {
+        Edge e = getEdge(0,BB);
+        setEdgeWeight(e,0);
+      }
+      for (;predi != prede; ++predi) {
+        Edge e = getEdge(*predi,BB);
+        setEdgeWeight(e,0);
+      }
+      succ_const_iterator succi = succ_begin(BB), succe = succ_end(BB);
+      if (succi == succe) {
+        Edge e = getEdge(BB,0);
+        setEdgeWeight(e,0);
+      }
+      for (;succi != succe; ++succi) {
+        Edge e = getEdge(*succi,BB);
+        setEdgeWeight(e,0);
+      }
+    }
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Analysis/ProfileInfo.cpp b/contrib/llvm/lib/Analysis/ProfileInfo.cpp
new file mode 100644
index 000000000000..9626a48b9d0d
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ProfileInfo.cpp
@@ -0,0 +1,1079 @@
+//===- ProfileInfo.cpp - Profile Info Interface ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the abstract ProfileInfo interface, and the default
+// "no profile" implementation.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "profile-info"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include <limits>
+#include <queue>
+#include <set>
+using namespace llvm;
+
+namespace llvm {
+  template<> char ProfileInfoT<Function,BasicBlock>::ID = 0;
+}
+
+// Register the ProfileInfo interface, providing a nice name to refer to.
+INITIALIZE_ANALYSIS_GROUP(ProfileInfo, "Profile Information", NoProfileInfo)
+
+namespace llvm {
+
+template <>
+ProfileInfoT<MachineFunction, MachineBasicBlock>::ProfileInfoT() {}
+template <>
+ProfileInfoT<MachineFunction, MachineBasicBlock>::~ProfileInfoT() {}
+
+template <>
+ProfileInfoT<Function, BasicBlock>::ProfileInfoT() {
+  MachineProfile = 0;
+}
+template <>
+ProfileInfoT<Function, BasicBlock>::~ProfileInfoT() {
+  if (MachineProfile) delete MachineProfile;
+}
+
+template<>
+char ProfileInfoT<MachineFunction, MachineBasicBlock>::ID = 0;
+
+template<>
+const double ProfileInfoT<Function,BasicBlock>::MissingValue = -1;
+
+template<> const
+double ProfileInfoT<MachineFunction, MachineBasicBlock>::MissingValue = -1;
+
+template<> double
+ProfileInfoT<Function,BasicBlock>::getExecutionCount(const BasicBlock *BB) {
+  std::map<const Function*, BlockCounts>::iterator J =
+    BlockInformation.find(BB->getParent());
+  if (J != BlockInformation.end()) {
+    BlockCounts::iterator I = J->second.find(BB);
+    if (I != J->second.end())
+      return I->second;
+  }
+
+  double Count = MissingValue;
+
+  const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+  // Are there zero predecessors of this block?
+  if (PI == PE) {
+    Edge e = getEdge(0, BB);
+    Count = getEdgeWeight(e);
+  } else {
+    // Otherwise, if there are predecessors, the execution count of this block is
+    // the sum of the edge frequencies from the incoming edges.
+    std::set<const BasicBlock*> ProcessedPreds;
+    Count = 0;
+    for (; PI != PE; ++PI) {
+      const BasicBlock *P = *PI;
+      if (ProcessedPreds.insert(P).second) {
+        double w = getEdgeWeight(getEdge(P, BB));
+        if (w == MissingValue) {
+          Count = MissingValue;
+          break;
+        }
+        Count += w;
+      }
+    }
+  }
+
+  // If the predecessors did not suffice to get block weight, try successors.
+  if (Count == MissingValue) {
+
+    succ_const_iterator SI = succ_begin(BB), SE = succ_end(BB);
+
+    // Are there zero successors of this block?
+    if (SI == SE) {
+      Edge e = getEdge(BB,0);
+      Count = getEdgeWeight(e);
+    } else {
+      std::set<const BasicBlock*> ProcessedSuccs;
+      Count = 0;
+      for (; SI != SE; ++SI)
+        if (ProcessedSuccs.insert(*SI).second) {
+          double w = getEdgeWeight(getEdge(BB, *SI));
+          if (w == MissingValue) {
+            Count = MissingValue;
+            break;
+          }
+          Count += w;
+        }
+    }
+  }
+
+  if (Count != MissingValue) BlockInformation[BB->getParent()][BB] = Count;
+  return Count;
+}
+
+template<>
+double ProfileInfoT<MachineFunction, MachineBasicBlock>::
+        getExecutionCount(const MachineBasicBlock *MBB) {
+  std::map<const MachineFunction*, BlockCounts>::iterator J =
+    BlockInformation.find(MBB->getParent());
+  if (J != BlockInformation.end()) {
+    BlockCounts::iterator I = J->second.find(MBB);
+    if (I != J->second.end())
+      return I->second;
+  }
+
+  return MissingValue;
+}
+
+template<>
+double ProfileInfoT<Function,BasicBlock>::getExecutionCount(const Function *F) {
+  std::map<const Function*, double>::iterator J =
+    FunctionInformation.find(F);
+  if (J != FunctionInformation.end())
+    return J->second;
+
+  // isDeclaration() is checked here and not at start of function to allow
+  // functions without a body still to have a execution count.
+  if (F->isDeclaration()) return MissingValue;
+
+  double Count = getExecutionCount(&F->getEntryBlock());
+  if (Count != MissingValue) FunctionInformation[F] = Count;
+  return Count;
+}
+
+template<>
+double ProfileInfoT<MachineFunction, MachineBasicBlock>::
+        getExecutionCount(const MachineFunction *MF) {
+  std::map<const MachineFunction*, double>::iterator J =
+    FunctionInformation.find(MF);
+  if (J != FunctionInformation.end())
+    return J->second;
+
+  double Count = getExecutionCount(&MF->front());
+  if (Count != MissingValue) FunctionInformation[MF] = Count;
+  return Count;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        setExecutionCount(const BasicBlock *BB, double w) {
+  DEBUG(dbgs() << "Creating Block " << BB->getName() 
+               << " (weight: " << format("%.20g",w) << ")\n");
+  BlockInformation[BB->getParent()][BB] = w;
+}
+
+template<>
+void ProfileInfoT<MachineFunction, MachineBasicBlock>::
+        setExecutionCount(const MachineBasicBlock *MBB, double w) {
+  DEBUG(dbgs() << "Creating Block " << MBB->getBasicBlock()->getName()
+               << " (weight: " << format("%.20g",w) << ")\n");
+  BlockInformation[MBB->getParent()][MBB] = w;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::addEdgeWeight(Edge e, double w) {
+  double oldw = getEdgeWeight(e);
+  assert (oldw != MissingValue && "Adding weight to Edge with no previous weight");
+  DEBUG(dbgs() << "Adding to Edge " << e
+               << " (new weight: " << format("%.20g",oldw + w) << ")\n");
+  EdgeInformation[getFunction(e)][e] = oldw + w;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        addExecutionCount(const BasicBlock *BB, double w) {
+  double oldw = getExecutionCount(BB);
+  assert (oldw != MissingValue && "Adding weight to Block with no previous weight");
+  DEBUG(dbgs() << "Adding to Block " << BB->getName()
+               << " (new weight: " << format("%.20g",oldw + w) << ")\n");
+  BlockInformation[BB->getParent()][BB] = oldw + w;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::removeBlock(const BasicBlock *BB) {
+  std::map<const Function*, BlockCounts>::iterator J =
+    BlockInformation.find(BB->getParent());
+  if (J == BlockInformation.end()) return;
+
+  DEBUG(dbgs() << "Deleting " << BB->getName() << "\n");
+  J->second.erase(BB);
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::removeEdge(Edge e) {
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(getFunction(e));
+  if (J == EdgeInformation.end()) return;
+
+  DEBUG(dbgs() << "Deleting" << e << "\n");
+  J->second.erase(e);
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        replaceEdge(const Edge &oldedge, const Edge &newedge) {
+  double w;
+  if ((w = getEdgeWeight(newedge)) == MissingValue) {
+    w = getEdgeWeight(oldedge);
+    DEBUG(dbgs() << "Replacing " << oldedge << " with " << newedge  << "\n");
+  } else {
+    w += getEdgeWeight(oldedge);
+    DEBUG(dbgs() << "Adding " << oldedge << " to " << newedge  << "\n");
+  }
+  setEdgeWeight(newedge,w);
+  removeEdge(oldedge);
+}
+
+template<>
+const BasicBlock *ProfileInfoT<Function,BasicBlock>::
+        GetPath(const BasicBlock *Src, const BasicBlock *Dest,
+                Path &P, unsigned Mode) {
+  const BasicBlock *BB = 0;
+  bool hasFoundPath = false;
+
+  std::queue<const BasicBlock *> BFS;
+  BFS.push(Src);
+
+  while(BFS.size() && !hasFoundPath) {
+    BB = BFS.front();
+    BFS.pop();
+
+    succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
+    if (Succ == End) {
+      P[(const BasicBlock*)0] = BB;
+      if (Mode & GetPathToExit) {
+        hasFoundPath = true;
+        BB = 0;
+      }
+    }
+    for(;Succ != End; ++Succ) {
+      if (P.find(*Succ) != P.end()) continue;
+      Edge e = getEdge(BB,*Succ);
+      if ((Mode & GetPathWithNewEdges) && (getEdgeWeight(e) != MissingValue)) continue;
+      P[*Succ] = BB;
+      BFS.push(*Succ);
+      if ((Mode & GetPathToDest) && *Succ == Dest) {
+        hasFoundPath = true;
+        BB = *Succ;
+        break;
+      }
+      if ((Mode & GetPathToValue) && (getExecutionCount(*Succ) != MissingValue)) {
+        hasFoundPath = true;
+        BB = *Succ;
+        break;
+      }
+    }
+  }
+
+  return BB;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        divertFlow(const Edge &oldedge, const Edge &newedge) {
+  DEBUG(dbgs() << "Diverting " << oldedge << " via " << newedge );
+
+  // First check if the old edge was taken, if not, just delete it...
+  if (getEdgeWeight(oldedge) == 0) {
+    removeEdge(oldedge);
+    return;
+  }
+
+  Path P;
+  P[newedge.first] = 0;
+  P[newedge.second] = newedge.first;
+  const BasicBlock *BB = GetPath(newedge.second,oldedge.second,P,GetPathToExit | GetPathToDest);
+
+  double w = getEdgeWeight (oldedge);
+  DEBUG(dbgs() << ", Weight: " << format("%.20g",w) << "\n");
+  do {
+    const BasicBlock *Parent = P.find(BB)->second;
+    Edge e = getEdge(Parent,BB);
+    double oldw = getEdgeWeight(e);
+    double oldc = getExecutionCount(e.first);
+    setEdgeWeight(e, w+oldw);
+    if (Parent != oldedge.first) {
+      setExecutionCount(e.first, w+oldc);
+    }
+    BB = Parent;
+  } while (BB != newedge.first);
+  removeEdge(oldedge);
+}
+
+/// Replaces all occurrences of RmBB in the ProfilingInfo with DestBB.
+/// This checks all edges of the function the blocks reside in and replaces the
+/// occurrences of RmBB with DestBB.
+template<>
+void ProfileInfoT<Function,BasicBlock>::
+        replaceAllUses(const BasicBlock *RmBB, const BasicBlock *DestBB) {
+  DEBUG(dbgs() << "Replacing " << RmBB->getName()
+               << " with " << DestBB->getName() << "\n");
+  const Function *F = DestBB->getParent();
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(F);
+  if (J == EdgeInformation.end()) return;
+
+  Edge e, newedge;
+  bool erasededge = false;
+  EdgeWeights::iterator I = J->second.begin(), E = J->second.end();
+  while(I != E) {
+    e = (I++)->first;
+    bool foundedge = false; bool eraseedge = false;
+    if (e.first == RmBB) {
+      if (e.second == DestBB) {
+        eraseedge = true;
+      } else {
+        newedge = getEdge(DestBB, e.second);
+        foundedge = true;
+      }
+    }
+    if (e.second == RmBB) {
+      if (e.first == DestBB) {
+        eraseedge = true;
+      } else {
+        newedge = getEdge(e.first, DestBB);
+        foundedge = true;
+      }
+    }
+    if (foundedge) {
+      replaceEdge(e, newedge);
+    }
+    if (eraseedge) {
+      if (erasededge) {
+        Edge newedge = getEdge(DestBB, DestBB);
+        replaceEdge(e, newedge);
+      } else {
+        removeEdge(e);
+        erasededge = true;
+      }
+    }
+  }
+}
+
+/// Splits an edge in the ProfileInfo and redirects flow over NewBB.
+/// Since its possible that there is more than one edge in the CFG from FristBB
+/// to SecondBB its necessary to redirect the flow proporionally.
+template<>
+void ProfileInfoT<Function,BasicBlock>::splitEdge(const BasicBlock *FirstBB,
+                                                  const BasicBlock *SecondBB,
+                                                  const BasicBlock *NewBB,
+                                                  bool MergeIdenticalEdges) {
+  const Function *F = FirstBB->getParent();
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(F);
+  if (J == EdgeInformation.end()) return;
+
+  // Generate edges and read current weight.
+  Edge e  = getEdge(FirstBB, SecondBB);
+  Edge n1 = getEdge(FirstBB, NewBB);
+  Edge n2 = getEdge(NewBB, SecondBB);
+  EdgeWeights &ECs = J->second;
+  double w = ECs[e];
+
+  int succ_count = 0;
+  if (!MergeIdenticalEdges) {
+    // First count the edges from FristBB to SecondBB, if there is more than
+    // one, only slice out a proporional part for NewBB.
+    for(succ_const_iterator BBI = succ_begin(FirstBB), BBE = succ_end(FirstBB);
+        BBI != BBE; ++BBI) {
+      if (*BBI == SecondBB) succ_count++;  
+    }
+    // When the NewBB is completely new, increment the count by one so that
+    // the counts are properly distributed.
+    if (getExecutionCount(NewBB) == ProfileInfo::MissingValue) succ_count++;
+  } else {
+    // When the edges are merged anyway, then redirect all flow.
+    succ_count = 1;
+  }
+
+  // We know now how many edges there are from FirstBB to SecondBB, reroute a
+  // proportional part of the edge weight over NewBB.
+  double neww = floor(w / succ_count);
+  ECs[n1] += neww;
+  ECs[n2] += neww;
+  BlockInformation[F][NewBB] += neww;
+  if (succ_count == 1) {
+    ECs.erase(e);
+  } else {
+    ECs[e] -= neww;
+  }
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *Old,
+                                                   const BasicBlock* New) {
+  const Function *F = Old->getParent();
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(F);
+  if (J == EdgeInformation.end()) return;
+
+  DEBUG(dbgs() << "Splitting " << Old->getName() << " to " << New->getName() << "\n");
+
+  std::set<Edge> Edges;
+  for (EdgeWeights::iterator ewi = J->second.begin(), ewe = J->second.end(); 
+       ewi != ewe; ++ewi) {
+    Edge old = ewi->first;
+    if (old.first == Old) {
+      Edges.insert(old);
+    }
+  }
+  for (std::set<Edge>::iterator EI = Edges.begin(), EE = Edges.end(); 
+       EI != EE; ++EI) {
+    Edge newedge = getEdge(New, EI->second);
+    replaceEdge(*EI, newedge);
+  }
+
+  double w = getExecutionCount(Old);
+  setEdgeWeight(getEdge(Old, New), w);
+  setExecutionCount(New, w);
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *BB,
+                                                   const BasicBlock* NewBB,
+                                                   BasicBlock *const *Preds,
+                                                   unsigned NumPreds) {
+  const Function *F = BB->getParent();
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(F);
+  if (J == EdgeInformation.end()) return;
+
+  DEBUG(dbgs() << "Splitting " << NumPreds << " Edges from " << BB->getName() 
+               << " to " << NewBB->getName() << "\n");
+
+  // Collect weight that was redirected over NewBB.
+  double newweight = 0;
+  
+  std::set<const BasicBlock *> ProcessedPreds;
+  // For all requestes Predecessors.
+  for (unsigned pred = 0; pred < NumPreds; ++pred) {
+    const BasicBlock * Pred = Preds[pred];
+    if (ProcessedPreds.insert(Pred).second) {
+      // Create edges and read old weight.
+      Edge oldedge = getEdge(Pred, BB);
+      Edge newedge = getEdge(Pred, NewBB);
+
+      // Remember how much weight was redirected.
+      newweight += getEdgeWeight(oldedge);
+    
+      replaceEdge(oldedge,newedge);
+    }
+  }
+
+  Edge newedge = getEdge(NewBB,BB);
+  setEdgeWeight(newedge, newweight);
+  setExecutionCount(NewBB, newweight);
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::transfer(const Function *Old,
+                                                 const Function *New) {
+  DEBUG(dbgs() << "Replacing Function " << Old->getName() << " with "
+               << New->getName() << "\n");
+  std::map<const Function*, EdgeWeights>::iterator J =
+    EdgeInformation.find(Old);
+  if(J != EdgeInformation.end()) {
+    EdgeInformation[New] = J->second;
+  }
+  EdgeInformation.erase(Old);
+  BlockInformation.erase(Old);
+  FunctionInformation.erase(Old);
+}
+
+static double readEdgeOrRemember(ProfileInfo::Edge edge, double w,
+                                 ProfileInfo::Edge &tocalc, unsigned &uncalc) {
+  if (w == ProfileInfo::MissingValue) {
+    tocalc = edge;
+    uncalc++;
+    return 0;
+  } else {
+    return w;
+  }
+}
+
+template<>
+bool ProfileInfoT<Function,BasicBlock>::
+        CalculateMissingEdge(const BasicBlock *BB, Edge &removed,
+                             bool assumeEmptySelf) {
+  Edge edgetocalc;
+  unsigned uncalculated = 0;
+
+  // collect weights of all incoming and outgoing edges, rememer edges that
+  // have no value
+  double incount = 0;
+  SmallSet<const BasicBlock*,8> pred_visited;
+  const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+  if (bbi==bbe) {
+    Edge e = getEdge(0,BB);
+    incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
+  }
+  for (;bbi != bbe; ++bbi) {
+    if (pred_visited.insert(*bbi)) {
+      Edge e = getEdge(*bbi,BB);
+      incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
+    }
+  }
+
+  double outcount = 0;
+  SmallSet<const BasicBlock*,8> succ_visited;
+  succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
+  if (sbbi==sbbe) {
+    Edge e = getEdge(BB,0);
+    if (getEdgeWeight(e) == MissingValue) {
+      double w = getExecutionCount(BB);
+      if (w != MissingValue) {
+        setEdgeWeight(e,w);
+        removed = e;
+      }
+    }
+    outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
+  }
+  for (;sbbi != sbbe; ++sbbi) {
+    if (succ_visited.insert(*sbbi)) {
+      Edge e = getEdge(BB,*sbbi);
+      outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
+    }
+  }
+
+  // if exactly one edge weight was missing, calculate it and remove it from
+  // spanning tree
+  if (uncalculated == 0 ) {
+    return true;
+  } else
+  if (uncalculated == 1) {
+    if (incount < outcount) {
+      EdgeInformation[BB->getParent()][edgetocalc] = outcount-incount;
+    } else {
+      EdgeInformation[BB->getParent()][edgetocalc] = incount-outcount;
+    }
+    DEBUG(dbgs() << "--Calc Edge Counter for " << edgetocalc << ": "
+                 << format("%.20g", getEdgeWeight(edgetocalc)) << "\n");
+    removed = edgetocalc;
+    return true;
+  } else 
+  if (uncalculated == 2 && assumeEmptySelf && edgetocalc.first == edgetocalc.second && incount == outcount) {
+    setEdgeWeight(edgetocalc, incount * 10);
+    removed = edgetocalc;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+static void readEdge(ProfileInfo *PI, ProfileInfo::Edge e, double &calcw, std::set<ProfileInfo::Edge> &misscount) {
+  double w = PI->getEdgeWeight(e);
+  if (w != ProfileInfo::MissingValue) {
+    calcw += w;
+  } else {
+    misscount.insert(e);
+  }
+}
+
+template<>
+bool ProfileInfoT<Function,BasicBlock>::EstimateMissingEdges(const BasicBlock *BB) {
+  double inWeight = 0;
+  std::set<Edge> inMissing;
+  std::set<const BasicBlock*> ProcessedPreds;
+  const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+  if (bbi == bbe) {
+    readEdge(this,getEdge(0,BB),inWeight,inMissing);
+  }
+  for( ; bbi != bbe; ++bbi ) {
+    if (ProcessedPreds.insert(*bbi).second) {
+      readEdge(this,getEdge(*bbi,BB),inWeight,inMissing);
+    }
+  }
+
+  double outWeight = 0;
+  std::set<Edge> outMissing;
+  std::set<const BasicBlock*> ProcessedSuccs;
+  succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
+  if (sbbi == sbbe)
+    readEdge(this,getEdge(BB,0),outWeight,outMissing);
+  for ( ; sbbi != sbbe; ++sbbi ) {
+    if (ProcessedSuccs.insert(*sbbi).second) {
+      readEdge(this,getEdge(BB,*sbbi),outWeight,outMissing);
+    }
+  }
+
+  double share;
+  std::set<Edge>::iterator ei,ee;
+  if (inMissing.size() == 0 && outMissing.size() > 0) {
+    ei = outMissing.begin();
+    ee = outMissing.end();
+    share = inWeight/outMissing.size();
+    setExecutionCount(BB,inWeight);
+  } else
+  if (inMissing.size() > 0 && outMissing.size() == 0 && outWeight == 0) {
+    ei = inMissing.begin();
+    ee = inMissing.end();
+    share = 0;
+    setExecutionCount(BB,0);
+  } else
+  if (inMissing.size() == 0 && outMissing.size() == 0) {
+    setExecutionCount(BB,outWeight);
+    return true;
+  } else {
+    return false;
+  }
+  for ( ; ei != ee; ++ei ) {
+    setEdgeWeight(*ei,share);
+  }
+  return true;
+}
+
+template<>
+void ProfileInfoT<Function,BasicBlock>::repair(const Function *F) {
+//  if (getExecutionCount(&(F->getEntryBlock())) == 0) {
+//    for (Function::const_iterator FI = F->begin(), FE = F->end();
+//         FI != FE; ++FI) {
+//      const BasicBlock* BB = &(*FI);
+//      {
+//        const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
+//        if (NBB == End) {
+//          setEdgeWeight(getEdge(0,BB),0);
+//        }
+//        for(;NBB != End; ++NBB) {
+//          setEdgeWeight(getEdge(*NBB,BB),0);
+//        }
+//      }
+//      {
+//        succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+//        if (NBB == End) {
+//          setEdgeWeight(getEdge(0,BB),0);
+//        }
+//        for(;NBB != End; ++NBB) {
+//          setEdgeWeight(getEdge(*NBB,BB),0);
+//        }
+//      }
+//    }
+//    return;
+//  }
+  // The set of BasicBlocks that are still unvisited.
+  std::set<const BasicBlock*> Unvisited;
+
+  // The set of return edges (Edges with no successors).
+  std::set<Edge> ReturnEdges;
+  double ReturnWeight = 0;
+  
+  // First iterate over the whole function and collect:
+  // 1) The blocks in this function in the Unvisited set.
+  // 2) The return edges in the ReturnEdges set.
+  // 3) The flow that is leaving the function already via return edges.
+
+  // Data structure for searching the function.
+  std::queue<const BasicBlock *> BFS;
+  const BasicBlock *BB = &(F->getEntryBlock());
+  BFS.push(BB);
+  Unvisited.insert(BB);
+
+  while (BFS.size()) {
+    BB = BFS.front(); BFS.pop();
+    succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+    if (NBB == End) {
+      Edge e = getEdge(BB,0);
+      double w = getEdgeWeight(e);
+      if (w == MissingValue) {
+        // If the return edge has no value, try to read value from block.
+        double bw = getExecutionCount(BB);
+        if (bw != MissingValue) {
+          setEdgeWeight(e,bw);
+          ReturnWeight += bw;
+        } else {
+          // If both return edge and block provide no value, collect edge.
+          ReturnEdges.insert(e);
+        }
+      } else {
+        // If the return edge has a proper value, collect it.
+        ReturnWeight += w;
+      }
+    }
+    for (;NBB != End; ++NBB) {
+      if (Unvisited.insert(*NBB).second) {
+        BFS.push(*NBB);
+      }
+    }
+  }
+
+  while (Unvisited.size() > 0) {
+    unsigned oldUnvisitedCount = Unvisited.size();
+    bool FoundPath = false;
+
+    // If there is only one edge left, calculate it.
+    if (ReturnEdges.size() == 1) {
+      ReturnWeight = getExecutionCount(&(F->getEntryBlock())) - ReturnWeight;
+
+      Edge e = *ReturnEdges.begin();
+      setEdgeWeight(e,ReturnWeight);
+      setExecutionCount(e.first,ReturnWeight);
+
+      Unvisited.erase(e.first);
+      ReturnEdges.erase(e);
+      continue;
+    }
+
+    // Calculate all blocks where only one edge is missing, this may also
+    // resolve furhter return edges.
+    std::set<const BasicBlock *>::iterator FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE) {
+      const BasicBlock *BB = *FI; ++FI;
+      Edge e;
+      if(CalculateMissingEdge(BB,e,true)) {
+        if (BlockInformation[F].find(BB) == BlockInformation[F].end()) {
+          setExecutionCount(BB,getExecutionCount(BB));
+        }
+        Unvisited.erase(BB);
+        if (e.first != 0 && e.second == 0) {
+          ReturnEdges.erase(e);
+          ReturnWeight += getEdgeWeight(e);
+        }
+      }
+    }
+    if (oldUnvisitedCount > Unvisited.size()) continue;
+
+    // Estimate edge weights by dividing the flow proportionally.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE) {
+      const BasicBlock *BB = *FI; ++FI;
+      const BasicBlock *Dest = 0;
+      bool AllEdgesHaveSameReturn = true;
+      // Check each Successor, these must all end up in the same or an empty
+      // return block otherwise its dangerous to do an estimation on them.
+      for (succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
+           Succ != End; ++Succ) {
+        Path P;
+        GetPath(*Succ, 0, P, GetPathToExit);
+        if (Dest && Dest != P[(const BasicBlock*)0]) {
+          AllEdgesHaveSameReturn = false;
+        }
+        Dest = P[(const BasicBlock*)0];
+      }
+      if (AllEdgesHaveSameReturn) {
+        if(EstimateMissingEdges(BB)) {
+          Unvisited.erase(BB);
+          break;
+        }
+      }
+    }
+    if (oldUnvisitedCount > Unvisited.size()) continue;
+
+    // Check if there is a path to an block that has a known value and redirect
+    // flow accordingly.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      // Fetch path.
+      const BasicBlock *BB = *FI; ++FI;
+      Path P;
+      const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToValue);
+
+      // Calculate incoming flow.
+      double iw = 0; unsigned inmissing = 0; unsigned incount = 0; unsigned invalid = 0;
+      std::set<const BasicBlock *> Processed;
+      for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
+           NBB != End; ++NBB) {
+        if (Processed.insert(*NBB).second) {
+          Edge e = getEdge(*NBB, BB);
+          double ew = getEdgeWeight(e);
+          if (ew != MissingValue) {
+            iw += ew;
+            invalid++;
+          } else {
+            // If the path contains the successor, this means its a backedge,
+            // do not count as missing.
+            if (P.find(*NBB) == P.end())
+              inmissing++;
+          }
+          incount++;
+        }
+      }
+      if (inmissing == incount) continue;
+      if (invalid == 0) continue;
+
+      // Subtract (already) outgoing flow.
+      Processed.clear();
+      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+           NBB != End; ++NBB) {
+        if (Processed.insert(*NBB).second) {
+          Edge e = getEdge(BB, *NBB);
+          double ew = getEdgeWeight(e);
+          if (ew != MissingValue) {
+            iw -= ew;
+          }
+        }
+      }
+      if (iw < 0) continue;
+
+      // Check the receiving end of the path if it can handle the flow.
+      double ow = getExecutionCount(Dest);
+      Processed.clear();
+      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+           NBB != End; ++NBB) {
+        if (Processed.insert(*NBB).second) {
+          Edge e = getEdge(BB, *NBB);
+          double ew = getEdgeWeight(e);
+          if (ew != MissingValue) {
+            ow -= ew;
+          }
+        }
+      }
+      if (ow < 0) continue;
+
+      // Determine how much flow shall be used.
+      double ew = getEdgeWeight(getEdge(P[Dest],Dest));
+      if (ew != MissingValue) {
+        ew = ew<ow?ew:ow;
+        ew = ew<iw?ew:iw;
+      } else {
+        if (inmissing == 0)
+          ew = iw;
+      }
+
+      // Create flow.
+      if (ew != MissingValue) {
+        do {
+          Edge e = getEdge(P[Dest],Dest);
+          if (getEdgeWeight(e) == MissingValue) {
+            setEdgeWeight(e,ew);
+            FoundPath = true;
+          }
+          Dest = P[Dest];
+        } while (Dest != BB);
+      }
+    }
+    if (FoundPath) continue;
+
+    // Calculate a block with self loop.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      const BasicBlock *BB = *FI; ++FI;
+      bool SelfEdgeFound = false;
+      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
+           NBB != End; ++NBB) {
+        if (*NBB == BB) {
+          SelfEdgeFound = true;
+          break;
+        }
+      }
+      if (SelfEdgeFound) {
+        Edge e = getEdge(BB,BB);
+        if (getEdgeWeight(e) == MissingValue) {
+          double iw = 0;
+          std::set<const BasicBlock *> Processed;
+          for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
+               NBB != End; ++NBB) {
+            if (Processed.insert(*NBB).second) {
+              Edge e = getEdge(*NBB, BB);
+              double ew = getEdgeWeight(e);
+              if (ew != MissingValue) {
+                iw += ew;
+              }
+            }
+          }
+          setEdgeWeight(e,iw * 10);
+          FoundPath = true;
+        }
+      }
+    }
+    if (FoundPath) continue;
+
+    // Determine backedges, set them to zero.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      const BasicBlock *BB = *FI; ++FI;
+      const BasicBlock *Dest = 0;
+      Path P;
+      bool BackEdgeFound = false;
+      for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
+           NBB != End; ++NBB) {
+        Dest = GetPath(BB, *NBB, P, GetPathToDest | GetPathWithNewEdges);
+        if (Dest == *NBB) {
+          BackEdgeFound = true;
+          break;
+        }
+      }
+      if (BackEdgeFound) {
+        Edge e = getEdge(Dest,BB);
+        double w = getEdgeWeight(e);
+        if (w == MissingValue) {
+          setEdgeWeight(e,0);
+          FoundPath = true;
+        }
+        do {
+          Edge e = getEdge(P[Dest], Dest);
+          double w = getEdgeWeight(e);
+          if (w == MissingValue) {
+            setEdgeWeight(e,0);
+            FoundPath = true;
+          }
+          Dest = P[Dest];
+        } while (Dest != BB);
+      }
+    }
+    if (FoundPath) continue;
+
+    // Channel flow to return block.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      const BasicBlock *BB = *FI; ++FI;
+
+      Path P;
+      const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToExit | GetPathWithNewEdges);
+      Dest = P[(const BasicBlock*)0];
+      if (!Dest) continue;
+
+      if (getEdgeWeight(getEdge(Dest,0)) == MissingValue) {
+        // Calculate incoming flow.
+        double iw = 0;
+        std::set<const BasicBlock *> Processed;
+        for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
+             NBB != End; ++NBB) {
+          if (Processed.insert(*NBB).second) {
+            Edge e = getEdge(*NBB, BB);
+            double ew = getEdgeWeight(e);
+            if (ew != MissingValue) {
+              iw += ew;
+            }
+          }
+        }
+        do {
+          Edge e = getEdge(P[Dest], Dest);
+          double w = getEdgeWeight(e);
+          if (w == MissingValue) {
+            setEdgeWeight(e,iw);
+            FoundPath = true;
+          } else {
+            assert(0 && "Edge should not have value already!");
+          }
+          Dest = P[Dest];
+        } while (Dest != BB);
+      }
+    }
+    if (FoundPath) continue;
+
+    // Speculatively set edges to zero.
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE && !FoundPath) {
+      const BasicBlock *BB = *FI; ++FI;
+
+      for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
+           NBB != End; ++NBB) {
+        Edge e = getEdge(*NBB,BB);
+        double w = getEdgeWeight(e);
+        if (w == MissingValue) {
+          setEdgeWeight(e,0);
+          FoundPath = true;
+          break;
+        }
+      }
+    }
+    if (FoundPath) continue;
+
+    errs() << "{";
+    FI = Unvisited.begin(), FE = Unvisited.end();
+    while(FI != FE) {
+      const BasicBlock *BB = *FI; ++FI;
+      dbgs() << BB->getName();
+      if (FI != FE)
+        dbgs() << ",";
+    }
+    errs() << "}";
+
+    errs() << "ASSERT: could not repair function";
+    assert(0 && "could not repair function");
+  }
+
+  EdgeWeights J = EdgeInformation[F];
+  for (EdgeWeights::iterator EI = J.begin(), EE = J.end(); EI != EE; ++EI) {
+    Edge e = EI->first;
+
+    bool SuccFound = false;
+    if (e.first != 0) {
+      succ_const_iterator NBB = succ_begin(e.first), End = succ_end(e.first);
+      if (NBB == End) {
+        if (0 == e.second) {
+          SuccFound = true;
+        }
+      }
+      for (;NBB != End; ++NBB) {
+        if (*NBB == e.second) {
+          SuccFound = true;
+          break;
+        }
+      }
+      if (!SuccFound) {
+        removeEdge(e);
+      }
+    }
+  }
+}
+
+raw_ostream& operator<<(raw_ostream &O, const MachineFunction *MF) {
+  return O << MF->getFunction()->getName() << "(MF)";
+}
+
+raw_ostream& operator<<(raw_ostream &O, const MachineBasicBlock *MBB) {
+  return O << MBB->getBasicBlock()->getName() << "(MB)";
+}
+
+raw_ostream& operator<<(raw_ostream &O, std::pair<const MachineBasicBlock *, const MachineBasicBlock *> E) {
+  O << "(";
+
+  if (E.first)
+    O << E.first;
+  else
+    O << "0";
+
+  O << ",";
+
+  if (E.second)
+    O << E.second;
+  else
+    O << "0";
+
+  return O << ")";
+}
+
+} // namespace llvm
+
+//===----------------------------------------------------------------------===//
+//  NoProfile ProfileInfo implementation
+//
+
+namespace {
+  struct NoProfileInfo : public ImmutablePass, public ProfileInfo {
+    static char ID; // Class identification, replacement for typeinfo
+    NoProfileInfo() : ImmutablePass(ID) {
+      initializeNoProfileInfoPass(*PassRegistry::getPassRegistry());
+    }
+    
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+      if (PI == &ProfileInfo::ID)
+        return (ProfileInfo*)this;
+      return this;
+    }
+    
+    virtual const char *getPassName() const {
+      return "NoProfileInfo";
+    }
+  };
+}  // End of anonymous namespace
+
+char NoProfileInfo::ID = 0;
+// Register this pass...
+INITIALIZE_AG_PASS(NoProfileInfo, ProfileInfo, "no-profile",
+                   "No Profile Information", false, true, true)
+
+ImmutablePass *llvm::createNoProfileInfoPass() { return new NoProfileInfo(); }
diff --git a/contrib/llvm/lib/Analysis/ProfileInfoLoader.cpp b/contrib/llvm/lib/Analysis/ProfileInfoLoader.cpp
new file mode 100644
index 000000000000..f1f3e940c932
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ProfileInfoLoader.cpp
@@ -0,0 +1,155 @@
+//===- ProfileInfoLoad.cpp - Load profile information from disk -----------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ProfileInfoLoader class is used to load and represent profiling
+// information read in from the dump file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ProfileInfoLoader.h"
+#include "llvm/Analysis/ProfileInfoTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdio>
+#include <cstdlib>
+using namespace llvm;
+
+// ByteSwap - Byteswap 'Var' if 'Really' is true.
+//
+static inline unsigned ByteSwap(unsigned Var, bool Really) {
+  if (!Really) return Var;
+  return ((Var & (255U<< 0U)) << 24U) |
+         ((Var & (255U<< 8U)) <<  8U) |
+         ((Var & (255U<<16U)) >>  8U) |
+         ((Var & (255U<<24U)) >> 24U);
+}
+
+static unsigned AddCounts(unsigned A, unsigned B) {
+  // If either value is undefined, use the other.
+  if (A == ProfileInfoLoader::Uncounted) return B;
+  if (B == ProfileInfoLoader::Uncounted) return A;
+  return A + B;
+}
+
+static void ReadProfilingBlock(const char *ToolName, FILE *F,
+                               bool ShouldByteSwap,
+                               std::vector<unsigned> &Data) {
+  // Read the number of entries...
+  unsigned NumEntries;
+  if (fread(&NumEntries, sizeof(unsigned), 1, F) != 1) {
+    errs() << ToolName << ": data packet truncated!\n";
+    perror(0);
+    exit(1);
+  }
+  NumEntries = ByteSwap(NumEntries, ShouldByteSwap);
+
+  // Read the counts...
+  std::vector<unsigned> TempSpace(NumEntries);
+
+  // Read in the block of data...
+  if (fread(&TempSpace[0], sizeof(unsigned)*NumEntries, 1, F) != 1) {
+    errs() << ToolName << ": data packet truncated!\n";
+    perror(0);
+    exit(1);
+  }
+
+  // Make sure we have enough space... The space is initialised to -1 to
+  // facitiltate the loading of missing values for OptimalEdgeProfiling.
+  if (Data.size() < NumEntries)
+    Data.resize(NumEntries, ProfileInfoLoader::Uncounted);
+
+  // Accumulate the data we just read into the data.
+  if (!ShouldByteSwap) {
+    for (unsigned i = 0; i != NumEntries; ++i) {
+      Data[i] = AddCounts(TempSpace[i], Data[i]);
+    }
+  } else {
+    for (unsigned i = 0; i != NumEntries; ++i) {
+      Data[i] = AddCounts(ByteSwap(TempSpace[i], true), Data[i]);
+    }
+  }
+}
+
+const unsigned ProfileInfoLoader::Uncounted = ~0U;
+
+// ProfileInfoLoader ctor - Read the specified profiling data file, exiting the
+// program if the file is invalid or broken.
+//
+ProfileInfoLoader::ProfileInfoLoader(const char *ToolName,
+                                     const std::string &Filename)
+  : Filename(Filename) {
+  FILE *F = fopen(Filename.c_str(), "rb");
+  if (F == 0) {
+    errs() << ToolName << ": Error opening '" << Filename << "': ";
+    perror(0);
+    exit(1);
+  }
+
+  // Keep reading packets until we run out of them.
+  unsigned PacketType;
+  while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
+    // If the low eight bits of the packet are zero, we must be dealing with an
+    // endianness mismatch.  Byteswap all words read from the profiling
+    // information.
+    bool ShouldByteSwap = (char)PacketType == 0;
+    PacketType = ByteSwap(PacketType, ShouldByteSwap);
+
+    switch (PacketType) {
+    case ArgumentInfo: {
+      unsigned ArgLength;
+      if (fread(&ArgLength, sizeof(unsigned), 1, F) != 1) {
+        errs() << ToolName << ": arguments packet truncated!\n";
+        perror(0);
+        exit(1);
+      }
+      ArgLength = ByteSwap(ArgLength, ShouldByteSwap);
+
+      // Read in the arguments...
+      std::vector<char> Chars(ArgLength+4);
+
+      if (ArgLength)
+        if (fread(&Chars[0], (ArgLength+3) & ~3, 1, F) != 1) {
+          errs() << ToolName << ": arguments packet truncated!\n";
+          perror(0);
+          exit(1);
+        }
+      CommandLines.push_back(std::string(&Chars[0], &Chars[ArgLength]));
+      break;
+    }
+
+    case FunctionInfo:
+      ReadProfilingBlock(ToolName, F, ShouldByteSwap, FunctionCounts);
+      break;
+
+    case BlockInfo:
+      ReadProfilingBlock(ToolName, F, ShouldByteSwap, BlockCounts);
+      break;
+
+    case EdgeInfo:
+      ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
+      break;
+
+    case OptEdgeInfo:
+      ReadProfilingBlock(ToolName, F, ShouldByteSwap, OptimalEdgeCounts);
+      break;
+
+    case BBTraceInfo:
+      ReadProfilingBlock(ToolName, F, ShouldByteSwap, BBTrace);
+      break;
+
+    default:
+      errs() << ToolName << ": Unknown packet type #" << PacketType << "!\n";
+      exit(1);
+    }
+  }
+
+  fclose(F);
+}
+
diff --git a/contrib/llvm/lib/Analysis/ProfileInfoLoaderPass.cpp b/contrib/llvm/lib/Analysis/ProfileInfoLoaderPass.cpp
new file mode 100644
index 000000000000..346f8d6d6258
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ProfileInfoLoaderPass.cpp
@@ -0,0 +1,267 @@
+//===- ProfileInfoLoaderPass.cpp - LLVM Pass to load profile info ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a concrete implementation of profiling information that
+// loads the information from a profile dump file.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "profile-loader"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Analysis/ProfileInfoLoader.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumEdgesRead, "The # of edges read.");
+
+static cl::opt<std::string>
+ProfileInfoFilename("profile-info-file", cl::init("llvmprof.out"),
+                    cl::value_desc("filename"),
+                    cl::desc("Profile file loaded by -profile-loader"));
+
+namespace {
+  class LoaderPass : public ModulePass, public ProfileInfo {
+    std::string Filename;
+    std::set<Edge> SpanningTree;
+    std::set<const BasicBlock*> BBisUnvisited;
+    unsigned ReadCount;
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    explicit LoaderPass(const std::string &filename = "")
+      : ModulePass(ID), Filename(filename) {
+      initializeLoaderPassPass(*PassRegistry::getPassRegistry());
+      if (filename.empty()) Filename = ProfileInfoFilename;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+
+    virtual const char *getPassName() const {
+      return "Profiling information loader";
+    }
+
+    // recurseBasicBlock() - Calculates the edge weights for as much basic
+    // blocks as possbile.
+    virtual void recurseBasicBlock(const BasicBlock *BB);
+    virtual void readEdgeOrRemember(Edge, Edge&, unsigned &, double &);
+    virtual void readEdge(ProfileInfo::Edge, std::vector<unsigned>&);
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+      if (PI == &ProfileInfo::ID)
+        return (ProfileInfo*)this;
+      return this;
+    }
+    
+    /// run - Load the profile information from the specified file.
+    virtual bool runOnModule(Module &M);
+  };
+}  // End of anonymous namespace
+
+char LoaderPass::ID = 0;
+INITIALIZE_AG_PASS(LoaderPass, ProfileInfo, "profile-loader",
+              "Load profile information from llvmprof.out", false, true, false)
+
+char &llvm::ProfileLoaderPassID = LoaderPass::ID;
+
+ModulePass *llvm::createProfileLoaderPass() { return new LoaderPass(); }
+
+/// createProfileLoaderPass - This function returns a Pass that loads the
+/// profiling information for the module from the specified filename, making it
+/// available to the optimizers.
+Pass *llvm::createProfileLoaderPass(const std::string &Filename) {
+  return new LoaderPass(Filename);
+}
+
+void LoaderPass::readEdgeOrRemember(Edge edge, Edge &tocalc, 
+                                    unsigned &uncalc, double &count) {
+  double w;
+  if ((w = getEdgeWeight(edge)) == MissingValue) {
+    tocalc = edge;
+    uncalc++;
+  } else {
+    count+=w;
+  }
+}
+
+// recurseBasicBlock - Visits all neighbours of a block and then tries to
+// calculate the missing edge values.
+void LoaderPass::recurseBasicBlock(const BasicBlock *BB) {
+
+  // break recursion if already visited
+  if (BBisUnvisited.find(BB) == BBisUnvisited.end()) return;
+  BBisUnvisited.erase(BB);
+  if (!BB) return;
+
+  for (succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
+       bbi != bbe; ++bbi) {
+    recurseBasicBlock(*bbi);
+  }
+  for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+       bbi != bbe; ++bbi) {
+    recurseBasicBlock(*bbi);
+  }
+
+  Edge tocalc;
+  if (CalculateMissingEdge(BB, tocalc)) {
+    SpanningTree.erase(tocalc);
+  }
+}
+
+void LoaderPass::readEdge(ProfileInfo::Edge e,
+                          std::vector<unsigned> &ECs) {
+  if (ReadCount < ECs.size()) {
+    double weight = ECs[ReadCount++];
+    if (weight != ProfileInfoLoader::Uncounted) {
+      // Here the data realm changes from the unsigned of the file to the
+      // double of the ProfileInfo. This conversion is save because we know
+      // that everything thats representable in unsinged is also representable
+      // in double.
+      EdgeInformation[getFunction(e)][e] += (double)weight;
+
+      DEBUG(dbgs() << "--Read Edge Counter for " << e
+                   << " (# "<< (ReadCount-1) << "): "
+                   << (unsigned)getEdgeWeight(e) << "\n");
+    } else {
+      // This happens only if reading optimal profiling information, not when
+      // reading regular profiling information.
+      SpanningTree.insert(e);
+    }
+  }
+}
+
+bool LoaderPass::runOnModule(Module &M) {
+  ProfileInfoLoader PIL("profile-loader", Filename);
+
+  EdgeInformation.clear();
+  std::vector<unsigned> Counters = PIL.getRawEdgeCounts();
+  if (Counters.size() > 0) {
+    ReadCount = 0;
+    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+      if (F->isDeclaration()) continue;
+      DEBUG(dbgs() << "Working on " << F->getName() << "\n");
+      readEdge(getEdge(0,&F->getEntryBlock()), Counters);
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+        TerminatorInst *TI = BB->getTerminator();
+        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
+          readEdge(getEdge(BB,TI->getSuccessor(s)), Counters);
+        }
+      }
+    }
+    if (ReadCount != Counters.size()) {
+      errs() << "WARNING: profile information is inconsistent with "
+             << "the current program!\n";
+    }
+    NumEdgesRead = ReadCount;
+  }
+
+  Counters = PIL.getRawOptimalEdgeCounts();
+  if (Counters.size() > 0) {
+    ReadCount = 0;
+    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+      if (F->isDeclaration()) continue;
+      DEBUG(dbgs() << "Working on " << F->getName() << "\n");
+      readEdge(getEdge(0,&F->getEntryBlock()), Counters);
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+        TerminatorInst *TI = BB->getTerminator();
+        if (TI->getNumSuccessors() == 0) {
+          readEdge(getEdge(BB,0), Counters);
+        }
+        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
+          readEdge(getEdge(BB,TI->getSuccessor(s)), Counters);
+        }
+      }
+      while (SpanningTree.size() > 0) {
+
+        unsigned size = SpanningTree.size();
+
+        BBisUnvisited.clear();
+        for (std::set<Edge>::iterator ei = SpanningTree.begin(),
+             ee = SpanningTree.end(); ei != ee; ++ei) {
+          BBisUnvisited.insert(ei->first);
+          BBisUnvisited.insert(ei->second);
+        }
+        while (BBisUnvisited.size() > 0) {
+          recurseBasicBlock(*BBisUnvisited.begin());
+        }
+
+        if (SpanningTree.size() == size) {
+          DEBUG(dbgs()<<"{");
+          for (std::set<Edge>::iterator ei = SpanningTree.begin(),
+               ee = SpanningTree.end(); ei != ee; ++ei) {
+            DEBUG(dbgs()<< *ei <<",");
+          }
+          assert(0 && "No edge calculated!");
+        }
+
+      }
+    }
+    if (ReadCount != Counters.size()) {
+      errs() << "WARNING: profile information is inconsistent with "
+             << "the current program!\n";
+    }
+    NumEdgesRead = ReadCount;
+  }
+
+  BlockInformation.clear();
+  Counters = PIL.getRawBlockCounts();
+  if (Counters.size() > 0) {
+    ReadCount = 0;
+    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+      if (F->isDeclaration()) continue;
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+        if (ReadCount < Counters.size())
+          // Here the data realm changes from the unsigned of the file to the
+          // double of the ProfileInfo. This conversion is save because we know
+          // that everything thats representable in unsinged is also
+          // representable in double.
+          BlockInformation[F][BB] = (double)Counters[ReadCount++];
+    }
+    if (ReadCount != Counters.size()) {
+      errs() << "WARNING: profile information is inconsistent with "
+             << "the current program!\n";
+    }
+  }
+
+  FunctionInformation.clear();
+  Counters = PIL.getRawFunctionCounts();
+  if (Counters.size() > 0) {
+    ReadCount = 0;
+    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+      if (F->isDeclaration()) continue;
+      if (ReadCount < Counters.size())
+        // Here the data realm changes from the unsigned of the file to the
+        // double of the ProfileInfo. This conversion is save because we know
+        // that everything thats representable in unsinged is also
+        // representable in double.
+        FunctionInformation[F] = (double)Counters[ReadCount++];
+    }
+    if (ReadCount != Counters.size()) {
+      errs() << "WARNING: profile information is inconsistent with "
+             << "the current program!\n";
+    }
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Analysis/ProfileVerifierPass.cpp b/contrib/llvm/lib/Analysis/ProfileVerifierPass.cpp
new file mode 100644
index 000000000000..c8896de89301
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ProfileVerifierPass.cpp
@@ -0,0 +1,383 @@
+//===- ProfileVerifierPass.cpp - LLVM Pass to estimate profile info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that checks profiling information for 
+// plausibility.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "profile-verifier"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+using namespace llvm;
+
+static cl::opt<bool,false>
+ProfileVerifierDisableAssertions("profile-verifier-noassert",
+     cl::desc("Disable assertions"));
+
+namespace {
+  template<class FType, class BType>
+  class ProfileVerifierPassT : public FunctionPass {
+
+    struct DetailedBlockInfo {
+      const BType *BB;
+      double      BBWeight;
+      double      inWeight;
+      int         inCount;
+      double      outWeight;
+      int         outCount;
+    };
+
+    ProfileInfoT<FType, BType> *PI;
+    std::set<const BType*> BBisVisited;
+    std::set<const FType*>   FisVisited;
+    bool DisableAssertions;
+
+    // When debugging is enabled, the verifier prints a whole slew of debug
+    // information, otherwise its just the assert. These are all the helper
+    // functions.
+    bool PrintedDebugTree;
+    std::set<const BType*> BBisPrinted;
+    void debugEntry(DetailedBlockInfo*);
+    void printDebugInfo(const BType *BB);
+
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+
+    explicit ProfileVerifierPassT () : FunctionPass(ID) {
+      initializeProfileVerifierPassPass(*PassRegistry::getPassRegistry());
+      DisableAssertions = ProfileVerifierDisableAssertions;
+    }
+    explicit ProfileVerifierPassT (bool da) : FunctionPass(ID), 
+                                              DisableAssertions(da) {
+      initializeProfileVerifierPassPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequired<ProfileInfoT<FType, BType> >();
+    }
+
+    const char *getPassName() const {
+      return "Profiling information verifier";
+    }
+
+    /// run - Verify the profile information.
+    bool runOnFunction(FType &F);
+    void recurseBasicBlock(const BType*);
+
+    bool   exitReachable(const FType*);
+    double ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge);
+    void   CheckValue(bool, const char*, DetailedBlockInfo*);
+  };
+
+  typedef ProfileVerifierPassT<Function, BasicBlock> ProfileVerifierPass;
+
+  template<class FType, class BType>
+  void ProfileVerifierPassT<FType, BType>::printDebugInfo(const BType *BB) {
+
+    if (BBisPrinted.find(BB) != BBisPrinted.end()) return;
+
+    double BBWeight = PI->getExecutionCount(BB);
+    if (BBWeight == ProfileInfoT<FType, BType>::MissingValue) { BBWeight = 0; }
+    double inWeight = 0;
+    int inCount = 0;
+    std::set<const BType*> ProcessedPreds;
+    for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
+         bbi != bbe; ++bbi ) {
+      if (ProcessedPreds.insert(*bbi).second) {
+        typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(*bbi,BB);
+        double EdgeWeight = PI->getEdgeWeight(E);
+        if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
+        dbgs() << "calculated in-edge " << E << ": " 
+               << format("%20.20g",EdgeWeight) << "\n";
+        inWeight += EdgeWeight;
+        inCount++;
+      }
+    }
+    double outWeight = 0;
+    int outCount = 0;
+    std::set<const BType*> ProcessedSuccs;
+    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
+          bbi != bbe; ++bbi ) {
+      if (ProcessedSuccs.insert(*bbi).second) {
+        typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(BB,*bbi);
+        double EdgeWeight = PI->getEdgeWeight(E);
+        if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
+        dbgs() << "calculated out-edge " << E << ": " 
+               << format("%20.20g",EdgeWeight) << "\n";
+        outWeight += EdgeWeight;
+        outCount++;
+      }
+    }
+    dbgs() << "Block " << BB->getName()                   << " in "
+           << BB->getParent()->getName()                  << ":"
+           << "BBWeight="  << format("%20.20g",BBWeight)  << ","
+           << "inWeight="  << format("%20.20g",inWeight)  << ","
+           << "inCount="   << inCount                     << ","
+           << "outWeight=" << format("%20.20g",outWeight) << ","
+           << "outCount"   << outCount                    << "\n";
+
+    // mark as visited and recurse into subnodes
+    BBisPrinted.insert(BB);
+    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
+          bbi != bbe; ++bbi ) {
+      printDebugInfo(*bbi);
+    }
+  }
+
+  template<class FType, class BType>
+  void ProfileVerifierPassT<FType, BType>::debugEntry (DetailedBlockInfo *DI) {
+    dbgs() << "TROUBLE: Block " << DI->BB->getName()          << " in "
+           << DI->BB->getParent()->getName()                  << ":"
+           << "BBWeight="  << format("%20.20g",DI->BBWeight)  << ","
+           << "inWeight="  << format("%20.20g",DI->inWeight)  << ","
+           << "inCount="   << DI->inCount                     << ","
+           << "outWeight=" << format("%20.20g",DI->outWeight) << ","
+           << "outCount="  << DI->outCount                    << "\n";
+    if (!PrintedDebugTree) {
+      PrintedDebugTree = true;
+      printDebugInfo(&(DI->BB->getParent()->getEntryBlock()));
+    }
+  }
+
+  // This compares A and B for equality.
+  static bool Equals(double A, double B) {
+    return A == B;
+  }
+
+  // This checks if the function "exit" is reachable from an given function
+  // via calls, this is necessary to check if a profile is valid despite the
+  // counts not fitting exactly.
+  template<class FType, class BType>
+  bool ProfileVerifierPassT<FType, BType>::exitReachable(const FType *F) {
+    if (!F) return false;
+
+    if (FisVisited.count(F)) return false;
+
+    FType *Exit = F->getParent()->getFunction("exit");
+    if (Exit == F) {
+      return true;
+    }
+
+    FisVisited.insert(F);
+    bool exits = false;
+    for (const_inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+      if (const CallInst *CI = dyn_cast<CallInst>(&*I)) {
+        FType *F = CI->getCalledFunction();
+        if (F) {
+          exits |= exitReachable(F);
+        } else {
+          // This is a call to a pointer, all bets are off...
+          exits = true;
+        }
+        if (exits) break;
+      }
+    }
+    return exits;
+  }
+
+  #define ASSERTMESSAGE(M) \
+    { dbgs() << "ASSERT:" << (M) << "\n"; \
+      if (!DisableAssertions) assert(0 && (M)); }
+
+  template<class FType, class BType>
+  double ProfileVerifierPassT<FType, BType>::ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge E) {
+    double EdgeWeight = PI->getEdgeWeight(E);
+    if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) {
+      dbgs() << "Edge " << E << " in Function " 
+             << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": ";
+      ASSERTMESSAGE("Edge has missing value");
+      return 0;
+    } else {
+      if (EdgeWeight < 0) {
+        dbgs() << "Edge " << E << " in Function " 
+               << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": ";
+        ASSERTMESSAGE("Edge has negative value");
+      }
+      return EdgeWeight;
+    }
+  }
+
+  template<class FType, class BType>
+  void ProfileVerifierPassT<FType, BType>::CheckValue(bool Error, 
+                                                      const char *Message,
+                                                      DetailedBlockInfo *DI) {
+    if (Error) {
+      DEBUG(debugEntry(DI));
+      dbgs() << "Block " << DI->BB->getName() << " in Function "
+             << DI->BB->getParent()->getName() << ": ";
+      ASSERTMESSAGE(Message);
+    }
+    return;
+  }
+
+  // This calculates the Information for a block and then recurses into the
+  // successors.
+  template<class FType, class BType>
+  void ProfileVerifierPassT<FType, BType>::recurseBasicBlock(const BType *BB) {
+
+    // Break the recursion by remembering all visited blocks.
+    if (BBisVisited.find(BB) != BBisVisited.end()) return;
+
+    // Use a data structure to store all the information, this can then be handed
+    // to debug printers.
+    DetailedBlockInfo DI;
+    DI.BB = BB;
+    DI.outCount = DI.inCount = 0;
+    DI.inWeight = DI.outWeight = 0;
+
+    // Read predecessors.
+    std::set<const BType*> ProcessedPreds;
+    const_pred_iterator bpi = pred_begin(BB), bpe = pred_end(BB);
+    // If there are none, check for (0,BB) edge.
+    if (bpi == bpe) {
+      DI.inWeight += ReadOrAssert(PI->getEdge(0,BB));
+      DI.inCount++;
+    }
+    for (;bpi != bpe; ++bpi) {
+      if (ProcessedPreds.insert(*bpi).second) {
+        DI.inWeight += ReadOrAssert(PI->getEdge(*bpi,BB));
+        DI.inCount++;
+      }
+    }
+
+    // Read successors.
+    std::set<const BType*> ProcessedSuccs;
+    succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
+    // If there is an (0,BB) edge, consider it too. (This is done not only when
+    // there are no successors, but every time; not every function contains
+    // return blocks with no successors (think loop latch as return block)).
+    double w = PI->getEdgeWeight(PI->getEdge(BB,0));
+    if (w != ProfileInfoT<FType, BType>::MissingValue) {
+      DI.outWeight += w;
+      DI.outCount++;
+    }
+    for (;bbi != bbe; ++bbi) {
+      if (ProcessedSuccs.insert(*bbi).second) {
+        DI.outWeight += ReadOrAssert(PI->getEdge(BB,*bbi));
+        DI.outCount++;
+      }
+    }
+
+    // Read block weight.
+    DI.BBWeight = PI->getExecutionCount(BB);
+    CheckValue(DI.BBWeight == ProfileInfoT<FType, BType>::MissingValue,
+               "BasicBlock has missing value", &DI);
+    CheckValue(DI.BBWeight < 0,
+               "BasicBlock has negative value", &DI);
+
+    // Check if this block is a setjmp target.
+    bool isSetJmpTarget = false;
+    if (DI.outWeight > DI.inWeight) {
+      for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
+           i != ie; ++i) {
+        if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
+          FType *F = CI->getCalledFunction();
+          if (F && (F->getName() == "_setjmp")) {
+            isSetJmpTarget = true; break;
+          }
+        }
+      }
+    }
+    // Check if this block is eventually reaching exit.
+    bool isExitReachable = false;
+    if (DI.inWeight > DI.outWeight) {
+      for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
+           i != ie; ++i) {
+        if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
+          FType *F = CI->getCalledFunction();
+          if (F) {
+            FisVisited.clear();
+            isExitReachable |= exitReachable(F);
+          } else {
+            // This is a call to a pointer, all bets are off...
+            isExitReachable = true;
+          }
+          if (isExitReachable) break;
+        }
+      }
+    }
+
+    if (DI.inCount > 0 && DI.outCount == 0) {
+       // If this is a block with no successors.
+      if (!isSetJmpTarget) {
+        CheckValue(!Equals(DI.inWeight,DI.BBWeight), 
+                   "inWeight and BBWeight do not match", &DI);
+      }
+    } else if (DI.inCount == 0 && DI.outCount > 0) {
+      // If this is a block with no predecessors.
+      if (!isExitReachable)
+        CheckValue(!Equals(DI.BBWeight,DI.outWeight), 
+                   "BBWeight and outWeight do not match", &DI);
+    } else {
+      // If this block has successors and predecessors.
+      if (DI.inWeight > DI.outWeight && !isExitReachable)
+        CheckValue(!Equals(DI.inWeight,DI.outWeight), 
+                   "inWeight and outWeight do not match", &DI);
+      if (DI.inWeight < DI.outWeight && !isSetJmpTarget)
+        CheckValue(!Equals(DI.inWeight,DI.outWeight), 
+                   "inWeight and outWeight do not match", &DI);
+    }
+
+
+    // Mark this block as visited, rescurse into successors.
+    BBisVisited.insert(BB);
+    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
+          bbi != bbe; ++bbi ) {
+      recurseBasicBlock(*bbi);
+    }
+  }
+
+  template<class FType, class BType>
+  bool ProfileVerifierPassT<FType, BType>::runOnFunction(FType &F) {
+    PI = getAnalysisIfAvailable<ProfileInfoT<FType, BType> >();
+    if (!PI)
+      ASSERTMESSAGE("No ProfileInfo available");
+
+    // Prepare global variables.
+    PrintedDebugTree = false;
+    BBisVisited.clear();
+
+    // Fetch entry block and recurse into it.
+    const BType *entry = &F.getEntryBlock();
+    recurseBasicBlock(entry);
+
+    if (PI->getExecutionCount(&F) != PI->getExecutionCount(entry))
+      ASSERTMESSAGE("Function count and entry block count do not match");
+
+    return false;
+  }
+
+  template<class FType, class BType>
+  char ProfileVerifierPassT<FType, BType>::ID = 0;
+}
+
+INITIALIZE_PASS_BEGIN(ProfileVerifierPass, "profile-verifier",
+                "Verify profiling information", false, true)
+INITIALIZE_AG_DEPENDENCY(ProfileInfo)
+INITIALIZE_PASS_END(ProfileVerifierPass, "profile-verifier",
+                "Verify profiling information", false, true)
+
+namespace llvm {
+  FunctionPass *createProfileVerifierPass() {
+    return new ProfileVerifierPass(ProfileVerifierDisableAssertions); 
+  }
+}
+
diff --git a/contrib/llvm/lib/Analysis/PtrUseVisitor.cpp b/contrib/llvm/lib/Analysis/PtrUseVisitor.cpp
new file mode 100644
index 000000000000..0a342b2167e4
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/PtrUseVisitor.cpp
@@ -0,0 +1,36 @@
+//===- PtrUseVisitor.cpp - InstVisitors over a pointers uses --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// Implementation of the pointer use visitors.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/PtrUseVisitor.h"
+
+using namespace llvm;
+
+void detail::PtrUseVisitorBase::enqueueUsers(Instruction &I) {
+  for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+       UI != UE; ++UI) {
+    if (VisitedUses.insert(&UI.getUse())) {
+      UseToVisit NewU = {
+        UseToVisit::UseAndIsOffsetKnownPair(&UI.getUse(), IsOffsetKnown),
+        Offset
+      };
+      Worklist.push_back(llvm_move(NewU));
+    }
+  }
+}
+
+bool detail::PtrUseVisitorBase::adjustOffsetForGEP(GetElementPtrInst &GEPI) {
+  if (!IsOffsetKnown)
+    return false;
+
+  return GEPI.accumulateConstantOffset(DL, Offset);
+}
diff --git a/contrib/llvm/lib/Analysis/RegionInfo.cpp b/contrib/llvm/lib/Analysis/RegionInfo.cpp
new file mode 100644
index 000000000000..fad5074086ce
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/RegionInfo.cpp
@@ -0,0 +1,834 @@
+//===- RegionInfo.cpp - SESE region detection analysis --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Detects single entry single exit regions in the control flow graph.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define DEBUG_TYPE "region"
+#include "llvm/Support/Debug.h"
+
+#include <set>
+#include <algorithm>
+
+using namespace llvm;
+
+// Always verify if expensive checking is enabled.
+#ifdef XDEBUG
+static bool VerifyRegionInfo = true;
+#else
+static bool VerifyRegionInfo = false;
+#endif
+
+static cl::opt<bool,true>
+VerifyRegionInfoX("verify-region-info", cl::location(VerifyRegionInfo),
+                cl::desc("Verify region info (time consuming)"));
+
+STATISTIC(numRegions,       "The # of regions");
+STATISTIC(numSimpleRegions, "The # of simple regions");
+
+static cl::opt<enum Region::PrintStyle> printStyle("print-region-style",
+  cl::Hidden,
+  cl::desc("style of printing regions"),
+  cl::values(
+    clEnumValN(Region::PrintNone, "none",  "print no details"),
+    clEnumValN(Region::PrintBB, "bb",
+               "print regions in detail with block_iterator"),
+    clEnumValN(Region::PrintRN, "rn",
+               "print regions in detail with element_iterator"),
+    clEnumValEnd));
+//===----------------------------------------------------------------------===//
+/// Region Implementation
+Region::Region(BasicBlock *Entry, BasicBlock *Exit, RegionInfo* RInfo,
+               DominatorTree *dt, Region *Parent)
+               : RegionNode(Parent, Entry, 1), RI(RInfo), DT(dt), exit(Exit) {}
+
+Region::~Region() {
+  // Free the cached nodes.
+  for (BBNodeMapT::iterator it = BBNodeMap.begin(),
+         ie = BBNodeMap.end(); it != ie; ++it)
+    delete it->second;
+
+  // Only clean the cache for this Region. Caches of child Regions will be
+  // cleaned when the child Regions are deleted.
+  BBNodeMap.clear();
+
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    delete *I;
+}
+
+void Region::replaceEntry(BasicBlock *BB) {
+  entry.setPointer(BB);
+}
+
+void Region::replaceExit(BasicBlock *BB) {
+  assert(exit && "No exit to replace!");
+  exit = BB;
+}
+
+bool Region::contains(const BasicBlock *B) const {
+  BasicBlock *BB = const_cast<BasicBlock*>(B);
+
+  assert(DT->getNode(BB) && "BB not part of the dominance tree");
+
+  BasicBlock *entry = getEntry(), *exit = getExit();
+
+  // Toplevel region.
+  if (!exit)
+    return true;
+
+  return (DT->dominates(entry, BB)
+    && !(DT->dominates(exit, BB) && DT->dominates(entry, exit)));
+}
+
+bool Region::contains(const Loop *L) const {
+  // BBs that are not part of any loop are element of the Loop
+  // described by the NULL pointer. This loop is not part of any region,
+  // except if the region describes the whole function.
+  if (L == 0)
+    return getExit() == 0;
+
+  if (!contains(L->getHeader()))
+    return false;
+
+  SmallVector<BasicBlock *, 8> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  for (SmallVectorImpl<BasicBlock*>::iterator BI = ExitingBlocks.begin(),
+       BE = ExitingBlocks.end(); BI != BE; ++BI)
+    if (!contains(*BI))
+      return false;
+
+  return true;
+}
+
+Loop *Region::outermostLoopInRegion(Loop *L) const {
+  if (!contains(L))
+    return 0;
+
+  while (L && contains(L->getParentLoop())) {
+    L = L->getParentLoop();
+  }
+
+  return L;
+}
+
+Loop *Region::outermostLoopInRegion(LoopInfo *LI, BasicBlock* BB) const {
+  assert(LI && BB && "LI and BB cannot be null!");
+  Loop *L = LI->getLoopFor(BB);
+  return outermostLoopInRegion(L);
+}
+
+BasicBlock *Region::getEnteringBlock() const {
+  BasicBlock *entry = getEntry();
+  BasicBlock *Pred;
+  BasicBlock *enteringBlock = 0;
+
+  for (pred_iterator PI = pred_begin(entry), PE = pred_end(entry); PI != PE;
+       ++PI) {
+    Pred = *PI;
+    if (DT->getNode(Pred) && !contains(Pred)) {
+      if (enteringBlock)
+        return 0;
+
+      enteringBlock = Pred;
+    }
+  }
+
+  return enteringBlock;
+}
+
+BasicBlock *Region::getExitingBlock() const {
+  BasicBlock *exit = getExit();
+  BasicBlock *Pred;
+  BasicBlock *exitingBlock = 0;
+
+  if (!exit)
+    return 0;
+
+  for (pred_iterator PI = pred_begin(exit), PE = pred_end(exit); PI != PE;
+       ++PI) {
+    Pred = *PI;
+    if (contains(Pred)) {
+      if (exitingBlock)
+        return 0;
+
+      exitingBlock = Pred;
+    }
+  }
+
+  return exitingBlock;
+}
+
+bool Region::isSimple() const {
+  return !isTopLevelRegion() && getEnteringBlock() && getExitingBlock();
+}
+
+std::string Region::getNameStr() const {
+  std::string exitName;
+  std::string entryName;
+
+  if (getEntry()->getName().empty()) {
+    raw_string_ostream OS(entryName);
+
+    WriteAsOperand(OS, getEntry(), false);
+  } else
+    entryName = getEntry()->getName();
+
+  if (getExit()) {
+    if (getExit()->getName().empty()) {
+      raw_string_ostream OS(exitName);
+
+      WriteAsOperand(OS, getExit(), false);
+    } else
+      exitName = getExit()->getName();
+  } else
+    exitName = "<Function Return>";
+
+  return entryName + " => " + exitName;
+}
+
+void Region::verifyBBInRegion(BasicBlock *BB) const {
+  if (!contains(BB))
+    llvm_unreachable("Broken region found!");
+
+  BasicBlock *entry = getEntry(), *exit = getExit();
+
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    if (!contains(*SI) && exit != *SI)
+      llvm_unreachable("Broken region found!");
+
+  if (entry != BB)
+    for (pred_iterator SI = pred_begin(BB), SE = pred_end(BB); SI != SE; ++SI)
+      if (!contains(*SI))
+        llvm_unreachable("Broken region found!");
+}
+
+void Region::verifyWalk(BasicBlock *BB, std::set<BasicBlock*> *visited) const {
+  BasicBlock *exit = getExit();
+
+  visited->insert(BB);
+
+  verifyBBInRegion(BB);
+
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    if (*SI != exit && visited->find(*SI) == visited->end())
+        verifyWalk(*SI, visited);
+}
+
+void Region::verifyRegion() const {
+  // Only do verification when user wants to, otherwise this expensive
+  // check will be invoked by PassManager.
+  if (!VerifyRegionInfo) return;
+
+  std::set<BasicBlock*> visited;
+  verifyWalk(getEntry(), &visited);
+}
+
+void Region::verifyRegionNest() const {
+  for (Region::const_iterator RI = begin(), RE = end(); RI != RE; ++RI)
+    (*RI)->verifyRegionNest();
+
+  verifyRegion();
+}
+
+Region::element_iterator Region::element_begin() {
+  return GraphTraits<Region*>::nodes_begin(this);
+}
+
+Region::element_iterator Region::element_end() {
+  return GraphTraits<Region*>::nodes_end(this);
+}
+
+Region::const_element_iterator Region::element_begin() const {
+  return GraphTraits<const Region*>::nodes_begin(this);
+}
+
+Region::const_element_iterator Region::element_end() const {
+  return GraphTraits<const Region*>::nodes_end(this);
+}
+
+Region* Region::getSubRegionNode(BasicBlock *BB) const {
+  Region *R = RI->getRegionFor(BB);
+
+  if (!R || R == this)
+    return 0;
+
+  // If we pass the BB out of this region, that means our code is broken.
+  assert(contains(R) && "BB not in current region!");
+
+  while (contains(R->getParent()) && R->getParent() != this)
+    R = R->getParent();
+
+  if (R->getEntry() != BB)
+    return 0;
+
+  return R;
+}
+
+RegionNode* Region::getBBNode(BasicBlock *BB) const {
+  assert(contains(BB) && "Can get BB node out of this region!");
+
+  BBNodeMapT::const_iterator at = BBNodeMap.find(BB);
+
+  if (at != BBNodeMap.end())
+    return at->second;
+
+  RegionNode *NewNode = new RegionNode(const_cast<Region*>(this), BB);
+  BBNodeMap.insert(std::make_pair(BB, NewNode));
+  return NewNode;
+}
+
+RegionNode* Region::getNode(BasicBlock *BB) const {
+  assert(contains(BB) && "Can get BB node out of this region!");
+  if (Region* Child = getSubRegionNode(BB))
+    return Child->getNode();
+
+  return getBBNode(BB);
+}
+
+void Region::transferChildrenTo(Region *To) {
+  for (iterator I = begin(), E = end(); I != E; ++I) {
+    (*I)->parent = To;
+    To->children.push_back(*I);
+  }
+  children.clear();
+}
+
+void Region::addSubRegion(Region *SubRegion, bool moveChildren) {
+  assert(SubRegion->parent == 0 && "SubRegion already has a parent!");
+  assert(std::find(begin(), end(), SubRegion) == children.end()
+         && "Subregion already exists!");
+
+  SubRegion->parent = this;
+  children.push_back(SubRegion);
+
+  if (!moveChildren)
+    return;
+
+  assert(SubRegion->children.size() == 0
+         && "SubRegions that contain children are not supported");
+
+  for (element_iterator I = element_begin(), E = element_end(); I != E; ++I)
+    if (!(*I)->isSubRegion()) {
+      BasicBlock *BB = (*I)->getNodeAs<BasicBlock>();
+
+      if (SubRegion->contains(BB))
+        RI->setRegionFor(BB, SubRegion);
+    }
+
+  std::vector<Region*> Keep;
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    if (SubRegion->contains(*I) && *I != SubRegion) {
+      SubRegion->children.push_back(*I);
+      (*I)->parent = SubRegion;
+    } else
+      Keep.push_back(*I);
+
+  children.clear();
+  children.insert(children.begin(), Keep.begin(), Keep.end());
+}
+
+
+Region *Region::removeSubRegion(Region *Child) {
+  assert(Child->parent == this && "Child is not a child of this region!");
+  Child->parent = 0;
+  RegionSet::iterator I = std::find(children.begin(), children.end(), Child);
+  assert(I != children.end() && "Region does not exit. Unable to remove.");
+  children.erase(children.begin()+(I-begin()));
+  return Child;
+}
+
+unsigned Region::getDepth() const {
+  unsigned Depth = 0;
+
+  for (Region *R = parent; R != 0; R = R->parent)
+    ++Depth;
+
+  return Depth;
+}
+
+Region *Region::getExpandedRegion() const {
+  unsigned NumSuccessors = exit->getTerminator()->getNumSuccessors();
+
+  if (NumSuccessors == 0)
+    return NULL;
+
+  for (pred_iterator PI = pred_begin(getExit()), PE = pred_end(getExit());
+       PI != PE; ++PI)
+    if (!DT->dominates(getEntry(), *PI))
+      return NULL;
+
+  Region *R = RI->getRegionFor(exit);
+
+  if (R->getEntry() != exit) {
+    if (exit->getTerminator()->getNumSuccessors() == 1)
+      return new Region(getEntry(), *succ_begin(exit), RI, DT);
+    else
+      return NULL;
+  }
+
+  while (R->getParent() && R->getParent()->getEntry() == exit)
+    R = R->getParent();
+
+  if (!DT->dominates(getEntry(), R->getExit()))
+    for (pred_iterator PI = pred_begin(getExit()), PE = pred_end(getExit());
+         PI != PE; ++PI)
+    if (!DT->dominates(R->getExit(), *PI))
+      return NULL;
+
+  return new Region(getEntry(), R->getExit(), RI, DT);
+}
+
+void Region::print(raw_ostream &OS, bool print_tree, unsigned level,
+                   enum PrintStyle Style) const {
+  if (print_tree)
+    OS.indent(level*2) << "[" << level << "] " << getNameStr();
+  else
+    OS.indent(level*2) << getNameStr();
+
+  OS << "\n";
+
+
+  if (Style != PrintNone) {
+    OS.indent(level*2) << "{\n";
+    OS.indent(level*2 + 2);
+
+    if (Style == PrintBB) {
+      for (const_block_iterator I = block_begin(), E = block_end(); I != E; ++I)
+        OS << (*I)->getName() << ", "; // TODO: remove the last ","
+    } else if (Style == PrintRN) {
+      for (const_element_iterator I = element_begin(), E = element_end(); I!=E; ++I)
+        OS << **I << ", "; // TODO: remove the last ",
+    }
+
+    OS << "\n";
+  }
+
+  if (print_tree)
+    for (const_iterator RI = begin(), RE = end(); RI != RE; ++RI)
+      (*RI)->print(OS, print_tree, level+1, Style);
+
+  if (Style != PrintNone)
+    OS.indent(level*2) << "} \n";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Region::dump() const {
+  print(dbgs(), true, getDepth(), printStyle.getValue());
+}
+#endif
+
+void Region::clearNodeCache() {
+  // Free the cached nodes.
+  for (BBNodeMapT::iterator I = BBNodeMap.begin(),
+       IE = BBNodeMap.end(); I != IE; ++I)
+    delete I->second;
+
+  BBNodeMap.clear();
+  for (Region::iterator RI = begin(), RE = end(); RI != RE; ++RI)
+    (*RI)->clearNodeCache();
+}
+
+//===----------------------------------------------------------------------===//
+// RegionInfo implementation
+//
+
+bool RegionInfo::isCommonDomFrontier(BasicBlock *BB, BasicBlock *entry,
+                                     BasicBlock *exit) const {
+  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+    BasicBlock *P = *PI;
+    if (DT->dominates(entry, P) && !DT->dominates(exit, P))
+      return false;
+  }
+  return true;
+}
+
+bool RegionInfo::isRegion(BasicBlock *entry, BasicBlock *exit) const {
+  assert(entry && exit && "entry and exit must not be null!");
+  typedef DominanceFrontier::DomSetType DST;
+
+  DST *entrySuccs = &DF->find(entry)->second;
+
+  // Exit is the header of a loop that contains the entry. In this case,
+  // the dominance frontier must only contain the exit.
+  if (!DT->dominates(entry, exit)) {
+    for (DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end();
+         SI != SE; ++SI)
+      if (*SI != exit && *SI != entry)
+        return false;
+
+    return true;
+  }
+
+  DST *exitSuccs = &DF->find(exit)->second;
+
+  // Do not allow edges leaving the region.
+  for (DST::iterator SI = entrySuccs->begin(), SE = entrySuccs->end();
+       SI != SE; ++SI) {
+    if (*SI == exit || *SI == entry)
+      continue;
+    if (exitSuccs->find(*SI) == exitSuccs->end())
+      return false;
+    if (!isCommonDomFrontier(*SI, entry, exit))
+      return false;
+  }
+
+  // Do not allow edges pointing into the region.
+  for (DST::iterator SI = exitSuccs->begin(), SE = exitSuccs->end();
+       SI != SE; ++SI)
+    if (DT->properlyDominates(entry, *SI) && *SI != exit)
+      return false;
+
+
+  return true;
+}
+
+void RegionInfo::insertShortCut(BasicBlock *entry, BasicBlock *exit,
+                             BBtoBBMap *ShortCut) const {
+  assert(entry && exit && "entry and exit must not be null!");
+
+  BBtoBBMap::iterator e = ShortCut->find(exit);
+
+  if (e == ShortCut->end())
+    // No further region at exit available.
+    (*ShortCut)[entry] = exit;
+  else {
+    // We found a region e that starts at exit. Therefore (entry, e->second)
+    // is also a region, that is larger than (entry, exit). Insert the
+    // larger one.
+    BasicBlock *BB = e->second;
+    (*ShortCut)[entry] = BB;
+  }
+}
+
+DomTreeNode* RegionInfo::getNextPostDom(DomTreeNode* N,
+                                        BBtoBBMap *ShortCut) const {
+  BBtoBBMap::iterator e = ShortCut->find(N->getBlock());
+
+  if (e == ShortCut->end())
+    return N->getIDom();
+
+  return PDT->getNode(e->second)->getIDom();
+}
+
+bool RegionInfo::isTrivialRegion(BasicBlock *entry, BasicBlock *exit) const {
+  assert(entry && exit && "entry and exit must not be null!");
+
+  unsigned num_successors = succ_end(entry) - succ_begin(entry);
+
+  if (num_successors <= 1 && exit == *(succ_begin(entry)))
+    return true;
+
+  return false;
+}
+
+void RegionInfo::updateStatistics(Region *R) {
+  ++numRegions;
+
+  // TODO: Slow. Should only be enabled if -stats is used.
+  if (R->isSimple()) ++numSimpleRegions;
+}
+
+Region *RegionInfo::createRegion(BasicBlock *entry, BasicBlock *exit) {
+  assert(entry && exit && "entry and exit must not be null!");
+
+  if (isTrivialRegion(entry, exit))
+    return 0;
+
+  Region *region = new Region(entry, exit, this, DT);
+  BBtoRegion.insert(std::make_pair(entry, region));
+
+ #ifdef XDEBUG
+    region->verifyRegion();
+ #else
+    DEBUG(region->verifyRegion());
+ #endif
+
+  updateStatistics(region);
+  return region;
+}
+
+void RegionInfo::findRegionsWithEntry(BasicBlock *entry, BBtoBBMap *ShortCut) {
+  assert(entry);
+
+  DomTreeNode *N = PDT->getNode(entry);
+
+  if (!N)
+    return;
+
+  Region *lastRegion= 0;
+  BasicBlock *lastExit = entry;
+
+  // As only a BasicBlock that postdominates entry can finish a region, walk the
+  // post dominance tree upwards.
+  while ((N = getNextPostDom(N, ShortCut))) {
+    BasicBlock *exit = N->getBlock();
+
+    if (!exit)
+      break;
+
+    if (isRegion(entry, exit)) {
+      Region *newRegion = createRegion(entry, exit);
+
+      if (lastRegion)
+        newRegion->addSubRegion(lastRegion);
+
+      lastRegion = newRegion;
+      lastExit = exit;
+    }
+
+    // This can never be a region, so stop the search.
+    if (!DT->dominates(entry, exit))
+      break;
+  }
+
+  // Tried to create regions from entry to lastExit.  Next time take a
+  // shortcut from entry to lastExit.
+  if (lastExit != entry)
+    insertShortCut(entry, lastExit, ShortCut);
+}
+
+void RegionInfo::scanForRegions(Function &F, BBtoBBMap *ShortCut) {
+  BasicBlock *entry = &(F.getEntryBlock());
+  DomTreeNode *N = DT->getNode(entry);
+
+  // Iterate over the dominance tree in post order to start with the small
+  // regions from the bottom of the dominance tree.  If the small regions are
+  // detected first, detection of bigger regions is faster, as we can jump
+  // over the small regions.
+  for (po_iterator<DomTreeNode*> FI = po_begin(N), FE = po_end(N); FI != FE;
+    ++FI) {
+    findRegionsWithEntry(FI->getBlock(), ShortCut);
+  }
+}
+
+Region *RegionInfo::getTopMostParent(Region *region) {
+  while (region->parent)
+    region = region->getParent();
+
+  return region;
+}
+
+void RegionInfo::buildRegionsTree(DomTreeNode *N, Region *region) {
+  BasicBlock *BB = N->getBlock();
+
+  // Passed region exit
+  while (BB == region->getExit())
+    region = region->getParent();
+
+  BBtoRegionMap::iterator it = BBtoRegion.find(BB);
+
+  // This basic block is a start block of a region. It is already in the
+  // BBtoRegion relation. Only the child basic blocks have to be updated.
+  if (it != BBtoRegion.end()) {
+    Region *newRegion = it->second;
+    region->addSubRegion(getTopMostParent(newRegion));
+    region = newRegion;
+  } else {
+    BBtoRegion[BB] = region;
+  }
+
+  for (DomTreeNode::iterator CI = N->begin(), CE = N->end(); CI != CE; ++CI)
+    buildRegionsTree(*CI, region);
+}
+
+void RegionInfo::releaseMemory() {
+  BBtoRegion.clear();
+  if (TopLevelRegion)
+    delete TopLevelRegion;
+  TopLevelRegion = 0;
+}
+
+RegionInfo::RegionInfo() : FunctionPass(ID) {
+  initializeRegionInfoPass(*PassRegistry::getPassRegistry());
+  TopLevelRegion = 0;
+}
+
+RegionInfo::~RegionInfo() {
+  releaseMemory();
+}
+
+void RegionInfo::Calculate(Function &F) {
+  // ShortCut a function where for every BB the exit of the largest region
+  // starting with BB is stored. These regions can be threated as single BBS.
+  // This improves performance on linear CFGs.
+  BBtoBBMap ShortCut;
+
+  scanForRegions(F, &ShortCut);
+  BasicBlock *BB = &F.getEntryBlock();
+  buildRegionsTree(DT->getNode(BB), TopLevelRegion);
+}
+
+bool RegionInfo::runOnFunction(Function &F) {
+  releaseMemory();
+
+  DT = &getAnalysis<DominatorTree>();
+  PDT = &getAnalysis<PostDominatorTree>();
+  DF = &getAnalysis<DominanceFrontier>();
+
+  TopLevelRegion = new Region(&F.getEntryBlock(), 0, this, DT, 0);
+  updateStatistics(TopLevelRegion);
+
+  Calculate(F);
+
+  return false;
+}
+
+void RegionInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequiredTransitive<DominatorTree>();
+  AU.addRequired<PostDominatorTree>();
+  AU.addRequired<DominanceFrontier>();
+}
+
+void RegionInfo::print(raw_ostream &OS, const Module *) const {
+  OS << "Region tree:\n";
+  TopLevelRegion->print(OS, true, 0, printStyle.getValue());
+  OS << "End region tree\n";
+}
+
+void RegionInfo::verifyAnalysis() const {
+  // Only do verification when user wants to, otherwise this expensive check
+  // will be invoked by PMDataManager::verifyPreservedAnalysis when
+  // a regionpass (marked PreservedAll) finish.
+  if (!VerifyRegionInfo) return;
+
+  TopLevelRegion->verifyRegionNest();
+}
+
+// Region pass manager support.
+Region *RegionInfo::getRegionFor(BasicBlock *BB) const {
+  BBtoRegionMap::const_iterator I=
+    BBtoRegion.find(BB);
+  return I != BBtoRegion.end() ? I->second : 0;
+}
+
+void RegionInfo::setRegionFor(BasicBlock *BB, Region *R) {
+  BBtoRegion[BB] = R;
+}
+
+Region *RegionInfo::operator[](BasicBlock *BB) const {
+  return getRegionFor(BB);
+}
+
+BasicBlock *RegionInfo::getMaxRegionExit(BasicBlock *BB) const {
+  BasicBlock *Exit = NULL;
+
+  while (true) {
+    // Get largest region that starts at BB.
+    Region *R = getRegionFor(BB);
+    while (R && R->getParent() && R->getParent()->getEntry() == BB)
+      R = R->getParent();
+
+    // Get the single exit of BB.
+    if (R && R->getEntry() == BB)
+      Exit = R->getExit();
+    else if (++succ_begin(BB) == succ_end(BB))
+      Exit = *succ_begin(BB);
+    else // No single exit exists.
+      return Exit;
+
+    // Get largest region that starts at Exit.
+    Region *ExitR = getRegionFor(Exit);
+    while (ExitR && ExitR->getParent()
+           && ExitR->getParent()->getEntry() == Exit)
+      ExitR = ExitR->getParent();
+
+    for (pred_iterator PI = pred_begin(Exit), PE = pred_end(Exit); PI != PE;
+         ++PI)
+      if (!R->contains(*PI) && !ExitR->contains(*PI))
+        break;
+
+    // This stops infinite cycles.
+    if (DT->dominates(Exit, BB))
+      break;
+
+    BB = Exit;
+  }
+
+  return Exit;
+}
+
+Region*
+RegionInfo::getCommonRegion(Region *A, Region *B) const {
+  assert (A && B && "One of the Regions is NULL");
+
+  if (A->contains(B)) return A;
+
+  while (!B->contains(A))
+    B = B->getParent();
+
+  return B;
+}
+
+Region*
+RegionInfo::getCommonRegion(SmallVectorImpl<Region*> &Regions) const {
+  Region* ret = Regions.back();
+  Regions.pop_back();
+
+  for (SmallVectorImpl<Region*>::const_iterator I = Regions.begin(),
+       E = Regions.end(); I != E; ++I)
+      ret = getCommonRegion(ret, *I);
+
+  return ret;
+}
+
+Region*
+RegionInfo::getCommonRegion(SmallVectorImpl<BasicBlock*> &BBs) const {
+  Region* ret = getRegionFor(BBs.back());
+  BBs.pop_back();
+
+  for (SmallVectorImpl<BasicBlock*>::const_iterator I = BBs.begin(),
+       E = BBs.end(); I != E; ++I)
+      ret = getCommonRegion(ret, getRegionFor(*I));
+
+  return ret;
+}
+
+void RegionInfo::splitBlock(BasicBlock* NewBB, BasicBlock *OldBB)
+{
+  Region *R = getRegionFor(OldBB);
+
+  setRegionFor(NewBB, R);
+
+  while (R->getEntry() == OldBB && !R->isTopLevelRegion()) {
+    R->replaceEntry(NewBB);
+    R = R->getParent();
+  }
+
+  setRegionFor(OldBB, R);
+}
+
+char RegionInfo::ID = 0;
+INITIALIZE_PASS_BEGIN(RegionInfo, "regions",
+                "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(DominanceFrontier)
+INITIALIZE_PASS_END(RegionInfo, "regions",
+                "Detect single entry single exit regions", true, true)
+
+// Create methods available outside of this file, to use them
+// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
+// the link time optimization.
+
+namespace llvm {
+  FunctionPass *createRegionInfoPass() {
+    return new RegionInfo();
+  }
+}
+
diff --git a/contrib/llvm/lib/Analysis/RegionPass.cpp b/contrib/llvm/lib/Analysis/RegionPass.cpp
new file mode 100644
index 000000000000..9208fa21d7ec
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/RegionPass.cpp
@@ -0,0 +1,275 @@
+//===- RegionPass.cpp - Region Pass and Region Pass Manager ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements RegionPass and RGPassManager. All region optimization
+// and transformation passes are derived from RegionPass. RGPassManager is
+// responsible for managing RegionPasses.
+// most of these codes are COPY from LoopPass.cpp
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Support/Timer.h"
+
+#define DEBUG_TYPE "regionpassmgr"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// RGPassManager
+//
+
+char RGPassManager::ID = 0;
+
+RGPassManager::RGPassManager()
+  : FunctionPass(ID), PMDataManager() {
+  skipThisRegion = false;
+  redoThisRegion = false;
+  RI = NULL;
+  CurrentRegion = NULL;
+}
+
+// Recurse through all subregions and all regions  into RQ.
+static void addRegionIntoQueue(Region *R, std::deque<Region *> &RQ) {
+  RQ.push_back(R);
+  for (Region::iterator I = R->begin(), E = R->end(); I != E; ++I)
+    addRegionIntoQueue(*I, RQ);
+}
+
+/// Pass Manager itself does not invalidate any analysis info.
+void RGPassManager::getAnalysisUsage(AnalysisUsage &Info) const {
+  Info.addRequired<RegionInfo>();
+  Info.setPreservesAll();
+}
+
+/// run - Execute all of the passes scheduled for execution.  Keep track of
+/// whether any of the passes modifies the function, and if so, return true.
+bool RGPassManager::runOnFunction(Function &F) {
+  RI = &getAnalysis<RegionInfo>();
+  bool Changed = false;
+
+  // Collect inherited analysis from Module level pass manager.
+  populateInheritedAnalysis(TPM->activeStack);
+
+  addRegionIntoQueue(RI->getTopLevelRegion(), RQ);
+
+  if (RQ.empty()) // No regions, skip calling finalizers
+    return false;
+
+  // Initialization
+  for (std::deque<Region *>::const_iterator I = RQ.begin(), E = RQ.end();
+       I != E; ++I) {
+    Region *R = *I;
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      RegionPass *RP = (RegionPass *)getContainedPass(Index);
+      Changed |= RP->doInitialization(R, *this);
+    }
+  }
+
+  // Walk Regions
+  while (!RQ.empty()) {
+
+    CurrentRegion  = RQ.back();
+    skipThisRegion = false;
+    redoThisRegion = false;
+
+    // Run all passes on the current Region.
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      RegionPass *P = (RegionPass*)getContainedPass(Index);
+
+      dumpPassInfo(P, EXECUTION_MSG, ON_REGION_MSG,
+                   CurrentRegion->getNameStr());
+      dumpRequiredSet(P);
+
+      initializeAnalysisImpl(P);
+
+      {
+        PassManagerPrettyStackEntry X(P, *CurrentRegion->getEntry());
+
+        TimeRegion PassTimer(getPassTimer(P));
+        Changed |= P->runOnRegion(CurrentRegion, *this);
+      }
+
+      if (Changed)
+        dumpPassInfo(P, MODIFICATION_MSG, ON_REGION_MSG,
+                     skipThisRegion ? "<deleted>" :
+                                    CurrentRegion->getNameStr());
+      dumpPreservedSet(P);
+
+      if (!skipThisRegion) {
+        // Manually check that this region is still healthy. This is done
+        // instead of relying on RegionInfo::verifyRegion since RegionInfo
+        // is a function pass and it's really expensive to verify every
+        // Region in the function every time. That level of checking can be
+        // enabled with the -verify-region-info option.
+        {
+          TimeRegion PassTimer(getPassTimer(P));
+          CurrentRegion->verifyRegion();
+        }
+
+        // Then call the regular verifyAnalysis functions.
+        verifyPreservedAnalysis(P);
+      }
+
+      removeNotPreservedAnalysis(P);
+      recordAvailableAnalysis(P);
+      removeDeadPasses(P,
+                       skipThisRegion ? "<deleted>" :
+                                      CurrentRegion->getNameStr(),
+                       ON_REGION_MSG);
+
+      if (skipThisRegion)
+        // Do not run other passes on this region.
+        break;
+    }
+
+    // If the region was deleted, release all the region passes. This frees up
+    // some memory, and avoids trouble with the pass manager trying to call
+    // verifyAnalysis on them.
+    if (skipThisRegion)
+      for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+        Pass *P = getContainedPass(Index);
+        freePass(P, "<deleted>", ON_REGION_MSG);
+      }
+
+    // Pop the region from queue after running all passes.
+    RQ.pop_back();
+
+    if (redoThisRegion)
+      RQ.push_back(CurrentRegion);
+
+    // Free all region nodes created in region passes.
+    RI->clearNodeCache();
+  }
+
+  // Finalization
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    RegionPass *P = (RegionPass*)getContainedPass(Index);
+    Changed |= P->doFinalization();
+  }
+
+  // Print the region tree after all pass.
+  DEBUG(
+    dbgs() << "\nRegion tree of function " << F.getName()
+           << " after all region Pass:\n";
+    RI->dump();
+    dbgs() << "\n";
+    );
+
+  return Changed;
+}
+
+/// Print passes managed by this manager
+void RGPassManager::dumpPassStructure(unsigned Offset) {
+  errs().indent(Offset*2) << "Region Pass Manager\n";
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    Pass *P = getContainedPass(Index);
+    P->dumpPassStructure(Offset + 1);
+    dumpLastUses(P, Offset+1);
+  }
+}
+
+namespace {
+//===----------------------------------------------------------------------===//
+// PrintRegionPass
+class PrintRegionPass : public RegionPass {
+private:
+  std::string Banner;
+  raw_ostream &Out;       // raw_ostream to print on.
+
+public:
+  static char ID;
+  PrintRegionPass() : RegionPass(ID), Out(dbgs()) {}
+  PrintRegionPass(const std::string &B, raw_ostream &o)
+      : RegionPass(ID), Banner(B), Out(o) {}
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+  }
+
+  virtual bool runOnRegion(Region *R, RGPassManager &RGM) {
+    Out << Banner;
+    for (Region::block_iterator I = R->block_begin(), E = R->block_end();
+         I != E; ++I)
+      (*I)->print(Out);
+
+    return false;
+  }
+};
+
+char PrintRegionPass::ID = 0;
+}  //end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// RegionPass
+
+// Check if this pass is suitable for the current RGPassManager, if
+// available. This pass P is not suitable for a RGPassManager if P
+// is not preserving higher level analysis info used by other
+// RGPassManager passes. In such case, pop RGPassManager from the
+// stack. This will force assignPassManager() to create new
+// LPPassManger as expected.
+void RegionPass::preparePassManager(PMStack &PMS) {
+
+  // Find RGPassManager
+  while (!PMS.empty() &&
+         PMS.top()->getPassManagerType() > PMT_RegionPassManager)
+    PMS.pop();
+
+
+  // If this pass is destroying high level information that is used
+  // by other passes that are managed by LPM then do not insert
+  // this pass in current LPM. Use new RGPassManager.
+  if (PMS.top()->getPassManagerType() == PMT_RegionPassManager &&
+    !PMS.top()->preserveHigherLevelAnalysis(this))
+    PMS.pop();
+}
+
+/// Assign pass manager to manage this pass.
+void RegionPass::assignPassManager(PMStack &PMS,
+                                 PassManagerType PreferredType) {
+  // Find RGPassManager
+  while (!PMS.empty() &&
+         PMS.top()->getPassManagerType() > PMT_RegionPassManager)
+    PMS.pop();
+
+  RGPassManager *RGPM;
+
+  // Create new Region Pass Manager if it does not exist.
+  if (PMS.top()->getPassManagerType() == PMT_RegionPassManager)
+    RGPM = (RGPassManager*)PMS.top();
+  else {
+
+    assert (!PMS.empty() && "Unable to create Region Pass Manager");
+    PMDataManager *PMD = PMS.top();
+
+    // [1] Create new Region Pass Manager
+    RGPM = new RGPassManager();
+    RGPM->populateInheritedAnalysis(PMS);
+
+    // [2] Set up new manager's top level manager
+    PMTopLevelManager *TPM = PMD->getTopLevelManager();
+    TPM->addIndirectPassManager(RGPM);
+
+    // [3] Assign manager to manage this new manager. This may create
+    // and push new managers into PMS
+    TPM->schedulePass(RGPM);
+
+    // [4] Push new manager into PMS
+    PMS.push(RGPM);
+  }
+
+  RGPM->add(this);
+}
+
+/// Get the printer pass
+Pass *RegionPass::createPrinterPass(raw_ostream &O,
+                                  const std::string &Banner) const {
+  return new PrintRegionPass(Banner, O);
+}
diff --git a/contrib/llvm/lib/Analysis/RegionPrinter.cpp b/contrib/llvm/lib/Analysis/RegionPrinter.cpp
new file mode 100644
index 000000000000..c5f1b925921b
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/RegionPrinter.cpp
@@ -0,0 +1,218 @@
+//===- RegionPrinter.cpp - Print regions tree pass ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Print out the region tree of a function using dotty/graphviz.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DOTGraphTraitsPass.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPrinter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+/// onlySimpleRegion - Show only the simple regions in the RegionViewer.
+static cl::opt<bool>
+onlySimpleRegions("only-simple-regions",
+                  cl::desc("Show only simple regions in the graphviz viewer"),
+                  cl::Hidden,
+                  cl::init(false));
+
+namespace llvm {
+template<>
+struct DOTGraphTraits<RegionNode*> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits (bool isSimple=false)
+    : DefaultDOTGraphTraits(isSimple) {}
+
+  std::string getNodeLabel(RegionNode *Node, RegionNode *Graph) {
+
+    if (!Node->isSubRegion()) {
+      BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+
+      if (isSimple())
+        return DOTGraphTraits<const Function*>
+          ::getSimpleNodeLabel(BB, BB->getParent());
+      else
+        return DOTGraphTraits<const Function*>
+          ::getCompleteNodeLabel(BB, BB->getParent());
+    }
+
+    return "Not implemented";
+  }
+};
+
+template<>
+struct DOTGraphTraits<RegionInfo*> : public DOTGraphTraits<RegionNode*> {
+
+  DOTGraphTraits (bool isSimple=false)
+    : DOTGraphTraits<RegionNode*>(isSimple) {}
+
+  static std::string getGraphName(RegionInfo *DT) {
+    return "Region Graph";
+  }
+
+  std::string getNodeLabel(RegionNode *Node, RegionInfo *G) {
+    return DOTGraphTraits<RegionNode*>::getNodeLabel(Node,
+                                                     G->getTopLevelRegion());
+  }
+
+  std::string getEdgeAttributes(RegionNode *srcNode,
+    GraphTraits<RegionInfo*>::ChildIteratorType CI, RegionInfo *RI) {
+
+    RegionNode *destNode = *CI;
+
+    if (srcNode->isSubRegion() || destNode->isSubRegion())
+      return "";
+
+    // In case of a backedge, do not use it to define the layout of the nodes.
+    BasicBlock *srcBB = srcNode->getNodeAs<BasicBlock>();
+    BasicBlock *destBB = destNode->getNodeAs<BasicBlock>();
+
+    Region *R = RI->getRegionFor(destBB);
+
+    while (R && R->getParent())
+      if (R->getParent()->getEntry() == destBB)
+        R = R->getParent();
+      else
+        break;
+
+    if (R->getEntry() == destBB && R->contains(srcBB))
+      return "constraint=false";
+
+    return "";
+  }
+
+  // Print the cluster of the subregions. This groups the single basic blocks
+  // and adds a different background color for each group.
+  static void printRegionCluster(const Region *R, GraphWriter<RegionInfo*> &GW,
+                                 unsigned depth = 0) {
+    raw_ostream &O = GW.getOStream();
+    O.indent(2 * depth) << "subgraph cluster_" << static_cast<const void*>(R)
+      << " {\n";
+    O.indent(2 * (depth + 1)) << "label = \"\";\n";
+
+    if (!onlySimpleRegions || R->isSimple()) {
+      O.indent(2 * (depth + 1)) << "style = filled;\n";
+      O.indent(2 * (depth + 1)) << "color = "
+        << ((R->getDepth() * 2 % 12) + 1) << "\n";
+
+    } else {
+      O.indent(2 * (depth + 1)) << "style = solid;\n";
+      O.indent(2 * (depth + 1)) << "color = "
+        << ((R->getDepth() * 2 % 12) + 2) << "\n";
+    }
+
+    for (Region::const_iterator RI = R->begin(), RE = R->end(); RI != RE; ++RI)
+      printRegionCluster(*RI, GW, depth + 1);
+
+    RegionInfo *RI = R->getRegionInfo();
+
+    for (Region::const_block_iterator BI = R->block_begin(),
+         BE = R->block_end(); BI != BE; ++BI)
+      if (RI->getRegionFor(*BI) == R)
+        O.indent(2 * (depth + 1)) << "Node"
+          << static_cast<const void*>(RI->getTopLevelRegion()->getBBNode(*BI))
+          << ";\n";
+
+    O.indent(2 * depth) << "}\n";
+  }
+
+  static void addCustomGraphFeatures(const RegionInfo* RI,
+                                     GraphWriter<RegionInfo*> &GW) {
+    raw_ostream &O = GW.getOStream();
+    O << "\tcolorscheme = \"paired12\"\n";
+    printRegionCluster(RI->getTopLevelRegion(), GW, 4);
+  }
+};
+} //end namespace llvm
+
+namespace {
+
+struct RegionViewer
+  : public DOTGraphTraitsViewer<RegionInfo, false> {
+  static char ID;
+  RegionViewer() : DOTGraphTraitsViewer<RegionInfo, false>("reg", ID){
+    initializeRegionViewerPass(*PassRegistry::getPassRegistry());
+  }
+};
+char RegionViewer::ID = 0;
+
+struct RegionOnlyViewer
+  : public DOTGraphTraitsViewer<RegionInfo, true> {
+  static char ID;
+  RegionOnlyViewer() : DOTGraphTraitsViewer<RegionInfo, true>("regonly", ID) {
+    initializeRegionOnlyViewerPass(*PassRegistry::getPassRegistry());
+  }
+};
+char RegionOnlyViewer::ID = 0;
+
+struct RegionPrinter
+  : public DOTGraphTraitsPrinter<RegionInfo, false> {
+  static char ID;
+  RegionPrinter() :
+    DOTGraphTraitsPrinter<RegionInfo, false>("reg", ID) {
+      initializeRegionPrinterPass(*PassRegistry::getPassRegistry());
+    }
+};
+char RegionPrinter::ID = 0;
+} //end anonymous namespace
+
+INITIALIZE_PASS(RegionPrinter, "dot-regions",
+                "Print regions of function to 'dot' file", true, true)
+
+INITIALIZE_PASS(RegionViewer, "view-regions", "View regions of function",
+                true, true)
+                
+INITIALIZE_PASS(RegionOnlyViewer, "view-regions-only",
+                "View regions of function (with no function bodies)",
+                true, true)
+
+namespace {
+
+struct RegionOnlyPrinter
+  : public DOTGraphTraitsPrinter<RegionInfo, true> {
+  static char ID;
+  RegionOnlyPrinter() :
+    DOTGraphTraitsPrinter<RegionInfo, true>("reg", ID) {
+      initializeRegionOnlyPrinterPass(*PassRegistry::getPassRegistry());
+    }
+};
+
+}
+
+char RegionOnlyPrinter::ID = 0;
+INITIALIZE_PASS(RegionOnlyPrinter, "dot-regions-only",
+                "Print regions of function to 'dot' file "
+                "(with no function bodies)",
+                true, true)
+
+FunctionPass* llvm::createRegionViewerPass() {
+  return new RegionViewer();
+}
+
+FunctionPass* llvm::createRegionOnlyViewerPass() {
+  return new RegionOnlyViewer();
+}
+
+FunctionPass* llvm::createRegionPrinterPass() {
+  return new RegionPrinter();
+}
+
+FunctionPass* llvm::createRegionOnlyPrinterPass() {
+  return new RegionOnlyPrinter();
+}
+
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolution.cpp b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp
new file mode 100644
index 000000000000..6ea915fdb0b7
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -0,0 +1,7057 @@
+//===- ScalarEvolution.cpp - Scalar Evolution Analysis ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the scalar evolution analysis
+// engine, which is used primarily to analyze expressions involving induction
+// variables in loops.
+//
+// There are several aspects to this library.  First is the representation of
+// scalar expressions, which are represented as subclasses of the SCEV class.
+// These classes are used to represent certain types of subexpressions that we
+// can handle. We only create one SCEV of a particular shape, so
+// pointer-comparisons for equality are legal.
+//
+// One important aspect of the SCEV objects is that they are never cyclic, even
+// if there is a cycle in the dataflow for an expression (ie, a PHI node).  If
+// the PHI node is one of the idioms that we can represent (e.g., a polynomial
+// recurrence) then we represent it directly as a recurrence node, otherwise we
+// represent it as a SCEVUnknown node.
+//
+// In addition to being able to represent expressions of various types, we also
+// have folders that are used to build the *canonical* representation for a
+// particular expression.  These folders are capable of using a variety of
+// rewrite rules to simplify the expressions.
+//
+// Once the folders are defined, we can implement the more interesting
+// higher-level code, such as the code that recognizes PHI nodes of various
+// types, computes the execution count of a loop, etc.
+//
+// TODO: We should use these routines and value representations to implement
+// dependence analysis!
+//
+//===----------------------------------------------------------------------===//
+//
+// There are several good references for the techniques used in this analysis.
+//
+//  Chains of recurrences -- a method to expedite the evaluation
+//  of closed-form functions
+//  Olaf Bachmann, Paul S. Wang, Eugene V. Zima
+//
+//  On computational properties of chains of recurrences
+//  Eugene V. Zima
+//
+//  Symbolic Evaluation of Chains of Recurrences for Loop Optimization
+//  Robert A. van Engelen
+//
+//  Efficient Symbolic Analysis for Optimizing Compilers
+//  Robert A. van Engelen
+//
+//  Using the chains of recurrences algebra for data dependence testing and
+//  induction variable substitution
+//  MS Thesis, Johnie Birch
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scalar-evolution"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumArrayLenItCounts,
+          "Number of trip counts computed with array length");
+STATISTIC(NumTripCountsComputed,
+          "Number of loops with predictable loop counts");
+STATISTIC(NumTripCountsNotComputed,
+          "Number of loops without predictable loop counts");
+STATISTIC(NumBruteForceTripCountsComputed,
+          "Number of loops with trip counts computed by force");
+
+static cl::opt<unsigned>
+MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden,
+                        cl::desc("Maximum number of iterations SCEV will "
+                                 "symbolically execute a constant "
+                                 "derived loop"),
+                        cl::init(100));
+
+// FIXME: Enable this with XDEBUG when the test suite is clean.
+static cl::opt<bool>
+VerifySCEV("verify-scev",
+           cl::desc("Verify ScalarEvolution's backedge taken counts (slow)"));
+
+INITIALIZE_PASS_BEGIN(ScalarEvolution, "scalar-evolution",
+                "Scalar Evolution Analysis", false, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(ScalarEvolution, "scalar-evolution",
+                "Scalar Evolution Analysis", false, true)
+char ScalarEvolution::ID = 0;
+
+//===----------------------------------------------------------------------===//
+//                           SCEV class definitions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Implementation of the SCEV class.
+//
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void SCEV::dump() const {
+  print(dbgs());
+  dbgs() << '\n';
+}
+#endif
+
+void SCEV::print(raw_ostream &OS) const {
+  switch (getSCEVType()) {
+  case scConstant:
+    WriteAsOperand(OS, cast<SCEVConstant>(this)->getValue(), false);
+    return;
+  case scTruncate: {
+    const SCEVTruncateExpr *Trunc = cast<SCEVTruncateExpr>(this);
+    const SCEV *Op = Trunc->getOperand();
+    OS << "(trunc " << *Op->getType() << " " << *Op << " to "
+       << *Trunc->getType() << ")";
+    return;
+  }
+  case scZeroExtend: {
+    const SCEVZeroExtendExpr *ZExt = cast<SCEVZeroExtendExpr>(this);
+    const SCEV *Op = ZExt->getOperand();
+    OS << "(zext " << *Op->getType() << " " << *Op << " to "
+       << *ZExt->getType() << ")";
+    return;
+  }
+  case scSignExtend: {
+    const SCEVSignExtendExpr *SExt = cast<SCEVSignExtendExpr>(this);
+    const SCEV *Op = SExt->getOperand();
+    OS << "(sext " << *Op->getType() << " " << *Op << " to "
+       << *SExt->getType() << ")";
+    return;
+  }
+  case scAddRecExpr: {
+    const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(this);
+    OS << "{" << *AR->getOperand(0);
+    for (unsigned i = 1, e = AR->getNumOperands(); i != e; ++i)
+      OS << ",+," << *AR->getOperand(i);
+    OS << "}<";
+    if (AR->getNoWrapFlags(FlagNUW))
+      OS << "nuw><";
+    if (AR->getNoWrapFlags(FlagNSW))
+      OS << "nsw><";
+    if (AR->getNoWrapFlags(FlagNW) &&
+        !AR->getNoWrapFlags((NoWrapFlags)(FlagNUW | FlagNSW)))
+      OS << "nw><";
+    WriteAsOperand(OS, AR->getLoop()->getHeader(), /*PrintType=*/false);
+    OS << ">";
+    return;
+  }
+  case scAddExpr:
+  case scMulExpr:
+  case scUMaxExpr:
+  case scSMaxExpr: {
+    const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(this);
+    const char *OpStr = 0;
+    switch (NAry->getSCEVType()) {
+    case scAddExpr: OpStr = " + "; break;
+    case scMulExpr: OpStr = " * "; break;
+    case scUMaxExpr: OpStr = " umax "; break;
+    case scSMaxExpr: OpStr = " smax "; break;
+    }
+    OS << "(";
+    for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
+         I != E; ++I) {
+      OS << **I;
+      if (llvm::next(I) != E)
+        OS << OpStr;
+    }
+    OS << ")";
+    switch (NAry->getSCEVType()) {
+    case scAddExpr:
+    case scMulExpr:
+      if (NAry->getNoWrapFlags(FlagNUW))
+        OS << "<nuw>";
+      if (NAry->getNoWrapFlags(FlagNSW))
+        OS << "<nsw>";
+    }
+    return;
+  }
+  case scUDivExpr: {
+    const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(this);
+    OS << "(" << *UDiv->getLHS() << " /u " << *UDiv->getRHS() << ")";
+    return;
+  }
+  case scUnknown: {
+    const SCEVUnknown *U = cast<SCEVUnknown>(this);
+    Type *AllocTy;
+    if (U->isSizeOf(AllocTy)) {
+      OS << "sizeof(" << *AllocTy << ")";
+      return;
+    }
+    if (U->isAlignOf(AllocTy)) {
+      OS << "alignof(" << *AllocTy << ")";
+      return;
+    }
+
+    Type *CTy;
+    Constant *FieldNo;
+    if (U->isOffsetOf(CTy, FieldNo)) {
+      OS << "offsetof(" << *CTy << ", ";
+      WriteAsOperand(OS, FieldNo, false);
+      OS << ")";
+      return;
+    }
+
+    // Otherwise just print it normally.
+    WriteAsOperand(OS, U->getValue(), false);
+    return;
+  }
+  case scCouldNotCompute:
+    OS << "***COULDNOTCOMPUTE***";
+    return;
+  default: break;
+  }
+  llvm_unreachable("Unknown SCEV kind!");
+}
+
+Type *SCEV::getType() const {
+  switch (getSCEVType()) {
+  case scConstant:
+    return cast<SCEVConstant>(this)->getType();
+  case scTruncate:
+  case scZeroExtend:
+  case scSignExtend:
+    return cast<SCEVCastExpr>(this)->getType();
+  case scAddRecExpr:
+  case scMulExpr:
+  case scUMaxExpr:
+  case scSMaxExpr:
+    return cast<SCEVNAryExpr>(this)->getType();
+  case scAddExpr:
+    return cast<SCEVAddExpr>(this)->getType();
+  case scUDivExpr:
+    return cast<SCEVUDivExpr>(this)->getType();
+  case scUnknown:
+    return cast<SCEVUnknown>(this)->getType();
+  case scCouldNotCompute:
+    llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
+  default:
+    llvm_unreachable("Unknown SCEV kind!");
+  }
+}
+
+bool SCEV::isZero() const {
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(this))
+    return SC->getValue()->isZero();
+  return false;
+}
+
+bool SCEV::isOne() const {
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(this))
+    return SC->getValue()->isOne();
+  return false;
+}
+
+bool SCEV::isAllOnesValue() const {
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(this))
+    return SC->getValue()->isAllOnesValue();
+  return false;
+}
+
+/// isNonConstantNegative - Return true if the specified scev is negated, but
+/// not a constant.
+bool SCEV::isNonConstantNegative() const {
+  const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(this);
+  if (!Mul) return false;
+
+  // If there is a constant factor, it will be first.
+  const SCEVConstant *SC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
+  if (!SC) return false;
+
+  // Return true if the value is negative, this matches things like (-42 * V).
+  return SC->getValue()->getValue().isNegative();
+}
+
+SCEVCouldNotCompute::SCEVCouldNotCompute() :
+  SCEV(FoldingSetNodeIDRef(), scCouldNotCompute) {}
+
+bool SCEVCouldNotCompute::classof(const SCEV *S) {
+  return S->getSCEVType() == scCouldNotCompute;
+}
+
+const SCEV *ScalarEvolution::getConstant(ConstantInt *V) {
+  FoldingSetNodeID ID;
+  ID.AddInteger(scConstant);
+  ID.AddPointer(V);
+  void *IP = 0;
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+  SCEV *S = new (SCEVAllocator) SCEVConstant(ID.Intern(SCEVAllocator), V);
+  UniqueSCEVs.InsertNode(S, IP);
+  return S;
+}
+
+const SCEV *ScalarEvolution::getConstant(const APInt& Val) {
+  return getConstant(ConstantInt::get(getContext(), Val));
+}
+
+const SCEV *
+ScalarEvolution::getConstant(Type *Ty, uint64_t V, bool isSigned) {
+  IntegerType *ITy = cast<IntegerType>(getEffectiveSCEVType(Ty));
+  return getConstant(ConstantInt::get(ITy, V, isSigned));
+}
+
+SCEVCastExpr::SCEVCastExpr(const FoldingSetNodeIDRef ID,
+                           unsigned SCEVTy, const SCEV *op, Type *ty)
+  : SCEV(ID, SCEVTy), Op(op), Ty(ty) {}
+
+SCEVTruncateExpr::SCEVTruncateExpr(const FoldingSetNodeIDRef ID,
+                                   const SCEV *op, Type *ty)
+  : SCEVCastExpr(ID, scTruncate, op, ty) {
+  assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
+         (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Cannot truncate non-integer value!");
+}
+
+SCEVZeroExtendExpr::SCEVZeroExtendExpr(const FoldingSetNodeIDRef ID,
+                                       const SCEV *op, Type *ty)
+  : SCEVCastExpr(ID, scZeroExtend, op, ty) {
+  assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
+         (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Cannot zero extend non-integer value!");
+}
+
+SCEVSignExtendExpr::SCEVSignExtendExpr(const FoldingSetNodeIDRef ID,
+                                       const SCEV *op, Type *ty)
+  : SCEVCastExpr(ID, scSignExtend, op, ty) {
+  assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
+         (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Cannot sign extend non-integer value!");
+}
+
+void SCEVUnknown::deleted() {
+  // Clear this SCEVUnknown from various maps.
+  SE->forgetMemoizedResults(this);
+
+  // Remove this SCEVUnknown from the uniquing map.
+  SE->UniqueSCEVs.RemoveNode(this);
+
+  // Release the value.
+  setValPtr(0);
+}
+
+void SCEVUnknown::allUsesReplacedWith(Value *New) {
+  // Clear this SCEVUnknown from various maps.
+  SE->forgetMemoizedResults(this);
+
+  // Remove this SCEVUnknown from the uniquing map.
+  SE->UniqueSCEVs.RemoveNode(this);
+
+  // Update this SCEVUnknown to point to the new value. This is needed
+  // because there may still be outstanding SCEVs which still point to
+  // this SCEVUnknown.
+  setValPtr(New);
+}
+
+bool SCEVUnknown::isSizeOf(Type *&AllocTy) const {
+  if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
+    if (VCE->getOpcode() == Instruction::PtrToInt)
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
+        if (CE->getOpcode() == Instruction::GetElementPtr &&
+            CE->getOperand(0)->isNullValue() &&
+            CE->getNumOperands() == 2)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(1)))
+            if (CI->isOne()) {
+              AllocTy = cast<PointerType>(CE->getOperand(0)->getType())
+                                 ->getElementType();
+              return true;
+            }
+
+  return false;
+}
+
+bool SCEVUnknown::isAlignOf(Type *&AllocTy) const {
+  if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
+    if (VCE->getOpcode() == Instruction::PtrToInt)
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
+        if (CE->getOpcode() == Instruction::GetElementPtr &&
+            CE->getOperand(0)->isNullValue()) {
+          Type *Ty =
+            cast<PointerType>(CE->getOperand(0)->getType())->getElementType();
+          if (StructType *STy = dyn_cast<StructType>(Ty))
+            if (!STy->isPacked() &&
+                CE->getNumOperands() == 3 &&
+                CE->getOperand(1)->isNullValue()) {
+              if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(2)))
+                if (CI->isOne() &&
+                    STy->getNumElements() == 2 &&
+                    STy->getElementType(0)->isIntegerTy(1)) {
+                  AllocTy = STy->getElementType(1);
+                  return true;
+                }
+            }
+        }
+
+  return false;
+}
+
+bool SCEVUnknown::isOffsetOf(Type *&CTy, Constant *&FieldNo) const {
+  if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
+    if (VCE->getOpcode() == Instruction::PtrToInt)
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
+        if (CE->getOpcode() == Instruction::GetElementPtr &&
+            CE->getNumOperands() == 3 &&
+            CE->getOperand(0)->isNullValue() &&
+            CE->getOperand(1)->isNullValue()) {
+          Type *Ty =
+            cast<PointerType>(CE->getOperand(0)->getType())->getElementType();
+          // Ignore vector types here so that ScalarEvolutionExpander doesn't
+          // emit getelementptrs that index into vectors.
+          if (Ty->isStructTy() || Ty->isArrayTy()) {
+            CTy = Ty;
+            FieldNo = CE->getOperand(2);
+            return true;
+          }
+        }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                               SCEV Utilities
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// SCEVComplexityCompare - Return true if the complexity of the LHS is less
+  /// than the complexity of the RHS.  This comparator is used to canonicalize
+  /// expressions.
+  class SCEVComplexityCompare {
+    const LoopInfo *const LI;
+  public:
+    explicit SCEVComplexityCompare(const LoopInfo *li) : LI(li) {}
+
+    // Return true or false if LHS is less than, or at least RHS, respectively.
+    bool operator()(const SCEV *LHS, const SCEV *RHS) const {
+      return compare(LHS, RHS) < 0;
+    }
+
+    // Return negative, zero, or positive, if LHS is less than, equal to, or
+    // greater than RHS, respectively. A three-way result allows recursive
+    // comparisons to be more efficient.
+    int compare(const SCEV *LHS, const SCEV *RHS) const {
+      // Fast-path: SCEVs are uniqued so we can do a quick equality check.
+      if (LHS == RHS)
+        return 0;
+
+      // Primarily, sort the SCEVs by their getSCEVType().
+      unsigned LType = LHS->getSCEVType(), RType = RHS->getSCEVType();
+      if (LType != RType)
+        return (int)LType - (int)RType;
+
+      // Aside from the getSCEVType() ordering, the particular ordering
+      // isn't very important except that it's beneficial to be consistent,
+      // so that (a + b) and (b + a) don't end up as different expressions.
+      switch (LType) {
+      case scUnknown: {
+        const SCEVUnknown *LU = cast<SCEVUnknown>(LHS);
+        const SCEVUnknown *RU = cast<SCEVUnknown>(RHS);
+
+        // Sort SCEVUnknown values with some loose heuristics. TODO: This is
+        // not as complete as it could be.
+        const Value *LV = LU->getValue(), *RV = RU->getValue();
+
+        // Order pointer values after integer values. This helps SCEVExpander
+        // form GEPs.
+        bool LIsPointer = LV->getType()->isPointerTy(),
+             RIsPointer = RV->getType()->isPointerTy();
+        if (LIsPointer != RIsPointer)
+          return (int)LIsPointer - (int)RIsPointer;
+
+        // Compare getValueID values.
+        unsigned LID = LV->getValueID(),
+                 RID = RV->getValueID();
+        if (LID != RID)
+          return (int)LID - (int)RID;
+
+        // Sort arguments by their position.
+        if (const Argument *LA = dyn_cast<Argument>(LV)) {
+          const Argument *RA = cast<Argument>(RV);
+          unsigned LArgNo = LA->getArgNo(), RArgNo = RA->getArgNo();
+          return (int)LArgNo - (int)RArgNo;
+        }
+
+        // For instructions, compare their loop depth, and their operand
+        // count.  This is pretty loose.
+        if (const Instruction *LInst = dyn_cast<Instruction>(LV)) {
+          const Instruction *RInst = cast<Instruction>(RV);
+
+          // Compare loop depths.
+          const BasicBlock *LParent = LInst->getParent(),
+                           *RParent = RInst->getParent();
+          if (LParent != RParent) {
+            unsigned LDepth = LI->getLoopDepth(LParent),
+                     RDepth = LI->getLoopDepth(RParent);
+            if (LDepth != RDepth)
+              return (int)LDepth - (int)RDepth;
+          }
+
+          // Compare the number of operands.
+          unsigned LNumOps = LInst->getNumOperands(),
+                   RNumOps = RInst->getNumOperands();
+          return (int)LNumOps - (int)RNumOps;
+        }
+
+        return 0;
+      }
+
+      case scConstant: {
+        const SCEVConstant *LC = cast<SCEVConstant>(LHS);
+        const SCEVConstant *RC = cast<SCEVConstant>(RHS);
+
+        // Compare constant values.
+        const APInt &LA = LC->getValue()->getValue();
+        const APInt &RA = RC->getValue()->getValue();
+        unsigned LBitWidth = LA.getBitWidth(), RBitWidth = RA.getBitWidth();
+        if (LBitWidth != RBitWidth)
+          return (int)LBitWidth - (int)RBitWidth;
+        return LA.ult(RA) ? -1 : 1;
+      }
+
+      case scAddRecExpr: {
+        const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS);
+        const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS);
+
+        // Compare addrec loop depths.
+        const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop();
+        if (LLoop != RLoop) {
+          unsigned LDepth = LLoop->getLoopDepth(),
+                   RDepth = RLoop->getLoopDepth();
+          if (LDepth != RDepth)
+            return (int)LDepth - (int)RDepth;
+        }
+
+        // Addrec complexity grows with operand count.
+        unsigned LNumOps = LA->getNumOperands(), RNumOps = RA->getNumOperands();
+        if (LNumOps != RNumOps)
+          return (int)LNumOps - (int)RNumOps;
+
+        // Lexicographically compare.
+        for (unsigned i = 0; i != LNumOps; ++i) {
+          long X = compare(LA->getOperand(i), RA->getOperand(i));
+          if (X != 0)
+            return X;
+        }
+
+        return 0;
+      }
+
+      case scAddExpr:
+      case scMulExpr:
+      case scSMaxExpr:
+      case scUMaxExpr: {
+        const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS);
+        const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS);
+
+        // Lexicographically compare n-ary expressions.
+        unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands();
+        for (unsigned i = 0; i != LNumOps; ++i) {
+          if (i >= RNumOps)
+            return 1;
+          long X = compare(LC->getOperand(i), RC->getOperand(i));
+          if (X != 0)
+            return X;
+        }
+        return (int)LNumOps - (int)RNumOps;
+      }
+
+      case scUDivExpr: {
+        const SCEVUDivExpr *LC = cast<SCEVUDivExpr>(LHS);
+        const SCEVUDivExpr *RC = cast<SCEVUDivExpr>(RHS);
+
+        // Lexicographically compare udiv expressions.
+        long X = compare(LC->getLHS(), RC->getLHS());
+        if (X != 0)
+          return X;
+        return compare(LC->getRHS(), RC->getRHS());
+      }
+
+      case scTruncate:
+      case scZeroExtend:
+      case scSignExtend: {
+        const SCEVCastExpr *LC = cast<SCEVCastExpr>(LHS);
+        const SCEVCastExpr *RC = cast<SCEVCastExpr>(RHS);
+
+        // Compare cast expressions by operand.
+        return compare(LC->getOperand(), RC->getOperand());
+      }
+
+      default:
+        llvm_unreachable("Unknown SCEV kind!");
+      }
+    }
+  };
+}
+
+/// GroupByComplexity - Given a list of SCEV objects, order them by their
+/// complexity, and group objects of the same complexity together by value.
+/// When this routine is finished, we know that any duplicates in the vector are
+/// consecutive and that complexity is monotonically increasing.
+///
+/// Note that we go take special precautions to ensure that we get deterministic
+/// results from this routine.  In other words, we don't want the results of
+/// this to depend on where the addresses of various SCEV objects happened to
+/// land in memory.
+///
+static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
+                              LoopInfo *LI) {
+  if (Ops.size() < 2) return;  // Noop
+  if (Ops.size() == 2) {
+    // This is the common case, which also happens to be trivially simple.
+    // Special case it.
+    const SCEV *&LHS = Ops[0], *&RHS = Ops[1];
+    if (SCEVComplexityCompare(LI)(RHS, LHS))
+      std::swap(LHS, RHS);
+    return;
+  }
+
+  // Do the rough sort by complexity.
+  std::stable_sort(Ops.begin(), Ops.end(), SCEVComplexityCompare(LI));
+
+  // Now that we are sorted by complexity, group elements of the same
+  // complexity.  Note that this is, at worst, N^2, but the vector is likely to
+  // be extremely short in practice.  Note that we take this approach because we
+  // do not want to depend on the addresses of the objects we are grouping.
+  for (unsigned i = 0, e = Ops.size(); i != e-2; ++i) {
+    const SCEV *S = Ops[i];
+    unsigned Complexity = S->getSCEVType();
+
+    // If there are any objects of the same complexity and same value as this
+    // one, group them.
+    for (unsigned j = i+1; j != e && Ops[j]->getSCEVType() == Complexity; ++j) {
+      if (Ops[j] == S) { // Found a duplicate.
+        // Move it to immediately after i'th element.
+        std::swap(Ops[i+1], Ops[j]);
+        ++i;   // no need to rescan it.
+        if (i == e-2) return;  // Done!
+      }
+    }
+  }
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//                      Simple SCEV method implementations
+//===----------------------------------------------------------------------===//
+
+/// BinomialCoefficient - Compute BC(It, K).  The result has width W.
+/// Assume, K > 0.
+static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
+                                       ScalarEvolution &SE,
+                                       Type *ResultTy) {
+  // Handle the simplest case efficiently.
+  if (K == 1)
+    return SE.getTruncateOrZeroExtend(It, ResultTy);
+
+  // We are using the following formula for BC(It, K):
+  //
+  //   BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / K!
+  //
+  // Suppose, W is the bitwidth of the return value.  We must be prepared for
+  // overflow.  Hence, we must assure that the result of our computation is
+  // equal to the accurate one modulo 2^W.  Unfortunately, division isn't
+  // safe in modular arithmetic.
+  //
+  // However, this code doesn't use exactly that formula; the formula it uses
+  // is something like the following, where T is the number of factors of 2 in
+  // K! (i.e. trailing zeros in the binary representation of K!), and ^ is
+  // exponentiation:
+  //
+  //   BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / 2^T / (K! / 2^T)
+  //
+  // This formula is trivially equivalent to the previous formula.  However,
+  // this formula can be implemented much more efficiently.  The trick is that
+  // K! / 2^T is odd, and exact division by an odd number *is* safe in modular
+  // arithmetic.  To do exact division in modular arithmetic, all we have
+  // to do is multiply by the inverse.  Therefore, this step can be done at
+  // width W.
+  //
+  // The next issue is how to safely do the division by 2^T.  The way this
+  // is done is by doing the multiplication step at a width of at least W + T
+  // bits.  This way, the bottom W+T bits of the product are accurate. Then,
+  // when we perform the division by 2^T (which is equivalent to a right shift
+  // by T), the bottom W bits are accurate.  Extra bits are okay; they'll get
+  // truncated out after the division by 2^T.
+  //
+  // In comparison to just directly using the first formula, this technique
+  // is much more efficient; using the first formula requires W * K bits,
+  // but this formula less than W + K bits. Also, the first formula requires
+  // a division step, whereas this formula only requires multiplies and shifts.
+  //
+  // It doesn't matter whether the subtraction step is done in the calculation
+  // width or the input iteration count's width; if the subtraction overflows,
+  // the result must be zero anyway.  We prefer here to do it in the width of
+  // the induction variable because it helps a lot for certain cases; CodeGen
+  // isn't smart enough to ignore the overflow, which leads to much less
+  // efficient code if the width of the subtraction is wider than the native
+  // register width.
+  //
+  // (It's possible to not widen at all by pulling out factors of 2 before
+  // the multiplication; for example, K=2 can be calculated as
+  // It/2*(It+(It*INT_MIN/INT_MIN)+-1). However, it requires
+  // extra arithmetic, so it's not an obvious win, and it gets
+  // much more complicated for K > 3.)
+
+  // Protection from insane SCEVs; this bound is conservative,
+  // but it probably doesn't matter.
+  if (K > 1000)
+    return SE.getCouldNotCompute();
+
+  unsigned W = SE.getTypeSizeInBits(ResultTy);
+
+  // Calculate K! / 2^T and T; we divide out the factors of two before
+  // multiplying for calculating K! / 2^T to avoid overflow.
+  // Other overflow doesn't matter because we only care about the bottom
+  // W bits of the result.
+  APInt OddFactorial(W, 1);
+  unsigned T = 1;
+  for (unsigned i = 3; i <= K; ++i) {
+    APInt Mult(W, i);
+    unsigned TwoFactors = Mult.countTrailingZeros();
+    T += TwoFactors;
+    Mult = Mult.lshr(TwoFactors);
+    OddFactorial *= Mult;
+  }
+
+  // We need at least W + T bits for the multiplication step
+  unsigned CalculationBits = W + T;
+
+  // Calculate 2^T, at width T+W.
+  APInt DivFactor = APInt(CalculationBits, 1).shl(T);
+
+  // Calculate the multiplicative inverse of K! / 2^T;
+  // this multiplication factor will perform the exact division by
+  // K! / 2^T.
+  APInt Mod = APInt::getSignedMinValue(W+1);
+  APInt MultiplyFactor = OddFactorial.zext(W+1);
+  MultiplyFactor = MultiplyFactor.multiplicativeInverse(Mod);
+  MultiplyFactor = MultiplyFactor.trunc(W);
+
+  // Calculate the product, at width T+W
+  IntegerType *CalculationTy = IntegerType::get(SE.getContext(),
+                                                      CalculationBits);
+  const SCEV *Dividend = SE.getTruncateOrZeroExtend(It, CalculationTy);
+  for (unsigned i = 1; i != K; ++i) {
+    const SCEV *S = SE.getMinusSCEV(It, SE.getConstant(It->getType(), i));
+    Dividend = SE.getMulExpr(Dividend,
+                             SE.getTruncateOrZeroExtend(S, CalculationTy));
+  }
+
+  // Divide by 2^T
+  const SCEV *DivResult = SE.getUDivExpr(Dividend, SE.getConstant(DivFactor));
+
+  // Truncate the result, and divide by K! / 2^T.
+
+  return SE.getMulExpr(SE.getConstant(MultiplyFactor),
+                       SE.getTruncateOrZeroExtend(DivResult, ResultTy));
+}
+
+/// evaluateAtIteration - Return the value of this chain of recurrences at
+/// the specified iteration number.  We can evaluate this recurrence by
+/// multiplying each element in the chain by the binomial coefficient
+/// corresponding to it.  In other words, we can evaluate {A,+,B,+,C,+,D} as:
+///
+///   A*BC(It, 0) + B*BC(It, 1) + C*BC(It, 2) + D*BC(It, 3)
+///
+/// where BC(It, k) stands for binomial coefficient.
+///
+const SCEV *SCEVAddRecExpr::evaluateAtIteration(const SCEV *It,
+                                                ScalarEvolution &SE) const {
+  const SCEV *Result = getStart();
+  for (unsigned i = 1, e = getNumOperands(); i != e; ++i) {
+    // The computation is correct in the face of overflow provided that the
+    // multiplication is performed _after_ the evaluation of the binomial
+    // coefficient.
+    const SCEV *Coeff = BinomialCoefficient(It, i, SE, getType());
+    if (isa<SCEVCouldNotCompute>(Coeff))
+      return Coeff;
+
+    Result = SE.getAddExpr(Result, SE.getMulExpr(getOperand(i), Coeff));
+  }
+  return Result;
+}
+
+//===----------------------------------------------------------------------===//
+//                    SCEV Expression folder implementations
+//===----------------------------------------------------------------------===//
+
+const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
+                                             Type *Ty) {
+  assert(getTypeSizeInBits(Op->getType()) > getTypeSizeInBits(Ty) &&
+         "This is not a truncating conversion!");
+  assert(isSCEVable(Ty) &&
+         "This is not a conversion to a SCEVable type!");
+  Ty = getEffectiveSCEVType(Ty);
+
+  FoldingSetNodeID ID;
+  ID.AddInteger(scTruncate);
+  ID.AddPointer(Op);
+  ID.AddPointer(Ty);
+  void *IP = 0;
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+
+  // Fold if the operand is constant.
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
+    return getConstant(
+      cast<ConstantInt>(ConstantExpr::getTrunc(SC->getValue(), Ty)));
+
+  // trunc(trunc(x)) --> trunc(x)
+  if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op))
+    return getTruncateExpr(ST->getOperand(), Ty);
+
+  // trunc(sext(x)) --> sext(x) if widening or trunc(x) if narrowing
+  if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op))
+    return getTruncateOrSignExtend(SS->getOperand(), Ty);
+
+  // trunc(zext(x)) --> zext(x) if widening or trunc(x) if narrowing
+  if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
+    return getTruncateOrZeroExtend(SZ->getOperand(), Ty);
+
+  // trunc(x1+x2+...+xN) --> trunc(x1)+trunc(x2)+...+trunc(xN) if we can
+  // eliminate all the truncates.
+  if (const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Op)) {
+    SmallVector<const SCEV *, 4> Operands;
+    bool hasTrunc = false;
+    for (unsigned i = 0, e = SA->getNumOperands(); i != e && !hasTrunc; ++i) {
+      const SCEV *S = getTruncateExpr(SA->getOperand(i), Ty);
+      hasTrunc = isa<SCEVTruncateExpr>(S);
+      Operands.push_back(S);
+    }
+    if (!hasTrunc)
+      return getAddExpr(Operands);
+    UniqueSCEVs.FindNodeOrInsertPos(ID, IP);  // Mutates IP, returns NULL.
+  }
+
+  // trunc(x1*x2*...*xN) --> trunc(x1)*trunc(x2)*...*trunc(xN) if we can
+  // eliminate all the truncates.
+  if (const SCEVMulExpr *SM = dyn_cast<SCEVMulExpr>(Op)) {
+    SmallVector<const SCEV *, 4> Operands;
+    bool hasTrunc = false;
+    for (unsigned i = 0, e = SM->getNumOperands(); i != e && !hasTrunc; ++i) {
+      const SCEV *S = getTruncateExpr(SM->getOperand(i), Ty);
+      hasTrunc = isa<SCEVTruncateExpr>(S);
+      Operands.push_back(S);
+    }
+    if (!hasTrunc)
+      return getMulExpr(Operands);
+    UniqueSCEVs.FindNodeOrInsertPos(ID, IP);  // Mutates IP, returns NULL.
+  }
+
+  // If the input value is a chrec scev, truncate the chrec's operands.
+  if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Op)) {
+    SmallVector<const SCEV *, 4> Operands;
+    for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i)
+      Operands.push_back(getTruncateExpr(AddRec->getOperand(i), Ty));
+    return getAddRecExpr(Operands, AddRec->getLoop(), SCEV::FlagAnyWrap);
+  }
+
+  // The cast wasn't folded; create an explicit cast node. We can reuse
+  // the existing insert position since if we get here, we won't have
+  // made any changes which would invalidate it.
+  SCEV *S = new (SCEVAllocator) SCEVTruncateExpr(ID.Intern(SCEVAllocator),
+                                                 Op, Ty);
+  UniqueSCEVs.InsertNode(S, IP);
+  return S;
+}
+
+const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
+                                               Type *Ty) {
+  assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
+         "This is not an extending conversion!");
+  assert(isSCEVable(Ty) &&
+         "This is not a conversion to a SCEVable type!");
+  Ty = getEffectiveSCEVType(Ty);
+
+  // Fold if the operand is constant.
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
+    return getConstant(
+      cast<ConstantInt>(ConstantExpr::getZExt(SC->getValue(), Ty)));
+
+  // zext(zext(x)) --> zext(x)
+  if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
+    return getZeroExtendExpr(SZ->getOperand(), Ty);
+
+  // Before doing any expensive analysis, check to see if we've already
+  // computed a SCEV for this Op and Ty.
+  FoldingSetNodeID ID;
+  ID.AddInteger(scZeroExtend);
+  ID.AddPointer(Op);
+  ID.AddPointer(Ty);
+  void *IP = 0;
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+
+  // zext(trunc(x)) --> zext(x) or x or trunc(x)
+  if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) {
+    // It's possible the bits taken off by the truncate were all zero bits. If
+    // so, we should be able to simplify this further.
+    const SCEV *X = ST->getOperand();
+    ConstantRange CR = getUnsignedRange(X);
+    unsigned TruncBits = getTypeSizeInBits(ST->getType());
+    unsigned NewBits = getTypeSizeInBits(Ty);
+    if (CR.truncate(TruncBits).zeroExtend(NewBits).contains(
+            CR.zextOrTrunc(NewBits)))
+      return getTruncateOrZeroExtend(X, Ty);
+  }
+
+  // If the input value is a chrec scev, and we can prove that the value
+  // did not overflow the old, smaller, value, we can zero extend all of the
+  // operands (often constants).  This allows analysis of something like
+  // this:  for (unsigned char X = 0; X < 100; ++X) { int Y = X; }
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Op))
+    if (AR->isAffine()) {
+      const SCEV *Start = AR->getStart();
+      const SCEV *Step = AR->getStepRecurrence(*this);
+      unsigned BitWidth = getTypeSizeInBits(AR->getType());
+      const Loop *L = AR->getLoop();
+
+      // If we have special knowledge that this addrec won't overflow,
+      // we don't need to do any further analysis.
+      if (AR->getNoWrapFlags(SCEV::FlagNUW))
+        return getAddRecExpr(getZeroExtendExpr(Start, Ty),
+                             getZeroExtendExpr(Step, Ty),
+                             L, AR->getNoWrapFlags());
+
+      // Check whether the backedge-taken count is SCEVCouldNotCompute.
+      // Note that this serves two purposes: It filters out loops that are
+      // simply not analyzable, and it covers the case where this code is
+      // being called from within backedge-taken count analysis, such that
+      // attempting to ask for the backedge-taken count would likely result
+      // in infinite recursion. In the later case, the analysis code will
+      // cope with a conservative value, and it will take care to purge
+      // that value once it has finished.
+      const SCEV *MaxBECount = getMaxBackedgeTakenCount(L);
+      if (!isa<SCEVCouldNotCompute>(MaxBECount)) {
+        // Manually compute the final value for AR, checking for
+        // overflow.
+
+        // Check whether the backedge-taken count can be losslessly casted to
+        // the addrec's type. The count is always unsigned.
+        const SCEV *CastedMaxBECount =
+          getTruncateOrZeroExtend(MaxBECount, Start->getType());
+        const SCEV *RecastedMaxBECount =
+          getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
+        if (MaxBECount == RecastedMaxBECount) {
+          Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
+          // Check whether Start+Step*MaxBECount has no unsigned overflow.
+          const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step);
+          const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul), WideTy);
+          const SCEV *WideStart = getZeroExtendExpr(Start, WideTy);
+          const SCEV *WideMaxBECount =
+            getZeroExtendExpr(CastedMaxBECount, WideTy);
+          const SCEV *OperandExtendedAdd =
+            getAddExpr(WideStart,
+                       getMulExpr(WideMaxBECount,
+                                  getZeroExtendExpr(Step, WideTy)));
+          if (ZAdd == OperandExtendedAdd) {
+            // Cache knowledge of AR NUW, which is propagated to this AddRec.
+            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
+            // Return the expression with the addrec on the outside.
+            return getAddRecExpr(getZeroExtendExpr(Start, Ty),
+                                 getZeroExtendExpr(Step, Ty),
+                                 L, AR->getNoWrapFlags());
+          }
+          // Similar to above, only this time treat the step value as signed.
+          // This covers loops that count down.
+          OperandExtendedAdd =
+            getAddExpr(WideStart,
+                       getMulExpr(WideMaxBECount,
+                                  getSignExtendExpr(Step, WideTy)));
+          if (ZAdd == OperandExtendedAdd) {
+            // Cache knowledge of AR NW, which is propagated to this AddRec.
+            // Negative step causes unsigned wrap, but it still can't self-wrap.
+            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
+            // Return the expression with the addrec on the outside.
+            return getAddRecExpr(getZeroExtendExpr(Start, Ty),
+                                 getSignExtendExpr(Step, Ty),
+                                 L, AR->getNoWrapFlags());
+          }
+        }
+
+        // If the backedge is guarded by a comparison with the pre-inc value
+        // the addrec is safe. Also, if the entry is guarded by a comparison
+        // with the start value and the backedge is guarded by a comparison
+        // with the post-inc value, the addrec is safe.
+        if (isKnownPositive(Step)) {
+          const SCEV *N = getConstant(APInt::getMinValue(BitWidth) -
+                                      getUnsignedRange(Step).getUnsignedMax());
+          if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, AR, N) ||
+              (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_ULT, Start, N) &&
+               isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT,
+                                           AR->getPostIncExpr(*this), N))) {
+            // Cache knowledge of AR NUW, which is propagated to this AddRec.
+            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
+            // Return the expression with the addrec on the outside.
+            return getAddRecExpr(getZeroExtendExpr(Start, Ty),
+                                 getZeroExtendExpr(Step, Ty),
+                                 L, AR->getNoWrapFlags());
+          }
+        } else if (isKnownNegative(Step)) {
+          const SCEV *N = getConstant(APInt::getMaxValue(BitWidth) -
+                                      getSignedRange(Step).getSignedMin());
+          if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT, AR, N) ||
+              (isLoopEntryGuardedByCond(L, ICmpInst::ICMP_UGT, Start, N) &&
+               isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT,
+                                           AR->getPostIncExpr(*this), N))) {
+            // Cache knowledge of AR NW, which is propagated to this AddRec.
+            // Negative step causes unsigned wrap, but it still can't self-wrap.
+            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
+            // Return the expression with the addrec on the outside.
+            return getAddRecExpr(getZeroExtendExpr(Start, Ty),
+                                 getSignExtendExpr(Step, Ty),
+                                 L, AR->getNoWrapFlags());
+          }
+        }
+      }
+    }
+
+  // The cast wasn't folded; create an explicit cast node.
+  // Recompute the insert position, as it may have been invalidated.
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+  SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator),
+                                                   Op, Ty);
+  UniqueSCEVs.InsertNode(S, IP);
+  return S;
+}
+
+// Get the limit of a recurrence such that incrementing by Step cannot cause
+// signed overflow as long as the value of the recurrence within the loop does
+// not exceed this limit before incrementing.
+static const SCEV *getOverflowLimitForStep(const SCEV *Step,
+                                           ICmpInst::Predicate *Pred,
+                                           ScalarEvolution *SE) {
+  unsigned BitWidth = SE->getTypeSizeInBits(Step->getType());
+  if (SE->isKnownPositive(Step)) {
+    *Pred = ICmpInst::ICMP_SLT;
+    return SE->getConstant(APInt::getSignedMinValue(BitWidth) -
+                           SE->getSignedRange(Step).getSignedMax());
+  }
+  if (SE->isKnownNegative(Step)) {
+    *Pred = ICmpInst::ICMP_SGT;
+    return SE->getConstant(APInt::getSignedMaxValue(BitWidth) -
+                       SE->getSignedRange(Step).getSignedMin());
+  }
+  return 0;
+}
+
+// The recurrence AR has been shown to have no signed wrap. Typically, if we can
+// prove NSW for AR, then we can just as easily prove NSW for its preincrement
+// or postincrement sibling. This allows normalizing a sign extended AddRec as
+// such: {sext(Step + Start),+,Step} => {(Step + sext(Start),+,Step} As a
+// result, the expression "Step + sext(PreIncAR)" is congruent with
+// "sext(PostIncAR)"
+static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
+                                            Type *Ty,
+                                            ScalarEvolution *SE) {
+  const Loop *L = AR->getLoop();
+  const SCEV *Start = AR->getStart();
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+
+  // Check for a simple looking step prior to loop entry.
+  const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Start);
+  if (!SA)
+    return 0;
+
+  // Create an AddExpr for "PreStart" after subtracting Step. Full SCEV
+  // subtraction is expensive. For this purpose, perform a quick and dirty
+  // difference, by checking for Step in the operand list.
+  SmallVector<const SCEV *, 4> DiffOps;
+  for (SCEVAddExpr::op_iterator I = SA->op_begin(), E = SA->op_end();
+       I != E; ++I) {
+    if (*I != Step)
+      DiffOps.push_back(*I);
+  }
+  if (DiffOps.size() == SA->getNumOperands())
+    return 0;
+
+  // This is a postinc AR. Check for overflow on the preinc recurrence using the
+  // same three conditions that getSignExtendedExpr checks.
+
+  // 1. NSW flags on the step increment.
+  const SCEV *PreStart = SE->getAddExpr(DiffOps, SA->getNoWrapFlags());
+  const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>(
+    SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap));
+
+  if (PreAR && PreAR->getNoWrapFlags(SCEV::FlagNSW))
+    return PreStart;
+
+  // 2. Direct overflow check on the step operation's expression.
+  unsigned BitWidth = SE->getTypeSizeInBits(AR->getType());
+  Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2);
+  const SCEV *OperandExtendedStart =
+    SE->getAddExpr(SE->getSignExtendExpr(PreStart, WideTy),
+                   SE->getSignExtendExpr(Step, WideTy));
+  if (SE->getSignExtendExpr(Start, WideTy) == OperandExtendedStart) {
+    // Cache knowledge of PreAR NSW.
+    if (PreAR)
+      const_cast<SCEVAddRecExpr *>(PreAR)->setNoWrapFlags(SCEV::FlagNSW);
+    // FIXME: this optimization needs a unit test
+    DEBUG(dbgs() << "SCEV: untested prestart overflow check\n");
+    return PreStart;
+  }
+
+  // 3. Loop precondition.
+  ICmpInst::Predicate Pred;
+  const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, SE);
+
+  if (OverflowLimit &&
+      SE->isLoopEntryGuardedByCond(L, Pred, PreStart, OverflowLimit)) {
+    return PreStart;
+  }
+  return 0;
+}
+
+// Get the normalized sign-extended expression for this AddRec's Start.
+static const SCEV *getSignExtendAddRecStart(const SCEVAddRecExpr *AR,
+                                            Type *Ty,
+                                            ScalarEvolution *SE) {
+  const SCEV *PreStart = getPreStartForSignExtend(AR, Ty, SE);
+  if (!PreStart)
+    return SE->getSignExtendExpr(AR->getStart(), Ty);
+
+  return SE->getAddExpr(SE->getSignExtendExpr(AR->getStepRecurrence(*SE), Ty),
+                        SE->getSignExtendExpr(PreStart, Ty));
+}
+
+const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
+                                               Type *Ty) {
+  assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
+         "This is not an extending conversion!");
+  assert(isSCEVable(Ty) &&
+         "This is not a conversion to a SCEVable type!");
+  Ty = getEffectiveSCEVType(Ty);
+
+  // Fold if the operand is constant.
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
+    return getConstant(
+      cast<ConstantInt>(ConstantExpr::getSExt(SC->getValue(), Ty)));
+
+  // sext(sext(x)) --> sext(x)
+  if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op))
+    return getSignExtendExpr(SS->getOperand(), Ty);
+
+  // sext(zext(x)) --> zext(x)
+  if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
+    return getZeroExtendExpr(SZ->getOperand(), Ty);
+
+  // Before doing any expensive analysis, check to see if we've already
+  // computed a SCEV for this Op and Ty.
+  FoldingSetNodeID ID;
+  ID.AddInteger(scSignExtend);
+  ID.AddPointer(Op);
+  ID.AddPointer(Ty);
+  void *IP = 0;
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+
+  // If the input value is provably positive, build a zext instead.
+  if (isKnownNonNegative(Op))
+    return getZeroExtendExpr(Op, Ty);
+
+  // sext(trunc(x)) --> sext(x) or x or trunc(x)
+  if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) {
+    // It's possible the bits taken off by the truncate were all sign bits. If
+    // so, we should be able to simplify this further.
+    const SCEV *X = ST->getOperand();
+    ConstantRange CR = getSignedRange(X);
+    unsigned TruncBits = getTypeSizeInBits(ST->getType());
+    unsigned NewBits = getTypeSizeInBits(Ty);
+    if (CR.truncate(TruncBits).signExtend(NewBits).contains(
+            CR.sextOrTrunc(NewBits)))
+      return getTruncateOrSignExtend(X, Ty);
+  }
+
+  // If the input value is a chrec scev, and we can prove that the value
+  // did not overflow the old, smaller, value, we can sign extend all of the
+  // operands (often constants).  This allows analysis of something like
+  // this:  for (signed char X = 0; X < 100; ++X) { int Y = X; }
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Op))
+    if (AR->isAffine()) {
+      const SCEV *Start = AR->getStart();
+      const SCEV *Step = AR->getStepRecurrence(*this);
+      unsigned BitWidth = getTypeSizeInBits(AR->getType());
+      const Loop *L = AR->getLoop();
+
+      // If we have special knowledge that this addrec won't overflow,
+      // we don't need to do any further analysis.
+      if (AR->getNoWrapFlags(SCEV::FlagNSW))
+        return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
+                             getSignExtendExpr(Step, Ty),
+                             L, SCEV::FlagNSW);
+
+      // Check whether the backedge-taken count is SCEVCouldNotCompute.
+      // Note that this serves two purposes: It filters out loops that are
+      // simply not analyzable, and it covers the case where this code is
+      // being called from within backedge-taken count analysis, such that
+      // attempting to ask for the backedge-taken count would likely result
+      // in infinite recursion. In the later case, the analysis code will
+      // cope with a conservative value, and it will take care to purge
+      // that value once it has finished.
+      const SCEV *MaxBECount = getMaxBackedgeTakenCount(L);
+      if (!isa<SCEVCouldNotCompute>(MaxBECount)) {
+        // Manually compute the final value for AR, checking for
+        // overflow.
+
+        // Check whether the backedge-taken count can be losslessly casted to
+        // the addrec's type. The count is always unsigned.
+        const SCEV *CastedMaxBECount =
+          getTruncateOrZeroExtend(MaxBECount, Start->getType());
+        const SCEV *RecastedMaxBECount =
+          getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
+        if (MaxBECount == RecastedMaxBECount) {
+          Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
+          // Check whether Start+Step*MaxBECount has no signed overflow.
+          const SCEV *SMul = getMulExpr(CastedMaxBECount, Step);
+          const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul), WideTy);
+          const SCEV *WideStart = getSignExtendExpr(Start, WideTy);
+          const SCEV *WideMaxBECount =
+            getZeroExtendExpr(CastedMaxBECount, WideTy);
+          const SCEV *OperandExtendedAdd =
+            getAddExpr(WideStart,
+                       getMulExpr(WideMaxBECount,
+                                  getSignExtendExpr(Step, WideTy)));
+          if (SAdd == OperandExtendedAdd) {
+            // Cache knowledge of AR NSW, which is propagated to this AddRec.
+            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
+            // Return the expression with the addrec on the outside.
+            return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
+                                 getSignExtendExpr(Step, Ty),
+                                 L, AR->getNoWrapFlags());
+          }
+          // Similar to above, only this time treat the step value as unsigned.
+          // This covers loops that count up with an unsigned step.
+          OperandExtendedAdd =
+            getAddExpr(WideStart,
+                       getMulExpr(WideMaxBECount,
+                                  getZeroExtendExpr(Step, WideTy)));
+          if (SAdd == OperandExtendedAdd) {
+            // Cache knowledge of AR NSW, which is propagated to this AddRec.
+            const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
+            // Return the expression with the addrec on the outside.
+            return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
+                                 getZeroExtendExpr(Step, Ty),
+                                 L, AR->getNoWrapFlags());
+          }
+        }
+
+        // If the backedge is guarded by a comparison with the pre-inc value
+        // the addrec is safe. Also, if the entry is guarded by a comparison
+        // with the start value and the backedge is guarded by a comparison
+        // with the post-inc value, the addrec is safe.
+        ICmpInst::Predicate Pred;
+        const SCEV *OverflowLimit = getOverflowLimitForStep(Step, &Pred, this);
+        if (OverflowLimit &&
+            (isLoopBackedgeGuardedByCond(L, Pred, AR, OverflowLimit) ||
+             (isLoopEntryGuardedByCond(L, Pred, Start, OverflowLimit) &&
+              isLoopBackedgeGuardedByCond(L, Pred, AR->getPostIncExpr(*this),
+                                          OverflowLimit)))) {
+          // Cache knowledge of AR NSW, then propagate NSW to the wide AddRec.
+          const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
+          return getAddRecExpr(getSignExtendAddRecStart(AR, Ty, this),
+                               getSignExtendExpr(Step, Ty),
+                               L, AR->getNoWrapFlags());
+        }
+      }
+    }
+
+  // The cast wasn't folded; create an explicit cast node.
+  // Recompute the insert position, as it may have been invalidated.
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+  SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator),
+                                                   Op, Ty);
+  UniqueSCEVs.InsertNode(S, IP);
+  return S;
+}
+
+/// getAnyExtendExpr - Return a SCEV for the given operand extended with
+/// unspecified bits out to the given type.
+///
+const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op,
+                                              Type *Ty) {
+  assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
+         "This is not an extending conversion!");
+  assert(isSCEVable(Ty) &&
+         "This is not a conversion to a SCEVable type!");
+  Ty = getEffectiveSCEVType(Ty);
+
+  // Sign-extend negative constants.
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Op))
+    if (SC->getValue()->getValue().isNegative())
+      return getSignExtendExpr(Op, Ty);
+
+  // Peel off a truncate cast.
+  if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(Op)) {
+    const SCEV *NewOp = T->getOperand();
+    if (getTypeSizeInBits(NewOp->getType()) < getTypeSizeInBits(Ty))
+      return getAnyExtendExpr(NewOp, Ty);
+    return getTruncateOrNoop(NewOp, Ty);
+  }
+
+  // Next try a zext cast. If the cast is folded, use it.
+  const SCEV *ZExt = getZeroExtendExpr(Op, Ty);
+  if (!isa<SCEVZeroExtendExpr>(ZExt))
+    return ZExt;
+
+  // Next try a sext cast. If the cast is folded, use it.
+  const SCEV *SExt = getSignExtendExpr(Op, Ty);
+  if (!isa<SCEVSignExtendExpr>(SExt))
+    return SExt;
+
+  // Force the cast to be folded into the operands of an addrec.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Op)) {
+    SmallVector<const SCEV *, 4> Ops;
+    for (SCEVAddRecExpr::op_iterator I = AR->op_begin(), E = AR->op_end();
+         I != E; ++I)
+      Ops.push_back(getAnyExtendExpr(*I, Ty));
+    return getAddRecExpr(Ops, AR->getLoop(), SCEV::FlagNW);
+  }
+
+  // If the expression is obviously signed, use the sext cast value.
+  if (isa<SCEVSMaxExpr>(Op))
+    return SExt;
+
+  // Absent any other information, use the zext cast value.
+  return ZExt;
+}
+
+/// CollectAddOperandsWithScales - Process the given Ops list, which is
+/// a list of operands to be added under the given scale, update the given
+/// map. This is a helper function for getAddRecExpr. As an example of
+/// what it does, given a sequence of operands that would form an add
+/// expression like this:
+///
+///    m + n + 13 + (A * (o + p + (B * q + m + 29))) + r + (-1 * r)
+///
+/// where A and B are constants, update the map with these values:
+///
+///    (m, 1+A*B), (n, 1), (o, A), (p, A), (q, A*B), (r, 0)
+///
+/// and add 13 + A*B*29 to AccumulatedConstant.
+/// This will allow getAddRecExpr to produce this:
+///
+///    13+A*B*29 + n + (m * (1+A*B)) + ((o + p) * A) + (q * A*B)
+///
+/// This form often exposes folding opportunities that are hidden in
+/// the original operand list.
+///
+/// Return true iff it appears that any interesting folding opportunities
+/// may be exposed. This helps getAddRecExpr short-circuit extra work in
+/// the common case where no interesting opportunities are present, and
+/// is also used as a check to avoid infinite recursion.
+///
+static bool
+CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M,
+                             SmallVector<const SCEV *, 8> &NewOps,
+                             APInt &AccumulatedConstant,
+                             const SCEV *const *Ops, size_t NumOperands,
+                             const APInt &Scale,
+                             ScalarEvolution &SE) {
+  bool Interesting = false;
+
+  // Iterate over the add operands. They are sorted, with constants first.
+  unsigned i = 0;
+  while (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[i])) {
+    ++i;
+    // Pull a buried constant out to the outside.
+    if (Scale != 1 || AccumulatedConstant != 0 || C->getValue()->isZero())
+      Interesting = true;
+    AccumulatedConstant += Scale * C->getValue()->getValue();
+  }
+
+  // Next comes everything else. We're especially interested in multiplies
+  // here, but they're in the middle, so just visit the rest with one loop.
+  for (; i != NumOperands; ++i) {
+    const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[i]);
+    if (Mul && isa<SCEVConstant>(Mul->getOperand(0))) {
+      APInt NewScale =
+        Scale * cast<SCEVConstant>(Mul->getOperand(0))->getValue()->getValue();
+      if (Mul->getNumOperands() == 2 && isa<SCEVAddExpr>(Mul->getOperand(1))) {
+        // A multiplication of a constant with another add; recurse.
+        const SCEVAddExpr *Add = cast<SCEVAddExpr>(Mul->getOperand(1));
+        Interesting |=
+          CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant,
+                                       Add->op_begin(), Add->getNumOperands(),
+                                       NewScale, SE);
+      } else {
+        // A multiplication of a constant with some other value. Update
+        // the map.
+        SmallVector<const SCEV *, 4> MulOps(Mul->op_begin()+1, Mul->op_end());
+        const SCEV *Key = SE.getMulExpr(MulOps);
+        std::pair<DenseMap<const SCEV *, APInt>::iterator, bool> Pair =
+          M.insert(std::make_pair(Key, NewScale));
+        if (Pair.second) {
+          NewOps.push_back(Pair.first->first);
+        } else {
+          Pair.first->second += NewScale;
+          // The map already had an entry for this value, which may indicate
+          // a folding opportunity.
+          Interesting = true;
+        }
+      }
+    } else {
+      // An ordinary operand. Update the map.
+      std::pair<DenseMap<const SCEV *, APInt>::iterator, bool> Pair =
+        M.insert(std::make_pair(Ops[i], Scale));
+      if (Pair.second) {
+        NewOps.push_back(Pair.first->first);
+      } else {
+        Pair.first->second += Scale;
+        // The map already had an entry for this value, which may indicate
+        // a folding opportunity.
+        Interesting = true;
+      }
+    }
+  }
+
+  return Interesting;
+}
+
+namespace {
+  struct APIntCompare {
+    bool operator()(const APInt &LHS, const APInt &RHS) const {
+      return LHS.ult(RHS);
+    }
+  };
+}
+
+/// getAddExpr - Get a canonical add expression, or something simpler if
+/// possible.
+const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
+                                        SCEV::NoWrapFlags Flags) {
+  assert(!(Flags & ~(SCEV::FlagNUW | SCEV::FlagNSW)) &&
+         "only nuw or nsw allowed");
+  assert(!Ops.empty() && "Cannot get empty add!");
+  if (Ops.size() == 1) return Ops[0];
+#ifndef NDEBUG
+  Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+  for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+    assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
+           "SCEVAddExpr operand types don't match!");
+#endif
+
+  // If FlagNSW is true and all the operands are non-negative, infer FlagNUW.
+  // And vice-versa.
+  int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW;
+  SCEV::NoWrapFlags SignOrUnsignWrap = maskFlags(Flags, SignOrUnsignMask);
+  if (SignOrUnsignWrap && (SignOrUnsignWrap != SignOrUnsignMask)) {
+    bool All = true;
+    for (SmallVectorImpl<const SCEV *>::const_iterator I = Ops.begin(),
+         E = Ops.end(); I != E; ++I)
+      if (!isKnownNonNegative(*I)) {
+        All = false;
+        break;
+      }
+    if (All) Flags = setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask);
+  }
+
+  // Sort by complexity, this groups all similar expression types together.
+  GroupByComplexity(Ops, LI);
+
+  // If there are any constants, fold them together.
+  unsigned Idx = 0;
+  if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+    ++Idx;
+    assert(Idx < Ops.size());
+    while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+      // We found two constants, fold them together!
+      Ops[0] = getConstant(LHSC->getValue()->getValue() +
+                           RHSC->getValue()->getValue());
+      if (Ops.size() == 2) return Ops[0];
+      Ops.erase(Ops.begin()+1);  // Erase the folded element
+      LHSC = cast<SCEVConstant>(Ops[0]);
+    }
+
+    // If we are left with a constant zero being added, strip it off.
+    if (LHSC->getValue()->isZero()) {
+      Ops.erase(Ops.begin());
+      --Idx;
+    }
+
+    if (Ops.size() == 1) return Ops[0];
+  }
+
+  // Okay, check to see if the same value occurs in the operand list more than
+  // once.  If so, merge them together into an multiply expression.  Since we
+  // sorted the list, these values are required to be adjacent.
+  Type *Ty = Ops[0]->getType();
+  bool FoundMatch = false;
+  for (unsigned i = 0, e = Ops.size(); i != e-1; ++i)
+    if (Ops[i] == Ops[i+1]) {      //  X + Y + Y  -->  X + Y*2
+      // Scan ahead to count how many equal operands there are.
+      unsigned Count = 2;
+      while (i+Count != e && Ops[i+Count] == Ops[i])
+        ++Count;
+      // Merge the values into a multiply.
+      const SCEV *Scale = getConstant(Ty, Count);
+      const SCEV *Mul = getMulExpr(Scale, Ops[i]);
+      if (Ops.size() == Count)
+        return Mul;
+      Ops[i] = Mul;
+      Ops.erase(Ops.begin()+i+1, Ops.begin()+i+Count);
+      --i; e -= Count - 1;
+      FoundMatch = true;
+    }
+  if (FoundMatch)
+    return getAddExpr(Ops, Flags);
+
+  // Check for truncates. If all the operands are truncated from the same
+  // type, see if factoring out the truncate would permit the result to be
+  // folded. eg., trunc(x) + m*trunc(n) --> trunc(x + trunc(m)*n)
+  // if the contents of the resulting outer trunc fold to something simple.
+  for (; Idx < Ops.size() && isa<SCEVTruncateExpr>(Ops[Idx]); ++Idx) {
+    const SCEVTruncateExpr *Trunc = cast<SCEVTruncateExpr>(Ops[Idx]);
+    Type *DstType = Trunc->getType();
+    Type *SrcType = Trunc->getOperand()->getType();
+    SmallVector<const SCEV *, 8> LargeOps;
+    bool Ok = true;
+    // Check all the operands to see if they can be represented in the
+    // source type of the truncate.
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(Ops[i])) {
+        if (T->getOperand()->getType() != SrcType) {
+          Ok = false;
+          break;
+        }
+        LargeOps.push_back(T->getOperand());
+      } else if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[i])) {
+        LargeOps.push_back(getAnyExtendExpr(C, SrcType));
+      } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(Ops[i])) {
+        SmallVector<const SCEV *, 8> LargeMulOps;
+        for (unsigned j = 0, f = M->getNumOperands(); j != f && Ok; ++j) {
+          if (const SCEVTruncateExpr *T =
+                dyn_cast<SCEVTruncateExpr>(M->getOperand(j))) {
+            if (T->getOperand()->getType() != SrcType) {
+              Ok = false;
+              break;
+            }
+            LargeMulOps.push_back(T->getOperand());
+          } else if (const SCEVConstant *C =
+                       dyn_cast<SCEVConstant>(M->getOperand(j))) {
+            LargeMulOps.push_back(getAnyExtendExpr(C, SrcType));
+          } else {
+            Ok = false;
+            break;
+          }
+        }
+        if (Ok)
+          LargeOps.push_back(getMulExpr(LargeMulOps));
+      } else {
+        Ok = false;
+        break;
+      }
+    }
+    if (Ok) {
+      // Evaluate the expression in the larger type.
+      const SCEV *Fold = getAddExpr(LargeOps, Flags);
+      // If it folds to something simple, use it. Otherwise, don't.
+      if (isa<SCEVConstant>(Fold) || isa<SCEVUnknown>(Fold))
+        return getTruncateExpr(Fold, DstType);
+    }
+  }
+
+  // Skip past any other cast SCEVs.
+  while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr)
+    ++Idx;
+
+  // If there are add operands they would be next.
+  if (Idx < Ops.size()) {
+    bool DeletedAdd = false;
+    while (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[Idx])) {
+      // If we have an add, expand the add operands onto the end of the operands
+      // list.
+      Ops.erase(Ops.begin()+Idx);
+      Ops.append(Add->op_begin(), Add->op_end());
+      DeletedAdd = true;
+    }
+
+    // If we deleted at least one add, we added operands to the end of the list,
+    // and they are not necessarily sorted.  Recurse to resort and resimplify
+    // any operands we just acquired.
+    if (DeletedAdd)
+      return getAddExpr(Ops);
+  }
+
+  // Skip over the add expression until we get to a multiply.
+  while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr)
+    ++Idx;
+
+  // Check to see if there are any folding opportunities present with
+  // operands multiplied by constant values.
+  if (Idx < Ops.size() && isa<SCEVMulExpr>(Ops[Idx])) {
+    uint64_t BitWidth = getTypeSizeInBits(Ty);
+    DenseMap<const SCEV *, APInt> M;
+    SmallVector<const SCEV *, 8> NewOps;
+    APInt AccumulatedConstant(BitWidth, 0);
+    if (CollectAddOperandsWithScales(M, NewOps, AccumulatedConstant,
+                                     Ops.data(), Ops.size(),
+                                     APInt(BitWidth, 1), *this)) {
+      // Some interesting folding opportunity is present, so its worthwhile to
+      // re-generate the operands list. Group the operands by constant scale,
+      // to avoid multiplying by the same constant scale multiple times.
+      std::map<APInt, SmallVector<const SCEV *, 4>, APIntCompare> MulOpLists;
+      for (SmallVector<const SCEV *, 8>::const_iterator I = NewOps.begin(),
+           E = NewOps.end(); I != E; ++I)
+        MulOpLists[M.find(*I)->second].push_back(*I);
+      // Re-generate the operands list.
+      Ops.clear();
+      if (AccumulatedConstant != 0)
+        Ops.push_back(getConstant(AccumulatedConstant));
+      for (std::map<APInt, SmallVector<const SCEV *, 4>, APIntCompare>::iterator
+           I = MulOpLists.begin(), E = MulOpLists.end(); I != E; ++I)
+        if (I->first != 0)
+          Ops.push_back(getMulExpr(getConstant(I->first),
+                                   getAddExpr(I->second)));
+      if (Ops.empty())
+        return getConstant(Ty, 0);
+      if (Ops.size() == 1)
+        return Ops[0];
+      return getAddExpr(Ops);
+    }
+  }
+
+  // If we are adding something to a multiply expression, make sure the
+  // something is not already an operand of the multiply.  If so, merge it into
+  // the multiply.
+  for (; Idx < Ops.size() && isa<SCEVMulExpr>(Ops[Idx]); ++Idx) {
+    const SCEVMulExpr *Mul = cast<SCEVMulExpr>(Ops[Idx]);
+    for (unsigned MulOp = 0, e = Mul->getNumOperands(); MulOp != e; ++MulOp) {
+      const SCEV *MulOpSCEV = Mul->getOperand(MulOp);
+      if (isa<SCEVConstant>(MulOpSCEV))
+        continue;
+      for (unsigned AddOp = 0, e = Ops.size(); AddOp != e; ++AddOp)
+        if (MulOpSCEV == Ops[AddOp]) {
+          // Fold W + X + (X * Y * Z)  -->  W + (X * ((Y*Z)+1))
+          const SCEV *InnerMul = Mul->getOperand(MulOp == 0);
+          if (Mul->getNumOperands() != 2) {
+            // If the multiply has more than two operands, we must get the
+            // Y*Z term.
+            SmallVector<const SCEV *, 4> MulOps(Mul->op_begin(),
+                                                Mul->op_begin()+MulOp);
+            MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end());
+            InnerMul = getMulExpr(MulOps);
+          }
+          const SCEV *One = getConstant(Ty, 1);
+          const SCEV *AddOne = getAddExpr(One, InnerMul);
+          const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV);
+          if (Ops.size() == 2) return OuterMul;
+          if (AddOp < Idx) {
+            Ops.erase(Ops.begin()+AddOp);
+            Ops.erase(Ops.begin()+Idx-1);
+          } else {
+            Ops.erase(Ops.begin()+Idx);
+            Ops.erase(Ops.begin()+AddOp-1);
+          }
+          Ops.push_back(OuterMul);
+          return getAddExpr(Ops);
+        }
+
+      // Check this multiply against other multiplies being added together.
+      for (unsigned OtherMulIdx = Idx+1;
+           OtherMulIdx < Ops.size() && isa<SCEVMulExpr>(Ops[OtherMulIdx]);
+           ++OtherMulIdx) {
+        const SCEVMulExpr *OtherMul = cast<SCEVMulExpr>(Ops[OtherMulIdx]);
+        // If MulOp occurs in OtherMul, we can fold the two multiplies
+        // together.
+        for (unsigned OMulOp = 0, e = OtherMul->getNumOperands();
+             OMulOp != e; ++OMulOp)
+          if (OtherMul->getOperand(OMulOp) == MulOpSCEV) {
+            // Fold X + (A*B*C) + (A*D*E) --> X + (A*(B*C+D*E))
+            const SCEV *InnerMul1 = Mul->getOperand(MulOp == 0);
+            if (Mul->getNumOperands() != 2) {
+              SmallVector<const SCEV *, 4> MulOps(Mul->op_begin(),
+                                                  Mul->op_begin()+MulOp);
+              MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end());
+              InnerMul1 = getMulExpr(MulOps);
+            }
+            const SCEV *InnerMul2 = OtherMul->getOperand(OMulOp == 0);
+            if (OtherMul->getNumOperands() != 2) {
+              SmallVector<const SCEV *, 4> MulOps(OtherMul->op_begin(),
+                                                  OtherMul->op_begin()+OMulOp);
+              MulOps.append(OtherMul->op_begin()+OMulOp+1, OtherMul->op_end());
+              InnerMul2 = getMulExpr(MulOps);
+            }
+            const SCEV *InnerMulSum = getAddExpr(InnerMul1,InnerMul2);
+            const SCEV *OuterMul = getMulExpr(MulOpSCEV, InnerMulSum);
+            if (Ops.size() == 2) return OuterMul;
+            Ops.erase(Ops.begin()+Idx);
+            Ops.erase(Ops.begin()+OtherMulIdx-1);
+            Ops.push_back(OuterMul);
+            return getAddExpr(Ops);
+          }
+      }
+    }
+  }
+
+  // If there are any add recurrences in the operands list, see if any other
+  // added values are loop invariant.  If so, we can fold them into the
+  // recurrence.
+  while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr)
+    ++Idx;
+
+  // Scan over all recurrences, trying to fold loop invariants into them.
+  for (; Idx < Ops.size() && isa<SCEVAddRecExpr>(Ops[Idx]); ++Idx) {
+    // Scan all of the other operands to this add and add them to the vector if
+    // they are loop invariant w.r.t. the recurrence.
+    SmallVector<const SCEV *, 8> LIOps;
+    const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
+    const Loop *AddRecLoop = AddRec->getLoop();
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+      if (isLoopInvariant(Ops[i], AddRecLoop)) {
+        LIOps.push_back(Ops[i]);
+        Ops.erase(Ops.begin()+i);
+        --i; --e;
+      }
+
+    // If we found some loop invariants, fold them into the recurrence.
+    if (!LIOps.empty()) {
+      //  NLI + LI + {Start,+,Step}  -->  NLI + {LI+Start,+,Step}
+      LIOps.push_back(AddRec->getStart());
+
+      SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
+                                             AddRec->op_end());
+      AddRecOps[0] = getAddExpr(LIOps);
+
+      // Build the new addrec. Propagate the NUW and NSW flags if both the
+      // outer add and the inner addrec are guaranteed to have no overflow.
+      // Always propagate NW.
+      Flags = AddRec->getNoWrapFlags(setFlags(Flags, SCEV::FlagNW));
+      const SCEV *NewRec = getAddRecExpr(AddRecOps, AddRecLoop, Flags);
+
+      // If all of the other operands were loop invariant, we are done.
+      if (Ops.size() == 1) return NewRec;
+
+      // Otherwise, add the folded AddRec by the non-invariant parts.
+      for (unsigned i = 0;; ++i)
+        if (Ops[i] == AddRec) {
+          Ops[i] = NewRec;
+          break;
+        }
+      return getAddExpr(Ops);
+    }
+
+    // Okay, if there weren't any loop invariants to be folded, check to see if
+    // there are multiple AddRec's with the same loop induction variable being
+    // added together.  If so, we can fold them.
+    for (unsigned OtherIdx = Idx+1;
+         OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
+         ++OtherIdx)
+      if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) {
+        // Other + {A,+,B}<L> + {C,+,D}<L>  -->  Other + {A+C,+,B+D}<L>
+        SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
+                                               AddRec->op_end());
+        for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
+             ++OtherIdx)
+          if (const SCEVAddRecExpr *OtherAddRec =
+                dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]))
+            if (OtherAddRec->getLoop() == AddRecLoop) {
+              for (unsigned i = 0, e = OtherAddRec->getNumOperands();
+                   i != e; ++i) {
+                if (i >= AddRecOps.size()) {
+                  AddRecOps.append(OtherAddRec->op_begin()+i,
+                                   OtherAddRec->op_end());
+                  break;
+                }
+                AddRecOps[i] = getAddExpr(AddRecOps[i],
+                                          OtherAddRec->getOperand(i));
+              }
+              Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+            }
+        // Step size has changed, so we cannot guarantee no self-wraparound.
+        Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap);
+        return getAddExpr(Ops);
+      }
+
+    // Otherwise couldn't fold anything into this recurrence.  Move onto the
+    // next one.
+  }
+
+  // Okay, it looks like we really DO need an add expr.  Check to see if we
+  // already have one, otherwise create a new one.
+  FoldingSetNodeID ID;
+  ID.AddInteger(scAddExpr);
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    ID.AddPointer(Ops[i]);
+  void *IP = 0;
+  SCEVAddExpr *S =
+    static_cast<SCEVAddExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
+  if (!S) {
+    const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+    std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+    S = new (SCEVAllocator) SCEVAddExpr(ID.Intern(SCEVAllocator),
+                                        O, Ops.size());
+    UniqueSCEVs.InsertNode(S, IP);
+  }
+  S->setNoWrapFlags(Flags);
+  return S;
+}
+
+static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) {
+  uint64_t k = i*j;
+  if (j > 1 && k / j != i) Overflow = true;
+  return k;
+}
+
+/// Compute the result of "n choose k", the binomial coefficient.  If an
+/// intermediate computation overflows, Overflow will be set and the return will
+/// be garbage. Overflow is not cleared on absence of overflow.
+static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) {
+  // We use the multiplicative formula:
+  //     n(n-1)(n-2)...(n-(k-1)) / k(k-1)(k-2)...1 .
+  // At each iteration, we take the n-th term of the numeral and divide by the
+  // (k-n)th term of the denominator.  This division will always produce an
+  // integral result, and helps reduce the chance of overflow in the
+  // intermediate computations. However, we can still overflow even when the
+  // final result would fit.
+
+  if (n == 0 || n == k) return 1;
+  if (k > n) return 0;
+
+  if (k > n/2)
+    k = n-k;
+
+  uint64_t r = 1;
+  for (uint64_t i = 1; i <= k; ++i) {
+    r = umul_ov(r, n-(i-1), Overflow);
+    r /= i;
+  }
+  return r;
+}
+
+/// getMulExpr - Get a canonical multiply expression, or something simpler if
+/// possible.
+const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
+                                        SCEV::NoWrapFlags Flags) {
+  assert(Flags == maskFlags(Flags, SCEV::FlagNUW | SCEV::FlagNSW) &&
+         "only nuw or nsw allowed");
+  assert(!Ops.empty() && "Cannot get empty mul!");
+  if (Ops.size() == 1) return Ops[0];
+#ifndef NDEBUG
+  Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+  for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+    assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
+           "SCEVMulExpr operand types don't match!");
+#endif
+
+  // If FlagNSW is true and all the operands are non-negative, infer FlagNUW.
+  // And vice-versa.
+  int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW;
+  SCEV::NoWrapFlags SignOrUnsignWrap = maskFlags(Flags, SignOrUnsignMask);
+  if (SignOrUnsignWrap && (SignOrUnsignWrap != SignOrUnsignMask)) {
+    bool All = true;
+    for (SmallVectorImpl<const SCEV *>::const_iterator I = Ops.begin(),
+         E = Ops.end(); I != E; ++I)
+      if (!isKnownNonNegative(*I)) {
+        All = false;
+        break;
+      }
+    if (All) Flags = setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask);
+  }
+
+  // Sort by complexity, this groups all similar expression types together.
+  GroupByComplexity(Ops, LI);
+
+  // If there are any constants, fold them together.
+  unsigned Idx = 0;
+  if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+
+    // C1*(C2+V) -> C1*C2 + C1*V
+    if (Ops.size() == 2)
+      if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1]))
+        if (Add->getNumOperands() == 2 &&
+            isa<SCEVConstant>(Add->getOperand(0)))
+          return getAddExpr(getMulExpr(LHSC, Add->getOperand(0)),
+                            getMulExpr(LHSC, Add->getOperand(1)));
+
+    ++Idx;
+    while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+      // We found two constants, fold them together!
+      ConstantInt *Fold = ConstantInt::get(getContext(),
+                                           LHSC->getValue()->getValue() *
+                                           RHSC->getValue()->getValue());
+      Ops[0] = getConstant(Fold);
+      Ops.erase(Ops.begin()+1);  // Erase the folded element
+      if (Ops.size() == 1) return Ops[0];
+      LHSC = cast<SCEVConstant>(Ops[0]);
+    }
+
+    // If we are left with a constant one being multiplied, strip it off.
+    if (cast<SCEVConstant>(Ops[0])->getValue()->equalsInt(1)) {
+      Ops.erase(Ops.begin());
+      --Idx;
+    } else if (cast<SCEVConstant>(Ops[0])->getValue()->isZero()) {
+      // If we have a multiply of zero, it will always be zero.
+      return Ops[0];
+    } else if (Ops[0]->isAllOnesValue()) {
+      // If we have a mul by -1 of an add, try distributing the -1 among the
+      // add operands.
+      if (Ops.size() == 2) {
+        if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[1])) {
+          SmallVector<const SCEV *, 4> NewOps;
+          bool AnyFolded = false;
+          for (SCEVAddRecExpr::op_iterator I = Add->op_begin(),
+                 E = Add->op_end(); I != E; ++I) {
+            const SCEV *Mul = getMulExpr(Ops[0], *I);
+            if (!isa<SCEVMulExpr>(Mul)) AnyFolded = true;
+            NewOps.push_back(Mul);
+          }
+          if (AnyFolded)
+            return getAddExpr(NewOps);
+        }
+        else if (const SCEVAddRecExpr *
+                 AddRec = dyn_cast<SCEVAddRecExpr>(Ops[1])) {
+          // Negation preserves a recurrence's no self-wrap property.
+          SmallVector<const SCEV *, 4> Operands;
+          for (SCEVAddRecExpr::op_iterator I = AddRec->op_begin(),
+                 E = AddRec->op_end(); I != E; ++I) {
+            Operands.push_back(getMulExpr(Ops[0], *I));
+          }
+          return getAddRecExpr(Operands, AddRec->getLoop(),
+                               AddRec->getNoWrapFlags(SCEV::FlagNW));
+        }
+      }
+    }
+
+    if (Ops.size() == 1)
+      return Ops[0];
+  }
+
+  // Skip over the add expression until we get to a multiply.
+  while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scMulExpr)
+    ++Idx;
+
+  // If there are mul operands inline them all into this expression.
+  if (Idx < Ops.size()) {
+    bool DeletedMul = false;
+    while (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[Idx])) {
+      // If we have an mul, expand the mul operands onto the end of the operands
+      // list.
+      Ops.erase(Ops.begin()+Idx);
+      Ops.append(Mul->op_begin(), Mul->op_end());
+      DeletedMul = true;
+    }
+
+    // If we deleted at least one mul, we added operands to the end of the list,
+    // and they are not necessarily sorted.  Recurse to resort and resimplify
+    // any operands we just acquired.
+    if (DeletedMul)
+      return getMulExpr(Ops);
+  }
+
+  // If there are any add recurrences in the operands list, see if any other
+  // added values are loop invariant.  If so, we can fold them into the
+  // recurrence.
+  while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddRecExpr)
+    ++Idx;
+
+  // Scan over all recurrences, trying to fold loop invariants into them.
+  for (; Idx < Ops.size() && isa<SCEVAddRecExpr>(Ops[Idx]); ++Idx) {
+    // Scan all of the other operands to this mul and add them to the vector if
+    // they are loop invariant w.r.t. the recurrence.
+    SmallVector<const SCEV *, 8> LIOps;
+    const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
+    const Loop *AddRecLoop = AddRec->getLoop();
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+      if (isLoopInvariant(Ops[i], AddRecLoop)) {
+        LIOps.push_back(Ops[i]);
+        Ops.erase(Ops.begin()+i);
+        --i; --e;
+      }
+
+    // If we found some loop invariants, fold them into the recurrence.
+    if (!LIOps.empty()) {
+      //  NLI * LI * {Start,+,Step}  -->  NLI * {LI*Start,+,LI*Step}
+      SmallVector<const SCEV *, 4> NewOps;
+      NewOps.reserve(AddRec->getNumOperands());
+      const SCEV *Scale = getMulExpr(LIOps);
+      for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i)
+        NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i)));
+
+      // Build the new addrec. Propagate the NUW and NSW flags if both the
+      // outer mul and the inner addrec are guaranteed to have no overflow.
+      //
+      // No self-wrap cannot be guaranteed after changing the step size, but
+      // will be inferred if either NUW or NSW is true.
+      Flags = AddRec->getNoWrapFlags(clearFlags(Flags, SCEV::FlagNW));
+      const SCEV *NewRec = getAddRecExpr(NewOps, AddRecLoop, Flags);
+
+      // If all of the other operands were loop invariant, we are done.
+      if (Ops.size() == 1) return NewRec;
+
+      // Otherwise, multiply the folded AddRec by the non-invariant parts.
+      for (unsigned i = 0;; ++i)
+        if (Ops[i] == AddRec) {
+          Ops[i] = NewRec;
+          break;
+        }
+      return getMulExpr(Ops);
+    }
+
+    // Okay, if there weren't any loop invariants to be folded, check to see if
+    // there are multiple AddRec's with the same loop induction variable being
+    // multiplied together.  If so, we can fold them.
+    for (unsigned OtherIdx = Idx+1;
+         OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
+         ++OtherIdx) {
+      if (AddRecLoop != cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop())
+        continue;
+
+      // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L>
+      // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [
+      //       choose(x, 2x)*choose(2x-y, x-z)*A_{y-z}*B_z
+      //   ]]],+,...up to x=2n}.
+      // Note that the arguments to choose() are always integers with values
+      // known at compile time, never SCEV objects.
+      //
+      // The implementation avoids pointless extra computations when the two
+      // addrec's are of different length (mathematically, it's equivalent to
+      // an infinite stream of zeros on the right).
+      bool OpsModified = false;
+      for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
+           ++OtherIdx) {
+        const SCEVAddRecExpr *OtherAddRec =
+          dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]);
+        if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop)
+          continue;
+
+        bool Overflow = false;
+        Type *Ty = AddRec->getType();
+        bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64;
+        SmallVector<const SCEV*, 7> AddRecOps;
+        for (int x = 0, xe = AddRec->getNumOperands() +
+               OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) {
+          const SCEV *Term = getConstant(Ty, 0);
+          for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) {
+            uint64_t Coeff1 = Choose(x, 2*x - y, Overflow);
+            for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1),
+                   ze = std::min(x+1, (int)OtherAddRec->getNumOperands());
+                 z < ze && !Overflow; ++z) {
+              uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow);
+              uint64_t Coeff;
+              if (LargerThan64Bits)
+                Coeff = umul_ov(Coeff1, Coeff2, Overflow);
+              else
+                Coeff = Coeff1*Coeff2;
+              const SCEV *CoeffTerm = getConstant(Ty, Coeff);
+              const SCEV *Term1 = AddRec->getOperand(y-z);
+              const SCEV *Term2 = OtherAddRec->getOperand(z);
+              Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2));
+            }
+          }
+          AddRecOps.push_back(Term);
+        }
+        if (!Overflow) {
+          const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(),
+                                                SCEV::FlagAnyWrap);
+          if (Ops.size() == 2) return NewAddRec;
+          Ops[Idx] = NewAddRec;
+          Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+          OpsModified = true;
+          AddRec = dyn_cast<SCEVAddRecExpr>(NewAddRec);
+          if (!AddRec)
+            break;
+        }
+      }
+      if (OpsModified)
+        return getMulExpr(Ops);
+    }
+
+    // Otherwise couldn't fold anything into this recurrence.  Move onto the
+    // next one.
+  }
+
+  // Okay, it looks like we really DO need an mul expr.  Check to see if we
+  // already have one, otherwise create a new one.
+  FoldingSetNodeID ID;
+  ID.AddInteger(scMulExpr);
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    ID.AddPointer(Ops[i]);
+  void *IP = 0;
+  SCEVMulExpr *S =
+    static_cast<SCEVMulExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
+  if (!S) {
+    const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+    std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+    S = new (SCEVAllocator) SCEVMulExpr(ID.Intern(SCEVAllocator),
+                                        O, Ops.size());
+    UniqueSCEVs.InsertNode(S, IP);
+  }
+  S->setNoWrapFlags(Flags);
+  return S;
+}
+
+/// getUDivExpr - Get a canonical unsigned division expression, or something
+/// simpler if possible.
+const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
+                                         const SCEV *RHS) {
+  assert(getEffectiveSCEVType(LHS->getType()) ==
+         getEffectiveSCEVType(RHS->getType()) &&
+         "SCEVUDivExpr operand types don't match!");
+
+  if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS)) {
+    if (RHSC->getValue()->equalsInt(1))
+      return LHS;                               // X udiv 1 --> x
+    // If the denominator is zero, the result of the udiv is undefined. Don't
+    // try to analyze it, because the resolution chosen here may differ from
+    // the resolution chosen in other parts of the compiler.
+    if (!RHSC->getValue()->isZero()) {
+      // Determine if the division can be folded into the operands of
+      // its operands.
+      // TODO: Generalize this to non-constants by using known-bits information.
+      Type *Ty = LHS->getType();
+      unsigned LZ = RHSC->getValue()->getValue().countLeadingZeros();
+      unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ - 1;
+      // For non-power-of-two values, effectively round the value up to the
+      // nearest power of two.
+      if (!RHSC->getValue()->getValue().isPowerOf2())
+        ++MaxShiftAmt;
+      IntegerType *ExtTy =
+        IntegerType::get(getContext(), getTypeSizeInBits(Ty) + MaxShiftAmt);
+      if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS))
+        if (const SCEVConstant *Step =
+            dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this))) {
+          // {X,+,N}/C --> {X/C,+,N/C} if safe and N/C can be folded.
+          const APInt &StepInt = Step->getValue()->getValue();
+          const APInt &DivInt = RHSC->getValue()->getValue();
+          if (!StepInt.urem(DivInt) &&
+              getZeroExtendExpr(AR, ExtTy) ==
+              getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
+                            getZeroExtendExpr(Step, ExtTy),
+                            AR->getLoop(), SCEV::FlagAnyWrap)) {
+            SmallVector<const SCEV *, 4> Operands;
+            for (unsigned i = 0, e = AR->getNumOperands(); i != e; ++i)
+              Operands.push_back(getUDivExpr(AR->getOperand(i), RHS));
+            return getAddRecExpr(Operands, AR->getLoop(),
+                                 SCEV::FlagNW);
+          }
+          /// Get a canonical UDivExpr for a recurrence.
+          /// {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0.
+          // We can currently only fold X%N if X is constant.
+          const SCEVConstant *StartC = dyn_cast<SCEVConstant>(AR->getStart());
+          if (StartC && !DivInt.urem(StepInt) &&
+              getZeroExtendExpr(AR, ExtTy) ==
+              getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
+                            getZeroExtendExpr(Step, ExtTy),
+                            AR->getLoop(), SCEV::FlagAnyWrap)) {
+            const APInt &StartInt = StartC->getValue()->getValue();
+            const APInt &StartRem = StartInt.urem(StepInt);
+            if (StartRem != 0)
+              LHS = getAddRecExpr(getConstant(StartInt - StartRem), Step,
+                                  AR->getLoop(), SCEV::FlagNW);
+          }
+        }
+      // (A*B)/C --> A*(B/C) if safe and B/C can be folded.
+      if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(LHS)) {
+        SmallVector<const SCEV *, 4> Operands;
+        for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i)
+          Operands.push_back(getZeroExtendExpr(M->getOperand(i), ExtTy));
+        if (getZeroExtendExpr(M, ExtTy) == getMulExpr(Operands))
+          // Find an operand that's safely divisible.
+          for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) {
+            const SCEV *Op = M->getOperand(i);
+            const SCEV *Div = getUDivExpr(Op, RHSC);
+            if (!isa<SCEVUDivExpr>(Div) && getMulExpr(Div, RHSC) == Op) {
+              Operands = SmallVector<const SCEV *, 4>(M->op_begin(),
+                                                      M->op_end());
+              Operands[i] = Div;
+              return getMulExpr(Operands);
+            }
+          }
+      }
+      // (A+B)/C --> (A/C + B/C) if safe and A/C and B/C can be folded.
+      if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(LHS)) {
+        SmallVector<const SCEV *, 4> Operands;
+        for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i)
+          Operands.push_back(getZeroExtendExpr(A->getOperand(i), ExtTy));
+        if (getZeroExtendExpr(A, ExtTy) == getAddExpr(Operands)) {
+          Operands.clear();
+          for (unsigned i = 0, e = A->getNumOperands(); i != e; ++i) {
+            const SCEV *Op = getUDivExpr(A->getOperand(i), RHS);
+            if (isa<SCEVUDivExpr>(Op) ||
+                getMulExpr(Op, RHS) != A->getOperand(i))
+              break;
+            Operands.push_back(Op);
+          }
+          if (Operands.size() == A->getNumOperands())
+            return getAddExpr(Operands);
+        }
+      }
+
+      // Fold if both operands are constant.
+      if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS)) {
+        Constant *LHSCV = LHSC->getValue();
+        Constant *RHSCV = RHSC->getValue();
+        return getConstant(cast<ConstantInt>(ConstantExpr::getUDiv(LHSCV,
+                                                                   RHSCV)));
+      }
+    }
+  }
+
+  FoldingSetNodeID ID;
+  ID.AddInteger(scUDivExpr);
+  ID.AddPointer(LHS);
+  ID.AddPointer(RHS);
+  void *IP = 0;
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+  SCEV *S = new (SCEVAllocator) SCEVUDivExpr(ID.Intern(SCEVAllocator),
+                                             LHS, RHS);
+  UniqueSCEVs.InsertNode(S, IP);
+  return S;
+}
+
+
+/// getAddRecExpr - Get an add recurrence expression for the specified loop.
+/// Simplify the expression as much as possible.
+const SCEV *ScalarEvolution::getAddRecExpr(const SCEV *Start, const SCEV *Step,
+                                           const Loop *L,
+                                           SCEV::NoWrapFlags Flags) {
+  SmallVector<const SCEV *, 4> Operands;
+  Operands.push_back(Start);
+  if (const SCEVAddRecExpr *StepChrec = dyn_cast<SCEVAddRecExpr>(Step))
+    if (StepChrec->getLoop() == L) {
+      Operands.append(StepChrec->op_begin(), StepChrec->op_end());
+      return getAddRecExpr(Operands, L, maskFlags(Flags, SCEV::FlagNW));
+    }
+
+  Operands.push_back(Step);
+  return getAddRecExpr(Operands, L, Flags);
+}
+
+/// getAddRecExpr - Get an add recurrence expression for the specified loop.
+/// Simplify the expression as much as possible.
+const SCEV *
+ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
+                               const Loop *L, SCEV::NoWrapFlags Flags) {
+  if (Operands.size() == 1) return Operands[0];
+#ifndef NDEBUG
+  Type *ETy = getEffectiveSCEVType(Operands[0]->getType());
+  for (unsigned i = 1, e = Operands.size(); i != e; ++i)
+    assert(getEffectiveSCEVType(Operands[i]->getType()) == ETy &&
+           "SCEVAddRecExpr operand types don't match!");
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i)
+    assert(isLoopInvariant(Operands[i], L) &&
+           "SCEVAddRecExpr operand is not loop-invariant!");
+#endif
+
+  if (Operands.back()->isZero()) {
+    Operands.pop_back();
+    return getAddRecExpr(Operands, L, SCEV::FlagAnyWrap); // {X,+,0}  -->  X
+  }
+
+  // It's tempting to want to call getMaxBackedgeTakenCount count here and
+  // use that information to infer NUW and NSW flags. However, computing a
+  // BE count requires calling getAddRecExpr, so we may not yet have a
+  // meaningful BE count at this point (and if we don't, we'd be stuck
+  // with a SCEVCouldNotCompute as the cached BE count).
+
+  // If FlagNSW is true and all the operands are non-negative, infer FlagNUW.
+  // And vice-versa.
+  int SignOrUnsignMask = SCEV::FlagNUW | SCEV::FlagNSW;
+  SCEV::NoWrapFlags SignOrUnsignWrap = maskFlags(Flags, SignOrUnsignMask);
+  if (SignOrUnsignWrap && (SignOrUnsignWrap != SignOrUnsignMask)) {
+    bool All = true;
+    for (SmallVectorImpl<const SCEV *>::const_iterator I = Operands.begin(),
+         E = Operands.end(); I != E; ++I)
+      if (!isKnownNonNegative(*I)) {
+        All = false;
+        break;
+      }
+    if (All) Flags = setFlags(Flags, (SCEV::NoWrapFlags)SignOrUnsignMask);
+  }
+
+  // Canonicalize nested AddRecs in by nesting them in order of loop depth.
+  if (const SCEVAddRecExpr *NestedAR = dyn_cast<SCEVAddRecExpr>(Operands[0])) {
+    const Loop *NestedLoop = NestedAR->getLoop();
+    if (L->contains(NestedLoop) ?
+        (L->getLoopDepth() < NestedLoop->getLoopDepth()) :
+        (!NestedLoop->contains(L) &&
+         DT->dominates(L->getHeader(), NestedLoop->getHeader()))) {
+      SmallVector<const SCEV *, 4> NestedOperands(NestedAR->op_begin(),
+                                                  NestedAR->op_end());
+      Operands[0] = NestedAR->getStart();
+      // AddRecs require their operands be loop-invariant with respect to their
+      // loops. Don't perform this transformation if it would break this
+      // requirement.
+      bool AllInvariant = true;
+      for (unsigned i = 0, e = Operands.size(); i != e; ++i)
+        if (!isLoopInvariant(Operands[i], L)) {
+          AllInvariant = false;
+          break;
+        }
+      if (AllInvariant) {
+        // Create a recurrence for the outer loop with the same step size.
+        //
+        // The outer recurrence keeps its NW flag but only keeps NUW/NSW if the
+        // inner recurrence has the same property.
+        SCEV::NoWrapFlags OuterFlags =
+          maskFlags(Flags, SCEV::FlagNW | NestedAR->getNoWrapFlags());
+
+        NestedOperands[0] = getAddRecExpr(Operands, L, OuterFlags);
+        AllInvariant = true;
+        for (unsigned i = 0, e = NestedOperands.size(); i != e; ++i)
+          if (!isLoopInvariant(NestedOperands[i], NestedLoop)) {
+            AllInvariant = false;
+            break;
+          }
+        if (AllInvariant) {
+          // Ok, both add recurrences are valid after the transformation.
+          //
+          // The inner recurrence keeps its NW flag but only keeps NUW/NSW if
+          // the outer recurrence has the same property.
+          SCEV::NoWrapFlags InnerFlags =
+            maskFlags(NestedAR->getNoWrapFlags(), SCEV::FlagNW | Flags);
+          return getAddRecExpr(NestedOperands, NestedLoop, InnerFlags);
+        }
+      }
+      // Reset Operands to its original state.
+      Operands[0] = NestedAR;
+    }
+  }
+
+  // Okay, it looks like we really DO need an addrec expr.  Check to see if we
+  // already have one, otherwise create a new one.
+  FoldingSetNodeID ID;
+  ID.AddInteger(scAddRecExpr);
+  for (unsigned i = 0, e = Operands.size(); i != e; ++i)
+    ID.AddPointer(Operands[i]);
+  ID.AddPointer(L);
+  void *IP = 0;
+  SCEVAddRecExpr *S =
+    static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
+  if (!S) {
+    const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Operands.size());
+    std::uninitialized_copy(Operands.begin(), Operands.end(), O);
+    S = new (SCEVAllocator) SCEVAddRecExpr(ID.Intern(SCEVAllocator),
+                                           O, Operands.size(), L);
+    UniqueSCEVs.InsertNode(S, IP);
+  }
+  S->setNoWrapFlags(Flags);
+  return S;
+}
+
+const SCEV *ScalarEvolution::getSMaxExpr(const SCEV *LHS,
+                                         const SCEV *RHS) {
+  SmallVector<const SCEV *, 2> Ops;
+  Ops.push_back(LHS);
+  Ops.push_back(RHS);
+  return getSMaxExpr(Ops);
+}
+
+const SCEV *
+ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
+  assert(!Ops.empty() && "Cannot get empty smax!");
+  if (Ops.size() == 1) return Ops[0];
+#ifndef NDEBUG
+  Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+  for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+    assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
+           "SCEVSMaxExpr operand types don't match!");
+#endif
+
+  // Sort by complexity, this groups all similar expression types together.
+  GroupByComplexity(Ops, LI);
+
+  // If there are any constants, fold them together.
+  unsigned Idx = 0;
+  if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+    ++Idx;
+    assert(Idx < Ops.size());
+    while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+      // We found two constants, fold them together!
+      ConstantInt *Fold = ConstantInt::get(getContext(),
+                              APIntOps::smax(LHSC->getValue()->getValue(),
+                                             RHSC->getValue()->getValue()));
+      Ops[0] = getConstant(Fold);
+      Ops.erase(Ops.begin()+1);  // Erase the folded element
+      if (Ops.size() == 1) return Ops[0];
+      LHSC = cast<SCEVConstant>(Ops[0]);
+    }
+
+    // If we are left with a constant minimum-int, strip it off.
+    if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(true)) {
+      Ops.erase(Ops.begin());
+      --Idx;
+    } else if (cast<SCEVConstant>(Ops[0])->getValue()->isMaxValue(true)) {
+      // If we have an smax with a constant maximum-int, it will always be
+      // maximum-int.
+      return Ops[0];
+    }
+
+    if (Ops.size() == 1) return Ops[0];
+  }
+
+  // Find the first SMax
+  while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scSMaxExpr)
+    ++Idx;
+
+  // Check to see if one of the operands is an SMax. If so, expand its operands
+  // onto our operand list, and recurse to simplify.
+  if (Idx < Ops.size()) {
+    bool DeletedSMax = false;
+    while (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(Ops[Idx])) {
+      Ops.erase(Ops.begin()+Idx);
+      Ops.append(SMax->op_begin(), SMax->op_end());
+      DeletedSMax = true;
+    }
+
+    if (DeletedSMax)
+      return getSMaxExpr(Ops);
+  }
+
+  // Okay, check to see if the same value occurs in the operand list twice.  If
+  // so, delete one.  Since we sorted the list, these values are required to
+  // be adjacent.
+  for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
+    //  X smax Y smax Y  -->  X smax Y
+    //  X smax Y         -->  X, if X is always greater than Y
+    if (Ops[i] == Ops[i+1] ||
+        isKnownPredicate(ICmpInst::ICMP_SGE, Ops[i], Ops[i+1])) {
+      Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2);
+      --i; --e;
+    } else if (isKnownPredicate(ICmpInst::ICMP_SLE, Ops[i], Ops[i+1])) {
+      Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
+      --i; --e;
+    }
+
+  if (Ops.size() == 1) return Ops[0];
+
+  assert(!Ops.empty() && "Reduced smax down to nothing!");
+
+  // Okay, it looks like we really DO need an smax expr.  Check to see if we
+  // already have one, otherwise create a new one.
+  FoldingSetNodeID ID;
+  ID.AddInteger(scSMaxExpr);
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    ID.AddPointer(Ops[i]);
+  void *IP = 0;
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+  const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+  std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+  SCEV *S = new (SCEVAllocator) SCEVSMaxExpr(ID.Intern(SCEVAllocator),
+                                             O, Ops.size());
+  UniqueSCEVs.InsertNode(S, IP);
+  return S;
+}
+
+const SCEV *ScalarEvolution::getUMaxExpr(const SCEV *LHS,
+                                         const SCEV *RHS) {
+  SmallVector<const SCEV *, 2> Ops;
+  Ops.push_back(LHS);
+  Ops.push_back(RHS);
+  return getUMaxExpr(Ops);
+}
+
+const SCEV *
+ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
+  assert(!Ops.empty() && "Cannot get empty umax!");
+  if (Ops.size() == 1) return Ops[0];
+#ifndef NDEBUG
+  Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+  for (unsigned i = 1, e = Ops.size(); i != e; ++i)
+    assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
+           "SCEVUMaxExpr operand types don't match!");
+#endif
+
+  // Sort by complexity, this groups all similar expression types together.
+  GroupByComplexity(Ops, LI);
+
+  // If there are any constants, fold them together.
+  unsigned Idx = 0;
+  if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
+    ++Idx;
+    assert(Idx < Ops.size());
+    while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
+      // We found two constants, fold them together!
+      ConstantInt *Fold = ConstantInt::get(getContext(),
+                              APIntOps::umax(LHSC->getValue()->getValue(),
+                                             RHSC->getValue()->getValue()));
+      Ops[0] = getConstant(Fold);
+      Ops.erase(Ops.begin()+1);  // Erase the folded element
+      if (Ops.size() == 1) return Ops[0];
+      LHSC = cast<SCEVConstant>(Ops[0]);
+    }
+
+    // If we are left with a constant minimum-int, strip it off.
+    if (cast<SCEVConstant>(Ops[0])->getValue()->isMinValue(false)) {
+      Ops.erase(Ops.begin());
+      --Idx;
+    } else if (cast<SCEVConstant>(Ops[0])->getValue()->isMaxValue(false)) {
+      // If we have an umax with a constant maximum-int, it will always be
+      // maximum-int.
+      return Ops[0];
+    }
+
+    if (Ops.size() == 1) return Ops[0];
+  }
+
+  // Find the first UMax
+  while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scUMaxExpr)
+    ++Idx;
+
+  // Check to see if one of the operands is a UMax. If so, expand its operands
+  // onto our operand list, and recurse to simplify.
+  if (Idx < Ops.size()) {
+    bool DeletedUMax = false;
+    while (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(Ops[Idx])) {
+      Ops.erase(Ops.begin()+Idx);
+      Ops.append(UMax->op_begin(), UMax->op_end());
+      DeletedUMax = true;
+    }
+
+    if (DeletedUMax)
+      return getUMaxExpr(Ops);
+  }
+
+  // Okay, check to see if the same value occurs in the operand list twice.  If
+  // so, delete one.  Since we sorted the list, these values are required to
+  // be adjacent.
+  for (unsigned i = 0, e = Ops.size()-1; i != e; ++i)
+    //  X umax Y umax Y  -->  X umax Y
+    //  X umax Y         -->  X, if X is always greater than Y
+    if (Ops[i] == Ops[i+1] ||
+        isKnownPredicate(ICmpInst::ICMP_UGE, Ops[i], Ops[i+1])) {
+      Ops.erase(Ops.begin()+i+1, Ops.begin()+i+2);
+      --i; --e;
+    } else if (isKnownPredicate(ICmpInst::ICMP_ULE, Ops[i], Ops[i+1])) {
+      Ops.erase(Ops.begin()+i, Ops.begin()+i+1);
+      --i; --e;
+    }
+
+  if (Ops.size() == 1) return Ops[0];
+
+  assert(!Ops.empty() && "Reduced umax down to nothing!");
+
+  // Okay, it looks like we really DO need a umax expr.  Check to see if we
+  // already have one, otherwise create a new one.
+  FoldingSetNodeID ID;
+  ID.AddInteger(scUMaxExpr);
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    ID.AddPointer(Ops[i]);
+  void *IP = 0;
+  if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+  const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+  std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+  SCEV *S = new (SCEVAllocator) SCEVUMaxExpr(ID.Intern(SCEVAllocator),
+                                             O, Ops.size());
+  UniqueSCEVs.InsertNode(S, IP);
+  return S;
+}
+
+const SCEV *ScalarEvolution::getSMinExpr(const SCEV *LHS,
+                                         const SCEV *RHS) {
+  // ~smax(~x, ~y) == smin(x, y).
+  return getNotSCEV(getSMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
+}
+
+const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
+                                         const SCEV *RHS) {
+  // ~umax(~x, ~y) == umin(x, y)
+  return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
+}
+
+const SCEV *ScalarEvolution::getSizeOfExpr(Type *AllocTy) {
+  // If we have DataLayout, we can bypass creating a target-independent
+  // constant expression and then folding it back into a ConstantInt.
+  // This is just a compile-time optimization.
+  if (TD)
+    return getConstant(TD->getIntPtrType(getContext()),
+                       TD->getTypeAllocSize(AllocTy));
+
+  Constant *C = ConstantExpr::getSizeOf(AllocTy);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+    if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
+      C = Folded;
+  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
+  return getTruncateOrZeroExtend(getSCEV(C), Ty);
+}
+
+const SCEV *ScalarEvolution::getAlignOfExpr(Type *AllocTy) {
+  Constant *C = ConstantExpr::getAlignOf(AllocTy);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+    if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
+      C = Folded;
+  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
+  return getTruncateOrZeroExtend(getSCEV(C), Ty);
+}
+
+const SCEV *ScalarEvolution::getOffsetOfExpr(StructType *STy,
+                                             unsigned FieldNo) {
+  // If we have DataLayout, we can bypass creating a target-independent
+  // constant expression and then folding it back into a ConstantInt.
+  // This is just a compile-time optimization.
+  if (TD)
+    return getConstant(TD->getIntPtrType(getContext()),
+                       TD->getStructLayout(STy)->getElementOffset(FieldNo));
+
+  Constant *C = ConstantExpr::getOffsetOf(STy, FieldNo);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+    if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
+      C = Folded;
+  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy));
+  return getTruncateOrZeroExtend(getSCEV(C), Ty);
+}
+
+const SCEV *ScalarEvolution::getOffsetOfExpr(Type *CTy,
+                                             Constant *FieldNo) {
+  Constant *C = ConstantExpr::getOffsetOf(CTy, FieldNo);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+    if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
+      C = Folded;
+  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(CTy));
+  return getTruncateOrZeroExtend(getSCEV(C), Ty);
+}
+
+const SCEV *ScalarEvolution::getUnknown(Value *V) {
+  // Don't attempt to do anything other than create a SCEVUnknown object
+  // here.  createSCEV only calls getUnknown after checking for all other
+  // interesting possibilities, and any other code that calls getUnknown
+  // is doing so in order to hide a value from SCEV canonicalization.
+
+  FoldingSetNodeID ID;
+  ID.AddInteger(scUnknown);
+  ID.AddPointer(V);
+  void *IP = 0;
+  if (SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) {
+    assert(cast<SCEVUnknown>(S)->getValue() == V &&
+           "Stale SCEVUnknown in uniquing map!");
+    return S;
+  }
+  SCEV *S = new (SCEVAllocator) SCEVUnknown(ID.Intern(SCEVAllocator), V, this,
+                                            FirstUnknown);
+  FirstUnknown = cast<SCEVUnknown>(S);
+  UniqueSCEVs.InsertNode(S, IP);
+  return S;
+}
+
+//===----------------------------------------------------------------------===//
+//            Basic SCEV Analysis and PHI Idiom Recognition Code
+//
+
+/// isSCEVable - Test if values of the given type are analyzable within
+/// the SCEV framework. This primarily includes integer types, and it
+/// can optionally include pointer types if the ScalarEvolution class
+/// has access to target-specific information.
+bool ScalarEvolution::isSCEVable(Type *Ty) const {
+  // Integers and pointers are always SCEVable.
+  return Ty->isIntegerTy() || Ty->isPointerTy();
+}
+
+/// getTypeSizeInBits - Return the size in bits of the specified type,
+/// for which isSCEVable must return true.
+uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
+  assert(isSCEVable(Ty) && "Type is not SCEVable!");
+
+  // If we have a DataLayout, use it!
+  if (TD)
+    return TD->getTypeSizeInBits(Ty);
+
+  // Integer types have fixed sizes.
+  if (Ty->isIntegerTy())
+    return Ty->getPrimitiveSizeInBits();
+
+  // The only other support type is pointer. Without DataLayout, conservatively
+  // assume pointers are 64-bit.
+  assert(Ty->isPointerTy() && "isSCEVable permitted a non-SCEVable type!");
+  return 64;
+}
+
+/// getEffectiveSCEVType - Return a type with the same bitwidth as
+/// the given type and which represents how SCEV will treat the given
+/// type, for which isSCEVable must return true. For pointer types,
+/// this is the pointer-sized integer type.
+Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
+  assert(isSCEVable(Ty) && "Type is not SCEVable!");
+
+  if (Ty->isIntegerTy())
+    return Ty;
+
+  // The only other support type is pointer.
+  assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!");
+  if (TD) return TD->getIntPtrType(getContext());
+
+  // Without DataLayout, conservatively assume pointers are 64-bit.
+  return Type::getInt64Ty(getContext());
+}
+
+const SCEV *ScalarEvolution::getCouldNotCompute() {
+  return &CouldNotCompute;
+}
+
+/// getSCEV - Return an existing SCEV if it exists, otherwise analyze the
+/// expression and create a new one.
+const SCEV *ScalarEvolution::getSCEV(Value *V) {
+  assert(isSCEVable(V->getType()) && "Value is not SCEVable!");
+
+  ValueExprMapType::const_iterator I = ValueExprMap.find_as(V);
+  if (I != ValueExprMap.end()) return I->second;
+  const SCEV *S = createSCEV(V);
+
+  // The process of creating a SCEV for V may have caused other SCEVs
+  // to have been created, so it's necessary to insert the new entry
+  // from scratch, rather than trying to remember the insert position
+  // above.
+  ValueExprMap.insert(std::make_pair(SCEVCallbackVH(V, this), S));
+  return S;
+}
+
+/// getNegativeSCEV - Return a SCEV corresponding to -V = -1*V
+///
+const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V) {
+  if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
+    return getConstant(
+               cast<ConstantInt>(ConstantExpr::getNeg(VC->getValue())));
+
+  Type *Ty = V->getType();
+  Ty = getEffectiveSCEVType(Ty);
+  return getMulExpr(V,
+                  getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty))));
+}
+
+/// getNotSCEV - Return a SCEV corresponding to ~V = -1-V
+const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) {
+  if (const SCEVConstant *VC = dyn_cast<SCEVConstant>(V))
+    return getConstant(
+                cast<ConstantInt>(ConstantExpr::getNot(VC->getValue())));
+
+  Type *Ty = V->getType();
+  Ty = getEffectiveSCEVType(Ty);
+  const SCEV *AllOnes =
+                   getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty)));
+  return getMinusSCEV(AllOnes, V);
+}
+
+/// getMinusSCEV - Return LHS-RHS.  Minus is represented in SCEV as A+B*-1.
+const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
+                                          SCEV::NoWrapFlags Flags) {
+  assert(!maskFlags(Flags, SCEV::FlagNUW) && "subtraction does not have NUW");
+
+  // Fast path: X - X --> 0.
+  if (LHS == RHS)
+    return getConstant(LHS->getType(), 0);
+
+  // X - Y --> X + -Y
+  return getAddExpr(LHS, getNegativeSCEV(RHS), Flags);
+}
+
+/// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type.  If the type must be extended, it is zero
+/// extended.
+const SCEV *
+ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty) {
+  Type *SrcTy = V->getType();
+  assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
+         (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Cannot truncate or zero extend with non-integer arguments!");
+  if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+    return V;  // No conversion
+  if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty))
+    return getTruncateExpr(V, Ty);
+  return getZeroExtendExpr(V, Ty);
+}
+
+/// getTruncateOrSignExtend - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type.  If the type must be extended, it is sign
+/// extended.
+const SCEV *
+ScalarEvolution::getTruncateOrSignExtend(const SCEV *V,
+                                         Type *Ty) {
+  Type *SrcTy = V->getType();
+  assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
+         (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Cannot truncate or zero extend with non-integer arguments!");
+  if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+    return V;  // No conversion
+  if (getTypeSizeInBits(SrcTy) > getTypeSizeInBits(Ty))
+    return getTruncateExpr(V, Ty);
+  return getSignExtendExpr(V, Ty);
+}
+
+/// getNoopOrZeroExtend - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type.  If the type must be extended, it is zero
+/// extended.  The conversion must not be narrowing.
+const SCEV *
+ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, Type *Ty) {
+  Type *SrcTy = V->getType();
+  assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
+         (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Cannot noop or zero extend with non-integer arguments!");
+  assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
+         "getNoopOrZeroExtend cannot truncate!");
+  if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+    return V;  // No conversion
+  return getZeroExtendExpr(V, Ty);
+}
+
+/// getNoopOrSignExtend - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type.  If the type must be extended, it is sign
+/// extended.  The conversion must not be narrowing.
+const SCEV *
+ScalarEvolution::getNoopOrSignExtend(const SCEV *V, Type *Ty) {
+  Type *SrcTy = V->getType();
+  assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
+         (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Cannot noop or sign extend with non-integer arguments!");
+  assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
+         "getNoopOrSignExtend cannot truncate!");
+  if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+    return V;  // No conversion
+  return getSignExtendExpr(V, Ty);
+}
+
+/// getNoopOrAnyExtend - Return a SCEV corresponding to a conversion of
+/// the input value to the specified type. If the type must be extended,
+/// it is extended with unspecified bits. The conversion must not be
+/// narrowing.
+const SCEV *
+ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, Type *Ty) {
+  Type *SrcTy = V->getType();
+  assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
+         (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Cannot noop or any extend with non-integer arguments!");
+  assert(getTypeSizeInBits(SrcTy) <= getTypeSizeInBits(Ty) &&
+         "getNoopOrAnyExtend cannot truncate!");
+  if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+    return V;  // No conversion
+  return getAnyExtendExpr(V, Ty);
+}
+
+/// getTruncateOrNoop - Return a SCEV corresponding to a conversion of the
+/// input value to the specified type.  The conversion must not be widening.
+const SCEV *
+ScalarEvolution::getTruncateOrNoop(const SCEV *V, Type *Ty) {
+  Type *SrcTy = V->getType();
+  assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
+         (Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Cannot truncate or noop with non-integer arguments!");
+  assert(getTypeSizeInBits(SrcTy) >= getTypeSizeInBits(Ty) &&
+         "getTruncateOrNoop cannot extend!");
+  if (getTypeSizeInBits(SrcTy) == getTypeSizeInBits(Ty))
+    return V;  // No conversion
+  return getTruncateExpr(V, Ty);
+}
+
+/// getUMaxFromMismatchedTypes - Promote the operands to the wider of
+/// the types using zero-extension, and then perform a umax operation
+/// with them.
+const SCEV *ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV *LHS,
+                                                        const SCEV *RHS) {
+  const SCEV *PromotedLHS = LHS;
+  const SCEV *PromotedRHS = RHS;
+
+  if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType()))
+    PromotedRHS = getZeroExtendExpr(RHS, LHS->getType());
+  else
+    PromotedLHS = getNoopOrZeroExtend(LHS, RHS->getType());
+
+  return getUMaxExpr(PromotedLHS, PromotedRHS);
+}
+
+/// getUMinFromMismatchedTypes - Promote the operands to the wider of
+/// the types using zero-extension, and then perform a umin operation
+/// with them.
+const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(const SCEV *LHS,
+                                                        const SCEV *RHS) {
+  const SCEV *PromotedLHS = LHS;
+  const SCEV *PromotedRHS = RHS;
+
+  if (getTypeSizeInBits(LHS->getType()) > getTypeSizeInBits(RHS->getType()))
+    PromotedRHS = getZeroExtendExpr(RHS, LHS->getType());
+  else
+    PromotedLHS = getNoopOrZeroExtend(LHS, RHS->getType());
+
+  return getUMinExpr(PromotedLHS, PromotedRHS);
+}
+
+/// getPointerBase - Transitively follow the chain of pointer-type operands
+/// until reaching a SCEV that does not have a single pointer operand. This
+/// returns a SCEVUnknown pointer for well-formed pointer-type expressions,
+/// but corner cases do exist.
+const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
+  // A pointer operand may evaluate to a nonpointer expression, such as null.
+  if (!V->getType()->isPointerTy())
+    return V;
+
+  if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(V)) {
+    return getPointerBase(Cast->getOperand());
+  }
+  else if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(V)) {
+    const SCEV *PtrOp = 0;
+    for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
+         I != E; ++I) {
+      if ((*I)->getType()->isPointerTy()) {
+        // Cannot find the base of an expression with multiple pointer operands.
+        if (PtrOp)
+          return V;
+        PtrOp = *I;
+      }
+    }
+    if (!PtrOp)
+      return V;
+    return getPointerBase(PtrOp);
+  }
+  return V;
+}
+
+/// PushDefUseChildren - Push users of the given Instruction
+/// onto the given Worklist.
+static void
+PushDefUseChildren(Instruction *I,
+                   SmallVectorImpl<Instruction *> &Worklist) {
+  // Push the def-use children onto the Worklist stack.
+  for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+       UI != UE; ++UI)
+    Worklist.push_back(cast<Instruction>(*UI));
+}
+
+/// ForgetSymbolicValue - This looks up computed SCEV values for all
+/// instructions that depend on the given instruction and removes them from
+/// the ValueExprMapType map if they reference SymName. This is used during PHI
+/// resolution.
+void
+ScalarEvolution::ForgetSymbolicName(Instruction *PN, const SCEV *SymName) {
+  SmallVector<Instruction *, 16> Worklist;
+  PushDefUseChildren(PN, Worklist);
+
+  SmallPtrSet<Instruction *, 8> Visited;
+  Visited.insert(PN);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    if (!Visited.insert(I)) continue;
+
+    ValueExprMapType::iterator It =
+      ValueExprMap.find_as(static_cast<Value *>(I));
+    if (It != ValueExprMap.end()) {
+      const SCEV *Old = It->second;
+
+      // Short-circuit the def-use traversal if the symbolic name
+      // ceases to appear in expressions.
+      if (Old != SymName && !hasOperand(Old, SymName))
+        continue;
+
+      // SCEVUnknown for a PHI either means that it has an unrecognized
+      // structure, it's a PHI that's in the progress of being computed
+      // by createNodeForPHI, or it's a single-value PHI. In the first case,
+      // additional loop trip count information isn't going to change anything.
+      // In the second case, createNodeForPHI will perform the necessary
+      // updates on its own when it gets to that point. In the third, we do
+      // want to forget the SCEVUnknown.
+      if (!isa<PHINode>(I) ||
+          !isa<SCEVUnknown>(Old) ||
+          (I != PN && Old == SymName)) {
+        forgetMemoizedResults(Old);
+        ValueExprMap.erase(It);
+      }
+    }
+
+    PushDefUseChildren(I, Worklist);
+  }
+}
+
+/// createNodeForPHI - PHI nodes have two cases.  Either the PHI node exists in
+/// a loop header, making it a potential recurrence, or it doesn't.
+///
+const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
+  if (const Loop *L = LI->getLoopFor(PN->getParent()))
+    if (L->getHeader() == PN->getParent()) {
+      // The loop may have multiple entrances or multiple exits; we can analyze
+      // this phi as an addrec if it has a unique entry value and a unique
+      // backedge value.
+      Value *BEValueV = 0, *StartValueV = 0;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        Value *V = PN->getIncomingValue(i);
+        if (L->contains(PN->getIncomingBlock(i))) {
+          if (!BEValueV) {
+            BEValueV = V;
+          } else if (BEValueV != V) {
+            BEValueV = 0;
+            break;
+          }
+        } else if (!StartValueV) {
+          StartValueV = V;
+        } else if (StartValueV != V) {
+          StartValueV = 0;
+          break;
+        }
+      }
+      if (BEValueV && StartValueV) {
+        // While we are analyzing this PHI node, handle its value symbolically.
+        const SCEV *SymbolicName = getUnknown(PN);
+        assert(ValueExprMap.find_as(PN) == ValueExprMap.end() &&
+               "PHI node already processed?");
+        ValueExprMap.insert(std::make_pair(SCEVCallbackVH(PN, this), SymbolicName));
+
+        // Using this symbolic name for the PHI, analyze the value coming around
+        // the back-edge.
+        const SCEV *BEValue = getSCEV(BEValueV);
+
+        // NOTE: If BEValue is loop invariant, we know that the PHI node just
+        // has a special value for the first iteration of the loop.
+
+        // If the value coming around the backedge is an add with the symbolic
+        // value we just inserted, then we found a simple induction variable!
+        if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BEValue)) {
+          // If there is a single occurrence of the symbolic value, replace it
+          // with a recurrence.
+          unsigned FoundIndex = Add->getNumOperands();
+          for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+            if (Add->getOperand(i) == SymbolicName)
+              if (FoundIndex == e) {
+                FoundIndex = i;
+                break;
+              }
+
+          if (FoundIndex != Add->getNumOperands()) {
+            // Create an add with everything but the specified operand.
+            SmallVector<const SCEV *, 8> Ops;
+            for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+              if (i != FoundIndex)
+                Ops.push_back(Add->getOperand(i));
+            const SCEV *Accum = getAddExpr(Ops);
+
+            // This is not a valid addrec if the step amount is varying each
+            // loop iteration, but is not itself an addrec in this loop.
+            if (isLoopInvariant(Accum, L) ||
+                (isa<SCEVAddRecExpr>(Accum) &&
+                 cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) {
+              SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
+
+              // If the increment doesn't overflow, then neither the addrec nor
+              // the post-increment will overflow.
+              if (const AddOperator *OBO = dyn_cast<AddOperator>(BEValueV)) {
+                if (OBO->hasNoUnsignedWrap())
+                  Flags = setFlags(Flags, SCEV::FlagNUW);
+                if (OBO->hasNoSignedWrap())
+                  Flags = setFlags(Flags, SCEV::FlagNSW);
+              } else if (const GEPOperator *GEP =
+                         dyn_cast<GEPOperator>(BEValueV)) {
+                // If the increment is an inbounds GEP, then we know the address
+                // space cannot be wrapped around. We cannot make any guarantee
+                // about signed or unsigned overflow because pointers are
+                // unsigned but we may have a negative index from the base
+                // pointer.
+                if (GEP->isInBounds())
+                  Flags = setFlags(Flags, SCEV::FlagNW);
+              }
+
+              const SCEV *StartVal = getSCEV(StartValueV);
+              const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags);
+
+              // Since the no-wrap flags are on the increment, they apply to the
+              // post-incremented value as well.
+              if (isLoopInvariant(Accum, L))
+                (void)getAddRecExpr(getAddExpr(StartVal, Accum),
+                                    Accum, L, Flags);
+
+              // Okay, for the entire analysis of this edge we assumed the PHI
+              // to be symbolic.  We now need to go back and purge all of the
+              // entries for the scalars that use the symbolic expression.
+              ForgetSymbolicName(PN, SymbolicName);
+              ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV;
+              return PHISCEV;
+            }
+          }
+        } else if (const SCEVAddRecExpr *AddRec =
+                     dyn_cast<SCEVAddRecExpr>(BEValue)) {
+          // Otherwise, this could be a loop like this:
+          //     i = 0;  for (j = 1; ..; ++j) { ....  i = j; }
+          // In this case, j = {1,+,1}  and BEValue is j.
+          // Because the other in-value of i (0) fits the evolution of BEValue
+          // i really is an addrec evolution.
+          if (AddRec->getLoop() == L && AddRec->isAffine()) {
+            const SCEV *StartVal = getSCEV(StartValueV);
+
+            // If StartVal = j.start - j.stride, we can use StartVal as the
+            // initial step of the addrec evolution.
+            if (StartVal == getMinusSCEV(AddRec->getOperand(0),
+                                         AddRec->getOperand(1))) {
+              // FIXME: For constant StartVal, we should be able to infer
+              // no-wrap flags.
+              const SCEV *PHISCEV =
+                getAddRecExpr(StartVal, AddRec->getOperand(1), L,
+                              SCEV::FlagAnyWrap);
+
+              // Okay, for the entire analysis of this edge we assumed the PHI
+              // to be symbolic.  We now need to go back and purge all of the
+              // entries for the scalars that use the symbolic expression.
+              ForgetSymbolicName(PN, SymbolicName);
+              ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV;
+              return PHISCEV;
+            }
+          }
+        }
+      }
+    }
+
+  // If the PHI has a single incoming value, follow that value, unless the
+  // PHI's incoming blocks are in a different loop, in which case doing so
+  // risks breaking LCSSA form. Instcombine would normally zap these, but
+  // it doesn't have DominatorTree information, so it may miss cases.
+  if (Value *V = SimplifyInstruction(PN, TD, TLI, DT))
+    if (LI->replacementPreservesLCSSAForm(PN, V))
+      return getSCEV(V);
+
+  // If it's not a loop phi, we can't handle it yet.
+  return getUnknown(PN);
+}
+
+/// createNodeForGEP - Expand GEP instructions into add and multiply
+/// operations. This allows them to be analyzed by regular SCEV code.
+///
+const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
+
+  // Don't blindly transfer the inbounds flag from the GEP instruction to the
+  // Add expression, because the Instruction may be guarded by control flow
+  // and the no-overflow bits may not be valid for the expression in any
+  // context.
+  bool isInBounds = GEP->isInBounds();
+
+  Type *IntPtrTy = getEffectiveSCEVType(GEP->getType());
+  Value *Base = GEP->getOperand(0);
+  // Don't attempt to analyze GEPs over unsized objects.
+  if (!cast<PointerType>(Base->getType())->getElementType()->isSized())
+    return getUnknown(GEP);
+  const SCEV *TotalOffset = getConstant(IntPtrTy, 0);
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (GetElementPtrInst::op_iterator I = llvm::next(GEP->op_begin()),
+                                      E = GEP->op_end();
+       I != E; ++I) {
+    Value *Index = *I;
+    // Compute the (potentially symbolic) offset in bytes for this index.
+    if (StructType *STy = dyn_cast<StructType>(*GTI++)) {
+      // For a struct, add the member offset.
+      unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
+      const SCEV *FieldOffset = getOffsetOfExpr(STy, FieldNo);
+
+      // Add the field offset to the running total offset.
+      TotalOffset = getAddExpr(TotalOffset, FieldOffset);
+    } else {
+      // For an array, add the element offset, explicitly scaled.
+      const SCEV *ElementSize = getSizeOfExpr(*GTI);
+      const SCEV *IndexS = getSCEV(Index);
+      // Getelementptr indices are signed.
+      IndexS = getTruncateOrSignExtend(IndexS, IntPtrTy);
+
+      // Multiply the index by the element size to compute the element offset.
+      const SCEV *LocalOffset = getMulExpr(IndexS, ElementSize,
+                                           isInBounds ? SCEV::FlagNSW :
+                                           SCEV::FlagAnyWrap);
+
+      // Add the element offset to the running total offset.
+      TotalOffset = getAddExpr(TotalOffset, LocalOffset);
+    }
+  }
+
+  // Get the SCEV for the GEP base.
+  const SCEV *BaseS = getSCEV(Base);
+
+  // Add the total offset from all the GEP indices to the base.
+  return getAddExpr(BaseS, TotalOffset,
+                    isInBounds ? SCEV::FlagNSW : SCEV::FlagAnyWrap);
+}
+
+/// GetMinTrailingZeros - Determine the minimum number of zero bits that S is
+/// guaranteed to end in (at every loop iteration).  It is, at the same time,
+/// the minimum number of times S is divisible by 2.  For example, given {4,+,8}
+/// it returns 2.  If S is guaranteed to be 0, it returns the bitwidth of S.
+uint32_t
+ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
+    return C->getValue()->getValue().countTrailingZeros();
+
+  if (const SCEVTruncateExpr *T = dyn_cast<SCEVTruncateExpr>(S))
+    return std::min(GetMinTrailingZeros(T->getOperand()),
+                    (uint32_t)getTypeSizeInBits(T->getType()));
+
+  if (const SCEVZeroExtendExpr *E = dyn_cast<SCEVZeroExtendExpr>(S)) {
+    uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
+    return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ?
+             getTypeSizeInBits(E->getType()) : OpRes;
+  }
+
+  if (const SCEVSignExtendExpr *E = dyn_cast<SCEVSignExtendExpr>(S)) {
+    uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
+    return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ?
+             getTypeSizeInBits(E->getType()) : OpRes;
+  }
+
+  if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) {
+    // The result is the min of all operands results.
+    uint32_t MinOpRes = GetMinTrailingZeros(A->getOperand(0));
+    for (unsigned i = 1, e = A->getNumOperands(); MinOpRes && i != e; ++i)
+      MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(A->getOperand(i)));
+    return MinOpRes;
+  }
+
+  if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
+    // The result is the sum of all operands results.
+    uint32_t SumOpRes = GetMinTrailingZeros(M->getOperand(0));
+    uint32_t BitWidth = getTypeSizeInBits(M->getType());
+    for (unsigned i = 1, e = M->getNumOperands();
+         SumOpRes != BitWidth && i != e; ++i)
+      SumOpRes = std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i)),
+                          BitWidth);
+    return SumOpRes;
+  }
+
+  if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
+    // The result is the min of all operands results.
+    uint32_t MinOpRes = GetMinTrailingZeros(A->getOperand(0));
+    for (unsigned i = 1, e = A->getNumOperands(); MinOpRes && i != e; ++i)
+      MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(A->getOperand(i)));
+    return MinOpRes;
+  }
+
+  if (const SCEVSMaxExpr *M = dyn_cast<SCEVSMaxExpr>(S)) {
+    // The result is the min of all operands results.
+    uint32_t MinOpRes = GetMinTrailingZeros(M->getOperand(0));
+    for (unsigned i = 1, e = M->getNumOperands(); MinOpRes && i != e; ++i)
+      MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(M->getOperand(i)));
+    return MinOpRes;
+  }
+
+  if (const SCEVUMaxExpr *M = dyn_cast<SCEVUMaxExpr>(S)) {
+    // The result is the min of all operands results.
+    uint32_t MinOpRes = GetMinTrailingZeros(M->getOperand(0));
+    for (unsigned i = 1, e = M->getNumOperands(); MinOpRes && i != e; ++i)
+      MinOpRes = std::min(MinOpRes, GetMinTrailingZeros(M->getOperand(i)));
+    return MinOpRes;
+  }
+
+  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    // For a SCEVUnknown, ask ValueTracking.
+    unsigned BitWidth = getTypeSizeInBits(U->getType());
+    APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
+    ComputeMaskedBits(U->getValue(), Zeros, Ones);
+    return Zeros.countTrailingOnes();
+  }
+
+  // SCEVUDivExpr
+  return 0;
+}
+
+/// getUnsignedRange - Determine the unsigned range for a particular SCEV.
+///
+ConstantRange
+ScalarEvolution::getUnsignedRange(const SCEV *S) {
+  // See if we've computed this range already.
+  DenseMap<const SCEV *, ConstantRange>::iterator I = UnsignedRanges.find(S);
+  if (I != UnsignedRanges.end())
+    return I->second;
+
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
+    return setUnsignedRange(C, ConstantRange(C->getValue()->getValue()));
+
+  unsigned BitWidth = getTypeSizeInBits(S->getType());
+  ConstantRange ConservativeResult(BitWidth, /*isFullSet=*/true);
+
+  // If the value has known zeros, the maximum unsigned value will have those
+  // known zeros as well.
+  uint32_t TZ = GetMinTrailingZeros(S);
+  if (TZ != 0)
+    ConservativeResult =
+      ConstantRange(APInt::getMinValue(BitWidth),
+                    APInt::getMaxValue(BitWidth).lshr(TZ).shl(TZ) + 1);
+
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    ConstantRange X = getUnsignedRange(Add->getOperand(0));
+    for (unsigned i = 1, e = Add->getNumOperands(); i != e; ++i)
+      X = X.add(getUnsignedRange(Add->getOperand(i)));
+    return setUnsignedRange(Add, ConservativeResult.intersectWith(X));
+  }
+
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+    ConstantRange X = getUnsignedRange(Mul->getOperand(0));
+    for (unsigned i = 1, e = Mul->getNumOperands(); i != e; ++i)
+      X = X.multiply(getUnsignedRange(Mul->getOperand(i)));
+    return setUnsignedRange(Mul, ConservativeResult.intersectWith(X));
+  }
+
+  if (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(S)) {
+    ConstantRange X = getUnsignedRange(SMax->getOperand(0));
+    for (unsigned i = 1, e = SMax->getNumOperands(); i != e; ++i)
+      X = X.smax(getUnsignedRange(SMax->getOperand(i)));
+    return setUnsignedRange(SMax, ConservativeResult.intersectWith(X));
+  }
+
+  if (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(S)) {
+    ConstantRange X = getUnsignedRange(UMax->getOperand(0));
+    for (unsigned i = 1, e = UMax->getNumOperands(); i != e; ++i)
+      X = X.umax(getUnsignedRange(UMax->getOperand(i)));
+    return setUnsignedRange(UMax, ConservativeResult.intersectWith(X));
+  }
+
+  if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
+    ConstantRange X = getUnsignedRange(UDiv->getLHS());
+    ConstantRange Y = getUnsignedRange(UDiv->getRHS());
+    return setUnsignedRange(UDiv, ConservativeResult.intersectWith(X.udiv(Y)));
+  }
+
+  if (const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(S)) {
+    ConstantRange X = getUnsignedRange(ZExt->getOperand());
+    return setUnsignedRange(ZExt,
+      ConservativeResult.intersectWith(X.zeroExtend(BitWidth)));
+  }
+
+  if (const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(S)) {
+    ConstantRange X = getUnsignedRange(SExt->getOperand());
+    return setUnsignedRange(SExt,
+      ConservativeResult.intersectWith(X.signExtend(BitWidth)));
+  }
+
+  if (const SCEVTruncateExpr *Trunc = dyn_cast<SCEVTruncateExpr>(S)) {
+    ConstantRange X = getUnsignedRange(Trunc->getOperand());
+    return setUnsignedRange(Trunc,
+      ConservativeResult.intersectWith(X.truncate(BitWidth)));
+  }
+
+  if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S)) {
+    // If there's no unsigned wrap, the value will never be less than its
+    // initial value.
+    if (AddRec->getNoWrapFlags(SCEV::FlagNUW))
+      if (const SCEVConstant *C = dyn_cast<SCEVConstant>(AddRec->getStart()))
+        if (!C->getValue()->isZero())
+          ConservativeResult =
+            ConservativeResult.intersectWith(
+              ConstantRange(C->getValue()->getValue(), APInt(BitWidth, 0)));
+
+    // TODO: non-affine addrec
+    if (AddRec->isAffine()) {
+      Type *Ty = AddRec->getType();
+      const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop());
+      if (!isa<SCEVCouldNotCompute>(MaxBECount) &&
+          getTypeSizeInBits(MaxBECount->getType()) <= BitWidth) {
+        MaxBECount = getNoopOrZeroExtend(MaxBECount, Ty);
+
+        const SCEV *Start = AddRec->getStart();
+        const SCEV *Step = AddRec->getStepRecurrence(*this);
+
+        ConstantRange StartRange = getUnsignedRange(Start);
+        ConstantRange StepRange = getSignedRange(Step);
+        ConstantRange MaxBECountRange = getUnsignedRange(MaxBECount);
+        ConstantRange EndRange =
+          StartRange.add(MaxBECountRange.multiply(StepRange));
+
+        // Check for overflow. This must be done with ConstantRange arithmetic
+        // because we could be called from within the ScalarEvolution overflow
+        // checking code.
+        ConstantRange ExtStartRange = StartRange.zextOrTrunc(BitWidth*2+1);
+        ConstantRange ExtStepRange = StepRange.sextOrTrunc(BitWidth*2+1);
+        ConstantRange ExtMaxBECountRange =
+          MaxBECountRange.zextOrTrunc(BitWidth*2+1);
+        ConstantRange ExtEndRange = EndRange.zextOrTrunc(BitWidth*2+1);
+        if (ExtStartRange.add(ExtMaxBECountRange.multiply(ExtStepRange)) !=
+            ExtEndRange)
+          return setUnsignedRange(AddRec, ConservativeResult);
+
+        APInt Min = APIntOps::umin(StartRange.getUnsignedMin(),
+                                   EndRange.getUnsignedMin());
+        APInt Max = APIntOps::umax(StartRange.getUnsignedMax(),
+                                   EndRange.getUnsignedMax());
+        if (Min.isMinValue() && Max.isMaxValue())
+          return setUnsignedRange(AddRec, ConservativeResult);
+        return setUnsignedRange(AddRec,
+          ConservativeResult.intersectWith(ConstantRange(Min, Max+1)));
+      }
+    }
+
+    return setUnsignedRange(AddRec, ConservativeResult);
+  }
+
+  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    // For a SCEVUnknown, ask ValueTracking.
+    APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
+    ComputeMaskedBits(U->getValue(), Zeros, Ones, TD);
+    if (Ones == ~Zeros + 1)
+      return setUnsignedRange(U, ConservativeResult);
+    return setUnsignedRange(U,
+      ConservativeResult.intersectWith(ConstantRange(Ones, ~Zeros + 1)));
+  }
+
+  return setUnsignedRange(S, ConservativeResult);
+}
+
+/// getSignedRange - Determine the signed range for a particular SCEV.
+///
+ConstantRange
+ScalarEvolution::getSignedRange(const SCEV *S) {
+  // See if we've computed this range already.
+  DenseMap<const SCEV *, ConstantRange>::iterator I = SignedRanges.find(S);
+  if (I != SignedRanges.end())
+    return I->second;
+
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
+    return setSignedRange(C, ConstantRange(C->getValue()->getValue()));
+
+  unsigned BitWidth = getTypeSizeInBits(S->getType());
+  ConstantRange ConservativeResult(BitWidth, /*isFullSet=*/true);
+
+  // If the value has known zeros, the maximum signed value will have those
+  // known zeros as well.
+  uint32_t TZ = GetMinTrailingZeros(S);
+  if (TZ != 0)
+    ConservativeResult =
+      ConstantRange(APInt::getSignedMinValue(BitWidth),
+                    APInt::getSignedMaxValue(BitWidth).ashr(TZ).shl(TZ) + 1);
+
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    ConstantRange X = getSignedRange(Add->getOperand(0));
+    for (unsigned i = 1, e = Add->getNumOperands(); i != e; ++i)
+      X = X.add(getSignedRange(Add->getOperand(i)));
+    return setSignedRange(Add, ConservativeResult.intersectWith(X));
+  }
+
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+    ConstantRange X = getSignedRange(Mul->getOperand(0));
+    for (unsigned i = 1, e = Mul->getNumOperands(); i != e; ++i)
+      X = X.multiply(getSignedRange(Mul->getOperand(i)));
+    return setSignedRange(Mul, ConservativeResult.intersectWith(X));
+  }
+
+  if (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(S)) {
+    ConstantRange X = getSignedRange(SMax->getOperand(0));
+    for (unsigned i = 1, e = SMax->getNumOperands(); i != e; ++i)
+      X = X.smax(getSignedRange(SMax->getOperand(i)));
+    return setSignedRange(SMax, ConservativeResult.intersectWith(X));
+  }
+
+  if (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(S)) {
+    ConstantRange X = getSignedRange(UMax->getOperand(0));
+    for (unsigned i = 1, e = UMax->getNumOperands(); i != e; ++i)
+      X = X.umax(getSignedRange(UMax->getOperand(i)));
+    return setSignedRange(UMax, ConservativeResult.intersectWith(X));
+  }
+
+  if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
+    ConstantRange X = getSignedRange(UDiv->getLHS());
+    ConstantRange Y = getSignedRange(UDiv->getRHS());
+    return setSignedRange(UDiv, ConservativeResult.intersectWith(X.udiv(Y)));
+  }
+
+  if (const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(S)) {
+    ConstantRange X = getSignedRange(ZExt->getOperand());
+    return setSignedRange(ZExt,
+      ConservativeResult.intersectWith(X.zeroExtend(BitWidth)));
+  }
+
+  if (const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(S)) {
+    ConstantRange X = getSignedRange(SExt->getOperand());
+    return setSignedRange(SExt,
+      ConservativeResult.intersectWith(X.signExtend(BitWidth)));
+  }
+
+  if (const SCEVTruncateExpr *Trunc = dyn_cast<SCEVTruncateExpr>(S)) {
+    ConstantRange X = getSignedRange(Trunc->getOperand());
+    return setSignedRange(Trunc,
+      ConservativeResult.intersectWith(X.truncate(BitWidth)));
+  }
+
+  if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S)) {
+    // If there's no signed wrap, and all the operands have the same sign or
+    // zero, the value won't ever change sign.
+    if (AddRec->getNoWrapFlags(SCEV::FlagNSW)) {
+      bool AllNonNeg = true;
+      bool AllNonPos = true;
+      for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) {
+        if (!isKnownNonNegative(AddRec->getOperand(i))) AllNonNeg = false;
+        if (!isKnownNonPositive(AddRec->getOperand(i))) AllNonPos = false;
+      }
+      if (AllNonNeg)
+        ConservativeResult = ConservativeResult.intersectWith(
+          ConstantRange(APInt(BitWidth, 0),
+                        APInt::getSignedMinValue(BitWidth)));
+      else if (AllNonPos)
+        ConservativeResult = ConservativeResult.intersectWith(
+          ConstantRange(APInt::getSignedMinValue(BitWidth),
+                        APInt(BitWidth, 1)));
+    }
+
+    // TODO: non-affine addrec
+    if (AddRec->isAffine()) {
+      Type *Ty = AddRec->getType();
+      const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop());
+      if (!isa<SCEVCouldNotCompute>(MaxBECount) &&
+          getTypeSizeInBits(MaxBECount->getType()) <= BitWidth) {
+        MaxBECount = getNoopOrZeroExtend(MaxBECount, Ty);
+
+        const SCEV *Start = AddRec->getStart();
+        const SCEV *Step = AddRec->getStepRecurrence(*this);
+
+        ConstantRange StartRange = getSignedRange(Start);
+        ConstantRange StepRange = getSignedRange(Step);
+        ConstantRange MaxBECountRange = getUnsignedRange(MaxBECount);
+        ConstantRange EndRange =
+          StartRange.add(MaxBECountRange.multiply(StepRange));
+
+        // Check for overflow. This must be done with ConstantRange arithmetic
+        // because we could be called from within the ScalarEvolution overflow
+        // checking code.
+        ConstantRange ExtStartRange = StartRange.sextOrTrunc(BitWidth*2+1);
+        ConstantRange ExtStepRange = StepRange.sextOrTrunc(BitWidth*2+1);
+        ConstantRange ExtMaxBECountRange =
+          MaxBECountRange.zextOrTrunc(BitWidth*2+1);
+        ConstantRange ExtEndRange = EndRange.sextOrTrunc(BitWidth*2+1);
+        if (ExtStartRange.add(ExtMaxBECountRange.multiply(ExtStepRange)) !=
+            ExtEndRange)
+          return setSignedRange(AddRec, ConservativeResult);
+
+        APInt Min = APIntOps::smin(StartRange.getSignedMin(),
+                                   EndRange.getSignedMin());
+        APInt Max = APIntOps::smax(StartRange.getSignedMax(),
+                                   EndRange.getSignedMax());
+        if (Min.isMinSignedValue() && Max.isMaxSignedValue())
+          return setSignedRange(AddRec, ConservativeResult);
+        return setSignedRange(AddRec,
+          ConservativeResult.intersectWith(ConstantRange(Min, Max+1)));
+      }
+    }
+
+    return setSignedRange(AddRec, ConservativeResult);
+  }
+
+  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    // For a SCEVUnknown, ask ValueTracking.
+    if (!U->getValue()->getType()->isIntegerTy() && !TD)
+      return setSignedRange(U, ConservativeResult);
+    unsigned NS = ComputeNumSignBits(U->getValue(), TD);
+    if (NS == 1)
+      return setSignedRange(U, ConservativeResult);
+    return setSignedRange(U, ConservativeResult.intersectWith(
+      ConstantRange(APInt::getSignedMinValue(BitWidth).ashr(NS - 1),
+                    APInt::getSignedMaxValue(BitWidth).ashr(NS - 1)+1)));
+  }
+
+  return setSignedRange(S, ConservativeResult);
+}
+
+/// createSCEV - We know that there is no SCEV for the specified value.
+/// Analyze the expression.
+///
+const SCEV *ScalarEvolution::createSCEV(Value *V) {
+  if (!isSCEVable(V->getType()))
+    return getUnknown(V);
+
+  unsigned Opcode = Instruction::UserOp1;
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    Opcode = I->getOpcode();
+
+    // Don't attempt to analyze instructions in blocks that aren't
+    // reachable. Such instructions don't matter, and they aren't required
+    // to obey basic rules for definitions dominating uses which this
+    // analysis depends on.
+    if (!DT->isReachableFromEntry(I->getParent()))
+      return getUnknown(V);
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    Opcode = CE->getOpcode();
+  else if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
+    return getConstant(CI);
+  else if (isa<ConstantPointerNull>(V))
+    return getConstant(V->getType(), 0);
+  else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
+    return GA->mayBeOverridden() ? getUnknown(V) : getSCEV(GA->getAliasee());
+  else
+    return getUnknown(V);
+
+  Operator *U = cast<Operator>(V);
+  switch (Opcode) {
+  case Instruction::Add: {
+    // The simple thing to do would be to just call getSCEV on both operands
+    // and call getAddExpr with the result. However if we're looking at a
+    // bunch of things all added together, this can be quite inefficient,
+    // because it leads to N-1 getAddExpr calls for N ultimate operands.
+    // Instead, gather up all the operands and make a single getAddExpr call.
+    // LLVM IR canonical form means we need only traverse the left operands.
+    //
+    // Don't apply this instruction's NSW or NUW flags to the new
+    // expression. The instruction may be guarded by control flow that the
+    // no-wrap behavior depends on. Non-control-equivalent instructions can be
+    // mapped to the same SCEV expression, and it would be incorrect to transfer
+    // NSW/NUW semantics to those operations.
+    SmallVector<const SCEV *, 4> AddOps;
+    AddOps.push_back(getSCEV(U->getOperand(1)));
+    for (Value *Op = U->getOperand(0); ; Op = U->getOperand(0)) {
+      unsigned Opcode = Op->getValueID() - Value::InstructionVal;
+      if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
+        break;
+      U = cast<Operator>(Op);
+      const SCEV *Op1 = getSCEV(U->getOperand(1));
+      if (Opcode == Instruction::Sub)
+        AddOps.push_back(getNegativeSCEV(Op1));
+      else
+        AddOps.push_back(Op1);
+    }
+    AddOps.push_back(getSCEV(U->getOperand(0)));
+    return getAddExpr(AddOps);
+  }
+  case Instruction::Mul: {
+    // Don't transfer NSW/NUW for the same reason as AddExpr.
+    SmallVector<const SCEV *, 4> MulOps;
+    MulOps.push_back(getSCEV(U->getOperand(1)));
+    for (Value *Op = U->getOperand(0);
+         Op->getValueID() == Instruction::Mul + Value::InstructionVal;
+         Op = U->getOperand(0)) {
+      U = cast<Operator>(Op);
+      MulOps.push_back(getSCEV(U->getOperand(1)));
+    }
+    MulOps.push_back(getSCEV(U->getOperand(0)));
+    return getMulExpr(MulOps);
+  }
+  case Instruction::UDiv:
+    return getUDivExpr(getSCEV(U->getOperand(0)),
+                       getSCEV(U->getOperand(1)));
+  case Instruction::Sub:
+    return getMinusSCEV(getSCEV(U->getOperand(0)),
+                        getSCEV(U->getOperand(1)));
+  case Instruction::And:
+    // For an expression like x&255 that merely masks off the high bits,
+    // use zext(trunc(x)) as the SCEV expression.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      if (CI->isNullValue())
+        return getSCEV(U->getOperand(1));
+      if (CI->isAllOnesValue())
+        return getSCEV(U->getOperand(0));
+      const APInt &A = CI->getValue();
+
+      // Instcombine's ShrinkDemandedConstant may strip bits out of
+      // constants, obscuring what would otherwise be a low-bits mask.
+      // Use ComputeMaskedBits to compute what ShrinkDemandedConstant
+      // knew about to reconstruct a low-bits mask value.
+      unsigned LZ = A.countLeadingZeros();
+      unsigned BitWidth = A.getBitWidth();
+      APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+      ComputeMaskedBits(U->getOperand(0), KnownZero, KnownOne, TD);
+
+      APInt EffectiveMask = APInt::getLowBitsSet(BitWidth, BitWidth - LZ);
+
+      if (LZ != 0 && !((~A & ~KnownZero) & EffectiveMask))
+        return
+          getZeroExtendExpr(getTruncateExpr(getSCEV(U->getOperand(0)),
+                                IntegerType::get(getContext(), BitWidth - LZ)),
+                            U->getType());
+    }
+    break;
+
+  case Instruction::Or:
+    // If the RHS of the Or is a constant, we may have something like:
+    // X*4+1 which got turned into X*4|1.  Handle this as an Add so loop
+    // optimizations will transparently handle this case.
+    //
+    // In order for this transformation to be safe, the LHS must be of the
+    // form X*(2^n) and the Or constant must be less than 2^n.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      const SCEV *LHS = getSCEV(U->getOperand(0));
+      const APInt &CIVal = CI->getValue();
+      if (GetMinTrailingZeros(LHS) >=
+          (CIVal.getBitWidth() - CIVal.countLeadingZeros())) {
+        // Build a plain add SCEV.
+        const SCEV *S = getAddExpr(LHS, getSCEV(CI));
+        // If the LHS of the add was an addrec and it has no-wrap flags,
+        // transfer the no-wrap flags, since an or won't introduce a wrap.
+        if (const SCEVAddRecExpr *NewAR = dyn_cast<SCEVAddRecExpr>(S)) {
+          const SCEVAddRecExpr *OldAR = cast<SCEVAddRecExpr>(LHS);
+          const_cast<SCEVAddRecExpr *>(NewAR)->setNoWrapFlags(
+            OldAR->getNoWrapFlags());
+        }
+        return S;
+      }
+    }
+    break;
+  case Instruction::Xor:
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      // If the RHS of the xor is a signbit, then this is just an add.
+      // Instcombine turns add of signbit into xor as a strength reduction step.
+      if (CI->getValue().isSignBit())
+        return getAddExpr(getSCEV(U->getOperand(0)),
+                          getSCEV(U->getOperand(1)));
+
+      // If the RHS of xor is -1, then this is a not operation.
+      if (CI->isAllOnesValue())
+        return getNotSCEV(getSCEV(U->getOperand(0)));
+
+      // Model xor(and(x, C), C) as and(~x, C), if C is a low-bits mask.
+      // This is a variant of the check for xor with -1, and it handles
+      // the case where instcombine has trimmed non-demanded bits out
+      // of an xor with -1.
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U->getOperand(0)))
+        if (ConstantInt *LCI = dyn_cast<ConstantInt>(BO->getOperand(1)))
+          if (BO->getOpcode() == Instruction::And &&
+              LCI->getValue() == CI->getValue())
+            if (const SCEVZeroExtendExpr *Z =
+                  dyn_cast<SCEVZeroExtendExpr>(getSCEV(U->getOperand(0)))) {
+              Type *UTy = U->getType();
+              const SCEV *Z0 = Z->getOperand();
+              Type *Z0Ty = Z0->getType();
+              unsigned Z0TySize = getTypeSizeInBits(Z0Ty);
+
+              // If C is a low-bits mask, the zero extend is serving to
+              // mask off the high bits. Complement the operand and
+              // re-apply the zext.
+              if (APIntOps::isMask(Z0TySize, CI->getValue()))
+                return getZeroExtendExpr(getNotSCEV(Z0), UTy);
+
+              // If C is a single bit, it may be in the sign-bit position
+              // before the zero-extend. In this case, represent the xor
+              // using an add, which is equivalent, and re-apply the zext.
+              APInt Trunc = CI->getValue().trunc(Z0TySize);
+              if (Trunc.zext(getTypeSizeInBits(UTy)) == CI->getValue() &&
+                  Trunc.isSignBit())
+                return getZeroExtendExpr(getAddExpr(Z0, getConstant(Trunc)),
+                                         UTy);
+            }
+    }
+    break;
+
+  case Instruction::Shl:
+    // Turn shift left of a constant amount into a multiply.
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      uint32_t BitWidth = cast<IntegerType>(U->getType())->getBitWidth();
+
+      // If the shift count is not less than the bitwidth, the result of
+      // the shift is undefined. Don't try to analyze it, because the
+      // resolution chosen here may differ from the resolution chosen in
+      // other parts of the compiler.
+      if (SA->getValue().uge(BitWidth))
+        break;
+
+      Constant *X = ConstantInt::get(getContext(),
+        APInt(BitWidth, 1).shl(SA->getZExtValue()));
+      return getMulExpr(getSCEV(U->getOperand(0)), getSCEV(X));
+    }
+    break;
+
+  case Instruction::LShr:
+    // Turn logical shift right of a constant into a unsigned divide.
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      uint32_t BitWidth = cast<IntegerType>(U->getType())->getBitWidth();
+
+      // If the shift count is not less than the bitwidth, the result of
+      // the shift is undefined. Don't try to analyze it, because the
+      // resolution chosen here may differ from the resolution chosen in
+      // other parts of the compiler.
+      if (SA->getValue().uge(BitWidth))
+        break;
+
+      Constant *X = ConstantInt::get(getContext(),
+        APInt(BitWidth, 1).shl(SA->getZExtValue()));
+      return getUDivExpr(getSCEV(U->getOperand(0)), getSCEV(X));
+    }
+    break;
+
+  case Instruction::AShr:
+    // For a two-shift sext-inreg, use sext(trunc(x)) as the SCEV expression.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1)))
+      if (Operator *L = dyn_cast<Operator>(U->getOperand(0)))
+        if (L->getOpcode() == Instruction::Shl &&
+            L->getOperand(1) == U->getOperand(1)) {
+          uint64_t BitWidth = getTypeSizeInBits(U->getType());
+
+          // If the shift count is not less than the bitwidth, the result of
+          // the shift is undefined. Don't try to analyze it, because the
+          // resolution chosen here may differ from the resolution chosen in
+          // other parts of the compiler.
+          if (CI->getValue().uge(BitWidth))
+            break;
+
+          uint64_t Amt = BitWidth - CI->getZExtValue();
+          if (Amt == BitWidth)
+            return getSCEV(L->getOperand(0));       // shift by zero --> noop
+          return
+            getSignExtendExpr(getTruncateExpr(getSCEV(L->getOperand(0)),
+                                              IntegerType::get(getContext(),
+                                                               Amt)),
+                              U->getType());
+        }
+    break;
+
+  case Instruction::Trunc:
+    return getTruncateExpr(getSCEV(U->getOperand(0)), U->getType());
+
+  case Instruction::ZExt:
+    return getZeroExtendExpr(getSCEV(U->getOperand(0)), U->getType());
+
+  case Instruction::SExt:
+    return getSignExtendExpr(getSCEV(U->getOperand(0)), U->getType());
+
+  case Instruction::BitCast:
+    // BitCasts are no-op casts so we just eliminate the cast.
+    if (isSCEVable(U->getType()) && isSCEVable(U->getOperand(0)->getType()))
+      return getSCEV(U->getOperand(0));
+    break;
+
+  // It's tempting to handle inttoptr and ptrtoint as no-ops, however this can
+  // lead to pointer expressions which cannot safely be expanded to GEPs,
+  // because ScalarEvolution doesn't respect the GEP aliasing rules when
+  // simplifying integer expressions.
+
+  case Instruction::GetElementPtr:
+    return createNodeForGEP(cast<GEPOperator>(U));
+
+  case Instruction::PHI:
+    return createNodeForPHI(cast<PHINode>(U));
+
+  case Instruction::Select:
+    // This could be a smax or umax that was lowered earlier.
+    // Try to recover it.
+    if (ICmpInst *ICI = dyn_cast<ICmpInst>(U->getOperand(0))) {
+      Value *LHS = ICI->getOperand(0);
+      Value *RHS = ICI->getOperand(1);
+      switch (ICI->getPredicate()) {
+      case ICmpInst::ICMP_SLT:
+      case ICmpInst::ICMP_SLE:
+        std::swap(LHS, RHS);
+        // fall through
+      case ICmpInst::ICMP_SGT:
+      case ICmpInst::ICMP_SGE:
+        // a >s b ? a+x : b+x  ->  smax(a, b)+x
+        // a >s b ? b+x : a+x  ->  smin(a, b)+x
+        if (LHS->getType() == U->getType()) {
+          const SCEV *LS = getSCEV(LHS);
+          const SCEV *RS = getSCEV(RHS);
+          const SCEV *LA = getSCEV(U->getOperand(1));
+          const SCEV *RA = getSCEV(U->getOperand(2));
+          const SCEV *LDiff = getMinusSCEV(LA, LS);
+          const SCEV *RDiff = getMinusSCEV(RA, RS);
+          if (LDiff == RDiff)
+            return getAddExpr(getSMaxExpr(LS, RS), LDiff);
+          LDiff = getMinusSCEV(LA, RS);
+          RDiff = getMinusSCEV(RA, LS);
+          if (LDiff == RDiff)
+            return getAddExpr(getSMinExpr(LS, RS), LDiff);
+        }
+        break;
+      case ICmpInst::ICMP_ULT:
+      case ICmpInst::ICMP_ULE:
+        std::swap(LHS, RHS);
+        // fall through
+      case ICmpInst::ICMP_UGT:
+      case ICmpInst::ICMP_UGE:
+        // a >u b ? a+x : b+x  ->  umax(a, b)+x
+        // a >u b ? b+x : a+x  ->  umin(a, b)+x
+        if (LHS->getType() == U->getType()) {
+          const SCEV *LS = getSCEV(LHS);
+          const SCEV *RS = getSCEV(RHS);
+          const SCEV *LA = getSCEV(U->getOperand(1));
+          const SCEV *RA = getSCEV(U->getOperand(2));
+          const SCEV *LDiff = getMinusSCEV(LA, LS);
+          const SCEV *RDiff = getMinusSCEV(RA, RS);
+          if (LDiff == RDiff)
+            return getAddExpr(getUMaxExpr(LS, RS), LDiff);
+          LDiff = getMinusSCEV(LA, RS);
+          RDiff = getMinusSCEV(RA, LS);
+          if (LDiff == RDiff)
+            return getAddExpr(getUMinExpr(LS, RS), LDiff);
+        }
+        break;
+      case ICmpInst::ICMP_NE:
+        // n != 0 ? n+x : 1+x  ->  umax(n, 1)+x
+        if (LHS->getType() == U->getType() &&
+            isa<ConstantInt>(RHS) &&
+            cast<ConstantInt>(RHS)->isZero()) {
+          const SCEV *One = getConstant(LHS->getType(), 1);
+          const SCEV *LS = getSCEV(LHS);
+          const SCEV *LA = getSCEV(U->getOperand(1));
+          const SCEV *RA = getSCEV(U->getOperand(2));
+          const SCEV *LDiff = getMinusSCEV(LA, LS);
+          const SCEV *RDiff = getMinusSCEV(RA, One);
+          if (LDiff == RDiff)
+            return getAddExpr(getUMaxExpr(One, LS), LDiff);
+        }
+        break;
+      case ICmpInst::ICMP_EQ:
+        // n == 0 ? 1+x : n+x  ->  umax(n, 1)+x
+        if (LHS->getType() == U->getType() &&
+            isa<ConstantInt>(RHS) &&
+            cast<ConstantInt>(RHS)->isZero()) {
+          const SCEV *One = getConstant(LHS->getType(), 1);
+          const SCEV *LS = getSCEV(LHS);
+          const SCEV *LA = getSCEV(U->getOperand(1));
+          const SCEV *RA = getSCEV(U->getOperand(2));
+          const SCEV *LDiff = getMinusSCEV(LA, One);
+          const SCEV *RDiff = getMinusSCEV(RA, LS);
+          if (LDiff == RDiff)
+            return getAddExpr(getUMaxExpr(One, LS), LDiff);
+        }
+        break;
+      default:
+        break;
+      }
+    }
+
+  default: // We cannot analyze this expression.
+    break;
+  }
+
+  return getUnknown(V);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//                   Iteration Count Computation Code
+//
+
+/// getSmallConstantTripCount - Returns the maximum trip count of this loop as a
+/// normal unsigned value. Returns 0 if the trip count is unknown or not
+/// constant. Will also return 0 if the maximum trip count is very large (>=
+/// 2^32).
+///
+/// This "trip count" assumes that control exits via ExitingBlock. More
+/// precisely, it is the number of times that control may reach ExitingBlock
+/// before taking the branch. For loops with multiple exits, it may not be the
+/// number times that the loop header executes because the loop may exit
+/// prematurely via another branch.
+unsigned ScalarEvolution::
+getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock) {
+  const SCEVConstant *ExitCount =
+    dyn_cast<SCEVConstant>(getExitCount(L, ExitingBlock));
+  if (!ExitCount)
+    return 0;
+
+  ConstantInt *ExitConst = ExitCount->getValue();
+
+  // Guard against huge trip counts.
+  if (ExitConst->getValue().getActiveBits() > 32)
+    return 0;
+
+  // In case of integer overflow, this returns 0, which is correct.
+  return ((unsigned)ExitConst->getZExtValue()) + 1;
+}
+
+/// getSmallConstantTripMultiple - Returns the largest constant divisor of the
+/// trip count of this loop as a normal unsigned value, if possible. This
+/// means that the actual trip count is always a multiple of the returned
+/// value (don't forget the trip count could very well be zero as well!).
+///
+/// Returns 1 if the trip count is unknown or not guaranteed to be the
+/// multiple of a constant (which is also the case if the trip count is simply
+/// constant, use getSmallConstantTripCount for that case), Will also return 1
+/// if the trip count is very large (>= 2^32).
+///
+/// As explained in the comments for getSmallConstantTripCount, this assumes
+/// that control exits the loop via ExitingBlock.
+unsigned ScalarEvolution::
+getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) {
+  const SCEV *ExitCount = getExitCount(L, ExitingBlock);
+  if (ExitCount == getCouldNotCompute())
+    return 1;
+
+  // Get the trip count from the BE count by adding 1.
+  const SCEV *TCMul = getAddExpr(ExitCount,
+                                 getConstant(ExitCount->getType(), 1));
+  // FIXME: SCEV distributes multiplication as V1*C1 + V2*C1. We could attempt
+  // to factor simple cases.
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(TCMul))
+    TCMul = Mul->getOperand(0);
+
+  const SCEVConstant *MulC = dyn_cast<SCEVConstant>(TCMul);
+  if (!MulC)
+    return 1;
+
+  ConstantInt *Result = MulC->getValue();
+
+  // Guard against huge trip counts (this requires checking
+  // for zero to handle the case where the trip count == -1 and the
+  // addition wraps).
+  if (!Result || Result->getValue().getActiveBits() > 32 ||
+      Result->getValue().getActiveBits() == 0)
+    return 1;
+
+  return (unsigned)Result->getZExtValue();
+}
+
+// getExitCount - Get the expression for the number of loop iterations for which
+// this loop is guaranteed not to exit via ExitintBlock. Otherwise return
+// SCEVCouldNotCompute.
+const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) {
+  return getBackedgeTakenInfo(L).getExact(ExitingBlock, this);
+}
+
+/// getBackedgeTakenCount - If the specified loop has a predictable
+/// backedge-taken count, return it, otherwise return a SCEVCouldNotCompute
+/// object. The backedge-taken count is the number of times the loop header
+/// will be branched to from within the loop. This is one less than the
+/// trip count of the loop, since it doesn't count the first iteration,
+/// when the header is branched to from outside the loop.
+///
+/// Note that it is not valid to call this method on a loop without a
+/// loop-invariant backedge-taken count (see
+/// hasLoopInvariantBackedgeTakenCount).
+///
+const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L) {
+  return getBackedgeTakenInfo(L).getExact(this);
+}
+
+/// getMaxBackedgeTakenCount - Similar to getBackedgeTakenCount, except
+/// return the least SCEV value that is known never to be less than the
+/// actual backedge taken count.
+const SCEV *ScalarEvolution::getMaxBackedgeTakenCount(const Loop *L) {
+  return getBackedgeTakenInfo(L).getMax(this);
+}
+
+/// PushLoopPHIs - Push PHI nodes in the header of the given loop
+/// onto the given Worklist.
+static void
+PushLoopPHIs(const Loop *L, SmallVectorImpl<Instruction *> &Worklist) {
+  BasicBlock *Header = L->getHeader();
+
+  // Push all Loop-header PHIs onto the Worklist stack.
+  for (BasicBlock::iterator I = Header->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    Worklist.push_back(PN);
+}
+
+const ScalarEvolution::BackedgeTakenInfo &
+ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
+  // Initially insert an invalid entry for this loop. If the insertion
+  // succeeds, proceed to actually compute a backedge-taken count and
+  // update the value. The temporary CouldNotCompute value tells SCEV
+  // code elsewhere that it shouldn't attempt to request a new
+  // backedge-taken count, which could result in infinite recursion.
+  std::pair<DenseMap<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
+    BackedgeTakenCounts.insert(std::make_pair(L, BackedgeTakenInfo()));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  // ComputeBackedgeTakenCount may allocate memory for its result. Inserting it
+  // into the BackedgeTakenCounts map transfers ownership. Otherwise, the result
+  // must be cleared in this scope.
+  BackedgeTakenInfo Result = ComputeBackedgeTakenCount(L);
+
+  if (Result.getExact(this) != getCouldNotCompute()) {
+    assert(isLoopInvariant(Result.getExact(this), L) &&
+           isLoopInvariant(Result.getMax(this), L) &&
+           "Computed backedge-taken count isn't loop invariant for loop!");
+    ++NumTripCountsComputed;
+  }
+  else if (Result.getMax(this) == getCouldNotCompute() &&
+           isa<PHINode>(L->getHeader()->begin())) {
+    // Only count loops that have phi nodes as not being computable.
+    ++NumTripCountsNotComputed;
+  }
+
+  // Now that we know more about the trip count for this loop, forget any
+  // existing SCEV values for PHI nodes in this loop since they are only
+  // conservative estimates made without the benefit of trip count
+  // information. This is similar to the code in forgetLoop, except that
+  // it handles SCEVUnknown PHI nodes specially.
+  if (Result.hasAnyInfo()) {
+    SmallVector<Instruction *, 16> Worklist;
+    PushLoopPHIs(L, Worklist);
+
+    SmallPtrSet<Instruction *, 8> Visited;
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
+      if (!Visited.insert(I)) continue;
+
+      ValueExprMapType::iterator It =
+        ValueExprMap.find_as(static_cast<Value *>(I));
+      if (It != ValueExprMap.end()) {
+        const SCEV *Old = It->second;
+
+        // SCEVUnknown for a PHI either means that it has an unrecognized
+        // structure, or it's a PHI that's in the progress of being computed
+        // by createNodeForPHI.  In the former case, additional loop trip
+        // count information isn't going to change anything. In the later
+        // case, createNodeForPHI will perform the necessary updates on its
+        // own when it gets to that point.
+        if (!isa<PHINode>(I) || !isa<SCEVUnknown>(Old)) {
+          forgetMemoizedResults(Old);
+          ValueExprMap.erase(It);
+        }
+        if (PHINode *PN = dyn_cast<PHINode>(I))
+          ConstantEvolutionLoopExitValue.erase(PN);
+      }
+
+      PushDefUseChildren(I, Worklist);
+    }
+  }
+
+  // Re-lookup the insert position, since the call to
+  // ComputeBackedgeTakenCount above could result in a
+  // recusive call to getBackedgeTakenInfo (on a different
+  // loop), which would invalidate the iterator computed
+  // earlier.
+  return BackedgeTakenCounts.find(L)->second = Result;
+}
+
+/// forgetLoop - This method should be called by the client when it has
+/// changed a loop in a way that may effect ScalarEvolution's ability to
+/// compute a trip count, or if the loop is deleted.
+void ScalarEvolution::forgetLoop(const Loop *L) {
+  // Drop any stored trip count value.
+  DenseMap<const Loop*, BackedgeTakenInfo>::iterator BTCPos =
+    BackedgeTakenCounts.find(L);
+  if (BTCPos != BackedgeTakenCounts.end()) {
+    BTCPos->second.clear();
+    BackedgeTakenCounts.erase(BTCPos);
+  }
+
+  // Drop information about expressions based on loop-header PHIs.
+  SmallVector<Instruction *, 16> Worklist;
+  PushLoopPHIs(L, Worklist);
+
+  SmallPtrSet<Instruction *, 8> Visited;
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+    if (!Visited.insert(I)) continue;
+
+    ValueExprMapType::iterator It =
+      ValueExprMap.find_as(static_cast<Value *>(I));
+    if (It != ValueExprMap.end()) {
+      forgetMemoizedResults(It->second);
+      ValueExprMap.erase(It);
+      if (PHINode *PN = dyn_cast<PHINode>(I))
+        ConstantEvolutionLoopExitValue.erase(PN);
+    }
+
+    PushDefUseChildren(I, Worklist);
+  }
+
+  // Forget all contained loops too, to avoid dangling entries in the
+  // ValuesAtScopes map.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    forgetLoop(*I);
+}
+
+/// forgetValue - This method should be called by the client when it has
+/// changed a value in a way that may effect its value, or which may
+/// disconnect it from a def-use chain linking it to a loop.
+void ScalarEvolution::forgetValue(Value *V) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return;
+
+  // Drop information about expressions based on loop-header PHIs.
+  SmallVector<Instruction *, 16> Worklist;
+  Worklist.push_back(I);
+
+  SmallPtrSet<Instruction *, 8> Visited;
+  while (!Worklist.empty()) {
+    I = Worklist.pop_back_val();
+    if (!Visited.insert(I)) continue;
+
+    ValueExprMapType::iterator It =
+      ValueExprMap.find_as(static_cast<Value *>(I));
+    if (It != ValueExprMap.end()) {
+      forgetMemoizedResults(It->second);
+      ValueExprMap.erase(It);
+      if (PHINode *PN = dyn_cast<PHINode>(I))
+        ConstantEvolutionLoopExitValue.erase(PN);
+    }
+
+    PushDefUseChildren(I, Worklist);
+  }
+}
+
+/// getExact - Get the exact loop backedge taken count considering all loop
+/// exits. A computable result can only be return for loops with a single exit.
+/// Returning the minimum taken count among all exits is incorrect because one
+/// of the loop's exit limit's may have been skipped. HowFarToZero assumes that
+/// the limit of each loop test is never skipped. This is a valid assumption as
+/// long as the loop exits via that test. For precise results, it is the
+/// caller's responsibility to specify the relevant loop exit using
+/// getExact(ExitingBlock, SE).
+const SCEV *
+ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE) const {
+  // If any exits were not computable, the loop is not computable.
+  if (!ExitNotTaken.isCompleteList()) return SE->getCouldNotCompute();
+
+  // We need exactly one computable exit.
+  if (!ExitNotTaken.ExitingBlock) return SE->getCouldNotCompute();
+  assert(ExitNotTaken.ExactNotTaken && "uninitialized not-taken info");
+
+  const SCEV *BECount = 0;
+  for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
+       ENT != 0; ENT = ENT->getNextExit()) {
+
+    assert(ENT->ExactNotTaken != SE->getCouldNotCompute() && "bad exit SCEV");
+
+    if (!BECount)
+      BECount = ENT->ExactNotTaken;
+    else if (BECount != ENT->ExactNotTaken)
+      return SE->getCouldNotCompute();
+  }
+  assert(BECount && "Invalid not taken count for loop exit");
+  return BECount;
+}
+
+/// getExact - Get the exact not taken count for this loop exit.
+const SCEV *
+ScalarEvolution::BackedgeTakenInfo::getExact(BasicBlock *ExitingBlock,
+                                             ScalarEvolution *SE) const {
+  for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
+       ENT != 0; ENT = ENT->getNextExit()) {
+
+    if (ENT->ExitingBlock == ExitingBlock)
+      return ENT->ExactNotTaken;
+  }
+  return SE->getCouldNotCompute();
+}
+
+/// getMax - Get the max backedge taken count for the loop.
+const SCEV *
+ScalarEvolution::BackedgeTakenInfo::getMax(ScalarEvolution *SE) const {
+  return Max ? Max : SE->getCouldNotCompute();
+}
+
+bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S,
+                                                    ScalarEvolution *SE) const {
+  if (Max && Max != SE->getCouldNotCompute() && SE->hasOperand(Max, S))
+    return true;
+
+  if (!ExitNotTaken.ExitingBlock)
+    return false;
+
+  for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
+       ENT != 0; ENT = ENT->getNextExit()) {
+
+    if (ENT->ExactNotTaken != SE->getCouldNotCompute()
+        && SE->hasOperand(ENT->ExactNotTaken, S)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each
+/// computable exit into a persistent ExitNotTakenInfo array.
+ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
+  SmallVectorImpl< std::pair<BasicBlock *, const SCEV *> > &ExitCounts,
+  bool Complete, const SCEV *MaxCount) : Max(MaxCount) {
+
+  if (!Complete)
+    ExitNotTaken.setIncomplete();
+
+  unsigned NumExits = ExitCounts.size();
+  if (NumExits == 0) return;
+
+  ExitNotTaken.ExitingBlock = ExitCounts[0].first;
+  ExitNotTaken.ExactNotTaken = ExitCounts[0].second;
+  if (NumExits == 1) return;
+
+  // Handle the rare case of multiple computable exits.
+  ExitNotTakenInfo *ENT = new ExitNotTakenInfo[NumExits-1];
+
+  ExitNotTakenInfo *PrevENT = &ExitNotTaken;
+  for (unsigned i = 1; i < NumExits; ++i, PrevENT = ENT, ++ENT) {
+    PrevENT->setNextExit(ENT);
+    ENT->ExitingBlock = ExitCounts[i].first;
+    ENT->ExactNotTaken = ExitCounts[i].second;
+  }
+}
+
+/// clear - Invalidate this result and free the ExitNotTakenInfo array.
+void ScalarEvolution::BackedgeTakenInfo::clear() {
+  ExitNotTaken.ExitingBlock = 0;
+  ExitNotTaken.ExactNotTaken = 0;
+  delete[] ExitNotTaken.getNextExit();
+}
+
+/// ComputeBackedgeTakenCount - Compute the number of times the backedge
+/// of the specified loop will execute.
+ScalarEvolution::BackedgeTakenInfo
+ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) {
+  SmallVector<BasicBlock *, 8> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  // Examine all exits and pick the most conservative values.
+  const SCEV *MaxBECount = getCouldNotCompute();
+  bool CouldComputeBECount = true;
+  SmallVector<std::pair<BasicBlock *, const SCEV *>, 4> ExitCounts;
+  for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+    ExitLimit EL = ComputeExitLimit(L, ExitingBlocks[i]);
+    if (EL.Exact == getCouldNotCompute())
+      // We couldn't compute an exact value for this exit, so
+      // we won't be able to compute an exact value for the loop.
+      CouldComputeBECount = false;
+    else
+      ExitCounts.push_back(std::make_pair(ExitingBlocks[i], EL.Exact));
+
+    if (MaxBECount == getCouldNotCompute())
+      MaxBECount = EL.Max;
+    else if (EL.Max != getCouldNotCompute()) {
+      // We cannot take the "min" MaxBECount, because non-unit stride loops may
+      // skip some loop tests. Taking the max over the exits is sufficiently
+      // conservative.  TODO: We could do better taking into consideration
+      // that (1) the loop has unit stride (2) the last loop test is
+      // less-than/greater-than (3) any loop test is less-than/greater-than AND
+      // falls-through some constant times less then the other tests.
+      MaxBECount = getUMaxFromMismatchedTypes(MaxBECount, EL.Max);
+    }
+  }
+
+  return BackedgeTakenInfo(ExitCounts, CouldComputeBECount, MaxBECount);
+}
+
+/// ComputeExitLimit - Compute the number of times the backedge of the specified
+/// loop will execute if it exits via the specified block.
+ScalarEvolution::ExitLimit
+ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) {
+
+  // Okay, we've chosen an exiting block.  See what condition causes us to
+  // exit at this block.
+  //
+  // FIXME: we should be able to handle switch instructions (with a single exit)
+  BranchInst *ExitBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+  if (ExitBr == 0) return getCouldNotCompute();
+  assert(ExitBr->isConditional() && "If unconditional, it can't be in loop!");
+
+  // At this point, we know we have a conditional branch that determines whether
+  // the loop is exited.  However, we don't know if the branch is executed each
+  // time through the loop.  If not, then the execution count of the branch will
+  // not be equal to the trip count of the loop.
+  //
+  // Currently we check for this by checking to see if the Exit branch goes to
+  // the loop header.  If so, we know it will always execute the same number of
+  // times as the loop.  We also handle the case where the exit block *is* the
+  // loop header.  This is common for un-rotated loops.
+  //
+  // If both of those tests fail, walk up the unique predecessor chain to the
+  // header, stopping if there is an edge that doesn't exit the loop. If the
+  // header is reached, the execution count of the branch will be equal to the
+  // trip count of the loop.
+  //
+  //  More extensive analysis could be done to handle more cases here.
+  //
+  if (ExitBr->getSuccessor(0) != L->getHeader() &&
+      ExitBr->getSuccessor(1) != L->getHeader() &&
+      ExitBr->getParent() != L->getHeader()) {
+    // The simple checks failed, try climbing the unique predecessor chain
+    // up to the header.
+    bool Ok = false;
+    for (BasicBlock *BB = ExitBr->getParent(); BB; ) {
+      BasicBlock *Pred = BB->getUniquePredecessor();
+      if (!Pred)
+        return getCouldNotCompute();
+      TerminatorInst *PredTerm = Pred->getTerminator();
+      for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) {
+        BasicBlock *PredSucc = PredTerm->getSuccessor(i);
+        if (PredSucc == BB)
+          continue;
+        // If the predecessor has a successor that isn't BB and isn't
+        // outside the loop, assume the worst.
+        if (L->contains(PredSucc))
+          return getCouldNotCompute();
+      }
+      if (Pred == L->getHeader()) {
+        Ok = true;
+        break;
+      }
+      BB = Pred;
+    }
+    if (!Ok)
+      return getCouldNotCompute();
+  }
+
+  // Proceed to the next level to examine the exit condition expression.
+  return ComputeExitLimitFromCond(L, ExitBr->getCondition(),
+                                  ExitBr->getSuccessor(0),
+                                  ExitBr->getSuccessor(1));
+}
+
+/// ComputeExitLimitFromCond - Compute the number of times the
+/// backedge of the specified loop will execute if its exit condition
+/// were a conditional branch of ExitCond, TBB, and FBB.
+ScalarEvolution::ExitLimit
+ScalarEvolution::ComputeExitLimitFromCond(const Loop *L,
+                                          Value *ExitCond,
+                                          BasicBlock *TBB,
+                                          BasicBlock *FBB) {
+  // Check if the controlling expression for this loop is an And or Or.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) {
+    if (BO->getOpcode() == Instruction::And) {
+      // Recurse on the operands of the and.
+      ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB);
+      ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB);
+      const SCEV *BECount = getCouldNotCompute();
+      const SCEV *MaxBECount = getCouldNotCompute();
+      if (L->contains(TBB)) {
+        // Both conditions must be true for the loop to continue executing.
+        // Choose the less conservative count.
+        if (EL0.Exact == getCouldNotCompute() ||
+            EL1.Exact == getCouldNotCompute())
+          BECount = getCouldNotCompute();
+        else
+          BECount = getUMinFromMismatchedTypes(EL0.Exact, EL1.Exact);
+        if (EL0.Max == getCouldNotCompute())
+          MaxBECount = EL1.Max;
+        else if (EL1.Max == getCouldNotCompute())
+          MaxBECount = EL0.Max;
+        else
+          MaxBECount = getUMinFromMismatchedTypes(EL0.Max, EL1.Max);
+      } else {
+        // Both conditions must be true at the same time for the loop to exit.
+        // For now, be conservative.
+        assert(L->contains(FBB) && "Loop block has no successor in loop!");
+        if (EL0.Max == EL1.Max)
+          MaxBECount = EL0.Max;
+        if (EL0.Exact == EL1.Exact)
+          BECount = EL0.Exact;
+      }
+
+      return ExitLimit(BECount, MaxBECount);
+    }
+    if (BO->getOpcode() == Instruction::Or) {
+      // Recurse on the operands of the or.
+      ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB);
+      ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB);
+      const SCEV *BECount = getCouldNotCompute();
+      const SCEV *MaxBECount = getCouldNotCompute();
+      if (L->contains(FBB)) {
+        // Both conditions must be false for the loop to continue executing.
+        // Choose the less conservative count.
+        if (EL0.Exact == getCouldNotCompute() ||
+            EL1.Exact == getCouldNotCompute())
+          BECount = getCouldNotCompute();
+        else
+          BECount = getUMinFromMismatchedTypes(EL0.Exact, EL1.Exact);
+        if (EL0.Max == getCouldNotCompute())
+          MaxBECount = EL1.Max;
+        else if (EL1.Max == getCouldNotCompute())
+          MaxBECount = EL0.Max;
+        else
+          MaxBECount = getUMinFromMismatchedTypes(EL0.Max, EL1.Max);
+      } else {
+        // Both conditions must be false at the same time for the loop to exit.
+        // For now, be conservative.
+        assert(L->contains(TBB) && "Loop block has no successor in loop!");
+        if (EL0.Max == EL1.Max)
+          MaxBECount = EL0.Max;
+        if (EL0.Exact == EL1.Exact)
+          BECount = EL0.Exact;
+      }
+
+      return ExitLimit(BECount, MaxBECount);
+    }
+  }
+
+  // With an icmp, it may be feasible to compute an exact backedge-taken count.
+  // Proceed to the next level to examine the icmp.
+  if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond))
+    return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB);
+
+  // Check for a constant condition. These are normally stripped out by
+  // SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to
+  // preserve the CFG and is temporarily leaving constant conditions
+  // in place.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(ExitCond)) {
+    if (L->contains(FBB) == !CI->getZExtValue())
+      // The backedge is always taken.
+      return getCouldNotCompute();
+    else
+      // The backedge is never taken.
+      return getConstant(CI->getType(), 0);
+  }
+
+  // If it's not an integer or pointer comparison then compute it the hard way.
+  return ComputeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
+}
+
+/// ComputeExitLimitFromICmp - Compute the number of times the
+/// backedge of the specified loop will execute if its exit condition
+/// were a conditional branch of the ICmpInst ExitCond, TBB, and FBB.
+ScalarEvolution::ExitLimit
+ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L,
+                                          ICmpInst *ExitCond,
+                                          BasicBlock *TBB,
+                                          BasicBlock *FBB) {
+
+  // If the condition was exit on true, convert the condition to exit on false
+  ICmpInst::Predicate Cond;
+  if (!L->contains(FBB))
+    Cond = ExitCond->getPredicate();
+  else
+    Cond = ExitCond->getInversePredicate();
+
+  // Handle common loops like: for (X = "string"; *X; ++X)
+  if (LoadInst *LI = dyn_cast<LoadInst>(ExitCond->getOperand(0)))
+    if (Constant *RHS = dyn_cast<Constant>(ExitCond->getOperand(1))) {
+      ExitLimit ItCnt =
+        ComputeLoadConstantCompareExitLimit(LI, RHS, L, Cond);
+      if (ItCnt.hasAnyInfo())
+        return ItCnt;
+    }
+
+  const SCEV *LHS = getSCEV(ExitCond->getOperand(0));
+  const SCEV *RHS = getSCEV(ExitCond->getOperand(1));
+
+  // Try to evaluate any dependencies out of the loop.
+  LHS = getSCEVAtScope(LHS, L);
+  RHS = getSCEVAtScope(RHS, L);
+
+  // At this point, we would like to compute how many iterations of the
+  // loop the predicate will return true for these inputs.
+  if (isLoopInvariant(LHS, L) && !isLoopInvariant(RHS, L)) {
+    // If there is a loop-invariant, force it into the RHS.
+    std::swap(LHS, RHS);
+    Cond = ICmpInst::getSwappedPredicate(Cond);
+  }
+
+  // Simplify the operands before analyzing them.
+  (void)SimplifyICmpOperands(Cond, LHS, RHS);
+
+  // If we have a comparison of a chrec against a constant, try to use value
+  // ranges to answer this query.
+  if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS))
+    if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(LHS))
+      if (AddRec->getLoop() == L) {
+        // Form the constant range.
+        ConstantRange CompRange(
+            ICmpInst::makeConstantRange(Cond, RHSC->getValue()->getValue()));
+
+        const SCEV *Ret = AddRec->getNumIterationsInRange(CompRange, *this);
+        if (!isa<SCEVCouldNotCompute>(Ret)) return Ret;
+      }
+
+  switch (Cond) {
+  case ICmpInst::ICMP_NE: {                     // while (X != Y)
+    // Convert to: while (X-Y != 0)
+    ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L);
+    if (EL.hasAnyInfo()) return EL;
+    break;
+  }
+  case ICmpInst::ICMP_EQ: {                     // while (X == Y)
+    // Convert to: while (X-Y == 0)
+    ExitLimit EL = HowFarToNonZero(getMinusSCEV(LHS, RHS), L);
+    if (EL.hasAnyInfo()) return EL;
+    break;
+  }
+  case ICmpInst::ICMP_SLT: {
+    ExitLimit EL = HowManyLessThans(LHS, RHS, L, true);
+    if (EL.hasAnyInfo()) return EL;
+    break;
+  }
+  case ICmpInst::ICMP_SGT: {
+    ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
+                                             getNotSCEV(RHS), L, true);
+    if (EL.hasAnyInfo()) return EL;
+    break;
+  }
+  case ICmpInst::ICMP_ULT: {
+    ExitLimit EL = HowManyLessThans(LHS, RHS, L, false);
+    if (EL.hasAnyInfo()) return EL;
+    break;
+  }
+  case ICmpInst::ICMP_UGT: {
+    ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
+                                             getNotSCEV(RHS), L, false);
+    if (EL.hasAnyInfo()) return EL;
+    break;
+  }
+  default:
+#if 0
+    dbgs() << "ComputeBackedgeTakenCount ";
+    if (ExitCond->getOperand(0)->getType()->isUnsigned())
+      dbgs() << "[unsigned] ";
+    dbgs() << *LHS << "   "
+         << Instruction::getOpcodeName(Instruction::ICmp)
+         << "   " << *RHS << "\n";
+#endif
+    break;
+  }
+  return ComputeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
+}
+
+static ConstantInt *
+EvaluateConstantChrecAtConstant(const SCEVAddRecExpr *AddRec, ConstantInt *C,
+                                ScalarEvolution &SE) {
+  const SCEV *InVal = SE.getConstant(C);
+  const SCEV *Val = AddRec->evaluateAtIteration(InVal, SE);
+  assert(isa<SCEVConstant>(Val) &&
+         "Evaluation of SCEV at constant didn't fold correctly?");
+  return cast<SCEVConstant>(Val)->getValue();
+}
+
+/// ComputeLoadConstantCompareExitLimit - Given an exit condition of
+/// 'icmp op load X, cst', try to see if we can compute the backedge
+/// execution count.
+ScalarEvolution::ExitLimit
+ScalarEvolution::ComputeLoadConstantCompareExitLimit(
+  LoadInst *LI,
+  Constant *RHS,
+  const Loop *L,
+  ICmpInst::Predicate predicate) {
+
+  if (LI->isVolatile()) return getCouldNotCompute();
+
+  // Check to see if the loaded pointer is a getelementptr of a global.
+  // TODO: Use SCEV instead of manually grubbing with GEPs.
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0));
+  if (!GEP) return getCouldNotCompute();
+
+  // Make sure that it is really a constant global we are gepping, with an
+  // initializer, and make sure the first IDX is really 0.
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
+  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+      GEP->getNumOperands() < 3 || !isa<Constant>(GEP->getOperand(1)) ||
+      !cast<Constant>(GEP->getOperand(1))->isNullValue())
+    return getCouldNotCompute();
+
+  // Okay, we allow one non-constant index into the GEP instruction.
+  Value *VarIdx = 0;
+  std::vector<Constant*> Indexes;
+  unsigned VarIdxNum = 0;
+  for (unsigned i = 2, e = GEP->getNumOperands(); i != e; ++i)
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+      Indexes.push_back(CI);
+    } else if (!isa<ConstantInt>(GEP->getOperand(i))) {
+      if (VarIdx) return getCouldNotCompute();  // Multiple non-constant idx's.
+      VarIdx = GEP->getOperand(i);
+      VarIdxNum = i-2;
+      Indexes.push_back(0);
+    }
+
+  // Loop-invariant loads may be a byproduct of loop optimization. Skip them.
+  if (!VarIdx)
+    return getCouldNotCompute();
+
+  // Okay, we know we have a (load (gep GV, 0, X)) comparison with a constant.
+  // Check to see if X is a loop variant variable value now.
+  const SCEV *Idx = getSCEV(VarIdx);
+  Idx = getSCEVAtScope(Idx, L);
+
+  // We can only recognize very limited forms of loop index expressions, in
+  // particular, only affine AddRec's like {C1,+,C2}.
+  const SCEVAddRecExpr *IdxExpr = dyn_cast<SCEVAddRecExpr>(Idx);
+  if (!IdxExpr || !IdxExpr->isAffine() || isLoopInvariant(IdxExpr, L) ||
+      !isa<SCEVConstant>(IdxExpr->getOperand(0)) ||
+      !isa<SCEVConstant>(IdxExpr->getOperand(1)))
+    return getCouldNotCompute();
+
+  unsigned MaxSteps = MaxBruteForceIterations;
+  for (unsigned IterationNum = 0; IterationNum != MaxSteps; ++IterationNum) {
+    ConstantInt *ItCst = ConstantInt::get(
+                           cast<IntegerType>(IdxExpr->getType()), IterationNum);
+    ConstantInt *Val = EvaluateConstantChrecAtConstant(IdxExpr, ItCst, *this);
+
+    // Form the GEP offset.
+    Indexes[VarIdxNum] = Val;
+
+    Constant *Result = ConstantFoldLoadThroughGEPIndices(GV->getInitializer(),
+                                                         Indexes);
+    if (Result == 0) break;  // Cannot compute!
+
+    // Evaluate the condition for this iteration.
+    Result = ConstantExpr::getICmp(predicate, Result, RHS);
+    if (!isa<ConstantInt>(Result)) break;  // Couldn't decide for sure
+    if (cast<ConstantInt>(Result)->getValue().isMinValue()) {
+#if 0
+      dbgs() << "\n***\n*** Computed loop count " << *ItCst
+             << "\n*** From global " << *GV << "*** BB: " << *L->getHeader()
+             << "***\n";
+#endif
+      ++NumArrayLenItCounts;
+      return getConstant(ItCst);   // Found terminating iteration!
+    }
+  }
+  return getCouldNotCompute();
+}
+
+
+/// CanConstantFold - Return true if we can constant fold an instruction of the
+/// specified type, assuming that all operands were constants.
+static bool CanConstantFold(const Instruction *I) {
+  if (isa<BinaryOperator>(I) || isa<CmpInst>(I) ||
+      isa<SelectInst>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+      isa<LoadInst>(I))
+    return true;
+
+  if (const CallInst *CI = dyn_cast<CallInst>(I))
+    if (const Function *F = CI->getCalledFunction())
+      return canConstantFoldCallTo(F);
+  return false;
+}
+
+/// Determine whether this instruction can constant evolve within this loop
+/// assuming its operands can all constant evolve.
+static bool canConstantEvolve(Instruction *I, const Loop *L) {
+  // An instruction outside of the loop can't be derived from a loop PHI.
+  if (!L->contains(I)) return false;
+
+  if (isa<PHINode>(I)) {
+    if (L->getHeader() == I->getParent())
+      return true;
+    else
+      // We don't currently keep track of the control flow needed to evaluate
+      // PHIs, so we cannot handle PHIs inside of loops.
+      return false;
+  }
+
+  // If we won't be able to constant fold this expression even if the operands
+  // are constants, bail early.
+  return CanConstantFold(I);
+}
+
+/// getConstantEvolvingPHIOperands - Implement getConstantEvolvingPHI by
+/// recursing through each instruction operand until reaching a loop header phi.
+static PHINode *
+getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
+                               DenseMap<Instruction *, PHINode *> &PHIMap) {
+
+  // Otherwise, we can evaluate this instruction if all of its operands are
+  // constant or derived from a PHI node themselves.
+  PHINode *PHI = 0;
+  for (Instruction::op_iterator OpI = UseInst->op_begin(),
+         OpE = UseInst->op_end(); OpI != OpE; ++OpI) {
+
+    if (isa<Constant>(*OpI)) continue;
+
+    Instruction *OpInst = dyn_cast<Instruction>(*OpI);
+    if (!OpInst || !canConstantEvolve(OpInst, L)) return 0;
+
+    PHINode *P = dyn_cast<PHINode>(OpInst);
+    if (!P)
+      // If this operand is already visited, reuse the prior result.
+      // We may have P != PHI if this is the deepest point at which the
+      // inconsistent paths meet.
+      P = PHIMap.lookup(OpInst);
+    if (!P) {
+      // Recurse and memoize the results, whether a phi is found or not.
+      // This recursive call invalidates pointers into PHIMap.
+      P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap);
+      PHIMap[OpInst] = P;
+    }
+    if (P == 0) return 0;        // Not evolving from PHI
+    if (PHI && PHI != P) return 0;  // Evolving from multiple different PHIs.
+    PHI = P;
+  }
+  // This is a expression evolving from a constant PHI!
+  return PHI;
+}
+
+/// getConstantEvolvingPHI - Given an LLVM value and a loop, return a PHI node
+/// in the loop that V is derived from.  We allow arbitrary operations along the
+/// way, but the operands of an operation must either be constants or a value
+/// derived from a constant PHI.  If this expression does not fit with these
+/// constraints, return null.
+static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0 || !canConstantEvolve(I, L)) return 0;
+
+  if (PHINode *PN = dyn_cast<PHINode>(I)) {
+    return PN;
+  }
+
+  // Record non-constant instructions contained by the loop.
+  DenseMap<Instruction *, PHINode *> PHIMap;
+  return getConstantEvolvingPHIOperands(I, L, PHIMap);
+}
+
+/// EvaluateExpression - Given an expression that passes the
+/// getConstantEvolvingPHI predicate, evaluate its value assuming the PHI node
+/// in the loop has the value PHIVal.  If we can't fold this expression for some
+/// reason, return null.
+static Constant *EvaluateExpression(Value *V, const Loop *L,
+                                    DenseMap<Instruction *, Constant *> &Vals,
+                                    const DataLayout *TD,
+                                    const TargetLibraryInfo *TLI) {
+  // Convenient constant check, but redundant for recursive calls.
+  if (Constant *C = dyn_cast<Constant>(V)) return C;
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return 0;
+
+  if (Constant *C = Vals.lookup(I)) return C;
+
+  // An instruction inside the loop depends on a value outside the loop that we
+  // weren't given a mapping for, or a value such as a call inside the loop.
+  if (!canConstantEvolve(I, L)) return 0;
+
+  // An unmapped PHI can be due to a branch or another loop inside this loop,
+  // or due to this not being the initial iteration through a loop where we
+  // couldn't compute the evolution of this particular PHI last time.
+  if (isa<PHINode>(I)) return 0;
+
+  std::vector<Constant*> Operands(I->getNumOperands());
+
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    Instruction *Operand = dyn_cast<Instruction>(I->getOperand(i));
+    if (!Operand) {
+      Operands[i] = dyn_cast<Constant>(I->getOperand(i));
+      if (!Operands[i]) return 0;
+      continue;
+    }
+    Constant *C = EvaluateExpression(Operand, L, Vals, TD, TLI);
+    Vals[Operand] = C;
+    if (!C) return 0;
+    Operands[i] = C;
+  }
+
+  if (CmpInst *CI = dyn_cast<CmpInst>(I))
+    return ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0],
+                                           Operands[1], TD, TLI);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (!LI->isVolatile())
+      return ConstantFoldLoadFromConstPtr(Operands[0], TD);
+  }
+  return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Operands, TD,
+                                  TLI);
+}
+
+/// getConstantEvolutionLoopExitValue - If we know that the specified Phi is
+/// in the header of its containing loop, we know the loop executes a
+/// constant number of times, and the PHI node is just a recurrence
+/// involving constants, fold it.
+Constant *
+ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
+                                                   const APInt &BEs,
+                                                   const Loop *L) {
+  DenseMap<PHINode*, Constant*>::const_iterator I =
+    ConstantEvolutionLoopExitValue.find(PN);
+  if (I != ConstantEvolutionLoopExitValue.end())
+    return I->second;
+
+  if (BEs.ugt(MaxBruteForceIterations))
+    return ConstantEvolutionLoopExitValue[PN] = 0;  // Not going to evaluate it.
+
+  Constant *&RetVal = ConstantEvolutionLoopExitValue[PN];
+
+  DenseMap<Instruction *, Constant *> CurrentIterVals;
+  BasicBlock *Header = L->getHeader();
+  assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!");
+
+  // Since the loop is canonicalized, the PHI node must have two entries.  One
+  // entry must be a constant (coming in from outside of the loop), and the
+  // second must be derived from the same PHI.
+  bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1));
+  PHINode *PHI = 0;
+  for (BasicBlock::iterator I = Header->begin();
+       (PHI = dyn_cast<PHINode>(I)); ++I) {
+    Constant *StartCST =
+      dyn_cast<Constant>(PHI->getIncomingValue(!SecondIsBackedge));
+    if (StartCST == 0) continue;
+    CurrentIterVals[PHI] = StartCST;
+  }
+  if (!CurrentIterVals.count(PN))
+    return RetVal = 0;
+
+  Value *BEValue = PN->getIncomingValue(SecondIsBackedge);
+
+  // Execute the loop symbolically to determine the exit value.
+  if (BEs.getActiveBits() >= 32)
+    return RetVal = 0; // More than 2^32-1 iterations?? Not doing it!
+
+  unsigned NumIterations = BEs.getZExtValue(); // must be in range
+  unsigned IterationNum = 0;
+  for (; ; ++IterationNum) {
+    if (IterationNum == NumIterations)
+      return RetVal = CurrentIterVals[PN];  // Got exit value!
+
+    // Compute the value of the PHIs for the next iteration.
+    // EvaluateExpression adds non-phi values to the CurrentIterVals map.
+    DenseMap<Instruction *, Constant *> NextIterVals;
+    Constant *NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, TD,
+                                           TLI);
+    if (NextPHI == 0)
+      return 0;        // Couldn't evaluate!
+    NextIterVals[PN] = NextPHI;
+
+    bool StoppedEvolving = NextPHI == CurrentIterVals[PN];
+
+    // Also evaluate the other PHI nodes.  However, we don't get to stop if we
+    // cease to be able to evaluate one of them or if they stop evolving,
+    // because that doesn't necessarily prevent us from computing PN.
+    SmallVector<std::pair<PHINode *, Constant *>, 8> PHIsToCompute;
+    for (DenseMap<Instruction *, Constant *>::const_iterator
+           I = CurrentIterVals.begin(), E = CurrentIterVals.end(); I != E; ++I){
+      PHINode *PHI = dyn_cast<PHINode>(I->first);
+      if (!PHI || PHI == PN || PHI->getParent() != Header) continue;
+      PHIsToCompute.push_back(std::make_pair(PHI, I->second));
+    }
+    // We use two distinct loops because EvaluateExpression may invalidate any
+    // iterators into CurrentIterVals.
+    for (SmallVectorImpl<std::pair<PHINode *, Constant*> >::const_iterator
+             I = PHIsToCompute.begin(), E = PHIsToCompute.end(); I != E; ++I) {
+      PHINode *PHI = I->first;
+      Constant *&NextPHI = NextIterVals[PHI];
+      if (!NextPHI) {   // Not already computed.
+        Value *BEValue = PHI->getIncomingValue(SecondIsBackedge);
+        NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, TD, TLI);
+      }
+      if (NextPHI != I->second)
+        StoppedEvolving = false;
+    }
+
+    // If all entries in CurrentIterVals == NextIterVals then we can stop
+    // iterating, the loop can't continue to change.
+    if (StoppedEvolving)
+      return RetVal = CurrentIterVals[PN];
+
+    CurrentIterVals.swap(NextIterVals);
+  }
+}
+
+/// ComputeExitCountExhaustively - If the loop is known to execute a
+/// constant number of times (the condition evolves only from constants),
+/// try to evaluate a few iterations of the loop until we get the exit
+/// condition gets a value of ExitWhen (true or false).  If we cannot
+/// evaluate the trip count of the loop, return getCouldNotCompute().
+const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L,
+                                                          Value *Cond,
+                                                          bool ExitWhen) {
+  PHINode *PN = getConstantEvolvingPHI(Cond, L);
+  if (PN == 0) return getCouldNotCompute();
+
+  // If the loop is canonicalized, the PHI will have exactly two entries.
+  // That's the only form we support here.
+  if (PN->getNumIncomingValues() != 2) return getCouldNotCompute();
+
+  DenseMap<Instruction *, Constant *> CurrentIterVals;
+  BasicBlock *Header = L->getHeader();
+  assert(PN->getParent() == Header && "Can't evaluate PHI not in loop header!");
+
+  // One entry must be a constant (coming in from outside of the loop), and the
+  // second must be derived from the same PHI.
+  bool SecondIsBackedge = L->contains(PN->getIncomingBlock(1));
+  PHINode *PHI = 0;
+  for (BasicBlock::iterator I = Header->begin();
+       (PHI = dyn_cast<PHINode>(I)); ++I) {
+    Constant *StartCST =
+      dyn_cast<Constant>(PHI->getIncomingValue(!SecondIsBackedge));
+    if (StartCST == 0) continue;
+    CurrentIterVals[PHI] = StartCST;
+  }
+  if (!CurrentIterVals.count(PN))
+    return getCouldNotCompute();
+
+  // Okay, we find a PHI node that defines the trip count of this loop.  Execute
+  // the loop symbolically to determine when the condition gets a value of
+  // "ExitWhen".
+
+  unsigned MaxIterations = MaxBruteForceIterations;   // Limit analysis.
+  for (unsigned IterationNum = 0; IterationNum != MaxIterations;++IterationNum){
+    ConstantInt *CondVal =
+      dyn_cast_or_null<ConstantInt>(EvaluateExpression(Cond, L, CurrentIterVals,
+                                                       TD, TLI));
+
+    // Couldn't symbolically evaluate.
+    if (!CondVal) return getCouldNotCompute();
+
+    if (CondVal->getValue() == uint64_t(ExitWhen)) {
+      ++NumBruteForceTripCountsComputed;
+      return getConstant(Type::getInt32Ty(getContext()), IterationNum);
+    }
+
+    // Update all the PHI nodes for the next iteration.
+    DenseMap<Instruction *, Constant *> NextIterVals;
+
+    // Create a list of which PHIs we need to compute. We want to do this before
+    // calling EvaluateExpression on them because that may invalidate iterators
+    // into CurrentIterVals.
+    SmallVector<PHINode *, 8> PHIsToCompute;
+    for (DenseMap<Instruction *, Constant *>::const_iterator
+           I = CurrentIterVals.begin(), E = CurrentIterVals.end(); I != E; ++I){
+      PHINode *PHI = dyn_cast<PHINode>(I->first);
+      if (!PHI || PHI->getParent() != Header) continue;
+      PHIsToCompute.push_back(PHI);
+    }
+    for (SmallVectorImpl<PHINode *>::const_iterator I = PHIsToCompute.begin(),
+             E = PHIsToCompute.end(); I != E; ++I) {
+      PHINode *PHI = *I;
+      Constant *&NextPHI = NextIterVals[PHI];
+      if (NextPHI) continue;    // Already computed!
+
+      Value *BEValue = PHI->getIncomingValue(SecondIsBackedge);
+      NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, TD, TLI);
+    }
+    CurrentIterVals.swap(NextIterVals);
+  }
+
+  // Too many iterations were needed to evaluate.
+  return getCouldNotCompute();
+}
+
+/// getSCEVAtScope - Return a SCEV expression for the specified value
+/// at the specified scope in the program.  The L value specifies a loop
+/// nest to evaluate the expression at, where null is the top-level or a
+/// specified loop is immediately inside of the loop.
+///
+/// This method can be used to compute the exit value for a variable defined
+/// in a loop by querying what the value will hold in the parent loop.
+///
+/// In the case that a relevant loop exit value cannot be computed, the
+/// original value V is returned.
+const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
+  // Check to see if we've folded this expression at this loop before.
+  std::map<const Loop *, const SCEV *> &Values = ValuesAtScopes[V];
+  std::pair<std::map<const Loop *, const SCEV *>::iterator, bool> Pair =
+    Values.insert(std::make_pair(L, static_cast<const SCEV *>(0)));
+  if (!Pair.second)
+    return Pair.first->second ? Pair.first->second : V;
+
+  // Otherwise compute it.
+  const SCEV *C = computeSCEVAtScope(V, L);
+  ValuesAtScopes[V][L] = C;
+  return C;
+}
+
+/// This builds up a Constant using the ConstantExpr interface.  That way, we
+/// will return Constants for objects which aren't represented by a
+/// SCEVConstant, because SCEVConstant is restricted to ConstantInt.
+/// Returns NULL if the SCEV isn't representable as a Constant.
+static Constant *BuildConstantFromSCEV(const SCEV *V) {
+  switch (V->getSCEVType()) {
+    default:  // TODO: smax, umax.
+    case scCouldNotCompute:
+    case scAddRecExpr:
+      break;
+    case scConstant:
+      return cast<SCEVConstant>(V)->getValue();
+    case scUnknown:
+      return dyn_cast<Constant>(cast<SCEVUnknown>(V)->getValue());
+    case scSignExtend: {
+      const SCEVSignExtendExpr *SS = cast<SCEVSignExtendExpr>(V);
+      if (Constant *CastOp = BuildConstantFromSCEV(SS->getOperand()))
+        return ConstantExpr::getSExt(CastOp, SS->getType());
+      break;
+    }
+    case scZeroExtend: {
+      const SCEVZeroExtendExpr *SZ = cast<SCEVZeroExtendExpr>(V);
+      if (Constant *CastOp = BuildConstantFromSCEV(SZ->getOperand()))
+        return ConstantExpr::getZExt(CastOp, SZ->getType());
+      break;
+    }
+    case scTruncate: {
+      const SCEVTruncateExpr *ST = cast<SCEVTruncateExpr>(V);
+      if (Constant *CastOp = BuildConstantFromSCEV(ST->getOperand()))
+        return ConstantExpr::getTrunc(CastOp, ST->getType());
+      break;
+    }
+    case scAddExpr: {
+      const SCEVAddExpr *SA = cast<SCEVAddExpr>(V);
+      if (Constant *C = BuildConstantFromSCEV(SA->getOperand(0))) {
+        if (C->getType()->isPointerTy())
+          C = ConstantExpr::getBitCast(C, Type::getInt8PtrTy(C->getContext()));
+        for (unsigned i = 1, e = SA->getNumOperands(); i != e; ++i) {
+          Constant *C2 = BuildConstantFromSCEV(SA->getOperand(i));
+          if (!C2) return 0;
+
+          // First pointer!
+          if (!C->getType()->isPointerTy() && C2->getType()->isPointerTy()) {
+            std::swap(C, C2);
+            // The offsets have been converted to bytes.  We can add bytes to an
+            // i8* by GEP with the byte count in the first index.
+            C = ConstantExpr::getBitCast(C,Type::getInt8PtrTy(C->getContext()));
+          }
+
+          // Don't bother trying to sum two pointers. We probably can't
+          // statically compute a load that results from it anyway.
+          if (C2->getType()->isPointerTy())
+            return 0;
+
+          if (C->getType()->isPointerTy()) {
+            if (cast<PointerType>(C->getType())->getElementType()->isStructTy())
+              C2 = ConstantExpr::getIntegerCast(
+                  C2, Type::getInt32Ty(C->getContext()), true);
+            C = ConstantExpr::getGetElementPtr(C, C2);
+          } else
+            C = ConstantExpr::getAdd(C, C2);
+        }
+        return C;
+      }
+      break;
+    }
+    case scMulExpr: {
+      const SCEVMulExpr *SM = cast<SCEVMulExpr>(V);
+      if (Constant *C = BuildConstantFromSCEV(SM->getOperand(0))) {
+        // Don't bother with pointers at all.
+        if (C->getType()->isPointerTy()) return 0;
+        for (unsigned i = 1, e = SM->getNumOperands(); i != e; ++i) {
+          Constant *C2 = BuildConstantFromSCEV(SM->getOperand(i));
+          if (!C2 || C2->getType()->isPointerTy()) return 0;
+          C = ConstantExpr::getMul(C, C2);
+        }
+        return C;
+      }
+      break;
+    }
+    case scUDivExpr: {
+      const SCEVUDivExpr *SU = cast<SCEVUDivExpr>(V);
+      if (Constant *LHS = BuildConstantFromSCEV(SU->getLHS()))
+        if (Constant *RHS = BuildConstantFromSCEV(SU->getRHS()))
+          if (LHS->getType() == RHS->getType())
+            return ConstantExpr::getUDiv(LHS, RHS);
+      break;
+    }
+  }
+  return 0;
+}
+
+const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
+  if (isa<SCEVConstant>(V)) return V;
+
+  // If this instruction is evolved from a constant-evolving PHI, compute the
+  // exit value from the loop without using SCEVs.
+  if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V)) {
+    if (Instruction *I = dyn_cast<Instruction>(SU->getValue())) {
+      const Loop *LI = (*this->LI)[I->getParent()];
+      if (LI && LI->getParentLoop() == L)  // Looking for loop exit value.
+        if (PHINode *PN = dyn_cast<PHINode>(I))
+          if (PN->getParent() == LI->getHeader()) {
+            // Okay, there is no closed form solution for the PHI node.  Check
+            // to see if the loop that contains it has a known backedge-taken
+            // count.  If so, we may be able to force computation of the exit
+            // value.
+            const SCEV *BackedgeTakenCount = getBackedgeTakenCount(LI);
+            if (const SCEVConstant *BTCC =
+                  dyn_cast<SCEVConstant>(BackedgeTakenCount)) {
+              // Okay, we know how many times the containing loop executes.  If
+              // this is a constant evolving PHI node, get the final value at
+              // the specified iteration number.
+              Constant *RV = getConstantEvolutionLoopExitValue(PN,
+                                                   BTCC->getValue()->getValue(),
+                                                               LI);
+              if (RV) return getSCEV(RV);
+            }
+          }
+
+      // Okay, this is an expression that we cannot symbolically evaluate
+      // into a SCEV.  Check to see if it's possible to symbolically evaluate
+      // the arguments into constants, and if so, try to constant propagate the
+      // result.  This is particularly useful for computing loop exit values.
+      if (CanConstantFold(I)) {
+        SmallVector<Constant *, 4> Operands;
+        bool MadeImprovement = false;
+        for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+          Value *Op = I->getOperand(i);
+          if (Constant *C = dyn_cast<Constant>(Op)) {
+            Operands.push_back(C);
+            continue;
+          }
+
+          // If any of the operands is non-constant and if they are
+          // non-integer and non-pointer, don't even try to analyze them
+          // with scev techniques.
+          if (!isSCEVable(Op->getType()))
+            return V;
+
+          const SCEV *OrigV = getSCEV(Op);
+          const SCEV *OpV = getSCEVAtScope(OrigV, L);
+          MadeImprovement |= OrigV != OpV;
+
+          Constant *C = BuildConstantFromSCEV(OpV);
+          if (!C) return V;
+          if (C->getType() != Op->getType())
+            C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+                                                              Op->getType(),
+                                                              false),
+                                      C, Op->getType());
+          Operands.push_back(C);
+        }
+
+        // Check to see if getSCEVAtScope actually made an improvement.
+        if (MadeImprovement) {
+          Constant *C = 0;
+          if (const CmpInst *CI = dyn_cast<CmpInst>(I))
+            C = ConstantFoldCompareInstOperands(CI->getPredicate(),
+                                                Operands[0], Operands[1], TD,
+                                                TLI);
+          else if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+            if (!LI->isVolatile())
+              C = ConstantFoldLoadFromConstPtr(Operands[0], TD);
+          } else
+            C = ConstantFoldInstOperands(I->getOpcode(), I->getType(),
+                                         Operands, TD, TLI);
+          if (!C) return V;
+          return getSCEV(C);
+        }
+      }
+    }
+
+    // This is some other type of SCEVUnknown, just return it.
+    return V;
+  }
+
+  if (const SCEVCommutativeExpr *Comm = dyn_cast<SCEVCommutativeExpr>(V)) {
+    // Avoid performing the look-up in the common case where the specified
+    // expression has no loop-variant portions.
+    for (unsigned i = 0, e = Comm->getNumOperands(); i != e; ++i) {
+      const SCEV *OpAtScope = getSCEVAtScope(Comm->getOperand(i), L);
+      if (OpAtScope != Comm->getOperand(i)) {
+        // Okay, at least one of these operands is loop variant but might be
+        // foldable.  Build a new instance of the folded commutative expression.
+        SmallVector<const SCEV *, 8> NewOps(Comm->op_begin(),
+                                            Comm->op_begin()+i);
+        NewOps.push_back(OpAtScope);
+
+        for (++i; i != e; ++i) {
+          OpAtScope = getSCEVAtScope(Comm->getOperand(i), L);
+          NewOps.push_back(OpAtScope);
+        }
+        if (isa<SCEVAddExpr>(Comm))
+          return getAddExpr(NewOps);
+        if (isa<SCEVMulExpr>(Comm))
+          return getMulExpr(NewOps);
+        if (isa<SCEVSMaxExpr>(Comm))
+          return getSMaxExpr(NewOps);
+        if (isa<SCEVUMaxExpr>(Comm))
+          return getUMaxExpr(NewOps);
+        llvm_unreachable("Unknown commutative SCEV type!");
+      }
+    }
+    // If we got here, all operands are loop invariant.
+    return Comm;
+  }
+
+  if (const SCEVUDivExpr *Div = dyn_cast<SCEVUDivExpr>(V)) {
+    const SCEV *LHS = getSCEVAtScope(Div->getLHS(), L);
+    const SCEV *RHS = getSCEVAtScope(Div->getRHS(), L);
+    if (LHS == Div->getLHS() && RHS == Div->getRHS())
+      return Div;   // must be loop invariant
+    return getUDivExpr(LHS, RHS);
+  }
+
+  // If this is a loop recurrence for a loop that does not contain L, then we
+  // are dealing with the final value computed by the loop.
+  if (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(V)) {
+    // First, attempt to evaluate each operand.
+    // Avoid performing the look-up in the common case where the specified
+    // expression has no loop-variant portions.
+    for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i) {
+      const SCEV *OpAtScope = getSCEVAtScope(AddRec->getOperand(i), L);
+      if (OpAtScope == AddRec->getOperand(i))
+        continue;
+
+      // Okay, at least one of these operands is loop variant but might be
+      // foldable.  Build a new instance of the folded commutative expression.
+      SmallVector<const SCEV *, 8> NewOps(AddRec->op_begin(),
+                                          AddRec->op_begin()+i);
+      NewOps.push_back(OpAtScope);
+      for (++i; i != e; ++i)
+        NewOps.push_back(getSCEVAtScope(AddRec->getOperand(i), L));
+
+      const SCEV *FoldedRec =
+        getAddRecExpr(NewOps, AddRec->getLoop(),
+                      AddRec->getNoWrapFlags(SCEV::FlagNW));
+      AddRec = dyn_cast<SCEVAddRecExpr>(FoldedRec);
+      // The addrec may be folded to a nonrecurrence, for example, if the
+      // induction variable is multiplied by zero after constant folding. Go
+      // ahead and return the folded value.
+      if (!AddRec)
+        return FoldedRec;
+      break;
+    }
+
+    // If the scope is outside the addrec's loop, evaluate it by using the
+    // loop exit value of the addrec.
+    if (!AddRec->getLoop()->contains(L)) {
+      // To evaluate this recurrence, we need to know how many times the AddRec
+      // loop iterates.  Compute this now.
+      const SCEV *BackedgeTakenCount = getBackedgeTakenCount(AddRec->getLoop());
+      if (BackedgeTakenCount == getCouldNotCompute()) return AddRec;
+
+      // Then, evaluate the AddRec.
+      return AddRec->evaluateAtIteration(BackedgeTakenCount, *this);
+    }
+
+    return AddRec;
+  }
+
+  if (const SCEVZeroExtendExpr *Cast = dyn_cast<SCEVZeroExtendExpr>(V)) {
+    const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
+    if (Op == Cast->getOperand())
+      return Cast;  // must be loop invariant
+    return getZeroExtendExpr(Op, Cast->getType());
+  }
+
+  if (const SCEVSignExtendExpr *Cast = dyn_cast<SCEVSignExtendExpr>(V)) {
+    const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
+    if (Op == Cast->getOperand())
+      return Cast;  // must be loop invariant
+    return getSignExtendExpr(Op, Cast->getType());
+  }
+
+  if (const SCEVTruncateExpr *Cast = dyn_cast<SCEVTruncateExpr>(V)) {
+    const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
+    if (Op == Cast->getOperand())
+      return Cast;  // must be loop invariant
+    return getTruncateExpr(Op, Cast->getType());
+  }
+
+  llvm_unreachable("Unknown SCEV type!");
+}
+
+/// getSCEVAtScope - This is a convenience function which does
+/// getSCEVAtScope(getSCEV(V), L).
+const SCEV *ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
+  return getSCEVAtScope(getSCEV(V), L);
+}
+
+/// SolveLinEquationWithOverflow - Finds the minimum unsigned root of the
+/// following equation:
+///
+///     A * X = B (mod N)
+///
+/// where N = 2^BW and BW is the common bit width of A and B. The signedness of
+/// A and B isn't important.
+///
+/// If the equation does not have a solution, SCEVCouldNotCompute is returned.
+static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
+                                               ScalarEvolution &SE) {
+  uint32_t BW = A.getBitWidth();
+  assert(BW == B.getBitWidth() && "Bit widths must be the same.");
+  assert(A != 0 && "A must be non-zero.");
+
+  // 1. D = gcd(A, N)
+  //
+  // The gcd of A and N may have only one prime factor: 2. The number of
+  // trailing zeros in A is its multiplicity
+  uint32_t Mult2 = A.countTrailingZeros();
+  // D = 2^Mult2
+
+  // 2. Check if B is divisible by D.
+  //
+  // B is divisible by D if and only if the multiplicity of prime factor 2 for B
+  // is not less than multiplicity of this prime factor for D.
+  if (B.countTrailingZeros() < Mult2)
+    return SE.getCouldNotCompute();
+
+  // 3. Compute I: the multiplicative inverse of (A / D) in arithmetic
+  // modulo (N / D).
+  //
+  // (N / D) may need BW+1 bits in its representation.  Hence, we'll use this
+  // bit width during computations.
+  APInt AD = A.lshr(Mult2).zext(BW + 1);  // AD = A / D
+  APInt Mod(BW + 1, 0);
+  Mod.setBit(BW - Mult2);  // Mod = N / D
+  APInt I = AD.multiplicativeInverse(Mod);
+
+  // 4. Compute the minimum unsigned root of the equation:
+  // I * (B / D) mod (N / D)
+  APInt Result = (I * B.lshr(Mult2).zext(BW + 1)).urem(Mod);
+
+  // The result is guaranteed to be less than 2^BW so we may truncate it to BW
+  // bits.
+  return SE.getConstant(Result.trunc(BW));
+}
+
+/// SolveQuadraticEquation - Find the roots of the quadratic equation for the
+/// given quadratic chrec {L,+,M,+,N}.  This returns either the two roots (which
+/// might be the same) or two SCEVCouldNotCompute objects.
+///
+static std::pair<const SCEV *,const SCEV *>
+SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
+  assert(AddRec->getNumOperands() == 3 && "This is not a quadratic chrec!");
+  const SCEVConstant *LC = dyn_cast<SCEVConstant>(AddRec->getOperand(0));
+  const SCEVConstant *MC = dyn_cast<SCEVConstant>(AddRec->getOperand(1));
+  const SCEVConstant *NC = dyn_cast<SCEVConstant>(AddRec->getOperand(2));
+
+  // We currently can only solve this if the coefficients are constants.
+  if (!LC || !MC || !NC) {
+    const SCEV *CNC = SE.getCouldNotCompute();
+    return std::make_pair(CNC, CNC);
+  }
+
+  uint32_t BitWidth = LC->getValue()->getValue().getBitWidth();
+  const APInt &L = LC->getValue()->getValue();
+  const APInt &M = MC->getValue()->getValue();
+  const APInt &N = NC->getValue()->getValue();
+  APInt Two(BitWidth, 2);
+  APInt Four(BitWidth, 4);
+
+  {
+    using namespace APIntOps;
+    const APInt& C = L;
+    // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
+    // The B coefficient is M-N/2
+    APInt B(M);
+    B -= sdiv(N,Two);
+
+    // The A coefficient is N/2
+    APInt A(N.sdiv(Two));
+
+    // Compute the B^2-4ac term.
+    APInt SqrtTerm(B);
+    SqrtTerm *= B;
+    SqrtTerm -= Four * (A * C);
+
+    if (SqrtTerm.isNegative()) {
+      // The loop is provably infinite.
+      const SCEV *CNC = SE.getCouldNotCompute();
+      return std::make_pair(CNC, CNC);
+    }
+
+    // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
+    // integer value or else APInt::sqrt() will assert.
+    APInt SqrtVal(SqrtTerm.sqrt());
+
+    // Compute the two solutions for the quadratic formula.
+    // The divisions must be performed as signed divisions.
+    APInt NegB(-B);
+    APInt TwoA(A << 1);
+    if (TwoA.isMinValue()) {
+      const SCEV *CNC = SE.getCouldNotCompute();
+      return std::make_pair(CNC, CNC);
+    }
+
+    LLVMContext &Context = SE.getContext();
+
+    ConstantInt *Solution1 =
+      ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
+    ConstantInt *Solution2 =
+      ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
+
+    return std::make_pair(SE.getConstant(Solution1),
+                          SE.getConstant(Solution2));
+  } // end APIntOps namespace
+}
+
+/// HowFarToZero - Return the number of times a backedge comparing the specified
+/// value to zero will execute.  If not computable, return CouldNotCompute.
+///
+/// This is only used for loops with a "x != y" exit test. The exit condition is
+/// now expressed as a single expression, V = x-y. So the exit test is
+/// effectively V != 0.  We know and take advantage of the fact that this
+/// expression only being used in a comparison by zero context.
+ScalarEvolution::ExitLimit
+ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
+  // If the value is a constant
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
+    // If the value is already zero, the branch will execute zero times.
+    if (C->getValue()->isZero()) return C;
+    return getCouldNotCompute();  // Otherwise it will loop infinitely.
+  }
+
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(V);
+  if (!AddRec || AddRec->getLoop() != L)
+    return getCouldNotCompute();
+
+  // If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of
+  // the quadratic equation to solve it.
+  if (AddRec->isQuadratic() && AddRec->getType()->isIntegerTy()) {
+    std::pair<const SCEV *,const SCEV *> Roots =
+      SolveQuadraticEquation(AddRec, *this);
+    const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
+    const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
+    if (R1 && R2) {
+#if 0
+      dbgs() << "HFTZ: " << *V << " - sol#1: " << *R1
+             << "  sol#2: " << *R2 << "\n";
+#endif
+      // Pick the smallest positive root value.
+      if (ConstantInt *CB =
+          dyn_cast<ConstantInt>(ConstantExpr::getICmp(CmpInst::ICMP_ULT,
+                                                      R1->getValue(),
+                                                      R2->getValue()))) {
+        if (CB->getZExtValue() == false)
+          std::swap(R1, R2);   // R1 is the minimum root now.
+
+        // We can only use this value if the chrec ends up with an exact zero
+        // value at this index.  When solving for "X*X != 5", for example, we
+        // should not accept a root of 2.
+        const SCEV *Val = AddRec->evaluateAtIteration(R1, *this);
+        if (Val->isZero())
+          return R1;  // We found a quadratic root!
+      }
+    }
+    return getCouldNotCompute();
+  }
+
+  // Otherwise we can only handle this if it is affine.
+  if (!AddRec->isAffine())
+    return getCouldNotCompute();
+
+  // If this is an affine expression, the execution count of this branch is
+  // the minimum unsigned root of the following equation:
+  //
+  //     Start + Step*N = 0 (mod 2^BW)
+  //
+  // equivalent to:
+  //
+  //             Step*N = -Start (mod 2^BW)
+  //
+  // where BW is the common bit width of Start and Step.
+
+  // Get the initial value for the loop.
+  const SCEV *Start = getSCEVAtScope(AddRec->getStart(), L->getParentLoop());
+  const SCEV *Step = getSCEVAtScope(AddRec->getOperand(1), L->getParentLoop());
+
+  // For now we handle only constant steps.
+  //
+  // TODO: Handle a nonconstant Step given AddRec<NUW>. If the
+  // AddRec is NUW, then (in an unsigned sense) it cannot be counting up to wrap
+  // to 0, it must be counting down to equal 0. Consequently, N = Start / -Step.
+  // We have not yet seen any such cases.
+  const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step);
+  if (StepC == 0 || StepC->getValue()->equalsInt(0))
+    return getCouldNotCompute();
+
+  // For positive steps (counting up until unsigned overflow):
+  //   N = -Start/Step (as unsigned)
+  // For negative steps (counting down to zero):
+  //   N = Start/-Step
+  // First compute the unsigned distance from zero in the direction of Step.
+  bool CountDown = StepC->getValue()->getValue().isNegative();
+  const SCEV *Distance = CountDown ? Start : getNegativeSCEV(Start);
+
+  // Handle unitary steps, which cannot wraparound.
+  // 1*N = -Start; -1*N = Start (mod 2^BW), so:
+  //   N = Distance (as unsigned)
+  if (StepC->getValue()->equalsInt(1) || StepC->getValue()->isAllOnesValue()) {
+    ConstantRange CR = getUnsignedRange(Start);
+    const SCEV *MaxBECount;
+    if (!CountDown && CR.getUnsignedMin().isMinValue())
+      // When counting up, the worst starting value is 1, not 0.
+      MaxBECount = CR.getUnsignedMax().isMinValue()
+        ? getConstant(APInt::getMinValue(CR.getBitWidth()))
+        : getConstant(APInt::getMaxValue(CR.getBitWidth()));
+    else
+      MaxBECount = getConstant(CountDown ? CR.getUnsignedMax()
+                                         : -CR.getUnsignedMin());
+    return ExitLimit(Distance, MaxBECount);
+  }
+
+  // If the recurrence is known not to wraparound, unsigned divide computes the
+  // back edge count. We know that the value will either become zero (and thus
+  // the loop terminates), that the loop will terminate through some other exit
+  // condition first, or that the loop has undefined behavior.  This means
+  // we can't "miss" the exit value, even with nonunit stride.
+  //
+  // FIXME: Prove that loops always exhibits *acceptable* undefined
+  // behavior. Loops must exhibit defined behavior until a wrapped value is
+  // actually used. So the trip count computed by udiv could be smaller than the
+  // number of well-defined iterations.
+  if (AddRec->getNoWrapFlags(SCEV::FlagNW)) {
+    // FIXME: We really want an "isexact" bit for udiv.
+    return getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
+  }
+  // Then, try to solve the above equation provided that Start is constant.
+  if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start))
+    return SolveLinEquationWithOverflow(StepC->getValue()->getValue(),
+                                        -StartC->getValue()->getValue(),
+                                        *this);
+  return getCouldNotCompute();
+}
+
+/// HowFarToNonZero - Return the number of times a backedge checking the
+/// specified value for nonzero will execute.  If not computable, return
+/// CouldNotCompute
+ScalarEvolution::ExitLimit
+ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) {
+  // Loops that look like: while (X == 0) are very strange indeed.  We don't
+  // handle them yet except for the trivial case.  This could be expanded in the
+  // future as needed.
+
+  // If the value is a constant, check to see if it is known to be non-zero
+  // already.  If so, the backedge will execute zero times.
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
+    if (!C->getValue()->isNullValue())
+      return getConstant(C->getType(), 0);
+    return getCouldNotCompute();  // Otherwise it will loop infinitely.
+  }
+
+  // We could implement others, but I really doubt anyone writes loops like
+  // this, and if they did, they would already be constant folded.
+  return getCouldNotCompute();
+}
+
+/// getPredecessorWithUniqueSuccessorForBB - Return a predecessor of BB
+/// (which may not be an immediate predecessor) which has exactly one
+/// successor from which BB is reachable, or null if no such block is
+/// found.
+///
+std::pair<BasicBlock *, BasicBlock *>
+ScalarEvolution::getPredecessorWithUniqueSuccessorForBB(BasicBlock *BB) {
+  // If the block has a unique predecessor, then there is no path from the
+  // predecessor to the block that does not go through the direct edge
+  // from the predecessor to the block.
+  if (BasicBlock *Pred = BB->getSinglePredecessor())
+    return std::make_pair(Pred, BB);
+
+  // A loop's header is defined to be a block that dominates the loop.
+  // If the header has a unique predecessor outside the loop, it must be
+  // a block that has exactly one successor that can reach the loop.
+  if (Loop *L = LI->getLoopFor(BB))
+    return std::make_pair(L->getLoopPredecessor(), L->getHeader());
+
+  return std::pair<BasicBlock *, BasicBlock *>();
+}
+
+/// HasSameValue - SCEV structural equivalence is usually sufficient for
+/// testing whether two expressions are equal, however for the purposes of
+/// looking for a condition guarding a loop, it can be useful to be a little
+/// more general, since a front-end may have replicated the controlling
+/// expression.
+///
+static bool HasSameValue(const SCEV *A, const SCEV *B) {
+  // Quick check to see if they are the same SCEV.
+  if (A == B) return true;
+
+  // Otherwise, if they're both SCEVUnknown, it's possible that they hold
+  // two different instructions with the same value. Check for this case.
+  if (const SCEVUnknown *AU = dyn_cast<SCEVUnknown>(A))
+    if (const SCEVUnknown *BU = dyn_cast<SCEVUnknown>(B))
+      if (const Instruction *AI = dyn_cast<Instruction>(AU->getValue()))
+        if (const Instruction *BI = dyn_cast<Instruction>(BU->getValue()))
+          if (AI->isIdenticalTo(BI) && !AI->mayReadFromMemory())
+            return true;
+
+  // Otherwise assume they may have a different value.
+  return false;
+}
+
+/// SimplifyICmpOperands - Simplify LHS and RHS in a comparison with
+/// predicate Pred. Return true iff any changes were made.
+///
+bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
+                                           const SCEV *&LHS, const SCEV *&RHS,
+                                           unsigned Depth) {
+  bool Changed = false;
+
+  // If we hit the max recursion limit bail out.
+  if (Depth >= 3)
+    return false;
+
+  // Canonicalize a constant to the right side.
+  if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS)) {
+    // Check for both operands constant.
+    if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS)) {
+      if (ConstantExpr::getICmp(Pred,
+                                LHSC->getValue(),
+                                RHSC->getValue())->isNullValue())
+        goto trivially_false;
+      else
+        goto trivially_true;
+    }
+    // Otherwise swap the operands to put the constant on the right.
+    std::swap(LHS, RHS);
+    Pred = ICmpInst::getSwappedPredicate(Pred);
+    Changed = true;
+  }
+
+  // If we're comparing an addrec with a value which is loop-invariant in the
+  // addrec's loop, put the addrec on the left. Also make a dominance check,
+  // as both operands could be addrecs loop-invariant in each other's loop.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(RHS)) {
+    const Loop *L = AR->getLoop();
+    if (isLoopInvariant(LHS, L) && properlyDominates(LHS, L->getHeader())) {
+      std::swap(LHS, RHS);
+      Pred = ICmpInst::getSwappedPredicate(Pred);
+      Changed = true;
+    }
+  }
+
+  // If there's a constant operand, canonicalize comparisons with boundary
+  // cases, and canonicalize *-or-equal comparisons to regular comparisons.
+  if (const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS)) {
+    const APInt &RA = RC->getValue()->getValue();
+    switch (Pred) {
+    default: llvm_unreachable("Unexpected ICmpInst::Predicate value!");
+    case ICmpInst::ICMP_EQ:
+    case ICmpInst::ICMP_NE:
+      // Fold ((-1) * %a) + %b == 0 (equivalent to %b-%a == 0) into %a == %b.
+      if (!RA)
+        if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(LHS))
+          if (const SCEVMulExpr *ME = dyn_cast<SCEVMulExpr>(AE->getOperand(0)))
+            if (AE->getNumOperands() == 2 && ME->getNumOperands() == 2 &&
+                ME->getOperand(0)->isAllOnesValue()) {
+              RHS = AE->getOperand(1);
+              LHS = ME->getOperand(1);
+              Changed = true;
+            }
+      break;
+    case ICmpInst::ICMP_UGE:
+      if ((RA - 1).isMinValue()) {
+        Pred = ICmpInst::ICMP_NE;
+        RHS = getConstant(RA - 1);
+        Changed = true;
+        break;
+      }
+      if (RA.isMaxValue()) {
+        Pred = ICmpInst::ICMP_EQ;
+        Changed = true;
+        break;
+      }
+      if (RA.isMinValue()) goto trivially_true;
+
+      Pred = ICmpInst::ICMP_UGT;
+      RHS = getConstant(RA - 1);
+      Changed = true;
+      break;
+    case ICmpInst::ICMP_ULE:
+      if ((RA + 1).isMaxValue()) {
+        Pred = ICmpInst::ICMP_NE;
+        RHS = getConstant(RA + 1);
+        Changed = true;
+        break;
+      }
+      if (RA.isMinValue()) {
+        Pred = ICmpInst::ICMP_EQ;
+        Changed = true;
+        break;
+      }
+      if (RA.isMaxValue()) goto trivially_true;
+
+      Pred = ICmpInst::ICMP_ULT;
+      RHS = getConstant(RA + 1);
+      Changed = true;
+      break;
+    case ICmpInst::ICMP_SGE:
+      if ((RA - 1).isMinSignedValue()) {
+        Pred = ICmpInst::ICMP_NE;
+        RHS = getConstant(RA - 1);
+        Changed = true;
+        break;
+      }
+      if (RA.isMaxSignedValue()) {
+        Pred = ICmpInst::ICMP_EQ;
+        Changed = true;
+        break;
+      }
+      if (RA.isMinSignedValue()) goto trivially_true;
+
+      Pred = ICmpInst::ICMP_SGT;
+      RHS = getConstant(RA - 1);
+      Changed = true;
+      break;
+    case ICmpInst::ICMP_SLE:
+      if ((RA + 1).isMaxSignedValue()) {
+        Pred = ICmpInst::ICMP_NE;
+        RHS = getConstant(RA + 1);
+        Changed = true;
+        break;
+      }
+      if (RA.isMinSignedValue()) {
+        Pred = ICmpInst::ICMP_EQ;
+        Changed = true;
+        break;
+      }
+      if (RA.isMaxSignedValue()) goto trivially_true;
+
+      Pred = ICmpInst::ICMP_SLT;
+      RHS = getConstant(RA + 1);
+      Changed = true;
+      break;
+    case ICmpInst::ICMP_UGT:
+      if (RA.isMinValue()) {
+        Pred = ICmpInst::ICMP_NE;
+        Changed = true;
+        break;
+      }
+      if ((RA + 1).isMaxValue()) {
+        Pred = ICmpInst::ICMP_EQ;
+        RHS = getConstant(RA + 1);
+        Changed = true;
+        break;
+      }
+      if (RA.isMaxValue()) goto trivially_false;
+      break;
+    case ICmpInst::ICMP_ULT:
+      if (RA.isMaxValue()) {
+        Pred = ICmpInst::ICMP_NE;
+        Changed = true;
+        break;
+      }
+      if ((RA - 1).isMinValue()) {
+        Pred = ICmpInst::ICMP_EQ;
+        RHS = getConstant(RA - 1);
+        Changed = true;
+        break;
+      }
+      if (RA.isMinValue()) goto trivially_false;
+      break;
+    case ICmpInst::ICMP_SGT:
+      if (RA.isMinSignedValue()) {
+        Pred = ICmpInst::ICMP_NE;
+        Changed = true;
+        break;
+      }
+      if ((RA + 1).isMaxSignedValue()) {
+        Pred = ICmpInst::ICMP_EQ;
+        RHS = getConstant(RA + 1);
+        Changed = true;
+        break;
+      }
+      if (RA.isMaxSignedValue()) goto trivially_false;
+      break;
+    case ICmpInst::ICMP_SLT:
+      if (RA.isMaxSignedValue()) {
+        Pred = ICmpInst::ICMP_NE;
+        Changed = true;
+        break;
+      }
+      if ((RA - 1).isMinSignedValue()) {
+       Pred = ICmpInst::ICMP_EQ;
+       RHS = getConstant(RA - 1);
+        Changed = true;
+       break;
+      }
+      if (RA.isMinSignedValue()) goto trivially_false;
+      break;
+    }
+  }
+
+  // Check for obvious equality.
+  if (HasSameValue(LHS, RHS)) {
+    if (ICmpInst::isTrueWhenEqual(Pred))
+      goto trivially_true;
+    if (ICmpInst::isFalseWhenEqual(Pred))
+      goto trivially_false;
+  }
+
+  // If possible, canonicalize GE/LE comparisons to GT/LT comparisons, by
+  // adding or subtracting 1 from one of the operands.
+  switch (Pred) {
+  case ICmpInst::ICMP_SLE:
+    if (!getSignedRange(RHS).getSignedMax().isMaxSignedValue()) {
+      RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS,
+                       SCEV::FlagNSW);
+      Pred = ICmpInst::ICMP_SLT;
+      Changed = true;
+    } else if (!getSignedRange(LHS).getSignedMin().isMinSignedValue()) {
+      LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS,
+                       SCEV::FlagNSW);
+      Pred = ICmpInst::ICMP_SLT;
+      Changed = true;
+    }
+    break;
+  case ICmpInst::ICMP_SGE:
+    if (!getSignedRange(RHS).getSignedMin().isMinSignedValue()) {
+      RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS,
+                       SCEV::FlagNSW);
+      Pred = ICmpInst::ICMP_SGT;
+      Changed = true;
+    } else if (!getSignedRange(LHS).getSignedMax().isMaxSignedValue()) {
+      LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS,
+                       SCEV::FlagNSW);
+      Pred = ICmpInst::ICMP_SGT;
+      Changed = true;
+    }
+    break;
+  case ICmpInst::ICMP_ULE:
+    if (!getUnsignedRange(RHS).getUnsignedMax().isMaxValue()) {
+      RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS,
+                       SCEV::FlagNUW);
+      Pred = ICmpInst::ICMP_ULT;
+      Changed = true;
+    } else if (!getUnsignedRange(LHS).getUnsignedMin().isMinValue()) {
+      LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS,
+                       SCEV::FlagNUW);
+      Pred = ICmpInst::ICMP_ULT;
+      Changed = true;
+    }
+    break;
+  case ICmpInst::ICMP_UGE:
+    if (!getUnsignedRange(RHS).getUnsignedMin().isMinValue()) {
+      RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS,
+                       SCEV::FlagNUW);
+      Pred = ICmpInst::ICMP_UGT;
+      Changed = true;
+    } else if (!getUnsignedRange(LHS).getUnsignedMax().isMaxValue()) {
+      LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS,
+                       SCEV::FlagNUW);
+      Pred = ICmpInst::ICMP_UGT;
+      Changed = true;
+    }
+    break;
+  default:
+    break;
+  }
+
+  // TODO: More simplifications are possible here.
+
+  // Recursively simplify until we either hit a recursion limit or nothing
+  // changes.
+  if (Changed)
+    return SimplifyICmpOperands(Pred, LHS, RHS, Depth+1);
+
+  return Changed;
+
+trivially_true:
+  // Return 0 == 0.
+  LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
+  Pred = ICmpInst::ICMP_EQ;
+  return true;
+
+trivially_false:
+  // Return 0 != 0.
+  LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
+  Pred = ICmpInst::ICMP_NE;
+  return true;
+}
+
+bool ScalarEvolution::isKnownNegative(const SCEV *S) {
+  return getSignedRange(S).getSignedMax().isNegative();
+}
+
+bool ScalarEvolution::isKnownPositive(const SCEV *S) {
+  return getSignedRange(S).getSignedMin().isStrictlyPositive();
+}
+
+bool ScalarEvolution::isKnownNonNegative(const SCEV *S) {
+  return !getSignedRange(S).getSignedMin().isNegative();
+}
+
+bool ScalarEvolution::isKnownNonPositive(const SCEV *S) {
+  return !getSignedRange(S).getSignedMax().isStrictlyPositive();
+}
+
+bool ScalarEvolution::isKnownNonZero(const SCEV *S) {
+  return isKnownNegative(S) || isKnownPositive(S);
+}
+
+bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred,
+                                       const SCEV *LHS, const SCEV *RHS) {
+  // Canonicalize the inputs first.
+  (void)SimplifyICmpOperands(Pred, LHS, RHS);
+
+  // If LHS or RHS is an addrec, check to see if the condition is true in
+  // every iteration of the loop.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS))
+    if (isLoopEntryGuardedByCond(
+          AR->getLoop(), Pred, AR->getStart(), RHS) &&
+        isLoopBackedgeGuardedByCond(
+          AR->getLoop(), Pred, AR->getPostIncExpr(*this), RHS))
+      return true;
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(RHS))
+    if (isLoopEntryGuardedByCond(
+          AR->getLoop(), Pred, LHS, AR->getStart()) &&
+        isLoopBackedgeGuardedByCond(
+          AR->getLoop(), Pred, LHS, AR->getPostIncExpr(*this)))
+      return true;
+
+  // Otherwise see what can be done with known constant ranges.
+  return isKnownPredicateWithRanges(Pred, LHS, RHS);
+}
+
+bool
+ScalarEvolution::isKnownPredicateWithRanges(ICmpInst::Predicate Pred,
+                                            const SCEV *LHS, const SCEV *RHS) {
+  if (HasSameValue(LHS, RHS))
+    return ICmpInst::isTrueWhenEqual(Pred);
+
+  // This code is split out from isKnownPredicate because it is called from
+  // within isLoopEntryGuardedByCond.
+  switch (Pred) {
+  default:
+    llvm_unreachable("Unexpected ICmpInst::Predicate value!");
+  case ICmpInst::ICMP_SGT:
+    Pred = ICmpInst::ICMP_SLT;
+    std::swap(LHS, RHS);
+  case ICmpInst::ICMP_SLT: {
+    ConstantRange LHSRange = getSignedRange(LHS);
+    ConstantRange RHSRange = getSignedRange(RHS);
+    if (LHSRange.getSignedMax().slt(RHSRange.getSignedMin()))
+      return true;
+    if (LHSRange.getSignedMin().sge(RHSRange.getSignedMax()))
+      return false;
+    break;
+  }
+  case ICmpInst::ICMP_SGE:
+    Pred = ICmpInst::ICMP_SLE;
+    std::swap(LHS, RHS);
+  case ICmpInst::ICMP_SLE: {
+    ConstantRange LHSRange = getSignedRange(LHS);
+    ConstantRange RHSRange = getSignedRange(RHS);
+    if (LHSRange.getSignedMax().sle(RHSRange.getSignedMin()))
+      return true;
+    if (LHSRange.getSignedMin().sgt(RHSRange.getSignedMax()))
+      return false;
+    break;
+  }
+  case ICmpInst::ICMP_UGT:
+    Pred = ICmpInst::ICMP_ULT;
+    std::swap(LHS, RHS);
+  case ICmpInst::ICMP_ULT: {
+    ConstantRange LHSRange = getUnsignedRange(LHS);
+    ConstantRange RHSRange = getUnsignedRange(RHS);
+    if (LHSRange.getUnsignedMax().ult(RHSRange.getUnsignedMin()))
+      return true;
+    if (LHSRange.getUnsignedMin().uge(RHSRange.getUnsignedMax()))
+      return false;
+    break;
+  }
+  case ICmpInst::ICMP_UGE:
+    Pred = ICmpInst::ICMP_ULE;
+    std::swap(LHS, RHS);
+  case ICmpInst::ICMP_ULE: {
+    ConstantRange LHSRange = getUnsignedRange(LHS);
+    ConstantRange RHSRange = getUnsignedRange(RHS);
+    if (LHSRange.getUnsignedMax().ule(RHSRange.getUnsignedMin()))
+      return true;
+    if (LHSRange.getUnsignedMin().ugt(RHSRange.getUnsignedMax()))
+      return false;
+    break;
+  }
+  case ICmpInst::ICMP_NE: {
+    if (getUnsignedRange(LHS).intersectWith(getUnsignedRange(RHS)).isEmptySet())
+      return true;
+    if (getSignedRange(LHS).intersectWith(getSignedRange(RHS)).isEmptySet())
+      return true;
+
+    const SCEV *Diff = getMinusSCEV(LHS, RHS);
+    if (isKnownNonZero(Diff))
+      return true;
+    break;
+  }
+  case ICmpInst::ICMP_EQ:
+    // The check at the top of the function catches the case where
+    // the values are known to be equal.
+    break;
+  }
+  return false;
+}
+
+/// isLoopBackedgeGuardedByCond - Test whether the backedge of the loop is
+/// protected by a conditional between LHS and RHS.  This is used to
+/// to eliminate casts.
+bool
+ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
+                                             ICmpInst::Predicate Pred,
+                                             const SCEV *LHS, const SCEV *RHS) {
+  // Interpret a null as meaning no loop, where there is obviously no guard
+  // (interprocedural conditions notwithstanding).
+  if (!L) return true;
+
+  BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch)
+    return false;
+
+  BranchInst *LoopContinuePredicate =
+    dyn_cast<BranchInst>(Latch->getTerminator());
+  if (!LoopContinuePredicate ||
+      LoopContinuePredicate->isUnconditional())
+    return false;
+
+  return isImpliedCond(Pred, LHS, RHS,
+                       LoopContinuePredicate->getCondition(),
+                       LoopContinuePredicate->getSuccessor(0) != L->getHeader());
+}
+
+/// isLoopEntryGuardedByCond - Test whether entry to the loop is protected
+/// by a conditional between LHS and RHS.  This is used to help avoid max
+/// expressions in loop trip counts, and to eliminate casts.
+bool
+ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
+                                          ICmpInst::Predicate Pred,
+                                          const SCEV *LHS, const SCEV *RHS) {
+  // Interpret a null as meaning no loop, where there is obviously no guard
+  // (interprocedural conditions notwithstanding).
+  if (!L) return false;
+
+  // Starting at the loop predecessor, climb up the predecessor chain, as long
+  // as there are predecessors that can be found that have unique successors
+  // leading to the original header.
+  for (std::pair<BasicBlock *, BasicBlock *>
+         Pair(L->getLoopPredecessor(), L->getHeader());
+       Pair.first;
+       Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) {
+
+    BranchInst *LoopEntryPredicate =
+      dyn_cast<BranchInst>(Pair.first->getTerminator());
+    if (!LoopEntryPredicate ||
+        LoopEntryPredicate->isUnconditional())
+      continue;
+
+    if (isImpliedCond(Pred, LHS, RHS,
+                      LoopEntryPredicate->getCondition(),
+                      LoopEntryPredicate->getSuccessor(0) != Pair.second))
+      return true;
+  }
+
+  return false;
+}
+
+/// RAII wrapper to prevent recursive application of isImpliedCond.
+/// ScalarEvolution's PendingLoopPredicates set must be empty unless we are
+/// currently evaluating isImpliedCond.
+struct MarkPendingLoopPredicate {
+  Value *Cond;
+  DenseSet<Value*> &LoopPreds;
+  bool Pending;
+
+  MarkPendingLoopPredicate(Value *C, DenseSet<Value*> &LP)
+    : Cond(C), LoopPreds(LP) {
+    Pending = !LoopPreds.insert(Cond).second;
+  }
+  ~MarkPendingLoopPredicate() {
+    if (!Pending)
+      LoopPreds.erase(Cond);
+  }
+};
+
+/// isImpliedCond - Test whether the condition described by Pred, LHS,
+/// and RHS is true whenever the given Cond value evaluates to true.
+bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred,
+                                    const SCEV *LHS, const SCEV *RHS,
+                                    Value *FoundCondValue,
+                                    bool Inverse) {
+  MarkPendingLoopPredicate Mark(FoundCondValue, PendingLoopPredicates);
+  if (Mark.Pending)
+    return false;
+
+  // Recursively handle And and Or conditions.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FoundCondValue)) {
+    if (BO->getOpcode() == Instruction::And) {
+      if (!Inverse)
+        return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) ||
+               isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse);
+    } else if (BO->getOpcode() == Instruction::Or) {
+      if (Inverse)
+        return isImpliedCond(Pred, LHS, RHS, BO->getOperand(0), Inverse) ||
+               isImpliedCond(Pred, LHS, RHS, BO->getOperand(1), Inverse);
+    }
+  }
+
+  ICmpInst *ICI = dyn_cast<ICmpInst>(FoundCondValue);
+  if (!ICI) return false;
+
+  // Bail if the ICmp's operands' types are wider than the needed type
+  // before attempting to call getSCEV on them. This avoids infinite
+  // recursion, since the analysis of widening casts can require loop
+  // exit condition information for overflow checking, which would
+  // lead back here.
+  if (getTypeSizeInBits(LHS->getType()) <
+      getTypeSizeInBits(ICI->getOperand(0)->getType()))
+    return false;
+
+  // Now that we found a conditional branch that dominates the loop or controls
+  // the loop latch. Check to see if it is the comparison we are looking for.
+  ICmpInst::Predicate FoundPred;
+  if (Inverse)
+    FoundPred = ICI->getInversePredicate();
+  else
+    FoundPred = ICI->getPredicate();
+
+  const SCEV *FoundLHS = getSCEV(ICI->getOperand(0));
+  const SCEV *FoundRHS = getSCEV(ICI->getOperand(1));
+
+  // Balance the types. The case where FoundLHS' type is wider than
+  // LHS' type is checked for above.
+  if (getTypeSizeInBits(LHS->getType()) >
+      getTypeSizeInBits(FoundLHS->getType())) {
+    if (CmpInst::isSigned(Pred)) {
+      FoundLHS = getSignExtendExpr(FoundLHS, LHS->getType());
+      FoundRHS = getSignExtendExpr(FoundRHS, LHS->getType());
+    } else {
+      FoundLHS = getZeroExtendExpr(FoundLHS, LHS->getType());
+      FoundRHS = getZeroExtendExpr(FoundRHS, LHS->getType());
+    }
+  }
+
+  // Canonicalize the query to match the way instcombine will have
+  // canonicalized the comparison.
+  if (SimplifyICmpOperands(Pred, LHS, RHS))
+    if (LHS == RHS)
+      return CmpInst::isTrueWhenEqual(Pred);
+  if (SimplifyICmpOperands(FoundPred, FoundLHS, FoundRHS))
+    if (FoundLHS == FoundRHS)
+      return CmpInst::isFalseWhenEqual(FoundPred);
+
+  // Check to see if we can make the LHS or RHS match.
+  if (LHS == FoundRHS || RHS == FoundLHS) {
+    if (isa<SCEVConstant>(RHS)) {
+      std::swap(FoundLHS, FoundRHS);
+      FoundPred = ICmpInst::getSwappedPredicate(FoundPred);
+    } else {
+      std::swap(LHS, RHS);
+      Pred = ICmpInst::getSwappedPredicate(Pred);
+    }
+  }
+
+  // Check whether the found predicate is the same as the desired predicate.
+  if (FoundPred == Pred)
+    return isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS);
+
+  // Check whether swapping the found predicate makes it the same as the
+  // desired predicate.
+  if (ICmpInst::getSwappedPredicate(FoundPred) == Pred) {
+    if (isa<SCEVConstant>(RHS))
+      return isImpliedCondOperands(Pred, LHS, RHS, FoundRHS, FoundLHS);
+    else
+      return isImpliedCondOperands(ICmpInst::getSwappedPredicate(Pred),
+                                   RHS, LHS, FoundLHS, FoundRHS);
+  }
+
+  // Check whether the actual condition is beyond sufficient.
+  if (FoundPred == ICmpInst::ICMP_EQ)
+    if (ICmpInst::isTrueWhenEqual(Pred))
+      if (isImpliedCondOperands(Pred, LHS, RHS, FoundLHS, FoundRHS))
+        return true;
+  if (Pred == ICmpInst::ICMP_NE)
+    if (!ICmpInst::isTrueWhenEqual(FoundPred))
+      if (isImpliedCondOperands(FoundPred, LHS, RHS, FoundLHS, FoundRHS))
+        return true;
+
+  // Otherwise assume the worst.
+  return false;
+}
+
+/// isImpliedCondOperands - Test whether the condition described by Pred,
+/// LHS, and RHS is true whenever the condition described by Pred, FoundLHS,
+/// and FoundRHS is true.
+bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred,
+                                            const SCEV *LHS, const SCEV *RHS,
+                                            const SCEV *FoundLHS,
+                                            const SCEV *FoundRHS) {
+  return isImpliedCondOperandsHelper(Pred, LHS, RHS,
+                                     FoundLHS, FoundRHS) ||
+         // ~x < ~y --> x > y
+         isImpliedCondOperandsHelper(Pred, LHS, RHS,
+                                     getNotSCEV(FoundRHS),
+                                     getNotSCEV(FoundLHS));
+}
+
+/// isImpliedCondOperandsHelper - Test whether the condition described by
+/// Pred, LHS, and RHS is true whenever the condition described by Pred,
+/// FoundLHS, and FoundRHS is true.
+bool
+ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
+                                             const SCEV *LHS, const SCEV *RHS,
+                                             const SCEV *FoundLHS,
+                                             const SCEV *FoundRHS) {
+  switch (Pred) {
+  default: llvm_unreachable("Unexpected ICmpInst::Predicate value!");
+  case ICmpInst::ICMP_EQ:
+  case ICmpInst::ICMP_NE:
+    if (HasSameValue(LHS, FoundLHS) && HasSameValue(RHS, FoundRHS))
+      return true;
+    break;
+  case ICmpInst::ICMP_SLT:
+  case ICmpInst::ICMP_SLE:
+    if (isKnownPredicateWithRanges(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
+        isKnownPredicateWithRanges(ICmpInst::ICMP_SGE, RHS, FoundRHS))
+      return true;
+    break;
+  case ICmpInst::ICMP_SGT:
+  case ICmpInst::ICMP_SGE:
+    if (isKnownPredicateWithRanges(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
+        isKnownPredicateWithRanges(ICmpInst::ICMP_SLE, RHS, FoundRHS))
+      return true;
+    break;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_ULE:
+    if (isKnownPredicateWithRanges(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
+        isKnownPredicateWithRanges(ICmpInst::ICMP_UGE, RHS, FoundRHS))
+      return true;
+    break;
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_UGE:
+    if (isKnownPredicateWithRanges(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
+        isKnownPredicateWithRanges(ICmpInst::ICMP_ULE, RHS, FoundRHS))
+      return true;
+    break;
+  }
+
+  return false;
+}
+
+/// getBECount - Subtract the end and start values and divide by the step,
+/// rounding up, to get the number of times the backedge is executed. Return
+/// CouldNotCompute if an intermediate computation overflows.
+const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
+                                        const SCEV *End,
+                                        const SCEV *Step,
+                                        bool NoWrap) {
+  assert(!isKnownNegative(Step) &&
+         "This code doesn't handle negative strides yet!");
+
+  Type *Ty = Start->getType();
+
+  // When Start == End, we have an exact BECount == 0. Short-circuit this case
+  // here because SCEV may not be able to determine that the unsigned division
+  // after rounding is zero.
+  if (Start == End)
+    return getConstant(Ty, 0);
+
+  const SCEV *NegOne = getConstant(Ty, (uint64_t)-1);
+  const SCEV *Diff = getMinusSCEV(End, Start);
+  const SCEV *RoundUp = getAddExpr(Step, NegOne);
+
+  // Add an adjustment to the difference between End and Start so that
+  // the division will effectively round up.
+  const SCEV *Add = getAddExpr(Diff, RoundUp);
+
+  if (!NoWrap) {
+    // Check Add for unsigned overflow.
+    // TODO: More sophisticated things could be done here.
+    Type *WideTy = IntegerType::get(getContext(),
+                                          getTypeSizeInBits(Ty) + 1);
+    const SCEV *EDiff = getZeroExtendExpr(Diff, WideTy);
+    const SCEV *ERoundUp = getZeroExtendExpr(RoundUp, WideTy);
+    const SCEV *OperandExtendedAdd = getAddExpr(EDiff, ERoundUp);
+    if (getZeroExtendExpr(Add, WideTy) != OperandExtendedAdd)
+      return getCouldNotCompute();
+  }
+
+  return getUDivExpr(Add, Step);
+}
+
+/// HowManyLessThans - Return the number of times a backedge containing the
+/// specified less-than comparison will execute.  If not computable, return
+/// CouldNotCompute.
+ScalarEvolution::ExitLimit
+ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
+                                  const Loop *L, bool isSigned) {
+  // Only handle:  "ADDREC < LoopInvariant".
+  if (!isLoopInvariant(RHS, L)) return getCouldNotCompute();
+
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(LHS);
+  if (!AddRec || AddRec->getLoop() != L)
+    return getCouldNotCompute();
+
+  // Check to see if we have a flag which makes analysis easy.
+  bool NoWrap = isSigned ?
+    AddRec->getNoWrapFlags((SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNW)) :
+    AddRec->getNoWrapFlags((SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNW));
+
+  if (AddRec->isAffine()) {
+    unsigned BitWidth = getTypeSizeInBits(AddRec->getType());
+    const SCEV *Step = AddRec->getStepRecurrence(*this);
+
+    if (Step->isZero())
+      return getCouldNotCompute();
+    if (Step->isOne()) {
+      // With unit stride, the iteration never steps past the limit value.
+    } else if (isKnownPositive(Step)) {
+      // Test whether a positive iteration can step past the limit
+      // value and past the maximum value for its type in a single step.
+      // Note that it's not sufficient to check NoWrap here, because even
+      // though the value after a wrap is undefined, it's not undefined
+      // behavior, so if wrap does occur, the loop could either terminate or
+      // loop infinitely, but in either case, the loop is guaranteed to
+      // iterate at least until the iteration where the wrapping occurs.
+      const SCEV *One = getConstant(Step->getType(), 1);
+      if (isSigned) {
+        APInt Max = APInt::getSignedMaxValue(BitWidth);
+        if ((Max - getSignedRange(getMinusSCEV(Step, One)).getSignedMax())
+              .slt(getSignedRange(RHS).getSignedMax()))
+          return getCouldNotCompute();
+      } else {
+        APInt Max = APInt::getMaxValue(BitWidth);
+        if ((Max - getUnsignedRange(getMinusSCEV(Step, One)).getUnsignedMax())
+              .ult(getUnsignedRange(RHS).getUnsignedMax()))
+          return getCouldNotCompute();
+      }
+    } else
+      // TODO: Handle negative strides here and below.
+      return getCouldNotCompute();
+
+    // We know the LHS is of the form {n,+,s} and the RHS is some loop-invariant
+    // m.  So, we count the number of iterations in which {n,+,s} < m is true.
+    // Note that we cannot simply return max(m-n,0)/s because it's not safe to
+    // treat m-n as signed nor unsigned due to overflow possibility.
+
+    // First, we get the value of the LHS in the first iteration: n
+    const SCEV *Start = AddRec->getOperand(0);
+
+    // Determine the minimum constant start value.
+    const SCEV *MinStart = getConstant(isSigned ?
+      getSignedRange(Start).getSignedMin() :
+      getUnsignedRange(Start).getUnsignedMin());
+
+    // If we know that the condition is true in order to enter the loop,
+    // then we know that it will run exactly (m-n)/s times. Otherwise, we
+    // only know that it will execute (max(m,n)-n)/s times. In both cases,
+    // the division must round up.
+    const SCEV *End = RHS;
+    if (!isLoopEntryGuardedByCond(L,
+                                  isSigned ? ICmpInst::ICMP_SLT :
+                                             ICmpInst::ICMP_ULT,
+                                  getMinusSCEV(Start, Step), RHS))
+      End = isSigned ? getSMaxExpr(RHS, Start)
+                     : getUMaxExpr(RHS, Start);
+
+    // Determine the maximum constant end value.
+    const SCEV *MaxEnd = getConstant(isSigned ?
+      getSignedRange(End).getSignedMax() :
+      getUnsignedRange(End).getUnsignedMax());
+
+    // If MaxEnd is within a step of the maximum integer value in its type,
+    // adjust it down to the minimum value which would produce the same effect.
+    // This allows the subsequent ceiling division of (N+(step-1))/step to
+    // compute the correct value.
+    const SCEV *StepMinusOne = getMinusSCEV(Step,
+                                            getConstant(Step->getType(), 1));
+    MaxEnd = isSigned ?
+      getSMinExpr(MaxEnd,
+                  getMinusSCEV(getConstant(APInt::getSignedMaxValue(BitWidth)),
+                               StepMinusOne)) :
+      getUMinExpr(MaxEnd,
+                  getMinusSCEV(getConstant(APInt::getMaxValue(BitWidth)),
+                               StepMinusOne));
+
+    // Finally, we subtract these two values and divide, rounding up, to get
+    // the number of times the backedge is executed.
+    const SCEV *BECount = getBECount(Start, End, Step, NoWrap);
+
+    // The maximum backedge count is similar, except using the minimum start
+    // value and the maximum end value.
+    // If we already have an exact constant BECount, use it instead.
+    const SCEV *MaxBECount = isa<SCEVConstant>(BECount) ? BECount
+      : getBECount(MinStart, MaxEnd, Step, NoWrap);
+
+    // If the stride is nonconstant, and NoWrap == true, then
+    // getBECount(MinStart, MaxEnd) may not compute. This would result in an
+    // exact BECount and invalid MaxBECount, which should be avoided to catch
+    // more optimization opportunities.
+    if (isa<SCEVCouldNotCompute>(MaxBECount))
+      MaxBECount = BECount;
+
+    return ExitLimit(BECount, MaxBECount);
+  }
+
+  return getCouldNotCompute();
+}
+
+/// getNumIterationsInRange - Return the number of iterations of this loop that
+/// produce values in the specified constant range.  Another way of looking at
+/// this is that it returns the first iteration number where the value is not in
+/// the condition, thus computing the exit count. If the iteration count can't
+/// be computed, an instance of SCEVCouldNotCompute is returned.
+const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
+                                                    ScalarEvolution &SE) const {
+  if (Range.isFullSet())  // Infinite loop.
+    return SE.getCouldNotCompute();
+
+  // If the start is a non-zero constant, shift the range to simplify things.
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(getStart()))
+    if (!SC->getValue()->isZero()) {
+      SmallVector<const SCEV *, 4> Operands(op_begin(), op_end());
+      Operands[0] = SE.getConstant(SC->getType(), 0);
+      const SCEV *Shifted = SE.getAddRecExpr(Operands, getLoop(),
+                                             getNoWrapFlags(FlagNW));
+      if (const SCEVAddRecExpr *ShiftedAddRec =
+            dyn_cast<SCEVAddRecExpr>(Shifted))
+        return ShiftedAddRec->getNumIterationsInRange(
+                           Range.subtract(SC->getValue()->getValue()), SE);
+      // This is strange and shouldn't happen.
+      return SE.getCouldNotCompute();
+    }
+
+  // The only time we can solve this is when we have all constant indices.
+  // Otherwise, we cannot determine the overflow conditions.
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    if (!isa<SCEVConstant>(getOperand(i)))
+      return SE.getCouldNotCompute();
+
+
+  // Okay at this point we know that all elements of the chrec are constants and
+  // that the start element is zero.
+
+  // First check to see if the range contains zero.  If not, the first
+  // iteration exits.
+  unsigned BitWidth = SE.getTypeSizeInBits(getType());
+  if (!Range.contains(APInt(BitWidth, 0)))
+    return SE.getConstant(getType(), 0);
+
+  if (isAffine()) {
+    // If this is an affine expression then we have this situation:
+    //   Solve {0,+,A} in Range  ===  Ax in Range
+
+    // We know that zero is in the range.  If A is positive then we know that
+    // the upper value of the range must be the first possible exit value.
+    // If A is negative then the lower of the range is the last possible loop
+    // value.  Also note that we already checked for a full range.
+    APInt One(BitWidth,1);
+    APInt A     = cast<SCEVConstant>(getOperand(1))->getValue()->getValue();
+    APInt End = A.sge(One) ? (Range.getUpper() - One) : Range.getLower();
+
+    // The exit value should be (End+A)/A.
+    APInt ExitVal = (End + A).udiv(A);
+    ConstantInt *ExitValue = ConstantInt::get(SE.getContext(), ExitVal);
+
+    // Evaluate at the exit value.  If we really did fall out of the valid
+    // range, then we computed our trip count, otherwise wrap around or other
+    // things must have happened.
+    ConstantInt *Val = EvaluateConstantChrecAtConstant(this, ExitValue, SE);
+    if (Range.contains(Val->getValue()))
+      return SE.getCouldNotCompute();  // Something strange happened
+
+    // Ensure that the previous value is in the range.  This is a sanity check.
+    assert(Range.contains(
+           EvaluateConstantChrecAtConstant(this,
+           ConstantInt::get(SE.getContext(), ExitVal - One), SE)->getValue()) &&
+           "Linear scev computation is off in a bad way!");
+    return SE.getConstant(ExitValue);
+  } else if (isQuadratic()) {
+    // If this is a quadratic (3-term) AddRec {L,+,M,+,N}, find the roots of the
+    // quadratic equation to solve it.  To do this, we must frame our problem in
+    // terms of figuring out when zero is crossed, instead of when
+    // Range.getUpper() is crossed.
+    SmallVector<const SCEV *, 4> NewOps(op_begin(), op_end());
+    NewOps[0] = SE.getNegativeSCEV(SE.getConstant(Range.getUpper()));
+    const SCEV *NewAddRec = SE.getAddRecExpr(NewOps, getLoop(),
+                                             // getNoWrapFlags(FlagNW)
+                                             FlagAnyWrap);
+
+    // Next, solve the constructed addrec
+    std::pair<const SCEV *,const SCEV *> Roots =
+      SolveQuadraticEquation(cast<SCEVAddRecExpr>(NewAddRec), SE);
+    const SCEVConstant *R1 = dyn_cast<SCEVConstant>(Roots.first);
+    const SCEVConstant *R2 = dyn_cast<SCEVConstant>(Roots.second);
+    if (R1) {
+      // Pick the smallest positive root value.
+      if (ConstantInt *CB =
+          dyn_cast<ConstantInt>(ConstantExpr::getICmp(ICmpInst::ICMP_ULT,
+                         R1->getValue(), R2->getValue()))) {
+        if (CB->getZExtValue() == false)
+          std::swap(R1, R2);   // R1 is the minimum root now.
+
+        // Make sure the root is not off by one.  The returned iteration should
+        // not be in the range, but the previous one should be.  When solving
+        // for "X*X < 5", for example, we should not return a root of 2.
+        ConstantInt *R1Val = EvaluateConstantChrecAtConstant(this,
+                                                             R1->getValue(),
+                                                             SE);
+        if (Range.contains(R1Val->getValue())) {
+          // The next iteration must be out of the range...
+          ConstantInt *NextVal =
+                ConstantInt::get(SE.getContext(), R1->getValue()->getValue()+1);
+
+          R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
+          if (!Range.contains(R1Val->getValue()))
+            return SE.getConstant(NextVal);
+          return SE.getCouldNotCompute();  // Something strange happened
+        }
+
+        // If R1 was not in the range, then it is a good return value.  Make
+        // sure that R1-1 WAS in the range though, just in case.
+        ConstantInt *NextVal =
+               ConstantInt::get(SE.getContext(), R1->getValue()->getValue()-1);
+        R1Val = EvaluateConstantChrecAtConstant(this, NextVal, SE);
+        if (Range.contains(R1Val->getValue()))
+          return R1;
+        return SE.getCouldNotCompute();  // Something strange happened
+      }
+    }
+  }
+
+  return SE.getCouldNotCompute();
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//                   SCEVCallbackVH Class Implementation
+//===----------------------------------------------------------------------===//
+
+void ScalarEvolution::SCEVCallbackVH::deleted() {
+  assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
+  if (PHINode *PN = dyn_cast<PHINode>(getValPtr()))
+    SE->ConstantEvolutionLoopExitValue.erase(PN);
+  SE->ValueExprMap.erase(getValPtr());
+  // this now dangles!
+}
+
+void ScalarEvolution::SCEVCallbackVH::allUsesReplacedWith(Value *V) {
+  assert(SE && "SCEVCallbackVH called with a null ScalarEvolution!");
+
+  // Forget all the expressions associated with users of the old value,
+  // so that future queries will recompute the expressions using the new
+  // value.
+  Value *Old = getValPtr();
+  SmallVector<User *, 16> Worklist;
+  SmallPtrSet<User *, 8> Visited;
+  for (Value::use_iterator UI = Old->use_begin(), UE = Old->use_end();
+       UI != UE; ++UI)
+    Worklist.push_back(*UI);
+  while (!Worklist.empty()) {
+    User *U = Worklist.pop_back_val();
+    // Deleting the Old value will cause this to dangle. Postpone
+    // that until everything else is done.
+    if (U == Old)
+      continue;
+    if (!Visited.insert(U))
+      continue;
+    if (PHINode *PN = dyn_cast<PHINode>(U))
+      SE->ConstantEvolutionLoopExitValue.erase(PN);
+    SE->ValueExprMap.erase(U);
+    for (Value::use_iterator UI = U->use_begin(), UE = U->use_end();
+         UI != UE; ++UI)
+      Worklist.push_back(*UI);
+  }
+  // Delete the Old value.
+  if (PHINode *PN = dyn_cast<PHINode>(Old))
+    SE->ConstantEvolutionLoopExitValue.erase(PN);
+  SE->ValueExprMap.erase(Old);
+  // this now dangles!
+}
+
+ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se)
+  : CallbackVH(V), SE(se) {}
+
+//===----------------------------------------------------------------------===//
+//                   ScalarEvolution Class Implementation
+//===----------------------------------------------------------------------===//
+
+ScalarEvolution::ScalarEvolution()
+  : FunctionPass(ID), FirstUnknown(0) {
+  initializeScalarEvolutionPass(*PassRegistry::getPassRegistry());
+}
+
+bool ScalarEvolution::runOnFunction(Function &F) {
+  this->F = &F;
+  LI = &getAnalysis<LoopInfo>();
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+  DT = &getAnalysis<DominatorTree>();
+  return false;
+}
+
+void ScalarEvolution::releaseMemory() {
+  // Iterate through all the SCEVUnknown instances and call their
+  // destructors, so that they release their references to their values.
+  for (SCEVUnknown *U = FirstUnknown; U; U = U->Next)
+    U->~SCEVUnknown();
+  FirstUnknown = 0;
+
+  ValueExprMap.clear();
+
+  // Free any extra memory created for ExitNotTakenInfo in the unlikely event
+  // that a loop had multiple computable exits.
+  for (DenseMap<const Loop*, BackedgeTakenInfo>::iterator I =
+         BackedgeTakenCounts.begin(), E = BackedgeTakenCounts.end();
+       I != E; ++I) {
+    I->second.clear();
+  }
+
+  assert(PendingLoopPredicates.empty() && "isImpliedCond garbage");
+
+  BackedgeTakenCounts.clear();
+  ConstantEvolutionLoopExitValue.clear();
+  ValuesAtScopes.clear();
+  LoopDispositions.clear();
+  BlockDispositions.clear();
+  UnsignedRanges.clear();
+  SignedRanges.clear();
+  UniqueSCEVs.clear();
+  SCEVAllocator.Reset();
+}
+
+void ScalarEvolution::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequiredTransitive<LoopInfo>();
+  AU.addRequiredTransitive<DominatorTree>();
+  AU.addRequired<TargetLibraryInfo>();
+}
+
+bool ScalarEvolution::hasLoopInvariantBackedgeTakenCount(const Loop *L) {
+  return !isa<SCEVCouldNotCompute>(getBackedgeTakenCount(L));
+}
+
+static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
+                          const Loop *L) {
+  // Print all inner loops first
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    PrintLoopInfo(OS, SE, *I);
+
+  OS << "Loop ";
+  WriteAsOperand(OS, L->getHeader(), /*PrintType=*/false);
+  OS << ": ";
+
+  SmallVector<BasicBlock *, 8> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  if (ExitBlocks.size() != 1)
+    OS << "<multiple exits> ";
+
+  if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+    OS << "backedge-taken count is " << *SE->getBackedgeTakenCount(L);
+  } else {
+    OS << "Unpredictable backedge-taken count. ";
+  }
+
+  OS << "\n"
+        "Loop ";
+  WriteAsOperand(OS, L->getHeader(), /*PrintType=*/false);
+  OS << ": ";
+
+  if (!isa<SCEVCouldNotCompute>(SE->getMaxBackedgeTakenCount(L))) {
+    OS << "max backedge-taken count is " << *SE->getMaxBackedgeTakenCount(L);
+  } else {
+    OS << "Unpredictable max backedge-taken count. ";
+  }
+
+  OS << "\n";
+}
+
+void ScalarEvolution::print(raw_ostream &OS, const Module *) const {
+  // ScalarEvolution's implementation of the print method is to print
+  // out SCEV values of all instructions that are interesting. Doing
+  // this potentially causes it to create new SCEV objects though,
+  // which technically conflicts with the const qualifier. This isn't
+  // observable from outside the class though, so casting away the
+  // const isn't dangerous.
+  ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
+
+  OS << "Classifying expressions for: ";
+  WriteAsOperand(OS, F, /*PrintType=*/false);
+  OS << "\n";
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (isSCEVable(I->getType()) && !isa<CmpInst>(*I)) {
+      OS << *I << '\n';
+      OS << "  -->  ";
+      const SCEV *SV = SE.getSCEV(&*I);
+      SV->print(OS);
+
+      const Loop *L = LI->getLoopFor((*I).getParent());
+
+      const SCEV *AtUse = SE.getSCEVAtScope(SV, L);
+      if (AtUse != SV) {
+        OS << "  -->  ";
+        AtUse->print(OS);
+      }
+
+      if (L) {
+        OS << "\t\t" "Exits: ";
+        const SCEV *ExitValue = SE.getSCEVAtScope(SV, L->getParentLoop());
+        if (!SE.isLoopInvariant(ExitValue, L)) {
+          OS << "<<Unknown>>";
+        } else {
+          OS << *ExitValue;
+        }
+      }
+
+      OS << "\n";
+    }
+
+  OS << "Determining loop execution counts for: ";
+  WriteAsOperand(OS, F, /*PrintType=*/false);
+  OS << "\n";
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+    PrintLoopInfo(OS, &SE, *I);
+}
+
+ScalarEvolution::LoopDisposition
+ScalarEvolution::getLoopDisposition(const SCEV *S, const Loop *L) {
+  std::map<const Loop *, LoopDisposition> &Values = LoopDispositions[S];
+  std::pair<std::map<const Loop *, LoopDisposition>::iterator, bool> Pair =
+    Values.insert(std::make_pair(L, LoopVariant));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  LoopDisposition D = computeLoopDisposition(S, L);
+  return LoopDispositions[S][L] = D;
+}
+
+ScalarEvolution::LoopDisposition
+ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) {
+  switch (S->getSCEVType()) {
+  case scConstant:
+    return LoopInvariant;
+  case scTruncate:
+  case scZeroExtend:
+  case scSignExtend:
+    return getLoopDisposition(cast<SCEVCastExpr>(S)->getOperand(), L);
+  case scAddRecExpr: {
+    const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(S);
+
+    // If L is the addrec's loop, it's computable.
+    if (AR->getLoop() == L)
+      return LoopComputable;
+
+    // Add recurrences are never invariant in the function-body (null loop).
+    if (!L)
+      return LoopVariant;
+
+    // This recurrence is variant w.r.t. L if L contains AR's loop.
+    if (L->contains(AR->getLoop()))
+      return LoopVariant;
+
+    // This recurrence is invariant w.r.t. L if AR's loop contains L.
+    if (AR->getLoop()->contains(L))
+      return LoopInvariant;
+
+    // This recurrence is variant w.r.t. L if any of its operands
+    // are variant.
+    for (SCEVAddRecExpr::op_iterator I = AR->op_begin(), E = AR->op_end();
+         I != E; ++I)
+      if (!isLoopInvariant(*I, L))
+        return LoopVariant;
+
+    // Otherwise it's loop-invariant.
+    return LoopInvariant;
+  }
+  case scAddExpr:
+  case scMulExpr:
+  case scUMaxExpr:
+  case scSMaxExpr: {
+    const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S);
+    bool HasVarying = false;
+    for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
+         I != E; ++I) {
+      LoopDisposition D = getLoopDisposition(*I, L);
+      if (D == LoopVariant)
+        return LoopVariant;
+      if (D == LoopComputable)
+        HasVarying = true;
+    }
+    return HasVarying ? LoopComputable : LoopInvariant;
+  }
+  case scUDivExpr: {
+    const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(S);
+    LoopDisposition LD = getLoopDisposition(UDiv->getLHS(), L);
+    if (LD == LoopVariant)
+      return LoopVariant;
+    LoopDisposition RD = getLoopDisposition(UDiv->getRHS(), L);
+    if (RD == LoopVariant)
+      return LoopVariant;
+    return (LD == LoopInvariant && RD == LoopInvariant) ?
+           LoopInvariant : LoopComputable;
+  }
+  case scUnknown:
+    // All non-instruction values are loop invariant.  All instructions are loop
+    // invariant if they are not contained in the specified loop.
+    // Instructions are never considered invariant in the function body
+    // (null loop) because they are defined within the "loop".
+    if (Instruction *I = dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue()))
+      return (L && !L->contains(I)) ? LoopInvariant : LoopVariant;
+    return LoopInvariant;
+  case scCouldNotCompute:
+    llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
+  default: llvm_unreachable("Unknown SCEV kind!");
+  }
+}
+
+bool ScalarEvolution::isLoopInvariant(const SCEV *S, const Loop *L) {
+  return getLoopDisposition(S, L) == LoopInvariant;
+}
+
+bool ScalarEvolution::hasComputableLoopEvolution(const SCEV *S, const Loop *L) {
+  return getLoopDisposition(S, L) == LoopComputable;
+}
+
+ScalarEvolution::BlockDisposition
+ScalarEvolution::getBlockDisposition(const SCEV *S, const BasicBlock *BB) {
+  std::map<const BasicBlock *, BlockDisposition> &Values = BlockDispositions[S];
+  std::pair<std::map<const BasicBlock *, BlockDisposition>::iterator, bool>
+    Pair = Values.insert(std::make_pair(BB, DoesNotDominateBlock));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  BlockDisposition D = computeBlockDisposition(S, BB);
+  return BlockDispositions[S][BB] = D;
+}
+
+ScalarEvolution::BlockDisposition
+ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) {
+  switch (S->getSCEVType()) {
+  case scConstant:
+    return ProperlyDominatesBlock;
+  case scTruncate:
+  case scZeroExtend:
+  case scSignExtend:
+    return getBlockDisposition(cast<SCEVCastExpr>(S)->getOperand(), BB);
+  case scAddRecExpr: {
+    // This uses a "dominates" query instead of "properly dominates" query
+    // to test for proper dominance too, because the instruction which
+    // produces the addrec's value is a PHI, and a PHI effectively properly
+    // dominates its entire containing block.
+    const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(S);
+    if (!DT->dominates(AR->getLoop()->getHeader(), BB))
+      return DoesNotDominateBlock;
+  }
+  // FALL THROUGH into SCEVNAryExpr handling.
+  case scAddExpr:
+  case scMulExpr:
+  case scUMaxExpr:
+  case scSMaxExpr: {
+    const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S);
+    bool Proper = true;
+    for (SCEVNAryExpr::op_iterator I = NAry->op_begin(), E = NAry->op_end();
+         I != E; ++I) {
+      BlockDisposition D = getBlockDisposition(*I, BB);
+      if (D == DoesNotDominateBlock)
+        return DoesNotDominateBlock;
+      if (D == DominatesBlock)
+        Proper = false;
+    }
+    return Proper ? ProperlyDominatesBlock : DominatesBlock;
+  }
+  case scUDivExpr: {
+    const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(S);
+    const SCEV *LHS = UDiv->getLHS(), *RHS = UDiv->getRHS();
+    BlockDisposition LD = getBlockDisposition(LHS, BB);
+    if (LD == DoesNotDominateBlock)
+      return DoesNotDominateBlock;
+    BlockDisposition RD = getBlockDisposition(RHS, BB);
+    if (RD == DoesNotDominateBlock)
+      return DoesNotDominateBlock;
+    return (LD == ProperlyDominatesBlock && RD == ProperlyDominatesBlock) ?
+      ProperlyDominatesBlock : DominatesBlock;
+  }
+  case scUnknown:
+    if (Instruction *I =
+          dyn_cast<Instruction>(cast<SCEVUnknown>(S)->getValue())) {
+      if (I->getParent() == BB)
+        return DominatesBlock;
+      if (DT->properlyDominates(I->getParent(), BB))
+        return ProperlyDominatesBlock;
+      return DoesNotDominateBlock;
+    }
+    return ProperlyDominatesBlock;
+  case scCouldNotCompute:
+    llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
+  default:
+    llvm_unreachable("Unknown SCEV kind!");
+  }
+}
+
+bool ScalarEvolution::dominates(const SCEV *S, const BasicBlock *BB) {
+  return getBlockDisposition(S, BB) >= DominatesBlock;
+}
+
+bool ScalarEvolution::properlyDominates(const SCEV *S, const BasicBlock *BB) {
+  return getBlockDisposition(S, BB) == ProperlyDominatesBlock;
+}
+
+namespace {
+// Search for a SCEV expression node within an expression tree.
+// Implements SCEVTraversal::Visitor.
+struct SCEVSearch {
+  const SCEV *Node;
+  bool IsFound;
+
+  SCEVSearch(const SCEV *N): Node(N), IsFound(false) {}
+
+  bool follow(const SCEV *S) {
+    IsFound |= (S == Node);
+    return !IsFound;
+  }
+  bool isDone() const { return IsFound; }
+};
+}
+
+bool ScalarEvolution::hasOperand(const SCEV *S, const SCEV *Op) const {
+  SCEVSearch Search(Op);
+  visitAll(S, Search);
+  return Search.IsFound;
+}
+
+void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
+  ValuesAtScopes.erase(S);
+  LoopDispositions.erase(S);
+  BlockDispositions.erase(S);
+  UnsignedRanges.erase(S);
+  SignedRanges.erase(S);
+
+  for (DenseMap<const Loop*, BackedgeTakenInfo>::iterator I =
+         BackedgeTakenCounts.begin(), E = BackedgeTakenCounts.end(); I != E; ) {
+    BackedgeTakenInfo &BEInfo = I->second;
+    if (BEInfo.hasOperand(S, this)) {
+      BEInfo.clear();
+      BackedgeTakenCounts.erase(I++);
+    }
+    else
+      ++I;
+  }
+}
+
+typedef DenseMap<const Loop *, std::string> VerifyMap;
+
+/// replaceSubString - Replaces all occurences of From in Str with To.
+static void replaceSubString(std::string &Str, StringRef From, StringRef To) {
+  size_t Pos = 0;
+  while ((Pos = Str.find(From, Pos)) != std::string::npos) {
+    Str.replace(Pos, From.size(), To.data(), To.size());
+    Pos += To.size();
+  }
+}
+
+/// getLoopBackedgeTakenCounts - Helper method for verifyAnalysis.
+static void
+getLoopBackedgeTakenCounts(Loop *L, VerifyMap &Map, ScalarEvolution &SE) {
+  for (Loop::reverse_iterator I = L->rbegin(), E = L->rend(); I != E; ++I) {
+    getLoopBackedgeTakenCounts(*I, Map, SE); // recurse.
+
+    std::string &S = Map[L];
+    if (S.empty()) {
+      raw_string_ostream OS(S);
+      SE.getBackedgeTakenCount(L)->print(OS);
+
+      // false and 0 are semantically equivalent. This can happen in dead loops.
+      replaceSubString(OS.str(), "false", "0");
+      // Remove wrap flags, their use in SCEV is highly fragile.
+      // FIXME: Remove this when SCEV gets smarter about them.
+      replaceSubString(OS.str(), "<nw>", "");
+      replaceSubString(OS.str(), "<nsw>", "");
+      replaceSubString(OS.str(), "<nuw>", "");
+    }
+  }
+}
+
+void ScalarEvolution::verifyAnalysis() const {
+  if (!VerifySCEV)
+    return;
+
+  ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
+
+  // Gather stringified backedge taken counts for all loops using SCEV's caches.
+  // FIXME: It would be much better to store actual values instead of strings,
+  //        but SCEV pointers will change if we drop the caches.
+  VerifyMap BackedgeDumpsOld, BackedgeDumpsNew;
+  for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I)
+    getLoopBackedgeTakenCounts(*I, BackedgeDumpsOld, SE);
+
+  // Gather stringified backedge taken counts for all loops without using
+  // SCEV's caches.
+  SE.releaseMemory();
+  for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I)
+    getLoopBackedgeTakenCounts(*I, BackedgeDumpsNew, SE);
+
+  // Now compare whether they're the same with and without caches. This allows
+  // verifying that no pass changed the cache.
+  assert(BackedgeDumpsOld.size() == BackedgeDumpsNew.size() &&
+         "New loops suddenly appeared!");
+
+  for (VerifyMap::iterator OldI = BackedgeDumpsOld.begin(),
+                           OldE = BackedgeDumpsOld.end(),
+                           NewI = BackedgeDumpsNew.begin();
+       OldI != OldE; ++OldI, ++NewI) {
+    assert(OldI->first == NewI->first && "Loop order changed!");
+
+    // Compare the stringified SCEVs. We don't care if undef backedgetaken count
+    // changes.
+    // FIXME: We currently ignore SCEV changes from/to CouldNotCompute. This
+    // means that a pass is buggy or SCEV has to learn a new pattern but is
+    // usually not harmful.
+    if (OldI->second != NewI->second &&
+        OldI->second.find("undef") == std::string::npos &&
+        NewI->second.find("undef") == std::string::npos &&
+        OldI->second != "***COULDNOTCOMPUTE***" &&
+        NewI->second != "***COULDNOTCOMPUTE***") {
+      dbgs() << "SCEVValidator: SCEV for loop '"
+             << OldI->first->getHeader()->getName()
+             << "' changed from '" << OldI->second
+             << "' to '" << NewI->second << "'!\n";
+      std::abort();
+    }
+  }
+
+  // TODO: Verify more things.
+}
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
new file mode 100644
index 000000000000..79c5f0deb03b
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
@@ -0,0 +1,173 @@
+//===- ScalarEvolutionAliasAnalysis.cpp - SCEV-based Alias Analysis -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ScalarEvolutionAliasAnalysis pass, which implements a
+// simple alias analysis implemented in terms of ScalarEvolution queries.
+//
+// This differs from traditional loop dependence analysis in that it tests
+// for dependencies within a single iteration of a loop, rather than
+// dependencies between different iterations.
+//
+// ScalarEvolution has a more complete understanding of pointer arithmetic
+// than BasicAliasAnalysis' collection of ad-hoc analyses.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+namespace {
+  /// ScalarEvolutionAliasAnalysis - This is a simple alias analysis
+  /// implementation that uses ScalarEvolution to answer queries.
+  class ScalarEvolutionAliasAnalysis : public FunctionPass,
+                                       public AliasAnalysis {
+    ScalarEvolution *SE;
+
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    ScalarEvolutionAliasAnalysis() : FunctionPass(ID), SE(0) {
+      initializeScalarEvolutionAliasAnalysisPass(
+        *PassRegistry::getPassRegistry());
+    }
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
+      if (PI == &AliasAnalysis::ID)
+        return (AliasAnalysis*)this;
+      return this;
+    }
+
+  private:
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool runOnFunction(Function &F);
+    virtual AliasResult alias(const Location &LocA, const Location &LocB);
+
+    Value *GetBaseValue(const SCEV *S);
+  };
+}  // End of anonymous namespace
+
+// Register this pass...
+char ScalarEvolutionAliasAnalysis::ID = 0;
+INITIALIZE_AG_PASS_BEGIN(ScalarEvolutionAliasAnalysis, AliasAnalysis, "scev-aa",
+                   "ScalarEvolution-based Alias Analysis", false, true, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_AG_PASS_END(ScalarEvolutionAliasAnalysis, AliasAnalysis, "scev-aa",
+                    "ScalarEvolution-based Alias Analysis", false, true, false)
+
+FunctionPass *llvm::createScalarEvolutionAliasAnalysisPass() {
+  return new ScalarEvolutionAliasAnalysis();
+}
+
+void
+ScalarEvolutionAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequiredTransitive<ScalarEvolution>();
+  AU.setPreservesAll();
+  AliasAnalysis::getAnalysisUsage(AU);
+}
+
+bool
+ScalarEvolutionAliasAnalysis::runOnFunction(Function &F) {
+  InitializeAliasAnalysis(this);
+  SE = &getAnalysis<ScalarEvolution>();
+  return false;
+}
+
+/// GetBaseValue - Given an expression, try to find a
+/// base value. Return null is none was found.
+Value *
+ScalarEvolutionAliasAnalysis::GetBaseValue(const SCEV *S) {
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    // In an addrec, assume that the base will be in the start, rather
+    // than the step.
+    return GetBaseValue(AR->getStart());
+  } else if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) {
+    // If there's a pointer operand, it'll be sorted at the end of the list.
+    const SCEV *Last = A->getOperand(A->getNumOperands()-1);
+    if (Last->getType()->isPointerTy())
+      return GetBaseValue(Last);
+  } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    // This is a leaf node.
+    return U->getValue();
+  }
+  // No Identified object found.
+  return 0;
+}
+
+AliasAnalysis::AliasResult
+ScalarEvolutionAliasAnalysis::alias(const Location &LocA,
+                                    const Location &LocB) {
+  // If either of the memory references is empty, it doesn't matter what the
+  // pointer values are. This allows the code below to ignore this special
+  // case.
+  if (LocA.Size == 0 || LocB.Size == 0)
+    return NoAlias;
+
+  // This is ScalarEvolutionAliasAnalysis. Get the SCEVs!
+  const SCEV *AS = SE->getSCEV(const_cast<Value *>(LocA.Ptr));
+  const SCEV *BS = SE->getSCEV(const_cast<Value *>(LocB.Ptr));
+
+  // If they evaluate to the same expression, it's a MustAlias.
+  if (AS == BS) return MustAlias;
+
+  // If something is known about the difference between the two addresses,
+  // see if it's enough to prove a NoAlias.
+  if (SE->getEffectiveSCEVType(AS->getType()) ==
+      SE->getEffectiveSCEVType(BS->getType())) {
+    unsigned BitWidth = SE->getTypeSizeInBits(AS->getType());
+    APInt ASizeInt(BitWidth, LocA.Size);
+    APInt BSizeInt(BitWidth, LocB.Size);
+
+    // Compute the difference between the two pointers.
+    const SCEV *BA = SE->getMinusSCEV(BS, AS);
+
+    // Test whether the difference is known to be great enough that memory of
+    // the given sizes don't overlap. This assumes that ASizeInt and BSizeInt
+    // are non-zero, which is special-cased above.
+    if (ASizeInt.ule(SE->getUnsignedRange(BA).getUnsignedMin()) &&
+        (-BSizeInt).uge(SE->getUnsignedRange(BA).getUnsignedMax()))
+      return NoAlias;
+
+    // Folding the subtraction while preserving range information can be tricky
+    // (because of INT_MIN, etc.); if the prior test failed, swap AS and BS
+    // and try again to see if things fold better that way.
+
+    // Compute the difference between the two pointers.
+    const SCEV *AB = SE->getMinusSCEV(AS, BS);
+
+    // Test whether the difference is known to be great enough that memory of
+    // the given sizes don't overlap. This assumes that ASizeInt and BSizeInt
+    // are non-zero, which is special-cased above.
+    if (BSizeInt.ule(SE->getUnsignedRange(AB).getUnsignedMin()) &&
+        (-ASizeInt).uge(SE->getUnsignedRange(AB).getUnsignedMax()))
+      return NoAlias;
+  }
+
+  // If ScalarEvolution can find an underlying object, form a new query.
+  // The correctness of this depends on ScalarEvolution not recognizing
+  // inttoptr and ptrtoint operators.
+  Value *AO = GetBaseValue(AS);
+  Value *BO = GetBaseValue(BS);
+  if ((AO && AO != LocA.Ptr) || (BO && BO != LocB.Ptr))
+    if (alias(Location(AO ? AO : LocA.Ptr,
+                       AO ? +UnknownSize : LocA.Size,
+                       AO ? 0 : LocA.TBAATag),
+              Location(BO ? BO : LocB.Ptr,
+                       BO ? +UnknownSize : LocB.Size,
+                       BO ? 0 : LocB.TBAATag)) == NoAlias)
+      return NoAlias;
+
+  // Forward the query to the next analysis.
+  return AliasAnalysis::alias(LocA, LocB);
+}
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
new file mode 100644
index 000000000000..fcd7ce272a22
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -0,0 +1,1753 @@
+//===- ScalarEvolutionExpander.cpp - Scalar Evolution Analysis --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the scalar evolution expander,
+// which is used to generate the code corresponding to a given scalar evolution
+// expression.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+/// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP,
+/// reusing an existing cast if a suitable one exists, moving an existing
+/// cast if a suitable one exists but isn't in the right place, or
+/// creating a new one.
+Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
+                                       Instruction::CastOps Op,
+                                       BasicBlock::iterator IP) {
+  // This function must be called with the builder having a valid insertion
+  // point. It doesn't need to be the actual IP where the uses of the returned
+  // cast will be added, but it must dominate such IP.
+  // We use this precondition to produce a cast that will dominate all its
+  // uses. In particular, this is crucial for the case where the builder's
+  // insertion point *is* the point where we were asked to put the cast.
+  // Since we don't know the builder's insertion point is actually
+  // where the uses will be added (only that it dominates it), we are
+  // not allowed to move it.
+  BasicBlock::iterator BIP = Builder.GetInsertPoint();
+
+  Instruction *Ret = NULL;
+
+  // Check to see if there is already a cast!
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    User *U = *UI;
+    if (U->getType() == Ty)
+      if (CastInst *CI = dyn_cast<CastInst>(U))
+        if (CI->getOpcode() == Op) {
+          // If the cast isn't where we want it, create a new cast at IP.
+          // Likewise, do not reuse a cast at BIP because it must dominate
+          // instructions that might be inserted before BIP.
+          if (BasicBlock::iterator(CI) != IP || BIP == IP) {
+            // Create a new cast, and leave the old cast in place in case
+            // it is being used as an insert point. Clear its operand
+            // so that it doesn't hold anything live.
+            Ret = CastInst::Create(Op, V, Ty, "", IP);
+            Ret->takeName(CI);
+            CI->replaceAllUsesWith(Ret);
+            CI->setOperand(0, UndefValue::get(V->getType()));
+            break;
+          }
+          Ret = CI;
+          break;
+        }
+  }
+
+  // Create a new cast.
+  if (!Ret)
+    Ret = CastInst::Create(Op, V, Ty, V->getName(), IP);
+
+  // We assert at the end of the function since IP might point to an
+  // instruction with different dominance properties than a cast
+  // (an invoke for example) and not dominate BIP (but the cast does).
+  assert(SE.DT->dominates(Ret, BIP));
+
+  rememberInstruction(Ret);
+  return Ret;
+}
+
+/// InsertNoopCastOfTo - Insert a cast of V to the specified type,
+/// which must be possible with a noop cast, doing what we can to share
+/// the casts.
+Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
+  Instruction::CastOps Op = CastInst::getCastOpcode(V, false, Ty, false);
+  assert((Op == Instruction::BitCast ||
+          Op == Instruction::PtrToInt ||
+          Op == Instruction::IntToPtr) &&
+         "InsertNoopCastOfTo cannot perform non-noop casts!");
+  assert(SE.getTypeSizeInBits(V->getType()) == SE.getTypeSizeInBits(Ty) &&
+         "InsertNoopCastOfTo cannot change sizes!");
+
+  // Short-circuit unnecessary bitcasts.
+  if (Op == Instruction::BitCast) {
+    if (V->getType() == Ty)
+      return V;
+    if (CastInst *CI = dyn_cast<CastInst>(V)) {
+      if (CI->getOperand(0)->getType() == Ty)
+        return CI->getOperand(0);
+    }
+  }
+  // Short-circuit unnecessary inttoptr<->ptrtoint casts.
+  if ((Op == Instruction::PtrToInt || Op == Instruction::IntToPtr) &&
+      SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(V->getType())) {
+    if (CastInst *CI = dyn_cast<CastInst>(V))
+      if ((CI->getOpcode() == Instruction::PtrToInt ||
+           CI->getOpcode() == Instruction::IntToPtr) &&
+          SE.getTypeSizeInBits(CI->getType()) ==
+          SE.getTypeSizeInBits(CI->getOperand(0)->getType()))
+        return CI->getOperand(0);
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+      if ((CE->getOpcode() == Instruction::PtrToInt ||
+           CE->getOpcode() == Instruction::IntToPtr) &&
+          SE.getTypeSizeInBits(CE->getType()) ==
+          SE.getTypeSizeInBits(CE->getOperand(0)->getType()))
+        return CE->getOperand(0);
+  }
+
+  // Fold a cast of a constant.
+  if (Constant *C = dyn_cast<Constant>(V))
+    return ConstantExpr::getCast(Op, C, Ty);
+
+  // Cast the argument at the beginning of the entry block, after
+  // any bitcasts of other arguments.
+  if (Argument *A = dyn_cast<Argument>(V)) {
+    BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin();
+    while ((isa<BitCastInst>(IP) &&
+            isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) &&
+            cast<BitCastInst>(IP)->getOperand(0) != A) ||
+           isa<DbgInfoIntrinsic>(IP) ||
+           isa<LandingPadInst>(IP))
+      ++IP;
+    return ReuseOrCreateCast(A, Ty, Op, IP);
+  }
+
+  // Cast the instruction immediately after the instruction.
+  Instruction *I = cast<Instruction>(V);
+  BasicBlock::iterator IP = I; ++IP;
+  if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+    IP = II->getNormalDest()->begin();
+  while (isa<PHINode>(IP) || isa<LandingPadInst>(IP))
+    ++IP;
+  return ReuseOrCreateCast(I, Ty, Op, IP);
+}
+
+/// InsertBinop - Insert the specified binary operator, doing a small amount
+/// of work to avoid inserting an obviously redundant operation.
+Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
+                                 Value *LHS, Value *RHS) {
+  // Fold a binop with constant operands.
+  if (Constant *CLHS = dyn_cast<Constant>(LHS))
+    if (Constant *CRHS = dyn_cast<Constant>(RHS))
+      return ConstantExpr::get(Opcode, CLHS, CRHS);
+
+  // Do a quick scan to see if we have this binop nearby.  If so, reuse it.
+  unsigned ScanLimit = 6;
+  BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
+  // Scanning starts from the last instruction before the insertion point.
+  BasicBlock::iterator IP = Builder.GetInsertPoint();
+  if (IP != BlockBegin) {
+    --IP;
+    for (; ScanLimit; --IP, --ScanLimit) {
+      // Don't count dbg.value against the ScanLimit, to avoid perturbing the
+      // generated code.
+      if (isa<DbgInfoIntrinsic>(IP))
+        ScanLimit++;
+      if (IP->getOpcode() == (unsigned)Opcode && IP->getOperand(0) == LHS &&
+          IP->getOperand(1) == RHS)
+        return IP;
+      if (IP == BlockBegin) break;
+    }
+  }
+
+  // Save the original insertion point so we can restore it when we're done.
+  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+
+  // Move the insertion point out of as many loops as we can.
+  while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
+    if (!L->isLoopInvariant(LHS) || !L->isLoopInvariant(RHS)) break;
+    BasicBlock *Preheader = L->getLoopPreheader();
+    if (!Preheader) break;
+
+    // Ok, move up a level.
+    Builder.SetInsertPoint(Preheader, Preheader->getTerminator());
+  }
+
+  // If we haven't found this binop, insert it.
+  Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS));
+  BO->setDebugLoc(SaveInsertPt->getDebugLoc());
+  rememberInstruction(BO);
+
+  // Restore the original insert point.
+  if (SaveInsertBB)
+    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+
+  return BO;
+}
+
+/// FactorOutConstant - Test if S is divisible by Factor, using signed
+/// division. If so, update S with Factor divided out and return true.
+/// S need not be evenly divisible if a reasonable remainder can be
+/// computed.
+/// TODO: When ScalarEvolution gets a SCEVSDivExpr, this can be made
+/// unnecessary; in its place, just signed-divide Ops[i] by the scale and
+/// check to see if the divide was folded.
+static bool FactorOutConstant(const SCEV *&S,
+                              const SCEV *&Remainder,
+                              const SCEV *Factor,
+                              ScalarEvolution &SE,
+                              const DataLayout *TD) {
+  // Everything is divisible by one.
+  if (Factor->isOne())
+    return true;
+
+  // x/x == 1.
+  if (S == Factor) {
+    S = SE.getConstant(S->getType(), 1);
+    return true;
+  }
+
+  // For a Constant, check for a multiple of the given factor.
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+    // 0/x == 0.
+    if (C->isZero())
+      return true;
+    // Check for divisibility.
+    if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor)) {
+      ConstantInt *CI =
+        ConstantInt::get(SE.getContext(),
+                         C->getValue()->getValue().sdiv(
+                                                   FC->getValue()->getValue()));
+      // If the quotient is zero and the remainder is non-zero, reject
+      // the value at this scale. It will be considered for subsequent
+      // smaller scales.
+      if (!CI->isZero()) {
+        const SCEV *Div = SE.getConstant(CI);
+        S = Div;
+        Remainder =
+          SE.getAddExpr(Remainder,
+                        SE.getConstant(C->getValue()->getValue().srem(
+                                                  FC->getValue()->getValue())));
+        return true;
+      }
+    }
+  }
+
+  // In a Mul, check if there is a constant operand which is a multiple
+  // of the given factor.
+  if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
+    if (TD) {
+      // With DataLayout, the size is known. Check if there is a constant
+      // operand which is a multiple of the given factor. If so, we can
+      // factor it.
+      const SCEVConstant *FC = cast<SCEVConstant>(Factor);
+      if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
+        if (!C->getValue()->getValue().srem(FC->getValue()->getValue())) {
+          SmallVector<const SCEV *, 4> NewMulOps(M->op_begin(), M->op_end());
+          NewMulOps[0] =
+            SE.getConstant(C->getValue()->getValue().sdiv(
+                                                   FC->getValue()->getValue()));
+          S = SE.getMulExpr(NewMulOps);
+          return true;
+        }
+    } else {
+      // Without DataLayout, check if Factor can be factored out of any of the
+      // Mul's operands. If so, we can just remove it.
+      for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) {
+        const SCEV *SOp = M->getOperand(i);
+        const SCEV *Remainder = SE.getConstant(SOp->getType(), 0);
+        if (FactorOutConstant(SOp, Remainder, Factor, SE, TD) &&
+            Remainder->isZero()) {
+          SmallVector<const SCEV *, 4> NewMulOps(M->op_begin(), M->op_end());
+          NewMulOps[i] = SOp;
+          S = SE.getMulExpr(NewMulOps);
+          return true;
+        }
+      }
+    }
+  }
+
+  // In an AddRec, check if both start and step are divisible.
+  if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
+    const SCEV *Step = A->getStepRecurrence(SE);
+    const SCEV *StepRem = SE.getConstant(Step->getType(), 0);
+    if (!FactorOutConstant(Step, StepRem, Factor, SE, TD))
+      return false;
+    if (!StepRem->isZero())
+      return false;
+    const SCEV *Start = A->getStart();
+    if (!FactorOutConstant(Start, Remainder, Factor, SE, TD))
+      return false;
+    // FIXME: can use A->getNoWrapFlags(FlagNW)
+    S = SE.getAddRecExpr(Start, Step, A->getLoop(), SCEV::FlagAnyWrap);
+    return true;
+  }
+
+  return false;
+}
+
+/// SimplifyAddOperands - Sort and simplify a list of add operands. NumAddRecs
+/// is the number of SCEVAddRecExprs present, which are kept at the end of
+/// the list.
+///
+static void SimplifyAddOperands(SmallVectorImpl<const SCEV *> &Ops,
+                                Type *Ty,
+                                ScalarEvolution &SE) {
+  unsigned NumAddRecs = 0;
+  for (unsigned i = Ops.size(); i > 0 && isa<SCEVAddRecExpr>(Ops[i-1]); --i)
+    ++NumAddRecs;
+  // Group Ops into non-addrecs and addrecs.
+  SmallVector<const SCEV *, 8> NoAddRecs(Ops.begin(), Ops.end() - NumAddRecs);
+  SmallVector<const SCEV *, 8> AddRecs(Ops.end() - NumAddRecs, Ops.end());
+  // Let ScalarEvolution sort and simplify the non-addrecs list.
+  const SCEV *Sum = NoAddRecs.empty() ?
+                    SE.getConstant(Ty, 0) :
+                    SE.getAddExpr(NoAddRecs);
+  // If it returned an add, use the operands. Otherwise it simplified
+  // the sum into a single value, so just use that.
+  Ops.clear();
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Sum))
+    Ops.append(Add->op_begin(), Add->op_end());
+  else if (!Sum->isZero())
+    Ops.push_back(Sum);
+  // Then append the addrecs.
+  Ops.append(AddRecs.begin(), AddRecs.end());
+}
+
+/// SplitAddRecs - Flatten a list of add operands, moving addrec start values
+/// out to the top level. For example, convert {a + b,+,c} to a, b, {0,+,d}.
+/// This helps expose more opportunities for folding parts of the expressions
+/// into GEP indices.
+///
+static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops,
+                         Type *Ty,
+                         ScalarEvolution &SE) {
+  // Find the addrecs.
+  SmallVector<const SCEV *, 8> AddRecs;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Ops[i])) {
+      const SCEV *Start = A->getStart();
+      if (Start->isZero()) break;
+      const SCEV *Zero = SE.getConstant(Ty, 0);
+      AddRecs.push_back(SE.getAddRecExpr(Zero,
+                                         A->getStepRecurrence(SE),
+                                         A->getLoop(),
+                                         // FIXME: A->getNoWrapFlags(FlagNW)
+                                         SCEV::FlagAnyWrap));
+      if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Start)) {
+        Ops[i] = Zero;
+        Ops.append(Add->op_begin(), Add->op_end());
+        e += Add->getNumOperands();
+      } else {
+        Ops[i] = Start;
+      }
+    }
+  if (!AddRecs.empty()) {
+    // Add the addrecs onto the end of the list.
+    Ops.append(AddRecs.begin(), AddRecs.end());
+    // Resort the operand list, moving any constants to the front.
+    SimplifyAddOperands(Ops, Ty, SE);
+  }
+}
+
+/// expandAddToGEP - Expand an addition expression with a pointer type into
+/// a GEP instead of using ptrtoint+arithmetic+inttoptr. This helps
+/// BasicAliasAnalysis and other passes analyze the result. See the rules
+/// for getelementptr vs. inttoptr in
+/// http://llvm.org/docs/LangRef.html#pointeraliasing
+/// for details.
+///
+/// Design note: The correctness of using getelementptr here depends on
+/// ScalarEvolution not recognizing inttoptr and ptrtoint operators, as
+/// they may introduce pointer arithmetic which may not be safely converted
+/// into getelementptr.
+///
+/// Design note: It might seem desirable for this function to be more
+/// loop-aware. If some of the indices are loop-invariant while others
+/// aren't, it might seem desirable to emit multiple GEPs, keeping the
+/// loop-invariant portions of the overall computation outside the loop.
+/// However, there are a few reasons this is not done here. Hoisting simple
+/// arithmetic is a low-level optimization that often isn't very
+/// important until late in the optimization process. In fact, passes
+/// like InstructionCombining will combine GEPs, even if it means
+/// pushing loop-invariant computation down into loops, so even if the
+/// GEPs were split here, the work would quickly be undone. The
+/// LoopStrengthReduction pass, which is usually run quite late (and
+/// after the last InstructionCombining pass), takes care of hoisting
+/// loop-invariant portions of expressions, after considering what
+/// can be folded using target addressing modes.
+///
+Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
+                                    const SCEV *const *op_end,
+                                    PointerType *PTy,
+                                    Type *Ty,
+                                    Value *V) {
+  Type *ElTy = PTy->getElementType();
+  SmallVector<Value *, 4> GepIndices;
+  SmallVector<const SCEV *, 8> Ops(op_begin, op_end);
+  bool AnyNonZeroIndices = false;
+
+  // Split AddRecs up into parts as either of the parts may be usable
+  // without the other.
+  SplitAddRecs(Ops, Ty, SE);
+
+  // Descend down the pointer's type and attempt to convert the other
+  // operands into GEP indices, at each level. The first index in a GEP
+  // indexes into the array implied by the pointer operand; the rest of
+  // the indices index into the element or field type selected by the
+  // preceding index.
+  for (;;) {
+    // If the scale size is not 0, attempt to factor out a scale for
+    // array indexing.
+    SmallVector<const SCEV *, 8> ScaledOps;
+    if (ElTy->isSized()) {
+      const SCEV *ElSize = SE.getSizeOfExpr(ElTy);
+      if (!ElSize->isZero()) {
+        SmallVector<const SCEV *, 8> NewOps;
+        for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+          const SCEV *Op = Ops[i];
+          const SCEV *Remainder = SE.getConstant(Ty, 0);
+          if (FactorOutConstant(Op, Remainder, ElSize, SE, SE.TD)) {
+            // Op now has ElSize factored out.
+            ScaledOps.push_back(Op);
+            if (!Remainder->isZero())
+              NewOps.push_back(Remainder);
+            AnyNonZeroIndices = true;
+          } else {
+            // The operand was not divisible, so add it to the list of operands
+            // we'll scan next iteration.
+            NewOps.push_back(Ops[i]);
+          }
+        }
+        // If we made any changes, update Ops.
+        if (!ScaledOps.empty()) {
+          Ops = NewOps;
+          SimplifyAddOperands(Ops, Ty, SE);
+        }
+      }
+    }
+
+    // Record the scaled array index for this level of the type. If
+    // we didn't find any operands that could be factored, tentatively
+    // assume that element zero was selected (since the zero offset
+    // would obviously be folded away).
+    Value *Scaled = ScaledOps.empty() ?
+                    Constant::getNullValue(Ty) :
+                    expandCodeFor(SE.getAddExpr(ScaledOps), Ty);
+    GepIndices.push_back(Scaled);
+
+    // Collect struct field index operands.
+    while (StructType *STy = dyn_cast<StructType>(ElTy)) {
+      bool FoundFieldNo = false;
+      // An empty struct has no fields.
+      if (STy->getNumElements() == 0) break;
+      if (SE.TD) {
+        // With DataLayout, field offsets are known. See if a constant offset
+        // falls within any of the struct fields.
+        if (Ops.empty()) break;
+        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[0]))
+          if (SE.getTypeSizeInBits(C->getType()) <= 64) {
+            const StructLayout &SL = *SE.TD->getStructLayout(STy);
+            uint64_t FullOffset = C->getValue()->getZExtValue();
+            if (FullOffset < SL.getSizeInBytes()) {
+              unsigned ElIdx = SL.getElementContainingOffset(FullOffset);
+              GepIndices.push_back(
+                  ConstantInt::get(Type::getInt32Ty(Ty->getContext()), ElIdx));
+              ElTy = STy->getTypeAtIndex(ElIdx);
+              Ops[0] =
+                SE.getConstant(Ty, FullOffset - SL.getElementOffset(ElIdx));
+              AnyNonZeroIndices = true;
+              FoundFieldNo = true;
+            }
+          }
+      } else {
+        // Without DataLayout, just check for an offsetof expression of the
+        // appropriate struct type.
+        for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+          if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Ops[i])) {
+            Type *CTy;
+            Constant *FieldNo;
+            if (U->isOffsetOf(CTy, FieldNo) && CTy == STy) {
+              GepIndices.push_back(FieldNo);
+              ElTy =
+                STy->getTypeAtIndex(cast<ConstantInt>(FieldNo)->getZExtValue());
+              Ops[i] = SE.getConstant(Ty, 0);
+              AnyNonZeroIndices = true;
+              FoundFieldNo = true;
+              break;
+            }
+          }
+      }
+      // If no struct field offsets were found, tentatively assume that
+      // field zero was selected (since the zero offset would obviously
+      // be folded away).
+      if (!FoundFieldNo) {
+        ElTy = STy->getTypeAtIndex(0u);
+        GepIndices.push_back(
+          Constant::getNullValue(Type::getInt32Ty(Ty->getContext())));
+      }
+    }
+
+    if (ArrayType *ATy = dyn_cast<ArrayType>(ElTy))
+      ElTy = ATy->getElementType();
+    else
+      break;
+  }
+
+  // If none of the operands were convertible to proper GEP indices, cast
+  // the base to i8* and do an ugly getelementptr with that. It's still
+  // better than ptrtoint+arithmetic+inttoptr at least.
+  if (!AnyNonZeroIndices) {
+    // Cast the base to i8*.
+    V = InsertNoopCastOfTo(V,
+       Type::getInt8PtrTy(Ty->getContext(), PTy->getAddressSpace()));
+
+    assert(!isa<Instruction>(V) ||
+           SE.DT->dominates(cast<Instruction>(V), Builder.GetInsertPoint()));
+
+    // Expand the operands for a plain byte offset.
+    Value *Idx = expandCodeFor(SE.getAddExpr(Ops), Ty);
+
+    // Fold a GEP with constant operands.
+    if (Constant *CLHS = dyn_cast<Constant>(V))
+      if (Constant *CRHS = dyn_cast<Constant>(Idx))
+        return ConstantExpr::getGetElementPtr(CLHS, CRHS);
+
+    // Do a quick scan to see if we have this GEP nearby.  If so, reuse it.
+    unsigned ScanLimit = 6;
+    BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
+    // Scanning starts from the last instruction before the insertion point.
+    BasicBlock::iterator IP = Builder.GetInsertPoint();
+    if (IP != BlockBegin) {
+      --IP;
+      for (; ScanLimit; --IP, --ScanLimit) {
+        // Don't count dbg.value against the ScanLimit, to avoid perturbing the
+        // generated code.
+        if (isa<DbgInfoIntrinsic>(IP))
+          ScanLimit++;
+        if (IP->getOpcode() == Instruction::GetElementPtr &&
+            IP->getOperand(0) == V && IP->getOperand(1) == Idx)
+          return IP;
+        if (IP == BlockBegin) break;
+      }
+    }
+
+    // Save the original insertion point so we can restore it when we're done.
+    BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+    BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+
+    // Move the insertion point out of as many loops as we can.
+    while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
+      if (!L->isLoopInvariant(V) || !L->isLoopInvariant(Idx)) break;
+      BasicBlock *Preheader = L->getLoopPreheader();
+      if (!Preheader) break;
+
+      // Ok, move up a level.
+      Builder.SetInsertPoint(Preheader, Preheader->getTerminator());
+    }
+
+    // Emit a GEP.
+    Value *GEP = Builder.CreateGEP(V, Idx, "uglygep");
+    rememberInstruction(GEP);
+
+    // Restore the original insert point.
+    if (SaveInsertBB)
+      restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+
+    return GEP;
+  }
+
+  // Save the original insertion point so we can restore it when we're done.
+  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+
+  // Move the insertion point out of as many loops as we can.
+  while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
+    if (!L->isLoopInvariant(V)) break;
+
+    bool AnyIndexNotLoopInvariant = false;
+    for (SmallVectorImpl<Value *>::const_iterator I = GepIndices.begin(),
+         E = GepIndices.end(); I != E; ++I)
+      if (!L->isLoopInvariant(*I)) {
+        AnyIndexNotLoopInvariant = true;
+        break;
+      }
+    if (AnyIndexNotLoopInvariant)
+      break;
+
+    BasicBlock *Preheader = L->getLoopPreheader();
+    if (!Preheader) break;
+
+    // Ok, move up a level.
+    Builder.SetInsertPoint(Preheader, Preheader->getTerminator());
+  }
+
+  // Insert a pretty getelementptr. Note that this GEP is not marked inbounds,
+  // because ScalarEvolution may have changed the address arithmetic to
+  // compute a value which is beyond the end of the allocated object.
+  Value *Casted = V;
+  if (V->getType() != PTy)
+    Casted = InsertNoopCastOfTo(Casted, PTy);
+  Value *GEP = Builder.CreateGEP(Casted,
+                                 GepIndices,
+                                 "scevgep");
+  Ops.push_back(SE.getUnknown(GEP));
+  rememberInstruction(GEP);
+
+  // Restore the original insert point.
+  if (SaveInsertBB)
+    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+
+  return expand(SE.getAddExpr(Ops));
+}
+
+/// PickMostRelevantLoop - Given two loops pick the one that's most relevant for
+/// SCEV expansion. If they are nested, this is the most nested. If they are
+/// neighboring, pick the later.
+static const Loop *PickMostRelevantLoop(const Loop *A, const Loop *B,
+                                        DominatorTree &DT) {
+  if (!A) return B;
+  if (!B) return A;
+  if (A->contains(B)) return B;
+  if (B->contains(A)) return A;
+  if (DT.dominates(A->getHeader(), B->getHeader())) return B;
+  if (DT.dominates(B->getHeader(), A->getHeader())) return A;
+  return A; // Arbitrarily break the tie.
+}
+
+/// getRelevantLoop - Get the most relevant loop associated with the given
+/// expression, according to PickMostRelevantLoop.
+const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) {
+  // Test whether we've already computed the most relevant loop for this SCEV.
+  std::pair<DenseMap<const SCEV *, const Loop *>::iterator, bool> Pair =
+    RelevantLoops.insert(std::make_pair(S, static_cast<const Loop *>(0)));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  if (isa<SCEVConstant>(S))
+    // A constant has no relevant loops.
+    return 0;
+  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    if (const Instruction *I = dyn_cast<Instruction>(U->getValue()))
+      return Pair.first->second = SE.LI->getLoopFor(I->getParent());
+    // A non-instruction has no relevant loops.
+    return 0;
+  }
+  if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) {
+    const Loop *L = 0;
+    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+      L = AR->getLoop();
+    for (SCEVNAryExpr::op_iterator I = N->op_begin(), E = N->op_end();
+         I != E; ++I)
+      L = PickMostRelevantLoop(L, getRelevantLoop(*I), *SE.DT);
+    return RelevantLoops[N] = L;
+  }
+  if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) {
+    const Loop *Result = getRelevantLoop(C->getOperand());
+    return RelevantLoops[C] = Result;
+  }
+  if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+    const Loop *Result =
+      PickMostRelevantLoop(getRelevantLoop(D->getLHS()),
+                           getRelevantLoop(D->getRHS()),
+                           *SE.DT);
+    return RelevantLoops[D] = Result;
+  }
+  llvm_unreachable("Unexpected SCEV type!");
+}
+
+namespace {
+
+/// LoopCompare - Compare loops by PickMostRelevantLoop.
+class LoopCompare {
+  DominatorTree &DT;
+public:
+  explicit LoopCompare(DominatorTree &dt) : DT(dt) {}
+
+  bool operator()(std::pair<const Loop *, const SCEV *> LHS,
+                  std::pair<const Loop *, const SCEV *> RHS) const {
+    // Keep pointer operands sorted at the end.
+    if (LHS.second->getType()->isPointerTy() !=
+        RHS.second->getType()->isPointerTy())
+      return LHS.second->getType()->isPointerTy();
+
+    // Compare loops with PickMostRelevantLoop.
+    if (LHS.first != RHS.first)
+      return PickMostRelevantLoop(LHS.first, RHS.first, DT) != LHS.first;
+
+    // If one operand is a non-constant negative and the other is not,
+    // put the non-constant negative on the right so that a sub can
+    // be used instead of a negate and add.
+    if (LHS.second->isNonConstantNegative()) {
+      if (!RHS.second->isNonConstantNegative())
+        return false;
+    } else if (RHS.second->isNonConstantNegative())
+      return true;
+
+    // Otherwise they are equivalent according to this comparison.
+    return false;
+  }
+};
+
+}
+
+Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
+  Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
+  // Collect all the add operands in a loop, along with their associated loops.
+  // Iterate in reverse so that constants are emitted last, all else equal, and
+  // so that pointer operands are inserted first, which the code below relies on
+  // to form more involved GEPs.
+  SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
+  for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(S->op_end()),
+       E(S->op_begin()); I != E; ++I)
+    OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
+
+  // Sort by loop. Use a stable sort so that constants follow non-constants and
+  // pointer operands precede non-pointer operands.
+  std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(*SE.DT));
+
+  // Emit instructions to add all the operands. Hoist as much as possible
+  // out of loops, and form meaningful getelementptrs where possible.
+  Value *Sum = 0;
+  for (SmallVectorImpl<std::pair<const Loop *, const SCEV *> >::iterator
+       I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E; ) {
+    const Loop *CurLoop = I->first;
+    const SCEV *Op = I->second;
+    if (!Sum) {
+      // This is the first operand. Just expand it.
+      Sum = expand(Op);
+      ++I;
+    } else if (PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) {
+      // The running sum expression is a pointer. Try to form a getelementptr
+      // at this level with that as the base.
+      SmallVector<const SCEV *, 4> NewOps;
+      for (; I != E && I->first == CurLoop; ++I) {
+        // If the operand is SCEVUnknown and not instructions, peek through
+        // it, to enable more of it to be folded into the GEP.
+        const SCEV *X = I->second;
+        if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(X))
+          if (!isa<Instruction>(U->getValue()))
+            X = SE.getSCEV(U->getValue());
+        NewOps.push_back(X);
+      }
+      Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, Sum);
+    } else if (PointerType *PTy = dyn_cast<PointerType>(Op->getType())) {
+      // The running sum is an integer, and there's a pointer at this level.
+      // Try to form a getelementptr. If the running sum is instructions,
+      // use a SCEVUnknown to avoid re-analyzing them.
+      SmallVector<const SCEV *, 4> NewOps;
+      NewOps.push_back(isa<Instruction>(Sum) ? SE.getUnknown(Sum) :
+                                               SE.getSCEV(Sum));
+      for (++I; I != E && I->first == CurLoop; ++I)
+        NewOps.push_back(I->second);
+      Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, expand(Op));
+    } else if (Op->isNonConstantNegative()) {
+      // Instead of doing a negate and add, just do a subtract.
+      Value *W = expandCodeFor(SE.getNegativeSCEV(Op), Ty);
+      Sum = InsertNoopCastOfTo(Sum, Ty);
+      Sum = InsertBinop(Instruction::Sub, Sum, W);
+      ++I;
+    } else {
+      // A simple add.
+      Value *W = expandCodeFor(Op, Ty);
+      Sum = InsertNoopCastOfTo(Sum, Ty);
+      // Canonicalize a constant to the RHS.
+      if (isa<Constant>(Sum)) std::swap(Sum, W);
+      Sum = InsertBinop(Instruction::Add, Sum, W);
+      ++I;
+    }
+  }
+
+  return Sum;
+}
+
+Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
+  Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
+  // Collect all the mul operands in a loop, along with their associated loops.
+  // Iterate in reverse so that constants are emitted last, all else equal.
+  SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
+  for (std::reverse_iterator<SCEVMulExpr::op_iterator> I(S->op_end()),
+       E(S->op_begin()); I != E; ++I)
+    OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
+
+  // Sort by loop. Use a stable sort so that constants follow non-constants.
+  std::stable_sort(OpsAndLoops.begin(), OpsAndLoops.end(), LoopCompare(*SE.DT));
+
+  // Emit instructions to mul all the operands. Hoist as much as possible
+  // out of loops.
+  Value *Prod = 0;
+  for (SmallVectorImpl<std::pair<const Loop *, const SCEV *> >::iterator
+       I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E; ) {
+    const SCEV *Op = I->second;
+    if (!Prod) {
+      // This is the first operand. Just expand it.
+      Prod = expand(Op);
+      ++I;
+    } else if (Op->isAllOnesValue()) {
+      // Instead of doing a multiply by negative one, just do a negate.
+      Prod = InsertNoopCastOfTo(Prod, Ty);
+      Prod = InsertBinop(Instruction::Sub, Constant::getNullValue(Ty), Prod);
+      ++I;
+    } else {
+      // A simple mul.
+      Value *W = expandCodeFor(Op, Ty);
+      Prod = InsertNoopCastOfTo(Prod, Ty);
+      // Canonicalize a constant to the RHS.
+      if (isa<Constant>(Prod)) std::swap(Prod, W);
+      Prod = InsertBinop(Instruction::Mul, Prod, W);
+      ++I;
+    }
+  }
+
+  return Prod;
+}
+
+Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
+  Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
+  Value *LHS = expandCodeFor(S->getLHS(), Ty);
+  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) {
+    const APInt &RHS = SC->getValue()->getValue();
+    if (RHS.isPowerOf2())
+      return InsertBinop(Instruction::LShr, LHS,
+                         ConstantInt::get(Ty, RHS.logBase2()));
+  }
+
+  Value *RHS = expandCodeFor(S->getRHS(), Ty);
+  return InsertBinop(Instruction::UDiv, LHS, RHS);
+}
+
+/// Move parts of Base into Rest to leave Base with the minimal
+/// expression that provides a pointer operand suitable for a
+/// GEP expansion.
+static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest,
+                              ScalarEvolution &SE) {
+  while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) {
+    Base = A->getStart();
+    Rest = SE.getAddExpr(Rest,
+                         SE.getAddRecExpr(SE.getConstant(A->getType(), 0),
+                                          A->getStepRecurrence(SE),
+                                          A->getLoop(),
+                                          // FIXME: A->getNoWrapFlags(FlagNW)
+                                          SCEV::FlagAnyWrap));
+  }
+  if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
+    Base = A->getOperand(A->getNumOperands()-1);
+    SmallVector<const SCEV *, 8> NewAddOps(A->op_begin(), A->op_end());
+    NewAddOps.back() = Rest;
+    Rest = SE.getAddExpr(NewAddOps);
+    ExposePointerBase(Base, Rest, SE);
+  }
+}
+
+/// Determine if this is a well-behaved chain of instructions leading back to
+/// the PHI. If so, it may be reused by expanded expressions.
+bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV,
+                                         const Loop *L) {
+  if (IncV->getNumOperands() == 0 || isa<PHINode>(IncV) ||
+      (isa<CastInst>(IncV) && !isa<BitCastInst>(IncV)))
+    return false;
+  // If any of the operands don't dominate the insert position, bail.
+  // Addrec operands are always loop-invariant, so this can only happen
+  // if there are instructions which haven't been hoisted.
+  if (L == IVIncInsertLoop) {
+    for (User::op_iterator OI = IncV->op_begin()+1,
+           OE = IncV->op_end(); OI != OE; ++OI)
+      if (Instruction *OInst = dyn_cast<Instruction>(OI))
+        if (!SE.DT->dominates(OInst, IVIncInsertPos))
+          return false;
+  }
+  // Advance to the next instruction.
+  IncV = dyn_cast<Instruction>(IncV->getOperand(0));
+  if (!IncV)
+    return false;
+
+  if (IncV->mayHaveSideEffects())
+    return false;
+
+  if (IncV != PN)
+    return true;
+
+  return isNormalAddRecExprPHI(PN, IncV, L);
+}
+
+/// getIVIncOperand returns an induction variable increment's induction
+/// variable operand.
+///
+/// If allowScale is set, any type of GEP is allowed as long as the nonIV
+/// operands dominate InsertPos.
+///
+/// If allowScale is not set, ensure that a GEP increment conforms to one of the
+/// simple patterns generated by getAddRecExprPHILiterally and
+/// expandAddtoGEP. If the pattern isn't recognized, return NULL.
+Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV,
+                                           Instruction *InsertPos,
+                                           bool allowScale) {
+  if (IncV == InsertPos)
+    return NULL;
+
+  switch (IncV->getOpcode()) {
+  default:
+    return NULL;
+  // Check for a simple Add/Sub or GEP of a loop invariant step.
+  case Instruction::Add:
+  case Instruction::Sub: {
+    Instruction *OInst = dyn_cast<Instruction>(IncV->getOperand(1));
+    if (!OInst || SE.DT->dominates(OInst, InsertPos))
+      return dyn_cast<Instruction>(IncV->getOperand(0));
+    return NULL;
+  }
+  case Instruction::BitCast:
+    return dyn_cast<Instruction>(IncV->getOperand(0));
+  case Instruction::GetElementPtr:
+    for (Instruction::op_iterator I = IncV->op_begin()+1, E = IncV->op_end();
+         I != E; ++I) {
+      if (isa<Constant>(*I))
+        continue;
+      if (Instruction *OInst = dyn_cast<Instruction>(*I)) {
+        if (!SE.DT->dominates(OInst, InsertPos))
+          return NULL;
+      }
+      if (allowScale) {
+        // allow any kind of GEP as long as it can be hoisted.
+        continue;
+      }
+      // This must be a pointer addition of constants (pretty), which is already
+      // handled, or some number of address-size elements (ugly). Ugly geps
+      // have 2 operands. i1* is used by the expander to represent an
+      // address-size element.
+      if (IncV->getNumOperands() != 2)
+        return NULL;
+      unsigned AS = cast<PointerType>(IncV->getType())->getAddressSpace();
+      if (IncV->getType() != Type::getInt1PtrTy(SE.getContext(), AS)
+          && IncV->getType() != Type::getInt8PtrTy(SE.getContext(), AS))
+        return NULL;
+      break;
+    }
+    return dyn_cast<Instruction>(IncV->getOperand(0));
+  }
+}
+
+/// hoistStep - Attempt to hoist a simple IV increment above InsertPos to make
+/// it available to other uses in this loop. Recursively hoist any operands,
+/// until we reach a value that dominates InsertPos.
+bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) {
+  if (SE.DT->dominates(IncV, InsertPos))
+      return true;
+
+  // InsertPos must itself dominate IncV so that IncV's new position satisfies
+  // its existing users.
+  if (isa<PHINode>(InsertPos)
+      || !SE.DT->dominates(InsertPos->getParent(), IncV->getParent()))
+    return false;
+
+  // Check that the chain of IV operands leading back to Phi can be hoisted.
+  SmallVector<Instruction*, 4> IVIncs;
+  for(;;) {
+    Instruction *Oper = getIVIncOperand(IncV, InsertPos, /*allowScale*/true);
+    if (!Oper)
+      return false;
+    // IncV is safe to hoist.
+    IVIncs.push_back(IncV);
+    IncV = Oper;
+    if (SE.DT->dominates(IncV, InsertPos))
+      break;
+  }
+  for (SmallVectorImpl<Instruction*>::reverse_iterator I = IVIncs.rbegin(),
+         E = IVIncs.rend(); I != E; ++I) {
+    (*I)->moveBefore(InsertPos);
+  }
+  return true;
+}
+
+/// Determine if this cyclic phi is in a form that would have been generated by
+/// LSR. We don't care if the phi was actually expanded in this pass, as long
+/// as it is in a low-cost form, for example, no implied multiplication. This
+/// should match any patterns generated by getAddRecExprPHILiterally and
+/// expandAddtoGEP.
+bool SCEVExpander::isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV,
+                                           const Loop *L) {
+  for(Instruction *IVOper = IncV;
+      (IVOper = getIVIncOperand(IVOper, L->getLoopPreheader()->getTerminator(),
+                                /*allowScale=*/false));) {
+    if (IVOper == PN)
+      return true;
+  }
+  return false;
+}
+
+/// expandIVInc - Expand an IV increment at Builder's current InsertPos.
+/// Typically this is the LatchBlock terminator or IVIncInsertPos, but we may
+/// need to materialize IV increments elsewhere to handle difficult situations.
+Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L,
+                                 Type *ExpandTy, Type *IntTy,
+                                 bool useSubtract) {
+  Value *IncV;
+  // If the PHI is a pointer, use a GEP, otherwise use an add or sub.
+  if (ExpandTy->isPointerTy()) {
+    PointerType *GEPPtrTy = cast<PointerType>(ExpandTy);
+    // If the step isn't constant, don't use an implicitly scaled GEP, because
+    // that would require a multiply inside the loop.
+    if (!isa<ConstantInt>(StepV))
+      GEPPtrTy = PointerType::get(Type::getInt1Ty(SE.getContext()),
+                                  GEPPtrTy->getAddressSpace());
+    const SCEV *const StepArray[1] = { SE.getSCEV(StepV) };
+    IncV = expandAddToGEP(StepArray, StepArray+1, GEPPtrTy, IntTy, PN);
+    if (IncV->getType() != PN->getType()) {
+      IncV = Builder.CreateBitCast(IncV, PN->getType());
+      rememberInstruction(IncV);
+    }
+  } else {
+    IncV = useSubtract ?
+      Builder.CreateSub(PN, StepV, Twine(IVName) + ".iv.next") :
+      Builder.CreateAdd(PN, StepV, Twine(IVName) + ".iv.next");
+    rememberInstruction(IncV);
+  }
+  return IncV;
+}
+
+/// getAddRecExprPHILiterally - Helper for expandAddRecExprLiterally. Expand
+/// the base addrec, which is the addrec without any non-loop-dominating
+/// values, and return the PHI.
+PHINode *
+SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
+                                        const Loop *L,
+                                        Type *ExpandTy,
+                                        Type *IntTy) {
+  assert((!IVIncInsertLoop||IVIncInsertPos) && "Uninitialized insert position");
+
+  // Reuse a previously-inserted PHI, if present.
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  if (LatchBlock) {
+    for (BasicBlock::iterator I = L->getHeader()->begin();
+         PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+      if (!SE.isSCEVable(PN->getType()) ||
+          (SE.getEffectiveSCEVType(PN->getType()) !=
+           SE.getEffectiveSCEVType(Normalized->getType())) ||
+          SE.getSCEV(PN) != Normalized)
+        continue;
+
+      Instruction *IncV =
+        cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock));
+
+      if (LSRMode) {
+        if (!isExpandedAddRecExprPHI(PN, IncV, L))
+          continue;
+        if (L == IVIncInsertLoop && !hoistIVInc(IncV, IVIncInsertPos))
+          continue;
+      }
+      else {
+        if (!isNormalAddRecExprPHI(PN, IncV, L))
+          continue;
+        if (L == IVIncInsertLoop)
+          do {
+            if (SE.DT->dominates(IncV, IVIncInsertPos))
+              break;
+            // Make sure the increment is where we want it. But don't move it
+            // down past a potential existing post-inc user.
+            IncV->moveBefore(IVIncInsertPos);
+            IVIncInsertPos = IncV;
+            IncV = cast<Instruction>(IncV->getOperand(0));
+          } while (IncV != PN);
+      }
+      // Ok, the add recurrence looks usable.
+      // Remember this PHI, even in post-inc mode.
+      InsertedValues.insert(PN);
+      // Remember the increment.
+      rememberInstruction(IncV);
+      return PN;
+    }
+  }
+
+  // Save the original insertion point so we can restore it when we're done.
+  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+
+  // Another AddRec may need to be recursively expanded below. For example, if
+  // this AddRec is quadratic, the StepV may itself be an AddRec in this
+  // loop. Remove this loop from the PostIncLoops set before expanding such
+  // AddRecs. Otherwise, we cannot find a valid position for the step
+  // (i.e. StepV can never dominate its loop header).  Ideally, we could do
+  // SavedIncLoops.swap(PostIncLoops), but we generally have a single element,
+  // so it's not worth implementing SmallPtrSet::swap.
+  PostIncLoopSet SavedPostIncLoops = PostIncLoops;
+  PostIncLoops.clear();
+
+  // Expand code for the start value.
+  Value *StartV = expandCodeFor(Normalized->getStart(), ExpandTy,
+                                L->getHeader()->begin());
+
+  // StartV must be hoisted into L's preheader to dominate the new phi.
+  assert(!isa<Instruction>(StartV) ||
+         SE.DT->properlyDominates(cast<Instruction>(StartV)->getParent(),
+                                  L->getHeader()));
+
+  // Expand code for the step value. Do this before creating the PHI so that PHI
+  // reuse code doesn't see an incomplete PHI.
+  const SCEV *Step = Normalized->getStepRecurrence(SE);
+  // If the stride is negative, insert a sub instead of an add for the increment
+  // (unless it's a constant, because subtracts of constants are canonicalized
+  // to adds).
+  bool useSubtract = !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
+  if (useSubtract)
+    Step = SE.getNegativeSCEV(Step);
+  // Expand the step somewhere that dominates the loop header.
+  Value *StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin());
+
+  // Create the PHI.
+  BasicBlock *Header = L->getHeader();
+  Builder.SetInsertPoint(Header, Header->begin());
+  pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
+  PHINode *PN = Builder.CreatePHI(ExpandTy, std::distance(HPB, HPE),
+                                  Twine(IVName) + ".iv");
+  rememberInstruction(PN);
+
+  // Create the step instructions and populate the PHI.
+  for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
+    BasicBlock *Pred = *HPI;
+
+    // Add a start value.
+    if (!L->contains(Pred)) {
+      PN->addIncoming(StartV, Pred);
+      continue;
+    }
+
+    // Create a step value and add it to the PHI.
+    // If IVIncInsertLoop is non-null and equal to the addrec's loop, insert the
+    // instructions at IVIncInsertPos.
+    Instruction *InsertPos = L == IVIncInsertLoop ?
+      IVIncInsertPos : Pred->getTerminator();
+    Builder.SetInsertPoint(InsertPos);
+    Value *IncV = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
+
+    PN->addIncoming(IncV, Pred);
+  }
+
+  // Restore the original insert point.
+  if (SaveInsertBB)
+    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+
+  // After expanding subexpressions, restore the PostIncLoops set so the caller
+  // can ensure that IVIncrement dominates the current uses.
+  PostIncLoops = SavedPostIncLoops;
+
+  // Remember this PHI, even in post-inc mode.
+  InsertedValues.insert(PN);
+
+  return PN;
+}
+
+Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
+  Type *STy = S->getType();
+  Type *IntTy = SE.getEffectiveSCEVType(STy);
+  const Loop *L = S->getLoop();
+
+  // Determine a normalized form of this expression, which is the expression
+  // before any post-inc adjustment is made.
+  const SCEVAddRecExpr *Normalized = S;
+  if (PostIncLoops.count(L)) {
+    PostIncLoopSet Loops;
+    Loops.insert(L);
+    Normalized =
+      cast<SCEVAddRecExpr>(TransformForPostIncUse(Normalize, S, 0, 0,
+                                                  Loops, SE, *SE.DT));
+  }
+
+  // Strip off any non-loop-dominating component from the addrec start.
+  const SCEV *Start = Normalized->getStart();
+  const SCEV *PostLoopOffset = 0;
+  if (!SE.properlyDominates(Start, L->getHeader())) {
+    PostLoopOffset = Start;
+    Start = SE.getConstant(Normalized->getType(), 0);
+    Normalized = cast<SCEVAddRecExpr>(
+      SE.getAddRecExpr(Start, Normalized->getStepRecurrence(SE),
+                       Normalized->getLoop(),
+                       // FIXME: Normalized->getNoWrapFlags(FlagNW)
+                       SCEV::FlagAnyWrap));
+  }
+
+  // Strip off any non-loop-dominating component from the addrec step.
+  const SCEV *Step = Normalized->getStepRecurrence(SE);
+  const SCEV *PostLoopScale = 0;
+  if (!SE.dominates(Step, L->getHeader())) {
+    PostLoopScale = Step;
+    Step = SE.getConstant(Normalized->getType(), 1);
+    Normalized =
+      cast<SCEVAddRecExpr>(SE.getAddRecExpr(Start, Step,
+                                            Normalized->getLoop(),
+                                            // FIXME: Normalized
+                                            // ->getNoWrapFlags(FlagNW)
+                                            SCEV::FlagAnyWrap));
+  }
+
+  // Expand the core addrec. If we need post-loop scaling, force it to
+  // expand to an integer type to avoid the need for additional casting.
+  Type *ExpandTy = PostLoopScale ? IntTy : STy;
+  PHINode *PN = getAddRecExprPHILiterally(Normalized, L, ExpandTy, IntTy);
+
+  // Accommodate post-inc mode, if necessary.
+  Value *Result;
+  if (!PostIncLoops.count(L))
+    Result = PN;
+  else {
+    // In PostInc mode, use the post-incremented value.
+    BasicBlock *LatchBlock = L->getLoopLatch();
+    assert(LatchBlock && "PostInc mode requires a unique loop latch!");
+    Result = PN->getIncomingValueForBlock(LatchBlock);
+
+    // For an expansion to use the postinc form, the client must call
+    // expandCodeFor with an InsertPoint that is either outside the PostIncLoop
+    // or dominated by IVIncInsertPos.
+    if (isa<Instruction>(Result)
+        && !SE.DT->dominates(cast<Instruction>(Result),
+                             Builder.GetInsertPoint())) {
+      // The induction variable's postinc expansion does not dominate this use.
+      // IVUsers tries to prevent this case, so it is rare. However, it can
+      // happen when an IVUser outside the loop is not dominated by the latch
+      // block. Adjusting IVIncInsertPos before expansion begins cannot handle
+      // all cases. Consider a phi outide whose operand is replaced during
+      // expansion with the value of the postinc user. Without fundamentally
+      // changing the way postinc users are tracked, the only remedy is
+      // inserting an extra IV increment. StepV might fold into PostLoopOffset,
+      // but hopefully expandCodeFor handles that.
+      bool useSubtract =
+        !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
+      if (useSubtract)
+        Step = SE.getNegativeSCEV(Step);
+      // Expand the step somewhere that dominates the loop header.
+      BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+      BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+      Value *StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin());
+      // Restore the insertion point to the place where the caller has
+      // determined dominates all uses.
+      restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+      Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
+    }
+  }
+
+  // Re-apply any non-loop-dominating scale.
+  if (PostLoopScale) {
+    Result = InsertNoopCastOfTo(Result, IntTy);
+    Result = Builder.CreateMul(Result,
+                               expandCodeFor(PostLoopScale, IntTy));
+    rememberInstruction(Result);
+  }
+
+  // Re-apply any non-loop-dominating offset.
+  if (PostLoopOffset) {
+    if (PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) {
+      const SCEV *const OffsetArray[1] = { PostLoopOffset };
+      Result = expandAddToGEP(OffsetArray, OffsetArray+1, PTy, IntTy, Result);
+    } else {
+      Result = InsertNoopCastOfTo(Result, IntTy);
+      Result = Builder.CreateAdd(Result,
+                                 expandCodeFor(PostLoopOffset, IntTy));
+      rememberInstruction(Result);
+    }
+  }
+
+  return Result;
+}
+
+Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
+  if (!CanonicalMode) return expandAddRecExprLiterally(S);
+
+  Type *Ty = SE.getEffectiveSCEVType(S->getType());
+  const Loop *L = S->getLoop();
+
+  // First check for an existing canonical IV in a suitable type.
+  PHINode *CanonicalIV = 0;
+  if (PHINode *PN = L->getCanonicalInductionVariable())
+    if (SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty))
+      CanonicalIV = PN;
+
+  // Rewrite an AddRec in terms of the canonical induction variable, if
+  // its type is more narrow.
+  if (CanonicalIV &&
+      SE.getTypeSizeInBits(CanonicalIV->getType()) >
+      SE.getTypeSizeInBits(Ty)) {
+    SmallVector<const SCEV *, 4> NewOps(S->getNumOperands());
+    for (unsigned i = 0, e = S->getNumOperands(); i != e; ++i)
+      NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType());
+    Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(),
+                                       // FIXME: S->getNoWrapFlags(FlagNW)
+                                       SCEV::FlagAnyWrap));
+    BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+    BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+    BasicBlock::iterator NewInsertPt =
+      llvm::next(BasicBlock::iterator(cast<Instruction>(V)));
+    while (isa<PHINode>(NewInsertPt) || isa<DbgInfoIntrinsic>(NewInsertPt) ||
+           isa<LandingPadInst>(NewInsertPt))
+      ++NewInsertPt;
+    V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), 0,
+                      NewInsertPt);
+    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+    return V;
+  }
+
+  // {X,+,F} --> X + {0,+,F}
+  if (!S->getStart()->isZero()) {
+    SmallVector<const SCEV *, 4> NewOps(S->op_begin(), S->op_end());
+    NewOps[0] = SE.getConstant(Ty, 0);
+    // FIXME: can use S->getNoWrapFlags()
+    const SCEV *Rest = SE.getAddRecExpr(NewOps, L, SCEV::FlagAnyWrap);
+
+    // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
+    // comments on expandAddToGEP for details.
+    const SCEV *Base = S->getStart();
+    const SCEV *RestArray[1] = { Rest };
+    // Dig into the expression to find the pointer base for a GEP.
+    ExposePointerBase(Base, RestArray[0], SE);
+    // If we found a pointer, expand the AddRec with a GEP.
+    if (PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
+      // Make sure the Base isn't something exotic, such as a multiplied
+      // or divided pointer value. In those cases, the result type isn't
+      // actually a pointer type.
+      if (!isa<SCEVMulExpr>(Base) && !isa<SCEVUDivExpr>(Base)) {
+        Value *StartV = expand(Base);
+        assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!");
+        return expandAddToGEP(RestArray, RestArray+1, PTy, Ty, StartV);
+      }
+    }
+
+    // Just do a normal add. Pre-expand the operands to suppress folding.
+    return expand(SE.getAddExpr(SE.getUnknown(expand(S->getStart())),
+                                SE.getUnknown(expand(Rest))));
+  }
+
+  // If we don't yet have a canonical IV, create one.
+  if (!CanonicalIV) {
+    // Create and insert the PHI node for the induction variable in the
+    // specified loop.
+    BasicBlock *Header = L->getHeader();
+    pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
+    CanonicalIV = PHINode::Create(Ty, std::distance(HPB, HPE), "indvar",
+                                  Header->begin());
+    rememberInstruction(CanonicalIV);
+
+    Constant *One = ConstantInt::get(Ty, 1);
+    for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
+      BasicBlock *HP = *HPI;
+      if (L->contains(HP)) {
+        // Insert a unit add instruction right before the terminator
+        // corresponding to the back-edge.
+        Instruction *Add = BinaryOperator::CreateAdd(CanonicalIV, One,
+                                                     "indvar.next",
+                                                     HP->getTerminator());
+        Add->setDebugLoc(HP->getTerminator()->getDebugLoc());
+        rememberInstruction(Add);
+        CanonicalIV->addIncoming(Add, HP);
+      } else {
+        CanonicalIV->addIncoming(Constant::getNullValue(Ty), HP);
+      }
+    }
+  }
+
+  // {0,+,1} --> Insert a canonical induction variable into the loop!
+  if (S->isAffine() && S->getOperand(1)->isOne()) {
+    assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) &&
+           "IVs with types different from the canonical IV should "
+           "already have been handled!");
+    return CanonicalIV;
+  }
+
+  // {0,+,F} --> {0,+,1} * F
+
+  // If this is a simple linear addrec, emit it now as a special case.
+  if (S->isAffine())    // {0,+,F} --> i*F
+    return
+      expand(SE.getTruncateOrNoop(
+        SE.getMulExpr(SE.getUnknown(CanonicalIV),
+                      SE.getNoopOrAnyExtend(S->getOperand(1),
+                                            CanonicalIV->getType())),
+        Ty));
+
+  // If this is a chain of recurrences, turn it into a closed form, using the
+  // folders, then expandCodeFor the closed form.  This allows the folders to
+  // simplify the expression without having to build a bunch of special code
+  // into this folder.
+  const SCEV *IH = SE.getUnknown(CanonicalIV);   // Get I as a "symbolic" SCEV.
+
+  // Promote S up to the canonical IV type, if the cast is foldable.
+  const SCEV *NewS = S;
+  const SCEV *Ext = SE.getNoopOrAnyExtend(S, CanonicalIV->getType());
+  if (isa<SCEVAddRecExpr>(Ext))
+    NewS = Ext;
+
+  const SCEV *V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
+  //cerr << "Evaluated: " << *this << "\n     to: " << *V << "\n";
+
+  // Truncate the result down to the original type, if needed.
+  const SCEV *T = SE.getTruncateOrNoop(V, Ty);
+  return expand(T);
+}
+
+Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) {
+  Type *Ty = SE.getEffectiveSCEVType(S->getType());
+  Value *V = expandCodeFor(S->getOperand(),
+                           SE.getEffectiveSCEVType(S->getOperand()->getType()));
+  Value *I = Builder.CreateTrunc(V, Ty);
+  rememberInstruction(I);
+  return I;
+}
+
+Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) {
+  Type *Ty = SE.getEffectiveSCEVType(S->getType());
+  Value *V = expandCodeFor(S->getOperand(),
+                           SE.getEffectiveSCEVType(S->getOperand()->getType()));
+  Value *I = Builder.CreateZExt(V, Ty);
+  rememberInstruction(I);
+  return I;
+}
+
+Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) {
+  Type *Ty = SE.getEffectiveSCEVType(S->getType());
+  Value *V = expandCodeFor(S->getOperand(),
+                           SE.getEffectiveSCEVType(S->getOperand()->getType()));
+  Value *I = Builder.CreateSExt(V, Ty);
+  rememberInstruction(I);
+  return I;
+}
+
+Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
+  Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
+  Type *Ty = LHS->getType();
+  for (int i = S->getNumOperands()-2; i >= 0; --i) {
+    // In the case of mixed integer and pointer types, do the
+    // rest of the comparisons as integer.
+    if (S->getOperand(i)->getType() != Ty) {
+      Ty = SE.getEffectiveSCEVType(Ty);
+      LHS = InsertNoopCastOfTo(LHS, Ty);
+    }
+    Value *RHS = expandCodeFor(S->getOperand(i), Ty);
+    Value *ICmp = Builder.CreateICmpSGT(LHS, RHS);
+    rememberInstruction(ICmp);
+    Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax");
+    rememberInstruction(Sel);
+    LHS = Sel;
+  }
+  // In the case of mixed integer and pointer types, cast the
+  // final result back to the pointer type.
+  if (LHS->getType() != S->getType())
+    LHS = InsertNoopCastOfTo(LHS, S->getType());
+  return LHS;
+}
+
+Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+  Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
+  Type *Ty = LHS->getType();
+  for (int i = S->getNumOperands()-2; i >= 0; --i) {
+    // In the case of mixed integer and pointer types, do the
+    // rest of the comparisons as integer.
+    if (S->getOperand(i)->getType() != Ty) {
+      Ty = SE.getEffectiveSCEVType(Ty);
+      LHS = InsertNoopCastOfTo(LHS, Ty);
+    }
+    Value *RHS = expandCodeFor(S->getOperand(i), Ty);
+    Value *ICmp = Builder.CreateICmpUGT(LHS, RHS);
+    rememberInstruction(ICmp);
+    Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax");
+    rememberInstruction(Sel);
+    LHS = Sel;
+  }
+  // In the case of mixed integer and pointer types, cast the
+  // final result back to the pointer type.
+  if (LHS->getType() != S->getType())
+    LHS = InsertNoopCastOfTo(LHS, S->getType());
+  return LHS;
+}
+
+Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty,
+                                   Instruction *IP) {
+  Builder.SetInsertPoint(IP->getParent(), IP);
+  return expandCodeFor(SH, Ty);
+}
+
+Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) {
+  // Expand the code for this SCEV.
+  Value *V = expand(SH);
+  if (Ty) {
+    assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) &&
+           "non-trivial casts should be done with the SCEVs directly!");
+    V = InsertNoopCastOfTo(V, Ty);
+  }
+  return V;
+}
+
+Value *SCEVExpander::expand(const SCEV *S) {
+  // Compute an insertion point for this SCEV object. Hoist the instructions
+  // as far out in the loop nest as possible.
+  Instruction *InsertPt = Builder.GetInsertPoint();
+  for (Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock()); ;
+       L = L->getParentLoop())
+    if (SE.isLoopInvariant(S, L)) {
+      if (!L) break;
+      if (BasicBlock *Preheader = L->getLoopPreheader())
+        InsertPt = Preheader->getTerminator();
+      else {
+        // LSR sets the insertion point for AddRec start/step values to the
+        // block start to simplify value reuse, even though it's an invalid
+        // position. SCEVExpander must correct for this in all cases.
+        InsertPt = L->getHeader()->getFirstInsertionPt();
+      }
+    } else {
+      // If the SCEV is computable at this level, insert it into the header
+      // after the PHIs (and after any other instructions that we've inserted
+      // there) so that it is guaranteed to dominate any user inside the loop.
+      if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L))
+        InsertPt = L->getHeader()->getFirstInsertionPt();
+      while (InsertPt != Builder.GetInsertPoint()
+             && (isInsertedInstruction(InsertPt)
+                 || isa<DbgInfoIntrinsic>(InsertPt))) {
+        InsertPt = llvm::next(BasicBlock::iterator(InsertPt));
+      }
+      break;
+    }
+
+  // Check to see if we already expanded this here.
+  std::map<std::pair<const SCEV *, Instruction *>, TrackingVH<Value> >::iterator
+    I = InsertedExpressions.find(std::make_pair(S, InsertPt));
+  if (I != InsertedExpressions.end())
+    return I->second;
+
+  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  Builder.SetInsertPoint(InsertPt->getParent(), InsertPt);
+
+  // Expand the expression into instructions.
+  Value *V = visit(S);
+
+  // Remember the expanded value for this SCEV at this location.
+  //
+  // This is independent of PostIncLoops. The mapped value simply materializes
+  // the expression at this insertion point. If the mapped value happened to be
+  // a postinc expansion, it could be reused by a non postinc user, but only if
+  // its insertion point was already at the head of the loop.
+  InsertedExpressions[std::make_pair(S, InsertPt)] = V;
+
+  restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+  return V;
+}
+
+void SCEVExpander::rememberInstruction(Value *I) {
+  if (!PostIncLoops.empty())
+    InsertedPostIncValues.insert(I);
+  else
+    InsertedValues.insert(I);
+}
+
+void SCEVExpander::restoreInsertPoint(BasicBlock *BB, BasicBlock::iterator I) {
+  Builder.SetInsertPoint(BB, I);
+}
+
+/// getOrInsertCanonicalInductionVariable - This method returns the
+/// canonical induction variable of the specified type for the specified
+/// loop (inserting one if there is none).  A canonical induction variable
+/// starts at zero and steps by one on each iteration.
+PHINode *
+SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
+                                                    Type *Ty) {
+  assert(Ty->isIntegerTy() && "Can only insert integer induction variables!");
+
+  // Build a SCEV for {0,+,1}<L>.
+  // Conservatively use FlagAnyWrap for now.
+  const SCEV *H = SE.getAddRecExpr(SE.getConstant(Ty, 0),
+                                   SE.getConstant(Ty, 1), L, SCEV::FlagAnyWrap);
+
+  // Emit code for it.
+  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  PHINode *V = cast<PHINode>(expandCodeFor(H, 0, L->getHeader()->begin()));
+  if (SaveInsertBB)
+    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+
+  return V;
+}
+
+/// Sort values by integer width for replaceCongruentIVs.
+static bool width_descending(Value *lhs, Value *rhs) {
+  // Put pointers at the back and make sure pointer < pointer = false.
+  if (!lhs->getType()->isIntegerTy() || !rhs->getType()->isIntegerTy())
+    return rhs->getType()->isIntegerTy() && !lhs->getType()->isIntegerTy();
+  return rhs->getType()->getPrimitiveSizeInBits()
+    < lhs->getType()->getPrimitiveSizeInBits();
+}
+
+/// replaceCongruentIVs - Check for congruent phis in this loop header and
+/// replace them with their most canonical representative. Return the number of
+/// phis eliminated.
+///
+/// This does not depend on any SCEVExpander state but should be used in
+/// the same context that SCEVExpander is used.
+unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
+                                           SmallVectorImpl<WeakVH> &DeadInsts,
+                                           const TargetTransformInfo *TTI) {
+  // Find integer phis in order of increasing width.
+  SmallVector<PHINode*, 8> Phis;
+  for (BasicBlock::iterator I = L->getHeader()->begin();
+       PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
+    Phis.push_back(Phi);
+  }
+  if (TTI)
+    std::sort(Phis.begin(), Phis.end(), width_descending);
+
+  unsigned NumElim = 0;
+  DenseMap<const SCEV *, PHINode *> ExprToIVMap;
+  // Process phis from wide to narrow. Mapping wide phis to the their truncation
+  // so narrow phis can reuse them.
+  for (SmallVectorImpl<PHINode*>::const_iterator PIter = Phis.begin(),
+         PEnd = Phis.end(); PIter != PEnd; ++PIter) {
+    PHINode *Phi = *PIter;
+
+    // Fold constant phis. They may be congruent to other constant phis and
+    // would confuse the logic below that expects proper IVs.
+    if (Value *V = Phi->hasConstantValue()) {
+      Phi->replaceAllUsesWith(V);
+      DeadInsts.push_back(Phi);
+      ++NumElim;
+      DEBUG_WITH_TYPE(DebugType, dbgs()
+                      << "INDVARS: Eliminated constant iv: " << *Phi << '\n');
+      continue;
+    }
+
+    if (!SE.isSCEVable(Phi->getType()))
+      continue;
+
+    PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)];
+    if (!OrigPhiRef) {
+      OrigPhiRef = Phi;
+      if (Phi->getType()->isIntegerTy() && TTI
+          && TTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
+        // This phi can be freely truncated to the narrowest phi type. Map the
+        // truncated expression to it so it will be reused for narrow types.
+        const SCEV *TruncExpr =
+          SE.getTruncateExpr(SE.getSCEV(Phi), Phis.back()->getType());
+        ExprToIVMap[TruncExpr] = Phi;
+      }
+      continue;
+    }
+
+    // Replacing a pointer phi with an integer phi or vice-versa doesn't make
+    // sense.
+    if (OrigPhiRef->getType()->isPointerTy() != Phi->getType()->isPointerTy())
+      continue;
+
+    if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+      Instruction *OrigInc =
+        cast<Instruction>(OrigPhiRef->getIncomingValueForBlock(LatchBlock));
+      Instruction *IsomorphicInc =
+        cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock));
+
+      // If this phi has the same width but is more canonical, replace the
+      // original with it. As part of the "more canonical" determination,
+      // respect a prior decision to use an IV chain.
+      if (OrigPhiRef->getType() == Phi->getType()
+          && !(ChainedPhis.count(Phi)
+               || isExpandedAddRecExprPHI(OrigPhiRef, OrigInc, L))
+          && (ChainedPhis.count(Phi)
+              || isExpandedAddRecExprPHI(Phi, IsomorphicInc, L))) {
+        std::swap(OrigPhiRef, Phi);
+        std::swap(OrigInc, IsomorphicInc);
+      }
+      // Replacing the congruent phi is sufficient because acyclic redundancy
+      // elimination, CSE/GVN, should handle the rest. However, once SCEV proves
+      // that a phi is congruent, it's often the head of an IV user cycle that
+      // is isomorphic with the original phi. It's worth eagerly cleaning up the
+      // common case of a single IV increment so that DeleteDeadPHIs can remove
+      // cycles that had postinc uses.
+      const SCEV *TruncExpr = SE.getTruncateOrNoop(SE.getSCEV(OrigInc),
+                                                   IsomorphicInc->getType());
+      if (OrigInc != IsomorphicInc
+          && TruncExpr == SE.getSCEV(IsomorphicInc)
+          && ((isa<PHINode>(OrigInc) && isa<PHINode>(IsomorphicInc))
+              || hoistIVInc(OrigInc, IsomorphicInc))) {
+        DEBUG_WITH_TYPE(DebugType, dbgs()
+                        << "INDVARS: Eliminated congruent iv.inc: "
+                        << *IsomorphicInc << '\n');
+        Value *NewInc = OrigInc;
+        if (OrigInc->getType() != IsomorphicInc->getType()) {
+          Instruction *IP = isa<PHINode>(OrigInc)
+            ? (Instruction*)L->getHeader()->getFirstInsertionPt()
+            : OrigInc->getNextNode();
+          IRBuilder<> Builder(IP);
+          Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc());
+          NewInc = Builder.
+            CreateTruncOrBitCast(OrigInc, IsomorphicInc->getType(), IVName);
+        }
+        IsomorphicInc->replaceAllUsesWith(NewInc);
+        DeadInsts.push_back(IsomorphicInc);
+      }
+    }
+    DEBUG_WITH_TYPE(DebugType, dbgs()
+                    << "INDVARS: Eliminated congruent iv: " << *Phi << '\n');
+    ++NumElim;
+    Value *NewIV = OrigPhiRef;
+    if (OrigPhiRef->getType() != Phi->getType()) {
+      IRBuilder<> Builder(L->getHeader()->getFirstInsertionPt());
+      Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
+      NewIV = Builder.CreateTruncOrBitCast(OrigPhiRef, Phi->getType(), IVName);
+    }
+    Phi->replaceAllUsesWith(NewIV);
+    DeadInsts.push_back(Phi);
+  }
+  return NumElim;
+}
+
+namespace {
+// Search for a SCEV subexpression that is not safe to expand.  Any expression
+// that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely
+// UDiv expressions. We don't know if the UDiv is derived from an IR divide
+// instruction, but the important thing is that we prove the denominator is
+// nonzero before expansion.
+//
+// IVUsers already checks that IV-derived expressions are safe. So this check is
+// only needed when the expression includes some subexpression that is not IV
+// derived.
+//
+// Currently, we only allow division by a nonzero constant here. If this is
+// inadequate, we could easily allow division by SCEVUnknown by using
+// ValueTracking to check isKnownNonZero().
+struct SCEVFindUnsafe {
+  bool IsUnsafe;
+
+  SCEVFindUnsafe(): IsUnsafe(false) {}
+
+  bool follow(const SCEV *S) {
+    const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S);
+    if (!D)
+      return true;
+    const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
+    if (SC && !SC->getValue()->isZero())
+      return true;
+    IsUnsafe = true;
+    return false;
+  }
+  bool isDone() const { return IsUnsafe; }
+};
+}
+
+namespace llvm {
+bool isSafeToExpand(const SCEV *S) {
+  SCEVFindUnsafe Search;
+  visitAll(S, Search);
+  return !Search.IsUnsafe;
+}
+}
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
new file mode 100644
index 000000000000..dd2ed4ff831c
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -0,0 +1,223 @@
+//===- ScalarEvolutionNormalization.cpp - See below -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities for working with "normalized" expressions.
+// See the comments at the top of ScalarEvolutionNormalization.h for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionNormalization.h"
+using namespace llvm;
+
+/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
+/// and now we need to decide whether the user should use the preinc or post-inc
+/// value.  If this user should use the post-inc version of the IV, return true.
+///
+/// Choosing wrong here can break dominance properties (if we choose to use the
+/// post-inc value when we cannot) or it can end up adding extra live-ranges to
+/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
+/// should use the post-inc value).
+static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand,
+                                       const Loop *L, DominatorTree *DT) {
+  // If the user is in the loop, use the preinc value.
+  if (L->contains(User)) return false;
+
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  if (!LatchBlock)
+    return false;
+
+  // Ok, the user is outside of the loop.  If it is dominated by the latch
+  // block, use the post-inc value.
+  if (DT->dominates(LatchBlock, User->getParent()))
+    return true;
+
+  // There is one case we have to be careful of: PHI nodes.  These little guys
+  // can live in blocks that are not dominated by the latch block, but (since
+  // their uses occur in the predecessor block, not the block the PHI lives in)
+  // should still use the post-inc value.  Check for this case now.
+  PHINode *PN = dyn_cast<PHINode>(User);
+  if (!PN || !Operand) return false; // not a phi, not dominated by latch block.
+
+  // Look at all of the uses of Operand by the PHI node.  If any use corresponds
+  // to a block that is not dominated by the latch block, give up and use the
+  // preincremented value.
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) == Operand &&
+        !DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
+      return false;
+
+  // Okay, all uses of Operand by PN are in predecessor blocks that really are
+  // dominated by the latch block.  Use the post-incremented value.
+  return true;
+}
+
+namespace {
+
+/// Hold the state used during post-inc expression transformation, including a
+/// map of transformed expressions.
+class PostIncTransform {
+  TransformKind Kind;
+  PostIncLoopSet &Loops;
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+
+  DenseMap<const SCEV*, const SCEV*> Transformed;
+
+public:
+  PostIncTransform(TransformKind kind, PostIncLoopSet &loops,
+                   ScalarEvolution &se, DominatorTree &dt):
+    Kind(kind), Loops(loops), SE(se), DT(dt) {}
+
+  const SCEV *TransformSubExpr(const SCEV *S, Instruction *User,
+                               Value *OperandValToReplace);
+
+protected:
+  const SCEV *TransformImpl(const SCEV *S, Instruction *User,
+                            Value *OperandValToReplace);
+};
+
+} // namespace
+
+/// Implement post-inc transformation for all valid expression types.
+const SCEV *PostIncTransform::
+TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
+
+  if (const SCEVCastExpr *X = dyn_cast<SCEVCastExpr>(S)) {
+    const SCEV *O = X->getOperand();
+    const SCEV *N = TransformSubExpr(O, User, OperandValToReplace);
+    if (O != N)
+      switch (S->getSCEVType()) {
+      case scZeroExtend: return SE.getZeroExtendExpr(N, S->getType());
+      case scSignExtend: return SE.getSignExtendExpr(N, S->getType());
+      case scTruncate: return SE.getTruncateExpr(N, S->getType());
+      default: llvm_unreachable("Unexpected SCEVCastExpr kind!");
+      }
+    return S;
+  }
+
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    // An addrec. This is the interesting part.
+    SmallVector<const SCEV *, 8> Operands;
+    const Loop *L = AR->getLoop();
+    // The addrec conceptually uses its operands at loop entry.
+    Instruction *LUser = L->getHeader()->begin();
+    // Transform each operand.
+    for (SCEVNAryExpr::op_iterator I = AR->op_begin(), E = AR->op_end();
+         I != E; ++I) {
+      Operands.push_back(TransformSubExpr(*I, LUser, 0));
+    }
+    // Conservatively use AnyWrap until/unless we need FlagNW.
+    const SCEV *Result = SE.getAddRecExpr(Operands, L, SCEV::FlagAnyWrap);
+    switch (Kind) {
+    case NormalizeAutodetect:
+      if (IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
+        const SCEV *TransformedStep =
+          TransformSubExpr(AR->getStepRecurrence(SE),
+                           User, OperandValToReplace);
+        Result = SE.getMinusSCEV(Result, TransformedStep);
+        Loops.insert(L);
+      }
+#if 0
+      // This assert is conceptually correct, but ScalarEvolution currently
+      // sometimes fails to canonicalize two equal SCEVs to exactly the same
+      // form. It's possibly a pessimization when this happens, but it isn't a
+      // correctness problem, so disable this assert for now.
+      assert(S == TransformSubExpr(Result, User, OperandValToReplace) &&
+             "SCEV normalization is not invertible!");
+#endif
+      break;
+    case Normalize:
+      if (Loops.count(L)) {
+        const SCEV *TransformedStep =
+          TransformSubExpr(AR->getStepRecurrence(SE),
+                           User, OperandValToReplace);
+        Result = SE.getMinusSCEV(Result, TransformedStep);
+      }
+#if 0
+      // See the comment on the assert above.
+      assert(S == TransformSubExpr(Result, User, OperandValToReplace) &&
+             "SCEV normalization is not invertible!");
+#endif
+      break;
+    case Denormalize:
+      if (Loops.count(L))
+        Result = cast<SCEVAddRecExpr>(Result)->getPostIncExpr(SE);
+      break;
+    }
+    return Result;
+  }
+
+  if (const SCEVNAryExpr *X = dyn_cast<SCEVNAryExpr>(S)) {
+    SmallVector<const SCEV *, 8> Operands;
+    bool Changed = false;
+    // Transform each operand.
+    for (SCEVNAryExpr::op_iterator I = X->op_begin(), E = X->op_end();
+         I != E; ++I) {
+      const SCEV *O = *I;
+      const SCEV *N = TransformSubExpr(O, User, OperandValToReplace);
+      Changed |= N != O;
+      Operands.push_back(N);
+    }
+    // If any operand actually changed, return a transformed result.
+    if (Changed)
+      switch (S->getSCEVType()) {
+      case scAddExpr: return SE.getAddExpr(Operands);
+      case scMulExpr: return SE.getMulExpr(Operands);
+      case scSMaxExpr: return SE.getSMaxExpr(Operands);
+      case scUMaxExpr: return SE.getUMaxExpr(Operands);
+      default: llvm_unreachable("Unexpected SCEVNAryExpr kind!");
+      }
+    return S;
+  }
+
+  if (const SCEVUDivExpr *X = dyn_cast<SCEVUDivExpr>(S)) {
+    const SCEV *LO = X->getLHS();
+    const SCEV *RO = X->getRHS();
+    const SCEV *LN = TransformSubExpr(LO, User, OperandValToReplace);
+    const SCEV *RN = TransformSubExpr(RO, User, OperandValToReplace);
+    if (LO != LN || RO != RN)
+      return SE.getUDivExpr(LN, RN);
+    return S;
+  }
+
+  llvm_unreachable("Unexpected SCEV kind!");
+}
+
+/// Manage recursive transformation across an expression DAG. Revisiting
+/// expressions would lead to exponential recursion.
+const SCEV *PostIncTransform::
+TransformSubExpr(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
+
+  if (isa<SCEVConstant>(S) || isa<SCEVUnknown>(S))
+    return S;
+
+  const SCEV *Result = Transformed.lookup(S);
+  if (Result)
+    return Result;
+
+  Result = TransformImpl(S, User, OperandValToReplace);
+  Transformed[S] = Result;
+  return Result;
+}
+
+/// Top level driver for transforming an expression DAG into its requested
+/// post-inc form (either "Normalized" or "Denormalized".
+const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
+                                         const SCEV *S,
+                                         Instruction *User,
+                                         Value *OperandValToReplace,
+                                         PostIncLoopSet &Loops,
+                                         ScalarEvolution &SE,
+                                         DominatorTree &DT) {
+  PostIncTransform Transform(Kind, Loops, SE, DT);
+  return Transform.TransformSubExpr(S, User, OperandValToReplace);
+}
diff --git a/contrib/llvm/lib/Analysis/SparsePropagation.cpp b/contrib/llvm/lib/Analysis/SparsePropagation.cpp
new file mode 100644
index 000000000000..15b78728a73c
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/SparsePropagation.cpp
@@ -0,0 +1,347 @@
+//===- SparsePropagation.cpp - Sparse Conditional Property Propagation ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an abstract sparse conditional propagation algorithm,
+// modeled after SCCP, but with a customizable lattice function.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sparseprop"
+#include "llvm/Analysis/SparsePropagation.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                  AbstractLatticeFunction Implementation
+//===----------------------------------------------------------------------===//
+
+AbstractLatticeFunction::~AbstractLatticeFunction() {}
+
+/// PrintValue - Render the specified lattice value to the specified stream.
+void AbstractLatticeFunction::PrintValue(LatticeVal V, raw_ostream &OS) {
+  if (V == UndefVal)
+    OS << "undefined";
+  else if (V == OverdefinedVal)
+    OS << "overdefined";
+  else if (V == UntrackedVal)
+    OS << "untracked";
+  else
+    OS << "unknown lattice value";
+}
+
+//===----------------------------------------------------------------------===//
+//                          SparseSolver Implementation
+//===----------------------------------------------------------------------===//
+
+/// getOrInitValueState - Return the LatticeVal object that corresponds to the
+/// value, initializing the value's state if it hasn't been entered into the
+/// map yet.   This function is necessary because not all values should start
+/// out in the underdefined state... Arguments should be overdefined, and
+/// constants should be marked as constants.
+///
+SparseSolver::LatticeVal SparseSolver::getOrInitValueState(Value *V) {
+  DenseMap<Value*, LatticeVal>::iterator I = ValueState.find(V);
+  if (I != ValueState.end()) return I->second;  // Common case, in the map
+  
+  LatticeVal LV;
+  if (LatticeFunc->IsUntrackedValue(V))
+    return LatticeFunc->getUntrackedVal();
+  else if (Constant *C = dyn_cast<Constant>(V))
+    LV = LatticeFunc->ComputeConstant(C);
+  else if (Argument *A = dyn_cast<Argument>(V))
+    LV = LatticeFunc->ComputeArgument(A);
+  else if (!isa<Instruction>(V))
+    // All other non-instructions are overdefined.
+    LV = LatticeFunc->getOverdefinedVal();
+  else
+    // All instructions are underdefined by default.
+    LV = LatticeFunc->getUndefVal();
+  
+  // If this value is untracked, don't add it to the map.
+  if (LV == LatticeFunc->getUntrackedVal())
+    return LV;
+  return ValueState[V] = LV;
+}
+
+/// UpdateState - When the state for some instruction is potentially updated,
+/// this function notices and adds I to the worklist if needed.
+void SparseSolver::UpdateState(Instruction &Inst, LatticeVal V) {
+  DenseMap<Value*, LatticeVal>::iterator I = ValueState.find(&Inst);
+  if (I != ValueState.end() && I->second == V)
+    return;  // No change.
+  
+  // An update.  Visit uses of I.
+  ValueState[&Inst] = V;
+  InstWorkList.push_back(&Inst);
+}
+
+/// MarkBlockExecutable - This method can be used by clients to mark all of
+/// the blocks that are known to be intrinsically live in the processed unit.
+void SparseSolver::MarkBlockExecutable(BasicBlock *BB) {
+  DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n");
+  BBExecutable.insert(BB);   // Basic block is executable!
+  BBWorkList.push_back(BB);  // Add the block to the work list!
+}
+
+/// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
+/// work list if it is not already executable...
+void SparseSolver::markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+  if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+    return;  // This edge is already known to be executable!
+  
+  DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+        << " -> " << Dest->getName() << "\n");
+
+  if (BBExecutable.count(Dest)) {
+    // The destination is already executable, but we just made an edge
+    // feasible that wasn't before.  Revisit the PHI nodes in the block
+    // because they have potentially new operands.
+    for (BasicBlock::iterator I = Dest->begin(); isa<PHINode>(I); ++I)
+      visitPHINode(*cast<PHINode>(I));
+    
+  } else {
+    MarkBlockExecutable(Dest);
+  }
+}
+
+
+/// getFeasibleSuccessors - Return a vector of booleans to indicate which
+/// successors are reachable from a given terminator instruction.
+void SparseSolver::getFeasibleSuccessors(TerminatorInst &TI,
+                                         SmallVectorImpl<bool> &Succs,
+                                         bool AggressiveUndef) {
+  Succs.resize(TI.getNumSuccessors());
+  if (TI.getNumSuccessors() == 0) return;
+  
+  if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) {
+    if (BI->isUnconditional()) {
+      Succs[0] = true;
+      return;
+    }
+    
+    LatticeVal BCValue;
+    if (AggressiveUndef)
+      BCValue = getOrInitValueState(BI->getCondition());
+    else
+      BCValue = getLatticeState(BI->getCondition());
+    
+    if (BCValue == LatticeFunc->getOverdefinedVal() ||
+        BCValue == LatticeFunc->getUntrackedVal()) {
+      // Overdefined condition variables can branch either way.
+      Succs[0] = Succs[1] = true;
+      return;
+    }
+
+    // If undefined, neither is feasible yet.
+    if (BCValue == LatticeFunc->getUndefVal())
+      return;
+
+    Constant *C = LatticeFunc->GetConstant(BCValue, BI->getCondition(), *this);
+    if (C == 0 || !isa<ConstantInt>(C)) {
+      // Non-constant values can go either way.
+      Succs[0] = Succs[1] = true;
+      return;
+    }
+
+    // Constant condition variables mean the branch can only go a single way
+    Succs[C->isNullValue()] = true;
+    return;
+  }
+  
+  if (isa<InvokeInst>(TI)) {
+    // Invoke instructions successors are always executable.
+    // TODO: Could ask the lattice function if the value can throw.
+    Succs[0] = Succs[1] = true;
+    return;
+  }
+  
+  if (isa<IndirectBrInst>(TI)) {
+    Succs.assign(Succs.size(), true);
+    return;
+  }
+  
+  SwitchInst &SI = cast<SwitchInst>(TI);
+  LatticeVal SCValue;
+  if (AggressiveUndef)
+    SCValue = getOrInitValueState(SI.getCondition());
+  else
+    SCValue = getLatticeState(SI.getCondition());
+  
+  if (SCValue == LatticeFunc->getOverdefinedVal() ||
+      SCValue == LatticeFunc->getUntrackedVal()) {
+    // All destinations are executable!
+    Succs.assign(TI.getNumSuccessors(), true);
+    return;
+  }
+  
+  // If undefined, neither is feasible yet.
+  if (SCValue == LatticeFunc->getUndefVal())
+    return;
+  
+  Constant *C = LatticeFunc->GetConstant(SCValue, SI.getCondition(), *this);
+  if (C == 0 || !isa<ConstantInt>(C)) {
+    // All destinations are executable!
+    Succs.assign(TI.getNumSuccessors(), true);
+    return;
+  }
+  SwitchInst::CaseIt Case = SI.findCaseValue(cast<ConstantInt>(C));
+  Succs[Case.getSuccessorIndex()] = true;
+}
+
+
+/// isEdgeFeasible - Return true if the control flow edge from the 'From'
+/// basic block to the 'To' basic block is currently feasible...
+bool SparseSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To,
+                                  bool AggressiveUndef) {
+  SmallVector<bool, 16> SuccFeasible;
+  TerminatorInst *TI = From->getTerminator();
+  getFeasibleSuccessors(*TI, SuccFeasible, AggressiveUndef);
+  
+  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+    if (TI->getSuccessor(i) == To && SuccFeasible[i])
+      return true;
+  
+  return false;
+}
+
+void SparseSolver::visitTerminatorInst(TerminatorInst &TI) {
+  SmallVector<bool, 16> SuccFeasible;
+  getFeasibleSuccessors(TI, SuccFeasible, true);
+  
+  BasicBlock *BB = TI.getParent();
+  
+  // Mark all feasible successors executable...
+  for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+    if (SuccFeasible[i])
+      markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SparseSolver::visitPHINode(PHINode &PN) {
+  // The lattice function may store more information on a PHINode than could be
+  // computed from its incoming values.  For example, SSI form stores its sigma
+  // functions as PHINodes with a single incoming value.
+  if (LatticeFunc->IsSpecialCasedPHI(&PN)) {
+    LatticeVal IV = LatticeFunc->ComputeInstructionState(PN, *this);
+    if (IV != LatticeFunc->getUntrackedVal())
+      UpdateState(PN, IV);
+    return;
+  }
+
+  LatticeVal PNIV = getOrInitValueState(&PN);
+  LatticeVal Overdefined = LatticeFunc->getOverdefinedVal();
+  
+  // If this value is already overdefined (common) just return.
+  if (PNIV == Overdefined || PNIV == LatticeFunc->getUntrackedVal())
+    return;  // Quick exit
+  
+  // Super-extra-high-degree PHI nodes are unlikely to ever be interesting,
+  // and slow us down a lot.  Just mark them overdefined.
+  if (PN.getNumIncomingValues() > 64) {
+    UpdateState(PN, Overdefined);
+    return;
+  }
+  
+  // Look at all of the executable operands of the PHI node.  If any of them
+  // are overdefined, the PHI becomes overdefined as well.  Otherwise, ask the
+  // transfer function to give us the merge of the incoming values.
+  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+    // If the edge is not yet known to be feasible, it doesn't impact the PHI.
+    if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent(), true))
+      continue;
+    
+    // Merge in this value.
+    LatticeVal OpVal = getOrInitValueState(PN.getIncomingValue(i));
+    if (OpVal != PNIV)
+      PNIV = LatticeFunc->MergeValues(PNIV, OpVal);
+    
+    if (PNIV == Overdefined)
+      break;  // Rest of input values don't matter.
+  }
+
+  // Update the PHI with the compute value, which is the merge of the inputs.
+  UpdateState(PN, PNIV);
+}
+
+
+void SparseSolver::visitInst(Instruction &I) {
+  // PHIs are handled by the propagation logic, they are never passed into the
+  // transfer functions.
+  if (PHINode *PN = dyn_cast<PHINode>(&I))
+    return visitPHINode(*PN);
+  
+  // Otherwise, ask the transfer function what the result is.  If this is
+  // something that we care about, remember it.
+  LatticeVal IV = LatticeFunc->ComputeInstructionState(I, *this);
+  if (IV != LatticeFunc->getUntrackedVal())
+    UpdateState(I, IV);
+  
+  if (TerminatorInst *TI = dyn_cast<TerminatorInst>(&I))
+    visitTerminatorInst(*TI);
+}
+
+void SparseSolver::Solve(Function &F) {
+  MarkBlockExecutable(&F.getEntryBlock());
+  
+  // Process the work lists until they are empty!
+  while (!BBWorkList.empty() || !InstWorkList.empty()) {
+    // Process the instruction work list.
+    while (!InstWorkList.empty()) {
+      Instruction *I = InstWorkList.back();
+      InstWorkList.pop_back();
+
+      DEBUG(dbgs() << "\nPopped off I-WL: " << *I << "\n");
+
+      // "I" got into the work list because it made a transition.  See if any
+      // users are both live and in need of updating.
+      for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+           UI != E; ++UI) {
+        Instruction *U = cast<Instruction>(*UI);
+        if (BBExecutable.count(U->getParent()))   // Inst is executable?
+          visitInst(*U);
+      }
+    }
+
+    // Process the basic block work list.
+    while (!BBWorkList.empty()) {
+      BasicBlock *BB = BBWorkList.back();
+      BBWorkList.pop_back();
+
+      DEBUG(dbgs() << "\nPopped off BBWL: " << *BB);
+
+      // Notify all instructions in this basic block that they are newly
+      // executable.
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        visitInst(*I);
+    }
+  }
+}
+
+void SparseSolver::Print(Function &F, raw_ostream &OS) const {
+  OS << "\nFUNCTION: " << F.getName() << "\n";
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (!BBExecutable.count(BB))
+      OS << "INFEASIBLE: ";
+    OS << "\t";
+    if (BB->hasName())
+      OS << BB->getName() << ":\n";
+    else
+      OS << "; anon bb\n";
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      LatticeFunc->PrintValue(getLatticeState(I), OS);
+      OS << *I << "\n";
+    }
+    
+    OS << "\n";
+  }
+}
+
diff --git a/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp
new file mode 100644
index 000000000000..64f8e96884c7
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -0,0 +1,558 @@
+//===- llvm/Analysis/TargetTransformInfo.cpp ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tti"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+// Setup the analysis group to manage the TargetTransformInfo passes.
+INITIALIZE_ANALYSIS_GROUP(TargetTransformInfo, "Target Information", NoTTI)
+char TargetTransformInfo::ID = 0;
+
+TargetTransformInfo::~TargetTransformInfo() {
+}
+
+void TargetTransformInfo::pushTTIStack(Pass *P) {
+  TopTTI = this;
+  PrevTTI = &P->getAnalysis<TargetTransformInfo>();
+
+  // Walk up the chain and update the top TTI pointer.
+  for (TargetTransformInfo *PTTI = PrevTTI; PTTI; PTTI = PTTI->PrevTTI)
+    PTTI->TopTTI = this;
+}
+
+void TargetTransformInfo::popTTIStack() {
+  TopTTI = 0;
+
+  // Walk up the chain and update the top TTI pointer.
+  for (TargetTransformInfo *PTTI = PrevTTI; PTTI; PTTI = PTTI->PrevTTI)
+    PTTI->TopTTI = PrevTTI;
+
+  PrevTTI = 0;
+}
+
+void TargetTransformInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetTransformInfo>();
+}
+
+unsigned TargetTransformInfo::getOperationCost(unsigned Opcode, Type *Ty,
+                                               Type *OpTy) const {
+  return PrevTTI->getOperationCost(Opcode, Ty, OpTy);
+}
+
+unsigned TargetTransformInfo::getGEPCost(
+    const Value *Ptr, ArrayRef<const Value *> Operands) const {
+  return PrevTTI->getGEPCost(Ptr, Operands);
+}
+
+unsigned TargetTransformInfo::getCallCost(FunctionType *FTy,
+                                          int NumArgs) const {
+  return PrevTTI->getCallCost(FTy, NumArgs);
+}
+
+unsigned TargetTransformInfo::getCallCost(const Function *F,
+                                          int NumArgs) const {
+  return PrevTTI->getCallCost(F, NumArgs);
+}
+
+unsigned TargetTransformInfo::getCallCost(
+    const Function *F, ArrayRef<const Value *> Arguments) const {
+  return PrevTTI->getCallCost(F, Arguments);
+}
+
+unsigned TargetTransformInfo::getIntrinsicCost(
+    Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> ParamTys) const {
+  return PrevTTI->getIntrinsicCost(IID, RetTy, ParamTys);
+}
+
+unsigned TargetTransformInfo::getIntrinsicCost(
+    Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments) const {
+  return PrevTTI->getIntrinsicCost(IID, RetTy, Arguments);
+}
+
+unsigned TargetTransformInfo::getUserCost(const User *U) const {
+  return PrevTTI->getUserCost(U);
+}
+
+bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
+  return PrevTTI->isLoweredToCall(F);
+}
+
+bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
+  return PrevTTI->isLegalAddImmediate(Imm);
+}
+
+bool TargetTransformInfo::isLegalICmpImmediate(int64_t Imm) const {
+  return PrevTTI->isLegalICmpImmediate(Imm);
+}
+
+bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
+                                                int64_t BaseOffset,
+                                                bool HasBaseReg,
+                                                int64_t Scale) const {
+  return PrevTTI->isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
+                                        Scale);
+}
+
+bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  return PrevTTI->isTruncateFree(Ty1, Ty2);
+}
+
+bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
+  return PrevTTI->isTypeLegal(Ty);
+}
+
+unsigned TargetTransformInfo::getJumpBufAlignment() const {
+  return PrevTTI->getJumpBufAlignment();
+}
+
+unsigned TargetTransformInfo::getJumpBufSize() const {
+  return PrevTTI->getJumpBufSize();
+}
+
+bool TargetTransformInfo::shouldBuildLookupTables() const {
+  return PrevTTI->shouldBuildLookupTables();
+}
+
+TargetTransformInfo::PopcntSupportKind
+TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
+  return PrevTTI->getPopcntSupport(IntTyWidthInBit);
+}
+
+unsigned TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  return PrevTTI->getIntImmCost(Imm, Ty);
+}
+
+unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const {
+  return PrevTTI->getNumberOfRegisters(Vector);
+}
+
+unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
+  return PrevTTI->getRegisterBitWidth(Vector);
+}
+
+unsigned TargetTransformInfo::getMaximumUnrollFactor() const {
+  return PrevTTI->getMaximumUnrollFactor();
+}
+
+unsigned TargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
+                                                Type *Ty,
+                                                OperandValueKind Op1Info,
+                                                OperandValueKind Op2Info) const {
+  return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+}
+
+unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Tp,
+                                             int Index, Type *SubTp) const {
+  return PrevTTI->getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+unsigned TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                               Type *Src) const {
+  return PrevTTI->getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
+  return PrevTTI->getCFInstrCost(Opcode);
+}
+
+unsigned TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                                 Type *CondTy) const {
+  return PrevTTI->getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                 unsigned Index) const {
+  return PrevTTI->getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                              unsigned Alignment,
+                                              unsigned AddressSpace) const {
+  return PrevTTI->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+  ;
+}
+
+unsigned
+TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID,
+                                           Type *RetTy,
+                                           ArrayRef<Type *> Tys) const {
+  return PrevTTI->getIntrinsicInstrCost(ID, RetTy, Tys);
+}
+
+unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
+  return PrevTTI->getNumberOfParts(Tp);
+}
+
+unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp) const {
+  return PrevTTI->getAddressComputationCost(Tp);
+}
+
+namespace {
+
+struct NoTTI : ImmutablePass, TargetTransformInfo {
+  const DataLayout *DL;
+
+  NoTTI() : ImmutablePass(ID), DL(0) {
+    initializeNoTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() {
+    // Note that this subclass is special, and must *not* call initializeTTI as
+    // it does not chain.
+    TopTTI = this;
+    PrevTTI = 0;
+    DL = getAnalysisIfAvailable<DataLayout>();
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    // Note that this subclass is special, and must *not* call
+    // TTI::getAnalysisUsage as it breaks the recursion.
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) const {
+    switch (Opcode) {
+    default:
+      // By default, just classify everything as 'basic'.
+      return TCC_Basic;
+
+    case Instruction::GetElementPtr:
+      llvm_unreachable("Use getGEPCost for GEP operations!");
+
+    case Instruction::BitCast:
+      assert(OpTy && "Cast instructions must provide the operand type");
+      if (Ty == OpTy || (Ty->isPointerTy() && OpTy->isPointerTy()))
+        // Identity and pointer-to-pointer casts are free.
+        return TCC_Free;
+
+      // Otherwise, the default basic cost is used.
+      return TCC_Basic;
+
+    case Instruction::IntToPtr:
+      // An inttoptr cast is free so long as the input is a legal integer type
+      // which doesn't contain values outside the range of a pointer.
+      if (DL && DL->isLegalInteger(OpTy->getScalarSizeInBits()) &&
+          OpTy->getScalarSizeInBits() <= DL->getPointerSizeInBits())
+        return TCC_Free;
+
+      // Otherwise it's not a no-op.
+      return TCC_Basic;
+
+    case Instruction::PtrToInt:
+      // A ptrtoint cast is free so long as the result is large enough to store
+      // the pointer, and a legal integer type.
+      if (DL && DL->isLegalInteger(Ty->getScalarSizeInBits()) &&
+          Ty->getScalarSizeInBits() >= DL->getPointerSizeInBits())
+        return TCC_Free;
+
+      // Otherwise it's not a no-op.
+      return TCC_Basic;
+
+    case Instruction::Trunc:
+      // trunc to a native type is free (assuming the target has compare and
+      // shift-right of the same width).
+      if (DL && DL->isLegalInteger(DL->getTypeSizeInBits(Ty)))
+        return TCC_Free;
+
+      return TCC_Basic;
+    }
+  }
+
+  unsigned getGEPCost(const Value *Ptr,
+                      ArrayRef<const Value *> Operands) const {
+    // In the basic model, we just assume that all-constant GEPs will be folded
+    // into their uses via addressing modes.
+    for (unsigned Idx = 0, Size = Operands.size(); Idx != Size; ++Idx)
+      if (!isa<Constant>(Operands[Idx]))
+        return TCC_Basic;
+
+    return TCC_Free;
+  }
+
+  unsigned getCallCost(FunctionType *FTy, int NumArgs = -1) const {
+    assert(FTy && "FunctionType must be provided to this routine.");
+
+    // The target-independent implementation just measures the size of the
+    // function by approximating that each argument will take on average one
+    // instruction to prepare.
+
+    if (NumArgs < 0)
+      // Set the argument number to the number of explicit arguments in the
+      // function.
+      NumArgs = FTy->getNumParams();
+
+    return TCC_Basic * (NumArgs + 1);
+  }
+
+  unsigned getCallCost(const Function *F, int NumArgs = -1) const {
+    assert(F && "A concrete function must be provided to this routine.");
+
+    if (NumArgs < 0)
+      // Set the argument number to the number of explicit arguments in the
+      // function.
+      NumArgs = F->arg_size();
+
+    if (Intrinsic::ID IID = (Intrinsic::ID)F->getIntrinsicID()) {
+      FunctionType *FTy = F->getFunctionType();
+      SmallVector<Type *, 8> ParamTys(FTy->param_begin(), FTy->param_end());
+      return TopTTI->getIntrinsicCost(IID, FTy->getReturnType(), ParamTys);
+    }
+
+    if (!TopTTI->isLoweredToCall(F))
+      return TCC_Basic; // Give a basic cost if it will be lowered directly.
+
+    return TopTTI->getCallCost(F->getFunctionType(), NumArgs);
+  }
+
+  unsigned getCallCost(const Function *F,
+                       ArrayRef<const Value *> Arguments) const {
+    // Simply delegate to generic handling of the call.
+    // FIXME: We should use instsimplify or something else to catch calls which
+    // will constant fold with these arguments.
+    return TopTTI->getCallCost(F, Arguments.size());
+  }
+
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<Type *> ParamTys) const {
+    switch (IID) {
+    default:
+      // Intrinsics rarely (if ever) have normal argument setup constraints.
+      // Model them as having a basic instruction cost.
+      // FIXME: This is wrong for libc intrinsics.
+      return TCC_Basic;
+
+    case Intrinsic::dbg_declare:
+    case Intrinsic::dbg_value:
+    case Intrinsic::invariant_start:
+    case Intrinsic::invariant_end:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::objectsize:
+    case Intrinsic::ptr_annotation:
+    case Intrinsic::var_annotation:
+      // These intrinsics don't actually represent code after lowering.
+      return TCC_Free;
+    }
+  }
+
+  unsigned getIntrinsicCost(Intrinsic::ID IID, Type *RetTy,
+                            ArrayRef<const Value *> Arguments) const {
+    // Delegate to the generic intrinsic handling code. This mostly provides an
+    // opportunity for targets to (for example) special case the cost of
+    // certain intrinsics based on constants used as arguments.
+    SmallVector<Type *, 8> ParamTys;
+    ParamTys.reserve(Arguments.size());
+    for (unsigned Idx = 0, Size = Arguments.size(); Idx != Size; ++Idx)
+      ParamTys.push_back(Arguments[Idx]->getType());
+    return TopTTI->getIntrinsicCost(IID, RetTy, ParamTys);
+  }
+
+  unsigned getUserCost(const User *U) const {
+    if (isa<PHINode>(U))
+      return TCC_Free; // Model all PHI nodes as free.
+
+    if (const GEPOperator *GEP = dyn_cast<GEPOperator>(U))
+      // In the basic model we just assume that all-constant GEPs will be
+      // folded into their uses via addressing modes.
+      return GEP->hasAllConstantIndices() ? TCC_Free : TCC_Basic;
+
+    if (ImmutableCallSite CS = U) {
+      const Function *F = CS.getCalledFunction();
+      if (!F) {
+        // Just use the called value type.
+        Type *FTy = CS.getCalledValue()->getType()->getPointerElementType();
+        return TopTTI->getCallCost(cast<FunctionType>(FTy), CS.arg_size());
+      }
+
+      SmallVector<const Value *, 8> Arguments;
+      for (ImmutableCallSite::arg_iterator AI = CS.arg_begin(),
+                                           AE = CS.arg_end();
+           AI != AE; ++AI)
+        Arguments.push_back(*AI);
+
+      return TopTTI->getCallCost(F, Arguments);
+    }
+
+    if (const CastInst *CI = dyn_cast<CastInst>(U)) {
+      // Result of a cmp instruction is often extended (to be used by other
+      // cmp instructions, logical or return instructions). These are usually
+      // nop on most sane targets.
+      if (isa<CmpInst>(CI->getOperand(0)))
+        return TCC_Free;
+    }
+
+    // Otherwise delegate to the fully generic implementations.
+    return getOperationCost(Operator::getOpcode(U), U->getType(),
+                            U->getNumOperands() == 1 ?
+                                U->getOperand(0)->getType() : 0);
+  }
+
+  bool isLoweredToCall(const Function *F) const {
+    // FIXME: These should almost certainly not be handled here, and instead
+    // handled with the help of TLI or the target itself. This was largely
+    // ported from existing analysis heuristics here so that such refactorings
+    // can take place in the future.
+
+    if (F->isIntrinsic())
+      return false;
+
+    if (F->hasLocalLinkage() || !F->hasName())
+      return true;
+
+    StringRef Name = F->getName();
+
+    // These will all likely lower to a single selection DAG node.
+    if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" ||
+        Name == "fabs" || Name == "fabsf" || Name == "fabsl" || Name == "sin" ||
+        Name == "sinf" || Name == "sinl" || Name == "cos" || Name == "cosf" ||
+        Name == "cosl" || Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl")
+      return false;
+
+    // These are all likely to be optimized into something smaller.
+    if (Name == "pow" || Name == "powf" || Name == "powl" || Name == "exp2" ||
+        Name == "exp2l" || Name == "exp2f" || Name == "floor" || Name ==
+        "floorf" || Name == "ceil" || Name == "round" || Name == "ffs" ||
+        Name == "ffsl" || Name == "abs" || Name == "labs" || Name == "llabs")
+      return false;
+
+    return true;
+  }
+
+  bool isLegalAddImmediate(int64_t Imm) const {
+    return false;
+  }
+
+  bool isLegalICmpImmediate(int64_t Imm) const {
+    return false;
+  }
+
+  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                             bool HasBaseReg, int64_t Scale) const {
+    // Guess that reg+reg addressing is allowed. This heuristic is taken from
+    // the implementation of LSR.
+    return !BaseGV && BaseOffset == 0 && Scale <= 1;
+  }
+
+  bool isTruncateFree(Type *Ty1, Type *Ty2) const {
+    return false;
+  }
+
+  bool isTypeLegal(Type *Ty) const {
+    return false;
+  }
+
+  unsigned getJumpBufAlignment() const {
+    return 0;
+  }
+
+  unsigned getJumpBufSize() const {
+    return 0;
+  }
+
+  bool shouldBuildLookupTables() const {
+    return true;
+  }
+
+  PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const {
+    return PSK_Software;
+  }
+
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const {
+    return 1;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector) const {
+    return 8;
+  }
+
+  unsigned  getRegisterBitWidth(bool Vector) const {
+    return 32;
+  }
+
+  unsigned getMaximumUnrollFactor() const {
+    return 1;
+  }
+
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
+                                  OperandValueKind) const {
+    return 1;
+  }
+
+  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                          int Index = 0, Type *SubTp = 0) const {
+    return 1;
+  }
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                            Type *Src) const {
+    return 1;
+  }
+
+  unsigned getCFInstrCost(unsigned Opcode) const {
+    return 1;
+  }
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                              Type *CondTy = 0) const {
+    return 1;
+  }
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                              unsigned Index = -1) const {
+    return 1;
+  }
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+                           unsigned Alignment,
+                           unsigned AddressSpace) const {
+    return 1;
+  }
+
+  unsigned getIntrinsicInstrCost(Intrinsic::ID ID,
+                                 Type *RetTy,
+                                 ArrayRef<Type*> Tys) const {
+    return 1;
+  }
+
+  unsigned getNumberOfParts(Type *Tp) const {
+    return 0;
+  }
+
+  unsigned getAddressComputationCost(Type *Tp) const {
+    return 0;
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(NoTTI, TargetTransformInfo, "notti",
+                   "No target information", true, true, true)
+char NoTTI::ID = 0;
+
+ImmutablePass *llvm::createNoTargetTransformInfoPass() {
+  return new NoTTI();
+}
diff --git a/contrib/llvm/lib/Analysis/Trace.cpp b/contrib/llvm/lib/Analysis/Trace.cpp
new file mode 100644
index 000000000000..4c68322b8282
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/Trace.cpp
@@ -0,0 +1,53 @@
+//===- Trace.cpp - Implementation of Trace class --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class represents a single trace of LLVM basic blocks.  A trace is a
+// single entry, multiple exit, region of code that is often hot.  Trace-based
+// optimizations treat traces almost like they are a large, strange, basic
+// block: because the trace path is assumed to be hot, optimizations for the
+// fall-through path are made at the expense of the non-fall-through paths.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Trace.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+Function *Trace::getFunction() const {
+  return getEntryBasicBlock()->getParent();
+}
+
+Module *Trace::getModule() const {
+  return getFunction()->getParent();
+}
+
+/// print - Write trace to output stream.
+///
+void Trace::print(raw_ostream &O) const {
+  Function *F = getFunction();
+  O << "; Trace from function " << F->getName() << ", blocks:\n";
+  for (const_iterator i = begin(), e = end(); i != e; ++i) {
+    O << "; ";
+    WriteAsOperand(O, *i, true, getModule());
+    O << "\n";
+  }
+  O << "; Trace parent function: \n" << *F;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// dump - Debugger convenience method; writes trace to standard error
+/// output stream.
+///
+void Trace::dump() const {
+  print(dbgs());
+}
+#endif
diff --git a/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
new file mode 100644
index 000000000000..68e43b2cdb63
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -0,0 +1,300 @@
+//===- TypeBasedAliasAnalysis.cpp - Type-Based Alias Analysis -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the TypeBasedAliasAnalysis pass, which implements
+// metadata-based TBAA.
+//
+// In LLVM IR, memory does not have types, so LLVM's own type system is not
+// suitable for doing TBAA. Instead, metadata is added to the IR to describe
+// a type system of a higher level language. This can be used to implement
+// typical C/C++ TBAA, but it can also be used to implement custom alias
+// analysis behavior for other languages.
+//
+// The current metadata format is very simple. TBAA MDNodes have up to
+// three fields, e.g.:
+//   !0 = metadata !{ metadata !"an example type tree" }
+//   !1 = metadata !{ metadata !"int", metadata !0 }
+//   !2 = metadata !{ metadata !"float", metadata !0 }
+//   !3 = metadata !{ metadata !"const float", metadata !2, i64 1 }
+//
+// The first field is an identity field. It can be any value, usually
+// an MDString, which uniquely identifies the type. The most important
+// name in the tree is the name of the root node. Two trees with
+// different root node names are entirely disjoint, even if they
+// have leaves with common names.
+//
+// The second field identifies the type's parent node in the tree, or
+// is null or omitted for a root node. A type is considered to alias
+// all of its descendants and all of its ancestors in the tree. Also,
+// a type is considered to alias all types in other trees, so that
+// bitcode produced from multiple front-ends is handled conservatively.
+//
+// If the third field is present, it's an integer which if equal to 1
+// indicates that the type is "constant" (meaning pointsToConstantMemory
+// should return true; see
+// http://llvm.org/docs/AliasAnalysis.html#OtherItfs).
+//
+// TODO: The current metadata format doesn't support struct
+// fields. For example:
+//   struct X {
+//     double d;
+//     int i;
+//   };
+//   void foo(struct X *x, struct X *y, double *p) {
+//     *x = *y;
+//     *p = 0.0;
+//   }
+// Struct X has a double member, so the store to *x can alias the store to *p.
+// Currently it's not possible to precisely describe all the things struct X
+// aliases, so struct assignments must use conservative TBAA nodes. There's
+// no scheme for attaching metadata to @llvm.memcpy yet either.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+// A handy option for disabling TBAA functionality. The same effect can also be
+// achieved by stripping the !tbaa tags from IR, but this option is sometimes
+// more convenient.
+static cl::opt<bool> EnableTBAA("enable-tbaa", cl::init(true));
+
+namespace {
+  /// TBAANode - This is a simple wrapper around an MDNode which provides a
+  /// higher-level interface by hiding the details of how alias analysis
+  /// information is encoded in its operands.
+  class TBAANode {
+    const MDNode *Node;
+
+  public:
+    TBAANode() : Node(0) {}
+    explicit TBAANode(const MDNode *N) : Node(N) {}
+
+    /// getNode - Get the MDNode for this TBAANode.
+    const MDNode *getNode() const { return Node; }
+
+    /// getParent - Get this TBAANode's Alias tree parent.
+    TBAANode getParent() const {
+      if (Node->getNumOperands() < 2)
+        return TBAANode();
+      MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1));
+      if (!P)
+        return TBAANode();
+      // Ok, this node has a valid parent. Return it.
+      return TBAANode(P);
+    }
+
+    /// TypeIsImmutable - Test if this TBAANode represents a type for objects
+    /// which are not modified (by any means) in the context where this
+    /// AliasAnalysis is relevant.
+    bool TypeIsImmutable() const {
+      if (Node->getNumOperands() < 3)
+        return false;
+      ConstantInt *CI = dyn_cast<ConstantInt>(Node->getOperand(2));
+      if (!CI)
+        return false;
+      return CI->getValue()[0];
+    }
+  };
+}
+
+namespace {
+  /// TypeBasedAliasAnalysis - This is a simple alias analysis
+  /// implementation that uses TypeBased to answer queries.
+  class TypeBasedAliasAnalysis : public ImmutablePass,
+                                 public AliasAnalysis {
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    TypeBasedAliasAnalysis() : ImmutablePass(ID) {
+      initializeTypeBasedAliasAnalysisPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void initializePass() {
+      InitializeAliasAnalysis(this);
+    }
+
+    /// getAdjustedAnalysisPointer - This method is used when a pass implements
+    /// an analysis interface through multiple inheritance.  If needed, it
+    /// should override this to adjust the this pointer as needed for the
+    /// specified pass info.
+    virtual void *getAdjustedAnalysisPointer(const void *PI) {
+      if (PI == &AliasAnalysis::ID)
+        return (AliasAnalysis*)this;
+      return this;
+    }
+
+    bool Aliases(const MDNode *A, const MDNode *B) const;
+
+  private:
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual AliasResult alias(const Location &LocA, const Location &LocB);
+    virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal);
+    virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS);
+    virtual ModRefBehavior getModRefBehavior(const Function *F);
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS,
+                                       const Location &Loc);
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS1,
+                                       ImmutableCallSite CS2);
+  };
+}  // End of anonymous namespace
+
+// Register this pass...
+char TypeBasedAliasAnalysis::ID = 0;
+INITIALIZE_AG_PASS(TypeBasedAliasAnalysis, AliasAnalysis, "tbaa",
+                   "Type-Based Alias Analysis", false, true, false)
+
+ImmutablePass *llvm::createTypeBasedAliasAnalysisPass() {
+  return new TypeBasedAliasAnalysis();
+}
+
+void
+TypeBasedAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AliasAnalysis::getAnalysisUsage(AU);
+}
+
+/// Aliases - Test whether the type represented by A may alias the
+/// type represented by B.
+bool
+TypeBasedAliasAnalysis::Aliases(const MDNode *A,
+                                const MDNode *B) const {
+  // Keep track of the root node for A and B.
+  TBAANode RootA, RootB;
+
+  // Climb the tree from A to see if we reach B.
+  for (TBAANode T(A); ; ) {
+    if (T.getNode() == B)
+      // B is an ancestor of A.
+      return true;
+
+    RootA = T;
+    T = T.getParent();
+    if (!T.getNode())
+      break;
+  }
+
+  // Climb the tree from B to see if we reach A.
+  for (TBAANode T(B); ; ) {
+    if (T.getNode() == A)
+      // A is an ancestor of B.
+      return true;
+
+    RootB = T;
+    T = T.getParent();
+    if (!T.getNode())
+      break;
+  }
+
+  // Neither node is an ancestor of the other.
+  
+  // If they have different roots, they're part of different potentially
+  // unrelated type systems, so we must be conservative.
+  if (RootA.getNode() != RootB.getNode())
+    return true;
+
+  // If they have the same root, then we've proved there's no alias.
+  return false;
+}
+
+AliasAnalysis::AliasResult
+TypeBasedAliasAnalysis::alias(const Location &LocA,
+                              const Location &LocB) {
+  if (!EnableTBAA)
+    return AliasAnalysis::alias(LocA, LocB);
+
+  // Get the attached MDNodes. If either value lacks a tbaa MDNode, we must
+  // be conservative.
+  const MDNode *AM = LocA.TBAATag;
+  if (!AM) return AliasAnalysis::alias(LocA, LocB);
+  const MDNode *BM = LocB.TBAATag;
+  if (!BM) return AliasAnalysis::alias(LocA, LocB);
+
+  // If they may alias, chain to the next AliasAnalysis.
+  if (Aliases(AM, BM))
+    return AliasAnalysis::alias(LocA, LocB);
+
+  // Otherwise return a definitive result.
+  return NoAlias;
+}
+
+bool TypeBasedAliasAnalysis::pointsToConstantMemory(const Location &Loc,
+                                                    bool OrLocal) {
+  if (!EnableTBAA)
+    return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+
+  const MDNode *M = Loc.TBAATag;
+  if (!M) return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+
+  // If this is an "immutable" type, we can assume the pointer is pointing
+  // to constant memory.
+  if (TBAANode(M).TypeIsImmutable())
+    return true;
+
+  return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+}
+
+AliasAnalysis::ModRefBehavior
+TypeBasedAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
+  if (!EnableTBAA)
+    return AliasAnalysis::getModRefBehavior(CS);
+
+  ModRefBehavior Min = UnknownModRefBehavior;
+
+  // If this is an "immutable" type, we can assume the call doesn't write
+  // to memory.
+  if (const MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
+    if (TBAANode(M).TypeIsImmutable())
+      Min = OnlyReadsMemory;
+
+  return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min);
+}
+
+AliasAnalysis::ModRefBehavior
+TypeBasedAliasAnalysis::getModRefBehavior(const Function *F) {
+  // Functions don't have metadata. Just chain to the next implementation.
+  return AliasAnalysis::getModRefBehavior(F);
+}
+
+AliasAnalysis::ModRefResult
+TypeBasedAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
+                                      const Location &Loc) {
+  if (!EnableTBAA)
+    return AliasAnalysis::getModRefInfo(CS, Loc);
+
+  if (const MDNode *L = Loc.TBAATag)
+    if (const MDNode *M =
+          CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
+      if (!Aliases(L, M))
+        return NoModRef;
+
+  return AliasAnalysis::getModRefInfo(CS, Loc);
+}
+
+AliasAnalysis::ModRefResult
+TypeBasedAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
+                                      ImmutableCallSite CS2) {
+  if (!EnableTBAA)
+    return AliasAnalysis::getModRefInfo(CS1, CS2);
+
+  if (const MDNode *M1 =
+        CS1.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
+    if (const MDNode *M2 =
+          CS2.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
+      if (!Aliases(M1, M2))
+        return NoModRef;
+
+  return AliasAnalysis::getModRefInfo(CS1, CS2);
+}
diff --git a/contrib/llvm/lib/Analysis/ValueTracking.cpp b/contrib/llvm/lib/Analysis/ValueTracking.cpp
new file mode 100644
index 000000000000..45dcc5e37ecf
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/ValueTracking.cpp
@@ -0,0 +1,2034 @@
+//===- ValueTracking.cpp - Walk computations to compute properties --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains routines that help analyze properties that chains of
+// computations have.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/PatternMatch.h"
+#include <cstring>
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+const unsigned MaxDepth = 6;
+
+/// getBitWidth - Returns the bitwidth of the given scalar or pointer type (if
+/// unknown returns 0).  For vector types, returns the element type's bitwidth.
+static unsigned getBitWidth(Type *Ty, const DataLayout *TD) {
+  if (unsigned BitWidth = Ty->getScalarSizeInBits())
+    return BitWidth;
+  assert(isa<PointerType>(Ty) && "Expected a pointer type!");
+  return TD ? TD->getPointerSizeInBits() : 0;
+}
+
+static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
+                                    APInt &KnownZero, APInt &KnownOne,
+                                    APInt &KnownZero2, APInt &KnownOne2,
+                                    const DataLayout *TD, unsigned Depth) {
+  if (!Add) {
+    if (ConstantInt *CLHS = dyn_cast<ConstantInt>(Op0)) {
+      // We know that the top bits of C-X are clear if X contains less bits
+      // than C (i.e. no wrap-around can happen).  For example, 20-X is
+      // positive if we can prove that X is >= 0 and < 16.
+      if (!CLHS->getValue().isNegative()) {
+        unsigned BitWidth = KnownZero.getBitWidth();
+        unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros();
+        // NLZ can't be BitWidth with no sign bit
+        APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
+        llvm::ComputeMaskedBits(Op1, KnownZero2, KnownOne2, TD, Depth+1);
+
+        // If all of the MaskV bits are known to be zero, then we know the
+        // output top bits are zero, because we now know that the output is
+        // from [0-C].
+        if ((KnownZero2 & MaskV) == MaskV) {
+          unsigned NLZ2 = CLHS->getValue().countLeadingZeros();
+          // Top bits known zero.
+          KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2);
+        }
+      }
+    }
+  }
+
+  unsigned BitWidth = KnownZero.getBitWidth();
+
+  // If one of the operands has trailing zeros, then the bits that the
+  // other operand has in those bit positions will be preserved in the
+  // result. For an add, this works with either operand. For a subtract,
+  // this only works if the known zeros are in the right operand.
+  APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+  llvm::ComputeMaskedBits(Op0, LHSKnownZero, LHSKnownOne, TD, Depth+1);
+  assert((LHSKnownZero & LHSKnownOne) == 0 &&
+         "Bits known to be one AND zero?");
+  unsigned LHSKnownZeroOut = LHSKnownZero.countTrailingOnes();
+
+  llvm::ComputeMaskedBits(Op1, KnownZero2, KnownOne2, TD, Depth+1);
+  assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+  unsigned RHSKnownZeroOut = KnownZero2.countTrailingOnes();
+
+  // Determine which operand has more trailing zeros, and use that
+  // many bits from the other operand.
+  if (LHSKnownZeroOut > RHSKnownZeroOut) {
+    if (Add) {
+      APInt Mask = APInt::getLowBitsSet(BitWidth, LHSKnownZeroOut);
+      KnownZero |= KnownZero2 & Mask;
+      KnownOne  |= KnownOne2 & Mask;
+    } else {
+      // If the known zeros are in the left operand for a subtract,
+      // fall back to the minimum known zeros in both operands.
+      KnownZero |= APInt::getLowBitsSet(BitWidth,
+                                        std::min(LHSKnownZeroOut,
+                                                 RHSKnownZeroOut));
+    }
+  } else if (RHSKnownZeroOut >= LHSKnownZeroOut) {
+    APInt Mask = APInt::getLowBitsSet(BitWidth, RHSKnownZeroOut);
+    KnownZero |= LHSKnownZero & Mask;
+    KnownOne  |= LHSKnownOne & Mask;
+  }
+
+  // Are we still trying to solve for the sign bit?
+  if (!KnownZero.isNegative() && !KnownOne.isNegative()) {
+    if (NSW) {
+      if (Add) {
+        // Adding two positive numbers can't wrap into negative
+        if (LHSKnownZero.isNegative() && KnownZero2.isNegative())
+          KnownZero |= APInt::getSignBit(BitWidth);
+        // and adding two negative numbers can't wrap into positive.
+        else if (LHSKnownOne.isNegative() && KnownOne2.isNegative())
+          KnownOne |= APInt::getSignBit(BitWidth);
+      } else {
+        // Subtracting a negative number from a positive one can't wrap
+        if (LHSKnownZero.isNegative() && KnownOne2.isNegative())
+          KnownZero |= APInt::getSignBit(BitWidth);
+        // neither can subtracting a positive number from a negative one.
+        else if (LHSKnownOne.isNegative() && KnownZero2.isNegative())
+          KnownOne |= APInt::getSignBit(BitWidth);
+      }
+    }
+  }
+}
+
+static void ComputeMaskedBitsMul(Value *Op0, Value *Op1, bool NSW,
+                                 APInt &KnownZero, APInt &KnownOne,
+                                 APInt &KnownZero2, APInt &KnownOne2,
+                                 const DataLayout *TD, unsigned Depth) {
+  unsigned BitWidth = KnownZero.getBitWidth();
+  ComputeMaskedBits(Op1, KnownZero, KnownOne, TD, Depth+1);
+  ComputeMaskedBits(Op0, KnownZero2, KnownOne2, TD, Depth+1);
+  assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+  assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+  bool isKnownNegative = false;
+  bool isKnownNonNegative = false;
+  // If the multiplication is known not to overflow, compute the sign bit.
+  if (NSW) {
+    if (Op0 == Op1) {
+      // The product of a number with itself is non-negative.
+      isKnownNonNegative = true;
+    } else {
+      bool isKnownNonNegativeOp1 = KnownZero.isNegative();
+      bool isKnownNonNegativeOp0 = KnownZero2.isNegative();
+      bool isKnownNegativeOp1 = KnownOne.isNegative();
+      bool isKnownNegativeOp0 = KnownOne2.isNegative();
+      // The product of two numbers with the same sign is non-negative.
+      isKnownNonNegative = (isKnownNegativeOp1 && isKnownNegativeOp0) ||
+        (isKnownNonNegativeOp1 && isKnownNonNegativeOp0);
+      // The product of a negative number and a non-negative number is either
+      // negative or zero.
+      if (!isKnownNonNegative)
+        isKnownNegative = (isKnownNegativeOp1 && isKnownNonNegativeOp0 &&
+                           isKnownNonZero(Op0, TD, Depth)) ||
+                          (isKnownNegativeOp0 && isKnownNonNegativeOp1 &&
+                           isKnownNonZero(Op1, TD, Depth));
+    }
+  }
+
+  // If low bits are zero in either operand, output low known-0 bits.
+  // Also compute a conserative estimate for high known-0 bits.
+  // More trickiness is possible, but this is sufficient for the
+  // interesting case of alignment computation.
+  KnownOne.clearAllBits();
+  unsigned TrailZ = KnownZero.countTrailingOnes() +
+                    KnownZero2.countTrailingOnes();
+  unsigned LeadZ =  std::max(KnownZero.countLeadingOnes() +
+                             KnownZero2.countLeadingOnes(),
+                             BitWidth) - BitWidth;
+
+  TrailZ = std::min(TrailZ, BitWidth);
+  LeadZ = std::min(LeadZ, BitWidth);
+  KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
+              APInt::getHighBitsSet(BitWidth, LeadZ);
+
+  // Only make use of no-wrap flags if we failed to compute the sign bit
+  // directly.  This matters if the multiplication always overflows, in
+  // which case we prefer to follow the result of the direct computation,
+  // though as the program is invoking undefined behaviour we can choose
+  // whatever we like here.
+  if (isKnownNonNegative && !KnownOne.isNegative())
+    KnownZero.setBit(BitWidth - 1);
+  else if (isKnownNegative && !KnownZero.isNegative())
+    KnownOne.setBit(BitWidth - 1);
+}
+
+void llvm::computeMaskedBitsLoad(const MDNode &Ranges, APInt &KnownZero) {
+  unsigned BitWidth = KnownZero.getBitWidth();
+  unsigned NumRanges = Ranges.getNumOperands() / 2;
+  assert(NumRanges >= 1);
+
+  // Use the high end of the ranges to find leading zeros.
+  unsigned MinLeadingZeros = BitWidth;
+  for (unsigned i = 0; i < NumRanges; ++i) {
+    ConstantInt *Lower = cast<ConstantInt>(Ranges.getOperand(2*i + 0));
+    ConstantInt *Upper = cast<ConstantInt>(Ranges.getOperand(2*i + 1));
+    ConstantRange Range(Lower->getValue(), Upper->getValue());
+    if (Range.isWrappedSet())
+      MinLeadingZeros = 0; // -1 has no zeros
+    unsigned LeadingZeros = (Upper->getValue() - 1).countLeadingZeros();
+    MinLeadingZeros = std::min(LeadingZeros, MinLeadingZeros);
+  }
+
+  KnownZero = APInt::getHighBitsSet(BitWidth, MinLeadingZeros);
+}
+/// ComputeMaskedBits - Determine which of the bits are known to be either zero
+/// or one and return them in the KnownZero/KnownOne bit sets.
+///
+/// NOTE: we cannot consider 'undef' to be "IsZero" here.  The problem is that
+/// we cannot optimize based on the assumption that it is zero without changing
+/// it to be an explicit zero.  If we don't change it to zero, other code could
+/// optimized based on the contradictory assumption that it is non-zero.
+/// Because instcombine aggressively folds operations with undef args anyway,
+/// this won't lose us code quality.
+///
+/// This function is defined on values with integer type, values with pointer
+/// type (but only if TD is non-null), and vectors of integers.  In the case
+/// where V is a vector, known zero, and known one values are the
+/// same width as the vector element, and the bit is set only if it is true
+/// for all of the elements in the vector.
+void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
+                             const DataLayout *TD, unsigned Depth) {
+  assert(V && "No Value?");
+  assert(Depth <= MaxDepth && "Limit Search Depth");
+  unsigned BitWidth = KnownZero.getBitWidth();
+
+  assert((V->getType()->isIntOrIntVectorTy() ||
+          V->getType()->getScalarType()->isPointerTy()) &&
+         "Not integer or pointer type!");
+  assert((!TD ||
+          TD->getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth) &&
+         (!V->getType()->isIntOrIntVectorTy() ||
+          V->getType()->getScalarSizeInBits() == BitWidth) &&
+         KnownZero.getBitWidth() == BitWidth &&
+         KnownOne.getBitWidth() == BitWidth &&
+         "V, Mask, KnownOne and KnownZero should have same BitWidth");
+
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    // We know all of the bits for a constant!
+    KnownOne = CI->getValue();
+    KnownZero = ~KnownOne;
+    return;
+  }
+  // Null and aggregate-zero are all-zeros.
+  if (isa<ConstantPointerNull>(V) ||
+      isa<ConstantAggregateZero>(V)) {
+    KnownOne.clearAllBits();
+    KnownZero = APInt::getAllOnesValue(BitWidth);
+    return;
+  }
+  // Handle a constant vector by taking the intersection of the known bits of
+  // each element.  There is no real need to handle ConstantVector here, because
+  // we don't handle undef in any particularly useful way.
+  if (ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V)) {
+    // We know that CDS must be a vector of integers. Take the intersection of
+    // each element.
+    KnownZero.setAllBits(); KnownOne.setAllBits();
+    APInt Elt(KnownZero.getBitWidth(), 0);
+    for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+      Elt = CDS->getElementAsInteger(i);
+      KnownZero &= ~Elt;
+      KnownOne &= Elt;
+    }
+    return;
+  }
+
+  // The address of an aligned GlobalValue has trailing zeros.
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    unsigned Align = GV->getAlignment();
+    if (Align == 0 && TD) {
+      if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) {
+        Type *ObjectType = GVar->getType()->getElementType();
+        if (ObjectType->isSized()) {
+          // If the object is defined in the current Module, we'll be giving
+          // it the preferred alignment. Otherwise, we have to assume that it
+          // may only have the minimum ABI alignment.
+          if (!GVar->isDeclaration() && !GVar->isWeakForLinker())
+            Align = TD->getPreferredAlignment(GVar);
+          else
+            Align = TD->getABITypeAlignment(ObjectType);
+        }
+      }
+    }
+    if (Align > 0)
+      KnownZero = APInt::getLowBitsSet(BitWidth,
+                                       CountTrailingZeros_32(Align));
+    else
+      KnownZero.clearAllBits();
+    KnownOne.clearAllBits();
+    return;
+  }
+  // A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has
+  // the bits of its aliasee.
+  if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+    if (GA->mayBeOverridden()) {
+      KnownZero.clearAllBits(); KnownOne.clearAllBits();
+    } else {
+      ComputeMaskedBits(GA->getAliasee(), KnownZero, KnownOne, TD, Depth+1);
+    }
+    return;
+  }
+
+  if (Argument *A = dyn_cast<Argument>(V)) {
+    unsigned Align = 0;
+
+    if (A->hasByValAttr()) {
+      // Get alignment information off byval arguments if specified in the IR.
+      Align = A->getParamAlignment();
+    } else if (TD && A->hasStructRetAttr()) {
+      // An sret parameter has at least the ABI alignment of the return type.
+      Type *EltTy = cast<PointerType>(A->getType())->getElementType();
+      if (EltTy->isSized())
+        Align = TD->getABITypeAlignment(EltTy);
+    }
+
+    if (Align)
+      KnownZero = APInt::getLowBitsSet(BitWidth, CountTrailingZeros_32(Align));
+    return;
+  }
+
+  // Start out not knowing anything.
+  KnownZero.clearAllBits(); KnownOne.clearAllBits();
+
+  if (Depth == MaxDepth)
+    return;  // Limit search depth.
+
+  Operator *I = dyn_cast<Operator>(V);
+  if (!I) return;
+
+  APInt KnownZero2(KnownZero), KnownOne2(KnownOne);
+  switch (I->getOpcode()) {
+  default: break;
+  case Instruction::Load:
+    if (MDNode *MD = cast<LoadInst>(I)->getMetadata(LLVMContext::MD_range))
+      computeMaskedBitsLoad(*MD, KnownZero);
+    return;
+  case Instruction::And: {
+    // If either the LHS or the RHS are Zero, the result is zero.
+    ComputeMaskedBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1);
+    ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Output known-1 bits are only known if set in both the LHS & RHS.
+    KnownOne &= KnownOne2;
+    // Output known-0 are known to be clear if zero in either the LHS | RHS.
+    KnownZero |= KnownZero2;
+    return;
+  }
+  case Instruction::Or: {
+    ComputeMaskedBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1);
+    ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Output known-0 bits are only known if clear in both the LHS & RHS.
+    KnownZero &= KnownZero2;
+    // Output known-1 are known to be set if set in either the LHS | RHS.
+    KnownOne |= KnownOne2;
+    return;
+  }
+  case Instruction::Xor: {
+    ComputeMaskedBits(I->getOperand(1), KnownZero, KnownOne, TD, Depth+1);
+    ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Output known-0 bits are known if clear or set in both the LHS & RHS.
+    APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+    // Output known-1 are known to be set if set in only one of the LHS, RHS.
+    KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+    KnownZero = KnownZeroOut;
+    return;
+  }
+  case Instruction::Mul: {
+    bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
+    ComputeMaskedBitsMul(I->getOperand(0), I->getOperand(1), NSW,
+                         KnownZero, KnownOne, KnownZero2, KnownOne2, TD, Depth);
+    break;
+  }
+  case Instruction::UDiv: {
+    // For the purposes of computing leading zeros we can conservatively
+    // treat a udiv as a logical right shift by the power of 2 known to
+    // be less than the denominator.
+    ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+    unsigned LeadZ = KnownZero2.countLeadingOnes();
+
+    KnownOne2.clearAllBits();
+    KnownZero2.clearAllBits();
+    ComputeMaskedBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1);
+    unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
+    if (RHSUnknownLeadingOnes != BitWidth)
+      LeadZ = std::min(BitWidth,
+                       LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+
+    KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ);
+    return;
+  }
+  case Instruction::Select:
+    ComputeMaskedBits(I->getOperand(2), KnownZero, KnownOne, TD, Depth+1);
+    ComputeMaskedBits(I->getOperand(1), KnownZero2, KnownOne2, TD,
+                      Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    return;
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+    return; // Can't work with floating point.
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+    // We can't handle these if we don't know the pointer size.
+    if (!TD) return;
+    // FALL THROUGH and handle them the same as zext/trunc.
+  case Instruction::ZExt:
+  case Instruction::Trunc: {
+    Type *SrcTy = I->getOperand(0)->getType();
+
+    unsigned SrcBitWidth;
+    // Note that we handle pointer operands here because of inttoptr/ptrtoint
+    // which fall through here.
+    if(TD) {
+      SrcBitWidth = TD->getTypeSizeInBits(SrcTy->getScalarType());
+    } else {
+      SrcBitWidth = SrcTy->getScalarSizeInBits();
+      if (!SrcBitWidth) return;
+    }
+
+    assert(SrcBitWidth && "SrcBitWidth can't be zero");
+    KnownZero = KnownZero.zextOrTrunc(SrcBitWidth);
+    KnownOne = KnownOne.zextOrTrunc(SrcBitWidth);
+    ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+    KnownZero = KnownZero.zextOrTrunc(BitWidth);
+    KnownOne = KnownOne.zextOrTrunc(BitWidth);
+    // Any top bits are known to be zero.
+    if (BitWidth > SrcBitWidth)
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+    return;
+  }
+  case Instruction::BitCast: {
+    Type *SrcTy = I->getOperand(0)->getType();
+    if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
+        // TODO: For now, not handling conversions like:
+        // (bitcast i64 %x to <2 x i32>)
+        !I->getType()->isVectorTy()) {
+      ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+      return;
+    }
+    break;
+  }
+  case Instruction::SExt: {
+    // Compute the bits in the result that are not present in the input.
+    unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
+
+    KnownZero = KnownZero.trunc(SrcBitWidth);
+    KnownOne = KnownOne.trunc(SrcBitWidth);
+    ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+
+    // If the sign bit of the input is known set or clear, then we know the
+    // top bits of the result.
+    if (KnownZero[SrcBitWidth-1])             // Input sign bit known zero
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+    else if (KnownOne[SrcBitWidth-1])           // Input sign bit known set
+      KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+    return;
+  }
+  case Instruction::Shl:
+    // (shl X, C1) & C2 == 0   iff   (X & C2 >>u C1) == 0
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+      ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero <<= ShiftAmt;
+      KnownOne  <<= ShiftAmt;
+      KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt); // low bits known 0
+      return;
+    }
+    break;
+  case Instruction::LShr:
+    // (ushr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      // Compute the new bits that are at the top now.
+      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+
+      // Unsigned shift right.
+      ComputeMaskedBits(I->getOperand(0), KnownZero,KnownOne, TD, Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
+      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
+      // high bits known zero.
+      KnownZero |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+      return;
+    }
+    break;
+  case Instruction::AShr:
+    // (ashr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      // Compute the new bits that are at the top now.
+      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+
+      // Signed shift right.
+      ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
+      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
+
+      APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+      if (KnownZero[BitWidth-ShiftAmt-1])    // New bits are known zero.
+        KnownZero |= HighBits;
+      else if (KnownOne[BitWidth-ShiftAmt-1])  // New bits are known one.
+        KnownOne |= HighBits;
+      return;
+    }
+    break;
+  case Instruction::Sub: {
+    bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
+    ComputeMaskedBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW,
+                            KnownZero, KnownOne, KnownZero2, KnownOne2, TD,
+                            Depth);
+    break;
+  }
+  case Instruction::Add: {
+    bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
+    ComputeMaskedBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW,
+                            KnownZero, KnownOne, KnownZero2, KnownOne2, TD,
+                            Depth);
+    break;
+  }
+  case Instruction::SRem:
+    if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      APInt RA = Rem->getValue().abs();
+      if (RA.isPowerOf2()) {
+        APInt LowBits = RA - 1;
+        ComputeMaskedBits(I->getOperand(0), KnownZero2, KnownOne2, TD, Depth+1);
+
+        // The low bits of the first operand are unchanged by the srem.
+        KnownZero = KnownZero2 & LowBits;
+        KnownOne = KnownOne2 & LowBits;
+
+        // If the first operand is non-negative or has all low bits zero, then
+        // the upper bits are all zero.
+        if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
+          KnownZero |= ~LowBits;
+
+        // If the first operand is negative and not all low bits are zero, then
+        // the upper bits are all one.
+        if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0))
+          KnownOne |= ~LowBits;
+
+        assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      }
+    }
+
+    // The sign bit is the LHS's sign bit, except when the result of the
+    // remainder is zero.
+    if (KnownZero.isNonNegative()) {
+      APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+      ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, TD,
+                        Depth+1);
+      // If it's known zero, our sign bit is also zero.
+      if (LHSKnownZero.isNegative())
+        KnownZero.setBit(BitWidth - 1);
+    }
+
+    break;
+  case Instruction::URem: {
+    if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      APInt RA = Rem->getValue();
+      if (RA.isPowerOf2()) {
+        APInt LowBits = (RA - 1);
+        ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD,
+                          Depth+1);
+        assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+        KnownZero |= ~LowBits;
+        KnownOne &= LowBits;
+        break;
+      }
+    }
+
+    // Since the result is less than or equal to either operand, any leading
+    // zero bits in either operand must also exist in the result.
+    ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+    ComputeMaskedBits(I->getOperand(1), KnownZero2, KnownOne2, TD, Depth+1);
+
+    unsigned Leaders = std::max(KnownZero.countLeadingOnes(),
+                                KnownZero2.countLeadingOnes());
+    KnownOne.clearAllBits();
+    KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
+    break;
+  }
+
+  case Instruction::Alloca: {
+    AllocaInst *AI = cast<AllocaInst>(V);
+    unsigned Align = AI->getAlignment();
+    if (Align == 0 && TD)
+      Align = TD->getABITypeAlignment(AI->getType()->getElementType());
+
+    if (Align > 0)
+      KnownZero = APInt::getLowBitsSet(BitWidth, CountTrailingZeros_32(Align));
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    // Analyze all of the subscripts of this getelementptr instruction
+    // to determine if we can prove known low zero bits.
+    APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0);
+    ComputeMaskedBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, TD,
+                      Depth+1);
+    unsigned TrailZ = LocalKnownZero.countTrailingOnes();
+
+    gep_type_iterator GTI = gep_type_begin(I);
+    for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
+      Value *Index = I->getOperand(i);
+      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+        // Handle struct member offset arithmetic.
+        if (!TD) return;
+        const StructLayout *SL = TD->getStructLayout(STy);
+        unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+        uint64_t Offset = SL->getElementOffset(Idx);
+        TrailZ = std::min(TrailZ,
+                          CountTrailingZeros_64(Offset));
+      } else {
+        // Handle array index arithmetic.
+        Type *IndexedTy = GTI.getIndexedType();
+        if (!IndexedTy->isSized()) return;
+        unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits();
+        uint64_t TypeSize = TD ? TD->getTypeAllocSize(IndexedTy) : 1;
+        LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0);
+        ComputeMaskedBits(Index, LocalKnownZero, LocalKnownOne, TD, Depth+1);
+        TrailZ = std::min(TrailZ,
+                          unsigned(CountTrailingZeros_64(TypeSize) +
+                                   LocalKnownZero.countTrailingOnes()));
+      }
+    }
+
+    KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ);
+    break;
+  }
+  case Instruction::PHI: {
+    PHINode *P = cast<PHINode>(I);
+    // Handle the case of a simple two-predecessor recurrence PHI.
+    // There's a lot more that could theoretically be done here, but
+    // this is sufficient to catch some interesting cases.
+    if (P->getNumIncomingValues() == 2) {
+      for (unsigned i = 0; i != 2; ++i) {
+        Value *L = P->getIncomingValue(i);
+        Value *R = P->getIncomingValue(!i);
+        Operator *LU = dyn_cast<Operator>(L);
+        if (!LU)
+          continue;
+        unsigned Opcode = LU->getOpcode();
+        // Check for operations that have the property that if
+        // both their operands have low zero bits, the result
+        // will have low zero bits.
+        if (Opcode == Instruction::Add ||
+            Opcode == Instruction::Sub ||
+            Opcode == Instruction::And ||
+            Opcode == Instruction::Or ||
+            Opcode == Instruction::Mul) {
+          Value *LL = LU->getOperand(0);
+          Value *LR = LU->getOperand(1);
+          // Find a recurrence.
+          if (LL == I)
+            L = LR;
+          else if (LR == I)
+            L = LL;
+          else
+            break;
+          // Ok, we have a PHI of the form L op= R. Check for low
+          // zero bits.
+          ComputeMaskedBits(R, KnownZero2, KnownOne2, TD, Depth+1);
+
+          // We need to take the minimum number of known bits
+          APInt KnownZero3(KnownZero), KnownOne3(KnownOne);
+          ComputeMaskedBits(L, KnownZero3, KnownOne3, TD, Depth+1);
+
+          KnownZero = APInt::getLowBitsSet(BitWidth,
+                                           std::min(KnownZero2.countTrailingOnes(),
+                                                    KnownZero3.countTrailingOnes()));
+          break;
+        }
+      }
+    }
+
+    // Unreachable blocks may have zero-operand PHI nodes.
+    if (P->getNumIncomingValues() == 0)
+      return;
+
+    // Otherwise take the unions of the known bit sets of the operands,
+    // taking conservative care to avoid excessive recursion.
+    if (Depth < MaxDepth - 1 && !KnownZero && !KnownOne) {
+      // Skip if every incoming value references to ourself.
+      if (dyn_cast_or_null<UndefValue>(P->hasConstantValue()))
+        break;
+
+      KnownZero = APInt::getAllOnesValue(BitWidth);
+      KnownOne = APInt::getAllOnesValue(BitWidth);
+      for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
+        // Skip direct self references.
+        if (P->getIncomingValue(i) == P) continue;
+
+        KnownZero2 = APInt(BitWidth, 0);
+        KnownOne2 = APInt(BitWidth, 0);
+        // Recurse, but cap the recursion to one level, because we don't
+        // want to waste time spinning around in loops.
+        ComputeMaskedBits(P->getIncomingValue(i), KnownZero2, KnownOne2, TD,
+                          MaxDepth-1);
+        KnownZero &= KnownZero2;
+        KnownOne &= KnownOne2;
+        // If all bits have been ruled out, there's no need to check
+        // more operands.
+        if (!KnownZero && !KnownOne)
+          break;
+      }
+    }
+    break;
+  }
+  case Instruction::Call:
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::ctlz:
+      case Intrinsic::cttz: {
+        unsigned LowBits = Log2_32(BitWidth)+1;
+        // If this call is undefined for 0, the result will be less than 2^n.
+        if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
+          LowBits -= 1;
+        KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+        break;
+      }
+      case Intrinsic::ctpop: {
+        unsigned LowBits = Log2_32(BitWidth)+1;
+        KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+        break;
+      }
+      case Intrinsic::x86_sse42_crc32_64_8:
+      case Intrinsic::x86_sse42_crc32_64_64:
+        KnownZero = APInt::getHighBitsSet(64, 32);
+        break;
+      }
+    }
+    break;
+  case Instruction::ExtractValue:
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) {
+      ExtractValueInst *EVI = cast<ExtractValueInst>(I);
+      if (EVI->getNumIndices() != 1) break;
+      if (EVI->getIndices()[0] == 0) {
+        switch (II->getIntrinsicID()) {
+        default: break;
+        case Intrinsic::uadd_with_overflow:
+        case Intrinsic::sadd_with_overflow:
+          ComputeMaskedBitsAddSub(true, II->getArgOperand(0),
+                                  II->getArgOperand(1), false, KnownZero,
+                                  KnownOne, KnownZero2, KnownOne2, TD, Depth);
+          break;
+        case Intrinsic::usub_with_overflow:
+        case Intrinsic::ssub_with_overflow:
+          ComputeMaskedBitsAddSub(false, II->getArgOperand(0),
+                                  II->getArgOperand(1), false, KnownZero,
+                                  KnownOne, KnownZero2, KnownOne2, TD, Depth);
+          break;
+        case Intrinsic::umul_with_overflow:
+        case Intrinsic::smul_with_overflow:
+          ComputeMaskedBitsMul(II->getArgOperand(0), II->getArgOperand(1),
+                               false, KnownZero, KnownOne,
+                               KnownZero2, KnownOne2, TD, Depth);
+          break;
+        }
+      }
+    }
+  }
+}
+
+/// ComputeSignBit - Determine whether the sign bit is known to be zero or
+/// one.  Convenience wrapper around ComputeMaskedBits.
+void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
+                          const DataLayout *TD, unsigned Depth) {
+  unsigned BitWidth = getBitWidth(V->getType(), TD);
+  if (!BitWidth) {
+    KnownZero = false;
+    KnownOne = false;
+    return;
+  }
+  APInt ZeroBits(BitWidth, 0);
+  APInt OneBits(BitWidth, 0);
+  ComputeMaskedBits(V, ZeroBits, OneBits, TD, Depth);
+  KnownOne = OneBits[BitWidth - 1];
+  KnownZero = ZeroBits[BitWidth - 1];
+}
+
+/// isKnownToBeAPowerOfTwo - Return true if the given value is known to have exactly one
+/// bit set when defined. For vectors return true if every element is known to
+/// be a power of two when defined.  Supports values with integer or pointer
+/// types and vectors of integers.
+bool llvm::isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth) {
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    if (C->isNullValue())
+      return OrZero;
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(C))
+      return CI->getValue().isPowerOf2();
+    // TODO: Handle vector constants.
+  }
+
+  // 1 << X is clearly a power of two if the one is not shifted off the end.  If
+  // it is shifted off the end then the result is undefined.
+  if (match(V, m_Shl(m_One(), m_Value())))
+    return true;
+
+  // (signbit) >>l X is clearly a power of two if the one is not shifted off the
+  // bottom.  If it is shifted off the bottom then the result is undefined.
+  if (match(V, m_LShr(m_SignBit(), m_Value())))
+    return true;
+
+  // The remaining tests are all recursive, so bail out if we hit the limit.
+  if (Depth++ == MaxDepth)
+    return false;
+
+  Value *X = 0, *Y = 0;
+  // A shift of a power of two is a power of two or zero.
+  if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) ||
+                 match(V, m_Shr(m_Value(X), m_Value()))))
+    return isKnownToBeAPowerOfTwo(X, /*OrZero*/true, Depth);
+
+  if (ZExtInst *ZI = dyn_cast<ZExtInst>(V))
+    return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth);
+
+  if (SelectInst *SI = dyn_cast<SelectInst>(V))
+    return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth) &&
+      isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth);
+
+  if (OrZero && match(V, m_And(m_Value(X), m_Value(Y)))) {
+    // A power of two and'd with anything is a power of two or zero.
+    if (isKnownToBeAPowerOfTwo(X, /*OrZero*/true, Depth) ||
+        isKnownToBeAPowerOfTwo(Y, /*OrZero*/true, Depth))
+      return true;
+    // X & (-X) is always a power of two or zero.
+    if (match(X, m_Neg(m_Specific(Y))) || match(Y, m_Neg(m_Specific(X))))
+      return true;
+    return false;
+  }
+
+  // An exact divide or right shift can only shift off zero bits, so the result
+  // is a power of two only if the first operand is a power of two and not
+  // copying a sign bit (sdiv int_min, 2).
+  if (match(V, m_Exact(m_LShr(m_Value(), m_Value()))) ||
+      match(V, m_Exact(m_UDiv(m_Value(), m_Value())))) {
+    return isKnownToBeAPowerOfTwo(cast<Operator>(V)->getOperand(0), OrZero, Depth);
+  }
+
+  return false;
+}
+
+/// \brief Test whether a GEP's result is known to be non-null.
+///
+/// Uses properties inherent in a GEP to try to determine whether it is known
+/// to be non-null.
+///
+/// Currently this routine does not support vector GEPs.
+static bool isGEPKnownNonNull(GEPOperator *GEP, const DataLayout *DL,
+                              unsigned Depth) {
+  if (!GEP->isInBounds() || GEP->getPointerAddressSpace() != 0)
+    return false;
+
+  // FIXME: Support vector-GEPs.
+  assert(GEP->getType()->isPointerTy() && "We only support plain pointer GEP");
+
+  // If the base pointer is non-null, we cannot walk to a null address with an
+  // inbounds GEP in address space zero.
+  if (isKnownNonZero(GEP->getPointerOperand(), DL, Depth))
+    return true;
+
+  // Past this, if we don't have DataLayout, we can't do much.
+  if (!DL)
+    return false;
+
+  // Walk the GEP operands and see if any operand introduces a non-zero offset.
+  // If so, then the GEP cannot produce a null pointer, as doing so would
+  // inherently violate the inbounds contract within address space zero.
+  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
+       GTI != GTE; ++GTI) {
+    // Struct types are easy -- they must always be indexed by a constant.
+    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+      ConstantInt *OpC = cast<ConstantInt>(GTI.getOperand());
+      unsigned ElementIdx = OpC->getZExtValue();
+      const StructLayout *SL = DL->getStructLayout(STy);
+      uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
+      if (ElementOffset > 0)
+        return true;
+      continue;
+    }
+
+    // If we have a zero-sized type, the index doesn't matter. Keep looping.
+    if (DL->getTypeAllocSize(GTI.getIndexedType()) == 0)
+      continue;
+
+    // Fast path the constant operand case both for efficiency and so we don't
+    // increment Depth when just zipping down an all-constant GEP.
+    if (ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand())) {
+      if (!OpC->isZero())
+        return true;
+      continue;
+    }
+
+    // We post-increment Depth here because while isKnownNonZero increments it
+    // as well, when we pop back up that increment won't persist. We don't want
+    // to recurse 10k times just because we have 10k GEP operands. We don't
+    // bail completely out because we want to handle constant GEPs regardless
+    // of depth.
+    if (Depth++ >= MaxDepth)
+      continue;
+
+    if (isKnownNonZero(GTI.getOperand(), DL, Depth))
+      return true;
+  }
+
+  return false;
+}
+
+/// isKnownNonZero - Return true if the given value is known to be non-zero
+/// when defined.  For vectors return true if every element is known to be
+/// non-zero when defined.  Supports values with integer or pointer type and
+/// vectors of integers.
+bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) {
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    if (C->isNullValue())
+      return false;
+    if (isa<ConstantInt>(C))
+      // Must be non-zero due to null test above.
+      return true;
+    // TODO: Handle vectors
+    return false;
+  }
+
+  // The remaining tests are all recursive, so bail out if we hit the limit.
+  if (Depth++ >= MaxDepth)
+    return false;
+
+  // Check for pointer simplifications.
+  if (V->getType()->isPointerTy()) {
+    if (isKnownNonNull(V))
+      return true; 
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
+      if (isGEPKnownNonNull(GEP, TD, Depth))
+        return true;
+  }
+
+  unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), TD);
+
+  // X | Y != 0 if X != 0 or Y != 0.
+  Value *X = 0, *Y = 0;
+  if (match(V, m_Or(m_Value(X), m_Value(Y))))
+    return isKnownNonZero(X, TD, Depth) || isKnownNonZero(Y, TD, Depth);
+
+  // ext X != 0 if X != 0.
+  if (isa<SExtInst>(V) || isa<ZExtInst>(V))
+    return isKnownNonZero(cast<Instruction>(V)->getOperand(0), TD, Depth);
+
+  // shl X, Y != 0 if X is odd.  Note that the value of the shift is undefined
+  // if the lowest bit is shifted off the end.
+  if (BitWidth && match(V, m_Shl(m_Value(X), m_Value(Y)))) {
+    // shl nuw can't remove any non-zero bits.
+    OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(V);
+    if (BO->hasNoUnsignedWrap())
+      return isKnownNonZero(X, TD, Depth);
+
+    APInt KnownZero(BitWidth, 0);
+    APInt KnownOne(BitWidth, 0);
+    ComputeMaskedBits(X, KnownZero, KnownOne, TD, Depth);
+    if (KnownOne[0])
+      return true;
+  }
+  // shr X, Y != 0 if X is negative.  Note that the value of the shift is not
+  // defined if the sign bit is shifted off the end.
+  else if (match(V, m_Shr(m_Value(X), m_Value(Y)))) {
+    // shr exact can only shift out zero bits.
+    PossiblyExactOperator *BO = cast<PossiblyExactOperator>(V);
+    if (BO->isExact())
+      return isKnownNonZero(X, TD, Depth);
+
+    bool XKnownNonNegative, XKnownNegative;
+    ComputeSignBit(X, XKnownNonNegative, XKnownNegative, TD, Depth);
+    if (XKnownNegative)
+      return true;
+  }
+  // div exact can only produce a zero if the dividend is zero.
+  else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) {
+    return isKnownNonZero(X, TD, Depth);
+  }
+  // X + Y.
+  else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
+    bool XKnownNonNegative, XKnownNegative;
+    bool YKnownNonNegative, YKnownNegative;
+    ComputeSignBit(X, XKnownNonNegative, XKnownNegative, TD, Depth);
+    ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, TD, Depth);
+
+    // If X and Y are both non-negative (as signed values) then their sum is not
+    // zero unless both X and Y are zero.
+    if (XKnownNonNegative && YKnownNonNegative)
+      if (isKnownNonZero(X, TD, Depth) || isKnownNonZero(Y, TD, Depth))
+        return true;
+
+    // If X and Y are both negative (as signed values) then their sum is not
+    // zero unless both X and Y equal INT_MIN.
+    if (BitWidth && XKnownNegative && YKnownNegative) {
+      APInt KnownZero(BitWidth, 0);
+      APInt KnownOne(BitWidth, 0);
+      APInt Mask = APInt::getSignedMaxValue(BitWidth);
+      // The sign bit of X is set.  If some other bit is set then X is not equal
+      // to INT_MIN.
+      ComputeMaskedBits(X, KnownZero, KnownOne, TD, Depth);
+      if ((KnownOne & Mask) != 0)
+        return true;
+      // The sign bit of Y is set.  If some other bit is set then Y is not equal
+      // to INT_MIN.
+      ComputeMaskedBits(Y, KnownZero, KnownOne, TD, Depth);
+      if ((KnownOne & Mask) != 0)
+        return true;
+    }
+
+    // The sum of a non-negative number and a power of two is not zero.
+    if (XKnownNonNegative && isKnownToBeAPowerOfTwo(Y, /*OrZero*/false, Depth))
+      return true;
+    if (YKnownNonNegative && isKnownToBeAPowerOfTwo(X, /*OrZero*/false, Depth))
+      return true;
+  }
+  // X * Y.
+  else if (match(V, m_Mul(m_Value(X), m_Value(Y)))) {
+    OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(V);
+    // If X and Y are non-zero then so is X * Y as long as the multiplication
+    // does not overflow.
+    if ((BO->hasNoSignedWrap() || BO->hasNoUnsignedWrap()) &&
+        isKnownNonZero(X, TD, Depth) && isKnownNonZero(Y, TD, Depth))
+      return true;
+  }
+  // (C ? X : Y) != 0 if X != 0 and Y != 0.
+  else if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+    if (isKnownNonZero(SI->getTrueValue(), TD, Depth) &&
+        isKnownNonZero(SI->getFalseValue(), TD, Depth))
+      return true;
+  }
+
+  if (!BitWidth) return false;
+  APInt KnownZero(BitWidth, 0);
+  APInt KnownOne(BitWidth, 0);
+  ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth);
+  return KnownOne != 0;
+}
+
+/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero.  We use
+/// this predicate to simplify operations downstream.  Mask is known to be zero
+/// for bits that V cannot have.
+///
+/// This function is defined on values with integer type, values with pointer
+/// type (but only if TD is non-null), and vectors of integers.  In the case
+/// where V is a vector, the mask, known zero, and known one values are the
+/// same width as the vector element, and the bit is set only if it is true
+/// for all of the elements in the vector.
+bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask,
+                             const DataLayout *TD, unsigned Depth) {
+  APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0);
+  ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth);
+  assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+  return (KnownZero & Mask) == Mask;
+}
+
+
+
+/// ComputeNumSignBits - Return the number of times the sign bit of the
+/// register is replicated into the other bits.  We know that at least 1 bit
+/// is always equal to the sign bit (itself), but other cases can give us
+/// information.  For example, immediately after an "ashr X, 2", we know that
+/// the top 3 bits are all equal to each other, so we return 3.
+///
+/// 'Op' must have a scalar integer type.
+///
+unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD,
+                                  unsigned Depth) {
+  assert((TD || V->getType()->isIntOrIntVectorTy()) &&
+         "ComputeNumSignBits requires a DataLayout object to operate "
+         "on non-integer values!");
+  Type *Ty = V->getType();
+  unsigned TyBits = TD ? TD->getTypeSizeInBits(V->getType()->getScalarType()) :
+                         Ty->getScalarSizeInBits();
+  unsigned Tmp, Tmp2;
+  unsigned FirstAnswer = 1;
+
+  // Note that ConstantInt is handled by the general ComputeMaskedBits case
+  // below.
+
+  if (Depth == 6)
+    return 1;  // Limit search depth.
+
+  Operator *U = dyn_cast<Operator>(V);
+  switch (Operator::getOpcode(V)) {
+  default: break;
+  case Instruction::SExt:
+    Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits();
+    return ComputeNumSignBits(U->getOperand(0), TD, Depth+1) + Tmp;
+
+  case Instruction::AShr: {
+    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+    // ashr X, C   -> adds C sign bits.  Vectors too.
+    const APInt *ShAmt;
+    if (match(U->getOperand(1), m_APInt(ShAmt))) {
+      Tmp += ShAmt->getZExtValue();
+      if (Tmp > TyBits) Tmp = TyBits;
+    }
+    return Tmp;
+  }
+  case Instruction::Shl: {
+    const APInt *ShAmt;
+    if (match(U->getOperand(1), m_APInt(ShAmt))) {
+      // shl destroys sign bits.
+      Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+      Tmp2 = ShAmt->getZExtValue();
+      if (Tmp2 >= TyBits ||      // Bad shift.
+          Tmp2 >= Tmp) break;    // Shifted all sign bits out.
+      return Tmp - Tmp2;
+    }
+    break;
+  }
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:    // NOT is handled here.
+    // Logical binary ops preserve the number of sign bits at the worst.
+    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+    if (Tmp != 1) {
+      Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+      FirstAnswer = std::min(Tmp, Tmp2);
+      // We computed what we know about the sign bits as our first
+      // answer. Now proceed to the generic code that uses
+      // ComputeMaskedBits, and pick whichever answer is better.
+    }
+    break;
+
+  case Instruction::Select:
+    Tmp = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+    if (Tmp == 1) return 1;  // Early out.
+    Tmp2 = ComputeNumSignBits(U->getOperand(2), TD, Depth+1);
+    return std::min(Tmp, Tmp2);
+
+  case Instruction::Add:
+    // Add can have at most one carry bit.  Thus we know that the output
+    // is, at worst, one more bit than the inputs.
+    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+    if (Tmp == 1) return 1;  // Early out.
+
+    // Special case decrementing a value (ADD X, -1):
+    if (ConstantInt *CRHS = dyn_cast<ConstantInt>(U->getOperand(1)))
+      if (CRHS->isAllOnesValue()) {
+        APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
+        ComputeMaskedBits(U->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
+
+        // If the input is known to be 0 or 1, the output is 0/-1, which is all
+        // sign bits set.
+        if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue())
+          return TyBits;
+
+        // If we are subtracting one from a positive number, there is no carry
+        // out of the result.
+        if (KnownZero.isNegative())
+          return Tmp;
+      }
+
+    Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+    if (Tmp2 == 1) return 1;
+    return std::min(Tmp, Tmp2)-1;
+
+  case Instruction::Sub:
+    Tmp2 = ComputeNumSignBits(U->getOperand(1), TD, Depth+1);
+    if (Tmp2 == 1) return 1;
+
+    // Handle NEG.
+    if (ConstantInt *CLHS = dyn_cast<ConstantInt>(U->getOperand(0)))
+      if (CLHS->isNullValue()) {
+        APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
+        ComputeMaskedBits(U->getOperand(1), KnownZero, KnownOne, TD, Depth+1);
+        // If the input is known to be 0 or 1, the output is 0/-1, which is all
+        // sign bits set.
+        if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue())
+          return TyBits;
+
+        // If the input is known to be positive (the sign bit is known clear),
+        // the output of the NEG has the same number of sign bits as the input.
+        if (KnownZero.isNegative())
+          return Tmp2;
+
+        // Otherwise, we treat this like a SUB.
+      }
+
+    // Sub can have at most one carry bit.  Thus we know that the output
+    // is, at worst, one more bit than the inputs.
+    Tmp = ComputeNumSignBits(U->getOperand(0), TD, Depth+1);
+    if (Tmp == 1) return 1;  // Early out.
+    return std::min(Tmp, Tmp2)-1;
+
+  case Instruction::PHI: {
+    PHINode *PN = cast<PHINode>(U);
+    // Don't analyze large in-degree PHIs.
+    if (PN->getNumIncomingValues() > 4) break;
+
+    // Take the minimum of all incoming values.  This can't infinitely loop
+    // because of our depth threshold.
+    Tmp = ComputeNumSignBits(PN->getIncomingValue(0), TD, Depth+1);
+    for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
+      if (Tmp == 1) return Tmp;
+      Tmp = std::min(Tmp,
+                     ComputeNumSignBits(PN->getIncomingValue(i), TD, Depth+1));
+    }
+    return Tmp;
+  }
+
+  case Instruction::Trunc:
+    // FIXME: it's tricky to do anything useful for this, but it is an important
+    // case for targets like X86.
+    break;
+  }
+
+  // Finally, if we can prove that the top bits of the result are 0's or 1's,
+  // use this information.
+  APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
+  APInt Mask;
+  ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth);
+
+  if (KnownZero.isNegative()) {        // sign bit is 0
+    Mask = KnownZero;
+  } else if (KnownOne.isNegative()) {  // sign bit is 1;
+    Mask = KnownOne;
+  } else {
+    // Nothing known.
+    return FirstAnswer;
+  }
+
+  // Okay, we know that the sign bit in Mask is set.  Use CLZ to determine
+  // the number of identical bits in the top of the input value.
+  Mask = ~Mask;
+  Mask <<= Mask.getBitWidth()-TyBits;
+  // Return # leading zeros.  We use 'min' here in case Val was zero before
+  // shifting.  We don't want to return '64' as for an i32 "0".
+  return std::max(FirstAnswer, std::min(TyBits, Mask.countLeadingZeros()));
+}
+
+/// ComputeMultiple - This function computes the integer multiple of Base that
+/// equals V.  If successful, it returns true and returns the multiple in
+/// Multiple.  If unsuccessful, it returns false. It looks
+/// through SExt instructions only if LookThroughSExt is true.
+bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple,
+                           bool LookThroughSExt, unsigned Depth) {
+  const unsigned MaxDepth = 6;
+
+  assert(V && "No Value?");
+  assert(Depth <= MaxDepth && "Limit Search Depth");
+  assert(V->getType()->isIntegerTy() && "Not integer or pointer type!");
+
+  Type *T = V->getType();
+
+  ConstantInt *CI = dyn_cast<ConstantInt>(V);
+
+  if (Base == 0)
+    return false;
+
+  if (Base == 1) {
+    Multiple = V;
+    return true;
+  }
+
+  ConstantExpr *CO = dyn_cast<ConstantExpr>(V);
+  Constant *BaseVal = ConstantInt::get(T, Base);
+  if (CO && CO == BaseVal) {
+    // Multiple is 1.
+    Multiple = ConstantInt::get(T, 1);
+    return true;
+  }
+
+  if (CI && CI->getZExtValue() % Base == 0) {
+    Multiple = ConstantInt::get(T, CI->getZExtValue() / Base);
+    return true;
+  }
+
+  if (Depth == MaxDepth) return false;  // Limit search depth.
+
+  Operator *I = dyn_cast<Operator>(V);
+  if (!I) return false;
+
+  switch (I->getOpcode()) {
+  default: break;
+  case Instruction::SExt:
+    if (!LookThroughSExt) return false;
+    // otherwise fall through to ZExt
+  case Instruction::ZExt:
+    return ComputeMultiple(I->getOperand(0), Base, Multiple,
+                           LookThroughSExt, Depth+1);
+  case Instruction::Shl:
+  case Instruction::Mul: {
+    Value *Op0 = I->getOperand(0);
+    Value *Op1 = I->getOperand(1);
+
+    if (I->getOpcode() == Instruction::Shl) {
+      ConstantInt *Op1CI = dyn_cast<ConstantInt>(Op1);
+      if (!Op1CI) return false;
+      // Turn Op0 << Op1 into Op0 * 2^Op1
+      APInt Op1Int = Op1CI->getValue();
+      uint64_t BitToSet = Op1Int.getLimitedValue(Op1Int.getBitWidth() - 1);
+      APInt API(Op1Int.getBitWidth(), 0);
+      API.setBit(BitToSet);
+      Op1 = ConstantInt::get(V->getContext(), API);
+    }
+
+    Value *Mul0 = NULL;
+    if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) {
+      if (Constant *Op1C = dyn_cast<Constant>(Op1))
+        if (Constant *MulC = dyn_cast<Constant>(Mul0)) {
+          if (Op1C->getType()->getPrimitiveSizeInBits() <
+              MulC->getType()->getPrimitiveSizeInBits())
+            Op1C = ConstantExpr::getZExt(Op1C, MulC->getType());
+          if (Op1C->getType()->getPrimitiveSizeInBits() >
+              MulC->getType()->getPrimitiveSizeInBits())
+            MulC = ConstantExpr::getZExt(MulC, Op1C->getType());
+
+          // V == Base * (Mul0 * Op1), so return (Mul0 * Op1)
+          Multiple = ConstantExpr::getMul(MulC, Op1C);
+          return true;
+        }
+
+      if (ConstantInt *Mul0CI = dyn_cast<ConstantInt>(Mul0))
+        if (Mul0CI->getValue() == 1) {
+          // V == Base * Op1, so return Op1
+          Multiple = Op1;
+          return true;
+        }
+    }
+
+    Value *Mul1 = NULL;
+    if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) {
+      if (Constant *Op0C = dyn_cast<Constant>(Op0))
+        if (Constant *MulC = dyn_cast<Constant>(Mul1)) {
+          if (Op0C->getType()->getPrimitiveSizeInBits() <
+              MulC->getType()->getPrimitiveSizeInBits())
+            Op0C = ConstantExpr::getZExt(Op0C, MulC->getType());
+          if (Op0C->getType()->getPrimitiveSizeInBits() >
+              MulC->getType()->getPrimitiveSizeInBits())
+            MulC = ConstantExpr::getZExt(MulC, Op0C->getType());
+
+          // V == Base * (Mul1 * Op0), so return (Mul1 * Op0)
+          Multiple = ConstantExpr::getMul(MulC, Op0C);
+          return true;
+        }
+
+      if (ConstantInt *Mul1CI = dyn_cast<ConstantInt>(Mul1))
+        if (Mul1CI->getValue() == 1) {
+          // V == Base * Op0, so return Op0
+          Multiple = Op0;
+          return true;
+        }
+    }
+  }
+  }
+
+  // We could not determine if V is a multiple of Base.
+  return false;
+}
+
+/// CannotBeNegativeZero - Return true if we can prove that the specified FP
+/// value is never equal to -0.0.
+///
+/// NOTE: this function will need to be revisited when we support non-default
+/// rounding modes!
+///
+bool llvm::CannotBeNegativeZero(const Value *V, unsigned Depth) {
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
+    return !CFP->getValueAPF().isNegZero();
+
+  if (Depth == 6)
+    return 1;  // Limit search depth.
+
+  const Operator *I = dyn_cast<Operator>(V);
+  if (I == 0) return false;
+
+  // Check if the nsz fast-math flag is set
+  if (const FPMathOperator *FPO = dyn_cast<FPMathOperator>(I))
+    if (FPO->hasNoSignedZeros())
+      return true;
+
+  // (add x, 0.0) is guaranteed to return +0.0, not -0.0.
+  if (I->getOpcode() == Instruction::FAdd)
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(1)))
+      if (CFP->isNullValue())
+        return true;
+
+  // sitofp and uitofp turn into +0.0 for zero.
+  if (isa<SIToFPInst>(I) || isa<UIToFPInst>(I))
+    return true;
+
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    // sqrt(-0.0) = -0.0, no other negative results are possible.
+    if (II->getIntrinsicID() == Intrinsic::sqrt)
+      return CannotBeNegativeZero(II->getArgOperand(0), Depth+1);
+
+  if (const CallInst *CI = dyn_cast<CallInst>(I))
+    if (const Function *F = CI->getCalledFunction()) {
+      if (F->isDeclaration()) {
+        // abs(x) != -0.0
+        if (F->getName() == "abs") return true;
+        // fabs[lf](x) != -0.0
+        if (F->getName() == "fabs") return true;
+        if (F->getName() == "fabsf") return true;
+        if (F->getName() == "fabsl") return true;
+        if (F->getName() == "sqrt" || F->getName() == "sqrtf" ||
+            F->getName() == "sqrtl")
+          return CannotBeNegativeZero(CI->getArgOperand(0), Depth+1);
+      }
+    }
+
+  return false;
+}
+
+/// isBytewiseValue - If the specified value can be set by repeating the same
+/// byte in memory, return the i8 value that it is represented with.  This is
+/// true for all i8 values obviously, but is also true for i32 0, i32 -1,
+/// i16 0xF0F0, double 0.0 etc.  If the value can't be handled with a repeated
+/// byte store (e.g. i16 0x1234), return null.
+Value *llvm::isBytewiseValue(Value *V) {
+  // All byte-wide stores are splatable, even of arbitrary variables.
+  if (V->getType()->isIntegerTy(8)) return V;
+
+  // Handle 'null' ConstantArrayZero etc.
+  if (Constant *C = dyn_cast<Constant>(V))
+    if (C->isNullValue())
+      return Constant::getNullValue(Type::getInt8Ty(V->getContext()));
+
+  // Constant float and double values can be handled as integer values if the
+  // corresponding integer value is "byteable".  An important case is 0.0.
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+    if (CFP->getType()->isFloatTy())
+      V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(V->getContext()));
+    if (CFP->getType()->isDoubleTy())
+      V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(V->getContext()));
+    // Don't handle long double formats, which have strange constraints.
+  }
+
+  // We can handle constant integers that are power of two in size and a
+  // multiple of 8 bits.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    unsigned Width = CI->getBitWidth();
+    if (isPowerOf2_32(Width) && Width > 8) {
+      // We can handle this value if the recursive binary decomposition is the
+      // same at all levels.
+      APInt Val = CI->getValue();
+      APInt Val2;
+      while (Val.getBitWidth() != 8) {
+        unsigned NextWidth = Val.getBitWidth()/2;
+        Val2  = Val.lshr(NextWidth);
+        Val2 = Val2.trunc(Val.getBitWidth()/2);
+        Val = Val.trunc(Val.getBitWidth()/2);
+
+        // If the top/bottom halves aren't the same, reject it.
+        if (Val != Val2)
+          return 0;
+      }
+      return ConstantInt::get(V->getContext(), Val);
+    }
+  }
+
+  // A ConstantDataArray/Vector is splatable if all its members are equal and
+  // also splatable.
+  if (ConstantDataSequential *CA = dyn_cast<ConstantDataSequential>(V)) {
+    Value *Elt = CA->getElementAsConstant(0);
+    Value *Val = isBytewiseValue(Elt);
+    if (!Val)
+      return 0;
+
+    for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I)
+      if (CA->getElementAsConstant(I) != Elt)
+        return 0;
+
+    return Val;
+  }
+
+  // Conceptually, we could handle things like:
+  //   %a = zext i8 %X to i16
+  //   %b = shl i16 %a, 8
+  //   %c = or i16 %a, %b
+  // but until there is an example that actually needs this, it doesn't seem
+  // worth worrying about.
+  return 0;
+}
+
+
+// This is the recursive version of BuildSubAggregate. It takes a few different
+// arguments. Idxs is the index within the nested struct From that we are
+// looking at now (which is of type IndexedType). IdxSkip is the number of
+// indices from Idxs that should be left out when inserting into the resulting
+// struct. To is the result struct built so far, new insertvalue instructions
+// build on that.
+static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType,
+                                SmallVector<unsigned, 10> &Idxs,
+                                unsigned IdxSkip,
+                                Instruction *InsertBefore) {
+  llvm::StructType *STy = dyn_cast<llvm::StructType>(IndexedType);
+  if (STy) {
+    // Save the original To argument so we can modify it
+    Value *OrigTo = To;
+    // General case, the type indexed by Idxs is a struct
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      // Process each struct element recursively
+      Idxs.push_back(i);
+      Value *PrevTo = To;
+      To = BuildSubAggregate(From, To, STy->getElementType(i), Idxs, IdxSkip,
+                             InsertBefore);
+      Idxs.pop_back();
+      if (!To) {
+        // Couldn't find any inserted value for this index? Cleanup
+        while (PrevTo != OrigTo) {
+          InsertValueInst* Del = cast<InsertValueInst>(PrevTo);
+          PrevTo = Del->getAggregateOperand();
+          Del->eraseFromParent();
+        }
+        // Stop processing elements
+        break;
+      }
+    }
+    // If we successfully found a value for each of our subaggregates
+    if (To)
+      return To;
+  }
+  // Base case, the type indexed by SourceIdxs is not a struct, or not all of
+  // the struct's elements had a value that was inserted directly. In the latter
+  // case, perhaps we can't determine each of the subelements individually, but
+  // we might be able to find the complete struct somewhere.
+
+  // Find the value that is at that particular spot
+  Value *V = FindInsertedValue(From, Idxs);
+
+  if (!V)
+    return NULL;
+
+  // Insert the value in the new (sub) aggregrate
+  return llvm::InsertValueInst::Create(To, V, makeArrayRef(Idxs).slice(IdxSkip),
+                                       "tmp", InsertBefore);
+}
+
+// This helper takes a nested struct and extracts a part of it (which is again a
+// struct) into a new value. For example, given the struct:
+// { a, { b, { c, d }, e } }
+// and the indices "1, 1" this returns
+// { c, d }.
+//
+// It does this by inserting an insertvalue for each element in the resulting
+// struct, as opposed to just inserting a single struct. This will only work if
+// each of the elements of the substruct are known (ie, inserted into From by an
+// insertvalue instruction somewhere).
+//
+// All inserted insertvalue instructions are inserted before InsertBefore
+static Value *BuildSubAggregate(Value *From, ArrayRef<unsigned> idx_range,
+                                Instruction *InsertBefore) {
+  assert(InsertBefore && "Must have someplace to insert!");
+  Type *IndexedType = ExtractValueInst::getIndexedType(From->getType(),
+                                                             idx_range);
+  Value *To = UndefValue::get(IndexedType);
+  SmallVector<unsigned, 10> Idxs(idx_range.begin(), idx_range.end());
+  unsigned IdxSkip = Idxs.size();
+
+  return BuildSubAggregate(From, To, IndexedType, Idxs, IdxSkip, InsertBefore);
+}
+
+/// FindInsertedValue - Given an aggregrate and an sequence of indices, see if
+/// the scalar value indexed is already around as a register, for example if it
+/// were inserted directly into the aggregrate.
+///
+/// If InsertBefore is not null, this function will duplicate (modified)
+/// insertvalues when a part of a nested struct is extracted.
+Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
+                               Instruction *InsertBefore) {
+  // Nothing to index? Just return V then (this is useful at the end of our
+  // recursion).
+  if (idx_range.empty())
+    return V;
+  // We have indices, so V should have an indexable type.
+  assert((V->getType()->isStructTy() || V->getType()->isArrayTy()) &&
+         "Not looking at a struct or array?");
+  assert(ExtractValueInst::getIndexedType(V->getType(), idx_range) &&
+         "Invalid indices for type?");
+
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    C = C->getAggregateElement(idx_range[0]);
+    if (C == 0) return 0;
+    return FindInsertedValue(C, idx_range.slice(1), InsertBefore);
+  }
+
+  if (InsertValueInst *I = dyn_cast<InsertValueInst>(V)) {
+    // Loop the indices for the insertvalue instruction in parallel with the
+    // requested indices
+    const unsigned *req_idx = idx_range.begin();
+    for (const unsigned *i = I->idx_begin(), *e = I->idx_end();
+         i != e; ++i, ++req_idx) {
+      if (req_idx == idx_range.end()) {
+        // We can't handle this without inserting insertvalues
+        if (!InsertBefore)
+          return 0;
+
+        // The requested index identifies a part of a nested aggregate. Handle
+        // this specially. For example,
+        // %A = insertvalue { i32, {i32, i32 } } undef, i32 10, 1, 0
+        // %B = insertvalue { i32, {i32, i32 } } %A, i32 11, 1, 1
+        // %C = extractvalue {i32, { i32, i32 } } %B, 1
+        // This can be changed into
+        // %A = insertvalue {i32, i32 } undef, i32 10, 0
+        // %C = insertvalue {i32, i32 } %A, i32 11, 1
+        // which allows the unused 0,0 element from the nested struct to be
+        // removed.
+        return BuildSubAggregate(V, makeArrayRef(idx_range.begin(), req_idx),
+                                 InsertBefore);
+      }
+
+      // This insert value inserts something else than what we are looking for.
+      // See if the (aggregrate) value inserted into has the value we are
+      // looking for, then.
+      if (*req_idx != *i)
+        return FindInsertedValue(I->getAggregateOperand(), idx_range,
+                                 InsertBefore);
+    }
+    // If we end up here, the indices of the insertvalue match with those
+    // requested (though possibly only partially). Now we recursively look at
+    // the inserted value, passing any remaining indices.
+    return FindInsertedValue(I->getInsertedValueOperand(),
+                             makeArrayRef(req_idx, idx_range.end()),
+                             InsertBefore);
+  }
+
+  if (ExtractValueInst *I = dyn_cast<ExtractValueInst>(V)) {
+    // If we're extracting a value from an aggregrate that was extracted from
+    // something else, we can extract from that something else directly instead.
+    // However, we will need to chain I's indices with the requested indices.
+
+    // Calculate the number of indices required
+    unsigned size = I->getNumIndices() + idx_range.size();
+    // Allocate some space to put the new indices in
+    SmallVector<unsigned, 5> Idxs;
+    Idxs.reserve(size);
+    // Add indices from the extract value instruction
+    Idxs.append(I->idx_begin(), I->idx_end());
+
+    // Add requested indices
+    Idxs.append(idx_range.begin(), idx_range.end());
+
+    assert(Idxs.size() == size
+           && "Number of indices added not correct?");
+
+    return FindInsertedValue(I->getAggregateOperand(), Idxs, InsertBefore);
+  }
+  // Otherwise, we don't know (such as, extracting from a function return value
+  // or load instruction)
+  return 0;
+}
+
+/// GetPointerBaseWithConstantOffset - Analyze the specified pointer to see if
+/// it can be expressed as a base pointer plus a constant offset.  Return the
+/// base and offset to the caller.
+Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
+                                              const DataLayout *TD) {
+  // Without DataLayout, conservatively assume 64-bit offsets, which is
+  // the widest we support.
+  unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64;
+  APInt ByteOffset(BitWidth, 0);
+  while (1) {
+    if (Ptr->getType()->isVectorTy())
+      break;
+
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+      APInt GEPOffset(BitWidth, 0);
+      if (TD && !GEP->accumulateConstantOffset(*TD, GEPOffset))
+        break;
+      ByteOffset += GEPOffset;
+      Ptr = GEP->getPointerOperand();
+    } else if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+      Ptr = cast<Operator>(Ptr)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+      if (GA->mayBeOverridden())
+        break;
+      Ptr = GA->getAliasee();
+    } else {
+      break;
+    }
+  }
+  Offset = ByteOffset.getSExtValue();
+  return Ptr;
+}
+
+
+/// getConstantStringInfo - This function computes the length of a
+/// null-terminated C string pointed to by V.  If successful, it returns true
+/// and returns the string in Str.  If unsuccessful, it returns false.
+bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
+                                 uint64_t Offset, bool TrimAtNul) {
+  assert(V);
+
+  // Look through bitcast instructions and geps.
+  V = V->stripPointerCasts();
+
+  // If the value is a GEP instructionor  constant expression, treat it as an
+  // offset.
+  if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+    // Make sure the GEP has exactly three arguments.
+    if (GEP->getNumOperands() != 3)
+      return false;
+
+    // Make sure the index-ee is a pointer to array of i8.
+    PointerType *PT = cast<PointerType>(GEP->getOperand(0)->getType());
+    ArrayType *AT = dyn_cast<ArrayType>(PT->getElementType());
+    if (AT == 0 || !AT->getElementType()->isIntegerTy(8))
+      return false;
+
+    // Check to make sure that the first operand of the GEP is an integer and
+    // has value 0 so that we are sure we're indexing into the initializer.
+    const ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
+    if (FirstIdx == 0 || !FirstIdx->isZero())
+      return false;
+
+    // If the second index isn't a ConstantInt, then this is a variable index
+    // into the array.  If this occurs, we can't say anything meaningful about
+    // the string.
+    uint64_t StartIdx = 0;
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(2)))
+      StartIdx = CI->getZExtValue();
+    else
+      return false;
+    return getConstantStringInfo(GEP->getOperand(0), Str, StartIdx+Offset);
+  }
+
+  // The GEP instruction, constant or instruction, must reference a global
+  // variable that is a constant and is initialized. The referenced constant
+  // initializer is the array that we'll use for optimization.
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(V);
+  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+    return false;
+
+  // Handle the all-zeros case
+  if (GV->getInitializer()->isNullValue()) {
+    // This is a degenerate case. The initializer is constant zero so the
+    // length of the string must be zero.
+    Str = "";
+    return true;
+  }
+
+  // Must be a Constant Array
+  const ConstantDataArray *Array =
+    dyn_cast<ConstantDataArray>(GV->getInitializer());
+  if (Array == 0 || !Array->isString())
+    return false;
+
+  // Get the number of elements in the array
+  uint64_t NumElts = Array->getType()->getArrayNumElements();
+
+  // Start out with the entire array in the StringRef.
+  Str = Array->getAsString();
+
+  if (Offset > NumElts)
+    return false;
+
+  // Skip over 'offset' bytes.
+  Str = Str.substr(Offset);
+
+  if (TrimAtNul) {
+    // Trim off the \0 and anything after it.  If the array is not nul
+    // terminated, we just return the whole end of string.  The client may know
+    // some other way that the string is length-bound.
+    Str = Str.substr(0, Str.find('\0'));
+  }
+  return true;
+}
+
+// These next two are very similar to the above, but also look through PHI
+// nodes.
+// TODO: See if we can integrate these two together.
+
+/// GetStringLengthH - If we can compute the length of the string pointed to by
+/// the specified pointer, return 'len+1'.  If we can't, return 0.
+static uint64_t GetStringLengthH(Value *V, SmallPtrSet<PHINode*, 32> &PHIs) {
+  // Look through noop bitcast instructions.
+  V = V->stripPointerCasts();
+
+  // If this is a PHI node, there are two cases: either we have already seen it
+  // or we haven't.
+  if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    if (!PHIs.insert(PN))
+      return ~0ULL;  // already in the set.
+
+    // If it was new, see if all the input strings are the same length.
+    uint64_t LenSoFar = ~0ULL;
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      uint64_t Len = GetStringLengthH(PN->getIncomingValue(i), PHIs);
+      if (Len == 0) return 0; // Unknown length -> unknown.
+
+      if (Len == ~0ULL) continue;
+
+      if (Len != LenSoFar && LenSoFar != ~0ULL)
+        return 0;    // Disagree -> unknown.
+      LenSoFar = Len;
+    }
+
+    // Success, all agree.
+    return LenSoFar;
+  }
+
+  // strlen(select(c,x,y)) -> strlen(x) ^ strlen(y)
+  if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
+    uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs);
+    if (Len1 == 0) return 0;
+    uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs);
+    if (Len2 == 0) return 0;
+    if (Len1 == ~0ULL) return Len2;
+    if (Len2 == ~0ULL) return Len1;
+    if (Len1 != Len2) return 0;
+    return Len1;
+  }
+
+  // Otherwise, see if we can read the string.
+  StringRef StrData;
+  if (!getConstantStringInfo(V, StrData))
+    return 0;
+
+  return StrData.size()+1;
+}
+
+/// GetStringLength - If we can compute the length of the string pointed to by
+/// the specified pointer, return 'len+1'.  If we can't, return 0.
+uint64_t llvm::GetStringLength(Value *V) {
+  if (!V->getType()->isPointerTy()) return 0;
+
+  SmallPtrSet<PHINode*, 32> PHIs;
+  uint64_t Len = GetStringLengthH(V, PHIs);
+  // If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return
+  // an empty string as a length.
+  return Len == ~0ULL ? 1 : Len;
+}
+
+Value *
+llvm::GetUnderlyingObject(Value *V, const DataLayout *TD, unsigned MaxLookup) {
+  if (!V->getType()->isPointerTy())
+    return V;
+  for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) {
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      V = GEP->getPointerOperand();
+    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+      V = cast<Operator>(V)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+      if (GA->mayBeOverridden())
+        return V;
+      V = GA->getAliasee();
+    } else {
+      // See if InstructionSimplify knows any relevant tricks.
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        // TODO: Acquire a DominatorTree and use it.
+        if (Value *Simplified = SimplifyInstruction(I, TD, 0)) {
+          V = Simplified;
+          continue;
+        }
+
+      return V;
+    }
+    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
+  }
+  return V;
+}
+
+void
+llvm::GetUnderlyingObjects(Value *V,
+                           SmallVectorImpl<Value *> &Objects,
+                           const DataLayout *TD,
+                           unsigned MaxLookup) {
+  SmallPtrSet<Value *, 4> Visited;
+  SmallVector<Value *, 4> Worklist;
+  Worklist.push_back(V);
+  do {
+    Value *P = Worklist.pop_back_val();
+    P = GetUnderlyingObject(P, TD, MaxLookup);
+
+    if (!Visited.insert(P))
+      continue;
+
+    if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
+      Worklist.push_back(SI->getTrueValue());
+      Worklist.push_back(SI->getFalseValue());
+      continue;
+    }
+
+    if (PHINode *PN = dyn_cast<PHINode>(P)) {
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        Worklist.push_back(PN->getIncomingValue(i));
+      continue;
+    }
+
+    Objects.push_back(P);
+  } while (!Worklist.empty());
+}
+
+/// onlyUsedByLifetimeMarkers - Return true if the only users of this pointer
+/// are lifetime markers.
+///
+bool llvm::onlyUsedByLifetimeMarkers(const Value *V) {
+  for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
+       UI != UE; ++UI) {
+    const IntrinsicInst *II = dyn_cast<IntrinsicInst>(*UI);
+    if (!II) return false;
+
+    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+        II->getIntrinsicID() != Intrinsic::lifetime_end)
+      return false;
+  }
+  return true;
+}
+
+bool llvm::isSafeToSpeculativelyExecute(const Value *V,
+                                        const DataLayout *TD) {
+  const Operator *Inst = dyn_cast<Operator>(V);
+  if (!Inst)
+    return false;
+
+  for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
+    if (Constant *C = dyn_cast<Constant>(Inst->getOperand(i)))
+      if (C->canTrap())
+        return false;
+
+  switch (Inst->getOpcode()) {
+  default:
+    return true;
+  case Instruction::UDiv:
+  case Instruction::URem:
+    // x / y is undefined if y == 0, but calcuations like x / 3 are safe.
+    return isKnownNonZero(Inst->getOperand(1), TD);
+  case Instruction::SDiv:
+  case Instruction::SRem: {
+    Value *Op = Inst->getOperand(1);
+    // x / y is undefined if y == 0
+    if (!isKnownNonZero(Op, TD))
+      return false;
+    // x / y might be undefined if y == -1
+    unsigned BitWidth = getBitWidth(Op->getType(), TD);
+    if (BitWidth == 0)
+      return false;
+    APInt KnownZero(BitWidth, 0);
+    APInt KnownOne(BitWidth, 0);
+    ComputeMaskedBits(Op, KnownZero, KnownOne, TD);
+    return !!KnownZero;
+  }
+  case Instruction::Load: {
+    const LoadInst *LI = cast<LoadInst>(Inst);
+    if (!LI->isUnordered())
+      return false;
+    return LI->getPointerOperand()->isDereferenceablePointer();
+  }
+  case Instruction::Call: {
+   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+     switch (II->getIntrinsicID()) {
+       // These synthetic intrinsics have no side-effects, and just mark
+       // information about their operands.
+       // FIXME: There are other no-op synthetic instructions that potentially
+       // should be considered at least *safe* to speculate...
+       case Intrinsic::dbg_declare:
+       case Intrinsic::dbg_value:
+         return true;
+
+       case Intrinsic::bswap:
+       case Intrinsic::ctlz:
+       case Intrinsic::ctpop:
+       case Intrinsic::cttz:
+       case Intrinsic::objectsize:
+       case Intrinsic::sadd_with_overflow:
+       case Intrinsic::smul_with_overflow:
+       case Intrinsic::ssub_with_overflow:
+       case Intrinsic::uadd_with_overflow:
+       case Intrinsic::umul_with_overflow:
+       case Intrinsic::usub_with_overflow:
+         return true;
+       // TODO: some fp intrinsics are marked as having the same error handling
+       // as libm. They're safe to speculate when they won't error.
+       // TODO: are convert_{from,to}_fp16 safe?
+       // TODO: can we list target-specific intrinsics here?
+       default: break;
+     }
+   }
+    return false; // The called function could have undefined behavior or
+                  // side-effects, even if marked readnone nounwind.
+  }
+  case Instruction::VAArg:
+  case Instruction::Alloca:
+  case Instruction::Invoke:
+  case Instruction::PHI:
+  case Instruction::Store:
+  case Instruction::Ret:
+  case Instruction::Br:
+  case Instruction::IndirectBr:
+  case Instruction::Switch:
+  case Instruction::Unreachable:
+  case Instruction::Fence:
+  case Instruction::LandingPad:
+  case Instruction::AtomicRMW:
+  case Instruction::AtomicCmpXchg:
+  case Instruction::Resume:
+    return false; // Misc instructions which have effects
+  }
+}
+
+/// isKnownNonNull - Return true if we know that the specified value is never
+/// null.
+bool llvm::isKnownNonNull(const Value *V) {
+  // Alloca never returns null, malloc might.
+  if (isa<AllocaInst>(V)) return true;
+
+  // A byval argument is never null.
+  if (const Argument *A = dyn_cast<Argument>(V))
+    return A->hasByValAttr();
+
+  // Global values are not null unless extern weak.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+    return !GV->hasExternalWeakLinkage();
+  return false;
+}
diff --git a/contrib/llvm/lib/Archive/Archive.cpp b/contrib/llvm/lib/Archive/Archive.cpp
new file mode 100644
index 000000000000..1f36a00ab086
--- /dev/null
+++ b/contrib/llvm/lib/Archive/Archive.cpp
@@ -0,0 +1,262 @@
+//===-- Archive.cpp - Generic LLVM archive functions ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the Archive and ArchiveMember
+// classes that is common to both reading and writing archives..
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/Archive.h"
+#include "ArchiveInternals.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/system_error.h"
+#include <cstring>
+#include <memory>
+using namespace llvm;
+
+// getMemberSize - compute the actual physical size of the file member as seen
+// on disk. This isn't the size of member's payload. Use getSize() for that.
+unsigned
+ArchiveMember::getMemberSize() const {
+  // Basically its the file size plus the header size
+  unsigned result =  info.fileSize + sizeof(ArchiveMemberHeader);
+
+  // If it has a long filename, include the name length
+  if (hasLongFilename())
+    result += path.str().length() + 1;
+
+  // If its now odd lengthed, include the padding byte
+  if (result % 2 != 0 )
+    result++;
+
+  return result;
+}
+
+// This default constructor is only use by the ilist when it creates its
+// sentry node. We give it specific static values to make it stand out a bit.
+ArchiveMember::ArchiveMember()
+  : parent(0), path("--invalid--"), flags(0), data(0)
+{
+  info.user = sys::Process::GetCurrentUserId();
+  info.group = sys::Process::GetCurrentGroupId();
+  info.mode = 0777;
+  info.fileSize = 0;
+  info.modTime = sys::TimeValue::now();
+}
+
+// This is the constructor that the Archive class uses when it is building or
+// reading an archive. It just defaults a few things and ensures the parent is
+// set for the iplist. The Archive class fills in the ArchiveMember's data.
+// This is required because correctly setting the data may depend on other
+// things in the Archive.
+ArchiveMember::ArchiveMember(Archive* PAR)
+  : parent(PAR), path(), flags(0), data(0)
+{
+}
+
+// This method allows an ArchiveMember to be replaced with the data for a
+// different file, presumably as an update to the member. It also makes sure
+// the flags are reset correctly.
+bool ArchiveMember::replaceWith(const sys::Path& newFile, std::string* ErrMsg) {
+  bool Exists;
+  if (sys::fs::exists(newFile.str(), Exists) || !Exists) {
+    if (ErrMsg)
+      *ErrMsg = "Can not replace an archive member with a non-existent file";
+    return true;
+  }
+
+  data = 0;
+  path = newFile;
+
+  // SVR4 symbol tables have an empty name
+  if (path.str() == ARFILE_SVR4_SYMTAB_NAME)
+    flags |= SVR4SymbolTableFlag;
+  else
+    flags &= ~SVR4SymbolTableFlag;
+
+  // BSD4.4 symbol tables have a special name
+  if (path.str() == ARFILE_BSD4_SYMTAB_NAME)
+    flags |= BSD4SymbolTableFlag;
+  else
+    flags &= ~BSD4SymbolTableFlag;
+
+  // LLVM symbol tables have a very specific name
+  if (path.str() == ARFILE_LLVM_SYMTAB_NAME)
+    flags |= LLVMSymbolTableFlag;
+  else
+    flags &= ~LLVMSymbolTableFlag;
+
+  // String table name
+  if (path.str() == ARFILE_STRTAB_NAME)
+    flags |= StringTableFlag;
+  else
+    flags &= ~StringTableFlag;
+
+  // If it has a slash then it has a path
+  bool hasSlash = path.str().find('/') != std::string::npos;
+  if (hasSlash)
+    flags |= HasPathFlag;
+  else
+    flags &= ~HasPathFlag;
+
+  // If it has a slash or its over 15 chars then its a long filename format
+  if (hasSlash || path.str().length() > 15)
+    flags |= HasLongFilenameFlag;
+  else
+    flags &= ~HasLongFilenameFlag;
+
+  // Get the signature and status info
+  const char* signature = (const char*) data;
+  SmallString<4> magic;
+  if (!signature) {
+    sys::fs::get_magic(path.str(), magic.capacity(), magic);
+    signature = magic.c_str();
+    const sys::FileStatus *FSinfo = path.getFileStatus(false, ErrMsg);
+    if (FSinfo)
+      info = *FSinfo;
+    else
+      return true;
+  }
+
+  // Determine what kind of file it is.
+  switch (sys::IdentifyFileType(signature,4)) {
+    case sys::Bitcode_FileType:
+      flags |= BitcodeFlag;
+      break;
+    default:
+      flags &= ~BitcodeFlag;
+      break;
+  }
+  return false;
+}
+
+// Archive constructor - this is the only constructor that gets used for the
+// Archive class. Everything else (default,copy) is deprecated. This just
+// initializes and maps the file into memory, if requested.
+Archive::Archive(const sys::Path& filename, LLVMContext& C)
+  : archPath(filename), members(), mapfile(0), base(0), symTab(), strtab(),
+    symTabSize(0), firstFileOffset(0), modules(), foreignST(0), Context(C) {
+}
+
+bool
+Archive::mapToMemory(std::string* ErrMsg) {
+  OwningPtr<MemoryBuffer> File;
+  if (error_code ec = MemoryBuffer::getFile(archPath.c_str(), File)) {
+    if (ErrMsg)
+      *ErrMsg = ec.message();
+    return true;
+  }
+  mapfile = File.take();
+  base = mapfile->getBufferStart();
+  return false;
+}
+
+void Archive::cleanUpMemory() {
+  // Shutdown the file mapping
+  delete mapfile;
+  mapfile = 0;
+  base = 0;
+
+  // Forget the entire symbol table
+  symTab.clear();
+  symTabSize = 0;
+
+  firstFileOffset = 0;
+
+  // Free the foreign symbol table member
+  if (foreignST) {
+    delete foreignST;
+    foreignST = 0;
+  }
+
+  // Delete any Modules and ArchiveMember's we've allocated as a result of
+  // symbol table searches.
+  for (ModuleMap::iterator I=modules.begin(), E=modules.end(); I != E; ++I ) {
+    delete I->second.first;
+    delete I->second.second;
+  }
+}
+
+// Archive destructor - just clean up memory
+Archive::~Archive() {
+  cleanUpMemory();
+}
+
+
+
+static void getSymbols(Module*M, std::vector<std::string>& symbols) {
+  // Loop over global variables
+  for (Module::global_iterator GI = M->global_begin(), GE=M->global_end(); GI != GE; ++GI)
+    if (!GI->isDeclaration() && !GI->hasLocalLinkage())
+      if (!GI->getName().empty())
+        symbols.push_back(GI->getName());
+
+  // Loop over functions
+  for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI)
+    if (!FI->isDeclaration() && !FI->hasLocalLinkage())
+      if (!FI->getName().empty())
+        symbols.push_back(FI->getName());
+
+  // Loop over aliases
+  for (Module::alias_iterator AI = M->alias_begin(), AE = M->alias_end();
+       AI != AE; ++AI) {
+    if (AI->hasName())
+      symbols.push_back(AI->getName());
+  }
+}
+
+// Get just the externally visible defined symbols from the bitcode
+bool llvm::GetBitcodeSymbols(const sys::Path& fName,
+                             LLVMContext& Context,
+                             std::vector<std::string>& symbols,
+                             std::string* ErrMsg) {
+  OwningPtr<MemoryBuffer> Buffer;
+  if (error_code ec = MemoryBuffer::getFileOrSTDIN(fName.c_str(), Buffer)) {
+    if (ErrMsg) *ErrMsg = "Could not open file '" + fName.str() + "'" + ": "
+                        + ec.message();
+    return true;
+  }
+
+  Module *M = ParseBitcodeFile(Buffer.get(), Context, ErrMsg);
+  if (!M)
+    return true;
+
+  // Get the symbols
+  getSymbols(M, symbols);
+
+  // Done with the module.
+  delete M;
+  return true;
+}
+
+Module*
+llvm::GetBitcodeSymbols(const char *BufPtr, unsigned Length,
+                        const std::string& ModuleID,
+                        LLVMContext& Context,
+                        std::vector<std::string>& symbols,
+                        std::string* ErrMsg) {
+  // Get the module.
+  OwningPtr<MemoryBuffer> Buffer(
+    MemoryBuffer::getMemBufferCopy(StringRef(BufPtr, Length),ModuleID.c_str()));
+
+  Module *M = ParseBitcodeFile(Buffer.get(), Context, ErrMsg);
+  if (!M)
+    return 0;
+
+  // Get the symbols
+  getSymbols(M, symbols);
+
+  // Done with the module. Note that it's the caller's responsibility to delete
+  // the Module.
+  return M;
+}
diff --git a/contrib/llvm/lib/Archive/ArchiveInternals.h b/contrib/llvm/lib/Archive/ArchiveInternals.h
new file mode 100644
index 000000000000..f6c87e899f25
--- /dev/null
+++ b/contrib/llvm/lib/Archive/ArchiveInternals.h
@@ -0,0 +1,88 @@
+//===-- lib/Archive/ArchiveInternals.h -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Internal implementation header for LLVM Archive files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIB_ARCHIVE_ARCHIVEINTERNALS_H
+#define LIB_ARCHIVE_ARCHIVEINTERNALS_H
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Bitcode/Archive.h"
+#include "llvm/Support/TimeValue.h"
+#include <cstring>
+
+#define ARFILE_MAGIC "!<arch>\n"                   ///< magic string
+#define ARFILE_MAGIC_LEN (sizeof(ARFILE_MAGIC)-1)  ///< length of magic string
+#define ARFILE_SVR4_SYMTAB_NAME "/               " ///< SVR4 symtab entry name
+#define ARFILE_LLVM_SYMTAB_NAME "#_LLVM_SYM_TAB_#" ///< LLVM symtab entry name
+#define ARFILE_BSD4_SYMTAB_NAME "__.SYMDEF SORTED" ///< BSD4 symtab entry name
+#define ARFILE_STRTAB_NAME      "//              " ///< Name of string table
+#define ARFILE_PAD "\n"                            ///< inter-file align padding
+#define ARFILE_MEMBER_MAGIC "`\n"                  ///< fmag field magic #
+
+namespace llvm {
+
+  class LLVMContext;
+
+  /// The ArchiveMemberHeader structure is used internally for bitcode
+  /// archives.
+  /// The header precedes each file member in the archive. This structure is
+  /// defined using character arrays for direct and correct interpretation
+  /// regardless of the endianess of the machine that produced it.
+  /// @brief Archive File Member Header
+  class ArchiveMemberHeader {
+    /// @name Data
+    /// @{
+    public:
+      char name[16];  ///< Name of the file member.
+      char date[12];  ///< File date, decimal seconds since Epoch
+      char uid[6];    ///< user id in ASCII decimal
+      char gid[6];    ///< group id in ASCII decimal
+      char mode[8];   ///< file mode in ASCII octal
+      char size[10];  ///< file size in ASCII decimal
+      char fmag[2];   ///< Always contains ARFILE_MAGIC_TERMINATOR
+
+    /// @}
+    /// @name Methods
+    /// @{
+    public:
+    void init() {
+      memset(name,' ',16);
+      memset(date,' ',12);
+      memset(uid,' ',6);
+      memset(gid,' ',6);
+      memset(mode,' ',8);
+      memset(size,' ',10);
+      fmag[0] = '`';
+      fmag[1] = '\n';
+    }
+
+    bool checkSignature() const {
+      return 0 == memcmp(fmag, ARFILE_MEMBER_MAGIC,2);
+    }
+  };
+  
+  // Get just the externally visible defined symbols from the bitcode
+  bool GetBitcodeSymbols(const sys::Path& fName,
+                          LLVMContext& Context,
+                          std::vector<std::string>& symbols,
+                          std::string* ErrMsg);
+  
+  Module* GetBitcodeSymbols(const char *Buffer, unsigned Length,
+                            const std::string& ModuleID,
+                            LLVMContext& Context,
+                            std::vector<std::string>& symbols,
+                            std::string* ErrMsg);
+}
+
+#endif
+
+// vim: sw=2 ai
diff --git a/contrib/llvm/lib/Archive/ArchiveReader.cpp b/contrib/llvm/lib/Archive/ArchiveReader.cpp
new file mode 100644
index 000000000000..14713e692c0f
--- /dev/null
+++ b/contrib/llvm/lib/Archive/ArchiveReader.cpp
@@ -0,0 +1,633 @@
+//===-- ArchiveReader.cpp - Read LLVM archive files -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Builds up standard unix archive files (.a) containing LLVM bitcode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/Archive.h"
+#include "ArchiveInternals.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstdio>
+#include <cstdlib>
+using namespace llvm;
+
+/// Read a variable-bit-rate encoded unsigned integer
+static inline unsigned readInteger(const char*&At, const char*End) {
+  unsigned Shift = 0;
+  unsigned Result = 0;
+
+  do {
+    if (At == End)
+      return Result;
+    Result |= (unsigned)((*At++) & 0x7F) << Shift;
+    Shift += 7;
+  } while (At[-1] & 0x80);
+  return Result;
+}
+
+// Completely parse the Archive's symbol table and populate symTab member var.
+bool
+Archive::parseSymbolTable(const void* data, unsigned size, std::string* error) {
+  const char* At = (const char*) data;
+  const char* End = At + size;
+  while (At < End) {
+    unsigned offset = readInteger(At, End);
+    if (At == End) {
+      if (error)
+        *error = "Ran out of data reading vbr_uint for symtab offset!";
+      return false;
+    }
+    unsigned length = readInteger(At, End);
+    if (At == End) {
+      if (error)
+        *error = "Ran out of data reading vbr_uint for symtab length!";
+      return false;
+    }
+    if (At + length > End) {
+      if (error)
+        *error = "Malformed symbol table: length not consistent with size";
+      return false;
+    }
+    // we don't care if it can't be inserted (duplicate entry)
+    symTab.insert(std::make_pair(std::string(At, length), offset));
+    At += length;
+  }
+  symTabSize = size;
+  return true;
+}
+
+// This member parses an ArchiveMemberHeader that is presumed to be pointed to
+// by At. The At pointer is updated to the byte just after the header, which
+// can be variable in size.
+ArchiveMember*
+Archive::parseMemberHeader(const char*& At, const char* End, std::string* error)
+{
+  if (At + sizeof(ArchiveMemberHeader) >= End) {
+    if (error)
+      *error = "Unexpected end of file";
+    return 0;
+  }
+
+  // Cast archive member header
+  const ArchiveMemberHeader* Hdr = (const ArchiveMemberHeader*)At;
+  At += sizeof(ArchiveMemberHeader);
+
+  int flags = 0;
+  int MemberSize = atoi(Hdr->size);
+  assert(MemberSize >= 0);
+
+  // Check the size of the member for sanity
+  if (At + MemberSize > End) {
+    if (error)
+      *error = "invalid member length in archive file";
+    return 0;
+  }
+
+  // Check the member signature
+  if (!Hdr->checkSignature()) {
+    if (error)
+      *error = "invalid file member signature";
+    return 0;
+  }
+
+  // Convert and check the member name
+  // The empty name ( '/' and 15 blanks) is for a foreign (non-LLVM) symbol
+  // table. The special name "//" and 14 blanks is for a string table, used
+  // for long file names. This library doesn't generate either of those but
+  // it will accept them. If the name starts with #1/ and the remainder is
+  // digits, then those digits specify the length of the name that is
+  // stored immediately following the header. The special name
+  // __LLVM_SYM_TAB__ identifies the symbol table for LLVM bitcode.
+  // Anything else is a regular, short filename that is terminated with
+  // a '/' and blanks.
+
+  std::string pathname;
+  switch (Hdr->name[0]) {
+    case '#':
+      if (Hdr->name[1] == '1' && Hdr->name[2] == '/') {
+        if (isdigit(Hdr->name[3])) {
+          unsigned len = atoi(&Hdr->name[3]);
+          const char *nulp = (const char *)memchr(At, '\0', len);
+          pathname.assign(At, nulp != 0 ? (uintptr_t)(nulp - At) : len);
+          At += len;
+          MemberSize -= len;
+          flags |= ArchiveMember::HasLongFilenameFlag;
+        } else {
+          if (error)
+            *error = "invalid long filename";
+          return 0;
+        }
+      } else if (Hdr->name[1] == '_' &&
+                 (0 == memcmp(Hdr->name, ARFILE_LLVM_SYMTAB_NAME, 16))) {
+        // The member is using a long file name (>15 chars) format.
+        // This format is standard for 4.4BSD and Mac OSX operating
+        // systems. LLVM uses it similarly. In this format, the
+        // remainder of the name field (after #1/) specifies the
+        // length of the file name which occupy the first bytes of
+        // the member's data. The pathname already has the #1/ stripped.
+        pathname.assign(ARFILE_LLVM_SYMTAB_NAME);
+        flags |= ArchiveMember::LLVMSymbolTableFlag;
+      }
+      break;
+    case '/':
+      if (Hdr->name[1]== '/') {
+        if (0 == memcmp(Hdr->name, ARFILE_STRTAB_NAME, 16)) {
+          pathname.assign(ARFILE_STRTAB_NAME);
+          flags |= ArchiveMember::StringTableFlag;
+        } else {
+          if (error)
+            *error = "invalid string table name";
+          return 0;
+        }
+      } else if (Hdr->name[1] == ' ') {
+        if (0 == memcmp(Hdr->name, ARFILE_SVR4_SYMTAB_NAME, 16)) {
+          pathname.assign(ARFILE_SVR4_SYMTAB_NAME);
+          flags |= ArchiveMember::SVR4SymbolTableFlag;
+        } else {
+          if (error)
+            *error = "invalid SVR4 symbol table name";
+          return 0;
+        }
+      } else if (isdigit(Hdr->name[1])) {
+        unsigned index = atoi(&Hdr->name[1]);
+        if (index < strtab.length()) {
+          const char* namep = strtab.c_str() + index;
+          const char* endp = strtab.c_str() + strtab.length();
+          const char* p = namep;
+          const char* last_p = p;
+          while (p < endp) {
+            if (*p == '\n' && *last_p == '/') {
+              pathname.assign(namep, last_p - namep);
+              flags |= ArchiveMember::HasLongFilenameFlag;
+              break;
+            }
+            last_p = p;
+            p++;
+          }
+          if (p >= endp) {
+            if (error)
+              *error = "missing name terminator in string table";
+            return 0;
+          }
+        } else {
+          if (error)
+            *error = "name index beyond string table";
+          return 0;
+        }
+      }
+      break;
+    case '_':
+      if (Hdr->name[1] == '_' &&
+          (0 == memcmp(Hdr->name, ARFILE_BSD4_SYMTAB_NAME, 16))) {
+        pathname.assign(ARFILE_BSD4_SYMTAB_NAME);
+        flags |= ArchiveMember::BSD4SymbolTableFlag;
+        break;
+      }
+      /* FALL THROUGH */
+
+    default:
+      const char* slash = (const char*) memchr(Hdr->name, '/', 16);
+      if (slash == 0)
+        slash = Hdr->name + 16;
+      pathname.assign(Hdr->name, slash - Hdr->name);
+      break;
+  }
+
+  // Determine if this is a bitcode file
+  switch (sys::IdentifyFileType(At, 4)) {
+    case sys::Bitcode_FileType:
+      flags |= ArchiveMember::BitcodeFlag;
+      break;
+    default:
+      flags &= ~ArchiveMember::BitcodeFlag;
+      break;
+  }
+
+  // Instantiate the ArchiveMember to be filled
+  ArchiveMember* member = new ArchiveMember(this);
+
+  // Fill in fields of the ArchiveMember
+  member->parent = this;
+  member->path.set(pathname);
+  member->info.fileSize = MemberSize;
+  member->info.modTime.fromEpochTime(atoi(Hdr->date));
+  unsigned int mode;
+  sscanf(Hdr->mode, "%o", &mode);
+  member->info.mode = mode;
+  member->info.user = atoi(Hdr->uid);
+  member->info.group = atoi(Hdr->gid);
+  member->flags = flags;
+  member->data = At;
+
+  return member;
+}
+
+bool
+Archive::checkSignature(std::string* error) {
+  // Check the magic string at file's header
+  if (mapfile->getBufferSize() < 8 || memcmp(base, ARFILE_MAGIC, 8)) {
+    if (error)
+      *error = "invalid signature for an archive file";
+    return false;
+  }
+  return true;
+}
+
+// This function loads the entire archive and fully populates its ilist with
+// the members of the archive file. This is typically used in preparation for
+// editing the contents of the archive.
+bool
+Archive::loadArchive(std::string* error) {
+
+  // Set up parsing
+  members.clear();
+  symTab.clear();
+  const char *At = base;
+  const char *End = mapfile->getBufferEnd();
+
+  if (!checkSignature(error))
+    return false;
+
+  At += 8;  // Skip the magic string.
+
+  bool seenSymbolTable = false;
+  bool foundFirstFile = false;
+  while (At < End) {
+    // parse the member header
+    const char* Save = At;
+    ArchiveMember* mbr = parseMemberHeader(At, End, error);
+    if (!mbr)
+      return false;
+
+    // check if this is the foreign symbol table
+    if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
+      // We just save this but don't do anything special
+      // with it. It doesn't count as the "first file".
+      if (foreignST) {
+        // What? Multiple foreign symbol tables? Just chuck it
+        // and retain the last one found.
+        delete foreignST;
+      }
+      foreignST = mbr;
+      At += mbr->getSize();
+      if ((intptr_t(At) & 1) == 1)
+        At++;
+    } else if (mbr->isStringTable()) {
+      // Simply suck the entire string table into a string
+      // variable. This will be used to get the names of the
+      // members that use the "/ddd" format for their names
+      // (SVR4 style long names).
+      strtab.assign(At, mbr->getSize());
+      At += mbr->getSize();
+      if ((intptr_t(At) & 1) == 1)
+        At++;
+      delete mbr;
+    } else if (mbr->isLLVMSymbolTable()) {
+      // This is the LLVM symbol table for the archive. If we've seen it
+      // already, its an error. Otherwise, parse the symbol table and move on.
+      if (seenSymbolTable) {
+        if (error)
+          *error = "invalid archive: multiple symbol tables";
+        return false;
+      }
+      if (!parseSymbolTable(mbr->getData(), mbr->getSize(), error))
+        return false;
+      seenSymbolTable = true;
+      At += mbr->getSize();
+      if ((intptr_t(At) & 1) == 1)
+        At++;
+      delete mbr; // We don't need this member in the list of members.
+    } else {
+      // This is just a regular file. If its the first one, save its offset.
+      // Otherwise just push it on the list and move on to the next file.
+      if (!foundFirstFile) {
+        firstFileOffset = Save - base;
+        foundFirstFile = true;
+      }
+      members.push_back(mbr);
+      At += mbr->getSize();
+      if ((intptr_t(At) & 1) == 1)
+        At++;
+    }
+  }
+  return true;
+}
+
+// Open and completely load the archive file.
+Archive*
+Archive::OpenAndLoad(const sys::Path& File, LLVMContext& C,
+                     std::string* ErrorMessage) {
+  OwningPtr<Archive> result ( new Archive(File, C));
+  if (result->mapToMemory(ErrorMessage))
+    return NULL;
+  if (!result->loadArchive(ErrorMessage))
+    return NULL;
+  return result.take();
+}
+
+// Get all the bitcode modules from the archive
+bool
+Archive::getAllModules(std::vector<Module*>& Modules,
+                       std::string* ErrMessage) {
+
+  for (iterator I=begin(), E=end(); I != E; ++I) {
+    if (I->isBitcode()) {
+      std::string FullMemberName = archPath.str() +
+        "(" + I->getPath().str() + ")";
+      MemoryBuffer *Buffer =
+        MemoryBuffer::getMemBufferCopy(StringRef(I->getData(), I->getSize()),
+                                       FullMemberName.c_str());
+      
+      Module *M = ParseBitcodeFile(Buffer, Context, ErrMessage);
+      delete Buffer;
+      if (!M)
+        return true;
+
+      Modules.push_back(M);
+    }
+  }
+  return false;
+}
+
+// Load just the symbol table from the archive file
+bool
+Archive::loadSymbolTable(std::string* ErrorMsg) {
+
+  // Set up parsing
+  members.clear();
+  symTab.clear();
+  const char *At = base;
+  const char *End = mapfile->getBufferEnd();
+
+  // Make sure we're dealing with an archive
+  if (!checkSignature(ErrorMsg))
+    return false;
+
+  At += 8; // Skip signature
+
+  // Parse the first file member header
+  const char* FirstFile = At;
+  ArchiveMember* mbr = parseMemberHeader(At, End, ErrorMsg);
+  if (!mbr)
+    return false;
+
+  if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
+    // Skip the foreign symbol table, we don't do anything with it
+    At += mbr->getSize();
+    if ((intptr_t(At) & 1) == 1)
+      At++;
+    delete mbr;
+
+    // Read the next one
+    FirstFile = At;
+    mbr = parseMemberHeader(At, End, ErrorMsg);
+    if (!mbr) {
+      delete mbr;
+      return false;
+    }
+  }
+
+  if (mbr->isStringTable()) {
+    // Process the string table entry
+    strtab.assign((const char*)mbr->getData(), mbr->getSize());
+    At += mbr->getSize();
+    if ((intptr_t(At) & 1) == 1)
+      At++;
+    delete mbr;
+    // Get the next one
+    FirstFile = At;
+    mbr = parseMemberHeader(At, End, ErrorMsg);
+    if (!mbr) {
+      delete mbr;
+      return false;
+    }
+  }
+
+  // See if its the symbol table
+  if (mbr->isLLVMSymbolTable()) {
+    if (!parseSymbolTable(mbr->getData(), mbr->getSize(), ErrorMsg)) {
+      delete mbr;
+      return false;
+    }
+
+    At += mbr->getSize();
+    if ((intptr_t(At) & 1) == 1)
+      At++;
+    delete mbr;
+    // Can't be any more symtab headers so just advance
+    FirstFile = At;
+  } else {
+    // There's no symbol table in the file. We have to rebuild it from scratch
+    // because the intent of this method is to get the symbol table loaded so
+    // it can be searched efficiently.
+    // Add the member to the members list
+    members.push_back(mbr);
+  }
+
+  firstFileOffset = FirstFile - base;
+  return true;
+}
+
+// Open the archive and load just the symbol tables
+Archive* Archive::OpenAndLoadSymbols(const sys::Path& File,
+                                     LLVMContext& C,
+                                     std::string* ErrorMessage) {
+  OwningPtr<Archive> result ( new Archive(File, C) );
+  if (result->mapToMemory(ErrorMessage))
+    return NULL;
+  if (!result->loadSymbolTable(ErrorMessage))
+    return NULL;
+  return result.take();
+}
+
+// Look up one symbol in the symbol table and return the module that defines
+// that symbol.
+Module*
+Archive::findModuleDefiningSymbol(const std::string& symbol, 
+                                  std::string* ErrMsg) {
+  SymTabType::iterator SI = symTab.find(symbol);
+  if (SI == symTab.end())
+    return 0;
+
+  // The symbol table was previously constructed assuming that the members were
+  // written without the symbol table header. Because VBR encoding is used, the
+  // values could not be adjusted to account for the offset of the symbol table
+  // because that could affect the size of the symbol table due to VBR encoding.
+  // We now have to account for this by adjusting the offset by the size of the
+  // symbol table and its header.
+  unsigned fileOffset =
+    SI->second +                // offset in symbol-table-less file
+    firstFileOffset;            // add offset to first "real" file in archive
+
+  // See if the module is already loaded
+  ModuleMap::iterator MI = modules.find(fileOffset);
+  if (MI != modules.end())
+    return MI->second.first;
+
+  // Module hasn't been loaded yet, we need to load it
+  const char* modptr = base + fileOffset;
+  ArchiveMember* mbr = parseMemberHeader(modptr, mapfile->getBufferEnd(),
+                                         ErrMsg);
+  if (!mbr)
+    return 0;
+
+  // Now, load the bitcode module to get the Module.
+  std::string FullMemberName = archPath.str() + "(" +
+    mbr->getPath().str() + ")";
+  MemoryBuffer *Buffer =
+    MemoryBuffer::getMemBufferCopy(StringRef(mbr->getData(), mbr->getSize()),
+                                   FullMemberName.c_str());
+  
+  Module *m = getLazyBitcodeModule(Buffer, Context, ErrMsg);
+  if (!m)
+    return 0;
+
+  modules.insert(std::make_pair(fileOffset, std::make_pair(m, mbr)));
+
+  return m;
+}
+
+// Look up multiple symbols in the symbol table and return a set of
+// Modules that define those symbols.
+bool
+Archive::findModulesDefiningSymbols(std::set<std::string>& symbols,
+                                    SmallVectorImpl<Module*>& result,
+                                    std::string* error) {
+  if (!mapfile || !base) {
+    if (error)
+      *error = "Empty archive invalid for finding modules defining symbols";
+    return false;
+  }
+
+  if (symTab.empty()) {
+    // We don't have a symbol table, so we must build it now but lets also
+    // make sure that we populate the modules table as we do this to ensure
+    // that we don't load them twice when findModuleDefiningSymbol is called
+    // below.
+
+    // Get a pointer to the first file
+    const char* At  = base + firstFileOffset;
+    const char* End = mapfile->getBufferEnd();
+
+    while ( At < End) {
+      // Compute the offset to be put in the symbol table
+      unsigned offset = At - base - firstFileOffset;
+
+      // Parse the file's header
+      ArchiveMember* mbr = parseMemberHeader(At, End, error);
+      if (!mbr)
+        return false;
+
+      // If it contains symbols
+      if (mbr->isBitcode()) {
+        // Get the symbols
+        std::vector<std::string> symbols;
+        std::string FullMemberName = archPath.str() + "(" +
+          mbr->getPath().str() + ")";
+        Module* M = 
+          GetBitcodeSymbols(At, mbr->getSize(), FullMemberName, Context,
+                            symbols, error);
+
+        if (M) {
+          // Insert the module's symbols into the symbol table
+          for (std::vector<std::string>::iterator I = symbols.begin(),
+               E=symbols.end(); I != E; ++I ) {
+            symTab.insert(std::make_pair(*I, offset));
+          }
+          // Insert the Module and the ArchiveMember into the table of
+          // modules.
+          modules.insert(std::make_pair(offset, std::make_pair(M, mbr)));
+        } else {
+          if (error)
+            *error = "Can't parse bitcode member: " + 
+              mbr->getPath().str() + ": " + *error;
+          delete mbr;
+          return false;
+        }
+      }
+
+      // Go to the next file location
+      At += mbr->getSize();
+      if ((intptr_t(At) & 1) == 1)
+        At++;
+    }
+  }
+
+  // At this point we have a valid symbol table (one way or another) so we
+  // just use it to quickly find the symbols requested.
+
+  SmallPtrSet<Module*, 16> Added;
+  for (std::set<std::string>::iterator I=symbols.begin(),
+         Next = I,
+         E=symbols.end(); I != E; I = Next) {
+    // Increment Next before we invalidate it.
+    ++Next;
+
+    // See if this symbol exists
+    Module* m = findModuleDefiningSymbol(*I,error);
+    if (!m)
+      continue;
+    bool NewMember = Added.insert(m);
+    if (!NewMember)
+      continue;
+
+    // The symbol exists, insert the Module into our result.
+    result.push_back(m);
+
+    // Remove the symbol now that its been resolved.
+    symbols.erase(I);
+  }
+  return true;
+}
+
+bool Archive::isBitcodeArchive() {
+  // Make sure the symTab has been loaded. In most cases this should have been
+  // done when the archive was constructed, but still,  this is just in case.
+  if (symTab.empty())
+    if (!loadSymbolTable(0))
+      return false;
+
+  // Now that we know it's been loaded, return true
+  // if it has a size
+  if (symTab.size()) return true;
+
+  // We still can't be sure it isn't a bitcode archive
+  if (!loadArchive(0))
+    return false;
+
+  std::vector<Module *> Modules;
+  std::string ErrorMessage;
+
+  // Scan the archive, trying to load a bitcode member.  We only load one to
+  // see if this works.
+  for (iterator I = begin(), E = end(); I != E; ++I) {
+    if (!I->isBitcode())
+      continue;
+    
+    std::string FullMemberName = 
+      archPath.str() + "(" + I->getPath().str() + ")";
+
+    MemoryBuffer *Buffer =
+      MemoryBuffer::getMemBufferCopy(StringRef(I->getData(), I->getSize()),
+                                     FullMemberName.c_str());
+    Module *M = ParseBitcodeFile(Buffer, Context);
+    delete Buffer;
+    if (!M)
+      return false;  // Couldn't parse bitcode, not a bitcode archive.
+    delete M;
+    return true;
+  }
+  
+  return false;
+}
diff --git a/contrib/llvm/lib/Archive/ArchiveWriter.cpp b/contrib/llvm/lib/Archive/ArchiveWriter.cpp
new file mode 100644
index 000000000000..3eba701c9535
--- /dev/null
+++ b/contrib/llvm/lib/Archive/ArchiveWriter.cpp
@@ -0,0 +1,489 @@
+//===-- ArchiveWriter.cpp - Write LLVM archive files ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Builds up an LLVM archive file (.a) containing LLVM bitcode.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/Archive.h"
+#include "ArchiveInternals.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/system_error.h"
+#include <fstream>
+#include <iomanip>
+#include <ostream>
+using namespace llvm;
+
+// Write an integer using variable bit rate encoding. This saves a few bytes
+// per entry in the symbol table.
+static inline void writeInteger(unsigned num, std::ofstream& ARFile) {
+  while (1) {
+    if (num < 0x80) { // done?
+      ARFile << (unsigned char)num;
+      return;
+    }
+
+    // Nope, we are bigger than a character, output the next 7 bits and set the
+    // high bit to say that there is more coming...
+    ARFile << (unsigned char)(0x80 | ((unsigned char)num & 0x7F));
+    num >>= 7;  // Shift out 7 bits now...
+  }
+}
+
+// Compute how many bytes are taken by a given VBR encoded value. This is needed
+// to pre-compute the size of the symbol table.
+static inline unsigned numVbrBytes(unsigned num) {
+
+  // Note that the following nested ifs are somewhat equivalent to a binary
+  // search. We split it in half by comparing against 2^14 first. This allows
+  // most reasonable values to be done in 2 comparisons instead of 1 for
+  // small ones and four for large ones. We expect this to access file offsets
+  // in the 2^10 to 2^24 range and symbol lengths in the 2^0 to 2^8 range,
+  // so this approach is reasonable.
+  if (num < 1<<14) {
+    if (num < 1<<7)
+      return 1;
+    else
+      return 2;
+  }
+  if (num < 1<<21)
+    return 3;
+
+  if (num < 1<<28)
+    return 4;
+  return 5; // anything >= 2^28 takes 5 bytes
+}
+
+// Create an empty archive.
+Archive* Archive::CreateEmpty(const sys::Path& FilePath, LLVMContext& C) {
+  Archive* result = new Archive(FilePath, C);
+  return result;
+}
+
+// Fill the ArchiveMemberHeader with the information from a member. If
+// TruncateNames is true, names are flattened to 15 chars or less. The sz field
+// is provided here instead of coming from the mbr because the member might be
+// stored compressed and the compressed size is not the ArchiveMember's size.
+// Furthermore compressed files have negative size fields to identify them as
+// compressed.
+bool
+Archive::fillHeader(const ArchiveMember &mbr, ArchiveMemberHeader& hdr,
+                    int sz, bool TruncateNames) const {
+
+  // Set the permissions mode, uid and gid
+  hdr.init();
+  char buffer[32];
+  sprintf(buffer, "%-8o", mbr.getMode());
+  memcpy(hdr.mode,buffer,8);
+  sprintf(buffer,  "%-6u", mbr.getUser());
+  memcpy(hdr.uid,buffer,6);
+  sprintf(buffer,  "%-6u", mbr.getGroup());
+  memcpy(hdr.gid,buffer,6);
+
+  // Set the last modification date
+  uint64_t secondsSinceEpoch = mbr.getModTime().toEpochTime();
+  sprintf(buffer,"%-12u", unsigned(secondsSinceEpoch));
+  memcpy(hdr.date,buffer,12);
+
+  // Get rid of trailing blanks in the name
+  std::string mbrPath = mbr.getPath().str();
+  size_t mbrLen = mbrPath.length();
+  while (mbrLen > 0 && mbrPath[mbrLen-1] == ' ') {
+    mbrPath.erase(mbrLen-1,1);
+    mbrLen--;
+  }
+
+  // Set the name field in one of its various flavors.
+  bool writeLongName = false;
+  if (mbr.isStringTable()) {
+    memcpy(hdr.name,ARFILE_STRTAB_NAME,16);
+  } else if (mbr.isSVR4SymbolTable()) {
+    memcpy(hdr.name,ARFILE_SVR4_SYMTAB_NAME,16);
+  } else if (mbr.isBSD4SymbolTable()) {
+    memcpy(hdr.name,ARFILE_BSD4_SYMTAB_NAME,16);
+  } else if (mbr.isLLVMSymbolTable()) {
+    memcpy(hdr.name,ARFILE_LLVM_SYMTAB_NAME,16);
+  } else if (TruncateNames) {
+    const char* nm = mbrPath.c_str();
+    unsigned len = mbrPath.length();
+    size_t slashpos = mbrPath.rfind('/');
+    if (slashpos != std::string::npos) {
+      nm += slashpos + 1;
+      len -= slashpos +1;
+    }
+    if (len > 15)
+      len = 15;
+    memcpy(hdr.name,nm,len);
+    hdr.name[len] = '/';
+  } else if (mbrPath.length() < 16 && mbrPath.find('/') == std::string::npos) {
+    memcpy(hdr.name,mbrPath.c_str(),mbrPath.length());
+    hdr.name[mbrPath.length()] = '/';
+  } else {
+    std::string nm = "#1/";
+    nm += utostr(mbrPath.length());
+    memcpy(hdr.name,nm.data(),nm.length());
+    if (sz < 0)
+      sz -= mbrPath.length();
+    else
+      sz += mbrPath.length();
+    writeLongName = true;
+  }
+
+  // Set the size field
+  if (sz < 0) {
+    buffer[0] = '-';
+    sprintf(&buffer[1],"%-9u",(unsigned)-sz);
+  } else {
+    sprintf(buffer, "%-10u", (unsigned)sz);
+  }
+  memcpy(hdr.size,buffer,10);
+
+  return writeLongName;
+}
+
+// Insert a file into the archive before some other member. This also takes care
+// of extracting the necessary flags and information from the file.
+bool
+Archive::addFileBefore(const sys::Path& filePath, iterator where,
+                        std::string* ErrMsg) {
+  bool Exists;
+  if (sys::fs::exists(filePath.str(), Exists) || !Exists) {
+    if (ErrMsg)
+      *ErrMsg = "Can not add a non-existent file to archive";
+    return true;
+  }
+
+  ArchiveMember* mbr = new ArchiveMember(this);
+
+  mbr->data = 0;
+  mbr->path = filePath;
+  const sys::FileStatus *FSInfo = mbr->path.getFileStatus(false, ErrMsg);
+  if (!FSInfo) {
+    delete mbr;
+    return true;
+  }
+  mbr->info = *FSInfo;
+
+  unsigned flags = 0;
+  bool hasSlash = filePath.str().find('/') != std::string::npos;
+  if (hasSlash)
+    flags |= ArchiveMember::HasPathFlag;
+  if (hasSlash || filePath.str().length() > 15)
+    flags |= ArchiveMember::HasLongFilenameFlag;
+
+  sys::fs::file_magic type;
+  if (sys::fs::identify_magic(mbr->path.str(), type))
+    type = sys::fs::file_magic::unknown;
+  switch (type) {
+    case sys::fs::file_magic::bitcode:
+      flags |= ArchiveMember::BitcodeFlag;
+      break;
+    default:
+      break;
+  }
+  mbr->flags = flags;
+  members.insert(where,mbr);
+  return false;
+}
+
+// Write one member out to the file.
+bool
+Archive::writeMember(
+  const ArchiveMember& member,
+  std::ofstream& ARFile,
+  bool CreateSymbolTable,
+  bool TruncateNames,
+  std::string* ErrMsg
+) {
+
+  unsigned filepos = ARFile.tellp();
+  filepos -= 8;
+
+  // Get the data and its size either from the
+  // member's in-memory data or directly from the file.
+  size_t fSize = member.getSize();
+  const char *data = (const char*)member.getData();
+  MemoryBuffer *mFile = 0;
+  if (!data) {
+    OwningPtr<MemoryBuffer> File;
+    if (error_code ec = MemoryBuffer::getFile(member.getPath().c_str(), File)) {
+      if (ErrMsg)
+        *ErrMsg = ec.message();
+      return true;
+    }
+    mFile = File.take();
+    data = mFile->getBufferStart();
+    fSize = mFile->getBufferSize();
+  }
+
+  // Now that we have the data in memory, update the
+  // symbol table if it's a bitcode file.
+  if (CreateSymbolTable && member.isBitcode()) {
+    std::vector<std::string> symbols;
+    std::string FullMemberName = archPath.str() + "(" + member.getPath().str()
+      + ")";
+    Module* M =
+      GetBitcodeSymbols(data, fSize, FullMemberName, Context, symbols, ErrMsg);
+
+    // If the bitcode parsed successfully
+    if ( M ) {
+      for (std::vector<std::string>::iterator SI = symbols.begin(),
+           SE = symbols.end(); SI != SE; ++SI) {
+
+        std::pair<SymTabType::iterator,bool> Res =
+          symTab.insert(std::make_pair(*SI,filepos));
+
+        if (Res.second) {
+          symTabSize += SI->length() +
+                        numVbrBytes(SI->length()) +
+                        numVbrBytes(filepos);
+        }
+      }
+      // We don't need this module any more.
+      delete M;
+    } else {
+      delete mFile;
+      if (ErrMsg)
+        *ErrMsg = "Can't parse bitcode member: " + member.getPath().str()
+          + ": " + *ErrMsg;
+      return true;
+    }
+  }
+
+  int hdrSize = fSize;
+
+  // Compute the fields of the header
+  ArchiveMemberHeader Hdr;
+  bool writeLongName = fillHeader(member,Hdr,hdrSize,TruncateNames);
+
+  // Write header to archive file
+  ARFile.write((char*)&Hdr, sizeof(Hdr));
+
+  // Write the long filename if its long
+  if (writeLongName) {
+    ARFile.write(member.getPath().str().data(),
+                 member.getPath().str().length());
+  }
+
+  // Write the (possibly compressed) member's content to the file.
+  ARFile.write(data,fSize);
+
+  // Make sure the member is an even length
+  if ((ARFile.tellp() & 1) == 1)
+    ARFile << ARFILE_PAD;
+
+  // Close the mapped file if it was opened
+  delete mFile;
+  return false;
+}
+
+// Write out the LLVM symbol table as an archive member to the file.
+void
+Archive::writeSymbolTable(std::ofstream& ARFile) {
+
+  // Construct the symbol table's header
+  ArchiveMemberHeader Hdr;
+  Hdr.init();
+  memcpy(Hdr.name,ARFILE_LLVM_SYMTAB_NAME,16);
+  uint64_t secondsSinceEpoch = sys::TimeValue::now().toEpochTime();
+  char buffer[32];
+  sprintf(buffer, "%-8o", 0644);
+  memcpy(Hdr.mode,buffer,8);
+  sprintf(buffer, "%-6u", sys::Process::GetCurrentUserId());
+  memcpy(Hdr.uid,buffer,6);
+  sprintf(buffer, "%-6u", sys::Process::GetCurrentGroupId());
+  memcpy(Hdr.gid,buffer,6);
+  sprintf(buffer,"%-12u", unsigned(secondsSinceEpoch));
+  memcpy(Hdr.date,buffer,12);
+  sprintf(buffer,"%-10u",symTabSize);
+  memcpy(Hdr.size,buffer,10);
+
+  // Write the header
+  ARFile.write((char*)&Hdr, sizeof(Hdr));
+
+#ifndef NDEBUG
+  // Save the starting position of the symbol tables data content.
+  unsigned startpos = ARFile.tellp();
+#endif
+
+  // Write out the symbols sequentially
+  for ( Archive::SymTabType::iterator I = symTab.begin(), E = symTab.end();
+        I != E; ++I)
+  {
+    // Write out the file index
+    writeInteger(I->second, ARFile);
+    // Write out the length of the symbol
+    writeInteger(I->first.length(), ARFile);
+    // Write out the symbol
+    ARFile.write(I->first.data(), I->first.length());
+  }
+
+#ifndef NDEBUG
+  // Now that we're done with the symbol table, get the ending file position
+  unsigned endpos = ARFile.tellp();
+#endif
+
+  // Make sure that the amount we wrote is what we pre-computed. This is
+  // critical for file integrity purposes.
+  assert(endpos - startpos == symTabSize && "Invalid symTabSize computation");
+
+  // Make sure the symbol table is even sized
+  if (symTabSize % 2 != 0 )
+    ARFile << ARFILE_PAD;
+}
+
+// Write the entire archive to the file specified when the archive was created.
+// This writes to a temporary file first. Options are for creating a symbol
+// table, flattening the file names (no directories, 15 chars max) and
+// compressing each archive member.
+bool
+Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames,
+                     std::string* ErrMsg)
+{
+  // Make sure they haven't opened up the file, not loaded it,
+  // but are now trying to write it which would wipe out the file.
+  if (members.empty() && mapfile && mapfile->getBufferSize() > 8) {
+    if (ErrMsg)
+      *ErrMsg = "Can't write an archive not opened for writing";
+    return true;
+  }
+
+  // Create a temporary file to store the archive in
+  sys::Path TmpArchive = archPath;
+  if (TmpArchive.createTemporaryFileOnDisk(ErrMsg))
+    return true;
+
+  // Make sure the temporary gets removed if we crash
+  sys::RemoveFileOnSignal(TmpArchive);
+
+  // Create archive file for output.
+  std::ios::openmode io_mode = std::ios::out | std::ios::trunc |
+                               std::ios::binary;
+  std::ofstream ArchiveFile(TmpArchive.c_str(), io_mode);
+
+  // Check for errors opening or creating archive file.
+  if (!ArchiveFile.is_open() || ArchiveFile.bad()) {
+    TmpArchive.eraseFromDisk();
+    if (ErrMsg)
+      *ErrMsg = "Error opening archive file: " + archPath.str();
+    return true;
+  }
+
+  // If we're creating a symbol table, reset it now
+  if (CreateSymbolTable) {
+    symTabSize = 0;
+    symTab.clear();
+  }
+
+  // Write magic string to archive.
+  ArchiveFile << ARFILE_MAGIC;
+
+  // Loop over all member files, and write them out. Note that this also
+  // builds the symbol table, symTab.
+  for (MembersList::iterator I = begin(), E = end(); I != E; ++I) {
+    if (writeMember(*I, ArchiveFile, CreateSymbolTable,
+                     TruncateNames, ErrMsg)) {
+      TmpArchive.eraseFromDisk();
+      ArchiveFile.close();
+      return true;
+    }
+  }
+
+  // Close archive file.
+  ArchiveFile.close();
+
+  // Write the symbol table
+  if (CreateSymbolTable) {
+    // At this point we have written a file that is a legal archive but it
+    // doesn't have a symbol table in it. To aid in faster reading and to
+    // ensure compatibility with other archivers we need to put the symbol
+    // table first in the file. Unfortunately, this means mapping the file
+    // we just wrote back in and copying it to the destination file.
+    sys::Path FinalFilePath = archPath;
+
+    // Map in the archive we just wrote.
+    {
+    OwningPtr<MemoryBuffer> arch;
+    if (error_code ec = MemoryBuffer::getFile(TmpArchive.c_str(), arch)) {
+      if (ErrMsg)
+        *ErrMsg = ec.message();
+      return true;
+    }
+    const char* base = arch->getBufferStart();
+
+    // Open another temporary file in order to avoid invalidating the
+    // mmapped data
+    if (FinalFilePath.createTemporaryFileOnDisk(ErrMsg))
+      return true;
+    sys::RemoveFileOnSignal(FinalFilePath);
+
+    std::ofstream FinalFile(FinalFilePath.c_str(), io_mode);
+    if (!FinalFile.is_open() || FinalFile.bad()) {
+      TmpArchive.eraseFromDisk();
+      if (ErrMsg)
+        *ErrMsg = "Error opening archive file: " + FinalFilePath.str();
+      return true;
+    }
+
+    // Write the file magic number
+    FinalFile << ARFILE_MAGIC;
+
+    // If there is a foreign symbol table, put it into the file now. Most
+    // ar(1) implementations require the symbol table to be first but llvm-ar
+    // can deal with it being after a foreign symbol table. This ensures
+    // compatibility with other ar(1) implementations as well as allowing the
+    // archive to store both native .o and LLVM .bc files, both indexed.
+    if (foreignST) {
+      if (writeMember(*foreignST, FinalFile, false, false, ErrMsg)) {
+        FinalFile.close();
+        TmpArchive.eraseFromDisk();
+        return true;
+      }
+    }
+
+    // Put out the LLVM symbol table now.
+    writeSymbolTable(FinalFile);
+
+    // Copy the temporary file contents being sure to skip the file's magic
+    // number.
+    FinalFile.write(base + sizeof(ARFILE_MAGIC)-1,
+      arch->getBufferSize()-sizeof(ARFILE_MAGIC)+1);
+
+    // Close up shop
+    FinalFile.close();
+    } // free arch.
+
+    // Move the final file over top of TmpArchive
+    if (FinalFilePath.renamePathOnDisk(TmpArchive, ErrMsg))
+      return true;
+  }
+
+  // Before we replace the actual archive, we need to forget all the
+  // members, since they point to data in that old archive. We need to do
+  // this because we cannot replace an open file on Windows.
+  cleanUpMemory();
+
+  if (TmpArchive.renamePathOnDisk(archPath, ErrMsg))
+    return true;
+
+  // Set correct read and write permissions after temporary file is moved
+  // to final destination path.
+  if (archPath.makeReadableOnDisk(ErrMsg))
+    return true;
+  if (archPath.makeWriteableOnDisk(ErrMsg))
+    return true;
+
+  return false;
+}
diff --git a/contrib/llvm/lib/AsmParser/LLLexer.cpp b/contrib/llvm/lib/AsmParser/LLLexer.cpp
new file mode 100644
index 000000000000..f46383be7e46
--- /dev/null
+++ b/contrib/llvm/lib/AsmParser/LLLexer.cpp
@@ -0,0 +1,884 @@
+//===- LLLexer.cpp - Lexer for .ll Files ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement the Lexer for .ll files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LLLexer.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Assembly/Parser.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+bool LLLexer::Error(LocTy ErrorLoc, const Twine &Msg) const {
+  ErrorInfo = SM.GetMessage(ErrorLoc, SourceMgr::DK_Error, Msg);
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions.
+//===----------------------------------------------------------------------===//
+
+// atoull - Convert an ascii string of decimal digits into the unsigned long
+// long representation... this does not have to do input error checking,
+// because we know that the input will be matched by a suitable regex...
+//
+uint64_t LLLexer::atoull(const char *Buffer, const char *End) {
+  uint64_t Result = 0;
+  for (; Buffer != End; Buffer++) {
+    uint64_t OldRes = Result;
+    Result *= 10;
+    Result += *Buffer-'0';
+    if (Result < OldRes) {  // Uh, oh, overflow detected!!!
+      Error("constant bigger than 64 bits detected!");
+      return 0;
+    }
+  }
+  return Result;
+}
+
+uint64_t LLLexer::HexIntToVal(const char *Buffer, const char *End) {
+  uint64_t Result = 0;
+  for (; Buffer != End; ++Buffer) {
+    uint64_t OldRes = Result;
+    Result *= 16;
+    Result += hexDigitValue(*Buffer);
+
+    if (Result < OldRes) {   // Uh, oh, overflow detected!!!
+      Error("constant bigger than 64 bits detected!");
+      return 0;
+    }
+  }
+  return Result;
+}
+
+void LLLexer::HexToIntPair(const char *Buffer, const char *End,
+                           uint64_t Pair[2]) {
+  Pair[0] = 0;
+  for (int i=0; i<16; i++, Buffer++) {
+    assert(Buffer != End);
+    Pair[0] *= 16;
+    Pair[0] += hexDigitValue(*Buffer);
+  }
+  Pair[1] = 0;
+  for (int i=0; i<16 && Buffer != End; i++, Buffer++) {
+    Pair[1] *= 16;
+    Pair[1] += hexDigitValue(*Buffer);
+  }
+  if (Buffer != End)
+    Error("constant bigger than 128 bits detected!");
+}
+
+/// FP80HexToIntPair - translate an 80 bit FP80 number (20 hexits) into
+/// { low64, high16 } as usual for an APInt.
+void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End,
+                           uint64_t Pair[2]) {
+  Pair[1] = 0;
+  for (int i=0; i<4 && Buffer != End; i++, Buffer++) {
+    assert(Buffer != End);
+    Pair[1] *= 16;
+    Pair[1] += hexDigitValue(*Buffer);
+  }
+  Pair[0] = 0;
+  for (int i=0; i<16; i++, Buffer++) {
+    Pair[0] *= 16;
+    Pair[0] += hexDigitValue(*Buffer);
+  }
+  if (Buffer != End)
+    Error("constant bigger than 128 bits detected!");
+}
+
+// UnEscapeLexed - Run through the specified buffer and change \xx codes to the
+// appropriate character.
+static void UnEscapeLexed(std::string &Str) {
+  if (Str.empty()) return;
+
+  char *Buffer = &Str[0], *EndBuffer = Buffer+Str.size();
+  char *BOut = Buffer;
+  for (char *BIn = Buffer; BIn != EndBuffer; ) {
+    if (BIn[0] == '\\') {
+      if (BIn < EndBuffer-1 && BIn[1] == '\\') {
+        *BOut++ = '\\'; // Two \ becomes one
+        BIn += 2;
+      } else if (BIn < EndBuffer-2 &&
+                 isxdigit(static_cast<unsigned char>(BIn[1])) &&
+                 isxdigit(static_cast<unsigned char>(BIn[2]))) {
+        *BOut = hexDigitValue(BIn[1]) * 16 + hexDigitValue(BIn[2]);
+        BIn += 3;                           // Skip over handled chars
+        ++BOut;
+      } else {
+        *BOut++ = *BIn++;
+      }
+    } else {
+      *BOut++ = *BIn++;
+    }
+  }
+  Str.resize(BOut-Buffer);
+}
+
+/// isLabelChar - Return true for [-a-zA-Z$._0-9].
+static bool isLabelChar(char C) {
+  return isalnum(static_cast<unsigned char>(C)) || C == '-' || C == '$' ||
+         C == '.' || C == '_';
+}
+
+
+/// isLabelTail - Return true if this pointer points to a valid end of a label.
+static const char *isLabelTail(const char *CurPtr) {
+  while (1) {
+    if (CurPtr[0] == ':') return CurPtr+1;
+    if (!isLabelChar(CurPtr[0])) return 0;
+    ++CurPtr;
+  }
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Lexer definition.
+//===----------------------------------------------------------------------===//
+
+LLLexer::LLLexer(MemoryBuffer *StartBuf, SourceMgr &sm, SMDiagnostic &Err,
+                 LLVMContext &C)
+  : CurBuf(StartBuf), ErrorInfo(Err), SM(sm), Context(C), APFloatVal(0.0) {
+  CurPtr = CurBuf->getBufferStart();
+}
+
+std::string LLLexer::getFilename() const {
+  return CurBuf->getBufferIdentifier();
+}
+
+int LLLexer::getNextChar() {
+  char CurChar = *CurPtr++;
+  switch (CurChar) {
+  default: return (unsigned char)CurChar;
+  case 0:
+    // A nul character in the stream is either the end of the current buffer or
+    // a random nul in the file.  Disambiguate that here.
+    if (CurPtr-1 != CurBuf->getBufferEnd())
+      return 0;  // Just whitespace.
+
+    // Otherwise, return end of file.
+    --CurPtr;  // Another call to lex will return EOF again.
+    return EOF;
+  }
+}
+
+
+lltok::Kind LLLexer::LexToken() {
+  TokStart = CurPtr;
+
+  int CurChar = getNextChar();
+  switch (CurChar) {
+  default:
+    // Handle letters: [a-zA-Z_]
+    if (isalpha(static_cast<unsigned char>(CurChar)) || CurChar == '_')
+      return LexIdentifier();
+
+    return lltok::Error;
+  case EOF: return lltok::Eof;
+  case 0:
+  case ' ':
+  case '\t':
+  case '\n':
+  case '\r':
+    // Ignore whitespace.
+    return LexToken();
+  case '+': return LexPositive();
+  case '@': return LexAt();
+  case '%': return LexPercent();
+  case '"': return LexQuote();
+  case '.':
+    if (const char *Ptr = isLabelTail(CurPtr)) {
+      CurPtr = Ptr;
+      StrVal.assign(TokStart, CurPtr-1);
+      return lltok::LabelStr;
+    }
+    if (CurPtr[0] == '.' && CurPtr[1] == '.') {
+      CurPtr += 2;
+      return lltok::dotdotdot;
+    }
+    return lltok::Error;
+  case '$':
+    if (const char *Ptr = isLabelTail(CurPtr)) {
+      CurPtr = Ptr;
+      StrVal.assign(TokStart, CurPtr-1);
+      return lltok::LabelStr;
+    }
+    return lltok::Error;
+  case ';':
+    SkipLineComment();
+    return LexToken();
+  case '!': return LexExclaim();
+  case '#': return LexHash();
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+  case '-':
+    return LexDigitOrNegative();
+  case '=': return lltok::equal;
+  case '[': return lltok::lsquare;
+  case ']': return lltok::rsquare;
+  case '{': return lltok::lbrace;
+  case '}': return lltok::rbrace;
+  case '<': return lltok::less;
+  case '>': return lltok::greater;
+  case '(': return lltok::lparen;
+  case ')': return lltok::rparen;
+  case ',': return lltok::comma;
+  case '*': return lltok::star;
+  case '\\': return lltok::backslash;
+  }
+}
+
+void LLLexer::SkipLineComment() {
+  while (1) {
+    if (CurPtr[0] == '\n' || CurPtr[0] == '\r' || getNextChar() == EOF)
+      return;
+  }
+}
+
+/// LexAt - Lex all tokens that start with an @ character:
+///   GlobalVar   @\"[^\"]*\"
+///   GlobalVar   @[-a-zA-Z$._][-a-zA-Z$._0-9]*
+///   GlobalVarID @[0-9]+
+lltok::Kind LLLexer::LexAt() {
+  // Handle AtStringConstant: @\"[^\"]*\"
+  if (CurPtr[0] == '"') {
+    ++CurPtr;
+
+    while (1) {
+      int CurChar = getNextChar();
+
+      if (CurChar == EOF) {
+        Error("end of file in global variable name");
+        return lltok::Error;
+      }
+      if (CurChar == '"') {
+        StrVal.assign(TokStart+2, CurPtr-1);
+        UnEscapeLexed(StrVal);
+        return lltok::GlobalVar;
+      }
+    }
+  }
+
+  // Handle GlobalVarName: @[-a-zA-Z$._][-a-zA-Z$._0-9]*
+  if (ReadVarName())
+    return lltok::GlobalVar;
+
+  // Handle GlobalVarID: @[0-9]+
+  if (isdigit(static_cast<unsigned char>(CurPtr[0]))) {
+    for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+      /*empty*/;
+
+    uint64_t Val = atoull(TokStart+1, CurPtr);
+    if ((unsigned)Val != Val)
+      Error("invalid value number (too large)!");
+    UIntVal = unsigned(Val);
+    return lltok::GlobalID;
+  }
+
+  return lltok::Error;
+}
+
+/// ReadString - Read a string until the closing quote.
+lltok::Kind LLLexer::ReadString(lltok::Kind kind) {
+  const char *Start = CurPtr;
+  while (1) {
+    int CurChar = getNextChar();
+
+    if (CurChar == EOF) {
+      Error("end of file in string constant");
+      return lltok::Error;
+    }
+    if (CurChar == '"') {
+      StrVal.assign(Start, CurPtr-1);
+      UnEscapeLexed(StrVal);
+      return kind;
+    }
+  }
+}
+
+/// ReadVarName - Read the rest of a token containing a variable name.
+bool LLLexer::ReadVarName() {
+  const char *NameStart = CurPtr;
+  if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
+      CurPtr[0] == '-' || CurPtr[0] == '$' ||
+      CurPtr[0] == '.' || CurPtr[0] == '_') {
+    ++CurPtr;
+    while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
+           CurPtr[0] == '-' || CurPtr[0] == '$' ||
+           CurPtr[0] == '.' || CurPtr[0] == '_')
+      ++CurPtr;
+
+    StrVal.assign(NameStart, CurPtr);
+    return true;
+  }
+  return false;
+}
+
+/// LexPercent - Lex all tokens that start with a % character:
+///   LocalVar   ::= %\"[^\"]*\"
+///   LocalVar   ::= %[-a-zA-Z$._][-a-zA-Z$._0-9]*
+///   LocalVarID ::= %[0-9]+
+lltok::Kind LLLexer::LexPercent() {
+  // Handle LocalVarName: %\"[^\"]*\"
+  if (CurPtr[0] == '"') {
+    ++CurPtr;
+    return ReadString(lltok::LocalVar);
+  }
+
+  // Handle LocalVarName: %[-a-zA-Z$._][-a-zA-Z$._0-9]*
+  if (ReadVarName())
+    return lltok::LocalVar;
+
+  // Handle LocalVarID: %[0-9]+
+  if (isdigit(static_cast<unsigned char>(CurPtr[0]))) {
+    for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+      /*empty*/;
+
+    uint64_t Val = atoull(TokStart+1, CurPtr);
+    if ((unsigned)Val != Val)
+      Error("invalid value number (too large)!");
+    UIntVal = unsigned(Val);
+    return lltok::LocalVarID;
+  }
+
+  return lltok::Error;
+}
+
+/// LexQuote - Lex all tokens that start with a " character:
+///   QuoteLabel        "[^"]+":
+///   StringConstant    "[^"]*"
+lltok::Kind LLLexer::LexQuote() {
+  lltok::Kind kind = ReadString(lltok::StringConstant);
+  if (kind == lltok::Error || kind == lltok::Eof)
+    return kind;
+
+  if (CurPtr[0] == ':') {
+    ++CurPtr;
+    kind = lltok::LabelStr;
+  }
+
+  return kind;
+}
+
+/// LexExclaim:
+///    !foo
+///    !
+lltok::Kind LLLexer::LexExclaim() {
+  // Lex a metadata name as a MetadataVar.
+  if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
+      CurPtr[0] == '-' || CurPtr[0] == '$' ||
+      CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') {
+    ++CurPtr;
+    while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
+           CurPtr[0] == '-' || CurPtr[0] == '$' ||
+           CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\')
+      ++CurPtr;
+
+    StrVal.assign(TokStart+1, CurPtr);   // Skip !
+    UnEscapeLexed(StrVal);
+    return lltok::MetadataVar;
+  }
+  return lltok::exclaim;
+}
+
+/// LexHash - Lex all tokens that start with a # character:
+///    AttrGrpID ::= #[0-9]+
+lltok::Kind LLLexer::LexHash() {
+  // Handle AttrGrpID: #[0-9]+
+  if (isdigit(static_cast<unsigned char>(CurPtr[0]))) {
+    for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+      /*empty*/;
+
+    uint64_t Val = atoull(TokStart+1, CurPtr);
+    if ((unsigned)Val != Val)
+      Error("invalid value number (too large)!");
+    UIntVal = unsigned(Val);
+    return lltok::AttrGrpID;
+  }
+
+  return lltok::Error;
+}
+
+/// LexIdentifier: Handle several related productions:
+///    Label           [-a-zA-Z$._0-9]+:
+///    IntegerType     i[0-9]+
+///    Keyword         sdiv, float, ...
+///    HexIntConstant  [us]0x[0-9A-Fa-f]+
+lltok::Kind LLLexer::LexIdentifier() {
+  const char *StartChar = CurPtr;
+  const char *IntEnd = CurPtr[-1] == 'i' ? 0 : StartChar;
+  const char *KeywordEnd = 0;
+
+  for (; isLabelChar(*CurPtr); ++CurPtr) {
+    // If we decide this is an integer, remember the end of the sequence.
+    if (!IntEnd && !isdigit(static_cast<unsigned char>(*CurPtr)))
+      IntEnd = CurPtr;
+    if (!KeywordEnd && !isalnum(static_cast<unsigned char>(*CurPtr)) &&
+        *CurPtr != '_')
+      KeywordEnd = CurPtr;
+  }
+
+  // If we stopped due to a colon, this really is a label.
+  if (*CurPtr == ':') {
+    StrVal.assign(StartChar-1, CurPtr++);
+    return lltok::LabelStr;
+  }
+
+  // Otherwise, this wasn't a label.  If this was valid as an integer type,
+  // return it.
+  if (IntEnd == 0) IntEnd = CurPtr;
+  if (IntEnd != StartChar) {
+    CurPtr = IntEnd;
+    uint64_t NumBits = atoull(StartChar, CurPtr);
+    if (NumBits < IntegerType::MIN_INT_BITS ||
+        NumBits > IntegerType::MAX_INT_BITS) {
+      Error("bitwidth for integer type out of range!");
+      return lltok::Error;
+    }
+    TyVal = IntegerType::get(Context, NumBits);
+    return lltok::Type;
+  }
+
+  // Otherwise, this was a letter sequence.  See which keyword this is.
+  if (KeywordEnd == 0) KeywordEnd = CurPtr;
+  CurPtr = KeywordEnd;
+  --StartChar;
+  unsigned Len = CurPtr-StartChar;
+#define KEYWORD(STR)                                                    \
+  do {                                                                  \
+    if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR)))  \
+      return lltok::kw_##STR;                                           \
+  } while (0)
+
+  KEYWORD(true);    KEYWORD(false);
+  KEYWORD(declare); KEYWORD(define);
+  KEYWORD(global);  KEYWORD(constant);
+
+  KEYWORD(private);
+  KEYWORD(linker_private);
+  KEYWORD(linker_private_weak);
+  KEYWORD(linker_private_weak_def_auto); // FIXME: For backwards compatibility.
+  KEYWORD(internal);
+  KEYWORD(available_externally);
+  KEYWORD(linkonce);
+  KEYWORD(linkonce_odr);
+  KEYWORD(linkonce_odr_auto_hide);
+  KEYWORD(weak);
+  KEYWORD(weak_odr);
+  KEYWORD(appending);
+  KEYWORD(dllimport);
+  KEYWORD(dllexport);
+  KEYWORD(common);
+  KEYWORD(default);
+  KEYWORD(hidden);
+  KEYWORD(protected);
+  KEYWORD(unnamed_addr);
+  KEYWORD(externally_initialized);
+  KEYWORD(extern_weak);
+  KEYWORD(external);
+  KEYWORD(thread_local);
+  KEYWORD(localdynamic);
+  KEYWORD(initialexec);
+  KEYWORD(localexec);
+  KEYWORD(zeroinitializer);
+  KEYWORD(undef);
+  KEYWORD(null);
+  KEYWORD(to);
+  KEYWORD(tail);
+  KEYWORD(target);
+  KEYWORD(triple);
+  KEYWORD(unwind);
+  KEYWORD(deplibs);             // FIXME: Remove in 4.0.
+  KEYWORD(datalayout);
+  KEYWORD(volatile);
+  KEYWORD(atomic);
+  KEYWORD(unordered);
+  KEYWORD(monotonic);
+  KEYWORD(acquire);
+  KEYWORD(release);
+  KEYWORD(acq_rel);
+  KEYWORD(seq_cst);
+  KEYWORD(singlethread);
+
+  KEYWORD(nnan);
+  KEYWORD(ninf);
+  KEYWORD(nsz);
+  KEYWORD(arcp);
+  KEYWORD(fast);
+  KEYWORD(nuw);
+  KEYWORD(nsw);
+  KEYWORD(exact);
+  KEYWORD(inbounds);
+  KEYWORD(align);
+  KEYWORD(addrspace);
+  KEYWORD(section);
+  KEYWORD(alias);
+  KEYWORD(module);
+  KEYWORD(asm);
+  KEYWORD(sideeffect);
+  KEYWORD(alignstack);
+  KEYWORD(inteldialect);
+  KEYWORD(gc);
+
+  KEYWORD(ccc);
+  KEYWORD(fastcc);
+  KEYWORD(coldcc);
+  KEYWORD(x86_stdcallcc);
+  KEYWORD(x86_fastcallcc);
+  KEYWORD(x86_thiscallcc);
+  KEYWORD(arm_apcscc);
+  KEYWORD(arm_aapcscc);
+  KEYWORD(arm_aapcs_vfpcc);
+  KEYWORD(msp430_intrcc);
+  KEYWORD(ptx_kernel);
+  KEYWORD(ptx_device);
+  KEYWORD(spir_kernel);
+  KEYWORD(spir_func);
+  KEYWORD(intel_ocl_bicc);
+
+  KEYWORD(cc);
+  KEYWORD(c);
+
+  KEYWORD(attributes);
+
+  KEYWORD(alwaysinline);
+  KEYWORD(byval);
+  KEYWORD(inlinehint);
+  KEYWORD(inreg);
+  KEYWORD(minsize);
+  KEYWORD(naked);
+  KEYWORD(nest);
+  KEYWORD(noalias);
+  KEYWORD(nobuiltin);
+  KEYWORD(nocapture);
+  KEYWORD(noduplicate);
+  KEYWORD(noimplicitfloat);
+  KEYWORD(noinline);
+  KEYWORD(nonlazybind);
+  KEYWORD(noredzone);
+  KEYWORD(noreturn);
+  KEYWORD(nounwind);
+  KEYWORD(optsize);
+  KEYWORD(readnone);
+  KEYWORD(readonly);
+  KEYWORD(returns_twice);
+  KEYWORD(signext);
+  KEYWORD(sret);
+  KEYWORD(ssp);
+  KEYWORD(sspreq);
+  KEYWORD(sspstrong);
+  KEYWORD(sanitize_address);
+  KEYWORD(sanitize_thread);
+  KEYWORD(sanitize_memory);
+  KEYWORD(uwtable);
+  KEYWORD(zeroext);
+
+  KEYWORD(type);
+  KEYWORD(opaque);
+
+  KEYWORD(eq); KEYWORD(ne); KEYWORD(slt); KEYWORD(sgt); KEYWORD(sle);
+  KEYWORD(sge); KEYWORD(ult); KEYWORD(ugt); KEYWORD(ule); KEYWORD(uge);
+  KEYWORD(oeq); KEYWORD(one); KEYWORD(olt); KEYWORD(ogt); KEYWORD(ole);
+  KEYWORD(oge); KEYWORD(ord); KEYWORD(uno); KEYWORD(ueq); KEYWORD(une);
+
+  KEYWORD(xchg); KEYWORD(nand); KEYWORD(max); KEYWORD(min); KEYWORD(umax);
+  KEYWORD(umin);
+
+  KEYWORD(x);
+  KEYWORD(blockaddress);
+
+  KEYWORD(personality);
+  KEYWORD(cleanup);
+  KEYWORD(catch);
+  KEYWORD(filter);
+#undef KEYWORD
+
+  // Keywords for types.
+#define TYPEKEYWORD(STR, LLVMTY) \
+  if (Len == strlen(STR) && !memcmp(StartChar, STR, strlen(STR))) { \
+    TyVal = LLVMTY; return lltok::Type; }
+  TYPEKEYWORD("void",      Type::getVoidTy(Context));
+  TYPEKEYWORD("half",      Type::getHalfTy(Context));
+  TYPEKEYWORD("float",     Type::getFloatTy(Context));
+  TYPEKEYWORD("double",    Type::getDoubleTy(Context));
+  TYPEKEYWORD("x86_fp80",  Type::getX86_FP80Ty(Context));
+  TYPEKEYWORD("fp128",     Type::getFP128Ty(Context));
+  TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context));
+  TYPEKEYWORD("label",     Type::getLabelTy(Context));
+  TYPEKEYWORD("metadata",  Type::getMetadataTy(Context));
+  TYPEKEYWORD("x86_mmx",   Type::getX86_MMXTy(Context));
+#undef TYPEKEYWORD
+
+  // Keywords for instructions.
+#define INSTKEYWORD(STR, Enum) \
+  if (Len == strlen(#STR) && !memcmp(StartChar, #STR, strlen(#STR))) { \
+    UIntVal = Instruction::Enum; return lltok::kw_##STR; }
+
+  INSTKEYWORD(add,   Add);  INSTKEYWORD(fadd,   FAdd);
+  INSTKEYWORD(sub,   Sub);  INSTKEYWORD(fsub,   FSub);
+  INSTKEYWORD(mul,   Mul);  INSTKEYWORD(fmul,   FMul);
+  INSTKEYWORD(udiv,  UDiv); INSTKEYWORD(sdiv,  SDiv); INSTKEYWORD(fdiv,  FDiv);
+  INSTKEYWORD(urem,  URem); INSTKEYWORD(srem,  SRem); INSTKEYWORD(frem,  FRem);
+  INSTKEYWORD(shl,   Shl);  INSTKEYWORD(lshr,  LShr); INSTKEYWORD(ashr,  AShr);
+  INSTKEYWORD(and,   And);  INSTKEYWORD(or,    Or);   INSTKEYWORD(xor,   Xor);
+  INSTKEYWORD(icmp,  ICmp); INSTKEYWORD(fcmp,  FCmp);
+
+  INSTKEYWORD(phi,         PHI);
+  INSTKEYWORD(call,        Call);
+  INSTKEYWORD(trunc,       Trunc);
+  INSTKEYWORD(zext,        ZExt);
+  INSTKEYWORD(sext,        SExt);
+  INSTKEYWORD(fptrunc,     FPTrunc);
+  INSTKEYWORD(fpext,       FPExt);
+  INSTKEYWORD(uitofp,      UIToFP);
+  INSTKEYWORD(sitofp,      SIToFP);
+  INSTKEYWORD(fptoui,      FPToUI);
+  INSTKEYWORD(fptosi,      FPToSI);
+  INSTKEYWORD(inttoptr,    IntToPtr);
+  INSTKEYWORD(ptrtoint,    PtrToInt);
+  INSTKEYWORD(bitcast,     BitCast);
+  INSTKEYWORD(select,      Select);
+  INSTKEYWORD(va_arg,      VAArg);
+  INSTKEYWORD(ret,         Ret);
+  INSTKEYWORD(br,          Br);
+  INSTKEYWORD(switch,      Switch);
+  INSTKEYWORD(indirectbr,  IndirectBr);
+  INSTKEYWORD(invoke,      Invoke);
+  INSTKEYWORD(resume,      Resume);
+  INSTKEYWORD(unreachable, Unreachable);
+
+  INSTKEYWORD(alloca,      Alloca);
+  INSTKEYWORD(load,        Load);
+  INSTKEYWORD(store,       Store);
+  INSTKEYWORD(cmpxchg,     AtomicCmpXchg);
+  INSTKEYWORD(atomicrmw,   AtomicRMW);
+  INSTKEYWORD(fence,       Fence);
+  INSTKEYWORD(getelementptr, GetElementPtr);
+
+  INSTKEYWORD(extractelement, ExtractElement);
+  INSTKEYWORD(insertelement,  InsertElement);
+  INSTKEYWORD(shufflevector,  ShuffleVector);
+  INSTKEYWORD(extractvalue,   ExtractValue);
+  INSTKEYWORD(insertvalue,    InsertValue);
+  INSTKEYWORD(landingpad,     LandingPad);
+#undef INSTKEYWORD
+
+  // Check for [us]0x[0-9A-Fa-f]+ which are Hexadecimal constant generated by
+  // the CFE to avoid forcing it to deal with 64-bit numbers.
+  if ((TokStart[0] == 'u' || TokStart[0] == 's') &&
+      TokStart[1] == '0' && TokStart[2] == 'x' &&
+      isxdigit(static_cast<unsigned char>(TokStart[3]))) {
+    int len = CurPtr-TokStart-3;
+    uint32_t bits = len * 4;
+    APInt Tmp(bits, StringRef(TokStart+3, len), 16);
+    uint32_t activeBits = Tmp.getActiveBits();
+    if (activeBits > 0 && activeBits < bits)
+      Tmp = Tmp.trunc(activeBits);
+    APSIntVal = APSInt(Tmp, TokStart[0] == 'u');
+    return lltok::APSInt;
+  }
+
+  // If this is "cc1234", return this as just "cc".
+  if (TokStart[0] == 'c' && TokStart[1] == 'c') {
+    CurPtr = TokStart+2;
+    return lltok::kw_cc;
+  }
+
+  // Finally, if this isn't known, return an error.
+  CurPtr = TokStart+1;
+  return lltok::Error;
+}
+
+
+/// Lex0x: Handle productions that start with 0x, knowing that it matches and
+/// that this is not a label:
+///    HexFPConstant     0x[0-9A-Fa-f]+
+///    HexFP80Constant   0xK[0-9A-Fa-f]+
+///    HexFP128Constant  0xL[0-9A-Fa-f]+
+///    HexPPC128Constant 0xM[0-9A-Fa-f]+
+///    HexHalfConstant   0xH[0-9A-Fa-f]+
+lltok::Kind LLLexer::Lex0x() {
+  CurPtr = TokStart + 2;
+
+  char Kind;
+  if ((CurPtr[0] >= 'K' && CurPtr[0] <= 'M') || CurPtr[0] == 'H') {
+    Kind = *CurPtr++;
+  } else {
+    Kind = 'J';
+  }
+
+  if (!isxdigit(static_cast<unsigned char>(CurPtr[0]))) {
+    // Bad token, return it as an error.
+    CurPtr = TokStart+1;
+    return lltok::Error;
+  }
+
+  while (isxdigit(static_cast<unsigned char>(CurPtr[0])))
+    ++CurPtr;
+
+  if (Kind == 'J') {
+    // HexFPConstant - Floating point constant represented in IEEE format as a
+    // hexadecimal number for when exponential notation is not precise enough.
+    // Half, Float, and double only.
+    APFloatVal = APFloat(BitsToDouble(HexIntToVal(TokStart+2, CurPtr)));
+    return lltok::APFloat;
+  }
+
+  uint64_t Pair[2];
+  switch (Kind) {
+  default: llvm_unreachable("Unknown kind!");
+  case 'K':
+    // F80HexFPConstant - x87 long double in hexadecimal format (10 bytes)
+    FP80HexToIntPair(TokStart+3, CurPtr, Pair);
+    APFloatVal = APFloat(APFloat::x87DoubleExtended, APInt(80, Pair));
+    return lltok::APFloat;
+  case 'L':
+    // F128HexFPConstant - IEEE 128-bit in hexadecimal format (16 bytes)
+    HexToIntPair(TokStart+3, CurPtr, Pair);
+    APFloatVal = APFloat(APFloat::IEEEquad, APInt(128, Pair));
+    return lltok::APFloat;
+  case 'M':
+    // PPC128HexFPConstant - PowerPC 128-bit in hexadecimal format (16 bytes)
+    HexToIntPair(TokStart+3, CurPtr, Pair);
+    APFloatVal = APFloat(APFloat::PPCDoubleDouble, APInt(128, Pair));
+    return lltok::APFloat;
+  case 'H':
+    APFloatVal = APFloat(APFloat::IEEEhalf,
+                         APInt(16,HexIntToVal(TokStart+3, CurPtr)));
+    return lltok::APFloat;
+  }
+}
+
+/// LexIdentifier: Handle several related productions:
+///    Label             [-a-zA-Z$._0-9]+:
+///    NInteger          -[0-9]+
+///    FPConstant        [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
+///    PInteger          [0-9]+
+///    HexFPConstant     0x[0-9A-Fa-f]+
+///    HexFP80Constant   0xK[0-9A-Fa-f]+
+///    HexFP128Constant  0xL[0-9A-Fa-f]+
+///    HexPPC128Constant 0xM[0-9A-Fa-f]+
+lltok::Kind LLLexer::LexDigitOrNegative() {
+  // If the letter after the negative is not a number, this is probably a label.
+  if (!isdigit(static_cast<unsigned char>(TokStart[0])) &&
+      !isdigit(static_cast<unsigned char>(CurPtr[0]))) {
+    // Okay, this is not a number after the -, it's probably a label.
+    if (const char *End = isLabelTail(CurPtr)) {
+      StrVal.assign(TokStart, End-1);
+      CurPtr = End;
+      return lltok::LabelStr;
+    }
+
+    return lltok::Error;
+  }
+
+  // At this point, it is either a label, int or fp constant.
+
+  // Skip digits, we have at least one.
+  for (; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+    /*empty*/;
+
+  // Check to see if this really is a label afterall, e.g. "-1:".
+  if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') {
+    if (const char *End = isLabelTail(CurPtr)) {
+      StrVal.assign(TokStart, End-1);
+      CurPtr = End;
+      return lltok::LabelStr;
+    }
+  }
+
+  // If the next character is a '.', then it is a fp value, otherwise its
+  // integer.
+  if (CurPtr[0] != '.') {
+    if (TokStart[0] == '0' && TokStart[1] == 'x')
+      return Lex0x();
+    unsigned Len = CurPtr-TokStart;
+    uint32_t numBits = ((Len * 64) / 19) + 2;
+    APInt Tmp(numBits, StringRef(TokStart, Len), 10);
+    if (TokStart[0] == '-') {
+      uint32_t minBits = Tmp.getMinSignedBits();
+      if (minBits > 0 && minBits < numBits)
+        Tmp = Tmp.trunc(minBits);
+      APSIntVal = APSInt(Tmp, false);
+    } else {
+      uint32_t activeBits = Tmp.getActiveBits();
+      if (activeBits > 0 && activeBits < numBits)
+        Tmp = Tmp.trunc(activeBits);
+      APSIntVal = APSInt(Tmp, true);
+    }
+    return lltok::APSInt;
+  }
+
+  ++CurPtr;
+
+  // Skip over [0-9]*([eE][-+]?[0-9]+)?
+  while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+
+  if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
+    if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
+        ((CurPtr[1] == '-' || CurPtr[1] == '+') &&
+          isdigit(static_cast<unsigned char>(CurPtr[2])))) {
+      CurPtr += 2;
+      while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+    }
+  }
+
+  APFloatVal = APFloat(std::atof(TokStart));
+  return lltok::APFloat;
+}
+
+///    FPConstant  [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
+lltok::Kind LLLexer::LexPositive() {
+  // If the letter after the negative is a number, this is probably not a
+  // label.
+  if (!isdigit(static_cast<unsigned char>(CurPtr[0])))
+    return lltok::Error;
+
+  // Skip digits.
+  for (++CurPtr; isdigit(static_cast<unsigned char>(CurPtr[0])); ++CurPtr)
+    /*empty*/;
+
+  // At this point, we need a '.'.
+  if (CurPtr[0] != '.') {
+    CurPtr = TokStart+1;
+    return lltok::Error;
+  }
+
+  ++CurPtr;
+
+  // Skip over [0-9]*([eE][-+]?[0-9]+)?
+  while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+
+  if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
+    if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
+        ((CurPtr[1] == '-' || CurPtr[1] == '+') &&
+        isdigit(static_cast<unsigned char>(CurPtr[2])))) {
+      CurPtr += 2;
+      while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+    }
+  }
+
+  APFloatVal = APFloat(std::atof(TokStart));
+  return lltok::APFloat;
+}
diff --git a/contrib/llvm/lib/AsmParser/LLLexer.h b/contrib/llvm/lib/AsmParser/LLLexer.h
new file mode 100644
index 000000000000..85703c766b09
--- /dev/null
+++ b/contrib/llvm/lib/AsmParser/LLLexer.h
@@ -0,0 +1,93 @@
+//===- LLLexer.h - Lexer for LLVM Assembly Files ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class represents the Lexer for .ll files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIB_ASMPARSER_LLLEXER_H
+#define LIB_ASMPARSER_LLLEXER_H
+
+#include "LLToken.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/Support/SourceMgr.h"
+#include <string>
+
+namespace llvm {
+  class MemoryBuffer;
+  class Type;
+  class SMDiagnostic;
+  class LLVMContext;
+
+  class LLLexer {
+    const char *CurPtr;
+    MemoryBuffer *CurBuf;
+    SMDiagnostic &ErrorInfo;
+    SourceMgr &SM;
+    LLVMContext &Context;
+
+    // Information about the current token.
+    const char *TokStart;
+    lltok::Kind CurKind;
+    std::string StrVal;
+    unsigned UIntVal;
+    Type *TyVal;
+    APFloat APFloatVal;
+    APSInt  APSIntVal;
+
+  public:
+    explicit LLLexer(MemoryBuffer *StartBuf, SourceMgr &SM, SMDiagnostic &,
+                     LLVMContext &C);
+    ~LLLexer() {}
+
+    lltok::Kind Lex() {
+      return CurKind = LexToken();
+    }
+
+    typedef SMLoc LocTy;
+    LocTy getLoc() const { return SMLoc::getFromPointer(TokStart); }
+    lltok::Kind getKind() const { return CurKind; }
+    const std::string &getStrVal() const { return StrVal; }
+    Type *getTyVal() const { return TyVal; }
+    unsigned getUIntVal() const { return UIntVal; }
+    const APSInt &getAPSIntVal() const { return APSIntVal; }
+    const APFloat &getAPFloatVal() const { return APFloatVal; }
+
+
+    bool Error(LocTy L, const Twine &Msg) const;
+    bool Error(const Twine &Msg) const { return Error(getLoc(), Msg); }
+    std::string getFilename() const;
+
+  private:
+    lltok::Kind LexToken();
+
+    int getNextChar();
+    void SkipLineComment();
+    lltok::Kind ReadString(lltok::Kind kind);
+    bool ReadVarName();
+
+    lltok::Kind LexIdentifier();
+    lltok::Kind LexDigitOrNegative();
+    lltok::Kind LexPositive();
+    lltok::Kind LexAt();
+    lltok::Kind LexExclaim();
+    lltok::Kind LexPercent();
+    lltok::Kind LexQuote();
+    lltok::Kind Lex0x();
+    lltok::Kind LexHash();
+
+    uint64_t atoull(const char *Buffer, const char *End);
+    uint64_t HexIntToVal(const char *Buffer, const char *End);
+    void HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
+    void FP80HexToIntPair(const char *Buff, const char *End, uint64_t Pair[2]);
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/AsmParser/LLParser.cpp b/contrib/llvm/lib/AsmParser/LLParser.cpp
new file mode 100644
index 000000000000..c8da1f8bc661
--- /dev/null
+++ b/contrib/llvm/lib/AsmParser/LLParser.cpp
@@ -0,0 +1,4335 @@
+//===-- LLParser.cpp - Parser Class ---------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the parser class for .ll files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LLParser.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/AutoUpgrade.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static std::string getTypeString(Type *T) {
+  std::string Result;
+  raw_string_ostream Tmp(Result);
+  Tmp << *T;
+  return Tmp.str();
+}
+
+/// Run: module ::= toplevelentity*
+bool LLParser::Run() {
+  // Prime the lexer.
+  Lex.Lex();
+
+  return ParseTopLevelEntities() ||
+         ValidateEndOfModule();
+}
+
+/// ValidateEndOfModule - Do final validity and sanity checks at the end of the
+/// module.
+bool LLParser::ValidateEndOfModule() {
+  // Handle any instruction metadata forward references.
+  if (!ForwardRefInstMetadata.empty()) {
+    for (DenseMap<Instruction*, std::vector<MDRef> >::iterator
+         I = ForwardRefInstMetadata.begin(), E = ForwardRefInstMetadata.end();
+         I != E; ++I) {
+      Instruction *Inst = I->first;
+      const std::vector<MDRef> &MDList = I->second;
+
+      for (unsigned i = 0, e = MDList.size(); i != e; ++i) {
+        unsigned SlotNo = MDList[i].MDSlot;
+
+        if (SlotNo >= NumberedMetadata.size() || NumberedMetadata[SlotNo] == 0)
+          return Error(MDList[i].Loc, "use of undefined metadata '!" +
+                       Twine(SlotNo) + "'");
+        Inst->setMetadata(MDList[i].MDKind, NumberedMetadata[SlotNo]);
+      }
+    }
+    ForwardRefInstMetadata.clear();
+  }
+
+  // Handle any function attribute group forward references.
+  for (std::map<Value*, std::vector<unsigned> >::iterator
+         I = ForwardRefAttrGroups.begin(), E = ForwardRefAttrGroups.end();
+         I != E; ++I) {
+    Value *V = I->first;
+    std::vector<unsigned> &Vec = I->second;
+    AttrBuilder B;
+
+    for (std::vector<unsigned>::iterator VI = Vec.begin(), VE = Vec.end();
+         VI != VE; ++VI)
+      B.merge(NumberedAttrBuilders[*VI]);
+
+    if (Function *Fn = dyn_cast<Function>(V)) {
+      AttributeSet AS = Fn->getAttributes();
+      AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
+      AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
+                               AS.getFnAttributes());
+
+      FnAttrs.merge(B);
+
+      // If the alignment was parsed as an attribute, move to the alignment
+      // field.
+      if (FnAttrs.hasAlignmentAttr()) {
+        Fn->setAlignment(FnAttrs.getAlignment());
+        FnAttrs.removeAttribute(Attribute::Alignment);
+      }
+
+      AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
+                            AttributeSet::get(Context,
+                                              AttributeSet::FunctionIndex,
+                                              FnAttrs));
+      Fn->setAttributes(AS);
+    } else if (CallInst *CI = dyn_cast<CallInst>(V)) {
+      AttributeSet AS = CI->getAttributes();
+      AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
+      AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
+                               AS.getFnAttributes());
+      FnAttrs.merge(B);
+      AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
+                            AttributeSet::get(Context,
+                                              AttributeSet::FunctionIndex,
+                                              FnAttrs));
+      CI->setAttributes(AS);
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) {
+      AttributeSet AS = II->getAttributes();
+      AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
+      AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
+                               AS.getFnAttributes());
+      FnAttrs.merge(B);
+      AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
+                            AttributeSet::get(Context,
+                                              AttributeSet::FunctionIndex,
+                                              FnAttrs));
+      II->setAttributes(AS);
+    } else {
+      llvm_unreachable("invalid object with forward attribute group reference");
+    }
+  }
+
+  // If there are entries in ForwardRefBlockAddresses at this point, they are
+  // references after the function was defined.  Resolve those now.
+  while (!ForwardRefBlockAddresses.empty()) {
+    // Okay, we are referencing an already-parsed function, resolve them now.
+    Function *TheFn = 0;
+    const ValID &Fn = ForwardRefBlockAddresses.begin()->first;
+    if (Fn.Kind == ValID::t_GlobalName)
+      TheFn = M->getFunction(Fn.StrVal);
+    else if (Fn.UIntVal < NumberedVals.size())
+      TheFn = dyn_cast<Function>(NumberedVals[Fn.UIntVal]);
+
+    if (TheFn == 0)
+      return Error(Fn.Loc, "unknown function referenced by blockaddress");
+
+    // Resolve all these references.
+    if (ResolveForwardRefBlockAddresses(TheFn,
+                                      ForwardRefBlockAddresses.begin()->second,
+                                        0))
+      return true;
+
+    ForwardRefBlockAddresses.erase(ForwardRefBlockAddresses.begin());
+  }
+
+  for (unsigned i = 0, e = NumberedTypes.size(); i != e; ++i)
+    if (NumberedTypes[i].second.isValid())
+      return Error(NumberedTypes[i].second,
+                   "use of undefined type '%" + Twine(i) + "'");
+
+  for (StringMap<std::pair<Type*, LocTy> >::iterator I =
+       NamedTypes.begin(), E = NamedTypes.end(); I != E; ++I)
+    if (I->second.second.isValid())
+      return Error(I->second.second,
+                   "use of undefined type named '" + I->getKey() + "'");
+
+  if (!ForwardRefVals.empty())
+    return Error(ForwardRefVals.begin()->second.second,
+                 "use of undefined value '@" + ForwardRefVals.begin()->first +
+                 "'");
+
+  if (!ForwardRefValIDs.empty())
+    return Error(ForwardRefValIDs.begin()->second.second,
+                 "use of undefined value '@" +
+                 Twine(ForwardRefValIDs.begin()->first) + "'");
+
+  if (!ForwardRefMDNodes.empty())
+    return Error(ForwardRefMDNodes.begin()->second.second,
+                 "use of undefined metadata '!" +
+                 Twine(ForwardRefMDNodes.begin()->first) + "'");
+
+
+  // Look for intrinsic functions and CallInst that need to be upgraded
+  for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
+    UpgradeCallsToIntrinsic(FI++); // must be post-increment, as we remove
+
+  return false;
+}
+
+bool LLParser::ResolveForwardRefBlockAddresses(Function *TheFn,
+                             std::vector<std::pair<ValID, GlobalValue*> > &Refs,
+                                               PerFunctionState *PFS) {
+  // Loop over all the references, resolving them.
+  for (unsigned i = 0, e = Refs.size(); i != e; ++i) {
+    BasicBlock *Res;
+    if (PFS) {
+      if (Refs[i].first.Kind == ValID::t_LocalName)
+        Res = PFS->GetBB(Refs[i].first.StrVal, Refs[i].first.Loc);
+      else
+        Res = PFS->GetBB(Refs[i].first.UIntVal, Refs[i].first.Loc);
+    } else if (Refs[i].first.Kind == ValID::t_LocalID) {
+      return Error(Refs[i].first.Loc,
+       "cannot take address of numeric label after the function is defined");
+    } else {
+      Res = dyn_cast_or_null<BasicBlock>(
+                     TheFn->getValueSymbolTable().lookup(Refs[i].first.StrVal));
+    }
+
+    if (Res == 0)
+      return Error(Refs[i].first.Loc,
+                   "referenced value is not a basic block");
+
+    // Get the BlockAddress for this and update references to use it.
+    BlockAddress *BA = BlockAddress::get(TheFn, Res);
+    Refs[i].second->replaceAllUsesWith(BA);
+    Refs[i].second->eraseFromParent();
+  }
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Top-Level Entities
+//===----------------------------------------------------------------------===//
+
+bool LLParser::ParseTopLevelEntities() {
+  while (1) {
+    switch (Lex.getKind()) {
+    default:         return TokError("expected top-level entity");
+    case lltok::Eof: return false;
+    case lltok::kw_declare: if (ParseDeclare()) return true; break;
+    case lltok::kw_define:  if (ParseDefine()) return true; break;
+    case lltok::kw_module:  if (ParseModuleAsm()) return true; break;
+    case lltok::kw_target:  if (ParseTargetDefinition()) return true; break;
+    case lltok::kw_deplibs: if (ParseDepLibs()) return true; break;
+    case lltok::LocalVarID: if (ParseUnnamedType()) return true; break;
+    case lltok::LocalVar:   if (ParseNamedType()) return true; break;
+    case lltok::GlobalID:   if (ParseUnnamedGlobal()) return true; break;
+    case lltok::GlobalVar:  if (ParseNamedGlobal()) return true; break;
+    case lltok::exclaim:    if (ParseStandaloneMetadata()) return true; break;
+    case lltok::MetadataVar:if (ParseNamedMetadata()) return true; break;
+
+    // The Global variable production with no name can have many different
+    // optional leading prefixes, the production is:
+    // GlobalVar ::= OptionalLinkage OptionalVisibility OptionalThreadLocal
+    //               OptionalAddrSpace OptionalUnNammedAddr
+    //               ('constant'|'global') ...
+    case lltok::kw_private:             // OptionalLinkage
+    case lltok::kw_linker_private:      // OptionalLinkage
+    case lltok::kw_linker_private_weak: // OptionalLinkage
+    case lltok::kw_linker_private_weak_def_auto: // FIXME: backwards compat.
+    case lltok::kw_internal:            // OptionalLinkage
+    case lltok::kw_weak:                // OptionalLinkage
+    case lltok::kw_weak_odr:            // OptionalLinkage
+    case lltok::kw_linkonce:            // OptionalLinkage
+    case lltok::kw_linkonce_odr:        // OptionalLinkage
+    case lltok::kw_linkonce_odr_auto_hide: // OptionalLinkage
+    case lltok::kw_appending:           // OptionalLinkage
+    case lltok::kw_dllexport:           // OptionalLinkage
+    case lltok::kw_common:              // OptionalLinkage
+    case lltok::kw_dllimport:           // OptionalLinkage
+    case lltok::kw_extern_weak:         // OptionalLinkage
+    case lltok::kw_external: {          // OptionalLinkage
+      unsigned Linkage, Visibility;
+      if (ParseOptionalLinkage(Linkage) ||
+          ParseOptionalVisibility(Visibility) ||
+          ParseGlobal("", SMLoc(), Linkage, true, Visibility))
+        return true;
+      break;
+    }
+    case lltok::kw_default:       // OptionalVisibility
+    case lltok::kw_hidden:        // OptionalVisibility
+    case lltok::kw_protected: {   // OptionalVisibility
+      unsigned Visibility;
+      if (ParseOptionalVisibility(Visibility) ||
+          ParseGlobal("", SMLoc(), 0, false, Visibility))
+        return true;
+      break;
+    }
+
+    case lltok::kw_thread_local:  // OptionalThreadLocal
+    case lltok::kw_addrspace:     // OptionalAddrSpace
+    case lltok::kw_constant:      // GlobalType
+    case lltok::kw_global:        // GlobalType
+      if (ParseGlobal("", SMLoc(), 0, false, 0)) return true;
+      break;
+
+    case lltok::kw_attributes: if (ParseUnnamedAttrGrp()) return true; break;
+    }
+  }
+}
+
+
+/// toplevelentity
+///   ::= 'module' 'asm' STRINGCONSTANT
+bool LLParser::ParseModuleAsm() {
+  assert(Lex.getKind() == lltok::kw_module);
+  Lex.Lex();
+
+  std::string AsmStr;
+  if (ParseToken(lltok::kw_asm, "expected 'module asm'") ||
+      ParseStringConstant(AsmStr)) return true;
+
+  M->appendModuleInlineAsm(AsmStr);
+  return false;
+}
+
+/// toplevelentity
+///   ::= 'target' 'triple' '=' STRINGCONSTANT
+///   ::= 'target' 'datalayout' '=' STRINGCONSTANT
+bool LLParser::ParseTargetDefinition() {
+  assert(Lex.getKind() == lltok::kw_target);
+  std::string Str;
+  switch (Lex.Lex()) {
+  default: return TokError("unknown target property");
+  case lltok::kw_triple:
+    Lex.Lex();
+    if (ParseToken(lltok::equal, "expected '=' after target triple") ||
+        ParseStringConstant(Str))
+      return true;
+    M->setTargetTriple(Str);
+    return false;
+  case lltok::kw_datalayout:
+    Lex.Lex();
+    if (ParseToken(lltok::equal, "expected '=' after target datalayout") ||
+        ParseStringConstant(Str))
+      return true;
+    M->setDataLayout(Str);
+    return false;
+  }
+}
+
+/// toplevelentity
+///   ::= 'deplibs' '=' '[' ']'
+///   ::= 'deplibs' '=' '[' STRINGCONSTANT (',' STRINGCONSTANT)* ']'
+/// FIXME: Remove in 4.0. Currently parse, but ignore.
+bool LLParser::ParseDepLibs() {
+  assert(Lex.getKind() == lltok::kw_deplibs);
+  Lex.Lex();
+  if (ParseToken(lltok::equal, "expected '=' after deplibs") ||
+      ParseToken(lltok::lsquare, "expected '=' after deplibs"))
+    return true;
+
+  if (EatIfPresent(lltok::rsquare))
+    return false;
+
+  do {
+    std::string Str;
+    if (ParseStringConstant(Str)) return true;
+  } while (EatIfPresent(lltok::comma));
+
+  return ParseToken(lltok::rsquare, "expected ']' at end of list");
+}
+
+/// ParseUnnamedType:
+///   ::= LocalVarID '=' 'type' type
+bool LLParser::ParseUnnamedType() {
+  LocTy TypeLoc = Lex.getLoc();
+  unsigned TypeID = Lex.getUIntVal();
+  Lex.Lex(); // eat LocalVarID;
+
+  if (ParseToken(lltok::equal, "expected '=' after name") ||
+      ParseToken(lltok::kw_type, "expected 'type' after '='"))
+    return true;
+
+  if (TypeID >= NumberedTypes.size())
+    NumberedTypes.resize(TypeID+1);
+
+  Type *Result = 0;
+  if (ParseStructDefinition(TypeLoc, "",
+                            NumberedTypes[TypeID], Result)) return true;
+
+  if (!isa<StructType>(Result)) {
+    std::pair<Type*, LocTy> &Entry = NumberedTypes[TypeID];
+    if (Entry.first)
+      return Error(TypeLoc, "non-struct types may not be recursive");
+    Entry.first = Result;
+    Entry.second = SMLoc();
+  }
+
+  return false;
+}
+
+
+/// toplevelentity
+///   ::= LocalVar '=' 'type' type
+bool LLParser::ParseNamedType() {
+  std::string Name = Lex.getStrVal();
+  LocTy NameLoc = Lex.getLoc();
+  Lex.Lex();  // eat LocalVar.
+
+  if (ParseToken(lltok::equal, "expected '=' after name") ||
+      ParseToken(lltok::kw_type, "expected 'type' after name"))
+    return true;
+
+  Type *Result = 0;
+  if (ParseStructDefinition(NameLoc, Name,
+                            NamedTypes[Name], Result)) return true;
+
+  if (!isa<StructType>(Result)) {
+    std::pair<Type*, LocTy> &Entry = NamedTypes[Name];
+    if (Entry.first)
+      return Error(NameLoc, "non-struct types may not be recursive");
+    Entry.first = Result;
+    Entry.second = SMLoc();
+  }
+
+  return false;
+}
+
+
+/// toplevelentity
+///   ::= 'declare' FunctionHeader
+bool LLParser::ParseDeclare() {
+  assert(Lex.getKind() == lltok::kw_declare);
+  Lex.Lex();
+
+  Function *F;
+  return ParseFunctionHeader(F, false);
+}
+
+/// toplevelentity
+///   ::= 'define' FunctionHeader '{' ...
+bool LLParser::ParseDefine() {
+  assert(Lex.getKind() == lltok::kw_define);
+  Lex.Lex();
+
+  Function *F;
+  return ParseFunctionHeader(F, true) ||
+         ParseFunctionBody(*F);
+}
+
+/// ParseGlobalType
+///   ::= 'constant'
+///   ::= 'global'
+bool LLParser::ParseGlobalType(bool &IsConstant) {
+  if (Lex.getKind() == lltok::kw_constant)
+    IsConstant = true;
+  else if (Lex.getKind() == lltok::kw_global)
+    IsConstant = false;
+  else {
+    IsConstant = false;
+    return TokError("expected 'global' or 'constant'");
+  }
+  Lex.Lex();
+  return false;
+}
+
+/// ParseUnnamedGlobal:
+///   OptionalVisibility ALIAS ...
+///   OptionalLinkage OptionalVisibility ...   -> global variable
+///   GlobalID '=' OptionalVisibility ALIAS ...
+///   GlobalID '=' OptionalLinkage OptionalVisibility ...   -> global variable
+bool LLParser::ParseUnnamedGlobal() {
+  unsigned VarID = NumberedVals.size();
+  std::string Name;
+  LocTy NameLoc = Lex.getLoc();
+
+  // Handle the GlobalID form.
+  if (Lex.getKind() == lltok::GlobalID) {
+    if (Lex.getUIntVal() != VarID)
+      return Error(Lex.getLoc(), "variable expected to be numbered '%" +
+                   Twine(VarID) + "'");
+    Lex.Lex(); // eat GlobalID;
+
+    if (ParseToken(lltok::equal, "expected '=' after name"))
+      return true;
+  }
+
+  bool HasLinkage;
+  unsigned Linkage, Visibility;
+  if (ParseOptionalLinkage(Linkage, HasLinkage) ||
+      ParseOptionalVisibility(Visibility))
+    return true;
+
+  if (HasLinkage || Lex.getKind() != lltok::kw_alias)
+    return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility);
+  return ParseAlias(Name, NameLoc, Visibility);
+}
+
+/// ParseNamedGlobal:
+///   GlobalVar '=' OptionalVisibility ALIAS ...
+///   GlobalVar '=' OptionalLinkage OptionalVisibility ...   -> global variable
+bool LLParser::ParseNamedGlobal() {
+  assert(Lex.getKind() == lltok::GlobalVar);
+  LocTy NameLoc = Lex.getLoc();
+  std::string Name = Lex.getStrVal();
+  Lex.Lex();
+
+  bool HasLinkage;
+  unsigned Linkage, Visibility;
+  if (ParseToken(lltok::equal, "expected '=' in global variable") ||
+      ParseOptionalLinkage(Linkage, HasLinkage) ||
+      ParseOptionalVisibility(Visibility))
+    return true;
+
+  if (HasLinkage || Lex.getKind() != lltok::kw_alias)
+    return ParseGlobal(Name, NameLoc, Linkage, HasLinkage, Visibility);
+  return ParseAlias(Name, NameLoc, Visibility);
+}
+
+// MDString:
+//   ::= '!' STRINGCONSTANT
+bool LLParser::ParseMDString(MDString *&Result) {
+  std::string Str;
+  if (ParseStringConstant(Str)) return true;
+  Result = MDString::get(Context, Str);
+  return false;
+}
+
+// MDNode:
+//   ::= '!' MDNodeNumber
+//
+/// This version of ParseMDNodeID returns the slot number and null in the case
+/// of a forward reference.
+bool LLParser::ParseMDNodeID(MDNode *&Result, unsigned &SlotNo) {
+  // !{ ..., !42, ... }
+  if (ParseUInt32(SlotNo)) return true;
+
+  // Check existing MDNode.
+  if (SlotNo < NumberedMetadata.size() && NumberedMetadata[SlotNo] != 0)
+    Result = NumberedMetadata[SlotNo];
+  else
+    Result = 0;
+  return false;
+}
+
+bool LLParser::ParseMDNodeID(MDNode *&Result) {
+  // !{ ..., !42, ... }
+  unsigned MID = 0;
+  if (ParseMDNodeID(Result, MID)) return true;
+
+  // If not a forward reference, just return it now.
+  if (Result) return false;
+
+  // Otherwise, create MDNode forward reference.
+  MDNode *FwdNode = MDNode::getTemporary(Context, ArrayRef<Value*>());
+  ForwardRefMDNodes[MID] = std::make_pair(FwdNode, Lex.getLoc());
+
+  if (NumberedMetadata.size() <= MID)
+    NumberedMetadata.resize(MID+1);
+  NumberedMetadata[MID] = FwdNode;
+  Result = FwdNode;
+  return false;
+}
+
+/// ParseNamedMetadata:
+///   !foo = !{ !1, !2 }
+bool LLParser::ParseNamedMetadata() {
+  assert(Lex.getKind() == lltok::MetadataVar);
+  std::string Name = Lex.getStrVal();
+  Lex.Lex();
+
+  if (ParseToken(lltok::equal, "expected '=' here") ||
+      ParseToken(lltok::exclaim, "Expected '!' here") ||
+      ParseToken(lltok::lbrace, "Expected '{' here"))
+    return true;
+
+  NamedMDNode *NMD = M->getOrInsertNamedMetadata(Name);
+  if (Lex.getKind() != lltok::rbrace)
+    do {
+      if (ParseToken(lltok::exclaim, "Expected '!' here"))
+        return true;
+
+      MDNode *N = 0;
+      if (ParseMDNodeID(N)) return true;
+      NMD->addOperand(N);
+    } while (EatIfPresent(lltok::comma));
+
+  if (ParseToken(lltok::rbrace, "expected end of metadata node"))
+    return true;
+
+  return false;
+}
+
+/// ParseStandaloneMetadata:
+///   !42 = !{...}
+bool LLParser::ParseStandaloneMetadata() {
+  assert(Lex.getKind() == lltok::exclaim);
+  Lex.Lex();
+  unsigned MetadataID = 0;
+
+  LocTy TyLoc;
+  Type *Ty = 0;
+  SmallVector<Value *, 16> Elts;
+  if (ParseUInt32(MetadataID) ||
+      ParseToken(lltok::equal, "expected '=' here") ||
+      ParseType(Ty, TyLoc) ||
+      ParseToken(lltok::exclaim, "Expected '!' here") ||
+      ParseToken(lltok::lbrace, "Expected '{' here") ||
+      ParseMDNodeVector(Elts, NULL) ||
+      ParseToken(lltok::rbrace, "expected end of metadata node"))
+    return true;
+
+  MDNode *Init = MDNode::get(Context, Elts);
+
+  // See if this was forward referenced, if so, handle it.
+  std::map<unsigned, std::pair<TrackingVH<MDNode>, LocTy> >::iterator
+    FI = ForwardRefMDNodes.find(MetadataID);
+  if (FI != ForwardRefMDNodes.end()) {
+    MDNode *Temp = FI->second.first;
+    Temp->replaceAllUsesWith(Init);
+    MDNode::deleteTemporary(Temp);
+    ForwardRefMDNodes.erase(FI);
+
+    assert(NumberedMetadata[MetadataID] == Init && "Tracking VH didn't work");
+  } else {
+    if (MetadataID >= NumberedMetadata.size())
+      NumberedMetadata.resize(MetadataID+1);
+
+    if (NumberedMetadata[MetadataID] != 0)
+      return TokError("Metadata id is already used");
+    NumberedMetadata[MetadataID] = Init;
+  }
+
+  return false;
+}
+
+/// ParseAlias:
+///   ::= GlobalVar '=' OptionalVisibility 'alias' OptionalLinkage Aliasee
+/// Aliasee
+///   ::= TypeAndValue
+///   ::= 'bitcast' '(' TypeAndValue 'to' Type ')'
+///   ::= 'getelementptr' 'inbounds'? '(' ... ')'
+///
+/// Everything through visibility has already been parsed.
+///
+bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
+                          unsigned Visibility) {
+  assert(Lex.getKind() == lltok::kw_alias);
+  Lex.Lex();
+  unsigned Linkage;
+  LocTy LinkageLoc = Lex.getLoc();
+  if (ParseOptionalLinkage(Linkage))
+    return true;
+
+  if (Linkage != GlobalValue::ExternalLinkage &&
+      Linkage != GlobalValue::WeakAnyLinkage &&
+      Linkage != GlobalValue::WeakODRLinkage &&
+      Linkage != GlobalValue::InternalLinkage &&
+      Linkage != GlobalValue::PrivateLinkage &&
+      Linkage != GlobalValue::LinkerPrivateLinkage &&
+      Linkage != GlobalValue::LinkerPrivateWeakLinkage)
+    return Error(LinkageLoc, "invalid linkage type for alias");
+
+  Constant *Aliasee;
+  LocTy AliaseeLoc = Lex.getLoc();
+  if (Lex.getKind() != lltok::kw_bitcast &&
+      Lex.getKind() != lltok::kw_getelementptr) {
+    if (ParseGlobalTypeAndValue(Aliasee)) return true;
+  } else {
+    // The bitcast dest type is not present, it is implied by the dest type.
+    ValID ID;
+    if (ParseValID(ID)) return true;
+    if (ID.Kind != ValID::t_Constant)
+      return Error(AliaseeLoc, "invalid aliasee");
+    Aliasee = ID.ConstantVal;
+  }
+
+  if (!Aliasee->getType()->isPointerTy())
+    return Error(AliaseeLoc, "alias must have pointer type");
+
+  // Okay, create the alias but do not insert it into the module yet.
+  GlobalAlias* GA = new GlobalAlias(Aliasee->getType(),
+                                    (GlobalValue::LinkageTypes)Linkage, Name,
+                                    Aliasee);
+  GA->setVisibility((GlobalValue::VisibilityTypes)Visibility);
+
+  // See if this value already exists in the symbol table.  If so, it is either
+  // a redefinition or a definition of a forward reference.
+  if (GlobalValue *Val = M->getNamedValue(Name)) {
+    // See if this was a redefinition.  If so, there is no entry in
+    // ForwardRefVals.
+    std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator
+      I = ForwardRefVals.find(Name);
+    if (I == ForwardRefVals.end())
+      return Error(NameLoc, "redefinition of global named '@" + Name + "'");
+
+    // Otherwise, this was a definition of forward ref.  Verify that types
+    // agree.
+    if (Val->getType() != GA->getType())
+      return Error(NameLoc,
+              "forward reference and definition of alias have different types");
+
+    // If they agree, just RAUW the old value with the alias and remove the
+    // forward ref info.
+    Val->replaceAllUsesWith(GA);
+    Val->eraseFromParent();
+    ForwardRefVals.erase(I);
+  }
+
+  // Insert into the module, we know its name won't collide now.
+  M->getAliasList().push_back(GA);
+  assert(GA->getName() == Name && "Should not be a name conflict!");
+
+  return false;
+}
+
+/// ParseGlobal
+///   ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalThreadLocal
+///       OptionalAddrSpace OptionalUnNammedAddr
+///       OptionalExternallyInitialized GlobalType Type Const
+///   ::= OptionalLinkage OptionalVisibility OptionalThreadLocal
+///       OptionalAddrSpace OptionalUnNammedAddr
+///       OptionalExternallyInitialized GlobalType Type Const
+///
+/// Everything through visibility has been parsed already.
+///
+bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
+                           unsigned Linkage, bool HasLinkage,
+                           unsigned Visibility) {
+  unsigned AddrSpace;
+  bool IsConstant, UnnamedAddr, IsExternallyInitialized;
+  GlobalVariable::ThreadLocalMode TLM;
+  LocTy UnnamedAddrLoc;
+  LocTy IsExternallyInitializedLoc;
+  LocTy TyLoc;
+
+  Type *Ty = 0;
+  if (ParseOptionalThreadLocal(TLM) ||
+      ParseOptionalAddrSpace(AddrSpace) ||
+      ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr,
+                         &UnnamedAddrLoc) ||
+      ParseOptionalToken(lltok::kw_externally_initialized,
+                         IsExternallyInitialized,
+                         &IsExternallyInitializedLoc) ||
+      ParseGlobalType(IsConstant) ||
+      ParseType(Ty, TyLoc))
+    return true;
+
+  // If the linkage is specified and is external, then no initializer is
+  // present.
+  Constant *Init = 0;
+  if (!HasLinkage || (Linkage != GlobalValue::DLLImportLinkage &&
+                      Linkage != GlobalValue::ExternalWeakLinkage &&
+                      Linkage != GlobalValue::ExternalLinkage)) {
+    if (ParseGlobalValue(Ty, Init))
+      return true;
+  }
+
+  if (Ty->isFunctionTy() || Ty->isLabelTy())
+    return Error(TyLoc, "invalid type for global variable");
+
+  GlobalVariable *GV = 0;
+
+  // See if the global was forward referenced, if so, use the global.
+  if (!Name.empty()) {
+    if (GlobalValue *GVal = M->getNamedValue(Name)) {
+      if (!ForwardRefVals.erase(Name) || !isa<GlobalValue>(GVal))
+        return Error(NameLoc, "redefinition of global '@" + Name + "'");
+      GV = cast<GlobalVariable>(GVal);
+    }
+  } else {
+    std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator
+      I = ForwardRefValIDs.find(NumberedVals.size());
+    if (I != ForwardRefValIDs.end()) {
+      GV = cast<GlobalVariable>(I->second.first);
+      ForwardRefValIDs.erase(I);
+    }
+  }
+
+  if (GV == 0) {
+    GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, 0,
+                            Name, 0, GlobalVariable::NotThreadLocal,
+                            AddrSpace);
+  } else {
+    if (GV->getType()->getElementType() != Ty)
+      return Error(TyLoc,
+            "forward reference and definition of global have different types");
+
+    // Move the forward-reference to the correct spot in the module.
+    M->getGlobalList().splice(M->global_end(), M->getGlobalList(), GV);
+  }
+
+  if (Name.empty())
+    NumberedVals.push_back(GV);
+
+  // Set the parsed properties on the global.
+  if (Init)
+    GV->setInitializer(Init);
+  GV->setConstant(IsConstant);
+  GV->setLinkage((GlobalValue::LinkageTypes)Linkage);
+  GV->setVisibility((GlobalValue::VisibilityTypes)Visibility);
+  GV->setExternallyInitialized(IsExternallyInitialized);
+  GV->setThreadLocalMode(TLM);
+  GV->setUnnamedAddr(UnnamedAddr);
+
+  // Parse attributes on the global.
+  while (Lex.getKind() == lltok::comma) {
+    Lex.Lex();
+
+    if (Lex.getKind() == lltok::kw_section) {
+      Lex.Lex();
+      GV->setSection(Lex.getStrVal());
+      if (ParseToken(lltok::StringConstant, "expected global section string"))
+        return true;
+    } else if (Lex.getKind() == lltok::kw_align) {
+      unsigned Alignment;
+      if (ParseOptionalAlignment(Alignment)) return true;
+      GV->setAlignment(Alignment);
+    } else {
+      TokError("unknown global variable property!");
+    }
+  }
+
+  return false;
+}
+
+/// ParseUnnamedAttrGrp
+///   ::= 'attributes' AttrGrpID '=' '{' AttrValPair+ '}'
+bool LLParser::ParseUnnamedAttrGrp() {
+  assert(Lex.getKind() == lltok::kw_attributes);
+  LocTy AttrGrpLoc = Lex.getLoc();
+  Lex.Lex();
+
+  assert(Lex.getKind() == lltok::AttrGrpID);
+  unsigned VarID = Lex.getUIntVal();
+  std::vector<unsigned> unused;
+  LocTy NoBuiltinLoc;
+  Lex.Lex();
+
+  if (ParseToken(lltok::equal, "expected '=' here") ||
+      ParseToken(lltok::lbrace, "expected '{' here") ||
+      ParseFnAttributeValuePairs(NumberedAttrBuilders[VarID], unused, true,
+                                 NoBuiltinLoc) ||
+      ParseToken(lltok::rbrace, "expected end of attribute group"))
+    return true;
+
+  if (!NumberedAttrBuilders[VarID].hasAttributes())
+    return Error(AttrGrpLoc, "attribute group has no attributes");
+
+  return false;
+}
+
+/// ParseFnAttributeValuePairs
+///   ::= <attr> | <attr> '=' <value>
+bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
+                                          std::vector<unsigned> &FwdRefAttrGrps,
+                                          bool inAttrGrp, LocTy &NoBuiltinLoc) {
+  bool HaveError = false;
+
+  B.clear();
+
+  while (true) {
+    lltok::Kind Token = Lex.getKind();
+    if (Token == lltok::kw_nobuiltin)
+      NoBuiltinLoc = Lex.getLoc();
+    switch (Token) {
+    default:
+      if (!inAttrGrp) return HaveError;
+      return Error(Lex.getLoc(), "unterminated attribute group");
+    case lltok::rbrace:
+      // Finished.
+      return false;
+
+    case lltok::AttrGrpID: {
+      // Allow a function to reference an attribute group:
+      //
+      //   define void @foo() #1 { ... }
+      if (inAttrGrp)
+        HaveError |=
+          Error(Lex.getLoc(),
+              "cannot have an attribute group reference in an attribute group");
+
+      unsigned AttrGrpNum = Lex.getUIntVal();
+      if (inAttrGrp) break;
+
+      // Save the reference to the attribute group. We'll fill it in later.
+      FwdRefAttrGrps.push_back(AttrGrpNum);
+      break;
+    }
+    // Target-dependent attributes:
+    case lltok::StringConstant: {
+      std::string Attr = Lex.getStrVal();
+      Lex.Lex();
+      std::string Val;
+      if (EatIfPresent(lltok::equal) &&
+          ParseStringConstant(Val))
+        return true;
+
+      B.addAttribute(Attr, Val);
+      continue;
+    }
+
+    // Target-independent attributes:
+    case lltok::kw_align: {
+      // As a hack, we allow "align 2" on functions as a synonym for "alignstack
+      // 2".
+      unsigned Alignment;
+      if (inAttrGrp) {
+        Lex.Lex();
+        if (ParseToken(lltok::equal, "expected '=' here") ||
+            ParseUInt32(Alignment))
+          return true;
+      } else {
+        if (ParseOptionalAlignment(Alignment))
+          return true;
+      }
+      B.addAlignmentAttr(Alignment);
+      continue;
+    }
+    case lltok::kw_alignstack: {
+      unsigned Alignment;
+      if (inAttrGrp) {
+        Lex.Lex();
+        if (ParseToken(lltok::equal, "expected '=' here") ||
+            ParseUInt32(Alignment))
+          return true;
+      } else {
+        if (ParseOptionalStackAlignment(Alignment))
+          return true;
+      }
+      B.addStackAlignmentAttr(Alignment);
+      continue;
+    }
+    case lltok::kw_alwaysinline:      B.addAttribute(Attribute::AlwaysInline); break;
+    case lltok::kw_inlinehint:        B.addAttribute(Attribute::InlineHint); break;
+    case lltok::kw_minsize:           B.addAttribute(Attribute::MinSize); break;
+    case lltok::kw_naked:             B.addAttribute(Attribute::Naked); break;
+    case lltok::kw_nobuiltin:         B.addAttribute(Attribute::NoBuiltin); break;
+    case lltok::kw_noduplicate:       B.addAttribute(Attribute::NoDuplicate); break;
+    case lltok::kw_noimplicitfloat:   B.addAttribute(Attribute::NoImplicitFloat); break;
+    case lltok::kw_noinline:          B.addAttribute(Attribute::NoInline); break;
+    case lltok::kw_nonlazybind:       B.addAttribute(Attribute::NonLazyBind); break;
+    case lltok::kw_noredzone:         B.addAttribute(Attribute::NoRedZone); break;
+    case lltok::kw_noreturn:          B.addAttribute(Attribute::NoReturn); break;
+    case lltok::kw_nounwind:          B.addAttribute(Attribute::NoUnwind); break;
+    case lltok::kw_optsize:           B.addAttribute(Attribute::OptimizeForSize); break;
+    case lltok::kw_readnone:          B.addAttribute(Attribute::ReadNone); break;
+    case lltok::kw_readonly:          B.addAttribute(Attribute::ReadOnly); break;
+    case lltok::kw_returns_twice:     B.addAttribute(Attribute::ReturnsTwice); break;
+    case lltok::kw_ssp:               B.addAttribute(Attribute::StackProtect); break;
+    case lltok::kw_sspreq:            B.addAttribute(Attribute::StackProtectReq); break;
+    case lltok::kw_sspstrong:         B.addAttribute(Attribute::StackProtectStrong); break;
+    case lltok::kw_sanitize_address:  B.addAttribute(Attribute::SanitizeAddress); break;
+    case lltok::kw_sanitize_thread:   B.addAttribute(Attribute::SanitizeThread); break;
+    case lltok::kw_sanitize_memory:   B.addAttribute(Attribute::SanitizeMemory); break;
+    case lltok::kw_uwtable:           B.addAttribute(Attribute::UWTable); break;
+
+    // Error handling.
+    case lltok::kw_inreg:
+    case lltok::kw_signext:
+    case lltok::kw_zeroext:
+      HaveError |=
+        Error(Lex.getLoc(),
+              "invalid use of attribute on a function");
+      break;
+    case lltok::kw_byval:
+    case lltok::kw_nest:
+    case lltok::kw_noalias:
+    case lltok::kw_nocapture:
+    case lltok::kw_sret:
+      HaveError |=
+        Error(Lex.getLoc(),
+              "invalid use of parameter-only attribute on a function");
+      break;
+    }
+
+    Lex.Lex();
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// GlobalValue Reference/Resolution Routines.
+//===----------------------------------------------------------------------===//
+
+/// GetGlobalVal - Get a value with the specified name or ID, creating a
+/// forward reference record if needed.  This can return null if the value
+/// exists but does not have the right type.
+GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty,
+                                    LocTy Loc) {
+  PointerType *PTy = dyn_cast<PointerType>(Ty);
+  if (PTy == 0) {
+    Error(Loc, "global variable reference must have pointer type");
+    return 0;
+  }
+
+  // Look this name up in the normal function symbol table.
+  GlobalValue *Val =
+    cast_or_null<GlobalValue>(M->getValueSymbolTable().lookup(Name));
+
+  // If this is a forward reference for the value, see if we already created a
+  // forward ref record.
+  if (Val == 0) {
+    std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator
+      I = ForwardRefVals.find(Name);
+    if (I != ForwardRefVals.end())
+      Val = I->second.first;
+  }
+
+  // If we have the value in the symbol table or fwd-ref table, return it.
+  if (Val) {
+    if (Val->getType() == Ty) return Val;
+    Error(Loc, "'@" + Name + "' defined with type '" +
+          getTypeString(Val->getType()) + "'");
+    return 0;
+  }
+
+  // Otherwise, create a new forward reference for this value and remember it.
+  GlobalValue *FwdVal;
+  if (FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType()))
+    FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, Name, M);
+  else
+    FwdVal = new GlobalVariable(*M, PTy->getElementType(), false,
+                                GlobalValue::ExternalWeakLinkage, 0, Name,
+                                0, GlobalVariable::NotThreadLocal,
+                                PTy->getAddressSpace());
+
+  ForwardRefVals[Name] = std::make_pair(FwdVal, Loc);
+  return FwdVal;
+}
+
+GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) {
+  PointerType *PTy = dyn_cast<PointerType>(Ty);
+  if (PTy == 0) {
+    Error(Loc, "global variable reference must have pointer type");
+    return 0;
+  }
+
+  GlobalValue *Val = ID < NumberedVals.size() ? NumberedVals[ID] : 0;
+
+  // If this is a forward reference for the value, see if we already created a
+  // forward ref record.
+  if (Val == 0) {
+    std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator
+      I = ForwardRefValIDs.find(ID);
+    if (I != ForwardRefValIDs.end())
+      Val = I->second.first;
+  }
+
+  // If we have the value in the symbol table or fwd-ref table, return it.
+  if (Val) {
+    if (Val->getType() == Ty) return Val;
+    Error(Loc, "'@" + Twine(ID) + "' defined with type '" +
+          getTypeString(Val->getType()) + "'");
+    return 0;
+  }
+
+  // Otherwise, create a new forward reference for this value and remember it.
+  GlobalValue *FwdVal;
+  if (FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType()))
+    FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, "", M);
+  else
+    FwdVal = new GlobalVariable(*M, PTy->getElementType(), false,
+                                GlobalValue::ExternalWeakLinkage, 0, "");
+
+  ForwardRefValIDs[ID] = std::make_pair(FwdVal, Loc);
+  return FwdVal;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Helper Routines.
+//===----------------------------------------------------------------------===//
+
+/// ParseToken - If the current token has the specified kind, eat it and return
+/// success.  Otherwise, emit the specified error and return failure.
+bool LLParser::ParseToken(lltok::Kind T, const char *ErrMsg) {
+  if (Lex.getKind() != T)
+    return TokError(ErrMsg);
+  Lex.Lex();
+  return false;
+}
+
+/// ParseStringConstant
+///   ::= StringConstant
+bool LLParser::ParseStringConstant(std::string &Result) {
+  if (Lex.getKind() != lltok::StringConstant)
+    return TokError("expected string constant");
+  Result = Lex.getStrVal();
+  Lex.Lex();
+  return false;
+}
+
+/// ParseUInt32
+///   ::= uint32
+bool LLParser::ParseUInt32(unsigned &Val) {
+  if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned())
+    return TokError("expected integer");
+  uint64_t Val64 = Lex.getAPSIntVal().getLimitedValue(0xFFFFFFFFULL+1);
+  if (Val64 != unsigned(Val64))
+    return TokError("expected 32-bit integer (too large)");
+  Val = Val64;
+  Lex.Lex();
+  return false;
+}
+
+/// ParseTLSModel
+///   := 'localdynamic'
+///   := 'initialexec'
+///   := 'localexec'
+bool LLParser::ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM) {
+  switch (Lex.getKind()) {
+    default:
+      return TokError("expected localdynamic, initialexec or localexec");
+    case lltok::kw_localdynamic:
+      TLM = GlobalVariable::LocalDynamicTLSModel;
+      break;
+    case lltok::kw_initialexec:
+      TLM = GlobalVariable::InitialExecTLSModel;
+      break;
+    case lltok::kw_localexec:
+      TLM = GlobalVariable::LocalExecTLSModel;
+      break;
+  }
+
+  Lex.Lex();
+  return false;
+}
+
+/// ParseOptionalThreadLocal
+///   := /*empty*/
+///   := 'thread_local'
+///   := 'thread_local' '(' tlsmodel ')'
+bool LLParser::ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM) {
+  TLM = GlobalVariable::NotThreadLocal;
+  if (!EatIfPresent(lltok::kw_thread_local))
+    return false;
+
+  TLM = GlobalVariable::GeneralDynamicTLSModel;
+  if (Lex.getKind() == lltok::lparen) {
+    Lex.Lex();
+    return ParseTLSModel(TLM) ||
+      ParseToken(lltok::rparen, "expected ')' after thread local model");
+  }
+  return false;
+}
+
+/// ParseOptionalAddrSpace
+///   := /*empty*/
+///   := 'addrspace' '(' uint32 ')'
+bool LLParser::ParseOptionalAddrSpace(unsigned &AddrSpace) {
+  AddrSpace = 0;
+  if (!EatIfPresent(lltok::kw_addrspace))
+    return false;
+  return ParseToken(lltok::lparen, "expected '(' in address space") ||
+         ParseUInt32(AddrSpace) ||
+         ParseToken(lltok::rparen, "expected ')' in address space");
+}
+
+/// ParseOptionalParamAttrs - Parse a potentially empty list of parameter attributes.
+bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
+  bool HaveError = false;
+
+  B.clear();
+
+  while (1) {
+    lltok::Kind Token = Lex.getKind();
+    switch (Token) {
+    default:  // End of attributes.
+      return HaveError;
+    case lltok::kw_align: {
+      unsigned Alignment;
+      if (ParseOptionalAlignment(Alignment))
+        return true;
+      B.addAlignmentAttr(Alignment);
+      continue;
+    }
+    case lltok::kw_byval:           B.addAttribute(Attribute::ByVal); break;
+    case lltok::kw_inreg:           B.addAttribute(Attribute::InReg); break;
+    case lltok::kw_nest:            B.addAttribute(Attribute::Nest); break;
+    case lltok::kw_noalias:         B.addAttribute(Attribute::NoAlias); break;
+    case lltok::kw_nocapture:       B.addAttribute(Attribute::NoCapture); break;
+    case lltok::kw_signext:         B.addAttribute(Attribute::SExt); break;
+    case lltok::kw_sret:            B.addAttribute(Attribute::StructRet); break;
+    case lltok::kw_zeroext:         B.addAttribute(Attribute::ZExt); break;
+
+    case lltok::kw_alignstack:      case lltok::kw_nounwind:
+    case lltok::kw_alwaysinline:    case lltok::kw_optsize:
+    case lltok::kw_inlinehint:      case lltok::kw_readnone:
+    case lltok::kw_minsize:         case lltok::kw_readonly:
+    case lltok::kw_naked:           case lltok::kw_returns_twice:
+    case lltok::kw_nobuiltin:       case lltok::kw_sanitize_address:
+    case lltok::kw_noimplicitfloat: case lltok::kw_sanitize_memory:
+    case lltok::kw_noinline:        case lltok::kw_sanitize_thread:
+    case lltok::kw_nonlazybind:     case lltok::kw_ssp:
+    case lltok::kw_noredzone:       case lltok::kw_sspreq:
+    case lltok::kw_noreturn:        case lltok::kw_uwtable:
+      HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute");
+      break;
+    }
+
+    Lex.Lex();
+  }
+}
+
+/// ParseOptionalReturnAttrs - Parse a potentially empty list of return attributes.
+bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
+  bool HaveError = false;
+
+  B.clear();
+
+  while (1) {
+    lltok::Kind Token = Lex.getKind();
+    switch (Token) {
+    default:  // End of attributes.
+      return HaveError;
+    case lltok::kw_inreg:           B.addAttribute(Attribute::InReg); break;
+    case lltok::kw_noalias:         B.addAttribute(Attribute::NoAlias); break;
+    case lltok::kw_signext:         B.addAttribute(Attribute::SExt); break;
+    case lltok::kw_zeroext:         B.addAttribute(Attribute::ZExt); break;
+
+    // Error handling.
+    case lltok::kw_sret:  case lltok::kw_nocapture:
+    case lltok::kw_byval: case lltok::kw_nest:
+      HaveError |= Error(Lex.getLoc(), "invalid use of parameter-only attribute");
+      break;
+
+    case lltok::kw_align:                 case lltok::kw_noreturn:
+    case lltok::kw_alignstack:            case lltok::kw_nounwind:
+    case lltok::kw_alwaysinline:          case lltok::kw_optsize:
+    case lltok::kw_inlinehint:            case lltok::kw_readnone:
+    case lltok::kw_minsize:               case lltok::kw_readonly:
+    case lltok::kw_naked:                 case lltok::kw_returns_twice:
+    case lltok::kw_nobuiltin:             case lltok::kw_sanitize_address:
+    case lltok::kw_noduplicate:           case lltok::kw_sanitize_memory:
+    case lltok::kw_noimplicitfloat:       case lltok::kw_sanitize_thread:
+    case lltok::kw_noinline:              case lltok::kw_ssp:
+    case lltok::kw_nonlazybind:           case lltok::kw_sspreq:
+    case lltok::kw_noredzone:             case lltok::kw_sspstrong:
+                                          case lltok::kw_uwtable:
+      HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute");
+      break;
+    }
+
+    Lex.Lex();
+  }
+}
+
+/// ParseOptionalLinkage
+///   ::= /*empty*/
+///   ::= 'private'
+///   ::= 'linker_private'
+///   ::= 'linker_private_weak'
+///   ::= 'internal'
+///   ::= 'weak'
+///   ::= 'weak_odr'
+///   ::= 'linkonce'
+///   ::= 'linkonce_odr'
+///   ::= 'linkonce_odr_auto_hide'
+///   ::= 'available_externally'
+///   ::= 'appending'
+///   ::= 'dllexport'
+///   ::= 'common'
+///   ::= 'dllimport'
+///   ::= 'extern_weak'
+///   ::= 'external'
+bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) {
+  HasLinkage = false;
+  switch (Lex.getKind()) {
+  default:                       Res=GlobalValue::ExternalLinkage; return false;
+  case lltok::kw_private:        Res = GlobalValue::PrivateLinkage;       break;
+  case lltok::kw_linker_private: Res = GlobalValue::LinkerPrivateLinkage; break;
+  case lltok::kw_linker_private_weak:
+    Res = GlobalValue::LinkerPrivateWeakLinkage;
+    break;
+  case lltok::kw_internal:       Res = GlobalValue::InternalLinkage;      break;
+  case lltok::kw_weak:           Res = GlobalValue::WeakAnyLinkage;       break;
+  case lltok::kw_weak_odr:       Res = GlobalValue::WeakODRLinkage;       break;
+  case lltok::kw_linkonce:       Res = GlobalValue::LinkOnceAnyLinkage;   break;
+  case lltok::kw_linkonce_odr:   Res = GlobalValue::LinkOnceODRLinkage;   break;
+  case lltok::kw_linkonce_odr_auto_hide:
+  case lltok::kw_linker_private_weak_def_auto: // FIXME: For backwards compat.
+    Res = GlobalValue::LinkOnceODRAutoHideLinkage;
+    break;
+  case lltok::kw_available_externally:
+    Res = GlobalValue::AvailableExternallyLinkage;
+    break;
+  case lltok::kw_appending:      Res = GlobalValue::AppendingLinkage;     break;
+  case lltok::kw_dllexport:      Res = GlobalValue::DLLExportLinkage;     break;
+  case lltok::kw_common:         Res = GlobalValue::CommonLinkage;        break;
+  case lltok::kw_dllimport:      Res = GlobalValue::DLLImportLinkage;     break;
+  case lltok::kw_extern_weak:    Res = GlobalValue::ExternalWeakLinkage;  break;
+  case lltok::kw_external:       Res = GlobalValue::ExternalLinkage;      break;
+  }
+  Lex.Lex();
+  HasLinkage = true;
+  return false;
+}
+
+/// ParseOptionalVisibility
+///   ::= /*empty*/
+///   ::= 'default'
+///   ::= 'hidden'
+///   ::= 'protected'
+///
+bool LLParser::ParseOptionalVisibility(unsigned &Res) {
+  switch (Lex.getKind()) {
+  default:                  Res = GlobalValue::DefaultVisibility; return false;
+  case lltok::kw_default:   Res = GlobalValue::DefaultVisibility; break;
+  case lltok::kw_hidden:    Res = GlobalValue::HiddenVisibility; break;
+  case lltok::kw_protected: Res = GlobalValue::ProtectedVisibility; break;
+  }
+  Lex.Lex();
+  return false;
+}
+
+/// ParseOptionalCallingConv
+///   ::= /*empty*/
+///   ::= 'ccc'
+///   ::= 'fastcc'
+///   ::= 'kw_intel_ocl_bicc'
+///   ::= 'coldcc'
+///   ::= 'x86_stdcallcc'
+///   ::= 'x86_fastcallcc'
+///   ::= 'x86_thiscallcc'
+///   ::= 'arm_apcscc'
+///   ::= 'arm_aapcscc'
+///   ::= 'arm_aapcs_vfpcc'
+///   ::= 'msp430_intrcc'
+///   ::= 'ptx_kernel'
+///   ::= 'ptx_device'
+///   ::= 'spir_func'
+///   ::= 'spir_kernel'
+///   ::= 'cc' UINT
+///
+bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
+  switch (Lex.getKind()) {
+  default:                       CC = CallingConv::C; return false;
+  case lltok::kw_ccc:            CC = CallingConv::C; break;
+  case lltok::kw_fastcc:         CC = CallingConv::Fast; break;
+  case lltok::kw_coldcc:         CC = CallingConv::Cold; break;
+  case lltok::kw_x86_stdcallcc:  CC = CallingConv::X86_StdCall; break;
+  case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break;
+  case lltok::kw_x86_thiscallcc: CC = CallingConv::X86_ThisCall; break;
+  case lltok::kw_arm_apcscc:     CC = CallingConv::ARM_APCS; break;
+  case lltok::kw_arm_aapcscc:    CC = CallingConv::ARM_AAPCS; break;
+  case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;
+  case lltok::kw_msp430_intrcc:  CC = CallingConv::MSP430_INTR; break;
+  case lltok::kw_ptx_kernel:     CC = CallingConv::PTX_Kernel; break;
+  case lltok::kw_ptx_device:     CC = CallingConv::PTX_Device; break;
+  case lltok::kw_spir_kernel:    CC = CallingConv::SPIR_KERNEL; break;
+  case lltok::kw_spir_func:      CC = CallingConv::SPIR_FUNC; break;
+  case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
+  case lltok::kw_cc: {
+      unsigned ArbitraryCC;
+      Lex.Lex();
+      if (ParseUInt32(ArbitraryCC))
+        return true;
+      CC = static_cast<CallingConv::ID>(ArbitraryCC);
+      return false;
+    }
+  }
+
+  Lex.Lex();
+  return false;
+}
+
+/// ParseInstructionMetadata
+///   ::= !dbg !42 (',' !dbg !57)*
+bool LLParser::ParseInstructionMetadata(Instruction *Inst,
+                                        PerFunctionState *PFS) {
+  do {
+    if (Lex.getKind() != lltok::MetadataVar)
+      return TokError("expected metadata after comma");
+
+    std::string Name = Lex.getStrVal();
+    unsigned MDK = M->getMDKindID(Name);
+    Lex.Lex();
+
+    MDNode *Node;
+    SMLoc Loc = Lex.getLoc();
+
+    if (ParseToken(lltok::exclaim, "expected '!' here"))
+      return true;
+
+    // This code is similar to that of ParseMetadataValue, however it needs to
+    // have special-case code for a forward reference; see the comments on
+    // ForwardRefInstMetadata for details. Also, MDStrings are not supported
+    // at the top level here.
+    if (Lex.getKind() == lltok::lbrace) {
+      ValID ID;
+      if (ParseMetadataListValue(ID, PFS))
+        return true;
+      assert(ID.Kind == ValID::t_MDNode);
+      Inst->setMetadata(MDK, ID.MDNodeVal);
+    } else {
+      unsigned NodeID = 0;
+      if (ParseMDNodeID(Node, NodeID))
+        return true;
+      if (Node) {
+        // If we got the node, add it to the instruction.
+        Inst->setMetadata(MDK, Node);
+      } else {
+        MDRef R = { Loc, MDK, NodeID };
+        // Otherwise, remember that this should be resolved later.
+        ForwardRefInstMetadata[Inst].push_back(R);
+      }
+    }
+
+    // If this is the end of the list, we're done.
+  } while (EatIfPresent(lltok::comma));
+  return false;
+}
+
+/// ParseOptionalAlignment
+///   ::= /* empty */
+///   ::= 'align' 4
+bool LLParser::ParseOptionalAlignment(unsigned &Alignment) {
+  Alignment = 0;
+  if (!EatIfPresent(lltok::kw_align))
+    return false;
+  LocTy AlignLoc = Lex.getLoc();
+  if (ParseUInt32(Alignment)) return true;
+  if (!isPowerOf2_32(Alignment))
+    return Error(AlignLoc, "alignment is not a power of two");
+  if (Alignment > Value::MaximumAlignment)
+    return Error(AlignLoc, "huge alignments are not supported yet");
+  return false;
+}
+
+/// ParseOptionalCommaAlign
+///   ::=
+///   ::= ',' align 4
+///
+/// This returns with AteExtraComma set to true if it ate an excess comma at the
+/// end.
+bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment,
+                                       bool &AteExtraComma) {
+  AteExtraComma = false;
+  while (EatIfPresent(lltok::comma)) {
+    // Metadata at the end is an early exit.
+    if (Lex.getKind() == lltok::MetadataVar) {
+      AteExtraComma = true;
+      return false;
+    }
+
+    if (Lex.getKind() != lltok::kw_align)
+      return Error(Lex.getLoc(), "expected metadata or 'align'");
+
+    if (ParseOptionalAlignment(Alignment)) return true;
+  }
+
+  return false;
+}
+
+/// ParseScopeAndOrdering
+///   if isAtomic: ::= 'singlethread'? AtomicOrdering
+///   else: ::=
+///
+/// This sets Scope and Ordering to the parsed values.
+bool LLParser::ParseScopeAndOrdering(bool isAtomic, SynchronizationScope &Scope,
+                                     AtomicOrdering &Ordering) {
+  if (!isAtomic)
+    return false;
+
+  Scope = CrossThread;
+  if (EatIfPresent(lltok::kw_singlethread))
+    Scope = SingleThread;
+  switch (Lex.getKind()) {
+  default: return TokError("Expected ordering on atomic instruction");
+  case lltok::kw_unordered: Ordering = Unordered; break;
+  case lltok::kw_monotonic: Ordering = Monotonic; break;
+  case lltok::kw_acquire: Ordering = Acquire; break;
+  case lltok::kw_release: Ordering = Release; break;
+  case lltok::kw_acq_rel: Ordering = AcquireRelease; break;
+  case lltok::kw_seq_cst: Ordering = SequentiallyConsistent; break;
+  }
+  Lex.Lex();
+  return false;
+}
+
+/// ParseOptionalStackAlignment
+///   ::= /* empty */
+///   ::= 'alignstack' '(' 4 ')'
+bool LLParser::ParseOptionalStackAlignment(unsigned &Alignment) {
+  Alignment = 0;
+  if (!EatIfPresent(lltok::kw_alignstack))
+    return false;
+  LocTy ParenLoc = Lex.getLoc();
+  if (!EatIfPresent(lltok::lparen))
+    return Error(ParenLoc, "expected '('");
+  LocTy AlignLoc = Lex.getLoc();
+  if (ParseUInt32(Alignment)) return true;
+  ParenLoc = Lex.getLoc();
+  if (!EatIfPresent(lltok::rparen))
+    return Error(ParenLoc, "expected ')'");
+  if (!isPowerOf2_32(Alignment))
+    return Error(AlignLoc, "stack alignment is not a power of two");
+  return false;
+}
+
+/// ParseIndexList - This parses the index list for an insert/extractvalue
+/// instruction.  This sets AteExtraComma in the case where we eat an extra
+/// comma at the end of the line and find that it is followed by metadata.
+/// Clients that don't allow metadata can call the version of this function that
+/// only takes one argument.
+///
+/// ParseIndexList
+///    ::=  (',' uint32)+
+///
+bool LLParser::ParseIndexList(SmallVectorImpl<unsigned> &Indices,
+                              bool &AteExtraComma) {
+  AteExtraComma = false;
+
+  if (Lex.getKind() != lltok::comma)
+    return TokError("expected ',' as start of index list");
+
+  while (EatIfPresent(lltok::comma)) {
+    if (Lex.getKind() == lltok::MetadataVar) {
+      AteExtraComma = true;
+      return false;
+    }
+    unsigned Idx = 0;
+    if (ParseUInt32(Idx)) return true;
+    Indices.push_back(Idx);
+  }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Type Parsing.
+//===----------------------------------------------------------------------===//
+
+/// ParseType - Parse a type.
+bool LLParser::ParseType(Type *&Result, bool AllowVoid) {
+  SMLoc TypeLoc = Lex.getLoc();
+  switch (Lex.getKind()) {
+  default:
+    return TokError("expected type");
+  case lltok::Type:
+    // Type ::= 'float' | 'void' (etc)
+    Result = Lex.getTyVal();
+    Lex.Lex();
+    break;
+  case lltok::lbrace:
+    // Type ::= StructType
+    if (ParseAnonStructType(Result, false))
+      return true;
+    break;
+  case lltok::lsquare:
+    // Type ::= '[' ... ']'
+    Lex.Lex(); // eat the lsquare.
+    if (ParseArrayVectorType(Result, false))
+      return true;
+    break;
+  case lltok::less: // Either vector or packed struct.
+    // Type ::= '<' ... '>'
+    Lex.Lex();
+    if (Lex.getKind() == lltok::lbrace) {
+      if (ParseAnonStructType(Result, true) ||
+          ParseToken(lltok::greater, "expected '>' at end of packed struct"))
+        return true;
+    } else if (ParseArrayVectorType(Result, true))
+      return true;
+    break;
+  case lltok::LocalVar: {
+    // Type ::= %foo
+    std::pair<Type*, LocTy> &Entry = NamedTypes[Lex.getStrVal()];
+
+    // If the type hasn't been defined yet, create a forward definition and
+    // remember where that forward def'n was seen (in case it never is defined).
+    if (Entry.first == 0) {
+      Entry.first = StructType::create(Context, Lex.getStrVal());
+      Entry.second = Lex.getLoc();
+    }
+    Result = Entry.first;
+    Lex.Lex();
+    break;
+  }
+
+  case lltok::LocalVarID: {
+    // Type ::= %4
+    if (Lex.getUIntVal() >= NumberedTypes.size())
+      NumberedTypes.resize(Lex.getUIntVal()+1);
+    std::pair<Type*, LocTy> &Entry = NumberedTypes[Lex.getUIntVal()];
+
+    // If the type hasn't been defined yet, create a forward definition and
+    // remember where that forward def'n was seen (in case it never is defined).
+    if (Entry.first == 0) {
+      Entry.first = StructType::create(Context);
+      Entry.second = Lex.getLoc();
+    }
+    Result = Entry.first;
+    Lex.Lex();
+    break;
+  }
+  }
+
+  // Parse the type suffixes.
+  while (1) {
+    switch (Lex.getKind()) {
+    // End of type.
+    default:
+      if (!AllowVoid && Result->isVoidTy())
+        return Error(TypeLoc, "void type only allowed for function results");
+      return false;
+
+    // Type ::= Type '*'
+    case lltok::star:
+      if (Result->isLabelTy())
+        return TokError("basic block pointers are invalid");
+      if (Result->isVoidTy())
+        return TokError("pointers to void are invalid - use i8* instead");
+      if (!PointerType::isValidElementType(Result))
+        return TokError("pointer to this type is invalid");
+      Result = PointerType::getUnqual(Result);
+      Lex.Lex();
+      break;
+
+    // Type ::= Type 'addrspace' '(' uint32 ')' '*'
+    case lltok::kw_addrspace: {
+      if (Result->isLabelTy())
+        return TokError("basic block pointers are invalid");
+      if (Result->isVoidTy())
+        return TokError("pointers to void are invalid; use i8* instead");
+      if (!PointerType::isValidElementType(Result))
+        return TokError("pointer to this type is invalid");
+      unsigned AddrSpace;
+      if (ParseOptionalAddrSpace(AddrSpace) ||
+          ParseToken(lltok::star, "expected '*' in address space"))
+        return true;
+
+      Result = PointerType::get(Result, AddrSpace);
+      break;
+    }
+
+    /// Types '(' ArgTypeListI ')' OptFuncAttrs
+    case lltok::lparen:
+      if (ParseFunctionType(Result))
+        return true;
+      break;
+    }
+  }
+}
+
+/// ParseParameterList
+///    ::= '(' ')'
+///    ::= '(' Arg (',' Arg)* ')'
+///  Arg
+///    ::= Type OptionalAttributes Value OptionalAttributes
+bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
+                                  PerFunctionState &PFS) {
+  if (ParseToken(lltok::lparen, "expected '(' in call"))
+    return true;
+
+  unsigned AttrIndex = 1;
+  while (Lex.getKind() != lltok::rparen) {
+    // If this isn't the first argument, we need a comma.
+    if (!ArgList.empty() &&
+        ParseToken(lltok::comma, "expected ',' in argument list"))
+      return true;
+
+    // Parse the argument.
+    LocTy ArgLoc;
+    Type *ArgTy = 0;
+    AttrBuilder ArgAttrs;
+    Value *V;
+    if (ParseType(ArgTy, ArgLoc))
+      return true;
+
+    // Otherwise, handle normal operands.
+    if (ParseOptionalParamAttrs(ArgAttrs) || ParseValue(ArgTy, V, PFS))
+      return true;
+    ArgList.push_back(ParamInfo(ArgLoc, V, AttributeSet::get(V->getContext(),
+                                                             AttrIndex++,
+                                                             ArgAttrs)));
+  }
+
+  Lex.Lex();  // Lex the ')'.
+  return false;
+}
+
+
+
+/// ParseArgumentList - Parse the argument list for a function type or function
+/// prototype.
+///   ::= '(' ArgTypeListI ')'
+/// ArgTypeListI
+///   ::= /*empty*/
+///   ::= '...'
+///   ::= ArgTypeList ',' '...'
+///   ::= ArgType (',' ArgType)*
+///
+bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
+                                 bool &isVarArg){
+  isVarArg = false;
+  assert(Lex.getKind() == lltok::lparen);
+  Lex.Lex(); // eat the (.
+
+  if (Lex.getKind() == lltok::rparen) {
+    // empty
+  } else if (Lex.getKind() == lltok::dotdotdot) {
+    isVarArg = true;
+    Lex.Lex();
+  } else {
+    LocTy TypeLoc = Lex.getLoc();
+    Type *ArgTy = 0;
+    AttrBuilder Attrs;
+    std::string Name;
+
+    if (ParseType(ArgTy) ||
+        ParseOptionalParamAttrs(Attrs)) return true;
+
+    if (ArgTy->isVoidTy())
+      return Error(TypeLoc, "argument can not have void type");
+
+    if (Lex.getKind() == lltok::LocalVar) {
+      Name = Lex.getStrVal();
+      Lex.Lex();
+    }
+
+    if (!FunctionType::isValidArgumentType(ArgTy))
+      return Error(TypeLoc, "invalid type for function argument");
+
+    unsigned AttrIndex = 1;
+    ArgList.push_back(ArgInfo(TypeLoc, ArgTy,
+                              AttributeSet::get(ArgTy->getContext(),
+                                                AttrIndex++, Attrs), Name));
+
+    while (EatIfPresent(lltok::comma)) {
+      // Handle ... at end of arg list.
+      if (EatIfPresent(lltok::dotdotdot)) {
+        isVarArg = true;
+        break;
+      }
+
+      // Otherwise must be an argument type.
+      TypeLoc = Lex.getLoc();
+      if (ParseType(ArgTy) || ParseOptionalParamAttrs(Attrs)) return true;
+
+      if (ArgTy->isVoidTy())
+        return Error(TypeLoc, "argument can not have void type");
+
+      if (Lex.getKind() == lltok::LocalVar) {
+        Name = Lex.getStrVal();
+        Lex.Lex();
+      } else {
+        Name = "";
+      }
+
+      if (!ArgTy->isFirstClassType())
+        return Error(TypeLoc, "invalid type for function argument");
+
+      ArgList.push_back(ArgInfo(TypeLoc, ArgTy,
+                                AttributeSet::get(ArgTy->getContext(),
+                                                  AttrIndex++, Attrs),
+                                Name));
+    }
+  }
+
+  return ParseToken(lltok::rparen, "expected ')' at end of argument list");
+}
+
+/// ParseFunctionType
+///  ::= Type ArgumentList OptionalAttrs
+bool LLParser::ParseFunctionType(Type *&Result) {
+  assert(Lex.getKind() == lltok::lparen);
+
+  if (!FunctionType::isValidReturnType(Result))
+    return TokError("invalid function return type");
+
+  SmallVector<ArgInfo, 8> ArgList;
+  bool isVarArg;
+  if (ParseArgumentList(ArgList, isVarArg))
+    return true;
+
+  // Reject names on the arguments lists.
+  for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
+    if (!ArgList[i].Name.empty())
+      return Error(ArgList[i].Loc, "argument name invalid in function type");
+    if (ArgList[i].Attrs.hasAttributes(i + 1))
+      return Error(ArgList[i].Loc,
+                   "argument attributes invalid in function type");
+  }
+
+  SmallVector<Type*, 16> ArgListTy;
+  for (unsigned i = 0, e = ArgList.size(); i != e; ++i)
+    ArgListTy.push_back(ArgList[i].Ty);
+
+  Result = FunctionType::get(Result, ArgListTy, isVarArg);
+  return false;
+}
+
+/// ParseAnonStructType - Parse an anonymous struct type, which is inlined into
+/// other structs.
+bool LLParser::ParseAnonStructType(Type *&Result, bool Packed) {
+  SmallVector<Type*, 8> Elts;
+  if (ParseStructBody(Elts)) return true;
+
+  Result = StructType::get(Context, Elts, Packed);
+  return false;
+}
+
+/// ParseStructDefinition - Parse a struct in a 'type' definition.
+bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name,
+                                     std::pair<Type*, LocTy> &Entry,
+                                     Type *&ResultTy) {
+  // If the type was already defined, diagnose the redefinition.
+  if (Entry.first && !Entry.second.isValid())
+    return Error(TypeLoc, "redefinition of type");
+
+  // If we have opaque, just return without filling in the definition for the
+  // struct.  This counts as a definition as far as the .ll file goes.
+  if (EatIfPresent(lltok::kw_opaque)) {
+    // This type is being defined, so clear the location to indicate this.
+    Entry.second = SMLoc();
+
+    // If this type number has never been uttered, create it.
+    if (Entry.first == 0)
+      Entry.first = StructType::create(Context, Name);
+    ResultTy = Entry.first;
+    return false;
+  }
+
+  // If the type starts with '<', then it is either a packed struct or a vector.
+  bool isPacked = EatIfPresent(lltok::less);
+
+  // If we don't have a struct, then we have a random type alias, which we
+  // accept for compatibility with old files.  These types are not allowed to be
+  // forward referenced and not allowed to be recursive.
+  if (Lex.getKind() != lltok::lbrace) {
+    if (Entry.first)
+      return Error(TypeLoc, "forward references to non-struct type");
+
+    ResultTy = 0;
+    if (isPacked)
+      return ParseArrayVectorType(ResultTy, true);
+    return ParseType(ResultTy);
+  }
+
+  // This type is being defined, so clear the location to indicate this.
+  Entry.second = SMLoc();
+
+  // If this type number has never been uttered, create it.
+  if (Entry.first == 0)
+    Entry.first = StructType::create(Context, Name);
+
+  StructType *STy = cast<StructType>(Entry.first);
+
+  SmallVector<Type*, 8> Body;
+  if (ParseStructBody(Body) ||
+      (isPacked && ParseToken(lltok::greater, "expected '>' in packed struct")))
+    return true;
+
+  STy->setBody(Body, isPacked);
+  ResultTy = STy;
+  return false;
+}
+
+
+/// ParseStructType: Handles packed and unpacked types.  </> parsed elsewhere.
+///   StructType
+///     ::= '{' '}'
+///     ::= '{' Type (',' Type)* '}'
+///     ::= '<' '{' '}' '>'
+///     ::= '<' '{' Type (',' Type)* '}' '>'
+bool LLParser::ParseStructBody(SmallVectorImpl<Type*> &Body) {
+  assert(Lex.getKind() == lltok::lbrace);
+  Lex.Lex(); // Consume the '{'
+
+  // Handle the empty struct.
+  if (EatIfPresent(lltok::rbrace))
+    return false;
+
+  LocTy EltTyLoc = Lex.getLoc();
+  Type *Ty = 0;
+  if (ParseType(Ty)) return true;
+  Body.push_back(Ty);
+
+  if (!StructType::isValidElementType(Ty))
+    return Error(EltTyLoc, "invalid element type for struct");
+
+  while (EatIfPresent(lltok::comma)) {
+    EltTyLoc = Lex.getLoc();
+    if (ParseType(Ty)) return true;
+
+    if (!StructType::isValidElementType(Ty))
+      return Error(EltTyLoc, "invalid element type for struct");
+
+    Body.push_back(Ty);
+  }
+
+  return ParseToken(lltok::rbrace, "expected '}' at end of struct");
+}
+
+/// ParseArrayVectorType - Parse an array or vector type, assuming the first
+/// token has already been consumed.
+///   Type
+///     ::= '[' APSINTVAL 'x' Types ']'
+///     ::= '<' APSINTVAL 'x' Types '>'
+bool LLParser::ParseArrayVectorType(Type *&Result, bool isVector) {
+  if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned() ||
+      Lex.getAPSIntVal().getBitWidth() > 64)
+    return TokError("expected number in address space");
+
+  LocTy SizeLoc = Lex.getLoc();
+  uint64_t Size = Lex.getAPSIntVal().getZExtValue();
+  Lex.Lex();
+
+  if (ParseToken(lltok::kw_x, "expected 'x' after element count"))
+      return true;
+
+  LocTy TypeLoc = Lex.getLoc();
+  Type *EltTy = 0;
+  if (ParseType(EltTy)) return true;
+
+  if (ParseToken(isVector ? lltok::greater : lltok::rsquare,
+                 "expected end of sequential type"))
+    return true;
+
+  if (isVector) {
+    if (Size == 0)
+      return Error(SizeLoc, "zero element vector is illegal");
+    if ((unsigned)Size != Size)
+      return Error(SizeLoc, "size too large for vector");
+    if (!VectorType::isValidElementType(EltTy))
+      return Error(TypeLoc, "invalid vector element type");
+    Result = VectorType::get(EltTy, unsigned(Size));
+  } else {
+    if (!ArrayType::isValidElementType(EltTy))
+      return Error(TypeLoc, "invalid array element type");
+    Result = ArrayType::get(EltTy, Size);
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Function Semantic Analysis.
+//===----------------------------------------------------------------------===//
+
+LLParser::PerFunctionState::PerFunctionState(LLParser &p, Function &f,
+                                             int functionNumber)
+  : P(p), F(f), FunctionNumber(functionNumber) {
+
+  // Insert unnamed arguments into the NumberedVals list.
+  for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end();
+       AI != E; ++AI)
+    if (!AI->hasName())
+      NumberedVals.push_back(AI);
+}
+
+LLParser::PerFunctionState::~PerFunctionState() {
+  // If there were any forward referenced non-basicblock values, delete them.
+  for (std::map<std::string, std::pair<Value*, LocTy> >::iterator
+       I = ForwardRefVals.begin(), E = ForwardRefVals.end(); I != E; ++I)
+    if (!isa<BasicBlock>(I->second.first)) {
+      I->second.first->replaceAllUsesWith(
+                           UndefValue::get(I->second.first->getType()));
+      delete I->second.first;
+      I->second.first = 0;
+    }
+
+  for (std::map<unsigned, std::pair<Value*, LocTy> >::iterator
+       I = ForwardRefValIDs.begin(), E = ForwardRefValIDs.end(); I != E; ++I)
+    if (!isa<BasicBlock>(I->second.first)) {
+      I->second.first->replaceAllUsesWith(
+                           UndefValue::get(I->second.first->getType()));
+      delete I->second.first;
+      I->second.first = 0;
+    }
+}
+
+bool LLParser::PerFunctionState::FinishFunction() {
+  // Check to see if someone took the address of labels in this block.
+  if (!P.ForwardRefBlockAddresses.empty()) {
+    ValID FunctionID;
+    if (!F.getName().empty()) {
+      FunctionID.Kind = ValID::t_GlobalName;
+      FunctionID.StrVal = F.getName();
+    } else {
+      FunctionID.Kind = ValID::t_GlobalID;
+      FunctionID.UIntVal = FunctionNumber;
+    }
+
+    std::map<ValID, std::vector<std::pair<ValID, GlobalValue*> > >::iterator
+      FRBAI = P.ForwardRefBlockAddresses.find(FunctionID);
+    if (FRBAI != P.ForwardRefBlockAddresses.end()) {
+      // Resolve all these references.
+      if (P.ResolveForwardRefBlockAddresses(&F, FRBAI->second, this))
+        return true;
+
+      P.ForwardRefBlockAddresses.erase(FRBAI);
+    }
+  }
+
+  if (!ForwardRefVals.empty())
+    return P.Error(ForwardRefVals.begin()->second.second,
+                   "use of undefined value '%" + ForwardRefVals.begin()->first +
+                   "'");
+  if (!ForwardRefValIDs.empty())
+    return P.Error(ForwardRefValIDs.begin()->second.second,
+                   "use of undefined value '%" +
+                   Twine(ForwardRefValIDs.begin()->first) + "'");
+  return false;
+}
+
+
+/// GetVal - Get a value with the specified name or ID, creating a
+/// forward reference record if needed.  This can return null if the value
+/// exists but does not have the right type.
+Value *LLParser::PerFunctionState::GetVal(const std::string &Name,
+                                          Type *Ty, LocTy Loc) {
+  // Look this name up in the normal function symbol table.
+  Value *Val = F.getValueSymbolTable().lookup(Name);
+
+  // If this is a forward reference for the value, see if we already created a
+  // forward ref record.
+  if (Val == 0) {
+    std::map<std::string, std::pair<Value*, LocTy> >::iterator
+      I = ForwardRefVals.find(Name);
+    if (I != ForwardRefVals.end())
+      Val = I->second.first;
+  }
+
+  // If we have the value in the symbol table or fwd-ref table, return it.
+  if (Val) {
+    if (Val->getType() == Ty) return Val;
+    if (Ty->isLabelTy())
+      P.Error(Loc, "'%" + Name + "' is not a basic block");
+    else
+      P.Error(Loc, "'%" + Name + "' defined with type '" +
+              getTypeString(Val->getType()) + "'");
+    return 0;
+  }
+
+  // Don't make placeholders with invalid type.
+  if (!Ty->isFirstClassType() && !Ty->isLabelTy()) {
+    P.Error(Loc, "invalid use of a non-first-class type");
+    return 0;
+  }
+
+  // Otherwise, create a new forward reference for this value and remember it.
+  Value *FwdVal;
+  if (Ty->isLabelTy())
+    FwdVal = BasicBlock::Create(F.getContext(), Name, &F);
+  else
+    FwdVal = new Argument(Ty, Name);
+
+  ForwardRefVals[Name] = std::make_pair(FwdVal, Loc);
+  return FwdVal;
+}
+
+Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty,
+                                          LocTy Loc) {
+  // Look this name up in the normal function symbol table.
+  Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : 0;
+
+  // If this is a forward reference for the value, see if we already created a
+  // forward ref record.
+  if (Val == 0) {
+    std::map<unsigned, std::pair<Value*, LocTy> >::iterator
+      I = ForwardRefValIDs.find(ID);
+    if (I != ForwardRefValIDs.end())
+      Val = I->second.first;
+  }
+
+  // If we have the value in the symbol table or fwd-ref table, return it.
+  if (Val) {
+    if (Val->getType() == Ty) return Val;
+    if (Ty->isLabelTy())
+      P.Error(Loc, "'%" + Twine(ID) + "' is not a basic block");
+    else
+      P.Error(Loc, "'%" + Twine(ID) + "' defined with type '" +
+              getTypeString(Val->getType()) + "'");
+    return 0;
+  }
+
+  if (!Ty->isFirstClassType() && !Ty->isLabelTy()) {
+    P.Error(Loc, "invalid use of a non-first-class type");
+    return 0;
+  }
+
+  // Otherwise, create a new forward reference for this value and remember it.
+  Value *FwdVal;
+  if (Ty->isLabelTy())
+    FwdVal = BasicBlock::Create(F.getContext(), "", &F);
+  else
+    FwdVal = new Argument(Ty);
+
+  ForwardRefValIDs[ID] = std::make_pair(FwdVal, Loc);
+  return FwdVal;
+}
+
+/// SetInstName - After an instruction is parsed and inserted into its
+/// basic block, this installs its name.
+bool LLParser::PerFunctionState::SetInstName(int NameID,
+                                             const std::string &NameStr,
+                                             LocTy NameLoc, Instruction *Inst) {
+  // If this instruction has void type, it cannot have a name or ID specified.
+  if (Inst->getType()->isVoidTy()) {
+    if (NameID != -1 || !NameStr.empty())
+      return P.Error(NameLoc, "instructions returning void cannot have a name");
+    return false;
+  }
+
+  // If this was a numbered instruction, verify that the instruction is the
+  // expected value and resolve any forward references.
+  if (NameStr.empty()) {
+    // If neither a name nor an ID was specified, just use the next ID.
+    if (NameID == -1)
+      NameID = NumberedVals.size();
+
+    if (unsigned(NameID) != NumberedVals.size())
+      return P.Error(NameLoc, "instruction expected to be numbered '%" +
+                     Twine(NumberedVals.size()) + "'");
+
+    std::map<unsigned, std::pair<Value*, LocTy> >::iterator FI =
+      ForwardRefValIDs.find(NameID);
+    if (FI != ForwardRefValIDs.end()) {
+      if (FI->second.first->getType() != Inst->getType())
+        return P.Error(NameLoc, "instruction forward referenced with type '" +
+                       getTypeString(FI->second.first->getType()) + "'");
+      FI->second.first->replaceAllUsesWith(Inst);
+      delete FI->second.first;
+      ForwardRefValIDs.erase(FI);
+    }
+
+    NumberedVals.push_back(Inst);
+    return false;
+  }
+
+  // Otherwise, the instruction had a name.  Resolve forward refs and set it.
+  std::map<std::string, std::pair<Value*, LocTy> >::iterator
+    FI = ForwardRefVals.find(NameStr);
+  if (FI != ForwardRefVals.end()) {
+    if (FI->second.first->getType() != Inst->getType())
+      return P.Error(NameLoc, "instruction forward referenced with type '" +
+                     getTypeString(FI->second.first->getType()) + "'");
+    FI->second.first->replaceAllUsesWith(Inst);
+    delete FI->second.first;
+    ForwardRefVals.erase(FI);
+  }
+
+  // Set the name on the instruction.
+  Inst->setName(NameStr);
+
+  if (Inst->getName() != NameStr)
+    return P.Error(NameLoc, "multiple definition of local value named '" +
+                   NameStr + "'");
+  return false;
+}
+
+/// GetBB - Get a basic block with the specified name or ID, creating a
+/// forward reference record if needed.
+BasicBlock *LLParser::PerFunctionState::GetBB(const std::string &Name,
+                                              LocTy Loc) {
+  return cast_or_null<BasicBlock>(GetVal(Name,
+                                        Type::getLabelTy(F.getContext()), Loc));
+}
+
+BasicBlock *LLParser::PerFunctionState::GetBB(unsigned ID, LocTy Loc) {
+  return cast_or_null<BasicBlock>(GetVal(ID,
+                                        Type::getLabelTy(F.getContext()), Loc));
+}
+
+/// DefineBB - Define the specified basic block, which is either named or
+/// unnamed.  If there is an error, this returns null otherwise it returns
+/// the block being defined.
+BasicBlock *LLParser::PerFunctionState::DefineBB(const std::string &Name,
+                                                 LocTy Loc) {
+  BasicBlock *BB;
+  if (Name.empty())
+    BB = GetBB(NumberedVals.size(), Loc);
+  else
+    BB = GetBB(Name, Loc);
+  if (BB == 0) return 0; // Already diagnosed error.
+
+  // Move the block to the end of the function.  Forward ref'd blocks are
+  // inserted wherever they happen to be referenced.
+  F.getBasicBlockList().splice(F.end(), F.getBasicBlockList(), BB);
+
+  // Remove the block from forward ref sets.
+  if (Name.empty()) {
+    ForwardRefValIDs.erase(NumberedVals.size());
+    NumberedVals.push_back(BB);
+  } else {
+    // BB forward references are already in the function symbol table.
+    ForwardRefVals.erase(Name);
+  }
+
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Constants.
+//===----------------------------------------------------------------------===//
+
+/// ParseValID - Parse an abstract value that doesn't necessarily have a
+/// type implied.  For example, if we parse "4" we don't know what integer type
+/// it has.  The value will later be combined with its type and checked for
+/// sanity.  PFS is used to convert function-local operands of metadata (since
+/// metadata operands are not just parsed here but also converted to values).
+/// PFS can be null when we are not parsing metadata values inside a function.
+bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
+  ID.Loc = Lex.getLoc();
+  switch (Lex.getKind()) {
+  default: return TokError("expected value token");
+  case lltok::GlobalID:  // @42
+    ID.UIntVal = Lex.getUIntVal();
+    ID.Kind = ValID::t_GlobalID;
+    break;
+  case lltok::GlobalVar:  // @foo
+    ID.StrVal = Lex.getStrVal();
+    ID.Kind = ValID::t_GlobalName;
+    break;
+  case lltok::LocalVarID:  // %42
+    ID.UIntVal = Lex.getUIntVal();
+    ID.Kind = ValID::t_LocalID;
+    break;
+  case lltok::LocalVar:  // %foo
+    ID.StrVal = Lex.getStrVal();
+    ID.Kind = ValID::t_LocalName;
+    break;
+  case lltok::exclaim:   // !42, !{...}, or !"foo"
+    return ParseMetadataValue(ID, PFS);
+  case lltok::APSInt:
+    ID.APSIntVal = Lex.getAPSIntVal();
+    ID.Kind = ValID::t_APSInt;
+    break;
+  case lltok::APFloat:
+    ID.APFloatVal = Lex.getAPFloatVal();
+    ID.Kind = ValID::t_APFloat;
+    break;
+  case lltok::kw_true:
+    ID.ConstantVal = ConstantInt::getTrue(Context);
+    ID.Kind = ValID::t_Constant;
+    break;
+  case lltok::kw_false:
+    ID.ConstantVal = ConstantInt::getFalse(Context);
+    ID.Kind = ValID::t_Constant;
+    break;
+  case lltok::kw_null: ID.Kind = ValID::t_Null; break;
+  case lltok::kw_undef: ID.Kind = ValID::t_Undef; break;
+  case lltok::kw_zeroinitializer: ID.Kind = ValID::t_Zero; break;
+
+  case lltok::lbrace: {
+    // ValID ::= '{' ConstVector '}'
+    Lex.Lex();
+    SmallVector<Constant*, 16> Elts;
+    if (ParseGlobalValueVector(Elts) ||
+        ParseToken(lltok::rbrace, "expected end of struct constant"))
+      return true;
+
+    ID.ConstantStructElts = new Constant*[Elts.size()];
+    ID.UIntVal = Elts.size();
+    memcpy(ID.ConstantStructElts, Elts.data(), Elts.size()*sizeof(Elts[0]));
+    ID.Kind = ValID::t_ConstantStruct;
+    return false;
+  }
+  case lltok::less: {
+    // ValID ::= '<' ConstVector '>'         --> Vector.
+    // ValID ::= '<' '{' ConstVector '}' '>' --> Packed Struct.
+    Lex.Lex();
+    bool isPackedStruct = EatIfPresent(lltok::lbrace);
+
+    SmallVector<Constant*, 16> Elts;
+    LocTy FirstEltLoc = Lex.getLoc();
+    if (ParseGlobalValueVector(Elts) ||
+        (isPackedStruct &&
+         ParseToken(lltok::rbrace, "expected end of packed struct")) ||
+        ParseToken(lltok::greater, "expected end of constant"))
+      return true;
+
+    if (isPackedStruct) {
+      ID.ConstantStructElts = new Constant*[Elts.size()];
+      memcpy(ID.ConstantStructElts, Elts.data(), Elts.size()*sizeof(Elts[0]));
+      ID.UIntVal = Elts.size();
+      ID.Kind = ValID::t_PackedConstantStruct;
+      return false;
+    }
+
+    if (Elts.empty())
+      return Error(ID.Loc, "constant vector must not be empty");
+
+    if (!Elts[0]->getType()->isIntegerTy() &&
+        !Elts[0]->getType()->isFloatingPointTy() &&
+        !Elts[0]->getType()->isPointerTy())
+      return Error(FirstEltLoc,
+            "vector elements must have integer, pointer or floating point type");
+
+    // Verify that all the vector elements have the same type.
+    for (unsigned i = 1, e = Elts.size(); i != e; ++i)
+      if (Elts[i]->getType() != Elts[0]->getType())
+        return Error(FirstEltLoc,
+                     "vector element #" + Twine(i) +
+                    " is not of type '" + getTypeString(Elts[0]->getType()));
+
+    ID.ConstantVal = ConstantVector::get(Elts);
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+  case lltok::lsquare: {   // Array Constant
+    Lex.Lex();
+    SmallVector<Constant*, 16> Elts;
+    LocTy FirstEltLoc = Lex.getLoc();
+    if (ParseGlobalValueVector(Elts) ||
+        ParseToken(lltok::rsquare, "expected end of array constant"))
+      return true;
+
+    // Handle empty element.
+    if (Elts.empty()) {
+      // Use undef instead of an array because it's inconvenient to determine
+      // the element type at this point, there being no elements to examine.
+      ID.Kind = ValID::t_EmptyArray;
+      return false;
+    }
+
+    if (!Elts[0]->getType()->isFirstClassType())
+      return Error(FirstEltLoc, "invalid array element type: " +
+                   getTypeString(Elts[0]->getType()));
+
+    ArrayType *ATy = ArrayType::get(Elts[0]->getType(), Elts.size());
+
+    // Verify all elements are correct type!
+    for (unsigned i = 0, e = Elts.size(); i != e; ++i) {
+      if (Elts[i]->getType() != Elts[0]->getType())
+        return Error(FirstEltLoc,
+                     "array element #" + Twine(i) +
+                     " is not of type '" + getTypeString(Elts[0]->getType()));
+    }
+
+    ID.ConstantVal = ConstantArray::get(ATy, Elts);
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+  case lltok::kw_c:  // c "foo"
+    Lex.Lex();
+    ID.ConstantVal = ConstantDataArray::getString(Context, Lex.getStrVal(),
+                                                  false);
+    if (ParseToken(lltok::StringConstant, "expected string")) return true;
+    ID.Kind = ValID::t_Constant;
+    return false;
+
+  case lltok::kw_asm: {
+    // ValID ::= 'asm' SideEffect? AlignStack? IntelDialect? STRINGCONSTANT ','
+    //             STRINGCONSTANT
+    bool HasSideEffect, AlignStack, AsmDialect;
+    Lex.Lex();
+    if (ParseOptionalToken(lltok::kw_sideeffect, HasSideEffect) ||
+        ParseOptionalToken(lltok::kw_alignstack, AlignStack) ||
+        ParseOptionalToken(lltok::kw_inteldialect, AsmDialect) ||
+        ParseStringConstant(ID.StrVal) ||
+        ParseToken(lltok::comma, "expected comma in inline asm expression") ||
+        ParseToken(lltok::StringConstant, "expected constraint string"))
+      return true;
+    ID.StrVal2 = Lex.getStrVal();
+    ID.UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack)<<1) |
+      (unsigned(AsmDialect)<<2);
+    ID.Kind = ValID::t_InlineAsm;
+    return false;
+  }
+
+  case lltok::kw_blockaddress: {
+    // ValID ::= 'blockaddress' '(' @foo ',' %bar ')'
+    Lex.Lex();
+
+    ValID Fn, Label;
+    LocTy FnLoc, LabelLoc;
+
+    if (ParseToken(lltok::lparen, "expected '(' in block address expression") ||
+        ParseValID(Fn) ||
+        ParseToken(lltok::comma, "expected comma in block address expression")||
+        ParseValID(Label) ||
+        ParseToken(lltok::rparen, "expected ')' in block address expression"))
+      return true;
+
+    if (Fn.Kind != ValID::t_GlobalID && Fn.Kind != ValID::t_GlobalName)
+      return Error(Fn.Loc, "expected function name in blockaddress");
+    if (Label.Kind != ValID::t_LocalID && Label.Kind != ValID::t_LocalName)
+      return Error(Label.Loc, "expected basic block name in blockaddress");
+
+    // Make a global variable as a placeholder for this reference.
+    GlobalVariable *FwdRef = new GlobalVariable(*M, Type::getInt8Ty(Context),
+                                           false, GlobalValue::InternalLinkage,
+                                                0, "");
+    ForwardRefBlockAddresses[Fn].push_back(std::make_pair(Label, FwdRef));
+    ID.ConstantVal = FwdRef;
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+
+  case lltok::kw_trunc:
+  case lltok::kw_zext:
+  case lltok::kw_sext:
+  case lltok::kw_fptrunc:
+  case lltok::kw_fpext:
+  case lltok::kw_bitcast:
+  case lltok::kw_uitofp:
+  case lltok::kw_sitofp:
+  case lltok::kw_fptoui:
+  case lltok::kw_fptosi:
+  case lltok::kw_inttoptr:
+  case lltok::kw_ptrtoint: {
+    unsigned Opc = Lex.getUIntVal();
+    Type *DestTy = 0;
+    Constant *SrcVal;
+    Lex.Lex();
+    if (ParseToken(lltok::lparen, "expected '(' after constantexpr cast") ||
+        ParseGlobalTypeAndValue(SrcVal) ||
+        ParseToken(lltok::kw_to, "expected 'to' in constantexpr cast") ||
+        ParseType(DestTy) ||
+        ParseToken(lltok::rparen, "expected ')' at end of constantexpr cast"))
+      return true;
+    if (!CastInst::castIsValid((Instruction::CastOps)Opc, SrcVal, DestTy))
+      return Error(ID.Loc, "invalid cast opcode for cast from '" +
+                   getTypeString(SrcVal->getType()) + "' to '" +
+                   getTypeString(DestTy) + "'");
+    ID.ConstantVal = ConstantExpr::getCast((Instruction::CastOps)Opc,
+                                                 SrcVal, DestTy);
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+  case lltok::kw_extractvalue: {
+    Lex.Lex();
+    Constant *Val;
+    SmallVector<unsigned, 4> Indices;
+    if (ParseToken(lltok::lparen, "expected '(' in extractvalue constantexpr")||
+        ParseGlobalTypeAndValue(Val) ||
+        ParseIndexList(Indices) ||
+        ParseToken(lltok::rparen, "expected ')' in extractvalue constantexpr"))
+      return true;
+
+    if (!Val->getType()->isAggregateType())
+      return Error(ID.Loc, "extractvalue operand must be aggregate type");
+    if (!ExtractValueInst::getIndexedType(Val->getType(), Indices))
+      return Error(ID.Loc, "invalid indices for extractvalue");
+    ID.ConstantVal = ConstantExpr::getExtractValue(Val, Indices);
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+  case lltok::kw_insertvalue: {
+    Lex.Lex();
+    Constant *Val0, *Val1;
+    SmallVector<unsigned, 4> Indices;
+    if (ParseToken(lltok::lparen, "expected '(' in insertvalue constantexpr")||
+        ParseGlobalTypeAndValue(Val0) ||
+        ParseToken(lltok::comma, "expected comma in insertvalue constantexpr")||
+        ParseGlobalTypeAndValue(Val1) ||
+        ParseIndexList(Indices) ||
+        ParseToken(lltok::rparen, "expected ')' in insertvalue constantexpr"))
+      return true;
+    if (!Val0->getType()->isAggregateType())
+      return Error(ID.Loc, "insertvalue operand must be aggregate type");
+    if (!ExtractValueInst::getIndexedType(Val0->getType(), Indices))
+      return Error(ID.Loc, "invalid indices for insertvalue");
+    ID.ConstantVal = ConstantExpr::getInsertValue(Val0, Val1, Indices);
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+  case lltok::kw_icmp:
+  case lltok::kw_fcmp: {
+    unsigned PredVal, Opc = Lex.getUIntVal();
+    Constant *Val0, *Val1;
+    Lex.Lex();
+    if (ParseCmpPredicate(PredVal, Opc) ||
+        ParseToken(lltok::lparen, "expected '(' in compare constantexpr") ||
+        ParseGlobalTypeAndValue(Val0) ||
+        ParseToken(lltok::comma, "expected comma in compare constantexpr") ||
+        ParseGlobalTypeAndValue(Val1) ||
+        ParseToken(lltok::rparen, "expected ')' in compare constantexpr"))
+      return true;
+
+    if (Val0->getType() != Val1->getType())
+      return Error(ID.Loc, "compare operands must have the same type");
+
+    CmpInst::Predicate Pred = (CmpInst::Predicate)PredVal;
+
+    if (Opc == Instruction::FCmp) {
+      if (!Val0->getType()->isFPOrFPVectorTy())
+        return Error(ID.Loc, "fcmp requires floating point operands");
+      ID.ConstantVal = ConstantExpr::getFCmp(Pred, Val0, Val1);
+    } else {
+      assert(Opc == Instruction::ICmp && "Unexpected opcode for CmpInst!");
+      if (!Val0->getType()->isIntOrIntVectorTy() &&
+          !Val0->getType()->getScalarType()->isPointerTy())
+        return Error(ID.Loc, "icmp requires pointer or integer operands");
+      ID.ConstantVal = ConstantExpr::getICmp(Pred, Val0, Val1);
+    }
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+
+  // Binary Operators.
+  case lltok::kw_add:
+  case lltok::kw_fadd:
+  case lltok::kw_sub:
+  case lltok::kw_fsub:
+  case lltok::kw_mul:
+  case lltok::kw_fmul:
+  case lltok::kw_udiv:
+  case lltok::kw_sdiv:
+  case lltok::kw_fdiv:
+  case lltok::kw_urem:
+  case lltok::kw_srem:
+  case lltok::kw_frem:
+  case lltok::kw_shl:
+  case lltok::kw_lshr:
+  case lltok::kw_ashr: {
+    bool NUW = false;
+    bool NSW = false;
+    bool Exact = false;
+    unsigned Opc = Lex.getUIntVal();
+    Constant *Val0, *Val1;
+    Lex.Lex();
+    LocTy ModifierLoc = Lex.getLoc();
+    if (Opc == Instruction::Add || Opc == Instruction::Sub ||
+        Opc == Instruction::Mul || Opc == Instruction::Shl) {
+      if (EatIfPresent(lltok::kw_nuw))
+        NUW = true;
+      if (EatIfPresent(lltok::kw_nsw)) {
+        NSW = true;
+        if (EatIfPresent(lltok::kw_nuw))
+          NUW = true;
+      }
+    } else if (Opc == Instruction::SDiv || Opc == Instruction::UDiv ||
+               Opc == Instruction::LShr || Opc == Instruction::AShr) {
+      if (EatIfPresent(lltok::kw_exact))
+        Exact = true;
+    }
+    if (ParseToken(lltok::lparen, "expected '(' in binary constantexpr") ||
+        ParseGlobalTypeAndValue(Val0) ||
+        ParseToken(lltok::comma, "expected comma in binary constantexpr") ||
+        ParseGlobalTypeAndValue(Val1) ||
+        ParseToken(lltok::rparen, "expected ')' in binary constantexpr"))
+      return true;
+    if (Val0->getType() != Val1->getType())
+      return Error(ID.Loc, "operands of constexpr must have same type");
+    if (!Val0->getType()->isIntOrIntVectorTy()) {
+      if (NUW)
+        return Error(ModifierLoc, "nuw only applies to integer operations");
+      if (NSW)
+        return Error(ModifierLoc, "nsw only applies to integer operations");
+    }
+    // Check that the type is valid for the operator.
+    switch (Opc) {
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::Shl:
+    case Instruction::AShr:
+    case Instruction::LShr:
+      if (!Val0->getType()->isIntOrIntVectorTy())
+        return Error(ID.Loc, "constexpr requires integer operands");
+      break;
+    case Instruction::FAdd:
+    case Instruction::FSub:
+    case Instruction::FMul:
+    case Instruction::FDiv:
+    case Instruction::FRem:
+      if (!Val0->getType()->isFPOrFPVectorTy())
+        return Error(ID.Loc, "constexpr requires fp operands");
+      break;
+    default: llvm_unreachable("Unknown binary operator!");
+    }
+    unsigned Flags = 0;
+    if (NUW)   Flags |= OverflowingBinaryOperator::NoUnsignedWrap;
+    if (NSW)   Flags |= OverflowingBinaryOperator::NoSignedWrap;
+    if (Exact) Flags |= PossiblyExactOperator::IsExact;
+    Constant *C = ConstantExpr::get(Opc, Val0, Val1, Flags);
+    ID.ConstantVal = C;
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+
+  // Logical Operations
+  case lltok::kw_and:
+  case lltok::kw_or:
+  case lltok::kw_xor: {
+    unsigned Opc = Lex.getUIntVal();
+    Constant *Val0, *Val1;
+    Lex.Lex();
+    if (ParseToken(lltok::lparen, "expected '(' in logical constantexpr") ||
+        ParseGlobalTypeAndValue(Val0) ||
+        ParseToken(lltok::comma, "expected comma in logical constantexpr") ||
+        ParseGlobalTypeAndValue(Val1) ||
+        ParseToken(lltok::rparen, "expected ')' in logical constantexpr"))
+      return true;
+    if (Val0->getType() != Val1->getType())
+      return Error(ID.Loc, "operands of constexpr must have same type");
+    if (!Val0->getType()->isIntOrIntVectorTy())
+      return Error(ID.Loc,
+                   "constexpr requires integer or integer vector operands");
+    ID.ConstantVal = ConstantExpr::get(Opc, Val0, Val1);
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+
+  case lltok::kw_getelementptr:
+  case lltok::kw_shufflevector:
+  case lltok::kw_insertelement:
+  case lltok::kw_extractelement:
+  case lltok::kw_select: {
+    unsigned Opc = Lex.getUIntVal();
+    SmallVector<Constant*, 16> Elts;
+    bool InBounds = false;
+    Lex.Lex();
+    if (Opc == Instruction::GetElementPtr)
+      InBounds = EatIfPresent(lltok::kw_inbounds);
+    if (ParseToken(lltok::lparen, "expected '(' in constantexpr") ||
+        ParseGlobalValueVector(Elts) ||
+        ParseToken(lltok::rparen, "expected ')' in constantexpr"))
+      return true;
+
+    if (Opc == Instruction::GetElementPtr) {
+      if (Elts.size() == 0 ||
+          !Elts[0]->getType()->getScalarType()->isPointerTy())
+        return Error(ID.Loc, "getelementptr requires pointer operand");
+
+      ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
+      if (!GetElementPtrInst::getIndexedType(Elts[0]->getType(), Indices))
+        return Error(ID.Loc, "invalid indices for getelementptr");
+      ID.ConstantVal = ConstantExpr::getGetElementPtr(Elts[0], Indices,
+                                                      InBounds);
+    } else if (Opc == Instruction::Select) {
+      if (Elts.size() != 3)
+        return Error(ID.Loc, "expected three operands to select");
+      if (const char *Reason = SelectInst::areInvalidOperands(Elts[0], Elts[1],
+                                                              Elts[2]))
+        return Error(ID.Loc, Reason);
+      ID.ConstantVal = ConstantExpr::getSelect(Elts[0], Elts[1], Elts[2]);
+    } else if (Opc == Instruction::ShuffleVector) {
+      if (Elts.size() != 3)
+        return Error(ID.Loc, "expected three operands to shufflevector");
+      if (!ShuffleVectorInst::isValidOperands(Elts[0], Elts[1], Elts[2]))
+        return Error(ID.Loc, "invalid operands to shufflevector");
+      ID.ConstantVal =
+                 ConstantExpr::getShuffleVector(Elts[0], Elts[1],Elts[2]);
+    } else if (Opc == Instruction::ExtractElement) {
+      if (Elts.size() != 2)
+        return Error(ID.Loc, "expected two operands to extractelement");
+      if (!ExtractElementInst::isValidOperands(Elts[0], Elts[1]))
+        return Error(ID.Loc, "invalid extractelement operands");
+      ID.ConstantVal = ConstantExpr::getExtractElement(Elts[0], Elts[1]);
+    } else {
+      assert(Opc == Instruction::InsertElement && "Unknown opcode");
+      if (Elts.size() != 3)
+      return Error(ID.Loc, "expected three operands to insertelement");
+      if (!InsertElementInst::isValidOperands(Elts[0], Elts[1], Elts[2]))
+        return Error(ID.Loc, "invalid insertelement operands");
+      ID.ConstantVal =
+                 ConstantExpr::getInsertElement(Elts[0], Elts[1],Elts[2]);
+    }
+
+    ID.Kind = ValID::t_Constant;
+    return false;
+  }
+  }
+
+  Lex.Lex();
+  return false;
+}
+
+/// ParseGlobalValue - Parse a global value with the specified type.
+bool LLParser::ParseGlobalValue(Type *Ty, Constant *&C) {
+  C = 0;
+  ValID ID;
+  Value *V = NULL;
+  bool Parsed = ParseValID(ID) ||
+                ConvertValIDToValue(Ty, ID, V, NULL);
+  if (V && !(C = dyn_cast<Constant>(V)))
+    return Error(ID.Loc, "global values must be constants");
+  return Parsed;
+}
+
+bool LLParser::ParseGlobalTypeAndValue(Constant *&V) {
+  Type *Ty = 0;
+  return ParseType(Ty) ||
+         ParseGlobalValue(Ty, V);
+}
+
+/// ParseGlobalValueVector
+///   ::= /*empty*/
+///   ::= TypeAndValue (',' TypeAndValue)*
+bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts) {
+  // Empty list.
+  if (Lex.getKind() == lltok::rbrace ||
+      Lex.getKind() == lltok::rsquare ||
+      Lex.getKind() == lltok::greater ||
+      Lex.getKind() == lltok::rparen)
+    return false;
+
+  Constant *C;
+  if (ParseGlobalTypeAndValue(C)) return true;
+  Elts.push_back(C);
+
+  while (EatIfPresent(lltok::comma)) {
+    if (ParseGlobalTypeAndValue(C)) return true;
+    Elts.push_back(C);
+  }
+
+  return false;
+}
+
+bool LLParser::ParseMetadataListValue(ValID &ID, PerFunctionState *PFS) {
+  assert(Lex.getKind() == lltok::lbrace);
+  Lex.Lex();
+
+  SmallVector<Value*, 16> Elts;
+  if (ParseMDNodeVector(Elts, PFS) ||
+      ParseToken(lltok::rbrace, "expected end of metadata node"))
+    return true;
+
+  ID.MDNodeVal = MDNode::get(Context, Elts);
+  ID.Kind = ValID::t_MDNode;
+  return false;
+}
+
+/// ParseMetadataValue
+///  ::= !42
+///  ::= !{...}
+///  ::= !"string"
+bool LLParser::ParseMetadataValue(ValID &ID, PerFunctionState *PFS) {
+  assert(Lex.getKind() == lltok::exclaim);
+  Lex.Lex();
+
+  // MDNode:
+  // !{ ... }
+  if (Lex.getKind() == lltok::lbrace)
+    return ParseMetadataListValue(ID, PFS);
+
+  // Standalone metadata reference
+  // !42
+  if (Lex.getKind() == lltok::APSInt) {
+    if (ParseMDNodeID(ID.MDNodeVal)) return true;
+    ID.Kind = ValID::t_MDNode;
+    return false;
+  }
+
+  // MDString:
+  //   ::= '!' STRINGCONSTANT
+  if (ParseMDString(ID.MDStringVal)) return true;
+  ID.Kind = ValID::t_MDString;
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Function Parsing.
+//===----------------------------------------------------------------------===//
+
+bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
+                                   PerFunctionState *PFS) {
+  if (Ty->isFunctionTy())
+    return Error(ID.Loc, "functions are not values, refer to them as pointers");
+
+  switch (ID.Kind) {
+  case ValID::t_LocalID:
+    if (!PFS) return Error(ID.Loc, "invalid use of function-local name");
+    V = PFS->GetVal(ID.UIntVal, Ty, ID.Loc);
+    return (V == 0);
+  case ValID::t_LocalName:
+    if (!PFS) return Error(ID.Loc, "invalid use of function-local name");
+    V = PFS->GetVal(ID.StrVal, Ty, ID.Loc);
+    return (V == 0);
+  case ValID::t_InlineAsm: {
+    PointerType *PTy = dyn_cast<PointerType>(Ty);
+    FunctionType *FTy =
+      PTy ? dyn_cast<FunctionType>(PTy->getElementType()) : 0;
+    if (!FTy || !InlineAsm::Verify(FTy, ID.StrVal2))
+      return Error(ID.Loc, "invalid type for inline asm constraint string");
+    V = InlineAsm::get(FTy, ID.StrVal, ID.StrVal2, ID.UIntVal&1,
+                       (ID.UIntVal>>1)&1, (InlineAsm::AsmDialect(ID.UIntVal>>2)));
+    return false;
+  }
+  case ValID::t_MDNode:
+    if (!Ty->isMetadataTy())
+      return Error(ID.Loc, "metadata value must have metadata type");
+    V = ID.MDNodeVal;
+    return false;
+  case ValID::t_MDString:
+    if (!Ty->isMetadataTy())
+      return Error(ID.Loc, "metadata value must have metadata type");
+    V = ID.MDStringVal;
+    return false;
+  case ValID::t_GlobalName:
+    V = GetGlobalVal(ID.StrVal, Ty, ID.Loc);
+    return V == 0;
+  case ValID::t_GlobalID:
+    V = GetGlobalVal(ID.UIntVal, Ty, ID.Loc);
+    return V == 0;
+  case ValID::t_APSInt:
+    if (!Ty->isIntegerTy())
+      return Error(ID.Loc, "integer constant must have integer type");
+    ID.APSIntVal = ID.APSIntVal.extOrTrunc(Ty->getPrimitiveSizeInBits());
+    V = ConstantInt::get(Context, ID.APSIntVal);
+    return false;
+  case ValID::t_APFloat:
+    if (!Ty->isFloatingPointTy() ||
+        !ConstantFP::isValueValidForType(Ty, ID.APFloatVal))
+      return Error(ID.Loc, "floating point constant invalid for type");
+
+    // The lexer has no type info, so builds all half, float, and double FP
+    // constants as double.  Fix this here.  Long double does not need this.
+    if (&ID.APFloatVal.getSemantics() == &APFloat::IEEEdouble) {
+      bool Ignored;
+      if (Ty->isHalfTy())
+        ID.APFloatVal.convert(APFloat::IEEEhalf, APFloat::rmNearestTiesToEven,
+                              &Ignored);
+      else if (Ty->isFloatTy())
+        ID.APFloatVal.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven,
+                              &Ignored);
+    }
+    V = ConstantFP::get(Context, ID.APFloatVal);
+
+    if (V->getType() != Ty)
+      return Error(ID.Loc, "floating point constant does not have type '" +
+                   getTypeString(Ty) + "'");
+
+    return false;
+  case ValID::t_Null:
+    if (!Ty->isPointerTy())
+      return Error(ID.Loc, "null must be a pointer type");
+    V = ConstantPointerNull::get(cast<PointerType>(Ty));
+    return false;
+  case ValID::t_Undef:
+    // FIXME: LabelTy should not be a first-class type.
+    if (!Ty->isFirstClassType() || Ty->isLabelTy())
+      return Error(ID.Loc, "invalid type for undef constant");
+    V = UndefValue::get(Ty);
+    return false;
+  case ValID::t_EmptyArray:
+    if (!Ty->isArrayTy() || cast<ArrayType>(Ty)->getNumElements() != 0)
+      return Error(ID.Loc, "invalid empty array initializer");
+    V = UndefValue::get(Ty);
+    return false;
+  case ValID::t_Zero:
+    // FIXME: LabelTy should not be a first-class type.
+    if (!Ty->isFirstClassType() || Ty->isLabelTy())
+      return Error(ID.Loc, "invalid type for null constant");
+    V = Constant::getNullValue(Ty);
+    return false;
+  case ValID::t_Constant:
+    if (ID.ConstantVal->getType() != Ty)
+      return Error(ID.Loc, "constant expression type mismatch");
+
+    V = ID.ConstantVal;
+    return false;
+  case ValID::t_ConstantStruct:
+  case ValID::t_PackedConstantStruct:
+    if (StructType *ST = dyn_cast<StructType>(Ty)) {
+      if (ST->getNumElements() != ID.UIntVal)
+        return Error(ID.Loc,
+                     "initializer with struct type has wrong # elements");
+      if (ST->isPacked() != (ID.Kind == ValID::t_PackedConstantStruct))
+        return Error(ID.Loc, "packed'ness of initializer and type don't match");
+
+      // Verify that the elements are compatible with the structtype.
+      for (unsigned i = 0, e = ID.UIntVal; i != e; ++i)
+        if (ID.ConstantStructElts[i]->getType() != ST->getElementType(i))
+          return Error(ID.Loc, "element " + Twine(i) +
+                    " of struct initializer doesn't match struct element type");
+
+      V = ConstantStruct::get(ST, makeArrayRef(ID.ConstantStructElts,
+                                               ID.UIntVal));
+    } else
+      return Error(ID.Loc, "constant expression type mismatch");
+    return false;
+  }
+  llvm_unreachable("Invalid ValID");
+}
+
+bool LLParser::ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS) {
+  V = 0;
+  ValID ID;
+  return ParseValID(ID, PFS) ||
+         ConvertValIDToValue(Ty, ID, V, PFS);
+}
+
+bool LLParser::ParseTypeAndValue(Value *&V, PerFunctionState *PFS) {
+  Type *Ty = 0;
+  return ParseType(Ty) ||
+         ParseValue(Ty, V, PFS);
+}
+
+bool LLParser::ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc,
+                                      PerFunctionState &PFS) {
+  Value *V;
+  Loc = Lex.getLoc();
+  if (ParseTypeAndValue(V, PFS)) return true;
+  if (!isa<BasicBlock>(V))
+    return Error(Loc, "expected a basic block");
+  BB = cast<BasicBlock>(V);
+  return false;
+}
+
+
+/// FunctionHeader
+///   ::= OptionalLinkage OptionalVisibility OptionalCallingConv OptRetAttrs
+///       OptUnnamedAddr Type GlobalName '(' ArgList ')' OptFuncAttrs OptSection
+///       OptionalAlign OptGC
+bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
+  // Parse the linkage.
+  LocTy LinkageLoc = Lex.getLoc();
+  unsigned Linkage;
+
+  unsigned Visibility;
+  AttrBuilder RetAttrs;
+  CallingConv::ID CC;
+  Type *RetType = 0;
+  LocTy RetTypeLoc = Lex.getLoc();
+  if (ParseOptionalLinkage(Linkage) ||
+      ParseOptionalVisibility(Visibility) ||
+      ParseOptionalCallingConv(CC) ||
+      ParseOptionalReturnAttrs(RetAttrs) ||
+      ParseType(RetType, RetTypeLoc, true /*void allowed*/))
+    return true;
+
+  // Verify that the linkage is ok.
+  switch ((GlobalValue::LinkageTypes)Linkage) {
+  case GlobalValue::ExternalLinkage:
+    break; // always ok.
+  case GlobalValue::DLLImportLinkage:
+  case GlobalValue::ExternalWeakLinkage:
+    if (isDefine)
+      return Error(LinkageLoc, "invalid linkage for function definition");
+    break;
+  case GlobalValue::PrivateLinkage:
+  case GlobalValue::LinkerPrivateLinkage:
+  case GlobalValue::LinkerPrivateWeakLinkage:
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::AvailableExternallyLinkage:
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage:
+  case GlobalValue::LinkOnceODRAutoHideLinkage:
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:
+  case GlobalValue::DLLExportLinkage:
+    if (!isDefine)
+      return Error(LinkageLoc, "invalid linkage for function declaration");
+    break;
+  case GlobalValue::AppendingLinkage:
+  case GlobalValue::CommonLinkage:
+    return Error(LinkageLoc, "invalid function linkage type");
+  }
+
+  if (!FunctionType::isValidReturnType(RetType))
+    return Error(RetTypeLoc, "invalid function return type");
+
+  LocTy NameLoc = Lex.getLoc();
+
+  std::string FunctionName;
+  if (Lex.getKind() == lltok::GlobalVar) {
+    FunctionName = Lex.getStrVal();
+  } else if (Lex.getKind() == lltok::GlobalID) {     // @42 is ok.
+    unsigned NameID = Lex.getUIntVal();
+
+    if (NameID != NumberedVals.size())
+      return TokError("function expected to be numbered '%" +
+                      Twine(NumberedVals.size()) + "'");
+  } else {
+    return TokError("expected function name");
+  }
+
+  Lex.Lex();
+
+  if (Lex.getKind() != lltok::lparen)
+    return TokError("expected '(' in function argument list");
+
+  SmallVector<ArgInfo, 8> ArgList;
+  bool isVarArg;
+  AttrBuilder FuncAttrs;
+  std::vector<unsigned> FwdRefAttrGrps;
+  LocTy NoBuiltinLoc;
+  std::string Section;
+  unsigned Alignment;
+  std::string GC;
+  bool UnnamedAddr;
+  LocTy UnnamedAddrLoc;
+
+  if (ParseArgumentList(ArgList, isVarArg) ||
+      ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr,
+                         &UnnamedAddrLoc) ||
+      ParseFnAttributeValuePairs(FuncAttrs, FwdRefAttrGrps, false,
+                                 NoBuiltinLoc) ||
+      (EatIfPresent(lltok::kw_section) &&
+       ParseStringConstant(Section)) ||
+      ParseOptionalAlignment(Alignment) ||
+      (EatIfPresent(lltok::kw_gc) &&
+       ParseStringConstant(GC)))
+    return true;
+
+  if (FuncAttrs.contains(Attribute::NoBuiltin))
+    return Error(NoBuiltinLoc, "'nobuiltin' attribute not valid on function");
+
+  // If the alignment was parsed as an attribute, move to the alignment field.
+  if (FuncAttrs.hasAlignmentAttr()) {
+    Alignment = FuncAttrs.getAlignment();
+    FuncAttrs.removeAttribute(Attribute::Alignment);
+  }
+
+  // Okay, if we got here, the function is syntactically valid.  Convert types
+  // and do semantic checks.
+  std::vector<Type*> ParamTypeList;
+  SmallVector<AttributeSet, 8> Attrs;
+
+  if (RetAttrs.hasAttributes())
+    Attrs.push_back(AttributeSet::get(RetType->getContext(),
+                                      AttributeSet::ReturnIndex,
+                                      RetAttrs));
+
+  for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
+    ParamTypeList.push_back(ArgList[i].Ty);
+    if (ArgList[i].Attrs.hasAttributes(i + 1)) {
+      AttrBuilder B(ArgList[i].Attrs, i + 1);
+      Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
+    }
+  }
+
+  if (FuncAttrs.hasAttributes())
+    Attrs.push_back(AttributeSet::get(RetType->getContext(),
+                                      AttributeSet::FunctionIndex,
+                                      FuncAttrs));
+
+  AttributeSet PAL = AttributeSet::get(Context, Attrs);
+
+  if (PAL.hasAttribute(1, Attribute::StructRet) && !RetType->isVoidTy())
+    return Error(RetTypeLoc, "functions with 'sret' argument must return void");
+
+  FunctionType *FT =
+    FunctionType::get(RetType, ParamTypeList, isVarArg);
+  PointerType *PFT = PointerType::getUnqual(FT);
+
+  Fn = 0;
+  if (!FunctionName.empty()) {
+    // If this was a definition of a forward reference, remove the definition
+    // from the forward reference table and fill in the forward ref.
+    std::map<std::string, std::pair<GlobalValue*, LocTy> >::iterator FRVI =
+      ForwardRefVals.find(FunctionName);
+    if (FRVI != ForwardRefVals.end()) {
+      Fn = M->getFunction(FunctionName);
+      if (!Fn)
+        return Error(FRVI->second.second, "invalid forward reference to "
+                     "function as global value!");
+      if (Fn->getType() != PFT)
+        return Error(FRVI->second.second, "invalid forward reference to "
+                     "function '" + FunctionName + "' with wrong type!");
+
+      ForwardRefVals.erase(FRVI);
+    } else if ((Fn = M->getFunction(FunctionName))) {
+      // Reject redefinitions.
+      return Error(NameLoc, "invalid redefinition of function '" +
+                   FunctionName + "'");
+    } else if (M->getNamedValue(FunctionName)) {
+      return Error(NameLoc, "redefinition of function '@" + FunctionName + "'");
+    }
+
+  } else {
+    // If this is a definition of a forward referenced function, make sure the
+    // types agree.
+    std::map<unsigned, std::pair<GlobalValue*, LocTy> >::iterator I
+      = ForwardRefValIDs.find(NumberedVals.size());
+    if (I != ForwardRefValIDs.end()) {
+      Fn = cast<Function>(I->second.first);
+      if (Fn->getType() != PFT)
+        return Error(NameLoc, "type of definition and forward reference of '@" +
+                     Twine(NumberedVals.size()) + "' disagree");
+      ForwardRefValIDs.erase(I);
+    }
+  }
+
+  if (Fn == 0)
+    Fn = Function::Create(FT, GlobalValue::ExternalLinkage, FunctionName, M);
+  else // Move the forward-reference to the correct spot in the module.
+    M->getFunctionList().splice(M->end(), M->getFunctionList(), Fn);
+
+  if (FunctionName.empty())
+    NumberedVals.push_back(Fn);
+
+  Fn->setLinkage((GlobalValue::LinkageTypes)Linkage);
+  Fn->setVisibility((GlobalValue::VisibilityTypes)Visibility);
+  Fn->setCallingConv(CC);
+  Fn->setAttributes(PAL);
+  Fn->setUnnamedAddr(UnnamedAddr);
+  Fn->setAlignment(Alignment);
+  Fn->setSection(Section);
+  if (!GC.empty()) Fn->setGC(GC.c_str());
+  ForwardRefAttrGroups[Fn] = FwdRefAttrGrps;
+
+  // Add all of the arguments we parsed to the function.
+  Function::arg_iterator ArgIt = Fn->arg_begin();
+  for (unsigned i = 0, e = ArgList.size(); i != e; ++i, ++ArgIt) {
+    // If the argument has a name, insert it into the argument symbol table.
+    if (ArgList[i].Name.empty()) continue;
+
+    // Set the name, if it conflicted, it will be auto-renamed.
+    ArgIt->setName(ArgList[i].Name);
+
+    if (ArgIt->getName() != ArgList[i].Name)
+      return Error(ArgList[i].Loc, "redefinition of argument '%" +
+                   ArgList[i].Name + "'");
+  }
+
+  return false;
+}
+
+
+/// ParseFunctionBody
+///   ::= '{' BasicBlock+ '}'
+///
+bool LLParser::ParseFunctionBody(Function &Fn) {
+  if (Lex.getKind() != lltok::lbrace)
+    return TokError("expected '{' in function body");
+  Lex.Lex();  // eat the {.
+
+  int FunctionNumber = -1;
+  if (!Fn.hasName()) FunctionNumber = NumberedVals.size()-1;
+
+  PerFunctionState PFS(*this, Fn, FunctionNumber);
+
+  // We need at least one basic block.
+  if (Lex.getKind() == lltok::rbrace)
+    return TokError("function body requires at least one basic block");
+
+  while (Lex.getKind() != lltok::rbrace)
+    if (ParseBasicBlock(PFS)) return true;
+
+  // Eat the }.
+  Lex.Lex();
+
+  // Verify function is ok.
+  return PFS.FinishFunction();
+}
+
+/// ParseBasicBlock
+///   ::= LabelStr? Instruction*
+bool LLParser::ParseBasicBlock(PerFunctionState &PFS) {
+  // If this basic block starts out with a name, remember it.
+  std::string Name;
+  LocTy NameLoc = Lex.getLoc();
+  if (Lex.getKind() == lltok::LabelStr) {
+    Name = Lex.getStrVal();
+    Lex.Lex();
+  }
+
+  BasicBlock *BB = PFS.DefineBB(Name, NameLoc);
+  if (BB == 0) return true;
+
+  std::string NameStr;
+
+  // Parse the instructions in this block until we get a terminator.
+  Instruction *Inst;
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MetadataOnInst;
+  do {
+    // This instruction may have three possibilities for a name: a) none
+    // specified, b) name specified "%foo =", c) number specified: "%4 =".
+    LocTy NameLoc = Lex.getLoc();
+    int NameID = -1;
+    NameStr = "";
+
+    if (Lex.getKind() == lltok::LocalVarID) {
+      NameID = Lex.getUIntVal();
+      Lex.Lex();
+      if (ParseToken(lltok::equal, "expected '=' after instruction id"))
+        return true;
+    } else if (Lex.getKind() == lltok::LocalVar) {
+      NameStr = Lex.getStrVal();
+      Lex.Lex();
+      if (ParseToken(lltok::equal, "expected '=' after instruction name"))
+        return true;
+    }
+
+    switch (ParseInstruction(Inst, BB, PFS)) {
+    default: llvm_unreachable("Unknown ParseInstruction result!");
+    case InstError: return true;
+    case InstNormal:
+      BB->getInstList().push_back(Inst);
+
+      // With a normal result, we check to see if the instruction is followed by
+      // a comma and metadata.
+      if (EatIfPresent(lltok::comma))
+        if (ParseInstructionMetadata(Inst, &PFS))
+          return true;
+      break;
+    case InstExtraComma:
+      BB->getInstList().push_back(Inst);
+
+      // If the instruction parser ate an extra comma at the end of it, it
+      // *must* be followed by metadata.
+      if (ParseInstructionMetadata(Inst, &PFS))
+        return true;
+      break;
+    }
+
+    // Set the name on the instruction.
+    if (PFS.SetInstName(NameID, NameStr, NameLoc, Inst)) return true;
+  } while (!isa<TerminatorInst>(Inst));
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Parsing.
+//===----------------------------------------------------------------------===//
+
+/// ParseInstruction - Parse one of the many different instructions.
+///
+int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
+                               PerFunctionState &PFS) {
+  lltok::Kind Token = Lex.getKind();
+  if (Token == lltok::Eof)
+    return TokError("found end of file when expecting more instructions");
+  LocTy Loc = Lex.getLoc();
+  unsigned KeywordVal = Lex.getUIntVal();
+  Lex.Lex();  // Eat the keyword.
+
+  switch (Token) {
+  default:                    return Error(Loc, "expected instruction opcode");
+  // Terminator Instructions.
+  case lltok::kw_unreachable: Inst = new UnreachableInst(Context); return false;
+  case lltok::kw_ret:         return ParseRet(Inst, BB, PFS);
+  case lltok::kw_br:          return ParseBr(Inst, PFS);
+  case lltok::kw_switch:      return ParseSwitch(Inst, PFS);
+  case lltok::kw_indirectbr:  return ParseIndirectBr(Inst, PFS);
+  case lltok::kw_invoke:      return ParseInvoke(Inst, PFS);
+  case lltok::kw_resume:      return ParseResume(Inst, PFS);
+  // Binary Operators.
+  case lltok::kw_add:
+  case lltok::kw_sub:
+  case lltok::kw_mul:
+  case lltok::kw_shl: {
+    bool NUW = EatIfPresent(lltok::kw_nuw);
+    bool NSW = EatIfPresent(lltok::kw_nsw);
+    if (!NUW) NUW = EatIfPresent(lltok::kw_nuw);
+
+    if (ParseArithmetic(Inst, PFS, KeywordVal, 1)) return true;
+
+    if (NUW) cast<BinaryOperator>(Inst)->setHasNoUnsignedWrap(true);
+    if (NSW) cast<BinaryOperator>(Inst)->setHasNoSignedWrap(true);
+    return false;
+  }
+  case lltok::kw_fadd:
+  case lltok::kw_fsub:
+  case lltok::kw_fmul:
+  case lltok::kw_fdiv:
+  case lltok::kw_frem: {
+    FastMathFlags FMF = EatFastMathFlagsIfPresent();
+    int Res = ParseArithmetic(Inst, PFS, KeywordVal, 2);
+    if (Res != 0)
+      return Res;
+    if (FMF.any())
+      Inst->setFastMathFlags(FMF);
+    return 0;
+  }
+
+  case lltok::kw_sdiv:
+  case lltok::kw_udiv:
+  case lltok::kw_lshr:
+  case lltok::kw_ashr: {
+    bool Exact = EatIfPresent(lltok::kw_exact);
+
+    if (ParseArithmetic(Inst, PFS, KeywordVal, 1)) return true;
+    if (Exact) cast<BinaryOperator>(Inst)->setIsExact(true);
+    return false;
+  }
+
+  case lltok::kw_urem:
+  case lltok::kw_srem:   return ParseArithmetic(Inst, PFS, KeywordVal, 1);
+  case lltok::kw_and:
+  case lltok::kw_or:
+  case lltok::kw_xor:    return ParseLogical(Inst, PFS, KeywordVal);
+  case lltok::kw_icmp:
+  case lltok::kw_fcmp:   return ParseCompare(Inst, PFS, KeywordVal);
+  // Casts.
+  case lltok::kw_trunc:
+  case lltok::kw_zext:
+  case lltok::kw_sext:
+  case lltok::kw_fptrunc:
+  case lltok::kw_fpext:
+  case lltok::kw_bitcast:
+  case lltok::kw_uitofp:
+  case lltok::kw_sitofp:
+  case lltok::kw_fptoui:
+  case lltok::kw_fptosi:
+  case lltok::kw_inttoptr:
+  case lltok::kw_ptrtoint:       return ParseCast(Inst, PFS, KeywordVal);
+  // Other.
+  case lltok::kw_select:         return ParseSelect(Inst, PFS);
+  case lltok::kw_va_arg:         return ParseVA_Arg(Inst, PFS);
+  case lltok::kw_extractelement: return ParseExtractElement(Inst, PFS);
+  case lltok::kw_insertelement:  return ParseInsertElement(Inst, PFS);
+  case lltok::kw_shufflevector:  return ParseShuffleVector(Inst, PFS);
+  case lltok::kw_phi:            return ParsePHI(Inst, PFS);
+  case lltok::kw_landingpad:     return ParseLandingPad(Inst, PFS);
+  case lltok::kw_call:           return ParseCall(Inst, PFS, false);
+  case lltok::kw_tail:           return ParseCall(Inst, PFS, true);
+  // Memory.
+  case lltok::kw_alloca:         return ParseAlloc(Inst, PFS);
+  case lltok::kw_load:           return ParseLoad(Inst, PFS);
+  case lltok::kw_store:          return ParseStore(Inst, PFS);
+  case lltok::kw_cmpxchg:        return ParseCmpXchg(Inst, PFS);
+  case lltok::kw_atomicrmw:      return ParseAtomicRMW(Inst, PFS);
+  case lltok::kw_fence:          return ParseFence(Inst, PFS);
+  case lltok::kw_getelementptr: return ParseGetElementPtr(Inst, PFS);
+  case lltok::kw_extractvalue:  return ParseExtractValue(Inst, PFS);
+  case lltok::kw_insertvalue:   return ParseInsertValue(Inst, PFS);
+  }
+}
+
+/// ParseCmpPredicate - Parse an integer or fp predicate, based on Kind.
+bool LLParser::ParseCmpPredicate(unsigned &P, unsigned Opc) {
+  if (Opc == Instruction::FCmp) {
+    switch (Lex.getKind()) {
+    default: return TokError("expected fcmp predicate (e.g. 'oeq')");
+    case lltok::kw_oeq: P = CmpInst::FCMP_OEQ; break;
+    case lltok::kw_one: P = CmpInst::FCMP_ONE; break;
+    case lltok::kw_olt: P = CmpInst::FCMP_OLT; break;
+    case lltok::kw_ogt: P = CmpInst::FCMP_OGT; break;
+    case lltok::kw_ole: P = CmpInst::FCMP_OLE; break;
+    case lltok::kw_oge: P = CmpInst::FCMP_OGE; break;
+    case lltok::kw_ord: P = CmpInst::FCMP_ORD; break;
+    case lltok::kw_uno: P = CmpInst::FCMP_UNO; break;
+    case lltok::kw_ueq: P = CmpInst::FCMP_UEQ; break;
+    case lltok::kw_une: P = CmpInst::FCMP_UNE; break;
+    case lltok::kw_ult: P = CmpInst::FCMP_ULT; break;
+    case lltok::kw_ugt: P = CmpInst::FCMP_UGT; break;
+    case lltok::kw_ule: P = CmpInst::FCMP_ULE; break;
+    case lltok::kw_uge: P = CmpInst::FCMP_UGE; break;
+    case lltok::kw_true: P = CmpInst::FCMP_TRUE; break;
+    case lltok::kw_false: P = CmpInst::FCMP_FALSE; break;
+    }
+  } else {
+    switch (Lex.getKind()) {
+    default: return TokError("expected icmp predicate (e.g. 'eq')");
+    case lltok::kw_eq:  P = CmpInst::ICMP_EQ; break;
+    case lltok::kw_ne:  P = CmpInst::ICMP_NE; break;
+    case lltok::kw_slt: P = CmpInst::ICMP_SLT; break;
+    case lltok::kw_sgt: P = CmpInst::ICMP_SGT; break;
+    case lltok::kw_sle: P = CmpInst::ICMP_SLE; break;
+    case lltok::kw_sge: P = CmpInst::ICMP_SGE; break;
+    case lltok::kw_ult: P = CmpInst::ICMP_ULT; break;
+    case lltok::kw_ugt: P = CmpInst::ICMP_UGT; break;
+    case lltok::kw_ule: P = CmpInst::ICMP_ULE; break;
+    case lltok::kw_uge: P = CmpInst::ICMP_UGE; break;
+    }
+  }
+  Lex.Lex();
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Terminator Instructions.
+//===----------------------------------------------------------------------===//
+
+/// ParseRet - Parse a return instruction.
+///   ::= 'ret' void (',' !dbg, !1)*
+///   ::= 'ret' TypeAndValue (',' !dbg, !1)*
+bool LLParser::ParseRet(Instruction *&Inst, BasicBlock *BB,
+                        PerFunctionState &PFS) {
+  SMLoc TypeLoc = Lex.getLoc();
+  Type *Ty = 0;
+  if (ParseType(Ty, true /*void allowed*/)) return true;
+
+  Type *ResType = PFS.getFunction().getReturnType();
+
+  if (Ty->isVoidTy()) {
+    if (!ResType->isVoidTy())
+      return Error(TypeLoc, "value doesn't match function result type '" +
+                   getTypeString(ResType) + "'");
+
+    Inst = ReturnInst::Create(Context);
+    return false;
+  }
+
+  Value *RV;
+  if (ParseValue(Ty, RV, PFS)) return true;
+
+  if (ResType != RV->getType())
+    return Error(TypeLoc, "value doesn't match function result type '" +
+                 getTypeString(ResType) + "'");
+
+  Inst = ReturnInst::Create(Context, RV);
+  return false;
+}
+
+
+/// ParseBr
+///   ::= 'br' TypeAndValue
+///   ::= 'br' TypeAndValue ',' TypeAndValue ',' TypeAndValue
+bool LLParser::ParseBr(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc, Loc2;
+  Value *Op0;
+  BasicBlock *Op1, *Op2;
+  if (ParseTypeAndValue(Op0, Loc, PFS)) return true;
+
+  if (BasicBlock *BB = dyn_cast<BasicBlock>(Op0)) {
+    Inst = BranchInst::Create(BB);
+    return false;
+  }
+
+  if (Op0->getType() != Type::getInt1Ty(Context))
+    return Error(Loc, "branch condition must have 'i1' type");
+
+  if (ParseToken(lltok::comma, "expected ',' after branch condition") ||
+      ParseTypeAndBasicBlock(Op1, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after true destination") ||
+      ParseTypeAndBasicBlock(Op2, Loc2, PFS))
+    return true;
+
+  Inst = BranchInst::Create(Op1, Op2, Op0);
+  return false;
+}
+
+/// ParseSwitch
+///  Instruction
+///    ::= 'switch' TypeAndValue ',' TypeAndValue '[' JumpTable ']'
+///  JumpTable
+///    ::= (TypeAndValue ',' TypeAndValue)*
+bool LLParser::ParseSwitch(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy CondLoc, BBLoc;
+  Value *Cond;
+  BasicBlock *DefaultBB;
+  if (ParseTypeAndValue(Cond, CondLoc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after switch condition") ||
+      ParseTypeAndBasicBlock(DefaultBB, BBLoc, PFS) ||
+      ParseToken(lltok::lsquare, "expected '[' with switch table"))
+    return true;
+
+  if (!Cond->getType()->isIntegerTy())
+    return Error(CondLoc, "switch condition must have integer type");
+
+  // Parse the jump table pairs.
+  SmallPtrSet<Value*, 32> SeenCases;
+  SmallVector<std::pair<ConstantInt*, BasicBlock*>, 32> Table;
+  while (Lex.getKind() != lltok::rsquare) {
+    Value *Constant;
+    BasicBlock *DestBB;
+
+    if (ParseTypeAndValue(Constant, CondLoc, PFS) ||
+        ParseToken(lltok::comma, "expected ',' after case value") ||
+        ParseTypeAndBasicBlock(DestBB, PFS))
+      return true;
+
+    if (!SeenCases.insert(Constant))
+      return Error(CondLoc, "duplicate case value in switch");
+    if (!isa<ConstantInt>(Constant))
+      return Error(CondLoc, "case value is not a constant integer");
+
+    Table.push_back(std::make_pair(cast<ConstantInt>(Constant), DestBB));
+  }
+
+  Lex.Lex();  // Eat the ']'.
+
+  SwitchInst *SI = SwitchInst::Create(Cond, DefaultBB, Table.size());
+  for (unsigned i = 0, e = Table.size(); i != e; ++i)
+    SI->addCase(Table[i].first, Table[i].second);
+  Inst = SI;
+  return false;
+}
+
+/// ParseIndirectBr
+///  Instruction
+///    ::= 'indirectbr' TypeAndValue ',' '[' LabelList ']'
+bool LLParser::ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy AddrLoc;
+  Value *Address;
+  if (ParseTypeAndValue(Address, AddrLoc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after indirectbr address") ||
+      ParseToken(lltok::lsquare, "expected '[' with indirectbr"))
+    return true;
+
+  if (!Address->getType()->isPointerTy())
+    return Error(AddrLoc, "indirectbr address must have pointer type");
+
+  // Parse the destination list.
+  SmallVector<BasicBlock*, 16> DestList;
+
+  if (Lex.getKind() != lltok::rsquare) {
+    BasicBlock *DestBB;
+    if (ParseTypeAndBasicBlock(DestBB, PFS))
+      return true;
+    DestList.push_back(DestBB);
+
+    while (EatIfPresent(lltok::comma)) {
+      if (ParseTypeAndBasicBlock(DestBB, PFS))
+        return true;
+      DestList.push_back(DestBB);
+    }
+  }
+
+  if (ParseToken(lltok::rsquare, "expected ']' at end of block list"))
+    return true;
+
+  IndirectBrInst *IBI = IndirectBrInst::Create(Address, DestList.size());
+  for (unsigned i = 0, e = DestList.size(); i != e; ++i)
+    IBI->addDestination(DestList[i]);
+  Inst = IBI;
+  return false;
+}
+
+
+/// ParseInvoke
+///   ::= 'invoke' OptionalCallingConv OptionalAttrs Type Value ParamList
+///       OptionalAttrs 'to' TypeAndValue 'unwind' TypeAndValue
+bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy CallLoc = Lex.getLoc();
+  AttrBuilder RetAttrs, FnAttrs;
+  std::vector<unsigned> FwdRefAttrGrps;
+  LocTy NoBuiltinLoc;
+  CallingConv::ID CC;
+  Type *RetType = 0;
+  LocTy RetTypeLoc;
+  ValID CalleeID;
+  SmallVector<ParamInfo, 16> ArgList;
+
+  BasicBlock *NormalBB, *UnwindBB;
+  if (ParseOptionalCallingConv(CC) ||
+      ParseOptionalReturnAttrs(RetAttrs) ||
+      ParseType(RetType, RetTypeLoc, true /*void allowed*/) ||
+      ParseValID(CalleeID) ||
+      ParseParameterList(ArgList, PFS) ||
+      ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false,
+                                 NoBuiltinLoc) ||
+      ParseToken(lltok::kw_to, "expected 'to' in invoke") ||
+      ParseTypeAndBasicBlock(NormalBB, PFS) ||
+      ParseToken(lltok::kw_unwind, "expected 'unwind' in invoke") ||
+      ParseTypeAndBasicBlock(UnwindBB, PFS))
+    return true;
+
+  // If RetType is a non-function pointer type, then this is the short syntax
+  // for the call, which means that RetType is just the return type.  Infer the
+  // rest of the function argument types from the arguments that are present.
+  PointerType *PFTy = 0;
+  FunctionType *Ty = 0;
+  if (!(PFTy = dyn_cast<PointerType>(RetType)) ||
+      !(Ty = dyn_cast<FunctionType>(PFTy->getElementType()))) {
+    // Pull out the types of all of the arguments...
+    std::vector<Type*> ParamTypes;
+    for (unsigned i = 0, e = ArgList.size(); i != e; ++i)
+      ParamTypes.push_back(ArgList[i].V->getType());
+
+    if (!FunctionType::isValidReturnType(RetType))
+      return Error(RetTypeLoc, "Invalid result type for LLVM function");
+
+    Ty = FunctionType::get(RetType, ParamTypes, false);
+    PFTy = PointerType::getUnqual(Ty);
+  }
+
+  // Look up the callee.
+  Value *Callee;
+  if (ConvertValIDToValue(PFTy, CalleeID, Callee, &PFS)) return true;
+
+  // Set up the Attribute for the function.
+  SmallVector<AttributeSet, 8> Attrs;
+  if (RetAttrs.hasAttributes())
+    Attrs.push_back(AttributeSet::get(RetType->getContext(),
+                                      AttributeSet::ReturnIndex,
+                                      RetAttrs));
+
+  SmallVector<Value*, 8> Args;
+
+  // Loop through FunctionType's arguments and ensure they are specified
+  // correctly.  Also, gather any parameter attributes.
+  FunctionType::param_iterator I = Ty->param_begin();
+  FunctionType::param_iterator E = Ty->param_end();
+  for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
+    Type *ExpectedTy = 0;
+    if (I != E) {
+      ExpectedTy = *I++;
+    } else if (!Ty->isVarArg()) {
+      return Error(ArgList[i].Loc, "too many arguments specified");
+    }
+
+    if (ExpectedTy && ExpectedTy != ArgList[i].V->getType())
+      return Error(ArgList[i].Loc, "argument is not of expected type '" +
+                   getTypeString(ExpectedTy) + "'");
+    Args.push_back(ArgList[i].V);
+    if (ArgList[i].Attrs.hasAttributes(i + 1)) {
+      AttrBuilder B(ArgList[i].Attrs, i + 1);
+      Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
+    }
+  }
+
+  if (I != E)
+    return Error(CallLoc, "not enough parameters specified for call");
+
+  if (FnAttrs.hasAttributes())
+    Attrs.push_back(AttributeSet::get(RetType->getContext(),
+                                      AttributeSet::FunctionIndex,
+                                      FnAttrs));
+
+  // Finish off the Attribute and check them
+  AttributeSet PAL = AttributeSet::get(Context, Attrs);
+
+  InvokeInst *II = InvokeInst::Create(Callee, NormalBB, UnwindBB, Args);
+  II->setCallingConv(CC);
+  II->setAttributes(PAL);
+  ForwardRefAttrGroups[II] = FwdRefAttrGrps;
+  Inst = II;
+  return false;
+}
+
+/// ParseResume
+///   ::= 'resume' TypeAndValue
+bool LLParser::ParseResume(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Exn; LocTy ExnLoc;
+  if (ParseTypeAndValue(Exn, ExnLoc, PFS))
+    return true;
+
+  ResumeInst *RI = ResumeInst::Create(Exn);
+  Inst = RI;
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Binary Operators.
+//===----------------------------------------------------------------------===//
+
+/// ParseArithmetic
+///  ::= ArithmeticOps TypeAndValue ',' Value
+///
+/// If OperandType is 0, then any FP or integer operand is allowed.  If it is 1,
+/// then any integer operand is allowed, if it is 2, any fp operand is allowed.
+bool LLParser::ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS,
+                               unsigned Opc, unsigned OperandType) {
+  LocTy Loc; Value *LHS, *RHS;
+  if (ParseTypeAndValue(LHS, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' in arithmetic operation") ||
+      ParseValue(LHS->getType(), RHS, PFS))
+    return true;
+
+  bool Valid;
+  switch (OperandType) {
+  default: llvm_unreachable("Unknown operand type!");
+  case 0: // int or FP.
+    Valid = LHS->getType()->isIntOrIntVectorTy() ||
+            LHS->getType()->isFPOrFPVectorTy();
+    break;
+  case 1: Valid = LHS->getType()->isIntOrIntVectorTy(); break;
+  case 2: Valid = LHS->getType()->isFPOrFPVectorTy(); break;
+  }
+
+  if (!Valid)
+    return Error(Loc, "invalid operand type for instruction");
+
+  Inst = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+  return false;
+}
+
+/// ParseLogical
+///  ::= ArithmeticOps TypeAndValue ',' Value {
+bool LLParser::ParseLogical(Instruction *&Inst, PerFunctionState &PFS,
+                            unsigned Opc) {
+  LocTy Loc; Value *LHS, *RHS;
+  if (ParseTypeAndValue(LHS, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' in logical operation") ||
+      ParseValue(LHS->getType(), RHS, PFS))
+    return true;
+
+  if (!LHS->getType()->isIntOrIntVectorTy())
+    return Error(Loc,"instruction requires integer or integer vector operands");
+
+  Inst = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+  return false;
+}
+
+
+/// ParseCompare
+///  ::= 'icmp' IPredicates TypeAndValue ',' Value
+///  ::= 'fcmp' FPredicates TypeAndValue ',' Value
+bool LLParser::ParseCompare(Instruction *&Inst, PerFunctionState &PFS,
+                            unsigned Opc) {
+  // Parse the integer/fp comparison predicate.
+  LocTy Loc;
+  unsigned Pred;
+  Value *LHS, *RHS;
+  if (ParseCmpPredicate(Pred, Opc) ||
+      ParseTypeAndValue(LHS, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after compare value") ||
+      ParseValue(LHS->getType(), RHS, PFS))
+    return true;
+
+  if (Opc == Instruction::FCmp) {
+    if (!LHS->getType()->isFPOrFPVectorTy())
+      return Error(Loc, "fcmp requires floating point operands");
+    Inst = new FCmpInst(CmpInst::Predicate(Pred), LHS, RHS);
+  } else {
+    assert(Opc == Instruction::ICmp && "Unknown opcode for CmpInst!");
+    if (!LHS->getType()->isIntOrIntVectorTy() &&
+        !LHS->getType()->getScalarType()->isPointerTy())
+      return Error(Loc, "icmp requires integer operands");
+    Inst = new ICmpInst(CmpInst::Predicate(Pred), LHS, RHS);
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Other Instructions.
+//===----------------------------------------------------------------------===//
+
+
+/// ParseCast
+///   ::= CastOpc TypeAndValue 'to' Type
+bool LLParser::ParseCast(Instruction *&Inst, PerFunctionState &PFS,
+                         unsigned Opc) {
+  LocTy Loc;
+  Value *Op;
+  Type *DestTy = 0;
+  if (ParseTypeAndValue(Op, Loc, PFS) ||
+      ParseToken(lltok::kw_to, "expected 'to' after cast value") ||
+      ParseType(DestTy))
+    return true;
+
+  if (!CastInst::castIsValid((Instruction::CastOps)Opc, Op, DestTy)) {
+    CastInst::castIsValid((Instruction::CastOps)Opc, Op, DestTy);
+    return Error(Loc, "invalid cast opcode for cast from '" +
+                 getTypeString(Op->getType()) + "' to '" +
+                 getTypeString(DestTy) + "'");
+  }
+  Inst = CastInst::Create((Instruction::CastOps)Opc, Op, DestTy);
+  return false;
+}
+
+/// ParseSelect
+///   ::= 'select' TypeAndValue ',' TypeAndValue ',' TypeAndValue
+bool LLParser::ParseSelect(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *Op0, *Op1, *Op2;
+  if (ParseTypeAndValue(Op0, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after select condition") ||
+      ParseTypeAndValue(Op1, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after select value") ||
+      ParseTypeAndValue(Op2, PFS))
+    return true;
+
+  if (const char *Reason = SelectInst::areInvalidOperands(Op0, Op1, Op2))
+    return Error(Loc, Reason);
+
+  Inst = SelectInst::Create(Op0, Op1, Op2);
+  return false;
+}
+
+/// ParseVA_Arg
+///   ::= 'va_arg' TypeAndValue ',' Type
+bool LLParser::ParseVA_Arg(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Op;
+  Type *EltTy = 0;
+  LocTy TypeLoc;
+  if (ParseTypeAndValue(Op, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after vaarg operand") ||
+      ParseType(EltTy, TypeLoc))
+    return true;
+
+  if (!EltTy->isFirstClassType())
+    return Error(TypeLoc, "va_arg requires operand with first class type");
+
+  Inst = new VAArgInst(Op, EltTy);
+  return false;
+}
+
+/// ParseExtractElement
+///   ::= 'extractelement' TypeAndValue ',' TypeAndValue
+bool LLParser::ParseExtractElement(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *Op0, *Op1;
+  if (ParseTypeAndValue(Op0, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after extract value") ||
+      ParseTypeAndValue(Op1, PFS))
+    return true;
+
+  if (!ExtractElementInst::isValidOperands(Op0, Op1))
+    return Error(Loc, "invalid extractelement operands");
+
+  Inst = ExtractElementInst::Create(Op0, Op1);
+  return false;
+}
+
+/// ParseInsertElement
+///   ::= 'insertelement' TypeAndValue ',' TypeAndValue ',' TypeAndValue
+bool LLParser::ParseInsertElement(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *Op0, *Op1, *Op2;
+  if (ParseTypeAndValue(Op0, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after insertelement value") ||
+      ParseTypeAndValue(Op1, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after insertelement value") ||
+      ParseTypeAndValue(Op2, PFS))
+    return true;
+
+  if (!InsertElementInst::isValidOperands(Op0, Op1, Op2))
+    return Error(Loc, "invalid insertelement operands");
+
+  Inst = InsertElementInst::Create(Op0, Op1, Op2);
+  return false;
+}
+
+/// ParseShuffleVector
+///   ::= 'shufflevector' TypeAndValue ',' TypeAndValue ',' TypeAndValue
+bool LLParser::ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *Op0, *Op1, *Op2;
+  if (ParseTypeAndValue(Op0, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after shuffle mask") ||
+      ParseTypeAndValue(Op1, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after shuffle value") ||
+      ParseTypeAndValue(Op2, PFS))
+    return true;
+
+  if (!ShuffleVectorInst::isValidOperands(Op0, Op1, Op2))
+    return Error(Loc, "invalid shufflevector operands");
+
+  Inst = new ShuffleVectorInst(Op0, Op1, Op2);
+  return false;
+}
+
+/// ParsePHI
+///   ::= 'phi' Type '[' Value ',' Value ']' (',' '[' Value ',' Value ']')*
+int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) {
+  Type *Ty = 0;  LocTy TypeLoc;
+  Value *Op0, *Op1;
+
+  if (ParseType(Ty, TypeLoc) ||
+      ParseToken(lltok::lsquare, "expected '[' in phi value list") ||
+      ParseValue(Ty, Op0, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after insertelement value") ||
+      ParseValue(Type::getLabelTy(Context), Op1, PFS) ||
+      ParseToken(lltok::rsquare, "expected ']' in phi value list"))
+    return true;
+
+  bool AteExtraComma = false;
+  SmallVector<std::pair<Value*, BasicBlock*>, 16> PHIVals;
+  while (1) {
+    PHIVals.push_back(std::make_pair(Op0, cast<BasicBlock>(Op1)));
+
+    if (!EatIfPresent(lltok::comma))
+      break;
+
+    if (Lex.getKind() == lltok::MetadataVar) {
+      AteExtraComma = true;
+      break;
+    }
+
+    if (ParseToken(lltok::lsquare, "expected '[' in phi value list") ||
+        ParseValue(Ty, Op0, PFS) ||
+        ParseToken(lltok::comma, "expected ',' after insertelement value") ||
+        ParseValue(Type::getLabelTy(Context), Op1, PFS) ||
+        ParseToken(lltok::rsquare, "expected ']' in phi value list"))
+      return true;
+  }
+
+  if (!Ty->isFirstClassType())
+    return Error(TypeLoc, "phi node must have first class type");
+
+  PHINode *PN = PHINode::Create(Ty, PHIVals.size());
+  for (unsigned i = 0, e = PHIVals.size(); i != e; ++i)
+    PN->addIncoming(PHIVals[i].first, PHIVals[i].second);
+  Inst = PN;
+  return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
+/// ParseLandingPad
+///   ::= 'landingpad' Type 'personality' TypeAndValue 'cleanup'? Clause+
+/// Clause
+///   ::= 'catch' TypeAndValue
+///   ::= 'filter'
+///   ::= 'filter' TypeAndValue ( ',' TypeAndValue )*
+bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) {
+  Type *Ty = 0; LocTy TyLoc;
+  Value *PersFn; LocTy PersFnLoc;
+
+  if (ParseType(Ty, TyLoc) ||
+      ParseToken(lltok::kw_personality, "expected 'personality'") ||
+      ParseTypeAndValue(PersFn, PersFnLoc, PFS))
+    return true;
+
+  LandingPadInst *LP = LandingPadInst::Create(Ty, PersFn, 0);
+  LP->setCleanup(EatIfPresent(lltok::kw_cleanup));
+
+  while (Lex.getKind() == lltok::kw_catch || Lex.getKind() == lltok::kw_filter){
+    LandingPadInst::ClauseType CT;
+    if (EatIfPresent(lltok::kw_catch))
+      CT = LandingPadInst::Catch;
+    else if (EatIfPresent(lltok::kw_filter))
+      CT = LandingPadInst::Filter;
+    else
+      return TokError("expected 'catch' or 'filter' clause type");
+
+    Value *V; LocTy VLoc;
+    if (ParseTypeAndValue(V, VLoc, PFS)) {
+      delete LP;
+      return true;
+    }
+
+    // A 'catch' type expects a non-array constant. A filter clause expects an
+    // array constant.
+    if (CT == LandingPadInst::Catch) {
+      if (isa<ArrayType>(V->getType()))
+        Error(VLoc, "'catch' clause has an invalid type");
+    } else {
+      if (!isa<ArrayType>(V->getType()))
+        Error(VLoc, "'filter' clause has an invalid type");
+    }
+
+    LP->addClause(V);
+  }
+
+  Inst = LP;
+  return false;
+}
+
+/// ParseCall
+///   ::= 'tail'? 'call' OptionalCallingConv OptionalAttrs Type Value
+///       ParameterList OptionalAttrs
+bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
+                         bool isTail) {
+  AttrBuilder RetAttrs, FnAttrs;
+  std::vector<unsigned> FwdRefAttrGrps;
+  LocTy NoBuiltinLoc;
+  CallingConv::ID CC;
+  Type *RetType = 0;
+  LocTy RetTypeLoc;
+  ValID CalleeID;
+  SmallVector<ParamInfo, 16> ArgList;
+  LocTy CallLoc = Lex.getLoc();
+
+  if ((isTail && ParseToken(lltok::kw_call, "expected 'tail call'")) ||
+      ParseOptionalCallingConv(CC) ||
+      ParseOptionalReturnAttrs(RetAttrs) ||
+      ParseType(RetType, RetTypeLoc, true /*void allowed*/) ||
+      ParseValID(CalleeID) ||
+      ParseParameterList(ArgList, PFS) ||
+      ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false,
+                                 NoBuiltinLoc))
+    return true;
+
+  // If RetType is a non-function pointer type, then this is the short syntax
+  // for the call, which means that RetType is just the return type.  Infer the
+  // rest of the function argument types from the arguments that are present.
+  PointerType *PFTy = 0;
+  FunctionType *Ty = 0;
+  if (!(PFTy = dyn_cast<PointerType>(RetType)) ||
+      !(Ty = dyn_cast<FunctionType>(PFTy->getElementType()))) {
+    // Pull out the types of all of the arguments...
+    std::vector<Type*> ParamTypes;
+    for (unsigned i = 0, e = ArgList.size(); i != e; ++i)
+      ParamTypes.push_back(ArgList[i].V->getType());
+
+    if (!FunctionType::isValidReturnType(RetType))
+      return Error(RetTypeLoc, "Invalid result type for LLVM function");
+
+    Ty = FunctionType::get(RetType, ParamTypes, false);
+    PFTy = PointerType::getUnqual(Ty);
+  }
+
+  // Look up the callee.
+  Value *Callee;
+  if (ConvertValIDToValue(PFTy, CalleeID, Callee, &PFS)) return true;
+
+  // Set up the Attribute for the function.
+  SmallVector<AttributeSet, 8> Attrs;
+  if (RetAttrs.hasAttributes())
+    Attrs.push_back(AttributeSet::get(RetType->getContext(),
+                                      AttributeSet::ReturnIndex,
+                                      RetAttrs));
+
+  SmallVector<Value*, 8> Args;
+
+  // Loop through FunctionType's arguments and ensure they are specified
+  // correctly.  Also, gather any parameter attributes.
+  FunctionType::param_iterator I = Ty->param_begin();
+  FunctionType::param_iterator E = Ty->param_end();
+  for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
+    Type *ExpectedTy = 0;
+    if (I != E) {
+      ExpectedTy = *I++;
+    } else if (!Ty->isVarArg()) {
+      return Error(ArgList[i].Loc, "too many arguments specified");
+    }
+
+    if (ExpectedTy && ExpectedTy != ArgList[i].V->getType())
+      return Error(ArgList[i].Loc, "argument is not of expected type '" +
+                   getTypeString(ExpectedTy) + "'");
+    Args.push_back(ArgList[i].V);
+    if (ArgList[i].Attrs.hasAttributes(i + 1)) {
+      AttrBuilder B(ArgList[i].Attrs, i + 1);
+      Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
+    }
+  }
+
+  if (I != E)
+    return Error(CallLoc, "not enough parameters specified for call");
+
+  if (FnAttrs.hasAttributes())
+    Attrs.push_back(AttributeSet::get(RetType->getContext(),
+                                      AttributeSet::FunctionIndex,
+                                      FnAttrs));
+
+  // Finish off the Attribute and check them
+  AttributeSet PAL = AttributeSet::get(Context, Attrs);
+
+  CallInst *CI = CallInst::Create(Callee, Args);
+  CI->setTailCall(isTail);
+  CI->setCallingConv(CC);
+  CI->setAttributes(PAL);
+  ForwardRefAttrGroups[CI] = FwdRefAttrGrps;
+  Inst = CI;
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Memory Instructions.
+//===----------------------------------------------------------------------===//
+
+/// ParseAlloc
+///   ::= 'alloca' Type (',' TypeAndValue)? (',' OptionalInfo)?
+int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Size = 0;
+  LocTy SizeLoc;
+  unsigned Alignment = 0;
+  Type *Ty = 0;
+  if (ParseType(Ty)) return true;
+
+  bool AteExtraComma = false;
+  if (EatIfPresent(lltok::comma)) {
+    if (Lex.getKind() == lltok::kw_align) {
+      if (ParseOptionalAlignment(Alignment)) return true;
+    } else if (Lex.getKind() == lltok::MetadataVar) {
+      AteExtraComma = true;
+    } else {
+      if (ParseTypeAndValue(Size, SizeLoc, PFS) ||
+          ParseOptionalCommaAlign(Alignment, AteExtraComma))
+        return true;
+    }
+  }
+
+  if (Size && !Size->getType()->isIntegerTy())
+    return Error(SizeLoc, "element count must have integer type");
+
+  Inst = new AllocaInst(Ty, Size, Alignment);
+  return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
+/// ParseLoad
+///   ::= 'load' 'volatile'? TypeAndValue (',' 'align' i32)?
+///   ::= 'load' 'atomic' 'volatile'? TypeAndValue
+///       'singlethread'? AtomicOrdering (',' 'align' i32)?
+int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Val; LocTy Loc;
+  unsigned Alignment = 0;
+  bool AteExtraComma = false;
+  bool isAtomic = false;
+  AtomicOrdering Ordering = NotAtomic;
+  SynchronizationScope Scope = CrossThread;
+
+  if (Lex.getKind() == lltok::kw_atomic) {
+    isAtomic = true;
+    Lex.Lex();
+  }
+
+  bool isVolatile = false;
+  if (Lex.getKind() == lltok::kw_volatile) {
+    isVolatile = true;
+    Lex.Lex();
+  }
+
+  if (ParseTypeAndValue(Val, Loc, PFS) ||
+      ParseScopeAndOrdering(isAtomic, Scope, Ordering) ||
+      ParseOptionalCommaAlign(Alignment, AteExtraComma))
+    return true;
+
+  if (!Val->getType()->isPointerTy() ||
+      !cast<PointerType>(Val->getType())->getElementType()->isFirstClassType())
+    return Error(Loc, "load operand must be a pointer to a first class type");
+  if (isAtomic && !Alignment)
+    return Error(Loc, "atomic load must have explicit non-zero alignment");
+  if (Ordering == Release || Ordering == AcquireRelease)
+    return Error(Loc, "atomic load cannot use Release ordering");
+
+  Inst = new LoadInst(Val, "", isVolatile, Alignment, Ordering, Scope);
+  return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
+/// ParseStore
+
+///   ::= 'store' 'volatile'? TypeAndValue ',' TypeAndValue (',' 'align' i32)?
+///   ::= 'store' 'atomic' 'volatile'? TypeAndValue ',' TypeAndValue
+///       'singlethread'? AtomicOrdering (',' 'align' i32)?
+int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Val, *Ptr; LocTy Loc, PtrLoc;
+  unsigned Alignment = 0;
+  bool AteExtraComma = false;
+  bool isAtomic = false;
+  AtomicOrdering Ordering = NotAtomic;
+  SynchronizationScope Scope = CrossThread;
+
+  if (Lex.getKind() == lltok::kw_atomic) {
+    isAtomic = true;
+    Lex.Lex();
+  }
+
+  bool isVolatile = false;
+  if (Lex.getKind() == lltok::kw_volatile) {
+    isVolatile = true;
+    Lex.Lex();
+  }
+
+  if (ParseTypeAndValue(Val, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after store operand") ||
+      ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
+      ParseScopeAndOrdering(isAtomic, Scope, Ordering) ||
+      ParseOptionalCommaAlign(Alignment, AteExtraComma))
+    return true;
+
+  if (!Ptr->getType()->isPointerTy())
+    return Error(PtrLoc, "store operand must be a pointer");
+  if (!Val->getType()->isFirstClassType())
+    return Error(Loc, "store operand must be a first class value");
+  if (cast<PointerType>(Ptr->getType())->getElementType() != Val->getType())
+    return Error(Loc, "stored value and pointer type do not match");
+  if (isAtomic && !Alignment)
+    return Error(Loc, "atomic store must have explicit non-zero alignment");
+  if (Ordering == Acquire || Ordering == AcquireRelease)
+    return Error(Loc, "atomic store cannot use Acquire ordering");
+
+  Inst = new StoreInst(Val, Ptr, isVolatile, Alignment, Ordering, Scope);
+  return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
+/// ParseCmpXchg
+///   ::= 'cmpxchg' 'volatile'? TypeAndValue ',' TypeAndValue ',' TypeAndValue
+///       'singlethread'? AtomicOrdering
+int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Ptr, *Cmp, *New; LocTy PtrLoc, CmpLoc, NewLoc;
+  bool AteExtraComma = false;
+  AtomicOrdering Ordering = NotAtomic;
+  SynchronizationScope Scope = CrossThread;
+  bool isVolatile = false;
+
+  if (EatIfPresent(lltok::kw_volatile))
+    isVolatile = true;
+
+  if (ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after cmpxchg address") ||
+      ParseTypeAndValue(Cmp, CmpLoc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after cmpxchg cmp operand") ||
+      ParseTypeAndValue(New, NewLoc, PFS) ||
+      ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
+    return true;
+
+  if (Ordering == Unordered)
+    return TokError("cmpxchg cannot be unordered");
+  if (!Ptr->getType()->isPointerTy())
+    return Error(PtrLoc, "cmpxchg operand must be a pointer");
+  if (cast<PointerType>(Ptr->getType())->getElementType() != Cmp->getType())
+    return Error(CmpLoc, "compare value and pointer type do not match");
+  if (cast<PointerType>(Ptr->getType())->getElementType() != New->getType())
+    return Error(NewLoc, "new value and pointer type do not match");
+  if (!New->getType()->isIntegerTy())
+    return Error(NewLoc, "cmpxchg operand must be an integer");
+  unsigned Size = New->getType()->getPrimitiveSizeInBits();
+  if (Size < 8 || (Size & (Size - 1)))
+    return Error(NewLoc, "cmpxchg operand must be power-of-two byte-sized"
+                         " integer");
+
+  AtomicCmpXchgInst *CXI =
+    new AtomicCmpXchgInst(Ptr, Cmp, New, Ordering, Scope);
+  CXI->setVolatile(isVolatile);
+  Inst = CXI;
+  return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
+/// ParseAtomicRMW
+///   ::= 'atomicrmw' 'volatile'? BinOp TypeAndValue ',' TypeAndValue
+///       'singlethread'? AtomicOrdering
+int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Ptr, *Val; LocTy PtrLoc, ValLoc;
+  bool AteExtraComma = false;
+  AtomicOrdering Ordering = NotAtomic;
+  SynchronizationScope Scope = CrossThread;
+  bool isVolatile = false;
+  AtomicRMWInst::BinOp Operation;
+
+  if (EatIfPresent(lltok::kw_volatile))
+    isVolatile = true;
+
+  switch (Lex.getKind()) {
+  default: return TokError("expected binary operation in atomicrmw");
+  case lltok::kw_xchg: Operation = AtomicRMWInst::Xchg; break;
+  case lltok::kw_add: Operation = AtomicRMWInst::Add; break;
+  case lltok::kw_sub: Operation = AtomicRMWInst::Sub; break;
+  case lltok::kw_and: Operation = AtomicRMWInst::And; break;
+  case lltok::kw_nand: Operation = AtomicRMWInst::Nand; break;
+  case lltok::kw_or: Operation = AtomicRMWInst::Or; break;
+  case lltok::kw_xor: Operation = AtomicRMWInst::Xor; break;
+  case lltok::kw_max: Operation = AtomicRMWInst::Max; break;
+  case lltok::kw_min: Operation = AtomicRMWInst::Min; break;
+  case lltok::kw_umax: Operation = AtomicRMWInst::UMax; break;
+  case lltok::kw_umin: Operation = AtomicRMWInst::UMin; break;
+  }
+  Lex.Lex();  // Eat the operation.
+
+  if (ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after atomicrmw address") ||
+      ParseTypeAndValue(Val, ValLoc, PFS) ||
+      ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
+    return true;
+
+  if (Ordering == Unordered)
+    return TokError("atomicrmw cannot be unordered");
+  if (!Ptr->getType()->isPointerTy())
+    return Error(PtrLoc, "atomicrmw operand must be a pointer");
+  if (cast<PointerType>(Ptr->getType())->getElementType() != Val->getType())
+    return Error(ValLoc, "atomicrmw value and pointer type do not match");
+  if (!Val->getType()->isIntegerTy())
+    return Error(ValLoc, "atomicrmw operand must be an integer");
+  unsigned Size = Val->getType()->getPrimitiveSizeInBits();
+  if (Size < 8 || (Size & (Size - 1)))
+    return Error(ValLoc, "atomicrmw operand must be power-of-two byte-sized"
+                         " integer");
+
+  AtomicRMWInst *RMWI =
+    new AtomicRMWInst(Operation, Ptr, Val, Ordering, Scope);
+  RMWI->setVolatile(isVolatile);
+  Inst = RMWI;
+  return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
+/// ParseFence
+///   ::= 'fence' 'singlethread'? AtomicOrdering
+int LLParser::ParseFence(Instruction *&Inst, PerFunctionState &PFS) {
+  AtomicOrdering Ordering = NotAtomic;
+  SynchronizationScope Scope = CrossThread;
+  if (ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
+    return true;
+
+  if (Ordering == Unordered)
+    return TokError("fence cannot be unordered");
+  if (Ordering == Monotonic)
+    return TokError("fence cannot be monotonic");
+
+  Inst = new FenceInst(Context, Ordering, Scope);
+  return InstNormal;
+}
+
+/// ParseGetElementPtr
+///   ::= 'getelementptr' 'inbounds'? TypeAndValue (',' TypeAndValue)*
+int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Ptr = 0;
+  Value *Val = 0;
+  LocTy Loc, EltLoc;
+
+  bool InBounds = EatIfPresent(lltok::kw_inbounds);
+
+  if (ParseTypeAndValue(Ptr, Loc, PFS)) return true;
+
+  if (!Ptr->getType()->getScalarType()->isPointerTy())
+    return Error(Loc, "base of getelementptr must be a pointer");
+
+  SmallVector<Value*, 16> Indices;
+  bool AteExtraComma = false;
+  while (EatIfPresent(lltok::comma)) {
+    if (Lex.getKind() == lltok::MetadataVar) {
+      AteExtraComma = true;
+      break;
+    }
+    if (ParseTypeAndValue(Val, EltLoc, PFS)) return true;
+    if (!Val->getType()->getScalarType()->isIntegerTy())
+      return Error(EltLoc, "getelementptr index must be an integer");
+    if (Val->getType()->isVectorTy() != Ptr->getType()->isVectorTy())
+      return Error(EltLoc, "getelementptr index type missmatch");
+    if (Val->getType()->isVectorTy()) {
+      unsigned ValNumEl = cast<VectorType>(Val->getType())->getNumElements();
+      unsigned PtrNumEl = cast<VectorType>(Ptr->getType())->getNumElements();
+      if (ValNumEl != PtrNumEl)
+        return Error(EltLoc,
+          "getelementptr vector index has a wrong number of elements");
+    }
+    Indices.push_back(Val);
+  }
+
+  if (!GetElementPtrInst::getIndexedType(Ptr->getType(), Indices))
+    return Error(Loc, "invalid getelementptr indices");
+  Inst = GetElementPtrInst::Create(Ptr, Indices);
+  if (InBounds)
+    cast<GetElementPtrInst>(Inst)->setIsInBounds(true);
+  return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
+/// ParseExtractValue
+///   ::= 'extractvalue' TypeAndValue (',' uint32)+
+int LLParser::ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Val; LocTy Loc;
+  SmallVector<unsigned, 4> Indices;
+  bool AteExtraComma;
+  if (ParseTypeAndValue(Val, Loc, PFS) ||
+      ParseIndexList(Indices, AteExtraComma))
+    return true;
+
+  if (!Val->getType()->isAggregateType())
+    return Error(Loc, "extractvalue operand must be aggregate type");
+
+  if (!ExtractValueInst::getIndexedType(Val->getType(), Indices))
+    return Error(Loc, "invalid indices for extractvalue");
+  Inst = ExtractValueInst::Create(Val, Indices);
+  return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
+/// ParseInsertValue
+///   ::= 'insertvalue' TypeAndValue ',' TypeAndValue (',' uint32)+
+int LLParser::ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS) {
+  Value *Val0, *Val1; LocTy Loc0, Loc1;
+  SmallVector<unsigned, 4> Indices;
+  bool AteExtraComma;
+  if (ParseTypeAndValue(Val0, Loc0, PFS) ||
+      ParseToken(lltok::comma, "expected comma after insertvalue operand") ||
+      ParseTypeAndValue(Val1, Loc1, PFS) ||
+      ParseIndexList(Indices, AteExtraComma))
+    return true;
+
+  if (!Val0->getType()->isAggregateType())
+    return Error(Loc0, "insertvalue operand must be aggregate type");
+
+  if (!ExtractValueInst::getIndexedType(Val0->getType(), Indices))
+    return Error(Loc0, "invalid indices for insertvalue");
+  Inst = InsertValueInst::Create(Val0, Val1, Indices);
+  return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
+//===----------------------------------------------------------------------===//
+// Embedded metadata.
+//===----------------------------------------------------------------------===//
+
+/// ParseMDNodeVector
+///   ::= Element (',' Element)*
+/// Element
+///   ::= 'null' | TypeAndValue
+bool LLParser::ParseMDNodeVector(SmallVectorImpl<Value*> &Elts,
+                                 PerFunctionState *PFS) {
+  // Check for an empty list.
+  if (Lex.getKind() == lltok::rbrace)
+    return false;
+
+  do {
+    // Null is a special case since it is typeless.
+    if (EatIfPresent(lltok::kw_null)) {
+      Elts.push_back(0);
+      continue;
+    }
+
+    Value *V = 0;
+    if (ParseTypeAndValue(V, PFS)) return true;
+    Elts.push_back(V);
+  } while (EatIfPresent(lltok::comma));
+
+  return false;
+}
diff --git a/contrib/llvm/lib/AsmParser/LLParser.h b/contrib/llvm/lib/AsmParser/LLParser.h
new file mode 100644
index 000000000000..1f2879e948d9
--- /dev/null
+++ b/contrib/llvm/lib/AsmParser/LLParser.h
@@ -0,0 +1,410 @@
+//===-- LLParser.h - Parser Class -------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the parser class for .ll files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ASMPARSER_LLPARSER_H
+#define LLVM_ASMPARSER_LLPARSER_H
+
+#include "LLLexer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ValueHandle.h"
+#include <map>
+
+namespace llvm {
+  class Module;
+  class OpaqueType;
+  class Function;
+  class Value;
+  class BasicBlock;
+  class Instruction;
+  class Constant;
+  class GlobalValue;
+  class MDString;
+  class MDNode;
+  class StructType;
+
+  /// ValID - Represents a reference of a definition of some sort with no type.
+  /// There are several cases where we have to parse the value but where the
+  /// type can depend on later context.  This may either be a numeric reference
+  /// or a symbolic (%var) reference.  This is just a discriminated union.
+  struct ValID {
+    enum {
+      t_LocalID, t_GlobalID,      // ID in UIntVal.
+      t_LocalName, t_GlobalName,  // Name in StrVal.
+      t_APSInt, t_APFloat,        // Value in APSIntVal/APFloatVal.
+      t_Null, t_Undef, t_Zero,    // No value.
+      t_EmptyArray,               // No value:  []
+      t_Constant,                 // Value in ConstantVal.
+      t_InlineAsm,                // Value in StrVal/StrVal2/UIntVal.
+      t_MDNode,                   // Value in MDNodeVal.
+      t_MDString,                 // Value in MDStringVal.
+      t_ConstantStruct,           // Value in ConstantStructElts.
+      t_PackedConstantStruct      // Value in ConstantStructElts.
+    } Kind;
+
+    LLLexer::LocTy Loc;
+    unsigned UIntVal;
+    std::string StrVal, StrVal2;
+    APSInt APSIntVal;
+    APFloat APFloatVal;
+    Constant *ConstantVal;
+    MDNode *MDNodeVal;
+    MDString *MDStringVal;
+    Constant **ConstantStructElts;
+
+    ValID() : Kind(t_LocalID), APFloatVal(0.0) {}
+    ~ValID() {
+      if (Kind == t_ConstantStruct || Kind == t_PackedConstantStruct)
+        delete [] ConstantStructElts;
+    }
+
+    bool operator<(const ValID &RHS) const {
+      if (Kind == t_LocalID || Kind == t_GlobalID)
+        return UIntVal < RHS.UIntVal;
+      assert((Kind == t_LocalName || Kind == t_GlobalName ||
+              Kind == t_ConstantStruct || Kind == t_PackedConstantStruct) &&
+             "Ordering not defined for this ValID kind yet");
+      return StrVal < RHS.StrVal;
+    }
+  };
+
+  class LLParser {
+  public:
+    typedef LLLexer::LocTy LocTy;
+  private:
+    LLVMContext &Context;
+    LLLexer Lex;
+    Module *M;
+
+    // Instruction metadata resolution.  Each instruction can have a list of
+    // MDRef info associated with them.
+    //
+    // The simpler approach of just creating temporary MDNodes and then calling
+    // RAUW on them when the definition is processed doesn't work because some
+    // instruction metadata kinds, such as dbg, get stored in the IR in an
+    // "optimized" format which doesn't participate in the normal value use
+    // lists. This means that RAUW doesn't work, even on temporary MDNodes
+    // which otherwise support RAUW. Instead, we defer resolving MDNode
+    // references until the definitions have been processed.
+    struct MDRef {
+      SMLoc Loc;
+      unsigned MDKind, MDSlot;
+    };
+    DenseMap<Instruction*, std::vector<MDRef> > ForwardRefInstMetadata;
+
+    // Type resolution handling data structures.  The location is set when we
+    // have processed a use of the type but not a definition yet.
+    StringMap<std::pair<Type*, LocTy> > NamedTypes;
+    std::vector<std::pair<Type*, LocTy> > NumberedTypes;
+
+    std::vector<TrackingVH<MDNode> > NumberedMetadata;
+    std::map<unsigned, std::pair<TrackingVH<MDNode>, LocTy> > ForwardRefMDNodes;
+
+    // Global Value reference information.
+    std::map<std::string, std::pair<GlobalValue*, LocTy> > ForwardRefVals;
+    std::map<unsigned, std::pair<GlobalValue*, LocTy> > ForwardRefValIDs;
+    std::vector<GlobalValue*> NumberedVals;
+
+    // References to blockaddress.  The key is the function ValID, the value is
+    // a list of references to blocks in that function.
+    std::map<ValID, std::vector<std::pair<ValID, GlobalValue*> > >
+      ForwardRefBlockAddresses;
+
+    // Attribute builder reference information.
+    std::map<Value*, std::vector<unsigned> > ForwardRefAttrGroups;
+    std::map<unsigned, AttrBuilder> NumberedAttrBuilders;
+
+  public:
+    LLParser(MemoryBuffer *F, SourceMgr &SM, SMDiagnostic &Err, Module *m) :
+      Context(m->getContext()), Lex(F, SM, Err, m->getContext()),
+      M(m) {}
+    bool Run();
+
+    LLVMContext &getContext() { return Context; }
+
+  private:
+
+    bool Error(LocTy L, const Twine &Msg) const {
+      return Lex.Error(L, Msg);
+    }
+    bool TokError(const Twine &Msg) const {
+      return Error(Lex.getLoc(), Msg);
+    }
+
+    /// GetGlobalVal - Get a value with the specified name or ID, creating a
+    /// forward reference record if needed.  This can return null if the value
+    /// exists but does not have the right type.
+    GlobalValue *GetGlobalVal(const std::string &N, Type *Ty, LocTy Loc);
+    GlobalValue *GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc);
+
+    // Helper Routines.
+    bool ParseToken(lltok::Kind T, const char *ErrMsg);
+    bool EatIfPresent(lltok::Kind T) {
+      if (Lex.getKind() != T) return false;
+      Lex.Lex();
+      return true;
+    }
+
+    FastMathFlags EatFastMathFlagsIfPresent() {
+      FastMathFlags FMF;
+      while (true)
+        switch (Lex.getKind()) {
+        case lltok::kw_fast: FMF.setUnsafeAlgebra();   Lex.Lex(); continue;
+        case lltok::kw_nnan: FMF.setNoNaNs();          Lex.Lex(); continue;
+        case lltok::kw_ninf: FMF.setNoInfs();          Lex.Lex(); continue;
+        case lltok::kw_nsz:  FMF.setNoSignedZeros();   Lex.Lex(); continue;
+        case lltok::kw_arcp: FMF.setAllowReciprocal(); Lex.Lex(); continue;
+        default: return FMF;
+        }
+      return FMF;
+    }
+
+    bool ParseOptionalToken(lltok::Kind T, bool &Present, LocTy *Loc = 0) {
+      if (Lex.getKind() != T) {
+        Present = false;
+      } else {
+        if (Loc)
+          *Loc = Lex.getLoc();
+        Lex.Lex();
+        Present = true;
+      }
+      return false;
+    }
+    bool ParseStringConstant(std::string &Result);
+    bool ParseUInt32(unsigned &Val);
+    bool ParseUInt32(unsigned &Val, LocTy &Loc) {
+      Loc = Lex.getLoc();
+      return ParseUInt32(Val);
+    }
+
+    bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM);
+    bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM);
+    bool ParseOptionalAddrSpace(unsigned &AddrSpace);
+    bool ParseOptionalParamAttrs(AttrBuilder &B);
+    bool ParseOptionalReturnAttrs(AttrBuilder &B);
+    bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage);
+    bool ParseOptionalLinkage(unsigned &Linkage) {
+      bool HasLinkage; return ParseOptionalLinkage(Linkage, HasLinkage);
+    }
+    bool ParseOptionalVisibility(unsigned &Visibility);
+    bool ParseOptionalCallingConv(CallingConv::ID &CC);
+    bool ParseOptionalAlignment(unsigned &Alignment);
+    bool ParseScopeAndOrdering(bool isAtomic, SynchronizationScope &Scope,
+                               AtomicOrdering &Ordering);
+    bool ParseOptionalStackAlignment(unsigned &Alignment);
+    bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma);
+    bool ParseIndexList(SmallVectorImpl<unsigned> &Indices,bool &AteExtraComma);
+    bool ParseIndexList(SmallVectorImpl<unsigned> &Indices) {
+      bool AteExtraComma;
+      if (ParseIndexList(Indices, AteExtraComma)) return true;
+      if (AteExtraComma)
+        return TokError("expected index");
+      return false;
+    }
+
+    // Top-Level Entities
+    bool ParseTopLevelEntities();
+    bool ValidateEndOfModule();
+    bool ParseTargetDefinition();
+    bool ParseModuleAsm();
+    bool ParseDepLibs();        // FIXME: Remove in 4.0.
+    bool ParseUnnamedType();
+    bool ParseNamedType();
+    bool ParseDeclare();
+    bool ParseDefine();
+
+    bool ParseGlobalType(bool &IsConstant);
+    bool ParseUnnamedGlobal();
+    bool ParseNamedGlobal();
+    bool ParseGlobal(const std::string &Name, LocTy Loc, unsigned Linkage,
+                     bool HasLinkage, unsigned Visibility);
+    bool ParseAlias(const std::string &Name, LocTy Loc, unsigned Visibility);
+    bool ParseStandaloneMetadata();
+    bool ParseNamedMetadata();
+    bool ParseMDString(MDString *&Result);
+    bool ParseMDNodeID(MDNode *&Result);
+    bool ParseMDNodeID(MDNode *&Result, unsigned &SlotNo);
+    bool ParseUnnamedAttrGrp();
+    bool ParseFnAttributeValuePairs(AttrBuilder &B,
+                                    std::vector<unsigned> &FwdRefAttrGrps,
+                                    bool inAttrGrp, LocTy &NoBuiltinLoc);
+
+    // Type Parsing.
+    bool ParseType(Type *&Result, bool AllowVoid = false);
+    bool ParseType(Type *&Result, LocTy &Loc, bool AllowVoid = false) {
+      Loc = Lex.getLoc();
+      return ParseType(Result, AllowVoid);
+    }
+    bool ParseAnonStructType(Type *&Result, bool Packed);
+    bool ParseStructBody(SmallVectorImpl<Type*> &Body);
+    bool ParseStructDefinition(SMLoc TypeLoc, StringRef Name,
+                               std::pair<Type*, LocTy> &Entry,
+                               Type *&ResultTy);
+
+    bool ParseArrayVectorType(Type *&Result, bool isVector);
+    bool ParseFunctionType(Type *&Result);
+
+    // Function Semantic Analysis.
+    class PerFunctionState {
+      LLParser &P;
+      Function &F;
+      std::map<std::string, std::pair<Value*, LocTy> > ForwardRefVals;
+      std::map<unsigned, std::pair<Value*, LocTy> > ForwardRefValIDs;
+      std::vector<Value*> NumberedVals;
+
+      /// FunctionNumber - If this is an unnamed function, this is the slot
+      /// number of it, otherwise it is -1.
+      int FunctionNumber;
+    public:
+      PerFunctionState(LLParser &p, Function &f, int FunctionNumber);
+      ~PerFunctionState();
+
+      Function &getFunction() const { return F; }
+
+      bool FinishFunction();
+
+      /// GetVal - Get a value with the specified name or ID, creating a
+      /// forward reference record if needed.  This can return null if the value
+      /// exists but does not have the right type.
+      Value *GetVal(const std::string &Name, Type *Ty, LocTy Loc);
+      Value *GetVal(unsigned ID, Type *Ty, LocTy Loc);
+
+      /// SetInstName - After an instruction is parsed and inserted into its
+      /// basic block, this installs its name.
+      bool SetInstName(int NameID, const std::string &NameStr, LocTy NameLoc,
+                       Instruction *Inst);
+
+      /// GetBB - Get a basic block with the specified name or ID, creating a
+      /// forward reference record if needed.  This can return null if the value
+      /// is not a BasicBlock.
+      BasicBlock *GetBB(const std::string &Name, LocTy Loc);
+      BasicBlock *GetBB(unsigned ID, LocTy Loc);
+
+      /// DefineBB - Define the specified basic block, which is either named or
+      /// unnamed.  If there is an error, this returns null otherwise it returns
+      /// the block being defined.
+      BasicBlock *DefineBB(const std::string &Name, LocTy Loc);
+    };
+
+    bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
+                             PerFunctionState *PFS);
+
+    bool ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS);
+    bool ParseValue(Type *Ty, Value *&V, PerFunctionState &PFS) {
+      return ParseValue(Ty, V, &PFS);
+    }
+    bool ParseValue(Type *Ty, Value *&V, LocTy &Loc,
+                    PerFunctionState &PFS) {
+      Loc = Lex.getLoc();
+      return ParseValue(Ty, V, &PFS);
+    }
+
+    bool ParseTypeAndValue(Value *&V, PerFunctionState *PFS);
+    bool ParseTypeAndValue(Value *&V, PerFunctionState &PFS) {
+      return ParseTypeAndValue(V, &PFS);
+    }
+    bool ParseTypeAndValue(Value *&V, LocTy &Loc, PerFunctionState &PFS) {
+      Loc = Lex.getLoc();
+      return ParseTypeAndValue(V, PFS);
+    }
+    bool ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc,
+                                PerFunctionState &PFS);
+    bool ParseTypeAndBasicBlock(BasicBlock *&BB, PerFunctionState &PFS) {
+      LocTy Loc;
+      return ParseTypeAndBasicBlock(BB, Loc, PFS);
+    }
+
+
+    struct ParamInfo {
+      LocTy Loc;
+      Value *V;
+      AttributeSet Attrs;
+      ParamInfo(LocTy loc, Value *v, AttributeSet attrs)
+        : Loc(loc), V(v), Attrs(attrs) {}
+    };
+    bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
+                            PerFunctionState &PFS);
+
+    // Constant Parsing.
+    bool ParseValID(ValID &ID, PerFunctionState *PFS = NULL);
+    bool ParseGlobalValue(Type *Ty, Constant *&V);
+    bool ParseGlobalTypeAndValue(Constant *&V);
+    bool ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts);
+    bool ParseMetadataListValue(ValID &ID, PerFunctionState *PFS);
+    bool ParseMetadataValue(ValID &ID, PerFunctionState *PFS);
+    bool ParseMDNodeVector(SmallVectorImpl<Value*> &, PerFunctionState *PFS);
+    bool ParseInstructionMetadata(Instruction *Inst, PerFunctionState *PFS);
+
+    // Function Parsing.
+    struct ArgInfo {
+      LocTy Loc;
+      Type *Ty;
+      AttributeSet Attrs;
+      std::string Name;
+      ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N)
+        : Loc(L), Ty(ty), Attrs(Attr), Name(N) {}
+    };
+    bool ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, bool &isVarArg);
+    bool ParseFunctionHeader(Function *&Fn, bool isDefine);
+    bool ParseFunctionBody(Function &Fn);
+    bool ParseBasicBlock(PerFunctionState &PFS);
+
+    // Instruction Parsing.  Each instruction parsing routine can return with a
+    // normal result, an error result, or return having eaten an extra comma.
+    enum InstResult { InstNormal = 0, InstError = 1, InstExtraComma = 2 };
+    int ParseInstruction(Instruction *&Inst, BasicBlock *BB,
+                         PerFunctionState &PFS);
+    bool ParseCmpPredicate(unsigned &Pred, unsigned Opc);
+
+    bool ParseRet(Instruction *&Inst, BasicBlock *BB, PerFunctionState &PFS);
+    bool ParseBr(Instruction *&Inst, PerFunctionState &PFS);
+    bool ParseSwitch(Instruction *&Inst, PerFunctionState &PFS);
+    bool ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS);
+    bool ParseInvoke(Instruction *&Inst, PerFunctionState &PFS);
+    bool ParseResume(Instruction *&Inst, PerFunctionState &PFS);
+
+    bool ParseArithmetic(Instruction *&I, PerFunctionState &PFS, unsigned Opc,
+                         unsigned OperandType);
+    bool ParseLogical(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
+    bool ParseCompare(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
+    bool ParseCast(Instruction *&I, PerFunctionState &PFS, unsigned Opc);
+    bool ParseSelect(Instruction *&I, PerFunctionState &PFS);
+    bool ParseVA_Arg(Instruction *&I, PerFunctionState &PFS);
+    bool ParseExtractElement(Instruction *&I, PerFunctionState &PFS);
+    bool ParseInsertElement(Instruction *&I, PerFunctionState &PFS);
+    bool ParseShuffleVector(Instruction *&I, PerFunctionState &PFS);
+    int ParsePHI(Instruction *&I, PerFunctionState &PFS);
+    bool ParseLandingPad(Instruction *&I, PerFunctionState &PFS);
+    bool ParseCall(Instruction *&I, PerFunctionState &PFS, bool isTail);
+    int ParseAlloc(Instruction *&I, PerFunctionState &PFS);
+    int ParseLoad(Instruction *&I, PerFunctionState &PFS);
+    int ParseStore(Instruction *&I, PerFunctionState &PFS);
+    int ParseCmpXchg(Instruction *&I, PerFunctionState &PFS);
+    int ParseAtomicRMW(Instruction *&I, PerFunctionState &PFS);
+    int ParseFence(Instruction *&I, PerFunctionState &PFS);
+    int ParseGetElementPtr(Instruction *&I, PerFunctionState &PFS);
+    int ParseExtractValue(Instruction *&I, PerFunctionState &PFS);
+    int ParseInsertValue(Instruction *&I, PerFunctionState &PFS);
+
+    bool ResolveForwardRefBlockAddresses(Function *TheFn,
+                             std::vector<std::pair<ValID, GlobalValue*> > &Refs,
+                                         PerFunctionState *PFS);
+  };
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/AsmParser/LLToken.h b/contrib/llvm/lib/AsmParser/LLToken.h
new file mode 100644
index 000000000000..cd25ba30008f
--- /dev/null
+++ b/contrib/llvm/lib/AsmParser/LLToken.h
@@ -0,0 +1,181 @@
+//===- LLToken.h - Token Codes for LLVM Assembly Files ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the enums for the .ll lexer.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBS_ASMPARSER_LLTOKEN_H
+#define LIBS_ASMPARSER_LLTOKEN_H
+
+namespace llvm {
+namespace lltok {
+  enum Kind {
+    // Markers
+    Eof, Error,
+
+    // Tokens with no info.
+    dotdotdot,         // ...
+    equal, comma,      // =  ,
+    star,              // *
+    lsquare, rsquare,  // [  ]
+    lbrace, rbrace,    // {  }
+    less, greater,     // <  >
+    lparen, rparen,    // (  )
+    backslash,         // \    (not /)
+    exclaim,           // !
+    hash,              // #
+
+    kw_x,
+    kw_true,    kw_false,
+    kw_declare, kw_define,
+    kw_global,  kw_constant,
+
+    kw_private, kw_linker_private, kw_linker_private_weak,
+    kw_linker_private_weak_def_auto, // FIXME: For backwards compatibility.
+    kw_internal,
+    kw_linkonce, kw_linkonce_odr, kw_linkonce_odr_auto_hide,
+    kw_weak, kw_weak_odr, kw_appending,
+    kw_dllimport, kw_dllexport, kw_common, kw_available_externally,
+    kw_default, kw_hidden, kw_protected,
+    kw_unnamed_addr,
+    kw_externally_initialized,
+    kw_extern_weak,
+    kw_external, kw_thread_local,
+    kw_localdynamic, kw_initialexec, kw_localexec,
+    kw_zeroinitializer,
+    kw_undef, kw_null,
+    kw_to,
+    kw_tail,
+    kw_target,
+    kw_triple,
+    kw_unwind,
+    kw_deplibs,                 // FIXME: Remove in 4.0
+    kw_datalayout,
+    kw_volatile,
+    kw_atomic,
+    kw_unordered, kw_monotonic, kw_acquire, kw_release, kw_acq_rel, kw_seq_cst,
+    kw_singlethread,
+    kw_nnan,
+    kw_ninf,
+    kw_nsz,
+    kw_arcp,
+    kw_fast,
+    kw_nuw,
+    kw_nsw,
+    kw_exact,
+    kw_inbounds,
+    kw_align,
+    kw_addrspace,
+    kw_section,
+    kw_alias,
+    kw_module,
+    kw_asm,
+    kw_sideeffect,
+    kw_alignstack,
+    kw_inteldialect,
+    kw_gc,
+    kw_c,
+
+    kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
+	  kw_intel_ocl_bicc,
+    kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
+    kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
+    kw_msp430_intrcc,
+    kw_ptx_kernel, kw_ptx_device,
+    kw_spir_kernel, kw_spir_func,
+
+    // Attributes:
+    kw_attributes,
+    kw_alwaysinline,
+    kw_sanitize_address,
+    kw_byval,
+    kw_inlinehint,
+    kw_inreg,
+    kw_minsize,
+    kw_naked,
+    kw_nest,
+    kw_noalias,
+    kw_nobuiltin,
+    kw_nocapture,
+    kw_noduplicate,
+    kw_noimplicitfloat,
+    kw_noinline,
+    kw_nonlazybind,
+    kw_noredzone,
+    kw_noreturn,
+    kw_nounwind,
+    kw_optsize,
+    kw_readnone,
+    kw_readonly,
+    kw_returns_twice,
+    kw_signext,
+    kw_ssp,
+    kw_sspreq,
+    kw_sspstrong,
+    kw_sret,
+    kw_sanitize_thread,
+    kw_sanitize_memory,
+    kw_uwtable,
+    kw_zeroext,
+
+    kw_type,
+    kw_opaque,
+
+    kw_eq, kw_ne, kw_slt, kw_sgt, kw_sle, kw_sge, kw_ult, kw_ugt, kw_ule,
+    kw_uge, kw_oeq, kw_one, kw_olt, kw_ogt, kw_ole, kw_oge, kw_ord, kw_uno,
+    kw_ueq, kw_une,
+
+    // atomicrmw operations that aren't also instruction keywords.
+    kw_xchg, kw_nand, kw_max, kw_min, kw_umax, kw_umin,
+
+    // Instruction Opcodes (Opcode in UIntVal).
+    kw_add,  kw_fadd, kw_sub,  kw_fsub, kw_mul,  kw_fmul,
+    kw_udiv, kw_sdiv, kw_fdiv,
+    kw_urem, kw_srem, kw_frem, kw_shl,  kw_lshr, kw_ashr,
+    kw_and,  kw_or,   kw_xor,  kw_icmp, kw_fcmp,
+
+    kw_phi, kw_call,
+    kw_trunc, kw_zext, kw_sext, kw_fptrunc, kw_fpext, kw_uitofp, kw_sitofp,
+    kw_fptoui, kw_fptosi, kw_inttoptr, kw_ptrtoint, kw_bitcast,
+    kw_select, kw_va_arg,
+
+    kw_landingpad, kw_personality, kw_cleanup, kw_catch, kw_filter,
+
+    kw_ret, kw_br, kw_switch, kw_indirectbr, kw_invoke, kw_resume,
+    kw_unreachable,
+
+    kw_alloca, kw_load, kw_store, kw_fence, kw_cmpxchg, kw_atomicrmw,
+    kw_getelementptr,
+
+    kw_extractelement, kw_insertelement, kw_shufflevector,
+    kw_extractvalue, kw_insertvalue, kw_blockaddress,
+
+    // Unsigned Valued tokens (UIntVal).
+    GlobalID,          // @42
+    LocalVarID,        // %42
+    AttrGrpID,         // #42
+
+    // String valued tokens (StrVal).
+    LabelStr,          // foo:
+    GlobalVar,         // @foo @"foo"
+    LocalVar,          // %foo %"foo"
+    MetadataVar,       // !foo
+    StringConstant,    // "foo"
+
+    // Type valued tokens (TyVal).
+    Type,
+
+    APFloat,  // APFloatVal
+    APSInt // APSInt
+  };
+} // end namespace lltok
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/AsmParser/Parser.cpp b/contrib/llvm/lib/AsmParser/Parser.cpp
new file mode 100644
index 000000000000..bb4f03bacc17
--- /dev/null
+++ b/contrib/llvm/lib/AsmParser/Parser.cpp
@@ -0,0 +1,62 @@
+//===- Parser.cpp - Main dispatch module for the Parser library -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library implements the functionality defined in llvm/Assembly/Parser.h
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Assembly/Parser.h"
+#include "LLParser.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <cstring>
+using namespace llvm;
+
+Module *llvm::ParseAssembly(MemoryBuffer *F,
+                            Module *M,
+                            SMDiagnostic &Err,
+                            LLVMContext &Context) {
+  SourceMgr SM;
+  SM.AddNewSourceBuffer(F, SMLoc());
+
+  // If we are parsing into an existing module, do it.
+  if (M)
+    return LLParser(F, SM, Err, M).Run() ? 0 : M;
+
+  // Otherwise create a new module.
+  OwningPtr<Module> M2(new Module(F->getBufferIdentifier(), Context));
+  if (LLParser(F, SM, Err, M2.get()).Run())
+    return 0;
+  return M2.take();
+}
+
+Module *llvm::ParseAssemblyFile(const std::string &Filename, SMDiagnostic &Err,
+                                LLVMContext &Context) {
+  OwningPtr<MemoryBuffer> File;
+  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename.c_str(), File)) {
+    Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
+                       "Could not open input file: " + ec.message());
+    return 0;
+  }
+
+  return ParseAssembly(File.take(), 0, Err, Context);
+}
+
+Module *llvm::ParseAssemblyString(const char *AsmString, Module *M,
+                                  SMDiagnostic &Err, LLVMContext &Context) {
+  MemoryBuffer *F =
+    MemoryBuffer::getMemBuffer(StringRef(AsmString, strlen(AsmString)),
+                               "<string>");
+
+  return ParseAssembly(F, M, Err, Context);
+}
diff --git a/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp b/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp
new file mode 100644
index 000000000000..5cd6c552bd8a
--- /dev/null
+++ b/contrib/llvm/lib/Bitcode/Reader/BitReader.cpp
@@ -0,0 +1,88 @@
+//===-- BitReader.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/BitReader.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstring>
+#include <string>
+
+using namespace llvm;
+
+/* Builds a module from the bitcode in the specified memory buffer, returning a
+   reference to the module via the OutModule parameter. Returns 0 on success.
+   Optionally returns a human-readable error message via OutMessage. */
+LLVMBool LLVMParseBitcode(LLVMMemoryBufferRef MemBuf,
+                          LLVMModuleRef *OutModule, char **OutMessage) {
+  return LLVMParseBitcodeInContext(wrap(&getGlobalContext()), MemBuf, OutModule,
+                                   OutMessage);
+}
+
+LLVMBool LLVMParseBitcodeInContext(LLVMContextRef ContextRef,
+                                   LLVMMemoryBufferRef MemBuf,
+                                   LLVMModuleRef *OutModule,
+                                   char **OutMessage) {
+  std::string Message;
+
+  *OutModule = wrap(ParseBitcodeFile(unwrap(MemBuf), *unwrap(ContextRef),
+                                     &Message));
+  if (!*OutModule) {
+    if (OutMessage)
+      *OutMessage = strdup(Message.c_str());
+    return 1;
+  }
+
+  return 0;
+}
+
+/* Reads a module from the specified path, returning via the OutModule parameter
+   a module provider which performs lazy deserialization. Returns 0 on success.
+   Optionally returns a human-readable error message via OutMessage. */
+LLVMBool LLVMGetBitcodeModuleInContext(LLVMContextRef ContextRef,
+                                       LLVMMemoryBufferRef MemBuf,
+                                       LLVMModuleRef *OutM,
+                                       char **OutMessage) {
+  std::string Message;
+
+  *OutM = wrap(getLazyBitcodeModule(unwrap(MemBuf), *unwrap(ContextRef),
+                                    &Message));
+  if (!*OutM) {
+    if (OutMessage)
+      *OutMessage = strdup(Message.c_str());
+    return 1;
+  }
+
+  return 0;
+
+}
+
+LLVMBool LLVMGetBitcodeModule(LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM,
+                              char **OutMessage) {
+  return LLVMGetBitcodeModuleInContext(LLVMGetGlobalContext(), MemBuf, OutM,
+                                       OutMessage);
+}
+
+/* Deprecated: Use LLVMGetBitcodeModuleInContext instead. */
+LLVMBool LLVMGetBitcodeModuleProviderInContext(LLVMContextRef ContextRef,
+                                               LLVMMemoryBufferRef MemBuf,
+                                               LLVMModuleProviderRef *OutMP,
+                                               char **OutMessage) {
+  return LLVMGetBitcodeModuleInContext(ContextRef, MemBuf,
+                                       reinterpret_cast<LLVMModuleRef*>(OutMP),
+                                       OutMessage);
+}
+
+/* Deprecated: Use LLVMGetBitcodeModule instead. */
+LLVMBool LLVMGetBitcodeModuleProvider(LLVMMemoryBufferRef MemBuf,
+                                      LLVMModuleProviderRef *OutMP,
+                                      char **OutMessage) {
+  return LLVMGetBitcodeModuleProviderInContext(LLVMGetGlobalContext(), MemBuf,
+                                               OutMP, OutMessage);
+}
diff --git a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
new file mode 100644
index 000000000000..f34884391a74
--- /dev/null
+++ b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -0,0 +1,3111 @@
+//===- BitcodeReader.cpp - Internal BitcodeReader implementation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "BitcodeReader.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/AutoUpgrade.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/DataStream.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+using namespace llvm;
+
+enum {
+  SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex
+};
+
+void BitcodeReader::materializeForwardReferencedFunctions() {
+  while (!BlockAddrFwdRefs.empty()) {
+    Function *F = BlockAddrFwdRefs.begin()->first;
+    F->Materialize();
+  }
+}
+
+void BitcodeReader::FreeState() {
+  if (BufferOwned)
+    delete Buffer;
+  Buffer = 0;
+  std::vector<Type*>().swap(TypeList);
+  ValueList.clear();
+  MDValueList.clear();
+
+  std::vector<AttributeSet>().swap(MAttributes);
+  std::vector<BasicBlock*>().swap(FunctionBBs);
+  std::vector<Function*>().swap(FunctionsWithBodies);
+  DeferredFunctionInfo.clear();
+  MDKindMap.clear();
+
+  assert(BlockAddrFwdRefs.empty() && "Unresolved blockaddress fwd references");
+}
+
+//===----------------------------------------------------------------------===//
+//  Helper functions to implement forward reference resolution, etc.
+//===----------------------------------------------------------------------===//
+
+/// ConvertToString - Convert a string from a record into an std::string, return
+/// true on failure.
+template<typename StrTy>
+static bool ConvertToString(ArrayRef<uint64_t> Record, unsigned Idx,
+                            StrTy &Result) {
+  if (Idx > Record.size())
+    return true;
+
+  for (unsigned i = Idx, e = Record.size(); i != e; ++i)
+    Result += (char)Record[i];
+  return false;
+}
+
+static GlobalValue::LinkageTypes GetDecodedLinkage(unsigned Val) {
+  switch (Val) {
+  default: // Map unknown/new linkages to external
+  case 0:  return GlobalValue::ExternalLinkage;
+  case 1:  return GlobalValue::WeakAnyLinkage;
+  case 2:  return GlobalValue::AppendingLinkage;
+  case 3:  return GlobalValue::InternalLinkage;
+  case 4:  return GlobalValue::LinkOnceAnyLinkage;
+  case 5:  return GlobalValue::DLLImportLinkage;
+  case 6:  return GlobalValue::DLLExportLinkage;
+  case 7:  return GlobalValue::ExternalWeakLinkage;
+  case 8:  return GlobalValue::CommonLinkage;
+  case 9:  return GlobalValue::PrivateLinkage;
+  case 10: return GlobalValue::WeakODRLinkage;
+  case 11: return GlobalValue::LinkOnceODRLinkage;
+  case 12: return GlobalValue::AvailableExternallyLinkage;
+  case 13: return GlobalValue::LinkerPrivateLinkage;
+  case 14: return GlobalValue::LinkerPrivateWeakLinkage;
+  case 15: return GlobalValue::LinkOnceODRAutoHideLinkage;
+  }
+}
+
+static GlobalValue::VisibilityTypes GetDecodedVisibility(unsigned Val) {
+  switch (Val) {
+  default: // Map unknown visibilities to default.
+  case 0: return GlobalValue::DefaultVisibility;
+  case 1: return GlobalValue::HiddenVisibility;
+  case 2: return GlobalValue::ProtectedVisibility;
+  }
+}
+
+static GlobalVariable::ThreadLocalMode GetDecodedThreadLocalMode(unsigned Val) {
+  switch (Val) {
+    case 0: return GlobalVariable::NotThreadLocal;
+    default: // Map unknown non-zero value to general dynamic.
+    case 1: return GlobalVariable::GeneralDynamicTLSModel;
+    case 2: return GlobalVariable::LocalDynamicTLSModel;
+    case 3: return GlobalVariable::InitialExecTLSModel;
+    case 4: return GlobalVariable::LocalExecTLSModel;
+  }
+}
+
+static int GetDecodedCastOpcode(unsigned Val) {
+  switch (Val) {
+  default: return -1;
+  case bitc::CAST_TRUNC   : return Instruction::Trunc;
+  case bitc::CAST_ZEXT    : return Instruction::ZExt;
+  case bitc::CAST_SEXT    : return Instruction::SExt;
+  case bitc::CAST_FPTOUI  : return Instruction::FPToUI;
+  case bitc::CAST_FPTOSI  : return Instruction::FPToSI;
+  case bitc::CAST_UITOFP  : return Instruction::UIToFP;
+  case bitc::CAST_SITOFP  : return Instruction::SIToFP;
+  case bitc::CAST_FPTRUNC : return Instruction::FPTrunc;
+  case bitc::CAST_FPEXT   : return Instruction::FPExt;
+  case bitc::CAST_PTRTOINT: return Instruction::PtrToInt;
+  case bitc::CAST_INTTOPTR: return Instruction::IntToPtr;
+  case bitc::CAST_BITCAST : return Instruction::BitCast;
+  }
+}
+static int GetDecodedBinaryOpcode(unsigned Val, Type *Ty) {
+  switch (Val) {
+  default: return -1;
+  case bitc::BINOP_ADD:
+    return Ty->isFPOrFPVectorTy() ? Instruction::FAdd : Instruction::Add;
+  case bitc::BINOP_SUB:
+    return Ty->isFPOrFPVectorTy() ? Instruction::FSub : Instruction::Sub;
+  case bitc::BINOP_MUL:
+    return Ty->isFPOrFPVectorTy() ? Instruction::FMul : Instruction::Mul;
+  case bitc::BINOP_UDIV: return Instruction::UDiv;
+  case bitc::BINOP_SDIV:
+    return Ty->isFPOrFPVectorTy() ? Instruction::FDiv : Instruction::SDiv;
+  case bitc::BINOP_UREM: return Instruction::URem;
+  case bitc::BINOP_SREM:
+    return Ty->isFPOrFPVectorTy() ? Instruction::FRem : Instruction::SRem;
+  case bitc::BINOP_SHL:  return Instruction::Shl;
+  case bitc::BINOP_LSHR: return Instruction::LShr;
+  case bitc::BINOP_ASHR: return Instruction::AShr;
+  case bitc::BINOP_AND:  return Instruction::And;
+  case bitc::BINOP_OR:   return Instruction::Or;
+  case bitc::BINOP_XOR:  return Instruction::Xor;
+  }
+}
+
+static AtomicRMWInst::BinOp GetDecodedRMWOperation(unsigned Val) {
+  switch (Val) {
+  default: return AtomicRMWInst::BAD_BINOP;
+  case bitc::RMW_XCHG: return AtomicRMWInst::Xchg;
+  case bitc::RMW_ADD: return AtomicRMWInst::Add;
+  case bitc::RMW_SUB: return AtomicRMWInst::Sub;
+  case bitc::RMW_AND: return AtomicRMWInst::And;
+  case bitc::RMW_NAND: return AtomicRMWInst::Nand;
+  case bitc::RMW_OR: return AtomicRMWInst::Or;
+  case bitc::RMW_XOR: return AtomicRMWInst::Xor;
+  case bitc::RMW_MAX: return AtomicRMWInst::Max;
+  case bitc::RMW_MIN: return AtomicRMWInst::Min;
+  case bitc::RMW_UMAX: return AtomicRMWInst::UMax;
+  case bitc::RMW_UMIN: return AtomicRMWInst::UMin;
+  }
+}
+
+static AtomicOrdering GetDecodedOrdering(unsigned Val) {
+  switch (Val) {
+  case bitc::ORDERING_NOTATOMIC: return NotAtomic;
+  case bitc::ORDERING_UNORDERED: return Unordered;
+  case bitc::ORDERING_MONOTONIC: return Monotonic;
+  case bitc::ORDERING_ACQUIRE: return Acquire;
+  case bitc::ORDERING_RELEASE: return Release;
+  case bitc::ORDERING_ACQREL: return AcquireRelease;
+  default: // Map unknown orderings to sequentially-consistent.
+  case bitc::ORDERING_SEQCST: return SequentiallyConsistent;
+  }
+}
+
+static SynchronizationScope GetDecodedSynchScope(unsigned Val) {
+  switch (Val) {
+  case bitc::SYNCHSCOPE_SINGLETHREAD: return SingleThread;
+  default: // Map unknown scopes to cross-thread.
+  case bitc::SYNCHSCOPE_CROSSTHREAD: return CrossThread;
+  }
+}
+
+namespace llvm {
+namespace {
+  /// @brief A class for maintaining the slot number definition
+  /// as a placeholder for the actual definition for forward constants defs.
+  class ConstantPlaceHolder : public ConstantExpr {
+    void operator=(const ConstantPlaceHolder &) LLVM_DELETED_FUNCTION;
+  public:
+    // allocate space for exactly one operand
+    void *operator new(size_t s) {
+      return User::operator new(s, 1);
+    }
+    explicit ConstantPlaceHolder(Type *Ty, LLVMContext& Context)
+      : ConstantExpr(Ty, Instruction::UserOp1, &Op<0>(), 1) {
+      Op<0>() = UndefValue::get(Type::getInt32Ty(Context));
+    }
+
+    /// @brief Methods to support type inquiry through isa, cast, and dyn_cast.
+    static bool classof(const Value *V) {
+      return isa<ConstantExpr>(V) &&
+             cast<ConstantExpr>(V)->getOpcode() == Instruction::UserOp1;
+    }
+
+
+    /// Provide fast operand accessors
+    //DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+  };
+}
+
+// FIXME: can we inherit this from ConstantExpr?
+template <>
+struct OperandTraits<ConstantPlaceHolder> :
+  public FixedNumOperandTraits<ConstantPlaceHolder, 1> {
+};
+}
+
+
+void BitcodeReaderValueList::AssignValue(Value *V, unsigned Idx) {
+  if (Idx == size()) {
+    push_back(V);
+    return;
+  }
+
+  if (Idx >= size())
+    resize(Idx+1);
+
+  WeakVH &OldV = ValuePtrs[Idx];
+  if (OldV == 0) {
+    OldV = V;
+    return;
+  }
+
+  // Handle constants and non-constants (e.g. instrs) differently for
+  // efficiency.
+  if (Constant *PHC = dyn_cast<Constant>(&*OldV)) {
+    ResolveConstants.push_back(std::make_pair(PHC, Idx));
+    OldV = V;
+  } else {
+    // If there was a forward reference to this value, replace it.
+    Value *PrevVal = OldV;
+    OldV->replaceAllUsesWith(V);
+    delete PrevVal;
+  }
+}
+
+
+Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx,
+                                                    Type *Ty) {
+  if (Idx >= size())
+    resize(Idx + 1);
+
+  if (Value *V = ValuePtrs[Idx]) {
+    assert(Ty == V->getType() && "Type mismatch in constant table!");
+    return cast<Constant>(V);
+  }
+
+  // Create and return a placeholder, which will later be RAUW'd.
+  Constant *C = new ConstantPlaceHolder(Ty, Context);
+  ValuePtrs[Idx] = C;
+  return C;
+}
+
+Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) {
+  if (Idx >= size())
+    resize(Idx + 1);
+
+  if (Value *V = ValuePtrs[Idx]) {
+    assert((Ty == 0 || Ty == V->getType()) && "Type mismatch in value table!");
+    return V;
+  }
+
+  // No type specified, must be invalid reference.
+  if (Ty == 0) return 0;
+
+  // Create and return a placeholder, which will later be RAUW'd.
+  Value *V = new Argument(Ty);
+  ValuePtrs[Idx] = V;
+  return V;
+}
+
+/// ResolveConstantForwardRefs - Once all constants are read, this method bulk
+/// resolves any forward references.  The idea behind this is that we sometimes
+/// get constants (such as large arrays) which reference *many* forward ref
+/// constants.  Replacing each of these causes a lot of thrashing when
+/// building/reuniquing the constant.  Instead of doing this, we look at all the
+/// uses and rewrite all the place holders at once for any constant that uses
+/// a placeholder.
+void BitcodeReaderValueList::ResolveConstantForwardRefs() {
+  // Sort the values by-pointer so that they are efficient to look up with a
+  // binary search.
+  std::sort(ResolveConstants.begin(), ResolveConstants.end());
+
+  SmallVector<Constant*, 64> NewOps;
+
+  while (!ResolveConstants.empty()) {
+    Value *RealVal = operator[](ResolveConstants.back().second);
+    Constant *Placeholder = ResolveConstants.back().first;
+    ResolveConstants.pop_back();
+
+    // Loop over all users of the placeholder, updating them to reference the
+    // new value.  If they reference more than one placeholder, update them all
+    // at once.
+    while (!Placeholder->use_empty()) {
+      Value::use_iterator UI = Placeholder->use_begin();
+      User *U = *UI;
+
+      // If the using object isn't uniqued, just update the operands.  This
+      // handles instructions and initializers for global variables.
+      if (!isa<Constant>(U) || isa<GlobalValue>(U)) {
+        UI.getUse().set(RealVal);
+        continue;
+      }
+
+      // Otherwise, we have a constant that uses the placeholder.  Replace that
+      // constant with a new constant that has *all* placeholder uses updated.
+      Constant *UserC = cast<Constant>(U);
+      for (User::op_iterator I = UserC->op_begin(), E = UserC->op_end();
+           I != E; ++I) {
+        Value *NewOp;
+        if (!isa<ConstantPlaceHolder>(*I)) {
+          // Not a placeholder reference.
+          NewOp = *I;
+        } else if (*I == Placeholder) {
+          // Common case is that it just references this one placeholder.
+          NewOp = RealVal;
+        } else {
+          // Otherwise, look up the placeholder in ResolveConstants.
+          ResolveConstantsTy::iterator It =
+            std::lower_bound(ResolveConstants.begin(), ResolveConstants.end(),
+                             std::pair<Constant*, unsigned>(cast<Constant>(*I),
+                                                            0));
+          assert(It != ResolveConstants.end() && It->first == *I);
+          NewOp = operator[](It->second);
+        }
+
+        NewOps.push_back(cast<Constant>(NewOp));
+      }
+
+      // Make the new constant.
+      Constant *NewC;
+      if (ConstantArray *UserCA = dyn_cast<ConstantArray>(UserC)) {
+        NewC = ConstantArray::get(UserCA->getType(), NewOps);
+      } else if (ConstantStruct *UserCS = dyn_cast<ConstantStruct>(UserC)) {
+        NewC = ConstantStruct::get(UserCS->getType(), NewOps);
+      } else if (isa<ConstantVector>(UserC)) {
+        NewC = ConstantVector::get(NewOps);
+      } else {
+        assert(isa<ConstantExpr>(UserC) && "Must be a ConstantExpr.");
+        NewC = cast<ConstantExpr>(UserC)->getWithOperands(NewOps);
+      }
+
+      UserC->replaceAllUsesWith(NewC);
+      UserC->destroyConstant();
+      NewOps.clear();
+    }
+
+    // Update all ValueHandles, they should be the only users at this point.
+    Placeholder->replaceAllUsesWith(RealVal);
+    delete Placeholder;
+  }
+}
+
+void BitcodeReaderMDValueList::AssignValue(Value *V, unsigned Idx) {
+  if (Idx == size()) {
+    push_back(V);
+    return;
+  }
+
+  if (Idx >= size())
+    resize(Idx+1);
+
+  WeakVH &OldV = MDValuePtrs[Idx];
+  if (OldV == 0) {
+    OldV = V;
+    return;
+  }
+
+  // If there was a forward reference to this value, replace it.
+  MDNode *PrevVal = cast<MDNode>(OldV);
+  OldV->replaceAllUsesWith(V);
+  MDNode::deleteTemporary(PrevVal);
+  // Deleting PrevVal sets Idx value in MDValuePtrs to null. Set new
+  // value for Idx.
+  MDValuePtrs[Idx] = V;
+}
+
+Value *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) {
+  if (Idx >= size())
+    resize(Idx + 1);
+
+  if (Value *V = MDValuePtrs[Idx]) {
+    assert(V->getType()->isMetadataTy() && "Type mismatch in value table!");
+    return V;
+  }
+
+  // Create and return a placeholder, which will later be RAUW'd.
+  Value *V = MDNode::getTemporary(Context, ArrayRef<Value*>());
+  MDValuePtrs[Idx] = V;
+  return V;
+}
+
+Type *BitcodeReader::getTypeByID(unsigned ID) {
+  // The type table size is always specified correctly.
+  if (ID >= TypeList.size())
+    return 0;
+
+  if (Type *Ty = TypeList[ID])
+    return Ty;
+
+  // If we have a forward reference, the only possible case is when it is to a
+  // named struct.  Just create a placeholder for now.
+  return TypeList[ID] = StructType::create(Context);
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Functions for parsing blocks from the bitcode file
+//===----------------------------------------------------------------------===//
+
+
+/// \brief This fills an AttrBuilder object with the LLVM attributes that have
+/// been decoded from the given integer. This function must stay in sync with
+/// 'encodeLLVMAttributesForBitcode'.
+static void decodeLLVMAttributesForBitcode(AttrBuilder &B,
+                                           uint64_t EncodedAttrs) {
+  // FIXME: Remove in 4.0.
+
+  // The alignment is stored as a 16-bit raw value from bits 31--16.  We shift
+  // the bits above 31 down by 11 bits.
+  unsigned Alignment = (EncodedAttrs & (0xffffULL << 16)) >> 16;
+  assert((!Alignment || isPowerOf2_32(Alignment)) &&
+         "Alignment must be a power of two.");
+
+  if (Alignment)
+    B.addAlignmentAttr(Alignment);
+  B.addRawValue(((EncodedAttrs & (0xfffffULL << 32)) >> 11) |
+                (EncodedAttrs & 0xffff));
+}
+
+bool BitcodeReader::ParseAttributeBlock() {
+  if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID))
+    return Error("Malformed block record");
+
+  if (!MAttributes.empty())
+    return Error("Multiple PARAMATTR blocks found!");
+
+  SmallVector<uint64_t, 64> Record;
+
+  SmallVector<AttributeSet, 8> Attrs;
+
+  // Read all the records.
+  while (1) {
+    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::SubBlock: // Handled for us already.
+    case BitstreamEntry::Error:
+      return Error("Error at end of PARAMATTR block");
+    case BitstreamEntry::EndBlock:
+      return false;
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read a record.
+    Record.clear();
+    switch (Stream.readRecord(Entry.ID, Record)) {
+    default:  // Default behavior: ignore.
+      break;
+    case bitc::PARAMATTR_CODE_ENTRY_OLD: { // ENTRY: [paramidx0, attr0, ...]
+      // FIXME: Remove in 4.0.
+      if (Record.size() & 1)
+        return Error("Invalid ENTRY record");
+
+      for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
+        AttrBuilder B;
+        decodeLLVMAttributesForBitcode(B, Record[i+1]);
+        Attrs.push_back(AttributeSet::get(Context, Record[i], B));
+      }
+
+      MAttributes.push_back(AttributeSet::get(Context, Attrs));
+      Attrs.clear();
+      break;
+    }
+    case bitc::PARAMATTR_CODE_ENTRY: { // ENTRY: [attrgrp0, attrgrp1, ...]
+      for (unsigned i = 0, e = Record.size(); i != e; ++i)
+        Attrs.push_back(MAttributeGroups[Record[i]]);
+
+      MAttributes.push_back(AttributeSet::get(Context, Attrs));
+      Attrs.clear();
+      break;
+    }
+    }
+  }
+}
+
+bool BitcodeReader::ParseAttributeGroupBlock() {
+  if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID))
+    return Error("Malformed block record");
+
+  if (!MAttributeGroups.empty())
+    return Error("Multiple PARAMATTR_GROUP blocks found!");
+
+  SmallVector<uint64_t, 64> Record;
+
+  // Read all the records.
+  while (1) {
+    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::SubBlock: // Handled for us already.
+    case BitstreamEntry::Error:
+      return Error("Error at end of PARAMATTR_GROUP block");
+    case BitstreamEntry::EndBlock:
+      return false;
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read a record.
+    Record.clear();
+    switch (Stream.readRecord(Entry.ID, Record)) {
+    default:  // Default behavior: ignore.
+      break;
+    case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...]
+      if (Record.size() < 3)
+        return Error("Invalid ENTRY record");
+
+      uint64_t GrpID = Record[0];
+      uint64_t Idx = Record[1]; // Index of the object this attribute refers to.
+
+      AttrBuilder B;
+      for (unsigned i = 2, e = Record.size(); i != e; ++i) {
+        if (Record[i] == 0) {        // Enum attribute
+          B.addAttribute(Attribute::AttrKind(Record[++i]));
+        } else if (Record[i] == 1) { // Align attribute
+          if (Attribute::AttrKind(Record[++i]) == Attribute::Alignment)
+            B.addAlignmentAttr(Record[++i]);
+          else
+            B.addStackAlignmentAttr(Record[++i]);
+        } else {                     // String attribute
+          assert((Record[i] == 3 || Record[i] == 4) &&
+                 "Invalid attribute group entry");
+          bool HasValue = (Record[i++] == 4);
+          SmallString<64> KindStr;
+          SmallString<64> ValStr;
+
+          while (Record[i] != 0 && i != e)
+            KindStr += Record[i++];
+          assert(Record[i] == 0 && "Kind string not null terminated");
+
+          if (HasValue) {
+            // Has a value associated with it.
+            ++i; // Skip the '0' that terminates the "kind" string.
+            while (Record[i] != 0 && i != e)
+              ValStr += Record[i++];
+            assert(Record[i] == 0 && "Value string not null terminated");
+          }
+
+          B.addAttribute(KindStr.str(), ValStr.str());
+        }
+      }
+
+      MAttributeGroups[GrpID] = AttributeSet::get(Context, Idx, B);
+      break;
+    }
+    }
+  }
+}
+
+bool BitcodeReader::ParseTypeTable() {
+  if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW))
+    return Error("Malformed block record");
+
+  return ParseTypeTableBody();
+}
+
+bool BitcodeReader::ParseTypeTableBody() {
+  if (!TypeList.empty())
+    return Error("Multiple TYPE_BLOCKs found!");
+
+  SmallVector<uint64_t, 64> Record;
+  unsigned NumRecords = 0;
+
+  SmallString<64> TypeName;
+
+  // Read all the records for this type table.
+  while (1) {
+    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::SubBlock: // Handled for us already.
+    case BitstreamEntry::Error:
+      Error("Error in the type table block");
+      return true;
+    case BitstreamEntry::EndBlock:
+      if (NumRecords != TypeList.size())
+        return Error("Invalid type forward reference in TYPE_BLOCK");
+      return false;
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read a record.
+    Record.clear();
+    Type *ResultTy = 0;
+    switch (Stream.readRecord(Entry.ID, Record)) {
+    default: return Error("unknown type in type table");
+    case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries]
+      // TYPE_CODE_NUMENTRY contains a count of the number of types in the
+      // type list.  This allows us to reserve space.
+      if (Record.size() < 1)
+        return Error("Invalid TYPE_CODE_NUMENTRY record");
+      TypeList.resize(Record[0]);
+      continue;
+    case bitc::TYPE_CODE_VOID:      // VOID
+      ResultTy = Type::getVoidTy(Context);
+      break;
+    case bitc::TYPE_CODE_HALF:     // HALF
+      ResultTy = Type::getHalfTy(Context);
+      break;
+    case bitc::TYPE_CODE_FLOAT:     // FLOAT
+      ResultTy = Type::getFloatTy(Context);
+      break;
+    case bitc::TYPE_CODE_DOUBLE:    // DOUBLE
+      ResultTy = Type::getDoubleTy(Context);
+      break;
+    case bitc::TYPE_CODE_X86_FP80:  // X86_FP80
+      ResultTy = Type::getX86_FP80Ty(Context);
+      break;
+    case bitc::TYPE_CODE_FP128:     // FP128
+      ResultTy = Type::getFP128Ty(Context);
+      break;
+    case bitc::TYPE_CODE_PPC_FP128: // PPC_FP128
+      ResultTy = Type::getPPC_FP128Ty(Context);
+      break;
+    case bitc::TYPE_CODE_LABEL:     // LABEL
+      ResultTy = Type::getLabelTy(Context);
+      break;
+    case bitc::TYPE_CODE_METADATA:  // METADATA
+      ResultTy = Type::getMetadataTy(Context);
+      break;
+    case bitc::TYPE_CODE_X86_MMX:   // X86_MMX
+      ResultTy = Type::getX86_MMXTy(Context);
+      break;
+    case bitc::TYPE_CODE_INTEGER:   // INTEGER: [width]
+      if (Record.size() < 1)
+        return Error("Invalid Integer type record");
+
+      ResultTy = IntegerType::get(Context, Record[0]);
+      break;
+    case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or
+                                    //          [pointee type, address space]
+      if (Record.size() < 1)
+        return Error("Invalid POINTER type record");
+      unsigned AddressSpace = 0;
+      if (Record.size() == 2)
+        AddressSpace = Record[1];
+      ResultTy = getTypeByID(Record[0]);
+      if (ResultTy == 0) return Error("invalid element type in pointer type");
+      ResultTy = PointerType::get(ResultTy, AddressSpace);
+      break;
+    }
+    case bitc::TYPE_CODE_FUNCTION_OLD: {
+      // FIXME: attrid is dead, remove it in LLVM 4.0
+      // FUNCTION: [vararg, attrid, retty, paramty x N]
+      if (Record.size() < 3)
+        return Error("Invalid FUNCTION type record");
+      SmallVector<Type*, 8> ArgTys;
+      for (unsigned i = 3, e = Record.size(); i != e; ++i) {
+        if (Type *T = getTypeByID(Record[i]))
+          ArgTys.push_back(T);
+        else
+          break;
+      }
+
+      ResultTy = getTypeByID(Record[2]);
+      if (ResultTy == 0 || ArgTys.size() < Record.size()-3)
+        return Error("invalid type in function type");
+
+      ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
+      break;
+    }
+    case bitc::TYPE_CODE_FUNCTION: {
+      // FUNCTION: [vararg, retty, paramty x N]
+      if (Record.size() < 2)
+        return Error("Invalid FUNCTION type record");
+      SmallVector<Type*, 8> ArgTys;
+      for (unsigned i = 2, e = Record.size(); i != e; ++i) {
+        if (Type *T = getTypeByID(Record[i]))
+          ArgTys.push_back(T);
+        else
+          break;
+      }
+
+      ResultTy = getTypeByID(Record[1]);
+      if (ResultTy == 0 || ArgTys.size() < Record.size()-2)
+        return Error("invalid type in function type");
+
+      ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
+      break;
+    }
+    case bitc::TYPE_CODE_STRUCT_ANON: {  // STRUCT: [ispacked, eltty x N]
+      if (Record.size() < 1)
+        return Error("Invalid STRUCT type record");
+      SmallVector<Type*, 8> EltTys;
+      for (unsigned i = 1, e = Record.size(); i != e; ++i) {
+        if (Type *T = getTypeByID(Record[i]))
+          EltTys.push_back(T);
+        else
+          break;
+      }
+      if (EltTys.size() != Record.size()-1)
+        return Error("invalid type in struct type");
+      ResultTy = StructType::get(Context, EltTys, Record[0]);
+      break;
+    }
+    case bitc::TYPE_CODE_STRUCT_NAME:   // STRUCT_NAME: [strchr x N]
+      if (ConvertToString(Record, 0, TypeName))
+        return Error("Invalid STRUCT_NAME record");
+      continue;
+
+    case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N]
+      if (Record.size() < 1)
+        return Error("Invalid STRUCT type record");
+
+      if (NumRecords >= TypeList.size())
+        return Error("invalid TYPE table");
+
+      // Check to see if this was forward referenced, if so fill in the temp.
+      StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
+      if (Res) {
+        Res->setName(TypeName);
+        TypeList[NumRecords] = 0;
+      } else  // Otherwise, create a new struct.
+        Res = StructType::create(Context, TypeName);
+      TypeName.clear();
+
+      SmallVector<Type*, 8> EltTys;
+      for (unsigned i = 1, e = Record.size(); i != e; ++i) {
+        if (Type *T = getTypeByID(Record[i]))
+          EltTys.push_back(T);
+        else
+          break;
+      }
+      if (EltTys.size() != Record.size()-1)
+        return Error("invalid STRUCT type record");
+      Res->setBody(EltTys, Record[0]);
+      ResultTy = Res;
+      break;
+    }
+    case bitc::TYPE_CODE_OPAQUE: {       // OPAQUE: []
+      if (Record.size() != 1)
+        return Error("Invalid OPAQUE type record");
+
+      if (NumRecords >= TypeList.size())
+        return Error("invalid TYPE table");
+
+      // Check to see if this was forward referenced, if so fill in the temp.
+      StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
+      if (Res) {
+        Res->setName(TypeName);
+        TypeList[NumRecords] = 0;
+      } else  // Otherwise, create a new struct with no body.
+        Res = StructType::create(Context, TypeName);
+      TypeName.clear();
+      ResultTy = Res;
+      break;
+    }
+    case bitc::TYPE_CODE_ARRAY:     // ARRAY: [numelts, eltty]
+      if (Record.size() < 2)
+        return Error("Invalid ARRAY type record");
+      if ((ResultTy = getTypeByID(Record[1])))
+        ResultTy = ArrayType::get(ResultTy, Record[0]);
+      else
+        return Error("Invalid ARRAY type element");
+      break;
+    case bitc::TYPE_CODE_VECTOR:    // VECTOR: [numelts, eltty]
+      if (Record.size() < 2)
+        return Error("Invalid VECTOR type record");
+      if ((ResultTy = getTypeByID(Record[1])))
+        ResultTy = VectorType::get(ResultTy, Record[0]);
+      else
+        return Error("Invalid ARRAY type element");
+      break;
+    }
+
+    if (NumRecords >= TypeList.size())
+      return Error("invalid TYPE table");
+    assert(ResultTy && "Didn't read a type?");
+    assert(TypeList[NumRecords] == 0 && "Already read type?");
+    TypeList[NumRecords++] = ResultTy;
+  }
+}
+
+bool BitcodeReader::ParseValueSymbolTable() {
+  if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
+    return Error("Malformed block record");
+
+  SmallVector<uint64_t, 64> Record;
+
+  // Read all the records for this value table.
+  SmallString<128> ValueName;
+  while (1) {
+    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::SubBlock: // Handled for us already.
+    case BitstreamEntry::Error:
+      return Error("malformed value symbol table block");
+    case BitstreamEntry::EndBlock:
+      return false;
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read a record.
+    Record.clear();
+    switch (Stream.readRecord(Entry.ID, Record)) {
+    default:  // Default behavior: unknown type.
+      break;
+    case bitc::VST_CODE_ENTRY: {  // VST_ENTRY: [valueid, namechar x N]
+      if (ConvertToString(Record, 1, ValueName))
+        return Error("Invalid VST_ENTRY record");
+      unsigned ValueID = Record[0];
+      if (ValueID >= ValueList.size())
+        return Error("Invalid Value ID in VST_ENTRY record");
+      Value *V = ValueList[ValueID];
+
+      V->setName(StringRef(ValueName.data(), ValueName.size()));
+      ValueName.clear();
+      break;
+    }
+    case bitc::VST_CODE_BBENTRY: {
+      if (ConvertToString(Record, 1, ValueName))
+        return Error("Invalid VST_BBENTRY record");
+      BasicBlock *BB = getBasicBlock(Record[0]);
+      if (BB == 0)
+        return Error("Invalid BB ID in VST_BBENTRY record");
+
+      BB->setName(StringRef(ValueName.data(), ValueName.size()));
+      ValueName.clear();
+      break;
+    }
+    }
+  }
+}
+
+bool BitcodeReader::ParseMetadata() {
+  unsigned NextMDValueNo = MDValueList.size();
+
+  if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
+    return Error("Malformed block record");
+
+  SmallVector<uint64_t, 64> Record;
+
+  // Read all the records.
+  while (1) {
+    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::SubBlock: // Handled for us already.
+    case BitstreamEntry::Error:
+      Error("malformed metadata block");
+      return true;
+    case BitstreamEntry::EndBlock:
+      return false;
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    bool IsFunctionLocal = false;
+    // Read a record.
+    Record.clear();
+    unsigned Code = Stream.readRecord(Entry.ID, Record);
+    switch (Code) {
+    default:  // Default behavior: ignore.
+      break;
+    case bitc::METADATA_NAME: {
+      // Read name of the named metadata.
+      SmallString<8> Name(Record.begin(), Record.end());
+      Record.clear();
+      Code = Stream.ReadCode();
+
+      // METADATA_NAME is always followed by METADATA_NAMED_NODE.
+      unsigned NextBitCode = Stream.readRecord(Code, Record);
+      assert(NextBitCode == bitc::METADATA_NAMED_NODE); (void)NextBitCode;
+
+      // Read named metadata elements.
+      unsigned Size = Record.size();
+      NamedMDNode *NMD = TheModule->getOrInsertNamedMetadata(Name);
+      for (unsigned i = 0; i != Size; ++i) {
+        MDNode *MD = dyn_cast<MDNode>(MDValueList.getValueFwdRef(Record[i]));
+        if (MD == 0)
+          return Error("Malformed metadata record");
+        NMD->addOperand(MD);
+      }
+      break;
+    }
+    case bitc::METADATA_FN_NODE:
+      IsFunctionLocal = true;
+      // fall-through
+    case bitc::METADATA_NODE: {
+      if (Record.size() % 2 == 1)
+        return Error("Invalid METADATA_NODE record");
+
+      unsigned Size = Record.size();
+      SmallVector<Value*, 8> Elts;
+      for (unsigned i = 0; i != Size; i += 2) {
+        Type *Ty = getTypeByID(Record[i]);
+        if (!Ty) return Error("Invalid METADATA_NODE record");
+        if (Ty->isMetadataTy())
+          Elts.push_back(MDValueList.getValueFwdRef(Record[i+1]));
+        else if (!Ty->isVoidTy())
+          Elts.push_back(ValueList.getValueFwdRef(Record[i+1], Ty));
+        else
+          Elts.push_back(NULL);
+      }
+      Value *V = MDNode::getWhenValsUnresolved(Context, Elts, IsFunctionLocal);
+      IsFunctionLocal = false;
+      MDValueList.AssignValue(V, NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_STRING: {
+      SmallString<8> String(Record.begin(), Record.end());
+      Value *V = MDString::get(Context, String);
+      MDValueList.AssignValue(V, NextMDValueNo++);
+      break;
+    }
+    case bitc::METADATA_KIND: {
+      if (Record.size() < 2)
+        return Error("Invalid METADATA_KIND record");
+
+      unsigned Kind = Record[0];
+      SmallString<8> Name(Record.begin()+1, Record.end());
+
+      unsigned NewKind = TheModule->getMDKindID(Name.str());
+      if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second)
+        return Error("Conflicting METADATA_KIND records");
+      break;
+    }
+    }
+  }
+}
+
+/// decodeSignRotatedValue - Decode a signed value stored with the sign bit in
+/// the LSB for dense VBR encoding.
+uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) {
+  if ((V & 1) == 0)
+    return V >> 1;
+  if (V != 1)
+    return -(V >> 1);
+  // There is no such thing as -0 with integers.  "-0" really means MININT.
+  return 1ULL << 63;
+}
+
+/// ResolveGlobalAndAliasInits - Resolve all of the initializers for global
+/// values and aliases that we can.
+bool BitcodeReader::ResolveGlobalAndAliasInits() {
+  std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInitWorklist;
+  std::vector<std::pair<GlobalAlias*, unsigned> > AliasInitWorklist;
+
+  GlobalInitWorklist.swap(GlobalInits);
+  AliasInitWorklist.swap(AliasInits);
+
+  while (!GlobalInitWorklist.empty()) {
+    unsigned ValID = GlobalInitWorklist.back().second;
+    if (ValID >= ValueList.size()) {
+      // Not ready to resolve this yet, it requires something later in the file.
+      GlobalInits.push_back(GlobalInitWorklist.back());
+    } else {
+      if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
+        GlobalInitWorklist.back().first->setInitializer(C);
+      else
+        return Error("Global variable initializer is not a constant!");
+    }
+    GlobalInitWorklist.pop_back();
+  }
+
+  while (!AliasInitWorklist.empty()) {
+    unsigned ValID = AliasInitWorklist.back().second;
+    if (ValID >= ValueList.size()) {
+      AliasInits.push_back(AliasInitWorklist.back());
+    } else {
+      if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
+        AliasInitWorklist.back().first->setAliasee(C);
+      else
+        return Error("Alias initializer is not a constant!");
+    }
+    AliasInitWorklist.pop_back();
+  }
+  return false;
+}
+
+static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
+  SmallVector<uint64_t, 8> Words(Vals.size());
+  std::transform(Vals.begin(), Vals.end(), Words.begin(),
+                 BitcodeReader::decodeSignRotatedValue);
+
+  return APInt(TypeBits, Words);
+}
+
+bool BitcodeReader::ParseConstants() {
+  if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID))
+    return Error("Malformed block record");
+
+  SmallVector<uint64_t, 64> Record;
+
+  // Read all the records for this value table.
+  Type *CurTy = Type::getInt32Ty(Context);
+  unsigned NextCstNo = ValueList.size();
+  while (1) {
+    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::SubBlock: // Handled for us already.
+    case BitstreamEntry::Error:
+      return Error("malformed block record in AST file");
+    case BitstreamEntry::EndBlock:
+      if (NextCstNo != ValueList.size())
+        return Error("Invalid constant reference!");
+
+      // Once all the constants have been read, go through and resolve forward
+      // references.
+      ValueList.ResolveConstantForwardRefs();
+      return false;
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read a record.
+    Record.clear();
+    Value *V = 0;
+    unsigned BitCode = Stream.readRecord(Entry.ID, Record);
+    switch (BitCode) {
+    default:  // Default behavior: unknown constant
+    case bitc::CST_CODE_UNDEF:     // UNDEF
+      V = UndefValue::get(CurTy);
+      break;
+    case bitc::CST_CODE_SETTYPE:   // SETTYPE: [typeid]
+      if (Record.empty())
+        return Error("Malformed CST_SETTYPE record");
+      if (Record[0] >= TypeList.size())
+        return Error("Invalid Type ID in CST_SETTYPE record");
+      CurTy = TypeList[Record[0]];
+      continue;  // Skip the ValueList manipulation.
+    case bitc::CST_CODE_NULL:      // NULL
+      V = Constant::getNullValue(CurTy);
+      break;
+    case bitc::CST_CODE_INTEGER:   // INTEGER: [intval]
+      if (!CurTy->isIntegerTy() || Record.empty())
+        return Error("Invalid CST_INTEGER record");
+      V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
+      break;
+    case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
+      if (!CurTy->isIntegerTy() || Record.empty())
+        return Error("Invalid WIDE_INTEGER record");
+
+      APInt VInt = ReadWideAPInt(Record,
+                                 cast<IntegerType>(CurTy)->getBitWidth());
+      V = ConstantInt::get(Context, VInt);
+
+      break;
+    }
+    case bitc::CST_CODE_FLOAT: {    // FLOAT: [fpval]
+      if (Record.empty())
+        return Error("Invalid FLOAT record");
+      if (CurTy->isHalfTy())
+        V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf,
+                                             APInt(16, (uint16_t)Record[0])));
+      else if (CurTy->isFloatTy())
+        V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle,
+                                             APInt(32, (uint32_t)Record[0])));
+      else if (CurTy->isDoubleTy())
+        V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble,
+                                             APInt(64, Record[0])));
+      else if (CurTy->isX86_FP80Ty()) {
+        // Bits are not stored the same way as a normal i80 APInt, compensate.
+        uint64_t Rearrange[2];
+        Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16);
+        Rearrange[1] = Record[0] >> 48;
+        V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended,
+                                             APInt(80, Rearrange)));
+      } else if (CurTy->isFP128Ty())
+        V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad,
+                                             APInt(128, Record)));
+      else if (CurTy->isPPC_FP128Ty())
+        V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble,
+                                             APInt(128, Record)));
+      else
+        V = UndefValue::get(CurTy);
+      break;
+    }
+
+    case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number]
+      if (Record.empty())
+        return Error("Invalid CST_AGGREGATE record");
+
+      unsigned Size = Record.size();
+      SmallVector<Constant*, 16> Elts;
+
+      if (StructType *STy = dyn_cast<StructType>(CurTy)) {
+        for (unsigned i = 0; i != Size; ++i)
+          Elts.push_back(ValueList.getConstantFwdRef(Record[i],
+                                                     STy->getElementType(i)));
+        V = ConstantStruct::get(STy, Elts);
+      } else if (ArrayType *ATy = dyn_cast<ArrayType>(CurTy)) {
+        Type *EltTy = ATy->getElementType();
+        for (unsigned i = 0; i != Size; ++i)
+          Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy));
+        V = ConstantArray::get(ATy, Elts);
+      } else if (VectorType *VTy = dyn_cast<VectorType>(CurTy)) {
+        Type *EltTy = VTy->getElementType();
+        for (unsigned i = 0; i != Size; ++i)
+          Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy));
+        V = ConstantVector::get(Elts);
+      } else {
+        V = UndefValue::get(CurTy);
+      }
+      break;
+    }
+    case bitc::CST_CODE_STRING:    // STRING: [values]
+    case bitc::CST_CODE_CSTRING: { // CSTRING: [values]
+      if (Record.empty())
+        return Error("Invalid CST_STRING record");
+
+      SmallString<16> Elts(Record.begin(), Record.end());
+      V = ConstantDataArray::getString(Context, Elts,
+                                       BitCode == bitc::CST_CODE_CSTRING);
+      break;
+    }
+    case bitc::CST_CODE_DATA: {// DATA: [n x value]
+      if (Record.empty())
+        return Error("Invalid CST_DATA record");
+
+      Type *EltTy = cast<SequentialType>(CurTy)->getElementType();
+      unsigned Size = Record.size();
+
+      if (EltTy->isIntegerTy(8)) {
+        SmallVector<uint8_t, 16> Elts(Record.begin(), Record.end());
+        if (isa<VectorType>(CurTy))
+          V = ConstantDataVector::get(Context, Elts);
+        else
+          V = ConstantDataArray::get(Context, Elts);
+      } else if (EltTy->isIntegerTy(16)) {
+        SmallVector<uint16_t, 16> Elts(Record.begin(), Record.end());
+        if (isa<VectorType>(CurTy))
+          V = ConstantDataVector::get(Context, Elts);
+        else
+          V = ConstantDataArray::get(Context, Elts);
+      } else if (EltTy->isIntegerTy(32)) {
+        SmallVector<uint32_t, 16> Elts(Record.begin(), Record.end());
+        if (isa<VectorType>(CurTy))
+          V = ConstantDataVector::get(Context, Elts);
+        else
+          V = ConstantDataArray::get(Context, Elts);
+      } else if (EltTy->isIntegerTy(64)) {
+        SmallVector<uint64_t, 16> Elts(Record.begin(), Record.end());
+        if (isa<VectorType>(CurTy))
+          V = ConstantDataVector::get(Context, Elts);
+        else
+          V = ConstantDataArray::get(Context, Elts);
+      } else if (EltTy->isFloatTy()) {
+        SmallVector<float, 16> Elts(Size);
+        std::transform(Record.begin(), Record.end(), Elts.begin(), BitsToFloat);
+        if (isa<VectorType>(CurTy))
+          V = ConstantDataVector::get(Context, Elts);
+        else
+          V = ConstantDataArray::get(Context, Elts);
+      } else if (EltTy->isDoubleTy()) {
+        SmallVector<double, 16> Elts(Size);
+        std::transform(Record.begin(), Record.end(), Elts.begin(),
+                       BitsToDouble);
+        if (isa<VectorType>(CurTy))
+          V = ConstantDataVector::get(Context, Elts);
+        else
+          V = ConstantDataArray::get(Context, Elts);
+      } else {
+        return Error("Unknown element type in CE_DATA");
+      }
+      break;
+    }
+
+    case bitc::CST_CODE_CE_BINOP: {  // CE_BINOP: [opcode, opval, opval]
+      if (Record.size() < 3) return Error("Invalid CE_BINOP record");
+      int Opc = GetDecodedBinaryOpcode(Record[0], CurTy);
+      if (Opc < 0) {
+        V = UndefValue::get(CurTy);  // Unknown binop.
+      } else {
+        Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy);
+        Constant *RHS = ValueList.getConstantFwdRef(Record[2], CurTy);
+        unsigned Flags = 0;
+        if (Record.size() >= 4) {
+          if (Opc == Instruction::Add ||
+              Opc == Instruction::Sub ||
+              Opc == Instruction::Mul ||
+              Opc == Instruction::Shl) {
+            if (Record[3] & (1 << bitc::OBO_NO_SIGNED_WRAP))
+              Flags |= OverflowingBinaryOperator::NoSignedWrap;
+            if (Record[3] & (1 << bitc::OBO_NO_UNSIGNED_WRAP))
+              Flags |= OverflowingBinaryOperator::NoUnsignedWrap;
+          } else if (Opc == Instruction::SDiv ||
+                     Opc == Instruction::UDiv ||
+                     Opc == Instruction::LShr ||
+                     Opc == Instruction::AShr) {
+            if (Record[3] & (1 << bitc::PEO_EXACT))
+              Flags |= SDivOperator::IsExact;
+          }
+        }
+        V = ConstantExpr::get(Opc, LHS, RHS, Flags);
+      }
+      break;
+    }
+    case bitc::CST_CODE_CE_CAST: {  // CE_CAST: [opcode, opty, opval]
+      if (Record.size() < 3) return Error("Invalid CE_CAST record");
+      int Opc = GetDecodedCastOpcode(Record[0]);
+      if (Opc < 0) {
+        V = UndefValue::get(CurTy);  // Unknown cast.
+      } else {
+        Type *OpTy = getTypeByID(Record[1]);
+        if (!OpTy) return Error("Invalid CE_CAST record");
+        Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy);
+        V = ConstantExpr::getCast(Opc, Op, CurTy);
+      }
+      break;
+    }
+    case bitc::CST_CODE_CE_INBOUNDS_GEP:
+    case bitc::CST_CODE_CE_GEP: {  // CE_GEP:        [n x operands]
+      if (Record.size() & 1) return Error("Invalid CE_GEP record");
+      SmallVector<Constant*, 16> Elts;
+      for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
+        Type *ElTy = getTypeByID(Record[i]);
+        if (!ElTy) return Error("Invalid CE_GEP record");
+        Elts.push_back(ValueList.getConstantFwdRef(Record[i+1], ElTy));
+      }
+      ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
+      V = ConstantExpr::getGetElementPtr(Elts[0], Indices,
+                                         BitCode ==
+                                           bitc::CST_CODE_CE_INBOUNDS_GEP);
+      break;
+    }
+    case bitc::CST_CODE_CE_SELECT:  // CE_SELECT: [opval#, opval#, opval#]
+      if (Record.size() < 3) return Error("Invalid CE_SELECT record");
+      V = ConstantExpr::getSelect(
+                          ValueList.getConstantFwdRef(Record[0],
+                                                      Type::getInt1Ty(Context)),
+                          ValueList.getConstantFwdRef(Record[1],CurTy),
+                          ValueList.getConstantFwdRef(Record[2],CurTy));
+      break;
+    case bitc::CST_CODE_CE_EXTRACTELT: { // CE_EXTRACTELT: [opty, opval, opval]
+      if (Record.size() < 3) return Error("Invalid CE_EXTRACTELT record");
+      VectorType *OpTy =
+        dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
+      if (OpTy == 0) return Error("Invalid CE_EXTRACTELT record");
+      Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
+      Constant *Op1 = ValueList.getConstantFwdRef(Record[2],
+                                                  Type::getInt32Ty(Context));
+      V = ConstantExpr::getExtractElement(Op0, Op1);
+      break;
+    }
+    case bitc::CST_CODE_CE_INSERTELT: { // CE_INSERTELT: [opval, opval, opval]
+      VectorType *OpTy = dyn_cast<VectorType>(CurTy);
+      if (Record.size() < 3 || OpTy == 0)
+        return Error("Invalid CE_INSERTELT record");
+      Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
+      Constant *Op1 = ValueList.getConstantFwdRef(Record[1],
+                                                  OpTy->getElementType());
+      Constant *Op2 = ValueList.getConstantFwdRef(Record[2],
+                                                  Type::getInt32Ty(Context));
+      V = ConstantExpr::getInsertElement(Op0, Op1, Op2);
+      break;
+    }
+    case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval]
+      VectorType *OpTy = dyn_cast<VectorType>(CurTy);
+      if (Record.size() < 3 || OpTy == 0)
+        return Error("Invalid CE_SHUFFLEVEC record");
+      Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
+      Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
+      Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
+                                                 OpTy->getNumElements());
+      Constant *Op2 = ValueList.getConstantFwdRef(Record[2], ShufTy);
+      V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
+      break;
+    }
+    case bitc::CST_CODE_CE_SHUFVEC_EX: { // [opty, opval, opval, opval]
+      VectorType *RTy = dyn_cast<VectorType>(CurTy);
+      VectorType *OpTy =
+        dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
+      if (Record.size() < 4 || RTy == 0 || OpTy == 0)
+        return Error("Invalid CE_SHUFVEC_EX record");
+      Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
+      Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
+      Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
+                                                 RTy->getNumElements());
+      Constant *Op2 = ValueList.getConstantFwdRef(Record[3], ShufTy);
+      V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
+      break;
+    }
+    case bitc::CST_CODE_CE_CMP: {     // CE_CMP: [opty, opval, opval, pred]
+      if (Record.size() < 4) return Error("Invalid CE_CMP record");
+      Type *OpTy = getTypeByID(Record[0]);
+      if (OpTy == 0) return Error("Invalid CE_CMP record");
+      Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
+      Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
+
+      if (OpTy->isFPOrFPVectorTy())
+        V = ConstantExpr::getFCmp(Record[3], Op0, Op1);
+      else
+        V = ConstantExpr::getICmp(Record[3], Op0, Op1);
+      break;
+    }
+    // This maintains backward compatibility, pre-asm dialect keywords.
+    // FIXME: Remove with the 4.0 release.
+    case bitc::CST_CODE_INLINEASM_OLD: {
+      if (Record.size() < 2) return Error("Invalid INLINEASM record");
+      std::string AsmStr, ConstrStr;
+      bool HasSideEffects = Record[0] & 1;
+      bool IsAlignStack = Record[0] >> 1;
+      unsigned AsmStrSize = Record[1];
+      if (2+AsmStrSize >= Record.size())
+        return Error("Invalid INLINEASM record");
+      unsigned ConstStrSize = Record[2+AsmStrSize];
+      if (3+AsmStrSize+ConstStrSize > Record.size())
+        return Error("Invalid INLINEASM record");
+
+      for (unsigned i = 0; i != AsmStrSize; ++i)
+        AsmStr += (char)Record[2+i];
+      for (unsigned i = 0; i != ConstStrSize; ++i)
+        ConstrStr += (char)Record[3+AsmStrSize+i];
+      PointerType *PTy = cast<PointerType>(CurTy);
+      V = InlineAsm::get(cast<FunctionType>(PTy->getElementType()),
+                         AsmStr, ConstrStr, HasSideEffects, IsAlignStack);
+      break;
+    }
+    // This version adds support for the asm dialect keywords (e.g.,
+    // inteldialect).
+    case bitc::CST_CODE_INLINEASM: {
+      if (Record.size() < 2) return Error("Invalid INLINEASM record");
+      std::string AsmStr, ConstrStr;
+      bool HasSideEffects = Record[0] & 1;
+      bool IsAlignStack = (Record[0] >> 1) & 1;
+      unsigned AsmDialect = Record[0] >> 2;
+      unsigned AsmStrSize = Record[1];
+      if (2+AsmStrSize >= Record.size())
+        return Error("Invalid INLINEASM record");
+      unsigned ConstStrSize = Record[2+AsmStrSize];
+      if (3+AsmStrSize+ConstStrSize > Record.size())
+        return Error("Invalid INLINEASM record");
+
+      for (unsigned i = 0; i != AsmStrSize; ++i)
+        AsmStr += (char)Record[2+i];
+      for (unsigned i = 0; i != ConstStrSize; ++i)
+        ConstrStr += (char)Record[3+AsmStrSize+i];
+      PointerType *PTy = cast<PointerType>(CurTy);
+      V = InlineAsm::get(cast<FunctionType>(PTy->getElementType()),
+                         AsmStr, ConstrStr, HasSideEffects, IsAlignStack,
+                         InlineAsm::AsmDialect(AsmDialect));
+      break;
+    }
+    case bitc::CST_CODE_BLOCKADDRESS:{
+      if (Record.size() < 3) return Error("Invalid CE_BLOCKADDRESS record");
+      Type *FnTy = getTypeByID(Record[0]);
+      if (FnTy == 0) return Error("Invalid CE_BLOCKADDRESS record");
+      Function *Fn =
+        dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
+      if (Fn == 0) return Error("Invalid CE_BLOCKADDRESS record");
+
+      // If the function is already parsed we can insert the block address right
+      // away.
+      if (!Fn->empty()) {
+        Function::iterator BBI = Fn->begin(), BBE = Fn->end();
+        for (size_t I = 0, E = Record[2]; I != E; ++I) {
+          if (BBI == BBE)
+            return Error("Invalid blockaddress block #");
+          ++BBI;
+        }
+        V = BlockAddress::get(Fn, BBI);
+      } else {
+        // Otherwise insert a placeholder and remember it so it can be inserted
+        // when the function is parsed.
+        GlobalVariable *FwdRef = new GlobalVariable(*Fn->getParent(),
+                                                    Type::getInt8Ty(Context),
+                                            false, GlobalValue::InternalLinkage,
+                                                    0, "");
+        BlockAddrFwdRefs[Fn].push_back(std::make_pair(Record[2], FwdRef));
+        V = FwdRef;
+      }
+      break;
+    }
+    }
+
+    ValueList.AssignValue(V, NextCstNo);
+    ++NextCstNo;
+  }
+}
+
+bool BitcodeReader::ParseUseLists() {
+  if (Stream.EnterSubBlock(bitc::USELIST_BLOCK_ID))
+    return Error("Malformed block record");
+
+  SmallVector<uint64_t, 64> Record;
+
+  // Read all the records.
+  while (1) {
+    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::SubBlock: // Handled for us already.
+    case BitstreamEntry::Error:
+      return Error("malformed use list block");
+    case BitstreamEntry::EndBlock:
+      return false;
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read a use list record.
+    Record.clear();
+    switch (Stream.readRecord(Entry.ID, Record)) {
+    default:  // Default behavior: unknown type.
+      break;
+    case bitc::USELIST_CODE_ENTRY: { // USELIST_CODE_ENTRY: TBD.
+      unsigned RecordLength = Record.size();
+      if (RecordLength < 1)
+        return Error ("Invalid UseList reader!");
+      UseListRecords.push_back(Record);
+      break;
+    }
+    }
+  }
+}
+
+/// RememberAndSkipFunctionBody - When we see the block for a function body,
+/// remember where it is and then skip it.  This lets us lazily deserialize the
+/// functions.
+bool BitcodeReader::RememberAndSkipFunctionBody() {
+  // Get the function we are talking about.
+  if (FunctionsWithBodies.empty())
+    return Error("Insufficient function protos");
+
+  Function *Fn = FunctionsWithBodies.back();
+  FunctionsWithBodies.pop_back();
+
+  // Save the current stream state.
+  uint64_t CurBit = Stream.GetCurrentBitNo();
+  DeferredFunctionInfo[Fn] = CurBit;
+
+  // Skip over the function block for now.
+  if (Stream.SkipBlock())
+    return Error("Malformed block record");
+  return false;
+}
+
+bool BitcodeReader::GlobalCleanup() {
+  // Patch the initializers for globals and aliases up.
+  ResolveGlobalAndAliasInits();
+  if (!GlobalInits.empty() || !AliasInits.empty())
+    return Error("Malformed global initializer set");
+
+  // Look for intrinsic functions which need to be upgraded at some point
+  for (Module::iterator FI = TheModule->begin(), FE = TheModule->end();
+       FI != FE; ++FI) {
+    Function *NewFn;
+    if (UpgradeIntrinsicFunction(FI, NewFn))
+      UpgradedIntrinsics.push_back(std::make_pair(FI, NewFn));
+  }
+
+  // Look for global variables which need to be renamed.
+  for (Module::global_iterator
+         GI = TheModule->global_begin(), GE = TheModule->global_end();
+       GI != GE; ++GI)
+    UpgradeGlobalVariable(GI);
+  // Force deallocation of memory for these vectors to favor the client that
+  // want lazy deserialization.
+  std::vector<std::pair<GlobalVariable*, unsigned> >().swap(GlobalInits);
+  std::vector<std::pair<GlobalAlias*, unsigned> >().swap(AliasInits);
+  return false;
+}
+
+bool BitcodeReader::ParseModule(bool Resume) {
+  if (Resume)
+    Stream.JumpToBit(NextUnreadBit);
+  else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
+    return Error("Malformed block record");
+
+  SmallVector<uint64_t, 64> Record;
+  std::vector<std::string> SectionTable;
+  std::vector<std::string> GCTable;
+
+  // Read all the records for this module.
+  while (1) {
+    BitstreamEntry Entry = Stream.advance();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::Error:
+      Error("malformed module block");
+      return true;
+    case BitstreamEntry::EndBlock:
+      return GlobalCleanup();
+
+    case BitstreamEntry::SubBlock:
+      switch (Entry.ID) {
+      default:  // Skip unknown content.
+        if (Stream.SkipBlock())
+          return Error("Malformed block record");
+        break;
+      case bitc::BLOCKINFO_BLOCK_ID:
+        if (Stream.ReadBlockInfoBlock())
+          return Error("Malformed BlockInfoBlock");
+        break;
+      case bitc::PARAMATTR_BLOCK_ID:
+        if (ParseAttributeBlock())
+          return true;
+        break;
+      case bitc::PARAMATTR_GROUP_BLOCK_ID:
+        if (ParseAttributeGroupBlock())
+          return true;
+        break;
+      case bitc::TYPE_BLOCK_ID_NEW:
+        if (ParseTypeTable())
+          return true;
+        break;
+      case bitc::VALUE_SYMTAB_BLOCK_ID:
+        if (ParseValueSymbolTable())
+          return true;
+        SeenValueSymbolTable = true;
+        break;
+      case bitc::CONSTANTS_BLOCK_ID:
+        if (ParseConstants() || ResolveGlobalAndAliasInits())
+          return true;
+        break;
+      case bitc::METADATA_BLOCK_ID:
+        if (ParseMetadata())
+          return true;
+        break;
+      case bitc::FUNCTION_BLOCK_ID:
+        // If this is the first function body we've seen, reverse the
+        // FunctionsWithBodies list.
+        if (!SeenFirstFunctionBody) {
+          std::reverse(FunctionsWithBodies.begin(), FunctionsWithBodies.end());
+          if (GlobalCleanup())
+            return true;
+          SeenFirstFunctionBody = true;
+        }
+
+        if (RememberAndSkipFunctionBody())
+          return true;
+        // For streaming bitcode, suspend parsing when we reach the function
+        // bodies. Subsequent materialization calls will resume it when
+        // necessary. For streaming, the function bodies must be at the end of
+        // the bitcode. If the bitcode file is old, the symbol table will be
+        // at the end instead and will not have been seen yet. In this case,
+        // just finish the parse now.
+        if (LazyStreamer && SeenValueSymbolTable) {
+          NextUnreadBit = Stream.GetCurrentBitNo();
+          return false;
+        }
+        break;
+      case bitc::USELIST_BLOCK_ID:
+        if (ParseUseLists())
+          return true;
+        break;
+      }
+      continue;
+
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+
+    // Read a record.
+    switch (Stream.readRecord(Entry.ID, Record)) {
+    default: break;  // Default behavior, ignore unknown content.
+    case bitc::MODULE_CODE_VERSION: {  // VERSION: [version#]
+      if (Record.size() < 1)
+        return Error("Malformed MODULE_CODE_VERSION");
+      // Only version #0 and #1 are supported so far.
+      unsigned module_version = Record[0];
+      switch (module_version) {
+        default: return Error("Unknown bitstream version!");
+        case 0:
+          UseRelativeIDs = false;
+          break;
+        case 1:
+          UseRelativeIDs = true;
+          break;
+      }
+      break;
+    }
+    case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
+      std::string S;
+      if (ConvertToString(Record, 0, S))
+        return Error("Invalid MODULE_CODE_TRIPLE record");
+      TheModule->setTargetTriple(S);
+      break;
+    }
+    case bitc::MODULE_CODE_DATALAYOUT: {  // DATALAYOUT: [strchr x N]
+      std::string S;
+      if (ConvertToString(Record, 0, S))
+        return Error("Invalid MODULE_CODE_DATALAYOUT record");
+      TheModule->setDataLayout(S);
+      break;
+    }
+    case bitc::MODULE_CODE_ASM: {  // ASM: [strchr x N]
+      std::string S;
+      if (ConvertToString(Record, 0, S))
+        return Error("Invalid MODULE_CODE_ASM record");
+      TheModule->setModuleInlineAsm(S);
+      break;
+    }
+    case bitc::MODULE_CODE_DEPLIB: {  // DEPLIB: [strchr x N]
+      // FIXME: Remove in 4.0.
+      std::string S;
+      if (ConvertToString(Record, 0, S))
+        return Error("Invalid MODULE_CODE_DEPLIB record");
+      // Ignore value.
+      break;
+    }
+    case bitc::MODULE_CODE_SECTIONNAME: {  // SECTIONNAME: [strchr x N]
+      std::string S;
+      if (ConvertToString(Record, 0, S))
+        return Error("Invalid MODULE_CODE_SECTIONNAME record");
+      SectionTable.push_back(S);
+      break;
+    }
+    case bitc::MODULE_CODE_GCNAME: {  // SECTIONNAME: [strchr x N]
+      std::string S;
+      if (ConvertToString(Record, 0, S))
+        return Error("Invalid MODULE_CODE_GCNAME record");
+      GCTable.push_back(S);
+      break;
+    }
+    // GLOBALVAR: [pointer type, isconst, initid,
+    //             linkage, alignment, section, visibility, threadlocal,
+    //             unnamed_addr]
+    case bitc::MODULE_CODE_GLOBALVAR: {
+      if (Record.size() < 6)
+        return Error("Invalid MODULE_CODE_GLOBALVAR record");
+      Type *Ty = getTypeByID(Record[0]);
+      if (!Ty) return Error("Invalid MODULE_CODE_GLOBALVAR record");
+      if (!Ty->isPointerTy())
+        return Error("Global not a pointer type!");
+      unsigned AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
+      Ty = cast<PointerType>(Ty)->getElementType();
+
+      bool isConstant = Record[1];
+      GlobalValue::LinkageTypes Linkage = GetDecodedLinkage(Record[3]);
+      unsigned Alignment = (1 << Record[4]) >> 1;
+      std::string Section;
+      if (Record[5]) {
+        if (Record[5]-1 >= SectionTable.size())
+          return Error("Invalid section ID");
+        Section = SectionTable[Record[5]-1];
+      }
+      GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
+      if (Record.size() > 6)
+        Visibility = GetDecodedVisibility(Record[6]);
+
+      GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal;
+      if (Record.size() > 7)
+        TLM = GetDecodedThreadLocalMode(Record[7]);
+
+      bool UnnamedAddr = false;
+      if (Record.size() > 8)
+        UnnamedAddr = Record[8];
+
+      bool ExternallyInitialized = false;
+      if (Record.size() > 9)
+        ExternallyInitialized = Record[9];
+
+      GlobalVariable *NewGV =
+        new GlobalVariable(*TheModule, Ty, isConstant, Linkage, 0, "", 0,
+                           TLM, AddressSpace, ExternallyInitialized);
+      NewGV->setAlignment(Alignment);
+      if (!Section.empty())
+        NewGV->setSection(Section);
+      NewGV->setVisibility(Visibility);
+      NewGV->setUnnamedAddr(UnnamedAddr);
+
+      ValueList.push_back(NewGV);
+
+      // Remember which value to use for the global initializer.
+      if (unsigned InitID = Record[2])
+        GlobalInits.push_back(std::make_pair(NewGV, InitID-1));
+      break;
+    }
+    // FUNCTION:  [type, callingconv, isproto, linkage, paramattr,
+    //             alignment, section, visibility, gc, unnamed_addr]
+    case bitc::MODULE_CODE_FUNCTION: {
+      if (Record.size() < 8)
+        return Error("Invalid MODULE_CODE_FUNCTION record");
+      Type *Ty = getTypeByID(Record[0]);
+      if (!Ty) return Error("Invalid MODULE_CODE_FUNCTION record");
+      if (!Ty->isPointerTy())
+        return Error("Function not a pointer type!");
+      FunctionType *FTy =
+        dyn_cast<FunctionType>(cast<PointerType>(Ty)->getElementType());
+      if (!FTy)
+        return Error("Function not a pointer to function type!");
+
+      Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                        "", TheModule);
+
+      Func->setCallingConv(static_cast<CallingConv::ID>(Record[1]));
+      bool isProto = Record[2];
+      Func->setLinkage(GetDecodedLinkage(Record[3]));
+      Func->setAttributes(getAttributes(Record[4]));
+
+      Func->setAlignment((1 << Record[5]) >> 1);
+      if (Record[6]) {
+        if (Record[6]-1 >= SectionTable.size())
+          return Error("Invalid section ID");
+        Func->setSection(SectionTable[Record[6]-1]);
+      }
+      Func->setVisibility(GetDecodedVisibility(Record[7]));
+      if (Record.size() > 8 && Record[8]) {
+        if (Record[8]-1 > GCTable.size())
+          return Error("Invalid GC ID");
+        Func->setGC(GCTable[Record[8]-1].c_str());
+      }
+      bool UnnamedAddr = false;
+      if (Record.size() > 9)
+        UnnamedAddr = Record[9];
+      Func->setUnnamedAddr(UnnamedAddr);
+      ValueList.push_back(Func);
+
+      // If this is a function with a body, remember the prototype we are
+      // creating now, so that we can match up the body with them later.
+      if (!isProto) {
+        FunctionsWithBodies.push_back(Func);
+        if (LazyStreamer) DeferredFunctionInfo[Func] = 0;
+      }
+      break;
+    }
+    // ALIAS: [alias type, aliasee val#, linkage]
+    // ALIAS: [alias type, aliasee val#, linkage, visibility]
+    case bitc::MODULE_CODE_ALIAS: {
+      if (Record.size() < 3)
+        return Error("Invalid MODULE_ALIAS record");
+      Type *Ty = getTypeByID(Record[0]);
+      if (!Ty) return Error("Invalid MODULE_ALIAS record");
+      if (!Ty->isPointerTy())
+        return Error("Function not a pointer type!");
+
+      GlobalAlias *NewGA = new GlobalAlias(Ty, GetDecodedLinkage(Record[2]),
+                                           "", 0, TheModule);
+      // Old bitcode files didn't have visibility field.
+      if (Record.size() > 3)
+        NewGA->setVisibility(GetDecodedVisibility(Record[3]));
+      ValueList.push_back(NewGA);
+      AliasInits.push_back(std::make_pair(NewGA, Record[1]));
+      break;
+    }
+    /// MODULE_CODE_PURGEVALS: [numvals]
+    case bitc::MODULE_CODE_PURGEVALS:
+      // Trim down the value list to the specified size.
+      if (Record.size() < 1 || Record[0] > ValueList.size())
+        return Error("Invalid MODULE_PURGEVALS record");
+      ValueList.shrinkTo(Record[0]);
+      break;
+    }
+    Record.clear();
+  }
+}
+
+bool BitcodeReader::ParseBitcodeInto(Module *M) {
+  TheModule = 0;
+
+  if (InitStream()) return true;
+
+  // Sniff for the signature.
+  if (Stream.Read(8) != 'B' ||
+      Stream.Read(8) != 'C' ||
+      Stream.Read(4) != 0x0 ||
+      Stream.Read(4) != 0xC ||
+      Stream.Read(4) != 0xE ||
+      Stream.Read(4) != 0xD)
+    return Error("Invalid bitcode signature");
+
+  // We expect a number of well-defined blocks, though we don't necessarily
+  // need to understand them all.
+  while (1) {
+    if (Stream.AtEndOfStream())
+      return false;
+
+    BitstreamEntry Entry =
+      Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs);
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::Error:
+      Error("malformed module file");
+      return true;
+    case BitstreamEntry::EndBlock:
+      return false;
+
+    case BitstreamEntry::SubBlock:
+      switch (Entry.ID) {
+      case bitc::BLOCKINFO_BLOCK_ID:
+        if (Stream.ReadBlockInfoBlock())
+          return Error("Malformed BlockInfoBlock");
+        break;
+      case bitc::MODULE_BLOCK_ID:
+        // Reject multiple MODULE_BLOCK's in a single bitstream.
+        if (TheModule)
+          return Error("Multiple MODULE_BLOCKs in same stream");
+        TheModule = M;
+        if (ParseModule(false))
+          return true;
+        if (LazyStreamer) return false;
+        break;
+      default:
+        if (Stream.SkipBlock())
+          return Error("Malformed block record");
+        break;
+      }
+      continue;
+    case BitstreamEntry::Record:
+      // There should be no records in the top-level of blocks.
+
+      // The ranlib in Xcode 4 will align archive members by appending newlines
+      // to the end of them. If this file size is a multiple of 4 but not 8, we
+      // have to read and ignore these final 4 bytes :-(
+      if (Stream.getAbbrevIDWidth() == 2 && Entry.ID == 2 &&
+          Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
+          Stream.AtEndOfStream())
+        return false;
+
+      return Error("Invalid record at top-level");
+    }
+  }
+}
+
+bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
+  if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
+    return Error("Malformed block record");
+
+  SmallVector<uint64_t, 64> Record;
+
+  // Read all the records for this module.
+  while (1) {
+    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::SubBlock: // Handled for us already.
+    case BitstreamEntry::Error:
+      return Error("malformed module block");
+    case BitstreamEntry::EndBlock:
+      return false;
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read a record.
+    switch (Stream.readRecord(Entry.ID, Record)) {
+    default: break;  // Default behavior, ignore unknown content.
+    case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
+      std::string S;
+      if (ConvertToString(Record, 0, S))
+        return Error("Invalid MODULE_CODE_TRIPLE record");
+      Triple = S;
+      break;
+    }
+    }
+    Record.clear();
+  }
+}
+
+bool BitcodeReader::ParseTriple(std::string &Triple) {
+  if (InitStream()) return true;
+
+  // Sniff for the signature.
+  if (Stream.Read(8) != 'B' ||
+      Stream.Read(8) != 'C' ||
+      Stream.Read(4) != 0x0 ||
+      Stream.Read(4) != 0xC ||
+      Stream.Read(4) != 0xE ||
+      Stream.Read(4) != 0xD)
+    return Error("Invalid bitcode signature");
+
+  // We expect a number of well-defined blocks, though we don't necessarily
+  // need to understand them all.
+  while (1) {
+    BitstreamEntry Entry = Stream.advance();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::Error:
+      Error("malformed module file");
+      return true;
+    case BitstreamEntry::EndBlock:
+      return false;
+
+    case BitstreamEntry::SubBlock:
+      if (Entry.ID == bitc::MODULE_BLOCK_ID)
+        return ParseModuleTriple(Triple);
+
+      // Ignore other sub-blocks.
+      if (Stream.SkipBlock()) {
+        Error("malformed block record in AST file");
+        return true;
+      }
+      continue;
+
+    case BitstreamEntry::Record:
+      Stream.skipRecord(Entry.ID);
+      continue;
+    }
+  }
+}
+
+/// ParseMetadataAttachment - Parse metadata attachments.
+bool BitcodeReader::ParseMetadataAttachment() {
+  if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
+    return Error("Malformed block record");
+
+  SmallVector<uint64_t, 64> Record;
+  while (1) {
+    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::SubBlock: // Handled for us already.
+    case BitstreamEntry::Error:
+      return Error("malformed metadata block");
+    case BitstreamEntry::EndBlock:
+      return false;
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read a metadata attachment record.
+    Record.clear();
+    switch (Stream.readRecord(Entry.ID, Record)) {
+    default:  // Default behavior: ignore.
+      break;
+    case bitc::METADATA_ATTACHMENT: {
+      unsigned RecordLength = Record.size();
+      if (Record.empty() || (RecordLength - 1) % 2 == 1)
+        return Error ("Invalid METADATA_ATTACHMENT reader!");
+      Instruction *Inst = InstructionList[Record[0]];
+      for (unsigned i = 1; i != RecordLength; i = i+2) {
+        unsigned Kind = Record[i];
+        DenseMap<unsigned, unsigned>::iterator I =
+          MDKindMap.find(Kind);
+        if (I == MDKindMap.end())
+          return Error("Invalid metadata kind ID");
+        Value *Node = MDValueList.getValueFwdRef(Record[i+1]);
+        Inst->setMetadata(I->second, cast<MDNode>(Node));
+      }
+      break;
+    }
+    }
+  }
+}
+
+/// ParseFunctionBody - Lazily parse the specified function body block.
+bool BitcodeReader::ParseFunctionBody(Function *F) {
+  if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID))
+    return Error("Malformed block record");
+
+  InstructionList.clear();
+  unsigned ModuleValueListSize = ValueList.size();
+  unsigned ModuleMDValueListSize = MDValueList.size();
+
+  // Add all the function arguments to the value table.
+  for(Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I)
+    ValueList.push_back(I);
+
+  unsigned NextValueNo = ValueList.size();
+  BasicBlock *CurBB = 0;
+  unsigned CurBBNo = 0;
+
+  DebugLoc LastLoc;
+
+  // Read all the records.
+  SmallVector<uint64_t, 64> Record;
+  while (1) {
+    BitstreamEntry Entry = Stream.advance();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::Error:
+      return Error("Bitcode error in function block");
+    case BitstreamEntry::EndBlock:
+      goto OutOfRecordLoop;
+
+    case BitstreamEntry::SubBlock:
+      switch (Entry.ID) {
+      default:  // Skip unknown content.
+        if (Stream.SkipBlock())
+          return Error("Malformed block record");
+        break;
+      case bitc::CONSTANTS_BLOCK_ID:
+        if (ParseConstants()) return true;
+        NextValueNo = ValueList.size();
+        break;
+      case bitc::VALUE_SYMTAB_BLOCK_ID:
+        if (ParseValueSymbolTable()) return true;
+        break;
+      case bitc::METADATA_ATTACHMENT_ID:
+        if (ParseMetadataAttachment()) return true;
+        break;
+      case bitc::METADATA_BLOCK_ID:
+        if (ParseMetadata()) return true;
+        break;
+      }
+      continue;
+
+    case BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read a record.
+    Record.clear();
+    Instruction *I = 0;
+    unsigned BitCode = Stream.readRecord(Entry.ID, Record);
+    switch (BitCode) {
+    default: // Default behavior: reject
+      return Error("Unknown instruction");
+    case bitc::FUNC_CODE_DECLAREBLOCKS:     // DECLAREBLOCKS: [nblocks]
+      if (Record.size() < 1 || Record[0] == 0)
+        return Error("Invalid DECLAREBLOCKS record");
+      // Create all the basic blocks for the function.
+      FunctionBBs.resize(Record[0]);
+      for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i)
+        FunctionBBs[i] = BasicBlock::Create(Context, "", F);
+      CurBB = FunctionBBs[0];
+      continue;
+
+    case bitc::FUNC_CODE_DEBUG_LOC_AGAIN:  // DEBUG_LOC_AGAIN
+      // This record indicates that the last instruction is at the same
+      // location as the previous instruction with a location.
+      I = 0;
+
+      // Get the last instruction emitted.
+      if (CurBB && !CurBB->empty())
+        I = &CurBB->back();
+      else if (CurBBNo && FunctionBBs[CurBBNo-1] &&
+               !FunctionBBs[CurBBNo-1]->empty())
+        I = &FunctionBBs[CurBBNo-1]->back();
+
+      if (I == 0) return Error("Invalid DEBUG_LOC_AGAIN record");
+      I->setDebugLoc(LastLoc);
+      I = 0;
+      continue;
+
+    case bitc::FUNC_CODE_DEBUG_LOC: {      // DEBUG_LOC: [line, col, scope, ia]
+      I = 0;     // Get the last instruction emitted.
+      if (CurBB && !CurBB->empty())
+        I = &CurBB->back();
+      else if (CurBBNo && FunctionBBs[CurBBNo-1] &&
+               !FunctionBBs[CurBBNo-1]->empty())
+        I = &FunctionBBs[CurBBNo-1]->back();
+      if (I == 0 || Record.size() < 4)
+        return Error("Invalid FUNC_CODE_DEBUG_LOC record");
+
+      unsigned Line = Record[0], Col = Record[1];
+      unsigned ScopeID = Record[2], IAID = Record[3];
+
+      MDNode *Scope = 0, *IA = 0;
+      if (ScopeID) Scope = cast<MDNode>(MDValueList.getValueFwdRef(ScopeID-1));
+      if (IAID)    IA = cast<MDNode>(MDValueList.getValueFwdRef(IAID-1));
+      LastLoc = DebugLoc::get(Line, Col, Scope, IA);
+      I->setDebugLoc(LastLoc);
+      I = 0;
+      continue;
+    }
+
+    case bitc::FUNC_CODE_INST_BINOP: {    // BINOP: [opval, ty, opval, opcode]
+      unsigned OpNum = 0;
+      Value *LHS, *RHS;
+      if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
+          popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
+          OpNum+1 > Record.size())
+        return Error("Invalid BINOP record");
+
+      int Opc = GetDecodedBinaryOpcode(Record[OpNum++], LHS->getType());
+      if (Opc == -1) return Error("Invalid BINOP record");
+      I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+      InstructionList.push_back(I);
+      if (OpNum < Record.size()) {
+        if (Opc == Instruction::Add ||
+            Opc == Instruction::Sub ||
+            Opc == Instruction::Mul ||
+            Opc == Instruction::Shl) {
+          if (Record[OpNum] & (1 << bitc::OBO_NO_SIGNED_WRAP))
+            cast<BinaryOperator>(I)->setHasNoSignedWrap(true);
+          if (Record[OpNum] & (1 << bitc::OBO_NO_UNSIGNED_WRAP))
+            cast<BinaryOperator>(I)->setHasNoUnsignedWrap(true);
+        } else if (Opc == Instruction::SDiv ||
+                   Opc == Instruction::UDiv ||
+                   Opc == Instruction::LShr ||
+                   Opc == Instruction::AShr) {
+          if (Record[OpNum] & (1 << bitc::PEO_EXACT))
+            cast<BinaryOperator>(I)->setIsExact(true);
+        } else if (isa<FPMathOperator>(I)) {
+          FastMathFlags FMF;
+          if (0 != (Record[OpNum] & FastMathFlags::UnsafeAlgebra))
+            FMF.setUnsafeAlgebra();
+          if (0 != (Record[OpNum] & FastMathFlags::NoNaNs))
+            FMF.setNoNaNs();
+          if (0 != (Record[OpNum] & FastMathFlags::NoInfs))
+            FMF.setNoInfs();
+          if (0 != (Record[OpNum] & FastMathFlags::NoSignedZeros))
+            FMF.setNoSignedZeros();
+          if (0 != (Record[OpNum] & FastMathFlags::AllowReciprocal))
+            FMF.setAllowReciprocal();
+          if (FMF.any())
+            I->setFastMathFlags(FMF);
+        }
+
+      }
+      break;
+    }
+    case bitc::FUNC_CODE_INST_CAST: {    // CAST: [opval, opty, destty, castopc]
+      unsigned OpNum = 0;
+      Value *Op;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
+          OpNum+2 != Record.size())
+        return Error("Invalid CAST record");
+
+      Type *ResTy = getTypeByID(Record[OpNum]);
+      int Opc = GetDecodedCastOpcode(Record[OpNum+1]);
+      if (Opc == -1 || ResTy == 0)
+        return Error("Invalid CAST record");
+      I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_INBOUNDS_GEP:
+    case bitc::FUNC_CODE_INST_GEP: { // GEP: [n x operands]
+      unsigned OpNum = 0;
+      Value *BasePtr;
+      if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr))
+        return Error("Invalid GEP record");
+
+      SmallVector<Value*, 16> GEPIdx;
+      while (OpNum != Record.size()) {
+        Value *Op;
+        if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+          return Error("Invalid GEP record");
+        GEPIdx.push_back(Op);
+      }
+
+      I = GetElementPtrInst::Create(BasePtr, GEPIdx);
+      InstructionList.push_back(I);
+      if (BitCode == bitc::FUNC_CODE_INST_INBOUNDS_GEP)
+        cast<GetElementPtrInst>(I)->setIsInBounds(true);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_EXTRACTVAL: {
+                                       // EXTRACTVAL: [opty, opval, n x indices]
+      unsigned OpNum = 0;
+      Value *Agg;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
+        return Error("Invalid EXTRACTVAL record");
+
+      SmallVector<unsigned, 4> EXTRACTVALIdx;
+      for (unsigned RecSize = Record.size();
+           OpNum != RecSize; ++OpNum) {
+        uint64_t Index = Record[OpNum];
+        if ((unsigned)Index != Index)
+          return Error("Invalid EXTRACTVAL index");
+        EXTRACTVALIdx.push_back((unsigned)Index);
+      }
+
+      I = ExtractValueInst::Create(Agg, EXTRACTVALIdx);
+      InstructionList.push_back(I);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_INSERTVAL: {
+                           // INSERTVAL: [opty, opval, opty, opval, n x indices]
+      unsigned OpNum = 0;
+      Value *Agg;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
+        return Error("Invalid INSERTVAL record");
+      Value *Val;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Val))
+        return Error("Invalid INSERTVAL record");
+
+      SmallVector<unsigned, 4> INSERTVALIdx;
+      for (unsigned RecSize = Record.size();
+           OpNum != RecSize; ++OpNum) {
+        uint64_t Index = Record[OpNum];
+        if ((unsigned)Index != Index)
+          return Error("Invalid INSERTVAL index");
+        INSERTVALIdx.push_back((unsigned)Index);
+      }
+
+      I = InsertValueInst::Create(Agg, Val, INSERTVALIdx);
+      InstructionList.push_back(I);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_SELECT: { // SELECT: [opval, ty, opval, opval]
+      // obsolete form of select
+      // handles select i1 ... in old bitcode
+      unsigned OpNum = 0;
+      Value *TrueVal, *FalseVal, *Cond;
+      if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
+          popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
+          popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond))
+        return Error("Invalid SELECT record");
+
+      I = SelectInst::Create(Cond, TrueVal, FalseVal);
+      InstructionList.push_back(I);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_VSELECT: {// VSELECT: [ty,opval,opval,predty,pred]
+      // new form of select
+      // handles select i1 or select [N x i1]
+      unsigned OpNum = 0;
+      Value *TrueVal, *FalseVal, *Cond;
+      if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
+          popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
+          getValueTypePair(Record, OpNum, NextValueNo, Cond))
+        return Error("Invalid SELECT record");
+
+      // select condition can be either i1 or [N x i1]
+      if (VectorType* vector_type =
+          dyn_cast<VectorType>(Cond->getType())) {
+        // expect <n x i1>
+        if (vector_type->getElementType() != Type::getInt1Ty(Context))
+          return Error("Invalid SELECT condition type");
+      } else {
+        // expect i1
+        if (Cond->getType() != Type::getInt1Ty(Context))
+          return Error("Invalid SELECT condition type");
+      }
+
+      I = SelectInst::Create(Cond, TrueVal, FalseVal);
+      InstructionList.push_back(I);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_EXTRACTELT: { // EXTRACTELT: [opty, opval, opval]
+      unsigned OpNum = 0;
+      Value *Vec, *Idx;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
+          popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
+        return Error("Invalid EXTRACTELT record");
+      I = ExtractElementInst::Create(Vec, Idx);
+      InstructionList.push_back(I);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_INSERTELT: { // INSERTELT: [ty, opval,opval,opval]
+      unsigned OpNum = 0;
+      Value *Vec, *Elt, *Idx;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
+          popValue(Record, OpNum, NextValueNo,
+                   cast<VectorType>(Vec->getType())->getElementType(), Elt) ||
+          popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
+        return Error("Invalid INSERTELT record");
+      I = InsertElementInst::Create(Vec, Elt, Idx);
+      InstructionList.push_back(I);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_SHUFFLEVEC: {// SHUFFLEVEC: [opval,ty,opval,opval]
+      unsigned OpNum = 0;
+      Value *Vec1, *Vec2, *Mask;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) ||
+          popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2))
+        return Error("Invalid SHUFFLEVEC record");
+
+      if (getValueTypePair(Record, OpNum, NextValueNo, Mask))
+        return Error("Invalid SHUFFLEVEC record");
+      I = new ShuffleVectorInst(Vec1, Vec2, Mask);
+      InstructionList.push_back(I);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_CMP:   // CMP: [opty, opval, opval, pred]
+      // Old form of ICmp/FCmp returning bool
+      // Existed to differentiate between icmp/fcmp and vicmp/vfcmp which were
+      // both legal on vectors but had different behaviour.
+    case bitc::FUNC_CODE_INST_CMP2: { // CMP2: [opty, opval, opval, pred]
+      // FCmp/ICmp returning bool or vector of bool
+
+      unsigned OpNum = 0;
+      Value *LHS, *RHS;
+      if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
+          popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
+          OpNum+1 != Record.size())
+        return Error("Invalid CMP record");
+
+      if (LHS->getType()->isFPOrFPVectorTy())
+        I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
+      else
+        I = new ICmpInst((ICmpInst::Predicate)Record[OpNum], LHS, RHS);
+      InstructionList.push_back(I);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_RET: // RET: [opty,opval<optional>]
+      {
+        unsigned Size = Record.size();
+        if (Size == 0) {
+          I = ReturnInst::Create(Context);
+          InstructionList.push_back(I);
+          break;
+        }
+
+        unsigned OpNum = 0;
+        Value *Op = NULL;
+        if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+          return Error("Invalid RET record");
+        if (OpNum != Record.size())
+          return Error("Invalid RET record");
+
+        I = ReturnInst::Create(Context, Op);
+        InstructionList.push_back(I);
+        break;
+      }
+    case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#]
+      if (Record.size() != 1 && Record.size() != 3)
+        return Error("Invalid BR record");
+      BasicBlock *TrueDest = getBasicBlock(Record[0]);
+      if (TrueDest == 0)
+        return Error("Invalid BR record");
+
+      if (Record.size() == 1) {
+        I = BranchInst::Create(TrueDest);
+        InstructionList.push_back(I);
+      }
+      else {
+        BasicBlock *FalseDest = getBasicBlock(Record[1]);
+        Value *Cond = getValue(Record, 2, NextValueNo,
+                               Type::getInt1Ty(Context));
+        if (FalseDest == 0 || Cond == 0)
+          return Error("Invalid BR record");
+        I = BranchInst::Create(TrueDest, FalseDest, Cond);
+        InstructionList.push_back(I);
+      }
+      break;
+    }
+    case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...]
+      // Check magic
+      if ((Record[0] >> 16) == SWITCH_INST_MAGIC) {
+        // New SwitchInst format with case ranges.
+
+        Type *OpTy = getTypeByID(Record[1]);
+        unsigned ValueBitWidth = cast<IntegerType>(OpTy)->getBitWidth();
+
+        Value *Cond = getValue(Record, 2, NextValueNo, OpTy);
+        BasicBlock *Default = getBasicBlock(Record[3]);
+        if (OpTy == 0 || Cond == 0 || Default == 0)
+          return Error("Invalid SWITCH record");
+
+        unsigned NumCases = Record[4];
+
+        SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
+        InstructionList.push_back(SI);
+
+        unsigned CurIdx = 5;
+        for (unsigned i = 0; i != NumCases; ++i) {
+          IntegersSubsetToBB CaseBuilder;
+          unsigned NumItems = Record[CurIdx++];
+          for (unsigned ci = 0; ci != NumItems; ++ci) {
+            bool isSingleNumber = Record[CurIdx++];
+
+            APInt Low;
+            unsigned ActiveWords = 1;
+            if (ValueBitWidth > 64)
+              ActiveWords = Record[CurIdx++];
+            Low = ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords),
+                                ValueBitWidth);
+            CurIdx += ActiveWords;
+
+            if (!isSingleNumber) {
+              ActiveWords = 1;
+              if (ValueBitWidth > 64)
+                ActiveWords = Record[CurIdx++];
+              APInt High =
+                  ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords),
+                                ValueBitWidth);
+
+              CaseBuilder.add(IntItem::fromType(OpTy, Low),
+                              IntItem::fromType(OpTy, High));
+              CurIdx += ActiveWords;
+            } else
+              CaseBuilder.add(IntItem::fromType(OpTy, Low));
+          }
+          BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]);
+          IntegersSubset Case = CaseBuilder.getCase();
+          SI->addCase(Case, DestBB);
+        }
+        uint16_t Hash = SI->hash();
+        if (Hash != (Record[0] & 0xFFFF))
+          return Error("Invalid SWITCH record");
+        I = SI;
+        break;
+      }
+
+      // Old SwitchInst format without case ranges.
+
+      if (Record.size() < 3 || (Record.size() & 1) == 0)
+        return Error("Invalid SWITCH record");
+      Type *OpTy = getTypeByID(Record[0]);
+      Value *Cond = getValue(Record, 1, NextValueNo, OpTy);
+      BasicBlock *Default = getBasicBlock(Record[2]);
+      if (OpTy == 0 || Cond == 0 || Default == 0)
+        return Error("Invalid SWITCH record");
+      unsigned NumCases = (Record.size()-3)/2;
+      SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
+      InstructionList.push_back(SI);
+      for (unsigned i = 0, e = NumCases; i != e; ++i) {
+        ConstantInt *CaseVal =
+          dyn_cast_or_null<ConstantInt>(getFnValueByID(Record[3+i*2], OpTy));
+        BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]);
+        if (CaseVal == 0 || DestBB == 0) {
+          delete SI;
+          return Error("Invalid SWITCH record!");
+        }
+        SI->addCase(CaseVal, DestBB);
+      }
+      I = SI;
+      break;
+    }
+    case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...]
+      if (Record.size() < 2)
+        return Error("Invalid INDIRECTBR record");
+      Type *OpTy = getTypeByID(Record[0]);
+      Value *Address = getValue(Record, 1, NextValueNo, OpTy);
+      if (OpTy == 0 || Address == 0)
+        return Error("Invalid INDIRECTBR record");
+      unsigned NumDests = Record.size()-2;
+      IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests);
+      InstructionList.push_back(IBI);
+      for (unsigned i = 0, e = NumDests; i != e; ++i) {
+        if (BasicBlock *DestBB = getBasicBlock(Record[2+i])) {
+          IBI->addDestination(DestBB);
+        } else {
+          delete IBI;
+          return Error("Invalid INDIRECTBR record!");
+        }
+      }
+      I = IBI;
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_INVOKE: {
+      // INVOKE: [attrs, cc, normBB, unwindBB, fnty, op0,op1,op2, ...]
+      if (Record.size() < 4) return Error("Invalid INVOKE record");
+      AttributeSet PAL = getAttributes(Record[0]);
+      unsigned CCInfo = Record[1];
+      BasicBlock *NormalBB = getBasicBlock(Record[2]);
+      BasicBlock *UnwindBB = getBasicBlock(Record[3]);
+
+      unsigned OpNum = 4;
+      Value *Callee;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
+        return Error("Invalid INVOKE record");
+
+      PointerType *CalleeTy = dyn_cast<PointerType>(Callee->getType());
+      FunctionType *FTy = !CalleeTy ? 0 :
+        dyn_cast<FunctionType>(CalleeTy->getElementType());
+
+      // Check that the right number of fixed parameters are here.
+      if (FTy == 0 || NormalBB == 0 || UnwindBB == 0 ||
+          Record.size() < OpNum+FTy->getNumParams())
+        return Error("Invalid INVOKE record");
+
+      SmallVector<Value*, 16> Ops;
+      for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
+        Ops.push_back(getValue(Record, OpNum, NextValueNo,
+                               FTy->getParamType(i)));
+        if (Ops.back() == 0) return Error("Invalid INVOKE record");
+      }
+
+      if (!FTy->isVarArg()) {
+        if (Record.size() != OpNum)
+          return Error("Invalid INVOKE record");
+      } else {
+        // Read type/value pairs for varargs params.
+        while (OpNum != Record.size()) {
+          Value *Op;
+          if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+            return Error("Invalid INVOKE record");
+          Ops.push_back(Op);
+        }
+      }
+
+      I = InvokeInst::Create(Callee, NormalBB, UnwindBB, Ops);
+      InstructionList.push_back(I);
+      cast<InvokeInst>(I)->setCallingConv(
+        static_cast<CallingConv::ID>(CCInfo));
+      cast<InvokeInst>(I)->setAttributes(PAL);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_RESUME: { // RESUME: [opval]
+      unsigned Idx = 0;
+      Value *Val = 0;
+      if (getValueTypePair(Record, Idx, NextValueNo, Val))
+        return Error("Invalid RESUME record");
+      I = ResumeInst::Create(Val);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_UNREACHABLE: // UNREACHABLE
+      I = new UnreachableInst(Context);
+      InstructionList.push_back(I);
+      break;
+    case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
+      if (Record.size() < 1 || ((Record.size()-1)&1))
+        return Error("Invalid PHI record");
+      Type *Ty = getTypeByID(Record[0]);
+      if (!Ty) return Error("Invalid PHI record");
+
+      PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2);
+      InstructionList.push_back(PN);
+
+      for (unsigned i = 0, e = Record.size()-1; i != e; i += 2) {
+        Value *V;
+        // With the new function encoding, it is possible that operands have
+        // negative IDs (for forward references).  Use a signed VBR
+        // representation to keep the encoding small.
+        if (UseRelativeIDs)
+          V = getValueSigned(Record, 1+i, NextValueNo, Ty);
+        else
+          V = getValue(Record, 1+i, NextValueNo, Ty);
+        BasicBlock *BB = getBasicBlock(Record[2+i]);
+        if (!V || !BB) return Error("Invalid PHI record");
+        PN->addIncoming(V, BB);
+      }
+      I = PN;
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_LANDINGPAD: {
+      // LANDINGPAD: [ty, val, val, num, (id0,val0 ...)?]
+      unsigned Idx = 0;
+      if (Record.size() < 4)
+        return Error("Invalid LANDINGPAD record");
+      Type *Ty = getTypeByID(Record[Idx++]);
+      if (!Ty) return Error("Invalid LANDINGPAD record");
+      Value *PersFn = 0;
+      if (getValueTypePair(Record, Idx, NextValueNo, PersFn))
+        return Error("Invalid LANDINGPAD record");
+
+      bool IsCleanup = !!Record[Idx++];
+      unsigned NumClauses = Record[Idx++];
+      LandingPadInst *LP = LandingPadInst::Create(Ty, PersFn, NumClauses);
+      LP->setCleanup(IsCleanup);
+      for (unsigned J = 0; J != NumClauses; ++J) {
+        LandingPadInst::ClauseType CT =
+          LandingPadInst::ClauseType(Record[Idx++]); (void)CT;
+        Value *Val;
+
+        if (getValueTypePair(Record, Idx, NextValueNo, Val)) {
+          delete LP;
+          return Error("Invalid LANDINGPAD record");
+        }
+
+        assert((CT != LandingPadInst::Catch ||
+                !isa<ArrayType>(Val->getType())) &&
+               "Catch clause has a invalid type!");
+        assert((CT != LandingPadInst::Filter ||
+                isa<ArrayType>(Val->getType())) &&
+               "Filter clause has invalid type!");
+        LP->addClause(Val);
+      }
+
+      I = LP;
+      InstructionList.push_back(I);
+      break;
+    }
+
+    case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align]
+      if (Record.size() != 4)
+        return Error("Invalid ALLOCA record");
+      PointerType *Ty =
+        dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
+      Type *OpTy = getTypeByID(Record[1]);
+      Value *Size = getFnValueByID(Record[2], OpTy);
+      unsigned Align = Record[3];
+      if (!Ty || !Size) return Error("Invalid ALLOCA record");
+      I = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_LOAD: { // LOAD: [opty, op, align, vol]
+      unsigned OpNum = 0;
+      Value *Op;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
+          OpNum+2 != Record.size())
+        return Error("Invalid LOAD record");
+
+      I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_LOADATOMIC: {
+       // LOADATOMIC: [opty, op, align, vol, ordering, synchscope]
+      unsigned OpNum = 0;
+      Value *Op;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
+          OpNum+4 != Record.size())
+        return Error("Invalid LOADATOMIC record");
+
+
+      AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
+      if (Ordering == NotAtomic || Ordering == Release ||
+          Ordering == AcquireRelease)
+        return Error("Invalid LOADATOMIC record");
+      if (Ordering != NotAtomic && Record[OpNum] == 0)
+        return Error("Invalid LOADATOMIC record");
+      SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
+
+      I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1,
+                       Ordering, SynchScope);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
+      unsigned OpNum = 0;
+      Value *Val, *Ptr;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+          popValue(Record, OpNum, NextValueNo,
+                    cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
+          OpNum+2 != Record.size())
+        return Error("Invalid STORE record");
+
+      I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_STOREATOMIC: {
+      // STOREATOMIC: [ptrty, ptr, val, align, vol, ordering, synchscope]
+      unsigned OpNum = 0;
+      Value *Val, *Ptr;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+          popValue(Record, OpNum, NextValueNo,
+                    cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
+          OpNum+4 != Record.size())
+        return Error("Invalid STOREATOMIC record");
+
+      AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
+      if (Ordering == NotAtomic || Ordering == Acquire ||
+          Ordering == AcquireRelease)
+        return Error("Invalid STOREATOMIC record");
+      SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
+      if (Ordering != NotAtomic && Record[OpNum] == 0)
+        return Error("Invalid STOREATOMIC record");
+
+      I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1,
+                        Ordering, SynchScope);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_CMPXCHG: {
+      // CMPXCHG:[ptrty, ptr, cmp, new, vol, ordering, synchscope]
+      unsigned OpNum = 0;
+      Value *Ptr, *Cmp, *New;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+          popValue(Record, OpNum, NextValueNo,
+                    cast<PointerType>(Ptr->getType())->getElementType(), Cmp) ||
+          popValue(Record, OpNum, NextValueNo,
+                    cast<PointerType>(Ptr->getType())->getElementType(), New) ||
+          OpNum+3 != Record.size())
+        return Error("Invalid CMPXCHG record");
+      AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+1]);
+      if (Ordering == NotAtomic || Ordering == Unordered)
+        return Error("Invalid CMPXCHG record");
+      SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+2]);
+      I = new AtomicCmpXchgInst(Ptr, Cmp, New, Ordering, SynchScope);
+      cast<AtomicCmpXchgInst>(I)->setVolatile(Record[OpNum]);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_ATOMICRMW: {
+      // ATOMICRMW:[ptrty, ptr, val, op, vol, ordering, synchscope]
+      unsigned OpNum = 0;
+      Value *Ptr, *Val;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+          popValue(Record, OpNum, NextValueNo,
+                    cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
+          OpNum+4 != Record.size())
+        return Error("Invalid ATOMICRMW record");
+      AtomicRMWInst::BinOp Operation = GetDecodedRMWOperation(Record[OpNum]);
+      if (Operation < AtomicRMWInst::FIRST_BINOP ||
+          Operation > AtomicRMWInst::LAST_BINOP)
+        return Error("Invalid ATOMICRMW record");
+      AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
+      if (Ordering == NotAtomic || Ordering == Unordered)
+        return Error("Invalid ATOMICRMW record");
+      SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
+      I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope);
+      cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, synchscope]
+      if (2 != Record.size())
+        return Error("Invalid FENCE record");
+      AtomicOrdering Ordering = GetDecodedOrdering(Record[0]);
+      if (Ordering == NotAtomic || Ordering == Unordered ||
+          Ordering == Monotonic)
+        return Error("Invalid FENCE record");
+      SynchronizationScope SynchScope = GetDecodedSynchScope(Record[1]);
+      I = new FenceInst(Context, Ordering, SynchScope);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_CALL: {
+      // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...]
+      if (Record.size() < 3)
+        return Error("Invalid CALL record");
+
+      AttributeSet PAL = getAttributes(Record[0]);
+      unsigned CCInfo = Record[1];
+
+      unsigned OpNum = 2;
+      Value *Callee;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
+        return Error("Invalid CALL record");
+
+      PointerType *OpTy = dyn_cast<PointerType>(Callee->getType());
+      FunctionType *FTy = 0;
+      if (OpTy) FTy = dyn_cast<FunctionType>(OpTy->getElementType());
+      if (!FTy || Record.size() < FTy->getNumParams()+OpNum)
+        return Error("Invalid CALL record");
+
+      SmallVector<Value*, 16> Args;
+      // Read the fixed params.
+      for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
+        if (FTy->getParamType(i)->isLabelTy())
+          Args.push_back(getBasicBlock(Record[OpNum]));
+        else
+          Args.push_back(getValue(Record, OpNum, NextValueNo,
+                                  FTy->getParamType(i)));
+        if (Args.back() == 0) return Error("Invalid CALL record");
+      }
+
+      // Read type/value pairs for varargs params.
+      if (!FTy->isVarArg()) {
+        if (OpNum != Record.size())
+          return Error("Invalid CALL record");
+      } else {
+        while (OpNum != Record.size()) {
+          Value *Op;
+          if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+            return Error("Invalid CALL record");
+          Args.push_back(Op);
+        }
+      }
+
+      I = CallInst::Create(Callee, Args);
+      InstructionList.push_back(I);
+      cast<CallInst>(I)->setCallingConv(
+        static_cast<CallingConv::ID>(CCInfo>>1));
+      cast<CallInst>(I)->setTailCall(CCInfo & 1);
+      cast<CallInst>(I)->setAttributes(PAL);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty]
+      if (Record.size() < 3)
+        return Error("Invalid VAARG record");
+      Type *OpTy = getTypeByID(Record[0]);
+      Value *Op = getValue(Record, 1, NextValueNo, OpTy);
+      Type *ResTy = getTypeByID(Record[2]);
+      if (!OpTy || !Op || !ResTy)
+        return Error("Invalid VAARG record");
+      I = new VAArgInst(Op, ResTy);
+      InstructionList.push_back(I);
+      break;
+    }
+    }
+
+    // Add instruction to end of current BB.  If there is no current BB, reject
+    // this file.
+    if (CurBB == 0) {
+      delete I;
+      return Error("Invalid instruction with no BB");
+    }
+    CurBB->getInstList().push_back(I);
+
+    // If this was a terminator instruction, move to the next block.
+    if (isa<TerminatorInst>(I)) {
+      ++CurBBNo;
+      CurBB = CurBBNo < FunctionBBs.size() ? FunctionBBs[CurBBNo] : 0;
+    }
+
+    // Non-void values get registered in the value table for future use.
+    if (I && !I->getType()->isVoidTy())
+      ValueList.AssignValue(I, NextValueNo++);
+  }
+
+OutOfRecordLoop:
+
+  // Check the function list for unresolved values.
+  if (Argument *A = dyn_cast<Argument>(ValueList.back())) {
+    if (A->getParent() == 0) {
+      // We found at least one unresolved value.  Nuke them all to avoid leaks.
+      for (unsigned i = ModuleValueListSize, e = ValueList.size(); i != e; ++i){
+        if ((A = dyn_cast<Argument>(ValueList[i])) && A->getParent() == 0) {
+          A->replaceAllUsesWith(UndefValue::get(A->getType()));
+          delete A;
+        }
+      }
+      return Error("Never resolved value found in function!");
+    }
+  }
+
+  // FIXME: Check for unresolved forward-declared metadata references
+  // and clean up leaks.
+
+  // See if anything took the address of blocks in this function.  If so,
+  // resolve them now.
+  DenseMap<Function*, std::vector<BlockAddrRefTy> >::iterator BAFRI =
+    BlockAddrFwdRefs.find(F);
+  if (BAFRI != BlockAddrFwdRefs.end()) {
+    std::vector<BlockAddrRefTy> &RefList = BAFRI->second;
+    for (unsigned i = 0, e = RefList.size(); i != e; ++i) {
+      unsigned BlockIdx = RefList[i].first;
+      if (BlockIdx >= FunctionBBs.size())
+        return Error("Invalid blockaddress block #");
+
+      GlobalVariable *FwdRef = RefList[i].second;
+      FwdRef->replaceAllUsesWith(BlockAddress::get(F, FunctionBBs[BlockIdx]));
+      FwdRef->eraseFromParent();
+    }
+
+    BlockAddrFwdRefs.erase(BAFRI);
+  }
+
+  // Trim the value list down to the size it was before we parsed this function.
+  ValueList.shrinkTo(ModuleValueListSize);
+  MDValueList.shrinkTo(ModuleMDValueListSize);
+  std::vector<BasicBlock*>().swap(FunctionBBs);
+  return false;
+}
+
+/// FindFunctionInStream - Find the function body in the bitcode stream
+bool BitcodeReader::FindFunctionInStream(Function *F,
+       DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator) {
+  while (DeferredFunctionInfoIterator->second == 0) {
+    if (Stream.AtEndOfStream())
+      return Error("Could not find Function in stream");
+    // ParseModule will parse the next body in the stream and set its
+    // position in the DeferredFunctionInfo map.
+    if (ParseModule(true)) return true;
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// GVMaterializer implementation
+//===----------------------------------------------------------------------===//
+
+
+bool BitcodeReader::isMaterializable(const GlobalValue *GV) const {
+  if (const Function *F = dyn_cast<Function>(GV)) {
+    return F->isDeclaration() &&
+      DeferredFunctionInfo.count(const_cast<Function*>(F));
+  }
+  return false;
+}
+
+bool BitcodeReader::Materialize(GlobalValue *GV, std::string *ErrInfo) {
+  Function *F = dyn_cast<Function>(GV);
+  // If it's not a function or is already material, ignore the request.
+  if (!F || !F->isMaterializable()) return false;
+
+  DenseMap<Function*, uint64_t>::iterator DFII = DeferredFunctionInfo.find(F);
+  assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!");
+  // If its position is recorded as 0, its body is somewhere in the stream
+  // but we haven't seen it yet.
+  if (DFII->second == 0)
+    if (LazyStreamer && FindFunctionInStream(F, DFII)) return true;
+
+  // Move the bit stream to the saved position of the deferred function body.
+  Stream.JumpToBit(DFII->second);
+
+  if (ParseFunctionBody(F)) {
+    if (ErrInfo) *ErrInfo = ErrorString;
+    return true;
+  }
+
+  // Upgrade any old intrinsic calls in the function.
+  for (UpgradedIntrinsicMap::iterator I = UpgradedIntrinsics.begin(),
+       E = UpgradedIntrinsics.end(); I != E; ++I) {
+    if (I->first != I->second) {
+      for (Value::use_iterator UI = I->first->use_begin(),
+           UE = I->first->use_end(); UI != UE; ) {
+        if (CallInst* CI = dyn_cast<CallInst>(*UI++))
+          UpgradeIntrinsicCall(CI, I->second);
+      }
+    }
+  }
+
+  return false;
+}
+
+bool BitcodeReader::isDematerializable(const GlobalValue *GV) const {
+  const Function *F = dyn_cast<Function>(GV);
+  if (!F || F->isDeclaration())
+    return false;
+  return DeferredFunctionInfo.count(const_cast<Function*>(F));
+}
+
+void BitcodeReader::Dematerialize(GlobalValue *GV) {
+  Function *F = dyn_cast<Function>(GV);
+  // If this function isn't dematerializable, this is a noop.
+  if (!F || !isDematerializable(F))
+    return;
+
+  assert(DeferredFunctionInfo.count(F) && "No info to read function later?");
+
+  // Just forget the function body, we can remat it later.
+  F->deleteBody();
+}
+
+
+bool BitcodeReader::MaterializeModule(Module *M, std::string *ErrInfo) {
+  assert(M == TheModule &&
+         "Can only Materialize the Module this BitcodeReader is attached to.");
+  // Iterate over the module, deserializing any functions that are still on
+  // disk.
+  for (Module::iterator F = TheModule->begin(), E = TheModule->end();
+       F != E; ++F)
+    if (F->isMaterializable() &&
+        Materialize(F, ErrInfo))
+      return true;
+
+  // At this point, if there are any function bodies, the current bit is
+  // pointing to the END_BLOCK record after them. Now make sure the rest
+  // of the bits in the module have been read.
+  if (NextUnreadBit)
+    ParseModule(true);
+
+  // Upgrade any intrinsic calls that slipped through (should not happen!) and
+  // delete the old functions to clean up. We can't do this unless the entire
+  // module is materialized because there could always be another function body
+  // with calls to the old function.
+  for (std::vector<std::pair<Function*, Function*> >::iterator I =
+       UpgradedIntrinsics.begin(), E = UpgradedIntrinsics.end(); I != E; ++I) {
+    if (I->first != I->second) {
+      for (Value::use_iterator UI = I->first->use_begin(),
+           UE = I->first->use_end(); UI != UE; ) {
+        if (CallInst* CI = dyn_cast<CallInst>(*UI++))
+          UpgradeIntrinsicCall(CI, I->second);
+      }
+      if (!I->first->use_empty())
+        I->first->replaceAllUsesWith(I->second);
+      I->first->eraseFromParent();
+    }
+  }
+  std::vector<std::pair<Function*, Function*> >().swap(UpgradedIntrinsics);
+
+  return false;
+}
+
+bool BitcodeReader::InitStream() {
+  if (LazyStreamer) return InitLazyStream();
+  return InitStreamFromBuffer();
+}
+
+bool BitcodeReader::InitStreamFromBuffer() {
+  const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart();
+  const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
+
+  if (Buffer->getBufferSize() & 3) {
+    if (!isRawBitcode(BufPtr, BufEnd) && !isBitcodeWrapper(BufPtr, BufEnd))
+      return Error("Invalid bitcode signature");
+    else
+      return Error("Bitcode stream should be a multiple of 4 bytes in length");
+  }
+
+  // If we have a wrapper header, parse it and ignore the non-bc file contents.
+  // The magic number is 0x0B17C0DE stored in little endian.
+  if (isBitcodeWrapper(BufPtr, BufEnd))
+    if (SkipBitcodeWrapperHeader(BufPtr, BufEnd, true))
+      return Error("Invalid bitcode wrapper header");
+
+  StreamFile.reset(new BitstreamReader(BufPtr, BufEnd));
+  Stream.init(*StreamFile);
+
+  return false;
+}
+
+bool BitcodeReader::InitLazyStream() {
+  // Check and strip off the bitcode wrapper; BitstreamReader expects never to
+  // see it.
+  StreamingMemoryObject *Bytes = new StreamingMemoryObject(LazyStreamer);
+  StreamFile.reset(new BitstreamReader(Bytes));
+  Stream.init(*StreamFile);
+
+  unsigned char buf[16];
+  if (Bytes->readBytes(0, 16, buf, NULL) == -1)
+    return Error("Bitcode stream must be at least 16 bytes in length");
+
+  if (!isBitcode(buf, buf + 16))
+    return Error("Invalid bitcode signature");
+
+  if (isBitcodeWrapper(buf, buf + 4)) {
+    const unsigned char *bitcodeStart = buf;
+    const unsigned char *bitcodeEnd = buf + 16;
+    SkipBitcodeWrapperHeader(bitcodeStart, bitcodeEnd, false);
+    Bytes->dropLeadingBytes(bitcodeStart - buf);
+    Bytes->setKnownObjectSize(bitcodeEnd - bitcodeStart);
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// External interface
+//===----------------------------------------------------------------------===//
+
+/// getLazyBitcodeModule - lazy function-at-a-time loading from a file.
+///
+Module *llvm::getLazyBitcodeModule(MemoryBuffer *Buffer,
+                                   LLVMContext& Context,
+                                   std::string *ErrMsg) {
+  Module *M = new Module(Buffer->getBufferIdentifier(), Context);
+  BitcodeReader *R = new BitcodeReader(Buffer, Context);
+  M->setMaterializer(R);
+  if (R->ParseBitcodeInto(M)) {
+    if (ErrMsg)
+      *ErrMsg = R->getErrorString();
+
+    delete M;  // Also deletes R.
+    return 0;
+  }
+  // Have the BitcodeReader dtor delete 'Buffer'.
+  R->setBufferOwned(true);
+
+  R->materializeForwardReferencedFunctions();
+
+  return M;
+}
+
+
+Module *llvm::getStreamedBitcodeModule(const std::string &name,
+                                       DataStreamer *streamer,
+                                       LLVMContext &Context,
+                                       std::string *ErrMsg) {
+  Module *M = new Module(name, Context);
+  BitcodeReader *R = new BitcodeReader(streamer, Context);
+  M->setMaterializer(R);
+  if (R->ParseBitcodeInto(M)) {
+    if (ErrMsg)
+      *ErrMsg = R->getErrorString();
+    delete M;  // Also deletes R.
+    return 0;
+  }
+  R->setBufferOwned(false); // no buffer to delete
+  return M;
+}
+
+/// ParseBitcodeFile - Read the specified bitcode file, returning the module.
+/// If an error occurs, return null and fill in *ErrMsg if non-null.
+Module *llvm::ParseBitcodeFile(MemoryBuffer *Buffer, LLVMContext& Context,
+                               std::string *ErrMsg){
+  Module *M = getLazyBitcodeModule(Buffer, Context, ErrMsg);
+  if (!M) return 0;
+
+  // Don't let the BitcodeReader dtor delete 'Buffer', regardless of whether
+  // there was an error.
+  static_cast<BitcodeReader*>(M->getMaterializer())->setBufferOwned(false);
+
+  // Read in the entire module, and destroy the BitcodeReader.
+  if (M->MaterializeAllPermanently(ErrMsg)) {
+    delete M;
+    return 0;
+  }
+
+  // TODO: Restore the use-lists to the in-memory state when the bitcode was
+  // written.  We must defer until the Module has been fully materialized.
+
+  return M;
+}
+
+std::string llvm::getBitcodeTargetTriple(MemoryBuffer *Buffer,
+                                         LLVMContext& Context,
+                                         std::string *ErrMsg) {
+  BitcodeReader *R = new BitcodeReader(Buffer, Context);
+  // Don't let the BitcodeReader dtor delete 'Buffer'.
+  R->setBufferOwned(false);
+
+  std::string Triple("");
+  if (R->ParseTriple(Triple))
+    if (ErrMsg)
+      *ErrMsg = R->getErrorString();
+
+  delete R;
+  return Triple;
+}
diff --git a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.h b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.h
new file mode 100644
index 000000000000..28674eb14ef2
--- /dev/null
+++ b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.h
@@ -0,0 +1,349 @@
+//===- BitcodeReader.h - Internal BitcodeReader impl ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header defines the BitcodeReader class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BITCODE_READER_H
+#define BITCODE_READER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Bitcode/BitstreamReader.h"
+#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/GVMaterializer.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ValueHandle.h"
+#include <vector>
+
+namespace llvm {
+  class MemoryBuffer;
+  class LLVMContext;
+
+//===----------------------------------------------------------------------===//
+//                          BitcodeReaderValueList Class
+//===----------------------------------------------------------------------===//
+
+class BitcodeReaderValueList {
+  std::vector<WeakVH> ValuePtrs;
+
+  /// ResolveConstants - As we resolve forward-referenced constants, we add
+  /// information about them to this vector.  This allows us to resolve them in
+  /// bulk instead of resolving each reference at a time.  See the code in
+  /// ResolveConstantForwardRefs for more information about this.
+  ///
+  /// The key of this vector is the placeholder constant, the value is the slot
+  /// number that holds the resolved value.
+  typedef std::vector<std::pair<Constant*, unsigned> > ResolveConstantsTy;
+  ResolveConstantsTy ResolveConstants;
+  LLVMContext &Context;
+public:
+  BitcodeReaderValueList(LLVMContext &C) : Context(C) {}
+  ~BitcodeReaderValueList() {
+    assert(ResolveConstants.empty() && "Constants not resolved?");
+  }
+
+  // vector compatibility methods
+  unsigned size() const { return ValuePtrs.size(); }
+  void resize(unsigned N) { ValuePtrs.resize(N); }
+  void push_back(Value *V) {
+    ValuePtrs.push_back(V);
+  }
+
+  void clear() {
+    assert(ResolveConstants.empty() && "Constants not resolved?");
+    ValuePtrs.clear();
+  }
+
+  Value *operator[](unsigned i) const {
+    assert(i < ValuePtrs.size());
+    return ValuePtrs[i];
+  }
+
+  Value *back() const { return ValuePtrs.back(); }
+    void pop_back() { ValuePtrs.pop_back(); }
+  bool empty() const { return ValuePtrs.empty(); }
+  void shrinkTo(unsigned N) {
+    assert(N <= size() && "Invalid shrinkTo request!");
+    ValuePtrs.resize(N);
+  }
+
+  Constant *getConstantFwdRef(unsigned Idx, Type *Ty);
+  Value *getValueFwdRef(unsigned Idx, Type *Ty);
+
+  void AssignValue(Value *V, unsigned Idx);
+
+  /// ResolveConstantForwardRefs - Once all constants are read, this method bulk
+  /// resolves any forward references.
+  void ResolveConstantForwardRefs();
+};
+
+
+//===----------------------------------------------------------------------===//
+//                          BitcodeReaderMDValueList Class
+//===----------------------------------------------------------------------===//
+
+class BitcodeReaderMDValueList {
+  std::vector<WeakVH> MDValuePtrs;
+
+  LLVMContext &Context;
+public:
+  BitcodeReaderMDValueList(LLVMContext& C) : Context(C) {}
+
+  // vector compatibility methods
+  unsigned size() const       { return MDValuePtrs.size(); }
+  void resize(unsigned N)     { MDValuePtrs.resize(N); }
+  void push_back(Value *V)    { MDValuePtrs.push_back(V);  }
+  void clear()                { MDValuePtrs.clear();  }
+  Value *back() const         { return MDValuePtrs.back(); }
+  void pop_back()             { MDValuePtrs.pop_back(); }
+  bool empty() const          { return MDValuePtrs.empty(); }
+
+  Value *operator[](unsigned i) const {
+    assert(i < MDValuePtrs.size());
+    return MDValuePtrs[i];
+  }
+
+  void shrinkTo(unsigned N) {
+    assert(N <= size() && "Invalid shrinkTo request!");
+    MDValuePtrs.resize(N);
+  }
+
+  Value *getValueFwdRef(unsigned Idx);
+  void AssignValue(Value *V, unsigned Idx);
+};
+
+class BitcodeReader : public GVMaterializer {
+  LLVMContext &Context;
+  Module *TheModule;
+  MemoryBuffer *Buffer;
+  bool BufferOwned;
+  OwningPtr<BitstreamReader> StreamFile;
+  BitstreamCursor Stream;
+  DataStreamer *LazyStreamer;
+  uint64_t NextUnreadBit;
+  bool SeenValueSymbolTable;
+
+  const char *ErrorString;
+
+  std::vector<Type*> TypeList;
+  BitcodeReaderValueList ValueList;
+  BitcodeReaderMDValueList MDValueList;
+  SmallVector<Instruction *, 64> InstructionList;
+  SmallVector<SmallVector<uint64_t, 64>, 64> UseListRecords;
+
+  std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInits;
+  std::vector<std::pair<GlobalAlias*, unsigned> > AliasInits;
+
+  /// MAttributes - The set of attributes by index.  Index zero in the
+  /// file is for null, and is thus not represented here.  As such all indices
+  /// are off by one.
+  std::vector<AttributeSet> MAttributes;
+
+  /// \brief The set of attribute groups.
+  std::map<unsigned, AttributeSet> MAttributeGroups;
+
+  /// FunctionBBs - While parsing a function body, this is a list of the basic
+  /// blocks for the function.
+  std::vector<BasicBlock*> FunctionBBs;
+
+  // When reading the module header, this list is populated with functions that
+  // have bodies later in the file.
+  std::vector<Function*> FunctionsWithBodies;
+
+  // When intrinsic functions are encountered which require upgrading they are
+  // stored here with their replacement function.
+  typedef std::vector<std::pair<Function*, Function*> > UpgradedIntrinsicMap;
+  UpgradedIntrinsicMap UpgradedIntrinsics;
+
+  // Map the bitcode's custom MDKind ID to the Module's MDKind ID.
+  DenseMap<unsigned, unsigned> MDKindMap;
+
+  // Several operations happen after the module header has been read, but
+  // before function bodies are processed. This keeps track of whether
+  // we've done this yet.
+  bool SeenFirstFunctionBody;
+
+  /// DeferredFunctionInfo - When function bodies are initially scanned, this
+  /// map contains info about where to find deferred function body in the
+  /// stream.
+  DenseMap<Function*, uint64_t> DeferredFunctionInfo;
+
+  /// BlockAddrFwdRefs - These are blockaddr references to basic blocks.  These
+  /// are resolved lazily when functions are loaded.
+  typedef std::pair<unsigned, GlobalVariable*> BlockAddrRefTy;
+  DenseMap<Function*, std::vector<BlockAddrRefTy> > BlockAddrFwdRefs;
+
+  /// UseRelativeIDs - Indicates that we are using a new encoding for
+  /// instruction operands where most operands in the current
+  /// FUNCTION_BLOCK are encoded relative to the instruction number,
+  /// for a more compact encoding.  Some instruction operands are not
+  /// relative to the instruction ID: basic block numbers, and types.
+  /// Once the old style function blocks have been phased out, we would
+  /// not need this flag.
+  bool UseRelativeIDs;
+
+public:
+  explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C)
+    : Context(C), TheModule(0), Buffer(buffer), BufferOwned(false),
+      LazyStreamer(0), NextUnreadBit(0), SeenValueSymbolTable(false),
+      ErrorString(0), ValueList(C), MDValueList(C),
+      SeenFirstFunctionBody(false), UseRelativeIDs(false) {
+  }
+  explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C)
+    : Context(C), TheModule(0), Buffer(0), BufferOwned(false),
+      LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false),
+      ErrorString(0), ValueList(C), MDValueList(C),
+      SeenFirstFunctionBody(false), UseRelativeIDs(false) {
+  }
+  ~BitcodeReader() {
+    FreeState();
+  }
+
+  void materializeForwardReferencedFunctions();
+
+  void FreeState();
+
+  /// setBufferOwned - If this is true, the reader will destroy the MemoryBuffer
+  /// when the reader is destroyed.
+  void setBufferOwned(bool Owned) { BufferOwned = Owned; }
+
+  virtual bool isMaterializable(const GlobalValue *GV) const;
+  virtual bool isDematerializable(const GlobalValue *GV) const;
+  virtual bool Materialize(GlobalValue *GV, std::string *ErrInfo = 0);
+  virtual bool MaterializeModule(Module *M, std::string *ErrInfo = 0);
+  virtual void Dematerialize(GlobalValue *GV);
+
+  bool Error(const char *Str) {
+    ErrorString = Str;
+    return true;
+  }
+  const char *getErrorString() const { return ErrorString; }
+
+  /// @brief Main interface to parsing a bitcode buffer.
+  /// @returns true if an error occurred.
+  bool ParseBitcodeInto(Module *M);
+
+  /// @brief Cheap mechanism to just extract module triple
+  /// @returns true if an error occurred.
+  bool ParseTriple(std::string &Triple);
+
+  static uint64_t decodeSignRotatedValue(uint64_t V);
+
+private:
+  Type *getTypeByID(unsigned ID);
+  Value *getFnValueByID(unsigned ID, Type *Ty) {
+    if (Ty && Ty->isMetadataTy())
+      return MDValueList.getValueFwdRef(ID);
+    return ValueList.getValueFwdRef(ID, Ty);
+  }
+  BasicBlock *getBasicBlock(unsigned ID) const {
+    if (ID >= FunctionBBs.size()) return 0; // Invalid ID
+    return FunctionBBs[ID];
+  }
+  AttributeSet getAttributes(unsigned i) const {
+    if (i-1 < MAttributes.size())
+      return MAttributes[i-1];
+    return AttributeSet();
+  }
+
+  /// getValueTypePair - Read a value/type pair out of the specified record from
+  /// slot 'Slot'.  Increment Slot past the number of slots used in the record.
+  /// Return true on failure.
+  bool getValueTypePair(SmallVector<uint64_t, 64> &Record, unsigned &Slot,
+                        unsigned InstNum, Value *&ResVal) {
+    if (Slot == Record.size()) return true;
+    unsigned ValNo = (unsigned)Record[Slot++];
+    // Adjust the ValNo, if it was encoded relative to the InstNum.
+    if (UseRelativeIDs)
+      ValNo = InstNum - ValNo;
+    if (ValNo < InstNum) {
+      // If this is not a forward reference, just return the value we already
+      // have.
+      ResVal = getFnValueByID(ValNo, 0);
+      return ResVal == 0;
+    } else if (Slot == Record.size()) {
+      return true;
+    }
+
+    unsigned TypeNo = (unsigned)Record[Slot++];
+    ResVal = getFnValueByID(ValNo, getTypeByID(TypeNo));
+    return ResVal == 0;
+  }
+
+  /// popValue - Read a value out of the specified record from slot 'Slot'.
+  /// Increment Slot past the number of slots used by the value in the record.
+  /// Return true if there is an error.
+  bool popValue(SmallVector<uint64_t, 64> &Record, unsigned &Slot,
+                unsigned InstNum, Type *Ty, Value *&ResVal) {
+    if (getValue(Record, Slot, InstNum, Ty, ResVal))
+      return true;
+    // All values currently take a single record slot.
+    ++Slot;
+    return false;
+  }
+
+  /// getValue -- Like popValue, but does not increment the Slot number.
+  bool getValue(SmallVector<uint64_t, 64> &Record, unsigned Slot,
+                unsigned InstNum, Type *Ty, Value *&ResVal) {
+    ResVal = getValue(Record, Slot, InstNum, Ty);
+    return ResVal == 0;
+  }
+
+  /// getValue -- Version of getValue that returns ResVal directly,
+  /// or 0 if there is an error.
+  Value *getValue(SmallVector<uint64_t, 64> &Record, unsigned Slot,
+                  unsigned InstNum, Type *Ty) {
+    if (Slot == Record.size()) return 0;
+    unsigned ValNo = (unsigned)Record[Slot];
+    // Adjust the ValNo, if it was encoded relative to the InstNum.
+    if (UseRelativeIDs)
+      ValNo = InstNum - ValNo;
+    return getFnValueByID(ValNo, Ty);
+  }
+
+  /// getValueSigned -- Like getValue, but decodes signed VBRs.
+  Value *getValueSigned(SmallVector<uint64_t, 64> &Record, unsigned Slot,
+                        unsigned InstNum, Type *Ty) {
+    if (Slot == Record.size()) return 0;
+    unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]);
+    // Adjust the ValNo, if it was encoded relative to the InstNum.
+    if (UseRelativeIDs)
+      ValNo = InstNum - ValNo;
+    return getFnValueByID(ValNo, Ty);
+  }
+
+  bool ParseModule(bool Resume);
+  bool ParseAttributeBlock();
+  bool ParseAttributeGroupBlock();
+  bool ParseTypeTable();
+  bool ParseTypeTableBody();
+
+  bool ParseValueSymbolTable();
+  bool ParseConstants();
+  bool RememberAndSkipFunctionBody();
+  bool ParseFunctionBody(Function *F);
+  bool GlobalCleanup();
+  bool ResolveGlobalAndAliasInits();
+  bool ParseMetadata();
+  bool ParseMetadataAttachment();
+  bool ParseModuleTriple(std::string &Triple);
+  bool ParseUseLists();
+  bool InitStream();
+  bool InitStreamFromBuffer();
+  bool InitLazyStream();
+  bool FindFunctionInStream(Function *F,
+         DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Bitcode/Reader/BitstreamReader.cpp b/contrib/llvm/lib/Bitcode/Reader/BitstreamReader.cpp
new file mode 100644
index 000000000000..9dafe2a03670
--- /dev/null
+++ b/contrib/llvm/lib/Bitcode/Reader/BitstreamReader.cpp
@@ -0,0 +1,371 @@
+//===- BitstreamReader.cpp - BitstreamReader implementation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/BitstreamReader.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  BitstreamCursor implementation
+//===----------------------------------------------------------------------===//
+
+void BitstreamCursor::operator=(const BitstreamCursor &RHS) {
+  freeState();
+
+  BitStream = RHS.BitStream;
+  NextChar = RHS.NextChar;
+  CurWord = RHS.CurWord;
+  BitsInCurWord = RHS.BitsInCurWord;
+  CurCodeSize = RHS.CurCodeSize;
+
+  // Copy abbreviations, and bump ref counts.
+  CurAbbrevs = RHS.CurAbbrevs;
+  for (size_t i = 0, e = CurAbbrevs.size(); i != e; ++i)
+    CurAbbrevs[i]->addRef();
+
+  // Copy block scope and bump ref counts.
+  BlockScope = RHS.BlockScope;
+  for (size_t S = 0, e = BlockScope.size(); S != e; ++S) {
+    std::vector<BitCodeAbbrev*> &Abbrevs = BlockScope[S].PrevAbbrevs;
+    for (size_t i = 0, e = Abbrevs.size(); i != e; ++i)
+      Abbrevs[i]->addRef();
+  }
+}
+
+void BitstreamCursor::freeState() {
+  // Free all the Abbrevs.
+  for (size_t i = 0, e = CurAbbrevs.size(); i != e; ++i)
+    CurAbbrevs[i]->dropRef();
+  CurAbbrevs.clear();
+
+  // Free all the Abbrevs in the block scope.
+  for (size_t S = 0, e = BlockScope.size(); S != e; ++S) {
+    std::vector<BitCodeAbbrev*> &Abbrevs = BlockScope[S].PrevAbbrevs;
+    for (size_t i = 0, e = Abbrevs.size(); i != e; ++i)
+      Abbrevs[i]->dropRef();
+  }
+  BlockScope.clear();
+}
+
+/// EnterSubBlock - Having read the ENTER_SUBBLOCK abbrevid, enter
+/// the block, and return true if the block has an error.
+bool BitstreamCursor::EnterSubBlock(unsigned BlockID, unsigned *NumWordsP) {
+  // Save the current block's state on BlockScope.
+  BlockScope.push_back(Block(CurCodeSize));
+  BlockScope.back().PrevAbbrevs.swap(CurAbbrevs);
+
+  // Add the abbrevs specific to this block to the CurAbbrevs list.
+  if (const BitstreamReader::BlockInfo *Info =
+      BitStream->getBlockInfo(BlockID)) {
+    for (size_t i = 0, e = Info->Abbrevs.size(); i != e; ++i) {
+      CurAbbrevs.push_back(Info->Abbrevs[i]);
+      CurAbbrevs.back()->addRef();
+    }
+  }
+
+  // Get the codesize of this block.
+  CurCodeSize = ReadVBR(bitc::CodeLenWidth);
+  SkipToFourByteBoundary();
+  unsigned NumWords = Read(bitc::BlockSizeWidth);
+  if (NumWordsP) *NumWordsP = NumWords;
+
+  // Validate that this block is sane.
+  if (CurCodeSize == 0 || AtEndOfStream())
+    return true;
+
+  return false;
+}
+
+void BitstreamCursor::readAbbreviatedLiteral(const BitCodeAbbrevOp &Op,
+                                             SmallVectorImpl<uint64_t> &Vals) {
+  assert(Op.isLiteral() && "Not a literal");
+  // If the abbrev specifies the literal value to use, use it.
+  Vals.push_back(Op.getLiteralValue());
+}
+
+void BitstreamCursor::readAbbreviatedField(const BitCodeAbbrevOp &Op,
+                                           SmallVectorImpl<uint64_t> &Vals) {
+  assert(!Op.isLiteral() && "Use ReadAbbreviatedLiteral for literals!");
+
+  // Decode the value as we are commanded.
+  switch (Op.getEncoding()) {
+  case BitCodeAbbrevOp::Array:
+  case BitCodeAbbrevOp::Blob:
+    assert(0 && "Should not reach here");
+  case BitCodeAbbrevOp::Fixed:
+    Vals.push_back(Read((unsigned)Op.getEncodingData()));
+    break;
+  case BitCodeAbbrevOp::VBR:
+    Vals.push_back(ReadVBR64((unsigned)Op.getEncodingData()));
+    break;
+  case BitCodeAbbrevOp::Char6:
+    Vals.push_back(BitCodeAbbrevOp::DecodeChar6(Read(6)));
+    break;
+  }
+}
+
+void BitstreamCursor::skipAbbreviatedField(const BitCodeAbbrevOp &Op) {
+  assert(!Op.isLiteral() && "Use ReadAbbreviatedLiteral for literals!");
+
+  // Decode the value as we are commanded.
+  switch (Op.getEncoding()) {
+  case BitCodeAbbrevOp::Array:
+  case BitCodeAbbrevOp::Blob:
+    assert(0 && "Should not reach here");
+  case BitCodeAbbrevOp::Fixed:
+    (void)Read((unsigned)Op.getEncodingData());
+    break;
+  case BitCodeAbbrevOp::VBR:
+    (void)ReadVBR64((unsigned)Op.getEncodingData());
+    break;
+  case BitCodeAbbrevOp::Char6:
+    (void)Read(6);
+    break;
+  }
+}
+
+
+
+/// skipRecord - Read the current record and discard it.
+void BitstreamCursor::skipRecord(unsigned AbbrevID) {
+  // Skip unabbreviated records by reading past their entries.
+  if (AbbrevID == bitc::UNABBREV_RECORD) {
+    unsigned Code = ReadVBR(6);
+    (void)Code;
+    unsigned NumElts = ReadVBR(6);
+    for (unsigned i = 0; i != NumElts; ++i)
+      (void)ReadVBR64(6);
+    return;
+  }
+
+  const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID);
+
+  for (unsigned i = 0, e = Abbv->getNumOperandInfos(); i != e; ++i) {
+    const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i);
+    if (Op.isLiteral())
+      continue;
+
+    if (Op.getEncoding() != BitCodeAbbrevOp::Array &&
+        Op.getEncoding() != BitCodeAbbrevOp::Blob) {
+      skipAbbreviatedField(Op);
+      continue;
+    }
+
+    if (Op.getEncoding() == BitCodeAbbrevOp::Array) {
+      // Array case.  Read the number of elements as a vbr6.
+      unsigned NumElts = ReadVBR(6);
+
+      // Get the element encoding.
+      assert(i+2 == e && "array op not second to last?");
+      const BitCodeAbbrevOp &EltEnc = Abbv->getOperandInfo(++i);
+
+      // Read all the elements.
+      for (; NumElts; --NumElts)
+        skipAbbreviatedField(EltEnc);
+      continue;
+    }
+
+    assert(Op.getEncoding() == BitCodeAbbrevOp::Blob);
+    // Blob case.  Read the number of bytes as a vbr6.
+    unsigned NumElts = ReadVBR(6);
+    SkipToFourByteBoundary();  // 32-bit alignment
+
+    // Figure out where the end of this blob will be including tail padding.
+    size_t NewEnd = GetCurrentBitNo()+((NumElts+3)&~3)*8;
+
+    // If this would read off the end of the bitcode file, just set the
+    // record to empty and return.
+    if (!canSkipToPos(NewEnd/8)) {
+      NextChar = BitStream->getBitcodeBytes().getExtent();
+      break;
+    }
+
+    // Skip over the blob.
+    JumpToBit(NewEnd);
+  }
+}
+
+unsigned BitstreamCursor::readRecord(unsigned AbbrevID,
+                                     SmallVectorImpl<uint64_t> &Vals,
+                                     StringRef *Blob) {
+  if (AbbrevID == bitc::UNABBREV_RECORD) {
+    unsigned Code = ReadVBR(6);
+    unsigned NumElts = ReadVBR(6);
+    for (unsigned i = 0; i != NumElts; ++i)
+      Vals.push_back(ReadVBR64(6));
+    return Code;
+  }
+
+  const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID);
+
+  for (unsigned i = 0, e = Abbv->getNumOperandInfos(); i != e; ++i) {
+    const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i);
+    if (Op.isLiteral()) {
+      readAbbreviatedLiteral(Op, Vals);
+      continue;
+    }
+
+    if (Op.getEncoding() != BitCodeAbbrevOp::Array &&
+        Op.getEncoding() != BitCodeAbbrevOp::Blob) {
+      readAbbreviatedField(Op, Vals);
+      continue;
+    }
+
+    if (Op.getEncoding() == BitCodeAbbrevOp::Array) {
+      // Array case.  Read the number of elements as a vbr6.
+      unsigned NumElts = ReadVBR(6);
+
+      // Get the element encoding.
+      assert(i+2 == e && "array op not second to last?");
+      const BitCodeAbbrevOp &EltEnc = Abbv->getOperandInfo(++i);
+
+      // Read all the elements.
+      for (; NumElts; --NumElts)
+        readAbbreviatedField(EltEnc, Vals);
+      continue;
+    }
+
+    assert(Op.getEncoding() == BitCodeAbbrevOp::Blob);
+    // Blob case.  Read the number of bytes as a vbr6.
+    unsigned NumElts = ReadVBR(6);
+    SkipToFourByteBoundary();  // 32-bit alignment
+
+    // Figure out where the end of this blob will be including tail padding.
+    size_t CurBitPos = GetCurrentBitNo();
+    size_t NewEnd = CurBitPos+((NumElts+3)&~3)*8;
+
+    // If this would read off the end of the bitcode file, just set the
+    // record to empty and return.
+    if (!canSkipToPos(NewEnd/8)) {
+      Vals.append(NumElts, 0);
+      NextChar = BitStream->getBitcodeBytes().getExtent();
+      break;
+    }
+
+    // Otherwise, inform the streamer that we need these bytes in memory.
+    const char *Ptr = (const char*)
+      BitStream->getBitcodeBytes().getPointer(CurBitPos/8, NumElts);
+
+    // If we can return a reference to the data, do so to avoid copying it.
+    if (Blob) {
+      *Blob = StringRef(Ptr, NumElts);
+    } else {
+      // Otherwise, unpack into Vals with zero extension.
+      for (; NumElts; --NumElts)
+        Vals.push_back((unsigned char)*Ptr++);
+    }
+    // Skip over tail padding.
+    JumpToBit(NewEnd);
+  }
+
+  unsigned Code = (unsigned)Vals[0];
+  Vals.erase(Vals.begin());
+  return Code;
+}
+
+
+void BitstreamCursor::ReadAbbrevRecord() {
+  BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+  unsigned NumOpInfo = ReadVBR(5);
+  for (unsigned i = 0; i != NumOpInfo; ++i) {
+    bool IsLiteral = Read(1) ? true : false;
+    if (IsLiteral) {
+      Abbv->Add(BitCodeAbbrevOp(ReadVBR64(8)));
+      continue;
+    }
+
+    BitCodeAbbrevOp::Encoding E = (BitCodeAbbrevOp::Encoding)Read(3);
+    if (BitCodeAbbrevOp::hasEncodingData(E)) {
+      unsigned Data = ReadVBR64(5);
+
+      // As a special case, handle fixed(0) (i.e., a fixed field with zero bits)
+      // and vbr(0) as a literal zero.  This is decoded the same way, and avoids
+      // a slow path in Read() to have to handle reading zero bits.
+      if ((E == BitCodeAbbrevOp::Fixed || E == BitCodeAbbrevOp::VBR) &&
+          Data == 0) {
+        Abbv->Add(BitCodeAbbrevOp(0));
+        continue;
+      }
+
+      Abbv->Add(BitCodeAbbrevOp(E, Data));
+    } else
+      Abbv->Add(BitCodeAbbrevOp(E));
+  }
+  CurAbbrevs.push_back(Abbv);
+}
+
+bool BitstreamCursor::ReadBlockInfoBlock() {
+  // If this is the second stream to get to the block info block, skip it.
+  if (BitStream->hasBlockInfoRecords())
+    return SkipBlock();
+
+  if (EnterSubBlock(bitc::BLOCKINFO_BLOCK_ID)) return true;
+
+  SmallVector<uint64_t, 64> Record;
+  BitstreamReader::BlockInfo *CurBlockInfo = 0;
+
+  // Read all the records for this module.
+  while (1) {
+    BitstreamEntry Entry = advanceSkippingSubblocks(AF_DontAutoprocessAbbrevs);
+
+    switch (Entry.Kind) {
+    case llvm::BitstreamEntry::SubBlock: // Handled for us already.
+    case llvm::BitstreamEntry::Error:
+      return true;
+    case llvm::BitstreamEntry::EndBlock:
+      return false;
+    case llvm::BitstreamEntry::Record:
+      // The interesting case.
+      break;
+    }
+
+    // Read abbrev records, associate them with CurBID.
+    if (Entry.ID == bitc::DEFINE_ABBREV) {
+      if (!CurBlockInfo) return true;
+      ReadAbbrevRecord();
+
+      // ReadAbbrevRecord installs the abbrev in CurAbbrevs.  Move it to the
+      // appropriate BlockInfo.
+      BitCodeAbbrev *Abbv = CurAbbrevs.back();
+      CurAbbrevs.pop_back();
+      CurBlockInfo->Abbrevs.push_back(Abbv);
+      continue;
+    }
+
+    // Read a record.
+    Record.clear();
+    switch (readRecord(Entry.ID, Record)) {
+      default: break;  // Default behavior, ignore unknown content.
+      case bitc::BLOCKINFO_CODE_SETBID:
+        if (Record.size() < 1) return true;
+        CurBlockInfo = &BitStream->getOrCreateBlockInfo((unsigned)Record[0]);
+        break;
+      case bitc::BLOCKINFO_CODE_BLOCKNAME: {
+        if (!CurBlockInfo) return true;
+        if (BitStream->isIgnoringBlockInfoNames()) break;  // Ignore name.
+        std::string Name;
+        for (unsigned i = 0, e = Record.size(); i != e; ++i)
+          Name += (char)Record[i];
+        CurBlockInfo->Name = Name;
+        break;
+      }
+      case bitc::BLOCKINFO_CODE_SETRECORDNAME: {
+        if (!CurBlockInfo) return true;
+        if (BitStream->isIgnoringBlockInfoNames()) break;  // Ignore name.
+        std::string Name;
+        for (unsigned i = 1, e = Record.size(); i != e; ++i)
+          Name += (char)Record[i];
+        CurBlockInfo->RecordNames.push_back(std::make_pair((unsigned)Record[0],
+                                                           Name));
+        break;
+      }
+    }
+  }
+}
+
diff --git a/contrib/llvm/lib/Bitcode/Writer/BitWriter.cpp b/contrib/llvm/lib/Bitcode/Writer/BitWriter.cpp
new file mode 100644
index 000000000000..9f51c35ad92e
--- /dev/null
+++ b/contrib/llvm/lib/Bitcode/Writer/BitWriter.cpp
@@ -0,0 +1,39 @@
+//===-- BitWriter.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/BitWriter.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+
+/*===-- Operations on modules ---------------------------------------------===*/
+
+int LLVMWriteBitcodeToFile(LLVMModuleRef M, const char *Path) {
+  std::string ErrorInfo;
+  raw_fd_ostream OS(Path, ErrorInfo, raw_fd_ostream::F_Binary);
+
+  if (!ErrorInfo.empty())
+    return -1;
+
+  WriteBitcodeToFile(unwrap(M), OS);
+  return 0;
+}
+
+int LLVMWriteBitcodeToFD(LLVMModuleRef M, int FD, int ShouldClose,
+                         int Unbuffered) {
+  raw_fd_ostream OS(FD, ShouldClose, Unbuffered);
+
+  WriteBitcodeToFile(unwrap(M), OS);
+  return 0;
+}
+
+int LLVMWriteBitcodeToFileHandle(LLVMModuleRef M, int FileHandle) {
+  return LLVMWriteBitcodeToFD(M, FileHandle, true, false);
+}
diff --git a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
new file mode 100644
index 000000000000..1b73f23e8f60
--- /dev/null
+++ b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -0,0 +1,2020 @@
+//===--- Bitcode/Writer/BitcodeWriter.cpp - Bitcode Writer ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Bitcode writer implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "ValueEnumerator.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Bitcode/BitstreamWriter.h"
+#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+#include <map>
+using namespace llvm;
+
+static cl::opt<bool>
+EnablePreserveUseListOrdering("enable-bc-uselist-preserve",
+                              cl::desc("Turn on experimental support for "
+                                       "use-list order preservation."),
+                              cl::init(false), cl::Hidden);
+
+/// These are manifest constants used by the bitcode writer. They do not need to
+/// be kept in sync with the reader, but need to be consistent within this file.
+enum {
+  // VALUE_SYMTAB_BLOCK abbrev id's.
+  VST_ENTRY_8_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+  VST_ENTRY_7_ABBREV,
+  VST_ENTRY_6_ABBREV,
+  VST_BBENTRY_6_ABBREV,
+
+  // CONSTANTS_BLOCK abbrev id's.
+  CONSTANTS_SETTYPE_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+  CONSTANTS_INTEGER_ABBREV,
+  CONSTANTS_CE_CAST_Abbrev,
+  CONSTANTS_NULL_Abbrev,
+
+  // FUNCTION_BLOCK abbrev id's.
+  FUNCTION_INST_LOAD_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
+  FUNCTION_INST_BINOP_ABBREV,
+  FUNCTION_INST_BINOP_FLAGS_ABBREV,
+  FUNCTION_INST_CAST_ABBREV,
+  FUNCTION_INST_RET_VOID_ABBREV,
+  FUNCTION_INST_RET_VAL_ABBREV,
+  FUNCTION_INST_UNREACHABLE_ABBREV,
+
+  // SwitchInst Magic
+  SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex
+};
+
+static unsigned GetEncodedCastOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unknown cast instruction!");
+  case Instruction::Trunc   : return bitc::CAST_TRUNC;
+  case Instruction::ZExt    : return bitc::CAST_ZEXT;
+  case Instruction::SExt    : return bitc::CAST_SEXT;
+  case Instruction::FPToUI  : return bitc::CAST_FPTOUI;
+  case Instruction::FPToSI  : return bitc::CAST_FPTOSI;
+  case Instruction::UIToFP  : return bitc::CAST_UITOFP;
+  case Instruction::SIToFP  : return bitc::CAST_SITOFP;
+  case Instruction::FPTrunc : return bitc::CAST_FPTRUNC;
+  case Instruction::FPExt   : return bitc::CAST_FPEXT;
+  case Instruction::PtrToInt: return bitc::CAST_PTRTOINT;
+  case Instruction::IntToPtr: return bitc::CAST_INTTOPTR;
+  case Instruction::BitCast : return bitc::CAST_BITCAST;
+  }
+}
+
+static unsigned GetEncodedBinaryOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unknown binary instruction!");
+  case Instruction::Add:
+  case Instruction::FAdd: return bitc::BINOP_ADD;
+  case Instruction::Sub:
+  case Instruction::FSub: return bitc::BINOP_SUB;
+  case Instruction::Mul:
+  case Instruction::FMul: return bitc::BINOP_MUL;
+  case Instruction::UDiv: return bitc::BINOP_UDIV;
+  case Instruction::FDiv:
+  case Instruction::SDiv: return bitc::BINOP_SDIV;
+  case Instruction::URem: return bitc::BINOP_UREM;
+  case Instruction::FRem:
+  case Instruction::SRem: return bitc::BINOP_SREM;
+  case Instruction::Shl:  return bitc::BINOP_SHL;
+  case Instruction::LShr: return bitc::BINOP_LSHR;
+  case Instruction::AShr: return bitc::BINOP_ASHR;
+  case Instruction::And:  return bitc::BINOP_AND;
+  case Instruction::Or:   return bitc::BINOP_OR;
+  case Instruction::Xor:  return bitc::BINOP_XOR;
+  }
+}
+
+static unsigned GetEncodedRMWOperation(AtomicRMWInst::BinOp Op) {
+  switch (Op) {
+  default: llvm_unreachable("Unknown RMW operation!");
+  case AtomicRMWInst::Xchg: return bitc::RMW_XCHG;
+  case AtomicRMWInst::Add: return bitc::RMW_ADD;
+  case AtomicRMWInst::Sub: return bitc::RMW_SUB;
+  case AtomicRMWInst::And: return bitc::RMW_AND;
+  case AtomicRMWInst::Nand: return bitc::RMW_NAND;
+  case AtomicRMWInst::Or: return bitc::RMW_OR;
+  case AtomicRMWInst::Xor: return bitc::RMW_XOR;
+  case AtomicRMWInst::Max: return bitc::RMW_MAX;
+  case AtomicRMWInst::Min: return bitc::RMW_MIN;
+  case AtomicRMWInst::UMax: return bitc::RMW_UMAX;
+  case AtomicRMWInst::UMin: return bitc::RMW_UMIN;
+  }
+}
+
+static unsigned GetEncodedOrdering(AtomicOrdering Ordering) {
+  switch (Ordering) {
+  case NotAtomic: return bitc::ORDERING_NOTATOMIC;
+  case Unordered: return bitc::ORDERING_UNORDERED;
+  case Monotonic: return bitc::ORDERING_MONOTONIC;
+  case Acquire: return bitc::ORDERING_ACQUIRE;
+  case Release: return bitc::ORDERING_RELEASE;
+  case AcquireRelease: return bitc::ORDERING_ACQREL;
+  case SequentiallyConsistent: return bitc::ORDERING_SEQCST;
+  }
+  llvm_unreachable("Invalid ordering");
+}
+
+static unsigned GetEncodedSynchScope(SynchronizationScope SynchScope) {
+  switch (SynchScope) {
+  case SingleThread: return bitc::SYNCHSCOPE_SINGLETHREAD;
+  case CrossThread: return bitc::SYNCHSCOPE_CROSSTHREAD;
+  }
+  llvm_unreachable("Invalid synch scope");
+}
+
+static void WriteStringRecord(unsigned Code, StringRef Str,
+                              unsigned AbbrevToUse, BitstreamWriter &Stream) {
+  SmallVector<unsigned, 64> Vals;
+
+  // Code: [strchar x N]
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    if (AbbrevToUse && !BitCodeAbbrevOp::isChar6(Str[i]))
+      AbbrevToUse = 0;
+    Vals.push_back(Str[i]);
+  }
+
+  // Emit the finished record.
+  Stream.EmitRecord(Code, Vals, AbbrevToUse);
+}
+
+static void WriteAttributeGroupTable(const ValueEnumerator &VE,
+                                     BitstreamWriter &Stream) {
+  const std::vector<AttributeSet> &AttrGrps = VE.getAttributeGroups();
+  if (AttrGrps.empty()) return;
+
+  Stream.EnterSubblock(bitc::PARAMATTR_GROUP_BLOCK_ID, 3);
+
+  SmallVector<uint64_t, 64> Record;
+  for (unsigned i = 0, e = AttrGrps.size(); i != e; ++i) {
+    AttributeSet AS = AttrGrps[i];
+    for (unsigned i = 0, e = AS.getNumSlots(); i != e; ++i) {
+      AttributeSet A = AS.getSlotAttributes(i);
+
+      Record.push_back(VE.getAttributeGroupID(A));
+      Record.push_back(AS.getSlotIndex(i));
+
+      for (AttributeSet::iterator I = AS.begin(0), E = AS.end(0);
+           I != E; ++I) {
+        Attribute Attr = *I;
+        if (Attr.isEnumAttribute()) {
+          Record.push_back(0);
+          Record.push_back(Attr.getKindAsEnum());
+        } else if (Attr.isAlignAttribute()) {
+          Record.push_back(1);
+          Record.push_back(Attr.getKindAsEnum());
+          Record.push_back(Attr.getValueAsInt());
+        } else {
+          StringRef Kind = Attr.getKindAsString();
+          StringRef Val = Attr.getValueAsString();
+
+          Record.push_back(Val.empty() ? 3 : 4);
+          Record.append(Kind.begin(), Kind.end());
+          Record.push_back(0);
+          if (!Val.empty()) {
+            Record.append(Val.begin(), Val.end());
+            Record.push_back(0);
+          }
+        }
+      }
+
+      Stream.EmitRecord(bitc::PARAMATTR_GRP_CODE_ENTRY, Record);
+      Record.clear();
+    }
+  }
+
+  Stream.ExitBlock();
+}
+
+static void WriteAttributeTable(const ValueEnumerator &VE,
+                                BitstreamWriter &Stream) {
+  const std::vector<AttributeSet> &Attrs = VE.getAttributes();
+  if (Attrs.empty()) return;
+
+  Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3);
+
+  SmallVector<uint64_t, 64> Record;
+  for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
+    const AttributeSet &A = Attrs[i];
+    for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i)
+      Record.push_back(VE.getAttributeGroupID(A.getSlotAttributes(i)));
+
+    Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
+/// WriteTypeTable - Write out the type table for a module.
+static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
+  const ValueEnumerator::TypeList &TypeList = VE.getTypes();
+
+  Stream.EnterSubblock(bitc::TYPE_BLOCK_ID_NEW, 4 /*count from # abbrevs */);
+  SmallVector<uint64_t, 64> TypeVals;
+
+  uint64_t NumBits = Log2_32_Ceil(VE.getTypes().size()+1);
+
+  // Abbrev for TYPE_CODE_POINTER.
+  BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+  Abbv->Add(BitCodeAbbrevOp(0));  // Addrspace = 0
+  unsigned PtrAbbrev = Stream.EmitAbbrev(Abbv);
+
+  // Abbrev for TYPE_CODE_FUNCTION.
+  Abbv = new BitCodeAbbrev();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));  // isvararg
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+
+  unsigned FunctionAbbrev = Stream.EmitAbbrev(Abbv);
+
+  // Abbrev for TYPE_CODE_STRUCT_ANON.
+  Abbv = new BitCodeAbbrev();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_ANON));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));  // ispacked
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+
+  unsigned StructAnonAbbrev = Stream.EmitAbbrev(Abbv);
+
+  // Abbrev for TYPE_CODE_STRUCT_NAME.
+  Abbv = new BitCodeAbbrev();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAME));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+  unsigned StructNameAbbrev = Stream.EmitAbbrev(Abbv);
+
+  // Abbrev for TYPE_CODE_STRUCT_NAMED.
+  Abbv = new BitCodeAbbrev();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAMED));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));  // ispacked
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+
+  unsigned StructNamedAbbrev = Stream.EmitAbbrev(Abbv);
+
+  // Abbrev for TYPE_CODE_ARRAY.
+  Abbv = new BitCodeAbbrev();
+  Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // size
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
+
+  unsigned ArrayAbbrev = Stream.EmitAbbrev(Abbv);
+
+  // Emit an entry count so the reader can reserve space.
+  TypeVals.push_back(TypeList.size());
+  Stream.EmitRecord(bitc::TYPE_CODE_NUMENTRY, TypeVals);
+  TypeVals.clear();
+
+  // Loop over all of the types, emitting each in turn.
+  for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
+    Type *T = TypeList[i];
+    int AbbrevToUse = 0;
+    unsigned Code = 0;
+
+    switch (T->getTypeID()) {
+    default: llvm_unreachable("Unknown type!");
+    case Type::VoidTyID:      Code = bitc::TYPE_CODE_VOID;      break;
+    case Type::HalfTyID:      Code = bitc::TYPE_CODE_HALF;      break;
+    case Type::FloatTyID:     Code = bitc::TYPE_CODE_FLOAT;     break;
+    case Type::DoubleTyID:    Code = bitc::TYPE_CODE_DOUBLE;    break;
+    case Type::X86_FP80TyID:  Code = bitc::TYPE_CODE_X86_FP80;  break;
+    case Type::FP128TyID:     Code = bitc::TYPE_CODE_FP128;     break;
+    case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break;
+    case Type::LabelTyID:     Code = bitc::TYPE_CODE_LABEL;     break;
+    case Type::MetadataTyID:  Code = bitc::TYPE_CODE_METADATA;  break;
+    case Type::X86_MMXTyID:   Code = bitc::TYPE_CODE_X86_MMX;   break;
+    case Type::IntegerTyID:
+      // INTEGER: [width]
+      Code = bitc::TYPE_CODE_INTEGER;
+      TypeVals.push_back(cast<IntegerType>(T)->getBitWidth());
+      break;
+    case Type::PointerTyID: {
+      PointerType *PTy = cast<PointerType>(T);
+      // POINTER: [pointee type, address space]
+      Code = bitc::TYPE_CODE_POINTER;
+      TypeVals.push_back(VE.getTypeID(PTy->getElementType()));
+      unsigned AddressSpace = PTy->getAddressSpace();
+      TypeVals.push_back(AddressSpace);
+      if (AddressSpace == 0) AbbrevToUse = PtrAbbrev;
+      break;
+    }
+    case Type::FunctionTyID: {
+      FunctionType *FT = cast<FunctionType>(T);
+      // FUNCTION: [isvararg, retty, paramty x N]
+      Code = bitc::TYPE_CODE_FUNCTION;
+      TypeVals.push_back(FT->isVarArg());
+      TypeVals.push_back(VE.getTypeID(FT->getReturnType()));
+      for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i)
+        TypeVals.push_back(VE.getTypeID(FT->getParamType(i)));
+      AbbrevToUse = FunctionAbbrev;
+      break;
+    }
+    case Type::StructTyID: {
+      StructType *ST = cast<StructType>(T);
+      // STRUCT: [ispacked, eltty x N]
+      TypeVals.push_back(ST->isPacked());
+      // Output all of the element types.
+      for (StructType::element_iterator I = ST->element_begin(),
+           E = ST->element_end(); I != E; ++I)
+        TypeVals.push_back(VE.getTypeID(*I));
+
+      if (ST->isLiteral()) {
+        Code = bitc::TYPE_CODE_STRUCT_ANON;
+        AbbrevToUse = StructAnonAbbrev;
+      } else {
+        if (ST->isOpaque()) {
+          Code = bitc::TYPE_CODE_OPAQUE;
+        } else {
+          Code = bitc::TYPE_CODE_STRUCT_NAMED;
+          AbbrevToUse = StructNamedAbbrev;
+        }
+
+        // Emit the name if it is present.
+        if (!ST->getName().empty())
+          WriteStringRecord(bitc::TYPE_CODE_STRUCT_NAME, ST->getName(),
+                            StructNameAbbrev, Stream);
+      }
+      break;
+    }
+    case Type::ArrayTyID: {
+      ArrayType *AT = cast<ArrayType>(T);
+      // ARRAY: [numelts, eltty]
+      Code = bitc::TYPE_CODE_ARRAY;
+      TypeVals.push_back(AT->getNumElements());
+      TypeVals.push_back(VE.getTypeID(AT->getElementType()));
+      AbbrevToUse = ArrayAbbrev;
+      break;
+    }
+    case Type::VectorTyID: {
+      VectorType *VT = cast<VectorType>(T);
+      // VECTOR [numelts, eltty]
+      Code = bitc::TYPE_CODE_VECTOR;
+      TypeVals.push_back(VT->getNumElements());
+      TypeVals.push_back(VE.getTypeID(VT->getElementType()));
+      break;
+    }
+    }
+
+    // Emit the finished record.
+    Stream.EmitRecord(Code, TypeVals, AbbrevToUse);
+    TypeVals.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
+static unsigned getEncodedLinkage(const GlobalValue *GV) {
+  switch (GV->getLinkage()) {
+  case GlobalValue::ExternalLinkage:                 return 0;
+  case GlobalValue::WeakAnyLinkage:                  return 1;
+  case GlobalValue::AppendingLinkage:                return 2;
+  case GlobalValue::InternalLinkage:                 return 3;
+  case GlobalValue::LinkOnceAnyLinkage:              return 4;
+  case GlobalValue::DLLImportLinkage:                return 5;
+  case GlobalValue::DLLExportLinkage:                return 6;
+  case GlobalValue::ExternalWeakLinkage:             return 7;
+  case GlobalValue::CommonLinkage:                   return 8;
+  case GlobalValue::PrivateLinkage:                  return 9;
+  case GlobalValue::WeakODRLinkage:                  return 10;
+  case GlobalValue::LinkOnceODRLinkage:              return 11;
+  case GlobalValue::AvailableExternallyLinkage:      return 12;
+  case GlobalValue::LinkerPrivateLinkage:            return 13;
+  case GlobalValue::LinkerPrivateWeakLinkage:        return 14;
+  case GlobalValue::LinkOnceODRAutoHideLinkage:      return 15;
+  }
+  llvm_unreachable("Invalid linkage");
+}
+
+static unsigned getEncodedVisibility(const GlobalValue *GV) {
+  switch (GV->getVisibility()) {
+  case GlobalValue::DefaultVisibility:   return 0;
+  case GlobalValue::HiddenVisibility:    return 1;
+  case GlobalValue::ProtectedVisibility: return 2;
+  }
+  llvm_unreachable("Invalid visibility");
+}
+
+static unsigned getEncodedThreadLocalMode(const GlobalVariable *GV) {
+  switch (GV->getThreadLocalMode()) {
+    case GlobalVariable::NotThreadLocal:         return 0;
+    case GlobalVariable::GeneralDynamicTLSModel: return 1;
+    case GlobalVariable::LocalDynamicTLSModel:   return 2;
+    case GlobalVariable::InitialExecTLSModel:    return 3;
+    case GlobalVariable::LocalExecTLSModel:      return 4;
+  }
+  llvm_unreachable("Invalid TLS model");
+}
+
+// Emit top-level description of module, including target triple, inline asm,
+// descriptors for global variables, and function prototype info.
+static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
+                            BitstreamWriter &Stream) {
+  // Emit various pieces of data attached to a module.
+  if (!M->getTargetTriple().empty())
+    WriteStringRecord(bitc::MODULE_CODE_TRIPLE, M->getTargetTriple(),
+                      0/*TODO*/, Stream);
+  if (!M->getDataLayout().empty())
+    WriteStringRecord(bitc::MODULE_CODE_DATALAYOUT, M->getDataLayout(),
+                      0/*TODO*/, Stream);
+  if (!M->getModuleInlineAsm().empty())
+    WriteStringRecord(bitc::MODULE_CODE_ASM, M->getModuleInlineAsm(),
+                      0/*TODO*/, Stream);
+
+  // Emit information about sections and GC, computing how many there are. Also
+  // compute the maximum alignment value.
+  std::map<std::string, unsigned> SectionMap;
+  std::map<std::string, unsigned> GCMap;
+  unsigned MaxAlignment = 0;
+  unsigned MaxGlobalType = 0;
+  for (Module::const_global_iterator GV = M->global_begin(),E = M->global_end();
+       GV != E; ++GV) {
+    MaxAlignment = std::max(MaxAlignment, GV->getAlignment());
+    MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV->getType()));
+    if (GV->hasSection()) {
+      // Give section names unique ID's.
+      unsigned &Entry = SectionMap[GV->getSection()];
+      if (!Entry) {
+        WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV->getSection(),
+                          0/*TODO*/, Stream);
+        Entry = SectionMap.size();
+      }
+    }
+  }
+  for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
+    MaxAlignment = std::max(MaxAlignment, F->getAlignment());
+    if (F->hasSection()) {
+      // Give section names unique ID's.
+      unsigned &Entry = SectionMap[F->getSection()];
+      if (!Entry) {
+        WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, F->getSection(),
+                          0/*TODO*/, Stream);
+        Entry = SectionMap.size();
+      }
+    }
+    if (F->hasGC()) {
+      // Same for GC names.
+      unsigned &Entry = GCMap[F->getGC()];
+      if (!Entry) {
+        WriteStringRecord(bitc::MODULE_CODE_GCNAME, F->getGC(),
+                          0/*TODO*/, Stream);
+        Entry = GCMap.size();
+      }
+    }
+  }
+
+  // Emit abbrev for globals, now that we know # sections and max alignment.
+  unsigned SimpleGVarAbbrev = 0;
+  if (!M->global_empty()) {
+    // Add an abbrev for common globals with no visibility or thread localness.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_GLOBALVAR));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+                              Log2_32_Ceil(MaxGlobalType+1)));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));      // Constant.
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));        // Initializer.
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4));      // Linkage.
+    if (MaxAlignment == 0)                                      // Alignment.
+      Abbv->Add(BitCodeAbbrevOp(0));
+    else {
+      unsigned MaxEncAlignment = Log2_32(MaxAlignment)+1;
+      Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+                               Log2_32_Ceil(MaxEncAlignment+1)));
+    }
+    if (SectionMap.empty())                                    // Section.
+      Abbv->Add(BitCodeAbbrevOp(0));
+    else
+      Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+                               Log2_32_Ceil(SectionMap.size()+1)));
+    // Don't bother emitting vis + thread local.
+    SimpleGVarAbbrev = Stream.EmitAbbrev(Abbv);
+  }
+
+  // Emit the global variable information.
+  SmallVector<unsigned, 64> Vals;
+  for (Module::const_global_iterator GV = M->global_begin(),E = M->global_end();
+       GV != E; ++GV) {
+    unsigned AbbrevToUse = 0;
+
+    // GLOBALVAR: [type, isconst, initid,
+    //             linkage, alignment, section, visibility, threadlocal,
+    //             unnamed_addr]
+    Vals.push_back(VE.getTypeID(GV->getType()));
+    Vals.push_back(GV->isConstant());
+    Vals.push_back(GV->isDeclaration() ? 0 :
+                   (VE.getValueID(GV->getInitializer()) + 1));
+    Vals.push_back(getEncodedLinkage(GV));
+    Vals.push_back(Log2_32(GV->getAlignment())+1);
+    Vals.push_back(GV->hasSection() ? SectionMap[GV->getSection()] : 0);
+    if (GV->isThreadLocal() ||
+        GV->getVisibility() != GlobalValue::DefaultVisibility ||
+        GV->hasUnnamedAddr() || GV->isExternallyInitialized()) {
+      Vals.push_back(getEncodedVisibility(GV));
+      Vals.push_back(getEncodedThreadLocalMode(GV));
+      Vals.push_back(GV->hasUnnamedAddr());
+      Vals.push_back(GV->isExternallyInitialized());
+    } else {
+      AbbrevToUse = SimpleGVarAbbrev;
+    }
+
+    Stream.EmitRecord(bitc::MODULE_CODE_GLOBALVAR, Vals, AbbrevToUse);
+    Vals.clear();
+  }
+
+  // Emit the function proto information.
+  for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
+    // FUNCTION:  [type, callingconv, isproto, linkage, paramattrs, alignment,
+    //             section, visibility, gc, unnamed_addr]
+    Vals.push_back(VE.getTypeID(F->getType()));
+    Vals.push_back(F->getCallingConv());
+    Vals.push_back(F->isDeclaration());
+    Vals.push_back(getEncodedLinkage(F));
+    Vals.push_back(VE.getAttributeID(F->getAttributes()));
+    Vals.push_back(Log2_32(F->getAlignment())+1);
+    Vals.push_back(F->hasSection() ? SectionMap[F->getSection()] : 0);
+    Vals.push_back(getEncodedVisibility(F));
+    Vals.push_back(F->hasGC() ? GCMap[F->getGC()] : 0);
+    Vals.push_back(F->hasUnnamedAddr());
+
+    unsigned AbbrevToUse = 0;
+    Stream.EmitRecord(bitc::MODULE_CODE_FUNCTION, Vals, AbbrevToUse);
+    Vals.clear();
+  }
+
+  // Emit the alias information.
+  for (Module::const_alias_iterator AI = M->alias_begin(), E = M->alias_end();
+       AI != E; ++AI) {
+    // ALIAS: [alias type, aliasee val#, linkage, visibility]
+    Vals.push_back(VE.getTypeID(AI->getType()));
+    Vals.push_back(VE.getValueID(AI->getAliasee()));
+    Vals.push_back(getEncodedLinkage(AI));
+    Vals.push_back(getEncodedVisibility(AI));
+    unsigned AbbrevToUse = 0;
+    Stream.EmitRecord(bitc::MODULE_CODE_ALIAS, Vals, AbbrevToUse);
+    Vals.clear();
+  }
+}
+
+static uint64_t GetOptimizationFlags(const Value *V) {
+  uint64_t Flags = 0;
+
+  if (const OverflowingBinaryOperator *OBO =
+        dyn_cast<OverflowingBinaryOperator>(V)) {
+    if (OBO->hasNoSignedWrap())
+      Flags |= 1 << bitc::OBO_NO_SIGNED_WRAP;
+    if (OBO->hasNoUnsignedWrap())
+      Flags |= 1 << bitc::OBO_NO_UNSIGNED_WRAP;
+  } else if (const PossiblyExactOperator *PEO =
+               dyn_cast<PossiblyExactOperator>(V)) {
+    if (PEO->isExact())
+      Flags |= 1 << bitc::PEO_EXACT;
+  } else if (const FPMathOperator *FPMO =
+             dyn_cast<const FPMathOperator>(V)) {
+    if (FPMO->hasUnsafeAlgebra())
+      Flags |= FastMathFlags::UnsafeAlgebra;
+    if (FPMO->hasNoNaNs())
+      Flags |= FastMathFlags::NoNaNs;
+    if (FPMO->hasNoInfs())
+      Flags |= FastMathFlags::NoInfs;
+    if (FPMO->hasNoSignedZeros())
+      Flags |= FastMathFlags::NoSignedZeros;
+    if (FPMO->hasAllowReciprocal())
+      Flags |= FastMathFlags::AllowReciprocal;
+  }
+
+  return Flags;
+}
+
+static void WriteMDNode(const MDNode *N,
+                        const ValueEnumerator &VE,
+                        BitstreamWriter &Stream,
+                        SmallVector<uint64_t, 64> &Record) {
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (N->getOperand(i)) {
+      Record.push_back(VE.getTypeID(N->getOperand(i)->getType()));
+      Record.push_back(VE.getValueID(N->getOperand(i)));
+    } else {
+      Record.push_back(VE.getTypeID(Type::getVoidTy(N->getContext())));
+      Record.push_back(0);
+    }
+  }
+  unsigned MDCode = N->isFunctionLocal() ? bitc::METADATA_FN_NODE :
+                                           bitc::METADATA_NODE;
+  Stream.EmitRecord(MDCode, Record, 0);
+  Record.clear();
+}
+
+static void WriteModuleMetadata(const Module *M,
+                                const ValueEnumerator &VE,
+                                BitstreamWriter &Stream) {
+  const ValueEnumerator::ValueList &Vals = VE.getMDValues();
+  bool StartedMetadataBlock = false;
+  unsigned MDSAbbrev = 0;
+  SmallVector<uint64_t, 64> Record;
+  for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+
+    if (const MDNode *N = dyn_cast<MDNode>(Vals[i].first)) {
+      if (!N->isFunctionLocal() || !N->getFunction()) {
+        if (!StartedMetadataBlock) {
+          Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+          StartedMetadataBlock = true;
+        }
+        WriteMDNode(N, VE, Stream, Record);
+      }
+    } else if (const MDString *MDS = dyn_cast<MDString>(Vals[i].first)) {
+      if (!StartedMetadataBlock)  {
+        Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+
+        // Abbrev for METADATA_STRING.
+        BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+        Abbv->Add(BitCodeAbbrevOp(bitc::METADATA_STRING));
+        Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+        Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+        MDSAbbrev = Stream.EmitAbbrev(Abbv);
+        StartedMetadataBlock = true;
+      }
+
+      // Code: [strchar x N]
+      Record.append(MDS->begin(), MDS->end());
+
+      // Emit the finished record.
+      Stream.EmitRecord(bitc::METADATA_STRING, Record, MDSAbbrev);
+      Record.clear();
+    }
+  }
+
+  // Write named metadata.
+  for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
+       E = M->named_metadata_end(); I != E; ++I) {
+    const NamedMDNode *NMD = I;
+    if (!StartedMetadataBlock)  {
+      Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+      StartedMetadataBlock = true;
+    }
+
+    // Write name.
+    StringRef Str = NMD->getName();
+    for (unsigned i = 0, e = Str.size(); i != e; ++i)
+      Record.push_back(Str[i]);
+    Stream.EmitRecord(bitc::METADATA_NAME, Record, 0/*TODO*/);
+    Record.clear();
+
+    // Write named metadata operands.
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
+      Record.push_back(VE.getValueID(NMD->getOperand(i)));
+    Stream.EmitRecord(bitc::METADATA_NAMED_NODE, Record, 0);
+    Record.clear();
+  }
+
+  if (StartedMetadataBlock)
+    Stream.ExitBlock();
+}
+
+static void WriteFunctionLocalMetadata(const Function &F,
+                                       const ValueEnumerator &VE,
+                                       BitstreamWriter &Stream) {
+  bool StartedMetadataBlock = false;
+  SmallVector<uint64_t, 64> Record;
+  const SmallVector<const MDNode *, 8> &Vals = VE.getFunctionLocalMDValues();
+  for (unsigned i = 0, e = Vals.size(); i != e; ++i)
+    if (const MDNode *N = Vals[i])
+      if (N->isFunctionLocal() && N->getFunction() == &F) {
+        if (!StartedMetadataBlock) {
+          Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+          StartedMetadataBlock = true;
+        }
+        WriteMDNode(N, VE, Stream, Record);
+      }
+
+  if (StartedMetadataBlock)
+    Stream.ExitBlock();
+}
+
+static void WriteMetadataAttachment(const Function &F,
+                                    const ValueEnumerator &VE,
+                                    BitstreamWriter &Stream) {
+  Stream.EnterSubblock(bitc::METADATA_ATTACHMENT_ID, 3);
+
+  SmallVector<uint64_t, 64> Record;
+
+  // Write metadata attachments
+  // METADATA_ATTACHMENT - [m x [value, [n x [id, mdnode]]]
+  SmallVector<std::pair<unsigned, MDNode*>, 4> MDs;
+
+  for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      MDs.clear();
+      I->getAllMetadataOtherThanDebugLoc(MDs);
+
+      // If no metadata, ignore instruction.
+      if (MDs.empty()) continue;
+
+      Record.push_back(VE.getInstructionID(I));
+
+      for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
+        Record.push_back(MDs[i].first);
+        Record.push_back(VE.getValueID(MDs[i].second));
+      }
+      Stream.EmitRecord(bitc::METADATA_ATTACHMENT, Record, 0);
+      Record.clear();
+    }
+
+  Stream.ExitBlock();
+}
+
+static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) {
+  SmallVector<uint64_t, 64> Record;
+
+  // Write metadata kinds
+  // METADATA_KIND - [n x [id, name]]
+  SmallVector<StringRef, 8> Names;
+  M->getMDKindNames(Names);
+
+  if (Names.empty()) return;
+
+  Stream.EnterSubblock(bitc::METADATA_BLOCK_ID, 3);
+
+  for (unsigned MDKindID = 0, e = Names.size(); MDKindID != e; ++MDKindID) {
+    Record.push_back(MDKindID);
+    StringRef KName = Names[MDKindID];
+    Record.append(KName.begin(), KName.end());
+
+    Stream.EmitRecord(bitc::METADATA_KIND, Record, 0);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
+static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
+  if ((int64_t)V >= 0)
+    Vals.push_back(V << 1);
+  else
+    Vals.push_back((-V << 1) | 1);
+}
+
+static void EmitAPInt(SmallVectorImpl<uint64_t> &Vals,
+                      unsigned &Code, unsigned &AbbrevToUse, const APInt &Val,
+                      bool EmitSizeForWideNumbers = false
+                      ) {
+  if (Val.getBitWidth() <= 64) {
+    uint64_t V = Val.getSExtValue();
+    emitSignedInt64(Vals, V);
+    Code = bitc::CST_CODE_INTEGER;
+    AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
+  } else {
+    // Wide integers, > 64 bits in size.
+    // We have an arbitrary precision integer value to write whose
+    // bit width is > 64. However, in canonical unsigned integer
+    // format it is likely that the high bits are going to be zero.
+    // So, we only write the number of active words.
+    unsigned NWords = Val.getActiveWords();
+
+    if (EmitSizeForWideNumbers)
+      Vals.push_back(NWords);
+
+    const uint64_t *RawWords = Val.getRawData();
+    for (unsigned i = 0; i != NWords; ++i) {
+      emitSignedInt64(Vals, RawWords[i]);
+    }
+    Code = bitc::CST_CODE_WIDE_INTEGER;
+  }
+}
+
+static void WriteConstants(unsigned FirstVal, unsigned LastVal,
+                           const ValueEnumerator &VE,
+                           BitstreamWriter &Stream, bool isGlobal) {
+  if (FirstVal == LastVal) return;
+
+  Stream.EnterSubblock(bitc::CONSTANTS_BLOCK_ID, 4);
+
+  unsigned AggregateAbbrev = 0;
+  unsigned String8Abbrev = 0;
+  unsigned CString7Abbrev = 0;
+  unsigned CString6Abbrev = 0;
+  // If this is a constant pool for the module, emit module-specific abbrevs.
+  if (isGlobal) {
+    // Abbrev for CST_CODE_AGGREGATE.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_AGGREGATE));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal+1)));
+    AggregateAbbrev = Stream.EmitAbbrev(Abbv);
+
+    // Abbrev for CST_CODE_STRING.
+    Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_STRING));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+    String8Abbrev = Stream.EmitAbbrev(Abbv);
+    // Abbrev for CST_CODE_CSTRING.
+    Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
+    CString7Abbrev = Stream.EmitAbbrev(Abbv);
+    // Abbrev for CST_CODE_CSTRING.
+    Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CSTRING));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+    CString6Abbrev = Stream.EmitAbbrev(Abbv);
+  }
+
+  SmallVector<uint64_t, 64> Record;
+
+  const ValueEnumerator::ValueList &Vals = VE.getValues();
+  Type *LastTy = 0;
+  for (unsigned i = FirstVal; i != LastVal; ++i) {
+    const Value *V = Vals[i].first;
+    // If we need to switch types, do so now.
+    if (V->getType() != LastTy) {
+      LastTy = V->getType();
+      Record.push_back(VE.getTypeID(LastTy));
+      Stream.EmitRecord(bitc::CST_CODE_SETTYPE, Record,
+                        CONSTANTS_SETTYPE_ABBREV);
+      Record.clear();
+    }
+
+    if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+      Record.push_back(unsigned(IA->hasSideEffects()) |
+                       unsigned(IA->isAlignStack()) << 1 |
+                       unsigned(IA->getDialect()&1) << 2);
+
+      // Add the asm string.
+      const std::string &AsmStr = IA->getAsmString();
+      Record.push_back(AsmStr.size());
+      for (unsigned i = 0, e = AsmStr.size(); i != e; ++i)
+        Record.push_back(AsmStr[i]);
+
+      // Add the constraint string.
+      const std::string &ConstraintStr = IA->getConstraintString();
+      Record.push_back(ConstraintStr.size());
+      for (unsigned i = 0, e = ConstraintStr.size(); i != e; ++i)
+        Record.push_back(ConstraintStr[i]);
+      Stream.EmitRecord(bitc::CST_CODE_INLINEASM, Record);
+      Record.clear();
+      continue;
+    }
+    const Constant *C = cast<Constant>(V);
+    unsigned Code = -1U;
+    unsigned AbbrevToUse = 0;
+    if (C->isNullValue()) {
+      Code = bitc::CST_CODE_NULL;
+    } else if (isa<UndefValue>(C)) {
+      Code = bitc::CST_CODE_UNDEF;
+    } else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) {
+      EmitAPInt(Record, Code, AbbrevToUse, IV->getValue());
+    } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+      Code = bitc::CST_CODE_FLOAT;
+      Type *Ty = CFP->getType();
+      if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) {
+        Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+      } else if (Ty->isX86_FP80Ty()) {
+        // api needed to prevent premature destruction
+        // bits are not in the same order as a normal i80 APInt, compensate.
+        APInt api = CFP->getValueAPF().bitcastToAPInt();
+        const uint64_t *p = api.getRawData();
+        Record.push_back((p[1] << 48) | (p[0] >> 16));
+        Record.push_back(p[0] & 0xffffLL);
+      } else if (Ty->isFP128Ty() || Ty->isPPC_FP128Ty()) {
+        APInt api = CFP->getValueAPF().bitcastToAPInt();
+        const uint64_t *p = api.getRawData();
+        Record.push_back(p[0]);
+        Record.push_back(p[1]);
+      } else {
+        assert (0 && "Unknown FP type!");
+      }
+    } else if (isa<ConstantDataSequential>(C) &&
+               cast<ConstantDataSequential>(C)->isString()) {
+      const ConstantDataSequential *Str = cast<ConstantDataSequential>(C);
+      // Emit constant strings specially.
+      unsigned NumElts = Str->getNumElements();
+      // If this is a null-terminated string, use the denser CSTRING encoding.
+      if (Str->isCString()) {
+        Code = bitc::CST_CODE_CSTRING;
+        --NumElts;  // Don't encode the null, which isn't allowed by char6.
+      } else {
+        Code = bitc::CST_CODE_STRING;
+        AbbrevToUse = String8Abbrev;
+      }
+      bool isCStr7 = Code == bitc::CST_CODE_CSTRING;
+      bool isCStrChar6 = Code == bitc::CST_CODE_CSTRING;
+      for (unsigned i = 0; i != NumElts; ++i) {
+        unsigned char V = Str->getElementAsInteger(i);
+        Record.push_back(V);
+        isCStr7 &= (V & 128) == 0;
+        if (isCStrChar6)
+          isCStrChar6 = BitCodeAbbrevOp::isChar6(V);
+      }
+
+      if (isCStrChar6)
+        AbbrevToUse = CString6Abbrev;
+      else if (isCStr7)
+        AbbrevToUse = CString7Abbrev;
+    } else if (const ConstantDataSequential *CDS =
+                  dyn_cast<ConstantDataSequential>(C)) {
+      Code = bitc::CST_CODE_DATA;
+      Type *EltTy = CDS->getType()->getElementType();
+      if (isa<IntegerType>(EltTy)) {
+        for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i)
+          Record.push_back(CDS->getElementAsInteger(i));
+      } else if (EltTy->isFloatTy()) {
+        for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+          union { float F; uint32_t I; };
+          F = CDS->getElementAsFloat(i);
+          Record.push_back(I);
+        }
+      } else {
+        assert(EltTy->isDoubleTy() && "Unknown ConstantData element type");
+        for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+          union { double F; uint64_t I; };
+          F = CDS->getElementAsDouble(i);
+          Record.push_back(I);
+        }
+      }
+    } else if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
+               isa<ConstantVector>(C)) {
+      Code = bitc::CST_CODE_AGGREGATE;
+      for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
+        Record.push_back(VE.getValueID(C->getOperand(i)));
+      AbbrevToUse = AggregateAbbrev;
+    } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+      switch (CE->getOpcode()) {
+      default:
+        if (Instruction::isCast(CE->getOpcode())) {
+          Code = bitc::CST_CODE_CE_CAST;
+          Record.push_back(GetEncodedCastOpcode(CE->getOpcode()));
+          Record.push_back(VE.getTypeID(C->getOperand(0)->getType()));
+          Record.push_back(VE.getValueID(C->getOperand(0)));
+          AbbrevToUse = CONSTANTS_CE_CAST_Abbrev;
+        } else {
+          assert(CE->getNumOperands() == 2 && "Unknown constant expr!");
+          Code = bitc::CST_CODE_CE_BINOP;
+          Record.push_back(GetEncodedBinaryOpcode(CE->getOpcode()));
+          Record.push_back(VE.getValueID(C->getOperand(0)));
+          Record.push_back(VE.getValueID(C->getOperand(1)));
+          uint64_t Flags = GetOptimizationFlags(CE);
+          if (Flags != 0)
+            Record.push_back(Flags);
+        }
+        break;
+      case Instruction::GetElementPtr:
+        Code = bitc::CST_CODE_CE_GEP;
+        if (cast<GEPOperator>(C)->isInBounds())
+          Code = bitc::CST_CODE_CE_INBOUNDS_GEP;
+        for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i) {
+          Record.push_back(VE.getTypeID(C->getOperand(i)->getType()));
+          Record.push_back(VE.getValueID(C->getOperand(i)));
+        }
+        break;
+      case Instruction::Select:
+        Code = bitc::CST_CODE_CE_SELECT;
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        Record.push_back(VE.getValueID(C->getOperand(2)));
+        break;
+      case Instruction::ExtractElement:
+        Code = bitc::CST_CODE_CE_EXTRACTELT;
+        Record.push_back(VE.getTypeID(C->getOperand(0)->getType()));
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        break;
+      case Instruction::InsertElement:
+        Code = bitc::CST_CODE_CE_INSERTELT;
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        Record.push_back(VE.getValueID(C->getOperand(2)));
+        break;
+      case Instruction::ShuffleVector:
+        // If the return type and argument types are the same, this is a
+        // standard shufflevector instruction.  If the types are different,
+        // then the shuffle is widening or truncating the input vectors, and
+        // the argument type must also be encoded.
+        if (C->getType() == C->getOperand(0)->getType()) {
+          Code = bitc::CST_CODE_CE_SHUFFLEVEC;
+        } else {
+          Code = bitc::CST_CODE_CE_SHUFVEC_EX;
+          Record.push_back(VE.getTypeID(C->getOperand(0)->getType()));
+        }
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        Record.push_back(VE.getValueID(C->getOperand(2)));
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        Code = bitc::CST_CODE_CE_CMP;
+        Record.push_back(VE.getTypeID(C->getOperand(0)->getType()));
+        Record.push_back(VE.getValueID(C->getOperand(0)));
+        Record.push_back(VE.getValueID(C->getOperand(1)));
+        Record.push_back(CE->getPredicate());
+        break;
+      }
+    } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(C)) {
+      Code = bitc::CST_CODE_BLOCKADDRESS;
+      Record.push_back(VE.getTypeID(BA->getFunction()->getType()));
+      Record.push_back(VE.getValueID(BA->getFunction()));
+      Record.push_back(VE.getGlobalBasicBlockID(BA->getBasicBlock()));
+    } else {
+#ifndef NDEBUG
+      C->dump();
+#endif
+      llvm_unreachable("Unknown constant!");
+    }
+    Stream.EmitRecord(Code, Record, AbbrevToUse);
+    Record.clear();
+  }
+
+  Stream.ExitBlock();
+}
+
+static void WriteModuleConstants(const ValueEnumerator &VE,
+                                 BitstreamWriter &Stream) {
+  const ValueEnumerator::ValueList &Vals = VE.getValues();
+
+  // Find the first constant to emit, which is the first non-globalvalue value.
+  // We know globalvalues have been emitted by WriteModuleInfo.
+  for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+    if (!isa<GlobalValue>(Vals[i].first)) {
+      WriteConstants(i, Vals.size(), VE, Stream, true);
+      return;
+    }
+  }
+}
+
+/// PushValueAndType - The file has to encode both the value and type id for
+/// many values, because we need to know what type to create for forward
+/// references.  However, most operands are not forward references, so this type
+/// field is not needed.
+///
+/// This function adds V's value ID to Vals.  If the value ID is higher than the
+/// instruction ID, then it is a forward reference, and it also includes the
+/// type ID.  The value ID that is written is encoded relative to the InstID.
+static bool PushValueAndType(const Value *V, unsigned InstID,
+                             SmallVector<unsigned, 64> &Vals,
+                             ValueEnumerator &VE) {
+  unsigned ValID = VE.getValueID(V);
+  // Make encoding relative to the InstID.
+  Vals.push_back(InstID - ValID);
+  if (ValID >= InstID) {
+    Vals.push_back(VE.getTypeID(V->getType()));
+    return true;
+  }
+  return false;
+}
+
+/// pushValue - Like PushValueAndType, but where the type of the value is
+/// omitted (perhaps it was already encoded in an earlier operand).
+static void pushValue(const Value *V, unsigned InstID,
+                      SmallVector<unsigned, 64> &Vals,
+                      ValueEnumerator &VE) {
+  unsigned ValID = VE.getValueID(V);
+  Vals.push_back(InstID - ValID);
+}
+
+static void pushValue64(const Value *V, unsigned InstID,
+                        SmallVector<uint64_t, 128> &Vals,
+                        ValueEnumerator &VE) {
+  uint64_t ValID = VE.getValueID(V);
+  Vals.push_back(InstID - ValID);
+}
+
+static void pushValueSigned(const Value *V, unsigned InstID,
+                            SmallVector<uint64_t, 128> &Vals,
+                            ValueEnumerator &VE) {
+  unsigned ValID = VE.getValueID(V);
+  int64_t diff = ((int32_t)InstID - (int32_t)ValID);
+  emitSignedInt64(Vals, diff);
+}
+
+/// WriteInstruction - Emit an instruction to the specified stream.
+static void WriteInstruction(const Instruction &I, unsigned InstID,
+                             ValueEnumerator &VE, BitstreamWriter &Stream,
+                             SmallVector<unsigned, 64> &Vals) {
+  unsigned Code = 0;
+  unsigned AbbrevToUse = 0;
+  VE.setInstructionID(&I);
+  switch (I.getOpcode()) {
+  default:
+    if (Instruction::isCast(I.getOpcode())) {
+      Code = bitc::FUNC_CODE_INST_CAST;
+      if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
+        AbbrevToUse = FUNCTION_INST_CAST_ABBREV;
+      Vals.push_back(VE.getTypeID(I.getType()));
+      Vals.push_back(GetEncodedCastOpcode(I.getOpcode()));
+    } else {
+      assert(isa<BinaryOperator>(I) && "Unknown instruction!");
+      Code = bitc::FUNC_CODE_INST_BINOP;
+      if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
+        AbbrevToUse = FUNCTION_INST_BINOP_ABBREV;
+      pushValue(I.getOperand(1), InstID, Vals, VE);
+      Vals.push_back(GetEncodedBinaryOpcode(I.getOpcode()));
+      uint64_t Flags = GetOptimizationFlags(&I);
+      if (Flags != 0) {
+        if (AbbrevToUse == FUNCTION_INST_BINOP_ABBREV)
+          AbbrevToUse = FUNCTION_INST_BINOP_FLAGS_ABBREV;
+        Vals.push_back(Flags);
+      }
+    }
+    break;
+
+  case Instruction::GetElementPtr:
+    Code = bitc::FUNC_CODE_INST_GEP;
+    if (cast<GEPOperator>(&I)->isInBounds())
+      Code = bitc::FUNC_CODE_INST_INBOUNDS_GEP;
+    for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+      PushValueAndType(I.getOperand(i), InstID, Vals, VE);
+    break;
+  case Instruction::ExtractValue: {
+    Code = bitc::FUNC_CODE_INST_EXTRACTVAL;
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+    const ExtractValueInst *EVI = cast<ExtractValueInst>(&I);
+    for (const unsigned *i = EVI->idx_begin(), *e = EVI->idx_end(); i != e; ++i)
+      Vals.push_back(*i);
+    break;
+  }
+  case Instruction::InsertValue: {
+    Code = bitc::FUNC_CODE_INST_INSERTVAL;
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+    PushValueAndType(I.getOperand(1), InstID, Vals, VE);
+    const InsertValueInst *IVI = cast<InsertValueInst>(&I);
+    for (const unsigned *i = IVI->idx_begin(), *e = IVI->idx_end(); i != e; ++i)
+      Vals.push_back(*i);
+    break;
+  }
+  case Instruction::Select:
+    Code = bitc::FUNC_CODE_INST_VSELECT;
+    PushValueAndType(I.getOperand(1), InstID, Vals, VE);
+    pushValue(I.getOperand(2), InstID, Vals, VE);
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+    break;
+  case Instruction::ExtractElement:
+    Code = bitc::FUNC_CODE_INST_EXTRACTELT;
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+    pushValue(I.getOperand(1), InstID, Vals, VE);
+    break;
+  case Instruction::InsertElement:
+    Code = bitc::FUNC_CODE_INST_INSERTELT;
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+    pushValue(I.getOperand(1), InstID, Vals, VE);
+    pushValue(I.getOperand(2), InstID, Vals, VE);
+    break;
+  case Instruction::ShuffleVector:
+    Code = bitc::FUNC_CODE_INST_SHUFFLEVEC;
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+    pushValue(I.getOperand(1), InstID, Vals, VE);
+    pushValue(I.getOperand(2), InstID, Vals, VE);
+    break;
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    // compare returning Int1Ty or vector of Int1Ty
+    Code = bitc::FUNC_CODE_INST_CMP2;
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+    pushValue(I.getOperand(1), InstID, Vals, VE);
+    Vals.push_back(cast<CmpInst>(I).getPredicate());
+    break;
+
+  case Instruction::Ret:
+    {
+      Code = bitc::FUNC_CODE_INST_RET;
+      unsigned NumOperands = I.getNumOperands();
+      if (NumOperands == 0)
+        AbbrevToUse = FUNCTION_INST_RET_VOID_ABBREV;
+      else if (NumOperands == 1) {
+        if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
+          AbbrevToUse = FUNCTION_INST_RET_VAL_ABBREV;
+      } else {
+        for (unsigned i = 0, e = NumOperands; i != e; ++i)
+          PushValueAndType(I.getOperand(i), InstID, Vals, VE);
+      }
+    }
+    break;
+  case Instruction::Br:
+    {
+      Code = bitc::FUNC_CODE_INST_BR;
+      const BranchInst &II = cast<BranchInst>(I);
+      Vals.push_back(VE.getValueID(II.getSuccessor(0)));
+      if (II.isConditional()) {
+        Vals.push_back(VE.getValueID(II.getSuccessor(1)));
+        pushValue(II.getCondition(), InstID, Vals, VE);
+      }
+    }
+    break;
+  case Instruction::Switch:
+    {
+      // Redefine Vals, since here we need to use 64 bit values
+      // explicitly to store large APInt numbers.
+      SmallVector<uint64_t, 128> Vals64;
+
+      Code = bitc::FUNC_CODE_INST_SWITCH;
+      const SwitchInst &SI = cast<SwitchInst>(I);
+
+      uint32_t SwitchRecordHeader = SI.hash() | (SWITCH_INST_MAGIC << 16);
+      Vals64.push_back(SwitchRecordHeader);
+
+      Vals64.push_back(VE.getTypeID(SI.getCondition()->getType()));
+      pushValue64(SI.getCondition(), InstID, Vals64, VE);
+      Vals64.push_back(VE.getValueID(SI.getDefaultDest()));
+      Vals64.push_back(SI.getNumCases());
+      for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
+           i != e; ++i) {
+        const IntegersSubset& CaseRanges = i.getCaseValueEx();
+        unsigned Code, Abbrev; // will unused.
+
+        if (CaseRanges.isSingleNumber()) {
+          Vals64.push_back(1/*NumItems = 1*/);
+          Vals64.push_back(true/*IsSingleNumber = true*/);
+          EmitAPInt(Vals64, Code, Abbrev, CaseRanges.getSingleNumber(0), true);
+        } else {
+
+          Vals64.push_back(CaseRanges.getNumItems());
+
+          if (CaseRanges.isSingleNumbersOnly()) {
+            for (unsigned ri = 0, rn = CaseRanges.getNumItems();
+                 ri != rn; ++ri) {
+
+              Vals64.push_back(true/*IsSingleNumber = true*/);
+
+              EmitAPInt(Vals64, Code, Abbrev,
+                        CaseRanges.getSingleNumber(ri), true);
+            }
+          } else
+            for (unsigned ri = 0, rn = CaseRanges.getNumItems();
+                 ri != rn; ++ri) {
+              IntegersSubset::Range r = CaseRanges.getItem(ri);
+              bool IsSingleNumber = CaseRanges.isSingleNumber(ri);
+
+              Vals64.push_back(IsSingleNumber);
+
+              EmitAPInt(Vals64, Code, Abbrev, r.getLow(), true);
+              if (!IsSingleNumber)
+                EmitAPInt(Vals64, Code, Abbrev, r.getHigh(), true);
+            }
+        }
+        Vals64.push_back(VE.getValueID(i.getCaseSuccessor()));
+      }
+
+      Stream.EmitRecord(Code, Vals64, AbbrevToUse);
+
+      // Also do expected action - clear external Vals collection:
+      Vals.clear();
+      return;
+    }
+    break;
+  case Instruction::IndirectBr:
+    Code = bitc::FUNC_CODE_INST_INDIRECTBR;
+    Vals.push_back(VE.getTypeID(I.getOperand(0)->getType()));
+    // Encode the address operand as relative, but not the basic blocks.
+    pushValue(I.getOperand(0), InstID, Vals, VE);
+    for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i)
+      Vals.push_back(VE.getValueID(I.getOperand(i)));
+    break;
+
+  case Instruction::Invoke: {
+    const InvokeInst *II = cast<InvokeInst>(&I);
+    const Value *Callee(II->getCalledValue());
+    PointerType *PTy = cast<PointerType>(Callee->getType());
+    FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+    Code = bitc::FUNC_CODE_INST_INVOKE;
+
+    Vals.push_back(VE.getAttributeID(II->getAttributes()));
+    Vals.push_back(II->getCallingConv());
+    Vals.push_back(VE.getValueID(II->getNormalDest()));
+    Vals.push_back(VE.getValueID(II->getUnwindDest()));
+    PushValueAndType(Callee, InstID, Vals, VE);
+
+    // Emit value #'s for the fixed parameters.
+    for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+      pushValue(I.getOperand(i), InstID, Vals, VE);  // fixed param.
+
+    // Emit type/value pairs for varargs params.
+    if (FTy->isVarArg()) {
+      for (unsigned i = FTy->getNumParams(), e = I.getNumOperands()-3;
+           i != e; ++i)
+        PushValueAndType(I.getOperand(i), InstID, Vals, VE); // vararg
+    }
+    break;
+  }
+  case Instruction::Resume:
+    Code = bitc::FUNC_CODE_INST_RESUME;
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+    break;
+  case Instruction::Unreachable:
+    Code = bitc::FUNC_CODE_INST_UNREACHABLE;
+    AbbrevToUse = FUNCTION_INST_UNREACHABLE_ABBREV;
+    break;
+
+  case Instruction::PHI: {
+    const PHINode &PN = cast<PHINode>(I);
+    Code = bitc::FUNC_CODE_INST_PHI;
+    // With the newer instruction encoding, forward references could give
+    // negative valued IDs.  This is most common for PHIs, so we use
+    // signed VBRs.
+    SmallVector<uint64_t, 128> Vals64;
+    Vals64.push_back(VE.getTypeID(PN.getType()));
+    for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+      pushValueSigned(PN.getIncomingValue(i), InstID, Vals64, VE);
+      Vals64.push_back(VE.getValueID(PN.getIncomingBlock(i)));
+    }
+    // Emit a Vals64 vector and exit.
+    Stream.EmitRecord(Code, Vals64, AbbrevToUse);
+    Vals64.clear();
+    return;
+  }
+
+  case Instruction::LandingPad: {
+    const LandingPadInst &LP = cast<LandingPadInst>(I);
+    Code = bitc::FUNC_CODE_INST_LANDINGPAD;
+    Vals.push_back(VE.getTypeID(LP.getType()));
+    PushValueAndType(LP.getPersonalityFn(), InstID, Vals, VE);
+    Vals.push_back(LP.isCleanup());
+    Vals.push_back(LP.getNumClauses());
+    for (unsigned I = 0, E = LP.getNumClauses(); I != E; ++I) {
+      if (LP.isCatch(I))
+        Vals.push_back(LandingPadInst::Catch);
+      else
+        Vals.push_back(LandingPadInst::Filter);
+      PushValueAndType(LP.getClause(I), InstID, Vals, VE);
+    }
+    break;
+  }
+
+  case Instruction::Alloca:
+    Code = bitc::FUNC_CODE_INST_ALLOCA;
+    Vals.push_back(VE.getTypeID(I.getType()));
+    Vals.push_back(VE.getTypeID(I.getOperand(0)->getType()));
+    Vals.push_back(VE.getValueID(I.getOperand(0))); // size.
+    Vals.push_back(Log2_32(cast<AllocaInst>(I).getAlignment())+1);
+    break;
+
+  case Instruction::Load:
+    if (cast<LoadInst>(I).isAtomic()) {
+      Code = bitc::FUNC_CODE_INST_LOADATOMIC;
+      PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+    } else {
+      Code = bitc::FUNC_CODE_INST_LOAD;
+      if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))  // ptr
+        AbbrevToUse = FUNCTION_INST_LOAD_ABBREV;
+    }
+    Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment())+1);
+    Vals.push_back(cast<LoadInst>(I).isVolatile());
+    if (cast<LoadInst>(I).isAtomic()) {
+      Vals.push_back(GetEncodedOrdering(cast<LoadInst>(I).getOrdering()));
+      Vals.push_back(GetEncodedSynchScope(cast<LoadInst>(I).getSynchScope()));
+    }
+    break;
+  case Instruction::Store:
+    if (cast<StoreInst>(I).isAtomic())
+      Code = bitc::FUNC_CODE_INST_STOREATOMIC;
+    else
+      Code = bitc::FUNC_CODE_INST_STORE;
+    PushValueAndType(I.getOperand(1), InstID, Vals, VE);  // ptrty + ptr
+    pushValue(I.getOperand(0), InstID, Vals, VE);         // val.
+    Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1);
+    Vals.push_back(cast<StoreInst>(I).isVolatile());
+    if (cast<StoreInst>(I).isAtomic()) {
+      Vals.push_back(GetEncodedOrdering(cast<StoreInst>(I).getOrdering()));
+      Vals.push_back(GetEncodedSynchScope(cast<StoreInst>(I).getSynchScope()));
+    }
+    break;
+  case Instruction::AtomicCmpXchg:
+    Code = bitc::FUNC_CODE_INST_CMPXCHG;
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);  // ptrty + ptr
+    pushValue(I.getOperand(1), InstID, Vals, VE);         // cmp.
+    pushValue(I.getOperand(2), InstID, Vals, VE);         // newval.
+    Vals.push_back(cast<AtomicCmpXchgInst>(I).isVolatile());
+    Vals.push_back(GetEncodedOrdering(
+                     cast<AtomicCmpXchgInst>(I).getOrdering()));
+    Vals.push_back(GetEncodedSynchScope(
+                     cast<AtomicCmpXchgInst>(I).getSynchScope()));
+    break;
+  case Instruction::AtomicRMW:
+    Code = bitc::FUNC_CODE_INST_ATOMICRMW;
+    PushValueAndType(I.getOperand(0), InstID, Vals, VE);  // ptrty + ptr
+    pushValue(I.getOperand(1), InstID, Vals, VE);         // val.
+    Vals.push_back(GetEncodedRMWOperation(
+                     cast<AtomicRMWInst>(I).getOperation()));
+    Vals.push_back(cast<AtomicRMWInst>(I).isVolatile());
+    Vals.push_back(GetEncodedOrdering(cast<AtomicRMWInst>(I).getOrdering()));
+    Vals.push_back(GetEncodedSynchScope(
+                     cast<AtomicRMWInst>(I).getSynchScope()));
+    break;
+  case Instruction::Fence:
+    Code = bitc::FUNC_CODE_INST_FENCE;
+    Vals.push_back(GetEncodedOrdering(cast<FenceInst>(I).getOrdering()));
+    Vals.push_back(GetEncodedSynchScope(cast<FenceInst>(I).getSynchScope()));
+    break;
+  case Instruction::Call: {
+    const CallInst &CI = cast<CallInst>(I);
+    PointerType *PTy = cast<PointerType>(CI.getCalledValue()->getType());
+    FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+
+    Code = bitc::FUNC_CODE_INST_CALL;
+
+    Vals.push_back(VE.getAttributeID(CI.getAttributes()));
+    Vals.push_back((CI.getCallingConv() << 1) | unsigned(CI.isTailCall()));
+    PushValueAndType(CI.getCalledValue(), InstID, Vals, VE);  // Callee
+
+    // Emit value #'s for the fixed parameters.
+    for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
+      // Check for labels (can happen with asm labels).
+      if (FTy->getParamType(i)->isLabelTy())
+        Vals.push_back(VE.getValueID(CI.getArgOperand(i)));
+      else
+        pushValue(CI.getArgOperand(i), InstID, Vals, VE);  // fixed param.
+    }
+
+    // Emit type/value pairs for varargs params.
+    if (FTy->isVarArg()) {
+      for (unsigned i = FTy->getNumParams(), e = CI.getNumArgOperands();
+           i != e; ++i)
+        PushValueAndType(CI.getArgOperand(i), InstID, Vals, VE);  // varargs
+    }
+    break;
+  }
+  case Instruction::VAArg:
+    Code = bitc::FUNC_CODE_INST_VAARG;
+    Vals.push_back(VE.getTypeID(I.getOperand(0)->getType()));   // valistty
+    pushValue(I.getOperand(0), InstID, Vals, VE); // valist.
+    Vals.push_back(VE.getTypeID(I.getType())); // restype.
+    break;
+  }
+
+  Stream.EmitRecord(Code, Vals, AbbrevToUse);
+  Vals.clear();
+}
+
+// Emit names for globals/functions etc.
+static void WriteValueSymbolTable(const ValueSymbolTable &VST,
+                                  const ValueEnumerator &VE,
+                                  BitstreamWriter &Stream) {
+  if (VST.empty()) return;
+  Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4);
+
+  // FIXME: Set up the abbrev, we know how many values there are!
+  // FIXME: We know if the type names can use 7-bit ascii.
+  SmallVector<unsigned, 64> NameVals;
+
+  for (ValueSymbolTable::const_iterator SI = VST.begin(), SE = VST.end();
+       SI != SE; ++SI) {
+
+    const ValueName &Name = *SI;
+
+    // Figure out the encoding to use for the name.
+    bool is7Bit = true;
+    bool isChar6 = true;
+    for (const char *C = Name.getKeyData(), *E = C+Name.getKeyLength();
+         C != E; ++C) {
+      if (isChar6)
+        isChar6 = BitCodeAbbrevOp::isChar6(*C);
+      if ((unsigned char)*C & 128) {
+        is7Bit = false;
+        break;  // don't bother scanning the rest.
+      }
+    }
+
+    unsigned AbbrevToUse = VST_ENTRY_8_ABBREV;
+
+    // VST_ENTRY:   [valueid, namechar x N]
+    // VST_BBENTRY: [bbid, namechar x N]
+    unsigned Code;
+    if (isa<BasicBlock>(SI->getValue())) {
+      Code = bitc::VST_CODE_BBENTRY;
+      if (isChar6)
+        AbbrevToUse = VST_BBENTRY_6_ABBREV;
+    } else {
+      Code = bitc::VST_CODE_ENTRY;
+      if (isChar6)
+        AbbrevToUse = VST_ENTRY_6_ABBREV;
+      else if (is7Bit)
+        AbbrevToUse = VST_ENTRY_7_ABBREV;
+    }
+
+    NameVals.push_back(VE.getValueID(SI->getValue()));
+    for (const char *P = Name.getKeyData(),
+         *E = Name.getKeyData()+Name.getKeyLength(); P != E; ++P)
+      NameVals.push_back((unsigned char)*P);
+
+    // Emit the finished record.
+    Stream.EmitRecord(Code, NameVals, AbbrevToUse);
+    NameVals.clear();
+  }
+  Stream.ExitBlock();
+}
+
+/// WriteFunction - Emit a function body to the module stream.
+static void WriteFunction(const Function &F, ValueEnumerator &VE,
+                          BitstreamWriter &Stream) {
+  Stream.EnterSubblock(bitc::FUNCTION_BLOCK_ID, 4);
+  VE.incorporateFunction(F);
+
+  SmallVector<unsigned, 64> Vals;
+
+  // Emit the number of basic blocks, so the reader can create them ahead of
+  // time.
+  Vals.push_back(VE.getBasicBlocks().size());
+  Stream.EmitRecord(bitc::FUNC_CODE_DECLAREBLOCKS, Vals);
+  Vals.clear();
+
+  // If there are function-local constants, emit them now.
+  unsigned CstStart, CstEnd;
+  VE.getFunctionConstantRange(CstStart, CstEnd);
+  WriteConstants(CstStart, CstEnd, VE, Stream, false);
+
+  // If there is function-local metadata, emit it now.
+  WriteFunctionLocalMetadata(F, VE, Stream);
+
+  // Keep a running idea of what the instruction ID is.
+  unsigned InstID = CstEnd;
+
+  bool NeedsMetadataAttachment = false;
+
+  DebugLoc LastDL;
+
+  // Finally, emit all the instructions, in order.
+  for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      WriteInstruction(*I, InstID, VE, Stream, Vals);
+
+      if (!I->getType()->isVoidTy())
+        ++InstID;
+
+      // If the instruction has metadata, write a metadata attachment later.
+      NeedsMetadataAttachment |= I->hasMetadataOtherThanDebugLoc();
+
+      // If the instruction has a debug location, emit it.
+      DebugLoc DL = I->getDebugLoc();
+      if (DL.isUnknown()) {
+        // nothing todo.
+      } else if (DL == LastDL) {
+        // Just repeat the same debug loc as last time.
+        Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC_AGAIN, Vals);
+      } else {
+        MDNode *Scope, *IA;
+        DL.getScopeAndInlinedAt(Scope, IA, I->getContext());
+
+        Vals.push_back(DL.getLine());
+        Vals.push_back(DL.getCol());
+        Vals.push_back(Scope ? VE.getValueID(Scope)+1 : 0);
+        Vals.push_back(IA ? VE.getValueID(IA)+1 : 0);
+        Stream.EmitRecord(bitc::FUNC_CODE_DEBUG_LOC, Vals);
+        Vals.clear();
+
+        LastDL = DL;
+      }
+    }
+
+  // Emit names for all the instructions etc.
+  WriteValueSymbolTable(F.getValueSymbolTable(), VE, Stream);
+
+  if (NeedsMetadataAttachment)
+    WriteMetadataAttachment(F, VE, Stream);
+  VE.purgeFunction();
+  Stream.ExitBlock();
+}
+
+// Emit blockinfo, which defines the standard abbreviations etc.
+static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
+  // We only want to emit block info records for blocks that have multiple
+  // instances: CONSTANTS_BLOCK, FUNCTION_BLOCK and VALUE_SYMTAB_BLOCK.
+  // Other blocks can define their abbrevs inline.
+  Stream.EnterBlockInfoBlock(2);
+
+  { // 8-bit fixed-width VST_ENTRY/VST_BBENTRY strings.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
+    if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+                                   Abbv) != VST_ENTRY_8_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+
+  { // 7-bit fixed width VST_ENTRY strings.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
+    if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+                                   Abbv) != VST_ENTRY_7_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+  { // 6-bit char6 VST_ENTRY strings.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_ENTRY));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+    if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+                                   Abbv) != VST_ENTRY_6_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+  { // 6-bit char6 VST_BBENTRY strings.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_BBENTRY));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
+    if (Stream.EmitBlockInfoAbbrev(bitc::VALUE_SYMTAB_BLOCK_ID,
+                                   Abbv) != VST_BBENTRY_6_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+
+
+
+  { // SETTYPE abbrev for CONSTANTS_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_SETTYPE));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
+                              Log2_32_Ceil(VE.getTypes().size()+1)));
+    if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+                                   Abbv) != CONSTANTS_SETTYPE_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+
+  { // INTEGER abbrev for CONSTANTS_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_INTEGER));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+                                   Abbv) != CONSTANTS_INTEGER_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+
+  { // CE_CAST abbrev for CONSTANTS_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4));  // cast opc
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,       // typeid
+                              Log2_32_Ceil(VE.getTypes().size()+1)));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));    // value id
+
+    if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+                                   Abbv) != CONSTANTS_CE_CAST_Abbrev)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+  { // NULL abbrev for CONSTANTS_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_NULL));
+    if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID,
+                                   Abbv) != CONSTANTS_NULL_Abbrev)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+
+  // FIXME: This should only use space for first class types!
+
+  { // INST_LOAD abbrev for FUNCTION_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_LOAD));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+                                   Abbv) != FUNCTION_INST_LOAD_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+  { // INST_BINOP abbrev for FUNCTION_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+                                   Abbv) != FUNCTION_INST_BINOP_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+  { // INST_BINOP_FLAGS abbrev for FUNCTION_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); // flags
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+                                   Abbv) != FUNCTION_INST_BINOP_FLAGS_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+  { // INST_CAST abbrev for FUNCTION_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));    // OpVal
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,       // dest ty
+                              Log2_32_Ceil(VE.getTypes().size()+1)));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4));  // opc
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+                                   Abbv) != FUNCTION_INST_CAST_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+
+  { // INST_RET abbrev for FUNCTION_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET));
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+                                   Abbv) != FUNCTION_INST_RET_VOID_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+  { // INST_RET abbrev for FUNCTION_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_RET));
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // ValID
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+                                   Abbv) != FUNCTION_INST_RET_VAL_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+  { // INST_UNREACHABLE abbrev for FUNCTION_BLOCK.
+    BitCodeAbbrev *Abbv = new BitCodeAbbrev();
+    Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNREACHABLE));
+    if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID,
+                                   Abbv) != FUNCTION_INST_UNREACHABLE_ABBREV)
+      llvm_unreachable("Unexpected abbrev ordering!");
+  }
+
+  Stream.ExitBlock();
+}
+
+// Sort the Users based on the order in which the reader parses the bitcode
+// file.
+static bool bitcodereader_order(const User *lhs, const User *rhs) {
+  // TODO: Implement.
+  return true;
+}
+
+static void WriteUseList(const Value *V, const ValueEnumerator &VE,
+                         BitstreamWriter &Stream) {
+
+  // One or zero uses can't get out of order.
+  if (V->use_empty() || V->hasNUses(1))
+    return;
+
+  // Make a copy of the in-memory use-list for sorting.
+  unsigned UseListSize = std::distance(V->use_begin(), V->use_end());
+  SmallVector<const User*, 8> UseList;
+  UseList.reserve(UseListSize);
+  for (Value::const_use_iterator I = V->use_begin(), E = V->use_end();
+       I != E; ++I) {
+    const User *U = *I;
+    UseList.push_back(U);
+  }
+
+  // Sort the copy based on the order read by the BitcodeReader.
+  std::sort(UseList.begin(), UseList.end(), bitcodereader_order);
+
+  // TODO: Generate a diff between the BitcodeWriter in-memory use-list and the
+  // sorted list (i.e., the expected BitcodeReader in-memory use-list).
+
+  // TODO: Emit the USELIST_CODE_ENTRYs.
+}
+
+static void WriteFunctionUseList(const Function *F, ValueEnumerator &VE,
+                                 BitstreamWriter &Stream) {
+  VE.incorporateFunction(*F);
+
+  for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+       AI != AE; ++AI)
+    WriteUseList(AI, VE, Stream);
+  for (Function::const_iterator BB = F->begin(), FE = F->end(); BB != FE;
+       ++BB) {
+    WriteUseList(BB, VE, Stream);
+    for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); II != IE;
+         ++II) {
+      WriteUseList(II, VE, Stream);
+      for (User::const_op_iterator OI = II->op_begin(), E = II->op_end();
+           OI != E; ++OI) {
+        if ((isa<Constant>(*OI) && !isa<GlobalValue>(*OI)) ||
+            isa<InlineAsm>(*OI))
+          WriteUseList(*OI, VE, Stream);
+      }
+    }
+  }
+  VE.purgeFunction();
+}
+
+// Emit use-lists.
+static void WriteModuleUseLists(const Module *M, ValueEnumerator &VE,
+                                BitstreamWriter &Stream) {
+  Stream.EnterSubblock(bitc::USELIST_BLOCK_ID, 3);
+
+  // XXX: this modifies the module, but in a way that should never change the
+  // behavior of any pass or codegen in LLVM. The problem is that GVs may
+  // contain entries in the use_list that do not exist in the Module and are
+  // not stored in the .bc file.
+  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+       I != E; ++I)
+    I->removeDeadConstantUsers();
+
+  // Write the global variables.
+  for (Module::const_global_iterator GI = M->global_begin(),
+         GE = M->global_end(); GI != GE; ++GI) {
+    WriteUseList(GI, VE, Stream);
+
+    // Write the global variable initializers.
+    if (GI->hasInitializer())
+      WriteUseList(GI->getInitializer(), VE, Stream);
+  }
+
+  // Write the functions.
+  for (Module::const_iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI) {
+    WriteUseList(FI, VE, Stream);
+    if (!FI->isDeclaration())
+      WriteFunctionUseList(FI, VE, Stream);
+  }
+
+  // Write the aliases.
+  for (Module::const_alias_iterator AI = M->alias_begin(), AE = M->alias_end();
+       AI != AE; ++AI) {
+    WriteUseList(AI, VE, Stream);
+    WriteUseList(AI->getAliasee(), VE, Stream);
+  }
+
+  Stream.ExitBlock();
+}
+
+/// WriteModule - Emit the specified module to the bitstream.
+static void WriteModule(const Module *M, BitstreamWriter &Stream) {
+  Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3);
+
+  SmallVector<unsigned, 1> Vals;
+  unsigned CurVersion = 1;
+  Vals.push_back(CurVersion);
+  Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals);
+
+  // Analyze the module, enumerating globals, functions, etc.
+  ValueEnumerator VE(M);
+
+  // Emit blockinfo, which defines the standard abbreviations etc.
+  WriteBlockInfo(VE, Stream);
+
+  // Emit information about attribute groups.
+  WriteAttributeGroupTable(VE, Stream);
+
+  // Emit information about parameter attributes.
+  WriteAttributeTable(VE, Stream);
+
+  // Emit information describing all of the types in the module.
+  WriteTypeTable(VE, Stream);
+
+  // Emit top-level description of module, including target triple, inline asm,
+  // descriptors for global variables, and function prototype info.
+  WriteModuleInfo(M, VE, Stream);
+
+  // Emit constants.
+  WriteModuleConstants(VE, Stream);
+
+  // Emit metadata.
+  WriteModuleMetadata(M, VE, Stream);
+
+  // Emit metadata.
+  WriteModuleMetadataStore(M, Stream);
+
+  // Emit names for globals/functions etc.
+  WriteValueSymbolTable(M->getValueSymbolTable(), VE, Stream);
+
+  // Emit use-lists.
+  if (EnablePreserveUseListOrdering)
+    WriteModuleUseLists(M, VE, Stream);
+
+  // Emit function bodies.
+  for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F)
+    if (!F->isDeclaration())
+      WriteFunction(*F, VE, Stream);
+
+  Stream.ExitBlock();
+}
+
+/// EmitDarwinBCHeader - If generating a bc file on darwin, we have to emit a
+/// header and trailer to make it compatible with the system archiver.  To do
+/// this we emit the following header, and then emit a trailer that pads the
+/// file out to be a multiple of 16 bytes.
+///
+/// struct bc_header {
+///   uint32_t Magic;         // 0x0B17C0DE
+///   uint32_t Version;       // Version, currently always 0.
+///   uint32_t BitcodeOffset; // Offset to traditional bitcode file.
+///   uint32_t BitcodeSize;   // Size of traditional bitcode file.
+///   uint32_t CPUType;       // CPU specifier.
+///   ... potentially more later ...
+/// };
+enum {
+  DarwinBCSizeFieldOffset = 3*4, // Offset to bitcode_size.
+  DarwinBCHeaderSize = 5*4
+};
+
+static void WriteInt32ToBuffer(uint32_t Value, SmallVectorImpl<char> &Buffer,
+                               uint32_t &Position) {
+  Buffer[Position + 0] = (unsigned char) (Value >>  0);
+  Buffer[Position + 1] = (unsigned char) (Value >>  8);
+  Buffer[Position + 2] = (unsigned char) (Value >> 16);
+  Buffer[Position + 3] = (unsigned char) (Value >> 24);
+  Position += 4;
+}
+
+static void EmitDarwinBCHeaderAndTrailer(SmallVectorImpl<char> &Buffer,
+                                         const Triple &TT) {
+  unsigned CPUType = ~0U;
+
+  // Match x86_64-*, i[3-9]86-*, powerpc-*, powerpc64-*, arm-*, thumb-*,
+  // armv[0-9]-*, thumbv[0-9]-*, armv5te-*, or armv6t2-*. The CPUType is a magic
+  // number from /usr/include/mach/machine.h.  It is ok to reproduce the
+  // specific constants here because they are implicitly part of the Darwin ABI.
+  enum {
+    DARWIN_CPU_ARCH_ABI64      = 0x01000000,
+    DARWIN_CPU_TYPE_X86        = 7,
+    DARWIN_CPU_TYPE_ARM        = 12,
+    DARWIN_CPU_TYPE_POWERPC    = 18
+  };
+
+  Triple::ArchType Arch = TT.getArch();
+  if (Arch == Triple::x86_64)
+    CPUType = DARWIN_CPU_TYPE_X86 | DARWIN_CPU_ARCH_ABI64;
+  else if (Arch == Triple::x86)
+    CPUType = DARWIN_CPU_TYPE_X86;
+  else if (Arch == Triple::ppc)
+    CPUType = DARWIN_CPU_TYPE_POWERPC;
+  else if (Arch == Triple::ppc64)
+    CPUType = DARWIN_CPU_TYPE_POWERPC | DARWIN_CPU_ARCH_ABI64;
+  else if (Arch == Triple::arm || Arch == Triple::thumb)
+    CPUType = DARWIN_CPU_TYPE_ARM;
+
+  // Traditional Bitcode starts after header.
+  assert(Buffer.size() >= DarwinBCHeaderSize &&
+         "Expected header size to be reserved");
+  unsigned BCOffset = DarwinBCHeaderSize;
+  unsigned BCSize = Buffer.size()-DarwinBCHeaderSize;
+
+  // Write the magic and version.
+  unsigned Position = 0;
+  WriteInt32ToBuffer(0x0B17C0DE , Buffer, Position);
+  WriteInt32ToBuffer(0          , Buffer, Position); // Version.
+  WriteInt32ToBuffer(BCOffset   , Buffer, Position);
+  WriteInt32ToBuffer(BCSize     , Buffer, Position);
+  WriteInt32ToBuffer(CPUType    , Buffer, Position);
+
+  // If the file is not a multiple of 16 bytes, insert dummy padding.
+  while (Buffer.size() & 15)
+    Buffer.push_back(0);
+}
+
+/// WriteBitcodeToFile - Write the specified module to the specified output
+/// stream.
+void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out) {
+  SmallVector<char, 0> Buffer;
+  Buffer.reserve(256*1024);
+
+  // If this is darwin or another generic macho target, reserve space for the
+  // header.
+  Triple TT(M->getTargetTriple());
+  if (TT.isOSDarwin())
+    Buffer.insert(Buffer.begin(), DarwinBCHeaderSize, 0);
+
+  // Emit the module into the buffer.
+  {
+    BitstreamWriter Stream(Buffer);
+
+    // Emit the file header.
+    Stream.Emit((unsigned)'B', 8);
+    Stream.Emit((unsigned)'C', 8);
+    Stream.Emit(0x0, 4);
+    Stream.Emit(0xC, 4);
+    Stream.Emit(0xE, 4);
+    Stream.Emit(0xD, 4);
+
+    // Emit the module.
+    WriteModule(M, Stream);
+  }
+
+  if (TT.isOSDarwin())
+    EmitDarwinBCHeaderAndTrailer(Buffer, TT);
+
+  // Write the generated bitstream to "Out".
+  Out.write((char*)&Buffer.front(), Buffer.size());
+}
diff --git a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
new file mode 100644
index 000000000000..e5e76e29bd2d
--- /dev/null
+++ b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -0,0 +1,41 @@
+//===--- Bitcode/Writer/BitcodeWriterPass.cpp - Bitcode Writer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// BitcodeWriterPass implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+namespace {
+  class WriteBitcodePass : public ModulePass {
+    raw_ostream &OS; // raw_ostream to print on
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit WriteBitcodePass(raw_ostream &o)
+      : ModulePass(ID), OS(o) {}
+
+    const char *getPassName() const { return "Bitcode Writer"; }
+
+    bool runOnModule(Module &M) {
+      WriteBitcodeToFile(&M, OS);
+      return false;
+    }
+  };
+}
+
+char WriteBitcodePass::ID = 0;
+
+/// createBitcodeWriterPass - Create and return a pass that writes the module
+/// to the specified ostream.
+ModulePass *llvm::createBitcodeWriterPass(raw_ostream &Str) {
+  return new WriteBitcodePass(Str);
+}
diff --git a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
new file mode 100644
index 000000000000..8bac6da89285
--- /dev/null
+++ b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -0,0 +1,540 @@
+//===-- ValueEnumerator.cpp - Number values and types for bitcode writer --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ValueEnumerator class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ValueEnumerator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+static bool isIntOrIntVectorValue(const std::pair<const Value*, unsigned> &V) {
+  return V.first->getType()->isIntOrIntVectorTy();
+}
+
+/// ValueEnumerator - Enumerate module-level information.
+ValueEnumerator::ValueEnumerator(const Module *M) {
+  // Enumerate the global variables.
+  for (Module::const_global_iterator I = M->global_begin(),
+         E = M->global_end(); I != E; ++I)
+    EnumerateValue(I);
+
+  // Enumerate the functions.
+  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+    EnumerateValue(I);
+    EnumerateAttributes(cast<Function>(I)->getAttributes());
+  }
+
+  // Enumerate the aliases.
+  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+       I != E; ++I)
+    EnumerateValue(I);
+
+  // Remember what is the cutoff between globalvalue's and other constants.
+  unsigned FirstConstant = Values.size();
+
+  // Enumerate the global variable initializers.
+  for (Module::const_global_iterator I = M->global_begin(),
+         E = M->global_end(); I != E; ++I)
+    if (I->hasInitializer())
+      EnumerateValue(I->getInitializer());
+
+  // Enumerate the aliasees.
+  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+       I != E; ++I)
+    EnumerateValue(I->getAliasee());
+
+  // Insert constants and metadata that are named at module level into the slot
+  // pool so that the module symbol table can refer to them...
+  EnumerateValueSymbolTable(M->getValueSymbolTable());
+  EnumerateNamedMetadata(M);
+
+  SmallVector<std::pair<unsigned, MDNode*>, 8> MDs;
+
+  // Enumerate types used by function bodies and argument lists.
+  for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
+
+    for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+         I != E; ++I)
+      EnumerateType(I->getType());
+
+    for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;++I){
+        for (User::const_op_iterator OI = I->op_begin(), E = I->op_end();
+             OI != E; ++OI) {
+          if (MDNode *MD = dyn_cast<MDNode>(*OI))
+            if (MD->isFunctionLocal() && MD->getFunction())
+              // These will get enumerated during function-incorporation.
+              continue;
+          EnumerateOperandType(*OI);
+        }
+        EnumerateType(I->getType());
+        if (const CallInst *CI = dyn_cast<CallInst>(I))
+          EnumerateAttributes(CI->getAttributes());
+        else if (const InvokeInst *II = dyn_cast<InvokeInst>(I))
+          EnumerateAttributes(II->getAttributes());
+
+        // Enumerate metadata attached with this instruction.
+        MDs.clear();
+        I->getAllMetadataOtherThanDebugLoc(MDs);
+        for (unsigned i = 0, e = MDs.size(); i != e; ++i)
+          EnumerateMetadata(MDs[i].second);
+
+        if (!I->getDebugLoc().isUnknown()) {
+          MDNode *Scope, *IA;
+          I->getDebugLoc().getScopeAndInlinedAt(Scope, IA, I->getContext());
+          if (Scope) EnumerateMetadata(Scope);
+          if (IA) EnumerateMetadata(IA);
+        }
+      }
+  }
+
+  // Optimize constant ordering.
+  OptimizeConstants(FirstConstant, Values.size());
+}
+
+unsigned ValueEnumerator::getInstructionID(const Instruction *Inst) const {
+  InstructionMapType::const_iterator I = InstructionMap.find(Inst);
+  assert(I != InstructionMap.end() && "Instruction is not mapped!");
+  return I->second;
+}
+
+void ValueEnumerator::setInstructionID(const Instruction *I) {
+  InstructionMap[I] = InstructionCount++;
+}
+
+unsigned ValueEnumerator::getValueID(const Value *V) const {
+  if (isa<MDNode>(V) || isa<MDString>(V)) {
+    ValueMapType::const_iterator I = MDValueMap.find(V);
+    assert(I != MDValueMap.end() && "Value not in slotcalculator!");
+    return I->second-1;
+  }
+
+  ValueMapType::const_iterator I = ValueMap.find(V);
+  assert(I != ValueMap.end() && "Value not in slotcalculator!");
+  return I->second-1;
+}
+
+void ValueEnumerator::dump() const {
+  print(dbgs(), ValueMap, "Default");
+  dbgs() << '\n';
+  print(dbgs(), MDValueMap, "MetaData");
+  dbgs() << '\n';
+}
+
+void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map,
+                            const char *Name) const {
+
+  OS << "Map Name: " << Name << "\n";
+  OS << "Size: " << Map.size() << "\n";
+  for (ValueMapType::const_iterator I = Map.begin(),
+         E = Map.end(); I != E; ++I) {
+
+    const Value *V = I->first;
+    if (V->hasName())
+      OS << "Value: " << V->getName();
+    else
+      OS << "Value: [null]\n";
+    V->dump();
+
+    OS << " Uses(" << std::distance(V->use_begin(),V->use_end()) << "):";
+    for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
+         UI != UE; ++UI) {
+      if (UI != V->use_begin())
+        OS << ",";
+      if((*UI)->hasName())
+        OS << " " << (*UI)->getName();
+      else
+        OS << " [null]";
+
+    }
+    OS <<  "\n\n";
+  }
+}
+
+// Optimize constant ordering.
+namespace {
+  struct CstSortPredicate {
+    ValueEnumerator &VE;
+    explicit CstSortPredicate(ValueEnumerator &ve) : VE(ve) {}
+    bool operator()(const std::pair<const Value*, unsigned> &LHS,
+                    const std::pair<const Value*, unsigned> &RHS) {
+      // Sort by plane.
+      if (LHS.first->getType() != RHS.first->getType())
+        return VE.getTypeID(LHS.first->getType()) <
+               VE.getTypeID(RHS.first->getType());
+      // Then by frequency.
+      return LHS.second > RHS.second;
+    }
+  };
+}
+
+/// OptimizeConstants - Reorder constant pool for denser encoding.
+void ValueEnumerator::OptimizeConstants(unsigned CstStart, unsigned CstEnd) {
+  if (CstStart == CstEnd || CstStart+1 == CstEnd) return;
+
+  CstSortPredicate P(*this);
+  std::stable_sort(Values.begin()+CstStart, Values.begin()+CstEnd, P);
+
+  // Ensure that integer and vector of integer constants are at the start of the
+  // constant pool.  This is important so that GEP structure indices come before
+  // gep constant exprs.
+  std::partition(Values.begin()+CstStart, Values.begin()+CstEnd,
+                 isIntOrIntVectorValue);
+
+  // Rebuild the modified portion of ValueMap.
+  for (; CstStart != CstEnd; ++CstStart)
+    ValueMap[Values[CstStart].first] = CstStart+1;
+}
+
+
+/// EnumerateValueSymbolTable - Insert all of the values in the specified symbol
+/// table into the values table.
+void ValueEnumerator::EnumerateValueSymbolTable(const ValueSymbolTable &VST) {
+  for (ValueSymbolTable::const_iterator VI = VST.begin(), VE = VST.end();
+       VI != VE; ++VI)
+    EnumerateValue(VI->getValue());
+}
+
+/// EnumerateNamedMetadata - Insert all of the values referenced by
+/// named metadata in the specified module.
+void ValueEnumerator::EnumerateNamedMetadata(const Module *M) {
+  for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
+       E = M->named_metadata_end(); I != E; ++I)
+    EnumerateNamedMDNode(I);
+}
+
+void ValueEnumerator::EnumerateNamedMDNode(const NamedMDNode *MD) {
+  for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i)
+    EnumerateMetadata(MD->getOperand(i));
+}
+
+/// EnumerateMDNodeOperands - Enumerate all non-function-local values
+/// and types referenced by the given MDNode.
+void ValueEnumerator::EnumerateMDNodeOperands(const MDNode *N) {
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (Value *V = N->getOperand(i)) {
+      if (isa<MDNode>(V) || isa<MDString>(V))
+        EnumerateMetadata(V);
+      else if (!isa<Instruction>(V) && !isa<Argument>(V))
+        EnumerateValue(V);
+    } else
+      EnumerateType(Type::getVoidTy(N->getContext()));
+  }
+}
+
+void ValueEnumerator::EnumerateMetadata(const Value *MD) {
+  assert((isa<MDNode>(MD) || isa<MDString>(MD)) && "Invalid metadata kind");
+
+  // Enumerate the type of this value.
+  EnumerateType(MD->getType());
+
+  const MDNode *N = dyn_cast<MDNode>(MD);
+
+  // In the module-level pass, skip function-local nodes themselves, but
+  // do walk their operands.
+  if (N && N->isFunctionLocal() && N->getFunction()) {
+    EnumerateMDNodeOperands(N);
+    return;
+  }
+
+  // Check to see if it's already in!
+  unsigned &MDValueID = MDValueMap[MD];
+  if (MDValueID) {
+    // Increment use count.
+    MDValues[MDValueID-1].second++;
+    return;
+  }
+  MDValues.push_back(std::make_pair(MD, 1U));
+  MDValueID = MDValues.size();
+
+  // Enumerate all non-function-local operands.
+  if (N)
+    EnumerateMDNodeOperands(N);
+}
+
+/// EnumerateFunctionLocalMetadataa - Incorporate function-local metadata
+/// information reachable from the given MDNode.
+void ValueEnumerator::EnumerateFunctionLocalMetadata(const MDNode *N) {
+  assert(N->isFunctionLocal() && N->getFunction() &&
+         "EnumerateFunctionLocalMetadata called on non-function-local mdnode!");
+
+  // Enumerate the type of this value.
+  EnumerateType(N->getType());
+
+  // Check to see if it's already in!
+  unsigned &MDValueID = MDValueMap[N];
+  if (MDValueID) {
+    // Increment use count.
+    MDValues[MDValueID-1].second++;
+    return;
+  }
+  MDValues.push_back(std::make_pair(N, 1U));
+  MDValueID = MDValues.size();
+
+  // To incoroporate function-local information visit all function-local
+  // MDNodes and all function-local values they reference.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    if (Value *V = N->getOperand(i)) {
+      if (MDNode *O = dyn_cast<MDNode>(V)) {
+        if (O->isFunctionLocal() && O->getFunction())
+          EnumerateFunctionLocalMetadata(O);
+      } else if (isa<Instruction>(V) || isa<Argument>(V))
+        EnumerateValue(V);
+    }
+
+  // Also, collect all function-local MDNodes for easy access.
+  FunctionLocalMDs.push_back(N);
+}
+
+void ValueEnumerator::EnumerateValue(const Value *V) {
+  assert(!V->getType()->isVoidTy() && "Can't insert void values!");
+  assert(!isa<MDNode>(V) && !isa<MDString>(V) &&
+         "EnumerateValue doesn't handle Metadata!");
+
+  // Check to see if it's already in!
+  unsigned &ValueID = ValueMap[V];
+  if (ValueID) {
+    // Increment use count.
+    Values[ValueID-1].second++;
+    return;
+  }
+
+  // Enumerate the type of this value.
+  EnumerateType(V->getType());
+
+  if (const Constant *C = dyn_cast<Constant>(V)) {
+    if (isa<GlobalValue>(C)) {
+      // Initializers for globals are handled explicitly elsewhere.
+    } else if (C->getNumOperands()) {
+      // If a constant has operands, enumerate them.  This makes sure that if a
+      // constant has uses (for example an array of const ints), that they are
+      // inserted also.
+
+      // We prefer to enumerate them with values before we enumerate the user
+      // itself.  This makes it more likely that we can avoid forward references
+      // in the reader.  We know that there can be no cycles in the constants
+      // graph that don't go through a global variable.
+      for (User::const_op_iterator I = C->op_begin(), E = C->op_end();
+           I != E; ++I)
+        if (!isa<BasicBlock>(*I)) // Don't enumerate BB operand to BlockAddress.
+          EnumerateValue(*I);
+
+      // Finally, add the value.  Doing this could make the ValueID reference be
+      // dangling, don't reuse it.
+      Values.push_back(std::make_pair(V, 1U));
+      ValueMap[V] = Values.size();
+      return;
+    }
+  }
+
+  // Add the value.
+  Values.push_back(std::make_pair(V, 1U));
+  ValueID = Values.size();
+}
+
+
+void ValueEnumerator::EnumerateType(Type *Ty) {
+  unsigned *TypeID = &TypeMap[Ty];
+
+  // We've already seen this type.
+  if (*TypeID)
+    return;
+
+  // If it is a non-anonymous struct, mark the type as being visited so that we
+  // don't recursively visit it.  This is safe because we allow forward
+  // references of these in the bitcode reader.
+  if (StructType *STy = dyn_cast<StructType>(Ty))
+    if (!STy->isLiteral())
+      *TypeID = ~0U;
+
+  // Enumerate all of the subtypes before we enumerate this type.  This ensures
+  // that the type will be enumerated in an order that can be directly built.
+  for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+       I != E; ++I)
+    EnumerateType(*I);
+
+  // Refresh the TypeID pointer in case the table rehashed.
+  TypeID = &TypeMap[Ty];
+
+  // Check to see if we got the pointer another way.  This can happen when
+  // enumerating recursive types that hit the base case deeper than they start.
+  //
+  // If this is actually a struct that we are treating as forward ref'able,
+  // then emit the definition now that all of its contents are available.
+  if (*TypeID && *TypeID != ~0U)
+    return;
+
+  // Add this type now that its contents are all happily enumerated.
+  Types.push_back(Ty);
+
+  *TypeID = Types.size();
+}
+
+// Enumerate the types for the specified value.  If the value is a constant,
+// walk through it, enumerating the types of the constant.
+void ValueEnumerator::EnumerateOperandType(const Value *V) {
+  EnumerateType(V->getType());
+
+  if (const Constant *C = dyn_cast<Constant>(V)) {
+    // If this constant is already enumerated, ignore it, we know its type must
+    // be enumerated.
+    if (ValueMap.count(V)) return;
+
+    // This constant may have operands, make sure to enumerate the types in
+    // them.
+    for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) {
+      const Value *Op = C->getOperand(i);
+
+      // Don't enumerate basic blocks here, this happens as operands to
+      // blockaddress.
+      if (isa<BasicBlock>(Op)) continue;
+
+      EnumerateOperandType(Op);
+    }
+
+    if (const MDNode *N = dyn_cast<MDNode>(V)) {
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        if (Value *Elem = N->getOperand(i))
+          EnumerateOperandType(Elem);
+    }
+  } else if (isa<MDString>(V) || isa<MDNode>(V))
+    EnumerateMetadata(V);
+}
+
+void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) {
+  if (PAL.isEmpty()) return;  // null is always 0.
+
+  // Do a lookup.
+  unsigned &Entry = AttributeMap[PAL];
+  if (Entry == 0) {
+    // Never saw this before, add it.
+    Attribute.push_back(PAL);
+    Entry = Attribute.size();
+  }
+
+  // Do lookups for all attribute groups.
+  for (unsigned i = 0, e = PAL.getNumSlots(); i != e; ++i) {
+    AttributeSet AS = PAL.getSlotAttributes(i);
+    unsigned &Entry = AttributeGroupMap[AS];
+    if (Entry == 0) {
+      AttributeGroups.push_back(AS);
+      Entry = AttributeGroups.size();
+    }
+  }
+}
+
+void ValueEnumerator::incorporateFunction(const Function &F) {
+  InstructionCount = 0;
+  NumModuleValues = Values.size();
+  NumModuleMDValues = MDValues.size();
+
+  // Adding function arguments to the value table.
+  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
+       I != E; ++I)
+    EnumerateValue(I);
+
+  FirstFuncConstantID = Values.size();
+
+  // Add all function-level constants to the value table.
+  for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I)
+      for (User::const_op_iterator OI = I->op_begin(), E = I->op_end();
+           OI != E; ++OI) {
+        if ((isa<Constant>(*OI) && !isa<GlobalValue>(*OI)) ||
+            isa<InlineAsm>(*OI))
+          EnumerateValue(*OI);
+      }
+    BasicBlocks.push_back(BB);
+    ValueMap[BB] = BasicBlocks.size();
+  }
+
+  // Optimize the constant layout.
+  OptimizeConstants(FirstFuncConstantID, Values.size());
+
+  // Add the function's parameter attributes so they are available for use in
+  // the function's instruction.
+  EnumerateAttributes(F.getAttributes());
+
+  FirstInstID = Values.size();
+
+  SmallVector<MDNode *, 8> FnLocalMDVector;
+  // Add all of the instructions.
+  for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) {
+      for (User::const_op_iterator OI = I->op_begin(), E = I->op_end();
+           OI != E; ++OI) {
+        if (MDNode *MD = dyn_cast<MDNode>(*OI))
+          if (MD->isFunctionLocal() && MD->getFunction())
+            // Enumerate metadata after the instructions they might refer to.
+            FnLocalMDVector.push_back(MD);
+      }
+
+      SmallVector<std::pair<unsigned, MDNode*>, 8> MDs;
+      I->getAllMetadataOtherThanDebugLoc(MDs);
+      for (unsigned i = 0, e = MDs.size(); i != e; ++i) {
+        MDNode *N = MDs[i].second;
+        if (N->isFunctionLocal() && N->getFunction())
+          FnLocalMDVector.push_back(N);
+      }
+
+      if (!I->getType()->isVoidTy())
+        EnumerateValue(I);
+    }
+  }
+
+  // Add all of the function-local metadata.
+  for (unsigned i = 0, e = FnLocalMDVector.size(); i != e; ++i)
+    EnumerateFunctionLocalMetadata(FnLocalMDVector[i]);
+}
+
+void ValueEnumerator::purgeFunction() {
+  /// Remove purged values from the ValueMap.
+  for (unsigned i = NumModuleValues, e = Values.size(); i != e; ++i)
+    ValueMap.erase(Values[i].first);
+  for (unsigned i = NumModuleMDValues, e = MDValues.size(); i != e; ++i)
+    MDValueMap.erase(MDValues[i].first);
+  for (unsigned i = 0, e = BasicBlocks.size(); i != e; ++i)
+    ValueMap.erase(BasicBlocks[i]);
+
+  Values.resize(NumModuleValues);
+  MDValues.resize(NumModuleMDValues);
+  BasicBlocks.clear();
+  FunctionLocalMDs.clear();
+}
+
+static void IncorporateFunctionInfoGlobalBBIDs(const Function *F,
+                                 DenseMap<const BasicBlock*, unsigned> &IDMap) {
+  unsigned Counter = 0;
+  for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+    IDMap[BB] = ++Counter;
+}
+
+/// getGlobalBasicBlockID - This returns the function-specific ID for the
+/// specified basic block.  This is relatively expensive information, so it
+/// should only be used by rare constructs such as address-of-label.
+unsigned ValueEnumerator::getGlobalBasicBlockID(const BasicBlock *BB) const {
+  unsigned &Idx = GlobalBasicBlockIDs[BB];
+  if (Idx != 0)
+    return Idx-1;
+
+  IncorporateFunctionInfoGlobalBBIDs(BB->getParent(), GlobalBasicBlockIDs);
+  return getGlobalBasicBlockID(BB);
+}
+
diff --git a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h
new file mode 100644
index 000000000000..0af6164c944f
--- /dev/null
+++ b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h
@@ -0,0 +1,171 @@
+//===-- Bitcode/Writer/ValueEnumerator.h - Number values --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class gives values and types Unique ID's.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef VALUE_ENUMERATOR_H
+#define VALUE_ENUMERATOR_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Attributes.h"
+#include <vector>
+
+namespace llvm {
+
+class Type;
+class Value;
+class Instruction;
+class BasicBlock;
+class Function;
+class Module;
+class MDNode;
+class NamedMDNode;
+class AttributeSet;
+class ValueSymbolTable;
+class MDSymbolTable;
+class raw_ostream;
+
+class ValueEnumerator {
+public:
+  typedef std::vector<Type*> TypeList;
+
+  // For each value, we remember its Value* and occurrence frequency.
+  typedef std::vector<std::pair<const Value*, unsigned> > ValueList;
+private:
+  typedef DenseMap<Type*, unsigned> TypeMapType;
+  TypeMapType TypeMap;
+  TypeList Types;
+
+  typedef DenseMap<const Value*, unsigned> ValueMapType;
+  ValueMapType ValueMap;
+  ValueList Values;
+  ValueList MDValues;
+  SmallVector<const MDNode *, 8> FunctionLocalMDs;
+  ValueMapType MDValueMap;
+
+  typedef DenseMap<AttributeSet, unsigned> AttributeGroupMapType;
+  AttributeGroupMapType AttributeGroupMap;
+  std::vector<AttributeSet> AttributeGroups;
+
+  typedef DenseMap<AttributeSet, unsigned> AttributeMapType;
+  AttributeMapType AttributeMap;
+  std::vector<AttributeSet> Attribute;
+
+  /// GlobalBasicBlockIDs - This map memoizes the basic block ID's referenced by
+  /// the "getGlobalBasicBlockID" method.
+  mutable DenseMap<const BasicBlock*, unsigned> GlobalBasicBlockIDs;
+
+  typedef DenseMap<const Instruction*, unsigned> InstructionMapType;
+  InstructionMapType InstructionMap;
+  unsigned InstructionCount;
+
+  /// BasicBlocks - This contains all the basic blocks for the currently
+  /// incorporated function.  Their reverse mapping is stored in ValueMap.
+  std::vector<const BasicBlock*> BasicBlocks;
+
+  /// When a function is incorporated, this is the size of the Values list
+  /// before incorporation.
+  unsigned NumModuleValues;
+
+  /// When a function is incorporated, this is the size of the MDValues list
+  /// before incorporation.
+  unsigned NumModuleMDValues;
+
+  unsigned FirstFuncConstantID;
+  unsigned FirstInstID;
+
+  ValueEnumerator(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
+  void operator=(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
+public:
+  ValueEnumerator(const Module *M);
+
+  void dump() const;
+  void print(raw_ostream &OS, const ValueMapType &Map, const char *Name) const;
+
+  unsigned getValueID(const Value *V) const;
+
+  unsigned getTypeID(Type *T) const {
+    TypeMapType::const_iterator I = TypeMap.find(T);
+    assert(I != TypeMap.end() && "Type not in ValueEnumerator!");
+    return I->second-1;
+  }
+
+  unsigned getInstructionID(const Instruction *I) const;
+  void setInstructionID(const Instruction *I);
+
+  unsigned getAttributeID(AttributeSet PAL) const {
+    if (PAL.isEmpty()) return 0;  // Null maps to zero.
+    AttributeMapType::const_iterator I = AttributeMap.find(PAL);
+    assert(I != AttributeMap.end() && "Attribute not in ValueEnumerator!");
+    return I->second;
+  }
+
+  unsigned getAttributeGroupID(AttributeSet PAL) const {
+    if (PAL.isEmpty()) return 0;  // Null maps to zero.
+    AttributeGroupMapType::const_iterator I = AttributeGroupMap.find(PAL);
+    assert(I != AttributeGroupMap.end() && "Attribute not in ValueEnumerator!");
+    return I->second;
+  }
+
+  /// getFunctionConstantRange - Return the range of values that corresponds to
+  /// function-local constants.
+  void getFunctionConstantRange(unsigned &Start, unsigned &End) const {
+    Start = FirstFuncConstantID;
+    End = FirstInstID;
+  }
+
+  const ValueList &getValues() const { return Values; }
+  const ValueList &getMDValues() const { return MDValues; }
+  const SmallVector<const MDNode *, 8> &getFunctionLocalMDValues() const {
+    return FunctionLocalMDs;
+  }
+  const TypeList &getTypes() const { return Types; }
+  const std::vector<const BasicBlock*> &getBasicBlocks() const {
+    return BasicBlocks;
+  }
+  const std::vector<AttributeSet> &getAttributes() const {
+    return Attribute;
+  }
+  const std::vector<AttributeSet> &getAttributeGroups() const {
+    return AttributeGroups;
+  }
+
+  /// getGlobalBasicBlockID - This returns the function-specific ID for the
+  /// specified basic block.  This is relatively expensive information, so it
+  /// should only be used by rare constructs such as address-of-label.
+  unsigned getGlobalBasicBlockID(const BasicBlock *BB) const;
+
+  /// incorporateFunction/purgeFunction - If you'd like to deal with a function,
+  /// use these two methods to get its data into the ValueEnumerator!
+  ///
+  void incorporateFunction(const Function &F);
+  void purgeFunction();
+
+private:
+  void OptimizeConstants(unsigned CstStart, unsigned CstEnd);
+
+  void EnumerateMDNodeOperands(const MDNode *N);
+  void EnumerateMetadata(const Value *MD);
+  void EnumerateFunctionLocalMetadata(const MDNode *N);
+  void EnumerateNamedMDNode(const NamedMDNode *NMD);
+  void EnumerateValue(const Value *V);
+  void EnumerateType(Type *T);
+  void EnumerateOperandType(const Value *V);
+  void EnumerateAttributes(AttributeSet PAL);
+
+  void EnumerateValueSymbolTable(const ValueSymbolTable &ST);
+  void EnumerateNamedMetadata(const Module *M);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
new file mode 100644
index 000000000000..c50f8b5a42ad
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -0,0 +1,943 @@
+//===----- AggressiveAntiDepBreaker.cpp - Anti-dep breaker ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AggressiveAntiDepBreaker class, which
+// implements register anti-dependence breaking during post-RA
+// scheduling. It attempts to break all anti-dependencies within a
+// block.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "post-RA-sched"
+#include "AggressiveAntiDepBreaker.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+// If DebugDiv > 0 then only break antidep with (ID % DebugDiv) == DebugMod
+static cl::opt<int>
+DebugDiv("agg-antidep-debugdiv",
+         cl::desc("Debug control for aggressive anti-dep breaker"),
+         cl::init(0), cl::Hidden);
+static cl::opt<int>
+DebugMod("agg-antidep-debugmod",
+         cl::desc("Debug control for aggressive anti-dep breaker"),
+         cl::init(0), cl::Hidden);
+
+AggressiveAntiDepState::AggressiveAntiDepState(const unsigned TargetRegs,
+                                               MachineBasicBlock *BB) :
+  NumTargetRegs(TargetRegs), GroupNodes(TargetRegs, 0),
+  GroupNodeIndices(TargetRegs, 0),
+  KillIndices(TargetRegs, 0),
+  DefIndices(TargetRegs, 0)
+{
+  const unsigned BBSize = BB->size();
+  for (unsigned i = 0; i < NumTargetRegs; ++i) {
+    // Initialize all registers to be in their own group. Initially we
+    // assign the register to the same-indexed GroupNode.
+    GroupNodeIndices[i] = i;
+    // Initialize the indices to indicate that no registers are live.
+    KillIndices[i] = ~0u;
+    DefIndices[i] = BBSize;
+  }
+}
+
+unsigned AggressiveAntiDepState::GetGroup(unsigned Reg) {
+  unsigned Node = GroupNodeIndices[Reg];
+  while (GroupNodes[Node] != Node)
+    Node = GroupNodes[Node];
+
+  return Node;
+}
+
+void AggressiveAntiDepState::GetGroupRegs(
+  unsigned Group,
+  std::vector<unsigned> &Regs,
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference> *RegRefs)
+{
+  for (unsigned Reg = 0; Reg != NumTargetRegs; ++Reg) {
+    if ((GetGroup(Reg) == Group) && (RegRefs->count(Reg) > 0))
+      Regs.push_back(Reg);
+  }
+}
+
+unsigned AggressiveAntiDepState::UnionGroups(unsigned Reg1, unsigned Reg2)
+{
+  assert(GroupNodes[0] == 0 && "GroupNode 0 not parent!");
+  assert(GroupNodeIndices[0] == 0 && "Reg 0 not in Group 0!");
+
+  // find group for each register
+  unsigned Group1 = GetGroup(Reg1);
+  unsigned Group2 = GetGroup(Reg2);
+
+  // if either group is 0, then that must become the parent
+  unsigned Parent = (Group1 == 0) ? Group1 : Group2;
+  unsigned Other = (Parent == Group1) ? Group2 : Group1;
+  GroupNodes.at(Other) = Parent;
+  return Parent;
+}
+
+unsigned AggressiveAntiDepState::LeaveGroup(unsigned Reg)
+{
+  // Create a new GroupNode for Reg. Reg's existing GroupNode must
+  // stay as is because there could be other GroupNodes referring to
+  // it.
+  unsigned idx = GroupNodes.size();
+  GroupNodes.push_back(idx);
+  GroupNodeIndices[Reg] = idx;
+  return idx;
+}
+
+bool AggressiveAntiDepState::IsLive(unsigned Reg)
+{
+  // KillIndex must be defined and DefIndex not defined for a register
+  // to be live.
+  return((KillIndices[Reg] != ~0u) && (DefIndices[Reg] == ~0u));
+}
+
+
+
+AggressiveAntiDepBreaker::
+AggressiveAntiDepBreaker(MachineFunction& MFi,
+                         const RegisterClassInfo &RCI,
+                         TargetSubtargetInfo::RegClassVector& CriticalPathRCs) :
+  AntiDepBreaker(), MF(MFi),
+  MRI(MF.getRegInfo()),
+  TII(MF.getTarget().getInstrInfo()),
+  TRI(MF.getTarget().getRegisterInfo()),
+  RegClassInfo(RCI),
+  State(NULL) {
+  /* Collect a bitset of all registers that are only broken if they
+     are on the critical path. */
+  for (unsigned i = 0, e = CriticalPathRCs.size(); i < e; ++i) {
+    BitVector CPSet = TRI->getAllocatableSet(MF, CriticalPathRCs[i]);
+    if (CriticalPathSet.none())
+      CriticalPathSet = CPSet;
+    else
+      CriticalPathSet |= CPSet;
+   }
+
+  DEBUG(dbgs() << "AntiDep Critical-Path Registers:");
+  DEBUG(for (int r = CriticalPathSet.find_first(); r != -1;
+             r = CriticalPathSet.find_next(r))
+          dbgs() << " " << TRI->getName(r));
+  DEBUG(dbgs() << '\n');
+}
+
+AggressiveAntiDepBreaker::~AggressiveAntiDepBreaker() {
+  delete State;
+}
+
+void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
+  assert(State == NULL);
+  State = new AggressiveAntiDepState(TRI->getNumRegs(), BB);
+
+  bool IsReturnBlock = (!BB->empty() && BB->back().isReturn());
+  std::vector<unsigned> &KillIndices = State->GetKillIndices();
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+
+  // Examine the live-in regs of all successors.
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         SE = BB->succ_end(); SI != SE; ++SI)
+    for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+           E = (*SI)->livein_end(); I != E; ++I) {
+      for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+        unsigned Reg = *AI;
+        State->UnionGroups(Reg, 0);
+        KillIndices[Reg] = BB->size();
+        DefIndices[Reg] = ~0u;
+      }
+    }
+
+  // Mark live-out callee-saved registers. In a return block this is
+  // all callee-saved registers. In non-return this is any
+  // callee-saved register that is not saved in the prolog.
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  BitVector Pristine = MFI->getPristineRegs(BB);
+  for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
+    unsigned Reg = *I;
+    if (!IsReturnBlock && !Pristine.test(Reg)) continue;
+    for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+      unsigned AliasReg = *AI;
+      State->UnionGroups(AliasReg, 0);
+      KillIndices[AliasReg] = BB->size();
+      DefIndices[AliasReg] = ~0u;
+    }
+  }
+}
+
+void AggressiveAntiDepBreaker::FinishBlock() {
+  delete State;
+  State = NULL;
+}
+
+void AggressiveAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
+                                       unsigned InsertPosIndex) {
+  assert(Count < InsertPosIndex && "Instruction index out of expected range!");
+
+  std::set<unsigned> PassthruRegs;
+  GetPassthruRegs(MI, PassthruRegs);
+  PrescanInstruction(MI, Count, PassthruRegs);
+  ScanInstruction(MI, Count);
+
+  DEBUG(dbgs() << "Observe: ");
+  DEBUG(MI->dump());
+  DEBUG(dbgs() << "\tRegs:");
+
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) {
+    // If Reg is current live, then mark that it can't be renamed as
+    // we don't know the extent of its live-range anymore (now that it
+    // has been scheduled). If it is not live but was defined in the
+    // previous schedule region, then set its def index to the most
+    // conservative location (i.e. the beginning of the previous
+    // schedule region).
+    if (State->IsLive(Reg)) {
+      DEBUG(if (State->GetGroup(Reg) != 0)
+              dbgs() << " " << TRI->getName(Reg) << "=g" <<
+                State->GetGroup(Reg) << "->g0(region live-out)");
+      State->UnionGroups(Reg, 0);
+    } else if ((DefIndices[Reg] < InsertPosIndex)
+               && (DefIndices[Reg] >= Count)) {
+      DefIndices[Reg] = Count;
+    }
+  }
+  DEBUG(dbgs() << '\n');
+}
+
+bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr *MI,
+                                                MachineOperand& MO)
+{
+  if (!MO.isReg() || !MO.isImplicit())
+    return false;
+
+  unsigned Reg = MO.getReg();
+  if (Reg == 0)
+    return false;
+
+  MachineOperand *Op = NULL;
+  if (MO.isDef())
+    Op = MI->findRegisterUseOperand(Reg, true);
+  else
+    Op = MI->findRegisterDefOperand(Reg);
+
+  return((Op != NULL) && Op->isImplicit());
+}
+
+void AggressiveAntiDepBreaker::GetPassthruRegs(MachineInstr *MI,
+                                           std::set<unsigned>& PassthruRegs) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    if ((MO.isDef() && MI->isRegTiedToUseOperand(i)) ||
+        IsImplicitDefUse(MI, MO)) {
+      const unsigned Reg = MO.getReg();
+      PassthruRegs.insert(Reg);
+      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+        PassthruRegs.insert(*SubRegs);
+    }
+  }
+}
+
+/// AntiDepEdges - Return in Edges the anti- and output- dependencies
+/// in SU that we want to consider for breaking.
+static void AntiDepEdges(const SUnit *SU, std::vector<const SDep*>& Edges) {
+  SmallSet<unsigned, 4> RegSet;
+  for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+       P != PE; ++P) {
+    if ((P->getKind() == SDep::Anti) || (P->getKind() == SDep::Output)) {
+      unsigned Reg = P->getReg();
+      if (RegSet.count(Reg) == 0) {
+        Edges.push_back(&*P);
+        RegSet.insert(Reg);
+      }
+    }
+  }
+}
+
+/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
+/// critical path.
+static const SUnit *CriticalPathStep(const SUnit *SU) {
+  const SDep *Next = 0;
+  unsigned NextDepth = 0;
+  // Find the predecessor edge with the greatest depth.
+  if (SU != 0) {
+    for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+         P != PE; ++P) {
+      const SUnit *PredSU = P->getSUnit();
+      unsigned PredLatency = P->getLatency();
+      unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
+      // In the case of a latency tie, prefer an anti-dependency edge over
+      // other types of edges.
+      if (NextDepth < PredTotalLatency ||
+          (NextDepth == PredTotalLatency && P->getKind() == SDep::Anti)) {
+        NextDepth = PredTotalLatency;
+        Next = &*P;
+      }
+    }
+  }
+
+  return (Next) ? Next->getSUnit() : 0;
+}
+
+void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
+                                             const char *tag,
+                                             const char *header,
+                                             const char *footer) {
+  std::vector<unsigned> &KillIndices = State->GetKillIndices();
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  if (!State->IsLive(Reg)) {
+    KillIndices[Reg] = KillIdx;
+    DefIndices[Reg] = ~0u;
+    RegRefs.erase(Reg);
+    State->LeaveGroup(Reg);
+    DEBUG(if (header != NULL) {
+        dbgs() << header << TRI->getName(Reg); header = NULL; });
+    DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag);
+  }
+  // Repeat for subregisters.
+  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+    unsigned SubregReg = *SubRegs;
+    if (!State->IsLive(SubregReg)) {
+      KillIndices[SubregReg] = KillIdx;
+      DefIndices[SubregReg] = ~0u;
+      RegRefs.erase(SubregReg);
+      State->LeaveGroup(SubregReg);
+      DEBUG(if (header != NULL) {
+          dbgs() << header << TRI->getName(Reg); header = NULL; });
+      DEBUG(dbgs() << " " << TRI->getName(SubregReg) << "->g" <<
+            State->GetGroup(SubregReg) << tag);
+    }
+  }
+
+  DEBUG(if ((header == NULL) && (footer != NULL)) dbgs() << footer);
+}
+
+void AggressiveAntiDepBreaker::PrescanInstruction(MachineInstr *MI,
+                                                  unsigned Count,
+                                             std::set<unsigned>& PassthruRegs) {
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  // Handle dead defs by simulating a last-use of the register just
+  // after the def. A dead def can occur because the def is truly
+  // dead, or because only a subregister is live at the def. If we
+  // don't do this the dead def will be incorrectly merged into the
+  // previous def.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    HandleLastUse(Reg, Count + 1, "", "\tDead Def: ", "\n");
+  }
+
+  DEBUG(dbgs() << "\tDef Groups:");
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    DEBUG(dbgs() << " " << TRI->getName(Reg) << "=g" << State->GetGroup(Reg));
+
+    // If MI's defs have a special allocation requirement, don't allow
+    // any def registers to be changed. Also assume all registers
+    // defined in a call must not be changed (ABI).
+    if (MI->isCall() || MI->hasExtraDefRegAllocReq() ||
+        TII->isPredicated(MI)) {
+      DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+      State->UnionGroups(Reg, 0);
+    }
+
+    // Any aliased that are live at this point are completely or
+    // partially defined here, so group those aliases with Reg.
+    for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+      unsigned AliasReg = *AI;
+      if (State->IsLive(AliasReg)) {
+        State->UnionGroups(Reg, AliasReg);
+        DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << "(via " <<
+              TRI->getName(AliasReg) << ")");
+      }
+    }
+
+    // Note register reference...
+    const TargetRegisterClass *RC = NULL;
+    if (i < MI->getDesc().getNumOperands())
+      RC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
+    AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
+    RegRefs.insert(std::make_pair(Reg, RR));
+  }
+
+  DEBUG(dbgs() << '\n');
+
+  // Scan the register defs for this instruction and update
+  // live-ranges.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+    // Ignore KILLs and passthru registers for liveness...
+    if (MI->isKill() || (PassthruRegs.count(Reg) != 0))
+      continue;
+
+    // Update def for Reg and aliases.
+    for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+      DefIndices[*AI] = Count;
+  }
+}
+
+void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr *MI,
+                                               unsigned Count) {
+  DEBUG(dbgs() << "\tUse Groups:");
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  // If MI's uses have special allocation requirement, don't allow
+  // any use registers to be changed. Also assume all registers
+  // used in a call must not be changed (ABI).
+  // FIXME: The issue with predicated instruction is more complex. We are being
+  // conservatively here because the kill markers cannot be trusted after
+  // if-conversion:
+  // %R6<def> = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+  // ...
+  // STR %R0, %R6<kill>, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395]
+  // %R6<def> = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12]
+  // STR %R0, %R6<kill>, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+  //
+  // The first R6 kill is not really a kill since it's killed by a predicated
+  // instruction which may not be executed. The second R6 def may or may not
+  // re-define R6 so it's not safe to change it since the last R6 use cannot be
+  // changed.
+  bool Special = MI->isCall() ||
+    MI->hasExtraSrcRegAllocReq() ||
+    TII->isPredicated(MI);
+
+  // Scan the register uses for this instruction and update
+  // live-ranges, groups and RegRefs.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isUse()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    DEBUG(dbgs() << " " << TRI->getName(Reg) << "=g" <<
+          State->GetGroup(Reg));
+
+    // It wasn't previously live but now it is, this is a kill. Forget
+    // the previous live-range information and start a new live-range
+    // for the register.
+    HandleLastUse(Reg, Count, "(last-use)");
+
+    if (Special) {
+      DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+      State->UnionGroups(Reg, 0);
+    }
+
+    // Note register reference...
+    const TargetRegisterClass *RC = NULL;
+    if (i < MI->getDesc().getNumOperands())
+      RC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
+    AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
+    RegRefs.insert(std::make_pair(Reg, RR));
+  }
+
+  DEBUG(dbgs() << '\n');
+
+  // Form a group of all defs and uses of a KILL instruction to ensure
+  // that all registers are renamed as a group.
+  if (MI->isKill()) {
+    DEBUG(dbgs() << "\tKill Group:");
+
+    unsigned FirstReg = 0;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0) continue;
+
+      if (FirstReg != 0) {
+        DEBUG(dbgs() << "=" << TRI->getName(Reg));
+        State->UnionGroups(FirstReg, Reg);
+      } else {
+        DEBUG(dbgs() << " " << TRI->getName(Reg));
+        FirstReg = Reg;
+      }
+    }
+
+    DEBUG(dbgs() << "->g" << State->GetGroup(FirstReg) << '\n');
+  }
+}
+
+BitVector AggressiveAntiDepBreaker::GetRenameRegisters(unsigned Reg) {
+  BitVector BV(TRI->getNumRegs(), false);
+  bool first = true;
+
+  // Check all references that need rewriting for Reg. For each, use
+  // the corresponding register class to narrow the set of registers
+  // that are appropriate for renaming.
+  std::pair<std::multimap<unsigned,
+                     AggressiveAntiDepState::RegisterReference>::iterator,
+            std::multimap<unsigned,
+                     AggressiveAntiDepState::RegisterReference>::iterator>
+    Range = State->GetRegRefs().equal_range(Reg);
+  for (std::multimap<unsigned,
+       AggressiveAntiDepState::RegisterReference>::iterator Q = Range.first,
+       QE = Range.second; Q != QE; ++Q) {
+    const TargetRegisterClass *RC = Q->second.RC;
+    if (RC == NULL) continue;
+
+    BitVector RCBV = TRI->getAllocatableSet(MF, RC);
+    if (first) {
+      BV |= RCBV;
+      first = false;
+    } else {
+      BV &= RCBV;
+    }
+
+    DEBUG(dbgs() << " " << RC->getName());
+  }
+
+  return BV;
+}
+
+bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
+                                unsigned AntiDepGroupIndex,
+                                RenameOrderType& RenameOrder,
+                                std::map<unsigned, unsigned> &RenameMap) {
+  std::vector<unsigned> &KillIndices = State->GetKillIndices();
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  // Collect all referenced registers in the same group as
+  // AntiDepReg. These all need to be renamed together if we are to
+  // break the anti-dependence.
+  std::vector<unsigned> Regs;
+  State->GetGroupRegs(AntiDepGroupIndex, Regs, &RegRefs);
+  assert(Regs.size() > 0 && "Empty register group!");
+  if (Regs.size() == 0)
+    return false;
+
+  // Find the "superest" register in the group. At the same time,
+  // collect the BitVector of registers that can be used to rename
+  // each register.
+  DEBUG(dbgs() << "\tRename Candidates for Group g" << AntiDepGroupIndex
+        << ":\n");
+  std::map<unsigned, BitVector> RenameRegisterMap;
+  unsigned SuperReg = 0;
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    unsigned Reg = Regs[i];
+    if ((SuperReg == 0) || TRI->isSuperRegister(SuperReg, Reg))
+      SuperReg = Reg;
+
+    // If Reg has any references, then collect possible rename regs
+    if (RegRefs.count(Reg) > 0) {
+      DEBUG(dbgs() << "\t\t" << TRI->getName(Reg) << ":");
+
+      BitVector BV = GetRenameRegisters(Reg);
+      RenameRegisterMap.insert(std::pair<unsigned, BitVector>(Reg, BV));
+
+      DEBUG(dbgs() << " ::");
+      DEBUG(for (int r = BV.find_first(); r != -1; r = BV.find_next(r))
+              dbgs() << " " << TRI->getName(r));
+      DEBUG(dbgs() << "\n");
+    }
+  }
+
+  // All group registers should be a subreg of SuperReg.
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    unsigned Reg = Regs[i];
+    if (Reg == SuperReg) continue;
+    bool IsSub = TRI->isSubRegister(SuperReg, Reg);
+    assert(IsSub && "Expecting group subregister");
+    if (!IsSub)
+      return false;
+  }
+
+#ifndef NDEBUG
+  // If DebugDiv > 0 then only rename (renamecnt % DebugDiv) == DebugMod
+  if (DebugDiv > 0) {
+    static int renamecnt = 0;
+    if (renamecnt++ % DebugDiv != DebugMod)
+      return false;
+
+    dbgs() << "*** Performing rename " << TRI->getName(SuperReg) <<
+      " for debug ***\n";
+  }
+#endif
+
+  // Check each possible rename register for SuperReg in round-robin
+  // order. If that register is available, and the corresponding
+  // registers are available for the other group subregisters, then we
+  // can use those registers to rename.
+
+  // FIXME: Using getMinimalPhysRegClass is very conservative. We should
+  // check every use of the register and find the largest register class
+  // that can be used in all of them.
+  const TargetRegisterClass *SuperRC =
+    TRI->getMinimalPhysRegClass(SuperReg, MVT::Other);
+
+  ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(SuperRC);
+  if (Order.empty()) {
+    DEBUG(dbgs() << "\tEmpty Super Regclass!!\n");
+    return false;
+  }
+
+  DEBUG(dbgs() << "\tFind Registers:");
+
+  if (RenameOrder.count(SuperRC) == 0)
+    RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size()));
+
+  unsigned OrigR = RenameOrder[SuperRC];
+  unsigned EndR = ((OrigR == Order.size()) ? 0 : OrigR);
+  unsigned R = OrigR;
+  do {
+    if (R == 0) R = Order.size();
+    --R;
+    const unsigned NewSuperReg = Order[R];
+    // Don't consider non-allocatable registers
+    if (!MRI.isAllocatable(NewSuperReg)) continue;
+    // Don't replace a register with itself.
+    if (NewSuperReg == SuperReg) continue;
+
+    DEBUG(dbgs() << " [" << TRI->getName(NewSuperReg) << ':');
+    RenameMap.clear();
+
+    // For each referenced group register (which must be a SuperReg or
+    // a subregister of SuperReg), find the corresponding subregister
+    // of NewSuperReg and make sure it is free to be renamed.
+    for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+      unsigned Reg = Regs[i];
+      unsigned NewReg = 0;
+      if (Reg == SuperReg) {
+        NewReg = NewSuperReg;
+      } else {
+        unsigned NewSubRegIdx = TRI->getSubRegIndex(SuperReg, Reg);
+        if (NewSubRegIdx != 0)
+          NewReg = TRI->getSubReg(NewSuperReg, NewSubRegIdx);
+      }
+
+      DEBUG(dbgs() << " " << TRI->getName(NewReg));
+
+      // Check if Reg can be renamed to NewReg.
+      BitVector BV = RenameRegisterMap[Reg];
+      if (!BV.test(NewReg)) {
+        DEBUG(dbgs() << "(no rename)");
+        goto next_super_reg;
+      }
+
+      // If NewReg is dead and NewReg's most recent def is not before
+      // Regs's kill, it's safe to replace Reg with NewReg. We
+      // must also check all aliases of NewReg, because we can't define a
+      // register when any sub or super is already live.
+      if (State->IsLive(NewReg) || (KillIndices[Reg] > DefIndices[NewReg])) {
+        DEBUG(dbgs() << "(live)");
+        goto next_super_reg;
+      } else {
+        bool found = false;
+        for (MCRegAliasIterator AI(NewReg, TRI, false); AI.isValid(); ++AI) {
+          unsigned AliasReg = *AI;
+          if (State->IsLive(AliasReg) ||
+              (KillIndices[Reg] > DefIndices[AliasReg])) {
+            DEBUG(dbgs() << "(alias " << TRI->getName(AliasReg) << " live)");
+            found = true;
+            break;
+          }
+        }
+        if (found)
+          goto next_super_reg;
+      }
+
+      // Record that 'Reg' can be renamed to 'NewReg'.
+      RenameMap.insert(std::pair<unsigned, unsigned>(Reg, NewReg));
+    }
+
+    // If we fall-out here, then every register in the group can be
+    // renamed, as recorded in RenameMap.
+    RenameOrder.erase(SuperRC);
+    RenameOrder.insert(RenameOrderType::value_type(SuperRC, R));
+    DEBUG(dbgs() << "]\n");
+    return true;
+
+  next_super_reg:
+    DEBUG(dbgs() << ']');
+  } while (R != EndR);
+
+  DEBUG(dbgs() << '\n');
+
+  // No registers are free and available!
+  return false;
+}
+
+/// BreakAntiDependencies - Identifiy anti-dependencies within the
+/// ScheduleDAG and break them by renaming registers.
+///
+unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
+                              const std::vector<SUnit>& SUnits,
+                              MachineBasicBlock::iterator Begin,
+                              MachineBasicBlock::iterator End,
+                              unsigned InsertPosIndex,
+                              DbgValueVector &DbgValues) {
+
+  std::vector<unsigned> &KillIndices = State->GetKillIndices();
+  std::vector<unsigned> &DefIndices = State->GetDefIndices();
+  std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+    RegRefs = State->GetRegRefs();
+
+  // The code below assumes that there is at least one instruction,
+  // so just duck out immediately if the block is empty.
+  if (SUnits.empty()) return 0;
+
+  // For each regclass the next register to use for renaming.
+  RenameOrderType RenameOrder;
+
+  // ...need a map from MI to SUnit.
+  std::map<MachineInstr *, const SUnit *> MISUnitMap;
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    const SUnit *SU = &SUnits[i];
+    MISUnitMap.insert(std::pair<MachineInstr *, const SUnit *>(SU->getInstr(),
+                                                               SU));
+  }
+
+  // Track progress along the critical path through the SUnit graph as
+  // we walk the instructions. This is needed for regclasses that only
+  // break critical-path anti-dependencies.
+  const SUnit *CriticalPathSU = 0;
+  MachineInstr *CriticalPathMI = 0;
+  if (CriticalPathSet.any()) {
+    for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+      const SUnit *SU = &SUnits[i];
+      if (!CriticalPathSU ||
+          ((SU->getDepth() + SU->Latency) >
+           (CriticalPathSU->getDepth() + CriticalPathSU->Latency))) {
+        CriticalPathSU = SU;
+      }
+    }
+
+    CriticalPathMI = CriticalPathSU->getInstr();
+  }
+
+#ifndef NDEBUG
+  DEBUG(dbgs() << "\n===== Aggressive anti-dependency breaking\n");
+  DEBUG(dbgs() << "Available regs:");
+  for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
+    if (!State->IsLive(Reg))
+      DEBUG(dbgs() << " " << TRI->getName(Reg));
+  }
+  DEBUG(dbgs() << '\n');
+#endif
+
+  // Attempt to break anti-dependence edges. Walk the instructions
+  // from the bottom up, tracking information about liveness as we go
+  // to help determine which registers are available.
+  unsigned Broken = 0;
+  unsigned Count = InsertPosIndex - 1;
+  for (MachineBasicBlock::iterator I = End, E = Begin;
+       I != E; --Count) {
+    MachineInstr *MI = --I;
+
+    if (MI->isDebugValue())
+      continue;
+
+    DEBUG(dbgs() << "Anti: ");
+    DEBUG(MI->dump());
+
+    std::set<unsigned> PassthruRegs;
+    GetPassthruRegs(MI, PassthruRegs);
+
+    // Process the defs in MI...
+    PrescanInstruction(MI, Count, PassthruRegs);
+
+    // The dependence edges that represent anti- and output-
+    // dependencies that are candidates for breaking.
+    std::vector<const SDep *> Edges;
+    const SUnit *PathSU = MISUnitMap[MI];
+    AntiDepEdges(PathSU, Edges);
+
+    // If MI is not on the critical path, then we don't rename
+    // registers in the CriticalPathSet.
+    BitVector *ExcludeRegs = NULL;
+    if (MI == CriticalPathMI) {
+      CriticalPathSU = CriticalPathStep(CriticalPathSU);
+      CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : 0;
+    } else {
+      ExcludeRegs = &CriticalPathSet;
+    }
+
+    // Ignore KILL instructions (they form a group in ScanInstruction
+    // but don't cause any anti-dependence breaking themselves)
+    if (!MI->isKill()) {
+      // Attempt to break each anti-dependency...
+      for (unsigned i = 0, e = Edges.size(); i != e; ++i) {
+        const SDep *Edge = Edges[i];
+        SUnit *NextSU = Edge->getSUnit();
+
+        if ((Edge->getKind() != SDep::Anti) &&
+            (Edge->getKind() != SDep::Output)) continue;
+
+        unsigned AntiDepReg = Edge->getReg();
+        DEBUG(dbgs() << "\tAntidep reg: " << TRI->getName(AntiDepReg));
+        assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
+
+        if (!MRI.isAllocatable(AntiDepReg)) {
+          // Don't break anti-dependencies on non-allocatable registers.
+          DEBUG(dbgs() << " (non-allocatable)\n");
+          continue;
+        } else if ((ExcludeRegs != NULL) && ExcludeRegs->test(AntiDepReg)) {
+          // Don't break anti-dependencies for critical path registers
+          // if not on the critical path
+          DEBUG(dbgs() << " (not critical-path)\n");
+          continue;
+        } else if (PassthruRegs.count(AntiDepReg) != 0) {
+          // If the anti-dep register liveness "passes-thru", then
+          // don't try to change it. It will be changed along with
+          // the use if required to break an earlier antidep.
+          DEBUG(dbgs() << " (passthru)\n");
+          continue;
+        } else {
+          // No anti-dep breaking for implicit deps
+          MachineOperand *AntiDepOp = MI->findRegisterDefOperand(AntiDepReg);
+          assert(AntiDepOp != NULL &&
+                 "Can't find index for defined register operand");
+          if ((AntiDepOp == NULL) || AntiDepOp->isImplicit()) {
+            DEBUG(dbgs() << " (implicit)\n");
+            continue;
+          }
+
+          // If the SUnit has other dependencies on the SUnit that
+          // it anti-depends on, don't bother breaking the
+          // anti-dependency since those edges would prevent such
+          // units from being scheduled past each other
+          // regardless.
+          //
+          // Also, if there are dependencies on other SUnits with the
+          // same register as the anti-dependency, don't attempt to
+          // break it.
+          for (SUnit::const_pred_iterator P = PathSU->Preds.begin(),
+                 PE = PathSU->Preds.end(); P != PE; ++P) {
+            if (P->getSUnit() == NextSU ?
+                (P->getKind() != SDep::Anti || P->getReg() != AntiDepReg) :
+                (P->getKind() == SDep::Data && P->getReg() == AntiDepReg)) {
+              AntiDepReg = 0;
+              break;
+            }
+          }
+          for (SUnit::const_pred_iterator P = PathSU->Preds.begin(),
+                 PE = PathSU->Preds.end(); P != PE; ++P) {
+            if ((P->getSUnit() == NextSU) && (P->getKind() != SDep::Anti) &&
+                (P->getKind() != SDep::Output)) {
+              DEBUG(dbgs() << " (real dependency)\n");
+              AntiDepReg = 0;
+              break;
+            } else if ((P->getSUnit() != NextSU) &&
+                       (P->getKind() == SDep::Data) &&
+                       (P->getReg() == AntiDepReg)) {
+              DEBUG(dbgs() << " (other dependency)\n");
+              AntiDepReg = 0;
+              break;
+            }
+          }
+
+          if (AntiDepReg == 0) continue;
+        }
+
+        assert(AntiDepReg != 0);
+        if (AntiDepReg == 0) continue;
+
+        // Determine AntiDepReg's register group.
+        const unsigned GroupIndex = State->GetGroup(AntiDepReg);
+        if (GroupIndex == 0) {
+          DEBUG(dbgs() << " (zero group)\n");
+          continue;
+        }
+
+        DEBUG(dbgs() << '\n');
+
+        // Look for a suitable register to use to break the anti-dependence.
+        std::map<unsigned, unsigned> RenameMap;
+        if (FindSuitableFreeRegisters(GroupIndex, RenameOrder, RenameMap)) {
+          DEBUG(dbgs() << "\tBreaking anti-dependence edge on "
+                << TRI->getName(AntiDepReg) << ":");
+
+          // Handle each group register...
+          for (std::map<unsigned, unsigned>::iterator
+                 S = RenameMap.begin(), E = RenameMap.end(); S != E; ++S) {
+            unsigned CurrReg = S->first;
+            unsigned NewReg = S->second;
+
+            DEBUG(dbgs() << " " << TRI->getName(CurrReg) << "->" <<
+                  TRI->getName(NewReg) << "(" <<
+                  RegRefs.count(CurrReg) << " refs)");
+
+            // Update the references to the old register CurrReg to
+            // refer to the new register NewReg.
+            std::pair<std::multimap<unsigned,
+                           AggressiveAntiDepState::RegisterReference>::iterator,
+                      std::multimap<unsigned,
+                           AggressiveAntiDepState::RegisterReference>::iterator>
+              Range = RegRefs.equal_range(CurrReg);
+            for (std::multimap<unsigned,
+                 AggressiveAntiDepState::RegisterReference>::iterator
+                   Q = Range.first, QE = Range.second; Q != QE; ++Q) {
+              Q->second.Operand->setReg(NewReg);
+              // If the SU for the instruction being updated has debug
+              // information related to the anti-dependency register, make
+              // sure to update that as well.
+              const SUnit *SU = MISUnitMap[Q->second.Operand->getParent()];
+              if (!SU) continue;
+              for (DbgValueVector::iterator DVI = DbgValues.begin(),
+                     DVE = DbgValues.end(); DVI != DVE; ++DVI)
+                if (DVI->second == Q->second.Operand->getParent())
+                  UpdateDbgValue(DVI->first, AntiDepReg, NewReg);
+            }
+
+            // We just went back in time and modified history; the
+            // liveness information for CurrReg is now inconsistent. Set
+            // the state as if it were dead.
+            State->UnionGroups(NewReg, 0);
+            RegRefs.erase(NewReg);
+            DefIndices[NewReg] = DefIndices[CurrReg];
+            KillIndices[NewReg] = KillIndices[CurrReg];
+
+            State->UnionGroups(CurrReg, 0);
+            RegRefs.erase(CurrReg);
+            DefIndices[CurrReg] = KillIndices[CurrReg];
+            KillIndices[CurrReg] = ~0u;
+            assert(((KillIndices[CurrReg] == ~0u) !=
+                    (DefIndices[CurrReg] == ~0u)) &&
+                   "Kill and Def maps aren't consistent for AntiDepReg!");
+          }
+
+          ++Broken;
+          DEBUG(dbgs() << '\n');
+        }
+      }
+    }
+
+    ScanInstruction(MI, Count);
+  }
+
+  return Broken;
+}
diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
new file mode 100644
index 000000000000..6683630fba6d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -0,0 +1,184 @@
+//=- llvm/CodeGen/AggressiveAntiDepBreaker.h - Anti-Dep Support -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AggressiveAntiDepBreaker class, which
+// implements register anti-dependence breaking during post-RA
+// scheduling. It attempts to break all anti-dependencies within a
+// block.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+#define LLVM_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+
+#include "AntiDepBreaker.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <map>
+
+namespace llvm {
+class RegisterClassInfo;
+
+  /// Class AggressiveAntiDepState
+  /// Contains all the state necessary for anti-dep breaking.
+  class AggressiveAntiDepState {
+  public:
+    /// RegisterReference - Information about a register reference
+    /// within a liverange
+    typedef struct {
+      /// Operand - The registers operand
+      MachineOperand *Operand;
+      /// RC - The register class
+      const TargetRegisterClass *RC;
+    } RegisterReference;
+
+  private:
+    /// NumTargetRegs - Number of non-virtual target registers
+    /// (i.e. TRI->getNumRegs()).
+    const unsigned NumTargetRegs;
+
+    /// GroupNodes - Implements a disjoint-union data structure to
+    /// form register groups. A node is represented by an index into
+    /// the vector. A node can "point to" itself to indicate that it
+    /// is the parent of a group, or point to another node to indicate
+    /// that it is a member of the same group as that node.
+    std::vector<unsigned> GroupNodes;
+
+    /// GroupNodeIndices - For each register, the index of the GroupNode
+    /// currently representing the group that the register belongs to.
+    /// Register 0 is always represented by the 0 group, a group
+    /// composed of registers that are not eligible for anti-aliasing.
+    std::vector<unsigned> GroupNodeIndices;
+
+    /// RegRefs - Map registers to all their references within a live range.
+    std::multimap<unsigned, RegisterReference> RegRefs;
+
+    /// KillIndices - The index of the most recent kill (proceding bottom-up),
+    /// or ~0u if the register is not live.
+    std::vector<unsigned> KillIndices;
+
+    /// DefIndices - The index of the most recent complete def (proceding bottom
+    /// up), or ~0u if the register is live.
+    std::vector<unsigned> DefIndices;
+
+  public:
+    AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB);
+
+    /// GetKillIndices - Return the kill indices.
+    std::vector<unsigned> &GetKillIndices() { return KillIndices; }
+
+    /// GetDefIndices - Return the define indices.
+    std::vector<unsigned> &GetDefIndices() { return DefIndices; }
+
+    /// GetRegRefs - Return the RegRefs map.
+    std::multimap<unsigned, RegisterReference>& GetRegRefs() { return RegRefs; }
+
+    // GetGroup - Get the group for a register. The returned value is
+    // the index of the GroupNode representing the group.
+    unsigned GetGroup(unsigned Reg);
+
+    // GetGroupRegs - Return a vector of the registers belonging to a
+    // group. If RegRefs is non-NULL then only included referenced registers.
+    void GetGroupRegs(
+       unsigned Group,
+       std::vector<unsigned> &Regs,
+       std::multimap<unsigned,
+         AggressiveAntiDepState::RegisterReference> *RegRefs);
+
+    // UnionGroups - Union Reg1's and Reg2's groups to form a new
+    // group. Return the index of the GroupNode representing the
+    // group.
+    unsigned UnionGroups(unsigned Reg1, unsigned Reg2);
+
+    // LeaveGroup - Remove a register from its current group and place
+    // it alone in its own group. Return the index of the GroupNode
+    // representing the registers new group.
+    unsigned LeaveGroup(unsigned Reg);
+
+    /// IsLive - Return true if Reg is live
+    bool IsLive(unsigned Reg);
+  };
+
+
+  /// Class AggressiveAntiDepBreaker
+  class AggressiveAntiDepBreaker : public AntiDepBreaker {
+    MachineFunction& MF;
+    MachineRegisterInfo &MRI;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const RegisterClassInfo &RegClassInfo;
+
+    /// CriticalPathSet - The set of registers that should only be
+    /// renamed if they are on the critical path.
+    BitVector CriticalPathSet;
+
+    /// State - The state used to identify and rename anti-dependence
+    /// registers.
+    AggressiveAntiDepState *State;
+
+  public:
+    AggressiveAntiDepBreaker(MachineFunction& MFi,
+                          const RegisterClassInfo &RCI,
+                          TargetSubtargetInfo::RegClassVector& CriticalPathRCs);
+    ~AggressiveAntiDepBreaker();
+
+    /// Start - Initialize anti-dep breaking for a new basic block.
+    void StartBlock(MachineBasicBlock *BB);
+
+    /// BreakAntiDependencies - Identifiy anti-dependencies along the critical
+    /// path
+    /// of the ScheduleDAG and break them by renaming registers.
+    ///
+    unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+                                   MachineBasicBlock::iterator Begin,
+                                   MachineBasicBlock::iterator End,
+                                   unsigned InsertPosIndex,
+                                   DbgValueVector &DbgValues);
+
+    /// Observe - Update liveness information to account for the current
+    /// instruction, which will not be scheduled.
+    ///
+    void Observe(MachineInstr *MI, unsigned Count, unsigned InsertPosIndex);
+
+    /// Finish - Finish anti-dep breaking for a basic block.
+    void FinishBlock();
+
+  private:
+    /// Keep track of a position in the allocation order for each regclass.
+    typedef std::map<const TargetRegisterClass *, unsigned> RenameOrderType;
+
+    /// IsImplicitDefUse - Return true if MO represents a register
+    /// that is both implicitly used and defined in MI
+    bool IsImplicitDefUse(MachineInstr *MI, MachineOperand& MO);
+
+    /// GetPassthruRegs - If MI implicitly def/uses a register, then
+    /// return that register and all subregisters.
+    void GetPassthruRegs(MachineInstr *MI, std::set<unsigned>& PassthruRegs);
+
+    void HandleLastUse(unsigned Reg, unsigned KillIdx, const char *tag,
+                       const char *header =NULL, const char *footer =NULL);
+
+    void PrescanInstruction(MachineInstr *MI, unsigned Count,
+                            std::set<unsigned>& PassthruRegs);
+    void ScanInstruction(MachineInstr *MI, unsigned Count);
+    BitVector GetRenameRegisters(unsigned Reg);
+    bool FindSuitableFreeRegisters(unsigned AntiDepGroupIndex,
+                                   RenameOrderType& RenameOrder,
+                                   std::map<unsigned, unsigned> &RenameMap);
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AllocationOrder.cpp b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp
new file mode 100644
index 000000000000..3fa1f8ff206c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -0,0 +1,52 @@
+//===-- llvm/CodeGen/AllocationOrder.cpp - Allocation Order ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an allocation order for virtual registers.
+//
+// The preferred allocation order for a virtual register depends on allocation
+// hints and target hooks. The AllocationOrder class encapsulates all of that.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "AllocationOrder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Compare VirtRegMap::getRegAllocPref().
+AllocationOrder::AllocationOrder(unsigned VirtReg,
+                                 const VirtRegMap &VRM,
+                                 const RegisterClassInfo &RegClassInfo)
+  : Pos(0) {
+  const MachineFunction &MF = VRM.getMachineFunction();
+  const TargetRegisterInfo *TRI = &VRM.getTargetRegInfo();
+  Order = RegClassInfo.getOrder(MF.getRegInfo().getRegClass(VirtReg));
+  TRI->getRegAllocationHints(VirtReg, Order, Hints, MF, &VRM);
+  rewind();
+
+  DEBUG({
+    if (!Hints.empty()) {
+      dbgs() << "hints:";
+      for (unsigned I = 0, E = Hints.size(); I != E; ++I)
+        dbgs() << ' ' << PrintReg(Hints[I], TRI);
+      dbgs() << '\n';
+    }
+  });
+#ifndef NDEBUG
+  for (unsigned I = 0, E = Hints.size(); I != E; ++I)
+    assert(std::find(Order.begin(), Order.end(), Hints[I]) != Order.end() &&
+           "Target hint is outside allocation order.");
+#endif
+}
diff --git a/contrib/llvm/lib/CodeGen/AllocationOrder.h b/contrib/llvm/lib/CodeGen/AllocationOrder.h
new file mode 100644
index 000000000000..aed461a7ed02
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AllocationOrder.h
@@ -0,0 +1,85 @@
+//===-- llvm/CodeGen/AllocationOrder.h - Allocation Order -*- C++ -*-------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an allocation order for virtual registers.
+//
+// The preferred allocation order for a virtual register depends on allocation
+// hints and target hooks. The AllocationOrder class encapsulates all of that.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_ALLOCATIONORDER_H
+#define LLVM_CODEGEN_ALLOCATIONORDER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+
+class RegisterClassInfo;
+class VirtRegMap;
+
+class AllocationOrder {
+  SmallVector<MCPhysReg, 16> Hints;
+  ArrayRef<MCPhysReg> Order;
+  int Pos;
+
+public:
+  /// Create a new AllocationOrder for VirtReg.
+  /// @param VirtReg      Virtual register to allocate for.
+  /// @param VRM          Virtual register map for function.
+  /// @param RegClassInfo Information about reserved and allocatable registers.
+  AllocationOrder(unsigned VirtReg,
+                  const VirtRegMap &VRM,
+                  const RegisterClassInfo &RegClassInfo);
+
+  /// Get the allocation order without reordered hints.
+  ArrayRef<MCPhysReg> getOrder() const { return Order; }
+
+  /// Return the next physical register in the allocation order, or 0.
+  /// It is safe to call next() again after it returned 0, it will keep
+  /// returning 0 until rewind() is called.
+  unsigned next() {
+    if (Pos < 0)
+      return Hints.end()[Pos++];
+    while (Pos < int(Order.size())) {
+      unsigned Reg = Order[Pos++];
+      if (!isHint(Reg))
+        return Reg;
+    }
+    return 0;
+  }
+
+  /// As next(), but allow duplicates to be returned, and stop before the
+  /// Limit'th register in the RegisterClassInfo allocation order.
+  ///
+  /// This can produce more than Limit registers if there are hints.
+  unsigned nextWithDups(unsigned Limit) {
+    if (Pos < 0)
+      return Hints.end()[Pos++];
+    if (Pos < int(Limit))
+      return Order[Pos++];
+    return 0;
+  }
+
+  /// Start over from the beginning.
+  void rewind() { Pos = -int(Hints.size()); }
+
+  /// Return true if the last register returned from next() was a preferred register.
+  bool isHint() const { return Pos <= 0; }
+
+  /// Return true if PhysReg is a preferred register.
+  bool isHint(unsigned PhysReg) const {
+    return std::find(Hints.begin(), Hints.end(), PhysReg) != Hints.end();
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/Analysis.cpp b/contrib/llvm/lib/CodeGen/Analysis.cpp
new file mode 100644
index 000000000000..dd7282c0ad97
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/Analysis.cpp
@@ -0,0 +1,349 @@
+//===-- Analysis.cpp - CodeGen LLVM IR Analysis Utilities -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines several CodeGen-specific LLVM IR analysis utilties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+/// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence
+/// of insertvalue or extractvalue indices that identify a member, return
+/// the linearized index of the start of the member.
+///
+unsigned llvm::ComputeLinearIndex(Type *Ty,
+                                  const unsigned *Indices,
+                                  const unsigned *IndicesEnd,
+                                  unsigned CurIndex) {
+  // Base case: We're done.
+  if (Indices && Indices == IndicesEnd)
+    return CurIndex;
+
+  // Given a struct type, recursively traverse the elements.
+  if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (StructType::element_iterator EB = STy->element_begin(),
+                                      EI = EB,
+                                      EE = STy->element_end();
+        EI != EE; ++EI) {
+      if (Indices && *Indices == unsigned(EI - EB))
+        return ComputeLinearIndex(*EI, Indices+1, IndicesEnd, CurIndex);
+      CurIndex = ComputeLinearIndex(*EI, 0, 0, CurIndex);
+    }
+    return CurIndex;
+  }
+  // Given an array type, recursively traverse the elements.
+  else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    Type *EltTy = ATy->getElementType();
+    for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) {
+      if (Indices && *Indices == i)
+        return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
+      CurIndex = ComputeLinearIndex(EltTy, 0, 0, CurIndex);
+    }
+    return CurIndex;
+  }
+  // We haven't found the type we're looking for, so keep searching.
+  return CurIndex + 1;
+}
+
+/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
+/// EVTs that represent all the individual underlying
+/// non-aggregate types that comprise it.
+///
+/// If Offsets is non-null, it points to a vector to be filled in
+/// with the in-memory offsets of each of the individual values.
+///
+void llvm::ComputeValueVTs(const TargetLowering &TLI, Type *Ty,
+                           SmallVectorImpl<EVT> &ValueVTs,
+                           SmallVectorImpl<uint64_t> *Offsets,
+                           uint64_t StartingOffset) {
+  // Given a struct type, recursively traverse the elements.
+  if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    const StructLayout *SL = TLI.getDataLayout()->getStructLayout(STy);
+    for (StructType::element_iterator EB = STy->element_begin(),
+                                      EI = EB,
+                                      EE = STy->element_end();
+         EI != EE; ++EI)
+      ComputeValueVTs(TLI, *EI, ValueVTs, Offsets,
+                      StartingOffset + SL->getElementOffset(EI - EB));
+    return;
+  }
+  // Given an array type, recursively traverse the elements.
+  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    Type *EltTy = ATy->getElementType();
+    uint64_t EltSize = TLI.getDataLayout()->getTypeAllocSize(EltTy);
+    for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
+      ComputeValueVTs(TLI, EltTy, ValueVTs, Offsets,
+                      StartingOffset + i * EltSize);
+    return;
+  }
+  // Interpret void as zero return values.
+  if (Ty->isVoidTy())
+    return;
+  // Base case: we can get an EVT for this LLVM IR type.
+  ValueVTs.push_back(TLI.getValueType(Ty));
+  if (Offsets)
+    Offsets->push_back(StartingOffset);
+}
+
+/// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V.
+GlobalVariable *llvm::ExtractTypeInfo(Value *V) {
+  V = V->stripPointerCasts();
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(V);
+
+  if (GV && GV->getName() == "llvm.eh.catch.all.value") {
+    assert(GV->hasInitializer() &&
+           "The EH catch-all value must have an initializer");
+    Value *Init = GV->getInitializer();
+    GV = dyn_cast<GlobalVariable>(Init);
+    if (!GV) V = cast<ConstantPointerNull>(Init);
+  }
+
+  assert((GV || isa<ConstantPointerNull>(V)) &&
+         "TypeInfo must be a global variable or NULL");
+  return GV;
+}
+
+/// hasInlineAsmMemConstraint - Return true if the inline asm instruction being
+/// processed uses a memory 'm' constraint.
+bool
+llvm::hasInlineAsmMemConstraint(InlineAsm::ConstraintInfoVector &CInfos,
+                                const TargetLowering &TLI) {
+  for (unsigned i = 0, e = CInfos.size(); i != e; ++i) {
+    InlineAsm::ConstraintInfo &CI = CInfos[i];
+    for (unsigned j = 0, ee = CI.Codes.size(); j != ee; ++j) {
+      TargetLowering::ConstraintType CType = TLI.getConstraintType(CI.Codes[j]);
+      if (CType == TargetLowering::C_Memory)
+        return true;
+    }
+
+    // Indirect operand accesses access memory.
+    if (CI.isIndirect)
+      return true;
+  }
+
+  return false;
+}
+
+/// getFCmpCondCode - Return the ISD condition code corresponding to
+/// the given LLVM IR floating-point condition code.  This includes
+/// consideration of global floating-point math flags.
+///
+ISD::CondCode llvm::getFCmpCondCode(FCmpInst::Predicate Pred) {
+  switch (Pred) {
+  case FCmpInst::FCMP_FALSE: return ISD::SETFALSE;
+  case FCmpInst::FCMP_OEQ:   return ISD::SETOEQ;
+  case FCmpInst::FCMP_OGT:   return ISD::SETOGT;
+  case FCmpInst::FCMP_OGE:   return ISD::SETOGE;
+  case FCmpInst::FCMP_OLT:   return ISD::SETOLT;
+  case FCmpInst::FCMP_OLE:   return ISD::SETOLE;
+  case FCmpInst::FCMP_ONE:   return ISD::SETONE;
+  case FCmpInst::FCMP_ORD:   return ISD::SETO;
+  case FCmpInst::FCMP_UNO:   return ISD::SETUO;
+  case FCmpInst::FCMP_UEQ:   return ISD::SETUEQ;
+  case FCmpInst::FCMP_UGT:   return ISD::SETUGT;
+  case FCmpInst::FCMP_UGE:   return ISD::SETUGE;
+  case FCmpInst::FCMP_ULT:   return ISD::SETULT;
+  case FCmpInst::FCMP_ULE:   return ISD::SETULE;
+  case FCmpInst::FCMP_UNE:   return ISD::SETUNE;
+  case FCmpInst::FCMP_TRUE:  return ISD::SETTRUE;
+  default: llvm_unreachable("Invalid FCmp predicate opcode!");
+  }
+}
+
+ISD::CondCode llvm::getFCmpCodeWithoutNaN(ISD::CondCode CC) {
+  switch (CC) {
+    case ISD::SETOEQ: case ISD::SETUEQ: return ISD::SETEQ;
+    case ISD::SETONE: case ISD::SETUNE: return ISD::SETNE;
+    case ISD::SETOLT: case ISD::SETULT: return ISD::SETLT;
+    case ISD::SETOLE: case ISD::SETULE: return ISD::SETLE;
+    case ISD::SETOGT: case ISD::SETUGT: return ISD::SETGT;
+    case ISD::SETOGE: case ISD::SETUGE: return ISD::SETGE;
+    default: return CC;
+  }
+}
+
+/// getICmpCondCode - Return the ISD condition code corresponding to
+/// the given LLVM IR integer condition code.
+///
+ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) {
+  switch (Pred) {
+  case ICmpInst::ICMP_EQ:  return ISD::SETEQ;
+  case ICmpInst::ICMP_NE:  return ISD::SETNE;
+  case ICmpInst::ICMP_SLE: return ISD::SETLE;
+  case ICmpInst::ICMP_ULE: return ISD::SETULE;
+  case ICmpInst::ICMP_SGE: return ISD::SETGE;
+  case ICmpInst::ICMP_UGE: return ISD::SETUGE;
+  case ICmpInst::ICMP_SLT: return ISD::SETLT;
+  case ICmpInst::ICMP_ULT: return ISD::SETULT;
+  case ICmpInst::ICMP_SGT: return ISD::SETGT;
+  case ICmpInst::ICMP_UGT: return ISD::SETUGT;
+  default:
+    llvm_unreachable("Invalid ICmp predicate opcode!");
+  }
+}
+
+
+/// getNoopInput - If V is a noop (i.e., lowers to no machine code), look
+/// through it (and any transitive noop operands to it) and return its input
+/// value.  This is used to determine if a tail call can be formed.
+///
+static const Value *getNoopInput(const Value *V, const TargetLowering &TLI) {
+  // If V is not an instruction, it can't be looked through.
+  const Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0 || !I->hasOneUse() || I->getNumOperands() == 0) return V;
+  
+  Value *Op = I->getOperand(0);
+
+  // Look through truly no-op truncates.
+  if (isa<TruncInst>(I) &&
+      TLI.isTruncateFree(I->getOperand(0)->getType(), I->getType()))
+    return getNoopInput(I->getOperand(0), TLI);
+  
+  // Look through truly no-op bitcasts.
+  if (isa<BitCastInst>(I)) {
+    // No type change at all.
+    if (Op->getType() == I->getType())
+      return getNoopInput(Op, TLI);
+
+    // Pointer to pointer cast.
+    if (Op->getType()->isPointerTy() && I->getType()->isPointerTy())
+      return getNoopInput(Op, TLI);
+    
+    if (isa<VectorType>(Op->getType()) && isa<VectorType>(I->getType()) &&
+        TLI.isTypeLegal(EVT::getEVT(Op->getType())) &&
+        TLI.isTypeLegal(EVT::getEVT(I->getType())))
+      return getNoopInput(Op, TLI);
+  }
+  
+  // Look through inttoptr.
+  if (isa<IntToPtrInst>(I) && !isa<VectorType>(I->getType())) {
+    // Make sure this isn't a truncating or extending cast.  We could support
+    // this eventually, but don't bother for now.
+    if (TLI.getPointerTy().getSizeInBits() == 
+          cast<IntegerType>(Op->getType())->getBitWidth())
+      return getNoopInput(Op, TLI);
+  }
+
+  // Look through ptrtoint.
+  if (isa<PtrToIntInst>(I) && !isa<VectorType>(I->getType())) {
+    // Make sure this isn't a truncating or extending cast.  We could support
+    // this eventually, but don't bother for now.
+    if (TLI.getPointerTy().getSizeInBits() == 
+        cast<IntegerType>(I->getType())->getBitWidth())
+      return getNoopInput(Op, TLI);
+  }
+
+
+  // Otherwise it's not something we can look through.
+  return V;
+}
+
+
+/// Test if the given instruction is in a position to be optimized
+/// with a tail-call. This roughly means that it's in a block with
+/// a return and there's nothing that needs to be scheduled
+/// between it and the return.
+///
+/// This function only tests target-independent requirements.
+bool llvm::isInTailCallPosition(ImmutableCallSite CS,const TargetLowering &TLI){
+  const Instruction *I = CS.getInstruction();
+  const BasicBlock *ExitBB = I->getParent();
+  const TerminatorInst *Term = ExitBB->getTerminator();
+  const ReturnInst *Ret = dyn_cast<ReturnInst>(Term);
+
+  // The block must end in a return statement or unreachable.
+  //
+  // FIXME: Decline tailcall if it's not guaranteed and if the block ends in
+  // an unreachable, for now. The way tailcall optimization is currently
+  // implemented means it will add an epilogue followed by a jump. That is
+  // not profitable. Also, if the callee is a special function (e.g.
+  // longjmp on x86), it can end up causing miscompilation that has not
+  // been fully understood.
+  if (!Ret &&
+      (!TLI.getTargetMachine().Options.GuaranteedTailCallOpt ||
+       !isa<UnreachableInst>(Term)))
+    return false;
+
+  // If I will have a chain, make sure no other instruction that will have a
+  // chain interposes between I and the return.
+  if (I->mayHaveSideEffects() || I->mayReadFromMemory() ||
+      !isSafeToSpeculativelyExecute(I))
+    for (BasicBlock::const_iterator BBI = prior(prior(ExitBB->end())); ;
+         --BBI) {
+      if (&*BBI == I)
+        break;
+      // Debug info intrinsics do not get in the way of tail call optimization.
+      if (isa<DbgInfoIntrinsic>(BBI))
+        continue;
+      if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() ||
+          !isSafeToSpeculativelyExecute(BBI))
+        return false;
+    }
+
+  // If the block ends with a void return or unreachable, it doesn't matter
+  // what the call's return type is.
+  if (!Ret || Ret->getNumOperands() == 0) return true;
+
+  // If the return value is undef, it doesn't matter what the call's
+  // return type is.
+  if (isa<UndefValue>(Ret->getOperand(0))) return true;
+
+  // Conservatively require the attributes of the call to match those of
+  // the return. Ignore noalias because it doesn't affect the call sequence.
+  const Function *F = ExitBB->getParent();
+  AttributeSet CallerAttrs = F->getAttributes();
+  if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex).
+        removeAttribute(Attribute::NoAlias) !=
+      AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex).
+        removeAttribute(Attribute::NoAlias))
+    return false;
+
+  // It's not safe to eliminate the sign / zero extension of the return value.
+  if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
+      CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+    return false;
+
+  // Otherwise, make sure the unmodified return value of I is the return value.
+  // We handle two cases: multiple return values + scalars.
+  Value *RetVal = Ret->getOperand(0);
+  if (!isa<InsertValueInst>(RetVal) || !isa<StructType>(RetVal->getType()))
+    // Handle scalars first.
+    return getNoopInput(Ret->getOperand(0), TLI) == I;
+  
+  // If this is an aggregate return, look through the insert/extract values and
+  // see if each is transparent.
+  for (unsigned i = 0, e =cast<StructType>(RetVal->getType())->getNumElements();
+       i != e; ++i) {
+    const Value *InScalar = FindInsertedValue(RetVal, i);
+    if (InScalar == 0) return false;
+    InScalar = getNoopInput(InScalar, TLI);
+    
+    // If the scalar value being inserted is an extractvalue of the right index
+    // from the call, then everything is good.
+    const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(InScalar);
+    if (EVI == 0 || EVI->getOperand(0) != I || EVI->getNumIndices() != 1 ||
+        EVI->getIndices()[0] != i)
+      return false;
+  }
+  
+  return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/AntiDepBreaker.h b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
new file mode 100644
index 000000000000..df47f984d578
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
@@ -0,0 +1,71 @@
+//=- llvm/CodeGen/AntiDepBreaker.h - Anti-Dependence Breaking -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AntiDepBreaker class, which implements
+// anti-dependence breaking heuristics for post-register-allocation scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_ANTIDEPBREAKER_H
+#define LLVM_CODEGEN_ANTIDEPBREAKER_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <vector>
+
+namespace llvm {
+
+/// AntiDepBreaker - This class works into conjunction with the
+/// post-RA scheduler to rename registers to break register
+/// anti-dependencies.
+class AntiDepBreaker {
+public:
+  typedef std::vector<std::pair<MachineInstr *, MachineInstr *> > 
+    DbgValueVector;
+
+  virtual ~AntiDepBreaker();
+
+  /// Start - Initialize anti-dep breaking for a new basic block.
+  virtual void StartBlock(MachineBasicBlock *BB) =0;
+
+  /// BreakAntiDependencies - Identifiy anti-dependencies within a
+  /// basic-block region and break them by renaming registers. Return
+  /// the number of anti-dependencies broken.
+  ///
+  virtual unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+                                         MachineBasicBlock::iterator Begin,
+                                         MachineBasicBlock::iterator End,
+                                         unsigned InsertPosIndex,
+                                         DbgValueVector &DbgValues) = 0;
+  
+  /// Observe - Update liveness information to account for the current
+  /// instruction, which will not be scheduled.
+  ///
+  virtual void Observe(MachineInstr *MI, unsigned Count,
+                       unsigned InsertPosIndex) =0;
+  
+  /// Finish - Finish anti-dep breaking for a basic block.
+  virtual void FinishBlock() =0;
+
+  /// UpdateDbgValue - Update DBG_VALUE if dependency breaker is updating
+  /// other machine instruction to use NewReg.
+  void UpdateDbgValue(MachineInstr *MI, unsigned OldReg, unsigned NewReg) {
+    assert (MI->isDebugValue() && "MI is not DBG_VALUE!");
+    if (MI && MI->getOperand(0).isReg() && MI->getOperand(0).getReg() == OldReg)
+      MI->getOperand(0).setReg(NewReg);
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
new file mode 100644
index 000000000000..188047d94f48
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -0,0 +1,136 @@
+//===-- CodeGen/AsmPrinter/ARMException.cpp - ARM EHABI Exception Impl ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableARMEHABIDescriptors("arm-enable-ehabi-descriptors", cl::Hidden,
+  cl::desc("Generate ARM EHABI tables with unwinding descriptors"),
+  cl::init(false));
+
+
+ARMException::ARMException(AsmPrinter *A)
+  : DwarfException(A) {}
+
+ARMException::~ARMException() {}
+
+void ARMException::EndModule() {
+}
+
+/// BeginFunction - Gather pre-function exception information. Assumes it's
+/// being emitted immediately after the function entry point.
+void ARMException::BeginFunction(const MachineFunction *MF) {
+  Asm->OutStreamer.EmitFnStart();
+  if (Asm->MF->getFunction()->needsUnwindTableEntry())
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
+                                                  Asm->getFunctionNumber()));
+}
+
+/// EndFunction - Gather and emit post-function exception information.
+///
+void ARMException::EndFunction() {
+  if (!Asm->MF->getFunction()->needsUnwindTableEntry())
+    Asm->OutStreamer.EmitCantUnwind();
+  else {
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
+                                                  Asm->getFunctionNumber()));
+
+    if (EnableARMEHABIDescriptors) {
+      // Map all labels and get rid of any dead landing pads.
+      MMI->TidyLandingPads();
+
+      if (!MMI->getLandingPads().empty()) {
+        // Emit references to personality.
+        if (const Function * Personality =
+            MMI->getPersonalities()[MMI->getPersonalityIndex()]) {
+          MCSymbol *PerSym = Asm->Mang->getSymbol(Personality);
+          Asm->OutStreamer.EmitSymbolAttribute(PerSym, MCSA_Global);
+          Asm->OutStreamer.EmitPersonality(PerSym);
+        }
+
+        // Emit .handlerdata directive.
+        Asm->OutStreamer.EmitHandlerData();
+
+        // Emit actual exception table
+        EmitExceptionTable();
+      }
+    }
+  }
+
+  Asm->OutStreamer.EmitFnEnd();
+}
+
+void ARMException::EmitTypeInfos(unsigned TTypeEncoding) {
+  const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+  const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+
+  bool VerboseAsm = Asm->OutStreamer.isVerboseAsm();
+
+  int Entry = 0;
+  // Emit the Catch TypeInfos.
+  if (VerboseAsm && !TypeInfos.empty()) {
+    Asm->OutStreamer.AddComment(">> Catch TypeInfos <<");
+    Asm->OutStreamer.AddBlankLine();
+    Entry = TypeInfos.size();
+  }
+
+  for (std::vector<const GlobalVariable *>::const_reverse_iterator
+         I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) {
+    const GlobalVariable *GV = *I;
+    if (VerboseAsm)
+      Asm->OutStreamer.AddComment("TypeInfo " + Twine(Entry--));
+    Asm->EmitTTypeReference(GV, TTypeEncoding);
+  }
+
+  // Emit the Exception Specifications.
+  if (VerboseAsm && !FilterIds.empty()) {
+    Asm->OutStreamer.AddComment(">> Filter TypeInfos <<");
+    Asm->OutStreamer.AddBlankLine();
+    Entry = 0;
+  }
+  for (std::vector<unsigned>::const_iterator
+         I = FilterIds.begin(), E = FilterIds.end(); I < E; ++I) {
+    unsigned TypeID = *I;
+    if (VerboseAsm) {
+      --Entry;
+      if (TypeID != 0)
+        Asm->OutStreamer.AddComment("FilterInfo " + Twine(Entry));
+    }
+
+    Asm->EmitTTypeReference((TypeID == 0 ? 0 : TypeInfos[TypeID - 1]),
+                            TTypeEncoding);
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
new file mode 100644
index 000000000000..d4a745d985e8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -0,0 +1,2170 @@
+//===-- AsmPrinter.cpp - Common AsmPrinter code ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AsmPrinter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "DwarfDebug.h"
+#include "DwarfException.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static const char *DWARFGroupName = "DWARF Emission";
+static const char *DbgTimerName = "DWARF Debug Writer";
+static const char *EHTimerName = "DWARF Exception Writer";
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+char AsmPrinter::ID = 0;
+
+typedef DenseMap<GCStrategy*,GCMetadataPrinter*> gcp_map_type;
+static gcp_map_type &getGCMap(void *&P) {
+  if (P == 0)
+    P = new gcp_map_type();
+  return *(gcp_map_type*)P;
+}
+
+
+/// getGVAlignmentLog2 - Return the alignment to use for the specified global
+/// value in log2 form.  This rounds up to the preferred alignment if possible
+/// and legal.
+static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &TD,
+                                   unsigned InBits = 0) {
+  unsigned NumBits = 0;
+  if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+    NumBits = TD.getPreferredAlignmentLog(GVar);
+
+  // If InBits is specified, round it to it.
+  if (InBits > NumBits)
+    NumBits = InBits;
+
+  // If the GV has a specified alignment, take it into account.
+  if (GV->getAlignment() == 0)
+    return NumBits;
+
+  unsigned GVAlign = Log2_32(GV->getAlignment());
+
+  // If the GVAlign is larger than NumBits, or if we are required to obey
+  // NumBits because the GV has an assigned section, obey it.
+  if (GVAlign > NumBits || GV->hasSection())
+    NumBits = GVAlign;
+  return NumBits;
+}
+
+AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer)
+  : MachineFunctionPass(ID),
+    TM(tm), MAI(tm.getMCAsmInfo()),
+    OutContext(Streamer.getContext()),
+    OutStreamer(Streamer),
+    LastMI(0), LastFn(0), Counter(~0U), SetCounter(0) {
+  DD = 0; DE = 0; MMI = 0; LI = 0;
+  CurrentFnSym = CurrentFnSymForSize = 0;
+  GCMetadataPrinters = 0;
+  VerboseAsm = Streamer.isVerboseAsm();
+}
+
+AsmPrinter::~AsmPrinter() {
+  assert(DD == 0 && DE == 0 && "Debug/EH info didn't get finalized");
+
+  if (GCMetadataPrinters != 0) {
+    gcp_map_type &GCMap = getGCMap(GCMetadataPrinters);
+
+    for (gcp_map_type::iterator I = GCMap.begin(), E = GCMap.end(); I != E; ++I)
+      delete I->second;
+    delete &GCMap;
+    GCMetadataPrinters = 0;
+  }
+
+  delete &OutStreamer;
+}
+
+/// getFunctionNumber - Return a unique ID for the current function.
+///
+unsigned AsmPrinter::getFunctionNumber() const {
+  return MF->getFunctionNumber();
+}
+
+const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const {
+  return TM.getTargetLowering()->getObjFileLowering();
+}
+
+/// getDataLayout - Return information about data layout.
+const DataLayout &AsmPrinter::getDataLayout() const {
+  return *TM.getDataLayout();
+}
+
+/// getCurrentSection() - Return the current section we are emitting to.
+const MCSection *AsmPrinter::getCurrentSection() const {
+  return OutStreamer.getCurrentSection();
+}
+
+
+
+void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+  AU.addRequired<MachineModuleInfo>();
+  AU.addRequired<GCModuleInfo>();
+  if (isVerbose())
+    AU.addRequired<MachineLoopInfo>();
+}
+
+bool AsmPrinter::doInitialization(Module &M) {
+  OutStreamer.InitStreamer();
+
+  MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+  MMI->AnalyzeModule(M);
+
+  // Initialize TargetLoweringObjectFile.
+  const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
+    .Initialize(OutContext, TM);
+
+  Mang = new Mangler(OutContext, *TM.getDataLayout());
+
+  // Allow the target to emit any magic that it wants at the start of the file.
+  EmitStartOfAsmFile(M);
+
+  // Very minimal debug info. It is ignored if we emit actual debug info. If we
+  // don't, this at least helps the user find where a global came from.
+  if (MAI->hasSingleParameterDotFile()) {
+    // .file "foo.c"
+    OutStreamer.EmitFileDirective(M.getModuleIdentifier());
+  }
+
+  GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+  assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+  for (GCModuleInfo::iterator I = MI->begin(), E = MI->end(); I != E; ++I)
+    if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I))
+      MP->beginAssembly(*this);
+
+  // Emit module-level inline asm if it exists.
+  if (!M.getModuleInlineAsm().empty()) {
+    OutStreamer.AddComment("Start of file scope inline assembly");
+    OutStreamer.AddBlankLine();
+    EmitInlineAsm(M.getModuleInlineAsm()+"\n");
+    OutStreamer.AddComment("End of file scope inline assembly");
+    OutStreamer.AddBlankLine();
+  }
+
+  if (MAI->doesSupportDebugInformation())
+    DD = new DwarfDebug(this, &M);
+
+  switch (MAI->getExceptionHandlingType()) {
+  case ExceptionHandling::None:
+    return false;
+  case ExceptionHandling::SjLj:
+  case ExceptionHandling::DwarfCFI:
+    DE = new DwarfCFIException(this);
+    return false;
+  case ExceptionHandling::ARM:
+    DE = new ARMException(this);
+    return false;
+  case ExceptionHandling::Win64:
+    DE = new Win64Exception(this);
+    return false;
+  }
+
+  llvm_unreachable("Unknown exception type.");
+}
+
+void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
+  switch ((GlobalValue::LinkageTypes)Linkage) {
+  case GlobalValue::CommonLinkage:
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage:
+  case GlobalValue::LinkOnceODRAutoHideLinkage:
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:
+  case GlobalValue::LinkerPrivateWeakLinkage:
+    if (MAI->getWeakDefDirective() != 0) {
+      // .globl _foo
+      OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
+
+      if ((GlobalValue::LinkageTypes)Linkage !=
+          GlobalValue::LinkOnceODRAutoHideLinkage)
+        // .weak_definition _foo
+        OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefinition);
+      else
+        OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefAutoPrivate);
+    } else if (MAI->getLinkOnceDirective() != 0) {
+      // .globl _foo
+      OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
+      //NOTE: linkonce is handled by the section the symbol was assigned to.
+    } else {
+      // .weak _foo
+      OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Weak);
+    }
+    break;
+  case GlobalValue::DLLExportLinkage:
+  case GlobalValue::AppendingLinkage:
+    // FIXME: appending linkage variables should go into a section of
+    // their name or something.  For now, just emit them as external.
+  case GlobalValue::ExternalLinkage:
+    // If external or appending, declare as a global symbol.
+    // .globl _foo
+    OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
+    break;
+  case GlobalValue::PrivateLinkage:
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::LinkerPrivateLinkage:
+    break;
+  default:
+    llvm_unreachable("Unknown linkage type!");
+  }
+}
+
+
+/// EmitGlobalVariable - Emit the specified global variable to the .s file.
+void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  if (GV->hasInitializer()) {
+    // Check to see if this is a special global used by LLVM, if so, emit it.
+    if (EmitSpecialLLVMGlobal(GV))
+      return;
+
+    if (isVerbose()) {
+      WriteAsOperand(OutStreamer.GetCommentOS(), GV,
+                     /*PrintType=*/false, GV->getParent());
+      OutStreamer.GetCommentOS() << '\n';
+    }
+  }
+
+  MCSymbol *GVSym = Mang->getSymbol(GV);
+  EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+
+  if (!GV->hasInitializer())   // External globals require no extra code.
+    return;
+
+  if (MAI->hasDotTypeDotSizeDirective())
+    OutStreamer.EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
+
+  SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
+
+  const DataLayout *TD = TM.getDataLayout();
+  uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+
+  // If the alignment is specified, we *must* obey it.  Overaligning a global
+  // with a specified alignment is a prompt way to break globals emitted to
+  // sections and expected to be contiguous (e.g. ObjC metadata).
+  unsigned AlignLog = getGVAlignmentLog2(GV, *TD);
+
+  // Handle common and BSS local symbols (.lcomm).
+  if (GVKind.isCommon() || GVKind.isBSSLocal()) {
+    if (Size == 0) Size = 1;   // .comm Foo, 0 is undefined, avoid it.
+    unsigned Align = 1 << AlignLog;
+
+    // Handle common symbols.
+    if (GVKind.isCommon()) {
+      if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
+        Align = 0;
+
+      // .comm _foo, 42, 4
+      OutStreamer.EmitCommonSymbol(GVSym, Size, Align);
+      return;
+    }
+
+    // Handle local BSS symbols.
+    if (MAI->hasMachoZeroFillDirective()) {
+      const MCSection *TheSection =
+        getObjFileLowering().SectionForGlobal(GV, GVKind, Mang, TM);
+      // .zerofill __DATA, __bss, _foo, 400, 5
+      OutStreamer.EmitZerofill(TheSection, GVSym, Size, Align);
+      return;
+    }
+
+    // Use .lcomm only if it supports user-specified alignment.
+    // Otherwise, while it would still be correct to use .lcomm in some
+    // cases (e.g. when Align == 1), the external assembler might enfore
+    // some -unknown- default alignment behavior, which could cause
+    // spurious differences between external and integrated assembler.
+    // Prefer to simply fall back to .local / .comm in this case.
+    if (MAI->getLCOMMDirectiveAlignmentType() != LCOMM::NoAlignment) {
+      // .lcomm _foo, 42
+      OutStreamer.EmitLocalCommonSymbol(GVSym, Size, Align);
+      return;
+    }
+
+    if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
+      Align = 0;
+
+    // .local _foo
+    OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Local);
+    // .comm _foo, 42, 4
+    OutStreamer.EmitCommonSymbol(GVSym, Size, Align);
+    return;
+  }
+
+  const MCSection *TheSection =
+    getObjFileLowering().SectionForGlobal(GV, GVKind, Mang, TM);
+
+  // Handle the zerofill directive on darwin, which is a special form of BSS
+  // emission.
+  if (GVKind.isBSSExtern() && MAI->hasMachoZeroFillDirective()) {
+    if (Size == 0) Size = 1;  // zerofill of 0 bytes is undefined.
+
+    // .globl _foo
+    OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
+    // .zerofill __DATA, __common, _foo, 400, 5
+    OutStreamer.EmitZerofill(TheSection, GVSym, Size, 1 << AlignLog);
+    return;
+  }
+
+  // Handle thread local data for mach-o which requires us to output an
+  // additional structure of data and mangle the original symbol so that we
+  // can reference it later.
+  //
+  // TODO: This should become an "emit thread local global" method on TLOF.
+  // All of this macho specific stuff should be sunk down into TLOFMachO and
+  // stuff like "TLSExtraDataSection" should no longer be part of the parent
+  // TLOF class.  This will also make it more obvious that stuff like
+  // MCStreamer::EmitTBSSSymbol is macho specific and only called from macho
+  // specific code.
+  if (GVKind.isThreadLocal() && MAI->hasMachoTBSSDirective()) {
+    // Emit the .tbss symbol
+    MCSymbol *MangSym =
+      OutContext.GetOrCreateSymbol(GVSym->getName() + Twine("$tlv$init"));
+
+    if (GVKind.isThreadBSS())
+      OutStreamer.EmitTBSSSymbol(TheSection, MangSym, Size, 1 << AlignLog);
+    else if (GVKind.isThreadData()) {
+      OutStreamer.SwitchSection(TheSection);
+
+      EmitAlignment(AlignLog, GV);
+      OutStreamer.EmitLabel(MangSym);
+
+      EmitGlobalConstant(GV->getInitializer());
+    }
+
+    OutStreamer.AddBlankLine();
+
+    // Emit the variable struct for the runtime.
+    const MCSection *TLVSect
+      = getObjFileLowering().getTLSExtraDataSection();
+
+    OutStreamer.SwitchSection(TLVSect);
+    // Emit the linkage here.
+    EmitLinkage(GV->getLinkage(), GVSym);
+    OutStreamer.EmitLabel(GVSym);
+
+    // Three pointers in size:
+    //   - __tlv_bootstrap - used to make sure support exists
+    //   - spare pointer, used when mapped by the runtime
+    //   - pointer to mangled symbol above with initializer
+    unsigned PtrSize = TD->getPointerSizeInBits()/8;
+    OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"),
+				PtrSize);
+    OutStreamer.EmitIntValue(0, PtrSize);
+    OutStreamer.EmitSymbolValue(MangSym, PtrSize);
+
+    OutStreamer.AddBlankLine();
+    return;
+  }
+
+  OutStreamer.SwitchSection(TheSection);
+
+  EmitLinkage(GV->getLinkage(), GVSym);
+  EmitAlignment(AlignLog, GV);
+
+  OutStreamer.EmitLabel(GVSym);
+
+  EmitGlobalConstant(GV->getInitializer());
+
+  if (MAI->hasDotTypeDotSizeDirective())
+    // .size foo, 42
+    OutStreamer.EmitELFSize(GVSym, MCConstantExpr::Create(Size, OutContext));
+
+  OutStreamer.AddBlankLine();
+}
+
+/// EmitFunctionHeader - This method emits the header for the current
+/// function.
+void AsmPrinter::EmitFunctionHeader() {
+  // Print out constants referenced by the function
+  EmitConstantPool();
+
+  // Print the 'header' of function.
+  const Function *F = MF->getFunction();
+
+  OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+  EmitVisibility(CurrentFnSym, F->getVisibility());
+
+  EmitLinkage(F->getLinkage(), CurrentFnSym);
+  EmitAlignment(MF->getAlignment(), F);
+
+  if (MAI->hasDotTypeDotSizeDirective())
+    OutStreamer.EmitSymbolAttribute(CurrentFnSym, MCSA_ELF_TypeFunction);
+
+  if (isVerbose()) {
+    WriteAsOperand(OutStreamer.GetCommentOS(), F,
+                   /*PrintType=*/false, F->getParent());
+    OutStreamer.GetCommentOS() << '\n';
+  }
+
+  // Emit the CurrentFnSym.  This is a virtual function to allow targets to
+  // do their wild and crazy things as required.
+  EmitFunctionEntryLabel();
+
+  // If the function had address-taken blocks that got deleted, then we have
+  // references to the dangling symbols.  Emit them at the start of the function
+  // so that we don't get references to undefined symbols.
+  std::vector<MCSymbol*> DeadBlockSyms;
+  MMI->takeDeletedSymbolsForFunction(F, DeadBlockSyms);
+  for (unsigned i = 0, e = DeadBlockSyms.size(); i != e; ++i) {
+    OutStreamer.AddComment("Address taken block that was later removed");
+    OutStreamer.EmitLabel(DeadBlockSyms[i]);
+  }
+
+  // Add some workaround for linkonce linkage on Cygwin\MinGW.
+  if (MAI->getLinkOnceDirective() != 0 &&
+      (F->hasLinkOnceLinkage() || F->hasWeakLinkage())) {
+    // FIXME: What is this?
+    MCSymbol *FakeStub =
+      OutContext.GetOrCreateSymbol(Twine("Lllvm$workaround$fake$stub$")+
+                                   CurrentFnSym->getName());
+    OutStreamer.EmitLabel(FakeStub);
+  }
+
+  // Emit pre-function debug and/or EH information.
+  if (DE) {
+    NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled);
+    DE->BeginFunction(MF);
+  }
+  if (DD) {
+    NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
+    DD->beginFunction(MF);
+  }
+}
+
+/// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
+/// function.  This can be overridden by targets as required to do custom stuff.
+void AsmPrinter::EmitFunctionEntryLabel() {
+  // The function label could have already been emitted if two symbols end up
+  // conflicting due to asm renaming.  Detect this and emit an error.
+  if (CurrentFnSym->isUndefined())
+    return OutStreamer.EmitLabel(CurrentFnSym);
+
+  report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+                     "' label emitted multiple times to assembly file");
+}
+
+/// emitComments - Pretty-print comments for instructions.
+static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+  const MachineFunction *MF = MI.getParent()->getParent();
+  const TargetMachine &TM = MF->getTarget();
+
+  // Check for spills and reloads
+  int FI;
+
+  const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+
+  // We assume a single instruction only has a spill or reload, not
+  // both.
+  const MachineMemOperand *MMO;
+  if (TM.getInstrInfo()->isLoadFromStackSlotPostFE(&MI, FI)) {
+    if (FrameInfo->isSpillSlotObjectIndex(FI)) {
+      MMO = *MI.memoperands_begin();
+      CommentOS << MMO->getSize() << "-byte Reload\n";
+    }
+  } else if (TM.getInstrInfo()->hasLoadFromStackSlot(&MI, MMO, FI)) {
+    if (FrameInfo->isSpillSlotObjectIndex(FI))
+      CommentOS << MMO->getSize() << "-byte Folded Reload\n";
+  } else if (TM.getInstrInfo()->isStoreToStackSlotPostFE(&MI, FI)) {
+    if (FrameInfo->isSpillSlotObjectIndex(FI)) {
+      MMO = *MI.memoperands_begin();
+      CommentOS << MMO->getSize() << "-byte Spill\n";
+    }
+  } else if (TM.getInstrInfo()->hasStoreToStackSlot(&MI, MMO, FI)) {
+    if (FrameInfo->isSpillSlotObjectIndex(FI))
+      CommentOS << MMO->getSize() << "-byte Folded Spill\n";
+  }
+
+  // Check for spill-induced copies
+  if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
+    CommentOS << " Reload Reuse\n";
+}
+
+/// emitImplicitDef - This method emits the specified machine instruction
+/// that is an implicit def.
+static void emitImplicitDef(const MachineInstr *MI, AsmPrinter &AP) {
+  unsigned RegNo = MI->getOperand(0).getReg();
+  AP.OutStreamer.AddComment(Twine("implicit-def: ") +
+                            AP.TM.getRegisterInfo()->getName(RegNo));
+  AP.OutStreamer.AddBlankLine();
+}
+
+static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
+  std::string Str = "kill:";
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &Op = MI->getOperand(i);
+    assert(Op.isReg() && "KILL instruction must have only register operands");
+    Str += ' ';
+    Str += AP.TM.getRegisterInfo()->getName(Op.getReg());
+    Str += (Op.isDef() ? "<def>" : "<kill>");
+  }
+  AP.OutStreamer.AddComment(Str);
+  AP.OutStreamer.AddBlankLine();
+}
+
+/// emitDebugValueComment - This method handles the target-independent form
+/// of DBG_VALUE, returning true if it was able to do so.  A false return
+/// means the target will need to handle MI in EmitInstruction.
+static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
+  // This code handles only the 3-operand target-independent form.
+  if (MI->getNumOperands() != 3)
+    return false;
+
+  SmallString<128> Str;
+  raw_svector_ostream OS(Str);
+  OS << '\t' << AP.MAI->getCommentString() << "DEBUG_VALUE: ";
+
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode*>(MI->getOperand(2).getMetadata()));
+  if (V.getContext().isSubprogram())
+    OS << DISubprogram(V.getContext()).getDisplayName() << ":";
+  OS << V.getName() << " <- ";
+
+  // Register or immediate value. Register 0 means undef.
+  if (MI->getOperand(0).isFPImm()) {
+    APFloat APF = APFloat(MI->getOperand(0).getFPImm()->getValueAPF());
+    if (MI->getOperand(0).getFPImm()->getType()->isFloatTy()) {
+      OS << (double)APF.convertToFloat();
+    } else if (MI->getOperand(0).getFPImm()->getType()->isDoubleTy()) {
+      OS << APF.convertToDouble();
+    } else {
+      // There is no good way to print long double.  Convert a copy to
+      // double.  Ah well, it's only a comment.
+      bool ignored;
+      APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+                  &ignored);
+      OS << "(long double) " << APF.convertToDouble();
+    }
+  } else if (MI->getOperand(0).isImm()) {
+    OS << MI->getOperand(0).getImm();
+  } else if (MI->getOperand(0).isCImm()) {
+    MI->getOperand(0).getCImm()->getValue().print(OS, false /*isSigned*/);
+  } else {
+    assert(MI->getOperand(0).isReg() && "Unknown operand type");
+    if (MI->getOperand(0).getReg() == 0) {
+      // Suppress offset, it is not meaningful here.
+      OS << "undef";
+      // NOTE: Want this comment at start of line, don't emit with AddComment.
+      AP.OutStreamer.EmitRawText(OS.str());
+      return true;
+    }
+    OS << AP.TM.getRegisterInfo()->getName(MI->getOperand(0).getReg());
+  }
+
+  OS << '+' << MI->getOperand(1).getImm();
+  // NOTE: Want this comment at start of line, don't emit with AddComment.
+  AP.OutStreamer.EmitRawText(OS.str());
+  return true;
+}
+
+AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() {
+  if (MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI &&
+      MF->getFunction()->needsUnwindTableEntry())
+    return CFI_M_EH;
+
+  if (MMI->hasDebugInfo())
+    return CFI_M_Debug;
+
+  return CFI_M_None;
+}
+
+bool AsmPrinter::needsSEHMoves() {
+  return MAI->getExceptionHandlingType() == ExceptionHandling::Win64 &&
+    MF->getFunction()->needsUnwindTableEntry();
+}
+
+bool AsmPrinter::needsRelocationsForDwarfStringPool() const {
+  return MAI->doesDwarfUseRelocationsAcrossSections();
+}
+
+void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
+  MCSymbol *Label = MI.getOperand(0).getMCSymbol();
+
+  if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI)
+    return;
+
+  if (needsCFIMoves() == CFI_M_None)
+    return;
+
+  if (MMI->getCompactUnwindEncoding() != 0)
+    OutStreamer.EmitCompactUnwindEncoding(MMI->getCompactUnwindEncoding());
+
+  MachineModuleInfo &MMI = MF->getMMI();
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  bool FoundOne = false;
+  (void)FoundOne;
+  for (std::vector<MachineMove>::iterator I = Moves.begin(),
+         E = Moves.end(); I != E; ++I) {
+    if (I->getLabel() == Label) {
+      EmitCFIFrameMove(*I);
+      FoundOne = true;
+    }
+  }
+  assert(FoundOne);
+}
+
+/// EmitFunctionBody - This method emits the body and trailer for a
+/// function.
+void AsmPrinter::EmitFunctionBody() {
+  // Emit target-specific gunk before the function body.
+  EmitFunctionBodyStart();
+
+  bool ShouldPrintDebugScopes = DD && MMI->hasDebugInfo();
+
+  // Print out code for the function.
+  bool HasAnyRealCode = false;
+  const MachineInstr *LastMI = 0;
+  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+       I != E; ++I) {
+    // Print a label for the basic block.
+    EmitBasicBlockStart(I);
+    for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
+         II != IE; ++II) {
+      LastMI = II;
+
+      // Print the assembly for the instruction.
+      if (!II->isLabel() && !II->isImplicitDef() && !II->isKill() &&
+          !II->isDebugValue()) {
+        HasAnyRealCode = true;
+        ++EmittedInsts;
+      }
+
+      if (ShouldPrintDebugScopes) {
+        NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
+        DD->beginInstruction(II);
+      }
+
+      if (isVerbose())
+        emitComments(*II, OutStreamer.GetCommentOS());
+
+      switch (II->getOpcode()) {
+      case TargetOpcode::PROLOG_LABEL:
+        emitPrologLabel(*II);
+        break;
+
+      case TargetOpcode::EH_LABEL:
+      case TargetOpcode::GC_LABEL:
+        OutStreamer.EmitLabel(II->getOperand(0).getMCSymbol());
+        break;
+      case TargetOpcode::INLINEASM:
+        EmitInlineAsm(II);
+        break;
+      case TargetOpcode::DBG_VALUE:
+        if (isVerbose()) {
+          if (!emitDebugValueComment(II, *this))
+            EmitInstruction(II);
+        }
+        break;
+      case TargetOpcode::IMPLICIT_DEF:
+        if (isVerbose()) emitImplicitDef(II, *this);
+        break;
+      case TargetOpcode::KILL:
+        if (isVerbose()) emitKill(II, *this);
+        break;
+      default:
+        if (!TM.hasMCUseLoc())
+          MCLineEntry::Make(&OutStreamer, getCurrentSection());
+
+        EmitInstruction(II);
+        break;
+      }
+
+      if (ShouldPrintDebugScopes) {
+        NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
+        DD->endInstruction(II);
+      }
+    }
+  }
+
+  // If the last instruction was a prolog label, then we have a situation where
+  // we emitted a prolog but no function body. This results in the ending prolog
+  // label equaling the end of function label and an invalid "row" in the
+  // FDE. We need to emit a noop in this situation so that the FDE's rows are
+  // valid.
+  bool RequiresNoop = LastMI && LastMI->isPrologLabel();
+
+  // If the function is empty and the object file uses .subsections_via_symbols,
+  // then we need to emit *something* to the function body to prevent the
+  // labels from collapsing together.  Just emit a noop.
+  if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode) || RequiresNoop) {
+    MCInst Noop;
+    TM.getInstrInfo()->getNoopForMachoTarget(Noop);
+    if (Noop.getOpcode()) {
+      OutStreamer.AddComment("avoids zero-length function");
+      OutStreamer.EmitInstruction(Noop);
+    } else  // Target not mc-ized yet.
+      OutStreamer.EmitRawText(StringRef("\tnop\n"));
+  }
+
+  const Function *F = MF->getFunction();
+  for (Function::const_iterator i = F->begin(), e = F->end(); i != e; ++i) {
+    const BasicBlock *BB = i;
+    if (!BB->hasAddressTaken())
+      continue;
+    MCSymbol *Sym = GetBlockAddressSymbol(BB);
+    if (Sym->isDefined())
+      continue;
+    OutStreamer.AddComment("Address of block that was removed by CodeGen");
+    OutStreamer.EmitLabel(Sym);
+  }
+
+  // Emit target-specific gunk after the function body.
+  EmitFunctionBodyEnd();
+
+  // If the target wants a .size directive for the size of the function, emit
+  // it.
+  if (MAI->hasDotTypeDotSizeDirective()) {
+    // Create a symbol for the end of function, so we can get the size as
+    // difference between the function label and the temp label.
+    MCSymbol *FnEndLabel = OutContext.CreateTempSymbol();
+    OutStreamer.EmitLabel(FnEndLabel);
+
+    const MCExpr *SizeExp =
+      MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(FnEndLabel, OutContext),
+                              MCSymbolRefExpr::Create(CurrentFnSymForSize,
+                                                      OutContext),
+                              OutContext);
+    OutStreamer.EmitELFSize(CurrentFnSym, SizeExp);
+  }
+
+  // Emit post-function debug information.
+  if (DD) {
+    NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
+    DD->endFunction(MF);
+  }
+  if (DE) {
+    NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled);
+    DE->EndFunction();
+  }
+  MMI->EndFunction();
+
+  // Print out jump tables referenced by the function.
+  EmitJumpTableInfo();
+
+  OutStreamer.AddBlankLine();
+}
+
+/// getDebugValueLocation - Get location information encoded by DBG_VALUE
+/// operands.
+MachineLocation AsmPrinter::
+getDebugValueLocation(const MachineInstr *MI) const {
+  // Target specific DBG_VALUE instructions are handled by each target.
+  return MachineLocation();
+}
+
+/// EmitDwarfRegOp - Emit dwarf register operation.
+void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
+
+  for (MCSuperRegIterator SR(MLoc.getReg(), TRI); SR.isValid() && Reg < 0;
+       ++SR) {
+    Reg = TRI->getDwarfRegNum(*SR, false);
+    // FIXME: Get the bit range this register uses of the superregister
+    // so that we can produce a DW_OP_bit_piece
+  }
+
+  // FIXME: Handle cases like a super register being encoded as
+  // DW_OP_reg 32 DW_OP_piece 4 DW_OP_reg 33
+
+  // FIXME: We have no reasonable way of handling errors in here. The
+  // caller might be in the middle of an dwarf expression. We should
+  // probably assert that Reg >= 0 once debug info generation is more mature.
+
+  if (int Offset =  MLoc.getOffset()) {
+    if (Reg < 32) {
+      OutStreamer.AddComment(
+        dwarf::OperationEncodingString(dwarf::DW_OP_breg0 + Reg));
+      EmitInt8(dwarf::DW_OP_breg0 + Reg);
+    } else {
+      OutStreamer.AddComment("DW_OP_bregx");
+      EmitInt8(dwarf::DW_OP_bregx);
+      OutStreamer.AddComment(Twine(Reg));
+      EmitULEB128(Reg);
+    }
+    EmitSLEB128(Offset);
+  } else {
+    if (Reg < 32) {
+      OutStreamer.AddComment(
+        dwarf::OperationEncodingString(dwarf::DW_OP_reg0 + Reg));
+      EmitInt8(dwarf::DW_OP_reg0 + Reg);
+    } else {
+      OutStreamer.AddComment("DW_OP_regx");
+      EmitInt8(dwarf::DW_OP_regx);
+      OutStreamer.AddComment(Twine(Reg));
+      EmitULEB128(Reg);
+    }
+  }
+
+  // FIXME: Produce a DW_OP_bit_piece if we used a superregister
+}
+
+bool AsmPrinter::doFinalization(Module &M) {
+  // Emit global variables.
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    EmitGlobalVariable(I);
+
+  // Emit visibility info for declarations
+  for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    const Function &F = *I;
+    if (!F.isDeclaration())
+      continue;
+    GlobalValue::VisibilityTypes V = F.getVisibility();
+    if (V == GlobalValue::DefaultVisibility)
+      continue;
+
+    MCSymbol *Name = Mang->getSymbol(&F);
+    EmitVisibility(Name, V, false);
+  }
+
+  // Emit module flags.
+  SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
+  M.getModuleFlagsMetadata(ModuleFlags);
+  if (!ModuleFlags.empty())
+    getObjFileLowering().emitModuleFlags(OutStreamer, ModuleFlags, Mang, TM);
+
+  // Finalize debug and EH information.
+  if (DE) {
+    {
+      NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled);
+      DE->EndModule();
+    }
+    delete DE; DE = 0;
+  }
+  if (DD) {
+    {
+      NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
+      DD->endModule();
+    }
+    delete DD; DD = 0;
+  }
+
+  // If the target wants to know about weak references, print them all.
+  if (MAI->getWeakRefDirective()) {
+    // FIXME: This is not lazy, it would be nice to only print weak references
+    // to stuff that is actually used.  Note that doing so would require targets
+    // to notice uses in operands (due to constant exprs etc).  This should
+    // happen with the MC stuff eventually.
+
+    // Print out module-level global variables here.
+    for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+         I != E; ++I) {
+      if (!I->hasExternalWeakLinkage()) continue;
+      OutStreamer.EmitSymbolAttribute(Mang->getSymbol(I), MCSA_WeakReference);
+    }
+
+    for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
+      if (!I->hasExternalWeakLinkage()) continue;
+      OutStreamer.EmitSymbolAttribute(Mang->getSymbol(I), MCSA_WeakReference);
+    }
+  }
+
+  if (MAI->hasSetDirective()) {
+    OutStreamer.AddBlankLine();
+    for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
+         I != E; ++I) {
+      MCSymbol *Name = Mang->getSymbol(I);
+
+      const GlobalValue *GV = I->getAliasedGlobal();
+      MCSymbol *Target = Mang->getSymbol(GV);
+
+      if (I->hasExternalLinkage() || !MAI->getWeakRefDirective())
+        OutStreamer.EmitSymbolAttribute(Name, MCSA_Global);
+      else if (I->hasWeakLinkage())
+        OutStreamer.EmitSymbolAttribute(Name, MCSA_WeakReference);
+      else
+        assert(I->hasLocalLinkage() && "Invalid alias linkage");
+
+      EmitVisibility(Name, I->getVisibility());
+
+      // Emit the directives as assignments aka .set:
+      OutStreamer.EmitAssignment(Name,
+                                 MCSymbolRefExpr::Create(Target, OutContext));
+    }
+  }
+
+  GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+  assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+  for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E; )
+    if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*--I))
+      MP->finishAssembly(*this);
+
+  // If we don't have any trampolines, then we don't require stack memory
+  // to be executable. Some targets have a directive to declare this.
+  Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
+  if (!InitTrampolineIntrinsic || InitTrampolineIntrinsic->use_empty())
+    if (const MCSection *S = MAI->getNonexecutableStackSection(OutContext))
+      OutStreamer.SwitchSection(S);
+
+  // Allow the target to emit any magic that it wants at the end of the file,
+  // after everything else has gone out.
+  EmitEndOfAsmFile(M);
+
+  delete Mang; Mang = 0;
+  MMI = 0;
+
+  OutStreamer.Finish();
+  OutStreamer.reset();
+
+  return false;
+}
+
+void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
+  this->MF = &MF;
+  // Get the function symbol.
+  CurrentFnSym = Mang->getSymbol(MF.getFunction());
+  CurrentFnSymForSize = CurrentFnSym;
+
+  if (isVerbose())
+    LI = &getAnalysis<MachineLoopInfo>();
+}
+
+namespace {
+  // SectionCPs - Keep track the alignment, constpool entries per Section.
+  struct SectionCPs {
+    const MCSection *S;
+    unsigned Alignment;
+    SmallVector<unsigned, 4> CPEs;
+    SectionCPs(const MCSection *s, unsigned a) : S(s), Alignment(a) {}
+  };
+}
+
+/// EmitConstantPool - Print to the current output stream assembly
+/// representations of the constants in the constant pool MCP. This is
+/// used to print out constants which have been "spilled to memory" by
+/// the code generator.
+///
+void AsmPrinter::EmitConstantPool() {
+  const MachineConstantPool *MCP = MF->getConstantPool();
+  const std::vector<MachineConstantPoolEntry> &CP = MCP->getConstants();
+  if (CP.empty()) return;
+
+  // Calculate sections for constant pool entries. We collect entries to go into
+  // the same section together to reduce amount of section switch statements.
+  SmallVector<SectionCPs, 4> CPSections;
+  for (unsigned i = 0, e = CP.size(); i != e; ++i) {
+    const MachineConstantPoolEntry &CPE = CP[i];
+    unsigned Align = CPE.getAlignment();
+
+    SectionKind Kind;
+    switch (CPE.getRelocationInfo()) {
+    default: llvm_unreachable("Unknown section kind");
+    case 2: Kind = SectionKind::getReadOnlyWithRel(); break;
+    case 1:
+      Kind = SectionKind::getReadOnlyWithRelLocal();
+      break;
+    case 0:
+    switch (TM.getDataLayout()->getTypeAllocSize(CPE.getType())) {
+    case 4:  Kind = SectionKind::getMergeableConst4(); break;
+    case 8:  Kind = SectionKind::getMergeableConst8(); break;
+    case 16: Kind = SectionKind::getMergeableConst16();break;
+    default: Kind = SectionKind::getMergeableConst(); break;
+    }
+    }
+
+    const MCSection *S = getObjFileLowering().getSectionForConstant(Kind);
+
+    // The number of sections are small, just do a linear search from the
+    // last section to the first.
+    bool Found = false;
+    unsigned SecIdx = CPSections.size();
+    while (SecIdx != 0) {
+      if (CPSections[--SecIdx].S == S) {
+        Found = true;
+        break;
+      }
+    }
+    if (!Found) {
+      SecIdx = CPSections.size();
+      CPSections.push_back(SectionCPs(S, Align));
+    }
+
+    if (Align > CPSections[SecIdx].Alignment)
+      CPSections[SecIdx].Alignment = Align;
+    CPSections[SecIdx].CPEs.push_back(i);
+  }
+
+  // Now print stuff into the calculated sections.
+  for (unsigned i = 0, e = CPSections.size(); i != e; ++i) {
+    OutStreamer.SwitchSection(CPSections[i].S);
+    EmitAlignment(Log2_32(CPSections[i].Alignment));
+
+    unsigned Offset = 0;
+    for (unsigned j = 0, ee = CPSections[i].CPEs.size(); j != ee; ++j) {
+      unsigned CPI = CPSections[i].CPEs[j];
+      MachineConstantPoolEntry CPE = CP[CPI];
+
+      // Emit inter-object padding for alignment.
+      unsigned AlignMask = CPE.getAlignment() - 1;
+      unsigned NewOffset = (Offset + AlignMask) & ~AlignMask;
+      OutStreamer.EmitZeros(NewOffset - Offset);
+
+      Type *Ty = CPE.getType();
+      Offset = NewOffset + TM.getDataLayout()->getTypeAllocSize(Ty);
+      OutStreamer.EmitLabel(GetCPISymbol(CPI));
+
+      if (CPE.isMachineConstantPoolEntry())
+        EmitMachineConstantPoolValue(CPE.Val.MachineCPVal);
+      else
+        EmitGlobalConstant(CPE.Val.ConstVal);
+    }
+  }
+}
+
+/// EmitJumpTableInfo - Print assembly representations of the jump tables used
+/// by the current function to the current output stream.
+///
+void AsmPrinter::EmitJumpTableInfo() {
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (MJTI == 0) return;
+  if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return;
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  if (JT.empty()) return;
+
+  // Pick the directive to use to print the jump table entries, and switch to
+  // the appropriate section.
+  const Function *F = MF->getFunction();
+  bool JTInDiffSection = false;
+  if (// In PIC mode, we need to emit the jump table to the same section as the
+      // function body itself, otherwise the label differences won't make sense.
+      // FIXME: Need a better predicate for this: what about custom entries?
+      MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 ||
+      // We should also do if the section name is NULL or function is declared
+      // in discardable section
+      // FIXME: this isn't the right predicate, should be based on the MCSection
+      // for the function.
+      F->isWeakForLinker()) {
+    OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F,Mang,TM));
+  } else {
+    // Otherwise, drop it in the readonly section.
+    const MCSection *ReadOnlySection =
+      getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly());
+    OutStreamer.SwitchSection(ReadOnlySection);
+    JTInDiffSection = true;
+  }
+
+  EmitAlignment(Log2_32(MJTI->getEntryAlignment(*TM.getDataLayout())));
+
+  // Jump tables in code sections are marked with a data_region directive
+  // where that's supported.
+  if (!JTInDiffSection)
+    OutStreamer.EmitDataRegion(MCDR_DataRegionJT32);
+
+  for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+    // If this jump table was deleted, ignore it.
+    if (JTBBs.empty()) continue;
+
+    // For the EK_LabelDifference32 entry, if the target supports .set, emit a
+    // .set directive for each unique entry.  This reduces the number of
+    // relocations the assembler will generate for the jump table.
+    if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 &&
+        MAI->hasSetDirective()) {
+      SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets;
+      const TargetLowering *TLI = TM.getTargetLowering();
+      const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext);
+      for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) {
+        const MachineBasicBlock *MBB = JTBBs[ii];
+        if (!EmittedSets.insert(MBB)) continue;
+
+        // .set LJTSet, LBB32-base
+        const MCExpr *LHS =
+          MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
+        OutStreamer.EmitAssignment(GetJTSetSymbol(JTI, MBB->getNumber()),
+                                MCBinaryExpr::CreateSub(LHS, Base, OutContext));
+      }
+    }
+
+    // On some targets (e.g. Darwin) we want to emit two consecutive labels
+    // before each jump table.  The first label is never referenced, but tells
+    // the assembler and linker the extents of the jump table object.  The
+    // second label is actually referenced by the code.
+    if (JTInDiffSection && MAI->getLinkerPrivateGlobalPrefix()[0])
+      // FIXME: This doesn't have to have any specific name, just any randomly
+      // named and numbered 'l' label would work.  Simplify GetJTISymbol.
+      OutStreamer.EmitLabel(GetJTISymbol(JTI, true));
+
+    OutStreamer.EmitLabel(GetJTISymbol(JTI));
+
+    for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii)
+      EmitJumpTableEntry(MJTI, JTBBs[ii], JTI);
+  }
+  if (!JTInDiffSection)
+    OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+/// EmitJumpTableEntry - Emit a jump table entry for the specified MBB to the
+/// current stream.
+void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                    const MachineBasicBlock *MBB,
+                                    unsigned UID) const {
+  assert(MBB && MBB->getNumber() >= 0 && "Invalid basic block");
+  const MCExpr *Value = 0;
+  switch (MJTI->getEntryKind()) {
+  case MachineJumpTableInfo::EK_Inline:
+    llvm_unreachable("Cannot emit EK_Inline jump table entry");
+  case MachineJumpTableInfo::EK_Custom32:
+    Value = TM.getTargetLowering()->LowerCustomJumpTableEntry(MJTI, MBB, UID,
+                                                              OutContext);
+    break;
+  case MachineJumpTableInfo::EK_BlockAddress:
+    // EK_BlockAddress - Each entry is a plain address of block, e.g.:
+    //     .word LBB123
+    Value = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
+    break;
+  case MachineJumpTableInfo::EK_GPRel32BlockAddress: {
+    // EK_GPRel32BlockAddress - Each entry is an address of block, encoded
+    // with a relocation as gp-relative, e.g.:
+    //     .gprel32 LBB123
+    MCSymbol *MBBSym = MBB->getSymbol();
+    OutStreamer.EmitGPRel32Value(MCSymbolRefExpr::Create(MBBSym, OutContext));
+    return;
+  }
+
+  case MachineJumpTableInfo::EK_GPRel64BlockAddress: {
+    // EK_GPRel64BlockAddress - Each entry is an address of block, encoded
+    // with a relocation as gp-relative, e.g.:
+    //     .gpdword LBB123
+    MCSymbol *MBBSym = MBB->getSymbol();
+    OutStreamer.EmitGPRel64Value(MCSymbolRefExpr::Create(MBBSym, OutContext));
+    return;
+  }
+
+  case MachineJumpTableInfo::EK_LabelDifference32: {
+    // EK_LabelDifference32 - Each entry is the address of the block minus
+    // the address of the jump table.  This is used for PIC jump tables where
+    // gprel32 is not supported.  e.g.:
+    //      .word LBB123 - LJTI1_2
+    // If the .set directive is supported, this is emitted as:
+    //      .set L4_5_set_123, LBB123 - LJTI1_2
+    //      .word L4_5_set_123
+
+    // If we have emitted set directives for the jump table entries, print
+    // them rather than the entries themselves.  If we're emitting PIC, then
+    // emit the table entries as differences between two text section labels.
+    if (MAI->hasSetDirective()) {
+      // If we used .set, reference the .set's symbol.
+      Value = MCSymbolRefExpr::Create(GetJTSetSymbol(UID, MBB->getNumber()),
+                                      OutContext);
+      break;
+    }
+    // Otherwise, use the difference as the jump table entry.
+    Value = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
+    const MCExpr *JTI = MCSymbolRefExpr::Create(GetJTISymbol(UID), OutContext);
+    Value = MCBinaryExpr::CreateSub(Value, JTI, OutContext);
+    break;
+  }
+  }
+
+  assert(Value && "Unknown entry kind!");
+
+  unsigned EntrySize = MJTI->getEntrySize(*TM.getDataLayout());
+  OutStreamer.EmitValue(Value, EntrySize);
+}
+
+
+/// EmitSpecialLLVMGlobal - Check to see if the specified global is a
+/// special global used by LLVM.  If so, emit it and return true, otherwise
+/// do nothing and return false.
+bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
+  if (GV->getName() == "llvm.used") {
+    if (MAI->hasNoDeadStrip())    // No need to emit this at all.
+      EmitLLVMUsedList(GV->getInitializer());
+    return true;
+  }
+
+  // Ignore debug and non-emitted data.  This handles llvm.compiler.used.
+  if (GV->getSection() == "llvm.metadata" ||
+      GV->hasAvailableExternallyLinkage())
+    return true;
+
+  if (!GV->hasAppendingLinkage()) return false;
+
+  assert(GV->hasInitializer() && "Not a special LLVM global!");
+
+  if (GV->getName() == "llvm.global_ctors") {
+    EmitXXStructorList(GV->getInitializer(), /* isCtor */ true);
+
+    if (TM.getRelocationModel() == Reloc::Static &&
+        MAI->hasStaticCtorDtorReferenceInStaticMode()) {
+      StringRef Sym(".constructors_used");
+      OutStreamer.EmitSymbolAttribute(OutContext.GetOrCreateSymbol(Sym),
+                                      MCSA_Reference);
+    }
+    return true;
+  }
+
+  if (GV->getName() == "llvm.global_dtors") {
+    EmitXXStructorList(GV->getInitializer(), /* isCtor */ false);
+
+    if (TM.getRelocationModel() == Reloc::Static &&
+        MAI->hasStaticCtorDtorReferenceInStaticMode()) {
+      StringRef Sym(".destructors_used");
+      OutStreamer.EmitSymbolAttribute(OutContext.GetOrCreateSymbol(Sym),
+                                      MCSA_Reference);
+    }
+    return true;
+  }
+
+  return false;
+}
+
+/// EmitLLVMUsedList - For targets that define a MAI::UsedDirective, mark each
+/// global in the specified llvm.used list for which emitUsedDirectiveFor
+/// is true, as being used with this directive.
+void AsmPrinter::EmitLLVMUsedList(const Constant *List) {
+  // Should be an array of 'i8*'.
+  const ConstantArray *InitList = dyn_cast<ConstantArray>(List);
+  if (InitList == 0) return;
+
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+    const GlobalValue *GV =
+      dyn_cast<GlobalValue>(InitList->getOperand(i)->stripPointerCasts());
+    if (GV && getObjFileLowering().shouldEmitUsedDirectiveFor(GV, Mang))
+      OutStreamer.EmitSymbolAttribute(Mang->getSymbol(GV), MCSA_NoDeadStrip);
+  }
+}
+
+typedef std::pair<unsigned, Constant*> Structor;
+
+static bool priority_order(const Structor& lhs, const Structor& rhs) {
+  return lhs.first < rhs.first;
+}
+
+/// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
+/// priority.
+void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
+  // Should be an array of '{ int, void ()* }' structs.  The first value is the
+  // init priority.
+  if (!isa<ConstantArray>(List)) return;
+
+  // Sanity check the structors list.
+  const ConstantArray *InitList = dyn_cast<ConstantArray>(List);
+  if (!InitList) return; // Not an array!
+  StructType *ETy = dyn_cast<StructType>(InitList->getType()->getElementType());
+  if (!ETy || ETy->getNumElements() != 2) return; // Not an array of pairs!
+  if (!isa<IntegerType>(ETy->getTypeAtIndex(0U)) ||
+      !isa<PointerType>(ETy->getTypeAtIndex(1U))) return; // Not (int, ptr).
+
+  // Gather the structors in a form that's convenient for sorting by priority.
+  SmallVector<Structor, 8> Structors;
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+    ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i));
+    if (!CS) continue; // Malformed.
+    if (CS->getOperand(1)->isNullValue())
+      break;  // Found a null terminator, skip the rest.
+    ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
+    if (!Priority) continue; // Malformed.
+    Structors.push_back(std::make_pair(Priority->getLimitedValue(65535),
+                                       CS->getOperand(1)));
+  }
+
+  // Emit the function pointers in the target-specific order
+  const DataLayout *TD = TM.getDataLayout();
+  unsigned Align = Log2_32(TD->getPointerPrefAlignment());
+  std::stable_sort(Structors.begin(), Structors.end(), priority_order);
+  for (unsigned i = 0, e = Structors.size(); i != e; ++i) {
+    const MCSection *OutputSection =
+      (isCtor ?
+       getObjFileLowering().getStaticCtorSection(Structors[i].first) :
+       getObjFileLowering().getStaticDtorSection(Structors[i].first));
+    OutStreamer.SwitchSection(OutputSection);
+    if (OutStreamer.getCurrentSection() != OutStreamer.getPreviousSection())
+      EmitAlignment(Align);
+    EmitXXStructor(Structors[i].second);
+  }
+}
+
+//===--------------------------------------------------------------------===//
+// Emission and print routines
+//
+
+/// EmitInt8 - Emit a byte directive and value.
+///
+void AsmPrinter::EmitInt8(int Value) const {
+  OutStreamer.EmitIntValue(Value, 1);
+}
+
+/// EmitInt16 - Emit a short directive and value.
+///
+void AsmPrinter::EmitInt16(int Value) const {
+  OutStreamer.EmitIntValue(Value, 2);
+}
+
+/// EmitInt32 - Emit a long directive and value.
+///
+void AsmPrinter::EmitInt32(int Value) const {
+  OutStreamer.EmitIntValue(Value, 4);
+}
+
+/// EmitLabelDifference - Emit something like ".long Hi-Lo" where the size
+/// in bytes of the directive is specified by Size and Hi/Lo specify the
+/// labels.  This implicitly uses .set if it is available.
+void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
+                                     unsigned Size) const {
+  // Get the Hi-Lo expression.
+  const MCExpr *Diff =
+    MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(Hi, OutContext),
+                            MCSymbolRefExpr::Create(Lo, OutContext),
+                            OutContext);
+
+  if (!MAI->hasSetDirective()) {
+    OutStreamer.EmitValue(Diff, Size);
+    return;
+  }
+
+  // Otherwise, emit with .set (aka assignment).
+  MCSymbol *SetLabel = GetTempSymbol("set", SetCounter++);
+  OutStreamer.EmitAssignment(SetLabel, Diff);
+  OutStreamer.EmitSymbolValue(SetLabel, Size);
+}
+
+/// EmitLabelOffsetDifference - Emit something like ".long Hi+Offset-Lo"
+/// where the size in bytes of the directive is specified by Size and Hi/Lo
+/// specify the labels.  This implicitly uses .set if it is available.
+void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
+                                           const MCSymbol *Lo, unsigned Size)
+  const {
+
+  // Emit Hi+Offset - Lo
+  // Get the Hi+Offset expression.
+  const MCExpr *Plus =
+    MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Hi, OutContext),
+                            MCConstantExpr::Create(Offset, OutContext),
+                            OutContext);
+
+  // Get the Hi+Offset-Lo expression.
+  const MCExpr *Diff =
+    MCBinaryExpr::CreateSub(Plus,
+                            MCSymbolRefExpr::Create(Lo, OutContext),
+                            OutContext);
+
+  if (!MAI->hasSetDirective())
+    OutStreamer.EmitValue(Diff, 4);
+  else {
+    // Otherwise, emit with .set (aka assignment).
+    MCSymbol *SetLabel = GetTempSymbol("set", SetCounter++);
+    OutStreamer.EmitAssignment(SetLabel, Diff);
+    OutStreamer.EmitSymbolValue(SetLabel, 4);
+  }
+}
+
+/// EmitLabelPlusOffset - Emit something like ".long Label+Offset"
+/// where the size in bytes of the directive is specified by Size and Label
+/// specifies the label.  This implicitly uses .set if it is available.
+void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
+                                      unsigned Size)
+  const {
+
+  // Emit Label+Offset (or just Label if Offset is zero)
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Label, OutContext);
+  if (Offset)
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(Offset, OutContext),
+                                   OutContext);
+
+  OutStreamer.EmitValue(Expr, Size);
+}
+
+
+//===----------------------------------------------------------------------===//
+
+// EmitAlignment - Emit an alignment directive to the specified power of
+// two boundary.  For example, if you pass in 3 here, you will get an 8
+// byte alignment.  If a global value is specified, and if that global has
+// an explicit alignment requested, it will override the alignment request
+// if required for correctness.
+//
+void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalValue *GV) const {
+  if (GV) NumBits = getGVAlignmentLog2(GV, *TM.getDataLayout(), NumBits);
+
+  if (NumBits == 0) return;   // 1-byte aligned: no need to emit alignment.
+
+  if (getCurrentSection()->getKind().isText())
+    OutStreamer.EmitCodeAlignment(1 << NumBits);
+  else
+    OutStreamer.EmitValueToAlignment(1 << NumBits, 0, 1, 0);
+}
+
+//===----------------------------------------------------------------------===//
+// Constant emission.
+//===----------------------------------------------------------------------===//
+
+/// lowerConstant - Lower the specified LLVM Constant to an MCExpr.
+///
+static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
+  MCContext &Ctx = AP.OutContext;
+
+  if (CV->isNullValue() || isa<UndefValue>(CV))
+    return MCConstantExpr::Create(0, Ctx);
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
+    return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
+    return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx);
+
+  if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
+    return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
+
+  const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
+  if (CE == 0) {
+    llvm_unreachable("Unknown constant value to lower!");
+  }
+
+  switch (CE->getOpcode()) {
+  default:
+    // If the code isn't optimized, there may be outstanding folding
+    // opportunities. Attempt to fold the expression using DataLayout as a
+    // last resort before giving up.
+    if (Constant *C =
+          ConstantFoldConstantExpression(CE, AP.TM.getDataLayout()))
+      if (C != CE)
+        return lowerConstant(C, AP);
+
+    // Otherwise report the problem to the user.
+    {
+      std::string S;
+      raw_string_ostream OS(S);
+      OS << "Unsupported expression in static initializer: ";
+      WriteAsOperand(OS, CE, /*PrintType=*/false,
+                     !AP.MF ? 0 : AP.MF->getFunction()->getParent());
+      report_fatal_error(OS.str());
+    }
+  case Instruction::GetElementPtr: {
+    const DataLayout &TD = *AP.TM.getDataLayout();
+    // Generate a symbolic expression for the byte address
+    APInt OffsetAI(TD.getPointerSizeInBits(), 0);
+    cast<GEPOperator>(CE)->accumulateConstantOffset(TD, OffsetAI);
+
+    const MCExpr *Base = lowerConstant(CE->getOperand(0), AP);
+    if (!OffsetAI)
+      return Base;
+
+    int64_t Offset = OffsetAI.getSExtValue();
+    return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
+                                   Ctx);
+  }
+
+  case Instruction::Trunc:
+    // We emit the value and depend on the assembler to truncate the generated
+    // expression properly.  This is important for differences between
+    // blockaddress labels.  Since the two labels are in the same function, it
+    // is reasonable to treat their delta as a 32-bit value.
+    // FALL THROUGH.
+  case Instruction::BitCast:
+    return lowerConstant(CE->getOperand(0), AP);
+
+  case Instruction::IntToPtr: {
+    const DataLayout &TD = *AP.TM.getDataLayout();
+    // Handle casts to pointers by changing them into casts to the appropriate
+    // integer type.  This promotes constant folding and simplifies this code.
+    Constant *Op = CE->getOperand(0);
+    Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
+                                      false/*ZExt*/);
+    return lowerConstant(Op, AP);
+  }
+
+  case Instruction::PtrToInt: {
+    const DataLayout &TD = *AP.TM.getDataLayout();
+    // Support only foldable casts to/from pointers that can be eliminated by
+    // changing the pointer to the appropriately sized integer type.
+    Constant *Op = CE->getOperand(0);
+    Type *Ty = CE->getType();
+
+    const MCExpr *OpExpr = lowerConstant(Op, AP);
+
+    // We can emit the pointer value into this slot if the slot is an
+    // integer slot equal to the size of the pointer.
+    if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType()))
+      return OpExpr;
+
+    // Otherwise the pointer is smaller than the resultant integer, mask off
+    // the high bits so we are sure to get a proper truncation if the input is
+    // a constant expr.
+    unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType());
+    const MCExpr *MaskExpr = MCConstantExpr::Create(~0ULL >> (64-InBits), Ctx);
+    return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
+  }
+
+  // The MC library also has a right-shift operator, but it isn't consistently
+  // signed or unsigned between different targets.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::Shl:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    const MCExpr *LHS = lowerConstant(CE->getOperand(0), AP);
+    const MCExpr *RHS = lowerConstant(CE->getOperand(1), AP);
+    switch (CE->getOpcode()) {
+    default: llvm_unreachable("Unknown binary operator constant cast expr");
+    case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
+    case Instruction::Sub: return MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+    case Instruction::Mul: return MCBinaryExpr::CreateMul(LHS, RHS, Ctx);
+    case Instruction::SDiv: return MCBinaryExpr::CreateDiv(LHS, RHS, Ctx);
+    case Instruction::SRem: return MCBinaryExpr::CreateMod(LHS, RHS, Ctx);
+    case Instruction::Shl: return MCBinaryExpr::CreateShl(LHS, RHS, Ctx);
+    case Instruction::And: return MCBinaryExpr::CreateAnd(LHS, RHS, Ctx);
+    case Instruction::Or:  return MCBinaryExpr::CreateOr (LHS, RHS, Ctx);
+    case Instruction::Xor: return MCBinaryExpr::CreateXor(LHS, RHS, Ctx);
+    }
+  }
+  }
+}
+
+static void emitGlobalConstantImpl(const Constant *C, unsigned AddrSpace,
+                                   AsmPrinter &AP);
+
+/// isRepeatedByteSequence - Determine whether the given value is
+/// composed of a repeated sequence of identical bytes and return the
+/// byte value.  If it is not a repeated sequence, return -1.
+static int isRepeatedByteSequence(const ConstantDataSequential *V) {
+  StringRef Data = V->getRawDataValues();
+  assert(!Data.empty() && "Empty aggregates should be CAZ node");
+  char C = Data[0];
+  for (unsigned i = 1, e = Data.size(); i != e; ++i)
+    if (Data[i] != C) return -1;
+  return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1.
+}
+
+
+/// isRepeatedByteSequence - Determine whether the given value is
+/// composed of a repeated sequence of identical bytes and return the
+/// byte value.  If it is not a repeated sequence, return -1.
+static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) {
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    if (CI->getBitWidth() > 64) return -1;
+
+    uint64_t Size = TM.getDataLayout()->getTypeAllocSize(V->getType());
+    uint64_t Value = CI->getZExtValue();
+
+    // Make sure the constant is at least 8 bits long and has a power
+    // of 2 bit width.  This guarantees the constant bit width is
+    // always a multiple of 8 bits, avoiding issues with padding out
+    // to Size and other such corner cases.
+    if (CI->getBitWidth() < 8 || !isPowerOf2_64(CI->getBitWidth())) return -1;
+
+    uint8_t Byte = static_cast<uint8_t>(Value);
+
+    for (unsigned i = 1; i < Size; ++i) {
+      Value >>= 8;
+      if (static_cast<uint8_t>(Value) != Byte) return -1;
+    }
+    return Byte;
+  }
+  if (const ConstantArray *CA = dyn_cast<ConstantArray>(V)) {
+    // Make sure all array elements are sequences of the same repeated
+    // byte.
+    assert(CA->getNumOperands() != 0 && "Should be a CAZ");
+    int Byte = isRepeatedByteSequence(CA->getOperand(0), TM);
+    if (Byte == -1) return -1;
+
+    for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i) {
+      int ThisByte = isRepeatedByteSequence(CA->getOperand(i), TM);
+      if (ThisByte == -1) return -1;
+      if (Byte != ThisByte) return -1;
+    }
+    return Byte;
+  }
+
+  if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V))
+    return isRepeatedByteSequence(CDS);
+
+  return -1;
+}
+
+static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
+                                             unsigned AddrSpace,AsmPrinter &AP){
+
+  // See if we can aggregate this into a .fill, if so, emit it as such.
+  int Value = isRepeatedByteSequence(CDS, AP.TM);
+  if (Value != -1) {
+    uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CDS->getType());
+    // Don't emit a 1-byte object as a .fill.
+    if (Bytes > 1)
+      return AP.OutStreamer.EmitFill(Bytes, Value, AddrSpace);
+  }
+
+  // If this can be emitted with .ascii/.asciz, emit it as such.
+  if (CDS->isString())
+    return AP.OutStreamer.EmitBytes(CDS->getAsString(), AddrSpace);
+
+  // Otherwise, emit the values in successive locations.
+  unsigned ElementByteSize = CDS->getElementByteSize();
+  if (isa<IntegerType>(CDS->getElementType())) {
+    for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+      if (AP.isVerbose())
+        AP.OutStreamer.GetCommentOS() << format("0x%" PRIx64 "\n",
+                                                CDS->getElementAsInteger(i));
+      AP.OutStreamer.EmitIntValue(CDS->getElementAsInteger(i),
+                                  ElementByteSize, AddrSpace);
+    }
+  } else if (ElementByteSize == 4) {
+    // FP Constants are printed as integer constants to avoid losing
+    // precision.
+    assert(CDS->getElementType()->isFloatTy());
+    for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+      union {
+        float F;
+        uint32_t I;
+      };
+
+      F = CDS->getElementAsFloat(i);
+      if (AP.isVerbose())
+        AP.OutStreamer.GetCommentOS() << "float " << F << '\n';
+      AP.OutStreamer.EmitIntValue(I, 4, AddrSpace);
+    }
+  } else {
+    assert(CDS->getElementType()->isDoubleTy());
+    for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+      union {
+        double F;
+        uint64_t I;
+      };
+
+      F = CDS->getElementAsDouble(i);
+      if (AP.isVerbose())
+        AP.OutStreamer.GetCommentOS() << "double " << F << '\n';
+      AP.OutStreamer.EmitIntValue(I, 8, AddrSpace);
+    }
+  }
+
+  const DataLayout &TD = *AP.TM.getDataLayout();
+  unsigned Size = TD.getTypeAllocSize(CDS->getType());
+  unsigned EmittedSize = TD.getTypeAllocSize(CDS->getType()->getElementType()) *
+                        CDS->getNumElements();
+  if (unsigned Padding = Size - EmittedSize)
+    AP.OutStreamer.EmitZeros(Padding, AddrSpace);
+
+}
+
+static void emitGlobalConstantArray(const ConstantArray *CA, unsigned AddrSpace,
+                                    AsmPrinter &AP) {
+  // See if we can aggregate some values.  Make sure it can be
+  // represented as a series of bytes of the constant value.
+  int Value = isRepeatedByteSequence(CA, AP.TM);
+
+  if (Value != -1) {
+    uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CA->getType());
+    AP.OutStreamer.EmitFill(Bytes, Value, AddrSpace);
+  }
+  else {
+    for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
+      emitGlobalConstantImpl(CA->getOperand(i), AddrSpace, AP);
+  }
+}
+
+static void emitGlobalConstantVector(const ConstantVector *CV,
+                                     unsigned AddrSpace, AsmPrinter &AP) {
+  for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
+    emitGlobalConstantImpl(CV->getOperand(i), AddrSpace, AP);
+
+  const DataLayout &TD = *AP.TM.getDataLayout();
+  unsigned Size = TD.getTypeAllocSize(CV->getType());
+  unsigned EmittedSize = TD.getTypeAllocSize(CV->getType()->getElementType()) *
+                         CV->getType()->getNumElements();
+  if (unsigned Padding = Size - EmittedSize)
+    AP.OutStreamer.EmitZeros(Padding, AddrSpace);
+}
+
+static void emitGlobalConstantStruct(const ConstantStruct *CS,
+                                     unsigned AddrSpace, AsmPrinter &AP) {
+  // Print the fields in successive locations. Pad to align if needed!
+  const DataLayout *TD = AP.TM.getDataLayout();
+  unsigned Size = TD->getTypeAllocSize(CS->getType());
+  const StructLayout *Layout = TD->getStructLayout(CS->getType());
+  uint64_t SizeSoFar = 0;
+  for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
+    const Constant *Field = CS->getOperand(i);
+
+    // Check if padding is needed and insert one or more 0s.
+    uint64_t FieldSize = TD->getTypeAllocSize(Field->getType());
+    uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1))
+                        - Layout->getElementOffset(i)) - FieldSize;
+    SizeSoFar += FieldSize + PadSize;
+
+    // Now print the actual field value.
+    emitGlobalConstantImpl(Field, AddrSpace, AP);
+
+    // Insert padding - this may include padding to increase the size of the
+    // current field up to the ABI size (if the struct is not packed) as well
+    // as padding to ensure that the next field starts at the right offset.
+    AP.OutStreamer.EmitZeros(PadSize, AddrSpace);
+  }
+  assert(SizeSoFar == Layout->getSizeInBytes() &&
+         "Layout of constant struct may be incorrect!");
+}
+
+static void emitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
+                                 AsmPrinter &AP) {
+  APInt API = CFP->getValueAPF().bitcastToAPInt();
+
+  // First print a comment with what we think the original floating-point value
+  // should have been.
+  if (AP.isVerbose()) {
+    SmallString<8> StrVal;
+    CFP->getValueAPF().toString(StrVal);
+
+    CFP->getType()->print(AP.OutStreamer.GetCommentOS());
+    AP.OutStreamer.GetCommentOS() << ' ' << StrVal << '\n';
+  }
+
+  // Now iterate through the APInt chunks, emitting them in endian-correct
+  // order, possibly with a smaller chunk at beginning/end (e.g. for x87 80-bit
+  // floats).
+  unsigned NumBytes = API.getBitWidth() / 8;
+  unsigned TrailingBytes = NumBytes % sizeof(uint64_t);
+  const uint64_t *p = API.getRawData();
+
+  // PPC's long double has odd notions of endianness compared to how LLVM
+  // handles it: p[0] goes first for *big* endian on PPC.
+  if (AP.TM.getDataLayout()->isBigEndian() != CFP->getType()->isPPC_FP128Ty()) {
+    int Chunk = API.getNumWords() - 1;
+
+    if (TrailingBytes)
+      AP.OutStreamer.EmitIntValue(p[Chunk--], TrailingBytes, AddrSpace);
+
+    for (; Chunk >= 0; --Chunk)
+      AP.OutStreamer.EmitIntValue(p[Chunk], sizeof(uint64_t), AddrSpace);
+  } else {
+    unsigned Chunk;
+    for (Chunk = 0; Chunk < NumBytes / sizeof(uint64_t); ++Chunk)
+      AP.OutStreamer.EmitIntValue(p[Chunk], sizeof(uint64_t), AddrSpace);
+
+    if (TrailingBytes)
+      AP.OutStreamer.EmitIntValue(p[Chunk], TrailingBytes, AddrSpace);
+  }
+
+  // Emit the tail padding for the long double.
+  const DataLayout &TD = *AP.TM.getDataLayout();
+  AP.OutStreamer.EmitZeros(TD.getTypeAllocSize(CFP->getType()) -
+                           TD.getTypeStoreSize(CFP->getType()), AddrSpace);
+}
+
+static void emitGlobalConstantLargeInt(const ConstantInt *CI,
+                                       unsigned AddrSpace, AsmPrinter &AP) {
+  const DataLayout *TD = AP.TM.getDataLayout();
+  unsigned BitWidth = CI->getBitWidth();
+  assert((BitWidth & 63) == 0 && "only support multiples of 64-bits");
+
+  // We don't expect assemblers to support integer data directives
+  // for more than 64 bits, so we emit the data in at most 64-bit
+  // quantities at a time.
+  const uint64_t *RawData = CI->getValue().getRawData();
+  for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) {
+    uint64_t Val = TD->isBigEndian() ? RawData[e - i - 1] : RawData[i];
+    AP.OutStreamer.EmitIntValue(Val, 8, AddrSpace);
+  }
+}
+
+static void emitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
+                                   AsmPrinter &AP) {
+  const DataLayout *TD = AP.TM.getDataLayout();
+  uint64_t Size = TD->getTypeAllocSize(CV->getType());
+  if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
+    return AP.OutStreamer.EmitZeros(Size, AddrSpace);
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+    switch (Size) {
+    case 1:
+    case 2:
+    case 4:
+    case 8:
+      if (AP.isVerbose())
+        AP.OutStreamer.GetCommentOS() << format("0x%" PRIx64 "\n",
+                                                CI->getZExtValue());
+      AP.OutStreamer.EmitIntValue(CI->getZExtValue(), Size, AddrSpace);
+      return;
+    default:
+      emitGlobalConstantLargeInt(CI, AddrSpace, AP);
+      return;
+    }
+  }
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV))
+    return emitGlobalConstantFP(CFP, AddrSpace, AP);
+
+  if (isa<ConstantPointerNull>(CV)) {
+    AP.OutStreamer.EmitIntValue(0, Size, AddrSpace);
+    return;
+  }
+
+  if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(CV))
+    return emitGlobalConstantDataSequential(CDS, AddrSpace, AP);
+
+  if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV))
+    return emitGlobalConstantArray(CVA, AddrSpace, AP);
+
+  if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
+    return emitGlobalConstantStruct(CVS, AddrSpace, AP);
+
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+    // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of
+    // vectors).
+    if (CE->getOpcode() == Instruction::BitCast)
+      return emitGlobalConstantImpl(CE->getOperand(0), AddrSpace, AP);
+
+    if (Size > 8) {
+      // If the constant expression's size is greater than 64-bits, then we have
+      // to emit the value in chunks. Try to constant fold the value and emit it
+      // that way.
+      Constant *New = ConstantFoldConstantExpression(CE, TD);
+      if (New && New != CE)
+        return emitGlobalConstantImpl(New, AddrSpace, AP);
+    }
+  }
+
+  if (const ConstantVector *V = dyn_cast<ConstantVector>(CV))
+    return emitGlobalConstantVector(V, AddrSpace, AP);
+
+  // Otherwise, it must be a ConstantExpr.  Lower it to an MCExpr, then emit it
+  // thread the streamer with EmitValue.
+  AP.OutStreamer.EmitValue(lowerConstant(CV, AP), Size, AddrSpace);
+}
+
+/// EmitGlobalConstant - Print a general LLVM constant to the .s file.
+void AsmPrinter::EmitGlobalConstant(const Constant *CV, unsigned AddrSpace) {
+  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
+  if (Size)
+    emitGlobalConstantImpl(CV, AddrSpace, *this);
+  else if (MAI->hasSubsectionsViaSymbols()) {
+    // If the global has zero size, emit a single byte so that two labels don't
+    // look like they are at the same location.
+    OutStreamer.EmitIntValue(0, 1, AddrSpace);
+  }
+}
+
+void AsmPrinter::EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+  // Target doesn't support this yet!
+  llvm_unreachable("Target does not support EmitMachineConstantPoolValue");
+}
+
+void AsmPrinter::printOffset(int64_t Offset, raw_ostream &OS) const {
+  if (Offset > 0)
+    OS << '+' << Offset;
+  else if (Offset < 0)
+    OS << Offset;
+}
+
+//===----------------------------------------------------------------------===//
+// Symbol Lowering Routines.
+//===----------------------------------------------------------------------===//
+
+/// GetTempSymbol - Return the MCSymbol corresponding to the assembler
+/// temporary label with the specified stem and unique ID.
+MCSymbol *AsmPrinter::GetTempSymbol(StringRef Name, unsigned ID) const {
+  return OutContext.GetOrCreateSymbol(Twine(MAI->getPrivateGlobalPrefix()) +
+                                      Name + Twine(ID));
+}
+
+/// GetTempSymbol - Return an assembler temporary label with the specified
+/// stem.
+MCSymbol *AsmPrinter::GetTempSymbol(StringRef Name) const {
+  return OutContext.GetOrCreateSymbol(Twine(MAI->getPrivateGlobalPrefix())+
+                                      Name);
+}
+
+
+MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BlockAddress *BA) const {
+  return MMI->getAddrLabelSymbol(BA->getBasicBlock());
+}
+
+MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const {
+  return MMI->getAddrLabelSymbol(BB);
+}
+
+/// GetCPISymbol - Return the symbol for the specified constant pool entry.
+MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
+  return OutContext.GetOrCreateSymbol
+    (Twine(MAI->getPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber())
+     + "_" + Twine(CPID));
+}
+
+/// GetJTISymbol - Return the symbol for the specified jump table entry.
+MCSymbol *AsmPrinter::GetJTISymbol(unsigned JTID, bool isLinkerPrivate) const {
+  return MF->getJTISymbol(JTID, OutContext, isLinkerPrivate);
+}
+
+/// GetJTSetSymbol - Return the symbol for the specified jump table .set
+/// FIXME: privatize to AsmPrinter.
+MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const {
+  return OutContext.GetOrCreateSymbol
+  (Twine(MAI->getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "_" +
+   Twine(UID) + "_set_" + Twine(MBBID));
+}
+
+/// GetSymbolWithGlobalValueBase - Return the MCSymbol for a symbol with
+/// global value name as its base, with the specified suffix, and where the
+/// symbol is forced to have private linkage if ForcePrivate is true.
+MCSymbol *AsmPrinter::GetSymbolWithGlobalValueBase(const GlobalValue *GV,
+                                                   StringRef Suffix,
+                                                   bool ForcePrivate) const {
+  SmallString<60> NameStr;
+  Mang->getNameWithPrefix(NameStr, GV, ForcePrivate);
+  NameStr.append(Suffix.begin(), Suffix.end());
+  return OutContext.GetOrCreateSymbol(NameStr.str());
+}
+
+/// GetExternalSymbolSymbol - Return the MCSymbol for the specified
+/// ExternalSymbol.
+MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const {
+  SmallString<60> NameStr;
+  Mang->getNameWithPrefix(NameStr, Sym);
+  return OutContext.GetOrCreateSymbol(NameStr.str());
+}
+
+
+
+/// PrintParentLoopComment - Print comments about parent loops of this one.
+static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop,
+                                   unsigned FunctionNumber) {
+  if (Loop == 0) return;
+  PrintParentLoopComment(OS, Loop->getParentLoop(), FunctionNumber);
+  OS.indent(Loop->getLoopDepth()*2)
+    << "Parent Loop BB" << FunctionNumber << "_"
+    << Loop->getHeader()->getNumber()
+    << " Depth=" << Loop->getLoopDepth() << '\n';
+}
+
+
+/// PrintChildLoopComment - Print comments about child loops within
+/// the loop for this basic block, with nesting.
+static void PrintChildLoopComment(raw_ostream &OS, const MachineLoop *Loop,
+                                  unsigned FunctionNumber) {
+  // Add child loop information
+  for (MachineLoop::iterator CL = Loop->begin(), E = Loop->end();CL != E; ++CL){
+    OS.indent((*CL)->getLoopDepth()*2)
+      << "Child Loop BB" << FunctionNumber << "_"
+      << (*CL)->getHeader()->getNumber() << " Depth " << (*CL)->getLoopDepth()
+      << '\n';
+    PrintChildLoopComment(OS, *CL, FunctionNumber);
+  }
+}
+
+/// emitBasicBlockLoopComments - Pretty-print comments for basic blocks.
+static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
+                                       const MachineLoopInfo *LI,
+                                       const AsmPrinter &AP) {
+  // Add loop depth information
+  const MachineLoop *Loop = LI->getLoopFor(&MBB);
+  if (Loop == 0) return;
+
+  MachineBasicBlock *Header = Loop->getHeader();
+  assert(Header && "No header for loop");
+
+  // If this block is not a loop header, just print out what is the loop header
+  // and return.
+  if (Header != &MBB) {
+    AP.OutStreamer.AddComment("  in Loop: Header=BB" +
+                              Twine(AP.getFunctionNumber())+"_" +
+                              Twine(Loop->getHeader()->getNumber())+
+                              " Depth="+Twine(Loop->getLoopDepth()));
+    return;
+  }
+
+  // Otherwise, it is a loop header.  Print out information about child and
+  // parent loops.
+  raw_ostream &OS = AP.OutStreamer.GetCommentOS();
+
+  PrintParentLoopComment(OS, Loop->getParentLoop(), AP.getFunctionNumber());
+
+  OS << "=>";
+  OS.indent(Loop->getLoopDepth()*2-2);
+
+  OS << "This ";
+  if (Loop->empty())
+    OS << "Inner ";
+  OS << "Loop Header: Depth=" + Twine(Loop->getLoopDepth()) << '\n';
+
+  PrintChildLoopComment(OS, Loop, AP.getFunctionNumber());
+}
+
+
+/// EmitBasicBlockStart - This method prints the label for the specified
+/// MachineBasicBlock, an alignment (if present) and a comment describing
+/// it if appropriate.
+void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock *MBB) const {
+  // Emit an alignment directive for this block, if needed.
+  if (unsigned Align = MBB->getAlignment())
+    EmitAlignment(Align);
+
+  // If the block has its address taken, emit any labels that were used to
+  // reference the block.  It is possible that there is more than one label
+  // here, because multiple LLVM BB's may have been RAUW'd to this block after
+  // the references were generated.
+  if (MBB->hasAddressTaken()) {
+    const BasicBlock *BB = MBB->getBasicBlock();
+    if (isVerbose())
+      OutStreamer.AddComment("Block address taken");
+
+    std::vector<MCSymbol*> Syms = MMI->getAddrLabelSymbolToEmit(BB);
+
+    for (unsigned i = 0, e = Syms.size(); i != e; ++i)
+      OutStreamer.EmitLabel(Syms[i]);
+  }
+
+  // Print some verbose block comments.
+  if (isVerbose()) {
+    if (const BasicBlock *BB = MBB->getBasicBlock())
+      if (BB->hasName())
+        OutStreamer.AddComment("%" + BB->getName());
+    emitBasicBlockLoopComments(*MBB, LI, *this);
+  }
+
+  // Print the main label for the block.
+  if (MBB->pred_empty() || isBlockOnlyReachableByFallthrough(MBB)) {
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      // NOTE: Want this comment at start of line, don't emit with AddComment.
+      OutStreamer.EmitRawText(Twine(MAI->getCommentString()) + " BB#" +
+                              Twine(MBB->getNumber()) + ":");
+    }
+  } else {
+    OutStreamer.EmitLabel(MBB->getSymbol());
+  }
+}
+
+void AsmPrinter::EmitVisibility(MCSymbol *Sym, unsigned Visibility,
+                                bool IsDefinition) const {
+  MCSymbolAttr Attr = MCSA_Invalid;
+
+  switch (Visibility) {
+  default: break;
+  case GlobalValue::HiddenVisibility:
+    if (IsDefinition)
+      Attr = MAI->getHiddenVisibilityAttr();
+    else
+      Attr = MAI->getHiddenDeclarationVisibilityAttr();
+    break;
+  case GlobalValue::ProtectedVisibility:
+    Attr = MAI->getProtectedVisibilityAttr();
+    break;
+  }
+
+  if (Attr != MCSA_Invalid)
+    OutStreamer.EmitSymbolAttribute(Sym, Attr);
+}
+
+/// isBlockOnlyReachableByFallthough - Return true if the basic block has
+/// exactly one predecessor and the control transfer mechanism between
+/// the predecessor and this block is a fall-through.
+bool AsmPrinter::
+isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+  // If this is a landing pad, it isn't a fall through.  If it has no preds,
+  // then nothing falls through to it.
+  if (MBB->isLandingPad() || MBB->pred_empty())
+    return false;
+
+  // If there isn't exactly one predecessor, it can't be a fall through.
+  MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
+  ++PI2;
+  if (PI2 != MBB->pred_end())
+    return false;
+
+  // The predecessor has to be immediately before this block.
+  MachineBasicBlock *Pred = *PI;
+
+  if (!Pred->isLayoutSuccessor(MBB))
+    return false;
+
+  // If the block is completely empty, then it definitely does fall through.
+  if (Pred->empty())
+    return true;
+
+  // Check the terminators in the previous blocks
+  for (MachineBasicBlock::iterator II = Pred->getFirstTerminator(),
+         IE = Pred->end(); II != IE; ++II) {
+    MachineInstr &MI = *II;
+
+    // If it is not a simple branch, we are in a table somewhere.
+    if (!MI.isBranch() || MI.isIndirectBranch())
+      return false;
+
+    // If we are the operands of one of the branches, this is not
+    // a fall through.
+    for (MachineInstr::mop_iterator OI = MI.operands_begin(),
+           OE = MI.operands_end(); OI != OE; ++OI) {
+      const MachineOperand& OP = *OI;
+      if (OP.isJTI())
+        return false;
+      if (OP.isMBB() && OP.getMBB() == MBB)
+        return false;
+    }
+  }
+
+  return true;
+}
+
+
+
+GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy *S) {
+  if (!S->usesMetadata())
+    return 0;
+
+  gcp_map_type &GCMap = getGCMap(GCMetadataPrinters);
+  gcp_map_type::iterator GCPI = GCMap.find(S);
+  if (GCPI != GCMap.end())
+    return GCPI->second;
+
+  const char *Name = S->getName().c_str();
+
+  for (GCMetadataPrinterRegistry::iterator
+         I = GCMetadataPrinterRegistry::begin(),
+         E = GCMetadataPrinterRegistry::end(); I != E; ++I)
+    if (strcmp(Name, I->getName()) == 0) {
+      GCMetadataPrinter *GMP = I->instantiate();
+      GMP->S = S;
+      GCMap.insert(std::make_pair(S, GMP));
+      return GMP;
+    }
+
+  report_fatal_error("no GCMetadataPrinter registered for GC: " + Twine(Name));
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
new file mode 100644
index 000000000000..156acace553d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -0,0 +1,196 @@
+//===-- AsmPrinterDwarf.cpp - AsmPrinter Dwarf Support --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dwarf emissions parts of AsmPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Dwarf Emission Helper Routines
+//===----------------------------------------------------------------------===//
+
+/// EmitSLEB128 - emit the specified signed leb128 value.
+void AsmPrinter::EmitSLEB128(int Value, const char *Desc) const {
+  if (isVerbose() && Desc)
+    OutStreamer.AddComment(Desc);
+
+  OutStreamer.EmitSLEB128IntValue(Value);
+}
+
+/// EmitULEB128 - emit the specified signed leb128 value.
+void AsmPrinter::EmitULEB128(unsigned Value, const char *Desc,
+                             unsigned PadTo) const {
+  if (isVerbose() && Desc)
+    OutStreamer.AddComment(Desc);
+
+  OutStreamer.EmitULEB128IntValue(Value, PadTo);
+}
+
+/// EmitCFAByte - Emit a .byte 42 directive for a DW_CFA_xxx value.
+void AsmPrinter::EmitCFAByte(unsigned Val) const {
+  if (isVerbose()) {
+    if (Val >= dwarf::DW_CFA_offset && Val < dwarf::DW_CFA_offset+64)
+      OutStreamer.AddComment("DW_CFA_offset + Reg (" +
+                             Twine(Val-dwarf::DW_CFA_offset) + ")");
+    else
+      OutStreamer.AddComment(dwarf::CallFrameString(Val));
+  }
+  OutStreamer.EmitIntValue(Val, 1);
+}
+
+static const char *DecodeDWARFEncoding(unsigned Encoding) {
+  switch (Encoding) {
+  case dwarf::DW_EH_PE_absptr: return "absptr";
+  case dwarf::DW_EH_PE_omit:   return "omit";
+  case dwarf::DW_EH_PE_pcrel:  return "pcrel";
+  case dwarf::DW_EH_PE_udata4: return "udata4";
+  case dwarf::DW_EH_PE_udata8: return "udata8";
+  case dwarf::DW_EH_PE_sdata4: return "sdata4";
+  case dwarf::DW_EH_PE_sdata8: return "sdata8";
+  case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata4: return "pcrel udata4";
+  case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4: return "pcrel sdata4";
+  case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8: return "pcrel udata8";
+  case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8: return "pcrel sdata8";
+  case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |dwarf::DW_EH_PE_udata4:
+    return "indirect pcrel udata4";
+  case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |dwarf::DW_EH_PE_sdata4:
+    return "indirect pcrel sdata4";
+  case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |dwarf::DW_EH_PE_udata8:
+    return "indirect pcrel udata8";
+  case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |dwarf::DW_EH_PE_sdata8:
+    return "indirect pcrel sdata8";
+  }
+
+  return "<unknown encoding>";
+}
+
+
+/// EmitEncodingByte - Emit a .byte 42 directive that corresponds to an
+/// encoding.  If verbose assembly output is enabled, we output comments
+/// describing the encoding.  Desc is an optional string saying what the
+/// encoding is specifying (e.g. "LSDA").
+void AsmPrinter::EmitEncodingByte(unsigned Val, const char *Desc) const {
+  if (isVerbose()) {
+    if (Desc != 0)
+      OutStreamer.AddComment(Twine(Desc)+" Encoding = " +
+                             Twine(DecodeDWARFEncoding(Val)));
+    else
+      OutStreamer.AddComment(Twine("Encoding = ") +
+                             DecodeDWARFEncoding(Val));
+  }
+
+  OutStreamer.EmitIntValue(Val, 1);
+}
+
+/// GetSizeOfEncodedValue - Return the size of the encoding in bytes.
+unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const {
+  if (Encoding == dwarf::DW_EH_PE_omit)
+    return 0;
+
+  switch (Encoding & 0x07) {
+  default: llvm_unreachable("Invalid encoded value.");
+  case dwarf::DW_EH_PE_absptr: return TM.getDataLayout()->getPointerSize();
+  case dwarf::DW_EH_PE_udata2: return 2;
+  case dwarf::DW_EH_PE_udata4: return 4;
+  case dwarf::DW_EH_PE_udata8: return 8;
+  }
+}
+
+void AsmPrinter::EmitTTypeReference(const GlobalValue *GV,
+                                    unsigned Encoding) const {
+  if (GV) {
+    const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+
+    const MCExpr *Exp =
+      TLOF.getTTypeGlobalReference(GV, Mang, MMI, Encoding, OutStreamer);
+    OutStreamer.EmitValue(Exp, GetSizeOfEncodedValue(Encoding));
+  } else
+    OutStreamer.EmitIntValue(0, GetSizeOfEncodedValue(Encoding));
+}
+
+/// EmitSectionOffset - Emit the 4-byte offset of Label from the start of its
+/// section.  This can be done with a special directive if the target supports
+/// it (e.g. cygwin) or by emitting it as an offset from a label at the start
+/// of the section.
+///
+/// SectionLabel is a temporary label emitted at the start of the section that
+/// Label lives in.
+void AsmPrinter::EmitSectionOffset(const MCSymbol *Label,
+                                   const MCSymbol *SectionLabel) const {
+  // On COFF targets, we have to emit the special .secrel32 directive.
+  if (MAI->getDwarfSectionOffsetDirective()) {
+    OutStreamer.EmitCOFFSecRel32(Label);
+    return;
+  }
+
+  // Get the section that we're referring to, based on SectionLabel.
+  const MCSection &Section = SectionLabel->getSection();
+
+  // If Label has already been emitted, verify that it is in the same section as
+  // section label for sanity.
+  assert((!Label->isInSection() || &Label->getSection() == &Section) &&
+         "Section offset using wrong section base for label");
+
+  // If the section in question will end up with an address of 0 anyway, we can
+  // just emit an absolute reference to save a relocation.
+  if (Section.isBaseAddressKnownZero()) {
+    OutStreamer.EmitSymbolValue(Label, 4);
+    return;
+  }
+
+  // Otherwise, emit it as a label difference from the start of the section.
+  EmitLabelDifference(Label, SectionLabel, 4);
+}
+
+//===----------------------------------------------------------------------===//
+// Dwarf Lowering Routines
+//===----------------------------------------------------------------------===//
+
+/// EmitCFIFrameMove - Emit a frame instruction.
+void AsmPrinter::EmitCFIFrameMove(const MachineMove &Move) const {
+  const TargetRegisterInfo *RI = TM.getRegisterInfo();
+
+  const MachineLocation &Dst = Move.getDestination();
+  const MachineLocation &Src = Move.getSource();
+
+  // If advancing cfa.
+  if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
+    if (Src.getReg() == MachineLocation::VirtualFP) {
+      OutStreamer.EmitCFIDefCfaOffset(-Src.getOffset());
+    } else {
+      // Reg + Offset
+      OutStreamer.EmitCFIDefCfa(RI->getDwarfRegNum(Src.getReg(), true),
+                                Src.getOffset());
+    }
+  } else if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) {
+    assert(Dst.isReg() && "Machine move not supported yet.");
+    OutStreamer.EmitCFIDefCfaRegister(RI->getDwarfRegNum(Dst.getReg(), true));
+  } else {
+    assert(!Dst.isReg() && "Machine move not supported yet.");
+    OutStreamer.EmitCFIOffset(RI->getDwarfRegNum(Src.getReg(), true),
+                              Dst.getOffset());
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
new file mode 100644
index 000000000000..abfa330fa29d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -0,0 +1,553 @@
+//===-- AsmPrinterInlineAsm.cpp - AsmPrinter Inline Asm Handling ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the inline assembler pieces of the AsmPrinter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+namespace {
+  struct SrcMgrDiagInfo {
+    const MDNode *LocInfo;
+    LLVMContext::InlineAsmDiagHandlerTy DiagHandler;
+    void *DiagContext;
+  };
+}
+
+/// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an
+/// inline asm has an error in it.  diagInfo is a pointer to the SrcMgrDiagInfo
+/// struct above.
+static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
+  SrcMgrDiagInfo *DiagInfo = static_cast<SrcMgrDiagInfo *>(diagInfo);
+  assert(DiagInfo && "Diagnostic context not passed down?");
+
+  // If the inline asm had metadata associated with it, pull out a location
+  // cookie corresponding to which line the error occurred on.
+  unsigned LocCookie = 0;
+  if (const MDNode *LocInfo = DiagInfo->LocInfo) {
+    unsigned ErrorLine = Diag.getLineNo()-1;
+    if (ErrorLine >= LocInfo->getNumOperands())
+      ErrorLine = 0;
+
+    if (LocInfo->getNumOperands() != 0)
+      if (const ConstantInt *CI =
+          dyn_cast<ConstantInt>(LocInfo->getOperand(ErrorLine)))
+        LocCookie = CI->getZExtValue();
+  }
+
+  DiagInfo->DiagHandler(Diag, DiagInfo->DiagContext, LocCookie);
+}
+
+/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
+void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
+                               InlineAsm::AsmDialect Dialect) const {
+  assert(!Str.empty() && "Can't emit empty inline asm block");
+
+  // Remember if the buffer is nul terminated or not so we can avoid a copy.
+  bool isNullTerminated = Str.back() == 0;
+  if (isNullTerminated)
+    Str = Str.substr(0, Str.size()-1);
+
+  // If the output streamer is actually a .s file, just emit the blob textually.
+  // This is useful in case the asm parser doesn't handle something but the
+  // system assembler does.
+  if (OutStreamer.hasRawTextSupport()) {
+    OutStreamer.EmitRawText(Str);
+    return;
+  }
+
+  SourceMgr SrcMgr;
+  SrcMgrDiagInfo DiagInfo;
+
+  // If the current LLVMContext has an inline asm handler, set it in SourceMgr.
+  LLVMContext &LLVMCtx = MMI->getModule()->getContext();
+  bool HasDiagHandler = false;
+  if (LLVMCtx.getInlineAsmDiagnosticHandler() != 0) {
+    // If the source manager has an issue, we arrange for srcMgrDiagHandler
+    // to be invoked, getting DiagInfo passed into it.
+    DiagInfo.LocInfo = LocMDNode;
+    DiagInfo.DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler();
+    DiagInfo.DiagContext = LLVMCtx.getInlineAsmDiagnosticContext();
+    SrcMgr.setDiagHandler(srcMgrDiagHandler, &DiagInfo);
+    HasDiagHandler = true;
+  }
+
+  MemoryBuffer *Buffer;
+  if (isNullTerminated)
+    Buffer = MemoryBuffer::getMemBuffer(Str, "<inline asm>");
+  else
+    Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>");
+
+  // Tell SrcMgr about this buffer, it takes ownership of the buffer.
+  SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
+
+  OwningPtr<MCAsmParser> Parser(createMCAsmParser(SrcMgr,
+                                                  OutContext, OutStreamer,
+                                                  *MAI));
+
+  // FIXME: It would be nice if we can avoid createing a new instance of
+  // MCSubtargetInfo here given TargetSubtargetInfo is available. However,
+  // we have to watch out for asm directives which can change subtarget
+  // state. e.g. .code 16, .code 32.
+  OwningPtr<MCSubtargetInfo>
+    STI(TM.getTarget().createMCSubtargetInfo(TM.getTargetTriple(),
+                                             TM.getTargetCPU(),
+                                             TM.getTargetFeatureString()));
+  OwningPtr<MCTargetAsmParser>
+    TAP(TM.getTarget().createMCAsmParser(*STI, *Parser));
+  if (!TAP)
+    report_fatal_error("Inline asm not supported by this streamer because"
+                       " we don't have an asm parser for this target\n");
+  Parser->setAssemblerDialect(Dialect);
+  Parser->setTargetParser(*TAP.get());
+
+  // Don't implicitly switch to the text section before the asm.
+  int Res = Parser->Run(/*NoInitialTextSection*/ true,
+                        /*NoFinalize*/ true);
+  if (Res && !HasDiagHandler)
+    report_fatal_error("Error parsing inline asm\n");
+}
+
+static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
+                               MachineModuleInfo *MMI, int InlineAsmVariant,
+                               AsmPrinter *AP, unsigned LocCookie,
+                               raw_ostream &OS) {
+  // Switch to the inline assembly variant.
+  OS << "\t.intel_syntax\n\t";
+
+  const char *LastEmitted = AsmStr; // One past the last character emitted.
+  unsigned NumOperands = MI->getNumOperands();
+
+  while (*LastEmitted) {
+    switch (*LastEmitted) {
+    default: {
+      // Not a special case, emit the string section literally.
+      const char *LiteralEnd = LastEmitted+1;
+      while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
+             *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
+        ++LiteralEnd;
+
+      OS.write(LastEmitted, LiteralEnd-LastEmitted);
+      LastEmitted = LiteralEnd;
+      break;
+    }
+    case '\n':
+      ++LastEmitted;   // Consume newline character.
+      OS << '\n';      // Indent code with newline.
+      break;
+    case '$': {
+      ++LastEmitted;   // Consume '$' character.
+      bool Done = true;
+
+      // Handle escapes.
+      switch (*LastEmitted) {
+      default: Done = false; break;
+      case '$':
+        ++LastEmitted;  // Consume second '$' character.
+        break;
+      }
+      if (Done) break;
+
+      const char *IDStart = LastEmitted;
+      const char *IDEnd = IDStart;
+      while (*IDEnd >= '0' && *IDEnd <= '9') ++IDEnd;
+
+      unsigned Val;
+      if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
+        report_fatal_error("Bad $ operand number in inline asm string: '" +
+                           Twine(AsmStr) + "'");
+      LastEmitted = IDEnd;
+
+      if (Val >= NumOperands-1)
+        report_fatal_error("Invalid $ operand number in inline asm string: '" +
+                           Twine(AsmStr) + "'");
+
+      // Okay, we finally have a value number.  Ask the target to print this
+      // operand!
+      unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+
+      bool Error = false;
+
+      // Scan to find the machine operand number for the operand.
+      for (; Val; --Val) {
+        if (OpNo >= MI->getNumOperands()) break;
+        unsigned OpFlags = MI->getOperand(OpNo).getImm();
+        OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+      }
+
+      // We may have a location metadata attached to the end of the
+      // instruction, and at no point should see metadata at any
+      // other point while processing. It's an error if so.
+      if (OpNo >= MI->getNumOperands() ||
+          MI->getOperand(OpNo).isMetadata()) {
+        Error = true;
+      } else {
+        unsigned OpFlags = MI->getOperand(OpNo).getImm();
+        ++OpNo;  // Skip over the ID number.
+        
+        if (InlineAsm::isMemKind(OpFlags)) {
+          Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
+                                            /*Modifier*/ 0, OS);
+        } else {
+          Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant,
+                                      /*Modifier*/ 0, OS);
+        }
+      }
+      if (Error) {
+        std::string msg;
+        raw_string_ostream Msg(msg);
+        Msg << "invalid operand in inline asm: '" << AsmStr << "'";
+        MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
+      }
+      break;
+    }
+    }
+  }
+  OS << "\n\t.att_syntax\n" << (char)0;  // null terminate string.
+}
+
+static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
+                                MachineModuleInfo *MMI, int InlineAsmVariant,
+                                int AsmPrinterVariant, AsmPrinter *AP,
+                                unsigned LocCookie, raw_ostream &OS) {
+  int CurVariant = -1;            // The number of the {.|.|.} region we are in.
+  const char *LastEmitted = AsmStr; // One past the last character emitted.
+  unsigned NumOperands = MI->getNumOperands();
+
+  OS << '\t';
+
+  while (*LastEmitted) {
+    switch (*LastEmitted) {
+    default: {
+      // Not a special case, emit the string section literally.
+      const char *LiteralEnd = LastEmitted+1;
+      while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
+             *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
+        ++LiteralEnd;
+      if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+        OS.write(LastEmitted, LiteralEnd-LastEmitted);
+      LastEmitted = LiteralEnd;
+      break;
+    }
+    case '\n':
+      ++LastEmitted;   // Consume newline character.
+      OS << '\n';      // Indent code with newline.
+      break;
+    case '$': {
+      ++LastEmitted;   // Consume '$' character.
+      bool Done = true;
+
+      // Handle escapes.
+      switch (*LastEmitted) {
+      default: Done = false; break;
+      case '$':     // $$ -> $
+        if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+          OS << '$';
+        ++LastEmitted;  // Consume second '$' character.
+        break;
+      case '(':             // $( -> same as GCC's { character.
+        ++LastEmitted;      // Consume '(' character.
+        if (CurVariant != -1)
+          report_fatal_error("Nested variants found in inline asm string: '" +
+                             Twine(AsmStr) + "'");
+        CurVariant = 0;     // We're in the first variant now.
+        break;
+      case '|':
+        ++LastEmitted;  // consume '|' character.
+        if (CurVariant == -1)
+          OS << '|';       // this is gcc's behavior for | outside a variant
+        else
+          ++CurVariant;   // We're in the next variant.
+        break;
+      case ')':         // $) -> same as GCC's } char.
+        ++LastEmitted;  // consume ')' character.
+        if (CurVariant == -1)
+          OS << '}';     // this is gcc's behavior for } outside a variant
+        else
+          CurVariant = -1;
+        break;
+      }
+      if (Done) break;
+
+      bool HasCurlyBraces = false;
+      if (*LastEmitted == '{') {     // ${variable}
+        ++LastEmitted;               // Consume '{' character.
+        HasCurlyBraces = true;
+      }
+
+      // If we have ${:foo}, then this is not a real operand reference, it is a
+      // "magic" string reference, just like in .td files.  Arrange to call
+      // PrintSpecial.
+      if (HasCurlyBraces && *LastEmitted == ':') {
+        ++LastEmitted;
+        const char *StrStart = LastEmitted;
+        const char *StrEnd = strchr(StrStart, '}');
+        if (StrEnd == 0)
+          report_fatal_error("Unterminated ${:foo} operand in inline asm"
+                             " string: '" + Twine(AsmStr) + "'");
+
+        std::string Val(StrStart, StrEnd);
+        AP->PrintSpecial(MI, OS, Val.c_str());
+        LastEmitted = StrEnd+1;
+        break;
+      }
+
+      const char *IDStart = LastEmitted;
+      const char *IDEnd = IDStart;
+      while (*IDEnd >= '0' && *IDEnd <= '9') ++IDEnd;
+
+      unsigned Val;
+      if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
+        report_fatal_error("Bad $ operand number in inline asm string: '" +
+                           Twine(AsmStr) + "'");
+      LastEmitted = IDEnd;
+
+      char Modifier[2] = { 0, 0 };
+
+      if (HasCurlyBraces) {
+        // If we have curly braces, check for a modifier character.  This
+        // supports syntax like ${0:u}, which correspond to "%u0" in GCC asm.
+        if (*LastEmitted == ':') {
+          ++LastEmitted;    // Consume ':' character.
+          if (*LastEmitted == 0)
+            report_fatal_error("Bad ${:} expression in inline asm string: '" +
+                               Twine(AsmStr) + "'");
+
+          Modifier[0] = *LastEmitted;
+          ++LastEmitted;    // Consume modifier character.
+        }
+
+        if (*LastEmitted != '}')
+          report_fatal_error("Bad ${} expression in inline asm string: '" +
+                             Twine(AsmStr) + "'");
+        ++LastEmitted;    // Consume '}' character.
+      }
+
+      if (Val >= NumOperands-1)
+        report_fatal_error("Invalid $ operand number in inline asm string: '" +
+                           Twine(AsmStr) + "'");
+
+      // Okay, we finally have a value number.  Ask the target to print this
+      // operand!
+      if (CurVariant == -1 || CurVariant == AsmPrinterVariant) {
+        unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+
+        bool Error = false;
+
+        // Scan to find the machine operand number for the operand.
+        for (; Val; --Val) {
+          if (OpNo >= MI->getNumOperands()) break;
+          unsigned OpFlags = MI->getOperand(OpNo).getImm();
+          OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+        }
+
+        // We may have a location metadata attached to the end of the
+        // instruction, and at no point should see metadata at any
+        // other point while processing. It's an error if so.
+        if (OpNo >= MI->getNumOperands() ||
+            MI->getOperand(OpNo).isMetadata()) {
+          Error = true;
+        } else {
+          unsigned OpFlags = MI->getOperand(OpNo).getImm();
+          ++OpNo;  // Skip over the ID number.
+
+          if (Modifier[0] == 'l')  // labels are target independent
+            // FIXME: What if the operand isn't an MBB, report error?
+            OS << *MI->getOperand(OpNo).getMBB()->getSymbol();
+          else {
+            if (InlineAsm::isMemKind(OpFlags)) {
+              Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
+                                                Modifier[0] ? Modifier : 0,
+                                                OS);
+            } else {
+              Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant,
+                                          Modifier[0] ? Modifier : 0, OS);
+            }
+          }
+        }
+        if (Error) {
+          std::string msg;
+          raw_string_ostream Msg(msg);
+          Msg << "invalid operand in inline asm: '" << AsmStr << "'";
+          MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
+        }
+      }
+      break;
+    }
+    }
+  }
+  OS << '\n' << (char)0;  // null terminate string.
+}
+
+/// EmitInlineAsm - This method formats and emits the specified machine
+/// instruction that is an inline asm.
+void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
+  assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");
+
+  // Count the number of register definitions to find the asm string.
+  unsigned NumDefs = 0;
+  for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
+       ++NumDefs)
+    assert(NumDefs != MI->getNumOperands()-2 && "No asm string?");
+
+  assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");
+
+  // Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
+  const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
+
+  // If this asmstr is empty, just print the #APP/#NOAPP markers.
+  // These are useful to see where empty asm's wound up.
+  if (AsmStr[0] == 0) {
+    // Don't emit the comments if writing to a .o file.
+    if (!OutStreamer.hasRawTextSupport()) return;
+
+    OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+                            MAI->getInlineAsmStart());
+    OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+                            MAI->getInlineAsmEnd());
+    return;
+  }
+
+  // Emit the #APP start marker.  This has to happen even if verbose-asm isn't
+  // enabled, so we use EmitRawText.
+  if (OutStreamer.hasRawTextSupport())
+    OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+                            MAI->getInlineAsmStart());
+
+  // Get the !srcloc metadata node if we have it, and decode the loc cookie from
+  // it.
+  unsigned LocCookie = 0;
+  const MDNode *LocMD = 0;
+  for (unsigned i = MI->getNumOperands(); i != 0; --i) {
+    if (MI->getOperand(i-1).isMetadata() &&
+        (LocMD = MI->getOperand(i-1).getMetadata()) &&
+        LocMD->getNumOperands() != 0) {
+      if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
+        LocCookie = CI->getZExtValue();
+        break;
+      }
+    }
+  }
+
+  // Emit the inline asm to a temporary string so we can emit it through
+  // EmitInlineAsm.
+  SmallString<256> StringData;
+  raw_svector_ostream OS(StringData);
+
+  // The variant of the current asmprinter.
+  int AsmPrinterVariant = MAI->getAssemblerDialect();
+  InlineAsm::AsmDialect InlineAsmVariant = MI->getInlineAsmDialect();
+  AsmPrinter *AP = const_cast<AsmPrinter*>(this);
+  if (InlineAsmVariant == InlineAsm::AD_ATT)
+    EmitGCCInlineAsmStr(AsmStr, MI, MMI, InlineAsmVariant, AsmPrinterVariant,
+                        AP, LocCookie, OS);
+  else
+    EmitMSInlineAsmStr(AsmStr, MI, MMI, InlineAsmVariant, AP, LocCookie, OS);
+
+  EmitInlineAsm(OS.str(), LocMD, MI->getInlineAsmDialect());
+
+  // Emit the #NOAPP end marker.  This has to happen even if verbose-asm isn't
+  // enabled, so we use EmitRawText.
+  if (OutStreamer.hasRawTextSupport())
+    OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+                            MAI->getInlineAsmEnd());
+}
+
+
+/// PrintSpecial - Print information related to the specified machine instr
+/// that is independent of the operand, and may be independent of the instr
+/// itself.  This can be useful for portably encoding the comment character
+/// or other bits of target-specific knowledge into the asmstrings.  The
+/// syntax used is ${:comment}.  Targets can override this to add support
+/// for their own strange codes.
+void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
+                              const char *Code) const {
+  if (!strcmp(Code, "private")) {
+    OS << MAI->getPrivateGlobalPrefix();
+  } else if (!strcmp(Code, "comment")) {
+    OS << MAI->getCommentString();
+  } else if (!strcmp(Code, "uid")) {
+    // Comparing the address of MI isn't sufficient, because machineinstrs may
+    // be allocated to the same address across functions.
+
+    // If this is a new LastFn instruction, bump the counter.
+    if (LastMI != MI || LastFn != getFunctionNumber()) {
+      ++Counter;
+      LastMI = MI;
+      LastFn = getFunctionNumber();
+    }
+    OS << Counter;
+  } else {
+    std::string msg;
+    raw_string_ostream Msg(msg);
+    Msg << "Unknown special formatter '" << Code
+         << "' for machine instr: " << *MI;
+    report_fatal_error(Msg.str());
+  }
+}
+
+/// PrintAsmOperand - Print the specified operand of MI, an INLINEASM
+/// instruction, using the specified assembler variant.  Targets should
+/// override this to format as appropriate.
+bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                 unsigned AsmVariant, const char *ExtraCode,
+                                 raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    const MachineOperand &MO = MI->getOperand(OpNo);
+    switch (ExtraCode[0]) {
+    default:
+      return true;  // Unknown modifier.
+    case 'c': // Substitute immediate value without immediate syntax
+      if (MO.getType() != MachineOperand::MO_Immediate)
+        return true;
+      O << MO.getImm();
+      return false;
+    case 'n':  // Negate the immediate constant.
+      if (MO.getType() != MachineOperand::MO_Immediate)
+        return true;
+      O << -MO.getImm();
+      return false;
+    }
+  }
+  return true;
+}
+
+bool AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode, raw_ostream &O) {
+  // Target doesn't support this yet!
+  return true;
+}
+
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
new file mode 100644
index 000000000000..57e0acda890f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -0,0 +1,368 @@
+//===--- lib/CodeGen/DIE.cpp - DWARF Info Entries -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Data structures for DWARF info entries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DIE.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// DIEAbbrevData Implementation
+//===----------------------------------------------------------------------===//
+
+/// Profile - Used to gather unique data for the abbreviation folding set.
+///
+void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
+  ID.AddInteger(Attribute);
+  ID.AddInteger(Form);
+}
+
+//===----------------------------------------------------------------------===//
+// DIEAbbrev Implementation
+//===----------------------------------------------------------------------===//
+
+/// Profile - Used to gather unique data for the abbreviation folding set.
+///
+void DIEAbbrev::Profile(FoldingSetNodeID &ID) const {
+  ID.AddInteger(Tag);
+  ID.AddInteger(ChildrenFlag);
+
+  // For each attribute description.
+  for (unsigned i = 0, N = Data.size(); i < N; ++i)
+    Data[i].Profile(ID);
+}
+
+/// Emit - Print the abbreviation using the specified asm printer.
+///
+void DIEAbbrev::Emit(AsmPrinter *AP) const {
+  // Emit its Dwarf tag type.
+  // FIXME: Doing work even in non-asm-verbose runs.
+  AP->EmitULEB128(Tag, dwarf::TagString(Tag));
+
+  // Emit whether it has children DIEs.
+  // FIXME: Doing work even in non-asm-verbose runs.
+  AP->EmitULEB128(ChildrenFlag, dwarf::ChildrenString(ChildrenFlag));
+
+  // For each attribute description.
+  for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+    const DIEAbbrevData &AttrData = Data[i];
+
+    // Emit attribute type.
+    // FIXME: Doing work even in non-asm-verbose runs.
+    AP->EmitULEB128(AttrData.getAttribute(),
+                    dwarf::AttributeString(AttrData.getAttribute()));
+
+    // Emit form type.
+    // FIXME: Doing work even in non-asm-verbose runs.
+    AP->EmitULEB128(AttrData.getForm(),
+                    dwarf::FormEncodingString(AttrData.getForm()));
+  }
+
+  // Mark end of abbreviation.
+  AP->EmitULEB128(0, "EOM(1)");
+  AP->EmitULEB128(0, "EOM(2)");
+}
+
+#ifndef NDEBUG
+void DIEAbbrev::print(raw_ostream &O) {
+  O << "Abbreviation @"
+    << format("0x%lx", (long)(intptr_t)this)
+    << "  "
+    << dwarf::TagString(Tag)
+    << " "
+    << dwarf::ChildrenString(ChildrenFlag)
+    << '\n';
+
+  for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+    O << "  "
+      << dwarf::AttributeString(Data[i].getAttribute())
+      << "  "
+      << dwarf::FormEncodingString(Data[i].getForm())
+      << '\n';
+  }
+}
+void DIEAbbrev::dump() { print(dbgs()); }
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIE Implementation
+//===----------------------------------------------------------------------===//
+
+DIE::~DIE() {
+  for (unsigned i = 0, N = Children.size(); i < N; ++i)
+    delete Children[i];
+}
+
+/// Climb up the parent chain to get the compile unit DIE this DIE belongs to.
+DIE *DIE::getCompileUnit() const{
+  DIE *p = getParent();
+  while (p) {
+    if (p->getTag() == dwarf::DW_TAG_compile_unit)
+      return p;
+    p = p->getParent();
+  }
+  llvm_unreachable("We should not have orphaned DIEs.");
+}
+
+#ifndef NDEBUG
+void DIE::print(raw_ostream &O, unsigned IncIndent) {
+  IndentCount += IncIndent;
+  const std::string Indent(IndentCount, ' ');
+  bool isBlock = Abbrev.getTag() == 0;
+
+  if (!isBlock) {
+    O << Indent
+      << "Die: "
+      << format("0x%lx", (long)(intptr_t)this)
+      << ", Offset: " << Offset
+      << ", Size: " << Size << "\n";
+
+    O << Indent
+      << dwarf::TagString(Abbrev.getTag())
+      << " "
+      << dwarf::ChildrenString(Abbrev.getChildrenFlag()) << "\n";
+  } else {
+    O << "Size: " << Size << "\n";
+  }
+
+  const SmallVectorImpl<DIEAbbrevData> &Data = Abbrev.getData();
+
+  IndentCount += 2;
+  for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+    O << Indent;
+
+    if (!isBlock)
+      O << dwarf::AttributeString(Data[i].getAttribute());
+    else
+      O << "Blk[" << i << "]";
+
+    O <<  "  "
+      << dwarf::FormEncodingString(Data[i].getForm())
+      << " ";
+    Values[i]->print(O);
+    O << "\n";
+  }
+  IndentCount -= 2;
+
+  for (unsigned j = 0, M = Children.size(); j < M; ++j) {
+    Children[j]->print(O, 4);
+  }
+
+  if (!isBlock) O << "\n";
+  IndentCount -= IncIndent;
+}
+
+void DIE::dump() {
+  print(dbgs());
+}
+#endif
+
+void DIEValue::anchor() { }
+
+#ifndef NDEBUG
+void DIEValue::dump() {
+  print(dbgs());
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEInteger Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit integer of appropriate size.
+///
+void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
+  unsigned Size = ~0U;
+  switch (Form) {
+  case dwarf::DW_FORM_flag_present:
+    // Emit something to keep the lines and comments in sync.
+    // FIXME: Is there a better way to do this?
+    if (Asm->OutStreamer.hasRawTextSupport())
+      Asm->OutStreamer.EmitRawText(StringRef(""));
+    return;
+  case dwarf::DW_FORM_flag:  // Fall thru
+  case dwarf::DW_FORM_ref1:  // Fall thru
+  case dwarf::DW_FORM_data1: Size = 1; break;
+  case dwarf::DW_FORM_ref2:  // Fall thru
+  case dwarf::DW_FORM_data2: Size = 2; break;
+  case dwarf::DW_FORM_sec_offset: // Fall thru
+  case dwarf::DW_FORM_ref4:  // Fall thru
+  case dwarf::DW_FORM_data4: Size = 4; break;
+  case dwarf::DW_FORM_ref8:  // Fall thru
+  case dwarf::DW_FORM_data8: Size = 8; break;
+  case dwarf::DW_FORM_GNU_str_index: Asm->EmitULEB128(Integer); return;
+  case dwarf::DW_FORM_GNU_addr_index: Asm->EmitULEB128(Integer); return;
+  case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return;
+  case dwarf::DW_FORM_sdata: Asm->EmitSLEB128(Integer); return;
+  case dwarf::DW_FORM_addr:
+    Size = Asm->getDataLayout().getPointerSize(); break;
+  default: llvm_unreachable("DIE Value form not supported yet");
+  }
+  Asm->OutStreamer.EmitIntValue(Integer, Size);
+}
+
+/// SizeOf - Determine size of integer value in bytes.
+///
+unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const {
+  switch (Form) {
+  case dwarf::DW_FORM_flag_present: return 0;
+  case dwarf::DW_FORM_flag:  // Fall thru
+  case dwarf::DW_FORM_ref1:  // Fall thru
+  case dwarf::DW_FORM_data1: return sizeof(int8_t);
+  case dwarf::DW_FORM_ref2:  // Fall thru
+  case dwarf::DW_FORM_data2: return sizeof(int16_t);
+  case dwarf::DW_FORM_sec_offset: // Fall thru
+  case dwarf::DW_FORM_ref4:  // Fall thru
+  case dwarf::DW_FORM_data4: return sizeof(int32_t);
+  case dwarf::DW_FORM_ref8:  // Fall thru
+  case dwarf::DW_FORM_data8: return sizeof(int64_t);
+  case dwarf::DW_FORM_GNU_str_index: return MCAsmInfo::getULEB128Size(Integer);
+  case dwarf::DW_FORM_GNU_addr_index: return MCAsmInfo::getULEB128Size(Integer);
+  case dwarf::DW_FORM_udata: return MCAsmInfo::getULEB128Size(Integer);
+  case dwarf::DW_FORM_sdata: return MCAsmInfo::getSLEB128Size(Integer);
+  case dwarf::DW_FORM_addr:  return AP->getDataLayout().getPointerSize();
+  default: llvm_unreachable("DIE Value form not supported yet");
+  }
+}
+
+#ifndef NDEBUG
+void DIEInteger::print(raw_ostream &O) {
+  O << "Int: " << (int64_t)Integer << "  0x";
+  O.write_hex(Integer);
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIELabel Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit label value.
+///
+void DIELabel::EmitValue(AsmPrinter *AP, unsigned Form) const {
+  AP->OutStreamer.EmitSymbolValue(Label, SizeOf(AP, Form));
+}
+
+/// SizeOf - Determine size of label value in bytes.
+///
+unsigned DIELabel::SizeOf(AsmPrinter *AP, unsigned Form) const {
+  if (Form == dwarf::DW_FORM_data4) return 4;
+  if (Form == dwarf::DW_FORM_sec_offset) return 4;
+  if (Form == dwarf::DW_FORM_strp) return 4;
+  return AP->getDataLayout().getPointerSize();
+}
+
+#ifndef NDEBUG
+void DIELabel::print(raw_ostream &O) {
+  O << "Lbl: " << Label->getName();
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEDelta Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit delta value.
+///
+void DIEDelta::EmitValue(AsmPrinter *AP, unsigned Form) const {
+  AP->EmitLabelDifference(LabelHi, LabelLo, SizeOf(AP, Form));
+}
+
+/// SizeOf - Determine size of delta value in bytes.
+///
+unsigned DIEDelta::SizeOf(AsmPrinter *AP, unsigned Form) const {
+  if (Form == dwarf::DW_FORM_data4) return 4;
+  if (Form == dwarf::DW_FORM_strp) return 4;
+  return AP->getDataLayout().getPointerSize();
+}
+
+#ifndef NDEBUG
+void DIEDelta::print(raw_ostream &O) {
+  O << "Del: " << LabelHi->getName() << "-" << LabelLo->getName();
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEEntry Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit debug information entry offset.
+///
+void DIEEntry::EmitValue(AsmPrinter *AP, unsigned Form) const {
+  AP->EmitInt32(Entry->getOffset());
+}
+
+#ifndef NDEBUG
+void DIEEntry::print(raw_ostream &O) {
+  O << format("Die: 0x%lx", (long)(intptr_t)Entry);
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// DIEBlock Implementation
+//===----------------------------------------------------------------------===//
+
+/// ComputeSize - calculate the size of the block.
+///
+unsigned DIEBlock::ComputeSize(AsmPrinter *AP) {
+  if (!Size) {
+    const SmallVectorImpl<DIEAbbrevData> &AbbrevData = Abbrev.getData();
+    for (unsigned i = 0, N = Values.size(); i < N; ++i)
+      Size += Values[i]->SizeOf(AP, AbbrevData[i].getForm());
+  }
+
+  return Size;
+}
+
+/// EmitValue - Emit block data.
+///
+void DIEBlock::EmitValue(AsmPrinter *Asm, unsigned Form) const {
+  switch (Form) {
+  default: llvm_unreachable("Improper form for block");
+  case dwarf::DW_FORM_block1: Asm->EmitInt8(Size);    break;
+  case dwarf::DW_FORM_block2: Asm->EmitInt16(Size);   break;
+  case dwarf::DW_FORM_block4: Asm->EmitInt32(Size);   break;
+  case dwarf::DW_FORM_block:  Asm->EmitULEB128(Size); break;
+  }
+
+  const SmallVectorImpl<DIEAbbrevData> &AbbrevData = Abbrev.getData();
+  for (unsigned i = 0, N = Values.size(); i < N; ++i)
+    Values[i]->EmitValue(Asm, AbbrevData[i].getForm());
+}
+
+/// SizeOf - Determine size of block data in bytes.
+///
+unsigned DIEBlock::SizeOf(AsmPrinter *AP, unsigned Form) const {
+  switch (Form) {
+  case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
+  case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
+  case dwarf::DW_FORM_block4: return Size + sizeof(int32_t);
+  case dwarf::DW_FORM_block:  return Size + MCAsmInfo::getULEB128Size(Size);
+  default: llvm_unreachable("Improper form for block");
+  }
+}
+
+#ifndef NDEBUG
+void DIEBlock::print(raw_ostream &O) {
+  O << "Blk: ";
+  DIE::print(O, 5);
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.h
new file mode 100644
index 000000000000..c332aa2a7db6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.h
@@ -0,0 +1,392 @@
+//===--- lib/CodeGen/DIE.h - DWARF Info Entries -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Data structures for DWARF info entries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DIE_H__
+#define CODEGEN_ASMPRINTER_DIE_H__
+
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
+#include <vector>
+
+namespace llvm {
+  class AsmPrinter;
+  class MCSymbol;
+  class raw_ostream;
+
+  //===--------------------------------------------------------------------===//
+  /// DIEAbbrevData - Dwarf abbreviation data, describes the one attribute of a
+  /// Dwarf abbreviation.
+  class DIEAbbrevData {
+    /// Attribute - Dwarf attribute code.
+    ///
+    uint16_t Attribute;
+
+    /// Form - Dwarf form code.
+    ///
+    uint16_t Form;
+  public:
+    DIEAbbrevData(uint16_t A, uint16_t F) : Attribute(A), Form(F) {}
+
+    // Accessors.
+    uint16_t getAttribute() const { return Attribute; }
+    uint16_t getForm()      const { return Form; }
+
+    /// Profile - Used to gather unique data for the abbreviation folding set.
+    ///
+    void Profile(FoldingSetNodeID &ID) const;
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEAbbrev - Dwarf abbreviation, describes the organization of a debug
+  /// information object.
+  class DIEAbbrev : public FoldingSetNode {
+    /// Tag - Dwarf tag code.
+    ///
+    uint16_t Tag;
+
+    /// ChildrenFlag - Dwarf children flag.
+    ///
+    uint16_t ChildrenFlag;
+
+    /// Unique number for node.
+    ///
+    unsigned Number;
+
+    /// Data - Raw data bytes for abbreviation.
+    ///
+    SmallVector<DIEAbbrevData, 12> Data;
+
+  public:
+    DIEAbbrev(uint16_t T, uint16_t C) : Tag(T), ChildrenFlag(C), Data() {}
+
+    // Accessors.
+    uint16_t getTag() const { return Tag; }
+    unsigned getNumber() const { return Number; }
+    uint16_t getChildrenFlag() const { return ChildrenFlag; }
+    const SmallVectorImpl<DIEAbbrevData> &getData() const { return Data; }
+    void setTag(uint16_t T) { Tag = T; }
+    void setChildrenFlag(uint16_t CF) { ChildrenFlag = CF; }
+    void setNumber(unsigned N) { Number = N; }
+
+    /// AddAttribute - Adds another set of attribute information to the
+    /// abbreviation.
+    void AddAttribute(uint16_t Attribute, uint16_t Form) {
+      Data.push_back(DIEAbbrevData(Attribute, Form));
+    }
+
+    /// AddFirstAttribute - Adds a set of attribute information to the front
+    /// of the abbreviation.
+    void AddFirstAttribute(uint16_t Attribute, uint16_t Form) {
+      Data.insert(Data.begin(), DIEAbbrevData(Attribute, Form));
+    }
+
+    /// Profile - Used to gather unique data for the abbreviation folding set.
+    ///
+    void Profile(FoldingSetNodeID &ID) const;
+
+    /// Emit - Print the abbreviation using the specified asm printer.
+    ///
+    void Emit(AsmPrinter *AP) const;
+
+#ifndef NDEBUG
+    void print(raw_ostream &O);
+    void dump();
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIE - A structured debug information entry.  Has an abbreviation which
+  /// describes its organization.
+  class DIEValue;
+
+  class DIE {
+  protected:
+    /// Offset - Offset in debug info section.
+    ///
+    unsigned Offset;
+
+    /// Size - Size of instance + children.
+    ///
+    unsigned Size;
+
+    /// Abbrev - Buffer for constructing abbreviation.
+    ///
+    DIEAbbrev Abbrev;
+
+    /// Children DIEs.
+    ///
+    std::vector<DIE *> Children;
+
+    DIE *Parent;
+
+    /// Attribute values.
+    ///
+    SmallVector<DIEValue*, 12> Values;
+
+    // Private data for print()
+    mutable unsigned IndentCount;
+  public:
+    explicit DIE(unsigned Tag)
+      : Offset(0), Size(0), Abbrev(Tag, dwarf::DW_CHILDREN_no), Parent(0),
+        IndentCount(0) {}
+    virtual ~DIE();
+
+    // Accessors.
+    DIEAbbrev &getAbbrev() { return Abbrev; }
+    unsigned getAbbrevNumber() const { return Abbrev.getNumber(); }
+    unsigned getTag() const { return Abbrev.getTag(); }
+    unsigned getOffset() const { return Offset; }
+    unsigned getSize() const { return Size; }
+    const std::vector<DIE *> &getChildren() const { return Children; }
+    const SmallVectorImpl<DIEValue*> &getValues() const { return Values; }
+    DIE *getParent() const { return Parent; }
+    /// Climb up the parent chain to get the compile unit DIE this DIE belongs
+    /// to.
+    DIE *getCompileUnit() const;
+    void setTag(unsigned Tag) { Abbrev.setTag(Tag); }
+    void setOffset(unsigned O) { Offset = O; }
+    void setSize(unsigned S) { Size = S; }
+
+    /// addValue - Add a value and attributes to a DIE.
+    ///
+    void addValue(unsigned Attribute, unsigned Form, DIEValue *Value) {
+      Abbrev.AddAttribute(Attribute, Form);
+      Values.push_back(Value);
+    }
+
+    /// addChild - Add a child to the DIE.
+    ///
+    void addChild(DIE *Child) {
+      if (Child->getParent()) {
+        assert (Child->getParent() == this && "Unexpected DIE Parent!");
+        return;
+      }
+      Abbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes);
+      Children.push_back(Child);
+      Child->Parent = this;
+    }
+
+#ifndef NDEBUG
+    void print(raw_ostream &O, unsigned IncIndent = 0);
+    void dump();
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEValue - A debug information entry value.
+  ///
+  class DIEValue {
+    virtual void anchor();
+  public:
+    enum {
+      isInteger,
+      isString,
+      isLabel,
+      isDelta,
+      isEntry,
+      isBlock
+    };
+  protected:
+    /// Type - Type of data stored in the value.
+    ///
+    unsigned Type;
+  public:
+    explicit DIEValue(unsigned T) : Type(T) {}
+    virtual ~DIEValue() {}
+
+    // Accessors
+    unsigned getType()  const { return Type; }
+
+    /// EmitValue - Emit value via the Dwarf writer.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const = 0;
+
+    /// SizeOf - Return the size of a value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const = 0;
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O) = 0;
+    void dump();
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEInteger - An integer value DIE.
+  ///
+  class DIEInteger : public DIEValue {
+    uint64_t Integer;
+  public:
+    explicit DIEInteger(uint64_t I) : DIEValue(isInteger), Integer(I) {}
+
+    /// BestForm - Choose the best form for integer.
+    ///
+    static unsigned BestForm(bool IsSigned, uint64_t Int) {
+      if (IsSigned) {
+        const int64_t SignedInt = Int;
+        if ((char)Int == SignedInt)     return dwarf::DW_FORM_data1;
+        if ((short)Int == SignedInt)    return dwarf::DW_FORM_data2;
+        if ((int)Int == SignedInt)      return dwarf::DW_FORM_data4;
+      } else {
+        if ((unsigned char)Int == Int)  return dwarf::DW_FORM_data1;
+        if ((unsigned short)Int == Int) return dwarf::DW_FORM_data2;
+        if ((unsigned int)Int == Int)   return dwarf::DW_FORM_data4;
+      }
+      return dwarf::DW_FORM_data8;
+    }
+
+    /// EmitValue - Emit integer of appropriate size.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    uint64_t getValue() const { return Integer; }
+
+    /// SizeOf - Determine size of integer value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEValue *I) { return I->getType() == isInteger; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIELabel - A label expression DIE.
+  //
+  class DIELabel : public DIEValue {
+    const MCSymbol *Label;
+  public:
+    explicit DIELabel(const MCSymbol *L) : DIEValue(isLabel), Label(L) {}
+
+    /// EmitValue - Emit label value.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    /// getValue - Get MCSymbol.
+    ///
+    const MCSymbol *getValue()       const { return Label; }
+
+    /// SizeOf - Determine size of label value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEValue *L) { return L->getType() == isLabel; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEDelta - A simple label difference DIE.
+  ///
+  class DIEDelta : public DIEValue {
+    const MCSymbol *LabelHi;
+    const MCSymbol *LabelLo;
+  public:
+    DIEDelta(const MCSymbol *Hi, const MCSymbol *Lo)
+      : DIEValue(isDelta), LabelHi(Hi), LabelLo(Lo) {}
+
+    /// EmitValue - Emit delta value.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    /// SizeOf - Determine size of delta value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEValue *D) { return D->getType() == isDelta; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEEntry - A pointer to another debug information entry.  An instance of
+  /// this class can also be used as a proxy for a debug information entry not
+  /// yet defined (ie. types.)
+  class DIEEntry : public DIEValue {
+    DIE *const Entry;
+  public:
+    explicit DIEEntry(DIE *E) : DIEValue(isEntry), Entry(E) {}
+
+    DIE *getEntry() const { return Entry; }
+
+    /// EmitValue - Emit debug information entry offset.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    /// SizeOf - Determine size of debug information entry in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const {
+      return sizeof(int32_t);
+    }
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEValue *E) { return E->getType() == isEntry; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIEBlock - A block of values.  Primarily used for location expressions.
+  //
+  class DIEBlock : public DIEValue, public DIE {
+    unsigned Size;                // Size in bytes excluding size header.
+  public:
+    DIEBlock()
+      : DIEValue(isBlock), DIE(0), Size(0) {}
+    virtual ~DIEBlock() {}
+
+    /// ComputeSize - calculate the size of the block.
+    ///
+    unsigned ComputeSize(AsmPrinter *AP);
+
+    /// BestForm - Choose the best form for data.
+    ///
+    unsigned BestForm() const {
+      if ((unsigned char)Size == Size)  return dwarf::DW_FORM_block1;
+      if ((unsigned short)Size == Size) return dwarf::DW_FORM_block2;
+      if ((unsigned int)Size == Size)   return dwarf::DW_FORM_block4;
+      return dwarf::DW_FORM_block;
+    }
+
+    /// EmitValue - Emit block data.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+
+    /// SizeOf - Determine size of block data in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEValue *E) { return E->getType() == isBlock; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O);
+#endif
+  };
+
+} // end llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
new file mode 100644
index 000000000000..f58ec9b4bf46
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
@@ -0,0 +1,264 @@
+//=-- llvm/CodeGen/DwarfAccelTable.cpp - Dwarf Accelerator Tables -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf accelerator tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfAccelTable.h"
+#include "DIE.h"
+#include "DwarfDebug.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+const char *DwarfAccelTable::Atom::AtomTypeString(enum AtomType AT) {
+  switch (AT) {
+  case eAtomTypeNULL: return "eAtomTypeNULL";
+  case eAtomTypeDIEOffset: return "eAtomTypeDIEOffset";
+  case eAtomTypeCUOffset: return "eAtomTypeCUOffset";
+  case eAtomTypeTag: return "eAtomTypeTag";
+  case eAtomTypeNameFlags: return "eAtomTypeNameFlags";
+  case eAtomTypeTypeFlags: return "eAtomTypeTypeFlags";
+  }
+  llvm_unreachable("invalid AtomType!");
+}
+
+// The length of the header data is always going to be 4 + 4 + 4*NumAtoms.
+DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList) :
+  Header(8 + (atomList.size() * 4)),
+  HeaderData(atomList),
+  Entries(Allocator) { }
+
+DwarfAccelTable::~DwarfAccelTable() { }
+
+void DwarfAccelTable::AddName(StringRef Name, DIE* die, char Flags) {
+  assert(Data.empty() && "Already finalized!");
+  // If the string is in the list already then add this die to the list
+  // otherwise add a new one.
+  DataArray &DIEs = Entries[Name];
+  DIEs.push_back(new (Allocator) HashDataContents(die, Flags));
+}
+
+void DwarfAccelTable::ComputeBucketCount(void) {
+  // First get the number of unique hashes.
+  std::vector<uint32_t> uniques(Data.size());
+  for (size_t i = 0, e = Data.size(); i < e; ++i)
+    uniques[i] = Data[i]->HashValue;
+  array_pod_sort(uniques.begin(), uniques.end());
+  std::vector<uint32_t>::iterator p =
+    std::unique(uniques.begin(), uniques.end());
+  uint32_t num = std::distance(uniques.begin(), p);
+
+  // Then compute the bucket size, minimum of 1 bucket.
+  if (num > 1024) Header.bucket_count = num/4;
+  if (num > 16) Header.bucket_count = num/2;
+  else Header.bucket_count = num > 0 ? num : 1;
+
+  Header.hashes_count = num;
+}
+
+// compareDIEs - comparison predicate that sorts DIEs by their offset.
+static bool compareDIEs(const DwarfAccelTable::HashDataContents *A,
+                        const DwarfAccelTable::HashDataContents *B) {
+  return A->Die->getOffset() < B->Die->getOffset();
+}
+
+void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, const char *Prefix) {
+  // Create the individual hash data outputs.
+  for (StringMap<DataArray>::iterator
+         EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) {
+
+    // Unique the entries.
+    std::stable_sort(EI->second.begin(), EI->second.end(), compareDIEs);
+    EI->second.erase(std::unique(EI->second.begin(), EI->second.end()),
+                       EI->second.end());
+
+    HashData *Entry = new (Allocator) HashData(EI->getKey(), EI->second);
+    Data.push_back(Entry);
+  }
+
+  // Figure out how many buckets we need, then compute the bucket
+  // contents and the final ordering. We'll emit the hashes and offsets
+  // by doing a walk during the emission phase. We add temporary
+  // symbols to the data so that we can reference them during the offset
+  // later, we'll emit them when we emit the data.
+  ComputeBucketCount();
+
+  // Compute bucket contents and final ordering.
+  Buckets.resize(Header.bucket_count);
+  for (size_t i = 0, e = Data.size(); i < e; ++i) {
+    uint32_t bucket = Data[i]->HashValue % Header.bucket_count;
+    Buckets[bucket].push_back(Data[i]);
+    Data[i]->Sym = Asm->GetTempSymbol(Prefix, i);
+  }
+}
+
+// Emits the header for the table via the AsmPrinter.
+void DwarfAccelTable::EmitHeader(AsmPrinter *Asm) {
+  Asm->OutStreamer.AddComment("Header Magic");
+  Asm->EmitInt32(Header.magic);
+  Asm->OutStreamer.AddComment("Header Version");
+  Asm->EmitInt16(Header.version);
+  Asm->OutStreamer.AddComment("Header Hash Function");
+  Asm->EmitInt16(Header.hash_function);
+  Asm->OutStreamer.AddComment("Header Bucket Count");
+  Asm->EmitInt32(Header.bucket_count);
+  Asm->OutStreamer.AddComment("Header Hash Count");
+  Asm->EmitInt32(Header.hashes_count);
+  Asm->OutStreamer.AddComment("Header Data Length");
+  Asm->EmitInt32(Header.header_data_len);
+  Asm->OutStreamer.AddComment("HeaderData Die Offset Base");
+  Asm->EmitInt32(HeaderData.die_offset_base);
+  Asm->OutStreamer.AddComment("HeaderData Atom Count");
+  Asm->EmitInt32(HeaderData.Atoms.size());
+  for (size_t i = 0; i < HeaderData.Atoms.size(); i++) {
+    Atom A = HeaderData.Atoms[i];
+    Asm->OutStreamer.AddComment(Atom::AtomTypeString(A.type));
+    Asm->EmitInt16(A.type);
+    Asm->OutStreamer.AddComment(dwarf::FormEncodingString(A.form));
+    Asm->EmitInt16(A.form);
+  }
+}
+
+// Walk through and emit the buckets for the table. Each index is
+// an offset into the list of hashes.
+void DwarfAccelTable::EmitBuckets(AsmPrinter *Asm) {
+  unsigned index = 0;
+  for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+    Asm->OutStreamer.AddComment("Bucket " + Twine(i));
+    if (Buckets[i].size() != 0)
+      Asm->EmitInt32(index);
+    else
+      Asm->EmitInt32(UINT32_MAX);
+    index += Buckets[i].size();
+  }
+}
+
+// Walk through the buckets and emit the individual hashes for each
+// bucket.
+void DwarfAccelTable::EmitHashes(AsmPrinter *Asm) {
+  for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+    for (HashList::const_iterator HI = Buckets[i].begin(),
+           HE = Buckets[i].end(); HI != HE; ++HI) {
+      Asm->OutStreamer.AddComment("Hash in Bucket " + Twine(i));
+      Asm->EmitInt32((*HI)->HashValue);
+    }
+  }
+}
+
+// Walk through the buckets and emit the individual offsets for each
+// element in each bucket. This is done via a symbol subtraction from the
+// beginning of the section. The non-section symbol will be output later
+// when we emit the actual data.
+void DwarfAccelTable::EmitOffsets(AsmPrinter *Asm, MCSymbol *SecBegin) {
+  for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+    for (HashList::const_iterator HI = Buckets[i].begin(),
+           HE = Buckets[i].end(); HI != HE; ++HI) {
+      Asm->OutStreamer.AddComment("Offset in Bucket " + Twine(i));
+      MCContext &Context = Asm->OutStreamer.getContext();
+      const MCExpr *Sub =
+        MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create((*HI)->Sym, Context),
+                                MCSymbolRefExpr::Create(SecBegin, Context),
+                                Context);
+      Asm->OutStreamer.EmitValue(Sub, sizeof(uint32_t));
+    }
+  }
+}
+
+// Walk through the buckets and emit the full data for each element in
+// the bucket. For the string case emit the dies and the various offsets.
+// Terminate each HashData bucket with 0.
+void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
+  uint64_t PrevHash = UINT64_MAX;
+  for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+    for (HashList::const_iterator HI = Buckets[i].begin(),
+           HE = Buckets[i].end(); HI != HE; ++HI) {
+      // Remember to emit the label for our offset.
+      Asm->OutStreamer.EmitLabel((*HI)->Sym);
+      Asm->OutStreamer.AddComment((*HI)->Str);
+      Asm->EmitSectionOffset(D->getStringPoolEntry((*HI)->Str),
+                             D->getStringPoolSym());
+      Asm->OutStreamer.AddComment("Num DIEs");
+      Asm->EmitInt32((*HI)->Data.size());
+      for (ArrayRef<HashDataContents*>::const_iterator
+             DI = (*HI)->Data.begin(), DE = (*HI)->Data.end();
+           DI != DE; ++DI) {
+        // Emit the DIE offset
+        Asm->EmitInt32((*DI)->Die->getOffset());
+        // If we have multiple Atoms emit that info too.
+        // FIXME: A bit of a hack, we either emit only one atom or all info.
+        if (HeaderData.Atoms.size() > 1) {
+          Asm->EmitInt16((*DI)->Die->getTag());
+          Asm->EmitInt8((*DI)->Flags);
+        }
+      }
+      // Emit a 0 to terminate the data unless we have a hash collision.
+      if (PrevHash != (*HI)->HashValue)
+        Asm->EmitInt32(0);
+      PrevHash = (*HI)->HashValue;
+    }
+  }
+}
+
+// Emit the entire data structure to the output file.
+void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin,
+                           DwarfUnits *D) {
+  // Emit the header.
+  EmitHeader(Asm);
+
+  // Emit the buckets.
+  EmitBuckets(Asm);
+
+  // Emit the hashes.
+  EmitHashes(Asm);
+
+  // Emit the offsets.
+  EmitOffsets(Asm, SecBegin);
+
+  // Emit the hash data.
+  EmitData(Asm, D);
+}
+
+#ifndef NDEBUG
+void DwarfAccelTable::print(raw_ostream &O) {
+
+  Header.print(O);
+  HeaderData.print(O);
+
+  O << "Entries: \n";
+  for (StringMap<DataArray>::const_iterator
+         EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) {
+    O << "Name: " << EI->getKeyData() << "\n";
+    for (DataArray::const_iterator DI = EI->second.begin(),
+           DE = EI->second.end();
+         DI != DE; ++DI)
+      (*DI)->print(O);
+  }
+
+  O << "Buckets and Hashes: \n";
+  for (size_t i = 0, e = Buckets.size(); i < e; ++i)
+    for (HashList::const_iterator HI = Buckets[i].begin(),
+           HE = Buckets[i].end(); HI != HE; ++HI)
+      (*HI)->print(O);
+
+  O << "Data: \n";
+    for (std::vector<HashData*>::const_iterator
+           DI = Data.begin(), DE = Data.end(); DI != DE; ++DI)
+      (*DI)->print(O);
+
+
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
new file mode 100644
index 000000000000..9915bcaa9b69
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -0,0 +1,283 @@
+//==-- llvm/CodeGen/DwarfAccelTable.h - Dwarf Accelerator Tables -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf accelerator tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFACCELTABLE_H__
+#define CODEGEN_ASMPRINTER_DWARFACCELTABLE_H__
+
+#include "DIE.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include <map>
+#include <vector>
+
+// The dwarf accelerator tables are an indirect hash table optimized
+// for null lookup rather than access to known data. They are output into
+// an on-disk format that looks like this:
+//
+// .-------------.
+// |  HEADER     |
+// |-------------|
+// |  BUCKETS    |
+// |-------------|
+// |  HASHES     |
+// |-------------|
+// |  OFFSETS    |
+// |-------------|
+// |  DATA       |
+// `-------------'
+//
+// where the header contains a magic number, version, type of hash function,
+// the number of buckets, total number of hashes, and room for a special
+// struct of data and the length of that struct.
+//
+// The buckets contain an index (e.g. 6) into the hashes array. The hashes
+// section contains all of the 32-bit hash values in contiguous memory, and
+// the offsets contain the offset into the data area for the particular
+// hash.
+//
+// For a lookup example, we could hash a function name and take it modulo the
+// number of buckets giving us our bucket. From there we take the bucket value
+// as an index into the hashes table and look at each successive hash as long
+// as the hash value is still the same modulo result (bucket value) as earlier.
+// If we have a match we look at that same entry in the offsets table and
+// grab the offset in the data for our final match.
+
+namespace llvm {
+
+class AsmPrinter;
+class DIE;
+class DwarfUnits;
+
+class DwarfAccelTable {
+
+  enum HashFunctionType {
+    eHashFunctionDJB = 0u
+  };
+
+  static uint32_t HashDJB (StringRef Str) {
+    uint32_t h = 5381;
+    for (unsigned i = 0, e = Str.size(); i != e; ++i)
+      h = ((h << 5) + h) + Str[i];
+    return h;
+  }
+
+  // Helper function to compute the number of buckets needed based on
+  // the number of unique hashes.
+  void ComputeBucketCount (void);
+
+  struct TableHeader {
+    uint32_t   magic;           // 'HASH' magic value to allow endian detection
+    uint16_t   version;         // Version number.
+    uint16_t   hash_function;   // The hash function enumeration that was used.
+    uint32_t   bucket_count;    // The number of buckets in this hash table.
+    uint32_t   hashes_count;    // The total number of unique hash values
+                                // and hash data offsets in this table.
+    uint32_t   header_data_len; // The bytes to skip to get to the hash
+                                // indexes (buckets) for correct alignment.
+    // Also written to disk is the implementation specific header data.
+
+    static const uint32_t MagicHash = 0x48415348;
+
+    TableHeader (uint32_t data_len) :
+      magic (MagicHash), version (1), hash_function (eHashFunctionDJB),
+      bucket_count (0), hashes_count (0), header_data_len (data_len)
+    {}
+
+#ifndef NDEBUG
+    void print(raw_ostream &O) {
+      O << "Magic: " << format("0x%x", magic) << "\n"
+        << "Version: " << version << "\n"
+        << "Hash Function: " << hash_function << "\n"
+        << "Bucket Count: " << bucket_count << "\n"
+        << "Header Data Length: " << header_data_len << "\n";
+    }
+    void dump() { print(dbgs()); }
+#endif
+  };
+
+public:
+  // The HeaderData describes the form of each set of data. In general this
+  // is as a list of atoms (atom_count) where each atom contains a type
+  // (AtomType type) of data, and an encoding form (form). In the case of
+  // data that is referenced via DW_FORM_ref_* the die_offset_base is
+  // used to describe the offset for all forms in the list of atoms.
+  // This also serves as a public interface of sorts.
+  // When written to disk this will have the form:
+  //
+  // uint32_t die_offset_base
+  // uint32_t atom_count
+  // atom_count Atoms
+  enum AtomType {
+    eAtomTypeNULL       = 0u,
+    eAtomTypeDIEOffset  = 1u,   // DIE offset, check form for encoding
+    eAtomTypeCUOffset   = 2u,   // DIE offset of the compiler unit header that
+                                // contains the item in question
+    eAtomTypeTag        = 3u,   // DW_TAG_xxx value, should be encoded as
+                                // DW_FORM_data1 (if no tags exceed 255) or
+                                // DW_FORM_data2.
+    eAtomTypeNameFlags  = 4u,   // Flags from enum NameFlags
+    eAtomTypeTypeFlags  = 5u    // Flags from enum TypeFlags
+  };
+
+  enum TypeFlags {
+    eTypeFlagClassMask = 0x0000000fu,
+
+    // Always set for C++, only set for ObjC if this is the
+    // @implementation for a class.
+    eTypeFlagClassIsImplementation  = ( 1u << 1 )
+  };
+
+  // Make these public so that they can be used as a general interface to
+  // the class.
+  struct Atom {
+    AtomType type; // enum AtomType
+    uint16_t form; // DWARF DW_FORM_ defines
+
+    Atom(AtomType type, uint16_t form) : type(type), form(form) {}
+    static const char * AtomTypeString(enum AtomType);
+#ifndef NDEBUG
+    void print(raw_ostream &O) {
+      O << "Type: " << AtomTypeString(type) << "\n"
+        << "Form: " << dwarf::FormEncodingString(form) << "\n";
+    }
+    void dump() {
+      print(dbgs());
+    }
+#endif
+  };
+
+ private:
+  struct TableHeaderData {
+    uint32_t die_offset_base;
+    SmallVector<Atom, 1> Atoms;
+
+    TableHeaderData(ArrayRef<Atom> AtomList, uint32_t offset = 0)
+      : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) { }
+
+#ifndef NDEBUG
+    void print (raw_ostream &O) {
+      O << "die_offset_base: " << die_offset_base << "\n";
+      for (size_t i = 0; i < Atoms.size(); i++)
+        Atoms[i].print(O);
+    }
+    void dump() {
+      print(dbgs());
+    }
+#endif
+  };
+
+  // The data itself consists of a str_offset, a count of the DIEs in the
+  // hash and the offsets to the DIEs themselves.
+  // On disk each data section is ended with a 0 KeyType as the end of the
+  // hash chain.
+  // On output this looks like:
+  // uint32_t str_offset
+  // uint32_t hash_data_count
+  // HashData[hash_data_count]
+public:
+  struct HashDataContents {
+    DIE *Die; // Offsets
+    char Flags; // Specific flags to output
+
+    HashDataContents(DIE *D, char Flags) :
+      Die(D),
+      Flags(Flags) { }
+    #ifndef NDEBUG
+    void print(raw_ostream &O) const {
+      O << "  Offset: " << Die->getOffset() << "\n";
+      O << "  Tag: " << dwarf::TagString(Die->getTag()) << "\n";
+      O << "  Flags: " << Flags << "\n";
+    }
+    #endif
+  };
+private:
+  struct HashData {
+    StringRef Str;
+    uint32_t HashValue;
+    MCSymbol *Sym;
+    ArrayRef<HashDataContents*> Data; // offsets
+    HashData(StringRef S, ArrayRef<HashDataContents*> Data)
+      : Str(S), Data(Data) {
+      HashValue = DwarfAccelTable::HashDJB(S);
+    }
+    #ifndef NDEBUG
+    void print(raw_ostream &O) {
+      O << "Name: " << Str << "\n";
+      O << "  Hash Value: " << format("0x%x", HashValue) << "\n";
+      O << "  Symbol: " ;
+      if (Sym) Sym->print(O);
+      else O << "<none>";
+      O << "\n";
+      for (size_t i = 0; i < Data.size(); i++) {
+        O << "  Offset: " << Data[i]->Die->getOffset() << "\n";
+        O << "  Tag: " << dwarf::TagString(Data[i]->Die->getTag()) << "\n";
+        O << "  Flags: " << Data[i]->Flags << "\n";
+      }
+    }
+    void dump() {
+      print(dbgs());
+    }
+    #endif
+  };
+
+  DwarfAccelTable(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
+  void operator=(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
+
+  // Internal Functions
+  void EmitHeader(AsmPrinter *);
+  void EmitBuckets(AsmPrinter *);
+  void EmitHashes(AsmPrinter *);
+  void EmitOffsets(AsmPrinter *, MCSymbol *);
+  void EmitData(AsmPrinter *, DwarfUnits *D);
+
+  // Allocator for HashData and HashDataContents.
+  BumpPtrAllocator Allocator;
+
+  // Output Variables
+  TableHeader Header;
+  TableHeaderData HeaderData;
+  std::vector<HashData*> Data;
+
+  // String Data
+  typedef std::vector<HashDataContents*> DataArray;
+  typedef StringMap<DataArray, BumpPtrAllocator&> StringEntries;
+  StringEntries Entries;
+
+  // Buckets/Hashes/Offsets
+  typedef std::vector<HashData*> HashList;
+  typedef std::vector<HashList> BucketList;
+  BucketList Buckets;
+  HashList Hashes;
+
+  // Public Implementation
+ public:
+  DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom>);
+  ~DwarfAccelTable();
+  void AddName(StringRef, DIE*, char = 0);
+  void FinalizeTable(AsmPrinter *, const char *);
+  void Emit(AsmPrinter *, MCSymbol *, DwarfUnits *);
+#ifndef NDEBUG
+  void print(raw_ostream &O);
+  void dump() { print(dbgs()); }
+#endif
+};
+
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
new file mode 100644
index 000000000000..fec5cedc684b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -0,0 +1,156 @@
+//===-- CodeGen/AsmPrinter/DwarfException.cpp - Dwarf Exception Impl ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+DwarfCFIException::DwarfCFIException(AsmPrinter *A)
+  : DwarfException(A),
+    shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false),
+    moveTypeModule(AsmPrinter::CFI_M_None) {}
+
+DwarfCFIException::~DwarfCFIException() {}
+
+/// EndModule - Emit all exception information that should come after the
+/// content.
+void DwarfCFIException::EndModule() {
+  if (moveTypeModule == AsmPrinter::CFI_M_Debug)
+    Asm->OutStreamer.EmitCFISections(false, true);
+
+  if (!Asm->MAI->isExceptionHandlingDwarf())
+    return;
+
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+
+  unsigned PerEncoding = TLOF.getPersonalityEncoding();
+
+  if ((PerEncoding & 0x70) != dwarf::DW_EH_PE_pcrel)
+    return;
+
+  // Emit references to all used personality functions
+  bool AtLeastOne = false;
+  const std::vector<const Function*> &Personalities = MMI->getPersonalities();
+  for (size_t i = 0, e = Personalities.size(); i != e; ++i) {
+    if (!Personalities[i])
+      continue;
+    MCSymbol *Sym = Asm->Mang->getSymbol(Personalities[i]);
+    TLOF.emitPersonalityValue(Asm->OutStreamer, Asm->TM, Sym);
+    AtLeastOne = true;
+  }
+
+  if (AtLeastOne && !TLOF.isFunctionEHFrameSymbolPrivate()) {
+    // This is a temporary hack to keep sections in the same order they
+    // were before. This lets us produce bit identical outputs while
+    // transitioning to CFI.
+    Asm->OutStreamer.SwitchSection(
+               const_cast<TargetLoweringObjectFile&>(TLOF).getEHFrameSection());
+  }
+}
+
+/// BeginFunction - Gather pre-function exception information. Assumes it's
+/// being emitted immediately after the function entry point.
+void DwarfCFIException::BeginFunction(const MachineFunction *MF) {
+  shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
+
+  // If any landing pads survive, we need an EH table.
+  bool hasLandingPads = !MMI->getLandingPads().empty();
+
+  // See if we need frame move info.
+  AsmPrinter::CFIMoveType MoveType = Asm->needsCFIMoves();
+  if (MoveType == AsmPrinter::CFI_M_EH ||
+      (MoveType == AsmPrinter::CFI_M_Debug &&
+       moveTypeModule == AsmPrinter::CFI_M_None))
+    moveTypeModule = MoveType;
+
+  shouldEmitMoves = MoveType != AsmPrinter::CFI_M_None;
+
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+  unsigned PerEncoding = TLOF.getPersonalityEncoding();
+  const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+
+  shouldEmitPersonality = hasLandingPads &&
+    PerEncoding != dwarf::DW_EH_PE_omit && Per;
+
+  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
+  shouldEmitLSDA = shouldEmitPersonality &&
+    LSDAEncoding != dwarf::DW_EH_PE_omit;
+
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
+
+  Asm->OutStreamer.EmitCFIStartProc();
+
+  // Indicate personality routine, if any.
+  if (!shouldEmitPersonality)
+    return;
+
+  const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(Per, Asm->Mang, MMI);
+  Asm->OutStreamer.EmitCFIPersonality(Sym, PerEncoding);
+
+  Asm->OutStreamer.EmitDebugLabel
+    (Asm->GetTempSymbol("eh_func_begin",
+                        Asm->getFunctionNumber()));
+
+  // Provide LSDA information.
+  if (!shouldEmitLSDA)
+    return;
+
+  Asm->OutStreamer.EmitCFILsda(Asm->GetTempSymbol("exception",
+                                                  Asm->getFunctionNumber()),
+                               LSDAEncoding);
+}
+
+/// EndFunction - Gather and emit post-function exception information.
+///
+void DwarfCFIException::EndFunction() {
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
+
+  Asm->OutStreamer.EmitCFIEndProc();
+
+  if (!shouldEmitPersonality)
+    return;
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
+                                                Asm->getFunctionNumber()));
+
+  // Map all labels and get rid of any dead landing pads.
+  MMI->TidyLandingPads();
+
+  EmitExceptionTable();
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
new file mode 100644
index 000000000000..f9b6f9472141
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -0,0 +1,1711 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Unit ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing a dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dwarfdebug"
+
+#include "DwarfCompileUnit.h"
+#include "DwarfAccelTable.h"
+#include "DwarfDebug.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+/// CompileUnit - Compile unit constructor.
+CompileUnit::CompileUnit(unsigned UID, unsigned L, DIE *D, AsmPrinter *A,
+                         DwarfDebug *DW, DwarfUnits *DWU)
+  : UniqueID(UID), Language(L), CUDie(D), Asm(A), DD(DW), DU(DWU),
+    IndexTyDie(0), DebugInfoOffset(0) {
+  DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
+}
+
+/// ~CompileUnit - Destructor for compile unit.
+CompileUnit::~CompileUnit() {
+  for (unsigned j = 0, M = DIEBlocks.size(); j < M; ++j)
+    DIEBlocks[j]->~DIEBlock();
+}
+
+/// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+/// information entry.
+DIEEntry *CompileUnit::createDIEEntry(DIE *Entry) {
+  DIEEntry *Value = new (DIEValueAllocator) DIEEntry(Entry);
+  return Value;
+}
+
+/// getDefaultLowerBound - Return the default lower bound for an array. If the
+/// DWARF version doesn't handle the language, return -1.
+int64_t CompileUnit::getDefaultLowerBound() const {
+  switch (Language) {
+  default:
+    break;
+
+  case dwarf::DW_LANG_C89:
+  case dwarf::DW_LANG_C99:
+  case dwarf::DW_LANG_C:
+  case dwarf::DW_LANG_C_plus_plus:
+  case dwarf::DW_LANG_ObjC:
+  case dwarf::DW_LANG_ObjC_plus_plus:
+    return 0;
+
+  case dwarf::DW_LANG_Fortran77:
+  case dwarf::DW_LANG_Fortran90:
+  case dwarf::DW_LANG_Fortran95:
+    return 1;
+
+  // The languages below have valid values only if the DWARF version >= 4.
+  case dwarf::DW_LANG_Java:
+  case dwarf::DW_LANG_Python:
+  case dwarf::DW_LANG_UPC:
+  case dwarf::DW_LANG_D:
+    if (dwarf::DWARF_VERSION >= 4)
+      return 0;
+    break;
+
+  case dwarf::DW_LANG_Ada83:
+  case dwarf::DW_LANG_Ada95:
+  case dwarf::DW_LANG_Cobol74:
+  case dwarf::DW_LANG_Cobol85:
+  case dwarf::DW_LANG_Modula2:
+  case dwarf::DW_LANG_Pascal83:
+  case dwarf::DW_LANG_PLI:
+    if (dwarf::DWARF_VERSION >= 4)
+      return 1;
+    break;
+  }
+
+  return -1;
+}
+
+/// addFlag - Add a flag that is true.
+void CompileUnit::addFlag(DIE *Die, unsigned Attribute) {
+  if (!DD->useDarwinGDBCompat())
+    Die->addValue(Attribute, dwarf::DW_FORM_flag_present,
+                  DIEIntegerOne);
+  else
+    addUInt(Die, Attribute, dwarf::DW_FORM_flag, 1);
+}
+
+/// addUInt - Add an unsigned integer attribute data and value.
+///
+void CompileUnit::addUInt(DIE *Die, unsigned Attribute,
+                          unsigned Form, uint64_t Integer) {
+  if (!Form) Form = DIEInteger::BestForm(false, Integer);
+  DIEValue *Value = Integer == 1 ?
+    DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addSInt - Add an signed integer attribute data and value.
+///
+void CompileUnit::addSInt(DIE *Die, unsigned Attribute,
+                          unsigned Form, int64_t Integer) {
+  if (!Form) Form = DIEInteger::BestForm(true, Integer);
+  DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addString - Add a string attribute data and value. We always emit a
+/// reference to the string pool instead of immediate strings so that DIEs have
+/// more predictable sizes. In the case of split dwarf we emit an index
+/// into another table which gets us the static offset into the string
+/// table.
+void CompileUnit::addString(DIE *Die, unsigned Attribute, StringRef String) {
+  if (!DD->useSplitDwarf()) {
+    MCSymbol *Symb = DU->getStringPoolEntry(String);
+    DIEValue *Value;
+    if (Asm->needsRelocationsForDwarfStringPool())
+      Value = new (DIEValueAllocator) DIELabel(Symb);
+    else {
+      MCSymbol *StringPool = DU->getStringPoolSym();
+      Value = new (DIEValueAllocator) DIEDelta(Symb, StringPool);
+    }
+    Die->addValue(Attribute, dwarf::DW_FORM_strp, Value);
+  } else {
+    unsigned idx = DU->getStringPoolIndex(String);
+    DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
+    Die->addValue(Attribute, dwarf::DW_FORM_GNU_str_index, Value);
+  }
+}
+
+/// addLocalString - Add a string attribute data and value. This is guaranteed
+/// to be in the local string pool instead of indirected.
+void CompileUnit::addLocalString(DIE *Die, unsigned Attribute,
+                                 StringRef String) {
+  MCSymbol *Symb = DU->getStringPoolEntry(String);
+  DIEValue *Value;
+  if (Asm->needsRelocationsForDwarfStringPool())
+    Value = new (DIEValueAllocator) DIELabel(Symb);
+  else {
+    MCSymbol *StringPool = DU->getStringPoolSym();
+    Value = new (DIEValueAllocator) DIEDelta(Symb, StringPool);
+  }
+  Die->addValue(Attribute, dwarf::DW_FORM_strp, Value);
+}
+
+/// addLabel - Add a Dwarf label attribute data and value.
+///
+void CompileUnit::addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+                           const MCSymbol *Label) {
+  DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addLabelAddress - Add a dwarf label attribute data and value using
+/// DW_FORM_addr or DW_FORM_GNU_addr_index.
+///
+void CompileUnit::addLabelAddress(DIE *Die, unsigned Attribute,
+                                  MCSymbol *Label) {
+  if (!DD->useSplitDwarf()) {
+    if (Label != NULL) {
+      DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
+      Die->addValue(Attribute, dwarf::DW_FORM_addr, Value);
+    } else {
+      DIEValue *Value = new (DIEValueAllocator) DIEInteger(0);
+      Die->addValue(Attribute, dwarf::DW_FORM_addr, Value);
+    }
+  } else {
+    unsigned idx = DU->getAddrPoolIndex(Label);
+    DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
+    Die->addValue(Attribute, dwarf::DW_FORM_GNU_addr_index, Value);
+  }
+}
+
+/// addOpAddress - Add a dwarf op address data and value using the
+/// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
+///
+void CompileUnit::addOpAddress(DIE *Die, MCSymbol *Sym) {
+
+  if (!DD->useSplitDwarf()) {
+    addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+    addLabel(Die, 0, dwarf::DW_FORM_udata, Sym);
+  } else {
+    unsigned idx = DU->getAddrPoolIndex(Sym);
+    DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
+    addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
+    Die->addValue(0, dwarf::DW_FORM_GNU_addr_index, Value);
+  }
+}
+
+/// addDelta - Add a label delta attribute data and value.
+///
+void CompileUnit::addDelta(DIE *Die, unsigned Attribute, unsigned Form,
+                           const MCSymbol *Hi, const MCSymbol *Lo) {
+  DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
+  Die->addValue(Attribute, Form, Value);
+}
+
+/// addDIEEntry - Add a DIE attribute data and value.
+///
+void CompileUnit::addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form,
+                              DIE *Entry) {
+  Die->addValue(Attribute, Form, createDIEEntry(Entry));
+}
+
+/// addBlock - Add block data.
+///
+void CompileUnit::addBlock(DIE *Die, unsigned Attribute, unsigned Form,
+                           DIEBlock *Block) {
+  Block->ComputeSize(Asm);
+  DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
+  Die->addValue(Attribute, Block->BestForm(), Block);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DIVariable V) {
+  // Verify variable.
+  if (!V.Verify())
+    return;
+
+  unsigned Line = V.getLineNumber();
+  if (Line == 0)
+    return;
+  unsigned FileID = DD->getOrCreateSourceID(V.getContext().getFilename(),
+                                            V.getContext().getDirectory(),
+                                            getUniqueID());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DIGlobalVariable G) {
+  // Verify global variable.
+  if (!G.Verify())
+    return;
+
+  unsigned Line = G.getLineNumber();
+  if (Line == 0)
+    return;
+  unsigned FileID = DD->getOrCreateSourceID(G.getFilename(), G.getDirectory(),
+                                            getUniqueID());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DISubprogram SP) {
+  // Verify subprogram.
+  if (!SP.Verify())
+    return;
+
+  // If the line number is 0, don't add it.
+  unsigned Line = SP.getLineNumber();
+  if (Line == 0)
+    return;
+
+  unsigned FileID = DD->getOrCreateSourceID(SP.getFilename(),
+                                            SP.getDirectory(), getUniqueID());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DIType Ty) {
+  // Verify type.
+  if (!Ty.Verify())
+    return;
+
+  unsigned Line = Ty.getLineNumber();
+  if (Line == 0)
+    return;
+  unsigned FileID = DD->getOrCreateSourceID(Ty.getFilename(),
+                                            Ty.getDirectory(), getUniqueID());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) {
+  // Verify type.
+  if (!Ty.Verify())
+    return;
+
+  unsigned Line = Ty.getLineNumber();
+  if (Line == 0)
+    return;
+  DIFile File = Ty.getFile();
+  unsigned FileID = DD->getOrCreateSourceID(File.getFilename(),
+                                            File.getDirectory(), getUniqueID());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addSourceLine - Add location information to specified debug information
+/// entry.
+void CompileUnit::addSourceLine(DIE *Die, DINameSpace NS) {
+  // Verify namespace.
+  if (!NS.Verify())
+    return;
+
+  unsigned Line = NS.getLineNumber();
+  if (Line == 0)
+    return;
+  StringRef FN = NS.getFilename();
+
+  unsigned FileID = DD->getOrCreateSourceID(FN, NS.getDirectory(),
+                                            getUniqueID());
+  assert(FileID && "Invalid file id");
+  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+}
+
+/// addVariableAddress - Add DW_AT_location attribute for a
+/// DbgVariable based on provided MachineLocation.
+void CompileUnit::addVariableAddress(DbgVariable *&DV, DIE *Die,
+                                     MachineLocation Location) {
+  if (DV->variableHasComplexAddress())
+    addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
+  else if (DV->isBlockByrefVariable())
+    addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
+  else
+    addAddress(Die, dwarf::DW_AT_location, Location);
+}
+
+/// addRegisterOp - Add register operand.
+void CompileUnit::addRegisterOp(DIE *TheDie, unsigned Reg) {
+  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
+  unsigned DWReg = RI->getDwarfRegNum(Reg, false);
+  if (DWReg < 32)
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
+  else {
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
+    addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+  }
+}
+
+/// addRegisterOffset - Add register offset.
+void CompileUnit::addRegisterOffset(DIE *TheDie, unsigned Reg,
+                                    int64_t Offset) {
+  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
+  unsigned DWReg = RI->getDwarfRegNum(Reg, false);
+  const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
+  if (Reg == TRI->getFrameRegister(*Asm->MF))
+    // If variable offset is based in frame register then use fbreg.
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
+  else if (DWReg < 32)
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
+  else {
+    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
+    addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+  }
+  addSInt(TheDie, 0, dwarf::DW_FORM_sdata, Offset);
+}
+
+/// addAddress - Add an address attribute to a die based on the location
+/// provided.
+void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
+                             const MachineLocation &Location) {
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+
+  if (Location.isReg())
+    addRegisterOp(Block, Location.getReg());
+  else
+    addRegisterOffset(Block, Location.getReg(), Location.getOffset());
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, 0, Block);
+}
+
+/// addComplexAddress - Start with the address based on the location provided,
+/// and generate the DWARF information necessary to find the actual variable
+/// given the extra address information encoded in the DIVariable, starting from
+/// the starting location.  Add the DWARF information to the die.
+///
+void CompileUnit::addComplexAddress(DbgVariable *&DV, DIE *Die,
+                                    unsigned Attribute,
+                                    const MachineLocation &Location) {
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+  unsigned N = DV->getNumAddrElements();
+  unsigned i = 0;
+  if (Location.isReg()) {
+    if (N >= 2 && DV->getAddrElement(0) == DIBuilder::OpPlus) {
+      // If first address element is OpPlus then emit
+      // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
+      addRegisterOffset(Block, Location.getReg(), DV->getAddrElement(1));
+      i = 2;
+    } else
+      addRegisterOp(Block, Location.getReg());
+  }
+  else
+    addRegisterOffset(Block, Location.getReg(), Location.getOffset());
+
+  for (;i < N; ++i) {
+    uint64_t Element = DV->getAddrElement(i);
+    if (Element == DIBuilder::OpPlus) {
+      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+      addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i));
+    } else if (Element == DIBuilder::OpDeref) {
+      if (!Location.isReg())
+        addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    } else llvm_unreachable("unknown DIBuilder Opcode");
+  }
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, 0, Block);
+}
+
+/* Byref variables, in Blocks, are declared by the programmer as "SomeType
+   VarName;", but the compiler creates a __Block_byref_x_VarName struct, and
+   gives the variable VarName either the struct, or a pointer to the struct, as
+   its type.  This is necessary for various behind-the-scenes things the
+   compiler needs to do with by-reference variables in Blocks.
+
+   However, as far as the original *programmer* is concerned, the variable
+   should still have type 'SomeType', as originally declared.
+
+   The function getBlockByrefType dives into the __Block_byref_x_VarName
+   struct to find the original type of the variable, which is then assigned to
+   the variable's Debug Information Entry as its real type.  So far, so good.
+   However now the debugger will expect the variable VarName to have the type
+   SomeType.  So we need the location attribute for the variable to be an
+   expression that explains to the debugger how to navigate through the
+   pointers and struct to find the actual variable of type SomeType.
+
+   The following function does just that.  We start by getting
+   the "normal" location for the variable. This will be the location
+   of either the struct __Block_byref_x_VarName or the pointer to the
+   struct __Block_byref_x_VarName.
+
+   The struct will look something like:
+
+   struct __Block_byref_x_VarName {
+     ... <various fields>
+     struct __Block_byref_x_VarName *forwarding;
+     ... <various other fields>
+     SomeType VarName;
+     ... <maybe more fields>
+   };
+
+   If we are given the struct directly (as our starting point) we
+   need to tell the debugger to:
+
+   1).  Add the offset of the forwarding field.
+
+   2).  Follow that pointer to get the real __Block_byref_x_VarName
+   struct to use (the real one may have been copied onto the heap).
+
+   3).  Add the offset for the field VarName, to find the actual variable.
+
+   If we started with a pointer to the struct, then we need to
+   dereference that pointer first, before the other steps.
+   Translating this into DWARF ops, we will need to append the following
+   to the current location description for the variable:
+
+   DW_OP_deref                    -- optional, if we start with a pointer
+   DW_OP_plus_uconst <forward_fld_offset>
+   DW_OP_deref
+   DW_OP_plus_uconst <varName_fld_offset>
+
+   That is what this function does.  */
+
+/// addBlockByrefAddress - Start with the address based on the location
+/// provided, and generate the DWARF information necessary to find the
+/// actual Block variable (navigating the Block struct) based on the
+/// starting location.  Add the DWARF information to the die.  For
+/// more information, read large comment just above here.
+///
+void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die,
+                                       unsigned Attribute,
+                                       const MachineLocation &Location) {
+  DIType Ty = DV->getType();
+  DIType TmpTy = Ty;
+  unsigned Tag = Ty.getTag();
+  bool isPointer = false;
+
+  StringRef varName = DV->getName();
+
+  if (Tag == dwarf::DW_TAG_pointer_type) {
+    DIDerivedType DTy = DIDerivedType(Ty);
+    TmpTy = DTy.getTypeDerivedFrom();
+    isPointer = true;
+  }
+
+  DICompositeType blockStruct = DICompositeType(TmpTy);
+
+  // Find the __forwarding field and the variable field in the __Block_byref
+  // struct.
+  DIArray Fields = blockStruct.getTypeArray();
+  DIDescriptor varField = DIDescriptor();
+  DIDescriptor forwardingField = DIDescriptor();
+
+  for (unsigned i = 0, N = Fields.getNumElements(); i < N; ++i) {
+    DIDescriptor Element = Fields.getElement(i);
+    DIDerivedType DT = DIDerivedType(Element);
+    StringRef fieldName = DT.getName();
+    if (fieldName == "__forwarding")
+      forwardingField = Element;
+    else if (fieldName == varName)
+      varField = Element;
+  }
+
+  // Get the offsets for the forwarding field and the variable field.
+  unsigned forwardingFieldOffset =
+    DIDerivedType(forwardingField).getOffsetInBits() >> 3;
+  unsigned varFieldOffset =
+    DIDerivedType(varField).getOffsetInBits() >> 3;
+
+  // Decode the original location, and use that as the start of the byref
+  // variable's location.
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+
+  if (Location.isReg())
+    addRegisterOp(Block, Location.getReg());
+  else
+    addRegisterOffset(Block, Location.getReg(), Location.getOffset());
+
+  // If we started with a pointer to the __Block_byref... struct, then
+  // the first thing we need to do is dereference the pointer (DW_OP_deref).
+  if (isPointer)
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+
+  // Next add the offset for the '__forwarding' field:
+  // DW_OP_plus_uconst ForwardingFieldOffset.  Note there's no point in
+  // adding the offset if it's 0.
+  if (forwardingFieldOffset > 0) {
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(Block, 0, dwarf::DW_FORM_udata, forwardingFieldOffset);
+  }
+
+  // Now dereference the __forwarding field to get to the real __Block_byref
+  // struct:  DW_OP_deref.
+  addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+
+  // Now that we've got the real __Block_byref... struct, add the offset
+  // for the variable's field to get to the location of the actual variable:
+  // DW_OP_plus_uconst varFieldOffset.  Again, don't add if it's 0.
+  if (varFieldOffset > 0) {
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(Block, 0, dwarf::DW_FORM_udata, varFieldOffset);
+  }
+
+  // Now attach the location information to the DIE.
+  addBlock(Die, Attribute, 0, Block);
+}
+
+/// isTypeSigned - Return true if the type is signed.
+static bool isTypeSigned(DIType Ty, int *SizeInBits) {
+  if (Ty.isDerivedType())
+    return isTypeSigned(DIDerivedType(Ty).getTypeDerivedFrom(), SizeInBits);
+  if (Ty.isBasicType())
+    if (DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed
+        || DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed_char) {
+      *SizeInBits = Ty.getSizeInBits();
+      return true;
+    }
+  return false;
+}
+
+/// addConstantValue - Add constant value entry in variable DIE.
+bool CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
+                                   DIType Ty) {
+  assert(MO.isImm() && "Invalid machine operand!");
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+  int SizeInBits = -1;
+  bool SignedConstant = isTypeSigned(Ty, &SizeInBits);
+  unsigned Form = SignedConstant ? dwarf::DW_FORM_sdata : dwarf::DW_FORM_udata;
+  switch (SizeInBits) {
+    case 8:  Form = dwarf::DW_FORM_data1; break;
+    case 16: Form = dwarf::DW_FORM_data2; break;
+    case 32: Form = dwarf::DW_FORM_data4; break;
+    case 64: Form = dwarf::DW_FORM_data8; break;
+    default: break;
+  }
+  SignedConstant ? addSInt(Block, 0, Form, MO.getImm())
+    : addUInt(Block, 0, Form, MO.getImm());
+
+  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+  return true;
+}
+
+/// addConstantFPValue - Add constant value entry in variable DIE.
+bool CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
+  assert (MO.isFPImm() && "Invalid machine operand!");
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+  APFloat FPImm = MO.getFPImm()->getValueAPF();
+
+  // Get the raw data form of the floating point.
+  const APInt FltVal = FPImm.bitcastToAPInt();
+  const char *FltPtr = (const char*)FltVal.getRawData();
+
+  int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte.
+  bool LittleEndian = Asm->getDataLayout().isLittleEndian();
+  int Incr = (LittleEndian ? 1 : -1);
+  int Start = (LittleEndian ? 0 : NumBytes - 1);
+  int Stop = (LittleEndian ? NumBytes : -1);
+
+  // Output the constant to DWARF one byte at a time.
+  for (; Start != Stop; Start += Incr)
+    addUInt(Block, 0, dwarf::DW_FORM_data1,
+            (unsigned char)0xFF & FltPtr[Start]);
+
+  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+  return true;
+}
+
+/// addConstantFPValue - Add constant value entry in variable DIE.
+bool CompileUnit::addConstantFPValue(DIE *Die, const ConstantFP *CFP) {
+  return addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), false);
+}
+
+/// addConstantValue - Add constant value entry in variable DIE.
+bool CompileUnit::addConstantValue(DIE *Die, const ConstantInt *CI,
+                                   bool Unsigned) {
+  return addConstantValue(Die, CI->getValue(), Unsigned);
+}
+
+// addConstantValue - Add constant value entry in variable DIE.
+bool CompileUnit::addConstantValue(DIE *Die, const APInt &Val,
+                                   bool Unsigned) {
+  unsigned CIBitWidth = Val.getBitWidth();
+  if (CIBitWidth <= 64) {
+    unsigned form = 0;
+    switch (CIBitWidth) {
+    case 8: form = dwarf::DW_FORM_data1; break;
+    case 16: form = dwarf::DW_FORM_data2; break;
+    case 32: form = dwarf::DW_FORM_data4; break;
+    case 64: form = dwarf::DW_FORM_data8; break;
+    default:
+      form = Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata;
+    }
+    if (Unsigned)
+      addUInt(Die, dwarf::DW_AT_const_value, form, Val.getZExtValue());
+    else
+      addSInt(Die, dwarf::DW_AT_const_value, form, Val.getSExtValue());
+    return true;
+  }
+
+  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+
+  // Get the raw data form of the large APInt.
+  const uint64_t *Ptr64 = Val.getRawData();
+
+  int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte.
+  bool LittleEndian = Asm->getDataLayout().isLittleEndian();
+
+  // Output the constant to DWARF one byte at a time.
+  for (int i = 0; i < NumBytes; i++) {
+    uint8_t c;
+    if (LittleEndian)
+      c = Ptr64[i / 8] >> (8 * (i & 7));
+    else
+      c = Ptr64[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7));
+    addUInt(Block, 0, dwarf::DW_FORM_data1, c);
+  }
+
+  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
+  return true;
+}
+
+/// addTemplateParams - Add template parameters in buffer.
+void CompileUnit::addTemplateParams(DIE &Buffer, DIArray TParams) {
+  // Add template parameters.
+  for (unsigned i = 0, e = TParams.getNumElements(); i != e; ++i) {
+    DIDescriptor Element = TParams.getElement(i);
+    if (Element.isTemplateTypeParameter())
+      Buffer.addChild(getOrCreateTemplateTypeParameterDIE(
+                        DITemplateTypeParameter(Element)));
+    else if (Element.isTemplateValueParameter())
+      Buffer.addChild(getOrCreateTemplateValueParameterDIE(
+                        DITemplateValueParameter(Element)));
+  }
+}
+
+/// getOrCreateContextDIE - Get context owner's DIE.
+DIE *CompileUnit::getOrCreateContextDIE(DIDescriptor Context) {
+  if (Context.isType())
+    return getOrCreateTypeDIE(DIType(Context));
+  else if (Context.isNameSpace())
+    return getOrCreateNameSpace(DINameSpace(Context));
+  else if (Context.isSubprogram())
+    return getOrCreateSubprogramDIE(DISubprogram(Context));
+  else 
+    return getDIE(Context);
+}
+
+/// addToContextOwner - Add Die into the list of its context owner's children.
+void CompileUnit::addToContextOwner(DIE *Die, DIDescriptor Context) {
+  if (DIE *ContextDIE = getOrCreateContextDIE(Context))
+    ContextDIE->addChild(Die);
+  else
+    addDie(Die);
+}
+
+/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
+/// given DIType.
+DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
+  DIType Ty(TyNode);
+  if (!Ty.Verify())
+    return NULL;
+  DIE *TyDIE = getDIE(Ty);
+  if (TyDIE)
+    return TyDIE;
+
+  // Create new type.
+  TyDIE = new DIE(dwarf::DW_TAG_base_type);
+  insertDIE(Ty, TyDIE);
+  if (Ty.isBasicType())
+    constructTypeDIE(*TyDIE, DIBasicType(Ty));
+  else if (Ty.isCompositeType())
+    constructTypeDIE(*TyDIE, DICompositeType(Ty));
+  else {
+    assert(Ty.isDerivedType() && "Unknown kind of DIType");
+    constructTypeDIE(*TyDIE, DIDerivedType(Ty));
+  }
+  // If this is a named finished type then include it in the list of types
+  // for the accelerator tables.
+  if (!Ty.getName().empty() && !Ty.isForwardDecl()) {
+    bool IsImplementation = 0;
+    if (Ty.isCompositeType()) {
+      DICompositeType CT(Ty);
+      // A runtime language of 0 actually means C/C++ and that any
+      // non-negative value is some version of Objective-C/C++.
+      IsImplementation = (CT.getRunTimeLang() == 0) ||
+        CT.isObjcClassComplete();
+    }
+    unsigned Flags = IsImplementation ?
+                     DwarfAccelTable::eTypeFlagClassIsImplementation : 0;
+    addAccelType(Ty.getName(), std::make_pair(TyDIE, Flags));
+  }
+
+  addToContextOwner(TyDIE, Ty.getContext());
+  return TyDIE;
+}
+
+/// addType - Add a new type attribute to the specified entity.
+void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) {
+  if (!Ty.Verify())
+    return;
+
+  // Check for pre-existence.
+  DIEEntry *Entry = getDIEEntry(Ty);
+  // If it exists then use the existing value.
+  if (Entry) {
+    Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry);
+    return;
+  }
+
+  // Construct type.
+  DIE *Buffer = getOrCreateTypeDIE(Ty);
+
+  // Set up proxy.
+  Entry = createDIEEntry(Buffer);
+  insertDIEEntry(Ty, Entry);
+  Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry);
+
+  // If this is a complete composite type then include it in the
+  // list of global types.
+  addGlobalType(Ty);
+}
+
+/// addGlobalType - Add a new global type to the compile unit.
+///
+void CompileUnit::addGlobalType(DIType Ty) {
+  DIDescriptor Context = Ty.getContext();
+  if (Ty.isCompositeType() && !Ty.getName().empty() && !Ty.isForwardDecl()
+      && (!Context || Context.isCompileUnit() || Context.isFile()
+          || Context.isNameSpace()))
+    if (DIEEntry *Entry = getDIEEntry(Ty))
+      GlobalTypes[Ty.getName()] = Entry->getEntry();
+}
+
+/// addPubTypes - Add type for pubtypes section.
+void CompileUnit::addPubTypes(DISubprogram SP) {
+  DICompositeType SPTy = SP.getType();
+  unsigned SPTag = SPTy.getTag();
+  if (SPTag != dwarf::DW_TAG_subroutine_type)
+    return;
+
+  DIArray Args = SPTy.getTypeArray();
+  for (unsigned i = 0, e = Args.getNumElements(); i != e; ++i) {
+    DIType ATy(Args.getElement(i));
+    if (!ATy.Verify())
+      continue;
+    addGlobalType(ATy);
+  }
+}
+
+/// constructTypeDIE - Construct basic type die from DIBasicType.
+void CompileUnit::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
+  // Get core information.
+  StringRef Name = BTy.getName();
+  // Add name if not anonymous or intermediate type.
+  if (!Name.empty())
+    addString(&Buffer, dwarf::DW_AT_name, Name);
+
+  if (BTy.getTag() == dwarf::DW_TAG_unspecified_type) {
+    Buffer.setTag(dwarf::DW_TAG_unspecified_type);
+    // Unspecified types has only name, nothing else.
+    return;
+  }
+
+  Buffer.setTag(dwarf::DW_TAG_base_type);
+  addUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+          BTy.getEncoding());
+
+  uint64_t Size = BTy.getSizeInBits() >> 3;
+  addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+}
+
+/// constructTypeDIE - Construct derived type die from DIDerivedType.
+void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
+  // Get core information.
+  StringRef Name = DTy.getName();
+  uint64_t Size = DTy.getSizeInBits() >> 3;
+  unsigned Tag = DTy.getTag();
+
+  // FIXME - Workaround for templates.
+  if (Tag == dwarf::DW_TAG_inheritance) Tag = dwarf::DW_TAG_reference_type;
+
+  Buffer.setTag(Tag);
+
+  // Map to main type, void will not have a type.
+  DIType FromTy = DTy.getTypeDerivedFrom();
+  addType(&Buffer, FromTy);
+
+  // Add name if not anonymous or intermediate type.
+  if (!Name.empty())
+    addString(&Buffer, dwarf::DW_AT_name, Name);
+
+  // Add size if non-zero (derived types might be zero-sized.)
+  if (Size && Tag != dwarf::DW_TAG_pointer_type)
+    addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+
+  if (Tag == dwarf::DW_TAG_ptr_to_member_type)
+      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
+                  getOrCreateTypeDIE(DTy.getClassType()));
+  // Add source line info if available and TyDesc is not a forward declaration.
+  if (!DTy.isForwardDecl())
+    addSourceLine(&Buffer, DTy);
+}
+
+/// constructTypeDIE - Construct type DIE from DICompositeType.
+void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
+  // Get core information.
+  StringRef Name = CTy.getName();
+
+  uint64_t Size = CTy.getSizeInBits() >> 3;
+  unsigned Tag = CTy.getTag();
+  Buffer.setTag(Tag);
+
+  switch (Tag) {
+  case dwarf::DW_TAG_array_type:
+    constructArrayTypeDIE(Buffer, &CTy);
+    break;
+  case dwarf::DW_TAG_enumeration_type: {
+    DIArray Elements = CTy.getTypeArray();
+
+    // Add enumerators to enumeration type.
+    for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+      DIE *ElemDie = NULL;
+      DIDescriptor Enum(Elements.getElement(i));
+      if (Enum.isEnumerator()) {
+        ElemDie = constructEnumTypeDIE(DIEnumerator(Enum));
+        Buffer.addChild(ElemDie);
+      }
+    }
+    DIType DTy = CTy.getTypeDerivedFrom();
+    if (DTy.Verify()) {
+      addType(&Buffer, DTy);
+      addUInt(&Buffer, dwarf::DW_AT_enum_class, dwarf::DW_FORM_flag, 1);
+    }
+  }
+    break;
+  case dwarf::DW_TAG_subroutine_type: {
+    // Add return type.
+    DIArray Elements = CTy.getTypeArray();
+    DIDescriptor RTy = Elements.getElement(0);
+    addType(&Buffer, DIType(RTy));
+
+    bool isPrototyped = true;
+    // Add arguments.
+    for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) {
+      DIDescriptor Ty = Elements.getElement(i);
+      if (Ty.isUnspecifiedParameter()) {
+        DIE *Arg = new DIE(dwarf::DW_TAG_unspecified_parameters);
+        Buffer.addChild(Arg);
+        isPrototyped = false;
+      } else {
+        DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+        addType(Arg, DIType(Ty));
+        if (DIType(Ty).isArtificial())
+          addFlag(Arg, dwarf::DW_AT_artificial);
+        Buffer.addChild(Arg);
+      }
+    }
+    // Add prototype flag if we're dealing with a C language and the
+    // function has been prototyped.
+    if (isPrototyped &&
+        (Language == dwarf::DW_LANG_C89 ||
+         Language == dwarf::DW_LANG_C99 ||
+         Language == dwarf::DW_LANG_ObjC))
+      addFlag(&Buffer, dwarf::DW_AT_prototyped);
+  }
+    break;
+  case dwarf::DW_TAG_structure_type:
+  case dwarf::DW_TAG_union_type:
+  case dwarf::DW_TAG_class_type: {
+    // Add elements to structure type.
+    DIArray Elements = CTy.getTypeArray();
+
+    // A forward struct declared type may not have elements available.
+    unsigned N = Elements.getNumElements();
+    if (N == 0)
+      break;
+
+    // Add elements to structure type.
+    for (unsigned i = 0; i < N; ++i) {
+      DIDescriptor Element = Elements.getElement(i);
+      DIE *ElemDie = NULL;
+      if (Element.isSubprogram()) {
+        DISubprogram SP(Element);
+        ElemDie = getOrCreateSubprogramDIE(DISubprogram(Element));
+        if (SP.isProtected())
+          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+                  dwarf::DW_ACCESS_protected);
+        else if (SP.isPrivate())
+          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+                  dwarf::DW_ACCESS_private);
+        else
+          addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_public);
+        if (SP.isExplicit())
+          addFlag(ElemDie, dwarf::DW_AT_explicit);
+      } else if (Element.isDerivedType()) {
+        DIDerivedType DDTy(Element);
+        if (DDTy.getTag() == dwarf::DW_TAG_friend) {
+          ElemDie = new DIE(dwarf::DW_TAG_friend);
+          addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend);
+        } else if (DDTy.isStaticMember())
+          ElemDie = createStaticMemberDIE(DDTy);
+        else
+          ElemDie = createMemberDIE(DDTy);
+      } else if (Element.isObjCProperty()) {
+        DIObjCProperty Property(Element);
+        ElemDie = new DIE(Property.getTag());
+        StringRef PropertyName = Property.getObjCPropertyName();
+        addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
+        addType(ElemDie, Property.getType());
+        addSourceLine(ElemDie, Property);
+        StringRef GetterName = Property.getObjCPropertyGetterName();
+        if (!GetterName.empty())
+          addString(ElemDie, dwarf::DW_AT_APPLE_property_getter, GetterName);
+        StringRef SetterName = Property.getObjCPropertySetterName();
+        if (!SetterName.empty())
+          addString(ElemDie, dwarf::DW_AT_APPLE_property_setter, SetterName);
+        unsigned PropertyAttributes = 0;
+        if (Property.isReadOnlyObjCProperty())
+          PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_readonly;
+        if (Property.isReadWriteObjCProperty())
+          PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_readwrite;
+        if (Property.isAssignObjCProperty())
+          PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_assign;
+        if (Property.isRetainObjCProperty())
+          PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_retain;
+        if (Property.isCopyObjCProperty())
+          PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_copy;
+        if (Property.isNonAtomicObjCProperty())
+          PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_nonatomic;
+        if (PropertyAttributes)
+          addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, 0,
+                 PropertyAttributes);
+
+        DIEEntry *Entry = getDIEEntry(Element);
+        if (!Entry) {
+          Entry = createDIEEntry(ElemDie);
+          insertDIEEntry(Element, Entry);
+        }
+      } else
+        continue;
+      Buffer.addChild(ElemDie);
+    }
+
+    if (CTy.isAppleBlockExtension())
+      addFlag(&Buffer, dwarf::DW_AT_APPLE_block);
+
+    DICompositeType ContainingType = CTy.getContainingType();
+    if (DIDescriptor(ContainingType).isCompositeType())
+      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
+                  getOrCreateTypeDIE(DIType(ContainingType)));
+    else {
+      DIDescriptor Context = CTy.getContext();
+      addToContextOwner(&Buffer, Context);
+    }
+
+    if (CTy.isObjcClassComplete())
+      addFlag(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
+
+    // Add template parameters to a class, structure or union types.
+    // FIXME: The support isn't in the metadata for this yet.
+    if (Tag == dwarf::DW_TAG_class_type ||
+        Tag == dwarf::DW_TAG_structure_type ||
+        Tag == dwarf::DW_TAG_union_type)
+      addTemplateParams(Buffer, CTy.getTemplateParams());
+
+    break;
+  }
+  default:
+    break;
+  }
+
+  // Add name if not anonymous or intermediate type.
+  if (!Name.empty())
+    addString(&Buffer, dwarf::DW_AT_name, Name);
+
+  if (Tag == dwarf::DW_TAG_enumeration_type ||
+      Tag == dwarf::DW_TAG_class_type ||
+      Tag == dwarf::DW_TAG_structure_type ||
+      Tag == dwarf::DW_TAG_union_type) {
+    // Add size if non-zero (derived types might be zero-sized.)
+    // TODO: Do we care about size for enum forward declarations?
+    if (Size)
+      addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+    else if (!CTy.isForwardDecl())
+      // Add zero size if it is not a forward declaration.
+      addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
+
+    // If we're a forward decl, say so.
+    if (CTy.isForwardDecl())
+      addFlag(&Buffer, dwarf::DW_AT_declaration);
+
+    // Add source line info if available.
+    if (!CTy.isForwardDecl())
+      addSourceLine(&Buffer, CTy);
+
+    // No harm in adding the runtime language to the declaration.
+    unsigned RLang = CTy.getRunTimeLang();
+    if (RLang)
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class,
+              dwarf::DW_FORM_data1, RLang);
+  }
+}
+
+/// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
+/// for the given DITemplateTypeParameter.
+DIE *
+CompileUnit::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) {
+  DIE *ParamDIE = getDIE(TP);
+  if (ParamDIE)
+    return ParamDIE;
+
+  ParamDIE = new DIE(dwarf::DW_TAG_template_type_parameter);
+  addType(ParamDIE, TP.getType());
+  addString(ParamDIE, dwarf::DW_AT_name, TP.getName());
+  return ParamDIE;
+}
+
+/// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE
+/// for the given DITemplateValueParameter.
+DIE *
+CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV){
+  DIE *ParamDIE = getDIE(TPV);
+  if (ParamDIE)
+    return ParamDIE;
+
+  ParamDIE = new DIE(dwarf::DW_TAG_template_value_parameter);
+  addType(ParamDIE, TPV.getType());
+  if (!TPV.getName().empty())
+    addString(ParamDIE, dwarf::DW_AT_name, TPV.getName());
+  addUInt(ParamDIE, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
+          TPV.getValue());
+  return ParamDIE;
+}
+
+/// getOrCreateNameSpace - Create a DIE for DINameSpace.
+DIE *CompileUnit::getOrCreateNameSpace(DINameSpace NS) {
+  DIE *NDie = getDIE(NS);
+  if (NDie)
+    return NDie;
+  NDie = new DIE(dwarf::DW_TAG_namespace);
+  insertDIE(NS, NDie);
+  if (!NS.getName().empty()) {
+    addString(NDie, dwarf::DW_AT_name, NS.getName());
+    addAccelNamespace(NS.getName(), NDie);
+  } else
+    addAccelNamespace("(anonymous namespace)", NDie);
+  addSourceLine(NDie, NS);
+  addToContextOwner(NDie, NS.getContext());
+  return NDie;
+}
+
+/// getRealLinkageName - If special LLVM prefix that is used to inform the asm
+/// printer to not emit usual symbol prefix before the symbol name is used then
+/// return linkage name after skipping this special LLVM prefix.
+static StringRef getRealLinkageName(StringRef LinkageName) {
+  char One = '\1';
+  if (LinkageName.startswith(StringRef(&One, 1)))
+    return LinkageName.substr(1);
+  return LinkageName;
+}
+
+/// getOrCreateSubprogramDIE - Create new DIE using SP.
+DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
+  DIE *SPDie = getDIE(SP);
+  if (SPDie)
+    return SPDie;
+
+  SPDie = new DIE(dwarf::DW_TAG_subprogram);
+
+  // DW_TAG_inlined_subroutine may refer to this DIE.
+  insertDIE(SP, SPDie);
+
+  DISubprogram SPDecl = SP.getFunctionDeclaration();
+  DIE *DeclDie = NULL;
+  if (SPDecl.isSubprogram()) {
+    DeclDie = getOrCreateSubprogramDIE(SPDecl);
+  }
+
+  // Add to context owner.
+  addToContextOwner(SPDie, SP.getContext());
+
+  // Add function template parameters.
+  addTemplateParams(*SPDie, SP.getTemplateParams());
+
+  // Unfortunately this code needs to stay here instead of below the
+  // AT_specification code in order to work around a bug in older
+  // gdbs that requires the linkage name to resolve multiple template
+  // functions.
+  // TODO: Remove this set of code when we get rid of the old gdb
+  // compatibility.
+  StringRef LinkageName = SP.getLinkageName();
+  if (!LinkageName.empty() && DD->useDarwinGDBCompat())
+    addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
+              getRealLinkageName(LinkageName));
+
+  // If this DIE is going to refer declaration info using AT_specification
+  // then there is no need to add other attributes.
+  if (DeclDie) {
+    // Refer function declaration directly.
+    addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
+                DeclDie);
+
+    return SPDie;
+  }
+
+  // Add the linkage name if we have one.
+  if (!LinkageName.empty() && !DD->useDarwinGDBCompat())
+    addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
+              getRealLinkageName(LinkageName));
+
+  // Constructors and operators for anonymous aggregates do not have names.
+  if (!SP.getName().empty())
+    addString(SPDie, dwarf::DW_AT_name, SP.getName());
+
+  addSourceLine(SPDie, SP);
+
+  // Add the prototype if we have a prototype and we have a C like
+  // language.
+  if (SP.isPrototyped() &&
+      (Language == dwarf::DW_LANG_C89 ||
+       Language == dwarf::DW_LANG_C99 ||
+       Language == dwarf::DW_LANG_ObjC))
+    addFlag(SPDie, dwarf::DW_AT_prototyped);
+
+  // Add Return Type.
+  DICompositeType SPTy = SP.getType();
+  DIArray Args = SPTy.getTypeArray();
+  unsigned SPTag = SPTy.getTag();
+
+  if (Args.getNumElements() == 0 || SPTag != dwarf::DW_TAG_subroutine_type)
+    addType(SPDie, SPTy);
+  else
+    addType(SPDie, DIType(Args.getElement(0)));
+
+  unsigned VK = SP.getVirtuality();
+  if (VK) {
+    addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, VK);
+    DIEBlock *Block = getDIEBlock();
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex());
+    addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
+    ContainingTypeMap.insert(std::make_pair(SPDie,
+                                            SP.getContainingType()));
+  }
+
+  if (!SP.isDefinition()) {
+    addFlag(SPDie, dwarf::DW_AT_declaration);
+
+    // Add arguments. Do not add arguments for subprogram definition. They will
+    // be handled while processing variables.
+    DICompositeType SPTy = SP.getType();
+    DIArray Args = SPTy.getTypeArray();
+    unsigned SPTag = SPTy.getTag();
+
+    if (SPTag == dwarf::DW_TAG_subroutine_type)
+      for (unsigned i = 1, N =  Args.getNumElements(); i < N; ++i) {
+        DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+        DIType ATy = DIType(Args.getElement(i));
+        addType(Arg, ATy);
+        if (ATy.isArtificial())
+          addFlag(Arg, dwarf::DW_AT_artificial);
+        SPDie->addChild(Arg);
+      }
+  }
+
+  if (SP.isArtificial())
+    addFlag(SPDie, dwarf::DW_AT_artificial);
+
+  if (!SP.isLocalToUnit())
+    addFlag(SPDie, dwarf::DW_AT_external);
+
+  if (SP.isOptimized())
+    addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);
+
+  if (unsigned isa = Asm->getISAEncoding()) {
+    addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
+  }
+
+  return SPDie;
+}
+
+// Return const expression if value is a GEP to access merged global
+// constant. e.g.
+// i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 0)
+static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
+  const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(V);
+  if (!CE || CE->getNumOperands() != 3 ||
+      CE->getOpcode() != Instruction::GetElementPtr)
+    return NULL;
+
+  // First operand points to a global struct.
+  Value *Ptr = CE->getOperand(0);
+  if (!isa<GlobalValue>(Ptr) ||
+      !isa<StructType>(cast<PointerType>(Ptr->getType())->getElementType()))
+    return NULL;
+
+  // Second operand is zero.
+  const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CE->getOperand(1));
+  if (!CI || !CI->isZero())
+    return NULL;
+
+  // Third operand is offset.
+  if (!isa<ConstantInt>(CE->getOperand(2)))
+    return NULL;
+
+  return CE;
+}
+
+/// createGlobalVariableDIE - create global variable DIE.
+void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
+  // Check for pre-existence.
+  if (getDIE(N))
+    return;
+
+  DIGlobalVariable GV(N);
+  if (!GV.Verify())
+    return;
+
+  DIDescriptor GVContext = GV.getContext();
+  DIType GTy = GV.getType();
+
+  // If this is a static data member definition, some attributes belong
+  // to the declaration DIE.
+  DIE *VariableDIE = NULL;
+  bool IsStaticMember = false;
+  DIDerivedType SDMDecl = GV.getStaticDataMemberDeclaration();
+  if (SDMDecl.Verify()) {
+    assert(SDMDecl.isStaticMember() && "Expected static member decl");
+    // We need the declaration DIE that is in the static member's class.
+    // But that class might not exist in the DWARF yet.
+    // Creating the class will create the static member decl DIE.
+    getOrCreateContextDIE(SDMDecl.getContext());
+    VariableDIE = getDIE(SDMDecl);
+    assert(VariableDIE && "Static member decl has no context?");
+    IsStaticMember = true;
+  }
+
+  // If this is not a static data member definition, create the variable
+  // DIE and add the initial set of attributes to it.
+  if (!VariableDIE) {
+    VariableDIE = new DIE(GV.getTag());
+    // Add to map.
+    insertDIE(N, VariableDIE);
+
+    // Add name and type.
+    addString(VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
+    addType(VariableDIE, GTy);
+
+    // Add scoping info.
+    if (!GV.isLocalToUnit()) {
+      addFlag(VariableDIE, dwarf::DW_AT_external);
+      addGlobalName(GV.getName(), VariableDIE);
+    }
+
+    // Add line number info.
+    addSourceLine(VariableDIE, GV);
+    // Add to context owner.
+    addToContextOwner(VariableDIE, GVContext);
+  }
+
+  // Add location.
+  bool addToAccelTable = false;
+  DIE *VariableSpecDIE = NULL;
+  bool isGlobalVariable = GV.getGlobal() != NULL;
+  if (isGlobalVariable) {
+    addToAccelTable = true;
+    DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+    addOpAddress(Block, Asm->Mang->getSymbol(GV.getGlobal()));
+    // Do not create specification DIE if context is either compile unit
+    // or a subprogram.
+    if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() &&
+        !GVContext.isFile() && !isSubprogramContext(GVContext)) {
+      // Create specification DIE.
+      VariableSpecDIE = new DIE(dwarf::DW_TAG_variable);
+      addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
+                  dwarf::DW_FORM_ref4, VariableDIE);
+      addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
+      // A static member's declaration is already flagged as such.
+      if (!SDMDecl.Verify())
+        addFlag(VariableDIE, dwarf::DW_AT_declaration);
+      addDie(VariableSpecDIE);
+    } else {
+      addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+    }
+    // Add linkage name.
+    StringRef LinkageName = GV.getLinkageName();
+    if (!LinkageName.empty()) {
+      // From DWARF4: DIEs to which DW_AT_linkage_name may apply include:
+      // TAG_common_block, TAG_constant, TAG_entry_point, TAG_subprogram and
+      // TAG_variable.
+      addString(IsStaticMember && VariableSpecDIE ?
+                VariableSpecDIE : VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
+                getRealLinkageName(LinkageName));
+      // In compatibility mode with older gdbs we put the linkage name on both
+      // the TAG_variable DIE and on the TAG_member DIE.
+      if (IsStaticMember && VariableSpecDIE && DD->useDarwinGDBCompat())
+        addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
+                  getRealLinkageName(LinkageName));
+    }
+  } else if (const ConstantInt *CI =
+             dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
+    // AT_const_value was added when the static member was created. To avoid
+    // emitting AT_const_value multiple times, we only add AT_const_value when
+    // it is not a static member.
+    if (!IsStaticMember)
+      addConstantValue(VariableDIE, CI, GTy.isUnsignedDIType());
+  } else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
+    addToAccelTable = true;
+    // GV is a merged global.
+    DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+    Value *Ptr = CE->getOperand(0);
+    addOpAddress(Block, Asm->Mang->getSymbol(cast<GlobalValue>(Ptr)));
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    SmallVector<Value*, 3> Idx(CE->op_begin()+1, CE->op_end());
+    addUInt(Block, 0, dwarf::DW_FORM_udata,
+                   Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
+    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+  }
+
+  if (addToAccelTable) {
+    DIE *AddrDIE = VariableSpecDIE ? VariableSpecDIE : VariableDIE;
+    addAccelName(GV.getName(), AddrDIE);
+
+    // If the linkage name is different than the name, go ahead and output
+    // that as well into the name table.
+    if (GV.getLinkageName() != "" && GV.getName() != GV.getLinkageName())
+      addAccelName(GV.getLinkageName(), AddrDIE);
+  }
+
+  return;
+}
+
+/// constructSubrangeDIE - Construct subrange DIE from DISubrange.
+void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR,
+                                       DIE *IndexTy) {
+  DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
+  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
+
+  // The LowerBound value defines the lower bounds which is typically zero for
+  // C/C++. The Count value is the number of elements.  Values are 64 bit. If
+  // Count == -1 then the array is unbounded and we do not emit
+  // DW_AT_lower_bound and DW_AT_upper_bound attributes. If LowerBound == 0 and
+  // Count == 0, then the array has zero elements in which case we do not emit
+  // an upper bound.
+  int64_t LowerBound = SR.getLo();
+  int64_t DefaultLowerBound = getDefaultLowerBound();
+  int64_t Count = SR.getCount();
+
+  if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
+    addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, LowerBound);
+
+  if (Count != -1 && Count != 0)
+    // FIXME: An unbounded array should reference the expression that defines
+    // the array.
+    addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, LowerBound + Count - 1);
+
+  Buffer.addChild(DW_Subrange);
+}
+
+/// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
+void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
+                                        DICompositeType *CTy) {
+  Buffer.setTag(dwarf::DW_TAG_array_type);
+  if (CTy->isVector())
+    addFlag(&Buffer, dwarf::DW_AT_GNU_vector);
+
+  // Emit derived type.
+  addType(&Buffer, CTy->getTypeDerivedFrom());
+  DIArray Elements = CTy->getTypeArray();
+
+  // Get an anonymous type for index type.
+  // FIXME: This type should be passed down from the front end
+  // as different languages may have different sizes for indexes.
+  DIE *IdxTy = getIndexTyDie();
+  if (!IdxTy) {
+    // Construct an anonymous type for index type.
+    IdxTy = new DIE(dwarf::DW_TAG_base_type);
+    addString(IdxTy, dwarf::DW_AT_name, "int");
+    addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
+    addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+            dwarf::DW_ATE_signed);
+    addDie(IdxTy);
+    setIndexTyDie(IdxTy);
+  }
+
+  // Add subranges to array type.
+  for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+    DIDescriptor Element = Elements.getElement(i);
+    if (Element.getTag() == dwarf::DW_TAG_subrange_type)
+      constructSubrangeDIE(Buffer, DISubrange(Element), IdxTy);
+  }
+}
+
+/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
+DIE *CompileUnit::constructEnumTypeDIE(DIEnumerator ETy) {
+  DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
+  StringRef Name = ETy.getName();
+  addString(Enumerator, dwarf::DW_AT_name, Name);
+  int64_t Value = ETy.getEnumValue();
+  addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
+  return Enumerator;
+}
+
+/// constructContainingTypeDIEs - Construct DIEs for types that contain
+/// vtables.
+void CompileUnit::constructContainingTypeDIEs() {
+  for (DenseMap<DIE *, const MDNode *>::iterator CI = ContainingTypeMap.begin(),
+         CE = ContainingTypeMap.end(); CI != CE; ++CI) {
+    DIE *SPDie = CI->first;
+    const MDNode *N = CI->second;
+    if (!N) continue;
+    DIE *NDie = getDIE(N);
+    if (!NDie) continue;
+    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+  }
+}
+
+/// constructVariableDIE - Construct a DIE for the given DbgVariable.
+DIE *CompileUnit::constructVariableDIE(DbgVariable *DV, bool isScopeAbstract) {
+  StringRef Name = DV->getName();
+
+  // Translate tag to proper Dwarf tag.
+  unsigned Tag = DV->getTag();
+
+  // Define variable debug information entry.
+  DIE *VariableDie = new DIE(Tag);
+  DbgVariable *AbsVar = DV->getAbstractVariable();
+  DIE *AbsDIE = AbsVar ? AbsVar->getDIE() : NULL;
+  if (AbsDIE)
+    addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin,
+                            dwarf::DW_FORM_ref4, AbsDIE);
+  else {
+    addString(VariableDie, dwarf::DW_AT_name, Name);
+    addSourceLine(VariableDie, DV->getVariable());
+    addType(VariableDie, DV->getType());
+  }
+
+  if (DV->isArtificial())
+    addFlag(VariableDie, dwarf::DW_AT_artificial);
+
+  if (isScopeAbstract) {
+    DV->setDIE(VariableDie);
+    return VariableDie;
+  }
+
+  // Add variable address.
+
+  unsigned Offset = DV->getDotDebugLocOffset();
+  if (Offset != ~0U) {
+    addLabel(VariableDie, dwarf::DW_AT_location,
+                         dwarf::DW_FORM_data4,
+                         Asm->GetTempSymbol("debug_loc", Offset));
+    DV->setDIE(VariableDie);
+    return VariableDie;
+  }
+
+  // Check if variable is described by a DBG_VALUE instruction.
+  if (const MachineInstr *DVInsn = DV->getMInsn()) {
+    bool updated = false;
+    if (DVInsn->getNumOperands() == 3) {
+      if (DVInsn->getOperand(0).isReg()) {
+        const MachineOperand RegOp = DVInsn->getOperand(0);
+        const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
+        if (DVInsn->getOperand(1).isImm() &&
+            TRI->getFrameRegister(*Asm->MF) == RegOp.getReg()) {
+          unsigned FrameReg = 0;
+          const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
+          int Offset =
+            TFI->getFrameIndexReference(*Asm->MF,
+                                        DVInsn->getOperand(1).getImm(),
+                                        FrameReg);
+          MachineLocation Location(FrameReg, Offset);
+          addVariableAddress(DV, VariableDie, Location);
+
+        } else if (RegOp.getReg())
+          addVariableAddress(DV, VariableDie,
+                                         MachineLocation(RegOp.getReg()));
+        updated = true;
+      }
+      else if (DVInsn->getOperand(0).isImm())
+        updated =
+          addConstantValue(VariableDie, DVInsn->getOperand(0),
+                                       DV->getType());
+      else if (DVInsn->getOperand(0).isFPImm())
+        updated =
+          addConstantFPValue(VariableDie, DVInsn->getOperand(0));
+      else if (DVInsn->getOperand(0).isCImm())
+        updated =
+          addConstantValue(VariableDie,
+                                       DVInsn->getOperand(0).getCImm(),
+                                       DV->getType().isUnsignedDIType());
+    } else {
+      addVariableAddress(DV, VariableDie,
+                                     Asm->getDebugValueLocation(DVInsn));
+      updated = true;
+    }
+    if (!updated) {
+      // If variableDie is not updated then DBG_VALUE instruction does not
+      // have valid variable info.
+      delete VariableDie;
+      return NULL;
+    }
+    DV->setDIE(VariableDie);
+    return VariableDie;
+  } else {
+    // .. else use frame index.
+    int FI = DV->getFrameIndex();
+    if (FI != ~0) {
+      unsigned FrameReg = 0;
+      const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
+      int Offset =
+        TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
+      MachineLocation Location(FrameReg, Offset);
+      addVariableAddress(DV, VariableDie, Location);
+    }
+  }
+
+  DV->setDIE(VariableDie);
+  return VariableDie;
+}
+
+/// createMemberDIE - Create new member DIE.
+DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
+  DIE *MemberDie = new DIE(DT.getTag());
+  StringRef Name = DT.getName();
+  if (!Name.empty())
+    addString(MemberDie, dwarf::DW_AT_name, Name);
+
+  addType(MemberDie, DT.getTypeDerivedFrom());
+
+  addSourceLine(MemberDie, DT);
+
+  DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock();
+  addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+
+  uint64_t Size = DT.getSizeInBits();
+  uint64_t FieldSize = DT.getOriginalTypeSize();
+
+  if (Size != FieldSize) {
+    // Handle bitfield.
+    addUInt(MemberDie, dwarf::DW_AT_byte_size, 0, DT.getOriginalTypeSize()>>3);
+    addUInt(MemberDie, dwarf::DW_AT_bit_size, 0, DT.getSizeInBits());
+
+    uint64_t Offset = DT.getOffsetInBits();
+    uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
+    uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+    uint64_t FieldOffset = (HiMark - FieldSize);
+    Offset -= FieldOffset;
+
+    // Maybe we need to work from the other end.
+    if (Asm->getDataLayout().isLittleEndian())
+      Offset = FieldSize - (Offset + Size);
+    addUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset);
+
+    // Here WD_AT_data_member_location points to the anonymous
+    // field that includes this bit field.
+    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, FieldOffset >> 3);
+
+  } else
+    // This is not a bitfield.
+    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits() >> 3);
+
+  if (DT.getTag() == dwarf::DW_TAG_inheritance
+      && DT.isVirtual()) {
+
+    // For C++, virtual base classes are not at fixed offset. Use following
+    // expression to extract appropriate offset from vtable.
+    // BaseAddr = ObAddr + *((*ObAddr) - Offset)
+
+    DIEBlock *VBaseLocationDie = new (DIEValueAllocator) DIEBlock();
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits());
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+
+    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0,
+             VBaseLocationDie);
+  } else
+    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie);
+
+  if (DT.isProtected())
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_protected);
+  else if (DT.isPrivate())
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_private);
+  // Otherwise C++ member and base classes are considered public.
+  else
+    addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_public);
+  if (DT.isVirtual())
+    addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1,
+            dwarf::DW_VIRTUALITY_virtual);
+
+  // Objective-C properties.
+  if (MDNode *PNode = DT.getObjCProperty())
+    if (DIEEntry *PropertyDie = getDIEEntry(PNode))
+      MemberDie->addValue(dwarf::DW_AT_APPLE_property, dwarf::DW_FORM_ref4,
+                          PropertyDie);
+
+  if (DT.isArtificial())
+    addFlag(MemberDie, dwarf::DW_AT_artificial);
+
+  return MemberDie;
+}
+
+/// createStaticMemberDIE - Create new DIE for C++ static member.
+DIE *CompileUnit::createStaticMemberDIE(const DIDerivedType DT) {
+  if (!DT.Verify())
+    return NULL;
+
+  DIE *StaticMemberDIE = new DIE(DT.getTag());
+  DIType Ty = DT.getTypeDerivedFrom();
+
+  addString(StaticMemberDIE, dwarf::DW_AT_name, DT.getName());
+  addType(StaticMemberDIE, Ty);
+  addSourceLine(StaticMemberDIE, DT);
+  addFlag(StaticMemberDIE, dwarf::DW_AT_external);
+  addFlag(StaticMemberDIE, dwarf::DW_AT_declaration);
+
+  // FIXME: We could omit private if the parent is a class_type, and
+  // public if the parent is something else.
+  if (DT.isProtected())
+    addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_protected);
+  else if (DT.isPrivate())
+    addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_private);
+  else
+    addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+            dwarf::DW_ACCESS_public);
+
+  if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT.getConstant()))
+    addConstantValue(StaticMemberDIE, CI, Ty.isUnsignedDIType());
+  if (const ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(DT.getConstant()))
+    addConstantFPValue(StaticMemberDIE, CFP);
+
+  insertDIE(DT, StaticMemberDIE);
+  return StaticMemberDIE;
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
new file mode 100644
index 000000000000..2b180c6cc3f4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -0,0 +1,383 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+#define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+
+#include "DIE.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/DebugInfo.h"
+
+namespace llvm {
+
+class DwarfDebug;
+class DwarfUnits;
+class MachineLocation;
+class MachineOperand;
+class ConstantInt;
+class ConstantFP;
+class DbgVariable;
+
+//===----------------------------------------------------------------------===//
+/// CompileUnit - This dwarf writer support class manages information associated
+/// with a source file.
+class CompileUnit {
+  /// UniqueID - a numeric ID unique among all CUs in the module
+  ///
+  unsigned UniqueID;
+
+  /// Language - The DW_AT_language of the compile unit
+  ///
+  unsigned Language;
+
+  /// Die - Compile unit debug information entry.
+  ///
+  const OwningPtr<DIE> CUDie;
+
+  /// Asm - Target of Dwarf emission.
+  AsmPrinter *Asm;
+
+  // Holders for some common dwarf information.
+  DwarfDebug *DD;
+  DwarfUnits *DU;
+
+  /// IndexTyDie - An anonymous type for index type.  Owned by CUDie.
+  DIE *IndexTyDie;
+
+  /// MDNodeToDieMap - Tracks the mapping of unit level debug informaton
+  /// variables to debug information entries.
+  DenseMap<const MDNode *, DIE *> MDNodeToDieMap;
+
+  /// MDNodeToDIEEntryMap - Tracks the mapping of unit level debug informaton
+  /// descriptors to debug information entries using a DIEEntry proxy.
+  DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap;
+
+  /// GlobalNames - A map of globally visible named entities for this unit.
+  ///
+  StringMap<DIE*> GlobalNames;
+
+  /// GlobalTypes - A map of globally visible types for this unit.
+  ///
+  StringMap<DIE*> GlobalTypes;
+
+  /// AccelNames - A map of names for the name accelerator table.
+  ///
+  StringMap<std::vector<DIE*> > AccelNames;
+  StringMap<std::vector<DIE*> > AccelObjC;
+  StringMap<std::vector<DIE*> > AccelNamespace;
+  StringMap<std::vector<std::pair<DIE*, unsigned> > > AccelTypes;
+
+  /// DIEBlocks - A list of all the DIEBlocks in use.
+  std::vector<DIEBlock *> DIEBlocks;
+
+  /// ContainingTypeMap - This map is used to keep track of subprogram DIEs that
+  /// need DW_AT_containing_type attribute. This attribute points to a DIE that
+  /// corresponds to the MDNode mapped with the subprogram DIE.
+  DenseMap<DIE *, const MDNode *> ContainingTypeMap;
+
+  /// Offset of the CUDie from beginning of debug info section.
+  unsigned DebugInfoOffset;
+
+  /// getLowerBoundDefault - Return the default lower bound for an array. If the
+  /// DWARF version doesn't handle the language, return -1.
+  int64_t getDefaultLowerBound() const;
+
+  /// getOrCreateContextDIE - Get context owner's DIE.
+  DIE *getOrCreateContextDIE(DIDescriptor Context);
+
+public:
+  CompileUnit(unsigned UID, unsigned L, DIE *D, AsmPrinter *A, DwarfDebug *DW,
+              DwarfUnits *);
+  ~CompileUnit();
+
+  // Accessors.
+  unsigned getUniqueID()            const { return UniqueID; }
+  unsigned getLanguage()            const { return Language; }
+  DIE* getCUDie()                   const { return CUDie.get(); }
+  unsigned getDebugInfoOffset()     const { return DebugInfoOffset; }
+  const StringMap<DIE*> &getGlobalNames() const { return GlobalNames; }
+  const StringMap<DIE*> &getGlobalTypes() const { return GlobalTypes; }
+
+  const StringMap<std::vector<DIE*> > &getAccelNames() const {
+    return AccelNames;
+  }
+  const StringMap<std::vector<DIE*> > &getAccelObjC() const {
+    return AccelObjC;
+  }
+  const StringMap<std::vector<DIE*> > &getAccelNamespace() const {
+    return AccelNamespace;
+  }
+  const StringMap<std::vector<std::pair<DIE*, unsigned > > >
+  &getAccelTypes() const {
+    return AccelTypes;
+  }
+
+  void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; }
+  /// hasContent - Return true if this compile unit has something to write out.
+  ///
+  bool hasContent() const { return !CUDie->getChildren().empty(); }
+
+  /// addGlobalName - Add a new global entity to the compile unit.
+  ///
+  void addGlobalName(StringRef Name, DIE *Die) { GlobalNames[Name] = Die; }
+
+  /// addGlobalType - Add a new global type to the compile unit.
+  ///
+  void addGlobalType(DIType Ty);
+
+
+  /// addAccelName - Add a new name to the name accelerator table.
+  void addAccelName(StringRef Name, DIE *Die) {
+    std::vector<DIE*> &DIEs = AccelNames[Name];
+    DIEs.push_back(Die);
+  }
+  void addAccelObjC(StringRef Name, DIE *Die) {
+    std::vector<DIE*> &DIEs = AccelObjC[Name];
+    DIEs.push_back(Die);
+  }
+  void addAccelNamespace(StringRef Name, DIE *Die) {
+    std::vector<DIE*> &DIEs = AccelNamespace[Name];
+    DIEs.push_back(Die);
+  }
+  void addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die) {
+    std::vector<std::pair<DIE*, unsigned > > &DIEs = AccelTypes[Name];
+    DIEs.push_back(Die);
+  }
+
+  /// getDIE - Returns the debug information entry map slot for the
+  /// specified debug variable.
+  DIE *getDIE(const MDNode *N) { return MDNodeToDieMap.lookup(N); }
+
+  DIEBlock *getDIEBlock() {
+    return new (DIEValueAllocator) DIEBlock();
+  }
+
+  /// insertDIE - Insert DIE into the map.
+  void insertDIE(const MDNode *N, DIE *D) {
+    MDNodeToDieMap.insert(std::make_pair(N, D));
+  }
+
+  /// getDIEEntry - Returns the debug information entry for the specified
+  /// debug variable.
+  DIEEntry *getDIEEntry(const MDNode *N) {
+    DenseMap<const MDNode *, DIEEntry *>::iterator I =
+      MDNodeToDIEEntryMap.find(N);
+    if (I == MDNodeToDIEEntryMap.end())
+      return NULL;
+    return I->second;
+  }
+
+  /// insertDIEEntry - Insert debug information entry into the map.
+  void insertDIEEntry(const MDNode *N, DIEEntry *E) {
+    MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
+  }
+
+  /// addDie - Adds or interns the DIE to the compile unit.
+  ///
+  void addDie(DIE *Buffer) {
+    this->CUDie->addChild(Buffer);
+  }
+
+  // getIndexTyDie - Get an anonymous type for index type.
+  DIE *getIndexTyDie() {
+    return IndexTyDie;
+  }
+
+  // setIndexTyDie - Set D as anonymous type for index which can be reused
+  // later.
+  void setIndexTyDie(DIE *D) {
+    IndexTyDie = D;
+  }
+
+  /// addFlag - Add a flag that is true to the DIE.
+  void addFlag(DIE *Die, unsigned Attribute);
+
+  /// addUInt - Add an unsigned integer attribute data and value.
+  ///
+  void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
+
+  /// addSInt - Add an signed integer attribute data and value.
+  ///
+  void addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
+
+  /// addString - Add a string attribute data and value.
+  ///
+  void addString(DIE *Die, unsigned Attribute, const StringRef Str);
+
+  /// addLocalString - Add a string attribute data and value.
+  ///
+  void addLocalString(DIE *Die, unsigned Attribute, const StringRef Str);
+
+  /// addLabel - Add a Dwarf label attribute data and value.
+  ///
+  void addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+                const MCSymbol *Label);
+
+  /// addLabelAddress - Add a dwarf label attribute data and value using
+  /// either DW_FORM_addr or DW_FORM_GNU_addr_index.
+  ///
+  void addLabelAddress(DIE *Die, unsigned Attribute, MCSymbol *Label);
+
+  /// addOpAddress - Add a dwarf op address data and value using the
+  /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
+  ///
+  void addOpAddress(DIE *Die, MCSymbol *Label);
+
+  /// addDelta - Add a label delta attribute data and value.
+  ///
+  void addDelta(DIE *Die, unsigned Attribute, unsigned Form,
+                const MCSymbol *Hi, const MCSymbol *Lo);
+
+  /// addDIEEntry - Add a DIE attribute data and value.
+  ///
+  void addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry);
+
+  /// addBlock - Add block data.
+  ///
+  void addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
+
+  /// addSourceLine - Add location information to specified debug information
+  /// entry.
+  void addSourceLine(DIE *Die, DIVariable V);
+  void addSourceLine(DIE *Die, DIGlobalVariable G);
+  void addSourceLine(DIE *Die, DISubprogram SP);
+  void addSourceLine(DIE *Die, DIType Ty);
+  void addSourceLine(DIE *Die, DINameSpace NS);
+  void addSourceLine(DIE *Die, DIObjCProperty Ty);
+
+  /// addAddress - Add an address attribute to a die based on the location
+  /// provided.
+  void addAddress(DIE *Die, unsigned Attribute,
+                  const MachineLocation &Location);
+
+  /// addConstantValue - Add constant value entry in variable DIE.
+  bool addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty);
+  bool addConstantValue(DIE *Die, const ConstantInt *CI, bool Unsigned);
+  bool addConstantValue(DIE *Die, const APInt &Val, bool Unsigned);
+
+  /// addConstantFPValue - Add constant value entry in variable DIE.
+  bool addConstantFPValue(DIE *Die, const MachineOperand &MO);
+  bool addConstantFPValue(DIE *Die, const ConstantFP *CFP);
+
+  /// addTemplateParams - Add template parameters in buffer.
+  void addTemplateParams(DIE &Buffer, DIArray TParams);
+
+  /// addRegisterOp - Add register operand.
+  void addRegisterOp(DIE *TheDie, unsigned Reg);
+
+  /// addRegisterOffset - Add register offset.
+  void addRegisterOffset(DIE *TheDie, unsigned Reg, int64_t Offset);
+
+  /// addComplexAddress - Start with the address based on the location provided,
+  /// and generate the DWARF information necessary to find the actual variable
+  /// (navigating the extra location information encoded in the type) based on
+  /// the starting location.  Add the DWARF information to the die.
+  ///
+  void addComplexAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
+                         const MachineLocation &Location);
+
+  // FIXME: Should be reformulated in terms of addComplexAddress.
+  /// addBlockByrefAddress - Start with the address based on the location
+  /// provided, and generate the DWARF information necessary to find the
+  /// actual Block variable (navigating the Block struct) based on the
+  /// starting location.  Add the DWARF information to the die.  Obsolete,
+  /// please use addComplexAddress instead.
+  ///
+  void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
+                            const MachineLocation &Location);
+
+  /// addVariableAddress - Add DW_AT_location attribute for a
+  /// DbgVariable based on provided MachineLocation.
+  void addVariableAddress(DbgVariable *&DV, DIE *Die, MachineLocation Location);
+
+  /// addToContextOwner - Add Die into the list of its context owner's children.
+  void addToContextOwner(DIE *Die, DIDescriptor Context);
+
+  /// addType - Add a new type attribute to the specified entity. This takes
+  /// and attribute parameter because DW_AT_friend attributes are also
+  /// type references.
+  void addType(DIE *Entity, DIType Ty, unsigned Attribute = dwarf::DW_AT_type);
+
+  /// getOrCreateNameSpace - Create a DIE for DINameSpace.
+  DIE *getOrCreateNameSpace(DINameSpace NS);
+
+  /// getOrCreateSubprogramDIE - Create new DIE using SP.
+  DIE *getOrCreateSubprogramDIE(DISubprogram SP);
+
+  /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
+  /// given DIType.
+  DIE *getOrCreateTypeDIE(const MDNode *N);
+
+  /// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
+  /// for the given DITemplateTypeParameter.
+  DIE *getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP);
+
+  /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create
+  /// new DIE for the given DITemplateValueParameter.
+  DIE *getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TVP);
+
+  /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+  /// information entry.
+  DIEEntry *createDIEEntry(DIE *Entry);
+
+  /// createGlobalVariableDIE - create global variable DIE.
+  void createGlobalVariableDIE(const MDNode *N);
+
+  void addPubTypes(DISubprogram SP);
+
+  /// constructTypeDIE - Construct basic type die from DIBasicType.
+  void constructTypeDIE(DIE &Buffer,
+                        DIBasicType BTy);
+
+  /// constructTypeDIE - Construct derived type die from DIDerivedType.
+  void constructTypeDIE(DIE &Buffer,
+                        DIDerivedType DTy);
+
+  /// constructTypeDIE - Construct type DIE from DICompositeType.
+  void constructTypeDIE(DIE &Buffer,
+                        DICompositeType CTy);
+
+  /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
+  void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
+
+  /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
+  void constructArrayTypeDIE(DIE &Buffer,
+                             DICompositeType *CTy);
+
+  /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
+  DIE *constructEnumTypeDIE(DIEnumerator ETy);
+
+  /// constructContainingTypeDIEs - Construct DIEs for types that contain
+  /// vtables.
+  void constructContainingTypeDIEs();
+
+  /// constructVariableDIE - Construct a DIE for the given DbgVariable.
+  DIE *constructVariableDIE(DbgVariable *DV, bool isScopeAbstract);
+
+  /// createMemberDIE - Create new member DIE.
+  DIE *createMemberDIE(DIDerivedType DT);
+
+  /// createStaticMemberDIE - Create new static data member DIE.
+  DIE *createStaticMemberDIE(DIDerivedType DT);
+
+private:
+
+  // DIEValueAllocator - All DIEValues are allocated through this allocator.
+  BumpPtrAllocator DIEValueAllocator;
+  DIEInteger *DIEIntegerOne;
+};
+
+} // end llvm namespace
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
new file mode 100644
index 000000000000..d3cb4f9c1c0e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -0,0 +1,2570 @@
+//===-- llvm/CodeGen/DwarfDebug.cpp - Dwarf Debug Framework ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dwarfdebug"
+#include "DwarfDebug.h"
+#include "DIE.h"
+#include "DwarfAccelTable.h"
+#include "DwarfCompileUnit.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static cl::opt<bool> DisableDebugInfoPrinting("disable-debug-info-print",
+                                              cl::Hidden,
+     cl::desc("Disable debug info printing"));
+
+static cl::opt<bool> UnknownLocations("use-unknown-locations", cl::Hidden,
+     cl::desc("Make an absence of debug location information explicit."),
+     cl::init(false));
+
+static cl::opt<bool> GenerateDwarfPubNamesSection("generate-dwarf-pubnames",
+     cl::Hidden, cl::init(false),
+     cl::desc("Generate DWARF pubnames section"));
+
+namespace {
+  enum DefaultOnOff {
+    Default, Enable, Disable
+  };
+}
+
+static cl::opt<DefaultOnOff> DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
+     cl::desc("Output prototype dwarf accelerator tables."),
+     cl::values(
+                clEnumVal(Default, "Default for platform"),
+                clEnumVal(Enable, "Enabled"),
+                clEnumVal(Disable, "Disabled"),
+                clEnumValEnd),
+     cl::init(Default));
+
+static cl::opt<DefaultOnOff> DarwinGDBCompat("darwin-gdb-compat", cl::Hidden,
+     cl::desc("Compatibility with Darwin gdb."),
+     cl::values(
+                clEnumVal(Default, "Default for platform"),
+                clEnumVal(Enable, "Enabled"),
+                clEnumVal(Disable, "Disabled"),
+                clEnumValEnd),
+     cl::init(Default));
+
+static cl::opt<DefaultOnOff> SplitDwarf("split-dwarf", cl::Hidden,
+     cl::desc("Output prototype dwarf split debug info."),
+     cl::values(
+                clEnumVal(Default, "Default for platform"),
+                clEnumVal(Enable, "Enabled"),
+                clEnumVal(Disable, "Disabled"),
+                clEnumValEnd),
+     cl::init(Default));
+
+namespace {
+  const char *DWARFGroupName = "DWARF Emission";
+  const char *DbgTimerName = "DWARF Debug Writer";
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+// Configuration values for initial hash set sizes (log2).
+//
+static const unsigned InitAbbreviationsSetSize = 9; // log2(512)
+
+namespace llvm {
+
+DIType DbgVariable::getType() const {
+  DIType Ty = Var.getType();
+  // FIXME: isBlockByrefVariable should be reformulated in terms of complex
+  // addresses instead.
+  if (Var.isBlockByrefVariable()) {
+    /* Byref variables, in Blocks, are declared by the programmer as
+       "SomeType VarName;", but the compiler creates a
+       __Block_byref_x_VarName struct, and gives the variable VarName
+       either the struct, or a pointer to the struct, as its type.  This
+       is necessary for various behind-the-scenes things the compiler
+       needs to do with by-reference variables in blocks.
+
+       However, as far as the original *programmer* is concerned, the
+       variable should still have type 'SomeType', as originally declared.
+
+       The following function dives into the __Block_byref_x_VarName
+       struct to find the original type of the variable.  This will be
+       passed back to the code generating the type for the Debug
+       Information Entry for the variable 'VarName'.  'VarName' will then
+       have the original type 'SomeType' in its debug information.
+
+       The original type 'SomeType' will be the type of the field named
+       'VarName' inside the __Block_byref_x_VarName struct.
+
+       NOTE: In order for this to not completely fail on the debugger
+       side, the Debug Information Entry for the variable VarName needs to
+       have a DW_AT_location that tells the debugger how to unwind through
+       the pointers and __Block_byref_x_VarName struct to find the actual
+       value of the variable.  The function addBlockByrefType does this.  */
+    DIType subType = Ty;
+    unsigned tag = Ty.getTag();
+
+    if (tag == dwarf::DW_TAG_pointer_type) {
+      DIDerivedType DTy = DIDerivedType(Ty);
+      subType = DTy.getTypeDerivedFrom();
+    }
+
+    DICompositeType blockStruct = DICompositeType(subType);
+    DIArray Elements = blockStruct.getTypeArray();
+
+    for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+      DIDescriptor Element = Elements.getElement(i);
+      DIDerivedType DT = DIDerivedType(Element);
+      if (getName() == DT.getName())
+        return (DT.getTypeDerivedFrom());
+    }
+  }
+  return Ty;
+}
+
+} // end llvm namespace
+
+DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
+  : Asm(A), MMI(Asm->MMI), FirstCU(0),
+    AbbreviationsSet(InitAbbreviationsSetSize),
+    SourceIdMap(DIEValueAllocator),
+    PrevLabel(NULL), GlobalCUIndexCount(0),
+    InfoHolder(A, &AbbreviationsSet, &Abbreviations, "info_string",
+               DIEValueAllocator),
+    SkeletonAbbrevSet(InitAbbreviationsSetSize),
+    SkeletonHolder(A, &SkeletonAbbrevSet, &SkeletonAbbrevs, "skel_string",
+                   DIEValueAllocator) {
+
+  DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
+  DwarfStrSectionSym = TextSectionSym = 0;
+  DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = DwarfLineSectionSym = 0;
+  DwarfAbbrevDWOSectionSym = DwarfStrDWOSectionSym = 0;
+  FunctionBeginSym = FunctionEndSym = 0;
+
+  // Turn on accelerator tables and older gdb compatibility
+  // for Darwin.
+  bool IsDarwin = Triple(M->getTargetTriple()).isOSDarwin();
+  if (DarwinGDBCompat == Default) {
+    if (IsDarwin)
+      IsDarwinGDBCompat = true;
+    else
+      IsDarwinGDBCompat = false;
+  } else
+    IsDarwinGDBCompat = DarwinGDBCompat == Enable ? true : false;
+
+  if (DwarfAccelTables == Default) {
+    if (IsDarwin)
+      HasDwarfAccelTables = true;
+    else
+      HasDwarfAccelTables = false;
+  } else
+    HasDwarfAccelTables = DwarfAccelTables == Enable ? true : false;
+
+  if (SplitDwarf == Default)
+    HasSplitDwarf = false;
+  else
+    HasSplitDwarf = SplitDwarf == Enable ? true : false;
+
+  {
+    NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
+    beginModule();
+  }
+}
+DwarfDebug::~DwarfDebug() {
+}
+
+// Switch to the specified MCSection and emit an assembler
+// temporary label to it if SymbolStem is specified.
+static MCSymbol *emitSectionSym(AsmPrinter *Asm, const MCSection *Section,
+                                const char *SymbolStem = 0) {
+  Asm->OutStreamer.SwitchSection(Section);
+  if (!SymbolStem) return 0;
+
+  MCSymbol *TmpSym = Asm->GetTempSymbol(SymbolStem);
+  Asm->OutStreamer.EmitLabel(TmpSym);
+  return TmpSym;
+}
+
+MCSymbol *DwarfUnits::getStringPoolSym() {
+  return Asm->GetTempSymbol(StringPref);
+}
+
+MCSymbol *DwarfUnits::getStringPoolEntry(StringRef Str) {
+  std::pair<MCSymbol*, unsigned> &Entry =
+    StringPool.GetOrCreateValue(Str).getValue();
+  if (Entry.first) return Entry.first;
+
+  Entry.second = NextStringPoolNumber++;
+  return Entry.first = Asm->GetTempSymbol(StringPref, Entry.second);
+}
+
+unsigned DwarfUnits::getStringPoolIndex(StringRef Str) {
+  std::pair<MCSymbol*, unsigned> &Entry =
+    StringPool.GetOrCreateValue(Str).getValue();
+  if (Entry.first) return Entry.second;
+
+  Entry.second = NextStringPoolNumber++;
+  Entry.first = Asm->GetTempSymbol(StringPref, Entry.second);
+  return Entry.second;
+}
+
+unsigned DwarfUnits::getAddrPoolIndex(MCSymbol *Sym) {
+  std::pair<MCSymbol*, unsigned> &Entry = AddressPool[Sym];
+  if (Entry.first) return Entry.second;
+
+  Entry.second = NextAddrPoolNumber++;
+  Entry.first = Sym;
+  return Entry.second;
+}
+
+// Define a unique number for the abbreviation.
+//
+void DwarfUnits::assignAbbrevNumber(DIEAbbrev &Abbrev) {
+  // Profile the node so that we can make it unique.
+  FoldingSetNodeID ID;
+  Abbrev.Profile(ID);
+
+  // Check the set for priors.
+  DIEAbbrev *InSet = AbbreviationsSet->GetOrInsertNode(&Abbrev);
+
+  // If it's newly added.
+  if (InSet == &Abbrev) {
+    // Add to abbreviation list.
+    Abbreviations->push_back(&Abbrev);
+
+    // Assign the vector position + 1 as its number.
+    Abbrev.setNumber(Abbreviations->size());
+  } else {
+    // Assign existing abbreviation number.
+    Abbrev.setNumber(InSet->getNumber());
+  }
+}
+
+// If special LLVM prefix that is used to inform the asm
+// printer to not emit usual symbol prefix before the symbol name is used then
+// return linkage name after skipping this special LLVM prefix.
+static StringRef getRealLinkageName(StringRef LinkageName) {
+  char One = '\1';
+  if (LinkageName.startswith(StringRef(&One, 1)))
+    return LinkageName.substr(1);
+  return LinkageName;
+}
+
+static bool isObjCClass(StringRef Name) {
+  return Name.startswith("+") || Name.startswith("-");
+}
+
+static bool hasObjCCategory(StringRef Name) {
+  if (!isObjCClass(Name)) return false;
+
+  size_t pos = Name.find(')');
+  if (pos != std::string::npos) {
+    if (Name[pos+1] != ' ') return false;
+    return true;
+  }
+  return false;
+}
+
+static void getObjCClassCategory(StringRef In, StringRef &Class,
+                                 StringRef &Category) {
+  if (!hasObjCCategory(In)) {
+    Class = In.slice(In.find('[') + 1, In.find(' '));
+    Category = "";
+    return;
+  }
+
+  Class = In.slice(In.find('[') + 1, In.find('('));
+  Category = In.slice(In.find('[') + 1, In.find(' '));
+  return;
+}
+
+static StringRef getObjCMethodName(StringRef In) {
+  return In.slice(In.find(' ') + 1, In.find(']'));
+}
+
+// Add the various names to the Dwarf accelerator table names.
+static void addSubprogramNames(CompileUnit *TheCU, DISubprogram SP,
+                               DIE* Die) {
+  if (!SP.isDefinition()) return;
+
+  TheCU->addAccelName(SP.getName(), Die);
+
+  // If the linkage name is different than the name, go ahead and output
+  // that as well into the name table.
+  if (SP.getLinkageName() != "" && SP.getName() != SP.getLinkageName())
+    TheCU->addAccelName(SP.getLinkageName(), Die);
+
+  // If this is an Objective-C selector name add it to the ObjC accelerator
+  // too.
+  if (isObjCClass(SP.getName())) {
+    StringRef Class, Category;
+    getObjCClassCategory(SP.getName(), Class, Category);
+    TheCU->addAccelObjC(Class, Die);
+    if (Category != "")
+      TheCU->addAccelObjC(Category, Die);
+    // Also add the base method name to the name table.
+    TheCU->addAccelName(getObjCMethodName(SP.getName()), Die);
+  }
+}
+
+// Find DIE for the given subprogram and attach appropriate DW_AT_low_pc
+// and DW_AT_high_pc attributes. If there are global variables in this
+// scope then create and insert DIEs for these variables.
+DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
+                                          const MDNode *SPNode) {
+  DIE *SPDie = SPCU->getDIE(SPNode);
+
+  assert(SPDie && "Unable to find subprogram DIE!");
+  DISubprogram SP(SPNode);
+
+  // If we're updating an abstract DIE, then we will be adding the children and
+  // object pointer later on. But what we don't want to do is process the
+  // concrete DIE twice.
+  DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode);
+  if (AbsSPDIE) {
+    bool InSameCU = (AbsSPDIE->getCompileUnit() == SPCU->getCUDie());
+    // Pick up abstract subprogram DIE.
+    SPDie = new DIE(dwarf::DW_TAG_subprogram);
+    // If AbsSPDIE belongs to a different CU, use DW_FORM_ref_addr instead of
+    // DW_FORM_ref4.
+    SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin,
+                      InSameCU ? dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr,
+                      AbsSPDIE);
+    SPCU->addDie(SPDie);
+  } else {
+    DISubprogram SPDecl = SP.getFunctionDeclaration();
+    if (!SPDecl.isSubprogram()) {
+      // There is not any need to generate specification DIE for a function
+      // defined at compile unit level. If a function is defined inside another
+      // function then gdb prefers the definition at top level and but does not
+      // expect specification DIE in parent function. So avoid creating
+      // specification DIE for a function defined inside a function.
+      if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
+          !SP.getContext().isFile() &&
+          !isSubprogramContext(SP.getContext())) {
+        SPCU->addFlag(SPDie, dwarf::DW_AT_declaration);
+
+        // Add arguments.
+        DICompositeType SPTy = SP.getType();
+        DIArray Args = SPTy.getTypeArray();
+        unsigned SPTag = SPTy.getTag();
+        if (SPTag == dwarf::DW_TAG_subroutine_type)
+          for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
+            DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+            DIType ATy = DIType(Args.getElement(i));
+            SPCU->addType(Arg, ATy);
+            if (ATy.isArtificial())
+              SPCU->addFlag(Arg, dwarf::DW_AT_artificial);
+            if (ATy.isObjectPointer())
+              SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer,
+                                dwarf::DW_FORM_ref4, Arg);
+            SPDie->addChild(Arg);
+          }
+        DIE *SPDeclDie = SPDie;
+        SPDie = new DIE(dwarf::DW_TAG_subprogram);
+        SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification,
+                          dwarf::DW_FORM_ref4, SPDeclDie);
+        SPCU->addDie(SPDie);
+      }
+    }
+  }
+
+  SPCU->addLabelAddress(SPDie, dwarf::DW_AT_low_pc,
+                        Asm->GetTempSymbol("func_begin",
+                                           Asm->getFunctionNumber()));
+  SPCU->addLabelAddress(SPDie, dwarf::DW_AT_high_pc,
+                        Asm->GetTempSymbol("func_end",
+                                           Asm->getFunctionNumber()));
+  const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
+  MachineLocation Location(RI->getFrameRegister(*Asm->MF));
+  SPCU->addAddress(SPDie, dwarf::DW_AT_frame_base, Location);
+
+  // Add name to the name table, we do this here because we're guaranteed
+  // to have concrete versions of our DW_TAG_subprogram nodes.
+  addSubprogramNames(SPCU, SP, SPDie);
+
+  return SPDie;
+}
+
+// Construct new DW_TAG_lexical_block for this scope and attach
+// DW_AT_low_pc/DW_AT_high_pc labels.
+DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
+                                          LexicalScope *Scope) {
+  DIE *ScopeDIE = new DIE(dwarf::DW_TAG_lexical_block);
+  if (Scope->isAbstractScope())
+    return ScopeDIE;
+
+  const SmallVector<InsnRange, 4> &Ranges = Scope->getRanges();
+  if (Ranges.empty())
+    return 0;
+
+  SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin();
+  if (Ranges.size() > 1) {
+    // .debug_range section has not been laid out yet. Emit offset in
+    // .debug_range as a uint, size 4, for now. emitDIE will handle
+    // DW_AT_ranges appropriately.
+    TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
+                   DebugRangeSymbols.size()
+                   * Asm->getDataLayout().getPointerSize());
+    for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
+         RE = Ranges.end(); RI != RE; ++RI) {
+      DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first));
+      DebugRangeSymbols.push_back(getLabelAfterInsn(RI->second));
+    }
+    DebugRangeSymbols.push_back(NULL);
+    DebugRangeSymbols.push_back(NULL);
+    return ScopeDIE;
+  }
+
+  MCSymbol *Start = getLabelBeforeInsn(RI->first);
+  MCSymbol *End = getLabelAfterInsn(RI->second);
+
+  if (End == 0) return 0;
+
+  assert(Start->isDefined() && "Invalid starting label for an inlined scope!");
+  assert(End->isDefined() && "Invalid end label for an inlined scope!");
+
+  TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_low_pc, Start);
+  TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_high_pc, End);
+
+  return ScopeDIE;
+}
+
+// This scope represents inlined body of a function. Construct DIE to
+// represent this concrete inlined copy of the function.
+DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
+                                          LexicalScope *Scope) {
+  const SmallVector<InsnRange, 4> &Ranges = Scope->getRanges();
+  assert(Ranges.empty() == false &&
+         "LexicalScope does not have instruction markers!");
+
+  if (!Scope->getScopeNode())
+    return NULL;
+  DIScope DS(Scope->getScopeNode());
+  DISubprogram InlinedSP = getDISubprogram(DS);
+  DIE *OriginDIE = TheCU->getDIE(InlinedSP);
+  if (!OriginDIE) {
+    DEBUG(dbgs() << "Unable to find original DIE for an inlined subprogram.");
+    return NULL;
+  }
+
+  SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin();
+  MCSymbol *StartLabel = getLabelBeforeInsn(RI->first);
+  MCSymbol *EndLabel = getLabelAfterInsn(RI->second);
+
+  if (StartLabel == 0 || EndLabel == 0) {
+    llvm_unreachable("Unexpected Start and End labels for an inlined scope!");
+  }
+  assert(StartLabel->isDefined() &&
+         "Invalid starting label for an inlined scope!");
+  assert(EndLabel->isDefined() &&
+         "Invalid end label for an inlined scope!");
+
+  DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine);
+  TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin,
+                     dwarf::DW_FORM_ref4, OriginDIE);
+
+  if (Ranges.size() > 1) {
+    // .debug_range section has not been laid out yet. Emit offset in
+    // .debug_range as a uint, size 4, for now. emitDIE will handle
+    // DW_AT_ranges appropriately.
+    TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
+                   DebugRangeSymbols.size()
+                   * Asm->getDataLayout().getPointerSize());
+    for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
+         RE = Ranges.end(); RI != RE; ++RI) {
+      DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first));
+      DebugRangeSymbols.push_back(getLabelAfterInsn(RI->second));
+    }
+    DebugRangeSymbols.push_back(NULL);
+    DebugRangeSymbols.push_back(NULL);
+  } else {
+    TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_low_pc, StartLabel);
+    TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_high_pc, EndLabel);
+  }
+
+  InlinedSubprogramDIEs.insert(OriginDIE);
+
+  // Track the start label for this inlined function.
+  //.debug_inlined section specification does not clearly state how
+  // to emit inlined scope that is split into multiple instruction ranges.
+  // For now, use first instruction range and emit low_pc/high_pc pair and
+  // corresponding .debug_inlined section entry for this pair.
+  DenseMap<const MDNode *, SmallVector<InlineInfoLabels, 4> >::iterator
+    I = InlineInfo.find(InlinedSP);
+
+  if (I == InlineInfo.end()) {
+    InlineInfo[InlinedSP].push_back(std::make_pair(StartLabel, ScopeDIE));
+    InlinedSPNodes.push_back(InlinedSP);
+  } else
+    I->second.push_back(std::make_pair(StartLabel, ScopeDIE));
+
+  DILocation DL(Scope->getInlinedAt());
+  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0,
+                 getOrCreateSourceID(DL.getFilename(), DL.getDirectory(),
+                                     TheCU->getUniqueID()));
+  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, 0, DL.getLineNumber());
+
+  // Add name to the name table, we do this here because we're guaranteed
+  // to have concrete versions of our DW_TAG_inlined_subprogram nodes.
+  addSubprogramNames(TheCU, InlinedSP, ScopeDIE);
+
+  return ScopeDIE;
+}
+
+// Construct a DIE for this scope.
+DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
+  if (!Scope || !Scope->getScopeNode())
+    return NULL;
+
+  DIScope DS(Scope->getScopeNode());
+  // Early return to avoid creating dangling variable|scope DIEs.
+  if (!Scope->getInlinedAt() && DS.isSubprogram() && Scope->isAbstractScope() &&
+      !TheCU->getDIE(DS))
+    return NULL;
+
+  SmallVector<DIE *, 8> Children;
+  DIE *ObjectPointer = NULL;
+
+  // Collect arguments for current function.
+  if (LScopes.isCurrentFunctionScope(Scope))
+    for (unsigned i = 0, N = CurrentFnArguments.size(); i < N; ++i)
+      if (DbgVariable *ArgDV = CurrentFnArguments[i])
+        if (DIE *Arg =
+            TheCU->constructVariableDIE(ArgDV, Scope->isAbstractScope())) {
+          Children.push_back(Arg);
+          if (ArgDV->isObjectPointer()) ObjectPointer = Arg;
+        }
+
+  // Collect lexical scope children first.
+  const SmallVector<DbgVariable *, 8> &Variables = ScopeVariables.lookup(Scope);
+  for (unsigned i = 0, N = Variables.size(); i < N; ++i)
+    if (DIE *Variable =
+        TheCU->constructVariableDIE(Variables[i], Scope->isAbstractScope())) {
+      Children.push_back(Variable);
+      if (Variables[i]->isObjectPointer()) ObjectPointer = Variable;
+    }
+  const SmallVector<LexicalScope *, 4> &Scopes = Scope->getChildren();
+  for (unsigned j = 0, M = Scopes.size(); j < M; ++j)
+    if (DIE *Nested = constructScopeDIE(TheCU, Scopes[j]))
+      Children.push_back(Nested);
+  DIE *ScopeDIE = NULL;
+  if (Scope->getInlinedAt())
+    ScopeDIE = constructInlinedScopeDIE(TheCU, Scope);
+  else if (DS.isSubprogram()) {
+    ProcessedSPNodes.insert(DS);
+    if (Scope->isAbstractScope()) {
+      ScopeDIE = TheCU->getDIE(DS);
+      // Note down abstract DIE.
+      if (ScopeDIE)
+        AbstractSPDies.insert(std::make_pair(DS, ScopeDIE));
+    }
+    else
+      ScopeDIE = updateSubprogramScopeDIE(TheCU, DS);
+  }
+  else {
+    // There is no need to emit empty lexical block DIE.
+    if (Children.empty())
+      return NULL;
+    ScopeDIE = constructLexicalScopeDIE(TheCU, Scope);
+  }
+
+  if (!ScopeDIE) return NULL;
+
+  // Add children
+  for (SmallVector<DIE *, 8>::iterator I = Children.begin(),
+         E = Children.end(); I != E; ++I)
+    ScopeDIE->addChild(*I);
+
+  if (DS.isSubprogram() && ObjectPointer != NULL)
+    TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer,
+                       dwarf::DW_FORM_ref4, ObjectPointer);
+
+  if (DS.isSubprogram())
+    TheCU->addPubTypes(DISubprogram(DS));
+
+  return ScopeDIE;
+}
+
+// Look up the source id with the given directory and source file names.
+// If none currently exists, create a new id and insert it in the
+// SourceIds map. This can update DirectoryNames and SourceFileNames maps
+// as well.
+unsigned DwarfDebug::getOrCreateSourceID(StringRef FileName,
+                                         StringRef DirName, unsigned CUID) {
+  // If we use .loc in assembly, we can't separate .file entries according to
+  // compile units. Thus all files will belong to the default compile unit.
+  if (Asm->TM.hasMCUseLoc() &&
+      Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer)
+    CUID = 0;
+
+  // If FE did not provide a file name, then assume stdin.
+  if (FileName.empty())
+    return getOrCreateSourceID("<stdin>", StringRef(), CUID);
+
+  // TODO: this might not belong here. See if we can factor this better.
+  if (DirName == CompilationDir)
+    DirName = "";
+
+  // FileIDCUMap stores the current ID for the given compile unit.
+  unsigned SrcId = FileIDCUMap[CUID] + 1;
+
+  // We look up the CUID/file/dir by concatenating them with a zero byte.
+  SmallString<128> NamePair;
+  NamePair += CUID;
+  NamePair += '\0';
+  NamePair += DirName;
+  NamePair += '\0'; // Zero bytes are not allowed in paths.
+  NamePair += FileName;
+
+  StringMapEntry<unsigned> &Ent = SourceIdMap.GetOrCreateValue(NamePair, SrcId);
+  if (Ent.getValue() != SrcId)
+    return Ent.getValue();
+
+  FileIDCUMap[CUID] = SrcId;
+  // Print out a .file directive to specify files for .loc directives.
+  Asm->OutStreamer.EmitDwarfFileDirective(SrcId, DirName, FileName, CUID);
+
+  return SrcId;
+}
+
+// Create new CompileUnit for the given metadata node with tag
+// DW_TAG_compile_unit.
+CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
+  DICompileUnit DIUnit(N);
+  StringRef FN = DIUnit.getFilename();
+  CompilationDir = DIUnit.getDirectory();
+
+  DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
+  CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++,
+                                       DIUnit.getLanguage(), Die, Asm,
+                                       this, &InfoHolder);
+
+  FileIDCUMap[NewCU->getUniqueID()] = 0;
+  // Call this to emit a .file directive if it wasn't emitted for the source
+  // file this CU comes from yet.
+  getOrCreateSourceID(FN, CompilationDir, NewCU->getUniqueID());
+
+  NewCU->addString(Die, dwarf::DW_AT_producer, DIUnit.getProducer());
+  NewCU->addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
+                 DIUnit.getLanguage());
+  NewCU->addString(Die, dwarf::DW_AT_name, FN);
+  // 2.17.1 requires that we use DW_AT_low_pc for a single entry point
+  // into an entity. We're using 0 (or a NULL label) for this.
+  NewCU->addLabelAddress(Die, dwarf::DW_AT_low_pc, NULL);
+
+  // Define start line table label for each Compile Unit.
+  MCSymbol *LineTableStartSym = Asm->GetTempSymbol("line_table_start",
+                                                   NewCU->getUniqueID());
+  Asm->OutStreamer.getContext().setMCLineTableSymbol(LineTableStartSym,
+                                                     NewCU->getUniqueID());
+
+  // DW_AT_stmt_list is a offset of line number information for this
+  // compile unit in debug_line section.
+  // The line table entries are not always emitted in assembly, so it
+  // is not okay to use line_table_start here.
+  if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+    NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
+                    NewCU->getUniqueID() == 0 ?
+                    Asm->GetTempSymbol("section_line") : LineTableStartSym);
+  else if (NewCU->getUniqueID() == 0)
+    NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
+  else
+    NewCU->addDelta(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
+                    LineTableStartSym, DwarfLineSectionSym);
+
+  if (!CompilationDir.empty())
+    NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+  if (DIUnit.isOptimized())
+    NewCU->addFlag(Die, dwarf::DW_AT_APPLE_optimized);
+
+  StringRef Flags = DIUnit.getFlags();
+  if (!Flags.empty())
+    NewCU->addString(Die, dwarf::DW_AT_APPLE_flags, Flags);
+
+  if (unsigned RVer = DIUnit.getRunTimeVersion())
+    NewCU->addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers,
+            dwarf::DW_FORM_data1, RVer);
+
+  if (!FirstCU)
+    FirstCU = NewCU;
+
+  InfoHolder.addUnit(NewCU);
+
+  CUMap.insert(std::make_pair(N, NewCU));
+  return NewCU;
+}
+
+// Construct subprogram DIE.
+void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU,
+                                        const MDNode *N) {
+  CompileUnit *&CURef = SPMap[N];
+  if (CURef)
+    return;
+  CURef = TheCU;
+
+  DISubprogram SP(N);
+  if (!SP.isDefinition())
+    // This is a method declaration which will be handled while constructing
+    // class type.
+    return;
+
+  DIE *SubprogramDie = TheCU->getOrCreateSubprogramDIE(SP);
+
+  // Add to map.
+  TheCU->insertDIE(N, SubprogramDie);
+
+  // Add to context owner.
+  TheCU->addToContextOwner(SubprogramDie, SP.getContext());
+
+  // Expose as global, if requested.
+  if (GenerateDwarfPubNamesSection)
+    TheCU->addGlobalName(SP.getName(), SubprogramDie);
+}
+
+// Emit all Dwarf sections that should come prior to the content. Create
+// global DIEs and emit initial debug info sections. This is invoked by
+// the target AsmPrinter.
+void DwarfDebug::beginModule() {
+  if (DisableDebugInfoPrinting)
+    return;
+
+  const Module *M = MMI->getModule();
+
+  // If module has named metadata anchors then use them, otherwise scan the
+  // module using debug info finder to collect debug info.
+  NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+  if (!CU_Nodes)
+    return;
+
+  // Emit initial sections so we can reference labels later.
+  emitSectionLabels();
+
+  for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+    DICompileUnit CUNode(CU_Nodes->getOperand(i));
+    CompileUnit *CU = constructCompileUnit(CUNode);
+    DIArray GVs = CUNode.getGlobalVariables();
+    for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i)
+      CU->createGlobalVariableDIE(GVs.getElement(i));
+    DIArray SPs = CUNode.getSubprograms();
+    for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i)
+      constructSubprogramDIE(CU, SPs.getElement(i));
+    DIArray EnumTypes = CUNode.getEnumTypes();
+    for (unsigned i = 0, e = EnumTypes.getNumElements(); i != e; ++i)
+      CU->getOrCreateTypeDIE(EnumTypes.getElement(i));
+    DIArray RetainedTypes = CUNode.getRetainedTypes();
+    for (unsigned i = 0, e = RetainedTypes.getNumElements(); i != e; ++i)
+      CU->getOrCreateTypeDIE(RetainedTypes.getElement(i));
+    // If we're splitting the dwarf out now that we've got the entire
+    // CU then construct a skeleton CU based upon it.
+    if (useSplitDwarf()) {
+    // This should be a unique identifier when we want to build .dwp files.
+      CU->addUInt(CU->getCUDie(), dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, 0);
+      // Now construct the skeleton CU associated.
+      constructSkeletonCU(CUNode);
+    }
+  }
+
+  // Tell MMI that we have debug info.
+  MMI->setDebugInfoAvailability(true);
+
+  // Prime section data.
+  SectionMap.insert(Asm->getObjFileLowering().getTextSection());
+}
+
+// Attach DW_AT_inline attribute with inlined subprogram DIEs.
+void DwarfDebug::computeInlinedDIEs() {
+  // Attach DW_AT_inline attribute with inlined subprogram DIEs.
+  for (SmallPtrSet<DIE *, 4>::iterator AI = InlinedSubprogramDIEs.begin(),
+         AE = InlinedSubprogramDIEs.end(); AI != AE; ++AI) {
+    DIE *ISP = *AI;
+    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+  }
+  for (DenseMap<const MDNode *, DIE *>::iterator AI = AbstractSPDies.begin(),
+         AE = AbstractSPDies.end(); AI != AE; ++AI) {
+    DIE *ISP = AI->second;
+    if (InlinedSubprogramDIEs.count(ISP))
+      continue;
+    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+  }
+}
+
+// Collect info for variables that were optimized out.
+void DwarfDebug::collectDeadVariables() {
+  const Module *M = MMI->getModule();
+  DenseMap<const MDNode *, LexicalScope *> DeadFnScopeMap;
+
+  if (NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu")) {
+    for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+      DICompileUnit TheCU(CU_Nodes->getOperand(i));
+      DIArray Subprograms = TheCU.getSubprograms();
+      for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) {
+        DISubprogram SP(Subprograms.getElement(i));
+        if (ProcessedSPNodes.count(SP) != 0) continue;
+        if (!SP.Verify()) continue;
+        if (!SP.isDefinition()) continue;
+        DIArray Variables = SP.getVariables();
+        if (Variables.getNumElements() == 0) continue;
+
+        LexicalScope *Scope =
+          new LexicalScope(NULL, DIDescriptor(SP), NULL, false);
+        DeadFnScopeMap[SP] = Scope;
+
+        // Construct subprogram DIE and add variables DIEs.
+        CompileUnit *SPCU = CUMap.lookup(TheCU);
+        assert(SPCU && "Unable to find Compile Unit!");
+        constructSubprogramDIE(SPCU, SP);
+        DIE *ScopeDIE = SPCU->getDIE(SP);
+        for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
+          DIVariable DV(Variables.getElement(vi));
+          if (!DV.Verify()) continue;
+          DbgVariable *NewVar = new DbgVariable(DV, NULL);
+          if (DIE *VariableDIE =
+              SPCU->constructVariableDIE(NewVar, Scope->isAbstractScope()))
+            ScopeDIE->addChild(VariableDIE);
+        }
+      }
+    }
+  }
+  DeleteContainerSeconds(DeadFnScopeMap);
+}
+
+void DwarfDebug::finalizeModuleInfo() {
+  // Collect info for variables that were optimized out.
+  collectDeadVariables();
+
+  // Attach DW_AT_inline attribute with inlined subprogram DIEs.
+  computeInlinedDIEs();
+
+  // Emit DW_AT_containing_type attribute to connect types with their
+  // vtable holding type.
+  for (DenseMap<const MDNode *, CompileUnit *>::iterator CUI = CUMap.begin(),
+         CUE = CUMap.end(); CUI != CUE; ++CUI) {
+    CompileUnit *TheCU = CUI->second;
+    TheCU->constructContainingTypeDIEs();
+  }
+
+   // Compute DIE offsets and sizes.
+  InfoHolder.computeSizeAndOffsets();
+  if (useSplitDwarf())
+    SkeletonHolder.computeSizeAndOffsets();
+}
+
+void DwarfDebug::endSections() {
+  // Standard sections final addresses.
+  Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getTextSection());
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("text_end"));
+  Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getDataSection());
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("data_end"));
+
+  // End text sections.
+  for (unsigned I = 0, E = SectionMap.size(); I != E; ++I) {
+    Asm->OutStreamer.SwitchSection(SectionMap[I]);
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_end", I+1));
+  }
+}
+
+// Emit all Dwarf sections that should come after the content.
+void DwarfDebug::endModule() {
+
+  if (!FirstCU) return;
+
+  // End any existing sections.
+  // TODO: Does this need to happen?
+  endSections();
+
+  // Finalize the debug info for the module.
+  finalizeModuleInfo();
+
+  if (!useSplitDwarf()) {
+    // Emit all the DIEs into a debug info section.
+    emitDebugInfo();
+
+    // Corresponding abbreviations into a abbrev section.
+    emitAbbreviations();
+
+    // Emit info into a debug loc section.
+    emitDebugLoc();
+
+    // Emit info into a debug aranges section.
+    emitDebugARanges();
+
+    // Emit info into a debug ranges section.
+    emitDebugRanges();
+
+    // Emit info into a debug macinfo section.
+    emitDebugMacInfo();
+
+    // Emit inline info.
+    // TODO: When we don't need the option anymore we
+    // can remove all of the code that this section
+    // depends upon.
+    if (useDarwinGDBCompat())
+      emitDebugInlineInfo();
+  } else {
+    // TODO: Fill this in for separated debug sections and separate
+    // out information into new sections.
+
+    // Emit the debug info section and compile units.
+    emitDebugInfo();
+    emitDebugInfoDWO();
+
+    // Corresponding abbreviations into a abbrev section.
+    emitAbbreviations();
+    emitDebugAbbrevDWO();
+
+    // Emit info into a debug loc section.
+    emitDebugLoc();
+
+    // Emit info into a debug aranges section.
+    emitDebugARanges();
+
+    // Emit info into a debug ranges section.
+    emitDebugRanges();
+
+    // Emit info into a debug macinfo section.
+    emitDebugMacInfo();
+
+    // Emit DWO addresses.
+    InfoHolder.emitAddresses(Asm->getObjFileLowering().getDwarfAddrSection());
+
+    // Emit inline info.
+    // TODO: When we don't need the option anymore we
+    // can remove all of the code that this section
+    // depends upon.
+    if (useDarwinGDBCompat())
+      emitDebugInlineInfo();
+  }
+
+  // Emit info into the dwarf accelerator table sections.
+  if (useDwarfAccelTables()) {
+    emitAccelNames();
+    emitAccelObjC();
+    emitAccelNamespaces();
+    emitAccelTypes();
+  }
+
+  // Emit info into a debug pubnames section, if requested.
+  if (GenerateDwarfPubNamesSection)
+    emitDebugPubnames();
+
+  // Emit info into a debug pubtypes section.
+  // TODO: When we don't need the option anymore we can
+  // remove all of the code that adds to the table.
+  if (useDarwinGDBCompat())
+    emitDebugPubTypes();
+
+  // Finally emit string information into a string table.
+  emitDebugStr();
+  if (useSplitDwarf())
+    emitDebugStrDWO();
+
+  // clean up.
+  SPMap.clear();
+  for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
+         E = CUMap.end(); I != E; ++I)
+    delete I->second;
+
+  for (SmallVector<CompileUnit *, 1>::iterator I = SkeletonCUs.begin(),
+         E = SkeletonCUs.end(); I != E; ++I)
+    delete *I;
+
+  // Reset these for the next Module if we have one.
+  FirstCU = NULL;
+}
+
+// Find abstract variable, if any, associated with Var.
+DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV,
+                                              DebugLoc ScopeLoc) {
+  LLVMContext &Ctx = DV->getContext();
+  // More then one inlined variable corresponds to one abstract variable.
+  DIVariable Var = cleanseInlinedVariable(DV, Ctx);
+  DbgVariable *AbsDbgVariable = AbstractVariables.lookup(Var);
+  if (AbsDbgVariable)
+    return AbsDbgVariable;
+
+  LexicalScope *Scope = LScopes.findAbstractScope(ScopeLoc.getScope(Ctx));
+  if (!Scope)
+    return NULL;
+
+  AbsDbgVariable = new DbgVariable(Var, NULL);
+  addScopeVariable(Scope, AbsDbgVariable);
+  AbstractVariables[Var] = AbsDbgVariable;
+  return AbsDbgVariable;
+}
+
+// If Var is a current function argument then add it to CurrentFnArguments list.
+bool DwarfDebug::addCurrentFnArgument(const MachineFunction *MF,
+                                      DbgVariable *Var, LexicalScope *Scope) {
+  if (!LScopes.isCurrentFunctionScope(Scope))
+    return false;
+  DIVariable DV = Var->getVariable();
+  if (DV.getTag() != dwarf::DW_TAG_arg_variable)
+    return false;
+  unsigned ArgNo = DV.getArgNumber();
+  if (ArgNo == 0)
+    return false;
+
+  size_t Size = CurrentFnArguments.size();
+  if (Size == 0)
+    CurrentFnArguments.resize(MF->getFunction()->arg_size());
+  // llvm::Function argument size is not good indicator of how many
+  // arguments does the function have at source level.
+  if (ArgNo > Size)
+    CurrentFnArguments.resize(ArgNo * 2);
+  CurrentFnArguments[ArgNo - 1] = Var;
+  return true;
+}
+
+// Collect variable information from side table maintained by MMI.
+void
+DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction *MF,
+                                   SmallPtrSet<const MDNode *, 16> &Processed) {
+  MachineModuleInfo::VariableDbgInfoMapTy &VMap = MMI->getVariableDbgInfo();
+  for (MachineModuleInfo::VariableDbgInfoMapTy::iterator VI = VMap.begin(),
+         VE = VMap.end(); VI != VE; ++VI) {
+    const MDNode *Var = VI->first;
+    if (!Var) continue;
+    Processed.insert(Var);
+    DIVariable DV(Var);
+    const std::pair<unsigned, DebugLoc> &VP = VI->second;
+
+    LexicalScope *Scope = LScopes.findLexicalScope(VP.second);
+
+    // If variable scope is not found then skip this variable.
+    if (Scope == 0)
+      continue;
+
+    DbgVariable *AbsDbgVariable = findAbstractVariable(DV, VP.second);
+    DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable);
+    RegVar->setFrameIndex(VP.first);
+    if (!addCurrentFnArgument(MF, RegVar, Scope))
+      addScopeVariable(Scope, RegVar);
+    if (AbsDbgVariable)
+      AbsDbgVariable->setFrameIndex(VP.first);
+  }
+}
+
+// Return true if debug value, encoded by DBG_VALUE instruction, is in a
+// defined reg.
+static bool isDbgValueInDefinedReg(const MachineInstr *MI) {
+  assert(MI->isDebugValue() && "Invalid DBG_VALUE machine instruction!");
+  return MI->getNumOperands() == 3 &&
+         MI->getOperand(0).isReg() && MI->getOperand(0).getReg() &&
+         MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0;
+}
+
+// Get .debug_loc entry for the instruction range starting at MI.
+static DotDebugLocEntry getDebugLocEntry(AsmPrinter *Asm,
+                                         const MCSymbol *FLabel,
+                                         const MCSymbol *SLabel,
+                                         const MachineInstr *MI) {
+  const MDNode *Var =  MI->getOperand(MI->getNumOperands() - 1).getMetadata();
+
+  if (MI->getNumOperands() != 3) {
+    MachineLocation MLoc = Asm->getDebugValueLocation(MI);
+    return DotDebugLocEntry(FLabel, SLabel, MLoc, Var);
+  }
+  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) {
+    MachineLocation MLoc;
+    MLoc.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
+    return DotDebugLocEntry(FLabel, SLabel, MLoc, Var);
+  }
+  if (MI->getOperand(0).isImm())
+    return DotDebugLocEntry(FLabel, SLabel, MI->getOperand(0).getImm());
+  if (MI->getOperand(0).isFPImm())
+    return DotDebugLocEntry(FLabel, SLabel, MI->getOperand(0).getFPImm());
+  if (MI->getOperand(0).isCImm())
+    return DotDebugLocEntry(FLabel, SLabel, MI->getOperand(0).getCImm());
+
+  llvm_unreachable("Unexpected 3 operand DBG_VALUE instruction!");
+}
+
+// Find variables for each lexical scope.
+void
+DwarfDebug::collectVariableInfo(const MachineFunction *MF,
+                                SmallPtrSet<const MDNode *, 16> &Processed) {
+
+  // collection info from MMI table.
+  collectVariableInfoFromMMITable(MF, Processed);
+
+  for (SmallVectorImpl<const MDNode*>::const_iterator
+         UVI = UserVariables.begin(), UVE = UserVariables.end(); UVI != UVE;
+         ++UVI) {
+    const MDNode *Var = *UVI;
+    if (Processed.count(Var))
+      continue;
+
+    // History contains relevant DBG_VALUE instructions for Var and instructions
+    // clobbering it.
+    SmallVectorImpl<const MachineInstr*> &History = DbgValues[Var];
+    if (History.empty())
+      continue;
+    const MachineInstr *MInsn = History.front();
+
+    DIVariable DV(Var);
+    LexicalScope *Scope = NULL;
+    if (DV.getTag() == dwarf::DW_TAG_arg_variable &&
+        DISubprogram(DV.getContext()).describes(MF->getFunction()))
+      Scope = LScopes.getCurrentFunctionScope();
+    else if (MDNode *IA = DV.getInlinedAt())
+      Scope = LScopes.findInlinedScope(DebugLoc::getFromDILocation(IA));
+    else
+      Scope = LScopes.findLexicalScope(cast<MDNode>(DV->getOperand(1)));
+    // If variable scope is not found then skip this variable.
+    if (!Scope)
+      continue;
+
+    Processed.insert(DV);
+    assert(MInsn->isDebugValue() && "History must begin with debug value");
+    DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc());
+    DbgVariable *RegVar = new DbgVariable(DV, AbsVar);
+    if (!addCurrentFnArgument(MF, RegVar, Scope))
+      addScopeVariable(Scope, RegVar);
+    if (AbsVar)
+      AbsVar->setMInsn(MInsn);
+
+    // Simplify ranges that are fully coalesced.
+    if (History.size() <= 1 || (History.size() == 2 &&
+                                MInsn->isIdenticalTo(History.back()))) {
+      RegVar->setMInsn(MInsn);
+      continue;
+    }
+
+    // Handle multiple DBG_VALUE instructions describing one variable.
+    RegVar->setDotDebugLocOffset(DotDebugLocEntries.size());
+
+    for (SmallVectorImpl<const MachineInstr*>::const_iterator
+           HI = History.begin(), HE = History.end(); HI != HE; ++HI) {
+      const MachineInstr *Begin = *HI;
+      assert(Begin->isDebugValue() && "Invalid History entry");
+
+      // Check if DBG_VALUE is truncating a range.
+      if (Begin->getNumOperands() > 1 && Begin->getOperand(0).isReg()
+          && !Begin->getOperand(0).getReg())
+        continue;
+
+      // Compute the range for a register location.
+      const MCSymbol *FLabel = getLabelBeforeInsn(Begin);
+      const MCSymbol *SLabel = 0;
+
+      if (HI + 1 == HE)
+        // If Begin is the last instruction in History then its value is valid
+        // until the end of the function.
+        SLabel = FunctionEndSym;
+      else {
+        const MachineInstr *End = HI[1];
+        DEBUG(dbgs() << "DotDebugLoc Pair:\n"
+              << "\t" << *Begin << "\t" << *End << "\n");
+        if (End->isDebugValue())
+          SLabel = getLabelBeforeInsn(End);
+        else {
+          // End is a normal instruction clobbering the range.
+          SLabel = getLabelAfterInsn(End);
+          assert(SLabel && "Forgot label after clobber instruction");
+          ++HI;
+        }
+      }
+
+      // The value is valid until the next DBG_VALUE or clobber.
+      DotDebugLocEntries.push_back(getDebugLocEntry(Asm, FLabel, SLabel,
+                                                    Begin));
+    }
+    DotDebugLocEntries.push_back(DotDebugLocEntry());
+  }
+
+  // Collect info for variables that were optimized out.
+  LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
+  DIArray Variables = DISubprogram(FnScope->getScopeNode()).getVariables();
+  for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) {
+    DIVariable DV(Variables.getElement(i));
+    if (!DV || !DV.Verify() || !Processed.insert(DV))
+      continue;
+    if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext()))
+      addScopeVariable(Scope, new DbgVariable(DV, NULL));
+  }
+}
+
+// Return Label preceding the instruction.
+MCSymbol *DwarfDebug::getLabelBeforeInsn(const MachineInstr *MI) {
+  MCSymbol *Label = LabelsBeforeInsn.lookup(MI);
+  assert(Label && "Didn't insert label before instruction");
+  return Label;
+}
+
+// Return Label immediately following the instruction.
+MCSymbol *DwarfDebug::getLabelAfterInsn(const MachineInstr *MI) {
+  return LabelsAfterInsn.lookup(MI);
+}
+
+// Process beginning of an instruction.
+void DwarfDebug::beginInstruction(const MachineInstr *MI) {
+  // Check if source location changes, but ignore DBG_VALUE locations.
+  if (!MI->isDebugValue()) {
+    DebugLoc DL = MI->getDebugLoc();
+    if (DL != PrevInstLoc && (!DL.isUnknown() || UnknownLocations)) {
+      unsigned Flags = 0;
+      PrevInstLoc = DL;
+      if (DL == PrologEndLoc) {
+        Flags |= DWARF2_FLAG_PROLOGUE_END;
+        PrologEndLoc = DebugLoc();
+      }
+      if (PrologEndLoc.isUnknown())
+        Flags |= DWARF2_FLAG_IS_STMT;
+
+      if (!DL.isUnknown()) {
+        const MDNode *Scope = DL.getScope(Asm->MF->getFunction()->getContext());
+        recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
+      } else
+        recordSourceLine(0, 0, 0, 0);
+    }
+  }
+
+  // Insert labels where requested.
+  DenseMap<const MachineInstr*, MCSymbol*>::iterator I =
+    LabelsBeforeInsn.find(MI);
+
+  // No label needed.
+  if (I == LabelsBeforeInsn.end())
+    return;
+
+  // Label already assigned.
+  if (I->second)
+    return;
+
+  if (!PrevLabel) {
+    PrevLabel = MMI->getContext().CreateTempSymbol();
+    Asm->OutStreamer.EmitLabel(PrevLabel);
+  }
+  I->second = PrevLabel;
+}
+
+// Process end of an instruction.
+void DwarfDebug::endInstruction(const MachineInstr *MI) {
+  // Don't create a new label after DBG_VALUE instructions.
+  // They don't generate code.
+  if (!MI->isDebugValue())
+    PrevLabel = 0;
+
+  DenseMap<const MachineInstr*, MCSymbol*>::iterator I =
+    LabelsAfterInsn.find(MI);
+
+  // No label needed.
+  if (I == LabelsAfterInsn.end())
+    return;
+
+  // Label already assigned.
+  if (I->second)
+    return;
+
+  // We need a label after this instruction.
+  if (!PrevLabel) {
+    PrevLabel = MMI->getContext().CreateTempSymbol();
+    Asm->OutStreamer.EmitLabel(PrevLabel);
+  }
+  I->second = PrevLabel;
+}
+
+// Each LexicalScope has first instruction and last instruction to mark
+// beginning and end of a scope respectively. Create an inverse map that list
+// scopes starts (and ends) with an instruction. One instruction may start (or
+// end) multiple scopes. Ignore scopes that are not reachable.
+void DwarfDebug::identifyScopeMarkers() {
+  SmallVector<LexicalScope *, 4> WorkList;
+  WorkList.push_back(LScopes.getCurrentFunctionScope());
+  while (!WorkList.empty()) {
+    LexicalScope *S = WorkList.pop_back_val();
+
+    const SmallVector<LexicalScope *, 4> &Children = S->getChildren();
+    if (!Children.empty())
+      for (SmallVector<LexicalScope *, 4>::const_iterator SI = Children.begin(),
+             SE = Children.end(); SI != SE; ++SI)
+        WorkList.push_back(*SI);
+
+    if (S->isAbstractScope())
+      continue;
+
+    const SmallVector<InsnRange, 4> &Ranges = S->getRanges();
+    if (Ranges.empty())
+      continue;
+    for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
+           RE = Ranges.end(); RI != RE; ++RI) {
+      assert(RI->first && "InsnRange does not have first instruction!");
+      assert(RI->second && "InsnRange does not have second instruction!");
+      requestLabelBeforeInsn(RI->first);
+      requestLabelAfterInsn(RI->second);
+    }
+  }
+}
+
+// Get MDNode for DebugLoc's scope.
+static MDNode *getScopeNode(DebugLoc DL, const LLVMContext &Ctx) {
+  if (MDNode *InlinedAt = DL.getInlinedAt(Ctx))
+    return getScopeNode(DebugLoc::getFromDILocation(InlinedAt), Ctx);
+  return DL.getScope(Ctx);
+}
+
+// Walk up the scope chain of given debug loc and find line number info
+// for the function.
+static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) {
+  const MDNode *Scope = getScopeNode(DL, Ctx);
+  DISubprogram SP = getDISubprogram(Scope);
+  if (SP.Verify()) {
+    // Check for number of operands since the compatibility is
+    // cheap here.
+    if (SP->getNumOperands() > 19)
+      return DebugLoc::get(SP.getScopeLineNumber(), 0, SP);
+    else
+      return DebugLoc::get(SP.getLineNumber(), 0, SP);
+  }
+
+  return DebugLoc();
+}
+
+// Gather pre-function debug information.  Assumes being called immediately
+// after the function entry point has been emitted.
+void DwarfDebug::beginFunction(const MachineFunction *MF) {
+  if (!MMI->hasDebugInfo()) return;
+  LScopes.initialize(*MF);
+  if (LScopes.empty()) return;
+  identifyScopeMarkers();
+
+  // Set DwarfCompileUnitID in MCContext to the Compile Unit this function
+  // belongs to.
+  LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
+  CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
+  assert(TheCU && "Unable to find compile unit!");
+  Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID());
+
+  FunctionBeginSym = Asm->GetTempSymbol("func_begin",
+                                        Asm->getFunctionNumber());
+  // Assumes in correct section after the entry point.
+  Asm->OutStreamer.EmitLabel(FunctionBeginSym);
+
+  assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");
+
+  const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
+  // LiveUserVar - Map physreg numbers to the MDNode they contain.
+  std::vector<const MDNode*> LiveUserVar(TRI->getNumRegs());
+
+  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+       I != E; ++I) {
+    bool AtBlockEntry = true;
+    for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
+         II != IE; ++II) {
+      const MachineInstr *MI = II;
+
+      if (MI->isDebugValue()) {
+        assert(MI->getNumOperands() > 1 && "Invalid machine instruction!");
+
+        // Keep track of user variables.
+        const MDNode *Var =
+          MI->getOperand(MI->getNumOperands() - 1).getMetadata();
+
+        // Variable is in a register, we need to check for clobbers.
+        if (isDbgValueInDefinedReg(MI))
+          LiveUserVar[MI->getOperand(0).getReg()] = Var;
+
+        // Check the history of this variable.
+        SmallVectorImpl<const MachineInstr*> &History = DbgValues[Var];
+        if (History.empty()) {
+          UserVariables.push_back(Var);
+          // The first mention of a function argument gets the FunctionBeginSym
+          // label, so arguments are visible when breaking at function entry.
+          DIVariable DV(Var);
+          if (DV.Verify() && DV.getTag() == dwarf::DW_TAG_arg_variable &&
+              DISubprogram(getDISubprogram(DV.getContext()))
+                .describes(MF->getFunction()))
+            LabelsBeforeInsn[MI] = FunctionBeginSym;
+        } else {
+          // We have seen this variable before. Try to coalesce DBG_VALUEs.
+          const MachineInstr *Prev = History.back();
+          if (Prev->isDebugValue()) {
+            // Coalesce identical entries at the end of History.
+            if (History.size() >= 2 &&
+                Prev->isIdenticalTo(History[History.size() - 2])) {
+              DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
+                    << "\t" << *Prev
+                    << "\t" << *History[History.size() - 2] << "\n");
+              History.pop_back();
+            }
+
+            // Terminate old register assignments that don't reach MI;
+            MachineFunction::const_iterator PrevMBB = Prev->getParent();
+            if (PrevMBB != I && (!AtBlockEntry || llvm::next(PrevMBB) != I) &&
+                isDbgValueInDefinedReg(Prev)) {
+              // Previous register assignment needs to terminate at the end of
+              // its basic block.
+              MachineBasicBlock::const_iterator LastMI =
+                PrevMBB->getLastNonDebugInstr();
+              if (LastMI == PrevMBB->end()) {
+                // Drop DBG_VALUE for empty range.
+                DEBUG(dbgs() << "Dropping DBG_VALUE for empty range:\n"
+                      << "\t" << *Prev << "\n");
+                History.pop_back();
+              }
+              else {
+                // Terminate after LastMI.
+                History.push_back(LastMI);
+              }
+            }
+          }
+        }
+        History.push_back(MI);
+      } else {
+        // Not a DBG_VALUE instruction.
+        if (!MI->isLabel())
+          AtBlockEntry = false;
+
+        // First known non-DBG_VALUE and non-frame setup location marks
+        // the beginning of the function body.
+        if (!MI->getFlag(MachineInstr::FrameSetup) &&
+            (PrologEndLoc.isUnknown() && !MI->getDebugLoc().isUnknown()))
+          PrologEndLoc = MI->getDebugLoc();
+
+        // Check if the instruction clobbers any registers with debug vars.
+        for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
+               MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+          if (!MOI->isReg() || !MOI->isDef() || !MOI->getReg())
+            continue;
+          for (MCRegAliasIterator AI(MOI->getReg(), TRI, true);
+               AI.isValid(); ++AI) {
+            unsigned Reg = *AI;
+            const MDNode *Var = LiveUserVar[Reg];
+            if (!Var)
+              continue;
+            // Reg is now clobbered.
+            LiveUserVar[Reg] = 0;
+
+            // Was MD last defined by a DBG_VALUE referring to Reg?
+            DbgValueHistoryMap::iterator HistI = DbgValues.find(Var);
+            if (HistI == DbgValues.end())
+              continue;
+            SmallVectorImpl<const MachineInstr*> &History = HistI->second;
+            if (History.empty())
+              continue;
+            const MachineInstr *Prev = History.back();
+            // Sanity-check: Register assignments are terminated at the end of
+            // their block.
+            if (!Prev->isDebugValue() || Prev->getParent() != MI->getParent())
+              continue;
+            // Is the variable still in Reg?
+            if (!isDbgValueInDefinedReg(Prev) ||
+                Prev->getOperand(0).getReg() != Reg)
+              continue;
+            // Var is clobbered. Make sure the next instruction gets a label.
+            History.push_back(MI);
+          }
+        }
+      }
+    }
+  }
+
+  for (DbgValueHistoryMap::iterator I = DbgValues.begin(), E = DbgValues.end();
+       I != E; ++I) {
+    SmallVectorImpl<const MachineInstr*> &History = I->second;
+    if (History.empty())
+      continue;
+
+    // Make sure the final register assignments are terminated.
+    const MachineInstr *Prev = History.back();
+    if (Prev->isDebugValue() && isDbgValueInDefinedReg(Prev)) {
+      const MachineBasicBlock *PrevMBB = Prev->getParent();
+      MachineBasicBlock::const_iterator LastMI =
+        PrevMBB->getLastNonDebugInstr();
+      if (LastMI == PrevMBB->end())
+        // Drop DBG_VALUE for empty range.
+        History.pop_back();
+      else {
+        // Terminate after LastMI.
+        History.push_back(LastMI);
+      }
+    }
+    // Request labels for the full history.
+    for (unsigned i = 0, e = History.size(); i != e; ++i) {
+      const MachineInstr *MI = History[i];
+      if (MI->isDebugValue())
+        requestLabelBeforeInsn(MI);
+      else
+        requestLabelAfterInsn(MI);
+    }
+  }
+
+  PrevInstLoc = DebugLoc();
+  PrevLabel = FunctionBeginSym;
+
+  // Record beginning of function.
+  if (!PrologEndLoc.isUnknown()) {
+    DebugLoc FnStartDL = getFnDebugLoc(PrologEndLoc,
+                                       MF->getFunction()->getContext());
+    recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
+                     FnStartDL.getScope(MF->getFunction()->getContext()),
+    // We'd like to list the prologue as "not statements" but GDB behaves
+    // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
+                     DWARF2_FLAG_IS_STMT);
+  }
+}
+
+void DwarfDebug::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
+//  SmallVector<DbgVariable *, 8> &Vars = ScopeVariables.lookup(LS);
+  ScopeVariables[LS].push_back(Var);
+//  Vars.push_back(Var);
+}
+
+// Gather and emit post-function debug information.
+void DwarfDebug::endFunction(const MachineFunction *MF) {
+  if (!MMI->hasDebugInfo() || LScopes.empty()) return;
+
+  // Define end label for subprogram.
+  FunctionEndSym = Asm->GetTempSymbol("func_end",
+                                      Asm->getFunctionNumber());
+  // Assumes in correct section after the entry point.
+  Asm->OutStreamer.EmitLabel(FunctionEndSym);
+  // Set DwarfCompileUnitID in MCContext to default value.
+  Asm->OutStreamer.getContext().setDwarfCompileUnitID(0);
+
+  SmallPtrSet<const MDNode *, 16> ProcessedVars;
+  collectVariableInfo(MF, ProcessedVars);
+
+  LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
+  CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
+  assert(TheCU && "Unable to find compile unit!");
+
+  // Construct abstract scopes.
+  ArrayRef<LexicalScope *> AList = LScopes.getAbstractScopesList();
+  for (unsigned i = 0, e = AList.size(); i != e; ++i) {
+    LexicalScope *AScope = AList[i];
+    DISubprogram SP(AScope->getScopeNode());
+    if (SP.Verify()) {
+      // Collect info for variables that were optimized out.
+      DIArray Variables = SP.getVariables();
+      for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) {
+        DIVariable DV(Variables.getElement(i));
+        if (!DV || !DV.Verify() || !ProcessedVars.insert(DV))
+          continue;
+        // Check that DbgVariable for DV wasn't created earlier, when
+        // findAbstractVariable() was called for inlined instance of DV.
+        LLVMContext &Ctx = DV->getContext();
+        DIVariable CleanDV = cleanseInlinedVariable(DV, Ctx);
+        if (AbstractVariables.lookup(CleanDV))
+          continue;
+        if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext()))
+          addScopeVariable(Scope, new DbgVariable(DV, NULL));
+      }
+    }
+    if (ProcessedSPNodes.count(AScope->getScopeNode()) == 0)
+      constructScopeDIE(TheCU, AScope);
+  }
+
+  DIE *CurFnDIE = constructScopeDIE(TheCU, FnScope);
+
+  if (!MF->getTarget().Options.DisableFramePointerElim(*MF))
+    TheCU->addFlag(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr);
+
+  DebugFrames.push_back(FunctionDebugFrameInfo(Asm->getFunctionNumber(),
+                                               MMI->getFrameMoves()));
+
+  // Clear debug info
+  for (DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> >::iterator
+         I = ScopeVariables.begin(), E = ScopeVariables.end(); I != E; ++I)
+    DeleteContainerPointers(I->second);
+  ScopeVariables.clear();
+  DeleteContainerPointers(CurrentFnArguments);
+  UserVariables.clear();
+  DbgValues.clear();
+  AbstractVariables.clear();
+  LabelsBeforeInsn.clear();
+  LabelsAfterInsn.clear();
+  PrevLabel = NULL;
+}
+
+// Register a source line with debug info. Returns the  unique label that was
+// emitted and which provides correspondence to the source line list.
+void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
+                                  unsigned Flags) {
+  StringRef Fn;
+  StringRef Dir;
+  unsigned Src = 1;
+  if (S) {
+    DIDescriptor Scope(S);
+
+    if (Scope.isCompileUnit()) {
+      DICompileUnit CU(S);
+      Fn = CU.getFilename();
+      Dir = CU.getDirectory();
+    } else if (Scope.isFile()) {
+      DIFile F(S);
+      Fn = F.getFilename();
+      Dir = F.getDirectory();
+    } else if (Scope.isSubprogram()) {
+      DISubprogram SP(S);
+      Fn = SP.getFilename();
+      Dir = SP.getDirectory();
+    } else if (Scope.isLexicalBlockFile()) {
+      DILexicalBlockFile DBF(S);
+      Fn = DBF.getFilename();
+      Dir = DBF.getDirectory();
+    } else if (Scope.isLexicalBlock()) {
+      DILexicalBlock DB(S);
+      Fn = DB.getFilename();
+      Dir = DB.getDirectory();
+    } else
+      llvm_unreachable("Unexpected scope info");
+
+    Src = getOrCreateSourceID(Fn, Dir,
+            Asm->OutStreamer.getContext().getDwarfCompileUnitID());
+  }
+  Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, Flags, 0, 0, Fn);
+}
+
+//===----------------------------------------------------------------------===//
+// Emit Methods
+//===----------------------------------------------------------------------===//
+
+// Compute the size and offset of a DIE.
+unsigned
+DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
+  // Get the children.
+  const std::vector<DIE *> &Children = Die->getChildren();
+
+  // Record the abbreviation.
+  assignAbbrevNumber(Die->getAbbrev());
+
+  // Get the abbreviation for this DIE.
+  unsigned AbbrevNumber = Die->getAbbrevNumber();
+  const DIEAbbrev *Abbrev = Abbreviations->at(AbbrevNumber - 1);
+
+  // Set DIE offset
+  Die->setOffset(Offset);
+
+  // Start the size with the size of abbreviation code.
+  Offset += MCAsmInfo::getULEB128Size(AbbrevNumber);
+
+  const SmallVectorImpl<DIEValue*> &Values = Die->getValues();
+  const SmallVectorImpl<DIEAbbrevData> &AbbrevData = Abbrev->getData();
+
+  // Size the DIE attribute values.
+  for (unsigned i = 0, N = Values.size(); i < N; ++i)
+    // Size attribute value.
+    Offset += Values[i]->SizeOf(Asm, AbbrevData[i].getForm());
+
+  // Size the DIE children if any.
+  if (!Children.empty()) {
+    assert(Abbrev->getChildrenFlag() == dwarf::DW_CHILDREN_yes &&
+           "Children flag not set");
+
+    for (unsigned j = 0, M = Children.size(); j < M; ++j)
+      Offset = computeSizeAndOffset(Children[j], Offset);
+
+    // End of children marker.
+    Offset += sizeof(int8_t);
+  }
+
+  Die->setSize(Offset - Die->getOffset());
+  return Offset;
+}
+
+// Compute the size and offset of all the DIEs.
+void DwarfUnits::computeSizeAndOffsets() {
+  // Offset from the beginning of debug info section.
+  unsigned AccuOffset = 0;
+  for (SmallVectorImpl<CompileUnit *>::iterator I = CUs.begin(),
+         E = CUs.end(); I != E; ++I) {
+    (*I)->setDebugInfoOffset(AccuOffset);
+    unsigned Offset =
+      sizeof(int32_t) + // Length of Compilation Unit Info
+      sizeof(int16_t) + // DWARF version number
+      sizeof(int32_t) + // Offset Into Abbrev. Section
+      sizeof(int8_t);   // Pointer Size (in bytes)
+
+    unsigned EndOffset = computeSizeAndOffset((*I)->getCUDie(), Offset);
+    AccuOffset += EndOffset;
+  }
+}
+
+// Emit initial Dwarf sections with a label at the start of each one.
+void DwarfDebug::emitSectionLabels() {
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+
+  // Dwarf sections base addresses.
+  DwarfInfoSectionSym =
+    emitSectionSym(Asm, TLOF.getDwarfInfoSection(), "section_info");
+  DwarfAbbrevSectionSym =
+    emitSectionSym(Asm, TLOF.getDwarfAbbrevSection(), "section_abbrev");
+  if (useSplitDwarf())
+    DwarfAbbrevDWOSectionSym =
+      emitSectionSym(Asm, TLOF.getDwarfAbbrevDWOSection(),
+                     "section_abbrev_dwo");
+  emitSectionSym(Asm, TLOF.getDwarfARangesSection());
+
+  if (const MCSection *MacroInfo = TLOF.getDwarfMacroInfoSection())
+    emitSectionSym(Asm, MacroInfo);
+
+  DwarfLineSectionSym =
+    emitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line");
+  emitSectionSym(Asm, TLOF.getDwarfLocSection());
+  if (GenerateDwarfPubNamesSection)
+    emitSectionSym(Asm, TLOF.getDwarfPubNamesSection());
+  emitSectionSym(Asm, TLOF.getDwarfPubTypesSection());
+  DwarfStrSectionSym =
+    emitSectionSym(Asm, TLOF.getDwarfStrSection(), "info_string");
+  if (useSplitDwarf())
+    DwarfStrDWOSectionSym =
+      emitSectionSym(Asm, TLOF.getDwarfStrDWOSection(), "skel_string");
+  DwarfDebugRangeSectionSym = emitSectionSym(Asm, TLOF.getDwarfRangesSection(),
+                                             "debug_range");
+
+  DwarfDebugLocSectionSym = emitSectionSym(Asm, TLOF.getDwarfLocSection(),
+                                           "section_debug_loc");
+
+  TextSectionSym = emitSectionSym(Asm, TLOF.getTextSection(), "text_begin");
+  emitSectionSym(Asm, TLOF.getDataSection());
+}
+
+// Recursively emits a debug information entry.
+void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
+  // Get the abbreviation for this DIE.
+  unsigned AbbrevNumber = Die->getAbbrevNumber();
+  const DIEAbbrev *Abbrev = Abbrevs->at(AbbrevNumber - 1);
+
+  // Emit the code (index) for the abbreviation.
+  if (Asm->isVerbose())
+    Asm->OutStreamer.AddComment("Abbrev [" + Twine(AbbrevNumber) + "] 0x" +
+                                Twine::utohexstr(Die->getOffset()) + ":0x" +
+                                Twine::utohexstr(Die->getSize()) + " " +
+                                dwarf::TagString(Abbrev->getTag()));
+  Asm->EmitULEB128(AbbrevNumber);
+
+  const SmallVectorImpl<DIEValue*> &Values = Die->getValues();
+  const SmallVectorImpl<DIEAbbrevData> &AbbrevData = Abbrev->getData();
+
+  // Emit the DIE attribute values.
+  for (unsigned i = 0, N = Values.size(); i < N; ++i) {
+    unsigned Attr = AbbrevData[i].getAttribute();
+    unsigned Form = AbbrevData[i].getForm();
+    assert(Form && "Too many attributes for DIE (check abbreviation)");
+
+    if (Asm->isVerbose())
+      Asm->OutStreamer.AddComment(dwarf::AttributeString(Attr));
+
+    switch (Attr) {
+    case dwarf::DW_AT_abstract_origin: {
+      DIEEntry *E = cast<DIEEntry>(Values[i]);
+      DIE *Origin = E->getEntry();
+      unsigned Addr = Origin->getOffset();
+      if (Form == dwarf::DW_FORM_ref_addr) {
+        // For DW_FORM_ref_addr, output the offset from beginning of debug info
+        // section. Origin->getOffset() returns the offset from start of the
+        // compile unit.
+        DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+        Addr += Holder.getCUOffset(Origin->getCompileUnit());
+      }
+      Asm->EmitInt32(Addr);
+      break;
+    }
+    case dwarf::DW_AT_ranges: {
+      // DW_AT_range Value encodes offset in debug_range section.
+      DIEInteger *V = cast<DIEInteger>(Values[i]);
+
+      if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) {
+        Asm->EmitLabelPlusOffset(DwarfDebugRangeSectionSym,
+                                 V->getValue(),
+                                 4);
+      } else {
+        Asm->EmitLabelOffsetDifference(DwarfDebugRangeSectionSym,
+                                       V->getValue(),
+                                       DwarfDebugRangeSectionSym,
+                                       4);
+      }
+      break;
+    }
+    case dwarf::DW_AT_location: {
+      if (DIELabel *L = dyn_cast<DIELabel>(Values[i])) {
+        if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+          Asm->EmitLabelReference(L->getValue(), 4);
+        else
+          Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4);
+      } else {
+        Values[i]->EmitValue(Asm, Form);
+      }
+      break;
+    }
+    case dwarf::DW_AT_accessibility: {
+      if (Asm->isVerbose()) {
+        DIEInteger *V = cast<DIEInteger>(Values[i]);
+        Asm->OutStreamer.AddComment(dwarf::AccessibilityString(V->getValue()));
+      }
+      Values[i]->EmitValue(Asm, Form);
+      break;
+    }
+    default:
+      // Emit an attribute using the defined form.
+      Values[i]->EmitValue(Asm, Form);
+      break;
+    }
+  }
+
+  // Emit the DIE children if any.
+  if (Abbrev->getChildrenFlag() == dwarf::DW_CHILDREN_yes) {
+    const std::vector<DIE *> &Children = Die->getChildren();
+
+    for (unsigned j = 0, M = Children.size(); j < M; ++j)
+      emitDIE(Children[j], Abbrevs);
+
+    if (Asm->isVerbose())
+      Asm->OutStreamer.AddComment("End Of Children Mark");
+    Asm->EmitInt8(0);
+  }
+}
+
+// Emit the various dwarf units to the unit section USection with
+// the abbreviations going into ASection.
+void DwarfUnits::emitUnits(DwarfDebug *DD,
+                           const MCSection *USection,
+                           const MCSection *ASection,
+                           const MCSymbol *ASectionSym) {
+  Asm->OutStreamer.SwitchSection(USection);
+  for (SmallVectorImpl<CompileUnit *>::iterator I = CUs.begin(),
+         E = CUs.end(); I != E; ++I) {
+    CompileUnit *TheCU = *I;
+    DIE *Die = TheCU->getCUDie();
+
+    // Emit the compile units header.
+    Asm->OutStreamer
+      .EmitLabel(Asm->GetTempSymbol(USection->getLabelBeginName(),
+                                    TheCU->getUniqueID()));
+
+    // Emit size of content not including length itself
+    unsigned ContentSize = Die->getSize() +
+      sizeof(int16_t) + // DWARF version number
+      sizeof(int32_t) + // Offset Into Abbrev. Section
+      sizeof(int8_t);   // Pointer Size (in bytes)
+
+    Asm->OutStreamer.AddComment("Length of Compilation Unit Info");
+    Asm->EmitInt32(ContentSize);
+    Asm->OutStreamer.AddComment("DWARF version number");
+    Asm->EmitInt16(dwarf::DWARF_VERSION);
+    Asm->OutStreamer.AddComment("Offset Into Abbrev. Section");
+    Asm->EmitSectionOffset(Asm->GetTempSymbol(ASection->getLabelBeginName()),
+                           ASectionSym);
+    Asm->OutStreamer.AddComment("Address Size (in bytes)");
+    Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+
+    DD->emitDIE(Die, Abbreviations);
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol(USection->getLabelEndName(),
+                                                  TheCU->getUniqueID()));
+  }
+}
+
+/// For a given compile unit DIE, returns offset from beginning of debug info.
+unsigned DwarfUnits::getCUOffset(DIE *Die) {
+  assert(Die->getTag() == dwarf::DW_TAG_compile_unit  &&
+         "Input DIE should be compile unit in getCUOffset.");
+  for (SmallVectorImpl<CompileUnit *>::iterator I = CUs.begin(),
+       E = CUs.end(); I != E; ++I) {
+    CompileUnit *TheCU = *I;
+    if (TheCU->getCUDie() == Die)
+      return TheCU->getDebugInfoOffset();
+  }
+  llvm_unreachable("The compile unit DIE should belong to CUs in DwarfUnits.");
+}
+
+// Emit the debug info section.
+void DwarfDebug::emitDebugInfo() {
+  DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+
+  Holder.emitUnits(this, Asm->getObjFileLowering().getDwarfInfoSection(),
+                   Asm->getObjFileLowering().getDwarfAbbrevSection(),
+                   DwarfAbbrevSectionSym);
+}
+
+// Emit the abbreviation section.
+void DwarfDebug::emitAbbreviations() {
+  if (!useSplitDwarf())
+    emitAbbrevs(Asm->getObjFileLowering().getDwarfAbbrevSection(),
+                &Abbreviations);
+  else
+    emitSkeletonAbbrevs(Asm->getObjFileLowering().getDwarfAbbrevSection());
+}
+
+void DwarfDebug::emitAbbrevs(const MCSection *Section,
+                             std::vector<DIEAbbrev *> *Abbrevs) {
+  // Check to see if it is worth the effort.
+  if (!Abbrevs->empty()) {
+    // Start the debug abbrev section.
+    Asm->OutStreamer.SwitchSection(Section);
+
+    MCSymbol *Begin = Asm->GetTempSymbol(Section->getLabelBeginName());
+    Asm->OutStreamer.EmitLabel(Begin);
+
+    // For each abbrevation.
+    for (unsigned i = 0, N = Abbrevs->size(); i < N; ++i) {
+      // Get abbreviation data
+      const DIEAbbrev *Abbrev = Abbrevs->at(i);
+
+      // Emit the abbrevations code (base 1 index.)
+      Asm->EmitULEB128(Abbrev->getNumber(), "Abbreviation Code");
+
+      // Emit the abbreviations data.
+      Abbrev->Emit(Asm);
+    }
+
+    // Mark end of abbreviations.
+    Asm->EmitULEB128(0, "EOM(3)");
+
+    MCSymbol *End = Asm->GetTempSymbol(Section->getLabelEndName());
+    Asm->OutStreamer.EmitLabel(End);
+  }
+}
+
+// Emit the last address of the section and the end of the line matrix.
+void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) {
+  // Define last address of section.
+  Asm->OutStreamer.AddComment("Extended Op");
+  Asm->EmitInt8(0);
+
+  Asm->OutStreamer.AddComment("Op size");
+  Asm->EmitInt8(Asm->getDataLayout().getPointerSize() + 1);
+  Asm->OutStreamer.AddComment("DW_LNE_set_address");
+  Asm->EmitInt8(dwarf::DW_LNE_set_address);
+
+  Asm->OutStreamer.AddComment("Section end label");
+
+  Asm->OutStreamer.EmitSymbolValue(Asm->GetTempSymbol("section_end",SectionEnd),
+                                   Asm->getDataLayout().getPointerSize());
+
+  // Mark end of matrix.
+  Asm->OutStreamer.AddComment("DW_LNE_end_sequence");
+  Asm->EmitInt8(0);
+  Asm->EmitInt8(1);
+  Asm->EmitInt8(1);
+}
+
+// Emit visible names into a hashed accelerator table section.
+void DwarfDebug::emitAccelNames() {
+  DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+                                           dwarf::DW_FORM_data4));
+  for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
+         E = CUMap.end(); I != E; ++I) {
+    CompileUnit *TheCU = I->second;
+    const StringMap<std::vector<DIE*> > &Names = TheCU->getAccelNames();
+    for (StringMap<std::vector<DIE*> >::const_iterator
+           GI = Names.begin(), GE = Names.end(); GI != GE; ++GI) {
+      const char *Name = GI->getKeyData();
+      const std::vector<DIE *> &Entities = GI->second;
+      for (std::vector<DIE *>::const_iterator DI = Entities.begin(),
+             DE = Entities.end(); DI != DE; ++DI)
+        AT.AddName(Name, (*DI));
+    }
+  }
+
+  AT.FinalizeTable(Asm, "Names");
+  Asm->OutStreamer.SwitchSection(
+    Asm->getObjFileLowering().getDwarfAccelNamesSection());
+  MCSymbol *SectionBegin = Asm->GetTempSymbol("names_begin");
+  Asm->OutStreamer.EmitLabel(SectionBegin);
+
+  // Emit the full data.
+  AT.Emit(Asm, SectionBegin, &InfoHolder);
+}
+
+// Emit objective C classes and categories into a hashed accelerator table
+// section.
+void DwarfDebug::emitAccelObjC() {
+  DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+                                           dwarf::DW_FORM_data4));
+  for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
+         E = CUMap.end(); I != E; ++I) {
+    CompileUnit *TheCU = I->second;
+    const StringMap<std::vector<DIE*> > &Names = TheCU->getAccelObjC();
+    for (StringMap<std::vector<DIE*> >::const_iterator
+           GI = Names.begin(), GE = Names.end(); GI != GE; ++GI) {
+      const char *Name = GI->getKeyData();
+      const std::vector<DIE *> &Entities = GI->second;
+      for (std::vector<DIE *>::const_iterator DI = Entities.begin(),
+             DE = Entities.end(); DI != DE; ++DI)
+        AT.AddName(Name, (*DI));
+    }
+  }
+
+  AT.FinalizeTable(Asm, "ObjC");
+  Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering()
+                                 .getDwarfAccelObjCSection());
+  MCSymbol *SectionBegin = Asm->GetTempSymbol("objc_begin");
+  Asm->OutStreamer.EmitLabel(SectionBegin);
+
+  // Emit the full data.
+  AT.Emit(Asm, SectionBegin, &InfoHolder);
+}
+
+// Emit namespace dies into a hashed accelerator table.
+void DwarfDebug::emitAccelNamespaces() {
+  DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+                                           dwarf::DW_FORM_data4));
+  for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
+         E = CUMap.end(); I != E; ++I) {
+    CompileUnit *TheCU = I->second;
+    const StringMap<std::vector<DIE*> > &Names = TheCU->getAccelNamespace();
+    for (StringMap<std::vector<DIE*> >::const_iterator
+           GI = Names.begin(), GE = Names.end(); GI != GE; ++GI) {
+      const char *Name = GI->getKeyData();
+      const std::vector<DIE *> &Entities = GI->second;
+      for (std::vector<DIE *>::const_iterator DI = Entities.begin(),
+             DE = Entities.end(); DI != DE; ++DI)
+        AT.AddName(Name, (*DI));
+    }
+  }
+
+  AT.FinalizeTable(Asm, "namespac");
+  Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering()
+                                 .getDwarfAccelNamespaceSection());
+  MCSymbol *SectionBegin = Asm->GetTempSymbol("namespac_begin");
+  Asm->OutStreamer.EmitLabel(SectionBegin);
+
+  // Emit the full data.
+  AT.Emit(Asm, SectionBegin, &InfoHolder);
+}
+
+// Emit type dies into a hashed accelerator table.
+void DwarfDebug::emitAccelTypes() {
+  std::vector<DwarfAccelTable::Atom> Atoms;
+  Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+                                        dwarf::DW_FORM_data4));
+  Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeTag,
+                                        dwarf::DW_FORM_data2));
+  Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeTypeFlags,
+                                        dwarf::DW_FORM_data1));
+  DwarfAccelTable AT(Atoms);
+  for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
+         E = CUMap.end(); I != E; ++I) {
+    CompileUnit *TheCU = I->second;
+    const StringMap<std::vector<std::pair<DIE*, unsigned > > > &Names
+      = TheCU->getAccelTypes();
+    for (StringMap<std::vector<std::pair<DIE*, unsigned> > >::const_iterator
+           GI = Names.begin(), GE = Names.end(); GI != GE; ++GI) {
+      const char *Name = GI->getKeyData();
+      const std::vector<std::pair<DIE *, unsigned> > &Entities = GI->second;
+      for (std::vector<std::pair<DIE *, unsigned> >::const_iterator DI
+             = Entities.begin(), DE = Entities.end(); DI !=DE; ++DI)
+        AT.AddName(Name, (*DI).first, (*DI).second);
+    }
+  }
+
+  AT.FinalizeTable(Asm, "types");
+  Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering()
+                                 .getDwarfAccelTypesSection());
+  MCSymbol *SectionBegin = Asm->GetTempSymbol("types_begin");
+  Asm->OutStreamer.EmitLabel(SectionBegin);
+
+  // Emit the full data.
+  AT.Emit(Asm, SectionBegin, &InfoHolder);
+}
+
+/// emitDebugPubnames - Emit visible names into a debug pubnames section.
+///
+void DwarfDebug::emitDebugPubnames() {
+  const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+
+  typedef DenseMap<const MDNode*, CompileUnit*> CUMapType;
+  for (CUMapType::iterator I = CUMap.begin(), E = CUMap.end(); I != E; ++I) {
+    CompileUnit *TheCU = I->second;
+    unsigned ID = TheCU->getUniqueID();
+
+    if (TheCU->getGlobalNames().empty())
+      continue;
+
+    // Start the dwarf pubnames section.
+    Asm->OutStreamer.SwitchSection(
+      Asm->getObjFileLowering().getDwarfPubNamesSection());
+
+    Asm->OutStreamer.AddComment("Length of Public Names Info");
+    Asm->EmitLabelDifference(Asm->GetTempSymbol("pubnames_end", ID),
+                             Asm->GetTempSymbol("pubnames_begin", ID), 4);
+
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_begin", ID));
+
+    Asm->OutStreamer.AddComment("DWARF Version");
+    Asm->EmitInt16(dwarf::DWARF_VERSION);
+
+    Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
+    Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
+                           DwarfInfoSectionSym);
+
+    Asm->OutStreamer.AddComment("Compilation Unit Length");
+    Asm->EmitLabelDifference(Asm->GetTempSymbol(ISec->getLabelEndName(), ID),
+                             Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
+                             4);
+
+    const StringMap<DIE*> &Globals = TheCU->getGlobalNames();
+    for (StringMap<DIE*>::const_iterator
+           GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
+      const char *Name = GI->getKeyData();
+      const DIE *Entity = GI->second;
+
+      Asm->OutStreamer.AddComment("DIE offset");
+      Asm->EmitInt32(Entity->getOffset());
+
+      if (Asm->isVerbose())
+        Asm->OutStreamer.AddComment("External Name");
+      Asm->OutStreamer.EmitBytes(StringRef(Name, strlen(Name)+1), 0);
+    }
+
+    Asm->OutStreamer.AddComment("End Mark");
+    Asm->EmitInt32(0);
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_end", ID));
+  }
+}
+
+void DwarfDebug::emitDebugPubTypes() {
+  for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
+         E = CUMap.end(); I != E; ++I) {
+    CompileUnit *TheCU = I->second;
+    // Start the dwarf pubtypes section.
+    Asm->OutStreamer.SwitchSection(
+      Asm->getObjFileLowering().getDwarfPubTypesSection());
+    Asm->OutStreamer.AddComment("Length of Public Types Info");
+    Asm->EmitLabelDifference(
+      Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()),
+      Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()), 4);
+
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_begin",
+                                                  TheCU->getUniqueID()));
+
+    if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DWARF Version");
+    Asm->EmitInt16(dwarf::DWARF_VERSION);
+
+    Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
+    const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+    Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(),
+                                              TheCU->getUniqueID()),
+                           DwarfInfoSectionSym);
+
+    Asm->OutStreamer.AddComment("Compilation Unit Length");
+    Asm->EmitLabelDifference(Asm->GetTempSymbol(ISec->getLabelEndName(),
+                                                TheCU->getUniqueID()),
+                             Asm->GetTempSymbol(ISec->getLabelBeginName(),
+                                                TheCU->getUniqueID()),
+                             4);
+
+    const StringMap<DIE*> &Globals = TheCU->getGlobalTypes();
+    for (StringMap<DIE*>::const_iterator
+           GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
+      const char *Name = GI->getKeyData();
+      DIE *Entity = GI->second;
+
+      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
+      Asm->EmitInt32(Entity->getOffset());
+
+      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("External Name");
+      // Emit the name with a terminating null byte.
+      Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1));
+    }
+
+    Asm->OutStreamer.AddComment("End Mark");
+    Asm->EmitInt32(0);
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_end",
+                                                  TheCU->getUniqueID()));
+  }
+}
+
+// Emit strings into a string section.
+void DwarfUnits::emitStrings(const MCSection *StrSection,
+                             const MCSection *OffsetSection = NULL,
+                             const MCSymbol *StrSecSym = NULL) {
+
+  if (StringPool.empty()) return;
+
+  // Start the dwarf str section.
+  Asm->OutStreamer.SwitchSection(StrSection);
+
+  // Get all of the string pool entries and put them in an array by their ID so
+  // we can sort them.
+  SmallVector<std::pair<unsigned,
+                 StringMapEntry<std::pair<MCSymbol*, unsigned> >*>, 64> Entries;
+
+  for (StringMap<std::pair<MCSymbol*, unsigned> >::iterator
+         I = StringPool.begin(), E = StringPool.end();
+       I != E; ++I)
+    Entries.push_back(std::make_pair(I->second.second, &*I));
+
+  array_pod_sort(Entries.begin(), Entries.end());
+
+  for (unsigned i = 0, e = Entries.size(); i != e; ++i) {
+    // Emit a label for reference from debug information entries.
+    Asm->OutStreamer.EmitLabel(Entries[i].second->getValue().first);
+
+    // Emit the string itself with a terminating null byte.
+    Asm->OutStreamer.EmitBytes(StringRef(Entries[i].second->getKeyData(),
+                                         Entries[i].second->getKeyLength()+1));
+  }
+
+  // If we've got an offset section go ahead and emit that now as well.
+  if (OffsetSection) {
+    Asm->OutStreamer.SwitchSection(OffsetSection);
+    unsigned offset = 0;
+    unsigned size = 4; // FIXME: DWARF64 is 8.
+    for (unsigned i = 0, e = Entries.size(); i != e; ++i) {
+      Asm->OutStreamer.EmitIntValue(offset, size);
+      offset += Entries[i].second->getKeyLength() + 1;
+    }
+  }
+}
+
+// Emit strings into a string section.
+void DwarfUnits::emitAddresses(const MCSection *AddrSection) {
+
+  if (AddressPool.empty()) return;
+
+  // Start the dwarf addr section.
+  Asm->OutStreamer.SwitchSection(AddrSection);
+
+  // Get all of the string pool entries and put them in an array by their ID so
+  // we can sort them.
+  SmallVector<std::pair<unsigned,
+                        std::pair<MCSymbol*, unsigned>* >, 64> Entries;
+
+  for (DenseMap<MCSymbol*, std::pair<MCSymbol*, unsigned> >::iterator
+         I = AddressPool.begin(), E = AddressPool.end();
+       I != E; ++I)
+    Entries.push_back(std::make_pair(I->second.second, &(I->second)));
+
+  array_pod_sort(Entries.begin(), Entries.end());
+
+  for (unsigned i = 0, e = Entries.size(); i != e; ++i) {
+    // Emit a label for reference from debug information entries.
+    MCSymbol *Sym = Entries[i].second->first;
+    if (Sym)
+      Asm->EmitLabelReference(Entries[i].second->first,
+                              Asm->getDataLayout().getPointerSize());
+    else
+      Asm->OutStreamer.EmitIntValue(0, Asm->getDataLayout().getPointerSize());
+  }
+
+}
+
+// Emit visible names into a debug str section.
+void DwarfDebug::emitDebugStr() {
+  DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+  Holder.emitStrings(Asm->getObjFileLowering().getDwarfStrSection());
+}
+
+// Emit visible names into a debug loc section.
+void DwarfDebug::emitDebugLoc() {
+  if (DotDebugLocEntries.empty())
+    return;
+
+  for (SmallVectorImpl<DotDebugLocEntry>::iterator
+         I = DotDebugLocEntries.begin(), E = DotDebugLocEntries.end();
+       I != E; ++I) {
+    DotDebugLocEntry &Entry = *I;
+    if (I + 1 != DotDebugLocEntries.end())
+      Entry.Merge(I+1);
+  }
+
+  // Start the dwarf loc section.
+  Asm->OutStreamer.SwitchSection(
+    Asm->getObjFileLowering().getDwarfLocSection());
+  unsigned char Size = Asm->getDataLayout().getPointerSize();
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_loc", 0));
+  unsigned index = 1;
+  for (SmallVectorImpl<DotDebugLocEntry>::iterator
+         I = DotDebugLocEntries.begin(), E = DotDebugLocEntries.end();
+       I != E; ++I, ++index) {
+    DotDebugLocEntry &Entry = *I;
+    if (Entry.isMerged()) continue;
+    if (Entry.isEmpty()) {
+      Asm->OutStreamer.EmitIntValue(0, Size);
+      Asm->OutStreamer.EmitIntValue(0, Size);
+      Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_loc", index));
+    } else {
+      Asm->OutStreamer.EmitSymbolValue(Entry.Begin, Size);
+      Asm->OutStreamer.EmitSymbolValue(Entry.End, Size);
+      DIVariable DV(Entry.Variable);
+      Asm->OutStreamer.AddComment("Loc expr size");
+      MCSymbol *begin = Asm->OutStreamer.getContext().CreateTempSymbol();
+      MCSymbol *end = Asm->OutStreamer.getContext().CreateTempSymbol();
+      Asm->EmitLabelDifference(end, begin, 2);
+      Asm->OutStreamer.EmitLabel(begin);
+      if (Entry.isInt()) {
+        DIBasicType BTy(DV.getType());
+        if (BTy.Verify() &&
+            (BTy.getEncoding()  == dwarf::DW_ATE_signed
+             || BTy.getEncoding() == dwarf::DW_ATE_signed_char)) {
+          Asm->OutStreamer.AddComment("DW_OP_consts");
+          Asm->EmitInt8(dwarf::DW_OP_consts);
+          Asm->EmitSLEB128(Entry.getInt());
+        } else {
+          Asm->OutStreamer.AddComment("DW_OP_constu");
+          Asm->EmitInt8(dwarf::DW_OP_constu);
+          Asm->EmitULEB128(Entry.getInt());
+        }
+      } else if (Entry.isLocation()) {
+        if (!DV.hasComplexAddress())
+          // Regular entry.
+          Asm->EmitDwarfRegOp(Entry.Loc);
+        else {
+          // Complex address entry.
+          unsigned N = DV.getNumAddrElements();
+          unsigned i = 0;
+          if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) {
+            if (Entry.Loc.getOffset()) {
+              i = 2;
+              Asm->EmitDwarfRegOp(Entry.Loc);
+              Asm->OutStreamer.AddComment("DW_OP_deref");
+              Asm->EmitInt8(dwarf::DW_OP_deref);
+              Asm->OutStreamer.AddComment("DW_OP_plus_uconst");
+              Asm->EmitInt8(dwarf::DW_OP_plus_uconst);
+              Asm->EmitSLEB128(DV.getAddrElement(1));
+            } else {
+              // If first address element is OpPlus then emit
+              // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
+              MachineLocation Loc(Entry.Loc.getReg(), DV.getAddrElement(1));
+              Asm->EmitDwarfRegOp(Loc);
+              i = 2;
+            }
+          } else {
+            Asm->EmitDwarfRegOp(Entry.Loc);
+          }
+
+          // Emit remaining complex address elements.
+          for (; i < N; ++i) {
+            uint64_t Element = DV.getAddrElement(i);
+            if (Element == DIBuilder::OpPlus) {
+              Asm->EmitInt8(dwarf::DW_OP_plus_uconst);
+              Asm->EmitULEB128(DV.getAddrElement(++i));
+            } else if (Element == DIBuilder::OpDeref) {
+              if (!Entry.Loc.isReg())
+                Asm->EmitInt8(dwarf::DW_OP_deref);
+            } else
+              llvm_unreachable("unknown Opcode found in complex address");
+          }
+        }
+      }
+      // else ... ignore constant fp. There is not any good way to
+      // to represent them here in dwarf.
+      Asm->OutStreamer.EmitLabel(end);
+    }
+  }
+}
+
+// Emit visible names into a debug aranges section.
+void DwarfDebug::emitDebugARanges() {
+  // Start the dwarf aranges section.
+  Asm->OutStreamer.SwitchSection(
+                          Asm->getObjFileLowering().getDwarfARangesSection());
+}
+
+// Emit visible names into a debug ranges section.
+void DwarfDebug::emitDebugRanges() {
+  // Start the dwarf ranges section.
+  Asm->OutStreamer.SwitchSection(
+    Asm->getObjFileLowering().getDwarfRangesSection());
+  unsigned char Size = Asm->getDataLayout().getPointerSize();
+  for (SmallVectorImpl<const MCSymbol *>::iterator
+         I = DebugRangeSymbols.begin(), E = DebugRangeSymbols.end();
+       I != E; ++I) {
+    if (*I)
+      Asm->OutStreamer.EmitSymbolValue(const_cast<MCSymbol*>(*I), Size);
+    else
+      Asm->OutStreamer.EmitIntValue(0, Size);
+  }
+}
+
+// Emit visible names into a debug macinfo section.
+void DwarfDebug::emitDebugMacInfo() {
+  if (const MCSection *LineInfo =
+      Asm->getObjFileLowering().getDwarfMacroInfoSection()) {
+    // Start the dwarf macinfo section.
+    Asm->OutStreamer.SwitchSection(LineInfo);
+  }
+}
+
+// Emit inline info using following format.
+// Section Header:
+// 1. length of section
+// 2. Dwarf version number
+// 3. address size.
+//
+// Entries (one "entry" for each function that was inlined):
+//
+// 1. offset into __debug_str section for MIPS linkage name, if exists;
+//   otherwise offset into __debug_str for regular function name.
+// 2. offset into __debug_str section for regular function name.
+// 3. an unsigned LEB128 number indicating the number of distinct inlining
+// instances for the function.
+//
+// The rest of the entry consists of a {die_offset, low_pc} pair for each
+// inlined instance; the die_offset points to the inlined_subroutine die in the
+// __debug_info section, and the low_pc is the starting address for the
+// inlining instance.
+void DwarfDebug::emitDebugInlineInfo() {
+  if (!Asm->MAI->doesDwarfUseInlineInfoSection())
+    return;
+
+  if (!FirstCU)
+    return;
+
+  Asm->OutStreamer.SwitchSection(
+                        Asm->getObjFileLowering().getDwarfDebugInlineSection());
+
+  Asm->OutStreamer.AddComment("Length of Debug Inlined Information Entry");
+  Asm->EmitLabelDifference(Asm->GetTempSymbol("debug_inlined_end", 1),
+                           Asm->GetTempSymbol("debug_inlined_begin", 1), 4);
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_inlined_begin", 1));
+
+  Asm->OutStreamer.AddComment("Dwarf Version");
+  Asm->EmitInt16(dwarf::DWARF_VERSION);
+  Asm->OutStreamer.AddComment("Address Size (in bytes)");
+  Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+
+  for (SmallVectorImpl<const MDNode *>::iterator I = InlinedSPNodes.begin(),
+         E = InlinedSPNodes.end(); I != E; ++I) {
+
+    const MDNode *Node = *I;
+    DenseMap<const MDNode *, SmallVector<InlineInfoLabels, 4> >::iterator II
+      = InlineInfo.find(Node);
+    SmallVectorImpl<InlineInfoLabels> &Labels = II->second;
+    DISubprogram SP(Node);
+    StringRef LName = SP.getLinkageName();
+    StringRef Name = SP.getName();
+
+    Asm->OutStreamer.AddComment("MIPS linkage name");
+    if (LName.empty())
+      Asm->EmitSectionOffset(InfoHolder.getStringPoolEntry(Name),
+                             DwarfStrSectionSym);
+    else
+      Asm->EmitSectionOffset(InfoHolder
+                             .getStringPoolEntry(getRealLinkageName(LName)),
+                             DwarfStrSectionSym);
+
+    Asm->OutStreamer.AddComment("Function name");
+    Asm->EmitSectionOffset(InfoHolder.getStringPoolEntry(Name),
+                           DwarfStrSectionSym);
+    Asm->EmitULEB128(Labels.size(), "Inline count");
+
+    for (SmallVectorImpl<InlineInfoLabels>::iterator LI = Labels.begin(),
+           LE = Labels.end(); LI != LE; ++LI) {
+      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
+      Asm->EmitInt32(LI->second->getOffset());
+
+      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("low_pc");
+      Asm->OutStreamer.EmitSymbolValue(LI->first,
+                                       Asm->getDataLayout().getPointerSize());
+    }
+  }
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_inlined_end", 1));
+}
+
+// DWARF5 Experimental Separate Dwarf emitters.
+
+// This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list,
+// DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id,
+// DW_AT_ranges_base, DW_AT_addr_base. If DW_AT_ranges is present,
+// DW_AT_low_pc and DW_AT_high_pc are not used, and vice versa.
+CompileUnit *DwarfDebug::constructSkeletonCU(const MDNode *N) {
+  DICompileUnit DIUnit(N);
+  CompilationDir = DIUnit.getDirectory();
+
+  DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
+  CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++,
+                                       DIUnit.getLanguage(), Die, Asm,
+                                       this, &SkeletonHolder);
+
+  NewCU->addLocalString(Die, dwarf::DW_AT_GNU_dwo_name,
+                        DIUnit.getSplitDebugFilename());
+
+  // This should be a unique identifier when we want to build .dwp files.
+  NewCU->addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, 0);
+
+  // FIXME: The addr base should be relative for each compile unit, however,
+  // this one is going to be 0 anyhow.
+  NewCU->addUInt(Die, dwarf::DW_AT_GNU_addr_base, dwarf::DW_FORM_sec_offset, 0);
+
+  // 2.17.1 requires that we use DW_AT_low_pc for a single entry point
+  // into an entity. We're using 0, or a NULL label for this.
+  NewCU->addUInt(Die, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
+
+  // DW_AT_stmt_list is a offset of line number information for this
+  // compile unit in debug_line section.
+  if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+    NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_sec_offset,
+                    DwarfLineSectionSym);
+  else
+    NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_sec_offset, 0);
+
+  if (!CompilationDir.empty())
+    NewCU->addLocalString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+
+  SkeletonHolder.addUnit(NewCU);
+  SkeletonCUs.push_back(NewCU);
+
+  return NewCU;
+}
+
+void DwarfDebug::emitSkeletonAbbrevs(const MCSection *Section) {
+  assert(useSplitDwarf() && "No split dwarf debug info?");
+  emitAbbrevs(Section, &SkeletonAbbrevs);
+}
+
+// Emit the .debug_info.dwo section for separated dwarf. This contains the
+// compile units that would normally be in debug_info.
+void DwarfDebug::emitDebugInfoDWO() {
+  assert(useSplitDwarf() && "No split dwarf debug info?");
+  InfoHolder.emitUnits(this, Asm->getObjFileLowering().getDwarfInfoDWOSection(),
+                       Asm->getObjFileLowering().getDwarfAbbrevDWOSection(),
+                       DwarfAbbrevDWOSectionSym);
+}
+
+// Emit the .debug_abbrev.dwo section for separated dwarf. This contains the
+// abbreviations for the .debug_info.dwo section.
+void DwarfDebug::emitDebugAbbrevDWO() {
+  assert(useSplitDwarf() && "No split dwarf?");
+  emitAbbrevs(Asm->getObjFileLowering().getDwarfAbbrevDWOSection(),
+              &Abbreviations);
+}
+
+// Emit the .debug_str.dwo section for separated dwarf. This contains the
+// string section and is identical in format to traditional .debug_str
+// sections.
+void DwarfDebug::emitDebugStrDWO() {
+  assert(useSplitDwarf() && "No split dwarf?");
+  const MCSection *OffSec = Asm->getObjFileLowering()
+                            .getDwarfStrOffDWOSection();
+  const MCSymbol *StrSym = DwarfStrSectionSym;
+  InfoHolder.emitStrings(Asm->getObjFileLowering().getDwarfStrDWOSection(),
+                         OffSec, StrSym);
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
new file mode 100644
index 000000000000..81e345e6281d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -0,0 +1,649 @@
+//===-- llvm/CodeGen/DwarfDebug.h - Dwarf Debug Framework ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CODEGEN_ASMPRINTER_DWARFDEBUG_H__
+#define CODEGEN_ASMPRINTER_DWARFDEBUG_H__
+
+#include "DIE.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DebugLoc.h"
+
+namespace llvm {
+
+class CompileUnit;
+class ConstantInt;
+class ConstantFP;
+class DbgVariable;
+class MachineFrameInfo;
+class MachineModuleInfo;
+class MachineOperand;
+class MCAsmInfo;
+class DIEAbbrev;
+class DIE;
+class DIEBlock;
+class DIEEntry;
+class DwarfDebug;
+
+//===----------------------------------------------------------------------===//
+/// \brief This class is used to record source line correspondence.
+class SrcLineInfo {
+  unsigned Line;                     // Source line number.
+  unsigned Column;                   // Source column.
+  unsigned SourceID;                 // Source ID number.
+  MCSymbol *Label;                   // Label in code ID number.
+public:
+  SrcLineInfo(unsigned L, unsigned C, unsigned S, MCSymbol *label)
+    : Line(L), Column(C), SourceID(S), Label(label) {}
+
+  // Accessors
+  unsigned getLine() const { return Line; }
+  unsigned getColumn() const { return Column; }
+  unsigned getSourceID() const { return SourceID; }
+  MCSymbol *getLabel() const { return Label; }
+};
+
+/// \brief This struct describes location entries emitted in the .debug_loc
+/// section.
+typedef struct DotDebugLocEntry {
+  const MCSymbol *Begin;
+  const MCSymbol *End;
+  MachineLocation Loc;
+  const MDNode *Variable;
+  bool Merged;
+  bool Constant;
+  enum EntryType {
+    E_Location,
+    E_Integer,
+    E_ConstantFP,
+    E_ConstantInt
+  };
+  enum EntryType EntryKind;
+
+  union {
+    int64_t Int;
+    const ConstantFP *CFP;
+    const ConstantInt *CIP;
+  } Constants;
+  DotDebugLocEntry()
+    : Begin(0), End(0), Variable(0), Merged(false),
+      Constant(false) { Constants.Int = 0;}
+  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, MachineLocation &L,
+                   const MDNode *V)
+    : Begin(B), End(E), Loc(L), Variable(V), Merged(false),
+      Constant(false) { Constants.Int = 0; EntryKind = E_Location; }
+  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, int64_t i)
+    : Begin(B), End(E), Variable(0), Merged(false),
+      Constant(true) { Constants.Int = i; EntryKind = E_Integer; }
+  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, const ConstantFP *FPtr)
+    : Begin(B), End(E), Variable(0), Merged(false),
+      Constant(true) { Constants.CFP = FPtr; EntryKind = E_ConstantFP; }
+  DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E,
+                   const ConstantInt *IPtr)
+    : Begin(B), End(E), Variable(0), Merged(false),
+      Constant(true) { Constants.CIP = IPtr; EntryKind = E_ConstantInt; }
+
+  /// \brief Empty entries are also used as a trigger to emit temp label. Such
+  /// labels are referenced is used to find debug_loc offset for a given DIE.
+  bool isEmpty() { return Begin == 0 && End == 0; }
+  bool isMerged() { return Merged; }
+  void Merge(DotDebugLocEntry *Next) {
+    if (!(Begin && Loc == Next->Loc && End == Next->Begin))
+      return;
+    Next->Begin = Begin;
+    Merged = true;
+  }
+  bool isLocation() const    { return EntryKind == E_Location; }
+  bool isInt() const         { return EntryKind == E_Integer; }
+  bool isConstantFP() const  { return EntryKind == E_ConstantFP; }
+  bool isConstantInt() const { return EntryKind == E_ConstantInt; }
+  int64_t getInt()                    { return Constants.Int; }
+  const ConstantFP *getConstantFP()   { return Constants.CFP; }
+  const ConstantInt *getConstantInt() { return Constants.CIP; }
+} DotDebugLocEntry;
+
+//===----------------------------------------------------------------------===//
+/// \brief This class is used to track local variable information.
+class DbgVariable {
+  DIVariable Var;                    // Variable Descriptor.
+  DIE *TheDIE;                       // Variable DIE.
+  unsigned DotDebugLocOffset;        // Offset in DotDebugLocEntries.
+  DbgVariable *AbsVar;               // Corresponding Abstract variable, if any.
+  const MachineInstr *MInsn;         // DBG_VALUE instruction of the variable.
+  int FrameIndex;
+public:
+  // AbsVar may be NULL.
+  DbgVariable(DIVariable V, DbgVariable *AV)
+    : Var(V), TheDIE(0), DotDebugLocOffset(~0U), AbsVar(AV), MInsn(0),
+      FrameIndex(~0) {}
+
+  // Accessors.
+  DIVariable getVariable()           const { return Var; }
+  void setDIE(DIE *D)                      { TheDIE = D; }
+  DIE *getDIE()                      const { return TheDIE; }
+  void setDotDebugLocOffset(unsigned O)    { DotDebugLocOffset = O; }
+  unsigned getDotDebugLocOffset()    const { return DotDebugLocOffset; }
+  StringRef getName()                const { return Var.getName(); }
+  DbgVariable *getAbstractVariable() const { return AbsVar; }
+  const MachineInstr *getMInsn()     const { return MInsn; }
+  void setMInsn(const MachineInstr *M)     { MInsn = M; }
+  int getFrameIndex()                const { return FrameIndex; }
+  void setFrameIndex(int FI)               { FrameIndex = FI; }
+  // Translate tag to proper Dwarf tag.
+  unsigned getTag()                  const {
+    if (Var.getTag() == dwarf::DW_TAG_arg_variable)
+      return dwarf::DW_TAG_formal_parameter;
+
+    return dwarf::DW_TAG_variable;
+  }
+  /// \brief Return true if DbgVariable is artificial.
+  bool isArtificial()                const {
+    if (Var.isArtificial())
+      return true;
+    if (getType().isArtificial())
+      return true;
+    return false;
+  }
+
+  bool isObjectPointer()             const {
+    if (Var.isObjectPointer())
+      return true;
+    if (getType().isObjectPointer())
+      return true;
+    return false;
+  }
+
+  bool variableHasComplexAddress()   const {
+    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    return Var.hasComplexAddress();
+  }
+  bool isBlockByrefVariable()        const {
+    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    return Var.isBlockByrefVariable();
+  }
+  unsigned getNumAddrElements()      const {
+    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    return Var.getNumAddrElements();
+  }
+  uint64_t getAddrElement(unsigned i) const {
+    return Var.getAddrElement(i);
+  }
+  DIType getType() const;
+};
+
+
+// A String->Symbol mapping of strings used by indirect
+// references.
+typedef StringMap<std::pair<MCSymbol*, unsigned>,
+                  BumpPtrAllocator&> StrPool;
+
+// A Symbol->pair<Symbol, unsigned> mapping of addresses used by indirect
+// references.
+typedef DenseMap<MCSymbol *, std::pair<MCSymbol *, unsigned> > AddrPool;
+
+/// \brief Collects and handles information specific to a particular
+/// collection of units.
+class DwarfUnits {
+  // Target of Dwarf emission, used for sizing of abbreviations.
+  AsmPrinter *Asm;
+
+  // Used to uniquely define abbreviations.
+  FoldingSet<DIEAbbrev> *AbbreviationsSet;
+
+  // A list of all the unique abbreviations in use.
+  std::vector<DIEAbbrev *> *Abbreviations;
+
+  // A pointer to all units in the section.
+  SmallVector<CompileUnit *, 1> CUs;
+
+  // Collection of strings for this unit and assorted symbols.
+  StrPool StringPool;
+  unsigned NextStringPoolNumber;
+  std::string StringPref;
+
+  // Collection of addresses for this unit and assorted labels.
+  AddrPool AddressPool;
+  unsigned NextAddrPoolNumber;
+
+public:
+  DwarfUnits(AsmPrinter *AP, FoldingSet<DIEAbbrev> *AS,
+             std::vector<DIEAbbrev *> *A, const char *Pref,
+             BumpPtrAllocator &DA) :
+    Asm(AP), AbbreviationsSet(AS), Abbreviations(A),
+    StringPool(DA), NextStringPoolNumber(0), StringPref(Pref),
+    AddressPool(), NextAddrPoolNumber(0) {}
+
+  /// \brief Compute the size and offset of a DIE given an incoming Offset.
+  unsigned computeSizeAndOffset(DIE *Die, unsigned Offset);
+
+  /// \brief Compute the size and offset of all the DIEs.
+  void computeSizeAndOffsets();
+
+  /// \brief Define a unique number for the abbreviation.
+  void assignAbbrevNumber(DIEAbbrev &Abbrev);
+
+  /// \brief Add a unit to the list of CUs.
+  void addUnit(CompileUnit *CU) { CUs.push_back(CU); }
+
+  /// \brief Emit all of the units to the section listed with the given
+  /// abbreviation section.
+  void emitUnits(DwarfDebug *, const MCSection *, const MCSection *,
+                 const MCSymbol *);
+
+  /// \brief Emit all of the strings to the section given.
+  void emitStrings(const MCSection *, const MCSection *, const MCSymbol *);
+
+  /// \brief Emit all of the addresses to the section given.
+  void emitAddresses(const MCSection *);
+
+  /// \brief Returns the entry into the start of the pool.
+  MCSymbol *getStringPoolSym();
+
+  /// \brief Returns an entry into the string pool with the given
+  /// string text.
+  MCSymbol *getStringPoolEntry(StringRef Str);
+
+  /// \brief Returns the index into the string pool with the given
+  /// string text.
+  unsigned getStringPoolIndex(StringRef Str);
+
+  /// \brief Returns the string pool.
+  StrPool *getStringPool() { return &StringPool; }
+
+  /// \brief Returns the index into the address pool with the given
+  /// label/symbol.
+  unsigned getAddrPoolIndex(MCSymbol *);
+
+  /// \brief Returns the address pool.
+  AddrPool *getAddrPool() { return &AddressPool; }
+
+  /// \brief for a given compile unit DIE, returns offset from beginning of
+  /// debug info.
+  unsigned getCUOffset(DIE *Die);
+};
+
+/// \brief Collects and handles dwarf debug information.
+class DwarfDebug {
+  // Target of Dwarf emission.
+  AsmPrinter *Asm;
+
+  // Collected machine module information.
+  MachineModuleInfo *MMI;
+
+  // All DIEValues are allocated through this allocator.
+  BumpPtrAllocator DIEValueAllocator;
+
+  //===--------------------------------------------------------------------===//
+  // Attribute used to construct specific Dwarf sections.
+  //
+
+  CompileUnit *FirstCU;
+
+  // Maps MDNode with its corresponding CompileUnit.
+  DenseMap <const MDNode *, CompileUnit *> CUMap;
+
+  // Maps subprogram MDNode with its corresponding CompileUnit.
+  DenseMap <const MDNode *, CompileUnit *> SPMap;
+
+  // Used to uniquely define abbreviations.
+  FoldingSet<DIEAbbrev> AbbreviationsSet;
+
+  // A list of all the unique abbreviations in use.
+  std::vector<DIEAbbrev *> Abbreviations;
+
+  // Stores the current file ID for a given compile unit.
+  DenseMap <unsigned, unsigned> FileIDCUMap;
+  // Source id map, i.e. CUID, source filename and directory,
+  // separated by a zero byte, mapped to a unique id.
+  StringMap<unsigned, BumpPtrAllocator&> SourceIdMap;
+
+  // Provides a unique id per text section.
+  SetVector<const MCSection*> SectionMap;
+
+  // List of Arguments (DbgValues) for current function.
+  SmallVector<DbgVariable *, 8> CurrentFnArguments;
+
+  LexicalScopes LScopes;
+
+  // Collection of abstract subprogram DIEs.
+  DenseMap<const MDNode *, DIE *> AbstractSPDies;
+
+  // Collection of dbg variables of a scope.
+  DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> > ScopeVariables;
+
+  // Collection of abstract variables.
+  DenseMap<const MDNode *, DbgVariable *> AbstractVariables;
+
+  // Collection of DotDebugLocEntry.
+  SmallVector<DotDebugLocEntry, 4> DotDebugLocEntries;
+
+  // Collection of subprogram DIEs that are marked (at the end of the module)
+  // as DW_AT_inline.
+  SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs;
+
+  // Keep track of inlined functions and their location.  This
+  // information is used to populate the debug_inlined section.
+  typedef std::pair<const MCSymbol *, DIE *> InlineInfoLabels;
+  DenseMap<const MDNode *, SmallVector<InlineInfoLabels, 4> > InlineInfo;
+  SmallVector<const MDNode *, 4> InlinedSPNodes;
+
+  // This is a collection of subprogram MDNodes that are processed to
+  // create DIEs.
+  SmallPtrSet<const MDNode *, 16> ProcessedSPNodes;
+
+  // Maps instruction with label emitted before instruction.
+  DenseMap<const MachineInstr *, MCSymbol *> LabelsBeforeInsn;
+
+  // Maps instruction with label emitted after instruction.
+  DenseMap<const MachineInstr *, MCSymbol *> LabelsAfterInsn;
+
+  // Every user variable mentioned by a DBG_VALUE instruction in order of
+  // appearance.
+  SmallVector<const MDNode*, 8> UserVariables;
+
+  // For each user variable, keep a list of DBG_VALUE instructions in order.
+  // The list can also contain normal instructions that clobber the previous
+  // DBG_VALUE.
+  typedef DenseMap<const MDNode*, SmallVector<const MachineInstr*, 4> >
+    DbgValueHistoryMap;
+  DbgValueHistoryMap DbgValues;
+
+  SmallVector<const MCSymbol *, 8> DebugRangeSymbols;
+
+  // Previous instruction's location information. This is used to determine
+  // label location to indicate scope boundries in dwarf debug info.
+  DebugLoc PrevInstLoc;
+  MCSymbol *PrevLabel;
+
+  // This location indicates end of function prologue and beginning of function
+  // body.
+  DebugLoc PrologEndLoc;
+
+  struct FunctionDebugFrameInfo {
+    unsigned Number;
+    std::vector<MachineMove> Moves;
+
+    FunctionDebugFrameInfo(unsigned Num, const std::vector<MachineMove> &M)
+      : Number(Num), Moves(M) {}
+  };
+
+  std::vector<FunctionDebugFrameInfo> DebugFrames;
+
+  // Section Symbols: these are assembler temporary labels that are emitted at
+  // the beginning of each supported dwarf section.  These are used to form
+  // section offsets and are created by EmitSectionLabels.
+  MCSymbol *DwarfInfoSectionSym, *DwarfAbbrevSectionSym;
+  MCSymbol *DwarfStrSectionSym, *TextSectionSym, *DwarfDebugRangeSectionSym;
+  MCSymbol *DwarfDebugLocSectionSym, *DwarfLineSectionSym;
+  MCSymbol *FunctionBeginSym, *FunctionEndSym;
+  MCSymbol *DwarfAbbrevDWOSectionSym, *DwarfStrDWOSectionSym;
+
+  // As an optimization, there is no need to emit an entry in the directory
+  // table for the same directory as DW_at_comp_dir.
+  StringRef CompilationDir;
+
+  // Counter for assigning globally unique IDs for CUs.
+  unsigned GlobalCUIndexCount;
+
+  // Holder for the file specific debug information.
+  DwarfUnits InfoHolder;
+
+  // Holders for the various debug information flags that we might need to
+  // have exposed. See accessor functions below for description.
+
+  // Whether or not we're emitting info for older versions of gdb on darwin.
+  bool IsDarwinGDBCompat;
+
+  // DWARF5 Experimental Options
+  bool HasDwarfAccelTables;
+  bool HasSplitDwarf;
+
+  // Separated Dwarf Variables
+  // In general these will all be for bits that are left in the
+  // original object file, rather than things that are meant
+  // to be in the .dwo sections.
+
+  // The CUs left in the original object file for separated debug info.
+  SmallVector<CompileUnit *, 1> SkeletonCUs;
+
+  // Used to uniquely define abbreviations for the skeleton emission.
+  FoldingSet<DIEAbbrev> SkeletonAbbrevSet;
+
+  // A list of all the unique abbreviations in use.
+  std::vector<DIEAbbrev *> SkeletonAbbrevs;
+
+  // Holder for the skeleton information.
+  DwarfUnits SkeletonHolder;
+
+private:
+
+  void addScopeVariable(LexicalScope *LS, DbgVariable *Var);
+
+  /// \brief Find abstract variable associated with Var.
+  DbgVariable *findAbstractVariable(DIVariable &Var, DebugLoc Loc);
+
+  /// \brief Find DIE for the given subprogram and attach appropriate
+  /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
+  /// variables in this scope then create and insert DIEs for these
+  /// variables.
+  DIE *updateSubprogramScopeDIE(CompileUnit *SPCU, const MDNode *SPNode);
+
+  /// \brief Construct new DW_TAG_lexical_block for this scope and
+  /// attach DW_AT_low_pc/DW_AT_high_pc labels.
+  DIE *constructLexicalScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
+
+  /// \brief This scope represents inlined body of a function. Construct
+  /// DIE to represent this concrete inlined copy of the function.
+  DIE *constructInlinedScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
+
+  /// \brief Construct a DIE for this scope.
+  DIE *constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
+
+  /// \brief Emit initial Dwarf sections with a label at the start of each one.
+  void emitSectionLabels();
+
+  /// \brief Compute the size and offset of a DIE given an incoming Offset.
+  unsigned computeSizeAndOffset(DIE *Die, unsigned Offset);
+
+  /// \brief Compute the size and offset of all the DIEs.
+  void computeSizeAndOffsets();
+
+  /// \brief Attach DW_AT_inline attribute with inlined subprogram DIEs.
+  void computeInlinedDIEs();
+
+  /// \brief Collect info for variables that were optimized out.
+  void collectDeadVariables();
+
+  /// \brief Finish off debug information after all functions have been
+  /// processed.
+  void finalizeModuleInfo();
+
+  /// \brief Emit labels to close any remaining sections that have been left
+  /// open.
+  void endSections();
+
+  /// \brief Emit a set of abbreviations to the specific section.
+  void emitAbbrevs(const MCSection *, std::vector<DIEAbbrev*> *);
+
+  /// \brief Emit the debug info section.
+  void emitDebugInfo();
+
+  /// \brief Emit the abbreviation section.
+  void emitAbbreviations();
+
+  /// \brief Emit the last address of the section and the end of
+  /// the line matrix.
+  void emitEndOfLineMatrix(unsigned SectionEnd);
+
+  /// \brief Emit visible names into a hashed accelerator table section.
+  void emitAccelNames();
+
+  /// \brief Emit objective C classes and categories into a hashed
+  /// accelerator table section.
+  void emitAccelObjC();
+
+  /// \brief Emit namespace dies into a hashed accelerator table.
+  void emitAccelNamespaces();
+
+  /// \brief Emit type dies into a hashed accelerator table.
+  void emitAccelTypes();
+
+  /// \brief Emit visible names into a debug pubnames section.
+  void emitDebugPubnames();
+
+  /// \brief Emit visible types into a debug pubtypes section.
+  void emitDebugPubTypes();
+
+  /// \brief Emit visible names into a debug str section.
+  void emitDebugStr();
+
+  /// \brief Emit visible names into a debug loc section.
+  void emitDebugLoc();
+
+  /// \brief Emit visible names into a debug aranges section.
+  void emitDebugARanges();
+
+  /// \brief Emit visible names into a debug ranges section.
+  void emitDebugRanges();
+
+  /// \brief Emit visible names into a debug macinfo section.
+  void emitDebugMacInfo();
+
+  /// \brief Emit inline info using custom format.
+  void emitDebugInlineInfo();
+
+  /// DWARF 5 Experimental Split Dwarf Emitters
+
+  /// \brief Construct the split debug info compile unit for the debug info
+  /// section.
+  CompileUnit *constructSkeletonCU(const MDNode *);
+
+  /// \brief Emit the local split abbreviations.
+  void emitSkeletonAbbrevs(const MCSection *);
+
+  /// \brief Emit the debug info dwo section.
+  void emitDebugInfoDWO();
+
+  /// \brief Emit the debug abbrev dwo section.
+  void emitDebugAbbrevDWO();
+
+  /// \brief Emit the debug str dwo section.
+  void emitDebugStrDWO();
+
+  /// \brief Create new CompileUnit for the given metadata node with tag
+  /// DW_TAG_compile_unit.
+  CompileUnit *constructCompileUnit(const MDNode *N);
+
+  /// \brief Construct subprogram DIE.
+  void constructSubprogramDIE(CompileUnit *TheCU, const MDNode *N);
+
+  /// \brief Register a source line with debug info. Returns the unique
+  /// label that was emitted and which provides correspondence to the
+  /// source line list.
+  void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
+                        unsigned Flags);
+
+  /// \brief Indentify instructions that are marking the beginning of or
+  /// ending of a scope.
+  void identifyScopeMarkers();
+
+  /// \brief If Var is an current function argument that add it in
+  /// CurrentFnArguments list.
+  bool addCurrentFnArgument(const MachineFunction *MF,
+                            DbgVariable *Var, LexicalScope *Scope);
+
+  /// \brief Populate LexicalScope entries with variables' info.
+  void collectVariableInfo(const MachineFunction *,
+                           SmallPtrSet<const MDNode *, 16> &ProcessedVars);
+
+  /// \brief Collect variable information from the side table maintained
+  /// by MMI.
+  void collectVariableInfoFromMMITable(const MachineFunction * MF,
+                                       SmallPtrSet<const MDNode *, 16> &P);
+
+  /// \brief Ensure that a label will be emitted before MI.
+  void requestLabelBeforeInsn(const MachineInstr *MI) {
+    LabelsBeforeInsn.insert(std::make_pair(MI, (MCSymbol*)0));
+  }
+
+  /// \brief Return Label preceding the instruction.
+  MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
+
+  /// \brief Ensure that a label will be emitted after MI.
+  void requestLabelAfterInsn(const MachineInstr *MI) {
+    LabelsAfterInsn.insert(std::make_pair(MI, (MCSymbol*)0));
+  }
+
+  /// \brief Return Label immediately following the instruction.
+  MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
+
+public:
+  //===--------------------------------------------------------------------===//
+  // Main entry points.
+  //
+  DwarfDebug(AsmPrinter *A, Module *M);
+  ~DwarfDebug();
+
+  /// \brief Emit all Dwarf sections that should come prior to the
+  /// content.
+  void beginModule();
+
+  /// \brief Emit all Dwarf sections that should come after the content.
+  void endModule();
+
+  /// \brief Gather pre-function debug information.
+  void beginFunction(const MachineFunction *MF);
+
+  /// \brief Gather and emit post-function debug information.
+  void endFunction(const MachineFunction *MF);
+
+  /// \brief Process beginning of an instruction.
+  void beginInstruction(const MachineInstr *MI);
+
+  /// \brief Process end of an instruction.
+  void endInstruction(const MachineInstr *MI);
+
+  /// \brief Look up the source id with the given directory and source file
+  /// names. If none currently exists, create a new id and insert it in the
+  /// SourceIds map.
+  unsigned getOrCreateSourceID(StringRef DirName, StringRef FullName,
+                               unsigned CUID);
+
+  /// \brief Recursively Emits a debug information entry.
+  void emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs);
+
+  /// \brief Returns whether or not to limit some of our debug
+  /// output to the limitations of darwin gdb.
+  bool useDarwinGDBCompat() { return IsDarwinGDBCompat; }
+
+  // Experimental DWARF5 features.
+
+  /// \brief Returns whether or not to emit tables that dwarf consumers can
+  /// use to accelerate lookup.
+  bool useDwarfAccelTables() { return HasDwarfAccelTables; }
+
+  /// \brief Returns whether or not to change the current debug info for the
+  /// split dwarf proposal support.
+  bool useSplitDwarf() { return HasSplitDwarf; }
+};
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.cpp
new file mode 100644
index 000000000000..7133458129cc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.cpp
@@ -0,0 +1,736 @@
+//===-- CodeGen/AsmPrinter/DwarfException.cpp - Dwarf Exception Impl ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+DwarfException::DwarfException(AsmPrinter *A)
+  : Asm(A), MMI(Asm->MMI) {}
+
+DwarfException::~DwarfException() {}
+
+/// SharedTypeIds - How many leading type ids two landing pads have in common.
+unsigned DwarfException::SharedTypeIds(const LandingPadInfo *L,
+                                       const LandingPadInfo *R) {
+  const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+  unsigned LSize = LIds.size(), RSize = RIds.size();
+  unsigned MinSize = LSize < RSize ? LSize : RSize;
+  unsigned Count = 0;
+
+  for (; Count != MinSize; ++Count)
+    if (LIds[Count] != RIds[Count])
+      return Count;
+
+  return Count;
+}
+
+/// PadLT - Order landing pads lexicographically by type id.
+bool DwarfException::PadLT(const LandingPadInfo *L, const LandingPadInfo *R) {
+  const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+  unsigned LSize = LIds.size(), RSize = RIds.size();
+  unsigned MinSize = LSize < RSize ? LSize : RSize;
+
+  for (unsigned i = 0; i != MinSize; ++i)
+    if (LIds[i] != RIds[i])
+      return LIds[i] < RIds[i];
+
+  return LSize < RSize;
+}
+
+/// ComputeActionsTable - Compute the actions table and gather the first action
+/// index for each landing pad site.
+unsigned DwarfException::
+ComputeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
+                    SmallVectorImpl<ActionEntry> &Actions,
+                    SmallVectorImpl<unsigned> &FirstActions) {
+
+  // The action table follows the call-site table in the LSDA. The individual
+  // records are of two types:
+  //
+  //   * Catch clause
+  //   * Exception specification
+  //
+  // The two record kinds have the same format, with only small differences.
+  // They are distinguished by the "switch value" field: Catch clauses
+  // (TypeInfos) have strictly positive switch values, and exception
+  // specifications (FilterIds) have strictly negative switch values. Value 0
+  // indicates a catch-all clause.
+  //
+  // Negative type IDs index into FilterIds. Positive type IDs index into
+  // TypeInfos.  The value written for a positive type ID is just the type ID
+  // itself.  For a negative type ID, however, the value written is the
+  // (negative) byte offset of the corresponding FilterIds entry.  The byte
+  // offset is usually equal to the type ID (because the FilterIds entries are
+  // written using a variable width encoding, which outputs one byte per entry
+  // as long as the value written is not too large) but can differ.  This kind
+  // of complication does not occur for positive type IDs because type infos are
+  // output using a fixed width encoding.  FilterOffsets[i] holds the byte
+  // offset corresponding to FilterIds[i].
+
+  const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+  SmallVector<int, 16> FilterOffsets;
+  FilterOffsets.reserve(FilterIds.size());
+  int Offset = -1;
+
+  for (std::vector<unsigned>::const_iterator
+         I = FilterIds.begin(), E = FilterIds.end(); I != E; ++I) {
+    FilterOffsets.push_back(Offset);
+    Offset -= MCAsmInfo::getULEB128Size(*I);
+  }
+
+  FirstActions.reserve(LandingPads.size());
+
+  int FirstAction = 0;
+  unsigned SizeActions = 0;
+  const LandingPadInfo *PrevLPI = 0;
+
+  for (SmallVectorImpl<const LandingPadInfo *>::const_iterator
+         I = LandingPads.begin(), E = LandingPads.end(); I != E; ++I) {
+    const LandingPadInfo *LPI = *I;
+    const std::vector<int> &TypeIds = LPI->TypeIds;
+    unsigned NumShared = PrevLPI ? SharedTypeIds(LPI, PrevLPI) : 0;
+    unsigned SizeSiteActions = 0;
+
+    if (NumShared < TypeIds.size()) {
+      unsigned SizeAction = 0;
+      unsigned PrevAction = (unsigned)-1;
+
+      if (NumShared) {
+        unsigned SizePrevIds = PrevLPI->TypeIds.size();
+        assert(Actions.size());
+        PrevAction = Actions.size() - 1;
+        SizeAction =
+          MCAsmInfo::getSLEB128Size(Actions[PrevAction].NextAction) +
+          MCAsmInfo::getSLEB128Size(Actions[PrevAction].ValueForTypeID);
+
+        for (unsigned j = NumShared; j != SizePrevIds; ++j) {
+          assert(PrevAction != (unsigned)-1 && "PrevAction is invalid!");
+          SizeAction -=
+            MCAsmInfo::getSLEB128Size(Actions[PrevAction].ValueForTypeID);
+          SizeAction += -Actions[PrevAction].NextAction;
+          PrevAction = Actions[PrevAction].Previous;
+        }
+      }
+
+      // Compute the actions.
+      for (unsigned J = NumShared, M = TypeIds.size(); J != M; ++J) {
+        int TypeID = TypeIds[J];
+        assert(-1 - TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
+        int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID;
+        unsigned SizeTypeID = MCAsmInfo::getSLEB128Size(ValueForTypeID);
+
+        int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0;
+        SizeAction = SizeTypeID + MCAsmInfo::getSLEB128Size(NextAction);
+        SizeSiteActions += SizeAction;
+
+        ActionEntry Action = { ValueForTypeID, NextAction, PrevAction };
+        Actions.push_back(Action);
+        PrevAction = Actions.size() - 1;
+      }
+
+      // Record the first action of the landing pad site.
+      FirstAction = SizeActions + SizeSiteActions - SizeAction + 1;
+    } // else identical - re-use previous FirstAction
+
+    // Information used when created the call-site table. The action record
+    // field of the call site record is the offset of the first associated
+    // action record, relative to the start of the actions table. This value is
+    // biased by 1 (1 indicating the start of the actions table), and 0
+    // indicates that there are no actions.
+    FirstActions.push_back(FirstAction);
+
+    // Compute this sites contribution to size.
+    SizeActions += SizeSiteActions;
+
+    PrevLPI = LPI;
+  }
+
+  return SizeActions;
+}
+
+/// CallToNoUnwindFunction - Return `true' if this is a call to a function
+/// marked `nounwind'. Return `false' otherwise.
+bool DwarfException::CallToNoUnwindFunction(const MachineInstr *MI) {
+  assert(MI->isCall() && "This should be a call instruction!");
+
+  bool MarkedNoUnwind = false;
+  bool SawFunc = false;
+
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+
+    if (!MO.isGlobal()) continue;
+
+    const Function *F = dyn_cast<Function>(MO.getGlobal());
+    if (F == 0) continue;
+
+    if (SawFunc) {
+      // Be conservative. If we have more than one function operand for this
+      // call, then we can't make the assumption that it's the callee and
+      // not a parameter to the call.
+      //
+      // FIXME: Determine if there's a way to say that `F' is the callee or
+      // parameter.
+      MarkedNoUnwind = false;
+      break;
+    }
+
+    MarkedNoUnwind = F->doesNotThrow();
+    SawFunc = true;
+  }
+
+  return MarkedNoUnwind;
+}
+
+/// ComputeCallSiteTable - Compute the call-site table.  The entry for an invoke
+/// has a try-range containing the call, a non-zero landing pad, and an
+/// appropriate action.  The entry for an ordinary call has a try-range
+/// containing the call and zero for the landing pad and the action.  Calls
+/// marked 'nounwind' have no entry and must not be contained in the try-range
+/// of any entry - they form gaps in the table.  Entries must be ordered by
+/// try-range address.
+void DwarfException::
+ComputeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
+                     const RangeMapType &PadMap,
+                     const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+                     const SmallVectorImpl<unsigned> &FirstActions) {
+  // The end label of the previous invoke or nounwind try-range.
+  MCSymbol *LastLabel = 0;
+
+  // Whether there is a potentially throwing instruction (currently this means
+  // an ordinary call) between the end of the previous try-range and now.
+  bool SawPotentiallyThrowing = false;
+
+  // Whether the last CallSite entry was for an invoke.
+  bool PreviousIsInvoke = false;
+
+  // Visit all instructions in order of address.
+  for (MachineFunction::const_iterator I = Asm->MF->begin(), E = Asm->MF->end();
+       I != E; ++I) {
+    for (MachineBasicBlock::const_iterator MI = I->begin(), E = I->end();
+         MI != E; ++MI) {
+      if (!MI->isLabel()) {
+        if (MI->isCall())
+          SawPotentiallyThrowing |= !CallToNoUnwindFunction(MI);
+        continue;
+      }
+
+      // End of the previous try-range?
+      MCSymbol *BeginLabel = MI->getOperand(0).getMCSymbol();
+      if (BeginLabel == LastLabel)
+        SawPotentiallyThrowing = false;
+
+      // Beginning of a new try-range?
+      RangeMapType::const_iterator L = PadMap.find(BeginLabel);
+      if (L == PadMap.end())
+        // Nope, it was just some random label.
+        continue;
+
+      const PadRange &P = L->second;
+      const LandingPadInfo *LandingPad = LandingPads[P.PadIndex];
+      assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] &&
+             "Inconsistent landing pad map!");
+
+      // For Dwarf exception handling (SjLj handling doesn't use this). If some
+      // instruction between the previous try-range and this one may throw,
+      // create a call-site entry with no landing pad for the region between the
+      // try-ranges.
+      if (SawPotentiallyThrowing && Asm->MAI->isExceptionHandlingDwarf()) {
+        CallSiteEntry Site = { LastLabel, BeginLabel, 0, 0 };
+        CallSites.push_back(Site);
+        PreviousIsInvoke = false;
+      }
+
+      LastLabel = LandingPad->EndLabels[P.RangeIndex];
+      assert(BeginLabel && LastLabel && "Invalid landing pad!");
+
+      if (!LandingPad->LandingPadLabel) {
+        // Create a gap.
+        PreviousIsInvoke = false;
+      } else {
+        // This try-range is for an invoke.
+        CallSiteEntry Site = {
+          BeginLabel,
+          LastLabel,
+          LandingPad->LandingPadLabel,
+          FirstActions[P.PadIndex]
+        };
+
+        // Try to merge with the previous call-site. SJLJ doesn't do this
+        if (PreviousIsInvoke && Asm->MAI->isExceptionHandlingDwarf()) {
+          CallSiteEntry &Prev = CallSites.back();
+          if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
+            // Extend the range of the previous entry.
+            Prev.EndLabel = Site.EndLabel;
+            continue;
+          }
+        }
+
+        // Otherwise, create a new call-site.
+        if (Asm->MAI->isExceptionHandlingDwarf())
+          CallSites.push_back(Site);
+        else {
+          // SjLj EH must maintain the call sites in the order assigned
+          // to them by the SjLjPrepare pass.
+          unsigned SiteNo = MMI->getCallSiteBeginLabel(BeginLabel);
+          if (CallSites.size() < SiteNo)
+            CallSites.resize(SiteNo);
+          CallSites[SiteNo - 1] = Site;
+        }
+        PreviousIsInvoke = true;
+      }
+    }
+  }
+
+  // If some instruction between the previous try-range and the end of the
+  // function may throw, create a call-site entry with no landing pad for the
+  // region following the try-range.
+  if (SawPotentiallyThrowing && Asm->MAI->isExceptionHandlingDwarf()) {
+    CallSiteEntry Site = { LastLabel, 0, 0, 0 };
+    CallSites.push_back(Site);
+  }
+}
+
+/// EmitExceptionTable - Emit landing pads and actions.
+///
+/// The general organization of the table is complex, but the basic concepts are
+/// easy.  First there is a header which describes the location and organization
+/// of the three components that follow.
+///
+///  1. The landing pad site information describes the range of code covered by
+///     the try.  In our case it's an accumulation of the ranges covered by the
+///     invokes in the try.  There is also a reference to the landing pad that
+///     handles the exception once processed.  Finally an index into the actions
+///     table.
+///  2. The action table, in our case, is composed of pairs of type IDs and next
+///     action offset.  Starting with the action index from the landing pad
+///     site, each type ID is checked for a match to the current exception.  If
+///     it matches then the exception and type id are passed on to the landing
+///     pad.  Otherwise the next action is looked up.  This chain is terminated
+///     with a next action of zero.  If no type id is found then the frame is
+///     unwound and handling continues.
+///  3. Type ID table contains references to all the C++ typeinfo for all
+///     catches in the function.  This tables is reverse indexed base 1.
+void DwarfException::EmitExceptionTable() {
+  const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+  const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+  const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
+
+  // Sort the landing pads in order of their type ids.  This is used to fold
+  // duplicate actions.
+  SmallVector<const LandingPadInfo *, 64> LandingPads;
+  LandingPads.reserve(PadInfos.size());
+
+  for (unsigned i = 0, N = PadInfos.size(); i != N; ++i)
+    LandingPads.push_back(&PadInfos[i]);
+
+  std::sort(LandingPads.begin(), LandingPads.end(), PadLT);
+
+  // Compute the actions table and gather the first action index for each
+  // landing pad site.
+  SmallVector<ActionEntry, 32> Actions;
+  SmallVector<unsigned, 64> FirstActions;
+  unsigned SizeActions=ComputeActionsTable(LandingPads, Actions, FirstActions);
+
+  // Invokes and nounwind calls have entries in PadMap (due to being bracketed
+  // by try-range labels when lowered).  Ordinary calls do not, so appropriate
+  // try-ranges for them need be deduced when using DWARF exception handling.
+  RangeMapType PadMap;
+  for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+    const LandingPadInfo *LandingPad = LandingPads[i];
+    for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
+      MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
+      assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
+      PadRange P = { i, j };
+      PadMap[BeginLabel] = P;
+    }
+  }
+
+  // Compute the call-site table.
+  SmallVector<CallSiteEntry, 64> CallSites;
+  ComputeCallSiteTable(CallSites, PadMap, LandingPads, FirstActions);
+
+  // Final tallies.
+
+  // Call sites.
+  bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+  bool HaveTTData = IsSJLJ ? (!TypeInfos.empty() || !FilterIds.empty()) : true;
+
+  unsigned CallSiteTableLength;
+  if (IsSJLJ)
+    CallSiteTableLength = 0;
+  else {
+    unsigned SiteStartSize  = 4; // dwarf::DW_EH_PE_udata4
+    unsigned SiteLengthSize = 4; // dwarf::DW_EH_PE_udata4
+    unsigned LandingPadSize = 4; // dwarf::DW_EH_PE_udata4
+    CallSiteTableLength =
+      CallSites.size() * (SiteStartSize + SiteLengthSize + LandingPadSize);
+  }
+
+  for (unsigned i = 0, e = CallSites.size(); i < e; ++i) {
+    CallSiteTableLength += MCAsmInfo::getULEB128Size(CallSites[i].Action);
+    if (IsSJLJ)
+      CallSiteTableLength += MCAsmInfo::getULEB128Size(i);
+  }
+
+  // Type infos.
+  const MCSection *LSDASection = Asm->getObjFileLowering().getLSDASection();
+  unsigned TTypeEncoding;
+  unsigned TypeFormatSize;
+
+  if (!HaveTTData) {
+    // For SjLj exceptions, if there is no TypeInfo, then we just explicitly say
+    // that we're omitting that bit.
+    TTypeEncoding = dwarf::DW_EH_PE_omit;
+    // dwarf::DW_EH_PE_absptr
+    TypeFormatSize = Asm->getDataLayout().getPointerSize();
+  } else {
+    // Okay, we have actual filters or typeinfos to emit.  As such, we need to
+    // pick a type encoding for them.  We're about to emit a list of pointers to
+    // typeinfo objects at the end of the LSDA.  However, unless we're in static
+    // mode, this reference will require a relocation by the dynamic linker.
+    //
+    // Because of this, we have a couple of options:
+    //
+    //   1) If we are in -static mode, we can always use an absolute reference
+    //      from the LSDA, because the static linker will resolve it.
+    //
+    //   2) Otherwise, if the LSDA section is writable, we can output the direct
+    //      reference to the typeinfo and allow the dynamic linker to relocate
+    //      it.  Since it is in a writable section, the dynamic linker won't
+    //      have a problem.
+    //
+    //   3) Finally, if we're in PIC mode and the LDSA section isn't writable,
+    //      we need to use some form of indirection.  For example, on Darwin,
+    //      we can output a statically-relocatable reference to a dyld stub. The
+    //      offset to the stub is constant, but the contents are in a section
+    //      that is updated by the dynamic linker.  This is easy enough, but we
+    //      need to tell the personality function of the unwinder to indirect
+    //      through the dyld stub.
+    //
+    // FIXME: When (3) is actually implemented, we'll have to emit the stubs
+    // somewhere.  This predicate should be moved to a shared location that is
+    // in target-independent code.
+    //
+    TTypeEncoding = Asm->getObjFileLowering().getTTypeEncoding();
+    TypeFormatSize = Asm->GetSizeOfEncodedValue(TTypeEncoding);
+  }
+
+  // Begin the exception table.
+  // Sometimes we want not to emit the data into separate section (e.g. ARM
+  // EHABI). In this case LSDASection will be NULL.
+  if (LSDASection)
+    Asm->OutStreamer.SwitchSection(LSDASection);
+  Asm->EmitAlignment(2);
+
+  // Emit the LSDA.
+  MCSymbol *GCCETSym =
+    Asm->OutContext.GetOrCreateSymbol(Twine("GCC_except_table")+
+                                      Twine(Asm->getFunctionNumber()));
+  Asm->OutStreamer.EmitLabel(GCCETSym);
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("exception",
+                                                Asm->getFunctionNumber()));
+
+  if (IsSJLJ)
+    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("_LSDA_",
+                                                  Asm->getFunctionNumber()));
+
+  // Emit the LSDA header.
+  Asm->EmitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart");
+  Asm->EmitEncodingByte(TTypeEncoding, "@TType");
+
+  // The type infos need to be aligned. GCC does this by inserting padding just
+  // before the type infos. However, this changes the size of the exception
+  // table, so you need to take this into account when you output the exception
+  // table size. However, the size is output using a variable length encoding.
+  // So by increasing the size by inserting padding, you may increase the number
+  // of bytes used for writing the size. If it increases, say by one byte, then
+  // you now need to output one less byte of padding to get the type infos
+  // aligned. However this decreases the size of the exception table. This
+  // changes the value you have to output for the exception table size. Due to
+  // the variable length encoding, the number of bytes used for writing the
+  // length may decrease. If so, you then have to increase the amount of
+  // padding. And so on. If you look carefully at the GCC code you will see that
+  // it indeed does this in a loop, going on and on until the values stabilize.
+  // We chose another solution: don't output padding inside the table like GCC
+  // does, instead output it before the table.
+  unsigned SizeTypes = TypeInfos.size() * TypeFormatSize;
+  unsigned CallSiteTableLengthSize =
+    MCAsmInfo::getULEB128Size(CallSiteTableLength);
+  unsigned TTypeBaseOffset =
+    sizeof(int8_t) +                            // Call site format
+    CallSiteTableLengthSize +                   // Call site table length size
+    CallSiteTableLength +                       // Call site table length
+    SizeActions +                               // Actions size
+    SizeTypes;
+  unsigned TTypeBaseOffsetSize = MCAsmInfo::getULEB128Size(TTypeBaseOffset);
+  unsigned TotalSize =
+    sizeof(int8_t) +                            // LPStart format
+    sizeof(int8_t) +                            // TType format
+    (HaveTTData ? TTypeBaseOffsetSize : 0) +    // TType base offset size
+    TTypeBaseOffset;                            // TType base offset
+  unsigned SizeAlign = (4 - TotalSize) & 3;
+
+  if (HaveTTData) {
+    // Account for any extra padding that will be added to the call site table
+    // length.
+    Asm->EmitULEB128(TTypeBaseOffset, "@TType base offset", SizeAlign);
+    SizeAlign = 0;
+  }
+
+  bool VerboseAsm = Asm->OutStreamer.isVerboseAsm();
+
+  // SjLj Exception handling
+  if (IsSJLJ) {
+    Asm->EmitEncodingByte(dwarf::DW_EH_PE_udata4, "Call site");
+
+    // Add extra padding if it wasn't added to the TType base offset.
+    Asm->EmitULEB128(CallSiteTableLength, "Call site table length", SizeAlign);
+
+    // Emit the landing pad site information.
+    unsigned idx = 0;
+    for (SmallVectorImpl<CallSiteEntry>::const_iterator
+         I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) {
+      const CallSiteEntry &S = *I;
+
+      // Offset of the landing pad, counted in 16-byte bundles relative to the
+      // @LPStart address.
+      if (VerboseAsm) {
+        Asm->OutStreamer.AddComment(">> Call Site " + Twine(idx) + " <<");
+        Asm->OutStreamer.AddComment("  On exception at call site "+Twine(idx));
+      }
+      Asm->EmitULEB128(idx);
+
+      // Offset of the first associated action record, relative to the start of
+      // the action table. This value is biased by 1 (1 indicates the start of
+      // the action table), and 0 indicates that there are no actions.
+      if (VerboseAsm) {
+        if (S.Action == 0)
+          Asm->OutStreamer.AddComment("  Action: cleanup");
+        else
+          Asm->OutStreamer.AddComment("  Action: " +
+                                      Twine((S.Action - 1) / 2 + 1));
+      }
+      Asm->EmitULEB128(S.Action);
+    }
+  } else {
+    // DWARF Exception handling
+    assert(Asm->MAI->isExceptionHandlingDwarf());
+
+    // The call-site table is a list of all call sites that may throw an
+    // exception (including C++ 'throw' statements) in the procedure
+    // fragment. It immediately follows the LSDA header. Each entry indicates,
+    // for a given call, the first corresponding action record and corresponding
+    // landing pad.
+    //
+    // The table begins with the number of bytes, stored as an LEB128
+    // compressed, unsigned integer. The records immediately follow the record
+    // count. They are sorted in increasing call-site address. Each record
+    // indicates:
+    //
+    //   * The position of the call-site.
+    //   * The position of the landing pad.
+    //   * The first action record for that call site.
+    //
+    // A missing entry in the call-site table indicates that a call is not
+    // supposed to throw.
+
+    // Emit the landing pad call site table.
+    Asm->EmitEncodingByte(dwarf::DW_EH_PE_udata4, "Call site");
+
+    // Add extra padding if it wasn't added to the TType base offset.
+    Asm->EmitULEB128(CallSiteTableLength, "Call site table length", SizeAlign);
+
+    unsigned Entry = 0;
+    for (SmallVectorImpl<CallSiteEntry>::const_iterator
+         I = CallSites.begin(), E = CallSites.end(); I != E; ++I) {
+      const CallSiteEntry &S = *I;
+
+      MCSymbol *EHFuncBeginSym =
+        Asm->GetTempSymbol("eh_func_begin", Asm->getFunctionNumber());
+
+      MCSymbol *BeginLabel = S.BeginLabel;
+      if (BeginLabel == 0)
+        BeginLabel = EHFuncBeginSym;
+      MCSymbol *EndLabel = S.EndLabel;
+      if (EndLabel == 0)
+        EndLabel = Asm->GetTempSymbol("eh_func_end", Asm->getFunctionNumber());
+
+
+      // Offset of the call site relative to the previous call site, counted in
+      // number of 16-byte bundles. The first call site is counted relative to
+      // the start of the procedure fragment.
+      if (VerboseAsm)
+        Asm->OutStreamer.AddComment(">> Call Site " + Twine(++Entry) + " <<");
+      Asm->EmitLabelDifference(BeginLabel, EHFuncBeginSym, 4);
+      if (VerboseAsm)
+        Asm->OutStreamer.AddComment(Twine("  Call between ") +
+                                    BeginLabel->getName() + " and " +
+                                    EndLabel->getName());
+      Asm->EmitLabelDifference(EndLabel, BeginLabel, 4);
+
+      // Offset of the landing pad, counted in 16-byte bundles relative to the
+      // @LPStart address.
+      if (!S.PadLabel) {
+        if (VerboseAsm)
+          Asm->OutStreamer.AddComment("    has no landing pad");
+        Asm->OutStreamer.EmitIntValue(0, 4/*size*/);
+      } else {
+        if (VerboseAsm)
+          Asm->OutStreamer.AddComment(Twine("    jumps to ") +
+                                      S.PadLabel->getName());
+        Asm->EmitLabelDifference(S.PadLabel, EHFuncBeginSym, 4);
+      }
+
+      // Offset of the first associated action record, relative to the start of
+      // the action table. This value is biased by 1 (1 indicates the start of
+      // the action table), and 0 indicates that there are no actions.
+      if (VerboseAsm) {
+        if (S.Action == 0)
+          Asm->OutStreamer.AddComment("  On action: cleanup");
+        else
+          Asm->OutStreamer.AddComment("  On action: " +
+                                      Twine((S.Action - 1) / 2 + 1));
+      }
+      Asm->EmitULEB128(S.Action);
+    }
+  }
+
+  // Emit the Action Table.
+  int Entry = 0;
+  for (SmallVectorImpl<ActionEntry>::const_iterator
+         I = Actions.begin(), E = Actions.end(); I != E; ++I) {
+    const ActionEntry &Action = *I;
+
+    if (VerboseAsm) {
+      // Emit comments that decode the action table.
+      Asm->OutStreamer.AddComment(">> Action Record " + Twine(++Entry) + " <<");
+    }
+
+    // Type Filter
+    //
+    //   Used by the runtime to match the type of the thrown exception to the
+    //   type of the catch clauses or the types in the exception specification.
+    if (VerboseAsm) {
+      if (Action.ValueForTypeID > 0)
+        Asm->OutStreamer.AddComment("  Catch TypeInfo " +
+                                    Twine(Action.ValueForTypeID));
+      else if (Action.ValueForTypeID < 0)
+        Asm->OutStreamer.AddComment("  Filter TypeInfo " +
+                                    Twine(Action.ValueForTypeID));
+      else
+        Asm->OutStreamer.AddComment("  Cleanup");
+    }
+    Asm->EmitSLEB128(Action.ValueForTypeID);
+
+    // Action Record
+    //
+    //   Self-relative signed displacement in bytes of the next action record,
+    //   or 0 if there is no next action record.
+    if (VerboseAsm) {
+      if (Action.NextAction == 0) {
+        Asm->OutStreamer.AddComment("  No further actions");
+      } else {
+        unsigned NextAction = Entry + (Action.NextAction + 1) / 2;
+        Asm->OutStreamer.AddComment("  Continue to action "+Twine(NextAction));
+      }
+    }
+    Asm->EmitSLEB128(Action.NextAction);
+  }
+
+  EmitTypeInfos(TTypeEncoding);
+
+  Asm->EmitAlignment(2);
+}
+
+void DwarfException::EmitTypeInfos(unsigned TTypeEncoding) {
+  const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+  const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+
+  bool VerboseAsm = Asm->OutStreamer.isVerboseAsm();
+
+  int Entry = 0;
+  // Emit the Catch TypeInfos.
+  if (VerboseAsm && !TypeInfos.empty()) {
+    Asm->OutStreamer.AddComment(">> Catch TypeInfos <<");
+    Asm->OutStreamer.AddBlankLine();
+    Entry = TypeInfos.size();
+  }
+
+  for (std::vector<const GlobalVariable *>::const_reverse_iterator
+         I = TypeInfos.rbegin(), E = TypeInfos.rend(); I != E; ++I) {
+    const GlobalVariable *GV = *I;
+    if (VerboseAsm)
+      Asm->OutStreamer.AddComment("TypeInfo " + Twine(Entry--));
+    Asm->EmitTTypeReference(GV, TTypeEncoding);
+  }
+
+  // Emit the Exception Specifications.
+  if (VerboseAsm && !FilterIds.empty()) {
+    Asm->OutStreamer.AddComment(">> Filter TypeInfos <<");
+    Asm->OutStreamer.AddBlankLine();
+    Entry = 0;
+  }
+  for (std::vector<unsigned>::const_iterator
+         I = FilterIds.begin(), E = FilterIds.end(); I < E; ++I) {
+    unsigned TypeID = *I;
+    if (VerboseAsm) {
+      --Entry;
+      if (TypeID != 0)
+        Asm->OutStreamer.AddComment("FilterInfo " + Twine(Entry));
+    }
+
+    Asm->EmitULEB128(TypeID);
+  }
+}
+
+/// EndModule - Emit all exception information that should come after the
+/// content.
+void DwarfException::EndModule() {
+  llvm_unreachable("Should be implemented");
+}
+
+/// BeginFunction - Gather pre-function exception information. Assumes it's
+/// being emitted immediately after the function entry point.
+void DwarfException::BeginFunction(const MachineFunction *MF) {
+  llvm_unreachable("Should be implemented");
+}
+
+/// EndFunction - Gather and emit post-function exception information.
+///
+void DwarfException::EndFunction() {
+  llvm_unreachable("Should be implemented");
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
new file mode 100644
index 000000000000..74b1b13367a2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -0,0 +1,234 @@
+//===-- DwarfException.h - Dwarf Exception Framework -----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+#define LLVM_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include <vector>
+
+namespace llvm {
+
+template <typename T> class SmallVectorImpl;
+struct LandingPadInfo;
+class MachineModuleInfo;
+class MachineMove;
+class MachineInstr;
+class MachineFunction;
+class MCAsmInfo;
+class MCExpr;
+class MCSymbol;
+class Function;
+class AsmPrinter;
+
+//===----------------------------------------------------------------------===//
+/// DwarfException - Emits Dwarf exception handling directives.
+///
+class DwarfException {
+protected:
+  /// Asm - Target of Dwarf emission.
+  AsmPrinter *Asm;
+
+  /// MMI - Collected machine module information.
+  MachineModuleInfo *MMI;
+
+  /// SharedTypeIds - How many leading type ids two landing pads have in common.
+  static unsigned SharedTypeIds(const LandingPadInfo *L,
+                                const LandingPadInfo *R);
+
+  /// PadLT - Order landing pads lexicographically by type id.
+  static bool PadLT(const LandingPadInfo *L, const LandingPadInfo *R);
+
+  /// PadRange - Structure holding a try-range and the associated landing pad.
+  struct PadRange {
+    // The index of the landing pad.
+    unsigned PadIndex;
+    // The index of the begin and end labels in the landing pad's label lists.
+    unsigned RangeIndex;
+  };
+
+  typedef DenseMap<MCSymbol *, PadRange> RangeMapType;
+
+  /// ActionEntry - Structure describing an entry in the actions table.
+  struct ActionEntry {
+    int ValueForTypeID; // The value to write - may not be equal to the type id.
+    int NextAction;
+    unsigned Previous;
+  };
+
+  /// CallSiteEntry - Structure describing an entry in the call-site table.
+  struct CallSiteEntry {
+    // The 'try-range' is BeginLabel .. EndLabel.
+    MCSymbol *BeginLabel; // zero indicates the start of the function.
+    MCSymbol *EndLabel;   // zero indicates the end of the function.
+
+    // The landing pad starts at PadLabel.
+    MCSymbol *PadLabel;   // zero indicates that there is no landing pad.
+    unsigned Action;
+  };
+
+  /// ComputeActionsTable - Compute the actions table and gather the first
+  /// action index for each landing pad site.
+  unsigned ComputeActionsTable(const SmallVectorImpl<const LandingPadInfo*>&LPs,
+                               SmallVectorImpl<ActionEntry> &Actions,
+                               SmallVectorImpl<unsigned> &FirstActions);
+
+  /// CallToNoUnwindFunction - Return `true' if this is a call to a function
+  /// marked `nounwind'. Return `false' otherwise.
+  bool CallToNoUnwindFunction(const MachineInstr *MI);
+
+  /// ComputeCallSiteTable - Compute the call-site table.  The entry for an
+  /// invoke has a try-range containing the call, a non-zero landing pad and an
+  /// appropriate action.  The entry for an ordinary call has a try-range
+  /// containing the call and zero for the landing pad and the action.  Calls
+  /// marked 'nounwind' have no entry and must not be contained in the try-range
+  /// of any entry - they form gaps in the table.  Entries must be ordered by
+  /// try-range address.
+  void ComputeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
+                            const RangeMapType &PadMap,
+                            const SmallVectorImpl<const LandingPadInfo *> &LPs,
+                            const SmallVectorImpl<unsigned> &FirstActions);
+
+  /// EmitExceptionTable - Emit landing pads and actions.
+  ///
+  /// The general organization of the table is complex, but the basic concepts
+  /// are easy.  First there is a header which describes the location and
+  /// organization of the three components that follow.
+  ///  1. The landing pad site information describes the range of code covered
+  ///     by the try.  In our case it's an accumulation of the ranges covered
+  ///     by the invokes in the try.  There is also a reference to the landing
+  ///     pad that handles the exception once processed.  Finally an index into
+  ///     the actions table.
+  ///  2. The action table, in our case, is composed of pairs of type ids
+  ///     and next action offset.  Starting with the action index from the
+  ///     landing pad site, each type Id is checked for a match to the current
+  ///     exception.  If it matches then the exception and type id are passed
+  ///     on to the landing pad.  Otherwise the next action is looked up.  This
+  ///     chain is terminated with a next action of zero.  If no type id is
+  ///     found the frame is unwound and handling continues.
+  ///  3. Type id table contains references to all the C++ typeinfo for all
+  ///     catches in the function.  This tables is reversed indexed base 1.
+  void EmitExceptionTable();
+
+  virtual void EmitTypeInfos(unsigned TTypeEncoding);
+
+public:
+  //===--------------------------------------------------------------------===//
+  // Main entry points.
+  //
+  DwarfException(AsmPrinter *A);
+  virtual ~DwarfException();
+
+  /// EndModule - Emit all exception information that should come after the
+  /// content.
+  virtual void EndModule();
+
+  /// BeginFunction - Gather pre-function exception information.  Assumes being
+  /// emitted immediately after the function entry point.
+  virtual void BeginFunction(const MachineFunction *MF);
+
+  /// EndFunction - Gather and emit post-function exception information.
+  virtual void EndFunction();
+};
+
+class DwarfCFIException : public DwarfException {
+  /// shouldEmitPersonality - Per-function flag to indicate if .cfi_personality
+  /// should be emitted.
+  bool shouldEmitPersonality;
+
+  /// shouldEmitLSDA - Per-function flag to indicate if .cfi_lsda
+  /// should be emitted.
+  bool shouldEmitLSDA;
+
+  /// shouldEmitMoves - Per-function flag to indicate if frame moves info
+  /// should be emitted.
+  bool shouldEmitMoves;
+
+  AsmPrinter::CFIMoveType moveTypeModule;
+
+public:
+  //===--------------------------------------------------------------------===//
+  // Main entry points.
+  //
+  DwarfCFIException(AsmPrinter *A);
+  virtual ~DwarfCFIException();
+
+  /// EndModule - Emit all exception information that should come after the
+  /// content.
+  virtual void EndModule();
+
+  /// BeginFunction - Gather pre-function exception information.  Assumes being
+  /// emitted immediately after the function entry point.
+  virtual void BeginFunction(const MachineFunction *MF);
+
+  /// EndFunction - Gather and emit post-function exception information.
+  virtual void EndFunction();
+};
+
+class ARMException : public DwarfException {
+  void EmitTypeInfos(unsigned TTypeEncoding);
+public:
+  //===--------------------------------------------------------------------===//
+  // Main entry points.
+  //
+  ARMException(AsmPrinter *A);
+  virtual ~ARMException();
+
+  /// EndModule - Emit all exception information that should come after the
+  /// content.
+  virtual void EndModule();
+
+  /// BeginFunction - Gather pre-function exception information.  Assumes being
+  /// emitted immediately after the function entry point.
+  virtual void BeginFunction(const MachineFunction *MF);
+
+  /// EndFunction - Gather and emit post-function exception information.
+  virtual void EndFunction();
+};
+
+class Win64Exception : public DwarfException {
+  /// shouldEmitPersonality - Per-function flag to indicate if personality
+  /// info should be emitted.
+  bool shouldEmitPersonality;
+
+  /// shouldEmitLSDA - Per-function flag to indicate if the LSDA
+  /// should be emitted.
+  bool shouldEmitLSDA;
+
+  /// shouldEmitMoves - Per-function flag to indicate if frame moves info
+  /// should be emitted.
+  bool shouldEmitMoves;
+
+public:
+  //===--------------------------------------------------------------------===//
+  // Main entry points.
+  //
+  Win64Exception(AsmPrinter *A);
+  virtual ~Win64Exception();
+
+  /// EndModule - Emit all exception information that should come after the
+  /// content.
+  virtual void EndModule();
+
+  /// BeginFunction - Gather pre-function exception information.  Assumes being
+  /// emitted immediately after the function entry point.
+  virtual void BeginFunction(const MachineFunction *MF);
+
+  /// EndFunction - Gather and emit post-function exception information.
+  virtual void EndFunction();
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
new file mode 100644
index 000000000000..a8fb66dcf17b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -0,0 +1,120 @@
+//===-- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the compiler plugin that is used in order to emit
+// garbage collection information in a convenient layout for parsing and
+// loading in the Erlang/OTP runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+  class ErlangGCPrinter : public GCMetadataPrinter {
+  public:
+    void beginAssembly(AsmPrinter &AP);
+    void finishAssembly(AsmPrinter &AP);
+  };
+
+}
+
+static GCMetadataPrinterRegistry::Add<ErlangGCPrinter>
+X("erlang", "erlang-compatible garbage collector");
+
+void llvm::linkErlangGCPrinter() { }
+
+void ErlangGCPrinter::beginAssembly(AsmPrinter &AP) { }
+
+void ErlangGCPrinter::finishAssembly(AsmPrinter &AP) {
+  MCStreamer &OS = AP.OutStreamer;
+  unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
+
+  // Put this in a custom .note section.
+  AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getContext()
+    .getELFSection(".note.gc", ELF::SHT_PROGBITS, 0,
+                   SectionKind::getDataRel()));
+
+  // For each function...
+  for (iterator FI = begin(), FE = end(); FI != FE; ++FI) {
+    GCFunctionInfo &MD = **FI;
+
+    /** A compact GC layout. Emit this data structure:
+     *
+     * struct {
+     *   int16_t PointCount;
+     *   void *SafePointAddress[PointCount];
+     *   int16_t StackFrameSize; (in words)
+     *   int16_t StackArity;
+     *   int16_t LiveCount;
+     *   int16_t LiveOffsets[LiveCount];
+     * } __gcmap_<FUNCTIONNAME>;
+     **/
+
+    // Align to address width.
+    AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
+
+    // Emit PointCount.
+    OS.AddComment("safe point count");
+    AP.EmitInt16(MD.size());
+
+    // And each safe point...
+    for (GCFunctionInfo::iterator PI = MD.begin(), PE = MD.end(); PI != PE;
+         ++PI) {
+      // Emit the address of the safe point.
+      OS.AddComment("safe point address");
+      MCSymbol *Label = PI->Label;
+      AP.EmitLabelPlusOffset(Label/*Hi*/, 0/*Offset*/, 4/*Size*/);
+    }
+
+    // Stack information never change in safe points! Only print info from the
+    // first call-site.
+    GCFunctionInfo::iterator PI = MD.begin();
+
+    // Emit the stack frame size.
+    OS.AddComment("stack frame size (in words)");
+    AP.EmitInt16(MD.getFrameSize() / IntPtrSize);
+
+    // Emit stack arity, i.e. the number of stacked arguments.
+    unsigned RegisteredArgs = IntPtrSize == 4 ? 5 : 6;
+    unsigned StackArity = MD.getFunction().arg_size() > RegisteredArgs ?
+                          MD.getFunction().arg_size() - RegisteredArgs : 0;
+    OS.AddComment("stack arity");
+    AP.EmitInt16(StackArity);
+
+    // Emit the number of live roots in the function.
+    OS.AddComment("live root count");
+    AP.EmitInt16(MD.live_size(PI));
+
+    // And for each live root...
+    for (GCFunctionInfo::live_iterator LI = MD.live_begin(PI),
+                                       LE = MD.live_end(PI);
+                                       LI != LE; ++LI) {
+      // Emit live root's offset within the stack frame.
+      OS.AddComment("stack index (offset / wordsize)");
+      AP.EmitInt16(LI->StackOffset / IntPtrSize);
+    }
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
new file mode 100644
index 000000000000..98177c0ba1cf
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -0,0 +1,166 @@
+//===-- OcamlGCPrinter.cpp - Ocaml frametable emitter ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements printing the assembly code for an Ocaml frametable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cctype>
+using namespace llvm;
+
+namespace {
+
+  class OcamlGCMetadataPrinter : public GCMetadataPrinter {
+  public:
+    void beginAssembly(AsmPrinter &AP);
+    void finishAssembly(AsmPrinter &AP);
+  };
+
+}
+
+static GCMetadataPrinterRegistry::Add<OcamlGCMetadataPrinter>
+Y("ocaml", "ocaml 3.10-compatible collector");
+
+void llvm::linkOcamlGCPrinter() { }
+
+static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) {
+  const std::string &MId = M.getModuleIdentifier();
+
+  std::string SymName;
+  SymName += "caml";
+  size_t Letter = SymName.size();
+  SymName.append(MId.begin(), std::find(MId.begin(), MId.end(), '.'));
+  SymName += "__";
+  SymName += Id;
+
+  // Capitalize the first letter of the module name.
+  SymName[Letter] = toupper(SymName[Letter]);
+
+  SmallString<128> TmpStr;
+  AP.Mang->getNameWithPrefix(TmpStr, SymName);
+
+  MCSymbol *Sym = AP.OutContext.GetOrCreateSymbol(TmpStr);
+
+  AP.OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
+  AP.OutStreamer.EmitLabel(Sym);
+}
+
+void OcamlGCMetadataPrinter::beginAssembly(AsmPrinter &AP) {
+  AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection());
+  EmitCamlGlobal(getModule(), AP, "code_begin");
+
+  AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection());
+  EmitCamlGlobal(getModule(), AP, "data_begin");
+}
+
+/// emitAssembly - Print the frametable. The ocaml frametable format is thus:
+///
+///   extern "C" struct align(sizeof(intptr_t)) {
+///     uint16_t NumDescriptors;
+///     struct align(sizeof(intptr_t)) {
+///       void *ReturnAddress;
+///       uint16_t FrameSize;
+///       uint16_t NumLiveOffsets;
+///       uint16_t LiveOffsets[NumLiveOffsets];
+///     } Descriptors[NumDescriptors];
+///   } caml${module}__frametable;
+///
+/// Note that this precludes programs from stack frames larger than 64K
+/// (FrameSize and LiveOffsets would overflow). FrameTablePrinter will abort if
+/// either condition is detected in a function which uses the GC.
+///
+void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
+  unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
+
+  AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection());
+  EmitCamlGlobal(getModule(), AP, "code_end");
+
+  AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection());
+  EmitCamlGlobal(getModule(), AP, "data_end");
+
+  // FIXME: Why does ocaml emit this??
+  AP.OutStreamer.EmitIntValue(0, IntPtrSize);
+
+  AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection());
+  EmitCamlGlobal(getModule(), AP, "frametable");
+
+  int NumDescriptors = 0;
+  for (iterator I = begin(), IE = end(); I != IE; ++I) {
+    GCFunctionInfo &FI = **I;
+    for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) {
+      NumDescriptors++;
+    }
+  }
+
+  if (NumDescriptors >= 1<<16) {
+    // Very rude!
+    report_fatal_error(" Too much descriptor for ocaml GC");
+  }
+  AP.EmitInt16(NumDescriptors);
+  AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
+
+  for (iterator I = begin(), IE = end(); I != IE; ++I) {
+    GCFunctionInfo &FI = **I;
+
+    uint64_t FrameSize = FI.getFrameSize();
+    if (FrameSize >= 1<<16) {
+      // Very rude!
+      report_fatal_error("Function '" + FI.getFunction().getName() +
+                         "' is too large for the ocaml GC! "
+                         "Frame size " + Twine(FrameSize) + ">= 65536.\n"
+                         "(" + Twine(uintptr_t(&FI)) + ")");
+    }
+
+    AP.OutStreamer.AddComment("live roots for " +
+                              Twine(FI.getFunction().getName()));
+    AP.OutStreamer.AddBlankLine();
+
+    for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) {
+      size_t LiveCount = FI.live_size(J);
+      if (LiveCount >= 1<<16) {
+        // Very rude!
+        report_fatal_error("Function '" + FI.getFunction().getName() +
+                           "' is too large for the ocaml GC! "
+                           "Live root count "+Twine(LiveCount)+" >= 65536.");
+      }
+
+      AP.OutStreamer.EmitSymbolValue(J->Label, IntPtrSize);
+      AP.EmitInt16(FrameSize);
+      AP.EmitInt16(LiveCount);
+
+      for (GCFunctionInfo::live_iterator K = FI.live_begin(J),
+                                         KE = FI.live_end(J); K != KE; ++K) {
+        if (K->StackOffset >= 1<<16) {
+          // Very rude!
+          report_fatal_error(
+                 "GC root stack offset is outside of fixed stack frame and out "
+                 "of range for ocaml GC!");
+        }
+        AP.EmitInt16(K->StackOffset);
+      }
+
+      AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
+    }
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/Win64Exception.cpp
new file mode 100644
index 000000000000..156101286b75
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/Win64Exception.cpp
@@ -0,0 +1,114 @@
+//===-- CodeGen/AsmPrinter/Win64Exception.cpp - Dwarf Exception Impl ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Win64 exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+Win64Exception::Win64Exception(AsmPrinter *A)
+  : DwarfException(A),
+    shouldEmitPersonality(false), shouldEmitLSDA(false), shouldEmitMoves(false)
+    {}
+
+Win64Exception::~Win64Exception() {}
+
+/// EndModule - Emit all exception information that should come after the
+/// content.
+void Win64Exception::EndModule() {
+}
+
+/// BeginFunction - Gather pre-function exception information. Assumes it's
+/// being emitted immediately after the function entry point.
+void Win64Exception::BeginFunction(const MachineFunction *MF) {
+  shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
+
+  // If any landing pads survive, we need an EH table.
+  bool hasLandingPads = !MMI->getLandingPads().empty();
+
+  shouldEmitMoves = Asm->needsSEHMoves();
+
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+  unsigned PerEncoding = TLOF.getPersonalityEncoding();
+  const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+
+  shouldEmitPersonality = hasLandingPads &&
+    PerEncoding != dwarf::DW_EH_PE_omit && Per;
+
+  unsigned LSDAEncoding = TLOF.getLSDAEncoding();
+  shouldEmitLSDA = shouldEmitPersonality &&
+    LSDAEncoding != dwarf::DW_EH_PE_omit;
+
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
+
+  Asm->OutStreamer.EmitWin64EHStartProc(Asm->CurrentFnSym);
+
+  if (!shouldEmitPersonality)
+    return;
+
+  MCSymbol *GCCHandlerSym =
+    Asm->GetExternalSymbolSymbol("_GCC_specific_handler");
+  Asm->OutStreamer.EmitWin64EHHandler(GCCHandlerSym, true, true);
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
+                                                Asm->getFunctionNumber()));
+}
+
+/// EndFunction - Gather and emit post-function exception information.
+///
+void Win64Exception::EndFunction() {
+  if (!shouldEmitPersonality && !shouldEmitMoves)
+    return;
+
+  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
+                                                Asm->getFunctionNumber()));
+
+  // Map all labels and get rid of any dead landing pads.
+  MMI->TidyLandingPads();
+
+  if (shouldEmitPersonality) {
+    const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+    const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+    const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(Per, Asm->Mang, MMI);
+
+    Asm->OutStreamer.PushSection();
+    Asm->OutStreamer.EmitWin64EHHandlerData();
+    Asm->OutStreamer.EmitValue(MCSymbolRefExpr::Create(Sym, Asm->OutContext),
+                               4);
+    EmitExceptionTable();
+    Asm->OutStreamer.PopSection();
+  }
+  Asm->OutStreamer.EmitWin64EHEndProc();
+}
diff --git a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp
new file mode 100644
index 000000000000..012ff8ad8339
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -0,0 +1,466 @@
+//===- BasicTargetTransformInfo.cpp - Basic target-independent TTI impl ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides the implementation of a basic TargetTransformInfo pass
+/// predicated on the target abstractions present in the target independent
+/// code generator. It uses these (primarily TargetLowering) to model as much
+/// of the TTI query interface as possible. It is included by most targets so
+/// that they can specialize only a small subset of the query space.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "basictti"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+
+class BasicTTI : public ImmutablePass, public TargetTransformInfo {
+  const TargetLoweringBase *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  BasicTTI() : ImmutablePass(ID), TLI(0) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  BasicTTI(const TargetLoweringBase *TLI) : ImmutablePass(ID), TLI(TLI) {
+    initializeBasicTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() {
+    pushTTIStack(this);
+  }
+
+  virtual void finalizePass() {
+    popTTIStack();
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  virtual bool isLegalAddImmediate(int64_t imm) const;
+  virtual bool isLegalICmpImmediate(int64_t imm) const;
+  virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
+                                     int64_t BaseOffset, bool HasBaseReg,
+                                     int64_t Scale) const;
+  virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
+  virtual bool isTypeLegal(Type *Ty) const;
+  virtual unsigned getJumpBufAlignment() const;
+  virtual unsigned getJumpBufSize() const;
+  virtual bool shouldBuildLookupTables() const;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  virtual unsigned getNumberOfRegisters(bool Vector) const;
+  virtual unsigned getMaximumUnrollFactor() const;
+  virtual unsigned getRegisterBitWidth(bool Vector) const;
+  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                          OperandValueKind,
+                                          OperandValueKind) const;
+  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                                  int Index, Type *SubTp) const;
+  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const;
+  virtual unsigned getCFInstrCost(unsigned Opcode) const;
+  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) const;
+  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) const;
+  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Alignment,
+                                   unsigned AddressSpace) const;
+  virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
+                                         ArrayRef<Type*> Tys) const;
+  virtual unsigned getNumberOfParts(Type *Tp) const;
+  virtual unsigned getAddressComputationCost(Type *Ty) const;
+
+  /// @}
+};
+
+}
+
+INITIALIZE_AG_PASS(BasicTTI, TargetTransformInfo, "basictti",
+                   "Target independent code generator's TTI", true, true, false)
+char BasicTTI::ID = 0;
+
+ImmutablePass *
+llvm::createBasicTargetTransformInfoPass(const TargetLoweringBase *TLI) {
+  return new BasicTTI(TLI);
+}
+
+
+bool BasicTTI::isLegalAddImmediate(int64_t imm) const {
+  return TLI->isLegalAddImmediate(imm);
+}
+
+bool BasicTTI::isLegalICmpImmediate(int64_t imm) const {
+  return TLI->isLegalICmpImmediate(imm);
+}
+
+bool BasicTTI::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
+                                     int64_t BaseOffset, bool HasBaseReg,
+                                     int64_t Scale) const {
+  TargetLoweringBase::AddrMode AM;
+  AM.BaseGV = BaseGV;
+  AM.BaseOffs = BaseOffset;
+  AM.HasBaseReg = HasBaseReg;
+  AM.Scale = Scale;
+  return TLI->isLegalAddressingMode(AM, Ty);
+}
+
+bool BasicTTI::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  return TLI->isTruncateFree(Ty1, Ty2);
+}
+
+bool BasicTTI::isTypeLegal(Type *Ty) const {
+  EVT T = TLI->getValueType(Ty);
+  return TLI->isTypeLegal(T);
+}
+
+unsigned BasicTTI::getJumpBufAlignment() const {
+  return TLI->getJumpBufAlignment();
+}
+
+unsigned BasicTTI::getJumpBufSize() const {
+  return TLI->getJumpBufSize();
+}
+
+bool BasicTTI::shouldBuildLookupTables() const {
+  return TLI->supportJumpTables() &&
+      (TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+       TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Calls used by the vectorizers.
+//
+//===----------------------------------------------------------------------===//
+
+unsigned BasicTTI::getScalarizationOverhead(Type *Ty, bool Insert,
+                                            bool Extract) const {
+  assert (Ty->isVectorTy() && "Can only scalarize vectors");
+  unsigned Cost = 0;
+
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    if (Insert)
+      Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    if (Extract)
+      Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+
+  return Cost;
+}
+
+unsigned BasicTTI::getNumberOfRegisters(bool Vector) const {
+  return 1;
+}
+
+unsigned BasicTTI::getRegisterBitWidth(bool Vector) const {
+  return 32;
+}
+
+unsigned BasicTTI::getMaximumUnrollFactor() const {
+  return 1;
+}
+
+unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                          OperandValueKind,
+                                          OperandValueKind) const {
+  // Check if any of the operands are vector operands.
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+    // The operation is legal. Assume it costs 1.
+    // If the type is split to multiple registers, assume that thre is some
+    // overhead to this.
+    // TODO: Once we have extract/insert subvector cost we need to use them.
+    if (LT.first > 1)
+      return LT.first * 2;
+    return LT.first * 1;
+  }
+
+  if (!TLI->isOperationExpand(ISD, LT.second)) {
+    // If the operation is custom lowered then assume
+    // thare the code is twice as expensive.
+    return LT.first * 2;
+  }
+
+  // Else, assume that we need to scalarize this op.
+  if (Ty->isVectorTy()) {
+    unsigned Num = Ty->getVectorNumElements();
+    unsigned Cost = TopTTI->getArithmeticInstrCost(Opcode, Ty->getScalarType());
+    // return the cost of multiple scalar invocation plus the cost of inserting
+    // and extracting the values.
+    return getScalarizationOverhead(Ty, true, true) + Num * Cost;
+  }
+
+  // We don't know anything about this scalar instruction.
+  return 1;
+}
+
+unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                  Type *SubTp) const {
+  return 1;
+}
+
+unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(Src);
+  std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(Dst);
+
+  // Check for NOOP conversions.
+  if (SrcLT.first == DstLT.first &&
+      SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
+
+      // Bitcast between types that are legalized to the same type are free.
+      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
+        return 0;
+  }
+
+  if (Opcode == Instruction::Trunc &&
+      TLI->isTruncateFree(SrcLT.second, DstLT.second))
+    return 0;
+
+  if (Opcode == Instruction::ZExt &&
+      TLI->isZExtFree(SrcLT.second, DstLT.second))
+    return 0;
+
+  // If the cast is marked as legal (or promote) then assume low cost.
+  if (TLI->isOperationLegalOrPromote(ISD, DstLT.second))
+    return 1;
+
+  // Handle scalar conversions.
+  if (!Src->isVectorTy() && !Dst->isVectorTy()) {
+
+    // Scalar bitcasts are usually free.
+    if (Opcode == Instruction::BitCast)
+      return 0;
+
+    // Just check the op cost. If the operation is legal then assume it costs 1.
+    if (!TLI->isOperationExpand(ISD, DstLT.second))
+      return  1;
+
+    // Assume that illegal scalar instruction are expensive.
+    return 4;
+  }
+
+  // Check vector-to-vector casts.
+  if (Dst->isVectorTy() && Src->isVectorTy()) {
+
+    // If the cast is between same-sized registers, then the check is simple.
+    if (SrcLT.first == DstLT.first &&
+        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
+
+      // Assume that Zext is done using AND.
+      if (Opcode == Instruction::ZExt)
+        return 1;
+
+      // Assume that sext is done using SHL and SRA.
+      if (Opcode == Instruction::SExt)
+        return 2;
+
+      // Just check the op cost. If the operation is legal then assume it costs
+      // 1 and multiply by the type-legalization overhead.
+      if (!TLI->isOperationExpand(ISD, DstLT.second))
+        return SrcLT.first * 1;
+    }
+
+    // If we are converting vectors and the operation is illegal, or
+    // if the vectors are legalized to different types, estimate the
+    // scalarization costs.
+    unsigned Num = Dst->getVectorNumElements();
+    unsigned Cost = TopTTI->getCastInstrCost(Opcode, Dst->getScalarType(),
+                                             Src->getScalarType());
+
+    // Return the cost of multiple scalar invocation plus the cost of
+    // inserting and extracting the values.
+    return getScalarizationOverhead(Dst, true, true) + Num * Cost;
+  }
+
+  // We already handled vector-to-vector and scalar-to-scalar conversions. This
+  // is where we handle bitcast between vectors and scalars. We need to assume
+  //  that the conversion is scalarized in one way or another.
+  if (Opcode == Instruction::BitCast)
+    // Illegal bitcasts are done by storing and loading from a stack slot.
+    return (Src->isVectorTy()? getScalarizationOverhead(Src, false, true):0) +
+           (Dst->isVectorTy()? getScalarizationOverhead(Dst, true, false):0);
+
+  llvm_unreachable("Unhandled cast");
+ }
+
+unsigned BasicTTI::getCFInstrCost(unsigned Opcode) const {
+  // Branches are assumed to be predicted.
+  return 0;
+}
+
+unsigned BasicTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // Selects on vectors are actually vector selects.
+  if (ISD == ISD::SELECT) {
+    assert(CondTy && "CondTy must exist");
+    if (CondTy->isVectorTy())
+      ISD = ISD::VSELECT;
+  }
+
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+
+  if (!TLI->isOperationExpand(ISD, LT.second)) {
+    // The operation is legal. Assume it costs 1. Multiply
+    // by the type-legalization overhead.
+    return LT.first * 1;
+  }
+
+  // Otherwise, assume that the cast is scalarized.
+  if (ValTy->isVectorTy()) {
+    unsigned Num = ValTy->getVectorNumElements();
+    if (CondTy)
+      CondTy = CondTy->getScalarType();
+    unsigned Cost = TopTTI->getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
+                                               CondTy);
+
+    // Return the cost of multiple scalar invocation plus the cost of inserting
+    // and extracting the values.
+    return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
+  }
+
+  // Unknown scalar opcode.
+  return 1;
+}
+
+unsigned BasicTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) const {
+  return 1;
+}
+
+unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Alignment,
+                                   unsigned AddressSpace) const {
+  assert(!Src->isVoidTy() && "Invalid type");
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+  // Assume that all loads of legal types cost 1.
+  return LT.first;
+}
+
+unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
+                                         ArrayRef<Type *> Tys) const {
+  unsigned ISD = 0;
+  switch (IID) {
+  default: {
+    // Assume that we need to scalarize this intrinsic.
+    unsigned ScalarizationCost = 0;
+    unsigned ScalarCalls = 1;
+    if (RetTy->isVectorTy()) {
+      ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+      ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
+    }
+    for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
+      if (Tys[i]->isVectorTy()) {
+        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
+      }
+    }
+
+    return ScalarCalls + ScalarizationCost;
+  }
+  // Look for intrinsics that can be lowered directly or turned into a scalar
+  // intrinsic call.
+  case Intrinsic::sqrt:    ISD = ISD::FSQRT;  break;
+  case Intrinsic::sin:     ISD = ISD::FSIN;   break;
+  case Intrinsic::cos:     ISD = ISD::FCOS;   break;
+  case Intrinsic::exp:     ISD = ISD::FEXP;   break;
+  case Intrinsic::exp2:    ISD = ISD::FEXP2;  break;
+  case Intrinsic::log:     ISD = ISD::FLOG;   break;
+  case Intrinsic::log10:   ISD = ISD::FLOG10; break;
+  case Intrinsic::log2:    ISD = ISD::FLOG2;  break;
+  case Intrinsic::fabs:    ISD = ISD::FABS;   break;
+  case Intrinsic::floor:   ISD = ISD::FFLOOR; break;
+  case Intrinsic::ceil:    ISD = ISD::FCEIL;  break;
+  case Intrinsic::trunc:   ISD = ISD::FTRUNC; break;
+  case Intrinsic::rint:    ISD = ISD::FRINT;  break;
+  case Intrinsic::pow:     ISD = ISD::FPOW;   break;
+  case Intrinsic::fma:     ISD = ISD::FMA;    break;
+  case Intrinsic::fmuladd: ISD = ISD::FMA;    break; // FIXME: mul + add?
+  }
+
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
+
+  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+    // The operation is legal. Assume it costs 1.
+    // If the type is split to multiple registers, assume that thre is some
+    // overhead to this.
+    // TODO: Once we have extract/insert subvector cost we need to use them.
+    if (LT.first > 1)
+      return LT.first * 2;
+    return LT.first * 1;
+  }
+
+  if (!TLI->isOperationExpand(ISD, LT.second)) {
+    // If the operation is custom lowered then assume
+    // thare the code is twice as expensive.
+    return LT.first * 2;
+  }
+
+  // Else, assume that we need to scalarize this intrinsic. For math builtins
+  // this will emit a costly libcall, adding call overhead and spills. Make it
+  // very expensive.
+  if (RetTy->isVectorTy()) {
+    unsigned Num = RetTy->getVectorNumElements();
+    unsigned Cost = TopTTI->getIntrinsicInstrCost(IID, RetTy->getScalarType(),
+                                                  Tys);
+    return 10 * Cost * Num;
+  }
+
+  // This is going to be turned into a library call, make it expensive.
+  return 10;
+}
+
+unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+  return LT.first;
+}
+
+unsigned BasicTTI::getAddressComputationCost(Type *Ty) const {
+  return 0;
+}
diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp
new file mode 100644
index 000000000000..f8cc3b3999e8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp
@@ -0,0 +1,1725 @@
+//===-- BranchFolding.cpp - Fold machine code branch instructions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass forwards branches to unconditional branches to make them branch
+// directly to the target block.  This pass often results in dead MBB's, which
+// it then removes.
+//
+// Note that this pass must be run after register allocation, it cannot handle
+// SSA form.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "branchfolding"
+#include "BranchFolding.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumBranchOpts, "Number of branches optimized");
+STATISTIC(NumTailMerge , "Number of block tails merged");
+STATISTIC(NumHoist     , "Number of times common instructions are hoisted");
+
+static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
+                              cl::init(cl::BOU_UNSET), cl::Hidden);
+
+// Throttle for huge numbers of predecessors (compile speed problems)
+static cl::opt<unsigned>
+TailMergeThreshold("tail-merge-threshold",
+          cl::desc("Max number of predecessors to consider tail merging"),
+          cl::init(150), cl::Hidden);
+
+// Heuristic for tail merging (and, inversely, tail duplication).
+// TODO: This should be replaced with a target query.
+static cl::opt<unsigned>
+TailMergeSize("tail-merge-size",
+          cl::desc("Min number of instructions to consider tail merging"),
+                              cl::init(3), cl::Hidden);
+
+namespace {
+  /// BranchFolderPass - Wrap branch folder in a machine function pass.
+  class BranchFolderPass : public MachineFunctionPass {
+  public:
+    static char ID;
+    explicit BranchFolderPass(): MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetPassConfig>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char BranchFolderPass::ID = 0;
+char &llvm::BranchFolderPassID = BranchFolderPass::ID;
+
+INITIALIZE_PASS(BranchFolderPass, "branch-folder",
+                "Control Flow Optimizer", false, false)
+
+bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
+  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+  BranchFolder Folder(PassConfig->getEnableTailMerge(), /*CommonHoist=*/true);
+  return Folder.OptimizeFunction(MF,
+                                 MF.getTarget().getInstrInfo(),
+                                 MF.getTarget().getRegisterInfo(),
+                                 getAnalysisIfAvailable<MachineModuleInfo>());
+}
+
+
+BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist) {
+  switch (FlagEnableTailMerge) {
+  case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break;
+  case cl::BOU_TRUE: EnableTailMerge = true; break;
+  case cl::BOU_FALSE: EnableTailMerge = false; break;
+  }
+
+  EnableHoistCommonCode = CommonHoist;
+}
+
+/// RemoveDeadBlock - Remove the specified dead machine basic block from the
+/// function, updating the CFG.
+void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
+  assert(MBB->pred_empty() && "MBB must be dead!");
+  DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+
+  MachineFunction *MF = MBB->getParent();
+  // drop all successors.
+  while (!MBB->succ_empty())
+    MBB->removeSuccessor(MBB->succ_end()-1);
+
+  // Avoid matching if this pointer gets reused.
+  TriedMerging.erase(MBB);
+
+  // Remove the block.
+  MF->erase(MBB);
+}
+
+/// OptimizeImpDefsBlock - If a basic block is just a bunch of implicit_def
+/// followed by terminators, and if the implicitly defined registers are not
+/// used by the terminators, remove those implicit_def's. e.g.
+/// BB1:
+///   r0 = implicit_def
+///   r1 = implicit_def
+///   br
+/// This block can be optimized away later if the implicit instructions are
+/// removed.
+bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) {
+  SmallSet<unsigned, 4> ImpDefRegs;
+  MachineBasicBlock::iterator I = MBB->begin();
+  while (I != MBB->end()) {
+    if (!I->isImplicitDef())
+      break;
+    unsigned Reg = I->getOperand(0).getReg();
+    ImpDefRegs.insert(Reg);
+    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+      ImpDefRegs.insert(*SubRegs);
+    ++I;
+  }
+  if (ImpDefRegs.empty())
+    return false;
+
+  MachineBasicBlock::iterator FirstTerm = I;
+  while (I != MBB->end()) {
+    if (!TII->isUnpredicatedTerminator(I))
+      return false;
+    // See if it uses any of the implicitly defined registers.
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = I->getOperand(i);
+      if (!MO.isReg() || !MO.isUse())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (ImpDefRegs.count(Reg))
+        return false;
+    }
+    ++I;
+  }
+
+  I = MBB->begin();
+  while (I != FirstTerm) {
+    MachineInstr *ImpDefMI = &*I;
+    ++I;
+    MBB->erase(ImpDefMI);
+  }
+
+  return true;
+}
+
+/// OptimizeFunction - Perhaps branch folding, tail merging and other
+/// CFG optimizations on the given function.
+bool BranchFolder::OptimizeFunction(MachineFunction &MF,
+                                    const TargetInstrInfo *tii,
+                                    const TargetRegisterInfo *tri,
+                                    MachineModuleInfo *mmi) {
+  if (!tii) return false;
+
+  TriedMerging.clear();
+
+  TII = tii;
+  TRI = tri;
+  MMI = mmi;
+  RS = NULL;
+
+  // Use a RegScavenger to help update liveness when required.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  if (MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF))
+    RS = new RegScavenger();
+  else
+    MRI.invalidateLiveness();
+
+  // Fix CFG.  The later algorithms expect it to be right.
+  bool MadeChange = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; I++) {
+    MachineBasicBlock *MBB = I, *TBB = 0, *FBB = 0;
+    SmallVector<MachineOperand, 4> Cond;
+    if (!TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true))
+      MadeChange |= MBB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
+    MadeChange |= OptimizeImpDefsBlock(MBB);
+  }
+
+  bool MadeChangeThisIteration = true;
+  while (MadeChangeThisIteration) {
+    MadeChangeThisIteration    = TailMergeBlocks(MF);
+    MadeChangeThisIteration   |= OptimizeBranches(MF);
+    if (EnableHoistCommonCode)
+      MadeChangeThisIteration |= HoistCommonCode(MF);
+    MadeChange |= MadeChangeThisIteration;
+  }
+
+  // See if any jump tables have become dead as the code generator
+  // did its thing.
+  MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
+  if (JTI == 0) {
+    delete RS;
+    return MadeChange;
+  }
+
+  // Walk the function to find jump tables that are live.
+  BitVector JTIsLive(JTI->getJumpTables().size());
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end();
+       BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I)
+      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+        MachineOperand &Op = I->getOperand(op);
+        if (!Op.isJTI()) continue;
+
+        // Remember that this JT is live.
+        JTIsLive.set(Op.getIndex());
+      }
+  }
+
+  // Finally, remove dead jump tables.  This happens when the
+  // indirect jump was unreachable (and thus deleted).
+  for (unsigned i = 0, e = JTIsLive.size(); i != e; ++i)
+    if (!JTIsLive.test(i)) {
+      JTI->RemoveJumpTable(i);
+      MadeChange = true;
+    }
+
+  delete RS;
+  return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+//  Tail Merging of Blocks
+//===----------------------------------------------------------------------===//
+
+/// HashMachineInstr - Compute a hash value for MI and its operands.
+static unsigned HashMachineInstr(const MachineInstr *MI) {
+  unsigned Hash = MI->getOpcode();
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &Op = MI->getOperand(i);
+
+    // Merge in bits from the operand if easy.
+    unsigned OperandHash = 0;
+    switch (Op.getType()) {
+    case MachineOperand::MO_Register:          OperandHash = Op.getReg(); break;
+    case MachineOperand::MO_Immediate:         OperandHash = Op.getImm(); break;
+    case MachineOperand::MO_MachineBasicBlock:
+      OperandHash = Op.getMBB()->getNumber();
+      break;
+    case MachineOperand::MO_FrameIndex:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_JumpTableIndex:
+      OperandHash = Op.getIndex();
+      break;
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      // Global address / external symbol are too hard, don't bother, but do
+      // pull in the offset.
+      OperandHash = Op.getOffset();
+      break;
+    default: break;
+    }
+
+    Hash += ((OperandHash << 3) | Op.getType()) << (i&31);
+  }
+  return Hash;
+}
+
+/// HashEndOfMBB - Hash the last instruction in the MBB.
+static unsigned HashEndOfMBB(const MachineBasicBlock *MBB) {
+  MachineBasicBlock::const_iterator I = MBB->end();
+  if (I == MBB->begin())
+    return 0;   // Empty MBB.
+
+  --I;
+  // Skip debug info so it will not affect codegen.
+  while (I->isDebugValue()) {
+    if (I==MBB->begin())
+      return 0;      // MBB empty except for debug info.
+    --I;
+  }
+
+  return HashMachineInstr(I);
+}
+
+/// ComputeCommonTailLength - Given two machine basic blocks, compute the number
+/// of instructions they actually have in common together at their end.  Return
+/// iterators for the first shared instruction in each block.
+static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
+                                        MachineBasicBlock *MBB2,
+                                        MachineBasicBlock::iterator &I1,
+                                        MachineBasicBlock::iterator &I2) {
+  I1 = MBB1->end();
+  I2 = MBB2->end();
+
+  unsigned TailLen = 0;
+  while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
+    --I1; --I2;
+    // Skip debugging pseudos; necessary to avoid changing the code.
+    while (I1->isDebugValue()) {
+      if (I1==MBB1->begin()) {
+        while (I2->isDebugValue()) {
+          if (I2==MBB2->begin())
+            // I1==DBG at begin; I2==DBG at begin
+            return TailLen;
+          --I2;
+        }
+        ++I2;
+        // I1==DBG at begin; I2==non-DBG, or first of DBGs not at begin
+        return TailLen;
+      }
+      --I1;
+    }
+    // I1==first (untested) non-DBG preceding known match
+    while (I2->isDebugValue()) {
+      if (I2==MBB2->begin()) {
+        ++I1;
+        // I1==non-DBG, or first of DBGs not at begin; I2==DBG at begin
+        return TailLen;
+      }
+      --I2;
+    }
+    // I1, I2==first (untested) non-DBGs preceding known match
+    if (!I1->isIdenticalTo(I2) ||
+        // FIXME: This check is dubious. It's used to get around a problem where
+        // people incorrectly expect inline asm directives to remain in the same
+        // relative order. This is untenable because normal compiler
+        // optimizations (like this one) may reorder and/or merge these
+        // directives.
+        I1->isInlineAsm()) {
+      ++I1; ++I2;
+      break;
+    }
+    ++TailLen;
+  }
+  // Back past possible debugging pseudos at beginning of block.  This matters
+  // when one block differs from the other only by whether debugging pseudos
+  // are present at the beginning.  (This way, the various checks later for
+  // I1==MBB1->begin() work as expected.)
+  if (I1 == MBB1->begin() && I2 != MBB2->begin()) {
+    --I2;
+    while (I2->isDebugValue()) {
+      if (I2 == MBB2->begin())
+        return TailLen;
+      --I2;
+    }
+    ++I2;
+  }
+  if (I2 == MBB2->begin() && I1 != MBB1->begin()) {
+    --I1;
+    while (I1->isDebugValue()) {
+      if (I1 == MBB1->begin())
+        return TailLen;
+      --I1;
+    }
+    ++I1;
+  }
+  return TailLen;
+}
+
+void BranchFolder::MaintainLiveIns(MachineBasicBlock *CurMBB,
+                                   MachineBasicBlock *NewMBB) {
+  if (RS) {
+    RS->enterBasicBlock(CurMBB);
+    if (!CurMBB->empty())
+      RS->forward(prior(CurMBB->end()));
+    BitVector RegsLiveAtExit(TRI->getNumRegs());
+    RS->getRegsUsed(RegsLiveAtExit, false);
+    for (unsigned int i = 0, e = TRI->getNumRegs(); i != e; i++)
+      if (RegsLiveAtExit[i])
+        NewMBB->addLiveIn(i);
+  }
+}
+
+/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
+/// after it, replacing it with an unconditional branch to NewDest.
+void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+                                           MachineBasicBlock *NewDest) {
+  MachineBasicBlock *CurMBB = OldInst->getParent();
+
+  TII->ReplaceTailWithBranchTo(OldInst, NewDest);
+
+  // For targets that use the register scavenger, we must maintain LiveIns.
+  MaintainLiveIns(CurMBB, NewDest);
+
+  ++NumTailMerge;
+}
+
+/// SplitMBBAt - Given a machine basic block and an iterator into it, split the
+/// MBB so that the part before the iterator falls into the part starting at the
+/// iterator.  This returns the new MBB.
+MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
+                                            MachineBasicBlock::iterator BBI1) {
+  if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1))
+    return 0;
+
+  MachineFunction &MF = *CurMBB.getParent();
+
+  // Create the fall-through block.
+  MachineFunction::iterator MBBI = &CurMBB;
+  MachineBasicBlock *NewMBB =MF.CreateMachineBasicBlock(CurMBB.getBasicBlock());
+  CurMBB.getParent()->insert(++MBBI, NewMBB);
+
+  // Move all the successors of this block to the specified block.
+  NewMBB->transferSuccessors(&CurMBB);
+
+  // Add an edge from CurMBB to NewMBB for the fall-through.
+  CurMBB.addSuccessor(NewMBB);
+
+  // Splice the code over.
+  NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
+
+  // For targets that use the register scavenger, we must maintain LiveIns.
+  MaintainLiveIns(&CurMBB, NewMBB);
+
+  return NewMBB;
+}
+
+/// EstimateRuntime - Make a rough estimate for how long it will take to run
+/// the specified code.
+static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
+                                MachineBasicBlock::iterator E) {
+  unsigned Time = 0;
+  for (; I != E; ++I) {
+    if (I->isDebugValue())
+      continue;
+    if (I->isCall())
+      Time += 10;
+    else if (I->mayLoad() || I->mayStore())
+      Time += 2;
+    else
+      ++Time;
+  }
+  return Time;
+}
+
+// CurMBB needs to add an unconditional branch to SuccMBB (we removed these
+// branches temporarily for tail merging).  In the case where CurMBB ends
+// with a conditional branch to the next block, optimize by reversing the
+// test and conditionally branching to SuccMBB instead.
+static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
+                    const TargetInstrInfo *TII) {
+  MachineFunction *MF = CurMBB->getParent();
+  MachineFunction::iterator I = llvm::next(MachineFunction::iterator(CurMBB));
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  DebugLoc dl;  // FIXME: this is nowhere
+  if (I != MF->end() &&
+      !TII->AnalyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
+    MachineBasicBlock *NextBB = I;
+    if (TBB == NextBB && !Cond.empty() && !FBB) {
+      if (!TII->ReverseBranchCondition(Cond)) {
+        TII->RemoveBranch(*CurMBB);
+        TII->InsertBranch(*CurMBB, SuccBB, NULL, Cond, dl);
+        return;
+      }
+    }
+  }
+  TII->InsertBranch(*CurMBB, SuccBB, NULL,
+                    SmallVector<MachineOperand, 0>(), dl);
+}
+
+bool
+BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
+  if (getHash() < o.getHash())
+    return true;
+  if (getHash() > o.getHash())
+    return false;
+  if (getBlock()->getNumber() < o.getBlock()->getNumber())
+    return true;
+  if (getBlock()->getNumber() > o.getBlock()->getNumber())
+    return false;
+  // _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
+  // an object with itself.
+#ifndef _GLIBCXX_DEBUG
+  llvm_unreachable("Predecessor appears twice");
+#else
+  return false;
+#endif
+}
+
+/// CountTerminators - Count the number of terminators in the given
+/// block and set I to the position of the first non-terminator, if there
+/// is one, or MBB->end() otherwise.
+static unsigned CountTerminators(MachineBasicBlock *MBB,
+                                 MachineBasicBlock::iterator &I) {
+  I = MBB->end();
+  unsigned NumTerms = 0;
+  for (;;) {
+    if (I == MBB->begin()) {
+      I = MBB->end();
+      break;
+    }
+    --I;
+    if (!I->isTerminator()) break;
+    ++NumTerms;
+  }
+  return NumTerms;
+}
+
+/// ProfitableToMerge - Check if two machine basic blocks have a common tail
+/// and decide if it would be profitable to merge those tails.  Return the
+/// length of the common tail and iterators to the first common instruction
+/// in each block.
+static bool ProfitableToMerge(MachineBasicBlock *MBB1,
+                              MachineBasicBlock *MBB2,
+                              unsigned minCommonTailLength,
+                              unsigned &CommonTailLen,
+                              MachineBasicBlock::iterator &I1,
+                              MachineBasicBlock::iterator &I2,
+                              MachineBasicBlock *SuccBB,
+                              MachineBasicBlock *PredBB) {
+  CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2);
+  if (CommonTailLen == 0)
+    return false;
+  DEBUG(dbgs() << "Common tail length of BB#" << MBB1->getNumber()
+               << " and BB#" << MBB2->getNumber() << " is " << CommonTailLen
+               << '\n');
+
+  // It's almost always profitable to merge any number of non-terminator
+  // instructions with the block that falls through into the common successor.
+  if (MBB1 == PredBB || MBB2 == PredBB) {
+    MachineBasicBlock::iterator I;
+    unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I);
+    if (CommonTailLen > NumTerms)
+      return true;
+  }
+
+  // If one of the blocks can be completely merged and happens to be in
+  // a position where the other could fall through into it, merge any number
+  // of instructions, because it can be done without a branch.
+  // TODO: If the blocks are not adjacent, move one of them so that they are?
+  if (MBB1->isLayoutSuccessor(MBB2) && I2 == MBB2->begin())
+    return true;
+  if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())
+    return true;
+
+  // If both blocks have an unconditional branch temporarily stripped out,
+  // count that as an additional common instruction for the following
+  // heuristics.
+  unsigned EffectiveTailLen = CommonTailLen;
+  if (SuccBB && MBB1 != PredBB && MBB2 != PredBB &&
+      !MBB1->back().isBarrier() &&
+      !MBB2->back().isBarrier())
+    ++EffectiveTailLen;
+
+  // Check if the common tail is long enough to be worthwhile.
+  if (EffectiveTailLen >= minCommonTailLength)
+    return true;
+
+  // If we are optimizing for code size, 2 instructions in common is enough if
+  // we don't have to split a block.  At worst we will be introducing 1 new
+  // branch instruction, which is likely to be smaller than the 2
+  // instructions that would be deleted in the merge.
+  MachineFunction *MF = MBB1->getParent();
+  if (EffectiveTailLen >= 2 &&
+      MF->getFunction()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+      (I1 == MBB1->begin() || I2 == MBB2->begin()))
+    return true;
+
+  return false;
+}
+
+/// ComputeSameTails - Look through all the blocks in MergePotentials that have
+/// hash CurHash (guaranteed to match the last element).  Build the vector
+/// SameTails of all those that have the (same) largest number of instructions
+/// in common of any pair of these blocks.  SameTails entries contain an
+/// iterator into MergePotentials (from which the MachineBasicBlock can be
+/// found) and a MachineBasicBlock::iterator into that MBB indicating the
+/// instruction where the matching code sequence begins.
+/// Order of elements in SameTails is the reverse of the order in which
+/// those blocks appear in MergePotentials (where they are not necessarily
+/// consecutive).
+unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
+                                        unsigned minCommonTailLength,
+                                        MachineBasicBlock *SuccBB,
+                                        MachineBasicBlock *PredBB) {
+  unsigned maxCommonTailLength = 0U;
+  SameTails.clear();
+  MachineBasicBlock::iterator TrialBBI1, TrialBBI2;
+  MPIterator HighestMPIter = prior(MergePotentials.end());
+  for (MPIterator CurMPIter = prior(MergePotentials.end()),
+                  B = MergePotentials.begin();
+       CurMPIter != B && CurMPIter->getHash() == CurHash;
+       --CurMPIter) {
+    for (MPIterator I = prior(CurMPIter); I->getHash() == CurHash ; --I) {
+      unsigned CommonTailLen;
+      if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(),
+                            minCommonTailLength,
+                            CommonTailLen, TrialBBI1, TrialBBI2,
+                            SuccBB, PredBB)) {
+        if (CommonTailLen > maxCommonTailLength) {
+          SameTails.clear();
+          maxCommonTailLength = CommonTailLen;
+          HighestMPIter = CurMPIter;
+          SameTails.push_back(SameTailElt(CurMPIter, TrialBBI1));
+        }
+        if (HighestMPIter == CurMPIter &&
+            CommonTailLen == maxCommonTailLength)
+          SameTails.push_back(SameTailElt(I, TrialBBI2));
+      }
+      if (I == B)
+        break;
+    }
+  }
+  return maxCommonTailLength;
+}
+
+/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from
+/// MergePotentials, restoring branches at ends of blocks as appropriate.
+void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
+                                        MachineBasicBlock *SuccBB,
+                                        MachineBasicBlock *PredBB) {
+  MPIterator CurMPIter, B;
+  for (CurMPIter = prior(MergePotentials.end()), B = MergePotentials.begin();
+       CurMPIter->getHash() == CurHash;
+       --CurMPIter) {
+    // Put the unconditional branch back, if we need one.
+    MachineBasicBlock *CurMBB = CurMPIter->getBlock();
+    if (SuccBB && CurMBB != PredBB)
+      FixTail(CurMBB, SuccBB, TII);
+    if (CurMPIter == B)
+      break;
+  }
+  if (CurMPIter->getHash() != CurHash)
+    CurMPIter++;
+  MergePotentials.erase(CurMPIter, MergePotentials.end());
+}
+
+/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist
+/// only of the common tail.  Create a block that does by splitting one.
+bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+                                             unsigned maxCommonTailLength,
+                                             unsigned &commonTailIndex) {
+  commonTailIndex = 0;
+  unsigned TimeEstimate = ~0U;
+  for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
+    // Use PredBB if possible; that doesn't require a new branch.
+    if (SameTails[i].getBlock() == PredBB) {
+      commonTailIndex = i;
+      break;
+    }
+    // Otherwise, make a (fairly bogus) choice based on estimate of
+    // how long it will take the various blocks to execute.
+    unsigned t = EstimateRuntime(SameTails[i].getBlock()->begin(),
+                                 SameTails[i].getTailStartPos());
+    if (t <= TimeEstimate) {
+      TimeEstimate = t;
+      commonTailIndex = i;
+    }
+  }
+
+  MachineBasicBlock::iterator BBI =
+    SameTails[commonTailIndex].getTailStartPos();
+  MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+  // If the common tail includes any debug info we will take it pretty
+  // randomly from one of the inputs.  Might be better to remove it?
+  DEBUG(dbgs() << "\nSplitting BB#" << MBB->getNumber() << ", size "
+               << maxCommonTailLength);
+
+  MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI);
+  if (!newMBB) {
+    DEBUG(dbgs() << "... failed!");
+    return false;
+  }
+
+  SameTails[commonTailIndex].setBlock(newMBB);
+  SameTails[commonTailIndex].setTailStartPos(newMBB->begin());
+
+  // If we split PredBB, newMBB is the new predecessor.
+  if (PredBB == MBB)
+    PredBB = newMBB;
+
+  return true;
+}
+
+// See if any of the blocks in MergePotentials (which all have a common single
+// successor, or all have no successor) can be tail-merged.  If there is a
+// successor, any blocks in MergePotentials that are not tail-merged and
+// are not immediately before Succ must have an unconditional branch to
+// Succ added (but the predecessor/successor lists need no adjustment).
+// The lone predecessor of Succ that falls through into Succ,
+// if any, is given in PredBB.
+
+bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
+                                      MachineBasicBlock *PredBB) {
+  bool MadeChange = false;
+
+  // Except for the special cases below, tail-merge if there are at least
+  // this many instructions in common.
+  unsigned minCommonTailLength = TailMergeSize;
+
+  DEBUG(dbgs() << "\nTryTailMergeBlocks: ";
+        for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+          dbgs() << "BB#" << MergePotentials[i].getBlock()->getNumber()
+                 << (i == e-1 ? "" : ", ");
+        dbgs() << "\n";
+        if (SuccBB) {
+          dbgs() << "  with successor BB#" << SuccBB->getNumber() << '\n';
+          if (PredBB)
+            dbgs() << "  which has fall-through from BB#"
+                   << PredBB->getNumber() << "\n";
+        }
+        dbgs() << "Looking for common tails of at least "
+               << minCommonTailLength << " instruction"
+               << (minCommonTailLength == 1 ? "" : "s") << '\n';
+       );
+
+  // Sort by hash value so that blocks with identical end sequences sort
+  // together.
+  std::stable_sort(MergePotentials.begin(), MergePotentials.end());
+
+  // Walk through equivalence sets looking for actual exact matches.
+  while (MergePotentials.size() > 1) {
+    unsigned CurHash = MergePotentials.back().getHash();
+
+    // Build SameTails, identifying the set of blocks with this hash code
+    // and with the maximum number of instructions in common.
+    unsigned maxCommonTailLength = ComputeSameTails(CurHash,
+                                                    minCommonTailLength,
+                                                    SuccBB, PredBB);
+
+    // If we didn't find any pair that has at least minCommonTailLength
+    // instructions in common, remove all blocks with this hash code and retry.
+    if (SameTails.empty()) {
+      RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
+      continue;
+    }
+
+    // If one of the blocks is the entire common tail (and not the entry
+    // block, which we can't jump to), we can treat all blocks with this same
+    // tail at once.  Use PredBB if that is one of the possibilities, as that
+    // will not introduce any extra branches.
+    MachineBasicBlock *EntryBB = MergePotentials.begin()->getBlock()->
+                                 getParent()->begin();
+    unsigned commonTailIndex = SameTails.size();
+    // If there are two blocks, check to see if one can be made to fall through
+    // into the other.
+    if (SameTails.size() == 2 &&
+        SameTails[0].getBlock()->isLayoutSuccessor(SameTails[1].getBlock()) &&
+        SameTails[1].tailIsWholeBlock())
+      commonTailIndex = 1;
+    else if (SameTails.size() == 2 &&
+             SameTails[1].getBlock()->isLayoutSuccessor(
+                                                     SameTails[0].getBlock()) &&
+             SameTails[0].tailIsWholeBlock())
+      commonTailIndex = 0;
+    else {
+      // Otherwise just pick one, favoring the fall-through predecessor if
+      // there is one.
+      for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
+        MachineBasicBlock *MBB = SameTails[i].getBlock();
+        if (MBB == EntryBB && SameTails[i].tailIsWholeBlock())
+          continue;
+        if (MBB == PredBB) {
+          commonTailIndex = i;
+          break;
+        }
+        if (SameTails[i].tailIsWholeBlock())
+          commonTailIndex = i;
+      }
+    }
+
+    if (commonTailIndex == SameTails.size() ||
+        (SameTails[commonTailIndex].getBlock() == PredBB &&
+         !SameTails[commonTailIndex].tailIsWholeBlock())) {
+      // None of the blocks consist entirely of the common tail.
+      // Split a block so that one does.
+      if (!CreateCommonTailOnlyBlock(PredBB,
+                                     maxCommonTailLength, commonTailIndex)) {
+        RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
+        continue;
+      }
+    }
+
+    MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+    // MBB is common tail.  Adjust all other BB's to jump to this one.
+    // Traversal must be forwards so erases work.
+    DEBUG(dbgs() << "\nUsing common tail in BB#" << MBB->getNumber()
+                 << " for ");
+    for (unsigned int i=0, e = SameTails.size(); i != e; ++i) {
+      if (commonTailIndex == i)
+        continue;
+      DEBUG(dbgs() << "BB#" << SameTails[i].getBlock()->getNumber()
+                   << (i == e-1 ? "" : ", "));
+      // Hack the end off BB i, making it jump to BB commonTailIndex instead.
+      ReplaceTailWithBranchTo(SameTails[i].getTailStartPos(), MBB);
+      // BB i is no longer a predecessor of SuccBB; remove it from the worklist.
+      MergePotentials.erase(SameTails[i].getMPIter());
+    }
+    DEBUG(dbgs() << "\n");
+    // We leave commonTailIndex in the worklist in case there are other blocks
+    // that match it with a smaller number of instructions.
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
+  bool MadeChange = false;
+  if (!EnableTailMerge) return MadeChange;
+
+  // First find blocks with no successors.
+  MergePotentials.clear();
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+       I != E && MergePotentials.size() < TailMergeThreshold; ++I) {
+    if (TriedMerging.count(I))
+      continue;
+    if (I->succ_empty())
+      MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(I), I));
+  }
+
+  // If this is a large problem, avoid visiting the same basic blocks
+  // multiple times.
+  if (MergePotentials.size() == TailMergeThreshold)
+    for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+      TriedMerging.insert(MergePotentials[i].getBlock());
+
+  // See if we can do any tail merging on those.
+  if (MergePotentials.size() >= 2)
+    MadeChange |= TryTailMergeBlocks(NULL, NULL);
+
+  // Look at blocks (IBB) with multiple predecessors (PBB).
+  // We change each predecessor to a canonical form, by
+  // (1) temporarily removing any unconditional branch from the predecessor
+  // to IBB, and
+  // (2) alter conditional branches so they branch to the other block
+  // not IBB; this may require adding back an unconditional branch to IBB
+  // later, where there wasn't one coming in.  E.g.
+  //   Bcc IBB
+  //   fallthrough to QBB
+  // here becomes
+  //   Bncc QBB
+  // with a conceptual B to IBB after that, which never actually exists.
+  // With those changes, we see whether the predecessors' tails match,
+  // and merge them if so.  We change things out of canonical form and
+  // back to the way they were later in the process.  (OptimizeBranches
+  // would undo some of this, but we can't use it, because we'd get into
+  // a compile-time infinite loop repeatedly doing and undoing the same
+  // transformations.)
+
+  for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+       I != E; ++I) {
+    if (I->pred_size() < 2) continue;
+    SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
+    MachineBasicBlock *IBB = I;
+    MachineBasicBlock *PredBB = prior(I);
+    MergePotentials.clear();
+    for (MachineBasicBlock::pred_iterator P = I->pred_begin(),
+           E2 = I->pred_end();
+         P != E2 && MergePotentials.size() < TailMergeThreshold; ++P) {
+      MachineBasicBlock *PBB = *P;
+      if (TriedMerging.count(PBB))
+        continue;
+
+      // Skip blocks that loop to themselves, can't tail merge these.
+      if (PBB == IBB)
+        continue;
+
+      // Visit each predecessor only once.
+      if (!UniquePreds.insert(PBB))
+        continue;
+
+      // Skip blocks which may jump to a landing pad. Can't tail merge these.
+      if (PBB->getLandingPadSuccessor())
+        continue;
+
+      MachineBasicBlock *TBB = 0, *FBB = 0;
+      SmallVector<MachineOperand, 4> Cond;
+      if (!TII->AnalyzeBranch(*PBB, TBB, FBB, Cond, true)) {
+        // Failing case: IBB is the target of a cbr, and we cannot reverse the
+        // branch.
+        SmallVector<MachineOperand, 4> NewCond(Cond);
+        if (!Cond.empty() && TBB == IBB) {
+          if (TII->ReverseBranchCondition(NewCond))
+            continue;
+          // This is the QBB case described above
+          if (!FBB)
+            FBB = llvm::next(MachineFunction::iterator(PBB));
+        }
+
+        // Failing case: the only way IBB can be reached from PBB is via
+        // exception handling.  Happens for landing pads.  Would be nice to have
+        // a bit in the edge so we didn't have to do all this.
+        if (IBB->isLandingPad()) {
+          MachineFunction::iterator IP = PBB;  IP++;
+          MachineBasicBlock *PredNextBB = NULL;
+          if (IP != MF.end())
+            PredNextBB = IP;
+          if (TBB == NULL) {
+            if (IBB != PredNextBB)      // fallthrough
+              continue;
+          } else if (FBB) {
+            if (TBB != IBB && FBB != IBB)   // cbr then ubr
+              continue;
+          } else if (Cond.empty()) {
+            if (TBB != IBB)               // ubr
+              continue;
+          } else {
+            if (TBB != IBB && IBB != PredNextBB)  // cbr
+              continue;
+          }
+        }
+
+        // Remove the unconditional branch at the end, if any.
+        if (TBB && (Cond.empty() || FBB)) {
+          DebugLoc dl;  // FIXME: this is nowhere
+          TII->RemoveBranch(*PBB);
+          if (!Cond.empty())
+            // reinsert conditional branch only, for now
+            TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, 0, NewCond, dl);
+        }
+
+        MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(PBB), *P));
+      }
+    }
+
+    // If this is a large problem, avoid visiting the same basic blocks multiple
+    // times.
+    if (MergePotentials.size() == TailMergeThreshold)
+      for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+        TriedMerging.insert(MergePotentials[i].getBlock());
+
+    if (MergePotentials.size() >= 2)
+      MadeChange |= TryTailMergeBlocks(IBB, PredBB);
+
+    // Reinsert an unconditional branch if needed. The 1 below can occur as a
+    // result of removing blocks in TryTailMergeBlocks.
+    PredBB = prior(I);     // this may have been changed in TryTailMergeBlocks
+    if (MergePotentials.size() == 1 &&
+        MergePotentials.begin()->getBlock() != PredBB)
+      FixTail(MergePotentials.begin()->getBlock(), IBB, TII);
+  }
+
+  return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+//  Branch Optimization
+//===----------------------------------------------------------------------===//
+
+bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
+  bool MadeChange = false;
+
+  // Make sure blocks are numbered in order
+  MF.RenumberBlocks();
+
+  for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+       I != E; ) {
+    MachineBasicBlock *MBB = I++;
+    MadeChange |= OptimizeBlock(MBB);
+
+    // If it is dead, remove it.
+    if (MBB->pred_empty()) {
+      RemoveDeadBlock(MBB);
+      MadeChange = true;
+      ++NumDeadBlocks;
+    }
+  }
+  return MadeChange;
+}
+
+// Blocks should be considered empty if they contain only debug info;
+// else the debug info would affect codegen.
+static bool IsEmptyBlock(MachineBasicBlock *MBB) {
+  if (MBB->empty())
+    return true;
+  for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end();
+       MBBI!=MBBE; ++MBBI) {
+    if (!MBBI->isDebugValue())
+      return false;
+  }
+  return true;
+}
+
+// Blocks with only debug info and branches should be considered the same
+// as blocks with only branches.
+static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator MBBI, MBBE;
+  for (MBBI = MBB->begin(), MBBE = MBB->end(); MBBI!=MBBE; ++MBBI) {
+    if (!MBBI->isDebugValue())
+      break;
+  }
+  return (MBBI->isBranch());
+}
+
+/// IsBetterFallthrough - Return true if it would be clearly better to
+/// fall-through to MBB1 than to fall through into MBB2.  This has to return
+/// a strict ordering, returning true for both (MBB1,MBB2) and (MBB2,MBB1) will
+/// result in infinite loops.
+static bool IsBetterFallthrough(MachineBasicBlock *MBB1,
+                                MachineBasicBlock *MBB2) {
+  // Right now, we use a simple heuristic.  If MBB2 ends with a call, and
+  // MBB1 doesn't, we prefer to fall through into MBB1.  This allows us to
+  // optimize branches that branch to either a return block or an assert block
+  // into a fallthrough to the return.
+  if (IsEmptyBlock(MBB1) || IsEmptyBlock(MBB2)) return false;
+
+  // If there is a clear successor ordering we make sure that one block
+  // will fall through to the next
+  if (MBB1->isSuccessor(MBB2)) return true;
+  if (MBB2->isSuccessor(MBB1)) return false;
+
+  // Neither block consists entirely of debug info (per IsEmptyBlock check),
+  // so we needn't test for falling off the beginning here.
+  MachineBasicBlock::iterator MBB1I = --MBB1->end();
+  while (MBB1I->isDebugValue())
+    --MBB1I;
+  MachineBasicBlock::iterator MBB2I = --MBB2->end();
+  while (MBB2I->isDebugValue())
+    --MBB2I;
+  return MBB2I->isCall() && !MBB1I->isCall();
+}
+
+/// getBranchDebugLoc - Find and return, if any, the DebugLoc of the branch
+/// instructions on the block. Always use the DebugLoc of the first
+/// branching instruction found unless its absent, in which case use the
+/// DebugLoc of the second if present.
+static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return DebugLoc();
+  --I;
+  while (I->isDebugValue() && I != MBB.begin())
+    --I;
+  if (I->isBranch())
+    return I->getDebugLoc();
+  return DebugLoc();
+}
+
+/// OptimizeBlock - Analyze and optimize control flow related to the specified
+/// block.  This is never called on the entry block.
+bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
+  bool MadeChange = false;
+  MachineFunction &MF = *MBB->getParent();
+ReoptimizeBlock:
+
+  MachineFunction::iterator FallThrough = MBB;
+  ++FallThrough;
+
+  // If this block is empty, make everyone use its fall-through, not the block
+  // explicitly.  Landing pads should not do this since the landing-pad table
+  // points to this block.  Blocks with their addresses taken shouldn't be
+  // optimized away.
+  if (IsEmptyBlock(MBB) && !MBB->isLandingPad() && !MBB->hasAddressTaken()) {
+    // Dead block?  Leave for cleanup later.
+    if (MBB->pred_empty()) return MadeChange;
+
+    if (FallThrough == MF.end()) {
+      // TODO: Simplify preds to not branch here if possible!
+    } else {
+      // Rewrite all predecessors of the old block to go to the fallthrough
+      // instead.
+      while (!MBB->pred_empty()) {
+        MachineBasicBlock *Pred = *(MBB->pred_end()-1);
+        Pred->ReplaceUsesOfBlockWith(MBB, FallThrough);
+      }
+      // If MBB was the target of a jump table, update jump tables to go to the
+      // fallthrough instead.
+      if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
+        MJTI->ReplaceMBBInJumpTables(MBB, FallThrough);
+      MadeChange = true;
+    }
+    return MadeChange;
+  }
+
+  // Check to see if we can simplify the terminator of the block before this
+  // one.
+  MachineBasicBlock &PrevBB = *prior(MachineFunction::iterator(MBB));
+
+  MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
+  SmallVector<MachineOperand, 4> PriorCond;
+  bool PriorUnAnalyzable =
+    TII->AnalyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
+  if (!PriorUnAnalyzable) {
+    // If the CFG for the prior block has extra edges, remove them.
+    MadeChange |= PrevBB.CorrectExtraCFGEdges(PriorTBB, PriorFBB,
+                                              !PriorCond.empty());
+
+    // If the previous branch is conditional and both conditions go to the same
+    // destination, remove the branch, replacing it with an unconditional one or
+    // a fall-through.
+    if (PriorTBB && PriorTBB == PriorFBB) {
+      DebugLoc dl = getBranchDebugLoc(PrevBB);
+      TII->RemoveBranch(PrevBB);
+      PriorCond.clear();
+      if (PriorTBB != MBB)
+        TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl);
+      MadeChange = true;
+      ++NumBranchOpts;
+      goto ReoptimizeBlock;
+    }
+
+    // If the previous block unconditionally falls through to this block and
+    // this block has no other predecessors, move the contents of this block
+    // into the prior block. This doesn't usually happen when SimplifyCFG
+    // has been used, but it can happen if tail merging splits a fall-through
+    // predecessor of a block.
+    // This has to check PrevBB->succ_size() because EH edges are ignored by
+    // AnalyzeBranch.
+    if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
+        PrevBB.succ_size() == 1 &&
+        !MBB->hasAddressTaken() && !MBB->isLandingPad()) {
+      DEBUG(dbgs() << "\nMerging into block: " << PrevBB
+                   << "From MBB: " << *MBB);
+      // Remove redundant DBG_VALUEs first.
+      if (PrevBB.begin() != PrevBB.end()) {
+        MachineBasicBlock::iterator PrevBBIter = PrevBB.end();
+        --PrevBBIter;
+        MachineBasicBlock::iterator MBBIter = MBB->begin();
+        // Check if DBG_VALUE at the end of PrevBB is identical to the
+        // DBG_VALUE at the beginning of MBB.
+        while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
+               && PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) {
+          if (!MBBIter->isIdenticalTo(PrevBBIter))
+            break;
+          MachineInstr *DuplicateDbg = MBBIter;
+          ++MBBIter; -- PrevBBIter;
+          DuplicateDbg->eraseFromParent();
+        }
+      }
+      PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
+      PrevBB.removeSuccessor(PrevBB.succ_begin());
+      assert(PrevBB.succ_empty());
+      PrevBB.transferSuccessors(MBB);
+      MadeChange = true;
+      return MadeChange;
+    }
+
+    // If the previous branch *only* branches to *this* block (conditional or
+    // not) remove the branch.
+    if (PriorTBB == MBB && PriorFBB == 0) {
+      TII->RemoveBranch(PrevBB);
+      MadeChange = true;
+      ++NumBranchOpts;
+      goto ReoptimizeBlock;
+    }
+
+    // If the prior block branches somewhere else on the condition and here if
+    // the condition is false, remove the uncond second branch.
+    if (PriorFBB == MBB) {
+      DebugLoc dl = getBranchDebugLoc(PrevBB);
+      TII->RemoveBranch(PrevBB);
+      TII->InsertBranch(PrevBB, PriorTBB, 0, PriorCond, dl);
+      MadeChange = true;
+      ++NumBranchOpts;
+      goto ReoptimizeBlock;
+    }
+
+    // If the prior block branches here on true and somewhere else on false, and
+    // if the branch condition is reversible, reverse the branch to create a
+    // fall-through.
+    if (PriorTBB == MBB) {
+      SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+      if (!TII->ReverseBranchCondition(NewPriorCond)) {
+        DebugLoc dl = getBranchDebugLoc(PrevBB);
+        TII->RemoveBranch(PrevBB);
+        TII->InsertBranch(PrevBB, PriorFBB, 0, NewPriorCond, dl);
+        MadeChange = true;
+        ++NumBranchOpts;
+        goto ReoptimizeBlock;
+      }
+    }
+
+    // If this block has no successors (e.g. it is a return block or ends with
+    // a call to a no-return function like abort or __cxa_throw) and if the pred
+    // falls through into this block, and if it would otherwise fall through
+    // into the block after this, move this block to the end of the function.
+    //
+    // We consider it more likely that execution will stay in the function (e.g.
+    // due to loops) than it is to exit it.  This asserts in loops etc, moving
+    // the assert condition out of the loop body.
+    if (MBB->succ_empty() && !PriorCond.empty() && PriorFBB == 0 &&
+        MachineFunction::iterator(PriorTBB) == FallThrough &&
+        !MBB->canFallThrough()) {
+      bool DoTransform = true;
+
+      // We have to be careful that the succs of PredBB aren't both no-successor
+      // blocks.  If neither have successors and if PredBB is the second from
+      // last block in the function, we'd just keep swapping the two blocks for
+      // last.  Only do the swap if one is clearly better to fall through than
+      // the other.
+      if (FallThrough == --MF.end() &&
+          !IsBetterFallthrough(PriorTBB, MBB))
+        DoTransform = false;
+
+      if (DoTransform) {
+        // Reverse the branch so we will fall through on the previous true cond.
+        SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+        if (!TII->ReverseBranchCondition(NewPriorCond)) {
+          DEBUG(dbgs() << "\nMoving MBB: " << *MBB
+                       << "To make fallthrough to: " << *PriorTBB << "\n");
+
+          DebugLoc dl = getBranchDebugLoc(PrevBB);
+          TII->RemoveBranch(PrevBB);
+          TII->InsertBranch(PrevBB, MBB, 0, NewPriorCond, dl);
+
+          // Move this block to the end of the function.
+          MBB->moveAfter(--MF.end());
+          MadeChange = true;
+          ++NumBranchOpts;
+          return MadeChange;
+        }
+      }
+    }
+  }
+
+  // Analyze the branch in the current block.
+  MachineBasicBlock *CurTBB = 0, *CurFBB = 0;
+  SmallVector<MachineOperand, 4> CurCond;
+  bool CurUnAnalyzable= TII->AnalyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
+  if (!CurUnAnalyzable) {
+    // If the CFG for the prior block has extra edges, remove them.
+    MadeChange |= MBB->CorrectExtraCFGEdges(CurTBB, CurFBB, !CurCond.empty());
+
+    // If this is a two-way branch, and the FBB branches to this block, reverse
+    // the condition so the single-basic-block loop is faster.  Instead of:
+    //    Loop: xxx; jcc Out; jmp Loop
+    // we want:
+    //    Loop: xxx; jncc Loop; jmp Out
+    if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) {
+      SmallVector<MachineOperand, 4> NewCond(CurCond);
+      if (!TII->ReverseBranchCondition(NewCond)) {
+        DebugLoc dl = getBranchDebugLoc(*MBB);
+        TII->RemoveBranch(*MBB);
+        TII->InsertBranch(*MBB, CurFBB, CurTBB, NewCond, dl);
+        MadeChange = true;
+        ++NumBranchOpts;
+        goto ReoptimizeBlock;
+      }
+    }
+
+    // If this branch is the only thing in its block, see if we can forward
+    // other blocks across it.
+    if (CurTBB && CurCond.empty() && CurFBB == 0 &&
+        IsBranchOnlyBlock(MBB) && CurTBB != MBB &&
+        !MBB->hasAddressTaken()) {
+      DebugLoc dl = getBranchDebugLoc(*MBB);
+      // This block may contain just an unconditional branch.  Because there can
+      // be 'non-branch terminators' in the block, try removing the branch and
+      // then seeing if the block is empty.
+      TII->RemoveBranch(*MBB);
+      // If the only things remaining in the block are debug info, remove these
+      // as well, so this will behave the same as an empty block in non-debug
+      // mode.
+      if (!MBB->empty()) {
+        bool NonDebugInfoFound = false;
+        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+             I != E; ++I) {
+          if (!I->isDebugValue()) {
+            NonDebugInfoFound = true;
+            break;
+          }
+        }
+        if (!NonDebugInfoFound)
+          // Make the block empty, losing the debug info (we could probably
+          // improve this in some cases.)
+          MBB->erase(MBB->begin(), MBB->end());
+      }
+      // If this block is just an unconditional branch to CurTBB, we can
+      // usually completely eliminate the block.  The only case we cannot
+      // completely eliminate the block is when the block before this one
+      // falls through into MBB and we can't understand the prior block's branch
+      // condition.
+      if (MBB->empty()) {
+        bool PredHasNoFallThrough = !PrevBB.canFallThrough();
+        if (PredHasNoFallThrough || !PriorUnAnalyzable ||
+            !PrevBB.isSuccessor(MBB)) {
+          // If the prior block falls through into us, turn it into an
+          // explicit branch to us to make updates simpler.
+          if (!PredHasNoFallThrough && PrevBB.isSuccessor(MBB) &&
+              PriorTBB != MBB && PriorFBB != MBB) {
+            if (PriorTBB == 0) {
+              assert(PriorCond.empty() && PriorFBB == 0 &&
+                     "Bad branch analysis");
+              PriorTBB = MBB;
+            } else {
+              assert(PriorFBB == 0 && "Machine CFG out of date!");
+              PriorFBB = MBB;
+            }
+            DebugLoc pdl = getBranchDebugLoc(PrevBB);
+            TII->RemoveBranch(PrevBB);
+            TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, pdl);
+          }
+
+          // Iterate through all the predecessors, revectoring each in-turn.
+          size_t PI = 0;
+          bool DidChange = false;
+          bool HasBranchToSelf = false;
+          while(PI != MBB->pred_size()) {
+            MachineBasicBlock *PMBB = *(MBB->pred_begin() + PI);
+            if (PMBB == MBB) {
+              // If this block has an uncond branch to itself, leave it.
+              ++PI;
+              HasBranchToSelf = true;
+            } else {
+              DidChange = true;
+              PMBB->ReplaceUsesOfBlockWith(MBB, CurTBB);
+              // If this change resulted in PMBB ending in a conditional
+              // branch where both conditions go to the same destination,
+              // change this to an unconditional branch (and fix the CFG).
+              MachineBasicBlock *NewCurTBB = 0, *NewCurFBB = 0;
+              SmallVector<MachineOperand, 4> NewCurCond;
+              bool NewCurUnAnalyzable = TII->AnalyzeBranch(*PMBB, NewCurTBB,
+                      NewCurFBB, NewCurCond, true);
+              if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) {
+                DebugLoc pdl = getBranchDebugLoc(*PMBB);
+                TII->RemoveBranch(*PMBB);
+                NewCurCond.clear();
+                TII->InsertBranch(*PMBB, NewCurTBB, 0, NewCurCond, pdl);
+                MadeChange = true;
+                ++NumBranchOpts;
+                PMBB->CorrectExtraCFGEdges(NewCurTBB, 0, false);
+              }
+            }
+          }
+
+          // Change any jumptables to go to the new MBB.
+          if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
+            MJTI->ReplaceMBBInJumpTables(MBB, CurTBB);
+          if (DidChange) {
+            ++NumBranchOpts;
+            MadeChange = true;
+            if (!HasBranchToSelf) return MadeChange;
+          }
+        }
+      }
+
+      // Add the branch back if the block is more than just an uncond branch.
+      TII->InsertBranch(*MBB, CurTBB, 0, CurCond, dl);
+    }
+  }
+
+  // If the prior block doesn't fall through into this block, and if this
+  // block doesn't fall through into some other block, see if we can find a
+  // place to move this block where a fall-through will happen.
+  if (!PrevBB.canFallThrough()) {
+
+    // Now we know that there was no fall-through into this block, check to
+    // see if it has a fall-through into its successor.
+    bool CurFallsThru = MBB->canFallThrough();
+
+    if (!MBB->isLandingPad()) {
+      // Check all the predecessors of this block.  If one of them has no fall
+      // throughs, move this block right after it.
+      for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+           E = MBB->pred_end(); PI != E; ++PI) {
+        // Analyze the branch at the end of the pred.
+        MachineBasicBlock *PredBB = *PI;
+        MachineFunction::iterator PredFallthrough = PredBB; ++PredFallthrough;
+        MachineBasicBlock *PredTBB = 0, *PredFBB = 0;
+        SmallVector<MachineOperand, 4> PredCond;
+        if (PredBB != MBB && !PredBB->canFallThrough() &&
+            !TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true)
+            && (!CurFallsThru || !CurTBB || !CurFBB)
+            && (!CurFallsThru || MBB->getNumber() >= PredBB->getNumber())) {
+          // If the current block doesn't fall through, just move it.
+          // If the current block can fall through and does not end with a
+          // conditional branch, we need to append an unconditional jump to
+          // the (current) next block.  To avoid a possible compile-time
+          // infinite loop, move blocks only backward in this case.
+          // Also, if there are already 2 branches here, we cannot add a third;
+          // this means we have the case
+          // Bcc next
+          // B elsewhere
+          // next:
+          if (CurFallsThru) {
+            MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+            CurCond.clear();
+            TII->InsertBranch(*MBB, NextBB, 0, CurCond, DebugLoc());
+          }
+          MBB->moveAfter(PredBB);
+          MadeChange = true;
+          goto ReoptimizeBlock;
+        }
+      }
+    }
+
+    if (!CurFallsThru) {
+      // Check all successors to see if we can move this block before it.
+      for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+           E = MBB->succ_end(); SI != E; ++SI) {
+        // Analyze the branch at the end of the block before the succ.
+        MachineBasicBlock *SuccBB = *SI;
+        MachineFunction::iterator SuccPrev = SuccBB; --SuccPrev;
+
+        // If this block doesn't already fall-through to that successor, and if
+        // the succ doesn't already have a block that can fall through into it,
+        // and if the successor isn't an EH destination, we can arrange for the
+        // fallthrough to happen.
+        if (SuccBB != MBB && &*SuccPrev != MBB &&
+            !SuccPrev->canFallThrough() && !CurUnAnalyzable &&
+            !SuccBB->isLandingPad()) {
+          MBB->moveBefore(SuccBB);
+          MadeChange = true;
+          goto ReoptimizeBlock;
+        }
+      }
+
+      // Okay, there is no really great place to put this block.  If, however,
+      // the block before this one would be a fall-through if this block were
+      // removed, move this block to the end of the function.
+      MachineBasicBlock *PrevTBB = 0, *PrevFBB = 0;
+      SmallVector<MachineOperand, 4> PrevCond;
+      if (FallThrough != MF.end() &&
+          !TII->AnalyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
+          PrevBB.isSuccessor(FallThrough)) {
+        MBB->moveAfter(--MF.end());
+        MadeChange = true;
+        return MadeChange;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+//  Hoist Common Code
+//===----------------------------------------------------------------------===//
+
+/// HoistCommonCode - Hoist common instruction sequences at the start of basic
+/// blocks to their common predecessor.
+bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
+  bool MadeChange = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
+    MachineBasicBlock *MBB = I++;
+    MadeChange |= HoistCommonCodeInSuccs(MBB);
+  }
+
+  return MadeChange;
+}
+
+/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
+/// its 'true' successor.
+static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
+                                         MachineBasicBlock *TrueBB) {
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         E = BB->succ_end(); SI != E; ++SI) {
+    MachineBasicBlock *SuccBB = *SI;
+    if (SuccBB != TrueBB)
+      return SuccBB;
+  }
+  return NULL;
+}
+
+/// findHoistingInsertPosAndDeps - Find the location to move common instructions
+/// in successors to. The location is usually just before the terminator,
+/// however if the terminator is a conditional branch and its previous
+/// instruction is the flag setting instruction, the previous instruction is
+/// the preferred location. This function also gathers uses and defs of the
+/// instructions from the insertion point to the end of the block. The data is
+/// used by HoistCommonCodeInSuccs to ensure safety.
+static
+MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
+                                                  const TargetInstrInfo *TII,
+                                                  const TargetRegisterInfo *TRI,
+                                                  SmallSet<unsigned,4> &Uses,
+                                                  SmallSet<unsigned,4> &Defs) {
+  MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
+  if (!TII->isUnpredicatedTerminator(Loc))
+    return MBB->end();
+
+  for (unsigned i = 0, e = Loc->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Loc->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isUse()) {
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        Uses.insert(*AI);
+    } else if (!MO.isDead())
+      // Don't try to hoist code in the rare case the terminator defines a
+      // register that is later used.
+      return MBB->end();
+  }
+
+  if (Uses.empty())
+    return Loc;
+  if (Loc == MBB->begin())
+    return MBB->end();
+
+  // The terminator is probably a conditional branch, try not to separate the
+  // branch from condition setting instruction.
+  MachineBasicBlock::iterator PI = Loc;
+  --PI;
+  while (PI != MBB->begin() && Loc->isDebugValue())
+    --PI;
+
+  bool IsDef = false;
+  for (unsigned i = 0, e = PI->getNumOperands(); !IsDef && i != e; ++i) {
+    const MachineOperand &MO = PI->getOperand(i);
+    // If PI has a regmask operand, it is probably a call. Separate away.
+    if (MO.isRegMask())
+      return Loc;
+    if (!MO.isReg() || MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (Uses.count(Reg))
+      IsDef = true;
+  }
+  if (!IsDef)
+    // The condition setting instruction is not just before the conditional
+    // branch.
+    return Loc;
+
+  // Be conservative, don't insert instruction above something that may have
+  // side-effects. And since it's potentially bad to separate flag setting
+  // instruction from the conditional branch, just abort the optimization
+  // completely.
+  // Also avoid moving code above predicated instruction since it's hard to
+  // reason about register liveness with predicated instruction.
+  bool DontMoveAcrossStore = true;
+  if (!PI->isSafeToMove(TII, 0, DontMoveAcrossStore) ||
+      TII->isPredicated(PI))
+    return MBB->end();
+
+
+  // Find out what registers are live. Note this routine is ignoring other live
+  // registers which are only used by instructions in successor blocks.
+  for (unsigned i = 0, e = PI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = PI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isUse()) {
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        Uses.insert(*AI);
+    } else {
+      if (Uses.erase(Reg)) {
+        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+          Uses.erase(*SubRegs); // Use sub-registers to be conservative
+      }
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        Defs.insert(*AI);
+    }
+  }
+
+  return PI;
+}
+
+/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction
+/// sequence at the start of the function, move the instructions before MBB
+/// terminator if it's legal.
+bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
+    return false;
+
+  if (!FBB) FBB = findFalseBlock(MBB, TBB);
+  if (!FBB)
+    // Malformed bcc? True and false blocks are the same?
+    return false;
+
+  // Restrict the optimization to cases where MBB is the only predecessor,
+  // it is an obvious win.
+  if (TBB->pred_size() > 1 || FBB->pred_size() > 1)
+    return false;
+
+  // Find a suitable position to hoist the common instructions to. Also figure
+  // out which registers are used or defined by instructions from the insertion
+  // point to the end of the block.
+  SmallSet<unsigned, 4> Uses, Defs;
+  MachineBasicBlock::iterator Loc =
+    findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs);
+  if (Loc == MBB->end())
+    return false;
+
+  bool HasDups = false;
+  SmallVector<unsigned, 4> LocalDefs;
+  SmallSet<unsigned, 4> LocalDefsSet;
+  MachineBasicBlock::iterator TIB = TBB->begin();
+  MachineBasicBlock::iterator FIB = FBB->begin();
+  MachineBasicBlock::iterator TIE = TBB->end();
+  MachineBasicBlock::iterator FIE = FBB->end();
+  while (TIB != TIE && FIB != FIE) {
+    // Skip dbg_value instructions. These do not count.
+    if (TIB->isDebugValue()) {
+      while (TIB != TIE && TIB->isDebugValue())
+        ++TIB;
+      if (TIB == TIE)
+        break;
+    }
+    if (FIB->isDebugValue()) {
+      while (FIB != FIE && FIB->isDebugValue())
+        ++FIB;
+      if (FIB == FIE)
+        break;
+    }
+    if (!TIB->isIdenticalTo(FIB, MachineInstr::CheckKillDead))
+      break;
+
+    if (TII->isPredicated(TIB))
+      // Hard to reason about register liveness with predicated instruction.
+      break;
+
+    bool IsSafe = true;
+    for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = TIB->getOperand(i);
+      // Don't attempt to hoist instructions with register masks.
+      if (MO.isRegMask()) {
+        IsSafe = false;
+        break;
+      }
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      if (MO.isDef()) {
+        if (Uses.count(Reg)) {
+          // Avoid clobbering a register that's used by the instruction at
+          // the point of insertion.
+          IsSafe = false;
+          break;
+        }
+
+        if (Defs.count(Reg) && !MO.isDead()) {
+          // Don't hoist the instruction if the def would be clobber by the
+          // instruction at the point insertion. FIXME: This is overly
+          // conservative. It should be possible to hoist the instructions
+          // in BB2 in the following example:
+          // BB1:
+          // r1, eflag = op1 r2, r3
+          // brcc eflag
+          //
+          // BB2:
+          // r1 = op2, ...
+          //    = op3, r1<kill>
+          IsSafe = false;
+          break;
+        }
+      } else if (!LocalDefsSet.count(Reg)) {
+        if (Defs.count(Reg)) {
+          // Use is defined by the instruction at the point of insertion.
+          IsSafe = false;
+          break;
+        }
+
+        if (MO.isKill() && Uses.count(Reg))
+          // Kills a register that's read by the instruction at the point of
+          // insertion. Remove the kill marker.
+          MO.setIsKill(false);
+      }
+    }
+    if (!IsSafe)
+      break;
+
+    bool DontMoveAcrossStore = true;
+    if (!TIB->isSafeToMove(TII, 0, DontMoveAcrossStore))
+      break;
+
+    // Remove kills from LocalDefsSet, these registers had short live ranges.
+    for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = TIB->getOperand(i);
+      if (!MO.isReg() || !MO.isUse() || !MO.isKill())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg || !LocalDefsSet.count(Reg))
+        continue;
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        LocalDefsSet.erase(*AI);
+    }
+
+    // Track local defs so we can update liveins.
+    for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = TIB->getOperand(i);
+      if (!MO.isReg() || !MO.isDef() || MO.isDead())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      LocalDefs.push_back(Reg);
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        LocalDefsSet.insert(*AI);
+    }
+
+    HasDups = true;
+    ++TIB;
+    ++FIB;
+  }
+
+  if (!HasDups)
+    return false;
+
+  MBB->splice(Loc, TBB, TBB->begin(), TIB);
+  FBB->erase(FBB->begin(), FIB);
+
+  // Update livein's.
+  for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+    unsigned Def = LocalDefs[i];
+    if (LocalDefsSet.count(Def)) {
+      TBB->addLiveIn(Def);
+      FBB->addLiveIn(Def);
+    }
+  }
+
+  ++NumHoist;
+  return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.h b/contrib/llvm/lib/CodeGen/BranchFolding.h
new file mode 100644
index 000000000000..df795dfc248e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/BranchFolding.h
@@ -0,0 +1,123 @@
+//===-- BranchFolding.h - Fold machine code branch instructions --*- C++ -*===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_BRANCHFOLDING_HPP
+#define LLVM_CODEGEN_BRANCHFOLDING_HPP
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <vector>
+
+namespace llvm {
+  class MachineFunction;
+  class MachineModuleInfo;
+  class RegScavenger;
+  class TargetInstrInfo;
+  class TargetRegisterInfo;
+
+  class BranchFolder {
+  public:
+    explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist);
+
+    bool OptimizeFunction(MachineFunction &MF,
+                          const TargetInstrInfo *tii,
+                          const TargetRegisterInfo *tri,
+                          MachineModuleInfo *mmi);
+  private:
+    class MergePotentialsElt {
+      unsigned Hash;
+      MachineBasicBlock *Block;
+    public:
+      MergePotentialsElt(unsigned h, MachineBasicBlock *b)
+        : Hash(h), Block(b) {}
+
+      unsigned getHash() const { return Hash; }
+      MachineBasicBlock *getBlock() const { return Block; }
+
+      void setBlock(MachineBasicBlock *MBB) {
+        Block = MBB;
+      }
+
+      bool operator<(const MergePotentialsElt &) const;
+    };
+    typedef std::vector<MergePotentialsElt>::iterator MPIterator;
+    std::vector<MergePotentialsElt> MergePotentials;
+    SmallPtrSet<const MachineBasicBlock*, 2> TriedMerging;
+
+    class SameTailElt {
+      MPIterator MPIter;
+      MachineBasicBlock::iterator TailStartPos;
+    public:
+      SameTailElt(MPIterator mp, MachineBasicBlock::iterator tsp)
+        : MPIter(mp), TailStartPos(tsp) {}
+
+      MPIterator getMPIter() const {
+        return MPIter;
+      }
+      MergePotentialsElt &getMergePotentialsElt() const {
+        return *getMPIter();
+      }
+      MachineBasicBlock::iterator getTailStartPos() const {
+        return TailStartPos;
+      }
+      unsigned getHash() const {
+        return getMergePotentialsElt().getHash();
+      }
+      MachineBasicBlock *getBlock() const {
+        return getMergePotentialsElt().getBlock();
+      }
+      bool tailIsWholeBlock() const {
+        return TailStartPos == getBlock()->begin();
+      }
+
+      void setBlock(MachineBasicBlock *MBB) {
+        getMergePotentialsElt().setBlock(MBB);
+      }
+      void setTailStartPos(MachineBasicBlock::iterator Pos) {
+        TailStartPos = Pos;
+      }
+    };
+    std::vector<SameTailElt> SameTails;
+
+    bool EnableTailMerge;
+    bool EnableHoistCommonCode;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineModuleInfo *MMI;
+    RegScavenger *RS;
+
+    bool TailMergeBlocks(MachineFunction &MF);
+    bool TryTailMergeBlocks(MachineBasicBlock* SuccBB,
+                       MachineBasicBlock* PredBB);
+    void MaintainLiveIns(MachineBasicBlock *CurMBB,
+                         MachineBasicBlock *NewMBB);
+    void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+                                 MachineBasicBlock *NewDest);
+    MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
+                                  MachineBasicBlock::iterator BBI1);
+    unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength,
+                              MachineBasicBlock *SuccBB,
+                              MachineBasicBlock *PredBB);
+    void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,
+                                                MachineBasicBlock* PredBB);
+    bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+                                   unsigned maxCommonTailLength,
+                                   unsigned &commonTailIndex);
+
+    bool OptimizeBranches(MachineFunction &MF);
+    bool OptimizeBlock(MachineBasicBlock *MBB);
+    void RemoveDeadBlock(MachineBasicBlock *MBB);
+    bool OptimizeImpDefsBlock(MachineBasicBlock *MBB);
+
+    bool HoistCommonCode(MachineFunction &MF);
+    bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB);
+  };
+}
+
+#endif /* LLVM_CODEGEN_BRANCHFOLDING_HPP */
diff --git a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
new file mode 100644
index 000000000000..dee339a45863
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -0,0 +1,202 @@
+//===------------------------ CalcSpillWeights.cpp ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "calcspillweights"
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+char CalculateSpillWeights::ID = 0;
+INITIALIZE_PASS_BEGIN(CalculateSpillWeights, "calcspillweights",
+                "Calculate spill weights", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(CalculateSpillWeights, "calcspillweights",
+                "Calculate spill weights", false, false)
+
+void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
+  au.addRequired<LiveIntervals>();
+  au.addRequired<MachineLoopInfo>();
+  au.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(au);
+}
+
+bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &MF) {
+
+  DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+
+  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  VirtRegAuxInfo VRAI(MF, LIS, getAnalysis<MachineLoopInfo>());
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI.reg_nodbg_empty(Reg))
+      continue;
+    VRAI.CalculateWeightAndHint(LIS.getInterval(Reg));
+  }
+  return false;
+}
+
+// Return the preferred allocation register for reg, given a COPY instruction.
+static unsigned copyHint(const MachineInstr *mi, unsigned reg,
+                         const TargetRegisterInfo &tri,
+                         const MachineRegisterInfo &mri) {
+  unsigned sub, hreg, hsub;
+  if (mi->getOperand(0).getReg() == reg) {
+    sub = mi->getOperand(0).getSubReg();
+    hreg = mi->getOperand(1).getReg();
+    hsub = mi->getOperand(1).getSubReg();
+  } else {
+    sub = mi->getOperand(1).getSubReg();
+    hreg = mi->getOperand(0).getReg();
+    hsub = mi->getOperand(0).getSubReg();
+  }
+
+  if (!hreg)
+    return 0;
+
+  if (TargetRegisterInfo::isVirtualRegister(hreg))
+    return sub == hsub ? hreg : 0;
+
+  const TargetRegisterClass *rc = mri.getRegClass(reg);
+
+  // Only allow physreg hints in rc.
+  if (sub == 0)
+    return rc->contains(hreg) ? hreg : 0;
+
+  // reg:sub should match the physreg hreg.
+  return tri.getMatchingSuperReg(hreg, sub, rc);
+}
+
+// Check if all values in LI are rematerializable
+static bool isRematerializable(const LiveInterval &LI,
+                               const LiveIntervals &LIS,
+                               const TargetInstrInfo &TII) {
+  for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+       I != E; ++I) {
+    const VNInfo *VNI = *I;
+    if (VNI->isUnused())
+      continue;
+    if (VNI->isPHIDef())
+      return false;
+
+    MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+    assert(MI && "Dead valno in interval");
+
+    if (!TII.isTriviallyReMaterializable(MI, LIS.getAliasAnalysis()))
+      return false;
+  }
+  return true;
+}
+
+void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
+  MachineRegisterInfo &mri = MF.getRegInfo();
+  const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo();
+  MachineBasicBlock *mbb = 0;
+  MachineLoop *loop = 0;
+  unsigned loopDepth = 0;
+  bool isExiting = false;
+  float totalWeight = 0;
+  SmallPtrSet<MachineInstr*, 8> visited;
+
+  // Find the best physreg hist and the best virtreg hint.
+  float bestPhys = 0, bestVirt = 0;
+  unsigned hintPhys = 0, hintVirt = 0;
+
+  // Don't recompute a target specific hint.
+  bool noHint = mri.getRegAllocationHint(li.reg).first != 0;
+
+  // Don't recompute spill weight for an unspillable register.
+  bool Spillable = li.isSpillable();
+
+  for (MachineRegisterInfo::reg_iterator I = mri.reg_begin(li.reg);
+       MachineInstr *mi = I.skipInstruction();) {
+    if (mi->isIdentityCopy() || mi->isImplicitDef() || mi->isDebugValue())
+      continue;
+    if (!visited.insert(mi))
+      continue;
+
+    float weight = 1.0f;
+    if (Spillable) {
+      // Get loop info for mi.
+      if (mi->getParent() != mbb) {
+        mbb = mi->getParent();
+        loop = Loops.getLoopFor(mbb);
+        loopDepth = loop ? loop->getLoopDepth() : 0;
+        isExiting = loop ? loop->isLoopExiting(mbb) : false;
+      }
+
+      // Calculate instr weight.
+      bool reads, writes;
+      tie(reads, writes) = mi->readsWritesVirtualRegister(li.reg);
+      weight = LiveIntervals::getSpillWeight(writes, reads, loopDepth);
+
+      // Give extra weight to what looks like a loop induction variable update.
+      if (writes && isExiting && LIS.isLiveOutOfMBB(li, mbb))
+        weight *= 3;
+
+      totalWeight += weight;
+    }
+
+    // Get allocation hints from copies.
+    if (noHint || !mi->isCopy())
+      continue;
+    unsigned hint = copyHint(mi, li.reg, tri, mri);
+    if (!hint)
+      continue;
+    float hweight = Hint[hint] += weight;
+    if (TargetRegisterInfo::isPhysicalRegister(hint)) {
+      if (hweight > bestPhys && mri.isAllocatable(hint))
+        bestPhys = hweight, hintPhys = hint;
+    } else {
+      if (hweight > bestVirt)
+        bestVirt = hweight, hintVirt = hint;
+    }
+  }
+
+  Hint.clear();
+
+  // Always prefer the physreg hint.
+  if (unsigned hint = hintPhys ? hintPhys : hintVirt) {
+    mri.setRegAllocationHint(li.reg, 0, hint);
+    // Weakly boost the spill weight of hinted registers.
+    totalWeight *= 1.01F;
+  }
+
+  // If the live interval was already unspillable, leave it that way.
+  if (!Spillable)
+    return;
+
+  // Mark li as unspillable if all live ranges are tiny.
+  if (li.isZeroLength(LIS.getSlotIndexes())) {
+    li.markNotSpillable();
+    return;
+  }
+
+  // If all of the definitions of the interval are re-materializable,
+  // it is a preferred candidate for spilling.
+  // FIXME: this gets much more complicated once we support non-trivial
+  // re-materialization.
+  if (isRematerializable(li, LIS, *MF.getTarget().getInstrInfo()))
+    totalWeight *= 0.5F;
+
+  li.weight = normalizeSpillWeight(totalWeight, li.getSize());
+}
diff --git a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp
new file mode 100644
index 000000000000..f1d4ace92273
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp
@@ -0,0 +1,180 @@
+//===-- CallingConvLower.cpp - Calling Conventions ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CCState class, used for lowering and implementing
+// calling conventions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
+                 const TargetMachine &tm, SmallVector<CCValAssign, 16> &locs,
+                 LLVMContext &C)
+  : CallingConv(CC), IsVarArg(isVarArg), MF(mf), TM(tm),
+    TRI(*TM.getRegisterInfo()), Locs(locs), Context(C),
+    CallOrPrologue(Unknown) {
+  // No stack is used.
+  StackOffset = 0;
+
+  clearFirstByValReg();
+  UsedRegs.resize((TRI.getNumRegs()+31)/32);
+}
+
+// HandleByVal - Allocate space on the stack large enough to pass an argument
+// by value. The size and alignment information of the argument is encoded in
+// its parameter attribute.
+void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
+                          MVT LocVT, CCValAssign::LocInfo LocInfo,
+                          int MinSize, int MinAlign,
+                          ISD::ArgFlagsTy ArgFlags) {
+  unsigned Align = ArgFlags.getByValAlign();
+  unsigned Size  = ArgFlags.getByValSize();
+  if (MinSize > (int)Size)
+    Size = MinSize;
+  if (MinAlign > (int)Align)
+    Align = MinAlign;
+  MF.getFrameInfo()->ensureMaxAlignment(Align);
+  TM.getTargetLowering()->HandleByVal(this, Size, Align);
+  unsigned Offset = AllocateStack(Size, Align);
+  addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+}
+
+/// MarkAllocated - Mark a register and all of its aliases as allocated.
+void CCState::MarkAllocated(unsigned Reg) {
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    UsedRegs[*AI/32] |= 1 << (*AI&31);
+}
+
+/// AnalyzeFormalArguments - Analyze an array of argument values,
+/// incorporating info about the formals into this state.
+void
+CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                CCAssignFn Fn) {
+  unsigned NumArgs = Ins.size();
+
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    MVT ArgVT = Ins[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Formal argument #" << i << " has unhandled type "
+             << EVT(ArgVT).getEVTString() << '\n';
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+/// CheckReturn - Analyze the return values of a function, returning true if
+/// the return can be performed without sret-demotion, and false otherwise.
+bool CCState::CheckReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                          CCAssignFn Fn) {
+  // Determine which register each value should be copied into.
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    MVT VT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this))
+      return false;
+  }
+  return true;
+}
+
+/// AnalyzeReturn - Analyze the returned values of a return,
+/// incorporating info about the result values into this state.
+void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                            CCAssignFn Fn) {
+  // Determine which register each value should be copied into.
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    MVT VT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Return operand #" << i << " has unhandled type "
+             << EVT(VT).getEVTString() << '\n';
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+/// AnalyzeCallOperands - Analyze the outgoing arguments to a call,
+/// incorporating info about the passed values into this state.
+void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  CCAssignFn Fn) {
+  unsigned NumOps = Outs.size();
+  for (unsigned i = 0; i != NumOps; ++i) {
+    MVT ArgVT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Call operand #" << i << " has unhandled type "
+             << EVT(ArgVT).getEVTString() << '\n';
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+/// AnalyzeCallOperands - Same as above except it takes vectors of types
+/// and argument flags.
+void CCState::AnalyzeCallOperands(SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                                  CCAssignFn Fn) {
+  unsigned NumOps = ArgVTs.size();
+  for (unsigned i = 0; i != NumOps; ++i) {
+    MVT ArgVT = ArgVTs[i];
+    ISD::ArgFlagsTy ArgFlags = Flags[i];
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Call operand #" << i << " has unhandled type "
+             << EVT(ArgVT).getEVTString() << '\n';
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+/// AnalyzeCallResult - Analyze the return values of a call,
+/// incorporating info about the passed values into this state.
+void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                CCAssignFn Fn) {
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    MVT VT = Ins[i].VT;
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+    if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Call result #" << i << " has unhandled type "
+             << EVT(VT).getEVTString() << '\n';
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+/// AnalyzeCallResult - Same as above except it's specialized for calls which
+/// produce a single value.
+void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) {
+  if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) {
+#ifndef NDEBUG
+    dbgs() << "Call result has unhandled type "
+           << EVT(VT).getEVTString() << '\n';
+#endif
+    llvm_unreachable(0);
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm/lib/CodeGen/CodeGen.cpp
new file mode 100644
index 000000000000..35ec68d00cec
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CodeGen.cpp
@@ -0,0 +1,77 @@
+//===-- CodeGen.cpp -------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common initialization routines for the
+// CodeGen library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm-c/Initialization.h"
+
+using namespace llvm;
+
+/// initializeCodeGen - Initialize all passes linked into the CodeGen library.
+void llvm::initializeCodeGen(PassRegistry &Registry) {
+  initializeBasicTTIPass(Registry);
+  initializeBranchFolderPassPass(Registry);
+  initializeCalculateSpillWeightsPass(Registry);
+  initializeDeadMachineInstructionElimPass(Registry);
+  initializeEarlyIfConverterPass(Registry);
+  initializeExpandPostRAPass(Registry);
+  initializeExpandISelPseudosPass(Registry);
+  initializeFinalizeMachineBundlesPass(Registry);
+  initializeGCMachineCodeAnalysisPass(Registry);
+  initializeGCModuleInfoPass(Registry);
+  initializeIfConverterPass(Registry);
+  initializeLiveDebugVariablesPass(Registry);
+  initializeLiveIntervalsPass(Registry);
+  initializeLiveStacksPass(Registry);
+  initializeLiveVariablesPass(Registry);
+  initializeLocalStackSlotPassPass(Registry);
+  initializeMachineBlockFrequencyInfoPass(Registry);
+  initializeMachineBlockPlacementPass(Registry);
+  initializeMachineBlockPlacementStatsPass(Registry);
+  initializeMachineCopyPropagationPass(Registry);
+  initializeMachineCSEPass(Registry);
+  initializeMachineDominatorTreePass(Registry);
+  initializeMachinePostDominatorTreePass(Registry);
+  initializeMachineLICMPass(Registry);
+  initializeMachineLoopInfoPass(Registry);
+  initializeMachineModuleInfoPass(Registry);
+  initializeMachineSchedulerPass(Registry);
+  initializeMachineSinkingPass(Registry);
+  initializeMachineVerifierPassPass(Registry);
+  initializeOptimizePHIsPass(Registry);
+  initializePHIEliminationPass(Registry);
+  initializePeepholeOptimizerPass(Registry);
+  initializePostRASchedulerPass(Registry);
+  initializeProcessImplicitDefsPass(Registry);
+  initializePEIPass(Registry);
+  initializeRegisterCoalescerPass(Registry);
+  initializeSlotIndexesPass(Registry);
+  initializeStackProtectorPass(Registry);
+  initializeStackColoringPass(Registry);
+  initializeStackSlotColoringPass(Registry);
+  initializeStrongPHIEliminationPass(Registry);
+  initializeTailDuplicatePassPass(Registry);
+  initializeTargetPassConfigPass(Registry);
+  initializeTwoAddressInstructionPassPass(Registry);
+  initializeUnpackMachineBundlesPass(Registry);
+  initializeUnreachableBlockElimPass(Registry);
+  initializeUnreachableMachineBlockElimPass(Registry);
+  initializeVirtRegMapPass(Registry);
+  initializeVirtRegRewriterPass(Registry);
+  initializeLowerIntrinsicsPass(Registry);
+  initializeMachineFunctionPrinterPassPass(Registry);
+}
+
+void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
+  initializeCodeGen(*unwrap(R));
+}
diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
new file mode 100644
index 000000000000..0eb74a40d589
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -0,0 +1,658 @@
+//===----- CriticalAntiDepBreaker.cpp - Anti-dep breaker -------- ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CriticalAntiDepBreaker class, which
+// implements register anti-dependence breaking along a blocks
+// critical path during post-RA scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "post-RA-sched"
+#include "CriticalAntiDepBreaker.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+CriticalAntiDepBreaker::
+CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo &RCI) :
+  AntiDepBreaker(), MF(MFi),
+  MRI(MF.getRegInfo()),
+  TII(MF.getTarget().getInstrInfo()),
+  TRI(MF.getTarget().getRegisterInfo()),
+  RegClassInfo(RCI),
+  Classes(TRI->getNumRegs(), static_cast<const TargetRegisterClass *>(0)),
+  KillIndices(TRI->getNumRegs(), 0),
+  DefIndices(TRI->getNumRegs(), 0),
+  KeepRegs(TRI->getNumRegs(), false) {}
+
+CriticalAntiDepBreaker::~CriticalAntiDepBreaker() {
+}
+
+void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
+  const unsigned BBSize = BB->size();
+  for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) {
+    // Clear out the register class data.
+    Classes[i] = static_cast<const TargetRegisterClass *>(0);
+
+    // Initialize the indices to indicate that no registers are live.
+    KillIndices[i] = ~0u;
+    DefIndices[i] = BBSize;
+  }
+
+  // Clear "do not change" set.
+  KeepRegs.reset();
+
+  bool IsReturnBlock = (BBSize != 0 && BB->back().isReturn());
+
+  // Examine the live-in regs of all successors.
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         SE = BB->succ_end(); SI != SE; ++SI)
+    for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+           E = (*SI)->livein_end(); I != E; ++I) {
+      for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+        unsigned Reg = *AI;
+        Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+        KillIndices[Reg] = BBSize;
+        DefIndices[Reg] = ~0u;
+      }
+    }
+
+  // Mark live-out callee-saved registers. In a return block this is
+  // all callee-saved registers. In non-return this is any
+  // callee-saved register that is not saved in the prolog.
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  BitVector Pristine = MFI->getPristineRegs(BB);
+  for (const uint16_t *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
+    if (!IsReturnBlock && !Pristine.test(*I)) continue;
+    for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+      unsigned Reg = *AI;
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+      KillIndices[Reg] = BBSize;
+      DefIndices[Reg] = ~0u;
+    }
+  }
+}
+
+void CriticalAntiDepBreaker::FinishBlock() {
+  RegRefs.clear();
+  KeepRegs.reset();
+}
+
+void CriticalAntiDepBreaker::Observe(MachineInstr *MI, unsigned Count,
+                                     unsigned InsertPosIndex) {
+  if (MI->isDebugValue())
+    return;
+  assert(Count < InsertPosIndex && "Instruction index out of expected range!");
+
+  for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) {
+    if (KillIndices[Reg] != ~0u) {
+      // If Reg is currently live, then mark that it can't be renamed as
+      // we don't know the extent of its live-range anymore (now that it
+      // has been scheduled).
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+      KillIndices[Reg] = Count;
+    } else if (DefIndices[Reg] < InsertPosIndex && DefIndices[Reg] >= Count) {
+      // Any register which was defined within the previous scheduling region
+      // may have been rescheduled and its lifetime may overlap with registers
+      // in ways not reflected in our current liveness state. For each such
+      // register, adjust the liveness state to be conservatively correct.
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+      // Move the def index to the end of the previous region, to reflect
+      // that the def could theoretically have been scheduled at the end.
+      DefIndices[Reg] = InsertPosIndex;
+    }
+  }
+
+  PrescanInstruction(MI);
+  ScanInstruction(MI, Count);
+}
+
+/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
+/// critical path.
+static const SDep *CriticalPathStep(const SUnit *SU) {
+  const SDep *Next = 0;
+  unsigned NextDepth = 0;
+  // Find the predecessor edge with the greatest depth.
+  for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+       P != PE; ++P) {
+    const SUnit *PredSU = P->getSUnit();
+    unsigned PredLatency = P->getLatency();
+    unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
+    // In the case of a latency tie, prefer an anti-dependency edge over
+    // other types of edges.
+    if (NextDepth < PredTotalLatency ||
+        (NextDepth == PredTotalLatency && P->getKind() == SDep::Anti)) {
+      NextDepth = PredTotalLatency;
+      Next = &*P;
+    }
+  }
+  return Next;
+}
+
+void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
+  // It's not safe to change register allocation for source operands of
+  // that have special allocation requirements. Also assume all registers
+  // used in a call must not be changed (ABI).
+  // FIXME: The issue with predicated instruction is more complex. We are being
+  // conservative here because the kill markers cannot be trusted after
+  // if-conversion:
+  // %R6<def> = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+  // ...
+  // STR %R0, %R6<kill>, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395]
+  // %R6<def> = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12]
+  // STR %R0, %R6<kill>, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+  //
+  // The first R6 kill is not really a kill since it's killed by a predicated
+  // instruction which may not be executed. The second R6 def may or may not
+  // re-define R6 so it's not safe to change it since the last R6 use cannot be
+  // changed.
+  bool Special = MI->isCall() ||
+    MI->hasExtraSrcRegAllocReq() ||
+    TII->isPredicated(MI);
+
+  // Scan the register operands for this instruction and update
+  // Classes and RegRefs.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+    const TargetRegisterClass *NewRC = 0;
+
+    if (i < MI->getDesc().getNumOperands())
+      NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
+
+    // For now, only allow the register to be changed if its register
+    // class is consistent across all uses.
+    if (!Classes[Reg] && NewRC)
+      Classes[Reg] = NewRC;
+    else if (!NewRC || Classes[Reg] != NewRC)
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+    // Now check for aliases.
+    for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+      // If an alias of the reg is used during the live range, give up.
+      // Note that this allows us to skip checking if AntiDepReg
+      // overlaps with any of the aliases, among other things.
+      unsigned AliasReg = *AI;
+      if (Classes[AliasReg]) {
+        Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+        Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+      }
+    }
+
+    // If we're still willing to consider this register, note the reference.
+    if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1))
+      RegRefs.insert(std::make_pair(Reg, &MO));
+
+    if (MO.isUse() && Special) {
+      if (!KeepRegs.test(Reg)) {
+        KeepRegs.set(Reg);
+        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+          KeepRegs.set(*SubRegs);
+      }
+    }
+  }
+}
+
+void CriticalAntiDepBreaker::ScanInstruction(MachineInstr *MI,
+                                             unsigned Count) {
+  // Update liveness.
+  // Proceeding upwards, registers that are defed but not used in this
+  // instruction are now dead.
+
+  if (!TII->isPredicated(MI)) {
+    // Predicated defs are modeled as read + write, i.e. similar to two
+    // address updates.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+
+      if (MO.isRegMask())
+        for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i)
+          if (MO.clobbersPhysReg(i)) {
+            DefIndices[i] = Count;
+            KillIndices[i] = ~0u;
+            KeepRegs.reset(i);
+            Classes[i] = 0;
+            RegRefs.erase(i);
+          }
+
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0) continue;
+      if (!MO.isDef()) continue;
+      // Ignore two-addr defs.
+      if (MI->isRegTiedToUseOperand(i)) continue;
+
+      DefIndices[Reg] = Count;
+      KillIndices[Reg] = ~0u;
+      assert(((KillIndices[Reg] == ~0u) !=
+              (DefIndices[Reg] == ~0u)) &&
+             "Kill and Def maps aren't consistent for Reg!");
+      KeepRegs.reset(Reg);
+      Classes[Reg] = 0;
+      RegRefs.erase(Reg);
+      // Repeat, for all subregs.
+      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+        unsigned SubregReg = *SubRegs;
+        DefIndices[SubregReg] = Count;
+        KillIndices[SubregReg] = ~0u;
+        KeepRegs.reset(SubregReg);
+        Classes[SubregReg] = 0;
+        RegRefs.erase(SubregReg);
+      }
+      // Conservatively mark super-registers as unusable.
+      for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
+        Classes[*SR] = reinterpret_cast<TargetRegisterClass *>(-1);
+    }
+  }
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+    if (!MO.isUse()) continue;
+
+    const TargetRegisterClass *NewRC = 0;
+    if (i < MI->getDesc().getNumOperands())
+      NewRC = TII->getRegClass(MI->getDesc(), i, TRI, MF);
+
+    // For now, only allow the register to be changed if its register
+    // class is consistent across all uses.
+    if (!Classes[Reg] && NewRC)
+      Classes[Reg] = NewRC;
+    else if (!NewRC || Classes[Reg] != NewRC)
+      Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+    RegRefs.insert(std::make_pair(Reg, &MO));
+
+    // It wasn't previously live but now it is, this is a kill.
+    if (KillIndices[Reg] == ~0u) {
+      KillIndices[Reg] = Count;
+      DefIndices[Reg] = ~0u;
+          assert(((KillIndices[Reg] == ~0u) !=
+                  (DefIndices[Reg] == ~0u)) &&
+               "Kill and Def maps aren't consistent for Reg!");
+    }
+    // Repeat, for all aliases.
+    for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+      unsigned AliasReg = *AI;
+      if (KillIndices[AliasReg] == ~0u) {
+        KillIndices[AliasReg] = Count;
+        DefIndices[AliasReg] = ~0u;
+      }
+    }
+  }
+}
+
+// Check all machine operands that reference the antidependent register and must
+// be replaced by NewReg. Return true if any of their parent instructions may
+// clobber the new register.
+//
+// Note: AntiDepReg may be referenced by a two-address instruction such that
+// it's use operand is tied to a def operand. We guard against the case in which
+// the two-address instruction also defines NewReg, as may happen with
+// pre/postincrement loads. In this case, both the use and def operands are in
+// RegRefs because the def is inserted by PrescanInstruction and not erased
+// during ScanInstruction. So checking for an instructions with definitions of
+// both NewReg and AntiDepReg covers it.
+bool
+CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin,
+                                                RegRefIter RegRefEnd,
+                                                unsigned NewReg)
+{
+  for (RegRefIter I = RegRefBegin; I != RegRefEnd; ++I ) {
+    MachineOperand *RefOper = I->second;
+
+    // Don't allow the instruction defining AntiDepReg to earlyclobber its
+    // operands, in case they may be assigned to NewReg. In this case antidep
+    // breaking must fail, but it's too rare to bother optimizing.
+    if (RefOper->isDef() && RefOper->isEarlyClobber())
+      return true;
+
+    // Handle cases in which this instructions defines NewReg.
+    MachineInstr *MI = RefOper->getParent();
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &CheckOper = MI->getOperand(i);
+
+      if (CheckOper.isRegMask() && CheckOper.clobbersPhysReg(NewReg))
+        return true;
+
+      if (!CheckOper.isReg() || !CheckOper.isDef() ||
+          CheckOper.getReg() != NewReg)
+        continue;
+
+      // Don't allow the instruction to define NewReg and AntiDepReg.
+      // When AntiDepReg is renamed it will be an illegal op.
+      if (RefOper->isDef())
+        return true;
+
+      // Don't allow an instruction using AntiDepReg to be earlyclobbered by
+      // NewReg
+      if (CheckOper.isEarlyClobber())
+        return true;
+
+      // Don't allow inline asm to define NewReg at all. Who know what it's
+      // doing with it.
+      if (MI->isInlineAsm())
+        return true;
+    }
+  }
+  return false;
+}
+
+unsigned CriticalAntiDepBreaker::
+findSuitableFreeRegister(RegRefIter RegRefBegin,
+                         RegRefIter RegRefEnd,
+                         unsigned AntiDepReg,
+                         unsigned LastNewReg,
+                         const TargetRegisterClass *RC,
+                         SmallVector<unsigned, 2> &Forbid)
+{
+  ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(RC);
+  for (unsigned i = 0; i != Order.size(); ++i) {
+    unsigned NewReg = Order[i];
+    // Don't replace a register with itself.
+    if (NewReg == AntiDepReg) continue;
+    // Don't replace a register with one that was recently used to repair
+    // an anti-dependence with this AntiDepReg, because that would
+    // re-introduce that anti-dependence.
+    if (NewReg == LastNewReg) continue;
+    // If any instructions that define AntiDepReg also define the NewReg, it's
+    // not suitable.  For example, Instruction with multiple definitions can
+    // result in this condition.
+    if (isNewRegClobberedByRefs(RegRefBegin, RegRefEnd, NewReg)) continue;
+    // If NewReg is dead and NewReg's most recent def is not before
+    // AntiDepReg's kill, it's safe to replace AntiDepReg with NewReg.
+    assert(((KillIndices[AntiDepReg] == ~0u) != (DefIndices[AntiDepReg] == ~0u))
+           && "Kill and Def maps aren't consistent for AntiDepReg!");
+    assert(((KillIndices[NewReg] == ~0u) != (DefIndices[NewReg] == ~0u))
+           && "Kill and Def maps aren't consistent for NewReg!");
+    if (KillIndices[NewReg] != ~0u ||
+        Classes[NewReg] == reinterpret_cast<TargetRegisterClass *>(-1) ||
+        KillIndices[AntiDepReg] > DefIndices[NewReg])
+      continue;
+    // If NewReg overlaps any of the forbidden registers, we can't use it.
+    bool Forbidden = false;
+    for (SmallVector<unsigned, 2>::iterator it = Forbid.begin(),
+           ite = Forbid.end(); it != ite; ++it)
+      if (TRI->regsOverlap(NewReg, *it)) {
+        Forbidden = true;
+        break;
+      }
+    if (Forbidden) continue;
+    return NewReg;
+  }
+
+  // No registers are free and available!
+  return 0;
+}
+
+unsigned CriticalAntiDepBreaker::
+BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+                      MachineBasicBlock::iterator Begin,
+                      MachineBasicBlock::iterator End,
+                      unsigned InsertPosIndex,
+                      DbgValueVector &DbgValues) {
+  // The code below assumes that there is at least one instruction,
+  // so just duck out immediately if the block is empty.
+  if (SUnits.empty()) return 0;
+
+  // Keep a map of the MachineInstr*'s back to the SUnit representing them.
+  // This is used for updating debug information.
+  //
+  // FIXME: Replace this with the existing map in ScheduleDAGInstrs::MISUnitMap
+  DenseMap<MachineInstr*,const SUnit*> MISUnitMap;
+
+  // Find the node at the bottom of the critical path.
+  const SUnit *Max = 0;
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    const SUnit *SU = &SUnits[i];
+    MISUnitMap[SU->getInstr()] = SU;
+    if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency)
+      Max = SU;
+  }
+
+#ifndef NDEBUG
+  {
+    DEBUG(dbgs() << "Critical path has total latency "
+          << (Max->getDepth() + Max->Latency) << "\n");
+    DEBUG(dbgs() << "Available regs:");
+    for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
+      if (KillIndices[Reg] == ~0u)
+        DEBUG(dbgs() << " " << TRI->getName(Reg));
+    }
+    DEBUG(dbgs() << '\n');
+  }
+#endif
+
+  // Track progress along the critical path through the SUnit graph as we walk
+  // the instructions.
+  const SUnit *CriticalPathSU = Max;
+  MachineInstr *CriticalPathMI = CriticalPathSU->getInstr();
+
+  // Consider this pattern:
+  //   A = ...
+  //   ... = A
+  //   A = ...
+  //   ... = A
+  //   A = ...
+  //   ... = A
+  //   A = ...
+  //   ... = A
+  // There are three anti-dependencies here, and without special care,
+  // we'd break all of them using the same register:
+  //   A = ...
+  //   ... = A
+  //   B = ...
+  //   ... = B
+  //   B = ...
+  //   ... = B
+  //   B = ...
+  //   ... = B
+  // because at each anti-dependence, B is the first register that
+  // isn't A which is free.  This re-introduces anti-dependencies
+  // at all but one of the original anti-dependencies that we were
+  // trying to break.  To avoid this, keep track of the most recent
+  // register that each register was replaced with, avoid
+  // using it to repair an anti-dependence on the same register.
+  // This lets us produce this:
+  //   A = ...
+  //   ... = A
+  //   B = ...
+  //   ... = B
+  //   C = ...
+  //   ... = C
+  //   B = ...
+  //   ... = B
+  // This still has an anti-dependence on B, but at least it isn't on the
+  // original critical path.
+  //
+  // TODO: If we tracked more than one register here, we could potentially
+  // fix that remaining critical edge too. This is a little more involved,
+  // because unlike the most recent register, less recent registers should
+  // still be considered, though only if no other registers are available.
+  std::vector<unsigned> LastNewReg(TRI->getNumRegs(), 0);
+
+  // Attempt to break anti-dependence edges on the critical path. Walk the
+  // instructions from the bottom up, tracking information about liveness
+  // as we go to help determine which registers are available.
+  unsigned Broken = 0;
+  unsigned Count = InsertPosIndex - 1;
+  for (MachineBasicBlock::iterator I = End, E = Begin;
+       I != E; --Count) {
+    MachineInstr *MI = --I;
+    if (MI->isDebugValue())
+      continue;
+
+    // Check if this instruction has a dependence on the critical path that
+    // is an anti-dependence that we may be able to break. If it is, set
+    // AntiDepReg to the non-zero register associated with the anti-dependence.
+    //
+    // We limit our attention to the critical path as a heuristic to avoid
+    // breaking anti-dependence edges that aren't going to significantly
+    // impact the overall schedule. There are a limited number of registers
+    // and we want to save them for the important edges.
+    //
+    // TODO: Instructions with multiple defs could have multiple
+    // anti-dependencies. The current code here only knows how to break one
+    // edge per instruction. Note that we'd have to be able to break all of
+    // the anti-dependencies in an instruction in order to be effective.
+    unsigned AntiDepReg = 0;
+    if (MI == CriticalPathMI) {
+      if (const SDep *Edge = CriticalPathStep(CriticalPathSU)) {
+        const SUnit *NextSU = Edge->getSUnit();
+
+        // Only consider anti-dependence edges.
+        if (Edge->getKind() == SDep::Anti) {
+          AntiDepReg = Edge->getReg();
+          assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
+          if (!MRI.isAllocatable(AntiDepReg))
+            // Don't break anti-dependencies on non-allocatable registers.
+            AntiDepReg = 0;
+          else if (KeepRegs.test(AntiDepReg))
+            // Don't break anti-dependencies if an use down below requires
+            // this exact register.
+            AntiDepReg = 0;
+          else {
+            // If the SUnit has other dependencies on the SUnit that it
+            // anti-depends on, don't bother breaking the anti-dependency
+            // since those edges would prevent such units from being
+            // scheduled past each other regardless.
+            //
+            // Also, if there are dependencies on other SUnits with the
+            // same register as the anti-dependency, don't attempt to
+            // break it.
+            for (SUnit::const_pred_iterator P = CriticalPathSU->Preds.begin(),
+                 PE = CriticalPathSU->Preds.end(); P != PE; ++P)
+              if (P->getSUnit() == NextSU ?
+                    (P->getKind() != SDep::Anti || P->getReg() != AntiDepReg) :
+                    (P->getKind() == SDep::Data && P->getReg() == AntiDepReg)) {
+                AntiDepReg = 0;
+                break;
+              }
+          }
+        }
+        CriticalPathSU = NextSU;
+        CriticalPathMI = CriticalPathSU->getInstr();
+      } else {
+        // We've reached the end of the critical path.
+        CriticalPathSU = 0;
+        CriticalPathMI = 0;
+      }
+    }
+
+    PrescanInstruction(MI);
+
+    SmallVector<unsigned, 2> ForbidRegs;
+
+    // If MI's defs have a special allocation requirement, don't allow
+    // any def registers to be changed. Also assume all registers
+    // defined in a call must not be changed (ABI).
+    if (MI->isCall() || MI->hasExtraDefRegAllocReq() ||
+        TII->isPredicated(MI))
+      // If this instruction's defs have special allocation requirement, don't
+      // break this anti-dependency.
+      AntiDepReg = 0;
+    else if (AntiDepReg) {
+      // If this instruction has a use of AntiDepReg, breaking it
+      // is invalid.  If the instruction defines other registers,
+      // save a list of them so that we don't pick a new register
+      // that overlaps any of them.
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isReg()) continue;
+        unsigned Reg = MO.getReg();
+        if (Reg == 0) continue;
+        if (MO.isUse() && TRI->regsOverlap(AntiDepReg, Reg)) {
+          AntiDepReg = 0;
+          break;
+        }
+        if (MO.isDef() && Reg != AntiDepReg)
+          ForbidRegs.push_back(Reg);
+      }
+    }
+
+    // Determine AntiDepReg's register class, if it is live and is
+    // consistently used within a single class.
+    const TargetRegisterClass *RC = AntiDepReg != 0 ? Classes[AntiDepReg] : 0;
+    assert((AntiDepReg == 0 || RC != NULL) &&
+           "Register should be live if it's causing an anti-dependence!");
+    if (RC == reinterpret_cast<TargetRegisterClass *>(-1))
+      AntiDepReg = 0;
+
+    // Look for a suitable register to use to break the anti-depenence.
+    //
+    // TODO: Instead of picking the first free register, consider which might
+    // be the best.
+    if (AntiDepReg != 0) {
+      std::pair<std::multimap<unsigned, MachineOperand *>::iterator,
+                std::multimap<unsigned, MachineOperand *>::iterator>
+        Range = RegRefs.equal_range(AntiDepReg);
+      if (unsigned NewReg = findSuitableFreeRegister(Range.first, Range.second,
+                                                     AntiDepReg,
+                                                     LastNewReg[AntiDepReg],
+                                                     RC, ForbidRegs)) {
+        DEBUG(dbgs() << "Breaking anti-dependence edge on "
+              << TRI->getName(AntiDepReg)
+              << " with " << RegRefs.count(AntiDepReg) << " references"
+              << " using " << TRI->getName(NewReg) << "!\n");
+
+        // Update the references to the old register to refer to the new
+        // register.
+        for (std::multimap<unsigned, MachineOperand *>::iterator
+             Q = Range.first, QE = Range.second; Q != QE; ++Q) {
+          Q->second->setReg(NewReg);
+          // If the SU for the instruction being updated has debug information
+          // related to the anti-dependency register, make sure to update that
+          // as well.
+          const SUnit *SU = MISUnitMap[Q->second->getParent()];
+          if (!SU) continue;
+          for (DbgValueVector::iterator DVI = DbgValues.begin(),
+                 DVE = DbgValues.end(); DVI != DVE; ++DVI)
+            if (DVI->second == Q->second->getParent())
+              UpdateDbgValue(DVI->first, AntiDepReg, NewReg);
+        }
+
+        // We just went back in time and modified history; the
+        // liveness information for the anti-dependence reg is now
+        // inconsistent. Set the state as if it were dead.
+        Classes[NewReg] = Classes[AntiDepReg];
+        DefIndices[NewReg] = DefIndices[AntiDepReg];
+        KillIndices[NewReg] = KillIndices[AntiDepReg];
+        assert(((KillIndices[NewReg] == ~0u) !=
+                (DefIndices[NewReg] == ~0u)) &&
+             "Kill and Def maps aren't consistent for NewReg!");
+
+        Classes[AntiDepReg] = 0;
+        DefIndices[AntiDepReg] = KillIndices[AntiDepReg];
+        KillIndices[AntiDepReg] = ~0u;
+        assert(((KillIndices[AntiDepReg] == ~0u) !=
+                (DefIndices[AntiDepReg] == ~0u)) &&
+             "Kill and Def maps aren't consistent for AntiDepReg!");
+
+        RegRefs.erase(AntiDepReg);
+        LastNewReg[AntiDepReg] = NewReg;
+        ++Broken;
+      }
+    }
+
+    ScanInstruction(MI, Count);
+  }
+
+  return Broken;
+}
diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.h b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.h
new file mode 100644
index 000000000000..df13dd31f6b2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -0,0 +1,110 @@
+//=- llvm/CodeGen/CriticalAntiDepBreaker.h - Anti-Dep Support -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CriticalAntiDepBreaker class, which
+// implements register anti-dependence breaking along a blocks
+// critical path during post-RA scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H
+#define LLVM_CODEGEN_CRITICALANTIDEPBREAKER_H
+
+#include "AntiDepBreaker.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include <map>
+
+namespace llvm {
+class RegisterClassInfo;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+
+  class CriticalAntiDepBreaker : public AntiDepBreaker {
+    MachineFunction& MF;
+    MachineRegisterInfo &MRI;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const RegisterClassInfo &RegClassInfo;
+
+    /// AllocatableSet - The set of allocatable registers.
+    /// We'll be ignoring anti-dependencies on non-allocatable registers,
+    /// because they may not be safe to break.
+    const BitVector AllocatableSet;
+
+    /// Classes - For live regs that are only used in one register class in a
+    /// live range, the register class. If the register is not live, the
+    /// corresponding value is null. If the register is live but used in
+    /// multiple register classes, the corresponding value is -1 casted to a
+    /// pointer.
+    std::vector<const TargetRegisterClass*> Classes;
+
+    /// RegRefs - Map registers to all their references within a live range.
+    std::multimap<unsigned, MachineOperand *> RegRefs;
+    typedef std::multimap<unsigned, MachineOperand *>::const_iterator
+      RegRefIter;
+
+    /// KillIndices - The index of the most recent kill (proceding bottom-up),
+    /// or ~0u if the register is not live.
+    std::vector<unsigned> KillIndices;
+
+    /// DefIndices - The index of the most recent complete def (proceding bottom
+    /// up), or ~0u if the register is live.
+    std::vector<unsigned> DefIndices;
+
+    /// KeepRegs - A set of registers which are live and cannot be changed to
+    /// break anti-dependencies.
+    BitVector KeepRegs;
+
+  public:
+    CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo&);
+    ~CriticalAntiDepBreaker();
+
+    /// Start - Initialize anti-dep breaking for a new basic block.
+    void StartBlock(MachineBasicBlock *BB);
+
+    /// BreakAntiDependencies - Identifiy anti-dependencies along the critical
+    /// path
+    /// of the ScheduleDAG and break them by renaming registers.
+    ///
+    unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+                                   MachineBasicBlock::iterator Begin,
+                                   MachineBasicBlock::iterator End,
+                                   unsigned InsertPosIndex,
+                                   DbgValueVector &DbgValues);
+
+    /// Observe - Update liveness information to account for the current
+    /// instruction, which will not be scheduled.
+    ///
+    void Observe(MachineInstr *MI, unsigned Count, unsigned InsertPosIndex);
+
+    /// Finish - Finish anti-dep breaking for a basic block.
+    void FinishBlock();
+
+  private:
+    void PrescanInstruction(MachineInstr *MI);
+    void ScanInstruction(MachineInstr *MI, unsigned Count);
+    bool isNewRegClobberedByRefs(RegRefIter RegRefBegin,
+                                 RegRefIter RegRefEnd,
+                                 unsigned NewReg);
+    unsigned findSuitableFreeRegister(RegRefIter RegRefBegin,
+                                      RegRefIter RegRefEnd,
+                                      unsigned AntiDepReg,
+                                      unsigned LastNewReg,
+                                      const TargetRegisterClass *RC,
+                                      SmallVector<unsigned, 2> &Forbid);
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
new file mode 100644
index 000000000000..840a10128daf
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
@@ -0,0 +1,225 @@
+//=- llvm/CodeGen/DFAPacketizer.cpp - DFA Packetizer for VLIW -*- C++ -*-=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This class implements a deterministic finite automaton (DFA) based
+// packetizing mechanism for VLIW architectures. It provides APIs to
+// determine whether there exists a legal mapping of instructions to
+// functional unit assignments in a packet. The DFA is auto-generated from
+// the target's Schedule.td file.
+//
+// A DFA consists of 3 major elements: states, inputs, and transitions. For
+// the packetizing mechanism, the input is the set of instruction classes for
+// a target. The state models all possible combinations of functional unit
+// consumption for a given set of instructions in a packet. A transition
+// models the addition of an instruction to a packet. In the DFA constructed
+// by this class, if an instruction can be added to a packet, then a valid
+// transition exists from the corresponding state. Invalid transitions
+// indicate that the instruction cannot be added to the current packet.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+DFAPacketizer::DFAPacketizer(const InstrItineraryData *I, const int (*SIT)[2],
+                             const unsigned *SET):
+  InstrItins(I), CurrentState(0), DFAStateInputTable(SIT),
+  DFAStateEntryTable(SET) {}
+
+
+//
+// ReadTable - Read the DFA transition table and update CachedTable.
+//
+// Format of the transition tables:
+// DFAStateInputTable[][2] = pairs of <Input, Transition> for all valid
+//                           transitions
+// DFAStateEntryTable[i] = Index of the first entry in DFAStateInputTable
+//                         for the ith state
+//
+void DFAPacketizer::ReadTable(unsigned int state) {
+  unsigned ThisState = DFAStateEntryTable[state];
+  unsigned NextStateInTable = DFAStateEntryTable[state+1];
+  // Early exit in case CachedTable has already contains this
+  // state's transitions.
+  if (CachedTable.count(UnsignPair(state,
+                                   DFAStateInputTable[ThisState][0])))
+    return;
+
+  for (unsigned i = ThisState; i < NextStateInTable; i++)
+    CachedTable[UnsignPair(state, DFAStateInputTable[i][0])] =
+      DFAStateInputTable[i][1];
+}
+
+
+// canReserveResources - Check if the resources occupied by a MCInstrDesc
+// are available in the current state.
+bool DFAPacketizer::canReserveResources(const llvm::MCInstrDesc *MID) {
+  unsigned InsnClass = MID->getSchedClass();
+  const llvm::InstrStage *IS = InstrItins->beginStage(InsnClass);
+  unsigned FuncUnits = IS->getUnits();
+  UnsignPair StateTrans = UnsignPair(CurrentState, FuncUnits);
+  ReadTable(CurrentState);
+  return (CachedTable.count(StateTrans) != 0);
+}
+
+
+// reserveResources - Reserve the resources occupied by a MCInstrDesc and
+// change the current state to reflect that change.
+void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) {
+  unsigned InsnClass = MID->getSchedClass();
+  const llvm::InstrStage *IS = InstrItins->beginStage(InsnClass);
+  unsigned FuncUnits = IS->getUnits();
+  UnsignPair StateTrans = UnsignPair(CurrentState, FuncUnits);
+  ReadTable(CurrentState);
+  assert(CachedTable.count(StateTrans) != 0);
+  CurrentState = CachedTable[StateTrans];
+}
+
+
+// canReserveResources - Check if the resources occupied by a machine
+// instruction are available in the current state.
+bool DFAPacketizer::canReserveResources(llvm::MachineInstr *MI) {
+  const llvm::MCInstrDesc &MID = MI->getDesc();
+  return canReserveResources(&MID);
+}
+
+// reserveResources - Reserve the resources occupied by a machine
+// instruction and change the current state to reflect that change.
+void DFAPacketizer::reserveResources(llvm::MachineInstr *MI) {
+  const llvm::MCInstrDesc &MID = MI->getDesc();
+  reserveResources(&MID);
+}
+
+namespace llvm {
+// DefaultVLIWScheduler - This class extends ScheduleDAGInstrs and overrides
+// Schedule method to build the dependence graph.
+class DefaultVLIWScheduler : public ScheduleDAGInstrs {
+public:
+  DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
+                   MachineDominatorTree &MDT, bool IsPostRA);
+  // Schedule - Actual scheduling work.
+  void schedule();
+};
+}
+
+DefaultVLIWScheduler::DefaultVLIWScheduler(
+  MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
+  bool IsPostRA) :
+  ScheduleDAGInstrs(MF, MLI, MDT, IsPostRA) {
+  CanHandleTerminators = true;
+}
+
+void DefaultVLIWScheduler::schedule() {
+  // Build the scheduling graph.
+  buildSchedGraph(0);
+}
+
+// VLIWPacketizerList Ctor
+VLIWPacketizerList::VLIWPacketizerList(
+  MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
+  bool IsPostRA) : TM(MF.getTarget()), MF(MF)  {
+  TII = TM.getInstrInfo();
+  ResourceTracker = TII->CreateTargetScheduleState(&TM, 0);
+  VLIWScheduler = new DefaultVLIWScheduler(MF, MLI, MDT, IsPostRA);
+}
+
+// VLIWPacketizerList Dtor
+VLIWPacketizerList::~VLIWPacketizerList() {
+  if (VLIWScheduler)
+    delete VLIWScheduler;
+
+  if (ResourceTracker)
+    delete ResourceTracker;
+}
+
+// endPacket - End the current packet, bundle packet instructions and reset
+// DFA state.
+void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
+                                         MachineInstr *MI) {
+  if (CurrentPacketMIs.size() > 1) {
+    MachineInstr *MIFirst = CurrentPacketMIs.front();
+    finalizeBundle(*MBB, MIFirst, MI);
+  }
+  CurrentPacketMIs.clear();
+  ResourceTracker->clearResources();
+}
+
+// PacketizeMIs - Bundle machine instructions into packets.
+void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
+                                      MachineBasicBlock::iterator BeginItr,
+                                      MachineBasicBlock::iterator EndItr) {
+  assert(VLIWScheduler && "VLIW Scheduler is not initialized!");
+  VLIWScheduler->startBlock(MBB);
+  VLIWScheduler->enterRegion(MBB, BeginItr, EndItr, MBB->size());
+  VLIWScheduler->schedule();
+
+  // Generate MI -> SU map.
+  MIToSUnit.clear();
+  for (unsigned i = 0, e = VLIWScheduler->SUnits.size(); i != e; ++i) {
+    SUnit *SU = &VLIWScheduler->SUnits[i];
+    MIToSUnit[SU->getInstr()] = SU;
+  }
+
+  // The main packetizer loop.
+  for (; BeginItr != EndItr; ++BeginItr) {
+    MachineInstr *MI = BeginItr;
+
+    this->initPacketizerState();
+
+    // End the current packet if needed.
+    if (this->isSoloInstruction(MI)) {
+      endPacket(MBB, MI);
+      continue;
+    }
+
+    // Ignore pseudo instructions.
+    if (this->ignorePseudoInstruction(MI, MBB))
+      continue;
+
+    SUnit *SUI = MIToSUnit[MI];
+    assert(SUI && "Missing SUnit Info!");
+
+    // Ask DFA if machine resource is available for MI.
+    bool ResourceAvail = ResourceTracker->canReserveResources(MI);
+    if (ResourceAvail) {
+      // Dependency check for MI with instructions in CurrentPacketMIs.
+      for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
+           VE = CurrentPacketMIs.end(); VI != VE; ++VI) {
+        MachineInstr *MJ = *VI;
+        SUnit *SUJ = MIToSUnit[MJ];
+        assert(SUJ && "Missing SUnit Info!");
+
+        // Is it legal to packetize SUI and SUJ together.
+        if (!this->isLegalToPacketizeTogether(SUI, SUJ)) {
+          // Allow packetization if dependency can be pruned.
+          if (!this->isLegalToPruneDependencies(SUI, SUJ)) {
+            // End the packet if dependency cannot be pruned.
+            endPacket(MBB, MI);
+            break;
+          } // !isLegalToPruneDependencies.
+        } // !isLegalToPacketizeTogether.
+      } // For all instructions in CurrentPacketMIs.
+    } else {
+      // End the packet if resource is not available.
+      endPacket(MBB, MI);
+    }
+
+    // Add MI to the current packet.
+    BeginItr = this->addToPacket(MI);
+  } // For all instructions in BB.
+
+  // End any packet left behind.
+  endPacket(MBB, EndItr);
+  VLIWScheduler->exitRegion();
+  VLIWScheduler->finishBlock();
+}
diff --git a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
new file mode 100644
index 000000000000..a54217f5b2fb
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -0,0 +1,190 @@
+//===- DeadMachineInstructionElim.cpp - Remove dead machine instructions --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an extremely simple MachineInstr-level dead-code-elimination pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "codegen-dce"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+STATISTIC(NumDeletes,          "Number of dead instructions deleted");
+
+namespace {
+  class DeadMachineInstructionElim : public MachineFunctionPass {
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const TargetRegisterInfo *TRI;
+    const MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII;
+    BitVector LivePhysRegs;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    DeadMachineInstructionElim() : MachineFunctionPass(ID) {
+     initializeDeadMachineInstructionElimPass(*PassRegistry::getPassRegistry());
+    }
+
+  private:
+    bool isDead(const MachineInstr *MI) const;
+  };
+}
+char DeadMachineInstructionElim::ID = 0;
+char &llvm::DeadMachineInstructionElimID = DeadMachineInstructionElim::ID;
+
+INITIALIZE_PASS(DeadMachineInstructionElim, "dead-mi-elimination",
+                "Remove dead machine instructions", false, false)
+
+bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
+  // Technically speaking inline asm without side effects and no defs can still
+  // be deleted. But there is so much bad inline asm code out there, we should
+  // let them be.
+  if (MI->isInlineAsm())
+    return false;
+
+  // Don't delete instructions with side effects.
+  bool SawStore = false;
+  if (!MI->isSafeToMove(TII, 0, SawStore) && !MI->isPHI())
+    return false;
+
+  // Examine each operand.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef()) {
+      unsigned Reg = MO.getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        // Don't delete live physreg defs, or any reserved register defs.
+        if (LivePhysRegs.test(Reg) || MRI->isReserved(Reg))
+          return false;
+      } else {
+        if (!MRI->use_nodbg_empty(Reg))
+          // This def has a non-debug use. Don't delete the instruction!
+          return false;
+      }
+    }
+  }
+
+  // If there are no defs with uses, the instruction is dead.
+  return true;
+}
+
+bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
+  bool AnyChanges = false;
+  MRI = &MF.getRegInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getTarget().getInstrInfo();
+
+  // Loop over all instructions in all blocks, from bottom to top, so that it's
+  // more likely that chains of dependent but ultimately dead instructions will
+  // be cleaned up.
+  for (MachineFunction::reverse_iterator I = MF.rbegin(), E = MF.rend();
+       I != E; ++I) {
+    MachineBasicBlock *MBB = &*I;
+
+    // Start out assuming that reserved registers are live out of this block.
+    LivePhysRegs = MRI->getReservedRegs();
+
+    // Add live-ins from sucessors to LivePhysRegs. Normally, physregs are not
+    // live across blocks, but some targets (x86) can have flags live out of a
+    // block.
+    for (MachineBasicBlock::succ_iterator S = MBB->succ_begin(),
+           E = MBB->succ_end(); S != E; S++)
+      for (MachineBasicBlock::livein_iterator LI = (*S)->livein_begin();
+           LI != (*S)->livein_end(); LI++)
+        LivePhysRegs.set(*LI);
+
+    // Now scan the instructions and delete dead ones, tracking physreg
+    // liveness as we go.
+    for (MachineBasicBlock::reverse_iterator MII = MBB->rbegin(),
+         MIE = MBB->rend(); MII != MIE; ) {
+      MachineInstr *MI = &*MII;
+
+      // If the instruction is dead, delete it!
+      if (isDead(MI)) {
+        DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << *MI);
+        // It is possible that some DBG_VALUE instructions refer to this
+        // instruction.  Examine each def operand for such references;
+        // if found, mark the DBG_VALUE as undef (but don't delete it).
+        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+          const MachineOperand &MO = MI->getOperand(i);
+          if (!MO.isReg() || !MO.isDef())
+            continue;
+          unsigned Reg = MO.getReg();
+          if (!TargetRegisterInfo::isVirtualRegister(Reg))
+            continue;
+          MachineRegisterInfo::use_iterator nextI;
+          for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
+               E = MRI->use_end(); I!=E; I=nextI) {
+            nextI = llvm::next(I);  // I is invalidated by the setReg
+            MachineOperand& Use = I.getOperand();
+            MachineInstr *UseMI = Use.getParent();
+            if (UseMI==MI)
+              continue;
+            assert(Use.isDebug());
+            UseMI->getOperand(0).setReg(0U);
+          }
+        }
+        AnyChanges = true;
+        MI->eraseFromParent();
+        ++NumDeletes;
+        MIE = MBB->rend();
+        // MII is now pointing to the next instruction to process,
+        // so don't increment it.
+        continue;
+      }
+
+      // Record the physreg defs.
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg() && MO.isDef()) {
+          unsigned Reg = MO.getReg();
+          if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+            LivePhysRegs.reset(Reg);
+            // Check the subreg set, not the alias set, because a def
+            // of a super-register may still be partially live after
+            // this def.
+            for (MCSubRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
+              LivePhysRegs.reset(*SR);
+          }
+        } else if (MO.isRegMask()) {
+          // Register mask of preserved registers. All clobbers are dead.
+          LivePhysRegs.clearBitsNotInMask(MO.getRegMask());
+        }
+      }
+      // Record the physreg uses, after the defs, in case a physreg is
+      // both defined and used in the same instruction.
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg() && MO.isUse()) {
+          unsigned Reg = MO.getReg();
+          if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+            for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+              LivePhysRegs.set(*AI);
+          }
+        }
+      }
+
+      // We didn't delete the current instruction, so increment MII to
+      // the next one.
+      ++MII;
+    }
+  }
+
+  LivePhysRegs.clear();
+  return AnyChanges;
+}
diff --git a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
new file mode 100644
index 000000000000..f27ec770ebad
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -0,0 +1,183 @@
+//===-- DwarfEHPrepare - Prepare exception handling for code generation ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass mulches exception handling code into a form adapted to code
+// generation. Required if using dwarf exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dwarfehprepare"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+STATISTIC(NumResumesLowered, "Number of resume calls lowered");
+
+namespace {
+  class DwarfEHPrepare : public FunctionPass {
+    const TargetMachine *TM;
+    const TargetLoweringBase *TLI;
+
+    // RewindFunction - _Unwind_Resume or the target equivalent.
+    Constant *RewindFunction;
+
+    bool InsertUnwindResumeCalls(Function &Fn);
+    Value *GetExceptionObject(ResumeInst *RI);
+
+  public:
+    static char ID; // Pass identification, replacement for typeid.
+    DwarfEHPrepare(const TargetMachine *tm) :
+      FunctionPass(ID), TM(tm), TLI(TM->getTargetLowering()),
+      RewindFunction(0) {
+        initializeDominatorTreePass(*PassRegistry::getPassRegistry());
+      }
+
+    virtual bool runOnFunction(Function &Fn);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const { }
+
+    const char *getPassName() const {
+      return "Exception handling preparation";
+    }
+  };
+} // end anonymous namespace
+
+char DwarfEHPrepare::ID = 0;
+
+FunctionPass *llvm::createDwarfEHPass(const TargetMachine *tm) {
+  return new DwarfEHPrepare(tm);
+}
+
+/// GetExceptionObject - Return the exception object from the value passed into
+/// the 'resume' instruction (typically an aggregate). Clean up any dead
+/// instructions, including the 'resume' instruction.
+Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
+  Value *V = RI->getOperand(0);
+  Value *ExnObj = 0;
+  InsertValueInst *SelIVI = dyn_cast<InsertValueInst>(V);
+  LoadInst *SelLoad = 0;
+  InsertValueInst *ExcIVI = 0;
+  bool EraseIVIs = false;
+
+  if (SelIVI) {
+    if (SelIVI->getNumIndices() == 1 && *SelIVI->idx_begin() == 1) {
+      ExcIVI = dyn_cast<InsertValueInst>(SelIVI->getOperand(0));
+      if (ExcIVI && isa<UndefValue>(ExcIVI->getOperand(0)) &&
+          ExcIVI->getNumIndices() == 1 && *ExcIVI->idx_begin() == 0) {
+        ExnObj = ExcIVI->getOperand(1);
+        SelLoad = dyn_cast<LoadInst>(SelIVI->getOperand(1));
+        EraseIVIs = true;
+      }
+    }
+  }
+
+  if (!ExnObj)
+    ExnObj = ExtractValueInst::Create(RI->getOperand(0), 0, "exn.obj", RI);
+
+  RI->eraseFromParent();
+
+  if (EraseIVIs) {
+    if (SelIVI->getNumUses() == 0)
+      SelIVI->eraseFromParent();
+    if (ExcIVI->getNumUses() == 0)
+      ExcIVI->eraseFromParent();
+    if (SelLoad && SelLoad->getNumUses() == 0)
+      SelLoad->eraseFromParent();
+  }
+
+  return ExnObj;
+}
+
+/// InsertUnwindResumeCalls - Convert the ResumeInsts that are still present
+/// into calls to the appropriate _Unwind_Resume function.
+bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
+  bool UsesNewEH = false;
+  SmallVector<ResumeInst*, 16> Resumes;
+  for (Function::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
+    TerminatorInst *TI = I->getTerminator();
+    if (ResumeInst *RI = dyn_cast<ResumeInst>(TI))
+      Resumes.push_back(RI);
+    else if (InvokeInst *II = dyn_cast<InvokeInst>(TI))
+      UsesNewEH = II->getUnwindDest()->isLandingPad();
+  }
+
+  if (Resumes.empty())
+    return UsesNewEH;
+
+  // Find the rewind function if we didn't already.
+  if (!RewindFunction) {
+    LLVMContext &Ctx = Resumes[0]->getContext();
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
+                                          Type::getInt8PtrTy(Ctx), false);
+    const char *RewindName = TLI->getLibcallName(RTLIB::UNWIND_RESUME);
+    RewindFunction = Fn.getParent()->getOrInsertFunction(RewindName, FTy);
+  }
+
+  // Create the basic block where the _Unwind_Resume call will live.
+  LLVMContext &Ctx = Fn.getContext();
+  unsigned ResumesSize = Resumes.size();
+
+  if (ResumesSize == 1) {
+    // Instead of creating a new BB and PHI node, just append the call to
+    // _Unwind_Resume to the end of the single resume block.
+    ResumeInst *RI = Resumes.front();
+    BasicBlock *UnwindBB = RI->getParent();
+    Value *ExnObj = GetExceptionObject(RI);
+
+    // Call the _Unwind_Resume function.
+    CallInst *CI = CallInst::Create(RewindFunction, ExnObj, "", UnwindBB);
+    CI->setCallingConv(TLI->getLibcallCallingConv(RTLIB::UNWIND_RESUME));
+
+    // We never expect _Unwind_Resume to return.
+    new UnreachableInst(Ctx, UnwindBB);
+    return true;
+  }
+
+  BasicBlock *UnwindBB = BasicBlock::Create(Ctx, "unwind_resume", &Fn);
+  PHINode *PN = PHINode::Create(Type::getInt8PtrTy(Ctx), ResumesSize,
+                                "exn.obj", UnwindBB);
+
+  // Extract the exception object from the ResumeInst and add it to the PHI node
+  // that feeds the _Unwind_Resume call.
+  for (SmallVectorImpl<ResumeInst*>::iterator
+         I = Resumes.begin(), E = Resumes.end(); I != E; ++I) {
+    ResumeInst *RI = *I;
+    BasicBlock *Parent = RI->getParent();
+    BranchInst::Create(UnwindBB, Parent);
+
+    Value *ExnObj = GetExceptionObject(RI);
+    PN->addIncoming(ExnObj, Parent);
+
+    ++NumResumesLowered;
+  }
+
+  // Call the function.
+  CallInst *CI = CallInst::Create(RewindFunction, PN, "", UnwindBB);
+  CI->setCallingConv(TLI->getLibcallCallingConv(RTLIB::UNWIND_RESUME));
+
+  // We never expect _Unwind_Resume to return.
+  new UnreachableInst(Ctx, UnwindBB);
+  return true;
+}
+
+bool DwarfEHPrepare::runOnFunction(Function &Fn) {
+  bool Changed = InsertUnwindResumeCalls(Fn);
+  return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
new file mode 100644
index 000000000000..5447df09cbb2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -0,0 +1,801 @@
+//===-- EarlyIfConversion.cpp - If-conversion on SSA form machine code ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Early if-conversion is for out-of-order CPUs that don't have a lot of
+// predicable instructions. The goal is to eliminate conditional branches that
+// may mispredict.
+//
+// Instructions from both sides of the branch are executed specutatively, and a
+// cmov instruction selects the result.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "early-ifcvt"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned>
+BlockInstrLimit("early-ifcvt-limit", cl::init(30), cl::Hidden,
+  cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("stress-early-ifcvt", cl::Hidden,
+  cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumDiamondsSeen,  "Number of diamonds");
+STATISTIC(NumDiamondsConv,  "Number of diamonds converted");
+STATISTIC(NumTrianglesSeen, "Number of triangles");
+STATISTIC(NumTrianglesConv, "Number of triangles converted");
+
+//===----------------------------------------------------------------------===//
+//                                 SSAIfConv
+//===----------------------------------------------------------------------===//
+//
+// The SSAIfConv class performs if-conversion on SSA form machine code after
+// determining if it is possible. The class contains no heuristics; external
+// code should be used to determine when if-conversion is a good idea.
+//
+// SSAIfConv can convert both triangles and diamonds:
+//
+//   Triangle: Head              Diamond: Head
+//              | \                       /  \_
+//              |  \                     /    |
+//              |  [TF]BB              FBB    TBB
+//              |  /                     \    /
+//              | /                       \  /
+//             Tail                       Tail
+//
+// Instructions in the conditional blocks TBB and/or FBB are spliced into the
+// Head block, and phis in the Tail block are converted to select instructions.
+//
+namespace {
+class SSAIfConv {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+public:
+  /// The block containing the conditional branch.
+  MachineBasicBlock *Head;
+
+  /// The block containing phis after the if-then-else.
+  MachineBasicBlock *Tail;
+
+  /// The 'true' conditional block as determined by AnalyzeBranch.
+  MachineBasicBlock *TBB;
+
+  /// The 'false' conditional block as determined by AnalyzeBranch.
+  MachineBasicBlock *FBB;
+
+  /// isTriangle - When there is no 'else' block, either TBB or FBB will be
+  /// equal to Tail.
+  bool isTriangle() const { return TBB == Tail || FBB == Tail; }
+
+  /// Returns the Tail predecessor for the True side.
+  MachineBasicBlock *getTPred() const { return TBB == Tail ? Head : TBB; }
+
+  /// Returns the Tail predecessor for the  False side.
+  MachineBasicBlock *getFPred() const { return FBB == Tail ? Head : FBB; }
+
+  /// Information about each phi in the Tail block.
+  struct PHIInfo {
+    MachineInstr *PHI;
+    unsigned TReg, FReg;
+    // Latencies from Cond+Branch, TReg, and FReg to DstReg.
+    int CondCycles, TCycles, FCycles;
+
+    PHIInfo(MachineInstr *phi)
+      : PHI(phi), TReg(0), FReg(0), CondCycles(0), TCycles(0), FCycles(0) {}
+  };
+
+  SmallVector<PHIInfo, 8> PHIs;
+
+private:
+  /// The branch condition determined by AnalyzeBranch.
+  SmallVector<MachineOperand, 4> Cond;
+
+  /// Instructions in Head that define values used by the conditional blocks.
+  /// The hoisted instructions must be inserted after these instructions.
+  SmallPtrSet<MachineInstr*, 8> InsertAfter;
+
+  /// Register units clobbered by the conditional blocks.
+  BitVector ClobberedRegUnits;
+
+  // Scratch pad for findInsertionPoint.
+  SparseSet<unsigned> LiveRegUnits;
+
+  /// Insertion point in Head for speculatively executed instructions form TBB
+  /// and FBB.
+  MachineBasicBlock::iterator InsertionPoint;
+
+  /// Return true if all non-terminator instructions in MBB can be safely
+  /// speculated.
+  bool canSpeculateInstrs(MachineBasicBlock *MBB);
+
+  /// Find a valid insertion point in Head.
+  bool findInsertionPoint();
+
+  /// Replace PHI instructions in Tail with selects.
+  void replacePHIInstrs();
+
+  /// Insert selects and rewrite PHI operands to use them.
+  void rewritePHIOperands();
+
+public:
+  /// runOnMachineFunction - Initialize per-function data structures.
+  void runOnMachineFunction(MachineFunction &MF) {
+    TII = MF.getTarget().getInstrInfo();
+    TRI = MF.getTarget().getRegisterInfo();
+    MRI = &MF.getRegInfo();
+    LiveRegUnits.clear();
+    LiveRegUnits.setUniverse(TRI->getNumRegUnits());
+    ClobberedRegUnits.clear();
+    ClobberedRegUnits.resize(TRI->getNumRegUnits());
+  }
+
+  /// canConvertIf - If the sub-CFG headed by MBB can be if-converted,
+  /// initialize the internal state, and return true.
+  bool canConvertIf(MachineBasicBlock *MBB);
+
+  /// convertIf - If-convert the last block passed to canConvertIf(), assuming
+  /// it is possible. Add any erased blocks to RemovedBlocks.
+  void convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks);
+};
+} // end anonymous namespace
+
+
+/// canSpeculateInstrs - Returns true if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// If instructions use any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+///
+/// Any clobbered regunits are added to ClobberedRegUnits.
+///
+bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
+  // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+  // get right.
+  if (!MBB->livein_empty()) {
+    DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+    return false;
+  }
+
+  unsigned InstrCount = 0;
+
+  // Check all instructions, except the terminators. It is assumed that
+  // terminators never have side effects or define any used register values.
+  for (MachineBasicBlock::iterator I = MBB->begin(),
+       E = MBB->getFirstTerminator(); I != E; ++I) {
+    if (I->isDebugValue())
+      continue;
+
+    if (++InstrCount > BlockInstrLimit && !Stress) {
+      DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+                   << BlockInstrLimit << " instructions.\n");
+      return false;
+    }
+
+    // There shouldn't normally be any phis in a single-predecessor block.
+    if (I->isPHI()) {
+      DEBUG(dbgs() << "Can't hoist: " << *I);
+      return false;
+    }
+
+    // Don't speculate loads. Note that it may be possible and desirable to
+    // speculate GOT or constant pool loads that are guaranteed not to trap,
+    // but we don't support that for now.
+    if (I->mayLoad()) {
+      DEBUG(dbgs() << "Won't speculate load: " << *I);
+      return false;
+    }
+
+    // We never speculate stores, so an AA pointer isn't necessary.
+    bool DontMoveAcrossStore = true;
+    if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) {
+      DEBUG(dbgs() << "Can't speculate: " << *I);
+      return false;
+    }
+
+    // Check for any dependencies on Head instructions.
+    for (MIOperands MO(I); MO.isValid(); ++MO) {
+      if (MO->isRegMask()) {
+        DEBUG(dbgs() << "Won't speculate regmask: " << *I);
+        return false;
+      }
+      if (!MO->isReg())
+        continue;
+      unsigned Reg = MO->getReg();
+
+      // Remember clobbered regunits.
+      if (MO->isDef() && TargetRegisterInfo::isPhysicalRegister(Reg))
+        for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+          ClobberedRegUnits.set(*Units);
+
+      if (!MO->readsReg() || !TargetRegisterInfo::isVirtualRegister(Reg))
+        continue;
+      MachineInstr *DefMI = MRI->getVRegDef(Reg);
+      if (!DefMI || DefMI->getParent() != Head)
+        continue;
+      if (InsertAfter.insert(DefMI))
+        DEBUG(dbgs() << "BB#" << MBB->getNumber() << " depends on " << *DefMI);
+      if (DefMI->isTerminator()) {
+        DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+
+/// Find an insertion point in Head for the speculated instructions. The
+/// insertion point must be:
+///
+/// 1. Before any terminators.
+/// 2. After any instructions in InsertAfter.
+/// 3. Not have any clobbered regunits live.
+///
+/// This function sets InsertionPoint and returns true when successful, it
+/// returns false if no valid insertion point could be found.
+///
+bool SSAIfConv::findInsertionPoint() {
+  // Keep track of live regunits before the current position.
+  // Only track RegUnits that are also in ClobberedRegUnits.
+  LiveRegUnits.clear();
+  SmallVector<unsigned, 8> Reads;
+  MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+  MachineBasicBlock::iterator I = Head->end();
+  MachineBasicBlock::iterator B = Head->begin();
+  while (I != B) {
+    --I;
+    // Some of the conditional code depends in I.
+    if (InsertAfter.count(I)) {
+      DEBUG(dbgs() << "Can't insert code after " << *I);
+      return false;
+    }
+
+    // Update live regunits.
+    for (MIOperands MO(I); MO.isValid(); ++MO) {
+      // We're ignoring regmask operands. That is conservatively correct.
+      if (!MO->isReg())
+        continue;
+      unsigned Reg = MO->getReg();
+      if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+        continue;
+      // I clobbers Reg, so it isn't live before I.
+      if (MO->isDef())
+        for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+          LiveRegUnits.erase(*Units);
+      // Unless I reads Reg.
+      if (MO->readsReg())
+        Reads.push_back(Reg);
+    }
+    // Anything read by I is live before I.
+    while (!Reads.empty())
+      for (MCRegUnitIterator Units(Reads.pop_back_val(), TRI); Units.isValid();
+           ++Units)
+        if (ClobberedRegUnits.test(*Units))
+          LiveRegUnits.insert(*Units);
+
+    // We can't insert before a terminator.
+    if (I != FirstTerm && I->isTerminator())
+      continue;
+
+    // Some of the clobbered registers are live before I, not a valid insertion
+    // point.
+    if (!LiveRegUnits.empty()) {
+      DEBUG({
+        dbgs() << "Would clobber";
+        for (SparseSet<unsigned>::const_iterator
+             i = LiveRegUnits.begin(), e = LiveRegUnits.end(); i != e; ++i)
+          dbgs() << ' ' << PrintRegUnit(*i, TRI);
+        dbgs() << " live before " << *I;
+      });
+      continue;
+    }
+
+    // This is a valid insertion point.
+    InsertionPoint = I;
+    DEBUG(dbgs() << "Can insert before " << *I);
+    return true;
+  }
+  DEBUG(dbgs() << "No legal insertion point found.\n");
+  return false;
+}
+
+
+
+/// canConvertIf - analyze the sub-cfg rooted in MBB, and return true if it is
+/// a potential candidate for if-conversion. Fill out the internal state.
+///
+bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
+  Head = MBB;
+  TBB = FBB = Tail = 0;
+
+  if (Head->succ_size() != 2)
+    return false;
+  MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+  MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+  // Canonicalize so Succ0 has MBB as its single predecessor.
+  if (Succ0->pred_size() != 1)
+    std::swap(Succ0, Succ1);
+
+  if (Succ0->pred_size() != 1 || Succ0->succ_size() != 1)
+    return false;
+
+  Tail = Succ0->succ_begin()[0];
+
+  // This is not a triangle.
+  if (Tail != Succ1) {
+    // Check for a diamond. We won't deal with any critical edges.
+    if (Succ1->pred_size() != 1 || Succ1->succ_size() != 1 ||
+        Succ1->succ_begin()[0] != Tail)
+      return false;
+    DEBUG(dbgs() << "\nDiamond: BB#" << Head->getNumber()
+                 << " -> BB#" << Succ0->getNumber()
+                 << "/BB#" << Succ1->getNumber()
+                 << " -> BB#" << Tail->getNumber() << '\n');
+
+    // Live-in physregs are tricky to get right when speculating code.
+    if (!Tail->livein_empty()) {
+      DEBUG(dbgs() << "Tail has live-ins.\n");
+      return false;
+    }
+  } else {
+    DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber()
+                 << " -> BB#" << Succ0->getNumber()
+                 << " -> BB#" << Tail->getNumber() << '\n');
+  }
+
+  // This is a triangle or a diamond.
+  // If Tail doesn't have any phis, there must be side effects.
+  if (Tail->empty() || !Tail->front().isPHI()) {
+    DEBUG(dbgs() << "No phis in tail.\n");
+    return false;
+  }
+
+  // The branch we're looking to eliminate must be analyzable.
+  Cond.clear();
+  if (TII->AnalyzeBranch(*Head, TBB, FBB, Cond)) {
+    DEBUG(dbgs() << "Branch not analyzable.\n");
+    return false;
+  }
+
+  // This is weird, probably some sort of degenerate CFG.
+  if (!TBB) {
+    DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch.\n");
+    return false;
+  }
+
+  // AnalyzeBranch doesn't set FBB on a fall-through branch.
+  // Make sure it is always set.
+  FBB = TBB == Succ0 ? Succ1 : Succ0;
+
+  // Any phis in the tail block must be convertible to selects.
+  PHIs.clear();
+  MachineBasicBlock *TPred = getTPred();
+  MachineBasicBlock *FPred = getFPred();
+  for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
+       I != E && I->isPHI(); ++I) {
+    PHIs.push_back(&*I);
+    PHIInfo &PI = PHIs.back();
+    // Find PHI operands corresponding to TPred and FPred.
+    for (unsigned i = 1; i != PI.PHI->getNumOperands(); i += 2) {
+      if (PI.PHI->getOperand(i+1).getMBB() == TPred)
+        PI.TReg = PI.PHI->getOperand(i).getReg();
+      if (PI.PHI->getOperand(i+1).getMBB() == FPred)
+        PI.FReg = PI.PHI->getOperand(i).getReg();
+    }
+    assert(TargetRegisterInfo::isVirtualRegister(PI.TReg) && "Bad PHI");
+    assert(TargetRegisterInfo::isVirtualRegister(PI.FReg) && "Bad PHI");
+
+    // Get target information.
+    if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg,
+                              PI.CondCycles, PI.TCycles, PI.FCycles)) {
+      DEBUG(dbgs() << "Can't convert: " << *PI.PHI);
+      return false;
+    }
+  }
+
+  // Check that the conditional instructions can be speculated.
+  InsertAfter.clear();
+  ClobberedRegUnits.reset();
+  if (TBB != Tail && !canSpeculateInstrs(TBB))
+    return false;
+  if (FBB != Tail && !canSpeculateInstrs(FBB))
+    return false;
+
+  // Try to find a valid insertion point for the speculated instructions in the
+  // head basic block.
+  if (!findInsertionPoint())
+    return false;
+
+  if (isTriangle())
+    ++NumTrianglesSeen;
+  else
+    ++NumDiamondsSeen;
+  return true;
+}
+
+/// replacePHIInstrs - Completely replace PHI instructions with selects.
+/// This is possible when the only Tail predecessors are the if-converted
+/// blocks.
+void SSAIfConv::replacePHIInstrs() {
+  assert(Tail->pred_size() == 2 && "Cannot replace PHIs");
+  MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+  assert(FirstTerm != Head->end() && "No terminators");
+  DebugLoc HeadDL = FirstTerm->getDebugLoc();
+
+  // Convert all PHIs to select instructions inserted before FirstTerm.
+  for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+    PHIInfo &PI = PHIs[i];
+    DEBUG(dbgs() << "If-converting " << *PI.PHI);
+    unsigned DstReg = PI.PHI->getOperand(0).getReg();
+    TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg);
+    DEBUG(dbgs() << "          --> " << *llvm::prior(FirstTerm));
+    PI.PHI->eraseFromParent();
+    PI.PHI = 0;
+  }
+}
+
+/// rewritePHIOperands - When there are additional Tail predecessors, insert
+/// select instructions in Head and rewrite PHI operands to use the selects.
+/// Keep the PHI instructions in Tail to handle the other predecessors.
+void SSAIfConv::rewritePHIOperands() {
+  MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+  assert(FirstTerm != Head->end() && "No terminators");
+  DebugLoc HeadDL = FirstTerm->getDebugLoc();
+
+  // Convert all PHIs to select instructions inserted before FirstTerm.
+  for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+    PHIInfo &PI = PHIs[i];
+    DEBUG(dbgs() << "If-converting " << *PI.PHI);
+    unsigned PHIDst = PI.PHI->getOperand(0).getReg();
+    unsigned DstReg = MRI->createVirtualRegister(MRI->getRegClass(PHIDst));
+    TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg);
+    DEBUG(dbgs() << "          --> " << *llvm::prior(FirstTerm));
+
+    // Rewrite PHI operands TPred -> (DstReg, Head), remove FPred.
+    for (unsigned i = PI.PHI->getNumOperands(); i != 1; i -= 2) {
+      MachineBasicBlock *MBB = PI.PHI->getOperand(i-1).getMBB();
+      if (MBB == getTPred()) {
+        PI.PHI->getOperand(i-1).setMBB(Head);
+        PI.PHI->getOperand(i-2).setReg(DstReg);
+      } else if (MBB == getFPred()) {
+        PI.PHI->RemoveOperand(i-1);
+        PI.PHI->RemoveOperand(i-2);
+      }
+    }
+    DEBUG(dbgs() << "          --> " << *PI.PHI);
+  }
+}
+
+/// convertIf - Execute the if conversion after canConvertIf has determined the
+/// feasibility.
+///
+/// Any basic blocks erased will be added to RemovedBlocks.
+///
+void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
+  assert(Head && Tail && TBB && FBB && "Call canConvertIf first.");
+
+  // Update statistics.
+  if (isTriangle())
+    ++NumTrianglesConv;
+  else
+    ++NumDiamondsConv;
+
+  // Move all instructions into Head, except for the terminators.
+  if (TBB != Tail)
+    Head->splice(InsertionPoint, TBB, TBB->begin(), TBB->getFirstTerminator());
+  if (FBB != Tail)
+    Head->splice(InsertionPoint, FBB, FBB->begin(), FBB->getFirstTerminator());
+
+  // Are there extra Tail predecessors?
+  bool ExtraPreds = Tail->pred_size() != 2;
+  if (ExtraPreds)
+    rewritePHIOperands();
+  else
+    replacePHIInstrs();
+
+  // Fix up the CFG, temporarily leave Head without any successors.
+  Head->removeSuccessor(TBB);
+  Head->removeSuccessor(FBB);
+  if (TBB != Tail)
+    TBB->removeSuccessor(Tail);
+  if (FBB != Tail)
+    FBB->removeSuccessor(Tail);
+
+  // Fix up Head's terminators.
+  // It should become a single branch or a fallthrough.
+  DebugLoc HeadDL = Head->getFirstTerminator()->getDebugLoc();
+  TII->RemoveBranch(*Head);
+
+  // Erase the now empty conditional blocks. It is likely that Head can fall
+  // through to Tail, and we can join the two blocks.
+  if (TBB != Tail) {
+    RemovedBlocks.push_back(TBB);
+    TBB->eraseFromParent();
+  }
+  if (FBB != Tail) {
+    RemovedBlocks.push_back(FBB);
+    FBB->eraseFromParent();
+  }
+
+  assert(Head->succ_empty() && "Additional head successors?");
+  if (!ExtraPreds && Head->isLayoutSuccessor(Tail)) {
+    // Splice Tail onto the end of Head.
+    DEBUG(dbgs() << "Joining tail BB#" << Tail->getNumber()
+                 << " into head BB#" << Head->getNumber() << '\n');
+    Head->splice(Head->end(), Tail,
+                     Tail->begin(), Tail->end());
+    Head->transferSuccessorsAndUpdatePHIs(Tail);
+    RemovedBlocks.push_back(Tail);
+    Tail->eraseFromParent();
+  } else {
+    // We need a branch to Tail, let code placement work it out later.
+    DEBUG(dbgs() << "Converting to unconditional branch.\n");
+    SmallVector<MachineOperand, 0> EmptyCond;
+    TII->InsertBranch(*Head, Tail, 0, EmptyCond, HeadDL);
+    Head->addSuccessor(Tail);
+  }
+  DEBUG(dbgs() << *Head);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                           EarlyIfConverter Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class EarlyIfConverter : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const MCSchedModel *SchedModel;
+  MachineRegisterInfo *MRI;
+  MachineDominatorTree *DomTree;
+  MachineLoopInfo *Loops;
+  MachineTraceMetrics *Traces;
+  MachineTraceMetrics::Ensemble *MinInstr;
+  SSAIfConv IfConv;
+
+public:
+  static char ID;
+  EarlyIfConverter() : MachineFunctionPass(ID) {}
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnMachineFunction(MachineFunction &MF);
+  const char *getPassName() const { return "Early If-Conversion"; }
+
+private:
+  bool tryConvertIf(MachineBasicBlock*);
+  void updateDomTree(ArrayRef<MachineBasicBlock*> Removed);
+  void updateLoops(ArrayRef<MachineBasicBlock*> Removed);
+  void invalidateTraces();
+  bool shouldConvertIf();
+};
+} // end anonymous namespace
+
+char EarlyIfConverter::ID = 0;
+char &llvm::EarlyIfConverterID = EarlyIfConverter::ID;
+
+INITIALIZE_PASS_BEGIN(EarlyIfConverter,
+                      "early-ifcvt", "Early If Converter", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(EarlyIfConverter,
+                      "early-ifcvt", "Early If Converter", false, false)
+
+void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineBranchProbabilityInfo>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addRequired<MachineTraceMetrics>();
+  AU.addPreserved<MachineTraceMetrics>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Update the dominator tree after if-conversion erased some blocks.
+void EarlyIfConverter::updateDomTree(ArrayRef<MachineBasicBlock*> Removed) {
+  // convertIf can remove TBB, FBB, and Tail can be merged into Head.
+  // TBB and FBB should not dominate any blocks.
+  // Tail children should be transferred to Head.
+  MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head);
+  for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
+    MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
+    assert(Node != HeadNode && "Cannot erase the head node");
+    while (Node->getNumChildren()) {
+      assert(Node->getBlock() == IfConv.Tail && "Unexpected children");
+      DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+    }
+    DomTree->eraseNode(Removed[i]);
+  }
+}
+
+/// Update LoopInfo after if-conversion.
+void EarlyIfConverter::updateLoops(ArrayRef<MachineBasicBlock*> Removed) {
+  if (!Loops)
+    return;
+  // If-conversion doesn't change loop structure, and it doesn't mess with back
+  // edges, so updating LoopInfo is simply removing the dead blocks.
+  for (unsigned i = 0, e = Removed.size(); i != e; ++i)
+    Loops->removeBlock(Removed[i]);
+}
+
+/// Invalidate MachineTraceMetrics before if-conversion.
+void EarlyIfConverter::invalidateTraces() {
+  Traces->verifyAnalysis();
+  Traces->invalidate(IfConv.Head);
+  Traces->invalidate(IfConv.Tail);
+  Traces->invalidate(IfConv.TBB);
+  Traces->invalidate(IfConv.FBB);
+  Traces->verifyAnalysis();
+}
+
+// Adjust cycles with downward saturation.
+static unsigned adjCycles(unsigned Cyc, int Delta) {
+  if (Delta < 0 && Cyc + Delta > Cyc)
+    return 0;
+  return Cyc + Delta;
+}
+
+/// Apply cost model and heuristics to the if-conversion in IfConv.
+/// Return true if the conversion is a good idea.
+///
+bool EarlyIfConverter::shouldConvertIf() {
+  // Stress testing mode disables all cost considerations.
+  if (Stress)
+    return true;
+
+  if (!MinInstr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  MachineTraceMetrics::Trace TBBTrace = MinInstr->getTrace(IfConv.getTPred());
+  MachineTraceMetrics::Trace FBBTrace = MinInstr->getTrace(IfConv.getFPred());
+  DEBUG(dbgs() << "TBB: " << TBBTrace << "FBB: " << FBBTrace);
+  unsigned MinCrit = std::min(TBBTrace.getCriticalPath(),
+                              FBBTrace.getCriticalPath());
+
+  // Set a somewhat arbitrary limit on the critical path extension we accept.
+  unsigned CritLimit = SchedModel->MispredictPenalty/2;
+
+  // If-conversion only makes sense when there is unexploited ILP. Compute the
+  // maximum-ILP resource length of the trace after if-conversion. Compare it
+  // to the shortest critical path.
+  SmallVector<const MachineBasicBlock*, 1> ExtraBlocks;
+  if (IfConv.TBB != IfConv.Tail)
+    ExtraBlocks.push_back(IfConv.TBB);
+  unsigned ResLength = FBBTrace.getResourceLength(ExtraBlocks);
+  DEBUG(dbgs() << "Resource length " << ResLength
+               << ", minimal critical path " << MinCrit << '\n');
+  if (ResLength > MinCrit + CritLimit) {
+    DEBUG(dbgs() << "Not enough available ILP.\n");
+    return false;
+  }
+
+  // Assume that the depth of the first head terminator will also be the depth
+  // of the select instruction inserted, as determined by the flag dependency.
+  // TBB / FBB data dependencies may delay the select even more.
+  MachineTraceMetrics::Trace HeadTrace = MinInstr->getTrace(IfConv.Head);
+  unsigned BranchDepth =
+    HeadTrace.getInstrCycles(IfConv.Head->getFirstTerminator()).Depth;
+  DEBUG(dbgs() << "Branch depth: " << BranchDepth << '\n');
+
+  // Look at all the tail phis, and compute the critical path extension caused
+  // by inserting select instructions.
+  MachineTraceMetrics::Trace TailTrace = MinInstr->getTrace(IfConv.Tail);
+  for (unsigned i = 0, e = IfConv.PHIs.size(); i != e; ++i) {
+    SSAIfConv::PHIInfo &PI = IfConv.PHIs[i];
+    unsigned Slack = TailTrace.getInstrSlack(PI.PHI);
+    unsigned MaxDepth = Slack + TailTrace.getInstrCycles(PI.PHI).Depth;
+    DEBUG(dbgs() << "Slack " << Slack << ":\t" << *PI.PHI);
+
+    // The condition is pulled into the critical path.
+    unsigned CondDepth = adjCycles(BranchDepth, PI.CondCycles);
+    if (CondDepth > MaxDepth) {
+      unsigned Extra = CondDepth - MaxDepth;
+      DEBUG(dbgs() << "Condition adds " << Extra << " cycles.\n");
+      if (Extra > CritLimit) {
+        DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+        return false;
+      }
+    }
+
+    // The TBB value is pulled into the critical path.
+    unsigned TDepth = adjCycles(TBBTrace.getPHIDepth(PI.PHI), PI.TCycles);
+    if (TDepth > MaxDepth) {
+      unsigned Extra = TDepth - MaxDepth;
+      DEBUG(dbgs() << "TBB data adds " << Extra << " cycles.\n");
+      if (Extra > CritLimit) {
+        DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+        return false;
+      }
+    }
+
+    // The FBB value is pulled into the critical path.
+    unsigned FDepth = adjCycles(FBBTrace.getPHIDepth(PI.PHI), PI.FCycles);
+    if (FDepth > MaxDepth) {
+      unsigned Extra = FDepth - MaxDepth;
+      DEBUG(dbgs() << "FBB data adds " << Extra << " cycles.\n");
+      if (Extra > CritLimit) {
+        DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// Attempt repeated if-conversion on MBB, return true if successful.
+///
+bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
+  bool Changed = false;
+  while (IfConv.canConvertIf(MBB) && shouldConvertIf()) {
+    // If-convert MBB and update analyses.
+    invalidateTraces();
+    SmallVector<MachineBasicBlock*, 4> RemovedBlocks;
+    IfConv.convertIf(RemovedBlocks);
+    Changed = true;
+    updateDomTree(RemovedBlocks);
+    updateLoops(RemovedBlocks);
+  }
+  return Changed;
+}
+
+bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  SchedModel =
+    MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
+  MRI = &MF.getRegInfo();
+  DomTree = &getAnalysis<MachineDominatorTree>();
+  Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+  Traces = &getAnalysis<MachineTraceMetrics>();
+  MinInstr = 0;
+
+  bool Changed = false;
+  IfConv.runOnMachineFunction(MF);
+
+  // Visit blocks in dominator tree post-order. The post-order enables nested
+  // if-conversion in a single pass. The tryConvertIf() function may erase
+  // blocks, but only blocks dominated by the head block. This makes it safe to
+  // update the dominator tree while the post-order iterator is still active.
+  for (po_iterator<MachineDominatorTree*>
+       I = po_begin(DomTree), E = po_end(DomTree); I != E; ++I)
+    if (tryConvertIf(I->getBlock()))
+      Changed = true;
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/EdgeBundles.cpp b/contrib/llvm/lib/CodeGen/EdgeBundles.cpp
new file mode 100644
index 000000000000..3bb04657b58a
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/EdgeBundles.cpp
@@ -0,0 +1,97 @@
+//===-------- EdgeBundles.cpp - Bundles of CFG edges ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the implementation of the EdgeBundles analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/GraphWriter.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ViewEdgeBundles("view-edge-bundles", cl::Hidden,
+                cl::desc("Pop up a window to show edge bundle graphs"));
+
+char EdgeBundles::ID = 0;
+
+INITIALIZE_PASS(EdgeBundles, "edge-bundles", "Bundle Machine CFG Edges",
+                /* cfg = */true, /* analysis = */ true)
+
+char &llvm::EdgeBundlesID = EdgeBundles::ID;
+
+void EdgeBundles::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool EdgeBundles::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  EC.clear();
+  EC.grow(2 * MF->getNumBlockIDs());
+
+  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E;
+       ++I) {
+    const MachineBasicBlock &MBB = *I;
+    unsigned OutE = 2 * MBB.getNumber() + 1;
+    // Join the outgoing bundle with the ingoing bundles of all successors.
+    for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+           SE = MBB.succ_end(); SI != SE; ++SI)
+      EC.join(OutE, 2 * (*SI)->getNumber());
+  }
+  EC.compress();
+  if (ViewEdgeBundles)
+    view();
+
+  // Compute the reverse mapping.
+  Blocks.clear();
+  Blocks.resize(getNumBundles());
+
+  for (unsigned i = 0, e = MF->getNumBlockIDs(); i != e; ++i) {
+    unsigned b0 = getBundle(i, 0);
+    unsigned b1 = getBundle(i, 1);
+    Blocks[b0].push_back(i);
+    if (b1 != b0)
+      Blocks[b1].push_back(i);
+  }
+
+  return false;
+}
+
+/// view - Visualize the annotated bipartite CFG with Graphviz.
+void EdgeBundles::view() const {
+  ViewGraph(*this, "EdgeBundles");
+}
+
+/// Specialize WriteGraph, the standard implementation won't work.
+raw_ostream &llvm::WriteGraph(raw_ostream &O, const EdgeBundles &G,
+                              bool ShortNames,
+                              const Twine &Title) {
+  const MachineFunction *MF = G.getMachineFunction();
+
+  O << "digraph {\n";
+  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+       I != E; ++I) {
+    unsigned BB = I->getNumber();
+    O << "\t\"BB#" << BB << "\" [ shape=box ]\n"
+      << '\t' << G.getBundle(BB, false) << " -> \"BB#" << BB << "\"\n"
+      << "\t\"BB#" << BB << "\" -> " << G.getBundle(BB, true) << '\n';
+    for (MachineBasicBlock::const_succ_iterator SI = I->succ_begin(),
+           SE = I->succ_end(); SI != SE; ++SI)
+      O << "\t\"BB#" << BB << "\" -> \"BB#" << (*SI)->getNumber()
+        << "\" [ color=lightgray ]\n";
+  }
+  O << "}\n";
+  return O;
+}
diff --git a/contrib/llvm/lib/CodeGen/ErlangGC.cpp b/contrib/llvm/lib/CodeGen/ErlangGC.cpp
new file mode 100644
index 000000000000..8a1e2d9c99a8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ErlangGC.cpp
@@ -0,0 +1,81 @@
+//===-- ErlangGC.cpp - Erlang/OTP GC strategy -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Erlang/OTP runtime-compatible garbage collector
+// (e.g. defines safe points, root initialization etc.)
+//
+// The frametable emitter is in ErlangGCPrinter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+  class ErlangGC : public GCStrategy {
+    MCSymbol *InsertLabel(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          DebugLoc DL) const;
+  public:
+    ErlangGC();
+    bool findCustomSafePoints(GCFunctionInfo &FI, MachineFunction &MF);
+  };
+
+}
+
+static GCRegistry::Add<ErlangGC>
+X("erlang", "erlang-compatible garbage collector");
+
+void llvm::linkErlangGC() { }
+
+ErlangGC::ErlangGC() {
+  InitRoots = false;
+  NeededSafePoints = 1 << GC::PostCall;
+  UsesMetadata = true;
+  CustomRoots = false;
+  CustomSafePoints = true;
+}
+
+MCSymbol *ErlangGC::InsertLabel(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                DebugLoc DL) const {
+  const TargetInstrInfo* TII = MBB.getParent()->getTarget().getInstrInfo();
+  MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
+  BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
+  return Label;
+}
+
+bool ErlangGC::findCustomSafePoints(GCFunctionInfo &FI, MachineFunction &MF) {
+  for (MachineFunction::iterator BBI = MF.begin(), BBE = MF.end(); BBI != BBE;
+       ++BBI)
+    for (MachineBasicBlock::iterator MI = BBI->begin(), ME = BBI->end();
+         MI != ME; ++MI)
+
+      if (MI->getDesc().isCall()) {
+
+        // Do not treat tail call sites as safe points.
+        if (MI->getDesc().isTerminator())
+          continue;
+
+        /* Code copied from VisitCallPoint(...) */
+        MachineBasicBlock::iterator RAI = MI; ++RAI;
+        MCSymbol* Label = InsertLabel(*MI->getParent(), RAI, MI->getDebugLoc());
+        FI.addSafePoint(GC::PostCall, Label, MI->getDebugLoc());
+      }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp
new file mode 100644
index 000000000000..9b0e76fa20cb
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp
@@ -0,0 +1,725 @@
+//===- ExecutionDepsFix.cpp - Fix execution dependecy issues ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the execution dependency fix pass.
+//
+// Some X86 SSE instructions like mov, and, or, xor are available in different
+// variants for different operand types. These variant instructions are
+// equivalent, but on Nehalem and newer cpus there is extra latency
+// transferring data between integer and floating point domains.  ARM cores
+// have similar issues when they are configured with both VFP and NEON
+// pipelines.
+//
+// This pass changes the variant instructions to minimize domain crossings.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "execution-fix"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
+/// of execution domains.
+///
+/// An open DomainValue represents a set of instructions that can still switch
+/// execution domain. Multiple registers may refer to the same open
+/// DomainValue - they will eventually be collapsed to the same execution
+/// domain.
+///
+/// A collapsed DomainValue represents a single register that has been forced
+/// into one of more execution domains. There is a separate collapsed
+/// DomainValue for each register, but it may contain multiple execution
+/// domains. A register value is initially created in a single execution
+/// domain, but if we were forced to pay the penalty of a domain crossing, we
+/// keep track of the fact that the register is now available in multiple
+/// domains.
+namespace {
+struct DomainValue {
+  // Basic reference counting.
+  unsigned Refs;
+
+  // Bitmask of available domains. For an open DomainValue, it is the still
+  // possible domains for collapsing. For a collapsed DomainValue it is the
+  // domains where the register is available for free.
+  unsigned AvailableDomains;
+
+  // Pointer to the next DomainValue in a chain.  When two DomainValues are
+  // merged, Victim.Next is set to point to Victor, so old DomainValue
+  // references can be updated by following the chain.
+  DomainValue *Next;
+
+  // Twiddleable instructions using or defining these registers.
+  SmallVector<MachineInstr*, 8> Instrs;
+
+  // A collapsed DomainValue has no instructions to twiddle - it simply keeps
+  // track of the domains where the registers are already available.
+  bool isCollapsed() const { return Instrs.empty(); }
+
+  // Is domain available?
+  bool hasDomain(unsigned domain) const {
+    return AvailableDomains & (1u << domain);
+  }
+
+  // Mark domain as available.
+  void addDomain(unsigned domain) {
+    AvailableDomains |= 1u << domain;
+  }
+
+  // Restrict to a single domain available.
+  void setSingleDomain(unsigned domain) {
+    AvailableDomains = 1u << domain;
+  }
+
+  // Return bitmask of domains that are available and in mask.
+  unsigned getCommonDomains(unsigned mask) const {
+    return AvailableDomains & mask;
+  }
+
+  // First domain available.
+  unsigned getFirstDomain() const {
+    return CountTrailingZeros_32(AvailableDomains);
+  }
+
+  DomainValue() : Refs(0) { clear(); }
+
+  // Clear this DomainValue and point to next which has all its data.
+  void clear() {
+    AvailableDomains = 0;
+    Next = 0;
+    Instrs.clear();
+  }
+};
+}
+
+namespace {
+/// LiveReg - Information about a live register.
+struct LiveReg {
+  /// Value currently in this register, or NULL when no value is being tracked.
+  /// This counts as a DomainValue reference.
+  DomainValue *Value;
+
+  /// Instruction that defined this register, relative to the beginning of the
+  /// current basic block.  When a LiveReg is used to represent a live-out
+  /// register, this value is relative to the end of the basic block, so it
+  /// will be a negative number.
+  int Def;
+};
+} // anonynous namespace
+
+namespace {
+class ExeDepsFix : public MachineFunctionPass {
+  static char ID;
+  SpecificBumpPtrAllocator<DomainValue> Allocator;
+  SmallVector<DomainValue*,16> Avail;
+
+  const TargetRegisterClass *const RC;
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  std::vector<int> AliasMap;
+  const unsigned NumRegs;
+  LiveReg *LiveRegs;
+  typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
+  LiveOutMap LiveOuts;
+
+  /// Current instruction number.
+  /// The first instruction in each basic block is 0.
+  int CurInstr;
+
+  /// True when the current block has a predecessor that hasn't been visited
+  /// yet.
+  bool SeenUnknownBackEdge;
+
+public:
+  ExeDepsFix(const TargetRegisterClass *rc)
+    : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {}
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual const char *getPassName() const {
+    return "Execution dependency fix";
+  }
+
+private:
+  // Register mapping.
+  int regIndex(unsigned Reg);
+
+  // DomainValue allocation.
+  DomainValue *alloc(int domain = -1);
+  DomainValue *retain(DomainValue *DV) {
+    if (DV) ++DV->Refs;
+    return DV;
+  }
+  void release(DomainValue*);
+  DomainValue *resolve(DomainValue*&);
+
+  // LiveRegs manipulations.
+  void setLiveReg(int rx, DomainValue *DV);
+  void kill(int rx);
+  void force(int rx, unsigned domain);
+  void collapse(DomainValue *dv, unsigned domain);
+  bool merge(DomainValue *A, DomainValue *B);
+
+  void enterBasicBlock(MachineBasicBlock*);
+  void leaveBasicBlock(MachineBasicBlock*);
+  void visitInstr(MachineInstr*);
+  void processDefs(MachineInstr*, bool Kill);
+  void visitSoftInstr(MachineInstr*, unsigned mask);
+  void visitHardInstr(MachineInstr*, unsigned domain);
+};
+}
+
+char ExeDepsFix::ID = 0;
+
+/// Translate TRI register number to an index into our smaller tables of
+/// interesting registers. Return -1 for boring registers.
+int ExeDepsFix::regIndex(unsigned Reg) {
+  assert(Reg < AliasMap.size() && "Invalid register");
+  return AliasMap[Reg];
+}
+
+DomainValue *ExeDepsFix::alloc(int domain) {
+  DomainValue *dv = Avail.empty() ?
+                      new(Allocator.Allocate()) DomainValue :
+                      Avail.pop_back_val();
+  if (domain >= 0)
+    dv->addDomain(domain);
+  assert(dv->Refs == 0 && "Reference count wasn't cleared");
+  assert(!dv->Next && "Chained DomainValue shouldn't have been recycled");
+  return dv;
+}
+
+/// release - Release a reference to DV.  When the last reference is released,
+/// collapse if needed.
+void ExeDepsFix::release(DomainValue *DV) {
+  while (DV) {
+    assert(DV->Refs && "Bad DomainValue");
+    if (--DV->Refs)
+      return;
+
+    // There are no more DV references. Collapse any contained instructions.
+    if (DV->AvailableDomains && !DV->isCollapsed())
+      collapse(DV, DV->getFirstDomain());
+
+    DomainValue *Next = DV->Next;
+    DV->clear();
+    Avail.push_back(DV);
+    // Also release the next DomainValue in the chain.
+    DV = Next;
+  }
+}
+
+/// resolve - Follow the chain of dead DomainValues until a live DomainValue is
+/// reached.  Update the referenced pointer when necessary.
+DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {
+  DomainValue *DV = DVRef;
+  if (!DV || !DV->Next)
+    return DV;
+
+  // DV has a chain. Find the end.
+  do DV = DV->Next;
+  while (DV->Next);
+
+  // Update DVRef to point to DV.
+  retain(DV);
+  release(DVRef);
+  DVRef = DV;
+  return DV;
+}
+
+/// Set LiveRegs[rx] = dv, updating reference counts.
+void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {
+  assert(unsigned(rx) < NumRegs && "Invalid index");
+  assert(LiveRegs && "Must enter basic block first.");
+
+  if (LiveRegs[rx].Value == dv)
+    return;
+  if (LiveRegs[rx].Value)
+    release(LiveRegs[rx].Value);
+  LiveRegs[rx].Value = retain(dv);
+}
+
+// Kill register rx, recycle or collapse any DomainValue.
+void ExeDepsFix::kill(int rx) {
+  assert(unsigned(rx) < NumRegs && "Invalid index");
+  assert(LiveRegs && "Must enter basic block first.");
+  if (!LiveRegs[rx].Value)
+    return;
+
+  release(LiveRegs[rx].Value);
+  LiveRegs[rx].Value = 0;
+}
+
+/// Force register rx into domain.
+void ExeDepsFix::force(int rx, unsigned domain) {
+  assert(unsigned(rx) < NumRegs && "Invalid index");
+  assert(LiveRegs && "Must enter basic block first.");
+  if (DomainValue *dv = LiveRegs[rx].Value) {
+    if (dv->isCollapsed())
+      dv->addDomain(domain);
+    else if (dv->hasDomain(domain))
+      collapse(dv, domain);
+    else {
+      // This is an incompatible open DomainValue. Collapse it to whatever and
+      // force the new value into domain. This costs a domain crossing.
+      collapse(dv, dv->getFirstDomain());
+      assert(LiveRegs[rx].Value && "Not live after collapse?");
+      LiveRegs[rx].Value->addDomain(domain);
+    }
+  } else {
+    // Set up basic collapsed DomainValue.
+    setLiveReg(rx, alloc(domain));
+  }
+}
+
+/// Collapse open DomainValue into given domain. If there are multiple
+/// registers using dv, they each get a unique collapsed DomainValue.
+void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
+  assert(dv->hasDomain(domain) && "Cannot collapse");
+
+  // Collapse all the instructions.
+  while (!dv->Instrs.empty())
+    TII->setExecutionDomain(dv->Instrs.pop_back_val(), domain);
+  dv->setSingleDomain(domain);
+
+  // If there are multiple users, give them new, unique DomainValues.
+  if (LiveRegs && dv->Refs > 1)
+    for (unsigned rx = 0; rx != NumRegs; ++rx)
+      if (LiveRegs[rx].Value == dv)
+        setLiveReg(rx, alloc(domain));
+}
+
+/// Merge - All instructions and registers in B are moved to A, and B is
+/// released.
+bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
+  assert(!A->isCollapsed() && "Cannot merge into collapsed");
+  assert(!B->isCollapsed() && "Cannot merge from collapsed");
+  if (A == B)
+    return true;
+  // Restrict to the domains that A and B have in common.
+  unsigned common = A->getCommonDomains(B->AvailableDomains);
+  if (!common)
+    return false;
+  A->AvailableDomains = common;
+  A->Instrs.append(B->Instrs.begin(), B->Instrs.end());
+
+  // Clear the old DomainValue so we won't try to swizzle instructions twice.
+  B->clear();
+  // All uses of B are referred to A.
+  B->Next = retain(A);
+
+  for (unsigned rx = 0; rx != NumRegs; ++rx)
+    if (LiveRegs[rx].Value == B)
+      setLiveReg(rx, A);
+  return true;
+}
+
+// enterBasicBlock - Set up LiveRegs by merging predecessor live-out values.
+void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
+  // Detect back-edges from predecessors we haven't processed yet.
+  SeenUnknownBackEdge = false;
+
+  // Reset instruction counter in each basic block.
+  CurInstr = 0;
+
+  // Set up LiveRegs to represent registers entering MBB.
+  if (!LiveRegs)
+    LiveRegs = new LiveReg[NumRegs];
+
+  // Default values are 'nothing happened a long time ago'.
+  for (unsigned rx = 0; rx != NumRegs; ++rx) {
+    LiveRegs[rx].Value = 0;
+    LiveRegs[rx].Def = -(1 << 20);
+  }
+
+  // This is the entry block.
+  if (MBB->pred_empty()) {
+    for (MachineBasicBlock::livein_iterator i = MBB->livein_begin(),
+         e = MBB->livein_end(); i != e; ++i) {
+      int rx = regIndex(*i);
+      if (rx < 0)
+        continue;
+      // Treat function live-ins as if they were defined just before the first
+      // instruction.  Usually, function arguments are set up immediately
+      // before the call.
+      LiveRegs[rx].Def = -1;
+    }
+    DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n");
+    return;
+  }
+
+  // Try to coalesce live-out registers from predecessors.
+  for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
+       pe = MBB->pred_end(); pi != pe; ++pi) {
+    LiveOutMap::const_iterator fi = LiveOuts.find(*pi);
+    if (fi == LiveOuts.end()) {
+      SeenUnknownBackEdge = true;
+      continue;
+    }
+    assert(fi->second && "Can't have NULL entries");
+
+    for (unsigned rx = 0; rx != NumRegs; ++rx) {
+      // Use the most recent predecessor def for each register.
+      LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def);
+
+      DomainValue *pdv = resolve(fi->second[rx].Value);
+      if (!pdv)
+        continue;
+      if (!LiveRegs[rx].Value) {
+        setLiveReg(rx, pdv);
+        continue;
+      }
+
+      // We have a live DomainValue from more than one predecessor.
+      if (LiveRegs[rx].Value->isCollapsed()) {
+        // We are already collapsed, but predecessor is not. Force him.
+        unsigned Domain = LiveRegs[rx].Value->getFirstDomain();
+        if (!pdv->isCollapsed() && pdv->hasDomain(Domain))
+          collapse(pdv, Domain);
+        continue;
+      }
+
+      // Currently open, merge in predecessor.
+      if (!pdv->isCollapsed())
+        merge(LiveRegs[rx].Value, pdv);
+      else
+        force(rx, pdv->getFirstDomain());
+    }
+  }
+  DEBUG(dbgs() << "BB#" << MBB->getNumber()
+        << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n"));
+}
+
+void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
+  assert(LiveRegs && "Must enter basic block first.");
+  // Save live registers at end of MBB - used by enterBasicBlock().
+  // Also use LiveOuts as a visited set to detect back-edges.
+  bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second;
+
+  if (First) {
+    // LiveRegs was inserted in LiveOuts.  Adjust all defs to be relative to
+    // the end of this block instead of the beginning.
+    for (unsigned i = 0, e = NumRegs; i != e; ++i)
+      LiveRegs[i].Def -= CurInstr;
+  } else {
+    // Insertion failed, this must be the second pass.
+    // Release all the DomainValues instead of keeping them.
+    for (unsigned i = 0, e = NumRegs; i != e; ++i)
+      release(LiveRegs[i].Value);
+    delete[] LiveRegs;
+  }
+  LiveRegs = 0;
+}
+
+void ExeDepsFix::visitInstr(MachineInstr *MI) {
+  if (MI->isDebugValue())
+    return;
+
+  // Update instructions with explicit execution domains.
+  std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(MI);
+  if (DomP.first) {
+    if (DomP.second)
+      visitSoftInstr(MI, DomP.second);
+    else
+      visitHardInstr(MI, DomP.first);
+  }
+
+  // Process defs to track register ages, and kill values clobbered by generic
+  // instructions.
+  processDefs(MI, !DomP.first);
+}
+
+// Update def-ages for registers defined by MI.
+// If Kill is set, also kill off DomainValues clobbered by the defs.
+void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
+  assert(!MI->isDebugValue() && "Won't process debug values");
+  const MCInstrDesc &MCID = MI->getDesc();
+  for (unsigned i = 0,
+         e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
+         i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    if (MO.isImplicit())
+      break;
+    if (MO.isUse())
+      continue;
+    int rx = regIndex(MO.getReg());
+    if (rx < 0)
+      continue;
+
+    // This instruction explicitly defines rx.
+    DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
+                 << '\t' << *MI);
+
+    // How many instructions since rx was last written?
+    unsigned Clearance = CurInstr - LiveRegs[rx].Def;
+    LiveRegs[rx].Def = CurInstr;
+
+    // Kill off domains redefined by generic instructions.
+    if (Kill)
+      kill(rx);
+
+    // Verify clearance before partial register updates.
+    unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
+    if (!Pref)
+      continue;
+    DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+    if (Pref > Clearance) {
+      DEBUG(dbgs() << ": Break dependency.\n");
+      TII->breakPartialRegDependency(MI, i, TRI);
+      continue;
+    }
+
+    // The current clearance seems OK, but we may be ignoring a def from a
+    // back-edge.
+    if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
+      DEBUG(dbgs() << ": OK.\n");
+      continue;
+    }
+
+    // A def from an unprocessed back-edge may make us break this dependency.
+    DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+  }
+
+  ++CurInstr;
+}
+
+// A hard instruction only works in one domain. All input registers will be
+// forced into that domain.
+void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
+  // Collapse all uses.
+  for (unsigned i = mi->getDesc().getNumDefs(),
+                e = mi->getDesc().getNumOperands(); i != e; ++i) {
+    MachineOperand &mo = mi->getOperand(i);
+    if (!mo.isReg()) continue;
+    int rx = regIndex(mo.getReg());
+    if (rx < 0) continue;
+    force(rx, domain);
+  }
+
+  // Kill all defs and force them.
+  for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
+    MachineOperand &mo = mi->getOperand(i);
+    if (!mo.isReg()) continue;
+    int rx = regIndex(mo.getReg());
+    if (rx < 0) continue;
+    kill(rx);
+    force(rx, domain);
+  }
+}
+
+// A soft instruction can be changed to work in other domains given by mask.
+void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
+  // Bitmask of available domains for this instruction after taking collapsed
+  // operands into account.
+  unsigned available = mask;
+
+  // Scan the explicit use operands for incoming domains.
+  SmallVector<int, 4> used;
+  if (LiveRegs)
+    for (unsigned i = mi->getDesc().getNumDefs(),
+                  e = mi->getDesc().getNumOperands(); i != e; ++i) {
+      MachineOperand &mo = mi->getOperand(i);
+      if (!mo.isReg()) continue;
+      int rx = regIndex(mo.getReg());
+      if (rx < 0) continue;
+      if (DomainValue *dv = LiveRegs[rx].Value) {
+        // Bitmask of domains that dv and available have in common.
+        unsigned common = dv->getCommonDomains(available);
+        // Is it possible to use this collapsed register for free?
+        if (dv->isCollapsed()) {
+          // Restrict available domains to the ones in common with the operand.
+          // If there are no common domains, we must pay the cross-domain 
+          // penalty for this operand.
+          if (common) available = common;
+        } else if (common)
+          // Open DomainValue is compatible, save it for merging.
+          used.push_back(rx);
+        else
+          // Open DomainValue is not compatible with instruction. It is useless
+          // now.
+          kill(rx);
+      }
+    }
+
+  // If the collapsed operands force a single domain, propagate the collapse.
+  if (isPowerOf2_32(available)) {
+    unsigned domain = CountTrailingZeros_32(available);
+    TII->setExecutionDomain(mi, domain);
+    visitHardInstr(mi, domain);
+    return;
+  }
+
+  // Kill off any remaining uses that don't match available, and build a list of
+  // incoming DomainValues that we want to merge.
+  SmallVector<LiveReg, 4> Regs;
+  for (SmallVector<int, 4>::iterator i=used.begin(), e=used.end(); i!=e; ++i) {
+    int rx = *i;
+    const LiveReg &LR = LiveRegs[rx];
+    // This useless DomainValue could have been missed above.
+    if (!LR.Value->getCommonDomains(available)) {
+      kill(rx);
+      continue;
+    }
+    // Sorted insertion.
+    bool Inserted = false;
+    for (SmallVector<LiveReg, 4>::iterator i = Regs.begin(), e = Regs.end();
+           i != e && !Inserted; ++i) {
+      if (LR.Def < i->Def) {
+        Inserted = true;
+        Regs.insert(i, LR);
+      }
+    }
+    if (!Inserted)
+      Regs.push_back(LR);
+  }
+
+  // doms are now sorted in order of appearance. Try to merge them all, giving
+  // priority to the latest ones.
+  DomainValue *dv = 0;
+  while (!Regs.empty()) {
+    if (!dv) {
+      dv = Regs.pop_back_val().Value;
+      // Force the first dv to match the current instruction.
+      dv->AvailableDomains = dv->getCommonDomains(available);
+      assert(dv->AvailableDomains && "Domain should have been filtered");
+      continue;
+    }
+
+    DomainValue *Latest = Regs.pop_back_val().Value;
+    // Skip already merged values.
+    if (Latest == dv || Latest->Next)
+      continue;
+    if (merge(dv, Latest))
+      continue;
+
+    // If latest didn't merge, it is useless now. Kill all registers using it.
+    for (SmallVector<int,4>::iterator i=used.begin(), e=used.end(); i != e; ++i)
+      if (LiveRegs[*i].Value == Latest)
+        kill(*i);
+  }
+
+  // dv is the DomainValue we are going to use for this instruction.
+  if (!dv) {
+    dv = alloc();
+    dv->AvailableDomains = available;
+  }
+  dv->Instrs.push_back(mi);
+
+  // Finally set all defs and non-collapsed uses to dv. We must iterate through
+  // all the operators, including imp-def ones.
+  for (MachineInstr::mop_iterator ii = mi->operands_begin(),
+                                  ee = mi->operands_end();
+                                  ii != ee; ++ii) {
+    MachineOperand &mo = *ii;
+    if (!mo.isReg()) continue;
+    int rx = regIndex(mo.getReg());
+    if (rx < 0) continue;
+    if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) {
+      kill(rx);
+      setLiveReg(rx, dv);
+    }
+  }
+}
+
+bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  TII = MF->getTarget().getInstrInfo();
+  TRI = MF->getTarget().getRegisterInfo();
+  LiveRegs = 0;
+  assert(NumRegs == RC->getNumRegs() && "Bad regclass");
+
+  DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: "
+               << RC->getName() << " **********\n");
+
+  // If no relevant registers are used in the function, we can skip it
+  // completely.
+  bool anyregs = false;
+  for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end();
+       I != E; ++I)
+    if (MF->getRegInfo().isPhysRegUsed(*I)) {
+      anyregs = true;
+      break;
+    }
+  if (!anyregs) return false;
+
+  // Initialize the AliasMap on the first use.
+  if (AliasMap.empty()) {
+    // Given a PhysReg, AliasMap[PhysReg] is either the relevant index into RC,
+    // or -1.
+    AliasMap.resize(TRI->getNumRegs(), -1);
+    for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
+      for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true);
+           AI.isValid(); ++AI)
+        AliasMap[*AI] = i;
+  }
+
+  MachineBasicBlock *Entry = MF->begin();
+  ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry);
+  SmallVector<MachineBasicBlock*, 16> Loops;
+  for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
+         MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
+    MachineBasicBlock *MBB = *MBBI;
+    enterBasicBlock(MBB);
+    if (SeenUnknownBackEdge)
+      Loops.push_back(MBB);
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+        ++I)
+      visitInstr(I);
+    leaveBasicBlock(MBB);
+  }
+
+  // Visit all the loop blocks again in order to merge DomainValues from
+  // back-edges.
+  for (unsigned i = 0, e = Loops.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = Loops[i];
+    enterBasicBlock(MBB);
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+        ++I)
+      if (!I->isDebugValue())
+        processDefs(I, false);
+    leaveBasicBlock(MBB);
+  }
+
+  // Clear the LiveOuts vectors and collapse any remaining DomainValues.
+  for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
+         MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
+    LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI);
+    if (FI == LiveOuts.end() || !FI->second)
+      continue;
+    for (unsigned i = 0, e = NumRegs; i != e; ++i)
+      if (FI->second[i].Value)
+        release(FI->second[i].Value);
+    delete[] FI->second;
+  }
+  LiveOuts.clear();
+  Avail.clear();
+  Allocator.DestroyAll();
+
+  return false;
+}
+
+FunctionPass *
+llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) {
+  return new ExeDepsFix(RC);
+}
diff --git a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp
new file mode 100644
index 000000000000..b2b68828a226
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp
@@ -0,0 +1,74 @@
+//===-- llvm/CodeGen/ExpandISelPseudos.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand Pseudo-instructions produced by ISel. These are usually to allow
+// the expansion to contain control flow, such as a conditional move
+// implemented with a conditional branch and a phi, or an atomic operation
+// implemented with a loop.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "expand-isel-pseudos"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+namespace {
+  class ExpandISelPseudos : public MachineFunctionPass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    ExpandISelPseudos() : MachineFunctionPass(ID) {}
+
+  private:
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+} // end anonymous namespace
+
+char ExpandISelPseudos::ID = 0;
+char &llvm::ExpandISelPseudosID = ExpandISelPseudos::ID;
+INITIALIZE_PASS(ExpandISelPseudos, "expand-isel-pseudos",
+                "Expand ISel Pseudo-instructions", false, false)
+
+bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  const TargetLowering *TLI = MF.getTarget().getTargetLowering();
+
+  // Iterate through each instruction in the function, looking for pseudos.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I;
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end();
+         MBBI != MBBE; ) {
+      MachineInstr *MI = MBBI++;
+
+      // If MI is a pseudo, expand it.
+      if (MI->usesCustomInsertionHook()) {
+        Changed = true;
+        MachineBasicBlock *NewMBB =
+          TLI->EmitInstrWithCustomInserter(MI, MBB);
+        // The expansion may involve new basic blocks.
+        if (NewMBB != MBB) {
+          MBB = NewMBB;
+          I = NewMBB;
+          MBBI = NewMBB->begin();
+          MBBE = NewMBB->end();
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
new file mode 100644
index 000000000000..1611db8d91a3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -0,0 +1,225 @@
+//===-- ExpandPostRAPseudos.cpp - Pseudo instruction expansion pass -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that expands COPY and SUBREG_TO_REG pseudo
+// instructions after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "postrapseudos"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+namespace {
+struct ExpandPostRA : public MachineFunctionPass {
+private:
+  const TargetRegisterInfo *TRI;
+  const TargetInstrInfo *TII;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  ExpandPostRA() : MachineFunctionPass(ID) {}
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesCFG();
+    AU.addPreservedID(MachineLoopInfoID);
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// runOnMachineFunction - pass entry point
+  bool runOnMachineFunction(MachineFunction&);
+
+private:
+  bool LowerSubregToReg(MachineInstr *MI);
+  bool LowerCopy(MachineInstr *MI);
+
+  void TransferImplicitDefs(MachineInstr *MI);
+};
+} // end anonymous namespace
+
+char ExpandPostRA::ID = 0;
+char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID;
+
+INITIALIZE_PASS(ExpandPostRA, "postrapseudos",
+                "Post-RA pseudo instruction expansion pass", false, false)
+
+/// TransferImplicitDefs - MI is a pseudo-instruction, and the lowered
+/// replacement instructions immediately precede it.  Copy any implicit-def
+/// operands from MI to the replacement instruction.
+void
+ExpandPostRA::TransferImplicitDefs(MachineInstr *MI) {
+  MachineBasicBlock::iterator CopyMI = MI;
+  --CopyMI;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isImplicit() || MO.isUse())
+      continue;
+    CopyMI->addOperand(MachineOperand::CreateReg(MO.getReg(), true, true));
+  }
+}
+
+bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
+  MachineBasicBlock *MBB = MI->getParent();
+  assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) &&
+         MI->getOperand(1).isImm() &&
+         (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) &&
+          MI->getOperand(3).isImm() && "Invalid subreg_to_reg");
+
+  unsigned DstReg  = MI->getOperand(0).getReg();
+  unsigned InsReg  = MI->getOperand(2).getReg();
+  assert(!MI->getOperand(2).getSubReg() && "SubIdx on physreg?");
+  unsigned SubIdx  = MI->getOperand(3).getImm();
+
+  assert(SubIdx != 0 && "Invalid index for insert_subreg");
+  unsigned DstSubReg = TRI->getSubReg(DstReg, SubIdx);
+
+  assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+         "Insert destination must be in a physical register");
+  assert(TargetRegisterInfo::isPhysicalRegister(InsReg) &&
+         "Inserted value must be in a physical register");
+
+  DEBUG(dbgs() << "subreg: CONVERTING: " << *MI);
+
+  if (MI->allDefsAreDead()) {
+    MI->setDesc(TII->get(TargetOpcode::KILL));
+    DEBUG(dbgs() << "subreg: replaced by: " << *MI);
+    return true;
+  }
+
+  if (DstSubReg == InsReg) {
+    // No need to insert an identify copy instruction.
+    // Watch out for case like this:
+    // %RAX<def> = SUBREG_TO_REG 0, %EAX<kill>, 3
+    // We must leave %RAX live.
+    if (DstReg != InsReg) {
+      MI->setDesc(TII->get(TargetOpcode::KILL));
+      MI->RemoveOperand(3);     // SubIdx
+      MI->RemoveOperand(1);     // Imm
+      DEBUG(dbgs() << "subreg: replace by: " << *MI);
+      return true;
+    }
+    DEBUG(dbgs() << "subreg: eliminated!");
+  } else {
+    TII->copyPhysReg(*MBB, MI, MI->getDebugLoc(), DstSubReg, InsReg,
+                     MI->getOperand(2).isKill());
+
+    // Implicitly define DstReg for subsequent uses.
+    MachineBasicBlock::iterator CopyMI = MI;
+    --CopyMI;
+    CopyMI->addRegisterDefined(DstReg);
+    DEBUG(dbgs() << "subreg: " << *CopyMI);
+  }
+
+  DEBUG(dbgs() << '\n');
+  MBB->erase(MI);
+  return true;
+}
+
+bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
+
+  if (MI->allDefsAreDead()) {
+    DEBUG(dbgs() << "dead copy: " << *MI);
+    MI->setDesc(TII->get(TargetOpcode::KILL));
+    DEBUG(dbgs() << "replaced by: " << *MI);
+    return true;
+  }
+
+  MachineOperand &DstMO = MI->getOperand(0);
+  MachineOperand &SrcMO = MI->getOperand(1);
+
+  if (SrcMO.getReg() == DstMO.getReg()) {
+    DEBUG(dbgs() << "identity copy: " << *MI);
+    // No need to insert an identity copy instruction, but replace with a KILL
+    // if liveness is changed.
+    if (SrcMO.isUndef() || MI->getNumOperands() > 2) {
+      // We must make sure the super-register gets killed. Replace the
+      // instruction with KILL.
+      MI->setDesc(TII->get(TargetOpcode::KILL));
+      DEBUG(dbgs() << "replaced by:   " << *MI);
+      return true;
+    }
+    // Vanilla identity copy.
+    MI->eraseFromParent();
+    return true;
+  }
+
+  DEBUG(dbgs() << "real copy:   " << *MI);
+  TII->copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(),
+                   DstMO.getReg(), SrcMO.getReg(), SrcMO.isKill());
+
+  if (MI->getNumOperands() > 2)
+    TransferImplicitDefs(MI);
+  DEBUG({
+    MachineBasicBlock::iterator dMI = MI;
+    dbgs() << "replaced by: " << *(--dMI);
+  });
+  MI->eraseFromParent();
+  return true;
+}
+
+/// runOnMachineFunction - Reduce subregister inserts and extracts to register
+/// copies.
+///
+bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "Machine Function\n"
+               << "********** EXPANDING POST-RA PSEUDO INSTRS **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getTarget().getInstrInfo();
+
+  bool MadeChange = false;
+
+  for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end();
+       mbbi != mbbe; ++mbbi) {
+    for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
+         mi != me;) {
+      MachineInstr *MI = mi;
+      // Advance iterator here because MI may be erased.
+      ++mi;
+
+      // Only expand pseudos.
+      if (!MI->isPseudo())
+        continue;
+
+      // Give targets a chance to expand even standard pseudos.
+      if (TII->expandPostRAPseudo(MI)) {
+        MadeChange = true;
+        continue;
+      }
+
+      // Expand standard pseudos.
+      switch (MI->getOpcode()) {
+      case TargetOpcode::SUBREG_TO_REG:
+        MadeChange |= LowerSubregToReg(MI);
+        break;
+      case TargetOpcode::COPY:
+        MadeChange |= LowerCopy(MI);
+        break;
+      case TargetOpcode::DBG_VALUE:
+        continue;
+      case TargetOpcode::INSERT_SUBREG:
+      case TargetOpcode::EXTRACT_SUBREG:
+        llvm_unreachable("Sub-register pseudos should have been eliminated.");
+      }
+    }
+  }
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/CodeGen/GCMetadata.cpp b/contrib/llvm/lib/CodeGen/GCMetadata.cpp
new file mode 100644
index 000000000000..ef5247c2edff
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GCMetadata.cpp
@@ -0,0 +1,178 @@
+//===-- GCMetadata.cpp - Garbage collector metadata -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GCFunctionInfo class and GCModuleInfo pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+  
+  class Printer : public FunctionPass {
+    static char ID;
+    raw_ostream &OS;
+    
+  public:
+    explicit Printer(raw_ostream &OS) : FunctionPass(ID), OS(OS) {}
+
+    
+    const char *getPassName() const;
+    void getAnalysisUsage(AnalysisUsage &AU) const;
+    
+    bool runOnFunction(Function &F);
+    bool doFinalization(Module &M);
+  };
+
+}
+
+INITIALIZE_PASS(GCModuleInfo, "collector-metadata",
+                "Create Garbage Collector Module Metadata", false, false)
+
+// -----------------------------------------------------------------------------
+
+GCFunctionInfo::GCFunctionInfo(const Function &F, GCStrategy &S)
+  : F(F), S(S), FrameSize(~0LL) {}
+
+GCFunctionInfo::~GCFunctionInfo() {}
+
+// -----------------------------------------------------------------------------
+
+char GCModuleInfo::ID = 0;
+
+GCModuleInfo::GCModuleInfo()
+    : ImmutablePass(ID) {
+  initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
+}
+
+GCModuleInfo::~GCModuleInfo() {
+  clear();
+}
+
+GCStrategy *GCModuleInfo::getOrCreateStrategy(const Module *M,
+                                              const std::string &Name) {
+  strategy_map_type::iterator NMI = StrategyMap.find(Name);
+  if (NMI != StrategyMap.end())
+    return NMI->getValue();
+  
+  for (GCRegistry::iterator I = GCRegistry::begin(),
+                            E = GCRegistry::end(); I != E; ++I) {
+    if (Name == I->getName()) {
+      GCStrategy *S = I->instantiate();
+      S->M = M;
+      S->Name = Name;
+      StrategyMap.GetOrCreateValue(Name).setValue(S);
+      StrategyList.push_back(S);
+      return S;
+    }
+  }
+ 
+  dbgs() << "unsupported GC: " << Name << "\n";
+  llvm_unreachable(0);
+}
+
+GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) {
+  assert(!F.isDeclaration() && "Can only get GCFunctionInfo for a definition!");
+  assert(F.hasGC());
+  
+  finfo_map_type::iterator I = FInfoMap.find(&F);
+  if (I != FInfoMap.end())
+    return *I->second;
+  
+  GCStrategy *S = getOrCreateStrategy(F.getParent(), F.getGC());
+  GCFunctionInfo *GFI = S->insertFunctionInfo(F);
+  FInfoMap[&F] = GFI;
+  return *GFI;
+}
+
+void GCModuleInfo::clear() {
+  FInfoMap.clear();
+  StrategyMap.clear();
+  
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    delete *I;
+  StrategyList.clear();
+}
+
+// -----------------------------------------------------------------------------
+
+char Printer::ID = 0;
+
+FunctionPass *llvm::createGCInfoPrinter(raw_ostream &OS) {
+  return new Printer(OS);
+}
+
+
+const char *Printer::getPassName() const {
+  return "Print Garbage Collector Information";
+}
+
+void Printer::getAnalysisUsage(AnalysisUsage &AU) const {
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+  AU.addRequired<GCModuleInfo>();
+}
+
+static const char *DescKind(GC::PointKind Kind) {
+  switch (Kind) {
+    case GC::Loop:     return "loop";
+    case GC::Return:   return "return";
+    case GC::PreCall:  return "pre-call";
+    case GC::PostCall: return "post-call";
+  }
+  llvm_unreachable("Invalid point kind");
+}
+
+bool Printer::runOnFunction(Function &F) {
+  if (F.hasGC()) return false;
+  
+  GCFunctionInfo *FD = &getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+  
+  OS << "GC roots for " << FD->getFunction().getName() << ":\n";
+  for (GCFunctionInfo::roots_iterator RI = FD->roots_begin(),
+                                      RE = FD->roots_end(); RI != RE; ++RI)
+    OS << "\t" << RI->Num << "\t" << RI->StackOffset << "[sp]\n";
+  
+  OS << "GC safe points for " << FD->getFunction().getName() << ":\n";
+  for (GCFunctionInfo::iterator PI = FD->begin(),
+                                PE = FD->end(); PI != PE; ++PI) {
+    
+    OS << "\t" << PI->Label->getName() << ": "
+       << DescKind(PI->Kind) << ", live = {";
+    
+    for (GCFunctionInfo::live_iterator RI = FD->live_begin(PI),
+                                       RE = FD->live_end(PI);;) {
+      OS << " " << RI->Num;
+      if (++RI == RE)
+        break;
+      OS << ",";
+    }
+    
+    OS << " }\n";
+  }
+  
+  return false;
+}
+
+bool Printer::doFinalization(Module &M) {
+  GCModuleInfo *GMI = getAnalysisIfAvailable<GCModuleInfo>();
+  assert(GMI && "Printer didn't require GCModuleInfo?!");
+  GMI->clear();
+  return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp b/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp
new file mode 100644
index 000000000000..f80e9ced0bc9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp
@@ -0,0 +1,27 @@
+//===-- GCMetadataPrinter.cpp - Garbage collection infrastructure ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the abstract base class GCMetadataPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+using namespace llvm;
+
+GCMetadataPrinter::GCMetadataPrinter() { }
+
+GCMetadataPrinter::~GCMetadataPrinter() { }
+
+void GCMetadataPrinter::beginAssembly(AsmPrinter &AP) {
+  // Default is no action.
+}
+
+void GCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
+  // Default is no action.
+}
diff --git a/contrib/llvm/lib/CodeGen/GCStrategy.cpp b/contrib/llvm/lib/CodeGen/GCStrategy.cpp
new file mode 100644
index 000000000000..1173d1102125
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GCStrategy.cpp
@@ -0,0 +1,430 @@
+//===-- GCStrategy.cpp - Garbage collection infrastructure -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements target- and collector-independent garbage collection
+// infrastructure.
+//
+// GCMachineCodeAnalysis identifies the GC safe points in the machine code.
+// Roots are identified in SelectionDAGISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+  /// LowerIntrinsics - This pass rewrites calls to the llvm.gcread or
+  /// llvm.gcwrite intrinsics, replacing them with simple loads and stores as
+  /// directed by the GCStrategy. It also performs automatic root initialization
+  /// and custom intrinsic lowering.
+  class LowerIntrinsics : public FunctionPass {
+    static bool NeedsDefaultLoweringPass(const GCStrategy &C);
+    static bool NeedsCustomLoweringPass(const GCStrategy &C);
+    static bool CouldBecomeSafePoint(Instruction *I);
+    bool PerformDefaultLowering(Function &F, GCStrategy &Coll);
+    static bool InsertRootInitializers(Function &F,
+                                       AllocaInst **Roots, unsigned Count);
+
+  public:
+    static char ID;
+
+    LowerIntrinsics();
+    const char *getPassName() const;
+    void getAnalysisUsage(AnalysisUsage &AU) const;
+
+    bool doInitialization(Module &M);
+    bool runOnFunction(Function &F);
+  };
+
+
+  /// GCMachineCodeAnalysis - This is a target-independent pass over the machine
+  /// function representation to identify safe points for the garbage collector
+  /// in the machine code. It inserts labels at safe points and populates a
+  /// GCMetadata record for each function.
+  class GCMachineCodeAnalysis : public MachineFunctionPass {
+    const TargetMachine *TM;
+    GCFunctionInfo *FI;
+    MachineModuleInfo *MMI;
+    const TargetInstrInfo *TII;
+
+    void FindSafePoints(MachineFunction &MF);
+    void VisitCallPoint(MachineBasicBlock::iterator MI);
+    MCSymbol *InsertLabel(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          DebugLoc DL) const;
+
+    void FindStackOffsets(MachineFunction &MF);
+
+  public:
+    static char ID;
+
+    GCMachineCodeAnalysis();
+    void getAnalysisUsage(AnalysisUsage &AU) const;
+
+    bool runOnMachineFunction(MachineFunction &MF);
+  };
+
+}
+
+// -----------------------------------------------------------------------------
+
+GCStrategy::GCStrategy() :
+  NeededSafePoints(0),
+  CustomReadBarriers(false),
+  CustomWriteBarriers(false),
+  CustomRoots(false),
+  CustomSafePoints(false),
+  InitRoots(true),
+  UsesMetadata(false)
+{}
+
+GCStrategy::~GCStrategy() {
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    delete *I;
+
+  Functions.clear();
+}
+
+bool GCStrategy::initializeCustomLowering(Module &M) { return false; }
+
+bool GCStrategy::performCustomLowering(Function &F) {
+  dbgs() << "gc " << getName() << " must override performCustomLowering.\n";
+  llvm_unreachable("must override performCustomLowering");
+}
+
+
+bool GCStrategy::findCustomSafePoints(GCFunctionInfo& FI, MachineFunction &F) {
+  dbgs() << "gc " << getName() << " must override findCustomSafePoints.\n";
+  llvm_unreachable(0);
+}
+
+
+GCFunctionInfo *GCStrategy::insertFunctionInfo(const Function &F) {
+  GCFunctionInfo *FI = new GCFunctionInfo(F, *this);
+  Functions.push_back(FI);
+  return FI;
+}
+
+// -----------------------------------------------------------------------------
+
+INITIALIZE_PASS_BEGIN(LowerIntrinsics, "gc-lowering", "GC Lowering",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
+INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false)
+
+FunctionPass *llvm::createGCLoweringPass() {
+  return new LowerIntrinsics();
+}
+
+char LowerIntrinsics::ID = 0;
+
+LowerIntrinsics::LowerIntrinsics()
+  : FunctionPass(ID) {
+    initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry());
+  }
+
+const char *LowerIntrinsics::getPassName() const {
+  return "Lower Garbage Collection Instructions";
+}
+
+void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
+  FunctionPass::getAnalysisUsage(AU);
+  AU.addRequired<GCModuleInfo>();
+  AU.addPreserved<DominatorTree>();
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now.
+bool LowerIntrinsics::doInitialization(Module &M) {
+  // FIXME: This is rather antisocial in the context of a JIT since it performs
+  //        work against the entire module. But this cannot be done at
+  //        runFunction time (initializeCustomLowering likely needs to change
+  //        the module).
+  GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+  assert(MI && "LowerIntrinsics didn't require GCModuleInfo!?");
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!I->isDeclaration() && I->hasGC())
+      MI->getFunctionInfo(*I); // Instantiate the GC strategy.
+
+  bool MadeChange = false;
+  for (GCModuleInfo::iterator I = MI->begin(), E = MI->end(); I != E; ++I)
+    if (NeedsCustomLoweringPass(**I))
+      if ((*I)->initializeCustomLowering(M))
+        MadeChange = true;
+
+  return MadeChange;
+}
+
+bool LowerIntrinsics::InsertRootInitializers(Function &F, AllocaInst **Roots,
+                                                          unsigned Count) {
+  // Scroll past alloca instructions.
+  BasicBlock::iterator IP = F.getEntryBlock().begin();
+  while (isa<AllocaInst>(IP)) ++IP;
+
+  // Search for initializers in the initial BB.
+  SmallPtrSet<AllocaInst*,16> InitedRoots;
+  for (; !CouldBecomeSafePoint(IP); ++IP)
+    if (StoreInst *SI = dyn_cast<StoreInst>(IP))
+      if (AllocaInst *AI =
+          dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts()))
+        InitedRoots.insert(AI);
+
+  // Add root initializers.
+  bool MadeChange = false;
+
+  for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I)
+    if (!InitedRoots.count(*I)) {
+      StoreInst* SI = new StoreInst(ConstantPointerNull::get(cast<PointerType>(
+                        cast<PointerType>((*I)->getType())->getElementType())),
+                        *I);
+      SI->insertAfter(*I);
+      MadeChange = true;
+    }
+
+  return MadeChange;
+}
+
+bool LowerIntrinsics::NeedsDefaultLoweringPass(const GCStrategy &C) {
+  // Default lowering is necessary only if read or write barriers have a default
+  // action. The default for roots is no action.
+  return !C.customWriteBarrier()
+      || !C.customReadBarrier()
+      || C.initializeRoots();
+}
+
+bool LowerIntrinsics::NeedsCustomLoweringPass(const GCStrategy &C) {
+  // Custom lowering is only necessary if enabled for some action.
+  return C.customWriteBarrier()
+      || C.customReadBarrier()
+      || C.customRoots();
+}
+
+/// CouldBecomeSafePoint - Predicate to conservatively determine whether the
+/// instruction could introduce a safe point.
+bool LowerIntrinsics::CouldBecomeSafePoint(Instruction *I) {
+  // The natural definition of instructions which could introduce safe points
+  // are:
+  //
+  //   - call, invoke (AfterCall, BeforeCall)
+  //   - phis (Loops)
+  //   - invoke, ret, unwind (Exit)
+  //
+  // However, instructions as seemingly inoccuous as arithmetic can become
+  // libcalls upon lowering (e.g., div i64 on a 32-bit platform), so instead
+  // it is necessary to take a conservative approach.
+
+  if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) ||
+      isa<StoreInst>(I) || isa<LoadInst>(I))
+    return false;
+
+  // llvm.gcroot is safe because it doesn't do anything at runtime.
+  if (CallInst *CI = dyn_cast<CallInst>(I))
+    if (Function *F = CI->getCalledFunction())
+      if (unsigned IID = F->getIntrinsicID())
+        if (IID == Intrinsic::gcroot)
+          return false;
+
+  return true;
+}
+
+/// runOnFunction - Replace gcread/gcwrite intrinsics with loads and stores.
+/// Leave gcroot intrinsics; the code generator needs to see those.
+bool LowerIntrinsics::runOnFunction(Function &F) {
+  // Quick exit for functions that do not use GC.
+  if (!F.hasGC())
+    return false;
+
+  GCFunctionInfo &FI = getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+  GCStrategy &S = FI.getStrategy();
+
+  bool MadeChange = false;
+
+  if (NeedsDefaultLoweringPass(S))
+    MadeChange |= PerformDefaultLowering(F, S);
+
+  bool UseCustomLoweringPass = NeedsCustomLoweringPass(S);
+  if (UseCustomLoweringPass)
+    MadeChange |= S.performCustomLowering(F);
+
+  // Custom lowering may modify the CFG, so dominators must be recomputed.
+  if (UseCustomLoweringPass) {
+    if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>())
+      DT->DT->recalculate(F);
+  }
+
+  return MadeChange;
+}
+
+bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) {
+  bool LowerWr = !S.customWriteBarrier();
+  bool LowerRd = !S.customReadBarrier();
+  bool InitRoots = S.initializeRoots();
+
+  SmallVector<AllocaInst*, 32> Roots;
+
+  bool MadeChange = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) {
+        Function *F = CI->getCalledFunction();
+        switch (F->getIntrinsicID()) {
+        case Intrinsic::gcwrite:
+          if (LowerWr) {
+            // Replace a write barrier with a simple store.
+            Value *St = new StoreInst(CI->getArgOperand(0),
+                                      CI->getArgOperand(2), CI);
+            CI->replaceAllUsesWith(St);
+            CI->eraseFromParent();
+          }
+          break;
+        case Intrinsic::gcread:
+          if (LowerRd) {
+            // Replace a read barrier with a simple load.
+            Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI);
+            Ld->takeName(CI);
+            CI->replaceAllUsesWith(Ld);
+            CI->eraseFromParent();
+          }
+          break;
+        case Intrinsic::gcroot:
+          if (InitRoots) {
+            // Initialize the GC root, but do not delete the intrinsic. The
+            // backend needs the intrinsic to flag the stack slot.
+            Roots.push_back(cast<AllocaInst>(
+                              CI->getArgOperand(0)->stripPointerCasts()));
+          }
+          break;
+        default:
+          continue;
+        }
+
+        MadeChange = true;
+      }
+    }
+  }
+
+  if (Roots.size())
+    MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size());
+
+  return MadeChange;
+}
+
+// -----------------------------------------------------------------------------
+
+char GCMachineCodeAnalysis::ID = 0;
+char &llvm::GCMachineCodeAnalysisID = GCMachineCodeAnalysis::ID;
+
+INITIALIZE_PASS(GCMachineCodeAnalysis, "gc-analysis",
+                "Analyze Machine Code For Garbage Collection", false, false)
+
+GCMachineCodeAnalysis::GCMachineCodeAnalysis()
+  : MachineFunctionPass(ID) {}
+
+void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  MachineFunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+  AU.addRequired<MachineModuleInfo>();
+  AU.addRequired<GCModuleInfo>();
+}
+
+MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MI,
+                                             DebugLoc DL) const {
+  MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
+  BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
+  return Label;
+}
+
+void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
+  // Find the return address (next instruction), too, so as to bracket the call
+  // instruction.
+  MachineBasicBlock::iterator RAI = CI;
+  ++RAI;
+
+  if (FI->getStrategy().needsSafePoint(GC::PreCall)) {
+    MCSymbol* Label = InsertLabel(*CI->getParent(), CI, CI->getDebugLoc());
+    FI->addSafePoint(GC::PreCall, Label, CI->getDebugLoc());
+  }
+
+  if (FI->getStrategy().needsSafePoint(GC::PostCall)) {
+    MCSymbol* Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc());
+    FI->addSafePoint(GC::PostCall, Label, CI->getDebugLoc());
+  }
+}
+
+void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
+  for (MachineFunction::iterator BBI = MF.begin(),
+                                 BBE = MF.end(); BBI != BBE; ++BBI)
+    for (MachineBasicBlock::iterator MI = BBI->begin(),
+                                     ME = BBI->end(); MI != ME; ++MI)
+      if (MI->isCall())
+        VisitCallPoint(MI);
+}
+
+void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
+  const TargetFrameLowering *TFI = TM->getFrameLowering();
+  assert(TFI && "TargetRegisterInfo not available!");
+
+  for (GCFunctionInfo::roots_iterator RI = FI->roots_begin();
+       RI != FI->roots_end();) {
+    // If the root references a dead object, no need to keep it.
+    if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) {
+      RI = FI->removeStackRoot(RI);
+    } else {
+      RI->StackOffset = TFI->getFrameIndexOffset(MF, RI->Num);
+      ++RI;
+    }
+  }
+}
+
+bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
+  // Quick exit for functions that do not use GC.
+  if (!MF.getFunction()->hasGC())
+    return false;
+
+  FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(*MF.getFunction());
+  if (!FI->getStrategy().needsSafePoints())
+    return false;
+
+  TM = &MF.getTarget();
+  MMI = &getAnalysis<MachineModuleInfo>();
+  TII = TM->getInstrInfo();
+
+  // Find the size of the stack frame.
+  FI->setFrameSize(MF.getFrameInfo()->getStackSize());
+
+  // Find all safe points.
+  if (FI->getStrategy().customSafePoints()) {
+    FI->getStrategy().findCustomSafePoints(*FI, MF);
+  } else {
+    FindSafePoints(MF);
+  }
+
+  // Find the stack offsets for all roots.
+  FindStackOffsets(MF);
+
+  return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/IfConversion.cpp b/contrib/llvm/lib/CodeGen/IfConversion.cpp
new file mode 100644
index 000000000000..9958d7daada8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/IfConversion.cpp
@@ -0,0 +1,1583 @@
+//===-- IfConversion.cpp - Machine code if conversion pass. ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level if-conversion pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ifcvt"
+#include "llvm/CodeGen/Passes.h"
+#include "BranchFolding.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+// Hidden options for help debugging.
+static cl::opt<int> IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden);
+static cl::opt<int> IfCvtFnStop("ifcvt-fn-stop", cl::init(-1), cl::Hidden);
+static cl::opt<int> IfCvtLimit("ifcvt-limit", cl::init(-1), cl::Hidden);
+static cl::opt<bool> DisableSimple("disable-ifcvt-simple",
+                                   cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableSimpleF("disable-ifcvt-simple-false",
+                                    cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangle("disable-ifcvt-triangle",
+                                     cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleR("disable-ifcvt-triangle-rev",
+                                      cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleF("disable-ifcvt-triangle-false",
+                                      cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleFR("disable-ifcvt-triangle-false-rev",
+                                       cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableDiamond("disable-ifcvt-diamond",
+                                    cl::init(false), cl::Hidden);
+static cl::opt<bool> IfCvtBranchFold("ifcvt-branch-fold",
+                                     cl::init(true), cl::Hidden);
+
+STATISTIC(NumSimple,       "Number of simple if-conversions performed");
+STATISTIC(NumSimpleFalse,  "Number of simple (F) if-conversions performed");
+STATISTIC(NumTriangle,     "Number of triangle if-conversions performed");
+STATISTIC(NumTriangleRev,  "Number of triangle (R) if-conversions performed");
+STATISTIC(NumTriangleFalse,"Number of triangle (F) if-conversions performed");
+STATISTIC(NumTriangleFRev, "Number of triangle (F/R) if-conversions performed");
+STATISTIC(NumDiamonds,     "Number of diamond if-conversions performed");
+STATISTIC(NumIfConvBBs,    "Number of if-converted blocks");
+STATISTIC(NumDupBBs,       "Number of duplicated blocks");
+STATISTIC(NumUnpred,       "Number of true blocks of diamonds unpredicated");
+
+namespace {
+  class IfConverter : public MachineFunctionPass {
+    enum IfcvtKind {
+      ICNotClassfied,  // BB data valid, but not classified.
+      ICSimpleFalse,   // Same as ICSimple, but on the false path.
+      ICSimple,        // BB is entry of an one split, no rejoin sub-CFG.
+      ICTriangleFRev,  // Same as ICTriangleFalse, but false path rev condition.
+      ICTriangleRev,   // Same as ICTriangle, but true path rev condition.
+      ICTriangleFalse, // Same as ICTriangle, but on the false path.
+      ICTriangle,      // BB is entry of a triangle sub-CFG.
+      ICDiamond        // BB is entry of a diamond sub-CFG.
+    };
+
+    /// BBInfo - One per MachineBasicBlock, this is used to cache the result
+    /// if-conversion feasibility analysis. This includes results from
+    /// TargetInstrInfo::AnalyzeBranch() (i.e. TBB, FBB, and Cond), and its
+    /// classification, and common tail block of its successors (if it's a
+    /// diamond shape), its size, whether it's predicable, and whether any
+    /// instruction can clobber the 'would-be' predicate.
+    ///
+    /// IsDone          - True if BB is not to be considered for ifcvt.
+    /// IsBeingAnalyzed - True if BB is currently being analyzed.
+    /// IsAnalyzed      - True if BB has been analyzed (info is still valid).
+    /// IsEnqueued      - True if BB has been enqueued to be ifcvt'ed.
+    /// IsBrAnalyzable  - True if AnalyzeBranch() returns false.
+    /// HasFallThrough  - True if BB may fallthrough to the following BB.
+    /// IsUnpredicable  - True if BB is known to be unpredicable.
+    /// ClobbersPred    - True if BB could modify predicates (e.g. has
+    ///                   cmp, call, etc.)
+    /// NonPredSize     - Number of non-predicated instructions.
+    /// ExtraCost       - Extra cost for multi-cycle instructions.
+    /// ExtraCost2      - Some instructions are slower when predicated
+    /// BB              - Corresponding MachineBasicBlock.
+    /// TrueBB / FalseBB- See AnalyzeBranch().
+    /// BrCond          - Conditions for end of block conditional branches.
+    /// Predicate       - Predicate used in the BB.
+    struct BBInfo {
+      bool IsDone          : 1;
+      bool IsBeingAnalyzed : 1;
+      bool IsAnalyzed      : 1;
+      bool IsEnqueued      : 1;
+      bool IsBrAnalyzable  : 1;
+      bool HasFallThrough  : 1;
+      bool IsUnpredicable  : 1;
+      bool CannotBeCopied  : 1;
+      bool ClobbersPred    : 1;
+      unsigned NonPredSize;
+      unsigned ExtraCost;
+      unsigned ExtraCost2;
+      MachineBasicBlock *BB;
+      MachineBasicBlock *TrueBB;
+      MachineBasicBlock *FalseBB;
+      SmallVector<MachineOperand, 4> BrCond;
+      SmallVector<MachineOperand, 4> Predicate;
+      BBInfo() : IsDone(false), IsBeingAnalyzed(false),
+                 IsAnalyzed(false), IsEnqueued(false), IsBrAnalyzable(false),
+                 HasFallThrough(false), IsUnpredicable(false),
+                 CannotBeCopied(false), ClobbersPred(false), NonPredSize(0),
+                 ExtraCost(0), ExtraCost2(0), BB(0), TrueBB(0), FalseBB(0) {}
+    };
+
+    /// IfcvtToken - Record information about pending if-conversions to attempt:
+    /// BBI             - Corresponding BBInfo.
+    /// Kind            - Type of block. See IfcvtKind.
+    /// NeedSubsumption - True if the to-be-predicated BB has already been
+    ///                   predicated.
+    /// NumDups      - Number of instructions that would be duplicated due
+    ///                   to this if-conversion. (For diamonds, the number of
+    ///                   identical instructions at the beginnings of both
+    ///                   paths).
+    /// NumDups2     - For diamonds, the number of identical instructions
+    ///                   at the ends of both paths.
+    struct IfcvtToken {
+      BBInfo &BBI;
+      IfcvtKind Kind;
+      bool NeedSubsumption;
+      unsigned NumDups;
+      unsigned NumDups2;
+      IfcvtToken(BBInfo &b, IfcvtKind k, bool s, unsigned d, unsigned d2 = 0)
+        : BBI(b), Kind(k), NeedSubsumption(s), NumDups(d), NumDups2(d2) {}
+    };
+
+    /// BBAnalysis - Results of if-conversion feasibility analysis indexed by
+    /// basic block number.
+    std::vector<BBInfo> BBAnalysis;
+
+    const TargetLoweringBase *TLI;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const InstrItineraryData *InstrItins;
+    const MachineBranchProbabilityInfo *MBPI;
+    MachineRegisterInfo *MRI;
+
+    bool PreRegAlloc;
+    bool MadeChange;
+    int FnNum;
+  public:
+    static char ID;
+    IfConverter() : MachineFunctionPass(ID), FnNum(-1) {
+      initializeIfConverterPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  private:
+    bool ReverseBranchCondition(BBInfo &BBI);
+    bool ValidSimple(BBInfo &TrueBBI, unsigned &Dups,
+                     const BranchProbability &Prediction) const;
+    bool ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
+                       bool FalseBranch, unsigned &Dups,
+                       const BranchProbability &Prediction) const;
+    bool ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
+                      unsigned &Dups1, unsigned &Dups2) const;
+    void ScanInstructions(BBInfo &BBI);
+    BBInfo &AnalyzeBlock(MachineBasicBlock *BB,
+                         std::vector<IfcvtToken*> &Tokens);
+    bool FeasibilityAnalysis(BBInfo &BBI, SmallVectorImpl<MachineOperand> &Cond,
+                             bool isTriangle = false, bool RevBranch = false);
+    void AnalyzeBlocks(MachineFunction &MF, std::vector<IfcvtToken*> &Tokens);
+    void InvalidatePreds(MachineBasicBlock *BB);
+    void RemoveExtraEdges(BBInfo &BBI);
+    bool IfConvertSimple(BBInfo &BBI, IfcvtKind Kind);
+    bool IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind);
+    bool IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
+                          unsigned NumDups1, unsigned NumDups2);
+    void PredicateBlock(BBInfo &BBI,
+                        MachineBasicBlock::iterator E,
+                        SmallVectorImpl<MachineOperand> &Cond,
+                        SmallSet<unsigned, 4> &Redefs,
+                        SmallSet<unsigned, 4> *LaterRedefs = 0);
+    void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
+                               SmallVectorImpl<MachineOperand> &Cond,
+                               SmallSet<unsigned, 4> &Redefs,
+                               bool IgnoreBr = false);
+    void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true);
+
+    bool MeetIfcvtSizeLimit(MachineBasicBlock &BB,
+                            unsigned Cycle, unsigned Extra,
+                            const BranchProbability &Prediction) const {
+      return Cycle > 0 && TII->isProfitableToIfCvt(BB, Cycle, Extra,
+                                                   Prediction);
+    }
+
+    bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB,
+                            unsigned TCycle, unsigned TExtra,
+                            MachineBasicBlock &FBB,
+                            unsigned FCycle, unsigned FExtra,
+                            const BranchProbability &Prediction) const {
+      return TCycle > 0 && FCycle > 0 &&
+        TII->isProfitableToIfCvt(TBB, TCycle, TExtra, FBB, FCycle, FExtra,
+                                 Prediction);
+    }
+
+    // blockAlwaysFallThrough - Block ends without a terminator.
+    bool blockAlwaysFallThrough(BBInfo &BBI) const {
+      return BBI.IsBrAnalyzable && BBI.TrueBB == NULL;
+    }
+
+    // IfcvtTokenCmp - Used to sort if-conversion candidates.
+    static bool IfcvtTokenCmp(IfcvtToken *C1, IfcvtToken *C2) {
+      int Incr1 = (C1->Kind == ICDiamond)
+        ? -(int)(C1->NumDups + C1->NumDups2) : (int)C1->NumDups;
+      int Incr2 = (C2->Kind == ICDiamond)
+        ? -(int)(C2->NumDups + C2->NumDups2) : (int)C2->NumDups;
+      if (Incr1 > Incr2)
+        return true;
+      else if (Incr1 == Incr2) {
+        // Favors subsumption.
+        if (C1->NeedSubsumption == false && C2->NeedSubsumption == true)
+          return true;
+        else if (C1->NeedSubsumption == C2->NeedSubsumption) {
+          // Favors diamond over triangle, etc.
+          if ((unsigned)C1->Kind < (unsigned)C2->Kind)
+            return true;
+          else if (C1->Kind == C2->Kind)
+            return C1->BBI.BB->getNumber() < C2->BBI.BB->getNumber();
+        }
+      }
+      return false;
+    }
+  };
+
+  char IfConverter::ID = 0;
+}
+
+char &llvm::IfConverterID = IfConverter::ID;
+
+INITIALIZE_PASS_BEGIN(IfConverter, "if-converter", "If Converter", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false)
+
+bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
+  TLI = MF.getTarget().getTargetLowering();
+  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+  MRI = &MF.getRegInfo();
+  InstrItins = MF.getTarget().getInstrItineraryData();
+  if (!TII) return false;
+
+  PreRegAlloc = MRI->isSSA();
+
+  bool BFChange = false;
+  if (!PreRegAlloc) {
+    // Tail merge tend to expose more if-conversion opportunities.
+    BranchFolder BF(true, false);
+    BFChange = BF.OptimizeFunction(MF, TII,
+                                   MF.getTarget().getRegisterInfo(),
+                                   getAnalysisIfAvailable<MachineModuleInfo>());
+  }
+
+  DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum <<  ") \'"
+               << MF.getName() << "\'");
+
+  if (FnNum < IfCvtFnStart || (IfCvtFnStop != -1 && FnNum > IfCvtFnStop)) {
+    DEBUG(dbgs() << " skipped\n");
+    return false;
+  }
+  DEBUG(dbgs() << "\n");
+
+  MF.RenumberBlocks();
+  BBAnalysis.resize(MF.getNumBlockIDs());
+
+  std::vector<IfcvtToken*> Tokens;
+  MadeChange = false;
+  unsigned NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle +
+    NumTriangleRev + NumTriangleFalse + NumTriangleFRev + NumDiamonds;
+  while (IfCvtLimit == -1 || (int)NumIfCvts < IfCvtLimit) {
+    // Do an initial analysis for each basic block and find all the potential
+    // candidates to perform if-conversion.
+    bool Change = false;
+    AnalyzeBlocks(MF, Tokens);
+    while (!Tokens.empty()) {
+      IfcvtToken *Token = Tokens.back();
+      Tokens.pop_back();
+      BBInfo &BBI = Token->BBI;
+      IfcvtKind Kind = Token->Kind;
+      unsigned NumDups = Token->NumDups;
+      unsigned NumDups2 = Token->NumDups2;
+
+      delete Token;
+
+      // If the block has been evicted out of the queue or it has already been
+      // marked dead (due to it being predicated), then skip it.
+      if (BBI.IsDone)
+        BBI.IsEnqueued = false;
+      if (!BBI.IsEnqueued)
+        continue;
+
+      BBI.IsEnqueued = false;
+
+      bool RetVal = false;
+      switch (Kind) {
+      default: llvm_unreachable("Unexpected!");
+      case ICSimple:
+      case ICSimpleFalse: {
+        bool isFalse = Kind == ICSimpleFalse;
+        if ((isFalse && DisableSimpleF) || (!isFalse && DisableSimple)) break;
+        DEBUG(dbgs() << "Ifcvt (Simple" << (Kind == ICSimpleFalse ?
+                                            " false" : "")
+                     << "): BB#" << BBI.BB->getNumber() << " ("
+                     << ((Kind == ICSimpleFalse)
+                         ? BBI.FalseBB->getNumber()
+                         : BBI.TrueBB->getNumber()) << ") ");
+        RetVal = IfConvertSimple(BBI, Kind);
+        DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+        if (RetVal) {
+          if (isFalse) ++NumSimpleFalse;
+          else         ++NumSimple;
+        }
+       break;
+      }
+      case ICTriangle:
+      case ICTriangleRev:
+      case ICTriangleFalse:
+      case ICTriangleFRev: {
+        bool isFalse = Kind == ICTriangleFalse;
+        bool isRev   = (Kind == ICTriangleRev || Kind == ICTriangleFRev);
+        if (DisableTriangle && !isFalse && !isRev) break;
+        if (DisableTriangleR && !isFalse && isRev) break;
+        if (DisableTriangleF && isFalse && !isRev) break;
+        if (DisableTriangleFR && isFalse && isRev) break;
+        DEBUG(dbgs() << "Ifcvt (Triangle");
+        if (isFalse)
+          DEBUG(dbgs() << " false");
+        if (isRev)
+          DEBUG(dbgs() << " rev");
+        DEBUG(dbgs() << "): BB#" << BBI.BB->getNumber() << " (T:"
+                     << BBI.TrueBB->getNumber() << ",F:"
+                     << BBI.FalseBB->getNumber() << ") ");
+        RetVal = IfConvertTriangle(BBI, Kind);
+        DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+        if (RetVal) {
+          if (isFalse) {
+            if (isRev) ++NumTriangleFRev;
+            else       ++NumTriangleFalse;
+          } else {
+            if (isRev) ++NumTriangleRev;
+            else       ++NumTriangle;
+          }
+        }
+        break;
+      }
+      case ICDiamond: {
+        if (DisableDiamond) break;
+        DEBUG(dbgs() << "Ifcvt (Diamond): BB#" << BBI.BB->getNumber() << " (T:"
+                     << BBI.TrueBB->getNumber() << ",F:"
+                     << BBI.FalseBB->getNumber() << ") ");
+        RetVal = IfConvertDiamond(BBI, Kind, NumDups, NumDups2);
+        DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+        if (RetVal) ++NumDiamonds;
+        break;
+      }
+      }
+
+      Change |= RetVal;
+
+      NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle + NumTriangleRev +
+        NumTriangleFalse + NumTriangleFRev + NumDiamonds;
+      if (IfCvtLimit != -1 && (int)NumIfCvts >= IfCvtLimit)
+        break;
+    }
+
+    if (!Change)
+      break;
+    MadeChange |= Change;
+  }
+
+  // Delete tokens in case of early exit.
+  while (!Tokens.empty()) {
+    IfcvtToken *Token = Tokens.back();
+    Tokens.pop_back();
+    delete Token;
+  }
+
+  Tokens.clear();
+  BBAnalysis.clear();
+
+  if (MadeChange && IfCvtBranchFold) {
+    BranchFolder BF(false, false);
+    BF.OptimizeFunction(MF, TII,
+                        MF.getTarget().getRegisterInfo(),
+                        getAnalysisIfAvailable<MachineModuleInfo>());
+  }
+
+  MadeChange |= BFChange;
+  return MadeChange;
+}
+
+/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
+/// its 'true' successor.
+static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
+                                         MachineBasicBlock *TrueBB) {
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         E = BB->succ_end(); SI != E; ++SI) {
+    MachineBasicBlock *SuccBB = *SI;
+    if (SuccBB != TrueBB)
+      return SuccBB;
+  }
+  return NULL;
+}
+
+/// ReverseBranchCondition - Reverse the condition of the end of the block
+/// branch. Swap block's 'true' and 'false' successors.
+bool IfConverter::ReverseBranchCondition(BBInfo &BBI) {
+  DebugLoc dl;  // FIXME: this is nowhere
+  if (!TII->ReverseBranchCondition(BBI.BrCond)) {
+    TII->RemoveBranch(*BBI.BB);
+    TII->InsertBranch(*BBI.BB, BBI.FalseBB, BBI.TrueBB, BBI.BrCond, dl);
+    std::swap(BBI.TrueBB, BBI.FalseBB);
+    return true;
+  }
+  return false;
+}
+
+/// getNextBlock - Returns the next block in the function blocks ordering. If
+/// it is the end, returns NULL.
+static inline MachineBasicBlock *getNextBlock(MachineBasicBlock *BB) {
+  MachineFunction::iterator I = BB;
+  MachineFunction::iterator E = BB->getParent()->end();
+  if (++I == E)
+    return NULL;
+  return I;
+}
+
+/// ValidSimple - Returns true if the 'true' block (along with its
+/// predecessor) forms a valid simple shape for ifcvt. It also returns the
+/// number of instructions that the ifcvt would need to duplicate if performed
+/// in Dups.
+bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups,
+                              const BranchProbability &Prediction) const {
+  Dups = 0;
+  if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone)
+    return false;
+
+  if (TrueBBI.IsBrAnalyzable)
+    return false;
+
+  if (TrueBBI.BB->pred_size() > 1) {
+    if (TrueBBI.CannotBeCopied ||
+        !TII->isProfitableToDupForIfCvt(*TrueBBI.BB, TrueBBI.NonPredSize,
+                                        Prediction))
+      return false;
+    Dups = TrueBBI.NonPredSize;
+  }
+
+  return true;
+}
+
+/// ValidTriangle - Returns true if the 'true' and 'false' blocks (along
+/// with their common predecessor) forms a valid triangle shape for ifcvt.
+/// If 'FalseBranch' is true, it checks if 'true' block's false branch
+/// branches to the 'false' block rather than the other way around. It also
+/// returns the number of instructions that the ifcvt would need to duplicate
+/// if performed in 'Dups'.
+bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
+                                bool FalseBranch, unsigned &Dups,
+                                const BranchProbability &Prediction) const {
+  Dups = 0;
+  if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone)
+    return false;
+
+  if (TrueBBI.BB->pred_size() > 1) {
+    if (TrueBBI.CannotBeCopied)
+      return false;
+
+    unsigned Size = TrueBBI.NonPredSize;
+    if (TrueBBI.IsBrAnalyzable) {
+      if (TrueBBI.TrueBB && TrueBBI.BrCond.empty())
+        // Ends with an unconditional branch. It will be removed.
+        --Size;
+      else {
+        MachineBasicBlock *FExit = FalseBranch
+          ? TrueBBI.TrueBB : TrueBBI.FalseBB;
+        if (FExit)
+          // Require a conditional branch
+          ++Size;
+      }
+    }
+    if (!TII->isProfitableToDupForIfCvt(*TrueBBI.BB, Size, Prediction))
+      return false;
+    Dups = Size;
+  }
+
+  MachineBasicBlock *TExit = FalseBranch ? TrueBBI.FalseBB : TrueBBI.TrueBB;
+  if (!TExit && blockAlwaysFallThrough(TrueBBI)) {
+    MachineFunction::iterator I = TrueBBI.BB;
+    if (++I == TrueBBI.BB->getParent()->end())
+      return false;
+    TExit = I;
+  }
+  return TExit && TExit == FalseBBI.BB;
+}
+
+/// ValidDiamond - Returns true if the 'true' and 'false' blocks (along
+/// with their common predecessor) forms a valid diamond shape for ifcvt.
+bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
+                               unsigned &Dups1, unsigned &Dups2) const {
+  Dups1 = Dups2 = 0;
+  if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone ||
+      FalseBBI.IsBeingAnalyzed || FalseBBI.IsDone)
+    return false;
+
+  MachineBasicBlock *TT = TrueBBI.TrueBB;
+  MachineBasicBlock *FT = FalseBBI.TrueBB;
+
+  if (!TT && blockAlwaysFallThrough(TrueBBI))
+    TT = getNextBlock(TrueBBI.BB);
+  if (!FT && blockAlwaysFallThrough(FalseBBI))
+    FT = getNextBlock(FalseBBI.BB);
+  if (TT != FT)
+    return false;
+  if (TT == NULL && (TrueBBI.IsBrAnalyzable || FalseBBI.IsBrAnalyzable))
+    return false;
+  if  (TrueBBI.BB->pred_size() > 1 || FalseBBI.BB->pred_size() > 1)
+    return false;
+
+  // FIXME: Allow true block to have an early exit?
+  if (TrueBBI.FalseBB || FalseBBI.FalseBB ||
+      (TrueBBI.ClobbersPred && FalseBBI.ClobbersPred))
+    return false;
+
+  // Count duplicate instructions at the beginning of the true and false blocks.
+  MachineBasicBlock::iterator TIB = TrueBBI.BB->begin();
+  MachineBasicBlock::iterator FIB = FalseBBI.BB->begin();
+  MachineBasicBlock::iterator TIE = TrueBBI.BB->end();
+  MachineBasicBlock::iterator FIE = FalseBBI.BB->end();
+  while (TIB != TIE && FIB != FIE) {
+    // Skip dbg_value instructions. These do not count.
+    if (TIB->isDebugValue()) {
+      while (TIB != TIE && TIB->isDebugValue())
+        ++TIB;
+      if (TIB == TIE)
+        break;
+    }
+    if (FIB->isDebugValue()) {
+      while (FIB != FIE && FIB->isDebugValue())
+        ++FIB;
+      if (FIB == FIE)
+        break;
+    }
+    if (!TIB->isIdenticalTo(FIB))
+      break;
+    ++Dups1;
+    ++TIB;
+    ++FIB;
+  }
+
+  // Now, in preparation for counting duplicate instructions at the ends of the
+  // blocks, move the end iterators up past any branch instructions.
+  while (TIE != TIB) {
+    --TIE;
+    if (!TIE->isBranch())
+      break;
+  }
+  while (FIE != FIB) {
+    --FIE;
+    if (!FIE->isBranch())
+      break;
+  }
+
+  // If Dups1 includes all of a block, then don't count duplicate
+  // instructions at the end of the blocks.
+  if (TIB == TIE || FIB == FIE)
+    return true;
+
+  // Count duplicate instructions at the ends of the blocks.
+  while (TIE != TIB && FIE != FIB) {
+    // Skip dbg_value instructions. These do not count.
+    if (TIE->isDebugValue()) {
+      while (TIE != TIB && TIE->isDebugValue())
+        --TIE;
+      if (TIE == TIB)
+        break;
+    }
+    if (FIE->isDebugValue()) {
+      while (FIE != FIB && FIE->isDebugValue())
+        --FIE;
+      if (FIE == FIB)
+        break;
+    }
+    if (!TIE->isIdenticalTo(FIE))
+      break;
+    ++Dups2;
+    --TIE;
+    --FIE;
+  }
+
+  return true;
+}
+
+/// ScanInstructions - Scan all the instructions in the block to determine if
+/// the block is predicable. In most cases, that means all the instructions
+/// in the block are isPredicable(). Also checks if the block contains any
+/// instruction which can clobber a predicate (e.g. condition code register).
+/// If so, the block is not predicable unless it's the last instruction.
+void IfConverter::ScanInstructions(BBInfo &BBI) {
+  if (BBI.IsDone)
+    return;
+
+  bool AlreadyPredicated = !BBI.Predicate.empty();
+  // First analyze the end of BB branches.
+  BBI.TrueBB = BBI.FalseBB = NULL;
+  BBI.BrCond.clear();
+  BBI.IsBrAnalyzable =
+    !TII->AnalyzeBranch(*BBI.BB, BBI.TrueBB, BBI.FalseBB, BBI.BrCond);
+  BBI.HasFallThrough = BBI.IsBrAnalyzable && BBI.FalseBB == NULL;
+
+  if (BBI.BrCond.size()) {
+    // No false branch. This BB must end with a conditional branch and a
+    // fallthrough.
+    if (!BBI.FalseBB)
+      BBI.FalseBB = findFalseBlock(BBI.BB, BBI.TrueBB);
+    if (!BBI.FalseBB) {
+      // Malformed bcc? True and false blocks are the same?
+      BBI.IsUnpredicable = true;
+      return;
+    }
+  }
+
+  // Then scan all the instructions.
+  BBI.NonPredSize = 0;
+  BBI.ExtraCost = 0;
+  BBI.ExtraCost2 = 0;
+  BBI.ClobbersPred = false;
+  for (MachineBasicBlock::iterator I = BBI.BB->begin(), E = BBI.BB->end();
+       I != E; ++I) {
+    if (I->isDebugValue())
+      continue;
+
+    if (I->isNotDuplicable())
+      BBI.CannotBeCopied = true;
+
+    bool isPredicated = TII->isPredicated(I);
+    bool isCondBr = BBI.IsBrAnalyzable && I->isConditionalBranch();
+
+    if (!isCondBr) {
+      if (!isPredicated) {
+        BBI.NonPredSize++;
+        unsigned ExtraPredCost = 0;
+        unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I,
+                                                  &ExtraPredCost);
+        if (NumCycles > 1)
+          BBI.ExtraCost += NumCycles-1;
+        BBI.ExtraCost2 += ExtraPredCost;
+      } else if (!AlreadyPredicated) {
+        // FIXME: This instruction is already predicated before the
+        // if-conversion pass. It's probably something like a conditional move.
+        // Mark this block unpredicable for now.
+        BBI.IsUnpredicable = true;
+        return;
+      }
+    }
+
+    if (BBI.ClobbersPred && !isPredicated) {
+      // Predicate modification instruction should end the block (except for
+      // already predicated instructions and end of block branches).
+      if (isCondBr) {
+        // A conditional branch is not predicable, but it may be eliminated.
+        continue;
+      }
+
+      // Predicate may have been modified, the subsequent (currently)
+      // unpredicated instructions cannot be correctly predicated.
+      BBI.IsUnpredicable = true;
+      return;
+    }
+
+    // FIXME: Make use of PredDefs? e.g. ADDC, SUBC sets predicates but are
+    // still potentially predicable.
+    std::vector<MachineOperand> PredDefs;
+    if (TII->DefinesPredicate(I, PredDefs))
+      BBI.ClobbersPred = true;
+
+    if (!TII->isPredicable(I)) {
+      BBI.IsUnpredicable = true;
+      return;
+    }
+  }
+}
+
+/// FeasibilityAnalysis - Determine if the block is a suitable candidate to be
+/// predicated by the specified predicate.
+bool IfConverter::FeasibilityAnalysis(BBInfo &BBI,
+                                      SmallVectorImpl<MachineOperand> &Pred,
+                                      bool isTriangle, bool RevBranch) {
+  // If the block is dead or unpredicable, then it cannot be predicated.
+  if (BBI.IsDone || BBI.IsUnpredicable)
+    return false;
+
+  // If it is already predicated, check if its predicate subsumes the new
+  // predicate.
+  if (BBI.Predicate.size() && !TII->SubsumesPredicate(BBI.Predicate, Pred))
+    return false;
+
+  if (BBI.BrCond.size()) {
+    if (!isTriangle)
+      return false;
+
+    // Test predicate subsumption.
+    SmallVector<MachineOperand, 4> RevPred(Pred.begin(), Pred.end());
+    SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+    if (RevBranch) {
+      if (TII->ReverseBranchCondition(Cond))
+        return false;
+    }
+    if (TII->ReverseBranchCondition(RevPred) ||
+        !TII->SubsumesPredicate(Cond, RevPred))
+      return false;
+  }
+
+  return true;
+}
+
+/// AnalyzeBlock - Analyze the structure of the sub-CFG starting from
+/// the specified block. Record its successors and whether it looks like an
+/// if-conversion candidate.
+IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
+                                             std::vector<IfcvtToken*> &Tokens) {
+  BBInfo &BBI = BBAnalysis[BB->getNumber()];
+
+  if (BBI.IsAnalyzed || BBI.IsBeingAnalyzed)
+    return BBI;
+
+  BBI.BB = BB;
+  BBI.IsBeingAnalyzed = true;
+
+  ScanInstructions(BBI);
+
+  // Unanalyzable or ends with fallthrough or unconditional branch, or if is not
+  // considered for ifcvt anymore.
+  if (!BBI.IsBrAnalyzable || BBI.BrCond.empty() || BBI.IsDone) {
+    BBI.IsBeingAnalyzed = false;
+    BBI.IsAnalyzed = true;
+    return BBI;
+  }
+
+  // Do not ifcvt if either path is a back edge to the entry block.
+  if (BBI.TrueBB == BB || BBI.FalseBB == BB) {
+    BBI.IsBeingAnalyzed = false;
+    BBI.IsAnalyzed = true;
+    return BBI;
+  }
+
+  // Do not ifcvt if true and false fallthrough blocks are the same.
+  if (!BBI.FalseBB) {
+    BBI.IsBeingAnalyzed = false;
+    BBI.IsAnalyzed = true;
+    return BBI;
+  }
+
+  BBInfo &TrueBBI  = AnalyzeBlock(BBI.TrueBB, Tokens);
+  BBInfo &FalseBBI = AnalyzeBlock(BBI.FalseBB, Tokens);
+
+  if (TrueBBI.IsDone && FalseBBI.IsDone) {
+    BBI.IsBeingAnalyzed = false;
+    BBI.IsAnalyzed = true;
+    return BBI;
+  }
+
+  SmallVector<MachineOperand, 4> RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
+  bool CanRevCond = !TII->ReverseBranchCondition(RevCond);
+
+  unsigned Dups = 0;
+  unsigned Dups2 = 0;
+  bool TNeedSub = !TrueBBI.Predicate.empty();
+  bool FNeedSub = !FalseBBI.Predicate.empty();
+  bool Enqueued = false;
+
+  BranchProbability Prediction = MBPI->getEdgeProbability(BB, TrueBBI.BB);
+
+  if (CanRevCond && ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2) &&
+      MeetIfcvtSizeLimit(*TrueBBI.BB, (TrueBBI.NonPredSize - (Dups + Dups2) +
+                                       TrueBBI.ExtraCost), TrueBBI.ExtraCost2,
+                         *FalseBBI.BB, (FalseBBI.NonPredSize - (Dups + Dups2) +
+                                        FalseBBI.ExtraCost),FalseBBI.ExtraCost2,
+                         Prediction) &&
+      FeasibilityAnalysis(TrueBBI, BBI.BrCond) &&
+      FeasibilityAnalysis(FalseBBI, RevCond)) {
+    // Diamond:
+    //   EBB
+    //   / \_
+    //  |   |
+    // TBB FBB
+    //   \ /
+    //  TailBB
+    // Note TailBB can be empty.
+    Tokens.push_back(new IfcvtToken(BBI, ICDiamond, TNeedSub|FNeedSub, Dups,
+                                    Dups2));
+    Enqueued = true;
+  }
+
+  if (ValidTriangle(TrueBBI, FalseBBI, false, Dups, Prediction) &&
+      MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
+                         TrueBBI.ExtraCost2, Prediction) &&
+      FeasibilityAnalysis(TrueBBI, BBI.BrCond, true)) {
+    // Triangle:
+    //   EBB
+    //   | \_
+    //   |  |
+    //   | TBB
+    //   |  /
+    //   FBB
+    Tokens.push_back(new IfcvtToken(BBI, ICTriangle, TNeedSub, Dups));
+    Enqueued = true;
+  }
+
+  if (ValidTriangle(TrueBBI, FalseBBI, true, Dups, Prediction) &&
+      MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
+                         TrueBBI.ExtraCost2, Prediction) &&
+      FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) {
+    Tokens.push_back(new IfcvtToken(BBI, ICTriangleRev, TNeedSub, Dups));
+    Enqueued = true;
+  }
+
+  if (ValidSimple(TrueBBI, Dups, Prediction) &&
+      MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
+                         TrueBBI.ExtraCost2, Prediction) &&
+      FeasibilityAnalysis(TrueBBI, BBI.BrCond)) {
+    // Simple (split, no rejoin):
+    //   EBB
+    //   | \_
+    //   |  |
+    //   | TBB---> exit
+    //   |
+    //   FBB
+    Tokens.push_back(new IfcvtToken(BBI, ICSimple, TNeedSub, Dups));
+    Enqueued = true;
+  }
+
+  if (CanRevCond) {
+    // Try the other path...
+    if (ValidTriangle(FalseBBI, TrueBBI, false, Dups,
+                      Prediction.getCompl()) &&
+        MeetIfcvtSizeLimit(*FalseBBI.BB,
+                           FalseBBI.NonPredSize + FalseBBI.ExtraCost,
+                           FalseBBI.ExtraCost2, Prediction.getCompl()) &&
+        FeasibilityAnalysis(FalseBBI, RevCond, true)) {
+      Tokens.push_back(new IfcvtToken(BBI, ICTriangleFalse, FNeedSub, Dups));
+      Enqueued = true;
+    }
+
+    if (ValidTriangle(FalseBBI, TrueBBI, true, Dups,
+                      Prediction.getCompl()) &&
+        MeetIfcvtSizeLimit(*FalseBBI.BB,
+                           FalseBBI.NonPredSize + FalseBBI.ExtraCost,
+                           FalseBBI.ExtraCost2, Prediction.getCompl()) &&
+        FeasibilityAnalysis(FalseBBI, RevCond, true, true)) {
+      Tokens.push_back(new IfcvtToken(BBI, ICTriangleFRev, FNeedSub, Dups));
+      Enqueued = true;
+    }
+
+    if (ValidSimple(FalseBBI, Dups, Prediction.getCompl()) &&
+        MeetIfcvtSizeLimit(*FalseBBI.BB,
+                           FalseBBI.NonPredSize + FalseBBI.ExtraCost,
+                           FalseBBI.ExtraCost2, Prediction.getCompl()) &&
+        FeasibilityAnalysis(FalseBBI, RevCond)) {
+      Tokens.push_back(new IfcvtToken(BBI, ICSimpleFalse, FNeedSub, Dups));
+      Enqueued = true;
+    }
+  }
+
+  BBI.IsEnqueued = Enqueued;
+  BBI.IsBeingAnalyzed = false;
+  BBI.IsAnalyzed = true;
+  return BBI;
+}
+
+/// AnalyzeBlocks - Analyze all blocks and find entries for all if-conversion
+/// candidates.
+void IfConverter::AnalyzeBlocks(MachineFunction &MF,
+                                std::vector<IfcvtToken*> &Tokens) {
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *BB = I;
+    AnalyzeBlock(BB, Tokens);
+  }
+
+  // Sort to favor more complex ifcvt scheme.
+  std::stable_sort(Tokens.begin(), Tokens.end(), IfcvtTokenCmp);
+}
+
+/// canFallThroughTo - Returns true either if ToBB is the next block after BB or
+/// that all the intervening blocks are empty (given BB can fall through to its
+/// next block).
+static bool canFallThroughTo(MachineBasicBlock *BB, MachineBasicBlock *ToBB) {
+  MachineFunction::iterator PI = BB;
+  MachineFunction::iterator I = llvm::next(PI);
+  MachineFunction::iterator TI = ToBB;
+  MachineFunction::iterator E = BB->getParent()->end();
+  while (I != TI) {
+    // Check isSuccessor to avoid case where the next block is empty, but
+    // it's not a successor.
+    if (I == E || !I->empty() || !PI->isSuccessor(I))
+      return false;
+    PI = I++;
+  }
+  return true;
+}
+
+/// InvalidatePreds - Invalidate predecessor BB info so it would be re-analyzed
+/// to determine if it can be if-converted. If predecessor is already enqueued,
+/// dequeue it!
+void IfConverter::InvalidatePreds(MachineBasicBlock *BB) {
+  for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+         E = BB->pred_end(); PI != E; ++PI) {
+    BBInfo &PBBI = BBAnalysis[(*PI)->getNumber()];
+    if (PBBI.IsDone || PBBI.BB == BB)
+      continue;
+    PBBI.IsAnalyzed = false;
+    PBBI.IsEnqueued = false;
+  }
+}
+
+/// InsertUncondBranch - Inserts an unconditional branch from BB to ToBB.
+///
+static void InsertUncondBranch(MachineBasicBlock *BB, MachineBasicBlock *ToBB,
+                               const TargetInstrInfo *TII) {
+  DebugLoc dl;  // FIXME: this is nowhere
+  SmallVector<MachineOperand, 0> NoCond;
+  TII->InsertBranch(*BB, ToBB, NULL, NoCond, dl);
+}
+
+/// RemoveExtraEdges - Remove true / false edges if either / both are no longer
+/// successors.
+void IfConverter::RemoveExtraEdges(BBInfo &BBI) {
+  MachineBasicBlock *TBB = NULL, *FBB = NULL;
+  SmallVector<MachineOperand, 4> Cond;
+  if (!TII->AnalyzeBranch(*BBI.BB, TBB, FBB, Cond))
+    BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
+}
+
+/// InitPredRedefs / UpdatePredRedefs - Defs by predicated instructions are
+/// modeled as read + write (sort like two-address instructions). These
+/// routines track register liveness and add implicit uses to if-converted
+/// instructions to conform to the model.
+static void InitPredRedefs(MachineBasicBlock *BB, SmallSet<unsigned,4> &Redefs,
+                           const TargetRegisterInfo *TRI) {
+  for (MachineBasicBlock::livein_iterator I = BB->livein_begin(),
+         E = BB->livein_end(); I != E; ++I) {
+    unsigned Reg = *I;
+    Redefs.insert(Reg);
+    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+      Redefs.insert(*SubRegs);
+  }
+}
+
+static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
+                             const TargetRegisterInfo *TRI,
+                             bool AddImpUse = false) {
+  SmallVector<unsigned, 4> Defs;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isDef())
+      Defs.push_back(Reg);
+    else if (MO.isKill()) {
+      Redefs.erase(Reg);
+      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+        Redefs.erase(*SubRegs);
+    }
+  }
+  MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+  for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+    unsigned Reg = Defs[i];
+    if (!Redefs.insert(Reg)) {
+      if (AddImpUse)
+        // Treat predicated update as read + write.
+        MIB.addReg(Reg, RegState::Implicit | RegState::Undef);
+    } else {
+      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+        Redefs.insert(*SubRegs);
+    }
+  }
+}
+
+static void UpdatePredRedefs(MachineBasicBlock::iterator I,
+                             MachineBasicBlock::iterator E,
+                             SmallSet<unsigned,4> &Redefs,
+                             const TargetRegisterInfo *TRI) {
+  while (I != E) {
+    UpdatePredRedefs(I, Redefs, TRI);
+    ++I;
+  }
+}
+
+/// IfConvertSimple - If convert a simple (split, no rejoin) sub-CFG.
+///
+bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
+  BBInfo &TrueBBI  = BBAnalysis[BBI.TrueBB->getNumber()];
+  BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+  BBInfo *CvtBBI = &TrueBBI;
+  BBInfo *NextBBI = &FalseBBI;
+
+  SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+  if (Kind == ICSimpleFalse)
+    std::swap(CvtBBI, NextBBI);
+
+  if (CvtBBI->IsDone ||
+      (CvtBBI->CannotBeCopied && CvtBBI->BB->pred_size() > 1)) {
+    // Something has changed. It's no longer safe to predicate this block.
+    BBI.IsAnalyzed = false;
+    CvtBBI->IsAnalyzed = false;
+    return false;
+  }
+
+  if (Kind == ICSimpleFalse)
+    if (TII->ReverseBranchCondition(Cond))
+      llvm_unreachable("Unable to reverse branch condition!");
+
+  // Initialize liveins to the first BB. These are potentiall redefined by
+  // predicated instructions.
+  SmallSet<unsigned, 4> Redefs;
+  InitPredRedefs(CvtBBI->BB, Redefs, TRI);
+  InitPredRedefs(NextBBI->BB, Redefs, TRI);
+
+  if (CvtBBI->BB->pred_size() > 1) {
+    BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+    // Copy instructions in the true block, predicate them, and add them to
+    // the entry block.
+    CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs);
+  } else {
+    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);
+
+    // Merge converted block into entry block.
+    BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+    MergeBlocks(BBI, *CvtBBI);
+  }
+
+  bool IterIfcvt = true;
+  if (!canFallThroughTo(BBI.BB, NextBBI->BB)) {
+    InsertUncondBranch(BBI.BB, NextBBI->BB, TII);
+    BBI.HasFallThrough = false;
+    // Now ifcvt'd block will look like this:
+    // BB:
+    // ...
+    // t, f = cmp
+    // if t op
+    // b BBf
+    //
+    // We cannot further ifcvt this block because the unconditional branch
+    // will have to be predicated on the new condition, that will not be
+    // available if cmp executes.
+    IterIfcvt = false;
+  }
+
+  RemoveExtraEdges(BBI);
+
+  // Update block info. BB can be iteratively if-converted.
+  if (!IterIfcvt)
+    BBI.IsDone = true;
+  InvalidatePreds(BBI.BB);
+  CvtBBI->IsDone = true;
+
+  // FIXME: Must maintain LiveIns.
+  return true;
+}
+
+/// IfConvertTriangle - If convert a triangle sub-CFG.
+///
+bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
+  BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+  BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+  BBInfo *CvtBBI = &TrueBBI;
+  BBInfo *NextBBI = &FalseBBI;
+  DebugLoc dl;  // FIXME: this is nowhere
+
+  SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+  if (Kind == ICTriangleFalse || Kind == ICTriangleFRev)
+    std::swap(CvtBBI, NextBBI);
+
+  if (CvtBBI->IsDone ||
+      (CvtBBI->CannotBeCopied && CvtBBI->BB->pred_size() > 1)) {
+    // Something has changed. It's no longer safe to predicate this block.
+    BBI.IsAnalyzed = false;
+    CvtBBI->IsAnalyzed = false;
+    return false;
+  }
+
+  if (Kind == ICTriangleFalse || Kind == ICTriangleFRev)
+    if (TII->ReverseBranchCondition(Cond))
+      llvm_unreachable("Unable to reverse branch condition!");
+
+  if (Kind == ICTriangleRev || Kind == ICTriangleFRev) {
+    if (ReverseBranchCondition(*CvtBBI)) {
+      // BB has been changed, modify its predecessors (except for this
+      // one) so they don't get ifcvt'ed based on bad intel.
+      for (MachineBasicBlock::pred_iterator PI = CvtBBI->BB->pred_begin(),
+             E = CvtBBI->BB->pred_end(); PI != E; ++PI) {
+        MachineBasicBlock *PBB = *PI;
+        if (PBB == BBI.BB)
+          continue;
+        BBInfo &PBBI = BBAnalysis[PBB->getNumber()];
+        if (PBBI.IsEnqueued) {
+          PBBI.IsAnalyzed = false;
+          PBBI.IsEnqueued = false;
+        }
+      }
+    }
+  }
+
+  // Initialize liveins to the first BB. These are potentially redefined by
+  // predicated instructions.
+  SmallSet<unsigned, 4> Redefs;
+  InitPredRedefs(CvtBBI->BB, Redefs, TRI);
+  InitPredRedefs(NextBBI->BB, Redefs, TRI);
+
+  bool HasEarlyExit = CvtBBI->FalseBB != NULL;
+  if (CvtBBI->BB->pred_size() > 1) {
+    BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+    // Copy instructions in the true block, predicate them, and add them to
+    // the entry block.
+    CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs, true);
+  } else {
+    // Predicate the 'true' block after removing its branch.
+    CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB);
+    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);
+
+    // Now merge the entry of the triangle with the true block.
+    BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+    MergeBlocks(BBI, *CvtBBI, false);
+  }
+
+  // If 'true' block has a 'false' successor, add an exit branch to it.
+  if (HasEarlyExit) {
+    SmallVector<MachineOperand, 4> RevCond(CvtBBI->BrCond.begin(),
+                                           CvtBBI->BrCond.end());
+    if (TII->ReverseBranchCondition(RevCond))
+      llvm_unreachable("Unable to reverse branch condition!");
+    TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, NULL, RevCond, dl);
+    BBI.BB->addSuccessor(CvtBBI->FalseBB);
+  }
+
+  // Merge in the 'false' block if the 'false' block has no other
+  // predecessors. Otherwise, add an unconditional branch to 'false'.
+  bool FalseBBDead = false;
+  bool IterIfcvt = true;
+  bool isFallThrough = canFallThroughTo(BBI.BB, NextBBI->BB);
+  if (!isFallThrough) {
+    // Only merge them if the true block does not fallthrough to the false
+    // block. By not merging them, we make it possible to iteratively
+    // ifcvt the blocks.
+    if (!HasEarlyExit &&
+        NextBBI->BB->pred_size() == 1 && !NextBBI->HasFallThrough) {
+      MergeBlocks(BBI, *NextBBI);
+      FalseBBDead = true;
+    } else {
+      InsertUncondBranch(BBI.BB, NextBBI->BB, TII);
+      BBI.HasFallThrough = false;
+    }
+    // Mixed predicated and unpredicated code. This cannot be iteratively
+    // predicated.
+    IterIfcvt = false;
+  }
+
+  RemoveExtraEdges(BBI);
+
+  // Update block info. BB can be iteratively if-converted.
+  if (!IterIfcvt)
+    BBI.IsDone = true;
+  InvalidatePreds(BBI.BB);
+  CvtBBI->IsDone = true;
+  if (FalseBBDead)
+    NextBBI->IsDone = true;
+
+  // FIXME: Must maintain LiveIns.
+  return true;
+}
+
+/// IfConvertDiamond - If convert a diamond sub-CFG.
+///
+bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
+                                   unsigned NumDups1, unsigned NumDups2) {
+  BBInfo &TrueBBI  = BBAnalysis[BBI.TrueBB->getNumber()];
+  BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+  MachineBasicBlock *TailBB = TrueBBI.TrueBB;
+  // True block must fall through or end with an unanalyzable terminator.
+  if (!TailBB) {
+    if (blockAlwaysFallThrough(TrueBBI))
+      TailBB = FalseBBI.TrueBB;
+    assert((TailBB || !TrueBBI.IsBrAnalyzable) && "Unexpected!");
+  }
+
+  if (TrueBBI.IsDone || FalseBBI.IsDone ||
+      TrueBBI.BB->pred_size() > 1 ||
+      FalseBBI.BB->pred_size() > 1) {
+    // Something has changed. It's no longer safe to predicate these blocks.
+    BBI.IsAnalyzed = false;
+    TrueBBI.IsAnalyzed = false;
+    FalseBBI.IsAnalyzed = false;
+    return false;
+  }
+
+  // Put the predicated instructions from the 'true' block before the
+  // instructions from the 'false' block, unless the true block would clobber
+  // the predicate, in which case, do the opposite.
+  BBInfo *BBI1 = &TrueBBI;
+  BBInfo *BBI2 = &FalseBBI;
+  SmallVector<MachineOperand, 4> RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
+  if (TII->ReverseBranchCondition(RevCond))
+    llvm_unreachable("Unable to reverse branch condition!");
+  SmallVector<MachineOperand, 4> *Cond1 = &BBI.BrCond;
+  SmallVector<MachineOperand, 4> *Cond2 = &RevCond;
+
+  // Figure out the more profitable ordering.
+  bool DoSwap = false;
+  if (TrueBBI.ClobbersPred && !FalseBBI.ClobbersPred)
+    DoSwap = true;
+  else if (TrueBBI.ClobbersPred == FalseBBI.ClobbersPred) {
+    if (TrueBBI.NonPredSize > FalseBBI.NonPredSize)
+      DoSwap = true;
+  }
+  if (DoSwap) {
+    std::swap(BBI1, BBI2);
+    std::swap(Cond1, Cond2);
+  }
+
+  // Remove the conditional branch from entry to the blocks.
+  BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+
+  // Initialize liveins to the first BB. These are potentially redefined by
+  // predicated instructions.
+  SmallSet<unsigned, 4> Redefs;
+  InitPredRedefs(BBI1->BB, Redefs, TRI);
+
+  // Remove the duplicated instructions at the beginnings of both paths.
+  MachineBasicBlock::iterator DI1 = BBI1->BB->begin();
+  MachineBasicBlock::iterator DI2 = BBI2->BB->begin();
+  MachineBasicBlock::iterator DIE1 = BBI1->BB->end();
+  MachineBasicBlock::iterator DIE2 = BBI2->BB->end();
+  // Skip dbg_value instructions
+  while (DI1 != DIE1 && DI1->isDebugValue())
+    ++DI1;
+  while (DI2 != DIE2 && DI2->isDebugValue())
+    ++DI2;
+  BBI1->NonPredSize -= NumDups1;
+  BBI2->NonPredSize -= NumDups1;
+
+  // Skip past the dups on each side separately since there may be
+  // differing dbg_value entries.
+  for (unsigned i = 0; i < NumDups1; ++DI1) {
+    if (!DI1->isDebugValue())
+      ++i;
+  }
+  while (NumDups1 != 0) {
+    ++DI2;
+    if (!DI2->isDebugValue())
+      --NumDups1;
+  }
+
+  UpdatePredRedefs(BBI1->BB->begin(), DI1, Redefs, TRI);
+  BBI.BB->splice(BBI.BB->end(), BBI1->BB, BBI1->BB->begin(), DI1);
+  BBI2->BB->erase(BBI2->BB->begin(), DI2);
+
+  // Remove branch from 'true' block and remove duplicated instructions.
+  BBI1->NonPredSize -= TII->RemoveBranch(*BBI1->BB);
+  DI1 = BBI1->BB->end();
+  for (unsigned i = 0; i != NumDups2; ) {
+    // NumDups2 only counted non-dbg_value instructions, so this won't
+    // run off the head of the list.
+    assert (DI1 != BBI1->BB->begin());
+    --DI1;
+    // skip dbg_value instructions
+    if (!DI1->isDebugValue())
+      ++i;
+  }
+  BBI1->BB->erase(DI1, BBI1->BB->end());
+
+  // Remove 'false' block branch and find the last instruction to predicate.
+  BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB);
+  DI2 = BBI2->BB->end();
+  while (NumDups2 != 0) {
+    // NumDups2 only counted non-dbg_value instructions, so this won't
+    // run off the head of the list.
+    assert (DI2 != BBI2->BB->begin());
+    --DI2;
+    // skip dbg_value instructions
+    if (!DI2->isDebugValue())
+      --NumDups2;
+  }
+
+  // Remember which registers would later be defined by the false block.
+  // This allows us not to predicate instructions in the true block that would
+  // later be re-defined. That is, rather than
+  //   subeq  r0, r1, #1
+  //   addne  r0, r1, #1
+  // generate:
+  //   sub    r0, r1, #1
+  //   addne  r0, r1, #1
+  SmallSet<unsigned, 4> RedefsByFalse;
+  SmallSet<unsigned, 4> ExtUses;
+  if (TII->isProfitableToUnpredicate(*BBI1->BB, *BBI2->BB)) {
+    for (MachineBasicBlock::iterator FI = BBI2->BB->begin(); FI != DI2; ++FI) {
+      if (FI->isDebugValue())
+        continue;
+      SmallVector<unsigned, 4> Defs;
+      for (unsigned i = 0, e = FI->getNumOperands(); i != e; ++i) {
+        const MachineOperand &MO = FI->getOperand(i);
+        if (!MO.isReg())
+          continue;
+        unsigned Reg = MO.getReg();
+        if (!Reg)
+          continue;
+        if (MO.isDef()) {
+          Defs.push_back(Reg);
+        } else if (!RedefsByFalse.count(Reg)) {
+          // These are defined before ctrl flow reach the 'false' instructions.
+          // They cannot be modified by the 'true' instructions.
+          ExtUses.insert(Reg);
+          for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+            ExtUses.insert(*SubRegs);
+        }
+      }
+
+      for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+        unsigned Reg = Defs[i];
+        if (!ExtUses.count(Reg)) {
+          RedefsByFalse.insert(Reg);
+          for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+            RedefsByFalse.insert(*SubRegs);
+        }
+      }
+    }
+  }
+
+  // Predicate the 'true' block.
+  PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, Redefs, &RedefsByFalse);
+
+  // Predicate the 'false' block.
+  PredicateBlock(*BBI2, DI2, *Cond2, Redefs);
+
+  // Merge the true block into the entry of the diamond.
+  MergeBlocks(BBI, *BBI1, TailBB == 0);
+  MergeBlocks(BBI, *BBI2, TailBB == 0);
+
+  // If the if-converted block falls through or unconditionally branches into
+  // the tail block, and the tail block does not have other predecessors, then
+  // fold the tail block in as well. Otherwise, unless it falls through to the
+  // tail, add a unconditional branch to it.
+  if (TailBB) {
+    BBInfo &TailBBI = BBAnalysis[TailBB->getNumber()];
+    bool CanMergeTail = !TailBBI.HasFallThrough;
+    // There may still be a fall-through edge from BBI1 or BBI2 to TailBB;
+    // check if there are any other predecessors besides those.
+    unsigned NumPreds = TailBB->pred_size();
+    if (NumPreds > 1)
+      CanMergeTail = false;
+    else if (NumPreds == 1 && CanMergeTail) {
+      MachineBasicBlock::pred_iterator PI = TailBB->pred_begin();
+      if (*PI != BBI1->BB && *PI != BBI2->BB)
+        CanMergeTail = false;
+    }
+    if (CanMergeTail) {
+      MergeBlocks(BBI, TailBBI);
+      TailBBI.IsDone = true;
+    } else {
+      BBI.BB->addSuccessor(TailBB);
+      InsertUncondBranch(BBI.BB, TailBB, TII);
+      BBI.HasFallThrough = false;
+    }
+  }
+
+  // RemoveExtraEdges won't work if the block has an unanalyzable branch,
+  // which can happen here if TailBB is unanalyzable and is merged, so
+  // explicitly remove BBI1 and BBI2 as successors.
+  BBI.BB->removeSuccessor(BBI1->BB);
+  BBI.BB->removeSuccessor(BBI2->BB);
+  RemoveExtraEdges(BBI);
+
+  // Update block info.
+  BBI.IsDone = TrueBBI.IsDone = FalseBBI.IsDone = true;
+  InvalidatePreds(BBI.BB);
+
+  // FIXME: Must maintain LiveIns.
+  return true;
+}
+
+static bool MaySpeculate(const MachineInstr *MI,
+                         SmallSet<unsigned, 4> &LaterRedefs,
+                         const TargetInstrInfo *TII) {
+  bool SawStore = true;
+  if (!MI->isSafeToMove(TII, 0, SawStore))
+    return false;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isDef() && !LaterRedefs.count(Reg))
+      return false;
+  }
+
+  return true;
+}
+
+/// PredicateBlock - Predicate instructions from the start of the block to the
+/// specified end with the specified condition.
+void IfConverter::PredicateBlock(BBInfo &BBI,
+                                 MachineBasicBlock::iterator E,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 SmallSet<unsigned, 4> &Redefs,
+                                 SmallSet<unsigned, 4> *LaterRedefs) {
+  bool AnyUnpred = false;
+  bool MaySpec = LaterRedefs != 0;
+  for (MachineBasicBlock::iterator I = BBI.BB->begin(); I != E; ++I) {
+    if (I->isDebugValue() || TII->isPredicated(I))
+      continue;
+    // It may be possible not to predicate an instruction if it's the 'true'
+    // side of a diamond and the 'false' side may re-define the instruction's
+    // defs.
+    if (MaySpec && MaySpeculate(I, *LaterRedefs, TII)) {
+      AnyUnpred = true;
+      continue;
+    }
+    // If any instruction is predicated, then every instruction after it must
+    // be predicated.
+    MaySpec = false;
+    if (!TII->PredicateInstruction(I, Cond)) {
+#ifndef NDEBUG
+      dbgs() << "Unable to predicate " << *I << "!\n";
+#endif
+      llvm_unreachable(0);
+    }
+
+    // If the predicated instruction now redefines a register as the result of
+    // if-conversion, add an implicit kill.
+    UpdatePredRedefs(I, Redefs, TRI, true);
+  }
+
+  std::copy(Cond.begin(), Cond.end(), std::back_inserter(BBI.Predicate));
+
+  BBI.IsAnalyzed = false;
+  BBI.NonPredSize = 0;
+
+  ++NumIfConvBBs;
+  if (AnyUnpred)
+    ++NumUnpred;
+}
+
+/// CopyAndPredicateBlock - Copy and predicate instructions from source BB to
+/// the destination block. Skip end of block branches if IgnoreBr is true.
+void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
+                                        SmallVectorImpl<MachineOperand> &Cond,
+                                        SmallSet<unsigned, 4> &Redefs,
+                                        bool IgnoreBr) {
+  MachineFunction &MF = *ToBBI.BB->getParent();
+
+  for (MachineBasicBlock::iterator I = FromBBI.BB->begin(),
+         E = FromBBI.BB->end(); I != E; ++I) {
+    // Do not copy the end of the block branches.
+    if (IgnoreBr && I->isBranch())
+      break;
+
+    MachineInstr *MI = MF.CloneMachineInstr(I);
+    ToBBI.BB->insert(ToBBI.BB->end(), MI);
+    ToBBI.NonPredSize++;
+    unsigned ExtraPredCost = 0;
+    unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I, &ExtraPredCost);
+    if (NumCycles > 1)
+      ToBBI.ExtraCost += NumCycles-1;
+    ToBBI.ExtraCost2 += ExtraPredCost;
+
+    if (!TII->isPredicated(I) && !MI->isDebugValue()) {
+      if (!TII->PredicateInstruction(MI, Cond)) {
+#ifndef NDEBUG
+        dbgs() << "Unable to predicate " << *I << "!\n";
+#endif
+        llvm_unreachable(0);
+      }
+    }
+
+    // If the predicated instruction now redefines a register as the result of
+    // if-conversion, add an implicit kill.
+    UpdatePredRedefs(MI, Redefs, TRI, true);
+  }
+
+  if (!IgnoreBr) {
+    std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(),
+                                           FromBBI.BB->succ_end());
+    MachineBasicBlock *NBB = getNextBlock(FromBBI.BB);
+    MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL;
+
+    for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
+      MachineBasicBlock *Succ = Succs[i];
+      // Fallthrough edge can't be transferred.
+      if (Succ == FallThrough)
+        continue;
+      ToBBI.BB->addSuccessor(Succ);
+    }
+  }
+
+  std::copy(FromBBI.Predicate.begin(), FromBBI.Predicate.end(),
+            std::back_inserter(ToBBI.Predicate));
+  std::copy(Cond.begin(), Cond.end(), std::back_inserter(ToBBI.Predicate));
+
+  ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
+  ToBBI.IsAnalyzed = false;
+
+  ++NumDupBBs;
+}
+
+/// MergeBlocks - Move all instructions from FromBB to the end of ToBB.
+/// This will leave FromBB as an empty block, so remove all of its
+/// successor edges except for the fall-through edge.  If AddEdges is true,
+/// i.e., when FromBBI's branch is being moved, add those successor edges to
+/// ToBBI.
+void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
+  ToBBI.BB->splice(ToBBI.BB->end(),
+                   FromBBI.BB, FromBBI.BB->begin(), FromBBI.BB->end());
+
+  std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(),
+                                         FromBBI.BB->succ_end());
+  MachineBasicBlock *NBB = getNextBlock(FromBBI.BB);
+  MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : NULL;
+
+  for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
+    MachineBasicBlock *Succ = Succs[i];
+    // Fallthrough edge can't be transferred.
+    if (Succ == FallThrough)
+      continue;
+    FromBBI.BB->removeSuccessor(Succ);
+    if (AddEdges && !ToBBI.BB->isSuccessor(Succ))
+      ToBBI.BB->addSuccessor(Succ);
+  }
+
+  // Now FromBBI always falls through to the next block!
+  if (NBB && !FromBBI.BB->isSuccessor(NBB))
+    FromBBI.BB->addSuccessor(NBB);
+
+  std::copy(FromBBI.Predicate.begin(), FromBBI.Predicate.end(),
+            std::back_inserter(ToBBI.Predicate));
+  FromBBI.Predicate.clear();
+
+  ToBBI.NonPredSize += FromBBI.NonPredSize;
+  ToBBI.ExtraCost += FromBBI.ExtraCost;
+  ToBBI.ExtraCost2 += FromBBI.ExtraCost2;
+  FromBBI.NonPredSize = 0;
+  FromBBI.ExtraCost = 0;
+  FromBBI.ExtraCost2 = 0;
+
+  ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
+  ToBBI.HasFallThrough = FromBBI.HasFallThrough;
+  ToBBI.IsAnalyzed = false;
+  FromBBI.IsAnalyzed = false;
+}
diff --git a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
new file mode 100644
index 000000000000..c6d1a18dbd06
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -0,0 +1,1295 @@
+//===-------- InlineSpiller.cpp - Insert spills and restores inline -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The inline spiller modifies the machine function directly instead of
+// inserting spills and restores in VirtRegMap.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "Spiller.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+STATISTIC(NumSpilledRanges,   "Number of spilled live ranges");
+STATISTIC(NumSnippets,        "Number of spilled snippets");
+STATISTIC(NumSpills,          "Number of spills inserted");
+STATISTIC(NumSpillsRemoved,   "Number of spills removed");
+STATISTIC(NumReloads,         "Number of reloads inserted");
+STATISTIC(NumReloadsRemoved,  "Number of reloads removed");
+STATISTIC(NumFolded,          "Number of folded stack accesses");
+STATISTIC(NumFoldedLoads,     "Number of folded loads");
+STATISTIC(NumRemats,          "Number of rematerialized defs for spilling");
+STATISTIC(NumOmitReloadSpill, "Number of omitted spills of reloads");
+STATISTIC(NumHoists,          "Number of hoisted spills");
+
+static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden,
+                                     cl::desc("Disable inline spill hoisting"));
+
+namespace {
+class InlineSpiller : public Spiller {
+  MachineFunction &MF;
+  LiveIntervals &LIS;
+  LiveStacks &LSS;
+  AliasAnalysis *AA;
+  MachineDominatorTree &MDT;
+  MachineLoopInfo &Loops;
+  VirtRegMap &VRM;
+  MachineFrameInfo &MFI;
+  MachineRegisterInfo &MRI;
+  const TargetInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+
+  // Variables that are valid during spill(), but used by multiple methods.
+  LiveRangeEdit *Edit;
+  LiveInterval *StackInt;
+  int StackSlot;
+  unsigned Original;
+
+  // All registers to spill to StackSlot, including the main register.
+  SmallVector<unsigned, 8> RegsToSpill;
+
+  // All COPY instructions to/from snippets.
+  // They are ignored since both operands refer to the same stack slot.
+  SmallPtrSet<MachineInstr*, 8> SnippetCopies;
+
+  // Values that failed to remat at some point.
+  SmallPtrSet<VNInfo*, 8> UsedValues;
+
+public:
+  // Information about a value that was defined by a copy from a sibling
+  // register.
+  struct SibValueInfo {
+    // True when all reaching defs were reloads: No spill is necessary.
+    bool AllDefsAreReloads;
+
+    // True when value is defined by an original PHI not from splitting.
+    bool DefByOrigPHI;
+
+    // True when the COPY defining this value killed its source.
+    bool KillsSource;
+
+    // The preferred register to spill.
+    unsigned SpillReg;
+
+    // The value of SpillReg that should be spilled.
+    VNInfo *SpillVNI;
+
+    // The block where SpillVNI should be spilled. Currently, this must be the
+    // block containing SpillVNI->def.
+    MachineBasicBlock *SpillMBB;
+
+    // A defining instruction that is not a sibling copy or a reload, or NULL.
+    // This can be used as a template for rematerialization.
+    MachineInstr *DefMI;
+
+    // List of values that depend on this one.  These values are actually the
+    // same, but live range splitting has placed them in different registers,
+    // or SSA update needed to insert PHI-defs to preserve SSA form.  This is
+    // copies of the current value and phi-kills.  Usually only phi-kills cause
+    // more than one dependent value.
+    TinyPtrVector<VNInfo*> Deps;
+
+    SibValueInfo(unsigned Reg, VNInfo *VNI)
+      : AllDefsAreReloads(true), DefByOrigPHI(false), KillsSource(false),
+        SpillReg(Reg), SpillVNI(VNI), SpillMBB(0), DefMI(0) {}
+
+    // Returns true when a def has been found.
+    bool hasDef() const { return DefByOrigPHI || DefMI; }
+  };
+
+private:
+  // Values in RegsToSpill defined by sibling copies.
+  typedef DenseMap<VNInfo*, SibValueInfo> SibValueMap;
+  SibValueMap SibValues;
+
+  // Dead defs generated during spilling.
+  SmallVector<MachineInstr*, 8> DeadDefs;
+
+  ~InlineSpiller() {}
+
+public:
+  InlineSpiller(MachineFunctionPass &pass,
+                MachineFunction &mf,
+                VirtRegMap &vrm)
+    : MF(mf),
+      LIS(pass.getAnalysis<LiveIntervals>()),
+      LSS(pass.getAnalysis<LiveStacks>()),
+      AA(&pass.getAnalysis<AliasAnalysis>()),
+      MDT(pass.getAnalysis<MachineDominatorTree>()),
+      Loops(pass.getAnalysis<MachineLoopInfo>()),
+      VRM(vrm),
+      MFI(*mf.getFrameInfo()),
+      MRI(mf.getRegInfo()),
+      TII(*mf.getTarget().getInstrInfo()),
+      TRI(*mf.getTarget().getRegisterInfo()) {}
+
+  void spill(LiveRangeEdit &);
+
+private:
+  bool isSnippet(const LiveInterval &SnipLI);
+  void collectRegsToSpill();
+
+  bool isRegToSpill(unsigned Reg) {
+    return std::find(RegsToSpill.begin(),
+                     RegsToSpill.end(), Reg) != RegsToSpill.end();
+  }
+
+  bool isSibling(unsigned Reg);
+  MachineInstr *traceSiblingValue(unsigned, VNInfo*, VNInfo*);
+  void propagateSiblingValue(SibValueMap::iterator, VNInfo *VNI = 0);
+  void analyzeSiblingValues();
+
+  bool hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI);
+  void eliminateRedundantSpills(LiveInterval &LI, VNInfo *VNI);
+
+  void markValueUsed(LiveInterval*, VNInfo*);
+  bool reMaterializeFor(LiveInterval&, MachineBasicBlock::iterator MI);
+  void reMaterializeAll();
+
+  bool coalesceStackAccess(MachineInstr *MI, unsigned Reg);
+  bool foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> >,
+                         MachineInstr *LoadMI = 0);
+  void insertReload(LiveInterval &NewLI, SlotIndex,
+                    MachineBasicBlock::iterator MI);
+  void insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
+                   SlotIndex, MachineBasicBlock::iterator MI);
+
+  void spillAroundUses(unsigned Reg);
+  void spillAll();
+};
+}
+
+namespace llvm {
+Spiller *createInlineSpiller(MachineFunctionPass &pass,
+                             MachineFunction &mf,
+                             VirtRegMap &vrm) {
+  return new InlineSpiller(pass, mf, vrm);
+}
+}
+
+//===----------------------------------------------------------------------===//
+//                                Snippets
+//===----------------------------------------------------------------------===//
+
+// When spilling a virtual register, we also spill any snippets it is connected
+// to. The snippets are small live ranges that only have a single real use,
+// leftovers from live range splitting. Spilling them enables memory operand
+// folding or tightens the live range around the single use.
+//
+// This minimizes register pressure and maximizes the store-to-load distance for
+// spill slots which can be important in tight loops.
+
+/// isFullCopyOf - If MI is a COPY to or from Reg, return the other register,
+/// otherwise return 0.
+static unsigned isFullCopyOf(const MachineInstr *MI, unsigned Reg) {
+  if (!MI->isFullCopy())
+    return 0;
+  if (MI->getOperand(0).getReg() == Reg)
+      return MI->getOperand(1).getReg();
+  if (MI->getOperand(1).getReg() == Reg)
+      return MI->getOperand(0).getReg();
+  return 0;
+}
+
+/// isSnippet - Identify if a live interval is a snippet that should be spilled.
+/// It is assumed that SnipLI is a virtual register with the same original as
+/// Edit->getReg().
+bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
+  unsigned Reg = Edit->getReg();
+
+  // A snippet is a tiny live range with only a single instruction using it
+  // besides copies to/from Reg or spills/fills. We accept:
+  //
+  //   %snip = COPY %Reg / FILL fi#
+  //   %snip = USE %snip
+  //   %Reg = COPY %snip / SPILL %snip, fi#
+  //
+  if (SnipLI.getNumValNums() > 2 || !LIS.intervalIsInOneMBB(SnipLI))
+    return false;
+
+  MachineInstr *UseMI = 0;
+
+  // Check that all uses satisfy our criteria.
+  for (MachineRegisterInfo::reg_nodbg_iterator
+         RI = MRI.reg_nodbg_begin(SnipLI.reg);
+       MachineInstr *MI = RI.skipInstruction();) {
+
+    // Allow copies to/from Reg.
+    if (isFullCopyOf(MI, Reg))
+      continue;
+
+    // Allow stack slot loads.
+    int FI;
+    if (SnipLI.reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot)
+      continue;
+
+    // Allow stack slot stores.
+    if (SnipLI.reg == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot)
+      continue;
+
+    // Allow a single additional instruction.
+    if (UseMI && MI != UseMI)
+      return false;
+    UseMI = MI;
+  }
+  return true;
+}
+
+/// collectRegsToSpill - Collect live range snippets that only have a single
+/// real use.
+void InlineSpiller::collectRegsToSpill() {
+  unsigned Reg = Edit->getReg();
+
+  // Main register always spills.
+  RegsToSpill.assign(1, Reg);
+  SnippetCopies.clear();
+
+  // Snippets all have the same original, so there can't be any for an original
+  // register.
+  if (Original == Reg)
+    return;
+
+  for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(Reg);
+       MachineInstr *MI = RI.skipInstruction();) {
+    unsigned SnipReg = isFullCopyOf(MI, Reg);
+    if (!isSibling(SnipReg))
+      continue;
+    LiveInterval &SnipLI = LIS.getInterval(SnipReg);
+    if (!isSnippet(SnipLI))
+      continue;
+    SnippetCopies.insert(MI);
+    if (isRegToSpill(SnipReg))
+      continue;
+    RegsToSpill.push_back(SnipReg);
+    DEBUG(dbgs() << "\talso spill snippet " << SnipLI << '\n');
+    ++NumSnippets;
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//                            Sibling Values
+//===----------------------------------------------------------------------===//
+
+// After live range splitting, some values to be spilled may be defined by
+// copies from sibling registers. We trace the sibling copies back to the
+// original value if it still exists. We need it for rematerialization.
+//
+// Even when the value can't be rematerialized, we still want to determine if
+// the value has already been spilled, or we may want to hoist the spill from a
+// loop.
+
+bool InlineSpiller::isSibling(unsigned Reg) {
+  return TargetRegisterInfo::isVirtualRegister(Reg) &&
+           VRM.getOriginal(Reg) == Original;
+}
+
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const InlineSpiller::SibValueInfo &SVI) {
+  OS << "spill " << PrintReg(SVI.SpillReg) << ':'
+     << SVI.SpillVNI->id << '@' << SVI.SpillVNI->def;
+  if (SVI.SpillMBB)
+    OS << " in BB#" << SVI.SpillMBB->getNumber();
+  if (SVI.AllDefsAreReloads)
+    OS << " all-reloads";
+  if (SVI.DefByOrigPHI)
+    OS << " orig-phi";
+  if (SVI.KillsSource)
+    OS << " kill";
+  OS << " deps[";
+  for (unsigned i = 0, e = SVI.Deps.size(); i != e; ++i)
+    OS << ' ' << SVI.Deps[i]->id << '@' << SVI.Deps[i]->def;
+  OS << " ]";
+  if (SVI.DefMI)
+    OS << " def: " << *SVI.DefMI;
+  else
+    OS << '\n';
+  return OS;
+}
+#endif
+
+/// propagateSiblingValue - Propagate the value in SVI to dependents if it is
+/// known.  Otherwise remember the dependency for later.
+///
+/// @param SVI SibValues entry to propagate.
+/// @param VNI Dependent value, or NULL to propagate to all saved dependents.
+void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVI,
+                                          VNInfo *VNI) {
+  // When VNI is non-NULL, add it to SVI's deps, and only propagate to that.
+  TinyPtrVector<VNInfo*> FirstDeps;
+  if (VNI) {
+    FirstDeps.push_back(VNI);
+    SVI->second.Deps.push_back(VNI);
+  }
+
+  // Has the value been completely determined yet?  If not, defer propagation.
+  if (!SVI->second.hasDef())
+    return;
+
+  // Work list of values to propagate.  It would be nice to use a SetVector
+  // here, but then we would be forced to use a SmallSet.
+  SmallVector<SibValueMap::iterator, 8> WorkList(1, SVI);
+  SmallPtrSet<VNInfo*, 8> WorkSet;
+
+  do {
+    SVI = WorkList.pop_back_val();
+    WorkSet.erase(SVI->first);
+    TinyPtrVector<VNInfo*> *Deps = VNI ? &FirstDeps : &SVI->second.Deps;
+    VNI = 0;
+
+    SibValueInfo &SV = SVI->second;
+    if (!SV.SpillMBB)
+      SV.SpillMBB = LIS.getMBBFromIndex(SV.SpillVNI->def);
+
+    DEBUG(dbgs() << "  prop to " << Deps->size() << ": "
+                 << SVI->first->id << '@' << SVI->first->def << ":\t" << SV);
+
+    assert(SV.hasDef() && "Propagating undefined value");
+
+    // Should this value be propagated as a preferred spill candidate?  We don't
+    // propagate values of registers that are about to spill.
+    bool PropSpill = !DisableHoisting && !isRegToSpill(SV.SpillReg);
+    unsigned SpillDepth = ~0u;
+
+    for (TinyPtrVector<VNInfo*>::iterator DepI = Deps->begin(),
+         DepE = Deps->end(); DepI != DepE; ++DepI) {
+      SibValueMap::iterator DepSVI = SibValues.find(*DepI);
+      assert(DepSVI != SibValues.end() && "Dependent value not in SibValues");
+      SibValueInfo &DepSV = DepSVI->second;
+      if (!DepSV.SpillMBB)
+        DepSV.SpillMBB = LIS.getMBBFromIndex(DepSV.SpillVNI->def);
+
+      bool Changed = false;
+
+      // Propagate defining instruction.
+      if (!DepSV.hasDef()) {
+        Changed = true;
+        DepSV.DefMI = SV.DefMI;
+        DepSV.DefByOrigPHI = SV.DefByOrigPHI;
+      }
+
+      // Propagate AllDefsAreReloads.  For PHI values, this computes an AND of
+      // all predecessors.
+      if (!SV.AllDefsAreReloads && DepSV.AllDefsAreReloads) {
+        Changed = true;
+        DepSV.AllDefsAreReloads = false;
+      }
+
+      // Propagate best spill value.
+      if (PropSpill && SV.SpillVNI != DepSV.SpillVNI) {
+        if (SV.SpillMBB == DepSV.SpillMBB) {
+          // DepSV is in the same block.  Hoist when dominated.
+          if (DepSV.KillsSource && SV.SpillVNI->def < DepSV.SpillVNI->def) {
+            // This is an alternative def earlier in the same MBB.
+            // Hoist the spill as far as possible in SpillMBB. This can ease
+            // register pressure:
+            //
+            //   x = def
+            //   y = use x
+            //   s = copy x
+            //
+            // Hoisting the spill of s to immediately after the def removes the
+            // interference between x and y:
+            //
+            //   x = def
+            //   spill x
+            //   y = use x<kill>
+            //
+            // This hoist only helps when the DepSV copy kills its source.
+            Changed = true;
+            DepSV.SpillReg = SV.SpillReg;
+            DepSV.SpillVNI = SV.SpillVNI;
+            DepSV.SpillMBB = SV.SpillMBB;
+          }
+        } else {
+          // DepSV is in a different block.
+          if (SpillDepth == ~0u)
+            SpillDepth = Loops.getLoopDepth(SV.SpillMBB);
+
+          // Also hoist spills to blocks with smaller loop depth, but make sure
+          // that the new value dominates.  Non-phi dependents are always
+          // dominated, phis need checking.
+          if ((Loops.getLoopDepth(DepSV.SpillMBB) > SpillDepth) &&
+              (!DepSVI->first->isPHIDef() ||
+               MDT.dominates(SV.SpillMBB, DepSV.SpillMBB))) {
+            Changed = true;
+            DepSV.SpillReg = SV.SpillReg;
+            DepSV.SpillVNI = SV.SpillVNI;
+            DepSV.SpillMBB = SV.SpillMBB;
+          }
+        }
+      }
+
+      if (!Changed)
+        continue;
+
+      // Something changed in DepSVI. Propagate to dependents.
+      if (WorkSet.insert(DepSVI->first))
+        WorkList.push_back(DepSVI);
+
+      DEBUG(dbgs() << "  update " << DepSVI->first->id << '@'
+            << DepSVI->first->def << " to:\t" << DepSV);
+    }
+  } while (!WorkList.empty());
+}
+
+/// traceSiblingValue - Trace a value that is about to be spilled back to the
+/// real defining instructions by looking through sibling copies. Always stay
+/// within the range of OrigVNI so the registers are known to carry the same
+/// value.
+///
+/// Determine if the value is defined by all reloads, so spilling isn't
+/// necessary - the value is already in the stack slot.
+///
+/// Return a defining instruction that may be a candidate for rematerialization.
+///
+MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
+                                               VNInfo *OrigVNI) {
+  // Check if a cached value already exists.
+  SibValueMap::iterator SVI;
+  bool Inserted;
+  tie(SVI, Inserted) =
+    SibValues.insert(std::make_pair(UseVNI, SibValueInfo(UseReg, UseVNI)));
+  if (!Inserted) {
+    DEBUG(dbgs() << "Cached value " << PrintReg(UseReg) << ':'
+                 << UseVNI->id << '@' << UseVNI->def << ' ' << SVI->second);
+    return SVI->second.DefMI;
+  }
+
+  DEBUG(dbgs() << "Tracing value " << PrintReg(UseReg) << ':'
+               << UseVNI->id << '@' << UseVNI->def << '\n');
+
+  // List of (Reg, VNI) that have been inserted into SibValues, but need to be
+  // processed.
+  SmallVector<std::pair<unsigned, VNInfo*>, 8> WorkList;
+  WorkList.push_back(std::make_pair(UseReg, UseVNI));
+
+  do {
+    unsigned Reg;
+    VNInfo *VNI;
+    tie(Reg, VNI) = WorkList.pop_back_val();
+    DEBUG(dbgs() << "  " << PrintReg(Reg) << ':' << VNI->id << '@' << VNI->def
+                 << ":\t");
+
+    // First check if this value has already been computed.
+    SVI = SibValues.find(VNI);
+    assert(SVI != SibValues.end() && "Missing SibValues entry");
+
+    // Trace through PHI-defs created by live range splitting.
+    if (VNI->isPHIDef()) {
+      // Stop at original PHIs.  We don't know the value at the predecessors.
+      if (VNI->def == OrigVNI->def) {
+        DEBUG(dbgs() << "orig phi value\n");
+        SVI->second.DefByOrigPHI = true;
+        SVI->second.AllDefsAreReloads = false;
+        propagateSiblingValue(SVI);
+        continue;
+      }
+
+      // This is a PHI inserted by live range splitting.  We could trace the
+      // live-out value from predecessor blocks, but that search can be very
+      // expensive if there are many predecessors and many more PHIs as
+      // generated by tail-dup when it sees an indirectbr.  Instead, look at
+      // all the non-PHI defs that have the same value as OrigVNI.  They must
+      // jointly dominate VNI->def.  This is not optimal since VNI may actually
+      // be jointly dominated by a smaller subset of defs, so there is a change
+      // we will miss a AllDefsAreReloads optimization.
+
+      // Separate all values dominated by OrigVNI into PHIs and non-PHIs.
+      SmallVector<VNInfo*, 8> PHIs, NonPHIs;
+      LiveInterval &LI = LIS.getInterval(Reg);
+      LiveInterval &OrigLI = LIS.getInterval(Original);
+
+      for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end();
+           VI != VE; ++VI) {
+        VNInfo *VNI2 = *VI;
+        if (VNI2->isUnused())
+          continue;
+        if (!OrigLI.containsOneValue() &&
+            OrigLI.getVNInfoAt(VNI2->def) != OrigVNI)
+          continue;
+        if (VNI2->isPHIDef() && VNI2->def != OrigVNI->def)
+          PHIs.push_back(VNI2);
+        else
+          NonPHIs.push_back(VNI2);
+      }
+      DEBUG(dbgs() << "split phi value, checking " << PHIs.size()
+                   << " phi-defs, and " << NonPHIs.size()
+                   << " non-phi/orig defs\n");
+
+      // Create entries for all the PHIs.  Don't add them to the worklist, we
+      // are processing all of them in one go here.
+      for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+        SibValues.insert(std::make_pair(PHIs[i], SibValueInfo(Reg, PHIs[i])));
+
+      // Add every PHI as a dependent of all the non-PHIs.
+      for (unsigned i = 0, e = NonPHIs.size(); i != e; ++i) {
+        VNInfo *NonPHI = NonPHIs[i];
+        // Known value? Try an insertion.
+        tie(SVI, Inserted) =
+          SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI)));
+        // Add all the PHIs as dependents of NonPHI.
+        for (unsigned pi = 0, pe = PHIs.size(); pi != pe; ++pi)
+          SVI->second.Deps.push_back(PHIs[pi]);
+        // This is the first time we see NonPHI, add it to the worklist.
+        if (Inserted)
+          WorkList.push_back(std::make_pair(Reg, NonPHI));
+        else
+          // Propagate to all inserted PHIs, not just VNI.
+          propagateSiblingValue(SVI);
+      }
+
+      // Next work list item.
+      continue;
+    }
+
+    MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+    assert(MI && "Missing def");
+
+    // Trace through sibling copies.
+    if (unsigned SrcReg = isFullCopyOf(MI, Reg)) {
+      if (isSibling(SrcReg)) {
+        LiveInterval &SrcLI = LIS.getInterval(SrcReg);
+        LiveRangeQuery SrcQ(SrcLI, VNI->def);
+        assert(SrcQ.valueIn() && "Copy from non-existing value");
+        // Check if this COPY kills its source.
+        SVI->second.KillsSource = SrcQ.isKill();
+        VNInfo *SrcVNI = SrcQ.valueIn();
+        DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':'
+                     << SrcVNI->id << '@' << SrcVNI->def
+                     << " kill=" << unsigned(SVI->second.KillsSource) << '\n');
+        // Known sibling source value? Try an insertion.
+        tie(SVI, Inserted) = SibValues.insert(std::make_pair(SrcVNI,
+                                                 SibValueInfo(SrcReg, SrcVNI)));
+        // This is the first time we see Src, add it to the worklist.
+        if (Inserted)
+          WorkList.push_back(std::make_pair(SrcReg, SrcVNI));
+        propagateSiblingValue(SVI, VNI);
+        // Next work list item.
+        continue;
+      }
+    }
+
+    // Track reachable reloads.
+    SVI->second.DefMI = MI;
+    SVI->second.SpillMBB = MI->getParent();
+    int FI;
+    if (Reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot) {
+      DEBUG(dbgs() << "reload\n");
+      propagateSiblingValue(SVI);
+      // Next work list item.
+      continue;
+    }
+
+    // Potential remat candidate.
+    DEBUG(dbgs() << "def " << *MI);
+    SVI->second.AllDefsAreReloads = false;
+    propagateSiblingValue(SVI);
+  } while (!WorkList.empty());
+
+  // Look up the value we were looking for.  We already did this lookup at the
+  // top of the function, but SibValues may have been invalidated.
+  SVI = SibValues.find(UseVNI);
+  assert(SVI != SibValues.end() && "Didn't compute requested info");
+  DEBUG(dbgs() << "  traced to:\t" << SVI->second);
+  return SVI->second.DefMI;
+}
+
+/// analyzeSiblingValues - Trace values defined by sibling copies back to
+/// something that isn't a sibling copy.
+///
+/// Keep track of values that may be rematerializable.
+void InlineSpiller::analyzeSiblingValues() {
+  SibValues.clear();
+
+  // No siblings at all?
+  if (Edit->getReg() == Original)
+    return;
+
+  LiveInterval &OrigLI = LIS.getInterval(Original);
+  for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) {
+    unsigned Reg = RegsToSpill[i];
+    LiveInterval &LI = LIS.getInterval(Reg);
+    for (LiveInterval::const_vni_iterator VI = LI.vni_begin(),
+         VE = LI.vni_end(); VI != VE; ++VI) {
+      VNInfo *VNI = *VI;
+      if (VNI->isUnused())
+        continue;
+      MachineInstr *DefMI = 0;
+      if (!VNI->isPHIDef()) {
+       DefMI = LIS.getInstructionFromIndex(VNI->def);
+       assert(DefMI && "No defining instruction");
+      }
+      // Check possible sibling copies.
+      if (VNI->isPHIDef() || DefMI->isCopy()) {
+        VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
+        assert(OrigVNI && "Def outside original live range");
+        if (OrigVNI->def != VNI->def)
+          DefMI = traceSiblingValue(Reg, VNI, OrigVNI);
+      }
+      if (DefMI && Edit->checkRematerializable(VNI, DefMI, AA)) {
+        DEBUG(dbgs() << "Value " << PrintReg(Reg) << ':' << VNI->id << '@'
+                     << VNI->def << " may remat from " << *DefMI);
+      }
+    }
+  }
+}
+
+/// hoistSpill - Given a sibling copy that defines a value to be spilled, insert
+/// a spill at a better location.
+bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
+  SlotIndex Idx = LIS.getInstructionIndex(CopyMI);
+  VNInfo *VNI = SpillLI.getVNInfoAt(Idx.getRegSlot());
+  assert(VNI && VNI->def == Idx.getRegSlot() && "Not defined by copy");
+  SibValueMap::iterator I = SibValues.find(VNI);
+  if (I == SibValues.end())
+    return false;
+
+  const SibValueInfo &SVI = I->second;
+
+  // Let the normal folding code deal with the boring case.
+  if (!SVI.AllDefsAreReloads && SVI.SpillVNI == VNI)
+    return false;
+
+  // SpillReg may have been deleted by remat and DCE.
+  if (!LIS.hasInterval(SVI.SpillReg)) {
+    DEBUG(dbgs() << "Stale interval: " << PrintReg(SVI.SpillReg) << '\n');
+    SibValues.erase(I);
+    return false;
+  }
+
+  LiveInterval &SibLI = LIS.getInterval(SVI.SpillReg);
+  if (!SibLI.containsValue(SVI.SpillVNI)) {
+    DEBUG(dbgs() << "Stale value: " << PrintReg(SVI.SpillReg) << '\n');
+    SibValues.erase(I);
+    return false;
+  }
+
+  // Conservatively extend the stack slot range to the range of the original
+  // value. We may be able to do better with stack slot coloring by being more
+  // careful here.
+  assert(StackInt && "No stack slot assigned yet.");
+  LiveInterval &OrigLI = LIS.getInterval(Original);
+  VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+  StackInt->MergeValueInAsValue(OrigLI, OrigVNI, StackInt->getValNumInfo(0));
+  DEBUG(dbgs() << "\tmerged orig valno " << OrigVNI->id << ": "
+               << *StackInt << '\n');
+
+  // Already spilled everywhere.
+  if (SVI.AllDefsAreReloads) {
+    DEBUG(dbgs() << "\tno spill needed: " << SVI);
+    ++NumOmitReloadSpill;
+    return true;
+  }
+  // We are going to spill SVI.SpillVNI immediately after its def, so clear out
+  // any later spills of the same value.
+  eliminateRedundantSpills(SibLI, SVI.SpillVNI);
+
+  MachineBasicBlock *MBB = LIS.getMBBFromIndex(SVI.SpillVNI->def);
+  MachineBasicBlock::iterator MII;
+  if (SVI.SpillVNI->isPHIDef())
+    MII = MBB->SkipPHIsAndLabels(MBB->begin());
+  else {
+    MachineInstr *DefMI = LIS.getInstructionFromIndex(SVI.SpillVNI->def);
+    assert(DefMI && "Defining instruction disappeared");
+    MII = DefMI;
+    ++MII;
+  }
+  // Insert spill without kill flag immediately after def.
+  TII.storeRegToStackSlot(*MBB, MII, SVI.SpillReg, false, StackSlot,
+                          MRI.getRegClass(SVI.SpillReg), &TRI);
+  --MII; // Point to store instruction.
+  LIS.InsertMachineInstrInMaps(MII);
+  DEBUG(dbgs() << "\thoisted: " << SVI.SpillVNI->def << '\t' << *MII);
+
+  ++NumSpills;
+  ++NumHoists;
+  return true;
+}
+
+/// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any
+/// redundant spills of this value in SLI.reg and sibling copies.
+void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
+  assert(VNI && "Missing value");
+  SmallVector<std::pair<LiveInterval*, VNInfo*>, 8> WorkList;
+  WorkList.push_back(std::make_pair(&SLI, VNI));
+  assert(StackInt && "No stack slot assigned yet.");
+
+  do {
+    LiveInterval *LI;
+    tie(LI, VNI) = WorkList.pop_back_val();
+    unsigned Reg = LI->reg;
+    DEBUG(dbgs() << "Checking redundant spills for "
+                 << VNI->id << '@' << VNI->def << " in " << *LI << '\n');
+
+    // Regs to spill are taken care of.
+    if (isRegToSpill(Reg))
+      continue;
+
+    // Add all of VNI's live range to StackInt.
+    StackInt->MergeValueInAsValue(*LI, VNI, StackInt->getValNumInfo(0));
+    DEBUG(dbgs() << "Merged to stack int: " << *StackInt << '\n');
+
+    // Find all spills and copies of VNI.
+    for (MachineRegisterInfo::use_nodbg_iterator UI = MRI.use_nodbg_begin(Reg);
+         MachineInstr *MI = UI.skipInstruction();) {
+      if (!MI->isCopy() && !MI->mayStore())
+        continue;
+      SlotIndex Idx = LIS.getInstructionIndex(MI);
+      if (LI->getVNInfoAt(Idx) != VNI)
+        continue;
+
+      // Follow sibling copies down the dominator tree.
+      if (unsigned DstReg = isFullCopyOf(MI, Reg)) {
+        if (isSibling(DstReg)) {
+           LiveInterval &DstLI = LIS.getInterval(DstReg);
+           VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot());
+           assert(DstVNI && "Missing defined value");
+           assert(DstVNI->def == Idx.getRegSlot() && "Wrong copy def slot");
+           WorkList.push_back(std::make_pair(&DstLI, DstVNI));
+        }
+        continue;
+      }
+
+      // Erase spills.
+      int FI;
+      if (Reg == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot) {
+        DEBUG(dbgs() << "Redundant spill " << Idx << '\t' << *MI);
+        // eliminateDeadDefs won't normally remove stores, so switch opcode.
+        MI->setDesc(TII.get(TargetOpcode::KILL));
+        DeadDefs.push_back(MI);
+        ++NumSpillsRemoved;
+        --NumSpills;
+      }
+    }
+  } while (!WorkList.empty());
+}
+
+
+//===----------------------------------------------------------------------===//
+//                            Rematerialization
+//===----------------------------------------------------------------------===//
+
+/// markValueUsed - Remember that VNI failed to rematerialize, so its defining
+/// instruction cannot be eliminated. See through snippet copies
+void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) {
+  SmallVector<std::pair<LiveInterval*, VNInfo*>, 8> WorkList;
+  WorkList.push_back(std::make_pair(LI, VNI));
+  do {
+    tie(LI, VNI) = WorkList.pop_back_val();
+    if (!UsedValues.insert(VNI))
+      continue;
+
+    if (VNI->isPHIDef()) {
+      MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def);
+      for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+             PE = MBB->pred_end(); PI != PE; ++PI) {
+        VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(*PI));
+        if (PVNI)
+          WorkList.push_back(std::make_pair(LI, PVNI));
+      }
+      continue;
+    }
+
+    // Follow snippet copies.
+    MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+    if (!SnippetCopies.count(MI))
+      continue;
+    LiveInterval &SnipLI = LIS.getInterval(MI->getOperand(1).getReg());
+    assert(isRegToSpill(SnipLI.reg) && "Unexpected register in copy");
+    VNInfo *SnipVNI = SnipLI.getVNInfoAt(VNI->def.getRegSlot(true));
+    assert(SnipVNI && "Snippet undefined before copy");
+    WorkList.push_back(std::make_pair(&SnipLI, SnipVNI));
+  } while (!WorkList.empty());
+}
+
+/// reMaterializeFor - Attempt to rematerialize before MI instead of reloading.
+bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
+                                     MachineBasicBlock::iterator MI) {
+  SlotIndex UseIdx = LIS.getInstructionIndex(MI).getRegSlot(true);
+  VNInfo *ParentVNI = VirtReg.getVNInfoAt(UseIdx.getBaseIndex());
+
+  if (!ParentVNI) {
+    DEBUG(dbgs() << "\tadding <undef> flags: ");
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg)
+        MO.setIsUndef();
+    }
+    DEBUG(dbgs() << UseIdx << '\t' << *MI);
+    return true;
+  }
+
+  if (SnippetCopies.count(MI))
+    return false;
+
+  // Use an OrigVNI from traceSiblingValue when ParentVNI is a sibling copy.
+  LiveRangeEdit::Remat RM(ParentVNI);
+  SibValueMap::const_iterator SibI = SibValues.find(ParentVNI);
+  if (SibI != SibValues.end())
+    RM.OrigMI = SibI->second.DefMI;
+  if (!Edit->canRematerializeAt(RM, UseIdx, false)) {
+    markValueUsed(&VirtReg, ParentVNI);
+    DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << *MI);
+    return false;
+  }
+
+  // If the instruction also writes VirtReg.reg, it had better not require the
+  // same register for uses and defs.
+  SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
+  MIBundleOperands::VirtRegInfo RI =
+    MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops);
+  if (RI.Tied) {
+    markValueUsed(&VirtReg, ParentVNI);
+    DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << *MI);
+    return false;
+  }
+
+  // Before rematerializing into a register for a single instruction, try to
+  // fold a load into the instruction. That avoids allocating a new register.
+  if (RM.OrigMI->canFoldAsLoad() &&
+      foldMemoryOperand(Ops, RM.OrigMI)) {
+    Edit->markRematerialized(RM.ParentVNI);
+    ++NumFoldedLoads;
+    return true;
+  }
+
+  // Alocate a new register for the remat.
+  LiveInterval &NewLI = Edit->createFrom(Original);
+  NewLI.markNotSpillable();
+
+  // Finally we can rematerialize OrigMI before MI.
+  SlotIndex DefIdx = Edit->rematerializeAt(*MI->getParent(), MI, NewLI.reg, RM,
+                                           TRI);
+  DEBUG(dbgs() << "\tremat:  " << DefIdx << '\t'
+               << *LIS.getInstructionFromIndex(DefIdx));
+
+  // Replace operands
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(Ops[i].second);
+    if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg) {
+      MO.setReg(NewLI.reg);
+      MO.setIsKill();
+    }
+  }
+  DEBUG(dbgs() << "\t        " << UseIdx << '\t' << *MI);
+
+  VNInfo *DefVNI = NewLI.getNextValue(DefIdx, LIS.getVNInfoAllocator());
+  NewLI.addRange(LiveRange(DefIdx, UseIdx.getRegSlot(), DefVNI));
+  DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+  ++NumRemats;
+  return true;
+}
+
+/// reMaterializeAll - Try to rematerialize as many uses as possible,
+/// and trim the live ranges after.
+void InlineSpiller::reMaterializeAll() {
+  // analyzeSiblingValues has already tested all relevant defining instructions.
+  if (!Edit->anyRematerializable(AA))
+    return;
+
+  UsedValues.clear();
+
+  // Try to remat before all uses of snippets.
+  bool anyRemat = false;
+  for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) {
+    unsigned Reg = RegsToSpill[i];
+    LiveInterval &LI = LIS.getInterval(Reg);
+    for (MachineRegisterInfo::use_nodbg_iterator
+         RI = MRI.use_nodbg_begin(Reg);
+         MachineInstr *MI = RI.skipBundle();)
+      anyRemat |= reMaterializeFor(LI, MI);
+  }
+  if (!anyRemat)
+    return;
+
+  // Remove any values that were completely rematted.
+  for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) {
+    unsigned Reg = RegsToSpill[i];
+    LiveInterval &LI = LIS.getInterval(Reg);
+    for (LiveInterval::vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+         I != E; ++I) {
+      VNInfo *VNI = *I;
+      if (VNI->isUnused() || VNI->isPHIDef() || UsedValues.count(VNI))
+        continue;
+      MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+      MI->addRegisterDead(Reg, &TRI);
+      if (!MI->allDefsAreDead())
+        continue;
+      DEBUG(dbgs() << "All defs dead: " << *MI);
+      DeadDefs.push_back(MI);
+    }
+  }
+
+  // Eliminate dead code after remat. Note that some snippet copies may be
+  // deleted here.
+  if (DeadDefs.empty())
+    return;
+  DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n");
+  Edit->eliminateDeadDefs(DeadDefs, RegsToSpill);
+
+  // Get rid of deleted and empty intervals.
+  for (unsigned i = RegsToSpill.size(); i != 0; --i) {
+    unsigned Reg = RegsToSpill[i-1];
+    if (!LIS.hasInterval(Reg)) {
+      RegsToSpill.erase(RegsToSpill.begin() + (i - 1));
+      continue;
+    }
+    LiveInterval &LI = LIS.getInterval(Reg);
+    if (!LI.empty())
+      continue;
+    Edit->eraseVirtReg(Reg);
+    RegsToSpill.erase(RegsToSpill.begin() + (i - 1));
+  }
+  DEBUG(dbgs() << RegsToSpill.size() << " registers to spill after remat.\n");
+}
+
+
+//===----------------------------------------------------------------------===//
+//                                 Spilling
+//===----------------------------------------------------------------------===//
+
+/// If MI is a load or store of StackSlot, it can be removed.
+bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
+  int FI = 0;
+  unsigned InstrReg = TII.isLoadFromStackSlot(MI, FI);
+  bool IsLoad = InstrReg;
+  if (!IsLoad)
+    InstrReg = TII.isStoreToStackSlot(MI, FI);
+
+  // We have a stack access. Is it the right register and slot?
+  if (InstrReg != Reg || FI != StackSlot)
+    return false;
+
+  DEBUG(dbgs() << "Coalescing stack access: " << *MI);
+  LIS.RemoveMachineInstrFromMaps(MI);
+  MI->eraseFromParent();
+
+  if (IsLoad) {
+    ++NumReloadsRemoved;
+    --NumReloads;
+  } else {
+    ++NumSpillsRemoved;
+    --NumSpills;
+  }
+
+  return true;
+}
+
+/// foldMemoryOperand - Try folding stack slot references in Ops into their
+/// instructions.
+///
+/// @param Ops    Operand indices from analyzeVirtReg().
+/// @param LoadMI Load instruction to use instead of stack slot when non-null.
+/// @return       True on success.
+bool InlineSpiller::
+foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
+                  MachineInstr *LoadMI) {
+  if (Ops.empty())
+    return false;
+  // Don't attempt folding in bundles.
+  MachineInstr *MI = Ops.front().first;
+  if (Ops.back().first != MI || MI->isBundled())
+    return false;
+
+  bool WasCopy = MI->isCopy();
+  unsigned ImpReg = 0;
+
+  // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
+  // operands.
+  SmallVector<unsigned, 8> FoldOps;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    unsigned Idx = Ops[i].second;
+    MachineOperand &MO = MI->getOperand(Idx);
+    if (MO.isImplicit()) {
+      ImpReg = MO.getReg();
+      continue;
+    }
+    // FIXME: Teach targets to deal with subregs.
+    if (MO.getSubReg())
+      return false;
+    // We cannot fold a load instruction into a def.
+    if (LoadMI && MO.isDef())
+      return false;
+    // Tied use operands should not be passed to foldMemoryOperand.
+    if (!MI->isRegTiedToDefOperand(Idx))
+      FoldOps.push_back(Idx);
+  }
+
+  MachineInstr *FoldMI =
+                LoadMI ? TII.foldMemoryOperand(MI, FoldOps, LoadMI)
+                       : TII.foldMemoryOperand(MI, FoldOps, StackSlot);
+  if (!FoldMI)
+    return false;
+  LIS.ReplaceMachineInstrInMaps(MI, FoldMI);
+  MI->eraseFromParent();
+
+  // TII.foldMemoryOperand may have left some implicit operands on the
+  // instruction.  Strip them.
+  if (ImpReg)
+    for (unsigned i = FoldMI->getNumOperands(); i; --i) {
+      MachineOperand &MO = FoldMI->getOperand(i - 1);
+      if (!MO.isReg() || !MO.isImplicit())
+        break;
+      if (MO.getReg() == ImpReg)
+        FoldMI->RemoveOperand(i - 1);
+    }
+
+  DEBUG(dbgs() << "\tfolded:  " << LIS.getInstructionIndex(FoldMI) << '\t'
+               << *FoldMI);
+  if (!WasCopy)
+    ++NumFolded;
+  else if (Ops.front().second == 0)
+    ++NumSpills;
+  else
+    ++NumReloads;
+  return true;
+}
+
+/// insertReload - Insert a reload of NewLI.reg before MI.
+void InlineSpiller::insertReload(LiveInterval &NewLI,
+                                 SlotIndex Idx,
+                                 MachineBasicBlock::iterator MI) {
+  MachineBasicBlock &MBB = *MI->getParent();
+  TII.loadRegFromStackSlot(MBB, MI, NewLI.reg, StackSlot,
+                           MRI.getRegClass(NewLI.reg), &TRI);
+  --MI; // Point to load instruction.
+  SlotIndex LoadIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot();
+  // Some (out-of-tree) targets have EC reload instructions.
+  if (MachineOperand *MO = MI->findRegisterDefOperand(NewLI.reg))
+    if (MO->isEarlyClobber())
+      LoadIdx = LoadIdx.getRegSlot(true);
+  DEBUG(dbgs() << "\treload:  " << LoadIdx << '\t' << *MI);
+  VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, LIS.getVNInfoAllocator());
+  NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI));
+  ++NumReloads;
+}
+
+/// insertSpill - Insert a spill of NewLI.reg after MI.
+void InlineSpiller::insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
+                                SlotIndex Idx, MachineBasicBlock::iterator MI) {
+  MachineBasicBlock &MBB = *MI->getParent();
+  TII.storeRegToStackSlot(MBB, ++MI, NewLI.reg, true, StackSlot,
+                          MRI.getRegClass(NewLI.reg), &TRI);
+  --MI; // Point to store instruction.
+  SlotIndex StoreIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot();
+  DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI);
+  VNInfo *StoreVNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
+  NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI));
+  ++NumSpills;
+}
+
+/// spillAroundUses - insert spill code around each use of Reg.
+void InlineSpiller::spillAroundUses(unsigned Reg) {
+  DEBUG(dbgs() << "spillAroundUses " << PrintReg(Reg) << '\n');
+  LiveInterval &OldLI = LIS.getInterval(Reg);
+
+  // Iterate over instructions using Reg.
+  for (MachineRegisterInfo::reg_iterator RegI = MRI.reg_begin(Reg);
+       MachineInstr *MI = RegI.skipBundle();) {
+
+    // Debug values are not allowed to affect codegen.
+    if (MI->isDebugValue()) {
+      // Modify DBG_VALUE now that the value is in a spill slot.
+      uint64_t Offset = MI->getOperand(1).getImm();
+      const MDNode *MDPtr = MI->getOperand(2).getMetadata();
+      DebugLoc DL = MI->getDebugLoc();
+      if (MachineInstr *NewDV = TII.emitFrameIndexDebugValue(MF, StackSlot,
+                                                           Offset, MDPtr, DL)) {
+        DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI);
+        MachineBasicBlock *MBB = MI->getParent();
+        MBB->insert(MBB->erase(MI), NewDV);
+      } else {
+        DEBUG(dbgs() << "Removing debug info due to spill:" << "\t" << *MI);
+        MI->eraseFromParent();
+      }
+      continue;
+    }
+
+    // Ignore copies to/from snippets. We'll delete them.
+    if (SnippetCopies.count(MI))
+      continue;
+
+    // Stack slot accesses may coalesce away.
+    if (coalesceStackAccess(MI, Reg))
+      continue;
+
+    // Analyze instruction.
+    SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
+    MIBundleOperands::VirtRegInfo RI =
+      MIBundleOperands(MI).analyzeVirtReg(Reg, &Ops);
+
+    // Find the slot index where this instruction reads and writes OldLI.
+    // This is usually the def slot, except for tied early clobbers.
+    SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+    if (VNInfo *VNI = OldLI.getVNInfoAt(Idx.getRegSlot(true)))
+      if (SlotIndex::isSameInstr(Idx, VNI->def))
+        Idx = VNI->def;
+
+    // Check for a sibling copy.
+    unsigned SibReg = isFullCopyOf(MI, Reg);
+    if (SibReg && isSibling(SibReg)) {
+      // This may actually be a copy between snippets.
+      if (isRegToSpill(SibReg)) {
+        DEBUG(dbgs() << "Found new snippet copy: " << *MI);
+        SnippetCopies.insert(MI);
+        continue;
+      }
+      if (RI.Writes) {
+        // Hoist the spill of a sib-reg copy.
+        if (hoistSpill(OldLI, MI)) {
+          // This COPY is now dead, the value is already in the stack slot.
+          MI->getOperand(0).setIsDead();
+          DeadDefs.push_back(MI);
+          continue;
+        }
+      } else {
+        // This is a reload for a sib-reg copy. Drop spills downstream.
+        LiveInterval &SibLI = LIS.getInterval(SibReg);
+        eliminateRedundantSpills(SibLI, SibLI.getVNInfoAt(Idx));
+        // The COPY will fold to a reload below.
+      }
+    }
+
+    // Attempt to fold memory ops.
+    if (foldMemoryOperand(Ops))
+      continue;
+
+    // Allocate interval around instruction.
+    // FIXME: Infer regclass from instruction alone.
+    LiveInterval &NewLI = Edit->createFrom(Reg);
+    NewLI.markNotSpillable();
+
+    if (RI.Reads)
+      insertReload(NewLI, Idx, MI);
+
+    // Rewrite instruction operands.
+    bool hasLiveDef = false;
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      MachineOperand &MO = Ops[i].first->getOperand(Ops[i].second);
+      MO.setReg(NewLI.reg);
+      if (MO.isUse()) {
+        if (!Ops[i].first->isRegTiedToDefOperand(Ops[i].second))
+          MO.setIsKill();
+      } else {
+        if (!MO.isDead())
+          hasLiveDef = true;
+      }
+    }
+    DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI);
+
+    // FIXME: Use a second vreg if instruction has no tied ops.
+    if (RI.Writes) {
+      if (hasLiveDef)
+        insertSpill(NewLI, OldLI, Idx, MI);
+      else {
+        // This instruction defines a dead value.  We don't need to spill it,
+        // but do create a live range for the dead value.
+        VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
+        NewLI.addRange(LiveRange(Idx, Idx.getDeadSlot(), VNI));
+      }
+    }
+
+    DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+  }
+}
+
+/// spillAll - Spill all registers remaining after rematerialization.
+void InlineSpiller::spillAll() {
+  // Update LiveStacks now that we are committed to spilling.
+  if (StackSlot == VirtRegMap::NO_STACK_SLOT) {
+    StackSlot = VRM.assignVirt2StackSlot(Original);
+    StackInt = &LSS.getOrCreateInterval(StackSlot, MRI.getRegClass(Original));
+    StackInt->getNextValue(SlotIndex(), LSS.getVNInfoAllocator());
+  } else
+    StackInt = &LSS.getInterval(StackSlot);
+
+  if (Original != Edit->getReg())
+    VRM.assignVirt2StackSlot(Edit->getReg(), StackSlot);
+
+  assert(StackInt->getNumValNums() == 1 && "Bad stack interval values");
+  for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i)
+    StackInt->MergeRangesInAsValue(LIS.getInterval(RegsToSpill[i]),
+                                   StackInt->getValNumInfo(0));
+  DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
+
+  // Spill around uses of all RegsToSpill.
+  for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i)
+    spillAroundUses(RegsToSpill[i]);
+
+  // Hoisted spills may cause dead code.
+  if (!DeadDefs.empty()) {
+    DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n");
+    Edit->eliminateDeadDefs(DeadDefs, RegsToSpill);
+  }
+
+  // Finally delete the SnippetCopies.
+  for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i) {
+    for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(RegsToSpill[i]);
+         MachineInstr *MI = RI.skipInstruction();) {
+      assert(SnippetCopies.count(MI) && "Remaining use wasn't a snippet copy");
+      // FIXME: Do this with a LiveRangeEdit callback.
+      LIS.RemoveMachineInstrFromMaps(MI);
+      MI->eraseFromParent();
+    }
+  }
+
+  // Delete all spilled registers.
+  for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i)
+    Edit->eraseVirtReg(RegsToSpill[i]);
+}
+
+void InlineSpiller::spill(LiveRangeEdit &edit) {
+  ++NumSpilledRanges;
+  Edit = &edit;
+  assert(!TargetRegisterInfo::isStackSlot(edit.getReg())
+         && "Trying to spill a stack slot.");
+  // Share a stack slot among all descendants of Original.
+  Original = VRM.getOriginal(edit.getReg());
+  StackSlot = VRM.getStackSlot(Original);
+  StackInt = 0;
+
+  DEBUG(dbgs() << "Inline spilling "
+               << MRI.getRegClass(edit.getReg())->getName()
+               << ':' << PrintReg(edit.getReg()) << ' ' << edit.getParent()
+               << "\nFrom original " << LIS.getInterval(Original) << '\n');
+  assert(edit.getParent().isSpillable() &&
+         "Attempting to spill already spilled value.");
+  assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
+
+  collectRegsToSpill();
+  analyzeSiblingValues();
+  reMaterializeAll();
+
+  // Remat may handle everything.
+  if (!RegsToSpill.empty())
+    spillAll();
+
+  Edit->calculateRegClassAndHint(MF, Loops);
+}
diff --git a/contrib/llvm/lib/CodeGen/InterferenceCache.cpp b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp
new file mode 100644
index 000000000000..a8e711e33bdf
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp
@@ -0,0 +1,231 @@
+//===-- InterferenceCache.cpp - Caching per-block interference ---------*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InterferenceCache remembers per-block interference in LiveIntervalUnions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "InterferenceCache.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+// Static member used for null interference cursors.
+InterferenceCache::BlockInterference InterferenceCache::Cursor::NoInterference;
+
+void InterferenceCache::init(MachineFunction *mf,
+                             LiveIntervalUnion *liuarray,
+                             SlotIndexes *indexes,
+                             LiveIntervals *lis,
+                             const TargetRegisterInfo *tri) {
+  MF = mf;
+  LIUArray = liuarray;
+  TRI = tri;
+  PhysRegEntries.assign(TRI->getNumRegs(), 0);
+  for (unsigned i = 0; i != CacheEntries; ++i)
+    Entries[i].clear(mf, indexes, lis);
+}
+
+InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) {
+  unsigned E = PhysRegEntries[PhysReg];
+  if (E < CacheEntries && Entries[E].getPhysReg() == PhysReg) {
+    if (!Entries[E].valid(LIUArray, TRI))
+      Entries[E].revalidate(LIUArray, TRI);
+    return &Entries[E];
+  }
+  // No valid entry exists, pick the next round-robin entry.
+  E = RoundRobin;
+  if (++RoundRobin == CacheEntries)
+    RoundRobin = 0;
+  for (unsigned i = 0; i != CacheEntries; ++i) {
+    // Skip entries that are in use.
+    if (Entries[E].hasRefs()) {
+      if (++E == CacheEntries)
+        E = 0;
+      continue;
+    }
+    Entries[E].reset(PhysReg, LIUArray, TRI, MF);
+    PhysRegEntries[PhysReg] = E;
+    return &Entries[E];
+  }
+  llvm_unreachable("Ran out of interference cache entries.");
+}
+
+/// revalidate - LIU contents have changed, update tags.
+void InterferenceCache::Entry::revalidate(LiveIntervalUnion *LIUArray,
+                                          const TargetRegisterInfo *TRI) {
+  // Invalidate all block entries.
+  ++Tag;
+  // Invalidate all iterators.
+  PrevPos = SlotIndex();
+  unsigned i = 0;
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i)
+    RegUnits[i].VirtTag = LIUArray[*Units].getTag();
+}
+
+void InterferenceCache::Entry::reset(unsigned physReg,
+                                     LiveIntervalUnion *LIUArray,
+                                     const TargetRegisterInfo *TRI,
+                                     const MachineFunction *MF) {
+  assert(!hasRefs() && "Cannot reset cache entry with references");
+  // LIU's changed, invalidate cache.
+  ++Tag;
+  PhysReg = physReg;
+  Blocks.resize(MF->getNumBlockIDs());
+
+  // Reset iterators.
+  PrevPos = SlotIndex();
+  RegUnits.clear();
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    RegUnits.push_back(LIUArray[*Units]);
+    RegUnits.back().Fixed = &LIS->getRegUnit(*Units);
+  }
+}
+
+bool InterferenceCache::Entry::valid(LiveIntervalUnion *LIUArray,
+                                     const TargetRegisterInfo *TRI) {
+  unsigned i = 0, e = RegUnits.size();
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i) {
+    if (i == e)
+      return false;
+    if (LIUArray[*Units].changedSince(RegUnits[i].VirtTag))
+      return false;
+  }
+  return i == e;
+}
+
+void InterferenceCache::Entry::update(unsigned MBBNum) {
+  SlotIndex Start, Stop;
+  tie(Start, Stop) = Indexes->getMBBRange(MBBNum);
+
+  // Use advanceTo only when possible.
+  if (PrevPos != Start) {
+    if (!PrevPos.isValid() || Start < PrevPos) {
+      for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+        RegUnitInfo &RUI = RegUnits[i];
+        RUI.VirtI.find(Start);
+        RUI.FixedI = RUI.Fixed->find(Start);
+      }
+    } else {
+      for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+        RegUnitInfo &RUI = RegUnits[i];
+        RUI.VirtI.advanceTo(Start);
+        if (RUI.FixedI != RUI.Fixed->end())
+          RUI.FixedI = RUI.Fixed->advanceTo(RUI.FixedI, Start);
+      }
+    }
+    PrevPos = Start;
+  }
+
+  MachineFunction::const_iterator MFI = MF->getBlockNumbered(MBBNum);
+  BlockInterference *BI = &Blocks[MBBNum];
+  ArrayRef<SlotIndex> RegMaskSlots;
+  ArrayRef<const uint32_t*> RegMaskBits;
+  for (;;) {
+    BI->Tag = Tag;
+    BI->First = BI->Last = SlotIndex();
+
+    // Check for first interference from virtregs.
+    for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+      LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI;
+      if (!I.valid())
+        continue;
+      SlotIndex StartI = I.start();
+      if (StartI >= Stop)
+        continue;
+      if (!BI->First.isValid() || StartI < BI->First)
+        BI->First = StartI;
+    }
+
+    // Same thing for fixed interference.
+    for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+      LiveInterval::const_iterator I = RegUnits[i].FixedI;
+      LiveInterval::const_iterator E = RegUnits[i].Fixed->end();
+      if (I == E)
+        continue;
+      SlotIndex StartI = I->start;
+      if (StartI >= Stop)
+        continue;
+      if (!BI->First.isValid() || StartI < BI->First)
+        BI->First = StartI;
+    }
+
+    // Also check for register mask interference.
+    RegMaskSlots = LIS->getRegMaskSlotsInBlock(MBBNum);
+    RegMaskBits = LIS->getRegMaskBitsInBlock(MBBNum);
+    SlotIndex Limit = BI->First.isValid() ? BI->First : Stop;
+    for (unsigned i = 0, e = RegMaskSlots.size();
+         i != e && RegMaskSlots[i] < Limit; ++i)
+      if (MachineOperand::clobbersPhysReg(RegMaskBits[i], PhysReg)) {
+        // Register mask i clobbers PhysReg before the LIU interference.
+        BI->First = RegMaskSlots[i];
+        break;
+      }
+
+    PrevPos = Stop;
+    if (BI->First.isValid())
+      break;
+
+    // No interference in this block? Go ahead and precompute the next block.
+    if (++MFI == MF->end())
+      return;
+    MBBNum = MFI->getNumber();
+    BI = &Blocks[MBBNum];
+    if (BI->Tag == Tag)
+      return;
+    tie(Start, Stop) = Indexes->getMBBRange(MBBNum);
+  }
+
+  // Check for last interference in block.
+  for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+    LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI;
+    if (!I.valid() || I.start() >= Stop)
+      continue;
+    I.advanceTo(Stop);
+    bool Backup = !I.valid() || I.start() >= Stop;
+    if (Backup)
+      --I;
+    SlotIndex StopI = I.stop();
+    if (!BI->Last.isValid() || StopI > BI->Last)
+      BI->Last = StopI;
+    if (Backup)
+      ++I;
+  }
+
+  // Fixed interference.
+  for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+    LiveInterval::iterator &I = RegUnits[i].FixedI;
+    LiveInterval *LI = RegUnits[i].Fixed;
+    if (I == LI->end() || I->start >= Stop)
+      continue;
+    I = LI->advanceTo(I, Stop);
+    bool Backup = I == LI->end() || I->start >= Stop;
+    if (Backup)
+      --I;
+    SlotIndex StopI = I->end;
+    if (!BI->Last.isValid() || StopI > BI->Last)
+      BI->Last = StopI;
+    if (Backup)
+      ++I;
+  }
+
+  // Also check for register mask interference.
+  SlotIndex Limit = BI->Last.isValid() ? BI->Last : Start;
+  for (unsigned i = RegMaskSlots.size();
+       i && RegMaskSlots[i-1].getDeadSlot() > Limit; --i)
+    if (MachineOperand::clobbersPhysReg(RegMaskBits[i-1], PhysReg)) {
+      // Register mask i-1 clobbers PhysReg after the LIU interference.
+      // Model the regmask clobber as a dead def.
+      BI->Last = RegMaskSlots[i-1].getDeadSlot();
+      break;
+    }
+}
diff --git a/contrib/llvm/lib/CodeGen/InterferenceCache.h b/contrib/llvm/lib/CodeGen/InterferenceCache.h
new file mode 100644
index 000000000000..c02fb9a1ee24
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/InterferenceCache.h
@@ -0,0 +1,228 @@
+//===-- InterferenceCache.h - Caching per-block interference ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InterferenceCache remembers per-block interference from LiveIntervalUnions,
+// fixed RegUnit interference, and register masks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_INTERFERENCECACHE
+#define LLVM_CODEGEN_INTERFERENCECACHE
+
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+
+namespace llvm {
+
+class LiveIntervals;
+
+class InterferenceCache {
+  const TargetRegisterInfo *TRI;
+  LiveIntervalUnion *LIUArray;
+  MachineFunction *MF;
+
+  /// BlockInterference - information about the interference in a single basic
+  /// block.
+  struct BlockInterference {
+    BlockInterference() : Tag(0) {}
+    unsigned Tag;
+    SlotIndex First;
+    SlotIndex Last;
+  };
+
+  /// Entry - A cache entry containing interference information for all aliases
+  /// of PhysReg in all basic blocks.
+  class Entry {
+    /// PhysReg - The register currently represented.
+    unsigned PhysReg;
+
+    /// Tag - Cache tag is changed when any of the underlying LiveIntervalUnions
+    /// change.
+    unsigned Tag;
+
+    /// RefCount - The total number of Cursor instances referring to this Entry.
+    unsigned RefCount;
+
+    /// MF - The current function.
+    MachineFunction *MF;
+
+    /// Indexes - Mapping block numbers to SlotIndex ranges.
+    SlotIndexes *Indexes;
+
+    /// LIS - Used for accessing register mask interference maps.
+    LiveIntervals *LIS;
+
+    /// PrevPos - The previous position the iterators were moved to.
+    SlotIndex PrevPos;
+
+    /// RegUnitInfo - Information tracked about each RegUnit in PhysReg.
+    /// When PrevPos is set, the iterators are valid as if advanceTo(PrevPos)
+    /// had just been called.
+    struct RegUnitInfo {
+      /// Iterator pointing into the LiveIntervalUnion containing virtual
+      /// register interference.
+      LiveIntervalUnion::SegmentIter VirtI;
+
+      /// Tag of the LIU last time we looked.
+      unsigned VirtTag;
+
+      /// Fixed interference in RegUnit.
+      LiveInterval *Fixed;
+
+      /// Iterator pointing into the fixed RegUnit interference.
+      LiveInterval::iterator FixedI;
+
+      RegUnitInfo(LiveIntervalUnion &LIU) : VirtTag(LIU.getTag()), Fixed(0) {
+        VirtI.setMap(LIU.getMap());
+      }
+    };
+
+    /// Info for each RegUnit in PhysReg. It is very rare ofr a PHysReg to have
+    /// more than 4 RegUnits.
+    SmallVector<RegUnitInfo, 4> RegUnits;
+
+    /// Blocks - Interference for each block in the function.
+    SmallVector<BlockInterference, 8> Blocks;
+
+    /// update - Recompute Blocks[MBBNum]
+    void update(unsigned MBBNum);
+
+  public:
+    Entry() : PhysReg(0), Tag(0), RefCount(0), Indexes(0), LIS(0) {}
+
+    void clear(MachineFunction *mf, SlotIndexes *indexes, LiveIntervals *lis) {
+      assert(!hasRefs() && "Cannot clear cache entry with references");
+      PhysReg = 0;
+      MF = mf;
+      Indexes = indexes;
+      LIS = lis;
+    }
+
+    unsigned getPhysReg() const { return PhysReg; }
+
+    void addRef(int Delta) { RefCount += Delta; }
+
+    bool hasRefs() const { return RefCount > 0; }
+
+    void revalidate(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
+
+    /// valid - Return true if this is a valid entry for physReg.
+    bool valid(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
+
+    /// reset - Initialize entry to represent physReg's aliases.
+    void reset(unsigned physReg,
+               LiveIntervalUnion *LIUArray,
+               const TargetRegisterInfo *TRI,
+               const MachineFunction *MF);
+
+    /// get - Return an up to date BlockInterference.
+    BlockInterference *get(unsigned MBBNum) {
+      if (Blocks[MBBNum].Tag != Tag)
+        update(MBBNum);
+      return &Blocks[MBBNum];
+    }
+  };
+
+  // We don't keep a cache entry for every physical register, that would use too
+  // much memory. Instead, a fixed number of cache entries are used in a round-
+  // robin manner.
+  enum { CacheEntries = 32 };
+
+  // Point to an entry for each physreg. The entry pointed to may not be up to
+  // date, and it may have been reused for a different physreg.
+  SmallVector<unsigned char, 2> PhysRegEntries;
+
+  // Next round-robin entry to be picked.
+  unsigned RoundRobin;
+
+  // The actual cache entries.
+  Entry Entries[CacheEntries];
+
+  // get - Get a valid entry for PhysReg.
+  Entry *get(unsigned PhysReg);
+
+public:
+  InterferenceCache() : TRI(0), LIUArray(0), MF(0), RoundRobin(0) {}
+
+  /// init - Prepare cache for a new function.
+  void init(MachineFunction*, LiveIntervalUnion*, SlotIndexes*, LiveIntervals*,
+            const TargetRegisterInfo *);
+
+  /// getMaxCursors - Return the maximum number of concurrent cursors that can
+  /// be supported.
+  unsigned getMaxCursors() const { return CacheEntries; }
+
+  /// Cursor - The primary query interface for the block interference cache.
+  class Cursor {
+    Entry *CacheEntry;
+    BlockInterference *Current;
+    static BlockInterference NoInterference;
+
+    void setEntry(Entry *E) {
+      Current = 0;
+      // Update reference counts. Nothing happens when RefCount reaches 0, so
+      // we don't have to check for E == CacheEntry etc.
+      if (CacheEntry)
+        CacheEntry->addRef(-1);
+      CacheEntry = E;
+      if (CacheEntry)
+        CacheEntry->addRef(+1);
+    }
+
+  public:
+    /// Cursor - Create a dangling cursor.
+    Cursor() : CacheEntry(0), Current(0) {}
+    ~Cursor() { setEntry(0); }
+
+    Cursor(const Cursor &O) : CacheEntry(0), Current(0) {
+      setEntry(O.CacheEntry);
+    }
+
+    Cursor &operator=(const Cursor &O) {
+      setEntry(O.CacheEntry);
+      return *this;
+    }
+
+    /// setPhysReg - Point this cursor to PhysReg's interference.
+    void setPhysReg(InterferenceCache &Cache, unsigned PhysReg) {
+      // Release reference before getting a new one. That guarantees we can
+      // actually have CacheEntries live cursors.
+      setEntry(0);
+      if (PhysReg)
+        setEntry(Cache.get(PhysReg));
+    }
+
+    /// moveTo - Move cursor to basic block MBBNum.
+    void moveToBlock(unsigned MBBNum) {
+      Current = CacheEntry ? CacheEntry->get(MBBNum) : &NoInterference;
+    }
+
+    /// hasInterference - Return true if the current block has any interference.
+    bool hasInterference() {
+      return Current->First.isValid();
+    }
+
+    /// first - Return the starting index of the first interfering range in the
+    /// current block.
+    SlotIndex first() {
+      return Current->First;
+    }
+
+    /// last - Return the ending index of the last interfering range in the
+    /// current block.
+    SlotIndex last() {
+      return Current->Last;
+    }
+  };
+
+  friend class Cursor;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
new file mode 100644
index 000000000000..07f0ccf52f8c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -0,0 +1,565 @@
+//===-- IntrinsicLowering.cpp - Intrinsic Lowering default implementation -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IntrinsicLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+template <class ArgIt>
+static void EnsureFunctionExists(Module &M, const char *Name,
+                                 ArgIt ArgBegin, ArgIt ArgEnd,
+                                 Type *RetTy) {
+  // Insert a correctly-typed definition now.
+  std::vector<Type *> ParamTys;
+  for (ArgIt I = ArgBegin; I != ArgEnd; ++I)
+    ParamTys.push_back(I->getType());
+  M.getOrInsertFunction(Name, FunctionType::get(RetTy, ParamTys, false));
+}
+
+static void EnsureFPIntrinsicsExist(Module &M, Function *Fn,
+                                    const char *FName,
+                                    const char *DName, const char *LDName) {
+  // Insert definitions for all the floating point types.
+  switch((int)Fn->arg_begin()->getType()->getTypeID()) {
+  case Type::FloatTyID:
+    EnsureFunctionExists(M, FName, Fn->arg_begin(), Fn->arg_end(),
+                         Type::getFloatTy(M.getContext()));
+    break;
+  case Type::DoubleTyID:
+    EnsureFunctionExists(M, DName, Fn->arg_begin(), Fn->arg_end(),
+                         Type::getDoubleTy(M.getContext()));
+    break;
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+    EnsureFunctionExists(M, LDName, Fn->arg_begin(), Fn->arg_end(),
+                         Fn->arg_begin()->getType());
+    break;
+  }
+}
+
+/// ReplaceCallWith - This function is used when we want to lower an intrinsic
+/// call to a call of an external function.  This handles hard cases such as
+/// when there was already a prototype for the external function, and if that
+/// prototype doesn't match the arguments we expect to pass in.
+template <class ArgIt>
+static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI,
+                                 ArgIt ArgBegin, ArgIt ArgEnd,
+                                 Type *RetTy) {
+  // If we haven't already looked up this function, check to see if the
+  // program already contains a function with this name.
+  Module *M = CI->getParent()->getParent()->getParent();
+  // Get or insert the definition now.
+  std::vector<Type *> ParamTys;
+  for (ArgIt I = ArgBegin; I != ArgEnd; ++I)
+    ParamTys.push_back((*I)->getType());
+  Constant* FCache = M->getOrInsertFunction(NewFn,
+                                  FunctionType::get(RetTy, ParamTys, false));
+
+  IRBuilder<> Builder(CI->getParent(), CI);
+  SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
+  CallInst *NewCI = Builder.CreateCall(FCache, Args);
+  NewCI->setName(CI->getName());
+  if (!CI->use_empty())
+    CI->replaceAllUsesWith(NewCI);
+  return NewCI;
+}
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                         !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+void IntrinsicLowering::AddPrototypes(Module &M) {
+  LLVMContext &Context = M.getContext();
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (I->isDeclaration() && !I->use_empty())
+      switch (I->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::setjmp:
+        EnsureFunctionExists(M, "setjmp", I->arg_begin(), I->arg_end(),
+                             Type::getInt32Ty(M.getContext()));
+        break;
+      case Intrinsic::longjmp:
+        EnsureFunctionExists(M, "longjmp", I->arg_begin(), I->arg_end(),
+                             Type::getVoidTy(M.getContext()));
+        break;
+      case Intrinsic::siglongjmp:
+        EnsureFunctionExists(M, "abort", I->arg_end(), I->arg_end(),
+                             Type::getVoidTy(M.getContext()));
+        break;
+      case Intrinsic::memcpy:
+        M.getOrInsertFunction("memcpy",
+          Type::getInt8PtrTy(Context),
+                              Type::getInt8PtrTy(Context), 
+                              Type::getInt8PtrTy(Context), 
+                              TD.getIntPtrType(Context), (Type *)0);
+        break;
+      case Intrinsic::memmove:
+        M.getOrInsertFunction("memmove",
+          Type::getInt8PtrTy(Context),
+                              Type::getInt8PtrTy(Context), 
+                              Type::getInt8PtrTy(Context), 
+                              TD.getIntPtrType(Context), (Type *)0);
+        break;
+      case Intrinsic::memset:
+        M.getOrInsertFunction("memset",
+          Type::getInt8PtrTy(Context),
+                              Type::getInt8PtrTy(Context), 
+                              Type::getInt32Ty(M.getContext()), 
+                              TD.getIntPtrType(Context), (Type *)0);
+        break;
+      case Intrinsic::sqrt:
+        EnsureFPIntrinsicsExist(M, I, "sqrtf", "sqrt", "sqrtl");
+        break;
+      case Intrinsic::sin:
+        EnsureFPIntrinsicsExist(M, I, "sinf", "sin", "sinl");
+        break;
+      case Intrinsic::cos:
+        EnsureFPIntrinsicsExist(M, I, "cosf", "cos", "cosl");
+        break;
+      case Intrinsic::pow:
+        EnsureFPIntrinsicsExist(M, I, "powf", "pow", "powl");
+        break;
+      case Intrinsic::log:
+        EnsureFPIntrinsicsExist(M, I, "logf", "log", "logl");
+        break;
+      case Intrinsic::log2:
+        EnsureFPIntrinsicsExist(M, I, "log2f", "log2", "log2l");
+        break;
+      case Intrinsic::log10:
+        EnsureFPIntrinsicsExist(M, I, "log10f", "log10", "log10l");
+        break;
+      case Intrinsic::exp:
+        EnsureFPIntrinsicsExist(M, I, "expf", "exp", "expl");
+        break;
+      case Intrinsic::exp2:
+        EnsureFPIntrinsicsExist(M, I, "exp2f", "exp2", "exp2l");
+        break;
+      }
+}
+
+/// LowerBSWAP - Emit the code to lower bswap of V before the specified
+/// instruction IP.
+static Value *LowerBSWAP(LLVMContext &Context, Value *V, Instruction *IP) {
+  assert(V->getType()->isIntegerTy() && "Can't bswap a non-integer type!");
+
+  unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+  
+  IRBuilder<> Builder(IP->getParent(), IP);
+
+  switch(BitSize) {
+  default: llvm_unreachable("Unhandled type size of value to byteswap!");
+  case 16: {
+    Value *Tmp1 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+                                    "bswap.2");
+    Value *Tmp2 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+                                     "bswap.1");
+    V = Builder.CreateOr(Tmp1, Tmp2, "bswap.i16");
+    break;
+  }
+  case 32: {
+    Value *Tmp4 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 24),
+                                    "bswap.4");
+    Value *Tmp3 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+                                    "bswap.3");
+    Value *Tmp2 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+                                     "bswap.2");
+    Value *Tmp1 = Builder.CreateLShr(V,ConstantInt::get(V->getType(), 24),
+                                     "bswap.1");
+    Tmp3 = Builder.CreateAnd(Tmp3,
+                         ConstantInt::get(Type::getInt32Ty(Context), 0xFF0000),
+                             "bswap.and3");
+    Tmp2 = Builder.CreateAnd(Tmp2,
+                           ConstantInt::get(Type::getInt32Ty(Context), 0xFF00),
+                             "bswap.and2");
+    Tmp4 = Builder.CreateOr(Tmp4, Tmp3, "bswap.or1");
+    Tmp2 = Builder.CreateOr(Tmp2, Tmp1, "bswap.or2");
+    V = Builder.CreateOr(Tmp4, Tmp2, "bswap.i32");
+    break;
+  }
+  case 64: {
+    Value *Tmp8 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 56),
+                                    "bswap.8");
+    Value *Tmp7 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 40),
+                                    "bswap.7");
+    Value *Tmp6 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 24),
+                                    "bswap.6");
+    Value *Tmp5 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+                                    "bswap.5");
+    Value* Tmp4 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+                                     "bswap.4");
+    Value* Tmp3 = Builder.CreateLShr(V, 
+                                     ConstantInt::get(V->getType(), 24),
+                                     "bswap.3");
+    Value* Tmp2 = Builder.CreateLShr(V, 
+                                     ConstantInt::get(V->getType(), 40),
+                                     "bswap.2");
+    Value* Tmp1 = Builder.CreateLShr(V, 
+                                     ConstantInt::get(V->getType(), 56),
+                                     "bswap.1");
+    Tmp7 = Builder.CreateAnd(Tmp7,
+                             ConstantInt::get(Type::getInt64Ty(Context),
+                                              0xFF000000000000ULL),
+                             "bswap.and7");
+    Tmp6 = Builder.CreateAnd(Tmp6,
+                             ConstantInt::get(Type::getInt64Ty(Context),
+                                              0xFF0000000000ULL),
+                             "bswap.and6");
+    Tmp5 = Builder.CreateAnd(Tmp5,
+                        ConstantInt::get(Type::getInt64Ty(Context),
+                             0xFF00000000ULL),
+                             "bswap.and5");
+    Tmp4 = Builder.CreateAnd(Tmp4,
+                        ConstantInt::get(Type::getInt64Ty(Context),
+                             0xFF000000ULL),
+                             "bswap.and4");
+    Tmp3 = Builder.CreateAnd(Tmp3,
+                             ConstantInt::get(Type::getInt64Ty(Context),
+                             0xFF0000ULL),
+                             "bswap.and3");
+    Tmp2 = Builder.CreateAnd(Tmp2,
+                             ConstantInt::get(Type::getInt64Ty(Context),
+                             0xFF00ULL),
+                             "bswap.and2");
+    Tmp8 = Builder.CreateOr(Tmp8, Tmp7, "bswap.or1");
+    Tmp6 = Builder.CreateOr(Tmp6, Tmp5, "bswap.or2");
+    Tmp4 = Builder.CreateOr(Tmp4, Tmp3, "bswap.or3");
+    Tmp2 = Builder.CreateOr(Tmp2, Tmp1, "bswap.or4");
+    Tmp8 = Builder.CreateOr(Tmp8, Tmp6, "bswap.or5");
+    Tmp4 = Builder.CreateOr(Tmp4, Tmp2, "bswap.or6");
+    V = Builder.CreateOr(Tmp8, Tmp4, "bswap.i64");
+    break;
+  }
+  }
+  return V;
+}
+
+/// LowerCTPOP - Emit the code to lower ctpop of V before the specified
+/// instruction IP.
+static Value *LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP) {
+  assert(V->getType()->isIntegerTy() && "Can't ctpop a non-integer type!");
+
+  static const uint64_t MaskValues[6] = {
+    0x5555555555555555ULL, 0x3333333333333333ULL,
+    0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
+    0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
+  };
+
+  IRBuilder<> Builder(IP->getParent(), IP);
+
+  unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+  unsigned WordSize = (BitSize + 63) / 64;
+  Value *Count = ConstantInt::get(V->getType(), 0);
+
+  for (unsigned n = 0; n < WordSize; ++n) {
+    Value *PartValue = V;
+    for (unsigned i = 1, ct = 0; i < (BitSize>64 ? 64 : BitSize); 
+         i <<= 1, ++ct) {
+      Value *MaskCst = ConstantInt::get(V->getType(), MaskValues[ct]);
+      Value *LHS = Builder.CreateAnd(PartValue, MaskCst, "cppop.and1");
+      Value *VShift = Builder.CreateLShr(PartValue,
+                                        ConstantInt::get(V->getType(), i),
+                                         "ctpop.sh");
+      Value *RHS = Builder.CreateAnd(VShift, MaskCst, "cppop.and2");
+      PartValue = Builder.CreateAdd(LHS, RHS, "ctpop.step");
+    }
+    Count = Builder.CreateAdd(PartValue, Count, "ctpop.part");
+    if (BitSize > 64) {
+      V = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 64),
+                             "ctpop.part.sh");
+      BitSize -= 64;
+    }
+  }
+
+  return Count;
+}
+
+/// LowerCTLZ - Emit the code to lower ctlz of V before the specified
+/// instruction IP.
+static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) {
+
+  IRBuilder<> Builder(IP->getParent(), IP);
+
+  unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+  for (unsigned i = 1; i < BitSize; i <<= 1) {
+    Value *ShVal = ConstantInt::get(V->getType(), i);
+    ShVal = Builder.CreateLShr(V, ShVal, "ctlz.sh");
+    V = Builder.CreateOr(V, ShVal, "ctlz.step");
+  }
+
+  V = Builder.CreateNot(V);
+  return LowerCTPOP(Context, V, IP);
+}
+
+static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname,
+                                       const char *Dname,
+                                       const char *LDname) {
+  CallSite CS(CI);
+  switch (CI->getArgOperand(0)->getType()->getTypeID()) {
+  default: llvm_unreachable("Invalid type in intrinsic");
+  case Type::FloatTyID:
+    ReplaceCallWith(Fname, CI, CS.arg_begin(), CS.arg_end(),
+                  Type::getFloatTy(CI->getContext()));
+    break;
+  case Type::DoubleTyID:
+    ReplaceCallWith(Dname, CI, CS.arg_begin(), CS.arg_end(),
+                  Type::getDoubleTy(CI->getContext()));
+    break;
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+    ReplaceCallWith(LDname, CI, CS.arg_begin(), CS.arg_end(),
+                  CI->getArgOperand(0)->getType());
+    break;
+  }
+}
+
+void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
+  IRBuilder<> Builder(CI->getParent(), CI);
+  LLVMContext &Context = CI->getContext();
+
+  const Function *Callee = CI->getCalledFunction();
+  assert(Callee && "Cannot lower an indirect call!");
+
+  CallSite CS(CI);
+  switch (Callee->getIntrinsicID()) {
+  case Intrinsic::not_intrinsic:
+    report_fatal_error("Cannot lower a call to a non-intrinsic function '"+
+                      Callee->getName() + "'!");
+  default:
+    report_fatal_error("Code generator does not support intrinsic function '"+
+                      Callee->getName()+"'!");
+
+  case Intrinsic::expect: {
+    // Just replace __builtin_expect(exp, c) with EXP.
+    Value *V = CI->getArgOperand(0);
+    CI->replaceAllUsesWith(V);
+    break;
+  }
+
+    // The setjmp/longjmp intrinsics should only exist in the code if it was
+    // never optimized (ie, right out of the CFE), or if it has been hacked on
+    // by the lowerinvoke pass.  In both cases, the right thing to do is to
+    // convert the call to an explicit setjmp or longjmp call.
+  case Intrinsic::setjmp: {
+    Value *V = ReplaceCallWith("setjmp", CI, CS.arg_begin(), CS.arg_end(),
+                               Type::getInt32Ty(Context));
+    if (!CI->getType()->isVoidTy())
+      CI->replaceAllUsesWith(V);
+    break;
+  }
+  case Intrinsic::sigsetjmp:
+     if (!CI->getType()->isVoidTy())
+       CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+     break;
+
+  case Intrinsic::longjmp: {
+    ReplaceCallWith("longjmp", CI, CS.arg_begin(), CS.arg_end(),
+                    Type::getVoidTy(Context));
+    break;
+  }
+
+  case Intrinsic::siglongjmp: {
+    // Insert the call to abort
+    ReplaceCallWith("abort", CI, CS.arg_end(), CS.arg_end(), 
+                    Type::getVoidTy(Context));
+    break;
+  }
+  case Intrinsic::ctpop:
+    CI->replaceAllUsesWith(LowerCTPOP(Context, CI->getArgOperand(0), CI));
+    break;
+
+  case Intrinsic::bswap:
+    CI->replaceAllUsesWith(LowerBSWAP(Context, CI->getArgOperand(0), CI));
+    break;
+    
+  case Intrinsic::ctlz:
+    CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getArgOperand(0), CI));
+    break;
+
+  case Intrinsic::cttz: {
+    // cttz(x) -> ctpop(~X & (X-1))
+    Value *Src = CI->getArgOperand(0);
+    Value *NotSrc = Builder.CreateNot(Src);
+    NotSrc->setName(Src->getName() + ".not");
+    Value *SrcM1 = ConstantInt::get(Src->getType(), 1);
+    SrcM1 = Builder.CreateSub(Src, SrcM1);
+    Src = LowerCTPOP(Context, Builder.CreateAnd(NotSrc, SrcM1), CI);
+    CI->replaceAllUsesWith(Src);
+    break;
+  }
+
+  case Intrinsic::stacksave:
+  case Intrinsic::stackrestore: {
+    if (!Warned)
+      errs() << "WARNING: this target does not support the llvm.stack"
+             << (Callee->getIntrinsicID() == Intrinsic::stacksave ?
+               "save" : "restore") << " intrinsic.\n";
+    Warned = true;
+    if (Callee->getIntrinsicID() == Intrinsic::stacksave)
+      CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+    break;
+  }
+    
+  case Intrinsic::returnaddress:
+  case Intrinsic::frameaddress:
+    errs() << "WARNING: this target does not support the llvm."
+           << (Callee->getIntrinsicID() == Intrinsic::returnaddress ?
+             "return" : "frame") << "address intrinsic.\n";
+    CI->replaceAllUsesWith(ConstantPointerNull::get(
+                                            cast<PointerType>(CI->getType())));
+    break;
+
+  case Intrinsic::prefetch:
+    break;    // Simply strip out prefetches on unsupported architectures
+
+  case Intrinsic::pcmarker:
+    break;    // Simply strip out pcmarker on unsupported architectures
+  case Intrinsic::readcyclecounter: {
+    errs() << "WARNING: this target does not support the llvm.readcyclecoun"
+           << "ter intrinsic.  It is being lowered to a constant 0\n";
+    CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0));
+    break;
+  }
+
+  case Intrinsic::dbg_declare:
+    break;    // Simply strip out debugging intrinsics
+
+  case Intrinsic::eh_typeid_for:
+    // Return something different to eh_selector.
+    CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1));
+    break;
+
+  case Intrinsic::var_annotation:
+    break;   // Strip out annotate intrinsic
+    
+  case Intrinsic::memcpy: {
+    Type *IntPtr = TD.getIntPtrType(Context);
+    Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                        /* isSigned */ false);
+    Value *Ops[3];
+    Ops[0] = CI->getArgOperand(0);
+    Ops[1] = CI->getArgOperand(1);
+    Ops[2] = Size;
+    ReplaceCallWith("memcpy", CI, Ops, Ops+3, CI->getArgOperand(0)->getType());
+    break;
+  }
+  case Intrinsic::memmove: {
+    Type *IntPtr = TD.getIntPtrType(Context);
+    Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                        /* isSigned */ false);
+    Value *Ops[3];
+    Ops[0] = CI->getArgOperand(0);
+    Ops[1] = CI->getArgOperand(1);
+    Ops[2] = Size;
+    ReplaceCallWith("memmove", CI, Ops, Ops+3, CI->getArgOperand(0)->getType());
+    break;
+  }
+  case Intrinsic::memset: {
+    Type *IntPtr = TD.getIntPtrType(Context);
+    Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+                                        /* isSigned */ false);
+    Value *Ops[3];
+    Ops[0] = CI->getArgOperand(0);
+    // Extend the amount to i32.
+    Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1),
+                                   Type::getInt32Ty(Context),
+                                   /* isSigned */ false);
+    Ops[2] = Size;
+    ReplaceCallWith("memset", CI, Ops, Ops+3, CI->getArgOperand(0)->getType());
+    break;
+  }
+  case Intrinsic::sqrt: {
+    ReplaceFPIntrinsicWithCall(CI, "sqrtf", "sqrt", "sqrtl");
+    break;
+  }
+  case Intrinsic::log: {
+    ReplaceFPIntrinsicWithCall(CI, "logf", "log", "logl");
+    break;
+  }
+  case Intrinsic::log2: {
+    ReplaceFPIntrinsicWithCall(CI, "log2f", "log2", "log2l");
+    break;
+  }
+  case Intrinsic::log10: {
+    ReplaceFPIntrinsicWithCall(CI, "log10f", "log10", "log10l");
+    break;
+  }
+  case Intrinsic::exp: {
+    ReplaceFPIntrinsicWithCall(CI, "expf", "exp", "expl");
+    break;
+  }
+  case Intrinsic::exp2: {
+    ReplaceFPIntrinsicWithCall(CI, "exp2f", "exp2", "exp2l");
+    break;
+  }
+  case Intrinsic::pow: {
+    ReplaceFPIntrinsicWithCall(CI, "powf", "pow", "powl");
+    break;
+  }
+  case Intrinsic::flt_rounds:
+     // Lower to "round to the nearest"
+     if (!CI->getType()->isVoidTy())
+       CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1));
+     break;
+  case Intrinsic::invariant_start:
+  case Intrinsic::lifetime_start:
+    // Discard region information.
+    CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
+    break;
+  case Intrinsic::invariant_end:
+  case Intrinsic::lifetime_end:
+    // Discard region information.
+    break;
+  }
+
+  assert(CI->use_empty() &&
+         "Lowering should have eliminated any uses of the intrinsic call!");
+  CI->eraseFromParent();
+}
+
+bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) {
+  // Verify this is a simple bswap.
+  if (CI->getNumArgOperands() != 1 ||
+      CI->getType() != CI->getArgOperand(0)->getType() ||
+      !CI->getType()->isIntegerTy())
+    return false;
+
+  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty)
+    return false;
+
+  // Okay, we can do this xform, do so now.
+  Module *M = CI->getParent()->getParent()->getParent();
+  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty);
+
+  Value *Op = CI->getArgOperand(0);
+  Op = CallInst::Create(Int, Op, CI->getName(), CI);
+
+  CI->replaceAllUsesWith(Op);
+  CI->eraseFromParent();
+  return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/JITCodeEmitter.cpp b/contrib/llvm/lib/CodeGen/JITCodeEmitter.cpp
new file mode 100644
index 000000000000..96a53892f6d3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/JITCodeEmitter.cpp
@@ -0,0 +1,14 @@
+//===-- llvm/CodeGen/JITCodeEmitter.cpp - Code emission --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/JITCodeEmitter.h"
+
+using namespace llvm;
+
+void JITCodeEmitter::anchor() { }
diff --git a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
new file mode 100644
index 000000000000..1a0983783484
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@@ -0,0 +1,296 @@
+//===-- LLVMTargetMachine.cpp - Implement the LLVMTargetMachine class -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVMTargetMachine class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+// Enable or disable FastISel. Both options are needed, because
+// FastISel is enabled by default with -fast, and we wish to be
+// able to enable or disable fast-isel independently from -O0.
+static cl::opt<cl::boolOrDefault>
+EnableFastISelOption("fast-isel", cl::Hidden,
+  cl::desc("Enable the \"fast\" instruction selector"));
+
+static cl::opt<bool> ShowMCEncoding("show-mc-encoding", cl::Hidden,
+    cl::desc("Show encoding in .s output"));
+static cl::opt<bool> ShowMCInst("show-mc-inst", cl::Hidden,
+    cl::desc("Show instruction structure in .s output"));
+
+static cl::opt<cl::boolOrDefault>
+AsmVerbose("asm-verbose", cl::desc("Add comments to directives."),
+           cl::init(cl::BOU_UNSET));
+
+static bool getVerboseAsm() {
+  switch (AsmVerbose) {
+  case cl::BOU_UNSET: return TargetMachine::getAsmVerbosityDefault();
+  case cl::BOU_TRUE:  return true;
+  case cl::BOU_FALSE: return false;
+  }
+  llvm_unreachable("Invalid verbose asm state");
+}
+
+LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple,
+                                     StringRef CPU, StringRef FS,
+                                     TargetOptions Options,
+                                     Reloc::Model RM, CodeModel::Model CM,
+                                     CodeGenOpt::Level OL)
+  : TargetMachine(T, Triple, CPU, FS, Options) {
+  CodeGenInfo = T.createMCCodeGenInfo(Triple, RM, CM, OL);
+  AsmInfo = T.createMCAsmInfo(Triple);
+  // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0,
+  // and if the old one gets included then MCAsmInfo will be NULL and
+  // we'll crash later.
+  // Provide the user with a useful error message about what's wrong.
+  assert(AsmInfo && "MCAsmInfo not initialized."
+         "Make sure you include the correct TargetSelect.h"
+         "and that InitializeAllTargetMCs() is being invoked!");
+}
+
+void LLVMTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+}
+
+/// addPassesToX helper drives creation and initialization of TargetPassConfig.
+static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
+                                          PassManagerBase &PM,
+                                          bool DisableVerify,
+                                          AnalysisID StartAfter,
+                                          AnalysisID StopAfter) {
+  // Targets may override createPassConfig to provide a target-specific sublass.
+  TargetPassConfig *PassConfig = TM->createPassConfig(PM);
+  PassConfig->setStartStopPasses(StartAfter, StopAfter);
+
+  // Set PassConfig options provided by TargetMachine.
+  PassConfig->setDisableVerify(DisableVerify);
+
+  PM.add(PassConfig);
+
+  PassConfig->addIRPasses();
+
+  PassConfig->addCodeGenPrepare();
+
+  PassConfig->addPassesToHandleExceptions();
+
+  PassConfig->addISelPrepare();
+
+  // Install a MachineModuleInfo class, which is an immutable pass that holds
+  // all the per-module stuff we're generating, including MCContext.
+  MachineModuleInfo *MMI =
+    new MachineModuleInfo(*TM->getMCAsmInfo(), *TM->getRegisterInfo(),
+                          &TM->getTargetLowering()->getObjFileLowering());
+  PM.add(MMI);
+  MCContext *Context = &MMI->getContext(); // Return the MCContext by-ref.
+
+  // Set up a MachineFunction for the rest of CodeGen to work on.
+  PM.add(new MachineFunctionAnalysis(*TM));
+
+  // Enable FastISel with -fast, but allow that to be overridden.
+  if (EnableFastISelOption == cl::BOU_TRUE ||
+      (TM->getOptLevel() == CodeGenOpt::None &&
+       EnableFastISelOption != cl::BOU_FALSE))
+    TM->setFastISel(true);
+
+  // Ask the target for an isel.
+  if (PassConfig->addInstSelector())
+    return NULL;
+
+  PassConfig->addMachinePasses();
+
+  PassConfig->setInitialized();
+
+  return Context;
+}
+
+bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
+                                            formatted_raw_ostream &Out,
+                                            CodeGenFileType FileType,
+                                            bool DisableVerify,
+                                            AnalysisID StartAfter,
+                                            AnalysisID StopAfter) {
+  // Add common CodeGen passes.
+  MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify,
+                                               StartAfter, StopAfter);
+  if (!Context)
+    return true;
+
+  if (StopAfter) {
+    // FIXME: The intent is that this should eventually write out a YAML file,
+    // containing the LLVM IR, the machine-level IR (when stopping after a
+    // machine-level pass), and whatever other information is needed to
+    // deserialize the code and resume compilation.  For now, just write the
+    // LLVM IR.
+    PM.add(createPrintModulePass(&Out));
+    return false;
+  }
+
+  if (hasMCSaveTempLabels())
+    Context->setAllowTemporaryLabels(false);
+
+  const MCAsmInfo &MAI = *getMCAsmInfo();
+  const MCRegisterInfo &MRI = *getRegisterInfo();
+  const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
+  OwningPtr<MCStreamer> AsmStreamer;
+
+  switch (FileType) {
+  case CGFT_AssemblyFile: {
+    MCInstPrinter *InstPrinter =
+      getTarget().createMCInstPrinter(MAI.getAssemblerDialect(), MAI,
+                                      *getInstrInfo(),
+                                      Context->getRegisterInfo(), STI);
+
+    // Create a code emitter if asked to show the encoding.
+    MCCodeEmitter *MCE = 0;
+    MCAsmBackend *MAB = 0;
+    if (ShowMCEncoding) {
+      const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
+      MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, STI,
+                                            *Context);
+      MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
+    }
+
+    MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
+                                                  getVerboseAsm(),
+                                                  hasMCUseLoc(),
+                                                  hasMCUseCFI(),
+                                                  hasMCUseDwarfDirectory(),
+                                                  InstPrinter,
+                                                  MCE, MAB,
+                                                  ShowMCInst);
+    AsmStreamer.reset(S);
+    break;
+  }
+  case CGFT_ObjectFile: {
+    // Create the code emitter for the target if it exists.  If not, .o file
+    // emission fails.
+    MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
+                                                         STI, *Context);
+    MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(),
+                                                       TargetCPU);
+    if (MCE == 0 || MAB == 0)
+      return true;
+
+    AsmStreamer.reset(getTarget().createMCObjectStreamer(getTargetTriple(),
+                                                         *Context, *MAB, Out,
+                                                         MCE, hasMCRelaxAll(),
+                                                         hasMCNoExecStack()));
+    AsmStreamer.get()->setAutoInitSections(true);
+    break;
+  }
+  case CGFT_Null:
+    // The Null output is intended for use for performance analysis and testing,
+    // not real users.
+    AsmStreamer.reset(createNullStreamer(*Context));
+    break;
+  }
+
+  // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
+  FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer);
+  if (Printer == 0)
+    return true;
+
+  // If successful, createAsmPrinter took ownership of AsmStreamer.
+  AsmStreamer.take();
+
+  PM.add(Printer);
+
+  return false;
+}
+
+/// addPassesToEmitMachineCode - Add passes to the specified pass manager to
+/// get machine code emitted.  This uses a JITCodeEmitter object to handle
+/// actually outputting the machine code and resolving things like the address
+/// of functions.  This method should returns true if machine code emission is
+/// not supported.
+///
+bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
+                                                   JITCodeEmitter &JCE,
+                                                   bool DisableVerify) {
+  // Add common CodeGen passes.
+  MCContext *Context = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0);
+  if (!Context)
+    return true;
+
+  addCodeEmitter(PM, JCE);
+
+  return false; // success!
+}
+
+/// addPassesToEmitMC - Add passes to the specified pass manager to get
+/// machine code emitted with the MCJIT. This method returns true if machine
+/// code is not supported. It fills the MCContext Ctx pointer which can be
+/// used to build custom MCStreamer.
+///
+bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
+                                          MCContext *&Ctx,
+                                          raw_ostream &Out,
+                                          bool DisableVerify) {
+  // Add common CodeGen passes.
+  Ctx = addPassesToGenerateCode(this, PM, DisableVerify, 0, 0);
+  if (!Ctx)
+    return true;
+
+  if (hasMCSaveTempLabels())
+    Ctx->setAllowTemporaryLabels(false);
+
+  // Create the code emitter for the target if it exists.  If not, .o file
+  // emission fails.
+  const MCRegisterInfo &MRI = *getRegisterInfo();
+  const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
+  MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
+                                                       STI, *Ctx);
+  MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
+  if (MCE == 0 || MAB == 0)
+    return true;
+
+  OwningPtr<MCStreamer> AsmStreamer;
+  AsmStreamer.reset(getTarget().createMCObjectStreamer(getTargetTriple(), *Ctx,
+                                                       *MAB, Out, MCE,
+                                                       hasMCRelaxAll(),
+                                                       hasMCNoExecStack()));
+  AsmStreamer.get()->InitSections();
+
+  // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
+  FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer);
+  if (Printer == 0)
+    return true;
+
+  // If successful, createAsmPrinter took ownership of AsmStreamer.
+  AsmStreamer.take();
+
+  PM.add(Printer);
+
+  return false; // success!
+}
diff --git a/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp b/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp
new file mode 100644
index 000000000000..deab05a412c9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp
@@ -0,0 +1,152 @@
+//===---- LatencyPriorityQueue.cpp - A latency-oriented priority queue ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LatencyPriorityQueue class, which is a
+// SchedulingPriorityQueue that schedules using latency information to
+// reduce the length of the critical path through the basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scheduler"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+bool latency_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
+  // The isScheduleHigh flag allows nodes with wraparound dependencies that
+  // cannot easily be modeled as edges with latencies to be scheduled as
+  // soon as possible in a top-down schedule.
+  if (LHS->isScheduleHigh && !RHS->isScheduleHigh)
+    return false;
+  if (!LHS->isScheduleHigh && RHS->isScheduleHigh)
+    return true;
+
+  unsigned LHSNum = LHS->NodeNum;
+  unsigned RHSNum = RHS->NodeNum;
+
+  // The most important heuristic is scheduling the critical path.
+  unsigned LHSLatency = PQ->getLatency(LHSNum);
+  unsigned RHSLatency = PQ->getLatency(RHSNum);
+  if (LHSLatency < RHSLatency) return true;
+  if (LHSLatency > RHSLatency) return false;
+
+  // After that, if two nodes have identical latencies, look to see if one will
+  // unblock more other nodes than the other.
+  unsigned LHSBlocked = PQ->getNumSolelyBlockNodes(LHSNum);
+  unsigned RHSBlocked = PQ->getNumSolelyBlockNodes(RHSNum);
+  if (LHSBlocked < RHSBlocked) return true;
+  if (LHSBlocked > RHSBlocked) return false;
+
+  // Finally, just to provide a stable ordering, use the node number as a
+  // deciding factor.
+  return RHSNum < LHSNum;
+}
+
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+SUnit *LatencyPriorityQueue::getSingleUnscheduledPred(SUnit *SU) {
+  SUnit *OnlyAvailablePred = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    SUnit &Pred = *I->getSUnit();
+    if (!Pred.isScheduled) {
+      // We found an available, but not scheduled, predecessor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+        return 0;
+      OnlyAvailablePred = &Pred;
+    }
+  }
+
+  return OnlyAvailablePred;
+}
+
+void LatencyPriorityQueue::push(SUnit *SU) {
+  // Look at all of the successors of this node.  Count the number of nodes that
+  // this node is the sole unscheduled node for.
+  unsigned NumNodesBlocking = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+      ++NumNodesBlocking;
+  }
+  NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking;
+
+  Queue.push_back(SU);
+}
+
+
+// scheduledNode - As nodes are scheduled, we look to see if there are any
+// successor nodes that have a single unscheduled predecessor.  If so, that
+// single predecessor has a higher priority, since scheduling it will make
+// the node available.
+void LatencyPriorityQueue::scheduledNode(SUnit *SU) {
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    AdjustPriorityOfUnscheduledPreds(I->getSUnit());
+  }
+}
+
+/// AdjustPriorityOfUnscheduledPreds - One of the predecessors of SU was just
+/// scheduled.  If SU is not itself available, then there is at least one
+/// predecessor node that has not been scheduled yet.  If SU has exactly ONE
+/// unscheduled predecessor, we want to increase its priority: it getting
+/// scheduled will make this node available, so it is better than some other
+/// node of the same priority that will not make a node available.
+void LatencyPriorityQueue::AdjustPriorityOfUnscheduledPreds(SUnit *SU) {
+  if (SU->isAvailable) return;  // All preds scheduled.
+
+  SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU);
+  if (OnlyAvailablePred == 0 || !OnlyAvailablePred->isAvailable) return;
+
+  // Okay, we found a single predecessor that is available, but not scheduled.
+  // Since it is available, it must be in the priority queue.  First remove it.
+  remove(OnlyAvailablePred);
+
+  // Reinsert the node into the priority queue, which recomputes its
+  // NumNodesSolelyBlocking value.
+  push(OnlyAvailablePred);
+}
+
+SUnit *LatencyPriorityQueue::pop() {
+  if (empty()) return NULL;
+  std::vector<SUnit *>::iterator Best = Queue.begin();
+  for (std::vector<SUnit *>::iterator I = llvm::next(Queue.begin()),
+       E = Queue.end(); I != E; ++I)
+    if (Picker(*Best, *I))
+      Best = I;
+  SUnit *V = *Best;
+  if (Best != prior(Queue.end()))
+    std::swap(*Best, Queue.back());
+  Queue.pop_back();
+  return V;
+}
+
+void LatencyPriorityQueue::remove(SUnit *SU) {
+  assert(!Queue.empty() && "Queue is empty!");
+  std::vector<SUnit *>::iterator I = std::find(Queue.begin(), Queue.end(), SU);
+  if (I != prior(Queue.end()))
+    std::swap(*I, Queue.back());
+  Queue.pop_back();
+}
+
+#ifdef NDEBUG
+void LatencyPriorityQueue::dump(ScheduleDAG *DAG) const {}
+#else
+void LatencyPriorityQueue::dump(ScheduleDAG *DAG) const {
+  LatencyPriorityQueue q = *this;
+  while (!q.empty()) {
+    SUnit *su = q.pop();
+    dbgs() << "Height " << su->getHeight() << ": ";
+    su->dump(DAG);
+  }
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
new file mode 100644
index 000000000000..81721541cd89
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
@@ -0,0 +1,335 @@
+//===- LexicalScopes.cpp - Collecting lexical scope info ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements LexicalScopes analysis.
+//
+// This pass collects lexical scope information and maps machine instructions
+// to respective lexical scopes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lexicalscopes"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+LexicalScopes::~LexicalScopes() {
+  releaseMemory();
+}
+
+/// releaseMemory - release memory.
+void LexicalScopes::releaseMemory() {
+  MF = NULL;
+  CurrentFnLexicalScope = NULL;
+  DeleteContainerSeconds(LexicalScopeMap);
+  DeleteContainerSeconds(AbstractScopeMap);
+  InlinedLexicalScopeMap.clear();
+  AbstractScopesList.clear();
+}
+
+/// initialize - Scan machine function and constuct lexical scope nest.
+void LexicalScopes::initialize(const MachineFunction &Fn) {
+  releaseMemory();
+  MF = &Fn;
+  SmallVector<InsnRange, 4> MIRanges;
+  DenseMap<const MachineInstr *, LexicalScope *> MI2ScopeMap;
+  extractLexicalScopes(MIRanges, MI2ScopeMap);
+  if (CurrentFnLexicalScope) {
+    constructScopeNest(CurrentFnLexicalScope);
+    assignInstructionRanges(MIRanges, MI2ScopeMap);
+  }
+}
+
+/// extractLexicalScopes - Extract instruction ranges for each lexical scopes
+/// for the given machine function.
+void LexicalScopes::
+extractLexicalScopes(SmallVectorImpl<InsnRange> &MIRanges,
+                  DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
+
+  // Scan each instruction and create scopes. First build working set of scopes.
+  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+       I != E; ++I) {
+    const MachineInstr *RangeBeginMI = NULL;
+    const MachineInstr *PrevMI = NULL;
+    DebugLoc PrevDL;
+    for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
+         II != IE; ++II) {
+      const MachineInstr *MInsn = II;
+
+      // Check if instruction has valid location information.
+      const DebugLoc MIDL = MInsn->getDebugLoc();
+      if (MIDL.isUnknown()) {
+        PrevMI = MInsn;
+        continue;
+      }
+
+      // If scope has not changed then skip this instruction.
+      if (MIDL == PrevDL) {
+        PrevMI = MInsn;
+        continue;
+      }
+
+      // Ignore DBG_VALUE. It does not contribute to any instruction in output.
+      if (MInsn->isDebugValue())
+        continue;
+
+      if (RangeBeginMI) {
+        // If we have already seen a beginning of an instruction range and
+        // current instruction scope does not match scope of first instruction
+        // in this range then create a new instruction range.
+        InsnRange R(RangeBeginMI, PrevMI);
+        MI2ScopeMap[RangeBeginMI] = getOrCreateLexicalScope(PrevDL);
+        MIRanges.push_back(R);
+      }
+
+      // This is a beginning of a new instruction range.
+      RangeBeginMI = MInsn;
+
+      // Reset previous markers.
+      PrevMI = MInsn;
+      PrevDL = MIDL;
+    }
+
+    // Create last instruction range.
+    if (RangeBeginMI && PrevMI && !PrevDL.isUnknown()) {
+      InsnRange R(RangeBeginMI, PrevMI);
+      MIRanges.push_back(R);
+      MI2ScopeMap[RangeBeginMI] = getOrCreateLexicalScope(PrevDL);
+    }
+  }
+}
+
+/// findLexicalScope - Find lexical scope, either regular or inlined, for the
+/// given DebugLoc. Return NULL if not found.
+LexicalScope *LexicalScopes::findLexicalScope(DebugLoc DL) {
+  MDNode *Scope = NULL;
+  MDNode *IA = NULL;
+  DL.getScopeAndInlinedAt(Scope, IA, MF->getFunction()->getContext());
+  if (!Scope) return NULL;
+
+  // The scope that we were created with could have an extra file - which
+  // isn't what we care about in this case.
+  DIDescriptor D = DIDescriptor(Scope);
+  if (D.isLexicalBlockFile())
+    Scope = DILexicalBlockFile(Scope).getScope();
+  
+  if (IA)
+    return InlinedLexicalScopeMap.lookup(DebugLoc::getFromDILocation(IA));
+  return LexicalScopeMap.lookup(Scope);
+}
+
+/// getOrCreateLexicalScope - Find lexical scope for the given DebugLoc. If
+/// not available then create new lexical scope.
+LexicalScope *LexicalScopes::getOrCreateLexicalScope(DebugLoc DL) {
+  MDNode *Scope = NULL;
+  MDNode *InlinedAt = NULL;
+  DL.getScopeAndInlinedAt(Scope, InlinedAt, MF->getFunction()->getContext());
+
+  if (InlinedAt) {
+    // Create an abstract scope for inlined function.
+    getOrCreateAbstractScope(Scope);
+    // Create an inlined scope for inlined function.
+    return getOrCreateInlinedScope(Scope, InlinedAt);
+  }
+   
+  return getOrCreateRegularScope(Scope);
+}
+
+/// getOrCreateRegularScope - Find or create a regular lexical scope.
+LexicalScope *LexicalScopes::getOrCreateRegularScope(MDNode *Scope) {
+  DIDescriptor D = DIDescriptor(Scope);
+  if (D.isLexicalBlockFile()) {
+    Scope = DILexicalBlockFile(Scope).getScope();
+    D = DIDescriptor(Scope);
+  }
+ 
+  LexicalScope *WScope = LexicalScopeMap.lookup(Scope);
+  if (WScope)
+    return WScope;
+
+  LexicalScope *Parent = NULL;
+  if (D.isLexicalBlock())
+    Parent = getOrCreateLexicalScope(DebugLoc::getFromDILexicalBlock(Scope));
+  WScope = new LexicalScope(Parent, DIDescriptor(Scope), NULL, false);
+  LexicalScopeMap.insert(std::make_pair(Scope, WScope));
+  if (!Parent && DIDescriptor(Scope).isSubprogram()
+      && DISubprogram(Scope).describes(MF->getFunction()))
+    CurrentFnLexicalScope = WScope;
+  
+  return WScope;
+}
+
+/// getOrCreateInlinedScope - Find or create an inlined lexical scope.
+LexicalScope *LexicalScopes::getOrCreateInlinedScope(MDNode *Scope, 
+                                                     MDNode *InlinedAt) {
+  LexicalScope *InlinedScope = LexicalScopeMap.lookup(InlinedAt);
+  if (InlinedScope)
+    return InlinedScope;
+
+  DebugLoc InlinedLoc = DebugLoc::getFromDILocation(InlinedAt);
+  InlinedScope = new LexicalScope(getOrCreateLexicalScope(InlinedLoc),
+                                  DIDescriptor(Scope), InlinedAt, false);
+  InlinedLexicalScopeMap[InlinedLoc] = InlinedScope;
+  LexicalScopeMap[InlinedAt] = InlinedScope;
+  return InlinedScope;
+}
+
+/// getOrCreateAbstractScope - Find or create an abstract lexical scope.
+LexicalScope *LexicalScopes::getOrCreateAbstractScope(const MDNode *N) {
+  assert(N && "Invalid Scope encoding!");
+
+  DIDescriptor Scope(N);
+  if (Scope.isLexicalBlockFile())
+    Scope = DILexicalBlockFile(Scope).getScope();
+  LexicalScope *AScope = AbstractScopeMap.lookup(N);
+  if (AScope)
+    return AScope;
+
+  LexicalScope *Parent = NULL;
+  if (Scope.isLexicalBlock()) {
+    DILexicalBlock DB(N);
+    DIDescriptor ParentDesc = DB.getContext();
+    Parent = getOrCreateAbstractScope(ParentDesc);
+  }
+  AScope = new LexicalScope(Parent, DIDescriptor(N), NULL, true);
+  AbstractScopeMap[N] = AScope;
+  if (DIDescriptor(N).isSubprogram())
+    AbstractScopesList.push_back(AScope);
+  return AScope;
+}
+
+/// constructScopeNest
+void LexicalScopes::constructScopeNest(LexicalScope *Scope) {
+  assert (Scope && "Unable to calculate scop edominance graph!");
+  SmallVector<LexicalScope *, 4> WorkStack;
+  WorkStack.push_back(Scope);
+  unsigned Counter = 0;
+  while (!WorkStack.empty()) {
+    LexicalScope *WS = WorkStack.back();
+    const SmallVector<LexicalScope *, 4> &Children = WS->getChildren();
+    bool visitedChildren = false;
+    for (SmallVector<LexicalScope *, 4>::const_iterator SI = Children.begin(),
+           SE = Children.end(); SI != SE; ++SI) {
+      LexicalScope *ChildScope = *SI;
+      if (!ChildScope->getDFSOut()) {
+        WorkStack.push_back(ChildScope);
+        visitedChildren = true;
+        ChildScope->setDFSIn(++Counter);
+        break;
+      }
+    }
+    if (!visitedChildren) {
+      WorkStack.pop_back();
+      WS->setDFSOut(++Counter);
+    }
+  }
+}
+
+/// assignInstructionRanges - Find ranges of instructions covered by each
+/// lexical scope.
+void LexicalScopes::
+assignInstructionRanges(SmallVectorImpl<InsnRange> &MIRanges,
+                    DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap)
+{
+  
+  LexicalScope *PrevLexicalScope = NULL;
+  for (SmallVectorImpl<InsnRange>::const_iterator RI = MIRanges.begin(),
+         RE = MIRanges.end(); RI != RE; ++RI) {
+    const InsnRange &R = *RI;
+    LexicalScope *S = MI2ScopeMap.lookup(R.first);
+    assert (S && "Lost LexicalScope for a machine instruction!");
+    if (PrevLexicalScope && !PrevLexicalScope->dominates(S))
+      PrevLexicalScope->closeInsnRange(S);
+    S->openInsnRange(R.first);
+    S->extendInsnRange(R.second);
+    PrevLexicalScope = S;
+  }
+
+  if (PrevLexicalScope)
+    PrevLexicalScope->closeInsnRange();
+}
+
+/// getMachineBasicBlocks - Populate given set using machine basic blocks which
+/// have machine instructions that belong to lexical scope identified by 
+/// DebugLoc.
+void LexicalScopes::
+getMachineBasicBlocks(DebugLoc DL, 
+                      SmallPtrSet<const MachineBasicBlock*, 4> &MBBs) {
+  MBBs.clear();
+  LexicalScope *Scope = getOrCreateLexicalScope(DL);
+  if (!Scope)
+    return;
+  
+  if (Scope == CurrentFnLexicalScope) {
+    for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+         I != E; ++I)
+      MBBs.insert(I);
+    return;
+  }
+
+  SmallVector<InsnRange, 4> &InsnRanges = Scope->getRanges();
+  for (SmallVector<InsnRange, 4>::iterator I = InsnRanges.begin(),
+         E = InsnRanges.end(); I != E; ++I) {
+    InsnRange &R = *I;
+    MBBs.insert(R.first->getParent());
+  }
+}
+
+/// dominates - Return true if DebugLoc's lexical scope dominates at least one
+/// machine instruction's lexical scope in a given machine basic block.
+bool LexicalScopes::dominates(DebugLoc DL, MachineBasicBlock *MBB) {
+  LexicalScope *Scope = getOrCreateLexicalScope(DL);
+  if (!Scope)
+    return false;
+
+  // Current function scope covers all basic blocks in the function.
+  if (Scope == CurrentFnLexicalScope && MBB->getParent() == MF)
+    return true;
+
+  bool Result = false;
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+       I != E; ++I) {
+    DebugLoc IDL = I->getDebugLoc();
+    if (IDL.isUnknown())
+      continue;
+    if (LexicalScope *IScope = getOrCreateLexicalScope(IDL))
+      if (Scope->dominates(IScope))
+        return true;
+  }
+  return Result;
+}
+
+void LexicalScope::anchor() { }
+
+/// dump - Print data structures.
+void LexicalScope::dump(unsigned Indent) const {
+#ifndef NDEBUG
+  raw_ostream &err = dbgs();
+  err.indent(Indent);
+  err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n";
+  const MDNode *N = Desc;
+  err.indent(Indent);
+  N->dump();
+  if (AbstractScope)
+    err << std::string(Indent, ' ') << "Abstract Scope\n";
+
+  if (!Children.empty())
+    err << std::string(Indent + 2, ' ') << "Children ...\n";
+  for (unsigned i = 0, e = Children.size(); i != e; ++i)
+    if (Children[i] != this)
+      Children[i]->dump(Indent + 2);
+#endif
+}
+
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
new file mode 100644
index 000000000000..0b117ac6566b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -0,0 +1,995 @@
+//===- LiveDebugVariables.cpp - Tracking debug info variables -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveDebugVariables analysis.
+//
+// Remove all DBG_VALUE instructions referencing virtual registers and replace
+// them with a data structure tracking where live user variables are kept - in a
+// virtual register or in a stack slot.
+//
+// Allow the data structure to be updated during register allocation when values
+// are moved between registers and stack slots. Finally emit new DBG_VALUE
+// instructions after register allocation is complete.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "livedebug"
+#include "LiveDebugVariables.h"
+#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+EnableLDV("live-debug-variables", cl::init(true),
+          cl::desc("Enable the live debug variables pass"), cl::Hidden);
+
+STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted");
+char LiveDebugVariables::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LiveDebugVariables, "livedebugvars",
+                "Debug Variable Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(LiveDebugVariables, "livedebugvars",
+                "Debug Variable Analysis", false, false)
+
+void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequiredTransitive<LiveIntervals>();
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+LiveDebugVariables::LiveDebugVariables() : MachineFunctionPass(ID), pImpl(0) {
+  initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry());
+}
+
+/// LocMap - Map of where a user value is live, and its location.
+typedef IntervalMap<SlotIndex, unsigned, 4> LocMap;
+
+namespace {
+/// UserValueScopes - Keeps track of lexical scopes associated with an
+/// user value's source location.
+class UserValueScopes {
+  DebugLoc DL;
+  LexicalScopes &LS;
+  SmallPtrSet<const MachineBasicBlock *, 4> LBlocks;
+
+public:
+  UserValueScopes(DebugLoc D, LexicalScopes &L) : DL(D), LS(L) {}
+
+  /// dominates - Return true if current scope dominates at least one machine
+  /// instruction in a given machine basic block.
+  bool dominates(MachineBasicBlock *MBB) {
+    if (LBlocks.empty())
+      LS.getMachineBasicBlocks(DL, LBlocks);
+    if (LBlocks.count(MBB) != 0 || LS.dominates(DL, MBB))
+      return true;
+    return false;
+  }
+};
+} // end anonymous namespace
+
+/// UserValue - A user value is a part of a debug info user variable.
+///
+/// A DBG_VALUE instruction notes that (a sub-register of) a virtual register
+/// holds part of a user variable. The part is identified by a byte offset.
+///
+/// UserValues are grouped into equivalence classes for easier searching. Two
+/// user values are related if they refer to the same variable, or if they are
+/// held by the same virtual register. The equivalence class is the transitive
+/// closure of that relation.
+namespace {
+class LDVImpl;
+class UserValue {
+  const MDNode *variable; ///< The debug info variable we are part of.
+  unsigned offset;        ///< Byte offset into variable.
+  DebugLoc dl;            ///< The debug location for the variable. This is
+                          ///< used by dwarf writer to find lexical scope.
+  UserValue *leader;      ///< Equivalence class leader.
+  UserValue *next;        ///< Next value in equivalence class, or null.
+
+  /// Numbered locations referenced by locmap.
+  SmallVector<MachineOperand, 4> locations;
+
+  /// Map of slot indices where this value is live.
+  LocMap locInts;
+
+  /// coalesceLocation - After LocNo was changed, check if it has become
+  /// identical to another location, and coalesce them. This may cause LocNo or
+  /// a later location to be erased, but no earlier location will be erased.
+  void coalesceLocation(unsigned LocNo);
+
+  /// insertDebugValue - Insert a DBG_VALUE into MBB at Idx for LocNo.
+  void insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx, unsigned LocNo,
+                        LiveIntervals &LIS, const TargetInstrInfo &TII);
+
+  /// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs
+  /// is live. Returns true if any changes were made.
+  bool splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+
+public:
+  /// UserValue - Create a new UserValue.
+  UserValue(const MDNode *var, unsigned o, DebugLoc L,
+            LocMap::Allocator &alloc)
+    : variable(var), offset(o), dl(L), leader(this), next(0), locInts(alloc)
+  {}
+
+  /// getLeader - Get the leader of this value's equivalence class.
+  UserValue *getLeader() {
+    UserValue *l = leader;
+    while (l != l->leader)
+      l = l->leader;
+    return leader = l;
+  }
+
+  /// getNext - Return the next UserValue in the equivalence class.
+  UserValue *getNext() const { return next; }
+
+  /// match - Does this UserValue match the parameters?
+  bool match(const MDNode *Var, unsigned Offset) const {
+    return Var == variable && Offset == offset;
+  }
+
+  /// merge - Merge equivalence classes.
+  static UserValue *merge(UserValue *L1, UserValue *L2) {
+    L2 = L2->getLeader();
+    if (!L1)
+      return L2;
+    L1 = L1->getLeader();
+    if (L1 == L2)
+      return L1;
+    // Splice L2 before L1's members.
+    UserValue *End = L2;
+    while (End->next)
+      End->leader = L1, End = End->next;
+    End->leader = L1;
+    End->next = L1->next;
+    L1->next = L2;
+    return L1;
+  }
+
+  /// getLocationNo - Return the location number that matches Loc.
+  unsigned getLocationNo(const MachineOperand &LocMO) {
+    if (LocMO.isReg()) {
+      if (LocMO.getReg() == 0)
+        return ~0u;
+      // For register locations we dont care about use/def and other flags.
+      for (unsigned i = 0, e = locations.size(); i != e; ++i)
+        if (locations[i].isReg() &&
+            locations[i].getReg() == LocMO.getReg() &&
+            locations[i].getSubReg() == LocMO.getSubReg())
+          return i;
+    } else
+      for (unsigned i = 0, e = locations.size(); i != e; ++i)
+        if (LocMO.isIdenticalTo(locations[i]))
+          return i;
+    locations.push_back(LocMO);
+    // We are storing a MachineOperand outside a MachineInstr.
+    locations.back().clearParent();
+    // Don't store def operands.
+    if (locations.back().isReg())
+      locations.back().setIsUse();
+    return locations.size() - 1;
+  }
+
+  /// mapVirtRegs - Ensure that all virtual register locations are mapped.
+  void mapVirtRegs(LDVImpl *LDV);
+
+  /// addDef - Add a definition point to this value.
+  void addDef(SlotIndex Idx, const MachineOperand &LocMO) {
+    // Add a singular (Idx,Idx) -> Loc mapping.
+    LocMap::iterator I = locInts.find(Idx);
+    if (!I.valid() || I.start() != Idx)
+      I.insert(Idx, Idx.getNextSlot(), getLocationNo(LocMO));
+    else
+      // A later DBG_VALUE at the same SlotIndex overrides the old location.
+      I.setValue(getLocationNo(LocMO));
+  }
+
+  /// extendDef - Extend the current definition as far as possible down the
+  /// dominator tree. Stop when meeting an existing def or when leaving the live
+  /// range of VNI.
+  /// End points where VNI is no longer live are added to Kills.
+  /// @param Idx   Starting point for the definition.
+  /// @param LocNo Location number to propagate.
+  /// @param LI    Restrict liveness to where LI has the value VNI. May be null.
+  /// @param VNI   When LI is not null, this is the value to restrict to.
+  /// @param Kills Append end points of VNI's live range to Kills.
+  /// @param LIS   Live intervals analysis.
+  /// @param MDT   Dominator tree.
+  void extendDef(SlotIndex Idx, unsigned LocNo,
+                 LiveInterval *LI, const VNInfo *VNI,
+                 SmallVectorImpl<SlotIndex> *Kills,
+                 LiveIntervals &LIS, MachineDominatorTree &MDT,
+                 UserValueScopes &UVS);
+
+  /// addDefsFromCopies - The value in LI/LocNo may be copies to other
+  /// registers. Determine if any of the copies are available at the kill
+  /// points, and add defs if possible.
+  /// @param LI      Scan for copies of the value in LI->reg.
+  /// @param LocNo   Location number of LI->reg.
+  /// @param Kills   Points where the range of LocNo could be extended.
+  /// @param NewDefs Append (Idx, LocNo) of inserted defs here.
+  void addDefsFromCopies(LiveInterval *LI, unsigned LocNo,
+                      const SmallVectorImpl<SlotIndex> &Kills,
+                      SmallVectorImpl<std::pair<SlotIndex, unsigned> > &NewDefs,
+                      MachineRegisterInfo &MRI,
+                      LiveIntervals &LIS);
+
+  /// computeIntervals - Compute the live intervals of all locations after
+  /// collecting all their def points.
+  void computeIntervals(MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+                        LiveIntervals &LIS, MachineDominatorTree &MDT,
+                        UserValueScopes &UVS);
+
+  /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is
+  /// live. Returns true if any changes were made.
+  bool splitRegister(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+
+  /// rewriteLocations - Rewrite virtual register locations according to the
+  /// provided virtual register map.
+  void rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI);
+
+  /// emitDebugValues - Recreate DBG_VALUE instruction from data structures.
+  void emitDebugValues(VirtRegMap *VRM,
+                       LiveIntervals &LIS, const TargetInstrInfo &TRI);
+
+  /// findDebugLoc - Return DebugLoc used for this DBG_VALUE instruction. A
+  /// variable may have more than one corresponding DBG_VALUE instructions. 
+  /// Only first one needs DebugLoc to identify variable's lexical scope
+  /// in source file.
+  DebugLoc findDebugLoc();
+
+  /// getDebugLoc - Return DebugLoc of this UserValue.
+  DebugLoc getDebugLoc() { return dl;}
+  void print(raw_ostream&, const TargetMachine*);
+};
+} // namespace
+
+/// LDVImpl - Implementation of the LiveDebugVariables pass.
+namespace {
+class LDVImpl {
+  LiveDebugVariables &pass;
+  LocMap::Allocator allocator;
+  MachineFunction *MF;
+  LiveIntervals *LIS;
+  LexicalScopes LS;
+  MachineDominatorTree *MDT;
+  const TargetRegisterInfo *TRI;
+
+  /// Whether emitDebugValues is called.
+  bool EmitDone;
+  /// Whether the machine function is modified during the pass.
+  bool ModifiedMF;
+
+  /// userValues - All allocated UserValue instances.
+  SmallVector<UserValue*, 8> userValues;
+
+  /// Map virtual register to eq class leader.
+  typedef DenseMap<unsigned, UserValue*> VRMap;
+  VRMap virtRegToEqClass;
+
+  /// Map user variable to eq class leader.
+  typedef DenseMap<const MDNode *, UserValue*> UVMap;
+  UVMap userVarMap;
+
+  /// getUserValue - Find or create a UserValue.
+  UserValue *getUserValue(const MDNode *Var, unsigned Offset, DebugLoc DL);
+
+  /// lookupVirtReg - Find the EC leader for VirtReg or null.
+  UserValue *lookupVirtReg(unsigned VirtReg);
+
+  /// handleDebugValue - Add DBG_VALUE instruction to our maps.
+  /// @param MI  DBG_VALUE instruction
+  /// @param Idx Last valid SLotIndex before instruction.
+  /// @return    True if the DBG_VALUE instruction should be deleted.
+  bool handleDebugValue(MachineInstr *MI, SlotIndex Idx);
+
+  /// collectDebugValues - Collect and erase all DBG_VALUE instructions, adding
+  /// a UserValue def for each instruction.
+  /// @param mf MachineFunction to be scanned.
+  /// @return True if any debug values were found.
+  bool collectDebugValues(MachineFunction &mf);
+
+  /// computeIntervals - Compute the live intervals of all user values after
+  /// collecting all their def points.
+  void computeIntervals();
+
+public:
+  LDVImpl(LiveDebugVariables *ps) : pass(*ps), EmitDone(false),
+                                    ModifiedMF(false) {}
+  bool runOnMachineFunction(MachineFunction &mf);
+
+  /// clear - Release all memory.
+  void clear() {
+    DeleteContainerPointers(userValues);
+    userValues.clear();
+    virtRegToEqClass.clear();
+    userVarMap.clear();
+    // Make sure we call emitDebugValues if the machine function was modified.
+    assert((!ModifiedMF || EmitDone) &&
+           "Dbg values are not emitted in LDV");
+    EmitDone = false;
+    ModifiedMF = false;
+  }
+
+  /// mapVirtReg - Map virtual register to an equivalence class.
+  void mapVirtReg(unsigned VirtReg, UserValue *EC);
+
+  /// splitRegister -  Replace all references to OldReg with NewRegs.
+  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+
+  /// emitDebugValues - Recreate DBG_VALUE instruction from data structures.
+  void emitDebugValues(VirtRegMap *VRM);
+
+  void print(raw_ostream&);
+};
+} // namespace
+
+void UserValue::print(raw_ostream &OS, const TargetMachine *TM) {
+  DIVariable DV(variable);
+  OS << "!\""; 
+  DV.printExtendedName(OS);
+  OS << "\"\t";
+  if (offset)
+    OS << '+' << offset;
+  for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I) {
+    OS << " [" << I.start() << ';' << I.stop() << "):";
+    if (I.value() == ~0u)
+      OS << "undef";
+    else
+      OS << I.value();
+  }
+  for (unsigned i = 0, e = locations.size(); i != e; ++i) {
+    OS << " Loc" << i << '=';
+    locations[i].print(OS, TM);
+  }
+  OS << '\n';
+}
+
+void LDVImpl::print(raw_ostream &OS) {
+  OS << "********** DEBUG VARIABLES **********\n";
+  for (unsigned i = 0, e = userValues.size(); i != e; ++i)
+    userValues[i]->print(OS, &MF->getTarget());
+}
+
+void UserValue::coalesceLocation(unsigned LocNo) {
+  unsigned KeepLoc = 0;
+  for (unsigned e = locations.size(); KeepLoc != e; ++KeepLoc) {
+    if (KeepLoc == LocNo)
+      continue;
+    if (locations[KeepLoc].isIdenticalTo(locations[LocNo]))
+      break;
+  }
+  // No matches.
+  if (KeepLoc == locations.size())
+    return;
+
+  // Keep the smaller location, erase the larger one.
+  unsigned EraseLoc = LocNo;
+  if (KeepLoc > EraseLoc)
+    std::swap(KeepLoc, EraseLoc);
+  locations.erase(locations.begin() + EraseLoc);
+
+  // Rewrite values.
+  for (LocMap::iterator I = locInts.begin(); I.valid(); ++I) {
+    unsigned v = I.value();
+    if (v == EraseLoc)
+      I.setValue(KeepLoc);      // Coalesce when possible.
+    else if (v > EraseLoc)
+      I.setValueUnchecked(v-1); // Avoid coalescing with untransformed values.
+  }
+}
+
+void UserValue::mapVirtRegs(LDVImpl *LDV) {
+  for (unsigned i = 0, e = locations.size(); i != e; ++i)
+    if (locations[i].isReg() &&
+        TargetRegisterInfo::isVirtualRegister(locations[i].getReg()))
+      LDV->mapVirtReg(locations[i].getReg(), this);
+}
+
+UserValue *LDVImpl::getUserValue(const MDNode *Var, unsigned Offset,
+                                 DebugLoc DL) {
+  UserValue *&Leader = userVarMap[Var];
+  if (Leader) {
+    UserValue *UV = Leader->getLeader();
+    Leader = UV;
+    for (; UV; UV = UV->getNext())
+      if (UV->match(Var, Offset))
+        return UV;
+  }
+
+  UserValue *UV = new UserValue(Var, Offset, DL, allocator);
+  userValues.push_back(UV);
+  Leader = UserValue::merge(Leader, UV);
+  return UV;
+}
+
+void LDVImpl::mapVirtReg(unsigned VirtReg, UserValue *EC) {
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && "Only map VirtRegs");
+  UserValue *&Leader = virtRegToEqClass[VirtReg];
+  Leader = UserValue::merge(Leader, EC);
+}
+
+UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) {
+  if (UserValue *UV = virtRegToEqClass.lookup(VirtReg))
+    return UV->getLeader();
+  return 0;
+}
+
+bool LDVImpl::handleDebugValue(MachineInstr *MI, SlotIndex Idx) {
+  // DBG_VALUE loc, offset, variable
+  if (MI->getNumOperands() != 3 ||
+      !MI->getOperand(1).isImm() || !MI->getOperand(2).isMetadata()) {
+    DEBUG(dbgs() << "Can't handle " << *MI);
+    return false;
+  }
+
+  // Get or create the UserValue for (variable,offset).
+  unsigned Offset = MI->getOperand(1).getImm();
+  const MDNode *Var = MI->getOperand(2).getMetadata();
+  UserValue *UV = getUserValue(Var, Offset, MI->getDebugLoc());
+  UV->addDef(Idx, MI->getOperand(0));
+  return true;
+}
+
+bool LDVImpl::collectDebugValues(MachineFunction &mf) {
+  bool Changed = false;
+  for (MachineFunction::iterator MFI = mf.begin(), MFE = mf.end(); MFI != MFE;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end();
+         MBBI != MBBE;) {
+      if (!MBBI->isDebugValue()) {
+        ++MBBI;
+        continue;
+      }
+      // DBG_VALUE has no slot index, use the previous instruction instead.
+      SlotIndex Idx = MBBI == MBB->begin() ?
+        LIS->getMBBStartIdx(MBB) :
+        LIS->getInstructionIndex(llvm::prior(MBBI)).getRegSlot();
+      // Handle consecutive DBG_VALUE instructions with the same slot index.
+      do {
+        if (handleDebugValue(MBBI, Idx)) {
+          MBBI = MBB->erase(MBBI);
+          Changed = true;
+        } else
+          ++MBBI;
+      } while (MBBI != MBBE && MBBI->isDebugValue());
+    }
+  }
+  return Changed;
+}
+
+void UserValue::extendDef(SlotIndex Idx, unsigned LocNo,
+                          LiveInterval *LI, const VNInfo *VNI,
+                          SmallVectorImpl<SlotIndex> *Kills,
+                          LiveIntervals &LIS, MachineDominatorTree &MDT,
+                          UserValueScopes &UVS) {
+  SmallVector<SlotIndex, 16> Todo;
+  Todo.push_back(Idx);
+  do {
+    SlotIndex Start = Todo.pop_back_val();
+    MachineBasicBlock *MBB = LIS.getMBBFromIndex(Start);
+    SlotIndex Stop = LIS.getMBBEndIdx(MBB);
+    LocMap::iterator I = locInts.find(Start);
+
+    // Limit to VNI's live range.
+    bool ToEnd = true;
+    if (LI && VNI) {
+      LiveRange *Range = LI->getLiveRangeContaining(Start);
+      if (!Range || Range->valno != VNI) {
+        if (Kills)
+          Kills->push_back(Start);
+        continue;
+      }
+      if (Range->end < Stop)
+        Stop = Range->end, ToEnd = false;
+    }
+
+    // There could already be a short def at Start.
+    if (I.valid() && I.start() <= Start) {
+      // Stop when meeting a different location or an already extended interval.
+      Start = Start.getNextSlot();
+      if (I.value() != LocNo || I.stop() != Start)
+        continue;
+      // This is a one-slot placeholder. Just skip it.
+      ++I;
+    }
+
+    // Limited by the next def.
+    if (I.valid() && I.start() < Stop)
+      Stop = I.start(), ToEnd = false;
+    // Limited by VNI's live range.
+    else if (!ToEnd && Kills)
+      Kills->push_back(Stop);
+
+    if (Start >= Stop)
+      continue;
+
+    I.insert(Start, Stop, LocNo);
+
+    // If we extended to the MBB end, propagate down the dominator tree.
+    if (!ToEnd)
+      continue;
+    const std::vector<MachineDomTreeNode*> &Children =
+      MDT.getNode(MBB)->getChildren();
+    for (unsigned i = 0, e = Children.size(); i != e; ++i) {
+      MachineBasicBlock *MBB = Children[i]->getBlock();
+      if (UVS.dominates(MBB))
+        Todo.push_back(LIS.getMBBStartIdx(MBB));
+    }
+  } while (!Todo.empty());
+}
+
+void
+UserValue::addDefsFromCopies(LiveInterval *LI, unsigned LocNo,
+                      const SmallVectorImpl<SlotIndex> &Kills,
+                      SmallVectorImpl<std::pair<SlotIndex, unsigned> > &NewDefs,
+                      MachineRegisterInfo &MRI, LiveIntervals &LIS) {
+  if (Kills.empty())
+    return;
+  // Don't track copies from physregs, there are too many uses.
+  if (!TargetRegisterInfo::isVirtualRegister(LI->reg))
+    return;
+
+  // Collect all the (vreg, valno) pairs that are copies of LI.
+  SmallVector<std::pair<LiveInterval*, const VNInfo*>, 8> CopyValues;
+  for (MachineRegisterInfo::use_nodbg_iterator
+         UI = MRI.use_nodbg_begin(LI->reg),
+         UE = MRI.use_nodbg_end(); UI != UE; ++UI) {
+    // Copies of the full value.
+    if (UI.getOperand().getSubReg() || !UI->isCopy())
+      continue;
+    MachineInstr *MI = &*UI;
+    unsigned DstReg = MI->getOperand(0).getReg();
+
+    // Don't follow copies to physregs. These are usually setting up call
+    // arguments, and the argument registers are always call clobbered. We are
+    // better off in the source register which could be a callee-saved register,
+    // or it could be spilled.
+    if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+      continue;
+
+    // Is LocNo extended to reach this copy? If not, another def may be blocking
+    // it, or we are looking at a wrong value of LI.
+    SlotIndex Idx = LIS.getInstructionIndex(MI);
+    LocMap::iterator I = locInts.find(Idx.getRegSlot(true));
+    if (!I.valid() || I.value() != LocNo)
+      continue;
+
+    if (!LIS.hasInterval(DstReg))
+      continue;
+    LiveInterval *DstLI = &LIS.getInterval(DstReg);
+    const VNInfo *DstVNI = DstLI->getVNInfoAt(Idx.getRegSlot());
+    assert(DstVNI && DstVNI->def == Idx.getRegSlot() && "Bad copy value");
+    CopyValues.push_back(std::make_pair(DstLI, DstVNI));
+  }
+
+  if (CopyValues.empty())
+    return;
+
+  DEBUG(dbgs() << "Got " << CopyValues.size() << " copies of " << *LI << '\n');
+
+  // Try to add defs of the copied values for each kill point.
+  for (unsigned i = 0, e = Kills.size(); i != e; ++i) {
+    SlotIndex Idx = Kills[i];
+    for (unsigned j = 0, e = CopyValues.size(); j != e; ++j) {
+      LiveInterval *DstLI = CopyValues[j].first;
+      const VNInfo *DstVNI = CopyValues[j].second;
+      if (DstLI->getVNInfoAt(Idx) != DstVNI)
+        continue;
+      // Check that there isn't already a def at Idx
+      LocMap::iterator I = locInts.find(Idx);
+      if (I.valid() && I.start() <= Idx)
+        continue;
+      DEBUG(dbgs() << "Kill at " << Idx << " covered by valno #"
+                   << DstVNI->id << " in " << *DstLI << '\n');
+      MachineInstr *CopyMI = LIS.getInstructionFromIndex(DstVNI->def);
+      assert(CopyMI && CopyMI->isCopy() && "Bad copy value");
+      unsigned LocNo = getLocationNo(CopyMI->getOperand(0));
+      I.insert(Idx, Idx.getNextSlot(), LocNo);
+      NewDefs.push_back(std::make_pair(Idx, LocNo));
+      break;
+    }
+  }
+}
+
+void
+UserValue::computeIntervals(MachineRegisterInfo &MRI,
+                            const TargetRegisterInfo &TRI,
+                            LiveIntervals &LIS,
+                            MachineDominatorTree &MDT,
+                            UserValueScopes &UVS) {
+  SmallVector<std::pair<SlotIndex, unsigned>, 16> Defs;
+
+  // Collect all defs to be extended (Skipping undefs).
+  for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I)
+    if (I.value() != ~0u)
+      Defs.push_back(std::make_pair(I.start(), I.value()));
+
+  // Extend all defs, and possibly add new ones along the way.
+  for (unsigned i = 0; i != Defs.size(); ++i) {
+    SlotIndex Idx = Defs[i].first;
+    unsigned LocNo = Defs[i].second;
+    const MachineOperand &Loc = locations[LocNo];
+
+    if (!Loc.isReg()) {
+      extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT, UVS);
+      continue;
+    }
+
+    // Register locations are constrained to where the register value is live.
+    if (TargetRegisterInfo::isVirtualRegister(Loc.getReg())) {
+      LiveInterval *LI = 0;
+      const VNInfo *VNI = 0;
+      if (LIS.hasInterval(Loc.getReg())) {
+        LI = &LIS.getInterval(Loc.getReg());
+        VNI = LI->getVNInfoAt(Idx);
+      }
+      SmallVector<SlotIndex, 16> Kills;
+      extendDef(Idx, LocNo, LI, VNI, &Kills, LIS, MDT, UVS);
+      if (LI)
+        addDefsFromCopies(LI, LocNo, Kills, Defs, MRI, LIS);
+      continue;
+    }
+
+    // For physregs, use the live range of the first regunit as a guide.
+    unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI);
+    LiveInterval *LI = &LIS.getRegUnit(Unit);
+    const VNInfo *VNI = LI->getVNInfoAt(Idx);
+    // Don't track copies from physregs, it is too expensive.
+    extendDef(Idx, LocNo, LI, VNI, 0, LIS, MDT, UVS);
+  }
+
+  // Finally, erase all the undefs.
+  for (LocMap::iterator I = locInts.begin(); I.valid();)
+    if (I.value() == ~0u)
+      I.erase();
+    else
+      ++I;
+}
+
+void LDVImpl::computeIntervals() {
+  for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
+    UserValueScopes UVS(userValues[i]->getDebugLoc(), LS);
+    userValues[i]->computeIntervals(MF->getRegInfo(), *TRI, *LIS, *MDT, UVS);
+    userValues[i]->mapVirtRegs(this);
+  }
+}
+
+bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  LIS = &pass.getAnalysis<LiveIntervals>();
+  MDT = &pass.getAnalysis<MachineDominatorTree>();
+  TRI = mf.getTarget().getRegisterInfo();
+  clear();
+  LS.initialize(mf);
+  DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
+               << mf.getName() << " **********\n");
+
+  bool Changed = collectDebugValues(mf);
+  computeIntervals();
+  DEBUG(print(dbgs()));
+  LS.releaseMemory();
+  ModifiedMF = Changed;
+  return Changed;
+}
+
+bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) {
+  if (!EnableLDV)
+    return false;
+  if (!pImpl)
+    pImpl = new LDVImpl(this);
+  return static_cast<LDVImpl*>(pImpl)->runOnMachineFunction(mf);
+}
+
+void LiveDebugVariables::releaseMemory() {
+  if (pImpl)
+    static_cast<LDVImpl*>(pImpl)->clear();
+}
+
+LiveDebugVariables::~LiveDebugVariables() {
+  if (pImpl)
+    delete static_cast<LDVImpl*>(pImpl);
+}
+
+//===----------------------------------------------------------------------===//
+//                           Live Range Splitting
+//===----------------------------------------------------------------------===//
+
+bool
+UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
+  DEBUG({
+    dbgs() << "Splitting Loc" << OldLocNo << '\t';
+    print(dbgs(), 0);
+  });
+  bool DidChange = false;
+  LocMap::iterator LocMapI;
+  LocMapI.setMap(locInts);
+  for (unsigned i = 0; i != NewRegs.size(); ++i) {
+    LiveInterval *LI = NewRegs[i];
+    if (LI->empty())
+      continue;
+
+    // Don't allocate the new LocNo until it is needed.
+    unsigned NewLocNo = ~0u;
+
+    // Iterate over the overlaps between locInts and LI.
+    LocMapI.find(LI->beginIndex());
+    if (!LocMapI.valid())
+      continue;
+    LiveInterval::iterator LII = LI->advanceTo(LI->begin(), LocMapI.start());
+    LiveInterval::iterator LIE = LI->end();
+    while (LocMapI.valid() && LII != LIE) {
+      // At this point, we know that LocMapI.stop() > LII->start.
+      LII = LI->advanceTo(LII, LocMapI.start());
+      if (LII == LIE)
+        break;
+
+      // Now LII->end > LocMapI.start(). Do we have an overlap?
+      if (LocMapI.value() == OldLocNo && LII->start < LocMapI.stop()) {
+        // Overlapping correct location. Allocate NewLocNo now.
+        if (NewLocNo == ~0u) {
+          MachineOperand MO = MachineOperand::CreateReg(LI->reg, false);
+          MO.setSubReg(locations[OldLocNo].getSubReg());
+          NewLocNo = getLocationNo(MO);
+          DidChange = true;
+        }
+
+        SlotIndex LStart = LocMapI.start();
+        SlotIndex LStop  = LocMapI.stop();
+
+        // Trim LocMapI down to the LII overlap.
+        if (LStart < LII->start)
+          LocMapI.setStartUnchecked(LII->start);
+        if (LStop > LII->end)
+          LocMapI.setStopUnchecked(LII->end);
+
+        // Change the value in the overlap. This may trigger coalescing.
+        LocMapI.setValue(NewLocNo);
+
+        // Re-insert any removed OldLocNo ranges.
+        if (LStart < LocMapI.start()) {
+          LocMapI.insert(LStart, LocMapI.start(), OldLocNo);
+          ++LocMapI;
+          assert(LocMapI.valid() && "Unexpected coalescing");
+        }
+        if (LStop > LocMapI.stop()) {
+          ++LocMapI;
+          LocMapI.insert(LII->end, LStop, OldLocNo);
+          --LocMapI;
+        }
+      }
+
+      // Advance to the next overlap.
+      if (LII->end < LocMapI.stop()) {
+        if (++LII == LIE)
+          break;
+        LocMapI.advanceTo(LII->start);
+      } else {
+        ++LocMapI;
+        if (!LocMapI.valid())
+          break;
+        LII = LI->advanceTo(LII, LocMapI.start());
+      }
+    }
+  }
+
+  // Finally, remove any remaining OldLocNo intervals and OldLocNo itself.
+  locations.erase(locations.begin() + OldLocNo);
+  LocMapI.goToBegin();
+  while (LocMapI.valid()) {
+    unsigned v = LocMapI.value();
+    if (v == OldLocNo) {
+      DEBUG(dbgs() << "Erasing [" << LocMapI.start() << ';'
+                   << LocMapI.stop() << ")\n");
+      LocMapI.erase();
+    } else {
+      if (v > OldLocNo)
+        LocMapI.setValueUnchecked(v-1);
+      ++LocMapI;
+    }
+  }
+
+  DEBUG({dbgs() << "Split result: \t"; print(dbgs(), 0);});
+  return DidChange;
+}
+
+bool
+UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+  bool DidChange = false;
+  // Split locations referring to OldReg. Iterate backwards so splitLocation can
+  // safely erase unused locations.
+  for (unsigned i = locations.size(); i ; --i) {
+    unsigned LocNo = i-1;
+    const MachineOperand *Loc = &locations[LocNo];
+    if (!Loc->isReg() || Loc->getReg() != OldReg)
+      continue;
+    DidChange |= splitLocation(LocNo, NewRegs);
+  }
+  return DidChange;
+}
+
+void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+  bool DidChange = false;
+  for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
+    DidChange |= UV->splitRegister(OldReg, NewRegs);
+
+  if (!DidChange)
+    return;
+
+  // Map all of the new virtual registers.
+  UserValue *UV = lookupVirtReg(OldReg);
+  for (unsigned i = 0; i != NewRegs.size(); ++i)
+    mapVirtReg(NewRegs[i]->reg, UV);
+}
+
+void LiveDebugVariables::
+splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+  if (pImpl)
+    static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs);
+}
+
+void
+UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) {
+  // Iterate over locations in reverse makes it easier to handle coalescing.
+  for (unsigned i = locations.size(); i ; --i) {
+    unsigned LocNo = i-1;
+    MachineOperand &Loc = locations[LocNo];
+    // Only virtual registers are rewritten.
+    if (!Loc.isReg() || !Loc.getReg() ||
+        !TargetRegisterInfo::isVirtualRegister(Loc.getReg()))
+      continue;
+    unsigned VirtReg = Loc.getReg();
+    if (VRM.isAssignedReg(VirtReg) &&
+        TargetRegisterInfo::isPhysicalRegister(VRM.getPhys(VirtReg))) {
+      // This can create a %noreg operand in rare cases when the sub-register
+      // index is no longer available. That means the user value is in a
+      // non-existent sub-register, and %noreg is exactly what we want.
+      Loc.substPhysReg(VRM.getPhys(VirtReg), TRI);
+    } else if (VRM.getStackSlot(VirtReg) != VirtRegMap::NO_STACK_SLOT) {
+      // FIXME: Translate SubIdx to a stackslot offset.
+      Loc = MachineOperand::CreateFI(VRM.getStackSlot(VirtReg));
+    } else {
+      Loc.setReg(0);
+      Loc.setSubReg(0);
+    }
+    coalesceLocation(LocNo);
+  }
+}
+
+/// findInsertLocation - Find an iterator for inserting a DBG_VALUE
+/// instruction.
+static MachineBasicBlock::iterator
+findInsertLocation(MachineBasicBlock *MBB, SlotIndex Idx,
+                   LiveIntervals &LIS) {
+  SlotIndex Start = LIS.getMBBStartIdx(MBB);
+  Idx = Idx.getBaseIndex();
+
+  // Try to find an insert location by going backwards from Idx.
+  MachineInstr *MI;
+  while (!(MI = LIS.getInstructionFromIndex(Idx))) {
+    // We've reached the beginning of MBB.
+    if (Idx == Start) {
+      MachineBasicBlock::iterator I = MBB->SkipPHIsAndLabels(MBB->begin());
+      return I;
+    }
+    Idx = Idx.getPrevIndex();
+  }
+
+  // Don't insert anything after the first terminator, though.
+  return MI->isTerminator() ? MBB->getFirstTerminator() :
+                              llvm::next(MachineBasicBlock::iterator(MI));
+}
+
+DebugLoc UserValue::findDebugLoc() {
+  DebugLoc D = dl;
+  dl = DebugLoc();
+  return D;
+}
+void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx,
+                                 unsigned LocNo,
+                                 LiveIntervals &LIS,
+                                 const TargetInstrInfo &TII) {
+  MachineBasicBlock::iterator I = findInsertLocation(MBB, Idx, LIS);
+  MachineOperand &Loc = locations[LocNo];
+  ++NumInsertedDebugValues;
+
+  // Frame index locations may require a target callback.
+  if (Loc.isFI()) {
+    MachineInstr *MI = TII.emitFrameIndexDebugValue(*MBB->getParent(),
+                                          Loc.getIndex(), offset, variable, 
+                                                    findDebugLoc());
+    if (MI) {
+      MBB->insert(I, MI);
+      return;
+    }
+  }
+  // This is not a frame index, or the target is happy with a standard FI.
+  BuildMI(*MBB, I, findDebugLoc(), TII.get(TargetOpcode::DBG_VALUE))
+    .addOperand(Loc).addImm(offset).addMetadata(variable);
+}
+
+void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
+                                const TargetInstrInfo &TII) {
+  MachineFunction::iterator MFEnd = VRM->getMachineFunction().end();
+
+  for (LocMap::const_iterator I = locInts.begin(); I.valid();) {
+    SlotIndex Start = I.start();
+    SlotIndex Stop = I.stop();
+    unsigned LocNo = I.value();
+    DEBUG(dbgs() << "\t[" << Start << ';' << Stop << "):" << LocNo);
+    MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start);
+    SlotIndex MBBEnd = LIS.getMBBEndIdx(MBB);
+
+    DEBUG(dbgs() << " BB#" << MBB->getNumber() << '-' << MBBEnd);
+    insertDebugValue(MBB, Start, LocNo, LIS, TII);
+    // This interval may span multiple basic blocks.
+    // Insert a DBG_VALUE into each one.
+    while(Stop > MBBEnd) {
+      // Move to the next block.
+      Start = MBBEnd;
+      if (++MBB == MFEnd)
+        break;
+      MBBEnd = LIS.getMBBEndIdx(MBB);
+      DEBUG(dbgs() << " BB#" << MBB->getNumber() << '-' << MBBEnd);
+      insertDebugValue(MBB, Start, LocNo, LIS, TII);
+    }
+    DEBUG(dbgs() << '\n');
+    if (MBB == MFEnd)
+      break;
+
+    ++I;
+  }
+}
+
+void LDVImpl::emitDebugValues(VirtRegMap *VRM) {
+  DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
+    DEBUG(userValues[i]->print(dbgs(), &MF->getTarget()));
+    userValues[i]->rewriteLocations(*VRM, *TRI);
+    userValues[i]->emitDebugValues(VRM, *LIS, *TII);
+  }
+  EmitDone = true;
+}
+
+void LiveDebugVariables::emitDebugValues(VirtRegMap *VRM) {
+  if (pImpl)
+    static_cast<LDVImpl*>(pImpl)->emitDebugValues(VRM);
+}
+
+
+#ifndef NDEBUG
+void LiveDebugVariables::dump() {
+  if (pImpl)
+    static_cast<LDVImpl*>(pImpl)->print(dbgs());
+}
+#endif
+
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h
new file mode 100644
index 000000000000..3ce3c398bd4b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h
@@ -0,0 +1,70 @@
+//===- LiveDebugVariables.h - Tracking debug info variables ----*- c++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface to the LiveDebugVariables analysis.
+//
+// The analysis removes DBG_VALUE instructions for virtual registers and tracks
+// live user variables in a data structure that can be updated during register
+// allocation.
+//
+// After register allocation new DBG_VALUE instructions are emitted to reflect
+// the new locations of user variables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
+#define LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class LiveInterval;
+class VirtRegMap;
+
+class LiveDebugVariables : public MachineFunctionPass {
+  void *pImpl;
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  LiveDebugVariables();
+  ~LiveDebugVariables();
+
+  /// renameRegister - Move any user variables in OldReg to NewReg:SubIdx.
+  /// @param OldReg Old virtual register that is going away.
+  /// @param NewReg New register holding the user variables.
+  /// @param SubIdx If NewReg is a virtual register, SubIdx may indicate a sub-
+  ///               register.
+  void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx);
+
+  /// splitRegister - Move any user variables in OldReg to the live ranges in
+  /// NewRegs where they are live. Mark the values as unavailable where no new
+  /// register is live.
+  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+
+  /// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes
+  /// that happened during register allocation.
+  /// @param VRM Rename virtual registers according to map.
+  void emitDebugValues(VirtRegMap *VRM);
+
+  /// dump - Print data structures to dbgs().
+  void dump();
+
+private:
+
+  virtual bool runOnMachineFunction(MachineFunction &);
+  virtual void releaseMemory();
+  virtual void getAnalysisUsage(AnalysisUsage &) const;
+
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_LIVEDEBUGVARIABLES_H
diff --git a/contrib/llvm/lib/CodeGen/LiveInterval.cpp b/contrib/llvm/lib/CodeGen/LiveInterval.cpp
new file mode 100644
index 000000000000..dccd847d070c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveInterval.cpp
@@ -0,0 +1,951 @@
+//===-- LiveInterval.cpp - Live Interval Representation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveRange and LiveInterval classes.  Given some
+// numbering of each the machine instructions an interval [i, j) is said to be a
+// live interval for register v if there is no instruction with number j' > j
+// such that v is live at j' and there is no instruction with number i' < i such
+// that v is live at i'. In this implementation intervals can have holes,
+// i.e. an interval might look like [1,20), [50,65), [1000,1001).  Each
+// individual range is represented as an instance of LiveRange, and the whole
+// interval is represented as an instance of LiveInterval.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+LiveInterval::iterator LiveInterval::find(SlotIndex Pos) {
+  // This algorithm is basically std::upper_bound.
+  // Unfortunately, std::upper_bound cannot be used with mixed types until we
+  // adopt C++0x. Many libraries can do it, but not all.
+  if (empty() || Pos >= endIndex())
+    return end();
+  iterator I = begin();
+  size_t Len = ranges.size();
+  do {
+    size_t Mid = Len >> 1;
+    if (Pos < I[Mid].end)
+      Len = Mid;
+    else
+      I += Mid + 1, Len -= Mid + 1;
+  } while (Len);
+  return I;
+}
+
+VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
+                                    VNInfo::Allocator &VNInfoAllocator) {
+  assert(!Def.isDead() && "Cannot define a value at the dead slot");
+  iterator I = find(Def);
+  if (I == end()) {
+    VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
+    ranges.push_back(LiveRange(Def, Def.getDeadSlot(), VNI));
+    return VNI;
+  }
+  if (SlotIndex::isSameInstr(Def, I->start)) {
+    assert(I->valno->def == I->start && "Inconsistent existing value def");
+
+    // It is possible to have both normal and early-clobber defs of the same
+    // register on an instruction. It doesn't make a lot of sense, but it is
+    // possible to specify in inline assembly.
+    //
+    // Just convert everything to early-clobber.
+    Def = std::min(Def, I->start);
+    if (Def != I->start)
+      I->start = I->valno->def = Def;
+    return I->valno;
+  }
+  assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def");
+  VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
+  ranges.insert(I, LiveRange(Def, Def.getDeadSlot(), VNI));
+  return VNI;
+}
+
+// overlaps - Return true if the intersection of the two live intervals is
+// not empty.
+//
+// An example for overlaps():
+//
+// 0: A = ...
+// 4: B = ...
+// 8: C = A + B ;; last use of A
+//
+// The live intervals should look like:
+//
+// A = [3, 11)
+// B = [7, x)
+// C = [11, y)
+//
+// A->overlaps(C) should return false since we want to be able to join
+// A and C.
+//
+bool LiveInterval::overlapsFrom(const LiveInterval& other,
+                                const_iterator StartPos) const {
+  assert(!empty() && "empty interval");
+  const_iterator i = begin();
+  const_iterator ie = end();
+  const_iterator j = StartPos;
+  const_iterator je = other.end();
+
+  assert((StartPos->start <= i->start || StartPos == other.begin()) &&
+         StartPos != other.end() && "Bogus start position hint!");
+
+  if (i->start < j->start) {
+    i = std::upper_bound(i, ie, j->start);
+    if (i != ranges.begin()) --i;
+  } else if (j->start < i->start) {
+    ++StartPos;
+    if (StartPos != other.end() && StartPos->start <= i->start) {
+      assert(StartPos < other.end() && i < end());
+      j = std::upper_bound(j, je, i->start);
+      if (j != other.ranges.begin()) --j;
+    }
+  } else {
+    return true;
+  }
+
+  if (j == je) return false;
+
+  while (i != ie) {
+    if (i->start > j->start) {
+      std::swap(i, j);
+      std::swap(ie, je);
+    }
+
+    if (i->end > j->start)
+      return true;
+    ++i;
+  }
+
+  return false;
+}
+
+bool LiveInterval::overlaps(const LiveInterval &Other,
+                            const CoalescerPair &CP,
+                            const SlotIndexes &Indexes) const {
+  assert(!empty() && "empty interval");
+  if (Other.empty())
+    return false;
+
+  // Use binary searches to find initial positions.
+  const_iterator I = find(Other.beginIndex());
+  const_iterator IE = end();
+  if (I == IE)
+    return false;
+  const_iterator J = Other.find(I->start);
+  const_iterator JE = Other.end();
+  if (J == JE)
+    return false;
+
+  for (;;) {
+    // J has just been advanced to satisfy:
+    assert(J->end >= I->start);
+    // Check for an overlap.
+    if (J->start < I->end) {
+      // I and J are overlapping. Find the later start.
+      SlotIndex Def = std::max(I->start, J->start);
+      // Allow the overlap if Def is a coalescable copy.
+      if (Def.isBlock() ||
+          !CP.isCoalescable(Indexes.getInstructionFromIndex(Def)))
+        return true;
+    }
+    // Advance the iterator that ends first to check for more overlaps.
+    if (J->end > I->end) {
+      std::swap(I, J);
+      std::swap(IE, JE);
+    }
+    // Advance J until J->end >= I->start.
+    do
+      if (++J == JE)
+        return false;
+    while (J->end < I->start);
+  }
+}
+
+/// overlaps - Return true if the live interval overlaps a range specified
+/// by [Start, End).
+bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
+  assert(Start < End && "Invalid range");
+  const_iterator I = std::lower_bound(begin(), end(), End);
+  return I != begin() && (--I)->end > Start;
+}
+
+
+/// ValNo is dead, remove it.  If it is the largest value number, just nuke it
+/// (and any other deleted values neighboring it), otherwise mark it as ~1U so
+/// it can be nuked later.
+void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
+  if (ValNo->id == getNumValNums()-1) {
+    do {
+      valnos.pop_back();
+    } while (!valnos.empty() && valnos.back()->isUnused());
+  } else {
+    ValNo->markUnused();
+  }
+}
+
+/// RenumberValues - Renumber all values in order of appearance and delete the
+/// remaining unused values.
+void LiveInterval::RenumberValues(LiveIntervals &lis) {
+  SmallPtrSet<VNInfo*, 8> Seen;
+  valnos.clear();
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    VNInfo *VNI = I->valno;
+    if (!Seen.insert(VNI))
+      continue;
+    assert(!VNI->isUnused() && "Unused valno used by live range");
+    VNI->id = (unsigned)valnos.size();
+    valnos.push_back(VNI);
+  }
+}
+
+/// extendIntervalEndTo - This method is used when we want to extend the range
+/// specified by I to end at the specified endpoint.  To do this, we should
+/// merge and eliminate all ranges that this will overlap with.  The iterator is
+/// not invalidated.
+void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) {
+  assert(I != ranges.end() && "Not a valid interval!");
+  VNInfo *ValNo = I->valno;
+
+  // Search for the first interval that we can't merge with.
+  Ranges::iterator MergeTo = llvm::next(I);
+  for (; MergeTo != ranges.end() && NewEnd >= MergeTo->end; ++MergeTo) {
+    assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+  }
+
+  // If NewEnd was in the middle of an interval, make sure to get its endpoint.
+  I->end = std::max(NewEnd, prior(MergeTo)->end);
+
+  // If the newly formed range now touches the range after it and if they have
+  // the same value number, merge the two ranges into one range.
+  if (MergeTo != ranges.end() && MergeTo->start <= I->end &&
+      MergeTo->valno == ValNo) {
+    I->end = MergeTo->end;
+    ++MergeTo;
+  }
+
+  // Erase any dead ranges.
+  ranges.erase(llvm::next(I), MergeTo);
+}
+
+
+/// extendIntervalStartTo - This method is used when we want to extend the range
+/// specified by I to start at the specified endpoint.  To do this, we should
+/// merge and eliminate all ranges that this will overlap with.
+LiveInterval::Ranges::iterator
+LiveInterval::extendIntervalStartTo(Ranges::iterator I, SlotIndex NewStart) {
+  assert(I != ranges.end() && "Not a valid interval!");
+  VNInfo *ValNo = I->valno;
+
+  // Search for the first interval that we can't merge with.
+  Ranges::iterator MergeTo = I;
+  do {
+    if (MergeTo == ranges.begin()) {
+      I->start = NewStart;
+      ranges.erase(MergeTo, I);
+      return I;
+    }
+    assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+    --MergeTo;
+  } while (NewStart <= MergeTo->start);
+
+  // If we start in the middle of another interval, just delete a range and
+  // extend that interval.
+  if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
+    MergeTo->end = I->end;
+  } else {
+    // Otherwise, extend the interval right after.
+    ++MergeTo;
+    MergeTo->start = NewStart;
+    MergeTo->end = I->end;
+  }
+
+  ranges.erase(llvm::next(MergeTo), llvm::next(I));
+  return MergeTo;
+}
+
+LiveInterval::iterator
+LiveInterval::addRangeFrom(LiveRange LR, iterator From) {
+  SlotIndex Start = LR.start, End = LR.end;
+  iterator it = std::upper_bound(From, ranges.end(), Start);
+
+  // If the inserted interval starts in the middle or right at the end of
+  // another interval, just extend that interval to contain the range of LR.
+  if (it != ranges.begin()) {
+    iterator B = prior(it);
+    if (LR.valno == B->valno) {
+      if (B->start <= Start && B->end >= Start) {
+        extendIntervalEndTo(B, End);
+        return B;
+      }
+    } else {
+      // Check to make sure that we are not overlapping two live ranges with
+      // different valno's.
+      assert(B->end <= Start &&
+             "Cannot overlap two LiveRanges with differing ValID's"
+             " (did you def the same reg twice in a MachineInstr?)");
+    }
+  }
+
+  // Otherwise, if this range ends in the middle of, or right next to, another
+  // interval, merge it into that interval.
+  if (it != ranges.end()) {
+    if (LR.valno == it->valno) {
+      if (it->start <= End) {
+        it = extendIntervalStartTo(it, Start);
+
+        // If LR is a complete superset of an interval, we may need to grow its
+        // endpoint as well.
+        if (End > it->end)
+          extendIntervalEndTo(it, End);
+        return it;
+      }
+    } else {
+      // Check to make sure that we are not overlapping two live ranges with
+      // different valno's.
+      assert(it->start >= End &&
+             "Cannot overlap two LiveRanges with differing ValID's");
+    }
+  }
+
+  // Otherwise, this is just a new range that doesn't interact with anything.
+  // Insert it.
+  return ranges.insert(it, LR);
+}
+
+/// extendInBlock - If this interval is live before Kill in the basic
+/// block that starts at StartIdx, extend it to be live up to Kill and return
+/// the value. If there is no live range before Kill, return NULL.
+VNInfo *LiveInterval::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
+  if (empty())
+    return 0;
+  iterator I = std::upper_bound(begin(), end(), Kill.getPrevSlot());
+  if (I == begin())
+    return 0;
+  --I;
+  if (I->end <= StartIdx)
+    return 0;
+  if (I->end < Kill)
+    extendIntervalEndTo(I, Kill);
+  return I->valno;
+}
+
+/// removeRange - Remove the specified range from this interval.  Note that
+/// the range must be in a single LiveRange in its entirety.
+void LiveInterval::removeRange(SlotIndex Start, SlotIndex End,
+                               bool RemoveDeadValNo) {
+  // Find the LiveRange containing this span.
+  Ranges::iterator I = find(Start);
+  assert(I != ranges.end() && "Range is not in interval!");
+  assert(I->containsRange(Start, End) && "Range is not entirely in interval!");
+
+  // If the span we are removing is at the start of the LiveRange, adjust it.
+  VNInfo *ValNo = I->valno;
+  if (I->start == Start) {
+    if (I->end == End) {
+      if (RemoveDeadValNo) {
+        // Check if val# is dead.
+        bool isDead = true;
+        for (const_iterator II = begin(), EE = end(); II != EE; ++II)
+          if (II != I && II->valno == ValNo) {
+            isDead = false;
+            break;
+          }
+        if (isDead) {
+          // Now that ValNo is dead, remove it.
+          markValNoForDeletion(ValNo);
+        }
+      }
+
+      ranges.erase(I);  // Removed the whole LiveRange.
+    } else
+      I->start = End;
+    return;
+  }
+
+  // Otherwise if the span we are removing is at the end of the LiveRange,
+  // adjust the other way.
+  if (I->end == End) {
+    I->end = Start;
+    return;
+  }
+
+  // Otherwise, we are splitting the LiveRange into two pieces.
+  SlotIndex OldEnd = I->end;
+  I->end = Start;   // Trim the old interval.
+
+  // Insert the new one.
+  ranges.insert(llvm::next(I), LiveRange(End, OldEnd, ValNo));
+}
+
+/// removeValNo - Remove all the ranges defined by the specified value#.
+/// Also remove the value# from value# list.
+void LiveInterval::removeValNo(VNInfo *ValNo) {
+  if (empty()) return;
+  Ranges::iterator I = ranges.end();
+  Ranges::iterator E = ranges.begin();
+  do {
+    --I;
+    if (I->valno == ValNo)
+      ranges.erase(I);
+  } while (I != E);
+  // Now that ValNo is dead, remove it.
+  markValNoForDeletion(ValNo);
+}
+
+/// join - Join two live intervals (this, and other) together.  This applies
+/// mappings to the value numbers in the LHS/RHS intervals as specified.  If
+/// the intervals are not joinable, this aborts.
+void LiveInterval::join(LiveInterval &Other,
+                        const int *LHSValNoAssignments,
+                        const int *RHSValNoAssignments,
+                        SmallVector<VNInfo*, 16> &NewVNInfo,
+                        MachineRegisterInfo *MRI) {
+  verify();
+
+  // Determine if any of our live range values are mapped.  This is uncommon, so
+  // we want to avoid the interval scan if not.
+  bool MustMapCurValNos = false;
+  unsigned NumVals = getNumValNums();
+  unsigned NumNewVals = NewVNInfo.size();
+  for (unsigned i = 0; i != NumVals; ++i) {
+    unsigned LHSValID = LHSValNoAssignments[i];
+    if (i != LHSValID ||
+        (NewVNInfo[LHSValID] && NewVNInfo[LHSValID] != getValNumInfo(i))) {
+      MustMapCurValNos = true;
+      break;
+    }
+  }
+
+  // If we have to apply a mapping to our base interval assignment, rewrite it
+  // now.
+  if (MustMapCurValNos && !empty()) {
+    // Map the first live range.
+
+    iterator OutIt = begin();
+    OutIt->valno = NewVNInfo[LHSValNoAssignments[OutIt->valno->id]];
+    for (iterator I = llvm::next(OutIt), E = end(); I != E; ++I) {
+      VNInfo* nextValNo = NewVNInfo[LHSValNoAssignments[I->valno->id]];
+      assert(nextValNo != 0 && "Huh?");
+
+      // If this live range has the same value # as its immediate predecessor,
+      // and if they are neighbors, remove one LiveRange.  This happens when we
+      // have [0,4:0)[4,7:1) and map 0/1 onto the same value #.
+      if (OutIt->valno == nextValNo && OutIt->end == I->start) {
+        OutIt->end = I->end;
+      } else {
+        // Didn't merge. Move OutIt to the next interval,
+        ++OutIt;
+        OutIt->valno = nextValNo;
+        if (OutIt != I) {
+          OutIt->start = I->start;
+          OutIt->end = I->end;
+        }
+      }
+    }
+    // If we merge some live ranges, chop off the end.
+    ++OutIt;
+    ranges.erase(OutIt, end());
+  }
+
+  // Rewrite Other values before changing the VNInfo ids.
+  // This can leave Other in an invalid state because we're not coalescing
+  // touching segments that now have identical values. That's OK since Other is
+  // not supposed to be valid after calling join();
+  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
+    I->valno = NewVNInfo[RHSValNoAssignments[I->valno->id]];
+
+  // Update val# info. Renumber them and make sure they all belong to this
+  // LiveInterval now. Also remove dead val#'s.
+  unsigned NumValNos = 0;
+  for (unsigned i = 0; i < NumNewVals; ++i) {
+    VNInfo *VNI = NewVNInfo[i];
+    if (VNI) {
+      if (NumValNos >= NumVals)
+        valnos.push_back(VNI);
+      else
+        valnos[NumValNos] = VNI;
+      VNI->id = NumValNos++;  // Renumber val#.
+    }
+  }
+  if (NumNewVals < NumVals)
+    valnos.resize(NumNewVals);  // shrinkify
+
+  // Okay, now insert the RHS live ranges into the LHS.
+  LiveRangeUpdater Updater(this);
+  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
+    Updater.add(*I);
+}
+
+/// MergeRangesInAsValue - Merge all of the intervals in RHS into this live
+/// interval as the specified value number.  The LiveRanges in RHS are
+/// allowed to overlap with LiveRanges in the current interval, but only if
+/// the overlapping LiveRanges have the specified value number.
+void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS,
+                                        VNInfo *LHSValNo) {
+  LiveRangeUpdater Updater(this);
+  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
+    Updater.add(I->start, I->end, LHSValNo);
+}
+
+/// MergeValueInAsValue - Merge all of the live ranges of a specific val#
+/// in RHS into this live interval as the specified value number.
+/// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
+/// current interval, it will replace the value numbers of the overlaped
+/// live ranges with the specified value number.
+void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
+                                       const VNInfo *RHSValNo,
+                                       VNInfo *LHSValNo) {
+  LiveRangeUpdater Updater(this);
+  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
+    if (I->valno == RHSValNo)
+      Updater.add(I->start, I->end, LHSValNo);
+}
+
+/// MergeValueNumberInto - This method is called when two value nubmers
+/// are found to be equivalent.  This eliminates V1, replacing all
+/// LiveRanges with the V1 value number with the V2 value number.  This can
+/// cause merging of V1/V2 values numbers and compaction of the value space.
+VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
+  assert(V1 != V2 && "Identical value#'s are always equivalent!");
+
+  // This code actually merges the (numerically) larger value number into the
+  // smaller value number, which is likely to allow us to compactify the value
+  // space.  The only thing we have to be careful of is to preserve the
+  // instruction that defines the result value.
+
+  // Make sure V2 is smaller than V1.
+  if (V1->id < V2->id) {
+    V1->copyFrom(*V2);
+    std::swap(V1, V2);
+  }
+
+  // Merge V1 live ranges into V2.
+  for (iterator I = begin(); I != end(); ) {
+    iterator LR = I++;
+    if (LR->valno != V1) continue;  // Not a V1 LiveRange.
+
+    // Okay, we found a V1 live range.  If it had a previous, touching, V2 live
+    // range, extend it.
+    if (LR != begin()) {
+      iterator Prev = LR-1;
+      if (Prev->valno == V2 && Prev->end == LR->start) {
+        Prev->end = LR->end;
+
+        // Erase this live-range.
+        ranges.erase(LR);
+        I = Prev+1;
+        LR = Prev;
+      }
+    }
+
+    // Okay, now we have a V1 or V2 live range that is maximally merged forward.
+    // Ensure that it is a V2 live-range.
+    LR->valno = V2;
+
+    // If we can merge it into later V2 live ranges, do so now.  We ignore any
+    // following V1 live ranges, as they will be merged in subsequent iterations
+    // of the loop.
+    if (I != end()) {
+      if (I->start == LR->end && I->valno == V2) {
+        LR->end = I->end;
+        ranges.erase(I);
+        I = LR+1;
+      }
+    }
+  }
+
+  // Now that V1 is dead, remove it.
+  markValNoForDeletion(V1);
+
+  return V2;
+}
+
+unsigned LiveInterval::getSize() const {
+  unsigned Sum = 0;
+  for (const_iterator I = begin(), E = end(); I != E; ++I)
+    Sum += I->start.distance(I->end);
+  return Sum;
+}
+
+raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) {
+  return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LiveRange::dump() const {
+  dbgs() << *this << "\n";
+}
+#endif
+
+void LiveInterval::print(raw_ostream &OS) const {
+  if (empty())
+    OS << "EMPTY";
+  else {
+    for (LiveInterval::Ranges::const_iterator I = ranges.begin(),
+           E = ranges.end(); I != E; ++I) {
+      OS << *I;
+      assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo");
+    }
+  }
+
+  // Print value number info.
+  if (getNumValNums()) {
+    OS << "  ";
+    unsigned vnum = 0;
+    for (const_vni_iterator i = vni_begin(), e = vni_end(); i != e;
+         ++i, ++vnum) {
+      const VNInfo *vni = *i;
+      if (vnum) OS << " ";
+      OS << vnum << "@";
+      if (vni->isUnused()) {
+        OS << "x";
+      } else {
+        OS << vni->def;
+        if (vni->isPHIDef())
+          OS << "-phi";
+      }
+    }
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LiveInterval::dump() const {
+  dbgs() << *this << "\n";
+}
+#endif
+
+#ifndef NDEBUG
+void LiveInterval::verify() const {
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    assert(I->start.isValid());
+    assert(I->end.isValid());
+    assert(I->start < I->end);
+    assert(I->valno != 0);
+    assert(I->valno == valnos[I->valno->id]);
+    if (llvm::next(I) != E) {
+      assert(I->end <= llvm::next(I)->start);
+      if (I->end == llvm::next(I)->start)
+        assert(I->valno != llvm::next(I)->valno);
+    }
+  }
+}
+#endif
+
+
+void LiveRange::print(raw_ostream &os) const {
+  os << *this;
+}
+
+//===----------------------------------------------------------------------===//
+//                           LiveRangeUpdater class
+//===----------------------------------------------------------------------===//
+//
+// The LiveRangeUpdater class always maintains these invariants:
+//
+// - When LastStart is invalid, Spills is empty and the iterators are invalid.
+//   This is the initial state, and the state created by flush().
+//   In this state, isDirty() returns false.
+//
+// Otherwise, segments are kept in three separate areas:
+//
+// 1. [begin; WriteI) at the front of LI.
+// 2. [ReadI; end) at the back of LI.
+// 3. Spills.
+//
+// - LI.begin() <= WriteI <= ReadI <= LI.end().
+// - Segments in all three areas are fully ordered and coalesced.
+// - Segments in area 1 precede and can't coalesce with segments in area 2.
+// - Segments in Spills precede and can't coalesce with segments in area 2.
+// - No coalescing is possible between segments in Spills and segments in area
+//   1, and there are no overlapping segments.
+//
+// The segments in Spills are not ordered with respect to the segments in area
+// 1. They need to be merged.
+//
+// When they exist, Spills.back().start <= LastStart,
+//                 and WriteI[-1].start <= LastStart.
+
+void LiveRangeUpdater::print(raw_ostream &OS) const {
+  if (!isDirty()) {
+    if (LI)
+      OS << "Clean " << PrintReg(LI->reg) << " updater: " << *LI << '\n';
+    else
+      OS << "Null updater.\n";
+    return;
+  }
+  assert(LI && "Can't have null LI in dirty updater.");
+  OS << PrintReg(LI->reg) << " updater with gap = " << (ReadI - WriteI)
+     << ", last start = " << LastStart
+     << ":\n  Area 1:";
+  for (LiveInterval::const_iterator I = LI->begin(); I != WriteI; ++I)
+    OS << ' ' << *I;
+  OS << "\n  Spills:";
+  for (unsigned I = 0, E = Spills.size(); I != E; ++I)
+    OS << ' ' << Spills[I];
+  OS << "\n  Area 2:";
+  for (LiveInterval::const_iterator I = ReadI, E = LI->end(); I != E; ++I)
+    OS << ' ' << *I;
+  OS << '\n';
+}
+
+void LiveRangeUpdater::dump() const
+{
+  print(errs());
+}
+
+// Determine if A and B should be coalesced.
+static inline bool coalescable(const LiveRange &A, const LiveRange &B) {
+  assert(A.start <= B.start && "Unordered live ranges.");
+  if (A.end == B.start)
+    return A.valno == B.valno;
+  if (A.end < B.start)
+    return false;
+  assert(A.valno == B.valno && "Cannot overlap different values");
+  return true;
+}
+
+void LiveRangeUpdater::add(LiveRange Seg) {
+  assert(LI && "Cannot add to a null destination");
+
+  // Flush the state if Start moves backwards.
+  if (!LastStart.isValid() || LastStart > Seg.start) {
+    if (isDirty())
+      flush();
+    // This brings us to an uninitialized state. Reinitialize.
+    assert(Spills.empty() && "Leftover spilled segments");
+    WriteI = ReadI = LI->begin();
+  }
+
+  // Remember start for next time.
+  LastStart = Seg.start;
+
+  // Advance ReadI until it ends after Seg.start.
+  LiveInterval::iterator E = LI->end();
+  if (ReadI != E && ReadI->end <= Seg.start) {
+    // First try to close the gap between WriteI and ReadI with spills.
+    if (ReadI != WriteI)
+      mergeSpills();
+    // Then advance ReadI.
+    if (ReadI == WriteI)
+      ReadI = WriteI = LI->find(Seg.start);
+    else
+      while (ReadI != E && ReadI->end <= Seg.start)
+        *WriteI++ = *ReadI++;
+  }
+
+  assert(ReadI == E || ReadI->end > Seg.start);
+
+  // Check if the ReadI segment begins early.
+  if (ReadI != E && ReadI->start <= Seg.start) {
+    assert(ReadI->valno == Seg.valno && "Cannot overlap different values");
+    // Bail if Seg is completely contained in ReadI.
+    if (ReadI->end >= Seg.end)
+      return;
+    // Coalesce into Seg.
+    Seg.start = ReadI->start;
+    ++ReadI;
+  }
+
+  // Coalesce as much as possible from ReadI into Seg.
+  while (ReadI != E && coalescable(Seg, *ReadI)) {
+    Seg.end = std::max(Seg.end, ReadI->end);
+    ++ReadI;
+  }
+
+  // Try coalescing Spills.back() into Seg.
+  if (!Spills.empty() && coalescable(Spills.back(), Seg)) {
+    Seg.start = Spills.back().start;
+    Seg.end = std::max(Spills.back().end, Seg.end);
+    Spills.pop_back();
+  }
+
+  // Try coalescing Seg into WriteI[-1].
+  if (WriteI != LI->begin() && coalescable(WriteI[-1], Seg)) {
+    WriteI[-1].end = std::max(WriteI[-1].end, Seg.end);
+    return;
+  }
+
+  // Seg doesn't coalesce with anything, and needs to be inserted somewhere.
+  if (WriteI != ReadI) {
+    *WriteI++ = Seg;
+    return;
+  }
+
+  // Finally, append to LI or Spills.
+  if (WriteI == E) {
+    LI->ranges.push_back(Seg);
+    WriteI = ReadI = LI->ranges.end();
+  } else
+    Spills.push_back(Seg);
+}
+
+// Merge as many spilled segments as possible into the gap between WriteI
+// and ReadI. Advance WriteI to reflect the inserted instructions.
+void LiveRangeUpdater::mergeSpills() {
+  // Perform a backwards merge of Spills and [SpillI;WriteI).
+  size_t GapSize = ReadI - WriteI;
+  size_t NumMoved = std::min(Spills.size(), GapSize);
+  LiveInterval::iterator Src = WriteI;
+  LiveInterval::iterator Dst = Src + NumMoved;
+  LiveInterval::iterator SpillSrc = Spills.end();
+  LiveInterval::iterator B = LI->begin();
+
+  // This is the new WriteI position after merging spills.
+  WriteI = Dst;
+
+  // Now merge Src and Spills backwards.
+  while (Src != Dst) {
+    if (Src != B && Src[-1].start > SpillSrc[-1].start)
+      *--Dst = *--Src;
+    else
+      *--Dst = *--SpillSrc;
+  }
+  assert(NumMoved == size_t(Spills.end() - SpillSrc));
+  Spills.erase(SpillSrc, Spills.end());
+}
+
+void LiveRangeUpdater::flush() {
+  if (!isDirty())
+    return;
+  // Clear the dirty state.
+  LastStart = SlotIndex();
+
+  assert(LI && "Cannot add to a null destination");
+
+  // Nothing to merge?
+  if (Spills.empty()) {
+    LI->ranges.erase(WriteI, ReadI);
+    LI->verify();
+    return;
+  }
+
+  // Resize the WriteI - ReadI gap to match Spills.
+  size_t GapSize = ReadI - WriteI;
+  if (GapSize < Spills.size()) {
+    // The gap is too small. Make some room.
+    size_t WritePos = WriteI - LI->begin();
+    LI->ranges.insert(ReadI, Spills.size() - GapSize, LiveRange());
+    // This also invalidated ReadI, but it is recomputed below.
+    WriteI = LI->ranges.begin() + WritePos;
+  } else {
+    // Shrink the gap if necessary.
+    LI->ranges.erase(WriteI + Spills.size(), ReadI);
+  }
+  ReadI = WriteI + Spills.size();
+  mergeSpills();
+  LI->verify();
+}
+
+unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) {
+  // Create initial equivalence classes.
+  EqClass.clear();
+  EqClass.grow(LI->getNumValNums());
+
+  const VNInfo *used = 0, *unused = 0;
+
+  // Determine connections.
+  for (LiveInterval::const_vni_iterator I = LI->vni_begin(), E = LI->vni_end();
+       I != E; ++I) {
+    const VNInfo *VNI = *I;
+    // Group all unused values into one class.
+    if (VNI->isUnused()) {
+      if (unused)
+        EqClass.join(unused->id, VNI->id);
+      unused = VNI;
+      continue;
+    }
+    used = VNI;
+    if (VNI->isPHIDef()) {
+      const MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def);
+      assert(MBB && "Phi-def has no defining MBB");
+      // Connect to values live out of predecessors.
+      for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+           PE = MBB->pred_end(); PI != PE; ++PI)
+        if (const VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(*PI)))
+          EqClass.join(VNI->id, PVNI->id);
+    } else {
+      // Normal value defined by an instruction. Check for two-addr redef.
+      // FIXME: This could be coincidental. Should we really check for a tied
+      // operand constraint?
+      // Note that VNI->def may be a use slot for an early clobber def.
+      if (const VNInfo *UVNI = LI->getVNInfoBefore(VNI->def))
+        EqClass.join(VNI->id, UVNI->id);
+    }
+  }
+
+  // Lump all the unused values in with the last used value.
+  if (used && unused)
+    EqClass.join(used->id, unused->id);
+
+  EqClass.compress();
+  return EqClass.getNumClasses();
+}
+
+void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
+                                          MachineRegisterInfo &MRI) {
+  assert(LIV[0] && "LIV[0] must be set");
+  LiveInterval &LI = *LIV[0];
+
+  // Rewrite instructions.
+  for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(LI.reg),
+       RE = MRI.reg_end(); RI != RE;) {
+    MachineOperand &MO = RI.getOperand();
+    MachineInstr *MI = MO.getParent();
+    ++RI;
+    // DBG_VALUE instructions should have been eliminated earlier.
+    LiveRangeQuery LRQ(LI, LIS.getInstructionIndex(MI));
+    const VNInfo *VNI = MO.readsReg() ? LRQ.valueIn() : LRQ.valueDefined();
+    // In the case of an <undef> use that isn't tied to any def, VNI will be
+    // NULL. If the use is tied to a def, VNI will be the defined value.
+    if (!VNI)
+      continue;
+    MO.setReg(LIV[getEqClass(VNI)]->reg);
+  }
+
+  // Move runs to new intervals.
+  LiveInterval::iterator J = LI.begin(), E = LI.end();
+  while (J != E && EqClass[J->valno->id] == 0)
+    ++J;
+  for (LiveInterval::iterator I = J; I != E; ++I) {
+    if (unsigned eq = EqClass[I->valno->id]) {
+      assert((LIV[eq]->empty() || LIV[eq]->expiredAt(I->start)) &&
+             "New intervals should be empty");
+      LIV[eq]->ranges.push_back(*I);
+    } else
+      *J++ = *I;
+  }
+  LI.ranges.erase(J, E);
+
+  // Transfer VNInfos to their new owners and renumber them.
+  unsigned j = 0, e = LI.getNumValNums();
+  while (j != e && EqClass[j] == 0)
+    ++j;
+  for (unsigned i = j; i != e; ++i) {
+    VNInfo *VNI = LI.getValNumInfo(i);
+    if (unsigned eq = EqClass[i]) {
+      VNI->id = LIV[eq]->getNumValNums();
+      LIV[eq]->valnos.push_back(VNI);
+    } else {
+      VNI->id = j;
+      LI.valnos[j++] = VNI;
+    }
+  }
+  LI.valnos.resize(j);
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp
new file mode 100644
index 000000000000..f1b839481131
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -0,0 +1,1172 @@
+//===-- LiveIntervalAnalysis.cpp - Live Interval Analysis -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveInterval analysis pass which is used
+// by the Linear Scan Register allocator. This pass linearizes the
+// basic blocks of the function in DFS order and uses the
+// LiveVariables pass to conservatively compute live intervals for
+// each virtual and physical register.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "LiveRangeCalc.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+using namespace llvm;
+
+char LiveIntervals::ID = 0;
+char &llvm::LiveIntervalsID = LiveIntervals::ID;
+INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals",
+                "Live Interval Analysis", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(LiveIntervals, "liveintervals",
+                "Live Interval Analysis", false, false)
+
+void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<AliasAnalysis>();
+  AU.addPreserved<AliasAnalysis>();
+  // LiveVariables isn't really required by this analysis, it is only required
+  // here to make sure it is live during TwoAddressInstructionPass and
+  // PHIElimination. This is temporary.
+  AU.addRequired<LiveVariables>();
+  AU.addPreserved<LiveVariables>();
+  AU.addPreservedID(MachineLoopInfoID);
+  AU.addRequiredTransitiveID(MachineDominatorsID);
+  AU.addPreservedID(MachineDominatorsID);
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequiredTransitive<SlotIndexes>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+LiveIntervals::LiveIntervals() : MachineFunctionPass(ID),
+  DomTree(0), LRCalc(0) {
+  initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+}
+
+LiveIntervals::~LiveIntervals() {
+  delete LRCalc;
+}
+
+void LiveIntervals::releaseMemory() {
+  // Free the live intervals themselves.
+  for (unsigned i = 0, e = VirtRegIntervals.size(); i != e; ++i)
+    delete VirtRegIntervals[TargetRegisterInfo::index2VirtReg(i)];
+  VirtRegIntervals.clear();
+  RegMaskSlots.clear();
+  RegMaskBits.clear();
+  RegMaskBlocks.clear();
+
+  for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
+    delete RegUnitIntervals[i];
+  RegUnitIntervals.clear();
+
+  // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
+  VNInfoAllocator.Reset();
+}
+
+/// runOnMachineFunction - Register allocate the whole function
+///
+bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
+  MF = &fn;
+  MRI = &MF->getRegInfo();
+  TM = &fn.getTarget();
+  TRI = TM->getRegisterInfo();
+  TII = TM->getInstrInfo();
+  AA = &getAnalysis<AliasAnalysis>();
+  Indexes = &getAnalysis<SlotIndexes>();
+  DomTree = &getAnalysis<MachineDominatorTree>();
+  if (!LRCalc)
+    LRCalc = new LiveRangeCalc();
+
+  // Allocate space for all virtual registers.
+  VirtRegIntervals.resize(MRI->getNumVirtRegs());
+
+  computeVirtRegs();
+  computeRegMasks();
+  computeLiveInRegUnits();
+
+  DEBUG(dump());
+  return true;
+}
+
+/// print - Implement the dump method.
+void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
+  OS << "********** INTERVALS **********\n";
+
+  // Dump the regunits.
+  for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
+    if (LiveInterval *LI = RegUnitIntervals[i])
+      OS << PrintRegUnit(i, TRI) << " = " << *LI << '\n';
+
+  // Dump the virtregs.
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (hasInterval(Reg))
+      OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n';
+  }
+
+  OS << "RegMasks:";
+  for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i)
+    OS << ' ' << RegMaskSlots[i];
+  OS << '\n';
+
+  printInstrs(OS);
+}
+
+void LiveIntervals::printInstrs(raw_ostream &OS) const {
+  OS << "********** MACHINEINSTRS **********\n";
+  MF->print(OS, Indexes);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LiveIntervals::dumpInstrs() const {
+  printInstrs(dbgs());
+}
+#endif
+
+LiveInterval* LiveIntervals::createInterval(unsigned reg) {
+  float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? HUGE_VALF : 0.0F;
+  return new LiveInterval(reg, Weight);
+}
+
+
+/// computeVirtRegInterval - Compute the live interval of a virtual register,
+/// based on defs and uses.
+void LiveIntervals::computeVirtRegInterval(LiveInterval *LI) {
+  assert(LRCalc && "LRCalc not initialized.");
+  assert(LI->empty() && "Should only compute empty intervals.");
+  LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+  LRCalc->createDeadDefs(LI);
+  LRCalc->extendToUses(LI);
+}
+
+void LiveIntervals::computeVirtRegs() {
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI->reg_nodbg_empty(Reg))
+      continue;
+    LiveInterval *LI = createInterval(Reg);
+    VirtRegIntervals[Reg] = LI;
+    computeVirtRegInterval(LI);
+  }
+}
+
+void LiveIntervals::computeRegMasks() {
+  RegMaskBlocks.resize(MF->getNumBlockIDs());
+
+  // Find all instructions with regmask operands.
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock *MBB = MBBI;
+    std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB->getNumber()];
+    RMB.first = RegMaskSlots.size();
+    for (MachineBasicBlock::iterator MI = MBB->begin(), ME = MBB->end();
+         MI != ME; ++MI)
+      for (MIOperands MO(MI); MO.isValid(); ++MO) {
+        if (!MO->isRegMask())
+          continue;
+          RegMaskSlots.push_back(Indexes->getInstructionIndex(MI).getRegSlot());
+          RegMaskBits.push_back(MO->getRegMask());
+      }
+    // Compute the number of register mask instructions in this block.
+    RMB.second = RegMaskSlots.size() - RMB.first;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           Register Unit Liveness
+//===----------------------------------------------------------------------===//
+//
+// Fixed interference typically comes from ABI boundaries: Function arguments
+// and return values are passed in fixed registers, and so are exception
+// pointers entering landing pads. Certain instructions require values to be
+// present in specific registers. That is also represented through fixed
+// interference.
+//
+
+/// computeRegUnitInterval - Compute the live interval of a register unit, based
+/// on the uses and defs of aliasing registers.  The interval should be empty,
+/// or contain only dead phi-defs from ABI blocks.
+void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
+  unsigned Unit = LI->reg;
+
+  assert(LRCalc && "LRCalc not initialized.");
+  LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+
+  // The physregs aliasing Unit are the roots and their super-registers.
+  // Create all values as dead defs before extending to uses. Note that roots
+  // may share super-registers. That's OK because createDeadDefs() is
+  // idempotent. It is very rare for a register unit to have multiple roots, so
+  // uniquing super-registers is probably not worthwhile.
+  for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
+    unsigned Root = *Roots;
+    if (!MRI->reg_empty(Root))
+      LRCalc->createDeadDefs(LI, Root);
+    for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) {
+      if (!MRI->reg_empty(*Supers))
+        LRCalc->createDeadDefs(LI, *Supers);
+    }
+  }
+
+  // Now extend LI to reach all uses.
+  // Ignore uses of reserved registers. We only track defs of those.
+  for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
+    unsigned Root = *Roots;
+    if (!MRI->isReserved(Root) && !MRI->reg_empty(Root))
+      LRCalc->extendToUses(LI, Root);
+    for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) {
+      unsigned Reg = *Supers;
+      if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg))
+        LRCalc->extendToUses(LI, Reg);
+    }
+  }
+}
+
+
+/// computeLiveInRegUnits - Precompute the live ranges of any register units
+/// that are live-in to an ABI block somewhere. Register values can appear
+/// without a corresponding def when entering the entry block or a landing pad.
+///
+void LiveIntervals::computeLiveInRegUnits() {
+  RegUnitIntervals.resize(TRI->getNumRegUnits());
+  DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
+
+  // Keep track of the intervals allocated.
+  SmallVector<LiveInterval*, 8> NewIntvs;
+
+  // Check all basic blocks for live-ins.
+  for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+       MFI != MFE; ++MFI) {
+    const MachineBasicBlock *MBB = MFI;
+
+    // We only care about ABI blocks: Entry + landing pads.
+    if ((MFI != MF->begin() && !MBB->isLandingPad()) || MBB->livein_empty())
+      continue;
+
+    // Create phi-defs at Begin for all live-in registers.
+    SlotIndex Begin = Indexes->getMBBStartIdx(MBB);
+    DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber());
+    for (MachineBasicBlock::livein_iterator LII = MBB->livein_begin(),
+         LIE = MBB->livein_end(); LII != LIE; ++LII) {
+      for (MCRegUnitIterator Units(*LII, TRI); Units.isValid(); ++Units) {
+        unsigned Unit = *Units;
+        LiveInterval *Intv = RegUnitIntervals[Unit];
+        if (!Intv) {
+          Intv = RegUnitIntervals[Unit] = new LiveInterval(Unit, HUGE_VALF);
+          NewIntvs.push_back(Intv);
+        }
+        VNInfo *VNI = Intv->createDeadDef(Begin, getVNInfoAllocator());
+        (void)VNI;
+        DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << '#' << VNI->id);
+      }
+    }
+    DEBUG(dbgs() << '\n');
+  }
+  DEBUG(dbgs() << "Created " << NewIntvs.size() << " new intervals.\n");
+
+  // Compute the 'normal' part of the intervals.
+  for (unsigned i = 0, e = NewIntvs.size(); i != e; ++i)
+    computeRegUnitInterval(NewIntvs[i]);
+}
+
+
+/// shrinkToUses - After removing some uses of a register, shrink its live
+/// range to just the remaining uses. This method does not compute reaching
+/// defs for new uses, and it doesn't remove dead defs.
+bool LiveIntervals::shrinkToUses(LiveInterval *li,
+                                 SmallVectorImpl<MachineInstr*> *dead) {
+  DEBUG(dbgs() << "Shrink: " << *li << '\n');
+  assert(TargetRegisterInfo::isVirtualRegister(li->reg)
+         && "Can only shrink virtual registers");
+  // Find all the values used, including PHI kills.
+  SmallVector<std::pair<SlotIndex, VNInfo*>, 16> WorkList;
+
+  // Blocks that have already been added to WorkList as live-out.
+  SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
+
+  // Visit all instructions reading li->reg.
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(li->reg);
+       MachineInstr *UseMI = I.skipInstruction();) {
+    if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
+      continue;
+    SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
+    LiveRangeQuery LRQ(*li, Idx);
+    VNInfo *VNI = LRQ.valueIn();
+    if (!VNI) {
+      // This shouldn't happen: readsVirtualRegister returns true, but there is
+      // no live value. It is likely caused by a target getting <undef> flags
+      // wrong.
+      DEBUG(dbgs() << Idx << '\t' << *UseMI
+                   << "Warning: Instr claims to read non-existent value in "
+                    << *li << '\n');
+      continue;
+    }
+    // Special case: An early-clobber tied operand reads and writes the
+    // register one slot early.
+    if (VNInfo *DefVNI = LRQ.valueDefined())
+      Idx = DefVNI->def;
+
+    WorkList.push_back(std::make_pair(Idx, VNI));
+  }
+
+  // Create a new live interval with only minimal live segments per def.
+  LiveInterval NewLI(li->reg, 0);
+  for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
+       I != E; ++I) {
+    VNInfo *VNI = *I;
+    if (VNI->isUnused())
+      continue;
+    NewLI.addRange(LiveRange(VNI->def, VNI->def.getDeadSlot(), VNI));
+  }
+
+  // Keep track of the PHIs that are in use.
+  SmallPtrSet<VNInfo*, 8> UsedPHIs;
+
+  // Extend intervals to reach all uses in WorkList.
+  while (!WorkList.empty()) {
+    SlotIndex Idx = WorkList.back().first;
+    VNInfo *VNI = WorkList.back().second;
+    WorkList.pop_back();
+    const MachineBasicBlock *MBB = getMBBFromIndex(Idx.getPrevSlot());
+    SlotIndex BlockStart = getMBBStartIdx(MBB);
+
+    // Extend the live range for VNI to be live at Idx.
+    if (VNInfo *ExtVNI = NewLI.extendInBlock(BlockStart, Idx)) {
+      (void)ExtVNI;
+      assert(ExtVNI == VNI && "Unexpected existing value number");
+      // Is this a PHIDef we haven't seen before?
+      if (!VNI->isPHIDef() || VNI->def != BlockStart || !UsedPHIs.insert(VNI))
+        continue;
+      // The PHI is live, make sure the predecessors are live-out.
+      for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+           PE = MBB->pred_end(); PI != PE; ++PI) {
+        if (!LiveOut.insert(*PI))
+          continue;
+        SlotIndex Stop = getMBBEndIdx(*PI);
+        // A predecessor is not required to have a live-out value for a PHI.
+        if (VNInfo *PVNI = li->getVNInfoBefore(Stop))
+          WorkList.push_back(std::make_pair(Stop, PVNI));
+      }
+      continue;
+    }
+
+    // VNI is live-in to MBB.
+    DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
+    NewLI.addRange(LiveRange(BlockStart, Idx, VNI));
+
+    // Make sure VNI is live-out from the predecessors.
+    for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+         PE = MBB->pred_end(); PI != PE; ++PI) {
+      if (!LiveOut.insert(*PI))
+        continue;
+      SlotIndex Stop = getMBBEndIdx(*PI);
+      assert(li->getVNInfoBefore(Stop) == VNI &&
+             "Wrong value out of predecessor");
+      WorkList.push_back(std::make_pair(Stop, VNI));
+    }
+  }
+
+  // Handle dead values.
+  bool CanSeparate = false;
+  for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
+       I != E; ++I) {
+    VNInfo *VNI = *I;
+    if (VNI->isUnused())
+      continue;
+    LiveInterval::iterator LII = NewLI.FindLiveRangeContaining(VNI->def);
+    assert(LII != NewLI.end() && "Missing live range for PHI");
+    if (LII->end != VNI->def.getDeadSlot())
+      continue;
+    if (VNI->isPHIDef()) {
+      // This is a dead PHI. Remove it.
+      VNI->markUnused();
+      NewLI.removeRange(*LII);
+      DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
+      CanSeparate = true;
+    } else {
+      // This is a dead def. Make sure the instruction knows.
+      MachineInstr *MI = getInstructionFromIndex(VNI->def);
+      assert(MI && "No instruction defining live value");
+      MI->addRegisterDead(li->reg, TRI);
+      if (dead && MI->allDefsAreDead()) {
+        DEBUG(dbgs() << "All defs dead: " << VNI->def << '\t' << *MI);
+        dead->push_back(MI);
+      }
+    }
+  }
+
+  // Move the trimmed ranges back.
+  li->ranges.swap(NewLI.ranges);
+  DEBUG(dbgs() << "Shrunk: " << *li << '\n');
+  return CanSeparate;
+}
+
+void LiveIntervals::extendToIndices(LiveInterval *LI,
+                                    ArrayRef<SlotIndex> Indices) {
+  assert(LRCalc && "LRCalc not initialized.");
+  LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+  for (unsigned i = 0, e = Indices.size(); i != e; ++i)
+    LRCalc->extend(LI, Indices[i]);
+}
+
+void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
+                               SmallVectorImpl<SlotIndex> *EndPoints) {
+  LiveRangeQuery LRQ(*LI, Kill);
+  VNInfo *VNI = LRQ.valueOut();
+  if (!VNI)
+    return;
+
+  MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill);
+  SlotIndex MBBStart, MBBEnd;
+  tie(MBBStart, MBBEnd) = Indexes->getMBBRange(KillMBB);
+
+  // If VNI isn't live out from KillMBB, the value is trivially pruned.
+  if (LRQ.endPoint() < MBBEnd) {
+    LI->removeRange(Kill, LRQ.endPoint());
+    if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+    return;
+  }
+
+  // VNI is live out of KillMBB.
+  LI->removeRange(Kill, MBBEnd);
+  if (EndPoints) EndPoints->push_back(MBBEnd);
+
+  // Find all blocks that are reachable from KillMBB without leaving VNI's live
+  // range. It is possible that KillMBB itself is reachable, so start a DFS
+  // from each successor.
+  typedef SmallPtrSet<MachineBasicBlock*, 9> VisitedTy;
+  VisitedTy Visited;
+  for (MachineBasicBlock::succ_iterator
+       SuccI = KillMBB->succ_begin(), SuccE = KillMBB->succ_end();
+       SuccI != SuccE; ++SuccI) {
+    for (df_ext_iterator<MachineBasicBlock*, VisitedTy>
+         I = df_ext_begin(*SuccI, Visited), E = df_ext_end(*SuccI, Visited);
+         I != E;) {
+      MachineBasicBlock *MBB = *I;
+
+      // Check if VNI is live in to MBB.
+      tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
+      LiveRangeQuery LRQ(*LI, MBBStart);
+      if (LRQ.valueIn() != VNI) {
+        // This block isn't part of the VNI live range. Prune the search.
+        I.skipChildren();
+        continue;
+      }
+
+      // Prune the search if VNI is killed in MBB.
+      if (LRQ.endPoint() < MBBEnd) {
+        LI->removeRange(MBBStart, LRQ.endPoint());
+        if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+        I.skipChildren();
+        continue;
+      }
+
+      // VNI is live through MBB.
+      LI->removeRange(MBBStart, MBBEnd);
+      if (EndPoints) EndPoints->push_back(MBBEnd);
+      ++I;
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Register allocator hooks.
+//
+
+void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
+  // Keep track of regunit ranges.
+  SmallVector<std::pair<LiveInterval*, LiveInterval::iterator>, 8> RU;
+
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI->reg_nodbg_empty(Reg))
+      continue;
+    LiveInterval *LI = &getInterval(Reg);
+    if (LI->empty())
+      continue;
+
+    // Find the regunit intervals for the assigned register. They may overlap
+    // the virtual register live range, cancelling any kills.
+    RU.clear();
+    for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
+         ++Units) {
+      LiveInterval *RUInt = &getRegUnit(*Units);
+      if (RUInt->empty())
+        continue;
+      RU.push_back(std::make_pair(RUInt, RUInt->find(LI->begin()->end)));
+    }
+
+    // Every instruction that kills Reg corresponds to a live range end point.
+    for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
+         ++RI) {
+      // A block index indicates an MBB edge.
+      if (RI->end.isBlock())
+        continue;
+      MachineInstr *MI = getInstructionFromIndex(RI->end);
+      if (!MI)
+        continue;
+
+      // Check if any of the reguints are live beyond the end of RI. That could
+      // happen when a physreg is defined as a copy of a virtreg:
+      //
+      //   %EAX = COPY %vreg5
+      //   FOO %vreg5         <--- MI, cancel kill because %EAX is live.
+      //   BAR %EAX<kill>
+      //
+      // There should be no kill flag on FOO when %vreg5 is rewritten as %EAX.
+      bool CancelKill = false;
+      for (unsigned u = 0, e = RU.size(); u != e; ++u) {
+        LiveInterval *RInt = RU[u].first;
+        LiveInterval::iterator &I = RU[u].second;
+        if (I == RInt->end())
+          continue;
+        I = RInt->advanceTo(I, RI->end);
+        if (I == RInt->end() || I->start >= RI->end)
+          continue;
+        // I is overlapping RI.
+        CancelKill = true;
+        break;
+      }
+      if (CancelKill)
+        MI->clearRegisterKills(Reg, NULL);
+      else
+        MI->addRegisterKilled(Reg, NULL);
+    }
+  }
+}
+
+MachineBasicBlock*
+LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const {
+  // A local live range must be fully contained inside the block, meaning it is
+  // defined and killed at instructions, not at block boundaries. It is not
+  // live in or or out of any block.
+  //
+  // It is technically possible to have a PHI-defined live range identical to a
+  // single block, but we are going to return false in that case.
+
+  SlotIndex Start = LI.beginIndex();
+  if (Start.isBlock())
+    return NULL;
+
+  SlotIndex Stop = LI.endIndex();
+  if (Stop.isBlock())
+    return NULL;
+
+  // getMBBFromIndex doesn't need to search the MBB table when both indexes
+  // belong to proper instructions.
+  MachineBasicBlock *MBB1 = Indexes->getMBBFromIndex(Start);
+  MachineBasicBlock *MBB2 = Indexes->getMBBFromIndex(Stop);
+  return MBB1 == MBB2 ? MBB1 : NULL;
+}
+
+bool
+LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
+  for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+       I != E; ++I) {
+    const VNInfo *PHI = *I;
+    if (PHI->isUnused() || !PHI->isPHIDef())
+      continue;
+    const MachineBasicBlock *PHIMBB = getMBBFromIndex(PHI->def);
+    // Conservatively return true instead of scanning huge predecessor lists.
+    if (PHIMBB->pred_size() > 100)
+      return true;
+    for (MachineBasicBlock::const_pred_iterator
+         PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI)
+      if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI)))
+        return true;
+  }
+  return false;
+}
+
+float
+LiveIntervals::getSpillWeight(bool isDef, bool isUse, unsigned loopDepth) {
+  // Limit the loop depth ridiculousness.
+  if (loopDepth > 200)
+    loopDepth = 200;
+
+  // The loop depth is used to roughly estimate the number of times the
+  // instruction is executed. Something like 10^d is simple, but will quickly
+  // overflow a float. This expression behaves like 10^d for small d, but is
+  // more tempered for large d. At d=200 we get 6.7e33 which leaves a bit of
+  // headroom before overflow.
+  // By the way, powf() might be unavailable here. For consistency,
+  // We may take pow(double,double).
+  float lc = std::pow(1 + (100.0 / (loopDepth + 10)), (double)loopDepth);
+
+  return (isDef + isUse) * lc;
+}
+
+LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg,
+                                                  MachineInstr* startInst) {
+  LiveInterval& Interval = getOrCreateInterval(reg);
+  VNInfo* VN = Interval.getNextValue(
+    SlotIndex(getInstructionIndex(startInst).getRegSlot()),
+    getVNInfoAllocator());
+  LiveRange LR(
+     SlotIndex(getInstructionIndex(startInst).getRegSlot()),
+     getMBBEndIdx(startInst->getParent()), VN);
+  Interval.addRange(LR);
+
+  return LR;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                          Register mask functions
+//===----------------------------------------------------------------------===//
+
+bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI,
+                                             BitVector &UsableRegs) {
+  if (LI.empty())
+    return false;
+  LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end();
+
+  // Use a smaller arrays for local live ranges.
+  ArrayRef<SlotIndex> Slots;
+  ArrayRef<const uint32_t*> Bits;
+  if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) {
+    Slots = getRegMaskSlotsInBlock(MBB->getNumber());
+    Bits = getRegMaskBitsInBlock(MBB->getNumber());
+  } else {
+    Slots = getRegMaskSlots();
+    Bits = getRegMaskBits();
+  }
+
+  // We are going to enumerate all the register mask slots contained in LI.
+  // Start with a binary search of RegMaskSlots to find a starting point.
+  ArrayRef<SlotIndex>::iterator SlotI =
+    std::lower_bound(Slots.begin(), Slots.end(), LiveI->start);
+  ArrayRef<SlotIndex>::iterator SlotE = Slots.end();
+
+  // No slots in range, LI begins after the last call.
+  if (SlotI == SlotE)
+    return false;
+
+  bool Found = false;
+  for (;;) {
+    assert(*SlotI >= LiveI->start);
+    // Loop over all slots overlapping this segment.
+    while (*SlotI < LiveI->end) {
+      // *SlotI overlaps LI. Collect mask bits.
+      if (!Found) {
+        // This is the first overlap. Initialize UsableRegs to all ones.
+        UsableRegs.clear();
+        UsableRegs.resize(TRI->getNumRegs(), true);
+        Found = true;
+      }
+      // Remove usable registers clobbered by this mask.
+      UsableRegs.clearBitsNotInMask(Bits[SlotI-Slots.begin()]);
+      if (++SlotI == SlotE)
+        return Found;
+    }
+    // *SlotI is beyond the current LI segment.
+    LiveI = LI.advanceTo(LiveI, *SlotI);
+    if (LiveI == LiveE)
+      return Found;
+    // Advance SlotI until it overlaps.
+    while (*SlotI < LiveI->start)
+      if (++SlotI == SlotE)
+        return Found;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                         IntervalUpdate class.
+//===----------------------------------------------------------------------===//
+
+// HMEditor is a toolkit used by handleMove to trim or extend live intervals.
+class LiveIntervals::HMEditor {
+private:
+  LiveIntervals& LIS;
+  const MachineRegisterInfo& MRI;
+  const TargetRegisterInfo& TRI;
+  SlotIndex OldIdx;
+  SlotIndex NewIdx;
+  SmallPtrSet<LiveInterval*, 8> Updated;
+  bool UpdateFlags;
+
+public:
+  HMEditor(LiveIntervals& LIS, const MachineRegisterInfo& MRI,
+           const TargetRegisterInfo& TRI,
+           SlotIndex OldIdx, SlotIndex NewIdx, bool UpdateFlags)
+    : LIS(LIS), MRI(MRI), TRI(TRI), OldIdx(OldIdx), NewIdx(NewIdx),
+      UpdateFlags(UpdateFlags) {}
+
+  // FIXME: UpdateFlags is a workaround that creates live intervals for all
+  // physregs, even those that aren't needed for regalloc, in order to update
+  // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill
+  // flags, and postRA passes will use a live register utility instead.
+  LiveInterval *getRegUnitLI(unsigned Unit) {
+    if (UpdateFlags)
+      return &LIS.getRegUnit(Unit);
+    return LIS.getCachedRegUnit(Unit);
+  }
+
+  /// Update all live ranges touched by MI, assuming a move from OldIdx to
+  /// NewIdx.
+  void updateAllRanges(MachineInstr *MI) {
+    DEBUG(dbgs() << "handleMove " << OldIdx << " -> " << NewIdx << ": " << *MI);
+    bool hasRegMask = false;
+    for (MIOperands MO(MI); MO.isValid(); ++MO) {
+      if (MO->isRegMask())
+        hasRegMask = true;
+      if (!MO->isReg())
+        continue;
+      // Aggressively clear all kill flags.
+      // They are reinserted by VirtRegRewriter.
+      if (MO->isUse())
+        MO->setIsKill(false);
+
+      unsigned Reg = MO->getReg();
+      if (!Reg)
+        continue;
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        updateRange(LIS.getInterval(Reg));
+        continue;
+      }
+
+      // For physregs, only update the regunits that actually have a
+      // precomputed live range.
+      for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
+        if (LiveInterval *LI = getRegUnitLI(*Units))
+          updateRange(*LI);
+    }
+    if (hasRegMask)
+      updateRegMaskSlots();
+  }
+
+private:
+  /// Update a single live range, assuming an instruction has been moved from
+  /// OldIdx to NewIdx.
+  void updateRange(LiveInterval &LI) {
+    if (!Updated.insert(&LI))
+      return;
+    DEBUG({
+      dbgs() << "     ";
+      if (TargetRegisterInfo::isVirtualRegister(LI.reg))
+        dbgs() << PrintReg(LI.reg);
+      else
+        dbgs() << PrintRegUnit(LI.reg, &TRI);
+      dbgs() << ":\t" << LI << '\n';
+    });
+    if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
+      handleMoveDown(LI);
+    else
+      handleMoveUp(LI);
+    DEBUG(dbgs() << "        -->\t" << LI << '\n');
+    LI.verify();
+  }
+
+  /// Update LI to reflect an instruction has been moved downwards from OldIdx
+  /// to NewIdx.
+  ///
+  /// 1. Live def at OldIdx:
+  ///    Move def to NewIdx, assert endpoint after NewIdx.
+  ///
+  /// 2. Live def at OldIdx, killed at NewIdx:
+  ///    Change to dead def at NewIdx.
+  ///    (Happens when bundling def+kill together).
+  ///
+  /// 3. Dead def at OldIdx:
+  ///    Move def to NewIdx, possibly across another live value.
+  ///
+  /// 4. Def at OldIdx AND at NewIdx:
+  ///    Remove live range [OldIdx;NewIdx) and value defined at OldIdx.
+  ///    (Happens when bundling multiple defs together).
+  ///
+  /// 5. Value read at OldIdx, killed before NewIdx:
+  ///    Extend kill to NewIdx.
+  ///
+  void handleMoveDown(LiveInterval &LI) {
+    // First look for a kill at OldIdx.
+    LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
+    LiveInterval::iterator E = LI.end();
+    // Is LI even live at OldIdx?
+    if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
+      return;
+
+    // Handle a live-in value.
+    if (!SlotIndex::isSameInstr(I->start, OldIdx)) {
+      bool isKill = SlotIndex::isSameInstr(OldIdx, I->end);
+      // If the live-in value already extends to NewIdx, there is nothing to do.
+      if (!SlotIndex::isEarlierInstr(I->end, NewIdx))
+        return;
+      // Aggressively remove all kill flags from the old kill point.
+      // Kill flags shouldn't be used while live intervals exist, they will be
+      // reinserted by VirtRegRewriter.
+      if (MachineInstr *KillMI = LIS.getInstructionFromIndex(I->end))
+        for (MIBundleOperands MO(KillMI); MO.isValid(); ++MO)
+          if (MO->isReg() && MO->isUse())
+            MO->setIsKill(false);
+      // Adjust I->end to reach NewIdx. This may temporarily make LI invalid by
+      // overlapping ranges. Case 5 above.
+      I->end = NewIdx.getRegSlot(I->end.isEarlyClobber());
+      // If this was a kill, there may also be a def. Otherwise we're done.
+      if (!isKill)
+        return;
+      ++I;
+    }
+
+    // Check for a def at OldIdx.
+    if (I == E || !SlotIndex::isSameInstr(OldIdx, I->start))
+      return;
+    // We have a def at OldIdx.
+    VNInfo *DefVNI = I->valno;
+    assert(DefVNI->def == I->start && "Inconsistent def");
+    DefVNI->def = NewIdx.getRegSlot(I->start.isEarlyClobber());
+    // If the defined value extends beyond NewIdx, just move the def down.
+    // This is case 1 above.
+    if (SlotIndex::isEarlierInstr(NewIdx, I->end)) {
+      I->start = DefVNI->def;
+      return;
+    }
+    // The remaining possibilities are now:
+    // 2. Live def at OldIdx, killed at NewIdx: isSameInstr(I->end, NewIdx).
+    // 3. Dead def at OldIdx: I->end = OldIdx.getDeadSlot().
+    // In either case, it is possible that there is an existing def at NewIdx.
+    assert((I->end == OldIdx.getDeadSlot() ||
+            SlotIndex::isSameInstr(I->end, NewIdx)) &&
+            "Cannot move def below kill");
+    LiveInterval::iterator NewI = LI.advanceTo(I, NewIdx.getRegSlot());
+    if (NewI != E && SlotIndex::isSameInstr(NewI->start, NewIdx)) {
+      // There is an existing def at NewIdx, case 4 above. The def at OldIdx is
+      // coalesced into that value.
+      assert(NewI->valno != DefVNI && "Multiple defs of value?");
+      LI.removeValNo(DefVNI);
+      return;
+    }
+    // There was no existing def at NewIdx. Turn *I into a dead def at NewIdx.
+    // If the def at OldIdx was dead, we allow it to be moved across other LI
+    // values. The new range should be placed immediately before NewI, move any
+    // intermediate ranges up.
+    assert(NewI != I && "Inconsistent iterators");
+    std::copy(llvm::next(I), NewI, I);
+    *llvm::prior(NewI) = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
+  }
+
+  /// Update LI to reflect an instruction has been moved upwards from OldIdx
+  /// to NewIdx.
+  ///
+  /// 1. Live def at OldIdx:
+  ///    Hoist def to NewIdx.
+  ///
+  /// 2. Dead def at OldIdx:
+  ///    Hoist def+end to NewIdx, possibly move across other values.
+  ///
+  /// 3. Dead def at OldIdx AND existing def at NewIdx:
+  ///    Remove value defined at OldIdx, coalescing it with existing value.
+  ///
+  /// 4. Live def at OldIdx AND existing def at NewIdx:
+  ///    Remove value defined at NewIdx, hoist OldIdx def to NewIdx.
+  ///    (Happens when bundling multiple defs together).
+  ///
+  /// 5. Value killed at OldIdx:
+  ///    Hoist kill to NewIdx, then scan for last kill between NewIdx and
+  ///    OldIdx.
+  ///
+  void handleMoveUp(LiveInterval &LI) {
+    // First look for a kill at OldIdx.
+    LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
+    LiveInterval::iterator E = LI.end();
+    // Is LI even live at OldIdx?
+    if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
+      return;
+
+    // Handle a live-in value.
+    if (!SlotIndex::isSameInstr(I->start, OldIdx)) {
+      // If the live-in value isn't killed here, there is nothing to do.
+      if (!SlotIndex::isSameInstr(OldIdx, I->end))
+        return;
+      // Adjust I->end to end at NewIdx. If we are hoisting a kill above
+      // another use, we need to search for that use. Case 5 above.
+      I->end = NewIdx.getRegSlot(I->end.isEarlyClobber());
+      ++I;
+      // If OldIdx also defines a value, there couldn't have been another use.
+      if (I == E || !SlotIndex::isSameInstr(I->start, OldIdx)) {
+        // No def, search for the new kill.
+        // This can never be an early clobber kill since there is no def.
+        llvm::prior(I)->end = findLastUseBefore(LI.reg).getRegSlot();
+        return;
+      }
+    }
+
+    // Now deal with the def at OldIdx.
+    assert(I != E && SlotIndex::isSameInstr(I->start, OldIdx) && "No def?");
+    VNInfo *DefVNI = I->valno;
+    assert(DefVNI->def == I->start && "Inconsistent def");
+    DefVNI->def = NewIdx.getRegSlot(I->start.isEarlyClobber());
+
+    // Check for an existing def at NewIdx.
+    LiveInterval::iterator NewI = LI.find(NewIdx.getRegSlot());
+    if (SlotIndex::isSameInstr(NewI->start, NewIdx)) {
+      assert(NewI->valno != DefVNI && "Same value defined more than once?");
+      // There is an existing def at NewIdx.
+      if (I->end.isDead()) {
+        // Case 3: Remove the dead def at OldIdx.
+        LI.removeValNo(DefVNI);
+        return;
+      }
+      // Case 4: Replace def at NewIdx with live def at OldIdx.
+      I->start = DefVNI->def;
+      LI.removeValNo(NewI->valno);
+      return;
+    }
+
+    // There is no existing def at NewIdx. Hoist DefVNI.
+    if (!I->end.isDead()) {
+      // Leave the end point of a live def.
+      I->start = DefVNI->def;
+      return;
+    }
+
+    // DefVNI is a dead def. It may have been moved across other values in LI,
+    // so move I up to NewI. Slide [NewI;I) down one position.
+    std::copy_backward(NewI, I, llvm::next(I));
+    *NewI = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
+  }
+
+  void updateRegMaskSlots() {
+    SmallVectorImpl<SlotIndex>::iterator RI =
+      std::lower_bound(LIS.RegMaskSlots.begin(), LIS.RegMaskSlots.end(),
+                       OldIdx);
+    assert(RI != LIS.RegMaskSlots.end() && *RI == OldIdx.getRegSlot() &&
+           "No RegMask at OldIdx.");
+    *RI = NewIdx.getRegSlot();
+    assert((RI == LIS.RegMaskSlots.begin() ||
+            SlotIndex::isEarlierInstr(*llvm::prior(RI), *RI)) &&
+            "Cannot move regmask instruction above another call");
+    assert((llvm::next(RI) == LIS.RegMaskSlots.end() ||
+            SlotIndex::isEarlierInstr(*RI, *llvm::next(RI))) &&
+            "Cannot move regmask instruction below another call");
+  }
+
+  // Return the last use of reg between NewIdx and OldIdx.
+  SlotIndex findLastUseBefore(unsigned Reg) {
+
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      SlotIndex LastUse = NewIdx;
+      for (MachineRegisterInfo::use_nodbg_iterator
+             UI = MRI.use_nodbg_begin(Reg),
+             UE = MRI.use_nodbg_end();
+           UI != UE; UI.skipInstruction()) {
+        const MachineInstr* MI = &*UI;
+        SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
+        if (InstSlot > LastUse && InstSlot < OldIdx)
+          LastUse = InstSlot;
+      }
+      return LastUse;
+    }
+
+    // This is a regunit interval, so scanning the use list could be very
+    // expensive. Scan upwards from OldIdx instead.
+    assert(NewIdx < OldIdx && "Expected upwards move");
+    SlotIndexes *Indexes = LIS.getSlotIndexes();
+    MachineBasicBlock *MBB = Indexes->getMBBFromIndex(NewIdx);
+
+    // OldIdx may not correspond to an instruction any longer, so set MII to
+    // point to the next instruction after OldIdx, or MBB->end().
+    MachineBasicBlock::iterator MII = MBB->end();
+    if (MachineInstr *MI = Indexes->getInstructionFromIndex(
+                           Indexes->getNextNonNullIndex(OldIdx)))
+      if (MI->getParent() == MBB)
+        MII = MI;
+
+    MachineBasicBlock::iterator Begin = MBB->begin();
+    while (MII != Begin) {
+      if ((--MII)->isDebugValue())
+        continue;
+      SlotIndex Idx = Indexes->getInstructionIndex(MII);
+
+      // Stop searching when NewIdx is reached.
+      if (!SlotIndex::isEarlierInstr(NewIdx, Idx))
+        return NewIdx;
+
+      // Check if MII uses Reg.
+      for (MIBundleOperands MO(MII); MO.isValid(); ++MO)
+        if (MO->isReg() &&
+            TargetRegisterInfo::isPhysicalRegister(MO->getReg()) &&
+            TRI.hasRegUnit(MO->getReg(), Reg))
+          return Idx;
+    }
+    // Didn't reach NewIdx. It must be the first instruction in the block.
+    return NewIdx;
+  }
+};
+
+void LiveIntervals::handleMove(MachineInstr* MI, bool UpdateFlags) {
+  assert(!MI->isBundled() && "Can't handle bundled instructions yet.");
+  SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
+  Indexes->removeMachineInstrFromMaps(MI);
+  SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(MI);
+  assert(getMBBStartIdx(MI->getParent()) <= OldIndex &&
+         OldIndex < getMBBEndIdx(MI->getParent()) &&
+         "Cannot handle moves across basic block boundaries.");
+
+  HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
+  HME.updateAllRanges(MI);
+}
+
+void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI,
+                                         MachineInstr* BundleStart,
+                                         bool UpdateFlags) {
+  SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
+  SlotIndex NewIndex = Indexes->getInstructionIndex(BundleStart);
+  HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
+  HME.updateAllRanges(MI);
+}
+
+void
+LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
+                                      MachineBasicBlock::iterator Begin,
+                                      MachineBasicBlock::iterator End,
+                                      ArrayRef<unsigned> OrigRegs) {
+  // Find anchor points, which are at the beginning/end of blocks or at
+  // instructions that already have indexes.
+  while (Begin != MBB->begin() && !Indexes->hasIndex(Begin))
+    --Begin;
+  while (End != MBB->end() && !Indexes->hasIndex(End))
+    ++End;
+
+  SlotIndex endIdx;
+  if (End == MBB->end())
+    endIdx = getMBBEndIdx(MBB).getPrevSlot();
+  else
+    endIdx = getInstructionIndex(End);
+
+  Indexes->repairIndexesInRange(MBB, Begin, End);
+
+  for (MachineBasicBlock::iterator I = End; I != Begin;) {
+    --I;
+    MachineInstr *MI = I;
+    if (MI->isDebugValue())
+      continue;
+    for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
+         MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+      if (MOI->isReg() &&
+          TargetRegisterInfo::isVirtualRegister(MOI->getReg()) &&
+          !hasInterval(MOI->getReg())) {
+        LiveInterval &LI = getOrCreateInterval(MOI->getReg());
+        computeVirtRegInterval(&LI);
+      }
+    }
+  }
+
+  for (unsigned i = 0, e = OrigRegs.size(); i != e; ++i) {
+    unsigned Reg = OrigRegs[i];
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+
+    LiveInterval &LI = getInterval(Reg);
+    // FIXME: Should we support undefs that gain defs?
+    if (!LI.hasAtLeastOneValue())
+      continue;
+
+    LiveInterval::iterator LII = LI.find(endIdx);
+    SlotIndex lastUseIdx;
+    if (LII != LI.end() && LII->start < endIdx)
+      lastUseIdx = LII->end;
+    else
+      --LII;
+
+    for (MachineBasicBlock::iterator I = End; I != Begin;) {
+      --I;
+      MachineInstr *MI = I;
+      if (MI->isDebugValue())
+        continue;
+
+      SlotIndex instrIdx = getInstructionIndex(MI);
+      bool isStartValid = getInstructionFromIndex(LII->start);
+      bool isEndValid = getInstructionFromIndex(LII->end);
+
+      // FIXME: This doesn't currently handle early-clobber or multiple removed
+      // defs inside of the region to repair.
+      for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+           OE = MI->operands_end(); OI != OE; ++OI) {
+        const MachineOperand &MO = *OI;
+        if (!MO.isReg() || MO.getReg() != Reg)
+          continue;
+
+        if (MO.isDef()) {
+          if (!isStartValid) {
+            if (LII->end.isDead()) {
+              SlotIndex prevStart;
+              if (LII != LI.begin())
+                prevStart = llvm::prior(LII)->start;
+
+              // FIXME: This could be more efficient if there was a removeRange
+              // method that returned an iterator.
+              LI.removeRange(*LII, true);
+              if (prevStart.isValid())
+                LII = LI.find(prevStart);
+              else
+                LII = LI.begin();
+            } else {
+              LII->start = instrIdx.getRegSlot();
+              LII->valno->def = instrIdx.getRegSlot();
+              if (MO.getSubReg() && !MO.isUndef())
+                lastUseIdx = instrIdx.getRegSlot();
+              else
+                lastUseIdx = SlotIndex();
+              continue;
+            }
+          }
+
+          if (!lastUseIdx.isValid()) {
+            VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
+                                          VNInfoAllocator);
+            LiveRange LR(instrIdx.getRegSlot(), instrIdx.getDeadSlot(), VNI);
+            LII = LI.addRange(LR);
+          } else if (LII->start != instrIdx.getRegSlot()) {
+            VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
+                                          VNInfoAllocator);
+            LiveRange LR(instrIdx.getRegSlot(), lastUseIdx, VNI);
+            LII = LI.addRange(LR);
+          }
+
+          if (MO.getSubReg() && !MO.isUndef())
+            lastUseIdx = instrIdx.getRegSlot();
+          else
+            lastUseIdx = SlotIndex();
+        } else if (MO.isUse()) {
+          // FIXME: This should probably be handled outside of this branch,
+          // either as part of the def case (for defs inside of the region) or
+          // after the loop over the region.
+          if (!isEndValid && !LII->end.isBlock())
+            LII->end = instrIdx.getRegSlot();
+          if (!lastUseIdx.isValid())
+            lastUseIdx = instrIdx.getRegSlot();
+        }
+      }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
new file mode 100644
index 000000000000..d5a81a311c64
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -0,0 +1,204 @@
+//===-- LiveIntervalUnion.cpp - Live interval union data structure --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// LiveIntervalUnion represents a coalesced set of live intervals. This may be
+// used during coalescing to represent a congruence class, or during register
+// allocation to model liveness of a physical register.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+
+// Merge a LiveInterval's segments. Guarantee no overlaps.
+void LiveIntervalUnion::unify(LiveInterval &VirtReg) {
+  if (VirtReg.empty())
+    return;
+  ++Tag;
+
+  // Insert each of the virtual register's live segments into the map.
+  LiveInterval::iterator RegPos = VirtReg.begin();
+  LiveInterval::iterator RegEnd = VirtReg.end();
+  SegmentIter SegPos = Segments.find(RegPos->start);
+
+  while (SegPos.valid()) {
+    SegPos.insert(RegPos->start, RegPos->end, &VirtReg);
+    if (++RegPos == RegEnd)
+      return;
+    SegPos.advanceTo(RegPos->start);
+  }
+
+  // We have reached the end of Segments, so it is no longer necessary to search
+  // for the insertion position.
+  // It is faster to insert the end first.
+  --RegEnd;
+  SegPos.insert(RegEnd->start, RegEnd->end, &VirtReg);
+  for (; RegPos != RegEnd; ++RegPos, ++SegPos)
+    SegPos.insert(RegPos->start, RegPos->end, &VirtReg);
+}
+
+// Remove a live virtual register's segments from this union.
+void LiveIntervalUnion::extract(LiveInterval &VirtReg) {
+  if (VirtReg.empty())
+    return;
+  ++Tag;
+
+  // Remove each of the virtual register's live segments from the map.
+  LiveInterval::iterator RegPos = VirtReg.begin();
+  LiveInterval::iterator RegEnd = VirtReg.end();
+  SegmentIter SegPos = Segments.find(RegPos->start);
+
+  for (;;) {
+    assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval");
+    SegPos.erase();
+    if (!SegPos.valid())
+      return;
+
+    // Skip all segments that may have been coalesced.
+    RegPos = VirtReg.advanceTo(RegPos, SegPos.start());
+    if (RegPos == RegEnd)
+      return;
+
+    SegPos.advanceTo(RegPos->start);
+  }
+}
+
+void
+LiveIntervalUnion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
+  if (empty()) {
+    OS << " empty\n";
+    return;
+  }
+  for (LiveSegments::const_iterator SI = Segments.begin(); SI.valid(); ++SI) {
+    OS << " [" << SI.start() << ' ' << SI.stop() << "):"
+       << PrintReg(SI.value()->reg, TRI);
+  }
+  OS << '\n';
+}
+
+#ifndef NDEBUG
+// Verify the live intervals in this union and add them to the visited set.
+void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) {
+  for (SegmentIter SI = Segments.begin(); SI.valid(); ++SI)
+    VisitedVRegs.set(SI.value()->reg);
+}
+#endif //!NDEBUG
+
+// Scan the vector of interfering virtual registers in this union. Assume it's
+// quite small.
+bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
+  SmallVectorImpl<LiveInterval*>::const_iterator I =
+    std::find(InterferingVRegs.begin(), InterferingVRegs.end(), VirtReg);
+  return I != InterferingVRegs.end();
+}
+
+// Collect virtual registers in this union that interfere with this
+// query's live virtual register.
+//
+// The query state is one of:
+//
+// 1. CheckedFirstInterference == false: Iterators are uninitialized.
+// 2. SeenAllInterferences == true: InterferingVRegs complete, iterators unused.
+// 3. Iterators left at the last seen intersection.
+//
+unsigned LiveIntervalUnion::Query::
+collectInterferingVRegs(unsigned MaxInterferingRegs) {
+  // Fast path return if we already have the desired information.
+  if (SeenAllInterferences || InterferingVRegs.size() >= MaxInterferingRegs)
+    return InterferingVRegs.size();
+
+  // Set up iterators on the first call.
+  if (!CheckedFirstInterference) {
+    CheckedFirstInterference = true;
+
+    // Quickly skip interference check for empty sets.
+    if (VirtReg->empty() || LiveUnion->empty()) {
+      SeenAllInterferences = true;
+      return 0;
+    }
+
+    // In most cases, the union will start before VirtReg.
+    VirtRegI = VirtReg->begin();
+    LiveUnionI.setMap(LiveUnion->getMap());
+    LiveUnionI.find(VirtRegI->start);
+  }
+
+  LiveInterval::iterator VirtRegEnd = VirtReg->end();
+  LiveInterval *RecentReg = 0;
+  while (LiveUnionI.valid()) {
+    assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg");
+
+    // Check for overlapping interference.
+    while (VirtRegI->start < LiveUnionI.stop() &&
+           VirtRegI->end > LiveUnionI.start()) {
+      // This is an overlap, record the interfering register.
+      LiveInterval *VReg = LiveUnionI.value();
+      if (VReg != RecentReg && !isSeenInterference(VReg)) {
+        RecentReg = VReg;
+        InterferingVRegs.push_back(VReg);
+        if (InterferingVRegs.size() >= MaxInterferingRegs)
+          return InterferingVRegs.size();
+      }
+      // This LiveUnion segment is no longer interesting.
+      if (!(++LiveUnionI).valid()) {
+        SeenAllInterferences = true;
+        return InterferingVRegs.size();
+      }
+    }
+
+    // The iterators are now not overlapping, LiveUnionI has been advanced
+    // beyond VirtRegI.
+    assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap");
+
+    // Advance the iterator that ends first.
+    VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start());
+    if (VirtRegI == VirtRegEnd)
+      break;
+
+    // Detect overlap, handle above.
+    if (VirtRegI->start < LiveUnionI.stop())
+      continue;
+
+    // Still not overlapping. Catch up LiveUnionI.
+    LiveUnionI.advanceTo(VirtRegI->start);
+  }
+  SeenAllInterferences = true;
+  return InterferingVRegs.size();
+}
+
+void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc,
+                                    unsigned NSize) {
+  // Reuse existing allocation.
+  if (NSize == Size)
+    return;
+  clear();
+  Size = NSize;
+  LIUs = static_cast<LiveIntervalUnion*>(
+    malloc(sizeof(LiveIntervalUnion)*NSize));
+  for (unsigned i = 0; i != Size; ++i)
+    new(LIUs + i) LiveIntervalUnion(Alloc);
+}
+
+void LiveIntervalUnion::Array::clear() {
+  if (!LIUs)
+    return;
+  for (unsigned i = 0; i != Size; ++i)
+    LIUs[i].~LiveIntervalUnion();
+  free(LIUs);
+  Size =  0;
+  LIUs = 0;
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
new file mode 100644
index 000000000000..dede490d91ba
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
@@ -0,0 +1,380 @@
+//===---- LiveRangeCalc.cpp - Calculate live ranges -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the LiveRangeCalc class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "LiveRangeCalc.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+void LiveRangeCalc::reset(const MachineFunction *mf,
+                          SlotIndexes *SI,
+                          MachineDominatorTree *MDT,
+                          VNInfo::Allocator *VNIA) {
+  MF = mf;
+  MRI = &MF->getRegInfo();
+  Indexes = SI;
+  DomTree = MDT;
+  Alloc = VNIA;
+
+  unsigned N = MF->getNumBlockIDs();
+  Seen.clear();
+  Seen.resize(N);
+  LiveOut.resize(N);
+  LiveIn.clear();
+}
+
+
+void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) {
+  assert(MRI && Indexes && "call reset() first");
+
+  // Visit all def operands. If the same instruction has multiple defs of Reg,
+  // LI->createDeadDef() will deduplicate.
+  for (MachineRegisterInfo::def_iterator
+       I = MRI->def_begin(Reg), E = MRI->def_end(); I != E; ++I) {
+    const MachineInstr *MI = &*I;
+    // Find the corresponding slot index.
+    SlotIndex Idx;
+    if (MI->isPHI())
+      // PHI defs begin at the basic block start index.
+      Idx = Indexes->getMBBStartIdx(MI->getParent());
+    else
+      // Instructions are either normal 'r', or early clobber 'e'.
+      Idx = Indexes->getInstructionIndex(MI)
+        .getRegSlot(I.getOperand().isEarlyClobber());
+
+    // Create the def in LI. This may find an existing def.
+    LI->createDeadDef(Idx, *Alloc);
+  }
+}
+
+
+void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) {
+  assert(MRI && Indexes && "call reset() first");
+
+  // Visit all operands that read Reg. This may include partial defs.
+  for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg),
+       E = MRI->reg_nodbg_end(); I != E; ++I) {
+    MachineOperand &MO = I.getOperand();
+    // Clear all kill flags. They will be reinserted after register allocation
+    // by LiveIntervalAnalysis::addKillFlags().
+    if (MO.isUse())
+      MO.setIsKill(false);
+    if (!MO.readsReg())
+      continue;
+    // MI is reading Reg. We may have visited MI before if it happens to be
+    // reading Reg multiple times. That is OK, extend() is idempotent.
+    const MachineInstr *MI = &*I;
+
+    // Find the SlotIndex being read.
+    SlotIndex Idx;
+    if (MI->isPHI()) {
+      assert(!MO.isDef() && "Cannot handle PHI def of partial register.");
+      // PHI operands are paired: (Reg, PredMBB).
+      // Extend the live range to be live-out from PredMBB.
+      Idx = Indexes->getMBBEndIdx(MI->getOperand(I.getOperandNo()+1).getMBB());
+    } else {
+      // This is a normal instruction.
+      Idx = Indexes->getInstructionIndex(MI).getRegSlot();
+      // Check for early-clobber redefs.
+      unsigned DefIdx;
+      if (MO.isDef()) {
+        if (MO.isEarlyClobber())
+          Idx = Idx.getRegSlot(true);
+      } else if (MI->isRegTiedToDefOperand(I.getOperandNo(), &DefIdx)) {
+        // FIXME: This would be a lot easier if tied early-clobber uses also
+        // had an early-clobber flag.
+        if (MI->getOperand(DefIdx).isEarlyClobber())
+          Idx = Idx.getRegSlot(true);
+      }
+    }
+    extend(LI, Idx, Reg);
+  }
+}
+
+
+// Transfer information from the LiveIn vector to the live ranges.
+void LiveRangeCalc::updateLiveIns() {
+  LiveRangeUpdater Updater;
+  for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(),
+         E = LiveIn.end(); I != E; ++I) {
+    if (!I->DomNode)
+      continue;
+    MachineBasicBlock *MBB = I->DomNode->getBlock();
+    assert(I->Value && "No live-in value found");
+    SlotIndex Start, End;
+    tie(Start, End) = Indexes->getMBBRange(MBB);
+
+    if (I->Kill.isValid())
+      // Value is killed inside this block.
+      End = I->Kill;
+    else {
+      // The value is live-through, update LiveOut as well.
+      // Defer the Domtree lookup until it is needed.
+      assert(Seen.test(MBB->getNumber()));
+      LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)0);
+    }
+    Updater.setDest(I->LI);
+    Updater.add(Start, End, I->Value);
+  }
+  LiveIn.clear();
+}
+
+
+void LiveRangeCalc::extend(LiveInterval *LI,
+                           SlotIndex Kill,
+                           unsigned PhysReg) {
+  assert(LI && "Missing live range");
+  assert(Kill.isValid() && "Invalid SlotIndex");
+  assert(Indexes && "Missing SlotIndexes");
+  assert(DomTree && "Missing dominator tree");
+
+  MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill.getPrevSlot());
+  assert(KillMBB && "No MBB at Kill");
+
+  // Is there a def in the same MBB we can extend?
+  if (LI->extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
+    return;
+
+  // Find the single reaching def, or determine if Kill is jointly dominated by
+  // multiple values, and we may need to create even more phi-defs to preserve
+  // VNInfo SSA form.  Perform a search for all predecessor blocks where we
+  // know the dominating VNInfo.
+  if (findReachingDefs(LI, KillMBB, Kill, PhysReg))
+    return;
+
+  // When there were multiple different values, we may need new PHIs.
+  calculateValues();
+}
+
+
+// This function is called by a client after using the low-level API to add
+// live-out and live-in blocks.  The unique value optimization is not
+// available, SplitEditor::transferValues handles that case directly anyway.
+void LiveRangeCalc::calculateValues() {
+  assert(Indexes && "Missing SlotIndexes");
+  assert(DomTree && "Missing dominator tree");
+  updateSSA();
+  updateLiveIns();
+}
+
+
+bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
+                                     MachineBasicBlock *KillMBB,
+                                     SlotIndex Kill,
+                                     unsigned PhysReg) {
+  unsigned KillMBBNum = KillMBB->getNumber();
+
+  // Block numbers where LI should be live-in.
+  SmallVector<unsigned, 16> WorkList(1, KillMBBNum);
+
+  // Remember if we have seen more than one value.
+  bool UniqueVNI = true;
+  VNInfo *TheVNI = 0;
+
+  // Using Seen as a visited set, perform a BFS for all reaching defs.
+  for (unsigned i = 0; i != WorkList.size(); ++i) {
+    MachineBasicBlock *MBB = MF->getBlockNumbered(WorkList[i]);
+
+#ifndef NDEBUG
+    if (MBB->pred_empty()) {
+      MBB->getParent()->verify();
+      llvm_unreachable("Use not jointly dominated by defs.");
+    }
+
+    if (TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
+        !MBB->isLiveIn(PhysReg)) {
+      MBB->getParent()->verify();
+      errs() << "The register needs to be live in to BB#" << MBB->getNumber()
+             << ", but is missing from the live-in list.\n";
+      llvm_unreachable("Invalid global physical register");
+    }
+#endif
+
+    for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+           PE = MBB->pred_end(); PI != PE; ++PI) {
+       MachineBasicBlock *Pred = *PI;
+
+       // Is this a known live-out block?
+       if (Seen.test(Pred->getNumber())) {
+         if (VNInfo *VNI = LiveOut[Pred].first) {
+           if (TheVNI && TheVNI != VNI)
+             UniqueVNI = false;
+           TheVNI = VNI;
+         }
+         continue;
+       }
+
+       SlotIndex Start, End;
+       tie(Start, End) = Indexes->getMBBRange(Pred);
+
+       // First time we see Pred.  Try to determine the live-out value, but set
+       // it as null if Pred is live-through with an unknown value.
+       VNInfo *VNI = LI->extendInBlock(Start, End);
+       setLiveOutValue(Pred, VNI);
+       if (VNI) {
+         if (TheVNI && TheVNI != VNI)
+           UniqueVNI = false;
+         TheVNI = VNI;
+         continue;
+       }
+
+       // No, we need a live-in value for Pred as well
+       if (Pred != KillMBB)
+          WorkList.push_back(Pred->getNumber());
+       else
+          // Loopback to KillMBB, so value is really live through.
+         Kill = SlotIndex();
+    }
+  }
+
+  LiveIn.clear();
+
+  // Both updateSSA() and LiveRangeUpdater benefit from ordered blocks, but
+  // neither require it. Skip the sorting overhead for small updates.
+  if (WorkList.size() > 4)
+    array_pod_sort(WorkList.begin(), WorkList.end());
+
+  // If a unique reaching def was found, blit in the live ranges immediately.
+  if (UniqueVNI) {
+    LiveRangeUpdater Updater(LI);
+    for (SmallVectorImpl<unsigned>::const_iterator
+         I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
+       SlotIndex Start, End;
+       tie(Start, End) = Indexes->getMBBRange(*I);
+       // Trim the live range in KillMBB.
+       if (*I == KillMBBNum && Kill.isValid())
+         End = Kill;
+       else
+         LiveOut[MF->getBlockNumbered(*I)] =
+           LiveOutPair(TheVNI, (MachineDomTreeNode *)0);
+       Updater.add(Start, End, TheVNI);
+    }
+    return true;
+  }
+
+  // Multiple values were found, so transfer the work list to the LiveIn array
+  // where UpdateSSA will use it as a work list.
+  LiveIn.reserve(WorkList.size());
+  for (SmallVectorImpl<unsigned>::const_iterator
+       I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = MF->getBlockNumbered(*I);
+    addLiveInBlock(LI, DomTree->getNode(MBB));
+    if (MBB == KillMBB)
+      LiveIn.back().Kill = Kill;
+  }
+
+  return false;
+}
+
+
+// This is essentially the same iterative algorithm that SSAUpdater uses,
+// except we already have a dominator tree, so we don't have to recompute it.
+void LiveRangeCalc::updateSSA() {
+  assert(Indexes && "Missing SlotIndexes");
+  assert(DomTree && "Missing dominator tree");
+
+  // Interate until convergence.
+  unsigned Changes;
+  do {
+    Changes = 0;
+    // Propagate live-out values down the dominator tree, inserting phi-defs
+    // when necessary.
+    for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(),
+           E = LiveIn.end(); I != E; ++I) {
+      MachineDomTreeNode *Node = I->DomNode;
+      // Skip block if the live-in value has already been determined.
+      if (!Node)
+        continue;
+      MachineBasicBlock *MBB = Node->getBlock();
+      MachineDomTreeNode *IDom = Node->getIDom();
+      LiveOutPair IDomValue;
+
+      // We need a live-in value to a block with no immediate dominator?
+      // This is probably an unreachable block that has survived somehow.
+      bool needPHI = !IDom || !Seen.test(IDom->getBlock()->getNumber());
+
+      // IDom dominates all of our predecessors, but it may not be their
+      // immediate dominator. Check if any of them have live-out values that are
+      // properly dominated by IDom. If so, we need a phi-def here.
+      if (!needPHI) {
+        IDomValue = LiveOut[IDom->getBlock()];
+
+        // Cache the DomTree node that defined the value.
+        if (IDomValue.first && !IDomValue.second)
+          LiveOut[IDom->getBlock()].second = IDomValue.second =
+            DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def));
+
+        for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+               PE = MBB->pred_end(); PI != PE; ++PI) {
+          LiveOutPair &Value = LiveOut[*PI];
+          if (!Value.first || Value.first == IDomValue.first)
+            continue;
+
+          // Cache the DomTree node that defined the value.
+          if (!Value.second)
+            Value.second =
+              DomTree->getNode(Indexes->getMBBFromIndex(Value.first->def));
+
+          // This predecessor is carrying something other than IDomValue.
+          // It could be because IDomValue hasn't propagated yet, or it could be
+          // because MBB is in the dominance frontier of that value.
+          if (DomTree->dominates(IDom, Value.second)) {
+            needPHI = true;
+            break;
+          }
+        }
+      }
+
+      // The value may be live-through even if Kill is set, as can happen when
+      // we are called from extendRange. In that case LiveOutSeen is true, and
+      // LiveOut indicates a foreign or missing value.
+      LiveOutPair &LOP = LiveOut[MBB];
+
+      // Create a phi-def if required.
+      if (needPHI) {
+        ++Changes;
+        assert(Alloc && "Need VNInfo allocator to create PHI-defs");
+        SlotIndex Start, End;
+        tie(Start, End) = Indexes->getMBBRange(MBB);
+        VNInfo *VNI = I->LI->getNextValue(Start, *Alloc);
+        I->Value = VNI;
+        // This block is done, we know the final value.
+        I->DomNode = 0;
+
+        // Add liveness since updateLiveIns now skips this node.
+        if (I->Kill.isValid())
+          I->LI->addRange(LiveRange(Start, I->Kill, VNI));
+        else {
+          I->LI->addRange(LiveRange(Start, End, VNI));
+          LOP = LiveOutPair(VNI, Node);
+        }
+      } else if (IDomValue.first) {
+        // No phi-def here. Remember incoming value.
+        I->Value = IDomValue.first;
+
+        // If the IDomValue is killed in the block, don't propagate through.
+        if (I->Kill.isValid())
+          continue;
+
+        // Propagate IDomValue if it isn't killed:
+        // MBB is live-out and doesn't define its own value.
+        if (LOP.first == IDomValue.first)
+          continue;
+        ++Changes;
+        LOP = IDomValue;
+      }
+    }
+  } while (Changes);
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
new file mode 100644
index 000000000000..57cab7b34220
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
@@ -0,0 +1,242 @@
+//===---- LiveRangeCalc.h - Calculate live ranges ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LiveRangeCalc class can be used to compute live ranges from scratch.  It
+// caches information about values in the CFG to speed up repeated operations
+// on the same live range.  The cache can be shared by non-overlapping live
+// ranges.  SplitKit uses that when computing the live range of split products.
+//
+// A low-level interface is available to clients that know where a variable is
+// live, but don't know which value it has as every point.  LiveRangeCalc will
+// propagate values down the dominator tree, and even insert PHI-defs where
+// needed.  SplitKit uses this faster interface when possible.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIVERANGECALC_H
+#define LLVM_CODEGEN_LIVERANGECALC_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/CodeGen/LiveInterval.h"
+
+namespace llvm {
+
+/// Forward declarations for MachineDominators.h:
+class MachineDominatorTree;
+template <class NodeT> class DomTreeNodeBase;
+typedef DomTreeNodeBase<MachineBasicBlock> MachineDomTreeNode;
+
+class LiveRangeCalc {
+  const MachineFunction *MF;
+  const MachineRegisterInfo *MRI;
+  SlotIndexes *Indexes;
+  MachineDominatorTree *DomTree;
+  VNInfo::Allocator *Alloc;
+
+  /// Seen - Bit vector of active entries in LiveOut, also used as a visited
+  /// set by findReachingDefs.  One entry per basic block, indexed by block
+  /// number.  This is kept as a separate bit vector because it can be cleared
+  /// quickly when switching live ranges.
+  BitVector Seen;
+
+  /// LiveOutPair - A value and the block that defined it.  The domtree node is
+  /// redundant, it can be computed as: MDT[Indexes.getMBBFromIndex(VNI->def)].
+  typedef std::pair<VNInfo*, MachineDomTreeNode*> LiveOutPair;
+
+  /// LiveOutMap - Map basic blocks to the value leaving the block.
+  typedef IndexedMap<LiveOutPair, MBB2NumberFunctor> LiveOutMap;
+
+  /// LiveOut - Map each basic block where a live range is live out to the
+  /// live-out value and its defining block.
+  ///
+  /// For every basic block, MBB, one of these conditions shall be true:
+  ///
+  ///  1. !Seen.count(MBB->getNumber())
+  ///     Blocks without a Seen bit are ignored.
+  ///  2. LiveOut[MBB].second.getNode() == MBB
+  ///     The live-out value is defined in MBB.
+  ///  3. forall P in preds(MBB): LiveOut[P] == LiveOut[MBB]
+  ///     The live-out value passses through MBB. All predecessors must carry
+  ///     the same value.
+  ///
+  /// The domtree node may be null, it can be computed.
+  ///
+  /// The map can be shared by multiple live ranges as long as no two are
+  /// live-out of the same block.
+  LiveOutMap LiveOut;
+
+  /// LiveInBlock - Information about a basic block where a live range is known
+  /// to be live-in, but the value has not yet been determined.
+  struct LiveInBlock {
+    // LI - The live range that is live-in to this block.  The algorithms can
+    // handle multiple non-overlapping live ranges simultaneously.
+    LiveInterval *LI;
+
+    // DomNode - Dominator tree node for the block.
+    // Cleared when the final value has been determined and LI has been updated.
+    MachineDomTreeNode *DomNode;
+
+    // Position in block where the live-in range ends, or SlotIndex() if the
+    // range passes through the block.  When the final value has been
+    // determined, the range from the block start to Kill will be added to LI.
+    SlotIndex Kill;
+
+    // Live-in value filled in by updateSSA once it is known.
+    VNInfo *Value;
+
+    LiveInBlock(LiveInterval *li, MachineDomTreeNode *node, SlotIndex kill)
+      : LI(li), DomNode(node), Kill(kill), Value(0) {}
+  };
+
+  /// LiveIn - Work list of blocks where the live-in value has yet to be
+  /// determined.  This list is typically computed by findReachingDefs() and
+  /// used as a work list by updateSSA().  The low-level interface may also be
+  /// used to add entries directly.
+  SmallVector<LiveInBlock, 16> LiveIn;
+
+  /// Assuming that LI is live-in to KillMBB and killed at Kill, find the set
+  /// of defs that can reach it.
+  ///
+  /// If only one def can reach Kill, all paths from the def to kill are added
+  /// to LI, and the function returns true.
+  ///
+  /// If multiple values can reach Kill, the blocks that need LI to be live in
+  /// are added to the LiveIn array, and the function returns false.
+  ///
+  /// PhysReg, when set, is used to verify live-in lists on basic blocks.
+  bool findReachingDefs(LiveInterval *LI,
+                        MachineBasicBlock *KillMBB,
+                        SlotIndex Kill,
+                        unsigned PhysReg);
+
+  /// updateSSA - Compute the values that will be live in to all requested
+  /// blocks in LiveIn.  Create PHI-def values as required to preserve SSA form.
+  ///
+  /// Every live-in block must be jointly dominated by the added live-out
+  /// blocks.  No values are read from the live ranges.
+  void updateSSA();
+
+  /// Add liveness as specified in the LiveIn vector.
+  void updateLiveIns();
+
+public:
+  LiveRangeCalc() : MF(0), MRI(0), Indexes(0), DomTree(0), Alloc(0) {}
+
+  //===--------------------------------------------------------------------===//
+  // High-level interface.
+  //===--------------------------------------------------------------------===//
+  //
+  // Calculate live ranges from scratch.
+  //
+
+  /// reset - Prepare caches for a new set of non-overlapping live ranges.  The
+  /// caches must be reset before attempting calculations with a live range
+  /// that may overlap a previously computed live range, and before the first
+  /// live range in a function.  If live ranges are not known to be
+  /// non-overlapping, call reset before each.
+  void reset(const MachineFunction *MF,
+             SlotIndexes*,
+             MachineDominatorTree*,
+             VNInfo::Allocator*);
+
+  /// calculate - Calculate the live range of a virtual register from its defs
+  /// and uses.  LI must be empty with no values.
+  void calculate(LiveInterval *LI);
+
+  //===--------------------------------------------------------------------===//
+  // Mid-level interface.
+  //===--------------------------------------------------------------------===//
+  //
+  // Modify existing live ranges.
+  //
+
+  /// extend - Extend the live range of LI to reach Kill.
+  ///
+  /// The existing values in LI must be live so they jointly dominate Kill.  If
+  /// Kill is not dominated by a single existing value, PHI-defs are inserted
+  /// as required to preserve SSA form.  If Kill is known to be dominated by a
+  /// single existing value, Alloc may be null.
+  ///
+  /// PhysReg, when set, is used to verify live-in lists on basic blocks.
+  void extend(LiveInterval *LI, SlotIndex Kill, unsigned PhysReg = 0);
+
+  /// createDeadDefs - Create a dead def in LI for every def operand of Reg.
+  /// Each instruction defining Reg gets a new VNInfo with a corresponding
+  /// minimal live range.
+  void createDeadDefs(LiveInterval *LI, unsigned Reg);
+
+  /// createDeadDefs - Create a dead def in LI for every def of LI->reg.
+  void createDeadDefs(LiveInterval *LI) {
+    createDeadDefs(LI, LI->reg);
+  }
+
+  /// extendToUses - Extend the live range of LI to reach all uses of Reg.
+  ///
+  /// All uses must be jointly dominated by existing liveness.  PHI-defs are
+  /// inserted as needed to preserve SSA form.
+  void extendToUses(LiveInterval *LI, unsigned Reg);
+
+  /// extendToUses - Extend the live range of LI to reach all uses of LI->reg.
+  void extendToUses(LiveInterval *LI) {
+    extendToUses(LI, LI->reg);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Low-level interface.
+  //===--------------------------------------------------------------------===//
+  //
+  // These functions can be used to compute live ranges where the live-in and
+  // live-out blocks are already known, but the SSA value in each block is
+  // unknown.
+  //
+  // After calling reset(), add known live-out values and known live-in blocks.
+  // Then call calculateValues() to compute the actual value that is
+  // live-in to each block, and add liveness to the live ranges.
+  //
+
+  /// setLiveOutValue - Indicate that VNI is live out from MBB.  The
+  /// calculateValues() function will not add liveness for MBB, the caller
+  /// should take care of that.
+  ///
+  /// VNI may be null only if MBB is a live-through block also passed to
+  /// addLiveInBlock().
+  void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) {
+    Seen.set(MBB->getNumber());
+    LiveOut[MBB] = LiveOutPair(VNI, (MachineDomTreeNode *)0);
+  }
+
+  /// addLiveInBlock - Add a block with an unknown live-in value.  This
+  /// function can only be called once per basic block.  Once the live-in value
+  /// has been determined, calculateValues() will add liveness to LI.
+  ///
+  /// @param LI      The live range that is live-in to the block.
+  /// @param DomNode The domtree node for the block.
+  /// @param Kill    Index in block where LI is killed.  If the value is
+  ///                live-through, set Kill = SLotIndex() and also call
+  ///                setLiveOutValue(MBB, 0).
+  void addLiveInBlock(LiveInterval *LI,
+                      MachineDomTreeNode *DomNode,
+                      SlotIndex Kill = SlotIndex()) {
+    LiveIn.push_back(LiveInBlock(LI, DomNode, Kill));
+  }
+
+  /// calculateValues - Calculate the value that will be live-in to each block
+  /// added with addLiveInBlock.  Add PHI-def values as needed to preserve SSA
+  /// form.  Add liveness to all live-in blocks up to the Kill point, or the
+  /// whole block for live-through blocks.
+  ///
+  /// Every predecessor of a live-in block must have been given a value with
+  /// setLiveOutValue, the value may be null for live-trough blocks.
+  void calculateValues();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
new file mode 100644
index 000000000000..7793e96c3540
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -0,0 +1,387 @@
+//===-- LiveRangeEdit.cpp - Basic tools for editing a register live range -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LiveRangeEdit class represents changes done to a virtual register when it
+// is spilled or split.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+STATISTIC(NumDCEDeleted,     "Number of instructions deleted by DCE");
+STATISTIC(NumDCEFoldedLoads, "Number of single use loads folded after DCE");
+STATISTIC(NumFracRanges,     "Number of live ranges fractured by DCE");
+
+void LiveRangeEdit::Delegate::anchor() { }
+
+LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg) {
+  unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+  if (VRM) {
+    VRM->grow();
+    VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
+  }
+  LiveInterval &LI = LIS.getOrCreateInterval(VReg);
+  NewRegs.push_back(&LI);
+  return LI;
+}
+
+bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
+                                          const MachineInstr *DefMI,
+                                          AliasAnalysis *aa) {
+  assert(DefMI && "Missing instruction");
+  ScannedRemattable = true;
+  if (!TII.isTriviallyReMaterializable(DefMI, aa))
+    return false;
+  Remattable.insert(VNI);
+  return true;
+}
+
+void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
+  for (LiveInterval::vni_iterator I = getParent().vni_begin(),
+       E = getParent().vni_end(); I != E; ++I) {
+    VNInfo *VNI = *I;
+    if (VNI->isUnused())
+      continue;
+    MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
+    if (!DefMI)
+      continue;
+    checkRematerializable(VNI, DefMI, aa);
+  }
+  ScannedRemattable = true;
+}
+
+bool LiveRangeEdit::anyRematerializable(AliasAnalysis *aa) {
+  if (!ScannedRemattable)
+    scanRemattable(aa);
+  return !Remattable.empty();
+}
+
+/// allUsesAvailableAt - Return true if all registers used by OrigMI at
+/// OrigIdx are also available with the same value at UseIdx.
+bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
+                                       SlotIndex OrigIdx,
+                                       SlotIndex UseIdx) const {
+  OrigIdx = OrigIdx.getRegSlot(true);
+  UseIdx = UseIdx.getRegSlot(true);
+  for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = OrigMI->getOperand(i);
+    if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
+      continue;
+
+    // We can't remat physreg uses, unless it is a constant.
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      if (MRI.isConstantPhysReg(MO.getReg(), *OrigMI->getParent()->getParent()))
+        continue;
+      return false;
+    }
+
+    LiveInterval &li = LIS.getInterval(MO.getReg());
+    const VNInfo *OVNI = li.getVNInfoAt(OrigIdx);
+    if (!OVNI)
+      continue;
+
+    // Don't allow rematerialization immediately after the original def.
+    // It would be incorrect if OrigMI redefines the register.
+    // See PR14098.
+    if (SlotIndex::isSameInstr(OrigIdx, UseIdx))
+      return false;
+
+    if (OVNI != li.getVNInfoAt(UseIdx))
+      return false;
+  }
+  return true;
+}
+
+bool LiveRangeEdit::canRematerializeAt(Remat &RM,
+                                       SlotIndex UseIdx,
+                                       bool cheapAsAMove) {
+  assert(ScannedRemattable && "Call anyRematerializable first");
+
+  // Use scanRemattable info.
+  if (!Remattable.count(RM.ParentVNI))
+    return false;
+
+  // No defining instruction provided.
+  SlotIndex DefIdx;
+  if (RM.OrigMI)
+    DefIdx = LIS.getInstructionIndex(RM.OrigMI);
+  else {
+    DefIdx = RM.ParentVNI->def;
+    RM.OrigMI = LIS.getInstructionFromIndex(DefIdx);
+    assert(RM.OrigMI && "No defining instruction for remattable value");
+  }
+
+  // If only cheap remats were requested, bail out early.
+  if (cheapAsAMove && !RM.OrigMI->isAsCheapAsAMove())
+    return false;
+
+  // Verify that all used registers are available with the same values.
+  if (!allUsesAvailableAt(RM.OrigMI, DefIdx, UseIdx))
+    return false;
+
+  return true;
+}
+
+SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         unsigned DestReg,
+                                         const Remat &RM,
+                                         const TargetRegisterInfo &tri,
+                                         bool Late) {
+  assert(RM.OrigMI && "Invalid remat");
+  TII.reMaterialize(MBB, MI, DestReg, 0, RM.OrigMI, tri);
+  Rematted.insert(RM.ParentVNI);
+  return LIS.getSlotIndexes()->insertMachineInstrInMaps(--MI, Late)
+           .getRegSlot();
+}
+
+void LiveRangeEdit::eraseVirtReg(unsigned Reg) {
+  if (TheDelegate && TheDelegate->LRE_CanEraseVirtReg(Reg))
+    LIS.removeInterval(Reg);
+}
+
+bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
+                               SmallVectorImpl<MachineInstr*> &Dead) {
+  MachineInstr *DefMI = 0, *UseMI = 0;
+
+  // Check that there is a single def and a single use.
+  for (MachineRegisterInfo::reg_nodbg_iterator I = MRI.reg_nodbg_begin(LI->reg),
+       E = MRI.reg_nodbg_end(); I != E; ++I) {
+    MachineOperand &MO = I.getOperand();
+    MachineInstr *MI = MO.getParent();
+    if (MO.isDef()) {
+      if (DefMI && DefMI != MI)
+        return false;
+      if (!MI->canFoldAsLoad())
+        return false;
+      DefMI = MI;
+    } else if (!MO.isUndef()) {
+      if (UseMI && UseMI != MI)
+        return false;
+      // FIXME: Targets don't know how to fold subreg uses.
+      if (MO.getSubReg())
+        return false;
+      UseMI = MI;
+    }
+  }
+  if (!DefMI || !UseMI)
+    return false;
+
+  // Since we're moving the DefMI load, make sure we're not extending any live
+  // ranges.
+  if (!allUsesAvailableAt(DefMI,
+                          LIS.getInstructionIndex(DefMI),
+                          LIS.getInstructionIndex(UseMI)))
+    return false;
+
+  // We also need to make sure it is safe to move the load.
+  // Assume there are stores between DefMI and UseMI.
+  bool SawStore = true;
+  if (!DefMI->isSafeToMove(&TII, 0, SawStore))
+    return false;
+
+  DEBUG(dbgs() << "Try to fold single def: " << *DefMI
+               << "       into single use: " << *UseMI);
+
+  SmallVector<unsigned, 8> Ops;
+  if (UseMI->readsWritesVirtualRegister(LI->reg, &Ops).second)
+    return false;
+
+  MachineInstr *FoldMI = TII.foldMemoryOperand(UseMI, Ops, DefMI);
+  if (!FoldMI)
+    return false;
+  DEBUG(dbgs() << "                folded: " << *FoldMI);
+  LIS.ReplaceMachineInstrInMaps(UseMI, FoldMI);
+  UseMI->eraseFromParent();
+  DefMI->addRegisterDead(LI->reg, 0);
+  Dead.push_back(DefMI);
+  ++NumDCEFoldedLoads;
+  return true;
+}
+
+void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
+                                      ArrayRef<unsigned> RegsBeingSpilled) {
+  SetVector<LiveInterval*,
+            SmallVector<LiveInterval*, 8>,
+            SmallPtrSet<LiveInterval*, 8> > ToShrink;
+
+  for (;;) {
+    // Erase all dead defs.
+    while (!Dead.empty()) {
+      MachineInstr *MI = Dead.pop_back_val();
+      assert(MI->allDefsAreDead() && "Def isn't really dead");
+      SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+
+      // Never delete inline asm.
+      if (MI->isInlineAsm()) {
+        DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI);
+        continue;
+      }
+
+      // Use the same criteria as DeadMachineInstructionElim.
+      bool SawStore = false;
+      if (!MI->isSafeToMove(&TII, 0, SawStore)) {
+        DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
+        continue;
+      }
+
+      DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI);
+
+      // Collect virtual registers to be erased after MI is gone.
+      SmallVector<unsigned, 8> RegsToErase;
+      bool ReadsPhysRegs = false;
+
+      // Check for live intervals that may shrink
+      for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+             MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+        if (!MOI->isReg())
+          continue;
+        unsigned Reg = MOI->getReg();
+        if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+          // Check if MI reads any unreserved physregs.
+          if (Reg && MOI->readsReg() && !MRI.isReserved(Reg))
+            ReadsPhysRegs = true;
+          continue;
+        }
+        LiveInterval &LI = LIS.getInterval(Reg);
+
+        // Shrink read registers, unless it is likely to be expensive and
+        // unlikely to change anything. We typically don't want to shrink the
+        // PIC base register that has lots of uses everywhere.
+        // Always shrink COPY uses that probably come from live range splitting.
+        if (MI->readsVirtualRegister(Reg) &&
+            (MI->isCopy() || MOI->isDef() || MRI.hasOneNonDBGUse(Reg) ||
+             LI.killedAt(Idx)))
+          ToShrink.insert(&LI);
+
+        // Remove defined value.
+        if (MOI->isDef()) {
+          if (VNInfo *VNI = LI.getVNInfoAt(Idx)) {
+            if (TheDelegate)
+              TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
+            LI.removeValNo(VNI);
+            if (LI.empty())
+              RegsToErase.push_back(Reg);
+          }
+        }
+      }
+
+      // Currently, we don't support DCE of physreg live ranges. If MI reads
+      // any unreserved physregs, don't erase the instruction, but turn it into
+      // a KILL instead. This way, the physreg live ranges don't end up
+      // dangling.
+      // FIXME: It would be better to have something like shrinkToUses() for
+      // physregs. That could potentially enable more DCE and it would free up
+      // the physreg. It would not happen often, though.
+      if (ReadsPhysRegs) {
+        MI->setDesc(TII.get(TargetOpcode::KILL));
+        // Remove all operands that aren't physregs.
+        for (unsigned i = MI->getNumOperands(); i; --i) {
+          const MachineOperand &MO = MI->getOperand(i-1);
+          if (MO.isReg() && TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+            continue;
+          MI->RemoveOperand(i-1);
+        }
+        DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
+      } else {
+        if (TheDelegate)
+          TheDelegate->LRE_WillEraseInstruction(MI);
+        LIS.RemoveMachineInstrFromMaps(MI);
+        MI->eraseFromParent();
+        ++NumDCEDeleted;
+      }
+
+      // Erase any virtregs that are now empty and unused. There may be <undef>
+      // uses around. Keep the empty live range in that case.
+      for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) {
+        unsigned Reg = RegsToErase[i];
+        if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) {
+          ToShrink.remove(&LIS.getInterval(Reg));
+          eraseVirtReg(Reg);
+        }
+      }
+    }
+
+    if (ToShrink.empty())
+      break;
+
+    // Shrink just one live interval. Then delete new dead defs.
+    LiveInterval *LI = ToShrink.back();
+    ToShrink.pop_back();
+    if (foldAsLoad(LI, Dead))
+      continue;
+    if (TheDelegate)
+      TheDelegate->LRE_WillShrinkVirtReg(LI->reg);
+    if (!LIS.shrinkToUses(LI, &Dead))
+      continue;
+    
+    // Don't create new intervals for a register being spilled.
+    // The new intervals would have to be spilled anyway so its not worth it.
+    // Also they currently aren't spilled so creating them and not spilling
+    // them results in incorrect code.
+    bool BeingSpilled = false;
+    for (unsigned i = 0, e = RegsBeingSpilled.size(); i != e; ++i) {
+      if (LI->reg == RegsBeingSpilled[i]) {
+        BeingSpilled = true;
+        break;
+      }
+    }
+    
+    if (BeingSpilled) continue;
+
+    // LI may have been separated, create new intervals.
+    LI->RenumberValues(LIS);
+    ConnectedVNInfoEqClasses ConEQ(LIS);
+    unsigned NumComp = ConEQ.Classify(LI);
+    if (NumComp <= 1)
+      continue;
+    ++NumFracRanges;
+    bool IsOriginal = VRM && VRM->getOriginal(LI->reg) == LI->reg;
+    DEBUG(dbgs() << NumComp << " components: " << *LI << '\n');
+    SmallVector<LiveInterval*, 8> Dups(1, LI);
+    for (unsigned i = 1; i != NumComp; ++i) {
+      Dups.push_back(&createFrom(LI->reg));
+      // If LI is an original interval that hasn't been split yet, make the new
+      // intervals their own originals instead of referring to LI. The original
+      // interval must contain all the split products, and LI doesn't.
+      if (IsOriginal)
+        VRM->setIsSplitFromReg(Dups.back()->reg, 0);
+      if (TheDelegate)
+        TheDelegate->LRE_DidCloneVirtReg(Dups.back()->reg, LI->reg);
+    }
+    ConEQ.Distribute(&Dups[0], MRI);
+    DEBUG({
+      for (unsigned i = 0; i != NumComp; ++i)
+        dbgs() << '\t' << *Dups[i] << '\n';
+    });
+  }
+}
+
+void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
+                                             const MachineLoopInfo &Loops) {
+  VirtRegAuxInfo VRAI(MF, LIS, Loops);
+  for (iterator I = begin(), E = end(); I != E; ++I) {
+    LiveInterval &LI = **I;
+    if (MRI.recomputeRegClass(LI.reg, MF.getTarget()))
+      DEBUG(dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
+                   << MRI.getRegClass(LI.reg)->getName() << '\n');
+    VRAI.CalculateWeightAndHint(LI);
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
new file mode 100644
index 000000000000..0ef069f47827
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -0,0 +1,154 @@
+//===-- LiveRegMatrix.cpp - Track register interference -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the LiveRegMatrix analysis pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+STATISTIC(NumAssigned   , "Number of registers assigned");
+STATISTIC(NumUnassigned , "Number of registers unassigned");
+
+char LiveRegMatrix::ID = 0;
+INITIALIZE_PASS_BEGIN(LiveRegMatrix, "liveregmatrix",
+                      "Live Register Matrix", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix",
+                    "Live Register Matrix", false, false)
+
+LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID),
+  UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {}
+
+void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequiredTransitive<LiveIntervals>();
+  AU.addRequiredTransitive<VirtRegMap>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) {
+  TRI = MF.getTarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  LIS = &getAnalysis<LiveIntervals>();
+  VRM = &getAnalysis<VirtRegMap>();
+
+  unsigned NumRegUnits = TRI->getNumRegUnits();
+  if (NumRegUnits != Matrix.size())
+    Queries.reset(new LiveIntervalUnion::Query[NumRegUnits]);
+  Matrix.init(LIUAlloc, NumRegUnits);
+
+  // Make sure no stale queries get reused.
+  invalidateVirtRegs();
+  return false;
+}
+
+void LiveRegMatrix::releaseMemory() {
+  for (unsigned i = 0, e = Matrix.size(); i != e; ++i) {
+    Matrix[i].clear();
+    Queries[i].clear();
+  }
+}
+
+void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
+  DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI)
+               << " to " << PrintReg(PhysReg, TRI) << ':');
+  assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
+  VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
+  MRI->setPhysRegUsed(PhysReg);
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI));
+    Matrix[*Units].unify(VirtReg);
+  }
+  ++NumAssigned;
+  DEBUG(dbgs() << '\n');
+}
+
+void LiveRegMatrix::unassign(LiveInterval &VirtReg) {
+  unsigned PhysReg = VRM->getPhys(VirtReg.reg);
+  DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI)
+               << " from " << PrintReg(PhysReg, TRI) << ':');
+  VRM->clearVirt(VirtReg.reg);
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI));
+    Matrix[*Units].extract(VirtReg);
+  }
+  ++NumUnassigned;
+  DEBUG(dbgs() << '\n');
+}
+
+bool LiveRegMatrix::checkRegMaskInterference(LiveInterval &VirtReg,
+                                             unsigned PhysReg) {
+  // Check if the cached information is valid.
+  // The same BitVector can be reused for all PhysRegs.
+  // We could cache multiple VirtRegs if it becomes necessary.
+  if (RegMaskVirtReg != VirtReg.reg || RegMaskTag != UserTag) {
+    RegMaskVirtReg = VirtReg.reg;
+    RegMaskTag = UserTag;
+    RegMaskUsable.clear();
+    LIS->checkRegMaskInterference(VirtReg, RegMaskUsable);
+  }
+
+  // The BitVector is indexed by PhysReg, not register unit.
+  // Regmask interference is more fine grained than regunits.
+  // For example, a Win64 call can clobber %ymm8 yet preserve %xmm8.
+  return !RegMaskUsable.empty() && (!PhysReg || !RegMaskUsable.test(PhysReg));
+}
+
+bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
+                                             unsigned PhysReg) {
+  if (VirtReg.empty())
+    return false;
+  CoalescerPair CP(VirtReg.reg, PhysReg, *TRI);
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+    if (VirtReg.overlaps(LIS->getRegUnit(*Units), CP, *LIS->getSlotIndexes()))
+      return true;
+  return false;
+}
+
+LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg,
+                                               unsigned RegUnit) {
+  LiveIntervalUnion::Query &Q = Queries[RegUnit];
+  Q.init(UserTag, &VirtReg, &Matrix[RegUnit]);
+  return Q;
+}
+
+LiveRegMatrix::InterferenceKind
+LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
+  if (VirtReg.empty())
+    return IK_Free;
+
+  // Regmask interference is the fastest check.
+  if (checkRegMaskInterference(VirtReg, PhysReg))
+    return IK_RegMask;
+
+  // Check for fixed interference.
+  if (checkRegUnitInterference(VirtReg, PhysReg))
+    return IK_RegUnit;
+
+  // Check the matrix for virtual register interference.
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+    if (query(VirtReg, *Units).checkInterference())
+      return IK_VirtReg;
+
+  return IK_Free;
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp
new file mode 100644
index 000000000000..be11a8fa86ef
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp
@@ -0,0 +1,86 @@
+//===-- LiveStackAnalysis.cpp - Live Stack Slot Analysis ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the live stack slot analysis pass. It is analogous to
+// live interval analysis except it's analyzing liveness of stack slots rather
+// than registers.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "livestacks"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <limits>
+using namespace llvm;
+
+char LiveStacks::ID = 0;
+INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks",
+                "Live Stack Slot Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(LiveStacks, "livestacks",
+                "Live Stack Slot Analysis", false, false)
+
+char &llvm::LiveStacksID = LiveStacks::ID;
+
+void LiveStacks::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequiredTransitive<SlotIndexes>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LiveStacks::releaseMemory() {
+  // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
+  VNInfoAllocator.Reset();
+  S2IMap.clear();
+  S2RCMap.clear();
+}
+
+bool LiveStacks::runOnMachineFunction(MachineFunction &MF) {
+  TRI = MF.getTarget().getRegisterInfo();
+  // FIXME: No analysis is being done right now. We are relying on the
+  // register allocators to provide the information.
+  return false;
+}
+
+LiveInterval &
+LiveStacks::getOrCreateInterval(int Slot, const TargetRegisterClass *RC) {
+  assert(Slot >= 0 && "Spill slot indice must be >= 0");
+  SS2IntervalMap::iterator I = S2IMap.find(Slot);
+  if (I == S2IMap.end()) {
+    I = S2IMap.insert(I, std::make_pair(Slot,
+            LiveInterval(TargetRegisterInfo::index2StackSlot(Slot), 0.0F)));
+    S2RCMap.insert(std::make_pair(Slot, RC));
+  } else {
+    // Use the largest common subclass register class.
+    const TargetRegisterClass *OldRC = S2RCMap[Slot];
+    S2RCMap[Slot] = TRI->getCommonSubClass(OldRC, RC);
+  }
+  return I->second;
+}
+
+/// print - Implement the dump method.
+void LiveStacks::print(raw_ostream &OS, const Module*) const {
+
+  OS << "********** INTERVALS **********\n";
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    I->second.print(OS);
+    int Slot = I->first;
+    const TargetRegisterClass *RC = getIntervalRegClass(Slot);
+    if (RC)
+      OS << " [" << RC->getName() << "]\n";
+    else
+      OS << " [Unknown]\n";
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm/lib/CodeGen/LiveVariables.cpp
new file mode 100644
index 000000000000..789eddc42774
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveVariables.cpp
@@ -0,0 +1,826 @@
+//===-- LiveVariables.cpp - Live Variable Analysis for Machine Code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveVariable analysis pass.  For each machine
+// instruction in the function, this pass calculates the set of registers that
+// are immediately dead after the instruction (i.e., the instruction calculates
+// the value, but it is never used) and the set of registers that are used by
+// the instruction, but are never used after the instruction (i.e., they are
+// killed).
+//
+// This class computes live variables using a sparse implementation based on
+// the machine code SSA form.  This class computes live variable information for
+// each virtual and _register allocatable_ physical register in a function.  It
+// uses the dominance properties of SSA form to efficiently compute live
+// variables for virtual registers, and assumes that physical registers are only
+// live within a single basic block (allowing it to do a single local analysis
+// to resolve physical register lifetimes in each basic block).  If a physical
+// register is not register allocatable, it is not tracked.  This is useful for
+// things like the stack pointer and condition codes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+char LiveVariables::ID = 0;
+char &llvm::LiveVariablesID = LiveVariables::ID;
+INITIALIZE_PASS_BEGIN(LiveVariables, "livevars",
+                "Live Variable Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(UnreachableMachineBlockElim)
+INITIALIZE_PASS_END(LiveVariables, "livevars",
+                "Live Variable Analysis", false, false)
+
+
+void LiveVariables::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequiredID(UnreachableMachineBlockElimID);
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *
+LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {
+  for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+    if (Kills[i]->getParent() == MBB)
+      return Kills[i];
+  return NULL;
+}
+
+void LiveVariables::VarInfo::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  dbgs() << "  Alive in blocks: ";
+  for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
+           E = AliveBlocks.end(); I != E; ++I)
+    dbgs() << *I << ", ";
+  dbgs() << "\n  Killed by:";
+  if (Kills.empty())
+    dbgs() << " No instructions.\n";
+  else {
+    for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+      dbgs() << "\n    #" << i << ": " << *Kills[i];
+    dbgs() << "\n";
+  }
+#endif
+}
+
+/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
+LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) {
+  assert(TargetRegisterInfo::isVirtualRegister(RegIdx) &&
+         "getVarInfo: not a virtual register!");
+  VirtRegInfo.grow(RegIdx);
+  return VirtRegInfo[RegIdx];
+}
+
+void LiveVariables::MarkVirtRegAliveInBlock(VarInfo& VRInfo,
+                                            MachineBasicBlock *DefBlock,
+                                            MachineBasicBlock *MBB,
+                                    std::vector<MachineBasicBlock*> &WorkList) {
+  unsigned BBNum = MBB->getNumber();
+
+  // Check to see if this basic block is one of the killing blocks.  If so,
+  // remove it.
+  for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i)
+    if (VRInfo.Kills[i]->getParent() == MBB) {
+      VRInfo.Kills.erase(VRInfo.Kills.begin()+i);  // Erase entry
+      break;
+    }
+
+  if (MBB == DefBlock) return;  // Terminate recursion
+
+  if (VRInfo.AliveBlocks.test(BBNum))
+    return;  // We already know the block is live
+
+  // Mark the variable known alive in this bb
+  VRInfo.AliveBlocks.set(BBNum);
+
+  assert(MBB != &MF->front() && "Can't find reaching def for virtreg");
+  WorkList.insert(WorkList.end(), MBB->pred_rbegin(), MBB->pred_rend());
+}
+
+void LiveVariables::MarkVirtRegAliveInBlock(VarInfo &VRInfo,
+                                            MachineBasicBlock *DefBlock,
+                                            MachineBasicBlock *MBB) {
+  std::vector<MachineBasicBlock*> WorkList;
+  MarkVirtRegAliveInBlock(VRInfo, DefBlock, MBB, WorkList);
+
+  while (!WorkList.empty()) {
+    MachineBasicBlock *Pred = WorkList.back();
+    WorkList.pop_back();
+    MarkVirtRegAliveInBlock(VRInfo, DefBlock, Pred, WorkList);
+  }
+}
+
+void LiveVariables::HandleVirtRegUse(unsigned reg, MachineBasicBlock *MBB,
+                                     MachineInstr *MI) {
+  assert(MRI->getVRegDef(reg) && "Register use before def!");
+
+  unsigned BBNum = MBB->getNumber();
+
+  VarInfo& VRInfo = getVarInfo(reg);
+
+  // Check to see if this basic block is already a kill block.
+  if (!VRInfo.Kills.empty() && VRInfo.Kills.back()->getParent() == MBB) {
+    // Yes, this register is killed in this basic block already. Increase the
+    // live range by updating the kill instruction.
+    VRInfo.Kills.back() = MI;
+    return;
+  }
+
+#ifndef NDEBUG
+  for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i)
+    assert(VRInfo.Kills[i]->getParent() != MBB && "entry should be at end!");
+#endif
+
+  // This situation can occur:
+  //
+  //     ,------.
+  //     |      |
+  //     |      v
+  //     |   t2 = phi ... t1 ...
+  //     |      |
+  //     |      v
+  //     |   t1 = ...
+  //     |  ... = ... t1 ...
+  //     |      |
+  //     `------'
+  //
+  // where there is a use in a PHI node that's a predecessor to the defining
+  // block. We don't want to mark all predecessors as having the value "alive"
+  // in this case.
+  if (MBB == MRI->getVRegDef(reg)->getParent()) return;
+
+  // Add a new kill entry for this basic block. If this virtual register is
+  // already marked as alive in this basic block, that means it is alive in at
+  // least one of the successor blocks, it's not a kill.
+  if (!VRInfo.AliveBlocks.test(BBNum))
+    VRInfo.Kills.push_back(MI);
+
+  // Update all dominating blocks to mark them as "known live".
+  for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+         E = MBB->pred_end(); PI != E; ++PI)
+    MarkVirtRegAliveInBlock(VRInfo, MRI->getVRegDef(reg)->getParent(), *PI);
+}
+
+void LiveVariables::HandleVirtRegDef(unsigned Reg, MachineInstr *MI) {
+  VarInfo &VRInfo = getVarInfo(Reg);
+
+  if (VRInfo.AliveBlocks.empty())
+    // If vr is not alive in any block, then defaults to dead.
+    VRInfo.Kills.push_back(MI);
+}
+
+/// FindLastPartialDef - Return the last partial def of the specified register.
+/// Also returns the sub-registers that're defined by the instruction.
+MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg,
+                                            SmallSet<unsigned,4> &PartDefRegs) {
+  unsigned LastDefReg = 0;
+  unsigned LastDefDist = 0;
+  MachineInstr *LastDef = NULL;
+  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+    unsigned SubReg = *SubRegs;
+    MachineInstr *Def = PhysRegDef[SubReg];
+    if (!Def)
+      continue;
+    unsigned Dist = DistanceMap[Def];
+    if (Dist > LastDefDist) {
+      LastDefReg  = SubReg;
+      LastDef     = Def;
+      LastDefDist = Dist;
+    }
+  }
+
+  if (!LastDef)
+    return 0;
+
+  PartDefRegs.insert(LastDefReg);
+  for (unsigned i = 0, e = LastDef->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = LastDef->getOperand(i);
+    if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0)
+      continue;
+    unsigned DefReg = MO.getReg();
+    if (TRI->isSubRegister(Reg, DefReg)) {
+      PartDefRegs.insert(DefReg);
+      for (MCSubRegIterator SubRegs(DefReg, TRI); SubRegs.isValid(); ++SubRegs)
+        PartDefRegs.insert(*SubRegs);
+    }
+  }
+  return LastDef;
+}
+
+/// HandlePhysRegUse - Turn previous partial def's into read/mod/writes. Add
+/// implicit defs to a machine instruction if there was an earlier def of its
+/// super-register.
+void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) {
+  MachineInstr *LastDef = PhysRegDef[Reg];
+  // If there was a previous use or a "full" def all is well.
+  if (!LastDef && !PhysRegUse[Reg]) {
+    // Otherwise, the last sub-register def implicitly defines this register.
+    // e.g.
+    // AH =
+    // AL = ... <imp-def EAX>, <imp-kill AH>
+    //    = AH
+    // ...
+    //    = EAX
+    // All of the sub-registers must have been defined before the use of Reg!
+    SmallSet<unsigned, 4> PartDefRegs;
+    MachineInstr *LastPartialDef = FindLastPartialDef(Reg, PartDefRegs);
+    // If LastPartialDef is NULL, it must be using a livein register.
+    if (LastPartialDef) {
+      LastPartialDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/,
+                                                           true/*IsImp*/));
+      PhysRegDef[Reg] = LastPartialDef;
+      SmallSet<unsigned, 8> Processed;
+      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+        unsigned SubReg = *SubRegs;
+        if (Processed.count(SubReg))
+          continue;
+        if (PartDefRegs.count(SubReg))
+          continue;
+        // This part of Reg was defined before the last partial def. It's killed
+        // here.
+        LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg,
+                                                             false/*IsDef*/,
+                                                             true/*IsImp*/));
+        PhysRegDef[SubReg] = LastPartialDef;
+        for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+          Processed.insert(*SS);
+      }
+    }
+  } else if (LastDef && !PhysRegUse[Reg] &&
+             !LastDef->findRegisterDefOperand(Reg))
+    // Last def defines the super register, add an implicit def of reg.
+    LastDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/,
+                                                  true/*IsImp*/));
+
+  // Remember this use.
+  PhysRegUse[Reg]  = MI;
+  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+    PhysRegUse[*SubRegs] =  MI;
+}
+
+/// FindLastRefOrPartRef - Return the last reference or partial reference of
+/// the specified register.
+MachineInstr *LiveVariables::FindLastRefOrPartRef(unsigned Reg) {
+  MachineInstr *LastDef = PhysRegDef[Reg];
+  MachineInstr *LastUse = PhysRegUse[Reg];
+  if (!LastDef && !LastUse)
+    return 0;
+
+  MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef;
+  unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
+  unsigned LastPartDefDist = 0;
+  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+    unsigned SubReg = *SubRegs;
+    MachineInstr *Def = PhysRegDef[SubReg];
+    if (Def && Def != LastDef) {
+      // There was a def of this sub-register in between. This is a partial
+      // def, keep track of the last one.
+      unsigned Dist = DistanceMap[Def];
+      if (Dist > LastPartDefDist)
+        LastPartDefDist = Dist;
+    } else if (MachineInstr *Use = PhysRegUse[SubReg]) {
+      unsigned Dist = DistanceMap[Use];
+      if (Dist > LastRefOrPartRefDist) {
+        LastRefOrPartRefDist = Dist;
+        LastRefOrPartRef = Use;
+      }
+    }
+  }
+
+  return LastRefOrPartRef;
+}
+
+bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
+  MachineInstr *LastDef = PhysRegDef[Reg];
+  MachineInstr *LastUse = PhysRegUse[Reg];
+  if (!LastDef && !LastUse)
+    return false;
+
+  MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef;
+  unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
+  // The whole register is used.
+  // AL =
+  // AH =
+  //
+  //    = AX
+  //    = AL, AX<imp-use, kill>
+  // AX =
+  //
+  // Or whole register is defined, but not used at all.
+  // AX<dead> =
+  // ...
+  // AX =
+  //
+  // Or whole register is defined, but only partly used.
+  // AX<dead> = AL<imp-def>
+  //    = AL<kill>
+  // AX =
+  MachineInstr *LastPartDef = 0;
+  unsigned LastPartDefDist = 0;
+  SmallSet<unsigned, 8> PartUses;
+  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+    unsigned SubReg = *SubRegs;
+    MachineInstr *Def = PhysRegDef[SubReg];
+    if (Def && Def != LastDef) {
+      // There was a def of this sub-register in between. This is a partial
+      // def, keep track of the last one.
+      unsigned Dist = DistanceMap[Def];
+      if (Dist > LastPartDefDist) {
+        LastPartDefDist = Dist;
+        LastPartDef = Def;
+      }
+      continue;
+    }
+    if (MachineInstr *Use = PhysRegUse[SubReg]) {
+      PartUses.insert(SubReg);
+      for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+        PartUses.insert(*SS);
+      unsigned Dist = DistanceMap[Use];
+      if (Dist > LastRefOrPartRefDist) {
+        LastRefOrPartRefDist = Dist;
+        LastRefOrPartRef = Use;
+      }
+    }
+  }
+
+  if (!PhysRegUse[Reg]) {
+    // Partial uses. Mark register def dead and add implicit def of
+    // sub-registers which are used.
+    // EAX<dead>  = op  AL<imp-def>
+    // That is, EAX def is dead but AL def extends pass it.
+    PhysRegDef[Reg]->addRegisterDead(Reg, TRI, true);
+    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+      unsigned SubReg = *SubRegs;
+      if (!PartUses.count(SubReg))
+        continue;
+      bool NeedDef = true;
+      if (PhysRegDef[Reg] == PhysRegDef[SubReg]) {
+        MachineOperand *MO = PhysRegDef[Reg]->findRegisterDefOperand(SubReg);
+        if (MO) {
+          NeedDef = false;
+          assert(!MO->isDead());
+        }
+      }
+      if (NeedDef)
+        PhysRegDef[Reg]->addOperand(MachineOperand::CreateReg(SubReg,
+                                                 true/*IsDef*/, true/*IsImp*/));
+      MachineInstr *LastSubRef = FindLastRefOrPartRef(SubReg);
+      if (LastSubRef)
+        LastSubRef->addRegisterKilled(SubReg, TRI, true);
+      else {
+        LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true);
+        PhysRegUse[SubReg] = LastRefOrPartRef;
+        for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+          PhysRegUse[*SS] = LastRefOrPartRef;
+      }
+      for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+        PartUses.erase(*SS);
+    }
+  } else if (LastRefOrPartRef == PhysRegDef[Reg] && LastRefOrPartRef != MI) {
+    if (LastPartDef)
+      // The last partial def kills the register.
+      LastPartDef->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
+                                                true/*IsImp*/, true/*IsKill*/));
+    else {
+      MachineOperand *MO =
+        LastRefOrPartRef->findRegisterDefOperand(Reg, false, TRI);
+      bool NeedEC = MO->isEarlyClobber() && MO->getReg() != Reg;
+      // If the last reference is the last def, then it's not used at all.
+      // That is, unless we are currently processing the last reference itself.
+      LastRefOrPartRef->addRegisterDead(Reg, TRI, true);
+      if (NeedEC) {
+        // If we are adding a subreg def and the superreg def is marked early
+        // clobber, add an early clobber marker to the subreg def.
+        MO = LastRefOrPartRef->findRegisterDefOperand(Reg);
+        if (MO)
+          MO->setIsEarlyClobber();
+      }
+    }
+  } else
+    LastRefOrPartRef->addRegisterKilled(Reg, TRI, true);
+  return true;
+}
+
+void LiveVariables::HandleRegMask(const MachineOperand &MO) {
+  // Call HandlePhysRegKill() for all live registers clobbered by Mask.
+  // Clobbered registers are always dead, sp there is no need to use
+  // HandlePhysRegDef().
+  for (unsigned Reg = 1, NumRegs = TRI->getNumRegs(); Reg != NumRegs; ++Reg) {
+    // Skip dead regs.
+    if (!PhysRegDef[Reg] && !PhysRegUse[Reg])
+      continue;
+    // Skip mask-preserved regs.
+    if (!MO.clobbersPhysReg(Reg))
+      continue;
+    // Kill the largest clobbered super-register.
+    // This avoids needless implicit operands.
+    unsigned Super = Reg;
+    for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
+      if ((PhysRegDef[*SR] || PhysRegUse[*SR]) && MO.clobbersPhysReg(*SR))
+        Super = *SR;
+    HandlePhysRegKill(Super, 0);
+  }
+}
+
+void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
+                                     SmallVector<unsigned, 4> &Defs) {
+  // What parts of the register are previously defined?
+  SmallSet<unsigned, 32> Live;
+  if (PhysRegDef[Reg] || PhysRegUse[Reg]) {
+    Live.insert(Reg);
+    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+      Live.insert(*SubRegs);
+  } else {
+    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+      unsigned SubReg = *SubRegs;
+      // If a register isn't itself defined, but all parts that make up of it
+      // are defined, then consider it also defined.
+      // e.g.
+      // AL =
+      // AH =
+      //    = AX
+      if (Live.count(SubReg))
+        continue;
+      if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) {
+        Live.insert(SubReg);
+        for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+          Live.insert(*SS);
+      }
+    }
+  }
+
+  // Start from the largest piece, find the last time any part of the register
+  // is referenced.
+  HandlePhysRegKill(Reg, MI);
+  // Only some of the sub-registers are used.
+  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+    unsigned SubReg = *SubRegs;
+    if (!Live.count(SubReg))
+      // Skip if this sub-register isn't defined.
+      continue;
+    HandlePhysRegKill(SubReg, MI);
+  }
+
+  if (MI)
+    Defs.push_back(Reg);  // Remember this def.
+}
+
+void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI,
+                                      SmallVector<unsigned, 4> &Defs) {
+  while (!Defs.empty()) {
+    unsigned Reg = Defs.back();
+    Defs.pop_back();
+    PhysRegDef[Reg]  = MI;
+    PhysRegUse[Reg]  = NULL;
+    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+      unsigned SubReg = *SubRegs;
+      PhysRegDef[SubReg]  = MI;
+      PhysRegUse[SubReg]  = NULL;
+    }
+  }
+}
+
+bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  MRI = &mf.getRegInfo();
+  TRI = MF->getTarget().getRegisterInfo();
+
+  unsigned NumRegs = TRI->getNumRegs();
+  PhysRegDef  = new MachineInstr*[NumRegs];
+  PhysRegUse  = new MachineInstr*[NumRegs];
+  PHIVarInfo = new SmallVector<unsigned, 4>[MF->getNumBlockIDs()];
+  std::fill(PhysRegDef,  PhysRegDef  + NumRegs, (MachineInstr*)0);
+  std::fill(PhysRegUse,  PhysRegUse  + NumRegs, (MachineInstr*)0);
+  PHIJoins.clear();
+
+  // FIXME: LiveIntervals will be updated to remove its dependence on
+  // LiveVariables to improve compilation time and eliminate bizarre pass
+  // dependencies. Until then, we can't change much in -O0.
+  if (!MRI->isSSA())
+    report_fatal_error("regalloc=... not currently supported with -O0");
+
+  analyzePHINodes(mf);
+
+  // Calculate live variable information in depth first order on the CFG of the
+  // function.  This guarantees that we will see the definition of a virtual
+  // register before its uses due to dominance properties of SSA (except for PHI
+  // nodes, which are treated as a special case).
+  MachineBasicBlock *Entry = MF->begin();
+  SmallPtrSet<MachineBasicBlock*,16> Visited;
+
+  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*,16> >
+         DFI = df_ext_begin(Entry, Visited), E = df_ext_end(Entry, Visited);
+       DFI != E; ++DFI) {
+    MachineBasicBlock *MBB = *DFI;
+
+    // Mark live-in registers as live-in.
+    SmallVector<unsigned, 4> Defs;
+    for (MachineBasicBlock::livein_iterator II = MBB->livein_begin(),
+           EE = MBB->livein_end(); II != EE; ++II) {
+      assert(TargetRegisterInfo::isPhysicalRegister(*II) &&
+             "Cannot have a live-in virtual register!");
+      HandlePhysRegDef(*II, 0, Defs);
+    }
+
+    // Loop over all of the instructions, processing them.
+    DistanceMap.clear();
+    unsigned Dist = 0;
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+         I != E; ++I) {
+      MachineInstr *MI = I;
+      if (MI->isDebugValue())
+        continue;
+      DistanceMap.insert(std::make_pair(MI, Dist++));
+
+      // Process all of the operands of the instruction...
+      unsigned NumOperandsToProcess = MI->getNumOperands();
+
+      // Unless it is a PHI node.  In this case, ONLY process the DEF, not any
+      // of the uses.  They will be handled in other basic blocks.
+      if (MI->isPHI())
+        NumOperandsToProcess = 1;
+
+      // Clear kill and dead markers. LV will recompute them.
+      SmallVector<unsigned, 4> UseRegs;
+      SmallVector<unsigned, 4> DefRegs;
+      SmallVector<unsigned, 1> RegMasks;
+      for (unsigned i = 0; i != NumOperandsToProcess; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (MO.isRegMask()) {
+          RegMasks.push_back(i);
+          continue;
+        }
+        if (!MO.isReg() || MO.getReg() == 0)
+          continue;
+        unsigned MOReg = MO.getReg();
+        if (MO.isUse()) {
+          MO.setIsKill(false);
+          if (MO.readsReg())
+            UseRegs.push_back(MOReg);
+        } else /*MO.isDef()*/ {
+          MO.setIsDead(false);
+          DefRegs.push_back(MOReg);
+        }
+      }
+
+      // Process all uses.
+      for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) {
+        unsigned MOReg = UseRegs[i];
+        if (TargetRegisterInfo::isVirtualRegister(MOReg))
+          HandleVirtRegUse(MOReg, MBB, MI);
+        else if (!MRI->isReserved(MOReg))
+          HandlePhysRegUse(MOReg, MI);
+      }
+
+      // Process all masked registers. (Call clobbers).
+      for (unsigned i = 0, e = RegMasks.size(); i != e; ++i)
+        HandleRegMask(MI->getOperand(RegMasks[i]));
+
+      // Process all defs.
+      for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) {
+        unsigned MOReg = DefRegs[i];
+        if (TargetRegisterInfo::isVirtualRegister(MOReg))
+          HandleVirtRegDef(MOReg, MI);
+        else if (!MRI->isReserved(MOReg))
+          HandlePhysRegDef(MOReg, MI, Defs);
+      }
+      UpdatePhysRegDefs(MI, Defs);
+    }
+
+    // Handle any virtual assignments from PHI nodes which might be at the
+    // bottom of this basic block.  We check all of our successor blocks to see
+    // if they have PHI nodes, and if so, we simulate an assignment at the end
+    // of the current block.
+    if (!PHIVarInfo[MBB->getNumber()].empty()) {
+      SmallVector<unsigned, 4>& VarInfoVec = PHIVarInfo[MBB->getNumber()];
+
+      for (SmallVector<unsigned, 4>::iterator I = VarInfoVec.begin(),
+             E = VarInfoVec.end(); I != E; ++I)
+        // Mark it alive only in the block we are representing.
+        MarkVirtRegAliveInBlock(getVarInfo(*I),MRI->getVRegDef(*I)->getParent(),
+                                MBB);
+    }
+
+    // MachineCSE may CSE instructions which write to non-allocatable physical
+    // registers across MBBs. Remember if any reserved register is liveout.
+    SmallSet<unsigned, 4> LiveOuts;
+    for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+           SE = MBB->succ_end(); SI != SE; ++SI) {
+      MachineBasicBlock *SuccMBB = *SI;
+      if (SuccMBB->isLandingPad())
+        continue;
+      for (MachineBasicBlock::livein_iterator LI = SuccMBB->livein_begin(),
+             LE = SuccMBB->livein_end(); LI != LE; ++LI) {
+        unsigned LReg = *LI;
+        if (!TRI->isInAllocatableClass(LReg))
+          // Ignore other live-ins, e.g. those that are live into landing pads.
+          LiveOuts.insert(LReg);
+      }
+    }
+
+    // Loop over PhysRegDef / PhysRegUse, killing any registers that are
+    // available at the end of the basic block.
+    for (unsigned i = 0; i != NumRegs; ++i)
+      if ((PhysRegDef[i] || PhysRegUse[i]) && !LiveOuts.count(i))
+        HandlePhysRegDef(i, 0, Defs);
+
+    std::fill(PhysRegDef,  PhysRegDef  + NumRegs, (MachineInstr*)0);
+    std::fill(PhysRegUse,  PhysRegUse  + NumRegs, (MachineInstr*)0);
+  }
+
+  // Convert and transfer the dead / killed information we have gathered into
+  // VirtRegInfo onto MI's.
+  for (unsigned i = 0, e1 = VirtRegInfo.size(); i != e1; ++i) {
+    const unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    for (unsigned j = 0, e2 = VirtRegInfo[Reg].Kills.size(); j != e2; ++j)
+      if (VirtRegInfo[Reg].Kills[j] == MRI->getVRegDef(Reg))
+        VirtRegInfo[Reg].Kills[j]->addRegisterDead(Reg, TRI);
+      else
+        VirtRegInfo[Reg].Kills[j]->addRegisterKilled(Reg, TRI);
+  }
+
+  // Check to make sure there are no unreachable blocks in the MC CFG for the
+  // function.  If so, it is due to a bug in the instruction selector or some
+  // other part of the code generator if this happens.
+#ifndef NDEBUG
+  for(MachineFunction::iterator i = MF->begin(), e = MF->end(); i != e; ++i)
+    assert(Visited.count(&*i) != 0 && "unreachable basic block found");
+#endif
+
+  delete[] PhysRegDef;
+  delete[] PhysRegUse;
+  delete[] PHIVarInfo;
+
+  return false;
+}
+
+/// replaceKillInstruction - Update register kill info by replacing a kill
+/// instruction with a new one.
+void LiveVariables::replaceKillInstruction(unsigned Reg, MachineInstr *OldMI,
+                                           MachineInstr *NewMI) {
+  VarInfo &VI = getVarInfo(Reg);
+  std::replace(VI.Kills.begin(), VI.Kills.end(), OldMI, NewMI);
+}
+
+/// removeVirtualRegistersKilled - Remove all killed info for the specified
+/// instruction.
+void LiveVariables::removeVirtualRegistersKilled(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isKill()) {
+      MO.setIsKill(false);
+      unsigned Reg = MO.getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        bool removed = getVarInfo(Reg).removeKill(MI);
+        assert(removed && "kill not in register's VarInfo?");
+        (void)removed;
+      }
+    }
+  }
+}
+
+/// analyzePHINodes - Gather information about the PHI nodes in here. In
+/// particular, we want to map the variable information of a virtual register
+/// which is used in a PHI node. We map that to the BB the vreg is coming from.
+///
+void LiveVariables::analyzePHINodes(const MachineFunction& Fn) {
+  for (MachineFunction::const_iterator I = Fn.begin(), E = Fn.end();
+       I != E; ++I)
+    for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end();
+         BBI != BBE && BBI->isPHI(); ++BBI)
+      for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+        if (BBI->getOperand(i).readsReg())
+          PHIVarInfo[BBI->getOperand(i + 1).getMBB()->getNumber()]
+            .push_back(BBI->getOperand(i).getReg());
+}
+
+bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB,
+                                      unsigned Reg,
+                                      MachineRegisterInfo &MRI) {
+  unsigned Num = MBB.getNumber();
+
+  // Reg is live-through.
+  if (AliveBlocks.test(Num))
+    return true;
+
+  // Registers defined in MBB cannot be live in.
+  const MachineInstr *Def = MRI.getVRegDef(Reg);
+  if (Def && Def->getParent() == &MBB)
+    return false;
+
+ // Reg was not defined in MBB, was it killed here?
+  return findKill(&MBB);
+}
+
+bool LiveVariables::isLiveOut(unsigned Reg, const MachineBasicBlock &MBB) {
+  LiveVariables::VarInfo &VI = getVarInfo(Reg);
+
+  // Loop over all of the successors of the basic block, checking to see if
+  // the value is either live in the block, or if it is killed in the block.
+  SmallVector<MachineBasicBlock*, 8> OpSuccBlocks;
+  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+         E = MBB.succ_end(); SI != E; ++SI) {
+    MachineBasicBlock *SuccMBB = *SI;
+
+    // Is it alive in this successor?
+    unsigned SuccIdx = SuccMBB->getNumber();
+    if (VI.AliveBlocks.test(SuccIdx))
+      return true;
+    OpSuccBlocks.push_back(SuccMBB);
+  }
+
+  // Check to see if this value is live because there is a use in a successor
+  // that kills it.
+  switch (OpSuccBlocks.size()) {
+  case 1: {
+    MachineBasicBlock *SuccMBB = OpSuccBlocks[0];
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (VI.Kills[i]->getParent() == SuccMBB)
+        return true;
+    break;
+  }
+  case 2: {
+    MachineBasicBlock *SuccMBB1 = OpSuccBlocks[0], *SuccMBB2 = OpSuccBlocks[1];
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (VI.Kills[i]->getParent() == SuccMBB1 ||
+          VI.Kills[i]->getParent() == SuccMBB2)
+        return true;
+    break;
+  }
+  default:
+    std::sort(OpSuccBlocks.begin(), OpSuccBlocks.end());
+    for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+      if (std::binary_search(OpSuccBlocks.begin(), OpSuccBlocks.end(),
+                             VI.Kills[i]->getParent()))
+        return true;
+  }
+  return false;
+}
+
+/// addNewBlock - Add a new basic block BB as an empty succcessor to DomBB. All
+/// variables that are live out of DomBB will be marked as passing live through
+/// BB.
+void LiveVariables::addNewBlock(MachineBasicBlock *BB,
+                                MachineBasicBlock *DomBB,
+                                MachineBasicBlock *SuccBB) {
+  const unsigned NumNew = BB->getNumber();
+
+  SmallSet<unsigned, 16> Defs, Kills;
+
+  MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
+  for (; BBI != BBE && BBI->isPHI(); ++BBI) {
+    // Record the def of the PHI node.
+    Defs.insert(BBI->getOperand(0).getReg());
+
+    // All registers used by PHI nodes in SuccBB must be live through BB.
+    for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+      if (BBI->getOperand(i+1).getMBB() == BB)
+        getVarInfo(BBI->getOperand(i).getReg()).AliveBlocks.set(NumNew);
+  }
+
+  // Record all vreg defs and kills of all instructions in SuccBB.
+  for (; BBI != BBE; ++BBI) {
+    for (MachineInstr::mop_iterator I = BBI->operands_begin(),
+         E = BBI->operands_end(); I != E; ++I) {
+      if (I->isReg() && TargetRegisterInfo::isVirtualRegister(I->getReg())) {
+        if (I->isDef())
+          Defs.insert(I->getReg());
+        else if (I->isKill())
+          Kills.insert(I->getReg());
+      }
+    }
+  }
+
+  // Update info for all live variables
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+    // If the Defs is defined in the successor it can't be live in BB.
+    if (Defs.count(Reg))
+      continue;
+
+    // If the register is either killed in or live through SuccBB it's also live
+    // through BB.
+    VarInfo &VI = getVarInfo(Reg);
+    if (Kills.count(Reg) || VI.AliveBlocks.test(SuccBB->getNumber()))
+      VI.AliveBlocks.set(NumNew);
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
new file mode 100644
index 000000000000..352ef942591f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -0,0 +1,356 @@
+//===- LocalStackSlotAllocation.cpp - Pre-allocate locals to stack slots --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass assigns local frame indices to stack slots relative to one another
+// and allocates additional base registers to access them when the target
+// estimates they are likely to be out of range of stack pointer and frame
+// pointer relative addressing.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "localstackalloc"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+STATISTIC(NumAllocations, "Number of frame indices allocated into local block");
+STATISTIC(NumBaseRegisters, "Number of virtual frame base registers allocated");
+STATISTIC(NumReplacements, "Number of frame indices references replaced");
+
+namespace {
+  class FrameRef {
+    MachineBasicBlock::iterator MI; // Instr referencing the frame
+    int64_t LocalOffset;            // Local offset of the frame idx referenced
+  public:
+    FrameRef(MachineBasicBlock::iterator I, int64_t Offset) :
+      MI(I), LocalOffset(Offset) {}
+    bool operator<(const FrameRef &RHS) const {
+      return LocalOffset < RHS.LocalOffset;
+    }
+    MachineBasicBlock::iterator getMachineInstr() { return MI; }
+  };
+
+  class LocalStackSlotPass: public MachineFunctionPass {
+    SmallVector<int64_t,16> LocalOffsets;
+
+    void AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, int64_t &Offset,
+                           bool StackGrowsDown, unsigned &MaxAlign);
+    void calculateFrameObjectOffsets(MachineFunction &Fn);
+    bool insertFrameReferenceRegisters(MachineFunction &Fn);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit LocalStackSlotPass() : MachineFunctionPass(ID) { }
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+  };
+} // end anonymous namespace
+
+char LocalStackSlotPass::ID = 0;
+char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID;
+INITIALIZE_PASS(LocalStackSlotPass, "localstackalloc",
+                "Local Stack Slot Allocation", false, false)
+
+bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  unsigned LocalObjectCount = MFI->getObjectIndexEnd();
+
+  // If the target doesn't want/need this pass, or if there are no locals
+  // to consider, early exit.
+  if (!TRI->requiresVirtualBaseRegisters(MF) || LocalObjectCount == 0)
+    return true;
+
+  // Make sure we have enough space to store the local offsets.
+  LocalOffsets.resize(MFI->getObjectIndexEnd());
+
+  // Lay out the local blob.
+  calculateFrameObjectOffsets(MF);
+
+  // Insert virtual base registers to resolve frame index references.
+  bool UsedBaseRegs = insertFrameReferenceRegisters(MF);
+
+  // Tell MFI whether any base registers were allocated. PEI will only
+  // want to use the local block allocations from this pass if there were any.
+  // Otherwise, PEI can do a bit better job of getting the alignment right
+  // without a hole at the start since it knows the alignment of the stack
+  // at the start of local allocation, and this pass doesn't.
+  MFI->setUseLocalStackAllocationBlock(UsedBaseRegs);
+
+  return true;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo *MFI,
+                                           int FrameIdx, int64_t &Offset,
+                                           bool StackGrowsDown,
+                                           unsigned &MaxAlign) {
+  // If the stack grows down, add the object size to find the lowest address.
+  if (StackGrowsDown)
+    Offset += MFI->getObjectSize(FrameIdx);
+
+  unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+  // If the alignment of this object is greater than that of the stack, then
+  // increase the stack alignment to match.
+  MaxAlign = std::max(MaxAlign, Align);
+
+  // Adjust to alignment boundary.
+  Offset = (Offset + Align - 1) / Align * Align;
+
+  int64_t LocalOffset = StackGrowsDown ? -Offset : Offset;
+  DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset "
+        << LocalOffset << "\n");
+  // Keep the offset available for base register allocation
+  LocalOffsets[FrameIdx] = LocalOffset;
+  // And tell MFI about it for PEI to use later
+  MFI->mapLocalFrameObject(FrameIdx, LocalOffset);
+
+  if (!StackGrowsDown)
+    Offset += MFI->getObjectSize(FrameIdx);
+
+  ++NumAllocations;
+}
+
+/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
+/// abstract stack objects.
+///
+void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+  // Loop over all of the stack objects, assigning sequential addresses...
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+  bool StackGrowsDown =
+    TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+  int64_t Offset = 0;
+  unsigned MaxAlign = 0;
+
+  // Make sure that the stack protector comes before the local variables on the
+  // stack.
+  SmallSet<int, 16> LargeStackObjs;
+  if (MFI->getStackProtectorIndex() >= 0) {
+    AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), Offset,
+                      StackGrowsDown, MaxAlign);
+
+    // Assign large stack objects first.
+    for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+      if (MFI->isDeadObjectIndex(i))
+        continue;
+      if (MFI->getStackProtectorIndex() == (int)i)
+        continue;
+      if (!MFI->MayNeedStackProtector(i))
+        continue;
+
+      AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
+      LargeStackObjs.insert(i);
+    }
+  }
+
+  // Then assign frame offsets to stack objects that are not used to spill
+  // callee saved registers.
+  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+    if (MFI->isDeadObjectIndex(i))
+      continue;
+    if (MFI->getStackProtectorIndex() == (int)i)
+      continue;
+    if (LargeStackObjs.count(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
+  }
+
+  // Remember how big this blob of stack space is
+  MFI->setLocalFrameSize(Offset);
+  MFI->setLocalFrameMaxAlign(MaxAlign);
+}
+
+static inline bool
+lookupCandidateBaseReg(const SmallVector<std::pair<unsigned, int64_t>, 8> &Regs,
+                       std::pair<unsigned, int64_t> &RegOffset,
+                       int64_t FrameSizeAdjust,
+                       int64_t LocalFrameOffset,
+                       const MachineInstr *MI,
+                       const TargetRegisterInfo *TRI) {
+  unsigned e = Regs.size();
+  for (unsigned i = 0; i < e; ++i) {
+    RegOffset = Regs[i];
+    // Check if the relative offset from the where the base register references
+    // to the target address is in range for the instruction.
+    int64_t Offset = FrameSizeAdjust + LocalFrameOffset - RegOffset.second;
+    if (TRI->isFrameOffsetLegal(MI, Offset))
+      return true;
+  }
+  return false;
+}
+
+bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
+  // Scan the function's instructions looking for frame index references.
+  // For each, ask the target if it wants a virtual base register for it
+  // based on what we can tell it about where the local will end up in the
+  // stack frame. If it wants one, re-use a suitable one we've previously
+  // allocated, or if there isn't one that fits the bill, allocate a new one
+  // and ask the target to create a defining instruction for it.
+  bool UsedBaseReg = false;
+
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+  bool StackGrowsDown =
+    TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  // Collect all of the instructions in the block that reference
+  // a frame index. Also store the frame index referenced to ease later
+  // lookup. (For any insn that has more than one FI reference, we arbitrarily
+  // choose the first one).
+  SmallVector<FrameRef, 64> FrameReferenceInsns;
+
+  // A base register definition is a register + offset pair.
+  SmallVector<std::pair<unsigned, int64_t>, 8> BaseRegisters;
+
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+      MachineInstr *MI = I;
+
+      // Debug value instructions can't be out of range, so they don't need
+      // any updates.
+      if (MI->isDebugValue())
+        continue;
+
+      // For now, allocate the base register(s) within the basic block
+      // where they're used, and don't try to keep them around outside
+      // of that. It may be beneficial to try sharing them more broadly
+      // than that, but the increased register pressure makes that a
+      // tricky thing to balance. Investigate if re-materializing these
+      // becomes an issue.
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        // Consider replacing all frame index operands that reference
+        // an object allocated in the local block.
+        if (MI->getOperand(i).isFI()) {
+          // Don't try this with values not in the local block.
+          if (!MFI->isObjectPreAllocated(MI->getOperand(i).getIndex()))
+            break;
+          FrameReferenceInsns.
+            push_back(FrameRef(MI, LocalOffsets[MI->getOperand(i).getIndex()]));
+          break;
+        }
+      }
+    }
+  }
+
+  // Sort the frame references by local offset
+  array_pod_sort(FrameReferenceInsns.begin(), FrameReferenceInsns.end());
+
+  MachineBasicBlock *Entry = Fn.begin();
+
+  // Loop through the frame references and allocate for them as necessary.
+  for (int ref = 0, e = FrameReferenceInsns.size(); ref < e ; ++ref) {
+    MachineBasicBlock::iterator I =
+      FrameReferenceInsns[ref].getMachineInstr();
+    MachineInstr *MI = I;
+    for (unsigned idx = 0, e = MI->getNumOperands(); idx != e; ++idx) {
+      // Consider replacing all frame index operands that reference
+      // an object allocated in the local block.
+      if (MI->getOperand(idx).isFI()) {
+        int FrameIdx = MI->getOperand(idx).getIndex();
+
+        assert(MFI->isObjectPreAllocated(FrameIdx) &&
+               "Only pre-allocated locals expected!");
+
+        DEBUG(dbgs() << "Considering: " << *MI);
+        if (TRI->needsFrameBaseReg(MI, LocalOffsets[FrameIdx])) {
+          unsigned BaseReg = 0;
+          int64_t Offset = 0;
+          int64_t FrameSizeAdjust =
+            StackGrowsDown ? MFI->getLocalFrameSize() : 0;
+
+          DEBUG(dbgs() << "  Replacing FI in: " << *MI);
+
+          // If we have a suitable base register available, use it; otherwise
+          // create a new one. Note that any offset encoded in the
+          // instruction itself will be taken into account by the target,
+          // so we don't have to adjust for it here when reusing a base
+          // register.
+          std::pair<unsigned, int64_t> RegOffset;
+          if (lookupCandidateBaseReg(BaseRegisters, RegOffset,
+                                     FrameSizeAdjust,
+                                     LocalOffsets[FrameIdx],
+                                     MI, TRI)) {
+            DEBUG(dbgs() << "  Reusing base register " <<
+                  RegOffset.first << "\n");
+            // We found a register to reuse.
+            BaseReg = RegOffset.first;
+            Offset = FrameSizeAdjust + LocalOffsets[FrameIdx] -
+              RegOffset.second;
+          } else {
+            // No previously defined register was in range, so create a
+            // new one.
+            int64_t InstrOffset = TRI->getFrameIndexInstrOffset(MI, idx);
+            const MachineFunction *MF = MI->getParent()->getParent();
+            const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF);
+            BaseReg = Fn.getRegInfo().createVirtualRegister(RC);
+
+            DEBUG(dbgs() << "  Materializing base register " << BaseReg <<
+                  " at frame local offset " <<
+                  LocalOffsets[FrameIdx] + InstrOffset << "\n");
+
+            // Tell the target to insert the instruction to initialize
+            // the base register.
+            //            MachineBasicBlock::iterator InsertionPt = Entry->begin();
+            TRI->materializeFrameBaseRegister(Entry, BaseReg, FrameIdx,
+                                              InstrOffset);
+
+            // The base register already includes any offset specified
+            // by the instruction, so account for that so it doesn't get
+            // applied twice.
+            Offset = -InstrOffset;
+
+            int64_t BaseOffset = FrameSizeAdjust + LocalOffsets[FrameIdx] +
+              InstrOffset;
+            BaseRegisters.push_back(
+              std::pair<unsigned, int64_t>(BaseReg, BaseOffset));
+            ++NumBaseRegisters;
+            UsedBaseReg = true;
+          }
+          assert(BaseReg != 0 && "Unable to allocate virtual base register!");
+
+          // Modify the instruction to use the new base register rather
+          // than the frame index operand.
+          TRI->resolveFrameIndex(I, BaseReg, Offset);
+          DEBUG(dbgs() << "Resolved: " << *MI);
+
+          ++NumReplacements;
+        }
+      }
+    }
+  }
+  return UsedBaseReg;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
new file mode 100644
index 000000000000..898e165feeab
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -0,0 +1,1181 @@
+//===-- llvm/CodeGen/MachineBasicBlock.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect the sequence of machine instructions for a basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+MachineBasicBlock::MachineBasicBlock(MachineFunction &mf, const BasicBlock *bb)
+  : BB(bb), Number(-1), xParent(&mf), Alignment(0), IsLandingPad(false),
+    AddressTaken(false) {
+  Insts.Parent = this;
+}
+
+MachineBasicBlock::~MachineBasicBlock() {
+  LeakDetector::removeGarbageObject(this);
+}
+
+/// getSymbol - Return the MCSymbol for this basic block.
+///
+MCSymbol *MachineBasicBlock::getSymbol() const {
+  const MachineFunction *MF = getParent();
+  MCContext &Ctx = MF->getContext();
+  const char *Prefix = Ctx.getAsmInfo().getPrivateGlobalPrefix();
+  return Ctx.GetOrCreateSymbol(Twine(Prefix) + "BB" +
+                               Twine(MF->getFunctionNumber()) + "_" +
+                               Twine(getNumber()));
+}
+
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineBasicBlock &MBB) {
+  MBB.print(OS);
+  return OS;
+}
+
+/// addNodeToList (MBB) - When an MBB is added to an MF, we need to update the
+/// parent pointer of the MBB, the MBB numbering, and any instructions in the
+/// MBB to be on the right operand list for registers.
+///
+/// MBBs start out as #-1. When a MBB is added to a MachineFunction, it
+/// gets the next available unique MBB number. If it is removed from a
+/// MachineFunction, it goes back to being #-1.
+void ilist_traits<MachineBasicBlock>::addNodeToList(MachineBasicBlock *N) {
+  MachineFunction &MF = *N->getParent();
+  N->Number = MF.addToMBBNumbering(N);
+
+  // Make sure the instructions have their operands in the reginfo lists.
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  for (MachineBasicBlock::instr_iterator
+         I = N->instr_begin(), E = N->instr_end(); I != E; ++I)
+    I->AddRegOperandsToUseLists(RegInfo);
+
+  LeakDetector::removeGarbageObject(N);
+}
+
+void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) {
+  N->getParent()->removeFromMBBNumbering(N->Number);
+  N->Number = -1;
+  LeakDetector::addGarbageObject(N);
+}
+
+
+/// addNodeToList (MI) - When we add an instruction to a basic block
+/// list, we update its parent pointer and add its operands from reg use/def
+/// lists if appropriate.
+void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) {
+  assert(N->getParent() == 0 && "machine instruction already in a basic block");
+  N->setParent(Parent);
+
+  // Add the instruction's register operands to their corresponding
+  // use/def lists.
+  MachineFunction *MF = Parent->getParent();
+  N->AddRegOperandsToUseLists(MF->getRegInfo());
+
+  LeakDetector::removeGarbageObject(N);
+}
+
+/// removeNodeFromList (MI) - When we remove an instruction from a basic block
+/// list, we update its parent pointer and remove its operands from reg use/def
+/// lists if appropriate.
+void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) {
+  assert(N->getParent() != 0 && "machine instruction not in a basic block");
+
+  // Remove from the use/def lists.
+  if (MachineFunction *MF = N->getParent()->getParent())
+    N->RemoveRegOperandsFromUseLists(MF->getRegInfo());
+
+  N->setParent(0);
+
+  LeakDetector::addGarbageObject(N);
+}
+
+/// transferNodesFromList (MI) - When moving a range of instructions from one
+/// MBB list to another, we need to update the parent pointers and the use/def
+/// lists.
+void ilist_traits<MachineInstr>::
+transferNodesFromList(ilist_traits<MachineInstr> &fromList,
+                      ilist_iterator<MachineInstr> first,
+                      ilist_iterator<MachineInstr> last) {
+  assert(Parent->getParent() == fromList.Parent->getParent() &&
+        "MachineInstr parent mismatch!");
+
+  // Splice within the same MBB -> no change.
+  if (Parent == fromList.Parent) return;
+
+  // If splicing between two blocks within the same function, just update the
+  // parent pointers.
+  for (; first != last; ++first)
+    first->setParent(Parent);
+}
+
+void ilist_traits<MachineInstr>::deleteNode(MachineInstr* MI) {
+  assert(!MI->getParent() && "MI is still in a block!");
+  Parent->getParent()->DeleteMachineInstr(MI);
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() {
+  instr_iterator I = instr_begin(), E = instr_end();
+  while (I != E && I->isPHI())
+    ++I;
+  assert((I == E || !I->isInsideBundle()) &&
+         "First non-phi MI cannot be inside a bundle!");
+  return I;
+}
+
+MachineBasicBlock::iterator
+MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
+  iterator E = end();
+  while (I != E && (I->isPHI() || I->isLabel() || I->isDebugValue()))
+    ++I;
+  // FIXME: This needs to change if we wish to bundle labels / dbg_values
+  // inside the bundle.
+  assert((I == E || !I->isInsideBundle()) &&
+         "First non-phi / non-label instruction is inside a bundle!");
+  return I;
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() {
+  iterator B = begin(), E = end(), I = E;
+  while (I != B && ((--I)->isTerminator() || I->isDebugValue()))
+    ; /*noop */
+  while (I != E && !I->isTerminator())
+    ++I;
+  return I;
+}
+
+MachineBasicBlock::const_iterator
+MachineBasicBlock::getFirstTerminator() const {
+  const_iterator B = begin(), E = end(), I = E;
+  while (I != B && ((--I)->isTerminator() || I->isDebugValue()))
+    ; /*noop */
+  while (I != E && !I->isTerminator())
+    ++I;
+  return I;
+}
+
+MachineBasicBlock::instr_iterator MachineBasicBlock::getFirstInstrTerminator() {
+  instr_iterator B = instr_begin(), E = instr_end(), I = E;
+  while (I != B && ((--I)->isTerminator() || I->isDebugValue()))
+    ; /*noop */
+  while (I != E && !I->isTerminator())
+    ++I;
+  return I;
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getLastNonDebugInstr() {
+  // Skip over end-of-block dbg_value instructions.
+  instr_iterator B = instr_begin(), I = instr_end();
+  while (I != B) {
+    --I;
+    // Return instruction that starts a bundle.
+    if (I->isDebugValue() || I->isInsideBundle())
+      continue;
+    return I;
+  }
+  // The block is all debug values.
+  return end();
+}
+
+MachineBasicBlock::const_iterator
+MachineBasicBlock::getLastNonDebugInstr() const {
+  // Skip over end-of-block dbg_value instructions.
+  const_instr_iterator B = instr_begin(), I = instr_end();
+  while (I != B) {
+    --I;
+    // Return instruction that starts a bundle.
+    if (I->isDebugValue() || I->isInsideBundle())
+      continue;
+    return I;
+  }
+  // The block is all debug values.
+  return end();
+}
+
+const MachineBasicBlock *MachineBasicBlock::getLandingPadSuccessor() const {
+  // A block with a landing pad successor only has one other successor.
+  if (succ_size() > 2)
+    return 0;
+  for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I)
+    if ((*I)->isLandingPad())
+      return *I;
+  return 0;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MachineBasicBlock::dump() const {
+  print(dbgs());
+}
+#endif
+
+StringRef MachineBasicBlock::getName() const {
+  if (const BasicBlock *LBB = getBasicBlock())
+    return LBB->getName();
+  else
+    return "(null)";
+}
+
+/// Return a hopefully unique identifier for this block.
+std::string MachineBasicBlock::getFullName() const {
+  std::string Name;
+  if (getParent())
+    Name = (getParent()->getName() + ":").str();
+  if (getBasicBlock())
+    Name += getBasicBlock()->getName();
+  else
+    Name += (Twine("BB") + Twine(getNumber())).str();
+  return Name;
+}
+
+void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const {
+  const MachineFunction *MF = getParent();
+  if (!MF) {
+    OS << "Can't print out MachineBasicBlock because parent MachineFunction"
+       << " is null\n";
+    return;
+  }
+
+  if (Indexes)
+    OS << Indexes->getMBBStartIdx(this) << '\t';
+
+  OS << "BB#" << getNumber() << ": ";
+
+  const char *Comma = "";
+  if (const BasicBlock *LBB = getBasicBlock()) {
+    OS << Comma << "derived from LLVM BB ";
+    WriteAsOperand(OS, LBB, /*PrintType=*/false);
+    Comma = ", ";
+  }
+  if (isLandingPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; }
+  if (hasAddressTaken()) { OS << Comma << "ADDRESS TAKEN"; Comma = ", "; }
+  if (Alignment)
+    OS << Comma << "Align " << Alignment << " (" << (1u << Alignment)
+       << " bytes)";
+
+  OS << '\n';
+
+  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+  if (!livein_empty()) {
+    if (Indexes) OS << '\t';
+    OS << "    Live Ins:";
+    for (livein_iterator I = livein_begin(),E = livein_end(); I != E; ++I)
+      OS << ' ' << PrintReg(*I, TRI);
+    OS << '\n';
+  }
+  // Print the preds of this block according to the CFG.
+  if (!pred_empty()) {
+    if (Indexes) OS << '\t';
+    OS << "    Predecessors according to CFG:";
+    for (const_pred_iterator PI = pred_begin(), E = pred_end(); PI != E; ++PI)
+      OS << " BB#" << (*PI)->getNumber();
+    OS << '\n';
+  }
+
+  for (const_instr_iterator I = instr_begin(); I != instr_end(); ++I) {
+    if (Indexes) {
+      if (Indexes->hasIndex(I))
+        OS << Indexes->getInstructionIndex(I);
+      OS << '\t';
+    }
+    OS << '\t';
+    if (I->isInsideBundle())
+      OS << "  * ";
+    I->print(OS, &getParent()->getTarget());
+  }
+
+  // Print the successors of this block according to the CFG.
+  if (!succ_empty()) {
+    if (Indexes) OS << '\t';
+    OS << "    Successors according to CFG:";
+    for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) {
+      OS << " BB#" << (*SI)->getNumber();
+      if (!Weights.empty())
+        OS << '(' << *getWeightIterator(SI) << ')';
+    }
+    OS << '\n';
+  }
+}
+
+void MachineBasicBlock::removeLiveIn(unsigned Reg) {
+  std::vector<unsigned>::iterator I =
+    std::find(LiveIns.begin(), LiveIns.end(), Reg);
+  if (I != LiveIns.end())
+    LiveIns.erase(I);
+}
+
+bool MachineBasicBlock::isLiveIn(unsigned Reg) const {
+  livein_iterator I = std::find(livein_begin(), livein_end(), Reg);
+  return I != livein_end();
+}
+
+void MachineBasicBlock::moveBefore(MachineBasicBlock *NewAfter) {
+  getParent()->splice(NewAfter, this);
+}
+
+void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) {
+  MachineFunction::iterator BBI = NewBefore;
+  getParent()->splice(++BBI, this);
+}
+
+void MachineBasicBlock::updateTerminator() {
+  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
+  // A block with no successors has no concerns with fall-through edges.
+  if (this->succ_empty()) return;
+
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  DebugLoc dl;  // FIXME: this is nowhere
+  bool B = TII->AnalyzeBranch(*this, TBB, FBB, Cond);
+  (void) B;
+  assert(!B && "UpdateTerminators requires analyzable predecessors!");
+  if (Cond.empty()) {
+    if (TBB) {
+      // The block has an unconditional branch. If its successor is now
+      // its layout successor, delete the branch.
+      if (isLayoutSuccessor(TBB))
+        TII->RemoveBranch(*this);
+    } else {
+      // The block has an unconditional fallthrough. If its successor is not
+      // its layout successor, insert a branch. First we have to locate the
+      // only non-landing-pad successor, as that is the fallthrough block.
+      for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
+        if ((*SI)->isLandingPad())
+          continue;
+        assert(!TBB && "Found more than one non-landing-pad successor!");
+        TBB = *SI;
+      }
+
+      // If there is no non-landing-pad successor, the block has no
+      // fall-through edges to be concerned with.
+      if (!TBB)
+        return;
+
+      // Finally update the unconditional successor to be reached via a branch
+      // if it would not be reached by fallthrough.
+      if (!isLayoutSuccessor(TBB))
+        TII->InsertBranch(*this, TBB, 0, Cond, dl);
+    }
+  } else {
+    if (FBB) {
+      // The block has a non-fallthrough conditional branch. If one of its
+      // successors is its layout successor, rewrite it to a fallthrough
+      // conditional branch.
+      if (isLayoutSuccessor(TBB)) {
+        if (TII->ReverseBranchCondition(Cond))
+          return;
+        TII->RemoveBranch(*this);
+        TII->InsertBranch(*this, FBB, 0, Cond, dl);
+      } else if (isLayoutSuccessor(FBB)) {
+        TII->RemoveBranch(*this);
+        TII->InsertBranch(*this, TBB, 0, Cond, dl);
+      }
+    } else {
+      // Walk through the successors and find the successor which is not
+      // a landing pad and is not the conditional branch destination (in TBB)
+      // as the fallthrough successor.
+      MachineBasicBlock *FallthroughBB = 0;
+      for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
+        if ((*SI)->isLandingPad() || *SI == TBB)
+          continue;
+        assert(!FallthroughBB && "Found more than one fallthrough successor.");
+        FallthroughBB = *SI;
+      }
+      if (!FallthroughBB && canFallThrough()) {
+        // We fallthrough to the same basic block as the conditional jump
+        // targets. Remove the conditional jump, leaving unconditional
+        // fallthrough.
+        // FIXME: This does not seem like a reasonable pattern to support, but it
+        // has been seen in the wild coming out of degenerate ARM test cases.
+        TII->RemoveBranch(*this);
+
+        // Finally update the unconditional successor to be reached via a branch
+        // if it would not be reached by fallthrough.
+        if (!isLayoutSuccessor(TBB))
+          TII->InsertBranch(*this, TBB, 0, Cond, dl);
+        return;
+      }
+
+      // The block has a fallthrough conditional branch.
+      if (isLayoutSuccessor(TBB)) {
+        if (TII->ReverseBranchCondition(Cond)) {
+          // We can't reverse the condition, add an unconditional branch.
+          Cond.clear();
+          TII->InsertBranch(*this, FallthroughBB, 0, Cond, dl);
+          return;
+        }
+        TII->RemoveBranch(*this);
+        TII->InsertBranch(*this, FallthroughBB, 0, Cond, dl);
+      } else if (!isLayoutSuccessor(FallthroughBB)) {
+        TII->RemoveBranch(*this);
+        TII->InsertBranch(*this, TBB, FallthroughBB, Cond, dl);
+      }
+    }
+  }
+}
+
+void MachineBasicBlock::addSuccessor(MachineBasicBlock *succ, uint32_t weight) {
+
+  // If we see non-zero value for the first time it means we actually use Weight
+  // list, so we fill all Weights with 0's.
+  if (weight != 0 && Weights.empty())
+    Weights.resize(Successors.size());
+
+  if (weight != 0 || !Weights.empty())
+    Weights.push_back(weight);
+
+   Successors.push_back(succ);
+   succ->addPredecessor(this);
+ }
+
+void MachineBasicBlock::removeSuccessor(MachineBasicBlock *succ) {
+  succ->removePredecessor(this);
+  succ_iterator I = std::find(Successors.begin(), Successors.end(), succ);
+  assert(I != Successors.end() && "Not a current successor!");
+
+  // If Weight list is empty it means we don't use it (disabled optimization).
+  if (!Weights.empty()) {
+    weight_iterator WI = getWeightIterator(I);
+    Weights.erase(WI);
+  }
+
+  Successors.erase(I);
+}
+
+MachineBasicBlock::succ_iterator
+MachineBasicBlock::removeSuccessor(succ_iterator I) {
+  assert(I != Successors.end() && "Not a current successor!");
+
+  // If Weight list is empty it means we don't use it (disabled optimization).
+  if (!Weights.empty()) {
+    weight_iterator WI = getWeightIterator(I);
+    Weights.erase(WI);
+  }
+
+  (*I)->removePredecessor(this);
+  return Successors.erase(I);
+}
+
+void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
+                                         MachineBasicBlock *New) {
+  if (Old == New)
+    return;
+
+  succ_iterator E = succ_end();
+  succ_iterator NewI = E;
+  succ_iterator OldI = E;
+  for (succ_iterator I = succ_begin(); I != E; ++I) {
+    if (*I == Old) {
+      OldI = I;
+      if (NewI != E)
+        break;
+    }
+    if (*I == New) {
+      NewI = I;
+      if (OldI != E)
+        break;
+    }
+  }
+  assert(OldI != E && "Old is not a successor of this block");
+  Old->removePredecessor(this);
+
+  // If New isn't already a successor, let it take Old's place.
+  if (NewI == E) {
+    New->addPredecessor(this);
+    *OldI = New;
+    return;
+  }
+
+  // New is already a successor.
+  // Update its weight instead of adding a duplicate edge.
+  if (!Weights.empty()) {
+    weight_iterator OldWI = getWeightIterator(OldI);
+    *getWeightIterator(NewI) += *OldWI;
+    Weights.erase(OldWI);
+  }
+  Successors.erase(OldI);
+}
+
+void MachineBasicBlock::addPredecessor(MachineBasicBlock *pred) {
+  Predecessors.push_back(pred);
+}
+
+void MachineBasicBlock::removePredecessor(MachineBasicBlock *pred) {
+  pred_iterator I = std::find(Predecessors.begin(), Predecessors.end(), pred);
+  assert(I != Predecessors.end() && "Pred is not a predecessor of this block!");
+  Predecessors.erase(I);
+}
+
+void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) {
+  if (this == fromMBB)
+    return;
+
+  while (!fromMBB->succ_empty()) {
+    MachineBasicBlock *Succ = *fromMBB->succ_begin();
+    uint32_t Weight = 0;
+
+    // If Weight list is empty it means we don't use it (disabled optimization).
+    if (!fromMBB->Weights.empty())
+      Weight = *fromMBB->Weights.begin();
+
+    addSuccessor(Succ, Weight);
+    fromMBB->removeSuccessor(Succ);
+  }
+}
+
+void
+MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) {
+  if (this == fromMBB)
+    return;
+
+  while (!fromMBB->succ_empty()) {
+    MachineBasicBlock *Succ = *fromMBB->succ_begin();
+    uint32_t Weight = 0;
+    if (!fromMBB->Weights.empty())
+      Weight = *fromMBB->Weights.begin();
+    addSuccessor(Succ, Weight);
+    fromMBB->removeSuccessor(Succ);
+
+    // Fix up any PHI nodes in the successor.
+    for (MachineBasicBlock::instr_iterator MI = Succ->instr_begin(),
+           ME = Succ->instr_end(); MI != ME && MI->isPHI(); ++MI)
+      for (unsigned i = 2, e = MI->getNumOperands()+1; i != e; i += 2) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (MO.getMBB() == fromMBB)
+          MO.setMBB(this);
+      }
+  }
+}
+
+bool MachineBasicBlock::isPredecessor(const MachineBasicBlock *MBB) const {
+  return std::find(pred_begin(), pred_end(), MBB) != pred_end();
+}
+
+bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const {
+  return std::find(succ_begin(), succ_end(), MBB) != succ_end();
+}
+
+bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
+  MachineFunction::const_iterator I(this);
+  return llvm::next(I) == MachineFunction::const_iterator(MBB);
+}
+
+bool MachineBasicBlock::canFallThrough() {
+  MachineFunction::iterator Fallthrough = this;
+  ++Fallthrough;
+  // If FallthroughBlock is off the end of the function, it can't fall through.
+  if (Fallthrough == getParent()->end())
+    return false;
+
+  // If FallthroughBlock isn't a successor, no fallthrough is possible.
+  if (!isSuccessor(Fallthrough))
+    return false;
+
+  // Analyze the branches, if any, at the end of the block.
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  const TargetInstrInfo *TII = getParent()->getTarget().getInstrInfo();
+  if (TII->AnalyzeBranch(*this, TBB, FBB, Cond)) {
+    // If we couldn't analyze the branch, examine the last instruction.
+    // If the block doesn't end in a known control barrier, assume fallthrough
+    // is possible. The isPredicated check is needed because this code can be
+    // called during IfConversion, where an instruction which is normally a
+    // Barrier is predicated and thus no longer an actual control barrier.
+    return empty() || !back().isBarrier() || TII->isPredicated(&back());
+  }
+
+  // If there is no branch, control always falls through.
+  if (TBB == 0) return true;
+
+  // If there is some explicit branch to the fallthrough block, it can obviously
+  // reach, even though the branch should get folded to fall through implicitly.
+  if (MachineFunction::iterator(TBB) == Fallthrough ||
+      MachineFunction::iterator(FBB) == Fallthrough)
+    return true;
+
+  // If it's an unconditional branch to some block not the fall through, it
+  // doesn't fall through.
+  if (Cond.empty()) return false;
+
+  // Otherwise, if it is conditional and has no explicit false block, it falls
+  // through.
+  return FBB == 0;
+}
+
+MachineBasicBlock *
+MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
+  // Splitting the critical edge to a landing pad block is non-trivial. Don't do
+  // it in this generic function.
+  if (Succ->isLandingPad())
+    return NULL;
+
+  MachineFunction *MF = getParent();
+  DebugLoc dl;  // FIXME: this is nowhere
+
+  // We may need to update this's terminator, but we can't do that if
+  // AnalyzeBranch fails. If this uses a jump table, we won't touch it.
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->AnalyzeBranch(*this, TBB, FBB, Cond))
+    return NULL;
+
+  // Avoid bugpoint weirdness: A block may end with a conditional branch but
+  // jumps to the same MBB is either case. We have duplicate CFG edges in that
+  // case that we can't handle. Since this never happens in properly optimized
+  // code, just skip those edges.
+  if (TBB && TBB == FBB) {
+    DEBUG(dbgs() << "Won't split critical edge after degenerate BB#"
+                 << getNumber() << '\n');
+    return NULL;
+  }
+
+  MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
+  MF->insert(llvm::next(MachineFunction::iterator(this)), NMBB);
+  DEBUG(dbgs() << "Splitting critical edge:"
+        " BB#" << getNumber()
+        << " -- BB#" << NMBB->getNumber()
+        << " -- BB#" << Succ->getNumber() << '\n');
+
+  LiveIntervals *LIS = P->getAnalysisIfAvailable<LiveIntervals>();
+  SlotIndexes *Indexes = P->getAnalysisIfAvailable<SlotIndexes>();
+  if (LIS)
+    LIS->insertMBBInMaps(NMBB);
+  else if (Indexes)
+    Indexes->insertMBBInMaps(NMBB);
+
+  // On some targets like Mips, branches may kill virtual registers. Make sure
+  // that LiveVariables is properly updated after updateTerminator replaces the
+  // terminators.
+  LiveVariables *LV = P->getAnalysisIfAvailable<LiveVariables>();
+
+  // Collect a list of virtual registers killed by the terminators.
+  SmallVector<unsigned, 4> KilledRegs;
+  if (LV)
+    for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+         I != E; ++I) {
+      MachineInstr *MI = I;
+      for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+           OE = MI->operands_end(); OI != OE; ++OI) {
+        if (!OI->isReg() || OI->getReg() == 0 ||
+            !OI->isUse() || !OI->isKill() || OI->isUndef())
+          continue;
+        unsigned Reg = OI->getReg();
+        if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+            LV->getVarInfo(Reg).removeKill(MI)) {
+          KilledRegs.push_back(Reg);
+          DEBUG(dbgs() << "Removing terminator kill: " << *MI);
+          OI->setIsKill(false);
+        }
+      }
+    }
+
+  SmallVector<unsigned, 4> UsedRegs;
+  if (LIS) {
+    for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+         I != E; ++I) {
+      MachineInstr *MI = I;
+
+      for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+           OE = MI->operands_end(); OI != OE; ++OI) {
+        if (!OI->isReg() || OI->getReg() == 0)
+          continue;
+
+        unsigned Reg = OI->getReg();
+        if (std::find(UsedRegs.begin(), UsedRegs.end(), Reg) == UsedRegs.end())
+          UsedRegs.push_back(Reg);
+      }
+    }
+  }
+
+  ReplaceUsesOfBlockWith(Succ, NMBB);
+
+  // If updateTerminator() removes instructions, we need to remove them from
+  // SlotIndexes.
+  SmallVector<MachineInstr*, 4> Terminators;
+  if (Indexes) {
+    for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+         I != E; ++I)
+      Terminators.push_back(I);
+  }
+
+  updateTerminator();
+
+  if (Indexes) {
+    SmallVector<MachineInstr*, 4> NewTerminators;
+    for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+         I != E; ++I)
+      NewTerminators.push_back(I);
+
+    for (SmallVectorImpl<MachineInstr*>::iterator I = Terminators.begin(),
+        E = Terminators.end(); I != E; ++I) {
+      if (std::find(NewTerminators.begin(), NewTerminators.end(), *I) ==
+          NewTerminators.end())
+       Indexes->removeMachineInstrFromMaps(*I);
+    }
+  }
+
+  // Insert unconditional "jump Succ" instruction in NMBB if necessary.
+  NMBB->addSuccessor(Succ);
+  if (!NMBB->isLayoutSuccessor(Succ)) {
+    Cond.clear();
+    MF->getTarget().getInstrInfo()->InsertBranch(*NMBB, Succ, NULL, Cond, dl);
+
+    if (Indexes) {
+      for (instr_iterator I = NMBB->instr_begin(), E = NMBB->instr_end();
+           I != E; ++I) {
+        // Some instructions may have been moved to NMBB by updateTerminator(),
+        // so we first remove any instruction that already has an index.
+        if (Indexes->hasIndex(I))
+          Indexes->removeMachineInstrFromMaps(I);
+        Indexes->insertMachineInstrInMaps(I);
+      }
+    }
+  }
+
+  // Fix PHI nodes in Succ so they refer to NMBB instead of this
+  for (MachineBasicBlock::instr_iterator
+         i = Succ->instr_begin(),e = Succ->instr_end();
+       i != e && i->isPHI(); ++i)
+    for (unsigned ni = 1, ne = i->getNumOperands(); ni != ne; ni += 2)
+      if (i->getOperand(ni+1).getMBB() == this)
+        i->getOperand(ni+1).setMBB(NMBB);
+
+  // Inherit live-ins from the successor
+  for (MachineBasicBlock::livein_iterator I = Succ->livein_begin(),
+         E = Succ->livein_end(); I != E; ++I)
+    NMBB->addLiveIn(*I);
+
+  // Update LiveVariables.
+  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+  if (LV) {
+    // Restore kills of virtual registers that were killed by the terminators.
+    while (!KilledRegs.empty()) {
+      unsigned Reg = KilledRegs.pop_back_val();
+      for (instr_iterator I = instr_end(), E = instr_begin(); I != E;) {
+        if (!(--I)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false))
+          continue;
+        if (TargetRegisterInfo::isVirtualRegister(Reg))
+          LV->getVarInfo(Reg).Kills.push_back(I);
+        DEBUG(dbgs() << "Restored terminator kill: " << *I);
+        break;
+      }
+    }
+    // Update relevant live-through information.
+    LV->addNewBlock(NMBB, this, Succ);
+  }
+
+  if (LIS) {
+    // After splitting the edge and updating SlotIndexes, live intervals may be
+    // in one of two situations, depending on whether this block was the last in
+    // the function. If the original block was the last in the function, all live
+    // intervals will end prior to the beginning of the new split block. If the
+    // original block was not at the end of the function, all live intervals will
+    // extend to the end of the new split block.
+
+    bool isLastMBB =
+      llvm::next(MachineFunction::iterator(NMBB)) == getParent()->end();
+
+    SlotIndex StartIndex = Indexes->getMBBEndIdx(this);
+    SlotIndex PrevIndex = StartIndex.getPrevSlot();
+    SlotIndex EndIndex = Indexes->getMBBEndIdx(NMBB);
+
+    // Find the registers used from NMBB in PHIs in Succ.
+    SmallSet<unsigned, 8> PHISrcRegs;
+    for (MachineBasicBlock::instr_iterator
+         I = Succ->instr_begin(), E = Succ->instr_end();
+         I != E && I->isPHI(); ++I) {
+      for (unsigned ni = 1, ne = I->getNumOperands(); ni != ne; ni += 2) {
+        if (I->getOperand(ni+1).getMBB() == NMBB) {
+          MachineOperand &MO = I->getOperand(ni);
+          unsigned Reg = MO.getReg();
+          PHISrcRegs.insert(Reg);
+          if (MO.isUndef())
+            continue;
+
+          LiveInterval &LI = LIS->getInterval(Reg);
+          VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
+          assert(VNI && "PHI sources should be live out of their predecessors.");
+          LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+        }
+      }
+    }
+
+    MachineRegisterInfo *MRI = &getParent()->getRegInfo();
+    for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+      unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+      if (PHISrcRegs.count(Reg) || !LIS->hasInterval(Reg))
+        continue;
+
+      LiveInterval &LI = LIS->getInterval(Reg);
+      if (!LI.liveAt(PrevIndex))
+        continue;
+
+      bool isLiveOut = LI.liveAt(LIS->getMBBStartIdx(Succ));
+      if (isLiveOut && isLastMBB) {
+        VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
+        assert(VNI && "LiveInterval should have VNInfo where it is live.");
+        LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+      } else if (!isLiveOut && !isLastMBB) {
+        LI.removeRange(StartIndex, EndIndex);
+      }
+    }
+
+    // Update all intervals for registers whose uses may have been modified by
+    // updateTerminator().
+    LIS->repairIntervalsInRange(this, getFirstTerminator(), end(), UsedRegs);
+  }
+
+  if (MachineDominatorTree *MDT =
+      P->getAnalysisIfAvailable<MachineDominatorTree>()) {
+    // Update dominator information.
+    MachineDomTreeNode *SucccDTNode = MDT->getNode(Succ);
+
+    bool IsNewIDom = true;
+    for (const_pred_iterator PI = Succ->pred_begin(), E = Succ->pred_end();
+         PI != E; ++PI) {
+      MachineBasicBlock *PredBB = *PI;
+      if (PredBB == NMBB)
+        continue;
+      if (!MDT->dominates(SucccDTNode, MDT->getNode(PredBB))) {
+        IsNewIDom = false;
+        break;
+      }
+    }
+
+    // We know "this" dominates the newly created basic block.
+    MachineDomTreeNode *NewDTNode = MDT->addNewBlock(NMBB, this);
+
+    // If all the other predecessors of "Succ" are dominated by "Succ" itself
+    // then the new block is the new immediate dominator of "Succ". Otherwise,
+    // the new block doesn't dominate anything.
+    if (IsNewIDom)
+      MDT->changeImmediateDominator(SucccDTNode, NewDTNode);
+  }
+
+  if (MachineLoopInfo *MLI = P->getAnalysisIfAvailable<MachineLoopInfo>())
+    if (MachineLoop *TIL = MLI->getLoopFor(this)) {
+      // If one or the other blocks were not in a loop, the new block is not
+      // either, and thus LI doesn't need to be updated.
+      if (MachineLoop *DestLoop = MLI->getLoopFor(Succ)) {
+        if (TIL == DestLoop) {
+          // Both in the same loop, the NMBB joins loop.
+          DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+        } else if (TIL->contains(DestLoop)) {
+          // Edge from an outer loop to an inner loop.  Add to the outer loop.
+          TIL->addBasicBlockToLoop(NMBB, MLI->getBase());
+        } else if (DestLoop->contains(TIL)) {
+          // Edge from an inner loop to an outer loop.  Add to the outer loop.
+          DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+        } else {
+          // Edge from two loops with no containment relation.  Because these
+          // are natural loops, we know that the destination block must be the
+          // header of its loop (adding a branch into a loop elsewhere would
+          // create an irreducible loop).
+          assert(DestLoop->getHeader() == Succ &&
+                 "Should not create irreducible loops!");
+          if (MachineLoop *P = DestLoop->getParentLoop())
+            P->addBasicBlockToLoop(NMBB, MLI->getBase());
+        }
+      }
+    }
+
+  return NMBB;
+}
+
+/// Prepare MI to be removed from its bundle. This fixes bundle flags on MI's
+/// neighboring instructions so the bundle won't be broken by removing MI.
+static void unbundleSingleMI(MachineInstr *MI) {
+  // Removing the first instruction in a bundle.
+  if (MI->isBundledWithSucc() && !MI->isBundledWithPred())
+    MI->unbundleFromSucc();
+  // Removing the last instruction in a bundle.
+  if (MI->isBundledWithPred() && !MI->isBundledWithSucc())
+    MI->unbundleFromPred();
+  // If MI is not bundled, or if it is internal to a bundle, the neighbor flags
+  // are already fine.
+}
+
+MachineBasicBlock::instr_iterator
+MachineBasicBlock::erase(MachineBasicBlock::instr_iterator I) {
+  unbundleSingleMI(I);
+  return Insts.erase(I);
+}
+
+MachineInstr *MachineBasicBlock::remove_instr(MachineInstr *MI) {
+  unbundleSingleMI(MI);
+  MI->clearFlag(MachineInstr::BundledPred);
+  MI->clearFlag(MachineInstr::BundledSucc);
+  return Insts.remove(MI);
+}
+
+MachineBasicBlock::instr_iterator
+MachineBasicBlock::insert(instr_iterator I, MachineInstr *MI) {
+  assert(!MI->isBundledWithPred() && !MI->isBundledWithSucc() &&
+         "Cannot insert instruction with bundle flags");
+  // Set the bundle flags when inserting inside a bundle.
+  if (I != instr_end() && I->isBundledWithPred()) {
+    MI->setFlag(MachineInstr::BundledPred);
+    MI->setFlag(MachineInstr::BundledSucc);
+  }
+  return Insts.insert(I, MI);
+}
+
+/// removeFromParent - This method unlinks 'this' from the containing function,
+/// and returns it, but does not delete it.
+MachineBasicBlock *MachineBasicBlock::removeFromParent() {
+  assert(getParent() && "Not embedded in a function!");
+  getParent()->remove(this);
+  return this;
+}
+
+
+/// eraseFromParent - This method unlinks 'this' from the containing function,
+/// and deletes it.
+void MachineBasicBlock::eraseFromParent() {
+  assert(getParent() && "Not embedded in a function!");
+  getParent()->erase(this);
+}
+
+
+/// ReplaceUsesOfBlockWith - Given a machine basic block that branched to
+/// 'Old', change the code and CFG so that it branches to 'New' instead.
+void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old,
+                                               MachineBasicBlock *New) {
+  assert(Old != New && "Cannot replace self with self!");
+
+  MachineBasicBlock::instr_iterator I = instr_end();
+  while (I != instr_begin()) {
+    --I;
+    if (!I->isTerminator()) break;
+
+    // Scan the operands of this machine instruction, replacing any uses of Old
+    // with New.
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+      if (I->getOperand(i).isMBB() &&
+          I->getOperand(i).getMBB() == Old)
+        I->getOperand(i).setMBB(New);
+  }
+
+  // Update the successor information.
+  replaceSuccessor(Old, New);
+}
+
+/// CorrectExtraCFGEdges - Various pieces of code can cause excess edges in the
+/// CFG to be inserted.  If we have proven that MBB can only branch to DestA and
+/// DestB, remove any other MBB successors from the CFG.  DestA and DestB can be
+/// null.
+///
+/// Besides DestA and DestB, retain other edges leading to LandingPads
+/// (currently there can be only one; we don't check or require that here).
+/// Note it is possible that DestA and/or DestB are LandingPads.
+bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA,
+                                             MachineBasicBlock *DestB,
+                                             bool isCond) {
+  // The values of DestA and DestB frequently come from a call to the
+  // 'TargetInstrInfo::AnalyzeBranch' method. We take our meaning of the initial
+  // values from there.
+  //
+  // 1. If both DestA and DestB are null, then the block ends with no branches
+  //    (it falls through to its successor).
+  // 2. If DestA is set, DestB is null, and isCond is false, then the block ends
+  //    with only an unconditional branch.
+  // 3. If DestA is set, DestB is null, and isCond is true, then the block ends
+  //    with a conditional branch that falls through to a successor (DestB).
+  // 4. If DestA and DestB is set and isCond is true, then the block ends with a
+  //    conditional branch followed by an unconditional branch. DestA is the
+  //    'true' destination and DestB is the 'false' destination.
+
+  bool Changed = false;
+
+  MachineFunction::iterator FallThru =
+    llvm::next(MachineFunction::iterator(this));
+
+  if (DestA == 0 && DestB == 0) {
+    // Block falls through to successor.
+    DestA = FallThru;
+    DestB = FallThru;
+  } else if (DestA != 0 && DestB == 0) {
+    if (isCond)
+      // Block ends in conditional jump that falls through to successor.
+      DestB = FallThru;
+  } else {
+    assert(DestA && DestB && isCond &&
+           "CFG in a bad state. Cannot correct CFG edges");
+  }
+
+  // Remove superfluous edges. I.e., those which aren't destinations of this
+  // basic block, duplicate edges, or landing pads.
+  SmallPtrSet<const MachineBasicBlock*, 8> SeenMBBs;
+  MachineBasicBlock::succ_iterator SI = succ_begin();
+  while (SI != succ_end()) {
+    const MachineBasicBlock *MBB = *SI;
+    if (!SeenMBBs.insert(MBB) ||
+        (MBB != DestA && MBB != DestB && !MBB->isLandingPad())) {
+      // This is a superfluous edge, remove it.
+      SI = removeSuccessor(SI);
+      Changed = true;
+    } else {
+      ++SI;
+    }
+  }
+
+  return Changed;
+}
+
+/// findDebugLoc - find the next valid DebugLoc starting at MBBI, skipping
+/// any DBG_VALUE instructions.  Return UnknownLoc if there is none.
+DebugLoc
+MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
+  DebugLoc DL;
+  instr_iterator E = instr_end();
+  if (MBBI == E)
+    return DL;
+
+  // Skip debug declarations, we don't want a DebugLoc from them.
+  while (MBBI != E && MBBI->isDebugValue())
+    MBBI++;
+  if (MBBI != E)
+    DL = MBBI->getDebugLoc();
+  return DL;
+}
+
+/// getSuccWeight - Return weight of the edge from this block to MBB.
+///
+uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const {
+  if (Weights.empty())
+    return 0;
+
+  return *getWeightIterator(Succ);
+}
+
+/// getWeightIterator - Return wight iterator corresonding to the I successor
+/// iterator
+MachineBasicBlock::weight_iterator MachineBasicBlock::
+getWeightIterator(MachineBasicBlock::succ_iterator I) {
+  assert(Weights.size() == Successors.size() && "Async weight list!");
+  size_t index = std::distance(Successors.begin(), I);
+  assert(index < Weights.size() && "Not a current successor!");
+  return Weights.begin() + index;
+}
+
+/// getWeightIterator - Return wight iterator corresonding to the I successor
+/// iterator
+MachineBasicBlock::const_weight_iterator MachineBasicBlock::
+getWeightIterator(MachineBasicBlock::const_succ_iterator I) const {
+  assert(Weights.size() == Successors.size() && "Async weight list!");
+  const size_t index = std::distance(Successors.begin(), I);
+  assert(index < Weights.size() && "Not a current successor!");
+  return Weights.begin() + index;
+}
+
+/// Return whether (physical) register "Reg" has been <def>ined and not <kill>ed
+/// as of just before "MI".
+/// 
+/// Search is localised to a neighborhood of
+/// Neighborhood instructions before (searching for defs or kills) and N
+/// instructions after (searching just for defs) MI.
+MachineBasicBlock::LivenessQueryResult
+MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
+                                           unsigned Reg, MachineInstr *MI,
+                                           unsigned Neighborhood) {
+  unsigned N = Neighborhood;
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // Start by searching backwards from MI, looking for kills, reads or defs.
+
+  MachineBasicBlock::iterator I(MI);
+  // If this is the first insn in the block, don't search backwards.
+  if (I != MBB->begin()) {
+    do {
+      --I;
+
+      MachineOperandIteratorBase::PhysRegInfo Analysis =
+        MIOperands(I).analyzePhysReg(Reg, TRI);
+
+      if (Analysis.Defines)
+        // Outputs happen after inputs so they take precedence if both are
+        // present.
+        return Analysis.DefinesDead ? LQR_Dead : LQR_Live;
+
+      if (Analysis.Kills || Analysis.Clobbers)
+        // Register killed, so isn't live.
+        return LQR_Dead;
+
+      else if (Analysis.ReadsOverlap)
+        // Defined or read without a previous kill - live.
+        return Analysis.Reads ? LQR_Live : LQR_OverlappingLive;
+
+    } while (I != MBB->begin() && --N > 0);
+  }
+
+  // Did we get to the start of the block?
+  if (I == MBB->begin()) {
+    // If so, the register's state is definitely defined by the live-in state.
+    for (MCRegAliasIterator RAI(Reg, TRI, /*IncludeSelf=*/true);
+         RAI.isValid(); ++RAI) {
+      if (MBB->isLiveIn(*RAI))
+        return (*RAI == Reg) ? LQR_Live : LQR_OverlappingLive;
+    }
+
+    return LQR_Dead;
+  }
+
+  N = Neighborhood;
+
+  // Try searching forwards from MI, looking for reads or defs.
+  I = MachineBasicBlock::iterator(MI);
+  // If this is the last insn in the block, don't search forwards.
+  if (I != MBB->end()) {
+    for (++I; I != MBB->end() && N > 0; ++I, --N) {
+      MachineOperandIteratorBase::PhysRegInfo Analysis =
+        MIOperands(I).analyzePhysReg(Reg, TRI);
+
+      if (Analysis.ReadsOverlap)
+        // Used, therefore must have been live.
+        return (Analysis.Reads) ?
+          LQR_Live : LQR_OverlappingLive;
+
+      else if (Analysis.Clobbers || Analysis.Defines)
+        // Defined (but not read) therefore cannot have been live.
+        return LQR_Dead;
+    }
+  }
+
+  // At this point we have no idea of the liveness of the register.
+  return LQR_Unknown;
+}
+
+void llvm::WriteAsOperand(raw_ostream &OS, const MachineBasicBlock *MBB,
+                          bool t) {
+  OS << "BB#" << MBB->getNumber();
+}
+
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
new file mode 100644
index 000000000000..070daf2e2ba2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -0,0 +1,61 @@
+//====----- MachineBlockFrequencyInfo.cpp - Machine Block Frequency Analysis ----====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loops should be simplified before this analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyImpl.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
+                      "Machine Block Frequency Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq",
+                    "Machine Block Frequency Analysis", true, true)
+
+char MachineBlockFrequencyInfo::ID = 0;
+
+
+MachineBlockFrequencyInfo::MachineBlockFrequencyInfo() : MachineFunctionPass(ID) {
+  initializeMachineBlockFrequencyInfoPass(*PassRegistry::getPassRegistry());
+  MBFI = new BlockFrequencyImpl<MachineBasicBlock, MachineFunction,
+                                MachineBranchProbabilityInfo>();
+}
+
+MachineBlockFrequencyInfo::~MachineBlockFrequencyInfo() {
+  delete MBFI;
+}
+
+void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineBranchProbabilityInfo>();
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
+  MachineBranchProbabilityInfo &MBPI = getAnalysis<MachineBranchProbabilityInfo>();
+  MBFI->doFunction(&F, &MBPI);
+  return false;
+}
+
+/// getblockFreq - Return block frequency. Return 0 if we don't have the
+/// information. Please note that initial frequency is equal to 1024. It means
+/// that we should not rely on the value itself, but only on the comparison to
+/// the other block frequencies. We do this to avoid using of floating points.
+///
+BlockFrequency MachineBlockFrequencyInfo::
+getBlockFreq(const MachineBasicBlock *MBB) const {
+  return MBFI->getBlockFreq(MBB);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
new file mode 100644
index 000000000000..cd948e24a6b2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -0,0 +1,1165 @@
+//===-- MachineBlockPlacement.cpp - Basic Block Code Layout optimization --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements basic block placement transformations using the CFG
+// structure and branch probability estimates.
+//
+// The pass strives to preserve the structure of the CFG (that is, retain
+// a topological ordering of basic blocks) in the absence of a *strong* signal
+// to the contrary from probabilities. However, within the CFG structure, it
+// attempts to choose an ordering which favors placing more likely sequences of
+// blocks adjacent to each other.
+//
+// The algorithm works from the inner-most loop within a function outward, and
+// at each stage walks through the basic blocks, trying to coalesce them into
+// sequential chains where allowed by the CFG (or demanded by heavy
+// probabilities). Finally, it walks the blocks in topological order, and the
+// first time it reaches a chain of basic blocks, it schedules them in the
+// function in-order.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "block-placement2"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumCondBranches, "Number of conditional branches");
+STATISTIC(NumUncondBranches, "Number of uncondittional branches");
+STATISTIC(CondBranchTakenFreq,
+          "Potential frequency of taking conditional branches");
+STATISTIC(UncondBranchTakenFreq,
+          "Potential frequency of taking unconditional branches");
+
+namespace {
+class BlockChain;
+/// \brief Type for our function-wide basic block -> block chain mapping.
+typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType;
+}
+
+namespace {
+/// \brief A chain of blocks which will be laid out contiguously.
+///
+/// This is the datastructure representing a chain of consecutive blocks that
+/// are profitable to layout together in order to maximize fallthrough
+/// probabilities and code locality. We also can use a block chain to represent
+/// a sequence of basic blocks which have some external (correctness)
+/// requirement for sequential layout.
+///
+/// Chains can be built around a single basic block and can be merged to grow
+/// them. They participate in a block-to-chain mapping, which is updated
+/// automatically as chains are merged together.
+class BlockChain {
+  /// \brief The sequence of blocks belonging to this chain.
+  ///
+  /// This is the sequence of blocks for a particular chain. These will be laid
+  /// out in-order within the function.
+  SmallVector<MachineBasicBlock *, 4> Blocks;
+
+  /// \brief A handle to the function-wide basic block to block chain mapping.
+  ///
+  /// This is retained in each block chain to simplify the computation of child
+  /// block chains for SCC-formation and iteration. We store the edges to child
+  /// basic blocks, and map them back to their associated chains using this
+  /// structure.
+  BlockToChainMapType &BlockToChain;
+
+public:
+  /// \brief Construct a new BlockChain.
+  ///
+  /// This builds a new block chain representing a single basic block in the
+  /// function. It also registers itself as the chain that block participates
+  /// in with the BlockToChain mapping.
+  BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
+    : Blocks(1, BB), BlockToChain(BlockToChain), LoopPredecessors(0) {
+    assert(BB && "Cannot create a chain with a null basic block");
+    BlockToChain[BB] = this;
+  }
+
+  /// \brief Iterator over blocks within the chain.
+  typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
+
+  /// \brief Beginning of blocks within the chain.
+  iterator begin() { return Blocks.begin(); }
+
+  /// \brief End of blocks within the chain.
+  iterator end() { return Blocks.end(); }
+
+  /// \brief Merge a block chain into this one.
+  ///
+  /// This routine merges a block chain into this one. It takes care of forming
+  /// a contiguous sequence of basic blocks, updating the edge list, and
+  /// updating the block -> chain mapping. It does not free or tear down the
+  /// old chain, but the old chain's block list is no longer valid.
+  void merge(MachineBasicBlock *BB, BlockChain *Chain) {
+    assert(BB);
+    assert(!Blocks.empty());
+
+    // Fast path in case we don't have a chain already.
+    if (!Chain) {
+      assert(!BlockToChain[BB]);
+      Blocks.push_back(BB);
+      BlockToChain[BB] = this;
+      return;
+    }
+
+    assert(BB == *Chain->begin());
+    assert(Chain->begin() != Chain->end());
+
+    // Update the incoming blocks to point to this chain, and add them to the
+    // chain structure.
+    for (BlockChain::iterator BI = Chain->begin(), BE = Chain->end();
+         BI != BE; ++BI) {
+      Blocks.push_back(*BI);
+      assert(BlockToChain[*BI] == Chain && "Incoming blocks not in chain");
+      BlockToChain[*BI] = this;
+    }
+  }
+
+#ifndef NDEBUG
+  /// \brief Dump the blocks in this chain.
+  void dump() LLVM_ATTRIBUTE_USED {
+    for (iterator I = begin(), E = end(); I != E; ++I)
+      (*I)->dump();
+  }
+#endif // NDEBUG
+
+  /// \brief Count of predecessors within the loop currently being processed.
+  ///
+  /// This count is updated at each loop we process to represent the number of
+  /// in-loop predecessors of this chain.
+  unsigned LoopPredecessors;
+};
+}
+
+namespace {
+class MachineBlockPlacement : public MachineFunctionPass {
+  /// \brief A typedef for a block filter set.
+  typedef SmallPtrSet<MachineBasicBlock *, 16> BlockFilterSet;
+
+  /// \brief A handle to the branch probability pass.
+  const MachineBranchProbabilityInfo *MBPI;
+
+  /// \brief A handle to the function-wide block frequency pass.
+  const MachineBlockFrequencyInfo *MBFI;
+
+  /// \brief A handle to the loop info.
+  const MachineLoopInfo *MLI;
+
+  /// \brief A handle to the target's instruction info.
+  const TargetInstrInfo *TII;
+
+  /// \brief A handle to the target's lowering info.
+  const TargetLoweringBase *TLI;
+
+  /// \brief Allocator and owner of BlockChain structures.
+  ///
+  /// We build BlockChains lazily while processing the loop structure of
+  /// a function. To reduce malloc traffic, we allocate them using this
+  /// slab-like allocator, and destroy them after the pass completes. An
+  /// important guarantee is that this allocator produces stable pointers to
+  /// the chains.
+  SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
+
+  /// \brief Function wide BasicBlock to BlockChain mapping.
+  ///
+  /// This mapping allows efficiently moving from any given basic block to the
+  /// BlockChain it participates in, if any. We use it to, among other things,
+  /// allow implicitly defining edges between chains as the existing edges
+  /// between basic blocks.
+  DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;
+
+  void markChainSuccessors(BlockChain &Chain,
+                           MachineBasicBlock *LoopHeaderBB,
+                           SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
+                           const BlockFilterSet *BlockFilter = 0);
+  MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
+                                         BlockChain &Chain,
+                                         const BlockFilterSet *BlockFilter);
+  MachineBasicBlock *selectBestCandidateBlock(
+      BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList,
+      const BlockFilterSet *BlockFilter);
+  MachineBasicBlock *getFirstUnplacedBlock(
+      MachineFunction &F,
+      const BlockChain &PlacedChain,
+      MachineFunction::iterator &PrevUnplacedBlockIt,
+      const BlockFilterSet *BlockFilter);
+  void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
+                  SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
+                  const BlockFilterSet *BlockFilter = 0);
+  MachineBasicBlock *findBestLoopTop(MachineLoop &L,
+                                     const BlockFilterSet &LoopBlockSet);
+  MachineBasicBlock *findBestLoopExit(MachineFunction &F,
+                                      MachineLoop &L,
+                                      const BlockFilterSet &LoopBlockSet);
+  void buildLoopChains(MachineFunction &F, MachineLoop &L);
+  void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
+                  const BlockFilterSet &LoopBlockSet);
+  void buildCFGChains(MachineFunction &F);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  MachineBlockPlacement() : MachineFunctionPass(ID) {
+    initializeMachineBlockPlacementPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &F);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addRequired<MachineBlockFrequencyInfo>();
+    AU.addRequired<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+}
+
+char MachineBlockPlacement::ID = 0;
+char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
+INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement2",
+                      "Branch Probability Basic Block Placement", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement2",
+                    "Branch Probability Basic Block Placement", false, false)
+
+#ifndef NDEBUG
+/// \brief Helper to print the name of a MBB.
+///
+/// Only used by debug logging.
+static std::string getBlockName(MachineBasicBlock *BB) {
+  std::string Result;
+  raw_string_ostream OS(Result);
+  OS << "BB#" << BB->getNumber()
+     << " (derived from LLVM BB '" << BB->getName() << "')";
+  OS.flush();
+  return Result;
+}
+
+/// \brief Helper to print the number of a MBB.
+///
+/// Only used by debug logging.
+static std::string getBlockNum(MachineBasicBlock *BB) {
+  std::string Result;
+  raw_string_ostream OS(Result);
+  OS << "BB#" << BB->getNumber();
+  OS.flush();
+  return Result;
+}
+#endif
+
+/// \brief Mark a chain's successors as having one fewer preds.
+///
+/// When a chain is being merged into the "placed" chain, this routine will
+/// quickly walk the successors of each block in the chain and mark them as
+/// having one fewer active predecessor. It also adds any successors of this
+/// chain which reach the zero-predecessor state to the worklist passed in.
+void MachineBlockPlacement::markChainSuccessors(
+    BlockChain &Chain,
+    MachineBasicBlock *LoopHeaderBB,
+    SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
+    const BlockFilterSet *BlockFilter) {
+  // Walk all the blocks in this chain, marking their successors as having
+  // a predecessor placed.
+  for (BlockChain::iterator CBI = Chain.begin(), CBE = Chain.end();
+       CBI != CBE; ++CBI) {
+    // Add any successors for which this is the only un-placed in-loop
+    // predecessor to the worklist as a viable candidate for CFG-neutral
+    // placement. No subsequent placement of this block will violate the CFG
+    // shape, so we get to use heuristics to choose a favorable placement.
+    for (MachineBasicBlock::succ_iterator SI = (*CBI)->succ_begin(),
+                                          SE = (*CBI)->succ_end();
+         SI != SE; ++SI) {
+      if (BlockFilter && !BlockFilter->count(*SI))
+        continue;
+      BlockChain &SuccChain = *BlockToChain[*SI];
+      // Disregard edges within a fixed chain, or edges to the loop header.
+      if (&Chain == &SuccChain || *SI == LoopHeaderBB)
+        continue;
+
+      // This is a cross-chain edge that is within the loop, so decrement the
+      // loop predecessor count of the destination chain.
+      if (SuccChain.LoopPredecessors > 0 && --SuccChain.LoopPredecessors == 0)
+        BlockWorkList.push_back(*SuccChain.begin());
+    }
+  }
+}
+
+/// \brief Select the best successor for a block.
+///
+/// This looks across all successors of a particular block and attempts to
+/// select the "best" one to be the layout successor. It only considers direct
+/// successors which also pass the block filter. It will attempt to avoid
+/// breaking CFG structure, but cave and break such structures in the case of
+/// very hot successor edges.
+///
+/// \returns The best successor block found, or null if none are viable.
+MachineBasicBlock *MachineBlockPlacement::selectBestSuccessor(
+    MachineBasicBlock *BB, BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
+  const BranchProbability HotProb(4, 5); // 80%
+
+  MachineBasicBlock *BestSucc = 0;
+  // FIXME: Due to the performance of the probability and weight routines in
+  // the MBPI analysis, we manually compute probabilities using the edge
+  // weights. This is suboptimal as it means that the somewhat subtle
+  // definition of edge weight semantics is encoded here as well. We should
+  // improve the MBPI interface to efficiently support query patterns such as
+  // this.
+  uint32_t BestWeight = 0;
+  uint32_t WeightScale = 0;
+  uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
+  DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+                                        SE = BB->succ_end();
+       SI != SE; ++SI) {
+    if (BlockFilter && !BlockFilter->count(*SI))
+      continue;
+    BlockChain &SuccChain = *BlockToChain[*SI];
+    if (&SuccChain == &Chain) {
+      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Already merged!\n");
+      continue;
+    }
+    if (*SI != *SuccChain.begin()) {
+      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Mid chain!\n");
+      continue;
+    }
+
+    uint32_t SuccWeight = MBPI->getEdgeWeight(BB, *SI);
+    BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
+
+    // Only consider successors which are either "hot", or wouldn't violate
+    // any CFG constraints.
+    if (SuccChain.LoopPredecessors != 0) {
+      if (SuccProb < HotProb) {
+        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> CFG conflict\n");
+        continue;
+      }
+
+      // Make sure that a hot successor doesn't have a globally more important
+      // predecessor.
+      BlockFrequency CandidateEdgeFreq
+        = MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl();
+      bool BadCFGConflict = false;
+      for (MachineBasicBlock::pred_iterator PI = (*SI)->pred_begin(),
+                                            PE = (*SI)->pred_end();
+           PI != PE; ++PI) {
+        if (*PI == *SI || (BlockFilter && !BlockFilter->count(*PI)) ||
+            BlockToChain[*PI] == &Chain)
+          continue;
+        BlockFrequency PredEdgeFreq
+          = MBFI->getBlockFreq(*PI) * MBPI->getEdgeProbability(*PI, *SI);
+        if (PredEdgeFreq >= CandidateEdgeFreq) {
+          BadCFGConflict = true;
+          break;
+        }
+      }
+      if (BadCFGConflict) {
+        DEBUG(dbgs() << "    " << getBlockName(*SI)
+                               << " -> non-cold CFG conflict\n");
+        continue;
+      }
+    }
+
+    DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
+                 << " (prob)"
+                 << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "")
+                 << "\n");
+    if (BestSucc && BestWeight >= SuccWeight)
+      continue;
+    BestSucc = *SI;
+    BestWeight = SuccWeight;
+  }
+  return BestSucc;
+}
+
+namespace {
+/// \brief Predicate struct to detect blocks already placed.
+class IsBlockPlaced {
+  const BlockChain &PlacedChain;
+  const BlockToChainMapType &BlockToChain;
+
+public:
+  IsBlockPlaced(const BlockChain &PlacedChain,
+                const BlockToChainMapType &BlockToChain)
+      : PlacedChain(PlacedChain), BlockToChain(BlockToChain) {}
+
+  bool operator()(MachineBasicBlock *BB) const {
+    return BlockToChain.lookup(BB) == &PlacedChain;
+  }
+};
+}
+
+/// \brief Select the best block from a worklist.
+///
+/// This looks through the provided worklist as a list of candidate basic
+/// blocks and select the most profitable one to place. The definition of
+/// profitable only really makes sense in the context of a loop. This returns
+/// the most frequently visited block in the worklist, which in the case of
+/// a loop, is the one most desirable to be physically close to the rest of the
+/// loop body in order to improve icache behavior.
+///
+/// \returns The best block found, or null if none are viable.
+MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
+    BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList,
+    const BlockFilterSet *BlockFilter) {
+  // Once we need to walk the worklist looking for a candidate, cleanup the
+  // worklist of already placed entries.
+  // FIXME: If this shows up on profiles, it could be folded (at the cost of
+  // some code complexity) into the loop below.
+  WorkList.erase(std::remove_if(WorkList.begin(), WorkList.end(),
+                                IsBlockPlaced(Chain, BlockToChain)),
+                 WorkList.end());
+
+  MachineBasicBlock *BestBlock = 0;
+  BlockFrequency BestFreq;
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator WBI = WorkList.begin(),
+                                                      WBE = WorkList.end();
+       WBI != WBE; ++WBI) {
+    BlockChain &SuccChain = *BlockToChain[*WBI];
+    if (&SuccChain == &Chain) {
+      DEBUG(dbgs() << "    " << getBlockName(*WBI)
+                   << " -> Already merged!\n");
+      continue;
+    }
+    assert(SuccChain.LoopPredecessors == 0 && "Found CFG-violating block");
+
+    BlockFrequency CandidateFreq = MBFI->getBlockFreq(*WBI);
+    DEBUG(dbgs() << "    " << getBlockName(*WBI) << " -> " << CandidateFreq
+                 << " (freq)\n");
+    if (BestBlock && BestFreq >= CandidateFreq)
+      continue;
+    BestBlock = *WBI;
+    BestFreq = CandidateFreq;
+  }
+  return BestBlock;
+}
+
+/// \brief Retrieve the first unplaced basic block.
+///
+/// This routine is called when we are unable to use the CFG to walk through
+/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
+/// We walk through the function's blocks in order, starting from the
+/// LastUnplacedBlockIt. We update this iterator on each call to avoid
+/// re-scanning the entire sequence on repeated calls to this routine.
+MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
+    MachineFunction &F, const BlockChain &PlacedChain,
+    MachineFunction::iterator &PrevUnplacedBlockIt,
+    const BlockFilterSet *BlockFilter) {
+  for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F.end(); I != E;
+       ++I) {
+    if (BlockFilter && !BlockFilter->count(I))
+      continue;
+    if (BlockToChain[I] != &PlacedChain) {
+      PrevUnplacedBlockIt = I;
+      // Now select the head of the chain to which the unplaced block belongs
+      // as the block to place. This will force the entire chain to be placed,
+      // and satisfies the requirements of merging chains.
+      return *BlockToChain[I]->begin();
+    }
+  }
+  return 0;
+}
+
+void MachineBlockPlacement::buildChain(
+    MachineBasicBlock *BB,
+    BlockChain &Chain,
+    SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
+    const BlockFilterSet *BlockFilter) {
+  assert(BB);
+  assert(BlockToChain[BB] == &Chain);
+  MachineFunction &F = *BB->getParent();
+  MachineFunction::iterator PrevUnplacedBlockIt = F.begin();
+
+  MachineBasicBlock *LoopHeaderBB = BB;
+  markChainSuccessors(Chain, LoopHeaderBB, BlockWorkList, BlockFilter);
+  BB = *llvm::prior(Chain.end());
+  for (;;) {
+    assert(BB);
+    assert(BlockToChain[BB] == &Chain);
+    assert(*llvm::prior(Chain.end()) == BB);
+
+    // Look for the best viable successor if there is one to place immediately
+    // after this block.
+    MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+
+    // If an immediate successor isn't available, look for the best viable
+    // block among those we've identified as not violating the loop's CFG at
+    // this point. This won't be a fallthrough, but it will increase locality.
+    if (!BestSucc)
+      BestSucc = selectBestCandidateBlock(Chain, BlockWorkList, BlockFilter);
+
+    if (!BestSucc) {
+      BestSucc = getFirstUnplacedBlock(F, Chain, PrevUnplacedBlockIt,
+                                       BlockFilter);
+      if (!BestSucc)
+        break;
+
+      DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
+                      "layout successor until the CFG reduces\n");
+    }
+
+    // Place this block, updating the datastructures to reflect its placement.
+    BlockChain &SuccChain = *BlockToChain[BestSucc];
+    // Zero out LoopPredecessors for the successor we're about to merge in case
+    // we selected a successor that didn't fit naturally into the CFG.
+    SuccChain.LoopPredecessors = 0;
+    DEBUG(dbgs() << "Merging from " << getBlockNum(BB)
+                 << " to " << getBlockNum(BestSucc) << "\n");
+    markChainSuccessors(SuccChain, LoopHeaderBB, BlockWorkList, BlockFilter);
+    Chain.merge(BestSucc, &SuccChain);
+    BB = *llvm::prior(Chain.end());
+  }
+
+  DEBUG(dbgs() << "Finished forming chain for header block "
+               << getBlockNum(*Chain.begin()) << "\n");
+}
+
+/// \brief Find the best loop top block for layout.
+///
+/// Look for a block which is strictly better than the loop header for laying
+/// out at the top of the loop. This looks for one and only one pattern:
+/// a latch block with no conditional exit. This block will cause a conditional
+/// jump around it or will be the bottom of the loop if we lay it out in place,
+/// but if it it doesn't end up at the bottom of the loop for any reason,
+/// rotation alone won't fix it. Because such a block will always result in an
+/// unconditional jump (for the backedge) rotating it in front of the loop
+/// header is always profitable.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
+                                       const BlockFilterSet &LoopBlockSet) {
+  // Check that the header hasn't been fused with a preheader block due to
+  // crazy branches. If it has, we need to start with the header at the top to
+  // prevent pulling the preheader into the loop body.
+  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
+  if (!LoopBlockSet.count(*HeaderChain.begin()))
+    return L.getHeader();
+
+  DEBUG(dbgs() << "Finding best loop top for: "
+               << getBlockName(L.getHeader()) << "\n");
+
+  BlockFrequency BestPredFreq;
+  MachineBasicBlock *BestPred = 0;
+  for (MachineBasicBlock::pred_iterator PI = L.getHeader()->pred_begin(),
+                                        PE = L.getHeader()->pred_end();
+       PI != PE; ++PI) {
+    MachineBasicBlock *Pred = *PI;
+    if (!LoopBlockSet.count(Pred))
+      continue;
+    DEBUG(dbgs() << "    header pred: " << getBlockName(Pred) << ", "
+                 << Pred->succ_size() << " successors, "
+                 << MBFI->getBlockFreq(Pred) << " freq\n");
+    if (Pred->succ_size() > 1)
+      continue;
+
+    BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
+    if (!BestPred || PredFreq > BestPredFreq ||
+        (!(PredFreq < BestPredFreq) &&
+         Pred->isLayoutSuccessor(L.getHeader()))) {
+      BestPred = Pred;
+      BestPredFreq = PredFreq;
+    }
+  }
+
+  // If no direct predecessor is fine, just use the loop header.
+  if (!BestPred)
+    return L.getHeader();
+
+  // Walk backwards through any straight line of predecessors.
+  while (BestPred->pred_size() == 1 &&
+         (*BestPred->pred_begin())->succ_size() == 1 &&
+         *BestPred->pred_begin() != L.getHeader())
+    BestPred = *BestPred->pred_begin();
+
+  DEBUG(dbgs() << "    final top: " << getBlockName(BestPred) << "\n");
+  return BestPred;
+}
+
+
+/// \brief Find the best loop exiting block for layout.
+///
+/// This routine implements the logic to analyze the loop looking for the best
+/// block to layout at the top of the loop. Typically this is done to maximize
+/// fallthrough opportunities.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopExit(MachineFunction &F,
+                                        MachineLoop &L,
+                                        const BlockFilterSet &LoopBlockSet) {
+  // We don't want to layout the loop linearly in all cases. If the loop header
+  // is just a normal basic block in the loop, we want to look for what block
+  // within the loop is the best one to layout at the top. However, if the loop
+  // header has be pre-merged into a chain due to predecessors not having
+  // analyzable branches, *and* the predecessor it is merged with is *not* part
+  // of the loop, rotating the header into the middle of the loop will create
+  // a non-contiguous range of blocks which is Very Bad. So start with the
+  // header and only rotate if safe.
+  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
+  if (!LoopBlockSet.count(*HeaderChain.begin()))
+    return 0;
+
+  BlockFrequency BestExitEdgeFreq;
+  unsigned BestExitLoopDepth = 0;
+  MachineBasicBlock *ExitingBB = 0;
+  // If there are exits to outer loops, loop rotation can severely limit
+  // fallthrough opportunites unless it selects such an exit. Keep a set of
+  // blocks where rotating to exit with that block will reach an outer loop.
+  SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
+
+  DEBUG(dbgs() << "Finding best loop exit for: "
+               << getBlockName(L.getHeader()) << "\n");
+  for (MachineLoop::block_iterator I = L.block_begin(),
+                                   E = L.block_end();
+       I != E; ++I) {
+    BlockChain &Chain = *BlockToChain[*I];
+    // Ensure that this block is at the end of a chain; otherwise it could be
+    // mid-way through an inner loop or a successor of an analyzable branch.
+    if (*I != *llvm::prior(Chain.end()))
+      continue;
+
+    // Now walk the successors. We need to establish whether this has a viable
+    // exiting successor and whether it has a viable non-exiting successor.
+    // We store the old exiting state and restore it if a viable looping
+    // successor isn't found.
+    MachineBasicBlock *OldExitingBB = ExitingBB;
+    BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
+    bool HasLoopingSucc = false;
+    // FIXME: Due to the performance of the probability and weight routines in
+    // the MBPI analysis, we use the internal weights and manually compute the
+    // probabilities to avoid quadratic behavior.
+    uint32_t WeightScale = 0;
+    uint32_t SumWeight = MBPI->getSumForBlock(*I, WeightScale);
+    for (MachineBasicBlock::succ_iterator SI = (*I)->succ_begin(),
+                                          SE = (*I)->succ_end();
+         SI != SE; ++SI) {
+      if ((*SI)->isLandingPad())
+        continue;
+      if (*SI == *I)
+        continue;
+      BlockChain &SuccChain = *BlockToChain[*SI];
+      // Don't split chains, either this chain or the successor's chain.
+      if (&Chain == &SuccChain) {
+        DEBUG(dbgs() << "    exiting: " << getBlockName(*I) << " -> "
+                     << getBlockName(*SI) << " (chain conflict)\n");
+        continue;
+      }
+
+      uint32_t SuccWeight = MBPI->getEdgeWeight(*I, *SI);
+      if (LoopBlockSet.count(*SI)) {
+        DEBUG(dbgs() << "    looping: " << getBlockName(*I) << " -> "
+                     << getBlockName(*SI) << " (" << SuccWeight << ")\n");
+        HasLoopingSucc = true;
+        continue;
+      }
+
+      unsigned SuccLoopDepth = 0;
+      if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI)) {
+        SuccLoopDepth = ExitLoop->getLoopDepth();
+        if (ExitLoop->contains(&L))
+          BlocksExitingToOuterLoop.insert(*I);
+      }
+
+      BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
+      BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(*I) * SuccProb;
+      DEBUG(dbgs() << "    exiting: " << getBlockName(*I) << " -> "
+                   << getBlockName(*SI) << " [L:" << SuccLoopDepth
+                   << "] (" << ExitEdgeFreq << ")\n");
+      // Note that we slightly bias this toward an existing layout successor to
+      // retain incoming order in the absence of better information.
+      // FIXME: Should we bias this more strongly? It's pretty weak.
+      if (!ExitingBB || BestExitLoopDepth < SuccLoopDepth ||
+          ExitEdgeFreq > BestExitEdgeFreq ||
+          ((*I)->isLayoutSuccessor(*SI) &&
+           !(ExitEdgeFreq < BestExitEdgeFreq))) {
+        BestExitEdgeFreq = ExitEdgeFreq;
+        ExitingBB = *I;
+      }
+    }
+
+    // Restore the old exiting state, no viable looping successor was found.
+    if (!HasLoopingSucc) {
+      ExitingBB = OldExitingBB;
+      BestExitEdgeFreq = OldBestExitEdgeFreq;
+      continue;
+    }
+  }
+  // Without a candidate exiting block or with only a single block in the
+  // loop, just use the loop header to layout the loop.
+  if (!ExitingBB || L.getNumBlocks() == 1)
+    return 0;
+
+  // Also, if we have exit blocks which lead to outer loops but didn't select
+  // one of them as the exiting block we are rotating toward, disable loop
+  // rotation altogether.
+  if (!BlocksExitingToOuterLoop.empty() &&
+      !BlocksExitingToOuterLoop.count(ExitingBB))
+    return 0;
+
+  DEBUG(dbgs() << "  Best exiting block: " << getBlockName(ExitingBB) << "\n");
+  return ExitingBB;
+}
+
+/// \brief Attempt to rotate an exiting block to the bottom of the loop.
+///
+/// Once we have built a chain, try to rotate it to line up the hot exit block
+/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
+/// branches. For example, if the loop has fallthrough into its header and out
+/// of its bottom already, don't rotate it.
+void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
+                                       MachineBasicBlock *ExitingBB,
+                                       const BlockFilterSet &LoopBlockSet) {
+  if (!ExitingBB)
+    return;
+
+  MachineBasicBlock *Top = *LoopChain.begin();
+  bool ViableTopFallthrough = false;
+  for (MachineBasicBlock::pred_iterator PI = Top->pred_begin(),
+                                        PE = Top->pred_end();
+       PI != PE; ++PI) {
+    BlockChain *PredChain = BlockToChain[*PI];
+    if (!LoopBlockSet.count(*PI) &&
+        (!PredChain || *PI == *llvm::prior(PredChain->end()))) {
+      ViableTopFallthrough = true;
+      break;
+    }
+  }
+
+  // If the header has viable fallthrough, check whether the current loop
+  // bottom is a viable exiting block. If so, bail out as rotating will
+  // introduce an unnecessary branch.
+  if (ViableTopFallthrough) {
+    MachineBasicBlock *Bottom = *llvm::prior(LoopChain.end());
+    for (MachineBasicBlock::succ_iterator SI = Bottom->succ_begin(),
+                                          SE = Bottom->succ_end();
+         SI != SE; ++SI) {
+      BlockChain *SuccChain = BlockToChain[*SI];
+      if (!LoopBlockSet.count(*SI) &&
+          (!SuccChain || *SI == *SuccChain->begin()))
+        return;
+    }
+  }
+
+  BlockChain::iterator ExitIt = std::find(LoopChain.begin(), LoopChain.end(),
+                                          ExitingBB);
+  if (ExitIt == LoopChain.end())
+    return;
+
+  std::rotate(LoopChain.begin(), llvm::next(ExitIt), LoopChain.end());
+}
+
+/// \brief Forms basic block chains from the natural loop structures.
+///
+/// These chains are designed to preserve the existing *structure* of the code
+/// as much as possible. We can then stitch the chains together in a way which
+/// both preserves the topological structure and minimizes taken conditional
+/// branches.
+void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
+                                            MachineLoop &L) {
+  // First recurse through any nested loops, building chains for those inner
+  // loops.
+  for (MachineLoop::iterator LI = L.begin(), LE = L.end(); LI != LE; ++LI)
+    buildLoopChains(F, **LI);
+
+  SmallVector<MachineBasicBlock *, 16> BlockWorkList;
+  BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
+
+  // First check to see if there is an obviously preferable top block for the
+  // loop. This will default to the header, but may end up as one of the
+  // predecessors to the header if there is one which will result in strictly
+  // fewer branches in the loop body.
+  MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
+
+  // If we selected just the header for the loop top, look for a potentially
+  // profitable exit block in the event that rotating the loop can eliminate
+  // branches by placing an exit edge at the bottom.
+  MachineBasicBlock *ExitingBB = 0;
+  if (LoopTop == L.getHeader())
+    ExitingBB = findBestLoopExit(F, L, LoopBlockSet);
+
+  BlockChain &LoopChain = *BlockToChain[LoopTop];
+
+  // FIXME: This is a really lame way of walking the chains in the loop: we
+  // walk the blocks, and use a set to prevent visiting a particular chain
+  // twice.
+  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
+  assert(LoopChain.LoopPredecessors == 0);
+  UpdatedPreds.insert(&LoopChain);
+  for (MachineLoop::block_iterator BI = L.block_begin(),
+                                   BE = L.block_end();
+       BI != BE; ++BI) {
+    BlockChain &Chain = *BlockToChain[*BI];
+    if (!UpdatedPreds.insert(&Chain))
+      continue;
+
+    assert(Chain.LoopPredecessors == 0);
+    for (BlockChain::iterator BCI = Chain.begin(), BCE = Chain.end();
+         BCI != BCE; ++BCI) {
+      assert(BlockToChain[*BCI] == &Chain);
+      for (MachineBasicBlock::pred_iterator PI = (*BCI)->pred_begin(),
+                                            PE = (*BCI)->pred_end();
+           PI != PE; ++PI) {
+        if (BlockToChain[*PI] == &Chain || !LoopBlockSet.count(*PI))
+          continue;
+        ++Chain.LoopPredecessors;
+      }
+    }
+
+    if (Chain.LoopPredecessors == 0)
+      BlockWorkList.push_back(*Chain.begin());
+  }
+
+  buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet);
+  rotateLoop(LoopChain, ExitingBB, LoopBlockSet);
+
+  DEBUG({
+    // Crash at the end so we get all of the debugging output first.
+    bool BadLoop = false;
+    if (LoopChain.LoopPredecessors) {
+      BadLoop = true;
+      dbgs() << "Loop chain contains a block without its preds placed!\n"
+             << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
+             << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
+    }
+    for (BlockChain::iterator BCI = LoopChain.begin(), BCE = LoopChain.end();
+         BCI != BCE; ++BCI) {
+      dbgs() << "          ... " << getBlockName(*BCI) << "\n";
+      if (!LoopBlockSet.erase(*BCI)) {
+        // We don't mark the loop as bad here because there are real situations
+        // where this can occur. For example, with an unanalyzable fallthrough
+        // from a loop block to a non-loop block or vice versa.
+        dbgs() << "Loop chain contains a block not contained by the loop!\n"
+               << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
+               << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
+               << "  Bad block:    " << getBlockName(*BCI) << "\n";
+      }
+    }
+
+    if (!LoopBlockSet.empty()) {
+      BadLoop = true;
+      for (BlockFilterSet::iterator LBI = LoopBlockSet.begin(),
+                                    LBE = LoopBlockSet.end();
+           LBI != LBE; ++LBI)
+        dbgs() << "Loop contains blocks never placed into a chain!\n"
+               << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
+               << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
+               << "  Bad block:    " << getBlockName(*LBI) << "\n";
+    }
+    assert(!BadLoop && "Detected problems with the placement of this loop.");
+  });
+}
+
+void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
+  // Ensure that every BB in the function has an associated chain to simplify
+  // the assumptions of the remaining algorithm.
+  SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+  for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    MachineBasicBlock *BB = FI;
+    BlockChain *Chain
+      = new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
+    // Also, merge any blocks which we cannot reason about and must preserve
+    // the exact fallthrough behavior for.
+    for (;;) {
+      Cond.clear();
+      MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
+      if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
+        break;
+
+      MachineFunction::iterator NextFI(llvm::next(FI));
+      MachineBasicBlock *NextBB = NextFI;
+      // Ensure that the layout successor is a viable block, as we know that
+      // fallthrough is a possibility.
+      assert(NextFI != FE && "Can't fallthrough past the last block.");
+      DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
+                   << getBlockName(BB) << " -> " << getBlockName(NextBB)
+                   << "\n");
+      Chain->merge(NextBB, 0);
+      FI = NextFI;
+      BB = NextBB;
+    }
+  }
+
+  // Build any loop-based chains.
+  for (MachineLoopInfo::iterator LI = MLI->begin(), LE = MLI->end(); LI != LE;
+       ++LI)
+    buildLoopChains(F, **LI);
+
+  SmallVector<MachineBasicBlock *, 16> BlockWorkList;
+
+  SmallPtrSet<BlockChain *, 4> UpdatedPreds;
+  for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    MachineBasicBlock *BB = &*FI;
+    BlockChain &Chain = *BlockToChain[BB];
+    if (!UpdatedPreds.insert(&Chain))
+      continue;
+
+    assert(Chain.LoopPredecessors == 0);
+    for (BlockChain::iterator BCI = Chain.begin(), BCE = Chain.end();
+         BCI != BCE; ++BCI) {
+      assert(BlockToChain[*BCI] == &Chain);
+      for (MachineBasicBlock::pred_iterator PI = (*BCI)->pred_begin(),
+                                            PE = (*BCI)->pred_end();
+           PI != PE; ++PI) {
+        if (BlockToChain[*PI] == &Chain)
+          continue;
+        ++Chain.LoopPredecessors;
+      }
+    }
+
+    if (Chain.LoopPredecessors == 0)
+      BlockWorkList.push_back(*Chain.begin());
+  }
+
+  BlockChain &FunctionChain = *BlockToChain[&F.front()];
+  buildChain(&F.front(), FunctionChain, BlockWorkList);
+
+  typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType;
+  DEBUG({
+    // Crash at the end so we get all of the debugging output first.
+    bool BadFunc = false;
+    FunctionBlockSetType FunctionBlockSet;
+    for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+      FunctionBlockSet.insert(FI);
+
+    for (BlockChain::iterator BCI = FunctionChain.begin(),
+                              BCE = FunctionChain.end();
+         BCI != BCE; ++BCI)
+      if (!FunctionBlockSet.erase(*BCI)) {
+        BadFunc = true;
+        dbgs() << "Function chain contains a block not in the function!\n"
+               << "  Bad block:    " << getBlockName(*BCI) << "\n";
+      }
+
+    if (!FunctionBlockSet.empty()) {
+      BadFunc = true;
+      for (FunctionBlockSetType::iterator FBI = FunctionBlockSet.begin(),
+                                          FBE = FunctionBlockSet.end();
+           FBI != FBE; ++FBI)
+        dbgs() << "Function contains blocks never placed into a chain!\n"
+               << "  Bad block:    " << getBlockName(*FBI) << "\n";
+    }
+    assert(!BadFunc && "Detected problems with the block placement.");
+  });
+
+  // Splice the blocks into place.
+  MachineFunction::iterator InsertPos = F.begin();
+  for (BlockChain::iterator BI = FunctionChain.begin(),
+                            BE = FunctionChain.end();
+       BI != BE; ++BI) {
+    DEBUG(dbgs() << (BI == FunctionChain.begin() ? "Placing chain "
+                                                  : "          ... ")
+          << getBlockName(*BI) << "\n");
+    if (InsertPos != MachineFunction::iterator(*BI))
+      F.splice(InsertPos, *BI);
+    else
+      ++InsertPos;
+
+    // Update the terminator of the previous block.
+    if (BI == FunctionChain.begin())
+      continue;
+    MachineBasicBlock *PrevBB = llvm::prior(MachineFunction::iterator(*BI));
+
+    // FIXME: It would be awesome of updateTerminator would just return rather
+    // than assert when the branch cannot be analyzed in order to remove this
+    // boiler plate.
+    Cond.clear();
+    MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
+    if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) {
+      // If PrevBB has a two-way branch, try to re-order the branches
+      // such that we branch to the successor with higher weight first.
+      if (TBB && !Cond.empty() && FBB &&
+          MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) &&
+          !TII->ReverseBranchCondition(Cond)) {
+        DEBUG(dbgs() << "Reverse order of the two branches: "
+                     << getBlockName(PrevBB) << "\n");
+        DEBUG(dbgs() << "    Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB)
+                     << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n");
+        DebugLoc dl;  // FIXME: this is nowhere
+        TII->RemoveBranch(*PrevBB);
+        TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl);
+      }
+      PrevBB->updateTerminator();
+    }
+  }
+
+  // Fixup the last block.
+  Cond.clear();
+  MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
+  if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond))
+    F.back().updateTerminator();
+
+  // Walk through the backedges of the function now that we have fully laid out
+  // the basic blocks and align the destination of each backedge. We don't rely
+  // exclusively on the loop info here so that we can align backedges in
+  // unnatural CFGs and backedges that were introduced purely because of the
+  // loop rotations done during this layout pass.
+  if (F.getFunction()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize))
+    return;
+  unsigned Align = TLI->getPrefLoopAlignment();
+  if (!Align)
+    return;  // Don't care about loop alignment.
+  if (FunctionChain.begin() == FunctionChain.end())
+    return;  // Empty chain.
+
+  const BranchProbability ColdProb(1, 5); // 20%
+  BlockFrequency EntryFreq = MBFI->getBlockFreq(F.begin());
+  BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
+  for (BlockChain::iterator BI = llvm::next(FunctionChain.begin()),
+                            BE = FunctionChain.end();
+       BI != BE; ++BI) {
+    // Don't align non-looping basic blocks. These are unlikely to execute
+    // enough times to matter in practice. Note that we'll still handle
+    // unnatural CFGs inside of a natural outer loop (the common case) and
+    // rotated loops.
+    MachineLoop *L = MLI->getLoopFor(*BI);
+    if (!L)
+      continue;
+
+    // If the block is cold relative to the function entry don't waste space
+    // aligning it.
+    BlockFrequency Freq = MBFI->getBlockFreq(*BI);
+    if (Freq < WeightedEntryFreq)
+      continue;
+
+    // If the block is cold relative to its loop header, don't align it
+    // regardless of what edges into the block exist.
+    MachineBasicBlock *LoopHeader = L->getHeader();
+    BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
+    if (Freq < (LoopHeaderFreq * ColdProb))
+      continue;
+
+    // Check for the existence of a non-layout predecessor which would benefit
+    // from aligning this block.
+    MachineBasicBlock *LayoutPred = *llvm::prior(BI);
+
+    // Force alignment if all the predecessors are jumps. We already checked
+    // that the block isn't cold above.
+    if (!LayoutPred->isSuccessor(*BI)) {
+      (*BI)->setAlignment(Align);
+      continue;
+    }
+
+    // Align this block if the layout predecessor's edge into this block is
+    // cold relative to the block. When this is true, other predecessors make up
+    // all of the hot entries into the block and thus alignment is likely to be
+    // important.
+    BranchProbability LayoutProb = MBPI->getEdgeProbability(LayoutPred, *BI);
+    BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
+    if (LayoutEdgeFreq <= (Freq * ColdProb))
+      (*BI)->setAlignment(Align);
+  }
+}
+
+bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {
+  // Check for single-block functions and skip them.
+  if (llvm::next(F.begin()) == F.end())
+    return false;
+
+  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+  MLI = &getAnalysis<MachineLoopInfo>();
+  TII = F.getTarget().getInstrInfo();
+  TLI = F.getTarget().getTargetLowering();
+  assert(BlockToChain.empty());
+
+  buildCFGChains(F);
+
+  BlockToChain.clear();
+  ChainAllocator.DestroyAll();
+
+  // We always return true as we have no way to track whether the final order
+  // differs from the original order.
+  return true;
+}
+
+namespace {
+/// \brief A pass to compute block placement statistics.
+///
+/// A separate pass to compute interesting statistics for evaluating block
+/// placement. This is separate from the actual placement pass so that they can
+/// be computed in the absence of any placement transformations or when using
+/// alternative placement strategies.
+class MachineBlockPlacementStats : public MachineFunctionPass {
+  /// \brief A handle to the branch probability pass.
+  const MachineBranchProbabilityInfo *MBPI;
+
+  /// \brief A handle to the function-wide block frequency pass.
+  const MachineBlockFrequencyInfo *MBFI;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  MachineBlockPlacementStats() : MachineFunctionPass(ID) {
+    initializeMachineBlockPlacementStatsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &F);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addRequired<MachineBlockFrequencyInfo>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+}
+
+char MachineBlockPlacementStats::ID = 0;
+char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
+INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
+                      "Basic Block Placement Stats", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
+                    "Basic Block Placement Stats", false, false)
+
+bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
+  // Check for single-block functions and skip them.
+  if (llvm::next(F.begin()) == F.end())
+    return false;
+
+  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+
+  for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    BlockFrequency BlockFreq = MBFI->getBlockFreq(I);
+    Statistic &NumBranches = (I->succ_size() > 1) ? NumCondBranches
+                                                  : NumUncondBranches;
+    Statistic &BranchTakenFreq = (I->succ_size() > 1) ? CondBranchTakenFreq
+                                                      : UncondBranchTakenFreq;
+    for (MachineBasicBlock::succ_iterator SI = I->succ_begin(),
+                                          SE = I->succ_end();
+         SI != SE; ++SI) {
+      // Skip if this successor is a fallthrough.
+      if (I->isLayoutSuccessor(*SI))
+        continue;
+
+      BlockFrequency EdgeFreq = BlockFreq * MBPI->getEdgeProbability(I, *SI);
+      ++NumBranches;
+      BranchTakenFreq += EdgeFreq.getFrequency();
+    }
+  }
+
+  return false;
+}
+
diff --git a/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
new file mode 100644
index 000000000000..ae70912b6c69
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -0,0 +1,126 @@
+//===- MachineBranchProbabilityInfo.cpp - Machine Branch Probability Info -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This analysis uses probability info stored in Machine Basic Blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfo, "machine-branch-prob",
+                      "Machine Branch Probability Analysis", false, true)
+INITIALIZE_PASS_END(MachineBranchProbabilityInfo, "machine-branch-prob",
+                    "Machine Branch Probability Analysis", false, true)
+
+char MachineBranchProbabilityInfo::ID = 0;
+
+void MachineBranchProbabilityInfo::anchor() { }
+
+uint32_t MachineBranchProbabilityInfo::
+getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const {
+  // First we compute the sum with 64-bits of precision, ensuring that cannot
+  // overflow by bounding the number of weights considered. Hopefully no one
+  // actually needs 2^32 successors.
+  assert(MBB->succ_size() < UINT32_MAX);
+  uint64_t Sum = 0;
+  Scale = 1;
+  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I) {
+    uint32_t Weight = getEdgeWeight(MBB, I);
+    Sum += Weight;
+  }
+
+  // If the computed sum fits in 32-bits, we're done.
+  if (Sum <= UINT32_MAX)
+    return Sum;
+
+  // Otherwise, compute the scale necessary to cause the weights to fit, and
+  // re-sum with that scale applied.
+  assert((Sum / UINT32_MAX) < UINT32_MAX);
+  Scale = (Sum / UINT32_MAX) + 1;
+  Sum = 0;
+  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I) {
+    uint32_t Weight = getEdgeWeight(MBB, I);
+    Sum += Weight / Scale;
+  }
+  assert(Sum <= UINT32_MAX);
+  return Sum;
+}
+
+uint32_t MachineBranchProbabilityInfo::
+getEdgeWeight(const MachineBasicBlock *Src,
+              MachineBasicBlock::const_succ_iterator Dst) const {
+  uint32_t Weight = Src->getSuccWeight(Dst);
+  if (!Weight)
+    return DEFAULT_WEIGHT;
+  return Weight;
+}
+
+uint32_t MachineBranchProbabilityInfo::
+getEdgeWeight(const MachineBasicBlock *Src,
+              const MachineBasicBlock *Dst) const {
+  // This is a linear search. Try to use the const_succ_iterator version when
+  // possible.
+  return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst));
+}
+
+bool MachineBranchProbabilityInfo::isEdgeHot(MachineBasicBlock *Src,
+                                             MachineBasicBlock *Dst) const {
+  // Hot probability is at least 4/5 = 80%
+  // FIXME: Compare against a static "hot" BranchProbability.
+  return getEdgeProbability(Src, Dst) > BranchProbability(4, 5);
+}
+
+MachineBasicBlock *
+MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const {
+  uint32_t MaxWeight = 0;
+  MachineBasicBlock *MaxSucc = 0;
+  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I) {
+    uint32_t Weight = getEdgeWeight(MBB, I);
+    if (Weight > MaxWeight) {
+      MaxWeight = Weight;
+      MaxSucc = *I;
+    }
+  }
+
+  if (getEdgeProbability(MBB, MaxSucc) >= BranchProbability(4, 5))
+    return MaxSucc;
+
+  return 0;
+}
+
+BranchProbability
+MachineBranchProbabilityInfo::getEdgeProbability(MachineBasicBlock *Src,
+                                                 MachineBasicBlock *Dst) const {
+  uint32_t Scale = 1;
+  uint32_t D = getSumForBlock(Src, Scale);
+  uint32_t N = getEdgeWeight(Src, Dst) / Scale;
+
+  return BranchProbability(N, D);
+}
+
+raw_ostream &MachineBranchProbabilityInfo::
+printEdgeProbability(raw_ostream &OS, MachineBasicBlock *Src,
+                     MachineBasicBlock *Dst) const {
+
+  const BranchProbability Prob = getEdgeProbability(Src, Dst);
+  OS << "edge MBB#" << Src->getNumber() << " -> MBB#" << Dst->getNumber()
+     << " probability is "  << Prob 
+     << (isEdgeHot(Src, Dst) ? " [HOT edge]\n" : "\n");
+
+  return OS;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineCSE.cpp b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
new file mode 100644
index 000000000000..61d8d384cd38
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
@@ -0,0 +1,661 @@
+//===-- MachineCSE.cpp - Machine Common Subexpression Elimination Pass ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global common subexpression elimination on machine
+// instructions using a scoped hash table based value numbering scheme. It
+// must be run while the machine function is still in SSA form.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-cse"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumCoalesces, "Number of copies coalesced");
+STATISTIC(NumCSEs,      "Number of common subexpression eliminated");
+STATISTIC(NumPhysCSEs,
+          "Number of physreg referencing common subexpr eliminated");
+STATISTIC(NumCrossBBCSEs,
+          "Number of cross-MBB physreg referencing CS eliminated");
+STATISTIC(NumCommutes,  "Number of copies coalesced after commuting");
+
+namespace {
+  class MachineCSE : public MachineFunctionPass {
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    AliasAnalysis *AA;
+    MachineDominatorTree *DT;
+    MachineRegisterInfo *MRI;
+  public:
+    static char ID; // Pass identification
+    MachineCSE() : MachineFunctionPass(ID), LookAheadLimit(5), CurrVN(0) {
+      initializeMachineCSEPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<AliasAnalysis>();
+      AU.addPreservedID(MachineLoopInfoID);
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+    }
+
+    virtual void releaseMemory() {
+      ScopeMap.clear();
+      Exps.clear();
+    }
+
+  private:
+    const unsigned LookAheadLimit;
+    typedef RecyclingAllocator<BumpPtrAllocator,
+        ScopedHashTableVal<MachineInstr*, unsigned> > AllocatorTy;
+    typedef ScopedHashTable<MachineInstr*, unsigned,
+        MachineInstrExpressionTrait, AllocatorTy> ScopedHTType;
+    typedef ScopedHTType::ScopeTy ScopeType;
+    DenseMap<MachineBasicBlock*, ScopeType*> ScopeMap;
+    ScopedHTType VNT;
+    SmallVector<MachineInstr*, 64> Exps;
+    unsigned CurrVN;
+
+    bool PerformTrivialCoalescing(MachineInstr *MI, MachineBasicBlock *MBB);
+    bool isPhysDefTriviallyDead(unsigned Reg,
+                                MachineBasicBlock::const_iterator I,
+                                MachineBasicBlock::const_iterator E) const;
+    bool hasLivePhysRegDefUses(const MachineInstr *MI,
+                               const MachineBasicBlock *MBB,
+                               SmallSet<unsigned,8> &PhysRefs,
+                               SmallVector<unsigned,2> &PhysDefs,
+                               bool &PhysUseDef) const;
+    bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
+                          SmallSet<unsigned,8> &PhysRefs,
+                          SmallVector<unsigned,2> &PhysDefs,
+                          bool &NonLocal) const;
+    bool isCSECandidate(MachineInstr *MI);
+    bool isProfitableToCSE(unsigned CSReg, unsigned Reg,
+                           MachineInstr *CSMI, MachineInstr *MI);
+    void EnterScope(MachineBasicBlock *MBB);
+    void ExitScope(MachineBasicBlock *MBB);
+    bool ProcessBlock(MachineBasicBlock *MBB);
+    void ExitScopeIfDone(MachineDomTreeNode *Node,
+                         DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren);
+    bool PerformCSE(MachineDomTreeNode *Node);
+  };
+} // end anonymous namespace
+
+char MachineCSE::ID = 0;
+char &llvm::MachineCSEID = MachineCSE::ID;
+INITIALIZE_PASS_BEGIN(MachineCSE, "machine-cse",
+                "Machine Common Subexpression Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(MachineCSE, "machine-cse",
+                "Machine Common Subexpression Elimination", false, false)
+
+bool MachineCSE::PerformTrivialCoalescing(MachineInstr *MI,
+                                          MachineBasicBlock *MBB) {
+  bool Changed = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    if (!MRI->hasOneNonDBGUse(Reg))
+      // Only coalesce single use copies. This ensure the copy will be
+      // deleted.
+      continue;
+    MachineInstr *DefMI = MRI->getVRegDef(Reg);
+    if (!DefMI->isCopy())
+      continue;
+    unsigned SrcReg = DefMI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+      continue;
+    if (DefMI->getOperand(0).getSubReg() || DefMI->getOperand(1).getSubReg())
+      continue;
+    if (!MRI->constrainRegClass(SrcReg, MRI->getRegClass(Reg)))
+      continue;
+    DEBUG(dbgs() << "Coalescing: " << *DefMI);
+    DEBUG(dbgs() << "***     to: " << *MI);
+    MO.setReg(SrcReg);
+    MRI->clearKillFlags(SrcReg);
+    DefMI->eraseFromParent();
+    ++NumCoalesces;
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+bool
+MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
+                                   MachineBasicBlock::const_iterator I,
+                                   MachineBasicBlock::const_iterator E) const {
+  unsigned LookAheadLeft = LookAheadLimit;
+  while (LookAheadLeft) {
+    // Skip over dbg_value's.
+    while (I != E && I->isDebugValue())
+      ++I;
+
+    if (I == E)
+      // Reached end of block, register is obviously dead.
+      return true;
+
+    bool SeenDef = false;
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = I->getOperand(i);
+      if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
+        SeenDef = true;
+      if (!MO.isReg() || !MO.getReg())
+        continue;
+      if (!TRI->regsOverlap(MO.getReg(), Reg))
+        continue;
+      if (MO.isUse())
+        // Found a use!
+        return false;
+      SeenDef = true;
+    }
+    if (SeenDef)
+      // See a def of Reg (or an alias) before encountering any use, it's
+      // trivially dead.
+      return true;
+
+    --LookAheadLeft;
+    ++I;
+  }
+  return false;
+}
+
+/// hasLivePhysRegDefUses - Return true if the specified instruction read/write
+/// physical registers (except for dead defs of physical registers). It also
+/// returns the physical register def by reference if it's the only one and the
+/// instruction does not uses a physical register.
+bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
+                                       const MachineBasicBlock *MBB,
+                                       SmallSet<unsigned,8> &PhysRefs,
+                                       SmallVector<unsigned,2> &PhysDefs,
+                                       bool &PhysUseDef) const{
+  // First, add all uses to PhysRefs.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    // Reading constant physregs is ok.
+    if (!MRI->isConstantPhysReg(Reg, *MBB->getParent()))
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        PhysRefs.insert(*AI);
+  }
+
+  // Next, collect all defs into PhysDefs.  If any is already in PhysRefs
+  // (which currently contains only uses), set the PhysUseDef flag.
+  PhysUseDef = false;
+  MachineBasicBlock::const_iterator I = MI; I = llvm::next(I);
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    // Check against PhysRefs even if the def is "dead".
+    if (PhysRefs.count(Reg))
+      PhysUseDef = true;
+    // If the def is dead, it's ok. But the def may not marked "dead". That's
+    // common since this pass is run before livevariables. We can scan
+    // forward a few instructions and check if it is obviously dead.
+    if (!MO.isDead() && !isPhysDefTriviallyDead(Reg, I, MBB->end()))
+      PhysDefs.push_back(Reg);
+  }
+
+  // Finally, add all defs to PhysRefs as well.
+  for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i)
+    for (MCRegAliasIterator AI(PhysDefs[i], TRI, true); AI.isValid(); ++AI)
+      PhysRefs.insert(*AI);
+
+  return !PhysRefs.empty();
+}
+
+bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
+                                  SmallSet<unsigned,8> &PhysRefs,
+                                  SmallVector<unsigned,2> &PhysDefs,
+                                  bool &NonLocal) const {
+  // For now conservatively returns false if the common subexpression is
+  // not in the same basic block as the given instruction. The only exception
+  // is if the common subexpression is in the sole predecessor block.
+  const MachineBasicBlock *MBB = MI->getParent();
+  const MachineBasicBlock *CSMBB = CSMI->getParent();
+
+  bool CrossMBB = false;
+  if (CSMBB != MBB) {
+    if (MBB->pred_size() != 1 || *MBB->pred_begin() != CSMBB)
+      return false;
+
+    for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i) {
+      if (MRI->isAllocatable(PhysDefs[i]) || MRI->isReserved(PhysDefs[i]))
+        // Avoid extending live range of physical registers if they are
+        //allocatable or reserved.
+        return false;
+    }
+    CrossMBB = true;
+  }
+  MachineBasicBlock::const_iterator I = CSMI; I = llvm::next(I);
+  MachineBasicBlock::const_iterator E = MI;
+  MachineBasicBlock::const_iterator EE = CSMBB->end();
+  unsigned LookAheadLeft = LookAheadLimit;
+  while (LookAheadLeft) {
+    // Skip over dbg_value's.
+    while (I != E && I != EE && I->isDebugValue())
+      ++I;
+
+    if (I == EE) {
+      assert(CrossMBB && "Reaching end-of-MBB without finding MI?");
+      (void)CrossMBB;
+      CrossMBB = false;
+      NonLocal = true;
+      I = MBB->begin();
+      EE = MBB->end();
+      continue;
+    }
+
+    if (I == E)
+      return true;
+
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = I->getOperand(i);
+      // RegMasks go on instructions like calls that clobber lots of physregs.
+      // Don't attempt to CSE across such an instruction.
+      if (MO.isRegMask())
+        return false;
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned MOReg = MO.getReg();
+      if (TargetRegisterInfo::isVirtualRegister(MOReg))
+        continue;
+      if (PhysRefs.count(MOReg))
+        return false;
+    }
+
+    --LookAheadLeft;
+    ++I;
+  }
+
+  return false;
+}
+
+bool MachineCSE::isCSECandidate(MachineInstr *MI) {
+  if (MI->isLabel() || MI->isPHI() || MI->isImplicitDef() ||
+      MI->isKill() || MI->isInlineAsm() || MI->isDebugValue())
+    return false;
+
+  // Ignore copies.
+  if (MI->isCopyLike())
+    return false;
+
+  // Ignore stuff that we obviously can't move.
+  if (MI->mayStore() || MI->isCall() || MI->isTerminator() ||
+      MI->hasUnmodeledSideEffects())
+    return false;
+
+  if (MI->mayLoad()) {
+    // Okay, this instruction does a load. As a refinement, we allow the target
+    // to decide whether the loaded value is actually a constant. If so, we can
+    // actually use it as a load.
+    if (!MI->isInvariantLoad(AA))
+      // FIXME: we should be able to hoist loads with no other side effects if
+      // there are no other instructions which can change memory in this loop.
+      // This is a trivial form of alias analysis.
+      return false;
+  }
+  return true;
+}
+
+/// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
+/// common expression that defines Reg.
+bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
+                                   MachineInstr *CSMI, MachineInstr *MI) {
+  // FIXME: Heuristics that works around the lack the live range splitting.
+
+  // If CSReg is used at all uses of Reg, CSE should not increase register
+  // pressure of CSReg.
+  bool MayIncreasePressure = true;
+  if (TargetRegisterInfo::isVirtualRegister(CSReg) &&
+      TargetRegisterInfo::isVirtualRegister(Reg)) {
+    MayIncreasePressure = false;
+    SmallPtrSet<MachineInstr*, 8> CSUses;
+    for (MachineRegisterInfo::use_nodbg_iterator I =MRI->use_nodbg_begin(CSReg),
+         E = MRI->use_nodbg_end(); I != E; ++I) {
+      MachineInstr *Use = &*I;
+      CSUses.insert(Use);
+    }
+    for (MachineRegisterInfo::use_nodbg_iterator I = MRI->use_nodbg_begin(Reg),
+         E = MRI->use_nodbg_end(); I != E; ++I) {
+      MachineInstr *Use = &*I;
+      if (!CSUses.count(Use)) {
+        MayIncreasePressure = true;
+        break;
+      }
+    }
+  }
+  if (!MayIncreasePressure) return true;
+
+  // Heuristics #1: Don't CSE "cheap" computation if the def is not local or in
+  // an immediate predecessor. We don't want to increase register pressure and
+  // end up causing other computation to be spilled.
+  if (MI->isAsCheapAsAMove()) {
+    MachineBasicBlock *CSBB = CSMI->getParent();
+    MachineBasicBlock *BB = MI->getParent();
+    if (CSBB != BB && !CSBB->isSuccessor(BB))
+      return false;
+  }
+
+  // Heuristics #2: If the expression doesn't not use a vr and the only use
+  // of the redundant computation are copies, do not cse.
+  bool HasVRegUse = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isUse() &&
+        TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+      HasVRegUse = true;
+      break;
+    }
+  }
+  if (!HasVRegUse) {
+    bool HasNonCopyUse = false;
+    for (MachineRegisterInfo::use_nodbg_iterator I =  MRI->use_nodbg_begin(Reg),
+           E = MRI->use_nodbg_end(); I != E; ++I) {
+      MachineInstr *Use = &*I;
+      // Ignore copies.
+      if (!Use->isCopyLike()) {
+        HasNonCopyUse = true;
+        break;
+      }
+    }
+    if (!HasNonCopyUse)
+      return false;
+  }
+
+  // Heuristics #3: If the common subexpression is used by PHIs, do not reuse
+  // it unless the defined value is already used in the BB of the new use.
+  bool HasPHI = false;
+  SmallPtrSet<MachineBasicBlock*, 4> CSBBs;
+  for (MachineRegisterInfo::use_nodbg_iterator I =  MRI->use_nodbg_begin(CSReg),
+       E = MRI->use_nodbg_end(); I != E; ++I) {
+    MachineInstr *Use = &*I;
+    HasPHI |= Use->isPHI();
+    CSBBs.insert(Use->getParent());
+  }
+
+  if (!HasPHI)
+    return true;
+  return CSBBs.count(MI->getParent());
+}
+
+void MachineCSE::EnterScope(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
+  ScopeType *Scope = new ScopeType(VNT);
+  ScopeMap[MBB] = Scope;
+}
+
+void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
+  DenseMap<MachineBasicBlock*, ScopeType*>::iterator SI = ScopeMap.find(MBB);
+  assert(SI != ScopeMap.end());
+  delete SI->second;
+  ScopeMap.erase(SI);
+}
+
+bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
+  bool Changed = false;
+
+  SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
+  SmallVector<unsigned, 2> ImplicitDefsToUpdate;
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
+    MachineInstr *MI = &*I;
+    ++I;
+
+    if (!isCSECandidate(MI))
+      continue;
+
+    bool FoundCSE = VNT.count(MI);
+    if (!FoundCSE) {
+      // Look for trivial copy coalescing opportunities.
+      if (PerformTrivialCoalescing(MI, MBB)) {
+        Changed = true;
+
+        // After coalescing MI itself may become a copy.
+        if (MI->isCopyLike())
+          continue;
+        FoundCSE = VNT.count(MI);
+      }
+    }
+
+    // Commute commutable instructions.
+    bool Commuted = false;
+    if (!FoundCSE && MI->isCommutable()) {
+      MachineInstr *NewMI = TII->commuteInstruction(MI);
+      if (NewMI) {
+        Commuted = true;
+        FoundCSE = VNT.count(NewMI);
+        if (NewMI != MI) {
+          // New instruction. It doesn't need to be kept.
+          NewMI->eraseFromParent();
+          Changed = true;
+        } else if (!FoundCSE)
+          // MI was changed but it didn't help, commute it back!
+          (void)TII->commuteInstruction(MI);
+      }
+    }
+
+    // If the instruction defines physical registers and the values *may* be
+    // used, then it's not safe to replace it with a common subexpression.
+    // It's also not safe if the instruction uses physical registers.
+    bool CrossMBBPhysDef = false;
+    SmallSet<unsigned, 8> PhysRefs;
+    SmallVector<unsigned, 2> PhysDefs;
+    bool PhysUseDef = false;
+    if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs,
+                                          PhysDefs, PhysUseDef)) {
+      FoundCSE = false;
+
+      // ... Unless the CS is local or is in the sole predecessor block
+      // and it also defines the physical register which is not clobbered
+      // in between and the physical register uses were not clobbered.
+      // This can never be the case if the instruction both uses and
+      // defines the same physical register, which was detected above.
+      if (!PhysUseDef) {
+        unsigned CSVN = VNT.lookup(MI);
+        MachineInstr *CSMI = Exps[CSVN];
+        if (PhysRegDefsReach(CSMI, MI, PhysRefs, PhysDefs, CrossMBBPhysDef))
+          FoundCSE = true;
+      }
+    }
+
+    if (!FoundCSE) {
+      VNT.insert(MI, CurrVN++);
+      Exps.push_back(MI);
+      continue;
+    }
+
+    // Found a common subexpression, eliminate it.
+    unsigned CSVN = VNT.lookup(MI);
+    MachineInstr *CSMI = Exps[CSVN];
+    DEBUG(dbgs() << "Examining: " << *MI);
+    DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI);
+
+    // Check if it's profitable to perform this CSE.
+    bool DoCSE = true;
+    unsigned NumDefs = MI->getDesc().getNumDefs() +
+                       MI->getDesc().getNumImplicitDefs();
+    
+    for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned OldReg = MO.getReg();
+      unsigned NewReg = CSMI->getOperand(i).getReg();
+
+      // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
+      // we should make sure it is not dead at CSMI.
+      if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead())
+        ImplicitDefsToUpdate.push_back(i);
+      if (OldReg == NewReg) {
+        --NumDefs;
+        continue;
+      }
+
+      assert(TargetRegisterInfo::isVirtualRegister(OldReg) &&
+             TargetRegisterInfo::isVirtualRegister(NewReg) &&
+             "Do not CSE physical register defs!");
+
+      if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) {
+        DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
+        DoCSE = false;
+        break;
+      }
+
+      // Don't perform CSE if the result of the old instruction cannot exist
+      // within the register class of the new instruction.
+      const TargetRegisterClass *OldRC = MRI->getRegClass(OldReg);
+      if (!MRI->constrainRegClass(NewReg, OldRC)) {
+        DEBUG(dbgs() << "*** Not the same register class, avoid CSE!\n");
+        DoCSE = false;
+        break;
+      }
+
+      CSEPairs.push_back(std::make_pair(OldReg, NewReg));
+      --NumDefs;
+    }
+
+    // Actually perform the elimination.
+    if (DoCSE) {
+      for (unsigned i = 0, e = CSEPairs.size(); i != e; ++i) {
+        MRI->replaceRegWith(CSEPairs[i].first, CSEPairs[i].second);
+        MRI->clearKillFlags(CSEPairs[i].second);
+      }
+
+      // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
+      // we should make sure it is not dead at CSMI.
+      for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i)
+        CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false);
+
+      if (CrossMBBPhysDef) {
+        // Add physical register defs now coming in from a predecessor to MBB
+        // livein list.
+        while (!PhysDefs.empty()) {
+          unsigned LiveIn = PhysDefs.pop_back_val();
+          if (!MBB->isLiveIn(LiveIn))
+            MBB->addLiveIn(LiveIn);
+        }
+        ++NumCrossBBCSEs;
+      }
+
+      MI->eraseFromParent();
+      ++NumCSEs;
+      if (!PhysRefs.empty())
+        ++NumPhysCSEs;
+      if (Commuted)
+        ++NumCommutes;
+      Changed = true;
+    } else {
+      VNT.insert(MI, CurrVN++);
+      Exps.push_back(MI);
+    }
+    CSEPairs.clear();
+    ImplicitDefsToUpdate.clear();
+  }
+
+  return Changed;
+}
+
+/// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given
+/// dominator tree node if its a leaf or all of its children are done. Walk
+/// up the dominator tree to destroy ancestors which are now done.
+void
+MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node,
+                        DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren) {
+  if (OpenChildren[Node])
+    return;
+
+  // Pop scope.
+  ExitScope(Node->getBlock());
+
+  // Now traverse upwards to pop ancestors whose offsprings are all done.
+  while (MachineDomTreeNode *Parent = Node->getIDom()) {
+    unsigned Left = --OpenChildren[Parent];
+    if (Left != 0)
+      break;
+    ExitScope(Parent->getBlock());
+    Node = Parent;
+  }
+}
+
+bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
+  SmallVector<MachineDomTreeNode*, 32> Scopes;
+  SmallVector<MachineDomTreeNode*, 8> WorkList;
+  DenseMap<MachineDomTreeNode*, unsigned> OpenChildren;
+
+  CurrVN = 0;
+
+  // Perform a DFS walk to determine the order of visit.
+  WorkList.push_back(Node);
+  do {
+    Node = WorkList.pop_back_val();
+    Scopes.push_back(Node);
+    const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
+    unsigned NumChildren = Children.size();
+    OpenChildren[Node] = NumChildren;
+    for (unsigned i = 0; i != NumChildren; ++i) {
+      MachineDomTreeNode *Child = Children[i];
+      WorkList.push_back(Child);
+    }
+  } while (!WorkList.empty());
+
+  // Now perform CSE.
+  bool Changed = false;
+  for (unsigned i = 0, e = Scopes.size(); i != e; ++i) {
+    MachineDomTreeNode *Node = Scopes[i];
+    MachineBasicBlock *MBB = Node->getBlock();
+    EnterScope(MBB);
+    Changed |= ProcessBlock(MBB);
+    // If it's a leaf node, it's done. Traverse upwards to pop ancestors.
+    ExitScopeIfDone(Node, OpenChildren);
+  }
+
+  return Changed;
+}
+
+bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<MachineDominatorTree>();
+  return PerformCSE(DT->getRootNode());
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineCodeEmitter.cpp b/contrib/llvm/lib/CodeGen/MachineCodeEmitter.cpp
new file mode 100644
index 000000000000..81b49784c052
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineCodeEmitter.cpp
@@ -0,0 +1,14 @@
+//===-- llvm/CodeGen/MachineCodeEmitter.cpp - Code emission -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+
+using namespace llvm;
+
+void MachineCodeEmitter::anchor() { }
diff --git a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
new file mode 100644
index 000000000000..dc8a2241c7e1
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -0,0 +1,334 @@
+//===- MachineCopyPropagation.cpp - Machine Copy Propagation Pass ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an extremely simple MachineInstr-level copy propagation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "codegen-cp"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+STATISTIC(NumDeletes, "Number of dead copies deleted");
+
+namespace {
+  class MachineCopyPropagation : public MachineFunctionPass {
+    const TargetRegisterInfo *TRI;
+    const TargetInstrInfo *TII;
+    MachineRegisterInfo *MRI;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    MachineCopyPropagation() : MachineFunctionPass(ID) {
+     initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  private:
+    typedef SmallVector<unsigned, 4> DestList;
+    typedef DenseMap<unsigned, DestList> SourceMap;
+
+    void SourceNoLongerAvailable(unsigned Reg,
+                                 SourceMap &SrcMap,
+                                 DenseMap<unsigned, MachineInstr*> &AvailCopyMap);
+    bool CopyPropagateBlock(MachineBasicBlock &MBB);
+    void removeCopy(MachineInstr *MI);
+  };
+}
+char MachineCopyPropagation::ID = 0;
+char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID;
+
+INITIALIZE_PASS(MachineCopyPropagation, "machine-cp",
+                "Machine Copy Propagation Pass", false, false)
+
+void
+MachineCopyPropagation::SourceNoLongerAvailable(unsigned Reg,
+                              SourceMap &SrcMap,
+                              DenseMap<unsigned, MachineInstr*> &AvailCopyMap) {
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+    SourceMap::iterator SI = SrcMap.find(*AI);
+    if (SI != SrcMap.end()) {
+      const DestList& Defs = SI->second;
+      for (DestList::const_iterator I = Defs.begin(), E = Defs.end();
+           I != E; ++I) {
+        unsigned MappedDef = *I;
+        // Source of copy is no longer available for propagation.
+        if (AvailCopyMap.erase(MappedDef)) {
+          for (MCSubRegIterator SR(MappedDef, TRI); SR.isValid(); ++SR)
+            AvailCopyMap.erase(*SR);
+        }
+      }
+    }
+  }
+}
+
+static bool NoInterveningSideEffect(const MachineInstr *CopyMI,
+                                    const MachineInstr *MI) {
+  const MachineBasicBlock *MBB = CopyMI->getParent();
+  if (MI->getParent() != MBB)
+    return false;
+  MachineBasicBlock::const_iterator I = CopyMI;
+  MachineBasicBlock::const_iterator E = MBB->end();
+  MachineBasicBlock::const_iterator E2 = MI;
+
+  ++I;
+  while (I != E && I != E2) {
+    if (I->hasUnmodeledSideEffects() || I->isCall() ||
+        I->isTerminator())
+      return false;
+    ++I;
+  }
+  return true;
+}
+
+/// isNopCopy - Return true if the specified copy is really a nop. That is
+/// if the source of the copy is the same of the definition of the copy that
+/// supplied the source. If the source of the copy is a sub-register than it
+/// must check the sub-indices match. e.g.
+/// ecx = mov eax
+/// al  = mov cl
+/// But not
+/// ecx = mov eax
+/// al  = mov ch
+static bool isNopCopy(MachineInstr *CopyMI, unsigned Def, unsigned Src,
+                      const TargetRegisterInfo *TRI) {
+  unsigned SrcSrc = CopyMI->getOperand(1).getReg();
+  if (Def == SrcSrc)
+    return true;
+  if (TRI->isSubRegister(SrcSrc, Def)) {
+    unsigned SrcDef = CopyMI->getOperand(0).getReg();
+    unsigned SubIdx = TRI->getSubRegIndex(SrcSrc, Def);
+    if (!SubIdx)
+      return false;
+    return SubIdx == TRI->getSubRegIndex(SrcDef, Src);
+  }
+
+  return false;
+}
+
+// Remove MI from the function because it has been determined it is dead.
+// Turn it into a noop KILL instruction if it has super-register liveness
+// adjustments.
+void MachineCopyPropagation::removeCopy(MachineInstr *MI) {
+  if (MI->getNumOperands() == 2)
+    MI->eraseFromParent();
+  else
+    MI->setDesc(TII->get(TargetOpcode::KILL));
+}
+
+bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
+  SmallSetVector<MachineInstr*, 8> MaybeDeadCopies;  // Candidates for deletion
+  DenseMap<unsigned, MachineInstr*> AvailCopyMap;    // Def -> available copies map
+  DenseMap<unsigned, MachineInstr*> CopyMap;         // Def -> copies map
+  SourceMap SrcMap; // Src -> Def map
+
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) {
+    MachineInstr *MI = &*I;
+    ++I;
+
+    if (MI->isCopy()) {
+      unsigned Def = MI->getOperand(0).getReg();
+      unsigned Src = MI->getOperand(1).getReg();
+
+      if (TargetRegisterInfo::isVirtualRegister(Def) ||
+          TargetRegisterInfo::isVirtualRegister(Src))
+        report_fatal_error("MachineCopyPropagation should be run after"
+                           " register allocation!");
+
+      DenseMap<unsigned, MachineInstr*>::iterator CI = AvailCopyMap.find(Src);
+      if (CI != AvailCopyMap.end()) {
+        MachineInstr *CopyMI = CI->second;
+        if (!MRI->isReserved(Def) &&
+            (!MRI->isReserved(Src) || NoInterveningSideEffect(CopyMI, MI)) &&
+            isNopCopy(CopyMI, Def, Src, TRI)) {
+          // The two copies cancel out and the source of the first copy
+          // hasn't been overridden, eliminate the second one. e.g.
+          //  %ECX<def> = COPY %EAX<kill>
+          //  ... nothing clobbered EAX.
+          //  %EAX<def> = COPY %ECX
+          // =>
+          //  %ECX<def> = COPY %EAX
+          //
+          // Also avoid eliminating a copy from reserved registers unless the
+          // definition is proven not clobbered. e.g.
+          // %RSP<def> = COPY %RAX
+          // CALL
+          // %RAX<def> = COPY %RSP
+
+          // Clear any kills of Def between CopyMI and MI. This extends the
+          // live range.
+          for (MachineBasicBlock::iterator I = CopyMI, E = MI; I != E; ++I)
+            I->clearRegisterKills(Def, TRI);
+
+          removeCopy(MI);
+          Changed = true;
+          ++NumDeletes;
+          continue;
+        }
+      }
+
+      // If Src is defined by a previous copy, it cannot be eliminated.
+      for (MCRegAliasIterator AI(Src, TRI, true); AI.isValid(); ++AI) {
+        CI = CopyMap.find(*AI);
+        if (CI != CopyMap.end())
+          MaybeDeadCopies.remove(CI->second);
+      }
+
+      // Copy is now a candidate for deletion.
+      MaybeDeadCopies.insert(MI);
+
+      // If 'Src' is previously source of another copy, then this earlier copy's
+      // source is no longer available. e.g.
+      // %xmm9<def> = copy %xmm2
+      // ...
+      // %xmm2<def> = copy %xmm0
+      // ...
+      // %xmm2<def> = copy %xmm9
+      SourceNoLongerAvailable(Def, SrcMap, AvailCopyMap);
+
+      // Remember Def is defined by the copy.
+      // ... Make sure to clear the def maps of aliases first.
+      for (MCRegAliasIterator AI(Def, TRI, false); AI.isValid(); ++AI) {
+        CopyMap.erase(*AI);
+        AvailCopyMap.erase(*AI);
+      }
+      CopyMap[Def] = MI;
+      AvailCopyMap[Def] = MI;
+      for (MCSubRegIterator SR(Def, TRI); SR.isValid(); ++SR) {
+        CopyMap[*SR] = MI;
+        AvailCopyMap[*SR] = MI;
+      }
+
+      // Remember source that's copied to Def. Once it's clobbered, then
+      // it's no longer available for copy propagation.
+      if (std::find(SrcMap[Src].begin(), SrcMap[Src].end(), Def) ==
+          SrcMap[Src].end()) {
+        SrcMap[Src].push_back(Def);
+      }
+
+      continue;
+    }
+
+    // Not a copy.
+    SmallVector<unsigned, 2> Defs;
+    int RegMaskOpNum = -1;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isRegMask())
+        RegMaskOpNum = i;
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        report_fatal_error("MachineCopyPropagation should be run after"
+                           " register allocation!");
+
+      if (MO.isDef()) {
+        Defs.push_back(Reg);
+        continue;
+      }
+
+      // If 'Reg' is defined by a copy, the copy is no longer a candidate
+      // for elimination.
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+        DenseMap<unsigned, MachineInstr*>::iterator CI = CopyMap.find(*AI);
+        if (CI != CopyMap.end())
+          MaybeDeadCopies.remove(CI->second);
+      }
+    }
+
+    // The instruction has a register mask operand which means that it clobbers
+    // a large set of registers.  It is possible to use the register mask to
+    // prune the available copies, but treat it like a basic block boundary for
+    // now.
+    if (RegMaskOpNum >= 0) {
+      // Erase any MaybeDeadCopies whose destination register is clobbered.
+      const MachineOperand &MaskMO = MI->getOperand(RegMaskOpNum);
+      for (SmallSetVector<MachineInstr*, 8>::iterator
+           DI = MaybeDeadCopies.begin(), DE = MaybeDeadCopies.end();
+           DI != DE; ++DI) {
+        unsigned Reg = (*DI)->getOperand(0).getReg();
+        if (MRI->isReserved(Reg) || !MaskMO.clobbersPhysReg(Reg))
+          continue;
+        removeCopy(*DI);
+        Changed = true;
+        ++NumDeletes;
+      }
+
+      // Clear all data structures as if we were beginning a new basic block.
+      MaybeDeadCopies.clear();
+      AvailCopyMap.clear();
+      CopyMap.clear();
+      SrcMap.clear();
+      continue;
+    }
+
+    for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+      unsigned Reg = Defs[i];
+
+      // No longer defined by a copy.
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+        CopyMap.erase(*AI);
+        AvailCopyMap.erase(*AI);
+      }
+
+      // If 'Reg' is previously source of a copy, it is no longer available for
+      // copy propagation.
+      SourceNoLongerAvailable(Reg, SrcMap, AvailCopyMap);
+    }
+  }
+
+  // If MBB doesn't have successors, delete the copies whose defs are not used.
+  // If MBB does have successors, then conservative assume the defs are live-out
+  // since we don't want to trust live-in lists.
+  if (MBB.succ_empty()) {
+    for (SmallSetVector<MachineInstr*, 8>::iterator
+           DI = MaybeDeadCopies.begin(), DE = MaybeDeadCopies.end();
+         DI != DE; ++DI) {
+      if (!MRI->isReserved((*DI)->getOperand(0).getReg())) {
+        removeCopy(*DI);
+        Changed = true;
+        ++NumDeletes;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+
+  TRI = MF.getTarget().getRegisterInfo();
+  TII = MF.getTarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+    Changed |= CopyPropagateBlock(*I);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineDominators.cpp b/contrib/llvm/lib/CodeGen/MachineDominators.cpp
new file mode 100644
index 000000000000..04c8ecbf9bdc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineDominators.cpp
@@ -0,0 +1,59 @@
+//===- MachineDominators.cpp - Machine Dominator Calculation --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// forward dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+
+using namespace llvm;
+
+namespace llvm {
+TEMPLATE_INSTANTIATION(class DomTreeNodeBase<MachineBasicBlock>);
+TEMPLATE_INSTANTIATION(class DominatorTreeBase<MachineBasicBlock>);
+}
+
+char MachineDominatorTree::ID = 0;
+
+INITIALIZE_PASS(MachineDominatorTree, "machinedomtree",
+                "MachineDominator Tree Construction", true, true)
+
+char &llvm::MachineDominatorsID = MachineDominatorTree::ID;
+
+void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
+  DT->recalculate(F);
+
+  return false;
+}
+
+MachineDominatorTree::MachineDominatorTree()
+    : MachineFunctionPass(ID) {
+  initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
+  DT = new DominatorTreeBase<MachineBasicBlock>(false);
+}
+
+MachineDominatorTree::~MachineDominatorTree() {
+  delete DT;
+}
+
+void MachineDominatorTree::releaseMemory() {
+  DT->releaseMemory();
+}
+
+void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
+  DT->print(OS);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm/lib/CodeGen/MachineFunction.cpp
new file mode 100644
index 000000000000..04321f329282
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineFunction.cpp
@@ -0,0 +1,898 @@
+//===-- MachineFunction.cpp -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect native machine code information for a function.  This allows
+// target-specific information about the generated code to be stored with each
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineFunction implementation
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method.
+MachineFunctionInfo::~MachineFunctionInfo() {}
+
+void ilist_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) {
+  MBB->getParent()->DeleteMachineBasicBlock(MBB);
+}
+
+MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
+                                 unsigned FunctionNum, MachineModuleInfo &mmi,
+                                 GCModuleInfo* gmi)
+  : Fn(F), Target(TM), Ctx(mmi.getContext()), MMI(mmi), GMI(gmi) {
+  if (TM.getRegisterInfo())
+    RegInfo = new (Allocator) MachineRegisterInfo(*TM.getRegisterInfo());
+  else
+    RegInfo = 0;
+  MFInfo = 0;
+  FrameInfo = new (Allocator) MachineFrameInfo(*TM.getFrameLowering(),
+                                               TM.Options.RealignStack);
+  if (Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::StackAlignment))
+    FrameInfo->ensureMaxAlignment(Fn->getAttributes().
+                                getStackAlignment(AttributeSet::FunctionIndex));
+  ConstantPool = new (Allocator) MachineConstantPool(TM.getDataLayout());
+  Alignment = TM.getTargetLowering()->getMinFunctionAlignment();
+  // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
+  if (!Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                        Attribute::OptimizeForSize))
+    Alignment = std::max(Alignment,
+                         TM.getTargetLowering()->getPrefFunctionAlignment());
+  FunctionNumber = FunctionNum;
+  JumpTableInfo = 0;
+}
+
+MachineFunction::~MachineFunction() {
+  // Don't call destructors on MachineInstr and MachineOperand. All of their
+  // memory comes from the BumpPtrAllocator which is about to be purged.
+  //
+  // Do call MachineBasicBlock destructors, it contains std::vectors.
+  for (iterator I = begin(), E = end(); I != E; I = BasicBlocks.erase(I))
+    I->Insts.clearAndLeakNodesUnsafely();
+
+  InstructionRecycler.clear(Allocator);
+  OperandRecycler.clear(Allocator);
+  BasicBlockRecycler.clear(Allocator);
+  if (RegInfo) {
+    RegInfo->~MachineRegisterInfo();
+    Allocator.Deallocate(RegInfo);
+  }
+  if (MFInfo) {
+    MFInfo->~MachineFunctionInfo();
+    Allocator.Deallocate(MFInfo);
+  }
+
+  FrameInfo->~MachineFrameInfo();
+  Allocator.Deallocate(FrameInfo);
+
+  ConstantPool->~MachineConstantPool();
+  Allocator.Deallocate(ConstantPool);
+
+  if (JumpTableInfo) {
+    JumpTableInfo->~MachineJumpTableInfo();
+    Allocator.Deallocate(JumpTableInfo);
+  }
+}
+
+/// getOrCreateJumpTableInfo - Get the JumpTableInfo for this function, if it
+/// does already exist, allocate one.
+MachineJumpTableInfo *MachineFunction::
+getOrCreateJumpTableInfo(unsigned EntryKind) {
+  if (JumpTableInfo) return JumpTableInfo;
+
+  JumpTableInfo = new (Allocator)
+    MachineJumpTableInfo((MachineJumpTableInfo::JTEntryKind)EntryKind);
+  return JumpTableInfo;
+}
+
+/// RenumberBlocks - This discards all of the MachineBasicBlock numbers and
+/// recomputes them.  This guarantees that the MBB numbers are sequential,
+/// dense, and match the ordering of the blocks within the function.  If a
+/// specific MachineBasicBlock is specified, only that block and those after
+/// it are renumbered.
+void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
+  if (empty()) { MBBNumbering.clear(); return; }
+  MachineFunction::iterator MBBI, E = end();
+  if (MBB == 0)
+    MBBI = begin();
+  else
+    MBBI = MBB;
+
+  // Figure out the block number this should have.
+  unsigned BlockNo = 0;
+  if (MBBI != begin())
+    BlockNo = prior(MBBI)->getNumber()+1;
+
+  for (; MBBI != E; ++MBBI, ++BlockNo) {
+    if (MBBI->getNumber() != (int)BlockNo) {
+      // Remove use of the old number.
+      if (MBBI->getNumber() != -1) {
+        assert(MBBNumbering[MBBI->getNumber()] == &*MBBI &&
+               "MBB number mismatch!");
+        MBBNumbering[MBBI->getNumber()] = 0;
+      }
+
+      // If BlockNo is already taken, set that block's number to -1.
+      if (MBBNumbering[BlockNo])
+        MBBNumbering[BlockNo]->setNumber(-1);
+
+      MBBNumbering[BlockNo] = MBBI;
+      MBBI->setNumber(BlockNo);
+    }
+  }
+
+  // Okay, all the blocks are renumbered.  If we have compactified the block
+  // numbering, shrink MBBNumbering now.
+  assert(BlockNo <= MBBNumbering.size() && "Mismatch!");
+  MBBNumbering.resize(BlockNo);
+}
+
+/// CreateMachineInstr - Allocate a new MachineInstr. Use this instead
+/// of `new MachineInstr'.
+///
+MachineInstr *
+MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID,
+                                    DebugLoc DL, bool NoImp) {
+  return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+    MachineInstr(*this, MCID, DL, NoImp);
+}
+
+/// CloneMachineInstr - Create a new MachineInstr which is a copy of the
+/// 'Orig' instruction, identical in all ways except the instruction
+/// has no parent, prev, or next.
+///
+MachineInstr *
+MachineFunction::CloneMachineInstr(const MachineInstr *Orig) {
+  return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+             MachineInstr(*this, *Orig);
+}
+
+/// DeleteMachineInstr - Delete the given MachineInstr.
+///
+/// This function also serves as the MachineInstr destructor - the real
+/// ~MachineInstr() destructor must be empty.
+void
+MachineFunction::DeleteMachineInstr(MachineInstr *MI) {
+  // Strip it for parts. The operand array and the MI object itself are
+  // independently recyclable.
+  if (MI->Operands)
+    deallocateOperandArray(MI->CapOperands, MI->Operands);
+  // Don't call ~MachineInstr() which must be trivial anyway because
+  // ~MachineFunction drops whole lists of MachineInstrs wihout calling their
+  // destructors.
+  InstructionRecycler.Deallocate(Allocator, MI);
+}
+
+/// CreateMachineBasicBlock - Allocate a new MachineBasicBlock. Use this
+/// instead of `new MachineBasicBlock'.
+///
+MachineBasicBlock *
+MachineFunction::CreateMachineBasicBlock(const BasicBlock *bb) {
+  return new (BasicBlockRecycler.Allocate<MachineBasicBlock>(Allocator))
+             MachineBasicBlock(*this, bb);
+}
+
+/// DeleteMachineBasicBlock - Delete the given MachineBasicBlock.
+///
+void
+MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) {
+  assert(MBB->getParent() == this && "MBB parent mismatch!");
+  MBB->~MachineBasicBlock();
+  BasicBlockRecycler.Deallocate(Allocator, MBB);
+}
+
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(MachinePointerInfo PtrInfo, unsigned f,
+                                      uint64_t s, unsigned base_alignment,
+                                      const MDNode *TBAAInfo,
+                                      const MDNode *Ranges) {
+  return new (Allocator) MachineMemOperand(PtrInfo, f, s, base_alignment,
+                                           TBAAInfo, Ranges);
+}
+
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
+                                      int64_t Offset, uint64_t Size) {
+  return new (Allocator)
+             MachineMemOperand(MachinePointerInfo(MMO->getValue(),
+                                                  MMO->getOffset()+Offset),
+                               MMO->getFlags(), Size,
+                               MMO->getBaseAlignment(), 0);
+}
+
+MachineInstr::mmo_iterator
+MachineFunction::allocateMemRefsArray(unsigned long Num) {
+  return Allocator.Allocate<MachineMemOperand *>(Num);
+}
+
+std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator>
+MachineFunction::extractLoadMemRefs(MachineInstr::mmo_iterator Begin,
+                                    MachineInstr::mmo_iterator End) {
+  // Count the number of load mem refs.
+  unsigned Num = 0;
+  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I)
+    if ((*I)->isLoad())
+      ++Num;
+
+  // Allocate a new array and populate it with the load information.
+  MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num);
+  unsigned Index = 0;
+  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) {
+    if ((*I)->isLoad()) {
+      if (!(*I)->isStore())
+        // Reuse the MMO.
+        Result[Index] = *I;
+      else {
+        // Clone the MMO and unset the store flag.
+        MachineMemOperand *JustLoad =
+          getMachineMemOperand((*I)->getPointerInfo(),
+                               (*I)->getFlags() & ~MachineMemOperand::MOStore,
+                               (*I)->getSize(), (*I)->getBaseAlignment(),
+                               (*I)->getTBAAInfo());
+        Result[Index] = JustLoad;
+      }
+      ++Index;
+    }
+  }
+  return std::make_pair(Result, Result + Num);
+}
+
+std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator>
+MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin,
+                                     MachineInstr::mmo_iterator End) {
+  // Count the number of load mem refs.
+  unsigned Num = 0;
+  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I)
+    if ((*I)->isStore())
+      ++Num;
+
+  // Allocate a new array and populate it with the store information.
+  MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num);
+  unsigned Index = 0;
+  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) {
+    if ((*I)->isStore()) {
+      if (!(*I)->isLoad())
+        // Reuse the MMO.
+        Result[Index] = *I;
+      else {
+        // Clone the MMO and unset the load flag.
+        MachineMemOperand *JustStore =
+          getMachineMemOperand((*I)->getPointerInfo(),
+                               (*I)->getFlags() & ~MachineMemOperand::MOLoad,
+                               (*I)->getSize(), (*I)->getBaseAlignment(),
+                               (*I)->getTBAAInfo());
+        Result[Index] = JustStore;
+      }
+      ++Index;
+    }
+  }
+  return std::make_pair(Result, Result + Num);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MachineFunction::dump() const {
+  print(dbgs());
+}
+#endif
+
+StringRef MachineFunction::getName() const {
+  assert(getFunction() && "No function!");
+  return getFunction()->getName();
+}
+
+void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
+  OS << "# Machine code for function " << getName() << ": ";
+  if (RegInfo) {
+    OS << (RegInfo->isSSA() ? "SSA" : "Post SSA");
+    if (!RegInfo->tracksLiveness())
+      OS << ", not tracking liveness";
+  }
+  OS << '\n';
+
+  // Print Frame Information
+  FrameInfo->print(*this, OS);
+
+  // Print JumpTable Information
+  if (JumpTableInfo)
+    JumpTableInfo->print(OS);
+
+  // Print Constant Pool
+  ConstantPool->print(OS);
+
+  const TargetRegisterInfo *TRI = getTarget().getRegisterInfo();
+
+  if (RegInfo && !RegInfo->livein_empty()) {
+    OS << "Function Live Ins: ";
+    for (MachineRegisterInfo::livein_iterator
+         I = RegInfo->livein_begin(), E = RegInfo->livein_end(); I != E; ++I) {
+      OS << PrintReg(I->first, TRI);
+      if (I->second)
+        OS << " in " << PrintReg(I->second, TRI);
+      if (llvm::next(I) != E)
+        OS << ", ";
+    }
+    OS << '\n';
+  }
+
+  for (const_iterator BB = begin(), E = end(); BB != E; ++BB) {
+    OS << '\n';
+    BB->print(OS, Indexes);
+  }
+
+  OS << "\n# End machine code for function " << getName() << ".\n\n";
+}
+
+namespace llvm {
+  template<>
+  struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+    static std::string getGraphName(const MachineFunction *F) {
+      return "CFG for '" + F->getName().str() + "' function";
+    }
+
+    std::string getNodeLabel(const MachineBasicBlock *Node,
+                             const MachineFunction *Graph) {
+      std::string OutStr;
+      {
+        raw_string_ostream OSS(OutStr);
+
+        if (isSimple()) {
+          OSS << "BB#" << Node->getNumber();
+          if (const BasicBlock *BB = Node->getBasicBlock())
+            OSS << ": " << BB->getName();
+        } else
+          Node->print(OSS);
+      }
+
+      if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
+
+      // Process string output to make it nicer...
+      for (unsigned i = 0; i != OutStr.length(); ++i)
+        if (OutStr[i] == '\n') {                            // Left justify
+          OutStr[i] = '\\';
+          OutStr.insert(OutStr.begin()+i+1, 'l');
+        }
+      return OutStr;
+    }
+  };
+}
+
+void MachineFunction::viewCFG() const
+{
+#ifndef NDEBUG
+  ViewGraph(this, "mf" + getName());
+#else
+  errs() << "MachineFunction::viewCFG is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+void MachineFunction::viewCFGOnly() const
+{
+#ifndef NDEBUG
+  ViewGraph(this, "mf" + getName(), true);
+#else
+  errs() << "MachineFunction::viewCFGOnly is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// addLiveIn - Add the specified physical register as a live-in value and
+/// create a corresponding virtual register for it.
+unsigned MachineFunction::addLiveIn(unsigned PReg,
+                                    const TargetRegisterClass *RC) {
+  MachineRegisterInfo &MRI = getRegInfo();
+  unsigned VReg = MRI.getLiveInVirtReg(PReg);
+  if (VReg) {
+    assert(MRI.getRegClass(VReg) == RC && "Register class mismatch!");
+    return VReg;
+  }
+  VReg = MRI.createVirtualRegister(RC);
+  MRI.addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+/// getJTISymbol - Return the MCSymbol for the specified non-empty jump table.
+/// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a
+/// normal 'L' label is returned.
+MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx, 
+                                        bool isLinkerPrivate) const {
+  assert(JumpTableInfo && "No jump tables");
+  assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");
+  const MCAsmInfo &MAI = *getTarget().getMCAsmInfo();
+
+  const char *Prefix = isLinkerPrivate ? MAI.getLinkerPrivateGlobalPrefix() :
+                                         MAI.getPrivateGlobalPrefix();
+  SmallString<60> Name;
+  raw_svector_ostream(Name)
+    << Prefix << "JTI" << getFunctionNumber() << '_' << JTI;
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+/// getPICBaseSymbol - Return a function-local symbol to represent the PIC
+/// base.
+MCSymbol *MachineFunction::getPICBaseSymbol() const {
+  const MCAsmInfo &MAI = *Target.getMCAsmInfo();
+  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
+                               Twine(getFunctionNumber())+"$pb");
+}
+
+//===----------------------------------------------------------------------===//
+//  MachineFrameInfo implementation
+//===----------------------------------------------------------------------===//
+
+/// ensureMaxAlignment - Make sure the function is at least Align bytes
+/// aligned.
+void MachineFrameInfo::ensureMaxAlignment(unsigned Align) {
+  if (!TFI.isStackRealignable() || !RealignOption)
+    assert(Align <= TFI.getStackAlignment() &&
+           "For targets without stack realignment, Align is out of limit!");
+  if (MaxAlignment < Align) MaxAlignment = Align;
+}
+
+/// clampStackAlignment - Clamp the alignment if requested and emit a warning.
+static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align,
+                                           unsigned StackAlign) {
+  if (!ShouldClamp || Align <= StackAlign)
+    return Align;
+  DEBUG(dbgs() << "Warning: requested alignment " << Align
+               << " exceeds the stack alignment " << StackAlign
+               << " when stack realignment is off" << '\n');
+  return StackAlign;
+}
+
+/// CreateStackObject - Create a new statically sized stack object, returning
+/// a nonnegative identifier to represent it.
+///
+int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
+                      bool isSS, bool MayNeedSP, const AllocaInst *Alloca) {
+  assert(Size != 0 && "Cannot allocate zero size stack objects!");
+  Alignment = clampStackAlignment(!TFI.isStackRealignable() || !RealignOption,
+                                  Alignment, TFI.getStackAlignment());
+  Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP,
+                                Alloca));
+  int Index = (int)Objects.size() - NumFixedObjects - 1;
+  assert(Index >= 0 && "Bad frame index!");
+  ensureMaxAlignment(Alignment);
+  return Index;
+}
+
+/// CreateSpillStackObject - Create a new statically sized stack object that
+/// represents a spill slot, returning a nonnegative identifier to represent
+/// it.
+///
+int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
+                                             unsigned Alignment) {
+  Alignment = clampStackAlignment(!TFI.isStackRealignable() || !RealignOption,
+                                  Alignment, TFI.getStackAlignment()); 
+  CreateStackObject(Size, Alignment, true, false);
+  int Index = (int)Objects.size() - NumFixedObjects - 1;
+  ensureMaxAlignment(Alignment);
+  return Index;
+}
+
+/// CreateVariableSizedObject - Notify the MachineFrameInfo object that a
+/// variable sized object has been created.  This must be created whenever a
+/// variable sized object is created, whether or not the index returned is
+/// actually used.
+///
+int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment) {
+  HasVarSizedObjects = true;
+  Alignment = clampStackAlignment(!TFI.isStackRealignable() || !RealignOption,
+                                  Alignment, TFI.getStackAlignment()); 
+  Objects.push_back(StackObject(0, Alignment, 0, false, false, true, 0));
+  ensureMaxAlignment(Alignment);
+  return (int)Objects.size()-NumFixedObjects-1;
+}
+
+/// CreateFixedObject - Create a new object at a fixed location on the stack.
+/// All fixed objects should be created before other objects are created for
+/// efficiency. By default, fixed objects are immutable. This returns an
+/// index with a negative value.
+///
+int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
+                                        bool Immutable) {
+  assert(Size != 0 && "Cannot allocate zero size fixed stack objects!");
+  // The alignment of the frame index can be determined from its offset from
+  // the incoming frame position.  If the frame object is at offset 32 and
+  // the stack is guaranteed to be 16-byte aligned, then we know that the
+  // object is 16-byte aligned.
+  unsigned StackAlign = TFI.getStackAlignment();
+  unsigned Align = MinAlign(SPOffset, StackAlign);
+  Align = clampStackAlignment(!TFI.isStackRealignable() || !RealignOption,
+                              Align, TFI.getStackAlignment()); 
+  Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
+                                              /*isSS*/   false,
+                                              /*NeedSP*/ false,
+                                              /*Alloca*/ 0));
+  return -++NumFixedObjects;
+}
+
+
+BitVector
+MachineFrameInfo::getPristineRegs(const MachineBasicBlock *MBB) const {
+  assert(MBB && "MBB must be valid");
+  const MachineFunction *MF = MBB->getParent();
+  assert(MF && "MBB must be part of a MachineFunction");
+  const TargetMachine &TM = MF->getTarget();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  BitVector BV(TRI->getNumRegs());
+
+  // Before CSI is calculated, no registers are considered pristine. They can be
+  // freely used and PEI will make sure they are saved.
+  if (!isCalleeSavedInfoValid())
+    return BV;
+
+  for (const uint16_t *CSR = TRI->getCalleeSavedRegs(MF); CSR && *CSR; ++CSR)
+    BV.set(*CSR);
+
+  // The entry MBB always has all CSRs pristine.
+  if (MBB == &MF->front())
+    return BV;
+
+  // On other MBBs the saved CSRs are not pristine.
+  const std::vector<CalleeSavedInfo> &CSI = getCalleeSavedInfo();
+  for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+         E = CSI.end(); I != E; ++I)
+    BV.reset(I->getReg());
+
+  return BV;
+}
+
+unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  unsigned MaxAlign = getMaxAlignment();
+  int Offset = 0;
+
+  // This code is very, very similar to PEI::calculateFrameObjectOffsets().
+  // It really should be refactored to share code. Until then, changes
+  // should keep in mind that there's tight coupling between the two.
+
+  for (int i = getObjectIndexBegin(); i != 0; ++i) {
+    int FixedOff = -getObjectOffset(i);
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+  for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) {
+    if (isDeadObjectIndex(i))
+      continue;
+    Offset += getObjectSize(i);
+    unsigned Align = getObjectAlignment(i);
+    // Adjust to alignment boundary
+    Offset = (Offset+Align-1)/Align*Align;
+
+    MaxAlign = std::max(Align, MaxAlign);
+  }
+
+  if (adjustsStack() && TFI->hasReservedCallFrame(MF))
+    Offset += getMaxCallFrameSize();
+
+  // Round up the size to a multiple of the alignment.  If the function has
+  // any calls or alloca's, align to the target's StackAlignment value to
+  // ensure that the callee's frame or the alloca data is suitably aligned;
+  // otherwise, for leaf functions, align to the TransientStackAlignment
+  // value.
+  unsigned StackAlign;
+  if (adjustsStack() || hasVarSizedObjects() ||
+      (RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0))
+    StackAlign = TFI->getStackAlignment();
+  else
+    StackAlign = TFI->getTransientStackAlignment();
+
+  // If the frame pointer is eliminated, all frame offsets will be relative to
+  // SP not FP. Align to MaxAlign so this works.
+  StackAlign = std::max(StackAlign, MaxAlign);
+  unsigned AlignMask = StackAlign - 1;
+  Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+
+  return (unsigned)Offset;
+}
+
+void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
+  if (Objects.empty()) return;
+
+  const TargetFrameLowering *FI = MF.getTarget().getFrameLowering();
+  int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
+
+  OS << "Frame Objects:\n";
+
+  for (unsigned i = 0, e = Objects.size(); i != e; ++i) {
+    const StackObject &SO = Objects[i];
+    OS << "  fi#" << (int)(i-NumFixedObjects) << ": ";
+    if (SO.Size == ~0ULL) {
+      OS << "dead\n";
+      continue;
+    }
+    if (SO.Size == 0)
+      OS << "variable sized";
+    else
+      OS << "size=" << SO.Size;
+    OS << ", align=" << SO.Alignment;
+
+    if (i < NumFixedObjects)
+      OS << ", fixed";
+    if (i < NumFixedObjects || SO.SPOffset != -1) {
+      int64_t Off = SO.SPOffset - ValOffset;
+      OS << ", at location [SP";
+      if (Off > 0)
+        OS << "+" << Off;
+      else if (Off < 0)
+        OS << Off;
+      OS << "]";
+    }
+    OS << "\n";
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MachineFrameInfo::dump(const MachineFunction &MF) const {
+  print(MF, dbgs());
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+//  MachineJumpTableInfo implementation
+//===----------------------------------------------------------------------===//
+
+/// getEntrySize - Return the size of each entry in the jump table.
+unsigned MachineJumpTableInfo::getEntrySize(const DataLayout &TD) const {
+  // The size of a jump table entry is 4 bytes unless the entry is just the
+  // address of a block, in which case it is the pointer size.
+  switch (getEntryKind()) {
+  case MachineJumpTableInfo::EK_BlockAddress:
+    return TD.getPointerSize();
+  case MachineJumpTableInfo::EK_GPRel64BlockAddress:
+    return 8;
+  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
+  case MachineJumpTableInfo::EK_LabelDifference32:
+  case MachineJumpTableInfo::EK_Custom32:
+    return 4;
+  case MachineJumpTableInfo::EK_Inline:
+    return 0;
+  }
+  llvm_unreachable("Unknown jump table encoding!");
+}
+
+/// getEntryAlignment - Return the alignment of each entry in the jump table.
+unsigned MachineJumpTableInfo::getEntryAlignment(const DataLayout &TD) const {
+  // The alignment of a jump table entry is the alignment of int32 unless the
+  // entry is just the address of a block, in which case it is the pointer
+  // alignment.
+  switch (getEntryKind()) {
+  case MachineJumpTableInfo::EK_BlockAddress:
+    return TD.getPointerABIAlignment();
+  case MachineJumpTableInfo::EK_GPRel64BlockAddress:
+    return TD.getABIIntegerTypeAlignment(64);
+  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
+  case MachineJumpTableInfo::EK_LabelDifference32:
+  case MachineJumpTableInfo::EK_Custom32:
+    return TD.getABIIntegerTypeAlignment(32);
+  case MachineJumpTableInfo::EK_Inline:
+    return 1;
+  }
+  llvm_unreachable("Unknown jump table encoding!");
+}
+
+/// createJumpTableIndex - Create a new jump table entry in the jump table info.
+///
+unsigned MachineJumpTableInfo::createJumpTableIndex(
+                               const std::vector<MachineBasicBlock*> &DestBBs) {
+  assert(!DestBBs.empty() && "Cannot create an empty jump table!");
+  JumpTables.push_back(MachineJumpTableEntry(DestBBs));
+  return JumpTables.size()-1;
+}
+
+/// ReplaceMBBInJumpTables - If Old is the target of any jump tables, update
+/// the jump tables to branch to New instead.
+bool MachineJumpTableInfo::ReplaceMBBInJumpTables(MachineBasicBlock *Old,
+                                                  MachineBasicBlock *New) {
+  assert(Old != New && "Not making a change?");
+  bool MadeChange = false;
+  for (size_t i = 0, e = JumpTables.size(); i != e; ++i)
+    ReplaceMBBInJumpTable(i, Old, New);
+  return MadeChange;
+}
+
+/// ReplaceMBBInJumpTable - If Old is a target of the jump tables, update
+/// the jump table to branch to New instead.
+bool MachineJumpTableInfo::ReplaceMBBInJumpTable(unsigned Idx,
+                                                 MachineBasicBlock *Old,
+                                                 MachineBasicBlock *New) {
+  assert(Old != New && "Not making a change?");
+  bool MadeChange = false;
+  MachineJumpTableEntry &JTE = JumpTables[Idx];
+  for (size_t j = 0, e = JTE.MBBs.size(); j != e; ++j)
+    if (JTE.MBBs[j] == Old) {
+      JTE.MBBs[j] = New;
+      MadeChange = true;
+    }
+  return MadeChange;
+}
+
+void MachineJumpTableInfo::print(raw_ostream &OS) const {
+  if (JumpTables.empty()) return;
+
+  OS << "Jump Tables:\n";
+
+  for (unsigned i = 0, e = JumpTables.size(); i != e; ++i) {
+    OS << "  jt#" << i << ": ";
+    for (unsigned j = 0, f = JumpTables[i].MBBs.size(); j != f; ++j)
+      OS << " BB#" << JumpTables[i].MBBs[j]->getNumber();
+  }
+
+  OS << '\n';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MachineJumpTableInfo::dump() const { print(dbgs()); }
+#endif
+
+
+//===----------------------------------------------------------------------===//
+//  MachineConstantPool implementation
+//===----------------------------------------------------------------------===//
+
+void MachineConstantPoolValue::anchor() { }
+
+Type *MachineConstantPoolEntry::getType() const {
+  if (isMachineConstantPoolEntry())
+    return Val.MachineCPVal->getType();
+  return Val.ConstVal->getType();
+}
+
+
+unsigned MachineConstantPoolEntry::getRelocationInfo() const {
+  if (isMachineConstantPoolEntry())
+    return Val.MachineCPVal->getRelocationInfo();
+  return Val.ConstVal->getRelocationInfo();
+}
+
+MachineConstantPool::~MachineConstantPool() {
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i)
+    if (Constants[i].isMachineConstantPoolEntry())
+      delete Constants[i].Val.MachineCPVal;
+  for (DenseSet<MachineConstantPoolValue*>::iterator I =
+       MachineCPVsSharingEntries.begin(), E = MachineCPVsSharingEntries.end();
+       I != E; ++I)
+    delete *I;
+}
+
+/// CanShareConstantPoolEntry - Test whether the given two constants
+/// can be allocated the same constant pool entry.
+static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
+                                      const DataLayout *TD) {
+  // Handle the trivial case quickly.
+  if (A == B) return true;
+
+  // If they have the same type but weren't the same constant, quickly
+  // reject them.
+  if (A->getType() == B->getType()) return false;
+
+  // We can't handle structs or arrays.
+  if (isa<StructType>(A->getType()) || isa<ArrayType>(A->getType()) ||
+      isa<StructType>(B->getType()) || isa<ArrayType>(B->getType()))
+    return false;
+  
+  // For now, only support constants with the same size.
+  uint64_t StoreSize = TD->getTypeStoreSize(A->getType());
+  if (StoreSize != TD->getTypeStoreSize(B->getType()) || 
+      StoreSize > 128)
+    return false;
+
+  Type *IntTy = IntegerType::get(A->getContext(), StoreSize*8);
+
+  // Try constant folding a bitcast of both instructions to an integer.  If we
+  // get two identical ConstantInt's, then we are good to share them.  We use
+  // the constant folding APIs to do this so that we get the benefit of
+  // DataLayout.
+  if (isa<PointerType>(A->getType()))
+    A = ConstantFoldInstOperands(Instruction::PtrToInt, IntTy,
+                                 const_cast<Constant*>(A), TD);
+  else if (A->getType() != IntTy)
+    A = ConstantFoldInstOperands(Instruction::BitCast, IntTy,
+                                 const_cast<Constant*>(A), TD);
+  if (isa<PointerType>(B->getType()))
+    B = ConstantFoldInstOperands(Instruction::PtrToInt, IntTy,
+                                 const_cast<Constant*>(B), TD);
+  else if (B->getType() != IntTy)
+    B = ConstantFoldInstOperands(Instruction::BitCast, IntTy,
+                                 const_cast<Constant*>(B), TD);
+
+  return A == B;
+}
+
+/// getConstantPoolIndex - Create a new entry in the constant pool or return
+/// an existing one.  User must specify the log2 of the minimum required
+/// alignment for the object.
+///
+unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C, 
+                                                   unsigned Alignment) {
+  assert(Alignment && "Alignment must be specified!");
+  if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+
+  // Check to see if we already have this constant.
+  //
+  // FIXME, this could be made much more efficient for large constant pools.
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i)
+    if (!Constants[i].isMachineConstantPoolEntry() &&
+        CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C, TD)) {
+      if ((unsigned)Constants[i].getAlignment() < Alignment)
+        Constants[i].Alignment = Alignment;
+      return i;
+    }
+
+  Constants.push_back(MachineConstantPoolEntry(C, Alignment));
+  return Constants.size()-1;
+}
+
+unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V,
+                                                   unsigned Alignment) {
+  assert(Alignment && "Alignment must be specified!");
+  if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+
+  // Check to see if we already have this constant.
+  //
+  // FIXME, this could be made much more efficient for large constant pools.
+  int Idx = V->getExistingMachineCPValue(this, Alignment);
+  if (Idx != -1) {
+    MachineCPVsSharingEntries.insert(V);
+    return (unsigned)Idx;
+  }
+
+  Constants.push_back(MachineConstantPoolEntry(V, Alignment));
+  return Constants.size()-1;
+}
+
+void MachineConstantPool::print(raw_ostream &OS) const {
+  if (Constants.empty()) return;
+
+  OS << "Constant Pool:\n";
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    OS << "  cp#" << i << ": ";
+    if (Constants[i].isMachineConstantPoolEntry())
+      Constants[i].Val.MachineCPVal->print(OS);
+    else
+      OS << *(const Value*)Constants[i].Val.ConstVal;
+    OS << ", align=" << Constants[i].getAlignment();
+    OS << "\n";
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MachineConstantPool::dump() const { print(dbgs()); }
+#endif
diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
new file mode 100644
index 000000000000..35591e1649d3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -0,0 +1,57 @@
+//===-- MachineFunctionAnalysis.cpp ---------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the MachineFunctionAnalysis members.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+using namespace llvm;
+
+char MachineFunctionAnalysis::ID = 0;
+
+MachineFunctionAnalysis::MachineFunctionAnalysis(const TargetMachine &tm) :
+  FunctionPass(ID), TM(tm), MF(0) {
+  initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
+}
+
+MachineFunctionAnalysis::~MachineFunctionAnalysis() {
+  releaseMemory();
+  assert(!MF && "MachineFunctionAnalysis left initialized!");
+}
+
+void MachineFunctionAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<MachineModuleInfo>();
+}
+
+bool MachineFunctionAnalysis::doInitialization(Module &M) {
+  MachineModuleInfo *MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+  assert(MMI && "MMI not around yet??");
+  MMI->setModule(&M);
+  NextFnNum = 0;
+  return false;
+}
+
+
+bool MachineFunctionAnalysis::runOnFunction(Function &F) {
+  assert(!MF && "MachineFunctionAnalysis already initialized!");
+  MF = new MachineFunction(&F, TM, NextFnNum++,
+                           getAnalysis<MachineModuleInfo>(),
+                           getAnalysisIfAvailable<GCModuleInfo>());
+  return false;
+}
+
+void MachineFunctionAnalysis::releaseMemory() {
+  delete MF;
+  MF = 0;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
new file mode 100644
index 000000000000..674cc80a006c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
@@ -0,0 +1,56 @@
+//===-- MachineFunctionPass.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the MachineFunctionPass members.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+using namespace llvm;
+
+Pass *MachineFunctionPass::createPrinterPass(raw_ostream &O,
+                                             const std::string &Banner) const {
+  return createMachineFunctionPrinterPass(O, Banner);
+}
+
+bool MachineFunctionPass::runOnFunction(Function &F) {
+  // Do not codegen any 'available_externally' functions at all, they have
+  // definitions outside the translation unit.
+  if (F.hasAvailableExternallyLinkage())
+    return false;
+
+  MachineFunction &MF = getAnalysis<MachineFunctionAnalysis>().getMF();
+  return runOnMachineFunction(MF);
+}
+
+void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addPreserved<MachineFunctionAnalysis>();
+
+  // MachineFunctionPass preserves all LLVM IR passes, but there's no
+  // high-level way to express this. Instead, just list a bunch of
+  // passes explicitly. This does not include setPreservesCFG,
+  // because CodeGen overloads that to mean preserving the MachineBasicBlock
+  // CFG in addition to the LLVM IR CFG.
+  AU.addPreserved<AliasAnalysis>();
+  AU.addPreserved("scalar-evolution");
+  AU.addPreserved("iv-users");
+  AU.addPreserved("memdep");
+  AU.addPreserved("live-values");
+  AU.addPreserved("domtree");
+  AU.addPreserved("domfrontier");
+  AU.addPreserved("loops");
+  AU.addPreserved("lda");
+
+  FunctionPass::getAnalysisUsage(AU);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
new file mode 100644
index 000000000000..fa9c821b2af7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -0,0 +1,67 @@
+//===-- MachineFunctionPrinterPass.cpp ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineFunctionPrinterPass implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+/// MachineFunctionPrinterPass - This is a pass to dump the IR of a
+/// MachineFunction.
+///
+struct MachineFunctionPrinterPass : public MachineFunctionPass {
+  static char ID;
+
+  raw_ostream &OS;
+  const std::string Banner;
+
+  MachineFunctionPrinterPass() : MachineFunctionPass(ID), OS(dbgs()) { }
+  MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner) 
+      : MachineFunctionPass(ID), OS(os), Banner(banner) {}
+
+  const char *getPassName() const { return "MachineFunction Printer"; }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) {
+    OS << "# " << Banner << ":\n";
+    MF.print(OS, getAnalysisIfAvailable<SlotIndexes>());
+    return false;
+  }
+};
+
+char MachineFunctionPrinterPass::ID = 0;
+}
+
+char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
+INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs",
+                "Machine Function Printer", false, false)
+
+namespace llvm {
+/// Returns a newly-created MachineFunction Printer pass. The
+/// default banner is empty.
+///
+MachineFunctionPass *createMachineFunctionPrinterPass(raw_ostream &OS,
+                                                      const std::string &Banner){
+  return new MachineFunctionPrinterPass(OS, Banner);
+}
+
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm/lib/CodeGen/MachineInstr.cpp
new file mode 100644
index 000000000000..32d066894b5b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineInstr.cpp
@@ -0,0 +1,1867 @@
+//===-- lib/CodeGen/MachineInstr.cpp --------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Methods common to all machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineOperand Implementation
+//===----------------------------------------------------------------------===//
+
+void MachineOperand::setReg(unsigned Reg) {
+  if (getReg() == Reg) return; // No change.
+
+  // Otherwise, we have to change the register.  If this operand is embedded
+  // into a machine function, we need to update the old and new register's
+  // use/def lists.
+  if (MachineInstr *MI = getParent())
+    if (MachineBasicBlock *MBB = MI->getParent())
+      if (MachineFunction *MF = MBB->getParent()) {
+        MachineRegisterInfo &MRI = MF->getRegInfo();
+        MRI.removeRegOperandFromUseList(this);
+        SmallContents.RegNo = Reg;
+        MRI.addRegOperandToUseList(this);
+        return;
+      }
+
+  // Otherwise, just change the register, no problem.  :)
+  SmallContents.RegNo = Reg;
+}
+
+void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx,
+                                  const TargetRegisterInfo &TRI) {
+  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  if (SubIdx && getSubReg())
+    SubIdx = TRI.composeSubRegIndices(SubIdx, getSubReg());
+  setReg(Reg);
+  if (SubIdx)
+    setSubReg(SubIdx);
+}
+
+void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) {
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+  if (getSubReg()) {
+    Reg = TRI.getSubReg(Reg, getSubReg());
+    // Note that getSubReg() may return 0 if the sub-register doesn't exist.
+    // That won't happen in legal code.
+    setSubReg(0);
+  }
+  setReg(Reg);
+}
+
+/// Change a def to a use, or a use to a def.
+void MachineOperand::setIsDef(bool Val) {
+  assert(isReg() && "Wrong MachineOperand accessor");
+  assert((!Val || !isDebug()) && "Marking a debug operation as def");
+  if (IsDef == Val)
+    return;
+  // MRI may keep uses and defs in different list positions.
+  if (MachineInstr *MI = getParent())
+    if (MachineBasicBlock *MBB = MI->getParent())
+      if (MachineFunction *MF = MBB->getParent()) {
+        MachineRegisterInfo &MRI = MF->getRegInfo();
+        MRI.removeRegOperandFromUseList(this);
+        IsDef = Val;
+        MRI.addRegOperandToUseList(this);
+        return;
+      }
+  IsDef = Val;
+}
+
+/// ChangeToImmediate - Replace this operand with a new immediate operand of
+/// the specified value.  If an operand is known to be an immediate already,
+/// the setImm method should be used.
+void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
+  assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");
+  // If this operand is currently a register operand, and if this is in a
+  // function, deregister the operand from the register's use/def list.
+  if (isReg() && isOnRegUseList())
+    if (MachineInstr *MI = getParent())
+      if (MachineBasicBlock *MBB = MI->getParent())
+        if (MachineFunction *MF = MBB->getParent())
+          MF->getRegInfo().removeRegOperandFromUseList(this);
+
+  OpKind = MO_Immediate;
+  Contents.ImmVal = ImmVal;
+}
+
+/// ChangeToRegister - Replace this operand with a new register operand of
+/// the specified value.  If an operand is known to be an register already,
+/// the setReg method should be used.
+void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
+                                      bool isKill, bool isDead, bool isUndef,
+                                      bool isDebug) {
+  MachineRegisterInfo *RegInfo = 0;
+  if (MachineInstr *MI = getParent())
+    if (MachineBasicBlock *MBB = MI->getParent())
+      if (MachineFunction *MF = MBB->getParent())
+        RegInfo = &MF->getRegInfo();
+  // If this operand is already a register operand, remove it from the
+  // register's use/def lists.
+  bool WasReg = isReg();
+  if (RegInfo && WasReg)
+    RegInfo->removeRegOperandFromUseList(this);
+
+  // Change this to a register and set the reg#.
+  OpKind = MO_Register;
+  SmallContents.RegNo = Reg;
+  SubReg_TargetFlags = 0;
+  IsDef = isDef;
+  IsImp = isImp;
+  IsKill = isKill;
+  IsDead = isDead;
+  IsUndef = isUndef;
+  IsInternalRead = false;
+  IsEarlyClobber = false;
+  IsDebug = isDebug;
+  // Ensure isOnRegUseList() returns false.
+  Contents.Reg.Prev = 0;
+  // Preserve the tie when the operand was already a register.
+  if (!WasReg)
+    TiedTo = 0;
+
+  // If this operand is embedded in a function, add the operand to the
+  // register's use/def list.
+  if (RegInfo)
+    RegInfo->addRegOperandToUseList(this);
+}
+
+/// isIdenticalTo - Return true if this operand is identical to the specified
+/// operand. Note that this should stay in sync with the hash_value overload
+/// below.
+bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
+  if (getType() != Other.getType() ||
+      getTargetFlags() != Other.getTargetFlags())
+    return false;
+
+  switch (getType()) {
+  case MachineOperand::MO_Register:
+    return getReg() == Other.getReg() && isDef() == Other.isDef() &&
+           getSubReg() == Other.getSubReg();
+  case MachineOperand::MO_Immediate:
+    return getImm() == Other.getImm();
+  case MachineOperand::MO_CImmediate:
+    return getCImm() == Other.getCImm();
+  case MachineOperand::MO_FPImmediate:
+    return getFPImm() == Other.getFPImm();
+  case MachineOperand::MO_MachineBasicBlock:
+    return getMBB() == Other.getMBB();
+  case MachineOperand::MO_FrameIndex:
+    return getIndex() == Other.getIndex();
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_TargetIndex:
+    return getIndex() == Other.getIndex() && getOffset() == Other.getOffset();
+  case MachineOperand::MO_JumpTableIndex:
+    return getIndex() == Other.getIndex();
+  case MachineOperand::MO_GlobalAddress:
+    return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset();
+  case MachineOperand::MO_ExternalSymbol:
+    return !strcmp(getSymbolName(), Other.getSymbolName()) &&
+           getOffset() == Other.getOffset();
+  case MachineOperand::MO_BlockAddress:
+    return getBlockAddress() == Other.getBlockAddress() &&
+           getOffset() == Other.getOffset();
+  case MO_RegisterMask:
+    return getRegMask() == Other.getRegMask();
+  case MachineOperand::MO_MCSymbol:
+    return getMCSymbol() == Other.getMCSymbol();
+  case MachineOperand::MO_Metadata:
+    return getMetadata() == Other.getMetadata();
+  }
+  llvm_unreachable("Invalid machine operand type");
+}
+
+// Note: this must stay exactly in sync with isIdenticalTo above.
+hash_code llvm::hash_value(const MachineOperand &MO) {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    // Register operands don't have target flags.
+    return hash_combine(MO.getType(), MO.getReg(), MO.getSubReg(), MO.isDef());
+  case MachineOperand::MO_Immediate:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm());
+  case MachineOperand::MO_CImmediate:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCImm());
+  case MachineOperand::MO_FPImmediate:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getFPImm());
+  case MachineOperand::MO_MachineBasicBlock:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMBB());
+  case MachineOperand::MO_FrameIndex:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_TargetIndex:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex(),
+                        MO.getOffset());
+  case MachineOperand::MO_JumpTableIndex:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
+  case MachineOperand::MO_ExternalSymbol:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(),
+                        MO.getSymbolName());
+  case MachineOperand::MO_GlobalAddress:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getGlobal(),
+                        MO.getOffset());
+  case MachineOperand::MO_BlockAddress:
+    return hash_combine(MO.getType(), MO.getTargetFlags(),
+                        MO.getBlockAddress(), MO.getOffset());
+  case MachineOperand::MO_RegisterMask:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getRegMask());
+  case MachineOperand::MO_Metadata:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata());
+  case MachineOperand::MO_MCSymbol:
+    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol());
+  }
+  llvm_unreachable("Invalid machine operand type");
+}
+
+/// print - Print the specified machine operand.
+///
+void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
+  // If the instruction is embedded into a basic block, we can find the
+  // target info for the instruction.
+  if (!TM)
+    if (const MachineInstr *MI = getParent())
+      if (const MachineBasicBlock *MBB = MI->getParent())
+        if (const MachineFunction *MF = MBB->getParent())
+          TM = &MF->getTarget();
+  const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : 0;
+
+  switch (getType()) {
+  case MachineOperand::MO_Register:
+    OS << PrintReg(getReg(), TRI, getSubReg());
+
+    if (isDef() || isKill() || isDead() || isImplicit() || isUndef() ||
+        isInternalRead() || isEarlyClobber() || isTied()) {
+      OS << '<';
+      bool NeedComma = false;
+      if (isDef()) {
+        if (NeedComma) OS << ',';
+        if (isEarlyClobber())
+          OS << "earlyclobber,";
+        if (isImplicit())
+          OS << "imp-";
+        OS << "def";
+        NeedComma = true;
+        // <def,read-undef> only makes sense when getSubReg() is set.
+        // Don't clutter the output otherwise.
+        if (isUndef() && getSubReg())
+          OS << ",read-undef";
+      } else if (isImplicit()) {
+          OS << "imp-use";
+          NeedComma = true;
+      }
+
+      if (isKill()) {
+        if (NeedComma) OS << ',';
+        OS << "kill";
+        NeedComma = true;
+      }
+      if (isDead()) {
+        if (NeedComma) OS << ',';
+        OS << "dead";
+        NeedComma = true;
+      }
+      if (isUndef() && isUse()) {
+        if (NeedComma) OS << ',';
+        OS << "undef";
+        NeedComma = true;
+      }
+      if (isInternalRead()) {
+        if (NeedComma) OS << ',';
+        OS << "internal";
+        NeedComma = true;
+      }
+      if (isTied()) {
+        if (NeedComma) OS << ',';
+        OS << "tied";
+        if (TiedTo != 15)
+          OS << unsigned(TiedTo - 1);
+        NeedComma = true;
+      }
+      OS << '>';
+    }
+    break;
+  case MachineOperand::MO_Immediate:
+    OS << getImm();
+    break;
+  case MachineOperand::MO_CImmediate:
+    getCImm()->getValue().print(OS, false);
+    break;
+  case MachineOperand::MO_FPImmediate:
+    if (getFPImm()->getType()->isFloatTy())
+      OS << getFPImm()->getValueAPF().convertToFloat();
+    else
+      OS << getFPImm()->getValueAPF().convertToDouble();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    OS << "<BB#" << getMBB()->getNumber() << ">";
+    break;
+  case MachineOperand::MO_FrameIndex:
+    OS << "<fi#" << getIndex() << '>';
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    OS << "<cp#" << getIndex();
+    if (getOffset()) OS << "+" << getOffset();
+    OS << '>';
+    break;
+  case MachineOperand::MO_TargetIndex:
+    OS << "<ti#" << getIndex();
+    if (getOffset()) OS << "+" << getOffset();
+    OS << '>';
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    OS << "<jt#" << getIndex() << '>';
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    OS << "<ga:";
+    WriteAsOperand(OS, getGlobal(), /*PrintType=*/false);
+    if (getOffset()) OS << "+" << getOffset();
+    OS << '>';
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    OS << "<es:" << getSymbolName();
+    if (getOffset()) OS << "+" << getOffset();
+    OS << '>';
+    break;
+  case MachineOperand::MO_BlockAddress:
+    OS << '<';
+    WriteAsOperand(OS, getBlockAddress(), /*PrintType=*/false);
+    if (getOffset()) OS << "+" << getOffset();
+    OS << '>';
+    break;
+  case MachineOperand::MO_RegisterMask:
+    OS << "<regmask>";
+    break;
+  case MachineOperand::MO_Metadata:
+    OS << '<';
+    WriteAsOperand(OS, getMetadata(), /*PrintType=*/false);
+    OS << '>';
+    break;
+  case MachineOperand::MO_MCSymbol:
+    OS << "<MCSym=" << *getMCSymbol() << '>';
+    break;
+  }
+
+  if (unsigned TF = getTargetFlags())
+    OS << "[TF=" << TF << ']';
+}
+
+//===----------------------------------------------------------------------===//
+// MachineMemOperand Implementation
+//===----------------------------------------------------------------------===//
+
+/// getAddrSpace - Return the LLVM IR address space number that this pointer
+/// points into.
+unsigned MachinePointerInfo::getAddrSpace() const {
+  if (V == 0) return 0;
+  return cast<PointerType>(V->getType())->getAddressSpace();
+}
+
+/// getConstantPool - Return a MachinePointerInfo record that refers to the
+/// constant pool.
+MachinePointerInfo MachinePointerInfo::getConstantPool() {
+  return MachinePointerInfo(PseudoSourceValue::getConstantPool());
+}
+
+/// getFixedStack - Return a MachinePointerInfo record that refers to the
+/// the specified FrameIndex.
+MachinePointerInfo MachinePointerInfo::getFixedStack(int FI, int64_t offset) {
+  return MachinePointerInfo(PseudoSourceValue::getFixedStack(FI), offset);
+}
+
+MachinePointerInfo MachinePointerInfo::getJumpTable() {
+  return MachinePointerInfo(PseudoSourceValue::getJumpTable());
+}
+
+MachinePointerInfo MachinePointerInfo::getGOT() {
+  return MachinePointerInfo(PseudoSourceValue::getGOT());
+}
+
+MachinePointerInfo MachinePointerInfo::getStack(int64_t Offset) {
+  return MachinePointerInfo(PseudoSourceValue::getStack(), Offset);
+}
+
+MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, unsigned f,
+                                     uint64_t s, unsigned int a,
+                                     const MDNode *TBAAInfo,
+                                     const MDNode *Ranges)
+  : PtrInfo(ptrinfo), Size(s),
+    Flags((f & ((1 << MOMaxBits) - 1)) | ((Log2_32(a) + 1) << MOMaxBits)),
+    TBAAInfo(TBAAInfo), Ranges(Ranges) {
+  assert((PtrInfo.V == 0 || isa<PointerType>(PtrInfo.V->getType())) &&
+         "invalid pointer value");
+  assert(getBaseAlignment() == a && "Alignment is not a power of 2!");
+  assert((isLoad() || isStore()) && "Not a load/store!");
+}
+
+/// Profile - Gather unique data for the object.
+///
+void MachineMemOperand::Profile(FoldingSetNodeID &ID) const {
+  ID.AddInteger(getOffset());
+  ID.AddInteger(Size);
+  ID.AddPointer(getValue());
+  ID.AddInteger(Flags);
+}
+
+void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
+  // The Value and Offset may differ due to CSE. But the flags and size
+  // should be the same.
+  assert(MMO->getFlags() == getFlags() && "Flags mismatch!");
+  assert(MMO->getSize() == getSize() && "Size mismatch!");
+
+  if (MMO->getBaseAlignment() >= getBaseAlignment()) {
+    // Update the alignment value.
+    Flags = (Flags & ((1 << MOMaxBits) - 1)) |
+      ((Log2_32(MMO->getBaseAlignment()) + 1) << MOMaxBits);
+    // Also update the base and offset, because the new alignment may
+    // not be applicable with the old ones.
+    PtrInfo = MMO->PtrInfo;
+  }
+}
+
+/// getAlignment - Return the minimum known alignment in bytes of the
+/// actual memory reference.
+uint64_t MachineMemOperand::getAlignment() const {
+  return MinAlign(getBaseAlignment(), getOffset());
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
+  assert((MMO.isLoad() || MMO.isStore()) &&
+         "SV has to be a load, store or both.");
+
+  if (MMO.isVolatile())
+    OS << "Volatile ";
+
+  if (MMO.isLoad())
+    OS << "LD";
+  if (MMO.isStore())
+    OS << "ST";
+  OS << MMO.getSize();
+
+  // Print the address information.
+  OS << "[";
+  if (!MMO.getValue())
+    OS << "<unknown>";
+  else
+    WriteAsOperand(OS, MMO.getValue(), /*PrintType=*/false);
+
+  // If the alignment of the memory reference itself differs from the alignment
+  // of the base pointer, print the base alignment explicitly, next to the base
+  // pointer.
+  if (MMO.getBaseAlignment() != MMO.getAlignment())
+    OS << "(align=" << MMO.getBaseAlignment() << ")";
+
+  if (MMO.getOffset() != 0)
+    OS << "+" << MMO.getOffset();
+  OS << "]";
+
+  // Print the alignment of the reference.
+  if (MMO.getBaseAlignment() != MMO.getAlignment() ||
+      MMO.getBaseAlignment() != MMO.getSize())
+    OS << "(align=" << MMO.getAlignment() << ")";
+
+  // Print TBAA info.
+  if (const MDNode *TBAAInfo = MMO.getTBAAInfo()) {
+    OS << "(tbaa=";
+    if (TBAAInfo->getNumOperands() > 0)
+      WriteAsOperand(OS, TBAAInfo->getOperand(0), /*PrintType=*/false);
+    else
+      OS << "<unknown>";
+    OS << ")";
+  }
+
+  // Print nontemporal info.
+  if (MMO.isNonTemporal())
+    OS << "(nontemporal)";
+
+  return OS;
+}
+
+//===----------------------------------------------------------------------===//
+// MachineInstr Implementation
+//===----------------------------------------------------------------------===//
+
+void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) {
+  if (MCID->ImplicitDefs)
+    for (const uint16_t *ImpDefs = MCID->getImplicitDefs(); *ImpDefs; ++ImpDefs)
+      addOperand(MF, MachineOperand::CreateReg(*ImpDefs, true, true));
+  if (MCID->ImplicitUses)
+    for (const uint16_t *ImpUses = MCID->getImplicitUses(); *ImpUses; ++ImpUses)
+      addOperand(MF, MachineOperand::CreateReg(*ImpUses, false, true));
+}
+
+/// MachineInstr ctor - This constructor creates a MachineInstr and adds the
+/// implicit operands. It reserves space for the number of operands specified by
+/// the MCInstrDesc.
+MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
+                           const DebugLoc dl, bool NoImp)
+  : MCID(&tid), Parent(0), Operands(0), NumOperands(0),
+    Flags(0), AsmPrinterFlags(0),
+    NumMemRefs(0), MemRefs(0), debugLoc(dl) {
+  // Reserve space for the expected number of operands.
+  if (unsigned NumOps = MCID->getNumOperands() +
+    MCID->getNumImplicitDefs() + MCID->getNumImplicitUses()) {
+    CapOperands = OperandCapacity::get(NumOps);
+    Operands = MF.allocateOperandArray(CapOperands);
+  }
+
+  if (!NoImp)
+    addImplicitDefUseOperands(MF);
+}
+
+/// MachineInstr ctor - Copies MachineInstr arg exactly
+///
+MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
+  : MCID(&MI.getDesc()), Parent(0), Operands(0), NumOperands(0),
+    Flags(0), AsmPrinterFlags(0),
+    NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs),
+    debugLoc(MI.getDebugLoc()) {
+  CapOperands = OperandCapacity::get(MI.getNumOperands());
+  Operands = MF.allocateOperandArray(CapOperands);
+
+  // Copy operands.
+  for (unsigned i = 0; i != MI.getNumOperands(); ++i)
+    addOperand(MF, MI.getOperand(i));
+
+  // Copy all the sensible flags.
+  setFlags(MI.Flags);
+}
+
+/// getRegInfo - If this instruction is embedded into a MachineFunction,
+/// return the MachineRegisterInfo object for the current function, otherwise
+/// return null.
+MachineRegisterInfo *MachineInstr::getRegInfo() {
+  if (MachineBasicBlock *MBB = getParent())
+    return &MBB->getParent()->getRegInfo();
+  return 0;
+}
+
+/// RemoveRegOperandsFromUseLists - Unlink all of the register operands in
+/// this instruction from their respective use lists.  This requires that the
+/// operands already be on their use lists.
+void MachineInstr::RemoveRegOperandsFromUseLists(MachineRegisterInfo &MRI) {
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    if (Operands[i].isReg())
+      MRI.removeRegOperandFromUseList(&Operands[i]);
+}
+
+/// AddRegOperandsToUseLists - Add all of the register operands in
+/// this instruction from their respective use lists.  This requires that the
+/// operands not be on their use lists yet.
+void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &MRI) {
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    if (Operands[i].isReg())
+      MRI.addRegOperandToUseList(&Operands[i]);
+}
+
+void MachineInstr::addOperand(const MachineOperand &Op) {
+  MachineBasicBlock *MBB = getParent();
+  assert(MBB && "Use MachineInstrBuilder to add operands to dangling instrs");
+  MachineFunction *MF = MBB->getParent();
+  assert(MF && "Use MachineInstrBuilder to add operands to dangling instrs");
+  addOperand(*MF, Op);
+}
+
+/// Move NumOps MachineOperands from Src to Dst, with support for overlapping
+/// ranges. If MRI is non-null also update use-def chains.
+static void moveOperands(MachineOperand *Dst, MachineOperand *Src,
+                         unsigned NumOps, MachineRegisterInfo *MRI) {
+  if (MRI)
+    return MRI->moveOperands(Dst, Src, NumOps);
+
+  // Here it would be convenient to call memmove, so that isn't allowed because
+  // MachineOperand has a constructor and so isn't a POD type.
+  if (Dst < Src)
+    for (unsigned i = 0; i != NumOps; ++i)
+      new (Dst + i) MachineOperand(Src[i]);
+  else
+    for (unsigned i = NumOps; i ; --i)
+      new (Dst + i - 1) MachineOperand(Src[i - 1]);
+}
+
+/// addOperand - Add the specified operand to the instruction.  If it is an
+/// implicit operand, it is added to the end of the operand list.  If it is
+/// an explicit operand it is added at the end of the explicit operand list
+/// (before the first implicit operand).
+void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) {
+  assert(MCID && "Cannot add operands before providing an instr descriptor");
+
+  // Check if we're adding one of our existing operands.
+  if (&Op >= Operands && &Op < Operands + NumOperands) {
+    // This is unusual: MI->addOperand(MI->getOperand(i)).
+    // If adding Op requires reallocating or moving existing operands around,
+    // the Op reference could go stale. Support it by copying Op.
+    MachineOperand CopyOp(Op);
+    return addOperand(MF, CopyOp);
+  }
+
+  // Find the insert location for the new operand.  Implicit registers go at
+  // the end, everything else goes before the implicit regs.
+  //
+  // FIXME: Allow mixed explicit and implicit operands on inline asm.
+  // InstrEmitter::EmitSpecialNode() is marking inline asm clobbers as
+  // implicit-defs, but they must not be moved around.  See the FIXME in
+  // InstrEmitter.cpp.
+  unsigned OpNo = getNumOperands();
+  bool isImpReg = Op.isReg() && Op.isImplicit();
+  if (!isImpReg && !isInlineAsm()) {
+    while (OpNo && Operands[OpNo-1].isReg() && Operands[OpNo-1].isImplicit()) {
+      --OpNo;
+      assert(!Operands[OpNo].isTied() && "Cannot move tied operands");
+    }
+  }
+
+  // OpNo now points as the desired insertion point.  Unless this is a variadic
+  // instruction, only implicit regs are allowed beyond MCID->getNumOperands().
+  // RegMask operands go between the explicit and implicit operands.
+  assert((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
+          OpNo < MCID->getNumOperands()) &&
+         "Trying to add an operand to a machine instr that is already done!");
+
+  MachineRegisterInfo *MRI = getRegInfo();
+
+  // Determine if the Operands array needs to be reallocated.
+  // Save the old capacity and operand array.
+  OperandCapacity OldCap = CapOperands;
+  MachineOperand *OldOperands = Operands;
+  if (!OldOperands || OldCap.getSize() == getNumOperands()) {
+    CapOperands = OldOperands ? OldCap.getNext() : OldCap.get(1);
+    Operands = MF.allocateOperandArray(CapOperands);
+    // Move the operands before the insertion point.
+    if (OpNo)
+      moveOperands(Operands, OldOperands, OpNo, MRI);
+  }
+
+  // Move the operands following the insertion point.
+  if (OpNo != NumOperands)
+    moveOperands(Operands + OpNo + 1, OldOperands + OpNo, NumOperands - OpNo,
+                 MRI);
+  ++NumOperands;
+
+  // Deallocate the old operand array.
+  if (OldOperands != Operands && OldOperands)
+    MF.deallocateOperandArray(OldCap, OldOperands);
+
+  // Copy Op into place. It still needs to be inserted into the MRI use lists.
+  MachineOperand *NewMO = new (Operands + OpNo) MachineOperand(Op);
+  NewMO->ParentMI = this;
+
+  // When adding a register operand, tell MRI about it.
+  if (NewMO->isReg()) {
+    // Ensure isOnRegUseList() returns false, regardless of Op's status.
+    NewMO->Contents.Reg.Prev = 0;
+    // Ignore existing ties. This is not a property that can be copied.
+    NewMO->TiedTo = 0;
+    // Add the new operand to MRI, but only for instructions in an MBB.
+    if (MRI)
+      MRI->addRegOperandToUseList(NewMO);
+    // The MCID operand information isn't accurate until we start adding
+    // explicit operands. The implicit operands are added first, then the
+    // explicits are inserted before them.
+    if (!isImpReg) {
+      // Tie uses to defs as indicated in MCInstrDesc.
+      if (NewMO->isUse()) {
+        int DefIdx = MCID->getOperandConstraint(OpNo, MCOI::TIED_TO);
+        if (DefIdx != -1)
+          tieOperands(DefIdx, OpNo);
+      }
+      // If the register operand is flagged as early, mark the operand as such.
+      if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
+        NewMO->setIsEarlyClobber(true);
+    }
+  }
+}
+
+/// RemoveOperand - Erase an operand  from an instruction, leaving it with one
+/// fewer operand than it started with.
+///
+void MachineInstr::RemoveOperand(unsigned OpNo) {
+  assert(OpNo < getNumOperands() && "Invalid operand number");
+  untieRegOperand(OpNo);
+
+#ifndef NDEBUG
+  // Moving tied operands would break the ties.
+  for (unsigned i = OpNo + 1, e = getNumOperands(); i != e; ++i)
+    if (Operands[i].isReg())
+      assert(!Operands[i].isTied() && "Cannot move tied operands");
+#endif
+
+  MachineRegisterInfo *MRI = getRegInfo();
+  if (MRI && Operands[OpNo].isReg())
+    MRI->removeRegOperandFromUseList(Operands + OpNo);
+
+  // Don't call the MachineOperand destructor. A lot of this code depends on
+  // MachineOperand having a trivial destructor anyway, and adding a call here
+  // wouldn't make it 'destructor-correct'.
+
+  if (unsigned N = NumOperands - 1 - OpNo)
+    moveOperands(Operands + OpNo, Operands + OpNo + 1, N, MRI);
+  --NumOperands;
+}
+
+/// addMemOperand - Add a MachineMemOperand to the machine instruction.
+/// This function should be used only occasionally. The setMemRefs function
+/// is the primary method for setting up a MachineInstr's MemRefs list.
+void MachineInstr::addMemOperand(MachineFunction &MF,
+                                 MachineMemOperand *MO) {
+  mmo_iterator OldMemRefs = MemRefs;
+  unsigned OldNumMemRefs = NumMemRefs;
+
+  unsigned NewNum = NumMemRefs + 1;
+  mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NewNum);
+
+  std::copy(OldMemRefs, OldMemRefs + OldNumMemRefs, NewMemRefs);
+  NewMemRefs[NewNum - 1] = MO;
+  setMemRefs(NewMemRefs, NewMemRefs + NewNum);
+}
+
+bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const {
+  assert(!isBundledWithPred() && "Must be called on bundle header");
+  for (MachineBasicBlock::const_instr_iterator MII = this;; ++MII) {
+    if (MII->getDesc().getFlags() & Mask) {
+      if (Type == AnyInBundle)
+        return true;
+    } else {
+      if (Type == AllInBundle && !MII->isBundle())
+        return false;
+    }
+    // This was the last instruction in the bundle.
+    if (!MII->isBundledWithSucc())
+      return Type == AllInBundle;
+  }
+}
+
+bool MachineInstr::isIdenticalTo(const MachineInstr *Other,
+                                 MICheckType Check) const {
+  // If opcodes or number of operands are not the same then the two
+  // instructions are obviously not identical.
+  if (Other->getOpcode() != getOpcode() ||
+      Other->getNumOperands() != getNumOperands())
+    return false;
+
+  if (isBundle()) {
+    // Both instructions are bundles, compare MIs inside the bundle.
+    MachineBasicBlock::const_instr_iterator I1 = *this;
+    MachineBasicBlock::const_instr_iterator E1 = getParent()->instr_end();
+    MachineBasicBlock::const_instr_iterator I2 = *Other;
+    MachineBasicBlock::const_instr_iterator E2= Other->getParent()->instr_end();
+    while (++I1 != E1 && I1->isInsideBundle()) {
+      ++I2;
+      if (I2 == E2 || !I2->isInsideBundle() || !I1->isIdenticalTo(I2, Check))
+        return false;
+    }
+  }
+
+  // Check operands to make sure they match.
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    const MachineOperand &OMO = Other->getOperand(i);
+    if (!MO.isReg()) {
+      if (!MO.isIdenticalTo(OMO))
+        return false;
+      continue;
+    }
+
+    // Clients may or may not want to ignore defs when testing for equality.
+    // For example, machine CSE pass only cares about finding common
+    // subexpressions, so it's safe to ignore virtual register defs.
+    if (MO.isDef()) {
+      if (Check == IgnoreDefs)
+        continue;
+      else if (Check == IgnoreVRegDefs) {
+        if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
+            TargetRegisterInfo::isPhysicalRegister(OMO.getReg()))
+          if (MO.getReg() != OMO.getReg())
+            return false;
+      } else {
+        if (!MO.isIdenticalTo(OMO))
+          return false;
+        if (Check == CheckKillDead && MO.isDead() != OMO.isDead())
+          return false;
+      }
+    } else {
+      if (!MO.isIdenticalTo(OMO))
+        return false;
+      if (Check == CheckKillDead && MO.isKill() != OMO.isKill())
+        return false;
+    }
+  }
+  // If DebugLoc does not match then two dbg.values are not identical.
+  if (isDebugValue())
+    if (!getDebugLoc().isUnknown() && !Other->getDebugLoc().isUnknown()
+        && getDebugLoc() != Other->getDebugLoc())
+      return false;
+  return true;
+}
+
+MachineInstr *MachineInstr::removeFromParent() {
+  assert(getParent() && "Not embedded in a basic block!");
+  return getParent()->remove(this);
+}
+
+MachineInstr *MachineInstr::removeFromBundle() {
+  assert(getParent() && "Not embedded in a basic block!");
+  return getParent()->remove_instr(this);
+}
+
+void MachineInstr::eraseFromParent() {
+  assert(getParent() && "Not embedded in a basic block!");
+  getParent()->erase(this);
+}
+
+void MachineInstr::eraseFromBundle() {
+  assert(getParent() && "Not embedded in a basic block!");
+  getParent()->erase_instr(this);
+}
+
+/// getNumExplicitOperands - Returns the number of non-implicit operands.
+///
+unsigned MachineInstr::getNumExplicitOperands() const {
+  unsigned NumOperands = MCID->getNumOperands();
+  if (!MCID->isVariadic())
+    return NumOperands;
+
+  for (unsigned i = NumOperands, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isImplicit())
+      NumOperands++;
+  }
+  return NumOperands;
+}
+
+void MachineInstr::bundleWithPred() {
+  assert(!isBundledWithPred() && "MI is already bundled with its predecessor");
+  setFlag(BundledPred);
+  MachineBasicBlock::instr_iterator Pred = this;
+  --Pred;
+  assert(!Pred->isBundledWithSucc() && "Inconsistent bundle flags");
+  Pred->setFlag(BundledSucc);
+}
+
+void MachineInstr::bundleWithSucc() {
+  assert(!isBundledWithSucc() && "MI is already bundled with its successor");
+  setFlag(BundledSucc);
+  MachineBasicBlock::instr_iterator Succ = this;
+  ++Succ;
+  assert(!Succ->isBundledWithPred() && "Inconsistent bundle flags");
+  Succ->setFlag(BundledPred);
+}
+
+void MachineInstr::unbundleFromPred() {
+  assert(isBundledWithPred() && "MI isn't bundled with its predecessor");
+  clearFlag(BundledPred);
+  MachineBasicBlock::instr_iterator Pred = this;
+  --Pred;
+  assert(Pred->isBundledWithSucc() && "Inconsistent bundle flags");
+  Pred->clearFlag(BundledSucc);
+}
+
+void MachineInstr::unbundleFromSucc() {
+  assert(isBundledWithSucc() && "MI isn't bundled with its successor");
+  clearFlag(BundledSucc);
+  MachineBasicBlock::instr_iterator Succ = this;
+  ++Succ;
+  assert(Succ->isBundledWithPred() && "Inconsistent bundle flags");
+  Succ->clearFlag(BundledPred);
+}
+
+bool MachineInstr::isStackAligningInlineAsm() const {
+  if (isInlineAsm()) {
+    unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+    if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+      return true;
+  }
+  return false;
+}
+
+InlineAsm::AsmDialect MachineInstr::getInlineAsmDialect() const {
+  assert(isInlineAsm() && "getInlineAsmDialect() only works for inline asms!");
+  unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+  return InlineAsm::AsmDialect((ExtraInfo & InlineAsm::Extra_AsmDialect) != 0);
+}
+
+int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
+                                       unsigned *GroupNo) const {
+  assert(isInlineAsm() && "Expected an inline asm instruction");
+  assert(OpIdx < getNumOperands() && "OpIdx out of range");
+
+  // Ignore queries about the initial operands.
+  if (OpIdx < InlineAsm::MIOp_FirstOperand)
+    return -1;
+
+  unsigned Group = 0;
+  unsigned NumOps;
+  for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
+       i += NumOps) {
+    const MachineOperand &FlagMO = getOperand(i);
+    // If we reach the implicit register operands, stop looking.
+    if (!FlagMO.isImm())
+      return -1;
+    NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
+    if (i + NumOps > OpIdx) {
+      if (GroupNo)
+        *GroupNo = Group;
+      return i;
+    }
+    ++Group;
+  }
+  return -1;
+}
+
+const TargetRegisterClass*
+MachineInstr::getRegClassConstraint(unsigned OpIdx,
+                                    const TargetInstrInfo *TII,
+                                    const TargetRegisterInfo *TRI) const {
+  assert(getParent() && "Can't have an MBB reference here!");
+  assert(getParent()->getParent() && "Can't have an MF reference here!");
+  const MachineFunction &MF = *getParent()->getParent();
+
+  // Most opcodes have fixed constraints in their MCInstrDesc.
+  if (!isInlineAsm())
+    return TII->getRegClass(getDesc(), OpIdx, TRI, MF);
+
+  if (!getOperand(OpIdx).isReg())
+    return NULL;
+
+  // For tied uses on inline asm, get the constraint from the def.
+  unsigned DefIdx;
+  if (getOperand(OpIdx).isUse() && isRegTiedToDefOperand(OpIdx, &DefIdx))
+    OpIdx = DefIdx;
+
+  // Inline asm stores register class constraints in the flag word.
+  int FlagIdx = findInlineAsmFlagIdx(OpIdx);
+  if (FlagIdx < 0)
+    return NULL;
+
+  unsigned Flag = getOperand(FlagIdx).getImm();
+  unsigned RCID;
+  if (InlineAsm::hasRegClassConstraint(Flag, RCID))
+    return TRI->getRegClass(RCID);
+
+  // Assume that all registers in a memory operand are pointers.
+  if (InlineAsm::getKind(Flag) == InlineAsm::Kind_Mem)
+    return TRI->getPointerRegClass(MF);
+
+  return NULL;
+}
+
+/// Return the number of instructions inside the MI bundle, not counting the
+/// header instruction.
+unsigned MachineInstr::getBundleSize() const {
+  MachineBasicBlock::const_instr_iterator I = this;
+  unsigned Size = 0;
+  while (I->isBundledWithSucc())
+    ++Size, ++I;
+  return Size;
+}
+
+/// findRegisterUseOperandIdx() - Returns the MachineOperand that is a use of
+/// the specific register or -1 if it is not found. It further tightens
+/// the search criteria to a use that kills the register if isKill is true.
+int MachineInstr::findRegisterUseOperandIdx(unsigned Reg, bool isKill,
+                                          const TargetRegisterInfo *TRI) const {
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    unsigned MOReg = MO.getReg();
+    if (!MOReg)
+      continue;
+    if (MOReg == Reg ||
+        (TRI &&
+         TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+         TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         TRI->isSubRegister(MOReg, Reg)))
+      if (!isKill || MO.isKill())
+        return i;
+  }
+  return -1;
+}
+
+/// readsWritesVirtualRegister - Return a pair of bools (reads, writes)
+/// indicating if this instruction reads or writes Reg. This also considers
+/// partial defines.
+std::pair<bool,bool>
+MachineInstr::readsWritesVirtualRegister(unsigned Reg,
+                                         SmallVectorImpl<unsigned> *Ops) const {
+  bool PartDef = false; // Partial redefine.
+  bool FullDef = false; // Full define.
+  bool Use = false;
+
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || MO.getReg() != Reg)
+      continue;
+    if (Ops)
+      Ops->push_back(i);
+    if (MO.isUse())
+      Use |= !MO.isUndef();
+    else if (MO.getSubReg() && !MO.isUndef())
+      // A partial <def,undef> doesn't count as reading the register.
+      PartDef = true;
+    else
+      FullDef = true;
+  }
+  // A partial redefine uses Reg unless there is also a full define.
+  return std::make_pair(Use || (PartDef && !FullDef), PartDef || FullDef);
+}
+
+/// findRegisterDefOperandIdx() - Returns the operand index that is a def of
+/// the specified register or -1 if it is not found. If isDead is true, defs
+/// that are not dead are skipped. If TargetRegisterInfo is non-null, then it
+/// also checks if there is a def of a super-register.
+int
+MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead, bool Overlap,
+                                        const TargetRegisterInfo *TRI) const {
+  bool isPhys = TargetRegisterInfo::isPhysicalRegister(Reg);
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    // Accept regmask operands when Overlap is set.
+    // Ignore them when looking for a specific def operand (Overlap == false).
+    if (isPhys && Overlap && MO.isRegMask() && MO.clobbersPhysReg(Reg))
+      return i;
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned MOReg = MO.getReg();
+    bool Found = (MOReg == Reg);
+    if (!Found && TRI && isPhys &&
+        TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+      if (Overlap)
+        Found = TRI->regsOverlap(MOReg, Reg);
+      else
+        Found = TRI->isSubRegister(MOReg, Reg);
+    }
+    if (Found && (!isDead || MO.isDead()))
+      return i;
+  }
+  return -1;
+}
+
+/// findFirstPredOperandIdx() - Find the index of the first operand in the
+/// operand list that is used to represent the predicate. It returns -1 if
+/// none is found.
+int MachineInstr::findFirstPredOperandIdx() const {
+  // Don't call MCID.findFirstPredOperandIdx() because this variant
+  // is sometimes called on an instruction that's not yet complete, and
+  // so the number of operands is less than the MCID indicates. In
+  // particular, the PTX target does this.
+  const MCInstrDesc &MCID = getDesc();
+  if (MCID.isPredicable()) {
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+      if (MCID.OpInfo[i].isPredicate())
+        return i;
+  }
+
+  return -1;
+}
+
+// MachineOperand::TiedTo is 4 bits wide.
+const unsigned TiedMax = 15;
+
+/// tieOperands - Mark operands at DefIdx and UseIdx as tied to each other.
+///
+/// Use and def operands can be tied together, indicated by a non-zero TiedTo
+/// field. TiedTo can have these values:
+///
+/// 0:              Operand is not tied to anything.
+/// 1 to TiedMax-1: Tied to getOperand(TiedTo-1).
+/// TiedMax:        Tied to an operand >= TiedMax-1.
+///
+/// The tied def must be one of the first TiedMax operands on a normal
+/// instruction. INLINEASM instructions allow more tied defs.
+///
+void MachineInstr::tieOperands(unsigned DefIdx, unsigned UseIdx) {
+  MachineOperand &DefMO = getOperand(DefIdx);
+  MachineOperand &UseMO = getOperand(UseIdx);
+  assert(DefMO.isDef() && "DefIdx must be a def operand");
+  assert(UseMO.isUse() && "UseIdx must be a use operand");
+  assert(!DefMO.isTied() && "Def is already tied to another use");
+  assert(!UseMO.isTied() && "Use is already tied to another def");
+
+  if (DefIdx < TiedMax)
+    UseMO.TiedTo = DefIdx + 1;
+  else {
+    // Inline asm can use the group descriptors to find tied operands, but on
+    // normal instruction, the tied def must be within the first TiedMax
+    // operands.
+    assert(isInlineAsm() && "DefIdx out of range");
+    UseMO.TiedTo = TiedMax;
+  }
+
+  // UseIdx can be out of range, we'll search for it in findTiedOperandIdx().
+  DefMO.TiedTo = std::min(UseIdx + 1, TiedMax);
+}
+
+/// Given the index of a tied register operand, find the operand it is tied to.
+/// Defs are tied to uses and vice versa. Returns the index of the tied operand
+/// which must exist.
+unsigned MachineInstr::findTiedOperandIdx(unsigned OpIdx) const {
+  const MachineOperand &MO = getOperand(OpIdx);
+  assert(MO.isTied() && "Operand isn't tied");
+
+  // Normally TiedTo is in range.
+  if (MO.TiedTo < TiedMax)
+    return MO.TiedTo - 1;
+
+  // Uses on normal instructions can be out of range.
+  if (!isInlineAsm()) {
+    // Normal tied defs must be in the 0..TiedMax-1 range.
+    if (MO.isUse())
+      return TiedMax - 1;
+    // MO is a def. Search for the tied use.
+    for (unsigned i = TiedMax - 1, e = getNumOperands(); i != e; ++i) {
+      const MachineOperand &UseMO = getOperand(i);
+      if (UseMO.isReg() && UseMO.isUse() && UseMO.TiedTo == OpIdx + 1)
+        return i;
+    }
+    llvm_unreachable("Can't find tied use");
+  }
+
+  // Now deal with inline asm by parsing the operand group descriptor flags.
+  // Find the beginning of each operand group.
+  SmallVector<unsigned, 8> GroupIdx;
+  unsigned OpIdxGroup = ~0u;
+  unsigned NumOps;
+  for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
+       i += NumOps) {
+    const MachineOperand &FlagMO = getOperand(i);
+    assert(FlagMO.isImm() && "Invalid tied operand on inline asm");
+    unsigned CurGroup = GroupIdx.size();
+    GroupIdx.push_back(i);
+    NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
+    // OpIdx belongs to this operand group.
+    if (OpIdx > i && OpIdx < i + NumOps)
+      OpIdxGroup = CurGroup;
+    unsigned TiedGroup;
+    if (!InlineAsm::isUseOperandTiedToDef(FlagMO.getImm(), TiedGroup))
+      continue;
+    // Operands in this group are tied to operands in TiedGroup which must be
+    // earlier. Find the number of operands between the two groups.
+    unsigned Delta = i - GroupIdx[TiedGroup];
+
+    // OpIdx is a use tied to TiedGroup.
+    if (OpIdxGroup == CurGroup)
+      return OpIdx - Delta;
+
+    // OpIdx is a def tied to this use group.
+    if (OpIdxGroup == TiedGroup)
+      return OpIdx + Delta;
+  }
+  llvm_unreachable("Invalid tied operand on inline asm");
+}
+
+/// clearKillInfo - Clears kill flags on all operands.
+///
+void MachineInstr::clearKillInfo() {
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = getOperand(i);
+    if (MO.isReg() && MO.isUse())
+      MO.setIsKill(false);
+  }
+}
+
+void MachineInstr::substituteRegister(unsigned FromReg,
+                                      unsigned ToReg,
+                                      unsigned SubIdx,
+                                      const TargetRegisterInfo &RegInfo) {
+  if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
+    if (SubIdx)
+      ToReg = RegInfo.getSubReg(ToReg, SubIdx);
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = getOperand(i);
+      if (!MO.isReg() || MO.getReg() != FromReg)
+        continue;
+      MO.substPhysReg(ToReg, RegInfo);
+    }
+  } else {
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = getOperand(i);
+      if (!MO.isReg() || MO.getReg() != FromReg)
+        continue;
+      MO.substVirtReg(ToReg, SubIdx, RegInfo);
+    }
+  }
+}
+
+/// isSafeToMove - Return true if it is safe to move this instruction. If
+/// SawStore is set to true, it means that there is a store (or call) between
+/// the instruction's location and its intended destination.
+bool MachineInstr::isSafeToMove(const TargetInstrInfo *TII,
+                                AliasAnalysis *AA,
+                                bool &SawStore) const {
+  // Ignore stuff that we obviously can't move.
+  //
+  // Treat volatile loads as stores. This is not strictly necessary for
+  // volatiles, but it is required for atomic loads. It is not allowed to move
+  // a load across an atomic load with Ordering > Monotonic.
+  if (mayStore() || isCall() ||
+      (mayLoad() && hasOrderedMemoryRef())) {
+    SawStore = true;
+    return false;
+  }
+
+  if (isLabel() || isDebugValue() ||
+      isTerminator() || hasUnmodeledSideEffects())
+    return false;
+
+  // See if this instruction does a load.  If so, we have to guarantee that the
+  // loaded value doesn't change between the load and the its intended
+  // destination. The check for isInvariantLoad gives the targe the chance to
+  // classify the load as always returning a constant, e.g. a constant pool
+  // load.
+  if (mayLoad() && !isInvariantLoad(AA))
+    // Otherwise, this is a real load.  If there is a store between the load and
+    // end of block, we can't move it.
+    return !SawStore;
+
+  return true;
+}
+
+/// isSafeToReMat - Return true if it's safe to rematerialize the specified
+/// instruction which defined the specified register instead of copying it.
+bool MachineInstr::isSafeToReMat(const TargetInstrInfo *TII,
+                                 AliasAnalysis *AA,
+                                 unsigned DstReg) const {
+  bool SawStore = false;
+  if (!TII->isTriviallyReMaterializable(this, AA) ||
+      !isSafeToMove(TII, AA, SawStore))
+    return false;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg())
+      continue;
+    // FIXME: For now, do not remat any instruction with register operands.
+    // Later on, we can loosen the restriction is the register operands have
+    // not been modified between the def and use. Note, this is different from
+    // MachineSink because the code is no longer in two-address form (at least
+    // partially).
+    if (MO.isUse())
+      return false;
+    else if (!MO.isDead() && MO.getReg() != DstReg)
+      return false;
+  }
+  return true;
+}
+
+/// hasOrderedMemoryRef - Return true if this instruction may have an ordered
+/// or volatile memory reference, or if the information describing the memory
+/// reference is not available. Return false if it is known to have no ordered
+/// memory references.
+bool MachineInstr::hasOrderedMemoryRef() const {
+  // An instruction known never to access memory won't have a volatile access.
+  if (!mayStore() &&
+      !mayLoad() &&
+      !isCall() &&
+      !hasUnmodeledSideEffects())
+    return false;
+
+  // Otherwise, if the instruction has no memory reference information,
+  // conservatively assume it wasn't preserved.
+  if (memoperands_empty())
+    return true;
+
+  // Check the memory reference information for ordered references.
+  for (mmo_iterator I = memoperands_begin(), E = memoperands_end(); I != E; ++I)
+    if (!(*I)->isUnordered())
+      return true;
+
+  return false;
+}
+
+/// isInvariantLoad - Return true if this instruction is loading from a
+/// location whose value is invariant across the function.  For example,
+/// loading a value from the constant pool or from the argument area
+/// of a function if it does not change.  This should only return true of
+/// *all* loads the instruction does are invariant (if it does multiple loads).
+bool MachineInstr::isInvariantLoad(AliasAnalysis *AA) const {
+  // If the instruction doesn't load at all, it isn't an invariant load.
+  if (!mayLoad())
+    return false;
+
+  // If the instruction has lost its memoperands, conservatively assume that
+  // it may not be an invariant load.
+  if (memoperands_empty())
+    return false;
+
+  const MachineFrameInfo *MFI = getParent()->getParent()->getFrameInfo();
+
+  for (mmo_iterator I = memoperands_begin(),
+       E = memoperands_end(); I != E; ++I) {
+    if ((*I)->isVolatile()) return false;
+    if ((*I)->isStore()) return false;
+    if ((*I)->isInvariant()) return true;
+
+    if (const Value *V = (*I)->getValue()) {
+      // A load from a constant PseudoSourceValue is invariant.
+      if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V))
+        if (PSV->isConstant(MFI))
+          continue;
+      // If we have an AliasAnalysis, ask it whether the memory is constant.
+      if (AA && AA->pointsToConstantMemory(
+                      AliasAnalysis::Location(V, (*I)->getSize(),
+                                              (*I)->getTBAAInfo())))
+        continue;
+    }
+
+    // Otherwise assume conservatively.
+    return false;
+  }
+
+  // Everything checks out.
+  return true;
+}
+
+/// isConstantValuePHI - If the specified instruction is a PHI that always
+/// merges together the same virtual register, return the register, otherwise
+/// return 0.
+unsigned MachineInstr::isConstantValuePHI() const {
+  if (!isPHI())
+    return 0;
+  assert(getNumOperands() >= 3 &&
+         "It's illegal to have a PHI without source operands");
+
+  unsigned Reg = getOperand(1).getReg();
+  for (unsigned i = 3, e = getNumOperands(); i < e; i += 2)
+    if (getOperand(i).getReg() != Reg)
+      return 0;
+  return Reg;
+}
+
+bool MachineInstr::hasUnmodeledSideEffects() const {
+  if (hasProperty(MCID::UnmodeledSideEffects))
+    return true;
+  if (isInlineAsm()) {
+    unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+    if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
+      return true;
+  }
+
+  return false;
+}
+
+/// allDefsAreDead - Return true if all the defs of this instruction are dead.
+///
+bool MachineInstr::allDefsAreDead() const {
+  for (unsigned i = 0, e = getNumOperands(); i < e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || MO.isUse())
+      continue;
+    if (!MO.isDead())
+      return false;
+  }
+  return true;
+}
+
+/// copyImplicitOps - Copy implicit register operands from specified
+/// instruction to this instruction.
+void MachineInstr::copyImplicitOps(MachineFunction &MF,
+                                   const MachineInstr *MI) {
+  for (unsigned i = MI->getDesc().getNumOperands(), e = MI->getNumOperands();
+       i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isImplicit())
+      addOperand(MF, MO);
+  }
+}
+
+void MachineInstr::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  dbgs() << "  " << *this;
+#endif
+}
+
+static void printDebugLoc(DebugLoc DL, const MachineFunction *MF,
+                         raw_ostream &CommentOS) {
+  const LLVMContext &Ctx = MF->getFunction()->getContext();
+  if (!DL.isUnknown()) {          // Print source line info.
+    DIScope Scope(DL.getScope(Ctx));
+    // Omit the directory, because it's likely to be long and uninteresting.
+    if (Scope.Verify())
+      CommentOS << Scope.getFilename();
+    else
+      CommentOS << "<unknown>";
+    CommentOS << ':' << DL.getLine();
+    if (DL.getCol() != 0)
+      CommentOS << ':' << DL.getCol();
+    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx));
+    if (!InlinedAtDL.isUnknown()) {
+      CommentOS << " @[ ";
+      printDebugLoc(InlinedAtDL, MF, CommentOS);
+      CommentOS << " ]";
+    }
+  }
+}
+
+void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM,
+                         bool SkipOpers) const {
+  // We can be a bit tidier if we know the TargetMachine and/or MachineFunction.
+  const MachineFunction *MF = 0;
+  const MachineRegisterInfo *MRI = 0;
+  if (const MachineBasicBlock *MBB = getParent()) {
+    MF = MBB->getParent();
+    if (!TM && MF)
+      TM = &MF->getTarget();
+    if (MF)
+      MRI = &MF->getRegInfo();
+  }
+
+  // Save a list of virtual registers.
+  SmallVector<unsigned, 8> VirtRegs;
+
+  // Print explicitly defined operands on the left of an assignment syntax.
+  unsigned StartOp = 0, e = getNumOperands();
+  for (; StartOp < e && getOperand(StartOp).isReg() &&
+         getOperand(StartOp).isDef() &&
+         !getOperand(StartOp).isImplicit();
+       ++StartOp) {
+    if (StartOp != 0) OS << ", ";
+    getOperand(StartOp).print(OS, TM);
+    unsigned Reg = getOperand(StartOp).getReg();
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      VirtRegs.push_back(Reg);
+  }
+
+  if (StartOp != 0)
+    OS << " = ";
+
+  // Print the opcode name.
+  if (TM && TM->getInstrInfo())
+    OS << TM->getInstrInfo()->getName(getOpcode());
+  else
+    OS << "UNKNOWN";
+
+  if (SkipOpers)
+    return;
+
+  // Print the rest of the operands.
+  bool OmittedAnyCallClobbers = false;
+  bool FirstOp = true;
+  unsigned AsmDescOp = ~0u;
+  unsigned AsmOpCount = 0;
+
+  if (isInlineAsm() && e >= InlineAsm::MIOp_FirstOperand) {
+    // Print asm string.
+    OS << " ";
+    getOperand(InlineAsm::MIOp_AsmString).print(OS, TM);
+
+    // Print HasSideEffects, MayLoad, MayStore, IsAlignStack
+    unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+    if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
+      OS << " [sideeffect]";
+    if (ExtraInfo & InlineAsm::Extra_MayLoad)
+      OS << " [mayload]";
+    if (ExtraInfo & InlineAsm::Extra_MayStore)
+      OS << " [maystore]";
+    if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+      OS << " [alignstack]";
+    if (getInlineAsmDialect() == InlineAsm::AD_ATT)
+      OS << " [attdialect]";
+    if (getInlineAsmDialect() == InlineAsm::AD_Intel)
+      OS << " [inteldialect]";
+
+    StartOp = AsmDescOp = InlineAsm::MIOp_FirstOperand;
+    FirstOp = false;
+  }
+
+
+  for (unsigned i = StartOp, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+
+    if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      VirtRegs.push_back(MO.getReg());
+
+    // Omit call-clobbered registers which aren't used anywhere. This makes
+    // call instructions much less noisy on targets where calls clobber lots
+    // of registers. Don't rely on MO.isDead() because we may be called before
+    // LiveVariables is run, or we may be looking at a non-allocatable reg.
+    if (MF && isCall() &&
+        MO.isReg() && MO.isImplicit() && MO.isDef()) {
+      unsigned Reg = MO.getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        const MachineRegisterInfo &MRI = MF->getRegInfo();
+        if (MRI.use_empty(Reg)) {
+          bool HasAliasLive = false;
+          for (MCRegAliasIterator AI(Reg, TM->getRegisterInfo(), true);
+               AI.isValid(); ++AI) {
+            unsigned AliasReg = *AI;
+            if (!MRI.use_empty(AliasReg)) {
+              HasAliasLive = true;
+              break;
+            }
+          }
+          if (!HasAliasLive) {
+            OmittedAnyCallClobbers = true;
+            continue;
+          }
+        }
+      }
+    }
+
+    if (FirstOp) FirstOp = false; else OS << ",";
+    OS << " ";
+    if (i < getDesc().NumOperands) {
+      const MCOperandInfo &MCOI = getDesc().OpInfo[i];
+      if (MCOI.isPredicate())
+        OS << "pred:";
+      if (MCOI.isOptionalDef())
+        OS << "opt:";
+    }
+    if (isDebugValue() && MO.isMetadata()) {
+      // Pretty print DBG_VALUE instructions.
+      const MDNode *MD = MO.getMetadata();
+      if (const MDString *MDS = dyn_cast<MDString>(MD->getOperand(2)))
+        OS << "!\"" << MDS->getString() << '\"';
+      else
+        MO.print(OS, TM);
+    } else if (TM && (isInsertSubreg() || isRegSequence()) && MO.isImm()) {
+      OS << TM->getRegisterInfo()->getSubRegIndexName(MO.getImm());
+    } else if (i == AsmDescOp && MO.isImm()) {
+      // Pretty print the inline asm operand descriptor.
+      OS << '$' << AsmOpCount++;
+      unsigned Flag = MO.getImm();
+      switch (InlineAsm::getKind(Flag)) {
+      case InlineAsm::Kind_RegUse:             OS << ":[reguse"; break;
+      case InlineAsm::Kind_RegDef:             OS << ":[regdef"; break;
+      case InlineAsm::Kind_RegDefEarlyClobber: OS << ":[regdef-ec"; break;
+      case InlineAsm::Kind_Clobber:            OS << ":[clobber"; break;
+      case InlineAsm::Kind_Imm:                OS << ":[imm"; break;
+      case InlineAsm::Kind_Mem:                OS << ":[mem"; break;
+      default: OS << ":[??" << InlineAsm::getKind(Flag); break;
+      }
+
+      unsigned RCID = 0;
+      if (InlineAsm::hasRegClassConstraint(Flag, RCID)) {
+        if (TM)
+          OS << ':' << TM->getRegisterInfo()->getRegClass(RCID)->getName();
+        else
+          OS << ":RC" << RCID;
+      }
+
+      unsigned TiedTo = 0;
+      if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo))
+        OS << " tiedto:$" << TiedTo;
+
+      OS << ']';
+
+      // Compute the index of the next operand descriptor.
+      AsmDescOp += 1 + InlineAsm::getNumOperandRegisters(Flag);
+    } else
+      MO.print(OS, TM);
+  }
+
+  // Briefly indicate whether any call clobbers were omitted.
+  if (OmittedAnyCallClobbers) {
+    if (!FirstOp) OS << ",";
+    OS << " ...";
+  }
+
+  bool HaveSemi = false;
+  const unsigned PrintableFlags = FrameSetup;
+  if (Flags & PrintableFlags) {
+    if (!HaveSemi) OS << ";"; HaveSemi = true;
+    OS << " flags: ";
+
+    if (Flags & FrameSetup)
+      OS << "FrameSetup";
+  }
+
+  if (!memoperands_empty()) {
+    if (!HaveSemi) OS << ";"; HaveSemi = true;
+
+    OS << " mem:";
+    for (mmo_iterator i = memoperands_begin(), e = memoperands_end();
+         i != e; ++i) {
+      OS << **i;
+      if (llvm::next(i) != e)
+        OS << " ";
+    }
+  }
+
+  // Print the regclass of any virtual registers encountered.
+  if (MRI && !VirtRegs.empty()) {
+    if (!HaveSemi) OS << ";"; HaveSemi = true;
+    for (unsigned i = 0; i != VirtRegs.size(); ++i) {
+      const TargetRegisterClass *RC = MRI->getRegClass(VirtRegs[i]);
+      OS << " " << RC->getName() << ':' << PrintReg(VirtRegs[i]);
+      for (unsigned j = i+1; j != VirtRegs.size();) {
+        if (MRI->getRegClass(VirtRegs[j]) != RC) {
+          ++j;
+          continue;
+        }
+        if (VirtRegs[i] != VirtRegs[j])
+          OS << "," << PrintReg(VirtRegs[j]);
+        VirtRegs.erase(VirtRegs.begin()+j);
+      }
+    }
+  }
+
+  // Print debug location information.
+  if (isDebugValue() && getOperand(e - 1).isMetadata()) {
+    if (!HaveSemi) OS << ";"; HaveSemi = true;
+    DIVariable DV(getOperand(e - 1).getMetadata());
+    OS << " line no:" <<  DV.getLineNumber();
+    if (MDNode *InlinedAt = DV.getInlinedAt()) {
+      DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt);
+      if (!InlinedAtDL.isUnknown()) {
+        OS << " inlined @[ ";
+        printDebugLoc(InlinedAtDL, MF, OS);
+        OS << " ]";
+      }
+    }
+  } else if (!debugLoc.isUnknown() && MF) {
+    if (!HaveSemi) OS << ";"; HaveSemi = true;
+    OS << " dbg:";
+    printDebugLoc(debugLoc, MF, OS);
+  }
+
+  OS << '\n';
+}
+
+bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
+                                     const TargetRegisterInfo *RegInfo,
+                                     bool AddIfNotFound) {
+  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+  bool hasAliases = isPhysReg &&
+    MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
+  bool Found = false;
+  SmallVector<unsigned,4> DeadOps;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isUse() || MO.isUndef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+
+    if (Reg == IncomingReg) {
+      if (!Found) {
+        if (MO.isKill())
+          // The register is already marked kill.
+          return true;
+        if (isPhysReg && isRegTiedToDefOperand(i))
+          // Two-address uses of physregs must not be marked kill.
+          return true;
+        MO.setIsKill();
+        Found = true;
+      }
+    } else if (hasAliases && MO.isKill() &&
+               TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      // A super-register kill already exists.
+      if (RegInfo->isSuperRegister(IncomingReg, Reg))
+        return true;
+      if (RegInfo->isSubRegister(IncomingReg, Reg))
+        DeadOps.push_back(i);
+    }
+  }
+
+  // Trim unneeded kill operands.
+  while (!DeadOps.empty()) {
+    unsigned OpIdx = DeadOps.back();
+    if (getOperand(OpIdx).isImplicit())
+      RemoveOperand(OpIdx);
+    else
+      getOperand(OpIdx).setIsKill(false);
+    DeadOps.pop_back();
+  }
+
+  // If not found, this means an alias of one of the operands is killed. Add a
+  // new implicit operand if required.
+  if (!Found && AddIfNotFound) {
+    addOperand(MachineOperand::CreateReg(IncomingReg,
+                                         false /*IsDef*/,
+                                         true  /*IsImp*/,
+                                         true  /*IsKill*/));
+    return true;
+  }
+  return Found;
+}
+
+void MachineInstr::clearRegisterKills(unsigned Reg,
+                                      const TargetRegisterInfo *RegInfo) {
+  if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+    RegInfo = 0;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isUse() || !MO.isKill())
+      continue;
+    unsigned OpReg = MO.getReg();
+    if (OpReg == Reg || (RegInfo && RegInfo->isSuperRegister(Reg, OpReg)))
+      MO.setIsKill(false);
+  }
+}
+
+bool MachineInstr::addRegisterDead(unsigned IncomingReg,
+                                   const TargetRegisterInfo *RegInfo,
+                                   bool AddIfNotFound) {
+  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+  bool hasAliases = isPhysReg &&
+    MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
+  bool Found = false;
+  SmallVector<unsigned,4> DeadOps;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+
+    if (Reg == IncomingReg) {
+      MO.setIsDead();
+      Found = true;
+    } else if (hasAliases && MO.isDead() &&
+               TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      // There exists a super-register that's marked dead.
+      if (RegInfo->isSuperRegister(IncomingReg, Reg))
+        return true;
+      if (RegInfo->isSubRegister(IncomingReg, Reg))
+        DeadOps.push_back(i);
+    }
+  }
+
+  // Trim unneeded dead operands.
+  while (!DeadOps.empty()) {
+    unsigned OpIdx = DeadOps.back();
+    if (getOperand(OpIdx).isImplicit())
+      RemoveOperand(OpIdx);
+    else
+      getOperand(OpIdx).setIsDead(false);
+    DeadOps.pop_back();
+  }
+
+  // If not found, this means an alias of one of the operands is dead. Add a
+  // new implicit operand if required.
+  if (Found || !AddIfNotFound)
+    return Found;
+
+  addOperand(MachineOperand::CreateReg(IncomingReg,
+                                       true  /*IsDef*/,
+                                       true  /*IsImp*/,
+                                       false /*IsKill*/,
+                                       true  /*IsDead*/));
+  return true;
+}
+
+void MachineInstr::addRegisterDefined(unsigned IncomingReg,
+                                      const TargetRegisterInfo *RegInfo) {
+  if (TargetRegisterInfo::isPhysicalRegister(IncomingReg)) {
+    MachineOperand *MO = findRegisterDefOperand(IncomingReg, false, RegInfo);
+    if (MO)
+      return;
+  } else {
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = getOperand(i);
+      if (MO.isReg() && MO.getReg() == IncomingReg && MO.isDef() &&
+          MO.getSubReg() == 0)
+        return;
+    }
+  }
+  addOperand(MachineOperand::CreateReg(IncomingReg,
+                                       true  /*IsDef*/,
+                                       true  /*IsImp*/));
+}
+
+void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
+                                         const TargetRegisterInfo &TRI) {
+  bool HasRegMask = false;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = getOperand(i);
+    if (MO.isRegMask()) {
+      HasRegMask = true;
+      continue;
+    }
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    bool Dead = true;
+    for (ArrayRef<unsigned>::iterator I = UsedRegs.begin(), E = UsedRegs.end();
+         I != E; ++I)
+      if (TRI.regsOverlap(*I, Reg)) {
+        Dead = false;
+        break;
+      }
+    // If there are no uses, including partial uses, the def is dead.
+    if (Dead) MO.setIsDead();
+  }
+
+  // This is a call with a register mask operand.
+  // Mask clobbers are always dead, so add defs for the non-dead defines.
+  if (HasRegMask)
+    for (ArrayRef<unsigned>::iterator I = UsedRegs.begin(), E = UsedRegs.end();
+         I != E; ++I)
+      addRegisterDefined(*I, &TRI);
+}
+
+unsigned
+MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) {
+  // Build up a buffer of hash code components.
+  SmallVector<size_t, 8> HashComponents;
+  HashComponents.reserve(MI->getNumOperands() + 1);
+  HashComponents.push_back(MI->getOpcode());
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef() &&
+        TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;  // Skip virtual register defs.
+
+    HashComponents.push_back(hash_value(MO));
+  }
+  return hash_combine_range(HashComponents.begin(), HashComponents.end());
+}
+
+void MachineInstr::emitError(StringRef Msg) const {
+  // Find the source location cookie.
+  unsigned LocCookie = 0;
+  const MDNode *LocMD = 0;
+  for (unsigned i = getNumOperands(); i != 0; --i) {
+    if (getOperand(i-1).isMetadata() &&
+        (LocMD = getOperand(i-1).getMetadata()) &&
+        LocMD->getNumOperands() != 0) {
+      if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
+        LocCookie = CI->getZExtValue();
+        break;
+      }
+    }
+  }
+
+  if (const MachineBasicBlock *MBB = getParent())
+    if (const MachineFunction *MF = MBB->getParent())
+      return MF->getMMI().getModule()->getContext().emitError(LocCookie, Msg);
+  report_fatal_error(Msg);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp b/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp
new file mode 100644
index 000000000000..77bcd1d7c8e3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -0,0 +1,330 @@
+//===-- lib/CodeGen/MachineInstrBundle.cpp --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+namespace {
+  class UnpackMachineBundles : public MachineFunctionPass {
+  public:
+    static char ID; // Pass identification
+    UnpackMachineBundles() : MachineFunctionPass(ID) {
+      initializeUnpackMachineBundlesPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+  };
+} // end anonymous namespace
+
+char UnpackMachineBundles::ID = 0;
+char &llvm::UnpackMachineBundlesID = UnpackMachineBundles::ID;
+INITIALIZE_PASS(UnpackMachineBundles, "unpack-mi-bundles",
+                "Unpack machine instruction bundles", false, false)
+
+bool UnpackMachineBundles::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = &*I;
+
+    for (MachineBasicBlock::instr_iterator MII = MBB->instr_begin(),
+           MIE = MBB->instr_end(); MII != MIE; ) {
+      MachineInstr *MI = &*MII;
+
+      // Remove BUNDLE instruction and the InsideBundle flags from bundled
+      // instructions.
+      if (MI->isBundle()) {
+        while (++MII != MIE && MII->isBundledWithPred()) {
+          MII->unbundleFromPred();
+          for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) {
+            MachineOperand &MO = MII->getOperand(i);
+            if (MO.isReg() && MO.isInternalRead())
+              MO.setIsInternalRead(false);
+          }
+        }
+        MI->eraseFromParent();
+
+        Changed = true;
+        continue;
+      }
+
+      ++MII;
+    }
+  }
+
+  return Changed;
+}
+
+
+namespace {
+  class FinalizeMachineBundles : public MachineFunctionPass {
+  public:
+    static char ID; // Pass identification
+    FinalizeMachineBundles() : MachineFunctionPass(ID) {
+      initializeFinalizeMachineBundlesPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+  };
+} // end anonymous namespace
+
+char FinalizeMachineBundles::ID = 0;
+char &llvm::FinalizeMachineBundlesID = FinalizeMachineBundles::ID;
+INITIALIZE_PASS(FinalizeMachineBundles, "finalize-mi-bundles",
+                "Finalize machine instruction bundles", false, false)
+
+bool FinalizeMachineBundles::runOnMachineFunction(MachineFunction &MF) {
+  return llvm::finalizeBundles(MF);
+}
+
+
+/// finalizeBundle - Finalize a machine instruction bundle which includes
+/// a sequence of instructions starting from FirstMI to LastMI (exclusive).
+/// This routine adds a BUNDLE instruction to represent the bundle, it adds
+/// IsInternalRead markers to MachineOperands which are defined inside the
+/// bundle, and it copies externally visible defs and uses to the BUNDLE
+/// instruction.
+void llvm::finalizeBundle(MachineBasicBlock &MBB,
+                          MachineBasicBlock::instr_iterator FirstMI,
+                          MachineBasicBlock::instr_iterator LastMI) {
+  assert(FirstMI != LastMI && "Empty bundle?");
+  MIBundleBuilder Bundle(MBB, FirstMI, LastMI);
+
+  const TargetMachine &TM = MBB.getParent()->getTarget();
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+  MachineInstrBuilder MIB = BuildMI(*MBB.getParent(), FirstMI->getDebugLoc(),
+                                    TII->get(TargetOpcode::BUNDLE));
+  Bundle.prepend(MIB);
+
+  SmallVector<unsigned, 32> LocalDefs;
+  SmallSet<unsigned, 32> LocalDefSet;
+  SmallSet<unsigned, 8> DeadDefSet;
+  SmallSet<unsigned, 16> KilledDefSet;
+  SmallVector<unsigned, 8> ExternUses;
+  SmallSet<unsigned, 8> ExternUseSet;
+  SmallSet<unsigned, 8> KilledUseSet;
+  SmallSet<unsigned, 8> UndefUseSet;
+  SmallVector<MachineOperand*, 4> Defs;
+  for (; FirstMI != LastMI; ++FirstMI) {
+    for (unsigned i = 0, e = FirstMI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = FirstMI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      if (MO.isDef()) {
+        Defs.push_back(&MO);
+        continue;
+      }
+
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+      if (LocalDefSet.count(Reg)) {
+        MO.setIsInternalRead();
+        if (MO.isKill())
+          // Internal def is now killed.
+          KilledDefSet.insert(Reg);
+      } else {
+        if (ExternUseSet.insert(Reg)) {
+          ExternUses.push_back(Reg);
+          if (MO.isUndef())
+            UndefUseSet.insert(Reg);
+        }
+        if (MO.isKill())
+          // External def is now killed.
+          KilledUseSet.insert(Reg);
+      }
+    }
+
+    for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+      MachineOperand &MO = *Defs[i];
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+
+      if (LocalDefSet.insert(Reg)) {
+        LocalDefs.push_back(Reg);
+        if (MO.isDead()) {
+          DeadDefSet.insert(Reg);
+        }
+      } else {
+        // Re-defined inside the bundle, it's no longer killed.
+        KilledDefSet.erase(Reg);
+        if (!MO.isDead())
+          // Previously defined but dead.
+          DeadDefSet.erase(Reg);
+      }
+
+      if (!MO.isDead()) {
+        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+          unsigned SubReg = *SubRegs;
+          if (LocalDefSet.insert(SubReg))
+            LocalDefs.push_back(SubReg);
+        }
+      }
+    }
+
+    Defs.clear();
+  }
+
+  SmallSet<unsigned, 32> Added;
+  for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+    unsigned Reg = LocalDefs[i];
+    if (Added.insert(Reg)) {
+      // If it's not live beyond end of the bundle, mark it dead.
+      bool isDead = DeadDefSet.count(Reg) || KilledDefSet.count(Reg);
+      MIB.addReg(Reg, getDefRegState(true) | getDeadRegState(isDead) |
+                 getImplRegState(true));
+    }
+  }
+
+  for (unsigned i = 0, e = ExternUses.size(); i != e; ++i) {
+    unsigned Reg = ExternUses[i];
+    bool isKill = KilledUseSet.count(Reg);
+    bool isUndef = UndefUseSet.count(Reg);
+    MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) |
+               getImplRegState(true));
+  }
+}
+
+/// finalizeBundle - Same functionality as the previous finalizeBundle except
+/// the last instruction in the bundle is not provided as an input. This is
+/// used in cases where bundles are pre-determined by marking instructions
+/// with 'InsideBundle' marker. It returns the MBB instruction iterator that
+/// points to the end of the bundle.
+MachineBasicBlock::instr_iterator
+llvm::finalizeBundle(MachineBasicBlock &MBB,
+                     MachineBasicBlock::instr_iterator FirstMI) {
+  MachineBasicBlock::instr_iterator E = MBB.instr_end();
+  MachineBasicBlock::instr_iterator LastMI = llvm::next(FirstMI);
+  while (LastMI != E && LastMI->isInsideBundle())
+    ++LastMI;
+  finalizeBundle(MBB, FirstMI, LastMI);
+  return LastMI;
+}
+
+/// finalizeBundles - Finalize instruction bundles in the specified
+/// MachineFunction. Return true if any bundles are finalized.
+bool llvm::finalizeBundles(MachineFunction &MF) {
+  bool Changed = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock &MBB = *I;
+    MachineBasicBlock::instr_iterator MII = MBB.instr_begin();
+    MachineBasicBlock::instr_iterator MIE = MBB.instr_end();
+    if (MII == MIE)
+      continue;
+    assert(!MII->isInsideBundle() &&
+           "First instr cannot be inside bundle before finalization!");
+
+    for (++MII; MII != MIE; ) {
+      if (!MII->isInsideBundle())
+        ++MII;
+      else {
+        MII = finalizeBundle(MBB, llvm::prior(MII));
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// MachineOperand iterator
+//===----------------------------------------------------------------------===//
+
+MachineOperandIteratorBase::VirtRegInfo
+MachineOperandIteratorBase::analyzeVirtReg(unsigned Reg,
+                    SmallVectorImpl<std::pair<MachineInstr*, unsigned> > *Ops) {
+  VirtRegInfo RI = { false, false, false };
+  for(; isValid(); ++*this) {
+    MachineOperand &MO = deref();
+    if (!MO.isReg() || MO.getReg() != Reg)
+      continue;
+
+    // Remember each (MI, OpNo) that refers to Reg.
+    if (Ops)
+      Ops->push_back(std::make_pair(MO.getParent(), getOperandNo()));
+
+    // Both defs and uses can read virtual registers.
+    if (MO.readsReg()) {
+      RI.Reads = true;
+      if (MO.isDef())
+        RI.Tied = true;
+    }
+
+    // Only defs can write.
+    if (MO.isDef())
+      RI.Writes = true;
+    else if (!RI.Tied && MO.getParent()->isRegTiedToDefOperand(getOperandNo()))
+      RI.Tied = true;
+  }
+  return RI;
+}
+
+MachineOperandIteratorBase::PhysRegInfo
+MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
+                                           const TargetRegisterInfo *TRI) {
+  bool AllDefsDead = true;
+  PhysRegInfo PRI = {false, false, false, false, false, false};
+
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         "analyzePhysReg not given a physical register!");
+  for (; isValid(); ++*this) {
+    MachineOperand &MO = deref();
+
+    if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
+      PRI.Clobbers = true;    // Regmask clobbers Reg.
+
+    if (!MO.isReg())
+      continue;
+
+    unsigned MOReg = MO.getReg();
+    if (!MOReg || !TargetRegisterInfo::isPhysicalRegister(MOReg))
+      continue;
+
+    bool IsRegOrSuperReg = MOReg == Reg || TRI->isSubRegister(MOReg, Reg);
+    bool IsRegOrOverlapping = MOReg == Reg || TRI->regsOverlap(MOReg, Reg);
+
+    if (IsRegOrSuperReg && MO.readsReg()) {
+      // Reg or a super-reg is read, and perhaps killed also.
+      PRI.Reads = true;
+      PRI.Kills = MO.isKill();
+    }
+
+    if (IsRegOrOverlapping && MO.readsReg()) {
+      PRI.ReadsOverlap = true;// Reg or an overlapping register is read.
+    }
+
+    if (!MO.isDef())
+      continue;
+
+    if (IsRegOrSuperReg) {
+      PRI.Defines = true;     // Reg or a super-register is defined.
+      if (!MO.isDead())
+        AllDefsDead = false;
+    }
+    if (IsRegOrOverlapping)
+      PRI.Clobbers = true;    // Reg or an overlapping reg is defined.
+  }
+
+  if (AllDefsDead && PRI.Defines)
+    PRI.DefinesDead = true;   // Reg or super-register was defined and was dead.
+
+  return PRI;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineLICM.cpp b/contrib/llvm/lib/CodeGen/MachineLICM.cpp
new file mode 100644
index 000000000000..ed3ed4d4d916
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineLICM.cpp
@@ -0,0 +1,1489 @@
+//===-- MachineLICM.cpp - Machine Loop Invariant Code Motion Pass ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion on machine instructions. We
+// attempt to remove as much code from the body of a loop as possible.
+//
+// This pass does not attempt to throttle itself to limit register pressure.
+// The register allocation phases are expected to perform rematerialization
+// to recover when register pressure is high.
+//
+// This pass is not intended to be a replacement or a complete alternative
+// for the LLVM-IR-level LICM pass. It is only designed to hoist simple
+// constructs that are not exposed before lowering and instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-licm"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static cl::opt<bool>
+AvoidSpeculation("avoid-speculation",
+                 cl::desc("MachineLICM should avoid speculation"),
+                 cl::init(true), cl::Hidden);
+
+STATISTIC(NumHoisted,
+          "Number of machine instructions hoisted out of loops");
+STATISTIC(NumLowRP,
+          "Number of instructions hoisted in low reg pressure situation");
+STATISTIC(NumHighLatency,
+          "Number of high latency instructions hoisted");
+STATISTIC(NumCSEed,
+          "Number of hoisted machine instructions CSEed");
+STATISTIC(NumPostRAHoisted,
+          "Number of machine instructions hoisted out of loops post regalloc");
+
+namespace {
+  class MachineLICM : public MachineFunctionPass {
+    const TargetMachine   *TM;
+    const TargetInstrInfo *TII;
+    const TargetLoweringBase *TLI;
+    const TargetRegisterInfo *TRI;
+    const MachineFrameInfo *MFI;
+    MachineRegisterInfo *MRI;
+    const InstrItineraryData *InstrItins;
+    bool PreRegAlloc;
+
+    // Various analyses that we use...
+    AliasAnalysis        *AA;      // Alias analysis info.
+    MachineLoopInfo      *MLI;     // Current MachineLoopInfo
+    MachineDominatorTree *DT;      // Machine dominator tree for the cur loop
+
+    // State that is updated as we process loops
+    bool         Changed;          // True if a loop is changed.
+    bool         FirstInLoop;      // True if it's the first LICM in the loop.
+    MachineLoop *CurLoop;          // The current loop we are working on.
+    MachineBasicBlock *CurPreheader; // The preheader for CurLoop.
+
+    // Exit blocks for CurLoop.
+    SmallVector<MachineBasicBlock*, 8> ExitBlocks;
+
+    bool isExitBlock(const MachineBasicBlock *MBB) const {
+      return std::find(ExitBlocks.begin(), ExitBlocks.end(), MBB) !=
+        ExitBlocks.end();
+    }
+
+    // Track 'estimated' register pressure.
+    SmallSet<unsigned, 32> RegSeen;
+    SmallVector<unsigned, 8> RegPressure;
+
+    // Register pressure "limit" per register class. If the pressure
+    // is higher than the limit, then it's considered high.
+    SmallVector<unsigned, 8> RegLimit;
+
+    // Register pressure on path leading from loop preheader to current BB.
+    SmallVector<SmallVector<unsigned, 8>, 16> BackTrace;
+
+    // For each opcode, keep a list of potential CSE instructions.
+    DenseMap<unsigned, std::vector<const MachineInstr*> > CSEMap;
+
+    enum {
+      SpeculateFalse   = 0,
+      SpeculateTrue    = 1,
+      SpeculateUnknown = 2
+    };
+
+    // If a MBB does not dominate loop exiting blocks then it may not safe
+    // to hoist loads from this block.
+    // Tri-state: 0 - false, 1 - true, 2 - unknown
+    unsigned SpeculationState;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    MachineLICM() :
+      MachineFunctionPass(ID), PreRegAlloc(true) {
+        initializeMachineLICMPass(*PassRegistry::getPassRegistry());
+      }
+
+    explicit MachineLICM(bool PreRA) :
+      MachineFunctionPass(ID), PreRegAlloc(PreRA) {
+        initializeMachineLICMPass(*PassRegistry::getPassRegistry());
+      }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineLoopInfo>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<AliasAnalysis>();
+      AU.addPreserved<MachineLoopInfo>();
+      AU.addPreserved<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual void releaseMemory() {
+      RegSeen.clear();
+      RegPressure.clear();
+      RegLimit.clear();
+      BackTrace.clear();
+      for (DenseMap<unsigned,std::vector<const MachineInstr*> >::iterator
+             CI = CSEMap.begin(), CE = CSEMap.end(); CI != CE; ++CI)
+        CI->second.clear();
+      CSEMap.clear();
+    }
+
+  private:
+    /// CandidateInfo - Keep track of information about hoisting candidates.
+    struct CandidateInfo {
+      MachineInstr *MI;
+      unsigned      Def;
+      int           FI;
+      CandidateInfo(MachineInstr *mi, unsigned def, int fi)
+        : MI(mi), Def(def), FI(fi) {}
+    };
+
+    /// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop
+    /// invariants out to the preheader.
+    void HoistRegionPostRA();
+
+    /// HoistPostRA - When an instruction is found to only use loop invariant
+    /// operands that is safe to hoist, this instruction is called to do the
+    /// dirty work.
+    void HoistPostRA(MachineInstr *MI, unsigned Def);
+
+    /// ProcessMI - Examine the instruction for potentai LICM candidate. Also
+    /// gather register def and frame object update information.
+    void ProcessMI(MachineInstr *MI,
+                   BitVector &PhysRegDefs,
+                   BitVector &PhysRegClobbers,
+                   SmallSet<int, 32> &StoredFIs,
+                   SmallVector<CandidateInfo, 32> &Candidates);
+
+    /// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the
+    /// current loop.
+    void AddToLiveIns(unsigned Reg);
+
+    /// IsLICMCandidate - Returns true if the instruction may be a suitable
+    /// candidate for LICM. e.g. If the instruction is a call, then it's
+    /// obviously not safe to hoist it.
+    bool IsLICMCandidate(MachineInstr &I);
+
+    /// IsLoopInvariantInst - Returns true if the instruction is loop
+    /// invariant. I.e., all virtual register operands are defined outside of
+    /// the loop, physical registers aren't accessed (explicitly or implicitly),
+    /// and the instruction is hoistable.
+    ///
+    bool IsLoopInvariantInst(MachineInstr &I);
+
+    /// HasLoopPHIUse - Return true if the specified instruction is used by any
+    /// phi node in the current loop.
+    bool HasLoopPHIUse(const MachineInstr *MI) const;
+
+    /// HasHighOperandLatency - Compute operand latency between a def of 'Reg'
+    /// and an use in the current loop, return true if the target considered
+    /// it 'high'.
+    bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx,
+                               unsigned Reg) const;
+
+    bool IsCheapInstruction(MachineInstr &MI) const;
+
+    /// CanCauseHighRegPressure - Visit BBs from header to current BB,
+    /// check if hoisting an instruction of the given cost matrix can cause high
+    /// register pressure.
+    bool CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost, bool Cheap);
+
+    /// UpdateBackTraceRegPressure - Traverse the back trace from header to
+    /// the current block and update their register pressures to reflect the
+    /// effect of hoisting MI from the current block to the preheader.
+    void UpdateBackTraceRegPressure(const MachineInstr *MI);
+
+    /// IsProfitableToHoist - Return true if it is potentially profitable to
+    /// hoist the given loop invariant.
+    bool IsProfitableToHoist(MachineInstr &MI);
+
+    /// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute.
+    /// If not then a load from this mbb may not be safe to hoist.
+    bool IsGuaranteedToExecute(MachineBasicBlock *BB);
+
+    void EnterScope(MachineBasicBlock *MBB);
+
+    void ExitScope(MachineBasicBlock *MBB);
+
+    /// ExitScopeIfDone - Destroy scope for the MBB that corresponds to given
+    /// dominator tree node if its a leaf or all of its children are done. Walk
+    /// up the dominator tree to destroy ancestors which are now done.
+    void ExitScopeIfDone(MachineDomTreeNode *Node,
+                DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
+                DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap);
+
+    /// HoistOutOfLoop - Walk the specified loop in the CFG (defined by all
+    /// blocks dominated by the specified header block, and that are in the
+    /// current loop) in depth first order w.r.t the DominatorTree. This allows
+    /// us to visit definitions before uses, allowing us to hoist a loop body in
+    /// one pass without iteration.
+    ///
+    void HoistOutOfLoop(MachineDomTreeNode *LoopHeaderNode);
+    void HoistRegion(MachineDomTreeNode *N, bool IsHeader);
+
+    /// getRegisterClassIDAndCost - For a given MI, register, and the operand
+    /// index, return the ID and cost of its representative register class by
+    /// reference.
+    void getRegisterClassIDAndCost(const MachineInstr *MI,
+                                   unsigned Reg, unsigned OpIdx,
+                                   unsigned &RCId, unsigned &RCCost) const;
+
+    /// InitRegPressure - Find all virtual register references that are liveout
+    /// of the preheader to initialize the starting "register pressure". Note
+    /// this does not count live through (livein but not used) registers.
+    void InitRegPressure(MachineBasicBlock *BB);
+
+    /// UpdateRegPressure - Update estimate of register pressure after the
+    /// specified instruction.
+    void UpdateRegPressure(const MachineInstr *MI);
+
+    /// ExtractHoistableLoad - Unfold a load from the given machineinstr if
+    /// the load itself could be hoisted. Return the unfolded and hoistable
+    /// load, or null if the load couldn't be unfolded or if it wouldn't
+    /// be hoistable.
+    MachineInstr *ExtractHoistableLoad(MachineInstr *MI);
+
+    /// LookForDuplicate - Find an instruction amount PrevMIs that is a
+    /// duplicate of MI. Return this instruction if it's found.
+    const MachineInstr *LookForDuplicate(const MachineInstr *MI,
+                                     std::vector<const MachineInstr*> &PrevMIs);
+
+    /// EliminateCSE - Given a LICM'ed instruction, look for an instruction on
+    /// the preheader that compute the same value. If it's found, do a RAU on
+    /// with the definition of the existing instruction rather than hoisting
+    /// the instruction to the preheader.
+    bool EliminateCSE(MachineInstr *MI,
+           DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI);
+
+    /// MayCSE - Return true if the given instruction will be CSE'd if it's
+    /// hoisted out of the loop.
+    bool MayCSE(MachineInstr *MI);
+
+    /// Hoist - When an instruction is found to only use loop invariant operands
+    /// that is safe to hoist, this instruction is called to do the dirty work.
+    /// It returns true if the instruction is hoisted.
+    bool Hoist(MachineInstr *MI, MachineBasicBlock *Preheader);
+
+    /// InitCSEMap - Initialize the CSE map with instructions that are in the
+    /// current loop preheader that may become duplicates of instructions that
+    /// are hoisted out of the loop.
+    void InitCSEMap(MachineBasicBlock *BB);
+
+    /// getCurPreheader - Get the preheader for the current loop, splitting
+    /// a critical edge if needed.
+    MachineBasicBlock *getCurPreheader();
+  };
+} // end anonymous namespace
+
+char MachineLICM::ID = 0;
+char &llvm::MachineLICMID = MachineLICM::ID;
+INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm",
+                "Machine Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(MachineLICM, "machinelicm",
+                "Machine Loop Invariant Code Motion", false, false)
+
+/// LoopIsOuterMostWithPredecessor - Test if the given loop is the outer-most
+/// loop that has a unique predecessor.
+static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) {
+  // Check whether this loop even has a unique predecessor.
+  if (!CurLoop->getLoopPredecessor())
+    return false;
+  // Ok, now check to see if any of its outer loops do.
+  for (MachineLoop *L = CurLoop->getParentLoop(); L; L = L->getParentLoop())
+    if (L->getLoopPredecessor())
+      return false;
+  // None of them did, so this is the outermost with a unique predecessor.
+  return true;
+}
+
+bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
+  Changed = FirstInLoop = false;
+  TM = &MF.getTarget();
+  TII = TM->getInstrInfo();
+  TLI = TM->getTargetLowering();
+  TRI = TM->getRegisterInfo();
+  MFI = MF.getFrameInfo();
+  MRI = &MF.getRegInfo();
+  InstrItins = TM->getInstrItineraryData();
+
+  PreRegAlloc = MRI->isSSA();
+
+  if (PreRegAlloc)
+    DEBUG(dbgs() << "******** Pre-regalloc Machine LICM: ");
+  else
+    DEBUG(dbgs() << "******** Post-regalloc Machine LICM: ");
+  DEBUG(dbgs() << MF.getName() << " ********\n");
+
+  if (PreRegAlloc) {
+    // Estimate register pressure during pre-regalloc pass.
+    unsigned NumRC = TRI->getNumRegClasses();
+    RegPressure.resize(NumRC);
+    std::fill(RegPressure.begin(), RegPressure.end(), 0);
+    RegLimit.resize(NumRC);
+    for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+           E = TRI->regclass_end(); I != E; ++I)
+      RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, MF);
+  }
+
+  // Get our Loop information...
+  MLI = &getAnalysis<MachineLoopInfo>();
+  DT  = &getAnalysis<MachineDominatorTree>();
+  AA  = &getAnalysis<AliasAnalysis>();
+
+  SmallVector<MachineLoop *, 8> Worklist(MLI->begin(), MLI->end());
+  while (!Worklist.empty()) {
+    CurLoop = Worklist.pop_back_val();
+    CurPreheader = 0;
+    ExitBlocks.clear();
+
+    // If this is done before regalloc, only visit outer-most preheader-sporting
+    // loops.
+    if (PreRegAlloc && !LoopIsOuterMostWithPredecessor(CurLoop)) {
+      Worklist.append(CurLoop->begin(), CurLoop->end());
+      continue;
+    }
+
+    CurLoop->getExitBlocks(ExitBlocks);
+
+    if (!PreRegAlloc)
+      HoistRegionPostRA();
+    else {
+      // CSEMap is initialized for loop header when the first instruction is
+      // being hoisted.
+      MachineDomTreeNode *N = DT->getNode(CurLoop->getHeader());
+      FirstInLoop = true;
+      HoistOutOfLoop(N);
+      CSEMap.clear();
+    }
+  }
+
+  return Changed;
+}
+
+/// InstructionStoresToFI - Return true if instruction stores to the
+/// specified frame.
+static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
+  for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+         oe = MI->memoperands_end(); o != oe; ++o) {
+    if (!(*o)->isStore() || !(*o)->getValue())
+      continue;
+    if (const FixedStackPseudoSourceValue *Value =
+        dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+      if (Value->getFrameIndex() == FI)
+        return true;
+    }
+  }
+  return false;
+}
+
+/// ProcessMI - Examine the instruction for potentai LICM candidate. Also
+/// gather register def and frame object update information.
+void MachineLICM::ProcessMI(MachineInstr *MI,
+                            BitVector &PhysRegDefs,
+                            BitVector &PhysRegClobbers,
+                            SmallSet<int, 32> &StoredFIs,
+                            SmallVector<CandidateInfo, 32> &Candidates) {
+  bool RuledOut = false;
+  bool HasNonInvariantUse = false;
+  unsigned Def = 0;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isFI()) {
+      // Remember if the instruction stores to the frame index.
+      int FI = MO.getIndex();
+      if (!StoredFIs.count(FI) &&
+          MFI->isSpillSlotObjectIndex(FI) &&
+          InstructionStoresToFI(MI, FI))
+        StoredFIs.insert(FI);
+      HasNonInvariantUse = true;
+      continue;
+    }
+
+    // We can't hoist an instruction defining a physreg that is clobbered in
+    // the loop.
+    if (MO.isRegMask()) {
+      PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
+      continue;
+    }
+
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+           "Not expecting virtual register!");
+
+    if (!MO.isDef()) {
+      if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
+        // If it's using a non-loop-invariant register, then it's obviously not
+        // safe to hoist.
+        HasNonInvariantUse = true;
+      continue;
+    }
+
+    if (MO.isImplicit()) {
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        PhysRegClobbers.set(*AI);
+      if (!MO.isDead())
+        // Non-dead implicit def? This cannot be hoisted.
+        RuledOut = true;
+      // No need to check if a dead implicit def is also defined by
+      // another instruction.
+      continue;
+    }
+
+    // FIXME: For now, avoid instructions with multiple defs, unless
+    // it's a dead implicit def.
+    if (Def)
+      RuledOut = true;
+    else
+      Def = Reg;
+
+    // If we have already seen another instruction that defines the same
+    // register, then this is not safe.  Two defs is indicated by setting a
+    // PhysRegClobbers bit.
+    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
+      if (PhysRegDefs.test(*AS))
+        PhysRegClobbers.set(*AS);
+      if (PhysRegClobbers.test(*AS))
+        // MI defined register is seen defined by another instruction in
+        // the loop, it cannot be a LICM candidate.
+        RuledOut = true;
+      PhysRegDefs.set(*AS);
+    }
+  }
+
+  // Only consider reloads for now and remats which do not have register
+  // operands. FIXME: Consider unfold load folding instructions.
+  if (Def && !RuledOut) {
+    int FI = INT_MIN;
+    if ((!HasNonInvariantUse && IsLICMCandidate(*MI)) ||
+        (TII->isLoadFromStackSlot(MI, FI) && MFI->isSpillSlotObjectIndex(FI)))
+      Candidates.push_back(CandidateInfo(MI, Def, FI));
+  }
+}
+
+/// HoistRegionPostRA - Walk the specified region of the CFG and hoist loop
+/// invariants out to the preheader.
+void MachineLICM::HoistRegionPostRA() {
+  MachineBasicBlock *Preheader = getCurPreheader();
+  if (!Preheader)
+    return;
+
+  unsigned NumRegs = TRI->getNumRegs();
+  BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
+  BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
+
+  SmallVector<CandidateInfo, 32> Candidates;
+  SmallSet<int, 32> StoredFIs;
+
+  // Walk the entire region, count number of defs for each register, and
+  // collect potential LICM candidates.
+  const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *BB = Blocks[i];
+
+    // If the header of the loop containing this basic block is a landing pad,
+    // then don't try to hoist instructions out of this loop.
+    const MachineLoop *ML = MLI->getLoopFor(BB);
+    if (ML && ML->getHeader()->isLandingPad()) continue;
+
+    // Conservatively treat live-in's as an external def.
+    // FIXME: That means a reload that're reused in successor block(s) will not
+    // be LICM'ed.
+    for (MachineBasicBlock::livein_iterator I = BB->livein_begin(),
+           E = BB->livein_end(); I != E; ++I) {
+      unsigned Reg = *I;
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        PhysRegDefs.set(*AI);
+    }
+
+    SpeculationState = SpeculateUnknown;
+    for (MachineBasicBlock::iterator
+           MII = BB->begin(), E = BB->end(); MII != E; ++MII) {
+      MachineInstr *MI = &*MII;
+      ProcessMI(MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates);
+    }
+  }
+
+  // Gather the registers read / clobbered by the terminator.
+  BitVector TermRegs(NumRegs);
+  MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
+  if (TI != Preheader->end()) {
+    for (unsigned i = 0, e = TI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = TI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        TermRegs.set(*AI);
+    }
+  }
+
+  // Now evaluate whether the potential candidates qualify.
+  // 1. Check if the candidate defined register is defined by another
+  //    instruction in the loop.
+  // 2. If the candidate is a load from stack slot (always true for now),
+  //    check if the slot is stored anywhere in the loop.
+  // 3. Make sure candidate def should not clobber
+  //    registers read by the terminator. Similarly its def should not be
+  //    clobbered by the terminator.
+  for (unsigned i = 0, e = Candidates.size(); i != e; ++i) {
+    if (Candidates[i].FI != INT_MIN &&
+        StoredFIs.count(Candidates[i].FI))
+      continue;
+
+    unsigned Def = Candidates[i].Def;
+    if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
+      bool Safe = true;
+      MachineInstr *MI = Candidates[i].MI;
+      for (unsigned j = 0, ee = MI->getNumOperands(); j != ee; ++j) {
+        const MachineOperand &MO = MI->getOperand(j);
+        if (!MO.isReg() || MO.isDef() || !MO.getReg())
+          continue;
+        unsigned Reg = MO.getReg();
+        if (PhysRegDefs.test(Reg) ||
+            PhysRegClobbers.test(Reg)) {
+          // If it's using a non-loop-invariant register, then it's obviously
+          // not safe to hoist.
+          Safe = false;
+          break;
+        }
+      }
+      if (Safe)
+        HoistPostRA(MI, Candidates[i].Def);
+    }
+  }
+}
+
+/// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the current
+/// loop, and make sure it is not killed by any instructions in the loop.
+void MachineLICM::AddToLiveIns(unsigned Reg) {
+  const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *BB = Blocks[i];
+    if (!BB->isLiveIn(Reg))
+      BB->addLiveIn(Reg);
+    for (MachineBasicBlock::iterator
+           MII = BB->begin(), E = BB->end(); MII != E; ++MII) {
+      MachineInstr *MI = &*MII;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isReg() || !MO.getReg() || MO.isDef()) continue;
+        if (MO.getReg() == Reg || TRI->isSuperRegister(Reg, MO.getReg()))
+          MO.setIsKill(false);
+      }
+    }
+  }
+}
+
+/// HoistPostRA - When an instruction is found to only use loop invariant
+/// operands that is safe to hoist, this instruction is called to do the
+/// dirty work.
+void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) {
+  MachineBasicBlock *Preheader = getCurPreheader();
+
+  // Now move the instructions to the predecessor, inserting it before any
+  // terminator instructions.
+  DEBUG(dbgs() << "Hoisting to BB#" << Preheader->getNumber() << " from BB#"
+               << MI->getParent()->getNumber() << ": " << *MI);
+
+  // Splice the instruction to the preheader.
+  MachineBasicBlock *MBB = MI->getParent();
+  Preheader->splice(Preheader->getFirstTerminator(), MBB, MI);
+
+  // Add register to livein list to all the BBs in the current loop since a
+  // loop invariant must be kept live throughout the whole loop. This is
+  // important to ensure later passes do not scavenge the def register.
+  AddToLiveIns(Def);
+
+  ++NumPostRAHoisted;
+  Changed = true;
+}
+
+// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute.
+// If not then a load from this mbb may not be safe to hoist.
+bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) {
+  if (SpeculationState != SpeculateUnknown)
+    return SpeculationState == SpeculateFalse;
+
+  if (BB != CurLoop->getHeader()) {
+    // Check loop exiting blocks.
+    SmallVector<MachineBasicBlock*, 8> CurrentLoopExitingBlocks;
+    CurLoop->getExitingBlocks(CurrentLoopExitingBlocks);
+    for (unsigned i = 0, e = CurrentLoopExitingBlocks.size(); i != e; ++i)
+      if (!DT->dominates(BB, CurrentLoopExitingBlocks[i])) {
+        SpeculationState = SpeculateTrue;
+        return false;
+      }
+  }
+
+  SpeculationState = SpeculateFalse;
+  return true;
+}
+
+void MachineLICM::EnterScope(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
+
+  // Remember livein register pressure.
+  BackTrace.push_back(RegPressure);
+}
+
+void MachineLICM::ExitScope(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
+  BackTrace.pop_back();
+}
+
+/// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given
+/// dominator tree node if its a leaf or all of its children are done. Walk
+/// up the dominator tree to destroy ancestors which are now done.
+void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node,
+                DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
+                DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) {
+  if (OpenChildren[Node])
+    return;
+
+  // Pop scope.
+  ExitScope(Node->getBlock());
+
+  // Now traverse upwards to pop ancestors whose offsprings are all done.
+  while (MachineDomTreeNode *Parent = ParentMap[Node]) {
+    unsigned Left = --OpenChildren[Parent];
+    if (Left != 0)
+      break;
+    ExitScope(Parent->getBlock());
+    Node = Parent;
+  }
+}
+
+/// HoistOutOfLoop - Walk the specified loop in the CFG (defined by all
+/// blocks dominated by the specified header block, and that are in the
+/// current loop) in depth first order w.r.t the DominatorTree. This allows
+/// us to visit definitions before uses, allowing us to hoist a loop body in
+/// one pass without iteration.
+///
+void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
+  SmallVector<MachineDomTreeNode*, 32> Scopes;
+  SmallVector<MachineDomTreeNode*, 8> WorkList;
+  DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap;
+  DenseMap<MachineDomTreeNode*, unsigned> OpenChildren;
+
+  // Perform a DFS walk to determine the order of visit.
+  WorkList.push_back(HeaderN);
+  do {
+    MachineDomTreeNode *Node = WorkList.pop_back_val();
+    assert(Node != 0 && "Null dominator tree node?");
+    MachineBasicBlock *BB = Node->getBlock();
+
+    // If the header of the loop containing this basic block is a landing pad,
+    // then don't try to hoist instructions out of this loop.
+    const MachineLoop *ML = MLI->getLoopFor(BB);
+    if (ML && ML->getHeader()->isLandingPad())
+      continue;
+
+    // If this subregion is not in the top level loop at all, exit.
+    if (!CurLoop->contains(BB))
+      continue;
+
+    Scopes.push_back(Node);
+    const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
+    unsigned NumChildren = Children.size();
+
+    // Don't hoist things out of a large switch statement.  This often causes
+    // code to be hoisted that wasn't going to be executed, and increases
+    // register pressure in a situation where it's likely to matter.
+    if (BB->succ_size() >= 25)
+      NumChildren = 0;
+
+    OpenChildren[Node] = NumChildren;
+    // Add children in reverse order as then the next popped worklist node is
+    // the first child of this node.  This means we ultimately traverse the
+    // DOM tree in exactly the same order as if we'd recursed.
+    for (int i = (int)NumChildren-1; i >= 0; --i) {
+      MachineDomTreeNode *Child = Children[i];
+      ParentMap[Child] = Node;
+      WorkList.push_back(Child);
+    }
+  } while (!WorkList.empty());
+
+  if (Scopes.size() != 0) {
+    MachineBasicBlock *Preheader = getCurPreheader();
+    if (!Preheader)
+      return;
+
+    // Compute registers which are livein into the loop headers.
+    RegSeen.clear();
+    BackTrace.clear();
+    InitRegPressure(Preheader);
+  }
+
+  // Now perform LICM.
+  for (unsigned i = 0, e = Scopes.size(); i != e; ++i) {
+    MachineDomTreeNode *Node = Scopes[i];
+    MachineBasicBlock *MBB = Node->getBlock();
+
+    MachineBasicBlock *Preheader = getCurPreheader();
+    if (!Preheader)
+      continue;
+
+    EnterScope(MBB);
+
+    // Process the block
+    SpeculationState = SpeculateUnknown;
+    for (MachineBasicBlock::iterator
+         MII = MBB->begin(), E = MBB->end(); MII != E; ) {
+      MachineBasicBlock::iterator NextMII = MII; ++NextMII;
+      MachineInstr *MI = &*MII;
+      if (!Hoist(MI, Preheader))
+        UpdateRegPressure(MI);
+      MII = NextMII;
+    }
+
+    // If it's a leaf node, it's done. Traverse upwards to pop ancestors.
+    ExitScopeIfDone(Node, OpenChildren, ParentMap);
+  }
+}
+
+static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) {
+  return MO.isKill() || MRI->hasOneNonDBGUse(MO.getReg());
+}
+
+/// getRegisterClassIDAndCost - For a given MI, register, and the operand
+/// index, return the ID and cost of its representative register class.
+void
+MachineLICM::getRegisterClassIDAndCost(const MachineInstr *MI,
+                                       unsigned Reg, unsigned OpIdx,
+                                       unsigned &RCId, unsigned &RCCost) const {
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+  MVT VT = *RC->vt_begin();
+  if (VT == MVT::Untyped) {
+    RCId = RC->getID();
+    RCCost = 1;
+  } else {
+    RCId = TLI->getRepRegClassFor(VT)->getID();
+    RCCost = TLI->getRepRegClassCostFor(VT);
+  }
+}
+
+/// InitRegPressure - Find all virtual register references that are liveout of
+/// the preheader to initialize the starting "register pressure". Note this
+/// does not count live through (livein but not used) registers.
+void MachineLICM::InitRegPressure(MachineBasicBlock *BB) {
+  std::fill(RegPressure.begin(), RegPressure.end(), 0);
+
+  // If the preheader has only a single predecessor and it ends with a
+  // fallthrough or an unconditional branch, then scan its predecessor for live
+  // defs as well. This happens whenever the preheader is created by splitting
+  // the critical edge from the loop predecessor to the loop header.
+  if (BB->pred_size() == 1) {
+    MachineBasicBlock *TBB = 0, *FBB = 0;
+    SmallVector<MachineOperand, 4> Cond;
+    if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond, false) && Cond.empty())
+      InitRegPressure(*BB->pred_begin());
+  }
+
+  for (MachineBasicBlock::iterator MII = BB->begin(), E = BB->end();
+       MII != E; ++MII) {
+    MachineInstr *MI = &*MII;
+    for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || MO.isImplicit())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(Reg))
+        continue;
+
+      bool isNew = RegSeen.insert(Reg);
+      unsigned RCId, RCCost;
+      getRegisterClassIDAndCost(MI, Reg, i, RCId, RCCost);
+      if (MO.isDef())
+        RegPressure[RCId] += RCCost;
+      else {
+        bool isKill = isOperandKill(MO, MRI);
+        if (isNew && !isKill)
+          // Haven't seen this, it must be a livein.
+          RegPressure[RCId] += RCCost;
+        else if (!isNew && isKill)
+          RegPressure[RCId] -= RCCost;
+      }
+    }
+  }
+}
+
+/// UpdateRegPressure - Update estimate of register pressure after the
+/// specified instruction.
+void MachineLICM::UpdateRegPressure(const MachineInstr *MI) {
+  if (MI->isImplicitDef())
+    return;
+
+  SmallVector<unsigned, 4> Defs;
+  for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+
+    bool isNew = RegSeen.insert(Reg);
+    if (MO.isDef())
+      Defs.push_back(Reg);
+    else if (!isNew && isOperandKill(MO, MRI)) {
+      unsigned RCId, RCCost;
+      getRegisterClassIDAndCost(MI, Reg, i, RCId, RCCost);
+      if (RCCost > RegPressure[RCId])
+        RegPressure[RCId] = 0;
+      else
+        RegPressure[RCId] -= RCCost;
+    }
+  }
+
+  unsigned Idx = 0;
+  while (!Defs.empty()) {
+    unsigned Reg = Defs.pop_back_val();
+    unsigned RCId, RCCost;
+    getRegisterClassIDAndCost(MI, Reg, Idx, RCId, RCCost);
+    RegPressure[RCId] += RCCost;
+    ++Idx;
+  }
+}
+
+/// isLoadFromGOTOrConstantPool - Return true if this machine instruction
+/// loads from global offset table or constant pool.
+static bool isLoadFromGOTOrConstantPool(MachineInstr &MI) {
+  assert (MI.mayLoad() && "Expected MI that loads!");
+  for (MachineInstr::mmo_iterator I = MI.memoperands_begin(),
+         E = MI.memoperands_end(); I != E; ++I) {
+    if (const Value *V = (*I)->getValue()) {
+      if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V))
+        if (PSV == PSV->getGOT() || PSV == PSV->getConstantPool())
+          return true;
+    }
+  }
+  return false;
+}
+
+/// IsLICMCandidate - Returns true if the instruction may be a suitable
+/// candidate for LICM. e.g. If the instruction is a call, then it's obviously
+/// not safe to hoist it.
+bool MachineLICM::IsLICMCandidate(MachineInstr &I) {
+  // Check if it's safe to move the instruction.
+  bool DontMoveAcrossStore = true;
+  if (!I.isSafeToMove(TII, AA, DontMoveAcrossStore))
+    return false;
+
+  // If it is load then check if it is guaranteed to execute by making sure that
+  // it dominates all exiting blocks. If it doesn't, then there is a path out of
+  // the loop which does not execute this load, so we can't hoist it. Loads
+  // from constant memory are not safe to speculate all the time, for example
+  // indexed load from a jump table.
+  // Stores and side effects are already checked by isSafeToMove.
+  if (I.mayLoad() && !isLoadFromGOTOrConstantPool(I) &&
+      !IsGuaranteedToExecute(I.getParent()))
+    return false;
+
+  return true;
+}
+
+/// IsLoopInvariantInst - Returns true if the instruction is loop
+/// invariant. I.e., all virtual register operands are defined outside of the
+/// loop, physical registers aren't accessed explicitly, and there are no side
+/// effects that aren't captured by the operands or other flags.
+///
+bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) {
+  if (!IsLICMCandidate(I))
+    return false;
+
+  // The instruction is loop invariant if all of its operands are.
+  for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = I.getOperand(i);
+
+    if (!MO.isReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    // Don't hoist an instruction that uses or defines a physical register.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (MO.isUse()) {
+        // If the physreg has no defs anywhere, it's just an ambient register
+        // and we can freely move its uses. Alternatively, if it's allocatable,
+        // it could get allocated to something with a def during allocation.
+        if (!MRI->isConstantPhysReg(Reg, *I.getParent()->getParent()))
+          return false;
+        // Otherwise it's safe to move.
+        continue;
+      } else if (!MO.isDead()) {
+        // A def that isn't dead. We can't move it.
+        return false;
+      } else if (CurLoop->getHeader()->isLiveIn(Reg)) {
+        // If the reg is live into the loop, we can't hoist an instruction
+        // which would clobber it.
+        return false;
+      }
+    }
+
+    if (!MO.isUse())
+      continue;
+
+    assert(MRI->getVRegDef(Reg) &&
+           "Machine instr not mapped for this vreg?!");
+
+    // If the loop contains the definition of an operand, then the instruction
+    // isn't loop invariant.
+    if (CurLoop->contains(MRI->getVRegDef(Reg)))
+      return false;
+  }
+
+  // If we got this far, the instruction is loop invariant!
+  return true;
+}
+
+
+/// HasLoopPHIUse - Return true if the specified instruction is used by a
+/// phi node and hoisting it could cause a copy to be inserted.
+bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const {
+  SmallVector<const MachineInstr*, 8> Work(1, MI);
+  do {
+    MI = Work.pop_back_val();
+    for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
+      if (!MO->isReg() || !MO->isDef())
+        continue;
+      unsigned Reg = MO->getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(Reg))
+        continue;
+      for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
+           UE = MRI->use_end(); UI != UE; ++UI) {
+        MachineInstr *UseMI = &*UI;
+        // A PHI may cause a copy to be inserted.
+        if (UseMI->isPHI()) {
+          // A PHI inside the loop causes a copy because the live range of Reg is
+          // extended across the PHI.
+          if (CurLoop->contains(UseMI))
+            return true;
+          // A PHI in an exit block can cause a copy to be inserted if the PHI
+          // has multiple predecessors in the loop with different values.
+          // For now, approximate by rejecting all exit blocks.
+          if (isExitBlock(UseMI->getParent()))
+            return true;
+          continue;
+        }
+        // Look past copies as well.
+        if (UseMI->isCopy() && CurLoop->contains(UseMI))
+          Work.push_back(UseMI);
+      }
+    }
+  } while (!Work.empty());
+  return false;
+}
+
+/// HasHighOperandLatency - Compute operand latency between a def of 'Reg'
+/// and an use in the current loop, return true if the target considered
+/// it 'high'.
+bool MachineLICM::HasHighOperandLatency(MachineInstr &MI,
+                                        unsigned DefIdx, unsigned Reg) const {
+  if (!InstrItins || InstrItins->isEmpty() || MRI->use_nodbg_empty(Reg))
+    return false;
+
+  for (MachineRegisterInfo::use_nodbg_iterator I = MRI->use_nodbg_begin(Reg),
+         E = MRI->use_nodbg_end(); I != E; ++I) {
+    MachineInstr *UseMI = &*I;
+    if (UseMI->isCopyLike())
+      continue;
+    if (!CurLoop->contains(UseMI->getParent()))
+      continue;
+    for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = UseMI->getOperand(i);
+      if (!MO.isReg() || !MO.isUse())
+        continue;
+      unsigned MOReg = MO.getReg();
+      if (MOReg != Reg)
+        continue;
+
+      if (TII->hasHighOperandLatency(InstrItins, MRI, &MI, DefIdx, UseMI, i))
+        return true;
+    }
+
+    // Only look at the first in loop use.
+    break;
+  }
+
+  return false;
+}
+
+/// IsCheapInstruction - Return true if the instruction is marked "cheap" or
+/// the operand latency between its def and a use is one or less.
+bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const {
+  if (MI.isAsCheapAsAMove() || MI.isCopyLike())
+    return true;
+  if (!InstrItins || InstrItins->isEmpty())
+    return false;
+
+  bool isCheap = false;
+  unsigned NumDefs = MI.getDesc().getNumDefs();
+  for (unsigned i = 0, e = MI.getNumOperands(); NumDefs && i != e; ++i) {
+    MachineOperand &DefMO = MI.getOperand(i);
+    if (!DefMO.isReg() || !DefMO.isDef())
+      continue;
+    --NumDefs;
+    unsigned Reg = DefMO.getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+
+    if (!TII->hasLowDefLatency(InstrItins, &MI, i))
+      return false;
+    isCheap = true;
+  }
+
+  return isCheap;
+}
+
+/// CanCauseHighRegPressure - Visit BBs from header to current BB, check
+/// if hoisting an instruction of the given cost matrix can cause high
+/// register pressure.
+bool MachineLICM::CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost,
+                                          bool CheapInstr) {
+  for (DenseMap<unsigned, int>::iterator CI = Cost.begin(), CE = Cost.end();
+       CI != CE; ++CI) {
+    if (CI->second <= 0)
+      continue;
+
+    unsigned RCId = CI->first;
+    unsigned Limit = RegLimit[RCId];
+    int Cost = CI->second;
+
+    // Don't hoist cheap instructions if they would increase register pressure,
+    // even if we're under the limit.
+    if (CheapInstr)
+      return true;
+
+    for (unsigned i = BackTrace.size(); i != 0; --i) {
+      SmallVector<unsigned, 8> &RP = BackTrace[i-1];
+      if (RP[RCId] + Cost >= Limit)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// UpdateBackTraceRegPressure - Traverse the back trace from header to the
+/// current block and update their register pressures to reflect the effect
+/// of hoisting MI from the current block to the preheader.
+void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) {
+  if (MI->isImplicitDef())
+    return;
+
+  // First compute the 'cost' of the instruction, i.e. its contribution
+  // to register pressure.
+  DenseMap<unsigned, int> Cost;
+  for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+
+    unsigned RCId, RCCost;
+    getRegisterClassIDAndCost(MI, Reg, i, RCId, RCCost);
+    if (MO.isDef()) {
+      DenseMap<unsigned, int>::iterator CI = Cost.find(RCId);
+      if (CI != Cost.end())
+        CI->second += RCCost;
+      else
+        Cost.insert(std::make_pair(RCId, RCCost));
+    } else if (isOperandKill(MO, MRI)) {
+      DenseMap<unsigned, int>::iterator CI = Cost.find(RCId);
+      if (CI != Cost.end())
+        CI->second -= RCCost;
+      else
+        Cost.insert(std::make_pair(RCId, -RCCost));
+    }
+  }
+
+  // Update register pressure of blocks from loop header to current block.
+  for (unsigned i = 0, e = BackTrace.size(); i != e; ++i) {
+    SmallVector<unsigned, 8> &RP = BackTrace[i];
+    for (DenseMap<unsigned, int>::iterator CI = Cost.begin(), CE = Cost.end();
+         CI != CE; ++CI) {
+      unsigned RCId = CI->first;
+      RP[RCId] += CI->second;
+    }
+  }
+}
+
+/// IsProfitableToHoist - Return true if it is potentially profitable to hoist
+/// the given loop invariant.
+bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
+  if (MI.isImplicitDef())
+    return true;
+
+  // Besides removing computation from the loop, hoisting an instruction has
+  // these effects:
+  //
+  // - The value defined by the instruction becomes live across the entire
+  //   loop. This increases register pressure in the loop.
+  //
+  // - If the value is used by a PHI in the loop, a copy will be required for
+  //   lowering the PHI after extending the live range.
+  //
+  // - When hoisting the last use of a value in the loop, that value no longer
+  //   needs to be live in the loop. This lowers register pressure in the loop.
+
+  bool CheapInstr = IsCheapInstruction(MI);
+  bool CreatesCopy = HasLoopPHIUse(&MI);
+
+  // Don't hoist a cheap instruction if it would create a copy in the loop.
+  if (CheapInstr && CreatesCopy) {
+    DEBUG(dbgs() << "Won't hoist cheap instr with loop PHI use: " << MI);
+    return false;
+  }
+
+  // Rematerializable instructions should always be hoisted since the register
+  // allocator can just pull them down again when needed.
+  if (TII->isTriviallyReMaterializable(&MI, AA))
+    return true;
+
+  // Estimate register pressure to determine whether to LICM the instruction.
+  // In low register pressure situation, we can be more aggressive about
+  // hoisting. Also, favors hoisting long latency instructions even in
+  // moderately high pressure situation.
+  // Cheap instructions will only be hoisted if they don't increase register
+  // pressure at all.
+  // FIXME: If there are long latency loop-invariant instructions inside the
+  // loop at this point, why didn't the optimizer's LICM hoist them?
+  DenseMap<unsigned, int> Cost;
+  for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+
+    unsigned RCId, RCCost;
+    getRegisterClassIDAndCost(&MI, Reg, i, RCId, RCCost);
+    if (MO.isDef()) {
+      if (HasHighOperandLatency(MI, i, Reg)) {
+        DEBUG(dbgs() << "Hoist High Latency: " << MI);
+        ++NumHighLatency;
+        return true;
+      }
+      Cost[RCId] += RCCost;
+    } else if (isOperandKill(MO, MRI)) {
+      // Is a virtual register use is a kill, hoisting it out of the loop
+      // may actually reduce register pressure or be register pressure
+      // neutral.
+      Cost[RCId] -= RCCost;
+    }
+  }
+
+  // Visit BBs from header to current BB, if hoisting this doesn't cause
+  // high register pressure, then it's safe to proceed.
+  if (!CanCauseHighRegPressure(Cost, CheapInstr)) {
+    DEBUG(dbgs() << "Hoist non-reg-pressure: " << MI);
+    ++NumLowRP;
+    return true;
+  }
+
+  // Don't risk increasing register pressure if it would create copies.
+  if (CreatesCopy) {
+    DEBUG(dbgs() << "Won't hoist instr with loop PHI use: " << MI);
+    return false;
+  }
+
+  // Do not "speculate" in high register pressure situation. If an
+  // instruction is not guaranteed to be executed in the loop, it's best to be
+  // conservative.
+  if (AvoidSpeculation &&
+      (!IsGuaranteedToExecute(MI.getParent()) && !MayCSE(&MI))) {
+    DEBUG(dbgs() << "Won't speculate: " << MI);
+    return false;
+  }
+
+  // High register pressure situation, only hoist if the instruction is going
+  // to be remat'ed.
+  if (!TII->isTriviallyReMaterializable(&MI, AA) &&
+      !MI.isInvariantLoad(AA)) {
+    DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
+    return false;
+  }
+
+  return true;
+}
+
+MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
+  // Don't unfold simple loads.
+  if (MI->canFoldAsLoad())
+    return 0;
+
+  // If not, we may be able to unfold a load and hoist that.
+  // First test whether the instruction is loading from an amenable
+  // memory location.
+  if (!MI->isInvariantLoad(AA))
+    return 0;
+
+  // Next determine the register class for a temporary register.
+  unsigned LoadRegIndex;
+  unsigned NewOpc =
+    TII->getOpcodeAfterMemoryUnfold(MI->getOpcode(),
+                                    /*UnfoldLoad=*/true,
+                                    /*UnfoldStore=*/false,
+                                    &LoadRegIndex);
+  if (NewOpc == 0) return 0;
+  const MCInstrDesc &MID = TII->get(NewOpc);
+  if (MID.getNumDefs() != 1) return 0;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF);
+  // Ok, we're unfolding. Create a temporary register and do the unfold.
+  unsigned Reg = MRI->createVirtualRegister(RC);
+
+  SmallVector<MachineInstr *, 2> NewMIs;
+  bool Success =
+    TII->unfoldMemoryOperand(MF, MI, Reg,
+                             /*UnfoldLoad=*/true, /*UnfoldStore=*/false,
+                             NewMIs);
+  (void)Success;
+  assert(Success &&
+         "unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold "
+         "succeeded!");
+  assert(NewMIs.size() == 2 &&
+         "Unfolded a load into multiple instructions!");
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineBasicBlock::iterator Pos = MI;
+  MBB->insert(Pos, NewMIs[0]);
+  MBB->insert(Pos, NewMIs[1]);
+  // If unfolding produced a load that wasn't loop-invariant or profitable to
+  // hoist, discard the new instructions and bail.
+  if (!IsLoopInvariantInst(*NewMIs[0]) || !IsProfitableToHoist(*NewMIs[0])) {
+    NewMIs[0]->eraseFromParent();
+    NewMIs[1]->eraseFromParent();
+    return 0;
+  }
+
+  // Update register pressure for the unfolded instruction.
+  UpdateRegPressure(NewMIs[1]);
+
+  // Otherwise we successfully unfolded a load that we can hoist.
+  MI->eraseFromParent();
+  return NewMIs[0];
+}
+
+void MachineLICM::InitCSEMap(MachineBasicBlock *BB) {
+  for (MachineBasicBlock::iterator I = BB->begin(),E = BB->end(); I != E; ++I) {
+    const MachineInstr *MI = &*I;
+    unsigned Opcode = MI->getOpcode();
+    DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator
+      CI = CSEMap.find(Opcode);
+    if (CI != CSEMap.end())
+      CI->second.push_back(MI);
+    else {
+      std::vector<const MachineInstr*> CSEMIs;
+      CSEMIs.push_back(MI);
+      CSEMap.insert(std::make_pair(Opcode, CSEMIs));
+    }
+  }
+}
+
+const MachineInstr*
+MachineLICM::LookForDuplicate(const MachineInstr *MI,
+                              std::vector<const MachineInstr*> &PrevMIs) {
+  for (unsigned i = 0, e = PrevMIs.size(); i != e; ++i) {
+    const MachineInstr *PrevMI = PrevMIs[i];
+    if (TII->produceSameValue(MI, PrevMI, (PreRegAlloc ? MRI : 0)))
+      return PrevMI;
+  }
+  return 0;
+}
+
+bool MachineLICM::EliminateCSE(MachineInstr *MI,
+          DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI) {
+  // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate
+  // the undef property onto uses.
+  if (CI == CSEMap.end() || MI->isImplicitDef())
+    return false;
+
+  if (const MachineInstr *Dup = LookForDuplicate(MI, CI->second)) {
+    DEBUG(dbgs() << "CSEing " << *MI << " with " << *Dup);
+
+    // Replace virtual registers defined by MI by their counterparts defined
+    // by Dup.
+    SmallVector<unsigned, 2> Defs;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+
+      // Physical registers may not differ here.
+      assert((!MO.isReg() || MO.getReg() == 0 ||
+              !TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
+              MO.getReg() == Dup->getOperand(i).getReg()) &&
+             "Instructions with different phys regs are not identical!");
+
+      if (MO.isReg() && MO.isDef() &&
+          !TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+        Defs.push_back(i);
+    }
+
+    SmallVector<const TargetRegisterClass*, 2> OrigRCs;
+    for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+      unsigned Idx = Defs[i];
+      unsigned Reg = MI->getOperand(Idx).getReg();
+      unsigned DupReg = Dup->getOperand(Idx).getReg();
+      OrigRCs.push_back(MRI->getRegClass(DupReg));
+
+      if (!MRI->constrainRegClass(DupReg, MRI->getRegClass(Reg))) {
+        // Restore old RCs if more than one defs.
+        for (unsigned j = 0; j != i; ++j)
+          MRI->setRegClass(Dup->getOperand(Defs[j]).getReg(), OrigRCs[j]);
+        return false;
+      }
+    }
+
+    for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+      unsigned Idx = Defs[i];
+      unsigned Reg = MI->getOperand(Idx).getReg();
+      unsigned DupReg = Dup->getOperand(Idx).getReg();
+      MRI->replaceRegWith(Reg, DupReg);
+      MRI->clearKillFlags(DupReg);
+    }
+
+    MI->eraseFromParent();
+    ++NumCSEed;
+    return true;
+  }
+  return false;
+}
+
+/// MayCSE - Return true if the given instruction will be CSE'd if it's
+/// hoisted out of the loop.
+bool MachineLICM::MayCSE(MachineInstr *MI) {
+  unsigned Opcode = MI->getOpcode();
+  DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator
+    CI = CSEMap.find(Opcode);
+  // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate
+  // the undef property onto uses.
+  if (CI == CSEMap.end() || MI->isImplicitDef())
+    return false;
+
+  return LookForDuplicate(MI, CI->second) != 0;
+}
+
+/// Hoist - When an instruction is found to use only loop invariant operands
+/// that are safe to hoist, this instruction is called to do the dirty work.
+///
+bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
+  // First check whether we should hoist this instruction.
+  if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) {
+    // If not, try unfolding a hoistable load.
+    MI = ExtractHoistableLoad(MI);
+    if (!MI) return false;
+  }
+
+  // Now move the instructions to the predecessor, inserting it before any
+  // terminator instructions.
+  DEBUG({
+      dbgs() << "Hoisting " << *MI;
+      if (Preheader->getBasicBlock())
+        dbgs() << " to MachineBasicBlock "
+               << Preheader->getName();
+      if (MI->getParent()->getBasicBlock())
+        dbgs() << " from MachineBasicBlock "
+               << MI->getParent()->getName();
+      dbgs() << "\n";
+    });
+
+  // If this is the first instruction being hoisted to the preheader,
+  // initialize the CSE map with potential common expressions.
+  if (FirstInLoop) {
+    InitCSEMap(Preheader);
+    FirstInLoop = false;
+  }
+
+  // Look for opportunity to CSE the hoisted instruction.
+  unsigned Opcode = MI->getOpcode();
+  DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator
+    CI = CSEMap.find(Opcode);
+  if (!EliminateCSE(MI, CI)) {
+    // Otherwise, splice the instruction to the preheader.
+    Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI);
+
+    // Update register pressure for BBs from header to this block.
+    UpdateBackTraceRegPressure(MI);
+
+    // Clear the kill flags of any register this instruction defines,
+    // since they may need to be live throughout the entire loop
+    // rather than just live for part of it.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isDef() && !MO.isDead())
+        MRI->clearKillFlags(MO.getReg());
+    }
+
+    // Add to the CSE map.
+    if (CI != CSEMap.end())
+      CI->second.push_back(MI);
+    else {
+      std::vector<const MachineInstr*> CSEMIs;
+      CSEMIs.push_back(MI);
+      CSEMap.insert(std::make_pair(Opcode, CSEMIs));
+    }
+  }
+
+  ++NumHoisted;
+  Changed = true;
+
+  return true;
+}
+
+MachineBasicBlock *MachineLICM::getCurPreheader() {
+  // Determine the block to which to hoist instructions. If we can't find a
+  // suitable loop predecessor, we can't do any hoisting.
+
+  // If we've tried to get a preheader and failed, don't try again.
+  if (CurPreheader == reinterpret_cast<MachineBasicBlock *>(-1))
+    return 0;
+
+  if (!CurPreheader) {
+    CurPreheader = CurLoop->getLoopPreheader();
+    if (!CurPreheader) {
+      MachineBasicBlock *Pred = CurLoop->getLoopPredecessor();
+      if (!Pred) {
+        CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
+        return 0;
+      }
+
+      CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), this);
+      if (!CurPreheader) {
+        CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
+        return 0;
+      }
+    }
+  }
+  return CurPreheader;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
new file mode 100644
index 000000000000..4e2cfdc4e568
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -0,0 +1,81 @@
+//===- MachineLoopInfo.cpp - Natural Loop Calculator ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MachineLoopInfo class that is used to identify natural
+// loops and determine the loop depth of various nodes of the CFG.  Note that
+// the loops identified may actually be several natural loops that share the
+// same header node... not just a single natural loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/Analysis/LoopInfoImpl.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+// Explicitly instantiate methods in LoopInfoImpl.h for MI-level Loops.
+template class llvm::LoopBase<MachineBasicBlock, MachineLoop>;
+template class llvm::LoopInfoBase<MachineBasicBlock, MachineLoop>;
+
+char MachineLoopInfo::ID = 0;
+INITIALIZE_PASS_BEGIN(MachineLoopInfo, "machine-loops",
+                "Machine Natural Loop Construction", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(MachineLoopInfo, "machine-loops",
+                "Machine Natural Loop Construction", true, true)
+
+char &llvm::MachineLoopInfoID = MachineLoopInfo::ID;
+
+bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) {
+  releaseMemory();
+  LI.Analyze(getAnalysis<MachineDominatorTree>().getBase());
+  return false;
+}
+
+void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineBasicBlock *MachineLoop::getTopBlock() {
+  MachineBasicBlock *TopMBB = getHeader();
+  MachineFunction::iterator Begin = TopMBB->getParent()->begin();
+  if (TopMBB != Begin) {
+    MachineBasicBlock *PriorMBB = prior(MachineFunction::iterator(TopMBB));
+    while (contains(PriorMBB)) {
+      TopMBB = PriorMBB;
+      if (TopMBB == Begin) break;
+      PriorMBB = prior(MachineFunction::iterator(TopMBB));
+    }
+  }
+  return TopMBB;
+}
+
+MachineBasicBlock *MachineLoop::getBottomBlock() {
+  MachineBasicBlock *BotMBB = getHeader();
+  MachineFunction::iterator End = BotMBB->getParent()->end();
+  if (BotMBB != prior(End)) {
+    MachineBasicBlock *NextMBB = llvm::next(MachineFunction::iterator(BotMBB));
+    while (contains(NextMBB)) {
+      BotMBB = NextMBB;
+      if (BotMBB == llvm::next(MachineFunction::iterator(BotMBB))) break;
+      NextMBB = llvm::next(MachineFunction::iterator(BotMBB));
+    }
+  }
+  return BotMBB;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MachineLoop::dump() const {
+  print(dbgs());
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
new file mode 100644
index 000000000000..0ea9ae0fcc89
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -0,0 +1,578 @@
+//===-- llvm/CodeGen/MachineModuleInfo.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+using namespace llvm::dwarf;
+
+// Handle the Pass registration stuff necessary to use DataLayout's.
+INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo",
+                "Machine Module Information", false, false)
+char MachineModuleInfo::ID = 0;
+
+// Out of line virtual method.
+MachineModuleInfoImpl::~MachineModuleInfoImpl() {}
+
+namespace llvm {
+class MMIAddrLabelMapCallbackPtr : CallbackVH {
+  MMIAddrLabelMap *Map;
+public:
+  MMIAddrLabelMapCallbackPtr() : Map(0) {}
+  MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(0) {}
+
+  void setPtr(BasicBlock *BB) {
+    ValueHandleBase::operator=(BB);
+  }
+
+  void setMap(MMIAddrLabelMap *map) { Map = map; }
+
+  virtual void deleted();
+  virtual void allUsesReplacedWith(Value *V2);
+};
+
+class MMIAddrLabelMap {
+  MCContext &Context;
+  struct AddrLabelSymEntry {
+    /// Symbols - The symbols for the label.  This is a pointer union that is
+    /// either one symbol (the common case) or a list of symbols.
+    PointerUnion<MCSymbol *, std::vector<MCSymbol*>*> Symbols;
+
+    Function *Fn;   // The containing function of the BasicBlock.
+    unsigned Index; // The index in BBCallbacks for the BasicBlock.
+  };
+
+  DenseMap<AssertingVH<BasicBlock>, AddrLabelSymEntry> AddrLabelSymbols;
+
+  /// BBCallbacks - Callbacks for the BasicBlock's that we have entries for.  We
+  /// use this so we get notified if a block is deleted or RAUWd.
+  std::vector<MMIAddrLabelMapCallbackPtr> BBCallbacks;
+
+  /// DeletedAddrLabelsNeedingEmission - This is a per-function list of symbols
+  /// whose corresponding BasicBlock got deleted.  These symbols need to be
+  /// emitted at some point in the file, so AsmPrinter emits them after the
+  /// function body.
+  DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >
+    DeletedAddrLabelsNeedingEmission;
+public:
+
+  MMIAddrLabelMap(MCContext &context) : Context(context) {}
+  ~MMIAddrLabelMap() {
+    assert(DeletedAddrLabelsNeedingEmission.empty() &&
+           "Some labels for deleted blocks never got emitted");
+
+    // Deallocate any of the 'list of symbols' case.
+    for (DenseMap<AssertingVH<BasicBlock>, AddrLabelSymEntry>::iterator
+         I = AddrLabelSymbols.begin(), E = AddrLabelSymbols.end(); I != E; ++I)
+      if (I->second.Symbols.is<std::vector<MCSymbol*>*>())
+        delete I->second.Symbols.get<std::vector<MCSymbol*>*>();
+  }
+
+  MCSymbol *getAddrLabelSymbol(BasicBlock *BB);
+  std::vector<MCSymbol*> getAddrLabelSymbolToEmit(BasicBlock *BB);
+
+  void takeDeletedSymbolsForFunction(Function *F,
+                                     std::vector<MCSymbol*> &Result);
+
+  void UpdateForDeletedBlock(BasicBlock *BB);
+  void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New);
+};
+}
+
+MCSymbol *MMIAddrLabelMap::getAddrLabelSymbol(BasicBlock *BB) {
+  assert(BB->hasAddressTaken() &&
+         "Shouldn't get label for block without address taken");
+  AddrLabelSymEntry &Entry = AddrLabelSymbols[BB];
+
+  // If we already had an entry for this block, just return it.
+  if (!Entry.Symbols.isNull()) {
+    assert(BB->getParent() == Entry.Fn && "Parent changed");
+    if (Entry.Symbols.is<MCSymbol*>())
+      return Entry.Symbols.get<MCSymbol*>();
+    return (*Entry.Symbols.get<std::vector<MCSymbol*>*>())[0];
+  }
+
+  // Otherwise, this is a new entry, create a new symbol for it and add an
+  // entry to BBCallbacks so we can be notified if the BB is deleted or RAUWd.
+  BBCallbacks.push_back(BB);
+  BBCallbacks.back().setMap(this);
+  Entry.Index = BBCallbacks.size()-1;
+  Entry.Fn = BB->getParent();
+  MCSymbol *Result = Context.CreateTempSymbol();
+  Entry.Symbols = Result;
+  return Result;
+}
+
+std::vector<MCSymbol*>
+MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
+  assert(BB->hasAddressTaken() &&
+         "Shouldn't get label for block without address taken");
+  AddrLabelSymEntry &Entry = AddrLabelSymbols[BB];
+
+  std::vector<MCSymbol*> Result;
+
+  // If we already had an entry for this block, just return it.
+  if (Entry.Symbols.isNull())
+    Result.push_back(getAddrLabelSymbol(BB));
+  else if (MCSymbol *Sym = Entry.Symbols.dyn_cast<MCSymbol*>())
+    Result.push_back(Sym);
+  else
+    Result = *Entry.Symbols.get<std::vector<MCSymbol*>*>();
+  return Result;
+}
+
+
+/// takeDeletedSymbolsForFunction - If we have any deleted symbols for F, return
+/// them.
+void MMIAddrLabelMap::
+takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) {
+  DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >::iterator I =
+    DeletedAddrLabelsNeedingEmission.find(F);
+
+  // If there are no entries for the function, just return.
+  if (I == DeletedAddrLabelsNeedingEmission.end()) return;
+
+  // Otherwise, take the list.
+  std::swap(Result, I->second);
+  DeletedAddrLabelsNeedingEmission.erase(I);
+}
+
+
+void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
+  // If the block got deleted, there is no need for the symbol.  If the symbol
+  // was already emitted, we can just forget about it, otherwise we need to
+  // queue it up for later emission when the function is output.
+  AddrLabelSymEntry Entry = AddrLabelSymbols[BB];
+  AddrLabelSymbols.erase(BB);
+  assert(!Entry.Symbols.isNull() && "Didn't have a symbol, why a callback?");
+  BBCallbacks[Entry.Index] = 0;  // Clear the callback.
+
+  assert((BB->getParent() == 0 || BB->getParent() == Entry.Fn) &&
+         "Block/parent mismatch");
+
+  // Handle both the single and the multiple symbols cases.
+  if (MCSymbol *Sym = Entry.Symbols.dyn_cast<MCSymbol*>()) {
+    if (Sym->isDefined())
+      return;
+
+    // If the block is not yet defined, we need to emit it at the end of the
+    // function.  Add the symbol to the DeletedAddrLabelsNeedingEmission list
+    // for the containing Function.  Since the block is being deleted, its
+    // parent may already be removed, we have to get the function from 'Entry'.
+    DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym);
+  } else {
+    std::vector<MCSymbol*> *Syms = Entry.Symbols.get<std::vector<MCSymbol*>*>();
+
+    for (unsigned i = 0, e = Syms->size(); i != e; ++i) {
+      MCSymbol *Sym = (*Syms)[i];
+      if (Sym->isDefined()) continue;  // Ignore already emitted labels.
+
+      // If the block is not yet defined, we need to emit it at the end of the
+      // function.  Add the symbol to the DeletedAddrLabelsNeedingEmission list
+      // for the containing Function.  Since the block is being deleted, its
+      // parent may already be removed, we have to get the function from
+      // 'Entry'.
+      DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym);
+    }
+
+    // The entry is deleted, free the memory associated with the symbol list.
+    delete Syms;
+  }
+}
+
+void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) {
+  // Get the entry for the RAUW'd block and remove it from our map.
+  AddrLabelSymEntry OldEntry = AddrLabelSymbols[Old];
+  AddrLabelSymbols.erase(Old);
+  assert(!OldEntry.Symbols.isNull() && "Didn't have a symbol, why a callback?");
+
+  AddrLabelSymEntry &NewEntry = AddrLabelSymbols[New];
+
+  // If New is not address taken, just move our symbol over to it.
+  if (NewEntry.Symbols.isNull()) {
+    BBCallbacks[OldEntry.Index].setPtr(New);    // Update the callback.
+    NewEntry = OldEntry;     // Set New's entry.
+    return;
+  }
+
+  BBCallbacks[OldEntry.Index] = 0;    // Update the callback.
+
+  // Otherwise, we need to add the old symbol to the new block's set.  If it is
+  // just a single entry, upgrade it to a symbol list.
+  if (MCSymbol *PrevSym = NewEntry.Symbols.dyn_cast<MCSymbol*>()) {
+    std::vector<MCSymbol*> *SymList = new std::vector<MCSymbol*>();
+    SymList->push_back(PrevSym);
+    NewEntry.Symbols = SymList;
+  }
+
+  std::vector<MCSymbol*> *SymList =
+    NewEntry.Symbols.get<std::vector<MCSymbol*>*>();
+
+  // If the old entry was a single symbol, add it.
+  if (MCSymbol *Sym = OldEntry.Symbols.dyn_cast<MCSymbol*>()) {
+    SymList->push_back(Sym);
+    return;
+  }
+
+  // Otherwise, concatenate the list.
+  std::vector<MCSymbol*> *Syms =OldEntry.Symbols.get<std::vector<MCSymbol*>*>();
+  SymList->insert(SymList->end(), Syms->begin(), Syms->end());
+  delete Syms;
+}
+
+
+void MMIAddrLabelMapCallbackPtr::deleted() {
+  Map->UpdateForDeletedBlock(cast<BasicBlock>(getValPtr()));
+}
+
+void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
+  Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2));
+}
+
+
+//===----------------------------------------------------------------------===//
+
+MachineModuleInfo::MachineModuleInfo(const MCAsmInfo &MAI,
+                                     const MCRegisterInfo &MRI,
+                                     const MCObjectFileInfo *MOFI)
+  : ImmutablePass(ID), Context(MAI, MRI, MOFI, 0, false) {
+  initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
+}
+
+MachineModuleInfo::MachineModuleInfo()
+  : ImmutablePass(ID),
+    Context(*(MCAsmInfo*)0, *(MCRegisterInfo*)0, (MCObjectFileInfo*)0) {
+  llvm_unreachable("This MachineModuleInfo constructor should never be called, "
+                   "MMI should always be explicitly constructed by "
+                   "LLVMTargetMachine");
+}
+
+MachineModuleInfo::~MachineModuleInfo() {
+}
+
+bool MachineModuleInfo::doInitialization(Module &M) {
+
+  ObjFileMMI = 0;
+  CompactUnwindEncoding = 0;
+  CurCallSite = 0;
+  CallsEHReturn = 0;
+  CallsUnwindInit = 0;
+  DbgInfoAvailable = UsesVAFloatArgument = false; 
+  // Always emit some info, by default "no personality" info.
+  Personalities.push_back(NULL);
+  AddrLabelSymbols = 0;
+  TheModule = 0;
+
+  return false;
+}
+
+bool MachineModuleInfo::doFinalization(Module &M) {
+
+  Personalities.clear();
+
+  delete AddrLabelSymbols;
+  AddrLabelSymbols = 0;
+
+  Context.reset();
+
+  delete ObjFileMMI;
+  ObjFileMMI = 0;
+
+  return false;
+}
+
+/// EndFunction - Discard function meta information.
+///
+void MachineModuleInfo::EndFunction() {
+  // Clean up frame info.
+  FrameMoves.clear();
+
+  // Clean up exception info.
+  LandingPads.clear();
+  CallSiteMap.clear();
+  TypeInfos.clear();
+  FilterIds.clear();
+  FilterEnds.clear();
+  CallsEHReturn = 0;
+  CallsUnwindInit = 0;
+  CompactUnwindEncoding = 0;
+  VariableDbgInfo.clear();
+}
+
+/// AnalyzeModule - Scan the module for global debug information.
+///
+void MachineModuleInfo::AnalyzeModule(const Module &M) {
+  // Insert functions in the llvm.used array (but not llvm.compiler.used) into
+  // UsedFunctions.
+  const GlobalVariable *GV = M.getGlobalVariable("llvm.used");
+  if (!GV || !GV->hasInitializer()) return;
+
+  // Should be an array of 'i8*'.
+  const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (InitList == 0) return;
+
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+    if (const Function *F =
+          dyn_cast<Function>(InitList->getOperand(i)->stripPointerCasts()))
+      UsedFunctions.insert(F);
+}
+
+//===- Address of Block Management ----------------------------------------===//
+
+
+/// getAddrLabelSymbol - Return the symbol to be used for the specified basic
+/// block when its address is taken.  This cannot be its normal LBB label
+/// because the block may be accessed outside its containing function.
+MCSymbol *MachineModuleInfo::getAddrLabelSymbol(const BasicBlock *BB) {
+  // Lazily create AddrLabelSymbols.
+  if (AddrLabelSymbols == 0)
+    AddrLabelSymbols = new MMIAddrLabelMap(Context);
+  return AddrLabelSymbols->getAddrLabelSymbol(const_cast<BasicBlock*>(BB));
+}
+
+/// getAddrLabelSymbolToEmit - Return the symbol to be used for the specified
+/// basic block when its address is taken.  If other blocks were RAUW'd to
+/// this one, we may have to emit them as well, return the whole set.
+std::vector<MCSymbol*> MachineModuleInfo::
+getAddrLabelSymbolToEmit(const BasicBlock *BB) {
+  // Lazily create AddrLabelSymbols.
+  if (AddrLabelSymbols == 0)
+    AddrLabelSymbols = new MMIAddrLabelMap(Context);
+ return AddrLabelSymbols->getAddrLabelSymbolToEmit(const_cast<BasicBlock*>(BB));
+}
+
+
+/// takeDeletedSymbolsForFunction - If the specified function has had any
+/// references to address-taken blocks generated, but the block got deleted,
+/// return the symbol now so we can emit it.  This prevents emitting a
+/// reference to a symbol that has no definition.
+void MachineModuleInfo::
+takeDeletedSymbolsForFunction(const Function *F,
+                              std::vector<MCSymbol*> &Result) {
+  // If no blocks have had their addresses taken, we're done.
+  if (AddrLabelSymbols == 0) return;
+  return AddrLabelSymbols->
+     takeDeletedSymbolsForFunction(const_cast<Function*>(F), Result);
+}
+
+//===- EH -----------------------------------------------------------------===//
+
+/// getOrCreateLandingPadInfo - Find or create an LandingPadInfo for the
+/// specified MachineBasicBlock.
+LandingPadInfo &MachineModuleInfo::getOrCreateLandingPadInfo
+    (MachineBasicBlock *LandingPad) {
+  unsigned N = LandingPads.size();
+  for (unsigned i = 0; i < N; ++i) {
+    LandingPadInfo &LP = LandingPads[i];
+    if (LP.LandingPadBlock == LandingPad)
+      return LP;
+  }
+
+  LandingPads.push_back(LandingPadInfo(LandingPad));
+  return LandingPads[N];
+}
+
+/// addInvoke - Provide the begin and end labels of an invoke style call and
+/// associate it with a try landing pad block.
+void MachineModuleInfo::addInvoke(MachineBasicBlock *LandingPad,
+                                  MCSymbol *BeginLabel, MCSymbol *EndLabel) {
+  LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+  LP.BeginLabels.push_back(BeginLabel);
+  LP.EndLabels.push_back(EndLabel);
+}
+
+/// addLandingPad - Provide the label of a try LandingPad block.
+///
+MCSymbol *MachineModuleInfo::addLandingPad(MachineBasicBlock *LandingPad) {
+  MCSymbol *LandingPadLabel = Context.CreateTempSymbol();
+  LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+  LP.LandingPadLabel = LandingPadLabel;
+  return LandingPadLabel;
+}
+
+/// addPersonality - Provide the personality function for the exception
+/// information.
+void MachineModuleInfo::addPersonality(MachineBasicBlock *LandingPad,
+                                       const Function *Personality) {
+  LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+  LP.Personality = Personality;
+
+  for (unsigned i = 0; i < Personalities.size(); ++i)
+    if (Personalities[i] == Personality)
+      return;
+
+  // If this is the first personality we're adding go
+  // ahead and add it at the beginning.
+  if (Personalities[0] == NULL)
+    Personalities[0] = Personality;
+  else
+    Personalities.push_back(Personality);
+}
+
+/// addCatchTypeInfo - Provide the catch typeinfo for a landing pad.
+///
+void MachineModuleInfo::
+addCatchTypeInfo(MachineBasicBlock *LandingPad,
+                 ArrayRef<const GlobalVariable *> TyInfo) {
+  LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+  for (unsigned N = TyInfo.size(); N; --N)
+    LP.TypeIds.push_back(getTypeIDFor(TyInfo[N - 1]));
+}
+
+/// addFilterTypeInfo - Provide the filter typeinfo for a landing pad.
+///
+void MachineModuleInfo::
+addFilterTypeInfo(MachineBasicBlock *LandingPad,
+                  ArrayRef<const GlobalVariable *> TyInfo) {
+  LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+  std::vector<unsigned> IdsInFilter(TyInfo.size());
+  for (unsigned I = 0, E = TyInfo.size(); I != E; ++I)
+    IdsInFilter[I] = getTypeIDFor(TyInfo[I]);
+  LP.TypeIds.push_back(getFilterIDFor(IdsInFilter));
+}
+
+/// addCleanup - Add a cleanup action for a landing pad.
+///
+void MachineModuleInfo::addCleanup(MachineBasicBlock *LandingPad) {
+  LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+  LP.TypeIds.push_back(0);
+}
+
+/// TidyLandingPads - Remap landing pad labels and remove any deleted landing
+/// pads.
+void MachineModuleInfo::TidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
+  for (unsigned i = 0; i != LandingPads.size(); ) {
+    LandingPadInfo &LandingPad = LandingPads[i];
+    if (LandingPad.LandingPadLabel &&
+        !LandingPad.LandingPadLabel->isDefined() &&
+        (!LPMap || (*LPMap)[LandingPad.LandingPadLabel] == 0))
+      LandingPad.LandingPadLabel = 0;
+
+    // Special case: we *should* emit LPs with null LP MBB. This indicates
+    // "nounwind" case.
+    if (!LandingPad.LandingPadLabel && LandingPad.LandingPadBlock) {
+      LandingPads.erase(LandingPads.begin() + i);
+      continue;
+    }
+
+    for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
+      MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
+      MCSymbol *EndLabel = LandingPad.EndLabels[j];
+      if ((BeginLabel->isDefined() ||
+           (LPMap && (*LPMap)[BeginLabel] != 0)) &&
+          (EndLabel->isDefined() ||
+           (LPMap && (*LPMap)[EndLabel] != 0))) continue;
+
+      LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
+      LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
+      --j, --e;
+    }
+
+    // Remove landing pads with no try-ranges.
+    if (LandingPads[i].BeginLabels.empty()) {
+      LandingPads.erase(LandingPads.begin() + i);
+      continue;
+    }
+
+    // If there is no landing pad, ensure that the list of typeids is empty.
+    // If the only typeid is a cleanup, this is the same as having no typeids.
+    if (!LandingPad.LandingPadBlock ||
+        (LandingPad.TypeIds.size() == 1 && !LandingPad.TypeIds[0]))
+      LandingPad.TypeIds.clear();
+    ++i;
+  }
+}
+
+/// setCallSiteLandingPad - Map the landing pad's EH symbol to the call site
+/// indexes.
+void MachineModuleInfo::setCallSiteLandingPad(MCSymbol *Sym,
+                                              ArrayRef<unsigned> Sites) {
+  LPadToCallSiteMap[Sym].append(Sites.begin(), Sites.end());
+}
+
+/// getTypeIDFor - Return the type id for the specified typeinfo.  This is
+/// function wide.
+unsigned MachineModuleInfo::getTypeIDFor(const GlobalVariable *TI) {
+  for (unsigned i = 0, N = TypeInfos.size(); i != N; ++i)
+    if (TypeInfos[i] == TI) return i + 1;
+
+  TypeInfos.push_back(TI);
+  return TypeInfos.size();
+}
+
+/// getFilterIDFor - Return the filter id for the specified typeinfos.  This is
+/// function wide.
+int MachineModuleInfo::getFilterIDFor(std::vector<unsigned> &TyIds) {
+  // If the new filter coincides with the tail of an existing filter, then
+  // re-use the existing filter.  Folding filters more than this requires
+  // re-ordering filters and/or their elements - probably not worth it.
+  for (std::vector<unsigned>::iterator I = FilterEnds.begin(),
+       E = FilterEnds.end(); I != E; ++I) {
+    unsigned i = *I, j = TyIds.size();
+
+    while (i && j)
+      if (FilterIds[--i] != TyIds[--j])
+        goto try_next;
+
+    if (!j)
+      // The new filter coincides with range [i, end) of the existing filter.
+      return -(1 + i);
+
+try_next:;
+  }
+
+  // Add the new filter.
+  int FilterID = -(1 + FilterIds.size());
+  FilterIds.reserve(FilterIds.size() + TyIds.size() + 1);
+  FilterIds.insert(FilterIds.end(), TyIds.begin(), TyIds.end());
+  FilterEnds.push_back(FilterIds.size());
+  FilterIds.push_back(0); // terminator
+  return FilterID;
+}
+
+/// getPersonality - Return the personality function for the current function.
+const Function *MachineModuleInfo::getPersonality() const {
+  // FIXME: Until PR1414 will be fixed, we're using 1 personality function per
+  // function
+  return !LandingPads.empty() ? LandingPads[0].Personality : NULL;
+}
+
+/// getPersonalityIndex - Return unique index for current personality
+/// function. NULL/first personality function should always get zero index.
+unsigned MachineModuleInfo::getPersonalityIndex() const {
+  const Function* Personality = NULL;
+
+  // Scan landing pads. If there is at least one non-NULL personality - use it.
+  for (unsigned i = 0, e = LandingPads.size(); i != e; ++i)
+    if (LandingPads[i].Personality) {
+      Personality = LandingPads[i].Personality;
+      break;
+    }
+
+  for (unsigned i = 0, e = Personalities.size(); i < e; ++i) {
+    if (Personalities[i] == Personality)
+      return i;
+  }
+
+  // This will happen if the current personality function is
+  // in the zero index.
+  return 0;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp
new file mode 100644
index 000000000000..a1c7e9f5fb22
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp
@@ -0,0 +1,45 @@
+//===-- llvm/CodeGen/MachineModuleInfoImpls.cpp ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements object-file format specific implementations of
+// MachineModuleInfoImpl.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCSymbol.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineModuleInfoMachO
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method.
+void MachineModuleInfoMachO::anchor() {}
+void MachineModuleInfoELF::anchor() {}
+
+static int SortSymbolPair(const void *LHS, const void *RHS) {
+  typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy;
+  const MCSymbol *LHSS = ((const PairTy *)LHS)->first;
+  const MCSymbol *RHSS = ((const PairTy *)RHS)->first;
+  return LHSS->getName().compare(RHSS->getName());
+}
+
+/// GetSortedStubs - Return the entries from a DenseMap in a deterministic
+/// sorted orer.
+MachineModuleInfoImpl::SymbolListTy
+MachineModuleInfoImpl::GetSortedStubs(const DenseMap<MCSymbol*,
+                                      MachineModuleInfoImpl::StubValueTy>&Map) {
+  MachineModuleInfoImpl::SymbolListTy List(Map.begin(), Map.end());
+
+  if (!List.empty())
+    qsort(&List[0], List.size(), sizeof(List[0]), SortSymbolPair);
+  return List;
+}
+
diff --git a/contrib/llvm/lib/CodeGen/MachinePassRegistry.cpp b/contrib/llvm/lib/CodeGen/MachinePassRegistry.cpp
new file mode 100644
index 000000000000..cb204fdb3492
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachinePassRegistry.cpp
@@ -0,0 +1,55 @@
+//===-- CodeGen/MachineInstr.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the machine function pass registry for register allocators
+// and instruction schedulers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePassRegistry.h"
+
+using namespace llvm;
+
+void MachinePassRegistryListener::anchor() { }
+
+/// setDefault - Set the default constructor by name.
+void MachinePassRegistry::setDefault(StringRef Name) {
+  MachinePassCtor Ctor = 0;
+  for(MachinePassRegistryNode *R = getList(); R; R = R->getNext()) {
+    if (R->getName() == Name) {
+      Ctor = R->getCtor();
+      break;
+    }
+  }
+  assert(Ctor && "Unregistered pass name");
+  setDefault(Ctor);
+}
+
+/// Add - Adds a function pass to the registration list.
+///
+void MachinePassRegistry::Add(MachinePassRegistryNode *Node) {
+  Node->setNext(List);
+  List = Node;
+  if (Listener) Listener->NotifyAdd(Node->getName(),
+                                    Node->getCtor(),
+                                    Node->getDescription());
+}
+
+
+/// Remove - Removes a function pass from the registration list.
+///
+void MachinePassRegistry::Remove(MachinePassRegistryNode *Node) {
+  for (MachinePassRegistryNode **I = &List; *I; I = (*I)->getNextAddress()) {
+    if (*I == Node) {
+      if (Listener) Listener->NotifyRemove(Node->getName());
+      *I = (*I)->getNext();
+      break;
+    }
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp b/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp
new file mode 100644
index 000000000000..c3f6e9249e7d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp
@@ -0,0 +1,55 @@
+//===- MachinePostDominators.cpp -Machine Post Dominator Calculation ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// post dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePostDominators.h"
+
+using namespace llvm;
+
+char MachinePostDominatorTree::ID = 0;
+
+//declare initializeMachinePostDominatorTreePass
+INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree",
+                "MachinePostDominator Tree Construction", true, true)
+
+MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) {
+  initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry());
+  DT = new DominatorTreeBase<MachineBasicBlock>(true); //true indicate
+                                                       // postdominator
+}
+
+FunctionPass *
+MachinePostDominatorTree::createMachinePostDominatorTreePass() {
+  return new MachinePostDominatorTree();
+}
+
+bool
+MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+  DT->recalculate(F);
+  return false;
+}
+
+MachinePostDominatorTree::~MachinePostDominatorTree() {
+  delete DT;
+}
+
+void
+MachinePostDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void
+MachinePostDominatorTree::print(llvm::raw_ostream &OS, const Module *M) const {
+  DT->print(OS);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
new file mode 100644
index 000000000000..1af00e84a6ed
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -0,0 +1,360 @@
+//===-- lib/Codegen/MachineRegisterInfo.cpp -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the MachineRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI)
+  : TRI(&TRI), IsSSA(true), TracksLiveness(true) {
+  VRegInfo.reserve(256);
+  RegAllocHints.reserve(256);
+  UsedRegUnits.resize(TRI.getNumRegUnits());
+  UsedPhysRegMask.resize(TRI.getNumRegs());
+
+  // Create the physreg use/def lists.
+  PhysRegUseDefLists = new MachineOperand*[TRI.getNumRegs()];
+  memset(PhysRegUseDefLists, 0, sizeof(MachineOperand*)*TRI.getNumRegs());
+}
+
+MachineRegisterInfo::~MachineRegisterInfo() {
+  delete [] PhysRegUseDefLists;
+}
+
+/// setRegClass - Set the register class of the specified virtual register.
+///
+void
+MachineRegisterInfo::setRegClass(unsigned Reg, const TargetRegisterClass *RC) {
+  assert(RC && RC->isAllocatable() && "Invalid RC for virtual register");
+  VRegInfo[Reg].first = RC;
+}
+
+const TargetRegisterClass *
+MachineRegisterInfo::constrainRegClass(unsigned Reg,
+                                       const TargetRegisterClass *RC,
+                                       unsigned MinNumRegs) {
+  const TargetRegisterClass *OldRC = getRegClass(Reg);
+  if (OldRC == RC)
+    return RC;
+  const TargetRegisterClass *NewRC = TRI->getCommonSubClass(OldRC, RC);
+  if (!NewRC || NewRC == OldRC)
+    return NewRC;
+  if (NewRC->getNumRegs() < MinNumRegs)
+    return 0;
+  setRegClass(Reg, NewRC);
+  return NewRC;
+}
+
+bool
+MachineRegisterInfo::recomputeRegClass(unsigned Reg, const TargetMachine &TM) {
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterClass *OldRC = getRegClass(Reg);
+  const TargetRegisterClass *NewRC = TRI->getLargestLegalSuperClass(OldRC);
+
+  // Stop early if there is no room to grow.
+  if (NewRC == OldRC)
+    return false;
+
+  // Accumulate constraints from all uses.
+  for (reg_nodbg_iterator I = reg_nodbg_begin(Reg), E = reg_nodbg_end(); I != E;
+       ++I) {
+    const TargetRegisterClass *OpRC =
+      I->getRegClassConstraint(I.getOperandNo(), TII, TRI);
+    if (unsigned SubIdx = I.getOperand().getSubReg()) {
+      if (OpRC)
+        NewRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, SubIdx);
+      else
+        NewRC = TRI->getSubClassWithSubReg(NewRC, SubIdx);
+    } else if (OpRC)
+      NewRC = TRI->getCommonSubClass(NewRC, OpRC);
+    if (!NewRC || NewRC == OldRC)
+      return false;
+  }
+  setRegClass(Reg, NewRC);
+  return true;
+}
+
+/// createVirtualRegister - Create and return a new virtual register in the
+/// function with the specified register class.
+///
+unsigned
+MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
+  assert(RegClass && "Cannot create register without RegClass!");
+  assert(RegClass->isAllocatable() &&
+         "Virtual register RegClass must be allocatable.");
+
+  // New virtual register number.
+  unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs());
+  VRegInfo.grow(Reg);
+  VRegInfo[Reg].first = RegClass;
+  RegAllocHints.grow(Reg);
+  return Reg;
+}
+
+/// clearVirtRegs - Remove all virtual registers (after physreg assignment).
+void MachineRegisterInfo::clearVirtRegs() {
+#ifndef NDEBUG
+  for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i)
+    assert(VRegInfo[TargetRegisterInfo::index2VirtReg(i)].second == 0 &&
+           "Vreg use list non-empty still?");
+#endif
+  VRegInfo.clear();
+}
+
+/// Add MO to the linked list of operands for its register.
+void MachineRegisterInfo::addRegOperandToUseList(MachineOperand *MO) {
+  assert(!MO->isOnRegUseList() && "Already on list");
+  MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg());
+  MachineOperand *const Head = HeadRef;
+
+  // Head points to the first list element.
+  // Next is NULL on the last list element.
+  // Prev pointers are circular, so Head->Prev == Last.
+
+  // Head is NULL for an empty list.
+  if (!Head) {
+    MO->Contents.Reg.Prev = MO;
+    MO->Contents.Reg.Next = 0;
+    HeadRef = MO;
+    return;
+  }
+  assert(MO->getReg() == Head->getReg() && "Different regs on the same list!");
+
+  // Insert MO between Last and Head in the circular Prev chain.
+  MachineOperand *Last = Head->Contents.Reg.Prev;
+  assert(Last && "Inconsistent use list");
+  assert(MO->getReg() == Last->getReg() && "Different regs on the same list!");
+  Head->Contents.Reg.Prev = MO;
+  MO->Contents.Reg.Prev = Last;
+
+  // Def operands always precede uses. This allows def_iterator to stop early.
+  // Insert def operands at the front, and use operands at the back.
+  if (MO->isDef()) {
+    // Insert def at the front.
+    MO->Contents.Reg.Next = Head;
+    HeadRef = MO;
+  } else {
+    // Insert use at the end.
+    MO->Contents.Reg.Next = 0;
+    Last->Contents.Reg.Next = MO;
+  }
+}
+
+/// Remove MO from its use-def list.
+void MachineRegisterInfo::removeRegOperandFromUseList(MachineOperand *MO) {
+  assert(MO->isOnRegUseList() && "Operand not on use list");
+  MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg());
+  MachineOperand *const Head = HeadRef;
+  assert(Head && "List already empty");
+
+  // Unlink this from the doubly linked list of operands.
+  MachineOperand *Next = MO->Contents.Reg.Next;
+  MachineOperand *Prev = MO->Contents.Reg.Prev;
+
+  // Prev links are circular, next link is NULL instead of looping back to Head.
+  if (MO == Head)
+    HeadRef = Next;
+  else
+    Prev->Contents.Reg.Next = Next;
+
+  (Next ? Next : Head)->Contents.Reg.Prev = Prev;
+
+  MO->Contents.Reg.Prev = 0;
+  MO->Contents.Reg.Next = 0;
+}
+
+/// Move NumOps operands from Src to Dst, updating use-def lists as needed.
+///
+/// The Dst range is assumed to be uninitialized memory. (Or it may contain
+/// operands that won't be destroyed, which is OK because the MO destructor is
+/// trivial anyway).
+///
+/// The Src and Dst ranges may overlap.
+void MachineRegisterInfo::moveOperands(MachineOperand *Dst,
+                                       MachineOperand *Src,
+                                       unsigned NumOps) {
+  assert(Src != Dst && NumOps && "Noop moveOperands");
+
+  // Copy backwards if Dst is within the Src range.
+  int Stride = 1;
+  if (Dst >= Src && Dst < Src + NumOps) {
+    Stride = -1;
+    Dst += NumOps - 1;
+    Src += NumOps - 1;
+  }
+
+  // Copy one operand at a time.
+  do {
+    new (Dst) MachineOperand(*Src);
+
+    // Dst takes Src's place in the use-def chain.
+    if (Src->isReg()) {
+      MachineOperand *&Head = getRegUseDefListHead(Src->getReg());
+      MachineOperand *Prev = Src->Contents.Reg.Prev;
+      MachineOperand *Next = Src->Contents.Reg.Next;
+      assert(Head && "List empty, but operand is chained");
+      assert(Prev && "Operand was not on use-def list");
+
+      // Prev links are circular, next link is NULL instead of looping back to
+      // Head.
+      if (Src == Head)
+        Head = Dst;
+      else
+        Prev->Contents.Reg.Next = Dst;
+
+      // Update Prev pointer. This also works when Src was pointing to itself
+      // in a 1-element list. In that case Head == Dst.
+      (Next ? Next : Head)->Contents.Reg.Prev = Dst;
+    }
+
+    Dst += Stride;
+    Src += Stride;
+  } while (--NumOps);
+}
+
+/// replaceRegWith - Replace all instances of FromReg with ToReg in the
+/// machine function.  This is like llvm-level X->replaceAllUsesWith(Y),
+/// except that it also changes any definitions of the register as well.
+void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) {
+  assert(FromReg != ToReg && "Cannot replace a reg with itself");
+
+  // TODO: This could be more efficient by bulk changing the operands.
+  for (reg_iterator I = reg_begin(FromReg), E = reg_end(); I != E; ) {
+    MachineOperand &O = I.getOperand();
+    ++I;
+    O.setReg(ToReg);
+  }
+}
+
+
+/// getVRegDef - Return the machine instr that defines the specified virtual
+/// register or null if none is found.  This assumes that the code is in SSA
+/// form, so there should only be one definition.
+MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const {
+  // Since we are in SSA form, we can use the first definition.
+  def_iterator I = def_begin(Reg);
+  assert((I.atEnd() || llvm::next(I) == def_end()) &&
+         "getVRegDef assumes a single definition or no definition");
+  return !I.atEnd() ? &*I : 0;
+}
+
+/// getUniqueVRegDef - Return the unique machine instr that defines the
+/// specified virtual register or null if none is found.  If there are
+/// multiple definitions or no definition, return null.
+MachineInstr *MachineRegisterInfo::getUniqueVRegDef(unsigned Reg) const {
+  if (def_empty(Reg)) return 0;
+  def_iterator I = def_begin(Reg);
+  if (llvm::next(I) != def_end())
+    return 0;
+  return &*I;
+}
+
+bool MachineRegisterInfo::hasOneNonDBGUse(unsigned RegNo) const {
+  use_nodbg_iterator UI = use_nodbg_begin(RegNo);
+  if (UI == use_nodbg_end())
+    return false;
+  return ++UI == use_nodbg_end();
+}
+
+/// clearKillFlags - Iterate over all the uses of the given register and
+/// clear the kill flag from the MachineOperand. This function is used by
+/// optimization passes which extend register lifetimes and need only
+/// preserve conservative kill flag information.
+void MachineRegisterInfo::clearKillFlags(unsigned Reg) const {
+  for (use_iterator UI = use_begin(Reg), UE = use_end(); UI != UE; ++UI)
+    UI.getOperand().setIsKill(false);
+}
+
+bool MachineRegisterInfo::isLiveIn(unsigned Reg) const {
+  for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
+    if (I->first == Reg || I->second == Reg)
+      return true;
+  return false;
+}
+
+/// getLiveInPhysReg - If VReg is a live-in virtual register, return the
+/// corresponding live-in physical register.
+unsigned MachineRegisterInfo::getLiveInPhysReg(unsigned VReg) const {
+  for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
+    if (I->second == VReg)
+      return I->first;
+  return 0;
+}
+
+/// getLiveInVirtReg - If PReg is a live-in physical register, return the
+/// corresponding live-in physical register.
+unsigned MachineRegisterInfo::getLiveInVirtReg(unsigned PReg) const {
+  for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
+    if (I->first == PReg)
+      return I->second;
+  return 0;
+}
+
+/// EmitLiveInCopies - Emit copies to initialize livein virtual registers
+/// into the given entry block.
+void
+MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB,
+                                      const TargetRegisterInfo &TRI,
+                                      const TargetInstrInfo &TII) {
+  // Emit the copies into the top of the block.
+  for (unsigned i = 0, e = LiveIns.size(); i != e; ++i)
+    if (LiveIns[i].second) {
+      if (use_empty(LiveIns[i].second)) {
+        // The livein has no uses. Drop it.
+        //
+        // It would be preferable to have isel avoid creating live-in
+        // records for unused arguments in the first place, but it's
+        // complicated by the debug info code for arguments.
+        LiveIns.erase(LiveIns.begin() + i);
+        --i; --e;
+      } else {
+        // Emit a copy.
+        BuildMI(*EntryMBB, EntryMBB->begin(), DebugLoc(),
+                TII.get(TargetOpcode::COPY), LiveIns[i].second)
+          .addReg(LiveIns[i].first);
+
+        // Add the register to the entry block live-in set.
+        EntryMBB->addLiveIn(LiveIns[i].first);
+      }
+    } else {
+      // Add the register to the entry block live-in set.
+      EntryMBB->addLiveIn(LiveIns[i].first);
+    }
+}
+
+#ifndef NDEBUG
+void MachineRegisterInfo::dumpUses(unsigned Reg) const {
+  for (use_iterator I = use_begin(Reg), E = use_end(); I != E; ++I)
+    I.getOperand().getParent()->dump();
+}
+#endif
+
+void MachineRegisterInfo::freezeReservedRegs(const MachineFunction &MF) {
+  ReservedRegs = TRI->getReservedRegs(MF);
+  assert(ReservedRegs.size() == TRI->getNumRegs() &&
+         "Invalid ReservedRegs vector from target");
+}
+
+bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg,
+                                            const MachineFunction &MF) const {
+  assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
+
+  // Check if any overlapping register is modified, or allocatable so it may be
+  // used later.
+  for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI)
+    if (!def_empty(*AI) || isAllocatable(*AI))
+      return false;
+  return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp
new file mode 100644
index 000000000000..bb6aad7f948e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp
@@ -0,0 +1,364 @@
+//===- MachineSSAUpdater.cpp - Unstructured SSA Update Tool ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MachineSSAUpdater class. It's based on SSAUpdater
+// class in lib/Transforms/Utils.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+using namespace llvm;
+
+typedef DenseMap<MachineBasicBlock*, unsigned> AvailableValsTy;
+static AvailableValsTy &getAvailableVals(void *AV) {
+  return *static_cast<AvailableValsTy*>(AV);
+}
+
+MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF,
+                                     SmallVectorImpl<MachineInstr*> *NewPHI)
+  : AV(0), InsertedPHIs(NewPHI) {
+  TII = MF.getTarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+}
+
+MachineSSAUpdater::~MachineSSAUpdater() {
+  delete static_cast<AvailableValsTy*>(AV);
+}
+
+/// Initialize - Reset this object to get ready for a new set of SSA
+/// updates.  ProtoValue is the value used to name PHI nodes.
+void MachineSSAUpdater::Initialize(unsigned V) {
+  if (AV == 0)
+    AV = new AvailableValsTy();
+  else
+    getAvailableVals(AV).clear();
+
+  VR = V;
+  VRC = MRI->getRegClass(VR);
+}
+
+/// HasValueForBlock - Return true if the MachineSSAUpdater already has a value for
+/// the specified block.
+bool MachineSSAUpdater::HasValueForBlock(MachineBasicBlock *BB) const {
+  return getAvailableVals(AV).count(BB);
+}
+
+/// AddAvailableValue - Indicate that a rewritten value is available in the
+/// specified block with the specified value.
+void MachineSSAUpdater::AddAvailableValue(MachineBasicBlock *BB, unsigned V) {
+  getAvailableVals(AV)[BB] = V;
+}
+
+/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
+/// live at the end of the specified block.
+unsigned MachineSSAUpdater::GetValueAtEndOfBlock(MachineBasicBlock *BB) {
+  return GetValueAtEndOfBlockInternal(BB);
+}
+
+static
+unsigned LookForIdenticalPHI(MachineBasicBlock *BB,
+          SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> &PredValues) {
+  if (BB->empty())
+    return 0;
+
+  MachineBasicBlock::iterator I = BB->begin();
+  if (!I->isPHI())
+    return 0;
+
+  AvailableValsTy AVals;
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    AVals[PredValues[i].first] = PredValues[i].second;
+  while (I != BB->end() && I->isPHI()) {
+    bool Same = true;
+    for (unsigned i = 1, e = I->getNumOperands(); i != e; i += 2) {
+      unsigned SrcReg = I->getOperand(i).getReg();
+      MachineBasicBlock *SrcBB = I->getOperand(i+1).getMBB();
+      if (AVals[SrcBB] != SrcReg) {
+        Same = false;
+        break;
+      }
+    }
+    if (Same)
+      return I->getOperand(0).getReg();
+    ++I;
+  }
+  return 0;
+}
+
+/// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which define
+/// a value of the given register class at the start of the specified basic
+/// block. It returns the virtual register defined by the instruction.
+static
+MachineInstrBuilder InsertNewDef(unsigned Opcode,
+                           MachineBasicBlock *BB, MachineBasicBlock::iterator I,
+                           const TargetRegisterClass *RC,
+                           MachineRegisterInfo *MRI,
+                           const TargetInstrInfo *TII) {
+  unsigned NewVR = MRI->createVirtualRegister(RC);
+  return BuildMI(*BB, I, DebugLoc(), TII->get(Opcode), NewVR);
+}
+
+/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that
+/// is live in the middle of the specified block.
+///
+/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one
+/// important case: if there is a definition of the rewritten value after the
+/// 'use' in BB.  Consider code like this:
+///
+///      X1 = ...
+///   SomeBB:
+///      use(X)
+///      X2 = ...
+///      br Cond, SomeBB, OutBB
+///
+/// In this case, there are two values (X1 and X2) added to the AvailableVals
+/// set by the client of the rewriter, and those values are both live out of
+/// their respective blocks.  However, the use of X happens in the *middle* of
+/// a block.  Because of this, we need to insert a new PHI node in SomeBB to
+/// merge the appropriate values, and this value isn't live out of the block.
+///
+unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
+  // If there is no definition of the renamed variable in this block, just use
+  // GetValueAtEndOfBlock to do our work.
+  if (!HasValueForBlock(BB))
+    return GetValueAtEndOfBlockInternal(BB);
+
+  // If there are no predecessors, just return undef.
+  if (BB->pred_empty()) {
+    // Insert an implicit_def to represent an undef value.
+    MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
+                                        BB, BB->getFirstTerminator(),
+                                        VRC, MRI, TII);
+    return NewDef->getOperand(0).getReg();
+  }
+
+  // Otherwise, we have the hard case.  Get the live-in values for each
+  // predecessor.
+  SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> PredValues;
+  unsigned SingularValue = 0;
+
+  bool isFirstPred = true;
+  for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+         E = BB->pred_end(); PI != E; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+    unsigned PredVal = GetValueAtEndOfBlockInternal(PredBB);
+    PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+    // Compute SingularValue.
+    if (isFirstPred) {
+      SingularValue = PredVal;
+      isFirstPred = false;
+    } else if (PredVal != SingularValue)
+      SingularValue = 0;
+  }
+
+  // Otherwise, if all the merged values are the same, just use it.
+  if (SingularValue != 0)
+    return SingularValue;
+
+  // If an identical PHI is already in BB, just reuse it.
+  unsigned DupPHI = LookForIdenticalPHI(BB, PredValues);
+  if (DupPHI)
+    return DupPHI;
+
+  // Otherwise, we do need a PHI: insert one now.
+  MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
+  MachineInstrBuilder InsertedPHI = InsertNewDef(TargetOpcode::PHI, BB,
+                                                 Loc, VRC, MRI, TII);
+
+  // Fill in all the predecessors of the PHI.
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    InsertedPHI.addReg(PredValues[i].second).addMBB(PredValues[i].first);
+
+  // See if the PHI node can be merged to a single value.  This can happen in
+  // loop cases when we get a PHI of itself and one other value.
+  if (unsigned ConstVal = InsertedPHI->isConstantValuePHI()) {
+    InsertedPHI->eraseFromParent();
+    return ConstVal;
+  }
+
+  // If the client wants to know about all new instructions, tell it.
+  if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+  DEBUG(dbgs() << "  Inserted PHI: " << *InsertedPHI << "\n");
+  return InsertedPHI->getOperand(0).getReg();
+}
+
+static
+MachineBasicBlock *findCorrespondingPred(const MachineInstr *MI,
+                                         MachineOperand *U) {
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+    if (&MI->getOperand(i) == U)
+      return MI->getOperand(i+1).getMBB();
+  }
+
+  llvm_unreachable("MachineOperand::getParent() failure?");
+}
+
+/// RewriteUse - Rewrite a use of the symbolic value.  This handles PHI nodes,
+/// which use their value in the corresponding predecessor.
+void MachineSSAUpdater::RewriteUse(MachineOperand &U) {
+  MachineInstr *UseMI = U.getParent();
+  unsigned NewVR = 0;
+  if (UseMI->isPHI()) {
+    MachineBasicBlock *SourceBB = findCorrespondingPred(UseMI, &U);
+    NewVR = GetValueAtEndOfBlockInternal(SourceBB);
+  } else {
+    NewVR = GetValueInMiddleOfBlock(UseMI->getParent());
+  }
+
+  U.setReg(NewVR);
+}
+
+void MachineSSAUpdater::ReplaceRegWith(unsigned OldReg, unsigned NewReg) {
+  MRI->replaceRegWith(OldReg, NewReg);
+
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+  for (DenseMap<MachineBasicBlock*, unsigned>::iterator
+         I = AvailableVals.begin(), E = AvailableVals.end(); I != E; ++I)
+    if (I->second == OldReg)
+      I->second = NewReg;
+}
+
+/// SSAUpdaterTraits<MachineSSAUpdater> - Traits for the SSAUpdaterImpl
+/// template, specialized for MachineSSAUpdater.
+namespace llvm {
+template<>
+class SSAUpdaterTraits<MachineSSAUpdater> {
+public:
+  typedef MachineBasicBlock BlkT;
+  typedef unsigned ValT;
+  typedef MachineInstr PhiT;
+
+  typedef MachineBasicBlock::succ_iterator BlkSucc_iterator;
+  static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return BB->succ_begin(); }
+  static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return BB->succ_end(); }
+
+  /// Iterator for PHI operands.
+  class PHI_iterator {
+  private:
+    MachineInstr *PHI;
+    unsigned idx;
+ 
+  public:
+    explicit PHI_iterator(MachineInstr *P) // begin iterator
+      : PHI(P), idx(1) {}
+    PHI_iterator(MachineInstr *P, bool) // end iterator
+      : PHI(P), idx(PHI->getNumOperands()) {}
+
+    PHI_iterator &operator++() { idx += 2; return *this; } 
+    bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
+    bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+    unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); }
+    MachineBasicBlock *getIncomingBlock() {
+      return PHI->getOperand(idx+1).getMBB();
+    }
+  };
+  static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+  static inline PHI_iterator PHI_end(PhiT *PHI) {
+    return PHI_iterator(PHI, true);
+  }
+
+  /// FindPredecessorBlocks - Put the predecessors of BB into the Preds
+  /// vector.
+  static void FindPredecessorBlocks(MachineBasicBlock *BB,
+                                    SmallVectorImpl<MachineBasicBlock*> *Preds){
+    for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+           E = BB->pred_end(); PI != E; ++PI)
+      Preds->push_back(*PI);
+  }
+
+  /// GetUndefVal - Create an IMPLICIT_DEF instruction with a new register.
+  /// Add it into the specified block and return the register.
+  static unsigned GetUndefVal(MachineBasicBlock *BB,
+                              MachineSSAUpdater *Updater) {
+    // Insert an implicit_def to represent an undef value.
+    MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
+                                        BB, BB->getFirstTerminator(),
+                                        Updater->VRC, Updater->MRI,
+                                        Updater->TII);
+    return NewDef->getOperand(0).getReg();
+  }
+
+  /// CreateEmptyPHI - Create a PHI instruction that defines a new register.
+  /// Add it into the specified block and return the register.
+  static unsigned CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds,
+                                 MachineSSAUpdater *Updater) {
+    MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
+    MachineInstr *PHI = InsertNewDef(TargetOpcode::PHI, BB, Loc,
+                                     Updater->VRC, Updater->MRI,
+                                     Updater->TII);
+    return PHI->getOperand(0).getReg();
+  }
+
+  /// AddPHIOperand - Add the specified value as an operand of the PHI for
+  /// the specified predecessor block.
+  static void AddPHIOperand(MachineInstr *PHI, unsigned Val,
+                            MachineBasicBlock *Pred) {
+    MachineInstrBuilder(*Pred->getParent(), PHI).addReg(Val).addMBB(Pred);
+  }
+
+  /// InstrIsPHI - Check if an instruction is a PHI.
+  ///
+  static MachineInstr *InstrIsPHI(MachineInstr *I) {
+    if (I && I->isPHI())
+      return I;
+    return 0;
+  }
+
+  /// ValueIsPHI - Check if the instruction that defines the specified register
+  /// is a PHI instruction.
+  static MachineInstr *ValueIsPHI(unsigned Val, MachineSSAUpdater *Updater) {
+    return InstrIsPHI(Updater->MRI->getVRegDef(Val));
+  }
+
+  /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+  /// operands, i.e., it was just added.
+  static MachineInstr *ValueIsNewPHI(unsigned Val, MachineSSAUpdater *Updater) {
+    MachineInstr *PHI = ValueIsPHI(Val, Updater);
+    if (PHI && PHI->getNumOperands() <= 1)
+      return PHI;
+    return 0;
+  }
+
+  /// GetPHIValue - For the specified PHI instruction, return the register
+  /// that it defines.
+  static unsigned GetPHIValue(MachineInstr *PHI) {
+    return PHI->getOperand(0).getReg();
+  }
+};
+
+} // End llvm namespace
+
+/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry
+/// for the specified BB and if so, return it.  If not, construct SSA form by
+/// first calculating the required placement of PHIs and then inserting new
+/// PHIs where needed.
+unsigned MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB){
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+  if (unsigned V = AvailableVals[BB])
+    return V;
+
+  SSAUpdaterImpl<MachineSSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+  return Impl.GetValue(BB);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
new file mode 100644
index 000000000000..5bd2349b50f6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -0,0 +1,2396 @@
+//===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "misched"
+
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include <queue>
+
+using namespace llvm;
+
+namespace llvm {
+cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
+                           cl::desc("Force top-down list scheduling"));
+cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
+                            cl::desc("Force bottom-up list scheduling"));
+}
+
+#ifndef NDEBUG
+static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
+  cl::desc("Pop up a window to show MISched dags after they are processed"));
+
+static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
+  cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
+#else
+static bool ViewMISchedDAGs = false;
+#endif // NDEBUG
+
+// Experimental heuristics
+static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
+  cl::desc("Enable load clustering."), cl::init(true));
+
+// Experimental heuristics
+static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
+  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
+  cl::desc("Verify machine instrs before and after machine scheduling"));
+
+// DAG subtrees must have at least this many nodes.
+static const unsigned MinSubtreeSize = 8;
+
+//===----------------------------------------------------------------------===//
+// Machine Instruction Scheduling Pass and Registry
+//===----------------------------------------------------------------------===//
+
+MachineSchedContext::MachineSchedContext():
+    MF(0), MLI(0), MDT(0), PassConfig(0), AA(0), LIS(0) {
+  RegClassInfo = new RegisterClassInfo();
+}
+
+MachineSchedContext::~MachineSchedContext() {
+  delete RegClassInfo;
+}
+
+namespace {
+/// MachineScheduler runs after coalescing and before register allocation.
+class MachineScheduler : public MachineSchedContext,
+                         public MachineFunctionPass {
+public:
+  MachineScheduler();
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  virtual void releaseMemory() {}
+
+  virtual bool runOnMachineFunction(MachineFunction&);
+
+  virtual void print(raw_ostream &O, const Module* = 0) const;
+
+  static char ID; // Class identification, replacement for typeinfo
+};
+} // namespace
+
+char MachineScheduler::ID = 0;
+
+char &llvm::MachineSchedulerID = MachineScheduler::ID;
+
+INITIALIZE_PASS_BEGIN(MachineScheduler, "misched",
+                      "Machine Instruction Scheduler", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(MachineScheduler, "misched",
+                    "Machine Instruction Scheduler", false, false)
+
+MachineScheduler::MachineScheduler()
+: MachineFunctionPass(ID) {
+  initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
+}
+
+void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequiredID(MachineDominatorsID);
+  AU.addRequired<MachineLoopInfo>();
+  AU.addRequired<AliasAnalysis>();
+  AU.addRequired<TargetPassConfig>();
+  AU.addRequired<SlotIndexes>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequired<LiveIntervals>();
+  AU.addPreserved<LiveIntervals>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachinePassRegistry MachineSchedRegistry::Registry;
+
+/// A dummy default scheduler factory indicates whether the scheduler
+/// is overridden on the command line.
+static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
+  return 0;
+}
+
+/// MachineSchedOpt allows command line selection of the scheduler.
+static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
+               RegisterPassParser<MachineSchedRegistry> >
+MachineSchedOpt("misched",
+                cl::init(&useDefaultMachineSched), cl::Hidden,
+                cl::desc("Machine instruction scheduler to use"));
+
+static MachineSchedRegistry
+DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
+                     useDefaultMachineSched);
+
+/// Forward declare the standard machine scheduler. This will be used as the
+/// default scheduler if the target does not set a default.
+static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
+
+
+/// Decrement this iterator until reaching the top or a non-debug instr.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+  assert(I != Beg && "reached the top of the region, cannot decrement");
+  while (--I != Beg) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+  for(; I != End; ++I) {
+    if (!I->isDebugValue())
+      break;
+  }
+  return I;
+}
+
+/// Top-level MachineScheduler pass driver.
+///
+/// Visit blocks in function order. Divide each block into scheduling regions
+/// and visit them bottom-up. Visiting regions bottom-up is not required, but is
+/// consistent with the DAG builder, which traverses the interior of the
+/// scheduling regions bottom-up.
+///
+/// This design avoids exposing scheduling boundaries to the DAG builder,
+/// simplifying the DAG builder's support for "special" target instructions.
+/// At the same time the design allows target schedulers to operate across
+/// scheduling boundaries, for example to bundle the boudary instructions
+/// without reordering them. This creates complexity, because the target
+/// scheduler must update the RegionBegin and RegionEnd positions cached by
+/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
+/// design would be to split blocks at scheduling boundaries, but LLVM has a
+/// general bias against block splitting purely for implementation simplicity.
+bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+  DEBUG(dbgs() << "Before MISsched:\n"; mf.print(dbgs()));
+
+  // Initialize the context of the pass.
+  MF = &mf;
+  MLI = &getAnalysis<MachineLoopInfo>();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  PassConfig = &getAnalysis<TargetPassConfig>();
+  AA = &getAnalysis<AliasAnalysis>();
+
+  LIS = &getAnalysis<LiveIntervals>();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+
+  if (VerifyScheduling) {
+    DEBUG(LIS->print(dbgs()));
+    MF->verify(this, "Before machine scheduling.");
+  }
+  RegClassInfo->runOnMachineFunction(*MF);
+
+  // Select the scheduler, or set the default.
+  MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
+  if (Ctor == useDefaultMachineSched) {
+    // Get the default scheduler set by the target.
+    Ctor = MachineSchedRegistry::getDefault();
+    if (!Ctor) {
+      Ctor = createConvergingSched;
+      MachineSchedRegistry::setDefault(Ctor);
+    }
+  }
+  // Instantiate the selected scheduler.
+  OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));
+
+  // Visit all machine basic blocks.
+  //
+  // TODO: Visit blocks in global postorder or postorder within the bottom-up
+  // loop tree. Then we can optionally compute global RegPressure.
+  for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
+       MBB != MBBEnd; ++MBB) {
+
+    Scheduler->startBlock(MBB);
+
+    // Break the block into scheduling regions [I, RegionEnd), and schedule each
+    // region as soon as it is discovered. RegionEnd points the scheduling
+    // boundary at the bottom of the region. The DAG does not include RegionEnd,
+    // but the region does (i.e. the next RegionEnd is above the previous
+    // RegionBegin). If the current block has no terminator then RegionEnd ==
+    // MBB->end() for the bottom region.
+    //
+    // The Scheduler may insert instructions during either schedule() or
+    // exitRegion(), even for empty regions. So the local iterators 'I' and
+    // 'RegionEnd' are invalid across these calls.
+    unsigned RemainingInstrs = MBB->size();
+    for(MachineBasicBlock::iterator RegionEnd = MBB->end();
+        RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {
+
+      // Avoid decrementing RegionEnd for blocks with no terminator.
+      if (RegionEnd != MBB->end()
+          || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
+        --RegionEnd;
+        // Count the boundary instruction.
+        --RemainingInstrs;
+      }
+
+      // The next region starts above the previous region. Look backward in the
+      // instruction stream until we find the nearest boundary.
+      MachineBasicBlock::iterator I = RegionEnd;
+      for(;I != MBB->begin(); --I, --RemainingInstrs) {
+        if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
+          break;
+      }
+      // Notify the scheduler of the region, even if we may skip scheduling
+      // it. Perhaps it still needs to be bundled.
+      Scheduler->enterRegion(MBB, I, RegionEnd, RemainingInstrs);
+
+      // Skip empty scheduling regions (0 or 1 schedulable instructions).
+      if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
+        // Close the current region. Bundle the terminator if needed.
+        // This invalidates 'RegionEnd' and 'I'.
+        Scheduler->exitRegion();
+        continue;
+      }
+      DEBUG(dbgs() << "********** MI Scheduling **********\n");
+      DEBUG(dbgs() << MF->getName()
+            << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+            << "\n  From: " << *I << "    To: ";
+            if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+            else dbgs() << "End";
+            dbgs() << " Remaining: " << RemainingInstrs << "\n");
+
+      // Schedule a region: possibly reorder instructions.
+      // This invalidates 'RegionEnd' and 'I'.
+      Scheduler->schedule();
+
+      // Close the current region.
+      Scheduler->exitRegion();
+
+      // Scheduling has invalidated the current iterator 'I'. Ask the
+      // scheduler for the top of it's scheduled region.
+      RegionEnd = Scheduler->begin();
+    }
+    assert(RemainingInstrs == 0 && "Instruction count mismatch!");
+    Scheduler->finishBlock();
+  }
+  Scheduler->finalizeSchedule();
+  DEBUG(LIS->print(dbgs()));
+  if (VerifyScheduling)
+    MF->verify(this, "After machine scheduling.");
+  return true;
+}
+
+void MachineScheduler::print(raw_ostream &O, const Module* m) const {
+  // unimplemented
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ReadyQueue::dump() {
+  dbgs() << "  " << Name << ": ";
+  for (unsigned i = 0, e = Queue.size(); i < e; ++i)
+    dbgs() << Queue[i]->NodeNum << " ";
+  dbgs() << "\n";
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// ScheduleDAGMI - Base class for MachineInstr scheduling with LiveIntervals
+// preservation.
+//===----------------------------------------------------------------------===//
+
+ScheduleDAGMI::~ScheduleDAGMI() {
+  delete DFSResult;
+  DeleteContainerPointers(Mutations);
+  delete SchedImpl;
+}
+
+bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
+  if (SuccSU != &ExitSU) {
+    // Do not use WillCreateCycle, it assumes SD scheduling.
+    // If Pred is reachable from Succ, then the edge creates a cycle.
+    if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
+      return false;
+    Topo.AddPred(SuccSU, PredDep.getSUnit());
+  }
+  SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
+  // Return true regardless of whether a new edge needed to be inserted.
+  return true;
+}
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
+/// NumPredsLeft reaches zero, release the successor node.
+///
+/// FIXME: Adjust SuccSU height based on MinLatency.
+void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    --SuccSU->WeakPredsLeft;
+    if (SuccEdge->isCluster())
+      NextClusterSucc = SuccSU;
+    return;
+  }
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --SuccSU->NumPredsLeft;
+  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+    SchedImpl->releaseTopNode(SuccSU);
+}
+
+/// releaseSuccessors - Call releaseSucc on each of SU's successors.
+void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    releaseSucc(SU, &*I);
+  }
+}
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
+/// NumSuccsLeft reaches zero, release the predecessor node.
+///
+/// FIXME: Adjust PredSU height based on MinLatency.
+void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
+  SUnit *PredSU = PredEdge->getSUnit();
+
+  if (PredEdge->isWeak()) {
+    --PredSU->WeakSuccsLeft;
+    if (PredEdge->isCluster())
+      NextClusterPred = PredSU;
+    return;
+  }
+#ifndef NDEBUG
+  if (PredSU->NumSuccsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    PredSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --PredSU->NumSuccsLeft;
+  if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
+    SchedImpl->releaseBottomNode(PredSU);
+}
+
+/// releasePredecessors - Call releasePred on each of SU's predecessors.
+void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    releasePred(SU, &*I);
+  }
+}
+
+void ScheduleDAGMI::moveInstruction(MachineInstr *MI,
+                                    MachineBasicBlock::iterator InsertPos) {
+  // Advance RegionBegin if the first instruction moves down.
+  if (&*RegionBegin == MI)
+    ++RegionBegin;
+
+  // Update the instruction stream.
+  BB->splice(InsertPos, BB, MI);
+
+  // Update LiveIntervals
+  LIS->handleMove(MI, /*UpdateFlags=*/true);
+
+  // Recede RegionBegin if an instruction moves above the first.
+  if (RegionBegin == InsertPos)
+    RegionBegin = MI;
+}
+
+bool ScheduleDAGMI::checkSchedLimit() {
+#ifndef NDEBUG
+  if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
+    CurrentTop = CurrentBottom;
+    return false;
+  }
+  ++NumInstrsScheduled;
+#endif
+  return true;
+}
+
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
+                                MachineBasicBlock::iterator begin,
+                                MachineBasicBlock::iterator end,
+                                unsigned endcount)
+{
+  ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+
+  // For convenience remember the end of the liveness region.
+  LiveRegionEnd =
+    (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
+}
+
+// Setup the register pressure trackers for the top scheduled top and bottom
+// scheduled regions.
+void ScheduleDAGMI::initRegPressure() {
+  TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+  BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Close the RPTracker to finalize live ins.
+  RPTracker.closeRegion();
+
+  DEBUG(RPTracker.getPressure().dump(TRI));
+
+  // Initialize the live ins and live outs.
+  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+  // Close one end of the tracker so we can call
+  // getMaxUpward/DownwardPressureDelta before advancing across any
+  // instructions. This converts currently live regs into live ins/outs.
+  TopRPTracker.closeTop();
+  BotRPTracker.closeBottom();
+
+  // Account for liveness generated by the region boundary.
+  if (LiveRegionEnd != RegionEnd)
+    BotRPTracker.recede();
+
+  assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
+
+  // Cache the list of excess pressure sets in this region. This will also track
+  // the max pressure in the scheduled code for these sets.
+  RegionCriticalPSets.clear();
+  const std::vector<unsigned> &RegionPressure =
+    RPTracker.getPressure().MaxSetPressure;
+  for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
+    unsigned Limit = TRI->getRegPressureSetLimit(i);
+    DEBUG(dbgs() << TRI->getRegPressureSetName(i)
+          << "Limit " << Limit
+          << " Actual " << RegionPressure[i] << "\n");
+    if (RegionPressure[i] > Limit)
+      RegionCriticalPSets.push_back(PressureElement(i, 0));
+  }
+  DEBUG(dbgs() << "Excess PSets: ";
+        for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
+          dbgs() << TRI->getRegPressureSetName(
+            RegionCriticalPSets[i].PSetID) << " ";
+        dbgs() << "\n");
+}
+
+// FIXME: When the pressure tracker deals in pressure differences then we won't
+// iterate over all RegionCriticalPSets[i].
+void ScheduleDAGMI::
+updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure) {
+  for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
+    unsigned ID = RegionCriticalPSets[i].PSetID;
+    int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
+    if ((int)NewMaxPressure[ID] > MaxUnits)
+      MaxUnits = NewMaxPressure[ID];
+  }
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+///
+/// This is a skeletal driver, with all the functionality pushed into helpers,
+/// so that it can be easilly extended by experimental schedulers. Generally,
+/// implementing MachineSchedStrategy should be sufficient to implement a new
+/// scheduling algorithm. However, if a scheduler further subclasses
+/// ScheduleDAGMI then it will want to override this virtual method in order to
+/// update any specialized state.
+void ScheduleDAGMI::schedule() {
+  buildDAGWithRegPressure();
+
+  Topo.InitDAGTopologicalSorting();
+
+  postprocessDAG();
+
+  SmallVector<SUnit*, 8> TopRoots, BotRoots;
+  findRootsAndBiasEdges(TopRoots, BotRoots);
+
+  // Initialize the strategy before modifying the DAG.
+  // This may initialize a DFSResult to be used for queue priority.
+  SchedImpl->initialize(this);
+
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+  if (ViewMISchedDAGs) viewGraph();
+
+  // Initialize ready queues now that the DAG and priority data are finalized.
+  initQueues(TopRoots, BotRoots);
+
+  bool IsTopNode = false;
+  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+    assert(!SU->isScheduled && "Node already scheduled");
+    if (!checkSchedLimit())
+      break;
+
+    scheduleMI(SU, IsTopNode);
+
+    updateQueues(SU, IsTopNode);
+  }
+  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+
+  DEBUG({
+      unsigned BBNum = begin()->getParent()->getNumber();
+      dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+      dumpSchedule();
+      dbgs() << '\n';
+    });
+}
+
+/// Build the DAG and setup three register pressure trackers.
+void ScheduleDAGMI::buildDAGWithRegPressure() {
+  // Initialize the register pressure tracker used by buildSchedGraph.
+  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Account for liveness generate by the region boundary.
+  if (LiveRegionEnd != RegionEnd)
+    RPTracker.recede();
+
+  // Build the DAG, and compute current register pressure.
+  buildSchedGraph(AA, &RPTracker);
+
+  // Initialize top/bottom trackers after computing region pressure.
+  initRegPressure();
+}
+
+/// Apply each ScheduleDAGMutation step in order.
+void ScheduleDAGMI::postprocessDAG() {
+  for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
+    Mutations[i]->apply(this);
+  }
+}
+
+void ScheduleDAGMI::computeDFSResult() {
+  if (!DFSResult)
+    DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
+  DFSResult->clear();
+  ScheduledTrees.clear();
+  DFSResult->resize(SUnits.size());
+  DFSResult->compute(SUnits);
+  ScheduledTrees.resize(DFSResult->getNumSubtrees());
+}
+
+void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
+                                          SmallVectorImpl<SUnit*> &BotRoots) {
+  for (std::vector<SUnit>::iterator
+         I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
+    SUnit *SU = &(*I);
+    assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits");
+
+    // Order predecessors so DFSResult follows the critical path.
+    SU->biasCriticalPath();
+
+    // A SUnit is ready to top schedule if it has no predecessors.
+    if (!I->NumPredsLeft)
+      TopRoots.push_back(SU);
+    // A SUnit is ready to bottom schedule if it has no successors.
+    if (!I->NumSuccsLeft)
+      BotRoots.push_back(SU);
+  }
+  ExitSU.biasCriticalPath();
+}
+
+/// Identify DAG roots and setup scheduler queues.
+void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
+                               ArrayRef<SUnit*> BotRoots) {
+  NextClusterSucc = NULL;
+  NextClusterPred = NULL;
+
+  // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
+  //
+  // Nodes with unreleased weak edges can still be roots.
+  // Release top roots in forward order.
+  for (SmallVectorImpl<SUnit*>::const_iterator
+         I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) {
+    SchedImpl->releaseTopNode(*I);
+  }
+  // Release bottom roots in reverse order so the higher priority nodes appear
+  // first. This is more natural and slightly more efficient.
+  for (SmallVectorImpl<SUnit*>::const_reverse_iterator
+         I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
+    SchedImpl->releaseBottomNode(*I);
+  }
+
+  releaseSuccessors(&EntrySU);
+  releasePredecessors(&ExitSU);
+
+  SchedImpl->registerRoots();
+
+  // Advance past initial DebugValues.
+  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+  CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
+  TopRPTracker.setPos(CurrentTop);
+
+  CurrentBottom = RegionEnd;
+}
+
+/// Move an instruction and update register pressure.
+void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
+  // Move the instruction to its new location in the instruction stream.
+  MachineInstr *MI = SU->getInstr();
+
+  if (IsTopNode) {
+    assert(SU->isTopReady() && "node still has unscheduled dependencies");
+    if (&*CurrentTop == MI)
+      CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+    else {
+      moveInstruction(MI, CurrentTop);
+      TopRPTracker.setPos(MI);
+    }
+
+    // Update top scheduled pressure.
+    TopRPTracker.advance();
+    assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+    updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+  }
+  else {
+    assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+    MachineBasicBlock::iterator priorII =
+      priorNonDebug(CurrentBottom, CurrentTop);
+    if (&*priorII == MI)
+      CurrentBottom = priorII;
+    else {
+      if (&*CurrentTop == MI) {
+        CurrentTop = nextIfDebug(++CurrentTop, priorII);
+        TopRPTracker.setPos(CurrentTop);
+      }
+      moveInstruction(MI, CurrentBottom);
+      CurrentBottom = MI;
+    }
+    // Update bottom scheduled pressure.
+    BotRPTracker.recede();
+    assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+    updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
+  }
+}
+
+/// Update scheduler queues after scheduling an instruction.
+void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
+  // Release dependent instructions for scheduling.
+  if (IsTopNode)
+    releaseSuccessors(SU);
+  else
+    releasePredecessors(SU);
+
+  SU->isScheduled = true;
+
+  if (DFSResult) {
+    unsigned SubtreeID = DFSResult->getSubtreeID(SU);
+    if (!ScheduledTrees.test(SubtreeID)) {
+      ScheduledTrees.set(SubtreeID);
+      DFSResult->scheduleTree(SubtreeID);
+      SchedImpl->scheduleTree(SubtreeID);
+    }
+  }
+
+  // Notify the scheduling strategy after updating the DAG.
+  SchedImpl->schedNode(SU, IsTopNode);
+}
+
+/// Reinsert any remaining debug_values, just like the PostRA scheduler.
+void ScheduleDAGMI::placeDebugValues() {
+  // If first instruction was a DBG_VALUE then put it back.
+  if (FirstDbgValue) {
+    BB->splice(RegionBegin, BB, FirstDbgValue);
+    RegionBegin = FirstDbgValue;
+  }
+
+  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+    MachineInstr *DbgValue = P.first;
+    MachineBasicBlock::iterator OrigPrevMI = P.second;
+    if (&*RegionBegin == DbgValue)
+      ++RegionBegin;
+    BB->splice(++OrigPrevMI, BB, DbgValue);
+    if (OrigPrevMI == llvm::prior(RegionEnd))
+      RegionEnd = DbgValue;
+  }
+  DbgValues.clear();
+  FirstDbgValue = NULL;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ScheduleDAGMI::dumpSchedule() const {
+  for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
+    if (SUnit *SU = getSUnit(&(*MI)))
+      SU->dump(this);
+    else
+      dbgs() << "Missing SUnit\n";
+  }
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// LoadClusterMutation - DAG post-processing to cluster loads.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create cluster edges between neighboring
+/// loads.
+class LoadClusterMutation : public ScheduleDAGMutation {
+  struct LoadInfo {
+    SUnit *SU;
+    unsigned BaseReg;
+    unsigned Offset;
+    LoadInfo(SUnit *su, unsigned reg, unsigned ofs)
+      : SU(su), BaseReg(reg), Offset(ofs) {}
+  };
+  static bool LoadInfoLess(const LoadClusterMutation::LoadInfo &LHS,
+                           const LoadClusterMutation::LoadInfo &RHS);
+
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+public:
+  LoadClusterMutation(const TargetInstrInfo *tii,
+                      const TargetRegisterInfo *tri)
+    : TII(tii), TRI(tri) {}
+
+  virtual void apply(ScheduleDAGMI *DAG);
+protected:
+  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
+};
+} // anonymous
+
+bool LoadClusterMutation::LoadInfoLess(
+  const LoadClusterMutation::LoadInfo &LHS,
+  const LoadClusterMutation::LoadInfo &RHS) {
+  if (LHS.BaseReg != RHS.BaseReg)
+    return LHS.BaseReg < RHS.BaseReg;
+  return LHS.Offset < RHS.Offset;
+}
+
+void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
+                                                  ScheduleDAGMI *DAG) {
+  SmallVector<LoadClusterMutation::LoadInfo,32> LoadRecords;
+  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
+    SUnit *SU = Loads[Idx];
+    unsigned BaseReg;
+    unsigned Offset;
+    if (TII->getLdStBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
+      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+  }
+  if (LoadRecords.size() < 2)
+    return;
+  std::sort(LoadRecords.begin(), LoadRecords.end(), LoadInfoLess);
+  unsigned ClusterLength = 1;
+  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
+    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
+      ClusterLength = 1;
+      continue;
+    }
+
+    SUnit *SUa = LoadRecords[Idx].SU;
+    SUnit *SUb = LoadRecords[Idx+1].SU;
+    if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
+        && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+
+      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
+            << SUb->NodeNum << ")\n");
+      // Copy successor edges from SUa to SUb. Interleaving computation
+      // dependent on SUa can prevent load combining due to register reuse.
+      // Predecessor edges do not need to be copied from SUb to SUa since nearby
+      // loads should have effectively the same inputs.
+      for (SUnit::const_succ_iterator
+             SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
+        if (SI->getSUnit() == SUb)
+          continue;
+        DEBUG(dbgs() << "  Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
+        DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
+      }
+      ++ClusterLength;
+    }
+    else
+      ClusterLength = 1;
+  }
+}
+
+/// \brief Callback from DAG postProcessing to create cluster edges for loads.
+void LoadClusterMutation::apply(ScheduleDAGMI *DAG) {
+  // Map DAG NodeNum to store chain ID.
+  DenseMap<unsigned, unsigned> StoreChainIDs;
+  // Map each store chain to a set of dependent loads.
+  SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
+  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+    SUnit *SU = &DAG->SUnits[Idx];
+    if (!SU->getInstr()->mayLoad())
+      continue;
+    unsigned ChainPredID = DAG->SUnits.size();
+    for (SUnit::const_pred_iterator
+           PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
+      if (PI->isCtrl()) {
+        ChainPredID = PI->getSUnit()->NodeNum;
+        break;
+      }
+    }
+    // Check if this chain-like pred has been seen
+    // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
+    unsigned NumChains = StoreChainDependents.size();
+    std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
+      StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
+    if (Result.second)
+      StoreChainDependents.resize(NumChains + 1);
+    StoreChainDependents[Result.first->second].push_back(SU);
+  }
+  // Iterate over the store chains.
+  for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
+    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class MacroFusion : public ScheduleDAGMutation {
+  const TargetInstrInfo *TII;
+public:
+  MacroFusion(const TargetInstrInfo *tii): TII(tii) {}
+
+  virtual void apply(ScheduleDAGMI *DAG);
+};
+} // anonymous
+
+/// \brief Callback from DAG postProcessing to create cluster edges to encourage
+/// fused operations.
+void MacroFusion::apply(ScheduleDAGMI *DAG) {
+  // For now, assume targets can only fuse with the branch.
+  MachineInstr *Branch = DAG->ExitSU.getInstr();
+  if (!Branch)
+    return;
+
+  for (unsigned Idx = DAG->SUnits.size(); Idx > 0;) {
+    SUnit *SU = &DAG->SUnits[--Idx];
+    if (!TII->shouldScheduleAdjacent(SU->getInstr(), Branch))
+      continue;
+
+    // Create a single weak edge from SU to ExitSU. The only effect is to cause
+    // bottom-up scheduling to heavily prioritize the clustered SU.  There is no
+    // need to copy predecessor edges from ExitSU to SU, since top-down
+    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+    // of SU, we could create an artificial edge from the deepest root, but it
+    // hasn't been needed yet.
+    bool Success = DAG->addEdge(&DAG->ExitSU, SDep(SU, SDep::Cluster));
+    (void)Success;
+    assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+    DEBUG(dbgs() << "Macro Fuse SU(" << SU->NodeNum << ")\n");
+    break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
+/// the schedule.
+class ConvergingScheduler : public MachineSchedStrategy {
+public:
+  /// Represent the type of SchedCandidate found within a single queue.
+  /// pickNodeBidirectional depends on these listed by decreasing priority.
+  enum CandReason {
+    NoCand, SingleExcess, SingleCritical, Cluster,
+    ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
+    TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse,
+    NodeOrder};
+
+#ifndef NDEBUG
+  static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
+#endif
+
+  /// Policy for scheduling the next instruction in the candidate's zone.
+  struct CandPolicy {
+    bool ReduceLatency;
+    unsigned ReduceResIdx;
+    unsigned DemandResIdx;
+
+    CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
+  };
+
+  /// Status of an instruction's critical resource consumption.
+  struct SchedResourceDelta {
+    // Count critical resources in the scheduled region required by SU.
+    unsigned CritResources;
+
+    // Count critical resources from another region consumed by SU.
+    unsigned DemandedResources;
+
+    SchedResourceDelta(): CritResources(0), DemandedResources(0) {}
+
+    bool operator==(const SchedResourceDelta &RHS) const {
+      return CritResources == RHS.CritResources
+        && DemandedResources == RHS.DemandedResources;
+    }
+    bool operator!=(const SchedResourceDelta &RHS) const {
+      return !operator==(RHS);
+    }
+  };
+
+  /// Store the state used by ConvergingScheduler heuristics, required for the
+  /// lifetime of one invocation of pickNode().
+  struct SchedCandidate {
+    CandPolicy Policy;
+
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    // The reason for this candidate.
+    CandReason Reason;
+
+    // Register pressure values for the best candidate.
+    RegPressureDelta RPDelta;
+
+    // Critical resource consumption of the best candidate.
+    SchedResourceDelta ResDelta;
+
+    SchedCandidate(const CandPolicy &policy)
+    : Policy(policy), SU(NULL), Reason(NoCand) {}
+
+    bool isValid() const { return SU; }
+
+    // Copy the status of another candidate without changing policy.
+    void setBest(SchedCandidate &Best) {
+      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+      SU = Best.SU;
+      Reason = Best.Reason;
+      RPDelta = Best.RPDelta;
+      ResDelta = Best.ResDelta;
+    }
+
+    void initResourceDelta(const ScheduleDAGMI *DAG,
+                           const TargetSchedModel *SchedModel);
+  };
+
+  /// Summarize the unscheduled region.
+  struct SchedRemainder {
+    // Critical path through the DAG in expected latency.
+    unsigned CriticalPath;
+
+    // Unscheduled resources
+    SmallVector<unsigned, 16> RemainingCounts;
+    // Critical resource for the unscheduled zone.
+    unsigned CritResIdx;
+    // Number of micro-ops left to schedule.
+    unsigned RemainingMicroOps;
+
+    void reset() {
+      CriticalPath = 0;
+      RemainingCounts.clear();
+      CritResIdx = 0;
+      RemainingMicroOps = 0;
+    }
+
+    SchedRemainder() { reset(); }
+
+    void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
+
+    unsigned getMaxRemainingCount(const TargetSchedModel *SchedModel) const {
+      if (!SchedModel->hasInstrSchedModel())
+        return 0;
+
+      return std::max(
+        RemainingMicroOps * SchedModel->getMicroOpFactor(),
+        RemainingCounts[CritResIdx]);
+    }
+  };
+
+  /// Each Scheduling boundary is associated with ready queues. It tracks the
+  /// current cycle in the direction of movement, and maintains the state
+  /// of "hazards" and other interlocks at the current cycle.
+  struct SchedBoundary {
+    ScheduleDAGMI *DAG;
+    const TargetSchedModel *SchedModel;
+    SchedRemainder *Rem;
+
+    ReadyQueue Available;
+    ReadyQueue Pending;
+    bool CheckPending;
+
+    // For heuristics, keep a list of the nodes that immediately depend on the
+    // most recently scheduled node.
+    SmallPtrSet<const SUnit*, 8> NextSUs;
+
+    ScheduleHazardRecognizer *HazardRec;
+
+    unsigned CurrCycle;
+    unsigned IssueCount;
+
+    /// MinReadyCycle - Cycle of the soonest available instruction.
+    unsigned MinReadyCycle;
+
+    // The expected latency of the critical path in this scheduled zone.
+    unsigned ExpectedLatency;
+
+    // Resources used in the scheduled zone beyond this boundary.
+    SmallVector<unsigned, 16> ResourceCounts;
+
+    // Cache the critical resources ID in this scheduled zone.
+    unsigned CritResIdx;
+
+    // Is the scheduled region resource limited vs. latency limited.
+    bool IsResourceLimited;
+
+    unsigned ExpectedCount;
+
+#ifndef NDEBUG
+    // Remember the greatest min operand latency.
+    unsigned MaxMinLatency;
+#endif
+
+    void reset() {
+      // A new HazardRec is created for each DAG and owned by SchedBoundary.
+      delete HazardRec;
+
+      Available.clear();
+      Pending.clear();
+      CheckPending = false;
+      NextSUs.clear();
+      HazardRec = 0;
+      CurrCycle = 0;
+      IssueCount = 0;
+      MinReadyCycle = UINT_MAX;
+      ExpectedLatency = 0;
+      ResourceCounts.resize(1);
+      assert(!ResourceCounts[0] && "nonzero count for bad resource");
+      CritResIdx = 0;
+      IsResourceLimited = false;
+      ExpectedCount = 0;
+#ifndef NDEBUG
+      MaxMinLatency = 0;
+#endif
+      // Reserve a zero-count for invalid CritResIdx.
+      ResourceCounts.resize(1);
+    }
+
+    /// Pending queues extend the ready queues with the same ID and the
+    /// PendingFlag set.
+    SchedBoundary(unsigned ID, const Twine &Name):
+      DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
+      Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
+      HazardRec(0) {
+      reset();
+    }
+
+    ~SchedBoundary() { delete HazardRec; }
+
+    void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel,
+              SchedRemainder *rem);
+
+    bool isTop() const {
+      return Available.getID() == ConvergingScheduler::TopQID;
+    }
+
+    unsigned getUnscheduledLatency(SUnit *SU) const {
+      if (isTop())
+        return SU->getHeight();
+      return SU->getDepth() + SU->Latency;
+    }
+
+    unsigned getCriticalCount() const {
+      return ResourceCounts[CritResIdx];
+    }
+
+    bool checkHazard(SUnit *SU);
+
+    void setLatencyPolicy(CandPolicy &Policy);
+
+    void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+    void bumpCycle();
+
+    void countResource(unsigned PIdx, unsigned Cycles);
+
+    void bumpNode(SUnit *SU);
+
+    void releasePending();
+
+    void removeReady(SUnit *SU);
+
+    SUnit *pickOnlyChoice();
+  };
+
+private:
+  ScheduleDAGMI *DAG;
+  const TargetSchedModel *SchedModel;
+  const TargetRegisterInfo *TRI;
+
+  // State of the top and bottom scheduled instruction boundaries.
+  SchedRemainder Rem;
+  SchedBoundary Top;
+  SchedBoundary Bot;
+
+public:
+  /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+  enum {
+    TopQID = 1,
+    BotQID = 2,
+    LogMaxQID = 2
+  };
+
+  ConvergingScheduler():
+    DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+  virtual void initialize(ScheduleDAGMI *dag);
+
+  virtual SUnit *pickNode(bool &IsTopNode);
+
+  virtual void schedNode(SUnit *SU, bool IsTopNode);
+
+  virtual void releaseTopNode(SUnit *SU);
+
+  virtual void releaseBottomNode(SUnit *SU);
+
+  virtual void registerRoots();
+
+protected:
+  void balanceZones(
+    ConvergingScheduler::SchedBoundary &CriticalZone,
+    ConvergingScheduler::SchedCandidate &CriticalCand,
+    ConvergingScheduler::SchedBoundary &OppositeZone,
+    ConvergingScheduler::SchedCandidate &OppositeCand);
+
+  void checkResourceLimits(ConvergingScheduler::SchedCandidate &TopCand,
+                           ConvergingScheduler::SchedCandidate &BotCand);
+
+  void tryCandidate(SchedCandidate &Cand,
+                    SchedCandidate &TryCand,
+                    SchedBoundary &Zone,
+                    const RegPressureTracker &RPTracker,
+                    RegPressureTracker &TempTracker);
+
+  SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+  void pickNodeFromQueue(SchedBoundary &Zone,
+                         const RegPressureTracker &RPTracker,
+                         SchedCandidate &Candidate);
+
+#ifndef NDEBUG
+  void traceCandidate(const SchedCandidate &Cand);
+#endif
+};
+} // namespace
+
+void ConvergingScheduler::SchedRemainder::
+init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
+  reset();
+  if (!SchedModel->hasInstrSchedModel())
+    return;
+  RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
+  for (std::vector<SUnit>::iterator
+         I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
+    const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
+    RemainingMicroOps += SchedModel->getNumMicroOps(I->getInstr(), SC);
+    for (TargetSchedModel::ProcResIter
+           PI = SchedModel->getWriteProcResBegin(SC),
+           PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+      unsigned PIdx = PI->ProcResourceIdx;
+      unsigned Factor = SchedModel->getResourceFactor(PIdx);
+      RemainingCounts[PIdx] += (Factor * PI->Cycles);
+    }
+  }
+  for (unsigned PIdx = 0, PEnd = SchedModel->getNumProcResourceKinds();
+       PIdx != PEnd; ++PIdx) {
+    if ((int)(RemainingCounts[PIdx] - RemainingCounts[CritResIdx])
+        >= (int)SchedModel->getLatencyFactor()) {
+      CritResIdx = PIdx;
+    }
+  }
+}
+
+void ConvergingScheduler::SchedBoundary::
+init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
+  reset();
+  DAG = dag;
+  SchedModel = smodel;
+  Rem = rem;
+  if (SchedModel->hasInstrSchedModel())
+    ResourceCounts.resize(SchedModel->getNumProcResourceKinds());
+}
+
+void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
+  DAG = dag;
+  SchedModel = DAG->getSchedModel();
+  TRI = DAG->TRI;
+
+  Rem.init(DAG, SchedModel);
+  Top.init(DAG, SchedModel, &Rem);
+  Bot.init(DAG, SchedModel, &Rem);
+
+  // Initialize resource counts.
+
+  // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+  // are disabled, then these HazardRecs will be disabled.
+  const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
+  const TargetMachine &TM = DAG->MF.getTarget();
+  Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+  assert((!ForceTopDown || !ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+}
+
+void ConvergingScheduler::releaseTopNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+    unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+    Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
+#endif
+    if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+      SU->TopReadyCycle = PredReadyCycle + MinLatency;
+  }
+  Top.releaseNode(SU, SU->TopReadyCycle);
+}
+
+void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isWeak())
+      continue;
+    unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+    unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+    Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
+#endif
+    if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+      SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+  }
+  Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+void ConvergingScheduler::registerRoots() {
+  Rem.CriticalPath = DAG->ExitSU.getDepth();
+  // Some roots may not feed into ExitSU. Check all of them in case.
+  for (std::vector<SUnit*>::const_iterator
+         I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
+    if ((*I)->getDepth() > Rem.CriticalPath)
+      Rem.CriticalPath = (*I)->getDepth();
+  }
+  DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitraty target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+  if (HazardRec->isEnabled())
+    return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+  unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+  if ((IssueCount > 0) && (IssueCount + uops > SchedModel->getIssueWidth())) {
+    DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") uops="
+          << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
+    return true;
+  }
+  return false;
+}
+
+/// Compute the remaining latency to determine whether ILP should be increased.
+void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) {
+  // FIXME: compile time. In all, we visit four queues here one we should only
+  // need to visit the one that was last popped if we cache the result.
+  unsigned RemLatency = 0;
+  for (ReadyQueue::iterator I = Available.begin(), E = Available.end();
+       I != E; ++I) {
+    unsigned L = getUnscheduledLatency(*I);
+    if (L > RemLatency)
+      RemLatency = L;
+  }
+  for (ReadyQueue::iterator I = Pending.begin(), E = Pending.end();
+       I != E; ++I) {
+    unsigned L = getUnscheduledLatency(*I);
+    if (L > RemLatency)
+      RemLatency = L;
+  }
+  unsigned CriticalPathLimit = Rem->CriticalPath + SchedModel->getILPWindow();
+  if (RemLatency + ExpectedLatency >= CriticalPathLimit
+      && RemLatency > Rem->getMaxRemainingCount(SchedModel)) {
+    Policy.ReduceLatency = true;
+    DEBUG(dbgs() << "Increase ILP: " << Available.getName() << '\n');
+  }
+}
+
+void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
+                                                     unsigned ReadyCycle) {
+
+  if (ReadyCycle < MinReadyCycle)
+    MinReadyCycle = ReadyCycle;
+
+  // Check for interlocks first. For the purpose of other heuristics, an
+  // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+    Pending.push(SU);
+  else
+    Available.push(SU);
+
+  // Record this node as an immediate dependent of the scheduled node.
+  NextSUs.insert(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingScheduler::SchedBoundary::bumpCycle() {
+  unsigned Width = SchedModel->getIssueWidth();
+  IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+  unsigned NextCycle = CurrCycle + 1;
+  assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+  if (MinReadyCycle > NextCycle) {
+    IssueCount = 0;
+    NextCycle = MinReadyCycle;
+  }
+
+  if (!HazardRec->isEnabled()) {
+    // Bypass HazardRec virtual calls.
+    CurrCycle = NextCycle;
+  }
+  else {
+    // Bypass getHazardType calls in case of long latency.
+    for (; CurrCycle != NextCycle; ++CurrCycle) {
+      if (isTop())
+        HazardRec->AdvanceCycle();
+      else
+        HazardRec->RecedeCycle();
+    }
+  }
+  CheckPending = true;
+  IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
+
+  DEBUG(dbgs() << "  " << Available.getName()
+        << " Cycle: " << CurrCycle << '\n');
+}
+
+/// Add the given processor resource to this scheduled zone.
+void ConvergingScheduler::SchedBoundary::countResource(unsigned PIdx,
+                                                       unsigned Cycles) {
+  unsigned Factor = SchedModel->getResourceFactor(PIdx);
+  DEBUG(dbgs() << "  " << SchedModel->getProcResource(PIdx)->Name
+        << " +(" << Cycles << "x" << Factor
+        << ") / " << SchedModel->getLatencyFactor() << '\n');
+
+  unsigned Count = Factor * Cycles;
+  ResourceCounts[PIdx] += Count;
+  assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
+  Rem->RemainingCounts[PIdx] -= Count;
+
+  // Check if this resource exceeds the current critical resource by a full
+  // cycle. If so, it becomes the critical resource.
+  if ((int)(ResourceCounts[PIdx] - ResourceCounts[CritResIdx])
+      >= (int)SchedModel->getLatencyFactor()) {
+    CritResIdx = PIdx;
+    DEBUG(dbgs() << "  *** Critical resource "
+          << SchedModel->getProcResource(PIdx)->Name << " x"
+          << ResourceCounts[PIdx] << '\n');
+  }
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+  // Update the reservation table.
+  if (HazardRec->isEnabled()) {
+    if (!isTop() && SU->isCall) {
+      // Calls are scheduled with their preceding instructions. For bottom-up
+      // scheduling, clear the pipeline state before emitting.
+      HazardRec->Reset();
+    }
+    HazardRec->EmitInstruction(SU);
+  }
+  // Update resource counts and critical resource.
+  if (SchedModel->hasInstrSchedModel()) {
+    const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+    Rem->RemainingMicroOps -= SchedModel->getNumMicroOps(SU->getInstr(), SC);
+    for (TargetSchedModel::ProcResIter
+           PI = SchedModel->getWriteProcResBegin(SC),
+           PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+      countResource(PI->ProcResourceIdx, PI->Cycles);
+    }
+  }
+  if (isTop()) {
+    if (SU->getDepth() > ExpectedLatency)
+      ExpectedLatency = SU->getDepth();
+  }
+  else {
+    if (SU->getHeight() > ExpectedLatency)
+      ExpectedLatency = SU->getHeight();
+  }
+
+  IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
+
+  // Check the instruction group dispatch limit.
+  // TODO: Check if this SU must end a dispatch group.
+  IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
+
+  // checkHazard prevents scheduling multiple instructions per cycle that exceed
+  // issue width. However, we commonly reach the maximum. In this case
+  // opportunistically bump the cycle to avoid uselessly checking everything in
+  // the readyQ. Furthermore, a single instruction may produce more than one
+  // cycle's worth of micro-ops.
+  if (IssueCount >= SchedModel->getIssueWidth()) {
+    DEBUG(dbgs() << "  *** Max instrs at cycle " << CurrCycle << '\n');
+    bumpCycle();
+  }
+}
+
+/// Release pending ready nodes in to the available queue. This makes them
+/// visible to heuristics.
+void ConvergingScheduler::SchedBoundary::releasePending() {
+  // If the available queue is empty, it is safe to reset MinReadyCycle.
+  if (Available.empty())
+    MinReadyCycle = UINT_MAX;
+
+  // Check to see if any of the pending instructions are ready to issue.  If
+  // so, add them to the available queue.
+  for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+    SUnit *SU = *(Pending.begin()+i);
+    unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+    if (ReadyCycle < MinReadyCycle)
+      MinReadyCycle = ReadyCycle;
+
+    if (ReadyCycle > CurrCycle)
+      continue;
+
+    if (checkHazard(SU))
+      continue;
+
+    Available.push(SU);
+    Pending.remove(Pending.begin()+i);
+    --i; --e;
+  }
+  DEBUG(if (!Pending.empty()) Pending.dump());
+  CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
+  if (Available.isInQueue(SU))
+    Available.remove(Available.find(SU));
+  else {
+    assert(Pending.isInQueue(SU) && "bad ready count");
+    Pending.remove(Pending.find(SU));
+  }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// defer any nodes that now hit a hazard, and advance the cycle until at least
+/// one node is ready. If multiple instructions are ready, return NULL.
+SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
+  if (CheckPending)
+    releasePending();
+
+  if (IssueCount > 0) {
+    // Defer any ready instrs that now have a hazard.
+    for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
+      if (checkHazard(*I)) {
+        Pending.push(*I);
+        I = Available.remove(I);
+        continue;
+      }
+      ++I;
+    }
+  }
+  for (unsigned i = 0; Available.empty(); ++i) {
+    assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+           "permanent hazard"); (void)i;
+    bumpCycle();
+    releasePending();
+  }
+  if (Available.size() == 1)
+    return *Available.begin();
+  return NULL;
+}
+
+/// Record the candidate policy for opposite zones with different critical
+/// resources.
+///
+/// If the CriticalZone is latency limited, don't force a policy for the
+/// candidates here. Instead, setLatencyPolicy sets ReduceLatency if needed.
+void ConvergingScheduler::balanceZones(
+  ConvergingScheduler::SchedBoundary &CriticalZone,
+  ConvergingScheduler::SchedCandidate &CriticalCand,
+  ConvergingScheduler::SchedBoundary &OppositeZone,
+  ConvergingScheduler::SchedCandidate &OppositeCand) {
+
+  if (!CriticalZone.IsResourceLimited)
+    return;
+  assert(SchedModel->hasInstrSchedModel() && "required schedmodel");
+
+  SchedRemainder *Rem = CriticalZone.Rem;
+
+  // If the critical zone is overconsuming a resource relative to the
+  // remainder, try to reduce it.
+  unsigned RemainingCritCount =
+    Rem->RemainingCounts[CriticalZone.CritResIdx];
+  if ((int)(Rem->getMaxRemainingCount(SchedModel) - RemainingCritCount)
+      > (int)SchedModel->getLatencyFactor()) {
+    CriticalCand.Policy.ReduceResIdx = CriticalZone.CritResIdx;
+    DEBUG(dbgs() << "Balance " << CriticalZone.Available.getName() << " reduce "
+          << SchedModel->getProcResource(CriticalZone.CritResIdx)->Name
+          << '\n');
+  }
+  // If the other zone is underconsuming a resource relative to the full zone,
+  // try to increase it.
+  unsigned OppositeCount =
+    OppositeZone.ResourceCounts[CriticalZone.CritResIdx];
+  if ((int)(OppositeZone.ExpectedCount - OppositeCount)
+      > (int)SchedModel->getLatencyFactor()) {
+    OppositeCand.Policy.DemandResIdx = CriticalZone.CritResIdx;
+    DEBUG(dbgs() << "Balance " << OppositeZone.Available.getName() << " demand "
+          << SchedModel->getProcResource(OppositeZone.CritResIdx)->Name
+          << '\n');
+  }
+}
+
+/// Determine if the scheduled zones exceed resource limits or critical path and
+/// set each candidate's ReduceHeight policy accordingly.
+void ConvergingScheduler::checkResourceLimits(
+  ConvergingScheduler::SchedCandidate &TopCand,
+  ConvergingScheduler::SchedCandidate &BotCand) {
+
+  // Set ReduceLatency to true if needed.
+  Bot.setLatencyPolicy(BotCand.Policy);
+  Top.setLatencyPolicy(TopCand.Policy);
+
+  // Handle resource-limited regions.
+  if (Top.IsResourceLimited && Bot.IsResourceLimited
+      && Top.CritResIdx == Bot.CritResIdx) {
+    // If the scheduled critical resource in both zones is no longer the
+    // critical remaining resource, attempt to reduce resource height both ways.
+    if (Top.CritResIdx != Rem.CritResIdx) {
+      TopCand.Policy.ReduceResIdx = Top.CritResIdx;
+      BotCand.Policy.ReduceResIdx = Bot.CritResIdx;
+      DEBUG(dbgs() << "Reduce scheduled "
+            << SchedModel->getProcResource(Top.CritResIdx)->Name << '\n');
+    }
+    return;
+  }
+  // Handle latency-limited regions.
+  if (!Top.IsResourceLimited && !Bot.IsResourceLimited) {
+    // If the total scheduled expected latency exceeds the region's critical
+    // path then reduce latency both ways.
+    //
+    // Just because a zone is not resource limited does not mean it is latency
+    // limited. Unbuffered resource, such as max micro-ops may cause CurrCycle
+    // to exceed expected latency.
+    if ((Top.ExpectedLatency + Bot.ExpectedLatency >= Rem.CriticalPath)
+        && (Rem.CriticalPath > Top.CurrCycle + Bot.CurrCycle)) {
+      TopCand.Policy.ReduceLatency = true;
+      BotCand.Policy.ReduceLatency = true;
+      DEBUG(dbgs() << "Reduce scheduled latency " << Top.ExpectedLatency
+            << " + " << Bot.ExpectedLatency << '\n');
+    }
+    return;
+  }
+  // The critical resource is different in each zone, so request balancing.
+
+  // Compute the cost of each zone.
+  Top.ExpectedCount = std::max(Top.ExpectedLatency, Top.CurrCycle);
+  Top.ExpectedCount = std::max(
+    Top.getCriticalCount(),
+    Top.ExpectedCount * SchedModel->getLatencyFactor());
+  Bot.ExpectedCount = std::max(Bot.ExpectedLatency, Bot.CurrCycle);
+  Bot.ExpectedCount = std::max(
+    Bot.getCriticalCount(),
+    Bot.ExpectedCount * SchedModel->getLatencyFactor());
+
+  balanceZones(Top, TopCand, Bot, BotCand);
+  balanceZones(Bot, BotCand, Top, TopCand);
+}
+
+void ConvergingScheduler::SchedCandidate::
+initResourceDelta(const ScheduleDAGMI *DAG,
+                  const TargetSchedModel *SchedModel) {
+  if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
+    return;
+
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  for (TargetSchedModel::ProcResIter
+         PI = SchedModel->getWriteProcResBegin(SC),
+         PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+    if (PI->ProcResourceIdx == Policy.ReduceResIdx)
+      ResDelta.CritResources += PI->Cycles;
+    if (PI->ProcResourceIdx == Policy.DemandResIdx)
+      ResDelta.DemandedResources += PI->Cycles;
+  }
+}
+
+/// Return true if this heuristic determines order.
+static bool tryLess(int TryVal, int CandVal,
+                    ConvergingScheduler::SchedCandidate &TryCand,
+                    ConvergingScheduler::SchedCandidate &Cand,
+                    ConvergingScheduler::CandReason Reason) {
+  if (TryVal < CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal > CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  return false;
+}
+
+static bool tryGreater(int TryVal, int CandVal,
+                       ConvergingScheduler::SchedCandidate &TryCand,
+                       ConvergingScheduler::SchedCandidate &Cand,
+                       ConvergingScheduler::CandReason Reason) {
+  if (TryVal > CandVal) {
+    TryCand.Reason = Reason;
+    return true;
+  }
+  if (TryVal < CandVal) {
+    if (Cand.Reason > Reason)
+      Cand.Reason = Reason;
+    return true;
+  }
+  return false;
+}
+
+static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+  return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
+}
+
+/// Apply a set of heursitics to a new candidate. Heuristics are currently
+/// hierarchical. This may be more efficient than a graduated cost model because
+/// we don't need to evaluate all aspects of the model for each node in the
+/// queue. But it's really done to make the heuristics easier to debug and
+/// statistically analyze.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \param Zone describes the scheduled zone that we are extending.
+/// \param RPTracker describes reg pressure within the scheduled zone.
+/// \param TempTracker is a scratch pressure tracker to reuse in queries.
+void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
+                                       SchedCandidate &TryCand,
+                                       SchedBoundary &Zone,
+                                       const RegPressureTracker &RPTracker,
+                                       RegPressureTracker &TempTracker) {
+
+  // Always initialize TryCand's RPDelta.
+  TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta,
+                                  DAG->getRegionCriticalPSets(),
+                                  DAG->getRegPressure().MaxSetPressure);
+
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return;
+  }
+  // Avoid exceeding the target's limit.
+  if (tryLess(TryCand.RPDelta.Excess.UnitIncrease,
+              Cand.RPDelta.Excess.UnitIncrease, TryCand, Cand, SingleExcess))
+    return;
+  if (Cand.Reason == SingleExcess)
+    Cand.Reason = MultiPressure;
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (tryLess(TryCand.RPDelta.CriticalMax.UnitIncrease,
+              Cand.RPDelta.CriticalMax.UnitIncrease,
+              TryCand, Cand, SingleCritical))
+    return;
+  if (Cand.Reason == SingleCritical)
+    Cand.Reason = MultiPressure;
+
+  // Keep clustered nodes together to encourage downstream peephole
+  // optimizations which may reduce resource requirements.
+  //
+  // This is a best effort to set things up for a post-RA pass. Optimizations
+  // like generating loads of multiple registers should ideally be done within
+  // the scheduler pass by combining the loads during DAG postprocessing.
+  const SUnit *NextClusterSU =
+    Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
+                 TryCand, Cand, Cluster))
+    return;
+  // Currently, weak edges are for clustering, so we hard-code that reason.
+  // However, deferring the current TryCand will not change Cand's reason.
+  CandReason OrigReason = Cand.Reason;
+  if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
+              getWeakLeft(Cand.SU, Zone.isTop()),
+              TryCand, Cand, Cluster)) {
+    Cand.Reason = OrigReason;
+    return;
+  }
+  // Avoid critical resource consumption and balance the schedule.
+  TryCand.initResourceDelta(DAG, SchedModel);
+  if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+              TryCand, Cand, ResourceReduce))
+    return;
+  if (tryGreater(TryCand.ResDelta.DemandedResources,
+                 Cand.ResDelta.DemandedResources,
+                 TryCand, Cand, ResourceDemand))
+    return;
+
+  // Avoid serializing long latency dependence chains.
+  if (Cand.Policy.ReduceLatency) {
+    if (Zone.isTop()) {
+      if (Cand.SU->getDepth() * SchedModel->getLatencyFactor()
+          > Zone.ExpectedCount) {
+        if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                    TryCand, Cand, TopDepthReduce))
+          return;
+      }
+      if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                     TryCand, Cand, TopPathReduce))
+        return;
+    }
+    else {
+      if (Cand.SU->getHeight() * SchedModel->getLatencyFactor()
+          > Zone.ExpectedCount) {
+        if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                    TryCand, Cand, BotHeightReduce))
+          return;
+      }
+      if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                     TryCand, Cand, BotPathReduce))
+        return;
+    }
+  }
+
+  // Avoid increasing the max pressure of the entire region.
+  if (tryLess(TryCand.RPDelta.CurrentMax.UnitIncrease,
+              Cand.RPDelta.CurrentMax.UnitIncrease, TryCand, Cand, SingleMax))
+    return;
+  if (Cand.Reason == SingleMax)
+    Cand.Reason = MultiPressure;
+
+  // Prefer immediate defs/users of the last scheduled instruction. This is a
+  // nice pressure avoidance strategy that also conserves the processor's
+  // register renaming resources and keeps the machine code readable.
+  if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU),
+                 TryCand, Cand, NextDefUse))
+    return;
+
+  // Fall through to original instruction order.
+  if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
+      || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+    TryCand.Reason = NodeOrder;
+  }
+}
+
+/// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is
+/// more desirable than RHS from scheduling standpoint.
+static bool compareRPDelta(const RegPressureDelta &LHS,
+                           const RegPressureDelta &RHS) {
+  // Compare each component of pressure in decreasing order of importance
+  // without checking if any are valid. Invalid PressureElements are assumed to
+  // have UnitIncrease==0, so are neutral.
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) {
+    DEBUG(dbgs() << "RP excess top - bot: "
+          << (LHS.Excess.UnitIncrease - RHS.Excess.UnitIncrease) << '\n');
+    return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease;
+  }
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) {
+    DEBUG(dbgs() << "RP critical top - bot: "
+          << (LHS.CriticalMax.UnitIncrease - RHS.CriticalMax.UnitIncrease)
+          << '\n');
+    return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease;
+  }
+  // Avoid increasing the max pressure of the entire region.
+  if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) {
+    DEBUG(dbgs() << "RP current top - bot: "
+          << (LHS.CurrentMax.UnitIncrease - RHS.CurrentMax.UnitIncrease)
+          << '\n');
+    return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease;
+  }
+  return false;
+}
+
+#ifndef NDEBUG
+const char *ConvergingScheduler::getReasonStr(
+  ConvergingScheduler::CandReason Reason) {
+  switch (Reason) {
+  case NoCand:         return "NOCAND    ";
+  case SingleExcess:   return "REG-EXCESS";
+  case SingleCritical: return "REG-CRIT  ";
+  case Cluster:        return "CLUSTER   ";
+  case SingleMax:      return "REG-MAX   ";
+  case MultiPressure:  return "REG-MULTI ";
+  case ResourceReduce: return "RES-REDUCE";
+  case ResourceDemand: return "RES-DEMAND";
+  case TopDepthReduce: return "TOP-DEPTH ";
+  case TopPathReduce:  return "TOP-PATH  ";
+  case BotHeightReduce:return "BOT-HEIGHT";
+  case BotPathReduce:  return "BOT-PATH  ";
+  case NextDefUse:     return "DEF-USE   ";
+  case NodeOrder:      return "ORDER     ";
+  };
+  llvm_unreachable("Unknown reason!");
+}
+
+void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
+  PressureElement P;
+  unsigned ResIdx = 0;
+  unsigned Latency = 0;
+  switch (Cand.Reason) {
+  default:
+    break;
+  case SingleExcess:
+    P = Cand.RPDelta.Excess;
+    break;
+  case SingleCritical:
+    P = Cand.RPDelta.CriticalMax;
+    break;
+  case SingleMax:
+    P = Cand.RPDelta.CurrentMax;
+    break;
+  case ResourceReduce:
+    ResIdx = Cand.Policy.ReduceResIdx;
+    break;
+  case ResourceDemand:
+    ResIdx = Cand.Policy.DemandResIdx;
+    break;
+  case TopDepthReduce:
+    Latency = Cand.SU->getDepth();
+    break;
+  case TopPathReduce:
+    Latency = Cand.SU->getHeight();
+    break;
+  case BotHeightReduce:
+    Latency = Cand.SU->getHeight();
+    break;
+  case BotPathReduce:
+    Latency = Cand.SU->getDepth();
+    break;
+  }
+  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+  if (P.isValid())
+    dbgs() << " " << TRI->getRegPressureSetName(P.PSetID)
+           << ":" << P.UnitIncrease << " ";
+  else
+    dbgs() << "      ";
+  if (ResIdx)
+    dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
+  else
+    dbgs() << "         ";
+  if (Latency)
+    dbgs() << " " << Latency << " cycles ";
+  else
+    dbgs() << "          ";
+  dbgs() << '\n';
+}
+#endif
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+                                            const RegPressureTracker &RPTracker,
+                                            SchedCandidate &Cand) {
+  ReadyQueue &Q = Zone.Available;
+
+  DEBUG(Q.dump());
+
+  // getMaxPressureDelta temporarily modifies the tracker.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+
+    SchedCandidate TryCand(Cand.Policy);
+    TryCand.SU = *I;
+    tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker);
+    if (TryCand.Reason != NoCand) {
+      // Initialize resource delta if needed in case future heuristics query it.
+      if (TryCand.ResDelta == SchedResourceDelta())
+        TryCand.initResourceDelta(DAG, SchedModel);
+      Cand.setBest(TryCand);
+      DEBUG(traceCandidate(Cand));
+    }
+  }
+}
+
+static void tracePick(const ConvergingScheduler::SchedCandidate &Cand,
+                      bool IsTop) {
+  DEBUG(dbgs() << "Pick " << (IsTop ? "Top" : "Bot")
+        << " SU(" << Cand.SU->NodeNum << ") "
+        << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n');
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+  if (SUnit *SU = Bot.pickOnlyChoice()) {
+    IsTopNode = false;
+    return SU;
+  }
+  if (SUnit *SU = Top.pickOnlyChoice()) {
+    IsTopNode = true;
+    return SU;
+  }
+  CandPolicy NoPolicy;
+  SchedCandidate BotCand(NoPolicy);
+  SchedCandidate TopCand(NoPolicy);
+  checkResourceLimits(TopCand, BotCand);
+
+  // Prefer bottom scheduling when heuristics are silent.
+  pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
+  assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+
+  // If either Q has a single candidate that provides the least increase in
+  // Excess pressure, we can immediately schedule from that Q.
+  //
+  // RegionCriticalPSets summarizes the pressure within the scheduled region and
+  // affects picking from either Q. If scheduling in one direction must
+  // increase pressure for one of the excess PSets, then schedule in that
+  // direction first to provide more freedom in the other direction.
+  if (BotCand.Reason == SingleExcess || BotCand.Reason == SingleCritical) {
+    IsTopNode = false;
+    tracePick(BotCand, IsTopNode);
+    return BotCand.SU;
+  }
+  // Check if the top Q has a better candidate.
+  pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
+  assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+
+  // If either Q has a single candidate that minimizes pressure above the
+  // original region's pressure pick it.
+  if (TopCand.Reason <= SingleMax || BotCand.Reason <= SingleMax) {
+    if (TopCand.Reason < BotCand.Reason) {
+      IsTopNode = true;
+      tracePick(TopCand, IsTopNode);
+      return TopCand.SU;
+    }
+    IsTopNode = false;
+    tracePick(BotCand, IsTopNode);
+    return BotCand.SU;
+  }
+  // Check for a salient pressure difference and pick the best from either side.
+  if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
+    IsTopNode = true;
+    tracePick(TopCand, IsTopNode);
+    return TopCand.SU;
+  }
+  // Otherwise prefer the bottom candidate, in node order if all else failed.
+  if (TopCand.Reason < BotCand.Reason) {
+    IsTopNode = true;
+    tracePick(TopCand, IsTopNode);
+    return TopCand.SU;
+  }
+  IsTopNode = false;
+  tracePick(BotCand, IsTopNode);
+  return BotCand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return NULL;
+  }
+  SUnit *SU;
+  do {
+    if (ForceTopDown) {
+      SU = Top.pickOnlyChoice();
+      if (!SU) {
+        CandPolicy NoPolicy;
+        SchedCandidate TopCand(NoPolicy);
+        pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
+        assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+        SU = TopCand.SU;
+      }
+      IsTopNode = true;
+    }
+    else if (ForceBottomUp) {
+      SU = Bot.pickOnlyChoice();
+      if (!SU) {
+        CandPolicy NoPolicy;
+        SchedCandidate BotCand(NoPolicy);
+        pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
+        assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+        SU = BotCand.SU;
+      }
+      IsTopNode = false;
+    }
+    else {
+      SU = pickNodeBidirectional(IsTopNode);
+    }
+  } while (SU->isScheduled);
+
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "Scheduling " << *SU->getInstr());
+  return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, ScheduleDAGMI needs to update
+/// it's state based on the current cycle before MachineSchedStrategy does.
+void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+  if (IsTopNode) {
+    SU->TopReadyCycle = Top.CurrCycle;
+    Top.bumpNode(SU);
+  }
+  else {
+    SU->BotReadyCycle = Bot.CurrCycle;
+    Bot.bumpNode(SU);
+  }
+}
+
+/// Create the standard converging machine scheduler. This will be used as the
+/// default scheduler if the target does not set a default.
+static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
+  assert((!ForceTopDown || !ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+  ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
+  // Register DAG post-processors.
+  if (EnableLoadCluster)
+    DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
+  if (EnableMacroFusion)
+    DAG->addMutation(new MacroFusion(DAG->TII));
+  return DAG;
+}
+static MachineSchedRegistry
+ConvergingSchedRegistry("converge", "Standard converging scheduler.",
+                        createConvergingSched);
+
+//===----------------------------------------------------------------------===//
+// ILP Scheduler. Currently for experimental analysis of heuristics.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Order nodes by the ILP metric.
+struct ILPOrder {
+  const SchedDFSResult *DFSResult;
+  const BitVector *ScheduledTrees;
+  bool MaximizeILP;
+
+  ILPOrder(bool MaxILP): DFSResult(0), ScheduledTrees(0), MaximizeILP(MaxILP) {}
+
+  /// \brief Apply a less-than relation on node priority.
+  ///
+  /// (Return true if A comes after B in the Q.)
+  bool operator()(const SUnit *A, const SUnit *B) const {
+    unsigned SchedTreeA = DFSResult->getSubtreeID(A);
+    unsigned SchedTreeB = DFSResult->getSubtreeID(B);
+    if (SchedTreeA != SchedTreeB) {
+      // Unscheduled trees have lower priority.
+      if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
+        return ScheduledTrees->test(SchedTreeB);
+
+      // Trees with shallower connections have have lower priority.
+      if (DFSResult->getSubtreeLevel(SchedTreeA)
+          != DFSResult->getSubtreeLevel(SchedTreeB)) {
+        return DFSResult->getSubtreeLevel(SchedTreeA)
+          < DFSResult->getSubtreeLevel(SchedTreeB);
+      }
+    }
+    if (MaximizeILP)
+      return DFSResult->getILP(A) < DFSResult->getILP(B);
+    else
+      return DFSResult->getILP(A) > DFSResult->getILP(B);
+  }
+};
+
+/// \brief Schedule based on the ILP metric.
+class ILPScheduler : public MachineSchedStrategy {
+  /// In case all subtrees are eventually connected to a common root through
+  /// data dependence (e.g. reduction), place an upper limit on their size.
+  ///
+  /// FIXME: A subtree limit is generally good, but in the situation commented
+  /// above, where multiple similar subtrees feed a common root, we should
+  /// only split at a point where the resulting subtrees will be balanced.
+  /// (a motivating test case must be found).
+  static const unsigned SubtreeLimit = 16;
+
+  ScheduleDAGMI *DAG;
+  ILPOrder Cmp;
+
+  std::vector<SUnit*> ReadyQ;
+public:
+  ILPScheduler(bool MaximizeILP): DAG(0), Cmp(MaximizeILP) {}
+
+  virtual void initialize(ScheduleDAGMI *dag) {
+    DAG = dag;
+    DAG->computeDFSResult();
+    Cmp.DFSResult = DAG->getDFSResult();
+    Cmp.ScheduledTrees = &DAG->getScheduledTrees();
+    ReadyQ.clear();
+  }
+
+  virtual void registerRoots() {
+    // Restore the heap in ReadyQ with the updated DFS results.
+    std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+  }
+
+  /// Implement MachineSchedStrategy interface.
+  /// -----------------------------------------
+
+  /// Callback to select the highest priority node from the ready Q.
+  virtual SUnit *pickNode(bool &IsTopNode) {
+    if (ReadyQ.empty()) return NULL;
+    std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+    SUnit *SU = ReadyQ.back();
+    ReadyQ.pop_back();
+    IsTopNode = false;
+    DEBUG(dbgs() << "*** Scheduling " << "SU(" << SU->NodeNum << "): "
+          << *SU->getInstr()
+          << " ILP: " << DAG->getDFSResult()->getILP(SU)
+          << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @"
+          << DAG->getDFSResult()->getSubtreeLevel(
+            DAG->getDFSResult()->getSubtreeID(SU)) << '\n');
+    return SU;
+  }
+
+  /// \brief Scheduler callback to notify that a new subtree is scheduled.
+  virtual void scheduleTree(unsigned SubtreeID) {
+    std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+  }
+
+  /// Callback after a node is scheduled. Mark a newly scheduled tree, notify
+  /// DFSResults, and resort the priority Q.
+  virtual void schedNode(SUnit *SU, bool IsTopNode) {
+    assert(!IsTopNode && "SchedDFSResult needs bottom-up");
+  }
+
+  virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ }
+
+  virtual void releaseBottomNode(SUnit *SU) {
+    ReadyQ.push_back(SU);
+    std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+  }
+};
+} // namespace
+
+static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
+  return new ScheduleDAGMI(C, new ILPScheduler(true));
+}
+static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
+  return new ScheduleDAGMI(C, new ILPScheduler(false));
+}
+static MachineSchedRegistry ILPMaxRegistry(
+  "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
+static MachineSchedRegistry ILPMinRegistry(
+  "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);
+
+//===----------------------------------------------------------------------===//
+// Machine Instruction Shuffler for Correctness Testing
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+namespace {
+/// Apply a less-than relation on the node order, which corresponds to the
+/// instruction order prior to scheduling. IsReverse implements greater-than.
+template<bool IsReverse>
+struct SUnitOrder {
+  bool operator()(SUnit *A, SUnit *B) const {
+    if (IsReverse)
+      return A->NodeNum > B->NodeNum;
+    else
+      return A->NodeNum < B->NodeNum;
+  }
+};
+
+/// Reorder instructions as much as possible.
+class InstructionShuffler : public MachineSchedStrategy {
+  bool IsAlternating;
+  bool IsTopDown;
+
+  // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
+  // gives nodes with a higher number higher priority causing the latest
+  // instructions to be scheduled first.
+  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> >
+    TopQ;
+  // When scheduling bottom-up, use greater-than as the queue priority.
+  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> >
+    BottomQ;
+public:
+  InstructionShuffler(bool alternate, bool topdown)
+    : IsAlternating(alternate), IsTopDown(topdown) {}
+
+  virtual void initialize(ScheduleDAGMI *) {
+    TopQ.clear();
+    BottomQ.clear();
+  }
+
+  /// Implement MachineSchedStrategy interface.
+  /// -----------------------------------------
+
+  virtual SUnit *pickNode(bool &IsTopNode) {
+    SUnit *SU;
+    if (IsTopDown) {
+      do {
+        if (TopQ.empty()) return NULL;
+        SU = TopQ.top();
+        TopQ.pop();
+      } while (SU->isScheduled);
+      IsTopNode = true;
+    }
+    else {
+      do {
+        if (BottomQ.empty()) return NULL;
+        SU = BottomQ.top();
+        BottomQ.pop();
+      } while (SU->isScheduled);
+      IsTopNode = false;
+    }
+    if (IsAlternating)
+      IsTopDown = !IsTopDown;
+    return SU;
+  }
+
+  virtual void schedNode(SUnit *SU, bool IsTopNode) {}
+
+  virtual void releaseTopNode(SUnit *SU) {
+    TopQ.push(SU);
+  }
+  virtual void releaseBottomNode(SUnit *SU) {
+    BottomQ.push(SU);
+  }
+};
+} // namespace
+
+static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
+  bool Alternate = !ForceTopDown && !ForceBottomUp;
+  bool TopDown = !ForceBottomUp;
+  assert((TopDown || !ForceTopDown) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+  return new ScheduleDAGMI(C, new InstructionShuffler(Alternate, TopDown));
+}
+static MachineSchedRegistry ShufflerRegistry(
+  "shuffle", "Shuffle machine instructions alternating directions",
+  createInstructionShuffler);
+#endif // !NDEBUG
+
+//===----------------------------------------------------------------------===//
+// GraphWriter support for ScheduleDAGMI.
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+namespace llvm {
+
+template<> struct GraphTraits<
+  ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
+
+template<>
+struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+  static std::string getGraphName(const ScheduleDAG *G) {
+    return G->MF.getName();
+  }
+
+  static bool renderGraphFromBottomUp() {
+    return true;
+  }
+
+  static bool isNodeHidden(const SUnit *Node) {
+    return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+  }
+
+  static bool hasNodeAddressLabel(const SUnit *Node,
+                                  const ScheduleDAG *Graph) {
+    return false;
+  }
+
+  /// If you want to override the dot attributes printed for a particular
+  /// edge, override this method.
+  static std::string getEdgeAttributes(const SUnit *Node,
+                                       SUnitIterator EI,
+                                       const ScheduleDAG *Graph) {
+    if (EI.isArtificialDep())
+      return "color=cyan,style=dashed";
+    if (EI.isCtrlDep())
+      return "color=blue,style=dashed";
+    return "";
+  }
+
+  static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
+    std::string Str;
+    raw_string_ostream SS(Str);
+    SS << "SU(" << SU->NodeNum << ')';
+    return SS.str();
+  }
+  static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
+    return G->getGraphNodeLabel(SU);
+  }
+
+  static std::string getNodeAttributes(const SUnit *N,
+                                       const ScheduleDAG *Graph) {
+    std::string Str("shape=Mrecord");
+    const SchedDFSResult *DFS =
+      static_cast<const ScheduleDAGMI*>(Graph)->getDFSResult();
+    if (DFS) {
+      Str += ",style=filled,fillcolor=\"#";
+      Str += DOT::getColorString(DFS->getSubtreeID(N));
+      Str += '"';
+    }
+    return Str;
+  }
+};
+} // namespace llvm
+#endif // NDEBUG
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) {
+#ifndef NDEBUG
+  ViewGraph(this, Name, false, Title);
+#else
+  errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif  // NDEBUG
+}
+
+/// Out-of-line implementation with no arguments is handy for gdb.
+void ScheduleDAGMI::viewGraph() {
+  viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm/lib/CodeGen/MachineSink.cpp
new file mode 100644
index 000000000000..4dafbe5a3e3a
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineSink.cpp
@@ -0,0 +1,712 @@
+//===-- MachineSink.cpp - Sinking for machine instructions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+// This pass is not intended to be a replacement or a complete alternative
+// for an LLVM-IR-level sinking pass. It is only designed to sink simple
+// constructs that are not exposed before lowering and instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-sink"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static cl::opt<bool>
+SplitEdges("machine-sink-split",
+           cl::desc("Split critical edges during machine sinking"),
+           cl::init(true), cl::Hidden);
+
+STATISTIC(NumSunk,      "Number of machine instructions sunk");
+STATISTIC(NumSplit,     "Number of critical edges split");
+STATISTIC(NumCoalesces, "Number of copies coalesced");
+
+namespace {
+  class MachineSinking : public MachineFunctionPass {
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo  *MRI;  // Machine register information
+    MachineDominatorTree *DT;   // Machine dominator tree
+    MachineLoopInfo *LI;
+    AliasAnalysis *AA;
+
+    // Remember which edges have been considered for breaking.
+    SmallSet<std::pair<MachineBasicBlock*,MachineBasicBlock*>, 8>
+    CEBCandidates;
+
+  public:
+    static char ID; // Pass identification
+    MachineSinking() : MachineFunctionPass(ID) {
+      initializeMachineSinkingPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addPreserved<MachineLoopInfo>();
+    }
+
+    virtual void releaseMemory() {
+      CEBCandidates.clear();
+    }
+
+  private:
+    bool ProcessBlock(MachineBasicBlock &MBB);
+    bool isWorthBreakingCriticalEdge(MachineInstr *MI,
+                                     MachineBasicBlock *From,
+                                     MachineBasicBlock *To);
+    MachineBasicBlock *SplitCriticalEdge(MachineInstr *MI,
+                                         MachineBasicBlock *From,
+                                         MachineBasicBlock *To,
+                                         bool BreakPHIEdge);
+    bool SinkInstruction(MachineInstr *MI, bool &SawStore);
+    bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB,
+                                 MachineBasicBlock *DefMBB,
+                                 bool &BreakPHIEdge, bool &LocalUse) const;
+    MachineBasicBlock *FindSuccToSinkTo(MachineInstr *MI, MachineBasicBlock *MBB,
+               bool &BreakPHIEdge);
+    bool isProfitableToSinkTo(unsigned Reg, MachineInstr *MI,
+                              MachineBasicBlock *MBB,
+                              MachineBasicBlock *SuccToSinkTo);
+
+    bool PerformTrivialForwardCoalescing(MachineInstr *MI,
+                                         MachineBasicBlock *MBB);
+  };
+
+  // SuccessorSorter - Sort Successors according to their loop depth. 
+  struct SuccessorSorter {
+    SuccessorSorter(MachineLoopInfo *LoopInfo) : LI(LoopInfo) {}
+    bool operator()(const MachineBasicBlock *LHS,
+                    const MachineBasicBlock *RHS) const {
+      return LI->getLoopDepth(LHS) < LI->getLoopDepth(RHS);
+    }
+    MachineLoopInfo *LI;
+  };
+} // end anonymous namespace
+
+char MachineSinking::ID = 0;
+char &llvm::MachineSinkingID = MachineSinking::ID;
+INITIALIZE_PASS_BEGIN(MachineSinking, "machine-sink",
+                "Machine code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(MachineSinking, "machine-sink",
+                "Machine code sinking", false, false)
+
+bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr *MI,
+                                                     MachineBasicBlock *MBB) {
+  if (!MI->isCopy())
+    return false;
+
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  unsigned DstReg = MI->getOperand(0).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+      !TargetRegisterInfo::isVirtualRegister(DstReg) ||
+      !MRI->hasOneNonDBGUse(SrcReg))
+    return false;
+
+  const TargetRegisterClass *SRC = MRI->getRegClass(SrcReg);
+  const TargetRegisterClass *DRC = MRI->getRegClass(DstReg);
+  if (SRC != DRC)
+    return false;
+
+  MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+  if (DefMI->isCopyLike())
+    return false;
+  DEBUG(dbgs() << "Coalescing: " << *DefMI);
+  DEBUG(dbgs() << "*** to: " << *MI);
+  MRI->replaceRegWith(DstReg, SrcReg);
+  MI->eraseFromParent();
+  ++NumCoalesces;
+  return true;
+}
+
+/// AllUsesDominatedByBlock - Return true if all uses of the specified register
+/// occur in blocks dominated by the specified block. If any use is in the
+/// definition block, then return false since it is never legal to move def
+/// after uses.
+bool
+MachineSinking::AllUsesDominatedByBlock(unsigned Reg,
+                                        MachineBasicBlock *MBB,
+                                        MachineBasicBlock *DefMBB,
+                                        bool &BreakPHIEdge,
+                                        bool &LocalUse) const {
+  assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+         "Only makes sense for vregs");
+
+  // Ignore debug uses because debug info doesn't affect the code.
+  if (MRI->use_nodbg_empty(Reg))
+    return true;
+
+  // BreakPHIEdge is true if all the uses are in the successor MBB being sunken
+  // into and they are all PHI nodes. In this case, machine-sink must break
+  // the critical edge first. e.g.
+  //
+  // BB#1: derived from LLVM BB %bb4.preheader
+  //   Predecessors according to CFG: BB#0
+  //     ...
+  //     %reg16385<def> = DEC64_32r %reg16437, %EFLAGS<imp-def,dead>
+  //     ...
+  //     JE_4 <BB#37>, %EFLAGS<imp-use>
+  //   Successors according to CFG: BB#37 BB#2
+  //
+  // BB#2: derived from LLVM BB %bb.nph
+  //   Predecessors according to CFG: BB#0 BB#1
+  //     %reg16386<def> = PHI %reg16434, <BB#0>, %reg16385, <BB#1>
+  BreakPHIEdge = true;
+  for (MachineRegisterInfo::use_nodbg_iterator
+         I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end();
+       I != E; ++I) {
+    MachineInstr *UseInst = &*I;
+    MachineBasicBlock *UseBlock = UseInst->getParent();
+    if (!(UseBlock == MBB && UseInst->isPHI() &&
+          UseInst->getOperand(I.getOperandNo()+1).getMBB() == DefMBB)) {
+      BreakPHIEdge = false;
+      break;
+    }
+  }
+  if (BreakPHIEdge)
+    return true;
+
+  for (MachineRegisterInfo::use_nodbg_iterator
+         I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end();
+       I != E; ++I) {
+    // Determine the block of the use.
+    MachineInstr *UseInst = &*I;
+    MachineBasicBlock *UseBlock = UseInst->getParent();
+    if (UseInst->isPHI()) {
+      // PHI nodes use the operand in the predecessor block, not the block with
+      // the PHI.
+      UseBlock = UseInst->getOperand(I.getOperandNo()+1).getMBB();
+    } else if (UseBlock == DefMBB) {
+      LocalUse = true;
+      return false;
+    }
+
+    // Check that it dominates.
+    if (!DT->dominates(MBB, UseBlock))
+      return false;
+  }
+
+  return true;
+}
+
+bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "******** Machine Sinking ********\n");
+
+  const TargetMachine &TM = MF.getTarget();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+  LI = &getAnalysis<MachineLoopInfo>();
+  AA = &getAnalysis<AliasAnalysis>();
+
+  bool EverMadeChange = false;
+
+  while (1) {
+    bool MadeChange = false;
+
+    // Process all basic blocks.
+    CEBCandidates.clear();
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+         I != E; ++I)
+      MadeChange |= ProcessBlock(*I);
+
+    // If this iteration over the code changed anything, keep iterating.
+    if (!MadeChange) break;
+    EverMadeChange = true;
+  }
+  return EverMadeChange;
+}
+
+bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
+  // Can't sink anything out of a block that has less than two successors.
+  if (MBB.succ_size() <= 1 || MBB.empty()) return false;
+
+  // Don't bother sinking code out of unreachable blocks. In addition to being
+  // unprofitable, it can also lead to infinite looping, because in an
+  // unreachable loop there may be nowhere to stop.
+  if (!DT->isReachableFromEntry(&MBB)) return false;
+
+  bool MadeChange = false;
+
+  // Walk the basic block bottom-up.  Remember if we saw a store.
+  MachineBasicBlock::iterator I = MBB.end();
+  --I;
+  bool ProcessedBegin, SawStore = false;
+  do {
+    MachineInstr *MI = I;  // The instruction to sink.
+
+    // Predecrement I (if it's not begin) so that it isn't invalidated by
+    // sinking.
+    ProcessedBegin = I == MBB.begin();
+    if (!ProcessedBegin)
+      --I;
+
+    if (MI->isDebugValue())
+      continue;
+
+    bool Joined = PerformTrivialForwardCoalescing(MI, &MBB);
+    if (Joined) {
+      MadeChange = true;
+      continue;
+    }
+
+    if (SinkInstruction(MI, SawStore))
+      ++NumSunk, MadeChange = true;
+
+    // If we just processed the first instruction in the block, we're done.
+  } while (!ProcessedBegin);
+
+  return MadeChange;
+}
+
+bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI,
+                                                 MachineBasicBlock *From,
+                                                 MachineBasicBlock *To) {
+  // FIXME: Need much better heuristics.
+
+  // If the pass has already considered breaking this edge (during this pass
+  // through the function), then let's go ahead and break it. This means
+  // sinking multiple "cheap" instructions into the same block.
+  if (!CEBCandidates.insert(std::make_pair(From, To)))
+    return true;
+
+  if (!MI->isCopy() && !MI->isAsCheapAsAMove())
+    return true;
+
+  // MI is cheap, we probably don't want to break the critical edge for it.
+  // However, if this would allow some definitions of its source operands
+  // to be sunk then it's probably worth it.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    if (MRI->hasOneNonDBGUse(Reg))
+      return true;
+  }
+
+  return false;
+}
+
+MachineBasicBlock *MachineSinking::SplitCriticalEdge(MachineInstr *MI,
+                                                     MachineBasicBlock *FromBB,
+                                                     MachineBasicBlock *ToBB,
+                                                     bool BreakPHIEdge) {
+  if (!isWorthBreakingCriticalEdge(MI, FromBB, ToBB))
+    return 0;
+
+  // Avoid breaking back edge. From == To means backedge for single BB loop.
+  if (!SplitEdges || FromBB == ToBB)
+    return 0;
+
+  // Check for backedges of more "complex" loops.
+  if (LI->getLoopFor(FromBB) == LI->getLoopFor(ToBB) &&
+      LI->isLoopHeader(ToBB))
+    return 0;
+
+  // It's not always legal to break critical edges and sink the computation
+  // to the edge.
+  //
+  // BB#1:
+  // v1024
+  // Beq BB#3
+  // <fallthrough>
+  // BB#2:
+  // ... no uses of v1024
+  // <fallthrough>
+  // BB#3:
+  // ...
+  //       = v1024
+  //
+  // If BB#1 -> BB#3 edge is broken and computation of v1024 is inserted:
+  //
+  // BB#1:
+  // ...
+  // Bne BB#2
+  // BB#4:
+  // v1024 =
+  // B BB#3
+  // BB#2:
+  // ... no uses of v1024
+  // <fallthrough>
+  // BB#3:
+  // ...
+  //       = v1024
+  //
+  // This is incorrect since v1024 is not computed along the BB#1->BB#2->BB#3
+  // flow. We need to ensure the new basic block where the computation is
+  // sunk to dominates all the uses.
+  // It's only legal to break critical edge and sink the computation to the
+  // new block if all the predecessors of "To", except for "From", are
+  // not dominated by "From". Given SSA property, this means these
+  // predecessors are dominated by "To".
+  //
+  // There is no need to do this check if all the uses are PHI nodes. PHI
+  // sources are only defined on the specific predecessor edges.
+  if (!BreakPHIEdge) {
+    for (MachineBasicBlock::pred_iterator PI = ToBB->pred_begin(),
+           E = ToBB->pred_end(); PI != E; ++PI) {
+      if (*PI == FromBB)
+        continue;
+      if (!DT->dominates(ToBB, *PI))
+        return 0;
+    }
+  }
+
+  return FromBB->SplitCriticalEdge(ToBB, this);
+}
+
+static bool AvoidsSinking(MachineInstr *MI, MachineRegisterInfo *MRI) {
+  return MI->isInsertSubreg() || MI->isSubregToReg() || MI->isRegSequence();
+}
+
+/// collectDebgValues - Scan instructions following MI and collect any
+/// matching DBG_VALUEs.
+static void collectDebugValues(MachineInstr *MI,
+                               SmallVector<MachineInstr *, 2> & DbgValues) {
+  DbgValues.clear();
+  if (!MI->getOperand(0).isReg())
+    return;
+
+  MachineBasicBlock::iterator DI = MI; ++DI;
+  for (MachineBasicBlock::iterator DE = MI->getParent()->end();
+       DI != DE; ++DI) {
+    if (!DI->isDebugValue())
+      return;
+    if (DI->getOperand(0).isReg() &&
+        DI->getOperand(0).getReg() == MI->getOperand(0).getReg())
+      DbgValues.push_back(DI);
+  }
+}
+
+/// isPostDominatedBy - Return true if A is post dominated by B.
+static bool isPostDominatedBy(MachineBasicBlock *A, MachineBasicBlock *B) {
+
+  // FIXME - Use real post dominator.
+  if (A->succ_size() != 2)
+    return false;
+  MachineBasicBlock::succ_iterator I = A->succ_begin();
+  if (B == *I)
+    ++I;
+  MachineBasicBlock *OtherSuccBlock = *I;
+  if (OtherSuccBlock->succ_size() != 1 ||
+      *(OtherSuccBlock->succ_begin()) != B)
+    return false;
+
+  return true;
+}
+
+/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
+bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr *MI,
+                                          MachineBasicBlock *MBB,
+                                          MachineBasicBlock *SuccToSinkTo) {
+  assert (MI && "Invalid MachineInstr!");
+  assert (SuccToSinkTo && "Invalid SinkTo Candidate BB");
+
+  if (MBB == SuccToSinkTo)
+    return false;
+
+  // It is profitable if SuccToSinkTo does not post dominate current block.
+  if (!isPostDominatedBy(MBB, SuccToSinkTo))
+      return true;
+
+  // Check if only use in post dominated block is PHI instruction.
+  bool NonPHIUse = false;
+  for (MachineRegisterInfo::use_nodbg_iterator
+         I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end();
+       I != E; ++I) {
+    MachineInstr *UseInst = &*I;
+    MachineBasicBlock *UseBlock = UseInst->getParent();
+    if (UseBlock == SuccToSinkTo && !UseInst->isPHI())
+      NonPHIUse = true;
+  }
+  if (!NonPHIUse)
+    return true;
+
+  // If SuccToSinkTo post dominates then also it may be profitable if MI
+  // can further profitably sinked into another block in next round.
+  bool BreakPHIEdge = false;
+  // FIXME - If finding successor is compile time expensive then catch results.
+  if (MachineBasicBlock *MBB2 = FindSuccToSinkTo(MI, SuccToSinkTo, BreakPHIEdge))
+    return isProfitableToSinkTo(Reg, MI, SuccToSinkTo, MBB2);
+
+  // If SuccToSinkTo is final destination and it is a post dominator of current
+  // block then it is not profitable to sink MI into SuccToSinkTo block.
+  return false;
+}
+
+/// FindSuccToSinkTo - Find a successor to sink this instruction to.
+MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI,
+                                   MachineBasicBlock *MBB,
+                                   bool &BreakPHIEdge) {
+
+  assert (MI && "Invalid MachineInstr!");
+  assert (MBB && "Invalid MachineBasicBlock!");
+
+  // Loop over all the operands of the specified instruction.  If there is
+  // anything we can't handle, bail out.
+
+  // SuccToSinkTo - This is the successor to sink this instruction to, once we
+  // decide.
+  MachineBasicBlock *SuccToSinkTo = 0;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;  // Ignore non-register operands.
+
+    unsigned Reg = MO.getReg();
+    if (Reg == 0) continue;
+
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (MO.isUse()) {
+        // If the physreg has no defs anywhere, it's just an ambient register
+        // and we can freely move its uses. Alternatively, if it's allocatable,
+        // it could get allocated to something with a def during allocation.
+        if (!MRI->isConstantPhysReg(Reg, *MBB->getParent()))
+          return NULL;
+      } else if (!MO.isDead()) {
+        // A def that isn't dead. We can't move it.
+        return NULL;
+      }
+    } else {
+      // Virtual register uses are always safe to sink.
+      if (MO.isUse()) continue;
+
+      // If it's not safe to move defs of the register class, then abort.
+      if (!TII->isSafeToMoveRegClassDefs(MRI->getRegClass(Reg)))
+        return NULL;
+
+      // FIXME: This picks a successor to sink into based on having one
+      // successor that dominates all the uses.  However, there are cases where
+      // sinking can happen but where the sink point isn't a successor.  For
+      // example:
+      //
+      //   x = computation
+      //   if () {} else {}
+      //   use x
+      //
+      // the instruction could be sunk over the whole diamond for the
+      // if/then/else (or loop, etc), allowing it to be sunk into other blocks
+      // after that.
+
+      // Virtual register defs can only be sunk if all their uses are in blocks
+      // dominated by one of the successors.
+      if (SuccToSinkTo) {
+        // If a previous operand picked a block to sink to, then this operand
+        // must be sinkable to the same block.
+        bool LocalUse = false;
+        if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo, MBB,
+                                     BreakPHIEdge, LocalUse))
+          return NULL;
+
+        continue;
+      }
+
+      // Otherwise, we should look at all the successors and decide which one
+      // we should sink to.
+      // We give successors with smaller loop depth higher priority.
+      SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(), MBB->succ_end());
+      std::stable_sort(Succs.begin(), Succs.end(), SuccessorSorter(LI));
+      for (SmallVector<MachineBasicBlock*, 4>::iterator SI = Succs.begin(),
+           E = Succs.end(); SI != E; ++SI) {
+        MachineBasicBlock *SuccBlock = *SI;
+        bool LocalUse = false;
+        if (AllUsesDominatedByBlock(Reg, SuccBlock, MBB,
+                                    BreakPHIEdge, LocalUse)) {
+          SuccToSinkTo = SuccBlock;
+          break;
+        }
+        if (LocalUse)
+          // Def is used locally, it's never safe to move this def.
+          return NULL;
+      }
+
+      // If we couldn't find a block to sink to, ignore this instruction.
+      if (SuccToSinkTo == 0)
+        return NULL;
+      else if (!isProfitableToSinkTo(Reg, MI, MBB, SuccToSinkTo))
+        return NULL;
+    }
+  }
+
+  // It is not possible to sink an instruction into its own block.  This can
+  // happen with loops.
+  if (MBB == SuccToSinkTo)
+    return NULL;
+
+  // It's not safe to sink instructions to EH landing pad. Control flow into
+  // landing pad is implicitly defined.
+  if (SuccToSinkTo && SuccToSinkTo->isLandingPad())
+    return NULL;
+
+  return SuccToSinkTo;
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified machine
+/// instruction out of its current block into a successor.
+bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
+  // Don't sink insert_subreg, subreg_to_reg, reg_sequence. These are meant to
+  // be close to the source to make it easier to coalesce.
+  if (AvoidsSinking(MI, MRI))
+    return false;
+
+  // Check if it's safe to move the instruction.
+  if (!MI->isSafeToMove(TII, AA, SawStore))
+    return false;
+
+  // FIXME: This should include support for sinking instructions within the
+  // block they are currently in to shorten the live ranges.  We often get
+  // instructions sunk into the top of a large block, but it would be better to
+  // also sink them down before their first use in the block.  This xform has to
+  // be careful not to *increase* register pressure though, e.g. sinking
+  // "x = y + z" down if it kills y and z would increase the live ranges of y
+  // and z and only shrink the live range of x.
+
+  bool BreakPHIEdge = false;
+  MachineBasicBlock *ParentBlock = MI->getParent();
+  MachineBasicBlock *SuccToSinkTo = FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge);
+
+  // If there are no outputs, it must have side-effects.
+  if (SuccToSinkTo == 0)
+    return false;
+
+
+  // If the instruction to move defines a dead physical register which is live
+  // when leaving the basic block, don't move it because it could turn into a
+  // "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>)
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    if (SuccToSinkTo->isLiveIn(Reg))
+      return false;
+  }
+
+  DEBUG(dbgs() << "Sink instr " << *MI << "\tinto block " << *SuccToSinkTo);
+
+  // If the block has multiple predecessors, this would introduce computation on
+  // a path that it doesn't already exist.  We could split the critical edge,
+  // but for now we just punt.
+  if (SuccToSinkTo->pred_size() > 1) {
+    // We cannot sink a load across a critical edge - there may be stores in
+    // other code paths.
+    bool TryBreak = false;
+    bool store = true;
+    if (!MI->isSafeToMove(TII, AA, store)) {
+      DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
+      TryBreak = true;
+    }
+
+    // We don't want to sink across a critical edge if we don't dominate the
+    // successor. We could be introducing calculations to new code paths.
+    if (!TryBreak && !DT->dominates(ParentBlock, SuccToSinkTo)) {
+      DEBUG(dbgs() << " *** NOTE: Critical edge found\n");
+      TryBreak = true;
+    }
+
+    // Don't sink instructions into a loop.
+    if (!TryBreak && LI->isLoopHeader(SuccToSinkTo)) {
+      DEBUG(dbgs() << " *** NOTE: Loop header found\n");
+      TryBreak = true;
+    }
+
+    // Otherwise we are OK with sinking along a critical edge.
+    if (!TryBreak)
+      DEBUG(dbgs() << "Sinking along critical edge.\n");
+    else {
+      MachineBasicBlock *NewSucc =
+        SplitCriticalEdge(MI, ParentBlock, SuccToSinkTo, BreakPHIEdge);
+      if (!NewSucc) {
+        DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
+                        "break critical edge\n");
+        return false;
+      } else {
+        DEBUG(dbgs() << " *** Splitting critical edge:"
+              " BB#" << ParentBlock->getNumber()
+              << " -- BB#" << NewSucc->getNumber()
+              << " -- BB#" << SuccToSinkTo->getNumber() << '\n');
+        SuccToSinkTo = NewSucc;
+        ++NumSplit;
+        BreakPHIEdge = false;
+      }
+    }
+  }
+
+  if (BreakPHIEdge) {
+    // BreakPHIEdge is true if all the uses are in the successor MBB being
+    // sunken into and they are all PHI nodes. In this case, machine-sink must
+    // break the critical edge first.
+    MachineBasicBlock *NewSucc = SplitCriticalEdge(MI, ParentBlock,
+                                                   SuccToSinkTo, BreakPHIEdge);
+    if (!NewSucc) {
+      DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
+            "break critical edge\n");
+      return false;
+    }
+
+    DEBUG(dbgs() << " *** Splitting critical edge:"
+          " BB#" << ParentBlock->getNumber()
+          << " -- BB#" << NewSucc->getNumber()
+          << " -- BB#" << SuccToSinkTo->getNumber() << '\n');
+    SuccToSinkTo = NewSucc;
+    ++NumSplit;
+  }
+
+  // Determine where to insert into. Skip phi nodes.
+  MachineBasicBlock::iterator InsertPos = SuccToSinkTo->begin();
+  while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI())
+    ++InsertPos;
+
+  // collect matching debug values.
+  SmallVector<MachineInstr *, 2> DbgValuesToSink;
+  collectDebugValues(MI, DbgValuesToSink);
+
+  // Move the instruction.
+  SuccToSinkTo->splice(InsertPos, ParentBlock, MI,
+                       ++MachineBasicBlock::iterator(MI));
+
+  // Move debug values.
+  for (SmallVector<MachineInstr *, 2>::iterator DBI = DbgValuesToSink.begin(),
+         DBE = DbgValuesToSink.end(); DBI != DBE; ++DBI) {
+    MachineInstr *DbgMI = *DBI;
+    SuccToSinkTo->splice(InsertPos, ParentBlock,  DbgMI,
+                         ++MachineBasicBlock::iterator(DbgMI));
+  }
+
+  // Conservatively, clear any kill flags, since it's possible that they are no
+  // longer correct.
+  MI->clearKillInfo();
+
+  return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
new file mode 100644
index 000000000000..49d8c4e9470d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -0,0 +1,1290 @@
+//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-trace-metrics"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+char MachineTraceMetrics::ID = 0;
+char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID;
+
+INITIALIZE_PASS_BEGIN(MachineTraceMetrics,
+                  "machine-trace-metrics", "Machine Trace Metrics", false, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(MachineTraceMetrics,
+                  "machine-trace-metrics", "Machine Trace Metrics", false, true)
+
+MachineTraceMetrics::MachineTraceMetrics()
+  : MachineFunctionPass(ID), MF(0), TII(0), TRI(0), MRI(0), Loops(0) {
+  std::fill(Ensembles, array_endof(Ensembles), (Ensemble*)0);
+}
+
+void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<MachineBranchProbabilityInfo>();
+  AU.addRequired<MachineLoopInfo>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
+  MF = &Func;
+  TII = MF->getTarget().getInstrInfo();
+  TRI = MF->getTarget().getRegisterInfo();
+  MRI = &MF->getRegInfo();
+  Loops = &getAnalysis<MachineLoopInfo>();
+  const TargetSubtargetInfo &ST =
+    MF->getTarget().getSubtarget<TargetSubtargetInfo>();
+  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+  BlockInfo.resize(MF->getNumBlockIDs());
+  ProcResourceCycles.resize(MF->getNumBlockIDs() *
+                            SchedModel.getNumProcResourceKinds());
+  return false;
+}
+
+void MachineTraceMetrics::releaseMemory() {
+  MF = 0;
+  BlockInfo.clear();
+  for (unsigned i = 0; i != TS_NumStrategies; ++i) {
+    delete Ensembles[i];
+    Ensembles[i] = 0;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                          Fixed block information
+//===----------------------------------------------------------------------===//
+//
+// The number of instructions in a basic block and the CPU resources used by
+// those instructions don't depend on any given trace strategy.
+
+/// Compute the resource usage in basic block MBB.
+const MachineTraceMetrics::FixedBlockInfo*
+MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) {
+  assert(MBB && "No basic block");
+  FixedBlockInfo *FBI = &BlockInfo[MBB->getNumber()];
+  if (FBI->hasResources())
+    return FBI;
+
+  // Compute resource usage in the block.
+  FBI->HasCalls = false;
+  unsigned InstrCount = 0;
+
+  // Add up per-processor resource cycles as well.
+  unsigned PRKinds = SchedModel.getNumProcResourceKinds();
+  SmallVector<unsigned, 32> PRCycles(PRKinds);
+
+  for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+       I != E; ++I) {
+    const MachineInstr *MI = I;
+    if (MI->isTransient())
+      continue;
+    ++InstrCount;
+    if (MI->isCall())
+      FBI->HasCalls = true;
+
+    // Count processor resources used.
+    if (!SchedModel.hasInstrSchedModel())
+      continue;
+    const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(MI);
+    if (!SC->isValid())
+      continue;
+
+    for (TargetSchedModel::ProcResIter
+         PI = SchedModel.getWriteProcResBegin(SC),
+         PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
+      assert(PI->ProcResourceIdx < PRKinds && "Bad processor resource kind");
+      PRCycles[PI->ProcResourceIdx] += PI->Cycles;
+    }
+  }
+  FBI->InstrCount = InstrCount;
+
+  // Scale the resource cycles so they are comparable.
+  unsigned PROffset = MBB->getNumber() * PRKinds;
+  for (unsigned K = 0; K != PRKinds; ++K)
+    ProcResourceCycles[PROffset + K] =
+      PRCycles[K] * SchedModel.getResourceFactor(K);
+
+  return FBI;
+}
+
+ArrayRef<unsigned>
+MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const {
+  assert(BlockInfo[MBBNum].hasResources() &&
+         "getResources() must be called before getProcResourceCycles()");
+  unsigned PRKinds = SchedModel.getNumProcResourceKinds();
+  assert((MBBNum+1) * PRKinds <= ProcResourceCycles.size());
+  return ArrayRef<unsigned>(ProcResourceCycles.data() + MBBNum * PRKinds,
+                            PRKinds);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Ensemble utility functions
+//===----------------------------------------------------------------------===//
+
+MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)
+  : MTM(*ct) {
+  BlockInfo.resize(MTM.BlockInfo.size());
+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+  ProcResourceDepths.resize(MTM.BlockInfo.size() * PRKinds);
+  ProcResourceHeights.resize(MTM.BlockInfo.size() * PRKinds);
+}
+
+// Virtual destructor serves as an anchor.
+MachineTraceMetrics::Ensemble::~Ensemble() {}
+
+const MachineLoop*
+MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const {
+  return MTM.Loops->getLoopFor(MBB);
+}
+
+// Update resource-related information in the TraceBlockInfo for MBB.
+// Only update resources related to the trace above MBB.
+void MachineTraceMetrics::Ensemble::
+computeDepthResources(const MachineBasicBlock *MBB) {
+  TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+  unsigned PROffset = MBB->getNumber() * PRKinds;
+
+  // Compute resources from trace above. The top block is simple.
+  if (!TBI->Pred) {
+    TBI->InstrDepth = 0;
+    TBI->Head = MBB->getNumber();
+    std::fill(ProcResourceDepths.begin() + PROffset,
+              ProcResourceDepths.begin() + PROffset + PRKinds, 0);
+    return;
+  }
+
+  // Compute from the block above. A post-order traversal ensures the
+  // predecessor is always computed first.
+  unsigned PredNum = TBI->Pred->getNumber();
+  TraceBlockInfo *PredTBI = &BlockInfo[PredNum];
+  assert(PredTBI->hasValidDepth() && "Trace above has not been computed yet");
+  const FixedBlockInfo *PredFBI = MTM.getResources(TBI->Pred);
+  TBI->InstrDepth = PredTBI->InstrDepth + PredFBI->InstrCount;
+  TBI->Head = PredTBI->Head;
+
+  // Compute per-resource depths.
+  ArrayRef<unsigned> PredPRDepths = getProcResourceDepths(PredNum);
+  ArrayRef<unsigned> PredPRCycles = MTM.getProcResourceCycles(PredNum);
+  for (unsigned K = 0; K != PRKinds; ++K)
+    ProcResourceDepths[PROffset + K] = PredPRDepths[K] + PredPRCycles[K];
+}
+
+// Update resource-related information in the TraceBlockInfo for MBB.
+// Only update resources related to the trace below MBB.
+void MachineTraceMetrics::Ensemble::
+computeHeightResources(const MachineBasicBlock *MBB) {
+  TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+  unsigned PROffset = MBB->getNumber() * PRKinds;
+
+  // Compute resources for the current block.
+  TBI->InstrHeight = MTM.getResources(MBB)->InstrCount;
+  ArrayRef<unsigned> PRCycles = MTM.getProcResourceCycles(MBB->getNumber());
+
+  // The trace tail is done.
+  if (!TBI->Succ) {
+    TBI->Tail = MBB->getNumber();
+    std::copy(PRCycles.begin(), PRCycles.end(),
+              ProcResourceHeights.begin() + PROffset);
+    return;
+  }
+
+  // Compute from the block below. A post-order traversal ensures the
+  // predecessor is always computed first.
+  unsigned SuccNum = TBI->Succ->getNumber();
+  TraceBlockInfo *SuccTBI = &BlockInfo[SuccNum];
+  assert(SuccTBI->hasValidHeight() && "Trace below has not been computed yet");
+  TBI->InstrHeight += SuccTBI->InstrHeight;
+  TBI->Tail = SuccTBI->Tail;
+
+  // Compute per-resource heights.
+  ArrayRef<unsigned> SuccPRHeights = getProcResourceHeights(SuccNum);
+  for (unsigned K = 0; K != PRKinds; ++K)
+    ProcResourceHeights[PROffset + K] = SuccPRHeights[K] + PRCycles[K];
+}
+
+// Check if depth resources for MBB are valid and return the TBI.
+// Return NULL if the resources have been invalidated.
+const MachineTraceMetrics::TraceBlockInfo*
+MachineTraceMetrics::Ensemble::
+getDepthResources(const MachineBasicBlock *MBB) const {
+  const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+  return TBI->hasValidDepth() ? TBI : 0;
+}
+
+// Check if height resources for MBB are valid and return the TBI.
+// Return NULL if the resources have been invalidated.
+const MachineTraceMetrics::TraceBlockInfo*
+MachineTraceMetrics::Ensemble::
+getHeightResources(const MachineBasicBlock *MBB) const {
+  const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+  return TBI->hasValidHeight() ? TBI : 0;
+}
+
+/// Get an array of processor resource depths for MBB. Indexed by processor
+/// resource kind, this array contains the scaled processor resources consumed
+/// by all blocks preceding MBB in its trace. It does not include instructions
+/// in MBB.
+///
+/// Compare TraceBlockInfo::InstrDepth.
+ArrayRef<unsigned>
+MachineTraceMetrics::Ensemble::
+getProcResourceDepths(unsigned MBBNum) const {
+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+  assert((MBBNum+1) * PRKinds <= ProcResourceDepths.size());
+  return ArrayRef<unsigned>(ProcResourceDepths.data() + MBBNum * PRKinds,
+                            PRKinds);
+}
+
+/// Get an array of processor resource heights for MBB. Indexed by processor
+/// resource kind, this array contains the scaled processor resources consumed
+/// by this block and all blocks following it in its trace.
+///
+/// Compare TraceBlockInfo::InstrHeight.
+ArrayRef<unsigned>
+MachineTraceMetrics::Ensemble::
+getProcResourceHeights(unsigned MBBNum) const {
+  unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+  assert((MBBNum+1) * PRKinds <= ProcResourceHeights.size());
+  return ArrayRef<unsigned>(ProcResourceHeights.data() + MBBNum * PRKinds,
+                            PRKinds);
+}
+
+//===----------------------------------------------------------------------===//
+//                         Trace Selection Strategies
+//===----------------------------------------------------------------------===//
+//
+// A trace selection strategy is implemented as a sub-class of Ensemble. The
+// trace through a block B is computed by two DFS traversals of the CFG
+// starting from B. One upwards, and one downwards. During the upwards DFS,
+// pickTracePred() is called on the post-ordered blocks. During the downwards
+// DFS, pickTraceSucc() is called in a post-order.
+//
+
+// We never allow traces that leave loops, but we do allow traces to enter
+// nested loops. We also never allow traces to contain back-edges.
+//
+// This means that a loop header can never appear above the center block of a
+// trace, except as the trace head. Below the center block, loop exiting edges
+// are banned.
+//
+// Return true if an edge from the From loop to the To loop is leaving a loop.
+// Either of To and From can be null.
+static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) {
+  return From && !From->contains(To);
+}
+
+// MinInstrCountEnsemble - Pick the trace that executes the least number of
+// instructions.
+namespace {
+class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {
+  const char *getName() const { return "MinInstr"; }
+  const MachineBasicBlock *pickTracePred(const MachineBasicBlock*);
+  const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*);
+
+public:
+  MinInstrCountEnsemble(MachineTraceMetrics *mtm)
+    : MachineTraceMetrics::Ensemble(mtm) {}
+};
+}
+
+// Select the preferred predecessor for MBB.
+const MachineBasicBlock*
+MinInstrCountEnsemble::pickTracePred(const MachineBasicBlock *MBB) {
+  if (MBB->pred_empty())
+    return 0;
+  const MachineLoop *CurLoop = getLoopFor(MBB);
+  // Don't leave loops, and never follow back-edges.
+  if (CurLoop && MBB == CurLoop->getHeader())
+    return 0;
+  unsigned CurCount = MTM.getResources(MBB)->InstrCount;
+  const MachineBasicBlock *Best = 0;
+  unsigned BestDepth = 0;
+  for (MachineBasicBlock::const_pred_iterator
+       I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
+    const MachineBasicBlock *Pred = *I;
+    const MachineTraceMetrics::TraceBlockInfo *PredTBI =
+      getDepthResources(Pred);
+    // Ignore cycles that aren't natural loops.
+    if (!PredTBI)
+      continue;
+    // Pick the predecessor that would give this block the smallest InstrDepth.
+    unsigned Depth = PredTBI->InstrDepth + CurCount;
+    if (!Best || Depth < BestDepth)
+      Best = Pred, BestDepth = Depth;
+  }
+  return Best;
+}
+
+// Select the preferred successor for MBB.
+const MachineBasicBlock*
+MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) {
+  if (MBB->pred_empty())
+    return 0;
+  const MachineLoop *CurLoop = getLoopFor(MBB);
+  const MachineBasicBlock *Best = 0;
+  unsigned BestHeight = 0;
+  for (MachineBasicBlock::const_succ_iterator
+       I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
+    const MachineBasicBlock *Succ = *I;
+    // Don't consider back-edges.
+    if (CurLoop && Succ == CurLoop->getHeader())
+      continue;
+    // Don't consider successors exiting CurLoop.
+    if (isExitingLoop(CurLoop, getLoopFor(Succ)))
+      continue;
+    const MachineTraceMetrics::TraceBlockInfo *SuccTBI =
+      getHeightResources(Succ);
+    // Ignore cycles that aren't natural loops.
+    if (!SuccTBI)
+      continue;
+    // Pick the successor that would give this block the smallest InstrHeight.
+    unsigned Height = SuccTBI->InstrHeight;
+    if (!Best || Height < BestHeight)
+      Best = Succ, BestHeight = Height;
+  }
+  return Best;
+}
+
+// Get an Ensemble sub-class for the requested trace strategy.
+MachineTraceMetrics::Ensemble *
+MachineTraceMetrics::getEnsemble(MachineTraceMetrics::Strategy strategy) {
+  assert(strategy < TS_NumStrategies && "Invalid trace strategy enum");
+  Ensemble *&E = Ensembles[strategy];
+  if (E)
+    return E;
+
+  // Allocate new Ensemble on demand.
+  switch (strategy) {
+  case TS_MinInstrCount: return (E = new MinInstrCountEnsemble(this));
+  default: llvm_unreachable("Invalid trace strategy enum");
+  }
+}
+
+void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Invalidate traces through BB#" << MBB->getNumber() << '\n');
+  BlockInfo[MBB->getNumber()].invalidate();
+  for (unsigned i = 0; i != TS_NumStrategies; ++i)
+    if (Ensembles[i])
+      Ensembles[i]->invalidate(MBB);
+}
+
+void MachineTraceMetrics::verifyAnalysis() const {
+  if (!MF)
+    return;
+#ifndef NDEBUG
+  assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size");
+  for (unsigned i = 0; i != TS_NumStrategies; ++i)
+    if (Ensembles[i])
+      Ensembles[i]->verify();
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//                               Trace building
+//===----------------------------------------------------------------------===//
+//
+// Traces are built by two CFG traversals. To avoid recomputing too much, use a
+// set abstraction that confines the search to the current loop, and doesn't
+// revisit blocks.
+
+namespace {
+struct LoopBounds {
+  MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks;
+  SmallPtrSet<const MachineBasicBlock*, 8> Visited;
+  const MachineLoopInfo *Loops;
+  bool Downward;
+  LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks,
+             const MachineLoopInfo *loops)
+    : Blocks(blocks), Loops(loops), Downward(false) {}
+};
+}
+
+// Specialize po_iterator_storage in order to prune the post-order traversal so
+// it is limited to the current loop and doesn't traverse the loop back edges.
+namespace llvm {
+template<>
+class po_iterator_storage<LoopBounds, true> {
+  LoopBounds &LB;
+public:
+  po_iterator_storage(LoopBounds &lb) : LB(lb) {}
+  void finishPostorder(const MachineBasicBlock*) {}
+
+  bool insertEdge(const MachineBasicBlock *From, const MachineBasicBlock *To) {
+    // Skip already visited To blocks.
+    MachineTraceMetrics::TraceBlockInfo &TBI = LB.Blocks[To->getNumber()];
+    if (LB.Downward ? TBI.hasValidHeight() : TBI.hasValidDepth())
+      return false;
+    // From is null once when To is the trace center block.
+    if (From) {
+      if (const MachineLoop *FromLoop = LB.Loops->getLoopFor(From)) {
+        // Don't follow backedges, don't leave FromLoop when going upwards.
+        if ((LB.Downward ? To : From) == FromLoop->getHeader())
+          return false;
+        // Don't leave FromLoop.
+        if (isExitingLoop(FromLoop, LB.Loops->getLoopFor(To)))
+          return false;
+      }
+    }
+    // To is a new block. Mark the block as visited in case the CFG has cycles
+    // that MachineLoopInfo didn't recognize as a natural loop.
+    return LB.Visited.insert(To);
+  }
+};
+}
+
+/// Compute the trace through MBB.
+void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Computing " << getName() << " trace through BB#"
+               << MBB->getNumber() << '\n');
+  // Set up loop bounds for the backwards post-order traversal.
+  LoopBounds Bounds(BlockInfo, MTM.Loops);
+
+  // Run an upwards post-order search for the trace start.
+  Bounds.Downward = false;
+  Bounds.Visited.clear();
+  typedef ipo_ext_iterator<const MachineBasicBlock*, LoopBounds> UpwardPO;
+  for (UpwardPO I = ipo_ext_begin(MBB, Bounds), E = ipo_ext_end(MBB, Bounds);
+       I != E; ++I) {
+    DEBUG(dbgs() << "  pred for BB#" << I->getNumber() << ": ");
+    TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
+    // All the predecessors have been visited, pick the preferred one.
+    TBI.Pred = pickTracePred(*I);
+    DEBUG({
+      if (TBI.Pred)
+        dbgs() << "BB#" << TBI.Pred->getNumber() << '\n';
+      else
+        dbgs() << "null\n";
+    });
+    // The trace leading to I is now known, compute the depth resources.
+    computeDepthResources(*I);
+  }
+
+  // Run a downwards post-order search for the trace end.
+  Bounds.Downward = true;
+  Bounds.Visited.clear();
+  typedef po_ext_iterator<const MachineBasicBlock*, LoopBounds> DownwardPO;
+  for (DownwardPO I = po_ext_begin(MBB, Bounds), E = po_ext_end(MBB, Bounds);
+       I != E; ++I) {
+    DEBUG(dbgs() << "  succ for BB#" << I->getNumber() << ": ");
+    TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
+    // All the successors have been visited, pick the preferred one.
+    TBI.Succ = pickTraceSucc(*I);
+    DEBUG({
+      if (TBI.Succ)
+        dbgs() << "BB#" << TBI.Succ->getNumber() << '\n';
+      else
+        dbgs() << "null\n";
+    });
+    // The trace leaving I is now known, compute the height resources.
+    computeHeightResources(*I);
+  }
+}
+
+/// Invalidate traces through BadMBB.
+void
+MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
+  SmallVector<const MachineBasicBlock*, 16> WorkList;
+  TraceBlockInfo &BadTBI = BlockInfo[BadMBB->getNumber()];
+
+  // Invalidate height resources of blocks above MBB.
+  if (BadTBI.hasValidHeight()) {
+    BadTBI.invalidateHeight();
+    WorkList.push_back(BadMBB);
+    do {
+      const MachineBasicBlock *MBB = WorkList.pop_back_val();
+      DEBUG(dbgs() << "Invalidate BB#" << MBB->getNumber() << ' ' << getName()
+            << " height.\n");
+      // Find any MBB predecessors that have MBB as their preferred successor.
+      // They are the only ones that need to be invalidated.
+      for (MachineBasicBlock::const_pred_iterator
+           I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) {
+        TraceBlockInfo &TBI = BlockInfo[(*I)->getNumber()];
+        if (!TBI.hasValidHeight())
+          continue;
+        if (TBI.Succ == MBB) {
+          TBI.invalidateHeight();
+          WorkList.push_back(*I);
+          continue;
+        }
+        // Verify that TBI.Succ is actually a *I successor.
+        assert((!TBI.Succ || (*I)->isSuccessor(TBI.Succ)) && "CFG changed");
+      }
+    } while (!WorkList.empty());
+  }
+
+  // Invalidate depth resources of blocks below MBB.
+  if (BadTBI.hasValidDepth()) {
+    BadTBI.invalidateDepth();
+    WorkList.push_back(BadMBB);
+    do {
+      const MachineBasicBlock *MBB = WorkList.pop_back_val();
+      DEBUG(dbgs() << "Invalidate BB#" << MBB->getNumber() << ' ' << getName()
+            << " depth.\n");
+      // Find any MBB successors that have MBB as their preferred predecessor.
+      // They are the only ones that need to be invalidated.
+      for (MachineBasicBlock::const_succ_iterator
+           I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) {
+        TraceBlockInfo &TBI = BlockInfo[(*I)->getNumber()];
+        if (!TBI.hasValidDepth())
+          continue;
+        if (TBI.Pred == MBB) {
+          TBI.invalidateDepth();
+          WorkList.push_back(*I);
+          continue;
+        }
+        // Verify that TBI.Pred is actually a *I predecessor.
+        assert((!TBI.Pred || (*I)->isPredecessor(TBI.Pred)) && "CFG changed");
+      }
+    } while (!WorkList.empty());
+  }
+
+  // Clear any per-instruction data. We only have to do this for BadMBB itself
+  // because the instructions in that block may change. Other blocks may be
+  // invalidated, but their instructions will stay the same, so there is no
+  // need to erase the Cycle entries. They will be overwritten when we
+  // recompute.
+  for (MachineBasicBlock::const_iterator I = BadMBB->begin(), E = BadMBB->end();
+       I != E; ++I)
+    Cycles.erase(I);
+}
+
+void MachineTraceMetrics::Ensemble::verify() const {
+#ifndef NDEBUG
+  assert(BlockInfo.size() == MTM.MF->getNumBlockIDs() &&
+         "Outdated BlockInfo size");
+  for (unsigned Num = 0, e = BlockInfo.size(); Num != e; ++Num) {
+    const TraceBlockInfo &TBI = BlockInfo[Num];
+    if (TBI.hasValidDepth() && TBI.Pred) {
+      const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
+      assert(MBB->isPredecessor(TBI.Pred) && "CFG doesn't match trace");
+      assert(BlockInfo[TBI.Pred->getNumber()].hasValidDepth() &&
+             "Trace is broken, depth should have been invalidated.");
+      const MachineLoop *Loop = getLoopFor(MBB);
+      assert(!(Loop && MBB == Loop->getHeader()) && "Trace contains backedge");
+    }
+    if (TBI.hasValidHeight() && TBI.Succ) {
+      const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
+      assert(MBB->isSuccessor(TBI.Succ) && "CFG doesn't match trace");
+      assert(BlockInfo[TBI.Succ->getNumber()].hasValidHeight() &&
+             "Trace is broken, height should have been invalidated.");
+      const MachineLoop *Loop = getLoopFor(MBB);
+      const MachineLoop *SuccLoop = getLoopFor(TBI.Succ);
+      assert(!(Loop && Loop == SuccLoop && TBI.Succ == Loop->getHeader()) &&
+             "Trace contains backedge");
+    }
+  }
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//                             Data Dependencies
+//===----------------------------------------------------------------------===//
+//
+// Compute the depth and height of each instruction based on data dependencies
+// and instruction latencies. These cycle numbers assume that the CPU can issue
+// an infinite number of instructions per cycle as long as their dependencies
+// are ready.
+
+// A data dependency is represented as a defining MI and operand numbers on the
+// defining and using MI.
+namespace {
+struct DataDep {
+  const MachineInstr *DefMI;
+  unsigned DefOp;
+  unsigned UseOp;
+
+  DataDep(const MachineInstr *DefMI, unsigned DefOp, unsigned UseOp)
+    : DefMI(DefMI), DefOp(DefOp), UseOp(UseOp) {}
+
+  /// Create a DataDep from an SSA form virtual register.
+  DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp)
+    : UseOp(UseOp) {
+    assert(TargetRegisterInfo::isVirtualRegister(VirtReg));
+    MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg);
+    assert(!DefI.atEnd() && "Register has no defs");
+    DefMI = &*DefI;
+    DefOp = DefI.getOperandNo();
+    assert((++DefI).atEnd() && "Register has multiple defs");
+  }
+};
+}
+
+// Get the input data dependencies that must be ready before UseMI can issue.
+// Return true if UseMI has any physreg operands.
+static bool getDataDeps(const MachineInstr *UseMI,
+                        SmallVectorImpl<DataDep> &Deps,
+                        const MachineRegisterInfo *MRI) {
+  bool HasPhysRegs = false;
+  for (ConstMIOperands MO(UseMI); MO.isValid(); ++MO) {
+    if (!MO->isReg())
+      continue;
+    unsigned Reg = MO->getReg();
+    if (!Reg)
+      continue;
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      HasPhysRegs = true;
+      continue;
+    }
+    // Collect virtual register reads.
+    if (MO->readsReg())
+      Deps.push_back(DataDep(MRI, Reg, MO.getOperandNo()));
+  }
+  return HasPhysRegs;
+}
+
+// Get the input data dependencies of a PHI instruction, using Pred as the
+// preferred predecessor.
+// This will add at most one dependency to Deps.
+static void getPHIDeps(const MachineInstr *UseMI,
+                       SmallVectorImpl<DataDep> &Deps,
+                       const MachineBasicBlock *Pred,
+                       const MachineRegisterInfo *MRI) {
+  // No predecessor at the beginning of a trace. Ignore dependencies.
+  if (!Pred)
+    return;
+  assert(UseMI->isPHI() && UseMI->getNumOperands() % 2 && "Bad PHI");
+  for (unsigned i = 1; i != UseMI->getNumOperands(); i += 2) {
+    if (UseMI->getOperand(i + 1).getMBB() == Pred) {
+      unsigned Reg = UseMI->getOperand(i).getReg();
+      Deps.push_back(DataDep(MRI, Reg, i));
+      return;
+    }
+  }
+}
+
+// Keep track of physreg data dependencies by recording each live register unit.
+// Associate each regunit with an instruction operand. Depending on the
+// direction instructions are scanned, it could be the operand that defined the
+// regunit, or the highest operand to read the regunit.
+namespace {
+struct LiveRegUnit {
+  unsigned RegUnit;
+  unsigned Cycle;
+  const MachineInstr *MI;
+  unsigned Op;
+
+  unsigned getSparseSetIndex() const { return RegUnit; }
+
+  LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(0), Op(0) {}
+};
+}
+
+// Identify physreg dependencies for UseMI, and update the live regunit
+// tracking set when scanning instructions downwards.
+static void updatePhysDepsDownwards(const MachineInstr *UseMI,
+                                    SmallVectorImpl<DataDep> &Deps,
+                                    SparseSet<LiveRegUnit> &RegUnits,
+                                    const TargetRegisterInfo *TRI) {
+  SmallVector<unsigned, 8> Kills;
+  SmallVector<unsigned, 8> LiveDefOps;
+
+  for (ConstMIOperands MO(UseMI); MO.isValid(); ++MO) {
+    if (!MO->isReg())
+      continue;
+    unsigned Reg = MO->getReg();
+    if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    // Track live defs and kills for updating RegUnits.
+    if (MO->isDef()) {
+      if (MO->isDead())
+        Kills.push_back(Reg);
+      else
+        LiveDefOps.push_back(MO.getOperandNo());
+    } else if (MO->isKill())
+      Kills.push_back(Reg);
+    // Identify dependencies.
+    if (!MO->readsReg())
+      continue;
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+      SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units);
+      if (I == RegUnits.end())
+        continue;
+      Deps.push_back(DataDep(I->MI, I->Op, MO.getOperandNo()));
+      break;
+    }
+  }
+
+  // Update RegUnits to reflect live registers after UseMI.
+  // First kills.
+  for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+    for (MCRegUnitIterator Units(Kills[i], TRI); Units.isValid(); ++Units)
+      RegUnits.erase(*Units);
+
+  // Second, live defs.
+  for (unsigned i = 0, e = LiveDefOps.size(); i != e; ++i) {
+    unsigned DefOp = LiveDefOps[i];
+    for (MCRegUnitIterator Units(UseMI->getOperand(DefOp).getReg(), TRI);
+         Units.isValid(); ++Units) {
+      LiveRegUnit &LRU = RegUnits[*Units];
+      LRU.MI = UseMI;
+      LRU.Op = DefOp;
+    }
+  }
+}
+
+/// The length of the critical path through a trace is the maximum of two path
+/// lengths:
+///
+/// 1. The maximum height+depth over all instructions in the trace center block.
+///
+/// 2. The longest cross-block dependency chain. For small blocks, it is
+///    possible that the critical path through the trace doesn't include any
+///    instructions in the block.
+///
+/// This function computes the second number from the live-in list of the
+/// center block.
+unsigned MachineTraceMetrics::Ensemble::
+computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) {
+  assert(TBI.HasValidInstrDepths && "Missing depth info");
+  assert(TBI.HasValidInstrHeights && "Missing height info");
+  unsigned MaxLen = 0;
+  for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) {
+    const LiveInReg &LIR = TBI.LiveIns[i];
+    if (!TargetRegisterInfo::isVirtualRegister(LIR.Reg))
+      continue;
+    const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
+    // Ignore dependencies outside the current trace.
+    const TraceBlockInfo &DefTBI = BlockInfo[DefMI->getParent()->getNumber()];
+    if (!DefTBI.isUsefulDominator(TBI))
+      continue;
+    unsigned Len = LIR.Height + Cycles[DefMI].Depth;
+    MaxLen = std::max(MaxLen, Len);
+  }
+  return MaxLen;
+}
+
+/// Compute instruction depths for all instructions above or in MBB in its
+/// trace. This assumes that the trace through MBB has already been computed.
+void MachineTraceMetrics::Ensemble::
+computeInstrDepths(const MachineBasicBlock *MBB) {
+  // The top of the trace may already be computed, and HasValidInstrDepths
+  // implies Head->HasValidInstrDepths, so we only need to start from the first
+  // block in the trace that needs to be recomputed.
+  SmallVector<const MachineBasicBlock*, 8> Stack;
+  do {
+    TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+    assert(TBI.hasValidDepth() && "Incomplete trace");
+    if (TBI.HasValidInstrDepths)
+      break;
+    Stack.push_back(MBB);
+    MBB = TBI.Pred;
+  } while (MBB);
+
+  // FIXME: If MBB is non-null at this point, it is the last pre-computed block
+  // in the trace. We should track any live-out physregs that were defined in
+  // the trace. This is quite rare in SSA form, typically created by CSE
+  // hoisting a compare.
+  SparseSet<LiveRegUnit> RegUnits;
+  RegUnits.setUniverse(MTM.TRI->getNumRegUnits());
+
+  // Go through trace blocks in top-down order, stopping after the center block.
+  SmallVector<DataDep, 8> Deps;
+  while (!Stack.empty()) {
+    MBB = Stack.pop_back_val();
+    DEBUG(dbgs() << "\nDepths for BB#" << MBB->getNumber() << ":\n");
+    TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+    TBI.HasValidInstrDepths = true;
+    TBI.CriticalPath = 0;
+
+    // Print out resource depths here as well.
+    DEBUG({
+      dbgs() << format("%7u Instructions\n", TBI.InstrDepth);
+      ArrayRef<unsigned> PRDepths = getProcResourceDepths(MBB->getNumber());
+      for (unsigned K = 0; K != PRDepths.size(); ++K)
+        if (PRDepths[K]) {
+          unsigned Factor = MTM.SchedModel.getResourceFactor(K);
+          dbgs() << format("%6uc @ ", MTM.getCycles(PRDepths[K]))
+                 << MTM.SchedModel.getProcResource(K)->Name << " ("
+                 << PRDepths[K]/Factor << " ops x" << Factor << ")\n";
+        }
+    });
+
+    // Also compute the critical path length through MBB when possible.
+    if (TBI.HasValidInstrHeights)
+      TBI.CriticalPath = computeCrossBlockCriticalPath(TBI);
+
+    for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+         I != E; ++I) {
+      const MachineInstr *UseMI = I;
+
+      // Collect all data dependencies.
+      Deps.clear();
+      if (UseMI->isPHI())
+        getPHIDeps(UseMI, Deps, TBI.Pred, MTM.MRI);
+      else if (getDataDeps(UseMI, Deps, MTM.MRI))
+        updatePhysDepsDownwards(UseMI, Deps, RegUnits, MTM.TRI);
+
+      // Filter and process dependencies, computing the earliest issue cycle.
+      unsigned Cycle = 0;
+      for (unsigned i = 0, e = Deps.size(); i != e; ++i) {
+        const DataDep &Dep = Deps[i];
+        const TraceBlockInfo&DepTBI =
+          BlockInfo[Dep.DefMI->getParent()->getNumber()];
+        // Ignore dependencies from outside the current trace.
+        if (!DepTBI.isUsefulDominator(TBI))
+          continue;
+        assert(DepTBI.HasValidInstrDepths && "Inconsistent dependency");
+        unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth;
+        // Add latency if DefMI is a real instruction. Transients get latency 0.
+        if (!Dep.DefMI->isTransient())
+          DepCycle += MTM.SchedModel
+            .computeOperandLatency(Dep.DefMI, Dep.DefOp, UseMI, Dep.UseOp,
+                                   /* FindMin = */ false);
+        Cycle = std::max(Cycle, DepCycle);
+      }
+      // Remember the instruction depth.
+      InstrCycles &MICycles = Cycles[UseMI];
+      MICycles.Depth = Cycle;
+
+      if (!TBI.HasValidInstrHeights) {
+        DEBUG(dbgs() << Cycle << '\t' << *UseMI);
+        continue;
+      }
+      // Update critical path length.
+      TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Height);
+      DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << *UseMI);
+    }
+  }
+}
+
+// Identify physreg dependencies for MI when scanning instructions upwards.
+// Return the issue height of MI after considering any live regunits.
+// Height is the issue height computed from virtual register dependencies alone.
+static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height,
+                                      SparseSet<LiveRegUnit> &RegUnits,
+                                      const TargetSchedModel &SchedModel,
+                                      const TargetInstrInfo *TII,
+                                      const TargetRegisterInfo *TRI) {
+  SmallVector<unsigned, 8> ReadOps;
+  for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
+    if (!MO->isReg())
+      continue;
+    unsigned Reg = MO->getReg();
+    if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+    if (MO->readsReg())
+      ReadOps.push_back(MO.getOperandNo());
+    if (!MO->isDef())
+      continue;
+    // This is a def of Reg. Remove corresponding entries from RegUnits, and
+    // update MI Height to consider the physreg dependencies.
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+      SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units);
+      if (I == RegUnits.end())
+        continue;
+      unsigned DepHeight = I->Cycle;
+      if (!MI->isTransient()) {
+        // We may not know the UseMI of this dependency, if it came from the
+        // live-in list. SchedModel can handle a NULL UseMI.
+        DepHeight += SchedModel
+          .computeOperandLatency(MI, MO.getOperandNo(), I->MI, I->Op,
+                                 /* FindMin = */ false);
+      }
+      Height = std::max(Height, DepHeight);
+      // This regunit is dead above MI.
+      RegUnits.erase(I);
+    }
+  }
+
+  // Now we know the height of MI. Update any regunits read.
+  for (unsigned i = 0, e = ReadOps.size(); i != e; ++i) {
+    unsigned Reg = MI->getOperand(ReadOps[i]).getReg();
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+      LiveRegUnit &LRU = RegUnits[*Units];
+      // Set the height to the highest reader of the unit.
+      if (LRU.Cycle <= Height && LRU.MI != MI) {
+        LRU.Cycle = Height;
+        LRU.MI = MI;
+        LRU.Op = ReadOps[i];
+      }
+    }
+  }
+
+  return Height;
+}
+
+
+typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap;
+
+// Push the height of DefMI upwards if required to match UseMI.
+// Return true if this is the first time DefMI was seen.
+static bool pushDepHeight(const DataDep &Dep,
+                          const MachineInstr *UseMI, unsigned UseHeight,
+                          MIHeightMap &Heights,
+                          const TargetSchedModel &SchedModel,
+                          const TargetInstrInfo *TII) {
+  // Adjust height by Dep.DefMI latency.
+  if (!Dep.DefMI->isTransient())
+    UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp,
+                                                  UseMI, Dep.UseOp, false);
+
+  // Update Heights[DefMI] to be the maximum height seen.
+  MIHeightMap::iterator I;
+  bool New;
+  tie(I, New) = Heights.insert(std::make_pair(Dep.DefMI, UseHeight));
+  if (New)
+    return true;
+
+  // DefMI has been pushed before. Give it the max height.
+  if (I->second < UseHeight)
+    I->second = UseHeight;
+  return false;
+}
+
+/// Assuming that the virtual register defined by DefMI:DefOp was used by
+/// Trace.back(), add it to the live-in lists of all the blocks in Trace. Stop
+/// when reaching the block that contains DefMI.
+void MachineTraceMetrics::Ensemble::
+addLiveIns(const MachineInstr *DefMI, unsigned DefOp,
+           ArrayRef<const MachineBasicBlock*> Trace) {
+  assert(!Trace.empty() && "Trace should contain at least one block");
+  unsigned Reg = DefMI->getOperand(DefOp).getReg();
+  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  const MachineBasicBlock *DefMBB = DefMI->getParent();
+
+  // Reg is live-in to all blocks in Trace that follow DefMBB.
+  for (unsigned i = Trace.size(); i; --i) {
+    const MachineBasicBlock *MBB = Trace[i-1];
+    if (MBB == DefMBB)
+      return;
+    TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+    // Just add the register. The height will be updated later.
+    TBI.LiveIns.push_back(Reg);
+  }
+}
+
+/// Compute instruction heights in the trace through MBB. This updates MBB and
+/// the blocks below it in the trace. It is assumed that the trace has already
+/// been computed.
+void MachineTraceMetrics::Ensemble::
+computeInstrHeights(const MachineBasicBlock *MBB) {
+  // The bottom of the trace may already be computed.
+  // Find the blocks that need updating.
+  SmallVector<const MachineBasicBlock*, 8> Stack;
+  do {
+    TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+    assert(TBI.hasValidHeight() && "Incomplete trace");
+    if (TBI.HasValidInstrHeights)
+      break;
+    Stack.push_back(MBB);
+    TBI.LiveIns.clear();
+    MBB = TBI.Succ;
+  } while (MBB);
+
+  // As we move upwards in the trace, keep track of instructions that are
+  // required by deeper trace instructions. Map MI -> height required so far.
+  MIHeightMap Heights;
+
+  // For physregs, the def isn't known when we see the use.
+  // Instead, keep track of the highest use of each regunit.
+  SparseSet<LiveRegUnit> RegUnits;
+  RegUnits.setUniverse(MTM.TRI->getNumRegUnits());
+
+  // If the bottom of the trace was already precomputed, initialize heights
+  // from its live-in list.
+  // MBB is the highest precomputed block in the trace.
+  if (MBB) {
+    TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+    for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) {
+      LiveInReg LI = TBI.LiveIns[i];
+      if (TargetRegisterInfo::isVirtualRegister(LI.Reg)) {
+        // For virtual registers, the def latency is included.
+        unsigned &Height = Heights[MTM.MRI->getVRegDef(LI.Reg)];
+        if (Height < LI.Height)
+          Height = LI.Height;
+      } else {
+        // For register units, the def latency is not included because we don't
+        // know the def yet.
+        RegUnits[LI.Reg].Cycle = LI.Height;
+      }
+    }
+  }
+
+  // Go through the trace blocks in bottom-up order.
+  SmallVector<DataDep, 8> Deps;
+  for (;!Stack.empty(); Stack.pop_back()) {
+    MBB = Stack.back();
+    DEBUG(dbgs() << "Heights for BB#" << MBB->getNumber() << ":\n");
+    TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+    TBI.HasValidInstrHeights = true;
+    TBI.CriticalPath = 0;
+
+    DEBUG({
+      dbgs() << format("%7u Instructions\n", TBI.InstrHeight);
+      ArrayRef<unsigned> PRHeights = getProcResourceHeights(MBB->getNumber());
+      for (unsigned K = 0; K != PRHeights.size(); ++K)
+        if (PRHeights[K]) {
+          unsigned Factor = MTM.SchedModel.getResourceFactor(K);
+          dbgs() << format("%6uc @ ", MTM.getCycles(PRHeights[K]))
+                 << MTM.SchedModel.getProcResource(K)->Name << " ("
+                 << PRHeights[K]/Factor << " ops x" << Factor << ")\n";
+        }
+    });
+
+    // Get dependencies from PHIs in the trace successor.
+    const MachineBasicBlock *Succ = TBI.Succ;
+    // If MBB is the last block in the trace, and it has a back-edge to the
+    // loop header, get loop-carried dependencies from PHIs in the header. For
+    // that purpose, pretend that all the loop header PHIs have height 0.
+    if (!Succ)
+      if (const MachineLoop *Loop = getLoopFor(MBB))
+        if (MBB->isSuccessor(Loop->getHeader()))
+          Succ = Loop->getHeader();
+
+    if (Succ) {
+      for (MachineBasicBlock::const_iterator I = Succ->begin(), E = Succ->end();
+           I != E && I->isPHI(); ++I) {
+        const MachineInstr *PHI = I;
+        Deps.clear();
+        getPHIDeps(PHI, Deps, MBB, MTM.MRI);
+        if (!Deps.empty()) {
+          // Loop header PHI heights are all 0.
+          unsigned Height = TBI.Succ ? Cycles.lookup(PHI).Height : 0;
+          DEBUG(dbgs() << "pred\t" << Height << '\t' << *PHI);
+          if (pushDepHeight(Deps.front(), PHI, Height,
+                            Heights, MTM.SchedModel, MTM.TII))
+            addLiveIns(Deps.front().DefMI, Deps.front().DefOp, Stack);
+        }
+      }
+    }
+
+    // Go through the block backwards.
+    for (MachineBasicBlock::const_iterator BI = MBB->end(), BB = MBB->begin();
+         BI != BB;) {
+      const MachineInstr *MI = --BI;
+
+      // Find the MI height as determined by virtual register uses in the
+      // trace below.
+      unsigned Cycle = 0;
+      MIHeightMap::iterator HeightI = Heights.find(MI);
+      if (HeightI != Heights.end()) {
+        Cycle = HeightI->second;
+        // We won't be seeing any more MI uses.
+        Heights.erase(HeightI);
+      }
+
+      // Don't process PHI deps. They depend on the specific predecessor, and
+      // we'll get them when visiting the predecessor.
+      Deps.clear();
+      bool HasPhysRegs = !MI->isPHI() && getDataDeps(MI, Deps, MTM.MRI);
+
+      // There may also be regunit dependencies to include in the height.
+      if (HasPhysRegs)
+        Cycle = updatePhysDepsUpwards(MI, Cycle, RegUnits,
+                                      MTM.SchedModel, MTM.TII, MTM.TRI);
+
+      // Update the required height of any virtual registers read by MI.
+      for (unsigned i = 0, e = Deps.size(); i != e; ++i)
+        if (pushDepHeight(Deps[i], MI, Cycle, Heights, MTM.SchedModel, MTM.TII))
+          addLiveIns(Deps[i].DefMI, Deps[i].DefOp, Stack);
+
+      InstrCycles &MICycles = Cycles[MI];
+      MICycles.Height = Cycle;
+      if (!TBI.HasValidInstrDepths) {
+        DEBUG(dbgs() << Cycle << '\t' << *MI);
+        continue;
+      }
+      // Update critical path length.
+      TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Depth);
+      DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << *MI);
+    }
+
+    // Update virtual live-in heights. They were added by addLiveIns() with a 0
+    // height because the final height isn't known until now.
+    DEBUG(dbgs() << "BB#" << MBB->getNumber() <<  " Live-ins:");
+    for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) {
+      LiveInReg &LIR = TBI.LiveIns[i];
+      const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
+      LIR.Height = Heights.lookup(DefMI);
+      DEBUG(dbgs() << ' ' << PrintReg(LIR.Reg) << '@' << LIR.Height);
+    }
+
+    // Transfer the live regunits to the live-in list.
+    for (SparseSet<LiveRegUnit>::const_iterator
+         RI = RegUnits.begin(), RE = RegUnits.end(); RI != RE; ++RI) {
+      TBI.LiveIns.push_back(LiveInReg(RI->RegUnit, RI->Cycle));
+      DEBUG(dbgs() << ' ' << PrintRegUnit(RI->RegUnit, MTM.TRI)
+                   << '@' << RI->Cycle);
+    }
+    DEBUG(dbgs() << '\n');
+
+    if (!TBI.HasValidInstrDepths)
+      continue;
+    // Add live-ins to the critical path length.
+    TBI.CriticalPath = std::max(TBI.CriticalPath,
+                                computeCrossBlockCriticalPath(TBI));
+    DEBUG(dbgs() << "Critical path: " << TBI.CriticalPath << '\n');
+  }
+}
+
+MachineTraceMetrics::Trace
+MachineTraceMetrics::Ensemble::getTrace(const MachineBasicBlock *MBB) {
+  // FIXME: Check cache tags, recompute as needed.
+  computeTrace(MBB);
+  computeInstrDepths(MBB);
+  computeInstrHeights(MBB);
+  return Trace(*this, BlockInfo[MBB->getNumber()]);
+}
+
+unsigned
+MachineTraceMetrics::Trace::getInstrSlack(const MachineInstr *MI) const {
+  assert(MI && "Not an instruction.");
+  assert(getBlockNum() == unsigned(MI->getParent()->getNumber()) &&
+         "MI must be in the trace center block");
+  InstrCycles Cyc = getInstrCycles(MI);
+  return getCriticalPath() - (Cyc.Depth + Cyc.Height);
+}
+
+unsigned
+MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const {
+  const MachineBasicBlock *MBB = TE.MTM.MF->getBlockNumbered(getBlockNum());
+  SmallVector<DataDep, 1> Deps;
+  getPHIDeps(PHI, Deps, MBB, TE.MTM.MRI);
+  assert(Deps.size() == 1 && "PHI doesn't have MBB as a predecessor");
+  DataDep &Dep = Deps.front();
+  unsigned DepCycle = getInstrCycles(Dep.DefMI).Depth;
+  // Add latency if DefMI is a real instruction. Transients get latency 0.
+  if (!Dep.DefMI->isTransient())
+    DepCycle += TE.MTM.SchedModel
+      .computeOperandLatency(Dep.DefMI, Dep.DefOp, PHI, Dep.UseOp, false);
+  return DepCycle;
+}
+
+unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const {
+  // Find the limiting processor resource.
+  // Numbers have been pre-scaled to be comparable.
+  unsigned PRMax = 0;
+  ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());
+  if (Bottom) {
+    ArrayRef<unsigned> PRCycles = TE.MTM.getProcResourceCycles(getBlockNum());
+    for (unsigned K = 0; K != PRDepths.size(); ++K)
+      PRMax = std::max(PRMax, PRDepths[K] + PRCycles[K]);
+  } else {
+    for (unsigned K = 0; K != PRDepths.size(); ++K)
+      PRMax = std::max(PRMax, PRDepths[K]);
+  }
+  // Convert to cycle count.
+  PRMax = TE.MTM.getCycles(PRMax);
+
+  unsigned Instrs = TBI.InstrDepth;
+  if (Bottom)
+    Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount;
+  if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
+    Instrs /= IW;
+  // Assume issue width 1 without a schedule model.
+  return std::max(Instrs, PRMax);
+}
+
+unsigned MachineTraceMetrics::Trace::
+getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks) const {
+  // Add up resources above and below the center block.
+  ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());
+  ArrayRef<unsigned> PRHeights = TE.getProcResourceHeights(getBlockNum());
+  unsigned PRMax = 0;
+  for (unsigned K = 0; K != PRDepths.size(); ++K) {
+    unsigned PRCycles = PRDepths[K] + PRHeights[K];
+    for (unsigned I = 0; I != Extrablocks.size(); ++I)
+      PRCycles += TE.MTM.getProcResourceCycles(Extrablocks[I]->getNumber())[K];
+    PRMax = std::max(PRMax, PRCycles);
+  }
+  // Convert to cycle count.
+  PRMax = TE.MTM.getCycles(PRMax);
+
+  unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight;
+  for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i)
+    Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount;
+  if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
+    Instrs /= IW;
+  // Assume issue width 1 without a schedule model.
+  return std::max(Instrs, PRMax);
+}
+
+void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const {
+  OS << getName() << " ensemble:\n";
+  for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) {
+    OS << "  BB#" << i << '\t';
+    BlockInfo[i].print(OS);
+    OS << '\n';
+  }
+}
+
+void MachineTraceMetrics::TraceBlockInfo::print(raw_ostream &OS) const {
+  if (hasValidDepth()) {
+    OS << "depth=" << InstrDepth;
+    if (Pred)
+      OS << " pred=BB#" << Pred->getNumber();
+    else
+      OS << " pred=null";
+    OS << " head=BB#" << Head;
+    if (HasValidInstrDepths)
+      OS << " +instrs";
+  } else
+    OS << "depth invalid";
+  OS << ", ";
+  if (hasValidHeight()) {
+    OS << "height=" << InstrHeight;
+    if (Succ)
+      OS << " succ=BB#" << Succ->getNumber();
+    else
+      OS << " succ=null";
+    OS << " tail=BB#" << Tail;
+    if (HasValidInstrHeights)
+      OS << " +instrs";
+  } else
+    OS << "height invalid";
+  if (HasValidInstrDepths && HasValidInstrHeights)
+    OS << ", crit=" << CriticalPath;
+}
+
+void MachineTraceMetrics::Trace::print(raw_ostream &OS) const {
+  unsigned MBBNum = &TBI - &TE.BlockInfo[0];
+
+  OS << TE.getName() << " trace BB#" << TBI.Head << " --> BB#" << MBBNum
+     << " --> BB#" << TBI.Tail << ':';
+  if (TBI.hasValidHeight() && TBI.hasValidDepth())
+    OS << ' ' << getInstrCount() << " instrs.";
+  if (TBI.HasValidInstrDepths && TBI.HasValidInstrHeights)
+    OS << ' ' << TBI.CriticalPath << " cycles.";
+
+  const MachineTraceMetrics::TraceBlockInfo *Block = &TBI;
+  OS << "\nBB#" << MBBNum;
+  while (Block->hasValidDepth() && Block->Pred) {
+    unsigned Num = Block->Pred->getNumber();
+    OS << " <- BB#" << Num;
+    Block = &TE.BlockInfo[Num];
+  }
+
+  Block = &TBI;
+  OS << "\n    ";
+  while (Block->hasValidHeight() && Block->Succ) {
+    unsigned Num = Block->Succ->getNumber();
+    OS << " -> BB#" << Num;
+    Block = &TE.BlockInfo[Num];
+  }
+  OS << '\n';
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
new file mode 100644
index 000000000000..4b1230029a74
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -0,0 +1,1605 @@
+//===-- MachineVerifier.cpp - Machine Code Verifier -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pass to verify generated machine code. The following is checked:
+//
+// Operand counts: All explicit operands must be present.
+//
+// Register classes: All physical and virtual register operands must be
+// compatible with the register class required by the instruction descriptor.
+//
+// Register live intervals: Registers must be defined only once, and must be
+// defined before use.
+//
+// The machine code verifier is enabled from LLVMTargetMachine.cpp with the
+// command-line option -verify-machineinstrs, or by defining the environment
+// variable LLVM_VERIFY_MACHINEINSTRS to the name of a file that will receive
+// the verifier errors.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+namespace {
+  struct MachineVerifier {
+
+    MachineVerifier(Pass *pass, const char *b) :
+      PASS(pass),
+      Banner(b),
+      OutFileName(getenv("LLVM_VERIFY_MACHINEINSTRS"))
+      {}
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    Pass *const PASS;
+    const char *Banner;
+    const char *const OutFileName;
+    raw_ostream *OS;
+    const MachineFunction *MF;
+    const TargetMachine *TM;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const MachineRegisterInfo *MRI;
+
+    unsigned foundErrors;
+
+    typedef SmallVector<unsigned, 16> RegVector;
+    typedef SmallVector<const uint32_t*, 4> RegMaskVector;
+    typedef DenseSet<unsigned> RegSet;
+    typedef DenseMap<unsigned, const MachineInstr*> RegMap;
+    typedef SmallPtrSet<const MachineBasicBlock*, 8> BlockSet;
+
+    const MachineInstr *FirstTerminator;
+    BlockSet FunctionBlocks;
+
+    BitVector regsReserved;
+    RegSet regsLive;
+    RegVector regsDefined, regsDead, regsKilled;
+    RegMaskVector regMasks;
+    RegSet regsLiveInButUnused;
+
+    SlotIndex lastIndex;
+
+    // Add Reg and any sub-registers to RV
+    void addRegWithSubRegs(RegVector &RV, unsigned Reg) {
+      RV.push_back(Reg);
+      if (TargetRegisterInfo::isPhysicalRegister(Reg))
+        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+          RV.push_back(*SubRegs);
+    }
+
+    struct BBInfo {
+      // Is this MBB reachable from the MF entry point?
+      bool reachable;
+
+      // Vregs that must be live in because they are used without being
+      // defined. Map value is the user.
+      RegMap vregsLiveIn;
+
+      // Regs killed in MBB. They may be defined again, and will then be in both
+      // regsKilled and regsLiveOut.
+      RegSet regsKilled;
+
+      // Regs defined in MBB and live out. Note that vregs passing through may
+      // be live out without being mentioned here.
+      RegSet regsLiveOut;
+
+      // Vregs that pass through MBB untouched. This set is disjoint from
+      // regsKilled and regsLiveOut.
+      RegSet vregsPassed;
+
+      // Vregs that must pass through MBB because they are needed by a successor
+      // block. This set is disjoint from regsLiveOut.
+      RegSet vregsRequired;
+
+      // Set versions of block's predecessor and successor lists.
+      BlockSet Preds, Succs;
+
+      BBInfo() : reachable(false) {}
+
+      // Add register to vregsPassed if it belongs there. Return true if
+      // anything changed.
+      bool addPassed(unsigned Reg) {
+        if (!TargetRegisterInfo::isVirtualRegister(Reg))
+          return false;
+        if (regsKilled.count(Reg) || regsLiveOut.count(Reg))
+          return false;
+        return vregsPassed.insert(Reg).second;
+      }
+
+      // Same for a full set.
+      bool addPassed(const RegSet &RS) {
+        bool changed = false;
+        for (RegSet::const_iterator I = RS.begin(), E = RS.end(); I != E; ++I)
+          if (addPassed(*I))
+            changed = true;
+        return changed;
+      }
+
+      // Add register to vregsRequired if it belongs there. Return true if
+      // anything changed.
+      bool addRequired(unsigned Reg) {
+        if (!TargetRegisterInfo::isVirtualRegister(Reg))
+          return false;
+        if (regsLiveOut.count(Reg))
+          return false;
+        return vregsRequired.insert(Reg).second;
+      }
+
+      // Same for a full set.
+      bool addRequired(const RegSet &RS) {
+        bool changed = false;
+        for (RegSet::const_iterator I = RS.begin(), E = RS.end(); I != E; ++I)
+          if (addRequired(*I))
+            changed = true;
+        return changed;
+      }
+
+      // Same for a full map.
+      bool addRequired(const RegMap &RM) {
+        bool changed = false;
+        for (RegMap::const_iterator I = RM.begin(), E = RM.end(); I != E; ++I)
+          if (addRequired(I->first))
+            changed = true;
+        return changed;
+      }
+
+      // Live-out registers are either in regsLiveOut or vregsPassed.
+      bool isLiveOut(unsigned Reg) const {
+        return regsLiveOut.count(Reg) || vregsPassed.count(Reg);
+      }
+    };
+
+    // Extra register info per MBB.
+    DenseMap<const MachineBasicBlock*, BBInfo> MBBInfoMap;
+
+    bool isReserved(unsigned Reg) {
+      return Reg < regsReserved.size() && regsReserved.test(Reg);
+    }
+
+    bool isAllocatable(unsigned Reg) {
+      return Reg < TRI->getNumRegs() && MRI->isAllocatable(Reg);
+    }
+
+    // Analysis information if available
+    LiveVariables *LiveVars;
+    LiveIntervals *LiveInts;
+    LiveStacks *LiveStks;
+    SlotIndexes *Indexes;
+
+    void visitMachineFunctionBefore();
+    void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB);
+    void visitMachineBundleBefore(const MachineInstr *MI);
+    void visitMachineInstrBefore(const MachineInstr *MI);
+    void visitMachineOperand(const MachineOperand *MO, unsigned MONum);
+    void visitMachineInstrAfter(const MachineInstr *MI);
+    void visitMachineBundleAfter(const MachineInstr *MI);
+    void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB);
+    void visitMachineFunctionAfter();
+
+    void report(const char *msg, const MachineFunction *MF);
+    void report(const char *msg, const MachineBasicBlock *MBB);
+    void report(const char *msg, const MachineInstr *MI);
+    void report(const char *msg, const MachineOperand *MO, unsigned MONum);
+    void report(const char *msg, const MachineFunction *MF,
+                const LiveInterval &LI);
+    void report(const char *msg, const MachineBasicBlock *MBB,
+                const LiveInterval &LI);
+
+    void verifyInlineAsm(const MachineInstr *MI);
+
+    void checkLiveness(const MachineOperand *MO, unsigned MONum);
+    void markReachable(const MachineBasicBlock *MBB);
+    void calcRegsPassed();
+    void checkPHIOps(const MachineBasicBlock *MBB);
+
+    void calcRegsRequired();
+    void verifyLiveVariables();
+    void verifyLiveIntervals();
+    void verifyLiveInterval(const LiveInterval&);
+    void verifyLiveIntervalValue(const LiveInterval&, VNInfo*);
+    void verifyLiveIntervalSegment(const LiveInterval&,
+                                   LiveInterval::const_iterator);
+  };
+
+  struct MachineVerifierPass : public MachineFunctionPass {
+    static char ID; // Pass ID, replacement for typeid
+    const char *const Banner;
+
+    MachineVerifierPass(const char *b = 0)
+      : MachineFunctionPass(ID), Banner(b) {
+        initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry());
+      }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &MF) {
+      MF.verify(this, Banner);
+      return false;
+    }
+  };
+
+}
+
+char MachineVerifierPass::ID = 0;
+INITIALIZE_PASS(MachineVerifierPass, "machineverifier",
+                "Verify generated machine code", false, false)
+
+FunctionPass *llvm::createMachineVerifierPass(const char *Banner) {
+  return new MachineVerifierPass(Banner);
+}
+
+void MachineFunction::verify(Pass *p, const char *Banner) const {
+  MachineVerifier(p, Banner)
+    .runOnMachineFunction(const_cast<MachineFunction&>(*this));
+}
+
+bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
+  raw_ostream *OutFile = 0;
+  if (OutFileName) {
+    std::string ErrorInfo;
+    OutFile = new raw_fd_ostream(OutFileName, ErrorInfo,
+                                 raw_fd_ostream::F_Append);
+    if (!ErrorInfo.empty()) {
+      errs() << "Error opening '" << OutFileName << "': " << ErrorInfo << '\n';
+      exit(1);
+    }
+
+    OS = OutFile;
+  } else {
+    OS = &errs();
+  }
+
+  foundErrors = 0;
+
+  this->MF = &MF;
+  TM = &MF.getTarget();
+  TII = TM->getInstrInfo();
+  TRI = TM->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  LiveVars = NULL;
+  LiveInts = NULL;
+  LiveStks = NULL;
+  Indexes = NULL;
+  if (PASS) {
+    LiveInts = PASS->getAnalysisIfAvailable<LiveIntervals>();
+    // We don't want to verify LiveVariables if LiveIntervals is available.
+    if (!LiveInts)
+      LiveVars = PASS->getAnalysisIfAvailable<LiveVariables>();
+    LiveStks = PASS->getAnalysisIfAvailable<LiveStacks>();
+    Indexes = PASS->getAnalysisIfAvailable<SlotIndexes>();
+  }
+
+  visitMachineFunctionBefore();
+  for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end();
+       MFI!=MFE; ++MFI) {
+    visitMachineBasicBlockBefore(MFI);
+    // Keep track of the current bundle header.
+    const MachineInstr *CurBundle = 0;
+    // Do we expect the next instruction to be part of the same bundle?
+    bool InBundle = false;
+
+    for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(),
+           MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) {
+      if (MBBI->getParent() != MFI) {
+        report("Bad instruction parent pointer", MFI);
+        *OS << "Instruction: " << *MBBI;
+        continue;
+      }
+
+      // Check for consistent bundle flags.
+      if (InBundle && !MBBI->isBundledWithPred())
+        report("Missing BundledPred flag, "
+               "BundledSucc was set on predecessor", MBBI);
+      if (!InBundle && MBBI->isBundledWithPred())
+        report("BundledPred flag is set, "
+               "but BundledSucc not set on predecessor", MBBI);
+
+      // Is this a bundle header?
+      if (!MBBI->isInsideBundle()) {
+        if (CurBundle)
+          visitMachineBundleAfter(CurBundle);
+        CurBundle = MBBI;
+        visitMachineBundleBefore(CurBundle);
+      } else if (!CurBundle)
+        report("No bundle header", MBBI);
+      visitMachineInstrBefore(MBBI);
+      for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I)
+        visitMachineOperand(&MBBI->getOperand(I), I);
+      visitMachineInstrAfter(MBBI);
+
+      // Was this the last bundled instruction?
+      InBundle = MBBI->isBundledWithSucc();
+    }
+    if (CurBundle)
+      visitMachineBundleAfter(CurBundle);
+    if (InBundle)
+      report("BundledSucc flag set on last instruction in block", &MFI->back());
+    visitMachineBasicBlockAfter(MFI);
+  }
+  visitMachineFunctionAfter();
+
+  if (OutFile)
+    delete OutFile;
+  else if (foundErrors)
+    report_fatal_error("Found "+Twine(foundErrors)+" machine code errors.");
+
+  // Clean up.
+  regsLive.clear();
+  regsDefined.clear();
+  regsDead.clear();
+  regsKilled.clear();
+  regMasks.clear();
+  regsLiveInButUnused.clear();
+  MBBInfoMap.clear();
+
+  return false;                 // no changes
+}
+
+void MachineVerifier::report(const char *msg, const MachineFunction *MF) {
+  assert(MF);
+  *OS << '\n';
+  if (!foundErrors++) {
+    if (Banner)
+      *OS << "# " << Banner << '\n';
+    MF->print(*OS, Indexes);
+  }
+  *OS << "*** Bad machine code: " << msg << " ***\n"
+      << "- function:    " << MF->getName() << "\n";
+}
+
+void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
+  assert(MBB);
+  report(msg, MBB->getParent());
+  *OS << "- basic block: BB#" << MBB->getNumber()
+      << ' ' << MBB->getName()
+      << " (" << (const void*)MBB << ')';
+  if (Indexes)
+    *OS << " [" << Indexes->getMBBStartIdx(MBB)
+        << ';' <<  Indexes->getMBBEndIdx(MBB) << ')';
+  *OS << '\n';
+}
+
+void MachineVerifier::report(const char *msg, const MachineInstr *MI) {
+  assert(MI);
+  report(msg, MI->getParent());
+  *OS << "- instruction: ";
+  if (Indexes && Indexes->hasIndex(MI))
+    *OS << Indexes->getInstructionIndex(MI) << '\t';
+  MI->print(*OS, TM);
+}
+
+void MachineVerifier::report(const char *msg,
+                             const MachineOperand *MO, unsigned MONum) {
+  assert(MO);
+  report(msg, MO->getParent());
+  *OS << "- operand " << MONum << ":   ";
+  MO->print(*OS, TM);
+  *OS << "\n";
+}
+
+void MachineVerifier::report(const char *msg, const MachineFunction *MF,
+                             const LiveInterval &LI) {
+  report(msg, MF);
+  *OS << "- interval:    ";
+  if (TargetRegisterInfo::isVirtualRegister(LI.reg))
+    *OS << PrintReg(LI.reg, TRI);
+  else
+    *OS << PrintRegUnit(LI.reg, TRI);
+  *OS << ' ' << LI << '\n';
+}
+
+void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
+                             const LiveInterval &LI) {
+  report(msg, MBB);
+  *OS << "- interval:    ";
+  if (TargetRegisterInfo::isVirtualRegister(LI.reg))
+    *OS << PrintReg(LI.reg, TRI);
+  else
+    *OS << PrintRegUnit(LI.reg, TRI);
+  *OS << ' ' << LI << '\n';
+}
+
+void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
+  BBInfo &MInfo = MBBInfoMap[MBB];
+  if (!MInfo.reachable) {
+    MInfo.reachable = true;
+    for (MachineBasicBlock::const_succ_iterator SuI = MBB->succ_begin(),
+           SuE = MBB->succ_end(); SuI != SuE; ++SuI)
+      markReachable(*SuI);
+  }
+}
+
+void MachineVerifier::visitMachineFunctionBefore() {
+  lastIndex = SlotIndex();
+  regsReserved = MRI->getReservedRegs();
+
+  // A sub-register of a reserved register is also reserved
+  for (int Reg = regsReserved.find_first(); Reg>=0;
+       Reg = regsReserved.find_next(Reg)) {
+    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+      // FIXME: This should probably be:
+      // assert(regsReserved.test(*SubRegs) && "Non-reserved sub-register");
+      regsReserved.set(*SubRegs);
+    }
+  }
+
+  markReachable(&MF->front());
+
+  // Build a set of the basic blocks in the function.
+  FunctionBlocks.clear();
+  for (MachineFunction::const_iterator
+       I = MF->begin(), E = MF->end(); I != E; ++I) {
+    FunctionBlocks.insert(I);
+    BBInfo &MInfo = MBBInfoMap[I];
+
+    MInfo.Preds.insert(I->pred_begin(), I->pred_end());
+    if (MInfo.Preds.size() != I->pred_size())
+      report("MBB has duplicate entries in its predecessor list.", I);
+
+    MInfo.Succs.insert(I->succ_begin(), I->succ_end());
+    if (MInfo.Succs.size() != I->succ_size())
+      report("MBB has duplicate entries in its successor list.", I);
+  }
+}
+
+// Does iterator point to a and b as the first two elements?
+static bool matchPair(MachineBasicBlock::const_succ_iterator i,
+                      const MachineBasicBlock *a, const MachineBasicBlock *b) {
+  if (*i == a)
+    return *++i == b;
+  if (*i == b)
+    return *++i == a;
+  return false;
+}
+
+void
+MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
+  FirstTerminator = 0;
+
+  if (MRI->isSSA()) {
+    // If this block has allocatable physical registers live-in, check that
+    // it is an entry block or landing pad.
+    for (MachineBasicBlock::livein_iterator LI = MBB->livein_begin(),
+           LE = MBB->livein_end();
+         LI != LE; ++LI) {
+      unsigned reg = *LI;
+      if (isAllocatable(reg) && !MBB->isLandingPad() &&
+          MBB != MBB->getParent()->begin()) {
+        report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB);
+      }
+    }
+  }
+
+  // Count the number of landing pad successors.
+  SmallPtrSet<MachineBasicBlock*, 4> LandingPadSuccs;
+  for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I) {
+    if ((*I)->isLandingPad())
+      LandingPadSuccs.insert(*I);
+    if (!FunctionBlocks.count(*I))
+      report("MBB has successor that isn't part of the function.", MBB);
+    if (!MBBInfoMap[*I].Preds.count(MBB)) {
+      report("Inconsistent CFG", MBB);
+      *OS << "MBB is not in the predecessor list of the successor BB#"
+          << (*I)->getNumber() << ".\n";
+    }
+  }
+
+  // Check the predecessor list.
+  for (MachineBasicBlock::const_pred_iterator I = MBB->pred_begin(),
+       E = MBB->pred_end(); I != E; ++I) {
+    if (!FunctionBlocks.count(*I))
+      report("MBB has predecessor that isn't part of the function.", MBB);
+    if (!MBBInfoMap[*I].Succs.count(MBB)) {
+      report("Inconsistent CFG", MBB);
+      *OS << "MBB is not in the successor list of the predecessor BB#"
+          << (*I)->getNumber() << ".\n";
+    }
+  }
+
+  const MCAsmInfo *AsmInfo = TM->getMCAsmInfo();
+  const BasicBlock *BB = MBB->getBasicBlock();
+  if (LandingPadSuccs.size() > 1 &&
+      !(AsmInfo &&
+        AsmInfo->getExceptionHandlingType() == ExceptionHandling::SjLj &&
+        BB && isa<SwitchInst>(BB->getTerminator())))
+    report("MBB has more than one landing pad successor", MBB);
+
+  // Call AnalyzeBranch. If it succeeds, there several more conditions to check.
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  if (!TII->AnalyzeBranch(*const_cast<MachineBasicBlock *>(MBB),
+                          TBB, FBB, Cond)) {
+    // Ok, AnalyzeBranch thinks it knows what's going on with this block. Let's
+    // check whether its answers match up with reality.
+    if (!TBB && !FBB) {
+      // Block falls through to its successor.
+      MachineFunction::const_iterator MBBI = MBB;
+      ++MBBI;
+      if (MBBI == MF->end()) {
+        // It's possible that the block legitimately ends with a noreturn
+        // call or an unreachable, in which case it won't actually fall
+        // out the bottom of the function.
+      } else if (MBB->succ_size() == LandingPadSuccs.size()) {
+        // It's possible that the block legitimately ends with a noreturn
+        // call or an unreachable, in which case it won't actuall fall
+        // out of the block.
+      } else if (MBB->succ_size() != 1+LandingPadSuccs.size()) {
+        report("MBB exits via unconditional fall-through but doesn't have "
+               "exactly one CFG successor!", MBB);
+      } else if (!MBB->isSuccessor(MBBI)) {
+        report("MBB exits via unconditional fall-through but its successor "
+               "differs from its CFG successor!", MBB);
+      }
+      if (!MBB->empty() && getBundleStart(&MBB->back())->isBarrier() &&
+          !TII->isPredicated(getBundleStart(&MBB->back()))) {
+        report("MBB exits via unconditional fall-through but ends with a "
+               "barrier instruction!", MBB);
+      }
+      if (!Cond.empty()) {
+        report("MBB exits via unconditional fall-through but has a condition!",
+               MBB);
+      }
+    } else if (TBB && !FBB && Cond.empty()) {
+      // Block unconditionally branches somewhere.
+      if (MBB->succ_size() != 1+LandingPadSuccs.size()) {
+        report("MBB exits via unconditional branch but doesn't have "
+               "exactly one CFG successor!", MBB);
+      } else if (!MBB->isSuccessor(TBB)) {
+        report("MBB exits via unconditional branch but the CFG "
+               "successor doesn't match the actual successor!", MBB);
+      }
+      if (MBB->empty()) {
+        report("MBB exits via unconditional branch but doesn't contain "
+               "any instructions!", MBB);
+      } else if (!getBundleStart(&MBB->back())->isBarrier()) {
+        report("MBB exits via unconditional branch but doesn't end with a "
+               "barrier instruction!", MBB);
+      } else if (!getBundleStart(&MBB->back())->isTerminator()) {
+        report("MBB exits via unconditional branch but the branch isn't a "
+               "terminator instruction!", MBB);
+      }
+    } else if (TBB && !FBB && !Cond.empty()) {
+      // Block conditionally branches somewhere, otherwise falls through.
+      MachineFunction::const_iterator MBBI = MBB;
+      ++MBBI;
+      if (MBBI == MF->end()) {
+        report("MBB conditionally falls through out of function!", MBB);
+      } else if (MBB->succ_size() == 1) {
+        // A conditional branch with only one successor is weird, but allowed.
+        if (&*MBBI != TBB)
+          report("MBB exits via conditional branch/fall-through but only has "
+                 "one CFG successor!", MBB);
+        else if (TBB != *MBB->succ_begin())
+          report("MBB exits via conditional branch/fall-through but the CFG "
+                 "successor don't match the actual successor!", MBB);
+      } else if (MBB->succ_size() != 2) {
+        report("MBB exits via conditional branch/fall-through but doesn't have "
+               "exactly two CFG successors!", MBB);
+      } else if (!matchPair(MBB->succ_begin(), TBB, MBBI)) {
+        report("MBB exits via conditional branch/fall-through but the CFG "
+               "successors don't match the actual successors!", MBB);
+      }
+      if (MBB->empty()) {
+        report("MBB exits via conditional branch/fall-through but doesn't "
+               "contain any instructions!", MBB);
+      } else if (getBundleStart(&MBB->back())->isBarrier()) {
+        report("MBB exits via conditional branch/fall-through but ends with a "
+               "barrier instruction!", MBB);
+      } else if (!getBundleStart(&MBB->back())->isTerminator()) {
+        report("MBB exits via conditional branch/fall-through but the branch "
+               "isn't a terminator instruction!", MBB);
+      }
+    } else if (TBB && FBB) {
+      // Block conditionally branches somewhere, otherwise branches
+      // somewhere else.
+      if (MBB->succ_size() == 1) {
+        // A conditional branch with only one successor is weird, but allowed.
+        if (FBB != TBB)
+          report("MBB exits via conditional branch/branch through but only has "
+                 "one CFG successor!", MBB);
+        else if (TBB != *MBB->succ_begin())
+          report("MBB exits via conditional branch/branch through but the CFG "
+                 "successor don't match the actual successor!", MBB);
+      } else if (MBB->succ_size() != 2) {
+        report("MBB exits via conditional branch/branch but doesn't have "
+               "exactly two CFG successors!", MBB);
+      } else if (!matchPair(MBB->succ_begin(), TBB, FBB)) {
+        report("MBB exits via conditional branch/branch but the CFG "
+               "successors don't match the actual successors!", MBB);
+      }
+      if (MBB->empty()) {
+        report("MBB exits via conditional branch/branch but doesn't "
+               "contain any instructions!", MBB);
+      } else if (!getBundleStart(&MBB->back())->isBarrier()) {
+        report("MBB exits via conditional branch/branch but doesn't end with a "
+               "barrier instruction!", MBB);
+      } else if (!getBundleStart(&MBB->back())->isTerminator()) {
+        report("MBB exits via conditional branch/branch but the branch "
+               "isn't a terminator instruction!", MBB);
+      }
+      if (Cond.empty()) {
+        report("MBB exits via conditinal branch/branch but there's no "
+               "condition!", MBB);
+      }
+    } else {
+      report("AnalyzeBranch returned invalid data!", MBB);
+    }
+  }
+
+  regsLive.clear();
+  for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
+         E = MBB->livein_end(); I != E; ++I) {
+    if (!TargetRegisterInfo::isPhysicalRegister(*I)) {
+      report("MBB live-in list contains non-physical register", MBB);
+      continue;
+    }
+    regsLive.insert(*I);
+    for (MCSubRegIterator SubRegs(*I, TRI); SubRegs.isValid(); ++SubRegs)
+      regsLive.insert(*SubRegs);
+  }
+  regsLiveInButUnused = regsLive;
+
+  const MachineFrameInfo *MFI = MF->getFrameInfo();
+  assert(MFI && "Function has no frame info");
+  BitVector PR = MFI->getPristineRegs(MBB);
+  for (int I = PR.find_first(); I>0; I = PR.find_next(I)) {
+    regsLive.insert(I);
+    for (MCSubRegIterator SubRegs(I, TRI); SubRegs.isValid(); ++SubRegs)
+      regsLive.insert(*SubRegs);
+  }
+
+  regsKilled.clear();
+  regsDefined.clear();
+
+  if (Indexes)
+    lastIndex = Indexes->getMBBStartIdx(MBB);
+}
+
+// This function gets called for all bundle headers, including normal
+// stand-alone unbundled instructions.
+void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
+  if (Indexes && Indexes->hasIndex(MI)) {
+    SlotIndex idx = Indexes->getInstructionIndex(MI);
+    if (!(idx > lastIndex)) {
+      report("Instruction index out of order", MI);
+      *OS << "Last instruction was at " << lastIndex << '\n';
+    }
+    lastIndex = idx;
+  }
+
+  // Ensure non-terminators don't follow terminators.
+  // Ignore predicated terminators formed by if conversion.
+  // FIXME: If conversion shouldn't need to violate this rule.
+  if (MI->isTerminator() && !TII->isPredicated(MI)) {
+    if (!FirstTerminator)
+      FirstTerminator = MI;
+  } else if (FirstTerminator) {
+    report("Non-terminator instruction after the first terminator", MI);
+    *OS << "First terminator was:\t" << *FirstTerminator;
+  }
+}
+
+// The operands on an INLINEASM instruction must follow a template.
+// Verify that the flag operands make sense.
+void MachineVerifier::verifyInlineAsm(const MachineInstr *MI) {
+  // The first two operands on INLINEASM are the asm string and global flags.
+  if (MI->getNumOperands() < 2) {
+    report("Too few operands on inline asm", MI);
+    return;
+  }
+  if (!MI->getOperand(0).isSymbol())
+    report("Asm string must be an external symbol", MI);
+  if (!MI->getOperand(1).isImm())
+    report("Asm flags must be an immediate", MI);
+  // Allowed flags are Extra_HasSideEffects = 1, Extra_IsAlignStack = 2,
+  // Extra_AsmDialect = 4, Extra_MayLoad = 8, and Extra_MayStore = 16.
+  if (!isUInt<5>(MI->getOperand(1).getImm()))
+    report("Unknown asm flags", &MI->getOperand(1), 1);
+
+  assert(InlineAsm::MIOp_FirstOperand == 2 && "Asm format changed");
+
+  unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+  unsigned NumOps;
+  for (unsigned e = MI->getNumOperands(); OpNo < e; OpNo += NumOps) {
+    const MachineOperand &MO = MI->getOperand(OpNo);
+    // There may be implicit ops after the fixed operands.
+    if (!MO.isImm())
+      break;
+    NumOps = 1 + InlineAsm::getNumOperandRegisters(MO.getImm());
+  }
+
+  if (OpNo > MI->getNumOperands())
+    report("Missing operands in last group", MI);
+
+  // An optional MDNode follows the groups.
+  if (OpNo < MI->getNumOperands() && MI->getOperand(OpNo).isMetadata())
+    ++OpNo;
+
+  // All trailing operands must be implicit registers.
+  for (unsigned e = MI->getNumOperands(); OpNo < e; ++OpNo) {
+    const MachineOperand &MO = MI->getOperand(OpNo);
+    if (!MO.isReg() || !MO.isImplicit())
+      report("Expected implicit register after groups", &MO, OpNo);
+  }
+}
+
+void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MI->getNumOperands() < MCID.getNumOperands()) {
+    report("Too few operands", MI);
+    *OS << MCID.getNumOperands() << " operands expected, but "
+        << MI->getNumExplicitOperands() << " given.\n";
+  }
+
+  // Check the tied operands.
+  if (MI->isInlineAsm())
+    verifyInlineAsm(MI);
+
+  // Check the MachineMemOperands for basic consistency.
+  for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+       E = MI->memoperands_end(); I != E; ++I) {
+    if ((*I)->isLoad() && !MI->mayLoad())
+      report("Missing mayLoad flag", MI);
+    if ((*I)->isStore() && !MI->mayStore())
+      report("Missing mayStore flag", MI);
+  }
+
+  // Debug values must not have a slot index.
+  // Other instructions must have one, unless they are inside a bundle.
+  if (LiveInts) {
+    bool mapped = !LiveInts->isNotInMIMap(MI);
+    if (MI->isDebugValue()) {
+      if (mapped)
+        report("Debug instruction has a slot index", MI);
+    } else if (MI->isInsideBundle()) {
+      if (mapped)
+        report("Instruction inside bundle has a slot index", MI);
+    } else {
+      if (!mapped)
+        report("Missing slot index", MI);
+    }
+  }
+
+  StringRef ErrorInfo;
+  if (!TII->verifyInstruction(MI, ErrorInfo))
+    report(ErrorInfo.data(), MI);
+}
+
+void
+MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
+  const MachineInstr *MI = MO->getParent();
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  // The first MCID.NumDefs operands must be explicit register defines
+  if (MONum < MCID.getNumDefs()) {
+    const MCOperandInfo &MCOI = MCID.OpInfo[MONum];
+    if (!MO->isReg())
+      report("Explicit definition must be a register", MO, MONum);
+    else if (!MO->isDef() && !MCOI.isOptionalDef())
+      report("Explicit definition marked as use", MO, MONum);
+    else if (MO->isImplicit())
+      report("Explicit definition marked as implicit", MO, MONum);
+  } else if (MONum < MCID.getNumOperands()) {
+    const MCOperandInfo &MCOI = MCID.OpInfo[MONum];
+    // Don't check if it's the last operand in a variadic instruction. See,
+    // e.g., LDM_RET in the arm back end.
+    if (MO->isReg() &&
+        !(MI->isVariadic() && MONum == MCID.getNumOperands()-1)) {
+      if (MO->isDef() && !MCOI.isOptionalDef())
+          report("Explicit operand marked as def", MO, MONum);
+      if (MO->isImplicit())
+        report("Explicit operand marked as implicit", MO, MONum);
+    }
+
+    int TiedTo = MCID.getOperandConstraint(MONum, MCOI::TIED_TO);
+    if (TiedTo != -1) {
+      if (!MO->isReg())
+        report("Tied use must be a register", MO, MONum);
+      else if (!MO->isTied())
+        report("Operand should be tied", MO, MONum);
+      else if (unsigned(TiedTo) != MI->findTiedOperandIdx(MONum))
+        report("Tied def doesn't match MCInstrDesc", MO, MONum);
+    } else if (MO->isReg() && MO->isTied())
+      report("Explicit operand should not be tied", MO, MONum);
+  } else {
+    // ARM adds %reg0 operands to indicate predicates. We'll allow that.
+    if (MO->isReg() && !MO->isImplicit() && !MI->isVariadic() && MO->getReg())
+      report("Extra explicit operand on non-variadic instruction", MO, MONum);
+  }
+
+  switch (MO->getType()) {
+  case MachineOperand::MO_Register: {
+    const unsigned Reg = MO->getReg();
+    if (!Reg)
+      return;
+    if (MRI->tracksLiveness() && !MI->isDebugValue())
+      checkLiveness(MO, MONum);
+
+    // Verify the consistency of tied operands.
+    if (MO->isTied()) {
+      unsigned OtherIdx = MI->findTiedOperandIdx(MONum);
+      const MachineOperand &OtherMO = MI->getOperand(OtherIdx);
+      if (!OtherMO.isReg())
+        report("Must be tied to a register", MO, MONum);
+      if (!OtherMO.isTied())
+        report("Missing tie flags on tied operand", MO, MONum);
+      if (MI->findTiedOperandIdx(OtherIdx) != MONum)
+        report("Inconsistent tie links", MO, MONum);
+      if (MONum < MCID.getNumDefs()) {
+        if (OtherIdx < MCID.getNumOperands()) {
+          if (-1 == MCID.getOperandConstraint(OtherIdx, MCOI::TIED_TO))
+            report("Explicit def tied to explicit use without tie constraint",
+                   MO, MONum);
+        } else {
+          if (!OtherMO.isImplicit())
+            report("Explicit def should be tied to implicit use", MO, MONum);
+        }
+      }
+    }
+
+    // Verify two-address constraints after leaving SSA form.
+    unsigned DefIdx;
+    if (!MRI->isSSA() && MO->isUse() &&
+        MI->isRegTiedToDefOperand(MONum, &DefIdx) &&
+        Reg != MI->getOperand(DefIdx).getReg())
+      report("Two-address instruction operands must be identical", MO, MONum);
+
+    // Check register classes.
+    if (MONum < MCID.getNumOperands() && !MO->isImplicit()) {
+      unsigned SubIdx = MO->getSubReg();
+
+      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        if (SubIdx) {
+          report("Illegal subregister index for physical register", MO, MONum);
+          return;
+        }
+        if (const TargetRegisterClass *DRC =
+              TII->getRegClass(MCID, MONum, TRI, *MF)) {
+          if (!DRC->contains(Reg)) {
+            report("Illegal physical register for instruction", MO, MONum);
+            *OS << TRI->getName(Reg) << " is not a "
+                << DRC->getName() << " register.\n";
+          }
+        }
+      } else {
+        // Virtual register.
+        const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+        if (SubIdx) {
+          const TargetRegisterClass *SRC =
+            TRI->getSubClassWithSubReg(RC, SubIdx);
+          if (!SRC) {
+            report("Invalid subregister index for virtual register", MO, MONum);
+            *OS << "Register class " << RC->getName()
+                << " does not support subreg index " << SubIdx << "\n";
+            return;
+          }
+          if (RC != SRC) {
+            report("Invalid register class for subregister index", MO, MONum);
+            *OS << "Register class " << RC->getName()
+                << " does not fully support subreg index " << SubIdx << "\n";
+            return;
+          }
+        }
+        if (const TargetRegisterClass *DRC =
+              TII->getRegClass(MCID, MONum, TRI, *MF)) {
+          if (SubIdx) {
+            const TargetRegisterClass *SuperRC =
+              TRI->getLargestLegalSuperClass(RC);
+            if (!SuperRC) {
+              report("No largest legal super class exists.", MO, MONum);
+              return;
+            }
+            DRC = TRI->getMatchingSuperRegClass(SuperRC, DRC, SubIdx);
+            if (!DRC) {
+              report("No matching super-reg register class.", MO, MONum);
+              return;
+            }
+          }
+          if (!RC->hasSuperClassEq(DRC)) {
+            report("Illegal virtual register for instruction", MO, MONum);
+            *OS << "Expected a " << DRC->getName() << " register, but got a "
+                << RC->getName() << " register\n";
+          }
+        }
+      }
+    }
+    break;
+  }
+
+  case MachineOperand::MO_RegisterMask:
+    regMasks.push_back(MO->getRegMask());
+    break;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    if (MI->isPHI() && !MO->getMBB()->isSuccessor(MI->getParent()))
+      report("PHI operand is not in the CFG", MO, MONum);
+    break;
+
+  case MachineOperand::MO_FrameIndex:
+    if (LiveStks && LiveStks->hasInterval(MO->getIndex()) &&
+        LiveInts && !LiveInts->isNotInMIMap(MI)) {
+      LiveInterval &LI = LiveStks->getInterval(MO->getIndex());
+      SlotIndex Idx = LiveInts->getInstructionIndex(MI);
+      if (MI->mayLoad() && !LI.liveAt(Idx.getRegSlot(true))) {
+        report("Instruction loads from dead spill slot", MO, MONum);
+        *OS << "Live stack: " << LI << '\n';
+      }
+      if (MI->mayStore() && !LI.liveAt(Idx.getRegSlot())) {
+        report("Instruction stores to dead spill slot", MO, MONum);
+        *OS << "Live stack: " << LI << '\n';
+      }
+    }
+    break;
+
+  default:
+    break;
+  }
+}
+
+void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
+  const MachineInstr *MI = MO->getParent();
+  const unsigned Reg = MO->getReg();
+
+  // Both use and def operands can read a register.
+  if (MO->readsReg()) {
+    regsLiveInButUnused.erase(Reg);
+
+    if (MO->isKill())
+      addRegWithSubRegs(regsKilled, Reg);
+
+    // Check that LiveVars knows this kill.
+    if (LiveVars && TargetRegisterInfo::isVirtualRegister(Reg) &&
+        MO->isKill()) {
+      LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg);
+      if (std::find(VI.Kills.begin(), VI.Kills.end(), MI) == VI.Kills.end())
+        report("Kill missing from LiveVariables", MO, MONum);
+    }
+
+    // Check LiveInts liveness and kill.
+    if (LiveInts && !LiveInts->isNotInMIMap(MI)) {
+      SlotIndex UseIdx = LiveInts->getInstructionIndex(MI);
+      // Check the cached regunit intervals.
+      if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) {
+        for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+          if (const LiveInterval *LI = LiveInts->getCachedRegUnit(*Units)) {
+            LiveRangeQuery LRQ(*LI, UseIdx);
+            if (!LRQ.valueIn()) {
+              report("No live range at use", MO, MONum);
+              *OS << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
+                  << ' ' << *LI << '\n';
+            }
+            if (MO->isKill() && !LRQ.isKill()) {
+              report("Live range continues after kill flag", MO, MONum);
+              *OS << PrintRegUnit(*Units, TRI) << ' ' << *LI << '\n';
+            }
+          }
+        }
+      }
+
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        if (LiveInts->hasInterval(Reg)) {
+          // This is a virtual register interval.
+          const LiveInterval &LI = LiveInts->getInterval(Reg);
+          LiveRangeQuery LRQ(LI, UseIdx);
+          if (!LRQ.valueIn()) {
+            report("No live range at use", MO, MONum);
+            *OS << UseIdx << " is not live in " << LI << '\n';
+          }
+          // Check for extra kill flags.
+          // Note that we allow missing kill flags for now.
+          if (MO->isKill() && !LRQ.isKill()) {
+            report("Live range continues after kill flag", MO, MONum);
+            *OS << "Live range: " << LI << '\n';
+          }
+        } else {
+          report("Virtual register has no live interval", MO, MONum);
+        }
+      }
+    }
+
+    // Use of a dead register.
+    if (!regsLive.count(Reg)) {
+      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        // Reserved registers may be used even when 'dead'.
+        if (!isReserved(Reg))
+          report("Using an undefined physical register", MO, MONum);
+      } else if (MRI->def_empty(Reg)) {
+        report("Reading virtual register without a def", MO, MONum);
+      } else {
+        BBInfo &MInfo = MBBInfoMap[MI->getParent()];
+        // We don't know which virtual registers are live in, so only complain
+        // if vreg was killed in this MBB. Otherwise keep track of vregs that
+        // must be live in. PHI instructions are handled separately.
+        if (MInfo.regsKilled.count(Reg))
+          report("Using a killed virtual register", MO, MONum);
+        else if (!MI->isPHI())
+          MInfo.vregsLiveIn.insert(std::make_pair(Reg, MI));
+      }
+    }
+  }
+
+  if (MO->isDef()) {
+    // Register defined.
+    // TODO: verify that earlyclobber ops are not used.
+    if (MO->isDead())
+      addRegWithSubRegs(regsDead, Reg);
+    else
+      addRegWithSubRegs(regsDefined, Reg);
+
+    // Verify SSA form.
+    if (MRI->isSSA() && TargetRegisterInfo::isVirtualRegister(Reg) &&
+        llvm::next(MRI->def_begin(Reg)) != MRI->def_end())
+      report("Multiple virtual register defs in SSA form", MO, MONum);
+
+    // Check LiveInts for a live range, but only for virtual registers.
+    if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) &&
+        !LiveInts->isNotInMIMap(MI)) {
+      SlotIndex DefIdx = LiveInts->getInstructionIndex(MI);
+      DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber());
+      if (LiveInts->hasInterval(Reg)) {
+        const LiveInterval &LI = LiveInts->getInterval(Reg);
+        if (const VNInfo *VNI = LI.getVNInfoAt(DefIdx)) {
+          assert(VNI && "NULL valno is not allowed");
+          if (VNI->def != DefIdx) {
+            report("Inconsistent valno->def", MO, MONum);
+            *OS << "Valno " << VNI->id << " is not defined at "
+              << DefIdx << " in " << LI << '\n';
+          }
+        } else {
+          report("No live range at def", MO, MONum);
+          *OS << DefIdx << " is not live in " << LI << '\n';
+        }
+      } else {
+        report("Virtual register has no Live interval", MO, MONum);
+      }
+    }
+  }
+}
+
+void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) {
+}
+
+// This function gets called after visiting all instructions in a bundle. The
+// argument points to the bundle header.
+// Normal stand-alone instructions are also considered 'bundles', and this
+// function is called for all of them.
+void MachineVerifier::visitMachineBundleAfter(const MachineInstr *MI) {
+  BBInfo &MInfo = MBBInfoMap[MI->getParent()];
+  set_union(MInfo.regsKilled, regsKilled);
+  set_subtract(regsLive, regsKilled); regsKilled.clear();
+  // Kill any masked registers.
+  while (!regMasks.empty()) {
+    const uint32_t *Mask = regMasks.pop_back_val();
+    for (RegSet::iterator I = regsLive.begin(), E = regsLive.end(); I != E; ++I)
+      if (TargetRegisterInfo::isPhysicalRegister(*I) &&
+          MachineOperand::clobbersPhysReg(Mask, *I))
+        regsDead.push_back(*I);
+  }
+  set_subtract(regsLive, regsDead);   regsDead.clear();
+  set_union(regsLive, regsDefined);   regsDefined.clear();
+}
+
+void
+MachineVerifier::visitMachineBasicBlockAfter(const MachineBasicBlock *MBB) {
+  MBBInfoMap[MBB].regsLiveOut = regsLive;
+  regsLive.clear();
+
+  if (Indexes) {
+    SlotIndex stop = Indexes->getMBBEndIdx(MBB);
+    if (!(stop > lastIndex)) {
+      report("Block ends before last instruction index", MBB);
+      *OS << "Block ends at " << stop
+          << " last instruction was at " << lastIndex << '\n';
+    }
+    lastIndex = stop;
+  }
+}
+
+// Calculate the largest possible vregsPassed sets. These are the registers that
+// can pass through an MBB live, but may not be live every time. It is assumed
+// that all vregsPassed sets are empty before the call.
+void MachineVerifier::calcRegsPassed() {
+  // First push live-out regs to successors' vregsPassed. Remember the MBBs that
+  // have any vregsPassed.
+  SmallPtrSet<const MachineBasicBlock*, 8> todo;
+  for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+       MFI != MFE; ++MFI) {
+    const MachineBasicBlock &MBB(*MFI);
+    BBInfo &MInfo = MBBInfoMap[&MBB];
+    if (!MInfo.reachable)
+      continue;
+    for (MachineBasicBlock::const_succ_iterator SuI = MBB.succ_begin(),
+           SuE = MBB.succ_end(); SuI != SuE; ++SuI) {
+      BBInfo &SInfo = MBBInfoMap[*SuI];
+      if (SInfo.addPassed(MInfo.regsLiveOut))
+        todo.insert(*SuI);
+    }
+  }
+
+  // Iteratively push vregsPassed to successors. This will converge to the same
+  // final state regardless of DenseSet iteration order.
+  while (!todo.empty()) {
+    const MachineBasicBlock *MBB = *todo.begin();
+    todo.erase(MBB);
+    BBInfo &MInfo = MBBInfoMap[MBB];
+    for (MachineBasicBlock::const_succ_iterator SuI = MBB->succ_begin(),
+           SuE = MBB->succ_end(); SuI != SuE; ++SuI) {
+      if (*SuI == MBB)
+        continue;
+      BBInfo &SInfo = MBBInfoMap[*SuI];
+      if (SInfo.addPassed(MInfo.vregsPassed))
+        todo.insert(*SuI);
+    }
+  }
+}
+
+// Calculate the set of virtual registers that must be passed through each basic
+// block in order to satisfy the requirements of successor blocks. This is very
+// similar to calcRegsPassed, only backwards.
+void MachineVerifier::calcRegsRequired() {
+  // First push live-in regs to predecessors' vregsRequired.
+  SmallPtrSet<const MachineBasicBlock*, 8> todo;
+  for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+       MFI != MFE; ++MFI) {
+    const MachineBasicBlock &MBB(*MFI);
+    BBInfo &MInfo = MBBInfoMap[&MBB];
+    for (MachineBasicBlock::const_pred_iterator PrI = MBB.pred_begin(),
+           PrE = MBB.pred_end(); PrI != PrE; ++PrI) {
+      BBInfo &PInfo = MBBInfoMap[*PrI];
+      if (PInfo.addRequired(MInfo.vregsLiveIn))
+        todo.insert(*PrI);
+    }
+  }
+
+  // Iteratively push vregsRequired to predecessors. This will converge to the
+  // same final state regardless of DenseSet iteration order.
+  while (!todo.empty()) {
+    const MachineBasicBlock *MBB = *todo.begin();
+    todo.erase(MBB);
+    BBInfo &MInfo = MBBInfoMap[MBB];
+    for (MachineBasicBlock::const_pred_iterator PrI = MBB->pred_begin(),
+           PrE = MBB->pred_end(); PrI != PrE; ++PrI) {
+      if (*PrI == MBB)
+        continue;
+      BBInfo &SInfo = MBBInfoMap[*PrI];
+      if (SInfo.addRequired(MInfo.vregsRequired))
+        todo.insert(*PrI);
+    }
+  }
+}
+
+// Check PHI instructions at the beginning of MBB. It is assumed that
+// calcRegsPassed has been run so BBInfo::isLiveOut is valid.
+void MachineVerifier::checkPHIOps(const MachineBasicBlock *MBB) {
+  SmallPtrSet<const MachineBasicBlock*, 8> seen;
+  for (MachineBasicBlock::const_iterator BBI = MBB->begin(), BBE = MBB->end();
+       BBI != BBE && BBI->isPHI(); ++BBI) {
+    seen.clear();
+
+    for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) {
+      unsigned Reg = BBI->getOperand(i).getReg();
+      const MachineBasicBlock *Pre = BBI->getOperand(i + 1).getMBB();
+      if (!Pre->isSuccessor(MBB))
+        continue;
+      seen.insert(Pre);
+      BBInfo &PrInfo = MBBInfoMap[Pre];
+      if (PrInfo.reachable && !PrInfo.isLiveOut(Reg))
+        report("PHI operand is not live-out from predecessor",
+               &BBI->getOperand(i), i);
+    }
+
+    // Did we see all predecessors?
+    for (MachineBasicBlock::const_pred_iterator PrI = MBB->pred_begin(),
+           PrE = MBB->pred_end(); PrI != PrE; ++PrI) {
+      if (!seen.count(*PrI)) {
+        report("Missing PHI operand", BBI);
+        *OS << "BB#" << (*PrI)->getNumber()
+            << " is a predecessor according to the CFG.\n";
+      }
+    }
+  }
+}
+
+void MachineVerifier::visitMachineFunctionAfter() {
+  calcRegsPassed();
+
+  for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+       MFI != MFE; ++MFI) {
+    BBInfo &MInfo = MBBInfoMap[MFI];
+
+    // Skip unreachable MBBs.
+    if (!MInfo.reachable)
+      continue;
+
+    checkPHIOps(MFI);
+  }
+
+  // Now check liveness info if available
+  calcRegsRequired();
+
+  // Check for killed virtual registers that should be live out.
+  for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+       MFI != MFE; ++MFI) {
+    BBInfo &MInfo = MBBInfoMap[MFI];
+    for (RegSet::iterator
+         I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E;
+         ++I)
+      if (MInfo.regsKilled.count(*I)) {
+        report("Virtual register killed in block, but needed live out.", MFI);
+        *OS << "Virtual register " << PrintReg(*I)
+            << " is used after the block.\n";
+      }
+  }
+
+  if (!MF->empty()) {
+    BBInfo &MInfo = MBBInfoMap[&MF->front()];
+    for (RegSet::iterator
+         I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E;
+         ++I)
+      report("Virtual register def doesn't dominate all uses.",
+             MRI->getVRegDef(*I));
+  }
+
+  if (LiveVars)
+    verifyLiveVariables();
+  if (LiveInts)
+    verifyLiveIntervals();
+}
+
+void MachineVerifier::verifyLiveVariables() {
+  assert(LiveVars && "Don't call verifyLiveVariables without LiveVars");
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg);
+    for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+         MFI != MFE; ++MFI) {
+      BBInfo &MInfo = MBBInfoMap[MFI];
+
+      // Our vregsRequired should be identical to LiveVariables' AliveBlocks
+      if (MInfo.vregsRequired.count(Reg)) {
+        if (!VI.AliveBlocks.test(MFI->getNumber())) {
+          report("LiveVariables: Block missing from AliveBlocks", MFI);
+          *OS << "Virtual register " << PrintReg(Reg)
+              << " must be live through the block.\n";
+        }
+      } else {
+        if (VI.AliveBlocks.test(MFI->getNumber())) {
+          report("LiveVariables: Block should not be in AliveBlocks", MFI);
+          *OS << "Virtual register " << PrintReg(Reg)
+              << " is not needed live through the block.\n";
+        }
+      }
+    }
+  }
+}
+
+void MachineVerifier::verifyLiveIntervals() {
+  assert(LiveInts && "Don't call verifyLiveIntervals without LiveInts");
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+    // Spilling and splitting may leave unused registers around. Skip them.
+    if (MRI->reg_nodbg_empty(Reg))
+      continue;
+
+    if (!LiveInts->hasInterval(Reg)) {
+      report("Missing live interval for virtual register", MF);
+      *OS << PrintReg(Reg, TRI) << " still has defs or uses\n";
+      continue;
+    }
+
+    const LiveInterval &LI = LiveInts->getInterval(Reg);
+    assert(Reg == LI.reg && "Invalid reg to interval mapping");
+    verifyLiveInterval(LI);
+  }
+
+  // Verify all the cached regunit intervals.
+  for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
+    if (const LiveInterval *LI = LiveInts->getCachedRegUnit(i))
+      verifyLiveInterval(*LI);
+}
+
+void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
+                                              VNInfo *VNI) {
+  if (VNI->isUnused())
+    return;
+
+  const VNInfo *DefVNI = LI.getVNInfoAt(VNI->def);
+
+  if (!DefVNI) {
+    report("Valno not live at def and not marked unused", MF, LI);
+    *OS << "Valno #" << VNI->id << '\n';
+    return;
+  }
+
+  if (DefVNI != VNI) {
+    report("Live range at def has different valno", MF, LI);
+    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+        << " where valno #" << DefVNI->id << " is live\n";
+    return;
+  }
+
+  const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
+  if (!MBB) {
+    report("Invalid definition index", MF, LI);
+    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+        << " in " << LI << '\n';
+    return;
+  }
+
+  if (VNI->isPHIDef()) {
+    if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
+      report("PHIDef value is not defined at MBB start", MBB, LI);
+      *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+          << ", not at the beginning of BB#" << MBB->getNumber() << '\n';
+    }
+    return;
+  }
+
+  // Non-PHI def.
+  const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
+  if (!MI) {
+    report("No instruction at def index", MBB, LI);
+    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+    return;
+  }
+
+  bool hasDef = false;
+  bool isEarlyClobber = false;
+  for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
+    if (!MOI->isReg() || !MOI->isDef())
+      continue;
+    if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
+      if (MOI->getReg() != LI.reg)
+        continue;
+    } else {
+      if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
+          !TRI->hasRegUnit(MOI->getReg(), LI.reg))
+        continue;
+    }
+    hasDef = true;
+    if (MOI->isEarlyClobber())
+      isEarlyClobber = true;
+  }
+
+  if (!hasDef) {
+    report("Defining instruction does not modify register", MI);
+    *OS << "Valno #" << VNI->id << " in " << LI << '\n';
+  }
+
+  // Early clobber defs begin at USE slots, but other defs must begin at
+  // DEF slots.
+  if (isEarlyClobber) {
+    if (!VNI->def.isEarlyClobber()) {
+      report("Early clobber def must be at an early-clobber slot", MBB, LI);
+      *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+    }
+  } else if (!VNI->def.isRegister()) {
+    report("Non-PHI, non-early clobber def must be at a register slot",
+           MBB, LI);
+    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+  }
+}
+
+void
+MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
+                                           LiveInterval::const_iterator I) {
+  const VNInfo *VNI = I->valno;
+  assert(VNI && "Live range has no valno");
+
+  if (VNI->id >= LI.getNumValNums() || VNI != LI.getValNumInfo(VNI->id)) {
+    report("Foreign valno in live range", MF, LI);
+    *OS << *I << " has a bad valno\n";
+  }
+
+  if (VNI->isUnused()) {
+    report("Live range valno is marked unused", MF, LI);
+    *OS << *I << '\n';
+  }
+
+  const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(I->start);
+  if (!MBB) {
+    report("Bad start of live segment, no basic block", MF, LI);
+    *OS << *I << '\n';
+    return;
+  }
+  SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
+  if (I->start != MBBStartIdx && I->start != VNI->def) {
+    report("Live segment must begin at MBB entry or valno def", MBB, LI);
+    *OS << *I << '\n';
+  }
+
+  const MachineBasicBlock *EndMBB =
+    LiveInts->getMBBFromIndex(I->end.getPrevSlot());
+  if (!EndMBB) {
+    report("Bad end of live segment, no basic block", MF, LI);
+    *OS << *I << '\n';
+    return;
+  }
+
+  // No more checks for live-out segments.
+  if (I->end == LiveInts->getMBBEndIdx(EndMBB))
+    return;
+
+  // RegUnit intervals are allowed dead phis.
+  if (!TargetRegisterInfo::isVirtualRegister(LI.reg) && VNI->isPHIDef() &&
+      I->start == VNI->def && I->end == VNI->def.getDeadSlot())
+    return;
+
+  // The live segment is ending inside EndMBB
+  const MachineInstr *MI =
+    LiveInts->getInstructionFromIndex(I->end.getPrevSlot());
+  if (!MI) {
+    report("Live segment doesn't end at a valid instruction", EndMBB, LI);
+    *OS << *I << '\n';
+    return;
+  }
+
+  // The block slot must refer to a basic block boundary.
+  if (I->end.isBlock()) {
+    report("Live segment ends at B slot of an instruction", EndMBB, LI);
+    *OS << *I << '\n';
+  }
+
+  if (I->end.isDead()) {
+    // Segment ends on the dead slot.
+    // That means there must be a dead def.
+    if (!SlotIndex::isSameInstr(I->start, I->end)) {
+      report("Live segment ending at dead slot spans instructions", EndMBB, LI);
+      *OS << *I << '\n';
+    }
+  }
+
+  // A live segment can only end at an early-clobber slot if it is being
+  // redefined by an early-clobber def.
+  if (I->end.isEarlyClobber()) {
+    if (I+1 == LI.end() || (I+1)->start != I->end) {
+      report("Live segment ending at early clobber slot must be "
+             "redefined by an EC def in the same instruction", EndMBB, LI);
+      *OS << *I << '\n';
+    }
+  }
+
+  // The following checks only apply to virtual registers. Physreg liveness
+  // is too weird to check.
+  if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
+    // A live range can end with either a redefinition, a kill flag on a
+    // use, or a dead flag on a def.
+    bool hasRead = false;
+    bool hasDeadDef = false;
+    for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
+      if (!MOI->isReg() || MOI->getReg() != LI.reg)
+        continue;
+      if (MOI->readsReg())
+        hasRead = true;
+      if (MOI->isDef() && MOI->isDead())
+        hasDeadDef = true;
+    }
+
+    if (I->end.isDead()) {
+      if (!hasDeadDef) {
+        report("Instruction doesn't have a dead def operand", MI);
+        I->print(*OS);
+        *OS << " in " << LI << '\n';
+      }
+    } else {
+      if (!hasRead) {
+        report("Instruction ending live range doesn't read the register", MI);
+        *OS << *I << " in " << LI << '\n';
+      }
+    }
+  }
+
+  // Now check all the basic blocks in this live segment.
+  MachineFunction::const_iterator MFI = MBB;
+  // Is this live range the beginning of a non-PHIDef VN?
+  if (I->start == VNI->def && !VNI->isPHIDef()) {
+    // Not live-in to any blocks.
+    if (MBB == EndMBB)
+      return;
+    // Skip this block.
+    ++MFI;
+  }
+  for (;;) {
+    assert(LiveInts->isLiveInToMBB(LI, MFI));
+    // We don't know how to track physregs into a landing pad.
+    if (!TargetRegisterInfo::isVirtualRegister(LI.reg) &&
+        MFI->isLandingPad()) {
+      if (&*MFI == EndMBB)
+        break;
+      ++MFI;
+      continue;
+    }
+
+    // Is VNI a PHI-def in the current block?
+    bool IsPHI = VNI->isPHIDef() &&
+      VNI->def == LiveInts->getMBBStartIdx(MFI);
+
+    // Check that VNI is live-out of all predecessors.
+    for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(),
+         PE = MFI->pred_end(); PI != PE; ++PI) {
+      SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
+      const VNInfo *PVNI = LI.getVNInfoBefore(PEnd);
+
+      // All predecessors must have a live-out value.
+      if (!PVNI) {
+        report("Register not marked live out of predecessor", *PI, LI);
+        *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
+            << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before "
+            << PEnd << '\n';
+        continue;
+      }
+
+      // Only PHI-defs can take different predecessor values.
+      if (!IsPHI && PVNI != VNI) {
+        report("Different value live out of predecessor", *PI, LI);
+        *OS << "Valno #" << PVNI->id << " live out of BB#"
+            << (*PI)->getNumber() << '@' << PEnd
+            << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber()
+            << '@' << LiveInts->getMBBStartIdx(MFI) << '\n';
+      }
+    }
+    if (&*MFI == EndMBB)
+      break;
+    ++MFI;
+  }
+}
+
+void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
+  for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+       I!=E; ++I)
+    verifyLiveIntervalValue(LI, *I);
+
+  for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I!=E; ++I)
+    verifyLiveIntervalSegment(LI, I);
+
+  // Check the LI only has one connected component.
+  if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
+    ConnectedVNInfoEqClasses ConEQ(*LiveInts);
+    unsigned NumComp = ConEQ.Classify(&LI);
+    if (NumComp > 1) {
+      report("Multiple connected components in live interval", MF, LI);
+      for (unsigned comp = 0; comp != NumComp; ++comp) {
+        *OS << comp << ": valnos";
+        for (LiveInterval::const_vni_iterator I = LI.vni_begin(),
+             E = LI.vni_end(); I!=E; ++I)
+          if (comp == ConEQ.getEqClass(*I))
+            *OS << ' ' << (*I)->id;
+        *OS << '\n';
+      }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/OcamlGC.cpp b/contrib/llvm/lib/CodeGen/OcamlGC.cpp
new file mode 100644
index 000000000000..48db200c513c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/OcamlGC.cpp
@@ -0,0 +1,37 @@
+//===-- OcamlGC.cpp - Ocaml frametable GC strategy ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements lowering for the llvm.gc* intrinsics compatible with
+// Objective Caml 3.10.0, which uses a liveness-accurate static stack map.
+//
+// The frametable emitter is in OcamlGCPrinter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/CodeGen/GCStrategy.h"
+
+using namespace llvm;
+
+namespace {
+  class OcamlGC : public GCStrategy {
+  public:
+    OcamlGC();
+  };
+}
+
+static GCRegistry::Add<OcamlGC>
+X("ocaml", "ocaml 3.10-compatible GC");
+
+void llvm::linkOcamlGC() { }
+
+OcamlGC::OcamlGC() {
+  NeededSafePoints = 1 << GC::PostCall;
+  UsesMetadata = true;
+}
diff --git a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
new file mode 100644
index 000000000000..3982612e8c11
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
@@ -0,0 +1,193 @@
+//===-- OptimizePHIs.cpp - Optimize machine instruction PHIs --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes machine instruction PHIs to take advantage of
+// opportunities created during DAG legalization.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "phi-opt"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumPHICycles, "Number of PHI cycles replaced");
+STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles");
+
+namespace {
+  class OptimizePHIs : public MachineFunctionPass {
+    MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII;
+
+  public:
+    static char ID; // Pass identification
+    OptimizePHIs() : MachineFunctionPass(ID) {
+      initializeOptimizePHIsPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    typedef SmallPtrSet<MachineInstr*, 16> InstrSet;
+    typedef SmallPtrSetIterator<MachineInstr*> InstrSetIterator;
+
+    bool IsSingleValuePHICycle(MachineInstr *MI, unsigned &SingleValReg,
+                               InstrSet &PHIsInCycle);
+    bool IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle);
+    bool OptimizeBB(MachineBasicBlock &MBB);
+  };
+}
+
+char OptimizePHIs::ID = 0;
+char &llvm::OptimizePHIsID = OptimizePHIs::ID;
+INITIALIZE_PASS(OptimizePHIs, "opt-phis",
+                "Optimize machine instruction PHIs", false, false)
+
+bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) {
+  MRI = &Fn.getRegInfo();
+  TII = Fn.getTarget().getInstrInfo();
+
+  // Find dead PHI cycles and PHI cycles that can be replaced by a single
+  // value.  InstCombine does these optimizations, but DAG legalization may
+  // introduce new opportunities, e.g., when i64 values are split up for
+  // 32-bit targets.
+  bool Changed = false;
+  for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I)
+    Changed |= OptimizeBB(*I);
+
+  return Changed;
+}
+
+/// IsSingleValuePHICycle - Check if MI is a PHI where all the source operands
+/// are copies of SingleValReg, possibly via copies through other PHIs.  If
+/// SingleValReg is zero on entry, it is set to the register with the single
+/// non-copy value.  PHIsInCycle is a set used to keep track of the PHIs that
+/// have been scanned.
+bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
+                                         unsigned &SingleValReg,
+                                         InstrSet &PHIsInCycle) {
+  assert(MI->isPHI() && "IsSingleValuePHICycle expects a PHI instruction");
+  unsigned DstReg = MI->getOperand(0).getReg();
+
+  // See if we already saw this register.
+  if (!PHIsInCycle.insert(MI))
+    return true;
+
+  // Don't scan crazily complex things.
+  if (PHIsInCycle.size() == 16)
+    return false;
+
+  // Scan the PHI operands.
+  for (unsigned i = 1; i != MI->getNumOperands(); i += 2) {
+    unsigned SrcReg = MI->getOperand(i).getReg();
+    if (SrcReg == DstReg)
+      continue;
+    MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+
+    // Skip over register-to-register moves.
+    if (SrcMI && SrcMI->isCopy() &&
+        !SrcMI->getOperand(0).getSubReg() &&
+        !SrcMI->getOperand(1).getSubReg() &&
+        TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg()))
+      SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg());
+    if (!SrcMI)
+      return false;
+
+    if (SrcMI->isPHI()) {
+      if (!IsSingleValuePHICycle(SrcMI, SingleValReg, PHIsInCycle))
+        return false;
+    } else {
+      // Fail if there is more than one non-phi/non-move register.
+      if (SingleValReg != 0)
+        return false;
+      SingleValReg = SrcReg;
+    }
+  }
+  return true;
+}
+
+/// IsDeadPHICycle - Check if the register defined by a PHI is only used by
+/// other PHIs in a cycle.
+bool OptimizePHIs::IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle) {
+  assert(MI->isPHI() && "IsDeadPHICycle expects a PHI instruction");
+  unsigned DstReg = MI->getOperand(0).getReg();
+  assert(TargetRegisterInfo::isVirtualRegister(DstReg) &&
+         "PHI destination is not a virtual register");
+
+  // See if we already saw this register.
+  if (!PHIsInCycle.insert(MI))
+    return true;
+
+  // Don't scan crazily complex things.
+  if (PHIsInCycle.size() == 16)
+    return false;
+
+  for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DstReg),
+         E = MRI->use_end(); I != E; ++I) {
+    MachineInstr *UseMI = &*I;
+    if (!UseMI->isPHI() || !IsDeadPHICycle(UseMI, PHIsInCycle))
+      return false;
+  }
+
+  return true;
+}
+
+/// OptimizeBB - Remove dead PHI cycles and PHI cycles that can be replaced by
+/// a single value.
+bool OptimizePHIs::OptimizeBB(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator
+         MII = MBB.begin(), E = MBB.end(); MII != E; ) {
+    MachineInstr *MI = &*MII++;
+    if (!MI->isPHI())
+      break;
+
+    // Check for single-value PHI cycles.
+    unsigned SingleValReg = 0;
+    InstrSet PHIsInCycle;
+    if (IsSingleValuePHICycle(MI, SingleValReg, PHIsInCycle) &&
+        SingleValReg != 0) {
+      unsigned OldReg = MI->getOperand(0).getReg();
+      if (!MRI->constrainRegClass(SingleValReg, MRI->getRegClass(OldReg)))
+        continue;
+
+      MRI->replaceRegWith(OldReg, SingleValReg);
+      MI->eraseFromParent();
+      ++NumPHICycles;
+      Changed = true;
+      continue;
+    }
+
+    // Check for dead PHI cycles.
+    PHIsInCycle.clear();
+    if (IsDeadPHICycle(MI, PHIsInCycle)) {
+      for (InstrSetIterator PI = PHIsInCycle.begin(), PE = PHIsInCycle.end();
+           PI != PE; ++PI) {
+        MachineInstr *PhiMI = *PI;
+        if (&*MII == PhiMI)
+          ++MII;
+        PhiMI->eraseFromParent();
+      }
+      ++NumDeadPHICycles;
+      Changed = true;
+    }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/PHIElimination.cpp b/contrib/llvm/lib/CodeGen/PHIElimination.cpp
new file mode 100644
index 000000000000..5584708eae36
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PHIElimination.cpp
@@ -0,0 +1,644 @@
+//===-- PhiElimination.cpp - Eliminate PHI nodes by inserting copies ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates machine instruction PHI nodes by inserting copy
+// instructions.  This destroys SSA information, but is the desired input for
+// some register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "phielim"
+#include "llvm/CodeGen/Passes.h"
+#include "PHIEliminationUtils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+static cl::opt<bool>
+DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false),
+                     cl::Hidden, cl::desc("Disable critical edge splitting "
+                                          "during PHI elimination"));
+
+static cl::opt<bool>
+SplitAllCriticalEdges("phi-elim-split-all-critical-edges", cl::init(false),
+                      cl::Hidden, cl::desc("Split all critical edges during "
+                                           "PHI elimination"));
+
+namespace {
+  class PHIElimination : public MachineFunctionPass {
+    MachineRegisterInfo *MRI; // Machine register information
+    LiveVariables *LV;
+    LiveIntervals *LIS;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    PHIElimination() : MachineFunctionPass(ID) {
+      initializePHIEliminationPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  private:
+    /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions
+    /// in predecessor basic blocks.
+    ///
+    bool EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB);
+    void LowerPHINode(MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator AfterPHIsIt);
+
+    /// analyzePHINodes - Gather information about the PHI nodes in
+    /// here. In particular, we want to map the number of uses of a virtual
+    /// register which is used in a PHI node. We map that to the BB the
+    /// vreg is coming from. This is used later to determine when the vreg
+    /// is killed in the BB.
+    ///
+    void analyzePHINodes(const MachineFunction& Fn);
+
+    /// Split critical edges where necessary for good coalescer performance.
+    bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
+                       MachineLoopInfo *MLI);
+
+    // These functions are temporary abstractions around LiveVariables and
+    // LiveIntervals, so they can go away when LiveVariables does.
+    bool isLiveIn(unsigned Reg, MachineBasicBlock *MBB);
+    bool isLiveOutPastPHIs(unsigned Reg, MachineBasicBlock *MBB);
+
+    typedef std::pair<unsigned, unsigned> BBVRegPair;
+    typedef DenseMap<BBVRegPair, unsigned> VRegPHIUse;
+
+    VRegPHIUse VRegPHIUseCount;
+
+    // Defs of PHI sources which are implicit_def.
+    SmallPtrSet<MachineInstr*, 4> ImpDefs;
+
+    // Map reusable lowered PHI node -> incoming join register.
+    typedef DenseMap<MachineInstr*, unsigned,
+                     MachineInstrExpressionTrait> LoweredPHIMap;
+    LoweredPHIMap LoweredPHIs;
+  };
+}
+
+STATISTIC(NumLowered, "Number of phis lowered");
+STATISTIC(NumCriticalEdgesSplit, "Number of critical edges split");
+STATISTIC(NumReused, "Number of reused lowered phis");
+
+char PHIElimination::ID = 0;
+char& llvm::PHIEliminationID = PHIElimination::ID;
+
+INITIALIZE_PASS_BEGIN(PHIElimination, "phi-node-elimination",
+                      "Eliminate PHI nodes for register allocation",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_END(PHIElimination, "phi-node-elimination",
+                    "Eliminate PHI nodes for register allocation", false, false)
+
+void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<LiveVariables>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addPreserved<LiveIntervals>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addPreserved<MachineLoopInfo>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
+  MRI = &MF.getRegInfo();
+  LV = getAnalysisIfAvailable<LiveVariables>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+
+  bool Changed = false;
+
+  // This pass takes the function out of SSA form.
+  MRI->leaveSSA();
+
+  // Split critical edges to help the coalescer. This does not yet support
+  // updating LiveIntervals, so we disable it.
+  if (!DisableEdgeSplitting && (LV || LIS)) {
+    MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      Changed |= SplitPHIEdges(MF, *I, MLI);
+  }
+
+  // Populate VRegPHIUseCount
+  analyzePHINodes(MF);
+
+  // Eliminate PHI instructions by inserting copies into predecessor blocks.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+    Changed |= EliminatePHINodes(MF, *I);
+
+  // Remove dead IMPLICIT_DEF instructions.
+  for (SmallPtrSet<MachineInstr*, 4>::iterator I = ImpDefs.begin(),
+         E = ImpDefs.end(); I != E; ++I) {
+    MachineInstr *DefMI = *I;
+    unsigned DefReg = DefMI->getOperand(0).getReg();
+    if (MRI->use_nodbg_empty(DefReg)) {
+      if (LIS)
+        LIS->RemoveMachineInstrFromMaps(DefMI);
+      DefMI->eraseFromParent();
+    }
+  }
+
+  // Clean up the lowered PHI instructions.
+  for (LoweredPHIMap::iterator I = LoweredPHIs.begin(), E = LoweredPHIs.end();
+       I != E; ++I) {
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(I->first);
+    MF.DeleteMachineInstr(I->first);
+  }
+
+  LoweredPHIs.clear();
+  ImpDefs.clear();
+  VRegPHIUseCount.clear();
+
+  return Changed;
+}
+
+/// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions in
+/// predecessor basic blocks.
+///
+bool PHIElimination::EliminatePHINodes(MachineFunction &MF,
+                                             MachineBasicBlock &MBB) {
+  if (MBB.empty() || !MBB.front().isPHI())
+    return false;   // Quick exit for basic blocks without PHIs.
+
+  // Get an iterator to the first instruction after the last PHI node (this may
+  // also be the end of the basic block).
+  MachineBasicBlock::iterator AfterPHIsIt = MBB.SkipPHIsAndLabels(MBB.begin());
+
+  while (MBB.front().isPHI())
+    LowerPHINode(MBB, AfterPHIsIt);
+
+  return true;
+}
+
+/// isImplicitlyDefined - Return true if all defs of VirtReg are implicit-defs.
+/// This includes registers with no defs.
+static bool isImplicitlyDefined(unsigned VirtReg,
+                                const MachineRegisterInfo *MRI) {
+  for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(VirtReg),
+       DE = MRI->def_end(); DI != DE; ++DI)
+    if (!DI->isImplicitDef())
+      return false;
+  return true;
+}
+
+/// isSourceDefinedByImplicitDef - Return true if all sources of the phi node
+/// are implicit_def's.
+static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi,
+                                         const MachineRegisterInfo *MRI) {
+  for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2)
+    if (!isImplicitlyDefined(MPhi->getOperand(i).getReg(), MRI))
+      return false;
+  return true;
+}
+
+
+/// LowerPHINode - Lower the PHI node at the top of the specified block,
+///
+void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator AfterPHIsIt) {
+  ++NumLowered;
+  // Unlink the PHI node from the basic block, but don't delete the PHI yet.
+  MachineInstr *MPhi = MBB.remove(MBB.begin());
+
+  unsigned NumSrcs = (MPhi->getNumOperands() - 1) / 2;
+  unsigned DestReg = MPhi->getOperand(0).getReg();
+  assert(MPhi->getOperand(0).getSubReg() == 0 && "Can't handle sub-reg PHIs");
+  bool isDead = MPhi->getOperand(0).isDead();
+
+  // Create a new register for the incoming PHI arguments.
+  MachineFunction &MF = *MBB.getParent();
+  unsigned IncomingReg = 0;
+  bool reusedIncoming = false;  // Is IncomingReg reused from an earlier PHI?
+
+  // Insert a register to register copy at the top of the current block (but
+  // after any remaining phi nodes) which copies the new incoming register
+  // into the phi node destination.
+  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  if (isSourceDefinedByImplicitDef(MPhi, MRI))
+    // If all sources of a PHI node are implicit_def, just emit an
+    // implicit_def instead of a copy.
+    BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+            TII->get(TargetOpcode::IMPLICIT_DEF), DestReg);
+  else {
+    // Can we reuse an earlier PHI node? This only happens for critical edges,
+    // typically those created by tail duplication.
+    unsigned &entry = LoweredPHIs[MPhi];
+    if (entry) {
+      // An identical PHI node was already lowered. Reuse the incoming register.
+      IncomingReg = entry;
+      reusedIncoming = true;
+      ++NumReused;
+      DEBUG(dbgs() << "Reusing " << PrintReg(IncomingReg) << " for " << *MPhi);
+    } else {
+      const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg);
+      entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC);
+    }
+    BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+            TII->get(TargetOpcode::COPY), DestReg)
+      .addReg(IncomingReg);
+  }
+
+  // Update live variable information if there is any.
+  if (LV) {
+    MachineInstr *PHICopy = prior(AfterPHIsIt);
+
+    if (IncomingReg) {
+      LiveVariables::VarInfo &VI = LV->getVarInfo(IncomingReg);
+
+      // Increment use count of the newly created virtual register.
+      LV->setPHIJoin(IncomingReg);
+
+      // When we are reusing the incoming register, it may already have been
+      // killed in this block. The old kill will also have been inserted at
+      // AfterPHIsIt, so it appears before the current PHICopy.
+      if (reusedIncoming)
+        if (MachineInstr *OldKill = VI.findKill(&MBB)) {
+          DEBUG(dbgs() << "Remove old kill from " << *OldKill);
+          LV->removeVirtualRegisterKilled(IncomingReg, OldKill);
+          DEBUG(MBB.dump());
+        }
+
+      // Add information to LiveVariables to know that the incoming value is
+      // killed.  Note that because the value is defined in several places (once
+      // each for each incoming block), the "def" block and instruction fields
+      // for the VarInfo is not filled in.
+      LV->addVirtualRegisterKilled(IncomingReg, PHICopy);
+    }
+
+    // Since we are going to be deleting the PHI node, if it is the last use of
+    // any registers, or if the value itself is dead, we need to move this
+    // information over to the new copy we just inserted.
+    LV->removeVirtualRegistersKilled(MPhi);
+
+    // If the result is dead, update LV.
+    if (isDead) {
+      LV->addVirtualRegisterDead(DestReg, PHICopy);
+      LV->removeVirtualRegisterDead(DestReg, MPhi);
+    }
+  }
+
+  // Update LiveIntervals for the new copy or implicit def.
+  if (LIS) {
+    MachineInstr *NewInstr = prior(AfterPHIsIt);
+    SlotIndex DestCopyIndex = LIS->InsertMachineInstrInMaps(NewInstr);
+
+    SlotIndex MBBStartIndex = LIS->getMBBStartIdx(&MBB);
+    if (IncomingReg) {
+      // Add the region from the beginning of MBB to the copy instruction to
+      // IncomingReg's live interval.
+      LiveInterval &IncomingLI = LIS->getOrCreateInterval(IncomingReg);
+      VNInfo *IncomingVNI = IncomingLI.getVNInfoAt(MBBStartIndex);
+      if (!IncomingVNI)
+        IncomingVNI = IncomingLI.getNextValue(MBBStartIndex,
+                                              LIS->getVNInfoAllocator());
+      IncomingLI.addRange(LiveRange(MBBStartIndex,
+                                    DestCopyIndex.getRegSlot(),
+                                    IncomingVNI));
+    }
+
+    LiveInterval &DestLI = LIS->getInterval(DestReg);
+    assert(DestLI.begin() != DestLI.end() &&
+           "PHIs should have nonempty LiveIntervals.");
+    if (DestLI.endIndex().isDead()) {
+      // A dead PHI's live range begins and ends at the start of the MBB, but
+      // the lowered copy, which will still be dead, needs to begin and end at
+      // the copy instruction.
+      VNInfo *OrigDestVNI = DestLI.getVNInfoAt(MBBStartIndex);
+      assert(OrigDestVNI && "PHI destination should be live at block entry.");
+      DestLI.removeRange(MBBStartIndex, MBBStartIndex.getDeadSlot());
+      DestLI.createDeadDef(DestCopyIndex.getRegSlot(),
+                           LIS->getVNInfoAllocator());
+      DestLI.removeValNo(OrigDestVNI);
+    } else {
+      // Otherwise, remove the region from the beginning of MBB to the copy
+      // instruction from DestReg's live interval.
+      DestLI.removeRange(MBBStartIndex, DestCopyIndex.getRegSlot());
+      VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
+      assert(DestVNI && "PHI destination should be live at its definition.");
+      DestVNI->def = DestCopyIndex.getRegSlot();
+    }
+  }
+
+  // Adjust the VRegPHIUseCount map to account for the removal of this PHI node.
+  for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2)
+    --VRegPHIUseCount[BBVRegPair(MPhi->getOperand(i+1).getMBB()->getNumber(),
+                                 MPhi->getOperand(i).getReg())];
+
+  // Now loop over all of the incoming arguments, changing them to copy into the
+  // IncomingReg register in the corresponding predecessor basic block.
+  SmallPtrSet<MachineBasicBlock*, 8> MBBsInsertedInto;
+  for (int i = NumSrcs - 1; i >= 0; --i) {
+    unsigned SrcReg = MPhi->getOperand(i*2+1).getReg();
+    unsigned SrcSubReg = MPhi->getOperand(i*2+1).getSubReg();
+    bool SrcUndef = MPhi->getOperand(i*2+1).isUndef() ||
+      isImplicitlyDefined(SrcReg, MRI);
+    assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+           "Machine PHI Operands must all be virtual registers!");
+
+    // Get the MachineBasicBlock equivalent of the BasicBlock that is the source
+    // path the PHI.
+    MachineBasicBlock &opBlock = *MPhi->getOperand(i*2+2).getMBB();
+
+    // Check to make sure we haven't already emitted the copy for this block.
+    // This can happen because PHI nodes may have multiple entries for the same
+    // basic block.
+    if (!MBBsInsertedInto.insert(&opBlock))
+      continue;  // If the copy has already been emitted, we're done.
+
+    // Find a safe location to insert the copy, this may be the first terminator
+    // in the block (or end()).
+    MachineBasicBlock::iterator InsertPos =
+      findPHICopyInsertPoint(&opBlock, &MBB, SrcReg);
+
+    // Insert the copy.
+    MachineInstr *NewSrcInstr = 0;
+    if (!reusedIncoming && IncomingReg) {
+      if (SrcUndef) {
+        // The source register is undefined, so there is no need for a real
+        // COPY, but we still need to ensure joint dominance by defs.
+        // Insert an IMPLICIT_DEF instruction.
+        NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+                              TII->get(TargetOpcode::IMPLICIT_DEF),
+                              IncomingReg);
+
+        // Clean up the old implicit-def, if there even was one.
+        if (MachineInstr *DefMI = MRI->getVRegDef(SrcReg))
+          if (DefMI->isImplicitDef())
+            ImpDefs.insert(DefMI);
+      } else {
+        NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+                            TII->get(TargetOpcode::COPY), IncomingReg)
+                        .addReg(SrcReg, 0, SrcSubReg);
+      }
+    }
+
+    // We only need to update the LiveVariables kill of SrcReg if this was the
+    // last PHI use of SrcReg to be lowered on this CFG edge and it is not live
+    // out of the predecessor. We can also ignore undef sources.
+    if (LV && !SrcUndef &&
+        !VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)] &&
+        !LV->isLiveOut(SrcReg, opBlock)) {
+      // We want to be able to insert a kill of the register if this PHI (aka,
+      // the copy we just inserted) is the last use of the source value. Live
+      // variable analysis conservatively handles this by saying that the value
+      // is live until the end of the block the PHI entry lives in. If the value
+      // really is dead at the PHI copy, there will be no successor blocks which
+      // have the value live-in.
+
+      // Okay, if we now know that the value is not live out of the block, we
+      // can add a kill marker in this block saying that it kills the incoming
+      // value!
+
+      // In our final twist, we have to decide which instruction kills the
+      // register.  In most cases this is the copy, however, terminator
+      // instructions at the end of the block may also use the value. In this
+      // case, we should mark the last such terminator as being the killing
+      // block, not the copy.
+      MachineBasicBlock::iterator KillInst = opBlock.end();
+      MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator();
+      for (MachineBasicBlock::iterator Term = FirstTerm;
+          Term != opBlock.end(); ++Term) {
+        if (Term->readsRegister(SrcReg))
+          KillInst = Term;
+      }
+
+      if (KillInst == opBlock.end()) {
+        // No terminator uses the register.
+
+        if (reusedIncoming || !IncomingReg) {
+          // We may have to rewind a bit if we didn't insert a copy this time.
+          KillInst = FirstTerm;
+          while (KillInst != opBlock.begin()) {
+            --KillInst;
+            if (KillInst->isDebugValue())
+              continue;
+            if (KillInst->readsRegister(SrcReg))
+              break;
+          }
+        } else {
+          // We just inserted this copy.
+          KillInst = prior(InsertPos);
+        }
+      }
+      assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction");
+
+      // Finally, mark it killed.
+      LV->addVirtualRegisterKilled(SrcReg, KillInst);
+
+      // This vreg no longer lives all of the way through opBlock.
+      unsigned opBlockNum = opBlock.getNumber();
+      LV->getVarInfo(SrcReg).AliveBlocks.reset(opBlockNum);
+    }
+
+    if (LIS) {
+      if (NewSrcInstr) {
+        LIS->InsertMachineInstrInMaps(NewSrcInstr);
+        LIS->addLiveRangeToEndOfBlock(IncomingReg, NewSrcInstr);
+      }
+
+      if (!SrcUndef &&
+          !VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)]) {
+        LiveInterval &SrcLI = LIS->getInterval(SrcReg);
+
+        bool isLiveOut = false;
+        for (MachineBasicBlock::succ_iterator SI = opBlock.succ_begin(),
+             SE = opBlock.succ_end(); SI != SE; ++SI) {
+          SlotIndex startIdx = LIS->getMBBStartIdx(*SI);
+          VNInfo *VNI = SrcLI.getVNInfoAt(startIdx);
+
+          // Definitions by other PHIs are not truly live-in for our purposes.
+          if (VNI && VNI->def != startIdx) {
+            isLiveOut = true;
+            break;
+          }
+        }
+
+        if (!isLiveOut) {
+          MachineBasicBlock::iterator KillInst = opBlock.end();
+          MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator();
+          for (MachineBasicBlock::iterator Term = FirstTerm;
+              Term != opBlock.end(); ++Term) {
+            if (Term->readsRegister(SrcReg))
+              KillInst = Term;
+          }
+
+          if (KillInst == opBlock.end()) {
+            // No terminator uses the register.
+
+            if (reusedIncoming || !IncomingReg) {
+              // We may have to rewind a bit if we didn't just insert a copy.
+              KillInst = FirstTerm;
+              while (KillInst != opBlock.begin()) {
+                --KillInst;
+                if (KillInst->isDebugValue())
+                  continue;
+                if (KillInst->readsRegister(SrcReg))
+                  break;
+              }
+            } else {
+              // We just inserted this copy.
+              KillInst = prior(InsertPos);
+            }
+          }
+          assert(KillInst->readsRegister(SrcReg) &&
+                 "Cannot find kill instruction");
+
+          SlotIndex LastUseIndex = LIS->getInstructionIndex(KillInst);
+          SrcLI.removeRange(LastUseIndex.getRegSlot(),
+                            LIS->getMBBEndIdx(&opBlock));
+        }
+      }
+    }
+  }
+
+  // Really delete the PHI instruction now, if it is not in the LoweredPHIs map.
+  if (reusedIncoming || !IncomingReg) {
+    if (LIS)
+      LIS->RemoveMachineInstrFromMaps(MPhi);
+    MF.DeleteMachineInstr(MPhi);
+  }
+}
+
+/// analyzePHINodes - Gather information about the PHI nodes in here. In
+/// particular, we want to map the number of uses of a virtual register which is
+/// used in a PHI node. We map that to the BB the vreg is coming from. This is
+/// used later to determine when the vreg is killed in the BB.
+///
+void PHIElimination::analyzePHINodes(const MachineFunction& MF) {
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I)
+    for (MachineBasicBlock::const_iterator BBI = I->begin(), BBE = I->end();
+         BBI != BBE && BBI->isPHI(); ++BBI)
+      for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+        ++VRegPHIUseCount[BBVRegPair(BBI->getOperand(i+1).getMBB()->getNumber(),
+                                     BBI->getOperand(i).getReg())];
+}
+
+bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
+                                   MachineBasicBlock &MBB,
+                                   MachineLoopInfo *MLI) {
+  if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad())
+    return false;   // Quick exit for basic blocks without PHIs.
+
+  const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : 0;
+  bool IsLoopHeader = CurLoop && &MBB == CurLoop->getHeader();
+
+  bool Changed = false;
+  for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end();
+       BBI != BBE && BBI->isPHI(); ++BBI) {
+    for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) {
+      unsigned Reg = BBI->getOperand(i).getReg();
+      MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB();
+      // Is there a critical edge from PreMBB to MBB?
+      if (PreMBB->succ_size() == 1)
+        continue;
+
+      // Avoid splitting backedges of loops. It would introduce small
+      // out-of-line blocks into the loop which is very bad for code placement.
+      if (PreMBB == &MBB && !SplitAllCriticalEdges)
+        continue;
+      const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : 0;
+      if (IsLoopHeader && PreLoop == CurLoop && !SplitAllCriticalEdges)
+        continue;
+
+      // LV doesn't consider a phi use live-out, so isLiveOut only returns true
+      // when the source register is live-out for some other reason than a phi
+      // use. That means the copy we will insert in PreMBB won't be a kill, and
+      // there is a risk it may not be coalesced away.
+      //
+      // If the copy would be a kill, there is no need to split the edge.
+      if (!isLiveOutPastPHIs(Reg, PreMBB) && !SplitAllCriticalEdges)
+        continue;
+
+      DEBUG(dbgs() << PrintReg(Reg) << " live-out before critical edge BB#"
+                   << PreMBB->getNumber() << " -> BB#" << MBB.getNumber()
+                   << ": " << *BBI);
+
+      // If Reg is not live-in to MBB, it means it must be live-in to some
+      // other PreMBB successor, and we can avoid the interference by splitting
+      // the edge.
+      //
+      // If Reg *is* live-in to MBB, the interference is inevitable and a copy
+      // is likely to be left after coalescing. If we are looking at a loop
+      // exiting edge, split it so we won't insert code in the loop, otherwise
+      // don't bother.
+      bool ShouldSplit = !isLiveIn(Reg, &MBB) || SplitAllCriticalEdges;
+
+      // Check for a loop exiting edge.
+      if (!ShouldSplit && CurLoop != PreLoop) {
+        DEBUG({
+          dbgs() << "Split wouldn't help, maybe avoid loop copies?\n";
+          if (PreLoop) dbgs() << "PreLoop: " << *PreLoop;
+          if (CurLoop) dbgs() << "CurLoop: " << *CurLoop;
+        });
+        // This edge could be entering a loop, exiting a loop, or it could be
+        // both: Jumping directly form one loop to the header of a sibling
+        // loop.
+        // Split unless this edge is entering CurLoop from an outer loop.
+        ShouldSplit = PreLoop && !PreLoop->contains(CurLoop);
+      }
+      if (!ShouldSplit)
+        continue;
+      if (!PreMBB->SplitCriticalEdge(&MBB, this)) {
+        DEBUG(dbgs() << "Failed to split ciritcal edge.\n");
+        continue;
+      }
+      Changed = true;
+      ++NumCriticalEdgesSplit;
+    }
+  }
+  return Changed;
+}
+
+bool PHIElimination::isLiveIn(unsigned Reg, MachineBasicBlock *MBB) {
+  assert((LV || LIS) &&
+         "isLiveIn() requires either LiveVariables or LiveIntervals");
+  if (LIS)
+    return LIS->isLiveInToMBB(LIS->getInterval(Reg), MBB);
+  else
+    return LV->isLiveIn(Reg, *MBB);
+}
+
+bool PHIElimination::isLiveOutPastPHIs(unsigned Reg, MachineBasicBlock *MBB) {
+  assert((LV || LIS) &&
+         "isLiveOutPastPHIs() requires either LiveVariables or LiveIntervals");
+  // LiveVariables considers uses in PHIs to be in the predecessor basic block,
+  // so that a register used only in a PHI is not live out of the block. In
+  // contrast, LiveIntervals considers uses in PHIs to be on the edge rather than
+  // in the predecessor basic block, so that a register used only in a PHI is live
+  // out of the block.
+  if (LIS) {
+    const LiveInterval &LI = LIS->getInterval(Reg);
+    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+      if (LI.liveAt(LIS->getMBBStartIdx(*SI)))
+        return true;
+    }
+    return false;
+  } else {
+    return LV->isLiveOut(Reg, *MBB);
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp
new file mode 100644
index 000000000000..e1b56e962fa9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp
@@ -0,0 +1,61 @@
+//===-- PHIEliminationUtils.cpp - Helper functions for PHI elimination ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PHIEliminationUtils.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+// findCopyInsertPoint - Find a safe place in MBB to insert a copy from SrcReg
+// when following the CFG edge to SuccMBB. This needs to be after any def of
+// SrcReg, but before any subsequent point where control flow might jump out of
+// the basic block.
+MachineBasicBlock::iterator
+llvm::findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB,
+                             unsigned SrcReg) {
+  // Handle the trivial case trivially.
+  if (MBB->empty())
+    return MBB->begin();
+
+  // Usually, we just want to insert the copy before the first terminator
+  // instruction. However, for the edge going to a landing pad, we must insert
+  // the copy before the call/invoke instruction.
+  if (!SuccMBB->isLandingPad())
+    return MBB->getFirstTerminator();
+
+  // Discover any defs/uses in this basic block.
+  SmallPtrSet<MachineInstr*, 8> DefUsesInMBB;
+  MachineRegisterInfo& MRI = MBB->getParent()->getRegInfo();
+  for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(SrcReg),
+         RE = MRI.reg_end(); RI != RE; ++RI) {
+    MachineInstr* DefUseMI = &*RI;
+    if (DefUseMI->getParent() == MBB)
+      DefUsesInMBB.insert(DefUseMI);
+  }
+
+  MachineBasicBlock::iterator InsertPoint;
+  if (DefUsesInMBB.empty()) {
+    // No defs.  Insert the copy at the start of the basic block.
+    InsertPoint = MBB->begin();
+  } else if (DefUsesInMBB.size() == 1) {
+    // Insert the copy immediately after the def/use.
+    InsertPoint = *DefUsesInMBB.begin();
+    ++InsertPoint;
+  } else {
+    // Insert the copy immediately after the last def/use.
+    InsertPoint = MBB->end();
+    while (!DefUsesInMBB.count(&*--InsertPoint)) {}
+    ++InsertPoint;
+  }
+
+  // Make sure the copy goes after any phi nodes however.
+  return MBB->SkipPHIsAndLabels(InsertPoint);
+}
diff --git a/contrib/llvm/lib/CodeGen/PHIEliminationUtils.h b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.h
new file mode 100644
index 000000000000..9ac47fb4c505
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.h
@@ -0,0 +1,25 @@
+//=- PHIEliminationUtils.h - Helper functions for PHI elimination *- C++ -*--=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PHIELIMINATIONUTILS_H
+#define LLVM_CODEGEN_PHIELIMINATIONUTILS_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+    /// findPHICopyInsertPoint - Find a safe place in MBB to insert a copy from
+    /// SrcReg when following the CFG edge to SuccMBB. This needs to be after
+    /// any def of SrcReg, but before any subsequent point where control flow
+    /// might jump out of the basic block.
+    MachineBasicBlock::iterator
+    findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB,
+                           unsigned SrcReg);
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/Passes.cpp b/contrib/llvm/lib/CodeGen/Passes.cpp
new file mode 100644
index 000000000000..1af65c88abeb
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/Passes.cpp
@@ -0,0 +1,746 @@
+//===-- Passes.cpp - Target independent code generation passes ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines interfaces to access the target independent code
+// generation passes provided by the LLVM backend.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisablePostRA("disable-post-ra", cl::Hidden,
+    cl::desc("Disable Post Regalloc"));
+static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
+    cl::desc("Disable branch folding"));
+static cl::opt<bool> DisableTailDuplicate("disable-tail-duplicate", cl::Hidden,
+    cl::desc("Disable tail duplication"));
+static cl::opt<bool> DisableEarlyTailDup("disable-early-taildup", cl::Hidden,
+    cl::desc("Disable pre-register allocation tail duplication"));
+static cl::opt<bool> DisableBlockPlacement("disable-block-placement",
+    cl::Hidden, cl::desc("Disable probability-driven block placement"));
+static cl::opt<bool> EnableBlockPlacementStats("enable-block-placement-stats",
+    cl::Hidden, cl::desc("Collect probability-driven block placement stats"));
+static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden,
+    cl::desc("Disable Stack Slot Coloring"));
+static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden,
+    cl::desc("Disable Machine Dead Code Elimination"));
+static cl::opt<bool> DisableEarlyIfConversion("disable-early-ifcvt", cl::Hidden,
+    cl::desc("Disable Early If-conversion"));
+static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden,
+    cl::desc("Disable Machine LICM"));
+static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden,
+    cl::desc("Disable Machine Common Subexpression Elimination"));
+static cl::opt<cl::boolOrDefault>
+OptimizeRegAlloc("optimize-regalloc", cl::Hidden,
+    cl::desc("Enable optimized register allocation compilation path."));
+static cl::opt<cl::boolOrDefault>
+EnableMachineSched("enable-misched", cl::Hidden,
+    cl::desc("Enable the machine instruction scheduling pass."));
+static cl::opt<bool> EnableStrongPHIElim("strong-phi-elim", cl::Hidden,
+    cl::desc("Use strong PHI elimination."));
+static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
+    cl::Hidden,
+    cl::desc("Disable Machine LICM"));
+static cl::opt<bool> DisableMachineSink("disable-machine-sink", cl::Hidden,
+    cl::desc("Disable Machine Sinking"));
+static cl::opt<bool> DisableLSR("disable-lsr", cl::Hidden,
+    cl::desc("Disable Loop Strength Reduction Pass"));
+static cl::opt<bool> DisableCGP("disable-cgp", cl::Hidden,
+    cl::desc("Disable Codegen Prepare"));
+static cl::opt<bool> DisableCopyProp("disable-copyprop", cl::Hidden,
+    cl::desc("Disable Copy Propagation pass"));
+static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
+    cl::desc("Print LLVM IR produced by the loop-reduce pass"));
+static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
+    cl::desc("Print LLVM IR input to isel pass"));
+static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden,
+    cl::desc("Dump garbage collector data"));
+static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
+    cl::desc("Verify generated machine code"),
+    cl::init(getenv("LLVM_VERIFY_MACHINEINSTRS")!=NULL));
+static cl::opt<std::string>
+PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
+                   cl::desc("Print machine instrs"),
+                   cl::value_desc("pass-name"), cl::init("option-unspecified"));
+
+// Experimental option to run live interval analysis early.
+static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
+    cl::desc("Run live interval analysis earlier in the pipeline"));
+
+/// Allow standard passes to be disabled by command line options. This supports
+/// simple binary flags that either suppress the pass or do nothing.
+/// i.e. -disable-mypass=false has no effect.
+/// These should be converted to boolOrDefault in order to use applyOverride.
+static AnalysisID applyDisable(AnalysisID PassID, bool Override) {
+  if (Override)
+    return 0;
+  return PassID;
+}
+
+/// Allow Pass selection to be overriden by command line options. This supports
+/// flags with ternary conditions. TargetID is passed through by default. The
+/// pass is suppressed when the option is false. When the option is true, the
+/// StandardID is selected if the target provides no default.
+static AnalysisID applyOverride(AnalysisID TargetID, cl::boolOrDefault Override,
+                                AnalysisID StandardID) {
+  switch (Override) {
+  case cl::BOU_UNSET:
+    return TargetID;
+  case cl::BOU_TRUE:
+    if (TargetID)
+      return TargetID;
+    if (StandardID == 0)
+      report_fatal_error("Target cannot enable pass");
+    return StandardID;
+  case cl::BOU_FALSE:
+    return 0;
+  }
+  llvm_unreachable("Invalid command line option state");
+}
+
+/// Allow standard passes to be disabled by the command line, regardless of who
+/// is adding the pass.
+///
+/// StandardID is the pass identified in the standard pass pipeline and provided
+/// to addPass(). It may be a target-specific ID in the case that the target
+/// directly adds its own pass, but in that case we harmlessly fall through.
+///
+/// TargetID is the pass that the target has configured to override StandardID.
+///
+/// StandardID may be a pseudo ID. In that case TargetID is the name of the real
+/// pass to run. This allows multiple options to control a single pass depending
+/// on where in the pipeline that pass is added.
+static AnalysisID overridePass(AnalysisID StandardID, AnalysisID TargetID) {
+  if (StandardID == &PostRASchedulerID)
+    return applyDisable(TargetID, DisablePostRA);
+
+  if (StandardID == &BranchFolderPassID)
+    return applyDisable(TargetID, DisableBranchFold);
+
+  if (StandardID == &TailDuplicateID)
+    return applyDisable(TargetID, DisableTailDuplicate);
+
+  if (StandardID == &TargetPassConfig::EarlyTailDuplicateID)
+    return applyDisable(TargetID, DisableEarlyTailDup);
+
+  if (StandardID == &MachineBlockPlacementID)
+    return applyDisable(TargetID, DisableBlockPlacement);
+
+  if (StandardID == &StackSlotColoringID)
+    return applyDisable(TargetID, DisableSSC);
+
+  if (StandardID == &DeadMachineInstructionElimID)
+    return applyDisable(TargetID, DisableMachineDCE);
+
+  if (StandardID == &EarlyIfConverterID)
+    return applyDisable(TargetID, DisableEarlyIfConversion);
+
+  if (StandardID == &MachineLICMID)
+    return applyDisable(TargetID, DisableMachineLICM);
+
+  if (StandardID == &MachineCSEID)
+    return applyDisable(TargetID, DisableMachineCSE);
+
+  if (StandardID == &MachineSchedulerID)
+    return applyOverride(TargetID, EnableMachineSched, StandardID);
+
+  if (StandardID == &TargetPassConfig::PostRAMachineLICMID)
+    return applyDisable(TargetID, DisablePostRAMachineLICM);
+
+  if (StandardID == &MachineSinkingID)
+    return applyDisable(TargetID, DisableMachineSink);
+
+  if (StandardID == &MachineCopyPropagationID)
+    return applyDisable(TargetID, DisableCopyProp);
+
+  return TargetID;
+}
+
+//===---------------------------------------------------------------------===//
+/// TargetPassConfig
+//===---------------------------------------------------------------------===//
+
+INITIALIZE_PASS(TargetPassConfig, "targetpassconfig",
+                "Target Pass Configuration", false, false)
+char TargetPassConfig::ID = 0;
+
+// Pseudo Pass IDs.
+char TargetPassConfig::EarlyTailDuplicateID = 0;
+char TargetPassConfig::PostRAMachineLICMID = 0;
+
+namespace llvm {
+class PassConfigImpl {
+public:
+  // List of passes explicitly substituted by this target. Normally this is
+  // empty, but it is a convenient way to suppress or replace specific passes
+  // that are part of a standard pass pipeline without overridding the entire
+  // pipeline. This mechanism allows target options to inherit a standard pass's
+  // user interface. For example, a target may disable a standard pass by
+  // default by substituting a pass ID of zero, and the user may still enable
+  // that standard pass with an explicit command line option.
+  DenseMap<AnalysisID,AnalysisID> TargetPasses;
+
+  /// Store the pairs of <AnalysisID, AnalysisID> of which the second pass
+  /// is inserted after each instance of the first one.
+  SmallVector<std::pair<AnalysisID, AnalysisID>, 4> InsertedPasses;
+};
+} // namespace llvm
+
+// Out of line virtual method.
+TargetPassConfig::~TargetPassConfig() {
+  delete Impl;
+}
+
+// Out of line constructor provides default values for pass options and
+// registers all common codegen passes.
+TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
+  : ImmutablePass(ID), PM(&pm), StartAfter(0), StopAfter(0),
+    Started(true), Stopped(false), TM(tm), Impl(0), Initialized(false),
+    DisableVerify(false),
+    EnableTailMerge(true) {
+
+  Impl = new PassConfigImpl();
+
+  // Register all target independent codegen passes to activate their PassIDs,
+  // including this pass itself.
+  initializeCodeGen(*PassRegistry::getPassRegistry());
+
+  // Substitute Pseudo Pass IDs for real ones.
+  substitutePass(&EarlyTailDuplicateID, &TailDuplicateID);
+  substitutePass(&PostRAMachineLICMID, &MachineLICMID);
+
+  // Temporarily disable experimental passes.
+  const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
+  if (!ST.enableMachineScheduler())
+    disablePass(&MachineSchedulerID);
+}
+
+/// Insert InsertedPassID pass after TargetPassID.
+void TargetPassConfig::insertPass(AnalysisID TargetPassID,
+                                  AnalysisID InsertedPassID) {
+  assert(TargetPassID != InsertedPassID && "Insert a pass after itself!");
+  std::pair<AnalysisID, AnalysisID> P(TargetPassID, InsertedPassID);
+  Impl->InsertedPasses.push_back(P);
+}
+
+/// createPassConfig - Create a pass configuration object to be used by
+/// addPassToEmitX methods for generating a pipeline of CodeGen passes.
+///
+/// Targets may override this to extend TargetPassConfig.
+TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new TargetPassConfig(this, PM);
+}
+
+TargetPassConfig::TargetPassConfig()
+  : ImmutablePass(ID), PM(0) {
+  llvm_unreachable("TargetPassConfig should not be constructed on-the-fly");
+}
+
+// Helper to verify the analysis is really immutable.
+void TargetPassConfig::setOpt(bool &Opt, bool Val) {
+  assert(!Initialized && "PassConfig is immutable");
+  Opt = Val;
+}
+
+void TargetPassConfig::substitutePass(AnalysisID StandardID,
+                                      AnalysisID TargetID) {
+  Impl->TargetPasses[StandardID] = TargetID;
+}
+
+AnalysisID TargetPassConfig::getPassSubstitution(AnalysisID ID) const {
+  DenseMap<AnalysisID, AnalysisID>::const_iterator
+    I = Impl->TargetPasses.find(ID);
+  if (I == Impl->TargetPasses.end())
+    return ID;
+  return I->second;
+}
+
+/// Add a pass to the PassManager if that pass is supposed to be run.  If the
+/// Started/Stopped flags indicate either that the compilation should start at
+/// a later pass or that it should stop after an earlier pass, then do not add
+/// the pass.  Finally, compare the current pass against the StartAfter
+/// and StopAfter options and change the Started/Stopped flags accordingly.
+void TargetPassConfig::addPass(Pass *P) {
+  assert(!Initialized && "PassConfig is immutable");
+
+  // Cache the Pass ID here in case the pass manager finds this pass is
+  // redundant with ones already scheduled / available, and deletes it.
+  // Fundamentally, once we add the pass to the manager, we no longer own it
+  // and shouldn't reference it.
+  AnalysisID PassID = P->getPassID();
+
+  if (Started && !Stopped)
+    PM->add(P);
+  if (StopAfter == PassID)
+    Stopped = true;
+  if (StartAfter == PassID)
+    Started = true;
+  if (Stopped && !Started)
+    report_fatal_error("Cannot stop compilation after pass that is not run");
+}
+
+/// Add a CodeGen pass at this point in the pipeline after checking for target
+/// and command line overrides.
+AnalysisID TargetPassConfig::addPass(AnalysisID PassID) {
+  AnalysisID TargetID = getPassSubstitution(PassID);
+  AnalysisID FinalID = overridePass(PassID, TargetID);
+  if (FinalID == 0)
+    return FinalID;
+
+  Pass *P = Pass::createPass(FinalID);
+  if (!P)
+    llvm_unreachable("Pass ID not registered");
+  addPass(P);
+  // Add the passes after the pass P if there is any.
+  for (SmallVector<std::pair<AnalysisID, AnalysisID>, 4>::iterator
+         I = Impl->InsertedPasses.begin(), E = Impl->InsertedPasses.end();
+       I != E; ++I) {
+    if ((*I).first == PassID) {
+      assert((*I).second && "Illegal Pass ID!");
+      Pass *NP = Pass::createPass((*I).second);
+      assert(NP && "Pass ID not registered");
+      addPass(NP);
+    }
+  }
+  return FinalID;
+}
+
+void TargetPassConfig::printAndVerify(const char *Banner) {
+  if (TM->shouldPrintMachineCode())
+    addPass(createMachineFunctionPrinterPass(dbgs(), Banner));
+
+  if (VerifyMachineCode)
+    addPass(createMachineVerifierPass(Banner));
+}
+
+/// Add common target configurable passes that perform LLVM IR to IR transforms
+/// following machine independent optimization.
+void TargetPassConfig::addIRPasses() {
+  // Basic AliasAnalysis support.
+  // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+  // BasicAliasAnalysis wins if they disagree. This is intended to help
+  // support "obvious" type-punning idioms.
+  addPass(createTypeBasedAliasAnalysisPass());
+  addPass(createBasicAliasAnalysisPass());
+
+  // Before running any passes, run the verifier to determine if the input
+  // coming from the front-end and/or optimizer is valid.
+  if (!DisableVerify)
+    addPass(createVerifierPass());
+
+  // Run loop strength reduction before anything else.
+  if (getOptLevel() != CodeGenOpt::None && !DisableLSR) {
+    addPass(createLoopStrengthReducePass());
+    if (PrintLSR)
+      addPass(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs()));
+  }
+
+  addPass(createGCLoweringPass());
+
+  // Make sure that no unreachable blocks are instruction selected.
+  addPass(createUnreachableBlockEliminationPass());
+}
+
+/// Turn exception handling constructs into something the code generators can
+/// handle.
+void TargetPassConfig::addPassesToHandleExceptions() {
+  switch (TM->getMCAsmInfo()->getExceptionHandlingType()) {
+  case ExceptionHandling::SjLj:
+    // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both
+    // Dwarf EH prepare needs to be run after SjLj prepare. Otherwise,
+    // catch info can get misplaced when a selector ends up more than one block
+    // removed from the parent invoke(s). This could happen when a landing
+    // pad is shared by multiple invokes and is also a target of a normal
+    // edge from elsewhere.
+    addPass(createSjLjEHPreparePass(TM->getTargetLowering()));
+    // FALLTHROUGH
+  case ExceptionHandling::DwarfCFI:
+  case ExceptionHandling::ARM:
+  case ExceptionHandling::Win64:
+    addPass(createDwarfEHPass(TM));
+    break;
+  case ExceptionHandling::None:
+    addPass(createLowerInvokePass(TM->getTargetLowering()));
+
+    // The lower invoke pass may create unreachable code. Remove it.
+    addPass(createUnreachableBlockEliminationPass());
+    break;
+  }
+}
+
+/// Add pass to prepare the LLVM IR for code generation. This should be done
+/// before exception handling preparation passes.
+void TargetPassConfig::addCodeGenPrepare() {
+  if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
+    addPass(createCodeGenPreparePass(getTargetLowering()));
+}
+
+/// Add common passes that perform LLVM IR to IR transforms in preparation for
+/// instruction selection.
+void TargetPassConfig::addISelPrepare() {
+  addPass(createStackProtectorPass(getTargetLowering()));
+
+  addPreISel();
+
+  if (PrintISelInput)
+    addPass(createPrintFunctionPass("\n\n"
+                                    "*** Final LLVM Code input to ISel ***\n",
+                                    &dbgs()));
+
+  // All passes which modify the LLVM IR are now complete; run the verifier
+  // to ensure that the IR is valid.
+  if (!DisableVerify)
+    addPass(createVerifierPass());
+}
+
+/// Add the complete set of target-independent postISel code generator passes.
+///
+/// This can be read as the standard order of major LLVM CodeGen stages. Stages
+/// with nontrivial configuration or multiple passes are broken out below in
+/// add%Stage routines.
+///
+/// Any TargetPassConfig::addXX routine may be overriden by the Target. The
+/// addPre/Post methods with empty header implementations allow injecting
+/// target-specific fixups just before or after major stages. Additionally,
+/// targets have the flexibility to change pass order within a stage by
+/// overriding default implementation of add%Stage routines below. Each
+/// technique has maintainability tradeoffs because alternate pass orders are
+/// not well supported. addPre/Post works better if the target pass is easily
+/// tied to a common pass. But if it has subtle dependencies on multiple passes,
+/// the target should override the stage instead.
+///
+/// TODO: We could use a single addPre/Post(ID) hook to allow pass injection
+/// before/after any target-independent pass. But it's currently overkill.
+void TargetPassConfig::addMachinePasses() {
+  // Insert a machine instr printer pass after the specified pass.
+  // If -print-machineinstrs specified, print machineinstrs after all passes.
+  if (StringRef(PrintMachineInstrs.getValue()).equals(""))
+    TM->Options.PrintMachineCode = true;
+  else if (!StringRef(PrintMachineInstrs.getValue())
+           .equals("option-unspecified")) {
+    const PassRegistry *PR = PassRegistry::getPassRegistry();
+    const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
+    const PassInfo *IPI = PR->getPassInfo(StringRef("print-machineinstrs"));
+    assert (TPI && IPI && "Pass ID not registered!");
+    const char *TID = (const char *)(TPI->getTypeInfo());
+    const char *IID = (const char *)(IPI->getTypeInfo());
+    insertPass(TID, IID);
+  }
+
+  // Print the instruction selected machine code...
+  printAndVerify("After Instruction Selection");
+
+  // Expand pseudo-instructions emitted by ISel.
+  if (addPass(&ExpandISelPseudosID))
+    printAndVerify("After ExpandISelPseudos");
+
+  // Add passes that optimize machine instructions in SSA form.
+  if (getOptLevel() != CodeGenOpt::None) {
+    addMachineSSAOptimization();
+  } else {
+    // If the target requests it, assign local variables to stack slots relative
+    // to one another and simplify frame index references where possible.
+    addPass(&LocalStackSlotAllocationID);
+  }
+
+  // Run pre-ra passes.
+  if (addPreRegAlloc())
+    printAndVerify("After PreRegAlloc passes");
+
+  // Run register allocation and passes that are tightly coupled with it,
+  // including phi elimination and scheduling.
+  if (getOptimizeRegAlloc())
+    addOptimizedRegAlloc(createRegAllocPass(true));
+  else
+    addFastRegAlloc(createRegAllocPass(false));
+
+  // Run post-ra passes.
+  if (addPostRegAlloc())
+    printAndVerify("After PostRegAlloc passes");
+
+  // Insert prolog/epilog code.  Eliminate abstract frame index references...
+  addPass(&PrologEpilogCodeInserterID);
+  printAndVerify("After PrologEpilogCodeInserter");
+
+  /// Add passes that optimize machine instructions after register allocation.
+  if (getOptLevel() != CodeGenOpt::None)
+    addMachineLateOptimization();
+
+  // Expand pseudo instructions before second scheduling pass.
+  addPass(&ExpandPostRAPseudosID);
+  printAndVerify("After ExpandPostRAPseudos");
+
+  // Run pre-sched2 passes.
+  if (addPreSched2())
+    printAndVerify("After PreSched2 passes");
+
+  // Second pass scheduler.
+  if (getOptLevel() != CodeGenOpt::None) {
+    addPass(&PostRASchedulerID);
+    printAndVerify("After PostRAScheduler");
+  }
+
+  // GC
+  if (addGCPasses()) {
+    if (PrintGCInfo)
+      addPass(createGCInfoPrinter(dbgs()));
+  }
+
+  // Basic block placement.
+  if (getOptLevel() != CodeGenOpt::None)
+    addBlockPlacement();
+
+  if (addPreEmitPass())
+    printAndVerify("After PreEmit passes");
+}
+
+/// Add passes that optimize machine instructions in SSA form.
+void TargetPassConfig::addMachineSSAOptimization() {
+  // Pre-ra tail duplication.
+  if (addPass(&EarlyTailDuplicateID))
+    printAndVerify("After Pre-RegAlloc TailDuplicate");
+
+  // Optimize PHIs before DCE: removing dead PHI cycles may make more
+  // instructions dead.
+  addPass(&OptimizePHIsID);
+
+  // This pass merges large allocas. StackSlotColoring is a different pass
+  // which merges spill slots.
+  addPass(&StackColoringID);
+
+  // If the target requests it, assign local variables to stack slots relative
+  // to one another and simplify frame index references where possible.
+  addPass(&LocalStackSlotAllocationID);
+
+  // With optimization, dead code should already be eliminated. However
+  // there is one known exception: lowered code for arguments that are only
+  // used by tail calls, where the tail calls reuse the incoming stack
+  // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+  addPass(&DeadMachineInstructionElimID);
+  printAndVerify("After codegen DCE pass");
+
+  // Allow targets to insert passes that improve instruction level parallelism,
+  // like if-conversion. Such passes will typically need dominator trees and
+  // loop info, just like LICM and CSE below.
+  if (addILPOpts())
+    printAndVerify("After ILP optimizations");
+
+  addPass(&MachineLICMID);
+  addPass(&MachineCSEID);
+  addPass(&MachineSinkingID);
+  printAndVerify("After Machine LICM, CSE and Sinking passes");
+
+  addPass(&PeepholeOptimizerID);
+  printAndVerify("After codegen peephole optimization pass");
+}
+
+//===---------------------------------------------------------------------===//
+/// Register Allocation Pass Configuration
+//===---------------------------------------------------------------------===//
+
+bool TargetPassConfig::getOptimizeRegAlloc() const {
+  switch (OptimizeRegAlloc) {
+  case cl::BOU_UNSET: return getOptLevel() != CodeGenOpt::None;
+  case cl::BOU_TRUE:  return true;
+  case cl::BOU_FALSE: return false;
+  }
+  llvm_unreachable("Invalid optimize-regalloc state");
+}
+
+/// RegisterRegAlloc's global Registry tracks allocator registration.
+MachinePassRegistry RegisterRegAlloc::Registry;
+
+/// A dummy default pass factory indicates whether the register allocator is
+/// overridden on the command line.
+static FunctionPass *useDefaultRegisterAllocator() { return 0; }
+static RegisterRegAlloc
+defaultRegAlloc("default",
+                "pick register allocator based on -O option",
+                useDefaultRegisterAllocator);
+
+/// -regalloc=... command line option.
+static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
+               RegisterPassParser<RegisterRegAlloc> >
+RegAlloc("regalloc",
+         cl::init(&useDefaultRegisterAllocator),
+         cl::desc("Register allocator to use"));
+
+
+/// Instantiate the default register allocator pass for this target for either
+/// the optimized or unoptimized allocation path. This will be added to the pass
+/// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc
+/// in the optimized case.
+///
+/// A target that uses the standard regalloc pass order for fast or optimized
+/// allocation may still override this for per-target regalloc
+/// selection. But -regalloc=... always takes precedence.
+FunctionPass *TargetPassConfig::createTargetRegisterAllocator(bool Optimized) {
+  if (Optimized)
+    return createGreedyRegisterAllocator();
+  else
+    return createFastRegisterAllocator();
+}
+
+/// Find and instantiate the register allocation pass requested by this target
+/// at the current optimization level.  Different register allocators are
+/// defined as separate passes because they may require different analysis.
+///
+/// This helper ensures that the regalloc= option is always available,
+/// even for targets that override the default allocator.
+///
+/// FIXME: When MachinePassRegistry register pass IDs instead of function ptrs,
+/// this can be folded into addPass.
+FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
+  RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
+
+  // Initialize the global default.
+  if (!Ctor) {
+    Ctor = RegAlloc;
+    RegisterRegAlloc::setDefault(RegAlloc);
+  }
+  if (Ctor != useDefaultRegisterAllocator)
+    return Ctor();
+
+  // With no -regalloc= override, ask the target for a regalloc pass.
+  return createTargetRegisterAllocator(Optimized);
+}
+
+/// Add the minimum set of target-independent passes that are required for
+/// register allocation. No coalescing or scheduling.
+void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+  addPass(&PHIEliminationID);
+  addPass(&TwoAddressInstructionPassID);
+
+  addPass(RegAllocPass);
+  printAndVerify("After Register Allocation");
+}
+
+/// Add standard target-independent passes that are tightly coupled with
+/// optimized register allocation, including coalescing, machine instruction
+/// scheduling, and register allocation itself.
+void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  addPass(&ProcessImplicitDefsID);
+
+  // LiveVariables currently requires pure SSA form.
+  //
+  // FIXME: Once TwoAddressInstruction pass no longer uses kill flags,
+  // LiveVariables can be removed completely, and LiveIntervals can be directly
+  // computed. (We still either need to regenerate kill flags after regalloc, or
+  // preferably fix the scavenger to not depend on them).
+  addPass(&LiveVariablesID);
+
+  // Add passes that move from transformed SSA into conventional SSA. This is a
+  // "copy coalescing" problem.
+  //
+  if (!EnableStrongPHIElim) {
+    // Edge splitting is smarter with machine loop info.
+    addPass(&MachineLoopInfoID);
+    addPass(&PHIEliminationID);
+  }
+
+  // Eventually, we want to run LiveIntervals before PHI elimination.
+  if (EarlyLiveIntervals)
+    addPass(&LiveIntervalsID);
+
+  addPass(&TwoAddressInstructionPassID);
+
+  if (EnableStrongPHIElim)
+    addPass(&StrongPHIEliminationID);
+
+  addPass(&RegisterCoalescerID);
+
+  // PreRA instruction scheduling.
+  if (addPass(&MachineSchedulerID))
+    printAndVerify("After Machine Scheduling");
+
+  // Add the selected register allocation pass.
+  addPass(RegAllocPass);
+  printAndVerify("After Register Allocation, before rewriter");
+
+  // Allow targets to change the register assignments before rewriting.
+  if (addPreRewrite())
+    printAndVerify("After pre-rewrite passes");
+
+  // Finally rewrite virtual registers.
+  addPass(&VirtRegRewriterID);
+  printAndVerify("After Virtual Register Rewriter");
+
+  // FinalizeRegAlloc is convenient until MachineInstrBundles is more mature,
+  // but eventually, all users of it should probably be moved to addPostRA and
+  // it can go away.  Currently, it's the intended place for targets to run
+  // FinalizeMachineBundles, because passes other than MachineScheduling an
+  // RegAlloc itself may not be aware of bundles.
+  if (addFinalizeRegAlloc())
+    printAndVerify("After RegAlloc finalization");
+
+  // Perform stack slot coloring and post-ra machine LICM.
+  //
+  // FIXME: Re-enable coloring with register when it's capable of adding
+  // kill markers.
+  addPass(&StackSlotColoringID);
+
+  // Run post-ra machine LICM to hoist reloads / remats.
+  //
+  // FIXME: can this move into MachineLateOptimization?
+  addPass(&PostRAMachineLICMID);
+
+  printAndVerify("After StackSlotColoring and postra Machine LICM");
+}
+
+//===---------------------------------------------------------------------===//
+/// Post RegAlloc Pass Configuration
+//===---------------------------------------------------------------------===//
+
+/// Add passes that optimize machine instructions after register allocation.
+void TargetPassConfig::addMachineLateOptimization() {
+  // Branch folding must be run after regalloc and prolog/epilog insertion.
+  if (addPass(&BranchFolderPassID))
+    printAndVerify("After BranchFolding");
+
+  // Tail duplication.
+  if (addPass(&TailDuplicateID))
+    printAndVerify("After TailDuplicate");
+
+  // Copy propagation.
+  if (addPass(&MachineCopyPropagationID))
+    printAndVerify("After copy propagation pass");
+}
+
+/// Add standard GC passes.
+bool TargetPassConfig::addGCPasses() {
+  addPass(&GCMachineCodeAnalysisID);
+  return true;
+}
+
+/// Add standard basic block placement passes.
+void TargetPassConfig::addBlockPlacement() {
+  if (addPass(&MachineBlockPlacementID)) {
+    // Run a separate pass to collect block placement statistics.
+    if (EnableBlockPlacementStats)
+      addPass(&MachineBlockPlacementStatsID);
+
+    printAndVerify("After machine block placement.");
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
new file mode 100644
index 000000000000..a7439b5129b5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -0,0 +1,577 @@
+//===-- PeepholeOptimizer.cpp - Peephole Optimizations --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Perform peephole optimizations on the machine code:
+//
+// - Optimize Extensions
+//
+//     Optimization of sign / zero extension instructions. It may be extended to
+//     handle other instructions with similar properties.
+//
+//     On some targets, some instructions, e.g. X86 sign / zero extension, may
+//     leave the source value in the lower part of the result. This optimization
+//     will replace some uses of the pre-extension value with uses of the
+//     sub-register of the results.
+//
+// - Optimize Comparisons
+//
+//     Optimization of comparison instructions. For instance, in this code:
+//
+//       sub r1, 1
+//       cmp r1, 0
+//       bz  L1
+//
+//     If the "sub" instruction all ready sets (or could be modified to set) the
+//     same flag that the "cmp" instruction sets and that "bz" uses, then we can
+//     eliminate the "cmp" instruction.
+//
+//     Another instance, in this code:
+//
+//       sub r1, r3 | sub r1, imm
+//       cmp r3, r1 or cmp r1, r3 | cmp r1, imm
+//       bge L1
+//
+//     If the branch instruction can use flag from "sub", then we can replace
+//     "sub" with "subs" and eliminate the "cmp" instruction.
+//
+// - Optimize Bitcast pairs:
+//
+//     v1 = bitcast v0
+//     v2 = bitcast v1
+//        = v2
+//   =>
+//     v1 = bitcast v0
+//        = v0
+//
+// - Optimize Loads:
+//
+//     Loads that can be folded into a later instruction. A load is foldable
+//     if it loads to virtual registers and the virtual register defined has 
+//     a single use.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "peephole-opt"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+// Optimize Extensions
+static cl::opt<bool>
+Aggressive("aggressive-ext-opt", cl::Hidden,
+           cl::desc("Aggressive extension optimization"));
+
+static cl::opt<bool>
+DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
+                cl::desc("Disable the peephole optimizer"));
+
+STATISTIC(NumReuse,      "Number of extension results reused");
+STATISTIC(NumBitcasts,   "Number of bitcasts eliminated");
+STATISTIC(NumCmps,       "Number of compares eliminated");
+STATISTIC(NumImmFold,    "Number of move immediate folded");
+STATISTIC(NumLoadFold,   "Number of loads folded");
+STATISTIC(NumSelects,    "Number of selects optimized");
+
+namespace {
+  class PeepholeOptimizer : public MachineFunctionPass {
+    const TargetMachine   *TM;
+    const TargetInstrInfo *TII;
+    MachineRegisterInfo   *MRI;
+    MachineDominatorTree  *DT;  // Machine dominator tree
+
+  public:
+    static char ID; // Pass identification
+    PeepholeOptimizer() : MachineFunctionPass(ID) {
+      initializePeepholeOptimizerPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+      if (Aggressive) {
+        AU.addRequired<MachineDominatorTree>();
+        AU.addPreserved<MachineDominatorTree>();
+      }
+    }
+
+  private:
+    bool optimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB);
+    bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
+    bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+                          SmallPtrSet<MachineInstr*, 8> &LocalMIs);
+    bool optimizeSelect(MachineInstr *MI);
+    bool isMoveImmediate(MachineInstr *MI,
+                         SmallSet<unsigned, 4> &ImmDefRegs,
+                         DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+    bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
+                       SmallSet<unsigned, 4> &ImmDefRegs,
+                       DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+    bool isLoadFoldable(MachineInstr *MI, unsigned &FoldAsLoadDefReg);
+  };
+}
+
+char PeepholeOptimizer::ID = 0;
+char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;
+INITIALIZE_PASS_BEGIN(PeepholeOptimizer, "peephole-opts",
+                "Peephole Optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PeepholeOptimizer, "peephole-opts",
+                "Peephole Optimizations", false, false)
+
+/// optimizeExtInstr - If instruction is a copy-like instruction, i.e. it reads
+/// a single register and writes a single register and it does not modify the
+/// source, and if the source value is preserved as a sub-register of the
+/// result, then replace all reachable uses of the source with the subreg of the
+/// result.
+///
+/// Do not generate an EXTRACT that is used only in a debug use, as this changes
+/// the code. Since this code does not currently share EXTRACTs, just ignore all
+/// debug uses.
+bool PeepholeOptimizer::
+optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+                 SmallPtrSet<MachineInstr*, 8> &LocalMIs) {
+  unsigned SrcReg, DstReg, SubIdx;
+  if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx))
+    return false;
+
+  if (TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+      TargetRegisterInfo::isPhysicalRegister(SrcReg))
+    return false;
+
+  if (MRI->hasOneNonDBGUse(SrcReg))
+    // No other uses.
+    return false;
+
+  // Ensure DstReg can get a register class that actually supports
+  // sub-registers. Don't change the class until we commit.
+  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+  DstRC = TM->getRegisterInfo()->getSubClassWithSubReg(DstRC, SubIdx);
+  if (!DstRC)
+    return false;
+
+  // The ext instr may be operating on a sub-register of SrcReg as well.
+  // PPC::EXTSW is a 32 -> 64-bit sign extension, but it reads a 64-bit
+  // register.
+  // If UseSrcSubIdx is Set, SubIdx also applies to SrcReg, and only uses of
+  // SrcReg:SubIdx should be replaced.
+  bool UseSrcSubIdx = TM->getRegisterInfo()->
+    getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != 0;
+
+  // The source has other uses. See if we can replace the other uses with use of
+  // the result of the extension.
+  SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs;
+  for (MachineRegisterInfo::use_nodbg_iterator
+       UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end();
+       UI != UE; ++UI)
+    ReachedBBs.insert(UI->getParent());
+
+  // Uses that are in the same BB of uses of the result of the instruction.
+  SmallVector<MachineOperand*, 8> Uses;
+
+  // Uses that the result of the instruction can reach.
+  SmallVector<MachineOperand*, 8> ExtendedUses;
+
+  bool ExtendLife = true;
+  for (MachineRegisterInfo::use_nodbg_iterator
+       UI = MRI->use_nodbg_begin(SrcReg), UE = MRI->use_nodbg_end();
+       UI != UE; ++UI) {
+    MachineOperand &UseMO = UI.getOperand();
+    MachineInstr *UseMI = &*UI;
+    if (UseMI == MI)
+      continue;
+
+    if (UseMI->isPHI()) {
+      ExtendLife = false;
+      continue;
+    }
+
+    // Only accept uses of SrcReg:SubIdx.
+    if (UseSrcSubIdx && UseMO.getSubReg() != SubIdx)
+      continue;
+
+    // It's an error to translate this:
+    //
+    //    %reg1025 = <sext> %reg1024
+    //     ...
+    //    %reg1026 = SUBREG_TO_REG 0, %reg1024, 4
+    //
+    // into this:
+    //
+    //    %reg1025 = <sext> %reg1024
+    //     ...
+    //    %reg1027 = COPY %reg1025:4
+    //    %reg1026 = SUBREG_TO_REG 0, %reg1027, 4
+    //
+    // The problem here is that SUBREG_TO_REG is there to assert that an
+    // implicit zext occurs. It doesn't insert a zext instruction. If we allow
+    // the COPY here, it will give us the value after the <sext>, not the
+    // original value of %reg1024 before <sext>.
+    if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG)
+      continue;
+
+    MachineBasicBlock *UseMBB = UseMI->getParent();
+    if (UseMBB == MBB) {
+      // Local uses that come after the extension.
+      if (!LocalMIs.count(UseMI))
+        Uses.push_back(&UseMO);
+    } else if (ReachedBBs.count(UseMBB)) {
+      // Non-local uses where the result of the extension is used. Always
+      // replace these unless it's a PHI.
+      Uses.push_back(&UseMO);
+    } else if (Aggressive && DT->dominates(MBB, UseMBB)) {
+      // We may want to extend the live range of the extension result in order
+      // to replace these uses.
+      ExtendedUses.push_back(&UseMO);
+    } else {
+      // Both will be live out of the def MBB anyway. Don't extend live range of
+      // the extension result.
+      ExtendLife = false;
+      break;
+    }
+  }
+
+  if (ExtendLife && !ExtendedUses.empty())
+    // Extend the liveness of the extension result.
+    std::copy(ExtendedUses.begin(), ExtendedUses.end(),
+              std::back_inserter(Uses));
+
+  // Now replace all uses.
+  bool Changed = false;
+  if (!Uses.empty()) {
+    SmallPtrSet<MachineBasicBlock*, 4> PHIBBs;
+
+    // Look for PHI uses of the extended result, we don't want to extend the
+    // liveness of a PHI input. It breaks all kinds of assumptions down
+    // stream. A PHI use is expected to be the kill of its source values.
+    for (MachineRegisterInfo::use_nodbg_iterator
+         UI = MRI->use_nodbg_begin(DstReg), UE = MRI->use_nodbg_end();
+         UI != UE; ++UI)
+      if (UI->isPHI())
+        PHIBBs.insert(UI->getParent());
+
+    const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+    for (unsigned i = 0, e = Uses.size(); i != e; ++i) {
+      MachineOperand *UseMO = Uses[i];
+      MachineInstr *UseMI = UseMO->getParent();
+      MachineBasicBlock *UseMBB = UseMI->getParent();
+      if (PHIBBs.count(UseMBB))
+        continue;
+
+      // About to add uses of DstReg, clear DstReg's kill flags.
+      if (!Changed) {
+        MRI->clearKillFlags(DstReg);
+        MRI->constrainRegClass(DstReg, DstRC);
+      }
+
+      unsigned NewVR = MRI->createVirtualRegister(RC);
+      MachineInstr *Copy = BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY), NewVR)
+        .addReg(DstReg, 0, SubIdx);
+      // SubIdx applies to both SrcReg and DstReg when UseSrcSubIdx is set.
+      if (UseSrcSubIdx) {
+        Copy->getOperand(0).setSubReg(SubIdx);
+        Copy->getOperand(0).setIsUndef();
+      }
+      UseMO->setReg(NewVR);
+      ++NumReuse;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+/// optimizeBitcastInstr - If the instruction is a bitcast instruction A that
+/// cannot be optimized away during isel (e.g. ARM::VMOVSR, which bitcast
+/// a value cross register classes), and the source is defined by another
+/// bitcast instruction B. And if the register class of source of B matches
+/// the register class of instruction A, then it is legal to replace all uses
+/// of the def of A with source of B. e.g.
+///   %vreg0<def> = VMOVSR %vreg1
+///   %vreg3<def> = VMOVRS %vreg0
+///   Replace all uses of vreg3 with vreg1.
+
+bool PeepholeOptimizer::optimizeBitcastInstr(MachineInstr *MI,
+                                             MachineBasicBlock *MBB) {
+  unsigned NumDefs = MI->getDesc().getNumDefs();
+  unsigned NumSrcs = MI->getDesc().getNumOperands() - NumDefs;
+  if (NumDefs != 1)
+    return false;
+
+  unsigned Def = 0;
+  unsigned Src = 0;
+  for (unsigned i = 0, e = NumDefs + NumSrcs; i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (MO.isDef())
+      Def = Reg;
+    else if (Src)
+      // Multiple sources?
+      return false;
+    else
+      Src = Reg;
+  }
+
+  assert(Def && Src && "Malformed bitcast instruction!");
+
+  MachineInstr *DefMI = MRI->getVRegDef(Src);
+  if (!DefMI || !DefMI->isBitcast())
+    return false;
+
+  unsigned SrcSrc = 0;
+  NumDefs = DefMI->getDesc().getNumDefs();
+  NumSrcs = DefMI->getDesc().getNumOperands() - NumDefs;
+  if (NumDefs != 1)
+    return false;
+  for (unsigned i = 0, e = NumDefs + NumSrcs; i != e; ++i) {
+    const MachineOperand &MO = DefMI->getOperand(i);
+    if (!MO.isReg() || MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (!MO.isDef()) {
+      if (SrcSrc)
+        // Multiple sources?
+        return false;
+      else
+        SrcSrc = Reg;
+    }
+  }
+
+  if (MRI->getRegClass(SrcSrc) != MRI->getRegClass(Def))
+    return false;
+
+  MRI->replaceRegWith(Def, SrcSrc);
+  MRI->clearKillFlags(SrcSrc);
+  MI->eraseFromParent();
+  ++NumBitcasts;
+  return true;
+}
+
+/// optimizeCmpInstr - If the instruction is a compare and the previous
+/// instruction it's comparing against all ready sets (or could be modified to
+/// set) the same flag as the compare, then we can remove the comparison and use
+/// the flag from the previous instruction.
+bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI,
+                                         MachineBasicBlock *MBB) {
+  // If this instruction is a comparison against zero and isn't comparing a
+  // physical register, we can try to optimize it.
+  unsigned SrcReg, SrcReg2;
+  int CmpMask, CmpValue;
+  if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
+      TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
+      (SrcReg2 != 0 && TargetRegisterInfo::isPhysicalRegister(SrcReg2)))
+    return false;
+
+  // Attempt to optimize the comparison instruction.
+  if (TII->optimizeCompareInstr(MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) {
+    ++NumCmps;
+    return true;
+  }
+
+  return false;
+}
+
+/// Optimize a select instruction.
+bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) {
+  unsigned TrueOp = 0;
+  unsigned FalseOp = 0;
+  bool Optimizable = false;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->analyzeSelect(MI, Cond, TrueOp, FalseOp, Optimizable))
+    return false;
+  if (!Optimizable)
+    return false;
+  if (!TII->optimizeSelect(MI))
+    return false;
+  MI->eraseFromParent();
+  ++NumSelects;
+  return true;
+}
+
+/// isLoadFoldable - Check whether MI is a candidate for folding into a later
+/// instruction. We only fold loads to virtual registers and the virtual
+/// register defined has a single use.
+bool PeepholeOptimizer::isLoadFoldable(MachineInstr *MI,
+                                       unsigned &FoldAsLoadDefReg) {
+  if (!MI->canFoldAsLoad() || !MI->mayLoad())
+    return false;
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MCID.getNumDefs() != 1)
+    return false;
+
+  unsigned Reg = MI->getOperand(0).getReg();
+  // To reduce compilation time, we check MRI->hasOneUse when inserting
+  // loads. It should be checked when processing uses of the load, since
+  // uses can be removed during peephole.
+  if (!MI->getOperand(0).getSubReg() &&
+      TargetRegisterInfo::isVirtualRegister(Reg) &&
+      MRI->hasOneUse(Reg)) {
+    FoldAsLoadDefReg = Reg;
+    return true;
+  }
+  return false;
+}
+
+bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI,
+                                        SmallSet<unsigned, 4> &ImmDefRegs,
+                                 DenseMap<unsigned, MachineInstr*> &ImmDefMIs) {
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (!MI->isMoveImmediate())
+    return false;
+  if (MCID.getNumDefs() != 1)
+    return false;
+  unsigned Reg = MI->getOperand(0).getReg();
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    ImmDefMIs.insert(std::make_pair(Reg, MI));
+    ImmDefRegs.insert(Reg);
+    return true;
+  }
+
+  return false;
+}
+
+/// foldImmediate - Try folding register operands that are defined by move
+/// immediate instructions, i.e. a trivial constant folding optimization, if
+/// and only if the def and use are in the same BB.
+bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
+                                      SmallSet<unsigned, 4> &ImmDefRegs,
+                                 DenseMap<unsigned, MachineInstr*> &ImmDefMIs) {
+  for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    if (ImmDefRegs.count(Reg) == 0)
+      continue;
+    DenseMap<unsigned, MachineInstr*>::iterator II = ImmDefMIs.find(Reg);
+    assert(II != ImmDefMIs.end());
+    if (TII->FoldImmediate(MI, II->second, Reg, MRI)) {
+      ++NumImmFold;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** PEEPHOLE OPTIMIZER **********\n");
+  DEBUG(dbgs() << "********** Function: " << MF.getName() << '\n');
+
+  if (DisablePeephole)
+    return false;
+
+  TM  = &MF.getTarget();
+  TII = TM->getInstrInfo();
+  MRI = &MF.getRegInfo();
+  DT  = Aggressive ? &getAnalysis<MachineDominatorTree>() : 0;
+
+  bool Changed = false;
+
+  SmallPtrSet<MachineInstr*, 8> LocalMIs;
+  SmallSet<unsigned, 4> ImmDefRegs;
+  DenseMap<unsigned, MachineInstr*> ImmDefMIs;
+  unsigned FoldAsLoadDefReg;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = &*I;
+
+    bool SeenMoveImm = false;
+    LocalMIs.clear();
+    ImmDefRegs.clear();
+    ImmDefMIs.clear();
+    FoldAsLoadDefReg = 0;
+
+    for (MachineBasicBlock::iterator
+           MII = I->begin(), MIE = I->end(); MII != MIE; ) {
+      MachineInstr *MI = &*MII;
+      // We may be erasing MI below, increment MII now.
+      ++MII;
+      LocalMIs.insert(MI);
+
+      // If there exists an instruction which belongs to the following
+      // categories, we will discard the load candidate.
+      if (MI->isLabel() || MI->isPHI() || MI->isImplicitDef() ||
+          MI->isKill() || MI->isInlineAsm() || MI->isDebugValue() ||
+          MI->hasUnmodeledSideEffects()) {
+        FoldAsLoadDefReg = 0;
+        continue;
+      }
+      if (MI->mayStore() || MI->isCall())
+        FoldAsLoadDefReg = 0;
+
+      if ((MI->isBitcast() && optimizeBitcastInstr(MI, MBB)) ||
+          (MI->isCompare() && optimizeCmpInstr(MI, MBB)) ||
+          (MI->isSelect() && optimizeSelect(MI))) {
+        // MI is deleted.
+        LocalMIs.erase(MI);
+        Changed = true;
+        continue;
+      }
+
+      if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) {
+        SeenMoveImm = true;
+      } else {
+        Changed |= optimizeExtInstr(MI, MBB, LocalMIs);
+        // optimizeExtInstr might have created new instructions after MI
+        // and before the already incremented MII. Adjust MII so that the
+        // next iteration sees the new instructions.
+        MII = MI;
+        ++MII;
+        if (SeenMoveImm)
+          Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs);
+      }
+
+      // Check whether MI is a load candidate for folding into a later
+      // instruction. If MI is not a candidate, check whether we can fold an
+      // earlier load into MI.
+      if (!isLoadFoldable(MI, FoldAsLoadDefReg) && FoldAsLoadDefReg) {
+        // We need to fold load after optimizeCmpInstr, since optimizeCmpInstr
+        // can enable folding by converting SUB to CMP.
+        MachineInstr *DefMI = 0;
+        MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI,
+                                                      FoldAsLoadDefReg, DefMI);
+        if (FoldMI) {
+          // Update LocalMIs since we replaced MI with FoldMI and deleted DefMI.
+          DEBUG(dbgs() << "Replacing: " << *MI);
+          DEBUG(dbgs() << "     With: " << *FoldMI);
+          LocalMIs.erase(MI);
+          LocalMIs.erase(DefMI);
+          LocalMIs.insert(FoldMI);
+          MI->eraseFromParent();
+          DefMI->eraseFromParent();
+          ++NumLoadFold;
+
+          // MI is replaced with FoldMI.
+          Changed = true;
+          continue;
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
new file mode 100644
index 000000000000..53fe273a1032
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
@@ -0,0 +1,776 @@
+//===----- SchedulePostRAList.cpp - list scheduler ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "post-RA-sched"
+#include "llvm/CodeGen/Passes.h"
+#include "AggressiveAntiDepBreaker.h"
+#include "AntiDepBreaker.h"
+#include "CriticalAntiDepBreaker.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+STATISTIC(NumNoops, "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+STATISTIC(NumFixedAnti, "Number of fixed anti-dependencies");
+
+// Post-RA scheduling is enabled with
+// TargetSubtargetInfo.enablePostRAScheduler(). This flag can be used to
+// override the target.
+static cl::opt<bool>
+EnablePostRAScheduler("post-RA-scheduler",
+                       cl::desc("Enable scheduling after register allocation"),
+                       cl::init(false), cl::Hidden);
+static cl::opt<std::string>
+EnableAntiDepBreaking("break-anti-dependencies",
+                      cl::desc("Break post-RA scheduling anti-dependencies: "
+                               "\"critical\", \"all\", or \"none\""),
+                      cl::init("none"), cl::Hidden);
+
+// If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod
+static cl::opt<int>
+DebugDiv("postra-sched-debugdiv",
+                      cl::desc("Debug control MBBs that are scheduled"),
+                      cl::init(0), cl::Hidden);
+static cl::opt<int>
+DebugMod("postra-sched-debugmod",
+                      cl::desc("Debug control MBBs that are scheduled"),
+                      cl::init(0), cl::Hidden);
+
+AntiDepBreaker::~AntiDepBreaker() { }
+
+namespace {
+  class PostRAScheduler : public MachineFunctionPass {
+    const TargetInstrInfo *TII;
+    RegisterClassInfo RegClassInfo;
+
+  public:
+    static char ID;
+    PostRAScheduler() : MachineFunctionPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetPassConfig>();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn);
+  };
+  char PostRAScheduler::ID = 0;
+
+  class SchedulePostRATDList : public ScheduleDAGInstrs {
+    /// AvailableQueue - The priority queue to use for the available SUnits.
+    ///
+    LatencyPriorityQueue AvailableQueue;
+
+    /// PendingQueue - This contains all of the instructions whose operands have
+    /// been issued, but their results are not ready yet (due to the latency of
+    /// the operation).  Once the operands becomes available, the instruction is
+    /// added to the AvailableQueue.
+    std::vector<SUnit*> PendingQueue;
+
+    /// HazardRec - The hazard recognizer to use.
+    ScheduleHazardRecognizer *HazardRec;
+
+    /// AntiDepBreak - Anti-dependence breaking object, or NULL if none
+    AntiDepBreaker *AntiDepBreak;
+
+    /// AA - AliasAnalysis for making memory reference queries.
+    AliasAnalysis *AA;
+
+    /// LiveRegs - true if the register is live.
+    BitVector LiveRegs;
+
+    /// The schedule. Null SUnit*'s represent noop instructions.
+    std::vector<SUnit*> Sequence;
+
+  public:
+    SchedulePostRATDList(
+      MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
+      AliasAnalysis *AA, const RegisterClassInfo&,
+      TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
+      SmallVectorImpl<const TargetRegisterClass*> &CriticalPathRCs);
+
+    ~SchedulePostRATDList();
+
+    /// startBlock - Initialize register live-range state for scheduling in
+    /// this block.
+    ///
+    void startBlock(MachineBasicBlock *BB);
+
+    /// Initialize the scheduler state for the next scheduling region.
+    virtual void enterRegion(MachineBasicBlock *bb,
+                             MachineBasicBlock::iterator begin,
+                             MachineBasicBlock::iterator end,
+                             unsigned endcount);
+
+    /// Notify that the scheduler has finished scheduling the current region.
+    virtual void exitRegion();
+
+    /// Schedule - Schedule the instruction range using list scheduling.
+    ///
+    void schedule();
+
+    void EmitSchedule();
+
+    /// Observe - Update liveness information to account for the current
+    /// instruction, which will not be scheduled.
+    ///
+    void Observe(MachineInstr *MI, unsigned Count);
+
+    /// finishBlock - Clean up register live-range state.
+    ///
+    void finishBlock();
+
+    /// FixupKills - Fix register kill flags that have been made
+    /// invalid due to scheduling
+    ///
+    void FixupKills(MachineBasicBlock *MBB);
+
+  private:
+    void ReleaseSucc(SUnit *SU, SDep *SuccEdge);
+    void ReleaseSuccessors(SUnit *SU);
+    void ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+    void ListScheduleTopDown();
+    void StartBlockForKills(MachineBasicBlock *BB);
+
+    // ToggleKillFlag - Toggle a register operand kill flag. Other
+    // adjustments may be made to the instruction if necessary. Return
+    // true if the operand has been deleted, false if not.
+    bool ToggleKillFlag(MachineInstr *MI, MachineOperand &MO);
+
+    void dumpSchedule() const;
+  };
+}
+
+char &llvm::PostRASchedulerID = PostRAScheduler::ID;
+
+INITIALIZE_PASS(PostRAScheduler, "post-RA-sched",
+                "Post RA top-down list latency scheduler", false, false)
+
+SchedulePostRATDList::SchedulePostRATDList(
+  MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
+  AliasAnalysis *AA, const RegisterClassInfo &RCI,
+  TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
+  SmallVectorImpl<const TargetRegisterClass*> &CriticalPathRCs)
+  : ScheduleDAGInstrs(MF, MLI, MDT, /*IsPostRA=*/true), AA(AA),
+    LiveRegs(TRI->getNumRegs())
+{
+  const TargetMachine &TM = MF.getTarget();
+  const InstrItineraryData *InstrItins = TM.getInstrItineraryData();
+  HazardRec =
+    TM.getInstrInfo()->CreateTargetPostRAHazardRecognizer(InstrItins, this);
+
+  assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE ||
+          MRI.tracksLiveness()) &&
+         "Live-ins must be accurate for anti-dependency breaking");
+  AntiDepBreak =
+    ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_ALL) ?
+     (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs) :
+     ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_CRITICAL) ?
+      (AntiDepBreaker *)new CriticalAntiDepBreaker(MF, RCI) : NULL));
+}
+
+SchedulePostRATDList::~SchedulePostRATDList() {
+  delete HazardRec;
+  delete AntiDepBreak;
+}
+
+/// Initialize state associated with the next scheduling region.
+void SchedulePostRATDList::enterRegion(MachineBasicBlock *bb,
+                 MachineBasicBlock::iterator begin,
+                 MachineBasicBlock::iterator end,
+                 unsigned endcount) {
+  ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+  Sequence.clear();
+}
+
+/// Print the schedule before exiting the region.
+void SchedulePostRATDList::exitRegion() {
+  DEBUG({
+      dbgs() << "*** Final schedule ***\n";
+      dumpSchedule();
+      dbgs() << '\n';
+    });
+  ScheduleDAGInstrs::exitRegion();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// dumpSchedule - dump the scheduled Sequence.
+void SchedulePostRATDList::dumpSchedule() const {
+  for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+    if (SUnit *SU = Sequence[i])
+      SU->dump(this);
+    else
+      dbgs() << "**** NOOP ****\n";
+  }
+}
+#endif
+
+bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
+  TII = Fn.getTarget().getInstrInfo();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+  AliasAnalysis *AA = &getAnalysis<AliasAnalysis>();
+  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+
+  RegClassInfo.runOnMachineFunction(Fn);
+
+  // Check for explicit enable/disable of post-ra scheduling.
+  TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
+    TargetSubtargetInfo::ANTIDEP_NONE;
+  SmallVector<const TargetRegisterClass*, 4> CriticalPathRCs;
+  if (EnablePostRAScheduler.getPosition() > 0) {
+    if (!EnablePostRAScheduler)
+      return false;
+  } else {
+    // Check that post-RA scheduling is enabled for this target.
+    // This may upgrade the AntiDepMode.
+    const TargetSubtargetInfo &ST = Fn.getTarget().getSubtarget<TargetSubtargetInfo>();
+    if (!ST.enablePostRAScheduler(PassConfig->getOptLevel(), AntiDepMode,
+                                  CriticalPathRCs))
+      return false;
+  }
+
+  // Check for antidep breaking override...
+  if (EnableAntiDepBreaking.getPosition() > 0) {
+    AntiDepMode = (EnableAntiDepBreaking == "all")
+      ? TargetSubtargetInfo::ANTIDEP_ALL
+      : ((EnableAntiDepBreaking == "critical")
+         ? TargetSubtargetInfo::ANTIDEP_CRITICAL
+         : TargetSubtargetInfo::ANTIDEP_NONE);
+  }
+
+  DEBUG(dbgs() << "PostRAScheduler\n");
+
+  SchedulePostRATDList Scheduler(Fn, MLI, MDT, AA, RegClassInfo, AntiDepMode,
+                                 CriticalPathRCs);
+
+  // Loop over all of the basic blocks
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+#ifndef NDEBUG
+    // If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod
+    if (DebugDiv > 0) {
+      static int bbcnt = 0;
+      if (bbcnt++ % DebugDiv != DebugMod)
+        continue;
+      dbgs() << "*** DEBUG scheduling " << Fn.getName()
+             << ":BB#" << MBB->getNumber() << " ***\n";
+    }
+#endif
+
+    // Initialize register live-range state for scheduling in this block.
+    Scheduler.startBlock(MBB);
+
+    // Schedule each sequence of instructions not interrupted by a label
+    // or anything else that effectively needs to shut down scheduling.
+    MachineBasicBlock::iterator Current = MBB->end();
+    unsigned Count = MBB->size(), CurrentCount = Count;
+    for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) {
+      MachineInstr *MI = llvm::prior(I);
+      // Calls are not scheduling boundaries before register allocation, but
+      // post-ra we don't gain anything by scheduling across calls since we
+      // don't need to worry about register pressure.
+      if (MI->isCall() || TII->isSchedulingBoundary(MI, MBB, Fn)) {
+        Scheduler.enterRegion(MBB, I, Current, CurrentCount);
+        Scheduler.schedule();
+        Scheduler.exitRegion();
+        Scheduler.EmitSchedule();
+        Current = MI;
+        CurrentCount = Count - 1;
+        Scheduler.Observe(MI, CurrentCount);
+      }
+      I = MI;
+      --Count;
+      if (MI->isBundle())
+        Count -= MI->getBundleSize();
+    }
+    assert(Count == 0 && "Instruction count mismatch!");
+    assert((MBB->begin() == Current || CurrentCount != 0) &&
+           "Instruction count mismatch!");
+    Scheduler.enterRegion(MBB, MBB->begin(), Current, CurrentCount);
+    Scheduler.schedule();
+    Scheduler.exitRegion();
+    Scheduler.EmitSchedule();
+
+    // Clean up register live-range state.
+    Scheduler.finishBlock();
+
+    // Update register kills
+    Scheduler.FixupKills(MBB);
+  }
+
+  return true;
+}
+
+/// StartBlock - Initialize register live-range state for scheduling in
+/// this block.
+///
+void SchedulePostRATDList::startBlock(MachineBasicBlock *BB) {
+  // Call the superclass.
+  ScheduleDAGInstrs::startBlock(BB);
+
+  // Reset the hazard recognizer and anti-dep breaker.
+  HazardRec->Reset();
+  if (AntiDepBreak != NULL)
+    AntiDepBreak->StartBlock(BB);
+}
+
+/// Schedule - Schedule the instruction range using list scheduling.
+///
+void SchedulePostRATDList::schedule() {
+  // Build the scheduling graph.
+  buildSchedGraph(AA);
+
+  if (AntiDepBreak != NULL) {
+    unsigned Broken =
+      AntiDepBreak->BreakAntiDependencies(SUnits, RegionBegin, RegionEnd,
+                                          EndIndex, DbgValues);
+
+    if (Broken != 0) {
+      // We made changes. Update the dependency graph.
+      // Theoretically we could update the graph in place:
+      // When a live range is changed to use a different register, remove
+      // the def's anti-dependence *and* output-dependence edges due to
+      // that register, and add new anti-dependence and output-dependence
+      // edges based on the next live range of the register.
+      ScheduleDAG::clearDAG();
+      buildSchedGraph(AA);
+
+      NumFixedAnti += Broken;
+    }
+  }
+
+  DEBUG(dbgs() << "********** List Scheduling **********\n");
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+
+  AvailableQueue.initNodes(SUnits);
+  ListScheduleTopDown();
+  AvailableQueue.releaseState();
+}
+
+/// Observe - Update liveness information to account for the current
+/// instruction, which will not be scheduled.
+///
+void SchedulePostRATDList::Observe(MachineInstr *MI, unsigned Count) {
+  if (AntiDepBreak != NULL)
+    AntiDepBreak->Observe(MI, Count, EndIndex);
+}
+
+/// FinishBlock - Clean up register live-range state.
+///
+void SchedulePostRATDList::finishBlock() {
+  if (AntiDepBreak != NULL)
+    AntiDepBreak->FinishBlock();
+
+  // Call the superclass.
+  ScheduleDAGInstrs::finishBlock();
+}
+
+/// StartBlockForKills - Initialize register live-range state for updating kills
+///
+void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) {
+  // Start with no live registers.
+  LiveRegs.reset();
+
+  // Examine the live-in regs of all successors.
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+       SE = BB->succ_end(); SI != SE; ++SI) {
+    for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+         E = (*SI)->livein_end(); I != E; ++I) {
+      unsigned Reg = *I;
+      LiveRegs.set(Reg);
+      // Repeat, for all subregs.
+      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+        LiveRegs.set(*SubRegs);
+    }
+  }
+}
+
+bool SchedulePostRATDList::ToggleKillFlag(MachineInstr *MI,
+                                          MachineOperand &MO) {
+  // Setting kill flag...
+  if (!MO.isKill()) {
+    MO.setIsKill(true);
+    return false;
+  }
+
+  // If MO itself is live, clear the kill flag...
+  if (LiveRegs.test(MO.getReg())) {
+    MO.setIsKill(false);
+    return false;
+  }
+
+  // If any subreg of MO is live, then create an imp-def for that
+  // subreg and keep MO marked as killed.
+  MO.setIsKill(false);
+  bool AllDead = true;
+  const unsigned SuperReg = MO.getReg();
+  MachineInstrBuilder MIB(MF, MI);
+  for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) {
+    if (LiveRegs.test(*SubRegs)) {
+      MIB.addReg(*SubRegs, RegState::ImplicitDefine);
+      AllDead = false;
+    }
+  }
+
+  if(AllDead)
+    MO.setIsKill(true);
+  return false;
+}
+
+/// FixupKills - Fix the register kill flags, they may have been made
+/// incorrect by instruction reordering.
+///
+void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n');
+
+  BitVector killedRegs(TRI->getNumRegs());
+
+  StartBlockForKills(MBB);
+
+  // Examine block from end to start...
+  unsigned Count = MBB->size();
+  for (MachineBasicBlock::iterator I = MBB->end(), E = MBB->begin();
+       I != E; --Count) {
+    MachineInstr *MI = --I;
+    if (MI->isDebugValue())
+      continue;
+
+    // Update liveness.  Registers that are defed but not used in this
+    // instruction are now dead. Mark register and all subregs as they
+    // are completely defined.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isRegMask())
+        LiveRegs.clearBitsNotInMask(MO.getRegMask());
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0) continue;
+      if (!MO.isDef()) continue;
+      // Ignore two-addr defs.
+      if (MI->isRegTiedToUseOperand(i)) continue;
+
+      LiveRegs.reset(Reg);
+
+      // Repeat for all subregs.
+      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+        LiveRegs.reset(*SubRegs);
+    }
+
+    // Examine all used registers and set/clear kill flag. When a
+    // register is used multiple times we only set the kill flag on
+    // the first use.
+    killedRegs.reset();
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isUse()) continue;
+      unsigned Reg = MO.getReg();
+      if ((Reg == 0) || MRI.isReserved(Reg)) continue;
+
+      bool kill = false;
+      if (!killedRegs.test(Reg)) {
+        kill = true;
+        // A register is not killed if any subregs are live...
+        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+          if (LiveRegs.test(*SubRegs)) {
+            kill = false;
+            break;
+          }
+        }
+
+        // If subreg is not live, then register is killed if it became
+        // live in this instruction
+        if (kill)
+          kill = !LiveRegs.test(Reg);
+      }
+
+      if (MO.isKill() != kill) {
+        DEBUG(dbgs() << "Fixing " << MO << " in ");
+        // Warning: ToggleKillFlag may invalidate MO.
+        ToggleKillFlag(MI, MO);
+        DEBUG(MI->dump());
+      }
+
+      killedRegs.set(Reg);
+    }
+
+    // Mark any used register (that is not using undef) and subregs as
+    // now live...
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
+      unsigned Reg = MO.getReg();
+      if ((Reg == 0) || MRI.isReserved(Reg)) continue;
+
+      LiveRegs.set(Reg);
+
+      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+        LiveRegs.set(*SubRegs);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero.
+void SchedulePostRATDList::ReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+  SUnit *SuccSU = SuccEdge->getSUnit();
+
+  if (SuccEdge->isWeak()) {
+    --SuccSU->WeakPredsLeft;
+    return;
+  }
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --SuccSU->NumPredsLeft;
+
+  // Standard scheduler algorithms will recompute the depth of the successor
+  // here as such:
+  //   SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency());
+  //
+  // However, we lazily compute node depth instead. Note that
+  // ScheduleNodeTopDown has already updated the depth of this node which causes
+  // all descendents to be marked dirty. Setting the successor depth explicitly
+  // here would cause depth to be recomputed for all its ancestors. If the
+  // successor is not yet ready (because of a transitively redundant edge) then
+  // this causes depth computation to be quadratic in the size of the DAG.
+
+  // If all the node's predecessors are scheduled, this node is ready
+  // to be scheduled. Ignore the special ExitSU node.
+  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+    PendingQueue.push_back(SuccSU);
+}
+
+/// ReleaseSuccessors - Call ReleaseSucc on each of SU's successors.
+void SchedulePostRATDList::ReleaseSuccessors(SUnit *SU) {
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    ReleaseSucc(SU, &*I);
+  }
+}
+
+/// ScheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void SchedulePostRATDList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+  DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(SU->dump(this));
+
+  Sequence.push_back(SU);
+  assert(CurCycle >= SU->getDepth() &&
+         "Node scheduled above its depth!");
+  SU->setDepthToAtLeast(CurCycle);
+
+  ReleaseSuccessors(SU);
+  SU->isScheduled = true;
+  AvailableQueue.scheduledNode(SU);
+}
+
+/// ListScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void SchedulePostRATDList::ListScheduleTopDown() {
+  unsigned CurCycle = 0;
+
+  // We're scheduling top-down but we're visiting the regions in
+  // bottom-up order, so we don't know the hazards at the start of a
+  // region. So assume no hazards (this should usually be ok as most
+  // blocks are a single region).
+  HazardRec->Reset();
+
+  // Release any successors of the special Entry node.
+  ReleaseSuccessors(&EntrySU);
+
+  // Add all leaves to Available queue.
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    // It is available if it has no predecessors.
+    if (!SUnits[i].NumPredsLeft && !SUnits[i].isAvailable) {
+      AvailableQueue.push(&SUnits[i]);
+      SUnits[i].isAvailable = true;
+    }
+  }
+
+  // In any cycle where we can't schedule any instructions, we must
+  // stall or emit a noop, depending on the target.
+  bool CycleHasInsts = false;
+
+  // While Available queue is not empty, grab the node with the highest
+  // priority. If it is not ready put it back.  Schedule the node.
+  std::vector<SUnit*> NotReady;
+  Sequence.reserve(SUnits.size());
+  while (!AvailableQueue.empty() || !PendingQueue.empty()) {
+    // Check to see if any of the pending instructions are ready to issue.  If
+    // so, add them to the available queue.
+    unsigned MinDepth = ~0u;
+    for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+      if (PendingQueue[i]->getDepth() <= CurCycle) {
+        AvailableQueue.push(PendingQueue[i]);
+        PendingQueue[i]->isAvailable = true;
+        PendingQueue[i] = PendingQueue.back();
+        PendingQueue.pop_back();
+        --i; --e;
+      } else if (PendingQueue[i]->getDepth() < MinDepth)
+        MinDepth = PendingQueue[i]->getDepth();
+    }
+
+    DEBUG(dbgs() << "\n*** Examining Available\n"; AvailableQueue.dump(this));
+
+    SUnit *FoundSUnit = 0;
+    bool HasNoopHazards = false;
+    while (!AvailableQueue.empty()) {
+      SUnit *CurSUnit = AvailableQueue.pop();
+
+      ScheduleHazardRecognizer::HazardType HT =
+        HazardRec->getHazardType(CurSUnit, 0/*no stalls*/);
+      if (HT == ScheduleHazardRecognizer::NoHazard) {
+        FoundSUnit = CurSUnit;
+        break;
+      }
+
+      // Remember if this is a noop hazard.
+      HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+
+      NotReady.push_back(CurSUnit);
+    }
+
+    // Add the nodes that aren't ready back onto the available list.
+    if (!NotReady.empty()) {
+      AvailableQueue.push_all(NotReady);
+      NotReady.clear();
+    }
+
+    // If we found a node to schedule...
+    if (FoundSUnit) {
+      // ... schedule the node...
+      ScheduleNodeTopDown(FoundSUnit, CurCycle);
+      HazardRec->EmitInstruction(FoundSUnit);
+      CycleHasInsts = true;
+      if (HazardRec->atIssueLimit()) {
+        DEBUG(dbgs() << "*** Max instructions per cycle " << CurCycle << '\n');
+        HazardRec->AdvanceCycle();
+        ++CurCycle;
+        CycleHasInsts = false;
+      }
+    } else {
+      if (CycleHasInsts) {
+        DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n');
+        HazardRec->AdvanceCycle();
+      } else if (!HasNoopHazards) {
+        // Otherwise, we have a pipeline stall, but no other problem,
+        // just advance the current cycle and try again.
+        DEBUG(dbgs() << "*** Stall in cycle " << CurCycle << '\n');
+        HazardRec->AdvanceCycle();
+        ++NumStalls;
+      } else {
+        // Otherwise, we have no instructions to issue and we have instructions
+        // that will fault if we don't do this right.  This is the case for
+        // processors without pipeline interlocks and other cases.
+        DEBUG(dbgs() << "*** Emitting noop in cycle " << CurCycle << '\n');
+        HazardRec->EmitNoop();
+        Sequence.push_back(0);   // NULL here means noop
+        ++NumNoops;
+      }
+
+      ++CurCycle;
+      CycleHasInsts = false;
+    }
+  }
+
+#ifndef NDEBUG
+  unsigned ScheduledNodes = VerifyScheduledDAG(/*isBottomUp=*/false);
+  unsigned Noops = 0;
+  for (unsigned i = 0, e = Sequence.size(); i != e; ++i)
+    if (!Sequence[i])
+      ++Noops;
+  assert(Sequence.size() - Noops == ScheduledNodes &&
+         "The number of nodes scheduled doesn't match the expected number!");
+#endif // NDEBUG
+}
+
+// EmitSchedule - Emit the machine code in scheduled order.
+void SchedulePostRATDList::EmitSchedule() {
+  RegionBegin = RegionEnd;
+
+  // If first instruction was a DBG_VALUE then put it back.
+  if (FirstDbgValue)
+    BB->splice(RegionEnd, BB, FirstDbgValue);
+
+  // Then re-insert them according to the given schedule.
+  for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+    if (SUnit *SU = Sequence[i])
+      BB->splice(RegionEnd, BB, SU->getInstr());
+    else
+      // Null SUnit* is a noop.
+      TII->insertNoop(*BB, RegionEnd);
+
+    // Update the Begin iterator, as the first instruction in the block
+    // may have been scheduled later.
+    if (i == 0)
+      RegionBegin = prior(RegionEnd);
+  }
+
+  // Reinsert any remaining debug_values.
+  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+         DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+    std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+    MachineInstr *DbgValue = P.first;
+    MachineBasicBlock::iterator OrigPrivMI = P.second;
+    BB->splice(++OrigPrivMI, BB, DbgValue);
+  }
+  DbgValues.clear();
+  FirstDbgValue = NULL;
+}
diff --git a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
new file mode 100644
index 000000000000..e4e18c3bb54b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -0,0 +1,170 @@
+//===---------------------- ProcessImplicitDefs.cpp -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "processimplicitdefs"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+/// Process IMPLICIT_DEF instructions and make sure there is one implicit_def
+/// for each use. Add isUndef marker to implicit_def defs and their uses.
+class ProcessImplicitDefs : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+  SmallSetVector<MachineInstr*, 16> WorkList;
+
+  void processImplicitDef(MachineInstr *MI);
+  bool canTurnIntoImplicitDef(MachineInstr *MI);
+
+public:
+  static char ID;
+
+  ProcessImplicitDefs() : MachineFunctionPass(ID) {
+    initializeProcessImplicitDefsPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &au) const;
+
+  virtual bool runOnMachineFunction(MachineFunction &fn);
+};
+} // end anonymous namespace
+
+char ProcessImplicitDefs::ID = 0;
+char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID;
+
+INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs",
+                "Process Implicit Definitions", false, false)
+INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs",
+                "Process Implicit Definitions", false, false)
+
+void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addPreserved<AliasAnalysis>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) {
+  if (!MI->isCopyLike() &&
+      !MI->isInsertSubreg() &&
+      !MI->isRegSequence() &&
+      !MI->isPHI())
+    return false;
+  for (MIOperands MO(MI); MO.isValid(); ++MO)
+    if (MO->isReg() && MO->isUse() && MO->readsReg())
+      return false;
+  return true;
+}
+
+void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
+  DEBUG(dbgs() << "Processing " << *MI);
+  unsigned Reg = MI->getOperand(0).getReg();
+
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // For virtual regiusters, mark all uses as <undef>, and convert users to
+    // implicit-def when possible.
+    for (MachineRegisterInfo::use_nodbg_iterator UI =
+         MRI->use_nodbg_begin(Reg),
+         UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+      MachineOperand &MO = UI.getOperand();
+      MO.setIsUndef();
+      MachineInstr *UserMI = MO.getParent();
+      if (!canTurnIntoImplicitDef(UserMI))
+        continue;
+      DEBUG(dbgs() << "Converting to IMPLICIT_DEF: " << *UserMI);
+      UserMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+      WorkList.insert(UserMI);
+    }
+    MI->eraseFromParent();
+    return;
+  }
+
+  // This is a physreg implicit-def.
+  // Look for the first instruction to use or define an alias.
+  MachineBasicBlock::instr_iterator UserMI = MI;
+  MachineBasicBlock::instr_iterator UserE = MI->getParent()->instr_end();
+  bool Found = false;
+  for (++UserMI; UserMI != UserE; ++UserMI) {
+    for (MIOperands MO(UserMI); MO.isValid(); ++MO) {
+      if (!MO->isReg())
+        continue;
+      unsigned UserReg = MO->getReg();
+      if (!TargetRegisterInfo::isPhysicalRegister(UserReg) ||
+          !TRI->regsOverlap(Reg, UserReg))
+        continue;
+      // UserMI uses or redefines Reg. Set <undef> flags on all uses.
+      Found = true;
+      if (MO->isUse())
+        MO->setIsUndef();
+    }
+    if (Found)
+      break;
+  }
+
+  // If we found the using MI, we can erase the IMPLICIT_DEF.
+  if (Found) {
+    DEBUG(dbgs() << "Physreg user: " << *UserMI);
+    MI->eraseFromParent();
+    return;
+  }
+
+  // Using instr wasn't found, it could be in another block.
+  // Leave the physreg IMPLICIT_DEF, but trim any extra operands.
+  for (unsigned i = MI->getNumOperands() - 1; i; --i)
+    MI->RemoveOperand(i);
+  DEBUG(dbgs() << "Keeping physreg: " << *MI);
+}
+
+/// processImplicitDefs - Process IMPLICIT_DEF instructions and turn them into
+/// <undef> operands.
+bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) {
+
+  DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+
+  bool Changed = false;
+
+  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  assert(MRI->isSSA() && "ProcessImplicitDefs only works on SSA form.");
+  assert(WorkList.empty() && "Inconsistent worklist state");
+
+  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end();
+       MFI != MFE; ++MFI) {
+    // Scan the basic block for implicit defs.
+    for (MachineBasicBlock::instr_iterator MBBI = MFI->instr_begin(),
+         MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI)
+      if (MBBI->isImplicitDef())
+        WorkList.insert(MBBI);
+
+    if (WorkList.empty())
+      continue;
+
+    DEBUG(dbgs() << "BB#" << MFI->getNumber() << " has " << WorkList.size()
+                 << " implicit defs.\n");
+    Changed = true;
+
+    // Drain the WorkList to recursively process any new implicit defs.
+    do processImplicitDef(WorkList.pop_back_val());
+    while (!WorkList.empty());
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
new file mode 100644
index 000000000000..e5872df731a0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -0,0 +1,892 @@
+//===-- PrologEpilogInserter.cpp - Insert Prolog/Epilog code in function --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the functions frame layout, saving
+// callee saved registers, and for emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation.  After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+// This pass provides an optional shrink wrapping variant of prolog/epilog
+// insertion, enabled via --shrink-wrap. See ShrinkWrapping.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pei"
+#include "PrologEpilogInserter.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <climits>
+
+using namespace llvm;
+
+char PEI::ID = 0;
+char &llvm::PrologEpilogCodeInserterID = PEI::ID;
+
+INITIALIZE_PASS_BEGIN(PEI, "prologepilog",
+                "Prologue/Epilogue Insertion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(PEI, "prologepilog",
+                    "Prologue/Epilogue Insertion & Frame Finalization",
+                    false, false)
+
+STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
+STATISTIC(NumBytesStackSpace,
+          "Number of bytes used for stack in all functions");
+
+/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+/// frame indexes with appropriate references.
+///
+bool PEI::runOnMachineFunction(MachineFunction &Fn) {
+  const Function* F = Fn.getFunction();
+  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+  const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering();
+
+  assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs");
+
+  RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : NULL;
+  FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
+
+  // Calculate the MaxCallFrameSize and AdjustsStack variables for the
+  // function's frame information. Also eliminates call frame pseudo
+  // instructions.
+  calculateCallsInformation(Fn);
+
+  // Allow the target machine to make some adjustments to the function
+  // e.g. UsedPhysRegs before calculateCalleeSavedRegisters.
+  TFI->processFunctionBeforeCalleeSavedScan(Fn, RS);
+
+  // Scan the function for modified callee saved registers and insert spill code
+  // for any callee saved registers that are modified.
+  calculateCalleeSavedRegisters(Fn);
+
+  // Determine placement of CSR spill/restore code:
+  //  - With shrink wrapping, place spills and restores to tightly
+  //    enclose regions in the Machine CFG of the function where
+  //    they are used.
+  //  - Without shink wrapping (default), place all spills in the
+  //    entry block, all restores in return blocks.
+  placeCSRSpillsAndRestores(Fn);
+
+  // Add the code to save and restore the callee saved registers
+  if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::Naked))
+    insertCSRSpillsAndRestores(Fn);
+
+  // Allow the target machine to make final modifications to the function
+  // before the frame layout is finalized.
+  TFI->processFunctionBeforeFrameFinalized(Fn, RS);
+
+  // Calculate actual frame offsets for all abstract stack objects...
+  calculateFrameObjectOffsets(Fn);
+
+  // Add prolog and epilog code to the function.  This function is required
+  // to align the stack frame as necessary for any stack variables or
+  // called functions.  Because of this, calculateCalleeSavedRegisters()
+  // must be called before this function in order to set the AdjustsStack
+  // and MaxCallFrameSize variables.
+  if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::Naked))
+    insertPrologEpilogCode(Fn);
+
+  // Replace all MO_FrameIndex operands with physical register references
+  // and actual offsets.
+  //
+  replaceFrameIndices(Fn);
+
+  // If register scavenging is needed, as we've enabled doing it as a
+  // post-pass, scavenge the virtual registers that frame index elimiation
+  // inserted.
+  if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging)
+    scavengeFrameVirtualRegs(Fn);
+
+  // Clear any vregs created by virtual scavenging.
+  Fn.getRegInfo().clearVirtRegs();
+
+  delete RS;
+  clearAllSets();
+  return true;
+}
+
+/// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack
+/// variables for the function's frame information and eliminate call frame
+/// pseudo instructions.
+void PEI::calculateCallsInformation(MachineFunction &Fn) {
+  const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
+  const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering();
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+  unsigned MaxCallFrameSize = 0;
+  bool AdjustsStack = MFI->adjustsStack();
+
+  // Get the function call frame set-up and tear-down instruction opcode
+  int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+
+  // Early exit for targets which have no call frame setup/destroy pseudo
+  // instructions.
+  if (FrameSetupOpcode == -1 && FrameDestroyOpcode == -1)
+    return;
+
+  std::vector<MachineBasicBlock::iterator> FrameSDOps;
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+      if (I->getOpcode() == FrameSetupOpcode ||
+          I->getOpcode() == FrameDestroyOpcode) {
+        assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
+               " instructions should have a single immediate argument!");
+        unsigned Size = I->getOperand(0).getImm();
+        if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
+        AdjustsStack = true;
+        FrameSDOps.push_back(I);
+      } else if (I->isInlineAsm()) {
+        // Some inline asm's need a stack frame, as indicated by operand 1.
+        unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+        if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+          AdjustsStack = true;
+      }
+
+  MFI->setAdjustsStack(AdjustsStack);
+  MFI->setMaxCallFrameSize(MaxCallFrameSize);
+
+  for (std::vector<MachineBasicBlock::iterator>::iterator
+         i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) {
+    MachineBasicBlock::iterator I = *i;
+
+    // If call frames are not being included as part of the stack frame, and
+    // the target doesn't indicate otherwise, remove the call frame pseudos
+    // here. The sub/add sp instruction pairs are still inserted, but we don't
+    // need to track the SP adjustment for frame index elimination.
+    if (TFI->canSimplifyCallFramePseudos(Fn))
+      TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
+  }
+}
+
+
+/// calculateCalleeSavedRegisters - Scan the function for modified callee saved
+/// registers.
+void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
+  const TargetRegisterInfo *RegInfo = F.getTarget().getRegisterInfo();
+  const TargetFrameLowering *TFI = F.getTarget().getFrameLowering();
+  MachineFrameInfo *MFI = F.getFrameInfo();
+
+  // Get the callee saved register list...
+  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&F);
+
+  // These are used to keep track the callee-save area. Initialize them.
+  MinCSFrameIndex = INT_MAX;
+  MaxCSFrameIndex = 0;
+
+  // Early exit for targets which have no callee saved registers.
+  if (CSRegs == 0 || CSRegs[0] == 0)
+    return;
+
+  // In Naked functions we aren't going to save any registers.
+  if (F.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                                    Attribute::Naked))
+    return;
+
+  std::vector<CalleeSavedInfo> CSI;
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    if (F.getRegInfo().isPhysRegUsed(Reg)) {
+      // If the reg is modified, save it!
+      CSI.push_back(CalleeSavedInfo(Reg));
+    }
+  }
+
+  if (CSI.empty())
+    return;   // Early exit if no callee saved registers are modified!
+
+  unsigned NumFixedSpillSlots;
+  const TargetFrameLowering::SpillSlot *FixedSpillSlots =
+    TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
+
+  // Now that we know which registers need to be saved and restored, allocate
+  // stack slots for them.
+  for (std::vector<CalleeSavedInfo>::iterator
+         I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+    unsigned Reg = I->getReg();
+    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+
+    int FrameIdx;
+    if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
+      I->setFrameIdx(FrameIdx);
+      continue;
+    }
+
+    // Check to see if this physreg must be spilled to a particular stack slot
+    // on this target.
+    const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
+    while (FixedSlot != FixedSpillSlots+NumFixedSpillSlots &&
+           FixedSlot->Reg != Reg)
+      ++FixedSlot;
+
+    if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
+      // Nope, just spill it anywhere convenient.
+      unsigned Align = RC->getAlignment();
+      unsigned StackAlign = TFI->getStackAlignment();
+
+      // We may not be able to satisfy the desired alignment specification of
+      // the TargetRegisterClass if the stack alignment is smaller. Use the
+      // min.
+      Align = std::min(Align, StackAlign);
+      FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true);
+      if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+      if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+    } else {
+      // Spill it to the stack where we must.
+      FrameIdx = MFI->CreateFixedObject(RC->getSize(), FixedSlot->Offset, true);
+    }
+
+    I->setFrameIdx(FrameIdx);
+  }
+
+  MFI->setCalleeSavedInfo(CSI);
+}
+
+/// insertCSRSpillsAndRestores - Insert spill and restore code for
+/// callee saved registers used in the function, handling shrink wrapping.
+///
+void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
+  // Get callee saved register information.
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  MFI->setCalleeSavedInfoValid(true);
+
+  // Early exit if no callee saved registers are modified!
+  if (CSI.empty())
+    return;
+
+  const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
+  const TargetFrameLowering *TFI = Fn.getTarget().getFrameLowering();
+  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+  MachineBasicBlock::iterator I;
+
+  if (!ShrinkWrapThisFunction) {
+    // Spill using target interface.
+    I = EntryBlock->begin();
+    if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
+      for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+        // Add the callee-saved register as live-in.
+        // It's killed at the spill.
+        EntryBlock->addLiveIn(CSI[i].getReg());
+
+        // Insert the spill to the stack frame.
+        unsigned Reg = CSI[i].getReg();
+        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+        TII.storeRegToStackSlot(*EntryBlock, I, Reg, true,
+                                CSI[i].getFrameIdx(), RC, TRI);
+      }
+    }
+
+    // Restore using target interface.
+    for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
+      MachineBasicBlock* MBB = ReturnBlocks[ri];
+      I = MBB->end(); --I;
+
+      // Skip over all terminator instructions, which are part of the return
+      // sequence.
+      MachineBasicBlock::iterator I2 = I;
+      while (I2 != MBB->begin() && (--I2)->isTerminator())
+        I = I2;
+
+      bool AtStart = I == MBB->begin();
+      MachineBasicBlock::iterator BeforeI = I;
+      if (!AtStart)
+        --BeforeI;
+
+      // Restore all registers immediately before the return and any
+      // terminators that precede it.
+      if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
+        for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+          unsigned Reg = CSI[i].getReg();
+          const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+          TII.loadRegFromStackSlot(*MBB, I, Reg,
+                                   CSI[i].getFrameIdx(),
+                                   RC, TRI);
+          assert(I != MBB->begin() &&
+                 "loadRegFromStackSlot didn't insert any code!");
+          // Insert in reverse order.  loadRegFromStackSlot can insert
+          // multiple instructions.
+          if (AtStart)
+            I = MBB->begin();
+          else {
+            I = BeforeI;
+            ++I;
+          }
+        }
+      }
+    }
+    return;
+  }
+
+  // Insert spills.
+  std::vector<CalleeSavedInfo> blockCSI;
+  for (CSRegBlockMap::iterator BI = CSRSave.begin(),
+         BE = CSRSave.end(); BI != BE; ++BI) {
+    MachineBasicBlock* MBB = BI->first;
+    CSRegSet save = BI->second;
+
+    if (save.empty())
+      continue;
+
+    blockCSI.clear();
+    for (CSRegSet::iterator RI = save.begin(),
+           RE = save.end(); RI != RE; ++RI) {
+      blockCSI.push_back(CSI[*RI]);
+    }
+    assert(blockCSI.size() > 0 &&
+           "Could not collect callee saved register info");
+
+    I = MBB->begin();
+
+    // When shrink wrapping, use stack slot stores/loads.
+    for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
+      // Add the callee-saved register as live-in.
+      // It's killed at the spill.
+      MBB->addLiveIn(blockCSI[i].getReg());
+
+      // Insert the spill to the stack frame.
+      unsigned Reg = blockCSI[i].getReg();
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(*MBB, I, Reg,
+                              true,
+                              blockCSI[i].getFrameIdx(),
+                              RC, TRI);
+    }
+  }
+
+  for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
+         BE = CSRRestore.end(); BI != BE; ++BI) {
+    MachineBasicBlock* MBB = BI->first;
+    CSRegSet restore = BI->second;
+
+    if (restore.empty())
+      continue;
+
+    blockCSI.clear();
+    for (CSRegSet::iterator RI = restore.begin(),
+           RE = restore.end(); RI != RE; ++RI) {
+      blockCSI.push_back(CSI[*RI]);
+    }
+    assert(blockCSI.size() > 0 &&
+           "Could not find callee saved register info");
+
+    // If MBB is empty and needs restores, insert at the _beginning_.
+    if (MBB->empty()) {
+      I = MBB->begin();
+    } else {
+      I = MBB->end();
+      --I;
+
+      // Skip over all terminator instructions, which are part of the
+      // return sequence.
+      if (! I->isTerminator()) {
+        ++I;
+      } else {
+        MachineBasicBlock::iterator I2 = I;
+        while (I2 != MBB->begin() && (--I2)->isTerminator())
+          I = I2;
+      }
+    }
+
+    bool AtStart = I == MBB->begin();
+    MachineBasicBlock::iterator BeforeI = I;
+    if (!AtStart)
+      --BeforeI;
+
+    // Restore all registers immediately before the return and any
+    // terminators that precede it.
+    for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
+      unsigned Reg = blockCSI[i].getReg();
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(*MBB, I, Reg,
+                               blockCSI[i].getFrameIdx(),
+                               RC, TRI);
+      assert(I != MBB->begin() &&
+             "loadRegFromStackSlot didn't insert any code!");
+      // Insert in reverse order.  loadRegFromStackSlot can insert
+      // multiple instructions.
+      if (AtStart)
+        I = MBB->begin();
+      else {
+        I = BeforeI;
+        ++I;
+      }
+    }
+  }
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
+                  bool StackGrowsDown, int64_t &Offset,
+                  unsigned &MaxAlign) {
+  // If the stack grows down, add the object size to find the lowest address.
+  if (StackGrowsDown)
+    Offset += MFI->getObjectSize(FrameIdx);
+
+  unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+  // If the alignment of this object is greater than that of the stack, then
+  // increase the stack alignment to match.
+  MaxAlign = std::max(MaxAlign, Align);
+
+  // Adjust to alignment boundary.
+  Offset = (Offset + Align - 1) / Align * Align;
+
+  if (StackGrowsDown) {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+    MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+  } else {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+    MFI->setObjectOffset(FrameIdx, Offset);
+    Offset += MFI->getObjectSize(FrameIdx);
+  }
+}
+
+/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
+/// abstract stack objects.
+///
+void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
+  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+
+  bool StackGrowsDown =
+    TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  // Loop over all of the stack objects, assigning sequential addresses...
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+  // Start at the beginning of the local area.
+  // The Offset is the distance from the stack top in the direction
+  // of stack growth -- so it's always nonnegative.
+  int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+  if (StackGrowsDown)
+    LocalAreaOffset = -LocalAreaOffset;
+  assert(LocalAreaOffset >= 0
+         && "Local area offset should be in direction of stack growth");
+  int64_t Offset = LocalAreaOffset;
+
+  // If there are fixed sized objects that are preallocated in the local area,
+  // non-fixed objects can't be allocated right at the start of local area.
+  // We currently don't support filling in holes in between fixed sized
+  // objects, so we adjust 'Offset' to point to the end of last fixed sized
+  // preallocated object.
+  for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
+    int64_t FixedOff;
+    if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at lower address of
+      // the object -- which is given by offset. For down growing stack
+      // the offset is negative, so we negate the offset to get the distance.
+      FixedOff = -MFI->getObjectOffset(i);
+    } else {
+      // The maximum distance from the start pointer is at the upper
+      // address of the object.
+      FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
+    }
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+
+  // First assign frame offsets to stack objects that are used to spill
+  // callee saved registers.
+  if (StackGrowsDown) {
+    for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) {
+      // If the stack grows down, we need to add the size to find the lowest
+      // address of the object.
+      Offset += MFI->getObjectSize(i);
+
+      unsigned Align = MFI->getObjectAlignment(i);
+      // Adjust to alignment boundary
+      Offset = (Offset+Align-1)/Align*Align;
+
+      MFI->setObjectOffset(i, -Offset);        // Set the computed offset
+    }
+  } else {
+    int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex;
+    for (int i = MaxCSFI; i >= MinCSFI ; --i) {
+      unsigned Align = MFI->getObjectAlignment(i);
+      // Adjust to alignment boundary
+      Offset = (Offset+Align-1)/Align*Align;
+
+      MFI->setObjectOffset(i, Offset);
+      Offset += MFI->getObjectSize(i);
+    }
+  }
+
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // Make sure the special register scavenging spill slot is closest to the
+  // frame pointer if a frame pointer is required.
+  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+  if (RS && TFI.hasFP(Fn) && RegInfo->useFPForScavengingIndex(Fn) &&
+      !RegInfo->needsStackRealignment(Fn)) {
+    SmallVector<int, 2> SFIs;
+    RS->getScavengingFrameIndices(SFIs);
+    for (SmallVector<int, 2>::iterator I = SFIs.begin(),
+         IE = SFIs.end(); I != IE; ++I)
+      AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // FIXME: Once this is working, then enable flag will change to a target
+  // check for whether the frame is large enough to want to use virtual
+  // frame index registers. Functions which don't want/need this optimization
+  // will continue to use the existing code path.
+  if (MFI->getUseLocalStackAllocationBlock()) {
+    unsigned Align = MFI->getLocalFrameMaxAlign();
+
+    // Adjust to alignment boundary.
+    Offset = (Offset + Align - 1) / Align * Align;
+
+    DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+    // Resolve offsets for objects in the local block.
+    for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
+      std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
+      int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+      DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+            FIOffset << "]\n");
+      MFI->setObjectOffset(Entry.first, FIOffset);
+    }
+    // Allocate the local block
+    Offset += MFI->getLocalFrameSize();
+
+    MaxAlign = std::max(Align, MaxAlign);
+  }
+
+  // Make sure that the stack protector comes before the local variables on the
+  // stack.
+  SmallSet<int, 16> LargeStackObjs;
+  if (MFI->getStackProtectorIndex() >= 0) {
+    AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown,
+                      Offset, MaxAlign);
+
+    // Assign large stack objects first.
+    for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+      if (MFI->isObjectPreAllocated(i) &&
+          MFI->getUseLocalStackAllocationBlock())
+        continue;
+      if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+        continue;
+      if (RS && RS->isScavengingFrameIndex((int)i))
+        continue;
+      if (MFI->isDeadObjectIndex(i))
+        continue;
+      if (MFI->getStackProtectorIndex() == (int)i)
+        continue;
+      if (!MFI->MayNeedStackProtector(i))
+        continue;
+
+      AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+      LargeStackObjs.insert(i);
+    }
+  }
+
+  // Then assign frame offsets to stack objects that are not used to spill
+  // callee saved registers.
+  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+    if (MFI->isObjectPreAllocated(i) &&
+        MFI->getUseLocalStackAllocationBlock())
+      continue;
+    if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+      continue;
+    if (RS && RS->isScavengingFrameIndex((int)i))
+      continue;
+    if (MFI->isDeadObjectIndex(i))
+      continue;
+    if (MFI->getStackProtectorIndex() == (int)i)
+      continue;
+    if (LargeStackObjs.count(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // Make sure the special register scavenging spill slot is closest to the
+  // stack pointer.
+  if (RS && (!TFI.hasFP(Fn) || RegInfo->needsStackRealignment(Fn) ||
+             !RegInfo->useFPForScavengingIndex(Fn))) {
+    SmallVector<int, 2> SFIs;
+    RS->getScavengingFrameIndices(SFIs);
+    for (SmallVector<int, 2>::iterator I = SFIs.begin(),
+         IE = SFIs.end(); I != IE; ++I)
+      AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  if (!TFI.targetHandlesStackFrameRounding()) {
+    // If we have reserved argument space for call sites in the function
+    // immediately on entry to the current function, count it as part of the
+    // overall stack size.
+    if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
+      Offset += MFI->getMaxCallFrameSize();
+
+    // Round up the size to a multiple of the alignment.  If the function has
+    // any calls or alloca's, align to the target's StackAlignment value to
+    // ensure that the callee's frame or the alloca data is suitably aligned;
+    // otherwise, for leaf functions, align to the TransientStackAlignment
+    // value.
+    unsigned StackAlign;
+    if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
+        (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
+      StackAlign = TFI.getStackAlignment();
+    else
+      StackAlign = TFI.getTransientStackAlignment();
+
+    // If the frame pointer is eliminated, all frame offsets will be relative to
+    // SP not FP. Align to MaxAlign so this works.
+    StackAlign = std::max(StackAlign, MaxAlign);
+    unsigned AlignMask = StackAlign - 1;
+    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+  }
+
+  // Update frame info to pretend that this is part of the stack...
+  int64_t StackSize = Offset - LocalAreaOffset;
+  MFI->setStackSize(StackSize);
+  NumBytesStackSpace += StackSize;
+}
+
+/// insertPrologEpilogCode - Scan the function for modified callee saved
+/// registers, insert spill code for these callee saved registers, then add
+/// prolog and epilog code to the function.
+///
+void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
+  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+
+  // Add prologue to the function...
+  TFI.emitPrologue(Fn);
+
+  // Add epilogue to restore the callee-save registers in each exiting block
+  for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
+    // If last instruction is a return instruction, add an epilogue
+    if (!I->empty() && I->back().isReturn())
+      TFI.emitEpilogue(Fn, *I);
+  }
+
+  // Emit additional code that is required to support segmented stacks, if
+  // we've been asked for it.  This, when linked with a runtime with support
+  // for segmented stacks (libgcc is one), will result in allocating stack
+  // space in small chunks instead of one large contiguous block.
+  if (Fn.getTarget().Options.EnableSegmentedStacks)
+    TFI.adjustForSegmentedStacks(Fn);
+
+  // Emit additional code that is required to explicitly handle the stack in
+  // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
+  // approach is rather similar to that of Segmented Stacks, but it uses a
+  // different conditional check and another BIF for allocating more stack
+  // space.
+  if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE)
+    TFI.adjustForHiPEPrologue(Fn);
+}
+
+/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
+/// register references and actual offsets.
+///
+void PEI::replaceFrameIndices(MachineFunction &Fn) {
+  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
+
+  const TargetMachine &TM = Fn.getTarget();
+  assert(TM.getRegisterInfo() && "TM::getRegisterInfo() must be implemented!");
+  const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  const TargetFrameLowering *TFI = TM.getFrameLowering();
+  bool StackGrowsDown =
+    TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+  int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+
+  for (MachineFunction::iterator BB = Fn.begin(),
+         E = Fn.end(); BB != E; ++BB) {
+#ifndef NDEBUG
+    int SPAdjCount = 0; // frame setup / destroy count.
+#endif
+    int SPAdj = 0;  // SP offset due to call frame setup / destroy.
+    if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
+
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+
+      if (I->getOpcode() == FrameSetupOpcode ||
+          I->getOpcode() == FrameDestroyOpcode) {
+#ifndef NDEBUG
+        // Track whether we see even pairs of them
+        SPAdjCount += I->getOpcode() == FrameSetupOpcode ? 1 : -1;
+#endif
+        // Remember how much SP has been adjusted to create the call
+        // frame.
+        int Size = I->getOperand(0).getImm();
+
+        if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) ||
+            (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode))
+          Size = -Size;
+
+        SPAdj += Size;
+
+        MachineBasicBlock::iterator PrevI = BB->end();
+        if (I != BB->begin()) PrevI = prior(I);
+        TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
+
+        // Visit the instructions created by eliminateCallFramePseudoInstr().
+        if (PrevI == BB->end())
+          I = BB->begin();     // The replaced instr was the first in the block.
+        else
+          I = llvm::next(PrevI);
+        continue;
+      }
+
+      MachineInstr *MI = I;
+      bool DoIncr = true;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        if (!MI->getOperand(i).isFI())
+            continue;
+
+        // Some instructions (e.g. inline asm instructions) can have
+        // multiple frame indices and/or cause eliminateFrameIndex
+        // to insert more than one instruction. We need the register
+        // scavenger to go through all of these instructions so that
+        // it can update its register information. We keep the
+        // iterator at the point before insertion so that we can
+        // revisit them in full.
+        bool AtBeginning = (I == BB->begin());
+        if (!AtBeginning) --I;
+
+        // If this instruction has a FrameIndex operand, we need to
+        // use that target machine register info object to eliminate
+        // it.
+        TRI.eliminateFrameIndex(MI, SPAdj, i,
+                                FrameIndexVirtualScavenging ?  NULL : RS);
+
+        // Reset the iterator if we were at the beginning of the BB.
+        if (AtBeginning) {
+          I = BB->begin();
+          DoIncr = false;
+        }
+
+        MI = 0;
+        break;
+      }
+
+      if (DoIncr && I != BB->end()) ++I;
+
+      // Update register states.
+      if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
+    }
+
+    // If we have evenly matched pairs of frame setup / destroy instructions,
+    // make sure the adjustments come out to zero. If we don't have matched
+    // pairs, we can't be sure the missing bit isn't in another basic block
+    // due to a custom inserter playing tricks, so just asserting SPAdj==0
+    // isn't sufficient. See tMOVCC on Thumb1, for example.
+    assert((SPAdjCount || SPAdj == 0) &&
+           "Unbalanced call frame setup / destroy pairs?");
+  }
+}
+
+/// scavengeFrameVirtualRegs - Replace all frame index virtual registers
+/// with physical registers. Use the register scavenger to find an
+/// appropriate register to use.
+///
+/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
+/// iterate over the vreg use list, which at this point only contains machine
+/// operands for which eliminateFrameIndex need a new scratch reg.
+void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
+  // Run through the instructions and find any virtual registers.
+  for (MachineFunction::iterator BB = Fn.begin(),
+       E = Fn.end(); BB != E; ++BB) {
+    RS->enterBasicBlock(BB);
+
+    int SPAdj = 0;
+
+    // The instruction stream may change in the loop, so check BB->end()
+    // directly.
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+      MachineInstr *MI = I;
+      MachineBasicBlock::iterator J = llvm::next(I);
+      MachineBasicBlock::iterator P = I == BB->begin() ?
+        MachineBasicBlock::iterator(NULL) : llvm::prior(I);
+
+      // RS should process this instruction before we might scavenge at this
+      // location. This is because we might be replacing a virtual register
+      // defined by this instruction, and if so, registers killed by this
+      // instruction are available, and defined registers are not.
+      RS->forward(I);
+
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        if (MI->getOperand(i).isReg()) {
+          MachineOperand &MO = MI->getOperand(i);
+          unsigned Reg = MO.getReg();
+          if (Reg == 0)
+            continue;
+          if (!TargetRegisterInfo::isVirtualRegister(Reg))
+            continue;
+
+          // When we first encounter a new virtual register, it
+          // must be a definition.
+          assert(MI->getOperand(i).isDef() &&
+                 "frame index virtual missing def!");
+          // Scavenge a new scratch register
+          const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
+          unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
+
+          ++NumScavengedRegs;
+
+          // Replace this reference to the virtual register with the
+          // scratch register.
+          assert (ScratchReg && "Missing scratch register!");
+          Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
+
+          // Because this instruction was processed by the RS before this
+          // register was allocated, make sure that the RS now records the
+          // register as being used.
+          RS->setUsed(ScratchReg);
+        }
+      }
+
+      // If the scavenger needed to use one of its spill slots, the
+      // spill code will have been inserted in between I and J. This is a
+      // problem because we need the spill code before I: Move I to just
+      // prior to J.
+      if (I != llvm::prior(J)) {
+        BB->splice(J, BB, I);
+
+        // Before we move I, we need to prepare the RS to visit I again.
+        // Specifically, RS will assert if it sees uses of registers that
+        // it believes are undefined. Because we have already processed
+        // register kills in I, when it visits I again, it will believe that
+        // those registers are undefined. To avoid this situation, unprocess
+        // the instruction I.
+        assert(RS->getCurrentPosition() == I &&
+          "The register scavenger has an unexpected position");
+        I = P;
+        RS->unprocess(P);
+
+        // RS->skipTo(I == BB->begin() ? NULL : llvm::prior(I));
+      } else
+        ++I;
+    }
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.h b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.h
new file mode 100644
index 000000000000..87fff9afb309
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.h
@@ -0,0 +1,173 @@
+//===-- PrologEpilogInserter.h - Prolog/Epilog code insertion -*- C++ -* --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the functions frame layout, saving
+// callee saved registers, and for emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation.  After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+// This pass also implements a shrink wrapping variant of prolog/epilog
+// insertion.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PEI_H
+#define LLVM_CODEGEN_PEI_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+  class RegScavenger;
+  class MachineBasicBlock;
+
+  class PEI : public MachineFunctionPass {
+  public:
+    static char ID;
+    PEI() : MachineFunctionPass(ID) {
+      initializePEIPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+    /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+    /// frame indexes with appropriate references.
+    ///
+    bool runOnMachineFunction(MachineFunction &Fn);
+
+  private:
+    RegScavenger *RS;
+
+    // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved
+    // stack frame indexes.
+    unsigned MinCSFrameIndex, MaxCSFrameIndex;
+
+    // Analysis info for spill/restore placement.
+    // "CSR": "callee saved register".
+
+    // CSRegSet contains indices into the Callee Saved Register Info
+    // vector built by calculateCalleeSavedRegisters() and accessed
+    // via MF.getFrameInfo()->getCalleeSavedInfo().
+    typedef SparseBitVector<> CSRegSet;
+
+    // CSRegBlockMap maps MachineBasicBlocks to sets of callee
+    // saved register indices.
+    typedef DenseMap<MachineBasicBlock*, CSRegSet> CSRegBlockMap;
+
+    // Set and maps for computing CSR spill/restore placement:
+    //  used in function (UsedCSRegs)
+    //  used in a basic block (CSRUsed)
+    //  anticipatable in a basic block (Antic{In,Out})
+    //  available in a basic block (Avail{In,Out})
+    //  to be spilled at the entry to a basic block (CSRSave)
+    //  to be restored at the end of a basic block (CSRRestore)
+    CSRegSet UsedCSRegs;
+    CSRegBlockMap CSRUsed;
+    CSRegBlockMap AnticIn, AnticOut;
+    CSRegBlockMap AvailIn, AvailOut;
+    CSRegBlockMap CSRSave;
+    CSRegBlockMap CSRRestore;
+
+    // Entry and return blocks of the current function.
+    MachineBasicBlock* EntryBlock;
+    SmallVector<MachineBasicBlock*, 4> ReturnBlocks;
+
+    // Map of MBBs to top level MachineLoops.
+    DenseMap<MachineBasicBlock*, MachineLoop*> TLLoops;
+
+    // Flag to control shrink wrapping per-function:
+    // may choose to skip shrink wrapping for certain
+    // functions.
+    bool ShrinkWrapThisFunction;
+
+    // Flag to control whether to use the register scavenger to resolve
+    // frame index materialization registers. Set according to
+    // TRI->requiresFrameIndexScavenging() for the curren function.
+    bool FrameIndexVirtualScavenging;
+
+#ifndef NDEBUG
+    // Machine function handle.
+    MachineFunction* MF;
+
+    // Flag indicating that the current function
+    // has at least one "short" path in the machine
+    // CFG from the entry block to an exit block.
+    bool HasFastExitPath;
+#endif
+
+    bool calculateSets(MachineFunction &Fn);
+    bool calcAnticInOut(MachineBasicBlock* MBB);
+    bool calcAvailInOut(MachineBasicBlock* MBB);
+    void calculateAnticAvail(MachineFunction &Fn);
+    bool addUsesForMEMERegion(MachineBasicBlock* MBB,
+                              SmallVector<MachineBasicBlock*, 4>& blks);
+    bool addUsesForTopLevelLoops(SmallVector<MachineBasicBlock*, 4>& blks);
+    bool calcSpillPlacements(MachineBasicBlock* MBB,
+                             SmallVector<MachineBasicBlock*, 4> &blks,
+                             CSRegBlockMap &prevSpills);
+    bool calcRestorePlacements(MachineBasicBlock* MBB,
+                               SmallVector<MachineBasicBlock*, 4> &blks,
+                               CSRegBlockMap &prevRestores);
+    void placeSpillsAndRestores(MachineFunction &Fn);
+    void placeCSRSpillsAndRestores(MachineFunction &Fn);
+    void calculateCallsInformation(MachineFunction &Fn);
+    void calculateCalleeSavedRegisters(MachineFunction &Fn);
+    void insertCSRSpillsAndRestores(MachineFunction &Fn);
+    void calculateFrameObjectOffsets(MachineFunction &Fn);
+    void replaceFrameIndices(MachineFunction &Fn);
+    void scavengeFrameVirtualRegs(MachineFunction &Fn);
+    void insertPrologEpilogCode(MachineFunction &Fn);
+
+    // Initialize DFA sets, called before iterations.
+    void clearAnticAvailSets();
+    // Clear all sets constructed by shrink wrapping.
+    void clearAllSets();
+
+    // Initialize all shrink wrapping data.
+    void initShrinkWrappingInfo();
+
+    // Convienences for dealing with machine loops.
+    MachineBasicBlock* getTopLevelLoopPreheader(MachineLoop* LP);
+    MachineLoop* getTopLevelLoopParent(MachineLoop *LP);
+
+    // Propgate CSRs used in MBB to all MBBs of loop LP.
+    void propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP);
+
+    // Convenience for recognizing return blocks.
+    bool isReturnBlock(MachineBasicBlock* MBB);
+
+#ifndef NDEBUG
+    // Debugging methods.
+
+    // Mark this function as having fast exit paths.
+    void findFastExitPath();
+
+    // Verify placement of spills/restores.
+    void verifySpillRestorePlacement();
+
+    std::string getBasicBlockName(const MachineBasicBlock* MBB);
+    std::string stringifyCSRegSet(const CSRegSet& s);
+    void dumpSet(const CSRegSet& s);
+    void dumpUsed(MachineBasicBlock* MBB);
+    void dumpAllUsed();
+    void dumpSets(MachineBasicBlock* MBB);
+    void dumpSets1(MachineBasicBlock* MBB);
+    void dumpAllSets();
+    void dumpSRSets();
+#endif
+
+  };
+} // End llvm namespace
+#endif
diff --git a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp
new file mode 100644
index 000000000000..85649111d7f1
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp
@@ -0,0 +1,132 @@
+//===-- llvm/CodeGen/PseudoSourceValue.cpp ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PseudoSourceValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+using namespace llvm;
+
+namespace {
+struct PSVGlobalsTy {
+  // PseudoSourceValues are immutable so don't need locking.
+  const PseudoSourceValue PSVs[4];
+  sys::Mutex Lock;  // Guards FSValues, but not the values inside it.
+  std::map<int, const PseudoSourceValue *> FSValues;
+
+  PSVGlobalsTy() : PSVs() {}
+  ~PSVGlobalsTy() {
+    for (std::map<int, const PseudoSourceValue *>::iterator
+           I = FSValues.begin(), E = FSValues.end(); I != E; ++I) {
+      delete I->second;
+    }
+  }
+};
+
+static ManagedStatic<PSVGlobalsTy> PSVGlobals;
+
+}  // anonymous namespace
+
+const PseudoSourceValue *PseudoSourceValue::getStack()
+{ return &PSVGlobals->PSVs[0]; }
+const PseudoSourceValue *PseudoSourceValue::getGOT()
+{ return &PSVGlobals->PSVs[1]; }
+const PseudoSourceValue *PseudoSourceValue::getJumpTable()
+{ return &PSVGlobals->PSVs[2]; }
+const PseudoSourceValue *PseudoSourceValue::getConstantPool()
+{ return &PSVGlobals->PSVs[3]; }
+
+static const char *const PSVNames[] = {
+  "Stack",
+  "GOT",
+  "JumpTable",
+  "ConstantPool"
+};
+
+// FIXME: THIS IS A HACK!!!!
+// Eventually these should be uniqued on LLVMContext rather than in a managed
+// static.  For now, we can safely use the global context for the time being to
+// squeak by.
+PseudoSourceValue::PseudoSourceValue(enum ValueTy Subclass) :
+  Value(Type::getInt8PtrTy(getGlobalContext()),
+        Subclass) {}
+
+void PseudoSourceValue::printCustom(raw_ostream &O) const {
+  O << PSVNames[this - PSVGlobals->PSVs];
+}
+
+const PseudoSourceValue *PseudoSourceValue::getFixedStack(int FI) {
+  PSVGlobalsTy &PG = *PSVGlobals;
+  sys::ScopedLock locked(PG.Lock);
+  const PseudoSourceValue *&V = PG.FSValues[FI];
+  if (!V)
+    V = new FixedStackPseudoSourceValue(FI);
+  return V;
+}
+
+bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const {
+  if (this == getStack())
+    return false;
+  if (this == getGOT() ||
+      this == getConstantPool() ||
+      this == getJumpTable())
+    return true;
+  llvm_unreachable("Unknown PseudoSourceValue!");
+}
+
+bool PseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const {
+  if (this == getStack() ||
+      this == getGOT() ||
+      this == getConstantPool() ||
+      this == getJumpTable())
+    return false;
+  llvm_unreachable("Unknown PseudoSourceValue!");
+}
+
+bool PseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const {
+  if (this == getGOT() ||
+      this == getConstantPool() ||
+      this == getJumpTable())
+    return false;
+  return true;
+}
+
+bool FixedStackPseudoSourceValue::isConstant(const MachineFrameInfo *MFI) const{
+  return MFI && MFI->isImmutableObjectIndex(FI);
+}
+
+bool FixedStackPseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const {
+  // Negative frame indices are used for special things that don't
+  // appear in LLVM IR. Non-negative indices may be used for things
+  // like static allocas.
+  if (!MFI)
+    return FI >= 0;
+  // Spill slots should not alias others.
+  return !MFI->isFixedObjectIndex(FI) && !MFI->isSpillSlotObjectIndex(FI);
+}
+
+bool FixedStackPseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const {
+  if (!MFI)
+    return true;
+  // Spill slots will not alias any LLVM IR value.
+  return !MFI->isSpillSlotObjectIndex(FI);
+}
+
+void FixedStackPseudoSourceValue::printCustom(raw_ostream &OS) const {
+  OS << "FixedStack" << FI;
+}
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBase.cpp b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
new file mode 100644
index 000000000000..c0355903574f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -0,0 +1,145 @@
+//===-- RegAllocBase.cpp - Register Allocator Base Class ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RegAllocBase class which provides comon functionality
+// for LiveIntervalUnion-based register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "RegAllocBase.h"
+#include "Spiller.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#ifndef NDEBUG
+#include "llvm/ADT/SparseBitVector.h"
+#endif
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Timer.h"
+
+using namespace llvm;
+
+STATISTIC(NumNewQueued    , "Number of new live ranges queued");
+
+// Temporary verification option until we can put verification inside
+// MachineVerifier.
+static cl::opt<bool, true>
+VerifyRegAlloc("verify-regalloc", cl::location(RegAllocBase::VerifyEnabled),
+               cl::desc("Verify during register allocation"));
+
+const char *RegAllocBase::TimerGroupName = "Register Allocation";
+bool RegAllocBase::VerifyEnabled = false;
+
+//===----------------------------------------------------------------------===//
+//                         RegAllocBase Implementation
+//===----------------------------------------------------------------------===//
+
+void RegAllocBase::init(VirtRegMap &vrm,
+                        LiveIntervals &lis,
+                        LiveRegMatrix &mat) {
+  TRI = &vrm.getTargetRegInfo();
+  MRI = &vrm.getRegInfo();
+  VRM = &vrm;
+  LIS = &lis;
+  Matrix = &mat;
+  MRI->freezeReservedRegs(vrm.getMachineFunction());
+  RegClassInfo.runOnMachineFunction(vrm.getMachineFunction());
+}
+
+// Visit all the live registers. If they are already assigned to a physical
+// register, unify them with the corresponding LiveIntervalUnion, otherwise push
+// them on the priority queue for later assignment.
+void RegAllocBase::seedLiveRegs() {
+  NamedRegionTimer T("Seed Live Regs", TimerGroupName, TimePassesIsEnabled);
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (MRI->reg_nodbg_empty(Reg))
+      continue;
+    enqueue(&LIS->getInterval(Reg));
+  }
+}
+
+// Top-level driver to manage the queue of unassigned VirtRegs and call the
+// selectOrSplit implementation.
+void RegAllocBase::allocatePhysRegs() {
+  seedLiveRegs();
+
+  // Continue assigning vregs one at a time to available physical registers.
+  while (LiveInterval *VirtReg = dequeue()) {
+    assert(!VRM->hasPhys(VirtReg->reg) && "Register already assigned");
+
+    // Unused registers can appear when the spiller coalesces snippets.
+    if (MRI->reg_nodbg_empty(VirtReg->reg)) {
+      DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n');
+      LIS->removeInterval(VirtReg->reg);
+      continue;
+    }
+
+    // Invalidate all interference queries, live ranges could have changed.
+    Matrix->invalidateVirtRegs();
+
+    // selectOrSplit requests the allocator to return an available physical
+    // register if possible and populate a list of new live intervals that
+    // result from splitting.
+    DEBUG(dbgs() << "\nselectOrSplit "
+                 << MRI->getRegClass(VirtReg->reg)->getName()
+                 << ':' << PrintReg(VirtReg->reg) << ' ' << *VirtReg << '\n');
+    typedef SmallVector<LiveInterval*, 4> VirtRegVec;
+    VirtRegVec SplitVRegs;
+    unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
+
+    if (AvailablePhysReg == ~0u) {
+      // selectOrSplit failed to find a register!
+      const char *Msg = "ran out of registers during register allocation";
+      // Probably caused by an inline asm.
+      MachineInstr *MI;
+      for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(VirtReg->reg);
+           (MI = I.skipInstruction());)
+        if (MI->isInlineAsm())
+          break;
+      if (MI)
+        MI->emitError(Msg);
+      else
+        report_fatal_error(Msg);
+      // Keep going after reporting the error.
+      VRM->assignVirt2Phys(VirtReg->reg,
+                 RegClassInfo.getOrder(MRI->getRegClass(VirtReg->reg)).front());
+      continue;
+    }
+
+    if (AvailablePhysReg)
+      Matrix->assign(*VirtReg, AvailablePhysReg);
+
+    for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end();
+         I != E; ++I) {
+      LiveInterval *SplitVirtReg = *I;
+      assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
+      if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
+        DEBUG(dbgs() << "not queueing unused  " << *SplitVirtReg << '\n');
+        LIS->removeInterval(SplitVirtReg->reg);
+        continue;
+      }
+      DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n");
+      assert(TargetRegisterInfo::isVirtualRegister(SplitVirtReg->reg) &&
+             "expect split value in virtual register");
+      enqueue(SplitVirtReg);
+      ++NumNewQueued;
+    }
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBase.h b/contrib/llvm/lib/CodeGen/RegAllocBase.h
new file mode 100644
index 000000000000..064e40f06b7b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocBase.h
@@ -0,0 +1,108 @@
+//===-- RegAllocBase.h - basic regalloc interface and driver --*- C++ -*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RegAllocBase class, which is the skeleton of a basic
+// register allocation algorithm and interface for extending it. It provides the
+// building blocks on which to construct other experimental allocators and test
+// the validity of two principles:
+//
+// - If virtual and physical register liveness is modeled using intervals, then
+// on-the-fly interference checking is cheap. Furthermore, interferences can be
+// lazily cached and reused.
+//
+// - Register allocation complexity, and generated code performance is
+// determined by the effectiveness of live range splitting rather than optimal
+// coloring.
+//
+// Following the first principle, interfering checking revolves around the
+// LiveIntervalUnion data structure.
+//
+// To fulfill the second principle, the basic allocator provides a driver for
+// incremental splitting. It essentially punts on the problem of register
+// coloring, instead driving the assignment of virtual to physical registers by
+// the cost of splitting. The basic allocator allows for heuristic reassignment
+// of registers, if a more sophisticated allocator chooses to do that.
+//
+// This framework provides a way to engineer the compile time vs. code
+// quality trade-off without relying on a particular theoretical solver.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCBASE
+#define LLVM_CODEGEN_REGALLOCBASE
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+namespace llvm {
+
+template<typename T> class SmallVectorImpl;
+class TargetRegisterInfo;
+class VirtRegMap;
+class LiveIntervals;
+class LiveRegMatrix;
+class Spiller;
+
+/// RegAllocBase provides the register allocation driver and interface that can
+/// be extended to add interesting heuristics.
+///
+/// Register allocators must override the selectOrSplit() method to implement
+/// live range splitting. They must also override enqueue/dequeue to provide an
+/// assignment order.
+class RegAllocBase {
+protected:
+  const TargetRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+  VirtRegMap *VRM;
+  LiveIntervals *LIS;
+  LiveRegMatrix *Matrix;
+  RegisterClassInfo RegClassInfo;
+
+  RegAllocBase(): TRI(0), MRI(0), VRM(0), LIS(0), Matrix(0) {}
+
+  virtual ~RegAllocBase() {}
+
+  // A RegAlloc pass should call this before allocatePhysRegs.
+  void init(VirtRegMap &vrm, LiveIntervals &lis, LiveRegMatrix &mat);
+
+  // The top-level driver. The output is a VirtRegMap that us updated with
+  // physical register assignments.
+  void allocatePhysRegs();
+
+  // Get a temporary reference to a Spiller instance.
+  virtual Spiller &spiller() = 0;
+
+  /// enqueue - Add VirtReg to the priority queue of unassigned registers.
+  virtual void enqueue(LiveInterval *LI) = 0;
+
+  /// dequeue - Return the next unassigned register, or NULL.
+  virtual LiveInterval *dequeue() = 0;
+
+  // A RegAlloc pass should override this to provide the allocation heuristics.
+  // Each call must guarantee forward progess by returning an available PhysReg
+  // or new set of split live virtual registers. It is up to the splitter to
+  // converge quickly toward fully spilled live ranges.
+  virtual unsigned selectOrSplit(LiveInterval &VirtReg,
+                                 SmallVectorImpl<LiveInterval*> &splitLVRs) = 0;
+
+  // Use this group name for NamedRegionTimer.
+  static const char *TimerGroupName;
+
+public:
+  /// VerifyEnabled - True when -verify-regalloc is given.
+  static bool VerifyEnabled;
+
+private:
+  void seedLiveRegs();
+};
+
+} // end namespace llvm
+
+#endif // !defined(LLVM_CODEGEN_REGALLOCBASE)
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
new file mode 100644
index 000000000000..0b6dc68cdf09
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -0,0 +1,293 @@
+//===-- RegAllocBasic.cpp - Basic Register Allocator ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RABasic function pass, which provides a minimal
+// implementation of the basic register allocator.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/Passes.h"
+#include "AllocationOrder.h"
+#include "LiveDebugVariables.h"
+#include "RegAllocBase.h"
+#include "Spiller.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cstdlib>
+#include <queue>
+
+using namespace llvm;
+
+static RegisterRegAlloc basicRegAlloc("basic", "basic register allocator",
+                                      createBasicRegisterAllocator);
+
+namespace {
+  struct CompSpillWeight {
+    bool operator()(LiveInterval *A, LiveInterval *B) const {
+      return A->weight < B->weight;
+    }
+  };
+}
+
+namespace {
+/// RABasic provides a minimal implementation of the basic register allocation
+/// algorithm. It prioritizes live virtual registers by spill weight and spills
+/// whenever a register is unavailable. This is not practical in production but
+/// provides a useful baseline both for measuring other allocators and comparing
+/// the speed of the basic algorithm against other styles of allocators.
+class RABasic : public MachineFunctionPass, public RegAllocBase
+{
+  // context
+  MachineFunction *MF;
+
+  // state
+  std::auto_ptr<Spiller> SpillerInstance;
+  std::priority_queue<LiveInterval*, std::vector<LiveInterval*>,
+                      CompSpillWeight> Queue;
+
+  // Scratch space.  Allocated here to avoid repeated malloc calls in
+  // selectOrSplit().
+  BitVector UsableRegs;
+
+public:
+  RABasic();
+
+  /// Return the pass name.
+  virtual const char* getPassName() const {
+    return "Basic Register Allocator";
+  }
+
+  /// RABasic analysis usage.
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  virtual void releaseMemory();
+
+  virtual Spiller &spiller() { return *SpillerInstance; }
+
+  virtual float getPriority(LiveInterval *LI) { return LI->weight; }
+
+  virtual void enqueue(LiveInterval *LI) {
+    Queue.push(LI);
+  }
+
+  virtual LiveInterval *dequeue() {
+    if (Queue.empty())
+      return 0;
+    LiveInterval *LI = Queue.top();
+    Queue.pop();
+    return LI;
+  }
+
+  virtual unsigned selectOrSplit(LiveInterval &VirtReg,
+                                 SmallVectorImpl<LiveInterval*> &SplitVRegs);
+
+  /// Perform register allocation.
+  virtual bool runOnMachineFunction(MachineFunction &mf);
+
+  // Helper for spilling all live virtual registers currently unified under preg
+  // that interfere with the most recently queried lvr.  Return true if spilling
+  // was successful, and append any new spilled/split intervals to splitLVRs.
+  bool spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
+                          SmallVectorImpl<LiveInterval*> &SplitVRegs);
+
+  static char ID;
+};
+
+char RABasic::ID = 0;
+
+} // end anonymous namespace
+
+RABasic::RABasic(): MachineFunctionPass(ID) {
+  initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry());
+  initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+  initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+  initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
+  initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
+  initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
+  initializeLiveStacksPass(*PassRegistry::getPassRegistry());
+  initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
+  initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
+  initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
+  initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry());
+}
+
+void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<AliasAnalysis>();
+  AU.addPreserved<AliasAnalysis>();
+  AU.addRequired<LiveIntervals>();
+  AU.addPreserved<LiveIntervals>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequired<LiveDebugVariables>();
+  AU.addPreserved<LiveDebugVariables>();
+  AU.addRequired<CalculateSpillWeights>();
+  AU.addRequired<LiveStacks>();
+  AU.addPreserved<LiveStacks>();
+  AU.addRequiredID(MachineDominatorsID);
+  AU.addPreservedID(MachineDominatorsID);
+  AU.addRequired<MachineLoopInfo>();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addRequired<VirtRegMap>();
+  AU.addPreserved<VirtRegMap>();
+  AU.addRequired<LiveRegMatrix>();
+  AU.addPreserved<LiveRegMatrix>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void RABasic::releaseMemory() {
+  SpillerInstance.reset(0);
+}
+
+
+// Spill or split all live virtual registers currently unified under PhysReg
+// that interfere with VirtReg. The newly spilled or split live intervals are
+// returned by appending them to SplitVRegs.
+bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
+                                 SmallVectorImpl<LiveInterval*> &SplitVRegs) {
+  // Record each interference and determine if all are spillable before mutating
+  // either the union or live intervals.
+  SmallVector<LiveInterval*, 8> Intfs;
+
+  // Collect interferences assigned to any alias of the physical register.
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+    Q.collectInterferingVRegs();
+    if (Q.seenUnspillableVReg())
+      return false;
+    for (unsigned i = Q.interferingVRegs().size(); i; --i) {
+      LiveInterval *Intf = Q.interferingVRegs()[i - 1];
+      if (!Intf->isSpillable() || Intf->weight > VirtReg.weight)
+        return false;
+      Intfs.push_back(Intf);
+    }
+  }
+  DEBUG(dbgs() << "spilling " << TRI->getName(PhysReg) <<
+        " interferences with " << VirtReg << "\n");
+  assert(!Intfs.empty() && "expected interference");
+
+  // Spill each interfering vreg allocated to PhysReg or an alias.
+  for (unsigned i = 0, e = Intfs.size(); i != e; ++i) {
+    LiveInterval &Spill = *Intfs[i];
+
+    // Skip duplicates.
+    if (!VRM->hasPhys(Spill.reg))
+      continue;
+
+    // Deallocate the interfering vreg by removing it from the union.
+    // A LiveInterval instance may not be in a union during modification!
+    Matrix->unassign(Spill);
+
+    // Spill the extracted interval.
+    LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM);
+    spiller().spill(LRE);
+  }
+  return true;
+}
+
+// Driver for the register assignment and splitting heuristics.
+// Manages iteration over the LiveIntervalUnions.
+//
+// This is a minimal implementation of register assignment and splitting that
+// spills whenever we run out of registers.
+//
+// selectOrSplit can only be called once per live virtual register. We then do a
+// single interference test for each register the correct class until we find an
+// available register. So, the number of interference tests in the worst case is
+// |vregs| * |machineregs|. And since the number of interference tests is
+// minimal, there is no value in caching them outside the scope of
+// selectOrSplit().
+unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
+                                SmallVectorImpl<LiveInterval*> &SplitVRegs) {
+  // Populate a list of physical register spill candidates.
+  SmallVector<unsigned, 8> PhysRegSpillCands;
+
+  // Check for an available register in this class.
+  AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
+  while (unsigned PhysReg = Order.next()) {
+    // Check for interference in PhysReg
+    switch (Matrix->checkInterference(VirtReg, PhysReg)) {
+    case LiveRegMatrix::IK_Free:
+      // PhysReg is available, allocate it.
+      return PhysReg;
+
+    case LiveRegMatrix::IK_VirtReg:
+      // Only virtual registers in the way, we may be able to spill them.
+      PhysRegSpillCands.push_back(PhysReg);
+      continue;
+
+    default:
+      // RegMask or RegUnit interference.
+      continue;
+    }
+  }
+
+  // Try to spill another interfering reg with less spill weight.
+  for (SmallVectorImpl<unsigned>::iterator PhysRegI = PhysRegSpillCands.begin(),
+       PhysRegE = PhysRegSpillCands.end(); PhysRegI != PhysRegE; ++PhysRegI) {
+    if (!spillInterferences(VirtReg, *PhysRegI, SplitVRegs))
+      continue;
+
+    assert(!Matrix->checkInterference(VirtReg, *PhysRegI) &&
+           "Interference after spill.");
+    // Tell the caller to allocate to this newly freed physical register.
+    return *PhysRegI;
+  }
+
+  // No other spill candidates were found, so spill the current VirtReg.
+  DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
+  if (!VirtReg.isSpillable())
+    return ~0u;
+  LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM);
+  spiller().spill(LRE);
+
+  // The live virtual register requesting allocation was spilled, so tell
+  // the caller not to allocate anything during this round.
+  return 0;
+}
+
+bool RABasic::runOnMachineFunction(MachineFunction &mf) {
+  DEBUG(dbgs() << "********** BASIC REGISTER ALLOCATION **********\n"
+               << "********** Function: "
+               << mf.getName() << '\n');
+
+  MF = &mf;
+  RegAllocBase::init(getAnalysis<VirtRegMap>(),
+                     getAnalysis<LiveIntervals>(),
+                     getAnalysis<LiveRegMatrix>());
+  SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
+
+  allocatePhysRegs();
+
+  // Diagnostic output before rewriting
+  DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
+
+  releaseMemory();
+  return true;
+}
+
+FunctionPass* llvm::createBasicRegisterAllocator()
+{
+  return new RABasic();
+}
diff --git a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
new file mode 100644
index 000000000000..bb9c05c5f42d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -0,0 +1,1117 @@
+//===-- RegAllocFast.cpp - A fast register allocator for debug code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This register allocator allocates registers to a basic block at a time,
+// attempting to keep values in registers and reusing registers as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumStores, "Number of stores added");
+STATISTIC(NumLoads , "Number of loads added");
+STATISTIC(NumCopies, "Number of copies coalesced");
+
+static RegisterRegAlloc
+  fastRegAlloc("fast", "fast register allocator", createFastRegisterAllocator);
+
+namespace {
+  class RAFast : public MachineFunctionPass {
+  public:
+    static char ID;
+    RAFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1),
+               isBulkSpilling(false) {}
+  private:
+    const TargetMachine *TM;
+    MachineFunction *MF;
+    MachineRegisterInfo *MRI;
+    const TargetRegisterInfo *TRI;
+    const TargetInstrInfo *TII;
+    RegisterClassInfo RegClassInfo;
+
+    // Basic block currently being allocated.
+    MachineBasicBlock *MBB;
+
+    // StackSlotForVirtReg - Maps virtual regs to the frame index where these
+    // values are spilled.
+    IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg;
+
+    // Everything we know about a live virtual register.
+    struct LiveReg {
+      MachineInstr *LastUse;    // Last instr to use reg.
+      unsigned VirtReg;         // Virtual register number.
+      unsigned PhysReg;         // Currently held here.
+      unsigned short LastOpNum; // OpNum on LastUse.
+      bool Dirty;               // Register needs spill.
+
+      explicit LiveReg(unsigned v)
+        : LastUse(0), VirtReg(v), PhysReg(0), LastOpNum(0), Dirty(false) {}
+
+      unsigned getSparseSetIndex() const {
+        return TargetRegisterInfo::virtReg2Index(VirtReg);
+      }
+    };
+
+    typedef SparseSet<LiveReg> LiveRegMap;
+
+    // LiveVirtRegs - This map contains entries for each virtual register
+    // that is currently available in a physical register.
+    LiveRegMap LiveVirtRegs;
+
+    DenseMap<unsigned, SmallVector<MachineInstr *, 4> > LiveDbgValueMap;
+
+    // RegState - Track the state of a physical register.
+    enum RegState {
+      // A disabled register is not available for allocation, but an alias may
+      // be in use. A register can only be moved out of the disabled state if
+      // all aliases are disabled.
+      regDisabled,
+
+      // A free register is not currently in use and can be allocated
+      // immediately without checking aliases.
+      regFree,
+
+      // A reserved register has been assigned explicitly (e.g., setting up a
+      // call parameter), and it remains reserved until it is used.
+      regReserved
+
+      // A register state may also be a virtual register number, indication that
+      // the physical register is currently allocated to a virtual register. In
+      // that case, LiveVirtRegs contains the inverse mapping.
+    };
+
+    // PhysRegState - One of the RegState enums, or a virtreg.
+    std::vector<unsigned> PhysRegState;
+
+    // Set of register units.
+    typedef SparseSet<unsigned> UsedInInstrSet;
+
+    // Set of register units that are used in the current instruction, and so
+    // cannot be allocated.
+    UsedInInstrSet UsedInInstr;
+
+    // Mark a physreg as used in this instruction.
+    void markRegUsedInInstr(unsigned PhysReg) {
+      for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+        UsedInInstr.insert(*Units);
+    }
+
+    // Check if a physreg or any of its aliases are used in this instruction.
+    bool isRegUsedInInstr(unsigned PhysReg) const {
+      for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+        if (UsedInInstr.count(*Units))
+          return true;
+      return false;
+    }
+
+    // SkippedInstrs - Descriptors of instructions whose clobber list was
+    // ignored because all registers were spilled. It is still necessary to
+    // mark all the clobbered registers as used by the function.
+    SmallPtrSet<const MCInstrDesc*, 4> SkippedInstrs;
+
+    // isBulkSpilling - This flag is set when LiveRegMap will be cleared
+    // completely after spilling all live registers. LiveRegMap entries should
+    // not be erased.
+    bool isBulkSpilling;
+
+    enum {
+      spillClean = 1,
+      spillDirty = 100,
+      spillImpossible = ~0u
+    };
+  public:
+    virtual const char *getPassName() const {
+      return "Fast Register Allocator";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    bool runOnMachineFunction(MachineFunction &Fn);
+    void AllocateBasicBlock();
+    void handleThroughOperands(MachineInstr *MI,
+                               SmallVectorImpl<unsigned> &VirtDead);
+    int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC);
+    bool isLastUseOfLocalReg(MachineOperand&);
+
+    void addKillFlag(const LiveReg&);
+    void killVirtReg(LiveRegMap::iterator);
+    void killVirtReg(unsigned VirtReg);
+    void spillVirtReg(MachineBasicBlock::iterator MI, LiveRegMap::iterator);
+    void spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg);
+
+    void usePhysReg(MachineOperand&);
+    void definePhysReg(MachineInstr *MI, unsigned PhysReg, RegState NewState);
+    unsigned calcSpillCost(unsigned PhysReg) const;
+    void assignVirtToPhysReg(LiveReg&, unsigned PhysReg);
+    LiveRegMap::iterator findLiveVirtReg(unsigned VirtReg) {
+      return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
+    }
+    LiveRegMap::const_iterator findLiveVirtReg(unsigned VirtReg) const {
+      return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
+    }
+    LiveRegMap::iterator assignVirtToPhysReg(unsigned VReg, unsigned PhysReg);
+    LiveRegMap::iterator allocVirtReg(MachineInstr *MI, LiveRegMap::iterator,
+                                      unsigned Hint);
+    LiveRegMap::iterator defineVirtReg(MachineInstr *MI, unsigned OpNum,
+                                       unsigned VirtReg, unsigned Hint);
+    LiveRegMap::iterator reloadVirtReg(MachineInstr *MI, unsigned OpNum,
+                                       unsigned VirtReg, unsigned Hint);
+    void spillAll(MachineBasicBlock::iterator MI);
+    bool setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg);
+  };
+  char RAFast::ID = 0;
+}
+
+/// getStackSpaceFor - This allocates space for the specified virtual register
+/// to be held on the stack.
+int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) {
+  // Find the location Reg would belong...
+  int SS = StackSlotForVirtReg[VirtReg];
+  if (SS != -1)
+    return SS;          // Already has space allocated?
+
+  // Allocate a new stack object for this spill location...
+  int FrameIdx = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(),
+                                                            RC->getAlignment());
+
+  // Assign the slot.
+  StackSlotForVirtReg[VirtReg] = FrameIdx;
+  return FrameIdx;
+}
+
+/// isLastUseOfLocalReg - Return true if MO is the only remaining reference to
+/// its virtual register, and it is guaranteed to be a block-local register.
+///
+bool RAFast::isLastUseOfLocalReg(MachineOperand &MO) {
+  // If the register has ever been spilled or reloaded, we conservatively assume
+  // it is a global register used in multiple blocks.
+  if (StackSlotForVirtReg[MO.getReg()] != -1)
+    return false;
+
+  // Check that the use/def chain has exactly one operand - MO.
+  MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(MO.getReg());
+  if (&I.getOperand() != &MO)
+    return false;
+  return ++I == MRI->reg_nodbg_end();
+}
+
+/// addKillFlag - Set kill flags on last use of a virtual register.
+void RAFast::addKillFlag(const LiveReg &LR) {
+  if (!LR.LastUse) return;
+  MachineOperand &MO = LR.LastUse->getOperand(LR.LastOpNum);
+  if (MO.isUse() && !LR.LastUse->isRegTiedToDefOperand(LR.LastOpNum)) {
+    if (MO.getReg() == LR.PhysReg)
+      MO.setIsKill();
+    else
+      LR.LastUse->addRegisterKilled(LR.PhysReg, TRI, true);
+  }
+}
+
+/// killVirtReg - Mark virtreg as no longer available.
+void RAFast::killVirtReg(LiveRegMap::iterator LRI) {
+  addKillFlag(*LRI);
+  assert(PhysRegState[LRI->PhysReg] == LRI->VirtReg &&
+         "Broken RegState mapping");
+  PhysRegState[LRI->PhysReg] = regFree;
+  // Erase from LiveVirtRegs unless we're spilling in bulk.
+  if (!isBulkSpilling)
+    LiveVirtRegs.erase(LRI);
+}
+
+/// killVirtReg - Mark virtreg as no longer available.
+void RAFast::killVirtReg(unsigned VirtReg) {
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "killVirtReg needs a virtual register");
+  LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
+  if (LRI != LiveVirtRegs.end())
+    killVirtReg(LRI);
+}
+
+/// spillVirtReg - This method spills the value specified by VirtReg into the
+/// corresponding stack slot if needed.
+void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg) {
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "Spilling a physical register is illegal!");
+  LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
+  assert(LRI != LiveVirtRegs.end() && "Spilling unmapped virtual register");
+  spillVirtReg(MI, LRI);
+}
+
+/// spillVirtReg - Do the actual work of spilling.
+void RAFast::spillVirtReg(MachineBasicBlock::iterator MI,
+                          LiveRegMap::iterator LRI) {
+  LiveReg &LR = *LRI;
+  assert(PhysRegState[LR.PhysReg] == LRI->VirtReg && "Broken RegState mapping");
+
+  if (LR.Dirty) {
+    // If this physreg is used by the instruction, we want to kill it on the
+    // instruction, not on the spill.
+    bool SpillKill = LR.LastUse != MI;
+    LR.Dirty = false;
+    DEBUG(dbgs() << "Spilling " << PrintReg(LRI->VirtReg, TRI)
+                 << " in " << PrintReg(LR.PhysReg, TRI));
+    const TargetRegisterClass *RC = MRI->getRegClass(LRI->VirtReg);
+    int FI = getStackSpaceFor(LRI->VirtReg, RC);
+    DEBUG(dbgs() << " to stack slot #" << FI << "\n");
+    TII->storeRegToStackSlot(*MBB, MI, LR.PhysReg, SpillKill, FI, RC, TRI);
+    ++NumStores;   // Update statistics
+
+    // If this register is used by DBG_VALUE then insert new DBG_VALUE to
+    // identify spilled location as the place to find corresponding variable's
+    // value.
+    SmallVector<MachineInstr *, 4> &LRIDbgValues =
+      LiveDbgValueMap[LRI->VirtReg];
+    for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) {
+      MachineInstr *DBG = LRIDbgValues[li];
+      const MDNode *MDPtr =
+        DBG->getOperand(DBG->getNumOperands()-1).getMetadata();
+      int64_t Offset = 0;
+      if (DBG->getOperand(1).isImm())
+        Offset = DBG->getOperand(1).getImm();
+      DebugLoc DL;
+      if (MI == MBB->end()) {
+        // If MI is at basic block end then use last instruction's location.
+        MachineBasicBlock::iterator EI = MI;
+        DL = (--EI)->getDebugLoc();
+      }
+      else
+        DL = MI->getDebugLoc();
+      if (MachineInstr *NewDV =
+          TII->emitFrameIndexDebugValue(*MF, FI, Offset, MDPtr, DL)) {
+        MachineBasicBlock *MBB = DBG->getParent();
+        MBB->insert(MI, NewDV);
+        DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV);
+      }
+    }
+    // Now this register is spilled there is should not be any DBG_VALUE
+    // pointing to this register because they are all pointing to spilled value
+    // now.
+    LRIDbgValues.clear();
+    if (SpillKill)
+      LR.LastUse = 0; // Don't kill register again
+  }
+  killVirtReg(LRI);
+}
+
+/// spillAll - Spill all dirty virtregs without killing them.
+void RAFast::spillAll(MachineBasicBlock::iterator MI) {
+  if (LiveVirtRegs.empty()) return;
+  isBulkSpilling = true;
+  // The LiveRegMap is keyed by an unsigned (the virtreg number), so the order
+  // of spilling here is deterministic, if arbitrary.
+  for (LiveRegMap::iterator i = LiveVirtRegs.begin(), e = LiveVirtRegs.end();
+       i != e; ++i)
+    spillVirtReg(MI, i);
+  LiveVirtRegs.clear();
+  isBulkSpilling = false;
+}
+
+/// usePhysReg - Handle the direct use of a physical register.
+/// Check that the register is not used by a virtreg.
+/// Kill the physreg, marking it free.
+/// This may add implicit kills to MO->getParent() and invalidate MO.
+void RAFast::usePhysReg(MachineOperand &MO) {
+  unsigned PhysReg = MO.getReg();
+  assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
+         "Bad usePhysReg operand");
+  markRegUsedInInstr(PhysReg);
+  switch (PhysRegState[PhysReg]) {
+  case regDisabled:
+    break;
+  case regReserved:
+    PhysRegState[PhysReg] = regFree;
+    // Fall through
+  case regFree:
+    MO.setIsKill();
+    return;
+  default:
+    // The physreg was allocated to a virtual register. That means the value we
+    // wanted has been clobbered.
+    llvm_unreachable("Instruction uses an allocated register");
+  }
+
+  // Maybe a superregister is reserved?
+  for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
+    unsigned Alias = *AI;
+    switch (PhysRegState[Alias]) {
+    case regDisabled:
+      break;
+    case regReserved:
+      assert(TRI->isSuperRegister(PhysReg, Alias) &&
+             "Instruction is not using a subregister of a reserved register");
+      // Leave the superregister in the working set.
+      PhysRegState[Alias] = regFree;
+      MO.getParent()->addRegisterKilled(Alias, TRI, true);
+      return;
+    case regFree:
+      if (TRI->isSuperRegister(PhysReg, Alias)) {
+        // Leave the superregister in the working set.
+        MO.getParent()->addRegisterKilled(Alias, TRI, true);
+        return;
+      }
+      // Some other alias was in the working set - clear it.
+      PhysRegState[Alias] = regDisabled;
+      break;
+    default:
+      llvm_unreachable("Instruction uses an alias of an allocated register");
+    }
+  }
+
+  // All aliases are disabled, bring register into working set.
+  PhysRegState[PhysReg] = regFree;
+  MO.setIsKill();
+}
+
+/// definePhysReg - Mark PhysReg as reserved or free after spilling any
+/// virtregs. This is very similar to defineVirtReg except the physreg is
+/// reserved instead of allocated.
+void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
+                           RegState NewState) {
+  markRegUsedInInstr(PhysReg);
+  switch (unsigned VirtReg = PhysRegState[PhysReg]) {
+  case regDisabled:
+    break;
+  default:
+    spillVirtReg(MI, VirtReg);
+    // Fall through.
+  case regFree:
+  case regReserved:
+    PhysRegState[PhysReg] = NewState;
+    return;
+  }
+
+  // This is a disabled register, disable all aliases.
+  PhysRegState[PhysReg] = NewState;
+  for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
+    unsigned Alias = *AI;
+    switch (unsigned VirtReg = PhysRegState[Alias]) {
+    case regDisabled:
+      break;
+    default:
+      spillVirtReg(MI, VirtReg);
+      // Fall through.
+    case regFree:
+    case regReserved:
+      PhysRegState[Alias] = regDisabled;
+      if (TRI->isSuperRegister(PhysReg, Alias))
+        return;
+      break;
+    }
+  }
+}
+
+
+// calcSpillCost - Return the cost of spilling clearing out PhysReg and
+// aliases so it is free for allocation.
+// Returns 0 when PhysReg is free or disabled with all aliases disabled - it
+// can be allocated directly.
+// Returns spillImpossible when PhysReg or an alias can't be spilled.
+unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
+  if (isRegUsedInInstr(PhysReg)) {
+    DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is already used in instr.\n");
+    return spillImpossible;
+  }
+  switch (unsigned VirtReg = PhysRegState[PhysReg]) {
+  case regDisabled:
+    break;
+  case regFree:
+    return 0;
+  case regReserved:
+    DEBUG(dbgs() << PrintReg(VirtReg, TRI) << " corresponding "
+                 << PrintReg(PhysReg, TRI) << " is reserved already.\n");
+    return spillImpossible;
+  default: {
+    LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
+    assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
+    return I->Dirty ? spillDirty : spillClean;
+  }
+  }
+
+  // This is a disabled register, add up cost of aliases.
+  DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is disabled.\n");
+  unsigned Cost = 0;
+  for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
+    unsigned Alias = *AI;
+    switch (unsigned VirtReg = PhysRegState[Alias]) {
+    case regDisabled:
+      break;
+    case regFree:
+      ++Cost;
+      break;
+    case regReserved:
+      return spillImpossible;
+    default: {
+      LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
+      assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
+      Cost += I->Dirty ? spillDirty : spillClean;
+      break;
+    }
+    }
+  }
+  return Cost;
+}
+
+
+/// assignVirtToPhysReg - This method updates local state so that we know
+/// that PhysReg is the proper container for VirtReg now.  The physical
+/// register must not be used for anything else when this is called.
+///
+void RAFast::assignVirtToPhysReg(LiveReg &LR, unsigned PhysReg) {
+  DEBUG(dbgs() << "Assigning " << PrintReg(LR.VirtReg, TRI) << " to "
+               << PrintReg(PhysReg, TRI) << "\n");
+  PhysRegState[PhysReg] = LR.VirtReg;
+  assert(!LR.PhysReg && "Already assigned a physreg");
+  LR.PhysReg = PhysReg;
+}
+
+RAFast::LiveRegMap::iterator
+RAFast::assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg) {
+  LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
+  assert(LRI != LiveVirtRegs.end() && "VirtReg disappeared");
+  assignVirtToPhysReg(*LRI, PhysReg);
+  return LRI;
+}
+
+/// allocVirtReg - Allocate a physical register for VirtReg.
+RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,
+                                                  LiveRegMap::iterator LRI,
+                                                  unsigned Hint) {
+  const unsigned VirtReg = LRI->VirtReg;
+
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "Can only allocate virtual registers");
+
+  const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
+
+  // Ignore invalid hints.
+  if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) ||
+               !RC->contains(Hint) || !MRI->isAllocatable(Hint)))
+    Hint = 0;
+
+  // Take hint when possible.
+  if (Hint) {
+    // Ignore the hint if we would have to spill a dirty register.
+    unsigned Cost = calcSpillCost(Hint);
+    if (Cost < spillDirty) {
+      if (Cost)
+        definePhysReg(MI, Hint, regFree);
+      // definePhysReg may kill virtual registers and modify LiveVirtRegs.
+      // That invalidates LRI, so run a new lookup for VirtReg.
+      return assignVirtToPhysReg(VirtReg, Hint);
+    }
+  }
+
+  ArrayRef<MCPhysReg> AO = RegClassInfo.getOrder(RC);
+
+  // First try to find a completely free register.
+  for (ArrayRef<MCPhysReg>::iterator I = AO.begin(), E = AO.end(); I != E; ++I){
+    unsigned PhysReg = *I;
+    if (PhysRegState[PhysReg] == regFree && !isRegUsedInInstr(PhysReg)) {
+      assignVirtToPhysReg(*LRI, PhysReg);
+      return LRI;
+    }
+  }
+
+  DEBUG(dbgs() << "Allocating " << PrintReg(VirtReg) << " from "
+               << RC->getName() << "\n");
+
+  unsigned BestReg = 0, BestCost = spillImpossible;
+  for (ArrayRef<MCPhysReg>::iterator I = AO.begin(), E = AO.end(); I != E; ++I){
+    unsigned Cost = calcSpillCost(*I);
+    DEBUG(dbgs() << "\tRegister: " << PrintReg(*I, TRI) << "\n");
+    DEBUG(dbgs() << "\tCost: " << Cost << "\n");
+    DEBUG(dbgs() << "\tBestCost: " << BestCost << "\n");
+    // Cost is 0 when all aliases are already disabled.
+    if (Cost == 0) {
+      assignVirtToPhysReg(*LRI, *I);
+      return LRI;
+    }
+    if (Cost < BestCost)
+      BestReg = *I, BestCost = Cost;
+  }
+
+  if (BestReg) {
+    definePhysReg(MI, BestReg, regFree);
+    // definePhysReg may kill virtual registers and modify LiveVirtRegs.
+    // That invalidates LRI, so run a new lookup for VirtReg.
+    return assignVirtToPhysReg(VirtReg, BestReg);
+  }
+
+  // Nothing we can do. Report an error and keep going with a bad allocation.
+  MI->emitError("ran out of registers during register allocation");
+  definePhysReg(MI, *AO.begin(), regFree);
+  return assignVirtToPhysReg(VirtReg, *AO.begin());
+}
+
+/// defineVirtReg - Allocate a register for VirtReg and mark it as dirty.
+RAFast::LiveRegMap::iterator
+RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum,
+                      unsigned VirtReg, unsigned Hint) {
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "Not a virtual register");
+  LiveRegMap::iterator LRI;
+  bool New;
+  tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
+  if (New) {
+    // If there is no hint, peek at the only use of this register.
+    if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) &&
+        MRI->hasOneNonDBGUse(VirtReg)) {
+      const MachineInstr &UseMI = *MRI->use_nodbg_begin(VirtReg);
+      // It's a copy, use the destination register as a hint.
+      if (UseMI.isCopyLike())
+        Hint = UseMI.getOperand(0).getReg();
+    }
+    LRI = allocVirtReg(MI, LRI, Hint);
+  } else if (LRI->LastUse) {
+    // Redefining a live register - kill at the last use, unless it is this
+    // instruction defining VirtReg multiple times.
+    if (LRI->LastUse != MI || LRI->LastUse->getOperand(LRI->LastOpNum).isUse())
+      addKillFlag(*LRI);
+  }
+  assert(LRI->PhysReg && "Register not assigned");
+  LRI->LastUse = MI;
+  LRI->LastOpNum = OpNum;
+  LRI->Dirty = true;
+  markRegUsedInInstr(LRI->PhysReg);
+  return LRI;
+}
+
+/// reloadVirtReg - Make sure VirtReg is available in a physreg and return it.
+RAFast::LiveRegMap::iterator
+RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum,
+                      unsigned VirtReg, unsigned Hint) {
+  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+         "Not a virtual register");
+  LiveRegMap::iterator LRI;
+  bool New;
+  tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
+  MachineOperand &MO = MI->getOperand(OpNum);
+  if (New) {
+    LRI = allocVirtReg(MI, LRI, Hint);
+    const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
+    int FrameIndex = getStackSpaceFor(VirtReg, RC);
+    DEBUG(dbgs() << "Reloading " << PrintReg(VirtReg, TRI) << " into "
+                 << PrintReg(LRI->PhysReg, TRI) << "\n");
+    TII->loadRegFromStackSlot(*MBB, MI, LRI->PhysReg, FrameIndex, RC, TRI);
+    ++NumLoads;
+  } else if (LRI->Dirty) {
+    if (isLastUseOfLocalReg(MO)) {
+      DEBUG(dbgs() << "Killing last use: " << MO << "\n");
+      if (MO.isUse())
+        MO.setIsKill();
+      else
+        MO.setIsDead();
+    } else if (MO.isKill()) {
+      DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n");
+      MO.setIsKill(false);
+    } else if (MO.isDead()) {
+      DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n");
+      MO.setIsDead(false);
+    }
+  } else if (MO.isKill()) {
+    // We must remove kill flags from uses of reloaded registers because the
+    // register would be killed immediately, and there might be a second use:
+    //   %foo = OR %x<kill>, %x
+    // This would cause a second reload of %x into a different register.
+    DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n");
+    MO.setIsKill(false);
+  } else if (MO.isDead()) {
+    DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n");
+    MO.setIsDead(false);
+  }
+  assert(LRI->PhysReg && "Register not assigned");
+  LRI->LastUse = MI;
+  LRI->LastOpNum = OpNum;
+  markRegUsedInInstr(LRI->PhysReg);
+  return LRI;
+}
+
+// setPhysReg - Change operand OpNum in MI the refer the PhysReg, considering
+// subregs. This may invalidate any operand pointers.
+// Return true if the operand kills its register.
+bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) {
+  MachineOperand &MO = MI->getOperand(OpNum);
+  bool Dead = MO.isDead();
+  if (!MO.getSubReg()) {
+    MO.setReg(PhysReg);
+    return MO.isKill() || Dead;
+  }
+
+  // Handle subregister index.
+  MO.setReg(PhysReg ? TRI->getSubReg(PhysReg, MO.getSubReg()) : 0);
+  MO.setSubReg(0);
+
+  // A kill flag implies killing the full register. Add corresponding super
+  // register kill.
+  if (MO.isKill()) {
+    MI->addRegisterKilled(PhysReg, TRI, true);
+    return true;
+  }
+
+  // A <def,read-undef> of a sub-register requires an implicit def of the full
+  // register.
+  if (MO.isDef() && MO.isUndef())
+    MI->addRegisterDefined(PhysReg, TRI);
+
+  return Dead;
+}
+
+// Handle special instruction operand like early clobbers and tied ops when
+// there are additional physreg defines.
+void RAFast::handleThroughOperands(MachineInstr *MI,
+                                   SmallVectorImpl<unsigned> &VirtDead) {
+  DEBUG(dbgs() << "Scanning for through registers:");
+  SmallSet<unsigned, 8> ThroughRegs;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    if (MO.isEarlyClobber() || MI->isRegTiedToDefOperand(i) ||
+        (MO.getSubReg() && MI->readsVirtualRegister(Reg))) {
+      if (ThroughRegs.insert(Reg))
+        DEBUG(dbgs() << ' ' << PrintReg(Reg));
+    }
+  }
+
+  // If any physreg defines collide with preallocated through registers,
+  // we must spill and reallocate.
+  DEBUG(dbgs() << "\nChecking for physdef collisions.\n");
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    markRegUsedInInstr(Reg);
+    for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+      if (ThroughRegs.count(PhysRegState[*AI]))
+        definePhysReg(MI, *AI, regFree);
+    }
+  }
+
+  SmallVector<unsigned, 8> PartialDefs;
+  DEBUG(dbgs() << "Allocating tied uses.\n");
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+    if (MO.isUse()) {
+      unsigned DefIdx = 0;
+      if (!MI->isRegTiedToDefOperand(i, &DefIdx)) continue;
+      DEBUG(dbgs() << "Operand " << i << "("<< MO << ") is tied to operand "
+        << DefIdx << ".\n");
+      LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0);
+      unsigned PhysReg = LRI->PhysReg;
+      setPhysReg(MI, i, PhysReg);
+      // Note: we don't update the def operand yet. That would cause the normal
+      // def-scan to attempt spilling.
+    } else if (MO.getSubReg() && MI->readsVirtualRegister(Reg)) {
+      DEBUG(dbgs() << "Partial redefine: " << MO << "\n");
+      // Reload the register, but don't assign to the operand just yet.
+      // That would confuse the later phys-def processing pass.
+      LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0);
+      PartialDefs.push_back(LRI->PhysReg);
+    }
+  }
+
+  DEBUG(dbgs() << "Allocating early clobbers.\n");
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+    if (!MO.isEarlyClobber())
+      continue;
+    // Note: defineVirtReg may invalidate MO.
+    LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, 0);
+    unsigned PhysReg = LRI->PhysReg;
+    if (setPhysReg(MI, i, PhysReg))
+      VirtDead.push_back(Reg);
+  }
+
+  // Restore UsedInInstr to a state usable for allocating normal virtual uses.
+  UsedInInstr.clear();
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+    DEBUG(dbgs() << "\tSetting " << PrintReg(Reg, TRI)
+                 << " as used in instr\n");
+    markRegUsedInInstr(Reg);
+  }
+
+  // Also mark PartialDefs as used to avoid reallocation.
+  for (unsigned i = 0, e = PartialDefs.size(); i != e; ++i)
+    markRegUsedInInstr(PartialDefs[i]);
+}
+
+void RAFast::AllocateBasicBlock() {
+  DEBUG(dbgs() << "\nAllocating " << *MBB);
+
+  PhysRegState.assign(TRI->getNumRegs(), regDisabled);
+  assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?");
+
+  MachineBasicBlock::iterator MII = MBB->begin();
+
+  // Add live-in registers as live.
+  for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
+         E = MBB->livein_end(); I != E; ++I)
+    if (MRI->isAllocatable(*I))
+      definePhysReg(MII, *I, regReserved);
+
+  SmallVector<unsigned, 8> VirtDead;
+  SmallVector<MachineInstr*, 32> Coalesced;
+
+  // Otherwise, sequentially allocate each instruction in the MBB.
+  while (MII != MBB->end()) {
+    MachineInstr *MI = MII++;
+    const MCInstrDesc &MCID = MI->getDesc();
+    DEBUG({
+        dbgs() << "\n>> " << *MI << "Regs:";
+        for (unsigned Reg = 1, E = TRI->getNumRegs(); Reg != E; ++Reg) {
+          if (PhysRegState[Reg] == regDisabled) continue;
+          dbgs() << " " << TRI->getName(Reg);
+          switch(PhysRegState[Reg]) {
+          case regFree:
+            break;
+          case regReserved:
+            dbgs() << "*";
+            break;
+          default: {
+            dbgs() << '=' << PrintReg(PhysRegState[Reg]);
+            LiveRegMap::iterator I = findLiveVirtReg(PhysRegState[Reg]);
+            assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
+            if (I->Dirty)
+              dbgs() << "*";
+            assert(I->PhysReg == Reg && "Bad inverse map");
+            break;
+          }
+          }
+        }
+        dbgs() << '\n';
+        // Check that LiveVirtRegs is the inverse.
+        for (LiveRegMap::iterator i = LiveVirtRegs.begin(),
+             e = LiveVirtRegs.end(); i != e; ++i) {
+           assert(TargetRegisterInfo::isVirtualRegister(i->VirtReg) &&
+                  "Bad map key");
+           assert(TargetRegisterInfo::isPhysicalRegister(i->PhysReg) &&
+                  "Bad map value");
+           assert(PhysRegState[i->PhysReg] == i->VirtReg && "Bad inverse map");
+        }
+      });
+
+    // Debug values are not allowed to change codegen in any way.
+    if (MI->isDebugValue()) {
+      bool ScanDbgValue = true;
+      while (ScanDbgValue) {
+        ScanDbgValue = false;
+        for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+          MachineOperand &MO = MI->getOperand(i);
+          if (!MO.isReg()) continue;
+          unsigned Reg = MO.getReg();
+          if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+          LiveRegMap::iterator LRI = findLiveVirtReg(Reg);
+          if (LRI != LiveVirtRegs.end())
+            setPhysReg(MI, i, LRI->PhysReg);
+          else {
+            int SS = StackSlotForVirtReg[Reg];
+            if (SS == -1) {
+              // We can't allocate a physreg for a DebugValue, sorry!
+              DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE");
+              MO.setReg(0);
+            }
+            else {
+              // Modify DBG_VALUE now that the value is in a spill slot.
+              int64_t Offset = MI->getOperand(1).getImm();
+              const MDNode *MDPtr =
+                MI->getOperand(MI->getNumOperands()-1).getMetadata();
+              DebugLoc DL = MI->getDebugLoc();
+              if (MachineInstr *NewDV =
+                  TII->emitFrameIndexDebugValue(*MF, SS, Offset, MDPtr, DL)) {
+                DEBUG(dbgs() << "Modifying debug info due to spill:" <<
+                      "\t" << *MI);
+                MachineBasicBlock *MBB = MI->getParent();
+                MBB->insert(MBB->erase(MI), NewDV);
+                // Scan NewDV operands from the beginning.
+                MI = NewDV;
+                ScanDbgValue = true;
+                break;
+              } else {
+                // We can't allocate a physreg for a DebugValue; sorry!
+                DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE");
+                MO.setReg(0);
+              }
+            }
+          }
+          LiveDbgValueMap[Reg].push_back(MI);
+        }
+      }
+      // Next instruction.
+      continue;
+    }
+
+    // If this is a copy, we may be able to coalesce.
+    unsigned CopySrc = 0, CopyDst = 0, CopySrcSub = 0, CopyDstSub = 0;
+    if (MI->isCopy()) {
+      CopyDst = MI->getOperand(0).getReg();
+      CopySrc = MI->getOperand(1).getReg();
+      CopyDstSub = MI->getOperand(0).getSubReg();
+      CopySrcSub = MI->getOperand(1).getSubReg();
+    }
+
+    // Track registers used by instruction.
+    UsedInInstr.clear();
+
+    // First scan.
+    // Mark physreg uses and early clobbers as used.
+    // Find the end of the virtreg operands
+    unsigned VirtOpEnd = 0;
+    bool hasTiedOps = false;
+    bool hasEarlyClobbers = false;
+    bool hasPartialRedefs = false;
+    bool hasPhysDefs = false;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      // Make sure MRI knows about registers clobbered by regmasks.
+      if (MO.isRegMask()) {
+        MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
+        continue;
+      }
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg) continue;
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        VirtOpEnd = i+1;
+        if (MO.isUse()) {
+          hasTiedOps = hasTiedOps ||
+                              MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1;
+        } else {
+          if (MO.isEarlyClobber())
+            hasEarlyClobbers = true;
+          if (MO.getSubReg() && MI->readsVirtualRegister(Reg))
+            hasPartialRedefs = true;
+        }
+        continue;
+      }
+      if (!MRI->isAllocatable(Reg)) continue;
+      if (MO.isUse()) {
+        usePhysReg(MO);
+      } else if (MO.isEarlyClobber()) {
+        definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
+                               regFree : regReserved);
+        hasEarlyClobbers = true;
+      } else
+        hasPhysDefs = true;
+    }
+
+    // The instruction may have virtual register operands that must be allocated
+    // the same register at use-time and def-time: early clobbers and tied
+    // operands. If there are also physical defs, these registers must avoid
+    // both physical defs and uses, making them more constrained than normal
+    // operands.
+    // Similarly, if there are multiple defs and tied operands, we must make
+    // sure the same register is allocated to uses and defs.
+    // We didn't detect inline asm tied operands above, so just make this extra
+    // pass for all inline asm.
+    if (MI->isInlineAsm() || hasEarlyClobbers || hasPartialRedefs ||
+        (hasTiedOps && (hasPhysDefs || MCID.getNumDefs() > 1))) {
+      handleThroughOperands(MI, VirtDead);
+      // Don't attempt coalescing when we have funny stuff going on.
+      CopyDst = 0;
+      // Pretend we have early clobbers so the use operands get marked below.
+      // This is not necessary for the common case of a single tied use.
+      hasEarlyClobbers = true;
+    }
+
+    // Second scan.
+    // Allocate virtreg uses.
+    for (unsigned i = 0; i != VirtOpEnd; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+      if (MO.isUse()) {
+        LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, CopyDst);
+        unsigned PhysReg = LRI->PhysReg;
+        CopySrc = (CopySrc == Reg || CopySrc == PhysReg) ? PhysReg : 0;
+        if (setPhysReg(MI, i, PhysReg))
+          killVirtReg(LRI);
+      }
+    }
+
+    for (UsedInInstrSet::iterator
+         I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
+      MRI->setRegUnitUsed(*I);
+
+    // Track registers defined by instruction - early clobbers and tied uses at
+    // this point.
+    UsedInInstr.clear();
+    if (hasEarlyClobbers) {
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isReg()) continue;
+        unsigned Reg = MO.getReg();
+        if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+        // Look for physreg defs and tied uses.
+        if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue;
+        markRegUsedInInstr(Reg);
+      }
+    }
+
+    unsigned DefOpEnd = MI->getNumOperands();
+    if (MI->isCall()) {
+      // Spill all virtregs before a call. This serves two purposes: 1. If an
+      // exception is thrown, the landing pad is going to expect to find
+      // registers in their spill slots, and 2. we don't have to wade through
+      // all the <imp-def> operands on the call instruction.
+      DefOpEnd = VirtOpEnd;
+      DEBUG(dbgs() << "  Spilling remaining registers before call.\n");
+      spillAll(MI);
+
+      // The imp-defs are skipped below, but we still need to mark those
+      // registers as used by the function.
+      SkippedInstrs.insert(&MCID);
+    }
+
+    // Third scan.
+    // Allocate defs and collect dead defs.
+    for (unsigned i = 0; i != DefOpEnd; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber())
+        continue;
+      unsigned Reg = MO.getReg();
+
+      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        if (!MRI->isAllocatable(Reg)) continue;
+        definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
+                               regFree : regReserved);
+        continue;
+      }
+      LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, CopySrc);
+      unsigned PhysReg = LRI->PhysReg;
+      if (setPhysReg(MI, i, PhysReg)) {
+        VirtDead.push_back(Reg);
+        CopyDst = 0; // cancel coalescing;
+      } else
+        CopyDst = (CopyDst == Reg || CopyDst == PhysReg) ? PhysReg : 0;
+    }
+
+    // Kill dead defs after the scan to ensure that multiple defs of the same
+    // register are allocated identically. We didn't need to do this for uses
+    // because we are crerating our own kill flags, and they are always at the
+    // last use.
+    for (unsigned i = 0, e = VirtDead.size(); i != e; ++i)
+      killVirtReg(VirtDead[i]);
+    VirtDead.clear();
+
+    for (UsedInInstrSet::iterator
+         I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
+      MRI->setRegUnitUsed(*I);
+
+    if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) {
+      DEBUG(dbgs() << "-- coalescing: " << *MI);
+      Coalesced.push_back(MI);
+    } else {
+      DEBUG(dbgs() << "<< " << *MI);
+    }
+  }
+
+  // Spill all physical registers holding virtual registers now.
+  DEBUG(dbgs() << "Spilling live registers at end of block.\n");
+  spillAll(MBB->getFirstTerminator());
+
+  // Erase all the coalesced copies. We are delaying it until now because
+  // LiveVirtRegs might refer to the instrs.
+  for (unsigned i = 0, e = Coalesced.size(); i != e; ++i)
+    MBB->erase(Coalesced[i]);
+  NumCopies += Coalesced.size();
+
+  DEBUG(MBB->dump());
+}
+
+/// runOnMachineFunction - Register allocate the whole function
+///
+bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
+  DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
+               << "********** Function: " << Fn.getName() << '\n');
+  MF = &Fn;
+  MRI = &MF->getRegInfo();
+  TM = &Fn.getTarget();
+  TRI = TM->getRegisterInfo();
+  TII = TM->getInstrInfo();
+  MRI->freezeReservedRegs(Fn);
+  RegClassInfo.runOnMachineFunction(Fn);
+  UsedInInstr.clear();
+  UsedInInstr.setUniverse(TRI->getNumRegUnits());
+
+  assert(!MRI->isSSA() && "regalloc requires leaving SSA");
+
+  // initialize the virtual->physical register map to have a 'null'
+  // mapping for all virtual registers
+  StackSlotForVirtReg.resize(MRI->getNumVirtRegs());
+  LiveVirtRegs.setUniverse(MRI->getNumVirtRegs());
+
+  // Loop over all of the basic blocks, eliminating virtual register references
+  for (MachineFunction::iterator MBBi = Fn.begin(), MBBe = Fn.end();
+       MBBi != MBBe; ++MBBi) {
+    MBB = &*MBBi;
+    AllocateBasicBlock();
+  }
+
+  // Add the clobber lists for all the instructions we skipped earlier.
+  for (SmallPtrSet<const MCInstrDesc*, 4>::const_iterator
+       I = SkippedInstrs.begin(), E = SkippedInstrs.end(); I != E; ++I)
+    if (const uint16_t *Defs = (*I)->getImplicitDefs())
+      while (*Defs)
+        MRI->setPhysRegUsed(*Defs++);
+
+  // All machine operands and other references to virtual registers have been
+  // replaced. Remove the virtual registers.
+  MRI->clearVirtRegs();
+
+  SkippedInstrs.clear();
+  StackSlotForVirtReg.clear();
+  LiveDbgValueMap.clear();
+  return true;
+}
+
+FunctionPass *llvm::createFastRegisterAllocator() {
+  return new RAFast();
+}
diff --git a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
new file mode 100644
index 000000000000..6d84176af261
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -0,0 +1,1791 @@
+//===-- RegAllocGreedy.cpp - greedy register allocator --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RAGreedy function pass for register allocation in
+// optimized builds.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/Passes.h"
+#include "AllocationOrder.h"
+#include "InterferenceCache.h"
+#include "LiveDebugVariables.h"
+#include "RegAllocBase.h"
+#include "SpillPlacement.h"
+#include "Spiller.h"
+#include "SplitKit.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <queue>
+
+using namespace llvm;
+
+STATISTIC(NumGlobalSplits, "Number of split global live ranges");
+STATISTIC(NumLocalSplits,  "Number of split local live ranges");
+STATISTIC(NumEvicted,      "Number of interferences evicted");
+
+static cl::opt<SplitEditor::ComplementSpillMode>
+SplitSpillMode("split-spill-mode", cl::Hidden,
+  cl::desc("Spill mode for splitting live ranges"),
+  cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+             clEnumValN(SplitEditor::SM_Size,  "size",  "Optimize for size"),
+             clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
+             clEnumValEnd),
+  cl::init(SplitEditor::SM_Partition));
+
+static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
+                                       createGreedyRegisterAllocator);
+
+namespace {
+class RAGreedy : public MachineFunctionPass,
+                 public RegAllocBase,
+                 private LiveRangeEdit::Delegate {
+
+  // context
+  MachineFunction *MF;
+
+  // analyses
+  SlotIndexes *Indexes;
+  MachineDominatorTree *DomTree;
+  MachineLoopInfo *Loops;
+  EdgeBundles *Bundles;
+  SpillPlacement *SpillPlacer;
+  LiveDebugVariables *DebugVars;
+
+  // state
+  std::auto_ptr<Spiller> SpillerInstance;
+  std::priority_queue<std::pair<unsigned, unsigned> > Queue;
+  unsigned NextCascade;
+
+  // Live ranges pass through a number of stages as we try to allocate them.
+  // Some of the stages may also create new live ranges:
+  //
+  // - Region splitting.
+  // - Per-block splitting.
+  // - Local splitting.
+  // - Spilling.
+  //
+  // Ranges produced by one of the stages skip the previous stages when they are
+  // dequeued. This improves performance because we can skip interference checks
+  // that are unlikely to give any results. It also guarantees that the live
+  // range splitting algorithm terminates, something that is otherwise hard to
+  // ensure.
+  enum LiveRangeStage {
+    /// Newly created live range that has never been queued.
+    RS_New,
+
+    /// Only attempt assignment and eviction. Then requeue as RS_Split.
+    RS_Assign,
+
+    /// Attempt live range splitting if assignment is impossible.
+    RS_Split,
+
+    /// Attempt more aggressive live range splitting that is guaranteed to make
+    /// progress.  This is used for split products that may not be making
+    /// progress.
+    RS_Split2,
+
+    /// Live range will be spilled.  No more splitting will be attempted.
+    RS_Spill,
+
+    /// There is nothing more we can do to this live range.  Abort compilation
+    /// if it can't be assigned.
+    RS_Done
+  };
+
+  static const char *const StageName[];
+
+  // RegInfo - Keep additional information about each live range.
+  struct RegInfo {
+    LiveRangeStage Stage;
+
+    // Cascade - Eviction loop prevention. See canEvictInterference().
+    unsigned Cascade;
+
+    RegInfo() : Stage(RS_New), Cascade(0) {}
+  };
+
+  IndexedMap<RegInfo, VirtReg2IndexFunctor> ExtraRegInfo;
+
+  LiveRangeStage getStage(const LiveInterval &VirtReg) const {
+    return ExtraRegInfo[VirtReg.reg].Stage;
+  }
+
+  void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) {
+    ExtraRegInfo.resize(MRI->getNumVirtRegs());
+    ExtraRegInfo[VirtReg.reg].Stage = Stage;
+  }
+
+  template<typename Iterator>
+  void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) {
+    ExtraRegInfo.resize(MRI->getNumVirtRegs());
+    for (;Begin != End; ++Begin) {
+      unsigned Reg = (*Begin)->reg;
+      if (ExtraRegInfo[Reg].Stage == RS_New)
+        ExtraRegInfo[Reg].Stage = NewStage;
+    }
+  }
+
+  /// Cost of evicting interference.
+  struct EvictionCost {
+    unsigned BrokenHints; ///< Total number of broken hints.
+    float MaxWeight;      ///< Maximum spill weight evicted.
+
+    EvictionCost(unsigned B = 0) : BrokenHints(B), MaxWeight(0) {}
+
+    bool operator<(const EvictionCost &O) const {
+      if (BrokenHints != O.BrokenHints)
+        return BrokenHints < O.BrokenHints;
+      return MaxWeight < O.MaxWeight;
+    }
+  };
+
+  // splitting state.
+  std::auto_ptr<SplitAnalysis> SA;
+  std::auto_ptr<SplitEditor> SE;
+
+  /// Cached per-block interference maps
+  InterferenceCache IntfCache;
+
+  /// All basic blocks where the current register has uses.
+  SmallVector<SpillPlacement::BlockConstraint, 8> SplitConstraints;
+
+  /// Global live range splitting candidate info.
+  struct GlobalSplitCandidate {
+    // Register intended for assignment, or 0.
+    unsigned PhysReg;
+
+    // SplitKit interval index for this candidate.
+    unsigned IntvIdx;
+
+    // Interference for PhysReg.
+    InterferenceCache::Cursor Intf;
+
+    // Bundles where this candidate should be live.
+    BitVector LiveBundles;
+    SmallVector<unsigned, 8> ActiveBlocks;
+
+    void reset(InterferenceCache &Cache, unsigned Reg) {
+      PhysReg = Reg;
+      IntvIdx = 0;
+      Intf.setPhysReg(Cache, Reg);
+      LiveBundles.clear();
+      ActiveBlocks.clear();
+    }
+
+    // Set B[i] = C for every live bundle where B[i] was NoCand.
+    unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) {
+      unsigned Count = 0;
+      for (int i = LiveBundles.find_first(); i >= 0;
+           i = LiveBundles.find_next(i))
+        if (B[i] == NoCand) {
+          B[i] = C;
+          Count++;
+        }
+      return Count;
+    }
+  };
+
+  /// Candidate info for for each PhysReg in AllocationOrder.
+  /// This vector never shrinks, but grows to the size of the largest register
+  /// class.
+  SmallVector<GlobalSplitCandidate, 32> GlobalCand;
+
+  enum { NoCand = ~0u };
+
+  /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to
+  /// NoCand which indicates the stack interval.
+  SmallVector<unsigned, 32> BundleCand;
+
+public:
+  RAGreedy();
+
+  /// Return the pass name.
+  virtual const char* getPassName() const {
+    return "Greedy Register Allocator";
+  }
+
+  /// RAGreedy analysis usage.
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  virtual void releaseMemory();
+  virtual Spiller &spiller() { return *SpillerInstance; }
+  virtual void enqueue(LiveInterval *LI);
+  virtual LiveInterval *dequeue();
+  virtual unsigned selectOrSplit(LiveInterval&,
+                                 SmallVectorImpl<LiveInterval*>&);
+
+  /// Perform register allocation.
+  virtual bool runOnMachineFunction(MachineFunction &mf);
+
+  static char ID;
+
+private:
+  bool LRE_CanEraseVirtReg(unsigned);
+  void LRE_WillShrinkVirtReg(unsigned);
+  void LRE_DidCloneVirtReg(unsigned, unsigned);
+
+  float calcSpillCost();
+  bool addSplitConstraints(InterferenceCache::Cursor, float&);
+  void addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>);
+  void growRegion(GlobalSplitCandidate &Cand);
+  float calcGlobalSplitCost(GlobalSplitCandidate&);
+  bool calcCompactRegion(GlobalSplitCandidate&);
+  void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>);
+  void calcGapWeights(unsigned, SmallVectorImpl<float>&);
+  bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool);
+  bool canEvictInterference(LiveInterval&, unsigned, bool, EvictionCost&);
+  void evictInterference(LiveInterval&, unsigned,
+                         SmallVectorImpl<LiveInterval*>&);
+
+  unsigned tryAssign(LiveInterval&, AllocationOrder&,
+                     SmallVectorImpl<LiveInterval*>&);
+  unsigned tryEvict(LiveInterval&, AllocationOrder&,
+                    SmallVectorImpl<LiveInterval*>&, unsigned = ~0u);
+  unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
+                          SmallVectorImpl<LiveInterval*>&);
+  unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
+                         SmallVectorImpl<LiveInterval*>&);
+  unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&,
+                               SmallVectorImpl<LiveInterval*>&);
+  unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
+    SmallVectorImpl<LiveInterval*>&);
+  unsigned trySplit(LiveInterval&, AllocationOrder&,
+                    SmallVectorImpl<LiveInterval*>&);
+};
+} // end anonymous namespace
+
+char RAGreedy::ID = 0;
+
+#ifndef NDEBUG
+const char *const RAGreedy::StageName[] = {
+    "RS_New",
+    "RS_Assign",
+    "RS_Split",
+    "RS_Split2",
+    "RS_Spill",
+    "RS_Done"
+};
+#endif
+
+// Hysteresis to use when comparing floats.
+// This helps stabilize decisions based on float comparisons.
+const float Hysteresis = 0.98f;
+
+
+FunctionPass* llvm::createGreedyRegisterAllocator() {
+  return new RAGreedy();
+}
+
+RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
+  initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry());
+  initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+  initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+  initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+  initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
+  initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
+  initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
+  initializeLiveStacksPass(*PassRegistry::getPassRegistry());
+  initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
+  initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
+  initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
+  initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry());
+  initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
+  initializeSpillPlacementPass(*PassRegistry::getPassRegistry());
+}
+
+void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<AliasAnalysis>();
+  AU.addPreserved<AliasAnalysis>();
+  AU.addRequired<LiveIntervals>();
+  AU.addPreserved<LiveIntervals>();
+  AU.addRequired<SlotIndexes>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequired<LiveDebugVariables>();
+  AU.addPreserved<LiveDebugVariables>();
+  AU.addRequired<LiveStacks>();
+  AU.addPreserved<LiveStacks>();
+  AU.addRequired<CalculateSpillWeights>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addRequired<VirtRegMap>();
+  AU.addPreserved<VirtRegMap>();
+  AU.addRequired<LiveRegMatrix>();
+  AU.addPreserved<LiveRegMatrix>();
+  AU.addRequired<EdgeBundles>();
+  AU.addRequired<SpillPlacement>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                     LiveRangeEdit delegate methods
+//===----------------------------------------------------------------------===//
+
+bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) {
+  if (VRM->hasPhys(VirtReg)) {
+    Matrix->unassign(LIS->getInterval(VirtReg));
+    return true;
+  }
+  // Unassigned virtreg is probably in the priority queue.
+  // RegAllocBase will erase it after dequeueing.
+  return false;
+}
+
+void RAGreedy::LRE_WillShrinkVirtReg(unsigned VirtReg) {
+  if (!VRM->hasPhys(VirtReg))
+    return;
+
+  // Register is assigned, put it back on the queue for reassignment.
+  LiveInterval &LI = LIS->getInterval(VirtReg);
+  Matrix->unassign(LI);
+  enqueue(&LI);
+}
+
+void RAGreedy::LRE_DidCloneVirtReg(unsigned New, unsigned Old) {
+  // Cloning a register we haven't even heard about yet?  Just ignore it.
+  if (!ExtraRegInfo.inBounds(Old))
+    return;
+
+  // LRE may clone a virtual register because dead code elimination causes it to
+  // be split into connected components. The new components are much smaller
+  // than the original, so they should get a new chance at being assigned.
+  // same stage as the parent.
+  ExtraRegInfo[Old].Stage = RS_Assign;
+  ExtraRegInfo.grow(New);
+  ExtraRegInfo[New] = ExtraRegInfo[Old];
+}
+
+void RAGreedy::releaseMemory() {
+  SpillerInstance.reset(0);
+  ExtraRegInfo.clear();
+  GlobalCand.clear();
+}
+
+void RAGreedy::enqueue(LiveInterval *LI) {
+  // Prioritize live ranges by size, assigning larger ranges first.
+  // The queue holds (size, reg) pairs.
+  const unsigned Size = LI->getSize();
+  const unsigned Reg = LI->reg;
+  assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+         "Can only enqueue virtual registers");
+  unsigned Prio;
+
+  ExtraRegInfo.grow(Reg);
+  if (ExtraRegInfo[Reg].Stage == RS_New)
+    ExtraRegInfo[Reg].Stage = RS_Assign;
+
+  if (ExtraRegInfo[Reg].Stage == RS_Split) {
+    // Unsplit ranges that couldn't be allocated immediately are deferred until
+    // everything else has been allocated.
+    Prio = Size;
+  } else {
+    // Everything is allocated in long->short order. Long ranges that don't fit
+    // should be spilled (or split) ASAP so they don't create interference.
+    Prio = (1u << 31) + Size;
+
+    // Boost ranges that have a physical register hint.
+    if (VRM->hasKnownPreference(Reg))
+      Prio |= (1u << 30);
+  }
+
+  Queue.push(std::make_pair(Prio, ~Reg));
+}
+
+LiveInterval *RAGreedy::dequeue() {
+  if (Queue.empty())
+    return 0;
+  LiveInterval *LI = &LIS->getInterval(~Queue.top().second);
+  Queue.pop();
+  return LI;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                            Direct Assignment
+//===----------------------------------------------------------------------===//
+
+/// tryAssign - Try to assign VirtReg to an available register.
+unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
+                             AllocationOrder &Order,
+                             SmallVectorImpl<LiveInterval*> &NewVRegs) {
+  Order.rewind();
+  unsigned PhysReg;
+  while ((PhysReg = Order.next()))
+    if (!Matrix->checkInterference(VirtReg, PhysReg))
+      break;
+  if (!PhysReg || Order.isHint())
+    return PhysReg;
+
+  // PhysReg is available, but there may be a better choice.
+
+  // If we missed a simple hint, try to cheaply evict interference from the
+  // preferred register.
+  if (unsigned Hint = MRI->getSimpleHint(VirtReg.reg))
+    if (Order.isHint(Hint)) {
+      DEBUG(dbgs() << "missed hint " << PrintReg(Hint, TRI) << '\n');
+      EvictionCost MaxCost(1);
+      if (canEvictInterference(VirtReg, Hint, true, MaxCost)) {
+        evictInterference(VirtReg, Hint, NewVRegs);
+        return Hint;
+      }
+    }
+
+  // Try to evict interference from a cheaper alternative.
+  unsigned Cost = TRI->getCostPerUse(PhysReg);
+
+  // Most registers have 0 additional cost.
+  if (!Cost)
+    return PhysReg;
+
+  DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is available at cost " << Cost
+               << '\n');
+  unsigned CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost);
+  return CheapReg ? CheapReg : PhysReg;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Interference eviction
+//===----------------------------------------------------------------------===//
+
+/// shouldEvict - determine if A should evict the assigned live range B. The
+/// eviction policy defined by this function together with the allocation order
+/// defined by enqueue() decides which registers ultimately end up being split
+/// and spilled.
+///
+/// Cascade numbers are used to prevent infinite loops if this function is a
+/// cyclic relation.
+///
+/// @param A          The live range to be assigned.
+/// @param IsHint     True when A is about to be assigned to its preferred
+///                   register.
+/// @param B          The live range to be evicted.
+/// @param BreaksHint True when B is already assigned to its preferred register.
+bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint,
+                           LiveInterval &B, bool BreaksHint) {
+  bool CanSplit = getStage(B) < RS_Spill;
+
+  // Be fairly aggressive about following hints as long as the evictee can be
+  // split.
+  if (CanSplit && IsHint && !BreaksHint)
+    return true;
+
+  return A.weight > B.weight;
+}
+
+/// canEvictInterference - Return true if all interferences between VirtReg and
+/// PhysReg can be evicted.  When OnlyCheap is set, don't do anything
+///
+/// @param VirtReg Live range that is about to be assigned.
+/// @param PhysReg Desired register for assignment.
+/// @param IsHint  True when PhysReg is VirtReg's preferred register.
+/// @param MaxCost Only look for cheaper candidates and update with new cost
+///                when returning true.
+/// @returns True when interference can be evicted cheaper than MaxCost.
+bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
+                                    bool IsHint, EvictionCost &MaxCost) {
+  // It is only possible to evict virtual register interference.
+  if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg)
+    return false;
+
+  // Find VirtReg's cascade number. This will be unassigned if VirtReg was never
+  // involved in an eviction before. If a cascade number was assigned, deny
+  // evicting anything with the same or a newer cascade number. This prevents
+  // infinite eviction loops.
+  //
+  // This works out so a register without a cascade number is allowed to evict
+  // anything, and it can be evicted by anything.
+  unsigned Cascade = ExtraRegInfo[VirtReg.reg].Cascade;
+  if (!Cascade)
+    Cascade = NextCascade;
+
+  EvictionCost Cost;
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+    // If there is 10 or more interferences, chances are one is heavier.
+    if (Q.collectInterferingVRegs(10) >= 10)
+      return false;
+
+    // Check if any interfering live range is heavier than MaxWeight.
+    for (unsigned i = Q.interferingVRegs().size(); i; --i) {
+      LiveInterval *Intf = Q.interferingVRegs()[i - 1];
+      assert(TargetRegisterInfo::isVirtualRegister(Intf->reg) &&
+             "Only expecting virtual register interference from query");
+      // Never evict spill products. They cannot split or spill.
+      if (getStage(*Intf) == RS_Done)
+        return false;
+      // Once a live range becomes small enough, it is urgent that we find a
+      // register for it. This is indicated by an infinite spill weight. These
+      // urgent live ranges get to evict almost anything.
+      //
+      // Also allow urgent evictions of unspillable ranges from a strictly
+      // larger allocation order.
+      bool Urgent = !VirtReg.isSpillable() &&
+        (Intf->isSpillable() ||
+         RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg)) <
+         RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(Intf->reg)));
+      // Only evict older cascades or live ranges without a cascade.
+      unsigned IntfCascade = ExtraRegInfo[Intf->reg].Cascade;
+      if (Cascade <= IntfCascade) {
+        if (!Urgent)
+          return false;
+        // We permit breaking cascades for urgent evictions. It should be the
+        // last resort, though, so make it really expensive.
+        Cost.BrokenHints += 10;
+      }
+      // Would this break a satisfied hint?
+      bool BreaksHint = VRM->hasPreferredPhys(Intf->reg);
+      // Update eviction cost.
+      Cost.BrokenHints += BreaksHint;
+      Cost.MaxWeight = std::max(Cost.MaxWeight, Intf->weight);
+      // Abort if this would be too expensive.
+      if (!(Cost < MaxCost))
+        return false;
+      // Finally, apply the eviction policy for non-urgent evictions.
+      if (!Urgent && !shouldEvict(VirtReg, IsHint, *Intf, BreaksHint))
+        return false;
+    }
+  }
+  MaxCost = Cost;
+  return true;
+}
+
+/// evictInterference - Evict any interferring registers that prevent VirtReg
+/// from being assigned to Physreg. This assumes that canEvictInterference
+/// returned true.
+void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
+                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+  // Make sure that VirtReg has a cascade number, and assign that cascade
+  // number to every evicted register. These live ranges than then only be
+  // evicted by a newer cascade, preventing infinite loops.
+  unsigned Cascade = ExtraRegInfo[VirtReg.reg].Cascade;
+  if (!Cascade)
+    Cascade = ExtraRegInfo[VirtReg.reg].Cascade = NextCascade++;
+
+  DEBUG(dbgs() << "evicting " << PrintReg(PhysReg, TRI)
+               << " interference: Cascade " << Cascade << '\n');
+
+  // Collect all interfering virtregs first.
+  SmallVector<LiveInterval*, 8> Intfs;
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+    assert(Q.seenAllInterferences() && "Didn't check all interfererences.");
+    ArrayRef<LiveInterval*> IVR = Q.interferingVRegs();
+    Intfs.append(IVR.begin(), IVR.end());
+  }
+
+  // Evict them second. This will invalidate the queries.
+  for (unsigned i = 0, e = Intfs.size(); i != e; ++i) {
+    LiveInterval *Intf = Intfs[i];
+    // The same VirtReg may be present in multiple RegUnits. Skip duplicates.
+    if (!VRM->hasPhys(Intf->reg))
+      continue;
+    Matrix->unassign(*Intf);
+    assert((ExtraRegInfo[Intf->reg].Cascade < Cascade ||
+            VirtReg.isSpillable() < Intf->isSpillable()) &&
+           "Cannot decrease cascade number, illegal eviction");
+    ExtraRegInfo[Intf->reg].Cascade = Cascade;
+    ++NumEvicted;
+    NewVRegs.push_back(Intf);
+  }
+}
+
+/// tryEvict - Try to evict all interferences for a physreg.
+/// @param  VirtReg Currently unassigned virtual register.
+/// @param  Order   Physregs to try.
+/// @return         Physreg to assign VirtReg, or 0.
+unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
+                            AllocationOrder &Order,
+                            SmallVectorImpl<LiveInterval*> &NewVRegs,
+                            unsigned CostPerUseLimit) {
+  NamedRegionTimer T("Evict", TimerGroupName, TimePassesIsEnabled);
+
+  // Keep track of the cheapest interference seen so far.
+  EvictionCost BestCost(~0u);
+  unsigned BestPhys = 0;
+  unsigned OrderLimit = Order.getOrder().size();
+
+  // When we are just looking for a reduced cost per use, don't break any
+  // hints, and only evict smaller spill weights.
+  if (CostPerUseLimit < ~0u) {
+    BestCost.BrokenHints = 0;
+    BestCost.MaxWeight = VirtReg.weight;
+
+    // Check of any registers in RC are below CostPerUseLimit.
+    const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg);
+    unsigned MinCost = RegClassInfo.getMinCost(RC);
+    if (MinCost >= CostPerUseLimit) {
+      DEBUG(dbgs() << RC->getName() << " minimum cost = " << MinCost
+                   << ", no cheaper registers to be found.\n");
+      return 0;
+    }
+
+    // It is normal for register classes to have a long tail of registers with
+    // the same cost. We don't need to look at them if they're too expensive.
+    if (TRI->getCostPerUse(Order.getOrder().back()) >= CostPerUseLimit) {
+      OrderLimit = RegClassInfo.getLastCostChange(RC);
+      DEBUG(dbgs() << "Only trying the first " << OrderLimit << " regs.\n");
+    }
+  }
+
+  Order.rewind();
+  while (unsigned PhysReg = Order.nextWithDups(OrderLimit)) {
+    if (TRI->getCostPerUse(PhysReg) >= CostPerUseLimit)
+      continue;
+    // The first use of a callee-saved register in a function has cost 1.
+    // Don't start using a CSR when the CostPerUseLimit is low.
+    if (CostPerUseLimit == 1)
+     if (unsigned CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg))
+       if (!MRI->isPhysRegUsed(CSR)) {
+         DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " would clobber CSR "
+                      << PrintReg(CSR, TRI) << '\n');
+         continue;
+       }
+
+    if (!canEvictInterference(VirtReg, PhysReg, false, BestCost))
+      continue;
+
+    // Best so far.
+    BestPhys = PhysReg;
+
+    // Stop if the hint can be used.
+    if (Order.isHint())
+      break;
+  }
+
+  if (!BestPhys)
+    return 0;
+
+  evictInterference(VirtReg, BestPhys, NewVRegs);
+  return BestPhys;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                              Region Splitting
+//===----------------------------------------------------------------------===//
+
+/// addSplitConstraints - Fill out the SplitConstraints vector based on the
+/// interference pattern in Physreg and its aliases. Add the constraints to
+/// SpillPlacement and return the static cost of this split in Cost, assuming
+/// that all preferences in SplitConstraints are met.
+/// Return false if there are no bundles with positive bias.
+bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
+                                   float &Cost) {
+  ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+
+  // Reset interference dependent info.
+  SplitConstraints.resize(UseBlocks.size());
+  float StaticCost = 0;
+  for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+    const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+    SpillPlacement::BlockConstraint &BC = SplitConstraints[i];
+
+    BC.Number = BI.MBB->getNumber();
+    Intf.moveToBlock(BC.Number);
+    BC.Entry = BI.LiveIn ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
+    BC.Exit = BI.LiveOut ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
+    BC.ChangesValue = BI.FirstDef;
+
+    if (!Intf.hasInterference())
+      continue;
+
+    // Number of spill code instructions to insert.
+    unsigned Ins = 0;
+
+    // Interference for the live-in value.
+    if (BI.LiveIn) {
+      if (Intf.first() <= Indexes->getMBBStartIdx(BC.Number))
+        BC.Entry = SpillPlacement::MustSpill, ++Ins;
+      else if (Intf.first() < BI.FirstInstr)
+        BC.Entry = SpillPlacement::PrefSpill, ++Ins;
+      else if (Intf.first() < BI.LastInstr)
+        ++Ins;
+    }
+
+    // Interference for the live-out value.
+    if (BI.LiveOut) {
+      if (Intf.last() >= SA->getLastSplitPoint(BC.Number))
+        BC.Exit = SpillPlacement::MustSpill, ++Ins;
+      else if (Intf.last() > BI.LastInstr)
+        BC.Exit = SpillPlacement::PrefSpill, ++Ins;
+      else if (Intf.last() > BI.FirstInstr)
+        ++Ins;
+    }
+
+    // Accumulate the total frequency of inserted spill code.
+    if (Ins)
+      StaticCost += Ins * SpillPlacer->getBlockFrequency(BC.Number);
+  }
+  Cost = StaticCost;
+
+  // Add constraints for use-blocks. Note that these are the only constraints
+  // that may add a positive bias, it is downhill from here.
+  SpillPlacer->addConstraints(SplitConstraints);
+  return SpillPlacer->scanActiveBundles();
+}
+
+
+/// addThroughConstraints - Add constraints and links to SpillPlacer from the
+/// live-through blocks in Blocks.
+void RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
+                                     ArrayRef<unsigned> Blocks) {
+  const unsigned GroupSize = 8;
+  SpillPlacement::BlockConstraint BCS[GroupSize];
+  unsigned TBS[GroupSize];
+  unsigned B = 0, T = 0;
+
+  for (unsigned i = 0; i != Blocks.size(); ++i) {
+    unsigned Number = Blocks[i];
+    Intf.moveToBlock(Number);
+
+    if (!Intf.hasInterference()) {
+      assert(T < GroupSize && "Array overflow");
+      TBS[T] = Number;
+      if (++T == GroupSize) {
+        SpillPlacer->addLinks(makeArrayRef(TBS, T));
+        T = 0;
+      }
+      continue;
+    }
+
+    assert(B < GroupSize && "Array overflow");
+    BCS[B].Number = Number;
+
+    // Interference for the live-in value.
+    if (Intf.first() <= Indexes->getMBBStartIdx(Number))
+      BCS[B].Entry = SpillPlacement::MustSpill;
+    else
+      BCS[B].Entry = SpillPlacement::PrefSpill;
+
+    // Interference for the live-out value.
+    if (Intf.last() >= SA->getLastSplitPoint(Number))
+      BCS[B].Exit = SpillPlacement::MustSpill;
+    else
+      BCS[B].Exit = SpillPlacement::PrefSpill;
+
+    if (++B == GroupSize) {
+      ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B);
+      SpillPlacer->addConstraints(Array);
+      B = 0;
+    }
+  }
+
+  ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B);
+  SpillPlacer->addConstraints(Array);
+  SpillPlacer->addLinks(makeArrayRef(TBS, T));
+}
+
+void RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
+  // Keep track of through blocks that have not been added to SpillPlacer.
+  BitVector Todo = SA->getThroughBlocks();
+  SmallVectorImpl<unsigned> &ActiveBlocks = Cand.ActiveBlocks;
+  unsigned AddedTo = 0;
+#ifndef NDEBUG
+  unsigned Visited = 0;
+#endif
+
+  for (;;) {
+    ArrayRef<unsigned> NewBundles = SpillPlacer->getRecentPositive();
+    // Find new through blocks in the periphery of PrefRegBundles.
+    for (int i = 0, e = NewBundles.size(); i != e; ++i) {
+      unsigned Bundle = NewBundles[i];
+      // Look at all blocks connected to Bundle in the full graph.
+      ArrayRef<unsigned> Blocks = Bundles->getBlocks(Bundle);
+      for (ArrayRef<unsigned>::iterator I = Blocks.begin(), E = Blocks.end();
+           I != E; ++I) {
+        unsigned Block = *I;
+        if (!Todo.test(Block))
+          continue;
+        Todo.reset(Block);
+        // This is a new through block. Add it to SpillPlacer later.
+        ActiveBlocks.push_back(Block);
+#ifndef NDEBUG
+        ++Visited;
+#endif
+      }
+    }
+    // Any new blocks to add?
+    if (ActiveBlocks.size() == AddedTo)
+      break;
+
+    // Compute through constraints from the interference, or assume that all
+    // through blocks prefer spilling when forming compact regions.
+    ArrayRef<unsigned> NewBlocks = makeArrayRef(ActiveBlocks).slice(AddedTo);
+    if (Cand.PhysReg)
+      addThroughConstraints(Cand.Intf, NewBlocks);
+    else
+      // Provide a strong negative bias on through blocks to prevent unwanted
+      // liveness on loop backedges.
+      SpillPlacer->addPrefSpill(NewBlocks, /* Strong= */ true);
+    AddedTo = ActiveBlocks.size();
+
+    // Perhaps iterating can enable more bundles?
+    SpillPlacer->iterate();
+  }
+  DEBUG(dbgs() << ", v=" << Visited);
+}
+
+/// calcCompactRegion - Compute the set of edge bundles that should be live
+/// when splitting the current live range into compact regions.  Compact
+/// regions can be computed without looking at interference.  They are the
+/// regions formed by removing all the live-through blocks from the live range.
+///
+/// Returns false if the current live range is already compact, or if the
+/// compact regions would form single block regions anyway.
+bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
+  // Without any through blocks, the live range is already compact.
+  if (!SA->getNumThroughBlocks())
+    return false;
+
+  // Compact regions don't correspond to any physreg.
+  Cand.reset(IntfCache, 0);
+
+  DEBUG(dbgs() << "Compact region bundles");
+
+  // Use the spill placer to determine the live bundles. GrowRegion pretends
+  // that all the through blocks have interference when PhysReg is unset.
+  SpillPlacer->prepare(Cand.LiveBundles);
+
+  // The static split cost will be zero since Cand.Intf reports no interference.
+  float Cost;
+  if (!addSplitConstraints(Cand.Intf, Cost)) {
+    DEBUG(dbgs() << ", none.\n");
+    return false;
+  }
+
+  growRegion(Cand);
+  SpillPlacer->finish();
+
+  if (!Cand.LiveBundles.any()) {
+    DEBUG(dbgs() << ", none.\n");
+    return false;
+  }
+
+  DEBUG({
+    for (int i = Cand.LiveBundles.find_first(); i>=0;
+         i = Cand.LiveBundles.find_next(i))
+    dbgs() << " EB#" << i;
+    dbgs() << ".\n";
+  });
+  return true;
+}
+
+/// calcSpillCost - Compute how expensive it would be to split the live range in
+/// SA around all use blocks instead of forming bundle regions.
+float RAGreedy::calcSpillCost() {
+  float Cost = 0;
+  ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+  for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+    const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+    unsigned Number = BI.MBB->getNumber();
+    // We normally only need one spill instruction - a load or a store.
+    Cost += SpillPlacer->getBlockFrequency(Number);
+
+    // Unless the value is redefined in the block.
+    if (BI.LiveIn && BI.LiveOut && BI.FirstDef)
+      Cost += SpillPlacer->getBlockFrequency(Number);
+  }
+  return Cost;
+}
+
+/// calcGlobalSplitCost - Return the global split cost of following the split
+/// pattern in LiveBundles. This cost should be added to the local cost of the
+/// interference pattern in SplitConstraints.
+///
+float RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) {
+  float GlobalCost = 0;
+  const BitVector &LiveBundles = Cand.LiveBundles;
+  ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+  for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+    const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+    SpillPlacement::BlockConstraint &BC = SplitConstraints[i];
+    bool RegIn  = LiveBundles[Bundles->getBundle(BC.Number, 0)];
+    bool RegOut = LiveBundles[Bundles->getBundle(BC.Number, 1)];
+    unsigned Ins = 0;
+
+    if (BI.LiveIn)
+      Ins += RegIn != (BC.Entry == SpillPlacement::PrefReg);
+    if (BI.LiveOut)
+      Ins += RegOut != (BC.Exit == SpillPlacement::PrefReg);
+    if (Ins)
+      GlobalCost += Ins * SpillPlacer->getBlockFrequency(BC.Number);
+  }
+
+  for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) {
+    unsigned Number = Cand.ActiveBlocks[i];
+    bool RegIn  = LiveBundles[Bundles->getBundle(Number, 0)];
+    bool RegOut = LiveBundles[Bundles->getBundle(Number, 1)];
+    if (!RegIn && !RegOut)
+      continue;
+    if (RegIn && RegOut) {
+      // We need double spill code if this block has interference.
+      Cand.Intf.moveToBlock(Number);
+      if (Cand.Intf.hasInterference())
+        GlobalCost += 2*SpillPlacer->getBlockFrequency(Number);
+      continue;
+    }
+    // live-in / stack-out or stack-in live-out.
+    GlobalCost += SpillPlacer->getBlockFrequency(Number);
+  }
+  return GlobalCost;
+}
+
+/// splitAroundRegion - Split the current live range around the regions
+/// determined by BundleCand and GlobalCand.
+///
+/// Before calling this function, GlobalCand and BundleCand must be initialized
+/// so each bundle is assigned to a valid candidate, or NoCand for the
+/// stack-bound bundles.  The shared SA/SE SplitAnalysis and SplitEditor
+/// objects must be initialized for the current live range, and intervals
+/// created for the used candidates.
+///
+/// @param LREdit    The LiveRangeEdit object handling the current split.
+/// @param UsedCands List of used GlobalCand entries. Every BundleCand value
+///                  must appear in this list.
+void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
+                                 ArrayRef<unsigned> UsedCands) {
+  // These are the intervals created for new global ranges. We may create more
+  // intervals for local ranges.
+  const unsigned NumGlobalIntvs = LREdit.size();
+  DEBUG(dbgs() << "splitAroundRegion with " << NumGlobalIntvs << " globals.\n");
+  assert(NumGlobalIntvs && "No global intervals configured");
+
+  // Isolate even single instructions when dealing with a proper sub-class.
+  // That guarantees register class inflation for the stack interval because it
+  // is all copies.
+  unsigned Reg = SA->getParent().reg;
+  bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
+
+  // First handle all the blocks with uses.
+  ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+  for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+    const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+    unsigned Number = BI.MBB->getNumber();
+    unsigned IntvIn = 0, IntvOut = 0;
+    SlotIndex IntfIn, IntfOut;
+    if (BI.LiveIn) {
+      unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)];
+      if (CandIn != NoCand) {
+        GlobalSplitCandidate &Cand = GlobalCand[CandIn];
+        IntvIn = Cand.IntvIdx;
+        Cand.Intf.moveToBlock(Number);
+        IntfIn = Cand.Intf.first();
+      }
+    }
+    if (BI.LiveOut) {
+      unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)];
+      if (CandOut != NoCand) {
+        GlobalSplitCandidate &Cand = GlobalCand[CandOut];
+        IntvOut = Cand.IntvIdx;
+        Cand.Intf.moveToBlock(Number);
+        IntfOut = Cand.Intf.last();
+      }
+    }
+
+    // Create separate intervals for isolated blocks with multiple uses.
+    if (!IntvIn && !IntvOut) {
+      DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " isolated.\n");
+      if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
+        SE->splitSingleBlock(BI);
+      continue;
+    }
+
+    if (IntvIn && IntvOut)
+      SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
+    else if (IntvIn)
+      SE->splitRegInBlock(BI, IntvIn, IntfIn);
+    else
+      SE->splitRegOutBlock(BI, IntvOut, IntfOut);
+  }
+
+  // Handle live-through blocks. The relevant live-through blocks are stored in
+  // the ActiveBlocks list with each candidate. We need to filter out
+  // duplicates.
+  BitVector Todo = SA->getThroughBlocks();
+  for (unsigned c = 0; c != UsedCands.size(); ++c) {
+    ArrayRef<unsigned> Blocks = GlobalCand[UsedCands[c]].ActiveBlocks;
+    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+      unsigned Number = Blocks[i];
+      if (!Todo.test(Number))
+        continue;
+      Todo.reset(Number);
+
+      unsigned IntvIn = 0, IntvOut = 0;
+      SlotIndex IntfIn, IntfOut;
+
+      unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)];
+      if (CandIn != NoCand) {
+        GlobalSplitCandidate &Cand = GlobalCand[CandIn];
+        IntvIn = Cand.IntvIdx;
+        Cand.Intf.moveToBlock(Number);
+        IntfIn = Cand.Intf.first();
+      }
+
+      unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)];
+      if (CandOut != NoCand) {
+        GlobalSplitCandidate &Cand = GlobalCand[CandOut];
+        IntvOut = Cand.IntvIdx;
+        Cand.Intf.moveToBlock(Number);
+        IntfOut = Cand.Intf.last();
+      }
+      if (!IntvIn && !IntvOut)
+        continue;
+      SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
+    }
+  }
+
+  ++NumGlobalSplits;
+
+  SmallVector<unsigned, 8> IntvMap;
+  SE->finish(&IntvMap);
+  DebugVars->splitRegister(Reg, LREdit.regs());
+
+  ExtraRegInfo.resize(MRI->getNumVirtRegs());
+  unsigned OrigBlocks = SA->getNumLiveBlocks();
+
+  // Sort out the new intervals created by splitting. We get four kinds:
+  // - Remainder intervals should not be split again.
+  // - Candidate intervals can be assigned to Cand.PhysReg.
+  // - Block-local splits are candidates for local splitting.
+  // - DCE leftovers should go back on the queue.
+  for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
+    LiveInterval &Reg = *LREdit.get(i);
+
+    // Ignore old intervals from DCE.
+    if (getStage(Reg) != RS_New)
+      continue;
+
+    // Remainder interval. Don't try splitting again, spill if it doesn't
+    // allocate.
+    if (IntvMap[i] == 0) {
+      setStage(Reg, RS_Spill);
+      continue;
+    }
+
+    // Global intervals. Allow repeated splitting as long as the number of live
+    // blocks is strictly decreasing.
+    if (IntvMap[i] < NumGlobalIntvs) {
+      if (SA->countLiveBlocks(&Reg) >= OrigBlocks) {
+        DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks
+                     << " blocks as original.\n");
+        // Don't allow repeated splitting as a safe guard against looping.
+        setStage(Reg, RS_Split2);
+      }
+      continue;
+    }
+
+    // Other intervals are treated as new. This includes local intervals created
+    // for blocks with multiple uses, and anything created by DCE.
+  }
+
+  if (VerifyEnabled)
+    MF->verify(this, "After splitting live range around region");
+}
+
+unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+                                  SmallVectorImpl<LiveInterval*> &NewVRegs) {
+  unsigned NumCands = 0;
+  unsigned BestCand = NoCand;
+  float BestCost;
+  SmallVector<unsigned, 8> UsedCands;
+
+  // Check if we can split this live range around a compact region.
+  bool HasCompact = calcCompactRegion(GlobalCand.front());
+  if (HasCompact) {
+    // Yes, keep GlobalCand[0] as the compact region candidate.
+    NumCands = 1;
+    BestCost = HUGE_VALF;
+  } else {
+    // No benefit from the compact region, our fallback will be per-block
+    // splitting. Make sure we find a solution that is cheaper than spilling.
+    BestCost = Hysteresis * calcSpillCost();
+    DEBUG(dbgs() << "Cost of isolating all blocks = " << BestCost << '\n');
+  }
+
+  Order.rewind();
+  while (unsigned PhysReg = Order.next()) {
+    // Discard bad candidates before we run out of interference cache cursors.
+    // This will only affect register classes with a lot of registers (>32).
+    if (NumCands == IntfCache.getMaxCursors()) {
+      unsigned WorstCount = ~0u;
+      unsigned Worst = 0;
+      for (unsigned i = 0; i != NumCands; ++i) {
+        if (i == BestCand || !GlobalCand[i].PhysReg)
+          continue;
+        unsigned Count = GlobalCand[i].LiveBundles.count();
+        if (Count < WorstCount)
+          Worst = i, WorstCount = Count;
+      }
+      --NumCands;
+      GlobalCand[Worst] = GlobalCand[NumCands];
+      if (BestCand == NumCands)
+        BestCand = Worst;
+    }
+
+    if (GlobalCand.size() <= NumCands)
+      GlobalCand.resize(NumCands+1);
+    GlobalSplitCandidate &Cand = GlobalCand[NumCands];
+    Cand.reset(IntfCache, PhysReg);
+
+    SpillPlacer->prepare(Cand.LiveBundles);
+    float Cost;
+    if (!addSplitConstraints(Cand.Intf, Cost)) {
+      DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tno positive bundles\n");
+      continue;
+    }
+    DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tstatic = " << Cost);
+    if (Cost >= BestCost) {
+      DEBUG({
+        if (BestCand == NoCand)
+          dbgs() << " worse than no bundles\n";
+        else
+          dbgs() << " worse than "
+                 << PrintReg(GlobalCand[BestCand].PhysReg, TRI) << '\n';
+      });
+      continue;
+    }
+    growRegion(Cand);
+
+    SpillPlacer->finish();
+
+    // No live bundles, defer to splitSingleBlocks().
+    if (!Cand.LiveBundles.any()) {
+      DEBUG(dbgs() << " no bundles.\n");
+      continue;
+    }
+
+    Cost += calcGlobalSplitCost(Cand);
+    DEBUG({
+      dbgs() << ", total = " << Cost << " with bundles";
+      for (int i = Cand.LiveBundles.find_first(); i>=0;
+           i = Cand.LiveBundles.find_next(i))
+        dbgs() << " EB#" << i;
+      dbgs() << ".\n";
+    });
+    if (Cost < BestCost) {
+      BestCand = NumCands;
+      BestCost = Hysteresis * Cost; // Prevent rounding effects.
+    }
+    ++NumCands;
+  }
+
+  // No solutions found, fall back to single block splitting.
+  if (!HasCompact && BestCand == NoCand)
+    return 0;
+
+  // Prepare split editor.
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  SE->reset(LREdit, SplitSpillMode);
+
+  // Assign all edge bundles to the preferred candidate, or NoCand.
+  BundleCand.assign(Bundles->getNumBundles(), NoCand);
+
+  // Assign bundles for the best candidate region.
+  if (BestCand != NoCand) {
+    GlobalSplitCandidate &Cand = GlobalCand[BestCand];
+    if (unsigned B = Cand.getBundles(BundleCand, BestCand)) {
+      UsedCands.push_back(BestCand);
+      Cand.IntvIdx = SE->openIntv();
+      DEBUG(dbgs() << "Split for " << PrintReg(Cand.PhysReg, TRI) << " in "
+                   << B << " bundles, intv " << Cand.IntvIdx << ".\n");
+      (void)B;
+    }
+  }
+
+  // Assign bundles for the compact region.
+  if (HasCompact) {
+    GlobalSplitCandidate &Cand = GlobalCand.front();
+    assert(!Cand.PhysReg && "Compact region has no physreg");
+    if (unsigned B = Cand.getBundles(BundleCand, 0)) {
+      UsedCands.push_back(0);
+      Cand.IntvIdx = SE->openIntv();
+      DEBUG(dbgs() << "Split for compact region in " << B << " bundles, intv "
+                   << Cand.IntvIdx << ".\n");
+      (void)B;
+    }
+  }
+
+  splitAroundRegion(LREdit, UsedCands);
+  return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                            Per-Block Splitting
+//===----------------------------------------------------------------------===//
+
+/// tryBlockSplit - Split a global live range around every block with uses. This
+/// creates a lot of local live ranges, that will be split by tryLocalSplit if
+/// they don't allocate.
+unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+  assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
+  unsigned Reg = VirtReg.reg;
+  bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  SE->reset(LREdit, SplitSpillMode);
+  ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+  for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+    const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+    if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
+      SE->splitSingleBlock(BI);
+  }
+  // No blocks were split.
+  if (LREdit.empty())
+    return 0;
+
+  // We did split for some blocks.
+  SmallVector<unsigned, 8> IntvMap;
+  SE->finish(&IntvMap);
+
+  // Tell LiveDebugVariables about the new ranges.
+  DebugVars->splitRegister(Reg, LREdit.regs());
+
+  ExtraRegInfo.resize(MRI->getNumVirtRegs());
+
+  // Sort out the new intervals created by splitting. The remainder interval
+  // goes straight to spilling, the new local ranges get to stay RS_New.
+  for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
+    LiveInterval &LI = *LREdit.get(i);
+    if (getStage(LI) == RS_New && IntvMap[i] == 0)
+      setStage(LI, RS_Spill);
+  }
+
+  if (VerifyEnabled)
+    MF->verify(this, "After splitting live range around basic blocks");
+  return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Per-Instruction Splitting
+//===----------------------------------------------------------------------===//
+
+/// tryInstructionSplit - Split a live range around individual instructions.
+/// This is normally not worthwhile since the spiller is doing essentially the
+/// same thing. However, when the live range is in a constrained register
+/// class, it may help to insert copies such that parts of the live range can
+/// be moved to a larger register class.
+///
+/// This is similar to spilling to a larger register class.
+unsigned
+RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+                              SmallVectorImpl<LiveInterval*> &NewVRegs) {
+  // There is no point to this if there are no larger sub-classes.
+  if (!RegClassInfo.isProperSubClass(MRI->getRegClass(VirtReg.reg)))
+    return 0;
+
+  // Always enable split spill mode, since we're effectively spilling to a
+  // register.
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  SE->reset(LREdit, SplitEditor::SM_Size);
+
+  ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+  if (Uses.size() <= 1)
+    return 0;
+
+  DEBUG(dbgs() << "Split around " << Uses.size() << " individual instrs.\n");
+
+  // Split around every non-copy instruction.
+  for (unsigned i = 0; i != Uses.size(); ++i) {
+    if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Uses[i]))
+      if (MI->isFullCopy()) {
+        DEBUG(dbgs() << "    skip:\t" << Uses[i] << '\t' << *MI);
+        continue;
+      }
+    SE->openIntv();
+    SlotIndex SegStart = SE->enterIntvBefore(Uses[i]);
+    SlotIndex SegStop  = SE->leaveIntvAfter(Uses[i]);
+    SE->useIntv(SegStart, SegStop);
+  }
+
+  if (LREdit.empty()) {
+    DEBUG(dbgs() << "All uses were copies.\n");
+    return 0;
+  }
+
+  SmallVector<unsigned, 8> IntvMap;
+  SE->finish(&IntvMap);
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+  ExtraRegInfo.resize(MRI->getNumVirtRegs());
+
+  // Assign all new registers to RS_Spill. This was the last chance.
+  setStage(LREdit.begin(), LREdit.end(), RS_Spill);
+  return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                             Local Splitting
+//===----------------------------------------------------------------------===//
+
+
+/// calcGapWeights - Compute the maximum spill weight that needs to be evicted
+/// in order to use PhysReg between two entries in SA->UseSlots.
+///
+/// GapWeight[i] represents the gap between UseSlots[i] and UseSlots[i+1].
+///
+void RAGreedy::calcGapWeights(unsigned PhysReg,
+                              SmallVectorImpl<float> &GapWeight) {
+  assert(SA->getUseBlocks().size() == 1 && "Not a local interval");
+  const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
+  ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+  const unsigned NumGaps = Uses.size()-1;
+
+  // Start and end points for the interference check.
+  SlotIndex StartIdx =
+    BI.LiveIn ? BI.FirstInstr.getBaseIndex() : BI.FirstInstr;
+  SlotIndex StopIdx =
+    BI.LiveOut ? BI.LastInstr.getBoundaryIndex() : BI.LastInstr;
+
+  GapWeight.assign(NumGaps, 0.0f);
+
+  // Add interference from each overlapping register.
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    if (!Matrix->query(const_cast<LiveInterval&>(SA->getParent()), *Units)
+          .checkInterference())
+      continue;
+
+    // We know that VirtReg is a continuous interval from FirstInstr to
+    // LastInstr, so we don't need InterferenceQuery.
+    //
+    // Interference that overlaps an instruction is counted in both gaps
+    // surrounding the instruction. The exception is interference before
+    // StartIdx and after StopIdx.
+    //
+    LiveIntervalUnion::SegmentIter IntI =
+      Matrix->getLiveUnions()[*Units] .find(StartIdx);
+    for (unsigned Gap = 0; IntI.valid() && IntI.start() < StopIdx; ++IntI) {
+      // Skip the gaps before IntI.
+      while (Uses[Gap+1].getBoundaryIndex() < IntI.start())
+        if (++Gap == NumGaps)
+          break;
+      if (Gap == NumGaps)
+        break;
+
+      // Update the gaps covered by IntI.
+      const float weight = IntI.value()->weight;
+      for (; Gap != NumGaps; ++Gap) {
+        GapWeight[Gap] = std::max(GapWeight[Gap], weight);
+        if (Uses[Gap+1].getBaseIndex() >= IntI.stop())
+          break;
+      }
+      if (Gap == NumGaps)
+        break;
+    }
+  }
+
+  // Add fixed interference.
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    const LiveInterval &LI = LIS->getRegUnit(*Units);
+    LiveInterval::const_iterator I = LI.find(StartIdx);
+    LiveInterval::const_iterator E = LI.end();
+
+    // Same loop as above. Mark any overlapped gaps as HUGE_VALF.
+    for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) {
+      while (Uses[Gap+1].getBoundaryIndex() < I->start)
+        if (++Gap == NumGaps)
+          break;
+      if (Gap == NumGaps)
+        break;
+
+      for (; Gap != NumGaps; ++Gap) {
+        GapWeight[Gap] = HUGE_VALF;
+        if (Uses[Gap+1].getBaseIndex() >= I->end)
+          break;
+      }
+      if (Gap == NumGaps)
+        break;
+    }
+  }
+}
+
+/// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only
+/// basic block.
+///
+unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+  assert(SA->getUseBlocks().size() == 1 && "Not a local interval");
+  const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
+
+  // Note that it is possible to have an interval that is live-in or live-out
+  // while only covering a single block - A phi-def can use undef values from
+  // predecessors, and the block could be a single-block loop.
+  // We don't bother doing anything clever about such a case, we simply assume
+  // that the interval is continuous from FirstInstr to LastInstr. We should
+  // make sure that we don't do anything illegal to such an interval, though.
+
+  ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+  if (Uses.size() <= 2)
+    return 0;
+  const unsigned NumGaps = Uses.size()-1;
+
+  DEBUG({
+    dbgs() << "tryLocalSplit: ";
+    for (unsigned i = 0, e = Uses.size(); i != e; ++i)
+      dbgs() << ' ' << Uses[i];
+    dbgs() << '\n';
+  });
+
+  // If VirtReg is live across any register mask operands, compute a list of
+  // gaps with register masks.
+  SmallVector<unsigned, 8> RegMaskGaps;
+  if (Matrix->checkRegMaskInterference(VirtReg)) {
+    // Get regmask slots for the whole block.
+    ArrayRef<SlotIndex> RMS = LIS->getRegMaskSlotsInBlock(BI.MBB->getNumber());
+    DEBUG(dbgs() << RMS.size() << " regmasks in block:");
+    // Constrain to VirtReg's live range.
+    unsigned ri = std::lower_bound(RMS.begin(), RMS.end(),
+                                   Uses.front().getRegSlot()) - RMS.begin();
+    unsigned re = RMS.size();
+    for (unsigned i = 0; i != NumGaps && ri != re; ++i) {
+      // Look for Uses[i] <= RMS <= Uses[i+1].
+      assert(!SlotIndex::isEarlierInstr(RMS[ri], Uses[i]));
+      if (SlotIndex::isEarlierInstr(Uses[i+1], RMS[ri]))
+        continue;
+      // Skip a regmask on the same instruction as the last use. It doesn't
+      // overlap the live range.
+      if (SlotIndex::isSameInstr(Uses[i+1], RMS[ri]) && i+1 == NumGaps)
+        break;
+      DEBUG(dbgs() << ' ' << RMS[ri] << ':' << Uses[i] << '-' << Uses[i+1]);
+      RegMaskGaps.push_back(i);
+      // Advance ri to the next gap. A regmask on one of the uses counts in
+      // both gaps.
+      while (ri != re && SlotIndex::isEarlierInstr(RMS[ri], Uses[i+1]))
+        ++ri;
+    }
+    DEBUG(dbgs() << '\n');
+  }
+
+  // Since we allow local split results to be split again, there is a risk of
+  // creating infinite loops. It is tempting to require that the new live
+  // ranges have less instructions than the original. That would guarantee
+  // convergence, but it is too strict. A live range with 3 instructions can be
+  // split 2+3 (including the COPY), and we want to allow that.
+  //
+  // Instead we use these rules:
+  //
+  // 1. Allow any split for ranges with getStage() < RS_Split2. (Except for the
+  //    noop split, of course).
+  // 2. Require progress be made for ranges with getStage() == RS_Split2. All
+  //    the new ranges must have fewer instructions than before the split.
+  // 3. New ranges with the same number of instructions are marked RS_Split2,
+  //    smaller ranges are marked RS_New.
+  //
+  // These rules allow a 3 -> 2+3 split once, which we need. They also prevent
+  // excessive splitting and infinite loops.
+  //
+  bool ProgressRequired = getStage(VirtReg) >= RS_Split2;
+
+  // Best split candidate.
+  unsigned BestBefore = NumGaps;
+  unsigned BestAfter = 0;
+  float BestDiff = 0;
+
+  const float blockFreq = SpillPlacer->getBlockFrequency(BI.MBB->getNumber());
+  SmallVector<float, 8> GapWeight;
+
+  Order.rewind();
+  while (unsigned PhysReg = Order.next()) {
+    // Keep track of the largest spill weight that would need to be evicted in
+    // order to make use of PhysReg between UseSlots[i] and UseSlots[i+1].
+    calcGapWeights(PhysReg, GapWeight);
+
+    // Remove any gaps with regmask clobbers.
+    if (Matrix->checkRegMaskInterference(VirtReg, PhysReg))
+      for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i)
+        GapWeight[RegMaskGaps[i]] = HUGE_VALF;
+
+    // Try to find the best sequence of gaps to close.
+    // The new spill weight must be larger than any gap interference.
+
+    // We will split before Uses[SplitBefore] and after Uses[SplitAfter].
+    unsigned SplitBefore = 0, SplitAfter = 1;
+
+    // MaxGap should always be max(GapWeight[SplitBefore..SplitAfter-1]).
+    // It is the spill weight that needs to be evicted.
+    float MaxGap = GapWeight[0];
+
+    for (;;) {
+      // Live before/after split?
+      const bool LiveBefore = SplitBefore != 0 || BI.LiveIn;
+      const bool LiveAfter = SplitAfter != NumGaps || BI.LiveOut;
+
+      DEBUG(dbgs() << PrintReg(PhysReg, TRI) << ' '
+                   << Uses[SplitBefore] << '-' << Uses[SplitAfter]
+                   << " i=" << MaxGap);
+
+      // Stop before the interval gets so big we wouldn't be making progress.
+      if (!LiveBefore && !LiveAfter) {
+        DEBUG(dbgs() << " all\n");
+        break;
+      }
+      // Should the interval be extended or shrunk?
+      bool Shrink = true;
+
+      // How many gaps would the new range have?
+      unsigned NewGaps = LiveBefore + SplitAfter - SplitBefore + LiveAfter;
+
+      // Legally, without causing looping?
+      bool Legal = !ProgressRequired || NewGaps < NumGaps;
+
+      if (Legal && MaxGap < HUGE_VALF) {
+        // Estimate the new spill weight. Each instruction reads or writes the
+        // register. Conservatively assume there are no read-modify-write
+        // instructions.
+        //
+        // Try to guess the size of the new interval.
+        const float EstWeight = normalizeSpillWeight(blockFreq * (NewGaps + 1),
+                                 Uses[SplitBefore].distance(Uses[SplitAfter]) +
+                                 (LiveBefore + LiveAfter)*SlotIndex::InstrDist);
+        // Would this split be possible to allocate?
+        // Never allocate all gaps, we wouldn't be making progress.
+        DEBUG(dbgs() << " w=" << EstWeight);
+        if (EstWeight * Hysteresis >= MaxGap) {
+          Shrink = false;
+          float Diff = EstWeight - MaxGap;
+          if (Diff > BestDiff) {
+            DEBUG(dbgs() << " (best)");
+            BestDiff = Hysteresis * Diff;
+            BestBefore = SplitBefore;
+            BestAfter = SplitAfter;
+          }
+        }
+      }
+
+      // Try to shrink.
+      if (Shrink) {
+        if (++SplitBefore < SplitAfter) {
+          DEBUG(dbgs() << " shrink\n");
+          // Recompute the max when necessary.
+          if (GapWeight[SplitBefore - 1] >= MaxGap) {
+            MaxGap = GapWeight[SplitBefore];
+            for (unsigned i = SplitBefore + 1; i != SplitAfter; ++i)
+              MaxGap = std::max(MaxGap, GapWeight[i]);
+          }
+          continue;
+        }
+        MaxGap = 0;
+      }
+
+      // Try to extend the interval.
+      if (SplitAfter >= NumGaps) {
+        DEBUG(dbgs() << " end\n");
+        break;
+      }
+
+      DEBUG(dbgs() << " extend\n");
+      MaxGap = std::max(MaxGap, GapWeight[SplitAfter++]);
+    }
+  }
+
+  // Didn't find any candidates?
+  if (BestBefore == NumGaps)
+    return 0;
+
+  DEBUG(dbgs() << "Best local split range: " << Uses[BestBefore]
+               << '-' << Uses[BestAfter] << ", " << BestDiff
+               << ", " << (BestAfter - BestBefore + 1) << " instrs\n");
+
+  LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  SE->reset(LREdit);
+
+  SE->openIntv();
+  SlotIndex SegStart = SE->enterIntvBefore(Uses[BestBefore]);
+  SlotIndex SegStop  = SE->leaveIntvAfter(Uses[BestAfter]);
+  SE->useIntv(SegStart, SegStop);
+  SmallVector<unsigned, 8> IntvMap;
+  SE->finish(&IntvMap);
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+
+  // If the new range has the same number of instructions as before, mark it as
+  // RS_Split2 so the next split will be forced to make progress. Otherwise,
+  // leave the new intervals as RS_New so they can compete.
+  bool LiveBefore = BestBefore != 0 || BI.LiveIn;
+  bool LiveAfter = BestAfter != NumGaps || BI.LiveOut;
+  unsigned NewGaps = LiveBefore + BestAfter - BestBefore + LiveAfter;
+  if (NewGaps >= NumGaps) {
+    DEBUG(dbgs() << "Tagging non-progress ranges: ");
+    assert(!ProgressRequired && "Didn't make progress when it was required.");
+    for (unsigned i = 0, e = IntvMap.size(); i != e; ++i)
+      if (IntvMap[i] == 1) {
+        setStage(*LREdit.get(i), RS_Split2);
+        DEBUG(dbgs() << PrintReg(LREdit.get(i)->reg));
+      }
+    DEBUG(dbgs() << '\n');
+  }
+  ++NumLocalSplits;
+
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+//                          Live Range Splitting
+//===----------------------------------------------------------------------===//
+
+/// trySplit - Try to split VirtReg or one of its interferences, making it
+/// assignable.
+/// @return Physreg when VirtReg may be assigned and/or new NewVRegs.
+unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
+                            SmallVectorImpl<LiveInterval*>&NewVRegs) {
+  // Ranges must be Split2 or less.
+  if (getStage(VirtReg) >= RS_Spill)
+    return 0;
+
+  // Local intervals are handled separately.
+  if (LIS->intervalIsInOneMBB(VirtReg)) {
+    NamedRegionTimer T("Local Splitting", TimerGroupName, TimePassesIsEnabled);
+    SA->analyze(&VirtReg);
+    unsigned PhysReg = tryLocalSplit(VirtReg, Order, NewVRegs);
+    if (PhysReg || !NewVRegs.empty())
+      return PhysReg;
+    return tryInstructionSplit(VirtReg, Order, NewVRegs);
+  }
+
+  NamedRegionTimer T("Global Splitting", TimerGroupName, TimePassesIsEnabled);
+
+  SA->analyze(&VirtReg);
+
+  // FIXME: SplitAnalysis may repair broken live ranges coming from the
+  // coalescer. That may cause the range to become allocatable which means that
+  // tryRegionSplit won't be making progress. This check should be replaced with
+  // an assertion when the coalescer is fixed.
+  if (SA->didRepairRange()) {
+    // VirtReg has changed, so all cached queries are invalid.
+    Matrix->invalidateVirtRegs();
+    if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
+      return PhysReg;
+  }
+
+  // First try to split around a region spanning multiple blocks. RS_Split2
+  // ranges already made dubious progress with region splitting, so they go
+  // straight to single block splitting.
+  if (getStage(VirtReg) < RS_Split2) {
+    unsigned PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs);
+    if (PhysReg || !NewVRegs.empty())
+      return PhysReg;
+  }
+
+  // Then isolate blocks.
+  return tryBlockSplit(VirtReg, Order, NewVRegs);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                            Main Entry Point
+//===----------------------------------------------------------------------===//
+
+unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
+                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+  // First try assigning a free register.
+  AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
+  if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
+    return PhysReg;
+
+  LiveRangeStage Stage = getStage(VirtReg);
+  DEBUG(dbgs() << StageName[Stage]
+               << " Cascade " << ExtraRegInfo[VirtReg.reg].Cascade << '\n');
+
+  // Try to evict a less worthy live range, but only for ranges from the primary
+  // queue. The RS_Split ranges already failed to do this, and they should not
+  // get a second chance until they have been split.
+  if (Stage != RS_Split)
+    if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs))
+      return PhysReg;
+
+  assert(NewVRegs.empty() && "Cannot append to existing NewVRegs");
+
+  // The first time we see a live range, don't try to split or spill.
+  // Wait until the second time, when all smaller ranges have been allocated.
+  // This gives a better picture of the interference to split around.
+  if (Stage < RS_Split) {
+    setStage(VirtReg, RS_Split);
+    DEBUG(dbgs() << "wait for second round\n");
+    NewVRegs.push_back(&VirtReg);
+    return 0;
+  }
+
+  // If we couldn't allocate a register from spilling, there is probably some
+  // invalid inline assembly. The base class wil report it.
+  if (Stage >= RS_Done || !VirtReg.isSpillable())
+    return ~0u;
+
+  // Try splitting VirtReg or interferences.
+  unsigned PhysReg = trySplit(VirtReg, Order, NewVRegs);
+  if (PhysReg || !NewVRegs.empty())
+    return PhysReg;
+
+  // Finally spill VirtReg itself.
+  NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled);
+  LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this);
+  spiller().spill(LRE);
+  setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
+
+  if (VerifyEnabled)
+    MF->verify(this, "After spilling");
+
+  // The live virtual register requesting allocation was spilled, so tell
+  // the caller not to allocate anything during this round.
+  return 0;
+}
+
+bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
+  DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
+               << "********** Function: " << mf.getName() << '\n');
+
+  MF = &mf;
+  if (VerifyEnabled)
+    MF->verify(this, "Before greedy register allocator");
+
+  RegAllocBase::init(getAnalysis<VirtRegMap>(),
+                     getAnalysis<LiveIntervals>(),
+                     getAnalysis<LiveRegMatrix>());
+  Indexes = &getAnalysis<SlotIndexes>();
+  DomTree = &getAnalysis<MachineDominatorTree>();
+  SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
+  Loops = &getAnalysis<MachineLoopInfo>();
+  Bundles = &getAnalysis<EdgeBundles>();
+  SpillPlacer = &getAnalysis<SpillPlacement>();
+  DebugVars = &getAnalysis<LiveDebugVariables>();
+
+  SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
+  SE.reset(new SplitEditor(*SA, *LIS, *VRM, *DomTree));
+  ExtraRegInfo.clear();
+  ExtraRegInfo.resize(MRI->getNumVirtRegs());
+  NextCascade = 1;
+  IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
+  GlobalCand.resize(32);  // This will grow as needed.
+
+  allocatePhysRegs();
+  releaseMemory();
+  return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
new file mode 100644
index 000000000000..607edac24bd2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -0,0 +1,638 @@
+//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a Partitioned Boolean Quadratic Programming (PBQP) based
+// register allocator for LLVM. This allocator works by constructing a PBQP
+// problem representing the register allocation problem under consideration,
+// solving this using a PBQP solver, and mapping the solution back to a
+// register assignment. If any variables are selected for spilling then spill
+// code is inserted and the process repeated.
+//
+// The PBQP solver (pbqp.c) provided for this allocator uses a heuristic tuned
+// for register allocation. For more information on PBQP for register
+// allocation, see the following papers:
+//
+//   (1) Hames, L. and Scholz, B. 2006. Nearly optimal register allocation with
+//   PBQP. In Proceedings of the 7th Joint Modular Languages Conference
+//   (JMLC'06). LNCS, vol. 4228. Springer, New York, NY, USA. 346-361.
+//
+//   (2) Scholz, B., Eckstein, E. 2002. Register allocation for irregular
+//   architectures. In Proceedings of the Joint Conference on Languages,
+//   Compilers and Tools for Embedded Systems (LCTES'02), ACM Press, New York,
+//   NY, USA, 139-148.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+
+#include "llvm/CodeGen/RegAllocPBQP.h"
+#include "RegisterCoalescer.h"
+#include "Spiller.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PBQP/Graph.h"
+#include "llvm/CodeGen/PBQP/HeuristicSolver.h"
+#include "llvm/CodeGen/PBQP/Heuristics/Briggs.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <limits>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <vector>
+
+using namespace llvm;
+
+static RegisterRegAlloc
+registerPBQPRepAlloc("pbqp", "PBQP register allocator",
+                       createDefaultPBQPRegisterAllocator);
+
+static cl::opt<bool>
+pbqpCoalescing("pbqp-coalescing",
+                cl::desc("Attempt coalescing during PBQP register allocation."),
+                cl::init(false), cl::Hidden);
+
+#ifndef NDEBUG
+static cl::opt<bool>
+pbqpDumpGraphs("pbqp-dump-graphs",
+               cl::desc("Dump graphs for each function/round in the compilation unit."),
+               cl::init(false), cl::Hidden);
+#endif
+
+namespace {
+
+///
+/// PBQP based allocators solve the register allocation problem by mapping
+/// register allocation problems to Partitioned Boolean Quadratic
+/// Programming problems.
+class RegAllocPBQP : public MachineFunctionPass {
+public:
+
+  static char ID;
+
+  /// Construct a PBQP register allocator.
+  RegAllocPBQP(std::auto_ptr<PBQPBuilder> b, char *cPassID=0)
+      : MachineFunctionPass(ID), builder(b), customPassID(cPassID) {
+    initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+    initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+    initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
+    initializeLiveStacksPass(*PassRegistry::getPassRegistry());
+    initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
+    initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// Return the pass name.
+  virtual const char* getPassName() const {
+    return "PBQP Register Allocator";
+  }
+
+  /// PBQP analysis usage.
+  virtual void getAnalysisUsage(AnalysisUsage &au) const;
+
+  /// Perform register allocation
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+private:
+
+  typedef std::map<const LiveInterval*, unsigned> LI2NodeMap;
+  typedef std::vector<const LiveInterval*> Node2LIMap;
+  typedef std::vector<unsigned> AllowedSet;
+  typedef std::vector<AllowedSet> AllowedSetMap;
+  typedef std::pair<unsigned, unsigned> RegPair;
+  typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap;
+  typedef std::set<unsigned> RegSet;
+
+
+  std::auto_ptr<PBQPBuilder> builder;
+
+  char *customPassID;
+
+  MachineFunction *mf;
+  const TargetMachine *tm;
+  const TargetRegisterInfo *tri;
+  const TargetInstrInfo *tii;
+  const MachineLoopInfo *loopInfo;
+  MachineRegisterInfo *mri;
+
+  std::auto_ptr<Spiller> spiller;
+  LiveIntervals *lis;
+  LiveStacks *lss;
+  VirtRegMap *vrm;
+
+  RegSet vregsToAlloc, emptyIntervalVRegs;
+
+  /// \brief Finds the initial set of vreg intervals to allocate.
+  void findVRegIntervalsToAlloc();
+
+  /// \brief Given a solved PBQP problem maps this solution back to a register
+  /// assignment.
+  bool mapPBQPToRegAlloc(const PBQPRAProblem &problem,
+                         const PBQP::Solution &solution);
+
+  /// \brief Postprocessing before final spilling. Sets basic block "live in"
+  /// variables.
+  void finalizeAlloc() const;
+
+};
+
+char RegAllocPBQP::ID = 0;
+
+} // End anonymous namespace.
+
+unsigned PBQPRAProblem::getVRegForNode(PBQP::Graph::ConstNodeItr node) const {
+  Node2VReg::const_iterator vregItr = node2VReg.find(node);
+  assert(vregItr != node2VReg.end() && "No vreg for node.");
+  return vregItr->second;
+}
+
+PBQP::Graph::NodeItr PBQPRAProblem::getNodeForVReg(unsigned vreg) const {
+  VReg2Node::const_iterator nodeItr = vreg2Node.find(vreg);
+  assert(nodeItr != vreg2Node.end() && "No node for vreg.");
+  return nodeItr->second;
+
+}
+
+const PBQPRAProblem::AllowedSet&
+  PBQPRAProblem::getAllowedSet(unsigned vreg) const {
+  AllowedSetMap::const_iterator allowedSetItr = allowedSets.find(vreg);
+  assert(allowedSetItr != allowedSets.end() && "No pregs for vreg.");
+  const AllowedSet &allowedSet = allowedSetItr->second;
+  return allowedSet;
+}
+
+unsigned PBQPRAProblem::getPRegForOption(unsigned vreg, unsigned option) const {
+  assert(isPRegOption(vreg, option) && "Not a preg option.");
+
+  const AllowedSet& allowedSet = getAllowedSet(vreg);
+  assert(option <= allowedSet.size() && "Option outside allowed set.");
+  return allowedSet[option - 1];
+}
+
+std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
+                                                const LiveIntervals *lis,
+                                                const MachineLoopInfo *loopInfo,
+                                                const RegSet &vregs) {
+
+  LiveIntervals *LIS = const_cast<LiveIntervals*>(lis);
+  MachineRegisterInfo *mri = &mf->getRegInfo();
+  const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo();
+
+  std::auto_ptr<PBQPRAProblem> p(new PBQPRAProblem());
+  PBQP::Graph &g = p->getGraph();
+  RegSet pregs;
+
+  // Collect the set of preg intervals, record that they're used in the MF.
+  for (unsigned Reg = 1, e = tri->getNumRegs(); Reg != e; ++Reg) {
+    if (mri->def_empty(Reg))
+      continue;
+    pregs.insert(Reg);
+    mri->setPhysRegUsed(Reg);
+  }
+
+  // Iterate over vregs.
+  for (RegSet::const_iterator vregItr = vregs.begin(), vregEnd = vregs.end();
+       vregItr != vregEnd; ++vregItr) {
+    unsigned vreg = *vregItr;
+    const TargetRegisterClass *trc = mri->getRegClass(vreg);
+    LiveInterval *vregLI = &LIS->getInterval(vreg);
+
+    // Record any overlaps with regmask operands.
+    BitVector regMaskOverlaps;
+    LIS->checkRegMaskInterference(*vregLI, regMaskOverlaps);
+
+    // Compute an initial allowed set for the current vreg.
+    typedef std::vector<unsigned> VRAllowed;
+    VRAllowed vrAllowed;
+    ArrayRef<uint16_t> rawOrder = trc->getRawAllocationOrder(*mf);
+    for (unsigned i = 0; i != rawOrder.size(); ++i) {
+      unsigned preg = rawOrder[i];
+      if (mri->isReserved(preg))
+        continue;
+
+      // vregLI crosses a regmask operand that clobbers preg.
+      if (!regMaskOverlaps.empty() && !regMaskOverlaps.test(preg))
+        continue;
+
+      // vregLI overlaps fixed regunit interference.
+      bool Interference = false;
+      for (MCRegUnitIterator Units(preg, tri); Units.isValid(); ++Units) {
+        if (vregLI->overlaps(LIS->getRegUnit(*Units))) {
+          Interference = true;
+          break;
+        }
+      }
+      if (Interference)
+        continue;
+
+      // preg is usable for this virtual register.
+      vrAllowed.push_back(preg);
+    }
+
+    // Construct the node.
+    PBQP::Graph::NodeItr node =
+      g.addNode(PBQP::Vector(vrAllowed.size() + 1, 0));
+
+    // Record the mapping and allowed set in the problem.
+    p->recordVReg(vreg, node, vrAllowed.begin(), vrAllowed.end());
+
+    PBQP::PBQPNum spillCost = (vregLI->weight != 0.0) ?
+        vregLI->weight : std::numeric_limits<PBQP::PBQPNum>::min();
+
+    addSpillCosts(g.getNodeCosts(node), spillCost);
+  }
+
+  for (RegSet::const_iterator vr1Itr = vregs.begin(), vrEnd = vregs.end();
+         vr1Itr != vrEnd; ++vr1Itr) {
+    unsigned vr1 = *vr1Itr;
+    const LiveInterval &l1 = lis->getInterval(vr1);
+    const PBQPRAProblem::AllowedSet &vr1Allowed = p->getAllowedSet(vr1);
+
+    for (RegSet::const_iterator vr2Itr = llvm::next(vr1Itr);
+         vr2Itr != vrEnd; ++vr2Itr) {
+      unsigned vr2 = *vr2Itr;
+      const LiveInterval &l2 = lis->getInterval(vr2);
+      const PBQPRAProblem::AllowedSet &vr2Allowed = p->getAllowedSet(vr2);
+
+      assert(!l2.empty() && "Empty interval in vreg set?");
+      if (l1.overlaps(l2)) {
+        PBQP::Graph::EdgeItr edge =
+          g.addEdge(p->getNodeForVReg(vr1), p->getNodeForVReg(vr2),
+                    PBQP::Matrix(vr1Allowed.size()+1, vr2Allowed.size()+1, 0));
+
+        addInterferenceCosts(g.getEdgeCosts(edge), vr1Allowed, vr2Allowed, tri);
+      }
+    }
+  }
+
+  return p;
+}
+
+void PBQPBuilder::addSpillCosts(PBQP::Vector &costVec,
+                                PBQP::PBQPNum spillCost) {
+  costVec[0] = spillCost;
+}
+
+void PBQPBuilder::addInterferenceCosts(
+                                    PBQP::Matrix &costMat,
+                                    const PBQPRAProblem::AllowedSet &vr1Allowed,
+                                    const PBQPRAProblem::AllowedSet &vr2Allowed,
+                                    const TargetRegisterInfo *tri) {
+  assert(costMat.getRows() == vr1Allowed.size() + 1 && "Matrix height mismatch.");
+  assert(costMat.getCols() == vr2Allowed.size() + 1 && "Matrix width mismatch.");
+
+  for (unsigned i = 0; i != vr1Allowed.size(); ++i) {
+    unsigned preg1 = vr1Allowed[i];
+
+    for (unsigned j = 0; j != vr2Allowed.size(); ++j) {
+      unsigned preg2 = vr2Allowed[j];
+
+      if (tri->regsOverlap(preg1, preg2)) {
+        costMat[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+      }
+    }
+  }
+}
+
+std::auto_ptr<PBQPRAProblem> PBQPBuilderWithCoalescing::build(
+                                                MachineFunction *mf,
+                                                const LiveIntervals *lis,
+                                                const MachineLoopInfo *loopInfo,
+                                                const RegSet &vregs) {
+
+  std::auto_ptr<PBQPRAProblem> p = PBQPBuilder::build(mf, lis, loopInfo, vregs);
+  PBQP::Graph &g = p->getGraph();
+
+  const TargetMachine &tm = mf->getTarget();
+  CoalescerPair cp(*tm.getRegisterInfo());
+
+  // Scan the machine function and add a coalescing cost whenever CoalescerPair
+  // gives the Ok.
+  for (MachineFunction::const_iterator mbbItr = mf->begin(),
+                                       mbbEnd = mf->end();
+       mbbItr != mbbEnd; ++mbbItr) {
+    const MachineBasicBlock *mbb = &*mbbItr;
+
+    for (MachineBasicBlock::const_iterator miItr = mbb->begin(),
+                                           miEnd = mbb->end();
+         miItr != miEnd; ++miItr) {
+      const MachineInstr *mi = &*miItr;
+
+      if (!cp.setRegisters(mi)) {
+        continue; // Not coalescable.
+      }
+
+      if (cp.getSrcReg() == cp.getDstReg()) {
+        continue; // Already coalesced.
+      }
+
+      unsigned dst = cp.getDstReg(),
+               src = cp.getSrcReg();
+
+      const float copyFactor = 0.5; // Cost of copy relative to load. Current
+      // value plucked randomly out of the air.
+
+      PBQP::PBQPNum cBenefit =
+        copyFactor * LiveIntervals::getSpillWeight(false, true,
+                                                   loopInfo->getLoopDepth(mbb));
+
+      if (cp.isPhys()) {
+        if (!mf->getRegInfo().isAllocatable(dst)) {
+          continue;
+        }
+
+        const PBQPRAProblem::AllowedSet &allowed = p->getAllowedSet(src);
+        unsigned pregOpt = 0;
+        while (pregOpt < allowed.size() && allowed[pregOpt] != dst) {
+          ++pregOpt;
+        }
+        if (pregOpt < allowed.size()) {
+          ++pregOpt; // +1 to account for spill option.
+          PBQP::Graph::NodeItr node = p->getNodeForVReg(src);
+          addPhysRegCoalesce(g.getNodeCosts(node), pregOpt, cBenefit);
+        }
+      } else {
+        const PBQPRAProblem::AllowedSet *allowed1 = &p->getAllowedSet(dst);
+        const PBQPRAProblem::AllowedSet *allowed2 = &p->getAllowedSet(src);
+        PBQP::Graph::NodeItr node1 = p->getNodeForVReg(dst);
+        PBQP::Graph::NodeItr node2 = p->getNodeForVReg(src);
+        PBQP::Graph::EdgeItr edge = g.findEdge(node1, node2);
+        if (edge == g.edgesEnd()) {
+          edge = g.addEdge(node1, node2, PBQP::Matrix(allowed1->size() + 1,
+                                                      allowed2->size() + 1,
+                                                      0));
+        } else {
+          if (g.getEdgeNode1(edge) == node2) {
+            std::swap(node1, node2);
+            std::swap(allowed1, allowed2);
+          }
+        }
+
+        addVirtRegCoalesce(g.getEdgeCosts(edge), *allowed1, *allowed2,
+                           cBenefit);
+      }
+    }
+  }
+
+  return p;
+}
+
+void PBQPBuilderWithCoalescing::addPhysRegCoalesce(PBQP::Vector &costVec,
+                                                   unsigned pregOption,
+                                                   PBQP::PBQPNum benefit) {
+  costVec[pregOption] += -benefit;
+}
+
+void PBQPBuilderWithCoalescing::addVirtRegCoalesce(
+                                    PBQP::Matrix &costMat,
+                                    const PBQPRAProblem::AllowedSet &vr1Allowed,
+                                    const PBQPRAProblem::AllowedSet &vr2Allowed,
+                                    PBQP::PBQPNum benefit) {
+
+  assert(costMat.getRows() == vr1Allowed.size() + 1 && "Size mismatch.");
+  assert(costMat.getCols() == vr2Allowed.size() + 1 && "Size mismatch.");
+
+  for (unsigned i = 0; i != vr1Allowed.size(); ++i) {
+    unsigned preg1 = vr1Allowed[i];
+    for (unsigned j = 0; j != vr2Allowed.size(); ++j) {
+      unsigned preg2 = vr2Allowed[j];
+
+      if (preg1 == preg2) {
+        costMat[i + 1][j + 1] += -benefit;
+      }
+    }
+  }
+}
+
+
+void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
+  au.setPreservesCFG();
+  au.addRequired<AliasAnalysis>();
+  au.addPreserved<AliasAnalysis>();
+  au.addRequired<SlotIndexes>();
+  au.addPreserved<SlotIndexes>();
+  au.addRequired<LiveIntervals>();
+  au.addPreserved<LiveIntervals>();
+  //au.addRequiredID(SplitCriticalEdgesID);
+  if (customPassID)
+    au.addRequiredID(*customPassID);
+  au.addRequired<CalculateSpillWeights>();
+  au.addRequired<LiveStacks>();
+  au.addPreserved<LiveStacks>();
+  au.addRequired<MachineDominatorTree>();
+  au.addPreserved<MachineDominatorTree>();
+  au.addRequired<MachineLoopInfo>();
+  au.addPreserved<MachineLoopInfo>();
+  au.addRequired<VirtRegMap>();
+  au.addPreserved<VirtRegMap>();
+  MachineFunctionPass::getAnalysisUsage(au);
+}
+
+void RegAllocPBQP::findVRegIntervalsToAlloc() {
+
+  // Iterate over all live ranges.
+  for (unsigned i = 0, e = mri->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (mri->reg_nodbg_empty(Reg))
+      continue;
+    LiveInterval *li = &lis->getInterval(Reg);
+
+    // If this live interval is non-empty we will use pbqp to allocate it.
+    // Empty intervals we allocate in a simple post-processing stage in
+    // finalizeAlloc.
+    if (!li->empty()) {
+      vregsToAlloc.insert(li->reg);
+    } else {
+      emptyIntervalVRegs.insert(li->reg);
+    }
+  }
+}
+
+bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
+                                     const PBQP::Solution &solution) {
+  // Set to true if we have any spills
+  bool anotherRoundNeeded = false;
+
+  // Clear the existing allocation.
+  vrm->clearAllVirt();
+
+  const PBQP::Graph &g = problem.getGraph();
+  // Iterate over the nodes mapping the PBQP solution to a register
+  // assignment.
+  for (PBQP::Graph::ConstNodeItr node = g.nodesBegin(),
+                                 nodeEnd = g.nodesEnd();
+       node != nodeEnd; ++node) {
+    unsigned vreg = problem.getVRegForNode(node);
+    unsigned alloc = solution.getSelection(node);
+
+    if (problem.isPRegOption(vreg, alloc)) {
+      unsigned preg = problem.getPRegForOption(vreg, alloc);
+      DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> "
+            << tri->getName(preg) << "\n");
+      assert(preg != 0 && "Invalid preg selected.");
+      vrm->assignVirt2Phys(vreg, preg);
+    } else if (problem.isSpillOption(vreg, alloc)) {
+      vregsToAlloc.erase(vreg);
+      SmallVector<LiveInterval*, 8> newSpills;
+      LiveRangeEdit LRE(&lis->getInterval(vreg), newSpills, *mf, *lis, vrm);
+      spiller->spill(LRE);
+
+      DEBUG(dbgs() << "VREG " << PrintReg(vreg, tri) << " -> SPILLED (Cost: "
+                   << LRE.getParent().weight << ", New vregs: ");
+
+      // Copy any newly inserted live intervals into the list of regs to
+      // allocate.
+      for (LiveRangeEdit::iterator itr = LRE.begin(), end = LRE.end();
+           itr != end; ++itr) {
+        assert(!(*itr)->empty() && "Empty spill range.");
+        DEBUG(dbgs() << PrintReg((*itr)->reg, tri) << " ");
+        vregsToAlloc.insert((*itr)->reg);
+      }
+
+      DEBUG(dbgs() << ")\n");
+
+      // We need another round if spill intervals were added.
+      anotherRoundNeeded |= !LRE.empty();
+    } else {
+      llvm_unreachable("Unknown allocation option.");
+    }
+  }
+
+  return !anotherRoundNeeded;
+}
+
+
+void RegAllocPBQP::finalizeAlloc() const {
+  // First allocate registers for the empty intervals.
+  for (RegSet::const_iterator
+         itr = emptyIntervalVRegs.begin(), end = emptyIntervalVRegs.end();
+         itr != end; ++itr) {
+    LiveInterval *li = &lis->getInterval(*itr);
+
+    unsigned physReg = mri->getSimpleHint(li->reg);
+
+    if (physReg == 0) {
+      const TargetRegisterClass *liRC = mri->getRegClass(li->reg);
+      physReg = liRC->getRawAllocationOrder(*mf).front();
+    }
+
+    vrm->assignVirt2Phys(li->reg, physReg);
+  }
+}
+
+bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
+
+  mf = &MF;
+  tm = &mf->getTarget();
+  tri = tm->getRegisterInfo();
+  tii = tm->getInstrInfo();
+  mri = &mf->getRegInfo();
+
+  lis = &getAnalysis<LiveIntervals>();
+  lss = &getAnalysis<LiveStacks>();
+  loopInfo = &getAnalysis<MachineLoopInfo>();
+
+  vrm = &getAnalysis<VirtRegMap>();
+  spiller.reset(createInlineSpiller(*this, MF, *vrm));
+
+  mri->freezeReservedRegs(MF);
+
+  DEBUG(dbgs() << "PBQP Register Allocating for " << mf->getName() << "\n");
+
+  // Allocator main loop:
+  //
+  // * Map current regalloc problem to a PBQP problem
+  // * Solve the PBQP problem
+  // * Map the solution back to a register allocation
+  // * Spill if necessary
+  //
+  // This process is continued till no more spills are generated.
+
+  // Find the vreg intervals in need of allocation.
+  findVRegIntervalsToAlloc();
+
+#ifndef NDEBUG
+  const Function* func = mf->getFunction();
+  std::string fqn =
+    func->getParent()->getModuleIdentifier() + "." +
+    func->getName().str();
+#endif
+
+  // If there are non-empty intervals allocate them using pbqp.
+  if (!vregsToAlloc.empty()) {
+
+    bool pbqpAllocComplete = false;
+    unsigned round = 0;
+
+    while (!pbqpAllocComplete) {
+      DEBUG(dbgs() << "  PBQP Regalloc round " << round << ":\n");
+
+      std::auto_ptr<PBQPRAProblem> problem =
+        builder->build(mf, lis, loopInfo, vregsToAlloc);
+
+#ifndef NDEBUG
+      if (pbqpDumpGraphs) {
+        std::ostringstream rs;
+        rs << round;
+        std::string graphFileName(fqn + "." + rs.str() + ".pbqpgraph");
+        std::string tmp;
+        raw_fd_ostream os(graphFileName.c_str(), tmp);
+        DEBUG(dbgs() << "Dumping graph for round " << round << " to \""
+              << graphFileName << "\"\n");
+        problem->getGraph().dump(os);
+      }
+#endif
+
+      PBQP::Solution solution =
+        PBQP::HeuristicSolver<PBQP::Heuristics::Briggs>::solve(
+          problem->getGraph());
+
+      pbqpAllocComplete = mapPBQPToRegAlloc(*problem, solution);
+
+      ++round;
+    }
+  }
+
+  // Finalise allocation, allocate empty ranges.
+  finalizeAlloc();
+  vregsToAlloc.clear();
+  emptyIntervalVRegs.clear();
+
+  DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *vrm << "\n");
+
+  return true;
+}
+
+FunctionPass* llvm::createPBQPRegisterAllocator(
+                                           std::auto_ptr<PBQPBuilder> builder,
+                                           char *customPassID) {
+  return new RegAllocPBQP(builder, customPassID);
+}
+
+FunctionPass* llvm::createDefaultPBQPRegisterAllocator() {
+  if (pbqpCoalescing) {
+    return createPBQPRegisterAllocator(
+             std::auto_ptr<PBQPBuilder>(new PBQPBuilderWithCoalescing()));
+  } // else
+  return createPBQPRegisterAllocator(
+           std::auto_ptr<PBQPBuilder>(new PBQPBuilder()));
+}
+
+#undef DEBUG_TYPE
diff --git a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
new file mode 100644
index 000000000000..87382d8f7c42
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -0,0 +1,146 @@
+//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterClassInfo class which provides dynamic
+// information about target register classes. Callee saved and reserved
+// registers depends on calling conventions and other dynamic information, so
+// some things cannot be determined statically.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),
+         cl::desc("Limit all regclasses to N registers"));
+
+RegisterClassInfo::RegisterClassInfo() : Tag(0), MF(0), TRI(0), CalleeSaved(0)
+{}
+
+void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
+  bool Update = false;
+  MF = &mf;
+
+  // Allocate new array the first time we see a new target.
+  if (MF->getTarget().getRegisterInfo() != TRI) {
+    TRI = MF->getTarget().getRegisterInfo();
+    RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
+    Update = true;
+  }
+
+  // Does this MF have different CSRs?
+  const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
+  if (Update || CSR != CalleeSaved) {
+    // Build a CSRNum map. Every CSR alias gets an entry pointing to the last
+    // overlapping CSR.
+    CSRNum.clear();
+    CSRNum.resize(TRI->getNumRegs(), 0);
+    for (unsigned N = 0; unsigned Reg = CSR[N]; ++N)
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ...
+    Update = true;
+  }
+  CalleeSaved = CSR;
+
+  // Different reserved registers?
+  const BitVector &RR = MF->getRegInfo().getReservedRegs();
+  if (Reserved.size() != RR.size() || RR != Reserved) {
+    Update = true;
+    Reserved = RR;
+  }
+
+  // Invalidate cached information from previous function.
+  if (Update)
+    ++Tag;
+}
+
+/// compute - Compute the preferred allocation order for RC with reserved
+/// registers filtered out. Volatile registers come first followed by CSR
+/// aliases ordered according to the CSR order specified by the target.
+void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
+  RCInfo &RCI = RegClass[RC->getID()];
+
+  // Raw register count, including all reserved regs.
+  unsigned NumRegs = RC->getNumRegs();
+
+  if (!RCI.Order)
+    RCI.Order.reset(new MCPhysReg[NumRegs]);
+
+  unsigned N = 0;
+  SmallVector<MCPhysReg, 16> CSRAlias;
+  unsigned MinCost = 0xff;
+  unsigned LastCost = ~0u;
+  unsigned LastCostChange = 0;
+
+  // FIXME: Once targets reserve registers instead of removing them from the
+  // allocation order, we can simply use begin/end here.
+  ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF);
+  for (unsigned i = 0; i != RawOrder.size(); ++i) {
+    unsigned PhysReg = RawOrder[i];
+    // Remove reserved registers from the allocation order.
+    if (Reserved.test(PhysReg))
+      continue;
+    unsigned Cost = TRI->getCostPerUse(PhysReg);
+    MinCost = std::min(MinCost, Cost);
+
+    if (CSRNum[PhysReg])
+      // PhysReg aliases a CSR, save it for later.
+      CSRAlias.push_back(PhysReg);
+    else {
+      if (Cost != LastCost)
+        LastCostChange = N;
+      RCI.Order[N++] = PhysReg;
+      LastCost = Cost;
+    }
+  }
+  RCI.NumRegs = N + CSRAlias.size();
+  assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
+
+  // CSR aliases go after the volatile registers, preserve the target's order.
+  for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) {
+    unsigned PhysReg = CSRAlias[i];
+    unsigned Cost = TRI->getCostPerUse(PhysReg);
+    if (Cost != LastCost)
+      LastCostChange = N;
+    RCI.Order[N++] = PhysReg;
+    LastCost = Cost;
+  }
+
+  // Register allocator stress test.  Clip register class to N registers.
+  if (StressRA && RCI.NumRegs > StressRA)
+    RCI.NumRegs = StressRA;
+
+  // Check if RC is a proper sub-class.
+  if (const TargetRegisterClass *Super = TRI->getLargestLegalSuperClass(RC))
+    if (Super != RC && getNumAllocatableRegs(Super) > RCI.NumRegs)
+      RCI.ProperSubClass = true;
+
+  RCI.MinCost = uint8_t(MinCost);
+  RCI.LastCostChange = LastCostChange;
+
+  DEBUG({
+    dbgs() << "AllocationOrder(" << RC->getName() << ") = [";
+    for (unsigned I = 0; I != RCI.NumRegs; ++I)
+      dbgs() << ' ' << PrintReg(RCI.Order[I], TRI);
+    dbgs() << (RCI.ProperSubClass ? " ] (sub-class)\n" : " ]\n");
+  });
+
+  // RCI is now up-to-date.
+  RCI.Tag = Tag;
+}
+
diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
new file mode 100644
index 000000000000..d85646dd3c58
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -0,0 +1,2193 @@
+//===- RegisterCoalescer.cpp - Generic Register Coalescing Interface -------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the generic RegisterCoalescer interface which
+// is used as the common interface used by all clients and
+// implementations of register coalescing.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+STATISTIC(numJoins    , "Number of interval joins performed");
+STATISTIC(numCrossRCs , "Number of cross class joins performed");
+STATISTIC(numCommutes , "Number of instruction commuting performed");
+STATISTIC(numExtends  , "Number of copies extended");
+STATISTIC(NumReMats   , "Number of instructions re-materialized");
+STATISTIC(NumInflated , "Number of register classes inflated");
+STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested");
+STATISTIC(NumLaneResolves,  "Number of dead lane conflicts resolved");
+
+static cl::opt<bool>
+EnableJoining("join-liveintervals",
+              cl::desc("Coalesce copies (default=true)"),
+              cl::init(true));
+
+// Temporary flag to test critical edge unsplitting.
+static cl::opt<bool>
+EnableJoinSplits("join-splitedges",
+  cl::desc("Coalesce copies on split edges (default=subtarget)"), cl::Hidden);
+
+// Temporary flag to test global copy optimization.
+static cl::opt<cl::boolOrDefault>
+EnableGlobalCopies("join-globalcopies",
+  cl::desc("Coalesce copies that span blocks (default=subtarget)"),
+  cl::init(cl::BOU_UNSET), cl::Hidden);
+
+static cl::opt<bool>
+VerifyCoalescing("verify-coalescing",
+         cl::desc("Verify machine instrs before and after register coalescing"),
+         cl::Hidden);
+
+namespace {
+  class RegisterCoalescer : public MachineFunctionPass,
+                            private LiveRangeEdit::Delegate {
+    MachineFunction* MF;
+    MachineRegisterInfo* MRI;
+    const TargetMachine* TM;
+    const TargetRegisterInfo* TRI;
+    const TargetInstrInfo* TII;
+    LiveIntervals *LIS;
+    const MachineLoopInfo* Loops;
+    AliasAnalysis *AA;
+    RegisterClassInfo RegClassInfo;
+
+    /// \brief True if the coalescer should aggressively coalesce global copies
+    /// in favor of keeping local copies.
+    bool JoinGlobalCopies;
+
+    /// \brief True if the coalescer should aggressively coalesce fall-thru
+    /// blocks exclusively containing copies.
+    bool JoinSplitEdges;
+
+    /// WorkList - Copy instructions yet to be coalesced.
+    SmallVector<MachineInstr*, 8> WorkList;
+    SmallVector<MachineInstr*, 8> LocalWorkList;
+
+    /// ErasedInstrs - Set of instruction pointers that have been erased, and
+    /// that may be present in WorkList.
+    SmallPtrSet<MachineInstr*, 8> ErasedInstrs;
+
+    /// Dead instructions that are about to be deleted.
+    SmallVector<MachineInstr*, 8> DeadDefs;
+
+    /// Virtual registers to be considered for register class inflation.
+    SmallVector<unsigned, 8> InflateRegs;
+
+    /// Recursively eliminate dead defs in DeadDefs.
+    void eliminateDeadDefs();
+
+    /// LiveRangeEdit callback.
+    void LRE_WillEraseInstruction(MachineInstr *MI);
+
+    /// coalesceLocals - coalesce the LocalWorkList.
+    void coalesceLocals();
+
+    /// joinAllIntervals - join compatible live intervals
+    void joinAllIntervals();
+
+    /// copyCoalesceInMBB - Coalesce copies in the specified MBB, putting
+    /// copies that cannot yet be coalesced into WorkList.
+    void copyCoalesceInMBB(MachineBasicBlock *MBB);
+
+    /// copyCoalesceWorkList - Try to coalesce all copies in CurrList. Return
+    /// true if any progress was made.
+    bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList);
+
+    /// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+    /// which are the src/dst of the copy instruction CopyMI.  This returns
+    /// true if the copy was successfully coalesced away. If it is not
+    /// currently possible to coalesce this interval, but it may be possible if
+    /// other things get coalesced, then it returns true by reference in
+    /// 'Again'.
+    bool joinCopy(MachineInstr *TheCopy, bool &Again);
+
+    /// joinIntervals - Attempt to join these two intervals.  On failure, this
+    /// returns false.  The output "SrcInt" will not have been modified, so we
+    /// can use this information below to update aliases.
+    bool joinIntervals(CoalescerPair &CP);
+
+    /// Attempt joining two virtual registers. Return true on success.
+    bool joinVirtRegs(CoalescerPair &CP);
+
+    /// Attempt joining with a reserved physreg.
+    bool joinReservedPhysReg(CoalescerPair &CP);
+
+    /// adjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
+    /// the source value number is defined by a copy from the destination reg
+    /// see if we can merge these two destination reg valno# into a single
+    /// value number, eliminating a copy.
+    bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
+
+    /// hasOtherReachingDefs - Return true if there are definitions of IntB
+    /// other than BValNo val# that can reach uses of AValno val# of IntA.
+    bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
+                              VNInfo *AValNo, VNInfo *BValNo);
+
+    /// removeCopyByCommutingDef - We found a non-trivially-coalescable copy.
+    /// If the source value number is defined by a commutable instruction and
+    /// its other operand is coalesced to the copy dest register, see if we
+    /// can transform the copy into a noop by commuting the definition.
+    bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
+
+    /// reMaterializeTrivialDef - If the source of a copy is defined by a
+    /// trivial computation, replace the copy by rematerialize the definition.
+    bool reMaterializeTrivialDef(CoalescerPair &CP, MachineInstr *CopyMI);
+
+    /// canJoinPhys - Return true if a physreg copy should be joined.
+    bool canJoinPhys(const CoalescerPair &CP);
+
+    /// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
+    /// update the subregister number if it is not zero. If DstReg is a
+    /// physical register and the existing subregister number of the def / use
+    /// being updated is not zero, make sure to set it to the correct physical
+    /// subregister.
+    void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
+
+    /// eliminateUndefCopy - Handle copies of undef values.
+    bool eliminateUndefCopy(MachineInstr *CopyMI, const CoalescerPair &CP);
+
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    RegisterCoalescer() : MachineFunctionPass(ID) {
+      initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+    virtual void releaseMemory();
+
+    /// runOnMachineFunction - pass entry point
+    virtual bool runOnMachineFunction(MachineFunction&);
+
+    /// print - Implement the dump method.
+    virtual void print(raw_ostream &O, const Module* = 0) const;
+  };
+} /// end anonymous namespace
+
+char &llvm::RegisterCoalescerID = RegisterCoalescer::ID;
+
+INITIALIZE_PASS_BEGIN(RegisterCoalescer, "simple-register-coalescing",
+                      "Simple Register Coalescing", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(RegisterCoalescer, "simple-register-coalescing",
+                    "Simple Register Coalescing", false, false)
+
+char RegisterCoalescer::ID = 0;
+
+static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI,
+                        unsigned &Src, unsigned &Dst,
+                        unsigned &SrcSub, unsigned &DstSub) {
+  if (MI->isCopy()) {
+    Dst = MI->getOperand(0).getReg();
+    DstSub = MI->getOperand(0).getSubReg();
+    Src = MI->getOperand(1).getReg();
+    SrcSub = MI->getOperand(1).getSubReg();
+  } else if (MI->isSubregToReg()) {
+    Dst = MI->getOperand(0).getReg();
+    DstSub = tri.composeSubRegIndices(MI->getOperand(0).getSubReg(),
+                                      MI->getOperand(3).getImm());
+    Src = MI->getOperand(2).getReg();
+    SrcSub = MI->getOperand(2).getSubReg();
+  } else
+    return false;
+  return true;
+}
+
+// Return true if this block should be vacated by the coalescer to eliminate
+// branches. The important cases to handle in the coalescer are critical edges
+// split during phi elimination which contain only copies. Simple blocks that
+// contain non-branches should also be vacated, but this can be handled by an
+// earlier pass similar to early if-conversion.
+static bool isSplitEdge(const MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+
+  for (MachineBasicBlock::const_iterator MII = MBB->begin(), E = MBB->end();
+       MII != E; ++MII) {
+    if (!MII->isCopyLike() && !MII->isUnconditionalBranch())
+      return false;
+  }
+  return true;
+}
+
+bool CoalescerPair::setRegisters(const MachineInstr *MI) {
+  SrcReg = DstReg = 0;
+  SrcIdx = DstIdx = 0;
+  NewRC = 0;
+  Flipped = CrossClass = false;
+
+  unsigned Src, Dst, SrcSub, DstSub;
+  if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
+    return false;
+  Partial = SrcSub || DstSub;
+
+  // If one register is a physreg, it must be Dst.
+  if (TargetRegisterInfo::isPhysicalRegister(Src)) {
+    if (TargetRegisterInfo::isPhysicalRegister(Dst))
+      return false;
+    std::swap(Src, Dst);
+    std::swap(SrcSub, DstSub);
+    Flipped = true;
+  }
+
+  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+  if (TargetRegisterInfo::isPhysicalRegister(Dst)) {
+    // Eliminate DstSub on a physreg.
+    if (DstSub) {
+      Dst = TRI.getSubReg(Dst, DstSub);
+      if (!Dst) return false;
+      DstSub = 0;
+    }
+
+    // Eliminate SrcSub by picking a corresponding Dst superregister.
+    if (SrcSub) {
+      Dst = TRI.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src));
+      if (!Dst) return false;
+      SrcSub = 0;
+    } else if (!MRI.getRegClass(Src)->contains(Dst)) {
+      return false;
+    }
+  } else {
+    // Both registers are virtual.
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
+    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+
+    // Both registers have subreg indices.
+    if (SrcSub && DstSub) {
+      // Copies between different sub-registers are never coalescable.
+      if (Src == Dst && SrcSub != DstSub)
+        return false;
+
+      NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub,
+                                         SrcIdx, DstIdx);
+      if (!NewRC)
+        return false;
+    } else if (DstSub) {
+      // SrcReg will be merged with a sub-register of DstReg.
+      SrcIdx = DstSub;
+      NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+    } else if (SrcSub) {
+      // DstReg will be merged with a sub-register of SrcReg.
+      DstIdx = SrcSub;
+      NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub);
+    } else {
+      // This is a straight copy without sub-registers.
+      NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
+    }
+
+    // The combined constraint may be impossible to satisfy.
+    if (!NewRC)
+      return false;
+
+    // Prefer SrcReg to be a sub-register of DstReg.
+    // FIXME: Coalescer should support subregs symmetrically.
+    if (DstIdx && !SrcIdx) {
+      std::swap(Src, Dst);
+      std::swap(SrcIdx, DstIdx);
+      Flipped = !Flipped;
+    }
+
+    CrossClass = NewRC != DstRC || NewRC != SrcRC;
+  }
+  // Check our invariants
+  assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual");
+  assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) &&
+         "Cannot have a physical SubIdx");
+  SrcReg = Src;
+  DstReg = Dst;
+  return true;
+}
+
+bool CoalescerPair::flip() {
+  if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+    return false;
+  std::swap(SrcReg, DstReg);
+  std::swap(SrcIdx, DstIdx);
+  Flipped = !Flipped;
+  return true;
+}
+
+bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
+  if (!MI)
+    return false;
+  unsigned Src, Dst, SrcSub, DstSub;
+  if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
+    return false;
+
+  // Find the virtual register that is SrcReg.
+  if (Dst == SrcReg) {
+    std::swap(Src, Dst);
+    std::swap(SrcSub, DstSub);
+  } else if (Src != SrcReg) {
+    return false;
+  }
+
+  // Now check that Dst matches DstReg.
+  if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+    if (!TargetRegisterInfo::isPhysicalRegister(Dst))
+      return false;
+    assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state.");
+    // DstSub could be set for a physreg from INSERT_SUBREG.
+    if (DstSub)
+      Dst = TRI.getSubReg(Dst, DstSub);
+    // Full copy of Src.
+    if (!SrcSub)
+      return DstReg == Dst;
+    // This is a partial register copy. Check that the parts match.
+    return TRI.getSubReg(DstReg, SrcSub) == Dst;
+  } else {
+    // DstReg is virtual.
+    if (DstReg != Dst)
+      return false;
+    // Registers match, do the subregisters line up?
+    return TRI.composeSubRegIndices(SrcIdx, SrcSub) ==
+           TRI.composeSubRegIndices(DstIdx, DstSub);
+  }
+}
+
+void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<AliasAnalysis>();
+  AU.addRequired<LiveIntervals>();
+  AU.addPreserved<LiveIntervals>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequired<MachineLoopInfo>();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addPreservedID(MachineDominatorsID);
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void RegisterCoalescer::eliminateDeadDefs() {
+  SmallVector<LiveInterval*, 8> NewRegs;
+  LiveRangeEdit(0, NewRegs, *MF, *LIS, 0, this).eliminateDeadDefs(DeadDefs);
+}
+
+// Callback from eliminateDeadDefs().
+void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
+  // MI may be in WorkList. Make sure we don't visit it.
+  ErasedInstrs.insert(MI);
+}
+
+/// adjustCopiesBackFrom - We found a non-trivially-coalescable copy with IntA
+/// being the source and IntB being the dest, thus this defines a value number
+/// in IntB.  If the source value number (in IntA) is defined by a copy from B,
+/// see if we can merge these two pieces of B into a single value number,
+/// eliminating a copy.  For example:
+///
+///  A3 = B0
+///    ...
+///  B1 = A3      <- this copy
+///
+/// In this case, B0 can be extended to where the B1 copy lives, allowing the B1
+/// value number to be replaced with B0 (which simplifies the B liveinterval).
+///
+/// This returns true if an interval was modified.
+///
+bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
+                                             MachineInstr *CopyMI) {
+  assert(!CP.isPartial() && "This doesn't work for partial copies.");
+  assert(!CP.isPhys() && "This doesn't work for physreg copies.");
+
+  LiveInterval &IntA =
+    LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+  LiveInterval &IntB =
+    LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
+
+  // BValNo is a value number in B that is defined by a copy from A.  'B3' in
+  // the example above.
+  LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx);
+  if (BLR == IntB.end()) return false;
+  VNInfo *BValNo = BLR->valno;
+
+  // Get the location that B is defined at.  Two options: either this value has
+  // an unknown definition point or it is defined at CopyIdx.  If unknown, we
+  // can't process it.
+  if (BValNo->def != CopyIdx) return false;
+
+  // AValNo is the value number in A that defines the copy, A3 in the example.
+  SlotIndex CopyUseIdx = CopyIdx.getRegSlot(true);
+  LiveInterval::iterator ALR = IntA.FindLiveRangeContaining(CopyUseIdx);
+  // The live range might not exist after fun with physreg coalescing.
+  if (ALR == IntA.end()) return false;
+  VNInfo *AValNo = ALR->valno;
+
+  // If AValNo is defined as a copy from IntB, we can potentially process this.
+  // Get the instruction that defines this value number.
+  MachineInstr *ACopyMI = LIS->getInstructionFromIndex(AValNo->def);
+  // Don't allow any partial copies, even if isCoalescable() allows them.
+  if (!CP.isCoalescable(ACopyMI) || !ACopyMI->isFullCopy())
+    return false;
+
+  // Get the LiveRange in IntB that this value number starts with.
+  LiveInterval::iterator ValLR =
+    IntB.FindLiveRangeContaining(AValNo->def.getPrevSlot());
+  if (ValLR == IntB.end())
+    return false;
+
+  // Make sure that the end of the live range is inside the same block as
+  // CopyMI.
+  MachineInstr *ValLREndInst =
+    LIS->getInstructionFromIndex(ValLR->end.getPrevSlot());
+  if (!ValLREndInst || ValLREndInst->getParent() != CopyMI->getParent())
+    return false;
+
+  // Okay, we now know that ValLR ends in the same block that the CopyMI
+  // live-range starts.  If there are no intervening live ranges between them in
+  // IntB, we can merge them.
+  if (ValLR+1 != BLR) return false;
+
+  DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI));
+
+  SlotIndex FillerStart = ValLR->end, FillerEnd = BLR->start;
+  // We are about to delete CopyMI, so need to remove it as the 'instruction
+  // that defines this value #'. Update the valnum with the new defining
+  // instruction #.
+  BValNo->def = FillerStart;
+
+  // Okay, we can merge them.  We need to insert a new liverange:
+  // [ValLR.end, BLR.begin) of either value number, then we merge the
+  // two value numbers.
+  IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo));
+
+  // Okay, merge "B1" into the same value number as "B0".
+  if (BValNo != ValLR->valno)
+    IntB.MergeValueNumberInto(BValNo, ValLR->valno);
+  DEBUG(dbgs() << "   result = " << IntB << '\n');
+
+  // If the source instruction was killing the source register before the
+  // merge, unset the isKill marker given the live range has been extended.
+  int UIdx = ValLREndInst->findRegisterUseOperandIdx(IntB.reg, true);
+  if (UIdx != -1) {
+    ValLREndInst->getOperand(UIdx).setIsKill(false);
+  }
+
+  // Rewrite the copy. If the copy instruction was killing the destination
+  // register before the merge, find the last use and trim the live range. That
+  // will also add the isKill marker.
+  CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI);
+  if (ALR->end == CopyIdx)
+    LIS->shrinkToUses(&IntA);
+
+  ++numExtends;
+  return true;
+}
+
+/// hasOtherReachingDefs - Return true if there are definitions of IntB
+/// other than BValNo val# that can reach uses of AValno val# of IntA.
+bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
+                                             LiveInterval &IntB,
+                                             VNInfo *AValNo,
+                                             VNInfo *BValNo) {
+  // If AValNo has PHI kills, conservatively assume that IntB defs can reach
+  // the PHI values.
+  if (LIS->hasPHIKill(IntA, AValNo))
+    return true;
+
+  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
+       AI != AE; ++AI) {
+    if (AI->valno != AValNo) continue;
+    LiveInterval::Ranges::iterator BI =
+      std::upper_bound(IntB.ranges.begin(), IntB.ranges.end(), AI->start);
+    if (BI != IntB.ranges.begin())
+      --BI;
+    for (; BI != IntB.ranges.end() && AI->end >= BI->start; ++BI) {
+      if (BI->valno == BValNo)
+        continue;
+      if (BI->start <= AI->start && BI->end > AI->start)
+        return true;
+      if (BI->start > AI->start && BI->start < AI->end)
+        return true;
+    }
+  }
+  return false;
+}
+
+/// removeCopyByCommutingDef - We found a non-trivially-coalescable copy with
+/// IntA being the source and IntB being the dest, thus this defines a value
+/// number in IntB.  If the source value number (in IntA) is defined by a
+/// commutable instruction and its other operand is coalesced to the copy dest
+/// register, see if we can transform the copy into a noop by commuting the
+/// definition. For example,
+///
+///  A3 = op A2 B0<kill>
+///    ...
+///  B1 = A3      <- this copy
+///    ...
+///     = op A3   <- more uses
+///
+/// ==>
+///
+///  B2 = op B0 A2<kill>
+///    ...
+///  B1 = B2      <- now an identify copy
+///    ...
+///     = op B2   <- more uses
+///
+/// This returns true if an interval was modified.
+///
+bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
+                                                 MachineInstr *CopyMI) {
+  assert (!CP.isPhys());
+
+  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
+
+  LiveInterval &IntA =
+    LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+  LiveInterval &IntB =
+    LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+  // BValNo is a value number in B that is defined by a copy from A. 'B3' in
+  // the example above.
+  VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
+  if (!BValNo || BValNo->def != CopyIdx)
+    return false;
+
+  assert(BValNo->def == CopyIdx && "Copy doesn't define the value?");
+
+  // AValNo is the value number in A that defines the copy, A3 in the example.
+  VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
+  assert(AValNo && "COPY source not live");
+  if (AValNo->isPHIDef() || AValNo->isUnused())
+    return false;
+  MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def);
+  if (!DefMI)
+    return false;
+  if (!DefMI->isCommutable())
+    return false;
+  // If DefMI is a two-address instruction then commuting it will change the
+  // destination register.
+  int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg);
+  assert(DefIdx != -1);
+  unsigned UseOpIdx;
+  if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))
+    return false;
+  unsigned Op1, Op2, NewDstIdx;
+  if (!TII->findCommutedOpIndices(DefMI, Op1, Op2))
+    return false;
+  if (Op1 == UseOpIdx)
+    NewDstIdx = Op2;
+  else if (Op2 == UseOpIdx)
+    NewDstIdx = Op1;
+  else
+    return false;
+
+  MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
+  unsigned NewReg = NewDstMO.getReg();
+  if (NewReg != IntB.reg || !LiveRangeQuery(IntB, AValNo->def).isKill())
+    return false;
+
+  // Make sure there are no other definitions of IntB that would reach the
+  // uses which the new definition can reach.
+  if (hasOtherReachingDefs(IntA, IntB, AValNo, BValNo))
+    return false;
+
+  // If some of the uses of IntA.reg is already coalesced away, return false.
+  // It's not possible to determine whether it's safe to perform the coalescing.
+  for (MachineRegisterInfo::use_nodbg_iterator UI =
+         MRI->use_nodbg_begin(IntA.reg),
+       UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+    MachineInstr *UseMI = &*UI;
+    SlotIndex UseIdx = LIS->getInstructionIndex(UseMI);
+    LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
+    if (ULR == IntA.end() || ULR->valno != AValNo)
+      continue;
+    // If this use is tied to a def, we can't rewrite the register.
+    if (UseMI->isRegTiedToDefOperand(UI.getOperandNo()))
+      return false;
+  }
+
+  DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'
+               << *DefMI);
+
+  // At this point we have decided that it is legal to do this
+  // transformation.  Start by commuting the instruction.
+  MachineBasicBlock *MBB = DefMI->getParent();
+  MachineInstr *NewMI = TII->commuteInstruction(DefMI);
+  if (!NewMI)
+    return false;
+  if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
+      TargetRegisterInfo::isVirtualRegister(IntB.reg) &&
+      !MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg)))
+    return false;
+  if (NewMI != DefMI) {
+    LIS->ReplaceMachineInstrInMaps(DefMI, NewMI);
+    MachineBasicBlock::iterator Pos = DefMI;
+    MBB->insert(Pos, NewMI);
+    MBB->erase(DefMI);
+  }
+  unsigned OpIdx = NewMI->findRegisterUseOperandIdx(IntA.reg, false);
+  NewMI->getOperand(OpIdx).setIsKill();
+
+  // If ALR and BLR overlaps and end of BLR extends beyond end of ALR, e.g.
+  // A = or A, B
+  // ...
+  // B = A
+  // ...
+  // C = A<kill>
+  // ...
+  //   = B
+
+  // Update uses of IntA of the specific Val# with IntB.
+  for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg),
+         UE = MRI->use_end(); UI != UE;) {
+    MachineOperand &UseMO = UI.getOperand();
+    MachineInstr *UseMI = &*UI;
+    ++UI;
+    if (UseMI->isDebugValue()) {
+      // FIXME These don't have an instruction index.  Not clear we have enough
+      // info to decide whether to do this replacement or not.  For now do it.
+      UseMO.setReg(NewReg);
+      continue;
+    }
+    SlotIndex UseIdx = LIS->getInstructionIndex(UseMI).getRegSlot(true);
+    LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
+    if (ULR == IntA.end() || ULR->valno != AValNo)
+      continue;
+    // Kill flags are no longer accurate. They are recomputed after RA.
+    UseMO.setIsKill(false);
+    if (TargetRegisterInfo::isPhysicalRegister(NewReg))
+      UseMO.substPhysReg(NewReg, *TRI);
+    else
+      UseMO.setReg(NewReg);
+    if (UseMI == CopyMI)
+      continue;
+    if (!UseMI->isCopy())
+      continue;
+    if (UseMI->getOperand(0).getReg() != IntB.reg ||
+        UseMI->getOperand(0).getSubReg())
+      continue;
+
+    // This copy will become a noop. If it's defining a new val#, merge it into
+    // BValNo.
+    SlotIndex DefIdx = UseIdx.getRegSlot();
+    VNInfo *DVNI = IntB.getVNInfoAt(DefIdx);
+    if (!DVNI)
+      continue;
+    DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
+    assert(DVNI->def == DefIdx);
+    BValNo = IntB.MergeValueNumberInto(BValNo, DVNI);
+    ErasedInstrs.insert(UseMI);
+    LIS->RemoveMachineInstrFromMaps(UseMI);
+    UseMI->eraseFromParent();
+  }
+
+  // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition
+  // is updated.
+  VNInfo *ValNo = BValNo;
+  ValNo->def = AValNo->def;
+  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
+       AI != AE; ++AI) {
+    if (AI->valno != AValNo) continue;
+    IntB.addRange(LiveRange(AI->start, AI->end, ValNo));
+  }
+  DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
+
+  IntA.removeValNo(AValNo);
+  DEBUG(dbgs() << "\t\ttrimmed:  " << IntA << '\n');
+  ++numCommutes;
+  return true;
+}
+
+/// reMaterializeTrivialDef - If the source of a copy is defined by a trivial
+/// computation, replace the copy by rematerialize the definition.
+bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
+                                                MachineInstr *CopyMI) {
+  unsigned SrcReg = CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg();
+  unsigned DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg();
+  if (TargetRegisterInfo::isPhysicalRegister(SrcReg))
+    return false;
+
+  LiveInterval &SrcInt = LIS->getInterval(SrcReg);
+  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true);
+  LiveInterval::iterator SrcLR = SrcInt.FindLiveRangeContaining(CopyIdx);
+  assert(SrcLR != SrcInt.end() && "Live range not found!");
+  VNInfo *ValNo = SrcLR->valno;
+  if (ValNo->isPHIDef() || ValNo->isUnused())
+    return false;
+  MachineInstr *DefMI = LIS->getInstructionFromIndex(ValNo->def);
+  if (!DefMI)
+    return false;
+  assert(DefMI && "Defining instruction disappeared");
+  if (!DefMI->isAsCheapAsAMove())
+    return false;
+  if (!TII->isTriviallyReMaterializable(DefMI, AA))
+    return false;
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(TII, AA, SawStore))
+    return false;
+  const MCInstrDesc &MCID = DefMI->getDesc();
+  if (MCID.getNumDefs() != 1)
+    return false;
+  // Only support subregister destinations when the def is read-undef.
+  MachineOperand &DstOperand = CopyMI->getOperand(0);
+  if (DstOperand.getSubReg() && !DstOperand.isUndef())
+    return false;
+  if (!DefMI->isImplicitDef()) {
+    // Make sure the copy destination register class fits the instruction
+    // definition register class. The mismatch can happen as a result of earlier
+    // extract_subreg, insert_subreg, subreg_to_reg coalescing.
+    const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI, *MF);
+    if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+      if (!MRI->constrainRegClass(DstReg, RC))
+        return false;
+    } else if (!RC->contains(DstReg))
+      return false;
+  }
+
+  MachineBasicBlock *MBB = CopyMI->getParent();
+  MachineBasicBlock::iterator MII =
+    llvm::next(MachineBasicBlock::iterator(CopyMI));
+  TII->reMaterialize(*MBB, MII, DstReg, 0, DefMI, *TRI);
+  MachineInstr *NewMI = prior(MII);
+
+  // The original DefMI may have been a subregister def, but the full register
+  // class of its destination matches the destination of CopyMI, and CopyMI is
+  // either a full register def or is read-undef. Therefore we can clear the
+  // subregister index on the rematerialized instruction.
+  NewMI->getOperand(0).setSubReg(0);
+
+  // NewMI may have dead implicit defs (E.g. EFLAGS for MOV<bits>r0 on X86).
+  // We need to remember these so we can add intervals once we insert
+  // NewMI into SlotIndexes.
+  SmallVector<unsigned, 4> NewMIImplDefs;
+  for (unsigned i = NewMI->getDesc().getNumOperands(),
+         e = NewMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = NewMI->getOperand(i);
+    if (MO.isReg()) {
+      assert(MO.isDef() && MO.isImplicit() && MO.isDead() &&
+             TargetRegisterInfo::isPhysicalRegister(MO.getReg()));
+      NewMIImplDefs.push_back(MO.getReg());
+    }
+  }
+
+  // CopyMI may have implicit operands, transfer them over to the newly
+  // rematerialized instruction. And update implicit def interval valnos.
+  for (unsigned i = CopyMI->getDesc().getNumOperands(),
+         e = CopyMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = CopyMI->getOperand(i);
+    if (MO.isReg()) {
+      assert(MO.isImplicit() && "No explicit operands after implict operands.");
+      // Discard VReg implicit defs.
+      if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+        NewMI->addOperand(MO);
+      }
+    }
+  }
+
+  LIS->ReplaceMachineInstrInMaps(CopyMI, NewMI);
+
+  SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
+  for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
+    unsigned Reg = NewMIImplDefs[i];
+    for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+      if (LiveInterval *LI = LIS->getCachedRegUnit(*Units))
+        LI->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
+  }
+
+  CopyMI->eraseFromParent();
+  ErasedInstrs.insert(CopyMI);
+  DEBUG(dbgs() << "Remat: " << *NewMI);
+  ++NumReMats;
+
+  // The source interval can become smaller because we removed a use.
+  LIS->shrinkToUses(&SrcInt, &DeadDefs);
+  if (!DeadDefs.empty())
+    eliminateDeadDefs();
+
+  return true;
+}
+
+/// eliminateUndefCopy - ProcessImpicitDefs may leave some copies of <undef>
+/// values, it only removes local variables. When we have a copy like:
+///
+///   %vreg1 = COPY %vreg2<undef>
+///
+/// We delete the copy and remove the corresponding value number from %vreg1.
+/// Any uses of that value number are marked as <undef>.
+bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI,
+                                           const CoalescerPair &CP) {
+  SlotIndex Idx = LIS->getInstructionIndex(CopyMI);
+  LiveInterval *SrcInt = &LIS->getInterval(CP.getSrcReg());
+  if (SrcInt->liveAt(Idx))
+    return false;
+  LiveInterval *DstInt = &LIS->getInterval(CP.getDstReg());
+  if (DstInt->liveAt(Idx))
+    return false;
+
+  // No intervals are live-in to CopyMI - it is undef.
+  if (CP.isFlipped())
+    DstInt = SrcInt;
+  SrcInt = 0;
+
+  VNInfo *DeadVNI = DstInt->getVNInfoAt(Idx.getRegSlot());
+  assert(DeadVNI && "No value defined in DstInt");
+  DstInt->removeValNo(DeadVNI);
+
+  // Find new undef uses.
+  for (MachineRegisterInfo::reg_nodbg_iterator
+         I = MRI->reg_nodbg_begin(DstInt->reg), E = MRI->reg_nodbg_end();
+       I != E; ++I) {
+    MachineOperand &MO = I.getOperand();
+    if (MO.isDef() || MO.isUndef())
+      continue;
+    MachineInstr *MI = MO.getParent();
+    SlotIndex Idx = LIS->getInstructionIndex(MI);
+    if (DstInt->liveAt(Idx))
+      continue;
+    MO.setIsUndef(true);
+    DEBUG(dbgs() << "\tnew undef: " << Idx << '\t' << *MI);
+  }
+  return true;
+}
+
+/// updateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
+/// update the subregister number if it is not zero. If DstReg is a
+/// physical register and the existing subregister number of the def / use
+/// being updated is not zero, make sure to set it to the correct physical
+/// subregister.
+void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
+                                          unsigned DstReg,
+                                          unsigned SubIdx) {
+  bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+  LiveInterval *DstInt = DstIsPhys ? 0 : &LIS->getInterval(DstReg);
+
+  SmallPtrSet<MachineInstr*, 8> Visited;
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg);
+       MachineInstr *UseMI = I.skipInstruction();) {
+    // Each instruction can only be rewritten once because sub-register
+    // composition is not always idempotent. When SrcReg != DstReg, rewriting
+    // the UseMI operands removes them from the SrcReg use-def chain, but when
+    // SrcReg is DstReg we could encounter UseMI twice if it has multiple
+    // operands mentioning the virtual register.
+    if (SrcReg == DstReg && !Visited.insert(UseMI))
+      continue;
+
+    SmallVector<unsigned,8> Ops;
+    bool Reads, Writes;
+    tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops);
+
+    // If SrcReg wasn't read, it may still be the case that DstReg is live-in
+    // because SrcReg is a sub-register.
+    if (DstInt && !Reads && SubIdx)
+      Reads = DstInt->liveAt(LIS->getInstructionIndex(UseMI));
+
+    // Replace SrcReg with DstReg in all UseMI operands.
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      MachineOperand &MO = UseMI->getOperand(Ops[i]);
+
+      // Adjust <undef> flags in case of sub-register joins. We don't want to
+      // turn a full def into a read-modify-write sub-register def and vice
+      // versa.
+      if (SubIdx && MO.isDef())
+        MO.setIsUndef(!Reads);
+
+      if (DstIsPhys)
+        MO.substPhysReg(DstReg, *TRI);
+      else
+        MO.substVirtReg(DstReg, SubIdx, *TRI);
+    }
+
+    DEBUG({
+        dbgs() << "\t\tupdated: ";
+        if (!UseMI->isDebugValue())
+          dbgs() << LIS->getInstructionIndex(UseMI) << "\t";
+        dbgs() << *UseMI;
+      });
+  }
+}
+
+/// canJoinPhys - Return true if a copy involving a physreg should be joined.
+bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
+  /// Always join simple intervals that are defined by a single copy from a
+  /// reserved register. This doesn't increase register pressure, so it is
+  /// always beneficial.
+  if (!MRI->isReserved(CP.getDstReg())) {
+    DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
+    return false;
+  }
+
+  LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
+  if (CP.isFlipped() && JoinVInt.containsOneValue())
+    return true;
+
+  DEBUG(dbgs() << "\tCannot join defs into reserved register.\n");
+  return false;
+}
+
+/// joinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+/// which are the src/dst of the copy instruction CopyMI.  This returns true
+/// if the copy was successfully coalesced away. If it is not currently
+/// possible to coalesce this interval, but it may be possible if other
+/// things get coalesced, then it returns true by reference in 'Again'.
+bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
+
+  Again = false;
+  DEBUG(dbgs() << LIS->getInstructionIndex(CopyMI) << '\t' << *CopyMI);
+
+  CoalescerPair CP(*TRI);
+  if (!CP.setRegisters(CopyMI)) {
+    DEBUG(dbgs() << "\tNot coalescable.\n");
+    return false;
+  }
+
+  // Dead code elimination. This really should be handled by MachineDCE, but
+  // sometimes dead copies slip through, and we can't generate invalid live
+  // ranges.
+  if (!CP.isPhys() && CopyMI->allDefsAreDead()) {
+    DEBUG(dbgs() << "\tCopy is dead.\n");
+    DeadDefs.push_back(CopyMI);
+    eliminateDeadDefs();
+    return true;
+  }
+
+  // Eliminate undefs.
+  if (!CP.isPhys() && eliminateUndefCopy(CopyMI, CP)) {
+    DEBUG(dbgs() << "\tEliminated copy of <undef> value.\n");
+    LIS->RemoveMachineInstrFromMaps(CopyMI);
+    CopyMI->eraseFromParent();
+    return false;  // Not coalescable.
+  }
+
+  // Coalesced copies are normally removed immediately, but transformations
+  // like removeCopyByCommutingDef() can inadvertently create identity copies.
+  // When that happens, just join the values and remove the copy.
+  if (CP.getSrcReg() == CP.getDstReg()) {
+    LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
+    DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
+    LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(CopyMI));
+    if (VNInfo *DefVNI = LRQ.valueDefined()) {
+      VNInfo *ReadVNI = LRQ.valueIn();
+      assert(ReadVNI && "No value before copy and no <undef> flag.");
+      assert(ReadVNI != DefVNI && "Cannot read and define the same value.");
+      LI.MergeValueNumberInto(DefVNI, ReadVNI);
+      DEBUG(dbgs() << "\tMerged values:          " << LI << '\n');
+    }
+    LIS->RemoveMachineInstrFromMaps(CopyMI);
+    CopyMI->eraseFromParent();
+    return true;
+  }
+
+  // Enforce policies.
+  if (CP.isPhys()) {
+    DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI)
+                 << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx())
+                 << '\n');
+    if (!canJoinPhys(CP)) {
+      // Before giving up coalescing, if definition of source is defined by
+      // trivial computation, try rematerializing it.
+      if (reMaterializeTrivialDef(CP, CopyMI))
+        return true;
+      return false;
+    }
+  } else {
+    DEBUG({
+      dbgs() << "\tConsidering merging to " << CP.getNewRC()->getName()
+             << " with ";
+      if (CP.getDstIdx() && CP.getSrcIdx())
+        dbgs() << PrintReg(CP.getDstReg()) << " in "
+               << TRI->getSubRegIndexName(CP.getDstIdx()) << " and "
+               << PrintReg(CP.getSrcReg()) << " in "
+               << TRI->getSubRegIndexName(CP.getSrcIdx()) << '\n';
+      else
+        dbgs() << PrintReg(CP.getSrcReg(), TRI) << " in "
+               << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n';
+    });
+
+    // When possible, let DstReg be the larger interval.
+    if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).ranges.size() >
+                           LIS->getInterval(CP.getDstReg()).ranges.size())
+      CP.flip();
+  }
+
+  // Okay, attempt to join these two intervals.  On failure, this returns false.
+  // Otherwise, if one of the intervals being joined is a physreg, this method
+  // always canonicalizes DstInt to be it.  The output "SrcInt" will not have
+  // been modified, so we can use this information below to update aliases.
+  if (!joinIntervals(CP)) {
+    // Coalescing failed.
+
+    // If definition of source is defined by trivial computation, try
+    // rematerializing it.
+    if (reMaterializeTrivialDef(CP, CopyMI))
+      return true;
+
+    // If we can eliminate the copy without merging the live ranges, do so now.
+    if (!CP.isPartial() && !CP.isPhys()) {
+      if (adjustCopiesBackFrom(CP, CopyMI) ||
+          removeCopyByCommutingDef(CP, CopyMI)) {
+        LIS->RemoveMachineInstrFromMaps(CopyMI);
+        CopyMI->eraseFromParent();
+        DEBUG(dbgs() << "\tTrivial!\n");
+        return true;
+      }
+    }
+
+    // Otherwise, we are unable to join the intervals.
+    DEBUG(dbgs() << "\tInterference!\n");
+    Again = true;  // May be possible to coalesce later.
+    return false;
+  }
+
+  // Coalescing to a virtual register that is of a sub-register class of the
+  // other. Make sure the resulting register is set to the right register class.
+  if (CP.isCrossClass()) {
+    ++numCrossRCs;
+    MRI->setRegClass(CP.getDstReg(), CP.getNewRC());
+  }
+
+  // Removing sub-register copies can ease the register class constraints.
+  // Make sure we attempt to inflate the register class of DstReg.
+  if (!CP.isPhys() && RegClassInfo.isProperSubClass(CP.getNewRC()))
+    InflateRegs.push_back(CP.getDstReg());
+
+  // CopyMI has been erased by joinIntervals at this point. Remove it from
+  // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back
+  // to the work list. This keeps ErasedInstrs from growing needlessly.
+  ErasedInstrs.erase(CopyMI);
+
+  // Rewrite all SrcReg operands to DstReg.
+  // Also update DstReg operands to include DstIdx if it is set.
+  if (CP.getDstIdx())
+    updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
+  updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
+
+  // SrcReg is guaranteed to be the register whose live interval that is
+  // being merged.
+  LIS->removeInterval(CP.getSrcReg());
+
+  // Update regalloc hint.
+  TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
+
+  DEBUG({
+    dbgs() << "\tJoined. Result = " << PrintReg(CP.getDstReg(), TRI);
+    if (!CP.isPhys())
+      dbgs() << LIS->getInterval(CP.getDstReg());
+     dbgs() << '\n';
+  });
+
+  ++numJoins;
+  return true;
+}
+
+/// Attempt joining with a reserved physreg.
+bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
+  assert(CP.isPhys() && "Must be a physreg copy");
+  assert(MRI->isReserved(CP.getDstReg()) && "Not a reserved register");
+  LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+  DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
+               << '\n');
+
+  assert(CP.isFlipped() && RHS.containsOneValue() &&
+         "Invalid join with reserved register");
+
+  // Optimization for reserved registers like ESP. We can only merge with a
+  // reserved physreg if RHS has a single value that is a copy of CP.DstReg().
+  // The live range of the reserved register will look like a set of dead defs
+  // - we don't properly track the live range of reserved registers.
+
+  // Deny any overlapping intervals.  This depends on all the reserved
+  // register live ranges to look like dead defs.
+  for (MCRegUnitIterator UI(CP.getDstReg(), TRI); UI.isValid(); ++UI)
+    if (RHS.overlaps(LIS->getRegUnit(*UI))) {
+      DEBUG(dbgs() << "\t\tInterference: " << PrintRegUnit(*UI, TRI) << '\n');
+      return false;
+    }
+
+  // Skip any value computations, we are not adding new values to the
+  // reserved register.  Also skip merging the live ranges, the reserved
+  // register live range doesn't need to be accurate as long as all the
+  // defs are there.
+
+  // Delete the identity copy.
+  MachineInstr *CopyMI = MRI->getVRegDef(RHS.reg);
+  LIS->RemoveMachineInstrFromMaps(CopyMI);
+  CopyMI->eraseFromParent();
+
+  // We don't track kills for reserved registers.
+  MRI->clearKillFlags(CP.getSrcReg());
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//                 Interference checking and interval joining
+//===----------------------------------------------------------------------===//
+//
+// In the easiest case, the two live ranges being joined are disjoint, and
+// there is no interference to consider. It is quite common, though, to have
+// overlapping live ranges, and we need to check if the interference can be
+// resolved.
+//
+// The live range of a single SSA value forms a sub-tree of the dominator tree.
+// This means that two SSA values overlap if and only if the def of one value
+// is contained in the live range of the other value. As a special case, the
+// overlapping values can be defined at the same index.
+//
+// The interference from an overlapping def can be resolved in these cases:
+//
+// 1. Coalescable copies. The value is defined by a copy that would become an
+//    identity copy after joining SrcReg and DstReg. The copy instruction will
+//    be removed, and the value will be merged with the source value.
+//
+//    There can be several copies back and forth, causing many values to be
+//    merged into one. We compute a list of ultimate values in the joined live
+//    range as well as a mappings from the old value numbers.
+//
+// 2. IMPLICIT_DEF. This instruction is only inserted to ensure all PHI
+//    predecessors have a live out value. It doesn't cause real interference,
+//    and can be merged into the value it overlaps. Like a coalescable copy, it
+//    can be erased after joining.
+//
+// 3. Copy of external value. The overlapping def may be a copy of a value that
+//    is already in the other register. This is like a coalescable copy, but
+//    the live range of the source register must be trimmed after erasing the
+//    copy instruction:
+//
+//      %src = COPY %ext
+//      %dst = COPY %ext  <-- Remove this COPY, trim the live range of %ext.
+//
+// 4. Clobbering undefined lanes. Vector registers are sometimes built by
+//    defining one lane at a time:
+//
+//      %dst:ssub0<def,read-undef> = FOO
+//      %src = BAR
+//      %dst:ssub1<def> = COPY %src
+//
+//    The live range of %src overlaps the %dst value defined by FOO, but
+//    merging %src into %dst:ssub1 is only going to clobber the ssub1 lane
+//    which was undef anyway.
+//
+//    The value mapping is more complicated in this case. The final live range
+//    will have different value numbers for both FOO and BAR, but there is no
+//    simple mapping from old to new values. It may even be necessary to add
+//    new PHI values.
+//
+// 5. Clobbering dead lanes. A def may clobber a lane of a vector register that
+//    is live, but never read. This can happen because we don't compute
+//    individual live ranges per lane.
+//
+//      %dst<def> = FOO
+//      %src = BAR
+//      %dst:ssub1<def> = COPY %src
+//
+//    This kind of interference is only resolved locally. If the clobbered
+//    lane value escapes the block, the join is aborted.
+
+namespace {
+/// Track information about values in a single virtual register about to be
+/// joined. Objects of this class are always created in pairs - one for each
+/// side of the CoalescerPair.
+class JoinVals {
+  LiveInterval &LI;
+
+  // Location of this register in the final joined register.
+  // Either CP.DstIdx or CP.SrcIdx.
+  unsigned SubIdx;
+
+  // Values that will be present in the final live range.
+  SmallVectorImpl<VNInfo*> &NewVNInfo;
+
+  const CoalescerPair &CP;
+  LiveIntervals *LIS;
+  SlotIndexes *Indexes;
+  const TargetRegisterInfo *TRI;
+
+  // Value number assignments. Maps value numbers in LI to entries in NewVNInfo.
+  // This is suitable for passing to LiveInterval::join().
+  SmallVector<int, 8> Assignments;
+
+  // Conflict resolution for overlapping values.
+  enum ConflictResolution {
+    // No overlap, simply keep this value.
+    CR_Keep,
+
+    // Merge this value into OtherVNI and erase the defining instruction.
+    // Used for IMPLICIT_DEF, coalescable copies, and copies from external
+    // values.
+    CR_Erase,
+
+    // Merge this value into OtherVNI but keep the defining instruction.
+    // This is for the special case where OtherVNI is defined by the same
+    // instruction.
+    CR_Merge,
+
+    // Keep this value, and have it replace OtherVNI where possible. This
+    // complicates value mapping since OtherVNI maps to two different values
+    // before and after this def.
+    // Used when clobbering undefined or dead lanes.
+    CR_Replace,
+
+    // Unresolved conflict. Visit later when all values have been mapped.
+    CR_Unresolved,
+
+    // Unresolvable conflict. Abort the join.
+    CR_Impossible
+  };
+
+  // Per-value info for LI. The lane bit masks are all relative to the final
+  // joined register, so they can be compared directly between SrcReg and
+  // DstReg.
+  struct Val {
+    ConflictResolution Resolution;
+
+    // Lanes written by this def, 0 for unanalyzed values.
+    unsigned WriteLanes;
+
+    // Lanes with defined values in this register. Other lanes are undef and
+    // safe to clobber.
+    unsigned ValidLanes;
+
+    // Value in LI being redefined by this def.
+    VNInfo *RedefVNI;
+
+    // Value in the other live range that overlaps this def, if any.
+    VNInfo *OtherVNI;
+
+    // Is this value an IMPLICIT_DEF that can be erased?
+    //
+    // IMPLICIT_DEF values should only exist at the end of a basic block that
+    // is a predecessor to a phi-value. These IMPLICIT_DEF instructions can be
+    // safely erased if they are overlapping a live value in the other live
+    // interval.
+    //
+    // Weird control flow graphs and incomplete PHI handling in
+    // ProcessImplicitDefs can very rarely create IMPLICIT_DEF values with
+    // longer live ranges. Such IMPLICIT_DEF values should be treated like
+    // normal values.
+    bool ErasableImplicitDef;
+
+    // True when the live range of this value will be pruned because of an
+    // overlapping CR_Replace value in the other live range.
+    bool Pruned;
+
+    // True once Pruned above has been computed.
+    bool PrunedComputed;
+
+    Val() : Resolution(CR_Keep), WriteLanes(0), ValidLanes(0),
+            RedefVNI(0), OtherVNI(0), ErasableImplicitDef(false),
+            Pruned(false), PrunedComputed(false) {}
+
+    bool isAnalyzed() const { return WriteLanes != 0; }
+  };
+
+  // One entry per value number in LI.
+  SmallVector<Val, 8> Vals;
+
+  unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef);
+  VNInfo *stripCopies(VNInfo *VNI);
+  ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);
+  void computeAssignment(unsigned ValNo, JoinVals &Other);
+  bool taintExtent(unsigned, unsigned, JoinVals&,
+                   SmallVectorImpl<std::pair<SlotIndex, unsigned> >&);
+  bool usesLanes(MachineInstr *MI, unsigned, unsigned, unsigned);
+  bool isPrunedValue(unsigned ValNo, JoinVals &Other);
+
+public:
+  JoinVals(LiveInterval &li, unsigned subIdx,
+           SmallVectorImpl<VNInfo*> &newVNInfo,
+           const CoalescerPair &cp,
+           LiveIntervals *lis,
+           const TargetRegisterInfo *tri)
+    : LI(li), SubIdx(subIdx), NewVNInfo(newVNInfo), CP(cp), LIS(lis),
+      Indexes(LIS->getSlotIndexes()), TRI(tri),
+      Assignments(LI.getNumValNums(), -1), Vals(LI.getNumValNums())
+  {}
+
+  /// Analyze defs in LI and compute a value mapping in NewVNInfo.
+  /// Returns false if any conflicts were impossible to resolve.
+  bool mapValues(JoinVals &Other);
+
+  /// Try to resolve conflicts that require all values to be mapped.
+  /// Returns false if any conflicts were impossible to resolve.
+  bool resolveConflicts(JoinVals &Other);
+
+  /// Prune the live range of values in Other.LI where they would conflict with
+  /// CR_Replace values in LI. Collect end points for restoring the live range
+  /// after joining.
+  void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints);
+
+  /// Erase any machine instructions that have been coalesced away.
+  /// Add erased instructions to ErasedInstrs.
+  /// Add foreign virtual registers to ShrinkRegs if their live range ended at
+  /// the erased instrs.
+  void eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+                   SmallVectorImpl<unsigned> &ShrinkRegs);
+
+  /// Get the value assignments suitable for passing to LiveInterval::join.
+  const int *getAssignments() const { return Assignments.data(); }
+};
+} // end anonymous namespace
+
+/// Compute the bitmask of lanes actually written by DefMI.
+/// Set Redef if there are any partial register definitions that depend on the
+/// previous value of the register.
+unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) {
+  unsigned L = 0;
+  for (ConstMIOperands MO(DefMI); MO.isValid(); ++MO) {
+    if (!MO->isReg() || MO->getReg() != LI.reg || !MO->isDef())
+      continue;
+    L |= TRI->getSubRegIndexLaneMask(
+           TRI->composeSubRegIndices(SubIdx, MO->getSubReg()));
+    if (MO->readsReg())
+      Redef = true;
+  }
+  return L;
+}
+
+/// Find the ultimate value that VNI was copied from.
+VNInfo *JoinVals::stripCopies(VNInfo *VNI) {
+  while (!VNI->isPHIDef()) {
+    MachineInstr *MI = Indexes->getInstructionFromIndex(VNI->def);
+    assert(MI && "No defining instruction");
+    if (!MI->isFullCopy())
+      break;
+    unsigned Reg = MI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      break;
+    LiveRangeQuery LRQ(LIS->getInterval(Reg), VNI->def);
+    if (!LRQ.valueIn())
+      break;
+    VNI = LRQ.valueIn();
+  }
+  return VNI;
+}
+
+/// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
+/// Return a conflict resolution when possible, but leave the hard cases as
+/// CR_Unresolved.
+/// Recursively calls computeAssignment() on this and Other, guaranteeing that
+/// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
+/// The recursion always goes upwards in the dominator tree, making loops
+/// impossible.
+JoinVals::ConflictResolution
+JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
+  Val &V = Vals[ValNo];
+  assert(!V.isAnalyzed() && "Value has already been analyzed!");
+  VNInfo *VNI = LI.getValNumInfo(ValNo);
+  if (VNI->isUnused()) {
+    V.WriteLanes = ~0u;
+    return CR_Keep;
+  }
+
+  // Get the instruction defining this value, compute the lanes written.
+  const MachineInstr *DefMI = 0;
+  if (VNI->isPHIDef()) {
+    // Conservatively assume that all lanes in a PHI are valid.
+    V.ValidLanes = V.WriteLanes = TRI->getSubRegIndexLaneMask(SubIdx);
+  } else {
+    DefMI = Indexes->getInstructionFromIndex(VNI->def);
+    bool Redef = false;
+    V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
+
+    // If this is a read-modify-write instruction, there may be more valid
+    // lanes than the ones written by this instruction.
+    // This only covers partial redef operands. DefMI may have normal use
+    // operands reading the register. They don't contribute valid lanes.
+    //
+    // This adds ssub1 to the set of valid lanes in %src:
+    //
+    //   %src:ssub1<def> = FOO
+    //
+    // This leaves only ssub1 valid, making any other lanes undef:
+    //
+    //   %src:ssub1<def,read-undef> = FOO %src:ssub2
+    //
+    // The <read-undef> flag on the def operand means that old lane values are
+    // not important.
+    if (Redef) {
+      V.RedefVNI = LiveRangeQuery(LI, VNI->def).valueIn();
+      assert(V.RedefVNI && "Instruction is reading nonexistent value");
+      computeAssignment(V.RedefVNI->id, Other);
+      V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
+    }
+
+    // An IMPLICIT_DEF writes undef values.
+    if (DefMI->isImplicitDef()) {
+      // We normally expect IMPLICIT_DEF values to be live only until the end
+      // of their block. If the value is really live longer and gets pruned in
+      // another block, this flag is cleared again.
+      V.ErasableImplicitDef = true;
+      V.ValidLanes &= ~V.WriteLanes;
+    }
+  }
+
+  // Find the value in Other that overlaps VNI->def, if any.
+  LiveRangeQuery OtherLRQ(Other.LI, VNI->def);
+
+  // It is possible that both values are defined by the same instruction, or
+  // the values are PHIs defined in the same block. When that happens, the two
+  // values should be merged into one, but not into any preceding value.
+  // The first value defined or visited gets CR_Keep, the other gets CR_Merge.
+  if (VNInfo *OtherVNI = OtherLRQ.valueDefined()) {
+    assert(SlotIndex::isSameInstr(VNI->def, OtherVNI->def) && "Broken LRQ");
+
+    // One value stays, the other is merged. Keep the earlier one, or the first
+    // one we see.
+    if (OtherVNI->def < VNI->def)
+      Other.computeAssignment(OtherVNI->id, *this);
+    else if (VNI->def < OtherVNI->def && OtherLRQ.valueIn()) {
+      // This is an early-clobber def overlapping a live-in value in the other
+      // register. Not mergeable.
+      V.OtherVNI = OtherLRQ.valueIn();
+      return CR_Impossible;
+    }
+    V.OtherVNI = OtherVNI;
+    Val &OtherV = Other.Vals[OtherVNI->id];
+    // Keep this value, check for conflicts when analyzing OtherVNI.
+    if (!OtherV.isAnalyzed())
+      return CR_Keep;
+    // Both sides have been analyzed now.
+    // Allow overlapping PHI values. Any real interference would show up in a
+    // predecessor, the PHI itself can't introduce any conflicts.
+    if (VNI->isPHIDef())
+      return CR_Merge;
+    if (V.ValidLanes & OtherV.ValidLanes)
+      // Overlapping lanes can't be resolved.
+      return CR_Impossible;
+    else
+      return CR_Merge;
+  }
+
+  // No simultaneous def. Is Other live at the def?
+  V.OtherVNI = OtherLRQ.valueIn();
+  if (!V.OtherVNI)
+    // No overlap, no conflict.
+    return CR_Keep;
+
+  assert(!SlotIndex::isSameInstr(VNI->def, V.OtherVNI->def) && "Broken LRQ");
+
+  // We have overlapping values, or possibly a kill of Other.
+  // Recursively compute assignments up the dominator tree.
+  Other.computeAssignment(V.OtherVNI->id, *this);
+  Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+  // Check if OtherV is an IMPLICIT_DEF that extends beyond its basic block.
+  // This shouldn't normally happen, but ProcessImplicitDefs can leave such
+  // IMPLICIT_DEF instructions behind, and there is nothing wrong with it
+  // technically.
+  //
+  // WHen it happens, treat that IMPLICIT_DEF as a normal value, and don't try
+  // to erase the IMPLICIT_DEF instruction.
+  if (OtherV.ErasableImplicitDef && DefMI &&
+      DefMI->getParent() != Indexes->getMBBFromIndex(V.OtherVNI->def)) {
+    DEBUG(dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def
+                 << " extends into BB#" << DefMI->getParent()->getNumber()
+                 << ", keeping it.\n");
+    OtherV.ErasableImplicitDef = false;
+  }
+
+  // Allow overlapping PHI values. Any real interference would show up in a
+  // predecessor, the PHI itself can't introduce any conflicts.
+  if (VNI->isPHIDef())
+    return CR_Replace;
+
+  // Check for simple erasable conflicts.
+  if (DefMI->isImplicitDef())
+    return CR_Erase;
+
+  // Include the non-conflict where DefMI is a coalescable copy that kills
+  // OtherVNI. We still want the copy erased and value numbers merged.
+  if (CP.isCoalescable(DefMI)) {
+    // Some of the lanes copied from OtherVNI may be undef, making them undef
+    // here too.
+    V.ValidLanes &= ~V.WriteLanes | OtherV.ValidLanes;
+    return CR_Erase;
+  }
+
+  // This may not be a real conflict if DefMI simply kills Other and defines
+  // VNI.
+  if (OtherLRQ.isKill() && OtherLRQ.endPoint() <= VNI->def)
+    return CR_Keep;
+
+  // Handle the case where VNI and OtherVNI can be proven to be identical:
+  //
+  //   %other = COPY %ext
+  //   %this  = COPY %ext <-- Erase this copy
+  //
+  if (DefMI->isFullCopy() && !CP.isPartial() &&
+      stripCopies(VNI) == stripCopies(V.OtherVNI))
+    return CR_Erase;
+
+  // If the lanes written by this instruction were all undef in OtherVNI, it is
+  // still safe to join the live ranges. This can't be done with a simple value
+  // mapping, though - OtherVNI will map to multiple values:
+  //
+  //   1 %dst:ssub0 = FOO                <-- OtherVNI
+  //   2 %src = BAR                      <-- VNI
+  //   3 %dst:ssub1 = COPY %src<kill>    <-- Eliminate this copy.
+  //   4 BAZ %dst<kill>
+  //   5 QUUX %src<kill>
+  //
+  // Here OtherVNI will map to itself in [1;2), but to VNI in [2;5). CR_Replace
+  // handles this complex value mapping.
+  if ((V.WriteLanes & OtherV.ValidLanes) == 0)
+    return CR_Replace;
+
+  // If the other live range is killed by DefMI and the live ranges are still
+  // overlapping, it must be because we're looking at an early clobber def:
+  //
+  //   %dst<def,early-clobber> = ASM %src<kill>
+  //
+  // In this case, it is illegal to merge the two live ranges since the early
+  // clobber def would clobber %src before it was read.
+  if (OtherLRQ.isKill()) {
+    // This case where the def doesn't overlap the kill is handled above.
+    assert(VNI->def.isEarlyClobber() &&
+           "Only early clobber defs can overlap a kill");
+    return CR_Impossible;
+  }
+
+  // VNI is clobbering live lanes in OtherVNI, but there is still the
+  // possibility that no instructions actually read the clobbered lanes.
+  // If we're clobbering all the lanes in OtherVNI, at least one must be read.
+  // Otherwise Other.LI wouldn't be live here.
+  if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes) == 0)
+    return CR_Impossible;
+
+  // We need to verify that no instructions are reading the clobbered lanes. To
+  // save compile time, we'll only check that locally. Don't allow the tainted
+  // value to escape the basic block.
+  MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+  if (OtherLRQ.endPoint() >= Indexes->getMBBEndIdx(MBB))
+    return CR_Impossible;
+
+  // There are still some things that could go wrong besides clobbered lanes
+  // being read, for example OtherVNI may be only partially redefined in MBB,
+  // and some clobbered lanes could escape the block. Save this analysis for
+  // resolveConflicts() when all values have been mapped. We need to know
+  // RedefVNI and WriteLanes for any later defs in MBB, and we can't compute
+  // that now - the recursive analyzeValue() calls must go upwards in the
+  // dominator tree.
+  return CR_Unresolved;
+}
+
+/// Compute the value assignment for ValNo in LI.
+/// This may be called recursively by analyzeValue(), but never for a ValNo on
+/// the stack.
+void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
+  Val &V = Vals[ValNo];
+  if (V.isAnalyzed()) {
+    // Recursion should always move up the dominator tree, so ValNo is not
+    // supposed to reappear before it has been assigned.
+    assert(Assignments[ValNo] != -1 && "Bad recursion?");
+    return;
+  }
+  switch ((V.Resolution = analyzeValue(ValNo, Other))) {
+  case CR_Erase:
+  case CR_Merge:
+    // Merge this ValNo into OtherVNI.
+    assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
+    assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
+    Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
+    DEBUG(dbgs() << "\t\tmerge " << PrintReg(LI.reg) << ':' << ValNo << '@'
+                 << LI.getValNumInfo(ValNo)->def << " into "
+                 << PrintReg(Other.LI.reg) << ':' << V.OtherVNI->id << '@'
+                 << V.OtherVNI->def << " --> @"
+                 << NewVNInfo[Assignments[ValNo]]->def << '\n');
+    break;
+  case CR_Replace:
+  case CR_Unresolved:
+    // The other value is going to be pruned if this join is successful.
+    assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
+    Other.Vals[V.OtherVNI->id].Pruned = true;
+    // Fall through.
+  default:
+    // This value number needs to go in the final joined live range.
+    Assignments[ValNo] = NewVNInfo.size();
+    NewVNInfo.push_back(LI.getValNumInfo(ValNo));
+    break;
+  }
+}
+
+bool JoinVals::mapValues(JoinVals &Other) {
+  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+    computeAssignment(i, Other);
+    if (Vals[i].Resolution == CR_Impossible) {
+      DEBUG(dbgs() << "\t\tinterference at " << PrintReg(LI.reg) << ':' << i
+                   << '@' << LI.getValNumInfo(i)->def << '\n');
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Assuming ValNo is going to clobber some valid lanes in Other.LI, compute
+/// the extent of the tainted lanes in the block.
+///
+/// Multiple values in Other.LI can be affected since partial redefinitions can
+/// preserve previously tainted lanes.
+///
+///   1 %dst = VLOAD           <-- Define all lanes in %dst
+///   2 %src = FOO             <-- ValNo to be joined with %dst:ssub0
+///   3 %dst:ssub1 = BAR       <-- Partial redef doesn't clear taint in ssub0
+///   4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
+///
+/// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
+/// entry to TaintedVals.
+///
+/// Returns false if the tainted lanes extend beyond the basic block.
+bool JoinVals::
+taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other,
+            SmallVectorImpl<std::pair<SlotIndex, unsigned> > &TaintExtent) {
+  VNInfo *VNI = LI.getValNumInfo(ValNo);
+  MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+  SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB);
+
+  // Scan Other.LI from VNI.def to MBBEnd.
+  LiveInterval::iterator OtherI = Other.LI.find(VNI->def);
+  assert(OtherI != Other.LI.end() && "No conflict?");
+  do {
+    // OtherI is pointing to a tainted value. Abort the join if the tainted
+    // lanes escape the block.
+    SlotIndex End = OtherI->end;
+    if (End >= MBBEnd) {
+      DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.LI.reg) << ':'
+                   << OtherI->valno->id << '@' << OtherI->start << '\n');
+      return false;
+    }
+    DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.LI.reg) << ':'
+                 << OtherI->valno->id << '@' << OtherI->start
+                 << " to " << End << '\n');
+    // A dead def is not a problem.
+    if (End.isDead())
+      break;
+    TaintExtent.push_back(std::make_pair(End, TaintedLanes));
+
+    // Check for another def in the MBB.
+    if (++OtherI == Other.LI.end() || OtherI->start >= MBBEnd)
+      break;
+
+    // Lanes written by the new def are no longer tainted.
+    const Val &OV = Other.Vals[OtherI->valno->id];
+    TaintedLanes &= ~OV.WriteLanes;
+    if (!OV.RedefVNI)
+      break;
+  } while (TaintedLanes);
+  return true;
+}
+
+/// Return true if MI uses any of the given Lanes from Reg.
+/// This does not include partial redefinitions of Reg.
+bool JoinVals::usesLanes(MachineInstr *MI, unsigned Reg, unsigned SubIdx,
+                         unsigned Lanes) {
+  if (MI->isDebugValue())
+    return false;
+  for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
+    if (!MO->isReg() || MO->isDef() || MO->getReg() != Reg)
+      continue;
+    if (!MO->readsReg())
+      continue;
+    if (Lanes & TRI->getSubRegIndexLaneMask(
+                  TRI->composeSubRegIndices(SubIdx, MO->getSubReg())))
+      return true;
+  }
+  return false;
+}
+
+bool JoinVals::resolveConflicts(JoinVals &Other) {
+  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+    Val &V = Vals[i];
+    assert (V.Resolution != CR_Impossible && "Unresolvable conflict");
+    if (V.Resolution != CR_Unresolved)
+      continue;
+    DEBUG(dbgs() << "\t\tconflict at " << PrintReg(LI.reg) << ':' << i
+                 << '@' << LI.getValNumInfo(i)->def << '\n');
+    ++NumLaneConflicts;
+    assert(V.OtherVNI && "Inconsistent conflict resolution.");
+    VNInfo *VNI = LI.getValNumInfo(i);
+    const Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+    // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the
+    // join, those lanes will be tainted with a wrong value. Get the extent of
+    // the tainted lanes.
+    unsigned TaintedLanes = V.WriteLanes & OtherV.ValidLanes;
+    SmallVector<std::pair<SlotIndex, unsigned>, 8> TaintExtent;
+    if (!taintExtent(i, TaintedLanes, Other, TaintExtent))
+      // Tainted lanes would extend beyond the basic block.
+      return false;
+
+    assert(!TaintExtent.empty() && "There should be at least one conflict.");
+
+    // Now look at the instructions from VNI->def to TaintExtent (inclusive).
+    MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+    MachineBasicBlock::iterator MI = MBB->begin();
+    if (!VNI->isPHIDef()) {
+      MI = Indexes->getInstructionFromIndex(VNI->def);
+      // No need to check the instruction defining VNI for reads.
+      ++MI;
+    }
+    assert(!SlotIndex::isSameInstr(VNI->def, TaintExtent.front().first) &&
+           "Interference ends on VNI->def. Should have been handled earlier");
+    MachineInstr *LastMI =
+      Indexes->getInstructionFromIndex(TaintExtent.front().first);
+    assert(LastMI && "Range must end at a proper instruction");
+    unsigned TaintNum = 0;
+    for(;;) {
+      assert(MI != MBB->end() && "Bad LastMI");
+      if (usesLanes(MI, Other.LI.reg, Other.SubIdx, TaintedLanes)) {
+        DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
+        return false;
+      }
+      // LastMI is the last instruction to use the current value.
+      if (&*MI == LastMI) {
+        if (++TaintNum == TaintExtent.size())
+          break;
+        LastMI = Indexes->getInstructionFromIndex(TaintExtent[TaintNum].first);
+        assert(LastMI && "Range must end at a proper instruction");
+        TaintedLanes = TaintExtent[TaintNum].second;
+      }
+      ++MI;
+    }
+
+    // The tainted lanes are unused.
+    V.Resolution = CR_Replace;
+    ++NumLaneResolves;
+  }
+  return true;
+}
+
+// Determine if ValNo is a copy of a value number in LI or Other.LI that will
+// be pruned:
+//
+//   %dst = COPY %src
+//   %src = COPY %dst  <-- This value to be pruned.
+//   %dst = COPY %src  <-- This value is a copy of a pruned value.
+//
+bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
+  Val &V = Vals[ValNo];
+  if (V.Pruned || V.PrunedComputed)
+    return V.Pruned;
+
+  if (V.Resolution != CR_Erase && V.Resolution != CR_Merge)
+    return V.Pruned;
+
+  // Follow copies up the dominator tree and check if any intermediate value
+  // has been pruned.
+  V.PrunedComputed = true;
+  V.Pruned = Other.isPrunedValue(V.OtherVNI->id, *this);
+  return V.Pruned;
+}
+
+void JoinVals::pruneValues(JoinVals &Other,
+                           SmallVectorImpl<SlotIndex> &EndPoints) {
+  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+    SlotIndex Def = LI.getValNumInfo(i)->def;
+    switch (Vals[i].Resolution) {
+    case CR_Keep:
+      break;
+    case CR_Replace: {
+      // This value takes precedence over the value in Other.LI.
+      LIS->pruneValue(&Other.LI, Def, &EndPoints);
+      // Check if we're replacing an IMPLICIT_DEF value. The IMPLICIT_DEF
+      // instructions are only inserted to provide a live-out value for PHI
+      // predecessors, so the instruction should simply go away once its value
+      // has been replaced.
+      Val &OtherV = Other.Vals[Vals[i].OtherVNI->id];
+      bool EraseImpDef = OtherV.ErasableImplicitDef &&
+                         OtherV.Resolution == CR_Keep;
+      if (!Def.isBlock()) {
+        // Remove <def,read-undef> flags. This def is now a partial redef.
+        // Also remove <def,dead> flags since the joined live range will
+        // continue past this instruction.
+        for (MIOperands MO(Indexes->getInstructionFromIndex(Def));
+             MO.isValid(); ++MO)
+          if (MO->isReg() && MO->isDef() && MO->getReg() == LI.reg) {
+            MO->setIsUndef(EraseImpDef);
+            MO->setIsDead(false);
+          }
+        // This value will reach instructions below, but we need to make sure
+        // the live range also reaches the instruction at Def.
+        if (!EraseImpDef)
+          EndPoints.push_back(Def);
+      }
+      DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.LI.reg) << " at " << Def
+                   << ": " << Other.LI << '\n');
+      break;
+    }
+    case CR_Erase:
+    case CR_Merge:
+      if (isPrunedValue(i, Other)) {
+        // This value is ultimately a copy of a pruned value in LI or Other.LI.
+        // We can no longer trust the value mapping computed by
+        // computeAssignment(), the value that was originally copied could have
+        // been replaced.
+        LIS->pruneValue(&LI, Def, &EndPoints);
+        DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(LI.reg) << " at "
+                     << Def << ": " << LI << '\n');
+      }
+      break;
+    case CR_Unresolved:
+    case CR_Impossible:
+      llvm_unreachable("Unresolved conflicts");
+    }
+  }
+}
+
+void JoinVals::eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+                           SmallVectorImpl<unsigned> &ShrinkRegs) {
+  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+    // Get the def location before markUnused() below invalidates it.
+    SlotIndex Def = LI.getValNumInfo(i)->def;
+    switch (Vals[i].Resolution) {
+    case CR_Keep:
+      // If an IMPLICIT_DEF value is pruned, it doesn't serve a purpose any
+      // longer. The IMPLICIT_DEF instructions are only inserted by
+      // PHIElimination to guarantee that all PHI predecessors have a value.
+      if (!Vals[i].ErasableImplicitDef || !Vals[i].Pruned)
+        break;
+      // Remove value number i from LI. Note that this VNInfo is still present
+      // in NewVNInfo, so it will appear as an unused value number in the final
+      // joined interval.
+      LI.getValNumInfo(i)->markUnused();
+      LI.removeValNo(LI.getValNumInfo(i));
+      DEBUG(dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LI << '\n');
+      // FALL THROUGH.
+
+    case CR_Erase: {
+      MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
+      assert(MI && "No instruction to erase");
+      if (MI->isCopy()) {
+        unsigned Reg = MI->getOperand(1).getReg();
+        if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+            Reg != CP.getSrcReg() && Reg != CP.getDstReg())
+          ShrinkRegs.push_back(Reg);
+      }
+      ErasedInstrs.insert(MI);
+      DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
+      LIS->RemoveMachineInstrFromMaps(MI);
+      MI->eraseFromParent();
+      break;
+    }
+    default:
+      break;
+    }
+  }
+}
+
+bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
+  SmallVector<VNInfo*, 16> NewVNInfo;
+  LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+  LiveInterval &LHS = LIS->getInterval(CP.getDstReg());
+  JoinVals RHSVals(RHS, CP.getSrcIdx(), NewVNInfo, CP, LIS, TRI);
+  JoinVals LHSVals(LHS, CP.getDstIdx(), NewVNInfo, CP, LIS, TRI);
+
+  DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
+               << "\n\t\tLHS = " << PrintReg(CP.getDstReg()) << ' ' << LHS
+               << '\n');
+
+  // First compute NewVNInfo and the simple value mappings.
+  // Detect impossible conflicts early.
+  if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals))
+    return false;
+
+  // Some conflicts can only be resolved after all values have been mapped.
+  if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals))
+    return false;
+
+  // All clear, the live ranges can be merged.
+
+  // The merging algorithm in LiveInterval::join() can't handle conflicting
+  // value mappings, so we need to remove any live ranges that overlap a
+  // CR_Replace resolution. Collect a set of end points that can be used to
+  // restore the live range after joining.
+  SmallVector<SlotIndex, 8> EndPoints;
+  LHSVals.pruneValues(RHSVals, EndPoints);
+  RHSVals.pruneValues(LHSVals, EndPoints);
+
+  // Erase COPY and IMPLICIT_DEF instructions. This may cause some external
+  // registers to require trimming.
+  SmallVector<unsigned, 8> ShrinkRegs;
+  LHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
+  RHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
+  while (!ShrinkRegs.empty())
+    LIS->shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));
+
+  // Join RHS into LHS.
+  LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo,
+           MRI);
+
+  // Kill flags are going to be wrong if the live ranges were overlapping.
+  // Eventually, we should simply clear all kill flags when computing live
+  // ranges. They are reinserted after register allocation.
+  MRI->clearKillFlags(LHS.reg);
+  MRI->clearKillFlags(RHS.reg);
+
+  if (EndPoints.empty())
+    return true;
+
+  // Recompute the parts of the live range we had to remove because of
+  // CR_Replace conflicts.
+  DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
+               << " points: " << LHS << '\n');
+  LIS->extendToIndices(&LHS, EndPoints);
+  return true;
+}
+
+/// joinIntervals - Attempt to join these two intervals.  On failure, this
+/// returns false.
+bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
+  return CP.isPhys() ? joinReservedPhysReg(CP) : joinVirtRegs(CP);
+}
+
+namespace {
+// Information concerning MBB coalescing priority.
+struct MBBPriorityInfo {
+  MachineBasicBlock *MBB;
+  unsigned Depth;
+  bool IsSplit;
+
+  MBBPriorityInfo(MachineBasicBlock *mbb, unsigned depth, bool issplit)
+    : MBB(mbb), Depth(depth), IsSplit(issplit) {}
+};
+}
+
+// C-style comparator that sorts first based on the loop depth of the basic
+// block (the unsigned), and then on the MBB number.
+//
+// EnableGlobalCopies assumes that the primary sort key is loop depth.
+static int compareMBBPriority(const void *L, const void *R) {
+  const MBBPriorityInfo *LHS = static_cast<const MBBPriorityInfo*>(L);
+  const MBBPriorityInfo *RHS = static_cast<const MBBPriorityInfo*>(R);
+  // Deeper loops first
+  if (LHS->Depth != RHS->Depth)
+    return LHS->Depth > RHS->Depth ? -1 : 1;
+
+  // Try to unsplit critical edges next.
+  if (LHS->IsSplit != RHS->IsSplit)
+    return LHS->IsSplit ? -1 : 1;
+
+  // Prefer blocks that are more connected in the CFG. This takes care of
+  // the most difficult copies first while intervals are short.
+  unsigned cl = LHS->MBB->pred_size() + LHS->MBB->succ_size();
+  unsigned cr = RHS->MBB->pred_size() + RHS->MBB->succ_size();
+  if (cl != cr)
+    return cl > cr ? -1 : 1;
+
+  // As a last resort, sort by block number.
+  return LHS->MBB->getNumber() < RHS->MBB->getNumber() ? -1 : 1;
+}
+
+/// \returns true if the given copy uses or defines a local live range.
+static bool isLocalCopy(MachineInstr *Copy, const LiveIntervals *LIS) {
+  if (!Copy->isCopy())
+    return false;
+
+  unsigned SrcReg = Copy->getOperand(1).getReg();
+  unsigned DstReg = Copy->getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(SrcReg)
+      || TargetRegisterInfo::isPhysicalRegister(DstReg))
+    return false;
+
+  return LIS->intervalIsInOneMBB(LIS->getInterval(SrcReg))
+    || LIS->intervalIsInOneMBB(LIS->getInterval(DstReg));
+}
+
+// Try joining WorkList copies starting from index From.
+// Null out any successful joins.
+bool RegisterCoalescer::
+copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
+  bool Progress = false;
+  for (unsigned i = 0, e = CurrList.size(); i != e; ++i) {
+    if (!CurrList[i])
+      continue;
+    // Skip instruction pointers that have already been erased, for example by
+    // dead code elimination.
+    if (ErasedInstrs.erase(CurrList[i])) {
+      CurrList[i] = 0;
+      continue;
+    }
+    bool Again = false;
+    bool Success = joinCopy(CurrList[i], Again);
+    Progress |= Success;
+    if (Success || !Again)
+      CurrList[i] = 0;
+  }
+  return Progress;
+}
+
+void
+RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << MBB->getName() << ":\n");
+
+  // Collect all copy-like instructions in MBB. Don't start coalescing anything
+  // yet, it might invalidate the iterator.
+  const unsigned PrevSize = WorkList.size();
+  if (JoinGlobalCopies) {
+    // Coalesce copies bottom-up to coalesce local defs before local uses. They
+    // are not inherently easier to resolve, but slightly preferable until we
+    // have local live range splitting. In particular this is required by
+    // cmp+jmp macro fusion.
+    for (MachineBasicBlock::reverse_iterator
+           MII = MBB->rbegin(), E = MBB->rend(); MII != E; ++MII) {
+      if (!MII->isCopyLike())
+        continue;
+      if (isLocalCopy(&(*MII), LIS))
+        LocalWorkList.push_back(&(*MII));
+      else
+        WorkList.push_back(&(*MII));
+    }
+  }
+  else {
+     for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
+          MII != E; ++MII)
+       if (MII->isCopyLike())
+         WorkList.push_back(MII);
+  }
+  // Try coalescing the collected copies immediately, and remove the nulls.
+  // This prevents the WorkList from getting too large since most copies are
+  // joinable on the first attempt.
+  MutableArrayRef<MachineInstr*>
+    CurrList(WorkList.begin() + PrevSize, WorkList.end());
+  if (copyCoalesceWorkList(CurrList))
+    WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
+                               (MachineInstr*)0), WorkList.end());
+}
+
+void RegisterCoalescer::coalesceLocals() {
+  copyCoalesceWorkList(LocalWorkList);
+  for (unsigned j = 0, je = LocalWorkList.size(); j != je; ++j) {
+    if (LocalWorkList[j])
+      WorkList.push_back(LocalWorkList[j]);
+  }
+  LocalWorkList.clear();
+}
+
+void RegisterCoalescer::joinAllIntervals() {
+  DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n");
+  assert(WorkList.empty() && LocalWorkList.empty() && "Old data still around.");
+
+  std::vector<MBBPriorityInfo> MBBs;
+  MBBs.reserve(MF->size());
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end();I != E;++I){
+    MachineBasicBlock *MBB = I;
+    MBBs.push_back(MBBPriorityInfo(MBB, Loops->getLoopDepth(MBB),
+                                   JoinSplitEdges && isSplitEdge(MBB)));
+  }
+  array_pod_sort(MBBs.begin(), MBBs.end(), compareMBBPriority);
+
+  // Coalesce intervals in MBB priority order.
+  unsigned CurrDepth = UINT_MAX;
+  for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
+    // Try coalescing the collected local copies for deeper loops.
+    if (JoinGlobalCopies && MBBs[i].Depth < CurrDepth) {
+      coalesceLocals();
+      CurrDepth = MBBs[i].Depth;
+    }
+    copyCoalesceInMBB(MBBs[i].MBB);
+  }
+  coalesceLocals();
+
+  // Joining intervals can allow other intervals to be joined.  Iteratively join
+  // until we make no progress.
+  while (copyCoalesceWorkList(WorkList))
+    /* empty */ ;
+}
+
+void RegisterCoalescer::releaseMemory() {
+  ErasedInstrs.clear();
+  WorkList.clear();
+  DeadDefs.clear();
+  InflateRegs.clear();
+}
+
+bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
+  MF = &fn;
+  MRI = &fn.getRegInfo();
+  TM = &fn.getTarget();
+  TRI = TM->getRegisterInfo();
+  TII = TM->getInstrInfo();
+  LIS = &getAnalysis<LiveIntervals>();
+  AA = &getAnalysis<AliasAnalysis>();
+  Loops = &getAnalysis<MachineLoopInfo>();
+
+  const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
+  if (EnableGlobalCopies == cl::BOU_UNSET)
+    JoinGlobalCopies = ST.enableMachineScheduler();
+  else
+    JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
+
+  // The MachineScheduler does not currently require JoinSplitEdges. This will
+  // either be enabled unconditionally or replaced by a more general live range
+  // splitting optimization.
+  JoinSplitEdges = EnableJoinSplits;
+
+  DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
+               << "********** Function: " << MF->getName() << '\n');
+
+  if (VerifyCoalescing)
+    MF->verify(this, "Before register coalescing");
+
+  RegClassInfo.runOnMachineFunction(fn);
+
+  // Join (coalesce) intervals if requested.
+  if (EnableJoining)
+    joinAllIntervals();
+
+  // After deleting a lot of copies, register classes may be less constrained.
+  // Removing sub-register operands may allow GR32_ABCD -> GR32 and DPR_VFP2 ->
+  // DPR inflation.
+  array_pod_sort(InflateRegs.begin(), InflateRegs.end());
+  InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()),
+                    InflateRegs.end());
+  DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size() << " regs.\n");
+  for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i) {
+    unsigned Reg = InflateRegs[i];
+    if (MRI->reg_nodbg_empty(Reg))
+      continue;
+    if (MRI->recomputeRegClass(Reg, *TM)) {
+      DEBUG(dbgs() << PrintReg(Reg) << " inflated to "
+                   << MRI->getRegClass(Reg)->getName() << '\n');
+      ++NumInflated;
+    }
+  }
+
+  DEBUG(dump());
+  if (VerifyCoalescing)
+    MF->verify(this, "After register coalescing");
+  return true;
+}
+
+/// print - Implement the dump method.
+void RegisterCoalescer::print(raw_ostream &O, const Module* m) const {
+   LIS->print(O, m);
+}
diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.h b/contrib/llvm/lib/CodeGen/RegisterCoalescer.h
new file mode 100644
index 000000000000..47c3df14606d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.h
@@ -0,0 +1,120 @@
+//===-- RegisterCoalescer.h - Register Coalescing Interface -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the abstract interface for register coalescers,
+// allowing them to interact with and query register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGISTER_COALESCER_H
+#define LLVM_CODEGEN_REGISTER_COALESCER_H
+
+namespace llvm {
+
+  class MachineInstr;
+  class TargetRegisterInfo;
+  class TargetRegisterClass;
+  class TargetInstrInfo;
+
+  /// CoalescerPair - A helper class for register coalescers. When deciding if
+  /// two registers can be coalesced, CoalescerPair can determine if a copy
+  /// instruction would become an identity copy after coalescing.
+  class CoalescerPair {
+    const TargetRegisterInfo &TRI;
+
+    /// DstReg - The register that will be left after coalescing. It can be a
+    /// virtual or physical register.
+    unsigned DstReg;
+
+    /// SrcReg - the virtual register that will be coalesced into dstReg.
+    unsigned SrcReg;
+
+    /// DstIdx - The sub-register index of the old DstReg in the new coalesced
+    /// register.
+    unsigned DstIdx;
+
+    /// SrcIdx - The sub-register index of the old SrcReg in the new coalesced
+    /// register.
+    unsigned SrcIdx;
+
+    /// Partial - True when the original copy was a partial subregister copy.
+    bool Partial;
+
+    /// CrossClass - True when both regs are virtual, and newRC is constrained.
+    bool CrossClass;
+
+    /// Flipped - True when DstReg and SrcReg are reversed from the original
+    /// copy instruction.
+    bool Flipped;
+
+    /// NewRC - The register class of the coalesced register, or NULL if DstReg
+    /// is a physreg. This register class may be a super-register of both
+    /// SrcReg and DstReg.
+    const TargetRegisterClass *NewRC;
+
+  public:
+    CoalescerPair(const TargetRegisterInfo &tri)
+      : TRI(tri), DstReg(0), SrcReg(0), DstIdx(0), SrcIdx(0),
+        Partial(false), CrossClass(false), Flipped(false), NewRC(0) {}
+
+    /// Create a CoalescerPair representing a virtreg-to-physreg copy.
+    /// No need to call setRegisters().
+    CoalescerPair(unsigned VirtReg, unsigned PhysReg,
+                  const TargetRegisterInfo &tri)
+      : TRI(tri), DstReg(PhysReg), SrcReg(VirtReg), DstIdx(0), SrcIdx(0),
+        Partial(false), CrossClass(false), Flipped(false), NewRC(0) {}
+
+    /// setRegisters - set registers to match the copy instruction MI. Return
+    /// false if MI is not a coalescable copy instruction.
+    bool setRegisters(const MachineInstr*);
+
+    /// flip - Swap SrcReg and DstReg. Return false if swapping is impossible
+    /// because DstReg is a physical register, or SubIdx is set.
+    bool flip();
+
+    /// isCoalescable - Return true if MI is a copy instruction that will become
+    /// an identity copy after coalescing.
+    bool isCoalescable(const MachineInstr*) const;
+
+    /// isPhys - Return true if DstReg is a physical register.
+    bool isPhys() const { return !NewRC; }
+
+    /// isPartial - Return true if the original copy instruction did not copy
+    /// the full register, but was a subreg operation.
+    bool isPartial() const { return Partial; }
+
+    /// isCrossClass - Return true if DstReg is virtual and NewRC is a smaller
+    /// register class than DstReg's.
+    bool isCrossClass() const { return CrossClass; }
+
+    /// isFlipped - Return true when getSrcReg is the register being defined by
+    /// the original copy instruction.
+    bool isFlipped() const { return Flipped; }
+
+    /// getDstReg - Return the register (virtual or physical) that will remain
+    /// after coalescing.
+    unsigned getDstReg() const { return DstReg; }
+
+    /// getSrcReg - Return the virtual register that will be coalesced away.
+    unsigned getSrcReg() const { return SrcReg; }
+
+    /// getDstIdx - Return the subregister index that DstReg will be coalesced
+    /// into, or 0.
+    unsigned getDstIdx() const { return DstIdx; }
+
+    /// getSrcIdx - Return the subregister index that SrcReg will be coalesced
+    /// into, or 0.
+    unsigned getSrcIdx() const { return SrcIdx; }
+
+    /// getNewRC - Return the register class of the coalesced register.
+    const TargetRegisterClass *getNewRC() const { return NewRC; }
+  };
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
new file mode 100644
index 000000000000..97f22e1049f6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -0,0 +1,793 @@
+//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterPressure class which can be used to track
+// MachineInstr level register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+/// Increase pressure for each pressure set provided by TargetRegisterInfo.
+static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
+                                std::vector<unsigned> &MaxSetPressure,
+                                const int *PSet, unsigned Weight) {
+  for (; *PSet != -1; ++PSet) {
+    CurrSetPressure[*PSet] += Weight;
+    if (&CurrSetPressure != &MaxSetPressure
+        && CurrSetPressure[*PSet] > MaxSetPressure[*PSet]) {
+      MaxSetPressure[*PSet] = CurrSetPressure[*PSet];
+    }
+  }
+}
+
+/// Decrease pressure for each pressure set provided by TargetRegisterInfo.
+static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
+                                const int *PSet, unsigned Weight) {
+  for (; *PSet != -1; ++PSet) {
+    assert(CurrSetPressure[*PSet] >= Weight && "register pressure underflow");
+    CurrSetPressure[*PSet] -= Weight;
+  }
+}
+
+/// Directly increase pressure only within this RegisterPressure result.
+void RegisterPressure::increase(unsigned Reg, const TargetRegisterInfo *TRI,
+                                const MachineRegisterInfo *MRI) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+    increaseSetPressure(MaxSetPressure, MaxSetPressure,
+                        TRI->getRegClassPressureSets(RC),
+                        TRI->getRegClassWeight(RC).RegWeight);
+  }
+  else {
+    increaseSetPressure(MaxSetPressure, MaxSetPressure,
+                        TRI->getRegUnitPressureSets(Reg),
+                        TRI->getRegUnitWeight(Reg));
+  }
+}
+
+/// Directly decrease pressure only within this RegisterPressure result.
+void RegisterPressure::decrease(unsigned Reg, const TargetRegisterInfo *TRI,
+                                const MachineRegisterInfo *MRI) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+    decreaseSetPressure(MaxSetPressure, TRI->getRegClassPressureSets(RC),
+                        TRI->getRegClassWeight(RC).RegWeight);
+  }
+  else {
+    decreaseSetPressure(MaxSetPressure, TRI->getRegUnitPressureSets(Reg),
+                        TRI->getRegUnitWeight(Reg));
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+static void dumpSetPressure(const std::vector<unsigned> &SetPressure,
+                            const TargetRegisterInfo *TRI) {
+  for (unsigned i = 0, e = SetPressure.size(); i < e; ++i) {
+    if (SetPressure[i] != 0)
+      dbgs() << TRI->getRegPressureSetName(i) << "=" << SetPressure[i] << '\n';
+  }
+}
+
+void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
+  dbgs() << "Max Pressure: ";
+  dumpSetPressure(MaxSetPressure, TRI);
+  dbgs() << "Live In: ";
+  for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
+    dbgs() << PrintReg(LiveInRegs[i], TRI) << " ";
+  dbgs() << '\n';
+  dbgs() << "Live Out: ";
+  for (unsigned i = 0, e = LiveOutRegs.size(); i < e; ++i)
+    dbgs() << PrintReg(LiveOutRegs[i], TRI) << " ";
+  dbgs() << '\n';
+}
+
+void RegPressureTracker::dump() const {
+  dbgs() << "Curr Pressure: ";
+  dumpSetPressure(CurrSetPressure, TRI);
+  P.dump(TRI);
+}
+#endif
+
+/// Increase the current pressure as impacted by these registers and bump
+/// the high water mark if needed.
+void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I) {
+    if (TargetRegisterInfo::isVirtualRegister(Regs[I])) {
+      const TargetRegisterClass *RC = MRI->getRegClass(Regs[I]);
+      increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
+                          TRI->getRegClassPressureSets(RC),
+                          TRI->getRegClassWeight(RC).RegWeight);
+    }
+    else {
+      increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
+                          TRI->getRegUnitPressureSets(Regs[I]),
+                          TRI->getRegUnitWeight(Regs[I]));
+    }
+  }
+}
+
+/// Simply decrease the current pressure as impacted by these registers.
+void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> Regs) {
+  for (unsigned I = 0, E = Regs.size(); I != E; ++I) {
+    if (TargetRegisterInfo::isVirtualRegister(Regs[I])) {
+      const TargetRegisterClass *RC = MRI->getRegClass(Regs[I]);
+      decreaseSetPressure(CurrSetPressure,
+                          TRI->getRegClassPressureSets(RC),
+                          TRI->getRegClassWeight(RC).RegWeight);
+    }
+    else {
+      decreaseSetPressure(CurrSetPressure, TRI->getRegUnitPressureSets(Regs[I]),
+                          TRI->getRegUnitWeight(Regs[I]));
+    }
+  }
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void IntervalPressure::reset() {
+  TopIdx = BottomIdx = SlotIndex();
+  MaxSetPressure.clear();
+  LiveInRegs.clear();
+  LiveOutRegs.clear();
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void RegionPressure::reset() {
+  TopPos = BottomPos = MachineBasicBlock::const_iterator();
+  MaxSetPressure.clear();
+  LiveInRegs.clear();
+  LiveOutRegs.clear();
+}
+
+/// If the current top is not less than or equal to the next index, open it.
+/// We happen to need the SlotIndex for the next top for pressure update.
+void IntervalPressure::openTop(SlotIndex NextTop) {
+  if (TopIdx <= NextTop)
+    return;
+  TopIdx = SlotIndex();
+  LiveInRegs.clear();
+}
+
+/// If the current top is the previous instruction (before receding), open it.
+void RegionPressure::openTop(MachineBasicBlock::const_iterator PrevTop) {
+  if (TopPos != PrevTop)
+    return;
+  TopPos = MachineBasicBlock::const_iterator();
+  LiveInRegs.clear();
+}
+
+/// If the current bottom is not greater than the previous index, open it.
+void IntervalPressure::openBottom(SlotIndex PrevBottom) {
+  if (BottomIdx > PrevBottom)
+    return;
+  BottomIdx = SlotIndex();
+  LiveInRegs.clear();
+}
+
+/// If the current bottom is the previous instr (before advancing), open it.
+void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) {
+  if (BottomPos != PrevBottom)
+    return;
+  BottomPos = MachineBasicBlock::const_iterator();
+  LiveInRegs.clear();
+}
+
+const LiveInterval *RegPressureTracker::getInterval(unsigned Reg) const {
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return &LIS->getInterval(Reg);
+  return LIS->getCachedRegUnit(Reg);
+}
+
+/// Setup the RegPressureTracker.
+///
+/// TODO: Add support for pressure without LiveIntervals.
+void RegPressureTracker::init(const MachineFunction *mf,
+                              const RegisterClassInfo *rci,
+                              const LiveIntervals *lis,
+                              const MachineBasicBlock *mbb,
+                              MachineBasicBlock::const_iterator pos)
+{
+  MF = mf;
+  TRI = MF->getTarget().getRegisterInfo();
+  RCI = rci;
+  MRI = &MF->getRegInfo();
+  MBB = mbb;
+
+  if (RequireIntervals) {
+    assert(lis && "IntervalPressure requires LiveIntervals");
+    LIS = lis;
+  }
+
+  CurrPos = pos;
+  CurrSetPressure.assign(TRI->getNumRegPressureSets(), 0);
+
+  if (RequireIntervals)
+    static_cast<IntervalPressure&>(P).reset();
+  else
+    static_cast<RegionPressure&>(P).reset();
+  P.MaxSetPressure = CurrSetPressure;
+
+  LiveRegs.PhysRegs.clear();
+  LiveRegs.PhysRegs.setUniverse(TRI->getNumRegs());
+  LiveRegs.VirtRegs.clear();
+  LiveRegs.VirtRegs.setUniverse(MRI->getNumVirtRegs());
+}
+
+/// Does this pressure result have a valid top position and live ins.
+bool RegPressureTracker::isTopClosed() const {
+  if (RequireIntervals)
+    return static_cast<IntervalPressure&>(P).TopIdx.isValid();
+  return (static_cast<RegionPressure&>(P).TopPos ==
+          MachineBasicBlock::const_iterator());
+}
+
+/// Does this pressure result have a valid bottom position and live outs.
+bool RegPressureTracker::isBottomClosed() const {
+  if (RequireIntervals)
+    return static_cast<IntervalPressure&>(P).BottomIdx.isValid();
+  return (static_cast<RegionPressure&>(P).BottomPos ==
+          MachineBasicBlock::const_iterator());
+}
+
+
+SlotIndex RegPressureTracker::getCurrSlot() const {
+  MachineBasicBlock::const_iterator IdxPos = CurrPos;
+  while (IdxPos != MBB->end() && IdxPos->isDebugValue())
+    ++IdxPos;
+  if (IdxPos == MBB->end())
+    return LIS->getMBBEndIdx(MBB);
+  return LIS->getInstructionIndex(IdxPos).getRegSlot();
+}
+
+/// Set the boundary for the top of the region and summarize live ins.
+void RegPressureTracker::closeTop() {
+  if (RequireIntervals)
+    static_cast<IntervalPressure&>(P).TopIdx = getCurrSlot();
+  else
+    static_cast<RegionPressure&>(P).TopPos = CurrPos;
+
+  assert(P.LiveInRegs.empty() && "inconsistent max pressure result");
+  P.LiveInRegs.reserve(LiveRegs.PhysRegs.size() + LiveRegs.VirtRegs.size());
+  P.LiveInRegs.append(LiveRegs.PhysRegs.begin(), LiveRegs.PhysRegs.end());
+  for (SparseSet<unsigned>::const_iterator I =
+         LiveRegs.VirtRegs.begin(), E = LiveRegs.VirtRegs.end(); I != E; ++I)
+    P.LiveInRegs.push_back(*I);
+  std::sort(P.LiveInRegs.begin(), P.LiveInRegs.end());
+  P.LiveInRegs.erase(std::unique(P.LiveInRegs.begin(), P.LiveInRegs.end()),
+                     P.LiveInRegs.end());
+}
+
+/// Set the boundary for the bottom of the region and summarize live outs.
+void RegPressureTracker::closeBottom() {
+  if (RequireIntervals)
+    static_cast<IntervalPressure&>(P).BottomIdx = getCurrSlot();
+  else
+    static_cast<RegionPressure&>(P).BottomPos = CurrPos;
+
+  assert(P.LiveOutRegs.empty() && "inconsistent max pressure result");
+  P.LiveOutRegs.reserve(LiveRegs.PhysRegs.size() + LiveRegs.VirtRegs.size());
+  P.LiveOutRegs.append(LiveRegs.PhysRegs.begin(), LiveRegs.PhysRegs.end());
+  for (SparseSet<unsigned>::const_iterator I =
+         LiveRegs.VirtRegs.begin(), E = LiveRegs.VirtRegs.end(); I != E; ++I)
+    P.LiveOutRegs.push_back(*I);
+  std::sort(P.LiveOutRegs.begin(), P.LiveOutRegs.end());
+  P.LiveOutRegs.erase(std::unique(P.LiveOutRegs.begin(), P.LiveOutRegs.end()),
+                      P.LiveOutRegs.end());
+}
+
+/// Finalize the region boundaries and record live ins and live outs.
+void RegPressureTracker::closeRegion() {
+  if (!isTopClosed() && !isBottomClosed()) {
+    assert(LiveRegs.PhysRegs.empty() && LiveRegs.VirtRegs.empty() &&
+           "no region boundary");
+    return;
+  }
+  if (!isBottomClosed())
+    closeBottom();
+  else if (!isTopClosed())
+    closeTop();
+  // If both top and bottom are closed, do nothing.
+}
+
+/// \brief Convenient wrapper for checking membership in RegisterOperands.
+static bool containsReg(ArrayRef<unsigned> Regs, unsigned Reg) {
+  return std::find(Regs.begin(), Regs.end(), Reg) != Regs.end();
+}
+
+/// Collect this instruction's unique uses and defs into SmallVectors for
+/// processing defs and uses in order.
+class RegisterOperands {
+  const TargetRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+
+public:
+  SmallVector<unsigned, 8> Uses;
+  SmallVector<unsigned, 8> Defs;
+  SmallVector<unsigned, 8> DeadDefs;
+
+  RegisterOperands(const TargetRegisterInfo *tri,
+                   const MachineRegisterInfo *mri): TRI(tri), MRI(mri) {}
+
+  /// Push this operand's register onto the correct vector.
+  void collect(const MachineOperand &MO) {
+    if (!MO.isReg() || !MO.getReg())
+      return;
+    if (MO.readsReg())
+      pushRegUnits(MO.getReg(), Uses);
+    if (MO.isDef()) {
+      if (MO.isDead())
+        pushRegUnits(MO.getReg(), DeadDefs);
+      else
+        pushRegUnits(MO.getReg(), Defs);
+    }
+  }
+
+protected:
+  void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &Regs) {
+    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+      if (containsReg(Regs, Reg))
+        return;
+      Regs.push_back(Reg);
+    }
+    else if (MRI->isAllocatable(Reg)) {
+      for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+        if (containsReg(Regs, *Units))
+          continue;
+        Regs.push_back(*Units);
+      }
+    }
+  }
+};
+
+/// Collect physical and virtual register operands.
+static void collectOperands(const MachineInstr *MI,
+                            RegisterOperands &RegOpers) {
+  for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI)
+    RegOpers.collect(*OperI);
+
+  // Remove redundant physreg dead defs.
+  SmallVectorImpl<unsigned>::iterator I =
+    std::remove_if(RegOpers.DeadDefs.begin(), RegOpers.DeadDefs.end(),
+                   std::bind1st(std::ptr_fun(containsReg), RegOpers.Defs));
+  RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end());
+}
+
+/// Force liveness of registers.
+void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) {
+  for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+    if (LiveRegs.insert(Regs[i]))
+      increaseRegPressure(Regs[i]);
+  }
+}
+
+/// Add Reg to the live in set and increase max pressure.
+void RegPressureTracker::discoverLiveIn(unsigned Reg) {
+  assert(!LiveRegs.contains(Reg) && "avoid bumping max pressure twice");
+  if (containsReg(P.LiveInRegs, Reg))
+    return;
+
+  // At live in discovery, unconditionally increase the high water mark.
+  P.LiveInRegs.push_back(Reg);
+  P.increase(Reg, TRI, MRI);
+}
+
+/// Add Reg to the live out set and increase max pressure.
+void RegPressureTracker::discoverLiveOut(unsigned Reg) {
+  assert(!LiveRegs.contains(Reg) && "avoid bumping max pressure twice");
+  if (containsReg(P.LiveOutRegs, Reg))
+    return;
+
+  // At live out discovery, unconditionally increase the high water mark.
+  P.LiveOutRegs.push_back(Reg);
+  P.increase(Reg, TRI, MRI);
+}
+
+/// Recede across the previous instruction.
+bool RegPressureTracker::recede() {
+  // Check for the top of the analyzable region.
+  if (CurrPos == MBB->begin()) {
+    closeRegion();
+    return false;
+  }
+  if (!isBottomClosed())
+    closeBottom();
+
+  // Open the top of the region using block iterators.
+  if (!RequireIntervals && isTopClosed())
+    static_cast<RegionPressure&>(P).openTop(CurrPos);
+
+  // Find the previous instruction.
+  do
+    --CurrPos;
+  while (CurrPos != MBB->begin() && CurrPos->isDebugValue());
+
+  if (CurrPos->isDebugValue()) {
+    closeRegion();
+    return false;
+  }
+  SlotIndex SlotIdx;
+  if (RequireIntervals)
+    SlotIdx = LIS->getInstructionIndex(CurrPos).getRegSlot();
+
+  // Open the top of the region using slot indexes.
+  if (RequireIntervals && isTopClosed())
+    static_cast<IntervalPressure&>(P).openTop(SlotIdx);
+
+  RegisterOperands RegOpers(TRI, MRI);
+  collectOperands(CurrPos, RegOpers);
+
+  // Boost pressure for all dead defs together.
+  increaseRegPressure(RegOpers.DeadDefs);
+  decreaseRegPressure(RegOpers.DeadDefs);
+
+  // Kill liveness at live defs.
+  // TODO: consider earlyclobbers?
+  for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
+    unsigned Reg = RegOpers.Defs[i];
+    if (LiveRegs.erase(Reg))
+      decreaseRegPressure(Reg);
+    else
+      discoverLiveOut(Reg);
+  }
+
+  // Generate liveness for uses.
+  for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
+    unsigned Reg = RegOpers.Uses[i];
+    if (!LiveRegs.contains(Reg)) {
+      // Adjust liveouts if LiveIntervals are available.
+      if (RequireIntervals) {
+        const LiveInterval *LI = getInterval(Reg);
+        if (LI && !LI->killedAt(SlotIdx))
+          discoverLiveOut(Reg);
+      }
+      increaseRegPressure(Reg);
+      LiveRegs.insert(Reg);
+    }
+  }
+  return true;
+}
+
+/// Advance across the current instruction.
+bool RegPressureTracker::advance() {
+  // Check for the bottom of the analyzable region.
+  if (CurrPos == MBB->end()) {
+    closeRegion();
+    return false;
+  }
+  if (!isTopClosed())
+    closeTop();
+
+  SlotIndex SlotIdx;
+  if (RequireIntervals)
+    SlotIdx = getCurrSlot();
+
+  // Open the bottom of the region using slot indexes.
+  if (isBottomClosed()) {
+    if (RequireIntervals)
+      static_cast<IntervalPressure&>(P).openBottom(SlotIdx);
+    else
+      static_cast<RegionPressure&>(P).openBottom(CurrPos);
+  }
+
+  RegisterOperands RegOpers(TRI, MRI);
+  collectOperands(CurrPos, RegOpers);
+
+  for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
+    unsigned Reg = RegOpers.Uses[i];
+    // Discover live-ins.
+    bool isLive = LiveRegs.contains(Reg);
+    if (!isLive)
+      discoverLiveIn(Reg);
+    // Kill liveness at last uses.
+    bool lastUse = false;
+    if (RequireIntervals) {
+      const LiveInterval *LI = getInterval(Reg);
+      lastUse = LI && LI->killedAt(SlotIdx);
+    }
+    else {
+      // Allocatable physregs are always single-use before register rewriting.
+      lastUse = !TargetRegisterInfo::isVirtualRegister(Reg);
+    }
+    if (lastUse && isLive) {
+      LiveRegs.erase(Reg);
+      decreaseRegPressure(Reg);
+    }
+    else if (!lastUse && !isLive)
+      increaseRegPressure(Reg);
+  }
+
+  // Generate liveness for defs.
+  for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
+    unsigned Reg = RegOpers.Defs[i];
+    if (LiveRegs.insert(Reg))
+      increaseRegPressure(Reg);
+  }
+
+  // Boost pressure for all dead defs together.
+  increaseRegPressure(RegOpers.DeadDefs);
+  decreaseRegPressure(RegOpers.DeadDefs);
+
+  // Find the next instruction.
+  do
+    ++CurrPos;
+  while (CurrPos != MBB->end() && CurrPos->isDebugValue());
+  return true;
+}
+
+/// Find the max change in excess pressure across all sets.
+static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
+                                       ArrayRef<unsigned> NewPressureVec,
+                                       RegPressureDelta &Delta,
+                                       const TargetRegisterInfo *TRI) {
+  int ExcessUnits = 0;
+  unsigned PSetID = ~0U;
+  for (unsigned i = 0, e = OldPressureVec.size(); i < e; ++i) {
+    unsigned POld = OldPressureVec[i];
+    unsigned PNew = NewPressureVec[i];
+    int PDiff = (int)PNew - (int)POld;
+    if (!PDiff) // No change in this set in the common case.
+      continue;
+    // Only consider change beyond the limit.
+    unsigned Limit = TRI->getRegPressureSetLimit(i);
+    if (Limit > POld) {
+      if (Limit > PNew)
+        PDiff = 0;            // Under the limit
+      else
+        PDiff = PNew - Limit; // Just exceeded limit.
+    }
+    else if (Limit > PNew)
+      PDiff = Limit - POld;   // Just obeyed limit.
+
+    if (std::abs(PDiff) > std::abs(ExcessUnits)) {
+      ExcessUnits = PDiff;
+      PSetID = i;
+    }
+  }
+  Delta.Excess.PSetID = PSetID;
+  Delta.Excess.UnitIncrease = ExcessUnits;
+}
+
+/// Find the max change in max pressure that either surpasses a critical PSet
+/// limit or exceeds the current MaxPressureLimit.
+///
+/// FIXME: comparing each element of the old and new MaxPressure vectors here is
+/// silly. It's done now to demonstrate the concept but will go away with a
+/// RegPressureTracker API change to work with pressure differences.
+static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
+                                    ArrayRef<unsigned> NewMaxPressureVec,
+                                    ArrayRef<PressureElement> CriticalPSets,
+                                    ArrayRef<unsigned> MaxPressureLimit,
+                                    RegPressureDelta &Delta) {
+  Delta.CriticalMax = PressureElement();
+  Delta.CurrentMax = PressureElement();
+
+  unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
+  for (unsigned i = 0, e = OldMaxPressureVec.size(); i < e; ++i) {
+    unsigned POld = OldMaxPressureVec[i];
+    unsigned PNew = NewMaxPressureVec[i];
+    if (PNew == POld) // No change in this set in the common case.
+      continue;
+
+    while (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID < i)
+      ++CritIdx;
+
+    if (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID == i) {
+      int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].UnitIncrease;
+      if (PDiff > Delta.CriticalMax.UnitIncrease) {
+        Delta.CriticalMax.PSetID = i;
+        Delta.CriticalMax.UnitIncrease = PDiff;
+      }
+    }
+
+    // Find the greatest increase above MaxPressureLimit.
+    // (Ignores negative MDiff).
+    int MDiff = (int)PNew - (int)MaxPressureLimit[i];
+    if (MDiff > Delta.CurrentMax.UnitIncrease) {
+      Delta.CurrentMax.PSetID = i;
+      Delta.CurrentMax.UnitIncrease = PNew;
+    }
+  }
+}
+
+/// Record the upward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
+  assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
+
+  // Account for register pressure similar to RegPressureTracker::recede().
+  RegisterOperands RegOpers(TRI, MRI);
+  collectOperands(MI, RegOpers);
+
+  // Boost max pressure for all dead defs together.
+  // Since CurrSetPressure and MaxSetPressure
+  increaseRegPressure(RegOpers.DeadDefs);
+  decreaseRegPressure(RegOpers.DeadDefs);
+
+  // Kill liveness at live defs.
+  for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
+    unsigned Reg = RegOpers.Defs[i];
+    if (!containsReg(RegOpers.Uses, Reg))
+      decreaseRegPressure(Reg);
+  }
+  // Generate liveness for uses.
+  for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
+    unsigned Reg = RegOpers.Uses[i];
+    if (!LiveRegs.contains(Reg))
+      increaseRegPressure(Reg);
+  }
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// bottom-up. Find the pressure set with the most change beyond its pressure
+/// limit based on the tracker's current pressure, and return the change in
+/// number of register units of that pressure set introduced by this
+/// instruction.
+///
+/// This assumes that the current LiveOut set is sufficient.
+///
+/// FIXME: This is expensive for an on-the-fly query. We need to cache the
+/// result per-SUnit with enough information to adjust for the current
+/// scheduling position. But this works as a proof of concept.
+void RegPressureTracker::
+getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
+                          ArrayRef<PressureElement> CriticalPSets,
+                          ArrayRef<unsigned> MaxPressureLimit) {
+  // Snapshot Pressure.
+  // FIXME: The snapshot heap space should persist. But I'm planning to
+  // summarize the pressure effect so we don't need to snapshot at all.
+  std::vector<unsigned> SavedPressure = CurrSetPressure;
+  std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+  bumpUpwardPressure(MI);
+
+  computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI);
+  computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+                          MaxPressureLimit, Delta);
+  assert(Delta.CriticalMax.UnitIncrease >= 0 &&
+         Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+
+  // Restore the tracker's state.
+  P.MaxSetPressure.swap(SavedMaxPressure);
+  CurrSetPressure.swap(SavedPressure);
+}
+
+/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
+static bool findUseBetween(unsigned Reg,
+                           SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+                           const MachineRegisterInfo *MRI,
+                           const LiveIntervals *LIS) {
+  for (MachineRegisterInfo::use_nodbg_iterator
+         UI = MRI->use_nodbg_begin(Reg), UE = MRI->use_nodbg_end();
+         UI != UE; UI.skipInstruction()) {
+      const MachineInstr* MI = &*UI;
+      if (MI->isDebugValue())
+        continue;
+      SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot();
+      if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx)
+        return true;
+  }
+  return false;
+}
+
+/// Record the downward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
+  assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
+
+  // Account for register pressure similar to RegPressureTracker::recede().
+  RegisterOperands RegOpers(TRI, MRI);
+  collectOperands(MI, RegOpers);
+
+  // Kill liveness at last uses. Assume allocatable physregs are single-use
+  // rather than checking LiveIntervals.
+  SlotIndex SlotIdx;
+  if (RequireIntervals)
+    SlotIdx = LIS->getInstructionIndex(MI).getRegSlot();
+
+  for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
+    unsigned Reg = RegOpers.Uses[i];
+    if (RequireIntervals) {
+      // FIXME: allow the caller to pass in the list of vreg uses that remain
+      // to be bottom-scheduled to avoid searching uses at each query.
+      SlotIndex CurrIdx = getCurrSlot();
+      const LiveInterval *LI = getInterval(Reg);
+      if (LI && LI->killedAt(SlotIdx)
+          && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) {
+        decreaseRegPressure(Reg);
+      }
+    }
+    else if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+      // Allocatable physregs are always single-use before register rewriting.
+      decreaseRegPressure(Reg);
+    }
+  }
+
+  // Generate liveness for defs.
+  increaseRegPressure(RegOpers.Defs);
+
+  // Boost pressure for all dead defs together.
+  increaseRegPressure(RegOpers.DeadDefs);
+  decreaseRegPressure(RegOpers.DeadDefs);
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// top-down. Find the register class with the most change in its pressure limit
+/// based on the tracker's current pressure, and return the number of excess
+/// register units of that pressure set introduced by this instruction.
+///
+/// This assumes that the current LiveIn set is sufficient.
+void RegPressureTracker::
+getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
+                            ArrayRef<PressureElement> CriticalPSets,
+                            ArrayRef<unsigned> MaxPressureLimit) {
+  // Snapshot Pressure.
+  std::vector<unsigned> SavedPressure = CurrSetPressure;
+  std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+  bumpDownwardPressure(MI);
+
+  computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI);
+  computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+                          MaxPressureLimit, Delta);
+  assert(Delta.CriticalMax.UnitIncrease >= 0 &&
+         Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+
+  // Restore the tracker's state.
+  P.MaxSetPressure.swap(SavedMaxPressure);
+  CurrSetPressure.swap(SavedPressure);
+}
+
+/// Get the pressure of each PSet after traversing this instruction bottom-up.
+void RegPressureTracker::
+getUpwardPressure(const MachineInstr *MI,
+                  std::vector<unsigned> &PressureResult,
+                  std::vector<unsigned> &MaxPressureResult) {
+  // Snapshot pressure.
+  PressureResult = CurrSetPressure;
+  MaxPressureResult = P.MaxSetPressure;
+
+  bumpUpwardPressure(MI);
+
+  // Current pressure becomes the result. Restore current pressure.
+  P.MaxSetPressure.swap(MaxPressureResult);
+  CurrSetPressure.swap(PressureResult);
+}
+
+/// Get the pressure of each PSet after traversing this instruction top-down.
+void RegPressureTracker::
+getDownwardPressure(const MachineInstr *MI,
+                    std::vector<unsigned> &PressureResult,
+                    std::vector<unsigned> &MaxPressureResult) {
+  // Snapshot pressure.
+  PressureResult = CurrSetPressure;
+  MaxPressureResult = P.MaxSetPressure;
+
+  bumpDownwardPressure(MI);
+
+  // Current pressure becomes the result. Restore current pressure.
+  P.MaxSetPressure.swap(MaxPressureResult);
+  CurrSetPressure.swap(PressureResult);
+}
diff --git a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
new file mode 100644
index 000000000000..07ace7a436c7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -0,0 +1,443 @@
+//===-- RegisterScavenging.cpp - Machine register scavenging --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine register scavenger. It can provide
+// information, such as unused registers, at any point in a machine basic block.
+// It also provides a mechanism to make registers available by evicting them to
+// spill slots.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reg-scavenging"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+/// setUsed - Set the register and its sub-registers as being used.
+void RegScavenger::setUsed(unsigned Reg) {
+  RegsAvailable.reset(Reg);
+
+  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+    RegsAvailable.reset(*SubRegs);
+}
+
+bool RegScavenger::isAliasUsed(unsigned Reg) const {
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+    if (isUsed(*AI, *AI == Reg))
+      return true;
+  return false;
+}
+
+void RegScavenger::initRegState() {
+  for (SmallVector<ScavengedInfo, 2>::iterator I = Scavenged.begin(),
+       IE = Scavenged.end(); I != IE; ++I) {
+    I->Reg = 0;
+    I->Restore = NULL;
+  }
+
+  // All registers started out unused.
+  RegsAvailable.set();
+
+  if (!MBB)
+    return;
+
+  // Live-in registers are in use.
+  for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
+         E = MBB->livein_end(); I != E; ++I)
+    setUsed(*I);
+
+  // Pristine CSRs are also unavailable.
+  BitVector PR = MBB->getParent()->getFrameInfo()->getPristineRegs(MBB);
+  for (int I = PR.find_first(); I>0; I = PR.find_next(I))
+    setUsed(I);
+}
+
+void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) {
+  MachineFunction &MF = *mbb->getParent();
+  const TargetMachine &TM = MF.getTarget();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  assert((NumPhysRegs == 0 || NumPhysRegs == TRI->getNumRegs()) &&
+         "Target changed?");
+
+  // It is not possible to use the register scavenger after late optimization
+  // passes that don't preserve accurate liveness information.
+  assert(MRI->tracksLiveness() &&
+         "Cannot use register scavenger with inaccurate liveness");
+
+  // Self-initialize.
+  if (!MBB) {
+    NumPhysRegs = TRI->getNumRegs();
+    RegsAvailable.resize(NumPhysRegs);
+    KillRegs.resize(NumPhysRegs);
+    DefRegs.resize(NumPhysRegs);
+
+    // Create callee-saved registers bitvector.
+    CalleeSavedRegs.resize(NumPhysRegs);
+    const uint16_t *CSRegs = TRI->getCalleeSavedRegs(&MF);
+    if (CSRegs != NULL)
+      for (unsigned i = 0; CSRegs[i]; ++i)
+        CalleeSavedRegs.set(CSRegs[i]);
+  }
+
+  MBB = mbb;
+  initRegState();
+
+  Tracking = false;
+}
+
+void RegScavenger::addRegWithSubRegs(BitVector &BV, unsigned Reg) {
+  BV.set(Reg);
+  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+    BV.set(*SubRegs);
+}
+
+void RegScavenger::determineKillsAndDefs() {
+  assert(Tracking && "Must be tracking to determine kills and defs");
+
+  MachineInstr *MI = MBBI;
+  assert(!MI->isDebugValue() && "Debug values have no kills or defs");
+
+  // Find out which registers are early clobbered, killed, defined, and marked
+  // def-dead in this instruction.
+  // FIXME: The scavenger is not predication aware. If the instruction is
+  // predicated, conservatively assume "kill" markers do not actually kill the
+  // register. Similarly ignores "dead" markers.
+  bool isPred = TII->isPredicated(MI);
+  KillRegs.reset();
+  DefRegs.reset();
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isRegMask())
+      (isPred ? DefRegs : KillRegs).setBitsNotInMask(MO.getRegMask());
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || isReserved(Reg))
+      continue;
+
+    if (MO.isUse()) {
+      // Ignore undef uses.
+      if (MO.isUndef())
+        continue;
+      if (!isPred && MO.isKill())
+        addRegWithSubRegs(KillRegs, Reg);
+    } else {
+      assert(MO.isDef());
+      if (!isPred && MO.isDead())
+        addRegWithSubRegs(KillRegs, Reg);
+      else
+        addRegWithSubRegs(DefRegs, Reg);
+    }
+  }
+}
+
+void RegScavenger::unprocess() {
+  assert(Tracking && "Cannot unprocess because we're not tracking");
+
+  MachineInstr *MI = MBBI;
+  if (MI->isDebugValue())
+    return;
+
+  determineKillsAndDefs();
+
+  // Commit the changes.
+  setUsed(KillRegs);
+  setUnused(DefRegs);
+
+  if (MBBI == MBB->begin()) {
+    MBBI = MachineBasicBlock::iterator(NULL);
+    Tracking = false;
+  } else
+    --MBBI;
+}
+
+void RegScavenger::forward() {
+  // Move ptr forward.
+  if (!Tracking) {
+    MBBI = MBB->begin();
+    Tracking = true;
+  } else {
+    assert(MBBI != MBB->end() && "Already past the end of the basic block!");
+    MBBI = llvm::next(MBBI);
+  }
+  assert(MBBI != MBB->end() && "Already at the end of the basic block!");
+
+  MachineInstr *MI = MBBI;
+
+  for (SmallVector<ScavengedInfo, 2>::iterator I = Scavenged.begin(),
+       IE = Scavenged.end(); I != IE; ++I) {
+    if (I->Restore != MI)
+      continue;
+
+    I->Reg = 0;
+    I->Restore = NULL;
+  }
+
+  if (MI->isDebugValue())
+    return;
+
+  determineKillsAndDefs();
+
+  // Verify uses and defs.
+#ifndef NDEBUG
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) || isReserved(Reg))
+      continue;
+    if (MO.isUse()) {
+      if (MO.isUndef())
+        continue;
+      if (!isUsed(Reg)) {
+        // Check if it's partial live: e.g.
+        // D0 = insert_subreg D0<undef>, S0
+        // ... D0
+        // The problem is the insert_subreg could be eliminated. The use of
+        // D0 is using a partially undef value. This is not *incorrect* since
+        // S1 is can be freely clobbered.
+        // Ideally we would like a way to model this, but leaving the
+        // insert_subreg around causes both correctness and performance issues.
+        bool SubUsed = false;
+        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+          if (isUsed(*SubRegs)) {
+            SubUsed = true;
+            break;
+          }
+        if (!SubUsed) {
+          MBB->getParent()->verify(NULL, "In Register Scavenger");
+          llvm_unreachable("Using an undefined register!");
+        }
+        (void)SubUsed;
+      }
+    } else {
+      assert(MO.isDef());
+#if 0
+      // FIXME: Enable this once we've figured out how to correctly transfer
+      // implicit kills during codegen passes like the coalescer.
+      assert((KillRegs.test(Reg) || isUnused(Reg) ||
+              isLiveInButUnusedBefore(Reg, MI, MBB, TRI, MRI)) &&
+             "Re-defining a live register!");
+#endif
+    }
+  }
+#endif // NDEBUG
+
+  // Commit the changes.
+  setUnused(KillRegs);
+  setUsed(DefRegs);
+}
+
+void RegScavenger::getRegsUsed(BitVector &used, bool includeReserved) {
+  used = RegsAvailable;
+  used.flip();
+  if (includeReserved)
+    used |= MRI->getReservedRegs();
+  else
+    used.reset(MRI->getReservedRegs());
+}
+
+unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
+  for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+       I != E; ++I)
+    if (!isAliasUsed(*I)) {
+      DEBUG(dbgs() << "Scavenger found unused reg: " << TRI->getName(*I) <<
+            "\n");
+      return *I;
+    }
+  return 0;
+}
+
+/// getRegsAvailable - Return all available registers in the register class
+/// in Mask.
+BitVector RegScavenger::getRegsAvailable(const TargetRegisterClass *RC) {
+  BitVector Mask(TRI->getNumRegs());
+  for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+       I != E; ++I)
+    if (!isAliasUsed(*I))
+      Mask.set(*I);
+  return Mask;
+}
+
+/// findSurvivorReg - Return the candidate register that is unused for the
+/// longest after StargMII. UseMI is set to the instruction where the search
+/// stopped.
+///
+/// No more than InstrLimit instructions are inspected.
+///
+unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
+                                       BitVector &Candidates,
+                                       unsigned InstrLimit,
+                                       MachineBasicBlock::iterator &UseMI) {
+  int Survivor = Candidates.find_first();
+  assert(Survivor > 0 && "No candidates for scavenging");
+
+  MachineBasicBlock::iterator ME = MBB->getFirstTerminator();
+  assert(StartMI != ME && "MI already at terminator");
+  MachineBasicBlock::iterator RestorePointMI = StartMI;
+  MachineBasicBlock::iterator MI = StartMI;
+
+  bool inVirtLiveRange = false;
+  for (++MI; InstrLimit > 0 && MI != ME; ++MI, --InstrLimit) {
+    if (MI->isDebugValue()) {
+      ++InstrLimit; // Don't count debug instructions
+      continue;
+    }
+    bool isVirtKillInsn = false;
+    bool isVirtDefInsn = false;
+    // Remove any candidates touched by instruction.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (MO.isRegMask())
+        Candidates.clearBitsNotInMask(MO.getRegMask());
+      if (!MO.isReg() || MO.isUndef() || !MO.getReg())
+        continue;
+      if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        if (MO.isDef())
+          isVirtDefInsn = true;
+        else if (MO.isKill())
+          isVirtKillInsn = true;
+        continue;
+      }
+      for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
+        Candidates.reset(*AI);
+    }
+    // If we're not in a virtual reg's live range, this is a valid
+    // restore point.
+    if (!inVirtLiveRange) RestorePointMI = MI;
+
+    // Update whether we're in the live range of a virtual register
+    if (isVirtKillInsn) inVirtLiveRange = false;
+    if (isVirtDefInsn) inVirtLiveRange = true;
+
+    // Was our survivor untouched by this instruction?
+    if (Candidates.test(Survivor))
+      continue;
+
+    // All candidates gone?
+    if (Candidates.none())
+      break;
+
+    Survivor = Candidates.find_first();
+  }
+  // If we ran off the end, that's where we want to restore.
+  if (MI == ME) RestorePointMI = ME;
+  assert (RestorePointMI != StartMI &&
+          "No available scavenger restore location!");
+
+  // We ran out of candidates, so stop the search.
+  UseMI = RestorePointMI;
+  return Survivor;
+}
+
+static unsigned getFrameIndexOperandNum(MachineInstr *MI) {
+  unsigned i = 0;
+  while (!MI->getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+  return i;
+}
+
+unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
+                                        MachineBasicBlock::iterator I,
+                                        int SPAdj) {
+  // Consider all allocatable registers in the register class initially
+  BitVector Candidates =
+    TRI->getAllocatableSet(*I->getParent()->getParent(), RC);
+
+  // Exclude all the registers being used by the instruction.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = I->getOperand(i);
+    if (MO.isReg() && MO.getReg() != 0 &&
+        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      Candidates.reset(MO.getReg());
+  }
+
+  // Try to find a register that's unused if there is one, as then we won't
+  // have to spill. Search explicitly rather than masking out based on
+  // RegsAvailable, as RegsAvailable does not take aliases into account.
+  // That's what getRegsAvailable() is for.
+  BitVector Available = getRegsAvailable(RC);
+  Available &= Candidates;
+  if (Available.any())
+    Candidates = Available;
+
+  // Find the register whose use is furthest away.
+  MachineBasicBlock::iterator UseMI;
+  unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI);
+
+  // If we found an unused register there is no reason to spill it.
+  if (!isAliasUsed(SReg)) {
+    DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n");
+    return SReg;
+  }
+
+  // Find an available scavenging slot.
+  unsigned SI;
+  for (SI = 0; SI < Scavenged.size(); ++SI)
+    if (Scavenged[SI].Reg == 0)
+      break;
+
+  if (SI == Scavenged.size()) {
+    // We need to scavenge a register but have no spill slot, the target
+    // must know how to do it (if not, we'll assert below).
+    Scavenged.push_back(ScavengedInfo());
+  }
+
+  // Avoid infinite regress
+  Scavenged[SI].Reg = SReg;
+
+  // If the target knows how to save/restore the register, let it do so;
+  // otherwise, use the emergency stack spill slot.
+  if (!TRI->saveScavengerRegister(*MBB, I, UseMI, RC, SReg)) {
+    // Spill the scavenged register before I.
+    assert(Scavenged[SI].FrameIndex >= 0 &&
+           "Cannot scavenge register without an emergency spill slot!");
+    TII->storeRegToStackSlot(*MBB, I, SReg, true, Scavenged[SI].FrameIndex,
+                             RC, TRI);
+    MachineBasicBlock::iterator II = prior(I);
+
+    unsigned FIOperandNum = getFrameIndexOperandNum(II);
+    TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
+
+    // Restore the scavenged register before its use (or first terminator).
+    TII->loadRegFromStackSlot(*MBB, UseMI, SReg, Scavenged[SI].FrameIndex,
+                              RC, TRI);
+    II = prior(UseMI);
+
+    FIOperandNum = getFrameIndexOperandNum(II);
+    TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
+  }
+
+  Scavenged[SI].Restore = prior(UseMI);
+
+  // Doing this here leads to infinite regress.
+  // Scavenged[SI].Reg = SReg;
+
+  DEBUG(dbgs() << "Scavenged register (with spill): " << TRI->getName(SReg) <<
+        "\n");
+
+  return SReg;
+}
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
new file mode 100644
index 000000000000..07e5b470fb1e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -0,0 +1,642 @@
+//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG class, which is a base class used by
+// scheduling implementation classes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <climits>
+using namespace llvm;
+
+#ifndef NDEBUG
+static cl::opt<bool> StressSchedOpt(
+  "stress-sched", cl::Hidden, cl::init(false),
+  cl::desc("Stress test instruction scheduling"));
+#endif
+
+void SchedulingPriorityQueue::anchor() { }
+
+ScheduleDAG::ScheduleDAG(MachineFunction &mf)
+  : TM(mf.getTarget()),
+    TII(TM.getInstrInfo()),
+    TRI(TM.getRegisterInfo()),
+    MF(mf), MRI(mf.getRegInfo()),
+    EntrySU(), ExitSU() {
+#ifndef NDEBUG
+  StressSched = StressSchedOpt;
+#endif
+}
+
+ScheduleDAG::~ScheduleDAG() {}
+
+/// Clear the DAG state (e.g. between scheduling regions).
+void ScheduleDAG::clearDAG() {
+  SUnits.clear();
+  EntrySU = SUnit();
+  ExitSU = SUnit();
+}
+
+/// getInstrDesc helper to handle SDNodes.
+const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
+  if (!Node || !Node->isMachineOpcode()) return NULL;
+  return &TII->get(Node->getMachineOpcode());
+}
+
+/// addPred - This adds the specified edge as a pred of the current node if
+/// not already.  It also adds the current node as a successor of the
+/// specified node.
+bool SUnit::addPred(const SDep &D, bool Required) {
+  // If this node already has this depenence, don't add a redundant one.
+  for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I) {
+    // Zero-latency weak edges may be added purely for heuristic ordering. Don't
+    // add them if another kind of edge already exists.
+    if (!Required && I->getSUnit() == D.getSUnit())
+      return false;
+    if (I->overlaps(D)) {
+      // Extend the latency if needed. Equivalent to removePred(I) + addPred(D).
+      if (I->getLatency() < D.getLatency()) {
+        SUnit *PredSU = I->getSUnit();
+        // Find the corresponding successor in N.
+        SDep ForwardD = *I;
+        ForwardD.setSUnit(this);
+        for (SmallVector<SDep, 4>::iterator II = PredSU->Succs.begin(),
+               EE = PredSU->Succs.end(); II != EE; ++II) {
+          if (*II == ForwardD) {
+            II->setLatency(D.getLatency());
+            break;
+          }
+        }
+        I->setLatency(D.getLatency());
+      }
+      return false;
+    }
+  }
+  // Now add a corresponding succ to N.
+  SDep P = D;
+  P.setSUnit(this);
+  SUnit *N = D.getSUnit();
+  // Update the bookkeeping.
+  if (D.getKind() == SDep::Data) {
+    assert(NumPreds < UINT_MAX && "NumPreds will overflow!");
+    assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!");
+    ++NumPreds;
+    ++N->NumSuccs;
+  }
+  if (!N->isScheduled) {
+    if (D.isWeak()) {
+      ++WeakPredsLeft;
+    }
+    else {
+      assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!");
+      ++NumPredsLeft;
+    }
+  }
+  if (!isScheduled) {
+    if (D.isWeak()) {
+      ++N->WeakSuccsLeft;
+    }
+    else {
+      assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!");
+      ++N->NumSuccsLeft;
+    }
+  }
+  Preds.push_back(D);
+  N->Succs.push_back(P);
+  if (P.getLatency() != 0) {
+    this->setDepthDirty();
+    N->setHeightDirty();
+  }
+  return true;
+}
+
+/// removePred - This removes the specified edge as a pred of the current
+/// node if it exists.  It also removes the current node as a successor of
+/// the specified node.
+void SUnit::removePred(const SDep &D) {
+  // Find the matching predecessor.
+  for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I)
+    if (*I == D) {
+      // Find the corresponding successor in N.
+      SDep P = D;
+      P.setSUnit(this);
+      SUnit *N = D.getSUnit();
+      SmallVectorImpl<SDep>::iterator Succ = std::find(N->Succs.begin(),
+                                                       N->Succs.end(), P);
+      assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!");
+      N->Succs.erase(Succ);
+      Preds.erase(I);
+      // Update the bookkeeping.
+      if (P.getKind() == SDep::Data) {
+        assert(NumPreds > 0 && "NumPreds will underflow!");
+        assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
+        --NumPreds;
+        --N->NumSuccs;
+      }
+      if (!N->isScheduled) {
+        if (D.isWeak())
+          --WeakPredsLeft;
+        else {
+          assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
+          --NumPredsLeft;
+        }
+      }
+      if (!isScheduled) {
+        if (D.isWeak())
+          --N->WeakSuccsLeft;
+        else {
+          assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
+          --N->NumSuccsLeft;
+        }
+      }
+      if (P.getLatency() != 0) {
+        this->setDepthDirty();
+        N->setHeightDirty();
+      }
+      return;
+    }
+}
+
+void SUnit::setDepthDirty() {
+  if (!isDepthCurrent) return;
+  SmallVector<SUnit*, 8> WorkList;
+  WorkList.push_back(this);
+  do {
+    SUnit *SU = WorkList.pop_back_val();
+    SU->isDepthCurrent = false;
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+         E = SU->Succs.end(); I != E; ++I) {
+      SUnit *SuccSU = I->getSUnit();
+      if (SuccSU->isDepthCurrent)
+        WorkList.push_back(SuccSU);
+    }
+  } while (!WorkList.empty());
+}
+
+void SUnit::setHeightDirty() {
+  if (!isHeightCurrent) return;
+  SmallVector<SUnit*, 8> WorkList;
+  WorkList.push_back(this);
+  do {
+    SUnit *SU = WorkList.pop_back_val();
+    SU->isHeightCurrent = false;
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(),
+         E = SU->Preds.end(); I != E; ++I) {
+      SUnit *PredSU = I->getSUnit();
+      if (PredSU->isHeightCurrent)
+        WorkList.push_back(PredSU);
+    }
+  } while (!WorkList.empty());
+}
+
+/// setDepthToAtLeast - Update this node's successors to reflect the
+/// fact that this node's depth just increased.
+///
+void SUnit::setDepthToAtLeast(unsigned NewDepth) {
+  if (NewDepth <= getDepth())
+    return;
+  setDepthDirty();
+  Depth = NewDepth;
+  isDepthCurrent = true;
+}
+
+/// setHeightToAtLeast - Update this node's predecessors to reflect the
+/// fact that this node's height just increased.
+///
+void SUnit::setHeightToAtLeast(unsigned NewHeight) {
+  if (NewHeight <= getHeight())
+    return;
+  setHeightDirty();
+  Height = NewHeight;
+  isHeightCurrent = true;
+}
+
+/// ComputeDepth - Calculate the maximal path from the node to the exit.
+///
+void SUnit::ComputeDepth() {
+  SmallVector<SUnit*, 8> WorkList;
+  WorkList.push_back(this);
+  do {
+    SUnit *Cur = WorkList.back();
+
+    bool Done = true;
+    unsigned MaxPredDepth = 0;
+    for (SUnit::const_pred_iterator I = Cur->Preds.begin(),
+         E = Cur->Preds.end(); I != E; ++I) {
+      SUnit *PredSU = I->getSUnit();
+      if (PredSU->isDepthCurrent)
+        MaxPredDepth = std::max(MaxPredDepth,
+                                PredSU->Depth + I->getLatency());
+      else {
+        Done = false;
+        WorkList.push_back(PredSU);
+      }
+    }
+
+    if (Done) {
+      WorkList.pop_back();
+      if (MaxPredDepth != Cur->Depth) {
+        Cur->setDepthDirty();
+        Cur->Depth = MaxPredDepth;
+      }
+      Cur->isDepthCurrent = true;
+    }
+  } while (!WorkList.empty());
+}
+
+/// ComputeHeight - Calculate the maximal path from the node to the entry.
+///
+void SUnit::ComputeHeight() {
+  SmallVector<SUnit*, 8> WorkList;
+  WorkList.push_back(this);
+  do {
+    SUnit *Cur = WorkList.back();
+
+    bool Done = true;
+    unsigned MaxSuccHeight = 0;
+    for (SUnit::const_succ_iterator I = Cur->Succs.begin(),
+         E = Cur->Succs.end(); I != E; ++I) {
+      SUnit *SuccSU = I->getSUnit();
+      if (SuccSU->isHeightCurrent)
+        MaxSuccHeight = std::max(MaxSuccHeight,
+                                 SuccSU->Height + I->getLatency());
+      else {
+        Done = false;
+        WorkList.push_back(SuccSU);
+      }
+    }
+
+    if (Done) {
+      WorkList.pop_back();
+      if (MaxSuccHeight != Cur->Height) {
+        Cur->setHeightDirty();
+        Cur->Height = MaxSuccHeight;
+      }
+      Cur->isHeightCurrent = true;
+    }
+  } while (!WorkList.empty());
+}
+
+void SUnit::biasCriticalPath() {
+  if (NumPreds < 2)
+    return;
+
+  SUnit::pred_iterator BestI = Preds.begin();
+  unsigned MaxDepth = BestI->getSUnit()->getDepth();
+  for (SUnit::pred_iterator
+         I = llvm::next(BestI), E = Preds.end(); I != E; ++I) {
+    if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth)
+      BestI = I;
+  }
+  if (BestI != Preds.begin())
+    std::swap(*Preds.begin(), *BestI);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or
+/// a group of nodes flagged together.
+void SUnit::dump(const ScheduleDAG *G) const {
+  dbgs() << "SU(" << NodeNum << "): ";
+  G->dumpNode(this);
+}
+
+void SUnit::dumpAll(const ScheduleDAG *G) const {
+  dump(G);
+
+  dbgs() << "  # preds left       : " << NumPredsLeft << "\n";
+  dbgs() << "  # succs left       : " << NumSuccsLeft << "\n";
+  if (WeakPredsLeft)
+    dbgs() << "  # weak preds left  : " << WeakPredsLeft << "\n";
+  if (WeakSuccsLeft)
+    dbgs() << "  # weak succs left  : " << WeakSuccsLeft << "\n";
+  dbgs() << "  # rdefs left       : " << NumRegDefsLeft << "\n";
+  dbgs() << "  Latency            : " << Latency << "\n";
+  dbgs() << "  Depth              : " << getDepth() << "\n";
+  dbgs() << "  Height             : " << getHeight() << "\n";
+
+  if (Preds.size() != 0) {
+    dbgs() << "  Predecessors:\n";
+    for (SUnit::const_succ_iterator I = Preds.begin(), E = Preds.end();
+         I != E; ++I) {
+      dbgs() << "   ";
+      switch (I->getKind()) {
+      case SDep::Data:        dbgs() << "val "; break;
+      case SDep::Anti:        dbgs() << "anti"; break;
+      case SDep::Output:      dbgs() << "out "; break;
+      case SDep::Order:       dbgs() << "ch  "; break;
+      }
+      dbgs() << "SU(" << I->getSUnit()->NodeNum << ")";
+      if (I->isArtificial())
+        dbgs() << " *";
+      dbgs() << ": Latency=" << I->getLatency();
+      if (I->isAssignedRegDep())
+        dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
+      dbgs() << "\n";
+    }
+  }
+  if (Succs.size() != 0) {
+    dbgs() << "  Successors:\n";
+    for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end();
+         I != E; ++I) {
+      dbgs() << "   ";
+      switch (I->getKind()) {
+      case SDep::Data:        dbgs() << "val "; break;
+      case SDep::Anti:        dbgs() << "anti"; break;
+      case SDep::Output:      dbgs() << "out "; break;
+      case SDep::Order:       dbgs() << "ch  "; break;
+      }
+      dbgs() << "SU(" << I->getSUnit()->NodeNum << ")";
+      if (I->isArtificial())
+        dbgs() << " *";
+      dbgs() << ": Latency=" << I->getLatency();
+      if (I->isAssignedRegDep())
+        dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
+      dbgs() << "\n";
+    }
+  }
+  dbgs() << "\n";
+}
+#endif
+
+#ifndef NDEBUG
+/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that
+/// their state is consistent. Return the number of scheduled nodes.
+///
+unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {
+  bool AnyNotSched = false;
+  unsigned DeadNodes = 0;
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    if (!SUnits[i].isScheduled) {
+      if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) {
+        ++DeadNodes;
+        continue;
+      }
+      if (!AnyNotSched)
+        dbgs() << "*** Scheduling failed! ***\n";
+      SUnits[i].dump(this);
+      dbgs() << "has not been scheduled!\n";
+      AnyNotSched = true;
+    }
+    if (SUnits[i].isScheduled &&
+        (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) >
+          unsigned(INT_MAX)) {
+      if (!AnyNotSched)
+        dbgs() << "*** Scheduling failed! ***\n";
+      SUnits[i].dump(this);
+      dbgs() << "has an unexpected "
+           << (isBottomUp ? "Height" : "Depth") << " value!\n";
+      AnyNotSched = true;
+    }
+    if (isBottomUp) {
+      if (SUnits[i].NumSuccsLeft != 0) {
+        if (!AnyNotSched)
+          dbgs() << "*** Scheduling failed! ***\n";
+        SUnits[i].dump(this);
+        dbgs() << "has successors left!\n";
+        AnyNotSched = true;
+      }
+    } else {
+      if (SUnits[i].NumPredsLeft != 0) {
+        if (!AnyNotSched)
+          dbgs() << "*** Scheduling failed! ***\n";
+        SUnits[i].dump(this);
+        dbgs() << "has predecessors left!\n";
+        AnyNotSched = true;
+      }
+    }
+  }
+  assert(!AnyNotSched);
+  return SUnits.size() - DeadNodes;
+}
+#endif
+
+/// InitDAGTopologicalSorting - create the initial topological
+/// ordering from the DAG to be scheduled.
+///
+/// The idea of the algorithm is taken from
+/// "Online algorithms for managing the topological order of
+/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
+/// This is the MNR algorithm, which was first introduced by
+/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
+/// "Maintaining a topological order under edge insertions".
+///
+/// Short description of the algorithm:
+///
+/// Topological ordering, ord, of a DAG maps each node to a topological
+/// index so that for all edges X->Y it is the case that ord(X) < ord(Y).
+///
+/// This means that if there is a path from the node X to the node Z,
+/// then ord(X) < ord(Z).
+///
+/// This property can be used to check for reachability of nodes:
+/// if Z is reachable from X, then an insertion of the edge Z->X would
+/// create a cycle.
+///
+/// The algorithm first computes a topological ordering for the DAG by
+/// initializing the Index2Node and Node2Index arrays and then tries to keep
+/// the ordering up-to-date after edge insertions by reordering the DAG.
+///
+/// On insertion of the edge X->Y, the algorithm first marks by calling DFS
+/// the nodes reachable from Y, and then shifts them using Shift to lie
+/// immediately after X in Index2Node.
+void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
+  unsigned DAGSize = SUnits.size();
+  std::vector<SUnit*> WorkList;
+  WorkList.reserve(DAGSize);
+
+  Index2Node.resize(DAGSize);
+  Node2Index.resize(DAGSize);
+
+  // Initialize the data structures.
+  if (ExitSU)
+    WorkList.push_back(ExitSU);
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    int NodeNum = SU->NodeNum;
+    unsigned Degree = SU->Succs.size();
+    // Temporarily use the Node2Index array as scratch space for degree counts.
+    Node2Index[NodeNum] = Degree;
+
+    // Is it a node without dependencies?
+    if (Degree == 0) {
+      assert(SU->Succs.empty() && "SUnit should have no successors");
+      // Collect leaf nodes.
+      WorkList.push_back(SU);
+    }
+  }
+
+  int Id = DAGSize;
+  while (!WorkList.empty()) {
+    SUnit *SU = WorkList.back();
+    WorkList.pop_back();
+    if (SU->NodeNum < DAGSize)
+      Allocate(SU->NodeNum, --Id);
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I) {
+      SUnit *SU = I->getSUnit();
+      if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum])
+        // If all dependencies of the node are processed already,
+        // then the node can be computed now.
+        WorkList.push_back(SU);
+    }
+  }
+
+  Visited.resize(DAGSize);
+
+#ifndef NDEBUG
+  // Check correctness of the ordering
+  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+    SUnit *SU = &SUnits[i];
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I) {
+      assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] &&
+      "Wrong topological sorting");
+    }
+  }
+#endif
+}
+
+/// AddPred - Updates the topological ordering to accommodate an edge
+/// to be added from SUnit X to SUnit Y.
+void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
+  int UpperBound, LowerBound;
+  LowerBound = Node2Index[Y->NodeNum];
+  UpperBound = Node2Index[X->NodeNum];
+  bool HasLoop = false;
+  // Is Ord(X) < Ord(Y) ?
+  if (LowerBound < UpperBound) {
+    // Update the topological order.
+    Visited.reset();
+    DFS(Y, UpperBound, HasLoop);
+    assert(!HasLoop && "Inserted edge creates a loop!");
+    // Recompute topological indexes.
+    Shift(Visited, LowerBound, UpperBound);
+  }
+}
+
+/// RemovePred - Updates the topological ordering to accommodate an
+/// an edge to be removed from the specified node N from the predecessors
+/// of the current node M.
+void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
+  // InitDAGTopologicalSorting();
+}
+
+/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark
+/// all nodes affected by the edge insertion. These nodes will later get new
+/// topological indexes by means of the Shift method.
+void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
+                                     bool &HasLoop) {
+  std::vector<const SUnit*> WorkList;
+  WorkList.reserve(SUnits.size());
+
+  WorkList.push_back(SU);
+  do {
+    SU = WorkList.back();
+    WorkList.pop_back();
+    Visited.set(SU->NodeNum);
+    for (int I = SU->Succs.size()-1; I >= 0; --I) {
+      unsigned s = SU->Succs[I].getSUnit()->NodeNum;
+      // Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
+      if (s >= Node2Index.size())
+        continue;
+      if (Node2Index[s] == UpperBound) {
+        HasLoop = true;
+        return;
+      }
+      // Visit successors if not already and in affected region.
+      if (!Visited.test(s) && Node2Index[s] < UpperBound) {
+        WorkList.push_back(SU->Succs[I].getSUnit());
+      }
+    }
+  } while (!WorkList.empty());
+}
+
+/// Shift - Renumber the nodes so that the topological ordering is
+/// preserved.
+void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
+                                       int UpperBound) {
+  std::vector<int> L;
+  int shift = 0;
+  int i;
+
+  for (i = LowerBound; i <= UpperBound; ++i) {
+    // w is node at topological index i.
+    int w = Index2Node[i];
+    if (Visited.test(w)) {
+      // Unmark.
+      Visited.reset(w);
+      L.push_back(w);
+      shift = shift + 1;
+    } else {
+      Allocate(w, i - shift);
+    }
+  }
+
+  for (unsigned j = 0; j < L.size(); ++j) {
+    Allocate(L[j], i - shift);
+    i = i + 1;
+  }
+}
+
+
+/// WillCreateCycle - Returns true if adding an edge to TargetSU from SU will
+/// create a cycle. If so, it is not safe to call AddPred(TargetSU, SU).
+bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) {
+  // Is SU reachable from TargetSU via successor edges?
+  if (IsReachable(SU, TargetSU))
+    return true;
+  for (SUnit::pred_iterator
+         I = TargetSU->Preds.begin(), E = TargetSU->Preds.end(); I != E; ++I)
+    if (I->isAssignedRegDep() &&
+        IsReachable(SU, I->getSUnit()))
+      return true;
+  return false;
+}
+
+/// IsReachable - Checks if SU is reachable from TargetSU.
+bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
+                                             const SUnit *TargetSU) {
+  // If insertion of the edge SU->TargetSU would create a cycle
+  // then there is a path from TargetSU to SU.
+  int UpperBound, LowerBound;
+  LowerBound = Node2Index[TargetSU->NodeNum];
+  UpperBound = Node2Index[SU->NodeNum];
+  bool HasLoop = false;
+  // Is Ord(TargetSU) < Ord(SU) ?
+  if (LowerBound < UpperBound) {
+    Visited.reset();
+    // There may be a path from TargetSU to SU. Check for it.
+    DFS(TargetSU, UpperBound, HasLoop);
+  }
+  return HasLoop;
+}
+
+/// Allocate - assign the topological index to the node n.
+void ScheduleDAGTopologicalSort::Allocate(int n, int index) {
+  Node2Index[n] = index;
+  Index2Node[index] = n;
+}
+
+ScheduleDAGTopologicalSort::
+ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu)
+  : SUnits(sunits), ExitSU(exitsu) {}
+
+ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {}
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
new file mode 100644
index 000000000000..71e7a21ef2bc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -0,0 +1,1322 @@
+//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAGInstrs class, which implements re-scheduling
+// of MachineInstrs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "misched"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
+    cl::ZeroOrMore, cl::init(false),
+    cl::desc("Enable use of AA during MI GAD construction"));
+
+ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
+                                     const MachineLoopInfo &mli,
+                                     const MachineDominatorTree &mdt,
+                                     bool IsPostRAFlag,
+                                     LiveIntervals *lis)
+  : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()), LIS(lis),
+    IsPostRA(IsPostRAFlag), CanHandleTerminators(false), FirstDbgValue(0) {
+  assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
+  DbgValues.clear();
+  assert(!(IsPostRA && MRI.getNumVirtRegs()) &&
+         "Virtual registers must be removed prior to PostRA scheduling");
+
+  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+}
+
+/// getUnderlyingObjectFromInt - This is the function that does the work of
+/// looking through basic ptrtoint+arithmetic+inttoptr sequences.
+static const Value *getUnderlyingObjectFromInt(const Value *V) {
+  do {
+    if (const Operator *U = dyn_cast<Operator>(V)) {
+      // If we find a ptrtoint, we can transfer control back to the
+      // regular getUnderlyingObjectFromInt.
+      if (U->getOpcode() == Instruction::PtrToInt)
+        return U->getOperand(0);
+      // If we find an add of a constant, a multiplied value, or a phi, it's
+      // likely that the other operand will lead us to the base
+      // object. We don't have to worry about the case where the
+      // object address is somehow being computed by the multiply,
+      // because our callers only care when the result is an
+      // identifiable object.
+      if (U->getOpcode() != Instruction::Add ||
+          (!isa<ConstantInt>(U->getOperand(1)) &&
+           Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
+           !isa<PHINode>(U->getOperand(1))))
+        return V;
+      V = U->getOperand(0);
+    } else {
+      return V;
+    }
+    assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
+  } while (1);
+}
+
+/// getUnderlyingObjects - This is a wrapper around GetUnderlyingObjects
+/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences.
+static void getUnderlyingObjects(const Value *V,
+                                 SmallVectorImpl<Value *> &Objects) {
+  SmallPtrSet<const Value*, 16> Visited;
+  SmallVector<const Value *, 4> Working(1, V);
+  do {
+    V = Working.pop_back_val();
+
+    SmallVector<Value *, 4> Objs;
+    GetUnderlyingObjects(const_cast<Value *>(V), Objs);
+
+    for (SmallVector<Value *, 4>::iterator I = Objs.begin(), IE = Objs.end();
+         I != IE; ++I) {
+      V = *I;
+      if (!Visited.insert(V))
+        continue;
+      if (Operator::getOpcode(V) == Instruction::IntToPtr) {
+        const Value *O =
+          getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
+        if (O->getType()->isPointerTy()) {
+          Working.push_back(O);
+          continue;
+        }
+      }
+      Objects.push_back(const_cast<Value *>(V));
+    }
+  } while (!Working.empty());
+}
+
+/// getUnderlyingObjectsForInstr - If this machine instr has memory reference
+/// information and it can be tracked to a normal reference to a known
+/// object, return the Value for that object.
+static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
+              const MachineFrameInfo *MFI,
+              SmallVectorImpl<std::pair<const Value *, bool> > &Objects) {
+  if (!MI->hasOneMemOperand() ||
+      !(*MI->memoperands_begin())->getValue() ||
+      (*MI->memoperands_begin())->isVolatile())
+    return;
+
+  const Value *V = (*MI->memoperands_begin())->getValue();
+  if (!V)
+    return;
+
+  SmallVector<Value *, 4> Objs;
+  getUnderlyingObjects(V, Objs);
+
+  for (SmallVector<Value *, 4>::iterator I = Objs.begin(), IE = Objs.end();
+       I != IE; ++I) {
+    bool MayAlias = true;
+    V = *I;
+
+    if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+      // For now, ignore PseudoSourceValues which may alias LLVM IR values
+      // because the code that uses this function has no way to cope with
+      // such aliases.
+
+      if (PSV->isAliased(MFI)) {
+        Objects.clear();
+        return;
+      }
+
+      MayAlias = PSV->mayAlias(MFI);
+    } else if (!isIdentifiedObject(V)) {
+      Objects.clear();
+      return;
+    }
+
+    Objects.push_back(std::make_pair(V, MayAlias));
+  }
+}
+
+void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
+  BB = bb;
+}
+
+void ScheduleDAGInstrs::finishBlock() {
+  // Subclasses should no longer refer to the old block.
+  BB = 0;
+}
+
+/// Initialize the DAG and common scheduler state for the current scheduling
+/// region. This does not actually create the DAG, only clears it. The
+/// scheduling driver may call BuildSchedGraph multiple times per scheduling
+/// region.
+void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
+                                    MachineBasicBlock::iterator begin,
+                                    MachineBasicBlock::iterator end,
+                                    unsigned endcount) {
+  assert(bb == BB && "startBlock should set BB");
+  RegionBegin = begin;
+  RegionEnd = end;
+  EndIndex = endcount;
+  MISUnitMap.clear();
+
+  ScheduleDAG::clearDAG();
+}
+
+/// Close the current scheduling region. Don't clear any state in case the
+/// driver wants to refer to the previous scheduling region.
+void ScheduleDAGInstrs::exitRegion() {
+  // Nothing to do.
+}
+
+/// addSchedBarrierDeps - Add dependencies from instructions in the current
+/// list of instructions being scheduled to scheduling barrier by adding
+/// the exit SU to the register defs and use list. This is because we want to
+/// make sure instructions which define registers that are either used by
+/// the terminator or are live-out are properly scheduled. This is
+/// especially important when the definition latency of the return value(s)
+/// are too high to be hidden by the branch or when the liveout registers
+/// used by instructions in the fallthrough block.
+void ScheduleDAGInstrs::addSchedBarrierDeps() {
+  MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : 0;
+  ExitSU.setInstr(ExitMI);
+  bool AllDepKnown = ExitMI &&
+    (ExitMI->isCall() || ExitMI->isBarrier());
+  if (ExitMI && AllDepKnown) {
+    // If it's a call or a barrier, add dependencies on the defs and uses of
+    // instruction.
+    for (unsigned i = 0, e = ExitMI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = ExitMI->getOperand(i);
+      if (!MO.isReg() || MO.isDef()) continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0) continue;
+
+      if (TRI->isPhysicalRegister(Reg))
+        Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
+      else {
+        assert(!IsPostRA && "Virtual register encountered after regalloc.");
+        if (MO.readsReg()) // ignore undef operands
+          addVRegUseDeps(&ExitSU, i);
+      }
+    }
+  } else {
+    // For others, e.g. fallthrough, conditional branch, assume the exit
+    // uses all the registers that are livein to the successor blocks.
+    assert(Uses.empty() && "Uses in set before adding deps?");
+    for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+           SE = BB->succ_end(); SI != SE; ++SI)
+      for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
+             E = (*SI)->livein_end(); I != E; ++I) {
+        unsigned Reg = *I;
+        if (!Uses.contains(Reg))
+          Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
+      }
+  }
+}
+
+/// MO is an operand of SU's instruction that defines a physical register. Add
+/// data dependencies from SU to any uses of the physical register.
+void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
+  const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
+  assert(MO.isDef() && "expect physreg def");
+
+  // Ask the target if address-backscheduling is desirable, and if so how much.
+  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+
+  for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
+       Alias.isValid(); ++Alias) {
+    if (!Uses.contains(*Alias))
+      continue;
+    for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
+      SUnit *UseSU = I->SU;
+      if (UseSU == SU)
+        continue;
+
+      // Adjust the dependence latency using operand def/use information,
+      // then allow the target to perform its own adjustments.
+      int UseOp = I->OpIdx;
+      MachineInstr *RegUse = 0;
+      SDep Dep;
+      if (UseOp < 0)
+        Dep = SDep(SU, SDep::Artificial);
+      else {
+        Dep = SDep(SU, SDep::Data, *Alias);
+        RegUse = UseSU->getInstr();
+        Dep.setMinLatency(
+          SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
+                                           RegUse, UseOp, /*FindMin=*/true));
+      }
+      Dep.setLatency(
+        SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
+                                         RegUse, UseOp, /*FindMin=*/false));
+
+      ST.adjustSchedDependency(SU, UseSU, Dep);
+      UseSU->addPred(Dep);
+    }
+  }
+}
+
+/// addPhysRegDeps - Add register dependencies (data, anti, and output) from
+/// this SUnit to following instructions in the same scheduling region that
+/// depend the physical register referenced at OperIdx.
+void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
+  const MachineInstr *MI = SU->getInstr();
+  const MachineOperand &MO = MI->getOperand(OperIdx);
+
+  // Optionally add output and anti dependencies. For anti
+  // dependencies we use a latency of 0 because for a multi-issue
+  // target we want to allow the defining instruction to issue
+  // in the same cycle as the using instruction.
+  // TODO: Using a latency of 1 here for output dependencies assumes
+  //       there's no cost for reusing registers.
+  SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
+  for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
+       Alias.isValid(); ++Alias) {
+    if (!Defs.contains(*Alias))
+      continue;
+    for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
+      SUnit *DefSU = I->SU;
+      if (DefSU == &ExitSU)
+        continue;
+      if (DefSU != SU &&
+          (Kind != SDep::Output || !MO.isDead() ||
+           !DefSU->getInstr()->registerDefIsDead(*Alias))) {
+        if (Kind == SDep::Anti)
+          DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
+        else {
+          SDep Dep(SU, Kind, /*Reg=*/*Alias);
+          unsigned OutLatency =
+            SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
+          Dep.setMinLatency(OutLatency);
+          Dep.setLatency(OutLatency);
+          DefSU->addPred(Dep);
+        }
+      }
+    }
+  }
+
+  if (!MO.isDef()) {
+    // Either insert a new Reg2SUnits entry with an empty SUnits list, or
+    // retrieve the existing SUnits list for this register's uses.
+    // Push this SUnit on the use list.
+    Uses.insert(PhysRegSUOper(SU, OperIdx, MO.getReg()));
+  }
+  else {
+    addPhysRegDataDeps(SU, OperIdx);
+    unsigned Reg = MO.getReg();
+
+    // clear this register's use list
+    if (Uses.contains(Reg))
+      Uses.eraseAll(Reg);
+
+    if (!MO.isDead()) {
+      Defs.eraseAll(Reg);
+    } else if (SU->isCall) {
+      // Calls will not be reordered because of chain dependencies (see
+      // below). Since call operands are dead, calls may continue to be added
+      // to the DefList making dependence checking quadratic in the size of
+      // the block. Instead, we leave only one call at the back of the
+      // DefList.
+      Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
+      Reg2SUnitsMap::iterator B = P.first;
+      Reg2SUnitsMap::iterator I = P.second;
+      for (bool isBegin = I == B; !isBegin; /* empty */) {
+        isBegin = (--I) == B;
+        if (!I->SU->isCall)
+          break;
+        I = Defs.erase(I);
+      }
+    }
+
+    // Defs are pushed in the order they are visited and never reordered.
+    Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
+  }
+}
+
+/// addVRegDefDeps - Add register output and data dependencies from this SUnit
+/// to instructions that occur later in the same scheduling region if they read
+/// from or write to the virtual register defined at OperIdx.
+///
+/// TODO: Hoist loop induction variable increments. This has to be
+/// reevaluated. Generally, IV scheduling should be done before coalescing.
+void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
+  const MachineInstr *MI = SU->getInstr();
+  unsigned Reg = MI->getOperand(OperIdx).getReg();
+
+  // Singly defined vregs do not have output/anti dependencies.
+  // The current operand is a def, so we have at least one.
+  // Check here if there are any others...
+  if (MRI.hasOneDef(Reg))
+    return;
+
+  // Add output dependence to the next nearest def of this vreg.
+  //
+  // Unless this definition is dead, the output dependence should be
+  // transitively redundant with antidependencies from this definition's
+  // uses. We're conservative for now until we have a way to guarantee the uses
+  // are not eliminated sometime during scheduling. The output dependence edge
+  // is also useful if output latency exceeds def-use latency.
+  VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
+  if (DefI == VRegDefs.end())
+    VRegDefs.insert(VReg2SUnit(Reg, SU));
+  else {
+    SUnit *DefSU = DefI->SU;
+    if (DefSU != SU && DefSU != &ExitSU) {
+      SDep Dep(SU, SDep::Output, Reg);
+      unsigned OutLatency =
+        SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
+      Dep.setMinLatency(OutLatency);
+      Dep.setLatency(OutLatency);
+      DefSU->addPred(Dep);
+    }
+    DefI->SU = SU;
+  }
+}
+
+/// addVRegUseDeps - Add a register data dependency if the instruction that
+/// defines the virtual register used at OperIdx is mapped to an SUnit. Add a
+/// register antidependency from this SUnit to instructions that occur later in
+/// the same scheduling region if they write the virtual register.
+///
+/// TODO: Handle ExitSU "uses" properly.
+void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
+  MachineInstr *MI = SU->getInstr();
+  unsigned Reg = MI->getOperand(OperIdx).getReg();
+
+  // Lookup this operand's reaching definition.
+  assert(LIS && "vreg dependencies requires LiveIntervals");
+  LiveRangeQuery LRQ(LIS->getInterval(Reg), LIS->getInstructionIndex(MI));
+  VNInfo *VNI = LRQ.valueIn();
+
+  // VNI will be valid because MachineOperand::readsReg() is checked by caller.
+  assert(VNI && "No value to read by operand");
+  MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def);
+  // Phis and other noninstructions (after coalescing) have a NULL Def.
+  if (Def) {
+    SUnit *DefSU = getSUnit(Def);
+    if (DefSU) {
+      // The reaching Def lives within this scheduling region.
+      // Create a data dependence.
+      SDep dep(DefSU, SDep::Data, Reg);
+      // Adjust the dependence latency using operand def/use information, then
+      // allow the target to perform its own adjustments.
+      int DefOp = Def->findRegisterDefOperandIdx(Reg);
+      dep.setLatency(
+        SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, false));
+      dep.setMinLatency(
+        SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, true));
+
+      const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+      ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
+      SU->addPred(dep);
+    }
+  }
+
+  // Add antidependence to the following def of the vreg it uses.
+  VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
+  if (DefI != VRegDefs.end() && DefI->SU != SU)
+    DefI->SU->addPred(SDep(SU, SDep::Anti, Reg));
+}
+
+/// Return true if MI is an instruction we are unable to reason about
+/// (like a call or something with unmodeled side effects).
+static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
+  if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
+      (MI->hasOrderedMemoryRef() &&
+       (!MI->mayLoad() || !MI->isInvariantLoad(AA))))
+    return true;
+  return false;
+}
+
+// This MI might have either incomplete info, or known to be unsafe
+// to deal with (i.e. volatile object).
+static inline bool isUnsafeMemoryObject(MachineInstr *MI,
+                                        const MachineFrameInfo *MFI) {
+  if (!MI || MI->memoperands_empty())
+    return true;
+  // We purposefully do no check for hasOneMemOperand() here
+  // in hope to trigger an assert downstream in order to
+  // finish implementation.
+  if ((*MI->memoperands_begin())->isVolatile() ||
+       MI->hasUnmodeledSideEffects())
+    return true;
+  const Value *V = (*MI->memoperands_begin())->getValue();
+  if (!V)
+    return true;
+
+  SmallVector<Value *, 4> Objs;
+  getUnderlyingObjects(V, Objs);
+  for (SmallVector<Value *, 4>::iterator I = Objs.begin(),
+       IE = Objs.end(); I != IE; ++I) {
+    V = *I;
+
+    if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
+      // Similarly to getUnderlyingObjectForInstr:
+      // For now, ignore PseudoSourceValues which may alias LLVM IR values
+      // because the code that uses this function has no way to cope with
+      // such aliases.
+      if (PSV->isAliased(MFI))
+        return true;
+    }
+
+    // Does this pointer refer to a distinct and identifiable object?
+    if (!isIdentifiedObject(V))
+      return true;
+  }
+
+  return false;
+}
+
+/// This returns true if the two MIs need a chain edge betwee them.
+/// If these are not even memory operations, we still may need
+/// chain deps between them. The question really is - could
+/// these two MIs be reordered during scheduling from memory dependency
+/// point of view.
+static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                             MachineInstr *MIa,
+                             MachineInstr *MIb) {
+  // Cover a trivial case - no edge is need to itself.
+  if (MIa == MIb)
+    return false;
+
+  if (isUnsafeMemoryObject(MIa, MFI) || isUnsafeMemoryObject(MIb, MFI))
+    return true;
+
+  // If we are dealing with two "normal" loads, we do not need an edge
+  // between them - they could be reordered.
+  if (!MIa->mayStore() && !MIb->mayStore())
+    return false;
+
+  // To this point analysis is generic. From here on we do need AA.
+  if (!AA)
+    return true;
+
+  MachineMemOperand *MMOa = *MIa->memoperands_begin();
+  MachineMemOperand *MMOb = *MIb->memoperands_begin();
+
+  // FIXME: Need to handle multiple memory operands to support all targets.
+  if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
+    llvm_unreachable("Multiple memory operands.");
+
+  // The following interface to AA is fashioned after DAGCombiner::isAlias
+  // and operates with MachineMemOperand offset with some important
+  // assumptions:
+  //   - LLVM fundamentally assumes flat address spaces.
+  //   - MachineOperand offset can *only* result from legalization and
+  //     cannot affect queries other than the trivial case of overlap
+  //     checking.
+  //   - These offsets never wrap and never step outside
+  //     of allocated objects.
+  //   - There should never be any negative offsets here.
+  //
+  // FIXME: Modify API to hide this math from "user"
+  // FIXME: Even before we go to AA we can reason locally about some
+  // memory objects. It can save compile time, and possibly catch some
+  // corner cases not currently covered.
+
+  assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
+  assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
+
+  int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
+  int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
+  int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
+
+  AliasAnalysis::AliasResult AAResult = AA->alias(
+  AliasAnalysis::Location(MMOa->getValue(), Overlapa,
+                          MMOa->getTBAAInfo()),
+  AliasAnalysis::Location(MMOb->getValue(), Overlapb,
+                          MMOb->getTBAAInfo()));
+
+  return (AAResult != AliasAnalysis::NoAlias);
+}
+
+/// This recursive function iterates over chain deps of SUb looking for
+/// "latest" node that needs a chain edge to SUa.
+static unsigned
+iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                 SUnit *SUa, SUnit *SUb, SUnit *ExitSU, unsigned *Depth,
+                 SmallPtrSet<const SUnit*, 16> &Visited) {
+  if (!SUa || !SUb || SUb == ExitSU)
+    return *Depth;
+
+  // Remember visited nodes.
+  if (!Visited.insert(SUb))
+      return *Depth;
+  // If there is _some_ dependency already in place, do not
+  // descend any further.
+  // TODO: Need to make sure that if that dependency got eliminated or ignored
+  // for any reason in the future, we would not violate DAG topology.
+  // Currently it does not happen, but makes an implicit assumption about
+  // future implementation.
+  //
+  // Independently, if we encounter node that is some sort of global
+  // object (like a call) we already have full set of dependencies to it
+  // and we can stop descending.
+  if (SUa->isSucc(SUb) ||
+      isGlobalMemoryObject(AA, SUb->getInstr()))
+    return *Depth;
+
+  // If we do need an edge, or we have exceeded depth budget,
+  // add that edge to the predecessors chain of SUb,
+  // and stop descending.
+  if (*Depth > 200 ||
+      MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+    SUb->addPred(SDep(SUa, SDep::MayAliasMem));
+    return *Depth;
+  }
+  // Track current depth.
+  (*Depth)++;
+  // Iterate over chain dependencies only.
+  for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
+       I != E; ++I)
+    if (I->isCtrl())
+      iterateChainSucc (AA, MFI, SUa, I->getSUnit(), ExitSU, Depth, Visited);
+  return *Depth;
+}
+
+/// This function assumes that "downward" from SU there exist
+/// tail/leaf of already constructed DAG. It iterates downward and
+/// checks whether SU can be aliasing any node dominated
+/// by it.
+static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                            SUnit *SU, SUnit *ExitSU, std::set<SUnit *> &CheckList,
+                            unsigned LatencyToLoad) {
+  if (!SU)
+    return;
+
+  SmallPtrSet<const SUnit*, 16> Visited;
+  unsigned Depth = 0;
+
+  for (std::set<SUnit *>::iterator I = CheckList.begin(), IE = CheckList.end();
+       I != IE; ++I) {
+    if (SU == *I)
+      continue;
+    if (MIsNeedChainEdge(AA, MFI, SU->getInstr(), (*I)->getInstr())) {
+      SDep Dep(SU, SDep::MayAliasMem);
+      Dep.setLatency(((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0);
+      (*I)->addPred(Dep);
+    }
+    // Now go through all the chain successors and iterate from them.
+    // Keep track of visited nodes.
+    for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
+         JE = (*I)->Succs.end(); J != JE; ++J)
+      if (J->isCtrl())
+        iterateChainSucc (AA, MFI, SU, J->getSUnit(),
+                          ExitSU, &Depth, Visited);
+  }
+}
+
+/// Check whether two objects need a chain edge, if so, add it
+/// otherwise remember the rejected SU.
+static inline
+void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
+                         SUnit *SUa, SUnit *SUb,
+                         std::set<SUnit *> &RejectList,
+                         unsigned TrueMemOrderLatency = 0,
+                         bool isNormalMemory = false) {
+  // If this is a false dependency,
+  // do not add the edge, but rememeber the rejected node.
+  if (!EnableAASchedMI ||
+      MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+    SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
+    Dep.setLatency(TrueMemOrderLatency);
+    SUb->addPred(Dep);
+  }
+  else {
+    // Duplicate entries should be ignored.
+    RejectList.insert(SUb);
+    DEBUG(dbgs() << "\tReject chain dep between SU("
+          << SUa->NodeNum << ") and SU("
+          << SUb->NodeNum << ")\n");
+  }
+}
+
+/// Create an SUnit for each real instruction, numbered in top-down toplological
+/// order. The instruction order A < B, implies that no edge exists from B to A.
+///
+/// Map each real instruction to its SUnit.
+///
+/// After initSUnits, the SUnits vector cannot be resized and the scheduler may
+/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
+/// instead of pointers.
+///
+/// MachineScheduler relies on initSUnits numbering the nodes by their order in
+/// the original instruction list.
+void ScheduleDAGInstrs::initSUnits() {
+  // We'll be allocating one SUnit for each real instruction in the region,
+  // which is contained within a basic block.
+  SUnits.reserve(BB->size());
+
+  for (MachineBasicBlock::iterator I = RegionBegin; I != RegionEnd; ++I) {
+    MachineInstr *MI = I;
+    if (MI->isDebugValue())
+      continue;
+
+    SUnit *SU = newSUnit(MI);
+    MISUnitMap[MI] = SU;
+
+    SU->isCall = MI->isCall();
+    SU->isCommutable = MI->isCommutable();
+
+    // Assign the Latency field of SU using target-provided information.
+    SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
+  }
+}
+
+/// If RegPressure is non null, compute register pressure as a side effect. The
+/// DAG builder is an efficient place to do it because it already visits
+/// operands.
+void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
+                                        RegPressureTracker *RPTracker) {
+  // Create an SUnit for each real instruction.
+  initSUnits();
+
+  // We build scheduling units by walking a block's instruction list from bottom
+  // to top.
+
+  // Remember where a generic side-effecting instruction is as we procede.
+  SUnit *BarrierChain = 0, *AliasChain = 0;
+
+  // Memory references to specific known memory locations are tracked
+  // so that they can be given more precise dependencies. We track
+  // separately the known memory locations that may alias and those
+  // that are known not to alias
+  MapVector<const Value *, SUnit *> AliasMemDefs, NonAliasMemDefs;
+  MapVector<const Value *, std::vector<SUnit *> > AliasMemUses, NonAliasMemUses;
+  std::set<SUnit*> RejectMemNodes;
+
+  // Remove any stale debug info; sometimes BuildSchedGraph is called again
+  // without emitting the info from the previous call.
+  DbgValues.clear();
+  FirstDbgValue = NULL;
+
+  assert(Defs.empty() && Uses.empty() &&
+         "Only BuildGraph should update Defs/Uses");
+  Defs.setUniverse(TRI->getNumRegs());
+  Uses.setUniverse(TRI->getNumRegs());
+
+  assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs");
+  // FIXME: Allow SparseSet to reserve space for the creation of virtual
+  // registers during scheduling. Don't artificially inflate the Universe
+  // because we want to assert that vregs are not created during DAG building.
+  VRegDefs.setUniverse(MRI.getNumVirtRegs());
+
+  // Model data dependencies between instructions being scheduled and the
+  // ExitSU.
+  addSchedBarrierDeps();
+
+  // Walk the list of instructions, from bottom moving up.
+  MachineInstr *DbgMI = NULL;
+  for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
+       MII != MIE; --MII) {
+    MachineInstr *MI = prior(MII);
+    if (MI && DbgMI) {
+      DbgValues.push_back(std::make_pair(DbgMI, MI));
+      DbgMI = NULL;
+    }
+
+    if (MI->isDebugValue()) {
+      DbgMI = MI;
+      continue;
+    }
+    if (RPTracker) {
+      RPTracker->recede();
+      assert(RPTracker->getPos() == prior(MII) && "RPTracker can't find MI");
+    }
+
+    assert((CanHandleTerminators || (!MI->isTerminator() && !MI->isLabel())) &&
+           "Cannot schedule terminators or labels!");
+
+    SUnit *SU = MISUnitMap[MI];
+    assert(SU && "No SUnit mapped to this MI");
+
+    // Add register-based dependencies (data, anti, and output).
+    bool HasVRegDef = false;
+    for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
+      const MachineOperand &MO = MI->getOperand(j);
+      if (!MO.isReg()) continue;
+      unsigned Reg = MO.getReg();
+      if (Reg == 0) continue;
+
+      if (TRI->isPhysicalRegister(Reg))
+        addPhysRegDeps(SU, j);
+      else {
+        assert(!IsPostRA && "Virtual register encountered!");
+        if (MO.isDef()) {
+          HasVRegDef = true;
+          addVRegDefDeps(SU, j);
+        }
+        else if (MO.readsReg()) // ignore undef operands
+          addVRegUseDeps(SU, j);
+      }
+    }
+    // If we haven't seen any uses in this scheduling region, create a
+    // dependence edge to ExitSU to model the live-out latency. This is required
+    // for vreg defs with no in-region use, and prefetches with no vreg def.
+    //
+    // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
+    // check currently relies on being called before adding chain deps.
+    if (SU->NumSuccs == 0 && SU->Latency > 1
+        && (HasVRegDef || MI->mayLoad())) {
+      SDep Dep(SU, SDep::Artificial);
+      Dep.setLatency(SU->Latency - 1);
+      ExitSU.addPred(Dep);
+    }
+
+    // Add chain dependencies.
+    // Chain dependencies used to enforce memory order should have
+    // latency of 0 (except for true dependency of Store followed by
+    // aliased Load... we estimate that with a single cycle of latency
+    // assuming the hardware will bypass)
+    // Note that isStoreToStackSlot and isLoadFromStackSLot are not usable
+    // after stack slots are lowered to actual addresses.
+    // TODO: Use an AliasAnalysis and do real alias-analysis queries, and
+    // produce more precise dependence information.
+    unsigned TrueMemOrderLatency = MI->mayStore() ? 1 : 0;
+    if (isGlobalMemoryObject(AA, MI)) {
+      // Be conservative with these and add dependencies on all memory
+      // references, even those that are known to not alias.
+      for (MapVector<const Value *, SUnit *>::iterator I =
+             NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
+        I->second->addPred(SDep(SU, SDep::Barrier));
+      }
+      for (MapVector<const Value *, std::vector<SUnit *> >::iterator I =
+             NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) {
+        for (unsigned i = 0, e = I->second.size(); i != e; ++i) {
+          SDep Dep(SU, SDep::Barrier);
+          Dep.setLatency(TrueMemOrderLatency);
+          I->second[i]->addPred(Dep);
+        }
+      }
+      // Add SU to the barrier chain.
+      if (BarrierChain)
+        BarrierChain->addPred(SDep(SU, SDep::Barrier));
+      BarrierChain = SU;
+      // This is a barrier event that acts as a pivotal node in the DAG,
+      // so it is safe to clear list of exposed nodes.
+      adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                      TrueMemOrderLatency);
+      RejectMemNodes.clear();
+      NonAliasMemDefs.clear();
+      NonAliasMemUses.clear();
+
+      // fall-through
+    new_alias_chain:
+      // Chain all possibly aliasing memory references though SU.
+      if (AliasChain) {
+        unsigned ChainLatency = 0;
+        if (AliasChain->getInstr()->mayLoad())
+          ChainLatency = TrueMemOrderLatency;
+        addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes,
+                           ChainLatency);
+      }
+      AliasChain = SU;
+      for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
+        addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+                           TrueMemOrderLatency);
+      for (MapVector<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(),
+           E = AliasMemDefs.end(); I != E; ++I)
+        addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+      for (MapVector<const Value *, std::vector<SUnit *> >::iterator I =
+           AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
+        for (unsigned i = 0, e = I->second.size(); i != e; ++i)
+          addChainDependency(AA, MFI, SU, I->second[i], RejectMemNodes,
+                             TrueMemOrderLatency);
+      }
+      adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                      TrueMemOrderLatency);
+      PendingLoads.clear();
+      AliasMemDefs.clear();
+      AliasMemUses.clear();
+    } else if (MI->mayStore()) {
+      SmallVector<std::pair<const Value *, bool>, 4> Objs;
+      getUnderlyingObjectsForInstr(MI, MFI, Objs);
+
+      if (Objs.empty()) {
+        // Treat all other stores conservatively.
+        goto new_alias_chain;
+      }
+
+      bool MayAlias = false;
+      for (SmallVector<std::pair<const Value *, bool>, 4>::iterator
+           K = Objs.begin(), KE = Objs.end(); K != KE; ++K) {
+        const Value *V = K->first;
+        bool ThisMayAlias = K->second;
+        if (ThisMayAlias)
+          MayAlias = true;
+
+        // A store to a specific PseudoSourceValue. Add precise dependencies.
+        // Record the def in MemDefs, first adding a dep if there is
+        // an existing def.
+        MapVector<const Value *, SUnit *>::iterator I =
+          ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
+        MapVector<const Value *, SUnit *>::iterator IE =
+          ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
+        if (I != IE) {
+          addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+          I->second = SU;
+        } else {
+          if (ThisMayAlias)
+            AliasMemDefs[V] = SU;
+          else
+            NonAliasMemDefs[V] = SU;
+        }
+        // Handle the uses in MemUses, if there are any.
+        MapVector<const Value *, std::vector<SUnit *> >::iterator J =
+          ((ThisMayAlias) ? AliasMemUses.find(V) : NonAliasMemUses.find(V));
+        MapVector<const Value *, std::vector<SUnit *> >::iterator JE =
+          ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
+        if (J != JE) {
+          for (unsigned i = 0, e = J->second.size(); i != e; ++i)
+            addChainDependency(AA, MFI, SU, J->second[i], RejectMemNodes,
+                               TrueMemOrderLatency, true);
+          J->second.clear();
+        }
+      }
+      if (MayAlias) {
+        // Add dependencies from all the PendingLoads, i.e. loads
+        // with no underlying object.
+        for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
+          addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+                             TrueMemOrderLatency);
+        // Add dependence on alias chain, if needed.
+        if (AliasChain)
+          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+        // But we also should check dependent instructions for the
+        // SU in question.
+        adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                        TrueMemOrderLatency);
+      }
+      // Add dependence on barrier chain, if needed.
+      // There is no point to check aliasing on barrier event. Even if
+      // SU and barrier _could_ be reordered, they should not. In addition,
+      // we have lost all RejectMemNodes below barrier.
+      if (BarrierChain)
+        BarrierChain->addPred(SDep(SU, SDep::Barrier));
+
+      if (!ExitSU.isPred(SU))
+        // Push store's up a bit to avoid them getting in between cmp
+        // and branches.
+        ExitSU.addPred(SDep(SU, SDep::Artificial));
+    } else if (MI->mayLoad()) {
+      bool MayAlias = true;
+      if (MI->isInvariantLoad(AA)) {
+        // Invariant load, no chain dependencies needed!
+      } else {
+        SmallVector<std::pair<const Value *, bool>, 4> Objs;
+        getUnderlyingObjectsForInstr(MI, MFI, Objs);
+
+        if (Objs.empty()) {
+          // A load with no underlying object. Depend on all
+          // potentially aliasing stores.
+          for (MapVector<const Value *, SUnit *>::iterator I =
+                 AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
+            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+
+          PendingLoads.push_back(SU);
+          MayAlias = true;
+        } else {
+          MayAlias = false;
+        }
+
+        for (SmallVector<std::pair<const Value *, bool>, 4>::iterator
+             J = Objs.begin(), JE = Objs.end(); J != JE; ++J) {
+          const Value *V = J->first;
+          bool ThisMayAlias = J->second;
+
+          if (ThisMayAlias)
+            MayAlias = true;
+
+          // A load from a specific PseudoSourceValue. Add precise dependencies.
+          MapVector<const Value *, SUnit *>::iterator I =
+            ((ThisMayAlias) ? AliasMemDefs.find(V) : NonAliasMemDefs.find(V));
+          MapVector<const Value *, SUnit *>::iterator IE =
+            ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
+          if (I != IE)
+            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+          if (ThisMayAlias)
+            AliasMemUses[V].push_back(SU);
+          else
+            NonAliasMemUses[V].push_back(SU);
+        }
+        if (MayAlias)
+          adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes, /*Latency=*/0);
+        // Add dependencies on alias and barrier chains, if needed.
+        if (MayAlias && AliasChain)
+          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+        if (BarrierChain)
+          BarrierChain->addPred(SDep(SU, SDep::Barrier));
+      }
+    }
+  }
+  if (DbgMI)
+    FirstDbgValue = DbgMI;
+
+  Defs.clear();
+  Uses.clear();
+  VRegDefs.clear();
+  PendingLoads.clear();
+}
+
+void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  SU->getInstr()->dump();
+#endif
+}
+
+std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
+  std::string s;
+  raw_string_ostream oss(s);
+  if (SU == &EntrySU)
+    oss << "<entry>";
+  else if (SU == &ExitSU)
+    oss << "<exit>";
+  else
+    SU->getInstr()->print(oss, &TM, /*SkipOpers=*/true);
+  return oss.str();
+}
+
+/// Return the basic block label. It is not necessarilly unique because a block
+/// contains multiple scheduling regions. But it is fine for visualization.
+std::string ScheduleDAGInstrs::getDAGName() const {
+  return "dag." + BB->getFullName();
+}
+
+//===----------------------------------------------------------------------===//
+// SchedDFSResult Implementation
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+/// \brief Internal state used to compute SchedDFSResult.
+class SchedDFSImpl {
+  SchedDFSResult &R;
+
+  /// Join DAG nodes into equivalence classes by their subtree.
+  IntEqClasses SubtreeClasses;
+  /// List PredSU, SuccSU pairs that represent data edges between subtrees.
+  std::vector<std::pair<const SUnit*, const SUnit*> > ConnectionPairs;
+
+  struct RootData {
+    unsigned NodeID;
+    unsigned ParentNodeID;  // Parent node (member of the parent subtree).
+    unsigned SubInstrCount; // Instr count in this tree only, not children.
+
+    RootData(unsigned id): NodeID(id),
+                           ParentNodeID(SchedDFSResult::InvalidSubtreeID),
+                           SubInstrCount(0) {}
+
+    unsigned getSparseSetIndex() const { return NodeID; }
+  };
+
+  SparseSet<RootData> RootSet;
+
+public:
+  SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
+    RootSet.setUniverse(R.DFSNodeData.size());
+  }
+
+  /// Return true if this node been visited by the DFS traversal.
+  ///
+  /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
+  /// ID. Later, SubtreeID is updated but remains valid.
+  bool isVisited(const SUnit *SU) const {
+    return R.DFSNodeData[SU->NodeNum].SubtreeID
+      != SchedDFSResult::InvalidSubtreeID;
+  }
+
+  /// Initialize this node's instruction count. We don't need to flag the node
+  /// visited until visitPostorder because the DAG cannot have cycles.
+  void visitPreorder(const SUnit *SU) {
+    R.DFSNodeData[SU->NodeNum].InstrCount =
+      SU->getInstr()->isTransient() ? 0 : 1;
+  }
+
+  /// Called once for each node after all predecessors are visited. Revisit this
+  /// node's predecessors and potentially join them now that we know the ILP of
+  /// the other predecessors.
+  void visitPostorderNode(const SUnit *SU) {
+    // Mark this node as the root of a subtree. It may be joined with its
+    // successors later.
+    R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
+    RootData RData(SU->NodeNum);
+    RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;
+
+    // If any predecessors are still in their own subtree, they either cannot be
+    // joined or are large enough to remain separate. If this parent node's
+    // total instruction count is not greater than a child subtree by at least
+    // the subtree limit, then try to join it now since splitting subtrees is
+    // only useful if multiple high-pressure paths are possible.
+    unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
+    for (SUnit::const_pred_iterator
+           PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
+      if (PI->getKind() != SDep::Data)
+        continue;
+      unsigned PredNum = PI->getSUnit()->NodeNum;
+      if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
+        joinPredSubtree(*PI, SU, /*CheckLimit=*/false);
+
+      // Either link or merge the TreeData entry from the child to the parent.
+      if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
+        // If the predecessor's parent is invalid, this is a tree edge and the
+        // current node is the parent.
+        if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
+          RootSet[PredNum].ParentNodeID = SU->NodeNum;
+      }
+      else if (RootSet.count(PredNum)) {
+        // The predecessor is not a root, but is still in the root set. This
+        // must be the new parent that it was just joined to. Note that
+        // RootSet[PredNum].ParentNodeID may either be invalid or may still be
+        // set to the original parent.
+        RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
+        RootSet.erase(PredNum);
+      }
+    }
+    RootSet[SU->NodeNum] = RData;
+  }
+
+  /// Called once for each tree edge after calling visitPostOrderNode on the
+  /// predecessor. Increment the parent node's instruction count and
+  /// preemptively join this subtree to its parent's if it is small enough.
+  void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
+    R.DFSNodeData[Succ->NodeNum].InstrCount
+      += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
+    joinPredSubtree(PredDep, Succ);
+  }
+
+  /// Add a connection for cross edges.
+  void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
+    ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
+  }
+
+  /// Set each node's subtree ID to the representative ID and record connections
+  /// between trees.
+  void finalize() {
+    SubtreeClasses.compress();
+    R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
+    assert(SubtreeClasses.getNumClasses() == RootSet.size()
+           && "number of roots should match trees");
+    for (SparseSet<RootData>::const_iterator
+           RI = RootSet.begin(), RE = RootSet.end(); RI != RE; ++RI) {
+      unsigned TreeID = SubtreeClasses[RI->NodeID];
+      if (RI->ParentNodeID != SchedDFSResult::InvalidSubtreeID)
+        R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[RI->ParentNodeID];
+      R.DFSTreeData[TreeID].SubInstrCount = RI->SubInstrCount;
+      // Note that SubInstrCount may be greater than InstrCount if we joined
+      // subtrees across a cross edge. InstrCount will be attributed to the
+      // original parent, while SubInstrCount will be attributed to the joined
+      // parent.
+    }
+    R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
+    R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
+    DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
+    for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
+      R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
+      DEBUG(dbgs() << "  SU(" << Idx << ") in tree "
+            << R.DFSNodeData[Idx].SubtreeID << '\n');
+    }
+    for (std::vector<std::pair<const SUnit*, const SUnit*> >::const_iterator
+           I = ConnectionPairs.begin(), E = ConnectionPairs.end();
+         I != E; ++I) {
+      unsigned PredTree = SubtreeClasses[I->first->NodeNum];
+      unsigned SuccTree = SubtreeClasses[I->second->NodeNum];
+      if (PredTree == SuccTree)
+        continue;
+      unsigned Depth = I->first->getDepth();
+      addConnection(PredTree, SuccTree, Depth);
+      addConnection(SuccTree, PredTree, Depth);
+    }
+  }
+
+protected:
+  /// Join the predecessor subtree with the successor that is its DFS
+  /// parent. Apply some heuristics before joining.
+  bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
+                       bool CheckLimit = true) {
+    assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");
+
+    // Check if the predecessor is already joined.
+    const SUnit *PredSU = PredDep.getSUnit();
+    unsigned PredNum = PredSU->NodeNum;
+    if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
+      return false;
+
+    // Four is the magic number of successors before a node is considered a
+    // pinch point.
+    unsigned NumDataSucs = 0;
+    for (SUnit::const_succ_iterator SI = PredSU->Succs.begin(),
+           SE = PredSU->Succs.end(); SI != SE; ++SI) {
+      if (SI->getKind() == SDep::Data) {
+        if (++NumDataSucs >= 4)
+          return false;
+      }
+    }
+    if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
+      return false;
+    R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
+    SubtreeClasses.join(Succ->NodeNum, PredNum);
+    return true;
+  }
+
+  /// Called by finalize() to record a connection between trees.
+  void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
+    if (!Depth)
+      return;
+
+    do {
+      SmallVectorImpl<SchedDFSResult::Connection> &Connections =
+        R.SubtreeConnections[FromTree];
+      for (SmallVectorImpl<SchedDFSResult::Connection>::iterator
+             I = Connections.begin(), E = Connections.end(); I != E; ++I) {
+        if (I->TreeID == ToTree) {
+          I->Level = std::max(I->Level, Depth);
+          return;
+        }
+      }
+      Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
+      FromTree = R.DFSTreeData[FromTree].ParentTreeID;
+    } while (FromTree != SchedDFSResult::InvalidSubtreeID);
+  }
+};
+} // namespace llvm
+
+namespace {
+/// \brief Manage the stack used by a reverse depth-first search over the DAG.
+class SchedDAGReverseDFS {
+  std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack;
+public:
+  bool isComplete() const { return DFSStack.empty(); }
+
+  void follow(const SUnit *SU) {
+    DFSStack.push_back(std::make_pair(SU, SU->Preds.begin()));
+  }
+  void advance() { ++DFSStack.back().second; }
+
+  const SDep *backtrack() {
+    DFSStack.pop_back();
+    return DFSStack.empty() ? 0 : llvm::prior(DFSStack.back().second);
+  }
+
+  const SUnit *getCurr() const { return DFSStack.back().first; }
+
+  SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }
+
+  SUnit::const_pred_iterator getPredEnd() const {
+    return getCurr()->Preds.end();
+  }
+};
+} // anonymous
+
+static bool hasDataSucc(const SUnit *SU) {
+  for (SUnit::const_succ_iterator
+         SI = SU->Succs.begin(), SE = SU->Succs.end(); SI != SE; ++SI) {
+    if (SI->getKind() == SDep::Data && !SI->getSUnit()->isBoundaryNode())
+      return true;
+  }
+  return false;
+}
+
+/// Compute an ILP metric for all nodes in the subDAG reachable via depth-first
+/// search from this root.
+void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
+  if (!IsBottomUp)
+    llvm_unreachable("Top-down ILP metric is unimplemnted");
+
+  SchedDFSImpl Impl(*this);
+  for (ArrayRef<SUnit>::const_iterator
+         SI = SUnits.begin(), SE = SUnits.end(); SI != SE; ++SI) {
+    const SUnit *SU = &*SI;
+    if (Impl.isVisited(SU) || hasDataSucc(SU))
+      continue;
+
+    SchedDAGReverseDFS DFS;
+    Impl.visitPreorder(SU);
+    DFS.follow(SU);
+    for (;;) {
+      // Traverse the leftmost path as far as possible.
+      while (DFS.getPred() != DFS.getPredEnd()) {
+        const SDep &PredDep = *DFS.getPred();
+        DFS.advance();
+        // Ignore non-data edges.
+        if (PredDep.getKind() != SDep::Data
+            || PredDep.getSUnit()->isBoundaryNode()) {
+          continue;
+        }
+        // An already visited edge is a cross edge, assuming an acyclic DAG.
+        if (Impl.isVisited(PredDep.getSUnit())) {
+          Impl.visitCrossEdge(PredDep, DFS.getCurr());
+          continue;
+        }
+        Impl.visitPreorder(PredDep.getSUnit());
+        DFS.follow(PredDep.getSUnit());
+      }
+      // Visit the top of the stack in postorder and backtrack.
+      const SUnit *Child = DFS.getCurr();
+      const SDep *PredDep = DFS.backtrack();
+      Impl.visitPostorderNode(Child);
+      if (PredDep)
+        Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
+      if (DFS.isComplete())
+        break;
+    }
+  }
+  Impl.finalize();
+}
+
+/// The root of the given SubtreeID was just scheduled. For all subtrees
+/// connected to this tree, record the depth of the connection so that the
+/// nearest connected subtrees can be prioritized.
+void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
+  for (SmallVectorImpl<Connection>::const_iterator
+         I = SubtreeConnections[SubtreeID].begin(),
+         E = SubtreeConnections[SubtreeID].end(); I != E; ++I) {
+    SubtreeConnectLevels[I->TreeID] =
+      std::max(SubtreeConnectLevels[I->TreeID], I->Level);
+    DEBUG(dbgs() << "  Tree: " << I->TreeID
+          << " @" << SubtreeConnectLevels[I->TreeID] << '\n');
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ILPValue::print(raw_ostream &OS) const {
+  OS << InstrCount << " / " << Length << " = ";
+  if (!Length)
+    OS << "BADILP";
+  else
+    OS << format("%g", ((double)InstrCount / Length));
+}
+
+void ILPValue::dump() const {
+  dbgs() << *this << '\n';
+}
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
+  Val.print(OS);
+  return OS;
+}
+
+} // namespace llvm
+#endif // !NDEBUG || LLVM_ENABLE_DUMP
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
new file mode 100644
index 000000000000..8ddb3e892f25
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -0,0 +1,100 @@
+//===-- ScheduleDAGPrinter.cpp - Implement ScheduleDAG::viewGraph() -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <fstream>
+using namespace llvm;
+
+namespace llvm {
+  template<>
+  struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits {
+
+  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+    static std::string getGraphName(const ScheduleDAG *G) {
+      return G->MF.getName();
+    }
+
+    static bool renderGraphFromBottomUp() {
+      return true;
+    }
+
+    static bool isNodeHidden(const SUnit *Node) {
+      return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+    }
+
+    static bool hasNodeAddressLabel(const SUnit *Node,
+                                    const ScheduleDAG *Graph) {
+      return true;
+    }
+
+    /// If you want to override the dot attributes printed for a particular
+    /// edge, override this method.
+    static std::string getEdgeAttributes(const SUnit *Node,
+                                         SUnitIterator EI,
+                                         const ScheduleDAG *Graph) {
+      if (EI.isArtificialDep())
+        return "color=cyan,style=dashed";
+      if (EI.isCtrlDep())
+        return "color=blue,style=dashed";
+      return "";
+    }
+
+
+    std::string getNodeLabel(const SUnit *Node, const ScheduleDAG *Graph);
+    static std::string getNodeAttributes(const SUnit *N,
+                                         const ScheduleDAG *Graph) {
+      return "shape=Mrecord";
+    }
+
+    static void addCustomGraphFeatures(ScheduleDAG *G,
+                                       GraphWriter<ScheduleDAG*> &GW) {
+      return G->addCustomGraphFeatures(GW);
+    }
+  };
+}
+
+std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU,
+                                                       const ScheduleDAG *G) {
+  return G->getGraphNodeLabel(SU);
+}
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ScheduleDAG::viewGraph(const Twine &Name, const Twine &Title) {
+  // This code is only for debugging!
+#ifndef NDEBUG
+  ViewGraph(this, Name, false, Title);
+#else
+  errs() << "ScheduleDAG::viewGraph is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif  // NDEBUG
+}
+
+/// Out-of-line implementation with no arguments is handy for gdb.
+void ScheduleDAG::viewGraph() {
+  viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
+}
diff --git a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
new file mode 100644
index 000000000000..2cd84d670aaa
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -0,0 +1,248 @@
+//===----- ScoreboardHazardRecognizer.cpp - Scheduler Support -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScoreboardHazardRecognizer class, which
+// encapsultes hazard-avoidance heuristics for scheduling, based on the
+// scheduling itineraries specified for the target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE ::llvm::ScoreboardHazardRecognizer::DebugType
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#ifndef NDEBUG
+const char *ScoreboardHazardRecognizer::DebugType = "";
+#endif
+
+ScoreboardHazardRecognizer::
+ScoreboardHazardRecognizer(const InstrItineraryData *II,
+                           const ScheduleDAG *SchedDAG,
+                           const char *ParentDebugType) :
+  ScheduleHazardRecognizer(), ItinData(II), DAG(SchedDAG), IssueWidth(0),
+  IssueCount(0) {
+
+#ifndef NDEBUG
+  DebugType = ParentDebugType;
+#endif
+
+  // Determine the maximum depth of any itinerary. This determines the depth of
+  // the scoreboard. We always make the scoreboard at least 1 cycle deep to
+  // avoid dealing with the boundary condition.
+  unsigned ScoreboardDepth = 1;
+  if (ItinData && !ItinData->isEmpty()) {
+    for (unsigned idx = 0; ; ++idx) {
+      if (ItinData->isEndMarker(idx))
+        break;
+
+      const InstrStage *IS = ItinData->beginStage(idx);
+      const InstrStage *E = ItinData->endStage(idx);
+      unsigned CurCycle = 0;
+      unsigned ItinDepth = 0;
+      for (; IS != E; ++IS) {
+        unsigned StageDepth = CurCycle + IS->getCycles();
+        if (ItinDepth < StageDepth) ItinDepth = StageDepth;
+        CurCycle += IS->getNextCycles();
+      }
+
+      // Find the next power-of-2 >= ItinDepth
+      while (ItinDepth > ScoreboardDepth) {
+        ScoreboardDepth *= 2;
+        // Don't set MaxLookAhead until we find at least one nonzero stage.
+        // This way, an itinerary with no stages has MaxLookAhead==0, which
+        // completely bypasses the scoreboard hazard logic.
+        MaxLookAhead = ScoreboardDepth;
+      }
+    }
+  }
+
+  ReservedScoreboard.reset(ScoreboardDepth);
+  RequiredScoreboard.reset(ScoreboardDepth);
+
+  // If MaxLookAhead is not set above, then we are not enabled.
+  if (!isEnabled())
+    DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n");
+  else {
+    // A nonempty itinerary must have a SchedModel.
+    IssueWidth = ItinData->SchedModel->IssueWidth;
+    DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
+          << ScoreboardDepth << '\n');
+  }
+}
+
+void ScoreboardHazardRecognizer::Reset() {
+  IssueCount = 0;
+  RequiredScoreboard.reset();
+  ReservedScoreboard.reset();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ScoreboardHazardRecognizer::Scoreboard::dump() const {
+  dbgs() << "Scoreboard:\n";
+
+  unsigned last = Depth - 1;
+  while ((last > 0) && ((*this)[last] == 0))
+    last--;
+
+  for (unsigned i = 0; i <= last; i++) {
+    unsigned FUs = (*this)[i];
+    dbgs() << "\t";
+    for (int j = 31; j >= 0; j--)
+      dbgs() << ((FUs & (1 << j)) ? '1' : '0');
+    dbgs() << '\n';
+  }
+}
+#endif
+
+bool ScoreboardHazardRecognizer::atIssueLimit() const {
+  if (IssueWidth == 0)
+    return false;
+
+  return IssueCount == IssueWidth;
+}
+
+ScheduleHazardRecognizer::HazardType
+ScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  if (!ItinData || ItinData->isEmpty())
+    return NoHazard;
+
+  // Note that stalls will be negative for bottom-up scheduling.
+  int cycle = Stalls;
+
+  // Use the itinerary for the underlying instruction to check for
+  // free FU's in the scoreboard at the appropriate future cycles.
+
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  if (MCID == NULL) {
+    // Don't check hazards for non-machineinstr Nodes.
+    return NoHazard;
+  }
+  unsigned idx = MCID->getSchedClass();
+  for (const InstrStage *IS = ItinData->beginStage(idx),
+         *E = ItinData->endStage(idx); IS != E; ++IS) {
+    // We must find one of the stage's units free for every cycle the
+    // stage is occupied. FIXME it would be more accurate to find the
+    // same unit free in all the cycles.
+    for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+      int StageCycle = cycle + (int)i;
+      if (StageCycle < 0)
+        continue;
+
+      if (StageCycle >= (int)RequiredScoreboard.getDepth()) {
+        assert((StageCycle - Stalls) < (int)RequiredScoreboard.getDepth() &&
+               "Scoreboard depth exceeded!");
+        // This stage was stalled beyond pipeline depth, so cannot conflict.
+        break;
+      }
+
+      unsigned freeUnits = IS->getUnits();
+      switch (IS->getReservationKind()) {
+      case InstrStage::Required:
+        // Required FUs conflict with both reserved and required ones
+        freeUnits &= ~ReservedScoreboard[StageCycle];
+        // FALLTHROUGH
+      case InstrStage::Reserved:
+        // Reserved FUs can conflict only with required ones.
+        freeUnits &= ~RequiredScoreboard[StageCycle];
+        break;
+      }
+
+      if (!freeUnits) {
+        DEBUG(dbgs() << "*** Hazard in cycle +" << StageCycle << ", ");
+        DEBUG(dbgs() << "SU(" << SU->NodeNum << "): ");
+        DEBUG(DAG->dumpNode(SU));
+        return Hazard;
+      }
+    }
+
+    // Advance the cycle to the next stage.
+    cycle += IS->getNextCycles();
+  }
+
+  return NoHazard;
+}
+
+void ScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) {
+  if (!ItinData || ItinData->isEmpty())
+    return;
+
+  // Use the itinerary for the underlying instruction to reserve FU's
+  // in the scoreboard at the appropriate future cycles.
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  assert(MCID && "The scheduler must filter non-machineinstrs");
+  if (DAG->TII->isZeroCost(MCID->Opcode))
+    return;
+
+  ++IssueCount;
+
+  unsigned cycle = 0;
+
+  unsigned idx = MCID->getSchedClass();
+  for (const InstrStage *IS = ItinData->beginStage(idx),
+         *E = ItinData->endStage(idx); IS != E; ++IS) {
+    // We must reserve one of the stage's units for every cycle the
+    // stage is occupied. FIXME it would be more accurate to reserve
+    // the same unit free in all the cycles.
+    for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+      assert(((cycle + i) < RequiredScoreboard.getDepth()) &&
+             "Scoreboard depth exceeded!");
+
+      unsigned freeUnits = IS->getUnits();
+      switch (IS->getReservationKind()) {
+      case InstrStage::Required:
+        // Required FUs conflict with both reserved and required ones
+        freeUnits &= ~ReservedScoreboard[cycle + i];
+        // FALLTHROUGH
+      case InstrStage::Reserved:
+        // Reserved FUs can conflict only with required ones.
+        freeUnits &= ~RequiredScoreboard[cycle + i];
+        break;
+      }
+
+      // reduce to a single unit
+      unsigned freeUnit = 0;
+      do {
+        freeUnit = freeUnits;
+        freeUnits = freeUnit & (freeUnit - 1);
+      } while (freeUnits);
+
+      if (IS->getReservationKind() == InstrStage::Required)
+        RequiredScoreboard[cycle + i] |= freeUnit;
+      else
+        ReservedScoreboard[cycle + i] |= freeUnit;
+    }
+
+    // Advance the cycle to the next stage.
+    cycle += IS->getNextCycles();
+  }
+
+  DEBUG(ReservedScoreboard.dump());
+  DEBUG(RequiredScoreboard.dump());
+}
+
+void ScoreboardHazardRecognizer::AdvanceCycle() {
+  IssueCount = 0;
+  ReservedScoreboard[0] = 0; ReservedScoreboard.advance();
+  RequiredScoreboard[0] = 0; RequiredScoreboard.advance();
+}
+
+void ScoreboardHazardRecognizer::RecedeCycle() {
+  IssueCount = 0;
+  ReservedScoreboard[ReservedScoreboard.getDepth()-1] = 0;
+  ReservedScoreboard.recede();
+  RequiredScoreboard[RequiredScoreboard.getDepth()-1] = 0;
+  RequiredScoreboard.recede();
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
new file mode 100644
index 000000000000..eb1609575016
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -0,0 +1,10214 @@
+//===-- DAGCombiner.cpp - Implement a DAG node combiner -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass combines dag nodes to form fewer, simpler DAG nodes.  It can be run
+// both before and after the DAG is legalized.
+//
+// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
+// primarily intended to handle simplification opportunities that are implicit
+// in the LLVM IR and exposed by the various codegen lowering phases.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dagcombine"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NodesCombined   , "Number of dag nodes combined");
+STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
+STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
+STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
+STATISTIC(LdStFP2Int      , "Number of fp load/store pairs transformed to int");
+
+namespace {
+  static cl::opt<bool>
+    CombinerAA("combiner-alias-analysis", cl::Hidden,
+               cl::desc("Turn on alias analysis during testing"));
+
+  static cl::opt<bool>
+    CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
+               cl::desc("Include global information in alias analysis"));
+
+//------------------------------ DAGCombiner ---------------------------------//
+
+  class DAGCombiner {
+    SelectionDAG &DAG;
+    const TargetLowering &TLI;
+    CombineLevel Level;
+    CodeGenOpt::Level OptLevel;
+    bool LegalOperations;
+    bool LegalTypes;
+
+    // Worklist of all of the nodes that need to be simplified.
+    //
+    // This has the semantics that when adding to the worklist,
+    // the item added must be next to be processed. It should
+    // also only appear once. The naive approach to this takes
+    // linear time.
+    //
+    // To reduce the insert/remove time to logarithmic, we use
+    // a set and a vector to maintain our worklist.
+    //
+    // The set contains the items on the worklist, but does not
+    // maintain the order they should be visited.
+    //
+    // The vector maintains the order nodes should be visited, but may
+    // contain duplicate or removed nodes. When choosing a node to
+    // visit, we pop off the order stack until we find an item that is
+    // also in the contents set. All operations are O(log N).
+    SmallPtrSet<SDNode*, 64> WorkListContents;
+    SmallVector<SDNode*, 64> WorkListOrder;
+
+    // AA - Used for DAG load/store alias analysis.
+    AliasAnalysis &AA;
+
+    /// AddUsersToWorkList - When an instruction is simplified, add all users of
+    /// the instruction to the work lists because they might get more simplified
+    /// now.
+    ///
+    void AddUsersToWorkList(SDNode *N) {
+      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+           UI != UE; ++UI)
+        AddToWorkList(*UI);
+    }
+
+    /// visit - call the node-specific routine that knows how to fold each
+    /// particular type of node.
+    SDValue visit(SDNode *N);
+
+  public:
+    /// AddToWorkList - Add to the work list making sure its instance is at the
+    /// back (next to be processed.)
+    void AddToWorkList(SDNode *N) {
+      WorkListContents.insert(N);
+      WorkListOrder.push_back(N);
+    }
+
+    /// removeFromWorkList - remove all instances of N from the worklist.
+    ///
+    void removeFromWorkList(SDNode *N) {
+      WorkListContents.erase(N);
+    }
+
+    SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+                      bool AddTo = true);
+
+    SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
+      return CombineTo(N, &Res, 1, AddTo);
+    }
+
+    SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
+                      bool AddTo = true) {
+      SDValue To[] = { Res0, Res1 };
+      return CombineTo(N, To, 2, AddTo);
+    }
+
+    void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
+
+  private:
+
+    /// SimplifyDemandedBits - Check the specified integer node value to see if
+    /// it can be simplified or if things it uses can be simplified by bit
+    /// propagation.  If so, return true.
+    bool SimplifyDemandedBits(SDValue Op) {
+      unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+      APInt Demanded = APInt::getAllOnesValue(BitWidth);
+      return SimplifyDemandedBits(Op, Demanded);
+    }
+
+    bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);
+
+    bool CombineToPreIndexedLoadStore(SDNode *N);
+    bool CombineToPostIndexedLoadStore(SDNode *N);
+
+    void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
+    SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
+    SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
+    SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
+    SDValue PromoteIntBinOp(SDValue Op);
+    SDValue PromoteIntShiftOp(SDValue Op);
+    SDValue PromoteExtend(SDValue Op);
+    bool PromoteLoad(SDValue Op);
+
+    void ExtendSetCCUses(SmallVector<SDNode*, 4> SetCCs,
+                         SDValue Trunc, SDValue ExtLoad, DebugLoc DL,
+                         ISD::NodeType ExtType);
+
+    /// combine - call the node-specific routine that knows how to fold each
+    /// particular type of node. If that doesn't do anything, try the
+    /// target-specific DAG combines.
+    SDValue combine(SDNode *N);
+
+    // Visitation implementation - Implement dag node combining for different
+    // node types.  The semantics are as follows:
+    // Return Value:
+    //   SDValue.getNode() == 0 - No change was made
+    //   SDValue.getNode() == N - N was replaced, is dead and has been handled.
+    //   otherwise              - N should be replaced by the returned Operand.
+    //
+    SDValue visitTokenFactor(SDNode *N);
+    SDValue visitMERGE_VALUES(SDNode *N);
+    SDValue visitADD(SDNode *N);
+    SDValue visitSUB(SDNode *N);
+    SDValue visitADDC(SDNode *N);
+    SDValue visitSUBC(SDNode *N);
+    SDValue visitADDE(SDNode *N);
+    SDValue visitSUBE(SDNode *N);
+    SDValue visitMUL(SDNode *N);
+    SDValue visitSDIV(SDNode *N);
+    SDValue visitUDIV(SDNode *N);
+    SDValue visitSREM(SDNode *N);
+    SDValue visitUREM(SDNode *N);
+    SDValue visitMULHU(SDNode *N);
+    SDValue visitMULHS(SDNode *N);
+    SDValue visitSMUL_LOHI(SDNode *N);
+    SDValue visitUMUL_LOHI(SDNode *N);
+    SDValue visitSMULO(SDNode *N);
+    SDValue visitUMULO(SDNode *N);
+    SDValue visitSDIVREM(SDNode *N);
+    SDValue visitUDIVREM(SDNode *N);
+    SDValue visitAND(SDNode *N);
+    SDValue visitOR(SDNode *N);
+    SDValue visitXOR(SDNode *N);
+    SDValue SimplifyVBinOp(SDNode *N);
+    SDValue SimplifyVUnaryOp(SDNode *N);
+    SDValue visitSHL(SDNode *N);
+    SDValue visitSRA(SDNode *N);
+    SDValue visitSRL(SDNode *N);
+    SDValue visitCTLZ(SDNode *N);
+    SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
+    SDValue visitCTTZ(SDNode *N);
+    SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
+    SDValue visitCTPOP(SDNode *N);
+    SDValue visitSELECT(SDNode *N);
+    SDValue visitSELECT_CC(SDNode *N);
+    SDValue visitSETCC(SDNode *N);
+    SDValue visitSIGN_EXTEND(SDNode *N);
+    SDValue visitZERO_EXTEND(SDNode *N);
+    SDValue visitANY_EXTEND(SDNode *N);
+    SDValue visitSIGN_EXTEND_INREG(SDNode *N);
+    SDValue visitTRUNCATE(SDNode *N);
+    SDValue visitBITCAST(SDNode *N);
+    SDValue visitBUILD_PAIR(SDNode *N);
+    SDValue visitFADD(SDNode *N);
+    SDValue visitFSUB(SDNode *N);
+    SDValue visitFMUL(SDNode *N);
+    SDValue visitFMA(SDNode *N);
+    SDValue visitFDIV(SDNode *N);
+    SDValue visitFREM(SDNode *N);
+    SDValue visitFCOPYSIGN(SDNode *N);
+    SDValue visitSINT_TO_FP(SDNode *N);
+    SDValue visitUINT_TO_FP(SDNode *N);
+    SDValue visitFP_TO_SINT(SDNode *N);
+    SDValue visitFP_TO_UINT(SDNode *N);
+    SDValue visitFP_ROUND(SDNode *N);
+    SDValue visitFP_ROUND_INREG(SDNode *N);
+    SDValue visitFP_EXTEND(SDNode *N);
+    SDValue visitFNEG(SDNode *N);
+    SDValue visitFABS(SDNode *N);
+    SDValue visitFCEIL(SDNode *N);
+    SDValue visitFTRUNC(SDNode *N);
+    SDValue visitFFLOOR(SDNode *N);
+    SDValue visitBRCOND(SDNode *N);
+    SDValue visitBR_CC(SDNode *N);
+    SDValue visitLOAD(SDNode *N);
+    SDValue visitSTORE(SDNode *N);
+    SDValue visitINSERT_VECTOR_ELT(SDNode *N);
+    SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
+    SDValue visitBUILD_VECTOR(SDNode *N);
+    SDValue visitCONCAT_VECTORS(SDNode *N);
+    SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
+    SDValue visitVECTOR_SHUFFLE(SDNode *N);
+    SDValue visitMEMBARRIER(SDNode *N);
+
+    SDValue XformToShuffleWithZero(SDNode *N);
+    SDValue ReassociateOps(unsigned Opc, DebugLoc DL, SDValue LHS, SDValue RHS);
+
+    SDValue visitShiftByConstant(SDNode *N, unsigned Amt);
+
+    bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
+    SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
+    SDValue SimplifySelect(DebugLoc DL, SDValue N0, SDValue N1, SDValue N2);
+    SDValue SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1, SDValue N2,
+                             SDValue N3, ISD::CondCode CC,
+                             bool NotExtCompare = false);
+    SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
+                          DebugLoc DL, bool foldBooleans = true);
+    SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+                                         unsigned HiOp);
+    SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+    SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
+    SDValue BuildSDIV(SDNode *N);
+    SDValue BuildUDIV(SDNode *N);
+    SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
+                               bool DemandHighBits = true);
+    SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
+    SDNode *MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL);
+    SDValue ReduceLoadWidth(SDNode *N);
+    SDValue ReduceLoadOpStoreWidth(SDNode *N);
+    SDValue TransformFPLoadStorePair(SDNode *N);
+    SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
+    SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
+
+    SDValue GetDemandedBits(SDValue V, const APInt &Mask);
+
+    /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
+    /// looking for aliasing nodes and adding them to the Aliases vector.
+    void GatherAllAliases(SDNode *N, SDValue OriginalChain,
+                          SmallVector<SDValue, 8> &Aliases);
+
+    /// isAlias - Return true if there is any possibility that the two addresses
+    /// overlap.
+    bool isAlias(SDValue Ptr1, int64_t Size1,
+                 const Value *SrcValue1, int SrcValueOffset1,
+                 unsigned SrcValueAlign1,
+                 const MDNode *TBAAInfo1,
+                 SDValue Ptr2, int64_t Size2,
+                 const Value *SrcValue2, int SrcValueOffset2,
+                 unsigned SrcValueAlign2,
+                 const MDNode *TBAAInfo2) const;
+
+    /// isAlias - Return true if there is any possibility that the two addresses
+    /// overlap.
+    bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1);
+
+    /// FindAliasInfo - Extracts the relevant alias information from the memory
+    /// node.  Returns true if the operand was a load.
+    bool FindAliasInfo(SDNode *N,
+                       SDValue &Ptr, int64_t &Size,
+                       const Value *&SrcValue, int &SrcValueOffset,
+                       unsigned &SrcValueAlignment,
+                       const MDNode *&TBAAInfo) const;
+
+    /// FindBetterChain - Walk up chain skipping non-aliasing memory nodes,
+    /// looking for a better chain (aliasing node.)
+    SDValue FindBetterChain(SDNode *N, SDValue Chain);
+
+    /// Merge consecutive store operations into a wide store.
+    /// This optimization uses wide integers or vectors when possible.
+    /// \return True if some memory operations were changed.
+    bool MergeConsecutiveStores(StoreSDNode *N);
+
+  public:
+    DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
+      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
+        OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {}
+
+    /// Run - runs the dag combiner on all nodes in the work list
+    void Run(CombineLevel AtLevel);
+
+    SelectionDAG &getDAG() const { return DAG; }
+
+    /// getShiftAmountTy - Returns a type large enough to hold any valid
+    /// shift amount - before type legalization these can be huge.
+    EVT getShiftAmountTy(EVT LHSTy) {
+      return LegalTypes ? TLI.getShiftAmountTy(LHSTy) : TLI.getPointerTy();
+    }
+
+    /// isTypeLegal - This method returns true if we are running before type
+    /// legalization or if the specified VT is legal.
+    bool isTypeLegal(const EVT &VT) {
+      if (!LegalTypes) return true;
+      return TLI.isTypeLegal(VT);
+    }
+  };
+}
+
+
+namespace {
+/// WorkListRemover - This class is a DAGUpdateListener that removes any deleted
+/// nodes from the worklist.
+class WorkListRemover : public SelectionDAG::DAGUpdateListener {
+  DAGCombiner &DC;
+public:
+  explicit WorkListRemover(DAGCombiner &dc)
+    : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
+
+  virtual void NodeDeleted(SDNode *N, SDNode *E) {
+    DC.removeFromWorkList(N);
+  }
+};
+}
+
+//===----------------------------------------------------------------------===//
+//  TargetLowering::DAGCombinerInfo implementation
+//===----------------------------------------------------------------------===//
+
+void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
+  ((DAGCombiner*)DC)->AddToWorkList(N);
+}
+
+void TargetLowering::DAGCombinerInfo::RemoveFromWorklist(SDNode *N) {
+  ((DAGCombiner*)DC)->removeFromWorkList(N);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, const std::vector<SDValue> &To, bool AddTo) {
+  return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res, bool AddTo) {
+  return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
+}
+
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
+  return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
+}
+
+void TargetLowering::DAGCombinerInfo::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+  return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// isNegatibleForFree - Return 1 if we can compute the negated form of the
+/// specified expression for the same cost as the expression itself, or 2 if we
+/// can compute the negated form more cheaply than the expression itself.
+static char isNegatibleForFree(SDValue Op, bool LegalOperations,
+                               const TargetLowering &TLI,
+                               const TargetOptions *Options,
+                               unsigned Depth = 0) {
+  // fneg is removable even if it has multiple uses.
+  if (Op.getOpcode() == ISD::FNEG) return 2;
+
+  // Don't allow anything with multiple uses.
+  if (!Op.hasOneUse()) return 0;
+
+  // Don't recurse exponentially.
+  if (Depth > 6) return 0;
+
+  switch (Op.getOpcode()) {
+  default: return false;
+  case ISD::ConstantFP:
+    // Don't invert constant FP values after legalize.  The negated constant
+    // isn't necessarily legal.
+    return LegalOperations ? 0 : 1;
+  case ISD::FADD:
+    // FIXME: determine better conditions for this xform.
+    if (!Options->UnsafeFPMath) return 0;
+
+    // After operation legalization, it might not be legal to create new FSUBs.
+    if (LegalOperations &&
+        !TLI.isOperationLegalOrCustom(ISD::FSUB,  Op.getValueType()))
+      return 0;
+
+    // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
+    if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
+                                    Options, Depth + 1))
+      return V;
+    // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+    return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
+                              Depth + 1);
+  case ISD::FSUB:
+    // We can't turn -(A-B) into B-A when we honor signed zeros.
+    if (!Options->UnsafeFPMath) return 0;
+
+    // fold (fneg (fsub A, B)) -> (fsub B, A)
+    return 1;
+
+  case ISD::FMUL:
+  case ISD::FDIV:
+    if (Options->HonorSignDependentRoundingFPMath()) return 0;
+
+    // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
+    if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
+                                    Options, Depth + 1))
+      return V;
+
+    return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
+                              Depth + 1);
+
+  case ISD::FP_EXTEND:
+  case ISD::FP_ROUND:
+  case ISD::FSIN:
+    return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,
+                              Depth + 1);
+  }
+}
+
+/// GetNegatedExpression - If isNegatibleForFree returns true, this function
+/// returns the newly negated expression.
+static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
+                                    bool LegalOperations, unsigned Depth = 0) {
+  // fneg is removable even if it has multiple uses.
+  if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0);
+
+  // Don't allow anything with multiple uses.
+  assert(Op.hasOneUse() && "Unknown reuse!");
+
+  assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Unknown code");
+  case ISD::ConstantFP: {
+    APFloat V = cast<ConstantFPSDNode>(Op)->getValueAPF();
+    V.changeSign();
+    return DAG.getConstantFP(V, Op.getValueType());
+  }
+  case ISD::FADD:
+    // FIXME: determine better conditions for this xform.
+    assert(DAG.getTarget().Options.UnsafeFPMath);
+
+    // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
+    if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
+                           DAG.getTargetLoweringInfo(),
+                           &DAG.getTarget().Options, Depth+1))
+      return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+                         GetNegatedExpression(Op.getOperand(0), DAG,
+                                              LegalOperations, Depth+1),
+                         Op.getOperand(1));
+    // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+    return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+                       GetNegatedExpression(Op.getOperand(1), DAG,
+                                            LegalOperations, Depth+1),
+                       Op.getOperand(0));
+  case ISD::FSUB:
+    // We can't turn -(A-B) into B-A when we honor signed zeros.
+    assert(DAG.getTarget().Options.UnsafeFPMath);
+
+    // fold (fneg (fsub 0, B)) -> B
+    if (ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Op.getOperand(0)))
+      if (N0CFP->getValueAPF().isZero())
+        return Op.getOperand(1);
+
+    // fold (fneg (fsub A, B)) -> (fsub B, A)
+    return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(0));
+
+  case ISD::FMUL:
+  case ISD::FDIV:
+    assert(!DAG.getTarget().Options.HonorSignDependentRoundingFPMath());
+
+    // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
+    if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
+                           DAG.getTargetLoweringInfo(),
+                           &DAG.getTarget().Options, Depth+1))
+      return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+                         GetNegatedExpression(Op.getOperand(0), DAG,
+                                              LegalOperations, Depth+1),
+                         Op.getOperand(1));
+
+    // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
+    return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+                       Op.getOperand(0),
+                       GetNegatedExpression(Op.getOperand(1), DAG,
+                                            LegalOperations, Depth+1));
+
+  case ISD::FP_EXTEND:
+  case ISD::FSIN:
+    return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+                       GetNegatedExpression(Op.getOperand(0), DAG,
+                                            LegalOperations, Depth+1));
+  case ISD::FP_ROUND:
+      return DAG.getNode(ISD::FP_ROUND, Op.getDebugLoc(), Op.getValueType(),
+                         GetNegatedExpression(Op.getOperand(0), DAG,
+                                              LegalOperations, Depth+1),
+                         Op.getOperand(1));
+  }
+}
+
+
+// isSetCCEquivalent - Return true if this node is a setcc, or is a select_cc
+// that selects between the values 1 and 0, making it equivalent to a setcc.
+// Also, set the incoming LHS, RHS, and CC references to the appropriate
+// nodes based on the type of node we are checking.  This simplifies life a
+// bit for the callers.
+static bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
+                              SDValue &CC) {
+  if (N.getOpcode() == ISD::SETCC) {
+    LHS = N.getOperand(0);
+    RHS = N.getOperand(1);
+    CC  = N.getOperand(2);
+    return true;
+  }
+  if (N.getOpcode() == ISD::SELECT_CC &&
+      N.getOperand(2).getOpcode() == ISD::Constant &&
+      N.getOperand(3).getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(N.getOperand(2))->getAPIntValue() == 1 &&
+      cast<ConstantSDNode>(N.getOperand(3))->isNullValue()) {
+    LHS = N.getOperand(0);
+    RHS = N.getOperand(1);
+    CC  = N.getOperand(4);
+    return true;
+  }
+  return false;
+}
+
+// isOneUseSetCC - Return true if this is a SetCC-equivalent operation with only
+// one use.  If this is true, it allows the users to invert the operation for
+// free when it is profitable to do so.
+static bool isOneUseSetCC(SDValue N) {
+  SDValue N0, N1, N2;
+  if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse())
+    return true;
+  return false;
+}
+
+SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
+                                    SDValue N0, SDValue N1) {
+  EVT VT = N0.getValueType();
+  if (N0.getOpcode() == Opc && isa<ConstantSDNode>(N0.getOperand(1))) {
+    if (isa<ConstantSDNode>(N1)) {
+      // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
+      SDValue OpNode =
+        DAG.FoldConstantArithmetic(Opc, VT,
+                                   cast<ConstantSDNode>(N0.getOperand(1)),
+                                   cast<ConstantSDNode>(N1));
+      return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+    }
+    if (N0.hasOneUse()) {
+      // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one use
+      SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
+                                   N0.getOperand(0), N1);
+      AddToWorkList(OpNode.getNode());
+      return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
+    }
+  }
+
+  if (N1.getOpcode() == Opc && isa<ConstantSDNode>(N1.getOperand(1))) {
+    if (isa<ConstantSDNode>(N0)) {
+      // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
+      SDValue OpNode =
+        DAG.FoldConstantArithmetic(Opc, VT,
+                                   cast<ConstantSDNode>(N1.getOperand(1)),
+                                   cast<ConstantSDNode>(N0));
+      return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
+    }
+    if (N1.hasOneUse()) {
+      // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one use
+      SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
+                                   N1.getOperand(0), N0);
+      AddToWorkList(OpNode.getNode());
+      return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+                               bool AddTo) {
+  assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
+  ++NodesCombined;
+  DEBUG(dbgs() << "\nReplacing.1 ";
+        N->dump(&DAG);
+        dbgs() << "\nWith: ";
+        To[0].getNode()->dump(&DAG);
+        dbgs() << " and " << NumTo-1 << " other values\n";
+        for (unsigned i = 0, e = NumTo; i != e; ++i)
+          assert((!To[i].getNode() ||
+                  N->getValueType(i) == To[i].getValueType()) &&
+                 "Cannot combine value to value of different type!"));
+  WorkListRemover DeadNodes(*this);
+  DAG.ReplaceAllUsesWith(N, To);
+  if (AddTo) {
+    // Push the new nodes and any users onto the worklist
+    for (unsigned i = 0, e = NumTo; i != e; ++i) {
+      if (To[i].getNode()) {
+        AddToWorkList(To[i].getNode());
+        AddUsersToWorkList(To[i].getNode());
+      }
+    }
+  }
+
+  // Finally, if the node is now dead, remove it from the graph.  The node
+  // may not be dead if the replacement process recursively simplified to
+  // something else needing this node.
+  if (N->use_empty()) {
+    // Nodes can be reintroduced into the worklist.  Make sure we do not
+    // process a node that has been replaced.
+    removeFromWorkList(N);
+
+    // Finally, since the node is now dead, remove it from the graph.
+    DAG.DeleteNode(N);
+  }
+  return SDValue(N, 0);
+}
+
+void DAGCombiner::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+  // Replace all uses.  If any nodes become isomorphic to other nodes and
+  // are deleted, make sure to remove them from our worklist.
+  WorkListRemover DeadNodes(*this);
+  DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
+
+  // Push the new node and any (possibly new) users onto the worklist.
+  AddToWorkList(TLO.New.getNode());
+  AddUsersToWorkList(TLO.New.getNode());
+
+  // Finally, if the node is now dead, remove it from the graph.  The node
+  // may not be dead if the replacement process recursively simplified to
+  // something else needing this node.
+  if (TLO.Old.getNode()->use_empty()) {
+    removeFromWorkList(TLO.Old.getNode());
+
+    // If the operands of this node are only used by the node, they will now
+    // be dead.  Make sure to visit them first to delete dead nodes early.
+    for (unsigned i = 0, e = TLO.Old.getNode()->getNumOperands(); i != e; ++i)
+      if (TLO.Old.getNode()->getOperand(i).getNode()->hasOneUse())
+        AddToWorkList(TLO.Old.getNode()->getOperand(i).getNode());
+
+    DAG.DeleteNode(TLO.Old.getNode());
+  }
+}
+
+/// SimplifyDemandedBits - Check the specified integer node value to see if
+/// it can be simplified or if things it uses can be simplified by bit
+/// propagation.  If so, return true.
+bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
+  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+  APInt KnownZero, KnownOne;
+  if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+    return false;
+
+  // Revisit the node.
+  AddToWorkList(Op.getNode());
+
+  // Replace the old value with the new one.
+  ++NodesCombined;
+  DEBUG(dbgs() << "\nReplacing.2 ";
+        TLO.Old.getNode()->dump(&DAG);
+        dbgs() << "\nWith: ";
+        TLO.New.getNode()->dump(&DAG);
+        dbgs() << '\n');
+
+  CommitTargetLoweringOpt(TLO);
+  return true;
+}
+
+void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
+  DebugLoc dl = Load->getDebugLoc();
+  EVT VT = Load->getValueType(0);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, SDValue(ExtLoad, 0));
+
+  DEBUG(dbgs() << "\nReplacing.9 ";
+        Load->dump(&DAG);
+        dbgs() << "\nWith: ";
+        Trunc.getNode()->dump(&DAG);
+        dbgs() << '\n');
+  WorkListRemover DeadNodes(*this);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
+  removeFromWorkList(Load);
+  DAG.DeleteNode(Load);
+  AddToWorkList(Trunc.getNode());
+}
+
+SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
+  Replace = false;
+  DebugLoc dl = Op.getDebugLoc();
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
+    EVT MemVT = LD->getMemoryVT();
+    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
+      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD
+                                                  : ISD::EXTLOAD)
+      : LD->getExtensionType();
+    Replace = true;
+    return DAG.getExtLoad(ExtType, dl, PVT,
+                          LD->getChain(), LD->getBasePtr(),
+                          LD->getPointerInfo(),
+                          MemVT, LD->isVolatile(),
+                          LD->isNonTemporal(), LD->getAlignment());
+  }
+
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  default: break;
+  case ISD::AssertSext:
+    return DAG.getNode(ISD::AssertSext, dl, PVT,
+                       SExtPromoteOperand(Op.getOperand(0), PVT),
+                       Op.getOperand(1));
+  case ISD::AssertZext:
+    return DAG.getNode(ISD::AssertZext, dl, PVT,
+                       ZExtPromoteOperand(Op.getOperand(0), PVT),
+                       Op.getOperand(1));
+  case ISD::Constant: {
+    unsigned ExtOpc =
+      Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    return DAG.getNode(ExtOpc, dl, PVT, Op);
+  }
+  }
+
+  if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
+    return SDValue();
+  return DAG.getNode(ISD::ANY_EXTEND, dl, PVT, Op);
+}
+
+SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
+  if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
+    return SDValue();
+  EVT OldVT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  bool Replace = false;
+  SDValue NewOp = PromoteOperand(Op, PVT, Replace);
+  if (NewOp.getNode() == 0)
+    return SDValue();
+  AddToWorkList(NewOp.getNode());
+
+  if (Replace)
+    ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NewOp.getValueType(), NewOp,
+                     DAG.getValueType(OldVT));
+}
+
+SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
+  EVT OldVT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  bool Replace = false;
+  SDValue NewOp = PromoteOperand(Op, PVT, Replace);
+  if (NewOp.getNode() == 0)
+    return SDValue();
+  AddToWorkList(NewOp.getNode());
+
+  if (Replace)
+    ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
+  return DAG.getZeroExtendInReg(NewOp, dl, OldVT);
+}
+
+/// PromoteIntBinOp - Promote the specified integer binary operation if the
+/// target indicates it is beneficial. e.g. On x86, it's usually better to
+/// promote i16 operations to i32 since i16 instructions are longer.
+SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
+  if (!LegalOperations)
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector() || !VT.isInteger())
+    return SDValue();
+
+  // If operation type is 'undesirable', e.g. i16 on x86, consider
+  // promoting it.
+  unsigned Opc = Op.getOpcode();
+  if (TLI.isTypeDesirableForOp(Opc, VT))
+    return SDValue();
+
+  EVT PVT = VT;
+  // Consult target whether it is a good idea to promote this operation and
+  // what's the right type to promote it to.
+  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+    assert(PVT != VT && "Don't know what type to promote to!");
+
+    bool Replace0 = false;
+    SDValue N0 = Op.getOperand(0);
+    SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
+    if (NN0.getNode() == 0)
+      return SDValue();
+
+    bool Replace1 = false;
+    SDValue N1 = Op.getOperand(1);
+    SDValue NN1;
+    if (N0 == N1)
+      NN1 = NN0;
+    else {
+      NN1 = PromoteOperand(N1, PVT, Replace1);
+      if (NN1.getNode() == 0)
+        return SDValue();
+    }
+
+    AddToWorkList(NN0.getNode());
+    if (NN1.getNode())
+      AddToWorkList(NN1.getNode());
+
+    if (Replace0)
+      ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
+    if (Replace1)
+      ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
+
+    DEBUG(dbgs() << "\nPromoting ";
+          Op.getNode()->dump(&DAG));
+    DebugLoc dl = Op.getDebugLoc();
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                       DAG.getNode(Opc, dl, PVT, NN0, NN1));
+  }
+  return SDValue();
+}
+
+/// PromoteIntShiftOp - Promote the specified integer shift operation if the
+/// target indicates it is beneficial. e.g. On x86, it's usually better to
+/// promote i16 operations to i32 since i16 instructions are longer.
+SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
+  if (!LegalOperations)
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector() || !VT.isInteger())
+    return SDValue();
+
+  // If operation type is 'undesirable', e.g. i16 on x86, consider
+  // promoting it.
+  unsigned Opc = Op.getOpcode();
+  if (TLI.isTypeDesirableForOp(Opc, VT))
+    return SDValue();
+
+  EVT PVT = VT;
+  // Consult target whether it is a good idea to promote this operation and
+  // what's the right type to promote it to.
+  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+    assert(PVT != VT && "Don't know what type to promote to!");
+
+    bool Replace = false;
+    SDValue N0 = Op.getOperand(0);
+    if (Opc == ISD::SRA)
+      N0 = SExtPromoteOperand(Op.getOperand(0), PVT);
+    else if (Opc == ISD::SRL)
+      N0 = ZExtPromoteOperand(Op.getOperand(0), PVT);
+    else
+      N0 = PromoteOperand(N0, PVT, Replace);
+    if (N0.getNode() == 0)
+      return SDValue();
+
+    AddToWorkList(N0.getNode());
+    if (Replace)
+      ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
+
+    DEBUG(dbgs() << "\nPromoting ";
+          Op.getNode()->dump(&DAG));
+    DebugLoc dl = Op.getDebugLoc();
+    return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                       DAG.getNode(Opc, dl, PVT, N0, Op.getOperand(1)));
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::PromoteExtend(SDValue Op) {
+  if (!LegalOperations)
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector() || !VT.isInteger())
+    return SDValue();
+
+  // If operation type is 'undesirable', e.g. i16 on x86, consider
+  // promoting it.
+  unsigned Opc = Op.getOpcode();
+  if (TLI.isTypeDesirableForOp(Opc, VT))
+    return SDValue();
+
+  EVT PVT = VT;
+  // Consult target whether it is a good idea to promote this operation and
+  // what's the right type to promote it to.
+  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+    assert(PVT != VT && "Don't know what type to promote to!");
+    // fold (aext (aext x)) -> (aext x)
+    // fold (aext (zext x)) -> (zext x)
+    // fold (aext (sext x)) -> (sext x)
+    DEBUG(dbgs() << "\nPromoting ";
+          Op.getNode()->dump(&DAG));
+    return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), VT, Op.getOperand(0));
+  }
+  return SDValue();
+}
+
+bool DAGCombiner::PromoteLoad(SDValue Op) {
+  if (!LegalOperations)
+    return false;
+
+  EVT VT = Op.getValueType();
+  if (VT.isVector() || !VT.isInteger())
+    return false;
+
+  // If operation type is 'undesirable', e.g. i16 on x86, consider
+  // promoting it.
+  unsigned Opc = Op.getOpcode();
+  if (TLI.isTypeDesirableForOp(Opc, VT))
+    return false;
+
+  EVT PVT = VT;
+  // Consult target whether it is a good idea to promote this operation and
+  // what's the right type to promote it to.
+  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+    assert(PVT != VT && "Don't know what type to promote to!");
+
+    DebugLoc dl = Op.getDebugLoc();
+    SDNode *N = Op.getNode();
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT MemVT = LD->getMemoryVT();
+    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
+      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD
+                                                  : ISD::EXTLOAD)
+      : LD->getExtensionType();
+    SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT,
+                                   LD->getChain(), LD->getBasePtr(),
+                                   LD->getPointerInfo(),
+                                   MemVT, LD->isVolatile(),
+                                   LD->isNonTemporal(), LD->getAlignment());
+    SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, VT, NewLD);
+
+    DEBUG(dbgs() << "\nPromoting ";
+          N->dump(&DAG);
+          dbgs() << "\nTo: ";
+          Result.getNode()->dump(&DAG);
+          dbgs() << '\n');
+    WorkListRemover DeadNodes(*this);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
+    removeFromWorkList(N);
+    DAG.DeleteNode(N);
+    AddToWorkList(Result.getNode());
+    return true;
+  }
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Main DAG Combiner implementation
+//===----------------------------------------------------------------------===//
+
+void DAGCombiner::Run(CombineLevel AtLevel) {
+  // set the instance variables, so that the various visit routines may use it.
+  Level = AtLevel;
+  LegalOperations = Level >= AfterLegalizeVectorOps;
+  LegalTypes = Level >= AfterLegalizeTypes;
+
+  // Add all the dag nodes to the worklist.
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ++I)
+    AddToWorkList(I);
+
+  // Create a dummy node (which is not added to allnodes), that adds a reference
+  // to the root node, preventing it from being deleted, and tracking any
+  // changes of the root.
+  HandleSDNode Dummy(DAG.getRoot());
+
+  // The root of the dag may dangle to deleted nodes until the dag combiner is
+  // done.  Set it to null to avoid confusion.
+  DAG.setRoot(SDValue());
+
+  // while the worklist isn't empty, find a node and
+  // try and combine it.
+  while (!WorkListContents.empty()) {
+    SDNode *N;
+    // The WorkListOrder holds the SDNodes in order, but it may contain duplicates.
+    // In order to avoid a linear scan, we use a set (O(log N)) to hold what the
+    // worklist *should* contain, and check the node we want to visit is should
+    // actually be visited.
+    do {
+      N = WorkListOrder.pop_back_val();
+    } while (!WorkListContents.erase(N));
+
+    // If N has no uses, it is dead.  Make sure to revisit all N's operands once
+    // N is deleted from the DAG, since they too may now be dead or may have a
+    // reduced number of uses, allowing other xforms.
+    if (N->use_empty() && N != &Dummy) {
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        AddToWorkList(N->getOperand(i).getNode());
+
+      DAG.DeleteNode(N);
+      continue;
+    }
+
+    SDValue RV = combine(N);
+
+    if (RV.getNode() == 0)
+      continue;
+
+    ++NodesCombined;
+
+    // If we get back the same node we passed in, rather than a new node or
+    // zero, we know that the node must have defined multiple values and
+    // CombineTo was used.  Since CombineTo takes care of the worklist
+    // mechanics for us, we have no work to do in this case.
+    if (RV.getNode() == N)
+      continue;
+
+    assert(N->getOpcode() != ISD::DELETED_NODE &&
+           RV.getNode()->getOpcode() != ISD::DELETED_NODE &&
+           "Node was deleted but visit returned new node!");
+
+    DEBUG(dbgs() << "\nReplacing.3 ";
+          N->dump(&DAG);
+          dbgs() << "\nWith: ";
+          RV.getNode()->dump(&DAG);
+          dbgs() << '\n');
+
+    // Transfer debug value.
+    DAG.TransferDbgValues(SDValue(N, 0), RV);
+    WorkListRemover DeadNodes(*this);
+    if (N->getNumValues() == RV.getNode()->getNumValues())
+      DAG.ReplaceAllUsesWith(N, RV.getNode());
+    else {
+      assert(N->getValueType(0) == RV.getValueType() &&
+             N->getNumValues() == 1 && "Type mismatch");
+      SDValue OpV = RV;
+      DAG.ReplaceAllUsesWith(N, &OpV);
+    }
+
+    // Push the new node and any users onto the worklist
+    AddToWorkList(RV.getNode());
+    AddUsersToWorkList(RV.getNode());
+
+    // Add any uses of the old node to the worklist in case this node is the
+    // last one that uses them.  They may become dead after this node is
+    // deleted.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      AddToWorkList(N->getOperand(i).getNode());
+
+    // Finally, if the node is now dead, remove it from the graph.  The node
+    // may not be dead if the replacement process recursively simplified to
+    // something else needing this node.
+    if (N->use_empty()) {
+      // Nodes can be reintroduced into the worklist.  Make sure we do not
+      // process a node that has been replaced.
+      removeFromWorkList(N);
+
+      // Finally, since the node is now dead, remove it from the graph.
+      DAG.DeleteNode(N);
+    }
+  }
+
+  // If the root changed (e.g. it was a dead load, update the root).
+  DAG.setRoot(Dummy.getValue());
+  DAG.RemoveDeadNodes();
+}
+
+SDValue DAGCombiner::visit(SDNode *N) {
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::TokenFactor:        return visitTokenFactor(N);
+  case ISD::MERGE_VALUES:       return visitMERGE_VALUES(N);
+  case ISD::ADD:                return visitADD(N);
+  case ISD::SUB:                return visitSUB(N);
+  case ISD::ADDC:               return visitADDC(N);
+  case ISD::SUBC:               return visitSUBC(N);
+  case ISD::ADDE:               return visitADDE(N);
+  case ISD::SUBE:               return visitSUBE(N);
+  case ISD::MUL:                return visitMUL(N);
+  case ISD::SDIV:               return visitSDIV(N);
+  case ISD::UDIV:               return visitUDIV(N);
+  case ISD::SREM:               return visitSREM(N);
+  case ISD::UREM:               return visitUREM(N);
+  case ISD::MULHU:              return visitMULHU(N);
+  case ISD::MULHS:              return visitMULHS(N);
+  case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
+  case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
+  case ISD::SMULO:              return visitSMULO(N);
+  case ISD::UMULO:              return visitUMULO(N);
+  case ISD::SDIVREM:            return visitSDIVREM(N);
+  case ISD::UDIVREM:            return visitUDIVREM(N);
+  case ISD::AND:                return visitAND(N);
+  case ISD::OR:                 return visitOR(N);
+  case ISD::XOR:                return visitXOR(N);
+  case ISD::SHL:                return visitSHL(N);
+  case ISD::SRA:                return visitSRA(N);
+  case ISD::SRL:                return visitSRL(N);
+  case ISD::CTLZ:               return visitCTLZ(N);
+  case ISD::CTLZ_ZERO_UNDEF:    return visitCTLZ_ZERO_UNDEF(N);
+  case ISD::CTTZ:               return visitCTTZ(N);
+  case ISD::CTTZ_ZERO_UNDEF:    return visitCTTZ_ZERO_UNDEF(N);
+  case ISD::CTPOP:              return visitCTPOP(N);
+  case ISD::SELECT:             return visitSELECT(N);
+  case ISD::SELECT_CC:          return visitSELECT_CC(N);
+  case ISD::SETCC:              return visitSETCC(N);
+  case ISD::SIGN_EXTEND:        return visitSIGN_EXTEND(N);
+  case ISD::ZERO_EXTEND:        return visitZERO_EXTEND(N);
+  case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
+  case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
+  case ISD::TRUNCATE:           return visitTRUNCATE(N);
+  case ISD::BITCAST:            return visitBITCAST(N);
+  case ISD::BUILD_PAIR:         return visitBUILD_PAIR(N);
+  case ISD::FADD:               return visitFADD(N);
+  case ISD::FSUB:               return visitFSUB(N);
+  case ISD::FMUL:               return visitFMUL(N);
+  case ISD::FMA:                return visitFMA(N);
+  case ISD::FDIV:               return visitFDIV(N);
+  case ISD::FREM:               return visitFREM(N);
+  case ISD::FCOPYSIGN:          return visitFCOPYSIGN(N);
+  case ISD::SINT_TO_FP:         return visitSINT_TO_FP(N);
+  case ISD::UINT_TO_FP:         return visitUINT_TO_FP(N);
+  case ISD::FP_TO_SINT:         return visitFP_TO_SINT(N);
+  case ISD::FP_TO_UINT:         return visitFP_TO_UINT(N);
+  case ISD::FP_ROUND:           return visitFP_ROUND(N);
+  case ISD::FP_ROUND_INREG:     return visitFP_ROUND_INREG(N);
+  case ISD::FP_EXTEND:          return visitFP_EXTEND(N);
+  case ISD::FNEG:               return visitFNEG(N);
+  case ISD::FABS:               return visitFABS(N);
+  case ISD::FFLOOR:             return visitFFLOOR(N);
+  case ISD::FCEIL:              return visitFCEIL(N);
+  case ISD::FTRUNC:             return visitFTRUNC(N);
+  case ISD::BRCOND:             return visitBRCOND(N);
+  case ISD::BR_CC:              return visitBR_CC(N);
+  case ISD::LOAD:               return visitLOAD(N);
+  case ISD::STORE:              return visitSTORE(N);
+  case ISD::INSERT_VECTOR_ELT:  return visitINSERT_VECTOR_ELT(N);
+  case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
+  case ISD::BUILD_VECTOR:       return visitBUILD_VECTOR(N);
+  case ISD::CONCAT_VECTORS:     return visitCONCAT_VECTORS(N);
+  case ISD::EXTRACT_SUBVECTOR:  return visitEXTRACT_SUBVECTOR(N);
+  case ISD::VECTOR_SHUFFLE:     return visitVECTOR_SHUFFLE(N);
+  case ISD::MEMBARRIER:         return visitMEMBARRIER(N);
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::combine(SDNode *N) {
+  SDValue RV = visit(N);
+
+  // If nothing happened, try a target-specific DAG combine.
+  if (RV.getNode() == 0) {
+    assert(N->getOpcode() != ISD::DELETED_NODE &&
+           "Node was deleted but visit returned NULL!");
+
+    if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
+        TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
+
+      // Expose the DAG combiner to the target combiner impls.
+      TargetLowering::DAGCombinerInfo
+        DagCombineInfo(DAG, Level, false, this);
+
+      RV = TLI.PerformDAGCombine(N, DagCombineInfo);
+    }
+  }
+
+  // If nothing happened still, try promoting the operation.
+  if (RV.getNode() == 0) {
+    switch (N->getOpcode()) {
+    default: break;
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+      RV = PromoteIntBinOp(SDValue(N, 0));
+      break;
+    case ISD::SHL:
+    case ISD::SRA:
+    case ISD::SRL:
+      RV = PromoteIntShiftOp(SDValue(N, 0));
+      break;
+    case ISD::SIGN_EXTEND:
+    case ISD::ZERO_EXTEND:
+    case ISD::ANY_EXTEND:
+      RV = PromoteExtend(SDValue(N, 0));
+      break;
+    case ISD::LOAD:
+      if (PromoteLoad(SDValue(N, 0)))
+        RV = SDValue(N, 0);
+      break;
+    }
+  }
+
+  // If N is a commutative binary node, try commuting it to enable more
+  // sdisel CSE.
+  if (RV.getNode() == 0 &&
+      SelectionDAG::isCommutativeBinOp(N->getOpcode()) &&
+      N->getNumValues() == 1) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+
+    // Constant operands are canonicalized to RHS.
+    if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) {
+      SDValue Ops[] = { N1, N0 };
+      SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(),
+                                            Ops, 2);
+      if (CSENode)
+        return SDValue(CSENode, 0);
+    }
+  }
+
+  return RV;
+}
+
+/// getInputChainForNode - Given a node, return its input chain if it has one,
+/// otherwise return a null sd operand.
+static SDValue getInputChainForNode(SDNode *N) {
+  if (unsigned NumOps = N->getNumOperands()) {
+    if (N->getOperand(0).getValueType() == MVT::Other)
+      return N->getOperand(0);
+    else if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
+      return N->getOperand(NumOps-1);
+    for (unsigned i = 1; i < NumOps-1; ++i)
+      if (N->getOperand(i).getValueType() == MVT::Other)
+        return N->getOperand(i);
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
+  // If N has two operands, where one has an input chain equal to the other,
+  // the 'other' chain is redundant.
+  if (N->getNumOperands() == 2) {
+    if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
+      return N->getOperand(0);
+    if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
+      return N->getOperand(1);
+  }
+
+  SmallVector<SDNode *, 8> TFs;     // List of token factors to visit.
+  SmallVector<SDValue, 8> Ops;    // Ops for replacing token factor.
+  SmallPtrSet<SDNode*, 16> SeenOps;
+  bool Changed = false;             // If we should replace this token factor.
+
+  // Start out with this token factor.
+  TFs.push_back(N);
+
+  // Iterate through token factors.  The TFs grows when new token factors are
+  // encountered.
+  for (unsigned i = 0; i < TFs.size(); ++i) {
+    SDNode *TF = TFs[i];
+
+    // Check each of the operands.
+    for (unsigned i = 0, ie = TF->getNumOperands(); i != ie; ++i) {
+      SDValue Op = TF->getOperand(i);
+
+      switch (Op.getOpcode()) {
+      case ISD::EntryToken:
+        // Entry tokens don't need to be added to the list. They are
+        // rededundant.
+        Changed = true;
+        break;
+
+      case ISD::TokenFactor:
+        if (Op.hasOneUse() &&
+            std::find(TFs.begin(), TFs.end(), Op.getNode()) == TFs.end()) {
+          // Queue up for processing.
+          TFs.push_back(Op.getNode());
+          // Clean up in case the token factor is removed.
+          AddToWorkList(Op.getNode());
+          Changed = true;
+          break;
+        }
+        // Fall thru
+
+      default:
+        // Only add if it isn't already in the list.
+        if (SeenOps.insert(Op.getNode()))
+          Ops.push_back(Op);
+        else
+          Changed = true;
+        break;
+      }
+    }
+  }
+
+  SDValue Result;
+
+  // If we've change things around then replace token factor.
+  if (Changed) {
+    if (Ops.empty()) {
+      // The entry token is the only possible outcome.
+      Result = DAG.getEntryNode();
+    } else {
+      // New and improved token factor.
+      Result = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+                           MVT::Other, &Ops[0], Ops.size());
+    }
+
+    // Don't add users to work list.
+    return CombineTo(N, Result, false);
+  }
+
+  return Result;
+}
+
+/// MERGE_VALUES can always be eliminated.
+SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
+  WorkListRemover DeadNodes(*this);
+  // Replacing results may cause a different MERGE_VALUES to suddenly
+  // be CSE'd with N, and carry its uses with it. Iterate until no
+  // uses remain, to ensure that the node can be safely deleted.
+  // First add the users of this node to the work list so that they
+  // can be tried again once they have new operands.
+  AddUsersToWorkList(N);
+  do {
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i));
+  } while (!N->use_empty());
+  removeFromWorkList(N);
+  DAG.DeleteNode(N);
+  return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+}
+
+static
+SDValue combineShlAddConstant(DebugLoc DL, SDValue N0, SDValue N1,
+                              SelectionDAG &DAG) {
+  EVT VT = N0.getValueType();
+  SDValue N00 = N0.getOperand(0);
+  SDValue N01 = N0.getOperand(1);
+  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N01);
+
+  if (N01C && N00.getOpcode() == ISD::ADD && N00.getNode()->hasOneUse() &&
+      isa<ConstantSDNode>(N00.getOperand(1))) {
+    // fold (add (shl (add x, c1), c2), ) -> (add (add (shl x, c2), c1<<c2), )
+    N0 = DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT,
+                     DAG.getNode(ISD::SHL, N00.getDebugLoc(), VT,
+                                 N00.getOperand(0), N01),
+                     DAG.getNode(ISD::SHL, N01.getDebugLoc(), VT,
+                                 N00.getOperand(1), N01));
+    return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitADD(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+
+    // fold (add x, 0) -> x, vector edition
+    if (ISD::isBuildVectorAllZeros(N1.getNode()))
+      return N0;
+    if (ISD::isBuildVectorAllZeros(N0.getNode()))
+      return N1;
+  }
+
+  // fold (add x, undef) -> undef
+  if (N0.getOpcode() == ISD::UNDEF)
+    return N0;
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+  // fold (add c1, c2) -> c1+c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::ADD, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1, N0);
+  // fold (add x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // fold (add Sym, c) -> Sym+c
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
+    if (!LegalOperations && TLI.isOffsetFoldingLegal(GA) && N1C &&
+        GA->getOpcode() == ISD::GlobalAddress)
+      return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT,
+                                  GA->getOffset() +
+                                    (uint64_t)N1C->getSExtValue());
+  // fold ((c1-A)+c2) -> (c1+c2)-A
+  if (N1C && N0.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getOperand(0)))
+      return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                         DAG.getConstant(N1C->getAPIntValue()+
+                                         N0C->getAPIntValue(), VT),
+                         N0.getOperand(1));
+  // reassociate add
+  SDValue RADD = ReassociateOps(ISD::ADD, N->getDebugLoc(), N0, N1);
+  if (RADD.getNode() != 0)
+    return RADD;
+  // fold ((0-A) + B) -> B-A
+  if (N0.getOpcode() == ISD::SUB && isa<ConstantSDNode>(N0.getOperand(0)) &&
+      cast<ConstantSDNode>(N0.getOperand(0))->isNullValue())
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1, N0.getOperand(1));
+  // fold (A + (0-B)) -> A-B
+  if (N1.getOpcode() == ISD::SUB && isa<ConstantSDNode>(N1.getOperand(0)) &&
+      cast<ConstantSDNode>(N1.getOperand(0))->isNullValue())
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, N1.getOperand(1));
+  // fold (A+(B-A)) -> B
+  if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
+    return N1.getOperand(0);
+  // fold ((B-A)+A) -> B
+  if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
+    return N0.getOperand(0);
+  // fold (A+(B-(A+C))) to (B-C)
+  if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+      N0 == N1.getOperand(1).getOperand(0))
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1.getOperand(0),
+                       N1.getOperand(1).getOperand(1));
+  // fold (A+(B-(C+A))) to (B-C)
+  if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+      N0 == N1.getOperand(1).getOperand(1))
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1.getOperand(0),
+                       N1.getOperand(1).getOperand(0));
+  // fold (A+((B-A)+or-C)) to (B+or-C)
+  if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
+      N1.getOperand(0).getOpcode() == ISD::SUB &&
+      N0 == N1.getOperand(0).getOperand(1))
+    return DAG.getNode(N1.getOpcode(), N->getDebugLoc(), VT,
+                       N1.getOperand(0).getOperand(0), N1.getOperand(1));
+
+  // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
+  if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
+    SDValue N00 = N0.getOperand(0);
+    SDValue N01 = N0.getOperand(1);
+    SDValue N10 = N1.getOperand(0);
+    SDValue N11 = N1.getOperand(1);
+
+    if (isa<ConstantSDNode>(N00) || isa<ConstantSDNode>(N10))
+      return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                         DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT, N00, N10),
+                         DAG.getNode(ISD::ADD, N1.getDebugLoc(), VT, N01, N11));
+  }
+
+  if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  // fold (a+b) -> (a|b) iff a and b share no bits.
+  if (VT.isInteger() && !VT.isVector()) {
+    APInt LHSZero, LHSOne;
+    APInt RHSZero, RHSOne;
+    DAG.ComputeMaskedBits(N0, LHSZero, LHSOne);
+
+    if (LHSZero.getBoolValue()) {
+      DAG.ComputeMaskedBits(N1, RHSZero, RHSOne);
+
+      // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
+      // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
+      if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero)
+        return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1);
+    }
+  }
+
+  // fold (add (shl (add x, c1), c2), ) -> (add (add (shl x, c2), c1<<c2), )
+  if (N0.getOpcode() == ISD::SHL && N0.getNode()->hasOneUse()) {
+    SDValue Result = combineShlAddConstant(N->getDebugLoc(), N0, N1, DAG);
+    if (Result.getNode()) return Result;
+  }
+  if (N1.getOpcode() == ISD::SHL && N1.getNode()->hasOneUse()) {
+    SDValue Result = combineShlAddConstant(N->getDebugLoc(), N1, N0, DAG);
+    if (Result.getNode()) return Result;
+  }
+
+  // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
+  if (N1.getOpcode() == ISD::SHL &&
+      N1.getOperand(0).getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C =
+          dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(0)))
+      if (C->getAPIntValue() == 0)
+        return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0,
+                           DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+                                       N1.getOperand(0).getOperand(1),
+                                       N1.getOperand(1)));
+  if (N0.getOpcode() == ISD::SHL &&
+      N0.getOperand(0).getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C =
+          dyn_cast<ConstantSDNode>(N0.getOperand(0).getOperand(0)))
+      if (C->getAPIntValue() == 0)
+        return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1,
+                           DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+                                       N0.getOperand(0).getOperand(1),
+                                       N0.getOperand(1)));
+
+  if (N1.getOpcode() == ISD::AND) {
+    SDValue AndOp0 = N1.getOperand(0);
+    ConstantSDNode *AndOp1 = dyn_cast<ConstantSDNode>(N1->getOperand(1));
+    unsigned NumSignBits = DAG.ComputeNumSignBits(AndOp0);
+    unsigned DestBits = VT.getScalarType().getSizeInBits();
+
+    // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
+    // and similar xforms where the inner op is either ~0 or 0.
+    if (NumSignBits == DestBits && AndOp1 && AndOp1->isOne()) {
+      DebugLoc DL = N->getDebugLoc();
+      return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0);
+    }
+  }
+
+  // add (sext i1), X -> sub X, (zext i1)
+  if (N0.getOpcode() == ISD::SIGN_EXTEND &&
+      N0.getOperand(0).getValueType() == MVT::i1 &&
+      !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) {
+    DebugLoc DL = N->getDebugLoc();
+    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+    return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitADDC(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // If the flag result is dead, turn this into an ADD.
+  if (!N->hasAnyUseOfValue(1))
+    return CombineTo(N, DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N1),
+                     DAG.getNode(ISD::CARRY_FALSE,
+                                 N->getDebugLoc(), MVT::Glue));
+
+  // canonicalize constant to RHS.
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::ADDC, N->getDebugLoc(), N->getVTList(), N1, N0);
+
+  // fold (addc x, 0) -> x + no carry out
+  if (N1C && N1C->isNullValue())
+    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
+                                        N->getDebugLoc(), MVT::Glue));
+
+  // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits.
+  APInt LHSZero, LHSOne;
+  APInt RHSZero, RHSOne;
+  DAG.ComputeMaskedBits(N0, LHSZero, LHSOne);
+
+  if (LHSZero.getBoolValue()) {
+    DAG.ComputeMaskedBits(N1, RHSZero, RHSOne);
+
+    // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
+    // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
+    if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero)
+      return CombineTo(N, DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1),
+                       DAG.getNode(ISD::CARRY_FALSE,
+                                   N->getDebugLoc(), MVT::Glue));
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitADDE(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue CarryIn = N->getOperand(2);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::ADDE, N->getDebugLoc(), N->getVTList(),
+                       N1, N0, CarryIn);
+
+  // fold (adde x, y, false) -> (addc x, y)
+  if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
+    return DAG.getNode(ISD::ADDC, N->getDebugLoc(), N->getVTList(), N0, N1);
+
+  return SDValue();
+}
+
+// Since it may not be valid to emit a fold to zero for vector initializers
+// check if we can before folding.
+static SDValue tryFoldToZero(DebugLoc DL, const TargetLowering &TLI, EVT VT,
+                             SelectionDAG &DAG, bool LegalOperations) {
+  if (!VT.isVector()) {
+    return DAG.getConstant(0, VT);
+  }
+  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
+    // Produce a vector of zeros.
+    SDValue El = DAG.getConstant(0, VT.getVectorElementType());
+    std::vector<SDValue> Ops(VT.getVectorNumElements(), El);
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
+      &Ops[0], Ops.size());
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSUB(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  ConstantSDNode *N1C1 = N1.getOpcode() != ISD::ADD ? 0 :
+    dyn_cast<ConstantSDNode>(N1.getOperand(1).getNode());
+  EVT VT = N0.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+
+    // fold (sub x, 0) -> x, vector edition
+    if (ISD::isBuildVectorAllZeros(N1.getNode()))
+      return N0;
+  }
+
+  // fold (sub x, x) -> 0
+  // FIXME: Refactor this and xor and other similar operations together.
+  if (N0 == N1)
+    return tryFoldToZero(N->getDebugLoc(), TLI, VT, DAG, LegalOperations);
+  // fold (sub c1, c2) -> c1-c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::SUB, VT, N0C, N1C);
+  // fold (sub x, c) -> (add x, -c)
+  if (N1C)
+    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0,
+                       DAG.getConstant(-N1C->getAPIntValue(), VT));
+  // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
+  if (N0C && N0C->isAllOnesValue())
+    return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N1, N0);
+  // fold A-(A-B) -> B
+  if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
+    return N1.getOperand(1);
+  // fold (A+B)-A -> B
+  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
+    return N0.getOperand(1);
+  // fold (A+B)-B -> A
+  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
+    return N0.getOperand(0);
+  // fold C2-(A+C1) -> (C2-C1)-A
+  if (N1.getOpcode() == ISD::ADD && N0C && N1C1) {
+    SDValue NewC = DAG.getConstant(N0C->getAPIntValue() - N1C1->getAPIntValue(),
+                                   VT);
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, NewC,
+                       N1.getOperand(0));
+  }
+  // fold ((A+(B+or-C))-B) -> A+or-C
+  if (N0.getOpcode() == ISD::ADD &&
+      (N0.getOperand(1).getOpcode() == ISD::SUB ||
+       N0.getOperand(1).getOpcode() == ISD::ADD) &&
+      N0.getOperand(1).getOperand(0) == N1)
+    return DAG.getNode(N0.getOperand(1).getOpcode(), N->getDebugLoc(), VT,
+                       N0.getOperand(0), N0.getOperand(1).getOperand(1));
+  // fold ((A+(C+B))-B) -> A+C
+  if (N0.getOpcode() == ISD::ADD &&
+      N0.getOperand(1).getOpcode() == ISD::ADD &&
+      N0.getOperand(1).getOperand(1) == N1)
+    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT,
+                       N0.getOperand(0), N0.getOperand(1).getOperand(0));
+  // fold ((A-(B-C))-C) -> A-B
+  if (N0.getOpcode() == ISD::SUB &&
+      N0.getOperand(1).getOpcode() == ISD::SUB &&
+      N0.getOperand(1).getOperand(1) == N1)
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       N0.getOperand(0), N0.getOperand(1).getOperand(0));
+
+  // If either operand of a sub is undef, the result is undef
+  if (N0.getOpcode() == ISD::UNDEF)
+    return N0;
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  // If the relocation model supports it, consider symbol offsets.
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
+    if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
+      // fold (sub Sym, c) -> Sym-c
+      if (N1C && GA->getOpcode() == ISD::GlobalAddress)
+        return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT,
+                                    GA->getOffset() -
+                                      (uint64_t)N1C->getSExtValue());
+      // fold (sub Sym+c1, Sym+c2) -> c1-c2
+      if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
+        if (GA->getGlobal() == GB->getGlobal())
+          return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
+                                 VT);
+    }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSUBC(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // If the flag result is dead, turn this into an SUB.
+  if (!N->hasAnyUseOfValue(1))
+    return CombineTo(N, DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, N1),
+                     DAG.getNode(ISD::CARRY_FALSE, N->getDebugLoc(),
+                                 MVT::Glue));
+
+  // fold (subc x, x) -> 0 + no borrow
+  if (N0 == N1)
+    return CombineTo(N, DAG.getConstant(0, VT),
+                     DAG.getNode(ISD::CARRY_FALSE, N->getDebugLoc(),
+                                 MVT::Glue));
+
+  // fold (subc x, 0) -> x + no borrow
+  if (N1C && N1C->isNullValue())
+    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, N->getDebugLoc(),
+                                        MVT::Glue));
+
+  // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow
+  if (N0C && N0C->isAllOnesValue())
+    return CombineTo(N, DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N1, N0),
+                     DAG.getNode(ISD::CARRY_FALSE, N->getDebugLoc(),
+                                 MVT::Glue));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSUBE(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue CarryIn = N->getOperand(2);
+
+  // fold (sube x, y, false) -> (subc x, y)
+  if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
+    return DAG.getNode(ISD::SUBC, N->getDebugLoc(), N->getVTList(), N0, N1);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMUL(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (mul x, undef) -> 0
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // fold (mul c1, c2) -> c1*c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::MUL, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::MUL, N->getDebugLoc(), VT, N1, N0);
+  // fold (mul x, 0) -> 0
+  if (N1C && N1C->isNullValue())
+    return N1;
+  // fold (mul x, -1) -> 0-x
+  if (N1C && N1C->isAllOnesValue())
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       DAG.getConstant(0, VT), N0);
+  // fold (mul x, (1 << c)) -> x << c
+  if (N1C && N1C->getAPIntValue().isPowerOf2())
+    return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+                       DAG.getConstant(N1C->getAPIntValue().logBase2(),
+                                       getShiftAmountTy(N0.getValueType())));
+  // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
+  if (N1C && (-N1C->getAPIntValue()).isPowerOf2()) {
+    unsigned Log2Val = (-N1C->getAPIntValue()).logBase2();
+    // FIXME: If the input is something that is easily negated (e.g. a
+    // single-use add), we should put the negate there.
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       DAG.getConstant(0, VT),
+                       DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+                            DAG.getConstant(Log2Val,
+                                      getShiftAmountTy(N0.getValueType()))));
+  }
+  // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
+  if (N1C && N0.getOpcode() == ISD::SHL &&
+      isa<ConstantSDNode>(N0.getOperand(1))) {
+    SDValue C3 = DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+                             N1, N0.getOperand(1));
+    AddToWorkList(C3.getNode());
+    return DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+                       N0.getOperand(0), C3);
+  }
+
+  // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
+  // use.
+  {
+    SDValue Sh(0,0), Y(0,0);
+    // Check for both (mul (shl X, C), Y)  and  (mul Y, (shl X, C)).
+    if (N0.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N0.getOperand(1)) &&
+        N0.getNode()->hasOneUse()) {
+      Sh = N0; Y = N1;
+    } else if (N1.getOpcode() == ISD::SHL &&
+               isa<ConstantSDNode>(N1.getOperand(1)) &&
+               N1.getNode()->hasOneUse()) {
+      Sh = N1; Y = N0;
+    }
+
+    if (Sh.getNode()) {
+      SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+                                Sh.getOperand(0), Y);
+      return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+                         Mul, Sh.getOperand(1));
+    }
+  }
+
+  // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
+  if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+      isa<ConstantSDNode>(N0.getOperand(1)))
+    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT,
+                       DAG.getNode(ISD::MUL, N0.getDebugLoc(), VT,
+                                   N0.getOperand(0), N1),
+                       DAG.getNode(ISD::MUL, N1.getDebugLoc(), VT,
+                                   N0.getOperand(1), N1));
+
+  // reassociate mul
+  SDValue RMUL = ReassociateOps(ISD::MUL, N->getDebugLoc(), N0, N1);
+  if (RMUL.getNode() != 0)
+    return RMUL;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSDIV(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  EVT VT = N->getValueType(0);
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (sdiv c1, c2) -> c1/c2
+  if (N0C && N1C && !N1C->isNullValue())
+    return DAG.FoldConstantArithmetic(ISD::SDIV, VT, N0C, N1C);
+  // fold (sdiv X, 1) -> X
+  if (N1C && N1C->getAPIntValue() == 1LL)
+    return N0;
+  // fold (sdiv X, -1) -> 0-X
+  if (N1C && N1C->isAllOnesValue())
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       DAG.getConstant(0, VT), N0);
+  // If we know the sign bits of both operands are zero, strength reduce to a
+  // udiv instead.  Handles (X&15) /s 4 -> X&15 >> 2
+  if (!VT.isVector()) {
+    if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+      return DAG.getNode(ISD::UDIV, N->getDebugLoc(), N1.getValueType(),
+                         N0, N1);
+  }
+  // fold (sdiv X, pow2) -> simple ops after legalize
+  if (N1C && !N1C->isNullValue() &&
+      (N1C->getAPIntValue().isPowerOf2() ||
+       (-N1C->getAPIntValue()).isPowerOf2())) {
+    // If dividing by powers of two is cheap, then don't perform the following
+    // fold.
+    if (TLI.isPow2DivCheap())
+      return SDValue();
+
+    unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();
+
+    // Splat the sign bit into the register
+    SDValue SGN = DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
+                              DAG.getConstant(VT.getSizeInBits()-1,
+                                       getShiftAmountTy(N0.getValueType())));
+    AddToWorkList(SGN.getNode());
+
+    // Add (N0 < 0) ? abs2 - 1 : 0;
+    SDValue SRL = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, SGN,
+                              DAG.getConstant(VT.getSizeInBits() - lg2,
+                                       getShiftAmountTy(SGN.getValueType())));
+    SDValue ADD = DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, SRL);
+    AddToWorkList(SRL.getNode());
+    AddToWorkList(ADD.getNode());    // Divide by pow2
+    SDValue SRA = DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, ADD,
+                  DAG.getConstant(lg2, getShiftAmountTy(ADD.getValueType())));
+
+    // If we're dividing by a positive value, we're done.  Otherwise, we must
+    // negate the result.
+    if (N1C->getAPIntValue().isNonNegative())
+      return SRA;
+
+    AddToWorkList(SRA.getNode());
+    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+                       DAG.getConstant(0, VT), SRA);
+  }
+
+  // if integer divide is expensive and we satisfy the requirements, emit an
+  // alternate sequence.
+  if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap()) {
+    SDValue Op = BuildSDIV(N);
+    if (Op.getNode()) return Op;
+  }
+
+  // undef / X -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // X / undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIV(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  EVT VT = N->getValueType(0);
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (udiv c1, c2) -> c1/c2
+  if (N0C && N1C && !N1C->isNullValue())
+    return DAG.FoldConstantArithmetic(ISD::UDIV, VT, N0C, N1C);
+  // fold (udiv x, (1 << c)) -> x >>u c
+  if (N1C && N1C->getAPIntValue().isPowerOf2())
+    return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0,
+                       DAG.getConstant(N1C->getAPIntValue().logBase2(),
+                                       getShiftAmountTy(N0.getValueType())));
+  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+  if (N1.getOpcode() == ISD::SHL) {
+    if (ConstantSDNode *SHC = dyn_cast<ConstantSDNode>(N1.getOperand(0))) {
+      if (SHC->getAPIntValue().isPowerOf2()) {
+        EVT ADDVT = N1.getOperand(1).getValueType();
+        SDValue Add = DAG.getNode(ISD::ADD, N->getDebugLoc(), ADDVT,
+                                  N1.getOperand(1),
+                                  DAG.getConstant(SHC->getAPIntValue()
+                                                                  .logBase2(),
+                                                  ADDVT));
+        AddToWorkList(Add.getNode());
+        return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0, Add);
+      }
+    }
+  }
+  // fold (udiv x, c) -> alternate
+  if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap()) {
+    SDValue Op = BuildUDIV(N);
+    if (Op.getNode()) return Op;
+  }
+
+  // undef / X -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // X / undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSREM(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold (srem c1, c2) -> c1%c2
+  if (N0C && N1C && !N1C->isNullValue())
+    return DAG.FoldConstantArithmetic(ISD::SREM, VT, N0C, N1C);
+  // If we know the sign bits of both operands are zero, strength reduce to a
+  // urem instead.  Handles (X & 0x0FFFFFFF) %s 16 -> X&15
+  if (!VT.isVector()) {
+    if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+      return DAG.getNode(ISD::UREM, N->getDebugLoc(), VT, N0, N1);
+  }
+
+  // If X/C can be simplified by the division-by-constant logic, lower
+  // X%C to the equivalent of X-X/C*C.
+  if (N1C && !N1C->isNullValue()) {
+    SDValue Div = DAG.getNode(ISD::SDIV, N->getDebugLoc(), VT, N0, N1);
+    AddToWorkList(Div.getNode());
+    SDValue OptimizedDiv = combine(Div.getNode());
+    if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
+      SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+                                OptimizedDiv, N1);
+      SDValue Sub = DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, Mul);
+      AddToWorkList(Mul.getNode());
+      return Sub;
+    }
+  }
+
+  // undef % X -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // X % undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUREM(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold (urem c1, c2) -> c1%c2
+  if (N0C && N1C && !N1C->isNullValue())
+    return DAG.FoldConstantArithmetic(ISD::UREM, VT, N0C, N1C);
+  // fold (urem x, pow2) -> (and x, pow2-1)
+  if (N1C && !N1C->isNullValue() && N1C->getAPIntValue().isPowerOf2())
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0,
+                       DAG.getConstant(N1C->getAPIntValue()-1,VT));
+  // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+  if (N1.getOpcode() == ISD::SHL) {
+    if (ConstantSDNode *SHC = dyn_cast<ConstantSDNode>(N1.getOperand(0))) {
+      if (SHC->getAPIntValue().isPowerOf2()) {
+        SDValue Add =
+          DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1,
+                 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()),
+                                 VT));
+        AddToWorkList(Add.getNode());
+        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, Add);
+      }
+    }
+  }
+
+  // If X/C can be simplified by the division-by-constant logic, lower
+  // X%C to the equivalent of X-X/C*C.
+  if (N1C && !N1C->isNullValue()) {
+    SDValue Div = DAG.getNode(ISD::UDIV, N->getDebugLoc(), VT, N0, N1);
+    AddToWorkList(Div.getNode());
+    SDValue OptimizedDiv = combine(Div.getNode());
+    if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
+      SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+                                OptimizedDiv, N1);
+      SDValue Sub = DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, Mul);
+      AddToWorkList(Mul.getNode());
+      return Sub;
+    }
+  }
+
+  // undef % X -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // X % undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHS(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  // fold (mulhs x, 0) -> 0
+  if (N1C && N1C->isNullValue())
+    return N1;
+  // fold (mulhs x, 1) -> (sra x, size(x)-1)
+  if (N1C && N1C->getAPIntValue() == 1)
+    return DAG.getNode(ISD::SRA, N->getDebugLoc(), N0.getValueType(), N0,
+                       DAG.getConstant(N0.getValueType().getSizeInBits() - 1,
+                                       getShiftAmountTy(N0.getValueType())));
+  // fold (mulhs x, undef) -> 0
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+
+  // If the type twice as wide is legal, transform the mulhs to a wider multiply
+  // plus a shift.
+  if (VT.isSimple() && !VT.isVector()) {
+    MVT Simple = VT.getSimpleVT();
+    unsigned SimpleSize = Simple.getSizeInBits();
+    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+      N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
+      N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
+      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
+      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
+            DAG.getConstant(SimpleSize, getShiftAmountTy(N1.getValueType())));
+      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHU(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  // fold (mulhu x, 0) -> 0
+  if (N1C && N1C->isNullValue())
+    return N1;
+  // fold (mulhu x, 1) -> 0
+  if (N1C && N1C->getAPIntValue() == 1)
+    return DAG.getConstant(0, N0.getValueType());
+  // fold (mulhu x, undef) -> 0
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+
+  // If the type twice as wide is legal, transform the mulhu to a wider multiply
+  // plus a shift.
+  if (VT.isSimple() && !VT.isVector()) {
+    MVT Simple = VT.getSimpleVT();
+    unsigned SimpleSize = Simple.getSizeInBits();
+    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+      N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
+      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
+      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
+      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
+            DAG.getConstant(SimpleSize, getShiftAmountTy(N1.getValueType())));
+      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+    }
+  }
+
+  return SDValue();
+}
+
+/// SimplifyNodeWithTwoResults - Perform optimizations common to nodes that
+/// compute two values. LoOp and HiOp give the opcodes for the two computations
+/// that are being performed. Return true if a simplification was made.
+///
+SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+                                                unsigned HiOp) {
+  // If the high half is not needed, just compute the low half.
+  bool HiExists = N->hasAnyUseOfValue(1);
+  if (!HiExists &&
+      (!LegalOperations ||
+       TLI.isOperationLegal(LoOp, N->getValueType(0)))) {
+    SDValue Res = DAG.getNode(LoOp, N->getDebugLoc(), N->getValueType(0),
+                              N->op_begin(), N->getNumOperands());
+    return CombineTo(N, Res, Res);
+  }
+
+  // If the low half is not needed, just compute the high half.
+  bool LoExists = N->hasAnyUseOfValue(0);
+  if (!LoExists &&
+      (!LegalOperations ||
+       TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
+    SDValue Res = DAG.getNode(HiOp, N->getDebugLoc(), N->getValueType(1),
+                              N->op_begin(), N->getNumOperands());
+    return CombineTo(N, Res, Res);
+  }
+
+  // If both halves are used, return as it is.
+  if (LoExists && HiExists)
+    return SDValue();
+
+  // If the two computed results can be simplified separately, separate them.
+  if (LoExists) {
+    SDValue Lo = DAG.getNode(LoOp, N->getDebugLoc(), N->getValueType(0),
+                             N->op_begin(), N->getNumOperands());
+    AddToWorkList(Lo.getNode());
+    SDValue LoOpt = combine(Lo.getNode());
+    if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
+        (!LegalOperations ||
+         TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType())))
+      return CombineTo(N, LoOpt, LoOpt);
+  }
+
+  if (HiExists) {
+    SDValue Hi = DAG.getNode(HiOp, N->getDebugLoc(), N->getValueType(1),
+                             N->op_begin(), N->getNumOperands());
+    AddToWorkList(Hi.getNode());
+    SDValue HiOpt = combine(Hi.getNode());
+    if (HiOpt.getNode() && HiOpt != Hi &&
+        (!LegalOperations ||
+         TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType())))
+      return CombineTo(N, HiOpt, HiOpt);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
+  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS);
+  if (Res.getNode()) return Res;
+
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  // If the type twice as wide is legal, transform the mulhu to a wider multiply
+  // plus a shift.
+  if (VT.isSimple() && !VT.isVector()) {
+    MVT Simple = VT.getSimpleVT();
+    unsigned SimpleSize = Simple.getSizeInBits();
+    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+      SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0));
+      SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1));
+      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
+      // Compute the high part as N1.
+      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
+            DAG.getConstant(SimpleSize, getShiftAmountTy(Lo.getValueType())));
+      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
+      // Compute the low part as N0.
+      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
+      return CombineTo(N, Lo, Hi);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
+  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU);
+  if (Res.getNode()) return Res;
+
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  // If the type twice as wide is legal, transform the mulhu to a wider multiply
+  // plus a shift.
+  if (VT.isSimple() && !VT.isVector()) {
+    MVT Simple = VT.getSimpleVT();
+    unsigned SimpleSize = Simple.getSizeInBits();
+    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+      SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0));
+      SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1));
+      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
+      // Compute the high part as N1.
+      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
+            DAG.getConstant(SimpleSize, getShiftAmountTy(Lo.getValueType())));
+      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
+      // Compute the low part as N0.
+      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
+      return CombineTo(N, Lo, Hi);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSMULO(SDNode *N) {
+  // (smulo x, 2) -> (saddo x, x)
+  if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    if (C2->getAPIntValue() == 2)
+      return DAG.getNode(ISD::SADDO, N->getDebugLoc(), N->getVTList(),
+                         N->getOperand(0), N->getOperand(0));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUMULO(SDNode *N) {
+  // (umulo x, 2) -> (uaddo x, x)
+  if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    if (C2->getAPIntValue() == 2)
+      return DAG.getNode(ISD::UADDO, N->getDebugLoc(), N->getVTList(),
+                         N->getOperand(0), N->getOperand(0));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSDIVREM(SDNode *N) {
+  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::SDIV, ISD::SREM);
+  if (Res.getNode()) return Res;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIVREM(SDNode *N) {
+  SDValue Res = SimplifyNodeWithTwoResults(N, ISD::UDIV, ISD::UREM);
+  if (Res.getNode()) return Res;
+
+  return SDValue();
+}
+
+/// SimplifyBinOpWithSameOpcodeHands - If this is a binary operator with
+/// two operands of the same opcode, try to simplify it.
+SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  assert(N0.getOpcode() == N1.getOpcode() && "Bad input!");
+
+  // Bail early if none of these transforms apply.
+  if (N0.getNode()->getNumOperands() == 0) return SDValue();
+
+  // For each of OP in AND/OR/XOR:
+  // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
+  // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
+  // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
+  // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
+  //
+  // do not sink logical op inside of a vector extend, since it may combine
+  // into a vsetcc.
+  EVT Op0VT = N0.getOperand(0).getValueType();
+  if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+       N0.getOpcode() == ISD::SIGN_EXTEND ||
+       // Avoid infinite looping with PromoteIntBinOp.
+       (N0.getOpcode() == ISD::ANY_EXTEND &&
+        (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) ||
+       (N0.getOpcode() == ISD::TRUNCATE &&
+        (!TLI.isZExtFree(VT, Op0VT) ||
+         !TLI.isTruncateFree(Op0VT, VT)) &&
+        TLI.isTypeLegal(Op0VT))) &&
+      !VT.isVector() &&
+      Op0VT == N1.getOperand(0).getValueType() &&
+      (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) {
+    SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
+                                 N0.getOperand(0).getValueType(),
+                                 N0.getOperand(0), N1.getOperand(0));
+    AddToWorkList(ORNode.getNode());
+    return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, ORNode);
+  }
+
+  // For each of OP in SHL/SRL/SRA/AND...
+  //   fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
+  //   fold (or  (OP x, z), (OP y, z)) -> (OP (or  x, y), z)
+  //   fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
+  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL ||
+       N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) &&
+      N0.getOperand(1) == N1.getOperand(1)) {
+    SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
+                                 N0.getOperand(0).getValueType(),
+                                 N0.getOperand(0), N1.getOperand(0));
+    AddToWorkList(ORNode.getNode());
+    return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+                       ORNode, N0.getOperand(1));
+  }
+
+  // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
+  // Only perform this optimization after type legalization and before
+  // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
+  // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
+  // we don't want to undo this promotion.
+  // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
+  // on scalars.
+  if ((N0.getOpcode() == ISD::BITCAST ||
+       N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
+      Level == AfterLegalizeTypes) {
+    SDValue In0 = N0.getOperand(0);
+    SDValue In1 = N1.getOperand(0);
+    EVT In0Ty = In0.getValueType();
+    EVT In1Ty = In1.getValueType();
+    DebugLoc DL = N->getDebugLoc();
+    // If both incoming values are integers, and the original types are the
+    // same.
+    if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
+      SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1);
+      SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op);
+      AddToWorkList(Op.getNode());
+      return BC;
+    }
+  }
+
+  // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
+  // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
+  // If both shuffles use the same mask, and both shuffle within a single
+  // vector, then it is worthwhile to move the swizzle after the operation.
+  // The type-legalizer generates this pattern when loading illegal
+  // vector types from memory. In many cases this allows additional shuffle
+  // optimizations.
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
+      N0.getOperand(1).getOpcode() == ISD::UNDEF &&
+      N1.getOperand(1).getOpcode() == ISD::UNDEF) {
+    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
+    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);
+
+    assert(N0.getOperand(0).getValueType() == N1.getOperand(1).getValueType() &&
+           "Inputs to shuffles are not the same type");
+
+    unsigned NumElts = VT.getVectorNumElements();
+
+    // Check that both shuffles use the same mask. The masks are known to be of
+    // the same length because the result vector type is the same.
+    bool SameMask = true;
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx0 = SVN0->getMaskElt(i);
+      int Idx1 = SVN1->getMaskElt(i);
+      if (Idx0 != Idx1) {
+        SameMask = false;
+        break;
+      }
+    }
+
+    if (SameMask) {
+      SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
+                               N0.getOperand(0), N1.getOperand(0));
+      AddToWorkList(Op.getNode());
+      return DAG.getVectorShuffle(VT, N->getDebugLoc(), Op,
+                                  DAG.getUNDEF(VT), &SVN0->getMask()[0]);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitAND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue LL, LR, RL, RR, CC0, CC1;
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N1.getValueType();
+  unsigned BitWidth = VT.getScalarType().getSizeInBits();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+
+    // fold (and x, 0) -> 0, vector edition
+    if (ISD::isBuildVectorAllZeros(N0.getNode()))
+      return N0;
+    if (ISD::isBuildVectorAllZeros(N1.getNode()))
+      return N1;
+
+    // fold (and x, -1) -> x, vector edition
+    if (ISD::isBuildVectorAllOnes(N0.getNode()))
+      return N1;
+    if (ISD::isBuildVectorAllOnes(N1.getNode()))
+      return N0;
+  }
+
+  // fold (and x, undef) -> 0
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // fold (and c1, c2) -> c1&c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::AND, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N1, N0);
+  // fold (and x, -1) -> x
+  if (N1C && N1C->isAllOnesValue())
+    return N0;
+  // if (and x, c) is known to be zero, return 0
+  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
+                                   APInt::getAllOnesValue(BitWidth)))
+    return DAG.getConstant(0, VT);
+  // reassociate and
+  SDValue RAND = ReassociateOps(ISD::AND, N->getDebugLoc(), N0, N1);
+  if (RAND.getNode() != 0)
+    return RAND;
+  // fold (and (or x, C), D) -> D if (C & D) == D
+  if (N1C && N0.getOpcode() == ISD::OR)
+    if (ConstantSDNode *ORI = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+      if ((ORI->getAPIntValue() & N1C->getAPIntValue()) == N1C->getAPIntValue())
+        return N1;
+  // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
+  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+    SDValue N0Op0 = N0.getOperand(0);
+    APInt Mask = ~N1C->getAPIntValue();
+    Mask = Mask.trunc(N0Op0.getValueSizeInBits());
+    if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
+      SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(),
+                                 N0.getValueType(), N0Op0);
+
+      // Replace uses of the AND with uses of the Zero extend node.
+      CombineTo(N, Zext);
+
+      // We actually want to replace all uses of the any_extend with the
+      // zero_extend, to avoid duplicating things.  This will later cause this
+      // AND to be folded.
+      CombineTo(N0.getNode(), Zext);
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+  // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) -> 
+  // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
+  // already be zero by virtue of the width of the base type of the load.
+  //
+  // the 'X' node here can either be nothing or an extract_vector_elt to catch
+  // more cases.
+  if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+       N0.getOperand(0).getOpcode() == ISD::LOAD) ||
+      N0.getOpcode() == ISD::LOAD) {
+    LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
+                                         N0 : N0.getOperand(0) );
+
+    // Get the constant (if applicable) the zero'th operand is being ANDed with.
+    // This can be a pure constant or a vector splat, in which case we treat the
+    // vector as a scalar and use the splat value.
+    APInt Constant = APInt::getNullValue(1);
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+      Constant = C->getAPIntValue();
+    } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
+      APInt SplatValue, SplatUndef;
+      unsigned SplatBitSize;
+      bool HasAnyUndefs;
+      bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
+                                             SplatBitSize, HasAnyUndefs);
+      if (IsSplat) {
+        // Undef bits can contribute to a possible optimisation if set, so
+        // set them.
+        SplatValue |= SplatUndef;
+
+        // The splat value may be something like "0x00FFFFFF", which means 0 for
+        // the first vector value and FF for the rest, repeating. We need a mask
+        // that will apply equally to all members of the vector, so AND all the
+        // lanes of the constant together.
+        EVT VT = Vector->getValueType(0);
+        unsigned BitWidth = VT.getVectorElementType().getSizeInBits();
+
+        // If the splat value has been compressed to a bitlength lower
+        // than the size of the vector lane, we need to re-expand it to
+        // the lane size.
+        if (BitWidth > SplatBitSize)
+          for (SplatValue = SplatValue.zextOrTrunc(BitWidth);
+               SplatBitSize < BitWidth;
+               SplatBitSize = SplatBitSize * 2)
+            SplatValue |= SplatValue.shl(SplatBitSize);
+
+        Constant = APInt::getAllOnesValue(BitWidth);
+        for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i)
+          Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth);
+      }
+    }
+
+    // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
+    // actually legal and isn't going to get expanded, else this is a false
+    // optimisation.
+    bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
+                                                    Load->getMemoryVT());
+
+    // Resize the constant to the same size as the original memory access before
+    // extension. If it is still the AllOnesValue then this AND is completely
+    // unneeded.
+    Constant =
+      Constant.zextOrTrunc(Load->getMemoryVT().getScalarType().getSizeInBits());
+
+    bool B;
+    switch (Load->getExtensionType()) {
+    default: B = false; break;
+    case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
+    case ISD::ZEXTLOAD:
+    case ISD::NON_EXTLOAD: B = true; break;
+    }
+
+    if (B && Constant.isAllOnesValue()) {
+      // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
+      // preserve semantics once we get rid of the AND.
+      SDValue NewLoad(Load, 0);
+      if (Load->getExtensionType() == ISD::EXTLOAD) {
+        NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
+                              Load->getValueType(0), Load->getDebugLoc(),
+                              Load->getChain(), Load->getBasePtr(),
+                              Load->getOffset(), Load->getMemoryVT(),
+                              Load->getMemOperand());
+        // Replace uses of the EXTLOAD with the new ZEXTLOAD.
+        if (Load->getNumValues() == 3) {
+          // PRE/POST_INC loads have 3 values.
+          SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
+                           NewLoad.getValue(2) };
+          CombineTo(Load, To, 3, true);
+        } else {
+          CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
+        }
+      }
+
+      // Fold the AND away, taking care not to fold to the old load node if we
+      // replaced it.
+      CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
+
+      return SDValue(N, 0); // Return N so it doesn't get rechecked!
+    }
+  }
+  // fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
+  if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
+    ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
+    ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
+
+    if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
+        LL.getValueType().isInteger()) {
+      // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0)
+      if (cast<ConstantSDNode>(LR)->isNullValue() && Op1 == ISD::SETEQ) {
+        SDValue ORNode = DAG.getNode(ISD::OR, N0.getDebugLoc(),
+                                     LR.getValueType(), LL, RL);
+        AddToWorkList(ORNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+      }
+      // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1)
+      if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETEQ) {
+        SDValue ANDNode = DAG.getNode(ISD::AND, N0.getDebugLoc(),
+                                      LR.getValueType(), LL, RL);
+        AddToWorkList(ANDNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ANDNode, LR, Op1);
+      }
+      // fold (and (setgt X,  -1), (setgt Y,  -1)) -> (setgt (or X, Y), -1)
+      if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETGT) {
+        SDValue ORNode = DAG.getNode(ISD::OR, N0.getDebugLoc(),
+                                     LR.getValueType(), LL, RL);
+        AddToWorkList(ORNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+      }
+    }
+    // canonicalize equivalent to ll == rl
+    if (LL == RR && LR == RL) {
+      Op1 = ISD::getSetCCSwappedOperands(Op1);
+      std::swap(RL, RR);
+    }
+    if (LL == RL && LR == RR) {
+      bool isInteger = LL.getValueType().isInteger();
+      ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger);
+      if (Result != ISD::SETCC_INVALID &&
+          (!LegalOperations ||
+           (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
+            TLI.isOperationLegal(ISD::SETCC,
+                            TLI.getSetCCResultType(N0.getSimpleValueType())))))
+        return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(),
+                            LL, LR, Result);
+    }
+  }
+
+  // Simplify: (and (op x...), (op y...))  -> (op (and x, y))
+  if (N0.getOpcode() == N1.getOpcode()) {
+    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
+    if (Tmp.getNode()) return Tmp;
+  }
+
+  // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
+  // fold (and (sra)) -> (and (srl)) when possible.
+  if (!VT.isVector() &&
+      SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  // fold (zext_inreg (extload x)) -> (zextload x)
+  if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    // If we zero all the possible extended bits, then we can turn this into
+    // a zextload if we are running before legalize or the operation is legal.
+    unsigned BitWidth = N1.getValueType().getScalarType().getSizeInBits();
+    if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
+                           BitWidth - MemVT.getScalarType().getSizeInBits())) &&
+        ((!LegalOperations && !LN0->isVolatile()) ||
+         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT,
+                                       LN0->getChain(), LN0->getBasePtr(),
+                                       LN0->getPointerInfo(), MemVT,
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      AddToWorkList(N);
+      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+  // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
+  if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      N0.hasOneUse()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    // If we zero all the possible extended bits, then we can turn this into
+    // a zextload if we are running before legalize or the operation is legal.
+    unsigned BitWidth = N1.getValueType().getScalarType().getSizeInBits();
+    if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
+                           BitWidth - MemVT.getScalarType().getSizeInBits())) &&
+        ((!LegalOperations && !LN0->isVolatile()) ||
+         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT,
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getPointerInfo(),
+                                       MemVT,
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      AddToWorkList(N);
+      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (and (load x), 255) -> (zextload x, i8)
+  // fold (and (extload x, i16), 255) -> (zextload x, i8)
+  // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
+  if (N1C && (N0.getOpcode() == ISD::LOAD ||
+              (N0.getOpcode() == ISD::ANY_EXTEND &&
+               N0.getOperand(0).getOpcode() == ISD::LOAD))) {
+    bool HasAnyExt = N0.getOpcode() == ISD::ANY_EXTEND;
+    LoadSDNode *LN0 = HasAnyExt
+      ? cast<LoadSDNode>(N0.getOperand(0))
+      : cast<LoadSDNode>(N0);
+    if (LN0->getExtensionType() != ISD::SEXTLOAD &&
+        LN0->isUnindexed() && N0.hasOneUse() && LN0->hasOneUse()) {
+      uint32_t ActiveBits = N1C->getAPIntValue().getActiveBits();
+      if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){
+        EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+        EVT LoadedVT = LN0->getMemoryVT();
+
+        if (ExtVT == LoadedVT &&
+            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
+          EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
+
+          SDValue NewLoad =
+            DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), LoadResultTy,
+                           LN0->getChain(), LN0->getBasePtr(),
+                           LN0->getPointerInfo(),
+                           ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
+                           LN0->getAlignment());
+          AddToWorkList(N);
+          CombineTo(LN0, NewLoad, NewLoad.getValue(1));
+          return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+        }
+
+        // Do not change the width of a volatile load.
+        // Do not generate loads of non-round integer types since these can
+        // be expensive (and would be wrong if the type is not byte sized).
+        if (!LN0->isVolatile() && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() &&
+            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
+          EVT PtrType = LN0->getOperand(1).getValueType();
+
+          unsigned Alignment = LN0->getAlignment();
+          SDValue NewPtr = LN0->getBasePtr();
+
+          // For big endian targets, we need to add an offset to the pointer
+          // to load the correct bytes.  For little endian systems, we merely
+          // need to read fewer bytes from the same pointer.
+          if (TLI.isBigEndian()) {
+            unsigned LVTStoreBytes = LoadedVT.getStoreSize();
+            unsigned EVTStoreBytes = ExtVT.getStoreSize();
+            unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
+            NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(), PtrType,
+                                 NewPtr, DAG.getConstant(PtrOff, PtrType));
+            Alignment = MinAlign(Alignment, PtrOff);
+          }
+
+          AddToWorkList(NewPtr.getNode());
+
+          EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
+          SDValue Load =
+            DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), LoadResultTy,
+                           LN0->getChain(), NewPtr,
+                           LN0->getPointerInfo(),
+                           ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
+                           Alignment);
+          AddToWorkList(N);
+          CombineTo(LN0, Load, Load.getValue(1));
+          return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+        }
+      }
+    }
+  }
+
+  if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
+      VT.getSizeInBits() <= 64) {
+    if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+      APInt ADDC = ADDI->getAPIntValue();
+      if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
+        // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
+        // immediate for an add, but it is legal if its top c2 bits are set,
+        // transform the ADD so the immediate doesn't need to be materialized
+        // in a register.
+        if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
+          APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                             SRLI->getZExtValue());
+          if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
+            ADDC |= Mask;
+            if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
+              SDValue NewAdd =
+                DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT,
+                            N0.getOperand(0), DAG.getConstant(ADDC, VT));
+              CombineTo(N0.getNode(), NewAdd);
+              return SDValue(N, 0); // Return N so it doesn't get rechecked!
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+/// MatchBSwapHWord - Match (a >> 8) | (a << 8) as (bswap a) >> 16
+///
+SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
+                                        bool DemandHighBits) {
+  if (!LegalOperations)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
+    return SDValue();
+  if (!TLI.isOperationLegal(ISD::BSWAP, VT))
+    return SDValue();
+
+  // Recognize (and (shl a, 8), 0xff), (and (srl a, 8), 0xff00)
+  bool LookPassAnd0 = false;
+  bool LookPassAnd1 = false;
+  if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
+      std::swap(N0, N1);
+  if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
+      std::swap(N0, N1);
+  if (N0.getOpcode() == ISD::AND) {
+    if (!N0.getNode()->hasOneUse())
+      return SDValue();
+    ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    if (!N01C || N01C->getZExtValue() != 0xFF00)
+      return SDValue();
+    N0 = N0.getOperand(0);
+    LookPassAnd0 = true;
+  }
+
+  if (N1.getOpcode() == ISD::AND) {
+    if (!N1.getNode()->hasOneUse())
+      return SDValue();
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C || N11C->getZExtValue() != 0xFF)
+      return SDValue();
+    N1 = N1.getOperand(0);
+    LookPassAnd1 = true;
+  }
+
+  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+    std::swap(N0, N1);
+  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+    return SDValue();
+  if (!N0.getNode()->hasOneUse() ||
+      !N1.getNode()->hasOneUse())
+    return SDValue();
+
+  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+  ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+  if (!N01C || !N11C)
+    return SDValue();
+  if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
+    return SDValue();
+
+  // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
+  SDValue N00 = N0->getOperand(0);
+  if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
+    if (!N00.getNode()->hasOneUse())
+      return SDValue();
+    ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
+    if (!N001C || N001C->getZExtValue() != 0xFF)
+      return SDValue();
+    N00 = N00.getOperand(0);
+    LookPassAnd0 = true;
+  }
+
+  SDValue N10 = N1->getOperand(0);
+  if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
+    if (!N10.getNode()->hasOneUse())
+      return SDValue();
+    ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
+    if (!N101C || N101C->getZExtValue() != 0xFF00)
+      return SDValue();
+    N10 = N10.getOperand(0);
+    LookPassAnd1 = true;
+  }
+
+  if (N00 != N10)
+    return SDValue();
+
+  // Make sure everything beyond the low halfword is zero since the SRL 16
+  // will clear the top bits.
+  unsigned OpSizeInBits = VT.getSizeInBits();
+  if (DemandHighBits && OpSizeInBits > 16 &&
+      (!LookPassAnd0 || !LookPassAnd1) &&
+      !DAG.MaskedValueIsZero(N10, APInt::getHighBitsSet(OpSizeInBits, 16)))
+    return SDValue();
+
+  SDValue Res = DAG.getNode(ISD::BSWAP, N->getDebugLoc(), VT, N00);
+  if (OpSizeInBits > 16)
+    Res = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, Res,
+                      DAG.getConstant(OpSizeInBits-16, getShiftAmountTy(VT)));
+  return Res;
+}
+
+/// isBSwapHWordElement - Return true if the specified node is an element
+/// that makes up a 32-bit packed halfword byteswap. i.e.
+/// ((x&0xff)<<8)|((x&0xff00)>>8)|((x&0x00ff0000)<<8)|((x&0xff000000)>>8)
+static bool isBSwapHWordElement(SDValue N, SmallVector<SDNode*,4> &Parts) {
+  if (!N.getNode()->hasOneUse())
+    return false;
+
+  unsigned Opc = N.getOpcode();
+  if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
+    return false;
+
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!N1C)
+    return false;
+
+  unsigned Num;
+  switch (N1C->getZExtValue()) {
+  default:
+    return false;
+  case 0xFF:       Num = 0; break;
+  case 0xFF00:     Num = 1; break;
+  case 0xFF0000:   Num = 2; break;
+  case 0xFF000000: Num = 3; break;
+  }
+
+  // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
+  SDValue N0 = N.getOperand(0);
+  if (Opc == ISD::AND) {
+    if (Num == 0 || Num == 2) {
+      // (x >> 8) & 0xff
+      // (x >> 8) & 0xff0000
+      if (N0.getOpcode() != ISD::SRL)
+        return false;
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (!C || C->getZExtValue() != 8)
+        return false;
+    } else {
+      // (x << 8) & 0xff00
+      // (x << 8) & 0xff000000
+      if (N0.getOpcode() != ISD::SHL)
+        return false;
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (!C || C->getZExtValue() != 8)
+        return false;
+    }
+  } else if (Opc == ISD::SHL) {
+    // (x & 0xff) << 8
+    // (x & 0xff0000) << 8
+    if (Num != 0 && Num != 2)
+      return false;
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!C || C->getZExtValue() != 8)
+      return false;
+  } else { // Opc == ISD::SRL
+    // (x & 0xff00) >> 8
+    // (x & 0xff000000) >> 8
+    if (Num != 1 && Num != 3)
+      return false;
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!C || C->getZExtValue() != 8)
+      return false;
+  }
+
+  if (Parts[Num])
+    return false;
+
+  Parts[Num] = N0.getOperand(0).getNode();
+  return true;
+}
+
+/// MatchBSwapHWord - Match a 32-bit packed halfword bswap. That is
+/// ((x&0xff)<<8)|((x&0xff00)>>8)|((x&0x00ff0000)<<8)|((x&0xff000000)>>8)
+/// => (rotl (bswap x), 16)
+SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
+  if (!LegalOperations)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32)
+    return SDValue();
+  if (!TLI.isOperationLegal(ISD::BSWAP, VT))
+    return SDValue();
+
+  SmallVector<SDNode*,4> Parts(4, (SDNode*)0);
+  // Look for either
+  // (or (or (and), (and)), (or (and), (and)))
+  // (or (or (or (and), (and)), (and)), (and))
+  if (N0.getOpcode() != ISD::OR)
+    return SDValue();
+  SDValue N00 = N0.getOperand(0);
+  SDValue N01 = N0.getOperand(1);
+
+  if (N1.getOpcode() == ISD::OR &&
+      N00.getNumOperands() == 2 && N01.getNumOperands() == 2) {
+    // (or (or (and), (and)), (or (and), (and)))
+    SDValue N000 = N00.getOperand(0);
+    if (!isBSwapHWordElement(N000, Parts))
+      return SDValue();
+
+    SDValue N001 = N00.getOperand(1);
+    if (!isBSwapHWordElement(N001, Parts))
+      return SDValue();
+    SDValue N010 = N01.getOperand(0);
+    if (!isBSwapHWordElement(N010, Parts))
+      return SDValue();
+    SDValue N011 = N01.getOperand(1);
+    if (!isBSwapHWordElement(N011, Parts))
+      return SDValue();
+  } else {
+    // (or (or (or (and), (and)), (and)), (and))
+    if (!isBSwapHWordElement(N1, Parts))
+      return SDValue();
+    if (!isBSwapHWordElement(N01, Parts))
+      return SDValue();
+    if (N00.getOpcode() != ISD::OR)
+      return SDValue();
+    SDValue N000 = N00.getOperand(0);
+    if (!isBSwapHWordElement(N000, Parts))
+      return SDValue();
+    SDValue N001 = N00.getOperand(1);
+    if (!isBSwapHWordElement(N001, Parts))
+      return SDValue();
+  }
+
+  // Make sure the parts are all coming from the same node.
+  if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
+    return SDValue();
+
+  SDValue BSwap = DAG.getNode(ISD::BSWAP, N->getDebugLoc(), VT,
+                              SDValue(Parts[0],0));
+
+  // Result of the bswap should be rotated by 16. If it's not legal, than
+  // do  (x << 16) | (x >> 16).
+  SDValue ShAmt = DAG.getConstant(16, getShiftAmountTy(VT));
+  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
+    return DAG.getNode(ISD::ROTL, N->getDebugLoc(), VT, BSwap, ShAmt);
+  if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
+    return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, BSwap, ShAmt);
+  return DAG.getNode(ISD::OR, N->getDebugLoc(), VT,
+                     DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, BSwap, ShAmt),
+                     DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, BSwap, ShAmt));
+}
+
+SDValue DAGCombiner::visitOR(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue LL, LR, RL, RR, CC0, CC1;
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N1.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+
+    // fold (or x, 0) -> x, vector edition
+    if (ISD::isBuildVectorAllZeros(N0.getNode()))
+      return N1;
+    if (ISD::isBuildVectorAllZeros(N1.getNode()))
+      return N0;
+
+    // fold (or x, -1) -> -1, vector edition
+    if (ISD::isBuildVectorAllOnes(N0.getNode()))
+      return N0;
+    if (ISD::isBuildVectorAllOnes(N1.getNode()))
+      return N1;
+  }
+
+  // fold (or x, undef) -> -1
+  if (!LegalOperations &&
+      (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)) {
+    EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+    return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
+  }
+  // fold (or c1, c2) -> c1|c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::OR, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N1, N0);
+  // fold (or x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // fold (or x, -1) -> -1
+  if (N1C && N1C->isAllOnesValue())
+    return N1;
+  // fold (or x, c) -> c iff (x & ~c) == 0
+  if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
+    return N1;
+
+  // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
+  SDValue BSwap = MatchBSwapHWord(N, N0, N1);
+  if (BSwap.getNode() != 0)
+    return BSwap;
+  BSwap = MatchBSwapHWordLow(N, N0, N1);
+  if (BSwap.getNode() != 0)
+    return BSwap;
+
+  // reassociate or
+  SDValue ROR = ReassociateOps(ISD::OR, N->getDebugLoc(), N0, N1);
+  if (ROR.getNode() != 0)
+    return ROR;
+  // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
+  // iff (c1 & c2) == 0.
+  if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+             isa<ConstantSDNode>(N0.getOperand(1))) {
+    ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
+    if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0)
+      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                         DAG.getNode(ISD::OR, N0.getDebugLoc(), VT,
+                                     N0.getOperand(0), N1),
+                         DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1));
+  }
+  // fold (or (setcc x), (setcc y)) -> (setcc (or x, y))
+  if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
+    ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
+    ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
+
+    if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
+        LL.getValueType().isInteger()) {
+      // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0)
+      // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0)
+      if (cast<ConstantSDNode>(LR)->isNullValue() &&
+          (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) {
+        SDValue ORNode = DAG.getNode(ISD::OR, LR.getDebugLoc(),
+                                     LR.getValueType(), LL, RL);
+        AddToWorkList(ORNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+      }
+      // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1)
+      // fold (or (setgt X, -1), (setgt Y  -1)) -> (setgt (and X, Y), -1)
+      if (cast<ConstantSDNode>(LR)->isAllOnesValue() &&
+          (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) {
+        SDValue ANDNode = DAG.getNode(ISD::AND, LR.getDebugLoc(),
+                                      LR.getValueType(), LL, RL);
+        AddToWorkList(ANDNode.getNode());
+        return DAG.getSetCC(N->getDebugLoc(), VT, ANDNode, LR, Op1);
+      }
+    }
+    // canonicalize equivalent to ll == rl
+    if (LL == RR && LR == RL) {
+      Op1 = ISD::getSetCCSwappedOperands(Op1);
+      std::swap(RL, RR);
+    }
+    if (LL == RL && LR == RR) {
+      bool isInteger = LL.getValueType().isInteger();
+      ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger);
+      if (Result != ISD::SETCC_INVALID &&
+          (!LegalOperations ||
+           (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
+            TLI.isOperationLegal(ISD::SETCC,
+              TLI.getSetCCResultType(N0.getValueType())))))
+        return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(),
+                            LL, LR, Result);
+    }
+  }
+
+  // Simplify: (or (op x...), (op y...))  -> (op (or x, y))
+  if (N0.getOpcode() == N1.getOpcode()) {
+    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
+    if (Tmp.getNode()) return Tmp;
+  }
+
+  // (or (and X, C1), (and Y, C2))  -> (and (or X, Y), C3) if possible.
+  if (N0.getOpcode() == ISD::AND &&
+      N1.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      N1.getOperand(1).getOpcode() == ISD::Constant &&
+      // Don't increase # computations.
+      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
+    // We can only do this xform if we know that bits from X that are set in C2
+    // but not in C1 are already zero.  Likewise for Y.
+    const APInt &LHSMask =
+      cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+    const APInt &RHSMask =
+      cast<ConstantSDNode>(N1.getOperand(1))->getAPIntValue();
+
+    if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
+        DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
+      SDValue X = DAG.getNode(ISD::OR, N0.getDebugLoc(), VT,
+                              N0.getOperand(0), N1.getOperand(0));
+      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, X,
+                         DAG.getConstant(LHSMask | RHSMask, VT));
+    }
+  }
+
+  // See if this is some rotate idiom.
+  if (SDNode *Rot = MatchRotate(N0, N1, N->getDebugLoc()))
+    return SDValue(Rot, 0);
+
+  // Simplify the operands using demanded-bits information.
+  if (!VT.isVector() &&
+      SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+/// MatchRotateHalf - Match "(X shl/srl V1) & V2" where V2 may not be present.
+static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
+  if (Op.getOpcode() == ISD::AND) {
+    if (isa<ConstantSDNode>(Op.getOperand(1))) {
+      Mask = Op.getOperand(1);
+      Op = Op.getOperand(0);
+    } else {
+      return false;
+    }
+  }
+
+  if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
+    Shift = Op;
+    return true;
+  }
+
+  return false;
+}
+
+// MatchRotate - Handle an 'or' of two operands.  If this is one of the many
+// idioms for rotate, and if the target supports rotation instructions, generate
+// a rot[lr].
+SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL) {
+  // Must be a legal type.  Expanded 'n promoted things won't work with rotates.
+  EVT VT = LHS.getValueType();
+  if (!TLI.isTypeLegal(VT)) return 0;
+
+  // The target must have at least one rotate flavor.
+  bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT);
+  bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT);
+  if (!HasROTL && !HasROTR) return 0;
+
+  // Match "(X shl/srl V1) & V2" where V2 may not be present.
+  SDValue LHSShift;   // The shift.
+  SDValue LHSMask;    // AND value if any.
+  if (!MatchRotateHalf(LHS, LHSShift, LHSMask))
+    return 0; // Not part of a rotate.
+
+  SDValue RHSShift;   // The shift.
+  SDValue RHSMask;    // AND value if any.
+  if (!MatchRotateHalf(RHS, RHSShift, RHSMask))
+    return 0; // Not part of a rotate.
+
+  if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
+    return 0;   // Not shifting the same value.
+
+  if (LHSShift.getOpcode() == RHSShift.getOpcode())
+    return 0;   // Shifts must disagree.
+
+  // Canonicalize shl to left side in a shl/srl pair.
+  if (RHSShift.getOpcode() == ISD::SHL) {
+    std::swap(LHS, RHS);
+    std::swap(LHSShift, RHSShift);
+    std::swap(LHSMask , RHSMask );
+  }
+
+  unsigned OpSizeInBits = VT.getSizeInBits();
+  SDValue LHSShiftArg = LHSShift.getOperand(0);
+  SDValue LHSShiftAmt = LHSShift.getOperand(1);
+  SDValue RHSShiftAmt = RHSShift.getOperand(1);
+
+  // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
+  // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
+  if (LHSShiftAmt.getOpcode() == ISD::Constant &&
+      RHSShiftAmt.getOpcode() == ISD::Constant) {
+    uint64_t LShVal = cast<ConstantSDNode>(LHSShiftAmt)->getZExtValue();
+    uint64_t RShVal = cast<ConstantSDNode>(RHSShiftAmt)->getZExtValue();
+    if ((LShVal + RShVal) != OpSizeInBits)
+      return 0;
+
+    SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
+                              LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
+
+    // If there is an AND of either shifted operand, apply it to the result.
+    if (LHSMask.getNode() || RHSMask.getNode()) {
+      APInt Mask = APInt::getAllOnesValue(OpSizeInBits);
+
+      if (LHSMask.getNode()) {
+        APInt RHSBits = APInt::getLowBitsSet(OpSizeInBits, LShVal);
+        Mask &= cast<ConstantSDNode>(LHSMask)->getAPIntValue() | RHSBits;
+      }
+      if (RHSMask.getNode()) {
+        APInt LHSBits = APInt::getHighBitsSet(OpSizeInBits, RShVal);
+        Mask &= cast<ConstantSDNode>(RHSMask)->getAPIntValue() | LHSBits;
+      }
+
+      Rot = DAG.getNode(ISD::AND, DL, VT, Rot, DAG.getConstant(Mask, VT));
+    }
+
+    return Rot.getNode();
+  }
+
+  // If there is a mask here, and we have a variable shift, we can't be sure
+  // that we're masking out the right stuff.
+  if (LHSMask.getNode() || RHSMask.getNode())
+    return 0;
+
+  // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotl x, y)
+  // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotr x, (sub 32, y))
+  if (RHSShiftAmt.getOpcode() == ISD::SUB &&
+      LHSShiftAmt == RHSShiftAmt.getOperand(1)) {
+    if (ConstantSDNode *SUBC =
+          dyn_cast<ConstantSDNode>(RHSShiftAmt.getOperand(0))) {
+      if (SUBC->getAPIntValue() == OpSizeInBits) {
+        return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
+                           HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
+      }
+    }
+  }
+
+  // fold (or (shl x, (sub 32, y)), (srl x, r)) -> (rotr x, y)
+  // fold (or (shl x, (sub 32, y)), (srl x, r)) -> (rotl x, (sub 32, y))
+  if (LHSShiftAmt.getOpcode() == ISD::SUB &&
+      RHSShiftAmt == LHSShiftAmt.getOperand(1)) {
+    if (ConstantSDNode *SUBC =
+          dyn_cast<ConstantSDNode>(LHSShiftAmt.getOperand(0))) {
+      if (SUBC->getAPIntValue() == OpSizeInBits) {
+        return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT, LHSShiftArg,
+                           HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
+      }
+    }
+  }
+
+  // Look for sign/zext/any-extended or truncate cases:
+  if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
+       LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
+       LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
+       LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
+      (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
+       RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
+       RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
+       RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
+    SDValue LExtOp0 = LHSShiftAmt.getOperand(0);
+    SDValue RExtOp0 = RHSShiftAmt.getOperand(0);
+    if (RExtOp0.getOpcode() == ISD::SUB &&
+        RExtOp0.getOperand(1) == LExtOp0) {
+      // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+      //   (rotl x, y)
+      // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+      //   (rotr x, (sub 32, y))
+      if (ConstantSDNode *SUBC =
+            dyn_cast<ConstantSDNode>(RExtOp0.getOperand(0))) {
+        if (SUBC->getAPIntValue() == OpSizeInBits) {
+          return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
+                             LHSShiftArg,
+                             HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
+        }
+      }
+    } else if (LExtOp0.getOpcode() == ISD::SUB &&
+               RExtOp0 == LExtOp0.getOperand(1)) {
+      // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+      //   (rotr x, y)
+      // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+      //   (rotl x, (sub 32, y))
+      if (ConstantSDNode *SUBC =
+            dyn_cast<ConstantSDNode>(LExtOp0.getOperand(0))) {
+        if (SUBC->getAPIntValue() == OpSizeInBits) {
+          return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT,
+                             LHSShiftArg,
+                             HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+SDValue DAGCombiner::visitXOR(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue LHS, RHS, CC;
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+
+    // fold (xor x, 0) -> x, vector edition
+    if (ISD::isBuildVectorAllZeros(N0.getNode()))
+      return N1;
+    if (ISD::isBuildVectorAllZeros(N1.getNode()))
+      return N0;
+  }
+
+  // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
+  if (N0.getOpcode() == ISD::UNDEF && N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // fold (xor x, undef) -> undef
+  if (N0.getOpcode() == ISD::UNDEF)
+    return N0;
+  if (N1.getOpcode() == ISD::UNDEF)
+    return N1;
+  // fold (xor c1, c2) -> c1^c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::XOR, VT, N0C, N1C);
+  // canonicalize constant to RHS
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N1, N0);
+  // fold (xor x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // reassociate xor
+  SDValue RXOR = ReassociateOps(ISD::XOR, N->getDebugLoc(), N0, N1);
+  if (RXOR.getNode() != 0)
+    return RXOR;
+
+  // fold !(x cc y) -> (x !cc y)
+  if (N1C && N1C->getAPIntValue() == 1 && isSetCCEquivalent(N0, LHS, RHS, CC)) {
+    bool isInt = LHS.getValueType().isInteger();
+    ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                                               isInt);
+
+    if (!LegalOperations ||
+        TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
+      switch (N0.getOpcode()) {
+      default:
+        llvm_unreachable("Unhandled SetCC Equivalent!");
+      case ISD::SETCC:
+        return DAG.getSetCC(N->getDebugLoc(), VT, LHS, RHS, NotCC);
+      case ISD::SELECT_CC:
+        return DAG.getSelectCC(N->getDebugLoc(), LHS, RHS, N0.getOperand(2),
+                               N0.getOperand(3), NotCC);
+      }
+    }
+  }
+
+  // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
+  if (N1C && N1C->getAPIntValue() == 1 && N0.getOpcode() == ISD::ZERO_EXTEND &&
+      N0.getNode()->hasOneUse() &&
+      isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
+    SDValue V = N0.getOperand(0);
+    V = DAG.getNode(ISD::XOR, N0.getDebugLoc(), V.getValueType(), V,
+                    DAG.getConstant(1, V.getValueType()));
+    AddToWorkList(V.getNode());
+    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, V);
+  }
+
+  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
+  if (N1C && N1C->getAPIntValue() == 1 && VT == MVT::i1 &&
+      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+    if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
+      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+      LHS = DAG.getNode(ISD::XOR, LHS.getDebugLoc(), VT, LHS, N1); // LHS = ~LHS
+      RHS = DAG.getNode(ISD::XOR, RHS.getDebugLoc(), VT, RHS, N1); // RHS = ~RHS
+      AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
+      return DAG.getNode(NewOpcode, N->getDebugLoc(), VT, LHS, RHS);
+    }
+  }
+  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
+  if (N1C && N1C->isAllOnesValue() &&
+      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+    if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
+      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+      LHS = DAG.getNode(ISD::XOR, LHS.getDebugLoc(), VT, LHS, N1); // LHS = ~LHS
+      RHS = DAG.getNode(ISD::XOR, RHS.getDebugLoc(), VT, RHS, N1); // RHS = ~RHS
+      AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
+      return DAG.getNode(NewOpcode, N->getDebugLoc(), VT, LHS, RHS);
+    }
+  }
+  // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2))
+  if (N1C && N0.getOpcode() == ISD::XOR) {
+    ConstantSDNode *N00C = dyn_cast<ConstantSDNode>(N0.getOperand(0));
+    ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    if (N00C)
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N0.getOperand(1),
+                         DAG.getConstant(N1C->getAPIntValue() ^
+                                         N00C->getAPIntValue(), VT));
+    if (N01C)
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N0.getOperand(0),
+                         DAG.getConstant(N1C->getAPIntValue() ^
+                                         N01C->getAPIntValue(), VT));
+  }
+  // fold (xor x, x) -> 0
+  if (N0 == N1)
+    return tryFoldToZero(N->getDebugLoc(), TLI, VT, DAG, LegalOperations);
+
+  // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
+  if (N0.getOpcode() == N1.getOpcode()) {
+    SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N);
+    if (Tmp.getNode()) return Tmp;
+  }
+
+  // Simplify the expression using non-local knowledge.
+  if (!VT.isVector() &&
+      SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+/// visitShiftByConstant - Handle transforms common to the three shifts, when
+/// the shift amount is a constant.
+SDValue DAGCombiner::visitShiftByConstant(SDNode *N, unsigned Amt) {
+  SDNode *LHS = N->getOperand(0).getNode();
+  if (!LHS->hasOneUse()) return SDValue();
+
+  // We want to pull some binops through shifts, so that we have (and (shift))
+  // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of
+  // thing happens with address calculations, so it's important to canonicalize
+  // it.
+  bool HighBitSet = false;  // Can we transform this if the high bit is set?
+
+  switch (LHS->getOpcode()) {
+  default: return SDValue();
+  case ISD::OR:
+  case ISD::XOR:
+    HighBitSet = false; // We can only transform sra if the high bit is clear.
+    break;
+  case ISD::AND:
+    HighBitSet = true;  // We can only transform sra if the high bit is set.
+    break;
+  case ISD::ADD:
+    if (N->getOpcode() != ISD::SHL)
+      return SDValue(); // only shl(add) not sr[al](add).
+    HighBitSet = false; // We can only transform sra if the high bit is clear.
+    break;
+  }
+
+  // We require the RHS of the binop to be a constant as well.
+  ConstantSDNode *BinOpCst = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+  if (!BinOpCst) return SDValue();
+
+  // FIXME: disable this unless the input to the binop is a shift by a constant.
+  // If it is not a shift, it pessimizes some common cases like:
+  //
+  //    void foo(int *X, int i) { X[i & 1235] = 1; }
+  //    int bar(int *X, int i) { return X[i & 255]; }
+  SDNode *BinOpLHSVal = LHS->getOperand(0).getNode();
+  if ((BinOpLHSVal->getOpcode() != ISD::SHL &&
+       BinOpLHSVal->getOpcode() != ISD::SRA &&
+       BinOpLHSVal->getOpcode() != ISD::SRL) ||
+      !isa<ConstantSDNode>(BinOpLHSVal->getOperand(1)))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // If this is a signed shift right, and the high bit is modified by the
+  // logical operation, do not perform the transformation. The highBitSet
+  // boolean indicates the value of the high bit of the constant which would
+  // cause it to be modified for this operation.
+  if (N->getOpcode() == ISD::SRA) {
+    bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative();
+    if (BinOpRHSSignSet != HighBitSet)
+      return SDValue();
+  }
+
+  // Fold the constants, shifting the binop RHS by the shift amount.
+  SDValue NewRHS = DAG.getNode(N->getOpcode(), LHS->getOperand(1).getDebugLoc(),
+                               N->getValueType(0),
+                               LHS->getOperand(1), N->getOperand(1));
+
+  // Create the new shift.
+  SDValue NewShift = DAG.getNode(N->getOpcode(),
+                                 LHS->getOperand(0).getDebugLoc(),
+                                 VT, LHS->getOperand(0), N->getOperand(1));
+
+  // Create the new binop.
+  return DAG.getNode(LHS->getOpcode(), N->getDebugLoc(), VT, NewShift, NewRHS);
+}
+
+SDValue DAGCombiner::visitSHL(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+
+  // fold (shl c1, c2) -> c1<<c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::SHL, VT, N0C, N1C);
+  // fold (shl 0, x) -> 0
+  if (N0C && N0C->isNullValue())
+    return N0;
+  // fold (shl x, c >= size(x)) -> undef
+  if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+    return DAG.getUNDEF(VT);
+  // fold (shl x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // fold (shl undef, x) -> 0
+  if (N0.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+  // if (shl x, c) is known to be zero, return 0
+  if (DAG.MaskedValueIsZero(SDValue(N, 0),
+                            APInt::getAllOnesValue(OpSizeInBits)))
+    return DAG.getConstant(0, VT);
+  // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
+  if (N1.getOpcode() == ISD::TRUNCATE &&
+      N1.getOperand(0).getOpcode() == ISD::AND &&
+      N1.hasOneUse() && N1.getOperand(0).hasOneUse()) {
+    SDValue N101 = N1.getOperand(0).getOperand(1);
+    if (ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N101)) {
+      EVT TruncVT = N1.getValueType();
+      SDValue N100 = N1.getOperand(0).getOperand(0);
+      APInt TruncC = N101C->getAPIntValue();
+      TruncC = TruncC.trunc(TruncVT.getSizeInBits());
+      return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+                         DAG.getNode(ISD::AND, N->getDebugLoc(), TruncVT,
+                                     DAG.getNode(ISD::TRUNCATE,
+                                                 N->getDebugLoc(),
+                                                 TruncVT, N100),
+                                     DAG.getConstant(TruncC, TruncVT)));
+    }
+  }
+
+  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
+  if (N1C && N0.getOpcode() == ISD::SHL &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+    uint64_t c2 = N1C->getZExtValue();
+    if (c1 + c2 >= OpSizeInBits)
+      return DAG.getConstant(0, VT);
+    return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getConstant(c1 + c2, N1.getValueType()));
+  }
+
+  // fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2)))
+  // For this to be valid, the second form must not preserve any of the bits
+  // that are shifted out by the inner shift in the first form.  This means
+  // the outer shift size must be >= the number of bits added by the ext.
+  // As a corollary, we don't care what kind of ext it is.
+  if (N1C && (N0.getOpcode() == ISD::ZERO_EXTEND ||
+              N0.getOpcode() == ISD::ANY_EXTEND ||
+              N0.getOpcode() == ISD::SIGN_EXTEND) &&
+      N0.getOperand(0).getOpcode() == ISD::SHL &&
+      isa<ConstantSDNode>(N0.getOperand(0)->getOperand(1))) {
+    uint64_t c1 =
+      cast<ConstantSDNode>(N0.getOperand(0)->getOperand(1))->getZExtValue();
+    uint64_t c2 = N1C->getZExtValue();
+    EVT InnerShiftVT = N0.getOperand(0).getValueType();
+    uint64_t InnerShiftSize = InnerShiftVT.getScalarType().getSizeInBits();
+    if (c2 >= OpSizeInBits - InnerShiftSize) {
+      if (c1 + c2 >= OpSizeInBits)
+        return DAG.getConstant(0, VT);
+      return DAG.getNode(ISD::SHL, N0->getDebugLoc(), VT,
+                         DAG.getNode(N0.getOpcode(), N0->getDebugLoc(), VT,
+                                     N0.getOperand(0)->getOperand(0)),
+                         DAG.getConstant(c1 + c2, N1.getValueType()));
+    }
+  }
+
+  // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
+  //                               (and (srl x, (sub c1, c2), MASK)
+  // Only fold this if the inner shift has no other uses -- if it does, folding
+  // this will increase the total number of instructions.
+  if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+    if (c1 < VT.getSizeInBits()) {
+      uint64_t c2 = N1C->getZExtValue();
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - c1);
+      SDValue Shift;
+      if (c2 > c1) {
+        Mask = Mask.shl(c2-c1);
+        Shift = DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0.getOperand(0),
+                            DAG.getConstant(c2-c1, N1.getValueType()));
+      } else {
+        Mask = Mask.lshr(c1-c2);
+        Shift = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0),
+                            DAG.getConstant(c1-c2, N1.getValueType()));
+      }
+      return DAG.getNode(ISD::AND, N0.getDebugLoc(), VT, Shift,
+                         DAG.getConstant(Mask, VT));
+    }
+  }
+  // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
+  if (N1C && N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1)) {
+    SDValue HiBitsMask =
+      DAG.getConstant(APInt::getHighBitsSet(VT.getSizeInBits(),
+                                            VT.getSizeInBits() -
+                                              N1C->getZExtValue()),
+                      VT);
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0.getOperand(0),
+                       HiBitsMask);
+  }
+
+  if (N1C) {
+    SDValue NewSHL = visitShiftByConstant(N, N1C->getZExtValue());
+    if (NewSHL.getNode())
+      return NewSHL;
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSRA(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+
+  // fold (sra c1, c2) -> (sra c1, c2)
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::SRA, VT, N0C, N1C);
+  // fold (sra 0, x) -> 0
+  if (N0C && N0C->isNullValue())
+    return N0;
+  // fold (sra -1, x) -> -1
+  if (N0C && N0C->isAllOnesValue())
+    return N0;
+  // fold (sra x, (setge c, size(x))) -> undef
+  if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+    return DAG.getUNDEF(VT);
+  // fold (sra x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
+  // sext_inreg.
+  if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
+    unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
+    EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
+    if (VT.isVector())
+      ExtVT = EVT::getVectorVT(*DAG.getContext(),
+                               ExtVT, VT.getVectorNumElements());
+    if ((!LegalOperations ||
+         TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)))
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+                         N0.getOperand(0), DAG.getValueType(ExtVT));
+  }
+
+  // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
+  if (N1C && N0.getOpcode() == ISD::SRA) {
+    if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+      unsigned Sum = N1C->getZExtValue() + C1->getZExtValue();
+      if (Sum >= OpSizeInBits) Sum = OpSizeInBits-1;
+      return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0.getOperand(0),
+                         DAG.getConstant(Sum, N1C->getValueType(0)));
+    }
+  }
+
+  // fold (sra (shl X, m), (sub result_size, n))
+  // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
+  // result_size - n != m.
+  // If truncate is free for the target sext(shl) is likely to result in better
+  // code.
+  if (N0.getOpcode() == ISD::SHL) {
+    // Get the two constanst of the shifts, CN0 = m, CN = n.
+    const ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    if (N01C && N1C) {
+      // Determine what the truncate's result bitsize and type would be.
+      EVT TruncVT =
+        EVT::getIntegerVT(*DAG.getContext(),
+                          OpSizeInBits - N1C->getZExtValue());
+      // Determine the residual right-shift amount.
+      signed ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
+
+      // If the shift is not a no-op (in which case this should be just a sign
+      // extend already), the truncated to type is legal, sign_extend is legal
+      // on that type, and the truncate to that type is both legal and free,
+      // perform the transform.
+      if ((ShiftAmt > 0) &&
+          TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
+          TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
+          TLI.isTruncateFree(VT, TruncVT)) {
+
+          SDValue Amt = DAG.getConstant(ShiftAmt,
+              getShiftAmountTy(N0.getOperand(0).getValueType()));
+          SDValue Shift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), VT,
+                                      N0.getOperand(0), Amt);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), TruncVT,
+                                      Shift);
+          return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(),
+                             N->getValueType(0), Trunc);
+      }
+    }
+  }
+
+  // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
+  if (N1.getOpcode() == ISD::TRUNCATE &&
+      N1.getOperand(0).getOpcode() == ISD::AND &&
+      N1.hasOneUse() && N1.getOperand(0).hasOneUse()) {
+    SDValue N101 = N1.getOperand(0).getOperand(1);
+    if (ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N101)) {
+      EVT TruncVT = N1.getValueType();
+      SDValue N100 = N1.getOperand(0).getOperand(0);
+      APInt TruncC = N101C->getAPIntValue();
+      TruncC = TruncC.trunc(TruncVT.getScalarType().getSizeInBits());
+      return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
+                         DAG.getNode(ISD::AND, N->getDebugLoc(),
+                                     TruncVT,
+                                     DAG.getNode(ISD::TRUNCATE,
+                                                 N->getDebugLoc(),
+                                                 TruncVT, N100),
+                                     DAG.getConstant(TruncC, TruncVT)));
+    }
+  }
+
+  // fold (sra (trunc (sr x, c1)), c2) -> (trunc (sra x, c1+c2))
+  //      if c1 is equal to the number of bits the trunc removes
+  if (N0.getOpcode() == ISD::TRUNCATE &&
+      (N0.getOperand(0).getOpcode() == ISD::SRL ||
+       N0.getOperand(0).getOpcode() == ISD::SRA) &&
+      N0.getOperand(0).hasOneUse() &&
+      N0.getOperand(0).getOperand(1).hasOneUse() &&
+      N1C && isa<ConstantSDNode>(N0.getOperand(0).getOperand(1))) {
+    EVT LargeVT = N0.getOperand(0).getValueType();
+    ConstantSDNode *LargeShiftAmt =
+      cast<ConstantSDNode>(N0.getOperand(0).getOperand(1));
+
+    if (LargeVT.getScalarType().getSizeInBits() - OpSizeInBits ==
+        LargeShiftAmt->getZExtValue()) {
+      SDValue Amt =
+        DAG.getConstant(LargeShiftAmt->getZExtValue() + N1C->getZExtValue(),
+              getShiftAmountTy(N0.getOperand(0).getOperand(0).getValueType()));
+      SDValue SRA = DAG.getNode(ISD::SRA, N->getDebugLoc(), LargeVT,
+                                N0.getOperand(0).getOperand(0), Amt);
+      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, SRA);
+    }
+  }
+
+  // Simplify, based on bits shifted out of the LHS.
+  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+
+  // If the sign bit is known to be zero, switch this to a SRL.
+  if (DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0, N1);
+
+  if (N1C) {
+    SDValue NewSRA = visitShiftByConstant(N, N1C->getZExtValue());
+    if (NewSRA.getNode())
+      return NewSRA;
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSRL(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+  unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+
+  // fold (srl c1, c2) -> c1 >>u c2
+  if (N0C && N1C)
+    return DAG.FoldConstantArithmetic(ISD::SRL, VT, N0C, N1C);
+  // fold (srl 0, x) -> 0
+  if (N0C && N0C->isNullValue())
+    return N0;
+  // fold (srl x, c >= size(x)) -> undef
+  if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+    return DAG.getUNDEF(VT);
+  // fold (srl x, 0) -> x
+  if (N1C && N1C->isNullValue())
+    return N0;
+  // if (srl x, c) is known to be zero, return 0
+  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
+                                   APInt::getAllOnesValue(OpSizeInBits)))
+    return DAG.getConstant(0, VT);
+
+  // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
+  if (N1C && N0.getOpcode() == ISD::SRL &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    uint64_t c1 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
+    uint64_t c2 = N1C->getZExtValue();
+    if (c1 + c2 >= OpSizeInBits)
+      return DAG.getConstant(0, VT);
+    return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getConstant(c1 + c2, N1.getValueType()));
+  }
+
+  // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2)))
+  if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
+      N0.getOperand(0).getOpcode() == ISD::SRL &&
+      isa<ConstantSDNode>(N0.getOperand(0)->getOperand(1))) {
+    uint64_t c1 =
+      cast<ConstantSDNode>(N0.getOperand(0)->getOperand(1))->getZExtValue();
+    uint64_t c2 = N1C->getZExtValue();
+    EVT InnerShiftVT = N0.getOperand(0).getValueType();
+    EVT ShiftCountVT = N0.getOperand(0)->getOperand(1).getValueType();
+    uint64_t InnerShiftSize = InnerShiftVT.getScalarType().getSizeInBits();
+    // This is only valid if the OpSizeInBits + c1 = size of inner shift.
+    if (c1 + OpSizeInBits == InnerShiftSize) {
+      if (c1 + c2 >= InnerShiftSize)
+        return DAG.getConstant(0, VT);
+      return DAG.getNode(ISD::TRUNCATE, N0->getDebugLoc(), VT,
+                         DAG.getNode(ISD::SRL, N0->getDebugLoc(), InnerShiftVT,
+                                     N0.getOperand(0)->getOperand(0),
+                                     DAG.getConstant(c1 + c2, ShiftCountVT)));
+    }
+  }
+
+  // fold (srl (shl x, c), c) -> (and x, cst2)
+  if (N1C && N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
+      N0.getValueSizeInBits() <= 64) {
+    uint64_t ShAmt = N1C->getZExtValue()+64-N0.getValueSizeInBits();
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getConstant(~0ULL >> ShAmt, VT));
+  }
+
+
+  // fold (srl (anyextend x), c) -> (anyextend (srl x, c))
+  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+    // Shifting in all undef bits?
+    EVT SmallVT = N0.getOperand(0).getValueType();
+    if (N1C->getZExtValue() >= SmallVT.getSizeInBits())
+      return DAG.getUNDEF(VT);
+
+    if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
+      uint64_t ShiftAmt = N1C->getZExtValue();
+      SDValue SmallShift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), SmallVT,
+                                       N0.getOperand(0),
+                          DAG.getConstant(ShiftAmt, getShiftAmountTy(SmallVT)));
+      AddToWorkList(SmallShift.getNode());
+      return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, SmallShift);
+    }
+  }
+
+  // fold (srl (sra X, Y), 31) -> (srl X, 31).  This srl only looks at the sign
+  // bit, which is unmodified by sra.
+  if (N1C && N1C->getZExtValue() + 1 == VT.getSizeInBits()) {
+    if (N0.getOpcode() == ISD::SRA)
+      return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0), N1);
+  }
+
+  // fold (srl (ctlz x), "5") -> x  iff x has one bit set (the low bit).
+  if (N1C && N0.getOpcode() == ISD::CTLZ &&
+      N1C->getAPIntValue() == Log2_32(VT.getSizeInBits())) {
+    APInt KnownZero, KnownOne;
+    DAG.ComputeMaskedBits(N0.getOperand(0), KnownZero, KnownOne);
+
+    // If any of the input bits are KnownOne, then the input couldn't be all
+    // zeros, thus the result of the srl will always be zero.
+    if (KnownOne.getBoolValue()) return DAG.getConstant(0, VT);
+
+    // If all of the bits input the to ctlz node are known to be zero, then
+    // the result of the ctlz is "32" and the result of the shift is one.
+    APInt UnknownBits = ~KnownZero;
+    if (UnknownBits == 0) return DAG.getConstant(1, VT);
+
+    // Otherwise, check to see if there is exactly one bit input to the ctlz.
+    if ((UnknownBits & (UnknownBits - 1)) == 0) {
+      // Okay, we know that only that the single bit specified by UnknownBits
+      // could be set on input to the CTLZ node. If this bit is set, the SRL
+      // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
+      // to an SRL/XOR pair, which is likely to simplify more.
+      unsigned ShAmt = UnknownBits.countTrailingZeros();
+      SDValue Op = N0.getOperand(0);
+
+      if (ShAmt) {
+        Op = DAG.getNode(ISD::SRL, N0.getDebugLoc(), VT, Op,
+                  DAG.getConstant(ShAmt, getShiftAmountTy(Op.getValueType())));
+        AddToWorkList(Op.getNode());
+      }
+
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+                         Op, DAG.getConstant(1, VT));
+    }
+  }
+
+  // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
+  if (N1.getOpcode() == ISD::TRUNCATE &&
+      N1.getOperand(0).getOpcode() == ISD::AND &&
+      N1.hasOneUse() && N1.getOperand(0).hasOneUse()) {
+    SDValue N101 = N1.getOperand(0).getOperand(1);
+    if (ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N101)) {
+      EVT TruncVT = N1.getValueType();
+      SDValue N100 = N1.getOperand(0).getOperand(0);
+      APInt TruncC = N101C->getAPIntValue();
+      TruncC = TruncC.trunc(TruncVT.getSizeInBits());
+      return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0,
+                         DAG.getNode(ISD::AND, N->getDebugLoc(),
+                                     TruncVT,
+                                     DAG.getNode(ISD::TRUNCATE,
+                                                 N->getDebugLoc(),
+                                                 TruncVT, N100),
+                                     DAG.getConstant(TruncC, TruncVT)));
+    }
+  }
+
+  // fold operands of srl based on knowledge that the low bits are not
+  // demanded.
+  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  if (N1C) {
+    SDValue NewSRL = visitShiftByConstant(N, N1C->getZExtValue());
+    if (NewSRL.getNode())
+      return NewSRL;
+  }
+
+  // Attempt to convert a srl of a load into a narrower zero-extending load.
+  SDValue NarrowLoad = ReduceLoadWidth(N);
+  if (NarrowLoad.getNode())
+    return NarrowLoad;
+
+  // Here is a common situation. We want to optimize:
+  //
+  //   %a = ...
+  //   %b = and i32 %a, 2
+  //   %c = srl i32 %b, 1
+  //   brcond i32 %c ...
+  //
+  // into
+  //
+  //   %a = ...
+  //   %b = and %a, 2
+  //   %c = setcc eq %b, 0
+  //   brcond %c ...
+  //
+  // However when after the source operand of SRL is optimized into AND, the SRL
+  // itself may not be optimized further. Look for it and add the BRCOND into
+  // the worklist.
+  if (N->hasOneUse()) {
+    SDNode *Use = *N->use_begin();
+    if (Use->getOpcode() == ISD::BRCOND)
+      AddToWorkList(Use);
+    else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
+      // Also look pass the truncate.
+      Use = *Use->use_begin();
+      if (Use->getOpcode() == ISD::BRCOND)
+        AddToWorkList(Use);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitCTLZ(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (ctlz c1) -> c2
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::CTLZ, N->getDebugLoc(), VT, N0);
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (ctlz_zero_undef c1) -> c2
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, N->getDebugLoc(), VT, N0);
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitCTTZ(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (cttz c1) -> c2
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::CTTZ, N->getDebugLoc(), VT, N0);
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (cttz_zero_undef c1) -> c2
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, N->getDebugLoc(), VT, N0);
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitCTPOP(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (ctpop c1) -> c2
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::CTPOP, N->getDebugLoc(), VT, N0);
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSELECT(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+  EVT VT = N->getValueType(0);
+  EVT VT0 = N0.getValueType();
+
+  // fold (select C, X, X) -> X
+  if (N1 == N2)
+    return N1;
+  // fold (select true, X, Y) -> X
+  if (N0C && !N0C->isNullValue())
+    return N1;
+  // fold (select false, X, Y) -> Y
+  if (N0C && N0C->isNullValue())
+    return N2;
+  // fold (select C, 1, X) -> (or C, X)
+  if (VT == MVT::i1 && N1C && N1C->getAPIntValue() == 1)
+    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N2);
+  // fold (select C, 0, 1) -> (xor C, 1)
+  if (VT.isInteger() &&
+      (VT0 == MVT::i1 ||
+       (VT0.isInteger() &&
+        TLI.getBooleanContents(false) ==
+        TargetLowering::ZeroOrOneBooleanContent)) &&
+      N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) {
+    SDValue XORNode;
+    if (VT == VT0)
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT0,
+                         N0, DAG.getConstant(1, VT0));
+    XORNode = DAG.getNode(ISD::XOR, N0.getDebugLoc(), VT0,
+                          N0, DAG.getConstant(1, VT0));
+    AddToWorkList(XORNode.getNode());
+    if (VT.bitsGT(VT0))
+      return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, XORNode);
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, XORNode);
+  }
+  // fold (select C, 0, X) -> (and (not C), X)
+  if (VT == VT0 && VT == MVT::i1 && N1C && N1C->isNullValue()) {
+    SDValue NOTNode = DAG.getNOT(N0.getDebugLoc(), N0, VT);
+    AddToWorkList(NOTNode.getNode());
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, NOTNode, N2);
+  }
+  // fold (select C, X, 1) -> (or (not C), X)
+  if (VT == VT0 && VT == MVT::i1 && N2C && N2C->getAPIntValue() == 1) {
+    SDValue NOTNode = DAG.getNOT(N0.getDebugLoc(), N0, VT);
+    AddToWorkList(NOTNode.getNode());
+    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, NOTNode, N1);
+  }
+  // fold (select C, X, 0) -> (and C, X)
+  if (VT == MVT::i1 && N2C && N2C->isNullValue())
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, N1);
+  // fold (select X, X, Y) -> (or X, Y)
+  // fold (select X, 1, Y) -> (or X, Y)
+  if (VT == MVT::i1 && (N0 == N1 || (N1C && N1C->getAPIntValue() == 1)))
+    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N2);
+  // fold (select X, Y, X) -> (and X, Y)
+  // fold (select X, Y, 0) -> (and X, Y)
+  if (VT == MVT::i1 && (N0 == N2 || (N2C && N2C->getAPIntValue() == 0)))
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, N1);
+
+  // If we can fold this based on the true/false value, do so.
+  if (SimplifySelectOps(N, N1, N2))
+    return SDValue(N, 0);  // Don't revisit N.
+
+  // fold selects based on a setcc into other things, such as min/max/abs
+  if (N0.getOpcode() == ISD::SETCC) {
+    // FIXME:
+    // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+    // having to say they don't support SELECT_CC on every type the DAG knows
+    // about, since there is no way to mark an opcode illegal at all value types
+    if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other) &&
+        TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))
+      return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT,
+                         N0.getOperand(0), N0.getOperand(1),
+                         N1, N2, N0.getOperand(2));
+    return SimplifySelect(N->getDebugLoc(), N0, N1, N2);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  SDValue N3 = N->getOperand(3);
+  SDValue N4 = N->getOperand(4);
+  ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
+
+  // fold select_cc lhs, rhs, x, x, cc -> x
+  if (N2 == N3)
+    return N2;
+
+  // Determine if the condition we're dealing with is constant
+  SDValue SCC = SimplifySetCC(TLI.getSetCCResultType(N0.getValueType()),
+                              N0, N1, CC, N->getDebugLoc(), false);
+  if (SCC.getNode()) AddToWorkList(SCC.getNode());
+
+  if (ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
+    if (!SCCC->isNullValue())
+      return N2;    // cond always true -> true val
+    else
+      return N3;    // cond always false -> false val
+  }
+
+  // Fold to a simpler select_cc
+  if (SCC.getNode() && SCC.getOpcode() == ISD::SETCC)
+    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N2.getValueType(),
+                       SCC.getOperand(0), SCC.getOperand(1), N2, N3,
+                       SCC.getOperand(2));
+
+  // If we can fold this based on the true/false value, do so.
+  if (SimplifySelectOps(N, N2, N3))
+    return SDValue(N, 0);  // Don't revisit N.
+
+  // fold select_cc into other things, such as min/max/abs
+  return SimplifySelectCC(N->getDebugLoc(), N0, N1, N2, N3, CC);
+}
+
+SDValue DAGCombiner::visitSETCC(SDNode *N) {
+  return SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1),
+                       cast<CondCodeSDNode>(N->getOperand(2))->get(),
+                       N->getDebugLoc());
+}
+
+// ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
+// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
+// transformation. Returns true if extension are possible and the above
+// mentioned transformation is profitable.
+static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
+                                    unsigned ExtOpc,
+                                    SmallVector<SDNode*, 4> &ExtendNodes,
+                                    const TargetLowering &TLI) {
+  bool HasCopyToRegUses = false;
+  bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType());
+  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+                            UE = N0.getNode()->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User == N)
+      continue;
+    if (UI.getUse().getResNo() != N0.getResNo())
+      continue;
+    // FIXME: Only extend SETCC N, N and SETCC N, c for now.
+    if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
+      ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
+      if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
+        // Sign bits will be lost after a zext.
+        return false;
+      bool Add = false;
+      for (unsigned i = 0; i != 2; ++i) {
+        SDValue UseOp = User->getOperand(i);
+        if (UseOp == N0)
+          continue;
+        if (!isa<ConstantSDNode>(UseOp))
+          return false;
+        Add = true;
+      }
+      if (Add)
+        ExtendNodes.push_back(User);
+      continue;
+    }
+    // If truncates aren't free and there are users we can't
+    // extend, it isn't worthwhile.
+    if (!isTruncFree)
+      return false;
+    // Remember if this value is live-out.
+    if (User->getOpcode() == ISD::CopyToReg)
+      HasCopyToRegUses = true;
+  }
+
+  if (HasCopyToRegUses) {
+    bool BothLiveOut = false;
+    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+         UI != UE; ++UI) {
+      SDUse &Use = UI.getUse();
+      if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
+        BothLiveOut = true;
+        break;
+      }
+    }
+    if (BothLiveOut)
+      // Both unextended and extended values are live out. There had better be
+      // a good reason for the transformation.
+      return ExtendNodes.size();
+  }
+  return true;
+}
+
+void DAGCombiner::ExtendSetCCUses(SmallVector<SDNode*, 4> SetCCs,
+                                  SDValue Trunc, SDValue ExtLoad, DebugLoc DL,
+                                  ISD::NodeType ExtType) {
+  // Extend SetCC uses if necessary.
+  for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
+    SDNode *SetCC = SetCCs[i];
+    SmallVector<SDValue, 4> Ops;
+
+    for (unsigned j = 0; j != 2; ++j) {
+      SDValue SOp = SetCC->getOperand(j);
+      if (SOp == Trunc)
+        Ops.push_back(ExtLoad);
+      else
+        Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
+    }
+
+    Ops.push_back(SetCC->getOperand(2));
+    CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0),
+                                 &Ops[0], Ops.size()));
+  }
+}
+
+SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (sext c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N0);
+
+  // fold (sext (sext x)) -> (sext x)
+  // fold (sext (aext x)) -> (sext x)
+  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+    return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT,
+                       N0.getOperand(0));
+
+  if (N0.getOpcode() == ISD::TRUNCATE) {
+    // fold (sext (truncate (load x))) -> (sext (smaller load x))
+    // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
+    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+    if (NarrowLoad.getNode()) {
+      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+      if (NarrowLoad.getNode() != N0.getNode()) {
+        CombineTo(N0.getNode(), NarrowLoad);
+        // CombineTo deleted the truncate, if needed, but not what's under it.
+        AddToWorkList(oye);
+      }
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+
+    // See if the value being truncated is already sign extended.  If so, just
+    // eliminate the trunc/sext pair.
+    SDValue Op = N0.getOperand(0);
+    unsigned OpBits   = Op.getValueType().getScalarType().getSizeInBits();
+    unsigned MidBits  = N0.getValueType().getScalarType().getSizeInBits();
+    unsigned DestBits = VT.getScalarType().getSizeInBits();
+    unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
+
+    if (OpBits == DestBits) {
+      // Op is i32, Mid is i8, and Dest is i32.  If Op has more than 24 sign
+      // bits, it is already ready.
+      if (NumSignBits > DestBits-MidBits)
+        return Op;
+    } else if (OpBits < DestBits) {
+      // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
+      // bits, just sext from i32.
+      if (NumSignBits > OpBits-MidBits)
+        return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, Op);
+    } else {
+      // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
+      // bits, just truncate to i32.
+      if (NumSignBits > OpBits-MidBits)
+        return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+    }
+
+    // fold (sext (truncate x)) -> (sextinreg x).
+    if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
+                                                 N0.getValueType())) {
+      if (OpBits < DestBits)
+        Op = DAG.getNode(ISD::ANY_EXTEND, N0.getDebugLoc(), VT, Op);
+      else if (OpBits > DestBits)
+        Op = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), VT, Op);
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, Op,
+                         DAG.getValueType(N0.getValueType()));
+    }
+  }
+
+  // fold (sext (load x)) -> (sext (truncate (sextload x)))
+  // None of the supported targets knows how to perform load and sign extend
+  // on vectors in one instruction.  We only perform this transformation on
+  // scalars.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()))) {
+    bool DoXform = true;
+    SmallVector<SDNode*, 4> SetCCs;
+    if (!N0.hasOneUse())
+      DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+    if (DoXform) {
+      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getPointerInfo(),
+                                       N0.getValueType(),
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                                  N0.getValueType(), ExtLoad);
+      CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                      ISD::SIGN_EXTEND);
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (sext (sextload x)) -> (sext (truncate (sextload x)))
+  // fold (sext ( extload x)) -> (sext (truncate (sextload x)))
+  if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
+      ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    if ((!LegalOperations && !LN0->isVolatile()) ||
+        TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) {
+      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getPointerInfo(),
+                                       MemVT,
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      CombineTo(N0.getNode(),
+                DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                            N0.getValueType(), ExtLoad),
+                ExtLoad.getValue(1));
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (sext (and/or/xor (load x), cst)) ->
+  //      (and/or/xor (sextload x), (sext cst))
+  if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
+       N0.getOpcode() == ISD::XOR) &&
+      isa<LoadSDNode>(N0.getOperand(0)) &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()) &&
+      (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
+    if (LN0->getExtensionType() != ISD::ZEXTLOAD) {
+      bool DoXform = true;
+      SmallVector<SDNode*, 4> SetCCs;
+      if (!N0.hasOneUse())
+        DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND,
+                                          SetCCs, TLI);
+      if (DoXform) {
+        SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, LN0->getDebugLoc(), VT,
+                                         LN0->getChain(), LN0->getBasePtr(),
+                                         LN0->getPointerInfo(),
+                                         LN0->getMemoryVT(),
+                                         LN0->isVolatile(),
+                                         LN0->isNonTemporal(),
+                                         LN0->getAlignment());
+        APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+        Mask = Mask.sext(VT.getSizeInBits());
+        SDValue And = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+                                  ExtLoad, DAG.getConstant(Mask, VT));
+        SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
+                                    N0.getOperand(0).getDebugLoc(),
+                                    N0.getOperand(0).getValueType(), ExtLoad);
+        CombineTo(N, And);
+        CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
+        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                        ISD::SIGN_EXTEND);
+        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      }
+    }
+  }
+
+  if (N0.getOpcode() == ISD::SETCC) {
+    // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
+    // Only do this before legalize for now.
+    if (VT.isVector() && !LegalOperations) {
+      EVT N0VT = N0.getOperand(0).getValueType();
+      // On some architectures (such as SSE/NEON/etc) the SETCC result type is
+      // of the same size as the compared operands. Only optimize sext(setcc())
+      // if this is the case.
+      EVT SVT = TLI.getSetCCResultType(N0VT);
+
+      // We know that the # elements of the results is the same as the
+      // # elements of the compare (and the # elements of the compare result
+      // for that matter).  Check to see that they are the same size.  If so,
+      // we know that the element size of the sext'd result matches the
+      // element size of the compare operands.
+      if (VT.getSizeInBits() == SVT.getSizeInBits())
+        return DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+                             N0.getOperand(1),
+                             cast<CondCodeSDNode>(N0.getOperand(2))->get());
+      // If the desired elements are smaller or larger than the source
+      // elements we can use a matching integer vector type and then
+      // truncate/sign extend
+      EVT MatchingElementType =
+        EVT::getIntegerVT(*DAG.getContext(),
+                          N0VT.getScalarType().getSizeInBits());
+      EVT MatchingVectorType =
+        EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+                         N0VT.getVectorNumElements());
+
+      if (SVT == MatchingVectorType) {
+        SDValue VsetCC = DAG.getSetCC(N->getDebugLoc(), MatchingVectorType,
+                               N0.getOperand(0), N0.getOperand(1),
+                               cast<CondCodeSDNode>(N0.getOperand(2))->get());
+        return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
+      }
+    }
+
+    // sext(setcc x, y, cc) -> (select_cc x, y, -1, 0, cc)
+    unsigned ElementWidth = VT.getScalarType().getSizeInBits();
+    SDValue NegOne =
+      DAG.getConstant(APInt::getAllOnesValue(ElementWidth), VT);
+    SDValue SCC =
+      SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+                       NegOne, DAG.getConstant(0, VT),
+                       cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
+    if (SCC.getNode()) return SCC;
+    if (!VT.isVector() && (!LegalOperations ||
+        TLI.isOperationLegal(ISD::SETCC, TLI.getSetCCResultType(VT))))
+      return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
+                         DAG.getSetCC(N->getDebugLoc(),
+                                      TLI.getSetCCResultType(VT),
+                                      N0.getOperand(0), N0.getOperand(1),
+                                 cast<CondCodeSDNode>(N0.getOperand(2))->get()),
+                         NegOne, DAG.getConstant(0, VT));
+  }
+
+  // fold (sext x) -> (zext x) if the sign bit is known zero.
+  if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
+      DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, N0);
+
+  return SDValue();
+}
+
+// isTruncateOf - If N is a truncate of some other value, return true, record
+// the value being truncated in Op and which of Op's bits are zero in KnownZero.
+// This function computes KnownZero to avoid a duplicated call to
+// ComputeMaskedBits in the caller.
+static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
+                         APInt &KnownZero) {
+  APInt KnownOne;
+  if (N->getOpcode() == ISD::TRUNCATE) {
+    Op = N->getOperand(0);
+    DAG.ComputeMaskedBits(Op, KnownZero, KnownOne);
+    return true;
+  }
+
+  if (N->getOpcode() != ISD::SETCC || N->getValueType(0) != MVT::i1 ||
+      cast<CondCodeSDNode>(N->getOperand(2))->get() != ISD::SETNE)
+    return false;
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  assert(Op0.getValueType() == Op1.getValueType());
+
+  ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
+  ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
+  if (COp0 && COp0->isNullValue())
+    Op = Op1;
+  else if (COp1 && COp1->isNullValue())
+    Op = Op0;
+  else
+    return false;
+
+  DAG.ComputeMaskedBits(Op, KnownZero, KnownOne);
+
+  if (!(KnownZero | APInt(Op.getValueSizeInBits(), 1)).isAllOnesValue())
+    return false;
+
+  return true;
+}
+
+SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (zext c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, N0);
+  // fold (zext (zext x)) -> (zext x)
+  // fold (zext (aext x)) -> (zext x)
+  if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT,
+                       N0.getOperand(0));
+
+  // fold (zext (truncate x)) -> (zext x) or
+  //      (zext (truncate x)) -> (truncate x)
+  // This is valid when the truncated bits of x are already zero.
+  // FIXME: We should extend this to work for vectors too.
+  SDValue Op;
+  APInt KnownZero;
+  if (!VT.isVector() && isTruncateOf(DAG, N0, Op, KnownZero)) {
+    APInt TruncatedBits =
+      (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ?
+      APInt(Op.getValueSizeInBits(), 0) :
+      APInt::getBitsSet(Op.getValueSizeInBits(),
+                        N0.getValueSizeInBits(),
+                        std::min(Op.getValueSizeInBits(),
+                                 VT.getSizeInBits()));
+    if (TruncatedBits == (KnownZero & TruncatedBits)) {
+      if (VT.bitsGT(Op.getValueType()))
+        return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, Op);
+      if (VT.bitsLT(Op.getValueType()))
+        return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+
+      return Op;
+    }
+  }
+
+  // fold (zext (truncate (load x))) -> (zext (smaller load x))
+  // fold (zext (truncate (srl (load x), c))) -> (zext (small load (x+c/n)))
+  if (N0.getOpcode() == ISD::TRUNCATE) {
+    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+    if (NarrowLoad.getNode()) {
+      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+      if (NarrowLoad.getNode() != N0.getNode()) {
+        CombineTo(N0.getNode(), NarrowLoad);
+        // CombineTo deleted the truncate, if needed, but not what's under it.
+        AddToWorkList(oye);
+      }
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (zext (truncate x)) -> (and x, mask)
+  if (N0.getOpcode() == ISD::TRUNCATE &&
+      (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT))) {
+
+    // fold (zext (truncate (load x))) -> (zext (smaller load x))
+    // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
+    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+    if (NarrowLoad.getNode()) {
+      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+      if (NarrowLoad.getNode() != N0.getNode()) {
+        CombineTo(N0.getNode(), NarrowLoad);
+        // CombineTo deleted the truncate, if needed, but not what's under it.
+        AddToWorkList(oye);
+      }
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+
+    SDValue Op = N0.getOperand(0);
+    if (Op.getValueType().bitsLT(VT)) {
+      Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op);
+      AddToWorkList(Op.getNode());
+    } else if (Op.getValueType().bitsGT(VT)) {
+      Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+      AddToWorkList(Op.getNode());
+    }
+    return DAG.getZeroExtendInReg(Op, N->getDebugLoc(),
+                                  N0.getValueType().getScalarType());
+  }
+
+  // Fold (zext (and (trunc x), cst)) -> (and x, cst),
+  // if either of the casts is not free.
+  if (N0.getOpcode() == ISD::AND &&
+      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+                           N0.getValueType()) ||
+       !TLI.isZExtFree(N0.getValueType(), VT))) {
+    SDValue X = N0.getOperand(0).getOperand(0);
+    if (X.getValueType().bitsLT(VT)) {
+      X = DAG.getNode(ISD::ANY_EXTEND, X.getDebugLoc(), VT, X);
+    } else if (X.getValueType().bitsGT(VT)) {
+      X = DAG.getNode(ISD::TRUNCATE, X.getDebugLoc(), VT, X);
+    }
+    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+    Mask = Mask.zext(VT.getSizeInBits());
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                       X, DAG.getConstant(Mask, VT));
+  }
+
+  // fold (zext (load x)) -> (zext (truncate (zextload x)))
+  // None of the supported targets knows how to perform load and vector_zext
+  // on vectors in one instruction.  We only perform this transformation on
+  // scalars.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()))) {
+    bool DoXform = true;
+    SmallVector<SDNode*, 4> SetCCs;
+    if (!N0.hasOneUse())
+      DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+    if (DoXform) {
+      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT,
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getPointerInfo(),
+                                       N0.getValueType(),
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                                  N0.getValueType(), ExtLoad);
+      CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                      ISD::ZERO_EXTEND);
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (zext (and/or/xor (load x), cst)) ->
+  //      (and/or/xor (zextload x), (zext cst))
+  if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
+       N0.getOpcode() == ISD::XOR) &&
+      isa<LoadSDNode>(N0.getOperand(0)) &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()) &&
+      (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
+    if (LN0->getExtensionType() != ISD::SEXTLOAD) {
+      bool DoXform = true;
+      SmallVector<SDNode*, 4> SetCCs;
+      if (!N0.hasOneUse())
+        DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::ZERO_EXTEND,
+                                          SetCCs, TLI);
+      if (DoXform) {
+        SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), VT,
+                                         LN0->getChain(), LN0->getBasePtr(),
+                                         LN0->getPointerInfo(),
+                                         LN0->getMemoryVT(),
+                                         LN0->isVolatile(),
+                                         LN0->isNonTemporal(),
+                                         LN0->getAlignment());
+        APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+        Mask = Mask.zext(VT.getSizeInBits());
+        SDValue And = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+                                  ExtLoad, DAG.getConstant(Mask, VT));
+        SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
+                                    N0.getOperand(0).getDebugLoc(),
+                                    N0.getOperand(0).getValueType(), ExtLoad);
+        CombineTo(N, And);
+        CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
+        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                        ISD::ZERO_EXTEND);
+        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      }
+    }
+  }
+
+  // fold (zext (zextload x)) -> (zext (truncate (zextload x)))
+  // fold (zext ( extload x)) -> (zext (truncate (zextload x)))
+  if ((ISD::isZEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
+      ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    if ((!LegalOperations && !LN0->isVolatile()) ||
+        TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) {
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT,
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getPointerInfo(),
+                                       MemVT,
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      CombineTo(N0.getNode(),
+                DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), N0.getValueType(),
+                            ExtLoad),
+                ExtLoad.getValue(1));
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  if (N0.getOpcode() == ISD::SETCC) {
+    if (!LegalOperations && VT.isVector()) {
+      // zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors.
+      // Only do this before legalize for now.
+      EVT N0VT = N0.getOperand(0).getValueType();
+      EVT EltVT = VT.getVectorElementType();
+      SmallVector<SDValue,8> OneOps(VT.getVectorNumElements(),
+                                    DAG.getConstant(1, EltVT));
+      if (VT.getSizeInBits() == N0VT.getSizeInBits())
+        // We know that the # elements of the results is the same as the
+        // # elements of the compare (and the # elements of the compare result
+        // for that matter).  Check to see that they are the same size.  If so,
+        // we know that the element size of the sext'd result matches the
+        // element size of the compare operands.
+        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                           DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+                                         N0.getOperand(1),
+                                 cast<CondCodeSDNode>(N0.getOperand(2))->get()),
+                           DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+                                       &OneOps[0], OneOps.size()));
+
+      // If the desired elements are smaller or larger than the source
+      // elements we can use a matching integer vector type and then
+      // truncate/sign extend
+      EVT MatchingElementType =
+        EVT::getIntegerVT(*DAG.getContext(),
+                          N0VT.getScalarType().getSizeInBits());
+      EVT MatchingVectorType =
+        EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+                         N0VT.getVectorNumElements());
+      SDValue VsetCC =
+        DAG.getSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+                      N0.getOperand(1),
+                      cast<CondCodeSDNode>(N0.getOperand(2))->get());
+      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                         DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT),
+                         DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+                                     &OneOps[0], OneOps.size()));
+    }
+
+    // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
+    SDValue SCC =
+      SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+                       DAG.getConstant(1, VT), DAG.getConstant(0, VT),
+                       cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
+    if (SCC.getNode()) return SCC;
+  }
+
+  // (zext (shl (zext x), cst)) -> (shl (zext x), cst)
+  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
+      isa<ConstantSDNode>(N0.getOperand(1)) &&
+      N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+      N0.hasOneUse()) {
+    SDValue ShAmt = N0.getOperand(1);
+    unsigned ShAmtVal = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+    if (N0.getOpcode() == ISD::SHL) {
+      SDValue InnerZExt = N0.getOperand(0);
+      // If the original shl may be shifting out bits, do not perform this
+      // transformation.
+      unsigned KnownZeroBits = InnerZExt.getValueType().getSizeInBits() -
+        InnerZExt.getOperand(0).getValueType().getSizeInBits();
+      if (ShAmtVal > KnownZeroBits)
+        return SDValue();
+    }
+
+    DebugLoc DL = N->getDebugLoc();
+
+    // Ensure that the shift amount is wide enough for the shifted value.
+    if (VT.getSizeInBits() >= 256)
+      ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);
+
+    return DAG.getNode(N0.getOpcode(), DL, VT,
+                       DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
+                       ShAmt);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (aext c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, N0);
+  // fold (aext (aext x)) -> (aext x)
+  // fold (aext (zext x)) -> (zext x)
+  // fold (aext (sext x)) -> (sext x)
+  if (N0.getOpcode() == ISD::ANY_EXTEND  ||
+      N0.getOpcode() == ISD::ZERO_EXTEND ||
+      N0.getOpcode() == ISD::SIGN_EXTEND)
+    return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, N0.getOperand(0));
+
+  // fold (aext (truncate (load x))) -> (aext (smaller load x))
+  // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
+  if (N0.getOpcode() == ISD::TRUNCATE) {
+    SDValue NarrowLoad = ReduceLoadWidth(N0.getNode());
+    if (NarrowLoad.getNode()) {
+      SDNode* oye = N0.getNode()->getOperand(0).getNode();
+      if (NarrowLoad.getNode() != N0.getNode()) {
+        CombineTo(N0.getNode(), NarrowLoad);
+        // CombineTo deleted the truncate, if needed, but not what's under it.
+        AddToWorkList(oye);
+      }
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (aext (truncate x))
+  if (N0.getOpcode() == ISD::TRUNCATE) {
+    SDValue TruncOp = N0.getOperand(0);
+    if (TruncOp.getValueType() == VT)
+      return TruncOp; // x iff x size == zext size.
+    if (TruncOp.getValueType().bitsGT(VT))
+      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, TruncOp);
+    return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, TruncOp);
+  }
+
+  // Fold (aext (and (trunc x), cst)) -> (and x, cst)
+  // if the trunc is not free.
+  if (N0.getOpcode() == ISD::AND &&
+      N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+                          N0.getValueType())) {
+    SDValue X = N0.getOperand(0).getOperand(0);
+    if (X.getValueType().bitsLT(VT)) {
+      X = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, X);
+    } else if (X.getValueType().bitsGT(VT)) {
+      X = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, X);
+    }
+    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+    Mask = Mask.zext(VT.getSizeInBits());
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                       X, DAG.getConstant(Mask, VT));
+  }
+
+  // fold (aext (load x)) -> (aext (truncate (extload x)))
+  // None of the supported targets knows how to perform load and any_ext
+  // on vectors in one instruction.  We only perform this transformation on
+  // scalars.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
+    bool DoXform = true;
+    SmallVector<SDNode*, 4> SetCCs;
+    if (!N0.hasOneUse())
+      DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
+    if (DoXform) {
+      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+      SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT,
+                                       LN0->getChain(),
+                                       LN0->getBasePtr(), LN0->getPointerInfo(),
+                                       N0.getValueType(),
+                                       LN0->isVolatile(), LN0->isNonTemporal(),
+                                       LN0->getAlignment());
+      CombineTo(N, ExtLoad);
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                                  N0.getValueType(), ExtLoad);
+      CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+                      ISD::ANY_EXTEND);
+      return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+    }
+  }
+
+  // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
+  // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
+  // fold (aext ( extload x)) -> (aext (truncate (extload  x)))
+  if (N0.getOpcode() == ISD::LOAD &&
+      !ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      N0.hasOneUse()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    EVT MemVT = LN0->getMemoryVT();
+    SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), N->getDebugLoc(),
+                                     VT, LN0->getChain(), LN0->getBasePtr(),
+                                     LN0->getPointerInfo(), MemVT,
+                                     LN0->isVolatile(), LN0->isNonTemporal(),
+                                     LN0->getAlignment());
+    CombineTo(N, ExtLoad);
+    CombineTo(N0.getNode(),
+              DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                          N0.getValueType(), ExtLoad),
+              ExtLoad.getValue(1));
+    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+  }
+
+  if (N0.getOpcode() == ISD::SETCC) {
+    // aext(setcc) -> sext_in_reg(vsetcc) for vectors.
+    // Only do this before legalize for now.
+    if (VT.isVector() && !LegalOperations) {
+      EVT N0VT = N0.getOperand(0).getValueType();
+        // We know that the # elements of the results is the same as the
+        // # elements of the compare (and the # elements of the compare result
+        // for that matter).  Check to see that they are the same size.  If so,
+        // we know that the element size of the sext'd result matches the
+        // element size of the compare operands.
+      if (VT.getSizeInBits() == N0VT.getSizeInBits())
+        return DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+                             N0.getOperand(1),
+                             cast<CondCodeSDNode>(N0.getOperand(2))->get());
+      // If the desired elements are smaller or larger than the source
+      // elements we can use a matching integer vector type and then
+      // truncate/sign extend
+      else {
+        EVT MatchingElementType =
+          EVT::getIntegerVT(*DAG.getContext(),
+                            N0VT.getScalarType().getSizeInBits());
+        EVT MatchingVectorType =
+          EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+                           N0VT.getVectorNumElements());
+        SDValue VsetCC =
+          DAG.getSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+                        N0.getOperand(1),
+                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
+        return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
+      }
+    }
+
+    // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
+    SDValue SCC =
+      SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+                       DAG.getConstant(1, VT), DAG.getConstant(0, VT),
+                       cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
+    if (SCC.getNode())
+      return SCC;
+  }
+
+  return SDValue();
+}
+
+/// GetDemandedBits - See if the specified operand can be simplified with the
+/// knowledge that only the bits specified by Mask are used.  If so, return the
+/// simpler operand, otherwise return a null SDValue.
+SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
+  switch (V.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    const ConstantSDNode *CV = cast<ConstantSDNode>(V.getNode());
+    assert(CV != 0 && "Const value should be ConstSDNode.");
+    const APInt &CVal = CV->getAPIntValue();
+    APInt NewVal = CVal & Mask;
+    if (NewVal != CVal) {
+      return DAG.getConstant(NewVal, V.getValueType());
+    }
+    break;
+  }
+  case ISD::OR:
+  case ISD::XOR:
+    // If the LHS or RHS don't contribute bits to the or, drop them.
+    if (DAG.MaskedValueIsZero(V.getOperand(0), Mask))
+      return V.getOperand(1);
+    if (DAG.MaskedValueIsZero(V.getOperand(1), Mask))
+      return V.getOperand(0);
+    break;
+  case ISD::SRL:
+    // Only look at single-use SRLs.
+    if (!V.getNode()->hasOneUse())
+      break;
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
+      // See if we can recursively simplify the LHS.
+      unsigned Amt = RHSC->getZExtValue();
+
+      // Watch out for shift count overflow though.
+      if (Amt >= Mask.getBitWidth()) break;
+      APInt NewMask = Mask << Amt;
+      SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask);
+      if (SimplifyLHS.getNode())
+        return DAG.getNode(ISD::SRL, V.getDebugLoc(), V.getValueType(),
+                           SimplifyLHS, V.getOperand(1));
+    }
+  }
+  return SDValue();
+}
+
+/// ReduceLoadWidth - If the result of a wider load is shifted to right of N
+/// bits and then truncated to a narrower type and where N is a multiple
+/// of number of bits of the narrower type, transform it to a narrower load
+/// from address + N / num of bits of new type. If the result is to be
+/// extended, also fold the extension to form a extending load.
+SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
+  unsigned Opc = N->getOpcode();
+
+  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT ExtVT = VT;
+
+  // This transformation isn't valid for vector loads.
+  if (VT.isVector())
+    return SDValue();
+
+  // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
+  // extended to VT.
+  if (Opc == ISD::SIGN_EXTEND_INREG) {
+    ExtType = ISD::SEXTLOAD;
+    ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+  } else if (Opc == ISD::SRL) {
+    // Another special-case: SRL is basically zero-extending a narrower value.
+    ExtType = ISD::ZEXTLOAD;
+    N0 = SDValue(N, 0);
+    ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    if (!N01) return SDValue();
+    ExtVT = EVT::getIntegerVT(*DAG.getContext(),
+                              VT.getSizeInBits() - N01->getZExtValue());
+  }
+  if (LegalOperations && !TLI.isLoadExtLegal(ExtType, ExtVT))
+    return SDValue();
+
+  unsigned EVTBits = ExtVT.getSizeInBits();
+
+  // Do not generate loads of non-round integer types since these can
+  // be expensive (and would be wrong if the type is not byte sized).
+  if (!ExtVT.isRound())
+    return SDValue();
+
+  unsigned ShAmt = 0;
+  if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
+    if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+      ShAmt = N01->getZExtValue();
+      // Is the shift amount a multiple of size of VT?
+      if ((ShAmt & (EVTBits-1)) == 0) {
+        N0 = N0.getOperand(0);
+        // Is the load width a multiple of size of VT?
+        if ((N0.getValueType().getSizeInBits() & (EVTBits-1)) != 0)
+          return SDValue();
+      }
+
+      // At this point, we must have a load or else we can't do the transform.
+      if (!isa<LoadSDNode>(N0)) return SDValue();
+
+      // Because a SRL must be assumed to *need* to zero-extend the high bits
+      // (as opposed to anyext the high bits), we can't combine the zextload
+      // lowering of SRL and an sextload.
+      if (cast<LoadSDNode>(N0)->getExtensionType() == ISD::SEXTLOAD)
+        return SDValue();
+
+      // If the shift amount is larger than the input type then we're not
+      // accessing any of the loaded bytes.  If the load was a zextload/extload
+      // then the result of the shift+trunc is zero/undef (handled elsewhere).
+      if (ShAmt >= cast<LoadSDNode>(N0)->getMemoryVT().getSizeInBits())
+        return SDValue();
+    }
+  }
+
+  // If the load is shifted left (and the result isn't shifted back right),
+  // we can fold the truncate through the shift.
+  unsigned ShLeftAmt = 0;
+  if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
+      ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+    if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+      ShLeftAmt = N01->getZExtValue();
+      N0 = N0.getOperand(0);
+    }
+  }
+
+  // If we haven't found a load, we can't narrow it.  Don't transform one with
+  // multiple uses, this would require adding a new load.
+  if (!isa<LoadSDNode>(N0) || !N0.hasOneUse())
+    return SDValue();
+
+  // Don't change the width of a volatile load.
+  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+  if (LN0->isVolatile())
+    return SDValue();
+
+  // Verify that we are actually reducing a load width here.
+  if (LN0->getMemoryVT().getSizeInBits() < EVTBits)
+    return SDValue();
+
+  // For the transform to be legal, the load must produce only two values
+  // (the value loaded and the chain).  Don't transform a pre-increment
+  // load, for example, which produces an extra value.  Otherwise the 
+  // transformation is not equivalent, and the downstream logic to replace
+  // uses gets things wrong.
+  if (LN0->getNumValues() > 2)
+    return SDValue();
+
+  EVT PtrType = N0.getOperand(1).getValueType();
+
+  if (PtrType == MVT::Untyped || PtrType.isExtended())
+    // It's not possible to generate a constant of extended or untyped type.
+    return SDValue();
+
+  // For big endian targets, we need to adjust the offset to the pointer to
+  // load the correct bytes.
+  if (TLI.isBigEndian()) {
+    unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
+    unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
+    ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
+  }
+
+  uint64_t PtrOff = ShAmt / 8;
+  unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
+  SDValue NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(),
+                               PtrType, LN0->getBasePtr(),
+                               DAG.getConstant(PtrOff, PtrType));
+  AddToWorkList(NewPtr.getNode());
+
+  SDValue Load;
+  if (ExtType == ISD::NON_EXTLOAD)
+    Load =  DAG.getLoad(VT, N0.getDebugLoc(), LN0->getChain(), NewPtr,
+                        LN0->getPointerInfo().getWithOffset(PtrOff),
+                        LN0->isVolatile(), LN0->isNonTemporal(),
+                        LN0->isInvariant(), NewAlign);
+  else
+    Load = DAG.getExtLoad(ExtType, N0.getDebugLoc(), VT, LN0->getChain(),NewPtr,
+                          LN0->getPointerInfo().getWithOffset(PtrOff),
+                          ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
+                          NewAlign);
+
+  // Replace the old load's chain with the new load's chain.
+  WorkListRemover DeadNodes(*this);
+  DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+
+  // Shift the result left, if we've swallowed a left shift.
+  SDValue Result = Load;
+  if (ShLeftAmt != 0) {
+    EVT ShImmTy = getShiftAmountTy(Result.getValueType());
+    if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt))
+      ShImmTy = VT;
+    // If the shift amount is as large as the result size (but, presumably,
+    // no larger than the source) then the useful bits of the result are
+    // zero; we can't simply return the shortened shift, because the result
+    // of that operation is undefined.
+    if (ShLeftAmt >= VT.getSizeInBits())
+      Result = DAG.getConstant(0, VT);
+    else
+      Result = DAG.getNode(ISD::SHL, N0.getDebugLoc(), VT,
+                          Result, DAG.getConstant(ShLeftAmt, ShImmTy));
+  }
+
+  // Return the new loaded value.
+  return Result;
+}
+
+SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  EVT EVT = cast<VTSDNode>(N1)->getVT();
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
+  unsigned EVTBits = EVT.getScalarType().getSizeInBits();
+
+  // fold (sext_in_reg c1) -> c1
+  if (isa<ConstantSDNode>(N0) || N0.getOpcode() == ISD::UNDEF)
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, N0, N1);
+
+  // If the input is already sign extended, just drop the extension.
+  if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1)
+    return N0;
+
+  // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
+  if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+      EVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT())) {
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+                       N0.getOperand(0), N1);
+  }
+
+  // fold (sext_in_reg (sext x)) -> (sext x)
+  // fold (sext_in_reg (aext x)) -> (sext x)
+  // if x is small enough.
+  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getValueType().getScalarType().getSizeInBits() <= EVTBits &&
+        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
+      return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N00, N1);
+  }
+
+  // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
+  if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits)))
+    return DAG.getZeroExtendInReg(N0, N->getDebugLoc(), EVT);
+
+  // fold operands of sext_in_reg based on knowledge that the top bits are not
+  // demanded.
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  // fold (sext_in_reg (load x)) -> (smaller sextload x)
+  // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
+  SDValue NarrowLoad = ReduceLoadWidth(N);
+  if (NarrowLoad.getNode())
+    return NarrowLoad;
+
+  // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
+  // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
+  // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
+  if (N0.getOpcode() == ISD::SRL) {
+    if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+      if (ShAmt->getZExtValue()+EVTBits <= VTBits) {
+        // We can turn this into an SRA iff the input to the SRL is already sign
+        // extended enough.
+        unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
+        if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
+          return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT,
+                             N0.getOperand(0), N0.getOperand(1));
+      }
+  }
+
+  // fold (sext_inreg (extload x)) -> (sextload x)
+  if (ISD::isEXTLoad(N0.getNode()) &&
+      ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+                                     LN0->getChain(),
+                                     LN0->getBasePtr(), LN0->getPointerInfo(),
+                                     EVT,
+                                     LN0->isVolatile(), LN0->isNonTemporal(),
+                                     LN0->getAlignment());
+    CombineTo(N, ExtLoad);
+    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+    AddToWorkList(ExtLoad.getNode());
+    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+  }
+  // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
+  if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      N0.hasOneUse() &&
+      EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+                                     LN0->getChain(),
+                                     LN0->getBasePtr(), LN0->getPointerInfo(),
+                                     EVT,
+                                     LN0->isVolatile(), LN0->isNonTemporal(),
+                                     LN0->getAlignment());
+    CombineTo(N, ExtLoad);
+    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+  }
+
+  // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
+  if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) {
+    SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+                                       N0.getOperand(1), false);
+    if (BSwap.getNode() != 0)
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+                         BSwap, N1);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  bool isLE = TLI.isLittleEndian();
+
+  // noop truncate
+  if (N0.getValueType() == N->getValueType(0))
+    return N0;
+  // fold (truncate c1) -> c1
+  if (isa<ConstantSDNode>(N0))
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0);
+  // fold (truncate (truncate x)) -> (truncate x)
+  if (N0.getOpcode() == ISD::TRUNCATE)
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
+  // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
+  if (N0.getOpcode() == ISD::ZERO_EXTEND ||
+      N0.getOpcode() == ISD::SIGN_EXTEND ||
+      N0.getOpcode() == ISD::ANY_EXTEND) {
+    if (N0.getOperand(0).getValueType().bitsLT(VT))
+      // if the source is smaller than the dest, we still need an extend
+      return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+                         N0.getOperand(0));
+    if (N0.getOperand(0).getValueType().bitsGT(VT))
+      // if the source is larger than the dest, than we just need the truncate
+      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
+    // if the source and dest are the same type, we can drop both the extend
+    // and the truncate.
+    return N0.getOperand(0);
+  }
+
+  // Fold extract-and-trunc into a narrow extract. For example:
+  //   i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
+  //   i32 y = TRUNCATE(i64 x)
+  //        -- becomes --
+  //   v16i8 b = BITCAST (v2i64 val)
+  //   i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
+  //
+  // Note: We only run this optimization after type legalization (which often
+  // creates this pattern) and before operation legalization after which
+  // we need to be more careful about the vector instructions that we generate.
+  if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      LegalTypes && !LegalOperations && N0->hasOneUse()) {
+
+    EVT VecTy = N0.getOperand(0).getValueType();
+    EVT ExTy = N0.getValueType();
+    EVT TrTy = N->getValueType(0);
+
+    unsigned NumElem = VecTy.getVectorNumElements();
+    unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();
+
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem);
+    assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");
+
+    SDValue EltNo = N0->getOperand(1);
+    if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
+      int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+      EVT IndexTy = N0->getOperand(1).getValueType();
+      int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
+
+      SDValue V = DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+                              NVT, N0.getOperand(0));
+
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+                         N->getDebugLoc(), TrTy, V,
+                         DAG.getConstant(Index, IndexTy));
+    }
+  }
+
+  // Fold a series of buildvector, bitcast, and truncate if possible.
+  // For example fold
+  //   (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
+  //   (2xi32 (buildvector x, y)).
+  if (Level == AfterLegalizeVectorOps && VT.isVector() &&
+      N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
+      N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+      N0.getOperand(0).hasOneUse()) {
+
+    SDValue BuildVect = N0.getOperand(0);
+    EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
+    EVT TruncVecEltTy = VT.getVectorElementType();
+
+    // Check that the element types match.
+    if (BuildVectEltTy == TruncVecEltTy) {
+      // Now we only need to compute the offset of the truncated elements.
+      unsigned BuildVecNumElts =  BuildVect.getNumOperands();
+      unsigned TruncVecNumElts = VT.getVectorNumElements();
+      unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;
+
+      assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
+             "Invalid number of elements");
+
+      SmallVector<SDValue, 8> Opnds;
+      for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
+        Opnds.push_back(BuildVect.getOperand(i));
+
+      return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT, &Opnds[0],
+                         Opnds.size());
+    }
+  }
+
+  // See if we can simplify the input to this truncate through knowledge that
+  // only the low bits are being used.
+  // For example "trunc (or (shl x, 8), y)" // -> trunc y
+  // Currently we only perform this optimization on scalars because vectors
+  // may have different active low bits.
+  if (!VT.isVector()) {
+    SDValue Shorter =
+      GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(),
+                                               VT.getSizeInBits()));
+    if (Shorter.getNode())
+      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Shorter);
+  }
+  // fold (truncate (load x)) -> (smaller load x)
+  // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
+  if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
+    SDValue Reduced = ReduceLoadWidth(N);
+    if (Reduced.getNode())
+      return Reduced;
+  }
+  // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
+  // where ... are all 'undef'.
+  if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
+    SmallVector<EVT, 8> VTs;
+    SDValue V;
+    unsigned Idx = 0;
+    unsigned NumDefs = 0;
+
+    for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
+      SDValue X = N0.getOperand(i);
+      if (X.getOpcode() != ISD::UNDEF) {
+        V = X;
+        Idx = i;
+        NumDefs++;
+      }
+      // Stop if more than one members are non-undef.
+      if (NumDefs > 1)
+        break;
+      VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
+                                     VT.getVectorElementType(),
+                                     X.getValueType().getVectorNumElements()));
+    }
+
+    if (NumDefs == 0)
+      return DAG.getUNDEF(VT);
+
+    if (NumDefs == 1) {
+      assert(V.getNode() && "The single defined operand is empty!");
+      SmallVector<SDValue, 8> Opnds;
+      for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+        if (i != Idx) {
+          Opnds.push_back(DAG.getUNDEF(VTs[i]));
+          continue;
+        }
+        SDValue NV = DAG.getNode(ISD::TRUNCATE, V.getDebugLoc(), VTs[i], V);
+        AddToWorkList(NV.getNode());
+        Opnds.push_back(NV);
+      }
+      return DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+                         &Opnds[0], Opnds.size());
+    }
+  }
+
+  // Simplify the operands using demanded-bits information.
+  if (!VT.isVector() &&
+      SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
+  SDValue Elt = N->getOperand(i);
+  if (Elt.getOpcode() != ISD::MERGE_VALUES)
+    return Elt.getNode();
+  return Elt.getOperand(Elt.getResNo()).getNode();
+}
+
+/// CombineConsecutiveLoads - build_pair (load, load) -> load
+/// if load locations are consecutive.
+SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
+  assert(N->getOpcode() == ISD::BUILD_PAIR);
+
+  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
+  LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
+  if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
+      LD1->getPointerInfo().getAddrSpace() !=
+         LD2->getPointerInfo().getAddrSpace())
+    return SDValue();
+  EVT LD1VT = LD1->getValueType(0);
+
+  if (ISD::isNON_EXTLoad(LD2) &&
+      LD2->hasOneUse() &&
+      // If both are volatile this would reduce the number of volatile loads.
+      // If one is volatile it might be ok, but play conservative and bail out.
+      !LD1->isVolatile() &&
+      !LD2->isVolatile() &&
+      DAG.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1)) {
+    unsigned Align = LD1->getAlignment();
+    unsigned NewAlign = TLI.getDataLayout()->
+      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
+    if (NewAlign <= Align &&
+        (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
+      return DAG.getLoad(VT, N->getDebugLoc(), LD1->getChain(),
+                         LD1->getBasePtr(), LD1->getPointerInfo(),
+                         false, false, false, Align);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitBITCAST(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // If the input is a BUILD_VECTOR with all constant elements, fold this now.
+  // Only do this before legalize, since afterward the target may be depending
+  // on the bitconvert.
+  // First check to see if this is all constant.
+  if (!LegalTypes &&
+      N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
+      VT.isVector()) {
+    bool isSimple = true;
+    for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i)
+      if (N0.getOperand(i).getOpcode() != ISD::UNDEF &&
+          N0.getOperand(i).getOpcode() != ISD::Constant &&
+          N0.getOperand(i).getOpcode() != ISD::ConstantFP) {
+        isSimple = false;
+        break;
+      }
+
+    EVT DestEltVT = N->getValueType(0).getVectorElementType();
+    assert(!DestEltVT.isVector() &&
+           "Element type of vector ValueType must not be vector!");
+    if (isSimple)
+      return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), DestEltVT);
+  }
+
+  // If the input is a constant, let getNode fold it.
+  if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
+    SDValue Res = DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, N0);
+    if (Res.getNode() != N) {
+      if (!LegalOperations ||
+          TLI.isOperationLegal(Res.getNode()->getOpcode(), VT))
+        return Res;
+
+      // Folding it resulted in an illegal node, and it's too late to
+      // do that. Clean up the old node and forego the transformation.
+      // Ideally this won't happen very often, because instcombine
+      // and the earlier dagcombine runs (where illegal nodes are
+      // permitted) should have folded most of them already.
+      DAG.DeleteNode(Res.getNode());
+    }
+  }
+
+  // (conv (conv x, t1), t2) -> (conv x, t2)
+  if (N0.getOpcode() == ISD::BITCAST)
+    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT,
+                       N0.getOperand(0));
+
+  // fold (conv (load x)) -> (load (conv*)x)
+  // If the resultant load doesn't need a higher alignment than the original!
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+      // Do not change the width of a volatile load.
+      !cast<LoadSDNode>(N0)->isVolatile() &&
+      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    unsigned Align = TLI.getDataLayout()->
+      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+    unsigned OrigAlign = LN0->getAlignment();
+
+    if (Align <= OrigAlign) {
+      SDValue Load = DAG.getLoad(VT, N->getDebugLoc(), LN0->getChain(),
+                                 LN0->getBasePtr(), LN0->getPointerInfo(),
+                                 LN0->isVolatile(), LN0->isNonTemporal(),
+                                 LN0->isInvariant(), OrigAlign);
+      AddToWorkList(N);
+      CombineTo(N0.getNode(),
+                DAG.getNode(ISD::BITCAST, N0.getDebugLoc(),
+                            N0.getValueType(), Load),
+                Load.getValue(1));
+      return Load;
+    }
+  }
+
+  // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
+  // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
+  // This often reduces constant pool loads.
+  if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(VT)) ||
+       (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(VT))) &&
+      N0.getNode()->hasOneUse() && VT.isInteger() &&
+      !VT.isVector() && !N0.getValueType().isVector()) {
+    SDValue NewConv = DAG.getNode(ISD::BITCAST, N0.getDebugLoc(), VT,
+                                  N0.getOperand(0));
+    AddToWorkList(NewConv.getNode());
+
+    APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+    if (N0.getOpcode() == ISD::FNEG)
+      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+                         NewConv, DAG.getConstant(SignBit, VT));
+    assert(N0.getOpcode() == ISD::FABS);
+    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                       NewConv, DAG.getConstant(~SignBit, VT));
+  }
+
+  // fold (bitconvert (fcopysign cst, x)) ->
+  //         (or (and (bitconvert x), sign), (and cst, (not sign)))
+  // Note that we don't handle (copysign x, cst) because this can always be
+  // folded to an fneg or fabs.
+  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
+      isa<ConstantFPSDNode>(N0.getOperand(0)) &&
+      VT.isInteger() && !VT.isVector()) {
+    unsigned OrigXWidth = N0.getOperand(1).getValueType().getSizeInBits();
+    EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
+    if (isTypeLegal(IntXVT)) {
+      SDValue X = DAG.getNode(ISD::BITCAST, N0.getDebugLoc(),
+                              IntXVT, N0.getOperand(1));
+      AddToWorkList(X.getNode());
+
+      // If X has a different width than the result/lhs, sext it or truncate it.
+      unsigned VTWidth = VT.getSizeInBits();
+      if (OrigXWidth < VTWidth) {
+        X = DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, X);
+        AddToWorkList(X.getNode());
+      } else if (OrigXWidth > VTWidth) {
+        // To get the sign bit in the right place, we have to shift it right
+        // before truncating.
+        X = DAG.getNode(ISD::SRL, X.getDebugLoc(),
+                        X.getValueType(), X,
+                        DAG.getConstant(OrigXWidth-VTWidth, X.getValueType()));
+        AddToWorkList(X.getNode());
+        X = DAG.getNode(ISD::TRUNCATE, X.getDebugLoc(), VT, X);
+        AddToWorkList(X.getNode());
+      }
+
+      APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+      X = DAG.getNode(ISD::AND, X.getDebugLoc(), VT,
+                      X, DAG.getConstant(SignBit, VT));
+      AddToWorkList(X.getNode());
+
+      SDValue Cst = DAG.getNode(ISD::BITCAST, N0.getDebugLoc(),
+                                VT, N0.getOperand(0));
+      Cst = DAG.getNode(ISD::AND, Cst.getDebugLoc(), VT,
+                        Cst, DAG.getConstant(~SignBit, VT));
+      AddToWorkList(Cst.getNode());
+
+      return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, X, Cst);
+    }
+  }
+
+  // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
+  if (N0.getOpcode() == ISD::BUILD_PAIR) {
+    SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT);
+    if (CombineLD.getNode())
+      return CombineLD;
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  return CombineConsecutiveLoads(N, VT);
+}
+
+/// ConstantFoldBITCASTofBUILD_VECTOR - We know that BV is a build_vector
+/// node with Constant, ConstantFP or Undef operands.  DstEltVT indicates the
+/// destination element value type.
+SDValue DAGCombiner::
+ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
+  EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
+
+  // If this is already the right type, we're done.
+  if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
+
+  unsigned SrcBitSize = SrcEltVT.getSizeInBits();
+  unsigned DstBitSize = DstEltVT.getSizeInBits();
+
+  // If this is a conversion of N elements of one type to N elements of another
+  // type, convert each element.  This handles FP<->INT cases.
+  if (SrcBitSize == DstBitSize) {
+    EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
+                              BV->getValueType(0).getVectorNumElements());
+
+    // Due to the FP element handling below calling this routine recursively,
+    // we can end up with a scalar-to-vector node here.
+    if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return DAG.getNode(ISD::SCALAR_TO_VECTOR, BV->getDebugLoc(), VT,
+                         DAG.getNode(ISD::BITCAST, BV->getDebugLoc(),
+                                     DstEltVT, BV->getOperand(0)));
+
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+      SDValue Op = BV->getOperand(i);
+      // If the vector element type is not legal, the BUILD_VECTOR operands
+      // are promoted and implicitly truncated.  Make that explicit here.
+      if (Op.getValueType() != SrcEltVT)
+        Op = DAG.getNode(ISD::TRUNCATE, BV->getDebugLoc(), SrcEltVT, Op);
+      Ops.push_back(DAG.getNode(ISD::BITCAST, BV->getDebugLoc(),
+                                DstEltVT, Op));
+      AddToWorkList(Ops.back().getNode());
+    }
+    return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+                       &Ops[0], Ops.size());
+  }
+
+  // Otherwise, we're growing or shrinking the elements.  To avoid having to
+  // handle annoying details of growing/shrinking FP values, we convert them to
+  // int first.
+  if (SrcEltVT.isFloatingPoint()) {
+    // Convert the input float vector to a int vector where the elements are the
+    // same sizes.
+    assert((SrcEltVT == MVT::f32 || SrcEltVT == MVT::f64) && "Unknown FP VT!");
+    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
+    BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
+    SrcEltVT = IntVT;
+  }
+
+  // Now we know the input is an integer vector.  If the output is a FP type,
+  // convert to integer first, then to FP of the right size.
+  if (DstEltVT.isFloatingPoint()) {
+    assert((DstEltVT == MVT::f32 || DstEltVT == MVT::f64) && "Unknown FP VT!");
+    EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
+    SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
+
+    // Next, convert to FP elements of the same size.
+    return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
+  }
+
+  // Okay, we know the src/dst types are both integers of differing types.
+  // Handling growing first.
+  assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
+  if (SrcBitSize < DstBitSize) {
+    unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;
+
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0, e = BV->getNumOperands(); i != e;
+         i += NumInputsPerOutput) {
+      bool isLE = TLI.isLittleEndian();
+      APInt NewBits = APInt(DstBitSize, 0);
+      bool EltIsUndef = true;
+      for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
+        // Shift the previously computed bits over.
+        NewBits <<= SrcBitSize;
+        SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
+        if (Op.getOpcode() == ISD::UNDEF) continue;
+        EltIsUndef = false;
+
+        NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue().
+                   zextOrTrunc(SrcBitSize).zext(DstBitSize);
+      }
+
+      if (EltIsUndef)
+        Ops.push_back(DAG.getUNDEF(DstEltVT));
+      else
+        Ops.push_back(DAG.getConstant(NewBits, DstEltVT));
+    }
+
+    EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
+    return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+                       &Ops[0], Ops.size());
+  }
+
+  // Finally, this must be the case where we are shrinking elements: each input
+  // turns into multiple outputs.
+  bool isS2V = ISD::isScalarToVector(BV);
+  unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
+  EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
+                            NumOutputsPerInput*BV->getNumOperands());
+  SmallVector<SDValue, 8> Ops;
+
+  for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+    if (BV->getOperand(i).getOpcode() == ISD::UNDEF) {
+      for (unsigned j = 0; j != NumOutputsPerInput; ++j)
+        Ops.push_back(DAG.getUNDEF(DstEltVT));
+      continue;
+    }
+
+    APInt OpVal = cast<ConstantSDNode>(BV->getOperand(i))->
+                  getAPIntValue().zextOrTrunc(SrcBitSize);
+
+    for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
+      APInt ThisVal = OpVal.trunc(DstBitSize);
+      Ops.push_back(DAG.getConstant(ThisVal, DstEltVT));
+      if (isS2V && i == 0 && j == 0 && ThisVal.zext(SrcBitSize) == OpVal)
+        // Simply turn this into a SCALAR_TO_VECTOR of the new type.
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, BV->getDebugLoc(), VT,
+                           Ops[0]);
+      OpVal = OpVal.lshr(DstBitSize);
+    }
+
+    // For big endian targets, swap the order of the pieces of each element.
+    if (TLI.isBigEndian())
+      std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+                     &Ops[0], Ops.size());
+}
+
+SDValue DAGCombiner::visitFADD(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (fadd c1, c2) -> c1 + c2
+  if (N0CFP && N1CFP)
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N1);
+  // canonicalize constant to RHS
+  if (N0CFP && !N1CFP)
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N0);
+  // fold (fadd A, 0) -> A
+  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+      N1CFP->getValueAPF().isZero())
+    return N0;
+  // fold (fadd A, (fneg B)) -> (fsub A, B)
+  if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
+    isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
+    return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0,
+                       GetNegatedExpression(N1, DAG, LegalOperations));
+  // fold (fadd (fneg A), B) -> (fsub B, A)
+  if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
+    isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
+    return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N1,
+                       GetNegatedExpression(N0, DAG, LegalOperations));
+
+  // If allowed, fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
+  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+      N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() &&
+      isa<ConstantFPSDNode>(N0.getOperand(1)))
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                   N0.getOperand(1), N1));
+
+  // No FP constant should be created after legalization as Instruction
+  // Selection pass has hard time in dealing with FP constant.
+  //
+  // We don't need test this condition for transformation like following, as
+  // the DAG being transformed implies it is legal to take FP constant as
+  // operand.
+  // 
+  //  (fadd (fmul c, x), x) -> (fmul c+1, x)
+  // 
+  bool AllowNewFpConst = (Level < AfterLegalizeDAG);
+
+  // If allow, fold (fadd (fneg x), x) -> 0.0
+  if (AllowNewFpConst && DAG.getTarget().Options.UnsafeFPMath &&
+      N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) {
+    return DAG.getConstantFP(0.0, VT);
+  }
+
+    // If allow, fold (fadd x, (fneg x)) -> 0.0
+  if (AllowNewFpConst && DAG.getTarget().Options.UnsafeFPMath &&
+      N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0) {
+    return DAG.getConstantFP(0.0, VT);
+  }
+
+  // In unsafe math mode, we can fold chains of FADD's of the same value
+  // into multiplications.  This transform is not safe in general because
+  // we are reducing the number of rounding steps.
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      TLI.isOperationLegalOrCustom(ISD::FMUL, VT) &&
+      !N0CFP && !N1CFP) {
+    if (N0.getOpcode() == ISD::FMUL) {
+      ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
+      ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+
+      // (fadd (fmul c, x), x) -> (fmul c+1, x)
+      if (CFP00 && !CFP01 && N0.getOperand(1) == N1) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                     SDValue(CFP00, 0),
+                                     DAG.getConstantFP(1.0, VT));
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N1, NewCFP);
+      }
+
+      // (fadd (fmul x, c), x) -> (fmul c+1, x)
+      if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                     SDValue(CFP01, 0),
+                                     DAG.getConstantFP(1.0, VT));
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N1, NewCFP);
+      }
+
+      // (fadd (fmul c, x), (fadd x, x)) -> (fmul c+2, x)
+      if (CFP00 && !CFP01 && N1.getOpcode() == ISD::FADD &&
+          N1.getOperand(0) == N1.getOperand(1) &&
+          N0.getOperand(1) == N1.getOperand(0)) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                     SDValue(CFP00, 0),
+                                     DAG.getConstantFP(2.0, VT));
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N0.getOperand(1), NewCFP);
+      }
+
+      // (fadd (fmul x, c), (fadd x, x)) -> (fmul c+2, x)
+      if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
+          N1.getOperand(0) == N1.getOperand(1) &&
+          N0.getOperand(0) == N1.getOperand(0)) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                     SDValue(CFP01, 0),
+                                     DAG.getConstantFP(2.0, VT));
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N0.getOperand(0), NewCFP);
+      }
+    }
+
+    if (N1.getOpcode() == ISD::FMUL) {
+      ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
+      ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1));
+
+      // (fadd x, (fmul c, x)) -> (fmul c+1, x)
+      if (CFP10 && !CFP11 && N1.getOperand(1) == N0) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                     SDValue(CFP10, 0),
+                                     DAG.getConstantFP(1.0, VT));
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N0, NewCFP);
+      }
+
+      // (fadd x, (fmul x, c)) -> (fmul c+1, x)
+      if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                     SDValue(CFP11, 0),
+                                     DAG.getConstantFP(1.0, VT));
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N0, NewCFP);
+      }
+
+
+      // (fadd (fadd x, x), (fmul c, x)) -> (fmul c+2, x)
+      if (CFP10 && !CFP11 && N1.getOpcode() == ISD::FADD &&
+          N1.getOperand(0) == N1.getOperand(1) &&
+          N0.getOperand(1) == N1.getOperand(0)) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                     SDValue(CFP10, 0),
+                                     DAG.getConstantFP(2.0, VT));
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N0.getOperand(1), NewCFP);
+      }
+
+      // (fadd (fadd x, x), (fmul x, c)) -> (fmul c+2, x)
+      if (CFP11 && !CFP10 && N1.getOpcode() == ISD::FADD &&
+          N1.getOperand(0) == N1.getOperand(1) &&
+          N0.getOperand(0) == N1.getOperand(0)) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+                                     SDValue(CFP11, 0),
+                                     DAG.getConstantFP(2.0, VT));
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N0.getOperand(0), NewCFP);
+      }
+    }
+
+    if (N0.getOpcode() == ISD::FADD && AllowNewFpConst) {
+      ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
+      // (fadd (fadd x, x), x) -> (fmul 3.0, x)
+      if (!CFP && N0.getOperand(0) == N0.getOperand(1) &&
+          (N0.getOperand(0) == N1)) {
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N1, DAG.getConstantFP(3.0, VT));
+      }
+    }
+
+    if (N1.getOpcode() == ISD::FADD && AllowNewFpConst) {
+      ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
+      // (fadd x, (fadd x, x)) -> (fmul 3.0, x)
+      if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
+          N1.getOperand(0) == N0) {
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           N0, DAG.getConstantFP(3.0, VT));
+      }
+    }
+
+    // (fadd (fadd x, x), (fadd x, x)) -> (fmul 4.0, x)
+    if (AllowNewFpConst &&
+        N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
+        N0.getOperand(0) == N0.getOperand(1) &&
+        N1.getOperand(0) == N1.getOperand(1) &&
+        N0.getOperand(0) == N1.getOperand(0)) {
+      return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                         N0.getOperand(0),
+                         DAG.getConstantFP(4.0, VT));
+    }
+  }
+
+  // FADD -> FMA combines:
+  if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
+       DAG.getTarget().Options.UnsafeFPMath) &&
+      DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+
+    // fold (fadd (fmul x, y), z) -> (fma x, y, z)
+    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) {
+      return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
+                         N0.getOperand(0), N0.getOperand(1), N1);
+    }
+
+    // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
+    // Note: Commutes FADD operands.
+    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
+      return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
+                         N1.getOperand(0), N1.getOperand(1), N0);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFSUB(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (fsub c1, c2) -> c1-c2
+  if (N0CFP && N1CFP)
+    return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0, N1);
+  // fold (fsub A, 0) -> A
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      N1CFP && N1CFP->getValueAPF().isZero())
+    return N0;
+  // fold (fsub 0, B) -> -B
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      N0CFP && N0CFP->getValueAPF().isZero()) {
+    if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options))
+      return GetNegatedExpression(N1, DAG, LegalOperations);
+    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+      return DAG.getNode(ISD::FNEG, dl, VT, N1);
+  }
+  // fold (fsub A, (fneg B)) -> (fadd A, B)
+  if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options))
+    return DAG.getNode(ISD::FADD, dl, VT, N0,
+                       GetNegatedExpression(N1, DAG, LegalOperations));
+
+  // If 'unsafe math' is enabled, fold
+  //    (fsub x, x) -> 0.0 &
+  //    (fsub x, (fadd x, y)) -> (fneg y) &
+  //    (fsub x, (fadd y, x)) -> (fneg y)
+  if (DAG.getTarget().Options.UnsafeFPMath) {
+    if (N0 == N1)
+      return DAG.getConstantFP(0.0f, VT);
+
+    if (N1.getOpcode() == ISD::FADD) {
+      SDValue N10 = N1->getOperand(0);
+      SDValue N11 = N1->getOperand(1);
+
+      if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI,
+                                          &DAG.getTarget().Options))
+        return GetNegatedExpression(N11, DAG, LegalOperations);
+      else if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI,
+                                               &DAG.getTarget().Options))
+        return GetNegatedExpression(N10, DAG, LegalOperations);
+    }
+  }
+
+  // FSUB -> FMA combines:
+  if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
+       DAG.getTarget().Options.UnsafeFPMath) &&
+      DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
+      TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+
+    // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) {
+      return DAG.getNode(ISD::FMA, dl, VT,
+                         N0.getOperand(0), N0.getOperand(1),
+                         DAG.getNode(ISD::FNEG, dl, VT, N1));
+    }
+
+    // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+    // Note: Commutes FSUB operands.
+    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
+      return DAG.getNode(ISD::FMA, dl, VT,
+                         DAG.getNode(ISD::FNEG, dl, VT,
+                         N1.getOperand(0)),
+                         N1.getOperand(1), N0);
+    }
+
+    // fold (fsub (-(fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
+    if (N0.getOpcode() == ISD::FNEG && 
+        N0.getOperand(0).getOpcode() == ISD::FMUL &&
+        N0->hasOneUse() && N0.getOperand(0).hasOneUse()) {
+      SDValue N00 = N0.getOperand(0).getOperand(0);
+      SDValue N01 = N0.getOperand(0).getOperand(1);
+      return DAG.getNode(ISD::FMA, dl, VT,
+                         DAG.getNode(ISD::FNEG, dl, VT, N00), N01,
+                         DAG.getNode(ISD::FNEG, dl, VT, N1));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFMUL(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (fmul c1, c2) -> c1*c2
+  if (N0CFP && N1CFP)
+    return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0, N1);
+  // canonicalize constant to RHS
+  if (N0CFP && !N1CFP)
+    return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N1, N0);
+  // fold (fmul A, 0) -> 0
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      N1CFP && N1CFP->getValueAPF().isZero())
+    return N1;
+  // fold (fmul A, 0) -> 0, vector edition.
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      ISD::isBuildVectorAllZeros(N1.getNode()))
+    return N1;
+  // fold (fmul A, 1.0) -> A
+  if (N1CFP && N1CFP->isExactlyValue(1.0))
+    return N0;
+  // fold (fmul X, 2.0) -> (fadd X, X)
+  if (N1CFP && N1CFP->isExactlyValue(+2.0))
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N0);
+  // fold (fmul X, -1.0) -> (fneg X)
+  if (N1CFP && N1CFP->isExactlyValue(-1.0))
+    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+      return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N0);
+
+  // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
+  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI,
+                                       &DAG.getTarget().Options)) {
+    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, 
+                                         &DAG.getTarget().Options)) {
+      // Both can be negated for free, check to see if at least one is cheaper
+      // negated.
+      if (LHSNeg == 2 || RHSNeg == 2)
+        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                           GetNegatedExpression(N0, DAG, LegalOperations),
+                           GetNegatedExpression(N1, DAG, LegalOperations));
+    }
+  }
+
+  // If allowed, fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      N1CFP && N0.getOpcode() == ISD::FMUL &&
+      N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(1)))
+    return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                                   N0.getOperand(1), N1));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFMA(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  if (DAG.getTarget().Options.UnsafeFPMath) {
+    if (N0CFP && N0CFP->isZero())
+      return N2;
+    if (N1CFP && N1CFP->isZero())
+      return N2;
+  }
+  if (N0CFP && N0CFP->isExactlyValue(1.0))
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N2);
+  if (N1CFP && N1CFP->isExactlyValue(1.0))
+    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N2);
+
+  // Canonicalize (fma c, x, y) -> (fma x, c, y)
+  if (N0CFP && !N1CFP)
+    return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, N1, N0, N2);
+
+  // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
+  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+      N2.getOpcode() == ISD::FMUL &&
+      N0 == N2.getOperand(0) &&
+      N2.getOperand(1).getOpcode() == ISD::ConstantFP) {
+    return DAG.getNode(ISD::FMUL, dl, VT, N0,
+                       DAG.getNode(ISD::FADD, dl, VT, N1, N2.getOperand(1)));
+  }
+
+
+  // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      N0.getOpcode() == ISD::FMUL && N1CFP &&
+      N0.getOperand(1).getOpcode() == ISD::ConstantFP) {
+    return DAG.getNode(ISD::FMA, dl, VT,
+                       N0.getOperand(0),
+                       DAG.getNode(ISD::FMUL, dl, VT, N1, N0.getOperand(1)),
+                       N2);
+  }
+
+  // (fma x, 1, y) -> (fadd x, y)
+  // (fma x, -1, y) -> (fadd (fneg x), y)
+  if (N1CFP) {
+    if (N1CFP->isExactlyValue(1.0))
+      return DAG.getNode(ISD::FADD, dl, VT, N0, N2);
+
+    if (N1CFP->isExactlyValue(-1.0) &&
+        (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
+      SDValue RHSNeg = DAG.getNode(ISD::FNEG, dl, VT, N0);
+      AddToWorkList(RHSNeg.getNode());
+      return DAG.getNode(ISD::FADD, dl, VT, N2, RHSNeg);
+    }
+  }
+
+  // (fma x, c, x) -> (fmul x, (c+1))
+  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && N0 == N2) {
+    return DAG.getNode(ISD::FMUL, dl, VT,
+                       N0,
+                       DAG.getNode(ISD::FADD, dl, VT,
+                                   N1, DAG.getConstantFP(1.0, VT)));
+  }
+
+  // (fma x, c, (fneg x)) -> (fmul x, (c-1))
+  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+      N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
+    return DAG.getNode(ISD::FMUL, dl, VT,
+                       N0,
+                       DAG.getNode(ISD::FADD, dl, VT,
+                                   N1, DAG.getConstantFP(-1.0, VT)));
+  }
+
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFDIV(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (fdiv c1, c2) -> c1/c2
+  if (N0CFP && N1CFP)
+    return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT, N0, N1);
+
+  // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
+  if (N1CFP && DAG.getTarget().Options.UnsafeFPMath) {
+    // Compute the reciprocal 1.0 / c2.
+    APFloat N1APF = N1CFP->getValueAPF();
+    APFloat Recip(N1APF.getSemantics(), 1); // 1.0
+    APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
+    // Only do the transform if the reciprocal is a legal fp immediate that
+    // isn't too nasty (eg NaN, denormal, ...).
+    if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
+        (!LegalOperations ||
+         // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
+         // backend)... we should handle this gracefully after Legalize.
+         // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) ||
+         TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) ||
+         TLI.isFPImmLegal(Recip, VT)))
+      return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0,
+                         DAG.getConstantFP(Recip, VT));
+  }
+
+  // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
+  if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI,
+                                       &DAG.getTarget().Options)) {
+    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI,
+                                         &DAG.getTarget().Options)) {
+      // Both can be negated for free, check to see if at least one is cheaper
+      // negated.
+      if (LHSNeg == 2 || RHSNeg == 2)
+        return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT,
+                           GetNegatedExpression(N0, DAG, LegalOperations),
+                           GetNegatedExpression(N1, DAG, LegalOperations));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFREM(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  // fold (frem c1, c2) -> fmod(c1,c2)
+  if (N0CFP && N1CFP)
+    return DAG.getNode(ISD::FREM, N->getDebugLoc(), VT, N0, N1);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  EVT VT = N->getValueType(0);
+
+  if (N0CFP && N1CFP)  // Constant fold
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT, N0, N1);
+
+  if (N1CFP) {
+    const APFloat& V = N1CFP->getValueAPF();
+    // copysign(x, c1) -> fabs(x)       iff ispos(c1)
+    // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
+    if (!V.isNegative()) {
+      if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
+        return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+    } else {
+      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+        return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT,
+                           DAG.getNode(ISD::FABS, N0.getDebugLoc(), VT, N0));
+    }
+  }
+
+  // copysign(fabs(x), y) -> copysign(x, y)
+  // copysign(fneg(x), y) -> copysign(x, y)
+  // copysign(copysign(x,z), y) -> copysign(x, y)
+  if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
+      N0.getOpcode() == ISD::FCOPYSIGN)
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+                       N0.getOperand(0), N1);
+
+  // copysign(x, abs(y)) -> abs(x)
+  if (N1.getOpcode() == ISD::FABS)
+    return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+
+  // copysign(x, copysign(y,z)) -> copysign(x, z)
+  if (N1.getOpcode() == ISD::FCOPYSIGN)
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+                       N0, N1.getOperand(1));
+
+  // copysign(x, fp_extend(y)) -> copysign(x, y)
+  // copysign(x, fp_round(y)) -> copysign(x, y)
+  if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND)
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+                       N0, N1.getOperand(0));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  EVT VT = N->getValueType(0);
+  EVT OpVT = N0.getValueType();
+
+  // fold (sint_to_fp c1) -> c1fp
+  if (N0C &&
+      // ...but only if the target supports immediate floating-point values
+      (!LegalOperations ||
+       TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
+    return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
+
+  // If the input is a legal type, and SINT_TO_FP is not legal on this target,
+  // but UINT_TO_FP is legal on this target, try to convert.
+  if (!TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT) &&
+      TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT)) {
+    // If the sign bit is known to be zero, we can change this to UINT_TO_FP.
+    if (DAG.SignBitIsZero(N0))
+      return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
+  }
+
+  // The next optimizations are desireable only if SELECT_CC can be lowered.
+  // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+  // having to say they don't support SELECT_CC on every type the DAG knows
+  // about, since there is no way to mark an opcode illegal at all value types
+  // (See also visitSELECT)
+  if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+    // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc)
+    if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
+        !VT.isVector() &&
+        (!LegalOperations ||
+         TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+      SDValue Ops[] =
+        { N0.getOperand(0), N0.getOperand(1),
+          DAG.getConstantFP(-1.0, VT) , DAG.getConstantFP(0.0, VT),
+          N0.getOperand(2) };
+      return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+    }
+
+    // fold (sint_to_fp (zext (setcc x, y, cc))) ->
+    //      (select_cc x, y, 1.0, 0.0,, cc)
+    if (N0.getOpcode() == ISD::ZERO_EXTEND &&
+        N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() &&
+        (!LegalOperations ||
+         TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+      SDValue Ops[] =
+        { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
+          DAG.getConstantFP(1.0, VT) , DAG.getConstantFP(0.0, VT),
+          N0.getOperand(0).getOperand(2) };
+      return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  EVT VT = N->getValueType(0);
+  EVT OpVT = N0.getValueType();
+
+  // fold (uint_to_fp c1) -> c1fp
+  if (N0C &&
+      // ...but only if the target supports immediate floating-point values
+      (!LegalOperations ||
+       TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
+    return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
+
+  // If the input is a legal type, and UINT_TO_FP is not legal on this target,
+  // but SINT_TO_FP is legal on this target, try to convert.
+  if (!TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT) &&
+      TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT)) {
+    // If the sign bit is known to be zero, we can change this to SINT_TO_FP.
+    if (DAG.SignBitIsZero(N0))
+      return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
+  }
+
+  // The next optimizations are desireable only if SELECT_CC can be lowered.
+  // Check against MVT::Other for SELECT_CC, which is a workaround for targets
+  // having to say they don't support SELECT_CC on every type the DAG knows
+  // about, since there is no way to mark an opcode illegal at all value types
+  // (See also visitSELECT)
+  if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) {
+    // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc)
+
+    if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
+        (!LegalOperations ||
+         TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+      SDValue Ops[] =
+        { N0.getOperand(0), N0.getOperand(1),
+          DAG.getConstantFP(1.0, VT),  DAG.getConstantFP(0.0, VT),
+          N0.getOperand(2) };
+      return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (fp_to_sint c1fp) -> c1
+  if (N0CFP)
+    return DAG.getNode(ISD::FP_TO_SINT, N->getDebugLoc(), VT, N0);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (fp_to_uint c1fp) -> c1
+  if (N0CFP)
+    return DAG.getNode(ISD::FP_TO_UINT, N->getDebugLoc(), VT, N0);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (fp_round c1fp) -> c1fp
+  if (N0CFP)
+    return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT, N0, N1);
+
+  // fold (fp_round (fp_extend x)) -> x
+  if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
+    return N0.getOperand(0);
+
+  // fold (fp_round (fp_round x)) -> (fp_round x)
+  if (N0.getOpcode() == ISD::FP_ROUND) {
+    // This is a value preserving truncation if both round's are.
+    bool IsTrunc = N->getConstantOperandVal(1) == 1 &&
+                   N0.getNode()->getConstantOperandVal(1) == 1;
+    return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT, N0.getOperand(0),
+                       DAG.getIntPtrConstant(IsTrunc));
+  }
+
+  // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
+  if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
+    SDValue Tmp = DAG.getNode(ISD::FP_ROUND, N0.getDebugLoc(), VT,
+                              N0.getOperand(0), N1);
+    AddToWorkList(Tmp.getNode());
+    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+                       Tmp, N0.getOperand(1));
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+
+  // fold (fp_round_inreg c1fp) -> c1fp
+  if (N0CFP && isTypeLegal(EVT)) {
+    SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), EVT);
+    return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, Round);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
+  if (N->hasOneUse() &&
+      N->use_begin()->getOpcode() == ISD::FP_ROUND)
+    return SDValue();
+
+  // fold (fp_extend c1fp) -> c1fp
+  if (N0CFP)
+    return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, N0);
+
+  // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
+  // value of X.
+  if (N0.getOpcode() == ISD::FP_ROUND
+      && N0.getNode()->getConstantOperandVal(1) == 1) {
+    SDValue In = N0.getOperand(0);
+    if (In.getValueType() == VT) return In;
+    if (VT.bitsLT(In.getValueType()))
+      return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT,
+                         In, N0.getOperand(1));
+    return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, In);
+  }
+
+  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
+  if (ISD::isNON_EXTLoad(N0.getNode()) && N0.hasOneUse() &&
+      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT,
+                                     LN0->getChain(),
+                                     LN0->getBasePtr(), LN0->getPointerInfo(),
+                                     N0.getValueType(),
+                                     LN0->isVolatile(), LN0->isNonTemporal(),
+                                     LN0->getAlignment());
+    CombineTo(N, ExtLoad);
+    CombineTo(N0.getNode(),
+              DAG.getNode(ISD::FP_ROUND, N0.getDebugLoc(),
+                          N0.getValueType(), ExtLoad, DAG.getIntPtrConstant(1)),
+              ExtLoad.getValue(1));
+    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFNEG(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVUnaryOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
+                         &DAG.getTarget().Options))
+    return GetNegatedExpression(N0, DAG, LegalOperations);
+
+  // Transform fneg(bitconvert(x)) -> bitconvert(x^sign) to avoid loading
+  // constant pool values.
+  if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST &&
+      !VT.isVector() &&
+      N0.getNode()->hasOneUse() &&
+      N0.getOperand(0).getValueType().isInteger()) {
+    SDValue Int = N0.getOperand(0);
+    EVT IntVT = Int.getValueType();
+    if (IntVT.isInteger() && !IntVT.isVector()) {
+      Int = DAG.getNode(ISD::XOR, N0.getDebugLoc(), IntVT, Int,
+              DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
+      AddToWorkList(Int.getNode());
+      return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+                         VT, Int);
+    }
+  }
+
+  // (fneg (fmul c, x)) -> (fmul -c, x)
+  if (N0.getOpcode() == ISD::FMUL) {
+    ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+    if (CFP1) {
+      return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+                         N0.getOperand(0),
+                         DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT,
+                                     N0.getOperand(1)));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFCEIL(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (fceil c1) -> fceil(c1)
+  if (N0CFP)
+    return DAG.getNode(ISD::FCEIL, N->getDebugLoc(), VT, N0);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (ftrunc c1) -> ftrunc(c1)
+  if (N0CFP)
+    return DAG.getNode(ISD::FTRUNC, N->getDebugLoc(), VT, N0);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  // fold (ffloor c1) -> ffloor(c1)
+  if (N0CFP)
+    return DAG.getNode(ISD::FFLOOR, N->getDebugLoc(), VT, N0);
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitFABS(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+  EVT VT = N->getValueType(0);
+
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVUnaryOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
+  // fold (fabs c1) -> fabs(c1)
+  if (N0CFP)
+    return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+  // fold (fabs (fabs x)) -> (fabs x)
+  if (N0.getOpcode() == ISD::FABS)
+    return N->getOperand(0);
+  // fold (fabs (fneg x)) -> (fabs x)
+  // fold (fabs (fcopysign x, y)) -> (fabs x)
+  if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
+    return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0.getOperand(0));
+
+  // Transform fabs(bitconvert(x)) -> bitconvert(x&~sign) to avoid loading
+  // constant pool values.
+  if (!TLI.isFAbsFree(VT) && 
+      N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse() &&
+      N0.getOperand(0).getValueType().isInteger() &&
+      !N0.getOperand(0).getValueType().isVector()) {
+    SDValue Int = N0.getOperand(0);
+    EVT IntVT = Int.getValueType();
+    if (IntVT.isInteger() && !IntVT.isVector()) {
+      Int = DAG.getNode(ISD::AND, N0.getDebugLoc(), IntVT, Int,
+             DAG.getConstant(~APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
+      AddToWorkList(Int.getNode());
+      return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+                         N->getValueType(0), Int);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitBRCOND(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+
+  // If N is a constant we could fold this into a fallthrough or unconditional
+  // branch. However that doesn't happen very often in normal code, because
+  // Instcombine/SimplifyCFG should have handled the available opportunities.
+  // If we did this folding here, it would be necessary to update the
+  // MachineBasicBlock CFG, which is awkward.
+
+  // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
+  // on the target.
+  if (N1.getOpcode() == ISD::SETCC &&
+      TLI.isOperationLegalOrCustom(ISD::BR_CC,
+                                   N1.getOperand(0).getValueType())) {
+    return DAG.getNode(ISD::BR_CC, N->getDebugLoc(), MVT::Other,
+                       Chain, N1.getOperand(2),
+                       N1.getOperand(0), N1.getOperand(1), N2);
+  }
+
+  if ((N1.hasOneUse() && N1.getOpcode() == ISD::SRL) ||
+      ((N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) &&
+       (N1.getOperand(0).hasOneUse() &&
+        N1.getOperand(0).getOpcode() == ISD::SRL))) {
+    SDNode *Trunc = 0;
+    if (N1.getOpcode() == ISD::TRUNCATE) {
+      // Look pass the truncate.
+      Trunc = N1.getNode();
+      N1 = N1.getOperand(0);
+    }
+
+    // Match this pattern so that we can generate simpler code:
+    //
+    //   %a = ...
+    //   %b = and i32 %a, 2
+    //   %c = srl i32 %b, 1
+    //   brcond i32 %c ...
+    //
+    // into
+    //
+    //   %a = ...
+    //   %b = and i32 %a, 2
+    //   %c = setcc eq %b, 0
+    //   brcond %c ...
+    //
+    // This applies only when the AND constant value has one bit set and the
+    // SRL constant is equal to the log2 of the AND constant. The back-end is
+    // smart enough to convert the result into a TEST/JMP sequence.
+    SDValue Op0 = N1.getOperand(0);
+    SDValue Op1 = N1.getOperand(1);
+
+    if (Op0.getOpcode() == ISD::AND &&
+        Op1.getOpcode() == ISD::Constant) {
+      SDValue AndOp1 = Op0.getOperand(1);
+
+      if (AndOp1.getOpcode() == ISD::Constant) {
+        const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
+
+        if (AndConst.isPowerOf2() &&
+            cast<ConstantSDNode>(Op1)->getAPIntValue()==AndConst.logBase2()) {
+          SDValue SetCC =
+            DAG.getSetCC(N->getDebugLoc(),
+                         TLI.getSetCCResultType(Op0.getValueType()),
+                         Op0, DAG.getConstant(0, Op0.getValueType()),
+                         ISD::SETNE);
+
+          SDValue NewBRCond = DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+                                          MVT::Other, Chain, SetCC, N2);
+          // Don't add the new BRCond into the worklist or else SimplifySelectCC
+          // will convert it back to (X & C1) >> C2.
+          CombineTo(N, NewBRCond, false);
+          // Truncate is dead.
+          if (Trunc) {
+            removeFromWorkList(Trunc);
+            DAG.DeleteNode(Trunc);
+          }
+          // Replace the uses of SRL with SETCC
+          WorkListRemover DeadNodes(*this);
+          DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
+          removeFromWorkList(N1.getNode());
+          DAG.DeleteNode(N1.getNode());
+          return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+        }
+      }
+    }
+
+    if (Trunc)
+      // Restore N1 if the above transformation doesn't match.
+      N1 = N->getOperand(1);
+  }
+
+  // Transform br(xor(x, y)) -> br(x != y)
+  // Transform br(xor(xor(x,y), 1)) -> br (x == y)
+  if (N1.hasOneUse() && N1.getOpcode() == ISD::XOR) {
+    SDNode *TheXor = N1.getNode();
+    SDValue Op0 = TheXor->getOperand(0);
+    SDValue Op1 = TheXor->getOperand(1);
+    if (Op0.getOpcode() == Op1.getOpcode()) {
+      // Avoid missing important xor optimizations.
+      SDValue Tmp = visitXOR(TheXor);
+      if (Tmp.getNode()) {
+        if (Tmp.getNode() != TheXor) {
+          DEBUG(dbgs() << "\nReplacing.8 ";
+                TheXor->dump(&DAG);
+                dbgs() << "\nWith: ";
+                Tmp.getNode()->dump(&DAG);
+                dbgs() << '\n');
+          WorkListRemover DeadNodes(*this);
+          DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
+          removeFromWorkList(TheXor);
+          DAG.DeleteNode(TheXor);
+          return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+                             MVT::Other, Chain, Tmp, N2);
+        }
+
+        // visitXOR has changed XOR's operands or replaced the XOR completely,
+        // bail out.
+        return SDValue(N, 0);
+      }
+    }
+
+    if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
+      bool Equal = false;
+      if (ConstantSDNode *RHSCI = dyn_cast<ConstantSDNode>(Op0))
+        if (RHSCI->getAPIntValue() == 1 && Op0.hasOneUse() &&
+            Op0.getOpcode() == ISD::XOR) {
+          TheXor = Op0.getNode();
+          Equal = true;
+        }
+
+      EVT SetCCVT = N1.getValueType();
+      if (LegalTypes)
+        SetCCVT = TLI.getSetCCResultType(SetCCVT);
+      SDValue SetCC = DAG.getSetCC(TheXor->getDebugLoc(),
+                                   SetCCVT,
+                                   Op0, Op1,
+                                   Equal ? ISD::SETEQ : ISD::SETNE);
+      // Replace the uses of XOR with SETCC
+      WorkListRemover DeadNodes(*this);
+      DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
+      removeFromWorkList(N1.getNode());
+      DAG.DeleteNode(N1.getNode());
+      return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+                         MVT::Other, Chain, SetCC, N2);
+    }
+  }
+
+  return SDValue();
+}
+
+// Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
+//
+SDValue DAGCombiner::visitBR_CC(SDNode *N) {
+  CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1));
+  SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3);
+
+  // If N is a constant we could fold this into a fallthrough or unconditional
+  // branch. However that doesn't happen very often in normal code, because
+  // Instcombine/SimplifyCFG should have handled the available opportunities.
+  // If we did this folding here, it would be necessary to update the
+  // MachineBasicBlock CFG, which is awkward.
+
+  // Use SimplifySetCC to simplify SETCC's.
+  SDValue Simp = SimplifySetCC(TLI.getSetCCResultType(CondLHS.getValueType()),
+                               CondLHS, CondRHS, CC->get(), N->getDebugLoc(),
+                               false);
+  if (Simp.getNode()) AddToWorkList(Simp.getNode());
+
+  // fold to a simpler setcc
+  if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
+    return DAG.getNode(ISD::BR_CC, N->getDebugLoc(), MVT::Other,
+                       N->getOperand(0), Simp.getOperand(2),
+                       Simp.getOperand(0), Simp.getOperand(1),
+                       N->getOperand(4));
+
+  return SDValue();
+}
+
+/// canFoldInAddressingMode - Return true if 'Use' is a load or a store that
+/// uses N as its base pointer and that N may be folded in the load / store
+/// addressing mode.
+static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
+                                    SelectionDAG &DAG,
+                                    const TargetLowering &TLI) {
+  EVT VT;
+  if (LoadSDNode *LD  = dyn_cast<LoadSDNode>(Use)) {
+    if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
+      return false;
+    VT = Use->getValueType(0);
+  } else if (StoreSDNode *ST  = dyn_cast<StoreSDNode>(Use)) {
+    if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
+      return false;
+    VT = ST->getValue().getValueType();
+  } else
+    return false;
+
+  TargetLowering::AddrMode AM;
+  if (N->getOpcode() == ISD::ADD) {
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (Offset)
+      // [reg +/- imm]
+      AM.BaseOffs = Offset->getSExtValue();
+    else
+      // [reg +/- reg]
+      AM.Scale = 1;
+  } else if (N->getOpcode() == ISD::SUB) {
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (Offset)
+      // [reg +/- imm]
+      AM.BaseOffs = -Offset->getSExtValue();
+    else
+      // [reg +/- reg]
+      AM.Scale = 1;
+  } else
+    return false;
+
+  return TLI.isLegalAddressingMode(AM, VT.getTypeForEVT(*DAG.getContext()));
+}
+
+/// CombineToPreIndexedLoadStore - Try turning a load / store into a
+/// pre-indexed load / store when the base pointer is an add or subtract
+/// and it has other uses besides the load / store. After the
+/// transformation, the new indexed load / store has effectively folded
+/// the add / subtract in and all of its other uses are redirected to the
+/// new load / store.
+bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
+  if (Level < AfterLegalizeDAG)
+    return false;
+
+  bool isLoad = true;
+  SDValue Ptr;
+  EVT VT;
+  if (LoadSDNode *LD  = dyn_cast<LoadSDNode>(N)) {
+    if (LD->isIndexed())
+      return false;
+    VT = LD->getMemoryVT();
+    if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) &&
+        !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT))
+      return false;
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST  = dyn_cast<StoreSDNode>(N)) {
+    if (ST->isIndexed())
+      return false;
+    VT = ST->getMemoryVT();
+    if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) &&
+        !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT))
+      return false;
+    Ptr = ST->getBasePtr();
+    isLoad = false;
+  } else {
+    return false;
+  }
+
+  // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
+  // out.  There is no reason to make this a preinc/predec.
+  if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
+      Ptr.getNode()->hasOneUse())
+    return false;
+
+  // Ask the target to do addressing mode selection.
+  SDValue BasePtr;
+  SDValue Offset;
+  ISD::MemIndexedMode AM = ISD::UNINDEXED;
+  if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
+    return false;
+
+  // Backends without true r+i pre-indexed forms may need to pass a
+  // constant base with a variable offset so that constant coercion
+  // will work with the patterns in canonical form.
+  bool Swapped = false;
+  if (isa<ConstantSDNode>(BasePtr)) {
+    std::swap(BasePtr, Offset);
+    Swapped = true;
+  }
+
+  // Don't create a indexed load / store with zero offset.
+  if (isa<ConstantSDNode>(Offset) &&
+      cast<ConstantSDNode>(Offset)->isNullValue())
+    return false;
+
+  // Try turning it into a pre-indexed load / store except when:
+  // 1) The new base ptr is a frame index.
+  // 2) If N is a store and the new base ptr is either the same as or is a
+  //    predecessor of the value being stored.
+  // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
+  //    that would create a cycle.
+  // 4) All uses are load / store ops that use it as old base ptr.
+
+  // Check #1.  Preinc'ing a frame index would require copying the stack pointer
+  // (plus the implicit offset) to a register to preinc anyway.
+  if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+    return false;
+
+  // Check #2.
+  if (!isLoad) {
+    SDValue Val = cast<StoreSDNode>(N)->getValue();
+    if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode()))
+      return false;
+  }
+
+  // If the offset is a constant, there may be other adds of constants that
+  // can be folded with this one. We should do this to avoid having to keep
+  // a copy of the original base pointer.
+  SmallVector<SDNode *, 16> OtherUses;
+  if (isa<ConstantSDNode>(Offset))
+    for (SDNode::use_iterator I = BasePtr.getNode()->use_begin(),
+         E = BasePtr.getNode()->use_end(); I != E; ++I) {
+      SDNode *Use = *I;
+      if (Use == Ptr.getNode())
+        continue;
+
+      if (Use->isPredecessorOf(N))
+        continue;
+
+      if (Use->getOpcode() != ISD::ADD && Use->getOpcode() != ISD::SUB) {
+        OtherUses.clear();
+        break;
+      }
+
+      SDValue Op0 = Use->getOperand(0), Op1 = Use->getOperand(1);
+      if (Op1.getNode() == BasePtr.getNode())
+        std::swap(Op0, Op1);
+      assert(Op0.getNode() == BasePtr.getNode() &&
+             "Use of ADD/SUB but not an operand");
+
+      if (!isa<ConstantSDNode>(Op1)) {
+        OtherUses.clear();
+        break;
+      }
+
+      // FIXME: In some cases, we can be smarter about this.
+      if (Op1.getValueType() != Offset.getValueType()) {
+        OtherUses.clear();
+        break;
+      }
+
+      OtherUses.push_back(Use);
+    }
+
+  if (Swapped)
+    std::swap(BasePtr, Offset);
+
+  // Now check for #3 and #4.
+  bool RealUse = false;
+
+  // Caches for hasPredecessorHelper
+  SmallPtrSet<const SDNode *, 32> Visited;
+  SmallVector<const SDNode *, 16> Worklist;
+
+  for (SDNode::use_iterator I = Ptr.getNode()->use_begin(),
+         E = Ptr.getNode()->use_end(); I != E; ++I) {
+    SDNode *Use = *I;
+    if (Use == N)
+      continue;
+    if (N->hasPredecessorHelper(Use, Visited, Worklist))
+      return false;
+
+    // If Ptr may be folded in addressing mode of other use, then it's
+    // not profitable to do this transformation.
+    if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
+      RealUse = true;
+  }
+
+  if (!RealUse)
+    return false;
+
+  SDValue Result;
+  if (isLoad)
+    Result = DAG.getIndexedLoad(SDValue(N,0), N->getDebugLoc(),
+                                BasePtr, Offset, AM);
+  else
+    Result = DAG.getIndexedStore(SDValue(N,0), N->getDebugLoc(),
+                                 BasePtr, Offset, AM);
+  ++PreIndexedNodes;
+  ++NodesCombined;
+  DEBUG(dbgs() << "\nReplacing.4 ";
+        N->dump(&DAG);
+        dbgs() << "\nWith: ";
+        Result.getNode()->dump(&DAG);
+        dbgs() << '\n');
+  WorkListRemover DeadNodes(*this);
+  if (isLoad) {
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
+  } else {
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
+  }
+
+  // Finally, since the node is now dead, remove it from the graph.
+  DAG.DeleteNode(N);
+
+  if (Swapped)
+    std::swap(BasePtr, Offset);
+
+  // Replace other uses of BasePtr that can be updated to use Ptr
+  for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
+    unsigned OffsetIdx = 1;
+    if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
+      OffsetIdx = 0;
+    assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
+           BasePtr.getNode() && "Expected BasePtr operand");
+
+    APInt OV =
+      cast<ConstantSDNode>(Offset)->getAPIntValue();
+    if (AM == ISD::PRE_DEC)
+      OV = -OV;
+
+    ConstantSDNode *CN =
+      cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
+    APInt CNV = CN->getAPIntValue();
+    if (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1)
+      CNV += OV;
+    else
+      CNV -= OV;
+
+    SDValue NewOp1 = Result.getValue(isLoad ? 1 : 0);
+    SDValue NewOp2 = DAG.getConstant(CNV, CN->getValueType(0));
+    if (OffsetIdx == 0)
+      std::swap(NewOp1, NewOp2);
+
+    SDValue NewUse = DAG.getNode(OtherUses[i]->getOpcode(),
+                                 OtherUses[i]->getDebugLoc(),
+                                 OtherUses[i]->getValueType(0), NewOp1, NewOp2);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
+    removeFromWorkList(OtherUses[i]);
+    DAG.DeleteNode(OtherUses[i]);
+  }
+
+  // Replace the uses of Ptr with uses of the updated base value.
+  DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
+  removeFromWorkList(Ptr.getNode());
+  DAG.DeleteNode(Ptr.getNode());
+
+  return true;
+}
+
+/// CombineToPostIndexedLoadStore - Try to combine a load / store with a
+/// add / sub of the base pointer node into a post-indexed load / store.
+/// The transformation folded the add / subtract into the new indexed
+/// load / store effectively and all of its uses are redirected to the
+/// new load / store.
+bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
+  if (Level < AfterLegalizeDAG)
+    return false;
+
+  bool isLoad = true;
+  SDValue Ptr;
+  EVT VT;
+  if (LoadSDNode *LD  = dyn_cast<LoadSDNode>(N)) {
+    if (LD->isIndexed())
+      return false;
+    VT = LD->getMemoryVT();
+    if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) &&
+        !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT))
+      return false;
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST  = dyn_cast<StoreSDNode>(N)) {
+    if (ST->isIndexed())
+      return false;
+    VT = ST->getMemoryVT();
+    if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) &&
+        !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT))
+      return false;
+    Ptr = ST->getBasePtr();
+    isLoad = false;
+  } else {
+    return false;
+  }
+
+  if (Ptr.getNode()->hasOneUse())
+    return false;
+
+  for (SDNode::use_iterator I = Ptr.getNode()->use_begin(),
+         E = Ptr.getNode()->use_end(); I != E; ++I) {
+    SDNode *Op = *I;
+    if (Op == N ||
+        (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
+      continue;
+
+    SDValue BasePtr;
+    SDValue Offset;
+    ISD::MemIndexedMode AM = ISD::UNINDEXED;
+    if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) {
+      // Don't create a indexed load / store with zero offset.
+      if (isa<ConstantSDNode>(Offset) &&
+          cast<ConstantSDNode>(Offset)->isNullValue())
+        continue;
+
+      // Try turning it into a post-indexed load / store except when
+      // 1) All uses are load / store ops that use it as base ptr (and
+      //    it may be folded as addressing mmode).
+      // 2) Op must be independent of N, i.e. Op is neither a predecessor
+      //    nor a successor of N. Otherwise, if Op is folded that would
+      //    create a cycle.
+
+      if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+        continue;
+
+      // Check for #1.
+      bool TryNext = false;
+      for (SDNode::use_iterator II = BasePtr.getNode()->use_begin(),
+             EE = BasePtr.getNode()->use_end(); II != EE; ++II) {
+        SDNode *Use = *II;
+        if (Use == Ptr.getNode())
+          continue;
+
+        // If all the uses are load / store addresses, then don't do the
+        // transformation.
+        if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
+          bool RealUse = false;
+          for (SDNode::use_iterator III = Use->use_begin(),
+                 EEE = Use->use_end(); III != EEE; ++III) {
+            SDNode *UseUse = *III;
+            if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) 
+              RealUse = true;
+          }
+
+          if (!RealUse) {
+            TryNext = true;
+            break;
+          }
+        }
+      }
+
+      if (TryNext)
+        continue;
+
+      // Check for #2
+      if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) {
+        SDValue Result = isLoad
+          ? DAG.getIndexedLoad(SDValue(N,0), N->getDebugLoc(),
+                               BasePtr, Offset, AM)
+          : DAG.getIndexedStore(SDValue(N,0), N->getDebugLoc(),
+                                BasePtr, Offset, AM);
+        ++PostIndexedNodes;
+        ++NodesCombined;
+        DEBUG(dbgs() << "\nReplacing.5 ";
+              N->dump(&DAG);
+              dbgs() << "\nWith: ";
+              Result.getNode()->dump(&DAG);
+              dbgs() << '\n');
+        WorkListRemover DeadNodes(*this);
+        if (isLoad) {
+          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
+        } else {
+          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
+        }
+
+        // Finally, since the node is now dead, remove it from the graph.
+        DAG.DeleteNode(N);
+
+        // Replace the uses of Use with uses of the updated base value.
+        DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
+                                      Result.getValue(isLoad ? 1 : 0));
+        removeFromWorkList(Op);
+        DAG.DeleteNode(Op);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+SDValue DAGCombiner::visitLOAD(SDNode *N) {
+  LoadSDNode *LD  = cast<LoadSDNode>(N);
+  SDValue Chain = LD->getChain();
+  SDValue Ptr   = LD->getBasePtr();
+
+  // If load is not volatile and there are no uses of the loaded value (and
+  // the updated indexed value in case of indexed loads), change uses of the
+  // chain value into uses of the chain input (i.e. delete the dead load).
+  if (!LD->isVolatile()) {
+    if (N->getValueType(1) == MVT::Other) {
+      // Unindexed loads.
+      if (!N->hasAnyUseOfValue(0)) {
+        // It's not safe to use the two value CombineTo variant here. e.g.
+        // v1, chain2 = load chain1, loc
+        // v2, chain3 = load chain2, loc
+        // v3         = add v2, c
+        // Now we replace use of chain2 with chain1.  This makes the second load
+        // isomorphic to the one we are deleting, and thus makes this load live.
+        DEBUG(dbgs() << "\nReplacing.6 ";
+              N->dump(&DAG);
+              dbgs() << "\nWith chain: ";
+              Chain.getNode()->dump(&DAG);
+              dbgs() << "\n");
+        WorkListRemover DeadNodes(*this);
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+
+        if (N->use_empty()) {
+          removeFromWorkList(N);
+          DAG.DeleteNode(N);
+        }
+
+        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      }
+    } else {
+      // Indexed loads.
+      assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
+      if (!N->hasAnyUseOfValue(0) && !N->hasAnyUseOfValue(1)) {
+        SDValue Undef = DAG.getUNDEF(N->getValueType(0));
+        DEBUG(dbgs() << "\nReplacing.7 ";
+              N->dump(&DAG);
+              dbgs() << "\nWith: ";
+              Undef.getNode()->dump(&DAG);
+              dbgs() << " and 2 other values\n");
+        WorkListRemover DeadNodes(*this);
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1),
+                                      DAG.getUNDEF(N->getValueType(1)));
+        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
+        removeFromWorkList(N);
+        DAG.DeleteNode(N);
+        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
+      }
+    }
+  }
+
+  // If this load is directly stored, replace the load value with the stored
+  // value.
+  // TODO: Handle store large -> read small portion.
+  // TODO: Handle TRUNCSTORE/LOADEXT
+  if (ISD::isNormalLoad(N) && !LD->isVolatile()) {
+    if (ISD::isNON_TRUNCStore(Chain.getNode())) {
+      StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
+      if (PrevST->getBasePtr() == Ptr &&
+          PrevST->getValue().getValueType() == N->getValueType(0))
+      return CombineTo(N, Chain.getOperand(1), Chain);
+    }
+  }
+
+  // Try to infer better alignment information than the load already has.
+  if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
+      if (Align > LD->getMemOperand()->getBaseAlignment()) {
+        SDValue NewLoad =
+               DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(),
+                              LD->getValueType(0),
+                              Chain, Ptr, LD->getPointerInfo(),
+                              LD->getMemoryVT(),
+                              LD->isVolatile(), LD->isNonTemporal(), Align);
+        return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
+      }
+    }
+  }
+
+  if (CombinerAA) {
+    // Walk up chain skipping non-aliasing memory nodes.
+    SDValue BetterChain = FindBetterChain(N, Chain);
+
+    // If there is a better chain.
+    if (Chain != BetterChain) {
+      SDValue ReplLoad;
+
+      // Replace the chain to void dependency.
+      if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
+        ReplLoad = DAG.getLoad(N->getValueType(0), LD->getDebugLoc(),
+                               BetterChain, Ptr, LD->getPointerInfo(),
+                               LD->isVolatile(), LD->isNonTemporal(),
+                               LD->isInvariant(), LD->getAlignment());
+      } else {
+        ReplLoad = DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(),
+                                  LD->getValueType(0),
+                                  BetterChain, Ptr, LD->getPointerInfo(),
+                                  LD->getMemoryVT(),
+                                  LD->isVolatile(),
+                                  LD->isNonTemporal(),
+                                  LD->getAlignment());
+      }
+
+      // Create token factor to keep old chain connected.
+      SDValue Token = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+                                  MVT::Other, Chain, ReplLoad.getValue(1));
+
+      // Make sure the new and old chains are cleaned up.
+      AddToWorkList(Token.getNode());
+
+      // Replace uses with load result and token factor. Don't add users
+      // to work list.
+      return CombineTo(N, ReplLoad.getValue(0), Token, false);
+    }
+  }
+
+  // Try transforming N to an indexed load.
+  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+/// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
+/// load is having specific bytes cleared out.  If so, return the byte size
+/// being masked out and the shift amount.
+static std::pair<unsigned, unsigned>
+CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
+  std::pair<unsigned, unsigned> Result(0, 0);
+
+  // Check for the structure we're looking for.
+  if (V->getOpcode() != ISD::AND ||
+      !isa<ConstantSDNode>(V->getOperand(1)) ||
+      !ISD::isNormalLoad(V->getOperand(0).getNode()))
+    return Result;
+
+  // Check the chain and pointer.
+  LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
+  if (LD->getBasePtr() != Ptr) return Result;  // Not from same pointer.
+
+  // The store should be chained directly to the load or be an operand of a
+  // tokenfactor.
+  if (LD == Chain.getNode())
+    ; // ok.
+  else if (Chain->getOpcode() != ISD::TokenFactor)
+    return Result; // Fail.
+  else {
+    bool isOk = false;
+    for (unsigned i = 0, e = Chain->getNumOperands(); i != e; ++i)
+      if (Chain->getOperand(i).getNode() == LD) {
+        isOk = true;
+        break;
+      }
+    if (!isOk) return Result;
+  }
+
+  // This only handles simple types.
+  if (V.getValueType() != MVT::i16 &&
+      V.getValueType() != MVT::i32 &&
+      V.getValueType() != MVT::i64)
+    return Result;
+
+  // Check the constant mask.  Invert it so that the bits being masked out are
+  // 0 and the bits being kept are 1.  Use getSExtValue so that leading bits
+  // follow the sign bit for uniformity.
+  uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
+  unsigned NotMaskLZ = CountLeadingZeros_64(NotMask);
+  if (NotMaskLZ & 7) return Result;  // Must be multiple of a byte.
+  unsigned NotMaskTZ = CountTrailingZeros_64(NotMask);
+  if (NotMaskTZ & 7) return Result;  // Must be multiple of a byte.
+  if (NotMaskLZ == 64) return Result;  // All zero mask.
+
+  // See if we have a continuous run of bits.  If so, we have 0*1+0*
+  if (CountTrailingOnes_64(NotMask >> NotMaskTZ)+NotMaskTZ+NotMaskLZ != 64)
+    return Result;
+
+  // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
+  if (V.getValueType() != MVT::i64 && NotMaskLZ)
+    NotMaskLZ -= 64-V.getValueSizeInBits();
+
+  unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
+  switch (MaskedBytes) {
+  case 1:
+  case 2:
+  case 4: break;
+  default: return Result; // All one mask, or 5-byte mask.
+  }
+
+  // Verify that the first bit starts at a multiple of mask so that the access
+  // is aligned the same as the access width.
+  if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
+
+  Result.first = MaskedBytes;
+  Result.second = NotMaskTZ/8;
+  return Result;
+}
+
+
+/// ShrinkLoadReplaceStoreWithStore - Check to see if IVal is something that
+/// provides a value as specified by MaskInfo.  If so, replace the specified
+/// store with a narrower store of truncated IVal.
+static SDNode *
+ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
+                                SDValue IVal, StoreSDNode *St,
+                                DAGCombiner *DC) {
+  unsigned NumBytes = MaskInfo.first;
+  unsigned ByteShift = MaskInfo.second;
+  SelectionDAG &DAG = DC->getDAG();
+
+  // Check to see if IVal is all zeros in the part being masked in by the 'or'
+  // that uses this.  If not, this is not a replacement.
+  APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
+                                  ByteShift*8, (ByteShift+NumBytes)*8);
+  if (!DAG.MaskedValueIsZero(IVal, Mask)) return 0;
+
+  // Check that it is legal on the target to do this.  It is legal if the new
+  // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
+  // legalization.
+  MVT VT = MVT::getIntegerVT(NumBytes*8);
+  if (!DC->isTypeLegal(VT))
+    return 0;
+
+  // Okay, we can do this!  Replace the 'St' store with a store of IVal that is
+  // shifted by ByteShift and truncated down to NumBytes.
+  if (ByteShift)
+    IVal = DAG.getNode(ISD::SRL, IVal->getDebugLoc(), IVal.getValueType(), IVal,
+                       DAG.getConstant(ByteShift*8,
+                                    DC->getShiftAmountTy(IVal.getValueType())));
+
+  // Figure out the offset for the store and the alignment of the access.
+  unsigned StOffset;
+  unsigned NewAlign = St->getAlignment();
+
+  if (DAG.getTargetLoweringInfo().isLittleEndian())
+    StOffset = ByteShift;
+  else
+    StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
+
+  SDValue Ptr = St->getBasePtr();
+  if (StOffset) {
+    Ptr = DAG.getNode(ISD::ADD, IVal->getDebugLoc(), Ptr.getValueType(),
+                      Ptr, DAG.getConstant(StOffset, Ptr.getValueType()));
+    NewAlign = MinAlign(NewAlign, StOffset);
+  }
+
+  // Truncate down to the new size.
+  IVal = DAG.getNode(ISD::TRUNCATE, IVal->getDebugLoc(), VT, IVal);
+
+  ++OpsNarrowed;
+  return DAG.getStore(St->getChain(), St->getDebugLoc(), IVal, Ptr,
+                      St->getPointerInfo().getWithOffset(StOffset),
+                      false, false, NewAlign).getNode();
+}
+
+
+/// ReduceLoadOpStoreWidth - Look for sequence of load / op / store where op is
+/// one of 'or', 'xor', and 'and' of immediates. If 'op' is only touching some
+/// of the loaded bits, try narrowing the load and store if it would end up
+/// being a win for performance or code size.
+SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
+  StoreSDNode *ST  = cast<StoreSDNode>(N);
+  if (ST->isVolatile())
+    return SDValue();
+
+  SDValue Chain = ST->getChain();
+  SDValue Value = ST->getValue();
+  SDValue Ptr   = ST->getBasePtr();
+  EVT VT = Value.getValueType();
+
+  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+    return SDValue();
+
+  unsigned Opc = Value.getOpcode();
+
+  // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
+  // is a byte mask indicating a consecutive number of bytes, check to see if
+  // Y is known to provide just those bytes.  If so, we try to replace the
+  // load + replace + store sequence with a single (narrower) store, which makes
+  // the load dead.
+  if (Opc == ISD::OR) {
+    std::pair<unsigned, unsigned> MaskedLoad;
+    MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
+    if (MaskedLoad.first)
+      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+                                                  Value.getOperand(1), ST,this))
+        return SDValue(NewST, 0);
+
+    // Or is commutative, so try swapping X and Y.
+    MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
+    if (MaskedLoad.first)
+      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+                                                  Value.getOperand(0), ST,this))
+        return SDValue(NewST, 0);
+  }
+
+  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
+      Value.getOperand(1).getOpcode() != ISD::Constant)
+    return SDValue();
+
+  SDValue N0 = Value.getOperand(0);
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+      Chain == SDValue(N0.getNode(), 1)) {
+    LoadSDNode *LD = cast<LoadSDNode>(N0);
+    if (LD->getBasePtr() != Ptr ||
+        LD->getPointerInfo().getAddrSpace() !=
+        ST->getPointerInfo().getAddrSpace())
+      return SDValue();
+
+    // Find the type to narrow it the load / op / store to.
+    SDValue N1 = Value.getOperand(1);
+    unsigned BitWidth = N1.getValueSizeInBits();
+    APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
+    if (Opc == ISD::AND)
+      Imm ^= APInt::getAllOnesValue(BitWidth);
+    if (Imm == 0 || Imm.isAllOnesValue())
+      return SDValue();
+    unsigned ShAmt = Imm.countTrailingZeros();
+    unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
+    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+    while (NewBW < BitWidth &&
+           !(TLI.isOperationLegalOrCustom(Opc, NewVT) &&
+             TLI.isNarrowingProfitable(VT, NewVT))) {
+      NewBW = NextPowerOf2(NewBW);
+      NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+    }
+    if (NewBW >= BitWidth)
+      return SDValue();
+
+    // If the lsb changed does not start at the type bitwidth boundary,
+    // start at the previous one.
+    if (ShAmt % NewBW)
+      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
+    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
+                                   std::min(BitWidth, ShAmt + NewBW));
+    if ((Imm & Mask) == Imm) {
+      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
+      if (Opc == ISD::AND)
+        NewImm ^= APInt::getAllOnesValue(NewBW);
+      uint64_t PtrOff = ShAmt / 8;
+      // For big endian targets, we need to adjust the offset to the pointer to
+      // load the correct bytes.
+      if (TLI.isBigEndian())
+        PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+
+      unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
+      Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
+      if (NewAlign < TLI.getDataLayout()->getABITypeAlignment(NewVTTy))
+        return SDValue();
+
+      SDValue NewPtr = DAG.getNode(ISD::ADD, LD->getDebugLoc(),
+                                   Ptr.getValueType(), Ptr,
+                                   DAG.getConstant(PtrOff, Ptr.getValueType()));
+      SDValue NewLD = DAG.getLoad(NewVT, N0.getDebugLoc(),
+                                  LD->getChain(), NewPtr,
+                                  LD->getPointerInfo().getWithOffset(PtrOff),
+                                  LD->isVolatile(), LD->isNonTemporal(),
+                                  LD->isInvariant(), NewAlign);
+      SDValue NewVal = DAG.getNode(Opc, Value.getDebugLoc(), NewVT, NewLD,
+                                   DAG.getConstant(NewImm, NewVT));
+      SDValue NewST = DAG.getStore(Chain, N->getDebugLoc(),
+                                   NewVal, NewPtr,
+                                   ST->getPointerInfo().getWithOffset(PtrOff),
+                                   false, false, NewAlign);
+
+      AddToWorkList(NewPtr.getNode());
+      AddToWorkList(NewLD.getNode());
+      AddToWorkList(NewVal.getNode());
+      WorkListRemover DeadNodes(*this);
+      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
+      ++OpsNarrowed;
+      return NewST;
+    }
+  }
+
+  return SDValue();
+}
+
+/// TransformFPLoadStorePair - For a given floating point load / store pair,
+/// if the load value isn't used by any other operations, then consider
+/// transforming the pair to integer load / store operations if the target
+/// deems the transformation profitable.
+SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
+  StoreSDNode *ST  = cast<StoreSDNode>(N);
+  SDValue Chain = ST->getChain();
+  SDValue Value = ST->getValue();
+  if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) &&
+      Value.hasOneUse() &&
+      Chain == SDValue(Value.getNode(), 1)) {
+    LoadSDNode *LD = cast<LoadSDNode>(Value);
+    EVT VT = LD->getMemoryVT();
+    if (!VT.isFloatingPoint() ||
+        VT != ST->getMemoryVT() ||
+        LD->isNonTemporal() ||
+        ST->isNonTemporal() ||
+        LD->getPointerInfo().getAddrSpace() != 0 ||
+        ST->getPointerInfo().getAddrSpace() != 0)
+      return SDValue();
+
+    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+    if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
+        !TLI.isOperationLegal(ISD::STORE, IntVT) ||
+        !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
+        !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))
+      return SDValue();
+
+    unsigned LDAlign = LD->getAlignment();
+    unsigned STAlign = ST->getAlignment();
+    Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
+    unsigned ABIAlign = TLI.getDataLayout()->getABITypeAlignment(IntVTTy);
+    if (LDAlign < ABIAlign || STAlign < ABIAlign)
+      return SDValue();
+
+    SDValue NewLD = DAG.getLoad(IntVT, Value.getDebugLoc(),
+                                LD->getChain(), LD->getBasePtr(),
+                                LD->getPointerInfo(),
+                                false, false, false, LDAlign);
+
+    SDValue NewST = DAG.getStore(NewLD.getValue(1), N->getDebugLoc(),
+                                 NewLD, ST->getBasePtr(),
+                                 ST->getPointerInfo(),
+                                 false, false, STAlign);
+
+    AddToWorkList(NewLD.getNode());
+    AddToWorkList(NewST.getNode());
+    WorkListRemover DeadNodes(*this);
+    DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
+    ++LdStFP2Int;
+    return NewST;
+  }
+
+  return SDValue();
+}
+
+/// Helper struct to parse and store a memory address as base + index + offset.
+/// We ignore sign extensions when it is safe to do so.
+/// The following two expressions are not equivalent. To differentiate we need
+/// to store whether there was a sign extension involved in the index
+/// computation.
+///  (load (i64 add (i64 copyfromreg %c)
+///                 (i64 signextend (add (i8 load %index)
+///                                      (i8 1))))
+/// vs
+///
+/// (load (i64 add (i64 copyfromreg %c)
+///                (i64 signextend (i32 add (i32 signextend (i8 load %index))
+///                                         (i32 1)))))
+struct BaseIndexOffset {
+  SDValue Base;
+  SDValue Index;
+  int64_t Offset;
+  bool IsIndexSignExt;
+
+  BaseIndexOffset() : Offset(0), IsIndexSignExt(false) {}
+
+  BaseIndexOffset(SDValue Base, SDValue Index, int64_t Offset,
+                  bool IsIndexSignExt) :
+    Base(Base), Index(Index), Offset(Offset), IsIndexSignExt(IsIndexSignExt) {}
+
+  bool equalBaseIndex(const BaseIndexOffset &Other) {
+    return Other.Base == Base && Other.Index == Index &&
+      Other.IsIndexSignExt == IsIndexSignExt;
+  }
+
+  /// Parses tree in Ptr for base, index, offset addresses.
+  static BaseIndexOffset match(SDValue Ptr) {
+    bool IsIndexSignExt = false;
+
+    // Just Base or possibly anything else.
+    if (Ptr->getOpcode() != ISD::ADD)
+      return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+
+    // Base + offset.
+    if (isa<ConstantSDNode>(Ptr->getOperand(1))) {
+      int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
+      return  BaseIndexOffset(Ptr->getOperand(0), SDValue(), Offset,
+                              IsIndexSignExt);
+    }
+
+    // Look at Base + Index + Offset cases.
+    SDValue Base = Ptr->getOperand(0);
+    SDValue IndexOffset = Ptr->getOperand(1);
+
+    // Skip signextends.
+    if (IndexOffset->getOpcode() == ISD::SIGN_EXTEND) {
+      IndexOffset = IndexOffset->getOperand(0);
+      IsIndexSignExt = true;
+    }
+
+    // Either the case of Base + Index (no offset) or something else.
+    if (IndexOffset->getOpcode() != ISD::ADD)
+      return BaseIndexOffset(Base, IndexOffset, 0, IsIndexSignExt);
+
+    // Now we have the case of Base + Index + offset.
+    SDValue Index = IndexOffset->getOperand(0);
+    SDValue Offset = IndexOffset->getOperand(1);
+
+    if (!isa<ConstantSDNode>(Offset))
+      return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+
+    // Ignore signextends.
+    if (Index->getOpcode() == ISD::SIGN_EXTEND) {
+      Index = Index->getOperand(0);
+      IsIndexSignExt = true;
+    } else IsIndexSignExt = false;
+
+    int64_t Off = cast<ConstantSDNode>(Offset)->getSExtValue();
+    return BaseIndexOffset(Base, Index, Off, IsIndexSignExt);
+  }
+};
+
+/// Holds a pointer to an LSBaseSDNode as well as information on where it
+/// is located in a sequence of memory operations connected by a chain.
+struct MemOpLink {
+  MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
+    MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
+  // Ptr to the mem node.
+  LSBaseSDNode *MemNode;
+  // Offset from the base ptr.
+  int64_t OffsetFromBase;
+  // What is the sequence number of this mem node.
+  // Lowest mem operand in the DAG starts at zero.
+  unsigned SequenceNum;
+};
+
+/// Sorts store nodes in a link according to their offset from a shared
+// base ptr.
+struct ConsecutiveMemoryChainSorter {
+  bool operator()(MemOpLink LHS, MemOpLink RHS) {
+    return LHS.OffsetFromBase < RHS.OffsetFromBase;
+  }
+};
+
+bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
+  EVT MemVT = St->getMemoryVT();
+  int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
+  bool NoVectors = DAG.getMachineFunction().getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+
+  // Don't merge vectors into wider inputs.
+  if (MemVT.isVector() || !MemVT.isSimple())
+    return false;
+
+  // Perform an early exit check. Do not bother looking at stored values that
+  // are not constants or loads.
+  SDValue StoredVal = St->getValue();
+  bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
+  if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
+      !IsLoadSrc)
+    return false;
+
+  // Only look at ends of store sequences.
+  SDValue Chain = SDValue(St, 1);
+  if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
+    return false;
+
+  // This holds the base pointer, index, and the offset in bytes from the base
+  // pointer.
+  BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr());
+
+  // We must have a base and an offset.
+  if (!BasePtr.Base.getNode())
+    return false;
+
+  // Do not handle stores to undef base pointers.
+  if (BasePtr.Base.getOpcode() == ISD::UNDEF)
+    return false;
+
+  // Save the LoadSDNodes that we find in the chain.
+  // We need to make sure that these nodes do not interfere with
+  // any of the store nodes.
+  SmallVector<LSBaseSDNode*, 8> AliasLoadNodes;
+
+  // Save the StoreSDNodes that we find in the chain.
+  SmallVector<MemOpLink, 8> StoreNodes;
+
+  // Walk up the chain and look for nodes with offsets from the same
+  // base pointer. Stop when reaching an instruction with a different kind
+  // or instruction which has a different base pointer.
+  unsigned Seq = 0;
+  StoreSDNode *Index = St;
+  while (Index) {
+    // If the chain has more than one use, then we can't reorder the mem ops.
+    if (Index != St && !SDValue(Index, 1)->hasOneUse())
+      break;
+
+    // Find the base pointer and offset for this memory node.
+    BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr());
+
+    // Check that the base pointer is the same as the original one.
+    if (!Ptr.equalBaseIndex(BasePtr))
+      break;
+
+    // Check that the alignment is the same.
+    if (Index->getAlignment() != St->getAlignment())
+      break;
+
+    // The memory operands must not be volatile.
+    if (Index->isVolatile() || Index->isIndexed())
+      break;
+
+    // No truncation.
+    if (StoreSDNode *St = dyn_cast<StoreSDNode>(Index))
+      if (St->isTruncatingStore())
+        break;
+
+    // The stored memory type must be the same.
+    if (Index->getMemoryVT() != MemVT)
+      break;
+
+    // We do not allow unaligned stores because we want to prevent overriding
+    // stores.
+    if (Index->getAlignment()*8 != MemVT.getSizeInBits())
+      break;
+
+    // We found a potential memory operand to merge.
+    StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++));
+
+    // Find the next memory operand in the chain. If the next operand in the
+    // chain is a store then move up and continue the scan with the next
+    // memory operand. If the next operand is a load save it and use alias
+    // information to check if it interferes with anything.
+    SDNode *NextInChain = Index->getChain().getNode();
+    while (1) {
+      if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
+        // We found a store node. Use it for the next iteration.
+        Index = STn;
+        break;
+      } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
+        // Save the load node for later. Continue the scan.
+        AliasLoadNodes.push_back(Ldn);
+        NextInChain = Ldn->getChain().getNode();
+        continue;
+      } else {
+        Index = NULL;
+        break;
+      }
+    }
+  }
+
+  // Check if there is anything to merge.
+  if (StoreNodes.size() < 2)
+    return false;
+
+  // Sort the memory operands according to their distance from the base pointer.
+  std::sort(StoreNodes.begin(), StoreNodes.end(),
+            ConsecutiveMemoryChainSorter());
+
+  // Scan the memory operations on the chain and find the first non-consecutive
+  // store memory address.
+  unsigned LastConsecutiveStore = 0;
+  int64_t StartAddress = StoreNodes[0].OffsetFromBase;
+  for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) {
+
+    // Check that the addresses are consecutive starting from the second
+    // element in the list of stores.
+    if (i > 0) {
+      int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
+      if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+        break;
+    }
+
+    bool Alias = false;
+    // Check if this store interferes with any of the loads that we found.
+    for (unsigned ld = 0, lde = AliasLoadNodes.size(); ld < lde; ++ld)
+      if (isAlias(AliasLoadNodes[ld], StoreNodes[i].MemNode)) {
+        Alias = true;
+        break;
+      }
+    // We found a load that alias with this store. Stop the sequence.
+    if (Alias)
+      break;
+
+    // Mark this node as useful.
+    LastConsecutiveStore = i;
+  }
+
+  // The node with the lowest store address.
+  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+
+  // Store the constants into memory as one consecutive store.
+  if (!IsLoadSrc) {
+    unsigned LastLegalType = 0;
+    unsigned LastLegalVectorType = 0;
+    bool NonZero = false;
+    for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
+      SDValue StoredVal = St->getValue();
+
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
+        NonZero |= !C->isNullValue();
+      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) {
+        NonZero |= !C->getConstantFPValue()->isNullValue();
+      } else {
+        // Non constant.
+        break;
+      }
+
+      // Find a legal type for the constant store.
+      unsigned StoreBW = (i+1) * ElementSizeBytes * 8;
+      EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+      if (TLI.isTypeLegal(StoreTy))
+        LastLegalType = i+1;
+      // Or check whether a truncstore is legal.
+      else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) ==
+               TargetLowering::TypePromoteInteger) {
+        EVT LegalizedStoredValueTy =
+          TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
+        if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy))
+          LastLegalType = i+1;
+      }
+
+      // Find a legal type for the vector store.
+      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+      if (TLI.isTypeLegal(Ty))
+        LastLegalVectorType = i + 1;
+    }
+
+    // We only use vectors if the constant is known to be zero and the
+    // function is not marked with the noimplicitfloat attribute.
+    if (NonZero || NoVectors)
+      LastLegalVectorType = 0;
+
+    // Check if we found a legal integer type to store.
+    if (LastLegalType == 0 && LastLegalVectorType == 0)
+      return false;
+
+    bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
+    unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
+
+    // Make sure we have something to merge.
+    if (NumElem < 2)
+      return false;
+
+    unsigned EarliestNodeUsed = 0;
+    for (unsigned i=0; i < NumElem; ++i) {
+      // Find a chain for the new wide-store operand. Notice that some
+      // of the store nodes that we found may not be selected for inclusion
+      // in the wide store. The chain we use needs to be the chain of the
+      // earliest store node which is *used* and replaced by the wide store.
+      if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
+        EarliestNodeUsed = i;
+    }
+
+    // The earliest Node in the DAG.
+    LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
+    DebugLoc DL = StoreNodes[0].MemNode->getDebugLoc();
+
+    SDValue StoredVal;
+    if (UseVector) {
+      // Find a legal type for the vector store.
+      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+      assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
+      StoredVal = DAG.getConstant(0, Ty);
+    } else {
+      unsigned StoreBW = NumElem * ElementSizeBytes * 8;
+      APInt StoreInt(StoreBW, 0);
+
+      // Construct a single integer constant which is made of the smaller
+      // constant inputs.
+      bool IsLE = TLI.isLittleEndian();
+      for (unsigned i = 0; i < NumElem ; ++i) {
+        unsigned Idx = IsLE ?(NumElem - 1 - i) : i;
+        StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
+        SDValue Val = St->getValue();
+        StoreInt<<=ElementSizeBytes*8;
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
+          StoreInt|=C->getAPIntValue().zext(StoreBW);
+        } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
+          StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
+        } else {
+          assert(false && "Invalid constant element type");
+        }
+      }
+
+      // Create the new Load and Store operations.
+      EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+      StoredVal = DAG.getConstant(StoreInt, StoreTy);
+    }
+
+    SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
+                                    FirstInChain->getBasePtr(),
+                                    FirstInChain->getPointerInfo(),
+                                    false, false,
+                                    FirstInChain->getAlignment());
+
+    // Replace the first store with the new store
+    CombineTo(EarliestOp, NewStore);
+    // Erase all other stores.
+    for (unsigned i = 0; i < NumElem ; ++i) {
+      if (StoreNodes[i].MemNode == EarliestOp)
+        continue;
+      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+      // ReplaceAllUsesWith will replace all uses that existed when it was
+      // called, but graph optimizations may cause new ones to appear. For
+      // example, the case in pr14333 looks like
+      //
+      //  St's chain -> St -> another store -> X
+      //
+      // And the only difference from St to the other store is the chain.
+      // When we change it's chain to be St's chain they become identical,
+      // get CSEed and the net result is that X is now a use of St.
+      // Since we know that St is redundant, just iterate.
+      while (!St->use_empty())
+        DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
+      removeFromWorkList(St);
+      DAG.DeleteNode(St);
+    }
+
+    return true;
+  }
+
+  // Below we handle the case of multiple consecutive stores that
+  // come from multiple consecutive loads. We merge them into a single
+  // wide load and a single wide store.
+
+  // Look for load nodes which are used by the stored values.
+  SmallVector<MemOpLink, 8> LoadNodes;
+
+  // Find acceptable loads. Loads need to have the same chain (token factor),
+  // must not be zext, volatile, indexed, and they must be consecutive.
+  BaseIndexOffset LdBasePtr;
+  for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+    StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
+    LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
+    if (!Ld) break;
+
+    // Loads must only have one use.
+    if (!Ld->hasNUsesOfValue(1, 0))
+      break;
+
+    // Check that the alignment is the same as the stores.
+    if (Ld->getAlignment() != St->getAlignment())
+      break;
+
+    // The memory operands must not be volatile.
+    if (Ld->isVolatile() || Ld->isIndexed())
+      break;
+
+    // We do not accept ext loads.
+    if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
+      break;
+
+    // The stored memory type must be the same.
+    if (Ld->getMemoryVT() != MemVT)
+      break;
+
+    BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr());
+    // If this is not the first ptr that we check.
+    if (LdBasePtr.Base.getNode()) {
+      // The base ptr must be the same.
+      if (!LdPtr.equalBaseIndex(LdBasePtr))
+        break;
+    } else {
+      // Check that all other base pointers are the same as this one.
+      LdBasePtr = LdPtr;
+    }
+
+    // We found a potential memory operand to merge.
+    LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0));
+  }
+
+  if (LoadNodes.size() < 2)
+    return false;
+
+  // Scan the memory operations on the chain and find the first non-consecutive
+  // load memory address. These variables hold the index in the store node
+  // array.
+  unsigned LastConsecutiveLoad = 0;
+  // This variable refers to the size and not index in the array.
+  unsigned LastLegalVectorType = 0;
+  unsigned LastLegalIntegerType = 0;
+  StartAddress = LoadNodes[0].OffsetFromBase;
+  SDValue FirstChain = LoadNodes[0].MemNode->getChain();
+  for (unsigned i = 1; i < LoadNodes.size(); ++i) {
+    // All loads much share the same chain.
+    if (LoadNodes[i].MemNode->getChain() != FirstChain)
+      break;
+
+    int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
+    if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+      break;
+    LastConsecutiveLoad = i;
+
+    // Find a legal type for the vector store.
+    EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+    if (TLI.isTypeLegal(StoreTy))
+      LastLegalVectorType = i + 1;
+
+    // Find a legal type for the integer store.
+    unsigned StoreBW = (i+1) * ElementSizeBytes * 8;
+    StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+    if (TLI.isTypeLegal(StoreTy))
+      LastLegalIntegerType = i + 1;
+    // Or check whether a truncstore and extload is legal.
+    else if (TLI.getTypeAction(*DAG.getContext(), StoreTy) ==
+             TargetLowering::TypePromoteInteger) {
+      EVT LegalizedStoredValueTy =
+        TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy);
+      if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
+          TLI.isLoadExtLegal(ISD::ZEXTLOAD, StoreTy) &&
+          TLI.isLoadExtLegal(ISD::SEXTLOAD, StoreTy) &&
+          TLI.isLoadExtLegal(ISD::EXTLOAD, StoreTy))
+        LastLegalIntegerType = i+1;
+    }
+  }
+
+  // Only use vector types if the vector type is larger than the integer type.
+  // If they are the same, use integers.
+  bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors;
+  unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType);
+
+  // We add +1 here because the LastXXX variables refer to location while
+  // the NumElem refers to array/index size.
+  unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1;
+  NumElem = std::min(LastLegalType, NumElem);
+
+  if (NumElem < 2)
+    return false;
+
+  // The earliest Node in the DAG.
+  unsigned EarliestNodeUsed = 0;
+  LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
+  for (unsigned i=1; i<NumElem; ++i) {
+    // Find a chain for the new wide-store operand. Notice that some
+    // of the store nodes that we found may not be selected for inclusion
+    // in the wide store. The chain we use needs to be the chain of the
+    // earliest store node which is *used* and replaced by the wide store.
+    if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
+      EarliestNodeUsed = i;
+  }
+
+  // Find if it is better to use vectors or integers to load and store
+  // to memory.
+  EVT JointMemOpVT;
+  if (UseVectorTy) {
+    JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+  } else {
+    unsigned StoreBW = NumElem * ElementSizeBytes * 8;
+    JointMemOpVT = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+  }
+
+  DebugLoc LoadDL = LoadNodes[0].MemNode->getDebugLoc();
+  DebugLoc StoreDL = StoreNodes[0].MemNode->getDebugLoc();
+
+  LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
+  SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL,
+                                FirstLoad->getChain(),
+                                FirstLoad->getBasePtr(),
+                                FirstLoad->getPointerInfo(),
+                                false, false, false,
+                                FirstLoad->getAlignment());
+
+  SDValue NewStore = DAG.getStore(EarliestOp->getChain(), StoreDL, NewLoad,
+                                  FirstInChain->getBasePtr(),
+                                  FirstInChain->getPointerInfo(), false, false,
+                                  FirstInChain->getAlignment());
+
+  // Replace one of the loads with the new load.
+  LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[0].MemNode);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+                                SDValue(NewLoad.getNode(), 1));
+
+  // Remove the rest of the load chains.
+  for (unsigned i = 1; i < NumElem ; ++i) {
+    // Replace all chain users of the old load nodes with the chain of the new
+    // load node.
+    LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Ld->getChain());
+  }
+
+  // Replace the first store with the new store.
+  CombineTo(EarliestOp, NewStore);
+  // Erase all other stores.
+  for (unsigned i = 0; i < NumElem ; ++i) {
+    // Remove all Store nodes.
+    if (StoreNodes[i].MemNode == EarliestOp)
+      continue;
+    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
+    removeFromWorkList(St);
+    DAG.DeleteNode(St);
+  }
+
+  return true;
+}
+
+SDValue DAGCombiner::visitSTORE(SDNode *N) {
+  StoreSDNode *ST  = cast<StoreSDNode>(N);
+  SDValue Chain = ST->getChain();
+  SDValue Value = ST->getValue();
+  SDValue Ptr   = ST->getBasePtr();
+
+  // If this is a store of a bit convert, store the input value if the
+  // resultant store does not need a higher alignment than the original.
+  if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
+      ST->isUnindexed()) {
+    unsigned OrigAlign = ST->getAlignment();
+    EVT SVT = Value.getOperand(0).getValueType();
+    unsigned Align = TLI.getDataLayout()->
+      getABITypeAlignment(SVT.getTypeForEVT(*DAG.getContext()));
+    if (Align <= OrigAlign &&
+        ((!LegalOperations && !ST->isVolatile()) ||
+         TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
+      return DAG.getStore(Chain, N->getDebugLoc(), Value.getOperand(0),
+                          Ptr, ST->getPointerInfo(), ST->isVolatile(),
+                          ST->isNonTemporal(), OrigAlign);
+  }
+
+  // Turn 'store undef, Ptr' -> nothing.
+  if (Value.getOpcode() == ISD::UNDEF && ST->isUnindexed())
+    return Chain;
+
+  // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Value)) {
+    // NOTE: If the original store is volatile, this transform must not increase
+    // the number of stores.  For example, on x86-32 an f64 can be stored in one
+    // processor operation but an i64 (which is not legal) requires two.  So the
+    // transform should not be done in this case.
+    if (Value.getOpcode() != ISD::TargetConstantFP) {
+      SDValue Tmp;
+      switch (CFP->getValueType(0).getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unknown FP type");
+      case MVT::f16:    // We don't do this for these yet.
+      case MVT::f80:
+      case MVT::f128:
+      case MVT::ppcf128:
+        break;
+      case MVT::f32:
+        if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) ||
+            TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+          Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
+                              bitcastToAPInt().getZExtValue(), MVT::i32);
+          return DAG.getStore(Chain, N->getDebugLoc(), Tmp,
+                              Ptr, ST->getPointerInfo(), ST->isVolatile(),
+                              ST->isNonTemporal(), ST->getAlignment());
+        }
+        break;
+      case MVT::f64:
+        if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
+             !ST->isVolatile()) ||
+            TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
+          Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+                                getZExtValue(), MVT::i64);
+          return DAG.getStore(Chain, N->getDebugLoc(), Tmp,
+                              Ptr, ST->getPointerInfo(), ST->isVolatile(),
+                              ST->isNonTemporal(), ST->getAlignment());
+        }
+
+        if (!ST->isVolatile() &&
+            TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+          // Many FP stores are not made apparent until after legalize, e.g. for
+          // argument passing.  Since this is so common, custom legalize the
+          // 64-bit integer store into two 32-bit stores.
+          uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+          SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, MVT::i32);
+          SDValue Hi = DAG.getConstant(Val >> 32, MVT::i32);
+          if (TLI.isBigEndian()) std::swap(Lo, Hi);
+
+          unsigned Alignment = ST->getAlignment();
+          bool isVolatile = ST->isVolatile();
+          bool isNonTemporal = ST->isNonTemporal();
+
+          SDValue St0 = DAG.getStore(Chain, ST->getDebugLoc(), Lo,
+                                     Ptr, ST->getPointerInfo(),
+                                     isVolatile, isNonTemporal,
+                                     ST->getAlignment());
+          Ptr = DAG.getNode(ISD::ADD, N->getDebugLoc(), Ptr.getValueType(), Ptr,
+                            DAG.getConstant(4, Ptr.getValueType()));
+          Alignment = MinAlign(Alignment, 4U);
+          SDValue St1 = DAG.getStore(Chain, ST->getDebugLoc(), Hi,
+                                     Ptr, ST->getPointerInfo().getWithOffset(4),
+                                     isVolatile, isNonTemporal,
+                                     Alignment);
+          return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other,
+                             St0, St1);
+        }
+
+        break;
+      }
+    }
+  }
+
+  // Try to infer better alignment information than the store already has.
+  if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
+    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
+      if (Align > ST->getAlignment())
+        return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
+                                 Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
+                                 ST->isVolatile(), ST->isNonTemporal(), Align);
+    }
+  }
+
+  // Try transforming a pair floating point load / store ops to integer
+  // load / store ops.
+  SDValue NewST = TransformFPLoadStorePair(N);
+  if (NewST.getNode())
+    return NewST;
+
+  if (CombinerAA) {
+    // Walk up chain skipping non-aliasing memory nodes.
+    SDValue BetterChain = FindBetterChain(N, Chain);
+
+    // If there is a better chain.
+    if (Chain != BetterChain) {
+      SDValue ReplStore;
+
+      // Replace the chain to avoid dependency.
+      if (ST->isTruncatingStore()) {
+        ReplStore = DAG.getTruncStore(BetterChain, N->getDebugLoc(), Value, Ptr,
+                                      ST->getPointerInfo(),
+                                      ST->getMemoryVT(), ST->isVolatile(),
+                                      ST->isNonTemporal(), ST->getAlignment());
+      } else {
+        ReplStore = DAG.getStore(BetterChain, N->getDebugLoc(), Value, Ptr,
+                                 ST->getPointerInfo(),
+                                 ST->isVolatile(), ST->isNonTemporal(),
+                                 ST->getAlignment());
+      }
+
+      // Create token to keep both nodes around.
+      SDValue Token = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+                                  MVT::Other, Chain, ReplStore);
+
+      // Make sure the new and old chains are cleaned up.
+      AddToWorkList(Token.getNode());
+
+      // Don't add users to work list.
+      return CombineTo(N, Token, false);
+    }
+  }
+
+  // Try transforming N to an indexed store.
+  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+    return SDValue(N, 0);
+
+  // FIXME: is there such a thing as a truncating indexed store?
+  if (ST->isTruncatingStore() && ST->isUnindexed() &&
+      Value.getValueType().isInteger()) {
+    // See if we can simplify the input to this truncstore with knowledge that
+    // only the low bits are being used.  For example:
+    // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
+    SDValue Shorter =
+      GetDemandedBits(Value,
+                      APInt::getLowBitsSet(
+                        Value.getValueType().getScalarType().getSizeInBits(),
+                        ST->getMemoryVT().getScalarType().getSizeInBits()));
+    AddToWorkList(Value.getNode());
+    if (Shorter.getNode())
+      return DAG.getTruncStore(Chain, N->getDebugLoc(), Shorter,
+                               Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
+                               ST->isVolatile(), ST->isNonTemporal(),
+                               ST->getAlignment());
+
+    // Otherwise, see if we can simplify the operation with
+    // SimplifyDemandedBits, which only works if the value has a single use.
+    if (SimplifyDemandedBits(Value,
+                        APInt::getLowBitsSet(
+                          Value.getValueType().getScalarType().getSizeInBits(),
+                          ST->getMemoryVT().getScalarType().getSizeInBits())))
+      return SDValue(N, 0);
+  }
+
+  // If this is a load followed by a store to the same location, then the store
+  // is dead/noop.
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
+    if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
+        ST->isUnindexed() && !ST->isVolatile() &&
+        // There can't be any side effects between the load and store, such as
+        // a call or store.
+        Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
+      // The store is dead, remove it.
+      return Chain;
+    }
+  }
+
+  // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
+  // truncating store.  We can do this even if this is already a truncstore.
+  if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
+      && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
+      TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+                            ST->getMemoryVT())) {
+    return DAG.getTruncStore(Chain, N->getDebugLoc(), Value.getOperand(0),
+                             Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
+                             ST->isVolatile(), ST->isNonTemporal(),
+                             ST->getAlignment());
+  }
+
+  // Only perform this optimization before the types are legal, because we
+  // don't want to perform this optimization on every DAGCombine invocation.
+  if (!LegalTypes) {
+    bool EverChanged = false;
+
+    do {
+      // There can be multiple store sequences on the same chain.
+      // Keep trying to merge store sequences until we are unable to do so
+      // or until we merge the last store on the chain.
+      bool Changed = MergeConsecutiveStores(ST);
+      EverChanged |= Changed;
+      if (!Changed) break;
+    } while (ST->getOpcode() != ISD::DELETED_NODE);
+
+    if (EverChanged)
+      return SDValue(N, 0);
+  }
+
+  return ReduceLoadOpStoreWidth(N);
+}
+
+SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
+  SDValue InVec = N->getOperand(0);
+  SDValue InVal = N->getOperand(1);
+  SDValue EltNo = N->getOperand(2);
+  DebugLoc dl = N->getDebugLoc();
+
+  // If the inserted element is an UNDEF, just use the input vector.
+  if (InVal.getOpcode() == ISD::UNDEF)
+    return InVec;
+
+  EVT VT = InVec.getValueType();
+
+  // If we can't generate a legal BUILD_VECTOR, exit
+  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+    return SDValue();
+
+  // Check that we know which element is being inserted
+  if (!isa<ConstantSDNode>(EltNo))
+    return SDValue();
+  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+  // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
+  // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
+  // vector elements.
+  SmallVector<SDValue, 8> Ops;
+  if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+    Ops.append(InVec.getNode()->op_begin(),
+               InVec.getNode()->op_end());
+  } else if (InVec.getOpcode() == ISD::UNDEF) {
+    unsigned NElts = VT.getVectorNumElements();
+    Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
+  } else {
+    return SDValue();
+  }
+
+  // Insert the element
+  if (Elt < Ops.size()) {
+    // All the operands of BUILD_VECTOR must have the same type;
+    // we enforce that here.
+    EVT OpVT = Ops[0].getValueType();
+    if (InVal.getValueType() != OpVT)
+      InVal = OpVT.bitsGT(InVal.getValueType()) ?
+                DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
+                DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
+    Ops[Elt] = InVal;
+  }
+
+  // Return the new vector
+  return DAG.getNode(ISD::BUILD_VECTOR, dl,
+                     VT, &Ops[0], Ops.size());
+}
+
+SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
+  // (vextract (scalar_to_vector val, 0) -> val
+  SDValue InVec = N->getOperand(0);
+  EVT VT = InVec.getValueType();
+  EVT NVT = N->getValueType(0);
+
+  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    // Check if the result type doesn't match the inserted element type. A
+    // SCALAR_TO_VECTOR may truncate the inserted element and the
+    // EXTRACT_VECTOR_ELT may widen the extracted vector.
+    SDValue InOp = InVec.getOperand(0);
+    if (InOp.getValueType() != NVT) {
+      assert(InOp.getValueType().isInteger() && NVT.isInteger());
+      return DAG.getSExtOrTrunc(InOp, InVec.getDebugLoc(), NVT);
+    }
+    return InOp;
+  }
+
+  SDValue EltNo = N->getOperand(1);
+  bool ConstEltNo = isa<ConstantSDNode>(EltNo);
+
+  // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
+  // We only perform this optimization before the op legalization phase because
+  // we may introduce new vector instructions which are not backed by TD
+  // patterns. For example on AVX, extracting elements from a wide vector
+  // without using extract_subvector.
+  if (InVec.getOpcode() == ISD::VECTOR_SHUFFLE
+      && ConstEltNo && !LegalOperations) {
+    int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+    int NumElem = VT.getVectorNumElements();
+    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(InVec);
+    // Find the new index to extract from.
+    int OrigElt = SVOp->getMaskElt(Elt);
+
+    // Extracting an undef index is undef.
+    if (OrigElt == -1)
+      return DAG.getUNDEF(NVT);
+
+    // Select the right vector half to extract from.
+    if (OrigElt < NumElem) {
+      InVec = InVec->getOperand(0);
+    } else {
+      InVec = InVec->getOperand(1);
+      OrigElt -= NumElem;
+    }
+
+    EVT IndexTy = N->getOperand(1).getValueType();
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(), NVT,
+                       InVec, DAG.getConstant(OrigElt, IndexTy));
+  }
+
+  // Perform only after legalization to ensure build_vector / vector_shuffle
+  // optimizations have already been done.
+  if (!LegalOperations) return SDValue();
+
+  // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
+  // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
+  // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
+
+  if (ConstEltNo) {
+    int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+    bool NewLoad = false;
+    bool BCNumEltsChanged = false;
+    EVT ExtVT = VT.getVectorElementType();
+    EVT LVT = ExtVT;
+
+    // If the result of load has to be truncated, then it's not necessarily
+    // profitable.
+    if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
+      return SDValue();
+
+    if (InVec.getOpcode() == ISD::BITCAST) {
+      // Don't duplicate a load with other uses.
+      if (!InVec.hasOneUse())
+        return SDValue();
+
+      EVT BCVT = InVec.getOperand(0).getValueType();
+      if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
+        return SDValue();
+      if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
+        BCNumEltsChanged = true;
+      InVec = InVec.getOperand(0);
+      ExtVT = BCVT.getVectorElementType();
+      NewLoad = true;
+    }
+
+    LoadSDNode *LN0 = NULL;
+    const ShuffleVectorSDNode *SVN = NULL;
+    if (ISD::isNormalLoad(InVec.getNode())) {
+      LN0 = cast<LoadSDNode>(InVec);
+    } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+               InVec.getOperand(0).getValueType() == ExtVT &&
+               ISD::isNormalLoad(InVec.getOperand(0).getNode())) {
+      // Don't duplicate a load with other uses.
+      if (!InVec.hasOneUse())
+        return SDValue();
+
+      LN0 = cast<LoadSDNode>(InVec.getOperand(0));
+    } else if ((SVN = dyn_cast<ShuffleVectorSDNode>(InVec))) {
+      // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
+      // =>
+      // (load $addr+1*size)
+
+      // Don't duplicate a load with other uses.
+      if (!InVec.hasOneUse())
+        return SDValue();
+
+      // If the bit convert changed the number of elements, it is unsafe
+      // to examine the mask.
+      if (BCNumEltsChanged)
+        return SDValue();
+
+      // Select the input vector, guarding against out of range extract vector.
+      unsigned NumElems = VT.getVectorNumElements();
+      int Idx = (Elt > (int)NumElems) ? -1 : SVN->getMaskElt(Elt);
+      InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1);
+
+      if (InVec.getOpcode() == ISD::BITCAST) {
+        // Don't duplicate a load with other uses.
+        if (!InVec.hasOneUse())
+          return SDValue();
+
+        InVec = InVec.getOperand(0);
+      }
+      if (ISD::isNormalLoad(InVec.getNode())) {
+        LN0 = cast<LoadSDNode>(InVec);
+        Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems;
+      }
+    }
+
+    // Make sure we found a non-volatile load and the extractelement is
+    // the only use.
+    if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
+      return SDValue();
+
+    // If Idx was -1 above, Elt is going to be -1, so just return undef.
+    if (Elt == -1)
+      return DAG.getUNDEF(LVT);
+
+    unsigned Align = LN0->getAlignment();
+    if (NewLoad) {
+      // Check the resultant load doesn't need a higher alignment than the
+      // original load.
+      unsigned NewAlign =
+        TLI.getDataLayout()
+            ->getABITypeAlignment(LVT.getTypeForEVT(*DAG.getContext()));
+
+      if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, LVT))
+        return SDValue();
+
+      Align = NewAlign;
+    }
+
+    SDValue NewPtr = LN0->getBasePtr();
+    unsigned PtrOff = 0;
+
+    if (Elt) {
+      PtrOff = LVT.getSizeInBits() * Elt / 8;
+      EVT PtrType = NewPtr.getValueType();
+      if (TLI.isBigEndian())
+        PtrOff = VT.getSizeInBits() / 8 - PtrOff;
+      NewPtr = DAG.getNode(ISD::ADD, N->getDebugLoc(), PtrType, NewPtr,
+                           DAG.getConstant(PtrOff, PtrType));
+    }
+
+    // The replacement we need to do here is a little tricky: we need to
+    // replace an extractelement of a load with a load.
+    // Use ReplaceAllUsesOfValuesWith to do the replacement.
+    // Note that this replacement assumes that the extractvalue is the only
+    // use of the load; that's okay because we don't want to perform this
+    // transformation in other cases anyway.
+    SDValue Load;
+    SDValue Chain;
+    if (NVT.bitsGT(LVT)) {
+      // If the result type of vextract is wider than the load, then issue an
+      // extending load instead.
+      ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, LVT)
+        ? ISD::ZEXTLOAD : ISD::EXTLOAD;
+      Load = DAG.getExtLoad(ExtType, N->getDebugLoc(), NVT, LN0->getChain(),
+                            NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff),
+                            LVT, LN0->isVolatile(), LN0->isNonTemporal(),Align);
+      Chain = Load.getValue(1);
+    } else {
+      Load = DAG.getLoad(LVT, N->getDebugLoc(), LN0->getChain(), NewPtr,
+                         LN0->getPointerInfo().getWithOffset(PtrOff),
+                         LN0->isVolatile(), LN0->isNonTemporal(), 
+                         LN0->isInvariant(), Align);
+      Chain = Load.getValue(1);
+      if (NVT.bitsLT(LVT))
+        Load = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), NVT, Load);
+      else
+        Load = DAG.getNode(ISD::BITCAST, N->getDebugLoc(), NVT, Load);
+    }
+    WorkListRemover DeadNodes(*this);
+    SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) };
+    SDValue To[] = { Load, Chain };
+    DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+    // Since we're explcitly calling ReplaceAllUses, add the new node to the
+    // worklist explicitly as well.
+    AddToWorkList(Load.getNode());
+    AddUsersToWorkList(Load.getNode()); // Add users too
+    // Make sure to revisit this node to clean it up; it will usually be dead.
+    AddToWorkList(N);
+    return SDValue(N, 0);
+  }
+
+  return SDValue();
+}
+
+// Simplify (build_vec (ext )) to (bitcast (build_vec ))
+SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
+  // We perform this optimization post type-legalization because
+  // the type-legalizer often scalarizes integer-promoted vectors.
+  // Performing this optimization before may create bit-casts which
+  // will be type-legalized to complex code sequences.
+  // We perform this optimization only before the operation legalizer because we
+  // may introduce illegal operations.
+  if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
+    return SDValue();
+
+  unsigned NumInScalars = N->getNumOperands();
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  // Check to see if this is a BUILD_VECTOR of a bunch of values
+  // which come from any_extend or zero_extend nodes. If so, we can create
+  // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
+  // optimizations. We do not handle sign-extend because we can't fill the sign
+  // using shuffles.
+  EVT SourceType = MVT::Other;
+  bool AllAnyExt = true;
+
+  for (unsigned i = 0; i != NumInScalars; ++i) {
+    SDValue In = N->getOperand(i);
+    // Ignore undef inputs.
+    if (In.getOpcode() == ISD::UNDEF) continue;
+
+    bool AnyExt  = In.getOpcode() == ISD::ANY_EXTEND;
+    bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;
+
+    // Abort if the element is not an extension.
+    if (!ZeroExt && !AnyExt) {
+      SourceType = MVT::Other;
+      break;
+    }
+
+    // The input is a ZeroExt or AnyExt. Check the original type.
+    EVT InTy = In.getOperand(0).getValueType();
+
+    // Check that all of the widened source types are the same.
+    if (SourceType == MVT::Other)
+      // First time.
+      SourceType = InTy;
+    else if (InTy != SourceType) {
+      // Multiple income types. Abort.
+      SourceType = MVT::Other;
+      break;
+    }
+
+    // Check if all of the extends are ANY_EXTENDs.
+    AllAnyExt &= AnyExt;
+  }
+
+  // In order to have valid types, all of the inputs must be extended from the
+  // same source type and all of the inputs must be any or zero extend.
+  // Scalar sizes must be a power of two.
+  EVT OutScalarTy = VT.getScalarType();
+  bool ValidTypes = SourceType != MVT::Other &&
+                 isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
+                 isPowerOf2_32(SourceType.getSizeInBits());
+
+  // Create a new simpler BUILD_VECTOR sequence which other optimizations can
+  // turn into a single shuffle instruction.
+  if (!ValidTypes)
+    return SDValue();
+
+  bool isLE = TLI.isLittleEndian();
+  unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
+  assert(ElemRatio > 1 && "Invalid element size ratio");
+  SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
+                               DAG.getConstant(0, SourceType);
+
+  unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
+  SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
+
+  // Populate the new build_vector
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDValue Cast = N->getOperand(i);
+    assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
+            Cast.getOpcode() == ISD::ZERO_EXTEND ||
+            Cast.getOpcode() == ISD::UNDEF) && "Invalid cast opcode");
+    SDValue In;
+    if (Cast.getOpcode() == ISD::UNDEF)
+      In = DAG.getUNDEF(SourceType);
+    else
+      In = Cast->getOperand(0);
+    unsigned Index = isLE ? (i * ElemRatio) :
+                            (i * ElemRatio + (ElemRatio - 1));
+
+    assert(Index < Ops.size() && "Invalid index");
+    Ops[Index] = In;
+  }
+
+  // The type of the new BUILD_VECTOR node.
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
+  assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
+         "Invalid vector size");
+  // Check if the new vector type is legal.
+  if (!isTypeLegal(VecVT)) return SDValue();
+
+  // Make the new BUILD_VECTOR.
+  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], Ops.size());
+
+  // The new BUILD_VECTOR node has the potential to be further optimized.
+  AddToWorkList(BV.getNode());
+  // Bitcast to the desired type.
+  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
+}
+
+SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  unsigned NumInScalars = N->getNumOperands();
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT SrcVT = MVT::Other;
+  unsigned Opcode = ISD::DELETED_NODE;
+  unsigned NumDefs = 0;
+
+  for (unsigned i = 0; i != NumInScalars; ++i) {
+    SDValue In = N->getOperand(i);
+    unsigned Opc = In.getOpcode();
+
+    if (Opc == ISD::UNDEF)
+      continue;
+
+    // If all scalar values are floats and converted from integers.
+    if (Opcode == ISD::DELETED_NODE &&
+        (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
+      Opcode = Opc;
+    }
+
+    if (Opc != Opcode)
+      return SDValue();
+
+    EVT InVT = In.getOperand(0).getValueType();
+
+    // If all scalar values are typed differently, bail out. It's chosen to
+    // simplify BUILD_VECTOR of integer types.
+    if (SrcVT == MVT::Other)
+      SrcVT = InVT;
+    if (SrcVT != InVT)
+      return SDValue();
+    NumDefs++;
+  }
+
+  // If the vector has just one element defined, it's not worth to fold it into
+  // a vectorized one.
+  if (NumDefs < 2)
+    return SDValue();
+
+  assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP)
+         && "Should only handle conversion from integer to float.");
+  assert(SrcVT != MVT::Other && "Cannot determine source type!");
+
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
+
+  if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
+    return SDValue();
+
+  SmallVector<SDValue, 8> Opnds;
+  for (unsigned i = 0; i != NumInScalars; ++i) {
+    SDValue In = N->getOperand(i);
+
+    if (In.getOpcode() == ISD::UNDEF)
+      Opnds.push_back(DAG.getUNDEF(SrcVT));
+    else
+      Opnds.push_back(In.getOperand(0));
+  }
+  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT,
+                           &Opnds[0], Opnds.size());
+  AddToWorkList(BV.getNode());
+
+  return DAG.getNode(Opcode, dl, VT, BV);
+}
+
+SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
+  unsigned NumInScalars = N->getNumOperands();
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  // A vector built entirely of undefs is undef.
+  if (ISD::allOperandsUndef(N))
+    return DAG.getUNDEF(VT);
+
+  SDValue V = reduceBuildVecExtToExtBuildVec(N);
+  if (V.getNode())
+    return V;
+
+  V = reduceBuildVecConvertToConvertBuildVec(N);
+  if (V.getNode())
+    return V;
+
+  // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
+  // operations.  If so, and if the EXTRACT_VECTOR_ELT vector inputs come from
+  // at most two distinct vectors, turn this into a shuffle node.
+
+  // May only combine to shuffle after legalize if shuffle is legal.
+  if (LegalOperations &&
+      !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))
+    return SDValue();
+
+  SDValue VecIn1, VecIn2;
+  for (unsigned i = 0; i != NumInScalars; ++i) {
+    // Ignore undef inputs.
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+
+    // If this input is something other than a EXTRACT_VECTOR_ELT with a
+    // constant index, bail out.
+    if (N->getOperand(i).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(N->getOperand(i).getOperand(1))) {
+      VecIn1 = VecIn2 = SDValue(0, 0);
+      break;
+    }
+
+    // We allow up to two distinct input vectors.
+    SDValue ExtractedFromVec = N->getOperand(i).getOperand(0);
+    if (ExtractedFromVec == VecIn1 || ExtractedFromVec == VecIn2)
+      continue;
+
+    if (VecIn1.getNode() == 0) {
+      VecIn1 = ExtractedFromVec;
+    } else if (VecIn2.getNode() == 0) {
+      VecIn2 = ExtractedFromVec;
+    } else {
+      // Too many inputs.
+      VecIn1 = VecIn2 = SDValue(0, 0);
+      break;
+    }
+  }
+
+    // If everything is good, we can make a shuffle operation.
+  if (VecIn1.getNode()) {
+    SmallVector<int, 8> Mask;
+    for (unsigned i = 0; i != NumInScalars; ++i) {
+      if (N->getOperand(i).getOpcode() == ISD::UNDEF) {
+        Mask.push_back(-1);
+        continue;
+      }
+
+      // If extracting from the first vector, just use the index directly.
+      SDValue Extract = N->getOperand(i);
+      SDValue ExtVal = Extract.getOperand(1);
+      if (Extract.getOperand(0) == VecIn1) {
+        unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
+        if (ExtIndex > VT.getVectorNumElements())
+          return SDValue();
+
+        Mask.push_back(ExtIndex);
+        continue;
+      }
+
+      // Otherwise, use InIdx + VecSize
+      unsigned Idx = cast<ConstantSDNode>(ExtVal)->getZExtValue();
+      Mask.push_back(Idx+NumInScalars);
+    }
+
+    // We can't generate a shuffle node with mismatched input and output types.
+    // Attempt to transform a single input vector to the correct type.
+    if ((VT != VecIn1.getValueType())) {
+      // We don't support shuffeling between TWO values of different types.
+      if (VecIn2.getNode() != 0)
+        return SDValue();
+
+      // We only support widening of vectors which are half the size of the
+      // output registers. For example XMM->YMM widening on X86 with AVX.
+      if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits())
+        return SDValue();
+
+      // If the input vector type has a different base type to the output
+      // vector type, bail out.
+      if (VecIn1.getValueType().getVectorElementType() !=
+          VT.getVectorElementType())
+        return SDValue();
+
+      // Widen the input vector by adding undef values.
+      VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                           VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
+    }
+
+    // If VecIn2 is unused then change it to undef.
+    VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+
+    // Check that we were able to transform all incoming values to the same
+    // type.
+    if (VecIn2.getValueType() != VecIn1.getValueType() ||
+        VecIn1.getValueType() != VT)
+          return SDValue();
+
+    // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
+    if (!isTypeLegal(VT))
+      return SDValue();
+
+    // Return the new VECTOR_SHUFFLE node.
+    SDValue Ops[2];
+    Ops[0] = VecIn1;
+    Ops[1] = VecIn2;
+    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], &Mask[0]);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
+  // TODO: Check to see if this is a CONCAT_VECTORS of a bunch of
+  // EXTRACT_SUBVECTOR operations.  If so, and if the EXTRACT_SUBVECTOR vector
+  // inputs come from at most two distinct vectors, turn this into a shuffle
+  // node.
+
+  // If we only have one input vector, we don't need to do any concatenation.
+  if (N->getNumOperands() == 1)
+    return N->getOperand(0);
+
+  // Check if all of the operands are undefs.
+  if (ISD::allOperandsUndef(N))
+    return DAG.getUNDEF(N->getValueType(0));
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
+  EVT NVT = N->getValueType(0);
+  SDValue V = N->getOperand(0);
+
+  if (V->getOpcode() == ISD::CONCAT_VECTORS) {
+    // Combine:
+    //    (extract_subvec (concat V1, V2, ...), i)
+    // Into:
+    //    Vi if possible
+    // Only operand 0 is checked as 'concat' assumes all inputs of the same type.
+    if (V->getOperand(0).getValueType() != NVT)
+      return SDValue();
+    unsigned Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    unsigned NumElems = NVT.getVectorNumElements();
+    assert((Idx % NumElems) == 0 &&
+           "IDX in concat is not a multiple of the result vector length.");
+    return V->getOperand(Idx / NumElems);
+  }
+
+  // Skip bitcasting
+  if (V->getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
+    DebugLoc dl = N->getDebugLoc();
+    // Handle only simple case where vector being inserted and vector
+    // being extracted are of same type, and are half size of larger vectors.
+    EVT BigVT = V->getOperand(0).getValueType();
+    EVT SmallVT = V->getOperand(1).getValueType();
+    if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits())
+      return SDValue();
+
+    // Only handle cases where both indexes are constants with the same type.
+    ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
+
+    if (InsIdx && ExtIdx &&
+        InsIdx->getValueType(0).getSizeInBits() <= 64 &&
+        ExtIdx->getValueType(0).getSizeInBits() <= 64) {
+      // Combine:
+      //    (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
+      // Into:
+      //    indices are equal or bit offsets are equal => V1
+      //    otherwise => (extract_subvec V1, ExtIdx)
+      if (InsIdx->getZExtValue() * SmallVT.getScalarType().getSizeInBits() ==
+          ExtIdx->getZExtValue() * NVT.getScalarType().getSizeInBits())
+        return DAG.getNode(ISD::BITCAST, dl, NVT, V->getOperand(1));
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT,
+                         DAG.getNode(ISD::BITCAST, dl,
+                                     N->getOperand(0).getValueType(),
+                                     V->getOperand(0)), N->getOperand(1));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
+
+  // Canonicalize shuffle undef, undef -> undef
+  if (N0.getOpcode() == ISD::UNDEF && N1.getOpcode() == ISD::UNDEF)
+    return DAG.getUNDEF(VT);
+
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+
+  // Canonicalize shuffle v, v -> v, undef
+  if (N0 == N1) {
+    SmallVector<int, 8> NewMask;
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx = SVN->getMaskElt(i);
+      if (Idx >= (int)NumElts) Idx -= NumElts;
+      NewMask.push_back(Idx);
+    }
+    return DAG.getVectorShuffle(VT, N->getDebugLoc(), N0, DAG.getUNDEF(VT),
+                                &NewMask[0]);
+  }
+
+  // Canonicalize shuffle undef, v -> v, undef.  Commute the shuffle mask.
+  if (N0.getOpcode() == ISD::UNDEF) {
+    SmallVector<int, 8> NewMask;
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx = SVN->getMaskElt(i);
+      if (Idx >= 0) {
+        if (Idx < (int)NumElts)
+          Idx += NumElts;
+        else
+          Idx -= NumElts;
+      }
+      NewMask.push_back(Idx);
+    }
+    return DAG.getVectorShuffle(VT, N->getDebugLoc(), N1, DAG.getUNDEF(VT),
+                                &NewMask[0]);
+  }
+
+  // Remove references to rhs if it is undef
+  if (N1.getOpcode() == ISD::UNDEF) {
+    bool Changed = false;
+    SmallVector<int, 8> NewMask;
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx = SVN->getMaskElt(i);
+      if (Idx >= (int)NumElts) {
+        Idx = -1;
+        Changed = true;
+      }
+      NewMask.push_back(Idx);
+    }
+    if (Changed)
+      return DAG.getVectorShuffle(VT, N->getDebugLoc(), N0, N1, &NewMask[0]);
+  }
+
+  // If it is a splat, check if the argument vector is another splat or a
+  // build_vector with all scalar elements the same.
+  if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
+    SDNode *V = N0.getNode();
+
+    // If this is a bit convert that changes the element type of the vector but
+    // not the number of vector elements, look through it.  Be careful not to
+    // look though conversions that change things like v4f32 to v2f64.
+    if (V->getOpcode() == ISD::BITCAST) {
+      SDValue ConvInput = V->getOperand(0);
+      if (ConvInput.getValueType().isVector() &&
+          ConvInput.getValueType().getVectorNumElements() == NumElts)
+        V = ConvInput.getNode();
+    }
+
+    if (V->getOpcode() == ISD::BUILD_VECTOR) {
+      assert(V->getNumOperands() == NumElts &&
+             "BUILD_VECTOR has wrong number of operands");
+      SDValue Base;
+      bool AllSame = true;
+      for (unsigned i = 0; i != NumElts; ++i) {
+        if (V->getOperand(i).getOpcode() != ISD::UNDEF) {
+          Base = V->getOperand(i);
+          break;
+        }
+      }
+      // Splat of <u, u, u, u>, return <u, u, u, u>
+      if (!Base.getNode())
+        return N0;
+      for (unsigned i = 0; i != NumElts; ++i) {
+        if (V->getOperand(i) != Base) {
+          AllSame = false;
+          break;
+        }
+      }
+      // Splat of <x, x, x, x>, return <x, x, x, x>
+      if (AllSame)
+        return N0;
+    }
+  }
+
+  // If this shuffle node is simply a swizzle of another shuffle node,
+  // and it reverses the swizzle of the previous shuffle then we can
+  // optimize shuffle(shuffle(x, undef), undef) -> x.
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
+      N1.getOpcode() == ISD::UNDEF) {
+
+    ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
+
+    // Shuffle nodes can only reverse shuffles with a single non-undef value.
+    if (N0.getOperand(1).getOpcode() != ISD::UNDEF)
+      return SDValue();
+
+    // The incoming shuffle must be of the same type as the result of the
+    // current shuffle.
+    assert(OtherSV->getOperand(0).getValueType() == VT &&
+           "Shuffle types don't match");
+
+    for (unsigned i = 0; i != NumElts; ++i) {
+      int Idx = SVN->getMaskElt(i);
+      assert(Idx < (int)NumElts && "Index references undef operand");
+      // Next, this index comes from the first value, which is the incoming
+      // shuffle. Adopt the incoming index.
+      if (Idx >= 0)
+        Idx = OtherSV->getMaskElt(Idx);
+
+      // The combined shuffle must map each index to itself.
+      if (Idx >= 0 && (unsigned)Idx != i)
+        return SDValue();
+    }
+
+    return OtherSV->getOperand(0);
+  }
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMEMBARRIER(SDNode* N) {
+  if (!TLI.getShouldFoldAtomicFences())
+    return SDValue();
+
+  SDValue atomic = N->getOperand(0);
+  switch (atomic.getOpcode()) {
+    case ISD::ATOMIC_CMP_SWAP:
+    case ISD::ATOMIC_SWAP:
+    case ISD::ATOMIC_LOAD_ADD:
+    case ISD::ATOMIC_LOAD_SUB:
+    case ISD::ATOMIC_LOAD_AND:
+    case ISD::ATOMIC_LOAD_OR:
+    case ISD::ATOMIC_LOAD_XOR:
+    case ISD::ATOMIC_LOAD_NAND:
+    case ISD::ATOMIC_LOAD_MIN:
+    case ISD::ATOMIC_LOAD_MAX:
+    case ISD::ATOMIC_LOAD_UMIN:
+    case ISD::ATOMIC_LOAD_UMAX:
+      break;
+    default:
+      return SDValue();
+  }
+
+  SDValue fence = atomic.getOperand(0);
+  if (fence.getOpcode() != ISD::MEMBARRIER)
+    return SDValue();
+
+  switch (atomic.getOpcode()) {
+    case ISD::ATOMIC_CMP_SWAP:
+      return SDValue(DAG.UpdateNodeOperands(atomic.getNode(),
+                                    fence.getOperand(0),
+                                    atomic.getOperand(1), atomic.getOperand(2),
+                                    atomic.getOperand(3)), atomic.getResNo());
+    case ISD::ATOMIC_SWAP:
+    case ISD::ATOMIC_LOAD_ADD:
+    case ISD::ATOMIC_LOAD_SUB:
+    case ISD::ATOMIC_LOAD_AND:
+    case ISD::ATOMIC_LOAD_OR:
+    case ISD::ATOMIC_LOAD_XOR:
+    case ISD::ATOMIC_LOAD_NAND:
+    case ISD::ATOMIC_LOAD_MIN:
+    case ISD::ATOMIC_LOAD_MAX:
+    case ISD::ATOMIC_LOAD_UMIN:
+    case ISD::ATOMIC_LOAD_UMAX:
+      return SDValue(DAG.UpdateNodeOperands(atomic.getNode(),
+                                    fence.getOperand(0),
+                                    atomic.getOperand(1), atomic.getOperand(2)),
+                     atomic.getResNo());
+    default:
+      return SDValue();
+  }
+}
+
+/// XformToShuffleWithZero - Returns a vector_shuffle if it able to transform
+/// an AND to a vector_shuffle with the destination vector and a zero vector.
+/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
+///      vector_shuffle V, Zero, <0, 4, 2, 4>
+SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if (N->getOpcode() == ISD::AND) {
+    if (RHS.getOpcode() == ISD::BITCAST)
+      RHS = RHS.getOperand(0);
+    if (RHS.getOpcode() == ISD::BUILD_VECTOR) {
+      SmallVector<int, 8> Indices;
+      unsigned NumElts = RHS.getNumOperands();
+      for (unsigned i = 0; i != NumElts; ++i) {
+        SDValue Elt = RHS.getOperand(i);
+        if (!isa<ConstantSDNode>(Elt))
+          return SDValue();
+
+        if (cast<ConstantSDNode>(Elt)->isAllOnesValue())
+          Indices.push_back(i);
+        else if (cast<ConstantSDNode>(Elt)->isNullValue())
+          Indices.push_back(NumElts);
+        else
+          return SDValue();
+      }
+
+      // Let's see if the target supports this vector_shuffle.
+      EVT RVT = RHS.getValueType();
+      if (!TLI.isVectorClearMaskLegal(Indices, RVT))
+        return SDValue();
+
+      // Return the new VECTOR_SHUFFLE node.
+      EVT EltVT = RVT.getVectorElementType();
+      SmallVector<SDValue,8> ZeroOps(RVT.getVectorNumElements(),
+                                     DAG.getConstant(0, EltVT));
+      SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                                 RVT, &ZeroOps[0], ZeroOps.size());
+      LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS);
+      SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]);
+      return DAG.getNode(ISD::BITCAST, dl, VT, Shuf);
+    }
+  }
+
+  return SDValue();
+}
+
+/// SimplifyVBinOp - Visit a binary vector operation, like ADD.
+SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
+  assert(N->getValueType(0).isVector() &&
+         "SimplifyVBinOp only works on vectors!");
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue Shuffle = XformToShuffleWithZero(N);
+  if (Shuffle.getNode()) return Shuffle;
+
+  // If the LHS and RHS are BUILD_VECTOR nodes, see if we can constant fold
+  // this operation.
+  if (LHS.getOpcode() == ISD::BUILD_VECTOR &&
+      RHS.getOpcode() == ISD::BUILD_VECTOR) {
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
+      SDValue LHSOp = LHS.getOperand(i);
+      SDValue RHSOp = RHS.getOperand(i);
+      // If these two elements can't be folded, bail out.
+      if ((LHSOp.getOpcode() != ISD::UNDEF &&
+           LHSOp.getOpcode() != ISD::Constant &&
+           LHSOp.getOpcode() != ISD::ConstantFP) ||
+          (RHSOp.getOpcode() != ISD::UNDEF &&
+           RHSOp.getOpcode() != ISD::Constant &&
+           RHSOp.getOpcode() != ISD::ConstantFP))
+        break;
+
+      // Can't fold divide by zero.
+      if (N->getOpcode() == ISD::SDIV || N->getOpcode() == ISD::UDIV ||
+          N->getOpcode() == ISD::FDIV) {
+        if ((RHSOp.getOpcode() == ISD::Constant &&
+             cast<ConstantSDNode>(RHSOp.getNode())->isNullValue()) ||
+            (RHSOp.getOpcode() == ISD::ConstantFP &&
+             cast<ConstantFPSDNode>(RHSOp.getNode())->getValueAPF().isZero()))
+          break;
+      }
+
+      EVT VT = LHSOp.getValueType();
+      EVT RVT = RHSOp.getValueType();
+      if (RVT != VT) {
+        // Integer BUILD_VECTOR operands may have types larger than the element
+        // size (e.g., when the element type is not legal).  Prior to type
+        // legalization, the types may not match between the two BUILD_VECTORS.
+        // Truncate one of the operands to make them match.
+        if (RVT.getSizeInBits() > VT.getSizeInBits()) {
+          RHSOp = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, RHSOp);
+        } else {
+          LHSOp = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), RVT, LHSOp);
+          VT = RVT;
+        }
+      }
+      SDValue FoldOp = DAG.getNode(N->getOpcode(), LHS.getDebugLoc(), VT,
+                                   LHSOp, RHSOp);
+      if (FoldOp.getOpcode() != ISD::UNDEF &&
+          FoldOp.getOpcode() != ISD::Constant &&
+          FoldOp.getOpcode() != ISD::ConstantFP)
+        break;
+      Ops.push_back(FoldOp);
+      AddToWorkList(FoldOp.getNode());
+    }
+
+    if (Ops.size() == LHS.getNumOperands())
+      return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                         LHS.getValueType(), &Ops[0], Ops.size());
+  }
+
+  return SDValue();
+}
+
+/// SimplifyVUnaryOp - Visit a binary vector operation, like FABS/FNEG.
+SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) {
+  assert(N->getValueType(0).isVector() &&
+         "SimplifyVUnaryOp only works on vectors!");
+
+  SDValue N0 = N->getOperand(0);
+
+  if (N0.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Operand is a BUILD_VECTOR node, see if we can constant fold it.
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
+    SDValue Op = N0.getOperand(i);
+    if (Op.getOpcode() != ISD::UNDEF &&
+        Op.getOpcode() != ISD::ConstantFP)
+      break;
+    EVT EltVT = Op.getValueType();
+    SDValue FoldOp = DAG.getNode(N->getOpcode(), N0.getDebugLoc(), EltVT, Op);
+    if (FoldOp.getOpcode() != ISD::UNDEF &&
+        FoldOp.getOpcode() != ISD::ConstantFP)
+      break;
+    Ops.push_back(FoldOp);
+    AddToWorkList(FoldOp.getNode());
+  }
+
+  if (Ops.size() != N0.getNumOperands())
+    return SDValue();
+
+  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                     N0.getValueType(), &Ops[0], Ops.size());
+}
+
+SDValue DAGCombiner::SimplifySelect(DebugLoc DL, SDValue N0,
+                                    SDValue N1, SDValue N2){
+  assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!");
+
+  SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
+                                 cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
+  // If we got a simplified select_cc node back from SimplifySelectCC, then
+  // break it down into a new SETCC node, and a new SELECT node, and then return
+  // the SELECT node, since we were called with a SELECT node.
+  if (SCC.getNode()) {
+    // Check to see if we got a select_cc back (to turn into setcc/select).
+    // Otherwise, just return whatever node we got back, like fabs.
+    if (SCC.getOpcode() == ISD::SELECT_CC) {
+      SDValue SETCC = DAG.getNode(ISD::SETCC, N0.getDebugLoc(),
+                                  N0.getValueType(),
+                                  SCC.getOperand(0), SCC.getOperand(1),
+                                  SCC.getOperand(4));
+      AddToWorkList(SETCC.getNode());
+      return DAG.getNode(ISD::SELECT, SCC.getDebugLoc(), SCC.getValueType(),
+                         SCC.getOperand(2), SCC.getOperand(3), SETCC);
+    }
+
+    return SCC;
+  }
+  return SDValue();
+}
+
+/// SimplifySelectOps - Given a SELECT or a SELECT_CC node, where LHS and RHS
+/// are the two values being selected between, see if we can simplify the
+/// select.  Callers of this should assume that TheSelect is deleted if this
+/// returns true.  As such, they should return the appropriate thing (e.g. the
+/// node) back to the top-level of the DAG combiner loop to avoid it being
+/// looked at.
+bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
+                                    SDValue RHS) {
+
+  // Cannot simplify select with vector condition
+  if (TheSelect->getOperand(0).getValueType().isVector()) return false;
+
+  // If this is a select from two identical things, try to pull the operation
+  // through the select.
+  if (LHS.getOpcode() != RHS.getOpcode() ||
+      !LHS.hasOneUse() || !RHS.hasOneUse())
+    return false;
+
+  // If this is a load and the token chain is identical, replace the select
+  // of two loads with a load through a select of the address to load from.
+  // This triggers in things like "select bool X, 10.0, 123.0" after the FP
+  // constants have been dropped into the constant pool.
+  if (LHS.getOpcode() == ISD::LOAD) {
+    LoadSDNode *LLD = cast<LoadSDNode>(LHS);
+    LoadSDNode *RLD = cast<LoadSDNode>(RHS);
+
+    // Token chains must be identical.
+    if (LHS.getOperand(0) != RHS.getOperand(0) ||
+        // Do not let this transformation reduce the number of volatile loads.
+        LLD->isVolatile() || RLD->isVolatile() ||
+        // If this is an EXTLOAD, the VT's must match.
+        LLD->getMemoryVT() != RLD->getMemoryVT() ||
+        // If this is an EXTLOAD, the kind of extension must match.
+        (LLD->getExtensionType() != RLD->getExtensionType() &&
+         // The only exception is if one of the extensions is anyext.
+         LLD->getExtensionType() != ISD::EXTLOAD &&
+         RLD->getExtensionType() != ISD::EXTLOAD) ||
+        // FIXME: this discards src value information.  This is
+        // over-conservative. It would be beneficial to be able to remember
+        // both potential memory locations.  Since we are discarding
+        // src value info, don't do the transformation if the memory
+        // locations are not in the default address space.
+        LLD->getPointerInfo().getAddrSpace() != 0 ||
+        RLD->getPointerInfo().getAddrSpace() != 0 ||
+        !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
+                                      LLD->getBasePtr().getValueType()))
+      return false;
+
+    // Check that the select condition doesn't reach either load.  If so,
+    // folding this will induce a cycle into the DAG.  If not, this is safe to
+    // xform, so create a select of the addresses.
+    SDValue Addr;
+    if (TheSelect->getOpcode() == ISD::SELECT) {
+      SDNode *CondNode = TheSelect->getOperand(0).getNode();
+      if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) ||
+          (RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode)))
+        return false;
+      // The loads must not depend on one another.
+      if (LLD->isPredecessorOf(RLD) ||
+          RLD->isPredecessorOf(LLD))
+        return false;
+      Addr = DAG.getNode(ISD::SELECT, TheSelect->getDebugLoc(),
+                         LLD->getBasePtr().getValueType(),
+                         TheSelect->getOperand(0), LLD->getBasePtr(),
+                         RLD->getBasePtr());
+    } else {  // Otherwise SELECT_CC
+      SDNode *CondLHS = TheSelect->getOperand(0).getNode();
+      SDNode *CondRHS = TheSelect->getOperand(1).getNode();
+
+      if ((LLD->hasAnyUseOfValue(1) &&
+           (LLD->isPredecessorOf(CondLHS) || LLD->isPredecessorOf(CondRHS))) ||
+          (RLD->hasAnyUseOfValue(1) &&
+           (RLD->isPredecessorOf(CondLHS) || RLD->isPredecessorOf(CondRHS))))
+        return false;
+
+      Addr = DAG.getNode(ISD::SELECT_CC, TheSelect->getDebugLoc(),
+                         LLD->getBasePtr().getValueType(),
+                         TheSelect->getOperand(0),
+                         TheSelect->getOperand(1),
+                         LLD->getBasePtr(), RLD->getBasePtr(),
+                         TheSelect->getOperand(4));
+    }
+
+    SDValue Load;
+    if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
+      Load = DAG.getLoad(TheSelect->getValueType(0),
+                         TheSelect->getDebugLoc(),
+                         // FIXME: Discards pointer info.
+                         LLD->getChain(), Addr, MachinePointerInfo(),
+                         LLD->isVolatile(), LLD->isNonTemporal(),
+                         LLD->isInvariant(), LLD->getAlignment());
+    } else {
+      Load = DAG.getExtLoad(LLD->getExtensionType() == ISD::EXTLOAD ?
+                            RLD->getExtensionType() : LLD->getExtensionType(),
+                            TheSelect->getDebugLoc(),
+                            TheSelect->getValueType(0),
+                            // FIXME: Discards pointer info.
+                            LLD->getChain(), Addr, MachinePointerInfo(),
+                            LLD->getMemoryVT(), LLD->isVolatile(),
+                            LLD->isNonTemporal(), LLD->getAlignment());
+    }
+
+    // Users of the select now use the result of the load.
+    CombineTo(TheSelect, Load);
+
+    // Users of the old loads now use the new load's chain.  We know the
+    // old-load value is dead now.
+    CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
+    CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
+    return true;
+  }
+
+  return false;
+}
+
+/// SimplifySelectCC - Simplify an expression of the form (N0 cond N1) ? N2 : N3
+/// where 'cond' is the comparison specified by CC.
+SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
+                                      SDValue N2, SDValue N3,
+                                      ISD::CondCode CC, bool NotExtCompare) {
+  // (x ? y : y) -> y.
+  if (N2 == N3) return N2;
+
+  EVT VT = N2.getValueType();
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+  ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3.getNode());
+
+  // Determine if the condition we're dealing with is constant
+  SDValue SCC = SimplifySetCC(TLI.getSetCCResultType(N0.getValueType()),
+                              N0, N1, CC, DL, false);
+  if (SCC.getNode()) AddToWorkList(SCC.getNode());
+  ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode());
+
+  // fold select_cc true, x, y -> x
+  if (SCCC && !SCCC->isNullValue())
+    return N2;
+  // fold select_cc false, x, y -> y
+  if (SCCC && SCCC->isNullValue())
+    return N3;
+
+  // Check to see if we can simplify the select into an fabs node
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1)) {
+    // Allow either -0.0 or 0.0
+    if (CFP->getValueAPF().isZero()) {
+      // select (setg[te] X, +/-0.0), X, fneg(X) -> fabs
+      if ((CC == ISD::SETGE || CC == ISD::SETGT) &&
+          N0 == N2 && N3.getOpcode() == ISD::FNEG &&
+          N2 == N3.getOperand(0))
+        return DAG.getNode(ISD::FABS, DL, VT, N0);
+
+      // select (setl[te] X, +/-0.0), fneg(X), X -> fabs
+      if ((CC == ISD::SETLT || CC == ISD::SETLE) &&
+          N0 == N3 && N2.getOpcode() == ISD::FNEG &&
+          N2.getOperand(0) == N3)
+        return DAG.getNode(ISD::FABS, DL, VT, N3);
+    }
+  }
+
+  // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
+  // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
+  // in it.  This is a win when the constant is not otherwise available because
+  // it replaces two constant pool loads with one.  We only do this if the FP
+  // type is known to be legal, because if it isn't, then we are before legalize
+  // types an we want the other legalization to happen first (e.g. to avoid
+  // messing with soft float) and if the ConstantFP is not legal, because if
+  // it is legal, we may not need to store the FP constant in a constant pool.
+  if (ConstantFPSDNode *TV = dyn_cast<ConstantFPSDNode>(N2))
+    if (ConstantFPSDNode *FV = dyn_cast<ConstantFPSDNode>(N3)) {
+      if (TLI.isTypeLegal(N2.getValueType()) &&
+          (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) !=
+           TargetLowering::Legal) &&
+          // If both constants have multiple uses, then we won't need to do an
+          // extra load, they are likely around in registers for other users.
+          (TV->hasOneUse() || FV->hasOneUse())) {
+        Constant *Elts[] = {
+          const_cast<ConstantFP*>(FV->getConstantFPValue()),
+          const_cast<ConstantFP*>(TV->getConstantFPValue())
+        };
+        Type *FPTy = Elts[0]->getType();
+        const DataLayout &TD = *TLI.getDataLayout();
+
+        // Create a ConstantArray of the two constants.
+        Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
+        SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(),
+                                            TD.getPrefTypeAlignment(FPTy));
+        unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+
+        // Get the offsets to the 0 and 1 element of the array so that we can
+        // select between them.
+        SDValue Zero = DAG.getIntPtrConstant(0);
+        unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
+        SDValue One = DAG.getIntPtrConstant(EltSize);
+
+        SDValue Cond = DAG.getSetCC(DL,
+                                    TLI.getSetCCResultType(N0.getValueType()),
+                                    N0, N1, CC);
+        AddToWorkList(Cond.getNode());
+        SDValue CstOffset = DAG.getNode(ISD::SELECT, DL, Zero.getValueType(),
+                                        Cond, One, Zero);
+        AddToWorkList(CstOffset.getNode());
+        CPIdx = DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), CPIdx,
+                            CstOffset);
+        AddToWorkList(CPIdx.getNode());
+        return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
+                           MachinePointerInfo::getConstantPool(), false,
+                           false, false, Alignment);
+
+      }
+    }
+
+  // Check to see if we can perform the "gzip trick", transforming
+  // (select_cc setlt X, 0, A, 0) -> (and (sra X, (sub size(X), 1), A)
+  if (N1C && N3C && N3C->isNullValue() && CC == ISD::SETLT &&
+      (N1C->isNullValue() ||                         // (a < 0) ? b : 0
+       (N1C->getAPIntValue() == 1 && N0 == N2))) {   // (a < 1) ? a : 0
+    EVT XType = N0.getValueType();
+    EVT AType = N2.getValueType();
+    if (XType.bitsGE(AType)) {
+      // and (sra X, size(X)-1, A) -> "and (srl X, C2), A" iff A is a
+      // single-bit constant.
+      if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue()-1)) == 0)) {
+        unsigned ShCtV = N2C->getAPIntValue().logBase2();
+        ShCtV = XType.getSizeInBits()-ShCtV-1;
+        SDValue ShCt = DAG.getConstant(ShCtV,
+                                       getShiftAmountTy(N0.getValueType()));
+        SDValue Shift = DAG.getNode(ISD::SRL, N0.getDebugLoc(),
+                                    XType, N0, ShCt);
+        AddToWorkList(Shift.getNode());
+
+        if (XType.bitsGT(AType)) {
+          Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+          AddToWorkList(Shift.getNode());
+        }
+
+        return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+      }
+
+      SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(),
+                                  XType, N0,
+                                  DAG.getConstant(XType.getSizeInBits()-1,
+                                         getShiftAmountTy(N0.getValueType())));
+      AddToWorkList(Shift.getNode());
+
+      if (XType.bitsGT(AType)) {
+        Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+        AddToWorkList(Shift.getNode());
+      }
+
+      return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+    }
+  }
+
+  // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
+  // where y is has a single bit set.
+  // A plaintext description would be, we can turn the SELECT_CC into an AND
+  // when the condition can be materialized as an all-ones register.  Any
+  // single bit-test can be materialized as an all-ones register with
+  // shift-left and shift-right-arith.
+  if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
+      N0->getValueType(0) == VT &&
+      N1C && N1C->isNullValue() &&
+      N2C && N2C->isNullValue()) {
+    SDValue AndLHS = N0->getOperand(0);
+    ConstantSDNode *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
+      // Shift the tested bit over the sign bit.
+      APInt AndMask = ConstAndRHS->getAPIntValue();
+      SDValue ShlAmt =
+        DAG.getConstant(AndMask.countLeadingZeros(),
+                        getShiftAmountTy(AndLHS.getValueType()));
+      SDValue Shl = DAG.getNode(ISD::SHL, N0.getDebugLoc(), VT, AndLHS, ShlAmt);
+
+      // Now arithmetic right shift it all the way over, so the result is either
+      // all-ones, or zero.
+      SDValue ShrAmt =
+        DAG.getConstant(AndMask.getBitWidth()-1,
+                        getShiftAmountTy(Shl.getValueType()));
+      SDValue Shr = DAG.getNode(ISD::SRA, N0.getDebugLoc(), VT, Shl, ShrAmt);
+
+      return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
+    }
+  }
+
+  // fold select C, 16, 0 -> shl C, 4
+  if (N2C && N3C && N3C->isNullValue() && N2C->getAPIntValue().isPowerOf2() &&
+    TLI.getBooleanContents(N0.getValueType().isVector()) ==
+      TargetLowering::ZeroOrOneBooleanContent) {
+
+    // If the caller doesn't want us to simplify this into a zext of a compare,
+    // don't do it.
+    if (NotExtCompare && N2C->getAPIntValue() == 1)
+      return SDValue();
+
+    // Get a SetCC of the condition
+    // NOTE: Don't create a SETCC if it's not legal on this target.
+    if (!LegalOperations ||
+        TLI.isOperationLegal(ISD::SETCC,
+          LegalTypes ? TLI.getSetCCResultType(N0.getValueType()) : MVT::i1)) {
+      SDValue Temp, SCC;
+      // cast from setcc result type to select result type
+      if (LegalTypes) {
+        SCC  = DAG.getSetCC(DL, TLI.getSetCCResultType(N0.getValueType()),
+                            N0, N1, CC);
+        if (N2.getValueType().bitsLT(SCC.getValueType()))
+          Temp = DAG.getZeroExtendInReg(SCC, N2.getDebugLoc(),
+                                        N2.getValueType());
+        else
+          Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
+                             N2.getValueType(), SCC);
+      } else {
+        SCC  = DAG.getSetCC(N0.getDebugLoc(), MVT::i1, N0, N1, CC);
+        Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
+                           N2.getValueType(), SCC);
+      }
+
+      AddToWorkList(SCC.getNode());
+      AddToWorkList(Temp.getNode());
+
+      if (N2C->getAPIntValue() == 1)
+        return Temp;
+
+      // shl setcc result by log2 n2c
+      return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
+                         DAG.getConstant(N2C->getAPIntValue().logBase2(),
+                                         getShiftAmountTy(Temp.getValueType())));
+    }
+  }
+
+  // Check to see if this is the equivalent of setcc
+  // FIXME: Turn all of these into setcc if setcc if setcc is legal
+  // otherwise, go ahead with the folds.
+  if (0 && N3C && N3C->isNullValue() && N2C && (N2C->getAPIntValue() == 1ULL)) {
+    EVT XType = N0.getValueType();
+    if (!LegalOperations ||
+        TLI.isOperationLegal(ISD::SETCC, TLI.getSetCCResultType(XType))) {
+      SDValue Res = DAG.getSetCC(DL, TLI.getSetCCResultType(XType), N0, N1, CC);
+      if (Res.getValueType() != VT)
+        Res = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res);
+      return Res;
+    }
+
+    // fold (seteq X, 0) -> (srl (ctlz X, log2(size(X))))
+    if (N1C && N1C->isNullValue() && CC == ISD::SETEQ &&
+        (!LegalOperations ||
+         TLI.isOperationLegal(ISD::CTLZ, XType))) {
+      SDValue Ctlz = DAG.getNode(ISD::CTLZ, N0.getDebugLoc(), XType, N0);
+      return DAG.getNode(ISD::SRL, DL, XType, Ctlz,
+                         DAG.getConstant(Log2_32(XType.getSizeInBits()),
+                                       getShiftAmountTy(Ctlz.getValueType())));
+    }
+    // fold (setgt X, 0) -> (srl (and (-X, ~X), size(X)-1))
+    if (N1C && N1C->isNullValue() && CC == ISD::SETGT) {
+      SDValue NegN0 = DAG.getNode(ISD::SUB, N0.getDebugLoc(),
+                                  XType, DAG.getConstant(0, XType), N0);
+      SDValue NotN0 = DAG.getNOT(N0.getDebugLoc(), N0, XType);
+      return DAG.getNode(ISD::SRL, DL, XType,
+                         DAG.getNode(ISD::AND, DL, XType, NegN0, NotN0),
+                         DAG.getConstant(XType.getSizeInBits()-1,
+                                         getShiftAmountTy(XType)));
+    }
+    // fold (setgt X, -1) -> (xor (srl (X, size(X)-1), 1))
+    if (N1C && N1C->isAllOnesValue() && CC == ISD::SETGT) {
+      SDValue Sign = DAG.getNode(ISD::SRL, N0.getDebugLoc(), XType, N0,
+                                 DAG.getConstant(XType.getSizeInBits()-1,
+                                         getShiftAmountTy(N0.getValueType())));
+      return DAG.getNode(ISD::XOR, DL, XType, Sign, DAG.getConstant(1, XType));
+    }
+  }
+
+  // Check to see if this is an integer abs.
+  // select_cc setg[te] X,  0,  X, -X ->
+  // select_cc setgt    X, -1,  X, -X ->
+  // select_cc setl[te] X,  0, -X,  X ->
+  // select_cc setlt    X,  1, -X,  X ->
+  // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
+  if (N1C) {
+    ConstantSDNode *SubC = NULL;
+    if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
+         (N1C->isAllOnesValue() && CC == ISD::SETGT)) &&
+        N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1))
+      SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0));
+    else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) ||
+              (N1C->isOne() && CC == ISD::SETLT)) &&
+             N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1))
+      SubC = dyn_cast<ConstantSDNode>(N2.getOperand(0));
+
+    EVT XType = N0.getValueType();
+    if (SubC && SubC->isNullValue() && XType.isInteger()) {
+      SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType,
+                                  N0,
+                                  DAG.getConstant(XType.getSizeInBits()-1,
+                                         getShiftAmountTy(N0.getValueType())));
+      SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(),
+                                XType, N0, Shift);
+      AddToWorkList(Shift.getNode());
+      AddToWorkList(Add.getNode());
+      return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
+    }
+  }
+
+  return SDValue();
+}
+
+/// SimplifySetCC - This is a stub for TargetLowering::SimplifySetCC.
+SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0,
+                                   SDValue N1, ISD::CondCode Cond,
+                                   DebugLoc DL, bool foldBooleans) {
+  TargetLowering::DAGCombinerInfo
+    DagCombineInfo(DAG, Level, false, this);
+  return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
+}
+
+/// BuildSDIVSequence - Given an ISD::SDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.  See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+SDValue DAGCombiner::BuildSDIV(SDNode *N) {
+  std::vector<SDNode*> Built;
+  SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, &Built);
+
+  for (std::vector<SDNode*>::iterator ii = Built.begin(), ee = Built.end();
+       ii != ee; ++ii)
+    AddToWorkList(*ii);
+  return S;
+}
+
+/// BuildUDIVSequence - Given an ISD::UDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.  See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+SDValue DAGCombiner::BuildUDIV(SDNode *N) {
+  std::vector<SDNode*> Built;
+  SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, &Built);
+
+  for (std::vector<SDNode*>::iterator ii = Built.begin(), ee = Built.end();
+       ii != ee; ++ii)
+    AddToWorkList(*ii);
+  return S;
+}
+
+/// FindBaseOffset - Return true if base is a frame index, which is known not
+// to alias with anything but itself.  Provides base object and offset as
+// results.
+static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
+                           const GlobalValue *&GV, const void *&CV) {
+  // Assume it is a primitive operation.
+  Base = Ptr; Offset = 0; GV = 0; CV = 0;
+
+  // If it's an adding a simple constant then integrate the offset.
+  if (Base.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
+      Base = Base.getOperand(0);
+      Offset += C->getZExtValue();
+    }
+  }
+
+  // Return the underlying GlobalValue, and update the Offset.  Return false
+  // for GlobalAddressSDNode since the same GlobalAddress may be represented
+  // by multiple nodes with different offsets.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Base)) {
+    GV = G->getGlobal();
+    Offset += G->getOffset();
+    return false;
+  }
+
+  // Return the underlying Constant value, and update the Offset.  Return false
+  // for ConstantSDNodes since the same constant pool entry may be represented
+  // by multiple nodes with different offsets.
+  if (ConstantPoolSDNode *C = dyn_cast<ConstantPoolSDNode>(Base)) {
+    CV = C->isMachineConstantPoolEntry() ? (const void *)C->getMachineCPVal()
+                                         : (const void *)C->getConstVal();
+    Offset += C->getOffset();
+    return false;
+  }
+  // If it's any of the following then it can't alias with anything but itself.
+  return isa<FrameIndexSDNode>(Base);
+}
+
+/// isAlias - Return true if there is any possibility that the two addresses
+/// overlap.
+bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
+                          const Value *SrcValue1, int SrcValueOffset1,
+                          unsigned SrcValueAlign1,
+                          const MDNode *TBAAInfo1,
+                          SDValue Ptr2, int64_t Size2,
+                          const Value *SrcValue2, int SrcValueOffset2,
+                          unsigned SrcValueAlign2,
+                          const MDNode *TBAAInfo2) const {
+  // If they are the same then they must be aliases.
+  if (Ptr1 == Ptr2) return true;
+
+  // Gather base node and offset information.
+  SDValue Base1, Base2;
+  int64_t Offset1, Offset2;
+  const GlobalValue *GV1, *GV2;
+  const void *CV1, *CV2;
+  bool isFrameIndex1 = FindBaseOffset(Ptr1, Base1, Offset1, GV1, CV1);
+  bool isFrameIndex2 = FindBaseOffset(Ptr2, Base2, Offset2, GV2, CV2);
+
+  // If they have a same base address then check to see if they overlap.
+  if (Base1 == Base2 || (GV1 && (GV1 == GV2)) || (CV1 && (CV1 == CV2)))
+    return !((Offset1 + Size1) <= Offset2 || (Offset2 + Size2) <= Offset1);
+
+  // It is possible for different frame indices to alias each other, mostly
+  // when tail call optimization reuses return address slots for arguments.
+  // To catch this case, look up the actual index of frame indices to compute
+  // the real alias relationship.
+  if (isFrameIndex1 && isFrameIndex2) {
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    Offset1 += MFI->getObjectOffset(cast<FrameIndexSDNode>(Base1)->getIndex());
+    Offset2 += MFI->getObjectOffset(cast<FrameIndexSDNode>(Base2)->getIndex());
+    return !((Offset1 + Size1) <= Offset2 || (Offset2 + Size2) <= Offset1);
+  }
+
+  // Otherwise, if we know what the bases are, and they aren't identical, then
+  // we know they cannot alias.
+  if ((isFrameIndex1 || CV1 || GV1) && (isFrameIndex2 || CV2 || GV2))
+    return false;
+
+  // If we know required SrcValue1 and SrcValue2 have relatively large alignment
+  // compared to the size and offset of the access, we may be able to prove they
+  // do not alias.  This check is conservative for now to catch cases created by
+  // splitting vector types.
+  if ((SrcValueAlign1 == SrcValueAlign2) &&
+      (SrcValueOffset1 != SrcValueOffset2) &&
+      (Size1 == Size2) && (SrcValueAlign1 > Size1)) {
+    int64_t OffAlign1 = SrcValueOffset1 % SrcValueAlign1;
+    int64_t OffAlign2 = SrcValueOffset2 % SrcValueAlign1;
+
+    // There is no overlap between these relatively aligned accesses of similar
+    // size, return no alias.
+    if ((OffAlign1 + Size1) <= OffAlign2 || (OffAlign2 + Size2) <= OffAlign1)
+      return false;
+  }
+
+  if (CombinerGlobalAA) {
+    // Use alias analysis information.
+    int64_t MinOffset = std::min(SrcValueOffset1, SrcValueOffset2);
+    int64_t Overlap1 = Size1 + SrcValueOffset1 - MinOffset;
+    int64_t Overlap2 = Size2 + SrcValueOffset2 - MinOffset;
+    AliasAnalysis::AliasResult AAResult =
+      AA.alias(AliasAnalysis::Location(SrcValue1, Overlap1, TBAAInfo1),
+               AliasAnalysis::Location(SrcValue2, Overlap2, TBAAInfo2));
+    if (AAResult == AliasAnalysis::NoAlias)
+      return false;
+  }
+
+  // Otherwise we have to assume they alias.
+  return true;
+}
+
+bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) {
+  SDValue Ptr0, Ptr1;
+  int64_t Size0, Size1;
+  const Value *SrcValue0, *SrcValue1;
+  int SrcValueOffset0, SrcValueOffset1;
+  unsigned SrcValueAlign0, SrcValueAlign1;
+  const MDNode *SrcTBAAInfo0, *SrcTBAAInfo1;
+  FindAliasInfo(Op0, Ptr0, Size0, SrcValue0, SrcValueOffset0,
+                SrcValueAlign0, SrcTBAAInfo0);
+  FindAliasInfo(Op1, Ptr1, Size1, SrcValue1, SrcValueOffset1,
+                SrcValueAlign1, SrcTBAAInfo1);
+  return isAlias(Ptr0, Size0, SrcValue0, SrcValueOffset0,
+                 SrcValueAlign0, SrcTBAAInfo0,
+                 Ptr1, Size1, SrcValue1, SrcValueOffset1,
+                 SrcValueAlign1, SrcTBAAInfo1);
+}
+
+/// FindAliasInfo - Extracts the relevant alias information from the memory
+/// node.  Returns true if the operand was a load.
+bool DAGCombiner::FindAliasInfo(SDNode *N,
+                                SDValue &Ptr, int64_t &Size,
+                                const Value *&SrcValue,
+                                int &SrcValueOffset,
+                                unsigned &SrcValueAlign,
+                                const MDNode *&TBAAInfo) const {
+  LSBaseSDNode *LS = cast<LSBaseSDNode>(N);
+
+  Ptr = LS->getBasePtr();
+  Size = LS->getMemoryVT().getSizeInBits() >> 3;
+  SrcValue = LS->getSrcValue();
+  SrcValueOffset = LS->getSrcValueOffset();
+  SrcValueAlign = LS->getOriginalAlignment();
+  TBAAInfo = LS->getTBAAInfo();
+  return isa<LoadSDNode>(LS);
+}
+
+/// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
+/// looking for aliasing nodes and adding them to the Aliases vector.
+void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
+                                   SmallVector<SDValue, 8> &Aliases) {
+  SmallVector<SDValue, 8> Chains;     // List of chains to visit.
+  SmallPtrSet<SDNode *, 16> Visited;  // Visited node set.
+
+  // Get alias information for node.
+  SDValue Ptr;
+  int64_t Size;
+  const Value *SrcValue;
+  int SrcValueOffset;
+  unsigned SrcValueAlign;
+  const MDNode *SrcTBAAInfo;
+  bool IsLoad = FindAliasInfo(N, Ptr, Size, SrcValue, SrcValueOffset,
+                              SrcValueAlign, SrcTBAAInfo);
+
+  // Starting off.
+  Chains.push_back(OriginalChain);
+  unsigned Depth = 0;
+
+  // Look at each chain and determine if it is an alias.  If so, add it to the
+  // aliases list.  If not, then continue up the chain looking for the next
+  // candidate.
+  while (!Chains.empty()) {
+    SDValue Chain = Chains.back();
+    Chains.pop_back();
+
+    // For TokenFactor nodes, look at each operand and only continue up the
+    // chain until we find two aliases.  If we've seen two aliases, assume we'll
+    // find more and revert to original chain since the xform is unlikely to be
+    // profitable.
+    //
+    // FIXME: The depth check could be made to return the last non-aliasing
+    // chain we found before we hit a tokenfactor rather than the original
+    // chain.
+    if (Depth > 6 || Aliases.size() == 2) {
+      Aliases.clear();
+      Aliases.push_back(OriginalChain);
+      break;
+    }
+
+    // Don't bother if we've been before.
+    if (!Visited.insert(Chain.getNode()))
+      continue;
+
+    switch (Chain.getOpcode()) {
+    case ISD::EntryToken:
+      // Entry token is ideal chain operand, but handled in FindBetterChain.
+      break;
+
+    case ISD::LOAD:
+    case ISD::STORE: {
+      // Get alias information for Chain.
+      SDValue OpPtr;
+      int64_t OpSize;
+      const Value *OpSrcValue;
+      int OpSrcValueOffset;
+      unsigned OpSrcValueAlign;
+      const MDNode *OpSrcTBAAInfo;
+      bool IsOpLoad = FindAliasInfo(Chain.getNode(), OpPtr, OpSize,
+                                    OpSrcValue, OpSrcValueOffset,
+                                    OpSrcValueAlign,
+                                    OpSrcTBAAInfo);
+
+      // If chain is alias then stop here.
+      if (!(IsLoad && IsOpLoad) &&
+          isAlias(Ptr, Size, SrcValue, SrcValueOffset, SrcValueAlign,
+                  SrcTBAAInfo,
+                  OpPtr, OpSize, OpSrcValue, OpSrcValueOffset,
+                  OpSrcValueAlign, OpSrcTBAAInfo)) {
+        Aliases.push_back(Chain);
+      } else {
+        // Look further up the chain.
+        Chains.push_back(Chain.getOperand(0));
+        ++Depth;
+      }
+      break;
+    }
+
+    case ISD::TokenFactor:
+      // We have to check each of the operands of the token factor for "small"
+      // token factors, so we queue them up.  Adding the operands to the queue
+      // (stack) in reverse order maintains the original order and increases the
+      // likelihood that getNode will find a matching token factor (CSE.)
+      if (Chain.getNumOperands() > 16) {
+        Aliases.push_back(Chain);
+        break;
+      }
+      for (unsigned n = Chain.getNumOperands(); n;)
+        Chains.push_back(Chain.getOperand(--n));
+      ++Depth;
+      break;
+
+    default:
+      // For all other instructions we will just have to take what we can get.
+      Aliases.push_back(Chain);
+      break;
+    }
+  }
+}
+
+/// FindBetterChain - Walk up chain skipping non-aliasing memory nodes, looking
+/// for a better chain (aliasing node.)
+SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
+  SmallVector<SDValue, 8> Aliases;  // Ops for replacing token factor.
+
+  // Accumulate all the aliases to this node.
+  GatherAllAliases(N, OldChain, Aliases);
+
+  // If no operands then chain to entry token.
+  if (Aliases.size() == 0)
+    return DAG.getEntryNode();
+
+  // If a single operand then chain to it.  We don't need to revisit it.
+  if (Aliases.size() == 1)
+    return Aliases[0];
+
+  // Construct a custom tailored token factor.
+  return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other,
+                     &Aliases[0], Aliases.size());
+}
+
+// SelectionDAG::Combine - This is the entry point for the file.
+//
+void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA,
+                           CodeGenOpt::Level OptLevel) {
+  /// run - This is the main entry point to this class.
+  ///
+  DAGCombiner(*this, AA, OptLevel).Run(Level);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
new file mode 100644
index 000000000000..9ac738e50726
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -0,0 +1,1507 @@
+//===-- FastISel.cpp - Implementation of the FastISel class ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the FastISel class.
+//
+// "Fast" instruction selection is designed to emit very poor code quickly.
+// Also, it is not designed to be able to do much lowering, so most illegal
+// types (e.g. i64 on 32-bit targets) and operations are not supported.  It is
+// also not intended to be able to do much optimization, except in a few cases
+// where doing optimizations reduces overall compile time.  For example, folding
+// constants into immediate fields is often done, because it's cheap and it
+// reduces the number of instructions later phases have to examine.
+//
+// "Fast" instruction selection is able to fail gracefully and transfer
+// control to the SelectionDAG selector for operations that it doesn't
+// support.  In many cases, this allows us to avoid duplicating a lot of
+// the complicated lowering logic that SelectionDAG currently has.
+//
+// The intended use for "fast" instruction selection is "-O0" mode
+// compilation, where the quality of the generated code is irrelevant when
+// weighed against the speed at which the code can be generated.  Also,
+// at -O0, the LLVM optimizers are not running, and this makes the
+// compile time of codegen a much higher portion of the overall compile
+// time.  Despite its limitations, "fast" instruction selection is able to
+// handle enough code on its own to provide noticeable overall speedups
+// in -O0 compiles.
+//
+// Basic operations are supported in a target-independent way, by reading
+// the same instruction descriptions that the SelectionDAG selector reads,
+// and identifying simple arithmetic operations that can be directly selected
+// from simple operators.  More complicated operations currently require
+// target-specific code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "isel"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+STATISTIC(NumFastIselSuccessIndependent, "Number of insts selected by "
+          "target-independent selector");
+STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "
+          "target-specific selector");
+STATISTIC(NumFastIselDead, "Number of dead insts removed on failure");
+
+/// startNewBlock - Set the current block to which generated machine
+/// instructions will be appended, and clear the local CSE map.
+///
+void FastISel::startNewBlock() {
+  LocalValueMap.clear();
+
+  EmitStartPt = 0;
+
+  // Advance the emit start point past any EH_LABEL instructions.
+  MachineBasicBlock::iterator
+    I = FuncInfo.MBB->begin(), E = FuncInfo.MBB->end();
+  while (I != E && I->getOpcode() == TargetOpcode::EH_LABEL) {
+    EmitStartPt = I;
+    ++I;
+  }
+  LastLocalValue = EmitStartPt;
+}
+
+bool FastISel::LowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    // Fallback to SDISel argument lowering code to deal with sret pointer
+    // parameter.
+    return false;
+  
+  if (!FastLowerArguments())
+    return false;
+
+  // Enter non-dead arguments into ValueMap for uses in non-entry BBs.
+  for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(),
+         E = FuncInfo.Fn->arg_end(); I != E; ++I) {
+    if (!I->use_empty()) {
+      DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(I);
+      assert(VI != LocalValueMap.end() && "Missed an argument?");
+      FuncInfo.ValueMap[I] = VI->second;
+    }
+  }
+  return true;
+}
+
+void FastISel::flushLocalValueMap() {
+  LocalValueMap.clear();
+  LastLocalValue = EmitStartPt;
+  recomputeInsertPt();
+}
+
+bool FastISel::hasTrivialKill(const Value *V) const {
+  // Don't consider constants or arguments to have trivial kills.
+  const Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  // No-op casts are trivially coalesced by fast-isel.
+  if (const CastInst *Cast = dyn_cast<CastInst>(I))
+    if (Cast->isNoopCast(TD.getIntPtrType(Cast->getContext())) &&
+        !hasTrivialKill(Cast->getOperand(0)))
+      return false;
+
+  // GEPs with all zero indices are trivially coalesced by fast-isel.
+  if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+    if (GEP->hasAllZeroIndices() && !hasTrivialKill(GEP->getOperand(0)))
+      return false;
+
+  // Only instructions with a single use in the same basic block are considered
+  // to have trivial kills.
+  return I->hasOneUse() &&
+         !(I->getOpcode() == Instruction::BitCast ||
+           I->getOpcode() == Instruction::PtrToInt ||
+           I->getOpcode() == Instruction::IntToPtr) &&
+         cast<Instruction>(*I->use_begin())->getParent() == I->getParent();
+}
+
+unsigned FastISel::getRegForValue(const Value *V) {
+  EVT RealVT = TLI.getValueType(V->getType(), /*AllowUnknown=*/true);
+  // Don't handle non-simple values in FastISel.
+  if (!RealVT.isSimple())
+    return 0;
+
+  // Ignore illegal types. We must do this before looking up the value
+  // in ValueMap because Arguments are given virtual registers regardless
+  // of whether FastISel can handle them.
+  MVT VT = RealVT.getSimpleVT();
+  if (!TLI.isTypeLegal(VT)) {
+    // Handle integer promotions, though, because they're common and easy.
+    if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+      VT = TLI.getTypeToTransformTo(V->getContext(), VT).getSimpleVT();
+    else
+      return 0;
+  }
+
+  // Look up the value to see if we already have a register for it.
+  unsigned Reg = lookUpRegForValue(V);
+  if (Reg != 0)
+    return Reg;
+
+  // In bottom-up mode, just create the virtual register which will be used
+  // to hold the value. It will be materialized later.
+  if (isa<Instruction>(V) &&
+      (!isa<AllocaInst>(V) ||
+       !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(V))))
+    return FuncInfo.InitializeRegForValue(V);
+
+  SavePoint SaveInsertPt = enterLocalValueArea();
+
+  // Materialize the value in a register. Emit any instructions in the
+  // local value area.
+  Reg = materializeRegForValue(V, VT);
+
+  leaveLocalValueArea(SaveInsertPt);
+
+  return Reg;
+}
+
+/// materializeRegForValue - Helper for getRegForValue. This function is
+/// called when the value isn't already available in a register and must
+/// be materialized with new instructions.
+unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) {
+  unsigned Reg = 0;
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    if (CI->getValue().getActiveBits() <= 64)
+      Reg = FastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+  } else if (isa<AllocaInst>(V)) {
+    Reg = TargetMaterializeAlloca(cast<AllocaInst>(V));
+  } else if (isa<ConstantPointerNull>(V)) {
+    // Translate this as an integer zero so that it can be
+    // local-CSE'd with actual integer zeros.
+    Reg =
+      getRegForValue(Constant::getNullValue(TD.getIntPtrType(V->getContext())));
+  } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+    if (CF->isNullValue()) {
+      Reg = TargetMaterializeFloatZero(CF);
+    } else {
+      // Try to emit the constant directly.
+      Reg = FastEmit_f(VT, VT, ISD::ConstantFP, CF);
+    }
+
+    if (!Reg) {
+      // Try to emit the constant by using an integer constant with a cast.
+      const APFloat &Flt = CF->getValueAPF();
+      EVT IntVT = TLI.getPointerTy();
+
+      uint64_t x[2];
+      uint32_t IntBitWidth = IntVT.getSizeInBits();
+      bool isExact;
+      (void) Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
+                                  APFloat::rmTowardZero, &isExact);
+      if (isExact) {
+        APInt IntVal(IntBitWidth, x);
+
+        unsigned IntegerReg =
+          getRegForValue(ConstantInt::get(V->getContext(), IntVal));
+        if (IntegerReg != 0)
+          Reg = FastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP,
+                           IntegerReg, /*Kill=*/false);
+      }
+    }
+  } else if (const Operator *Op = dyn_cast<Operator>(V)) {
+    if (!SelectOperator(Op, Op->getOpcode()))
+      if (!isa<Instruction>(Op) ||
+          !TargetSelectInstruction(cast<Instruction>(Op)))
+        return 0;
+    Reg = lookUpRegForValue(Op);
+  } else if (isa<UndefValue>(V)) {
+    Reg = createResultReg(TLI.getRegClassFor(VT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(TargetOpcode::IMPLICIT_DEF), Reg);
+  }
+
+  // If target-independent code couldn't handle the value, give target-specific
+  // code a try.
+  if (!Reg && isa<Constant>(V))
+    Reg = TargetMaterializeConstant(cast<Constant>(V));
+
+  // Don't cache constant materializations in the general ValueMap.
+  // To do so would require tracking what uses they dominate.
+  if (Reg != 0) {
+    LocalValueMap[V] = Reg;
+    LastLocalValue = MRI.getVRegDef(Reg);
+  }
+  return Reg;
+}
+
+unsigned FastISel::lookUpRegForValue(const Value *V) {
+  // Look up the value to see if we already have a register for it. We
+  // cache values defined by Instructions across blocks, and other values
+  // only locally. This is because Instructions already have the SSA
+  // def-dominates-use requirement enforced.
+  DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(V);
+  if (I != FuncInfo.ValueMap.end())
+    return I->second;
+  return LocalValueMap[V];
+}
+
+/// UpdateValueMap - Update the value map to include the new mapping for this
+/// instruction, or insert an extra copy to get the result in a previous
+/// determined register.
+/// NOTE: This is only necessary because we might select a block that uses
+/// a value before we select the block that defines the value.  It might be
+/// possible to fix this by selecting blocks in reverse postorder.
+void FastISel::UpdateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) {
+  if (!isa<Instruction>(I)) {
+    LocalValueMap[I] = Reg;
+    return;
+  }
+
+  unsigned &AssignedReg = FuncInfo.ValueMap[I];
+  if (AssignedReg == 0)
+    // Use the new register.
+    AssignedReg = Reg;
+  else if (Reg != AssignedReg) {
+    // Arrange for uses of AssignedReg to be replaced by uses of Reg.
+    for (unsigned i = 0; i < NumRegs; i++)
+      FuncInfo.RegFixups[AssignedReg+i] = Reg+i;
+
+    AssignedReg = Reg;
+  }
+}
+
+std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) {
+  unsigned IdxN = getRegForValue(Idx);
+  if (IdxN == 0)
+    // Unhandled operand. Halt "fast" selection and bail.
+    return std::pair<unsigned, bool>(0, false);
+
+  bool IdxNIsKill = hasTrivialKill(Idx);
+
+  // If the index is smaller or larger than intptr_t, truncate or extend it.
+  MVT PtrVT = TLI.getPointerTy();
+  EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
+  if (IdxVT.bitsLT(PtrVT)) {
+    IdxN = FastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND,
+                      IdxN, IdxNIsKill);
+    IdxNIsKill = true;
+  }
+  else if (IdxVT.bitsGT(PtrVT)) {
+    IdxN = FastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::TRUNCATE,
+                      IdxN, IdxNIsKill);
+    IdxNIsKill = true;
+  }
+  return std::pair<unsigned, bool>(IdxN, IdxNIsKill);
+}
+
+void FastISel::recomputeInsertPt() {
+  if (getLastLocalValue()) {
+    FuncInfo.InsertPt = getLastLocalValue();
+    FuncInfo.MBB = FuncInfo.InsertPt->getParent();
+    ++FuncInfo.InsertPt;
+  } else
+    FuncInfo.InsertPt = FuncInfo.MBB->getFirstNonPHI();
+
+  // Now skip past any EH_LABELs, which must remain at the beginning.
+  while (FuncInfo.InsertPt != FuncInfo.MBB->end() &&
+         FuncInfo.InsertPt->getOpcode() == TargetOpcode::EH_LABEL)
+    ++FuncInfo.InsertPt;
+}
+
+void FastISel::removeDeadCode(MachineBasicBlock::iterator I,
+                              MachineBasicBlock::iterator E) {
+  assert (I && E && std::distance(I, E) > 0 && "Invalid iterator!");
+  while (I != E) {
+    MachineInstr *Dead = &*I;
+    ++I;
+    Dead->eraseFromParent();
+    ++NumFastIselDead;
+  }
+  recomputeInsertPt();
+}
+
+FastISel::SavePoint FastISel::enterLocalValueArea() {
+  MachineBasicBlock::iterator OldInsertPt = FuncInfo.InsertPt;
+  DebugLoc OldDL = DL;
+  recomputeInsertPt();
+  DL = DebugLoc();
+  SavePoint SP = { OldInsertPt, OldDL };
+  return SP;
+}
+
+void FastISel::leaveLocalValueArea(SavePoint OldInsertPt) {
+  if (FuncInfo.InsertPt != FuncInfo.MBB->begin())
+    LastLocalValue = llvm::prior(FuncInfo.InsertPt);
+
+  // Restore the previous insert position.
+  FuncInfo.InsertPt = OldInsertPt.InsertPt;
+  DL = OldInsertPt.DL;
+}
+
+/// SelectBinaryOp - Select and emit code for a binary operator instruction,
+/// which has an opcode which directly corresponds to the given ISD opcode.
+///
+bool FastISel::SelectBinaryOp(const User *I, unsigned ISDOpcode) {
+  EVT VT = EVT::getEVT(I->getType(), /*HandleUnknown=*/true);
+  if (VT == MVT::Other || !VT.isSimple())
+    // Unhandled type. Halt "fast" selection and bail.
+    return false;
+
+  // We only handle legal types. For example, on x86-32 the instruction
+  // selector contains all of the 64-bit instructions from x86-64,
+  // under the assumption that i64 won't be used if the target doesn't
+  // support it.
+  if (!TLI.isTypeLegal(VT)) {
+    // MVT::i1 is special. Allow AND, OR, or XOR because they
+    // don't require additional zeroing, which makes them easy.
+    if (VT == MVT::i1 &&
+        (ISDOpcode == ISD::AND || ISDOpcode == ISD::OR ||
+         ISDOpcode == ISD::XOR))
+      VT = TLI.getTypeToTransformTo(I->getContext(), VT);
+    else
+      return false;
+  }
+
+  // Check if the first operand is a constant, and handle it as "ri".  At -O0,
+  // we don't have anything that canonicalizes operand order.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(0)))
+    if (isa<Instruction>(I) && cast<Instruction>(I)->isCommutative()) {
+      unsigned Op1 = getRegForValue(I->getOperand(1));
+      if (Op1 == 0) return false;
+
+      bool Op1IsKill = hasTrivialKill(I->getOperand(1));
+
+      unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1,
+                                        Op1IsKill, CI->getZExtValue(),
+                                        VT.getSimpleVT());
+      if (ResultReg == 0) return false;
+
+      // We successfully emitted code for the given LLVM Instruction.
+      UpdateValueMap(I, ResultReg);
+      return true;
+    }
+
+
+  unsigned Op0 = getRegForValue(I->getOperand(0));
+  if (Op0 == 0)   // Unhandled operand. Halt "fast" selection and bail.
+    return false;
+
+  bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+
+  // Check if the second operand is a constant and handle it appropriately.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+    uint64_t Imm = CI->getZExtValue();
+
+    // Transform "sdiv exact X, 8" -> "sra X, 3".
+    if (ISDOpcode == ISD::SDIV && isa<BinaryOperator>(I) &&
+        cast<BinaryOperator>(I)->isExact() &&
+        isPowerOf2_64(Imm)) {
+      Imm = Log2_64(Imm);
+      ISDOpcode = ISD::SRA;
+    }
+
+    // Transform "urem x, pow2" -> "and x, pow2-1".
+    if (ISDOpcode == ISD::UREM && isa<BinaryOperator>(I) &&
+        isPowerOf2_64(Imm)) {
+      --Imm;
+      ISDOpcode = ISD::AND;
+    }
+
+    unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0,
+                                      Op0IsKill, Imm, VT.getSimpleVT());
+    if (ResultReg == 0) return false;
+
+    // We successfully emitted code for the given LLVM Instruction.
+    UpdateValueMap(I, ResultReg);
+    return true;
+  }
+
+  // Check if the second operand is a constant float.
+  if (ConstantFP *CF = dyn_cast<ConstantFP>(I->getOperand(1))) {
+    unsigned ResultReg = FastEmit_rf(VT.getSimpleVT(), VT.getSimpleVT(),
+                                     ISDOpcode, Op0, Op0IsKill, CF);
+    if (ResultReg != 0) {
+      // We successfully emitted code for the given LLVM Instruction.
+      UpdateValueMap(I, ResultReg);
+      return true;
+    }
+  }
+
+  unsigned Op1 = getRegForValue(I->getOperand(1));
+  if (Op1 == 0)
+    // Unhandled operand. Halt "fast" selection and bail.
+    return false;
+
+  bool Op1IsKill = hasTrivialKill(I->getOperand(1));
+
+  // Now we have both operands in registers. Emit the instruction.
+  unsigned ResultReg = FastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(),
+                                   ISDOpcode,
+                                   Op0, Op0IsKill,
+                                   Op1, Op1IsKill);
+  if (ResultReg == 0)
+    // Target-specific code wasn't able to find a machine opcode for
+    // the given ISD opcode and type. Halt "fast" selection and bail.
+    return false;
+
+  // We successfully emitted code for the given LLVM Instruction.
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool FastISel::SelectGetElementPtr(const User *I) {
+  unsigned N = getRegForValue(I->getOperand(0));
+  if (N == 0)
+    // Unhandled operand. Halt "fast" selection and bail.
+    return false;
+
+  bool NIsKill = hasTrivialKill(I->getOperand(0));
+
+  // Keep a running tab of the total offset to coalesce multiple N = N + Offset
+  // into a single N = N + TotalOffset.
+  uint64_t TotalOffs = 0;
+  // FIXME: What's a good SWAG number for MaxOffs?
+  uint64_t MaxOffs = 2048;
+  Type *Ty = I->getOperand(0)->getType();
+  MVT VT = TLI.getPointerTy();
+  for (GetElementPtrInst::const_op_iterator OI = I->op_begin()+1,
+       E = I->op_end(); OI != E; ++OI) {
+    const Value *Idx = *OI;
+    if (StructType *StTy = dyn_cast<StructType>(Ty)) {
+      unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+      if (Field) {
+        // N = N + Offset
+        TotalOffs += TD.getStructLayout(StTy)->getElementOffset(Field);
+        if (TotalOffs >= MaxOffs) {
+          N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+          if (N == 0)
+            // Unhandled operand. Halt "fast" selection and bail.
+            return false;
+          NIsKill = true;
+          TotalOffs = 0;
+        }
+      }
+      Ty = StTy->getElementType(Field);
+    } else {
+      Ty = cast<SequentialType>(Ty)->getElementType();
+
+      // If this is a constant subscript, handle it quickly.
+      if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
+        if (CI->isZero()) continue;
+        // N = N + Offset
+        TotalOffs +=
+          TD.getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
+        if (TotalOffs >= MaxOffs) {
+          N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+          if (N == 0)
+            // Unhandled operand. Halt "fast" selection and bail.
+            return false;
+          NIsKill = true;
+          TotalOffs = 0;
+        }
+        continue;
+      }
+      if (TotalOffs) {
+        N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+        if (N == 0)
+          // Unhandled operand. Halt "fast" selection and bail.
+          return false;
+        NIsKill = true;
+        TotalOffs = 0;
+      }
+
+      // N = N + Idx * ElementSize;
+      uint64_t ElementSize = TD.getTypeAllocSize(Ty);
+      std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
+      unsigned IdxN = Pair.first;
+      bool IdxNIsKill = Pair.second;
+      if (IdxN == 0)
+        // Unhandled operand. Halt "fast" selection and bail.
+        return false;
+
+      if (ElementSize != 1) {
+        IdxN = FastEmit_ri_(VT, ISD::MUL, IdxN, IdxNIsKill, ElementSize, VT);
+        if (IdxN == 0)
+          // Unhandled operand. Halt "fast" selection and bail.
+          return false;
+        IdxNIsKill = true;
+      }
+      N = FastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
+      if (N == 0)
+        // Unhandled operand. Halt "fast" selection and bail.
+        return false;
+    }
+  }
+  if (TotalOffs) {
+    N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+    if (N == 0)
+      // Unhandled operand. Halt "fast" selection and bail.
+      return false;
+  }
+
+  // We successfully emitted code for the given LLVM Instruction.
+  UpdateValueMap(I, N);
+  return true;
+}
+
+bool FastISel::SelectCall(const User *I) {
+  const CallInst *Call = cast<CallInst>(I);
+
+  // Handle simple inline asms.
+  if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getCalledValue())) {
+    // Don't attempt to handle constraints.
+    if (!IA->getConstraintString().empty())
+      return false;
+
+    unsigned ExtraInfo = 0;
+    if (IA->hasSideEffects())
+      ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+    if (IA->isAlignStack())
+      ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(TargetOpcode::INLINEASM))
+      .addExternalSymbol(IA->getAsmString().c_str())
+      .addImm(ExtraInfo);
+    return true;
+  }
+
+  MachineModuleInfo &MMI = FuncInfo.MF->getMMI();
+  ComputeUsesVAFloatArgument(*Call, &MMI);
+
+  const Function *F = Call->getCalledFunction();
+  if (!F) return false;
+
+  // Handle selected intrinsic function calls.
+  switch (F->getIntrinsicID()) {
+  default: break;
+    // At -O0 we don't care about the lifetime intrinsics.
+  case Intrinsic::lifetime_start:
+  case Intrinsic::lifetime_end:
+    // The donothing intrinsic does, well, nothing.
+  case Intrinsic::donothing:
+    return true;
+
+  case Intrinsic::dbg_declare: {
+    const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call);
+    if (!DIVariable(DI->getVariable()).Verify() ||
+        !FuncInfo.MF->getMMI().hasDebugInfo()) {
+      DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+      return true;
+    }
+
+    const Value *Address = DI->getAddress();
+    if (!Address || isa<UndefValue>(Address)) {
+      DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+      return true;
+    }
+
+    unsigned Reg = 0;
+    unsigned Offset = 0;
+    if (const Argument *Arg = dyn_cast<Argument>(Address)) {
+      // Some arguments' frame index is recorded during argument lowering.
+      Offset = FuncInfo.getArgumentFrameIndex(Arg);
+      if (Offset)
+        Reg = TRI.getFrameRegister(*FuncInfo.MF);
+    }
+    if (!Reg)
+      Reg = lookUpRegForValue(Address);
+
+    // If we have a VLA that has a "use" in a metadata node that's then used
+    // here but it has no other uses, then we have a problem. E.g.,
+    //
+    //   int foo (const int *x) {
+    //     char a[*x];
+    //     return 0;
+    //   }
+    //
+    // If we assign 'a' a vreg and fast isel later on has to use the selection
+    // DAG isel, it will want to copy the value to the vreg. However, there are
+    // no uses, which goes counter to what selection DAG isel expects.
+    if (!Reg && !Address->use_empty() && isa<Instruction>(Address) &&
+        (!isa<AllocaInst>(Address) ||
+         !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(Address))))
+      Reg = FuncInfo.InitializeRegForValue(Address);
+
+    if (Reg)
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(TargetOpcode::DBG_VALUE))
+        .addReg(Reg, RegState::Debug).addImm(Offset)
+        .addMetadata(DI->getVariable());
+    else
+      // We can't yet handle anything else here because it would require
+      // generating code, thus altering codegen because of debug info.
+      DEBUG(dbgs() << "Dropping debug info for " << DI);
+    return true;
+  }
+  case Intrinsic::dbg_value: {
+    // This form of DBG_VALUE is target-independent.
+    const DbgValueInst *DI = cast<DbgValueInst>(Call);
+    const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+    const Value *V = DI->getValue();
+    if (!V) {
+      // Currently the optimizer can produce this; insert an undef to
+      // help debugging.  Probably the optimizer should not do this.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+        .addReg(0U).addImm(DI->getOffset())
+        .addMetadata(DI->getVariable());
+    } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      if (CI->getBitWidth() > 64)
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+          .addCImm(CI).addImm(DI->getOffset())
+          .addMetadata(DI->getVariable());
+      else
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+          .addImm(CI->getZExtValue()).addImm(DI->getOffset())
+          .addMetadata(DI->getVariable());
+    } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+        .addFPImm(CF).addImm(DI->getOffset())
+        .addMetadata(DI->getVariable());
+    } else if (unsigned Reg = lookUpRegForValue(V)) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+        .addReg(Reg, RegState::Debug).addImm(DI->getOffset())
+        .addMetadata(DI->getVariable());
+    } else {
+      // We can't yet handle anything else here because it would require
+      // generating code, thus altering codegen because of debug info.
+      DEBUG(dbgs() << "Dropping debug info for " << DI);
+    }
+    return true;
+  }
+  case Intrinsic::objectsize: {
+    ConstantInt *CI = cast<ConstantInt>(Call->getArgOperand(1));
+    unsigned long long Res = CI->isZero() ? -1ULL : 0;
+    Constant *ResCI = ConstantInt::get(Call->getType(), Res);
+    unsigned ResultReg = getRegForValue(ResCI);
+    if (ResultReg == 0)
+      return false;
+    UpdateValueMap(Call, ResultReg);
+    return true;
+  }
+  case Intrinsic::expect: {
+    unsigned ResultReg = getRegForValue(Call->getArgOperand(0));
+    if (ResultReg == 0)
+      return false;
+    UpdateValueMap(Call, ResultReg);
+    return true;
+  }
+  }
+
+  // Usually, it does not make sense to initialize a value,
+  // make an unrelated function call and use the value, because
+  // it tends to be spilled on the stack. So, we move the pointer
+  // to the last local value to the beginning of the block, so that
+  // all the values which have already been materialized,
+  // appear after the call. It also makes sense to skip intrinsics
+  // since they tend to be inlined.
+  if (!isa<IntrinsicInst>(Call))
+    flushLocalValueMap();
+
+  // An arbitrary call. Bail.
+  return false;
+}
+
+bool FastISel::SelectCast(const User *I, unsigned Opcode) {
+  EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(I->getType());
+
+  if (SrcVT == MVT::Other || !SrcVT.isSimple() ||
+      DstVT == MVT::Other || !DstVT.isSimple())
+    // Unhandled type. Halt "fast" selection and bail.
+    return false;
+
+  // Check if the destination type is legal.
+  if (!TLI.isTypeLegal(DstVT))
+    return false;
+
+  // Check if the source operand is legal.
+  if (!TLI.isTypeLegal(SrcVT))
+    return false;
+
+  unsigned InputReg = getRegForValue(I->getOperand(0));
+  if (!InputReg)
+    // Unhandled operand.  Halt "fast" selection and bail.
+    return false;
+
+  bool InputRegIsKill = hasTrivialKill(I->getOperand(0));
+
+  unsigned ResultReg = FastEmit_r(SrcVT.getSimpleVT(),
+                                  DstVT.getSimpleVT(),
+                                  Opcode,
+                                  InputReg, InputRegIsKill);
+  if (!ResultReg)
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool FastISel::SelectBitCast(const User *I) {
+  // If the bitcast doesn't change the type, just use the operand value.
+  if (I->getType() == I->getOperand(0)->getType()) {
+    unsigned Reg = getRegForValue(I->getOperand(0));
+    if (Reg == 0)
+      return false;
+    UpdateValueMap(I, Reg);
+    return true;
+  }
+
+  // Bitcasts of other values become reg-reg copies or BITCAST operators.
+  EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType());
+  EVT DstEVT = TLI.getValueType(I->getType());
+  if (SrcEVT == MVT::Other || DstEVT == MVT::Other ||
+      !TLI.isTypeLegal(SrcEVT) || !TLI.isTypeLegal(DstEVT))
+    // Unhandled type. Halt "fast" selection and bail.
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DstVT = DstEVT.getSimpleVT();
+  unsigned Op0 = getRegForValue(I->getOperand(0));
+  if (Op0 == 0)
+    // Unhandled operand. Halt "fast" selection and bail.
+    return false;
+
+  bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+
+  // First, try to perform the bitcast by inserting a reg-reg copy.
+  unsigned ResultReg = 0;
+  if (SrcVT == DstVT) {
+    const TargetRegisterClass* SrcClass = TLI.getRegClassFor(SrcVT);
+    const TargetRegisterClass* DstClass = TLI.getRegClassFor(DstVT);
+    // Don't attempt a cross-class copy. It will likely fail.
+    if (SrcClass == DstClass) {
+      ResultReg = createResultReg(DstClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+              ResultReg).addReg(Op0);
+    }
+  }
+
+  // If the reg-reg copy failed, select a BITCAST opcode.
+  if (!ResultReg)
+    ResultReg = FastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0, Op0IsKill);
+
+  if (!ResultReg)
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool
+FastISel::SelectInstruction(const Instruction *I) {
+  // Just before the terminator instruction, insert instructions to
+  // feed PHI nodes in successor blocks.
+  if (isa<TerminatorInst>(I))
+    if (!HandlePHINodesInSuccessorBlocks(I->getParent()))
+      return false;
+
+  DL = I->getDebugLoc();
+
+  MachineBasicBlock::iterator SavedInsertPt = FuncInfo.InsertPt;
+
+  // As a special case, don't handle calls to builtin library functions that
+  // may be translated directly to target instructions.
+  if (const CallInst *Call = dyn_cast<CallInst>(I)) {
+    const Function *F = Call->getCalledFunction();
+    LibFunc::Func Func;
+    if (F && !F->hasLocalLinkage() && F->hasName() &&
+        LibInfo->getLibFunc(F->getName(), Func) &&
+        LibInfo->hasOptimizedCodeGen(Func))
+      return false;
+  }
+
+  // First, try doing target-independent selection.
+  if (SelectOperator(I, I->getOpcode())) {
+    ++NumFastIselSuccessIndependent;
+    DL = DebugLoc();
+    return true;
+  }
+  // Remove dead code.  However, ignore call instructions since we've flushed
+  // the local value map and recomputed the insert point.
+  if (!isa<CallInst>(I)) {
+    recomputeInsertPt();
+    if (SavedInsertPt != FuncInfo.InsertPt)
+      removeDeadCode(FuncInfo.InsertPt, SavedInsertPt);
+  }
+
+  // Next, try calling the target to attempt to handle the instruction.
+  SavedInsertPt = FuncInfo.InsertPt;
+  if (TargetSelectInstruction(I)) {
+    ++NumFastIselSuccessTarget;
+    DL = DebugLoc();
+    return true;
+  }
+  // Check for dead code and remove as necessary.
+  recomputeInsertPt();
+  if (SavedInsertPt != FuncInfo.InsertPt)
+    removeDeadCode(FuncInfo.InsertPt, SavedInsertPt);
+
+  DL = DebugLoc();
+  return false;
+}
+
+/// FastEmitBranch - Emit an unconditional branch to the given block,
+/// unless it is the immediate (fall-through) successor, and update
+/// the CFG.
+void
+FastISel::FastEmitBranch(MachineBasicBlock *MSucc, DebugLoc DL) {
+
+  if (FuncInfo.MBB->getBasicBlock()->size() > 1 &&
+      FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
+    // For more accurate line information if this is the only instruction
+    // in the block then emit it, otherwise we have the unconditional
+    // fall-through case, which needs no instructions.
+  } else {
+    // The unconditional branch case.
+    TII.InsertBranch(*FuncInfo.MBB, MSucc, NULL,
+                     SmallVector<MachineOperand, 0>(), DL);
+  }
+  FuncInfo.MBB->addSuccessor(MSucc);
+}
+
+/// SelectFNeg - Emit an FNeg operation.
+///
+bool
+FastISel::SelectFNeg(const User *I) {
+  unsigned OpReg = getRegForValue(BinaryOperator::getFNegArgument(I));
+  if (OpReg == 0) return false;
+
+  bool OpRegIsKill = hasTrivialKill(I);
+
+  // If the target has ISD::FNEG, use it.
+  EVT VT = TLI.getValueType(I->getType());
+  unsigned ResultReg = FastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(),
+                                  ISD::FNEG, OpReg, OpRegIsKill);
+  if (ResultReg != 0) {
+    UpdateValueMap(I, ResultReg);
+    return true;
+  }
+
+  // Bitcast the value to integer, twiddle the sign bit with xor,
+  // and then bitcast it back to floating-point.
+  if (VT.getSizeInBits() > 64) return false;
+  EVT IntVT = EVT::getIntegerVT(I->getContext(), VT.getSizeInBits());
+  if (!TLI.isTypeLegal(IntVT))
+    return false;
+
+  unsigned IntReg = FastEmit_r(VT.getSimpleVT(), IntVT.getSimpleVT(),
+                               ISD::BITCAST, OpReg, OpRegIsKill);
+  if (IntReg == 0)
+    return false;
+
+  unsigned IntResultReg = FastEmit_ri_(IntVT.getSimpleVT(), ISD::XOR,
+                                       IntReg, /*Kill=*/true,
+                                       UINT64_C(1) << (VT.getSizeInBits()-1),
+                                       IntVT.getSimpleVT());
+  if (IntResultReg == 0)
+    return false;
+
+  ResultReg = FastEmit_r(IntVT.getSimpleVT(), VT.getSimpleVT(),
+                         ISD::BITCAST, IntResultReg, /*Kill=*/true);
+  if (ResultReg == 0)
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool
+FastISel::SelectExtractValue(const User *U) {
+  const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(U);
+  if (!EVI)
+    return false;
+
+  // Make sure we only try to handle extracts with a legal result.  But also
+  // allow i1 because it's easy.
+  EVT RealVT = TLI.getValueType(EVI->getType(), /*AllowUnknown=*/true);
+  if (!RealVT.isSimple())
+    return false;
+  MVT VT = RealVT.getSimpleVT();
+  if (!TLI.isTypeLegal(VT) && VT != MVT::i1)
+    return false;
+
+  const Value *Op0 = EVI->getOperand(0);
+  Type *AggTy = Op0->getType();
+
+  // Get the base result register.
+  unsigned ResultReg;
+  DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(Op0);
+  if (I != FuncInfo.ValueMap.end())
+    ResultReg = I->second;
+  else if (isa<Instruction>(Op0))
+    ResultReg = FuncInfo.InitializeRegForValue(Op0);
+  else
+    return false; // fast-isel can't handle aggregate constants at the moment
+
+  // Get the actual result register, which is an offset from the base register.
+  unsigned VTIndex = ComputeLinearIndex(AggTy, EVI->getIndices());
+
+  SmallVector<EVT, 4> AggValueVTs;
+  ComputeValueVTs(TLI, AggTy, AggValueVTs);
+
+  for (unsigned i = 0; i < VTIndex; i++)
+    ResultReg += TLI.getNumRegisters(FuncInfo.Fn->getContext(), AggValueVTs[i]);
+
+  UpdateValueMap(EVI, ResultReg);
+  return true;
+}
+
+bool
+FastISel::SelectOperator(const User *I, unsigned Opcode) {
+  switch (Opcode) {
+  case Instruction::Add:
+    return SelectBinaryOp(I, ISD::ADD);
+  case Instruction::FAdd:
+    return SelectBinaryOp(I, ISD::FADD);
+  case Instruction::Sub:
+    return SelectBinaryOp(I, ISD::SUB);
+  case Instruction::FSub:
+    // FNeg is currently represented in LLVM IR as a special case of FSub.
+    if (BinaryOperator::isFNeg(I))
+      return SelectFNeg(I);
+    return SelectBinaryOp(I, ISD::FSUB);
+  case Instruction::Mul:
+    return SelectBinaryOp(I, ISD::MUL);
+  case Instruction::FMul:
+    return SelectBinaryOp(I, ISD::FMUL);
+  case Instruction::SDiv:
+    return SelectBinaryOp(I, ISD::SDIV);
+  case Instruction::UDiv:
+    return SelectBinaryOp(I, ISD::UDIV);
+  case Instruction::FDiv:
+    return SelectBinaryOp(I, ISD::FDIV);
+  case Instruction::SRem:
+    return SelectBinaryOp(I, ISD::SREM);
+  case Instruction::URem:
+    return SelectBinaryOp(I, ISD::UREM);
+  case Instruction::FRem:
+    return SelectBinaryOp(I, ISD::FREM);
+  case Instruction::Shl:
+    return SelectBinaryOp(I, ISD::SHL);
+  case Instruction::LShr:
+    return SelectBinaryOp(I, ISD::SRL);
+  case Instruction::AShr:
+    return SelectBinaryOp(I, ISD::SRA);
+  case Instruction::And:
+    return SelectBinaryOp(I, ISD::AND);
+  case Instruction::Or:
+    return SelectBinaryOp(I, ISD::OR);
+  case Instruction::Xor:
+    return SelectBinaryOp(I, ISD::XOR);
+
+  case Instruction::GetElementPtr:
+    return SelectGetElementPtr(I);
+
+  case Instruction::Br: {
+    const BranchInst *BI = cast<BranchInst>(I);
+
+    if (BI->isUnconditional()) {
+      const BasicBlock *LLVMSucc = BI->getSuccessor(0);
+      MachineBasicBlock *MSucc = FuncInfo.MBBMap[LLVMSucc];
+      FastEmitBranch(MSucc, BI->getDebugLoc());
+      return true;
+    }
+
+    // Conditional branches are not handed yet.
+    // Halt "fast" selection and bail.
+    return false;
+  }
+
+  case Instruction::Unreachable:
+    // Nothing to emit.
+    return true;
+
+  case Instruction::Alloca:
+    // FunctionLowering has the static-sized case covered.
+    if (FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(I)))
+      return true;
+
+    // Dynamic-sized alloca is not handled yet.
+    return false;
+
+  case Instruction::Call:
+    return SelectCall(I);
+
+  case Instruction::BitCast:
+    return SelectBitCast(I);
+
+  case Instruction::FPToSI:
+    return SelectCast(I, ISD::FP_TO_SINT);
+  case Instruction::ZExt:
+    return SelectCast(I, ISD::ZERO_EXTEND);
+  case Instruction::SExt:
+    return SelectCast(I, ISD::SIGN_EXTEND);
+  case Instruction::Trunc:
+    return SelectCast(I, ISD::TRUNCATE);
+  case Instruction::SIToFP:
+    return SelectCast(I, ISD::SINT_TO_FP);
+
+  case Instruction::IntToPtr: // Deliberate fall-through.
+  case Instruction::PtrToInt: {
+    EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+    EVT DstVT = TLI.getValueType(I->getType());
+    if (DstVT.bitsGT(SrcVT))
+      return SelectCast(I, ISD::ZERO_EXTEND);
+    if (DstVT.bitsLT(SrcVT))
+      return SelectCast(I, ISD::TRUNCATE);
+    unsigned Reg = getRegForValue(I->getOperand(0));
+    if (Reg == 0) return false;
+    UpdateValueMap(I, Reg);
+    return true;
+  }
+
+  case Instruction::ExtractValue:
+    return SelectExtractValue(I);
+
+  case Instruction::PHI:
+    llvm_unreachable("FastISel shouldn't visit PHI nodes!");
+
+  default:
+    // Unhandled instruction. Halt "fast" selection and bail.
+    return false;
+  }
+}
+
+FastISel::FastISel(FunctionLoweringInfo &funcInfo,
+                   const TargetLibraryInfo *libInfo)
+  : FuncInfo(funcInfo),
+    MRI(FuncInfo.MF->getRegInfo()),
+    MFI(*FuncInfo.MF->getFrameInfo()),
+    MCP(*FuncInfo.MF->getConstantPool()),
+    TM(FuncInfo.MF->getTarget()),
+    TD(*TM.getDataLayout()),
+    TII(*TM.getInstrInfo()),
+    TLI(*TM.getTargetLowering()),
+    TRI(*TM.getRegisterInfo()),
+    LibInfo(libInfo) {
+}
+
+FastISel::~FastISel() {}
+
+bool FastISel::FastLowerArguments() {
+  return false;
+}
+
+unsigned FastISel::FastEmit_(MVT, MVT,
+                             unsigned) {
+  return 0;
+}
+
+unsigned FastISel::FastEmit_r(MVT, MVT,
+                              unsigned,
+                              unsigned /*Op0*/, bool /*Op0IsKill*/) {
+  return 0;
+}
+
+unsigned FastISel::FastEmit_rr(MVT, MVT,
+                               unsigned,
+                               unsigned /*Op0*/, bool /*Op0IsKill*/,
+                               unsigned /*Op1*/, bool /*Op1IsKill*/) {
+  return 0;
+}
+
+unsigned FastISel::FastEmit_i(MVT, MVT, unsigned, uint64_t /*Imm*/) {
+  return 0;
+}
+
+unsigned FastISel::FastEmit_f(MVT, MVT,
+                              unsigned, const ConstantFP * /*FPImm*/) {
+  return 0;
+}
+
+unsigned FastISel::FastEmit_ri(MVT, MVT,
+                               unsigned,
+                               unsigned /*Op0*/, bool /*Op0IsKill*/,
+                               uint64_t /*Imm*/) {
+  return 0;
+}
+
+unsigned FastISel::FastEmit_rf(MVT, MVT,
+                               unsigned,
+                               unsigned /*Op0*/, bool /*Op0IsKill*/,
+                               const ConstantFP * /*FPImm*/) {
+  return 0;
+}
+
+unsigned FastISel::FastEmit_rri(MVT, MVT,
+                                unsigned,
+                                unsigned /*Op0*/, bool /*Op0IsKill*/,
+                                unsigned /*Op1*/, bool /*Op1IsKill*/,
+                                uint64_t /*Imm*/) {
+  return 0;
+}
+
+/// FastEmit_ri_ - This method is a wrapper of FastEmit_ri. It first tries
+/// to emit an instruction with an immediate operand using FastEmit_ri.
+/// If that fails, it materializes the immediate into a register and try
+/// FastEmit_rr instead.
+unsigned FastISel::FastEmit_ri_(MVT VT, unsigned Opcode,
+                                unsigned Op0, bool Op0IsKill,
+                                uint64_t Imm, MVT ImmType) {
+  // If this is a multiply by a power of two, emit this as a shift left.
+  if (Opcode == ISD::MUL && isPowerOf2_64(Imm)) {
+    Opcode = ISD::SHL;
+    Imm = Log2_64(Imm);
+  } else if (Opcode == ISD::UDIV && isPowerOf2_64(Imm)) {
+    // div x, 8 -> srl x, 3
+    Opcode = ISD::SRL;
+    Imm = Log2_64(Imm);
+  }
+
+  // Horrible hack (to be removed), check to make sure shift amounts are
+  // in-range.
+  if ((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
+      Imm >= VT.getSizeInBits())
+    return 0;
+
+  // First check if immediate type is legal. If not, we can't use the ri form.
+  unsigned ResultReg = FastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm);
+  if (ResultReg != 0)
+    return ResultReg;
+  unsigned MaterialReg = FastEmit_i(ImmType, ImmType, ISD::Constant, Imm);
+  if (MaterialReg == 0) {
+    // This is a bit ugly/slow, but failing here means falling out of
+    // fast-isel, which would be very slow.
+    IntegerType *ITy = IntegerType::get(FuncInfo.Fn->getContext(),
+                                              VT.getSizeInBits());
+    MaterialReg = getRegForValue(ConstantInt::get(ITy, Imm));
+    assert (MaterialReg != 0 && "Unable to materialize imm.");
+    if (MaterialReg == 0) return 0;
+  }
+  return FastEmit_rr(VT, VT, Opcode,
+                     Op0, Op0IsKill,
+                     MaterialReg, /*Kill=*/true);
+}
+
+unsigned FastISel::createResultReg(const TargetRegisterClass* RC) {
+  return MRI.createVirtualRegister(RC);
+}
+
+unsigned FastISel::FastEmitInst_(unsigned MachineInstOpcode,
+                                 const TargetRegisterClass* RC) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg);
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+                                  const TargetRegisterClass *RC,
+                                  unsigned Op0, bool Op0IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC,
+                                   unsigned Op0, bool Op0IsKill,
+                                   unsigned Op1, bool Op1IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC,
+                                   unsigned Op0, bool Op0IsKill,
+                                   unsigned Op1, bool Op1IsKill,
+                                   unsigned Op2, bool Op2IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addReg(Op2, Op2IsKill * RegState::Kill);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addReg(Op2, Op2IsKill * RegState::Kill);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC,
+                                   unsigned Op0, bool Op0IsKill,
+                                   uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addImm(Imm);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addImm(Imm);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_rii(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC,
+                                   unsigned Op0, bool Op0IsKill,
+                                   uint64_t Imm1, uint64_t Imm2) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addImm(Imm1)
+      .addImm(Imm2);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addImm(Imm1)
+      .addImm(Imm2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_rf(unsigned MachineInstOpcode,
+                                   const TargetRegisterClass *RC,
+                                   unsigned Op0, bool Op0IsKill,
+                                   const ConstantFP *FPImm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addFPImm(FPImm);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addFPImm(FPImm);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
+                                    const TargetRegisterClass *RC,
+                                    unsigned Op0, bool Op0IsKill,
+                                    unsigned Op1, bool Op1IsKill,
+                                    uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addImm(Imm);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addImm(Imm);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_rrii(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Op0, bool Op0IsKill,
+                                     unsigned Op1, bool Op1IsKill,
+                                     uint64_t Imm1, uint64_t Imm2) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addImm(Imm1).addImm(Imm2);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+      .addReg(Op0, Op0IsKill * RegState::Kill)
+      .addReg(Op1, Op1IsKill * RegState::Kill)
+      .addImm(Imm1).addImm(Imm2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_i(unsigned MachineInstOpcode,
+                                  const TargetRegisterClass *RC,
+                                  uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg).addImm(Imm);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II).addImm(Imm);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_ii(unsigned MachineInstOpcode,
+                                  const TargetRegisterClass *RC,
+                                  uint64_t Imm1, uint64_t Imm2) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+      .addImm(Imm1).addImm(Imm2);
+  else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II).addImm(Imm1).addImm(Imm2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(II.ImplicitDefs[0]);
+  }
+  return ResultReg;
+}
+
+unsigned FastISel::FastEmitInst_extractsubreg(MVT RetVT,
+                                              unsigned Op0, bool Op0IsKill,
+                                              uint32_t Idx) {
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+  assert(TargetRegisterInfo::isVirtualRegister(Op0) &&
+         "Cannot yet extract from physregs");
+  const TargetRegisterClass *RC = MRI.getRegClass(Op0);
+  MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+          DL, TII.get(TargetOpcode::COPY), ResultReg)
+    .addReg(Op0, getKillRegState(Op0IsKill), Idx);
+  return ResultReg;
+}
+
+/// FastEmitZExtFromI1 - Emit MachineInstrs to compute the value of Op
+/// with all but the least significant bit set to zero.
+unsigned FastISel::FastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) {
+  return FastEmit_ri(VT, VT, ISD::AND, Op0, Op0IsKill, 1);
+}
+
+/// HandlePHINodesInSuccessorBlocks - Handle PHI nodes in successor blocks.
+/// Emit code to ensure constants are copied into registers when needed.
+/// Remember the virtual registers that need to be added to the Machine PHI
+/// nodes as input.  We cannot just directly add them, because expansion
+/// might result in multiple MBB's for one BB.  As such, the start of the
+/// BB might correspond to a different MBB than the end.
+bool FastISel::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
+  const TerminatorInst *TI = LLVMBB->getTerminator();
+
+  SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
+  unsigned OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size();
+
+  // Check successor nodes' PHI nodes that expect a constant to be available
+  // from this block.
+  for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
+    const BasicBlock *SuccBB = TI->getSuccessor(succ);
+    if (!isa<PHINode>(SuccBB->begin())) continue;
+    MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
+
+    // If this terminator has multiple identical successors (common for
+    // switches), only handle each succ once.
+    if (!SuccsHandled.insert(SuccMBB)) continue;
+
+    MachineBasicBlock::iterator MBBI = SuccMBB->begin();
+
+    // At this point we know that there is a 1-1 correspondence between LLVM PHI
+    // nodes and Machine PHI nodes, but the incoming operands have not been
+    // emitted yet.
+    for (BasicBlock::const_iterator I = SuccBB->begin();
+         const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+
+      // Ignore dead phi's.
+      if (PN->use_empty()) continue;
+
+      // Only handle legal types. Two interesting things to note here. First,
+      // by bailing out early, we may leave behind some dead instructions,
+      // since SelectionDAG's HandlePHINodesInSuccessorBlocks will insert its
+      // own moves. Second, this check is necessary because FastISel doesn't
+      // use CreateRegs to create registers, so it always creates
+      // exactly one register for each non-void instruction.
+      EVT VT = TLI.getValueType(PN->getType(), /*AllowUnknown=*/true);
+      if (VT == MVT::Other || !TLI.isTypeLegal(VT)) {
+        // Handle integer promotions, though, because they're common and easy.
+        if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+          VT = TLI.getTypeToTransformTo(LLVMBB->getContext(), VT);
+        else {
+          FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate);
+          return false;
+        }
+      }
+
+      const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
+
+      // Set the DebugLoc for the copy. Prefer the location of the operand
+      // if there is one; use the location of the PHI otherwise.
+      DL = PN->getDebugLoc();
+      if (const Instruction *Inst = dyn_cast<Instruction>(PHIOp))
+        DL = Inst->getDebugLoc();
+
+      unsigned Reg = getRegForValue(PHIOp);
+      if (Reg == 0) {
+        FuncInfo.PHINodesToUpdate.resize(OrigNumPHINodesToUpdate);
+        return false;
+      }
+      FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg));
+      DL = DebugLoc();
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
new file mode 100644
index 000000000000..b46edad7a3d4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -0,0 +1,483 @@
+//===-- FunctionLoweringInfo.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating functions from LLVM IR into
+// Machine IR.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "function-lowering-info"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+/// isUsedOutsideOfDefiningBlock - Return true if this instruction is used by
+/// PHI nodes or outside of the basic block that defines it, or used by a
+/// switch or atomic instruction, which may expand to multiple basic blocks.
+static bool isUsedOutsideOfDefiningBlock(const Instruction *I) {
+  if (I->use_empty()) return false;
+  if (isa<PHINode>(I)) return true;
+  const BasicBlock *BB = I->getParent();
+  for (Value::const_use_iterator UI = I->use_begin(), E = I->use_end();
+        UI != E; ++UI) {
+    const User *U = *UI;
+    if (cast<Instruction>(U)->getParent() != BB || isa<PHINode>(U))
+      return true;
+  }
+  return false;
+}
+
+FunctionLoweringInfo::FunctionLoweringInfo(const TargetLowering &tli)
+  : TLI(tli) {
+}
+
+void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
+  Fn = &fn;
+  MF = &mf;
+  RegInfo = &MF->getRegInfo();
+
+  // Check whether the function can return without sret-demotion.
+  SmallVector<ISD::OutputArg, 4> Outs;
+  GetReturnInfo(Fn->getReturnType(), Fn->getAttributes(), Outs, TLI);
+  CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), *MF,
+                                      Fn->isVarArg(),
+                                      Outs, Fn->getContext());
+
+  // Initialize the mapping of values to registers.  This is only set up for
+  // instruction values that are used outside of the block that defines
+  // them.
+  Function::const_iterator BB = Fn->begin(), EB = Fn->end();
+  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(I))
+      if (const ConstantInt *CUI = dyn_cast<ConstantInt>(AI->getArraySize())) {
+        Type *Ty = AI->getAllocatedType();
+        uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
+        unsigned Align =
+          std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty),
+                   AI->getAlignment());
+
+        TySize *= CUI->getZExtValue();   // Get total allocated size.
+        if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects.
+
+        // The object may need to be placed onto the stack near the stack
+        // protector if one exists. Determine here if this object is a suitable
+        // candidate. I.e., it would trigger the creation of a stack protector.
+        bool MayNeedSP =
+          (AI->isArrayAllocation() ||
+           (TySize >= 8 && isa<ArrayType>(Ty) &&
+            cast<ArrayType>(Ty)->getElementType()->isIntegerTy(8)));
+        StaticAllocaMap[AI] =
+          MF->getFrameInfo()->CreateStackObject(TySize, Align, false,
+                                                MayNeedSP, AI);
+      }
+
+  for (; BB != EB; ++BB)
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      // Mark values used outside their block as exported, by allocating
+      // a virtual register for them.
+      if (isUsedOutsideOfDefiningBlock(I))
+        if (!isa<AllocaInst>(I) ||
+            !StaticAllocaMap.count(cast<AllocaInst>(I)))
+          InitializeRegForValue(I);
+
+      // Collect llvm.dbg.declare information. This is done now instead of
+      // during the initial isel pass through the IR so that it is done
+      // in a predictable order.
+      if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(I)) {
+        MachineModuleInfo &MMI = MF->getMMI();
+        if (MMI.hasDebugInfo() &&
+            DIVariable(DI->getVariable()).Verify() &&
+            !DI->getDebugLoc().isUnknown()) {
+          // Don't handle byval struct arguments or VLAs, for example.
+          // Non-byval arguments are handled here (they refer to the stack
+          // temporary alloca at this point).
+          const Value *Address = DI->getAddress();
+          if (Address) {
+            if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
+              Address = BCI->getOperand(0);
+            if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
+              DenseMap<const AllocaInst *, int>::iterator SI =
+                StaticAllocaMap.find(AI);
+              if (SI != StaticAllocaMap.end()) { // Check for VLAs.
+                int FI = SI->second;
+                MMI.setVariableDbgInfo(DI->getVariable(),
+                                       FI, DI->getDebugLoc());
+              }
+            }
+          }
+        }
+      }
+    }
+
+  // Create an initial MachineBasicBlock for each LLVM BasicBlock in F.  This
+  // also creates the initial PHI MachineInstrs, though none of the input
+  // operands are populated.
+  for (BB = Fn->begin(); BB != EB; ++BB) {
+    MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(BB);
+    MBBMap[BB] = MBB;
+    MF->push_back(MBB);
+
+    // Transfer the address-taken flag. This is necessary because there could
+    // be multiple MachineBasicBlocks corresponding to one BasicBlock, and only
+    // the first one should be marked.
+    if (BB->hasAddressTaken())
+      MBB->setHasAddressTaken();
+
+    // Create Machine PHI nodes for LLVM PHI nodes, lowering them as
+    // appropriate.
+    for (BasicBlock::const_iterator I = BB->begin();
+         const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+      if (PN->use_empty()) continue;
+
+      // Skip empty types
+      if (PN->getType()->isEmptyTy())
+        continue;
+
+      DebugLoc DL = PN->getDebugLoc();
+      unsigned PHIReg = ValueMap[PN];
+      assert(PHIReg && "PHI node does not have an assigned virtual register!");
+
+      SmallVector<EVT, 4> ValueVTs;
+      ComputeValueVTs(TLI, PN->getType(), ValueVTs);
+      for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
+        EVT VT = ValueVTs[vti];
+        unsigned NumRegisters = TLI.getNumRegisters(Fn->getContext(), VT);
+        const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+        for (unsigned i = 0; i != NumRegisters; ++i)
+          BuildMI(MBB, DL, TII->get(TargetOpcode::PHI), PHIReg + i);
+        PHIReg += NumRegisters;
+      }
+    }
+  }
+
+  // Mark landing pad blocks.
+  for (BB = Fn->begin(); BB != EB; ++BB)
+    if (const InvokeInst *Invoke = dyn_cast<InvokeInst>(BB->getTerminator()))
+      MBBMap[Invoke->getSuccessor(1)]->setIsLandingPad();
+}
+
+/// clear - Clear out all the function-specific state. This returns this
+/// FunctionLoweringInfo to an empty state, ready to be used for a
+/// different function.
+void FunctionLoweringInfo::clear() {
+  assert(CatchInfoFound.size() == CatchInfoLost.size() &&
+         "Not all catch info was assigned to a landing pad!");
+
+  MBBMap.clear();
+  ValueMap.clear();
+  StaticAllocaMap.clear();
+#ifndef NDEBUG
+  CatchInfoLost.clear();
+  CatchInfoFound.clear();
+#endif
+  LiveOutRegInfo.clear();
+  VisitedBBs.clear();
+  ArgDbgValues.clear();
+  ByValArgFrameIndexMap.clear();
+  RegFixups.clear();
+}
+
+/// CreateReg - Allocate a single virtual register for the given type.
+unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
+  return RegInfo->createVirtualRegister(TLI.getRegClassFor(VT));
+}
+
+/// CreateRegs - Allocate the appropriate number of virtual registers of
+/// the correctly promoted or expanded types.  Assign these registers
+/// consecutive vreg numbers and return the first assigned number.
+///
+/// In the case that the given value has struct or array type, this function
+/// will assign registers for each member or element.
+///
+unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) {
+  SmallVector<EVT, 4> ValueVTs;
+  ComputeValueVTs(TLI, Ty, ValueVTs);
+
+  unsigned FirstReg = 0;
+  for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    EVT ValueVT = ValueVTs[Value];
+    MVT RegisterVT = TLI.getRegisterType(Ty->getContext(), ValueVT);
+
+    unsigned NumRegs = TLI.getNumRegisters(Ty->getContext(), ValueVT);
+    for (unsigned i = 0; i != NumRegs; ++i) {
+      unsigned R = CreateReg(RegisterVT);
+      if (!FirstReg) FirstReg = R;
+    }
+  }
+  return FirstReg;
+}
+
+/// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the
+/// register is a PHI destination and the PHI's LiveOutInfo is not valid. If
+/// the register's LiveOutInfo is for a smaller bit width, it is extended to
+/// the larger bit width by zero extension. The bit width must be no smaller
+/// than the LiveOutInfo's existing bit width.
+const FunctionLoweringInfo::LiveOutInfo *
+FunctionLoweringInfo::GetLiveOutRegInfo(unsigned Reg, unsigned BitWidth) {
+  if (!LiveOutRegInfo.inBounds(Reg))
+    return NULL;
+
+  LiveOutInfo *LOI = &LiveOutRegInfo[Reg];
+  if (!LOI->IsValid)
+    return NULL;
+
+  if (BitWidth > LOI->KnownZero.getBitWidth()) {
+    LOI->NumSignBits = 1;
+    LOI->KnownZero = LOI->KnownZero.zextOrTrunc(BitWidth);
+    LOI->KnownOne = LOI->KnownOne.zextOrTrunc(BitWidth);
+  }
+
+  return LOI;
+}
+
+/// ComputePHILiveOutRegInfo - Compute LiveOutInfo for a PHI's destination
+/// register based on the LiveOutInfo of its operands.
+void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
+  Type *Ty = PN->getType();
+  if (!Ty->isIntegerTy() || Ty->isVectorTy())
+    return;
+
+  SmallVector<EVT, 1> ValueVTs;
+  ComputeValueVTs(TLI, Ty, ValueVTs);
+  assert(ValueVTs.size() == 1 &&
+         "PHIs with non-vector integer types should have a single VT.");
+  EVT IntVT = ValueVTs[0];
+
+  if (TLI.getNumRegisters(PN->getContext(), IntVT) != 1)
+    return;
+  IntVT = TLI.getTypeToTransformTo(PN->getContext(), IntVT);
+  unsigned BitWidth = IntVT.getSizeInBits();
+
+  unsigned DestReg = ValueMap[PN];
+  if (!TargetRegisterInfo::isVirtualRegister(DestReg))
+    return;
+  LiveOutRegInfo.grow(DestReg);
+  LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg];
+
+  Value *V = PN->getIncomingValue(0);
+  if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
+    DestLOI.NumSignBits = 1;
+    APInt Zero(BitWidth, 0);
+    DestLOI.KnownZero = Zero;
+    DestLOI.KnownOne = Zero;
+    return;
+  }
+
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    APInt Val = CI->getValue().zextOrTrunc(BitWidth);
+    DestLOI.NumSignBits = Val.getNumSignBits();
+    DestLOI.KnownZero = ~Val;
+    DestLOI.KnownOne = Val;
+  } else {
+    assert(ValueMap.count(V) && "V should have been placed in ValueMap when its"
+                                "CopyToReg node was created.");
+    unsigned SrcReg = ValueMap[V];
+    if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+      DestLOI.IsValid = false;
+      return;
+    }
+    const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+    if (!SrcLOI) {
+      DestLOI.IsValid = false;
+      return;
+    }
+    DestLOI = *SrcLOI;
+  }
+
+  assert(DestLOI.KnownZero.getBitWidth() == BitWidth &&
+         DestLOI.KnownOne.getBitWidth() == BitWidth &&
+         "Masks should have the same bit width as the type.");
+
+  for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
+    Value *V = PN->getIncomingValue(i);
+    if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
+      DestLOI.NumSignBits = 1;
+      APInt Zero(BitWidth, 0);
+      DestLOI.KnownZero = Zero;
+      DestLOI.KnownOne = Zero;
+      return;
+    }
+
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      APInt Val = CI->getValue().zextOrTrunc(BitWidth);
+      DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, Val.getNumSignBits());
+      DestLOI.KnownZero &= ~Val;
+      DestLOI.KnownOne &= Val;
+      continue;
+    }
+
+    assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
+                                "its CopyToReg node was created.");
+    unsigned SrcReg = ValueMap[V];
+    if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+      DestLOI.IsValid = false;
+      return;
+    }
+    const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+    if (!SrcLOI) {
+      DestLOI.IsValid = false;
+      return;
+    }
+    DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits);
+    DestLOI.KnownZero &= SrcLOI->KnownZero;
+    DestLOI.KnownOne &= SrcLOI->KnownOne;
+  }
+}
+
+/// setArgumentFrameIndex - Record frame index for the byval
+/// argument. This overrides previous frame index entry for this argument,
+/// if any.
+void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A,
+                                                 int FI) {
+  ByValArgFrameIndexMap[A] = FI;
+}
+
+/// getArgumentFrameIndex - Get frame index for the byval argument.
+/// If the argument does not have any assigned frame index then 0 is
+/// returned.
+int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
+  DenseMap<const Argument *, int>::iterator I =
+    ByValArgFrameIndexMap.find(A);
+  if (I != ByValArgFrameIndexMap.end())
+    return I->second;
+  DEBUG(dbgs() << "Argument does not have assigned frame index!\n");
+  return 0;
+}
+
+/// ComputeUsesVAFloatArgument - Determine if any floating-point values are
+/// being passed to this variadic function, and set the MachineModuleInfo's
+/// usesVAFloatArgument flag if so. This flag is used to emit an undefined
+/// reference to _fltused on Windows, which will link in MSVCRT's
+/// floating-point support.
+void llvm::ComputeUsesVAFloatArgument(const CallInst &I,
+                                      MachineModuleInfo *MMI)
+{
+  FunctionType *FT = cast<FunctionType>(
+    I.getCalledValue()->getType()->getContainedType(0));
+  if (FT->isVarArg() && !MMI->usesVAFloatArgument()) {
+    for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+      Type* T = I.getArgOperand(i)->getType();
+      for (po_iterator<Type*> i = po_begin(T), e = po_end(T);
+           i != e; ++i) {
+        if (i->isFloatingPointTy()) {
+          MMI->setUsesVAFloatArgument(true);
+          return;
+        }
+      }
+    }
+  }
+}
+
+/// AddCatchInfo - Extract the personality and type infos from an eh.selector
+/// call, and add them to the specified machine basic block.
+void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI,
+                        MachineBasicBlock *MBB) {
+  // Inform the MachineModuleInfo of the personality for this landing pad.
+  const ConstantExpr *CE = cast<ConstantExpr>(I.getArgOperand(1));
+  assert(CE->getOpcode() == Instruction::BitCast &&
+         isa<Function>(CE->getOperand(0)) &&
+         "Personality should be a function");
+  MMI->addPersonality(MBB, cast<Function>(CE->getOperand(0)));
+
+  // Gather all the type infos for this landing pad and pass them along to
+  // MachineModuleInfo.
+  std::vector<const GlobalVariable *> TyInfo;
+  unsigned N = I.getNumArgOperands();
+
+  for (unsigned i = N - 1; i > 1; --i) {
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(i))) {
+      unsigned FilterLength = CI->getZExtValue();
+      unsigned FirstCatch = i + FilterLength + !FilterLength;
+      assert(FirstCatch <= N && "Invalid filter length");
+
+      if (FirstCatch < N) {
+        TyInfo.reserve(N - FirstCatch);
+        for (unsigned j = FirstCatch; j < N; ++j)
+          TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j)));
+        MMI->addCatchTypeInfo(MBB, TyInfo);
+        TyInfo.clear();
+      }
+
+      if (!FilterLength) {
+        // Cleanup.
+        MMI->addCleanup(MBB);
+      } else {
+        // Filter.
+        TyInfo.reserve(FilterLength - 1);
+        for (unsigned j = i + 1; j < FirstCatch; ++j)
+          TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j)));
+        MMI->addFilterTypeInfo(MBB, TyInfo);
+        TyInfo.clear();
+      }
+
+      N = i;
+    }
+  }
+
+  if (N > 2) {
+    TyInfo.reserve(N - 2);
+    for (unsigned j = 2; j < N; ++j)
+      TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j)));
+    MMI->addCatchTypeInfo(MBB, TyInfo);
+  }
+}
+
+/// AddLandingPadInfo - Extract the exception handling information from the
+/// landingpad instruction and add them to the specified machine module info.
+void llvm::AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI,
+                             MachineBasicBlock *MBB) {
+  MMI.addPersonality(MBB,
+                     cast<Function>(I.getPersonalityFn()->stripPointerCasts()));
+
+  if (I.isCleanup())
+    MMI.addCleanup(MBB);
+
+  // FIXME: New EH - Add the clauses in reverse order. This isn't 100% correct,
+  //        but we need to do it this way because of how the DWARF EH emitter
+  //        processes the clauses.
+  for (unsigned i = I.getNumClauses(); i != 0; --i) {
+    Value *Val = I.getClause(i - 1);
+    if (I.isCatch(i - 1)) {
+      MMI.addCatchTypeInfo(MBB,
+                           dyn_cast<GlobalVariable>(Val->stripPointerCasts()));
+    } else {
+      // Add filters in a list.
+      Constant *CVal = cast<Constant>(Val);
+      SmallVector<const GlobalVariable*, 4> FilterList;
+      for (User::op_iterator
+             II = CVal->op_begin(), IE = CVal->op_end(); II != IE; ++II)
+        FilterList.push_back(cast<GlobalVariable>((*II)->stripPointerCasts()));
+
+      MMI.addFilterTypeInfo(MBB, FilterList);
+    }
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
new file mode 100644
index 000000000000..3b1abd7c836e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -0,0 +1,982 @@
+//==--- InstrEmitter.cpp - Emit MachineInstrs for the SelectionDAG class ---==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the Emit routines for the SelectionDAG class, which creates
+// MachineInstrs based on the decisions of the SelectionDAG instruction
+// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instr-emitter"
+#include "InstrEmitter.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+/// MinRCSize - Smallest register class we allow when constraining virtual
+/// registers.  If satisfying all register class constraints would require
+/// using a smaller register class, emit a COPY to a new virtual register
+/// instead.
+const unsigned MinRCSize = 4;
+
+/// CountResults - The results of target nodes have register or immediate
+/// operands first, then an optional chain, and optional glue operands (which do
+/// not go into the resulting MachineInstr).
+unsigned InstrEmitter::CountResults(SDNode *Node) {
+  unsigned N = Node->getNumValues();
+  while (N && Node->getValueType(N - 1) == MVT::Glue)
+    --N;
+  if (N && Node->getValueType(N - 1) == MVT::Other)
+    --N;    // Skip over chain result.
+  return N;
+}
+
+/// countOperands - The inputs to target nodes have any actual inputs first,
+/// followed by an optional chain operand, then an optional glue operand.
+/// Compute the number of actual operands that will go into the resulting
+/// MachineInstr.
+///
+/// Also count physreg RegisterSDNode and RegisterMaskSDNode operands preceding
+/// the chain and glue. These operands may be implicit on the machine instr.
+static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
+                              unsigned &NumImpUses) {
+  unsigned N = Node->getNumOperands();
+  while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
+    --N;
+  if (N && Node->getOperand(N - 1).getValueType() == MVT::Other)
+    --N; // Ignore chain if it exists.
+
+  // Count RegisterSDNode and RegisterMaskSDNode operands for NumImpUses.
+  NumImpUses = N - NumExpUses;
+  for (unsigned I = N; I > NumExpUses; --I) {
+    if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1)))
+      continue;
+    if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1)))
+      if (TargetRegisterInfo::isPhysicalRegister(RN->getReg()))
+        continue;
+    NumImpUses = N - I;
+    break;
+  }
+
+  return N;
+}
+
+/// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
+/// implicit physical register output.
+void InstrEmitter::
+EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
+                unsigned SrcReg, DenseMap<SDValue, unsigned> &VRBaseMap) {
+  unsigned VRBase = 0;
+  if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+    // Just use the input register directly!
+    SDValue Op(Node, ResNo);
+    if (IsClone)
+      VRBaseMap.erase(Op);
+    bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
+    (void)isNew; // Silence compiler warning.
+    assert(isNew && "Node emitted out of order - early");
+    return;
+  }
+
+  // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+  // the CopyToReg'd destination register instead of creating a new vreg.
+  bool MatchReg = true;
+  const TargetRegisterClass *UseRC = NULL;
+  MVT VT = Node->getSimpleValueType(ResNo);
+
+  // Stick to the preferred register classes for legal types.
+  if (TLI->isTypeLegal(VT))
+    UseRC = TLI->getRegClassFor(VT);
+
+  if (!IsClone && !IsCloned)
+    for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
+         UI != E; ++UI) {
+      SDNode *User = *UI;
+      bool Match = true;
+      if (User->getOpcode() == ISD::CopyToReg &&
+          User->getOperand(2).getNode() == Node &&
+          User->getOperand(2).getResNo() == ResNo) {
+        unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+          VRBase = DestReg;
+          Match = false;
+        } else if (DestReg != SrcReg)
+          Match = false;
+      } else {
+        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+          SDValue Op = User->getOperand(i);
+          if (Op.getNode() != Node || Op.getResNo() != ResNo)
+            continue;
+          MVT VT = Node->getSimpleValueType(Op.getResNo());
+          if (VT == MVT::Other || VT == MVT::Glue)
+            continue;
+          Match = false;
+          if (User->isMachineOpcode()) {
+            const MCInstrDesc &II = TII->get(User->getMachineOpcode());
+            const TargetRegisterClass *RC = 0;
+            if (i+II.getNumDefs() < II.getNumOperands()) {
+              RC = TRI->getAllocatableClass(
+                TII->getRegClass(II, i+II.getNumDefs(), TRI, *MF));
+            }
+            if (!UseRC)
+              UseRC = RC;
+            else if (RC) {
+              const TargetRegisterClass *ComRC =
+                TRI->getCommonSubClass(UseRC, RC);
+              // If multiple uses expect disjoint register classes, we emit
+              // copies in AddRegisterOperand.
+              if (ComRC)
+                UseRC = ComRC;
+            }
+          }
+        }
+      }
+      MatchReg &= Match;
+      if (VRBase)
+        break;
+    }
+
+  const TargetRegisterClass *SrcRC = 0, *DstRC = 0;
+  SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
+
+  // Figure out the register class to create for the destreg.
+  if (VRBase) {
+    DstRC = MRI->getRegClass(VRBase);
+  } else if (UseRC) {
+    assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!");
+    DstRC = UseRC;
+  } else {
+    DstRC = TLI->getRegClassFor(VT);
+  }
+
+  // If all uses are reading from the src physical register and copying the
+  // register is either impossible or very expensive, then don't create a copy.
+  if (MatchReg && SrcRC->getCopyCost() < 0) {
+    VRBase = SrcReg;
+  } else {
+    // Create the reg, emit the copy.
+    VRBase = MRI->createVirtualRegister(DstRC);
+    BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+            VRBase).addReg(SrcReg);
+  }
+
+  SDValue Op(Node, ResNo);
+  if (IsClone)
+    VRBaseMap.erase(Op);
+  bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+  (void)isNew; // Silence compiler warning.
+  assert(isNew && "Node emitted out of order - early");
+}
+
+/// getDstOfCopyToRegUse - If the only use of the specified result number of
+/// node is a CopyToReg, return its destination register. Return 0 otherwise.
+unsigned InstrEmitter::getDstOfOnlyCopyToRegUse(SDNode *Node,
+                                                unsigned ResNo) const {
+  if (!Node->hasOneUse())
+    return 0;
+
+  SDNode *User = *Node->use_begin();
+  if (User->getOpcode() == ISD::CopyToReg &&
+      User->getOperand(2).getNode() == Node &&
+      User->getOperand(2).getResNo() == ResNo) {
+    unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return Reg;
+  }
+  return 0;
+}
+
+void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
+                                       MachineInstrBuilder &MIB,
+                                       const MCInstrDesc &II,
+                                       bool IsClone, bool IsCloned,
+                                       DenseMap<SDValue, unsigned> &VRBaseMap) {
+  assert(Node->getMachineOpcode() != TargetOpcode::IMPLICIT_DEF &&
+         "IMPLICIT_DEF should have been handled as a special case elsewhere!");
+
+  for (unsigned i = 0; i < II.getNumDefs(); ++i) {
+    // If the specific node value is only used by a CopyToReg and the dest reg
+    // is a vreg in the same register class, use the CopyToReg'd destination
+    // register instead of creating a new vreg.
+    unsigned VRBase = 0;
+    const TargetRegisterClass *RC =
+      TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF));
+    if (II.OpInfo[i].isOptionalDef()) {
+      // Optional def must be a physical register.
+      unsigned NumResults = CountResults(Node);
+      VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();
+      assert(TargetRegisterInfo::isPhysicalRegister(VRBase));
+      MIB.addReg(VRBase, RegState::Define);
+    }
+
+    if (!VRBase && !IsClone && !IsCloned)
+      for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
+           UI != E; ++UI) {
+        SDNode *User = *UI;
+        if (User->getOpcode() == ISD::CopyToReg &&
+            User->getOperand(2).getNode() == Node &&
+            User->getOperand(2).getResNo() == i) {
+          unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+          if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+            const TargetRegisterClass *RegRC = MRI->getRegClass(Reg);
+            if (RegRC == RC) {
+              VRBase = Reg;
+              MIB.addReg(VRBase, RegState::Define);
+              break;
+            }
+          }
+        }
+      }
+
+    // Create the result registers for this node and add the result regs to
+    // the machine instruction.
+    if (VRBase == 0) {
+      assert(RC && "Isn't a register operand!");
+      VRBase = MRI->createVirtualRegister(RC);
+      MIB.addReg(VRBase, RegState::Define);
+    }
+
+    SDValue Op(Node, i);
+    if (IsClone)
+      VRBaseMap.erase(Op);
+    bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+    (void)isNew; // Silence compiler warning.
+    assert(isNew && "Node emitted out of order - early");
+  }
+}
+
+/// getVR - Return the virtual register corresponding to the specified result
+/// of the specified node.
+unsigned InstrEmitter::getVR(SDValue Op,
+                             DenseMap<SDValue, unsigned> &VRBaseMap) {
+  if (Op.isMachineOpcode() &&
+      Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
+    // Add an IMPLICIT_DEF instruction before every use.
+    unsigned VReg = getDstOfOnlyCopyToRegUse(Op.getNode(), Op.getResNo());
+    // IMPLICIT_DEF can produce any type of result so its MCInstrDesc
+    // does not include operand register class info.
+    if (!VReg) {
+      const TargetRegisterClass *RC =
+        TLI->getRegClassFor(Op.getSimpleValueType());
+      VReg = MRI->createVirtualRegister(RC);
+    }
+    BuildMI(*MBB, InsertPos, Op.getDebugLoc(),
+            TII->get(TargetOpcode::IMPLICIT_DEF), VReg);
+    return VReg;
+  }
+
+  DenseMap<SDValue, unsigned>::iterator I = VRBaseMap.find(Op);
+  assert(I != VRBaseMap.end() && "Node emitted out of order - late");
+  return I->second;
+}
+
+
+/// AddRegisterOperand - Add the specified register as an operand to the
+/// specified machine instr. Insert register copies if the register is
+/// not in the required register class.
+void
+InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
+                                 SDValue Op,
+                                 unsigned IIOpNum,
+                                 const MCInstrDesc *II,
+                                 DenseMap<SDValue, unsigned> &VRBaseMap,
+                                 bool IsDebug, bool IsClone, bool IsCloned) {
+  assert(Op.getValueType() != MVT::Other &&
+         Op.getValueType() != MVT::Glue &&
+         "Chain and glue operands should occur at end of operand list!");
+  // Get/emit the operand.
+  unsigned VReg = getVR(Op, VRBaseMap);
+  assert(TargetRegisterInfo::isVirtualRegister(VReg) && "Not a vreg?");
+
+  const MCInstrDesc &MCID = MIB->getDesc();
+  bool isOptDef = IIOpNum < MCID.getNumOperands() &&
+    MCID.OpInfo[IIOpNum].isOptionalDef();
+
+  // If the instruction requires a register in a different class, create
+  // a new virtual register and copy the value into it, but first attempt to
+  // shrink VReg's register class within reason.  For example, if VReg == GR32
+  // and II requires a GR32_NOSP, just constrain VReg to GR32_NOSP.
+  if (II) {
+    const TargetRegisterClass *DstRC = 0;
+    if (IIOpNum < II->getNumOperands())
+      DstRC = TRI->getAllocatableClass(TII->getRegClass(*II,IIOpNum,TRI,*MF));
+    if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) {
+      unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+      BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
+      VReg = NewVReg;
+    }
+  }
+
+  // If this value has only one use, that use is a kill. This is a
+  // conservative approximation. InstrEmitter does trivial coalescing
+  // with CopyFromReg nodes, so don't emit kill flags for them.
+  // Avoid kill flags on Schedule cloned nodes, since there will be
+  // multiple uses.
+  // Tied operands are never killed, so we need to check that. And that
+  // means we need to determine the index of the operand.
+  bool isKill = Op.hasOneUse() &&
+                Op.getNode()->getOpcode() != ISD::CopyFromReg &&
+                !IsDebug &&
+                !(IsClone || IsCloned);
+  if (isKill) {
+    unsigned Idx = MIB->getNumOperands();
+    while (Idx > 0 &&
+           MIB->getOperand(Idx-1).isReg() &&
+           MIB->getOperand(Idx-1).isImplicit())
+      --Idx;
+    bool isTied = MCID.getOperandConstraint(Idx, MCOI::TIED_TO) != -1;
+    if (isTied)
+      isKill = false;
+  }
+
+  MIB.addReg(VReg, getDefRegState(isOptDef) | getKillRegState(isKill) |
+             getDebugRegState(IsDebug));
+}
+
+/// AddOperand - Add the specified operand to the specified machine instr.  II
+/// specifies the instruction information for the node, and IIOpNum is the
+/// operand number (in the II) that we are adding.
+void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
+                              SDValue Op,
+                              unsigned IIOpNum,
+                              const MCInstrDesc *II,
+                              DenseMap<SDValue, unsigned> &VRBaseMap,
+                              bool IsDebug, bool IsClone, bool IsCloned) {
+  if (Op.isMachineOpcode()) {
+    AddRegisterOperand(MIB, Op, IIOpNum, II, VRBaseMap,
+                       IsDebug, IsClone, IsCloned);
+  } else if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+    MIB.addImm(C->getSExtValue());
+  } else if (ConstantFPSDNode *F = dyn_cast<ConstantFPSDNode>(Op)) {
+    MIB.addFPImm(F->getConstantFPValue());
+  } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
+    // Turn additional physreg operands into implicit uses on non-variadic
+    // instructions. This is used by call and return instructions passing
+    // arguments in registers.
+    bool Imp = II && (IIOpNum >= II->getNumOperands() && !II->isVariadic());
+    MIB.addReg(R->getReg(), getImplRegState(Imp));
+  } else if (RegisterMaskSDNode *RM = dyn_cast<RegisterMaskSDNode>(Op)) {
+    MIB.addRegMask(RM->getRegMask());
+  } else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) {
+    MIB.addGlobalAddress(TGA->getGlobal(), TGA->getOffset(),
+                         TGA->getTargetFlags());
+  } else if (BasicBlockSDNode *BBNode = dyn_cast<BasicBlockSDNode>(Op)) {
+    MIB.addMBB(BBNode->getBasicBlock());
+  } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op)) {
+    MIB.addFrameIndex(FI->getIndex());
+  } else if (JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op)) {
+    MIB.addJumpTableIndex(JT->getIndex(), JT->getTargetFlags());
+  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op)) {
+    int Offset = CP->getOffset();
+    unsigned Align = CP->getAlignment();
+    Type *Type = CP->getType();
+    // MachineConstantPool wants an explicit alignment.
+    if (Align == 0) {
+      Align = TM->getDataLayout()->getPrefTypeAlignment(Type);
+      if (Align == 0) {
+        // Alignment of vector types.  FIXME!
+        Align = TM->getDataLayout()->getTypeAllocSize(Type);
+      }
+    }
+
+    unsigned Idx;
+    MachineConstantPool *MCP = MF->getConstantPool();
+    if (CP->isMachineConstantPoolEntry())
+      Idx = MCP->getConstantPoolIndex(CP->getMachineCPVal(), Align);
+    else
+      Idx = MCP->getConstantPoolIndex(CP->getConstVal(), Align);
+    MIB.addConstantPoolIndex(Idx, Offset, CP->getTargetFlags());
+  } else if (ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op)) {
+    MIB.addExternalSymbol(ES->getSymbol(), ES->getTargetFlags());
+  } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op)) {
+    MIB.addBlockAddress(BA->getBlockAddress(),
+                        BA->getOffset(),
+                        BA->getTargetFlags());
+  } else if (TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(Op)) {
+    MIB.addTargetIndex(TI->getIndex(), TI->getOffset(), TI->getTargetFlags());
+  } else {
+    assert(Op.getValueType() != MVT::Other &&
+           Op.getValueType() != MVT::Glue &&
+           "Chain and glue operands should occur at end of operand list!");
+    AddRegisterOperand(MIB, Op, IIOpNum, II, VRBaseMap,
+                       IsDebug, IsClone, IsCloned);
+  }
+}
+
+unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
+                                          MVT VT, DebugLoc DL) {
+  const TargetRegisterClass *VRC = MRI->getRegClass(VReg);
+  const TargetRegisterClass *RC = TRI->getSubClassWithSubReg(VRC, SubIdx);
+
+  // RC is a sub-class of VRC that supports SubIdx.  Try to constrain VReg
+  // within reason.
+  if (RC && RC != VRC)
+    RC = MRI->constrainRegClass(VReg, RC, MinRCSize);
+
+  // VReg has been adjusted.  It can be used with SubIdx operands now.
+  if (RC)
+    return VReg;
+
+  // VReg couldn't be reasonably constrained.  Emit a COPY to a new virtual
+  // register instead.
+  RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT), SubIdx);
+  assert(RC && "No legal register class for VT supports that SubIdx");
+  unsigned NewReg = MRI->createVirtualRegister(RC);
+  BuildMI(*MBB, InsertPos, DL, TII->get(TargetOpcode::COPY), NewReg)
+    .addReg(VReg);
+  return NewReg;
+}
+
+/// EmitSubregNode - Generate machine code for subreg nodes.
+///
+void InstrEmitter::EmitSubregNode(SDNode *Node,
+                                  DenseMap<SDValue, unsigned> &VRBaseMap,
+                                  bool IsClone, bool IsCloned) {
+  unsigned VRBase = 0;
+  unsigned Opc = Node->getMachineOpcode();
+
+  // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+  // the CopyToReg'd destination register instead of creating a new vreg.
+  for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
+       UI != E; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() == ISD::CopyToReg &&
+        User->getOperand(2).getNode() == Node) {
+      unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+        VRBase = DestReg;
+        break;
+      }
+    }
+  }
+
+  if (Opc == TargetOpcode::EXTRACT_SUBREG) {
+    // EXTRACT_SUBREG is lowered as %dst = COPY %src:sub.  There are no
+    // constraints on the %dst register, COPY can target all legal register
+    // classes.
+    unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    const TargetRegisterClass *TRC =
+      TLI->getRegClassFor(Node->getSimpleValueType(0));
+
+    unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
+    MachineInstr *DefMI = MRI->getVRegDef(VReg);
+    unsigned SrcReg, DstReg, DefSubIdx;
+    if (DefMI &&
+        TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) &&
+        SubIdx == DefSubIdx &&
+        TRC == MRI->getRegClass(SrcReg)) {
+      // Optimize these:
+      // r1025 = s/zext r1024, 4
+      // r1026 = extract_subreg r1025, 4
+      // to a copy
+      // r1026 = copy r1024
+      VRBase = MRI->createVirtualRegister(TRC);
+      BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg);
+      MRI->clearKillFlags(SrcReg);
+    } else {
+      // VReg may not support a SubIdx sub-register, and we may need to
+      // constrain its register class or issue a COPY to a compatible register
+      // class.
+      VReg = ConstrainForSubReg(VReg, SubIdx,
+                                Node->getOperand(0).getSimpleValueType(),
+                                Node->getDebugLoc());
+
+      // Create the destreg if it is missing.
+      if (VRBase == 0)
+        VRBase = MRI->createVirtualRegister(TRC);
+
+      // Create the extract_subreg machine instruction.
+      BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), VRBase).addReg(VReg, 0, SubIdx);
+    }
+  } else if (Opc == TargetOpcode::INSERT_SUBREG ||
+             Opc == TargetOpcode::SUBREG_TO_REG) {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+    SDValue N2 = Node->getOperand(2);
+    unsigned SubIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+
+    // Figure out the register class to create for the destreg.  It should be
+    // the largest legal register class supporting SubIdx sub-registers.
+    // RegisterCoalescer will constrain it further if it decides to eliminate
+    // the INSERT_SUBREG instruction.
+    //
+    //   %dst = INSERT_SUBREG %src, %sub, SubIdx
+    //
+    // is lowered by TwoAddressInstructionPass to:
+    //
+    //   %dst = COPY %src
+    //   %dst:SubIdx = COPY %sub
+    //
+    // There is no constraint on the %src register class.
+    //
+    const TargetRegisterClass *SRC = TLI->getRegClassFor(Node->getSimpleValueType(0));
+    SRC = TRI->getSubClassWithSubReg(SRC, SubIdx);
+    assert(SRC && "No register class supports VT and SubIdx for INSERT_SUBREG");
+
+    if (VRBase == 0 || !SRC->hasSubClassEq(MRI->getRegClass(VRBase)))
+      VRBase = MRI->createVirtualRegister(SRC);
+
+    // Create the insert_subreg or subreg_to_reg machine instruction.
+    MachineInstrBuilder MIB =
+      BuildMI(*MF, Node->getDebugLoc(), TII->get(Opc), VRBase);
+
+    // If creating a subreg_to_reg, then the first input operand
+    // is an implicit value immediate, otherwise it's a register
+    if (Opc == TargetOpcode::SUBREG_TO_REG) {
+      const ConstantSDNode *SD = cast<ConstantSDNode>(N0);
+      MIB.addImm(SD->getZExtValue());
+    } else
+      AddOperand(MIB, N0, 0, 0, VRBaseMap, /*IsDebug=*/false,
+                 IsClone, IsCloned);
+    // Add the subregster being inserted
+    AddOperand(MIB, N1, 0, 0, VRBaseMap, /*IsDebug=*/false,
+               IsClone, IsCloned);
+    MIB.addImm(SubIdx);
+    MBB->insert(InsertPos, MIB);
+  } else
+    llvm_unreachable("Node is not insert_subreg, extract_subreg, or subreg_to_reg");
+
+  SDValue Op(Node, 0);
+  bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+  (void)isNew; // Silence compiler warning.
+  assert(isNew && "Node emitted out of order - early");
+}
+
+/// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
+/// COPY_TO_REGCLASS is just a normal copy, except that the destination
+/// register is constrained to be in a particular register class.
+///
+void
+InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
+                                     DenseMap<SDValue, unsigned> &VRBaseMap) {
+  unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
+
+  // Create the new VReg in the destination class and emit a copy.
+  unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+  const TargetRegisterClass *DstRC =
+    TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx));
+  unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+  BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+    NewVReg).addReg(VReg);
+
+  SDValue Op(Node, 0);
+  bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
+  (void)isNew; // Silence compiler warning.
+  assert(isNew && "Node emitted out of order - early");
+}
+
+/// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
+///
+void InstrEmitter::EmitRegSequence(SDNode *Node,
+                                  DenseMap<SDValue, unsigned> &VRBaseMap,
+                                  bool IsClone, bool IsCloned) {
+  unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+  const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
+  unsigned NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC));
+  const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE);
+  MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II, NewVReg);
+  unsigned NumOps = Node->getNumOperands();
+  assert((NumOps & 1) == 1 &&
+         "REG_SEQUENCE must have an odd number of operands!");
+  for (unsigned i = 1; i != NumOps; ++i) {
+    SDValue Op = Node->getOperand(i);
+    if ((i & 1) == 0) {
+      RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node->getOperand(i-1));
+      // Skip physical registers as they don't have a vreg to get and we'll
+      // insert copies for them in TwoAddressInstructionPass anyway.
+      if (!R || !TargetRegisterInfo::isPhysicalRegister(R->getReg())) {
+        unsigned SubIdx = cast<ConstantSDNode>(Op)->getZExtValue();
+        unsigned SubReg = getVR(Node->getOperand(i-1), VRBaseMap);
+        const TargetRegisterClass *TRC = MRI->getRegClass(SubReg);
+        const TargetRegisterClass *SRC =
+        TRI->getMatchingSuperRegClass(RC, TRC, SubIdx);
+        if (SRC && SRC != RC) {
+          MRI->setRegClass(NewVReg, SRC);
+          RC = SRC;
+        }
+      }
+    }
+    AddOperand(MIB, Op, i+1, &II, VRBaseMap, /*IsDebug=*/false,
+               IsClone, IsCloned);
+  }
+
+  MBB->insert(InsertPos, MIB);
+  SDValue Op(Node, 0);
+  bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
+  (void)isNew; // Silence compiler warning.
+  assert(isNew && "Node emitted out of order - early");
+}
+
+/// EmitDbgValue - Generate machine instruction for a dbg_value node.
+///
+MachineInstr *
+InstrEmitter::EmitDbgValue(SDDbgValue *SD,
+                           DenseMap<SDValue, unsigned> &VRBaseMap) {
+  uint64_t Offset = SD->getOffset();
+  MDNode* MDPtr = SD->getMDPtr();
+  DebugLoc DL = SD->getDebugLoc();
+
+  if (SD->getKind() == SDDbgValue::FRAMEIX) {
+    // Stack address; this needs to be lowered in target-dependent fashion.
+    // EmitTargetCodeForFrameDebugValue is responsible for allocation.
+    unsigned FrameIx = SD->getFrameIx();
+    return TII->emitFrameIndexDebugValue(*MF, FrameIx, Offset, MDPtr, DL);
+  }
+  // Otherwise, we're going to create an instruction here.
+  const MCInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE);
+  MachineInstrBuilder MIB = BuildMI(*MF, DL, II);
+  if (SD->getKind() == SDDbgValue::SDNODE) {
+    SDNode *Node = SD->getSDNode();
+    SDValue Op = SDValue(Node, SD->getResNo());
+    // It's possible we replaced this SDNode with other(s) and therefore
+    // didn't generate code for it.  It's better to catch these cases where
+    // they happen and transfer the debug info, but trying to guarantee that
+    // in all cases would be very fragile; this is a safeguard for any
+    // that were missed.
+    DenseMap<SDValue, unsigned>::iterator I = VRBaseMap.find(Op);
+    if (I==VRBaseMap.end())
+      MIB.addReg(0U);       // undef
+    else
+      AddOperand(MIB, Op, (*MIB).getNumOperands(), &II, VRBaseMap,
+                 /*IsDebug=*/true, /*IsClone=*/false, /*IsCloned=*/false);
+  } else if (SD->getKind() == SDDbgValue::CONST) {
+    const Value *V = SD->getConst();
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      if (CI->getBitWidth() > 64)
+        MIB.addCImm(CI);
+      else
+        MIB.addImm(CI->getSExtValue());
+    } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+      MIB.addFPImm(CF);
+    } else {
+      // Could be an Undef.  In any case insert an Undef so we can see what we
+      // dropped.
+      MIB.addReg(0U);
+    }
+  } else {
+    // Insert an Undef so we can see what we dropped.
+    MIB.addReg(0U);
+  }
+
+  MIB.addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
+/// EmitMachineNode - Generate machine code for a target-specific node and
+/// needed dependencies.
+///
+void InstrEmitter::
+EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
+                DenseMap<SDValue, unsigned> &VRBaseMap) {
+  unsigned Opc = Node->getMachineOpcode();
+
+  // Handle subreg insert/extract specially
+  if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+      Opc == TargetOpcode::INSERT_SUBREG ||
+      Opc == TargetOpcode::SUBREG_TO_REG) {
+    EmitSubregNode(Node, VRBaseMap, IsClone, IsCloned);
+    return;
+  }
+
+  // Handle COPY_TO_REGCLASS specially.
+  if (Opc == TargetOpcode::COPY_TO_REGCLASS) {
+    EmitCopyToRegClassNode(Node, VRBaseMap);
+    return;
+  }
+
+  // Handle REG_SEQUENCE specially.
+  if (Opc == TargetOpcode::REG_SEQUENCE) {
+    EmitRegSequence(Node, VRBaseMap, IsClone, IsCloned);
+    return;
+  }
+
+  if (Opc == TargetOpcode::IMPLICIT_DEF)
+    // We want a unique VR for each IMPLICIT_DEF use.
+    return;
+
+  const MCInstrDesc &II = TII->get(Opc);
+  unsigned NumResults = CountResults(Node);
+  unsigned NumImpUses = 0;
+  unsigned NodeOperands =
+    countOperands(Node, II.getNumOperands() - II.getNumDefs(), NumImpUses);
+  bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0;
+#ifndef NDEBUG
+  unsigned NumMIOperands = NodeOperands + NumResults;
+  if (II.isVariadic())
+    assert(NumMIOperands >= II.getNumOperands() &&
+           "Too few operands for a variadic node!");
+  else
+    assert(NumMIOperands >= II.getNumOperands() &&
+           NumMIOperands <= II.getNumOperands() + II.getNumImplicitDefs() +
+                            NumImpUses &&
+           "#operands for dag node doesn't match .td file!");
+#endif
+
+  // Create the new machine instruction.
+  MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II);
+
+  // Add result register values for things that are defined by this
+  // instruction.
+  if (NumResults)
+    CreateVirtualRegisters(Node, MIB, II, IsClone, IsCloned, VRBaseMap);
+
+  // Emit all of the actual operands of this instruction, adding them to the
+  // instruction as appropriate.
+  bool HasOptPRefs = II.getNumDefs() > NumResults;
+  assert((!HasOptPRefs || !HasPhysRegOuts) &&
+         "Unable to cope with optional defs and phys regs defs!");
+  unsigned NumSkip = HasOptPRefs ? II.getNumDefs() - NumResults : 0;
+  for (unsigned i = NumSkip; i != NodeOperands; ++i)
+    AddOperand(MIB, Node->getOperand(i), i-NumSkip+II.getNumDefs(), &II,
+               VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned);
+
+  // Transfer all of the memory reference descriptions of this instruction.
+  MIB.setMemRefs(cast<MachineSDNode>(Node)->memoperands_begin(),
+                 cast<MachineSDNode>(Node)->memoperands_end());
+
+  // Insert the instruction into position in the block. This needs to
+  // happen before any custom inserter hook is called so that the
+  // hook knows where in the block to insert the replacement code.
+  MBB->insert(InsertPos, MIB);
+
+  // The MachineInstr may also define physregs instead of virtregs.  These
+  // physreg values can reach other instructions in different ways:
+  //
+  // 1. When there is a use of a Node value beyond the explicitly defined
+  //    virtual registers, we emit a CopyFromReg for one of the implicitly
+  //    defined physregs.  This only happens when HasPhysRegOuts is true.
+  //
+  // 2. A CopyFromReg reading a physreg may be glued to this instruction.
+  //
+  // 3. A glued instruction may implicitly use a physreg.
+  //
+  // 4. A glued instruction may use a RegisterSDNode operand.
+  //
+  // Collect all the used physreg defs, and make sure that any unused physreg
+  // defs are marked as dead.
+  SmallVector<unsigned, 8> UsedRegs;
+
+  // Additional results must be physical register defs.
+  if (HasPhysRegOuts) {
+    for (unsigned i = II.getNumDefs(); i < NumResults; ++i) {
+      unsigned Reg = II.getImplicitDefs()[i - II.getNumDefs()];
+      if (!Node->hasAnyUseOfValue(i))
+        continue;
+      // This implicitly defined physreg has a use.
+      UsedRegs.push_back(Reg);
+      EmitCopyFromReg(Node, i, IsClone, IsCloned, Reg, VRBaseMap);
+    }
+  }
+
+  // Scan the glue chain for any used physregs.
+  if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) {
+    for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) {
+      if (F->getOpcode() == ISD::CopyFromReg) {
+        UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
+        continue;
+      } else if (F->getOpcode() == ISD::CopyToReg) {
+        // Skip CopyToReg nodes that are internal to the glue chain.
+        continue;
+      }
+      // Collect declared implicit uses.
+      const MCInstrDesc &MCID = TII->get(F->getMachineOpcode());
+      UsedRegs.append(MCID.getImplicitUses(),
+                      MCID.getImplicitUses() + MCID.getNumImplicitUses());
+      // In addition to declared implicit uses, we must also check for
+      // direct RegisterSDNode operands.
+      for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
+        if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
+          unsigned Reg = R->getReg();
+          if (TargetRegisterInfo::isPhysicalRegister(Reg))
+            UsedRegs.push_back(Reg);
+        }
+    }
+  }
+
+  // Finally mark unused registers as dead.
+  if (!UsedRegs.empty() || II.getImplicitDefs())
+    MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
+
+  // Run post-isel target hook to adjust this instruction if needed.
+#ifdef NDEBUG
+  if (II.hasPostISelHook())
+#endif
+    TLI->AdjustInstrPostInstrSelection(MIB, Node);
+}
+
+/// EmitSpecialNode - Generate machine code for a target-independent node and
+/// needed dependencies.
+void InstrEmitter::
+EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
+                DenseMap<SDValue, unsigned> &VRBaseMap) {
+  switch (Node->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    Node->dump();
+#endif
+    llvm_unreachable("This target-independent node should have been selected!");
+  case ISD::EntryToken:
+    llvm_unreachable("EntryToken should have been excluded from the schedule!");
+  case ISD::MERGE_VALUES:
+  case ISD::TokenFactor: // fall thru
+    break;
+  case ISD::CopyToReg: {
+    unsigned SrcReg;
+    SDValue SrcVal = Node->getOperand(2);
+    if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(SrcVal))
+      SrcReg = R->getReg();
+    else
+      SrcReg = getVR(SrcVal, VRBaseMap);
+
+    unsigned DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+    if (SrcReg == DestReg) // Coalesced away the copy? Ignore.
+      break;
+
+    BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+            DestReg).addReg(SrcReg);
+    break;
+  }
+  case ISD::CopyFromReg: {
+    unsigned SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+    EmitCopyFromReg(Node, 0, IsClone, IsCloned, SrcReg, VRBaseMap);
+    break;
+  }
+  case ISD::EH_LABEL: {
+    MCSymbol *S = cast<EHLabelSDNode>(Node)->getLabel();
+    BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+            TII->get(TargetOpcode::EH_LABEL)).addSym(S);
+    break;
+  }
+
+  case ISD::LIFETIME_START:
+  case ISD::LIFETIME_END: {
+    unsigned TarOp = (Node->getOpcode() == ISD::LIFETIME_START) ?
+    TargetOpcode::LIFETIME_START : TargetOpcode::LIFETIME_END;
+
+    FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Node->getOperand(1));
+    BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TarOp))
+    .addFrameIndex(FI->getIndex());
+    break;
+  }
+
+  case ISD::INLINEASM: {
+    unsigned NumOps = Node->getNumOperands();
+    if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+      --NumOps;  // Ignore the glue operand.
+
+    // Create the inline asm machine instruction.
+    MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(),
+                                      TII->get(TargetOpcode::INLINEASM));
+
+    // Add the asm string as an external symbol operand.
+    SDValue AsmStrV = Node->getOperand(InlineAsm::Op_AsmString);
+    const char *AsmStr = cast<ExternalSymbolSDNode>(AsmStrV)->getSymbol();
+    MIB.addExternalSymbol(AsmStr);
+
+    // Add the HasSideEffect, isAlignStack, AsmDialect, MayLoad and MayStore
+    // bits.
+    int64_t ExtraInfo =
+      cast<ConstantSDNode>(Node->getOperand(InlineAsm::Op_ExtraInfo))->
+                          getZExtValue();
+    MIB.addImm(ExtraInfo);
+
+    // Remember to operand index of the group flags.
+    SmallVector<unsigned, 8> GroupIdx;
+
+    // Add all of the operand registers to the instruction.
+    for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+      unsigned Flags =
+        cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+      const unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+      GroupIdx.push_back(MIB->getNumOperands());
+      MIB.addImm(Flags);
+      ++i;  // Skip the ID value.
+
+      switch (InlineAsm::getKind(Flags)) {
+      default: llvm_unreachable("Bad flags!");
+        case InlineAsm::Kind_RegDef:
+        for (unsigned j = 0; j != NumVals; ++j, ++i) {
+          unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+          // FIXME: Add dead flags for physical and virtual registers defined.
+          // For now, mark physical register defs as implicit to help fast
+          // regalloc. This makes inline asm look a lot like calls.
+          MIB.addReg(Reg, RegState::Define |
+                  getImplRegState(TargetRegisterInfo::isPhysicalRegister(Reg)));
+        }
+        break;
+      case InlineAsm::Kind_RegDefEarlyClobber:
+      case InlineAsm::Kind_Clobber:
+        for (unsigned j = 0; j != NumVals; ++j, ++i) {
+          unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+          MIB.addReg(Reg, RegState::Define | RegState::EarlyClobber |
+                  getImplRegState(TargetRegisterInfo::isPhysicalRegister(Reg)));
+        }
+        break;
+      case InlineAsm::Kind_RegUse:  // Use of register.
+      case InlineAsm::Kind_Imm:  // Immediate.
+      case InlineAsm::Kind_Mem:  // Addressing mode.
+        // The addressing mode has been selected, just add all of the
+        // operands to the machine instruction.
+        for (unsigned j = 0; j != NumVals; ++j, ++i)
+          AddOperand(MIB, Node->getOperand(i), 0, 0, VRBaseMap,
+                     /*IsDebug=*/false, IsClone, IsCloned);
+
+        // Manually set isTied bits.
+        if (InlineAsm::getKind(Flags) == InlineAsm::Kind_RegUse) {
+          unsigned DefGroup = 0;
+          if (InlineAsm::isUseOperandTiedToDef(Flags, DefGroup)) {
+            unsigned DefIdx = GroupIdx[DefGroup] + 1;
+            unsigned UseIdx = GroupIdx.back() + 1;
+            for (unsigned j = 0; j != NumVals; ++j)
+              MIB->tieOperands(DefIdx + j, UseIdx + j);
+          }
+        }
+        break;
+      }
+    }
+
+    // Get the mdnode from the asm if it exists and add it to the instruction.
+    SDValue MDV = Node->getOperand(InlineAsm::Op_MDNode);
+    const MDNode *MD = cast<MDNodeSDNode>(MDV)->getMD();
+    if (MD)
+      MIB.addMetadata(MD);
+
+    MBB->insert(InsertPos, MIB);
+    break;
+  }
+  }
+}
+
+/// InstrEmitter - Construct an InstrEmitter and set it to start inserting
+/// at the given position in the given block.
+InstrEmitter::InstrEmitter(MachineBasicBlock *mbb,
+                           MachineBasicBlock::iterator insertpos)
+  : MF(mbb->getParent()),
+    MRI(&MF->getRegInfo()),
+    TM(&MF->getTarget()),
+    TII(TM->getInstrInfo()),
+    TRI(TM->getRegisterInfo()),
+    TLI(TM->getTargetLowering()),
+    MBB(mbb), InsertPos(insertpos) {
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
new file mode 100644
index 000000000000..a9c2203e8400
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -0,0 +1,146 @@
+//===---- InstrEmitter.h - Emit MachineInstrs for the SelectionDAG class ---==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This declares the Emit routines for the SelectionDAG class, which creates
+// MachineInstrs based on the decisions of the SelectionDAG instruction
+// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef INSTREMITTER_H
+#define INSTREMITTER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+namespace llvm {
+
+class MachineInstrBuilder;
+class MCInstrDesc;
+class SDDbgValue;
+
+class InstrEmitter {
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+  const TargetMachine *TM;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const TargetLowering *TLI;
+
+  MachineBasicBlock *MBB;
+  MachineBasicBlock::iterator InsertPos;
+
+  /// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
+  /// implicit physical register output.
+  void EmitCopyFromReg(SDNode *Node, unsigned ResNo,
+                       bool IsClone, bool IsCloned,
+                       unsigned SrcReg,
+                       DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// getDstOfCopyToRegUse - If the only use of the specified result number of
+  /// node is a CopyToReg, return its destination register. Return 0 otherwise.
+  unsigned getDstOfOnlyCopyToRegUse(SDNode *Node,
+                                    unsigned ResNo) const;
+
+  void CreateVirtualRegisters(SDNode *Node,
+                              MachineInstrBuilder &MIB,
+                              const MCInstrDesc &II,
+                              bool IsClone, bool IsCloned,
+                              DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// getVR - Return the virtual register corresponding to the specified result
+  /// of the specified node.
+  unsigned getVR(SDValue Op,
+                 DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// AddRegisterOperand - Add the specified register as an operand to the
+  /// specified machine instr. Insert register copies if the register is
+  /// not in the required register class.
+  void AddRegisterOperand(MachineInstrBuilder &MIB,
+                          SDValue Op,
+                          unsigned IIOpNum,
+                          const MCInstrDesc *II,
+                          DenseMap<SDValue, unsigned> &VRBaseMap,
+                          bool IsDebug, bool IsClone, bool IsCloned);
+
+  /// AddOperand - Add the specified operand to the specified machine instr.  II
+  /// specifies the instruction information for the node, and IIOpNum is the
+  /// operand number (in the II) that we are adding. IIOpNum and II are used for
+  /// assertions only.
+  void AddOperand(MachineInstrBuilder &MIB,
+                  SDValue Op,
+                  unsigned IIOpNum,
+                  const MCInstrDesc *II,
+                  DenseMap<SDValue, unsigned> &VRBaseMap,
+                  bool IsDebug, bool IsClone, bool IsCloned);
+
+  /// ConstrainForSubReg - Try to constrain VReg to a register class that
+  /// supports SubIdx sub-registers.  Emit a copy if that isn't possible.
+  /// Return the virtual register to use.
+  unsigned ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
+                              MVT VT, DebugLoc DL);
+
+  /// EmitSubregNode - Generate machine code for subreg nodes.
+  ///
+  void EmitSubregNode(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap,
+                      bool IsClone, bool IsCloned);
+
+  /// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
+  /// COPY_TO_REGCLASS is just a normal copy, except that the destination
+  /// register is constrained to be in a particular register class.
+  ///
+  void EmitCopyToRegClassNode(SDNode *Node,
+                              DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
+  ///
+  void EmitRegSequence(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap,
+                       bool IsClone, bool IsCloned);
+public:
+  /// CountResults - The results of target nodes have register or immediate
+  /// operands first, then an optional chain, and optional flag operands
+  /// (which do not go into the machine instrs.)
+  static unsigned CountResults(SDNode *Node);
+
+  /// EmitDbgValue - Generate machine instruction for a dbg_value node.
+  ///
+  MachineInstr *EmitDbgValue(SDDbgValue *SD,
+                             DenseMap<SDValue, unsigned> &VRBaseMap);
+
+  /// EmitNode - Generate machine code for a node and needed dependencies.
+  ///
+  void EmitNode(SDNode *Node, bool IsClone, bool IsCloned,
+                DenseMap<SDValue, unsigned> &VRBaseMap) {
+    if (Node->isMachineOpcode())
+      EmitMachineNode(Node, IsClone, IsCloned, VRBaseMap);
+    else
+      EmitSpecialNode(Node, IsClone, IsCloned, VRBaseMap);
+  }
+
+  /// getBlock - Return the current basic block.
+  MachineBasicBlock *getBlock() { return MBB; }
+
+  /// getInsertPos - Return the current insertion position.
+  MachineBasicBlock::iterator getInsertPos() { return InsertPos; }
+
+  /// InstrEmitter - Construct an InstrEmitter and set it to start inserting
+  /// at the given position in the given block.
+  InstrEmitter(MachineBasicBlock *mbb, MachineBasicBlock::iterator insertpos);
+  
+private:
+  void EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
+                       DenseMap<SDValue, unsigned> &VRBaseMap);
+  void EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
+                       DenseMap<SDValue, unsigned> &VRBaseMap);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
new file mode 100644
index 000000000000..51cc254b2c82
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -0,0 +1,3924 @@
+//===-- LegalizeDAG.cpp - Implement SelectionDAG::Legalize ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::Legalize method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+/// SelectionDAGLegalize - This takes an arbitrary SelectionDAG as input and
+/// hacks on it until the target machine can handle it.  This involves
+/// eliminating value sizes the machine cannot handle (promoting small sizes to
+/// large sizes or splitting up large values into small values) as well as
+/// eliminating operations the machine cannot handle.
+///
+/// This code also does a small amount of optimization and recognition of idioms
+/// as part of its processing.  For example, if a target does not support a
+/// 'setcc' instruction efficiently, but does support 'brcc' instruction, this
+/// will attempt merge setcc and brc instructions into brcc's.
+///
+namespace {
+class SelectionDAGLegalize : public SelectionDAG::DAGUpdateListener {
+  const TargetMachine &TM;
+  const TargetLowering &TLI;
+  SelectionDAG &DAG;
+
+  /// LegalizePosition - The iterator for walking through the node list.
+  SelectionDAG::allnodes_iterator LegalizePosition;
+
+  /// LegalizedNodes - The set of nodes which have already been legalized.
+  SmallPtrSet<SDNode *, 16> LegalizedNodes;
+
+  // Libcall insertion helpers.
+
+public:
+  explicit SelectionDAGLegalize(SelectionDAG &DAG);
+
+  void LegalizeDAG();
+
+private:
+  /// LegalizeOp - Legalizes the given operation.
+  void LegalizeOp(SDNode *Node);
+
+  SDValue OptimizeFloatStore(StoreSDNode *ST);
+
+  void LegalizeLoadOps(SDNode *Node);
+  void LegalizeStoreOps(SDNode *Node);
+
+  /// PerformInsertVectorEltInMemory - Some target cannot handle a variable
+  /// insertion index for the INSERT_VECTOR_ELT instruction.  In this case, it
+  /// is necessary to spill the vector being inserted into to memory, perform
+  /// the insert there, and then read the result back.
+  SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val,
+                                         SDValue Idx, DebugLoc dl);
+  SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
+                                  SDValue Idx, DebugLoc dl);
+
+  /// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+  /// performs the same shuffe in terms of order or result bytes, but on a type
+  /// whose vector element type is narrower than the original shuffle type.
+  /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
+  SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl,
+                                     SDValue N1, SDValue N2,
+                                     ArrayRef<int> Mask) const;
+
+  void LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
+                             DebugLoc dl);
+
+  SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
+  SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops,
+                        unsigned NumOps, bool isSigned, DebugLoc dl);
+
+  std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
+                                                 SDNode *Node, bool isSigned);
+  SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
+                          RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80,
+                          RTLIB::Libcall Call_F128,
+                          RTLIB::Libcall Call_PPCF128);
+  SDValue ExpandIntLibCall(SDNode *Node, bool isSigned,
+                           RTLIB::Libcall Call_I8,
+                           RTLIB::Libcall Call_I16,
+                           RTLIB::Libcall Call_I32,
+                           RTLIB::Libcall Call_I64,
+                           RTLIB::Libcall Call_I128);
+  void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+  void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+  SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, DebugLoc dl);
+  SDValue ExpandBUILD_VECTOR(SDNode *Node);
+  SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
+  void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
+                                SmallVectorImpl<SDValue> &Results);
+  SDValue ExpandFCOPYSIGN(SDNode *Node);
+  SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT,
+                               DebugLoc dl);
+  SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned,
+                                DebugLoc dl);
+  SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned,
+                                DebugLoc dl);
+
+  SDValue ExpandBSWAP(SDValue Op, DebugLoc dl);
+  SDValue ExpandBitCount(unsigned Opc, SDValue Op, DebugLoc dl);
+
+  SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
+  SDValue ExpandInsertToVectorThroughStack(SDValue Op);
+  SDValue ExpandVectorBuildThroughStack(SDNode* Node);
+
+  SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
+
+  std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
+
+  void ExpandNode(SDNode *Node);
+  void PromoteNode(SDNode *Node);
+
+  void ForgetNode(SDNode *N) {
+    LegalizedNodes.erase(N);
+    if (LegalizePosition == SelectionDAG::allnodes_iterator(N))
+      ++LegalizePosition;
+  }
+
+public:
+  // DAGUpdateListener implementation.
+  virtual void NodeDeleted(SDNode *N, SDNode *E) {
+    ForgetNode(N);
+  }
+  virtual void NodeUpdated(SDNode *N) {}
+
+  // Node replacement helpers
+  void ReplacedNode(SDNode *N) {
+    if (N->use_empty()) {
+      DAG.RemoveDeadNode(N);
+    } else {
+      ForgetNode(N);
+    }
+  }
+  void ReplaceNode(SDNode *Old, SDNode *New) {
+    DAG.ReplaceAllUsesWith(Old, New);
+    ReplacedNode(Old);
+  }
+  void ReplaceNode(SDValue Old, SDValue New) {
+    DAG.ReplaceAllUsesWith(Old, New);
+    ReplacedNode(Old.getNode());
+  }
+  void ReplaceNode(SDNode *Old, const SDValue *New) {
+    DAG.ReplaceAllUsesWith(Old, New);
+    ReplacedNode(Old);
+  }
+};
+}
+
+/// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+/// performs the same shuffe in terms of order or result bytes, but on a type
+/// whose vector element type is narrower than the original shuffle type.
+/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
+SDValue
+SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT,  DebugLoc dl,
+                                                 SDValue N1, SDValue N2,
+                                                 ArrayRef<int> Mask) const {
+  unsigned NumMaskElts = VT.getVectorNumElements();
+  unsigned NumDestElts = NVT.getVectorNumElements();
+  unsigned NumEltsGrowth = NumDestElts / NumMaskElts;
+
+  assert(NumEltsGrowth && "Cannot promote to vector type with fewer elts!");
+
+  if (NumEltsGrowth == 1)
+    return DAG.getVectorShuffle(NVT, dl, N1, N2, &Mask[0]);
+
+  SmallVector<int, 8> NewMask;
+  for (unsigned i = 0; i != NumMaskElts; ++i) {
+    int Idx = Mask[i];
+    for (unsigned j = 0; j != NumEltsGrowth; ++j) {
+      if (Idx < 0)
+        NewMask.push_back(-1);
+      else
+        NewMask.push_back(Idx * NumEltsGrowth + j);
+    }
+  }
+  assert(NewMask.size() == NumDestElts && "Non-integer NumEltsGrowth?");
+  assert(TLI.isShuffleMaskLegal(NewMask, NVT) && "Shuffle not legal?");
+  return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]);
+}
+
+SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag)
+  : SelectionDAG::DAGUpdateListener(dag),
+    TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
+    DAG(dag) {
+}
+
+void SelectionDAGLegalize::LegalizeDAG() {
+  DAG.AssignTopologicalOrder();
+
+  // Visit all the nodes. We start in topological order, so that we see
+  // nodes with their original operands intact. Legalization can produce
+  // new nodes which may themselves need to be legalized. Iterate until all
+  // nodes have been legalized.
+  for (;;) {
+    bool AnyLegalized = false;
+    for (LegalizePosition = DAG.allnodes_end();
+         LegalizePosition != DAG.allnodes_begin(); ) {
+      --LegalizePosition;
+
+      SDNode *N = LegalizePosition;
+      if (LegalizedNodes.insert(N)) {
+        AnyLegalized = true;
+        LegalizeOp(N);
+      }
+    }
+    if (!AnyLegalized)
+      break;
+
+  }
+
+  // Remove dead nodes now.
+  DAG.RemoveDeadNodes();
+}
+
+/// ExpandConstantFP - Expands the ConstantFP node to an integer constant or
+/// a load from the constant pool.
+SDValue
+SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
+  bool Extend = false;
+  DebugLoc dl = CFP->getDebugLoc();
+
+  // If a FP immediate is precise when represented as a float and if the
+  // target can do an extending load from float to double, we put it into
+  // the constant pool as a float, even if it's is statically typed as a
+  // double.  This shrinks FP constants and canonicalizes them for targets where
+  // an FP extending load is the same cost as a normal load (such as on the x87
+  // fp stack or PPC FP unit).
+  EVT VT = CFP->getValueType(0);
+  ConstantFP *LLVMC = const_cast<ConstantFP*>(CFP->getConstantFPValue());
+  if (!UseCP) {
+    assert((VT == MVT::f64 || VT == MVT::f32) && "Invalid type expansion");
+    return DAG.getConstant(LLVMC->getValueAPF().bitcastToAPInt(),
+                           (VT == MVT::f64) ? MVT::i64 : MVT::i32);
+  }
+
+  EVT OrigVT = VT;
+  EVT SVT = VT;
+  while (SVT != MVT::f32) {
+    SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1);
+    if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) &&
+        // Only do this if the target has a native EXTLOAD instruction from
+        // smaller type.
+        TLI.isLoadExtLegal(ISD::EXTLOAD, SVT) &&
+        TLI.ShouldShrinkFPConstant(OrigVT)) {
+      Type *SType = SVT.getTypeForEVT(*DAG.getContext());
+      LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
+      VT = SVT;
+      Extend = true;
+    }
+  }
+
+  SDValue CPIdx = DAG.getConstantPool(LLVMC, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  if (Extend) {
+    SDValue Result =
+      DAG.getExtLoad(ISD::EXTLOAD, dl, OrigVT,
+                     DAG.getEntryNode(),
+                     CPIdx, MachinePointerInfo::getConstantPool(),
+                     VT, false, false, Alignment);
+    return Result;
+  }
+  SDValue Result =
+    DAG.getLoad(OrigVT, dl, DAG.getEntryNode(), CPIdx,
+                MachinePointerInfo::getConstantPool(), false, false, false,
+                Alignment);
+  return Result;
+}
+
+/// ExpandUnalignedStore - Expands an unaligned store to 2 half-size stores.
+static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
+                                 const TargetLowering &TLI,
+                                 SelectionDAGLegalize *DAGLegalize) {
+  assert(ST->getAddressingMode() == ISD::UNINDEXED &&
+         "unaligned indexed stores not implemented!");
+  SDValue Chain = ST->getChain();
+  SDValue Ptr = ST->getBasePtr();
+  SDValue Val = ST->getValue();
+  EVT VT = Val.getValueType();
+  int Alignment = ST->getAlignment();
+  DebugLoc dl = ST->getDebugLoc();
+  if (ST->getMemoryVT().isFloatingPoint() ||
+      ST->getMemoryVT().isVector()) {
+    EVT intVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+    if (TLI.isTypeLegal(intVT)) {
+      // Expand to a bitconvert of the value to the integer type of the
+      // same size, then a (misaligned) int store.
+      // FIXME: Does not handle truncating floating point stores!
+      SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
+      Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
+                           ST->isVolatile(), ST->isNonTemporal(), Alignment);
+      DAGLegalize->ReplaceNode(SDValue(ST, 0), Result);
+      return;
+    }
+    // Do a (aligned) store to a stack slot, then copy from the stack slot
+    // to the final destination using (unaligned) integer loads and stores.
+    EVT StoredVT = ST->getMemoryVT();
+    MVT RegVT =
+      TLI.getRegisterType(*DAG.getContext(),
+                          EVT::getIntegerVT(*DAG.getContext(),
+                                            StoredVT.getSizeInBits()));
+    unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
+    unsigned RegBytes = RegVT.getSizeInBits() / 8;
+    unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
+
+    // Make sure the stack slot is also aligned for the register type.
+    SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
+
+    // Perform the original store, only redirected to the stack slot.
+    SDValue Store = DAG.getTruncStore(Chain, dl,
+                                      Val, StackPtr, MachinePointerInfo(),
+                                      StoredVT, false, false, 0);
+    SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+    SmallVector<SDValue, 8> Stores;
+    unsigned Offset = 0;
+
+    // Do all but one copies using the full register width.
+    for (unsigned i = 1; i < NumRegs; i++) {
+      // Load one integer register's worth from the stack slot.
+      SDValue Load = DAG.getLoad(RegVT, dl, Store, StackPtr,
+                                 MachinePointerInfo(),
+                                 false, false, false, 0);
+      // Store it to the final location.  Remember the store.
+      Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
+                                  ST->getPointerInfo().getWithOffset(Offset),
+                                    ST->isVolatile(), ST->isNonTemporal(),
+                                    MinAlign(ST->getAlignment(), Offset)));
+      // Increment the pointers.
+      Offset += RegBytes;
+      StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                             Increment);
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    }
+
+    // The last store may be partial.  Do a truncating store.  On big-endian
+    // machines this requires an extending load from the stack slot to ensure
+    // that the bits are in the right place.
+    EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+                                  8 * (StoredBytes - Offset));
+
+    // Load from the stack slot.
+    SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
+                                  MachinePointerInfo(),
+                                  MemVT, false, false, 0);
+
+    Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
+                                       ST->getPointerInfo()
+                                         .getWithOffset(Offset),
+                                       MemVT, ST->isVolatile(),
+                                       ST->isNonTemporal(),
+                                       MinAlign(ST->getAlignment(), Offset)));
+    // The order of the stores doesn't matter - say it with a TokenFactor.
+    SDValue Result =
+      DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
+                  Stores.size());
+    DAGLegalize->ReplaceNode(SDValue(ST, 0), Result);
+    return;
+  }
+  assert(ST->getMemoryVT().isInteger() &&
+         !ST->getMemoryVT().isVector() &&
+         "Unaligned store of unknown type.");
+  // Get the half-size VT
+  EVT NewStoredVT = ST->getMemoryVT().getHalfSizedIntegerVT(*DAG.getContext());
+  int NumBits = NewStoredVT.getSizeInBits();
+  int IncrementSize = NumBits / 8;
+
+  // Divide the stored value in two parts.
+  SDValue ShiftAmount = DAG.getConstant(NumBits,
+                                      TLI.getShiftAmountTy(Val.getValueType()));
+  SDValue Lo = Val;
+  SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
+
+  // Store the two parts
+  SDValue Store1, Store2;
+  Store1 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Lo:Hi, Ptr,
+                             ST->getPointerInfo(), NewStoredVT,
+                             ST->isVolatile(), ST->isNonTemporal(), Alignment);
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+  Alignment = MinAlign(Alignment, IncrementSize);
+  Store2 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Hi:Lo, Ptr,
+                             ST->getPointerInfo().getWithOffset(IncrementSize),
+                             NewStoredVT, ST->isVolatile(), ST->isNonTemporal(),
+                             Alignment);
+
+  SDValue Result =
+    DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+  DAGLegalize->ReplaceNode(SDValue(ST, 0), Result);
+}
+
+/// ExpandUnalignedLoad - Expands an unaligned load to 2 half-size loads.
+static void
+ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
+                    const TargetLowering &TLI,
+                    SDValue &ValResult, SDValue &ChainResult) {
+  assert(LD->getAddressingMode() == ISD::UNINDEXED &&
+         "unaligned indexed loads not implemented!");
+  SDValue Chain = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  EVT VT = LD->getValueType(0);
+  EVT LoadedVT = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  if (VT.isFloatingPoint() || VT.isVector()) {
+    EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
+    if (TLI.isTypeLegal(intVT) && TLI.isTypeLegal(LoadedVT)) {
+      // Expand to a (misaligned) integer load of the same size,
+      // then bitconvert to floating point or vector.
+      SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, LD->getPointerInfo(),
+                                    LD->isVolatile(),
+                                    LD->isNonTemporal(),
+                                    LD->isInvariant(), LD->getAlignment());
+      SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
+      if (LoadedVT != VT)
+        Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
+                             ISD::ANY_EXTEND, dl, VT, Result);
+
+      ValResult = Result;
+      ChainResult = Chain;
+      return;
+    }
+
+    // Copy the value to a (aligned) stack slot using (unaligned) integer
+    // loads and stores, then do a (aligned) load from the stack slot.
+    MVT RegVT = TLI.getRegisterType(*DAG.getContext(), intVT);
+    unsigned LoadedBytes = LoadedVT.getSizeInBits() / 8;
+    unsigned RegBytes = RegVT.getSizeInBits() / 8;
+    unsigned NumRegs = (LoadedBytes + RegBytes - 1) / RegBytes;
+
+    // Make sure the stack slot is also aligned for the register type.
+    SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
+
+    SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+    SmallVector<SDValue, 8> Stores;
+    SDValue StackPtr = StackBase;
+    unsigned Offset = 0;
+
+    // Do all but one copies using the full register width.
+    for (unsigned i = 1; i < NumRegs; i++) {
+      // Load one integer register's worth from the original location.
+      SDValue Load = DAG.getLoad(RegVT, dl, Chain, Ptr,
+                                 LD->getPointerInfo().getWithOffset(Offset),
+                                 LD->isVolatile(), LD->isNonTemporal(),
+                                 LD->isInvariant(),
+                                 MinAlign(LD->getAlignment(), Offset));
+      // Follow the load with a store to the stack slot.  Remember the store.
+      Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
+                                    MachinePointerInfo(), false, false, 0));
+      // Increment the pointers.
+      Offset += RegBytes;
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+      StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                             Increment);
+    }
+
+    // The last copy may be partial.  Do an extending load.
+    EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+                                  8 * (LoadedBytes - Offset));
+    SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr,
+                                  LD->getPointerInfo().getWithOffset(Offset),
+                                  MemVT, LD->isVolatile(),
+                                  LD->isNonTemporal(),
+                                  MinAlign(LD->getAlignment(), Offset));
+    // Follow the load with a store to the stack slot.  Remember the store.
+    // On big-endian machines this requires a truncating store to ensure
+    // that the bits end up in the right place.
+    Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, StackPtr,
+                                       MachinePointerInfo(), MemVT,
+                                       false, false, 0));
+
+    // The order of the stores doesn't matter - say it with a TokenFactor.
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
+                             Stores.size());
+
+    // Finally, perform the original load only redirected to the stack slot.
+    Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
+                          MachinePointerInfo(), LoadedVT, false, false, 0);
+
+    // Callers expect a MERGE_VALUES node.
+    ValResult = Load;
+    ChainResult = TF;
+    return;
+  }
+  assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
+         "Unaligned load of unsupported type.");
+
+  // Compute the new VT that is half the size of the old one.  This is an
+  // integer MVT.
+  unsigned NumBits = LoadedVT.getSizeInBits();
+  EVT NewLoadedVT;
+  NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
+  NumBits >>= 1;
+
+  unsigned Alignment = LD->getAlignment();
+  unsigned IncrementSize = NumBits / 8;
+  ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+  // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+  if (HiExtType == ISD::NON_EXTLOAD)
+    HiExtType = ISD::ZEXTLOAD;
+
+  // Load the value in two parts
+  SDValue Lo, Hi;
+  if (TLI.isLittleEndian()) {
+    Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
+                        NewLoadedVT, LD->isVolatile(),
+                        LD->isNonTemporal(), Alignment);
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+    Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
+                        LD->getPointerInfo().getWithOffset(IncrementSize),
+                        NewLoadedVT, LD->isVolatile(),
+                        LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+  } else {
+    Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
+                        NewLoadedVT, LD->isVolatile(),
+                        LD->isNonTemporal(), Alignment);
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+    Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
+                        LD->getPointerInfo().getWithOffset(IncrementSize),
+                        NewLoadedVT, LD->isVolatile(),
+                        LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+  }
+
+  // aggregate the two parts
+  SDValue ShiftAmount = DAG.getConstant(NumBits,
+                                       TLI.getShiftAmountTy(Hi.getValueType()));
+  SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
+  Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
+
+  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                             Hi.getValue(1));
+
+  ValResult = Result;
+  ChainResult = TF;
+}
+
+/// PerformInsertVectorEltInMemory - Some target cannot handle a variable
+/// insertion index for the INSERT_VECTOR_ELT instruction.  In this case, it
+/// is necessary to spill the vector being inserted into to memory, perform
+/// the insert there, and then read the result back.
+SDValue SelectionDAGLegalize::
+PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
+                               DebugLoc dl) {
+  SDValue Tmp1 = Vec;
+  SDValue Tmp2 = Val;
+  SDValue Tmp3 = Idx;
+
+  // If the target doesn't support this, we have to spill the input vector
+  // to a temporary stack slot, update the element, then reload it.  This is
+  // badness.  We could also load the value into a vector register (either
+  // with a "move to register" or "extload into register" instruction, then
+  // permute it into place, if the idx is a constant and if the idx is
+  // supported by the target.
+  EVT VT    = Tmp1.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  EVT IdxVT = Tmp3.getValueType();
+  EVT PtrVT = TLI.getPointerTy();
+  SDValue StackPtr = DAG.CreateStackTemporary(VT);
+
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+
+  // Store the vector.
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Tmp1, StackPtr,
+                            MachinePointerInfo::getFixedStack(SPFI),
+                            false, false, 0);
+
+  // Truncate or zero extend offset to target pointer type.
+  unsigned CastOpc = IdxVT.bitsGT(PtrVT) ? ISD::TRUNCATE : ISD::ZERO_EXTEND;
+  Tmp3 = DAG.getNode(CastOpc, dl, PtrVT, Tmp3);
+  // Add the offset to the index.
+  unsigned EltSize = EltVT.getSizeInBits()/8;
+  Tmp3 = DAG.getNode(ISD::MUL, dl, IdxVT, Tmp3,DAG.getConstant(EltSize, IdxVT));
+  SDValue StackPtr2 = DAG.getNode(ISD::ADD, dl, IdxVT, Tmp3, StackPtr);
+  // Store the scalar value.
+  Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT,
+                         false, false, 0);
+  // Load the updated vector.
+  return DAG.getLoad(VT, dl, Ch, StackPtr,
+                     MachinePointerInfo::getFixedStack(SPFI), false, false, 
+                     false, 0);
+}
+
+
+SDValue SelectionDAGLegalize::
+ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, DebugLoc dl) {
+  if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) {
+    // SCALAR_TO_VECTOR requires that the type of the value being inserted
+    // match the element type of the vector being created, except for
+    // integers in which case the inserted value can be over width.
+    EVT EltVT = Vec.getValueType().getVectorElementType();
+    if (Val.getValueType() == EltVT ||
+        (EltVT.isInteger() && Val.getValueType().bitsGE(EltVT))) {
+      SDValue ScVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                  Vec.getValueType(), Val);
+
+      unsigned NumElts = Vec.getValueType().getVectorNumElements();
+      // We generate a shuffle of InVec and ScVec, so the shuffle mask
+      // should be 0,1,2,3,4,5... with the appropriate element replaced with
+      // elt 0 of the RHS.
+      SmallVector<int, 8> ShufOps;
+      for (unsigned i = 0; i != NumElts; ++i)
+        ShufOps.push_back(i != InsertPos->getZExtValue() ? i : NumElts);
+
+      return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec,
+                                  &ShufOps[0]);
+    }
+  }
+  return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);
+}
+
+SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
+  // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
+  // FIXME: We shouldn't do this for TargetConstantFP's.
+  // FIXME: move this to the DAG Combiner!  Note that we can't regress due
+  // to phase ordering between legalized code and the dag combiner.  This
+  // probably means that we need to integrate dag combiner and legalizer
+  // together.
+  // We generally can't do this one for long doubles.
+  SDValue Chain = ST->getChain();
+  SDValue Ptr = ST->getBasePtr();
+  unsigned Alignment = ST->getAlignment();
+  bool isVolatile = ST->isVolatile();
+  bool isNonTemporal = ST->isNonTemporal();
+  DebugLoc dl = ST->getDebugLoc();
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
+    if (CFP->getValueType(0) == MVT::f32 &&
+        TLI.isTypeLegal(MVT::i32)) {
+      SDValue Con = DAG.getConstant(CFP->getValueAPF().
+                                      bitcastToAPInt().zextOrTrunc(32),
+                              MVT::i32);
+      return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
+                          isVolatile, isNonTemporal, Alignment);
+    }
+
+    if (CFP->getValueType(0) == MVT::f64) {
+      // If this target supports 64-bit registers, do a single 64-bit store.
+      if (TLI.isTypeLegal(MVT::i64)) {
+        SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+                                  zextOrTrunc(64), MVT::i64);
+        return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
+                            isVolatile, isNonTemporal, Alignment);
+      }
+
+      if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) {
+        // Otherwise, if the target supports 32-bit registers, use 2 32-bit
+        // stores.  If the target supports neither 32- nor 64-bits, this
+        // xform is certainly not worth it.
+        const APInt &IntVal =CFP->getValueAPF().bitcastToAPInt();
+        SDValue Lo = DAG.getConstant(IntVal.trunc(32), MVT::i32);
+        SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), MVT::i32);
+        if (TLI.isBigEndian()) std::swap(Lo, Hi);
+
+        Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile,
+                          isNonTemporal, Alignment);
+        Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                            DAG.getIntPtrConstant(4));
+        Hi = DAG.getStore(Chain, dl, Hi, Ptr,
+                          ST->getPointerInfo().getWithOffset(4),
+                          isVolatile, isNonTemporal, MinAlign(Alignment, 4U));
+
+        return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+      }
+    }
+  }
+  return SDValue(0, 0);
+}
+
+void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
+    StoreSDNode *ST = cast<StoreSDNode>(Node);
+    SDValue Chain = ST->getChain();
+    SDValue Ptr = ST->getBasePtr();
+    DebugLoc dl = Node->getDebugLoc();
+
+    unsigned Alignment = ST->getAlignment();
+    bool isVolatile = ST->isVolatile();
+    bool isNonTemporal = ST->isNonTemporal();
+
+    if (!ST->isTruncatingStore()) {
+      if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
+        ReplaceNode(ST, OptStore);
+        return;
+      }
+
+      {
+        SDValue Value = ST->getValue();
+        MVT VT = Value.getSimpleValueType();
+        switch (TLI.getOperationAction(ISD::STORE, VT)) {
+        default: llvm_unreachable("This action is not supported yet!");
+        case TargetLowering::Legal:
+          // If this is an unaligned store and the target doesn't support it,
+          // expand it.
+          if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+            Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
+            unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
+            if (ST->getAlignment() < ABIAlignment)
+              ExpandUnalignedStore(cast<StoreSDNode>(Node),
+                                   DAG, TLI, this);
+          }
+          break;
+        case TargetLowering::Custom: {
+          SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+          if (Res.getNode())
+            ReplaceNode(SDValue(Node, 0), Res);
+          return;
+        }
+        case TargetLowering::Promote: {
+          MVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT);
+          assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
+                 "Can only promote stores to same size type");
+          Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
+          SDValue Result =
+            DAG.getStore(Chain, dl, Value, Ptr,
+                         ST->getPointerInfo(), isVolatile,
+                         isNonTemporal, Alignment);
+          ReplaceNode(SDValue(Node, 0), Result);
+          break;
+        }
+        }
+        return;
+      }
+    } else {
+      SDValue Value = ST->getValue();
+
+      EVT StVT = ST->getMemoryVT();
+      unsigned StWidth = StVT.getSizeInBits();
+
+      if (StWidth != StVT.getStoreSizeInBits()) {
+        // Promote to a byte-sized store with upper bits zero if not
+        // storing an integral number of bytes.  For example, promote
+        // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
+        EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
+                                    StVT.getStoreSizeInBits());
+        Value = DAG.getZeroExtendInReg(Value, dl, StVT);
+        SDValue Result =
+          DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+                            NVT, isVolatile, isNonTemporal, Alignment);
+        ReplaceNode(SDValue(Node, 0), Result);
+      } else if (StWidth & (StWidth - 1)) {
+        // If not storing a power-of-2 number of bits, expand as two stores.
+        assert(!StVT.isVector() && "Unsupported truncstore!");
+        unsigned RoundWidth = 1 << Log2_32(StWidth);
+        assert(RoundWidth < StWidth);
+        unsigned ExtraWidth = StWidth - RoundWidth;
+        assert(ExtraWidth < RoundWidth);
+        assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+               "Store size not an integral number of bytes!");
+        EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+        EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+        SDValue Lo, Hi;
+        unsigned IncrementSize;
+
+        if (TLI.isLittleEndian()) {
+          // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
+          // Store the bottom RoundWidth bits.
+          Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+                                 RoundVT,
+                                 isVolatile, isNonTemporal, Alignment);
+
+          // Store the remaining ExtraWidth bits.
+          IncrementSize = RoundWidth / 8;
+          Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                             DAG.getIntPtrConstant(IncrementSize));
+          Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
+                           DAG.getConstant(RoundWidth,
+                                    TLI.getShiftAmountTy(Value.getValueType())));
+          Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
+                             ST->getPointerInfo().getWithOffset(IncrementSize),
+                                 ExtraVT, isVolatile, isNonTemporal,
+                                 MinAlign(Alignment, IncrementSize));
+        } else {
+          // Big endian - avoid unaligned stores.
+          // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
+          // Store the top RoundWidth bits.
+          Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
+                           DAG.getConstant(ExtraWidth,
+                                    TLI.getShiftAmountTy(Value.getValueType())));
+          Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
+                                 RoundVT, isVolatile, isNonTemporal, Alignment);
+
+          // Store the remaining ExtraWidth bits.
+          IncrementSize = RoundWidth / 8;
+          Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                             DAG.getIntPtrConstant(IncrementSize));
+          Lo = DAG.getTruncStore(Chain, dl, Value, Ptr,
+                              ST->getPointerInfo().getWithOffset(IncrementSize),
+                                 ExtraVT, isVolatile, isNonTemporal,
+                                 MinAlign(Alignment, IncrementSize));
+        }
+
+        // The order of the stores doesn't matter.
+        SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+        ReplaceNode(SDValue(Node, 0), Result);
+      } else {
+        switch (TLI.getTruncStoreAction(ST->getValue().getSimpleValueType(),
+                                        StVT.getSimpleVT())) {
+        default: llvm_unreachable("This action is not supported yet!");
+        case TargetLowering::Legal:
+          // If this is an unaligned store and the target doesn't support it,
+          // expand it.
+          if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+            Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
+            unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
+            if (ST->getAlignment() < ABIAlignment)
+              ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
+          }
+          break;
+        case TargetLowering::Custom: {
+          SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+          if (Res.getNode())
+            ReplaceNode(SDValue(Node, 0), Res);
+          return;
+        }
+        case TargetLowering::Expand:
+          assert(!StVT.isVector() &&
+                 "Vector Stores are handled in LegalizeVectorOps");
+
+          // TRUNCSTORE:i16 i32 -> STORE i16
+          assert(TLI.isTypeLegal(StVT) &&
+                 "Do not know how to expand this store!");
+          Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
+          SDValue Result =
+            DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+                         isVolatile, isNonTemporal, Alignment);
+          ReplaceNode(SDValue(Node, 0), Result);
+          break;
+        }
+      }
+    }
+}
+
+void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
+  LoadSDNode *LD = cast<LoadSDNode>(Node);
+  SDValue Chain = LD->getChain();  // The chain.
+  SDValue Ptr = LD->getBasePtr();  // The base pointer.
+  SDValue Value;                   // The value returned by the load op.
+  DebugLoc dl = Node->getDebugLoc();
+
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD) {
+    MVT VT = Node->getSimpleValueType(0);
+    SDValue RVal = SDValue(Node, 0);
+    SDValue RChain = SDValue(Node, 1);
+
+    switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
+    default: llvm_unreachable("This action is not supported yet!");
+    case TargetLowering::Legal:
+      // If this is an unaligned load and the target doesn't support it,
+      // expand it.
+      if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+        Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+        unsigned ABIAlignment =
+          TLI.getDataLayout()->getABITypeAlignment(Ty);
+        if (LD->getAlignment() < ABIAlignment){
+          ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain);
+        }
+      }
+      break;
+    case TargetLowering::Custom: {
+      SDValue Res = TLI.LowerOperation(RVal, DAG);
+      if (Res.getNode()) {
+        RVal = Res;
+        RChain = Res.getValue(1);
+      }
+      break;
+    }
+    case TargetLowering::Promote: {
+      MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
+      assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
+             "Can only promote loads to same size type");
+
+      SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
+                         LD->isVolatile(), LD->isNonTemporal(),
+                         LD->isInvariant(), LD->getAlignment());
+      RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res);
+      RChain = Res.getValue(1);
+      break;
+    }
+    }
+    if (RChain.getNode() != Node) {
+      assert(RVal.getNode() != Node && "Load must be completely replaced");
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain);
+      ReplacedNode(Node);
+    }
+    return;
+  }
+
+  EVT SrcVT = LD->getMemoryVT();
+  unsigned SrcWidth = SrcVT.getSizeInBits();
+  unsigned Alignment = LD->getAlignment();
+  bool isVolatile = LD->isVolatile();
+  bool isNonTemporal = LD->isNonTemporal();
+
+  if (SrcWidth != SrcVT.getStoreSizeInBits() &&
+      // Some targets pretend to have an i1 loading operation, and actually
+      // load an i8.  This trick is correct for ZEXTLOAD because the top 7
+      // bits are guaranteed to be zero; it helps the optimizers understand
+      // that these bits are zero.  It is also useful for EXTLOAD, since it
+      // tells the optimizers that those bits are undefined.  It would be
+      // nice to have an effective generic way of getting these benefits...
+      // Until such a way is found, don't insist on promoting i1 here.
+      (SrcVT != MVT::i1 ||
+       TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) {
+    // Promote to a byte-sized load if not loading an integral number of
+    // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
+    unsigned NewWidth = SrcVT.getStoreSizeInBits();
+    EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth);
+    SDValue Ch;
+
+    // The extra bits are guaranteed to be zero, since we stored them that
+    // way.  A zext load from NVT thus automatically gives zext from SrcVT.
+
+    ISD::LoadExtType NewExtType =
+      ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;
+
+    SDValue Result =
+      DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
+                     Chain, Ptr, LD->getPointerInfo(),
+                     NVT, isVolatile, isNonTemporal, Alignment);
+
+    Ch = Result.getValue(1); // The chain.
+
+    if (ExtType == ISD::SEXTLOAD)
+      // Having the top bits zero doesn't help when sign extending.
+      Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+                           Result.getValueType(),
+                           Result, DAG.getValueType(SrcVT));
+    else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType())
+      // All the top bits are guaranteed to be zero - inform the optimizers.
+      Result = DAG.getNode(ISD::AssertZext, dl,
+                           Result.getValueType(), Result,
+                           DAG.getValueType(SrcVT));
+
+    Value = Result;
+    Chain = Ch;
+  } else if (SrcWidth & (SrcWidth - 1)) {
+    // If not loading a power-of-2 number of bits, expand as two loads.
+    assert(!SrcVT.isVector() && "Unsupported extload!");
+    unsigned RoundWidth = 1 << Log2_32(SrcWidth);
+    assert(RoundWidth < SrcWidth);
+    unsigned ExtraWidth = SrcWidth - RoundWidth;
+    assert(ExtraWidth < RoundWidth);
+    assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+           "Load size not an integral number of bytes!");
+    EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+    EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+    SDValue Lo, Hi, Ch;
+    unsigned IncrementSize;
+
+    if (TLI.isLittleEndian()) {
+      // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
+      // Load the bottom RoundWidth bits.
+      Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0),
+                          Chain, Ptr,
+                          LD->getPointerInfo(), RoundVT, isVolatile,
+                          isNonTemporal, Alignment);
+
+      // Load the remaining ExtraWidth bits.
+      IncrementSize = RoundWidth / 8;
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                         DAG.getIntPtrConstant(IncrementSize));
+      Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
+                          LD->getPointerInfo().getWithOffset(IncrementSize),
+                          ExtraVT, isVolatile, isNonTemporal,
+                          MinAlign(Alignment, IncrementSize));
+
+      // Build a factor node to remember that this load is independent of
+      // the other one.
+      Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                       Hi.getValue(1));
+
+      // Move the top bits to the right place.
+      Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
+                       DAG.getConstant(RoundWidth,
+                                       TLI.getShiftAmountTy(Hi.getValueType())));
+
+      // Join the hi and lo parts.
+      Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+    } else {
+      // Big endian - avoid unaligned loads.
+      // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8
+      // Load the top RoundWidth bits.
+      Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
+                          LD->getPointerInfo(), RoundVT, isVolatile,
+                          isNonTemporal, Alignment);
+
+      // Load the remaining ExtraWidth bits.
+      IncrementSize = RoundWidth / 8;
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                         DAG.getIntPtrConstant(IncrementSize));
+      Lo = DAG.getExtLoad(ISD::ZEXTLOAD,
+                          dl, Node->getValueType(0), Chain, Ptr,
+                          LD->getPointerInfo().getWithOffset(IncrementSize),
+                          ExtraVT, isVolatile, isNonTemporal,
+                          MinAlign(Alignment, IncrementSize));
+
+      // Build a factor node to remember that this load is independent of
+      // the other one.
+      Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                       Hi.getValue(1));
+
+      // Move the top bits to the right place.
+      Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi,
+                       DAG.getConstant(ExtraWidth,
+                                       TLI.getShiftAmountTy(Hi.getValueType())));
+
+      // Join the hi and lo parts.
+      Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+    }
+
+    Chain = Ch;
+  } else {
+    bool isCustom = false;
+    switch (TLI.getLoadExtAction(ExtType, SrcVT.getSimpleVT())) {
+    default: llvm_unreachable("This action is not supported yet!");
+    case TargetLowering::Custom:
+             isCustom = true;
+             // FALLTHROUGH
+    case TargetLowering::Legal: {
+             Value = SDValue(Node, 0);
+             Chain = SDValue(Node, 1);
+
+             if (isCustom) {
+               SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+               if (Res.getNode()) {
+                 Value = Res;
+                 Chain = Res.getValue(1);
+               }
+             } else {
+               // If this is an unaligned load and the target doesn't support it,
+               // expand it.
+               if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+                 Type *Ty =
+                   LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+                 unsigned ABIAlignment =
+                   TLI.getDataLayout()->getABITypeAlignment(Ty);
+                 if (LD->getAlignment() < ABIAlignment){
+                   ExpandUnalignedLoad(cast<LoadSDNode>(Node),
+                                       DAG, TLI, Value, Chain);
+                 }
+               }
+             }
+             break;
+    }
+    case TargetLowering::Expand:
+             if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) {
+               SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr,
+                                          LD->getPointerInfo(),
+                                          LD->isVolatile(), LD->isNonTemporal(),
+                                          LD->isInvariant(), LD->getAlignment());
+               unsigned ExtendOp;
+               switch (ExtType) {
+               case ISD::EXTLOAD:
+                 ExtendOp = (SrcVT.isFloatingPoint() ?
+                             ISD::FP_EXTEND : ISD::ANY_EXTEND);
+                 break;
+               case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break;
+               case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break;
+               default: llvm_unreachable("Unexpected extend load type!");
+               }
+               Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
+               Chain = Load.getValue(1);
+               break;
+             }
+
+             assert(!SrcVT.isVector() &&
+                    "Vector Loads are handled in LegalizeVectorOps");
+
+             // FIXME: This does not work for vectors on most targets.  Sign- and
+             // zero-extend operations are currently folded into extending loads,
+             // whether they are legal or not, and then we end up here without any
+             // support for legalizing them.
+             assert(ExtType != ISD::EXTLOAD &&
+                    "EXTLOAD should always be supported!");
+             // Turn the unsupported load into an EXTLOAD followed by an explicit
+             // zero/sign extend inreg.
+             SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0),
+                                             Chain, Ptr, LD->getPointerInfo(), SrcVT,
+                                             LD->isVolatile(), LD->isNonTemporal(),
+                                             LD->getAlignment());
+             SDValue ValRes;
+             if (ExtType == ISD::SEXTLOAD)
+               ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+                                    Result.getValueType(),
+                                    Result, DAG.getValueType(SrcVT));
+             else
+               ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType());
+             Value = ValRes;
+             Chain = Result.getValue(1);
+             break;
+    }
+  }
+
+  // Since loads produce two values, make sure to remember that we legalized
+  // both of them.
+  if (Chain.getNode() != Node) {
+    assert(Value.getNode() != Node && "Load must be completely replaced");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
+    ReplacedNode(Node);
+  }
+}
+
+/// LegalizeOp - Return a legal replacement for the given operation, with
+/// all legal operands.
+void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
+  if (Node->getOpcode() == ISD::TargetConstant) // Allow illegal target nodes.
+    return;
+
+  for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+    assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
+             TargetLowering::TypeLegal &&
+           "Unexpected illegal type!");
+
+  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+    assert((TLI.getTypeAction(*DAG.getContext(),
+                              Node->getOperand(i).getValueType()) ==
+              TargetLowering::TypeLegal ||
+            Node->getOperand(i).getOpcode() == ISD::TargetConstant) &&
+           "Unexpected illegal type!");
+
+  // Figure out the correct action; the way to query this varies by opcode
+  TargetLowering::LegalizeAction Action = TargetLowering::Legal;
+  bool SimpleFinishLegalizing = true;
+  switch (Node->getOpcode()) {
+  case ISD::INTRINSIC_W_CHAIN:
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID:
+  case ISD::STACKSAVE:
+    Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
+    break;
+  case ISD::VAARG:
+    Action = TLI.getOperationAction(Node->getOpcode(),
+                                    Node->getValueType(0));
+    if (Action != TargetLowering::Promote)
+      Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
+    break;
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+  case ISD::EXTRACT_VECTOR_ELT:
+    Action = TLI.getOperationAction(Node->getOpcode(),
+                                    Node->getOperand(0).getValueType());
+    break;
+  case ISD::FP_ROUND_INREG:
+  case ISD::SIGN_EXTEND_INREG: {
+    EVT InnerType = cast<VTSDNode>(Node->getOperand(1))->getVT();
+    Action = TLI.getOperationAction(Node->getOpcode(), InnerType);
+    break;
+  }
+  case ISD::ATOMIC_STORE: {
+    Action = TLI.getOperationAction(Node->getOpcode(),
+                                    Node->getOperand(2).getValueType());
+    break;
+  }
+  case ISD::SELECT_CC:
+  case ISD::SETCC:
+  case ISD::BR_CC: {
+    unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 :
+                         Node->getOpcode() == ISD::SETCC ? 2 : 1;
+    unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0;
+    MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType();
+    ISD::CondCode CCCode =
+        cast<CondCodeSDNode>(Node->getOperand(CCOperand))->get();
+    Action = TLI.getCondCodeAction(CCCode, OpVT);
+    if (Action == TargetLowering::Legal) {
+      if (Node->getOpcode() == ISD::SELECT_CC)
+        Action = TLI.getOperationAction(Node->getOpcode(),
+                                        Node->getValueType(0));
+      else
+        Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
+    }
+    break;
+  }
+  case ISD::LOAD:
+  case ISD::STORE:
+    // FIXME: Model these properly.  LOAD and STORE are complicated, and
+    // STORE expects the unlegalized operand in some cases.
+    SimpleFinishLegalizing = false;
+    break;
+  case ISD::CALLSEQ_START:
+  case ISD::CALLSEQ_END:
+    // FIXME: This shouldn't be necessary.  These nodes have special properties
+    // dealing with the recursive nature of legalization.  Removing this
+    // special case should be done as part of making LegalizeDAG non-recursive.
+    SimpleFinishLegalizing = false;
+    break;
+  case ISD::EXTRACT_ELEMENT:
+  case ISD::FLT_ROUNDS_:
+  case ISD::SADDO:
+  case ISD::SSUBO:
+  case ISD::UADDO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+  case ISD::FPOWI:
+  case ISD::MERGE_VALUES:
+  case ISD::EH_RETURN:
+  case ISD::FRAME_TO_ARGS_OFFSET:
+  case ISD::EH_SJLJ_SETJMP:
+  case ISD::EH_SJLJ_LONGJMP:
+    // These operations lie about being legal: when they claim to be legal,
+    // they should actually be expanded.
+    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    if (Action == TargetLowering::Legal)
+      Action = TargetLowering::Expand;
+    break;
+  case ISD::INIT_TRAMPOLINE:
+  case ISD::ADJUST_TRAMPOLINE:
+  case ISD::FRAMEADDR:
+  case ISD::RETURNADDR:
+    // These operations lie about being legal: when they claim to be legal,
+    // they should actually be custom-lowered.
+    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    if (Action == TargetLowering::Legal)
+      Action = TargetLowering::Custom;
+    break;
+  case ISD::DEBUGTRAP:
+    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    if (Action == TargetLowering::Expand) {
+      // replace ISD::DEBUGTRAP with ISD::TRAP
+      SDValue NewVal;
+      NewVal = DAG.getNode(ISD::TRAP, Node->getDebugLoc(), Node->getVTList(),
+                           Node->getOperand(0));
+      ReplaceNode(Node, NewVal.getNode());
+      LegalizeOp(NewVal.getNode());
+      return;
+    }
+    break;
+
+  default:
+    if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
+      Action = TargetLowering::Legal;
+    } else {
+      Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    }
+    break;
+  }
+
+  if (SimpleFinishLegalizing) {
+    SDNode *NewNode = Node;
+    switch (Node->getOpcode()) {
+    default: break;
+    case ISD::SHL:
+    case ISD::SRL:
+    case ISD::SRA:
+    case ISD::ROTL:
+    case ISD::ROTR:
+      // Legalizing shifts/rotates requires adjusting the shift amount
+      // to the appropriate width.
+      if (!Node->getOperand(1).getValueType().isVector()) {
+        SDValue SAO =
+          DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(),
+                                    Node->getOperand(1));
+        HandleSDNode Handle(SAO);
+        LegalizeOp(SAO.getNode());
+        NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0),
+                                         Handle.getValue());
+      }
+      break;
+    case ISD::SRL_PARTS:
+    case ISD::SRA_PARTS:
+    case ISD::SHL_PARTS:
+      // Legalizing shifts/rotates requires adjusting the shift amount
+      // to the appropriate width.
+      if (!Node->getOperand(2).getValueType().isVector()) {
+        SDValue SAO =
+          DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(),
+                                    Node->getOperand(2));
+        HandleSDNode Handle(SAO);
+        LegalizeOp(SAO.getNode());
+        NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0),
+                                         Node->getOperand(1),
+                                         Handle.getValue());
+      }
+      break;
+    }
+
+    if (NewNode != Node) {
+      DAG.ReplaceAllUsesWith(Node, NewNode);
+      for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+        DAG.TransferDbgValues(SDValue(Node, i), SDValue(NewNode, i));
+      ReplacedNode(Node);
+      Node = NewNode;
+    }
+    switch (Action) {
+    case TargetLowering::Legal:
+      return;
+    case TargetLowering::Custom: {
+      // FIXME: The handling for custom lowering with multiple results is
+      // a complete mess.
+      SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+      if (Res.getNode()) {
+        SmallVector<SDValue, 8> ResultVals;
+        for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) {
+          if (e == 1)
+            ResultVals.push_back(Res);
+          else
+            ResultVals.push_back(Res.getValue(i));
+        }
+        if (Res.getNode() != Node || Res.getResNo() != 0) {
+          DAG.ReplaceAllUsesWith(Node, ResultVals.data());
+          for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+            DAG.TransferDbgValues(SDValue(Node, i), ResultVals[i]);
+          ReplacedNode(Node);
+        }
+        return;
+      }
+    }
+      // FALL THROUGH
+    case TargetLowering::Expand:
+      ExpandNode(Node);
+      return;
+    case TargetLowering::Promote:
+      PromoteNode(Node);
+      return;
+    }
+  }
+
+  switch (Node->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "NODE: ";
+    Node->dump( &DAG);
+    dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to legalize this operator!");
+
+  case ISD::CALLSEQ_START:
+  case ISD::CALLSEQ_END:
+    break;
+  case ISD::LOAD: {
+    return LegalizeLoadOps(Node);
+  }
+  case ISD::STORE: {
+    return LegalizeStoreOps(Node);
+  }
+  }
+}
+
+SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
+  SDValue Vec = Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  // Store the value to a temporary stack slot, then LOAD the returned part.
+  SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType());
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
+                            MachinePointerInfo(), false, false, 0);
+
+  // Add the offset to the index.
+  unsigned EltSize =
+      Vec.getValueType().getVectorElementType().getSizeInBits()/8;
+  Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
+                    DAG.getConstant(EltSize, Idx.getValueType()));
+
+  if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
+    Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
+  else
+    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+
+  StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr);
+
+  if (Op.getValueType().isVector())
+    return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr,MachinePointerInfo(),
+                       false, false, false, 0);
+  return DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
+                        MachinePointerInfo(),
+                        Vec.getValueType().getVectorElementType(),
+                        false, false, 0);
+}
+
+SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
+  assert(Op.getValueType().isVector() && "Non-vector insert subvector!");
+
+  SDValue Vec  = Op.getOperand(0);
+  SDValue Part = Op.getOperand(1);
+  SDValue Idx  = Op.getOperand(2);
+  DebugLoc dl  = Op.getDebugLoc();
+
+  // Store the value to a temporary stack slot, then LOAD the returned part.
+
+  SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType());
+  int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FI);
+
+  // First store the whole vector.
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+                            false, false, 0);
+
+  // Then store the inserted part.
+
+  // Add the offset to the index.
+  unsigned EltSize =
+      Vec.getValueType().getVectorElementType().getSizeInBits()/8;
+
+  Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
+                    DAG.getConstant(EltSize, Idx.getValueType()));
+
+  if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
+    Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
+  else
+    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+
+  SDValue SubStackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
+                                    StackPtr);
+
+  // Store the subvector.
+  Ch = DAG.getStore(DAG.getEntryNode(), dl, Part, SubStackPtr,
+                    MachinePointerInfo(), false, false, 0);
+
+  // Finally, load the updated vector.
+  return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo,
+                     false, false, false, 0);
+}
+
+SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
+  // We can't handle this case efficiently.  Allocate a sufficiently
+  // aligned object on the stack, store each element into it, then load
+  // the result as a vector.
+  // Create the stack frame object.
+  EVT VT = Node->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue FIPtr = DAG.CreateStackTemporary(VT);
+  int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FI);
+
+  // Emit a store of each element to the stack slot.
+  SmallVector<SDValue, 8> Stores;
+  unsigned TypeByteSize = EltVT.getSizeInBits() / 8;
+  // Store (in the right endianness) the elements to memory.
+  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+    // Ignore undef elements.
+    if (Node->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+
+    unsigned Offset = TypeByteSize*i;
+
+    SDValue Idx = DAG.getConstant(Offset, FIPtr.getValueType());
+    Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx);
+
+    // If the destination vector element type is narrower than the source
+    // element type, only store the bits necessary.
+    if (EltVT.bitsLT(Node->getOperand(i).getValueType().getScalarType())) {
+      Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
+                                         Node->getOperand(i), Idx,
+                                         PtrInfo.getWithOffset(Offset),
+                                         EltVT, false, false, 0));
+    } else
+      Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl,
+                                    Node->getOperand(i), Idx,
+                                    PtrInfo.getWithOffset(Offset),
+                                    false, false, 0));
+  }
+
+  SDValue StoreChain;
+  if (!Stores.empty())    // Not all undef elements?
+    StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             &Stores[0], Stores.size());
+  else
+    StoreChain = DAG.getEntryNode();
+
+  // Result is a load from the stack slot.
+  return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo, 
+                     false, false, false, 0);
+}
+
+SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue Tmp1 = Node->getOperand(0);
+  SDValue Tmp2 = Node->getOperand(1);
+
+  // Get the sign bit of the RHS.  First obtain a value that has the same
+  // sign as the sign bit, i.e. negative if and only if the sign bit is 1.
+  SDValue SignBit;
+  EVT FloatVT = Tmp2.getValueType();
+  EVT IVT = EVT::getIntegerVT(*DAG.getContext(), FloatVT.getSizeInBits());
+  if (TLI.isTypeLegal(IVT)) {
+    // Convert to an integer with the same sign bit.
+    SignBit = DAG.getNode(ISD::BITCAST, dl, IVT, Tmp2);
+  } else {
+    // Store the float to memory, then load the sign part out as an integer.
+    MVT LoadTy = TLI.getPointerTy();
+    // First create a temporary that is aligned for both the load and store.
+    SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy);
+    // Then store the float to it.
+    SDValue Ch =
+      DAG.getStore(DAG.getEntryNode(), dl, Tmp2, StackPtr, MachinePointerInfo(),
+                   false, false, 0);
+    if (TLI.isBigEndian()) {
+      assert(FloatVT.isByteSized() && "Unsupported floating point type!");
+      // Load out a legal integer with the same sign bit as the float.
+      SignBit = DAG.getLoad(LoadTy, dl, Ch, StackPtr, MachinePointerInfo(),
+                            false, false, false, 0);
+    } else { // Little endian
+      SDValue LoadPtr = StackPtr;
+      // The float may be wider than the integer we are going to load.  Advance
+      // the pointer so that the loaded integer will contain the sign bit.
+      unsigned Strides = (FloatVT.getSizeInBits()-1)/LoadTy.getSizeInBits();
+      unsigned ByteOffset = (Strides * LoadTy.getSizeInBits()) / 8;
+      LoadPtr = DAG.getNode(ISD::ADD, dl, LoadPtr.getValueType(),
+                            LoadPtr, DAG.getIntPtrConstant(ByteOffset));
+      // Load a legal integer containing the sign bit.
+      SignBit = DAG.getLoad(LoadTy, dl, Ch, LoadPtr, MachinePointerInfo(),
+                            false, false, false, 0);
+      // Move the sign bit to the top bit of the loaded integer.
+      unsigned BitShift = LoadTy.getSizeInBits() -
+        (FloatVT.getSizeInBits() - 8 * ByteOffset);
+      assert(BitShift < LoadTy.getSizeInBits() && "Pointer advanced wrong?");
+      if (BitShift)
+        SignBit = DAG.getNode(ISD::SHL, dl, LoadTy, SignBit,
+                              DAG.getConstant(BitShift,
+                                 TLI.getShiftAmountTy(SignBit.getValueType())));
+    }
+  }
+  // Now get the sign bit proper, by seeing whether the value is negative.
+  SignBit = DAG.getSetCC(dl, TLI.getSetCCResultType(SignBit.getValueType()),
+                         SignBit, DAG.getConstant(0, SignBit.getValueType()),
+                         ISD::SETLT);
+  // Get the absolute value of the result.
+  SDValue AbsVal = DAG.getNode(ISD::FABS, dl, Tmp1.getValueType(), Tmp1);
+  // Select between the nabs and abs value based on the sign bit of
+  // the input.
+  return DAG.getNode(ISD::SELECT, dl, AbsVal.getValueType(), SignBit,
+                     DAG.getNode(ISD::FNEG, dl, AbsVal.getValueType(), AbsVal),
+                     AbsVal);
+}
+
+void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
+                                           SmallVectorImpl<SDValue> &Results) {
+  unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+  assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+          " not tell us which reg is the stack pointer!");
+  DebugLoc dl = Node->getDebugLoc();
+  EVT VT = Node->getValueType(0);
+  SDValue Tmp1 = SDValue(Node, 0);
+  SDValue Tmp2 = SDValue(Node, 1);
+  SDValue Tmp3 = Node->getOperand(2);
+  SDValue Chain = Tmp1.getOperand(0);
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+  SDValue Size  = Tmp2.getOperand(1);
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+  Chain = SP.getValue(1);
+  unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+  unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+  if (Align > StackAlign)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP,
+                      DAG.getConstant(-(uint64_t)Align, VT));
+  Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size);       // Value
+  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1);     // Output chain
+
+  Tmp2 = DAG.getCALLSEQ_END(Chain,  DAG.getIntPtrConstant(0, true),
+                            DAG.getIntPtrConstant(0, true), SDValue());
+
+  Results.push_back(Tmp1);
+  Results.push_back(Tmp2);
+}
+
+/// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and
+/// condition code CC on the current target. This routine expands SETCC with
+/// illegal condition code into AND / OR of multiple SETCC values.
+void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
+                                                 SDValue &LHS, SDValue &RHS,
+                                                 SDValue &CC,
+                                                 DebugLoc dl) {
+  MVT OpVT = LHS.getSimpleValueType();
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+  switch (TLI.getCondCodeAction(CCCode, OpVT)) {
+  default: llvm_unreachable("Unknown condition code action!");
+  case TargetLowering::Legal:
+    // Nothing to do.
+    break;
+  case TargetLowering::Expand: {
+    ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
+    ISD::CondCode InvCC = ISD::SETCC_INVALID;
+    unsigned Opc = 0;
+    switch (CCCode) {
+    default: llvm_unreachable("Don't know how to expand this condition!");
+    case ISD::SETO: 
+        assert(TLI.getCondCodeAction(ISD::SETOEQ, OpVT)
+            == TargetLowering::Legal
+            && "If SETO is expanded, SETOEQ must be legal!");
+        CC1 = ISD::SETOEQ; CC2 = ISD::SETOEQ; Opc = ISD::AND; break;
+    case ISD::SETUO:  
+        assert(TLI.getCondCodeAction(ISD::SETUNE, OpVT)
+            == TargetLowering::Legal
+            && "If SETUO is expanded, SETUNE must be legal!");
+        CC1 = ISD::SETUNE; CC2 = ISD::SETUNE; Opc = ISD::OR;  break;
+    case ISD::SETOEQ:
+    case ISD::SETOGT:
+    case ISD::SETOGE:
+    case ISD::SETOLT:
+    case ISD::SETOLE:
+    case ISD::SETONE: 
+    case ISD::SETUEQ: 
+    case ISD::SETUNE: 
+    case ISD::SETUGT: 
+    case ISD::SETUGE: 
+    case ISD::SETULT: 
+    case ISD::SETULE:
+        // If we are floating point, assign and break, otherwise fall through.
+        if (!OpVT.isInteger()) {
+          // We can use the 4th bit to tell if we are the unordered
+          // or ordered version of the opcode.
+          CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO;
+          Opc = ((unsigned)CCCode & 0x8U) ? ISD::OR : ISD::AND;
+          CC1 = (ISD::CondCode)(((int)CCCode & 0x7) | 0x10);
+          break;
+        }
+        // Fallthrough if we are unsigned integer.
+    case ISD::SETLE:
+    case ISD::SETGT:
+    case ISD::SETGE:
+    case ISD::SETLT:
+    case ISD::SETNE:
+    case ISD::SETEQ:
+      InvCC = ISD::getSetCCSwappedOperands(CCCode);
+      if (TLI.getCondCodeAction(InvCC, OpVT) == TargetLowering::Expand) {
+        // We only support using the inverted operation and not a
+        // different manner of supporting expanding these cases.
+        llvm_unreachable("Don't know how to expand this condition!");
+      }
+      LHS = DAG.getSetCC(dl, VT, RHS, LHS, InvCC);
+      RHS = SDValue();
+      CC = SDValue();
+      return;
+    }
+    
+    SDValue SetCC1, SetCC2;
+    if (CCCode != ISD::SETO && CCCode != ISD::SETUO) {
+      // If we aren't the ordered or unorder operation,
+      // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS).
+      SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1);
+      SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2);
+    } else {
+      // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS)
+      SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1);
+      SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2);
+    }
+    LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
+    RHS = SDValue();
+    CC  = SDValue();
+    break;
+  }
+  }
+}
+
+/// EmitStackConvert - Emit a store/load combination to the stack.  This stores
+/// SrcOp to a stack slot of type SlotVT, truncating it if needed.  It then does
+/// a load from the stack slot to DestVT, extending it if needed.
+/// The resultant code need not be legal.
+SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
+                                               EVT SlotVT,
+                                               EVT DestVT,
+                                               DebugLoc dl) {
+  // Create the stack frame object.
+  unsigned SrcAlign =
+    TLI.getDataLayout()->getPrefTypeAlignment(SrcOp.getValueType().
+                                              getTypeForEVT(*DAG.getContext()));
+  SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);
+
+  FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
+  int SPFI = StackPtrFI->getIndex();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI);
+
+  unsigned SrcSize = SrcOp.getValueType().getSizeInBits();
+  unsigned SlotSize = SlotVT.getSizeInBits();
+  unsigned DestSize = DestVT.getSizeInBits();
+  Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
+  unsigned DestAlign = TLI.getDataLayout()->getPrefTypeAlignment(DestType);
+
+  // Emit a store to the stack slot.  Use a truncstore if the input value is
+  // later than DestVT.
+  SDValue Store;
+
+  if (SrcSize > SlotSize)
+    Store = DAG.getTruncStore(DAG.getEntryNode(), dl, SrcOp, FIPtr,
+                              PtrInfo, SlotVT, false, false, SrcAlign);
+  else {
+    assert(SrcSize == SlotSize && "Invalid store");
+    Store = DAG.getStore(DAG.getEntryNode(), dl, SrcOp, FIPtr,
+                         PtrInfo, false, false, SrcAlign);
+  }
+
+  // Result is a load from the stack slot.
+  if (SlotSize == DestSize)
+    return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo,
+                       false, false, false, DestAlign);
+
+  assert(SlotSize < DestSize && "Unknown extension!");
+  return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr,
+                        PtrInfo, SlotVT, false, false, DestAlign);
+}
+
+SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
+  DebugLoc dl = Node->getDebugLoc();
+  // Create a vector sized/aligned stack slot, store the value to element #0,
+  // then load the whole vector back out.
+  SDValue StackPtr = DAG.CreateStackTemporary(Node->getValueType(0));
+
+  FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(StackPtr);
+  int SPFI = StackPtrFI->getIndex();
+
+  SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), dl, Node->getOperand(0),
+                                 StackPtr,
+                                 MachinePointerInfo::getFixedStack(SPFI),
+                                 Node->getValueType(0).getVectorElementType(),
+                                 false, false, 0);
+  return DAG.getLoad(Node->getValueType(0), dl, Ch, StackPtr,
+                     MachinePointerInfo::getFixedStack(SPFI),
+                     false, false, false, 0);
+}
+
+
+/// ExpandBUILD_VECTOR - Expand a BUILD_VECTOR node on targets that don't
+/// support the operation, but do support the resultant vector type.
+SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
+  unsigned NumElems = Node->getNumOperands();
+  SDValue Value1, Value2;
+  DebugLoc dl = Node->getDebugLoc();
+  EVT VT = Node->getValueType(0);
+  EVT OpVT = Node->getOperand(0).getValueType();
+  EVT EltVT = VT.getVectorElementType();
+
+  // If the only non-undef value is the low element, turn this into a
+  // SCALAR_TO_VECTOR node.  If this is { X, X, X, X }, determine X.
+  bool isOnlyLowElement = true;
+  bool MoreThanTwoValues = false;
+  bool isConstant = true;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue V = Node->getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    if (!Value1.getNode()) {
+      Value1 = V;
+    } else if (!Value2.getNode()) {
+      if (V != Value1)
+        Value2 = V;
+    } else if (V != Value1 && V != Value2) {
+      MoreThanTwoValues = true;
+    }
+  }
+
+  if (!Value1.getNode())
+    return DAG.getUNDEF(VT);
+
+  if (isOnlyLowElement)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));
+
+  // If all elements are constants, create a load from the constant pool.
+  if (isConstant) {
+    SmallVector<Constant*, 16> CV;
+    for (unsigned i = 0, e = NumElems; i != e; ++i) {
+      if (ConstantFPSDNode *V =
+          dyn_cast<ConstantFPSDNode>(Node->getOperand(i))) {
+        CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
+      } else if (ConstantSDNode *V =
+                 dyn_cast<ConstantSDNode>(Node->getOperand(i))) {
+        if (OpVT==EltVT)
+          CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+        else {
+          // If OpVT and EltVT don't match, EltVT is not legal and the
+          // element values have been promoted/truncated earlier.  Undo this;
+          // we don't want a v16i8 to become a v16i32 for example.
+          const ConstantInt *CI = V->getConstantIntValue();
+          CV.push_back(ConstantInt::get(EltVT.getTypeForEVT(*DAG.getContext()),
+                                        CI->getZExtValue()));
+        }
+      } else {
+        assert(Node->getOperand(i).getOpcode() == ISD::UNDEF);
+        Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
+        CV.push_back(UndefValue::get(OpNTy));
+      }
+    }
+    Constant *CP = ConstantVector::get(CV);
+    SDValue CPIdx = DAG.getConstantPool(CP, TLI.getPointerTy());
+    unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                       MachinePointerInfo::getConstantPool(),
+                       false, false, false, Alignment);
+  }
+
+  if (!MoreThanTwoValues) {
+    SmallVector<int, 8> ShuffleVec(NumElems, -1);
+    for (unsigned i = 0; i < NumElems; ++i) {
+      SDValue V = Node->getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      ShuffleVec[i] = V == Value1 ? 0 : NumElems;
+    }
+    if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) {
+      // Get the splatted value into the low element of a vector register.
+      SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1);
+      SDValue Vec2;
+      if (Value2.getNode())
+        Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
+      else
+        Vec2 = DAG.getUNDEF(VT);
+
+      // Return shuffle(LowValVec, undef, <0,0,0,0>)
+      return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec.data());
+    }
+  }
+
+  // Otherwise, we can't handle this case efficiently.
+  return ExpandVectorBuildThroughStack(Node);
+}
+
+// ExpandLibCall - Expand a node into a call to a libcall.  If the result value
+// does not fit into a register, return the lo part and set the hi part to the
+// by-reg argument.  If it does fit into a single register, return the result
+// and leave the Hi part unset.
+SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
+                                            bool isSigned) {
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Node->getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+
+  Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+
+  // By default, the input chain to this libcall is the entry node of the
+  // function. If the libcall is going to be emitted as a tail call then
+  // TLI.isUsedByReturnOnly will change it to the right chain if the return
+  // node which is being folded has a non-entry input chain.
+  SDValue InChain = DAG.getEntryNode();
+
+  // isTailCall may be true since the callee does not reference caller stack
+  // frame. Check if it's in the right position.
+  SDValue TCChain = InChain;
+  bool isTailCall = TLI.isInTailCallPosition(DAG, Node, TCChain);
+  if (isTailCall)
+    InChain = TCChain;
+
+  TargetLowering::
+  CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
+                    0, TLI.getLibcallCallingConv(LC), isTailCall,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, Node->getDebugLoc());
+  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+
+  if (!CallInfo.second.getNode())
+    // It's a tailcall, return the chain (which is the DAG root).
+    return DAG.getRoot();
+
+  return CallInfo.first;
+}
+
+/// ExpandLibCall - Generate a libcall taking the given operands as arguments
+/// and returning a result of type RetVT.
+SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
+                                            const SDValue *Ops, unsigned NumOps,
+                                            bool isSigned, DebugLoc dl) {
+  TargetLowering::ArgListTy Args;
+  Args.reserve(NumOps);
+
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    Entry.Node = Ops[i];
+    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+
+  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+  TargetLowering::
+  CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
+                       false, 0, TLI.getLibcallCallingConv(LC),
+                       /*isTailCall=*/false,
+                  /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                  Callee, Args, DAG, dl);
+  std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+  return CallInfo.first;
+}
+
+// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to
+// ExpandLibCall except that the first operand is the in-chain.
+std::pair<SDValue, SDValue>
+SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
+                                         SDNode *Node,
+                                         bool isSigned) {
+  SDValue InChain = Node->getOperand(0);
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Node->getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Node->getOperand(i);
+    Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+
+  Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+  TargetLowering::
+  CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
+                    0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, Node->getDebugLoc());
+  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+  return CallInfo;
+}
+
+SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
+                                              RTLIB::Libcall Call_F32,
+                                              RTLIB::Libcall Call_F64,
+                                              RTLIB::Libcall Call_F80,
+                                              RTLIB::Libcall Call_F128,
+                                              RTLIB::Libcall Call_PPCF128) {
+  RTLIB::Libcall LC;
+  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::f32: LC = Call_F32; break;
+  case MVT::f64: LC = Call_F64; break;
+  case MVT::f80: LC = Call_F80; break;
+  case MVT::f128: LC = Call_F128; break;
+  case MVT::ppcf128: LC = Call_PPCF128; break;
+  }
+  return ExpandLibCall(LC, Node, false);
+}
+
+SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
+                                               RTLIB::Libcall Call_I8,
+                                               RTLIB::Libcall Call_I16,
+                                               RTLIB::Libcall Call_I32,
+                                               RTLIB::Libcall Call_I64,
+                                               RTLIB::Libcall Call_I128) {
+  RTLIB::Libcall LC;
+  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:   LC = Call_I8; break;
+  case MVT::i16:  LC = Call_I16; break;
+  case MVT::i32:  LC = Call_I32; break;
+  case MVT::i64:  LC = Call_I64; break;
+  case MVT::i128: LC = Call_I128; break;
+  }
+  return ExpandLibCall(LC, Node, isSigned);
+}
+
+/// isDivRemLibcallAvailable - Return true if divmod libcall is available.
+static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
+                                     const TargetLowering &TLI) {
+  RTLIB::Libcall LC;
+  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
+  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+  case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
+  }
+
+  return TLI.getLibcallName(LC) != 0;
+}
+
+/// useDivRem - Only issue divrem libcall if both quotient and remainder are
+/// needed.
+static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) {
+  // The other use might have been replaced with a divrem already.
+  unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+  unsigned OtherOpcode = 0;
+  if (isSigned)
+    OtherOpcode = isDIV ? ISD::SREM : ISD::SDIV;
+  else
+    OtherOpcode = isDIV ? ISD::UREM : ISD::UDIV;
+
+  SDValue Op0 = Node->getOperand(0);
+  SDValue Op1 = Node->getOperand(1);
+  for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
+         UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User == Node)
+      continue;
+    if ((User->getOpcode() == OtherOpcode || User->getOpcode() == DivRemOpc) &&
+        User->getOperand(0) == Op0 &&
+        User->getOperand(1) == Op1)
+      return true;
+  }
+  return false;
+}
+
+/// ExpandDivRemLibCall - Issue libcalls to __{u}divmod to compute div / rem
+/// pairs.
+void
+SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
+                                          SmallVectorImpl<SDValue> &Results) {
+  unsigned Opcode = Node->getOpcode();
+  bool isSigned = Opcode == ISD::SDIVREM;
+
+  RTLIB::Libcall LC;
+  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
+  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+  case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
+  }
+
+  // The input chain to this libcall is the entry node of the function.
+  // Legalizing the call will automatically add the previous call to the
+  // dependence.
+  SDValue InChain = DAG.getEntryNode();
+
+  EVT RetVT = Node->getValueType(0);
+  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Node->getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+
+  // Also pass the return address of the remainder.
+  SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
+  Entry.Node = FIPtr;
+  Entry.Ty = RetTy->getPointerTo();
+  Entry.isSExt = isSigned;
+  Entry.isZExt = !isSigned;
+  Args.push_back(Entry);
+
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+
+  DebugLoc dl = Node->getDebugLoc();
+  TargetLowering::
+  CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
+                    0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+  // Remainder is loaded back from the stack frame.
+  SDValue Rem = DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr,
+                            MachinePointerInfo(), false, false, false, 0);
+  Results.push_back(CallInfo.first);
+  Results.push_back(Rem);
+}
+
+/// isSinCosLibcallAvailable - Return true if sincos libcall is available.
+static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
+  RTLIB::Libcall LC;
+  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
+  case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
+  case MVT::f80:     LC = RTLIB::SINCOS_F80; break;
+  case MVT::f128:    LC = RTLIB::SINCOS_F128; break;
+  case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
+  }
+  return TLI.getLibcallName(LC) != 0;
+}
+
+/// canCombineSinCosLibcall - Return true if sincos libcall is available and
+/// can be used to combine sin and cos.
+static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI,
+                                    const TargetMachine &TM) {
+  if (!isSinCosLibcallAvailable(Node, TLI))
+    return false;
+  // GNU sin/cos functions set errno while sincos does not. Therefore
+  // combining sin and cos is only safe if unsafe-fpmath is enabled.
+  bool isGNU = Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU;
+  if (isGNU && !TM.Options.UnsafeFPMath)
+    return false;
+  return true;
+}
+
+/// useSinCos - Only issue sincos libcall if both sin and cos are
+/// needed.
+static bool useSinCos(SDNode *Node) {
+  unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN
+    ? ISD::FCOS : ISD::FSIN;
+  
+  SDValue Op0 = Node->getOperand(0);
+  for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
+       UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User == Node)
+      continue;
+    // The other user might have been turned into sincos already.
+    if (User->getOpcode() == OtherOpcode || User->getOpcode() == ISD::FSINCOS)
+      return true;
+  }
+  return false;
+}
+
+/// ExpandSinCosLibCall - Issue libcalls to sincos to compute sin / cos
+/// pairs.
+void
+SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
+                                          SmallVectorImpl<SDValue> &Results) {
+  RTLIB::Libcall LC;
+  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
+  case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
+  case MVT::f80:     LC = RTLIB::SINCOS_F80; break;
+  case MVT::f128:    LC = RTLIB::SINCOS_F128; break;
+  case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
+  }
+  
+  // The input chain to this libcall is the entry node of the function.
+  // Legalizing the call will automatically add the previous call to the
+  // dependence.
+  SDValue InChain = DAG.getEntryNode();
+  
+  EVT RetVT = Node->getValueType(0);
+  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+  
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  
+  // Pass the argument.
+  Entry.Node = Node->getOperand(0);
+  Entry.Ty = RetTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+  
+  // Pass the return address of sin.
+  SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
+  Entry.Node = SinPtr;
+  Entry.Ty = RetTy->getPointerTo();
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+  
+  // Also pass the return address of the cos.
+  SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
+  Entry.Node = CosPtr;
+  Entry.Ty = RetTy->getPointerTo();
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+  
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+  
+  DebugLoc dl = Node->getDebugLoc();
+  TargetLowering::
+  CallLoweringInfo CLI(InChain, Type::getVoidTy(*DAG.getContext()),
+                       false, false, false, false,
+                       0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
+                       /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                       Callee, Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+  Results.push_back(DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr,
+                                MachinePointerInfo(), false, false, false, 0));
+  Results.push_back(DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr,
+                                MachinePointerInfo(), false, false, false, 0));
+}
+
+/// ExpandLegalINT_TO_FP - This function is responsible for legalizing a
+/// INT_TO_FP operation of the specified operand when the target requests that
+/// we expand it.  At this point, we know that the result and operand types are
+/// legal for the target.
+SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
+                                                   SDValue Op0,
+                                                   EVT DestVT,
+                                                   DebugLoc dl) {
+  if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
+    // simple 32-bit [signed|unsigned] integer to float/double expansion
+
+    // Get the stack frame index of a 8 byte buffer.
+    SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
+
+    // word offset constant for Hi/Lo address computation
+    SDValue WordOff = DAG.getConstant(sizeof(int), TLI.getPointerTy());
+    // set up Hi and Lo (into buffer) address based on endian
+    SDValue Hi = StackSlot;
+    SDValue Lo = DAG.getNode(ISD::ADD, dl,
+                             TLI.getPointerTy(), StackSlot, WordOff);
+    if (TLI.isLittleEndian())
+      std::swap(Hi, Lo);
+
+    // if signed map to unsigned space
+    SDValue Op0Mapped;
+    if (isSigned) {
+      // constant used to invert sign bit (signed to unsigned mapping)
+      SDValue SignBit = DAG.getConstant(0x80000000u, MVT::i32);
+      Op0Mapped = DAG.getNode(ISD::XOR, dl, MVT::i32, Op0, SignBit);
+    } else {
+      Op0Mapped = Op0;
+    }
+    // store the lo of the constructed double - based on integer input
+    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl,
+                                  Op0Mapped, Lo, MachinePointerInfo(),
+                                  false, false, 0);
+    // initial hi portion of constructed double
+    SDValue InitialHi = DAG.getConstant(0x43300000u, MVT::i32);
+    // store the hi of the constructed double - biased exponent
+    SDValue Store2 = DAG.getStore(Store1, dl, InitialHi, Hi,
+                                  MachinePointerInfo(),
+                                  false, false, 0);
+    // load the constructed double
+    SDValue Load = DAG.getLoad(MVT::f64, dl, Store2, StackSlot,
+                               MachinePointerInfo(), false, false, false, 0);
+    // FP constant to bias correct the final result
+    SDValue Bias = DAG.getConstantFP(isSigned ?
+                                     BitsToDouble(0x4330000080000000ULL) :
+                                     BitsToDouble(0x4330000000000000ULL),
+                                     MVT::f64);
+    // subtract the bias
+    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias);
+    // final result
+    SDValue Result;
+    // handle final rounding
+    if (DestVT == MVT::f64) {
+      // do nothing
+      Result = Sub;
+    } else if (DestVT.bitsLT(MVT::f64)) {
+      Result = DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+                           DAG.getIntPtrConstant(0));
+    } else if (DestVT.bitsGT(MVT::f64)) {
+      Result = DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+    }
+    return Result;
+  }
+  assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
+  // Code below here assumes !isSigned without checking again.
+
+  // Implementation of unsigned i64 to f64 following the algorithm in
+  // __floatundidf in compiler_rt. This implementation has the advantage
+  // of performing rounding correctly, both in the default rounding mode
+  // and in all alternate rounding modes.
+  // TODO: Generalize this for use with other types.
+  if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f64) {
+    SDValue TwoP52 =
+      DAG.getConstant(UINT64_C(0x4330000000000000), MVT::i64);
+    SDValue TwoP84PlusTwoP52 =
+      DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), MVT::f64);
+    SDValue TwoP84 =
+      DAG.getConstant(UINT64_C(0x4530000000000000), MVT::i64);
+
+    SDValue Lo = DAG.getZeroExtendInReg(Op0, dl, MVT::i32);
+    SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0,
+                             DAG.getConstant(32, MVT::i64));
+    SDValue LoOr = DAG.getNode(ISD::OR, dl, MVT::i64, Lo, TwoP52);
+    SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84);
+    SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, LoOr);
+    SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, HiOr);
+    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt,
+                                TwoP84PlusTwoP52);
+    return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub);
+  }
+
+  // Implementation of unsigned i64 to f32.
+  // TODO: Generalize this for use with other types.
+  if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f32) {
+    // For unsigned conversions, convert them to signed conversions using the
+    // algorithm from the x86_64 __floatundidf in compiler_rt.
+    if (!isSigned) {
+      SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0);
+
+      SDValue ShiftConst =
+          DAG.getConstant(1, TLI.getShiftAmountTy(Op0.getValueType()));
+      SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst);
+      SDValue AndConst = DAG.getConstant(1, MVT::i64);
+      SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst);
+      SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, Shr);
+
+      SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Or);
+      SDValue Slow = DAG.getNode(ISD::FADD, dl, MVT::f32, SignCvt, SignCvt);
+
+      // TODO: This really should be implemented using a branch rather than a
+      // select.  We happen to get lucky and machinesink does the right
+      // thing most of the time.  This would be a good candidate for a
+      //pseudo-op, or, even better, for whole-function isel.
+      SDValue SignBitTest = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64),
+        Op0, DAG.getConstant(0, MVT::i64), ISD::SETLT);
+      return DAG.getNode(ISD::SELECT, dl, MVT::f32, SignBitTest, Slow, Fast);
+    }
+
+    // Otherwise, implement the fully general conversion.
+
+    SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
+         DAG.getConstant(UINT64_C(0xfffffffffffff800), MVT::i64));
+    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And,
+         DAG.getConstant(UINT64_C(0x800), MVT::i64));
+    SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
+         DAG.getConstant(UINT64_C(0x7ff), MVT::i64));
+    SDValue Ne = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64),
+                   And2, DAG.getConstant(UINT64_C(0), MVT::i64), ISD::SETNE);
+    SDValue Sel = DAG.getNode(ISD::SELECT, dl, MVT::i64, Ne, Or, Op0);
+    SDValue Ge = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64),
+                   Op0, DAG.getConstant(UINT64_C(0x0020000000000000), MVT::i64),
+                   ISD::SETUGE);
+    SDValue Sel2 = DAG.getNode(ISD::SELECT, dl, MVT::i64, Ge, Sel, Op0);
+    EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType());
+
+    SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2,
+                             DAG.getConstant(32, SHVT));
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sh);
+    SDValue Fcvt = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Trunc);
+    SDValue TwoP32 =
+      DAG.getConstantFP(BitsToDouble(UINT64_C(0x41f0000000000000)), MVT::f64);
+    SDValue Fmul = DAG.getNode(ISD::FMUL, dl, MVT::f64, TwoP32, Fcvt);
+    SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sel2);
+    SDValue Fcvt2 = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Lo);
+    SDValue Fadd = DAG.getNode(ISD::FADD, dl, MVT::f64, Fmul, Fcvt2);
+    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Fadd,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
+
+  SDValue SignSet = DAG.getSetCC(dl, TLI.getSetCCResultType(Op0.getValueType()),
+                                 Op0, DAG.getConstant(0, Op0.getValueType()),
+                                 ISD::SETLT);
+  SDValue Zero = DAG.getIntPtrConstant(0), Four = DAG.getIntPtrConstant(4);
+  SDValue CstOffset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(),
+                                    SignSet, Four, Zero);
+
+  // If the sign bit of the integer is set, the large number will be treated
+  // as a negative number.  To counteract this, the dynamic code adds an
+  // offset depending on the data type.
+  uint64_t FF;
+  switch (Op0.getValueType().getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unsupported integer type!");
+  case MVT::i8 : FF = 0x43800000ULL; break;  // 2^8  (as a float)
+  case MVT::i16: FF = 0x47800000ULL; break;  // 2^16 (as a float)
+  case MVT::i32: FF = 0x4F800000ULL; break;  // 2^32 (as a float)
+  case MVT::i64: FF = 0x5F800000ULL; break;  // 2^64 (as a float)
+  }
+  if (TLI.isLittleEndian()) FF <<= 32;
+  Constant *FudgeFactor = ConstantInt::get(
+                                       Type::getInt64Ty(*DAG.getContext()), FF);
+
+  SDValue CPIdx = DAG.getConstantPool(FudgeFactor, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  CPIdx = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), CPIdx, CstOffset);
+  Alignment = std::min(Alignment, 4u);
+  SDValue FudgeInReg;
+  if (DestVT == MVT::f32)
+    FudgeInReg = DAG.getLoad(MVT::f32, dl, DAG.getEntryNode(), CPIdx,
+                             MachinePointerInfo::getConstantPool(),
+                             false, false, false, Alignment);
+  else {
+    SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT,
+                                  DAG.getEntryNode(), CPIdx,
+                                  MachinePointerInfo::getConstantPool(),
+                                  MVT::f32, false, false, Alignment);
+    HandleSDNode Handle(Load);
+    LegalizeOp(Load.getNode());
+    FudgeInReg = Handle.getValue();
+  }
+
+  return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg);
+}
+
+/// PromoteLegalINT_TO_FP - This function is responsible for legalizing a
+/// *INT_TO_FP operation of the specified operand when the target requests that
+/// we promote it.  At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP
+/// operation that takes a larger input.
+SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp,
+                                                    EVT DestVT,
+                                                    bool isSigned,
+                                                    DebugLoc dl) {
+  // First step, figure out the appropriate *INT_TO_FP operation to use.
+  EVT NewInTy = LegalOp.getValueType();
+
+  unsigned OpToUse = 0;
+
+  // Scan for the appropriate larger type to use.
+  while (1) {
+    NewInTy = (MVT::SimpleValueType)(NewInTy.getSimpleVT().SimpleTy+1);
+    assert(NewInTy.isInteger() && "Ran out of possibilities!");
+
+    // If the target supports SINT_TO_FP of this type, use it.
+    if (TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, NewInTy)) {
+      OpToUse = ISD::SINT_TO_FP;
+      break;
+    }
+    if (isSigned) continue;
+
+    // If the target supports UINT_TO_FP of this type, use it.
+    if (TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, NewInTy)) {
+      OpToUse = ISD::UINT_TO_FP;
+      break;
+    }
+
+    // Otherwise, try a larger type.
+  }
+
+  // Okay, we found the operation and type to use.  Zero extend our input to the
+  // desired type then run the operation on it.
+  return DAG.getNode(OpToUse, dl, DestVT,
+                     DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                                 dl, NewInTy, LegalOp));
+}
+
+/// PromoteLegalFP_TO_INT - This function is responsible for legalizing a
+/// FP_TO_*INT operation of the specified operand when the target requests that
+/// we promote it.  At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT
+/// operation that returns a larger result.
+SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
+                                                    EVT DestVT,
+                                                    bool isSigned,
+                                                    DebugLoc dl) {
+  // First step, figure out the appropriate FP_TO*INT operation to use.
+  EVT NewOutTy = DestVT;
+
+  unsigned OpToUse = 0;
+
+  // Scan for the appropriate larger type to use.
+  while (1) {
+    NewOutTy = (MVT::SimpleValueType)(NewOutTy.getSimpleVT().SimpleTy+1);
+    assert(NewOutTy.isInteger() && "Ran out of possibilities!");
+
+    if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NewOutTy)) {
+      OpToUse = ISD::FP_TO_SINT;
+      break;
+    }
+
+    if (TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewOutTy)) {
+      OpToUse = ISD::FP_TO_UINT;
+      break;
+    }
+
+    // Otherwise, try a larger type.
+  }
+
+
+  // Okay, we found the operation and type to use.
+  SDValue Operation = DAG.getNode(OpToUse, dl, NewOutTy, LegalOp);
+
+  // Truncate the result of the extended FP_TO_*INT operation to the desired
+  // size.
+  return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);
+}
+
+/// ExpandBSWAP - Open code the operations for BSWAP of the specified operation.
+///
+SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, DebugLoc dl) {
+  EVT VT = Op.getValueType();
+  EVT SHVT = TLI.getShiftAmountTy(VT);
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unhandled Expand type in BSWAP!");
+  case MVT::i16:
+    Tmp2 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    return DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  case MVT::i32:
+    Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, SHVT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3, DAG.getConstant(0xFF0000, VT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(0xFF00, VT));
+    Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+    Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+    return DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+  case MVT::i64:
+    Tmp8 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(56, SHVT));
+    Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(40, SHVT));
+    Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, SHVT));
+    Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp4 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, SHVT));
+    Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, SHVT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, SHVT));
+    Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, SHVT));
+    Tmp7 = DAG.getNode(ISD::AND, dl, VT, Tmp7, DAG.getConstant(255ULL<<48, VT));
+    Tmp6 = DAG.getNode(ISD::AND, dl, VT, Tmp6, DAG.getConstant(255ULL<<40, VT));
+    Tmp5 = DAG.getNode(ISD::AND, dl, VT, Tmp5, DAG.getConstant(255ULL<<32, VT));
+    Tmp4 = DAG.getNode(ISD::AND, dl, VT, Tmp4, DAG.getConstant(255ULL<<24, VT));
+    Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3, DAG.getConstant(255ULL<<16, VT));
+    Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(255ULL<<8 , VT));
+    Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp7);
+    Tmp6 = DAG.getNode(ISD::OR, dl, VT, Tmp6, Tmp5);
+    Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+    Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+    Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp6);
+    Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+    return DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp4);
+  }
+}
+
+/// ExpandBitCount - Expand the specified bitcount instruction into operations.
+///
+SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
+                                             DebugLoc dl) {
+  switch (Opc) {
+  default: llvm_unreachable("Cannot expand this yet!");
+  case ISD::CTPOP: {
+    EVT VT = Op.getValueType();
+    EVT ShVT = TLI.getShiftAmountTy(VT);
+    unsigned Len = VT.getSizeInBits();
+
+    assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
+           "CTPOP not implemented for this type.");
+
+    // This is the "best" algorithm from
+    // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+
+    SDValue Mask55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), VT);
+    SDValue Mask33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), VT);
+    SDValue Mask0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), VT);
+    SDValue Mask01 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), VT);
+
+    // v = v - ((v >> 1) & 0x55555555...)
+    Op = DAG.getNode(ISD::SUB, dl, VT, Op,
+                     DAG.getNode(ISD::AND, dl, VT,
+                                 DAG.getNode(ISD::SRL, dl, VT, Op,
+                                             DAG.getConstant(1, ShVT)),
+                                 Mask55));
+    // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+    Op = DAG.getNode(ISD::ADD, dl, VT,
+                     DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
+                     DAG.getNode(ISD::AND, dl, VT,
+                                 DAG.getNode(ISD::SRL, dl, VT, Op,
+                                             DAG.getConstant(2, ShVT)),
+                                 Mask33));
+    // v = (v + (v >> 4)) & 0x0F0F0F0F...
+    Op = DAG.getNode(ISD::AND, dl, VT,
+                     DAG.getNode(ISD::ADD, dl, VT, Op,
+                                 DAG.getNode(ISD::SRL, dl, VT, Op,
+                                             DAG.getConstant(4, ShVT))),
+                     Mask0F);
+    // v = (v * 0x01010101...) >> (Len - 8)
+    Op = DAG.getNode(ISD::SRL, dl, VT,
+                     DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
+                     DAG.getConstant(Len - 8, ShVT));
+
+    return Op;
+  }
+  case ISD::CTLZ_ZERO_UNDEF:
+    // This trivially expands to CTLZ.
+    return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op);
+  case ISD::CTLZ: {
+    // for now, we do this:
+    // x = x | (x >> 1);
+    // x = x | (x >> 2);
+    // ...
+    // x = x | (x >>16);
+    // x = x | (x >>32); // for 64-bit input
+    // return popcount(~x);
+    //
+    // but see also: http://www.hackersdelight.org/HDcode/nlz.cc
+    EVT VT = Op.getValueType();
+    EVT ShVT = TLI.getShiftAmountTy(VT);
+    unsigned len = VT.getSizeInBits();
+    for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
+      SDValue Tmp3 = DAG.getConstant(1ULL << i, ShVT);
+      Op = DAG.getNode(ISD::OR, dl, VT, Op,
+                       DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3));
+    }
+    Op = DAG.getNOT(dl, Op, VT);
+    return DAG.getNode(ISD::CTPOP, dl, VT, Op);
+  }
+  case ISD::CTTZ_ZERO_UNDEF:
+    // This trivially expands to CTTZ.
+    return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op);
+  case ISD::CTTZ: {
+    // for now, we use: { return popcount(~x & (x - 1)); }
+    // unless the target has ctlz but not ctpop, in which case we use:
+    // { return 32 - nlz(~x & (x-1)); }
+    // see also http://www.hackersdelight.org/HDcode/ntz.cc
+    EVT VT = Op.getValueType();
+    SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT,
+                               DAG.getNOT(dl, Op, VT),
+                               DAG.getNode(ISD::SUB, dl, VT, Op,
+                                           DAG.getConstant(1, VT)));
+    // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
+    if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+        TLI.isOperationLegalOrCustom(ISD::CTLZ, VT))
+      return DAG.getNode(ISD::SUB, dl, VT,
+                         DAG.getConstant(VT.getSizeInBits(), VT),
+                         DAG.getNode(ISD::CTLZ, dl, VT, Tmp3));
+    return DAG.getNode(ISD::CTPOP, dl, VT, Tmp3);
+  }
+  }
+}
+
+std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
+  unsigned Opc = Node->getOpcode();
+  MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
+  RTLIB::Libcall LC;
+
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic intrinsic Expand!");
+  case ISD::ATOMIC_SWAP:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_LOCK_TEST_AND_SET_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break;
+    }
+    break;
+  case ISD::ATOMIC_CMP_SWAP:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_ADD:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_ADD_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_SUB:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_SUB_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_AND:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_AND_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_OR:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_OR_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_XOR:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_XOR_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_NAND:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_NAND_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break;
+    }
+    break;
+  }
+
+  return ExpandChainLibCall(LC, Node, false);
+}
+
+void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
+  SmallVector<SDValue, 8> Results;
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+  switch (Node->getOpcode()) {
+  case ISD::CTPOP:
+  case ISD::CTLZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+  case ISD::CTTZ:
+  case ISD::CTTZ_ZERO_UNDEF:
+    Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::BSWAP:
+    Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
+    break;
+  case ISD::FRAMEADDR:
+  case ISD::RETURNADDR:
+  case ISD::FRAME_TO_ARGS_OFFSET:
+    Results.push_back(DAG.getConstant(0, Node->getValueType(0)));
+    break;
+  case ISD::FLT_ROUNDS_:
+    Results.push_back(DAG.getConstant(1, Node->getValueType(0)));
+    break;
+  case ISD::EH_RETURN:
+  case ISD::EH_LABEL:
+  case ISD::PREFETCH:
+  case ISD::VAEND:
+  case ISD::EH_SJLJ_LONGJMP:
+    // If the target didn't expand these, there's nothing to do, so just
+    // preserve the chain and be done.
+    Results.push_back(Node->getOperand(0));
+    break;
+  case ISD::EH_SJLJ_SETJMP:
+    // If the target didn't expand this, just return 'zero' and preserve the
+    // chain.
+    Results.push_back(DAG.getConstant(0, MVT::i32));
+    Results.push_back(Node->getOperand(0));
+    break;
+  case ISD::ATOMIC_FENCE:
+  case ISD::MEMBARRIER: {
+    // If the target didn't lower this, lower it to '__sync_synchronize()' call
+    // FIXME: handle "fence singlethread" more efficiently.
+    TargetLowering::ArgListTy Args;
+    TargetLowering::
+    CallLoweringInfo CLI(Node->getOperand(0),
+                         Type::getVoidTy(*DAG.getContext()),
+                      false, false, false, false, 0, CallingConv::C,
+                      /*isTailCall=*/false,
+                      /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                      DAG.getExternalSymbol("__sync_synchronize",
+                                            TLI.getPointerTy()),
+                      Args, DAG, dl);
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+    Results.push_back(CallResult.second);
+    break;
+  }
+  case ISD::ATOMIC_LOAD: {
+    // There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP.
+    SDValue Zero = DAG.getConstant(0, Node->getValueType(0));
+    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl,
+                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
+                                 Node->getOperand(0),
+                                 Node->getOperand(1), Zero, Zero,
+                                 cast<AtomicSDNode>(Node)->getMemOperand(),
+                                 cast<AtomicSDNode>(Node)->getOrdering(),
+                                 cast<AtomicSDNode>(Node)->getSynchScope());
+    Results.push_back(Swap.getValue(0));
+    Results.push_back(Swap.getValue(1));
+    break;
+  }
+  case ISD::ATOMIC_STORE: {
+    // There is no libcall for atomic store; fake it with ATOMIC_SWAP.
+    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
+                                 Node->getOperand(0),
+                                 Node->getOperand(1), Node->getOperand(2),
+                                 cast<AtomicSDNode>(Node)->getMemOperand(),
+                                 cast<AtomicSDNode>(Node)->getOrdering(),
+                                 cast<AtomicSDNode>(Node)->getSynchScope());
+    Results.push_back(Swap.getValue(1));
+    break;
+  }
+  // By default, atomic intrinsics are marked Legal and lowered. Targets
+  // which don't support them directly, however, may want libcalls, in which
+  // case they mark them Expand, and we get here.
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_CMP_SWAP: {
+    std::pair<SDValue, SDValue> Tmp = ExpandAtomic(Node);
+    Results.push_back(Tmp.first);
+    Results.push_back(Tmp.second);
+    break;
+  }
+  case ISD::DYNAMIC_STACKALLOC:
+    ExpandDYNAMIC_STACKALLOC(Node, Results);
+    break;
+  case ISD::MERGE_VALUES:
+    for (unsigned i = 0; i < Node->getNumValues(); i++)
+      Results.push_back(Node->getOperand(i));
+    break;
+  case ISD::UNDEF: {
+    EVT VT = Node->getValueType(0);
+    if (VT.isInteger())
+      Results.push_back(DAG.getConstant(0, VT));
+    else {
+      assert(VT.isFloatingPoint() && "Unknown value type!");
+      Results.push_back(DAG.getConstantFP(0, VT));
+    }
+    break;
+  }
+  case ISD::TRAP: {
+    // If this operation is not supported, lower it to 'abort()' call
+    TargetLowering::ArgListTy Args;
+    TargetLowering::
+    CallLoweringInfo CLI(Node->getOperand(0),
+                         Type::getVoidTy(*DAG.getContext()),
+                      false, false, false, false, 0, CallingConv::C,
+                      /*isTailCall=*/false,
+                      /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                      DAG.getExternalSymbol("abort", TLI.getPointerTy()),
+                      Args, DAG, dl);
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+    Results.push_back(CallResult.second);
+    break;
+  }
+  case ISD::FP_ROUND:
+  case ISD::BITCAST:
+    Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
+                            Node->getValueType(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::FP_EXTEND:
+    Tmp1 = EmitStackConvert(Node->getOperand(0),
+                            Node->getOperand(0).getValueType(),
+                            Node->getValueType(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::SIGN_EXTEND_INREG: {
+    // NOTE: we could fall back on load/store here too for targets without
+    // SAR.  However, it is doubtful that any exist.
+    EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+    EVT VT = Node->getValueType(0);
+    EVT ShiftAmountTy = TLI.getShiftAmountTy(VT);
+    if (VT.isVector())
+      ShiftAmountTy = VT;
+    unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+                        ExtraVT.getScalarType().getSizeInBits();
+    SDValue ShiftCst = DAG.getConstant(BitsDiff, ShiftAmountTy);
+    Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
+                       Node->getOperand(0), ShiftCst);
+    Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::FP_ROUND_INREG: {
+    // The only way we can lower this is to turn it into a TRUNCSTORE,
+    // EXTLOAD pair, targeting a temporary location (a stack slot).
+
+    // NOTE: there is a choice here between constantly creating new stack
+    // slots and always reusing the same one.  We currently always create
+    // new ones, as reuse may inhibit scheduling.
+    EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+    Tmp1 = EmitStackConvert(Node->getOperand(0), ExtraVT,
+                            Node->getValueType(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
+                                Node->getOperand(0), Node->getValueType(0), dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::FP_TO_UINT: {
+    SDValue True, False;
+    EVT VT =  Node->getOperand(0).getValueType();
+    EVT NVT = Node->getValueType(0);
+    APFloat apf(DAG.EVTToAPFloatSemantics(VT),
+                APInt::getNullValue(VT.getSizeInBits()));
+    APInt x = APInt::getSignBit(NVT.getSizeInBits());
+    (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
+    Tmp1 = DAG.getConstantFP(apf, VT);
+    Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(VT),
+                        Node->getOperand(0),
+                        Tmp1, ISD::SETLT);
+    True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
+    False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
+                        DAG.getNode(ISD::FSUB, dl, VT,
+                                    Node->getOperand(0), Tmp1));
+    False = DAG.getNode(ISD::XOR, dl, NVT, False,
+                        DAG.getConstant(x, NVT));
+    Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp2, True, False);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::VAARG: {
+    const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+    EVT VT = Node->getValueType(0);
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = Node->getOperand(1);
+    unsigned Align = Node->getConstantOperandVal(3);
+
+    SDValue VAListLoad = DAG.getLoad(TLI.getPointerTy(), dl, Tmp1, Tmp2,
+                                     MachinePointerInfo(V), 
+                                     false, false, false, 0);
+    SDValue VAList = VAListLoad;
+
+    if (Align > TLI.getMinStackArgumentAlignment()) {
+      assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
+
+      VAList = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+                           DAG.getConstant(Align - 1,
+                                           TLI.getPointerTy()));
+
+      VAList = DAG.getNode(ISD::AND, dl, TLI.getPointerTy(), VAList,
+                           DAG.getConstant(-(int64_t)Align,
+                                           TLI.getPointerTy()));
+    }
+
+    // Increment the pointer, VAList, to the next vaarg
+    Tmp3 = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+                       DAG.getConstant(TLI.getDataLayout()->
+                          getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())),
+                                       TLI.getPointerTy()));
+    // Store the incremented VAList to the legalized pointer
+    Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2,
+                        MachinePointerInfo(V), false, false, 0);
+    // Load the actual argument out of the pointer VAList
+    Results.push_back(DAG.getLoad(VT, dl, Tmp3, VAList, MachinePointerInfo(),
+                                  false, false, false, 0));
+    Results.push_back(Results[0].getValue(1));
+    break;
+  }
+  case ISD::VACOPY: {
+    // This defaults to loading a pointer from the input and storing it to the
+    // output, returning the chain.
+    const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
+    const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();
+    Tmp1 = DAG.getLoad(TLI.getPointerTy(), dl, Node->getOperand(0),
+                       Node->getOperand(2), MachinePointerInfo(VS),
+                       false, false, false, 0);
+    Tmp1 = DAG.getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1),
+                        MachinePointerInfo(VD), false, false, 0);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::EXTRACT_VECTOR_ELT:
+    if (Node->getOperand(0).getValueType().getVectorNumElements() == 1)
+      // This must be an access of the only element.  Return it.
+      Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0),
+                         Node->getOperand(0));
+    else
+      Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0));
+    Results.push_back(Tmp1);
+    break;
+  case ISD::EXTRACT_SUBVECTOR:
+    Results.push_back(ExpandExtractFromVectorThroughStack(SDValue(Node, 0)));
+    break;
+  case ISD::INSERT_SUBVECTOR:
+    Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
+    break;
+  case ISD::CONCAT_VECTORS: {
+    Results.push_back(ExpandVectorBuildThroughStack(Node));
+    break;
+  }
+  case ISD::SCALAR_TO_VECTOR:
+    Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
+    break;
+  case ISD::INSERT_VECTOR_ELT:
+    Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0),
+                                              Node->getOperand(1),
+                                              Node->getOperand(2), dl));
+    break;
+  case ISD::VECTOR_SHUFFLE: {
+    SmallVector<int, 32> NewMask;
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();
+
+    EVT VT = Node->getValueType(0);
+    EVT EltVT = VT.getVectorElementType();
+    SDValue Op0 = Node->getOperand(0);
+    SDValue Op1 = Node->getOperand(1);
+    if (!TLI.isTypeLegal(EltVT)) {
+
+      EVT NewEltVT = TLI.getTypeToTransformTo(*DAG.getContext(), EltVT);
+
+      // BUILD_VECTOR operands are allowed to be wider than the element type.
+      // But if NewEltVT is smaller that EltVT the BUILD_VECTOR does not accept it
+      if (NewEltVT.bitsLT(EltVT)) {
+
+        // Convert shuffle node.
+        // If original node was v4i64 and the new EltVT is i32,
+        // cast operands to v8i32 and re-build the mask.
+
+        // Calculate new VT, the size of the new VT should be equal to original.
+        EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT, 
+                                      VT.getSizeInBits()/NewEltVT.getSizeInBits());
+        assert(NewVT.bitsEq(VT));
+
+        // cast operands to new VT
+        Op0 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op0);
+        Op1 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op1);
+
+        // Convert the shuffle mask
+        unsigned int factor = NewVT.getVectorNumElements()/VT.getVectorNumElements();
+
+        // EltVT gets smaller
+        assert(factor > 0);
+
+        for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+          if (Mask[i] < 0) {
+            for (unsigned fi = 0; fi < factor; ++fi)
+              NewMask.push_back(Mask[i]);
+          }
+          else {
+            for (unsigned fi = 0; fi < factor; ++fi)
+              NewMask.push_back(Mask[i]*factor+fi);
+          }
+        }
+        Mask = NewMask;
+        VT = NewVT;
+      }
+      EltVT = NewEltVT;
+    }
+    unsigned NumElems = VT.getVectorNumElements();
+    SmallVector<SDValue, 16> Ops;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      if (Mask[i] < 0) {
+        Ops.push_back(DAG.getUNDEF(EltVT));
+        continue;
+      }
+      unsigned Idx = Mask[i];
+      if (Idx < NumElems)
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                  Op0,
+                                  DAG.getIntPtrConstant(Idx)));
+      else
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                  Op1,
+                                  DAG.getIntPtrConstant(Idx - NumElems)));
+    }
+
+    Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
+    // We may have changed the BUILD_VECTOR type. Cast it back to the Node type.
+    Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::EXTRACT_ELEMENT: {
+    EVT OpTy = Node->getOperand(0).getValueType();
+    if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+      // 1 -> Hi
+      Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0),
+                         DAG.getConstant(OpTy.getSizeInBits()/2,
+                    TLI.getShiftAmountTy(Node->getOperand(0).getValueType())));
+      Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1);
+    } else {
+      // 0 -> Lo
+      Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0),
+                         Node->getOperand(0));
+    }
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::STACKSAVE:
+    // Expand to CopyFromReg if the target set
+    // StackPointerRegisterToSaveRestore.
+    if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
+      Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, SP,
+                                           Node->getValueType(0)));
+      Results.push_back(Results[0].getValue(1));
+    } else {
+      Results.push_back(DAG.getUNDEF(Node->getValueType(0)));
+      Results.push_back(Node->getOperand(0));
+    }
+    break;
+  case ISD::STACKRESTORE:
+    // Expand to CopyToReg if the target set
+    // StackPointerRegisterToSaveRestore.
+    if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
+      Results.push_back(DAG.getCopyToReg(Node->getOperand(0), dl, SP,
+                                         Node->getOperand(1)));
+    } else {
+      Results.push_back(Node->getOperand(0));
+    }
+    break;
+  case ISD::FCOPYSIGN:
+    Results.push_back(ExpandFCOPYSIGN(Node));
+    break;
+  case ISD::FNEG:
+    // Expand Y = FNEG(X) ->  Y = SUB -0.0, X
+    Tmp1 = DAG.getConstantFP(-0.0, Node->getValueType(0));
+    Tmp1 = DAG.getNode(ISD::FSUB, dl, Node->getValueType(0), Tmp1,
+                       Node->getOperand(0));
+    Results.push_back(Tmp1);
+    break;
+  case ISD::FABS: {
+    // Expand Y = FABS(X) -> Y = (X >u 0.0) ? X : fneg(X).
+    EVT VT = Node->getValueType(0);
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = DAG.getConstantFP(0.0, VT);
+    Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(Tmp1.getValueType()),
+                        Tmp1, Tmp2, ISD::SETUGT);
+    Tmp3 = DAG.getNode(ISD::FNEG, dl, VT, Tmp1);
+    Tmp1 = DAG.getNode(ISD::SELECT, dl, VT, Tmp2, Tmp1, Tmp3);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::FSQRT:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+                                      RTLIB::SQRT_F80, RTLIB::SQRT_F128,
+                                      RTLIB::SQRT_PPCF128));
+    break;
+  case ISD::FSIN:
+  case ISD::FCOS: {
+    EVT VT = Node->getValueType(0);
+    bool isSIN = Node->getOpcode() == ISD::FSIN;
+    // Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin /
+    // fcos which share the same operand and both are used.
+    if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) ||
+         canCombineSinCosLibcall(Node, TLI, TM))
+        && useSinCos(Node)) {
+      SDVTList VTs = DAG.getVTList(VT, VT);
+      Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0));
+      if (!isSIN)
+        Tmp1 = Tmp1.getValue(1);
+      Results.push_back(Tmp1);
+    } else if (isSIN) {
+      Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64,
+                                        RTLIB::SIN_F80, RTLIB::SIN_F128,
+                                        RTLIB::SIN_PPCF128));
+    } else {
+      Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64,
+                                        RTLIB::COS_F80, RTLIB::COS_F128,
+                                        RTLIB::COS_PPCF128));
+    }
+    break;
+  }
+  case ISD::FSINCOS:
+    // Expand into sincos libcall.
+    ExpandSinCosLibCall(Node, Results);
+    break;
+  case ISD::FLOG:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64,
+                                      RTLIB::LOG_F80, RTLIB::LOG_F128,
+                                      RTLIB::LOG_PPCF128));
+    break;
+  case ISD::FLOG2:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+                                      RTLIB::LOG2_F80, RTLIB::LOG2_F128,
+                                      RTLIB::LOG2_PPCF128));
+    break;
+  case ISD::FLOG10:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+                                      RTLIB::LOG10_F80, RTLIB::LOG10_F128,
+                                      RTLIB::LOG10_PPCF128));
+    break;
+  case ISD::FEXP:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64,
+                                      RTLIB::EXP_F80, RTLIB::EXP_F128,
+                                      RTLIB::EXP_PPCF128));
+    break;
+  case ISD::FEXP2:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+                                      RTLIB::EXP2_F80, RTLIB::EXP2_F128,
+                                      RTLIB::EXP2_PPCF128));
+    break;
+  case ISD::FTRUNC:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+                                      RTLIB::TRUNC_F80, RTLIB::TRUNC_F128,
+                                      RTLIB::TRUNC_PPCF128));
+    break;
+  case ISD::FFLOOR:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+                                      RTLIB::FLOOR_F80, RTLIB::FLOOR_F128,
+                                      RTLIB::FLOOR_PPCF128));
+    break;
+  case ISD::FCEIL:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+                                      RTLIB::CEIL_F80, RTLIB::CEIL_F128,
+                                      RTLIB::CEIL_PPCF128));
+    break;
+  case ISD::FRINT:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64,
+                                      RTLIB::RINT_F80, RTLIB::RINT_F128,
+                                      RTLIB::RINT_PPCF128));
+    break;
+  case ISD::FNEARBYINT:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32,
+                                      RTLIB::NEARBYINT_F64,
+                                      RTLIB::NEARBYINT_F80,
+                                      RTLIB::NEARBYINT_F128,
+                                      RTLIB::NEARBYINT_PPCF128));
+    break;
+  case ISD::FPOWI:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64,
+                                      RTLIB::POWI_F80, RTLIB::POWI_F128,
+                                      RTLIB::POWI_PPCF128));
+    break;
+  case ISD::FPOW:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64,
+                                      RTLIB::POW_F80, RTLIB::POW_F128,
+                                      RTLIB::POW_PPCF128));
+    break;
+  case ISD::FDIV:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64,
+                                      RTLIB::DIV_F80, RTLIB::DIV_F128,
+                                      RTLIB::DIV_PPCF128));
+    break;
+  case ISD::FREM:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64,
+                                      RTLIB::REM_F80, RTLIB::REM_F128,
+                                      RTLIB::REM_PPCF128));
+    break;
+  case ISD::FMA:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64,
+                                      RTLIB::FMA_F80, RTLIB::FMA_F128,
+                                      RTLIB::FMA_PPCF128));
+    break;
+  case ISD::FP16_TO_FP32:
+    Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
+    break;
+  case ISD::FP32_TO_FP16:
+    Results.push_back(ExpandLibCall(RTLIB::FPROUND_F32_F16, Node, false));
+    break;
+  case ISD::ConstantFP: {
+    ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
+    // Check to see if this FP immediate is already legal.
+    // If this is a legal constant, turn it into a TargetConstantFP node.
+    if (!TLI.isFPImmLegal(CFP->getValueAPF(), Node->getValueType(0)))
+      Results.push_back(ExpandConstantFP(CFP, true));
+    break;
+  }
+  case ISD::EHSELECTION: {
+    unsigned Reg = TLI.getExceptionSelectorRegister();
+    assert(Reg && "Can't expand to unknown register!");
+    Results.push_back(DAG.getCopyFromReg(Node->getOperand(1), dl, Reg,
+                                         Node->getValueType(0)));
+    Results.push_back(Results[0].getValue(1));
+    break;
+  }
+  case ISD::EXCEPTIONADDR: {
+    unsigned Reg = TLI.getExceptionPointerRegister();
+    assert(Reg && "Can't expand to unknown register!");
+    Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, Reg,
+                                         Node->getValueType(0)));
+    Results.push_back(Results[0].getValue(1));
+    break;
+  }
+  case ISD::FSUB: {
+    EVT VT = Node->getValueType(0);
+    assert(TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
+           TLI.isOperationLegalOrCustom(ISD::FNEG, VT) &&
+           "Don't know how to expand this FP subtraction!");
+    Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1));
+    Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::SUB: {
+    EVT VT = Node->getValueType(0);
+    assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
+           TLI.isOperationLegalOrCustom(ISD::XOR, VT) &&
+           "Don't know how to expand this subtraction!");
+    Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1),
+               DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT));
+    Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, VT));
+    Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1));
+    break;
+  }
+  case ISD::UREM:
+  case ISD::SREM: {
+    EVT VT = Node->getValueType(0);
+    bool isSigned = Node->getOpcode() == ISD::SREM;
+    unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV;
+    unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+    Tmp2 = Node->getOperand(0);
+    Tmp3 = Node->getOperand(1);
+    if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) ||
+        (isDivRemLibcallAvailable(Node, isSigned, TLI) &&
+         // If div is legal, it's better to do the normal expansion
+         !TLI.isOperationLegalOrCustom(DivOpc, Node->getValueType(0)) &&
+         useDivRem(Node, isSigned, false))) {
+      SDVTList VTs = DAG.getVTList(VT, VT);
+      Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
+    } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
+      // X % Y -> X-X/Y*Y
+      Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3);
+      Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3);
+      Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1);
+    } else if (isSigned)
+      Tmp1 = ExpandIntLibCall(Node, true,
+                              RTLIB::SREM_I8,
+                              RTLIB::SREM_I16, RTLIB::SREM_I32,
+                              RTLIB::SREM_I64, RTLIB::SREM_I128);
+    else
+      Tmp1 = ExpandIntLibCall(Node, false,
+                              RTLIB::UREM_I8,
+                              RTLIB::UREM_I16, RTLIB::UREM_I32,
+                              RTLIB::UREM_I64, RTLIB::UREM_I128);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::UDIV:
+  case ISD::SDIV: {
+    bool isSigned = Node->getOpcode() == ISD::SDIV;
+    unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+    EVT VT = Node->getValueType(0);
+    SDVTList VTs = DAG.getVTList(VT, VT);
+    if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) ||
+        (isDivRemLibcallAvailable(Node, isSigned, TLI) &&
+         useDivRem(Node, isSigned, true)))
+      Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0),
+                         Node->getOperand(1));
+    else if (isSigned)
+      Tmp1 = ExpandIntLibCall(Node, true,
+                              RTLIB::SDIV_I8,
+                              RTLIB::SDIV_I16, RTLIB::SDIV_I32,
+                              RTLIB::SDIV_I64, RTLIB::SDIV_I128);
+    else
+      Tmp1 = ExpandIntLibCall(Node, false,
+                              RTLIB::UDIV_I8,
+                              RTLIB::UDIV_I16, RTLIB::UDIV_I32,
+                              RTLIB::UDIV_I64, RTLIB::UDIV_I128);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::MULHU:
+  case ISD::MULHS: {
+    unsigned ExpandOpcode = Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI :
+                                                              ISD::SMUL_LOHI;
+    EVT VT = Node->getValueType(0);
+    SDVTList VTs = DAG.getVTList(VT, VT);
+    assert(TLI.isOperationLegalOrCustom(ExpandOpcode, VT) &&
+           "If this wasn't legal, it shouldn't have been created!");
+    Tmp1 = DAG.getNode(ExpandOpcode, dl, VTs, Node->getOperand(0),
+                       Node->getOperand(1));
+    Results.push_back(Tmp1.getValue(1));
+    break;
+  }
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+    // Expand into divrem libcall
+    ExpandDivRemLibCall(Node, Results);
+    break;
+  case ISD::MUL: {
+    EVT VT = Node->getValueType(0);
+    SDVTList VTs = DAG.getVTList(VT, VT);
+    // See if multiply or divide can be lowered using two-result operations.
+    // We just need the low half of the multiply; try both the signed
+    // and unsigned forms. If the target supports both SMUL_LOHI and
+    // UMUL_LOHI, form a preference by checking which forms of plain
+    // MULH it supports.
+    bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, VT);
+    bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, VT);
+    bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, VT);
+    bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, VT);
+    unsigned OpToUse = 0;
+    if (HasSMUL_LOHI && !HasMULHS) {
+      OpToUse = ISD::SMUL_LOHI;
+    } else if (HasUMUL_LOHI && !HasMULHU) {
+      OpToUse = ISD::UMUL_LOHI;
+    } else if (HasSMUL_LOHI) {
+      OpToUse = ISD::SMUL_LOHI;
+    } else if (HasUMUL_LOHI) {
+      OpToUse = ISD::UMUL_LOHI;
+    }
+    if (OpToUse) {
+      Results.push_back(DAG.getNode(OpToUse, dl, VTs, Node->getOperand(0),
+                                    Node->getOperand(1)));
+      break;
+    }
+    Tmp1 = ExpandIntLibCall(Node, false,
+                            RTLIB::MUL_I8,
+                            RTLIB::MUL_I16, RTLIB::MUL_I32,
+                            RTLIB::MUL_I64, RTLIB::MUL_I128);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::SADDO:
+  case ISD::SSUBO: {
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+    SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
+                              ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+                              LHS, RHS);
+    Results.push_back(Sum);
+    EVT OType = Node->getValueType(1);
+
+    SDValue Zero = DAG.getConstant(0, LHS.getValueType());
+
+    //   LHSSign -> LHS >= 0
+    //   RHSSign -> RHS >= 0
+    //   SumSign -> Sum >= 0
+    //
+    //   Add:
+    //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+    //   Sub:
+    //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+    //
+    SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
+    SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
+    SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
+                                      Node->getOpcode() == ISD::SADDO ?
+                                      ISD::SETEQ : ISD::SETNE);
+
+    SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE);
+    SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
+
+    SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
+    Results.push_back(Cmp);
+    break;
+  }
+  case ISD::UADDO:
+  case ISD::USUBO: {
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+    SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::UADDO ?
+                              ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+                              LHS, RHS);
+    Results.push_back(Sum);
+    Results.push_back(DAG.getSetCC(dl, Node->getValueType(1), Sum, LHS,
+                                   Node->getOpcode () == ISD::UADDO ?
+                                   ISD::SETULT : ISD::SETUGT));
+    break;
+  }
+  case ISD::UMULO:
+  case ISD::SMULO: {
+    EVT VT = Node->getValueType(0);
+    EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+    SDValue BottomHalf;
+    SDValue TopHalf;
+    static const unsigned Ops[2][3] =
+        { { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
+          { ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
+    bool isSigned = Node->getOpcode() == ISD::SMULO;
+    if (TLI.isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
+      BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+      TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
+    } else if (TLI.isOperationLegalOrCustom(Ops[isSigned][1], VT)) {
+      BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS,
+                               RHS);
+      TopHalf = BottomHalf.getValue(1);
+    } else if (TLI.isTypeLegal(EVT::getIntegerVT(*DAG.getContext(),
+                                                 VT.getSizeInBits() * 2))) {
+      LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
+      RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
+      Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
+      BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
+                               DAG.getIntPtrConstant(0));
+      TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
+                            DAG.getIntPtrConstant(1));
+    } else {
+      // We can fall back to a libcall with an illegal type for the MUL if we
+      // have a libcall big enough.
+      // Also, we can fall back to a division in some cases, but that's a big
+      // performance hit in the general case.
+      RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+      if (WideVT == MVT::i16)
+        LC = RTLIB::MUL_I16;
+      else if (WideVT == MVT::i32)
+        LC = RTLIB::MUL_I32;
+      else if (WideVT == MVT::i64)
+        LC = RTLIB::MUL_I64;
+      else if (WideVT == MVT::i128)
+        LC = RTLIB::MUL_I128;
+      assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");
+
+      // The high part is obtained by SRA'ing all but one of the bits of low
+      // part.
+      unsigned LoSize = VT.getSizeInBits();
+      SDValue HiLHS = DAG.getNode(ISD::SRA, dl, VT, RHS,
+                                DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+      SDValue HiRHS = DAG.getNode(ISD::SRA, dl, VT, LHS,
+                                DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+
+      // Here we're passing the 2 arguments explicitly as 4 arguments that are
+      // pre-lowered to the correct types. This all depends upon WideVT not
+      // being a legal type for the architecture and thus has to be split to
+      // two arguments.
+      SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
+      SDValue Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl);
+      BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
+                               DAG.getIntPtrConstant(0));
+      TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
+                            DAG.getIntPtrConstant(1));
+      // Ret is a node with an illegal type. Because such things are not
+      // generally permitted during this phase of legalization, delete the
+      // node. The above EXTRACT_ELEMENT nodes should have been folded.
+      DAG.DeleteNode(Ret.getNode());
+    }
+
+    if (isSigned) {
+      Tmp1 = DAG.getConstant(VT.getSizeInBits() - 1,
+                             TLI.getShiftAmountTy(BottomHalf.getValueType()));
+      Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, Tmp1);
+      TopHalf = DAG.getSetCC(dl, TLI.getSetCCResultType(VT), TopHalf, Tmp1,
+                             ISD::SETNE);
+    } else {
+      TopHalf = DAG.getSetCC(dl, TLI.getSetCCResultType(VT), TopHalf,
+                             DAG.getConstant(0, VT), ISD::SETNE);
+    }
+    Results.push_back(BottomHalf);
+    Results.push_back(TopHalf);
+    break;
+  }
+  case ISD::BUILD_PAIR: {
+    EVT PairTy = Node->getValueType(0);
+    Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, PairTy, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1));
+    Tmp2 = DAG.getNode(ISD::SHL, dl, PairTy, Tmp2,
+                       DAG.getConstant(PairTy.getSizeInBits()/2,
+                                       TLI.getShiftAmountTy(PairTy)));
+    Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2));
+    break;
+  }
+  case ISD::SELECT:
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = Node->getOperand(1);
+    Tmp3 = Node->getOperand(2);
+    if (Tmp1.getOpcode() == ISD::SETCC) {
+      Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1),
+                             Tmp2, Tmp3,
+                             cast<CondCodeSDNode>(Tmp1.getOperand(2))->get());
+    } else {
+      Tmp1 = DAG.getSelectCC(dl, Tmp1,
+                             DAG.getConstant(0, Tmp1.getValueType()),
+                             Tmp2, Tmp3, ISD::SETNE);
+    }
+    Results.push_back(Tmp1);
+    break;
+  case ISD::BR_JT: {
+    SDValue Chain = Node->getOperand(0);
+    SDValue Table = Node->getOperand(1);
+    SDValue Index = Node->getOperand(2);
+
+    EVT PTy = TLI.getPointerTy();
+
+    const DataLayout &TD = *TLI.getDataLayout();
+    unsigned EntrySize =
+      DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
+
+    Index = DAG.getNode(ISD::MUL, dl, PTy,
+                        Index, DAG.getConstant(EntrySize, PTy));
+    SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+
+    EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
+    SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr,
+                                MachinePointerInfo::getJumpTable(), MemVT,
+                                false, false, 0);
+    Addr = LD;
+    if (TM.getRelocationModel() == Reloc::PIC_) {
+      // For PIC, the sequence is:
+      // BRIND(load(Jumptable + index) + RelocBase)
+      // RelocBase can be JumpTable, GOT or some sort of global base.
+      Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr,
+                          TLI.getPICJumpTableRelocBase(Table, DAG));
+    }
+    Tmp1 = DAG.getNode(ISD::BRIND, dl, MVT::Other, LD.getValue(1), Addr);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::BRCOND:
+    // Expand brcond's setcc into its constituent parts and create a BR_CC
+    // Node.
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = Node->getOperand(1);
+    if (Tmp2.getOpcode() == ISD::SETCC) {
+      Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other,
+                         Tmp1, Tmp2.getOperand(2),
+                         Tmp2.getOperand(0), Tmp2.getOperand(1),
+                         Node->getOperand(2));
+    } else {
+      // We test only the i1 bit.  Skip the AND if UNDEF.
+      Tmp3 = (Tmp2.getOpcode() == ISD::UNDEF) ? Tmp2 :
+        DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2,
+                    DAG.getConstant(1, Tmp2.getValueType()));
+      Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
+                         DAG.getCondCode(ISD::SETNE), Tmp3,
+                         DAG.getConstant(0, Tmp3.getValueType()),
+                         Node->getOperand(2));
+    }
+    Results.push_back(Tmp1);
+    break;
+  case ISD::SETCC: {
+    Tmp1 = Node->getOperand(0);
+    Tmp2 = Node->getOperand(1);
+    Tmp3 = Node->getOperand(2);
+    LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2, Tmp3, dl);
+
+    // If we expanded the SETCC into an AND/OR, return the new node
+    if (Tmp2.getNode() == 0) {
+      Results.push_back(Tmp1);
+      break;
+    }
+
+    // Otherwise, SETCC for the given comparison type must be completely
+    // illegal; expand it into a SELECT_CC.
+    EVT VT = Node->getValueType(0);
+    int TrueValue;
+    switch (TLI.getBooleanContents(VT.isVector())) {
+    case TargetLowering::ZeroOrOneBooleanContent:
+    case TargetLowering::UndefinedBooleanContent:
+      TrueValue = 1;
+      break;
+    case TargetLowering::ZeroOrNegativeOneBooleanContent:
+      TrueValue = -1;
+      break;
+    }
+    Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2,
+                       DAG.getConstant(TrueValue, VT), DAG.getConstant(0, VT),
+                       Tmp3);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::SELECT_CC: {
+    Tmp1 = Node->getOperand(0);   // LHS
+    Tmp2 = Node->getOperand(1);   // RHS
+    Tmp3 = Node->getOperand(2);   // True
+    Tmp4 = Node->getOperand(3);   // False
+    SDValue CC = Node->getOperand(4);
+
+    LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp1.getValueType()),
+                          Tmp1, Tmp2, CC, dl);
+
+    assert(!Tmp2.getNode() && "Can't legalize SELECT_CC with legal condition!");
+    Tmp2 = DAG.getConstant(0, Tmp1.getValueType());
+    CC = DAG.getCondCode(ISD::SETNE);
+    Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2,
+                       Tmp3, Tmp4, CC);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::BR_CC: {
+    Tmp1 = Node->getOperand(0);              // Chain
+    Tmp2 = Node->getOperand(2);              // LHS
+    Tmp3 = Node->getOperand(3);              // RHS
+    Tmp4 = Node->getOperand(1);              // CC
+
+    LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp2.getValueType()),
+                          Tmp2, Tmp3, Tmp4, dl);
+
+    assert(!Tmp3.getNode() && "Can't legalize BR_CC with legal condition!");
+    Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
+    Tmp4 = DAG.getCondCode(ISD::SETNE);
+    Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2,
+                       Tmp3, Node->getOperand(4));
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::BUILD_VECTOR:
+    Results.push_back(ExpandBUILD_VECTOR(Node));
+    break;
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL: {
+    // Scalarize vector SRA/SRL/SHL.
+    EVT VT = Node->getValueType(0);
+    assert(VT.isVector() && "Unable to legalize non-vector shift");
+    assert(TLI.isTypeLegal(VT.getScalarType())&& "Element type must be legal");
+    unsigned NumElem = VT.getVectorNumElements();
+
+    SmallVector<SDValue, 8> Scalars;
+    for (unsigned Idx = 0; Idx < NumElem; Idx++) {
+      SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                               VT.getScalarType(),
+                               Node->getOperand(0), DAG.getIntPtrConstant(Idx));
+      SDValue Sh = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                               VT.getScalarType(),
+                               Node->getOperand(1), DAG.getIntPtrConstant(Idx));
+      Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
+                                    VT.getScalarType(), Ex, Sh));
+    }
+    SDValue Result =
+      DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0),
+                  &Scalars[0], Scalars.size());
+    ReplaceNode(SDValue(Node, 0), Result);
+    break;
+  }
+  case ISD::GLOBAL_OFFSET_TABLE:
+  case ISD::GlobalAddress:
+  case ISD::GlobalTLSAddress:
+  case ISD::ExternalSymbol:
+  case ISD::ConstantPool:
+  case ISD::JumpTable:
+  case ISD::INTRINSIC_W_CHAIN:
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID:
+    // FIXME: Custom lowering for these operations shouldn't return null!
+    break;
+  }
+
+  // Replace the original node with the legalized result.
+  if (!Results.empty())
+    ReplaceNode(Node, Results.data());
+}
+
+void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
+  SmallVector<SDValue, 8> Results;
+  MVT OVT = Node->getSimpleValueType(0);
+  if (Node->getOpcode() == ISD::UINT_TO_FP ||
+      Node->getOpcode() == ISD::SINT_TO_FP ||
+      Node->getOpcode() == ISD::SETCC) {
+    OVT = Node->getOperand(0).getSimpleValueType();
+  }
+  MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT);
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue Tmp1, Tmp2, Tmp3;
+  switch (Node->getOpcode()) {
+  case ISD::CTTZ:
+  case ISD::CTTZ_ZERO_UNDEF:
+  case ISD::CTLZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+  case ISD::CTPOP:
+    // Zero extend the argument.
+    Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+    // Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
+    // already the correct result.
+    Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
+    if (Node->getOpcode() == ISD::CTTZ) {
+      // FIXME: This should set a bit in the zero extended value instead.
+      Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT),
+                          Tmp1, DAG.getConstant(NVT.getSizeInBits(), NVT),
+                          ISD::SETEQ);
+      Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp2,
+                          DAG.getConstant(OVT.getSizeInBits(), NVT), Tmp1);
+    } else if (Node->getOpcode() == ISD::CTLZ ||
+               Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
+      // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
+      Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
+                          DAG.getConstant(NVT.getSizeInBits() -
+                                          OVT.getSizeInBits(), NVT));
+    }
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+    break;
+  case ISD::BSWAP: {
+    unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
+    Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+    Tmp1 = DAG.getNode(ISD::BSWAP, dl, NVT, Tmp1);
+    Tmp1 = DAG.getNode(ISD::SRL, dl, NVT, Tmp1,
+                          DAG.getConstant(DiffBits, TLI.getShiftAmountTy(NVT)));
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:
+    Tmp1 = PromoteLegalFP_TO_INT(Node->getOperand(0), Node->getValueType(0),
+                                 Node->getOpcode() == ISD::FP_TO_SINT, dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+    Tmp1 = PromoteLegalINT_TO_FP(Node->getOperand(0), Node->getValueType(0),
+                                 Node->getOpcode() == ISD::SINT_TO_FP, dl);
+    Results.push_back(Tmp1);
+    break;
+  case ISD::VAARG: {
+    SDValue Chain = Node->getOperand(0); // Get the chain.
+    SDValue Ptr = Node->getOperand(1); // Get the pointer.
+
+    unsigned TruncOp;
+    if (OVT.isVector()) {
+      TruncOp = ISD::BITCAST;
+    } else {
+      assert(OVT.isInteger()
+        && "VAARG promotion is supported only for vectors or integer types");
+      TruncOp = ISD::TRUNCATE;
+    }
+
+    // Perform the larger operation, then convert back
+    Tmp1 = DAG.getVAArg(NVT, dl, Chain, Ptr, Node->getOperand(2),
+             Node->getConstantOperandVal(3));
+    Chain = Tmp1.getValue(1);
+
+    Tmp2 = DAG.getNode(TruncOp, dl, OVT, Tmp1);
+
+    // Modified the chain result - switch anything that used the old chain to
+    // use the new one.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp2);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
+    ReplacedNode(Node);
+    break;
+  }
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    unsigned ExtOp, TruncOp;
+    if (OVT.isVector()) {
+      ExtOp   = ISD::BITCAST;
+      TruncOp = ISD::BITCAST;
+    } else {
+      assert(OVT.isInteger() && "Cannot promote logic operation");
+      ExtOp   = ISD::ANY_EXTEND;
+      TruncOp = ISD::TRUNCATE;
+    }
+    // Promote each of the values to the new type.
+    Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+    // Perform the larger operation, then convert back
+    Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
+    Results.push_back(DAG.getNode(TruncOp, dl, OVT, Tmp1));
+    break;
+  }
+  case ISD::SELECT: {
+    unsigned ExtOp, TruncOp;
+    if (Node->getValueType(0).isVector()) {
+      ExtOp   = ISD::BITCAST;
+      TruncOp = ISD::BITCAST;
+    } else if (Node->getValueType(0).isInteger()) {
+      ExtOp   = ISD::ANY_EXTEND;
+      TruncOp = ISD::TRUNCATE;
+    } else {
+      ExtOp   = ISD::FP_EXTEND;
+      TruncOp = ISD::FP_ROUND;
+    }
+    Tmp1 = Node->getOperand(0);
+    // Promote each of the values to the new type.
+    Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+    Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
+    // Perform the larger operation, then round down.
+    Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp1, Tmp2, Tmp3);
+    if (TruncOp != ISD::FP_ROUND)
+      Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
+    else
+      Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1,
+                         DAG.getIntPtrConstant(0));
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::VECTOR_SHUFFLE: {
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();
+
+    // Cast the two input vectors.
+    Tmp1 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(1));
+
+    // Convert the shuffle mask to the right # elements.
+    Tmp1 = ShuffleWithNarrowerEltType(NVT, OVT, dl, Tmp1, Tmp2, Mask);
+    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OVT, Tmp1);
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::SETCC: {
+    unsigned ExtOp = ISD::FP_EXTEND;
+    if (NVT.isInteger()) {
+      ISD::CondCode CCCode =
+        cast<CondCodeSDNode>(Node->getOperand(2))->get();
+      ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    }
+    Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+    Results.push_back(DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
+                                  Tmp1, Tmp2, Node->getOperand(2)));
+    break;
+  }
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::FPOW: {
+    Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
+    Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
+    Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
+                                  Tmp3, DAG.getIntPtrConstant(0)));
+    break;
+  }
+  case ISD::FLOG2:
+  case ISD::FEXP2:
+  case ISD::FLOG:
+  case ISD::FEXP: {
+    Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+    Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
+    Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
+                                  Tmp2, DAG.getIntPtrConstant(0)));
+    break;
+  }
+  }
+
+  // Replace the original node with the legalized result.
+  if (!Results.empty())
+    ReplaceNode(Node, Results.data());
+}
+
+// SelectionDAG::Legalize - This is the entry point for the file.
+//
+void SelectionDAG::Legalize() {
+  /// run - This is the main entry point to this class.
+  ///
+  SelectionDAGLegalize(*this).LegalizeDAG();
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
new file mode 100644
index 000000000000..de217d8571ff
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -0,0 +1,1451 @@
+//===-------- LegalizeFloatTypes.cpp - Legalization of float types --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements float type expansion and softening for LegalizeTypes.
+// Softening is the act of turning a computation in an illegal floating point
+// type into a computation in an integer type of the same size; also known as
+// "soft float".  For example, turning f32 arithmetic into operations using i32.
+// The resulting integer value is the same as what you would get by performing
+// the floating point operation and bitcasting the result to the integer type.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type.  For example,
+// implementing ppcf128 arithmetic in two f64 registers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// GetFPLibCall - Return the right libcall for the given floating point type.
+static RTLIB::Libcall GetFPLibCall(EVT VT,
+                                   RTLIB::Libcall Call_F32,
+                                   RTLIB::Libcall Call_F64,
+                                   RTLIB::Libcall Call_F80,
+                                   RTLIB::Libcall Call_F128,
+                                   RTLIB::Libcall Call_PPCF128) {
+  return
+    VT == MVT::f32 ? Call_F32 :
+    VT == MVT::f64 ? Call_F64 :
+    VT == MVT::f80 ? Call_F80 :
+    VT == MVT::f128 ? Call_F128 :
+    VT == MVT::ppcf128 ? Call_PPCF128 :
+    RTLIB::UNKNOWN_LIBCALL;
+}
+
+//===----------------------------------------------------------------------===//
+//  Result Float to Integer Conversion.
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue R = SDValue();
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "SoftenFloatResult #" << ResNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to soften the result of this operator!");
+
+    case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break;
+    case ISD::BITCAST:     R = SoftenFloatRes_BITCAST(N); break;
+    case ISD::BUILD_PAIR:  R = SoftenFloatRes_BUILD_PAIR(N); break;
+    case ISD::ConstantFP:
+      R = SoftenFloatRes_ConstantFP(cast<ConstantFPSDNode>(N));
+      break;
+    case ISD::EXTRACT_VECTOR_ELT:
+      R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break;
+    case ISD::FABS:        R = SoftenFloatRes_FABS(N); break;
+    case ISD::FADD:        R = SoftenFloatRes_FADD(N); break;
+    case ISD::FCEIL:       R = SoftenFloatRes_FCEIL(N); break;
+    case ISD::FCOPYSIGN:   R = SoftenFloatRes_FCOPYSIGN(N); break;
+    case ISD::FCOS:        R = SoftenFloatRes_FCOS(N); break;
+    case ISD::FDIV:        R = SoftenFloatRes_FDIV(N); break;
+    case ISD::FEXP:        R = SoftenFloatRes_FEXP(N); break;
+    case ISD::FEXP2:       R = SoftenFloatRes_FEXP2(N); break;
+    case ISD::FFLOOR:      R = SoftenFloatRes_FFLOOR(N); break;
+    case ISD::FLOG:        R = SoftenFloatRes_FLOG(N); break;
+    case ISD::FLOG2:       R = SoftenFloatRes_FLOG2(N); break;
+    case ISD::FLOG10:      R = SoftenFloatRes_FLOG10(N); break;
+    case ISD::FMA:         R = SoftenFloatRes_FMA(N); break;
+    case ISD::FMUL:        R = SoftenFloatRes_FMUL(N); break;
+    case ISD::FNEARBYINT:  R = SoftenFloatRes_FNEARBYINT(N); break;
+    case ISD::FNEG:        R = SoftenFloatRes_FNEG(N); break;
+    case ISD::FP_EXTEND:   R = SoftenFloatRes_FP_EXTEND(N); break;
+    case ISD::FP_ROUND:    R = SoftenFloatRes_FP_ROUND(N); break;
+    case ISD::FP16_TO_FP32:R = SoftenFloatRes_FP16_TO_FP32(N); break;
+    case ISD::FPOW:        R = SoftenFloatRes_FPOW(N); break;
+    case ISD::FPOWI:       R = SoftenFloatRes_FPOWI(N); break;
+    case ISD::FREM:        R = SoftenFloatRes_FREM(N); break;
+    case ISD::FRINT:       R = SoftenFloatRes_FRINT(N); break;
+    case ISD::FSIN:        R = SoftenFloatRes_FSIN(N); break;
+    case ISD::FSQRT:       R = SoftenFloatRes_FSQRT(N); break;
+    case ISD::FSUB:        R = SoftenFloatRes_FSUB(N); break;
+    case ISD::FTRUNC:      R = SoftenFloatRes_FTRUNC(N); break;
+    case ISD::LOAD:        R = SoftenFloatRes_LOAD(N); break;
+    case ISD::SELECT:      R = SoftenFloatRes_SELECT(N); break;
+    case ISD::SELECT_CC:   R = SoftenFloatRes_SELECT_CC(N); break;
+    case ISD::SINT_TO_FP:
+    case ISD::UINT_TO_FP:  R = SoftenFloatRes_XINT_TO_FP(N); break;
+    case ISD::UNDEF:       R = SoftenFloatRes_UNDEF(N); break;
+    case ISD::VAARG:       R = SoftenFloatRes_VAARG(N); break;
+  }
+
+  // If R is null, the sub-method took care of registering the result.
+  if (R.getNode())
+    SetSoftenedFloat(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) {
+  return BitConvertToInteger(N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_MERGE_VALUES(SDNode *N,
+                                                      unsigned ResNo) {
+  SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+  return BitConvertToInteger(Op);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) {
+  // Convert the inputs to integers, and build a new pair out of them.
+  return DAG.getNode(ISD::BUILD_PAIR, N->getDebugLoc(),
+                     TLI.getTypeToTransformTo(*DAG.getContext(),
+                                              N->getValueType(0)),
+                     BitConvertToInteger(N->getOperand(0)),
+                     BitConvertToInteger(N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(ConstantFPSDNode *N) {
+  return DAG.getConstant(N->getValueAPF().bitcastToAPInt(),
+                         TLI.getTypeToTransformTo(*DAG.getContext(),
+                                                  N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+  SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+                     NewOp.getValueType().getVectorElementType(),
+                     NewOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned Size = NVT.getSizeInBits();
+
+  // Mask = ~(1 << (Size-1))
+  APInt API = APInt::getAllOnesValue(Size);
+  API.clearBit(Size-1);
+  SDValue Mask = DAG.getConstant(API, NVT);
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return DAG.getNode(ISD::AND, N->getDebugLoc(), NVT, Op, Mask);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::ADD_F32,
+                                           RTLIB::ADD_F64,
+                                           RTLIB::ADD_F80,
+                                           RTLIB::ADD_F128,
+                                           RTLIB::ADD_PPCF128),
+                         NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::CEIL_F32,
+                                           RTLIB::CEIL_F64,
+                                           RTLIB::CEIL_F80,
+                                           RTLIB::CEIL_F128,
+                                           RTLIB::CEIL_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
+  SDValue LHS = GetSoftenedFloat(N->getOperand(0));
+  SDValue RHS = BitConvertToInteger(N->getOperand(1));
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT LVT = LHS.getValueType();
+  EVT RVT = RHS.getValueType();
+
+  unsigned LSize = LVT.getSizeInBits();
+  unsigned RSize = RVT.getSizeInBits();
+
+  // First get the sign bit of second operand.
+  SDValue SignBit = DAG.getNode(ISD::SHL, dl, RVT, DAG.getConstant(1, RVT),
+                                  DAG.getConstant(RSize - 1,
+                                                  TLI.getShiftAmountTy(RVT)));
+  SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
+
+  // Shift right or sign-extend it if the two operands have different types.
+  int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
+  if (SizeDiff > 0) {
+    SignBit = DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+                          DAG.getConstant(SizeDiff,
+                                 TLI.getShiftAmountTy(SignBit.getValueType())));
+    SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
+  } else if (SizeDiff < 0) {
+    SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
+    SignBit = DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+                          DAG.getConstant(-SizeDiff,
+                                 TLI.getShiftAmountTy(SignBit.getValueType())));
+  }
+
+  // Clear the sign bit of the first operand.
+  SDValue Mask = DAG.getNode(ISD::SHL, dl, LVT, DAG.getConstant(1, LVT),
+                               DAG.getConstant(LSize - 1,
+                                               TLI.getShiftAmountTy(LVT)));
+  Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, LVT));
+  LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::COS_F32,
+                                           RTLIB::COS_F64,
+                                           RTLIB::COS_F80,
+                                           RTLIB::COS_F128,
+                                           RTLIB::COS_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::DIV_F32,
+                                           RTLIB::DIV_F64,
+                                           RTLIB::DIV_F80,
+                                           RTLIB::DIV_F128,
+                                           RTLIB::DIV_PPCF128),
+                         NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::EXP_F32,
+                                           RTLIB::EXP_F64,
+                                           RTLIB::EXP_F80,
+                                           RTLIB::EXP_F128,
+                                           RTLIB::EXP_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::EXP2_F32,
+                                           RTLIB::EXP2_F64,
+                                           RTLIB::EXP2_F80,
+                                           RTLIB::EXP2_F128,
+                                           RTLIB::EXP2_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::FLOOR_F32,
+                                           RTLIB::FLOOR_F64,
+                                           RTLIB::FLOOR_F80,
+                                           RTLIB::FLOOR_F128,
+                                           RTLIB::FLOOR_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::LOG_F32,
+                                           RTLIB::LOG_F64,
+                                           RTLIB::LOG_F80,
+                                           RTLIB::LOG_F128,
+                                           RTLIB::LOG_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::LOG2_F32,
+                                           RTLIB::LOG2_F64,
+                                           RTLIB::LOG2_F80,
+                                           RTLIB::LOG2_F128,
+                                           RTLIB::LOG2_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::LOG10_F32,
+                                           RTLIB::LOG10_F64,
+                                           RTLIB::LOG10_F80,
+                                           RTLIB::LOG10_F128,
+                                           RTLIB::LOG10_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)),
+                     GetSoftenedFloat(N->getOperand(2)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::FMA_F32,
+                                           RTLIB::FMA_F64,
+                                           RTLIB::FMA_F80,
+                                           RTLIB::FMA_F128,
+                                           RTLIB::FMA_PPCF128),
+                         NVT, Ops, 3, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::MUL_F32,
+                                           RTLIB::MUL_F64,
+                                           RTLIB::MUL_F80,
+                                           RTLIB::MUL_F128,
+                                           RTLIB::MUL_PPCF128),
+                         NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::NEARBYINT_F32,
+                                           RTLIB::NEARBYINT_F64,
+                                           RTLIB::NEARBYINT_F80,
+                                           RTLIB::NEARBYINT_F128,
+                                           RTLIB::NEARBYINT_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  // Expand Y = FNEG(X) -> Y = SUB -0.0, X
+  SDValue Ops[2] = { DAG.getConstantFP(-0.0, N->getValueType(0)),
+                     GetSoftenedFloat(N->getOperand(0)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::SUB_F32,
+                                           RTLIB::SUB_F64,
+                                           RTLIB::SUB_F80,
+                                           RTLIB::SUB_F128,
+                                           RTLIB::SUB_PPCF128),
+                         NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = N->getOperand(0);
+  RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
+  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+// FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
+// nodes?
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP32(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = N->getOperand(0);
+  return TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, NVT, &Op, 1, false,
+                         N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = N->getOperand(0);
+  RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
+  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::POW_F32,
+                                           RTLIB::POW_F64,
+                                           RTLIB::POW_F80,
+                                           RTLIB::POW_F128,
+                                           RTLIB::POW_PPCF128),
+                         NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
+  assert(N->getOperand(1).getValueType() == MVT::i32 &&
+         "Unsupported power type!");
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), N->getOperand(1) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::POWI_F32,
+                                           RTLIB::POWI_F64,
+                                           RTLIB::POWI_F80,
+                                           RTLIB::POWI_F128,
+                                           RTLIB::POWI_PPCF128),
+                         NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::REM_F32,
+                                           RTLIB::REM_F64,
+                                           RTLIB::REM_F80,
+                                           RTLIB::REM_F128,
+                                           RTLIB::REM_PPCF128),
+                         NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::RINT_F32,
+                                           RTLIB::RINT_F64,
+                                           RTLIB::RINT_F80,
+                                           RTLIB::RINT_F128,
+                                           RTLIB::RINT_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::SIN_F32,
+                                           RTLIB::SIN_F64,
+                                           RTLIB::SIN_F80,
+                                           RTLIB::SIN_F128,
+                                           RTLIB::SIN_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::SQRT_F32,
+                                           RTLIB::SQRT_F64,
+                                           RTLIB::SQRT_F80,
+                                           RTLIB::SQRT_F128,
+                                           RTLIB::SQRT_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+                     GetSoftenedFloat(N->getOperand(1)) };
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::SUB_F32,
+                                           RTLIB::SUB_F64,
+                                           RTLIB::SUB_F80,
+                                           RTLIB::SUB_F128,
+                                           RTLIB::SUB_PPCF128),
+                         NVT, Ops, 2, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::TRUNC_F32,
+                                           RTLIB::TRUNC_F64,
+                                           RTLIB::TRUNC_F80,
+                                           RTLIB::TRUNC_F128,
+                                           RTLIB::TRUNC_PPCF128),
+                         NVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
+  LoadSDNode *L = cast<LoadSDNode>(N);
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue NewL;
+  if (L->getExtensionType() == ISD::NON_EXTLOAD) {
+    NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(),
+                       NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(),
+                       L->getPointerInfo(), NVT, L->isVolatile(), 
+                       L->isNonTemporal(), false, L->getAlignment());
+    // Legalized the chain result - switch anything that used the old chain to
+    // use the new one.
+    ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+    return NewL;
+  }
+
+  // Do a non-extending load followed by FP_EXTEND.
+  NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD,
+                     L->getMemoryVT(), dl, L->getChain(),
+                     L->getBasePtr(), L->getOffset(), L->getPointerInfo(),
+                     L->getMemoryVT(), L->isVolatile(),
+                     L->isNonTemporal(), false, L->getAlignment());
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+  return BitConvertToInteger(DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
+  SDValue LHS = GetSoftenedFloat(N->getOperand(1));
+  SDValue RHS = GetSoftenedFloat(N->getOperand(2));
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0),LHS,RHS);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) {
+  SDValue LHS = GetSoftenedFloat(N->getOperand(2));
+  SDValue RHS = GetSoftenedFloat(N->getOperand(3));
+  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0),
+                     N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_UNDEF(SDNode *N) {
+  return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+                                               N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) {
+  SDValue Chain = N->getOperand(0); // Get the chain.
+  SDValue Ptr = N->getOperand(1); // Get the pointer.
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue NewVAARG;
+  NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2),
+                          N->getConstantOperandVal(3));
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1));
+  return NewVAARG;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
+  bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
+  EVT SVT = N->getOperand(0).getValueType();
+  EVT RVT = N->getValueType(0);
+  EVT NVT = EVT();
+  DebugLoc dl = N->getDebugLoc();
+
+  // If the input is not legal, eg: i1 -> fp, then it needs to be promoted to
+  // a larger type, eg: i8 -> fp.  Even if it is legal, no libcall may exactly
+  // match.  Look for an appropriate libcall.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  for (unsigned t = MVT::FIRST_INTEGER_VALUETYPE;
+       t <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; ++t) {
+    NVT = (MVT::SimpleValueType)t;
+    // The source needs to big enough to hold the operand.
+    if (NVT.bitsGE(SVT))
+      LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT):RTLIB::getUINTTOFP (NVT, RVT);
+  }
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+  // Sign/zero extend the argument if the libcall takes a larger type.
+  SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+                           NVT, N->getOperand(0));
+  return TLI.makeLibCall(DAG, LC,
+                         TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
+                         &Op, 1, false, dl);
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Operand Float to Integer Conversion..
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Soften float operand " << OpNo << ": "; N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "SoftenFloatOperand Op #" << OpNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to soften this operator's operand!");
+
+  case ISD::BITCAST:     Res = SoftenFloatOp_BITCAST(N); break;
+  case ISD::BR_CC:       Res = SoftenFloatOp_BR_CC(N); break;
+  case ISD::FP_ROUND:    Res = SoftenFloatOp_FP_ROUND(N); break;
+  case ISD::FP_TO_SINT:  Res = SoftenFloatOp_FP_TO_SINT(N); break;
+  case ISD::FP_TO_UINT:  Res = SoftenFloatOp_FP_TO_UINT(N); break;
+  case ISD::FP32_TO_FP16:Res = SoftenFloatOp_FP32_TO_FP16(N); break;
+  case ISD::SELECT_CC:   Res = SoftenFloatOp_SELECT_CC(N); break;
+  case ISD::SETCC:       Res = SoftenFloatOp_SETCC(N); break;
+  case ISD::STORE:       Res = SoftenFloatOp_STORE(N, OpNo); break;
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) {
+  return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getValueType(0),
+                     GetSoftenedFloat(N->getOperand(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
+  EVT SVT = N->getOperand(0).getValueType();
+  EVT RVT = N->getValueType(0);
+
+  RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
+
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+
+  EVT VT = NewLHS.getValueType();
+  NewLHS = GetSoftenedFloat(NewLHS);
+  NewRHS = GetSoftenedFloat(NewRHS);
+  TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If softenSetCCOperands returned a scalar, we need to compare the result
+  // against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                DAG.getCondCode(CCCode), NewLHS, NewRHS,
+                                N->getOperand(4)),
+                 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP32_TO_FP16(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  RTLIB::Libcall LC = RTLIB::FPROUND_F32_F16;
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+  EVT VT = NewLHS.getValueType();
+  NewLHS = GetSoftenedFloat(NewLHS);
+  NewRHS = GetSoftenedFloat(NewRHS);
+  TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If softenSetCCOperands returned a scalar, we need to compare the result
+  // against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                N->getOperand(2), N->getOperand(3),
+                                DAG.getCondCode(CCCode)),
+                 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+  EVT VT = NewLHS.getValueType();
+  NewLHS = GetSoftenedFloat(NewLHS);
+  NewRHS = GetSoftenedFloat(NewRHS);
+  TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If softenSetCCOperands returned a scalar, use it.
+  if (NewRHS.getNode() == 0) {
+    assert(NewLHS.getValueType() == N->getValueType(0) &&
+           "Unexpected setcc expansion!");
+    return NewLHS;
+  }
+
+  // Otherwise, update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                DAG.getCondCode(CCCode)),
+                 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
+  assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+  assert(OpNo == 1 && "Can only soften the stored value!");
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  SDValue Val = ST->getValue();
+  DebugLoc dl = N->getDebugLoc();
+
+  if (ST->isTruncatingStore())
+    // Do an FP_ROUND followed by a non-truncating store.
+    Val = BitConvertToInteger(DAG.getNode(ISD::FP_ROUND, dl, ST->getMemoryVT(),
+                                          Val, DAG.getIntPtrConstant(0)));
+  else
+    Val = GetSoftenedFloat(Val);
+
+  return DAG.getStore(ST->getChain(), dl, Val, ST->getBasePtr(),
+                      ST->getPointerInfo(),
+                      ST->isVolatile(), ST->isNonTemporal(),
+                      ST->getAlignment());
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Float Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatResult - This method is called when the specified result of the
+/// specified node is found to need expansion.  At this point, the node may also
+/// have invalid operands or may have other results that need promotion, we just
+/// know that (at least) one result needs expansion.
+void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Expand float result: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Lo, Hi;
+  Lo = Hi = SDValue();
+
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true))
+    return;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "ExpandFloatResult #" << ResNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to expand the result of this operator!");
+
+  case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
+  case ISD::SELECT:       SplitRes_SELECT(N, Lo, Hi); break;
+  case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
+
+  case ISD::MERGE_VALUES:       ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+  case ISD::BITCAST:            ExpandRes_BITCAST(N, Lo, Hi); break;
+  case ISD::BUILD_PAIR:         ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+  case ISD::EXTRACT_ELEMENT:    ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+  case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+  case ISD::VAARG:              ExpandRes_VAARG(N, Lo, Hi); break;
+
+  case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
+  case ISD::FABS:       ExpandFloatRes_FABS(N, Lo, Hi); break;
+  case ISD::FADD:       ExpandFloatRes_FADD(N, Lo, Hi); break;
+  case ISD::FCEIL:      ExpandFloatRes_FCEIL(N, Lo, Hi); break;
+  case ISD::FCOPYSIGN:  ExpandFloatRes_FCOPYSIGN(N, Lo, Hi); break;
+  case ISD::FCOS:       ExpandFloatRes_FCOS(N, Lo, Hi); break;
+  case ISD::FDIV:       ExpandFloatRes_FDIV(N, Lo, Hi); break;
+  case ISD::FEXP:       ExpandFloatRes_FEXP(N, Lo, Hi); break;
+  case ISD::FEXP2:      ExpandFloatRes_FEXP2(N, Lo, Hi); break;
+  case ISD::FFLOOR:     ExpandFloatRes_FFLOOR(N, Lo, Hi); break;
+  case ISD::FLOG:       ExpandFloatRes_FLOG(N, Lo, Hi); break;
+  case ISD::FLOG2:      ExpandFloatRes_FLOG2(N, Lo, Hi); break;
+  case ISD::FLOG10:     ExpandFloatRes_FLOG10(N, Lo, Hi); break;
+  case ISD::FMA:        ExpandFloatRes_FMA(N, Lo, Hi); break;
+  case ISD::FMUL:       ExpandFloatRes_FMUL(N, Lo, Hi); break;
+  case ISD::FNEARBYINT: ExpandFloatRes_FNEARBYINT(N, Lo, Hi); break;
+  case ISD::FNEG:       ExpandFloatRes_FNEG(N, Lo, Hi); break;
+  case ISD::FP_EXTEND:  ExpandFloatRes_FP_EXTEND(N, Lo, Hi); break;
+  case ISD::FPOW:       ExpandFloatRes_FPOW(N, Lo, Hi); break;
+  case ISD::FPOWI:      ExpandFloatRes_FPOWI(N, Lo, Hi); break;
+  case ISD::FRINT:      ExpandFloatRes_FRINT(N, Lo, Hi); break;
+  case ISD::FSIN:       ExpandFloatRes_FSIN(N, Lo, Hi); break;
+  case ISD::FSQRT:      ExpandFloatRes_FSQRT(N, Lo, Hi); break;
+  case ISD::FSUB:       ExpandFloatRes_FSUB(N, Lo, Hi); break;
+  case ISD::FTRUNC:     ExpandFloatRes_FTRUNC(N, Lo, Hi); break;
+  case ISD::LOAD:       ExpandFloatRes_LOAD(N, Lo, Hi); break;
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break;
+  case ISD::FREM:       ExpandFloatRes_FREM(N, Lo, Hi); break;
+  }
+
+  // If Lo/Hi is null, the sub-method took care of registering results etc.
+  if (Lo.getNode())
+    SetExpandedFloat(SDValue(N, ResNo), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  assert(NVT.getSizeInBits() == integerPartWidth &&
+         "Do not know how to expand this float constant!");
+  APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();
+  Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+                                 APInt(integerPartWidth, C.getRawData()[1])),
+                         NVT);
+  Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+                                 APInt(integerPartWidth, C.getRawData()[0])),
+                         NVT);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  assert(N->getValueType(0) == MVT::ppcf128 &&
+         "Logic only correct for ppcf128!");
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Tmp;
+  GetExpandedFloat(N->getOperand(0), Lo, Tmp);
+  Hi = DAG.getNode(ISD::FABS, dl, Tmp.getValueType(), Tmp);
+  // Lo = Hi==fabs(Hi) ? Lo : -Lo;
+  Lo = DAG.getNode(ISD::SELECT_CC, dl, Lo.getValueType(), Tmp, Hi, Lo,
+                   DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo),
+                   DAG.getCondCode(ISD::SETEQ));
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::ADD_F32, RTLIB::ADD_F64,
+                                         RTLIB::ADD_F80, RTLIB::ADD_F128,
+                                         RTLIB::ADD_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCEIL(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+                                         RTLIB::CEIL_F80, RTLIB::CEIL_F128,
+                                         RTLIB::CEIL_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCOPYSIGN(SDNode *N,
+                                                SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::COPYSIGN_F32,
+                                         RTLIB::COPYSIGN_F64,
+                                         RTLIB::COPYSIGN_F80,
+                                         RTLIB::COPYSIGN_F128,
+                                         RTLIB::COPYSIGN_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::COS_F32, RTLIB::COS_F64,
+                                         RTLIB::COS_F80, RTLIB::COS_F128,
+                                         RTLIB::COS_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                                   RTLIB::DIV_F32,
+                                                   RTLIB::DIV_F64,
+                                                   RTLIB::DIV_F80,
+                                                   RTLIB::DIV_F128,
+                                                   RTLIB::DIV_PPCF128),
+                                 N->getValueType(0), Ops, 2, false,
+                                 N->getDebugLoc());
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::EXP_F32, RTLIB::EXP_F64,
+                                         RTLIB::EXP_F80, RTLIB::EXP_F128,
+                                         RTLIB::EXP_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP2(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+                                         RTLIB::EXP2_F80, RTLIB::EXP2_F128,
+                                         RTLIB::EXP2_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FFLOOR(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+                                         RTLIB::FLOOR_F80, RTLIB::FLOOR_F128,
+                                         RTLIB::FLOOR_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::LOG_F32, RTLIB::LOG_F64,
+                                         RTLIB::LOG_F80, RTLIB::LOG_F128,
+                                         RTLIB::LOG_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG2(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+                                         RTLIB::LOG2_F80, RTLIB::LOG2_F128,
+                                         RTLIB::LOG2_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+                                         RTLIB::LOG10_F80, RTLIB::LOG10_F128,
+                                         RTLIB::LOG10_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo,
+                                          SDValue &Hi) {
+  SDValue Ops[3] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+  SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                                   RTLIB::FMA_F32,
+                                                   RTLIB::FMA_F64,
+                                                   RTLIB::FMA_F80,
+                                                   RTLIB::FMA_F128,
+                                                   RTLIB::FMA_PPCF128),
+                                 N->getValueType(0), Ops, 3, false,
+                                 N->getDebugLoc());
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                                   RTLIB::MUL_F32,
+                                                   RTLIB::MUL_F64,
+                                                   RTLIB::MUL_F80,
+                                                   RTLIB::MUL_F128,
+                                                   RTLIB::MUL_PPCF128),
+                                 N->getValueType(0), Ops, 2, false,
+                                 N->getDebugLoc());
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEARBYINT(SDNode *N,
+                                                 SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::NEARBYINT_F32,
+                                         RTLIB::NEARBYINT_F64,
+                                         RTLIB::NEARBYINT_F80,
+                                         RTLIB::NEARBYINT_F128,
+                                         RTLIB::NEARBYINT_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedFloat(N->getOperand(0), Lo, Hi);
+  Lo = DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo);
+  Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
+                                                SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  Hi = DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), NVT, N->getOperand(0));
+  Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+                                 APInt(NVT.getSizeInBits(), 0)), NVT);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOW(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::POW_F32, RTLIB::POW_F64,
+                                         RTLIB::POW_F80, RTLIB::POW_F128,
+                                         RTLIB::POW_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::POWI_F32, RTLIB::POWI_F64,
+                                         RTLIB::POWI_F80, RTLIB::POWI_F128,
+                                         RTLIB::POWI_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FREM(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::REM_F32, RTLIB::REM_F64,
+                                         RTLIB::REM_F80, RTLIB::REM_F128,
+                                         RTLIB::REM_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::RINT_F32, RTLIB::RINT_F64,
+                                         RTLIB::RINT_F80, RTLIB::RINT_F128,
+                                         RTLIB::RINT_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::SIN_F32, RTLIB::SIN_F64,
+                                         RTLIB::SIN_F80, RTLIB::SIN_F128,
+                                         RTLIB::SIN_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+                                         RTLIB::SQRT_F80, RTLIB::SQRT_F128,
+                                         RTLIB::SQRT_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                                   RTLIB::SUB_F32,
+                                                   RTLIB::SUB_F64,
+                                                   RTLIB::SUB_F80,
+                                                   RTLIB::SUB_F128,
+                                                   RTLIB::SUB_PPCF128),
+                                 N->getValueType(0), Ops, 2, false,
+                                 N->getDebugLoc());
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+                                         RTLIB::TRUNC_F80, RTLIB::TRUNC_F128,
+                                         RTLIB::TRUNC_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  if (ISD::isNormalLoad(N)) {
+    ExpandRes_NormalLoad(N, Lo, Hi);
+    return;
+  }
+
+  assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  SDValue Chain = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+  assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+
+  Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr,
+                      LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(),
+                      LD->isNonTemporal(), LD->getAlignment());
+
+  // Remember the chain.
+  Chain = Hi.getValue(1);
+
+  // The low part is zero.
+  Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+                                 APInt(NVT.getSizeInBits(), 0)), NVT);
+
+  // Modified the chain - switch anything that used the old chain to use the
+  // new one.
+  ReplaceValueWith(SDValue(LD, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  assert(N->getValueType(0) == MVT::ppcf128 && "Unsupported XINT_TO_FP!");
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  bool isSigned = N->getOpcode() == ISD::SINT_TO_FP;
+  DebugLoc dl = N->getDebugLoc();
+
+  // First do an SINT_TO_FP, whether the original was signed or unsigned.
+  // When promoting partial word types to i32 we must honor the signedness,
+  // though.
+  if (SrcVT.bitsLE(MVT::i32)) {
+    // The integer can be represented exactly in an f64.
+    Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+                      MVT::i32, Src);
+    Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+                                   APInt(NVT.getSizeInBits(), 0)), NVT);
+    Hi = DAG.getNode(ISD::SINT_TO_FP, dl, NVT, Src);
+  } else {
+    RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+    if (SrcVT.bitsLE(MVT::i64)) {
+      Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+                        MVT::i64, Src);
+      LC = RTLIB::SINTTOFP_I64_PPCF128;
+    } else if (SrcVT.bitsLE(MVT::i128)) {
+      Src = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i128, Src);
+      LC = RTLIB::SINTTOFP_I128_PPCF128;
+    }
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+    Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl);
+    GetPairElements(Hi, Lo, Hi);
+  }
+
+  if (isSigned)
+    return;
+
+  // Unsigned - fix up the SINT_TO_FP value just calculated.
+  Hi = DAG.getNode(ISD::BUILD_PAIR, dl, VT, Lo, Hi);
+  SrcVT = Src.getValueType();
+
+  // x>=0 ? (ppcf128)(iN)x : (ppcf128)(iN)x + 2^N; N=32,64,128.
+  static const uint64_t TwoE32[]  = { 0x41f0000000000000LL, 0 };
+  static const uint64_t TwoE64[]  = { 0x43f0000000000000LL, 0 };
+  static const uint64_t TwoE128[] = { 0x47f0000000000000LL, 0 };
+  ArrayRef<uint64_t> Parts;
+
+  switch (SrcVT.getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("Unsupported UINT_TO_FP!");
+  case MVT::i32:
+    Parts = TwoE32;
+    break;
+  case MVT::i64:
+    Parts = TwoE64;
+    break;
+  case MVT::i128:
+    Parts = TwoE128;
+    break;
+  }
+
+  Lo = DAG.getNode(ISD::FADD, dl, VT, Hi,
+                   DAG.getConstantFP(APFloat(APFloat::PPCDoubleDouble,
+                                             APInt(128, Parts)),
+                                     MVT::ppcf128));
+  Lo = DAG.getNode(ISD::SELECT_CC, dl, VT, Src, DAG.getConstant(0, SrcVT),
+                   Lo, Hi, DAG.getCondCode(ISD::SETLT));
+  GetPairElements(Lo, Lo, Hi);
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Float Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatOperand - This method is called when the specified operand of the
+/// specified node is found to need expansion.  At this point, all of the result
+/// types of the node are known to be legal, but other operands of the node may
+/// need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Expand float operand: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+    return false;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "ExpandFloatOperand Op #" << OpNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to expand this operator's operand!");
+
+  case ISD::BITCAST:         Res = ExpandOp_BITCAST(N); break;
+  case ISD::BUILD_VECTOR:    Res = ExpandOp_BUILD_VECTOR(N); break;
+  case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+
+  case ISD::BR_CC:      Res = ExpandFloatOp_BR_CC(N); break;
+  case ISD::FP_ROUND:   Res = ExpandFloatOp_FP_ROUND(N); break;
+  case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break;
+  case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break;
+  case ISD::SELECT_CC:  Res = ExpandFloatOp_SELECT_CC(N); break;
+  case ISD::SETCC:      Res = ExpandFloatOp_SETCC(N); break;
+  case ISD::STORE:      Res = ExpandFloatOp_STORE(cast<StoreSDNode>(N),
+                                                  OpNo); break;
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+/// FloatExpandSetCCOperands - Expand the operands of a comparison.  This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
+                                                SDValue &NewRHS,
+                                                ISD::CondCode &CCCode,
+                                                DebugLoc dl) {
+  SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+  GetExpandedFloat(NewLHS, LHSLo, LHSHi);
+  GetExpandedFloat(NewRHS, RHSLo, RHSHi);
+
+  assert(NewLHS.getValueType() == MVT::ppcf128 && "Unsupported setcc type!");
+
+  // FIXME:  This generated code sucks.  We want to generate
+  //         FCMPU crN, hi1, hi2
+  //         BNE crN, L:
+  //         FCMPU crN, lo1, lo2
+  // The following can be improved, but not that much.
+  SDValue Tmp1, Tmp2, Tmp3;
+  Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+                      LHSHi, RHSHi, ISD::SETOEQ);
+  Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSLo.getValueType()),
+                      LHSLo, RHSLo, CCCode);
+  Tmp3 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+  Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+                      LHSHi, RHSHi, ISD::SETUNE);
+  Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+                      LHSHi, RHSHi, CCCode);
+  Tmp1 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+  NewLHS = DAG.getNode(ISD::OR, dl, Tmp1.getValueType(), Tmp1, Tmp3);
+  NewRHS = SDValue();   // LHS is the result, not a compare.
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If ExpandSetCCOperands returned a scalar, we need to compare the result
+  // against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                DAG.getCondCode(CCCode), NewLHS, NewRHS,
+                                N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) {
+  assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+         "Logic only correct for ppcf128!");
+  SDValue Lo, Hi;
+  GetExpandedFloat(N->getOperand(0), Lo, Hi);
+  // Round it the rest of the way (e.g. to f32) if needed.
+  return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(),
+                     N->getValueType(0), Hi, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+  // PPC (the libcall is not available).  FIXME: Do this in a less hacky way.
+  if (RVT == MVT::i32) {
+    assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+           "Logic only correct for ppcf128!");
+    SDValue Res = DAG.getNode(ISD::FP_ROUND_INREG, dl, MVT::ppcf128,
+                              N->getOperand(0), DAG.getValueType(MVT::f64));
+    Res = DAG.getNode(ISD::FP_ROUND, dl, MVT::f64, Res,
+                      DAG.getIntPtrConstant(1));
+    return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
+  }
+
+  RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
+  return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
+  EVT RVT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+  // PPC (the libcall is not available).  FIXME: Do this in a less hacky way.
+  if (RVT == MVT::i32) {
+    assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+           "Logic only correct for ppcf128!");
+    const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
+    APFloat APF = APFloat(APFloat::PPCDoubleDouble, APInt(128, TwoE31));
+    SDValue Tmp = DAG.getConstantFP(APF, MVT::ppcf128);
+    //  X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
+    // FIXME: generated code sucks.
+    return DAG.getNode(ISD::SELECT_CC, dl, MVT::i32, N->getOperand(0), Tmp,
+                       DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                   DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
+                                               DAG.getNode(ISD::FSUB, dl,
+                                                           MVT::ppcf128,
+                                                           N->getOperand(0),
+                                                           Tmp)),
+                                   DAG.getConstant(0x80000000, MVT::i32)),
+                       DAG.getNode(ISD::FP_TO_SINT, dl,
+                                   MVT::i32, N->getOperand(0)),
+                       DAG.getCondCode(ISD::SETGE));
+  }
+
+  RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
+  return TLI.makeLibCall(DAG, LC, N->getValueType(0), &N->getOperand(0), 1,
+                         false, dl);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If ExpandSetCCOperands returned a scalar, we need to compare the result
+  // against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                N->getOperand(2), N->getOperand(3),
+                                DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If ExpandSetCCOperands returned a scalar, use it.
+  if (NewRHS.getNode() == 0) {
+    assert(NewLHS.getValueType() == N->getValueType(0) &&
+           "Unexpected setcc expansion!");
+    return NewLHS;
+  }
+
+  // Otherwise, update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
+  if (ISD::isNormalStore(N))
+    return ExpandOp_NormalStore(N, OpNo);
+
+  assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+  assert(OpNo == 1 && "Can only expand the stored value so far");
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+
+  SDValue Chain = ST->getChain();
+  SDValue Ptr = ST->getBasePtr();
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(),
+                                     ST->getValue().getValueType());
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+  assert(ST->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+  (void)NVT;
+
+  SDValue Lo, Hi;
+  GetExpandedOp(ST->getValue(), Lo, Hi);
+
+  return DAG.getTruncStore(Chain, N->getDebugLoc(), Hi, Ptr,
+                           ST->getPointerInfo(),
+                           ST->getMemoryVT(), ST->isVolatile(),
+                           ST->isNonTemporal(), ST->getAlignment());
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
new file mode 100644
index 000000000000..d19c13b8ff13
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -0,0 +1,3042 @@
+//===----- LegalizeIntegerTypes.cpp - Legalization of integer types -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements integer type expansion and promotion for LegalizeTypes.
+// Promotion is the act of changing a computation in an illegal type into a
+// computation in a larger type.  For example, implementing i8 arithmetic in an
+// i32 register (often needed on powerpc).
+// Expansion is the act of changing a computation in an illegal type into a
+// computation in two identical registers of a smaller type.  For example,
+// implementing i64 arithmetic in two i32 registers (often needed on 32-bit
+// targets).
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  Integer Result Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerResult - This method is called when a result of a node is
+/// found to be in need of promotion to a larger type.  At this point, the node
+/// may also have invalid operands or may have other results that need
+/// expansion, we just know that (at least) one result needs promotion.
+void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Promote integer result: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true))
+    return;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "PromoteIntegerResult #" << ResNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to promote this operator!");
+  case ISD::MERGE_VALUES:Res = PromoteIntRes_MERGE_VALUES(N, ResNo); break;
+  case ISD::AssertSext:  Res = PromoteIntRes_AssertSext(N); break;
+  case ISD::AssertZext:  Res = PromoteIntRes_AssertZext(N); break;
+  case ISD::BITCAST:     Res = PromoteIntRes_BITCAST(N); break;
+  case ISD::BSWAP:       Res = PromoteIntRes_BSWAP(N); break;
+  case ISD::BUILD_PAIR:  Res = PromoteIntRes_BUILD_PAIR(N); break;
+  case ISD::Constant:    Res = PromoteIntRes_Constant(N); break;
+  case ISD::CONVERT_RNDSAT:
+                         Res = PromoteIntRes_CONVERT_RNDSAT(N); break;
+  case ISD::CTLZ_ZERO_UNDEF:
+  case ISD::CTLZ:        Res = PromoteIntRes_CTLZ(N); break;
+  case ISD::CTPOP:       Res = PromoteIntRes_CTPOP(N); break;
+  case ISD::CTTZ_ZERO_UNDEF:
+  case ISD::CTTZ:        Res = PromoteIntRes_CTTZ(N); break;
+  case ISD::EXTRACT_VECTOR_ELT:
+                         Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
+  case ISD::LOAD:        Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break;
+  case ISD::SELECT:      Res = PromoteIntRes_SELECT(N); break;
+  case ISD::VSELECT:     Res = PromoteIntRes_VSELECT(N); break;
+  case ISD::SELECT_CC:   Res = PromoteIntRes_SELECT_CC(N); break;
+  case ISD::SETCC:       Res = PromoteIntRes_SETCC(N); break;
+  case ISD::SHL:         Res = PromoteIntRes_SHL(N); break;
+  case ISD::SIGN_EXTEND_INREG:
+                         Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
+  case ISD::SRA:         Res = PromoteIntRes_SRA(N); break;
+  case ISD::SRL:         Res = PromoteIntRes_SRL(N); break;
+  case ISD::TRUNCATE:    Res = PromoteIntRes_TRUNCATE(N); break;
+  case ISD::UNDEF:       Res = PromoteIntRes_UNDEF(N); break;
+  case ISD::VAARG:       Res = PromoteIntRes_VAARG(N); break;
+
+  case ISD::EXTRACT_SUBVECTOR:
+                         Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::VECTOR_SHUFFLE:
+                         Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
+  case ISD::INSERT_VECTOR_ELT:
+                         Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::BUILD_VECTOR:
+                         Res = PromoteIntRes_BUILD_VECTOR(N); break;
+  case ISD::SCALAR_TO_VECTOR:
+                         Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break;
+  case ISD::CONCAT_VECTORS:
+                         Res = PromoteIntRes_CONCAT_VECTORS(N); break;
+
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:  Res = PromoteIntRes_INT_EXTEND(N); break;
+
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:  Res = PromoteIntRes_FP_TO_XINT(N); break;
+
+  case ISD::FP32_TO_FP16:Res = PromoteIntRes_FP32_TO_FP16(N); break;
+
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:         Res = PromoteIntRes_SimpleIntBinOp(N); break;
+
+  case ISD::SDIV:
+  case ISD::SREM:        Res = PromoteIntRes_SDIV(N); break;
+
+  case ISD::UDIV:
+  case ISD::UREM:        Res = PromoteIntRes_UDIV(N); break;
+
+  case ISD::SADDO:
+  case ISD::SSUBO:       Res = PromoteIntRes_SADDSUBO(N, ResNo); break;
+  case ISD::UADDO:
+  case ISD::USUBO:       Res = PromoteIntRes_UADDSUBO(N, ResNo); break;
+  case ISD::SMULO:
+  case ISD::UMULO:       Res = PromoteIntRes_XMULO(N, ResNo); break;
+
+  case ISD::ATOMIC_LOAD:
+    Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
+
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_SWAP:
+    Res = PromoteIntRes_Atomic1(cast<AtomicSDNode>(N)); break;
+
+  case ISD::ATOMIC_CMP_SWAP:
+    Res = PromoteIntRes_Atomic2(cast<AtomicSDNode>(N)); break;
+  }
+
+  // If the result is null then the sub-method took care of registering it.
+  if (Res.getNode())
+    SetPromotedInteger(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
+                                                     unsigned ResNo) {
+  SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+  return GetPromotedInteger(Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) {
+  // Sign-extend the new bits, and continue the assertion.
+  SDValue Op = SExtPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::AssertSext, N->getDebugLoc(),
+                     Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) {
+  // Zero the new bits, and continue the assertion.
+  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::AssertZext, N->getDebugLoc(),
+                     Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
+  EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+                              N->getMemoryVT(), ResVT,
+                              N->getChain(), N->getBasePtr(),
+                              N->getMemOperand(), N->getOrdering(),
+                              N->getSynchScope());
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) {
+  SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+  SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+                              N->getMemoryVT(),
+                              N->getChain(), N->getBasePtr(),
+                              Op2, N->getMemOperand(), N->getOrdering(),
+                              N->getSynchScope());
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic2(AtomicSDNode *N) {
+  SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+  SDValue Op3 = GetPromotedInteger(N->getOperand(3));
+  SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+                              N->getMemoryVT(), N->getChain(), N->getBasePtr(),
+                              Op2, Op3, N->getMemOperand(), N->getOrdering(),
+                              N->getSynchScope());
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
+  EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  DebugLoc dl = N->getDebugLoc();
+
+  switch (getTypeAction(InVT)) {
+  case TargetLowering::TypeLegal:
+    break;
+  case TargetLowering::TypePromoteInteger:
+    if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector() && !NInVT.isVector())
+      // The input promotes to the same size.  Convert the promoted value.
+      return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetPromotedInteger(InOp));
+    break;
+  case TargetLowering::TypeSoftenFloat:
+    // Promote the integer operand by hand.
+    return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
+  case TargetLowering::TypeExpandInteger:
+  case TargetLowering::TypeExpandFloat:
+    break;
+  case TargetLowering::TypeScalarizeVector:
+    // Convert the element to an integer and promote it by hand.
+    if (!NOutVT.isVector())
+      return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+                         BitConvertToInteger(GetScalarizedVector(InOp)));
+    break;
+  case TargetLowering::TypeSplitVector: {
+    // For example, i32 = BITCAST v2i16 on alpha.  Convert the split
+    // pieces of the input into integers and reassemble in the final type.
+    SDValue Lo, Hi;
+    GetSplitVector(N->getOperand(0), Lo, Hi);
+    Lo = BitConvertToInteger(Lo);
+    Hi = BitConvertToInteger(Hi);
+
+    if (TLI.isBigEndian())
+      std::swap(Lo, Hi);
+
+    InOp = DAG.getNode(ISD::ANY_EXTEND, dl,
+                       EVT::getIntegerVT(*DAG.getContext(),
+                                         NOutVT.getSizeInBits()),
+                       JoinIntegers(Lo, Hi));
+    return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp);
+  }
+  case TargetLowering::TypeWidenVector:
+    // The input is widened to the same size. Convert to the widened value.
+    // Make sure that the outgoing value is not a vector, because this would
+    // make us bitcast between two vectors which are legalized in different ways.
+    if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector())
+      return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetWidenedVector(InOp));
+  }
+
+  return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+                     CreateStackStoreLoad(InOp, OutVT));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  EVT OVT = N->getValueType(0);
+  EVT NVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
+  return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
+                     DAG.getConstant(DiffBits, TLI.getPointerTy()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
+  // The pair element type may be legal, or may not promote to the same type as
+  // the result, for example i14 = BUILD_PAIR (i7, i7).  Handle all cases.
+  return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(),
+                     TLI.getTypeToTransformTo(*DAG.getContext(),
+                     N->getValueType(0)), JoinIntegers(N->getOperand(0),
+                     N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  // FIXME there is no actual debug info here
+  DebugLoc dl = N->getDebugLoc();
+  // Zero extend things like i1, sign extend everything else.  It shouldn't
+  // matter in theory which one we pick, but this tends to give better code?
+  unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+  SDValue Result = DAG.getNode(Opc, dl,
+                               TLI.getTypeToTransformTo(*DAG.getContext(), VT),
+                               SDValue(N, 0));
+  assert(isa<ConstantSDNode>(Result) && "Didn't constant fold ext?");
+  return Result;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) {
+  ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+  assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU ||
+           CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU ||
+           CvtCode == ISD::CVT_SF || CvtCode == ISD::CVT_UF) &&
+          "can only promote integers");
+  EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  return DAG.getConvertRndSat(OutVT, N->getDebugLoc(), N->getOperand(0),
+                              N->getOperand(1), N->getOperand(2),
+                              N->getOperand(3), N->getOperand(4), CvtCode);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
+  // Zero extend to the promoted type and do the count there.
+  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+  DebugLoc dl = N->getDebugLoc();
+  EVT OVT = N->getValueType(0);
+  EVT NVT = Op.getValueType();
+  Op = DAG.getNode(N->getOpcode(), dl, NVT, Op);
+  // Subtract off the extra leading bits in the bigger type.
+  return DAG.getNode(ISD::SUB, dl, NVT, Op,
+                     DAG.getConstant(NVT.getSizeInBits() -
+                                     OVT.getSizeInBits(), NVT));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) {
+  // Zero extend to the promoted type and do the count there.
+  SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::CTPOP, N->getDebugLoc(), Op.getValueType(), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  EVT OVT = N->getValueType(0);
+  EVT NVT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+  if (N->getOpcode() == ISD::CTTZ) {
+    // The count is the same in the promoted type except if the original
+    // value was zero.  This can be handled by setting the bit just off
+    // the top of the original type.
+    APInt TopBit(NVT.getSizeInBits(), 0);
+    TopBit.setBit(OVT.getSizeInBits());
+    Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, NVT));
+  }
+  return DAG.getNode(N->getOpcode(), dl, NVT, Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NVT, N->getOperand(0),
+                     N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned NewOpc = N->getOpcode();
+  DebugLoc dl = N->getDebugLoc();
+
+  // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is
+  // not Legal, check to see if we can use FP_TO_SINT instead.  (If both UINT
+  // and SINT conversions are Custom, there is no way to tell which is
+  // preferable. We choose SINT because that's the right thing on PPC.)
+  if (N->getOpcode() == ISD::FP_TO_UINT &&
+      !TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) &&
+      TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
+    NewOpc = ISD::FP_TO_SINT;
+
+  SDValue Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0));
+
+  // Assert that the converted value fits in the original type.  If it doesn't
+  // (eg: because the value being converted is too big), then the result of the
+  // original operation was undefined anyway, so the assert is still correct.
+  return DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ?
+                     ISD::AssertZext : ISD::AssertSext, dl, NVT, Res,
+                     DAG.getValueType(N->getValueType(0).getScalarType()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP32_TO_FP16(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+
+  return DAG.getNode(ISD::AssertZext, dl,
+                     NVT, Res, DAG.getValueType(N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+
+  if (getTypeAction(N->getOperand(0).getValueType())
+      == TargetLowering::TypePromoteInteger) {
+    SDValue Res = GetPromotedInteger(N->getOperand(0));
+    assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!");
+
+    // If the result and operand types are the same after promotion, simplify
+    // to an in-register extension.
+    if (NVT == Res.getValueType()) {
+      // The high bits are not guaranteed to be anything.  Insert an extend.
+      if (N->getOpcode() == ISD::SIGN_EXTEND)
+        return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+                           DAG.getValueType(N->getOperand(0).getValueType()));
+      if (N->getOpcode() == ISD::ZERO_EXTEND)
+        return DAG.getZeroExtendInReg(Res, dl,
+                      N->getOperand(0).getValueType().getScalarType());
+      assert(N->getOpcode() == ISD::ANY_EXTEND && "Unknown integer extension!");
+      return Res;
+    }
+  }
+
+  // Otherwise, just extend the original operand all the way to the larger type.
+  return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
+  assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  ISD::LoadExtType ExtType =
+    ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType();
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
+                               N->getPointerInfo(),
+                               N->getMemoryVT(), N->isVolatile(),
+                               N->isNonTemporal(), N->getAlignment());
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
+/// Promote the overflow flag of an overflowing arithmetic node.
+SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
+  // Simply change the return type of the boolean result.
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
+  EVT ValueVTs[] = { N->getValueType(0), NVT };
+  SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
+  SDValue Res = DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                            DAG.getVTList(ValueVTs, 2), Ops, 2);
+
+  // Modified the sum result - switch anything that used the old sum to use
+  // the new one.
+  ReplaceValueWith(SDValue(N, 0), Res);
+
+  return SDValue(Res.getNode(), 1);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
+  if (ResNo == 1)
+    return PromoteIntRes_Overflow(N);
+
+  // The operation overflowed iff the result in the larger type is not the
+  // sign extension of its truncation to the original type.
+  SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+  SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+  EVT OVT = N->getOperand(0).getValueType();
+  EVT NVT = LHS.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Do the arithmetic in the larger type.
+  unsigned Opcode = N->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB;
+  SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+  // Calculate the overflow flag: sign extend the arithmetic result from
+  // the original type.
+  SDValue Ofl = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+                            DAG.getValueType(OVT));
+  // Overflowed if and only if this is not equal to Res.
+  Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+  // Use the calculated overflow everywhere.
+  ReplaceValueWith(SDValue(N, 1), Ofl);
+
+  return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SDIV(SDNode *N) {
+  // Sign extend the input.
+  SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+  SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) {
+  SDValue LHS = GetPromotedInteger(N->getOperand(1));
+  SDValue RHS = GetPromotedInteger(N->getOperand(2));
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0),LHS,RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) {
+  SDValue Mask = N->getOperand(0);
+  EVT OpTy = N->getOperand(1).getValueType();
+
+  // Promote all the way up to the canonical SetCC type.
+  Mask = PromoteTargetBoolean(Mask, TLI.getSetCCResultType(OpTy));
+  SDValue LHS = GetPromotedInteger(N->getOperand(1));
+  SDValue RHS = GetPromotedInteger(N->getOperand(2));
+  return DAG.getNode(ISD::VSELECT, N->getDebugLoc(),
+                     LHS.getValueType(), Mask, LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
+  SDValue LHS = GetPromotedInteger(N->getOperand(2));
+  SDValue RHS = GetPromotedInteger(N->getOperand(3));
+  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0),
+                     N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
+  EVT SVT = TLI.getSetCCResultType(N->getOperand(0).getValueType());
+
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+
+  // Only use the result of getSetCCResultType if it is legal,
+  // otherwise just use the promoted result type (NVT).
+  if (!TLI.isTypeLegal(SVT))
+    SVT = NVT;
+
+  DebugLoc dl = N->getDebugLoc();
+  assert(SVT.isVector() == N->getOperand(0).getValueType().isVector() &&
+         "Vector compare must return a vector result!");
+
+  // Get the SETCC result using the canonical SETCC type.
+  SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, N->getOperand(0),
+                              N->getOperand(1), N->getOperand(2));
+
+  assert(NVT.bitsLE(SVT) && "Integer type overpromoted?");
+  // Convert to the expected type.
+  return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
+  SDValue Res = GetPromotedInteger(N->getOperand(0));
+  SDValue Amt = N->getOperand(1);
+  Amt = Amt.getValueType().isVector() ? ZExtPromotedInteger(Amt) : Amt;
+  return DAG.getNode(ISD::SHL, N->getDebugLoc(), Res.getValueType(), Res, Amt);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(),
+                     Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
+  // The input may have strange things in the top bits of the registers, but
+  // these operations don't care.  They may have weird bits going out, but
+  // that too is okay if they are integer operations.
+  SDValue LHS = GetPromotedInteger(N->getOperand(0));
+  SDValue RHS = GetPromotedInteger(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
+  // The input value must be properly sign extended.
+  SDValue Res = SExtPromotedInteger(N->getOperand(0));
+  SDValue Amt = N->getOperand(1);
+  Amt = Amt.getValueType().isVector() ? ZExtPromotedInteger(Amt) : Amt;
+  return DAG.getNode(ISD::SRA, N->getDebugLoc(), Res.getValueType(), Res, Amt);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
+  // The input value must be properly zero extended.
+  SDValue Res = ZExtPromotedInteger(N->getOperand(0));
+  SDValue Amt = N->getOperand(1);
+  Amt = Amt.getValueType().isVector() ? ZExtPromotedInteger(Amt) : Amt;
+  return DAG.getNode(ISD::SRL, N->getDebugLoc(), Res.getValueType(), Res, Amt);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Res;
+  SDValue InOp = N->getOperand(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  switch (getTypeAction(InOp.getValueType())) {
+  default: llvm_unreachable("Unknown type action!");
+  case TargetLowering::TypeLegal:
+  case TargetLowering::TypeExpandInteger:
+    Res = InOp;
+    break;
+  case TargetLowering::TypePromoteInteger:
+    Res = GetPromotedInteger(InOp);
+    break;
+  case TargetLowering::TypeSplitVector:
+    EVT InVT = InOp.getValueType();
+    assert(InVT.isVector() && "Cannot split scalar types");
+    unsigned NumElts = InVT.getVectorNumElements();
+    assert(NumElts == NVT.getVectorNumElements() &&
+           "Dst and Src must have the same number of elements");
+    assert(isPowerOf2_32(NumElts) &&
+           "Promoted vector type must be a power of two");
+
+    SDValue EOp1, EOp2;
+    GetSplitVector(InOp, EOp1, EOp2);
+
+    EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(),
+                                   NumElts/2);
+    EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1);
+    EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2);
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2);
+  }
+
+  // Truncate to NVT instead of VT
+  return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) {
+  if (ResNo == 1)
+    return PromoteIntRes_Overflow(N);
+
+  // The operation overflowed iff the result in the larger type is not the
+  // zero extension of its truncation to the original type.
+  SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+  SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+  EVT OVT = N->getOperand(0).getValueType();
+  EVT NVT = LHS.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Do the arithmetic in the larger type.
+  unsigned Opcode = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB;
+  SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+  // Calculate the overflow flag: zero extend the arithmetic result from
+  // the original type.
+  SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
+  // Overflowed if and only if this is not equal to Res.
+  Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+  // Use the calculated overflow everywhere.
+  ReplaceValueWith(SDValue(N, 1), Ofl);
+
+  return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
+  // Promote the overflow bit trivially.
+  if (ResNo == 1)
+    return PromoteIntRes_Overflow(N);
+
+  SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+  EVT SmallVT = LHS.getValueType();
+
+  // To determine if the result overflowed in a larger type, we extend the
+  // input to the larger type, do the multiply (checking if it overflows),
+  // then also check the high bits of the result to see if overflow happened
+  // there.
+  if (N->getOpcode() == ISD::SMULO) {
+    LHS = SExtPromotedInteger(LHS);
+    RHS = SExtPromotedInteger(RHS);
+  } else {
+    LHS = ZExtPromotedInteger(LHS);
+    RHS = ZExtPromotedInteger(RHS);
+  }
+  SDVTList VTs = DAG.getVTList(LHS.getValueType(), N->getValueType(1));
+  SDValue Mul = DAG.getNode(N->getOpcode(), DL, VTs, LHS, RHS);
+
+  // Overflow occurred if it occurred in the larger type, or if the high part
+  // of the result does not zero/sign-extend the low part.  Check this second
+  // possibility first.
+  SDValue Overflow;
+  if (N->getOpcode() == ISD::UMULO) {
+    // Unsigned overflow occurred if the high part is non-zero.
+    SDValue Hi = DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul,
+                             DAG.getIntPtrConstant(SmallVT.getSizeInBits()));
+    Overflow = DAG.getSetCC(DL, N->getValueType(1), Hi,
+                            DAG.getConstant(0, Hi.getValueType()), ISD::SETNE);
+  } else {
+    // Signed overflow occurred if the high part does not sign extend the low.
+    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Mul.getValueType(),
+                               Mul, DAG.getValueType(SmallVT));
+    Overflow = DAG.getSetCC(DL, N->getValueType(1), SExt, Mul, ISD::SETNE);
+  }
+
+  // The only other way for overflow to occur is if the multiplication in the
+  // larger type itself overflowed.
+  Overflow = DAG.getNode(ISD::OR, DL, N->getValueType(1), Overflow,
+                         SDValue(Mul.getNode(), 1));
+
+  // Use the calculated overflow everywhere.
+  ReplaceValueWith(SDValue(N, 1), Overflow);
+  return Mul;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UDIV(SDNode *N) {
+  // Zero extend the input.
+  SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+  SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) {
+  return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+                                               N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
+  SDValue Chain = N->getOperand(0); // Get the chain.
+  SDValue Ptr = N->getOperand(1); // Get the pointer.
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  MVT RegVT = TLI.getRegisterType(*DAG.getContext(), VT);
+  unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), VT);
+  // The argument is passed as NumRegs registers of type RegVT.
+
+  SmallVector<SDValue, 8> Parts(NumRegs);
+  for (unsigned i = 0; i < NumRegs; ++i) {
+    Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2),
+                            N->getConstantOperandVal(3));
+    Chain = Parts[i].getValue(1);
+  }
+
+  // Handle endianness of the load.
+  if (TLI.isBigEndian())
+    std::reverse(Parts.begin(), Parts.end());
+
+  // Assemble the parts in the promoted type.
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Res = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[0]);
+  for (unsigned i = 1; i < NumRegs; ++i) {
+    SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]);
+    // Shift it to the right position and "or" it in.
+    Part = DAG.getNode(ISD::SHL, dl, NVT, Part,
+                       DAG.getConstant(i * RegVT.getSizeInBits(),
+                                       TLI.getPointerTy()));
+    Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part);
+  }
+
+  // Modified the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Chain);
+
+  return Res;
+}
+
+//===----------------------------------------------------------------------===//
+//  Integer Operand Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need promotion.  At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+    return false;
+
+  switch (N->getOpcode()) {
+    default:
+  #ifndef NDEBUG
+    dbgs() << "PromoteIntegerOperand Op #" << OpNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+  #endif
+    llvm_unreachable("Do not know how to promote this operator's operand!");
+
+  case ISD::ANY_EXTEND:   Res = PromoteIntOp_ANY_EXTEND(N); break;
+  case ISD::ATOMIC_STORE:
+    Res = PromoteIntOp_ATOMIC_STORE(cast<AtomicSDNode>(N));
+    break;
+  case ISD::BITCAST:      Res = PromoteIntOp_BITCAST(N); break;
+  case ISD::BR_CC:        Res = PromoteIntOp_BR_CC(N, OpNo); break;
+  case ISD::BRCOND:       Res = PromoteIntOp_BRCOND(N, OpNo); break;
+  case ISD::BUILD_PAIR:   Res = PromoteIntOp_BUILD_PAIR(N); break;
+  case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break;
+  case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break;
+  case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break;
+  case ISD::CONVERT_RNDSAT:
+                          Res = PromoteIntOp_CONVERT_RNDSAT(N); break;
+  case ISD::INSERT_VECTOR_ELT:
+                          Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break;
+  case ISD::MEMBARRIER:   Res = PromoteIntOp_MEMBARRIER(N); break;
+  case ISD::SCALAR_TO_VECTOR:
+                          Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break;
+  case ISD::VSELECT:
+  case ISD::SELECT:       Res = PromoteIntOp_SELECT(N, OpNo); break;
+  case ISD::SELECT_CC:    Res = PromoteIntOp_SELECT_CC(N, OpNo); break;
+  case ISD::SETCC:        Res = PromoteIntOp_SETCC(N, OpNo); break;
+  case ISD::SIGN_EXTEND:  Res = PromoteIntOp_SIGN_EXTEND(N); break;
+  case ISD::SINT_TO_FP:   Res = PromoteIntOp_SINT_TO_FP(N); break;
+  case ISD::STORE:        Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
+                                                   OpNo); break;
+  case ISD::TRUNCATE:     Res = PromoteIntOp_TRUNCATE(N); break;
+  case ISD::FP16_TO_FP32:
+  case ISD::UINT_TO_FP:   Res = PromoteIntOp_UINT_TO_FP(N); break;
+  case ISD::ZERO_EXTEND:  Res = PromoteIntOp_ZERO_EXTEND(N); break;
+
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+  case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+/// PromoteSetCCOperands - Promote the operands of a comparison.  This code is
+/// shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
+                                            ISD::CondCode CCCode) {
+  // We have to insert explicit sign or zero extends.  Note that we could
+  // insert sign extends for ALL conditions, but zero extend is cheaper on
+  // many machines (an AND instead of two shifts), so prefer it.
+  switch (CCCode) {
+  default: llvm_unreachable("Unknown integer comparison!");
+  case ISD::SETEQ:
+  case ISD::SETNE:
+  case ISD::SETUGE:
+  case ISD::SETUGT:
+  case ISD::SETULE:
+  case ISD::SETULT:
+    // ALL of these operations will work if we either sign or zero extend
+    // the operands (including the unsigned comparisons!).  Zero extend is
+    // usually a simpler/cheaper operation, so prefer it.
+    NewLHS = ZExtPromotedInteger(NewLHS);
+    NewRHS = ZExtPromotedInteger(NewRHS);
+    break;
+  case ISD::SETGE:
+  case ISD::SETGT:
+  case ISD::SETLT:
+  case ISD::SETLE:
+    NewLHS = SExtPromotedInteger(NewLHS);
+    NewRHS = SExtPromotedInteger(NewRHS);
+    break;
+  }
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
+  SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+  return DAG.getAtomic(N->getOpcode(), N->getDebugLoc(), N->getMemoryVT(),
+                       N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(),
+                       N->getOrdering(), N->getSynchScope());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
+  // This should only occur in unusual situations like bitcasting to an
+  // x86_fp80, so just turn it into a store+load
+  return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 2 && "Don't know how to promote this operand!");
+
+  SDValue LHS = N->getOperand(2);
+  SDValue RHS = N->getOperand(3);
+  PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(1))->get());
+
+  // The chain (Op#0), CC (#1) and basic block destination (Op#4) are always
+  // legal types.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                N->getOperand(1), LHS, RHS, N->getOperand(4)),
+                 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 1 && "only know how to promote condition");
+
+  // Promote all the way up to the canonical SetCC type.
+  EVT SVT = TLI.getSetCCResultType(MVT::Other);
+  SDValue Cond = PromoteTargetBoolean(N->getOperand(1), SVT);
+
+  // The chain (Op#0) and basic block destination (Op#2) are always legal types.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond,
+                                        N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) {
+  // Since the result type is legal, the operands must promote to it.
+  EVT OVT = N->getOperand(0).getValueType();
+  SDValue Lo = ZExtPromotedInteger(N->getOperand(0));
+  SDValue Hi = GetPromotedInteger(N->getOperand(1));
+  assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?");
+  DebugLoc dl = N->getDebugLoc();
+
+  Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi,
+                   DAG.getConstant(OVT.getSizeInBits(), TLI.getPointerTy()));
+  return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) {
+  // The vector type is legal but the element type is not.  This implies
+  // that the vector is a power-of-two in length and that the element
+  // type does not have a strange size (eg: it is not i1).
+  EVT VecVT = N->getValueType(0);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  assert(!(NumElts & 1) && "Legal vector of one illegal element?");
+
+  // Promote the inserted value.  The type does not need to match the
+  // vector element type.  Check that any extra bits introduced will be
+  // truncated away.
+  assert(N->getOperand(0).getValueType().getSizeInBits() >=
+         N->getValueType(0).getVectorElementType().getSizeInBits() &&
+         "Type of inserted value narrower than vector element type!");
+
+  SmallVector<SDValue, 16> NewOps;
+  for (unsigned i = 0; i < NumElts; ++i)
+    NewOps.push_back(GetPromotedInteger(N->getOperand(i)));
+
+  return SDValue(DAG.UpdateNodeOperands(N, &NewOps[0], NumElts), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) {
+  ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+  assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU ||
+           CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU ||
+           CvtCode == ISD::CVT_FS || CvtCode == ISD::CVT_FU) &&
+           "can only promote integer arguments");
+  SDValue InOp = GetPromotedInteger(N->getOperand(0));
+  return DAG.getConvertRndSat(N->getValueType(0), N->getDebugLoc(), InOp,
+                              N->getOperand(1), N->getOperand(2),
+                              N->getOperand(3), N->getOperand(4), CvtCode);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N,
+                                                         unsigned OpNo) {
+  if (OpNo == 1) {
+    // Promote the inserted value.  This is valid because the type does not
+    // have to match the vector element type.
+
+    // Check that any extra bits introduced will be truncated away.
+    assert(N->getOperand(1).getValueType().getSizeInBits() >=
+           N->getValueType(0).getVectorElementType().getSizeInBits() &&
+           "Type of inserted value narrower than vector element type!");
+    return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                  GetPromotedInteger(N->getOperand(1)),
+                                  N->getOperand(2)),
+                   0);
+  }
+
+  assert(OpNo == 2 && "Different operand and result vector types?");
+
+  // Promote the index.
+  SDValue Idx = ZExtPromotedInteger(N->getOperand(2));
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                N->getOperand(1), Idx), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MEMBARRIER(SDNode *N) {
+  SDValue NewOps[6];
+  DebugLoc dl = N->getDebugLoc();
+  NewOps[0] = N->getOperand(0);
+  for (unsigned i = 1; i < array_lengthof(NewOps); ++i) {
+    SDValue Flag = GetPromotedInteger(N->getOperand(i));
+    NewOps[i] = DAG.getZeroExtendInReg(Flag, dl, MVT::i1);
+  }
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps, array_lengthof(NewOps)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) {
+  // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote
+  // the operand in place.
+  return SDValue(DAG.UpdateNodeOperands(N,
+                                GetPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 0 && "Only know how to promote the condition!");
+  SDValue Cond = N->getOperand(0);
+  EVT OpTy = N->getOperand(1).getValueType();
+
+  // Promote all the way up to the canonical SetCC type.
+  EVT SVT = TLI.getSetCCResultType(N->getOpcode() == ISD::SELECT ?
+                                   OpTy.getScalarType() : OpTy);
+  Cond = PromoteTargetBoolean(Cond, SVT);
+
+  return SDValue(DAG.UpdateNodeOperands(N, Cond, N->getOperand(1),
+                                        N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(4))->get());
+
+  // The CC (#4) and the possible return values (#2 and #3) have legal types.
+  return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2),
+                                N->getOperand(3), N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) {
+  assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(2))->get());
+
+  // The CC (#2) is always legal.
+  return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) {
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                ZExtPromotedInteger(N->getOperand(1))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  DebugLoc dl = N->getDebugLoc();
+  Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(),
+                     Op, DAG.getValueType(N->getOperand(0).getValueType()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) {
+  return SDValue(DAG.UpdateNodeOperands(N,
+                                SExtPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
+  assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+  SDValue Ch = N->getChain(), Ptr = N->getBasePtr();
+  unsigned Alignment = N->getAlignment();
+  bool isVolatile = N->isVolatile();
+  bool isNonTemporal = N->isNonTemporal();
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue Val = GetPromotedInteger(N->getValue());  // Get promoted value.
+
+  // Truncate the value and store the result.
+  return DAG.getTruncStore(Ch, dl, Val, Ptr, N->getPointerInfo(),
+                           N->getMemoryVT(),
+                           isVolatile, isNonTemporal, Alignment);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) {
+  return SDValue(DAG.UpdateNodeOperands(N,
+                                ZExtPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op = GetPromotedInteger(N->getOperand(0));
+  Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+  return DAG.getZeroExtendInReg(Op, dl,
+                                N->getOperand(0).getValueType().getScalarType());
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Integer Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerResult - This method is called when the specified result of the
+/// specified node is found to need expansion.  At this point, the node may also
+/// have invalid operands or may have other results that need promotion, we just
+/// know that (at least) one result needs expansion.
+void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Expand integer result: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Lo, Hi;
+  Lo = Hi = SDValue();
+
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true))
+    return;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "ExpandIntegerResult #" << ResNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to expand the result of this operator!");
+
+  case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+  case ISD::SELECT:       SplitRes_SELECT(N, Lo, Hi); break;
+  case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
+  case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
+
+  case ISD::BITCAST:            ExpandRes_BITCAST(N, Lo, Hi); break;
+  case ISD::BUILD_PAIR:         ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+  case ISD::EXTRACT_ELEMENT:    ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+  case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+  case ISD::VAARG:              ExpandRes_VAARG(N, Lo, Hi); break;
+
+  case ISD::ANY_EXTEND:  ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break;
+  case ISD::AssertSext:  ExpandIntRes_AssertSext(N, Lo, Hi); break;
+  case ISD::AssertZext:  ExpandIntRes_AssertZext(N, Lo, Hi); break;
+  case ISD::BSWAP:       ExpandIntRes_BSWAP(N, Lo, Hi); break;
+  case ISD::Constant:    ExpandIntRes_Constant(N, Lo, Hi); break;
+  case ISD::CTLZ_ZERO_UNDEF:
+  case ISD::CTLZ:        ExpandIntRes_CTLZ(N, Lo, Hi); break;
+  case ISD::CTPOP:       ExpandIntRes_CTPOP(N, Lo, Hi); break;
+  case ISD::CTTZ_ZERO_UNDEF:
+  case ISD::CTTZ:        ExpandIntRes_CTTZ(N, Lo, Hi); break;
+  case ISD::FP_TO_SINT:  ExpandIntRes_FP_TO_SINT(N, Lo, Hi); break;
+  case ISD::FP_TO_UINT:  ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break;
+  case ISD::LOAD:        ExpandIntRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break;
+  case ISD::MUL:         ExpandIntRes_MUL(N, Lo, Hi); break;
+  case ISD::SDIV:        ExpandIntRes_SDIV(N, Lo, Hi); break;
+  case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break;
+  case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
+  case ISD::SREM:        ExpandIntRes_SREM(N, Lo, Hi); break;
+  case ISD::TRUNCATE:    ExpandIntRes_TRUNCATE(N, Lo, Hi); break;
+  case ISD::UDIV:        ExpandIntRes_UDIV(N, Lo, Hi); break;
+  case ISD::UREM:        ExpandIntRes_UREM(N, Lo, Hi); break;
+  case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break;
+  case ISD::ATOMIC_LOAD: ExpandIntRes_ATOMIC_LOAD(N, Lo, Hi); break;
+
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_SWAP: {
+    std::pair<SDValue, SDValue> Tmp = ExpandAtomic(N);
+    SplitInteger(Tmp.first, Lo, Hi);
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+    break;
+  }
+
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: ExpandIntRes_Logical(N, Lo, Hi); break;
+
+  case ISD::ADD:
+  case ISD::SUB: ExpandIntRes_ADDSUB(N, Lo, Hi); break;
+
+  case ISD::ADDC:
+  case ISD::SUBC: ExpandIntRes_ADDSUBC(N, Lo, Hi); break;
+
+  case ISD::ADDE:
+  case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break;
+
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break;
+
+  case ISD::SADDO:
+  case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break;
+  case ISD::UADDO:
+  case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break;
+  case ISD::UMULO:
+  case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break;
+  }
+
+  // If Lo/Hi is null, the sub-method took care of registering results etc.
+  if (Lo.getNode())
+    SetExpandedInteger(SDValue(N, ResNo), Lo, Hi);
+}
+
+/// Lower an atomic node to the appropriate builtin call.
+std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
+  unsigned Opc = Node->getOpcode();
+  MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
+  RTLIB::Libcall LC;
+
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic intrinsic Expand!");
+  case ISD::ATOMIC_SWAP:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_LOCK_TEST_AND_SET_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break;
+    }
+    break;
+  case ISD::ATOMIC_CMP_SWAP:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_ADD:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_ADD_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_SUB:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_SUB_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_AND:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_AND_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_OR:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_OR_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_XOR:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_XOR_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_NAND:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_NAND_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break;
+    }
+    break;
+  }
+
+  return ExpandChainLibCall(LC, Node, false);
+}
+
+/// ExpandShiftByConstant - N is a shift by a value that needs to be expanded,
+/// and the shift amount is a constant 'Amt'.  Expand the operation.
+void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt,
+                                             SDValue &Lo, SDValue &Hi) {
+  DebugLoc DL = N->getDebugLoc();
+  // Expand the incoming operand to be shifted, so that we have its parts
+  SDValue InL, InH;
+  GetExpandedInteger(N->getOperand(0), InL, InH);
+
+  EVT NVT = InL.getValueType();
+  unsigned VTBits = N->getValueType(0).getSizeInBits();
+  unsigned NVTBits = NVT.getSizeInBits();
+  EVT ShTy = N->getOperand(1).getValueType();
+
+  if (N->getOpcode() == ISD::SHL) {
+    if (Amt > VTBits) {
+      Lo = Hi = DAG.getConstant(0, NVT);
+    } else if (Amt > NVTBits) {
+      Lo = DAG.getConstant(0, NVT);
+      Hi = DAG.getNode(ISD::SHL, DL,
+                       NVT, InL, DAG.getConstant(Amt-NVTBits, ShTy));
+    } else if (Amt == NVTBits) {
+      Lo = DAG.getConstant(0, NVT);
+      Hi = InL;
+    } else if (Amt == 1 &&
+               TLI.isOperationLegalOrCustom(ISD::ADDC,
+                              TLI.getTypeToExpandTo(*DAG.getContext(), NVT))) {
+      // Emit this X << 1 as X+X.
+      SDVTList VTList = DAG.getVTList(NVT, MVT::Glue);
+      SDValue LoOps[2] = { InL, InL };
+      Lo = DAG.getNode(ISD::ADDC, DL, VTList, LoOps, 2);
+      SDValue HiOps[3] = { InH, InH, Lo.getValue(1) };
+      Hi = DAG.getNode(ISD::ADDE, DL, VTList, HiOps, 3);
+    } else {
+      Lo = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt, ShTy));
+      Hi = DAG.getNode(ISD::OR, DL, NVT,
+                       DAG.getNode(ISD::SHL, DL, NVT, InH,
+                                   DAG.getConstant(Amt, ShTy)),
+                       DAG.getNode(ISD::SRL, DL, NVT, InL,
+                                   DAG.getConstant(NVTBits-Amt, ShTy)));
+    }
+    return;
+  }
+
+  if (N->getOpcode() == ISD::SRL) {
+    if (Amt > VTBits) {
+      Lo = DAG.getConstant(0, NVT);
+      Hi = DAG.getConstant(0, NVT);
+    } else if (Amt > NVTBits) {
+      Lo = DAG.getNode(ISD::SRL, DL,
+                       NVT, InH, DAG.getConstant(Amt-NVTBits,ShTy));
+      Hi = DAG.getConstant(0, NVT);
+    } else if (Amt == NVTBits) {
+      Lo = InH;
+      Hi = DAG.getConstant(0, NVT);
+    } else {
+      Lo = DAG.getNode(ISD::OR, DL, NVT,
+                       DAG.getNode(ISD::SRL, DL, NVT, InL,
+                                   DAG.getConstant(Amt, ShTy)),
+                       DAG.getNode(ISD::SHL, DL, NVT, InH,
+                                   DAG.getConstant(NVTBits-Amt, ShTy)));
+      Hi = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt, ShTy));
+    }
+    return;
+  }
+
+  assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+  if (Amt > VTBits) {
+    Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
+                          DAG.getConstant(NVTBits-1, ShTy));
+  } else if (Amt > NVTBits) {
+    Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
+                     DAG.getConstant(Amt-NVTBits, ShTy));
+    Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
+                     DAG.getConstant(NVTBits-1, ShTy));
+  } else if (Amt == NVTBits) {
+    Lo = InH;
+    Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
+                     DAG.getConstant(NVTBits-1, ShTy));
+  } else {
+    Lo = DAG.getNode(ISD::OR, DL, NVT,
+                     DAG.getNode(ISD::SRL, DL, NVT, InL,
+                                 DAG.getConstant(Amt, ShTy)),
+                     DAG.getNode(ISD::SHL, DL, NVT, InH,
+                                 DAG.getConstant(NVTBits-Amt, ShTy)));
+    Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt, ShTy));
+  }
+}
+
+/// ExpandShiftWithKnownAmountBit - Try to determine whether we can simplify
+/// this shift based on knowledge of the high bit of the shift amount.  If we
+/// can tell this, we know that it is >= 32 or < 32, without knowing the actual
+/// shift amount.
+bool DAGTypeLegalizer::
+ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  SDValue Amt = N->getOperand(1);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT ShTy = Amt.getValueType();
+  unsigned ShBits = ShTy.getScalarType().getSizeInBits();
+  unsigned NVTBits = NVT.getScalarType().getSizeInBits();
+  assert(isPowerOf2_32(NVTBits) &&
+         "Expanded integer type size not a power of two!");
+  DebugLoc dl = N->getDebugLoc();
+
+  APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits));
+  APInt KnownZero, KnownOne;
+  DAG.ComputeMaskedBits(N->getOperand(1), KnownZero, KnownOne);
+
+  // If we don't know anything about the high bits, exit.
+  if (((KnownZero|KnownOne) & HighBitMask) == 0)
+    return false;
+
+  // Get the incoming operand to be shifted.
+  SDValue InL, InH;
+  GetExpandedInteger(N->getOperand(0), InL, InH);
+
+  // If we know that any of the high bits of the shift amount are one, then we
+  // can do this as a couple of simple shifts.
+  if (KnownOne.intersects(HighBitMask)) {
+    // Mask out the high bit, which we know is set.
+    Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt,
+                      DAG.getConstant(~HighBitMask, ShTy));
+
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unknown shift");
+    case ISD::SHL:
+      Lo = DAG.getConstant(0, NVT);              // Low part is zero.
+      Hi = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part.
+      return true;
+    case ISD::SRL:
+      Hi = DAG.getConstant(0, NVT);              // Hi part is zero.
+      Lo = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part.
+      return true;
+    case ISD::SRA:
+      Hi = DAG.getNode(ISD::SRA, dl, NVT, InH,       // Sign extend high part.
+                       DAG.getConstant(NVTBits-1, ShTy));
+      Lo = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part.
+      return true;
+    }
+  }
+
+  // If we know that all of the high bits of the shift amount are zero, then we
+  // can do this as a couple of simple shifts.
+  if ((KnownZero & HighBitMask) == HighBitMask) {
+    // Calculate 31-x. 31 is used instead of 32 to avoid creating an undefined
+    // shift if x is zero.  We can use XOR here because x is known to be smaller
+    // than 32.
+    SDValue Amt2 = DAG.getNode(ISD::XOR, dl, ShTy, Amt,
+                               DAG.getConstant(NVTBits-1, ShTy));
+
+    unsigned Op1, Op2;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unknown shift");
+    case ISD::SHL:  Op1 = ISD::SHL; Op2 = ISD::SRL; break;
+    case ISD::SRL:
+    case ISD::SRA:  Op1 = ISD::SRL; Op2 = ISD::SHL; break;
+    }
+
+    // When shifting right the arithmetic for Lo and Hi is swapped.
+    if (N->getOpcode() != ISD::SHL)
+      std::swap(InL, InH);
+
+    // Use a little trick to get the bits that move from Lo to Hi. First
+    // shift by one bit.
+    SDValue Sh1 = DAG.getNode(Op2, dl, NVT, InL, DAG.getConstant(1, ShTy));
+    // Then compute the remaining shift with amount-1.
+    SDValue Sh2 = DAG.getNode(Op2, dl, NVT, Sh1, Amt2);
+
+    Lo = DAG.getNode(N->getOpcode(), dl, NVT, InL, Amt);
+    Hi = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(Op1, dl, NVT, InH, Amt),Sh2);
+
+    if (N->getOpcode() != ISD::SHL)
+      std::swap(Hi, Lo);
+    return true;
+  }
+
+  return false;
+}
+
+/// ExpandShiftWithUnknownAmountBit - Fully general expansion of integer shift
+/// of any size.
+bool DAGTypeLegalizer::
+ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  SDValue Amt = N->getOperand(1);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT ShTy = Amt.getValueType();
+  unsigned NVTBits = NVT.getSizeInBits();
+  assert(isPowerOf2_32(NVTBits) &&
+         "Expanded integer type size not a power of two!");
+  DebugLoc dl = N->getDebugLoc();
+
+  // Get the incoming operand to be shifted.
+  SDValue InL, InH;
+  GetExpandedInteger(N->getOperand(0), InL, InH);
+
+  SDValue NVBitsNode = DAG.getConstant(NVTBits, ShTy);
+  SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode);
+  SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
+  SDValue isShort = DAG.getSetCC(dl, TLI.getSetCCResultType(ShTy),
+                                 Amt, NVBitsNode, ISD::SETULT);
+
+  SDValue LoS, HiS, LoL, HiL;
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("Unknown shift");
+  case ISD::SHL:
+    // Short: ShAmt < NVTBits
+    LoS = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt);
+    HiS = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, Amt),
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    LoL = DAG.getConstant(0, NVT);                        // Lo part is zero.
+    HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
+    return true;
+  case ISD::SRL:
+    // Short: ShAmt < NVTBits
+    HiS = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt);
+    LoS = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    HiL = DAG.getConstant(0, NVT);                        // Hi part is zero.
+    LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
+    return true;
+  case ISD::SRA:
+    // Short: ShAmt < NVTBits
+    HiS = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt);
+    LoS = DAG.getNode(ISD::OR, dl, NVT,
+                      DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+    // FIXME: If Amt is zero, the following shift generates an undefined result
+    // on some architectures.
+                      DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+    // Long: ShAmt >= NVTBits
+    HiL = DAG.getNode(ISD::SRA, dl, NVT, InH,             // Sign of Hi part.
+                      DAG.getConstant(NVTBits-1, ShTy));
+    LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
+    return true;
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
+                                           SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  // Expand the subcomponents.
+  SDValue LHSL, LHSH, RHSL, RHSH;
+  GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+  GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+
+  EVT NVT = LHSL.getValueType();
+  SDValue LoOps[2] = { LHSL, RHSL };
+  SDValue HiOps[3] = { LHSH, RHSH };
+
+  // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support
+  // them.  TODO: Teach operation legalization how to expand unsupported
+  // ADDC/ADDE/SUBC/SUBE.  The problem is that these operations generate
+  // a carry of type MVT::Glue, but there doesn't seem to be any way to
+  // generate a value of this type in the expanded code sequence.
+  bool hasCarry =
+    TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ?
+                                   ISD::ADDC : ISD::SUBC,
+                                 TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+
+  if (hasCarry) {
+    SDVTList VTList = DAG.getVTList(NVT, MVT::Glue);
+    if (N->getOpcode() == ISD::ADD) {
+      Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2);
+      HiOps[2] = Lo.getValue(1);
+      Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3);
+    } else {
+      Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps, 2);
+      HiOps[2] = Lo.getValue(1);
+      Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3);
+    }
+    return;
+  }
+
+  if (N->getOpcode() == ISD::ADD) {
+    Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps, 2);
+    Hi = DAG.getNode(ISD::ADD, dl, NVT, HiOps, 2);
+    SDValue Cmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo, LoOps[0],
+                                ISD::SETULT);
+    SDValue Carry1 = DAG.getNode(ISD::SELECT, dl, NVT, Cmp1,
+                                 DAG.getConstant(1, NVT),
+                                 DAG.getConstant(0, NVT));
+    SDValue Cmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo, LoOps[1],
+                                ISD::SETULT);
+    SDValue Carry2 = DAG.getNode(ISD::SELECT, dl, NVT, Cmp2,
+                                 DAG.getConstant(1, NVT), Carry1);
+    Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry2);
+  } else {
+    Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps, 2);
+    Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps, 2);
+    SDValue Cmp =
+      DAG.getSetCC(dl, TLI.getSetCCResultType(LoOps[0].getValueType()),
+                   LoOps[0], LoOps[1], ISD::SETULT);
+    SDValue Borrow = DAG.getNode(ISD::SELECT, dl, NVT, Cmp,
+                                 DAG.getConstant(1, NVT),
+                                 DAG.getConstant(0, NVT));
+    Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow);
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  // Expand the subcomponents.
+  SDValue LHSL, LHSH, RHSL, RHSH;
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+  GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+  SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue);
+  SDValue LoOps[2] = { LHSL, RHSL };
+  SDValue HiOps[3] = { LHSH, RHSH };
+
+  if (N->getOpcode() == ISD::ADDC) {
+    Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps, 2);
+    HiOps[2] = Lo.getValue(1);
+    Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps, 3);
+  } else {
+    Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps, 2);
+    HiOps[2] = Lo.getValue(1);
+    Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3);
+  }
+
+  // Legalized the flag result - switch anything that used the old flag to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  // Expand the subcomponents.
+  SDValue LHSL, LHSH, RHSL, RHSH;
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+  GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+  SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue);
+  SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) };
+  SDValue HiOps[3] = { LHSH, RHSH };
+
+  Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps, 3);
+  HiOps[2] = Lo.getValue(1);
+  Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps, 3);
+
+  // Legalized the flag result - switch anything that used the old flag to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+                                                 SDValue &Lo, SDValue &Hi) {
+  SDValue Res = DisintegrateMERGE_VALUES(N, ResNo);
+  SplitInteger(Res, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
+                                               SDValue &Lo, SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op = N->getOperand(0);
+  if (Op.getValueType().bitsLE(NVT)) {
+    // The low part is any extension of the input (which degenerates to a copy).
+    Lo = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Op);
+    Hi = DAG.getUNDEF(NVT);   // The high part is undefined.
+  } else {
+    // For example, extension of an i48 to an i64.  The operand type necessarily
+    // promotes to the result type, so will end up being expanded too.
+    assert(getTypeAction(Op.getValueType()) ==
+           TargetLowering::TypePromoteInteger &&
+           "Only know how to promote this result!");
+    SDValue Res = GetPromotedInteger(Op);
+    assert(Res.getValueType() == N->getValueType(0) &&
+           "Operand over promoted?");
+    // Split the promoted operand.  This will simplify when it is expanded.
+    SplitInteger(Res, Lo, Hi);
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N,
+                                               SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+  EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+  unsigned NVTBits = NVT.getSizeInBits();
+  unsigned EVTBits = EVT.getSizeInBits();
+
+  if (NVTBits < EVTBits) {
+    Hi = DAG.getNode(ISD::AssertSext, dl, NVT, Hi,
+                     DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+                                                        EVTBits - NVTBits)));
+  } else {
+    Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT));
+    // The high part replicates the sign bit of Lo, make it explicit.
+    Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+                     DAG.getConstant(NVTBits-1, TLI.getPointerTy()));
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N,
+                                               SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+  EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+  unsigned NVTBits = NVT.getSizeInBits();
+  unsigned EVTBits = EVT.getSizeInBits();
+
+  if (NVTBits < EVTBits) {
+    Hi = DAG.getNode(ISD::AssertZext, dl, NVT, Hi,
+                     DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+                                                        EVTBits - NVTBits)));
+  } else {
+    Lo = DAG.getNode(ISD::AssertZext, dl, NVT, Lo, DAG.getValueType(EVT));
+    // The high part must be zero, make it explicit.
+    Hi = DAG.getConstant(0, NVT);
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N,
+                                          SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), Hi, Lo);  // Note swapped operands.
+  Lo = DAG.getNode(ISD::BSWAP, dl, Lo.getValueType(), Lo);
+  Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned NBitWidth = NVT.getSizeInBits();
+  const APInt &Cst = cast<ConstantSDNode>(N)->getAPIntValue();
+  Lo = DAG.getConstant(Cst.trunc(NBitWidth), NVT);
+  Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  // ctlz (HiLo) -> Hi != 0 ? ctlz(Hi) : (ctlz(Lo)+32)
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+
+  SDValue HiNotZero = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Hi,
+                                   DAG.getConstant(0, NVT), ISD::SETNE);
+
+  SDValue LoLZ = DAG.getNode(N->getOpcode(), dl, NVT, Lo);
+  SDValue HiLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Hi);
+
+  Lo = DAG.getNode(ISD::SELECT, dl, NVT, HiNotZero, HiLZ,
+                   DAG.getNode(ISD::ADD, dl, NVT, LoLZ,
+                               DAG.getConstant(NVT.getSizeInBits(), NVT)));
+  Hi = DAG.getConstant(0, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N,
+                                          SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo)
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+  Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo),
+                   DAG.getNode(ISD::CTPOP, dl, NVT, Hi));
+  Hi = DAG.getConstant(0, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  // cttz (HiLo) -> Lo != 0 ? cttz(Lo) : (cttz(Hi)+32)
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT NVT = Lo.getValueType();
+
+  SDValue LoNotZero = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo,
+                                   DAG.getConstant(0, NVT), ISD::SETNE);
+
+  SDValue LoLZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, NVT, Lo);
+  SDValue HiLZ = DAG.getNode(N->getOpcode(), dl, NVT, Hi);
+
+  Lo = DAG.getNode(ISD::SELECT, dl, NVT, LoNotZero, LoLZ,
+                   DAG.getNode(ISD::ADD, dl, NVT, HiLZ,
+                               DAG.getConstant(NVT.getSizeInBits(), NVT)));
+  Hi = DAG.getConstant(0, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
+                                               SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/, dl),
+               Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
+                                               SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/, dl),
+               Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  if (ISD::isNormalLoad(N)) {
+    ExpandRes_NormalLoad(N, Lo, Hi);
+    return;
+  }
+
+  assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue Ch  = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  ISD::LoadExtType ExtType = N->getExtensionType();
+  unsigned Alignment = N->getAlignment();
+  bool isVolatile = N->isVolatile();
+  bool isNonTemporal = N->isNonTemporal();
+  bool isInvariant = N->isInvariant();
+  DebugLoc dl = N->getDebugLoc();
+
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+  if (N->getMemoryVT().bitsLE(NVT)) {
+    EVT MemVT = N->getMemoryVT();
+
+    Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
+                        MemVT, isVolatile, isNonTemporal, Alignment);
+
+    // Remember the chain.
+    Ch = Lo.getValue(1);
+
+    if (ExtType == ISD::SEXTLOAD) {
+      // The high part is obtained by SRA'ing all but one of the bits of the
+      // lo part.
+      unsigned LoSize = Lo.getValueType().getSizeInBits();
+      Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+                       DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+    } else if (ExtType == ISD::ZEXTLOAD) {
+      // The high part is just a zero.
+      Hi = DAG.getConstant(0, NVT);
+    } else {
+      assert(ExtType == ISD::EXTLOAD && "Unknown extload!");
+      // The high part is undefined.
+      Hi = DAG.getUNDEF(NVT);
+    }
+  } else if (TLI.isLittleEndian()) {
+    // Little-endian - low bits are at low addresses.
+    Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
+                     isVolatile, isNonTemporal, isInvariant, Alignment);
+
+    unsigned ExcessBits =
+      N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+    EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
+
+    // Increment the pointer to the other half.
+    unsigned IncrementSize = NVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getIntPtrConstant(IncrementSize));
+    Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
+                        N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
+                        isVolatile, isNonTemporal,
+                        MinAlign(Alignment, IncrementSize));
+
+    // Build a factor node to remember that this load is independent of the
+    // other one.
+    Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                     Hi.getValue(1));
+  } else {
+    // Big-endian - high bits are at low addresses.  Favor aligned loads at
+    // the cost of some bit-fiddling.
+    EVT MemVT = N->getMemoryVT();
+    unsigned EBytes = MemVT.getStoreSize();
+    unsigned IncrementSize = NVT.getSizeInBits()/8;
+    unsigned ExcessBits = (EBytes - IncrementSize)*8;
+
+    // Load both the high bits and maybe some of the low bits.
+    Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
+                        EVT::getIntegerVT(*DAG.getContext(),
+                                          MemVT.getSizeInBits() - ExcessBits),
+                        isVolatile, isNonTemporal, Alignment);
+
+    // Increment the pointer to the other half.
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getIntPtrConstant(IncrementSize));
+    // Load the rest of the low bits.
+    Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr,
+                        N->getPointerInfo().getWithOffset(IncrementSize),
+                        EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
+                        isVolatile, isNonTemporal,
+                        MinAlign(Alignment, IncrementSize));
+
+    // Build a factor node to remember that this load is independent of the
+    // other one.
+    Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                     Hi.getValue(1));
+
+    if (ExcessBits < NVT.getSizeInBits()) {
+      // Transfer low bits from the bottom of Hi to the top of Lo.
+      Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
+                       DAG.getNode(ISD::SHL, dl, NVT, Hi,
+                                   DAG.getConstant(ExcessBits,
+                                                   TLI.getPointerTy())));
+      // Move high bits to the right position in Hi.
+      Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl,
+                       NVT, Hi,
+                       DAG.getConstant(NVT.getSizeInBits() - ExcessBits,
+                                       TLI.getPointerTy()));
+    }
+  }
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Ch);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Logical(SDNode *N,
+                                            SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue LL, LH, RL, RH;
+  GetExpandedInteger(N->getOperand(0), LL, LH);
+  GetExpandedInteger(N->getOperand(1), RL, RH);
+  Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LL, RL);
+  Hi = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LH, RH);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
+                                        SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  DebugLoc dl = N->getDebugLoc();
+
+  bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, NVT);
+  bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, NVT);
+  bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, NVT);
+  bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, NVT);
+  if (HasMULHU || HasMULHS || HasUMUL_LOHI || HasSMUL_LOHI) {
+    SDValue LL, LH, RL, RH;
+    GetExpandedInteger(N->getOperand(0), LL, LH);
+    GetExpandedInteger(N->getOperand(1), RL, RH);
+    unsigned OuterBitSize = VT.getSizeInBits();
+    unsigned InnerBitSize = NVT.getSizeInBits();
+    unsigned LHSSB = DAG.ComputeNumSignBits(N->getOperand(0));
+    unsigned RHSSB = DAG.ComputeNumSignBits(N->getOperand(1));
+
+    APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);
+    if (DAG.MaskedValueIsZero(N->getOperand(0), HighMask) &&
+        DAG.MaskedValueIsZero(N->getOperand(1), HighMask)) {
+      // The inputs are both zero-extended.
+      if (HasUMUL_LOHI) {
+        // We can emit a umul_lohi.
+        Lo = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(NVT, NVT), LL, RL);
+        Hi = SDValue(Lo.getNode(), 1);
+        return;
+      }
+      if (HasMULHU) {
+        // We can emit a mulhu+mul.
+        Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL);
+        Hi = DAG.getNode(ISD::MULHU, dl, NVT, LL, RL);
+        return;
+      }
+    }
+    if (LHSSB > InnerBitSize && RHSSB > InnerBitSize) {
+      // The input values are both sign-extended.
+      if (HasSMUL_LOHI) {
+        // We can emit a smul_lohi.
+        Lo = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(NVT, NVT), LL, RL);
+        Hi = SDValue(Lo.getNode(), 1);
+        return;
+      }
+      if (HasMULHS) {
+        // We can emit a mulhs+mul.
+        Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL);
+        Hi = DAG.getNode(ISD::MULHS, dl, NVT, LL, RL);
+        return;
+      }
+    }
+    if (HasUMUL_LOHI) {
+      // Lo,Hi = umul LHS, RHS.
+      SDValue UMulLOHI = DAG.getNode(ISD::UMUL_LOHI, dl,
+                                       DAG.getVTList(NVT, NVT), LL, RL);
+      Lo = UMulLOHI;
+      Hi = UMulLOHI.getValue(1);
+      RH = DAG.getNode(ISD::MUL, dl, NVT, LL, RH);
+      LH = DAG.getNode(ISD::MUL, dl, NVT, LH, RL);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, RH);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, LH);
+      return;
+    }
+    if (HasMULHU) {
+      Lo = DAG.getNode(ISD::MUL, dl, NVT, LL, RL);
+      Hi = DAG.getNode(ISD::MULHU, dl, NVT, LL, RL);
+      RH = DAG.getNode(ISD::MUL, dl, NVT, LL, RH);
+      LH = DAG.getNode(ISD::MUL, dl, NVT, LH, RL);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, RH);
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, LH);
+      return;
+    }
+  }
+
+  // If nothing else, we can make a libcall.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::MUL_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::MUL_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::MUL_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::MUL_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/, dl),
+               Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue LHS = Node->getOperand(0);
+  SDValue RHS = Node->getOperand(1);
+  DebugLoc dl = Node->getDebugLoc();
+
+  // Expand the result by simply replacing it with the equivalent
+  // non-overflow-checking operation.
+  SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
+                            ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+                            LHS, RHS);
+  SplitInteger(Sum, Lo, Hi);
+
+  // Compute the overflow.
+  //
+  //   LHSSign -> LHS >= 0
+  //   RHSSign -> RHS >= 0
+  //   SumSign -> Sum >= 0
+  //
+  //   Add:
+  //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+  //   Sub:
+  //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+  //
+  EVT OType = Node->getValueType(1);
+  SDValue Zero = DAG.getConstant(0, LHS.getValueType());
+
+  SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
+  SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
+  SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
+                                    Node->getOpcode() == ISD::SADDO ?
+                                    ISD::SETEQ : ISD::SETNE);
+
+  SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE);
+  SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
+
+  SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
+
+  // Use the calculated overflow everywhere.
+  ReplaceValueWith(SDValue(Node, 1), Cmp);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::SDIV_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::SDIV_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::SDIV_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::SDIV_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
+                                          SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // If we can emit an efficient shift operation, do so now.  Check to see if
+  // the RHS is a constant.
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return ExpandShiftByConstant(N, CN->getZExtValue(), Lo, Hi);
+
+  // If we can determine that the high bit of the shift is zero or one, even if
+  // the low bits are variable, emit this shift in an optimized form.
+  if (ExpandShiftWithKnownAmountBit(N, Lo, Hi))
+    return;
+
+  // If this target supports shift_PARTS, use it.  First, map to the _PARTS opc.
+  unsigned PartsOpc;
+  if (N->getOpcode() == ISD::SHL) {
+    PartsOpc = ISD::SHL_PARTS;
+  } else if (N->getOpcode() == ISD::SRL) {
+    PartsOpc = ISD::SRL_PARTS;
+  } else {
+    assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+    PartsOpc = ISD::SRA_PARTS;
+  }
+
+  // Next check to see if the target supports this SHL_PARTS operation or if it
+  // will custom expand it.
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT);
+  if ((Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) ||
+      Action == TargetLowering::Custom) {
+    // Expand the subcomponents.
+    SDValue LHSL, LHSH;
+    GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+    EVT VT = LHSL.getValueType();
+
+    // If the shift amount operand is coming from a vector legalization it may
+    // have an illegal type.  Fix that first by casting the operand, otherwise
+    // the new SHL_PARTS operation would need further legalization.
+    SDValue ShiftOp = N->getOperand(1);
+    EVT ShiftTy = TLI.getShiftAmountTy(VT);
+    assert(ShiftTy.getScalarType().getSizeInBits() >=
+           Log2_32_Ceil(VT.getScalarType().getSizeInBits()) &&
+           "ShiftAmountTy is too small to cover the range of this type!");
+    if (ShiftOp.getValueType() != ShiftTy)
+      ShiftOp = DAG.getZExtOrTrunc(ShiftOp, dl, ShiftTy);
+
+    SDValue Ops[] = { LHSL, LHSH, ShiftOp };
+    Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops, 3);
+    Hi = Lo.getValue(1);
+    return;
+  }
+
+  // Otherwise, emit a libcall.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  bool isSigned;
+  if (N->getOpcode() == ISD::SHL) {
+    isSigned = false; /*sign irrelevant*/
+    if (VT == MVT::i16)
+      LC = RTLIB::SHL_I16;
+    else if (VT == MVT::i32)
+      LC = RTLIB::SHL_I32;
+    else if (VT == MVT::i64)
+      LC = RTLIB::SHL_I64;
+    else if (VT == MVT::i128)
+      LC = RTLIB::SHL_I128;
+  } else if (N->getOpcode() == ISD::SRL) {
+    isSigned = false;
+    if (VT == MVT::i16)
+      LC = RTLIB::SRL_I16;
+    else if (VT == MVT::i32)
+      LC = RTLIB::SRL_I32;
+    else if (VT == MVT::i64)
+      LC = RTLIB::SRL_I64;
+    else if (VT == MVT::i128)
+      LC = RTLIB::SRL_I128;
+  } else {
+    assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+    isSigned = true;
+    if (VT == MVT::i16)
+      LC = RTLIB::SRA_I16;
+    else if (VT == MVT::i32)
+      LC = RTLIB::SRA_I32;
+    else if (VT == MVT::i64)
+      LC = RTLIB::SRA_I64;
+    else if (VT == MVT::i128)
+      LC = RTLIB::SRA_I128;
+  }
+
+  if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
+    SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+    SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl), Lo, Hi);
+    return;
+  }
+
+  if (!ExpandShiftWithUnknownAmountBit(N, Lo, Hi))
+    llvm_unreachable("Unsupported shift!");
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
+                                                SDValue &Lo, SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op = N->getOperand(0);
+  if (Op.getValueType().bitsLE(NVT)) {
+    // The low part is sign extension of the input (degenerates to a copy).
+    Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0));
+    // The high part is obtained by SRA'ing all but one of the bits of low part.
+    unsigned LoSize = NVT.getSizeInBits();
+    Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+                     DAG.getConstant(LoSize-1, TLI.getPointerTy()));
+  } else {
+    // For example, extension of an i48 to an i64.  The operand type necessarily
+    // promotes to the result type, so will end up being expanded too.
+    assert(getTypeAction(Op.getValueType()) ==
+           TargetLowering::TypePromoteInteger &&
+           "Only know how to promote this result!");
+    SDValue Res = GetPromotedInteger(Op);
+    assert(Res.getValueType() == N->getValueType(0) &&
+           "Operand over promoted?");
+    // Split the promoted operand.  This will simplify when it is expanded.
+    SplitInteger(Res, Lo, Hi);
+    unsigned ExcessBits =
+      Op.getValueType().getSizeInBits() - NVT.getSizeInBits();
+    Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+                     DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+                                                        ExcessBits)));
+  }
+}
+
+void DAGTypeLegalizer::
+ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+
+  if (EVT.bitsLE(Lo.getValueType())) {
+    // sext_inreg the low part if needed.
+    Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Lo.getValueType(), Lo,
+                     N->getOperand(1));
+
+    // The high part gets the sign extension from the lo-part.  This handles
+    // things like sextinreg V:i64 from i8.
+    Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo,
+                     DAG.getConstant(Hi.getValueType().getSizeInBits()-1,
+                                     TLI.getPointerTy()));
+  } else {
+    // For example, extension of an i48 to an i64.  Leave the low part alone,
+    // sext_inreg the high part.
+    unsigned ExcessBits =
+      EVT.getSizeInBits() - Lo.getValueType().getSizeInBits();
+    Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+                     DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+                                                        ExcessBits)));
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::SREM_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::SREM_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::SREM_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::SREM_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+  Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0));
+  Hi = DAG.getNode(ISD::SRL, dl,
+                   N->getOperand(0).getValueType(), N->getOperand(0),
+                   DAG.getConstant(NVT.getSizeInBits(), TLI.getPointerTy()));
+  Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+
+  // Expand the result by simply replacing it with the equivalent
+  // non-overflow-checking operation.
+  SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ?
+                            ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+                            LHS, RHS);
+  SplitInteger(Sum, Lo, Hi);
+
+  // Calculate the overflow: addition overflows iff a + b < a, and subtraction
+  // overflows iff a - b > a.
+  SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS,
+                             N->getOpcode () == ISD::UADDO ?
+                             ISD::SETULT : ISD::SETUGT);
+
+  // Use the calculated overflow everywhere.
+  ReplaceValueWith(SDValue(N, 1), Ofl);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
+                                          SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // A divide for UMULO should be faster than a function call.
+  if (N->getOpcode() == ISD::UMULO) {
+    SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
+
+    SDValue MUL = DAG.getNode(ISD::MUL, dl, LHS.getValueType(), LHS, RHS);
+    SplitInteger(MUL, Lo, Hi);
+
+    // A divide for UMULO will be faster than a function call. Select to
+    // make sure we aren't using 0.
+    SDValue isZero = DAG.getSetCC(dl, TLI.getSetCCResultType(VT),
+                                  RHS, DAG.getConstant(0, VT), ISD::SETEQ);
+    SDValue NotZero = DAG.getNode(ISD::SELECT, dl, VT, isZero,
+                                  DAG.getConstant(1, VT), RHS);
+    SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero);
+    SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS,
+                                    ISD::SETNE);
+    Overflow = DAG.getNode(ISD::SELECT, dl, N->getValueType(1), isZero,
+                           DAG.getConstant(0, N->getValueType(1)),
+                           Overflow);
+    ReplaceValueWith(SDValue(N, 1), Overflow);
+    return;
+  }
+
+  Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
+  EVT PtrVT = TLI.getPointerTy();
+  Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
+  
+  // Replace this with a libcall that will check overflow.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i32)
+    LC = RTLIB::MULO_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::MULO_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::MULO_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XMULO!");
+
+  SDValue Temp = DAG.CreateStackTemporary(PtrVT);
+  // Temporary for the overflow value, default it to zero.
+  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl,
+                               DAG.getConstant(0, PtrVT), Temp,
+                               MachinePointerInfo(), false, false, 0);
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = N->getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = N->getOperand(i);
+    Entry.Ty = ArgTy;
+    Entry.isSExt = true;
+    Entry.isZExt = false;
+    Args.push_back(Entry);
+  }
+
+  // Also pass the address of the overflow check.
+  Entry.Node = Temp;
+  Entry.Ty = PtrTy->getPointerTo();
+  Entry.isSExt = true;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT);
+  TargetLowering::
+  CallLoweringInfo CLI(Chain, RetTy, true, false, false, false,
+                       0, TLI.getLibcallCallingConv(LC),
+                       /*isTailCall=*/false,
+                       /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                       Func, Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+  SplitInteger(CallInfo.first, Lo, Hi);
+  SDValue Temp2 = DAG.getLoad(PtrVT, dl, CallInfo.second, Temp,
+                              MachinePointerInfo(), false, false, false, 0);
+  SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2,
+                             DAG.getConstant(0, PtrVT),
+                             ISD::SETNE);
+  // Use the overflow from the libcall everywhere.
+  ReplaceValueWith(SDValue(N, 1), Ofl);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::UDIV_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::UDIV_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::UDIV_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::UDIV_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i16)
+    LC = RTLIB::UREM_I16;
+  else if (VT == MVT::i32)
+    LC = RTLIB::UREM_I32;
+  else if (VT == MVT::i64)
+    LC = RTLIB::UREM_I64;
+  else if (VT == MVT::i128)
+    LC = RTLIB::UREM_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!");
+
+  SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
+                                                SDValue &Lo, SDValue &Hi) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op = N->getOperand(0);
+  if (Op.getValueType().bitsLE(NVT)) {
+    // The low part is zero extension of the input (degenerates to a copy).
+    Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0));
+    Hi = DAG.getConstant(0, NVT);   // The high part is just a zero.
+  } else {
+    // For example, extension of an i48 to an i64.  The operand type necessarily
+    // promotes to the result type, so will end up being expanded too.
+    assert(getTypeAction(Op.getValueType()) ==
+           TargetLowering::TypePromoteInteger &&
+           "Only know how to promote this result!");
+    SDValue Res = GetPromotedInteger(Op);
+    assert(Res.getValueType() == N->getValueType(0) &&
+           "Operand over promoted?");
+    // Split the promoted operand.  This will simplify when it is expanded.
+    SplitInteger(Res, Lo, Hi);
+    unsigned ExcessBits =
+      Op.getValueType().getSizeInBits() - NVT.getSizeInBits();
+    Hi = DAG.getZeroExtendInReg(Hi, dl,
+                                EVT::getIntegerVT(*DAG.getContext(),
+                                                  ExcessBits));
+  }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N,
+                                                SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = cast<AtomicSDNode>(N)->getMemoryVT();
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
+                               N->getOperand(0),
+                               N->getOperand(1), Zero, Zero,
+                               cast<AtomicSDNode>(N)->getMemOperand(),
+                               cast<AtomicSDNode>(N)->getOrdering(),
+                               cast<AtomicSDNode>(N)->getSynchScope());
+  ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
+  ReplaceValueWith(SDValue(N, 1), Swap.getValue(1));
+}
+
+//===----------------------------------------------------------------------===//
+//  Integer Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need expansion.  At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Expand integer operand: "; N->dump(&DAG); dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+    return false;
+
+  switch (N->getOpcode()) {
+  default:
+  #ifndef NDEBUG
+    dbgs() << "ExpandIntegerOperand Op #" << OpNo << ": ";
+    N->dump(&DAG); dbgs() << "\n";
+  #endif
+    llvm_unreachable("Do not know how to expand this operator's operand!");
+
+  case ISD::BITCAST:           Res = ExpandOp_BITCAST(N); break;
+  case ISD::BR_CC:             Res = ExpandIntOp_BR_CC(N); break;
+  case ISD::BUILD_VECTOR:      Res = ExpandOp_BUILD_VECTOR(N); break;
+  case ISD::EXTRACT_ELEMENT:   Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+  case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break;
+  case ISD::SCALAR_TO_VECTOR:  Res = ExpandOp_SCALAR_TO_VECTOR(N); break;
+  case ISD::SELECT_CC:         Res = ExpandIntOp_SELECT_CC(N); break;
+  case ISD::SETCC:             Res = ExpandIntOp_SETCC(N); break;
+  case ISD::SINT_TO_FP:        Res = ExpandIntOp_SINT_TO_FP(N); break;
+  case ISD::STORE:   Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break;
+  case ISD::TRUNCATE:          Res = ExpandIntOp_TRUNCATE(N); break;
+  case ISD::UINT_TO_FP:        Res = ExpandIntOp_UINT_TO_FP(N); break;
+
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+  case ISD::ROTR:              Res = ExpandIntOp_Shift(N); break;
+  case ISD::RETURNADDR:
+  case ISD::FRAMEADDR:         Res = ExpandIntOp_RETURNADDR(N); break;
+
+  case ISD::ATOMIC_STORE:      Res = ExpandIntOp_ATOMIC_STORE(N); break;
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+/// IntegerExpandSetCCOperands - Expand the operands of a comparison.  This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
+                                                  SDValue &NewRHS,
+                                                  ISD::CondCode &CCCode,
+                                                  DebugLoc dl) {
+  SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+  GetExpandedInteger(NewLHS, LHSLo, LHSHi);
+  GetExpandedInteger(NewRHS, RHSLo, RHSHi);
+
+  if (CCCode == ISD::SETEQ || CCCode == ISD::SETNE) {
+    if (RHSLo == RHSHi) {
+      if (ConstantSDNode *RHSCST = dyn_cast<ConstantSDNode>(RHSLo)) {
+        if (RHSCST->isAllOnesValue()) {
+          // Equality comparison to -1.
+          NewLHS = DAG.getNode(ISD::AND, dl,
+                               LHSLo.getValueType(), LHSLo, LHSHi);
+          NewRHS = RHSLo;
+          return;
+        }
+      }
+    }
+
+    NewLHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSLo, RHSLo);
+    NewRHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSHi, RHSHi);
+    NewLHS = DAG.getNode(ISD::OR, dl, NewLHS.getValueType(), NewLHS, NewRHS);
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    return;
+  }
+
+  // If this is a comparison of the sign bit, just look at the top part.
+  // X > -1,  x < 0
+  if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(NewRHS))
+    if ((CCCode == ISD::SETLT && CST->isNullValue()) ||     // X < 0
+        (CCCode == ISD::SETGT && CST->isAllOnesValue())) {  // X > -1
+      NewLHS = LHSHi;
+      NewRHS = RHSHi;
+      return;
+    }
+
+  // FIXME: This generated code sucks.
+  ISD::CondCode LowCC;
+  switch (CCCode) {
+  default: llvm_unreachable("Unknown integer setcc!");
+  case ISD::SETLT:
+  case ISD::SETULT: LowCC = ISD::SETULT; break;
+  case ISD::SETGT:
+  case ISD::SETUGT: LowCC = ISD::SETUGT; break;
+  case ISD::SETLE:
+  case ISD::SETULE: LowCC = ISD::SETULE; break;
+  case ISD::SETGE:
+  case ISD::SETUGE: LowCC = ISD::SETUGE; break;
+  }
+
+  // Tmp1 = lo(op1) < lo(op2)   // Always unsigned comparison
+  // Tmp2 = hi(op1) < hi(op2)   // Signedness depends on operands
+  // dest = hi(op1) == hi(op2) ? Tmp1 : Tmp2;
+
+  // NOTE: on targets without efficient SELECT of bools, we can always use
+  // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3)
+  TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, AfterLegalizeTypes, true, NULL);
+  SDValue Tmp1, Tmp2;
+  Tmp1 = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSLo.getValueType()),
+                           LHSLo, RHSLo, LowCC, false, DagCombineInfo, dl);
+  if (!Tmp1.getNode())
+    Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSLo.getValueType()),
+                        LHSLo, RHSLo, LowCC);
+  Tmp2 = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSHi.getValueType()),
+                           LHSHi, RHSHi, CCCode, false, DagCombineInfo, dl);
+  if (!Tmp2.getNode())
+    Tmp2 = DAG.getNode(ISD::SETCC, dl,
+                       TLI.getSetCCResultType(LHSHi.getValueType()),
+                       LHSHi, RHSHi, DAG.getCondCode(CCCode));
+
+  ConstantSDNode *Tmp1C = dyn_cast<ConstantSDNode>(Tmp1.getNode());
+  ConstantSDNode *Tmp2C = dyn_cast<ConstantSDNode>(Tmp2.getNode());
+  if ((Tmp1C && Tmp1C->isNullValue()) ||
+      (Tmp2C && Tmp2C->isNullValue() &&
+       (CCCode == ISD::SETLE || CCCode == ISD::SETGE ||
+        CCCode == ISD::SETUGE || CCCode == ISD::SETULE)) ||
+      (Tmp2C && Tmp2C->getAPIntValue() == 1 &&
+       (CCCode == ISD::SETLT || CCCode == ISD::SETGT ||
+        CCCode == ISD::SETUGT || CCCode == ISD::SETULT))) {
+    // low part is known false, returns high part.
+    // For LE / GE, if high part is known false, ignore the low part.
+    // For LT / GT, if high part is known true, ignore the low part.
+    NewLHS = Tmp2;
+    NewRHS = SDValue();
+    return;
+  }
+
+  NewLHS = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSHi.getValueType()),
+                             LHSHi, RHSHi, ISD::SETEQ, false,
+                             DagCombineInfo, dl);
+  if (!NewLHS.getNode())
+    NewLHS = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+                          LHSHi, RHSHi, ISD::SETEQ);
+  NewLHS = DAG.getNode(ISD::SELECT, dl, Tmp1.getValueType(),
+                       NewLHS, Tmp1, Tmp2);
+  NewRHS = SDValue();
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If ExpandSetCCOperands returned a scalar, we need to compare the result
+  // against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+                                DAG.getCondCode(CCCode), NewLHS, NewRHS,
+                                N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If ExpandSetCCOperands returned a scalar, we need to compare the result
+  // against zero to select between true and false values.
+  if (NewRHS.getNode() == 0) {
+    NewRHS = DAG.getConstant(0, NewLHS.getValueType());
+    CCCode = ISD::SETNE;
+  }
+
+  // Update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                N->getOperand(2), N->getOperand(3),
+                                DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) {
+  SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+
+  // If ExpandSetCCOperands returned a scalar, use it.
+  if (NewRHS.getNode() == 0) {
+    assert(NewLHS.getValueType() == N->getValueType(0) &&
+           "Unexpected setcc expansion!");
+    return NewLHS;
+  }
+
+  // Otherwise, update N to have the operands specified.
+  return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+                                DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) {
+  // The value being shifted is legal, but the shift amount is too big.
+  // It follows that either the result of the shift is undefined, or the
+  // upper half of the shift amount is zero.  Just use the lower half.
+  SDValue Lo, Hi;
+  GetExpandedInteger(N->getOperand(1), Lo, Hi);
+  return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Lo), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) {
+  // The argument of RETURNADDR / FRAMEADDR builtin is 32 bit contant.  This
+  // surely makes pretty nice problems on 8/16 bit targets. Just truncate this
+  // constant to valid type.
+  SDValue Lo, Hi;
+  GetExpandedInteger(N->getOperand(0), Lo, Hi);
+  return SDValue(DAG.UpdateNodeOperands(N, Lo), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
+  SDValue Op = N->getOperand(0);
+  EVT DstVT = N->getValueType(0);
+  RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+         "Don't know how to expand this SINT_TO_FP!");
+  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, N->getDebugLoc());
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
+  if (ISD::isNormalStore(N))
+    return ExpandOp_NormalStore(N, OpNo);
+
+  assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+  assert(OpNo == 1 && "Can only expand the stored value so far");
+
+  EVT VT = N->getOperand(1).getValueType();
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  SDValue Ch  = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  unsigned Alignment = N->getAlignment();
+  bool isVolatile = N->isVolatile();
+  bool isNonTemporal = N->isNonTemporal();
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Lo, Hi;
+
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+  if (N->getMemoryVT().bitsLE(NVT)) {
+    GetExpandedInteger(N->getValue(), Lo, Hi);
+    return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
+                             N->getMemoryVT(), isVolatile, isNonTemporal,
+                             Alignment);
+  }
+
+  if (TLI.isLittleEndian()) {
+    // Little-endian - low bits are at low addresses.
+    GetExpandedInteger(N->getValue(), Lo, Hi);
+
+    Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
+                      isVolatile, isNonTemporal, Alignment);
+
+    unsigned ExcessBits =
+      N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+    EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
+
+    // Increment the pointer to the other half.
+    unsigned IncrementSize = NVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getIntPtrConstant(IncrementSize));
+    Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr,
+                           N->getPointerInfo().getWithOffset(IncrementSize),
+                           NEVT, isVolatile, isNonTemporal,
+                           MinAlign(Alignment, IncrementSize));
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+  }
+
+  // Big-endian - high bits are at low addresses.  Favor aligned stores at
+  // the cost of some bit-fiddling.
+  GetExpandedInteger(N->getValue(), Lo, Hi);
+
+  EVT ExtVT = N->getMemoryVT();
+  unsigned EBytes = ExtVT.getStoreSize();
+  unsigned IncrementSize = NVT.getSizeInBits()/8;
+  unsigned ExcessBits = (EBytes - IncrementSize)*8;
+  EVT HiVT = EVT::getIntegerVT(*DAG.getContext(),
+                               ExtVT.getSizeInBits() - ExcessBits);
+
+  if (ExcessBits < NVT.getSizeInBits()) {
+    // Transfer high bits from the top of Lo to the bottom of Hi.
+    Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi,
+                     DAG.getConstant(NVT.getSizeInBits() - ExcessBits,
+                                     TLI.getPointerTy()));
+    Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
+                     DAG.getNode(ISD::SRL, dl, NVT, Lo,
+                                 DAG.getConstant(ExcessBits,
+                                                 TLI.getPointerTy())));
+  }
+
+  // Store both the high bits and maybe some of the low bits.
+  Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(),
+                         HiVT, isVolatile, isNonTemporal, Alignment);
+
+  // Increment the pointer to the other half.
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getIntPtrConstant(IncrementSize));
+  // Store the lowest ExcessBits bits in the second half.
+  Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr,
+                         N->getPointerInfo().getWithOffset(IncrementSize),
+                         EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
+                         isVolatile, isNonTemporal,
+                         MinAlign(Alignment, IncrementSize));
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) {
+  SDValue InL, InH;
+  GetExpandedInteger(N->getOperand(0), InL, InH);
+  // Just truncate the low part of the source.
+  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), N->getValueType(0), InL);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
+  SDValue Op = N->getOperand(0);
+  EVT SrcVT = Op.getValueType();
+  EVT DstVT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // The following optimization is valid only if every value in SrcVT (when
+  // treated as signed) is representable in DstVT.  Check that the mantissa
+  // size of DstVT is >= than the number of bits in SrcVT -1.
+  const fltSemantics &sem = DAG.EVTToAPFloatSemantics(DstVT);
+  if (APFloat::semanticsPrecision(sem) >= SrcVT.getSizeInBits()-1 &&
+      TLI.getOperationAction(ISD::SINT_TO_FP, SrcVT) == TargetLowering::Custom){
+    // Do a signed conversion then adjust the result.
+    SDValue SignedConv = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Op);
+    SignedConv = TLI.LowerOperation(SignedConv, DAG);
+
+    // The result of the signed conversion needs adjusting if the 'sign bit' of
+    // the incoming integer was set.  To handle this, we dynamically test to see
+    // if it is set, and, if so, add a fudge factor.
+
+    const uint64_t F32TwoE32  = 0x4F800000ULL;
+    const uint64_t F32TwoE64  = 0x5F800000ULL;
+    const uint64_t F32TwoE128 = 0x7F800000ULL;
+
+    APInt FF(32, 0);
+    if (SrcVT == MVT::i32)
+      FF = APInt(32, F32TwoE32);
+    else if (SrcVT == MVT::i64)
+      FF = APInt(32, F32TwoE64);
+    else if (SrcVT == MVT::i128)
+      FF = APInt(32, F32TwoE128);
+    else
+      llvm_unreachable("Unsupported UINT_TO_FP!");
+
+    // Check whether the sign bit is set.
+    SDValue Lo, Hi;
+    GetExpandedInteger(Op, Lo, Hi);
+    SDValue SignSet = DAG.getSetCC(dl,
+                                   TLI.getSetCCResultType(Hi.getValueType()),
+                                   Hi, DAG.getConstant(0, Hi.getValueType()),
+                                   ISD::SETLT);
+
+    // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+    SDValue FudgePtr = DAG.getConstantPool(
+                               ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+                                           TLI.getPointerTy());
+
+    // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+    SDValue Zero = DAG.getIntPtrConstant(0);
+    SDValue Four = DAG.getIntPtrConstant(4);
+    if (TLI.isBigEndian()) std::swap(Zero, Four);
+    SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+                                 Zero, Four);
+    unsigned Alignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
+    FudgePtr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), FudgePtr, Offset);
+    Alignment = std::min(Alignment, 4u);
+
+    // Load the value out, extending it from f32 to the destination float type.
+    // FIXME: Avoid the extend by constructing the right constant pool?
+    SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(),
+                                   FudgePtr,
+                                   MachinePointerInfo::getConstantPool(),
+                                   MVT::f32,
+                                   false, false, Alignment);
+    return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge);
+  }
+
+  // Otherwise, use a libcall.
+  RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
+  assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+         "Don't know how to expand this UINT_TO_FP!");
+  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+                               cast<AtomicSDNode>(N)->getMemoryVT(),
+                               N->getOperand(0),
+                               N->getOperand(1), N->getOperand(2),
+                               cast<AtomicSDNode>(N)->getMemOperand(),
+                               cast<AtomicSDNode>(N)->getOrdering(),
+                               cast<AtomicSDNode>(N)->getSynchScope());
+  return Swap.getValue(1);
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
+  SDValue InOp0 = N->getOperand(0);
+  EVT InVT = InOp0.getValueType();
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+  unsigned OutNumElems = OutVT.getVectorNumElements();
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  DebugLoc dl = N->getDebugLoc();
+  SDValue BaseIdx = N->getOperand(1);
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.reserve(OutNumElems);
+  for (unsigned i = 0; i != OutNumElems; ++i) {
+
+    // Extract the element from the original vector.
+    SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(),
+      BaseIdx, DAG.getIntPtrConstant(i));
+    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+      InVT.getVectorElementType(), N->getOperand(0), Index);
+
+    SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, Ext);
+    // Insert the converted element to the new vector.
+    Ops.push_back(Op);
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size());
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
+  ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  SmallVector<int, 8> NewMask;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    NewMask.push_back(SV->getMaskElt(i));
+  }
+
+  SDValue V0 = GetPromotedInteger(N->getOperand(0));
+  SDValue V1 = GetPromotedInteger(N->getOperand(1));
+  EVT OutVT = V0.getValueType();
+
+  return DAG.getVectorShuffle(OutVT, dl, V0, V1, &NewMask[0]);
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+  unsigned NumElems = N->getNumOperands();
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  DebugLoc dl = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.reserve(NumElems);
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i));
+    Ops.push_back(Op);
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) {
+
+  DebugLoc dl = N->getDebugLoc();
+
+  assert(!N->getOperand(0).getValueType().isVector() &&
+         "Input must be a scalar");
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(0));
+
+  return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+  EVT InElemTy = OutVT.getVectorElementType();
+  EVT OutElemTy = NOutVT.getVectorElementType();
+
+  unsigned NumElem = N->getOperand(0).getValueType().getVectorNumElements();
+  unsigned NumOutElem = NOutVT.getVectorNumElements();
+  unsigned NumOperands = N->getNumOperands();
+  assert(NumElem * NumOperands == NumOutElem &&
+         "Unexpected number of elements");
+
+  // Take the elements from the first vector.
+  SmallVector<SDValue, 8> Ops(NumOutElem);
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    SDValue Op = N->getOperand(i);
+    for (unsigned j = 0; j < NumElem; ++j) {
+      SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                InElemTy, Op, DAG.getIntPtrConstant(j));
+      Ops[i * NumElem + j] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext);
+    }
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+  EVT NOutVTElem = NOutVT.getVectorElementType();
+
+  DebugLoc dl = N->getDebugLoc();
+  SDValue V0 = GetPromotedInteger(N->getOperand(0));
+
+  SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl,
+    NOutVTElem, N->getOperand(1));
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NOutVT,
+    V0, ConvElem, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue V0 = GetPromotedInteger(N->getOperand(0));
+  SDValue V1 = N->getOperand(1);
+  SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+    V0->getValueType(0).getScalarType(), V0, V1);
+
+  // EXTRACT_VECTOR_ELT can return types which are wider than the incoming
+  // element types. If this is the case then we need to expand the outgoing
+  // value and not truncate it.
+  return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  unsigned NumElems = N->getNumOperands();
+
+  EVT RetSclrTy = N->getValueType(0).getVectorElementType();
+
+  SmallVector<SDValue, 8> NewOps;
+  NewOps.reserve(NumElems);
+
+  // For each incoming vector
+  for (unsigned VecIdx = 0; VecIdx != NumElems; ++VecIdx) {
+    SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx));
+    EVT SclrTy = Incoming->getValueType(0).getVectorElementType();
+    unsigned NumElem = Incoming->getValueType(0).getVectorNumElements();
+
+    for (unsigned i=0; i<NumElem; ++i) {
+      // Extract element from incoming vector
+      SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy,
+      Incoming, DAG.getIntPtrConstant(i));
+      SDValue Tr = DAG.getNode(ISD::TRUNCATE, dl, RetSclrTy, Ex);
+      NewOps.push_back(Tr);
+    }
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl,  N->getValueType(0),
+    &NewOps[0], NewOps.size());
+  }
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
new file mode 100644
index 000000000000..b6436bf42741
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -0,0 +1,1145 @@
+//===-- LegalizeTypes.cpp - Common code for DAG type legalizer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeTypes method.  It transforms
+// an arbitrary well-formed SelectionDAG to only consist of legal types.  This
+// is common code shared among the LegalizeTypes*.cpp files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableExpensiveChecks("enable-legalize-types-checking", cl::Hidden);
+
+/// PerformExpensiveChecks - Do extensive, expensive, sanity checking.
+void DAGTypeLegalizer::PerformExpensiveChecks() {
+  // If a node is not processed, then none of its values should be mapped by any
+  // of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+
+  // If a node is processed, then each value with an illegal type must be mapped
+  // by exactly one of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+  // Values with a legal type may be mapped by ReplacedValues, but not by any of
+  // the other maps.
+
+  // Note that these invariants may not hold momentarily when processing a node:
+  // the node being processed may be put in a map before being marked Processed.
+
+  // Note that it is possible to have nodes marked NewNode in the DAG.  This can
+  // occur in two ways.  Firstly, a node may be created during legalization but
+  // never passed to the legalization core.  This is usually due to the implicit
+  // folding that occurs when using the DAG.getNode operators.  Secondly, a new
+  // node may be passed to the legalization core, but when analyzed may morph
+  // into a different node, leaving the original node as a NewNode in the DAG.
+  // A node may morph if one of its operands changes during analysis.  Whether
+  // it actually morphs or not depends on whether, after updating its operands,
+  // it is equivalent to an existing node: if so, it morphs into that existing
+  // node (CSE).  An operand can change during analysis if the operand is a new
+  // node that morphs, or it is a processed value that was mapped to some other
+  // value (as recorded in ReplacedValues) in which case the operand is turned
+  // into that other value.  If a node morphs then the node it morphed into will
+  // be used instead of it for legalization, however the original node continues
+  // to live on in the DAG.
+  // The conclusion is that though there may be nodes marked NewNode in the DAG,
+  // all uses of such nodes are also marked NewNode: the result is a fungus of
+  // NewNodes growing on top of the useful nodes, and perhaps using them, but
+  // not used by them.
+
+  // If a value is mapped by ReplacedValues, then it must have no uses, except
+  // by nodes marked NewNode (see above).
+
+  // The final node obtained by mapping by ReplacedValues is not marked NewNode.
+  // Note that ReplacedValues should be applied iteratively.
+
+  // Note that the ReplacedValues map may also map deleted nodes (by iterating
+  // over the DAG we never dereference deleted nodes).  This means that it may
+  // also map nodes marked NewNode if the deallocated memory was reallocated as
+  // another node, and that new node was not seen by the LegalizeTypes machinery
+  // (for example because it was created but not used).  In general, we cannot
+  // distinguish between new nodes and deleted nodes.
+  SmallVector<SDNode*, 16> NewNodes;
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ++I) {
+    // Remember nodes marked NewNode - they are subject to extra checking below.
+    if (I->getNodeId() == NewNode)
+      NewNodes.push_back(I);
+
+    for (unsigned i = 0, e = I->getNumValues(); i != e; ++i) {
+      SDValue Res(I, i);
+      bool Failed = false;
+
+      unsigned Mapped = 0;
+      if (ReplacedValues.find(Res) != ReplacedValues.end()) {
+        Mapped |= 1;
+        // Check that remapped values are only used by nodes marked NewNode.
+        for (SDNode::use_iterator UI = I->use_begin(), UE = I->use_end();
+             UI != UE; ++UI)
+          if (UI.getUse().getResNo() == i)
+            assert(UI->getNodeId() == NewNode &&
+                   "Remapped value has non-trivial use!");
+
+        // Check that the final result of applying ReplacedValues is not
+        // marked NewNode.
+        SDValue NewVal = ReplacedValues[Res];
+        DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(NewVal);
+        while (I != ReplacedValues.end()) {
+          NewVal = I->second;
+          I = ReplacedValues.find(NewVal);
+        }
+        assert(NewVal.getNode()->getNodeId() != NewNode &&
+               "ReplacedValues maps to a new node!");
+      }
+      if (PromotedIntegers.find(Res) != PromotedIntegers.end())
+        Mapped |= 2;
+      if (SoftenedFloats.find(Res) != SoftenedFloats.end())
+        Mapped |= 4;
+      if (ScalarizedVectors.find(Res) != ScalarizedVectors.end())
+        Mapped |= 8;
+      if (ExpandedIntegers.find(Res) != ExpandedIntegers.end())
+        Mapped |= 16;
+      if (ExpandedFloats.find(Res) != ExpandedFloats.end())
+        Mapped |= 32;
+      if (SplitVectors.find(Res) != SplitVectors.end())
+        Mapped |= 64;
+      if (WidenedVectors.find(Res) != WidenedVectors.end())
+        Mapped |= 128;
+
+      if (I->getNodeId() != Processed) {
+        // Since we allow ReplacedValues to map deleted nodes, it may map nodes
+        // marked NewNode too, since a deleted node may have been reallocated as
+        // another node that has not been seen by the LegalizeTypes machinery.
+        if ((I->getNodeId() == NewNode && Mapped > 1) ||
+            (I->getNodeId() != NewNode && Mapped != 0)) {
+          dbgs() << "Unprocessed value in a map!";
+          Failed = true;
+        }
+      } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(I)) {
+        if (Mapped > 1) {
+          dbgs() << "Value with legal type was transformed!";
+          Failed = true;
+        }
+      } else {
+        if (Mapped == 0) {
+          dbgs() << "Processed value not in any map!";
+          Failed = true;
+        } else if (Mapped & (Mapped - 1)) {
+          dbgs() << "Value in multiple maps!";
+          Failed = true;
+        }
+      }
+
+      if (Failed) {
+        if (Mapped & 1)
+          dbgs() << " ReplacedValues";
+        if (Mapped & 2)
+          dbgs() << " PromotedIntegers";
+        if (Mapped & 4)
+          dbgs() << " SoftenedFloats";
+        if (Mapped & 8)
+          dbgs() << " ScalarizedVectors";
+        if (Mapped & 16)
+          dbgs() << " ExpandedIntegers";
+        if (Mapped & 32)
+          dbgs() << " ExpandedFloats";
+        if (Mapped & 64)
+          dbgs() << " SplitVectors";
+        if (Mapped & 128)
+          dbgs() << " WidenedVectors";
+        dbgs() << "\n";
+        llvm_unreachable(0);
+      }
+    }
+  }
+
+  // Checked that NewNodes are only used by other NewNodes.
+  for (unsigned i = 0, e = NewNodes.size(); i != e; ++i) {
+    SDNode *N = NewNodes[i];
+    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+         UI != UE; ++UI)
+      assert(UI->getNodeId() == NewNode && "NewNode used by non-NewNode!");
+  }
+}
+
+/// run - This is the main entry point for the type legalizer.  This does a
+/// top-down traversal of the dag, legalizing types as it goes.  Returns "true"
+/// if it made any changes.
+bool DAGTypeLegalizer::run() {
+  bool Changed = false;
+
+  // Create a dummy node (which is not added to allnodes), that adds a reference
+  // to the root node, preventing it from being deleted, and tracking any
+  // changes of the root.
+  HandleSDNode Dummy(DAG.getRoot());
+  Dummy.setNodeId(Unanalyzed);
+
+  // The root of the dag may dangle to deleted nodes until the type legalizer is
+  // done.  Set it to null to avoid confusion.
+  DAG.setRoot(SDValue());
+
+  // Walk all nodes in the graph, assigning them a NodeId of 'ReadyToProcess'
+  // (and remembering them) if they are leaves and assigning 'Unanalyzed' if
+  // non-leaves.
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ++I) {
+    if (I->getNumOperands() == 0) {
+      I->setNodeId(ReadyToProcess);
+      Worklist.push_back(I);
+    } else {
+      I->setNodeId(Unanalyzed);
+    }
+  }
+
+  // Now that we have a set of nodes to process, handle them all.
+  while (!Worklist.empty()) {
+#ifndef XDEBUG
+    if (EnableExpensiveChecks)
+#endif
+      PerformExpensiveChecks();
+
+    SDNode *N = Worklist.back();
+    Worklist.pop_back();
+    assert(N->getNodeId() == ReadyToProcess &&
+           "Node should be ready if on worklist!");
+
+    if (IgnoreNodeResults(N))
+      goto ScanOperands;
+
+    // Scan the values produced by the node, checking to see if any result
+    // types are illegal.
+    for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) {
+      EVT ResultVT = N->getValueType(i);
+      switch (getTypeAction(ResultVT)) {
+      case TargetLowering::TypeLegal:
+        break;
+      // The following calls must take care of *all* of the node's results,
+      // not just the illegal result they were passed (this includes results
+      // with a legal type).  Results can be remapped using ReplaceValueWith,
+      // or their promoted/expanded/etc values registered in PromotedIntegers,
+      // ExpandedIntegers etc.
+      case TargetLowering::TypePromoteInteger:
+        PromoteIntegerResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case TargetLowering::TypeExpandInteger:
+        ExpandIntegerResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case TargetLowering::TypeSoftenFloat:
+        SoftenFloatResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case TargetLowering::TypeExpandFloat:
+        ExpandFloatResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case TargetLowering::TypeScalarizeVector:
+        ScalarizeVectorResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case TargetLowering::TypeSplitVector:
+        SplitVectorResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      case TargetLowering::TypeWidenVector:
+        WidenVectorResult(N, i);
+        Changed = true;
+        goto NodeDone;
+      }
+    }
+
+ScanOperands:
+    // Scan the operand list for the node, handling any nodes with operands that
+    // are illegal.
+    {
+    unsigned NumOperands = N->getNumOperands();
+    bool NeedsReanalyzing = false;
+    unsigned i;
+    for (i = 0; i != NumOperands; ++i) {
+      if (IgnoreNodeResults(N->getOperand(i).getNode()))
+        continue;
+
+      EVT OpVT = N->getOperand(i).getValueType();
+      switch (getTypeAction(OpVT)) {
+      case TargetLowering::TypeLegal:
+        continue;
+      // The following calls must either replace all of the node's results
+      // using ReplaceValueWith, and return "false"; or update the node's
+      // operands in place, and return "true".
+      case TargetLowering::TypePromoteInteger:
+        NeedsReanalyzing = PromoteIntegerOperand(N, i);
+        Changed = true;
+        break;
+      case TargetLowering::TypeExpandInteger:
+        NeedsReanalyzing = ExpandIntegerOperand(N, i);
+        Changed = true;
+        break;
+      case TargetLowering::TypeSoftenFloat:
+        NeedsReanalyzing = SoftenFloatOperand(N, i);
+        Changed = true;
+        break;
+      case TargetLowering::TypeExpandFloat:
+        NeedsReanalyzing = ExpandFloatOperand(N, i);
+        Changed = true;
+        break;
+      case TargetLowering::TypeScalarizeVector:
+        NeedsReanalyzing = ScalarizeVectorOperand(N, i);
+        Changed = true;
+        break;
+      case TargetLowering::TypeSplitVector:
+        NeedsReanalyzing = SplitVectorOperand(N, i);
+        Changed = true;
+        break;
+      case TargetLowering::TypeWidenVector:
+        NeedsReanalyzing = WidenVectorOperand(N, i);
+        Changed = true;
+        break;
+      }
+      break;
+    }
+
+    // The sub-method updated N in place.  Check to see if any operands are new,
+    // and if so, mark them.  If the node needs revisiting, don't add all users
+    // to the worklist etc.
+    if (NeedsReanalyzing) {
+      assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+      N->setNodeId(NewNode);
+      // Recompute the NodeId and correct processed operands, adding the node to
+      // the worklist if ready.
+      SDNode *M = AnalyzeNewNode(N);
+      if (M == N)
+        // The node didn't morph - nothing special to do, it will be revisited.
+        continue;
+
+      // The node morphed - this is equivalent to legalizing by replacing every
+      // value of N with the corresponding value of M.  So do that now.
+      assert(N->getNumValues() == M->getNumValues() &&
+             "Node morphing changed the number of results!");
+      for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+        // Replacing the value takes care of remapping the new value.
+        ReplaceValueWith(SDValue(N, i), SDValue(M, i));
+      assert(N->getNodeId() == NewNode && "Unexpected node state!");
+      // The node continues to live on as part of the NewNode fungus that
+      // grows on top of the useful nodes.  Nothing more needs to be done
+      // with it - move on to the next node.
+      continue;
+    }
+
+    if (i == NumOperands) {
+      DEBUG(dbgs() << "Legally typed node: "; N->dump(&DAG); dbgs() << "\n");
+    }
+    }
+NodeDone:
+
+    // If we reach here, the node was processed, potentially creating new nodes.
+    // Mark it as processed and add its users to the worklist as appropriate.
+    assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+    N->setNodeId(Processed);
+
+    for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
+         UI != E; ++UI) {
+      SDNode *User = *UI;
+      int NodeId = User->getNodeId();
+
+      // This node has two options: it can either be a new node or its Node ID
+      // may be a count of the number of operands it has that are not ready.
+      if (NodeId > 0) {
+        User->setNodeId(NodeId-1);
+
+        // If this was the last use it was waiting on, add it to the ready list.
+        if (NodeId-1 == ReadyToProcess)
+          Worklist.push_back(User);
+        continue;
+      }
+
+      // If this is an unreachable new node, then ignore it.  If it ever becomes
+      // reachable by being used by a newly created node then it will be handled
+      // by AnalyzeNewNode.
+      if (NodeId == NewNode)
+        continue;
+
+      // Otherwise, this node is new: this is the first operand of it that
+      // became ready.  Its new NodeId is the number of operands it has minus 1
+      // (as this node is now processed).
+      assert(NodeId == Unanalyzed && "Unknown node ID!");
+      User->setNodeId(User->getNumOperands() - 1);
+
+      // If the node only has a single operand, it is now ready.
+      if (User->getNumOperands() == 1)
+        Worklist.push_back(User);
+    }
+  }
+
+#ifndef XDEBUG
+  if (EnableExpensiveChecks)
+#endif
+    PerformExpensiveChecks();
+
+  // If the root changed (e.g. it was a dead load) update the root.
+  DAG.setRoot(Dummy.getValue());
+
+  // Remove dead nodes.  This is important to do for cleanliness but also before
+  // the checking loop below.  Implicit folding by the DAG.getNode operators and
+  // node morphing can cause unreachable nodes to be around with their flags set
+  // to new.
+  DAG.RemoveDeadNodes();
+
+  // In a debug build, scan all the nodes to make sure we found them all.  This
+  // ensures that there are no cycles and that everything got processed.
+#ifndef NDEBUG
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = DAG.allnodes_end(); I != E; ++I) {
+    bool Failed = false;
+
+    // Check that all result types are legal.
+    if (!IgnoreNodeResults(I))
+      for (unsigned i = 0, NumVals = I->getNumValues(); i < NumVals; ++i)
+        if (!isTypeLegal(I->getValueType(i))) {
+          dbgs() << "Result type " << i << " illegal!\n";
+          Failed = true;
+        }
+
+    // Check that all operand types are legal.
+    for (unsigned i = 0, NumOps = I->getNumOperands(); i < NumOps; ++i)
+      if (!IgnoreNodeResults(I->getOperand(i).getNode()) &&
+          !isTypeLegal(I->getOperand(i).getValueType())) {
+        dbgs() << "Operand type " << i << " illegal!\n";
+        Failed = true;
+      }
+
+    if (I->getNodeId() != Processed) {
+       if (I->getNodeId() == NewNode)
+         dbgs() << "New node not analyzed?\n";
+       else if (I->getNodeId() == Unanalyzed)
+         dbgs() << "Unanalyzed node not noticed?\n";
+       else if (I->getNodeId() > 0)
+         dbgs() << "Operand not processed?\n";
+       else if (I->getNodeId() == ReadyToProcess)
+         dbgs() << "Not added to worklist?\n";
+       Failed = true;
+    }
+
+    if (Failed) {
+      I->dump(&DAG); dbgs() << "\n";
+      llvm_unreachable(0);
+    }
+  }
+#endif
+
+  return Changed;
+}
+
+/// AnalyzeNewNode - The specified node is the root of a subtree of potentially
+/// new nodes.  Correct any processed operands (this may change the node) and
+/// calculate the NodeId.  If the node itself changes to a processed node, it
+/// is not remapped - the caller needs to take care of this.
+/// Returns the potentially changed node.
+SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
+  // If this was an existing node that is already done, we're done.
+  if (N->getNodeId() != NewNode && N->getNodeId() != Unanalyzed)
+    return N;
+
+  // Remove any stale map entries.
+  ExpungeNode(N);
+
+  // Okay, we know that this node is new.  Recursively walk all of its operands
+  // to see if they are new also.  The depth of this walk is bounded by the size
+  // of the new tree that was constructed (usually 2-3 nodes), so we don't worry
+  // about revisiting of nodes.
+  //
+  // As we walk the operands, keep track of the number of nodes that are
+  // processed.  If non-zero, this will become the new nodeid of this node.
+  // Operands may morph when they are analyzed.  If so, the node will be
+  // updated after all operands have been analyzed.  Since this is rare,
+  // the code tries to minimize overhead in the non-morphing case.
+
+  SmallVector<SDValue, 8> NewOps;
+  unsigned NumProcessed = 0;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDValue OrigOp = N->getOperand(i);
+    SDValue Op = OrigOp;
+
+    AnalyzeNewValue(Op); // Op may morph.
+
+    if (Op.getNode()->getNodeId() == Processed)
+      ++NumProcessed;
+
+    if (!NewOps.empty()) {
+      // Some previous operand changed.  Add this one to the list.
+      NewOps.push_back(Op);
+    } else if (Op != OrigOp) {
+      // This is the first operand to change - add all operands so far.
+      NewOps.append(N->op_begin(), N->op_begin() + i);
+      NewOps.push_back(Op);
+    }
+  }
+
+  // Some operands changed - update the node.
+  if (!NewOps.empty()) {
+    SDNode *M = DAG.UpdateNodeOperands(N, &NewOps[0], NewOps.size());
+    if (M != N) {
+      // The node morphed into a different node.  Normally for this to happen
+      // the original node would have to be marked NewNode.  However this can
+      // in theory momentarily not be the case while ReplaceValueWith is doing
+      // its stuff.  Mark the original node NewNode to help sanity checking.
+      N->setNodeId(NewNode);
+      if (M->getNodeId() != NewNode && M->getNodeId() != Unanalyzed)
+        // It morphed into a previously analyzed node - nothing more to do.
+        return M;
+
+      // It morphed into a different new node.  Do the equivalent of passing
+      // it to AnalyzeNewNode: expunge it and calculate the NodeId.  No need
+      // to remap the operands, since they are the same as the operands we
+      // remapped above.
+      N = M;
+      ExpungeNode(N);
+    }
+  }
+
+  // Calculate the NodeId.
+  N->setNodeId(N->getNumOperands() - NumProcessed);
+  if (N->getNodeId() == ReadyToProcess)
+    Worklist.push_back(N);
+
+  return N;
+}
+
+/// AnalyzeNewValue - Call AnalyzeNewNode, updating the node in Val if needed.
+/// If the node changes to a processed node, then remap it.
+void DAGTypeLegalizer::AnalyzeNewValue(SDValue &Val) {
+  Val.setNode(AnalyzeNewNode(Val.getNode()));
+  if (Val.getNode()->getNodeId() == Processed)
+    // We were passed a processed node, or it morphed into one - remap it.
+    RemapValue(Val);
+}
+
+/// ExpungeNode - If N has a bogus mapping in ReplacedValues, eliminate it.
+/// This can occur when a node is deleted then reallocated as a new node -
+/// the mapping in ReplacedValues applies to the deleted node, not the new
+/// one.
+/// The only map that can have a deleted node as a source is ReplacedValues.
+/// Other maps can have deleted nodes as targets, but since their looked-up
+/// values are always immediately remapped using RemapValue, resulting in a
+/// not-deleted node, this is harmless as long as ReplacedValues/RemapValue
+/// always performs correct mappings.  In order to keep the mapping correct,
+/// ExpungeNode should be called on any new nodes *before* adding them as
+/// either source or target to ReplacedValues (which typically means calling
+/// Expunge when a new node is first seen, since it may no longer be marked
+/// NewNode by the time it is added to ReplacedValues).
+void DAGTypeLegalizer::ExpungeNode(SDNode *N) {
+  if (N->getNodeId() != NewNode)
+    return;
+
+  // If N is not remapped by ReplacedValues then there is nothing to do.
+  unsigned i, e;
+  for (i = 0, e = N->getNumValues(); i != e; ++i)
+    if (ReplacedValues.find(SDValue(N, i)) != ReplacedValues.end())
+      break;
+
+  if (i == e)
+    return;
+
+  // Remove N from all maps - this is expensive but rare.
+
+  for (DenseMap<SDValue, SDValue>::iterator I = PromotedIntegers.begin(),
+       E = PromotedIntegers.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second);
+  }
+
+  for (DenseMap<SDValue, SDValue>::iterator I = SoftenedFloats.begin(),
+       E = SoftenedFloats.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second);
+  }
+
+  for (DenseMap<SDValue, SDValue>::iterator I = ScalarizedVectors.begin(),
+       E = ScalarizedVectors.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second);
+  }
+
+  for (DenseMap<SDValue, SDValue>::iterator I = WidenedVectors.begin(),
+       E = WidenedVectors.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second);
+  }
+
+  for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+       I = ExpandedIntegers.begin(), E = ExpandedIntegers.end(); I != E; ++I){
+    assert(I->first.getNode() != N);
+    RemapValue(I->second.first);
+    RemapValue(I->second.second);
+  }
+
+  for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+       I = ExpandedFloats.begin(), E = ExpandedFloats.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second.first);
+    RemapValue(I->second.second);
+  }
+
+  for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+       I = SplitVectors.begin(), E = SplitVectors.end(); I != E; ++I) {
+    assert(I->first.getNode() != N);
+    RemapValue(I->second.first);
+    RemapValue(I->second.second);
+  }
+
+  for (DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.begin(),
+       E = ReplacedValues.end(); I != E; ++I)
+    RemapValue(I->second);
+
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+    ReplacedValues.erase(SDValue(N, i));
+}
+
+/// RemapValue - If the specified value was already legalized to another value,
+/// replace it by that value.
+void DAGTypeLegalizer::RemapValue(SDValue &N) {
+  DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(N);
+  if (I != ReplacedValues.end()) {
+    // Use path compression to speed up future lookups if values get multiply
+    // replaced with other values.
+    RemapValue(I->second);
+    N = I->second;
+    assert(N.getNode()->getNodeId() != NewNode && "Mapped to new node!");
+  }
+}
+
+namespace {
+  /// NodeUpdateListener - This class is a DAGUpdateListener that listens for
+  /// updates to nodes and recomputes their ready state.
+  class NodeUpdateListener : public SelectionDAG::DAGUpdateListener {
+    DAGTypeLegalizer &DTL;
+    SmallSetVector<SDNode*, 16> &NodesToAnalyze;
+  public:
+    explicit NodeUpdateListener(DAGTypeLegalizer &dtl,
+                                SmallSetVector<SDNode*, 16> &nta)
+      : SelectionDAG::DAGUpdateListener(dtl.getDAG()),
+        DTL(dtl), NodesToAnalyze(nta) {}
+
+    virtual void NodeDeleted(SDNode *N, SDNode *E) {
+      assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+             N->getNodeId() != DAGTypeLegalizer::Processed &&
+             "Invalid node ID for RAUW deletion!");
+      // It is possible, though rare, for the deleted node N to occur as a
+      // target in a map, so note the replacement N -> E in ReplacedValues.
+      assert(E && "Node not replaced?");
+      DTL.NoteDeletion(N, E);
+
+      // In theory the deleted node could also have been scheduled for analysis.
+      // So remove it from the set of nodes which will be analyzed.
+      NodesToAnalyze.remove(N);
+
+      // In general nothing needs to be done for E, since it didn't change but
+      // only gained new uses.  However N -> E was just added to ReplacedValues,
+      // and the result of a ReplacedValues mapping is not allowed to be marked
+      // NewNode.  So if E is marked NewNode, then it needs to be analyzed.
+      if (E->getNodeId() == DAGTypeLegalizer::NewNode)
+        NodesToAnalyze.insert(E);
+    }
+
+    virtual void NodeUpdated(SDNode *N) {
+      // Node updates can mean pretty much anything.  It is possible that an
+      // operand was set to something already processed (f.e.) in which case
+      // this node could become ready.  Recompute its flags.
+      assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+             N->getNodeId() != DAGTypeLegalizer::Processed &&
+             "Invalid node ID for RAUW deletion!");
+      N->setNodeId(DAGTypeLegalizer::NewNode);
+      NodesToAnalyze.insert(N);
+    }
+  };
+}
+
+
+/// ReplaceValueWith - The specified value was legalized to the specified other
+/// value.  Update the DAG and NodeIds replacing any uses of From to use To
+/// instead.
+void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
+  assert(From.getNode() != To.getNode() && "Potential legalization loop!");
+
+  // If expansion produced new nodes, make sure they are properly marked.
+  ExpungeNode(From.getNode());
+  AnalyzeNewValue(To); // Expunges To.
+
+  // Anything that used the old node should now use the new one.  Note that this
+  // can potentially cause recursive merging.
+  SmallSetVector<SDNode*, 16> NodesToAnalyze;
+  NodeUpdateListener NUL(*this, NodesToAnalyze);
+  do {
+    DAG.ReplaceAllUsesOfValueWith(From, To);
+
+    // The old node may still be present in a map like ExpandedIntegers or
+    // PromotedIntegers.  Inform maps about the replacement.
+    ReplacedValues[From] = To;
+
+    // Process the list of nodes that need to be reanalyzed.
+    while (!NodesToAnalyze.empty()) {
+      SDNode *N = NodesToAnalyze.back();
+      NodesToAnalyze.pop_back();
+      if (N->getNodeId() != DAGTypeLegalizer::NewNode)
+        // The node was analyzed while reanalyzing an earlier node - it is safe
+        // to skip.  Note that this is not a morphing node - otherwise it would
+        // still be marked NewNode.
+        continue;
+
+      // Analyze the node's operands and recalculate the node ID.
+      SDNode *M = AnalyzeNewNode(N);
+      if (M != N) {
+        // The node morphed into a different node.  Make everyone use the new
+        // node instead.
+        assert(M->getNodeId() != NewNode && "Analysis resulted in NewNode!");
+        assert(N->getNumValues() == M->getNumValues() &&
+               "Node morphing changed the number of results!");
+        for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+          SDValue OldVal(N, i);
+          SDValue NewVal(M, i);
+          if (M->getNodeId() == Processed)
+            RemapValue(NewVal);
+          DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal);
+          // OldVal may be a target of the ReplacedValues map which was marked
+          // NewNode to force reanalysis because it was updated.  Ensure that
+          // anything that ReplacedValues mapped to OldVal will now be mapped
+          // all the way to NewVal.
+          ReplacedValues[OldVal] = NewVal;
+        }
+        // The original node continues to exist in the DAG, marked NewNode.
+      }
+    }
+    // When recursively update nodes with new nodes, it is possible to have
+    // new uses of From due to CSE. If this happens, replace the new uses of
+    // From with To.
+  } while (!From.use_empty());
+}
+
+void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
+  assert(Result.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         "Invalid type for promoted integer");
+  AnalyzeNewValue(Result);
+
+  SDValue &OpEntry = PromotedIntegers[Op];
+  assert(OpEntry.getNode() == 0 && "Node is already promoted!");
+  OpEntry = Result;
+
+  // Propagate node ordering
+  DAG.AssignOrdering(Result.getNode(), DAG.GetOrdering(Op.getNode()));
+}
+
+void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
+  assert(Result.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         "Invalid type for softened float");
+  AnalyzeNewValue(Result);
+
+  SDValue &OpEntry = SoftenedFloats[Op];
+  assert(OpEntry.getNode() == 0 && "Node is already converted to integer!");
+  OpEntry = Result;
+
+  // Propagate node ordering
+  DAG.AssignOrdering(Result.getNode(), DAG.GetOrdering(Op.getNode()));
+}
+
+void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
+  // Note that in some cases vector operation operands may be greater than
+  // the vector element type. For example BUILD_VECTOR of type <1 x i1> with
+  // a constant i8 operand.
+  assert(Result.getValueType().getSizeInBits() >=
+         Op.getValueType().getVectorElementType().getSizeInBits() &&
+         "Invalid type for scalarized vector");
+  AnalyzeNewValue(Result);
+
+  SDValue &OpEntry = ScalarizedVectors[Op];
+  assert(OpEntry.getNode() == 0 && "Node is already scalarized!");
+  OpEntry = Result;
+
+  // Propagate node ordering
+  DAG.AssignOrdering(Result.getNode(), DAG.GetOrdering(Op.getNode()));
+}
+
+void DAGTypeLegalizer::GetExpandedInteger(SDValue Op, SDValue &Lo,
+                                          SDValue &Hi) {
+  std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
+  RemapValue(Entry.first);
+  RemapValue(Entry.second);
+  assert(Entry.first.getNode() && "Operand isn't expanded");
+  Lo = Entry.first;
+  Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetExpandedInteger(SDValue Op, SDValue Lo,
+                                          SDValue Hi) {
+  assert(Lo.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         Hi.getValueType() == Lo.getValueType() &&
+         "Invalid type for expanded integer");
+  // Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
+  AnalyzeNewValue(Lo);
+  AnalyzeNewValue(Hi);
+
+  // Remember that this is the result of the node.
+  std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
+  assert(Entry.first.getNode() == 0 && "Node already expanded");
+  Entry.first = Lo;
+  Entry.second = Hi;
+
+  // Propagate ordering
+  DAG.AssignOrdering(Lo.getNode(), DAG.GetOrdering(Op.getNode()));
+  DAG.AssignOrdering(Hi.getNode(), DAG.GetOrdering(Op.getNode()));
+}
+
+void DAGTypeLegalizer::GetExpandedFloat(SDValue Op, SDValue &Lo,
+                                        SDValue &Hi) {
+  std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
+  RemapValue(Entry.first);
+  RemapValue(Entry.second);
+  assert(Entry.first.getNode() && "Operand isn't expanded");
+  Lo = Entry.first;
+  Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetExpandedFloat(SDValue Op, SDValue Lo,
+                                        SDValue Hi) {
+  assert(Lo.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         Hi.getValueType() == Lo.getValueType() &&
+         "Invalid type for expanded float");
+  // Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
+  AnalyzeNewValue(Lo);
+  AnalyzeNewValue(Hi);
+
+  // Remember that this is the result of the node.
+  std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
+  assert(Entry.first.getNode() == 0 && "Node already expanded");
+  Entry.first = Lo;
+  Entry.second = Hi;
+
+  // Propagate ordering
+  DAG.AssignOrdering(Lo.getNode(), DAG.GetOrdering(Op.getNode()));
+  DAG.AssignOrdering(Hi.getNode(), DAG.GetOrdering(Op.getNode()));
+}
+
+void DAGTypeLegalizer::GetSplitVector(SDValue Op, SDValue &Lo,
+                                      SDValue &Hi) {
+  std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
+  RemapValue(Entry.first);
+  RemapValue(Entry.second);
+  assert(Entry.first.getNode() && "Operand isn't split");
+  Lo = Entry.first;
+  Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo,
+                                      SDValue Hi) {
+  assert(Lo.getValueType().getVectorElementType() ==
+         Op.getValueType().getVectorElementType() &&
+         2*Lo.getValueType().getVectorNumElements() ==
+         Op.getValueType().getVectorNumElements() &&
+         Hi.getValueType() == Lo.getValueType() &&
+         "Invalid type for split vector");
+  // Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
+  AnalyzeNewValue(Lo);
+  AnalyzeNewValue(Hi);
+
+  // Remember that this is the result of the node.
+  std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
+  assert(Entry.first.getNode() == 0 && "Node already split");
+  Entry.first = Lo;
+  Entry.second = Hi;
+
+  // Propagate ordering
+  DAG.AssignOrdering(Lo.getNode(), DAG.GetOrdering(Op.getNode()));
+  DAG.AssignOrdering(Hi.getNode(), DAG.GetOrdering(Op.getNode()));
+}
+
+void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
+  assert(Result.getValueType() ==
+         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+         "Invalid type for widened vector");
+  AnalyzeNewValue(Result);
+
+  SDValue &OpEntry = WidenedVectors[Op];
+  assert(OpEntry.getNode() == 0 && "Node already widened!");
+  OpEntry = Result;
+
+  // Propagate node ordering
+  DAG.AssignOrdering(Result.getNode(), DAG.GetOrdering(Op.getNode()));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Utilities.
+//===----------------------------------------------------------------------===//
+
+/// BitConvertToInteger - Convert to an integer of the same size.
+SDValue DAGTypeLegalizer::BitConvertToInteger(SDValue Op) {
+  unsigned BitWidth = Op.getValueType().getSizeInBits();
+  return DAG.getNode(ISD::BITCAST, Op.getDebugLoc(),
+                     EVT::getIntegerVT(*DAG.getContext(), BitWidth), Op);
+}
+
+/// BitConvertVectorToIntegerVector - Convert to a vector of integers of the
+/// same size.
+SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) {
+  assert(Op.getValueType().isVector() && "Only applies to vectors!");
+  unsigned EltWidth = Op.getValueType().getVectorElementType().getSizeInBits();
+  EVT EltNVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
+  unsigned NumElts = Op.getValueType().getVectorNumElements();
+  return DAG.getNode(ISD::BITCAST, Op.getDebugLoc(),
+                     EVT::getVectorVT(*DAG.getContext(), EltNVT, NumElts), Op);
+}
+
+SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
+                                               EVT DestVT) {
+  DebugLoc dl = Op.getDebugLoc();
+  // Create the stack frame object.  Make sure it is aligned for both
+  // the source and destination types.
+  SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
+  // Emit a store to the stack slot.
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr,
+                               MachinePointerInfo(), false, false, 0);
+  // Result is a load from the stack slot.
+  return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+/// CustomLowerNode - Replace the node's results with custom code provided
+/// by the target and return "true", or do nothing and return "false".
+/// The last parameter is FALSE if we are dealing with a node with legal
+/// result types and illegal operand. The second parameter denotes the type of
+/// illegal OperandNo in that case.
+/// The last parameter being TRUE means we are dealing with a
+/// node with illegal result types. The second parameter denotes the type of
+/// illegal ResNo in that case.
+bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
+  // See if the target wants to custom lower this node.
+  if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
+    return false;
+
+  SmallVector<SDValue, 8> Results;
+  if (LegalizeResult)
+    TLI.ReplaceNodeResults(N, Results, DAG);
+  else
+    TLI.LowerOperationWrapper(N, Results, DAG);
+
+  if (Results.empty())
+    // The target didn't want to custom lower it after all.
+    return false;
+
+  // Make everything that once used N's values now use those in Results instead.
+  assert(Results.size() == N->getNumValues() &&
+         "Custom lowering returned the wrong number of results!");
+  for (unsigned i = 0, e = Results.size(); i != e; ++i) {
+    ReplaceValueWith(SDValue(N, i), Results[i]);
+    // Propagate node ordering
+    DAG.AssignOrdering(Results[i].getNode(), DAG.GetOrdering(N));
+  }
+  return true;
+}
+
+
+/// CustomWidenLowerNode - Widen the node's results with custom code provided
+/// by the target and return "true", or do nothing and return "false".
+bool DAGTypeLegalizer::CustomWidenLowerNode(SDNode *N, EVT VT) {
+  // See if the target wants to custom lower this node.
+  if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
+    return false;
+
+  SmallVector<SDValue, 8> Results;
+  TLI.ReplaceNodeResults(N, Results, DAG);
+
+  if (Results.empty())
+    // The target didn't want to custom widen lower its result  after all.
+    return false;
+
+  // Update the widening map.
+  assert(Results.size() == N->getNumValues() &&
+         "Custom lowering returned the wrong number of results!");
+  for (unsigned i = 0, e = Results.size(); i != e; ++i)
+    SetWidenedVector(SDValue(N, i), Results[i]);
+  return true;
+}
+
+SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) {
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+    if (i != ResNo)
+      ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
+  return SDValue(N->getOperand(ResNo));
+}
+
+/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+/// which is split into two not necessarily identical pieces.
+void DAGTypeLegalizer::GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT) {
+  // Currently all types are split in half.
+  if (!InVT.isVector()) {
+    LoVT = HiVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+  } else {
+    unsigned NumElements = InVT.getVectorNumElements();
+    assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+    LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
+                                   InVT.getVectorElementType(), NumElements/2);
+  }
+}
+
+/// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
+/// high parts of the given value.
+void DAGTypeLegalizer::GetPairElements(SDValue Pair,
+                                       SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = Pair.getDebugLoc();
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Pair.getValueType());
+  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair,
+                   DAG.getIntPtrConstant(0));
+  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair,
+                   DAG.getIntPtrConstant(1));
+}
+
+SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT,
+                                                  SDValue Index) {
+  DebugLoc dl = Index.getDebugLoc();
+  // Make sure the index type is big enough to compute in.
+  if (Index.getValueType().bitsGT(TLI.getPointerTy()))
+    Index = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Index);
+  else
+    Index = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Index);
+
+  // Calculate the element offset and add it to the pointer.
+  unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
+
+  Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
+                      DAG.getConstant(EltSize, Index.getValueType()));
+  return DAG.getNode(ISD::ADD, dl, Index.getValueType(), Index, VecPtr);
+}
+
+/// JoinIntegers - Build an integer with low bits Lo and high bits Hi.
+SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
+  // Arbitrarily use dlHi for result DebugLoc
+  DebugLoc dlHi = Hi.getDebugLoc();
+  DebugLoc dlLo = Lo.getDebugLoc();
+  EVT LVT = Lo.getValueType();
+  EVT HVT = Hi.getValueType();
+  EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
+                              LVT.getSizeInBits() + HVT.getSizeInBits());
+
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo);
+  Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi);
+  Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi,
+                   DAG.getConstant(LVT.getSizeInBits(), TLI.getPointerTy()));
+  return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi);
+}
+
+/// LibCallify - Convert the node into a libcall with the same prototype.
+SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
+                                     bool isSigned) {
+  unsigned NumOps = N->getNumOperands();
+  DebugLoc dl = N->getDebugLoc();
+  if (NumOps == 0) {
+    return TLI.makeLibCall(DAG, LC, N->getValueType(0), 0, 0, isSigned, dl);
+  } else if (NumOps == 1) {
+    SDValue Op = N->getOperand(0);
+    return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned, dl);
+  } else if (NumOps == 2) {
+    SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+    return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned, dl);
+  }
+  SmallVector<SDValue, 8> Ops(NumOps);
+  for (unsigned i = 0; i < NumOps; ++i)
+    Ops[i] = N->getOperand(i);
+
+  return TLI.makeLibCall(DAG, LC, N->getValueType(0),
+                         &Ops[0], NumOps, isSigned, dl);
+}
+
+// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to
+// ExpandLibCall except that the first operand is the in-chain.
+std::pair<SDValue, SDValue>
+DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
+                                         SDNode *Node,
+                                         bool isSigned) {
+  SDValue InChain = Node->getOperand(0);
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Node->getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Node->getOperand(i);
+    Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy());
+
+  Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+  TargetLowering::
+  CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
+                    0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, Node->getDebugLoc());
+  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+  return CallInfo;
+}
+
+/// PromoteTargetBoolean - Promote the given target boolean to a target boolean
+/// of the given type.  A target boolean is an integer value, not necessarily of
+/// type i1, the bits of which conform to getBooleanContents.
+SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT VT) {
+  DebugLoc dl = Bool.getDebugLoc();
+  ISD::NodeType ExtendCode =
+    TargetLowering::getExtendForContent(TLI.getBooleanContents(VT.isVector()));
+  return DAG.getNode(ExtendCode, dl, VT, Bool);
+}
+
+/// SplitInteger - Return the lower LoVT bits of Op in Lo and the upper HiVT
+/// bits in Hi.
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+                                    EVT LoVT, EVT HiVT,
+                                    SDValue &Lo, SDValue &Hi) {
+  DebugLoc dl = Op.getDebugLoc();
+  assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
+         Op.getValueType().getSizeInBits() && "Invalid integer splitting!");
+  Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Op);
+  Hi = DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op,
+                   DAG.getConstant(LoVT.getSizeInBits(), TLI.getPointerTy()));
+  Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
+}
+
+/// SplitInteger - Return the lower and upper halves of Op's bits in a value
+/// type half the size of Op's.
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+                                    SDValue &Lo, SDValue &Hi) {
+  EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(),
+                                 Op.getValueType().getSizeInBits()/2);
+  SplitInteger(Op, HalfVT, HalfVT, Lo, Hi);
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Entry Point
+//===----------------------------------------------------------------------===//
+
+/// LegalizeTypes - This transforms the SelectionDAG into a SelectionDAG that
+/// only uses types natively supported by the target.  Returns "true" if it made
+/// any changes.
+///
+/// Note that this is an involved process that may invalidate pointers into
+/// the graph.
+bool SelectionDAG::LegalizeTypes() {
+  return DAGTypeLegalizer(*this).run();
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
new file mode 100644
index 000000000000..54ea926241cf
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -0,0 +1,750 @@
+//===-- LegalizeTypes.h - Definition of the DAG Type Legalizer class ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DAGTypeLegalizer class.  This is a private interface
+// shared between the code that implements the SelectionDAG::LegalizeTypes
+// method.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SELECTIONDAG_LEGALIZETYPES_H
+#define SELECTIONDAG_LEGALIZETYPES_H
+
+#define DEBUG_TYPE "legalize-types"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+/// DAGTypeLegalizer - This takes an arbitrary SelectionDAG as input and hacks
+/// on it until only value types the target machine can handle are left.  This
+/// involves promoting small sizes to large sizes or splitting up large values
+/// into small values.
+///
+class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
+  const TargetLowering &TLI;
+  SelectionDAG &DAG;
+public:
+  // NodeIdFlags - This pass uses the NodeId on the SDNodes to hold information
+  // about the state of the node.  The enum has all the values.
+  enum NodeIdFlags {
+    /// ReadyToProcess - All operands have been processed, so this node is ready
+    /// to be handled.
+    ReadyToProcess = 0,
+
+    /// NewNode - This is a new node, not before seen, that was created in the
+    /// process of legalizing some other node.
+    NewNode = -1,
+
+    /// Unanalyzed - This node's ID needs to be set to the number of its
+    /// unprocessed operands.
+    Unanalyzed = -2,
+
+    /// Processed - This is a node that has already been processed.
+    Processed = -3
+
+    // 1+ - This is a node which has this many unprocessed operands.
+  };
+private:
+
+  /// ValueTypeActions - This is a bitvector that contains two bits for each
+  /// simple value type, where the two bits correspond to the LegalizeAction
+  /// enum from TargetLowering.  This can be queried with "getTypeAction(VT)".
+  TargetLowering::ValueTypeActionImpl ValueTypeActions;
+
+  /// getTypeAction - Return how we should legalize values of this type.
+  TargetLowering::LegalizeTypeAction getTypeAction(EVT VT) const {
+    return TLI.getTypeAction(*DAG.getContext(), VT);
+  }
+
+  /// isTypeLegal - Return true if this type is legal on this target.
+  bool isTypeLegal(EVT VT) const {
+    return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal;
+  }
+
+  /// IgnoreNodeResults - Pretend all of this node's results are legal.
+  bool IgnoreNodeResults(SDNode *N) const {
+    return N->getOpcode() == ISD::TargetConstant;
+  }
+
+  /// PromotedIntegers - For integer nodes that are below legal width, this map
+  /// indicates what promoted value to use.
+  SmallDenseMap<SDValue, SDValue, 8> PromotedIntegers;
+
+  /// ExpandedIntegers - For integer nodes that need to be expanded this map
+  /// indicates which operands are the expanded version of the input.
+  SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedIntegers;
+
+  /// SoftenedFloats - For floating point nodes converted to integers of
+  /// the same size, this map indicates the converted value to use.
+  SmallDenseMap<SDValue, SDValue, 8> SoftenedFloats;
+
+  /// ExpandedFloats - For float nodes that need to be expanded this map
+  /// indicates which operands are the expanded version of the input.
+  SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedFloats;
+
+  /// ScalarizedVectors - For nodes that are <1 x ty>, this map indicates the
+  /// scalar value of type 'ty' to use.
+  SmallDenseMap<SDValue, SDValue, 8> ScalarizedVectors;
+
+  /// SplitVectors - For nodes that need to be split this map indicates
+  /// which operands are the expanded version of the input.
+  SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> SplitVectors;
+
+  /// WidenedVectors - For vector nodes that need to be widened, indicates
+  /// the widened value to use.
+  SmallDenseMap<SDValue, SDValue, 8> WidenedVectors;
+
+  /// ReplacedValues - For values that have been replaced with another,
+  /// indicates the replacement value to use.
+  SmallDenseMap<SDValue, SDValue, 8> ReplacedValues;
+
+  /// Worklist - This defines a worklist of nodes to process.  In order to be
+  /// pushed onto this worklist, all operands of a node must have already been
+  /// processed.
+  SmallVector<SDNode*, 128> Worklist;
+
+public:
+  explicit DAGTypeLegalizer(SelectionDAG &dag)
+    : TLI(dag.getTargetLoweringInfo()), DAG(dag),
+    ValueTypeActions(TLI.getValueTypeActions()) {
+    assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE &&
+           "Too many value types for ValueTypeActions to hold!");
+  }
+
+  /// run - This is the main entry point for the type legalizer.  This does a
+  /// top-down traversal of the dag, legalizing types as it goes.  Returns
+  /// "true" if it made any changes.
+  bool run();
+
+  void NoteDeletion(SDNode *Old, SDNode *New) {
+    ExpungeNode(Old);
+    ExpungeNode(New);
+    for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i)
+      ReplacedValues[SDValue(Old, i)] = SDValue(New, i);
+  }
+
+  SelectionDAG &getDAG() const { return DAG; }
+
+private:
+  SDNode *AnalyzeNewNode(SDNode *N);
+  void AnalyzeNewValue(SDValue &Val);
+  void ExpungeNode(SDNode *N);
+  void PerformExpensiveChecks();
+  void RemapValue(SDValue &N);
+
+  // Common routines.
+  SDValue BitConvertToInteger(SDValue Op);
+  SDValue BitConvertVectorToIntegerVector(SDValue Op);
+  SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT);
+  bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult);
+  bool CustomWidenLowerNode(SDNode *N, EVT VT);
+
+  /// DisintegrateMERGE_VALUES - Replace each result of the given MERGE_VALUES
+  /// node with the corresponding input operand, except for the result 'ResNo',
+  /// for which the corresponding input operand is returned.
+  SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo);
+
+  SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index);
+  SDValue JoinIntegers(SDValue Lo, SDValue Hi);
+  SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned);
+  
+  std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
+                                                 SDNode *Node, bool isSigned);
+  std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
+
+  SDValue PromoteTargetBoolean(SDValue Bool, EVT VT);
+  void ReplaceValueWith(SDValue From, SDValue To);
+  void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+  void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
+                    SDValue &Lo, SDValue &Hi);
+
+  //===--------------------------------------------------------------------===//
+  // Integer Promotion Support: LegalizeIntegerTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetPromotedInteger - Given a processed operand Op which was promoted to a
+  /// larger integer type, this returns the promoted value.  The low bits of the
+  /// promoted value corresponding to the original type are exactly equal to Op.
+  /// The extra bits contain rubbish, so the promoted value may need to be zero-
+  /// or sign-extended from the original type before it is usable (the helpers
+  /// SExtPromotedInteger and ZExtPromotedInteger can do this for you).
+  /// For example, if Op is an i16 and was promoted to an i32, then this method
+  /// returns an i32, the lower 16 bits of which coincide with Op, and the upper
+  /// 16 bits of which contain rubbish.
+  SDValue GetPromotedInteger(SDValue Op) {
+    SDValue &PromotedOp = PromotedIntegers[Op];
+    RemapValue(PromotedOp);
+    assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+    return PromotedOp;
+  }
+  void SetPromotedInteger(SDValue Op, SDValue Result);
+
+  /// SExtPromotedInteger - Get a promoted operand and sign extend it to the
+  /// final size.
+  SDValue SExtPromotedInteger(SDValue Op) {
+    EVT OldVT = Op.getValueType();
+    DebugLoc dl = Op.getDebugLoc();
+    Op = GetPromotedInteger(Op);
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op,
+                       DAG.getValueType(OldVT));
+  }
+
+  /// ZExtPromotedInteger - Get a promoted operand and zero extend it to the
+  /// final size.
+  SDValue ZExtPromotedInteger(SDValue Op) {
+    EVT OldVT = Op.getValueType();
+    DebugLoc dl = Op.getDebugLoc();
+    Op = GetPromotedInteger(Op);
+    return DAG.getZeroExtendInReg(Op, dl, OldVT.getScalarType());
+  }
+
+  // Integer Result Promotion.
+  void PromoteIntegerResult(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_AssertSext(SDNode *N);
+  SDValue PromoteIntRes_AssertZext(SDNode *N);
+  SDValue PromoteIntRes_Atomic0(AtomicSDNode *N);
+  SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
+  SDValue PromoteIntRes_Atomic2(AtomicSDNode *N);
+  SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
+  SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
+  SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
+  SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N);
+  SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N);
+  SDValue PromoteIntRes_BITCAST(SDNode *N);
+  SDValue PromoteIntRes_BSWAP(SDNode *N);
+  SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
+  SDValue PromoteIntRes_Constant(SDNode *N);
+  SDValue PromoteIntRes_CONVERT_RNDSAT(SDNode *N);
+  SDValue PromoteIntRes_CTLZ(SDNode *N);
+  SDValue PromoteIntRes_CTPOP(SDNode *N);
+  SDValue PromoteIntRes_CTTZ(SDNode *N);
+  SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
+  SDValue PromoteIntRes_FP32_TO_FP16(SDNode *N);
+  SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
+  SDValue PromoteIntRes_LOAD(LoadSDNode *N);
+  SDValue PromoteIntRes_Overflow(SDNode *N);
+  SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_SDIV(SDNode *N);
+  SDValue PromoteIntRes_SELECT(SDNode *N);
+  SDValue PromoteIntRes_VSELECT(SDNode *N);
+  SDValue PromoteIntRes_SELECT_CC(SDNode *N);
+  SDValue PromoteIntRes_SETCC(SDNode *N);
+  SDValue PromoteIntRes_SHL(SDNode *N);
+  SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N);
+  SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N);
+  SDValue PromoteIntRes_SRA(SDNode *N);
+  SDValue PromoteIntRes_SRL(SDNode *N);
+  SDValue PromoteIntRes_TRUNCATE(SDNode *N);
+  SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_UDIV(SDNode *N);
+  SDValue PromoteIntRes_UNDEF(SDNode *N);
+  SDValue PromoteIntRes_VAARG(SDNode *N);
+  SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+
+  // Integer Operand Promotion.
+  bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo);
+  SDValue PromoteIntOp_ANY_EXTEND(SDNode *N);
+  SDValue PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N);
+  SDValue PromoteIntOp_BITCAST(SDNode *N);
+  SDValue PromoteIntOp_BUILD_PAIR(SDNode *N);
+  SDValue PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N);
+  SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N);
+  SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_EXTRACT_ELEMENT(SDNode *N);
+  SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N);
+  SDValue PromoteIntOp_MEMBARRIER(SDNode *N);
+  SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N);
+  SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_VSETCC(SDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_Shift(SDNode *N);
+  SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N);
+  SDValue PromoteIntOp_SINT_TO_FP(SDNode *N);
+  SDValue PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_TRUNCATE(SDNode *N);
+  SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
+  SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
+
+  void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
+
+  //===--------------------------------------------------------------------===//
+  // Integer Expansion Support: LegalizeIntegerTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetExpandedInteger - Given a processed operand Op which was expanded into
+  /// two integers of half the size, this returns the two halves.  The low bits
+  /// of Op are exactly equal to the bits of Lo; the high bits exactly equal Hi.
+  /// For example, if Op is an i64 which was expanded into two i32's, then this
+  /// method returns the two i32's, with Lo being equal to the lower 32 bits of
+  /// Op, and Hi being equal to the upper 32 bits.
+  void GetExpandedInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+  void SetExpandedInteger(SDValue Op, SDValue Lo, SDValue Hi);
+
+  // Integer Result Expansion.
+  void ExpandIntegerResult(SDNode *N, unsigned ResNo);
+  void ExpandIntRes_MERGE_VALUES      (SDNode *N, unsigned ResNo,
+                                       SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ANY_EXTEND        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_AssertSext        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_AssertZext        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_Constant          (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CTLZ              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CTPOP             (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_CTTZ              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_LOAD          (LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SIGN_EXTEND       (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SIGN_EXTEND_INREG (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_TRUNCATE          (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ZERO_EXTEND       (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_FP_TO_SINT        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_FP_TO_UINT        (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  void ExpandIntRes_Logical           (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ADDSUB            (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ADDSUBC           (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ADDSUBE           (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_BSWAP             (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_MUL               (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_SREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_UDIV              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_UREM              (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_Shift             (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  void ExpandIntRes_SADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_UADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_XMULO             (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  void ExpandIntRes_ATOMIC_LOAD       (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  void ExpandShiftByConstant(SDNode *N, unsigned Amt,
+                             SDValue &Lo, SDValue &Hi);
+  bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+  bool ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  // Integer Operand Expansion.
+  bool ExpandIntegerOperand(SDNode *N, unsigned OperandNo);
+  SDValue ExpandIntOp_BITCAST(SDNode *N);
+  SDValue ExpandIntOp_BR_CC(SDNode *N);
+  SDValue ExpandIntOp_BUILD_VECTOR(SDNode *N);
+  SDValue ExpandIntOp_EXTRACT_ELEMENT(SDNode *N);
+  SDValue ExpandIntOp_SELECT_CC(SDNode *N);
+  SDValue ExpandIntOp_SETCC(SDNode *N);
+  SDValue ExpandIntOp_Shift(SDNode *N);
+  SDValue ExpandIntOp_SINT_TO_FP(SDNode *N);
+  SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue ExpandIntOp_TRUNCATE(SDNode *N);
+  SDValue ExpandIntOp_UINT_TO_FP(SDNode *N);
+  SDValue ExpandIntOp_RETURNADDR(SDNode *N);
+  SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N);
+
+  void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+                                  ISD::CondCode &CCCode, DebugLoc dl);
+
+  //===--------------------------------------------------------------------===//
+  // Float to Integer Conversion Support: LegalizeFloatTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetSoftenedFloat - Given a processed operand Op which was converted to an
+  /// integer of the same size, this returns the integer.  The integer contains
+  /// exactly the same bits as Op - only the type changed.  For example, if Op
+  /// is an f32 which was softened to an i32, then this method returns an i32,
+  /// the bits of which coincide with those of Op.
+  SDValue GetSoftenedFloat(SDValue Op) {
+    SDValue &SoftenedOp = SoftenedFloats[Op];
+    RemapValue(SoftenedOp);
+    assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?");
+    return SoftenedOp;
+  }
+  void SetSoftenedFloat(SDValue Op, SDValue Result);
+
+  // Result Float to Integer Conversion.
+  void SoftenFloatResult(SDNode *N, unsigned OpNo);
+  SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+  SDValue SoftenFloatRes_BITCAST(SDNode *N);
+  SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
+  SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N);
+  SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue SoftenFloatRes_FABS(SDNode *N);
+  SDValue SoftenFloatRes_FADD(SDNode *N);
+  SDValue SoftenFloatRes_FCEIL(SDNode *N);
+  SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N);
+  SDValue SoftenFloatRes_FCOS(SDNode *N);
+  SDValue SoftenFloatRes_FDIV(SDNode *N);
+  SDValue SoftenFloatRes_FEXP(SDNode *N);
+  SDValue SoftenFloatRes_FEXP2(SDNode *N);
+  SDValue SoftenFloatRes_FFLOOR(SDNode *N);
+  SDValue SoftenFloatRes_FLOG(SDNode *N);
+  SDValue SoftenFloatRes_FLOG2(SDNode *N);
+  SDValue SoftenFloatRes_FLOG10(SDNode *N);
+  SDValue SoftenFloatRes_FMA(SDNode *N);
+  SDValue SoftenFloatRes_FMUL(SDNode *N);
+  SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
+  SDValue SoftenFloatRes_FNEG(SDNode *N);
+  SDValue SoftenFloatRes_FP_EXTEND(SDNode *N);
+  SDValue SoftenFloatRes_FP16_TO_FP32(SDNode *N);
+  SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
+  SDValue SoftenFloatRes_FPOW(SDNode *N);
+  SDValue SoftenFloatRes_FPOWI(SDNode *N);
+  SDValue SoftenFloatRes_FREM(SDNode *N);
+  SDValue SoftenFloatRes_FRINT(SDNode *N);
+  SDValue SoftenFloatRes_FSIN(SDNode *N);
+  SDValue SoftenFloatRes_FSQRT(SDNode *N);
+  SDValue SoftenFloatRes_FSUB(SDNode *N);
+  SDValue SoftenFloatRes_FTRUNC(SDNode *N);
+  SDValue SoftenFloatRes_LOAD(SDNode *N);
+  SDValue SoftenFloatRes_SELECT(SDNode *N);
+  SDValue SoftenFloatRes_SELECT_CC(SDNode *N);
+  SDValue SoftenFloatRes_UNDEF(SDNode *N);
+  SDValue SoftenFloatRes_VAARG(SDNode *N);
+  SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
+
+  // Operand Float to Integer Conversion.
+  bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
+  SDValue SoftenFloatOp_BITCAST(SDNode *N);
+  SDValue SoftenFloatOp_BR_CC(SDNode *N);
+  SDValue SoftenFloatOp_FP_ROUND(SDNode *N);
+  SDValue SoftenFloatOp_FP_TO_SINT(SDNode *N);
+  SDValue SoftenFloatOp_FP_TO_UINT(SDNode *N);
+  SDValue SoftenFloatOp_FP32_TO_FP16(SDNode *N);
+  SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
+  SDValue SoftenFloatOp_SETCC(SDNode *N);
+  SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
+
+  //===--------------------------------------------------------------------===//
+  // Float Expansion Support: LegalizeFloatTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetExpandedFloat - Given a processed operand Op which was expanded into
+  /// two floating point values of half the size, this returns the two halves.
+  /// The low bits of Op are exactly equal to the bits of Lo; the high bits
+  /// exactly equal Hi.  For example, if Op is a ppcf128 which was expanded
+  /// into two f64's, then this method returns the two f64's, with Lo being
+  /// equal to the lower 64 bits of Op, and Hi to the upper 64 bits.
+  void GetExpandedFloat(SDValue Op, SDValue &Lo, SDValue &Hi);
+  void SetExpandedFloat(SDValue Op, SDValue Lo, SDValue Hi);
+
+  // Float Result Expansion.
+  void ExpandFloatResult(SDNode *N, unsigned ResNo);
+  void ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FABS      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FADD      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FCEIL     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FCOPYSIGN (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FCOS      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FDIV      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FEXP      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FEXP2     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FFLOOR    (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FLOG      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FLOG2     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FLOG10    (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FMA       (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FMUL      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FNEARBYINT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FNEG      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FP_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FPOW      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FPOWI     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FREM      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FRINT     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FSIN      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FSQRT     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FSUB      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FTRUNC    (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_LOAD      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  // Float Operand Expansion.
+  bool ExpandFloatOperand(SDNode *N, unsigned OperandNo);
+  SDValue ExpandFloatOp_BR_CC(SDNode *N);
+  SDValue ExpandFloatOp_FP_ROUND(SDNode *N);
+  SDValue ExpandFloatOp_FP_TO_SINT(SDNode *N);
+  SDValue ExpandFloatOp_FP_TO_UINT(SDNode *N);
+  SDValue ExpandFloatOp_SELECT_CC(SDNode *N);
+  SDValue ExpandFloatOp_SETCC(SDNode *N);
+  SDValue ExpandFloatOp_STORE(SDNode *N, unsigned OpNo);
+
+  void FloatExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+                                ISD::CondCode &CCCode, DebugLoc dl);
+
+  //===--------------------------------------------------------------------===//
+  // Scalarization Support: LegalizeVectorTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetScalarizedVector - Given a processed one-element vector Op which was
+  /// scalarized to its element type, this returns the element.  For example,
+  /// if Op is a v1i32, Op = < i32 val >, this method returns val, an i32.
+  SDValue GetScalarizedVector(SDValue Op) {
+    SDValue &ScalarizedOp = ScalarizedVectors[Op];
+    RemapValue(ScalarizedOp);
+    assert(ScalarizedOp.getNode() && "Operand wasn't scalarized?");
+    return ScalarizedOp;
+  }
+  void SetScalarizedVector(SDValue Op, SDValue Result);
+
+  // Vector Result Scalarization: <1 x ty> -> ty.
+  void ScalarizeVectorResult(SDNode *N, unsigned OpNo);
+  SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+  SDValue ScalarizeVecRes_BinOp(SDNode *N);
+  SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
+  SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
+  SDValue ScalarizeVecRes_InregOp(SDNode *N);
+
+  SDValue ScalarizeVecRes_BITCAST(SDNode *N);
+  SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N);
+  SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N);
+  SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue ScalarizeVecRes_FP_ROUND(SDNode *N);
+  SDValue ScalarizeVecRes_FPOWI(SDNode *N);
+  SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
+  SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
+  SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
+  SDValue ScalarizeVecRes_SIGN_EXTEND_INREG(SDNode *N);
+  SDValue ScalarizeVecRes_VSELECT(SDNode *N);
+  SDValue ScalarizeVecRes_SELECT(SDNode *N);
+  SDValue ScalarizeVecRes_SELECT_CC(SDNode *N);
+  SDValue ScalarizeVecRes_SETCC(SDNode *N);
+  SDValue ScalarizeVecRes_UNDEF(SDNode *N);
+  SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N);
+  SDValue ScalarizeVecRes_VSETCC(SDNode *N);
+
+  // Vector Operand Scalarization: <1 x ty> -> ty.
+  bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
+  SDValue ScalarizeVecOp_BITCAST(SDNode *N);
+  SDValue ScalarizeVecOp_EXTEND(SDNode *N);
+  SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
+  SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+
+  //===--------------------------------------------------------------------===//
+  // Vector Splitting Support: LegalizeVectorTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetSplitVector - Given a processed vector Op which was split into vectors
+  /// of half the size, this method returns the halves.  The first elements of
+  /// Op coincide with the elements of Lo; the remaining elements of Op coincide
+  /// with the elements of Hi: Op is what you would get by concatenating Lo and
+  /// Hi.  For example, if Op is a v8i32 that was split into two v4i32's, then
+  /// this method returns the two v4i32's, with Lo corresponding to the first 4
+  /// elements of Op, and Hi to the last 4 elements.
+  void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi);
+  void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi);
+
+  // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
+  void SplitVectorResult(SDNode *N, unsigned OpNo);
+  void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_BUILD_PAIR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
+                                  SDValue &Hi);
+
+  // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
+  bool SplitVectorOperand(SDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_UnaryOp(SDNode *N);
+
+  SDValue SplitVecOp_BITCAST(SDNode *N);
+  SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
+  SDValue SplitVecOp_VSETCC(SDNode *N);
+  SDValue SplitVecOp_FP_ROUND(SDNode *N);
+
+  //===--------------------------------------------------------------------===//
+  // Vector Widening Support: LegalizeVectorTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// GetWidenedVector - Given a processed vector Op which was widened into a
+  /// larger vector, this method returns the larger vector.  The elements of
+  /// the returned vector consist of the elements of Op followed by elements
+  /// containing rubbish.  For example, if Op is a v2i32 that was widened to a
+  /// v4i32, then this method returns a v4i32 for which the first two elements
+  /// are the same as those of Op, while the last two elements contain rubbish.
+  SDValue GetWidenedVector(SDValue Op) {
+    SDValue &WidenedOp = WidenedVectors[Op];
+    RemapValue(WidenedOp);
+    assert(WidenedOp.getNode() && "Operand wasn't widened?");
+    return WidenedOp;
+  }
+  void SetWidenedVector(SDValue Op, SDValue Result);
+
+  // Widen Vector Result Promotion.
+  void WidenVectorResult(SDNode *N, unsigned ResNo);
+  SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo);
+  SDValue WidenVecRes_BITCAST(SDNode* N);
+  SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
+  SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
+  SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N);
+  SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
+  SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
+  SDValue WidenVecRes_LOAD(SDNode* N);
+  SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
+  SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N);
+  SDValue WidenVecRes_SELECT(SDNode* N);
+  SDValue WidenVecRes_SELECT_CC(SDNode* N);
+  SDValue WidenVecRes_SETCC(SDNode* N);
+  SDValue WidenVecRes_UNDEF(SDNode *N);
+  SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
+  SDValue WidenVecRes_VSETCC(SDNode* N);
+
+  SDValue WidenVecRes_Ternary(SDNode *N);
+  SDValue WidenVecRes_Binary(SDNode *N);
+  SDValue WidenVecRes_Convert(SDNode *N);
+  SDValue WidenVecRes_POWI(SDNode *N);
+  SDValue WidenVecRes_Shift(SDNode *N);
+  SDValue WidenVecRes_Unary(SDNode *N);
+  SDValue WidenVecRes_InregOp(SDNode *N);
+
+  // Widen Vector Operand.
+  bool WidenVectorOperand(SDNode *N, unsigned OpNo);
+  SDValue WidenVecOp_BITCAST(SDNode *N);
+  SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N);
+  SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
+  SDValue WidenVecOp_STORE(SDNode* N);
+  SDValue WidenVecOp_SETCC(SDNode* N);
+
+  SDValue WidenVecOp_Convert(SDNode *N);
+
+  //===--------------------------------------------------------------------===//
+  // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
+  //===--------------------------------------------------------------------===//
+
+  /// Helper GenWidenVectorLoads - Helper function to generate a set of
+  /// loads to load a vector with a resulting wider type. It takes
+  ///   LdChain: list of chains for the load to be generated.
+  ///   Ld:      load to widen
+  SDValue GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain,
+                              LoadSDNode *LD);
+
+  /// GenWidenVectorExtLoads - Helper function to generate a set of extension
+  /// loads to load a ector with a resulting wider type.  It takes
+  ///   LdChain: list of chains for the load to be generated.
+  ///   Ld:      load to widen
+  ///   ExtType: extension element type
+  SDValue GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
+                                 LoadSDNode *LD, ISD::LoadExtType ExtType);
+
+  /// Helper genWidenVectorStores - Helper function to generate a set of
+  /// stores to store a widen vector into non widen memory
+  ///   StChain: list of chains for the stores we have generated
+  ///   ST:      store of a widen value
+  void GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, StoreSDNode *ST);
+
+  /// Helper genWidenVectorTruncStores - Helper function to generate a set of
+  /// stores to store a truncate widen vector into non widen memory
+  ///   StChain: list of chains for the stores we have generated
+  ///   ST:      store of a widen value
+  void GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+                                 StoreSDNode *ST);
+
+  /// Modifies a vector input (widen or narrows) to a vector of NVT.  The
+  /// input vector must have the same element type as NVT.
+  SDValue ModifyToType(SDValue InOp, EVT WidenVT);
+
+
+  //===--------------------------------------------------------------------===//
+  // Generic Splitting: LegalizeTypesGeneric.cpp
+  //===--------------------------------------------------------------------===//
+
+  // Legalization methods which only use that the illegal type is split into two
+  // not necessarily identical types.  As such they can be used for splitting
+  // vectors and expanding integers and floats.
+
+  void GetSplitOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+    if (Op.getValueType().isVector())
+      GetSplitVector(Op, Lo, Hi);
+    else if (Op.getValueType().isInteger())
+      GetExpandedInteger(Op, Lo, Hi);
+    else
+      GetExpandedFloat(Op, Lo, Hi);
+  }
+
+  /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+  /// which is split (or expanded) into two not necessarily identical pieces.
+  void GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT);
+
+  /// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
+  /// high parts of the given value.
+  void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
+
+  // Generic Result Splitting.
+  void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+                             SDValue &Lo, SDValue &Hi);
+  void SplitRes_SELECT      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitRes_SELECT_CC   (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitRes_UNDEF       (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  //===--------------------------------------------------------------------===//
+  // Generic Expansion: LegalizeTypesGeneric.cpp
+  //===--------------------------------------------------------------------===//
+
+  // Legalization methods which only use that the illegal type is split into two
+  // identical types of half the size, and that the Lo/Hi part is stored first
+  // in memory on little/big-endian machines, followed by the Hi/Lo part.  As
+  // such they can be used for expanding integers and floats.
+
+  void GetExpandedOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+    if (Op.getValueType().isInteger())
+      GetExpandedInteger(Op, Lo, Hi);
+    else
+      GetExpandedFloat(Op, Lo, Hi);
+  }
+
+  // Generic Result Expansion.
+  void ExpandRes_MERGE_VALUES      (SDNode *N, unsigned ResNo,
+                                    SDValue &Lo, SDValue &Hi);
+  void ExpandRes_BITCAST           (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_BUILD_PAIR        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_EXTRACT_ELEMENT   (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_NormalLoad        (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandRes_VAARG             (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+  // Generic Operand Expansion.
+  SDValue ExpandOp_BITCAST          (SDNode *N);
+  SDValue ExpandOp_BUILD_VECTOR     (SDNode *N);
+  SDValue ExpandOp_EXTRACT_ELEMENT  (SDNode *N);
+  SDValue ExpandOp_INSERT_VECTOR_ELT(SDNode *N);
+  SDValue ExpandOp_SCALAR_TO_VECTOR (SDNode *N);
+  SDValue ExpandOp_NormalStore      (SDNode *N, unsigned OpNo);
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
new file mode 100644
index 000000000000..222d1c043a63
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -0,0 +1,525 @@
+//===-------- LegalizeTypesGeneric.cpp - Generic type legalization --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements generic type expansion and splitting for LegalizeTypes.
+// The routines here perform legalization when the details of the type (such as
+// whether it is an integer or a float) do not matter.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type.  The Lo/Hi part
+// is required to be stored first in memory on little/big-endian machines.
+// Splitting is the act of changing a computation in an illegal type to be a
+// computation in two not necessarily identical registers of a smaller type.
+// There are no requirements on how the type is represented in memory.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/IR/DataLayout.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Generic Result Expansion.
+//===----------------------------------------------------------------------===//
+
+// These routines assume that the Lo/Hi part is stored first in memory on
+// little/big-endian machines, followed by the Hi/Lo part.  This means that
+// they cannot be used as is on vectors, for which Lo is always stored first.
+void DAGTypeLegalizer::ExpandRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+                                              SDValue &Lo, SDValue &Hi) {
+  SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+  GetExpandedOp(Op, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  EVT OutVT = N->getValueType(0);
+  EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Handle some special cases efficiently.
+  switch (getTypeAction(InVT)) {
+    case TargetLowering::TypeLegal:
+    case TargetLowering::TypePromoteInteger:
+      break;
+    case TargetLowering::TypeSoftenFloat:
+      // Convert the integer operand instead.
+      SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
+      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+      return;
+    case TargetLowering::TypeExpandInteger:
+    case TargetLowering::TypeExpandFloat:
+      // Convert the expanded pieces of the input.
+      GetExpandedOp(InOp, Lo, Hi);
+      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+      return;
+    case TargetLowering::TypeSplitVector:
+      GetSplitVector(InOp, Lo, Hi);
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+      return;
+    case TargetLowering::TypeScalarizeVector:
+      // Convert the element instead.
+      SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
+      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+      return;
+    case TargetLowering::TypeWidenVector: {
+      assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
+      InOp = GetWidenedVector(InOp);
+      EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                                   InVT.getVectorNumElements()/2);
+      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                       DAG.getIntPtrConstant(0));
+      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
+                       DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+      Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+      Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+      return;
+    }
+  }
+
+  if (InVT.isVector() && OutVT.isInteger()) {
+    // Handle cases like i64 = BITCAST v1i64 on x86, where the operand
+    // is legal but the result is not.
+    unsigned NumElems = 2;
+    EVT ElemVT = NOutVT;
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+
+    // If <ElemVT * N> is not a legal type, try <ElemVT/2 * (N*2)>.
+    while (!isTypeLegal(NVT)) {
+      unsigned NewSizeInBits = ElemVT.getSizeInBits() / 2;
+      // If the element size is smaller than byte, bail.
+      if (NewSizeInBits < 8)
+        break;
+      NumElems *= 2;
+      ElemVT = EVT::getIntegerVT(*DAG.getContext(), NewSizeInBits);
+      NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+    }
+
+    if (isTypeLegal(NVT)) {
+      SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);
+
+      SmallVector<SDValue, 8> Vals;
+      for (unsigned i = 0; i < NumElems; ++i)
+        Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
+                                   CastInOp, DAG.getIntPtrConstant(i)));
+
+      // Build Lo, Hi pair by pairing extracted elements if needed.
+      unsigned Slot = 0;
+      for (unsigned e = Vals.size(); e - Slot > 2; Slot += 2, e += 1) {
+        // Each iteration will BUILD_PAIR two nodes and append the result until
+        // there are only two nodes left, i.e. Lo and Hi.
+        SDValue LHS = Vals[Slot];
+        SDValue RHS = Vals[Slot + 1];
+
+        if (TLI.isBigEndian())
+          std::swap(LHS, RHS);
+
+        Vals.push_back(DAG.getNode(ISD::BUILD_PAIR, dl,
+                                   EVT::getIntegerVT(
+                                     *DAG.getContext(),
+                                     LHS.getValueType().getSizeInBits() << 1),
+                                   LHS, RHS));
+      }
+      Lo = Vals[Slot++];
+      Hi = Vals[Slot++];
+
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+
+      return;
+    }
+  }
+
+  // Lower the bit-convert to a store/load from the stack.
+  assert(NOutVT.isByteSized() && "Expanded type not byte sized!");
+
+  // Create the stack frame object.  Make sure it is aligned for both
+  // the source and expanded destination types.
+  unsigned Alignment =
+    TLI.getDataLayout()->getPrefTypeAlignment(NOutVT.
+                                              getTypeForEVT(*DAG.getContext()));
+  SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
+  int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SPFI);
+
+  // Emit a store to the stack slot.
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo,
+                               false, false, 0);
+
+  // Load the first half from the stack slot.
+  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, 
+                   false, false, false, 0);
+
+  // Increment the pointer to the other half.
+  unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
+  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                         DAG.getIntPtrConstant(IncrementSize));
+
+  // Load the second half from the stack slot.
+  Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
+                   PtrInfo.getWithOffset(IncrementSize), false,
+                   false, false, MinAlign(Alignment, IncrementSize));
+
+  // Handle endianness of the load.
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_BUILD_PAIR(SDNode *N, SDValue &Lo,
+                                            SDValue &Hi) {
+  // Return the operands.
+  Lo = N->getOperand(0);
+  Hi = N->getOperand(1);
+}
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_ELEMENT(SDNode *N, SDValue &Lo,
+                                                 SDValue &Hi) {
+  GetExpandedOp(N->getOperand(0), Lo, Hi);
+  SDValue Part = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() ?
+                   Hi : Lo;
+
+  assert(Part.getValueType() == N->getValueType(0) &&
+         "Type twice as big as expanded type not itself expanded!");
+
+  GetPairElements(Part, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+                                                    SDValue &Hi) {
+  SDValue OldVec = N->getOperand(0);
+  unsigned OldElts = OldVec.getValueType().getVectorNumElements();
+  EVT OldEltVT = OldVec.getValueType().getVectorElementType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Convert to a vector of the expanded element type, for example
+  // <3 x i64> -> <6 x i32>.
+  EVT OldVT = N->getValueType(0);
+  EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+
+  if (OldVT != OldEltVT) {
+    // The result of EXTRACT_VECTOR_ELT may be larger than the element type of
+    // the input vector.  If so, extend the elements of the input vector to the
+    // same bitwidth as the result before expanding.
+    assert(OldEltVT.bitsLT(OldVT) && "Result type smaller then element type!");
+    EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldElts);
+    OldVec = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0));
+  }
+
+  SDValue NewVec = DAG.getNode(ISD::BITCAST, dl,
+                               EVT::getVectorVT(*DAG.getContext(),
+                                                NewVT, 2*OldElts),
+                               OldVec);
+
+  // Extract the elements at 2 * Idx and 2 * Idx + 1 from the new vector.
+  SDValue Idx = N->getOperand(1);
+
+  // Make sure the type of Idx is big enough to hold the new values.
+  if (Idx.getValueType().bitsLT(TLI.getPointerTy()))
+    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+
+  Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+  Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+  Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
+                    DAG.getConstant(1, Idx.getValueType()));
+  Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
+                                            SDValue &Hi) {
+  assert(ISD::isNormalLoad(N) && "This routine only for normal loads!");
+  DebugLoc dl = N->getDebugLoc();
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
+  SDValue Chain = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  unsigned Alignment = LD->getAlignment();
+  bool isVolatile = LD->isVolatile();
+  bool isNonTemporal = LD->isNonTemporal();
+  bool isInvariant = LD->isInvariant();
+
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+  Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
+                   isVolatile, isNonTemporal, isInvariant, Alignment);
+
+  // Increment the pointer to the other half.
+  unsigned IncrementSize = NVT.getSizeInBits() / 8;
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getIntPtrConstant(IncrementSize));
+  Hi = DAG.getLoad(NVT, dl, Chain, Ptr,
+                   LD->getPointerInfo().getWithOffset(IncrementSize),
+                   isVolatile, isNonTemporal, isInvariant,
+                   MinAlign(Alignment, IncrementSize));
+
+  // Build a factor node to remember that this load is independent of the
+  // other one.
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                      Hi.getValue(1));
+
+  // Handle endianness of the load.
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  // Modified the chain - switch anything that used the old chain to use
+  // the new one.
+  ReplaceValueWith(SDValue(N, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  EVT OVT = N->getValueType(0);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+  SDValue Chain = N->getOperand(0);
+  SDValue Ptr = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+  const unsigned Align = N->getConstantOperandVal(3);
+
+  Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), Align);
+  Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0);
+
+  // Handle endianness of the load.
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  // Modified the chain - switch anything that used the old chain to use
+  // the new one.
+  ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+
+//===--------------------------------------------------------------------===//
+// Generic Operand Expansion.
+//===--------------------------------------------------------------------===//
+
+SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  if (N->getValueType(0).isVector()) {
+    // An illegal expanding type is being converted to a legal vector type.
+    // Make a two element vector out of the expanded parts and convert that
+    // instead, but only if the new vector type is legal (otherwise there
+    // is no point, and it might create expansion loops).  For example, on
+    // x86 this turns v1i64 = BITCAST i64 into v1i64 = BITCAST v2i32.
+    EVT OVT = N->getOperand(0).getValueType();
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(),
+                               TLI.getTypeToTransformTo(*DAG.getContext(), OVT),
+                               2);
+
+    if (isTypeLegal(NVT)) {
+      SDValue Parts[2];
+      GetExpandedOp(N->getOperand(0), Parts[0], Parts[1]);
+
+      if (TLI.isBigEndian())
+        std::swap(Parts[0], Parts[1]);
+
+      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Parts, 2);
+      return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
+    }
+  }
+
+  // Otherwise, store to a temporary and load out again as the new type.
+  return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
+  // The vector type is legal but the element type needs expansion.
+  EVT VecVT = N->getValueType(0);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  EVT OldVT = N->getOperand(0).getValueType();
+  EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+  DebugLoc dl = N->getDebugLoc();
+
+  assert(OldVT == VecVT.getVectorElementType() &&
+         "BUILD_VECTOR operand type doesn't match vector element type!");
+
+  // Build a vector of twice the length out of the expanded elements.
+  // For example <3 x i64> -> <6 x i32>.
+  std::vector<SDValue> NewElts;
+  NewElts.reserve(NumElts*2);
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Lo, Hi;
+    GetExpandedOp(N->getOperand(i), Lo, Hi);
+    if (TLI.isBigEndian())
+      std::swap(Lo, Hi);
+    NewElts.push_back(Lo);
+    NewElts.push_back(Hi);
+  }
+
+  SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl,
+                               EVT::getVectorVT(*DAG.getContext(),
+                                                NewVT, NewElts.size()),
+                               &NewElts[0], NewElts.size());
+
+  // Convert the new vector to the old vector type.
+  return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_EXTRACT_ELEMENT(SDNode *N) {
+  SDValue Lo, Hi;
+  GetExpandedOp(N->getOperand(0), Lo, Hi);
+  return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() ? Hi : Lo;
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
+  // The vector type is legal but the element type needs expansion.
+  EVT VecVT = N->getValueType(0);
+  unsigned NumElts = VecVT.getVectorNumElements();
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue Val = N->getOperand(1);
+  EVT OldEVT = Val.getValueType();
+  EVT NewEVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldEVT);
+
+  assert(OldEVT == VecVT.getVectorElementType() &&
+         "Inserted element type doesn't match vector element type!");
+
+  // Bitconvert to a vector of twice the length with elements of the expanded
+  // type, insert the expanded vector elements, and then convert back.
+  EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewEVT, NumElts*2);
+  SDValue NewVec = DAG.getNode(ISD::BITCAST, dl,
+                               NewVecVT, N->getOperand(0));
+
+  SDValue Lo, Hi;
+  GetExpandedOp(Val, Lo, Hi);
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  SDValue Idx = N->getOperand(2);
+  Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+  NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Lo, Idx);
+  Idx = DAG.getNode(ISD::ADD, dl,
+                    Idx.getValueType(), Idx, DAG.getIntPtrConstant(1));
+  NewVec =  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Hi, Idx);
+
+  // Convert the new vector to the old vector type.
+  return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  assert(VT.getVectorElementType() == N->getOperand(0).getValueType() &&
+         "SCALAR_TO_VECTOR operand type doesn't match vector element type!");
+  unsigned NumElts = VT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(NumElts);
+  Ops[0] = N->getOperand(0);
+  SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType());
+  for (unsigned i = 1; i < NumElts; ++i)
+    Ops[i] = UndefVal;
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
+  assert(ISD::isNormalStore(N) && "This routine only for normal stores!");
+  assert(OpNo == 1 && "Can only expand the stored value so far");
+  DebugLoc dl = N->getDebugLoc();
+
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(),
+                                     St->getValue().getValueType());
+  SDValue Chain = St->getChain();
+  SDValue Ptr = St->getBasePtr();
+  unsigned Alignment = St->getAlignment();
+  bool isVolatile = St->isVolatile();
+  bool isNonTemporal = St->isNonTemporal();
+
+  assert(NVT.isByteSized() && "Expanded type not byte sized!");
+  unsigned IncrementSize = NVT.getSizeInBits() / 8;
+
+  SDValue Lo, Hi;
+  GetExpandedOp(St->getValue(), Lo, Hi);
+
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
+                    isVolatile, isNonTemporal, Alignment);
+
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getIntPtrConstant(IncrementSize));
+  assert(isTypeLegal(Ptr.getValueType()) && "Pointers must be legal!");
+  Hi = DAG.getStore(Chain, dl, Hi, Ptr,
+                    St->getPointerInfo().getWithOffset(IncrementSize),
+                    isVolatile, isNonTemporal,
+                    MinAlign(Alignment, IncrementSize));
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
+
+
+//===--------------------------------------------------------------------===//
+// Generic Result Splitting.
+//===--------------------------------------------------------------------===//
+
+// Be careful to make no assumptions about which of Lo/Hi is stored first in
+// memory (for vectors it is always Lo first followed by Hi in the following
+// bytes; for integers and floats it is Lo first if and only if the machine is
+// little-endian).
+
+void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+  GetSplitOp(Op, Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
+                                       SDValue &Hi) {
+  SDValue LL, LH, RL, RH, CL, CH;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitOp(N->getOperand(1), LL, LH);
+  GetSplitOp(N->getOperand(2), RL, RH);
+
+  SDValue Cond = N->getOperand(0);
+  CL = CH = Cond;
+  if (Cond.getValueType().isVector()) {
+    assert(Cond.getValueType().getVectorElementType() == MVT::i1 &&
+           "Condition legalized before result?");
+    unsigned NumElements = Cond.getValueType().getVectorNumElements();
+    EVT VCondTy = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElements / 2);
+    CL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
+                     DAG.getIntPtrConstant(0));
+    CH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
+                     DAG.getIntPtrConstant(NumElements / 2));
+  }
+
+  Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
+  Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH);
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
+                                          SDValue &Hi) {
+  SDValue LL, LH, RL, RH;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitOp(N->getOperand(2), LL, LH);
+  GetSplitOp(N->getOperand(3), RL, RH);
+
+  Lo = DAG.getNode(ISD::SELECT_CC, dl, LL.getValueType(), N->getOperand(0),
+                   N->getOperand(1), LL, RL, N->getOperand(4));
+  Hi = DAG.getNode(ISD::SELECT_CC, dl, LH.getValueType(), N->getOperand(0),
+                   N->getOperand(1), LH, RH, N->getOperand(4));
+}
+
+void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  Lo = DAG.getUNDEF(LoVT);
+  Hi = DAG.getUNDEF(HiVT);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
new file mode 100644
index 000000000000..c6e066e2709b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -0,0 +1,770 @@
+//===-- LegalizeVectorOps.cpp - Implement SelectionDAG::LegalizeVectors ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeVectors method.
+//
+// The vector legalizer looks for vector operations which might need to be
+// scalarized and legalizes them. This is a separate step from Legalize because
+// scalarizing can introduce illegal types.  For example, suppose we have an
+// ISD::SDIV of type v2i64 on x86-32.  The type is legal (for example, addition
+// on a v2i64 is legal), but ISD::SDIV isn't legal, so we have to unroll the
+// operation, which introduces nodes with the illegal type i64 which must be
+// expanded.  Similarly, suppose we have an ISD::SRA of type v16i8 on PowerPC;
+// the operation must be unrolled, which introduces nodes with the illegal
+// type i8 which must be promoted.
+//
+// This does not legalize vector manipulations like ISD::BUILD_VECTOR,
+// or operations that happen to take a vector which are custom-lowered;
+// the legalization for such operations never produces nodes
+// with illegal types, so it's okay to put off legalizing them until
+// SelectionDAG::Legalize runs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+namespace {
+class VectorLegalizer {
+  SelectionDAG& DAG;
+  const TargetLowering &TLI;
+  bool Changed; // Keep track of whether anything changed
+
+  /// LegalizedNodes - For nodes that are of legal width, and that have more
+  /// than one use, this map indicates what regularized operand to use.  This
+  /// allows us to avoid legalizing the same thing more than once.
+  SmallDenseMap<SDValue, SDValue, 64> LegalizedNodes;
+
+  // Adds a node to the translation cache
+  void AddLegalizedOperand(SDValue From, SDValue To) {
+    LegalizedNodes.insert(std::make_pair(From, To));
+    // If someone requests legalization of the new node, return itself.
+    if (From != To)
+      LegalizedNodes.insert(std::make_pair(To, To));
+  }
+
+  // Legalizes the given node
+  SDValue LegalizeOp(SDValue Op);
+  // Assuming the node is legal, "legalize" the results
+  SDValue TranslateLegalizeResults(SDValue Op, SDValue Result);
+  // Implements unrolling a VSETCC.
+  SDValue UnrollVSETCC(SDValue Op);
+  // Implements expansion for FNEG; falls back to UnrollVectorOp if FSUB
+  // isn't legal.
+  // Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
+  // SINT_TO_FLOAT and SHR on vectors isn't legal.
+  SDValue ExpandUINT_TO_FLOAT(SDValue Op);
+  // Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
+  SDValue ExpandSEXTINREG(SDValue Op);
+  // Implement vselect in terms of XOR, AND, OR when blend is not supported
+  // by the target.
+  SDValue ExpandVSELECT(SDValue Op);
+  SDValue ExpandSELECT(SDValue Op);
+  SDValue ExpandLoad(SDValue Op);
+  SDValue ExpandStore(SDValue Op);
+  SDValue ExpandFNEG(SDValue Op);
+  // Implements vector promotion; this is essentially just bitcasting the
+  // operands to a different type and bitcasting the result back to the
+  // original type.
+  SDValue PromoteVectorOp(SDValue Op);
+  // Implements [SU]INT_TO_FP vector promotion; this is a [zs]ext of the input
+  // operand to the next size up.
+  SDValue PromoteVectorOpINT_TO_FP(SDValue Op);
+
+  public:
+  bool Run();
+  VectorLegalizer(SelectionDAG& dag) :
+      DAG(dag), TLI(dag.getTargetLoweringInfo()), Changed(false) {}
+};
+
+bool VectorLegalizer::Run() {
+  // Before we start legalizing vector nodes, check if there are any vectors.
+  bool HasVectors = false;
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = prior(DAG.allnodes_end()); I != llvm::next(E); ++I) {
+    // Check if the values of the nodes contain vectors. We don't need to check
+    // the operands because we are going to check their values at some point.
+    for (SDNode::value_iterator J = I->value_begin(), E = I->value_end();
+         J != E; ++J)
+      HasVectors |= J->isVector();
+
+    // If we found a vector node we can start the legalization.
+    if (HasVectors)
+      break;
+  }
+
+  // If this basic block has no vectors then no need to legalize vectors.
+  if (!HasVectors)
+    return false;
+
+  // The legalize process is inherently a bottom-up recursive process (users
+  // legalize their uses before themselves).  Given infinite stack space, we
+  // could just start legalizing on the root and traverse the whole graph.  In
+  // practice however, this causes us to run out of stack space on large basic
+  // blocks.  To avoid this problem, compute an ordering of the nodes where each
+  // node is only legalized after all of its operands are legalized.
+  DAG.AssignTopologicalOrder();
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+       E = prior(DAG.allnodes_end()); I != llvm::next(E); ++I)
+    LegalizeOp(SDValue(I, 0));
+
+  // Finally, it's possible the root changed.  Get the new root.
+  SDValue OldRoot = DAG.getRoot();
+  assert(LegalizedNodes.count(OldRoot) && "Root didn't get legalized?");
+  DAG.setRoot(LegalizedNodes[OldRoot]);
+
+  LegalizedNodes.clear();
+
+  // Remove dead nodes now.
+  DAG.RemoveDeadNodes();
+
+  return Changed;
+}
+
+SDValue VectorLegalizer::TranslateLegalizeResults(SDValue Op, SDValue Result) {
+  // Generic legalization: just pass the operand through.
+  for (unsigned i = 0, e = Op.getNode()->getNumValues(); i != e; ++i)
+    AddLegalizedOperand(Op.getValue(i), Result.getValue(i));
+  return Result.getValue(Op.getResNo());
+}
+
+SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
+  // Note that LegalizeOp may be reentered even from single-use nodes, which
+  // means that we always must cache transformed nodes.
+  DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+  if (I != LegalizedNodes.end()) return I->second;
+
+  SDNode* Node = Op.getNode();
+
+  // Legalize the operands
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i)
+    Ops.push_back(LegalizeOp(Node->getOperand(i)));
+
+  SDValue Result =
+    SDValue(DAG.UpdateNodeOperands(Op.getNode(), Ops.data(), Ops.size()), 0);
+
+  if (Op.getOpcode() == ISD::LOAD) {
+    LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+    ISD::LoadExtType ExtType = LD->getExtensionType();
+    if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD) {
+      if (TLI.isLoadExtLegal(LD->getExtensionType(), LD->getMemoryVT()))
+        return TranslateLegalizeResults(Op, Result);
+      Changed = true;
+      return LegalizeOp(ExpandLoad(Op));
+    }
+  } else if (Op.getOpcode() == ISD::STORE) {
+    StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+    EVT StVT = ST->getMemoryVT();
+    MVT ValVT = ST->getValue().getSimpleValueType();
+    if (StVT.isVector() && ST->isTruncatingStore())
+      switch (TLI.getTruncStoreAction(ValVT, StVT.getSimpleVT())) {
+      default: llvm_unreachable("This action is not supported yet!");
+      case TargetLowering::Legal:
+        return TranslateLegalizeResults(Op, Result);
+      case TargetLowering::Custom:
+        Changed = true;
+        return LegalizeOp(TLI.LowerOperation(Result, DAG));
+      case TargetLowering::Expand:
+        Changed = true;
+        return LegalizeOp(ExpandStore(Op));
+      }
+  }
+
+  bool HasVectorValue = false;
+  for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end();
+       J != E;
+       ++J)
+    HasVectorValue |= J->isVector();
+  if (!HasVectorValue)
+    return TranslateLegalizeResults(Op, Result);
+
+  EVT QueryType;
+  switch (Op.getOpcode()) {
+  default:
+    return TranslateLegalizeResults(Op, Result);
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+  case ISD::ROTR:
+  case ISD::CTLZ:
+  case ISD::CTTZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+  case ISD::CTTZ_ZERO_UNDEF:
+  case ISD::CTPOP:
+  case ISD::SELECT:
+  case ISD::VSELECT:
+  case ISD::SELECT_CC:
+  case ISD::SETCC:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::TRUNCATE:
+  case ISD::SIGN_EXTEND:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::FNEG:
+  case ISD::FABS:
+  case ISD::FSQRT:
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FPOWI:
+  case ISD::FPOW:
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FCEIL:
+  case ISD::FTRUNC:
+  case ISD::FRINT:
+  case ISD::FNEARBYINT:
+  case ISD::FFLOOR:
+  case ISD::FP_ROUND:
+  case ISD::FP_EXTEND:
+  case ISD::FMA:
+  case ISD::SIGN_EXTEND_INREG:
+    QueryType = Node->getValueType(0);
+    break;
+  case ISD::FP_ROUND_INREG:
+    QueryType = cast<VTSDNode>(Node->getOperand(1))->getVT();
+    break;
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    QueryType = Node->getOperand(0).getValueType();
+    break;
+  }
+
+  switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) {
+  case TargetLowering::Promote:
+    switch (Op.getOpcode()) {
+    default:
+      // "Promote" the operation by bitcasting
+      Result = PromoteVectorOp(Op);
+      Changed = true;
+      break;
+    case ISD::SINT_TO_FP:
+    case ISD::UINT_TO_FP:
+      // "Promote" the operation by extending the operand.
+      Result = PromoteVectorOpINT_TO_FP(Op);
+      Changed = true;
+      break;
+    }
+    break;
+  case TargetLowering::Legal: break;
+  case TargetLowering::Custom: {
+    SDValue Tmp1 = TLI.LowerOperation(Op, DAG);
+    if (Tmp1.getNode()) {
+      Result = Tmp1;
+      break;
+    }
+    // FALL THROUGH
+  }
+  case TargetLowering::Expand:
+    if (Node->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Result = ExpandSEXTINREG(Op);
+    else if (Node->getOpcode() == ISD::VSELECT)
+      Result = ExpandVSELECT(Op);
+    else if (Node->getOpcode() == ISD::SELECT)
+      Result = ExpandSELECT(Op);
+    else if (Node->getOpcode() == ISD::UINT_TO_FP)
+      Result = ExpandUINT_TO_FLOAT(Op);
+    else if (Node->getOpcode() == ISD::FNEG)
+      Result = ExpandFNEG(Op);
+    else if (Node->getOpcode() == ISD::SETCC)
+      Result = UnrollVSETCC(Op);
+    else
+      Result = DAG.UnrollVectorOp(Op.getNode());
+    break;
+  }
+
+  // Make sure that the generated code is itself legal.
+  if (Result != Op) {
+    Result = LegalizeOp(Result);
+    Changed = true;
+  }
+
+  // Note that LegalizeOp may be reentered even from single-use nodes, which
+  // means that we always must cache transformed nodes.
+  AddLegalizedOperand(Op, Result);
+  return Result;
+}
+
+SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
+  // Vector "promotion" is basically just bitcasting and doing the operation
+  // in a different type.  For example, x86 promotes ISD::AND on v2i32 to
+  // v1i64.
+  MVT VT = Op.getSimpleValueType();
+  assert(Op.getNode()->getNumValues() == 1 &&
+         "Can't promote a vector with multiple results!");
+  MVT NVT = TLI.getTypeToPromoteTo(Op.getOpcode(), VT);
+  DebugLoc dl = Op.getDebugLoc();
+  SmallVector<SDValue, 4> Operands(Op.getNumOperands());
+
+  for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
+    if (Op.getOperand(j).getValueType().isVector())
+      Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j));
+    else
+      Operands[j] = Op.getOperand(j);
+  }
+
+  Op = DAG.getNode(Op.getOpcode(), dl, NVT, &Operands[0], Operands.size());
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+}
+
+SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) {
+  // INT_TO_FP operations may require the input operand be promoted even
+  // when the type is otherwise legal.
+  EVT VT = Op.getOperand(0).getValueType();
+  assert(Op.getNode()->getNumValues() == 1 &&
+         "Can't promote a vector with multiple results!");
+
+  // Normal getTypeToPromoteTo() doesn't work here, as that will promote
+  // by widening the vector w/ the same element width and twice the number
+  // of elements. We want the other way around, the same number of elements,
+  // each twice the width.
+  //
+  // Increase the bitwidth of the element to the next pow-of-two
+  // (which is greater than 8 bits).
+  unsigned NumElts = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  EltVT = EVT::getIntegerVT(*DAG.getContext(), 2 * EltVT.getSizeInBits());
+  assert(EltVT.isSimple() && "Promoting to a non-simple vector type!");
+
+  // Build a new vector type and check if it is legal.
+  MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+
+  DebugLoc dl = Op.getDebugLoc();
+  SmallVector<SDValue, 4> Operands(Op.getNumOperands());
+
+  unsigned Opc = Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND :
+    ISD::SIGN_EXTEND;
+  for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
+    if (Op.getOperand(j).getValueType().isVector())
+      Operands[j] = DAG.getNode(Opc, dl, NVT, Op.getOperand(j));
+    else
+      Operands[j] = Op.getOperand(j);
+  }
+
+  return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), &Operands[0],
+                     Operands.size());
+}
+
+
+SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
+  DebugLoc dl = Op.getDebugLoc();
+  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+  SDValue Chain = LD->getChain();
+  SDValue BasePTR = LD->getBasePtr();
+  EVT SrcVT = LD->getMemoryVT();
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+
+  SmallVector<SDValue, 8> Vals;
+  SmallVector<SDValue, 8> LoadChains;
+  unsigned NumElem = SrcVT.getVectorNumElements();
+
+  EVT SrcEltVT = SrcVT.getScalarType();
+  EVT DstEltVT = Op.getNode()->getValueType(0).getScalarType();
+
+  if (SrcVT.getVectorNumElements() > 1 && !SrcEltVT.isByteSized()) {
+    // When elements in a vector is not byte-addressable, we cannot directly
+    // load each element by advancing pointer, which could only address bytes.
+    // Instead, we load all significant words, mask bits off, and concatenate
+    // them to form each element. Finally, they are extended to destination
+    // scalar type to build the destination vector.
+    EVT WideVT = TLI.getPointerTy();
+
+    assert(WideVT.isRound() &&
+           "Could not handle the sophisticated case when the widest integer is"
+           " not power of 2.");
+    assert(WideVT.bitsGE(SrcEltVT) &&
+           "Type is not legalized?");
+
+    unsigned WideBytes = WideVT.getStoreSize();
+    unsigned Offset = 0;
+    unsigned RemainingBytes = SrcVT.getStoreSize();
+    SmallVector<SDValue, 8> LoadVals;
+
+    while (RemainingBytes > 0) {
+      SDValue ScalarLoad;
+      unsigned LoadBytes = WideBytes;
+
+      if (RemainingBytes >= LoadBytes) {
+        ScalarLoad = DAG.getLoad(WideVT, dl, Chain, BasePTR,
+                                 LD->getPointerInfo().getWithOffset(Offset),
+                                 LD->isVolatile(), LD->isNonTemporal(),
+                                 LD->isInvariant(), LD->getAlignment());
+      } else {
+        EVT LoadVT = WideVT;
+        while (RemainingBytes < LoadBytes) {
+          LoadBytes >>= 1; // Reduce the load size by half.
+          LoadVT = EVT::getIntegerVT(*DAG.getContext(), LoadBytes << 3);
+        }
+        ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
+                                    LD->getPointerInfo().getWithOffset(Offset),
+                                    LoadVT, LD->isVolatile(),
+                                    LD->isNonTemporal(), LD->getAlignment());
+      }
+
+      RemainingBytes -= LoadBytes;
+      Offset += LoadBytes;
+      BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
+                            DAG.getIntPtrConstant(LoadBytes));
+
+      LoadVals.push_back(ScalarLoad.getValue(0));
+      LoadChains.push_back(ScalarLoad.getValue(1));
+    }
+
+    // Extract bits, pack and extend/trunc them into destination type.
+    unsigned SrcEltBits = SrcEltVT.getSizeInBits();
+    SDValue SrcEltBitMask = DAG.getConstant((1U << SrcEltBits) - 1, WideVT);
+
+    unsigned BitOffset = 0;
+    unsigned WideIdx = 0;
+    unsigned WideBits = WideVT.getSizeInBits();
+
+    for (unsigned Idx = 0; Idx != NumElem; ++Idx) {
+      SDValue Lo, Hi, ShAmt;
+
+      if (BitOffset < WideBits) {
+        ShAmt = DAG.getConstant(BitOffset, TLI.getShiftAmountTy(WideVT));
+        Lo = DAG.getNode(ISD::SRL, dl, WideVT, LoadVals[WideIdx], ShAmt);
+        Lo = DAG.getNode(ISD::AND, dl, WideVT, Lo, SrcEltBitMask);
+      }
+
+      BitOffset += SrcEltBits;
+      if (BitOffset >= WideBits) {
+        WideIdx++;
+        Offset -= WideBits;
+        if (Offset > 0) {
+          ShAmt = DAG.getConstant(SrcEltBits - Offset,
+                                  TLI.getShiftAmountTy(WideVT));
+          Hi = DAG.getNode(ISD::SHL, dl, WideVT, LoadVals[WideIdx], ShAmt);
+          Hi = DAG.getNode(ISD::AND, dl, WideVT, Hi, SrcEltBitMask);
+        }
+      }
+
+      if (Hi.getNode())
+        Lo = DAG.getNode(ISD::OR, dl, WideVT, Lo, Hi);
+
+      switch (ExtType) {
+      default: llvm_unreachable("Unknown extended-load op!");
+      case ISD::EXTLOAD:
+        Lo = DAG.getAnyExtOrTrunc(Lo, dl, DstEltVT);
+        break;
+      case ISD::ZEXTLOAD:
+        Lo = DAG.getZExtOrTrunc(Lo, dl, DstEltVT);
+        break;
+      case ISD::SEXTLOAD:
+        ShAmt = DAG.getConstant(WideBits - SrcEltBits,
+                                TLI.getShiftAmountTy(WideVT));
+        Lo = DAG.getNode(ISD::SHL, dl, WideVT, Lo, ShAmt);
+        Lo = DAG.getNode(ISD::SRA, dl, WideVT, Lo, ShAmt);
+        Lo = DAG.getSExtOrTrunc(Lo, dl, DstEltVT);
+        break;
+      }
+      Vals.push_back(Lo);
+    }
+  } else {
+    unsigned Stride = SrcVT.getScalarType().getSizeInBits()/8;
+
+    for (unsigned Idx=0; Idx<NumElem; Idx++) {
+      SDValue ScalarLoad = DAG.getExtLoad(ExtType, dl,
+                Op.getNode()->getValueType(0).getScalarType(),
+                Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride),
+                SrcVT.getScalarType(),
+                LD->isVolatile(), LD->isNonTemporal(),
+                LD->getAlignment());
+
+      BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
+                         DAG.getIntPtrConstant(Stride));
+
+      Vals.push_back(ScalarLoad.getValue(0));
+      LoadChains.push_back(ScalarLoad.getValue(1));
+    }
+  }
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+            &LoadChains[0], LoadChains.size());
+  SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, dl,
+            Op.getNode()->getValueType(0), &Vals[0], Vals.size());
+
+  AddLegalizedOperand(Op.getValue(0), Value);
+  AddLegalizedOperand(Op.getValue(1), NewChain);
+
+  return (Op.getResNo() ? NewChain : Value);
+}
+
+SDValue VectorLegalizer::ExpandStore(SDValue Op) {
+  DebugLoc dl = Op.getDebugLoc();
+  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+  SDValue Chain = ST->getChain();
+  SDValue BasePTR = ST->getBasePtr();
+  SDValue Value = ST->getValue();
+  EVT StVT = ST->getMemoryVT();
+
+  unsigned Alignment = ST->getAlignment();
+  bool isVolatile = ST->isVolatile();
+  bool isNonTemporal = ST->isNonTemporal();
+
+  unsigned NumElem = StVT.getVectorNumElements();
+  // The type of the data we want to save
+  EVT RegVT = Value.getValueType();
+  EVT RegSclVT = RegVT.getScalarType();
+  // The type of data as saved in memory.
+  EVT MemSclVT = StVT.getScalarType();
+
+  // Cast floats into integers
+  unsigned ScalarSize = MemSclVT.getSizeInBits();
+
+  // Round odd types to the next pow of two.
+  if (!isPowerOf2_32(ScalarSize))
+    ScalarSize = NextPowerOf2(ScalarSize);
+
+  // Store Stride in bytes
+  unsigned Stride = ScalarSize/8;
+  // Extract each of the elements from the original vector
+  // and save them into memory individually.
+  SmallVector<SDValue, 8> Stores;
+  for (unsigned Idx = 0; Idx < NumElem; Idx++) {
+    SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+               RegSclVT, Value, DAG.getIntPtrConstant(Idx));
+
+    // This scalar TruncStore may be illegal, but we legalize it later.
+    SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR,
+               ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT,
+               isVolatile, isNonTemporal, Alignment);
+
+    BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
+                                DAG.getIntPtrConstant(Stride));
+
+    Stores.push_back(Store);
+  }
+  SDValue TF =  DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                            &Stores[0], Stores.size());
+  AddLegalizedOperand(Op, TF);
+  return TF;
+}
+
+SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
+  // Lower a select instruction where the condition is a scalar and the
+  // operands are vectors. Lower this select to VSELECT and implement it
+  // using XOR AND OR. The selector bit is broadcasted. 
+  EVT VT = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+
+  SDValue Mask = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+
+  assert(VT.isVector() && !Mask.getValueType().isVector()
+         && Op1.getValueType() == Op2.getValueType() && "Invalid type");
+
+  unsigned NumElem = VT.getVectorNumElements();
+
+  // If we can't even use the basic vector operations of
+  // AND,OR,XOR, we will have to scalarize the op.
+  // Notice that the operation may be 'promoted' which means that it is
+  // 'bitcasted' to another type which is handled.
+  // Also, we need to be able to construct a splat vector using BUILD_VECTOR.
+  if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::OR,  VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::BUILD_VECTOR,  VT) == TargetLowering::Expand)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  // Generate a mask operand.
+  EVT MaskTy = TLI.getSetCCResultType(VT);
+  assert(MaskTy.isVector() && "Invalid CC type");
+  assert(MaskTy.getSizeInBits() == Op1.getValueType().getSizeInBits()
+         && "Invalid mask size");
+
+  // What is the size of each element in the vector mask.
+  EVT BitTy = MaskTy.getScalarType();
+
+  Mask = DAG.getNode(ISD::SELECT, DL, BitTy, Mask,
+          DAG.getConstant(APInt::getAllOnesValue(BitTy.getSizeInBits()), BitTy),
+          DAG.getConstant(0, BitTy));
+
+  // Broadcast the mask so that the entire vector is all-one or all zero.
+  SmallVector<SDValue, 8> Ops(NumElem, Mask);
+  Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, &Ops[0], Ops.size());
+
+  // Bitcast the operands to be the same type as the mask.
+  // This is needed when we select between FP types because
+  // the mask is a vector of integers.
+  Op1 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op1);
+  Op2 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op2);
+
+  SDValue AllOnes = DAG.getConstant(
+            APInt::getAllOnesValue(BitTy.getSizeInBits()), MaskTy);
+  SDValue NotMask = DAG.getNode(ISD::XOR, DL, MaskTy, Mask, AllOnes);
+
+  Op1 = DAG.getNode(ISD::AND, DL, MaskTy, Op1, Mask);
+  Op2 = DAG.getNode(ISD::AND, DL, MaskTy, Op2, NotMask);
+  SDValue Val = DAG.getNode(ISD::OR, DL, MaskTy, Op1, Op2);
+  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
+}
+
+SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) {
+  EVT VT = Op.getValueType();
+
+  // Make sure that the SRA and SHL instructions are available.
+  if (TLI.getOperationAction(ISD::SRA, VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::SHL, VT) == TargetLowering::Expand)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OrigTy = cast<VTSDNode>(Op->getOperand(1))->getVT();
+
+  unsigned BW = VT.getScalarType().getSizeInBits();
+  unsigned OrigBW = OrigTy.getScalarType().getSizeInBits();
+  SDValue ShiftSz = DAG.getConstant(BW - OrigBW, VT);
+
+  Op = Op.getOperand(0);
+  Op =   DAG.getNode(ISD::SHL, DL, VT, Op, ShiftSz);
+  return DAG.getNode(ISD::SRA, DL, VT, Op, ShiftSz);
+}
+
+SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
+  // Implement VSELECT in terms of XOR, AND, OR
+  // on platforms which do not support blend natively.
+  EVT VT =  Op.getOperand(0).getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+
+  SDValue Mask = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+
+  // If we can't even use the basic vector operations of
+  // AND,OR,XOR, we will have to scalarize the op.
+  // Notice that the operation may be 'promoted' which means that it is
+  // 'bitcasted' to another type which is handled.
+  // This operation also isn't safe with AND, OR, XOR when the boolean
+  // type is 0/1 as we need an all ones vector constant to mask with.
+  // FIXME: Sign extend 1 to all ones if thats legal on the target.
+  if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::OR,  VT) == TargetLowering::Expand ||
+      TLI.getBooleanContents(true) !=
+      TargetLowering::ZeroOrNegativeOneBooleanContent)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  assert(VT.getSizeInBits() == Op1.getValueType().getSizeInBits()
+         && "Invalid mask size");
+  // Bitcast the operands to be the same type as the mask.
+  // This is needed when we select between FP types because
+  // the mask is a vector of integers.
+  Op1 = DAG.getNode(ISD::BITCAST, DL, VT, Op1);
+  Op2 = DAG.getNode(ISD::BITCAST, DL, VT, Op2);
+
+  SDValue AllOnes = DAG.getConstant(
+    APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()), VT);
+  SDValue NotMask = DAG.getNode(ISD::XOR, DL, VT, Mask, AllOnes);
+
+  Op1 = DAG.getNode(ISD::AND, DL, VT, Op1, Mask);
+  Op2 = DAG.getNode(ISD::AND, DL, VT, Op2, NotMask);
+  SDValue Val = DAG.getNode(ISD::OR, DL, VT, Op1, Op2);
+  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
+}
+
+SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
+  EVT VT = Op.getOperand(0).getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+
+  // Make sure that the SINT_TO_FP and SRL instructions are available.
+  if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand ||
+      TLI.getOperationAction(ISD::SRL,        VT) == TargetLowering::Expand)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+ EVT SVT = VT.getScalarType();
+  assert((SVT.getSizeInBits() == 64 || SVT.getSizeInBits() == 32) &&
+      "Elements in vector-UINT_TO_FP must be 32 or 64 bits wide");
+
+  unsigned BW = SVT.getSizeInBits();
+  SDValue HalfWord = DAG.getConstant(BW/2, VT);
+
+  // Constants to clear the upper part of the word.
+  // Notice that we can also use SHL+SHR, but using a constant is slightly
+  // faster on x86.
+  uint64_t HWMask = (SVT.getSizeInBits()==64)?0x00000000FFFFFFFF:0x0000FFFF;
+  SDValue HalfWordMask = DAG.getConstant(HWMask, VT);
+
+  // Two to the power of half-word-size.
+  SDValue TWOHW = DAG.getConstantFP((1<<(BW/2)), Op.getValueType());
+
+  // Clear upper part of LO, lower HI
+  SDValue HI = DAG.getNode(ISD::SRL, DL, VT, Op.getOperand(0), HalfWord);
+  SDValue LO = DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), HalfWordMask);
+
+  // Convert hi and lo to floats
+  // Convert the hi part back to the upper values
+  SDValue fHI = DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), HI);
+          fHI = DAG.getNode(ISD::FMUL, DL, Op.getValueType(), fHI, TWOHW);
+  SDValue fLO = DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), LO);
+
+  // Add the two halves
+  return DAG.getNode(ISD::FADD, DL, Op.getValueType(), fHI, fLO);
+}
+
+
+SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
+  if (TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) {
+    SDValue Zero = DAG.getConstantFP(-0.0, Op.getValueType());
+    return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+                       Zero, Op.getOperand(0));
+  }
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1), CC = Op.getOperand(2);
+  EVT TmpEltVT = LHS.getValueType().getVectorElementType();
+  DebugLoc dl = Op.getDebugLoc();
+  SmallVector<SDValue, 8> Ops(NumElems);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+                                  DAG.getIntPtrConstant(i));
+    SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+                                  DAG.getIntPtrConstant(i));
+    Ops[i] = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(TmpEltVT),
+                         LHSElem, RHSElem, CC);
+    Ops[i] = DAG.getNode(ISD::SELECT, dl, EltVT, Ops[i],
+                         DAG.getConstant(APInt::getAllOnesValue
+                                         (EltVT.getSizeInBits()), EltVT),
+                         DAG.getConstant(0, EltVT));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElems);
+}
+
+}
+
+bool SelectionDAG::LegalizeVectors() {
+  return VectorLegalizer(*this).Run();
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
new file mode 100644
index 000000000000..5ec853563888
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -0,0 +1,2795 @@
+//===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file performs vector type splitting and scalarization for LegalizeTypes.
+// Scalarization is the act of changing a computation in an illegal one-element
+// vector type to be a computation in its scalar element type.  For example,
+// implementing <1 x f32> arithmetic in a scalar f32 register.  This is needed
+// as a base case when scalarizing vector arithmetic like <4 x f32>, which
+// eventually decomposes to scalars if the target doesn't support v4f32 or v2f32
+// types.
+// Splitting is the act of changing a computation in an invalid vector type to
+// be a computation in two vectors of half the size.  For example, implementing
+// <128 x f32> operations in terms of two <64 x f32> operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  Result Vector Scalarization: <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Scalarize node result " << ResNo << ": ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue R = SDValue();
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "ScalarizeVectorResult #" << ResNo << ": ";
+    N->dump(&DAG);
+    dbgs() << "\n";
+#endif
+    report_fatal_error("Do not know how to scalarize the result of this "
+                       "operator!\n");
+
+  case ISD::MERGE_VALUES:      R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break;
+  case ISD::BITCAST:           R = ScalarizeVecRes_BITCAST(N); break;
+  case ISD::BUILD_VECTOR:      R = ScalarizeVecRes_BUILD_VECTOR(N); break;
+  case ISD::CONVERT_RNDSAT:    R = ScalarizeVecRes_CONVERT_RNDSAT(N); break;
+  case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::FP_ROUND:          R = ScalarizeVecRes_FP_ROUND(N); break;
+  case ISD::FP_ROUND_INREG:    R = ScalarizeVecRes_InregOp(N); break;
+  case ISD::FPOWI:             R = ScalarizeVecRes_FPOWI(N); break;
+  case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::LOAD:           R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
+  case ISD::SCALAR_TO_VECTOR:  R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
+  case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
+  case ISD::VSELECT:           R = ScalarizeVecRes_VSELECT(N); break;
+  case ISD::SELECT:            R = ScalarizeVecRes_SELECT(N); break;
+  case ISD::SELECT_CC:         R = ScalarizeVecRes_SELECT_CC(N); break;
+  case ISD::SETCC:             R = ScalarizeVecRes_SETCC(N); break;
+  case ISD::UNDEF:             R = ScalarizeVecRes_UNDEF(N); break;
+  case ISD::VECTOR_SHUFFLE:    R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
+  case ISD::ANY_EXTEND:
+  case ISD::CTLZ:
+  case ISD::CTPOP:
+  case ISD::CTTZ:
+  case ISD::FABS:
+  case ISD::FCEIL:
+  case ISD::FCOS:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FFLOOR:
+  case ISD::FLOG:
+  case ISD::FLOG10:
+  case ISD::FLOG2:
+  case ISD::FNEARBYINT:
+  case ISD::FNEG:
+  case ISD::FP_EXTEND:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::FRINT:
+  case ISD::FSIN:
+  case ISD::FSQRT:
+  case ISD::FTRUNC:
+  case ISD::SIGN_EXTEND:
+  case ISD::SINT_TO_FP:
+  case ISD::TRUNCATE:
+  case ISD::UINT_TO_FP:
+  case ISD::ZERO_EXTEND:
+    R = ScalarizeVecRes_UnaryOp(N);
+    break;
+
+  case ISD::ADD:
+  case ISD::AND:
+  case ISD::FADD:
+  case ISD::FDIV:
+  case ISD::FMUL:
+  case ISD::FPOW:
+  case ISD::FREM:
+  case ISD::FSUB:
+  case ISD::MUL:
+  case ISD::OR:
+  case ISD::SDIV:
+  case ISD::SREM:
+  case ISD::SUB:
+  case ISD::UDIV:
+  case ISD::UREM:
+  case ISD::XOR:
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    R = ScalarizeVecRes_BinOp(N);
+    break;
+  case ISD::FMA:
+    R = ScalarizeVecRes_TernaryOp(N);
+    break;
+  }
+
+  // If R is null, the sub-method took care of registering the result.
+  if (R.getNode())
+    SetScalarizedVector(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  SDValue RHS = GetScalarizedVector(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
+  SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+  SDValue Op1 = GetScalarizedVector(N->getOperand(1));
+  SDValue Op2 = GetScalarizedVector(N->getOperand(2));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     Op0.getValueType(), Op0, Op1, Op2);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
+                                                       unsigned ResNo) {
+  SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+  return GetScalarizedVector(Op);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
+  EVT NewVT = N->getValueType(0).getVectorElementType();
+  return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+                     NewVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) {
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  SDValue InOp = N->getOperand(0);
+  // The BUILD_VECTOR operands may be of wider element types and
+  // we may need to truncate them back to the requested return type.
+  if (EltVT.isInteger())
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp);
+  return InOp;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) {
+  EVT NewVT = N->getValueType(0).getVectorElementType();
+  SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+  return DAG.getConvertRndSat(NewVT, N->getDebugLoc(),
+                              Op0, DAG.getValueType(NewVT),
+                              DAG.getValueType(Op0.getValueType()),
+                              N->getOperand(3),
+                              N->getOperand(4),
+                              cast<CvtRndSatSDNode>(N)->getCvtCode());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+                     N->getValueType(0).getVectorElementType(),
+                     N->getOperand(0), N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) {
+  EVT NewVT = N->getValueType(0).getVectorElementType();
+  SDValue Op = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(),
+                     NewVT, Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) {
+  SDValue Op = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(ISD::FPOWI, N->getDebugLoc(),
+                     Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+  // The value to insert may have a wider type than the vector element type,
+  // so be sure to truncate it to the element type if necessary.
+  SDValue Op = N->getOperand(1);
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  if (Op.getValueType() != EltVT)
+    // FIXME: Can this happen for floating point types?
+    Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, Op);
+  return Op;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
+  assert(N->isUnindexed() && "Indexed vector load?");
+
+  SDValue Result = DAG.getLoad(ISD::UNINDEXED,
+                               N->getExtensionType(),
+                               N->getValueType(0).getVectorElementType(),
+                               N->getDebugLoc(),
+                               N->getChain(), N->getBasePtr(),
+                               DAG.getUNDEF(N->getBasePtr().getValueType()),
+                               N->getPointerInfo(),
+                               N->getMemoryVT().getVectorElementType(),
+                               N->isVolatile(), N->isNonTemporal(),
+                               N->isInvariant(), N->getOriginalAlignment());
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+  return Result;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
+  // Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
+  EVT DestVT = N->getValueType(0).getVectorElementType();
+  SDValue Op = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), DestVT, Op);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  EVT ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT().getVectorElementType();
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), EltVT,
+                     LHS, DAG.getValueType(ExtVT));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
+  // If the operand is wider than the vector element type then it is implicitly
+  // truncated.  Make that explicit here.
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  SDValue InOp = N->getOperand(0);
+  if (InOp.getValueType() != EltVT)
+    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp);
+  return InOp;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
+  SDValue Cond = GetScalarizedVector(N->getOperand(0));
+  SDValue LHS = GetScalarizedVector(N->getOperand(1));
+  TargetLowering::BooleanContent ScalarBool = TLI.getBooleanContents(false);
+  TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true);
+  if (ScalarBool != VecBool) {
+    EVT CondVT = Cond.getValueType();
+    switch (ScalarBool) {
+      case TargetLowering::UndefinedBooleanContent:
+        break;
+      case TargetLowering::ZeroOrOneBooleanContent:
+        assert(VecBool == TargetLowering::UndefinedBooleanContent ||
+               VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent);
+        // Vector read from all ones, scalar expects a single 1 so mask.
+        Cond = DAG.getNode(ISD::AND, N->getDebugLoc(), CondVT,
+                           Cond, DAG.getConstant(1, CondVT));
+        break;
+      case TargetLowering::ZeroOrNegativeOneBooleanContent:
+        assert(VecBool == TargetLowering::UndefinedBooleanContent ||
+               VecBool == TargetLowering::ZeroOrOneBooleanContent);
+        // Vector reads from a one, scalar from all ones so sign extend.
+        Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), CondVT,
+                           Cond, DAG.getValueType(MVT::i1));
+        break;
+    }
+  }
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+                     LHS.getValueType(), Cond, LHS,
+                     GetScalarizedVector(N->getOperand(2)));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
+  SDValue LHS = GetScalarizedVector(N->getOperand(1));
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
+                     LHS.getValueType(), N->getOperand(0), LHS,
+                     GetScalarizedVector(N->getOperand(2)));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
+  SDValue LHS = GetScalarizedVector(N->getOperand(2));
+  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), LHS.getValueType(),
+                     N->getOperand(0), N->getOperand(1),
+                     LHS, GetScalarizedVector(N->getOperand(3)),
+                     N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
+  assert(N->getValueType(0).isVector() ==
+         N->getOperand(0).getValueType().isVector() &&
+         "Scalar/Vector type mismatch");
+
+  if (N->getValueType(0).isVector()) return ScalarizeVecRes_VSETCC(N);
+
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  SDValue RHS = GetScalarizedVector(N->getOperand(1));
+  DebugLoc DL = N->getDebugLoc();
+
+  // Turn it into a scalar SETCC.
+  return DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) {
+  return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
+  // Figure out if the scalar is the LHS or RHS and return it.
+  SDValue Arg = N->getOperand(2).getOperand(0);
+  if (Arg.getOpcode() == ISD::UNDEF)
+    return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
+  unsigned Op = !cast<ConstantSDNode>(Arg)->isNullValue();
+  return GetScalarizedVector(N->getOperand(Op));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
+  assert(N->getValueType(0).isVector() &&
+         N->getOperand(0).getValueType().isVector() &&
+         "Operand types must be vectors");
+
+  SDValue LHS = GetScalarizedVector(N->getOperand(0));
+  SDValue RHS = GetScalarizedVector(N->getOperand(1));
+  EVT NVT = N->getValueType(0).getVectorElementType();
+  DebugLoc DL = N->getDebugLoc();
+
+  // Turn it into a scalar SETCC.
+  SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
+                            N->getOperand(2));
+  // Vectors may have a different boolean contents to scalars.  Promote the
+  // value appropriately.
+  ISD::NodeType ExtendCode =
+    TargetLowering::getExtendForContent(TLI.getBooleanContents(true));
+  return DAG.getNode(ExtendCode, DL, NVT, Res);
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Operand Vector Scalarization <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (Res.getNode() == 0) {
+    switch (N->getOpcode()) {
+    default:
+#ifndef NDEBUG
+      dbgs() << "ScalarizeVectorOperand Op #" << OpNo << ": ";
+      N->dump(&DAG);
+      dbgs() << "\n";
+#endif
+      llvm_unreachable("Do not know how to scalarize this operator's operand!");
+    case ISD::BITCAST:
+      Res = ScalarizeVecOp_BITCAST(N);
+      break;
+    case ISD::ANY_EXTEND:
+    case ISD::ZERO_EXTEND:
+    case ISD::SIGN_EXTEND:
+      Res = ScalarizeVecOp_EXTEND(N);
+      break;
+    case ISD::CONCAT_VECTORS:
+      Res = ScalarizeVecOp_CONCAT_VECTORS(N);
+      break;
+    case ISD::EXTRACT_VECTOR_ELT:
+      Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N);
+      break;
+    case ISD::STORE:
+      Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo);
+      break;
+    }
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+/// ScalarizeVecOp_BITCAST - If the value to convert is a vector that needs
+/// to be scalarized, it must be <1 x ty>.  Convert the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
+  SDValue Elt = GetScalarizedVector(N->getOperand(0));
+  return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+                     N->getValueType(0), Elt);
+}
+
+/// ScalarizeVecOp_EXTEND - If the value to extend is a vector that needs
+/// to be scalarized, it must be <1 x ty>.  Extend the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTEND(SDNode *N) {
+  assert(N->getValueType(0).getVectorNumElements() == 1 &&
+         "Unexected vector type!");
+  SDValue Elt = GetScalarizedVector(N->getOperand(0));
+  SmallVector<SDValue, 1> Ops(1);
+  Ops[0] = DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                       N->getValueType(0).getScalarType(), Elt);
+  // Revectorize the result so the types line up with what the uses of this
+  // expression expect.
+  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), N->getValueType(0),
+                     &Ops[0], 1);
+}
+
+/// ScalarizeVecOp_CONCAT_VECTORS - The vectors to concatenate have length one -
+/// use a BUILD_VECTOR instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
+  SmallVector<SDValue, 8> Ops(N->getNumOperands());
+  for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
+    Ops[i] = GetScalarizedVector(N->getOperand(i));
+  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), N->getValueType(0),
+                     &Ops[0], Ops.size());
+}
+
+/// ScalarizeVecOp_EXTRACT_VECTOR_ELT - If the input is a vector that needs to
+/// be scalarized, it must be <1 x ty>, so just return the element, ignoring the
+/// index.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  SDValue Res = GetScalarizedVector(N->getOperand(0));
+  if (Res.getValueType() != N->getValueType(0))
+    Res = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), N->getValueType(0),
+                      Res);
+  return Res;
+}
+
+/// ScalarizeVecOp_STORE - If the value to store is a vector that needs to be
+/// scalarized, it must be <1 x ty>.  Just store the element.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
+  assert(N->isUnindexed() && "Indexed store of one-element vector?");
+  assert(OpNo == 1 && "Do not know how to scalarize this operand!");
+  DebugLoc dl = N->getDebugLoc();
+
+  if (N->isTruncatingStore())
+    return DAG.getTruncStore(N->getChain(), dl,
+                             GetScalarizedVector(N->getOperand(1)),
+                             N->getBasePtr(), N->getPointerInfo(),
+                             N->getMemoryVT().getVectorElementType(),
+                             N->isVolatile(), N->isNonTemporal(),
+                             N->getAlignment());
+
+  return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
+                      N->getBasePtr(), N->getPointerInfo(),
+                      N->isVolatile(), N->isNonTemporal(),
+                      N->getOriginalAlignment());
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Result Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// SplitVectorResult - This method is called when the specified result of the
+/// specified node is found to need vector splitting.  At this point, the node
+/// may also have invalid operands or may have other results that need
+/// legalization, we just know that (at least) one result needs vector
+/// splitting.
+void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Split node result: ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Lo, Hi;
+
+  // See if the target wants to custom expand this node.
+  if (CustomLowerNode(N, N->getValueType(ResNo), true))
+    return;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "SplitVectorResult #" << ResNo << ": ";
+    N->dump(&DAG);
+    dbgs() << "\n";
+#endif
+    report_fatal_error("Do not know how to split the result of this "
+                       "operator!\n");
+
+  case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+  case ISD::VSELECT:
+  case ISD::SELECT:       SplitRes_SELECT(N, Lo, Hi); break;
+  case ISD::SELECT_CC:    SplitRes_SELECT_CC(N, Lo, Hi); break;
+  case ISD::UNDEF:        SplitRes_UNDEF(N, Lo, Hi); break;
+  case ISD::BITCAST:           SplitVecRes_BITCAST(N, Lo, Hi); break;
+  case ISD::BUILD_VECTOR:      SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
+  case ISD::CONCAT_VECTORS:    SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
+  case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
+  case ISD::FP_ROUND_INREG:    SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::FPOWI:             SplitVecRes_FPOWI(N, Lo, Hi); break;
+  case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
+  case ISD::SCALAR_TO_VECTOR:  SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break;
+  case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+  case ISD::LOAD:
+    SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
+    break;
+  case ISD::SETCC:
+    SplitVecRes_SETCC(N, Lo, Hi);
+    break;
+  case ISD::VECTOR_SHUFFLE:
+    SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
+    break;
+
+  case ISD::ANY_EXTEND:
+  case ISD::CONVERT_RNDSAT:
+  case ISD::CTLZ:
+  case ISD::CTTZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+  case ISD::CTTZ_ZERO_UNDEF:
+  case ISD::CTPOP:
+  case ISD::FABS:
+  case ISD::FCEIL:
+  case ISD::FCOS:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FFLOOR:
+  case ISD::FLOG:
+  case ISD::FLOG10:
+  case ISD::FLOG2:
+  case ISD::FNEARBYINT:
+  case ISD::FNEG:
+  case ISD::FP_EXTEND:
+  case ISD::FP_ROUND:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::FRINT:
+  case ISD::FSIN:
+  case ISD::FSQRT:
+  case ISD::FTRUNC:
+  case ISD::SIGN_EXTEND:
+  case ISD::SINT_TO_FP:
+  case ISD::TRUNCATE:
+  case ISD::UINT_TO_FP:
+  case ISD::ZERO_EXTEND:
+    SplitVecRes_UnaryOp(N, Lo, Hi);
+    break;
+
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::FDIV:
+  case ISD::FPOW:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::UREM:
+  case ISD::SREM:
+  case ISD::FREM:
+    SplitVecRes_BinOp(N, Lo, Hi);
+    break;
+  case ISD::FMA:
+    SplitVecRes_TernaryOp(N, Lo, Hi);
+    break;
+  }
+
+  // If Lo/Hi is null, the sub-method took care of registering results etc.
+  if (Lo.getNode())
+    SetSplitVector(SDValue(N, ResNo), Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
+                                         SDValue &Hi) {
+  SDValue LHSLo, LHSHi;
+  GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+  SDValue RHSLo, RHSHi;
+  GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
+  DebugLoc dl = N->getDebugLoc();
+
+  Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo, RHSLo);
+  Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
+                                             SDValue &Hi) {
+  SDValue Op0Lo, Op0Hi;
+  GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
+  SDValue Op1Lo, Op1Hi;
+  GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
+  SDValue Op2Lo, Op2Hi;
+  GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi);
+  DebugLoc dl = N->getDebugLoc();
+
+  Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(),
+                   Op0Lo, Op1Lo, Op2Lo);
+  Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(),
+                   Op0Hi, Op1Hi, Op2Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  // We know the result is a vector.  The input may be either a vector or a
+  // scalar value.
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
+
+  // Handle some special cases efficiently.
+  switch (getTypeAction(InVT)) {
+  case TargetLowering::TypeLegal:
+  case TargetLowering::TypePromoteInteger:
+  case TargetLowering::TypeSoftenFloat:
+  case TargetLowering::TypeScalarizeVector:
+  case TargetLowering::TypeWidenVector:
+    break;
+  case TargetLowering::TypeExpandInteger:
+  case TargetLowering::TypeExpandFloat:
+    // A scalar to vector conversion, where the scalar needs expansion.
+    // If the vector is being split in two then we can just convert the
+    // expanded pieces.
+    if (LoVT == HiVT) {
+      GetExpandedOp(InOp, Lo, Hi);
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+      Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+      Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+      return;
+    }
+    break;
+  case TargetLowering::TypeSplitVector:
+    // If the input is a vector that needs to be split, convert each split
+    // piece of the input now.
+    GetSplitVector(InOp, Lo, Hi);
+    Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+    Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+    return;
+  }
+
+  // In the general case, convert the input to an integer and split it by hand.
+  EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+  EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+  if (TLI.isBigEndian())
+    std::swap(LoIntVT, HiIntVT);
+
+  SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi);
+
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+  Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+  Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
+                                                SDValue &Hi) {
+  EVT LoVT, HiVT;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  unsigned LoNumElts = LoVT.getVectorNumElements();
+  SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
+  Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, &LoOps[0], LoOps.size());
+
+  SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
+  Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, &HiOps[0], HiOps.size());
+}
+
+void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
+                                                  SDValue &Hi) {
+  assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS");
+  DebugLoc dl = N->getDebugLoc();
+  unsigned NumSubvectors = N->getNumOperands() / 2;
+  if (NumSubvectors == 1) {
+    Lo = N->getOperand(0);
+    Hi = N->getOperand(1);
+    return;
+  }
+
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+  SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
+  Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, &LoOps[0], LoOps.size());
+
+  SmallVector<SDValue, 8> HiOps(N->op_begin()+NumSubvectors, N->op_end());
+  Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, &HiOps[0], HiOps.size());
+}
+
+void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
+                                                     SDValue &Hi) {
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
+  uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
+                   DAG.getIntPtrConstant(IdxVal + LoVT.getVectorNumElements()));
+}
+
+void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
+                                         SDValue &Hi) {
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+  Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1));
+  Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1));
+}
+
+void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  SDValue LHSLo, LHSHi;
+  GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT LoVT, HiVT;
+  GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT(), LoVT, HiVT);
+
+  Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
+                   DAG.getValueType(LoVT));
+  Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi,
+                   DAG.getValueType(HiVT));
+}
+
+void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+                                                     SDValue &Hi) {
+  SDValue Vec = N->getOperand(0);
+  SDValue Elt = N->getOperand(1);
+  SDValue Idx = N->getOperand(2);
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitVector(Vec, Lo, Hi);
+
+  if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+    unsigned IdxVal = CIdx->getZExtValue();
+    unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
+    if (IdxVal < LoNumElts)
+      Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+                       Lo.getValueType(), Lo, Elt, Idx);
+    else
+      Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
+                       DAG.getIntPtrConstant(IdxVal - LoNumElts));
+    return;
+  }
+
+  // Spill the vector to the stack.
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
+                               MachinePointerInfo(), false, false, 0);
+
+  // Store the new element.  This may be larger than the vector element type,
+  // so use a truncating store.
+  SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
+  Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
+  unsigned Alignment =
+    TLI.getDataLayout()->getPrefTypeAlignment(VecType);
+  Store = DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo(), EltVT,
+                            false, false, 0);
+
+  // Load the Lo part from the stack slot.
+  Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
+                   false, false, false, 0);
+
+  // Increment the pointer to the other part.
+  unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
+  StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                         DAG.getIntPtrConstant(IncrementSize));
+
+  // Load the Hi part from the stack slot.
+  Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
+                   false, false, false, MinAlign(Alignment, IncrementSize));
+}
+
+void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
+                                                    SDValue &Hi) {
+  EVT LoVT, HiVT;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
+  Hi = DAG.getUNDEF(HiVT);
+}
+
+void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
+                                        SDValue &Hi) {
+  assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
+  EVT LoVT, HiVT;
+  DebugLoc dl = LD->getDebugLoc();
+  GetSplitDestVTs(LD->getValueType(0), LoVT, HiVT);
+
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  SDValue Ch = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+  EVT MemoryVT = LD->getMemoryVT();
+  unsigned Alignment = LD->getOriginalAlignment();
+  bool isVolatile = LD->isVolatile();
+  bool isNonTemporal = LD->isNonTemporal();
+  bool isInvariant = LD->isInvariant();
+
+  EVT LoMemVT, HiMemVT;
+  GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+
+  Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
+                   LD->getPointerInfo(), LoMemVT, isVolatile, isNonTemporal,
+                   isInvariant, Alignment);
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getIntPtrConstant(IncrementSize));
+  Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
+                   LD->getPointerInfo().getWithOffset(IncrementSize),
+                   HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment);
+
+  // Build a factor node to remember that this load is independent of the
+  // other one.
+  Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                   Hi.getValue(1));
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(LD, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
+  assert(N->getValueType(0).isVector() &&
+         N->getOperand(0).getValueType().isVector() &&
+         "Operand types must be vectors");
+
+  EVT LoVT, HiVT;
+  DebugLoc DL = N->getDebugLoc();
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+  // Split the input.
+  EVT InVT = N->getOperand(0).getValueType();
+  SDValue LL, LH, RL, RH;
+  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                               LoVT.getVectorNumElements());
+  LL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
+                   DAG.getIntPtrConstant(0));
+  LH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
+                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+
+  RL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
+                   DAG.getIntPtrConstant(0));
+  RH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
+                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+}
+
+void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
+                                           SDValue &Hi) {
+  // Get the dest types - they may not match the input types, e.g. int_to_fp.
+  EVT LoVT, HiVT;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+
+  // If the input also splits, handle it directly for a compile time speedup.
+  // Otherwise split it by hand.
+  EVT InVT = N->getOperand(0).getValueType();
+  if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) {
+    GetSplitVector(N->getOperand(0), Lo, Hi);
+  } else {
+    EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
+                                 LoVT.getVectorNumElements());
+    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+                     DAG.getIntPtrConstant(0));
+    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
+                     DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+  }
+
+  if (N->getOpcode() == ISD::FP_ROUND) {
+    Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1));
+    Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1));
+  } else if (N->getOpcode() == ISD::CONVERT_RNDSAT) {
+    SDValue DTyOpLo = DAG.getValueType(LoVT);
+    SDValue DTyOpHi = DAG.getValueType(HiVT);
+    SDValue STyOpLo = DAG.getValueType(Lo.getValueType());
+    SDValue STyOpHi = DAG.getValueType(Hi.getValueType());
+    SDValue RndOp = N->getOperand(3);
+    SDValue SatOp = N->getOperand(4);
+    ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+    Lo = DAG.getConvertRndSat(LoVT, dl, Lo, DTyOpLo, STyOpLo, RndOp, SatOp,
+                              CvtCode);
+    Hi = DAG.getConvertRndSat(HiVT, dl, Hi, DTyOpHi, STyOpHi, RndOp, SatOp,
+                              CvtCode);
+  } else {
+    Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+    Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+  }
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
+                                                  SDValue &Lo, SDValue &Hi) {
+  // The low and high parts of the original input give four input vectors.
+  SDValue Inputs[4];
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]);
+  GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]);
+  EVT NewVT = Inputs[0].getValueType();
+  unsigned NewElts = NewVT.getVectorNumElements();
+
+  // If Lo or Hi uses elements from at most two of the four input vectors, then
+  // express it as a vector shuffle of those two inputs.  Otherwise extract the
+  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
+  SmallVector<int, 16> Ops;
+  for (unsigned High = 0; High < 2; ++High) {
+    SDValue &Output = High ? Hi : Lo;
+
+    // Build a shuffle mask for the output, discovering on the fly which
+    // input vectors to use as shuffle operands (recorded in InputUsed).
+    // If building a suitable shuffle vector proves too hard, then bail
+    // out with useBuildVector set.
+    unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered.
+    unsigned FirstMaskIdx = High * NewElts;
+    bool useBuildVector = false;
+    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+      // The mask element.  This indexes into the input.
+      int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
+
+      // The input vector this mask element indexes into.
+      unsigned Input = (unsigned)Idx / NewElts;
+
+      if (Input >= array_lengthof(Inputs)) {
+        // The mask element does not index into any input vector.
+        Ops.push_back(-1);
+        continue;
+      }
+
+      // Turn the index into an offset from the start of the input vector.
+      Idx -= Input * NewElts;
+
+      // Find or create a shuffle vector operand to hold this input.
+      unsigned OpNo;
+      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+        if (InputUsed[OpNo] == Input) {
+          // This input vector is already an operand.
+          break;
+        } else if (InputUsed[OpNo] == -1U) {
+          // Create a new operand for this input vector.
+          InputUsed[OpNo] = Input;
+          break;
+        }
+      }
+
+      if (OpNo >= array_lengthof(InputUsed)) {
+        // More than two input vectors used!  Give up on trying to create a
+        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
+        useBuildVector = true;
+        break;
+      }
+
+      // Add the mask index for the new shuffle vector.
+      Ops.push_back(Idx + OpNo * NewElts);
+    }
+
+    if (useBuildVector) {
+      EVT EltVT = NewVT.getVectorElementType();
+      SmallVector<SDValue, 16> SVOps;
+
+      // Extract the input elements by hand.
+      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+        // The mask element.  This indexes into the input.
+        int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
+
+        // The input vector this mask element indexes into.
+        unsigned Input = (unsigned)Idx / NewElts;
+
+        if (Input >= array_lengthof(Inputs)) {
+          // The mask element is "undef" or indexes off the end of the input.
+          SVOps.push_back(DAG.getUNDEF(EltVT));
+          continue;
+        }
+
+        // Turn the index into an offset from the start of the input vector.
+        Idx -= Input * NewElts;
+
+        // Extract the vector element by hand.
+        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                    Inputs[Input], DAG.getIntPtrConstant(Idx)));
+      }
+
+      // Construct the Lo/Hi output using a BUILD_VECTOR.
+      Output = DAG.getNode(ISD::BUILD_VECTOR,dl,NewVT, &SVOps[0], SVOps.size());
+    } else if (InputUsed[0] == -1U) {
+      // No input vectors were used!  The result is undefined.
+      Output = DAG.getUNDEF(NewVT);
+    } else {
+      SDValue Op0 = Inputs[InputUsed[0]];
+      // If only one input was used, use an undefined vector for the other.
+      SDValue Op1 = InputUsed[1] == -1U ?
+        DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
+      // At least one input vector was used.  Create a new shuffle vector.
+      Output =  DAG.getVectorShuffle(NewVT, dl, Op0, Op1, &Ops[0]);
+    }
+
+    Ops.clear();
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Operand Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// SplitVectorOperand - This method is called when the specified operand of the
+/// specified node is found to need vector splitting.  At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need legalization as well as the specified one.
+bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Split node operand: ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  if (Res.getNode() == 0) {
+    switch (N->getOpcode()) {
+    default:
+#ifndef NDEBUG
+      dbgs() << "SplitVectorOperand Op #" << OpNo << ": ";
+      N->dump(&DAG);
+      dbgs() << "\n";
+#endif
+      report_fatal_error("Do not know how to split this operator's "
+                         "operand!\n");
+
+    case ISD::SETCC:             Res = SplitVecOp_VSETCC(N); break;
+    case ISD::BITCAST:           Res = SplitVecOp_BITCAST(N); break;
+    case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
+    case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
+    case ISD::CONCAT_VECTORS:    Res = SplitVecOp_CONCAT_VECTORS(N); break;
+    case ISD::FP_ROUND:          Res = SplitVecOp_FP_ROUND(N); break;
+    case ISD::STORE:
+      Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
+      break;
+    case ISD::VSELECT:
+      Res = SplitVecOp_VSELECT(N, OpNo);
+      break;
+    case ISD::CTTZ:
+    case ISD::CTLZ:
+    case ISD::CTPOP:
+    case ISD::FP_EXTEND:
+    case ISD::FP_TO_SINT:
+    case ISD::FP_TO_UINT:
+    case ISD::SINT_TO_FP:
+    case ISD::UINT_TO_FP:
+    case ISD::FTRUNC:
+    case ISD::TRUNCATE:
+    case ISD::SIGN_EXTEND:
+    case ISD::ZERO_EXTEND:
+    case ISD::ANY_EXTEND:
+      Res = SplitVecOp_UnaryOp(N);
+      break;
+    }
+  }
+
+  // If the result is null, the sub-method took care of registering results etc.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
+  // The only possibility for an illegal operand is the mask, since result type
+  // legalization would have handled this node already otherwise.
+  assert(OpNo == 0 && "Illegal operand must be mask");
+
+  SDValue Mask = N->getOperand(0);
+  SDValue Src0 = N->getOperand(1);
+  SDValue Src1 = N->getOperand(2);
+  DebugLoc DL = N->getDebugLoc();
+  EVT MaskVT = Mask.getValueType();
+  assert(MaskVT.isVector() && "VSELECT without a vector mask?");
+
+  SDValue Lo, Hi;
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+  assert(Lo.getValueType() == Hi.getValueType() &&
+         "Lo and Hi have differing types");;
+
+  unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
+  unsigned HiNumElts = Hi.getValueType().getVectorNumElements();
+  assert(LoNumElts == HiNumElts && "Asymmetric vector split?");
+
+  LLVMContext &Ctx = *DAG.getContext();
+  SDValue Zero = DAG.getIntPtrConstant(0);
+  SDValue LoElts = DAG.getIntPtrConstant(LoNumElts);
+  EVT Src0VT = Src0.getValueType();
+  EVT Src0EltTy = Src0VT.getVectorElementType();
+  EVT MaskEltTy = MaskVT.getVectorElementType();
+
+  EVT LoOpVT = EVT::getVectorVT(Ctx, Src0EltTy, LoNumElts);
+  EVT LoMaskVT = EVT::getVectorVT(Ctx, MaskEltTy, LoNumElts);
+  EVT HiOpVT = EVT::getVectorVT(Ctx, Src0EltTy, HiNumElts);
+  EVT HiMaskVT = EVT::getVectorVT(Ctx, MaskEltTy, HiNumElts);
+
+  SDValue LoOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoOpVT, Src0, Zero);
+  SDValue LoOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoOpVT, Src1, Zero);
+
+  SDValue HiOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiOpVT, Src0, LoElts);
+  SDValue HiOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiOpVT, Src1, LoElts);
+
+  SDValue LoMask =
+    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoMaskVT, Mask, Zero);
+  SDValue HiMask =
+    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiMaskVT, Mask, LoElts);
+
+  SDValue LoSelect =
+    DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1);
+  SDValue HiSelect =
+    DAG.getNode(ISD::VSELECT, DL, HiOpVT, HiMask, HiOp0, HiOp1);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
+  // The result has a legal vector type, but the input needs splitting.
+  EVT ResVT = N->getValueType(0);
+  SDValue Lo, Hi;
+  DebugLoc dl = N->getDebugLoc();
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+  EVT InVT = Lo.getValueType();
+
+  EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
+                               InVT.getVectorNumElements());
+
+  Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo);
+  Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) {
+  // For example, i64 = BITCAST v4i16 on alpha.  Typically the vector will
+  // end up being split all the way down to individual components.  Convert the
+  // split pieces into integers and reassemble.
+  SDValue Lo, Hi;
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+  Lo = BitConvertToInteger(Lo);
+  Hi = BitConvertToInteger(Hi);
+
+  if (TLI.isBigEndian())
+    std::swap(Lo, Hi);
+
+  return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getValueType(0),
+                     JoinIntegers(Lo, Hi));
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+  // We know that the extracted result type is legal.
+  EVT SubVT = N->getValueType(0);
+  SDValue Idx = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Lo, Hi;
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+
+  uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+  uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+  if (IdxVal < LoElts) {
+    assert(IdxVal + SubVT.getVectorNumElements() <= LoElts &&
+           "Extracted subvector crosses vector split!");
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
+  } else {
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
+                       DAG.getConstant(IdxVal - LoElts, Idx.getValueType()));
+  }
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(1);
+  EVT VecVT = Vec.getValueType();
+
+  if (isa<ConstantSDNode>(Idx)) {
+    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    assert(IdxVal < VecVT.getVectorNumElements() && "Invalid vector index!");
+
+    SDValue Lo, Hi;
+    GetSplitVector(Vec, Lo, Hi);
+
+    uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+
+    if (IdxVal < LoElts)
+      return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0);
+    return SDValue(DAG.UpdateNodeOperands(N, Hi,
+                                  DAG.getConstant(IdxVal - LoElts,
+                                                  Idx.getValueType())), 0);
+  }
+
+  // Store the vector to the stack.
+  EVT EltVT = VecVT.getVectorElementType();
+  DebugLoc dl = N->getDebugLoc();
+  SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
+                               MachinePointerInfo(), false, false, 0);
+
+  // Load back the required element.
+  StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
+  return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
+                        MachinePointerInfo(), EltVT, false, false, 0);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
+  assert(N->isUnindexed() && "Indexed store of vector?");
+  assert(OpNo == 1 && "Can only split the stored value");
+  DebugLoc DL = N->getDebugLoc();
+
+  bool isTruncating = N->isTruncatingStore();
+  SDValue Ch  = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  EVT MemoryVT = N->getMemoryVT();
+  unsigned Alignment = N->getOriginalAlignment();
+  bool isVol = N->isVolatile();
+  bool isNT = N->isNonTemporal();
+  SDValue Lo, Hi;
+  GetSplitVector(N->getOperand(1), Lo, Hi);
+
+  EVT LoMemVT, HiMemVT;
+  GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+
+  if (isTruncating)
+    Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
+                           LoMemVT, isVol, isNT, Alignment);
+  else
+    Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
+                      isVol, isNT, Alignment);
+
+  // Increment the pointer to the other half.
+  Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                    DAG.getIntPtrConstant(IncrementSize));
+
+  if (isTruncating)
+    Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr,
+                           N->getPointerInfo().getWithOffset(IncrementSize),
+                           HiMemVT, isVol, isNT, Alignment);
+  else
+    Hi = DAG.getStore(Ch, DL, Hi, Ptr,
+                      N->getPointerInfo().getWithOffset(IncrementSize),
+                      isVol, isNT, Alignment);
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
+  DebugLoc DL = N->getDebugLoc();
+
+  // The input operands all must have the same type, and we know the result the
+  // result type is valid.  Convert this to a buildvector which extracts all the
+  // input elements.
+  // TODO: If the input elements are power-two vectors, we could convert this to
+  // a new CONCAT_VECTORS node with elements that are half-wide.
+  SmallVector<SDValue, 32> Elts;
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  for (unsigned op = 0, e = N->getNumOperands(); op != e; ++op) {
+    SDValue Op = N->getOperand(op);
+    for (unsigned i = 0, e = Op.getValueType().getVectorNumElements();
+         i != e; ++i) {
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+                                 Op, DAG.getIntPtrConstant(i)));
+
+    }
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0),
+                     &Elts[0], Elts.size());
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
+  assert(N->getValueType(0).isVector() &&
+         N->getOperand(0).getValueType().isVector() &&
+         "Operand types must be vectors");
+  // The result has a legal vector type, but the input needs splitting.
+  SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
+  DebugLoc DL = N->getDebugLoc();
+  GetSplitVector(N->getOperand(0), Lo0, Hi0);
+  GetSplitVector(N->getOperand(1), Lo1, Hi1);
+  unsigned PartElements = Lo0.getValueType().getVectorNumElements();
+  EVT PartResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, PartElements);
+  EVT WideResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 2*PartElements);
+
+  LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
+  HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
+  SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes);
+  return PromoteTargetBoolean(Con, N->getValueType(0));
+}
+
+
+SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) {
+  // The result has a legal vector type, but the input needs splitting.
+  EVT ResVT = N->getValueType(0);
+  SDValue Lo, Hi;
+  DebugLoc DL = N->getDebugLoc();
+  GetSplitVector(N->getOperand(0), Lo, Hi);
+  EVT InVT = Lo.getValueType();
+
+  EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
+                               InVT.getVectorNumElements());
+
+  Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1));
+  Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1));
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//  Result Vector Widening
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
+  DEBUG(dbgs() << "Widen node result " << ResNo << ": ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+
+  // See if the target wants to custom widen this node.
+  if (CustomWidenLowerNode(N, N->getValueType(ResNo)))
+    return;
+
+  SDValue Res = SDValue();
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "WidenVectorResult #" << ResNo << ": ";
+    N->dump(&DAG);
+    dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to widen the result of this operator!");
+
+  case ISD::MERGE_VALUES:      Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
+  case ISD::BITCAST:           Res = WidenVecRes_BITCAST(N); break;
+  case ISD::BUILD_VECTOR:      Res = WidenVecRes_BUILD_VECTOR(N); break;
+  case ISD::CONCAT_VECTORS:    Res = WidenVecRes_CONCAT_VECTORS(N); break;
+  case ISD::CONVERT_RNDSAT:    Res = WidenVecRes_CONVERT_RNDSAT(N); break;
+  case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
+  case ISD::FP_ROUND_INREG:    Res = WidenVecRes_InregOp(N); break;
+  case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::LOAD:              Res = WidenVecRes_LOAD(N); break;
+  case ISD::SCALAR_TO_VECTOR:  Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
+  case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
+  case ISD::VSELECT:
+  case ISD::SELECT:            Res = WidenVecRes_SELECT(N); break;
+  case ISD::SELECT_CC:         Res = WidenVecRes_SELECT_CC(N); break;
+  case ISD::SETCC:             Res = WidenVecRes_SETCC(N); break;
+  case ISD::UNDEF:             Res = WidenVecRes_UNDEF(N); break;
+  case ISD::VECTOR_SHUFFLE:
+    Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
+    break;
+  case ISD::ADD:
+  case ISD::AND:
+  case ISD::BSWAP:
+  case ISD::FADD:
+  case ISD::FCOPYSIGN:
+  case ISD::FDIV:
+  case ISD::FMUL:
+  case ISD::FPOW:
+  case ISD::FREM:
+  case ISD::FSUB:
+  case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
+  case ISD::OR:
+  case ISD::SDIV:
+  case ISD::SREM:
+  case ISD::UDIV:
+  case ISD::UREM:
+  case ISD::SUB:
+  case ISD::XOR:
+    Res = WidenVecRes_Binary(N);
+    break;
+
+  case ISD::FPOWI:
+    Res = WidenVecRes_POWI(N);
+    break;
+
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    Res = WidenVecRes_Shift(N);
+    break;
+
+  case ISD::ANY_EXTEND:
+  case ISD::FP_EXTEND:
+  case ISD::FP_ROUND:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::SIGN_EXTEND:
+  case ISD::SINT_TO_FP:
+  case ISD::TRUNCATE:
+  case ISD::UINT_TO_FP:
+  case ISD::ZERO_EXTEND:
+    Res = WidenVecRes_Convert(N);
+    break;
+
+  case ISD::CTLZ:
+  case ISD::CTPOP:
+  case ISD::CTTZ:
+  case ISD::FABS:
+  case ISD::FCEIL:
+  case ISD::FCOS:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FFLOOR:
+  case ISD::FLOG:
+  case ISD::FLOG10:
+  case ISD::FLOG2:
+  case ISD::FNEARBYINT:
+  case ISD::FNEG:
+  case ISD::FRINT:
+  case ISD::FSIN:
+  case ISD::FSQRT:
+  case ISD::FTRUNC:
+    Res = WidenVecRes_Unary(N);
+    break;
+  case ISD::FMA:
+    Res = WidenVecRes_Ternary(N);
+    break;
+  }
+
+  // If Res is null, the sub-method took care of registering the result.
+  if (Res.getNode())
+    SetWidenedVector(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
+  // Ternary op widening.
+  DebugLoc dl = N->getDebugLoc();
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+  SDValue InOp3 = GetWidenedVector(N->getOperand(2));
+  return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
+  // Binary op widening.
+  unsigned Opcode = N->getOpcode();
+  DebugLoc dl = N->getDebugLoc();
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT WidenEltVT = WidenVT.getVectorElementType();
+  EVT VT = WidenVT;
+  unsigned NumElts =  VT.getVectorNumElements();
+  while (!TLI.isTypeLegal(VT) && NumElts != 1) {
+    NumElts = NumElts / 2;
+    VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+  }
+
+  if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) {
+    // Operation doesn't trap so just widen as normal.
+    SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+    SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+    return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2);
+  }
+
+  // No legal vector version so unroll the vector operation and then widen.
+  if (NumElts == 1)
+    return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
+
+  // Since the operation can trap, apply operation on the original vector.
+  EVT MaxVT = VT;
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+  unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
+
+  SmallVector<SDValue, 16> ConcatOps(CurNumElts);
+  unsigned ConcatEnd = 0;  // Current ConcatOps index.
+  int Idx = 0;        // Current Idx into input vectors.
+
+  // NumElts := greatest legal vector size (at most WidenVT)
+  // while (orig. vector has unhandled elements) {
+  //   take munches of size NumElts from the beginning and add to ConcatOps
+  //   NumElts := next smaller supported vector size or 1
+  // }
+  while (CurNumElts != 0) {
+    while (CurNumElts >= NumElts) {
+      SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
+                                 DAG.getIntPtrConstant(Idx));
+      SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
+                                 DAG.getIntPtrConstant(Idx));
+      ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2);
+      Idx += NumElts;
+      CurNumElts -= NumElts;
+    }
+    do {
+      NumElts = NumElts / 2;
+      VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+    } while (!TLI.isTypeLegal(VT) && NumElts != 1);
+
+    if (NumElts == 1) {
+      for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
+        SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
+                                   InOp1, DAG.getIntPtrConstant(Idx));
+        SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
+                                   InOp2, DAG.getIntPtrConstant(Idx));
+        ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
+                                             EOp1, EOp2);
+      }
+      CurNumElts = 0;
+    }
+  }
+
+  // Check to see if we have a single operation with the widen type.
+  if (ConcatEnd == 1) {
+    VT = ConcatOps[0].getValueType();
+    if (VT == WidenVT)
+      return ConcatOps[0];
+  }
+
+  // while (Some element of ConcatOps is not of type MaxVT) {
+  //   From the end of ConcatOps, collect elements of the same type and put
+  //   them into an op of the next larger supported type
+  // }
+  while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) {
+    Idx = ConcatEnd - 1;
+    VT = ConcatOps[Idx--].getValueType();
+    while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT)
+      Idx--;
+
+    int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1;
+    EVT NextVT;
+    do {
+      NextSize *= 2;
+      NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize);
+    } while (!TLI.isTypeLegal(NextVT));
+
+    if (!VT.isVector()) {
+      // Scalar type, create an INSERT_VECTOR_ELEMENT of type NextVT
+      SDValue VecOp = DAG.getUNDEF(NextVT);
+      unsigned NumToInsert = ConcatEnd - Idx - 1;
+      for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
+        VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp,
+                            ConcatOps[OpIdx], DAG.getIntPtrConstant(i));
+      }
+      ConcatOps[Idx+1] = VecOp;
+      ConcatEnd = Idx + 2;
+    } else {
+      // Vector type, create a CONCAT_VECTORS of type NextVT
+      SDValue undefVec = DAG.getUNDEF(VT);
+      unsigned OpsToConcat = NextSize/VT.getVectorNumElements();
+      SmallVector<SDValue, 16> SubConcatOps(OpsToConcat);
+      unsigned RealVals = ConcatEnd - Idx - 1;
+      unsigned SubConcatEnd = 0;
+      unsigned SubConcatIdx = Idx + 1;
+      while (SubConcatEnd < RealVals)
+        SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx];
+      while (SubConcatEnd < OpsToConcat)
+        SubConcatOps[SubConcatEnd++] = undefVec;
+      ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl,
+                                            NextVT, &SubConcatOps[0],
+                                            OpsToConcat);
+      ConcatEnd = SubConcatIdx + 1;
+    }
+  }
+
+  // Check to see if we have a single operation with the widen type.
+  if (ConcatEnd == 1) {
+    VT = ConcatOps[0].getValueType();
+    if (VT == WidenVT)
+      return ConcatOps[0];
+  }
+
+  // add undefs of size MaxVT until ConcatOps grows to length of WidenVT
+  unsigned NumOps = WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements();
+  if (NumOps != ConcatEnd ) {
+    SDValue UndefVal = DAG.getUNDEF(MaxVT);
+    for (unsigned j = ConcatEnd; j < NumOps; ++j)
+      ConcatOps[j] = UndefVal;
+  }
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0], NumOps);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
+  SDValue InOp = N->getOperand(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  EVT InVT = InOp.getValueType();
+  EVT InEltVT = InVT.getVectorElementType();
+  EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts);
+
+  unsigned Opcode = N->getOpcode();
+  unsigned InVTNumElts = InVT.getVectorNumElements();
+
+  if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+    InOp = GetWidenedVector(N->getOperand(0));
+    InVT = InOp.getValueType();
+    InVTNumElts = InVT.getVectorNumElements();
+    if (InVTNumElts == WidenNumElts) {
+      if (N->getNumOperands() == 1)
+        return DAG.getNode(Opcode, DL, WidenVT, InOp);
+      return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1));
+    }
+  }
+
+  if (TLI.isTypeLegal(InWidenVT)) {
+    // Because the result and the input are different vector types, widening
+    // the result could create a legal type but widening the input might make
+    // it an illegal type that might lead to repeatedly splitting the input
+    // and then widening it. To avoid this, we widen the input only if
+    // it results in a legal type.
+    if (WidenNumElts % InVTNumElts == 0) {
+      // Widen the input and call convert on the widened input vector.
+      unsigned NumConcat = WidenNumElts/InVTNumElts;
+      SmallVector<SDValue, 16> Ops(NumConcat);
+      Ops[0] = InOp;
+      SDValue UndefVal = DAG.getUNDEF(InVT);
+      for (unsigned i = 1; i != NumConcat; ++i)
+        Ops[i] = UndefVal;
+      SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT,
+                                  &Ops[0], NumConcat);
+      if (N->getNumOperands() == 1)
+        return DAG.getNode(Opcode, DL, WidenVT, InVec);
+      return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1));
+    }
+
+    if (InVTNumElts % WidenNumElts == 0) {
+      SDValue InVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InWidenVT,
+                                  InOp, DAG.getIntPtrConstant(0));
+      // Extract the input and convert the shorten input vector.
+      if (N->getNumOperands() == 1)
+        return DAG.getNode(Opcode, DL, WidenVT, InVal);
+      return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1));
+    }
+  }
+
+  // Otherwise unroll into some nasty scalar code and rebuild the vector.
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  EVT EltVT = WidenVT.getVectorElementType();
+  unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
+  unsigned i;
+  for (i=0; i < MinElts; ++i) {
+    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
+                              DAG.getIntPtrConstant(i));
+    if (N->getNumOperands() == 1)
+      Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
+    else
+      Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1));
+  }
+
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i < WidenNumElts; ++i)
+    Ops[i] = UndefVal;
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, &Ops[0], WidenNumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  SDValue ShOp = N->getOperand(1);
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp, ShOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  SDValue ShOp = N->getOperand(1);
+
+  EVT ShVT = ShOp.getValueType();
+  if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) {
+    ShOp = GetWidenedVector(ShOp);
+    ShVT = ShOp.getValueType();
+  }
+  EVT ShWidenVT = EVT::getVectorVT(*DAG.getContext(),
+                                   ShVT.getVectorElementType(),
+                                   WidenVT.getVectorNumElements());
+  if (ShVT != ShWidenVT)
+    ShOp = ModifyToType(ShOp, ShWidenVT);
+
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp, ShOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) {
+  // Unary op widening.
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
+                               cast<VTSDNode>(N->getOperand(1))->getVT()
+                                 .getVectorElementType(),
+                               WidenVT.getVectorNumElements());
+  SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     WidenVT, WidenLHS, DAG.getValueType(ExtVT));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) {
+  SDValue WidenVec = DisintegrateMERGE_VALUES(N, ResNo);
+  return GetWidenedVector(WidenVec);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
+  SDValue InOp = N->getOperand(0);
+  EVT InVT = InOp.getValueType();
+  EVT VT = N->getValueType(0);
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  DebugLoc dl = N->getDebugLoc();
+
+  switch (getTypeAction(InVT)) {
+  case TargetLowering::TypeLegal:
+    break;
+  case TargetLowering::TypePromoteInteger:
+    // If the incoming type is a vector that is being promoted, then
+    // we know that the elements are arranged differently and that we
+    // must perform the conversion using a stack slot.
+    if (InVT.isVector())
+      break;
+
+    // If the InOp is promoted to the same size, convert it.  Otherwise,
+    // fall out of the switch and widen the promoted input.
+    InOp = GetPromotedInteger(InOp);
+    InVT = InOp.getValueType();
+    if (WidenVT.bitsEq(InVT))
+      return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
+    break;
+  case TargetLowering::TypeSoftenFloat:
+  case TargetLowering::TypeExpandInteger:
+  case TargetLowering::TypeExpandFloat:
+  case TargetLowering::TypeScalarizeVector:
+  case TargetLowering::TypeSplitVector:
+    break;
+  case TargetLowering::TypeWidenVector:
+    // If the InOp is widened to the same size, convert it.  Otherwise, fall
+    // out of the switch and widen the widened input.
+    InOp = GetWidenedVector(InOp);
+    InVT = InOp.getValueType();
+    if (WidenVT.bitsEq(InVT))
+      // The input widens to the same size. Convert to the widen value.
+      return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
+    break;
+  }
+
+  unsigned WidenSize = WidenVT.getSizeInBits();
+  unsigned InSize = InVT.getSizeInBits();
+  // x86mmx is not an acceptable vector element type, so don't try.
+  if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) {
+    // Determine new input vector type.  The new input vector type will use
+    // the same element type (if its a vector) or use the input type as a
+    // vector.  It is the same size as the type to widen to.
+    EVT NewInVT;
+    unsigned NewNumElts = WidenSize / InSize;
+    if (InVT.isVector()) {
+      EVT InEltVT = InVT.getVectorElementType();
+      NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
+                                 WidenSize / InEltVT.getSizeInBits());
+    } else {
+      NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts);
+    }
+
+    if (TLI.isTypeLegal(NewInVT)) {
+      // Because the result and the input are different vector types, widening
+      // the result could create a legal type but widening the input might make
+      // it an illegal type that might lead to repeatedly splitting the input
+      // and then widening it. To avoid this, we widen the input only if
+      // it results in a legal type.
+      SmallVector<SDValue, 16> Ops(NewNumElts);
+      SDValue UndefVal = DAG.getUNDEF(InVT);
+      Ops[0] = InOp;
+      for (unsigned i = 1; i < NewNumElts; ++i)
+        Ops[i] = UndefVal;
+
+      SDValue NewVec;
+      if (InVT.isVector())
+        NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl,
+                             NewInVT, &Ops[0], NewNumElts);
+      else
+        NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl,
+                             NewInVT, &Ops[0], NewNumElts);
+      return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
+    }
+  }
+
+  return CreateStackStoreLoad(InOp, WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  // Build a vector with undefined for the new nodes.
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SmallVector<SDValue, 16> NewOps(N->op_begin(), N->op_end());
+  NewOps.reserve(WidenNumElts);
+  for (unsigned i = NumElts; i < WidenNumElts; ++i)
+    NewOps.push_back(DAG.getUNDEF(EltVT));
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &NewOps[0], NewOps.size());
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
+  EVT InVT = N->getOperand(0).getValueType();
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  DebugLoc dl = N->getDebugLoc();
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  unsigned NumInElts = InVT.getVectorNumElements();
+  unsigned NumOperands = N->getNumOperands();
+
+  bool InputWidened = false; // Indicates we need to widen the input.
+  if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) {
+    if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
+      // Add undef vectors to widen to correct length.
+      unsigned NumConcat = WidenVT.getVectorNumElements() /
+                           InVT.getVectorNumElements();
+      SDValue UndefVal = DAG.getUNDEF(InVT);
+      SmallVector<SDValue, 16> Ops(NumConcat);
+      for (unsigned i=0; i < NumOperands; ++i)
+        Ops[i] = N->getOperand(i);
+      for (unsigned i = NumOperands; i != NumConcat; ++i)
+        Ops[i] = UndefVal;
+      return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &Ops[0], NumConcat);
+    }
+  } else {
+    InputWidened = true;
+    if (WidenVT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
+      // The inputs and the result are widen to the same value.
+      unsigned i;
+      for (i=1; i < NumOperands; ++i)
+        if (N->getOperand(i).getOpcode() != ISD::UNDEF)
+          break;
+
+      if (i == NumOperands)
+        // Everything but the first operand is an UNDEF so just return the
+        // widened first operand.
+        return GetWidenedVector(N->getOperand(0));
+
+      if (NumOperands == 2) {
+        // Replace concat of two operands with a shuffle.
+        SmallVector<int, 16> MaskOps(WidenNumElts, -1);
+        for (unsigned i = 0; i < NumInElts; ++i) {
+          MaskOps[i] = i;
+          MaskOps[i + NumInElts] = i + WidenNumElts;
+        }
+        return DAG.getVectorShuffle(WidenVT, dl,
+                                    GetWidenedVector(N->getOperand(0)),
+                                    GetWidenedVector(N->getOperand(1)),
+                                    &MaskOps[0]);
+      }
+    }
+  }
+
+  // Fall back to use extracts and build vector.
+  EVT EltVT = WidenVT.getVectorElementType();
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  unsigned Idx = 0;
+  for (unsigned i=0; i < NumOperands; ++i) {
+    SDValue InOp = N->getOperand(i);
+    if (InputWidened)
+      InOp = GetWidenedVector(InOp);
+    for (unsigned j=0; j < NumInElts; ++j)
+      Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+                               DAG.getIntPtrConstant(j));
+  }
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; Idx < WidenNumElts; ++Idx)
+    Ops[Idx] = UndefVal;
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue InOp  = N->getOperand(0);
+  SDValue RndOp = N->getOperand(3);
+  SDValue SatOp = N->getOperand(4);
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  EVT InVT = InOp.getValueType();
+  EVT InEltVT = InVT.getVectorElementType();
+  EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts);
+
+  SDValue DTyOp = DAG.getValueType(WidenVT);
+  SDValue STyOp = DAG.getValueType(InWidenVT);
+  ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+
+  unsigned InVTNumElts = InVT.getVectorNumElements();
+  if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+    InOp = GetWidenedVector(InOp);
+    InVT = InOp.getValueType();
+    InVTNumElts = InVT.getVectorNumElements();
+    if (InVTNumElts == WidenNumElts)
+      return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+                                  SatOp, CvtCode);
+  }
+
+  if (TLI.isTypeLegal(InWidenVT)) {
+    // Because the result and the input are different vector types, widening
+    // the result could create a legal type but widening the input might make
+    // it an illegal type that might lead to repeatedly splitting the input
+    // and then widening it. To avoid this, we widen the input only if
+    // it results in a legal type.
+    if (WidenNumElts % InVTNumElts == 0) {
+      // Widen the input and call convert on the widened input vector.
+      unsigned NumConcat = WidenNumElts/InVTNumElts;
+      SmallVector<SDValue, 16> Ops(NumConcat);
+      Ops[0] = InOp;
+      SDValue UndefVal = DAG.getUNDEF(InVT);
+      for (unsigned i = 1; i != NumConcat; ++i)
+        Ops[i] = UndefVal;
+
+      InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, &Ops[0],NumConcat);
+      return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+                                  SatOp, CvtCode);
+    }
+
+    if (InVTNumElts % WidenNumElts == 0) {
+      // Extract the input and convert the shorten input vector.
+      InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp,
+                         DAG.getIntPtrConstant(0));
+      return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+                                  SatOp, CvtCode);
+    }
+  }
+
+  // Otherwise unroll into some nasty scalar code and rebuild the vector.
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  EVT EltVT = WidenVT.getVectorElementType();
+  DTyOp = DAG.getValueType(EltVT);
+  STyOp = DAG.getValueType(InEltVT);
+
+  unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
+  unsigned i;
+  for (i=0; i < MinElts; ++i) {
+    SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+                                 DAG.getIntPtrConstant(i));
+    Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp,
+                                  SatOp, CvtCode);
+  }
+
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i < WidenNumElts; ++i)
+    Ops[i] = UndefVal;
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
+  EVT      VT = N->getValueType(0);
+  EVT      WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  SDValue  InOp = N->getOperand(0);
+  SDValue  Idx  = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+
+  if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
+    InOp = GetWidenedVector(InOp);
+
+  EVT InVT = InOp.getValueType();
+
+  // Check if we can just return the input vector after widening.
+  uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  if (IdxVal == 0 && InVT == WidenVT)
+    return InOp;
+
+  // Check if we can extract from the vector.
+  unsigned InNumElts = InVT.getVectorNumElements();
+  if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
+
+  // We could try widening the input to the right length but for now, extract
+  // the original elements, fill the rest with undefs and build a vector.
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned i;
+  for (i=0; i < NumElts; ++i)
+    Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+                         DAG.getIntPtrConstant(IdxVal+i));
+
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i < WidenNumElts; ++i)
+    Ops[i] = UndefVal;
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], WidenNumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, N->getDebugLoc(),
+                     InOp.getValueType(), InOp,
+                     N->getOperand(1), N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+
+  SDValue Result;
+  SmallVector<SDValue, 16> LdChain;  // Chain for the series of load
+  if (ExtType != ISD::NON_EXTLOAD)
+    Result = GenWidenVectorExtLoads(LdChain, LD, ExtType);
+  else
+    Result = GenWidenVectorLoads(LdChain, LD);
+
+  // If we generate a single load, we can use that for the chain.  Otherwise,
+  // build a factor node to remember the multiple loads are independent and
+  // chain to that.
+  SDValue NewChain;
+  if (LdChain.size() == 1)
+    NewChain = LdChain[0];
+  else
+    NewChain = DAG.getNode(ISD::TokenFactor, LD->getDebugLoc(), MVT::Other,
+                           &LdChain[0], LdChain.size());
+
+  // Modified the chain - switch anything that used the old chain to use
+  // the new one.
+  ReplaceValueWith(SDValue(N, 1), NewChain);
+
+  return Result;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  return DAG.getNode(ISD::SCALAR_TO_VECTOR, N->getDebugLoc(),
+                     WidenVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SDValue Cond1 = N->getOperand(0);
+  EVT CondVT = Cond1.getValueType();
+  if (CondVT.isVector()) {
+    EVT CondEltVT = CondVT.getVectorElementType();
+    EVT CondWidenVT =  EVT::getVectorVT(*DAG.getContext(),
+                                        CondEltVT, WidenNumElts);
+    if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector)
+      Cond1 = GetWidenedVector(Cond1);
+
+    if (Cond1.getValueType() != CondWidenVT)
+      Cond1 = ModifyToType(Cond1, CondWidenVT);
+  }
+
+  SDValue InOp1 = GetWidenedVector(N->getOperand(1));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(2));
+  assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
+  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+                     WidenVT, Cond1, InOp1, InOp2);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
+  SDValue InOp1 = GetWidenedVector(N->getOperand(2));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(3));
+  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+                     InOp1.getValueType(), N->getOperand(0),
+                     N->getOperand(1), InOp1, InOp2, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
+  assert(N->getValueType(0).isVector() ==
+         N->getOperand(0).getValueType().isVector() &&
+         "Scalar/Vector type mismatch");
+  if (N->getValueType(0).isVector()) return WidenVecRes_VSETCC(N);
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+  return DAG.getNode(ISD::SETCC, N->getDebugLoc(), WidenVT,
+                     InOp1, InOp2, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getUNDEF(WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+
+  // Adjust mask based on new input vector length.
+  SmallVector<int, 16> NewMask;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int Idx = N->getMaskElt(i);
+    if (Idx < (int)NumElts)
+      NewMask.push_back(Idx);
+    else
+      NewMask.push_back(Idx - NumElts + WidenNumElts);
+  }
+  for (unsigned i = NumElts; i != WidenNumElts; ++i)
+    NewMask.push_back(-1);
+  return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, &NewMask[0]);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
+  assert(N->getValueType(0).isVector() &&
+         N->getOperand(0).getValueType().isVector() &&
+         "Operands must be vectors");
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+  SDValue InOp1 = N->getOperand(0);
+  EVT InVT = InOp1.getValueType();
+  assert(InVT.isVector() && "can not widen non vector type");
+  EVT WidenInVT = EVT::getVectorVT(*DAG.getContext(),
+                                   InVT.getVectorElementType(), WidenNumElts);
+  InOp1 = GetWidenedVector(InOp1);
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+
+  // Assume that the input and output will be widen appropriately.  If not,
+  // we will have to unroll it at some point.
+  assert(InOp1.getValueType() == WidenInVT &&
+         InOp2.getValueType() == WidenInVT &&
+         "Input not widened to expected type!");
+  (void)WidenInVT;
+  return DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+                     WidenVT, InOp1, InOp2, N->getOperand(2));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Widen Vector Operand
+//===----------------------------------------------------------------------===//
+bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
+  DEBUG(dbgs() << "Widen node operand " << OpNo << ": ";
+        N->dump(&DAG);
+        dbgs() << "\n");
+  SDValue Res = SDValue();
+
+  // See if the target wants to custom widen this node.
+  if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+    return false;
+
+  switch (N->getOpcode()) {
+  default:
+#ifndef NDEBUG
+    dbgs() << "WidenVectorOperand op #" << OpNo << ": ";
+    N->dump(&DAG);
+    dbgs() << "\n";
+#endif
+    llvm_unreachable("Do not know how to widen this operator's operand!");
+
+  case ISD::BITCAST:            Res = WidenVecOp_BITCAST(N); break;
+  case ISD::CONCAT_VECTORS:     Res = WidenVecOp_CONCAT_VECTORS(N); break;
+  case ISD::EXTRACT_SUBVECTOR:  Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
+  case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
+  case ISD::STORE:              Res = WidenVecOp_STORE(N); break;
+  case ISD::SETCC:              Res = WidenVecOp_SETCC(N); break;
+
+  case ISD::FP_EXTEND:
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+  case ISD::TRUNCATE:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    Res = WidenVecOp_Convert(N);
+    break;
+  }
+
+  // If Res is null, the sub-method took care of registering the result.
+  if (!Res.getNode()) return false;
+
+  // If the result is N, the sub-method updated N in place.  Tell the legalizer
+  // core about this.
+  if (Res.getNode() == N)
+    return true;
+
+
+  assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+         "Invalid operand expansion");
+
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return false;
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
+  // Since the result is legal and the input is illegal, it is unlikely
+  // that we can fix the input to a legal type so unroll the convert
+  // into some scalar code and create a nasty build vector.
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = N->getDebugLoc();
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue InOp = N->getOperand(0);
+  if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
+    InOp = GetWidenedVector(InOp);
+  EVT InVT = InOp.getValueType();
+  EVT InEltVT = InVT.getVectorElementType();
+
+  unsigned Opcode = N->getOpcode();
+  SmallVector<SDValue, 16> Ops(NumElts);
+  for (unsigned i=0; i < NumElts; ++i)
+    Ops[i] = DAG.getNode(Opcode, dl, EltVT,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+                                     DAG.getIntPtrConstant(i)));
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  EVT InWidenVT = InOp.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Check if we can convert between two legal vector types and extract.
+  unsigned InWidenSize = InWidenVT.getSizeInBits();
+  unsigned Size = VT.getSizeInBits();
+  // x86mmx is not an acceptable vector element type, so don't try.
+  if (InWidenSize % Size == 0 && !VT.isVector() && VT != MVT::x86mmx) {
+    unsigned NewNumElts = InWidenSize / Size;
+    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts);
+    if (TLI.isTypeLegal(NewVT)) {
+      SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
+                         DAG.getIntPtrConstant(0));
+    }
+  }
+
+  return CreateStackStoreLoad(InOp, VT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
+  // If the input vector is not legal, it is likely that we will not find a
+  // legal vector of the same size. Replace the concatenate vector with a
+  // nasty build vector.
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = N->getDebugLoc();
+  unsigned NumElts = VT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(NumElts);
+
+  EVT InVT = N->getOperand(0).getValueType();
+  unsigned NumInElts = InVT.getVectorNumElements();
+
+  unsigned Idx = 0;
+  unsigned NumOperands = N->getNumOperands();
+  for (unsigned i=0; i < NumOperands; ++i) {
+    SDValue InOp = N->getOperand(i);
+    if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
+      InOp = GetWidenedVector(InOp);
+    for (unsigned j=0; j < NumInElts; ++j)
+      Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+                               DAG.getIntPtrConstant(j));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, N->getDebugLoc(),
+                     N->getValueType(0), InOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+  SDValue InOp = GetWidenedVector(N->getOperand(0));
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+                     N->getValueType(0), InOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
+  // We have to widen the value but we want only to store the original
+  // vector type.
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+
+  SmallVector<SDValue, 16> StChain;
+  if (ST->isTruncatingStore())
+    GenWidenVectorTruncStores(StChain, ST);
+  else
+    GenWidenVectorStores(StChain, ST);
+
+  if (StChain.size() == 1)
+    return StChain[0];
+  else
+    return DAG.getNode(ISD::TokenFactor, ST->getDebugLoc(),
+                       MVT::Other,&StChain[0],StChain.size());
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
+  SDValue InOp0 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp1 = GetWidenedVector(N->getOperand(1));
+  DebugLoc dl = N->getDebugLoc();
+
+  // WARNING: In this code we widen the compare instruction with garbage.
+  // This garbage may contain denormal floats which may be slow. Is this a real
+  // concern ? Should we zero the unused lanes if this is a float compare ?
+
+  // Get a new SETCC node to compare the newly widened operands.
+  // Only some of the compared elements are legal.
+  EVT SVT = TLI.getSetCCResultType(InOp0.getValueType());
+  SDValue WideSETCC = DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+                     SVT, InOp0, InOp1, N->getOperand(2));
+
+  // Extract the needed results from the result vector.
+  EVT ResVT = EVT::getVectorVT(*DAG.getContext(),
+                               SVT.getVectorElementType(),
+                               N->getValueType(0).getVectorNumElements());
+  SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
+                           ResVT, WideSETCC, DAG.getIntPtrConstant(0));
+
+  return PromoteTargetBoolean(CC, N->getValueType(0));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Vector Widening Utilities
+//===----------------------------------------------------------------------===//
+
+// Utility function to find the type to chop up a widen vector for load/store
+//  TLI:       Target lowering used to determine legal types.
+//  Width:     Width left need to load/store.
+//  WidenVT:   The widen vector type to load to/store from
+//  Align:     If 0, don't allow use of a wider type
+//  WidenEx:   If Align is not 0, the amount additional we can load/store from.
+
+static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+                       unsigned Width, EVT WidenVT,
+                       unsigned Align = 0, unsigned WidenEx = 0) {
+  EVT WidenEltVT = WidenVT.getVectorElementType();
+  unsigned WidenWidth = WidenVT.getSizeInBits();
+  unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
+  unsigned AlignInBits = Align*8;
+
+  // If we have one element to load/store, return it.
+  EVT RetVT = WidenEltVT;
+  if (Width == WidenEltWidth)
+    return RetVT;
+
+  // See if there is larger legal integer than the element type to load/store
+  unsigned VT;
+  for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE;
+       VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) {
+    EVT MemVT((MVT::SimpleValueType) VT);
+    unsigned MemVTWidth = MemVT.getSizeInBits();
+    if (MemVT.getSizeInBits() <= WidenEltWidth)
+      break;
+    if (TLI.isTypeLegal(MemVT) && (WidenWidth % MemVTWidth) == 0 &&
+        isPowerOf2_32(WidenWidth / MemVTWidth) &&
+        (MemVTWidth <= Width ||
+         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+      RetVT = MemVT;
+      break;
+    }
+  }
+
+  // See if there is a larger vector type to load/store that has the same vector
+  // element type and is evenly divisible with the WidenVT.
+  for (VT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
+       VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) {
+    EVT MemVT = (MVT::SimpleValueType) VT;
+    unsigned MemVTWidth = MemVT.getSizeInBits();
+    if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() &&
+        (WidenWidth % MemVTWidth) == 0 &&
+        isPowerOf2_32(WidenWidth / MemVTWidth) &&
+        (MemVTWidth <= Width ||
+         (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+      if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
+        return MemVT;
+    }
+  }
+
+  return RetVT;
+}
+
+// Builds a vector type from scalar loads
+//  VecTy: Resulting Vector type
+//  LDOps: Load operators to build a vector type
+//  [Start,End) the list of loads to use.
+static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
+                                     SmallVector<SDValue, 16>& LdOps,
+                                     unsigned Start, unsigned End) {
+  DebugLoc dl = LdOps[Start].getDebugLoc();
+  EVT LdTy = LdOps[Start].getValueType();
+  unsigned Width = VecTy.getSizeInBits();
+  unsigned NumElts = Width / LdTy.getSizeInBits();
+  EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts);
+
+  unsigned Idx = 1;
+  SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]);
+
+  for (unsigned i = Start + 1; i != End; ++i) {
+    EVT NewLdTy = LdOps[i].getValueType();
+    if (NewLdTy != LdTy) {
+      NumElts = Width / NewLdTy.getSizeInBits();
+      NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
+      VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, VecOp);
+      // Readjust position and vector position based on new load type
+      Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
+      LdTy = NewLdTy;
+    }
+    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
+                        DAG.getIntPtrConstant(Idx++));
+  }
+  return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
+}
+
+SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
+                                              LoadSDNode *LD) {
+  // The strategy assumes that we can efficiently load powers of two widths.
+  // The routines chops the vector into the largest vector loads with the same
+  // element type or scalar loads and then recombines it to the widen vector
+  // type.
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+  unsigned WidenWidth = WidenVT.getSizeInBits();
+  EVT LdVT    = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  assert(LdVT.isVector() && WidenVT.isVector());
+  assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
+
+  // Load information
+  SDValue   Chain = LD->getChain();
+  SDValue   BasePtr = LD->getBasePtr();
+  unsigned  Align    = LD->getAlignment();
+  bool      isVolatile = LD->isVolatile();
+  bool      isNonTemporal = LD->isNonTemporal();
+  bool      isInvariant = LD->isInvariant();
+
+  int LdWidth = LdVT.getSizeInBits();
+  int WidthDiff = WidenWidth - LdWidth;          // Difference
+  unsigned LdAlign = (isVolatile) ? 0 : Align; // Allow wider loads
+
+  // Find the vector type that can load from.
+  EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+  int NewVTWidth = NewVT.getSizeInBits();
+  SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
+                             isVolatile, isNonTemporal, isInvariant, Align);
+  LdChain.push_back(LdOp.getValue(1));
+
+  // Check if we can load the element with one instruction
+  if (LdWidth <= NewVTWidth) {
+    if (!NewVT.isVector()) {
+      unsigned NumElts = WidenWidth / NewVTWidth;
+      EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+      SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+      return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
+    }
+    if (NewVT == WidenVT)
+      return LdOp;
+
+    assert(WidenWidth % NewVTWidth == 0);
+    unsigned NumConcat = WidenWidth / NewVTWidth;
+    SmallVector<SDValue, 16> ConcatOps(NumConcat);
+    SDValue UndefVal = DAG.getUNDEF(NewVT);
+    ConcatOps[0] = LdOp;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      ConcatOps[i] = UndefVal;
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &ConcatOps[0],
+                       NumConcat);
+  }
+
+  // Load vector by using multiple loads from largest vector to scalar
+  SmallVector<SDValue, 16> LdOps;
+  LdOps.push_back(LdOp);
+
+  LdWidth -= NewVTWidth;
+  unsigned Offset = 0;
+
+  while (LdWidth > 0) {
+    unsigned Increment = NewVTWidth / 8;
+    Offset += Increment;
+    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                          DAG.getIntPtrConstant(Increment));
+
+    SDValue L;
+    if (LdWidth < NewVTWidth) {
+      // Our current type we are using is too large, find a better size
+      NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+      NewVTWidth = NewVT.getSizeInBits();
+      L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
+                      LD->getPointerInfo().getWithOffset(Offset), isVolatile,
+                      isNonTemporal, isInvariant, MinAlign(Align, Increment));
+      LdChain.push_back(L.getValue(1));
+      if (L->getValueType(0).isVector()) {
+        SmallVector<SDValue, 16> Loads;
+        Loads.push_back(L);
+        unsigned size = L->getValueSizeInBits(0);
+        while (size < LdOp->getValueSizeInBits(0)) {
+          Loads.push_back(DAG.getUNDEF(L->getValueType(0)));
+          size += L->getValueSizeInBits(0);
+        }
+        L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0),
+                        &Loads[0], Loads.size());
+      }
+    } else {
+      L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
+                      LD->getPointerInfo().getWithOffset(Offset), isVolatile,
+                      isNonTemporal, isInvariant, MinAlign(Align, Increment));
+      LdChain.push_back(L.getValue(1));
+    }
+
+    LdOps.push_back(L);
+
+
+    LdWidth -= NewVTWidth;
+  }
+
+  // Build the vector from the loads operations
+  unsigned End = LdOps.size();
+  if (!LdOps[0].getValueType().isVector())
+    // All the loads are scalar loads.
+    return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
+
+  // If the load contains vectors, build the vector using concat vector.
+  // All of the vectors used to loads are power of 2 and the scalars load
+  // can be combined to make a power of 2 vector.
+  SmallVector<SDValue, 16> ConcatOps(End);
+  int i = End - 1;
+  int Idx = End;
+  EVT LdTy = LdOps[i].getValueType();
+  // First combine the scalar loads to a vector
+  if (!LdTy.isVector())  {
+    for (--i; i >= 0; --i) {
+      LdTy = LdOps[i].getValueType();
+      if (LdTy.isVector())
+        break;
+    }
+    ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i+1, End);
+  }
+  ConcatOps[--Idx] = LdOps[i];
+  for (--i; i >= 0; --i) {
+    EVT NewLdTy = LdOps[i].getValueType();
+    if (NewLdTy != LdTy) {
+      // Create a larger vector
+      ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
+                                     &ConcatOps[Idx], End - Idx);
+      Idx = End - 1;
+      LdTy = NewLdTy;
+    }
+    ConcatOps[--Idx] = LdOps[i];
+  }
+
+  if (WidenWidth == LdTy.getSizeInBits()*(End - Idx))
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
+                       &ConcatOps[Idx], End - Idx);
+
+  // We need to fill the rest with undefs to build the vector
+  unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
+  SmallVector<SDValue, 16> WidenOps(NumOps);
+  SDValue UndefVal = DAG.getUNDEF(LdTy);
+  {
+    unsigned i = 0;
+    for (; i != End-Idx; ++i)
+      WidenOps[i] = ConcatOps[Idx+i];
+    for (; i != NumOps; ++i)
+      WidenOps[i] = UndefVal;
+  }
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, &WidenOps[0],NumOps);
+}
+
+SDValue
+DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
+                                         LoadSDNode * LD,
+                                         ISD::LoadExtType ExtType) {
+  // For extension loads, it may not be more efficient to chop up the vector
+  // and then extended it.  Instead, we unroll the load and build a new vector.
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+  EVT LdVT    = LD->getMemoryVT();
+  DebugLoc dl = LD->getDebugLoc();
+  assert(LdVT.isVector() && WidenVT.isVector());
+
+  // Load information
+  SDValue   Chain = LD->getChain();
+  SDValue   BasePtr = LD->getBasePtr();
+  unsigned  Align    = LD->getAlignment();
+  bool      isVolatile = LD->isVolatile();
+  bool      isNonTemporal = LD->isNonTemporal();
+
+  EVT EltVT = WidenVT.getVectorElementType();
+  EVT LdEltVT = LdVT.getVectorElementType();
+  unsigned NumElts = LdVT.getVectorNumElements();
+
+  // Load each element and widen
+  unsigned WidenNumElts = WidenVT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  unsigned Increment = LdEltVT.getSizeInBits() / 8;
+  Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr,
+                          LD->getPointerInfo(),
+                          LdEltVT, isVolatile, isNonTemporal, Align);
+  LdChain.push_back(Ops[0].getValue(1));
+  unsigned i = 0, Offset = Increment;
+  for (i=1; i < NumElts; ++i, Offset += Increment) {
+    SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                                     BasePtr, DAG.getIntPtrConstant(Offset));
+    Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
+                            LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
+                            isVolatile, isNonTemporal, Align);
+    LdChain.push_back(Ops[i].getValue(1));
+  }
+
+  // Fill the rest with undefs
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for (; i != WidenNumElts; ++i)
+    Ops[i] = UndefVal;
+
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &Ops[0], Ops.size());
+}
+
+
+void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
+                                            StoreSDNode *ST) {
+  // The strategy assumes that we can efficiently store powers of two widths.
+  // The routines chops the vector into the largest vector stores with the same
+  // element type or scalar stores.
+  SDValue  Chain = ST->getChain();
+  SDValue  BasePtr = ST->getBasePtr();
+  unsigned Align = ST->getAlignment();
+  bool     isVolatile = ST->isVolatile();
+  bool     isNonTemporal = ST->isNonTemporal();
+  SDValue  ValOp = GetWidenedVector(ST->getValue());
+  DebugLoc dl = ST->getDebugLoc();
+
+  EVT StVT = ST->getMemoryVT();
+  unsigned StWidth = StVT.getSizeInBits();
+  EVT ValVT = ValOp.getValueType();
+  unsigned ValWidth = ValVT.getSizeInBits();
+  EVT ValEltVT = ValVT.getVectorElementType();
+  unsigned ValEltWidth = ValEltVT.getSizeInBits();
+  assert(StVT.getVectorElementType() == ValEltVT);
+
+  int Idx = 0;          // current index to store
+  unsigned Offset = 0;  // offset from base to store
+  while (StWidth != 0) {
+    // Find the largest vector type we can store with
+    EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT);
+    unsigned NewVTWidth = NewVT.getSizeInBits();
+    unsigned Increment = NewVTWidth / 8;
+    if (NewVT.isVector()) {
+      unsigned NumVTElts = NewVT.getVectorNumElements();
+      do {
+        SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
+                                   DAG.getIntPtrConstant(Idx));
+        StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
+                                    ST->getPointerInfo().getWithOffset(Offset),
+                                       isVolatile, isNonTemporal,
+                                       MinAlign(Align, Offset)));
+        StWidth -= NewVTWidth;
+        Offset += Increment;
+        Idx += NumVTElts;
+        BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                              DAG.getIntPtrConstant(Increment));
+      } while (StWidth != 0 && StWidth >= NewVTWidth);
+    } else {
+      // Cast the vector to the scalar type we can store
+      unsigned NumElts = ValWidth / NewVTWidth;
+      EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+      SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, ValOp);
+      // Readjust index position based on new vector type
+      Idx = Idx * ValEltWidth / NewVTWidth;
+      do {
+        SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
+                      DAG.getIntPtrConstant(Idx++));
+        StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
+                                    ST->getPointerInfo().getWithOffset(Offset),
+                                       isVolatile, isNonTemporal,
+                                       MinAlign(Align, Offset)));
+        StWidth -= NewVTWidth;
+        Offset += Increment;
+        BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+                              DAG.getIntPtrConstant(Increment));
+      } while (StWidth != 0 && StWidth >= NewVTWidth);
+      // Restore index back to be relative to the original widen element type
+      Idx = Idx * NewVTWidth / ValEltWidth;
+    }
+  }
+}
+
+void
+DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+                                            StoreSDNode *ST) {
+  // For extension loads, it may not be more efficient to truncate the vector
+  // and then store it.  Instead, we extract each element and then store it.
+  SDValue  Chain = ST->getChain();
+  SDValue  BasePtr = ST->getBasePtr();
+  unsigned Align = ST->getAlignment();
+  bool     isVolatile = ST->isVolatile();
+  bool     isNonTemporal = ST->isNonTemporal();
+  SDValue  ValOp = GetWidenedVector(ST->getValue());
+  DebugLoc dl = ST->getDebugLoc();
+
+  EVT StVT = ST->getMemoryVT();
+  EVT ValVT = ValOp.getValueType();
+
+  // It must be true that we the widen vector type is bigger than where
+  // we need to store.
+  assert(StVT.isVector() && ValOp.getValueType().isVector());
+  assert(StVT.bitsLT(ValOp.getValueType()));
+
+  // For truncating stores, we can not play the tricks of chopping legal
+  // vector types and bit cast it to the right type.  Instead, we unroll
+  // the store.
+  EVT StEltVT  = StVT.getVectorElementType();
+  EVT ValEltVT = ValVT.getVectorElementType();
+  unsigned Increment = ValEltVT.getSizeInBits() / 8;
+  unsigned NumElts = StVT.getVectorNumElements();
+  SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+                            DAG.getIntPtrConstant(0));
+  StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
+                                      ST->getPointerInfo(), StEltVT,
+                                      isVolatile, isNonTemporal, Align));
+  unsigned Offset = Increment;
+  for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
+    SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+                                     BasePtr, DAG.getIntPtrConstant(Offset));
+    SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+                            DAG.getIntPtrConstant(0));
+    StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr,
+                                      ST->getPointerInfo().getWithOffset(Offset),
+                                        StEltVT, isVolatile, isNonTemporal,
+                                        MinAlign(Align, Offset)));
+  }
+}
+
+/// Modifies a vector input (widen or narrows) to a vector of NVT.  The
+/// input vector must have the same element type as NVT.
+SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
+  // Note that InOp might have been widened so it might already have
+  // the right width or it might need be narrowed.
+  EVT InVT = InOp.getValueType();
+  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+         "input and widen element type must match");
+  DebugLoc dl = InOp.getDebugLoc();
+
+  // Check if InOp already has the right width.
+  if (InVT == NVT)
+    return InOp;
+
+  unsigned InNumElts = InVT.getVectorNumElements();
+  unsigned WidenNumElts = NVT.getVectorNumElements();
+  if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
+    unsigned NumConcat = WidenNumElts / InNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue UndefVal = DAG.getUNDEF(InVT);
+    Ops[0] = InOp;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = UndefVal;
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, &Ops[0], NumConcat);
+  }
+
+  if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
+                       DAG.getIntPtrConstant(0));
+
+  // Fall back to extract and build.
+  SmallVector<SDValue, 16> Ops(WidenNumElts);
+  EVT EltVT = NVT.getVectorElementType();
+  unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
+  unsigned Idx;
+  for (Idx = 0; Idx < MinNumElts; ++Idx)
+    Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+                           DAG.getIntPtrConstant(Idx));
+
+  SDValue UndefVal = DAG.getUNDEF(EltVT);
+  for ( ; Idx < WidenNumElts; ++Idx)
+    Ops[Idx] = UndefVal;
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &Ops[0], WidenNumElts);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
new file mode 100644
index 000000000000..473e1384e399
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -0,0 +1,655 @@
+//===- ResourcePriorityQueue.cpp - A DFA-oriented priority queue -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ResourcePriorityQueue class, which is a
+// SchedulingPriorityQueue that prioritizes instructions using DFA state to
+// reduce the length of the critical path through the basic block
+// on VLIW platforms.
+// The scheduler is basically a top-down adaptable list scheduler with DFA
+// resource tracking added to the cost function.
+// DFA is queried as a state machine to model "packets/bundles" during
+// schedule. Currently packets/bundles are discarded at the end of
+// scheduling, affecting only order of instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scheduler"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisableDFASched("disable-dfa-sched", cl::Hidden,
+  cl::ZeroOrMore, cl::init(false),
+  cl::desc("Disable use of DFA during scheduling"));
+
+static cl::opt<signed> RegPressureThreshold(
+  "dfa-sched-reg-pressure-threshold", cl::Hidden, cl::ZeroOrMore, cl::init(5),
+  cl::desc("Track reg pressure and switch priority to in-depth"));
+
+
+ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) :
+  Picker(this),
+  InstrItins(IS->getTargetLowering().getTargetMachine().getInstrItineraryData())
+{
+   TII = IS->getTargetLowering().getTargetMachine().getInstrInfo();
+   TRI = IS->getTargetLowering().getTargetMachine().getRegisterInfo();
+   TLI = &IS->getTargetLowering();
+
+   const TargetMachine &tm = (*IS->MF).getTarget();
+   ResourcesModel = tm.getInstrInfo()->CreateTargetScheduleState(&tm,NULL);
+   // This hard requirement could be relaxed, but for now
+   // do not let it procede.
+   assert (ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+   unsigned NumRC = TRI->getNumRegClasses();
+   RegLimit.resize(NumRC);
+   RegPressure.resize(NumRC);
+   std::fill(RegLimit.begin(), RegLimit.end(), 0);
+   std::fill(RegPressure.begin(), RegPressure.end(), 0);
+   for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+        E = TRI->regclass_end(); I != E; ++I)
+     RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF);
+
+   ParallelLiveRanges = 0;
+   HorizontalVerticalBalance = 0;
+}
+
+unsigned
+ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) {
+  unsigned NumberDeps = 0;
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl())
+      continue;
+
+    SUnit *PredSU = I->getSUnit();
+    const SDNode *ScegN = PredSU->getNode();
+
+    if (!ScegN)
+      continue;
+
+    // If value is passed to CopyToReg, it is probably
+    // live outside BB.
+    switch (ScegN->getOpcode()) {
+      default:  break;
+      case ISD::TokenFactor:    break;
+      case ISD::CopyFromReg:    NumberDeps++;  break;
+      case ISD::CopyToReg:      break;
+      case ISD::INLINEASM:      break;
+    }
+    if (!ScegN->isMachineOpcode())
+      continue;
+
+    for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) {
+      MVT VT = ScegN->getSimpleValueType(i);
+      if (TLI->isTypeLegal(VT)
+          && (TLI->getRegClassFor(VT)->getID() == RCId)) {
+        NumberDeps++;
+        break;
+      }
+    }
+  }
+  return NumberDeps;
+}
+
+unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU,
+                                                    unsigned RCId) {
+  unsigned NumberDeps = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isCtrl())
+      continue;
+
+    SUnit *SuccSU = I->getSUnit();
+    const SDNode *ScegN = SuccSU->getNode();
+    if (!ScegN)
+      continue;
+
+    // If value is passed to CopyToReg, it is probably
+    // live outside BB.
+    switch (ScegN->getOpcode()) {
+      default:  break;
+      case ISD::TokenFactor:    break;
+      case ISD::CopyFromReg:    break;
+      case ISD::CopyToReg:      NumberDeps++;  break;
+      case ISD::INLINEASM:      break;
+    }
+    if (!ScegN->isMachineOpcode())
+      continue;
+
+    for (unsigned i = 0, e = ScegN->getNumOperands(); i != e; ++i) {
+      const SDValue &Op = ScegN->getOperand(i);
+      MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+      if (TLI->isTypeLegal(VT)
+          && (TLI->getRegClassFor(VT)->getID() == RCId)) {
+        NumberDeps++;
+        break;
+      }
+    }
+  }
+  return NumberDeps;
+}
+
+static unsigned numberCtrlDepsInSU(SUnit *SU) {
+  unsigned NumberDeps = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I)
+    if (I->isCtrl())
+      NumberDeps++;
+
+  return NumberDeps;
+}
+
+static unsigned numberCtrlPredInSU(SUnit *SU) {
+  unsigned NumberDeps = 0;
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I)
+    if (I->isCtrl())
+      NumberDeps++;
+
+  return NumberDeps;
+}
+
+///
+/// Initialize nodes.
+///
+void ResourcePriorityQueue::initNodes(std::vector<SUnit> &sunits) {
+  SUnits = &sunits;
+  NumNodesSolelyBlocking.resize(SUnits->size(), 0);
+
+  for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
+    SUnit *SU = &(*SUnits)[i];
+    initNumRegDefsLeft(SU);
+    SU->NodeQueueId = 0;
+  }
+}
+
+/// This heuristic is used if DFA scheduling is not desired
+/// for some VLIW platform.
+bool resource_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
+  // The isScheduleHigh flag allows nodes with wraparound dependencies that
+  // cannot easily be modeled as edges with latencies to be scheduled as
+  // soon as possible in a top-down schedule.
+  if (LHS->isScheduleHigh && !RHS->isScheduleHigh)
+    return false;
+
+  if (!LHS->isScheduleHigh && RHS->isScheduleHigh)
+    return true;
+
+  unsigned LHSNum = LHS->NodeNum;
+  unsigned RHSNum = RHS->NodeNum;
+
+  // The most important heuristic is scheduling the critical path.
+  unsigned LHSLatency = PQ->getLatency(LHSNum);
+  unsigned RHSLatency = PQ->getLatency(RHSNum);
+  if (LHSLatency < RHSLatency) return true;
+  if (LHSLatency > RHSLatency) return false;
+
+  // After that, if two nodes have identical latencies, look to see if one will
+  // unblock more other nodes than the other.
+  unsigned LHSBlocked = PQ->getNumSolelyBlockNodes(LHSNum);
+  unsigned RHSBlocked = PQ->getNumSolelyBlockNodes(RHSNum);
+  if (LHSBlocked < RHSBlocked) return true;
+  if (LHSBlocked > RHSBlocked) return false;
+
+  // Finally, just to provide a stable ordering, use the node number as a
+  // deciding factor.
+  return LHSNum < RHSNum;
+}
+
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+SUnit *ResourcePriorityQueue::getSingleUnscheduledPred(SUnit *SU) {
+  SUnit *OnlyAvailablePred = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    SUnit &Pred = *I->getSUnit();
+    if (!Pred.isScheduled) {
+      // We found an available, but not scheduled, predecessor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+        return 0;
+      OnlyAvailablePred = &Pred;
+    }
+  }
+  return OnlyAvailablePred;
+}
+
+void ResourcePriorityQueue::push(SUnit *SU) {
+  // Look at all of the successors of this node.  Count the number of nodes that
+  // this node is the sole unscheduled node for.
+  unsigned NumNodesBlocking = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I)
+    if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+      ++NumNodesBlocking;
+
+  NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking;
+  Queue.push_back(SU);
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) {
+  if (!SU || !SU->getNode())
+    return false;
+
+  // If this is a compound instruction,
+  // it is likely to be a call. Do not delay it.
+  if (SU->getNode()->getGluedNode())
+    return true;
+
+  // First see if the pipeline could receive this instruction
+  // in the current cycle.
+  if (SU->getNode()->isMachineOpcode())
+    switch (SU->getNode()->getMachineOpcode()) {
+    default:
+      if (!ResourcesModel->canReserveResources(&TII->get(
+          SU->getNode()->getMachineOpcode())))
+           return false;
+    case TargetOpcode::EXTRACT_SUBREG:
+    case TargetOpcode::INSERT_SUBREG:
+    case TargetOpcode::SUBREG_TO_REG:
+    case TargetOpcode::REG_SEQUENCE:
+    case TargetOpcode::IMPLICIT_DEF:
+        break;
+    }
+
+  // Now see if there are no other dependencies
+  // to instructions alredy in the packet.
+  for (unsigned i = 0, e = Packet.size(); i != e; ++i)
+    for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
+         E = Packet[i]->Succs.end(); I != E; ++I) {
+      // Since we do not add pseudos to packets, might as well
+      // ignor order deps.
+      if (I->isCtrl())
+        continue;
+
+      if (I->getSUnit() == SU)
+        return false;
+    }
+
+  return true;
+}
+
+/// Keep track of available resources.
+void ResourcePriorityQueue::reserveResources(SUnit *SU) {
+  // If this SU does not fit in the packet
+  // start a new one.
+  if (!isResourceAvailable(SU) || SU->getNode()->getGluedNode()) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+  }
+
+  if (SU->getNode() && SU->getNode()->isMachineOpcode()) {
+    switch (SU->getNode()->getMachineOpcode()) {
+    default:
+      ResourcesModel->reserveResources(&TII->get(
+        SU->getNode()->getMachineOpcode()));
+      break;
+    case TargetOpcode::EXTRACT_SUBREG:
+    case TargetOpcode::INSERT_SUBREG:
+    case TargetOpcode::SUBREG_TO_REG:
+    case TargetOpcode::REG_SEQUENCE:
+    case TargetOpcode::IMPLICIT_DEF:
+      break;
+    }
+    Packet.push_back(SU);
+  }
+  // Forcefully end packet for PseudoOps.
+  else {
+    ResourcesModel->clearResources();
+    Packet.clear();
+  }
+
+  // If packet is now full, reset the state so in the next cycle
+  // we start fresh.
+  if (Packet.size() >= InstrItins->SchedModel->IssueWidth) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+  }
+}
+
+signed ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) {
+  signed RegBalance    = 0;
+
+  if (!SU || !SU->getNode() || !SU->getNode()->isMachineOpcode())
+    return RegBalance;
+
+  // Gen estimate.
+  for (unsigned i = 0, e = SU->getNode()->getNumValues(); i != e; ++i) {
+      MVT VT = SU->getNode()->getSimpleValueType(i);
+      if (TLI->isTypeLegal(VT)
+          && TLI->getRegClassFor(VT)
+          && TLI->getRegClassFor(VT)->getID() == RCId)
+        RegBalance += numberRCValSuccInSU(SU, RCId);
+  }
+  // Kill estimate.
+  for (unsigned i = 0, e = SU->getNode()->getNumOperands(); i != e; ++i) {
+      const SDValue &Op = SU->getNode()->getOperand(i);
+      MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+      if (isa<ConstantSDNode>(Op.getNode()))
+        continue;
+
+      if (TLI->isTypeLegal(VT) && TLI->getRegClassFor(VT)
+          && TLI->getRegClassFor(VT)->getID() == RCId)
+        RegBalance -= numberRCValPredInSU(SU, RCId);
+  }
+  return RegBalance;
+}
+
+/// Estimates change in reg pressure from this SU.
+/// It is achieved by trivial tracking of defined
+/// and used vregs in dependent instructions.
+/// The RawPressure flag makes this function to ignore
+/// existing reg file sizes, and report raw def/use
+/// balance.
+signed ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
+  signed RegBalance    = 0;
+
+  if (!SU || !SU->getNode() || !SU->getNode()->isMachineOpcode())
+    return RegBalance;
+
+  if (RawPressure) {
+    for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+             E = TRI->regclass_end(); I != E; ++I) {
+      const TargetRegisterClass *RC = *I;
+      RegBalance += rawRegPressureDelta(SU, RC->getID());
+    }
+  }
+  else {
+    for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+         E = TRI->regclass_end(); I != E; ++I) {
+      const TargetRegisterClass *RC = *I;
+      if ((RegPressure[RC->getID()] +
+           rawRegPressureDelta(SU, RC->getID()) > 0) &&
+          (RegPressure[RC->getID()] +
+           rawRegPressureDelta(SU, RC->getID())  >= RegLimit[RC->getID()]))
+        RegBalance += rawRegPressureDelta(SU, RC->getID());
+    }
+  }
+
+  return RegBalance;
+}
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+static const unsigned PriorityOne = 200;
+static const unsigned PriorityTwo = 100;
+static const unsigned PriorityThree = 50;
+static const unsigned PriorityFour = 15;
+static const unsigned PriorityFive = 5;
+static const unsigned ScaleOne = 20;
+static const unsigned ScaleTwo = 10;
+static const unsigned ScaleThree = 5;
+static const unsigned FactorOne = 2;
+
+/// Returns single number reflecting benefit of scheduling SU
+/// in the current cycle.
+signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
+  // Initial trivial priority.
+  signed ResCount = 1;
+
+  // Do not waste time on a node that is already scheduled.
+  if (SU->isScheduled)
+    return ResCount;
+
+  // Forced priority is high.
+  if (SU->isScheduleHigh)
+    ResCount += PriorityOne;
+
+  // Adaptable scheduling
+  // A small, but very parallel
+  // region, where reg pressure is an issue.
+  if (HorizontalVerticalBalance > RegPressureThreshold) {
+    // Critical path first
+    ResCount += (SU->getHeight() * ScaleTwo);
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (isResourceAvailable(SU))
+      ResCount <<= FactorOne;
+
+    // Consider change to reg pressure from scheduling
+    // this SU.
+    ResCount -= (regPressureDelta(SU,true) * ScaleOne);
+  }
+  // Default heuristic, greeady and
+  // critical path driven.
+  else {
+    // Critical path first.
+    ResCount += (SU->getHeight() * ScaleTwo);
+    // Now see how many instructions is blocked by this SU.
+    ResCount += (NumNodesSolelyBlocking[SU->NodeNum] * ScaleTwo);
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (isResourceAvailable(SU))
+      ResCount <<= FactorOne;
+
+    ResCount -= (regPressureDelta(SU) * ScaleTwo);
+  }
+
+  // These are platform specific things.
+  // Will need to go into the back end
+  // and accessed from here via a hook.
+  for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
+    if (N->isMachineOpcode()) {
+      const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
+      if (TID.isCall())
+        ResCount += (PriorityThree + (ScaleThree*N->getNumValues()));
+    }
+    else
+      switch (N->getOpcode()) {
+      default:  break;
+      case ISD::TokenFactor:
+      case ISD::CopyFromReg:
+      case ISD::CopyToReg:
+        ResCount += PriorityFive;
+        break;
+
+      case ISD::INLINEASM:
+        ResCount += PriorityFour;
+        break;
+      }
+  }
+  return ResCount;
+}
+
+
+/// Main resource tracking point.
+void ResourcePriorityQueue::scheduledNode(SUnit *SU) {
+  // Use NULL entry as an event marker to reset
+  // the DFA state.
+  if (!SU) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+    return;
+  }
+
+  const SDNode *ScegN = SU->getNode();
+  // Update reg pressure tracking.
+  // First update current node.
+  if (ScegN->isMachineOpcode()) {
+    // Estimate generated regs.
+    for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) {
+      MVT VT = ScegN->getSimpleValueType(i);
+
+      if (TLI->isTypeLegal(VT)) {
+        const TargetRegisterClass *RC = TLI->getRegClassFor(VT);
+        if (RC)
+          RegPressure[RC->getID()] += numberRCValSuccInSU(SU, RC->getID());
+      }
+    }
+    // Estimate killed regs.
+    for (unsigned i = 0, e = ScegN->getNumOperands(); i != e; ++i) {
+      const SDValue &Op = ScegN->getOperand(i);
+      MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+
+      if (TLI->isTypeLegal(VT)) {
+        const TargetRegisterClass *RC = TLI->getRegClassFor(VT);
+        if (RC) {
+          if (RegPressure[RC->getID()] >
+            (numberRCValPredInSU(SU, RC->getID())))
+            RegPressure[RC->getID()] -= numberRCValPredInSU(SU, RC->getID());
+          else RegPressure[RC->getID()] = 0;
+        }
+      }
+    }
+    for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+                              I != E; ++I) {
+      if (I->isCtrl() || (I->getSUnit()->NumRegDefsLeft == 0))
+        continue;
+      --I->getSUnit()->NumRegDefsLeft;
+    }
+  }
+
+  // Reserve resources for this SU.
+  reserveResources(SU);
+
+  // Adjust number of parallel live ranges.
+  // Heuristic is simple - node with no data successors reduces
+  // number of live ranges. All others, increase it.
+  unsigned NumberNonControlDeps = 0;
+
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+                                  I != E; ++I) {
+    adjustPriorityOfUnscheduledPreds(I->getSUnit());
+    if (!I->isCtrl())
+      NumberNonControlDeps++;
+  }
+
+  if (!NumberNonControlDeps) {
+    if (ParallelLiveRanges >= SU->NumPreds)
+      ParallelLiveRanges -= SU->NumPreds;
+    else
+      ParallelLiveRanges = 0;
+
+  }
+  else
+    ParallelLiveRanges += SU->NumRegDefsLeft;
+
+  // Track parallel live chains.
+  HorizontalVerticalBalance += (SU->Succs.size() - numberCtrlDepsInSU(SU));
+  HorizontalVerticalBalance -= (SU->Preds.size() - numberCtrlPredInSU(SU));
+}
+
+void ResourcePriorityQueue::initNumRegDefsLeft(SUnit *SU) {
+  unsigned  NodeNumDefs = 0;
+  for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
+    if (N->isMachineOpcode()) {
+      const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
+      // No register need be allocated for this.
+      if (N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
+        NodeNumDefs = 0;
+        break;
+      }
+      NodeNumDefs = std::min(N->getNumValues(), TID.getNumDefs());
+    }
+    else
+      switch(N->getOpcode()) {
+        default:     break;
+        case ISD::CopyFromReg:
+          NodeNumDefs++;
+          break;
+        case ISD::INLINEASM:
+          NodeNumDefs++;
+          break;
+      }
+
+  SU->NumRegDefsLeft = NodeNumDefs;
+}
+
+/// adjustPriorityOfUnscheduledPreds - One of the predecessors of SU was just
+/// scheduled.  If SU is not itself available, then there is at least one
+/// predecessor node that has not been scheduled yet.  If SU has exactly ONE
+/// unscheduled predecessor, we want to increase its priority: it getting
+/// scheduled will make this node available, so it is better than some other
+/// node of the same priority that will not make a node available.
+void ResourcePriorityQueue::adjustPriorityOfUnscheduledPreds(SUnit *SU) {
+  if (SU->isAvailable) return;  // All preds scheduled.
+
+  SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU);
+  if (OnlyAvailablePred == 0 || !OnlyAvailablePred->isAvailable)
+    return;
+
+  // Okay, we found a single predecessor that is available, but not scheduled.
+  // Since it is available, it must be in the priority queue.  First remove it.
+  remove(OnlyAvailablePred);
+
+  // Reinsert the node into the priority queue, which recomputes its
+  // NumNodesSolelyBlocking value.
+  push(OnlyAvailablePred);
+}
+
+
+/// Main access point - returns next instructions
+/// to be placed in scheduling sequence.
+SUnit *ResourcePriorityQueue::pop() {
+  if (empty())
+    return 0;
+
+  std::vector<SUnit *>::iterator Best = Queue.begin();
+  if (!DisableDFASched) {
+    signed BestCost = SUSchedulingCost(*Best);
+    for (std::vector<SUnit *>::iterator I = llvm::next(Queue.begin()),
+           E = Queue.end(); I != E; ++I) {
+
+      if (SUSchedulingCost(*I) > BestCost) {
+        BestCost = SUSchedulingCost(*I);
+        Best = I;
+      }
+    }
+  }
+  // Use default TD scheduling mechanism.
+  else {
+    for (std::vector<SUnit *>::iterator I = llvm::next(Queue.begin()),
+       E = Queue.end(); I != E; ++I)
+      if (Picker(*Best, *I))
+        Best = I;
+  }
+
+  SUnit *V = *Best;
+  if (Best != prior(Queue.end()))
+    std::swap(*Best, Queue.back());
+
+  Queue.pop_back();
+
+  return V;
+}
+
+
+void ResourcePriorityQueue::remove(SUnit *SU) {
+  assert(!Queue.empty() && "Queue is empty!");
+  std::vector<SUnit *>::iterator I = std::find(Queue.begin(), Queue.end(), SU);
+  if (I != prior(Queue.end()))
+    std::swap(*I, Queue.back());
+
+  Queue.pop_back();
+}
+
+
+#ifdef NDEBUG
+void ResourcePriorityQueue::dump(ScheduleDAG *DAG) const {}
+#else
+void ResourcePriorityQueue::dump(ScheduleDAG *DAG) const {
+  ResourcePriorityQueue q = *this;
+  while (!q.empty()) {
+    SUnit *su = q.pop();
+    dbgs() << "Height " << su->getHeight() << ": ";
+    su->dump(DAG);
+  }
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
new file mode 100644
index 000000000000..4af7172847d7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
@@ -0,0 +1,114 @@
+//===-- llvm/CodeGen/SDNodeDbgValue.h - SelectionDAG dbg_value --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SDDbgValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SDNODEDBGVALUE_H
+#define LLVM_CODEGEN_SDNODEDBGVALUE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/DebugLoc.h"
+
+namespace llvm {
+
+class MDNode;
+class SDNode;
+class Value;
+
+/// SDDbgValue - Holds the information from a dbg_value node through SDISel.
+/// We do not use SDValue here to avoid including its header.
+
+class SDDbgValue {
+public:
+  enum DbgValueKind {
+    SDNODE = 0,             // value is the result of an expression
+    CONST = 1,              // value is a constant
+    FRAMEIX = 2             // value is contents of a stack location
+  };
+private:
+  enum DbgValueKind kind;
+  union {
+    struct {
+      SDNode *Node;         // valid for expressions
+      unsigned ResNo;       // valid for expressions
+    } s;
+    const Value *Const;     // valid for constants
+    unsigned FrameIx;       // valid for stack objects
+  } u;
+  MDNode *mdPtr;
+  uint64_t Offset;
+  DebugLoc DL;
+  unsigned Order;
+  bool Invalid;
+public:
+  // Constructor for non-constants.
+  SDDbgValue(MDNode *mdP, SDNode *N, unsigned R, uint64_t off, DebugLoc dl,
+             unsigned O) : mdPtr(mdP), Offset(off), DL(dl), Order(O),
+                           Invalid(false) {
+    kind = SDNODE;
+    u.s.Node = N;
+    u.s.ResNo = R;
+  }
+
+  // Constructor for constants.
+  SDDbgValue(MDNode *mdP, const Value *C, uint64_t off, DebugLoc dl,
+             unsigned O) : 
+    mdPtr(mdP), Offset(off), DL(dl), Order(O), Invalid(false) {
+    kind = CONST;
+    u.Const = C;
+  }
+
+  // Constructor for frame indices.
+  SDDbgValue(MDNode *mdP, unsigned FI, uint64_t off, DebugLoc dl, unsigned O) : 
+    mdPtr(mdP), Offset(off), DL(dl), Order(O), Invalid(false) {
+    kind = FRAMEIX;
+    u.FrameIx = FI;
+  }
+
+  // Returns the kind.
+  DbgValueKind getKind() { return kind; }
+
+  // Returns the MDNode pointer.
+  MDNode *getMDPtr() { return mdPtr; }
+
+  // Returns the SDNode* for a register ref
+  SDNode *getSDNode() { assert (kind==SDNODE); return u.s.Node; }
+
+  // Returns the ResNo for a register ref
+  unsigned getResNo() { assert (kind==SDNODE); return u.s.ResNo; }
+
+  // Returns the Value* for a constant
+  const Value *getConst() { assert (kind==CONST); return u.Const; }
+
+  // Returns the FrameIx for a stack object
+  unsigned getFrameIx() { assert (kind==FRAMEIX); return u.FrameIx; }
+
+  // Returns the offset.
+  uint64_t getOffset() { return Offset; }
+
+  // Returns the DebugLoc.
+  DebugLoc getDebugLoc() { return DL; }
+
+  // Returns the SDNodeOrder.  This is the order of the preceding node in the
+  // input.
+  unsigned getOrder() { return Order; }
+
+  // setIsInvalidated / isInvalidated - Setter / getter of the "Invalidated"
+  // property. A SDDbgValue is invalid if the SDNode that produces the value is
+  // deleted.
+  void setIsInvalidated() { Invalid = true; }
+  bool isInvalidated() { return Invalid; }
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeOrdering.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
new file mode 100644
index 000000000000..7e7b8974be48
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
@@ -0,0 +1,56 @@
+//===-- llvm/CodeGen/SDNodeOrdering.h - SDNode Ordering ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SDNodeOrdering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SDNODEORDERING_H
+#define LLVM_CODEGEN_SDNODEORDERING_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+class SDNode;
+
+/// SDNodeOrdering - Maps a unique (monotonically increasing) value to each
+/// SDNode that roughly corresponds to the ordering of the original LLVM
+/// instruction. This is used for turning off scheduling, because we'll forgo
+/// the normal scheduling algorithms and output the instructions according to
+/// this ordering.
+class SDNodeOrdering {
+  DenseMap<const SDNode*, unsigned> OrderMap;
+
+  void operator=(const SDNodeOrdering&) LLVM_DELETED_FUNCTION;
+  SDNodeOrdering(const SDNodeOrdering&) LLVM_DELETED_FUNCTION;
+public:
+  SDNodeOrdering() {}
+
+  void add(const SDNode *Node, unsigned NewOrder) {
+    unsigned &OldOrder = OrderMap[Node];
+    if (OldOrder == 0 || (OldOrder > 0 && NewOrder < OldOrder))
+      OldOrder = NewOrder;
+  }
+  void remove(const SDNode *Node) {
+    DenseMap<const SDNode*, unsigned>::iterator Itr = OrderMap.find(Node);
+    if (Itr != OrderMap.end())
+      OrderMap.erase(Itr);
+  }
+  void clear() {
+    OrderMap.clear();
+  }
+  unsigned getOrder(const SDNode *Node) {
+    return OrderMap[Node];
+  }
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
new file mode 100644
index 000000000000..d1f36cb647dc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -0,0 +1,799 @@
+//===----- ScheduleDAGFast.cpp - Fast poor list scheduler -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a fast scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "InstrEmitter.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+STATISTIC(NumUnfolds,    "Number of nodes unfolded");
+STATISTIC(NumDups,       "Number of duplicated nodes");
+STATISTIC(NumPRCopies,   "Number of physical copies");
+
+static RegisterScheduler
+  fastDAGScheduler("fast", "Fast suboptimal list scheduling",
+                   createFastDAGScheduler);
+static RegisterScheduler
+  linearizeDAGScheduler("linearize", "Linearize DAG, no scheduling",
+                        createDAGLinearizer);
+
+
+namespace {
+  /// FastPriorityQueue - A degenerate priority queue that considers
+  /// all nodes to have the same priority.
+  ///
+  struct FastPriorityQueue {
+    SmallVector<SUnit *, 16> Queue;
+
+    bool empty() const { return Queue.empty(); }
+
+    void push(SUnit *U) {
+      Queue.push_back(U);
+    }
+
+    SUnit *pop() {
+      if (empty()) return NULL;
+      SUnit *V = Queue.back();
+      Queue.pop_back();
+      return V;
+    }
+  };
+
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGFast - The actual "fast" list scheduler implementation.
+///
+class ScheduleDAGFast : public ScheduleDAGSDNodes {
+private:
+  /// AvailableQueue - The priority queue to use for the available SUnits.
+  FastPriorityQueue AvailableQueue;
+
+  /// LiveRegDefs - A set of physical registers and their definition
+  /// that are "live". These nodes must be scheduled before any other nodes that
+  /// modifies the registers can be scheduled.
+  unsigned NumLiveRegs;
+  std::vector<SUnit*> LiveRegDefs;
+  std::vector<unsigned> LiveRegCycles;
+
+public:
+  ScheduleDAGFast(MachineFunction &mf)
+    : ScheduleDAGSDNodes(mf) {}
+
+  void Schedule();
+
+  /// AddPred - adds a predecessor edge to SUnit SU.
+  /// This returns true if this is a new predecessor.
+  void AddPred(SUnit *SU, const SDep &D) {
+    SU->addPred(D);
+  }
+
+  /// RemovePred - removes a predecessor edge from SUnit SU.
+  /// This returns true if an edge was removed.
+  void RemovePred(SUnit *SU, const SDep &D) {
+    SU->removePred(D);
+  }
+
+private:
+  void ReleasePred(SUnit *SU, SDep *PredEdge);
+  void ReleasePredecessors(SUnit *SU, unsigned CurCycle);
+  void ScheduleNodeBottomUp(SUnit*, unsigned);
+  SUnit *CopyAndMoveSuccessors(SUnit*);
+  void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+                                const TargetRegisterClass*,
+                                const TargetRegisterClass*,
+                                SmallVector<SUnit*, 2>&);
+  bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&);
+  void ListScheduleBottomUp();
+
+  /// forceUnitLatencies - The fast scheduler doesn't care about real latencies.
+  bool forceUnitLatencies() const { return true; }
+};
+}  // end anonymous namespace
+
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGFast::Schedule() {
+  DEBUG(dbgs() << "********** List Scheduling **********\n");
+
+  NumLiveRegs = 0;
+  LiveRegDefs.resize(TRI->getNumRegs(), NULL);
+  LiveRegCycles.resize(TRI->getNumRegs(), 0);
+
+  // Build the scheduling graph.
+  BuildSchedGraph(NULL);
+
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+
+  // Execute the actual scheduling loop.
+  ListScheduleBottomUp();
+}
+
+//===----------------------------------------------------------------------===//
+//  Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGFast::ReleasePred(SUnit *SU, SDep *PredEdge) {
+  SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+  if (PredSU->NumSuccsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    PredSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --PredSU->NumSuccsLeft;
+
+  // If all the node's successors are scheduled, this node is ready
+  // to be scheduled. Ignore the special EntrySU node.
+  if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+    PredSU->isAvailable = true;
+    AvailableQueue.push(PredSU);
+  }
+}
+
+void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
+  // Bottom up: release predecessors
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    ReleasePred(SU, &*I);
+    if (I->isAssignedRegDep()) {
+      // This is a physical register dependency and it's impossible or
+      // expensive to copy the register. Make sure nothing that can
+      // clobber the register is scheduled between the predecessor and
+      // this node.
+      if (!LiveRegDefs[I->getReg()]) {
+        ++NumLiveRegs;
+        LiveRegDefs[I->getReg()] = I->getSUnit();
+        LiveRegCycles[I->getReg()] = CurCycle;
+      }
+    }
+  }
+}
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
+  DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(SU->dump(this));
+
+  assert(CurCycle >= SU->getHeight() && "Node scheduled below its height!");
+  SU->setHeightToAtLeast(CurCycle);
+  Sequence.push_back(SU);
+
+  ReleasePredecessors(SU, CurCycle);
+
+  // Release all the implicit physical register defs that are live.
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isAssignedRegDep()) {
+      if (LiveRegCycles[I->getReg()] == I->getSUnit()->getHeight()) {
+        assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+        assert(LiveRegDefs[I->getReg()] == SU &&
+               "Physical register dependency violated?");
+        --NumLiveRegs;
+        LiveRegDefs[I->getReg()] = NULL;
+        LiveRegCycles[I->getReg()] = 0;
+      }
+    }
+  }
+
+  SU->isScheduled = true;
+}
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
+SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
+  if (SU->getNode()->getGluedNode())
+    return NULL;
+
+  SDNode *N = SU->getNode();
+  if (!N)
+    return NULL;
+
+  SUnit *NewSU;
+  bool TryUnfold = false;
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT == MVT::Glue)
+      return NULL;
+    else if (VT == MVT::Other)
+      TryUnfold = true;
+  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    const SDValue &Op = N->getOperand(i);
+    EVT VT = Op.getNode()->getValueType(Op.getResNo());
+    if (VT == MVT::Glue)
+      return NULL;
+  }
+
+  if (TryUnfold) {
+    SmallVector<SDNode*, 2> NewNodes;
+    if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+      return NULL;
+
+    DEBUG(dbgs() << "Unfolding SU # " << SU->NodeNum << "\n");
+    assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+    N = NewNodes[1];
+    SDNode *LoadNode = NewNodes[0];
+    unsigned NumVals = N->getNumValues();
+    unsigned OldNumVals = SU->getNode()->getNumValues();
+    for (unsigned i = 0; i != NumVals; ++i)
+      DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+    DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
+                                   SDValue(LoadNode, 1));
+
+    SUnit *NewSU = newSUnit(N);
+    assert(N->getNodeId() == -1 && "Node already inserted!");
+    N->setNodeId(NewSU->NodeNum);
+
+    const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+    for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+      if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+        NewSU->isTwoAddress = true;
+        break;
+      }
+    }
+    if (MCID.isCommutable())
+      NewSU->isCommutable = true;
+
+    // LoadNode may already exist. This can happen when there is another
+    // load from the same location and producing the same type of value
+    // but it has different alignment or volatileness.
+    bool isNewLoad = true;
+    SUnit *LoadSU;
+    if (LoadNode->getNodeId() != -1) {
+      LoadSU = &SUnits[LoadNode->getNodeId()];
+      isNewLoad = false;
+    } else {
+      LoadSU = newSUnit(LoadNode);
+      LoadNode->setNodeId(LoadSU->NodeNum);
+    }
+
+    SDep ChainPred;
+    SmallVector<SDep, 4> ChainSuccs;
+    SmallVector<SDep, 4> LoadPreds;
+    SmallVector<SDep, 4> NodePreds;
+    SmallVector<SDep, 4> NodeSuccs;
+    for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I) {
+      if (I->isCtrl())
+        ChainPred = *I;
+      else if (I->getSUnit()->getNode() &&
+               I->getSUnit()->getNode()->isOperandOf(LoadNode))
+        LoadPreds.push_back(*I);
+      else
+        NodePreds.push_back(*I);
+    }
+    for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+         I != E; ++I) {
+      if (I->isCtrl())
+        ChainSuccs.push_back(*I);
+      else
+        NodeSuccs.push_back(*I);
+    }
+
+    if (ChainPred.getSUnit()) {
+      RemovePred(SU, ChainPred);
+      if (isNewLoad)
+        AddPred(LoadSU, ChainPred);
+    }
+    for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
+      const SDep &Pred = LoadPreds[i];
+      RemovePred(SU, Pred);
+      if (isNewLoad) {
+        AddPred(LoadSU, Pred);
+      }
+    }
+    for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
+      const SDep &Pred = NodePreds[i];
+      RemovePred(SU, Pred);
+      AddPred(NewSU, Pred);
+    }
+    for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
+      SDep D = NodeSuccs[i];
+      SUnit *SuccDep = D.getSUnit();
+      D.setSUnit(SU);
+      RemovePred(SuccDep, D);
+      D.setSUnit(NewSU);
+      AddPred(SuccDep, D);
+    }
+    for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
+      SDep D = ChainSuccs[i];
+      SUnit *SuccDep = D.getSUnit();
+      D.setSUnit(SU);
+      RemovePred(SuccDep, D);
+      if (isNewLoad) {
+        D.setSUnit(LoadSU);
+        AddPred(SuccDep, D);
+      }
+    }
+    if (isNewLoad) {
+      SDep D(LoadSU, SDep::Barrier);
+      D.setLatency(LoadSU->Latency);
+      AddPred(NewSU, D);
+    }
+
+    ++NumUnfolds;
+
+    if (NewSU->NumSuccsLeft == 0) {
+      NewSU->isAvailable = true;
+      return NewSU;
+    }
+    SU = NewSU;
+  }
+
+  DEBUG(dbgs() << "Duplicating SU # " << SU->NodeNum << "\n");
+  NewSU = Clone(SU);
+
+  // New SUnit has the exact same predecessors.
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I)
+    if (!I->isArtificial())
+      AddPred(NewSU, *I);
+
+  // Only copy scheduled successors. Cut them from old node's successor
+  // list and move them over.
+  SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isArtificial())
+      continue;
+    SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->isScheduled) {
+      SDep D = *I;
+      D.setSUnit(NewSU);
+      AddPred(SuccSU, D);
+      D.setSUnit(SU);
+      DelDeps.push_back(std::make_pair(SuccSU, D));
+    }
+  }
+  for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+    RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+  ++NumDups;
+  return NewSU;
+}
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+                                              const TargetRegisterClass *DestRC,
+                                              const TargetRegisterClass *SrcRC,
+                                               SmallVector<SUnit*, 2> &Copies) {
+  SUnit *CopyFromSU = newSUnit(static_cast<SDNode *>(NULL));
+  CopyFromSU->CopySrcRC = SrcRC;
+  CopyFromSU->CopyDstRC = DestRC;
+
+  SUnit *CopyToSU = newSUnit(static_cast<SDNode *>(NULL));
+  CopyToSU->CopySrcRC = DestRC;
+  CopyToSU->CopyDstRC = SrcRC;
+
+  // Only copy scheduled successors. Cut them from old node's successor
+  // list and move them over.
+  SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isArtificial())
+      continue;
+    SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->isScheduled) {
+      SDep D = *I;
+      D.setSUnit(CopyToSU);
+      AddPred(SuccSU, D);
+      DelDeps.push_back(std::make_pair(SuccSU, *I));
+    }
+  }
+  for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) {
+    RemovePred(DelDeps[i].first, DelDeps[i].second);
+  }
+  SDep FromDep(SU, SDep::Data, Reg);
+  FromDep.setLatency(SU->Latency);
+  AddPred(CopyFromSU, FromDep);
+  SDep ToDep(CopyFromSU, SDep::Data, 0);
+  ToDep.setLatency(CopyFromSU->Latency);
+  AddPred(CopyToSU, ToDep);
+
+  Copies.push_back(CopyFromSU);
+  Copies.push_back(CopyToSU);
+
+  ++NumPRCopies;
+}
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+                                 const TargetInstrInfo *TII) {
+  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+  assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+  unsigned NumRes = MCID.getNumDefs();
+  for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+    if (Reg == *ImpDef)
+      break;
+    ++NumRes;
+  }
+  return N->getValueType(NumRes);
+}
+
+/// CheckForLiveRegDef - Return true and update live register vector if the
+/// specified register def of the specified SUnit clobbers any "live" registers.
+static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg,
+                               std::vector<SUnit*> &LiveRegDefs,
+                               SmallSet<unsigned, 4> &RegAdded,
+                               SmallVector<unsigned, 4> &LRegs,
+                               const TargetRegisterInfo *TRI) {
+  bool Added = false;
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+    if (LiveRegDefs[*AI] && LiveRegDefs[*AI] != SU) {
+      if (RegAdded.insert(*AI)) {
+        LRegs.push_back(*AI);
+        Added = true;
+      }
+    }
+  }
+  return Added;
+}
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specific node is the last one that's available to schedule, do
+/// whatever is necessary (i.e. backtracking or cloning) to make it possible.
+bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
+                                               SmallVector<unsigned, 4> &LRegs){
+  if (NumLiveRegs == 0)
+    return false;
+
+  SmallSet<unsigned, 4> RegAdded;
+  // If this node would clobber any "live" register, then it's not ready.
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isAssignedRegDep()) {
+      CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs,
+                         RegAdded, LRegs, TRI);
+    }
+  }
+
+  for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) {
+    if (Node->getOpcode() == ISD::INLINEASM) {
+      // Inline asm can clobber physical defs.
+      unsigned NumOps = Node->getNumOperands();
+      if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+        --NumOps;  // Ignore the glue operand.
+
+      for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+        unsigned Flags =
+          cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+        unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+        ++i; // Skip the ID value.
+        if (InlineAsm::isRegDefKind(Flags) ||
+            InlineAsm::isRegDefEarlyClobberKind(Flags) ||
+            InlineAsm::isClobberKind(Flags)) {
+          // Check for def of register or earlyclobber register.
+          for (; NumVals; --NumVals, ++i) {
+            unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+            if (TargetRegisterInfo::isPhysicalRegister(Reg))
+              CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+          }
+        } else
+          i += NumVals;
+      }
+      continue;
+    }
+    if (!Node->isMachineOpcode())
+      continue;
+    const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
+    if (!MCID.ImplicitDefs)
+      continue;
+    for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) {
+      CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+    }
+  }
+  return !LRegs.empty();
+}
+
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGFast::ListScheduleBottomUp() {
+  unsigned CurCycle = 0;
+
+  // Release any predecessors of the special Exit node.
+  ReleasePredecessors(&ExitSU, CurCycle);
+
+  // Add root to Available queue.
+  if (!SUnits.empty()) {
+    SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+    assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+    RootSU->isAvailable = true;
+    AvailableQueue.push(RootSU);
+  }
+
+  // While Available queue is not empty, grab the node with the highest
+  // priority. If it is not ready put it back.  Schedule the node.
+  SmallVector<SUnit*, 4> NotReady;
+  DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMap;
+  Sequence.reserve(SUnits.size());
+  while (!AvailableQueue.empty()) {
+    bool Delayed = false;
+    LRegsMap.clear();
+    SUnit *CurSU = AvailableQueue.pop();
+    while (CurSU) {
+      SmallVector<unsigned, 4> LRegs;
+      if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+        break;
+      Delayed = true;
+      LRegsMap.insert(std::make_pair(CurSU, LRegs));
+
+      CurSU->isPending = true;  // This SU is not in AvailableQueue right now.
+      NotReady.push_back(CurSU);
+      CurSU = AvailableQueue.pop();
+    }
+
+    // All candidates are delayed due to live physical reg dependencies.
+    // Try code duplication or inserting cross class copies
+    // to resolve it.
+    if (Delayed && !CurSU) {
+      if (!CurSU) {
+        // Try duplicating the nodes that produces these
+        // "expensive to copy" values to break the dependency. In case even
+        // that doesn't work, insert cross class copies.
+        SUnit *TrySU = NotReady[0];
+        SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+        assert(LRegs.size() == 1 && "Can't handle this yet!");
+        unsigned Reg = LRegs[0];
+        SUnit *LRDef = LiveRegDefs[Reg];
+        EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+        const TargetRegisterClass *RC =
+          TRI->getMinimalPhysRegClass(Reg, VT);
+        const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+        // If cross copy register class is the same as RC, then it must be
+        // possible copy the value directly. Do not try duplicate the def.
+        // If cross copy register class is not the same as RC, then it's
+        // possible to copy the value but it require cross register class copies
+        // and it is expensive.
+        // If cross copy register class is null, then it's not possible to copy
+        // the value at all.
+        SUnit *NewDef = 0;
+        if (DestRC != RC) {
+          NewDef = CopyAndMoveSuccessors(LRDef);
+          if (!DestRC && !NewDef)
+            report_fatal_error("Can't handle live physical "
+                               "register dependency!");
+        }
+        if (!NewDef) {
+          // Issue copies, these can be expensive cross register class copies.
+          SmallVector<SUnit*, 2> Copies;
+          InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+          DEBUG(dbgs() << "Adding an edge from SU # " << TrySU->NodeNum
+                       << " to SU #" << Copies.front()->NodeNum << "\n");
+          AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
+          NewDef = Copies.back();
+        }
+
+        DEBUG(dbgs() << "Adding an edge from SU # " << NewDef->NodeNum
+                     << " to SU #" << TrySU->NodeNum << "\n");
+        LiveRegDefs[Reg] = NewDef;
+        AddPred(NewDef, SDep(TrySU, SDep::Artificial));
+        TrySU->isAvailable = false;
+        CurSU = NewDef;
+      }
+
+      if (!CurSU) {
+        llvm_unreachable("Unable to resolve live physical register dependencies!");
+      }
+    }
+
+    // Add the nodes that aren't ready back onto the available list.
+    for (unsigned i = 0, e = NotReady.size(); i != e; ++i) {
+      NotReady[i]->isPending = false;
+      // May no longer be available due to backtracking.
+      if (NotReady[i]->isAvailable)
+        AvailableQueue.push(NotReady[i]);
+    }
+    NotReady.clear();
+
+    if (CurSU)
+      ScheduleNodeBottomUp(CurSU, CurCycle);
+    ++CurCycle;
+  }
+
+  // Reverse the order since it is bottom up.
+  std::reverse(Sequence.begin(), Sequence.end());
+
+#ifndef NDEBUG
+  VerifyScheduledSequence(/*isBottomUp=*/true);
+#endif
+}
+
+
+namespace {
+//===----------------------------------------------------------------------===//
+// ScheduleDAGLinearize - No scheduling scheduler, it simply linearize the
+// DAG in topological order.
+// IMPORTANT: this may not work for targets with phyreg dependency.
+//
+class ScheduleDAGLinearize : public ScheduleDAGSDNodes {
+public:
+  ScheduleDAGLinearize(MachineFunction &mf) : ScheduleDAGSDNodes(mf) {}
+
+  void Schedule();
+
+  MachineBasicBlock *EmitSchedule(MachineBasicBlock::iterator &InsertPos);
+
+private:
+  std::vector<SDNode*> Sequence;
+  DenseMap<SDNode*, SDNode*> GluedMap;  // Cache glue to its user
+
+  void ScheduleNode(SDNode *N);
+};
+} // end anonymous namespace
+
+void ScheduleDAGLinearize::ScheduleNode(SDNode *N) {
+  if (N->getNodeId() != 0)
+    llvm_unreachable(0);
+
+  if (!N->isMachineOpcode() &&
+      (N->getOpcode() == ISD::EntryToken || isPassiveNode(N)))
+    // These nodes do not need to be translated into MIs.
+    return;
+
+  DEBUG(dbgs() << "\n*** Scheduling: ");
+  DEBUG(N->dump(DAG));
+  Sequence.push_back(N);
+
+  unsigned NumOps = N->getNumOperands();
+  if (unsigned NumLeft = NumOps) {
+    SDNode *GluedOpN = 0;
+    do {
+      const SDValue &Op = N->getOperand(NumLeft-1);
+      SDNode *OpN = Op.getNode();
+
+      if (NumLeft == NumOps && Op.getValueType() == MVT::Glue) {
+        // Schedule glue operand right above N.
+        GluedOpN = OpN;
+        assert(OpN->getNodeId() != 0 && "Glue operand not ready?");
+        OpN->setNodeId(0);
+        ScheduleNode(OpN);
+        continue;
+      }
+
+      if (OpN == GluedOpN)
+        // Glue operand is already scheduled.
+        continue;
+
+      DenseMap<SDNode*, SDNode*>::iterator DI = GluedMap.find(OpN);
+      if (DI != GluedMap.end() && DI->second != N)
+        // Users of glues are counted against the glued users.
+        OpN = DI->second;
+
+      unsigned Degree = OpN->getNodeId();
+      assert(Degree > 0 && "Predecessor over-released!");
+      OpN->setNodeId(--Degree);
+      if (Degree == 0)
+        ScheduleNode(OpN);
+    } while (--NumLeft);
+  }
+}
+
+/// findGluedUser - Find the representative use of a glue value by walking
+/// the use chain.
+static SDNode *findGluedUser(SDNode *N) {
+  while (SDNode *Glued = N->getGluedUser())
+    N = Glued;
+  return N;
+}
+
+void ScheduleDAGLinearize::Schedule() {
+  DEBUG(dbgs() << "********** DAG Linearization **********\n");
+
+  SmallVector<SDNode*, 8> Glues;
+  unsigned DAGSize = 0;
+  for (SelectionDAG::allnodes_iterator I = DAG->allnodes_begin(),
+         E = DAG->allnodes_end(); I != E; ++I) {
+    SDNode *N = I;
+
+    // Use node id to record degree.
+    unsigned Degree = N->use_size();
+    N->setNodeId(Degree);
+    unsigned NumVals = N->getNumValues();
+    if (NumVals && N->getValueType(NumVals-1) == MVT::Glue &&
+        N->hasAnyUseOfValue(NumVals-1)) {
+      SDNode *User = findGluedUser(N);
+      if (User) {
+        Glues.push_back(N);
+        GluedMap.insert(std::make_pair(N, User));
+      }
+    }
+
+    if (N->isMachineOpcode() ||
+        (N->getOpcode() != ISD::EntryToken && !isPassiveNode(N)))
+      ++DAGSize;
+  }
+
+  for (unsigned i = 0, e = Glues.size(); i != e; ++i) {
+    SDNode *Glue = Glues[i];
+    SDNode *GUser = GluedMap[Glue];
+    unsigned Degree = Glue->getNodeId();
+    unsigned UDegree = GUser->getNodeId();
+
+    // Glue user must be scheduled together with the glue operand. So other
+    // users of the glue operand must be treated as its users.
+    SDNode *ImmGUser = Glue->getGluedUser();
+    for (SDNode::use_iterator ui = Glue->use_begin(), ue = Glue->use_end();
+         ui != ue; ++ui)
+      if (*ui == ImmGUser)
+        --Degree;
+    GUser->setNodeId(UDegree + Degree);
+    Glue->setNodeId(1);
+  }
+
+  Sequence.reserve(DAGSize);
+  ScheduleNode(DAG->getRoot().getNode());
+}
+
+MachineBasicBlock*
+ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
+  InstrEmitter Emitter(BB, InsertPos);
+  DenseMap<SDValue, unsigned> VRBaseMap;
+
+  DEBUG({
+      dbgs() << "\n*** Final schedule ***\n";
+    });
+
+  // FIXME: Handle dbg_values.
+  unsigned NumNodes = Sequence.size();
+  for (unsigned i = 0; i != NumNodes; ++i) {
+    SDNode *N = Sequence[NumNodes-i-1];
+    DEBUG(N->dump(DAG));
+    Emitter.EmitNode(N, false, false, VRBaseMap);
+  }
+
+  DEBUG(dbgs() << '\n');
+
+  InsertPos = Emitter.getInsertPos();
+  return Emitter.getBlock();
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+llvm::ScheduleDAGSDNodes *
+llvm::createFastDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  return new ScheduleDAGFast(*IS->MF);
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createDAGLinearizer(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  return new ScheduleDAGLinearize(*IS->MF);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
new file mode 100644
index 000000000000..c009cfcc516d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -0,0 +1,3039 @@
+//===----- ScheduleDAGRRList.cpp - Reg pressure reduction list scheduler --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements bottom-up and top-down register pressure reduction list
+// schedulers, using standard algorithms.  The basic approach uses a priority
+// queue of available nodes to schedule.  One at a time, nodes are taken from
+// the priority queue (thus in priority order), checked for legality to
+// schedule, and emitted if legal.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <climits>
+using namespace llvm;
+
+STATISTIC(NumBacktracks, "Number of times scheduler backtracked");
+STATISTIC(NumUnfolds,    "Number of nodes unfolded");
+STATISTIC(NumDups,       "Number of duplicated nodes");
+STATISTIC(NumPRCopies,   "Number of physical register copies");
+
+static RegisterScheduler
+  burrListDAGScheduler("list-burr",
+                       "Bottom-up register reduction list scheduling",
+                       createBURRListDAGScheduler);
+static RegisterScheduler
+  sourceListDAGScheduler("source",
+                         "Similar to list-burr but schedules in source "
+                         "order when possible",
+                         createSourceListDAGScheduler);
+
+static RegisterScheduler
+  hybridListDAGScheduler("list-hybrid",
+                         "Bottom-up register pressure aware list scheduling "
+                         "which tries to balance latency and register pressure",
+                         createHybridListDAGScheduler);
+
+static RegisterScheduler
+  ILPListDAGScheduler("list-ilp",
+                      "Bottom-up register pressure aware list scheduling "
+                      "which tries to balance ILP and register pressure",
+                      createILPListDAGScheduler);
+
+static cl::opt<bool> DisableSchedCycles(
+  "disable-sched-cycles", cl::Hidden, cl::init(false),
+  cl::desc("Disable cycle-level precision during preRA scheduling"));
+
+// Temporary sched=list-ilp flags until the heuristics are robust.
+// Some options are also available under sched=list-hybrid.
+static cl::opt<bool> DisableSchedRegPressure(
+  "disable-sched-reg-pressure", cl::Hidden, cl::init(false),
+  cl::desc("Disable regpressure priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedLiveUses(
+  "disable-sched-live-uses", cl::Hidden, cl::init(true),
+  cl::desc("Disable live use priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedVRegCycle(
+  "disable-sched-vrcycle", cl::Hidden, cl::init(false),
+  cl::desc("Disable virtual register cycle interference checks"));
+static cl::opt<bool> DisableSchedPhysRegJoin(
+  "disable-sched-physreg-join", cl::Hidden, cl::init(false),
+  cl::desc("Disable physreg def-use affinity"));
+static cl::opt<bool> DisableSchedStalls(
+  "disable-sched-stalls", cl::Hidden, cl::init(true),
+  cl::desc("Disable no-stall priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedCriticalPath(
+  "disable-sched-critical-path", cl::Hidden, cl::init(false),
+  cl::desc("Disable critical path priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedHeight(
+  "disable-sched-height", cl::Hidden, cl::init(false),
+  cl::desc("Disable scheduled-height priority in sched=list-ilp"));
+static cl::opt<bool> Disable2AddrHack(
+  "disable-2addr-hack", cl::Hidden, cl::init(true),
+  cl::desc("Disable scheduler's two-address hack"));
+
+static cl::opt<int> MaxReorderWindow(
+  "max-sched-reorder", cl::Hidden, cl::init(6),
+  cl::desc("Number of instructions to allow ahead of the critical path "
+           "in sched=list-ilp"));
+
+static cl::opt<unsigned> AvgIPC(
+  "sched-avg-ipc", cl::Hidden, cl::init(1),
+  cl::desc("Average inst/cycle whan no target itinerary exists."));
+
+namespace {
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGRRList - The actual register reduction list scheduler
+/// implementation.  This supports both top-down and bottom-up scheduling.
+///
+class ScheduleDAGRRList : public ScheduleDAGSDNodes {
+private:
+  /// NeedLatency - True if the scheduler will make use of latency information.
+  ///
+  bool NeedLatency;
+
+  /// AvailableQueue - The priority queue to use for the available SUnits.
+  SchedulingPriorityQueue *AvailableQueue;
+
+  /// PendingQueue - This contains all of the instructions whose operands have
+  /// been issued, but their results are not ready yet (due to the latency of
+  /// the operation).  Once the operands becomes available, the instruction is
+  /// added to the AvailableQueue.
+  std::vector<SUnit*> PendingQueue;
+
+  /// HazardRec - The hazard recognizer to use.
+  ScheduleHazardRecognizer *HazardRec;
+
+  /// CurCycle - The current scheduler state corresponds to this cycle.
+  unsigned CurCycle;
+
+  /// MinAvailableCycle - Cycle of the soonest available instruction.
+  unsigned MinAvailableCycle;
+
+  /// IssueCount - Count instructions issued in this cycle
+  /// Currently valid only for bottom-up scheduling.
+  unsigned IssueCount;
+
+  /// LiveRegDefs - A set of physical registers and their definition
+  /// that are "live". These nodes must be scheduled before any other nodes that
+  /// modifies the registers can be scheduled.
+  unsigned NumLiveRegs;
+  std::vector<SUnit*> LiveRegDefs;
+  std::vector<SUnit*> LiveRegGens;
+
+  // Collect interferences between physical register use/defs.
+  // Each interference is an SUnit and set of physical registers.
+  SmallVector<SUnit*, 4> Interferences;
+  typedef DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMapT;
+  LRegsMapT LRegsMap;
+
+  /// Topo - A topological ordering for SUnits which permits fast IsReachable
+  /// and similar queries.
+  ScheduleDAGTopologicalSort Topo;
+
+  // Hack to keep track of the inverse of FindCallSeqStart without more crazy
+  // DAG crawling.
+  DenseMap<SUnit*, SUnit*> CallSeqEndForStart;
+
+public:
+  ScheduleDAGRRList(MachineFunction &mf, bool needlatency,
+                    SchedulingPriorityQueue *availqueue,
+                    CodeGenOpt::Level OptLevel)
+    : ScheduleDAGSDNodes(mf),
+      NeedLatency(needlatency), AvailableQueue(availqueue), CurCycle(0),
+      Topo(SUnits, NULL) {
+
+    const TargetMachine &tm = mf.getTarget();
+    if (DisableSchedCycles || !NeedLatency)
+      HazardRec = new ScheduleHazardRecognizer();
+    else
+      HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(&tm, this);
+  }
+
+  ~ScheduleDAGRRList() {
+    delete HazardRec;
+    delete AvailableQueue;
+  }
+
+  void Schedule();
+
+  ScheduleHazardRecognizer *getHazardRec() { return HazardRec; }
+
+  /// IsReachable - Checks if SU is reachable from TargetSU.
+  bool IsReachable(const SUnit *SU, const SUnit *TargetSU) {
+    return Topo.IsReachable(SU, TargetSU);
+  }
+
+  /// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will
+  /// create a cycle.
+  bool WillCreateCycle(SUnit *SU, SUnit *TargetSU) {
+    return Topo.WillCreateCycle(SU, TargetSU);
+  }
+
+  /// AddPred - adds a predecessor edge to SUnit SU.
+  /// This returns true if this is a new predecessor.
+  /// Updates the topological ordering if required.
+  void AddPred(SUnit *SU, const SDep &D) {
+    Topo.AddPred(SU, D.getSUnit());
+    SU->addPred(D);
+  }
+
+  /// RemovePred - removes a predecessor edge from SUnit SU.
+  /// This returns true if an edge was removed.
+  /// Updates the topological ordering if required.
+  void RemovePred(SUnit *SU, const SDep &D) {
+    Topo.RemovePred(SU, D.getSUnit());
+    SU->removePred(D);
+  }
+
+private:
+  bool isReady(SUnit *SU) {
+    return DisableSchedCycles || !AvailableQueue->hasReadyFilter() ||
+      AvailableQueue->isReady(SU);
+  }
+
+  void ReleasePred(SUnit *SU, const SDep *PredEdge);
+  void ReleasePredecessors(SUnit *SU);
+  void ReleasePending();
+  void AdvanceToCycle(unsigned NextCycle);
+  void AdvancePastStalls(SUnit *SU);
+  void EmitNode(SUnit *SU);
+  void ScheduleNodeBottomUp(SUnit*);
+  void CapturePred(SDep *PredEdge);
+  void UnscheduleNodeBottomUp(SUnit*);
+  void RestoreHazardCheckerBottomUp();
+  void BacktrackBottomUp(SUnit*, SUnit*);
+  SUnit *CopyAndMoveSuccessors(SUnit*);
+  void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+                                const TargetRegisterClass*,
+                                const TargetRegisterClass*,
+                                SmallVector<SUnit*, 2>&);
+  bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&);
+
+  void releaseInterferences(unsigned Reg = 0);
+
+  SUnit *PickNodeToScheduleBottomUp();
+  void ListScheduleBottomUp();
+
+  /// CreateNewSUnit - Creates a new SUnit and returns a pointer to it.
+  /// Updates the topological ordering if required.
+  SUnit *CreateNewSUnit(SDNode *N) {
+    unsigned NumSUnits = SUnits.size();
+    SUnit *NewNode = newSUnit(N);
+    // Update the topological ordering.
+    if (NewNode->NodeNum >= NumSUnits)
+      Topo.InitDAGTopologicalSorting();
+    return NewNode;
+  }
+
+  /// CreateClone - Creates a new SUnit from an existing one.
+  /// Updates the topological ordering if required.
+  SUnit *CreateClone(SUnit *N) {
+    unsigned NumSUnits = SUnits.size();
+    SUnit *NewNode = Clone(N);
+    // Update the topological ordering.
+    if (NewNode->NodeNum >= NumSUnits)
+      Topo.InitDAGTopologicalSorting();
+    return NewNode;
+  }
+
+  /// forceUnitLatencies - Register-pressure-reducing scheduling doesn't
+  /// need actual latency information but the hybrid scheduler does.
+  bool forceUnitLatencies() const {
+    return !NeedLatency;
+  }
+};
+}  // end anonymous namespace
+
+/// GetCostForDef - Looks up the register class and cost for a given definition.
+/// Typically this just means looking up the representative register class,
+/// but for untyped values (MVT::Untyped) it means inspecting the node's
+/// opcode to determine what register class is being generated.
+static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
+                          const TargetLowering *TLI,
+                          const TargetInstrInfo *TII,
+                          const TargetRegisterInfo *TRI,
+                          unsigned &RegClass, unsigned &Cost,
+                          const MachineFunction &MF) {
+  MVT VT = RegDefPos.GetValue();
+
+  // Special handling for untyped values.  These values can only come from
+  // the expansion of custom DAG-to-DAG patterns.
+  if (VT == MVT::Untyped) {
+    const SDNode *Node = RegDefPos.GetNode();
+
+    // Special handling for CopyFromReg of untyped values.
+    if (!Node->isMachineOpcode() && Node->getOpcode() == ISD::CopyFromReg) {
+      unsigned Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+      const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg);
+      RegClass = RC->getID();
+      Cost = 1;
+      return;
+    }
+
+    unsigned Opcode = Node->getMachineOpcode();
+    if (Opcode == TargetOpcode::REG_SEQUENCE) {
+      unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+      const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
+      RegClass = RC->getID();
+      Cost = 1;
+      return;
+    }
+
+    unsigned Idx = RegDefPos.GetIdx();
+    const MCInstrDesc Desc = TII->get(Opcode);
+    const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI, MF);
+    RegClass = RC->getID();
+    // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a
+    // better way to determine it.
+    Cost = 1;
+  } else {
+    RegClass = TLI->getRepRegClassFor(VT)->getID();
+    Cost = TLI->getRepRegClassCostFor(VT);
+  }
+}
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGRRList::Schedule() {
+  DEBUG(dbgs()
+        << "********** List Scheduling BB#" << BB->getNumber()
+        << " '" << BB->getName() << "' **********\n");
+
+  CurCycle = 0;
+  IssueCount = 0;
+  MinAvailableCycle = DisableSchedCycles ? 0 : UINT_MAX;
+  NumLiveRegs = 0;
+  // Allocate slots for each physical register, plus one for a special register
+  // to track the virtual resource of a calling sequence.
+  LiveRegDefs.resize(TRI->getNumRegs() + 1, NULL);
+  LiveRegGens.resize(TRI->getNumRegs() + 1, NULL);
+  CallSeqEndForStart.clear();
+  assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences");
+
+  // Build the scheduling graph.
+  BuildSchedGraph(NULL);
+
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+  Topo.InitDAGTopologicalSorting();
+
+  AvailableQueue->initNodes(SUnits);
+
+  HazardRec->Reset();
+
+  // Execute the actual scheduling loop.
+  ListScheduleBottomUp();
+
+  AvailableQueue->releaseState();
+
+  DEBUG({
+      dbgs() << "*** Final schedule ***\n";
+      dumpSchedule();
+      dbgs() << '\n';
+    });
+}
+
+//===----------------------------------------------------------------------===//
+//  Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGRRList::ReleasePred(SUnit *SU, const SDep *PredEdge) {
+  SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+  if (PredSU->NumSuccsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    PredSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  --PredSU->NumSuccsLeft;
+
+  if (!forceUnitLatencies()) {
+    // Updating predecessor's height. This is now the cycle when the
+    // predecessor can be scheduled without causing a pipeline stall.
+    PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge->getLatency());
+  }
+
+  // If all the node's successors are scheduled, this node is ready
+  // to be scheduled. Ignore the special EntrySU node.
+  if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+    PredSU->isAvailable = true;
+
+    unsigned Height = PredSU->getHeight();
+    if (Height < MinAvailableCycle)
+      MinAvailableCycle = Height;
+
+    if (isReady(PredSU)) {
+      AvailableQueue->push(PredSU);
+    }
+    // CapturePred and others may have left the node in the pending queue, avoid
+    // adding it twice.
+    else if (!PredSU->isPending) {
+      PredSU->isPending = true;
+      PendingQueue.push_back(PredSU);
+    }
+  }
+}
+
+/// IsChainDependent - Test if Outer is reachable from Inner through
+/// chain dependencies.
+static bool IsChainDependent(SDNode *Outer, SDNode *Inner,
+                             unsigned NestLevel,
+                             const TargetInstrInfo *TII) {
+  SDNode *N = Outer;
+  for (;;) {
+    if (N == Inner)
+      return true;
+    // For a TokenFactor, examine each operand. There may be multiple ways
+    // to get to the CALLSEQ_BEGIN, but we need to find the path with the
+    // most nesting in order to ensure that we find the corresponding match.
+    if (N->getOpcode() == ISD::TokenFactor) {
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        if (IsChainDependent(N->getOperand(i).getNode(), Inner, NestLevel, TII))
+          return true;
+      return false;
+    }
+    // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
+    if (N->isMachineOpcode()) {
+      if (N->getMachineOpcode() ==
+          (unsigned)TII->getCallFrameDestroyOpcode()) {
+        ++NestLevel;
+      } else if (N->getMachineOpcode() ==
+                 (unsigned)TII->getCallFrameSetupOpcode()) {
+        if (NestLevel == 0)
+          return false;
+        --NestLevel;
+      }
+    }
+    // Otherwise, find the chain and continue climbing.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      if (N->getOperand(i).getValueType() == MVT::Other) {
+        N = N->getOperand(i).getNode();
+        goto found_chain_operand;
+      }
+    return false;
+  found_chain_operand:;
+    if (N->getOpcode() == ISD::EntryToken)
+      return false;
+  }
+}
+
+/// FindCallSeqStart - Starting from the (lowered) CALLSEQ_END node, locate
+/// the corresponding (lowered) CALLSEQ_BEGIN node.
+///
+/// NestLevel and MaxNested are used in recursion to indcate the current level
+/// of nesting of CALLSEQ_BEGIN and CALLSEQ_END pairs, as well as the maximum
+/// level seen so far.
+///
+/// TODO: It would be better to give CALLSEQ_END an explicit operand to point
+/// to the corresponding CALLSEQ_BEGIN to avoid needing to search for it.
+static SDNode *
+FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,
+                 const TargetInstrInfo *TII) {
+  for (;;) {
+    // For a TokenFactor, examine each operand. There may be multiple ways
+    // to get to the CALLSEQ_BEGIN, but we need to find the path with the
+    // most nesting in order to ensure that we find the corresponding match.
+    if (N->getOpcode() == ISD::TokenFactor) {
+      SDNode *Best = 0;
+      unsigned BestMaxNest = MaxNest;
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+        unsigned MyNestLevel = NestLevel;
+        unsigned MyMaxNest = MaxNest;
+        if (SDNode *New = FindCallSeqStart(N->getOperand(i).getNode(),
+                                           MyNestLevel, MyMaxNest, TII))
+          if (!Best || (MyMaxNest > BestMaxNest)) {
+            Best = New;
+            BestMaxNest = MyMaxNest;
+          }
+      }
+      assert(Best);
+      MaxNest = BestMaxNest;
+      return Best;
+    }
+    // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
+    if (N->isMachineOpcode()) {
+      if (N->getMachineOpcode() ==
+          (unsigned)TII->getCallFrameDestroyOpcode()) {
+        ++NestLevel;
+        MaxNest = std::max(MaxNest, NestLevel);
+      } else if (N->getMachineOpcode() ==
+                 (unsigned)TII->getCallFrameSetupOpcode()) {
+        assert(NestLevel != 0);
+        --NestLevel;
+        if (NestLevel == 0)
+          return N;
+      }
+    }
+    // Otherwise, find the chain and continue climbing.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      if (N->getOperand(i).getValueType() == MVT::Other) {
+        N = N->getOperand(i).getNode();
+        goto found_chain_operand;
+      }
+    return 0;
+  found_chain_operand:;
+    if (N->getOpcode() == ISD::EntryToken)
+      return 0;
+  }
+}
+
+/// Call ReleasePred for each predecessor, then update register live def/gen.
+/// Always update LiveRegDefs for a register dependence even if the current SU
+/// also defines the register. This effectively create one large live range
+/// across a sequence of two-address node. This is important because the
+/// entire chain must be scheduled together. Example:
+///
+/// flags = (3) add
+/// flags = (2) addc flags
+/// flags = (1) addc flags
+///
+/// results in
+///
+/// LiveRegDefs[flags] = 3
+/// LiveRegGens[flags] = 1
+///
+/// If (2) addc is unscheduled, then (1) addc must also be unscheduled to avoid
+/// interference on flags.
+void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
+  // Bottom up: release predecessors
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    ReleasePred(SU, &*I);
+    if (I->isAssignedRegDep()) {
+      // This is a physical register dependency and it's impossible or
+      // expensive to copy the register. Make sure nothing that can
+      // clobber the register is scheduled between the predecessor and
+      // this node.
+      SUnit *RegDef = LiveRegDefs[I->getReg()]; (void)RegDef;
+      assert((!RegDef || RegDef == SU || RegDef == I->getSUnit()) &&
+             "interference on register dependence");
+      LiveRegDefs[I->getReg()] = I->getSUnit();
+      if (!LiveRegGens[I->getReg()]) {
+        ++NumLiveRegs;
+        LiveRegGens[I->getReg()] = SU;
+      }
+    }
+  }
+
+  // If we're scheduling a lowered CALLSEQ_END, find the corresponding
+  // CALLSEQ_BEGIN. Inject an artificial physical register dependence between
+  // these nodes, to prevent other calls from being interscheduled with them.
+  unsigned CallResource = TRI->getNumRegs();
+  if (!LiveRegDefs[CallResource])
+    for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode())
+      if (Node->isMachineOpcode() &&
+          Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+        unsigned NestLevel = 0;
+        unsigned MaxNest = 0;
+        SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII);
+
+        SUnit *Def = &SUnits[N->getNodeId()];
+        CallSeqEndForStart[Def] = SU;
+
+        ++NumLiveRegs;
+        LiveRegDefs[CallResource] = Def;
+        LiveRegGens[CallResource] = SU;
+        break;
+      }
+}
+
+/// Check to see if any of the pending instructions are ready to issue.  If
+/// so, add them to the available queue.
+void ScheduleDAGRRList::ReleasePending() {
+  if (DisableSchedCycles) {
+    assert(PendingQueue.empty() && "pending instrs not allowed in this mode");
+    return;
+  }
+
+  // If the available queue is empty, it is safe to reset MinAvailableCycle.
+  if (AvailableQueue->empty())
+    MinAvailableCycle = UINT_MAX;
+
+  // Check to see if any of the pending instructions are ready to issue.  If
+  // so, add them to the available queue.
+  for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+    unsigned ReadyCycle = PendingQueue[i]->getHeight();
+    if (ReadyCycle < MinAvailableCycle)
+      MinAvailableCycle = ReadyCycle;
+
+    if (PendingQueue[i]->isAvailable) {
+      if (!isReady(PendingQueue[i]))
+          continue;
+      AvailableQueue->push(PendingQueue[i]);
+    }
+    PendingQueue[i]->isPending = false;
+    PendingQueue[i] = PendingQueue.back();
+    PendingQueue.pop_back();
+    --i; --e;
+  }
+}
+
+/// Move the scheduler state forward by the specified number of Cycles.
+void ScheduleDAGRRList::AdvanceToCycle(unsigned NextCycle) {
+  if (NextCycle <= CurCycle)
+    return;
+
+  IssueCount = 0;
+  AvailableQueue->setCurCycle(NextCycle);
+  if (!HazardRec->isEnabled()) {
+    // Bypass lots of virtual calls in case of long latency.
+    CurCycle = NextCycle;
+  }
+  else {
+    for (; CurCycle != NextCycle; ++CurCycle) {
+      HazardRec->RecedeCycle();
+    }
+  }
+  // FIXME: Instead of visiting the pending Q each time, set a dirty flag on the
+  // available Q to release pending nodes at least once before popping.
+  ReleasePending();
+}
+
+/// Move the scheduler state forward until the specified node's dependents are
+/// ready and can be scheduled with no resource conflicts.
+void ScheduleDAGRRList::AdvancePastStalls(SUnit *SU) {
+  if (DisableSchedCycles)
+    return;
+
+  // FIXME: Nodes such as CopyFromReg probably should not advance the current
+  // cycle. Otherwise, we can wrongly mask real stalls. If the non-machine node
+  // has predecessors the cycle will be advanced when they are scheduled.
+  // But given the crude nature of modeling latency though such nodes, we
+  // currently need to treat these nodes like real instructions.
+  // if (!SU->getNode() || !SU->getNode()->isMachineOpcode()) return;
+
+  unsigned ReadyCycle = SU->getHeight();
+
+  // Bump CurCycle to account for latency. We assume the latency of other
+  // available instructions may be hidden by the stall (not a full pipe stall).
+  // This updates the hazard recognizer's cycle before reserving resources for
+  // this instruction.
+  AdvanceToCycle(ReadyCycle);
+
+  // Calls are scheduled in their preceding cycle, so don't conflict with
+  // hazards from instructions after the call. EmitNode will reset the
+  // scoreboard state before emitting the call.
+  if (SU->isCall)
+    return;
+
+  // FIXME: For resource conflicts in very long non-pipelined stages, we
+  // should probably skip ahead here to avoid useless scoreboard checks.
+  int Stalls = 0;
+  while (true) {
+    ScheduleHazardRecognizer::HazardType HT =
+      HazardRec->getHazardType(SU, -Stalls);
+
+    if (HT == ScheduleHazardRecognizer::NoHazard)
+      break;
+
+    ++Stalls;
+  }
+  AdvanceToCycle(CurCycle + Stalls);
+}
+
+/// Record this SUnit in the HazardRecognizer.
+/// Does not update CurCycle.
+void ScheduleDAGRRList::EmitNode(SUnit *SU) {
+  if (!HazardRec->isEnabled())
+    return;
+
+  // Check for phys reg copy.
+  if (!SU->getNode())
+    return;
+
+  switch (SU->getNode()->getOpcode()) {
+  default:
+    assert(SU->getNode()->isMachineOpcode() &&
+           "This target-independent node should not be scheduled.");
+    break;
+  case ISD::MERGE_VALUES:
+  case ISD::TokenFactor:
+  case ISD::LIFETIME_START:
+  case ISD::LIFETIME_END:
+  case ISD::CopyToReg:
+  case ISD::CopyFromReg:
+  case ISD::EH_LABEL:
+    // Noops don't affect the scoreboard state. Copies are likely to be
+    // removed.
+    return;
+  case ISD::INLINEASM:
+    // For inline asm, clear the pipeline state.
+    HazardRec->Reset();
+    return;
+  }
+  if (SU->isCall) {
+    // Calls are scheduled with their preceding instructions. For bottom-up
+    // scheduling, clear the pipeline state before emitting.
+    HazardRec->Reset();
+  }
+
+  HazardRec->EmitInstruction(SU);
+}
+
+static void resetVRegCycle(SUnit *SU);
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
+  DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(SU->dump(this));
+
+#ifndef NDEBUG
+  if (CurCycle < SU->getHeight())
+    DEBUG(dbgs() << "   Height [" << SU->getHeight()
+          << "] pipeline stall!\n");
+#endif
+
+  // FIXME: Do not modify node height. It may interfere with
+  // backtracking. Instead add a "ready cycle" to SUnit. Before scheduling the
+  // node its ready cycle can aid heuristics, and after scheduling it can
+  // indicate the scheduled cycle.
+  SU->setHeightToAtLeast(CurCycle);
+
+  // Reserve resources for the scheduled intruction.
+  EmitNode(SU);
+
+  Sequence.push_back(SU);
+
+  AvailableQueue->scheduledNode(SU);
+
+  // If HazardRec is disabled, and each inst counts as one cycle, then
+  // advance CurCycle before ReleasePredecessors to avoid useless pushes to
+  // PendingQueue for schedulers that implement HasReadyFilter.
+  if (!HazardRec->isEnabled() && AvgIPC < 2)
+    AdvanceToCycle(CurCycle + 1);
+
+  // Update liveness of predecessors before successors to avoid treating a
+  // two-address node as a live range def.
+  ReleasePredecessors(SU);
+
+  // Release all the implicit physical register defs that are live.
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    // LiveRegDegs[I->getReg()] != SU when SU is a two-address node.
+    if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] == SU) {
+      assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+      --NumLiveRegs;
+      LiveRegDefs[I->getReg()] = NULL;
+      LiveRegGens[I->getReg()] = NULL;
+      releaseInterferences(I->getReg());
+    }
+  }
+  // Release the special call resource dependence, if this is the beginning
+  // of a call.
+  unsigned CallResource = TRI->getNumRegs();
+  if (LiveRegDefs[CallResource] == SU)
+    for (const SDNode *SUNode = SU->getNode(); SUNode;
+         SUNode = SUNode->getGluedNode()) {
+      if (SUNode->isMachineOpcode() &&
+          SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+        assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+        --NumLiveRegs;
+        LiveRegDefs[CallResource] = NULL;
+        LiveRegGens[CallResource] = NULL;
+        releaseInterferences(CallResource);
+      }
+    }
+
+  resetVRegCycle(SU);
+
+  SU->isScheduled = true;
+
+  // Conditions under which the scheduler should eagerly advance the cycle:
+  // (1) No available instructions
+  // (2) All pipelines full, so available instructions must have hazards.
+  //
+  // If HazardRec is disabled, the cycle was pre-advanced before calling
+  // ReleasePredecessors. In that case, IssueCount should remain 0.
+  //
+  // Check AvailableQueue after ReleasePredecessors in case of zero latency.
+  if (HazardRec->isEnabled() || AvgIPC > 1) {
+    if (SU->getNode() && SU->getNode()->isMachineOpcode())
+      ++IssueCount;
+    if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
+        || (!HazardRec->isEnabled() && IssueCount == AvgIPC))
+      AdvanceToCycle(CurCycle + 1);
+  }
+}
+
+/// CapturePred - This does the opposite of ReleasePred. Since SU is being
+/// unscheduled, incrcease the succ left count of its predecessors. Remove
+/// them from AvailableQueue if necessary.
+void ScheduleDAGRRList::CapturePred(SDep *PredEdge) {
+  SUnit *PredSU = PredEdge->getSUnit();
+  if (PredSU->isAvailable) {
+    PredSU->isAvailable = false;
+    if (!PredSU->isPending)
+      AvailableQueue->remove(PredSU);
+  }
+
+  assert(PredSU->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!");
+  ++PredSU->NumSuccsLeft;
+}
+
+/// UnscheduleNodeBottomUp - Remove the node from the schedule, update its and
+/// its predecessor states to reflect the change.
+void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
+  DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: ");
+  DEBUG(SU->dump(this));
+
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    CapturePred(&*I);
+    if (I->isAssignedRegDep() && SU == LiveRegGens[I->getReg()]){
+      assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+      assert(LiveRegDefs[I->getReg()] == I->getSUnit() &&
+             "Physical register dependency violated?");
+      --NumLiveRegs;
+      LiveRegDefs[I->getReg()] = NULL;
+      LiveRegGens[I->getReg()] = NULL;
+      releaseInterferences(I->getReg());
+    }
+  }
+
+  // Reclaim the special call resource dependence, if this is the beginning
+  // of a call.
+  unsigned CallResource = TRI->getNumRegs();
+  for (const SDNode *SUNode = SU->getNode(); SUNode;
+       SUNode = SUNode->getGluedNode()) {
+    if (SUNode->isMachineOpcode() &&
+        SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+      ++NumLiveRegs;
+      LiveRegDefs[CallResource] = SU;
+      LiveRegGens[CallResource] = CallSeqEndForStart[SU];
+    }
+  }
+
+  // Release the special call resource dependence, if this is the end
+  // of a call.
+  if (LiveRegGens[CallResource] == SU)
+    for (const SDNode *SUNode = SU->getNode(); SUNode;
+         SUNode = SUNode->getGluedNode()) {
+      if (SUNode->isMachineOpcode() &&
+          SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+        assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+        --NumLiveRegs;
+        LiveRegDefs[CallResource] = NULL;
+        LiveRegGens[CallResource] = NULL;
+        releaseInterferences(CallResource);
+      }
+    }
+
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isAssignedRegDep()) {
+      if (!LiveRegDefs[I->getReg()])
+        ++NumLiveRegs;
+      // This becomes the nearest def. Note that an earlier def may still be
+      // pending if this is a two-address node.
+      LiveRegDefs[I->getReg()] = SU;
+      if (LiveRegGens[I->getReg()] == NULL ||
+          I->getSUnit()->getHeight() < LiveRegGens[I->getReg()]->getHeight())
+        LiveRegGens[I->getReg()] = I->getSUnit();
+    }
+  }
+  if (SU->getHeight() < MinAvailableCycle)
+    MinAvailableCycle = SU->getHeight();
+
+  SU->setHeightDirty();
+  SU->isScheduled = false;
+  SU->isAvailable = true;
+  if (!DisableSchedCycles && AvailableQueue->hasReadyFilter()) {
+    // Don't make available until backtracking is complete.
+    SU->isPending = true;
+    PendingQueue.push_back(SU);
+  }
+  else {
+    AvailableQueue->push(SU);
+  }
+  AvailableQueue->unscheduledNode(SU);
+}
+
+/// After backtracking, the hazard checker needs to be restored to a state
+/// corresponding the current cycle.
+void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() {
+  HazardRec->Reset();
+
+  unsigned LookAhead = std::min((unsigned)Sequence.size(),
+                                HazardRec->getMaxLookAhead());
+  if (LookAhead == 0)
+    return;
+
+  std::vector<SUnit*>::const_iterator I = (Sequence.end() - LookAhead);
+  unsigned HazardCycle = (*I)->getHeight();
+  for (std::vector<SUnit*>::const_iterator E = Sequence.end(); I != E; ++I) {
+    SUnit *SU = *I;
+    for (; SU->getHeight() > HazardCycle; ++HazardCycle) {
+      HazardRec->RecedeCycle();
+    }
+    EmitNode(SU);
+  }
+}
+
+/// BacktrackBottomUp - Backtrack scheduling to a previous cycle specified in
+/// BTCycle in order to schedule a specific node.
+void ScheduleDAGRRList::BacktrackBottomUp(SUnit *SU, SUnit *BtSU) {
+  SUnit *OldSU = Sequence.back();
+  while (true) {
+    Sequence.pop_back();
+    // FIXME: use ready cycle instead of height
+    CurCycle = OldSU->getHeight();
+    UnscheduleNodeBottomUp(OldSU);
+    AvailableQueue->setCurCycle(CurCycle);
+    if (OldSU == BtSU)
+      break;
+    OldSU = Sequence.back();
+  }
+
+  assert(!SU->isSucc(OldSU) && "Something is wrong!");
+
+  RestoreHazardCheckerBottomUp();
+
+  ReleasePending();
+
+  ++NumBacktracks;
+}
+
+static bool isOperandOf(const SUnit *SU, SDNode *N) {
+  for (const SDNode *SUNode = SU->getNode(); SUNode;
+       SUNode = SUNode->getGluedNode()) {
+    if (SUNode->isOperandOf(N))
+      return true;
+  }
+  return false;
+}
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
+SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
+  SDNode *N = SU->getNode();
+  if (!N)
+    return NULL;
+
+  if (SU->getNode()->getGluedNode())
+    return NULL;
+
+  SUnit *NewSU;
+  bool TryUnfold = false;
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT == MVT::Glue)
+      return NULL;
+    else if (VT == MVT::Other)
+      TryUnfold = true;
+  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    const SDValue &Op = N->getOperand(i);
+    EVT VT = Op.getNode()->getValueType(Op.getResNo());
+    if (VT == MVT::Glue)
+      return NULL;
+  }
+
+  if (TryUnfold) {
+    SmallVector<SDNode*, 2> NewNodes;
+    if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+      return NULL;
+
+    // unfolding an x86 DEC64m operation results in store, dec, load which
+    // can't be handled here so quit
+    if (NewNodes.size() == 3)
+      return NULL;
+
+    DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
+    assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+    N = NewNodes[1];
+    SDNode *LoadNode = NewNodes[0];
+    unsigned NumVals = N->getNumValues();
+    unsigned OldNumVals = SU->getNode()->getNumValues();
+    for (unsigned i = 0; i != NumVals; ++i)
+      DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+    DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
+                                   SDValue(LoadNode, 1));
+
+    // LoadNode may already exist. This can happen when there is another
+    // load from the same location and producing the same type of value
+    // but it has different alignment or volatileness.
+    bool isNewLoad = true;
+    SUnit *LoadSU;
+    if (LoadNode->getNodeId() != -1) {
+      LoadSU = &SUnits[LoadNode->getNodeId()];
+      isNewLoad = false;
+    } else {
+      LoadSU = CreateNewSUnit(LoadNode);
+      LoadNode->setNodeId(LoadSU->NodeNum);
+
+      InitNumRegDefsLeft(LoadSU);
+      computeLatency(LoadSU);
+    }
+
+    SUnit *NewSU = CreateNewSUnit(N);
+    assert(N->getNodeId() == -1 && "Node already inserted!");
+    N->setNodeId(NewSU->NodeNum);
+
+    const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+    for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+      if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+        NewSU->isTwoAddress = true;
+        break;
+      }
+    }
+    if (MCID.isCommutable())
+      NewSU->isCommutable = true;
+
+    InitNumRegDefsLeft(NewSU);
+    computeLatency(NewSU);
+
+    // Record all the edges to and from the old SU, by category.
+    SmallVector<SDep, 4> ChainPreds;
+    SmallVector<SDep, 4> ChainSuccs;
+    SmallVector<SDep, 4> LoadPreds;
+    SmallVector<SDep, 4> NodePreds;
+    SmallVector<SDep, 4> NodeSuccs;
+    for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I) {
+      if (I->isCtrl())
+        ChainPreds.push_back(*I);
+      else if (isOperandOf(I->getSUnit(), LoadNode))
+        LoadPreds.push_back(*I);
+      else
+        NodePreds.push_back(*I);
+    }
+    for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+         I != E; ++I) {
+      if (I->isCtrl())
+        ChainSuccs.push_back(*I);
+      else
+        NodeSuccs.push_back(*I);
+    }
+
+    // Now assign edges to the newly-created nodes.
+    for (unsigned i = 0, e = ChainPreds.size(); i != e; ++i) {
+      const SDep &Pred = ChainPreds[i];
+      RemovePred(SU, Pred);
+      if (isNewLoad)
+        AddPred(LoadSU, Pred);
+    }
+    for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
+      const SDep &Pred = LoadPreds[i];
+      RemovePred(SU, Pred);
+      if (isNewLoad)
+        AddPred(LoadSU, Pred);
+    }
+    for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
+      const SDep &Pred = NodePreds[i];
+      RemovePred(SU, Pred);
+      AddPred(NewSU, Pred);
+    }
+    for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
+      SDep D = NodeSuccs[i];
+      SUnit *SuccDep = D.getSUnit();
+      D.setSUnit(SU);
+      RemovePred(SuccDep, D);
+      D.setSUnit(NewSU);
+      AddPred(SuccDep, D);
+      // Balance register pressure.
+      if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled
+          && !D.isCtrl() && NewSU->NumRegDefsLeft > 0)
+        --NewSU->NumRegDefsLeft;
+    }
+    for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
+      SDep D = ChainSuccs[i];
+      SUnit *SuccDep = D.getSUnit();
+      D.setSUnit(SU);
+      RemovePred(SuccDep, D);
+      if (isNewLoad) {
+        D.setSUnit(LoadSU);
+        AddPred(SuccDep, D);
+      }
+    }
+
+    // Add a data dependency to reflect that NewSU reads the value defined
+    // by LoadSU.
+    SDep D(LoadSU, SDep::Data, 0);
+    D.setLatency(LoadSU->Latency);
+    AddPred(NewSU, D);
+
+    if (isNewLoad)
+      AvailableQueue->addNode(LoadSU);
+    AvailableQueue->addNode(NewSU);
+
+    ++NumUnfolds;
+
+    if (NewSU->NumSuccsLeft == 0) {
+      NewSU->isAvailable = true;
+      return NewSU;
+    }
+    SU = NewSU;
+  }
+
+  DEBUG(dbgs() << "    Duplicating SU #" << SU->NodeNum << "\n");
+  NewSU = CreateClone(SU);
+
+  // New SUnit has the exact same predecessors.
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I)
+    if (!I->isArtificial())
+      AddPred(NewSU, *I);
+
+  // Only copy scheduled successors. Cut them from old node's successor
+  // list and move them over.
+  SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isArtificial())
+      continue;
+    SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->isScheduled) {
+      SDep D = *I;
+      D.setSUnit(NewSU);
+      AddPred(SuccSU, D);
+      D.setSUnit(SU);
+      DelDeps.push_back(std::make_pair(SuccSU, D));
+    }
+  }
+  for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+    RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+  AvailableQueue->updateNode(SU);
+  AvailableQueue->addNode(NewSU);
+
+  ++NumDups;
+  return NewSU;
+}
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+                                               const TargetRegisterClass *DestRC,
+                                               const TargetRegisterClass *SrcRC,
+                                               SmallVector<SUnit*, 2> &Copies) {
+  SUnit *CopyFromSU = CreateNewSUnit(NULL);
+  CopyFromSU->CopySrcRC = SrcRC;
+  CopyFromSU->CopyDstRC = DestRC;
+
+  SUnit *CopyToSU = CreateNewSUnit(NULL);
+  CopyToSU->CopySrcRC = DestRC;
+  CopyToSU->CopyDstRC = SrcRC;
+
+  // Only copy scheduled successors. Cut them from old node's successor
+  // list and move them over.
+  SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isArtificial())
+      continue;
+    SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->isScheduled) {
+      SDep D = *I;
+      D.setSUnit(CopyToSU);
+      AddPred(SuccSU, D);
+      DelDeps.push_back(std::make_pair(SuccSU, *I));
+    }
+    else {
+      // Avoid scheduling the def-side copy before other successors. Otherwise
+      // we could introduce another physreg interference on the copy and
+      // continue inserting copies indefinitely.
+      AddPred(SuccSU, SDep(CopyFromSU, SDep::Artificial));
+    }
+  }
+  for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+    RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+  SDep FromDep(SU, SDep::Data, Reg);
+  FromDep.setLatency(SU->Latency);
+  AddPred(CopyFromSU, FromDep);
+  SDep ToDep(CopyFromSU, SDep::Data, 0);
+  ToDep.setLatency(CopyFromSU->Latency);
+  AddPred(CopyToSU, ToDep);
+
+  AvailableQueue->updateNode(SU);
+  AvailableQueue->addNode(CopyFromSU);
+  AvailableQueue->addNode(CopyToSU);
+  Copies.push_back(CopyFromSU);
+  Copies.push_back(CopyToSU);
+
+  ++NumPRCopies;
+}
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+                                 const TargetInstrInfo *TII) {
+  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+  assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+  unsigned NumRes = MCID.getNumDefs();
+  for (const uint16_t *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+    if (Reg == *ImpDef)
+      break;
+    ++NumRes;
+  }
+  return N->getValueType(NumRes);
+}
+
+/// CheckForLiveRegDef - Return true and update live register vector if the
+/// specified register def of the specified SUnit clobbers any "live" registers.
+static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
+                               std::vector<SUnit*> &LiveRegDefs,
+                               SmallSet<unsigned, 4> &RegAdded,
+                               SmallVector<unsigned, 4> &LRegs,
+                               const TargetRegisterInfo *TRI) {
+  for (MCRegAliasIterator AliasI(Reg, TRI, true); AliasI.isValid(); ++AliasI) {
+
+    // Check if Ref is live.
+    if (!LiveRegDefs[*AliasI]) continue;
+
+    // Allow multiple uses of the same def.
+    if (LiveRegDefs[*AliasI] == SU) continue;
+
+    // Add Reg to the set of interfering live regs.
+    if (RegAdded.insert(*AliasI)) {
+      LRegs.push_back(*AliasI);
+    }
+  }
+}
+
+/// CheckForLiveRegDefMasked - Check for any live physregs that are clobbered
+/// by RegMask, and add them to LRegs.
+static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask,
+                                     std::vector<SUnit*> &LiveRegDefs,
+                                     SmallSet<unsigned, 4> &RegAdded,
+                                     SmallVector<unsigned, 4> &LRegs) {
+  // Look at all live registers. Skip Reg0 and the special CallResource.
+  for (unsigned i = 1, e = LiveRegDefs.size()-1; i != e; ++i) {
+    if (!LiveRegDefs[i]) continue;
+    if (LiveRegDefs[i] == SU) continue;
+    if (!MachineOperand::clobbersPhysReg(RegMask, i)) continue;
+    if (RegAdded.insert(i))
+      LRegs.push_back(i);
+  }
+}
+
+/// getNodeRegMask - Returns the register mask attached to an SDNode, if any.
+static const uint32_t *getNodeRegMask(const SDNode *N) {
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    if (const RegisterMaskSDNode *Op =
+        dyn_cast<RegisterMaskSDNode>(N->getOperand(i).getNode()))
+      return Op->getRegMask();
+  return NULL;
+}
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specific node is the last one that's available to schedule, do
+/// whatever is necessary (i.e. backtracking or cloning) to make it possible.
+bool ScheduleDAGRRList::
+DelayForLiveRegsBottomUp(SUnit *SU, SmallVector<unsigned, 4> &LRegs) {
+  if (NumLiveRegs == 0)
+    return false;
+
+  SmallSet<unsigned, 4> RegAdded;
+  // If this node would clobber any "live" register, then it's not ready.
+  //
+  // If SU is the currently live definition of the same register that it uses,
+  // then we are free to schedule it.
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU)
+      CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs,
+                         RegAdded, LRegs, TRI);
+  }
+
+  for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) {
+    if (Node->getOpcode() == ISD::INLINEASM) {
+      // Inline asm can clobber physical defs.
+      unsigned NumOps = Node->getNumOperands();
+      if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+        --NumOps;  // Ignore the glue operand.
+
+      for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+        unsigned Flags =
+          cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+        unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+        ++i; // Skip the ID value.
+        if (InlineAsm::isRegDefKind(Flags) ||
+            InlineAsm::isRegDefEarlyClobberKind(Flags) ||
+            InlineAsm::isClobberKind(Flags)) {
+          // Check for def of register or earlyclobber register.
+          for (; NumVals; --NumVals, ++i) {
+            unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+            if (TargetRegisterInfo::isPhysicalRegister(Reg))
+              CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+          }
+        } else
+          i += NumVals;
+      }
+      continue;
+    }
+
+    if (!Node->isMachineOpcode())
+      continue;
+    // If we're in the middle of scheduling a call, don't begin scheduling
+    // another call. Also, don't allow any physical registers to be live across
+    // the call.
+    if (Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+      // Check the special calling-sequence resource.
+      unsigned CallResource = TRI->getNumRegs();
+      if (LiveRegDefs[CallResource]) {
+        SDNode *Gen = LiveRegGens[CallResource]->getNode();
+        while (SDNode *Glued = Gen->getGluedNode())
+          Gen = Glued;
+        if (!IsChainDependent(Gen, Node, 0, TII) && RegAdded.insert(CallResource))
+          LRegs.push_back(CallResource);
+      }
+    }
+    if (const uint32_t *RegMask = getNodeRegMask(Node))
+      CheckForLiveRegDefMasked(SU, RegMask, LiveRegDefs, RegAdded, LRegs);
+
+    const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
+    if (!MCID.ImplicitDefs)
+      continue;
+    for (const uint16_t *Reg = MCID.getImplicitDefs(); *Reg; ++Reg)
+      CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+  }
+
+  return !LRegs.empty();
+}
+
+void ScheduleDAGRRList::releaseInterferences(unsigned Reg) {
+  // Add the nodes that aren't ready back onto the available list.
+  for (unsigned i = Interferences.size(); i > 0; --i) {
+    SUnit *SU = Interferences[i-1];
+    LRegsMapT::iterator LRegsPos = LRegsMap.find(SU);
+    if (Reg) {
+      SmallVector<unsigned, 4> &LRegs = LRegsPos->second;
+      if (std::find(LRegs.begin(), LRegs.end(), Reg) == LRegs.end())
+        continue;
+    }
+    SU->isPending = false;
+    // The interfering node may no longer be available due to backtracking.
+    // Furthermore, it may have been made available again, in which case it is
+    // now already in the AvailableQueue.
+    if (SU->isAvailable && !SU->NodeQueueId) {
+      DEBUG(dbgs() << "    Repushing SU #" << SU->NodeNum << '\n');
+      AvailableQueue->push(SU);
+    }
+    if (i < Interferences.size())
+      Interferences[i-1] = Interferences.back();
+    Interferences.pop_back();
+    LRegsMap.erase(LRegsPos);
+  }
+}
+
+/// Return a node that can be scheduled in this cycle. Requirements:
+/// (1) Ready: latency has been satisfied
+/// (2) No Hazards: resources are available
+/// (3) No Interferences: may unschedule to break register interferences.
+SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
+  SUnit *CurSU = AvailableQueue->empty() ? 0 : AvailableQueue->pop();
+  while (CurSU) {
+    SmallVector<unsigned, 4> LRegs;
+    if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+      break;
+    DEBUG(dbgs() << "    Interfering reg " <<
+          (LRegs[0] == TRI->getNumRegs() ? "CallResource"
+           : TRI->getName(LRegs[0]))
+           << " SU #" << CurSU->NodeNum << '\n');
+    std::pair<LRegsMapT::iterator, bool> LRegsPair =
+      LRegsMap.insert(std::make_pair(CurSU, LRegs));
+    if (LRegsPair.second) {
+      CurSU->isPending = true;  // This SU is not in AvailableQueue right now.
+      Interferences.push_back(CurSU);
+    }
+    else {
+      assert(CurSU->isPending && "Intereferences are pending");
+      // Update the interference with current live regs.
+      LRegsPair.first->second = LRegs;
+    }
+    CurSU = AvailableQueue->pop();
+  }
+  if (CurSU)
+    return CurSU;
+
+  // All candidates are delayed due to live physical reg dependencies.
+  // Try backtracking, code duplication, or inserting cross class copies
+  // to resolve it.
+  for (unsigned i = 0, e = Interferences.size(); i != e; ++i) {
+    SUnit *TrySU = Interferences[i];
+    SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+
+    // Try unscheduling up to the point where it's safe to schedule
+    // this node.
+    SUnit *BtSU = NULL;
+    unsigned LiveCycle = UINT_MAX;
+    for (unsigned j = 0, ee = LRegs.size(); j != ee; ++j) {
+      unsigned Reg = LRegs[j];
+      if (LiveRegGens[Reg]->getHeight() < LiveCycle) {
+        BtSU = LiveRegGens[Reg];
+        LiveCycle = BtSU->getHeight();
+      }
+    }
+    if (!WillCreateCycle(TrySU, BtSU))  {
+      // BacktrackBottomUp mutates Interferences!
+      BacktrackBottomUp(TrySU, BtSU);
+
+      // Force the current node to be scheduled before the node that
+      // requires the physical reg dep.
+      if (BtSU->isAvailable) {
+        BtSU->isAvailable = false;
+        if (!BtSU->isPending)
+          AvailableQueue->remove(BtSU);
+      }
+      DEBUG(dbgs() << "ARTIFICIAL edge from SU(" << BtSU->NodeNum << ") to SU("
+            << TrySU->NodeNum << ")\n");
+      AddPred(TrySU, SDep(BtSU, SDep::Artificial));
+
+      // If one or more successors has been unscheduled, then the current
+      // node is no longer available.
+      if (!TrySU->isAvailable)
+        CurSU = AvailableQueue->pop();
+      else {
+        AvailableQueue->remove(TrySU);
+        CurSU = TrySU;
+      }
+      // Interferences has been mutated. We must break.
+      break;
+    }
+  }
+
+  if (!CurSU) {
+    // Can't backtrack. If it's too expensive to copy the value, then try
+    // duplicate the nodes that produces these "too expensive to copy"
+    // values to break the dependency. In case even that doesn't work,
+    // insert cross class copies.
+    // If it's not too expensive, i.e. cost != -1, issue copies.
+    SUnit *TrySU = Interferences[0];
+    SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+    assert(LRegs.size() == 1 && "Can't handle this yet!");
+    unsigned Reg = LRegs[0];
+    SUnit *LRDef = LiveRegDefs[Reg];
+    EVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+    const TargetRegisterClass *RC =
+      TRI->getMinimalPhysRegClass(Reg, VT);
+    const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+    // If cross copy register class is the same as RC, then it must be possible
+    // copy the value directly. Do not try duplicate the def.
+    // If cross copy register class is not the same as RC, then it's possible to
+    // copy the value but it require cross register class copies and it is
+    // expensive.
+    // If cross copy register class is null, then it's not possible to copy
+    // the value at all.
+    SUnit *NewDef = 0;
+    if (DestRC != RC) {
+      NewDef = CopyAndMoveSuccessors(LRDef);
+      if (!DestRC && !NewDef)
+        report_fatal_error("Can't handle live physical register dependency!");
+    }
+    if (!NewDef) {
+      // Issue copies, these can be expensive cross register class copies.
+      SmallVector<SUnit*, 2> Copies;
+      InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+      DEBUG(dbgs() << "    Adding an edge from SU #" << TrySU->NodeNum
+            << " to SU #" << Copies.front()->NodeNum << "\n");
+      AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
+      NewDef = Copies.back();
+    }
+
+    DEBUG(dbgs() << "    Adding an edge from SU #" << NewDef->NodeNum
+          << " to SU #" << TrySU->NodeNum << "\n");
+    LiveRegDefs[Reg] = NewDef;
+    AddPred(NewDef, SDep(TrySU, SDep::Artificial));
+    TrySU->isAvailable = false;
+    CurSU = NewDef;
+  }
+  assert(CurSU && "Unable to resolve live physical register dependencies!");
+  return CurSU;
+}
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGRRList::ListScheduleBottomUp() {
+  // Release any predecessors of the special Exit node.
+  ReleasePredecessors(&ExitSU);
+
+  // Add root to Available queue.
+  if (!SUnits.empty()) {
+    SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+    assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+    RootSU->isAvailable = true;
+    AvailableQueue->push(RootSU);
+  }
+
+  // While Available queue is not empty, grab the node with the highest
+  // priority. If it is not ready put it back.  Schedule the node.
+  Sequence.reserve(SUnits.size());
+  while (!AvailableQueue->empty() || !Interferences.empty()) {
+    DEBUG(dbgs() << "\nExamining Available:\n";
+          AvailableQueue->dump(this));
+
+    // Pick the best node to schedule taking all constraints into
+    // consideration.
+    SUnit *SU = PickNodeToScheduleBottomUp();
+
+    AdvancePastStalls(SU);
+
+    ScheduleNodeBottomUp(SU);
+
+    while (AvailableQueue->empty() && !PendingQueue.empty()) {
+      // Advance the cycle to free resources. Skip ahead to the next ready SU.
+      assert(MinAvailableCycle < UINT_MAX && "MinAvailableCycle uninitialized");
+      AdvanceToCycle(std::max(CurCycle + 1, MinAvailableCycle));
+    }
+  }
+
+  // Reverse the order if it is bottom up.
+  std::reverse(Sequence.begin(), Sequence.end());
+
+#ifndef NDEBUG
+  VerifyScheduledSequence(/*isBottomUp=*/true);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//                RegReductionPriorityQueue Definition
+//===----------------------------------------------------------------------===//
+//
+// This is a SchedulingPriorityQueue that schedules using Sethi Ullman numbers
+// to reduce register pressure.
+//
+namespace {
+class RegReductionPQBase;
+
+struct queue_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+  bool isReady(SUnit* SU, unsigned CurCycle) const { return true; }
+};
+
+#ifndef NDEBUG
+template<class SF>
+struct reverse_sort : public queue_sort {
+  SF &SortFunc;
+  reverse_sort(SF &sf) : SortFunc(sf) {}
+  reverse_sort(const reverse_sort &RHS) : SortFunc(RHS.SortFunc) {}
+
+  bool operator()(SUnit* left, SUnit* right) const {
+    // reverse left/right rather than simply !SortFunc(left, right)
+    // to expose different paths in the comparison logic.
+    return SortFunc(right, left);
+  }
+};
+#endif // NDEBUG
+
+/// bu_ls_rr_sort - Priority function for bottom up register pressure
+// reduction scheduler.
+struct bu_ls_rr_sort : public queue_sort {
+  enum {
+    IsBottomUp = true,
+    HasReadyFilter = false
+  };
+
+  RegReductionPQBase *SPQ;
+  bu_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}
+  bu_ls_rr_sort(const bu_ls_rr_sort &RHS) : SPQ(RHS.SPQ) {}
+
+  bool operator()(SUnit* left, SUnit* right) const;
+};
+
+// src_ls_rr_sort - Priority function for source order scheduler.
+struct src_ls_rr_sort : public queue_sort {
+  enum {
+    IsBottomUp = true,
+    HasReadyFilter = false
+  };
+
+  RegReductionPQBase *SPQ;
+  src_ls_rr_sort(RegReductionPQBase *spq)
+    : SPQ(spq) {}
+  src_ls_rr_sort(const src_ls_rr_sort &RHS)
+    : SPQ(RHS.SPQ) {}
+
+  bool operator()(SUnit* left, SUnit* right) const;
+};
+
+// hybrid_ls_rr_sort - Priority function for hybrid scheduler.
+struct hybrid_ls_rr_sort : public queue_sort {
+  enum {
+    IsBottomUp = true,
+    HasReadyFilter = false
+  };
+
+  RegReductionPQBase *SPQ;
+  hybrid_ls_rr_sort(RegReductionPQBase *spq)
+    : SPQ(spq) {}
+  hybrid_ls_rr_sort(const hybrid_ls_rr_sort &RHS)
+    : SPQ(RHS.SPQ) {}
+
+  bool isReady(SUnit *SU, unsigned CurCycle) const;
+
+  bool operator()(SUnit* left, SUnit* right) const;
+};
+
+// ilp_ls_rr_sort - Priority function for ILP (instruction level parallelism)
+// scheduler.
+struct ilp_ls_rr_sort : public queue_sort {
+  enum {
+    IsBottomUp = true,
+    HasReadyFilter = false
+  };
+
+  RegReductionPQBase *SPQ;
+  ilp_ls_rr_sort(RegReductionPQBase *spq)
+    : SPQ(spq) {}
+  ilp_ls_rr_sort(const ilp_ls_rr_sort &RHS)
+    : SPQ(RHS.SPQ) {}
+
+  bool isReady(SUnit *SU, unsigned CurCycle) const;
+
+  bool operator()(SUnit* left, SUnit* right) const;
+};
+
+class RegReductionPQBase : public SchedulingPriorityQueue {
+protected:
+  std::vector<SUnit*> Queue;
+  unsigned CurQueueId;
+  bool TracksRegPressure;
+  bool SrcOrder;
+
+  // SUnits - The SUnits for the current graph.
+  std::vector<SUnit> *SUnits;
+
+  MachineFunction &MF;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const TargetLowering *TLI;
+  ScheduleDAGRRList *scheduleDAG;
+
+  // SethiUllmanNumbers - The SethiUllman number for each node.
+  std::vector<unsigned> SethiUllmanNumbers;
+
+  /// RegPressure - Tracking current reg pressure per register class.
+  ///
+  std::vector<unsigned> RegPressure;
+
+  /// RegLimit - Tracking the number of allocatable registers per register
+  /// class.
+  std::vector<unsigned> RegLimit;
+
+public:
+  RegReductionPQBase(MachineFunction &mf,
+                     bool hasReadyFilter,
+                     bool tracksrp,
+                     bool srcorder,
+                     const TargetInstrInfo *tii,
+                     const TargetRegisterInfo *tri,
+                     const TargetLowering *tli)
+    : SchedulingPriorityQueue(hasReadyFilter),
+      CurQueueId(0), TracksRegPressure(tracksrp), SrcOrder(srcorder),
+      MF(mf), TII(tii), TRI(tri), TLI(tli), scheduleDAG(NULL) {
+    if (TracksRegPressure) {
+      unsigned NumRC = TRI->getNumRegClasses();
+      RegLimit.resize(NumRC);
+      RegPressure.resize(NumRC);
+      std::fill(RegLimit.begin(), RegLimit.end(), 0);
+      std::fill(RegPressure.begin(), RegPressure.end(), 0);
+      for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+             E = TRI->regclass_end(); I != E; ++I)
+        RegLimit[(*I)->getID()] = tri->getRegPressureLimit(*I, MF);
+    }
+  }
+
+  void setScheduleDAG(ScheduleDAGRRList *scheduleDag) {
+    scheduleDAG = scheduleDag;
+  }
+
+  ScheduleHazardRecognizer* getHazardRec() {
+    return scheduleDAG->getHazardRec();
+  }
+
+  void initNodes(std::vector<SUnit> &sunits);
+
+  void addNode(const SUnit *SU);
+
+  void updateNode(const SUnit *SU);
+
+  void releaseState() {
+    SUnits = 0;
+    SethiUllmanNumbers.clear();
+    std::fill(RegPressure.begin(), RegPressure.end(), 0);
+  }
+
+  unsigned getNodePriority(const SUnit *SU) const;
+
+  unsigned getNodeOrdering(const SUnit *SU) const {
+    if (!SU->getNode()) return 0;
+
+    return scheduleDAG->DAG->GetOrdering(SU->getNode());
+  }
+
+  bool empty() const { return Queue.empty(); }
+
+  void push(SUnit *U) {
+    assert(!U->NodeQueueId && "Node in the queue already");
+    U->NodeQueueId = ++CurQueueId;
+    Queue.push_back(U);
+  }
+
+  void remove(SUnit *SU) {
+    assert(!Queue.empty() && "Queue is empty!");
+    assert(SU->NodeQueueId != 0 && "Not in queue!");
+    std::vector<SUnit *>::iterator I = std::find(Queue.begin(), Queue.end(),
+                                                 SU);
+    if (I != prior(Queue.end()))
+      std::swap(*I, Queue.back());
+    Queue.pop_back();
+    SU->NodeQueueId = 0;
+  }
+
+  bool tracksRegPressure() const { return TracksRegPressure; }
+
+  void dumpRegPressure() const;
+
+  bool HighRegPressure(const SUnit *SU) const;
+
+  bool MayReduceRegPressure(SUnit *SU) const;
+
+  int RegPressureDiff(SUnit *SU, unsigned &LiveUses) const;
+
+  void scheduledNode(SUnit *SU);
+
+  void unscheduledNode(SUnit *SU);
+
+protected:
+  bool canClobber(const SUnit *SU, const SUnit *Op);
+  void AddPseudoTwoAddrDeps();
+  void PrescheduleNodesWithMultipleUses();
+  void CalculateSethiUllmanNumbers();
+};
+
+template<class SF>
+static SUnit *popFromQueueImpl(std::vector<SUnit*> &Q, SF &Picker) {
+  std::vector<SUnit *>::iterator Best = Q.begin();
+  for (std::vector<SUnit *>::iterator I = llvm::next(Q.begin()),
+         E = Q.end(); I != E; ++I)
+    if (Picker(*Best, *I))
+      Best = I;
+  SUnit *V = *Best;
+  if (Best != prior(Q.end()))
+    std::swap(*Best, Q.back());
+  Q.pop_back();
+  return V;
+}
+
+template<class SF>
+SUnit *popFromQueue(std::vector<SUnit*> &Q, SF &Picker, ScheduleDAG *DAG) {
+#ifndef NDEBUG
+  if (DAG->StressSched) {
+    reverse_sort<SF> RPicker(Picker);
+    return popFromQueueImpl(Q, RPicker);
+  }
+#endif
+  (void)DAG;
+  return popFromQueueImpl(Q, Picker);
+}
+
+template<class SF>
+class RegReductionPriorityQueue : public RegReductionPQBase {
+  SF Picker;
+
+public:
+  RegReductionPriorityQueue(MachineFunction &mf,
+                            bool tracksrp,
+                            bool srcorder,
+                            const TargetInstrInfo *tii,
+                            const TargetRegisterInfo *tri,
+                            const TargetLowering *tli)
+    : RegReductionPQBase(mf, SF::HasReadyFilter, tracksrp, srcorder,
+                         tii, tri, tli),
+      Picker(this) {}
+
+  bool isBottomUp() const { return SF::IsBottomUp; }
+
+  bool isReady(SUnit *U) const {
+    return Picker.HasReadyFilter && Picker.isReady(U, getCurCycle());
+  }
+
+  SUnit *pop() {
+    if (Queue.empty()) return NULL;
+
+    SUnit *V = popFromQueue(Queue, Picker, scheduleDAG);
+    V->NodeQueueId = 0;
+    return V;
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump(ScheduleDAG *DAG) const {
+    // Emulate pop() without clobbering NodeQueueIds.
+    std::vector<SUnit*> DumpQueue = Queue;
+    SF DumpPicker = Picker;
+    while (!DumpQueue.empty()) {
+      SUnit *SU = popFromQueue(DumpQueue, DumpPicker, scheduleDAG);
+      dbgs() << "Height " << SU->getHeight() << ": ";
+      SU->dump(DAG);
+    }
+  }
+#endif
+};
+
+typedef RegReductionPriorityQueue<bu_ls_rr_sort>
+BURegReductionPriorityQueue;
+
+typedef RegReductionPriorityQueue<src_ls_rr_sort>
+SrcRegReductionPriorityQueue;
+
+typedef RegReductionPriorityQueue<hybrid_ls_rr_sort>
+HybridBURRPriorityQueue;
+
+typedef RegReductionPriorityQueue<ilp_ls_rr_sort>
+ILPBURRPriorityQueue;
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//           Static Node Priority for Register Pressure Reduction
+//===----------------------------------------------------------------------===//
+
+// Check for special nodes that bypass scheduling heuristics.
+// Currently this pushes TokenFactor nodes down, but may be used for other
+// pseudo-ops as well.
+//
+// Return -1 to schedule right above left, 1 for left above right.
+// Return 0 if no bias exists.
+static int checkSpecialNodes(const SUnit *left, const SUnit *right) {
+  bool LSchedLow = left->isScheduleLow;
+  bool RSchedLow = right->isScheduleLow;
+  if (LSchedLow != RSchedLow)
+    return LSchedLow < RSchedLow ? 1 : -1;
+  return 0;
+}
+
+/// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
+/// Smaller number is the higher priority.
+static unsigned
+CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
+  unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
+  if (SethiUllmanNumber != 0)
+    return SethiUllmanNumber;
+
+  unsigned Extra = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    SUnit *PredSU = I->getSUnit();
+    unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
+    if (PredSethiUllman > SethiUllmanNumber) {
+      SethiUllmanNumber = PredSethiUllman;
+      Extra = 0;
+    } else if (PredSethiUllman == SethiUllmanNumber)
+      ++Extra;
+  }
+
+  SethiUllmanNumber += Extra;
+
+  if (SethiUllmanNumber == 0)
+    SethiUllmanNumber = 1;
+
+  return SethiUllmanNumber;
+}
+
+/// CalculateSethiUllmanNumbers - Calculate Sethi-Ullman numbers of all
+/// scheduling units.
+void RegReductionPQBase::CalculateSethiUllmanNumbers() {
+  SethiUllmanNumbers.assign(SUnits->size(), 0);
+
+  for (unsigned i = 0, e = SUnits->size(); i != e; ++i)
+    CalcNodeSethiUllmanNumber(&(*SUnits)[i], SethiUllmanNumbers);
+}
+
+void RegReductionPQBase::addNode(const SUnit *SU) {
+  unsigned SUSize = SethiUllmanNumbers.size();
+  if (SUnits->size() > SUSize)
+    SethiUllmanNumbers.resize(SUSize*2, 0);
+  CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+}
+
+void RegReductionPQBase::updateNode(const SUnit *SU) {
+  SethiUllmanNumbers[SU->NodeNum] = 0;
+  CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+}
+
+// Lower priority means schedule further down. For bottom-up scheduling, lower
+// priority SUs are scheduled before higher priority SUs.
+unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
+  assert(SU->NodeNum < SethiUllmanNumbers.size());
+  unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
+  if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
+    // CopyToReg should be close to its uses to facilitate coalescing and
+    // avoid spilling.
+    return 0;
+  if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+      Opc == TargetOpcode::SUBREG_TO_REG ||
+      Opc == TargetOpcode::INSERT_SUBREG)
+    // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
+    // close to their uses to facilitate coalescing.
+    return 0;
+  if (SU->NumSuccs == 0 && SU->NumPreds != 0)
+    // If SU does not have a register use, i.e. it doesn't produce a value
+    // that would be consumed (e.g. store), then it terminates a chain of
+    // computation.  Give it a large SethiUllman number so it will be
+    // scheduled right before its predecessors that it doesn't lengthen
+    // their live ranges.
+    return 0xffff;
+  if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+    // If SU does not have a register def, schedule it close to its uses
+    // because it does not lengthen any live ranges.
+    return 0;
+#if 1
+  return SethiUllmanNumbers[SU->NodeNum];
+#else
+  unsigned Priority = SethiUllmanNumbers[SU->NodeNum];
+  if (SU->isCallOp) {
+    // FIXME: This assumes all of the defs are used as call operands.
+    int NP = (int)Priority - SU->getNode()->getNumValues();
+    return (NP > 0) ? NP : 0;
+  }
+  return Priority;
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//                     Register Pressure Tracking
+//===----------------------------------------------------------------------===//
+
+void RegReductionPQBase::dumpRegPressure() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+         E = TRI->regclass_end(); I != E; ++I) {
+    const TargetRegisterClass *RC = *I;
+    unsigned Id = RC->getID();
+    unsigned RP = RegPressure[Id];
+    if (!RP) continue;
+    DEBUG(dbgs() << RC->getName() << ": " << RP << " / " << RegLimit[Id]
+          << '\n');
+  }
+#endif
+}
+
+bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
+  if (!TLI)
+    return false;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl())
+      continue;
+    SUnit *PredSU = I->getSUnit();
+    // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+    // to cover the number of registers defined (they are all live).
+    if (PredSU->NumRegDefsLeft == 0) {
+      continue;
+    }
+    for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+         RegDefPos.IsValid(); RegDefPos.Advance()) {
+      unsigned RCId, Cost;
+      GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
+
+      if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
+        return true;
+    }
+  }
+  return false;
+}
+
+bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) const {
+  const SDNode *N = SU->getNode();
+
+  if (!N->isMachineOpcode() || !SU->NumSuccs)
+    return false;
+
+  unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+  for (unsigned i = 0; i != NumDefs; ++i) {
+    MVT VT = N->getSimpleValueType(i);
+    if (!N->hasAnyUseOfValue(i))
+      continue;
+    unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+    if (RegPressure[RCId] >= RegLimit[RCId])
+      return true;
+  }
+  return false;
+}
+
+// Compute the register pressure contribution by this instruction by count up
+// for uses that are not live and down for defs. Only count register classes
+// that are already under high pressure. As a side effect, compute the number of
+// uses of registers that are already live.
+//
+// FIXME: This encompasses the logic in HighRegPressure and MayReduceRegPressure
+// so could probably be factored.
+int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const {
+  LiveUses = 0;
+  int PDiff = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl())
+      continue;
+    SUnit *PredSU = I->getSUnit();
+    // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+    // to cover the number of registers defined (they are all live).
+    if (PredSU->NumRegDefsLeft == 0) {
+      if (PredSU->getNode()->isMachineOpcode())
+        ++LiveUses;
+      continue;
+    }
+    for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+         RegDefPos.IsValid(); RegDefPos.Advance()) {
+      MVT VT = RegDefPos.GetValue();
+      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+      if (RegPressure[RCId] >= RegLimit[RCId])
+        ++PDiff;
+    }
+  }
+  const SDNode *N = SU->getNode();
+
+  if (!N || !N->isMachineOpcode() || !SU->NumSuccs)
+    return PDiff;
+
+  unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+  for (unsigned i = 0; i != NumDefs; ++i) {
+    MVT VT = N->getSimpleValueType(i);
+    if (!N->hasAnyUseOfValue(i))
+      continue;
+    unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+    if (RegPressure[RCId] >= RegLimit[RCId])
+      --PDiff;
+  }
+  return PDiff;
+}
+
+void RegReductionPQBase::scheduledNode(SUnit *SU) {
+  if (!TracksRegPressure)
+    return;
+
+  if (!SU->getNode())
+    return;
+
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl())
+      continue;
+    SUnit *PredSU = I->getSUnit();
+    // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+    // to cover the number of registers defined (they are all live).
+    if (PredSU->NumRegDefsLeft == 0) {
+      continue;
+    }
+    // FIXME: The ScheduleDAG currently loses information about which of a
+    // node's values is consumed by each dependence. Consequently, if the node
+    // defines multiple register classes, we don't know which to pressurize
+    // here. Instead the following loop consumes the register defs in an
+    // arbitrary order. At least it handles the common case of clustered loads
+    // to the same class. For precise liveness, each SDep needs to indicate the
+    // result number. But that tightly couples the ScheduleDAG with the
+    // SelectionDAG making updates tricky. A simpler hack would be to attach a
+    // value type or register class to SDep.
+    //
+    // The most important aspect of register tracking is balancing the increase
+    // here with the reduction further below. Note that this SU may use multiple
+    // defs in PredSU. The can't be determined here, but we've already
+    // compensated by reducing NumRegDefsLeft in PredSU during
+    // ScheduleDAGSDNodes::AddSchedEdges.
+    --PredSU->NumRegDefsLeft;
+    unsigned SkipRegDefs = PredSU->NumRegDefsLeft;
+    for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+         RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
+      if (SkipRegDefs)
+        continue;
+
+      unsigned RCId, Cost;
+      GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
+      RegPressure[RCId] += Cost;
+      break;
+    }
+  }
+
+  // We should have this assert, but there may be dead SDNodes that never
+  // materialize as SUnits, so they don't appear to generate liveness.
+  //assert(SU->NumRegDefsLeft == 0 && "not all regdefs have scheduled uses");
+  int SkipRegDefs = (int)SU->NumRegDefsLeft;
+  for (ScheduleDAGSDNodes::RegDefIter RegDefPos(SU, scheduleDAG);
+       RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
+    if (SkipRegDefs > 0)
+      continue;
+    unsigned RCId, Cost;
+    GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
+    if (RegPressure[RCId] < Cost) {
+      // Register pressure tracking is imprecise. This can happen. But we try
+      // hard not to let it happen because it likely results in poor scheduling.
+      DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") has too many regdefs\n");
+      RegPressure[RCId] = 0;
+    }
+    else {
+      RegPressure[RCId] -= Cost;
+    }
+  }
+  dumpRegPressure();
+}
+
+void RegReductionPQBase::unscheduledNode(SUnit *SU) {
+  if (!TracksRegPressure)
+    return;
+
+  const SDNode *N = SU->getNode();
+  if (!N) return;
+
+  if (!N->isMachineOpcode()) {
+    if (N->getOpcode() != ISD::CopyToReg)
+      return;
+  } else {
+    unsigned Opc = N->getMachineOpcode();
+    if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+        Opc == TargetOpcode::INSERT_SUBREG ||
+        Opc == TargetOpcode::SUBREG_TO_REG ||
+        Opc == TargetOpcode::REG_SEQUENCE ||
+        Opc == TargetOpcode::IMPLICIT_DEF)
+      return;
+  }
+
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl())
+      continue;
+    SUnit *PredSU = I->getSUnit();
+    // NumSuccsLeft counts all deps. Don't compare it with NumSuccs which only
+    // counts data deps.
+    if (PredSU->NumSuccsLeft != PredSU->Succs.size())
+      continue;
+    const SDNode *PN = PredSU->getNode();
+    if (!PN->isMachineOpcode()) {
+      if (PN->getOpcode() == ISD::CopyFromReg) {
+        MVT VT = PN->getSimpleValueType(0);
+        unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+        RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+      }
+      continue;
+    }
+    unsigned POpc = PN->getMachineOpcode();
+    if (POpc == TargetOpcode::IMPLICIT_DEF)
+      continue;
+    if (POpc == TargetOpcode::EXTRACT_SUBREG ||
+        POpc == TargetOpcode::INSERT_SUBREG ||
+        POpc == TargetOpcode::SUBREG_TO_REG) {
+      MVT VT = PN->getSimpleValueType(0);
+      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+      RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+      continue;
+    }
+    unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs();
+    for (unsigned i = 0; i != NumDefs; ++i) {
+      MVT VT = PN->getSimpleValueType(i);
+      if (!PN->hasAnyUseOfValue(i))
+        continue;
+      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+      if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT))
+        // Register pressure tracking is imprecise. This can happen.
+        RegPressure[RCId] = 0;
+      else
+        RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT);
+    }
+  }
+
+  // Check for isMachineOpcode() as PrescheduleNodesWithMultipleUses()
+  // may transfer data dependencies to CopyToReg.
+  if (SU->NumSuccs && N->isMachineOpcode()) {
+    unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+    for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
+      MVT VT = N->getSimpleValueType(i);
+      if (VT == MVT::Glue || VT == MVT::Other)
+        continue;
+      if (!N->hasAnyUseOfValue(i))
+        continue;
+      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+      RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+    }
+  }
+
+  dumpRegPressure();
+}
+
+//===----------------------------------------------------------------------===//
+//           Dynamic Node Priority for Register Pressure Reduction
+//===----------------------------------------------------------------------===//
+
+/// closestSucc - Returns the scheduled cycle of the successor which is
+/// closest to the current cycle.
+static unsigned closestSucc(const SUnit *SU) {
+  unsigned MaxHeight = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain succs
+    unsigned Height = I->getSUnit()->getHeight();
+    // If there are bunch of CopyToRegs stacked up, they should be considered
+    // to be at the same position.
+    if (I->getSUnit()->getNode() &&
+        I->getSUnit()->getNode()->getOpcode() == ISD::CopyToReg)
+      Height = closestSucc(I->getSUnit())+1;
+    if (Height > MaxHeight)
+      MaxHeight = Height;
+  }
+  return MaxHeight;
+}
+
+/// calcMaxScratches - Returns an cost estimate of the worse case requirement
+/// for scratch registers, i.e. number of data dependencies.
+static unsigned calcMaxScratches(const SUnit *SU) {
+  unsigned Scratches = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    Scratches++;
+  }
+  return Scratches;
+}
+
+/// hasOnlyLiveInOpers - Return true if SU has only value predecessors that are
+/// CopyFromReg from a virtual register.
+static bool hasOnlyLiveInOpers(const SUnit *SU) {
+  bool RetVal = false;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;
+    const SUnit *PredSU = I->getSUnit();
+    if (PredSU->getNode() &&
+        PredSU->getNode()->getOpcode() == ISD::CopyFromReg) {
+      unsigned Reg =
+        cast<RegisterSDNode>(PredSU->getNode()->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        RetVal = true;
+        continue;
+      }
+    }
+    return false;
+  }
+  return RetVal;
+}
+
+/// hasOnlyLiveOutUses - Return true if SU has only value successors that are
+/// CopyToReg to a virtual register. This SU def is probably a liveout and
+/// it has no other use. It should be scheduled closer to the terminator.
+static bool hasOnlyLiveOutUses(const SUnit *SU) {
+  bool RetVal = false;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;
+    const SUnit *SuccSU = I->getSUnit();
+    if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) {
+      unsigned Reg =
+        cast<RegisterSDNode>(SuccSU->getNode()->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        RetVal = true;
+        continue;
+      }
+    }
+    return false;
+  }
+  return RetVal;
+}
+
+// Set isVRegCycle for a node with only live in opers and live out uses. Also
+// set isVRegCycle for its CopyFromReg operands.
+//
+// This is only relevant for single-block loops, in which case the VRegCycle
+// node is likely an induction variable in which the operand and target virtual
+// registers should be coalesced (e.g. pre/post increment values). Setting the
+// isVRegCycle flag helps the scheduler prioritize other uses of the same
+// CopyFromReg so that this node becomes the virtual register "kill". This
+// avoids interference between the values live in and out of the block and
+// eliminates a copy inside the loop.
+static void initVRegCycle(SUnit *SU) {
+  if (DisableSchedVRegCycle)
+    return;
+
+  if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU))
+    return;
+
+  DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n");
+
+  SU->isVRegCycle = true;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;
+    I->getSUnit()->isVRegCycle = true;
+  }
+}
+
+// After scheduling the definition of a VRegCycle, clear the isVRegCycle flag of
+// CopyFromReg operands. We should no longer penalize other uses of this VReg.
+static void resetVRegCycle(SUnit *SU) {
+  if (!SU->isVRegCycle)
+    return;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    SUnit *PredSU = I->getSUnit();
+    if (PredSU->isVRegCycle) {
+      assert(PredSU->getNode()->getOpcode() == ISD::CopyFromReg &&
+             "VRegCycle def must be CopyFromReg");
+      I->getSUnit()->isVRegCycle = 0;
+    }
+  }
+}
+
+// Return true if this SUnit uses a CopyFromReg node marked as a VRegCycle. This
+// means a node that defines the VRegCycle has not been scheduled yet.
+static bool hasVRegCycleUse(const SUnit *SU) {
+  // If this SU also defines the VReg, don't hoist it as a "use".
+  if (SU->isVRegCycle)
+    return false;
+
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    if (I->getSUnit()->isVRegCycle &&
+        I->getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) {
+      DEBUG(dbgs() << "  VReg cycle use: SU (" << SU->NodeNum << ")\n");
+      return true;
+    }
+  }
+  return false;
+}
+
+// Check for either a dependence (latency) or resource (hazard) stall.
+//
+// Note: The ScheduleHazardRecognizer interface requires a non-const SU.
+static bool BUHasStall(SUnit *SU, int Height, RegReductionPQBase *SPQ) {
+  if ((int)SPQ->getCurCycle() < Height) return true;
+  if (SPQ->getHazardRec()->getHazardType(SU, 0)
+      != ScheduleHazardRecognizer::NoHazard)
+    return true;
+  return false;
+}
+
+// Return -1 if left has higher priority, 1 if right has higher priority.
+// Return 0 if latency-based priority is equivalent.
+static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
+                            RegReductionPQBase *SPQ) {
+  // Scheduling an instruction that uses a VReg whose postincrement has not yet
+  // been scheduled will induce a copy. Model this as an extra cycle of latency.
+  int LPenalty = hasVRegCycleUse(left) ? 1 : 0;
+  int RPenalty = hasVRegCycleUse(right) ? 1 : 0;
+  int LHeight = (int)left->getHeight() + LPenalty;
+  int RHeight = (int)right->getHeight() + RPenalty;
+
+  bool LStall = (!checkPref || left->SchedulingPref == Sched::ILP) &&
+    BUHasStall(left, LHeight, SPQ);
+  bool RStall = (!checkPref || right->SchedulingPref == Sched::ILP) &&
+    BUHasStall(right, RHeight, SPQ);
+
+  // If scheduling one of the node will cause a pipeline stall, delay it.
+  // If scheduling either one of the node will cause a pipeline stall, sort
+  // them according to their height.
+  if (LStall) {
+    if (!RStall)
+      return 1;
+    if (LHeight != RHeight)
+      return LHeight > RHeight ? 1 : -1;
+  } else if (RStall)
+    return -1;
+
+  // If either node is scheduling for latency, sort them by height/depth
+  // and latency.
+  if (!checkPref || (left->SchedulingPref == Sched::ILP ||
+                     right->SchedulingPref == Sched::ILP)) {
+    // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer
+    // is enabled, grouping instructions by cycle, then its height is already
+    // covered so only its depth matters. We also reach this point if both stall
+    // but have the same height.
+    if (!SPQ->getHazardRec()->isEnabled()) {
+      if (LHeight != RHeight)
+        return LHeight > RHeight ? 1 : -1;
+    }
+    int LDepth = left->getDepth() - LPenalty;
+    int RDepth = right->getDepth() - RPenalty;
+    if (LDepth != RDepth) {
+      DEBUG(dbgs() << "  Comparing latency of SU (" << left->NodeNum
+            << ") depth " << LDepth << " vs SU (" << right->NodeNum
+            << ") depth " << RDepth << "\n");
+      return LDepth < RDepth ? 1 : -1;
+    }
+    if (left->Latency != right->Latency)
+      return left->Latency > right->Latency ? 1 : -1;
+  }
+  return 0;
+}
+
+static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
+  // Schedule physical register definitions close to their use. This is
+  // motivated by microarchitectures that can fuse cmp+jump macro-ops. But as
+  // long as shortening physreg live ranges is generally good, we can defer
+  // creating a subtarget hook.
+  if (!DisableSchedPhysRegJoin) {
+    bool LHasPhysReg = left->hasPhysRegDefs;
+    bool RHasPhysReg = right->hasPhysRegDefs;
+    if (LHasPhysReg != RHasPhysReg) {
+      #ifndef NDEBUG
+      const char *const PhysRegMsg[] = {" has no physreg"," defines a physreg"};
+      #endif
+      DEBUG(dbgs() << "  SU (" << left->NodeNum << ") "
+            << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
+            << PhysRegMsg[RHasPhysReg] << "\n");
+      return LHasPhysReg < RHasPhysReg;
+    }
+  }
+
+  // Prioritize by Sethi-Ulmann number and push CopyToReg nodes down.
+  unsigned LPriority = SPQ->getNodePriority(left);
+  unsigned RPriority = SPQ->getNodePriority(right);
+
+  // Be really careful about hoisting call operands above previous calls.
+  // Only allows it if it would reduce register pressure.
+  if (left->isCall && right->isCallOp) {
+    unsigned RNumVals = right->getNode()->getNumValues();
+    RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0;
+  }
+  if (right->isCall && left->isCallOp) {
+    unsigned LNumVals = left->getNode()->getNumValues();
+    LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0;
+  }
+
+  if (LPriority != RPriority)
+    return LPriority > RPriority;
+
+  // One or both of the nodes are calls and their sethi-ullman numbers are the
+  // same, then keep source order.
+  if (left->isCall || right->isCall) {
+    unsigned LOrder = SPQ->getNodeOrdering(left);
+    unsigned ROrder = SPQ->getNodeOrdering(right);
+
+    // Prefer an ordering where the lower the non-zero order number, the higher
+    // the preference.
+    if ((LOrder || ROrder) && LOrder != ROrder)
+      return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+  }
+
+  // Try schedule def + use closer when Sethi-Ullman numbers are the same.
+  // e.g.
+  // t1 = op t2, c1
+  // t3 = op t4, c2
+  //
+  // and the following instructions are both ready.
+  // t2 = op c3
+  // t4 = op c4
+  //
+  // Then schedule t2 = op first.
+  // i.e.
+  // t4 = op c4
+  // t2 = op c3
+  // t1 = op t2, c1
+  // t3 = op t4, c2
+  //
+  // This creates more short live intervals.
+  unsigned LDist = closestSucc(left);
+  unsigned RDist = closestSucc(right);
+  if (LDist != RDist)
+    return LDist < RDist;
+
+  // How many registers becomes live when the node is scheduled.
+  unsigned LScratch = calcMaxScratches(left);
+  unsigned RScratch = calcMaxScratches(right);
+  if (LScratch != RScratch)
+    return LScratch > RScratch;
+
+  // Comparing latency against a call makes little sense unless the node
+  // is register pressure-neutral.
+  if ((left->isCall && RPriority > 0) || (right->isCall && LPriority > 0))
+    return (left->NodeQueueId > right->NodeQueueId);
+
+  // Do not compare latencies when one or both of the nodes are calls.
+  if (!DisableSchedCycles &&
+      !(left->isCall || right->isCall)) {
+    int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ);
+    if (result != 0)
+      return result > 0;
+  }
+  else {
+    if (left->getHeight() != right->getHeight())
+      return left->getHeight() > right->getHeight();
+
+    if (left->getDepth() != right->getDepth())
+      return left->getDepth() < right->getDepth();
+  }
+
+  assert(left->NodeQueueId && right->NodeQueueId &&
+         "NodeQueueId cannot be zero");
+  return (left->NodeQueueId > right->NodeQueueId);
+}
+
+// Bottom up
+bool bu_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
+  return BURRSort(left, right, SPQ);
+}
+
+// Source order, otherwise bottom up.
+bool src_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
+  unsigned LOrder = SPQ->getNodeOrdering(left);
+  unsigned ROrder = SPQ->getNodeOrdering(right);
+
+  // Prefer an ordering where the lower the non-zero order number, the higher
+  // the preference.
+  if ((LOrder || ROrder) && LOrder != ROrder)
+    return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+
+  return BURRSort(left, right, SPQ);
+}
+
+// If the time between now and when the instruction will be ready can cover
+// the spill code, then avoid adding it to the ready queue. This gives long
+// stalls highest priority and allows hoisting across calls. It should also
+// speed up processing the available queue.
+bool hybrid_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
+  static const unsigned ReadyDelay = 3;
+
+  if (SPQ->MayReduceRegPressure(SU)) return true;
+
+  if (SU->getHeight() > (CurCycle + ReadyDelay)) return false;
+
+  if (SPQ->getHazardRec()->getHazardType(SU, -ReadyDelay)
+      != ScheduleHazardRecognizer::NoHazard)
+    return false;
+
+  return true;
+}
+
+// Return true if right should be scheduled with higher priority than left.
+bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
+  if (left->isCall || right->isCall)
+    // No way to compute latency of calls.
+    return BURRSort(left, right, SPQ);
+
+  bool LHigh = SPQ->HighRegPressure(left);
+  bool RHigh = SPQ->HighRegPressure(right);
+  // Avoid causing spills. If register pressure is high, schedule for
+  // register pressure reduction.
+  if (LHigh && !RHigh) {
+    DEBUG(dbgs() << "  pressure SU(" << left->NodeNum << ") > SU("
+          << right->NodeNum << ")\n");
+    return true;
+  }
+  else if (!LHigh && RHigh) {
+    DEBUG(dbgs() << "  pressure SU(" << right->NodeNum << ") > SU("
+          << left->NodeNum << ")\n");
+    return false;
+  }
+  if (!LHigh && !RHigh) {
+    int result = BUCompareLatency(left, right, true /*checkPref*/, SPQ);
+    if (result != 0)
+      return result > 0;
+  }
+  return BURRSort(left, right, SPQ);
+}
+
+// Schedule as many instructions in each cycle as possible. So don't make an
+// instruction available unless it is ready in the current cycle.
+bool ilp_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
+  if (SU->getHeight() > CurCycle) return false;
+
+  if (SPQ->getHazardRec()->getHazardType(SU, 0)
+      != ScheduleHazardRecognizer::NoHazard)
+    return false;
+
+  return true;
+}
+
+static bool canEnableCoalescing(SUnit *SU) {
+  unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
+  if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
+    // CopyToReg should be close to its uses to facilitate coalescing and
+    // avoid spilling.
+    return true;
+
+  if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+      Opc == TargetOpcode::SUBREG_TO_REG ||
+      Opc == TargetOpcode::INSERT_SUBREG)
+    // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
+    // close to their uses to facilitate coalescing.
+    return true;
+
+  if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+    // If SU does not have a register def, schedule it close to its uses
+    // because it does not lengthen any live ranges.
+    return true;
+
+  return false;
+}
+
+// list-ilp is currently an experimental scheduler that allows various
+// heuristics to be enabled prior to the normal register reduction logic.
+bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+  if (int res = checkSpecialNodes(left, right))
+    return res > 0;
+
+  if (left->isCall || right->isCall)
+    // No way to compute latency of calls.
+    return BURRSort(left, right, SPQ);
+
+  unsigned LLiveUses = 0, RLiveUses = 0;
+  int LPDiff = 0, RPDiff = 0;
+  if (!DisableSchedRegPressure || !DisableSchedLiveUses) {
+    LPDiff = SPQ->RegPressureDiff(left, LLiveUses);
+    RPDiff = SPQ->RegPressureDiff(right, RLiveUses);
+  }
+  if (!DisableSchedRegPressure && LPDiff != RPDiff) {
+    DEBUG(dbgs() << "RegPressureDiff SU(" << left->NodeNum << "): " << LPDiff
+          << " != SU(" << right->NodeNum << "): " << RPDiff << "\n");
+    return LPDiff > RPDiff;
+  }
+
+  if (!DisableSchedRegPressure && (LPDiff > 0 || RPDiff > 0)) {
+    bool LReduce = canEnableCoalescing(left);
+    bool RReduce = canEnableCoalescing(right);
+    if (LReduce && !RReduce) return false;
+    if (RReduce && !LReduce) return true;
+  }
+
+  if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) {
+    DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses
+          << " != SU(" << right->NodeNum << "): " << RLiveUses << "\n");
+    return LLiveUses < RLiveUses;
+  }
+
+  if (!DisableSchedStalls) {
+    bool LStall = BUHasStall(left, left->getHeight(), SPQ);
+    bool RStall = BUHasStall(right, right->getHeight(), SPQ);
+    if (LStall != RStall)
+      return left->getHeight() > right->getHeight();
+  }
+
+  if (!DisableSchedCriticalPath) {
+    int spread = (int)left->getDepth() - (int)right->getDepth();
+    if (std::abs(spread) > MaxReorderWindow) {
+      DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+            << left->getDepth() << " != SU(" << right->NodeNum << "): "
+            << right->getDepth() << "\n");
+      return left->getDepth() < right->getDepth();
+    }
+  }
+
+  if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
+    int spread = (int)left->getHeight() - (int)right->getHeight();
+    if (std::abs(spread) > MaxReorderWindow)
+      return left->getHeight() > right->getHeight();
+  }
+
+  return BURRSort(left, right, SPQ);
+}
+
+void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) {
+  SUnits = &sunits;
+  // Add pseudo dependency edges for two-address nodes.
+  if (!Disable2AddrHack)
+    AddPseudoTwoAddrDeps();
+  // Reroute edges to nodes with multiple uses.
+  if (!TracksRegPressure && !SrcOrder)
+    PrescheduleNodesWithMultipleUses();
+  // Calculate node priorities.
+  CalculateSethiUllmanNumbers();
+
+  // For single block loops, mark nodes that look like canonical IV increments.
+  if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB)) {
+    for (unsigned i = 0, e = sunits.size(); i != e; ++i) {
+      initVRegCycle(&sunits[i]);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                    Preschedule for Register Pressure
+//===----------------------------------------------------------------------===//
+
+bool RegReductionPQBase::canClobber(const SUnit *SU, const SUnit *Op) {
+  if (SU->isTwoAddress) {
+    unsigned Opc = SU->getNode()->getMachineOpcode();
+    const MCInstrDesc &MCID = TII->get(Opc);
+    unsigned NumRes = MCID.getNumDefs();
+    unsigned NumOps = MCID.getNumOperands() - NumRes;
+    for (unsigned i = 0; i != NumOps; ++i) {
+      if (MCID.getOperandConstraint(i+NumRes, MCOI::TIED_TO) != -1) {
+        SDNode *DU = SU->getNode()->getOperand(i).getNode();
+        if (DU->getNodeId() != -1 &&
+            Op->OrigNode == &(*SUnits)[DU->getNodeId()])
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// canClobberReachingPhysRegUse - True if SU would clobber one of it's
+/// successor's explicit physregs whose definition can reach DepSU.
+/// i.e. DepSU should not be scheduled above SU.
+static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU,
+                                         ScheduleDAGRRList *scheduleDAG,
+                                         const TargetInstrInfo *TII,
+                                         const TargetRegisterInfo *TRI) {
+  const uint16_t *ImpDefs
+    = TII->get(SU->getNode()->getMachineOpcode()).getImplicitDefs();
+  const uint32_t *RegMask = getNodeRegMask(SU->getNode());
+  if(!ImpDefs && !RegMask)
+    return false;
+
+  for (SUnit::const_succ_iterator SI = SU->Succs.begin(), SE = SU->Succs.end();
+       SI != SE; ++SI) {
+    SUnit *SuccSU = SI->getSUnit();
+    for (SUnit::const_pred_iterator PI = SuccSU->Preds.begin(),
+           PE = SuccSU->Preds.end(); PI != PE; ++PI) {
+      if (!PI->isAssignedRegDep())
+        continue;
+
+      if (RegMask && MachineOperand::clobbersPhysReg(RegMask, PI->getReg()) &&
+          scheduleDAG->IsReachable(DepSU, PI->getSUnit()))
+        return true;
+
+      if (ImpDefs)
+        for (const uint16_t *ImpDef = ImpDefs; *ImpDef; ++ImpDef)
+          // Return true if SU clobbers this physical register use and the
+          // definition of the register reaches from DepSU. IsReachable queries
+          // a topological forward sort of the DAG (following the successors).
+          if (TRI->regsOverlap(*ImpDef, PI->getReg()) &&
+              scheduleDAG->IsReachable(DepSU, PI->getSUnit()))
+            return true;
+    }
+  }
+  return false;
+}
+
+/// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
+/// physical register defs.
+static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
+                                  const TargetInstrInfo *TII,
+                                  const TargetRegisterInfo *TRI) {
+  SDNode *N = SuccSU->getNode();
+  unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+  const uint16_t *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs();
+  assert(ImpDefs && "Caller should check hasPhysRegDefs");
+  for (const SDNode *SUNode = SU->getNode(); SUNode;
+       SUNode = SUNode->getGluedNode()) {
+    if (!SUNode->isMachineOpcode())
+      continue;
+    const uint16_t *SUImpDefs =
+      TII->get(SUNode->getMachineOpcode()).getImplicitDefs();
+    const uint32_t *SURegMask = getNodeRegMask(SUNode);
+    if (!SUImpDefs && !SURegMask)
+      continue;
+    for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
+      EVT VT = N->getValueType(i);
+      if (VT == MVT::Glue || VT == MVT::Other)
+        continue;
+      if (!N->hasAnyUseOfValue(i))
+        continue;
+      unsigned Reg = ImpDefs[i - NumDefs];
+      if (SURegMask && MachineOperand::clobbersPhysReg(SURegMask, Reg))
+        return true;
+      if (!SUImpDefs)
+        continue;
+      for (;*SUImpDefs; ++SUImpDefs) {
+        unsigned SUReg = *SUImpDefs;
+        if (TRI->regsOverlap(Reg, SUReg))
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// PrescheduleNodesWithMultipleUses - Nodes with multiple uses
+/// are not handled well by the general register pressure reduction
+/// heuristics. When presented with code like this:
+///
+///      N
+///    / |
+///   /  |
+///  U  store
+///  |
+/// ...
+///
+/// the heuristics tend to push the store up, but since the
+/// operand of the store has another use (U), this would increase
+/// the length of that other use (the U->N edge).
+///
+/// This function transforms code like the above to route U's
+/// dependence through the store when possible, like this:
+///
+///      N
+///      ||
+///      ||
+///     store
+///       |
+///       U
+///       |
+///      ...
+///
+/// This results in the store being scheduled immediately
+/// after N, which shortens the U->N live range, reducing
+/// register pressure.
+///
+void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
+  // Visit all the nodes in topological order, working top-down.
+  for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
+    SUnit *SU = &(*SUnits)[i];
+    // For now, only look at nodes with no data successors, such as stores.
+    // These are especially important, due to the heuristics in
+    // getNodePriority for nodes with no data successors.
+    if (SU->NumSuccs != 0)
+      continue;
+    // For now, only look at nodes with exactly one data predecessor.
+    if (SU->NumPreds != 1)
+      continue;
+    // Avoid prescheduling copies to virtual registers, which don't behave
+    // like other nodes from the perspective of scheduling heuristics.
+    if (SDNode *N = SU->getNode())
+      if (N->getOpcode() == ISD::CopyToReg &&
+          TargetRegisterInfo::isVirtualRegister
+            (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+        continue;
+
+    // Locate the single data predecessor.
+    SUnit *PredSU = 0;
+    for (SUnit::const_pred_iterator II = SU->Preds.begin(),
+         EE = SU->Preds.end(); II != EE; ++II)
+      if (!II->isCtrl()) {
+        PredSU = II->getSUnit();
+        break;
+      }
+    assert(PredSU);
+
+    // Don't rewrite edges that carry physregs, because that requires additional
+    // support infrastructure.
+    if (PredSU->hasPhysRegDefs)
+      continue;
+    // Short-circuit the case where SU is PredSU's only data successor.
+    if (PredSU->NumSuccs == 1)
+      continue;
+    // Avoid prescheduling to copies from virtual registers, which don't behave
+    // like other nodes from the perspective of scheduling heuristics.
+    if (SDNode *N = SU->getNode())
+      if (N->getOpcode() == ISD::CopyFromReg &&
+          TargetRegisterInfo::isVirtualRegister
+            (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+        continue;
+
+    // Perform checks on the successors of PredSU.
+    for (SUnit::const_succ_iterator II = PredSU->Succs.begin(),
+         EE = PredSU->Succs.end(); II != EE; ++II) {
+      SUnit *PredSuccSU = II->getSUnit();
+      if (PredSuccSU == SU) continue;
+      // If PredSU has another successor with no data successors, for
+      // now don't attempt to choose either over the other.
+      if (PredSuccSU->NumSuccs == 0)
+        goto outer_loop_continue;
+      // Don't break physical register dependencies.
+      if (SU->hasPhysRegClobbers && PredSuccSU->hasPhysRegDefs)
+        if (canClobberPhysRegDefs(PredSuccSU, SU, TII, TRI))
+          goto outer_loop_continue;
+      // Don't introduce graph cycles.
+      if (scheduleDAG->IsReachable(SU, PredSuccSU))
+        goto outer_loop_continue;
+    }
+
+    // Ok, the transformation is safe and the heuristics suggest it is
+    // profitable. Update the graph.
+    DEBUG(dbgs() << "    Prescheduling SU #" << SU->NodeNum
+                 << " next to PredSU #" << PredSU->NodeNum
+                 << " to guide scheduling in the presence of multiple uses\n");
+    for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
+      SDep Edge = PredSU->Succs[i];
+      assert(!Edge.isAssignedRegDep());
+      SUnit *SuccSU = Edge.getSUnit();
+      if (SuccSU != SU) {
+        Edge.setSUnit(PredSU);
+        scheduleDAG->RemovePred(SuccSU, Edge);
+        scheduleDAG->AddPred(SU, Edge);
+        Edge.setSUnit(SU);
+        scheduleDAG->AddPred(SuccSU, Edge);
+        --i;
+      }
+    }
+  outer_loop_continue:;
+  }
+}
+
+/// AddPseudoTwoAddrDeps - If two nodes share an operand and one of them uses
+/// it as a def&use operand. Add a pseudo control edge from it to the other
+/// node (if it won't create a cycle) so the two-address one will be scheduled
+/// first (lower in the schedule). If both nodes are two-address, favor the
+/// one that has a CopyToReg use (more likely to be a loop induction update).
+/// If both are two-address, but one is commutable while the other is not
+/// commutable, favor the one that's not commutable.
+void RegReductionPQBase::AddPseudoTwoAddrDeps() {
+  for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
+    SUnit *SU = &(*SUnits)[i];
+    if (!SU->isTwoAddress)
+      continue;
+
+    SDNode *Node = SU->getNode();
+    if (!Node || !Node->isMachineOpcode() || SU->getNode()->getGluedNode())
+      continue;
+
+    bool isLiveOut = hasOnlyLiveOutUses(SU);
+    unsigned Opc = Node->getMachineOpcode();
+    const MCInstrDesc &MCID = TII->get(Opc);
+    unsigned NumRes = MCID.getNumDefs();
+    unsigned NumOps = MCID.getNumOperands() - NumRes;
+    for (unsigned j = 0; j != NumOps; ++j) {
+      if (MCID.getOperandConstraint(j+NumRes, MCOI::TIED_TO) == -1)
+        continue;
+      SDNode *DU = SU->getNode()->getOperand(j).getNode();
+      if (DU->getNodeId() == -1)
+        continue;
+      const SUnit *DUSU = &(*SUnits)[DU->getNodeId()];
+      if (!DUSU) continue;
+      for (SUnit::const_succ_iterator I = DUSU->Succs.begin(),
+           E = DUSU->Succs.end(); I != E; ++I) {
+        if (I->isCtrl()) continue;
+        SUnit *SuccSU = I->getSUnit();
+        if (SuccSU == SU)
+          continue;
+        // Be conservative. Ignore if nodes aren't at roughly the same
+        // depth and height.
+        if (SuccSU->getHeight() < SU->getHeight() &&
+            (SU->getHeight() - SuccSU->getHeight()) > 1)
+          continue;
+        // Skip past COPY_TO_REGCLASS nodes, so that the pseudo edge
+        // constrains whatever is using the copy, instead of the copy
+        // itself. In the case that the copy is coalesced, this
+        // preserves the intent of the pseudo two-address heurietics.
+        while (SuccSU->Succs.size() == 1 &&
+               SuccSU->getNode()->isMachineOpcode() &&
+               SuccSU->getNode()->getMachineOpcode() ==
+                 TargetOpcode::COPY_TO_REGCLASS)
+          SuccSU = SuccSU->Succs.front().getSUnit();
+        // Don't constrain non-instruction nodes.
+        if (!SuccSU->getNode() || !SuccSU->getNode()->isMachineOpcode())
+          continue;
+        // Don't constrain nodes with physical register defs if the
+        // predecessor can clobber them.
+        if (SuccSU->hasPhysRegDefs && SU->hasPhysRegClobbers) {
+          if (canClobberPhysRegDefs(SuccSU, SU, TII, TRI))
+            continue;
+        }
+        // Don't constrain EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG;
+        // these may be coalesced away. We want them close to their uses.
+        unsigned SuccOpc = SuccSU->getNode()->getMachineOpcode();
+        if (SuccOpc == TargetOpcode::EXTRACT_SUBREG ||
+            SuccOpc == TargetOpcode::INSERT_SUBREG ||
+            SuccOpc == TargetOpcode::SUBREG_TO_REG)
+          continue;
+        if (!canClobberReachingPhysRegUse(SuccSU, SU, scheduleDAG, TII, TRI) &&
+            (!canClobber(SuccSU, DUSU) ||
+             (isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
+             (!SU->isCommutable && SuccSU->isCommutable)) &&
+            !scheduleDAG->IsReachable(SuccSU, SU)) {
+          DEBUG(dbgs() << "    Adding a pseudo-two-addr edge from SU #"
+                       << SU->NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
+          scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Artificial));
+        }
+      }
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+llvm::ScheduleDAGSDNodes *
+llvm::createBURRListDAGScheduler(SelectionDAGISel *IS,
+                                 CodeGenOpt::Level OptLevel) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+  BURegReductionPriorityQueue *PQ =
+    new BURegReductionPriorityQueue(*IS->MF, false, false, TII, TRI, 0);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel);
+  PQ->setScheduleDAG(SD);
+  return SD;
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createSourceListDAGScheduler(SelectionDAGISel *IS,
+                                   CodeGenOpt::Level OptLevel) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+  SrcRegReductionPriorityQueue *PQ =
+    new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, 0);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel);
+  PQ->setScheduleDAG(SD);
+  return SD;
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createHybridListDAGScheduler(SelectionDAGISel *IS,
+                                   CodeGenOpt::Level OptLevel) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetLowering *TLI = &IS->getTargetLowering();
+
+  HybridBURRPriorityQueue *PQ =
+    new HybridBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
+
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel);
+  PQ->setScheduleDAG(SD);
+  return SD;
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createILPListDAGScheduler(SelectionDAGISel *IS,
+                                CodeGenOpt::Level OptLevel) {
+  const TargetMachine &TM = IS->TM;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  const TargetLowering *TLI = &IS->getTargetLowering();
+
+  ILPBURRPriorityQueue *PQ =
+    new ILPBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
+  ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel);
+  PQ->setScheduleDAG(SD);
+  return SD;
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
new file mode 100644
index 000000000000..b22440daf16d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -0,0 +1,914 @@
+//===--- ScheduleDAGSDNodes.cpp - Implement the ScheduleDAGSDNodes class --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG class, which is a base class used by
+// scheduling implementation classes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "ScheduleDAGSDNodes.h"
+#include "InstrEmitter.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+STATISTIC(LoadsClustered, "Number of loads clustered together");
+
+// This allows latency based scheduler to notice high latency instructions
+// without a target itinerary. The choise if number here has more to do with
+// balancing scheduler heursitics than with the actual machine latency.
+static cl::opt<int> HighLatencyCycles(
+  "sched-high-latency-cycles", cl::Hidden, cl::init(10),
+  cl::desc("Roughly estimate the number of cycles that 'long latency'"
+           "instructions take for targets with no itinerary"));
+
+ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
+  : ScheduleDAG(mf), BB(0), DAG(0),
+    InstrItins(mf.getTarget().getInstrItineraryData()) {}
+
+/// Run - perform scheduling.
+///
+void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb) {
+  BB = bb;
+  DAG = dag;
+
+  // Clear the scheduler's SUnit DAG.
+  ScheduleDAG::clearDAG();
+  Sequence.clear();
+
+  // Invoke the target's selection of scheduler.
+  Schedule();
+}
+
+/// NewSUnit - Creates a new SUnit and return a ptr to it.
+///
+SUnit *ScheduleDAGSDNodes::newSUnit(SDNode *N) {
+#ifndef NDEBUG
+  const SUnit *Addr = 0;
+  if (!SUnits.empty())
+    Addr = &SUnits[0];
+#endif
+  SUnits.push_back(SUnit(N, (unsigned)SUnits.size()));
+  assert((Addr == 0 || Addr == &SUnits[0]) &&
+         "SUnits std::vector reallocated on the fly!");
+  SUnits.back().OrigNode = &SUnits.back();
+  SUnit *SU = &SUnits.back();
+  const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+  if (!N ||
+      (N->isMachineOpcode() &&
+       N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF))
+    SU->SchedulingPref = Sched::None;
+  else
+    SU->SchedulingPref = TLI.getSchedulingPreference(N);
+  return SU;
+}
+
+SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
+  SUnit *SU = newSUnit(Old->getNode());
+  SU->OrigNode = Old->OrigNode;
+  SU->Latency = Old->Latency;
+  SU->isVRegCycle = Old->isVRegCycle;
+  SU->isCall = Old->isCall;
+  SU->isCallOp = Old->isCallOp;
+  SU->isTwoAddress = Old->isTwoAddress;
+  SU->isCommutable = Old->isCommutable;
+  SU->hasPhysRegDefs = Old->hasPhysRegDefs;
+  SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
+  SU->isScheduleHigh = Old->isScheduleHigh;
+  SU->isScheduleLow = Old->isScheduleLow;
+  SU->SchedulingPref = Old->SchedulingPref;
+  Old->isCloned = true;
+  return SU;
+}
+
+/// CheckForPhysRegDependency - Check if the dependency between def and use of
+/// a specified operand is a physical register dependency. If so, returns the
+/// register and the cost of copying the register.
+static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
+                                      const TargetRegisterInfo *TRI,
+                                      const TargetInstrInfo *TII,
+                                      unsigned &PhysReg, int &Cost) {
+  if (Op != 2 || User->getOpcode() != ISD::CopyToReg)
+    return;
+
+  unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return;
+
+  unsigned ResNo = User->getOperand(2).getResNo();
+  if (Def->isMachineOpcode()) {
+    const MCInstrDesc &II = TII->get(Def->getMachineOpcode());
+    if (ResNo >= II.getNumDefs() &&
+        II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg) {
+      PhysReg = Reg;
+      const TargetRegisterClass *RC =
+        TRI->getMinimalPhysRegClass(Reg, Def->getValueType(ResNo));
+      Cost = RC->getCopyCost();
+    }
+  }
+}
+
+// Helper for AddGlue to clone node operands.
+static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG,
+                                SmallVectorImpl<EVT> &VTs,
+                                SDValue ExtraOper = SDValue()) {
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
+    Ops.push_back(N->getOperand(I));
+
+  if (ExtraOper.getNode())
+    Ops.push_back(ExtraOper);
+
+  SDVTList VTList = DAG->getVTList(&VTs[0], VTs.size());
+  MachineSDNode::mmo_iterator Begin = 0, End = 0;
+  MachineSDNode *MN = dyn_cast<MachineSDNode>(N);
+
+  // Store memory references.
+  if (MN) {
+    Begin = MN->memoperands_begin();
+    End = MN->memoperands_end();
+  }
+
+  DAG->MorphNodeTo(N, N->getOpcode(), VTList, &Ops[0], Ops.size());
+
+  // Reset the memory references
+  if (MN)
+    MN->setMemRefs(Begin, End);
+}
+
+static bool AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) {
+  SmallVector<EVT, 4> VTs;
+  SDNode *GlueDestNode = Glue.getNode();
+
+  // Don't add glue from a node to itself.
+  if (GlueDestNode == N) return false;
+
+  // Don't add a glue operand to something that already uses glue.
+  if (GlueDestNode &&
+      N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
+    return false;
+  }
+  // Don't add glue to something that already has a glue value.
+  if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return false;
+
+  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
+    VTs.push_back(N->getValueType(I));
+
+  if (AddGlue)
+    VTs.push_back(MVT::Glue);
+
+  CloneNodeWithValues(N, DAG, VTs, Glue);
+
+  return true;
+}
+
+// Cleanup after unsuccessful AddGlue. Use the standard method of morphing the
+// node even though simply shrinking the value list is sufficient.
+static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) {
+  assert((N->getValueType(N->getNumValues() - 1) == MVT::Glue &&
+          !N->hasAnyUseOfValue(N->getNumValues() - 1)) &&
+         "expected an unused glue value");
+
+  SmallVector<EVT, 4> VTs;
+  for (unsigned I = 0, E = N->getNumValues()-1; I != E; ++I)
+    VTs.push_back(N->getValueType(I));
+
+  CloneNodeWithValues(N, DAG, VTs);
+}
+
+/// ClusterNeighboringLoads - Force nearby loads together by "gluing" them.
+/// This function finds loads of the same base and different offsets. If the
+/// offsets are not far apart (target specific), it add MVT::Glue inputs and
+/// outputs to ensure they are scheduled together and in order. This
+/// optimization may benefit some targets by improving cache locality.
+void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
+  SDNode *Chain = 0;
+  unsigned NumOps = Node->getNumOperands();
+  if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
+    Chain = Node->getOperand(NumOps-1).getNode();
+  if (!Chain)
+    return;
+
+  // Look for other loads of the same chain. Find loads that are loading from
+  // the same base pointer and different offsets.
+  SmallPtrSet<SDNode*, 16> Visited;
+  SmallVector<int64_t, 4> Offsets;
+  DenseMap<long long, SDNode*> O2SMap;  // Map from offset to SDNode.
+  bool Cluster = false;
+  SDNode *Base = Node;
+  for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
+       I != E; ++I) {
+    SDNode *User = *I;
+    if (User == Node || !Visited.insert(User))
+      continue;
+    int64_t Offset1, Offset2;
+    if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) ||
+        Offset1 == Offset2)
+      // FIXME: Should be ok if they addresses are identical. But earlier
+      // optimizations really should have eliminated one of the loads.
+      continue;
+    if (O2SMap.insert(std::make_pair(Offset1, Base)).second)
+      Offsets.push_back(Offset1);
+    O2SMap.insert(std::make_pair(Offset2, User));
+    Offsets.push_back(Offset2);
+    if (Offset2 < Offset1)
+      Base = User;
+    Cluster = true;
+  }
+
+  if (!Cluster)
+    return;
+
+  // Sort them in increasing order.
+  std::sort(Offsets.begin(), Offsets.end());
+
+  // Check if the loads are close enough.
+  SmallVector<SDNode*, 4> Loads;
+  unsigned NumLoads = 0;
+  int64_t BaseOff = Offsets[0];
+  SDNode *BaseLoad = O2SMap[BaseOff];
+  Loads.push_back(BaseLoad);
+  for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
+    int64_t Offset = Offsets[i];
+    SDNode *Load = O2SMap[Offset];
+    if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads))
+      break; // Stop right here. Ignore loads that are further away.
+    Loads.push_back(Load);
+    ++NumLoads;
+  }
+
+  if (NumLoads == 0)
+    return;
+
+  // Cluster loads by adding MVT::Glue outputs and inputs. This also
+  // ensure they are scheduled in order of increasing addresses.
+  SDNode *Lead = Loads[0];
+  SDValue InGlue = SDValue(0, 0);
+  if (AddGlue(Lead, InGlue, true, DAG))
+    InGlue = SDValue(Lead, Lead->getNumValues() - 1);
+  for (unsigned I = 1, E = Loads.size(); I != E; ++I) {
+    bool OutGlue = I < E - 1;
+    SDNode *Load = Loads[I];
+
+    // If AddGlue fails, we could leave an unsused glue value. This should not
+    // cause any
+    if (AddGlue(Load, InGlue, OutGlue, DAG)) {
+      if (OutGlue)
+        InGlue = SDValue(Load, Load->getNumValues() - 1);
+
+      ++LoadsClustered;
+    }
+    else if (!OutGlue && InGlue.getNode())
+      RemoveUnusedGlue(InGlue.getNode(), DAG);
+  }
+}
+
+/// ClusterNodes - Cluster certain nodes which should be scheduled together.
+///
+void ScheduleDAGSDNodes::ClusterNodes() {
+  for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
+       E = DAG->allnodes_end(); NI != E; ++NI) {
+    SDNode *Node = &*NI;
+    if (!Node || !Node->isMachineOpcode())
+      continue;
+
+    unsigned Opc = Node->getMachineOpcode();
+    const MCInstrDesc &MCID = TII->get(Opc);
+    if (MCID.mayLoad())
+      // Cluster loads from "near" addresses into combined SUnits.
+      ClusterNeighboringLoads(Node);
+  }
+}
+
+void ScheduleDAGSDNodes::BuildSchedUnits() {
+  // During scheduling, the NodeId field of SDNode is used to map SDNodes
+  // to their associated SUnits by holding SUnits table indices. A value
+  // of -1 means the SDNode does not yet have an associated SUnit.
+  unsigned NumNodes = 0;
+  for (SelectionDAG::allnodes_iterator NI = DAG->allnodes_begin(),
+       E = DAG->allnodes_end(); NI != E; ++NI) {
+    NI->setNodeId(-1);
+    ++NumNodes;
+  }
+
+  // Reserve entries in the vector for each of the SUnits we are creating.  This
+  // ensure that reallocation of the vector won't happen, so SUnit*'s won't get
+  // invalidated.
+  // FIXME: Multiply by 2 because we may clone nodes during scheduling.
+  // This is a temporary workaround.
+  SUnits.reserve(NumNodes * 2);
+
+  // Add all nodes in depth first order.
+  SmallVector<SDNode*, 64> Worklist;
+  SmallPtrSet<SDNode*, 64> Visited;
+  Worklist.push_back(DAG->getRoot().getNode());
+  Visited.insert(DAG->getRoot().getNode());
+
+  SmallVector<SUnit*, 8> CallSUnits;
+  while (!Worklist.empty()) {
+    SDNode *NI = Worklist.pop_back_val();
+
+    // Add all operands to the worklist unless they've already been added.
+    for (unsigned i = 0, e = NI->getNumOperands(); i != e; ++i)
+      if (Visited.insert(NI->getOperand(i).getNode()))
+        Worklist.push_back(NI->getOperand(i).getNode());
+
+    if (isPassiveNode(NI))  // Leaf node, e.g. a TargetImmediate.
+      continue;
+
+    // If this node has already been processed, stop now.
+    if (NI->getNodeId() != -1) continue;
+
+    SUnit *NodeSUnit = newSUnit(NI);
+
+    // See if anything is glued to this node, if so, add them to glued
+    // nodes.  Nodes can have at most one glue input and one glue output.  Glue
+    // is required to be the last operand and result of a node.
+
+    // Scan up to find glued preds.
+    SDNode *N = NI;
+    while (N->getNumOperands() &&
+           N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
+      N = N->getOperand(N->getNumOperands()-1).getNode();
+      assert(N->getNodeId() == -1 && "Node already inserted!");
+      N->setNodeId(NodeSUnit->NodeNum);
+      if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+        NodeSUnit->isCall = true;
+    }
+
+    // Scan down to find any glued succs.
+    N = NI;
+    while (N->getValueType(N->getNumValues()-1) == MVT::Glue) {
+      SDValue GlueVal(N, N->getNumValues()-1);
+
+      // There are either zero or one users of the Glue result.
+      bool HasGlueUse = false;
+      for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
+           UI != E; ++UI)
+        if (GlueVal.isOperandOf(*UI)) {
+          HasGlueUse = true;
+          assert(N->getNodeId() == -1 && "Node already inserted!");
+          N->setNodeId(NodeSUnit->NodeNum);
+          N = *UI;
+          if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+            NodeSUnit->isCall = true;
+          break;
+        }
+      if (!HasGlueUse) break;
+    }
+
+    if (NodeSUnit->isCall)
+      CallSUnits.push_back(NodeSUnit);
+
+    // Schedule zero-latency TokenFactor below any nodes that may increase the
+    // schedule height. Otherwise, ancestors of the TokenFactor may appear to
+    // have false stalls.
+    if (NI->getOpcode() == ISD::TokenFactor)
+      NodeSUnit->isScheduleLow = true;
+
+    // If there are glue operands involved, N is now the bottom-most node
+    // of the sequence of nodes that are glued together.
+    // Update the SUnit.
+    NodeSUnit->setNode(N);
+    assert(N->getNodeId() == -1 && "Node already inserted!");
+    N->setNodeId(NodeSUnit->NodeNum);
+
+    // Compute NumRegDefsLeft. This must be done before AddSchedEdges.
+    InitNumRegDefsLeft(NodeSUnit);
+
+    // Assign the Latency field of NodeSUnit using target-provided information.
+    computeLatency(NodeSUnit);
+  }
+
+  // Find all call operands.
+  while (!CallSUnits.empty()) {
+    SUnit *SU = CallSUnits.pop_back_val();
+    for (const SDNode *SUNode = SU->getNode(); SUNode;
+         SUNode = SUNode->getGluedNode()) {
+      if (SUNode->getOpcode() != ISD::CopyToReg)
+        continue;
+      SDNode *SrcN = SUNode->getOperand(2).getNode();
+      if (isPassiveNode(SrcN)) continue;   // Not scheduled.
+      SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
+      SrcSU->isCallOp = true;
+    }
+  }
+}
+
+void ScheduleDAGSDNodes::AddSchedEdges() {
+  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+
+  // Check to see if the scheduler cares about latencies.
+  bool UnitLatencies = forceUnitLatencies();
+
+  // Pass 2: add the preds, succs, etc.
+  for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
+    SUnit *SU = &SUnits[su];
+    SDNode *MainNode = SU->getNode();
+
+    if (MainNode->isMachineOpcode()) {
+      unsigned Opc = MainNode->getMachineOpcode();
+      const MCInstrDesc &MCID = TII->get(Opc);
+      for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+        if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+          SU->isTwoAddress = true;
+          break;
+        }
+      }
+      if (MCID.isCommutable())
+        SU->isCommutable = true;
+    }
+
+    // Find all predecessors and successors of the group.
+    for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
+      if (N->isMachineOpcode() &&
+          TII->get(N->getMachineOpcode()).getImplicitDefs()) {
+        SU->hasPhysRegClobbers = true;
+        unsigned NumUsed = InstrEmitter::CountResults(N);
+        while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
+          --NumUsed;    // Skip over unused values at the end.
+        if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
+          SU->hasPhysRegDefs = true;
+      }
+
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+        SDNode *OpN = N->getOperand(i).getNode();
+        if (isPassiveNode(OpN)) continue;   // Not scheduled.
+        SUnit *OpSU = &SUnits[OpN->getNodeId()];
+        assert(OpSU && "Node has no SUnit!");
+        if (OpSU == SU) continue;           // In the same group.
+
+        EVT OpVT = N->getOperand(i).getValueType();
+        assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
+        bool isChain = OpVT == MVT::Other;
+
+        unsigned PhysReg = 0;
+        int Cost = 1;
+        // Determine if this is a physical register dependency.
+        CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
+        assert((PhysReg == 0 || !isChain) &&
+               "Chain dependence via physreg data?");
+        // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
+        // emits a copy from the physical register to a virtual register unless
+        // it requires a cross class copy (cost < 0). That means we are only
+        // treating "expensive to copy" register dependency as physical register
+        // dependency. This may change in the future though.
+        if (Cost >= 0 && !StressSched)
+          PhysReg = 0;
+
+        // If this is a ctrl dep, latency is 1.
+        unsigned OpLatency = isChain ? 1 : OpSU->Latency;
+        // Special-case TokenFactor chains as zero-latency.
+        if(isChain && OpN->getOpcode() == ISD::TokenFactor)
+          OpLatency = 0;
+
+        SDep Dep = isChain ? SDep(OpSU, SDep::Barrier)
+          : SDep(OpSU, SDep::Data, PhysReg);
+        Dep.setLatency(OpLatency);
+        if (!isChain && !UnitLatencies) {
+          computeOperandLatency(OpN, N, i, Dep);
+          ST.adjustSchedDependency(OpSU, SU, Dep);
+        }
+
+        if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
+          // Multiple register uses are combined in the same SUnit. For example,
+          // we could have a set of glued nodes with all their defs consumed by
+          // another set of glued nodes. Register pressure tracking sees this as
+          // a single use, so to keep pressure balanced we reduce the defs.
+          //
+          // We can't tell (without more book-keeping) if this results from
+          // glued nodes or duplicate operands. As long as we don't reduce
+          // NumRegDefsLeft to zero, we handle the common cases well.
+          --OpSU->NumRegDefsLeft;
+        }
+      }
+    }
+  }
+}
+
+/// BuildSchedGraph - Build the SUnit graph from the selection dag that we
+/// are input.  This SUnit graph is similar to the SelectionDAG, but
+/// excludes nodes that aren't interesting to scheduling, and represents
+/// glued together nodes with a single SUnit.
+void ScheduleDAGSDNodes::BuildSchedGraph(AliasAnalysis *AA) {
+  // Cluster certain nodes which should be scheduled together.
+  ClusterNodes();
+  // Populate the SUnits array.
+  BuildSchedUnits();
+  // Compute all the scheduling dependencies between nodes.
+  AddSchedEdges();
+}
+
+// Initialize NumNodeDefs for the current Node's opcode.
+void ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs() {
+  // Check for phys reg copy.
+  if (!Node)
+    return;
+
+  if (!Node->isMachineOpcode()) {
+    if (Node->getOpcode() == ISD::CopyFromReg)
+      NodeNumDefs = 1;
+    else
+      NodeNumDefs = 0;
+    return;
+  }
+  unsigned POpc = Node->getMachineOpcode();
+  if (POpc == TargetOpcode::IMPLICIT_DEF) {
+    // No register need be allocated for this.
+    NodeNumDefs = 0;
+    return;
+  }
+  unsigned NRegDefs = SchedDAG->TII->get(Node->getMachineOpcode()).getNumDefs();
+  // Some instructions define regs that are not represented in the selection DAG
+  // (e.g. unused flags). See tMOVi8. Make sure we don't access past NumValues.
+  NodeNumDefs = std::min(Node->getNumValues(), NRegDefs);
+  DefIdx = 0;
+}
+
+// Construct a RegDefIter for this SUnit and find the first valid value.
+ScheduleDAGSDNodes::RegDefIter::RegDefIter(const SUnit *SU,
+                                           const ScheduleDAGSDNodes *SD)
+  : SchedDAG(SD), Node(SU->getNode()), DefIdx(0), NodeNumDefs(0) {
+  InitNodeNumDefs();
+  Advance();
+}
+
+// Advance to the next valid value defined by the SUnit.
+void ScheduleDAGSDNodes::RegDefIter::Advance() {
+  for (;Node;) { // Visit all glued nodes.
+    for (;DefIdx < NodeNumDefs; ++DefIdx) {
+      if (!Node->hasAnyUseOfValue(DefIdx))
+        continue;
+      ValueType = Node->getSimpleValueType(DefIdx);
+      ++DefIdx;
+      return; // Found a normal regdef.
+    }
+    Node = Node->getGluedNode();
+    if (Node == NULL) {
+      return; // No values left to visit.
+    }
+    InitNodeNumDefs();
+  }
+}
+
+void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) {
+  assert(SU->NumRegDefsLeft == 0 && "expect a new node");
+  for (RegDefIter I(SU, this); I.IsValid(); I.Advance()) {
+    assert(SU->NumRegDefsLeft < USHRT_MAX && "overflow is ok but unexpected");
+    ++SU->NumRegDefsLeft;
+  }
+}
+
+void ScheduleDAGSDNodes::computeLatency(SUnit *SU) {
+  SDNode *N = SU->getNode();
+
+  // TokenFactor operands are considered zero latency, and some schedulers
+  // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero
+  // whenever node latency is nonzero.
+  if (N && N->getOpcode() == ISD::TokenFactor) {
+    SU->Latency = 0;
+    return;
+  }
+
+  // Check to see if the scheduler cares about latencies.
+  if (forceUnitLatencies()) {
+    SU->Latency = 1;
+    return;
+  }
+
+  if (!InstrItins || InstrItins->isEmpty()) {
+    if (N && N->isMachineOpcode() &&
+        TII->isHighLatencyDef(N->getMachineOpcode()))
+      SU->Latency = HighLatencyCycles;
+    else
+      SU->Latency = 1;
+    return;
+  }
+
+  // Compute the latency for the node.  We use the sum of the latencies for
+  // all nodes glued together into this SUnit.
+  SU->Latency = 0;
+  for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
+    if (N->isMachineOpcode())
+      SU->Latency += TII->getInstrLatency(InstrItins, N);
+}
+
+void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
+                                               unsigned OpIdx, SDep& dep) const{
+  // Check to see if the scheduler cares about latencies.
+  if (forceUnitLatencies())
+    return;
+
+  if (dep.getKind() != SDep::Data)
+    return;
+
+  unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
+  if (Use->isMachineOpcode())
+    // Adjust the use operand index by num of defs.
+    OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();
+  int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
+  if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
+      !BB->succ_empty()) {
+    unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      // This copy is a liveout value. It is likely coalesced, so reduce the
+      // latency so not to penalize the def.
+      // FIXME: need target specific adjustment here?
+      Latency = (Latency > 1) ? Latency - 1 : 1;
+  }
+  if (Latency >= 0)
+    dep.setLatency(Latency);
+}
+
+void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  if (!SU->getNode()) {
+    dbgs() << "PHYS REG COPY\n";
+    return;
+  }
+
+  SU->getNode()->dump(DAG);
+  dbgs() << "\n";
+  SmallVector<SDNode *, 4> GluedNodes;
+  for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode())
+    GluedNodes.push_back(N);
+  while (!GluedNodes.empty()) {
+    dbgs() << "    ";
+    GluedNodes.back()->dump(DAG);
+    dbgs() << "\n";
+    GluedNodes.pop_back();
+  }
+#endif
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ScheduleDAGSDNodes::dumpSchedule() const {
+  for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+    if (SUnit *SU = Sequence[i])
+      SU->dump(this);
+    else
+      dbgs() << "**** NOOP ****\n";
+  }
+}
+#endif
+
+#ifndef NDEBUG
+/// VerifyScheduledSequence - Verify that all SUnits were scheduled and that
+/// their state is consistent with the nodes listed in Sequence.
+///
+void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) {
+  unsigned ScheduledNodes = ScheduleDAG::VerifyScheduledDAG(isBottomUp);
+  unsigned Noops = 0;
+  for (unsigned i = 0, e = Sequence.size(); i != e; ++i)
+    if (!Sequence[i])
+      ++Noops;
+  assert(Sequence.size() - Noops == ScheduledNodes &&
+         "The number of nodes scheduled doesn't match the expected number!");
+}
+#endif // NDEBUG
+
+namespace {
+  struct OrderSorter {
+    bool operator()(const std::pair<unsigned, MachineInstr*> &A,
+                    const std::pair<unsigned, MachineInstr*> &B) {
+      return A.first < B.first;
+    }
+  };
+}
+
+/// ProcessSDDbgValues - Process SDDbgValues associated with this node.
+static void ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG,
+                               InstrEmitter &Emitter,
+                    SmallVector<std::pair<unsigned, MachineInstr*>, 32> &Orders,
+                            DenseMap<SDValue, unsigned> &VRBaseMap,
+                            unsigned Order) {
+  if (!N->getHasDebugValue())
+    return;
+
+  // Opportunistically insert immediate dbg_value uses, i.e. those with source
+  // order number right after the N.
+  MachineBasicBlock *BB = Emitter.getBlock();
+  MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
+  ArrayRef<SDDbgValue*> DVs = DAG->GetDbgValues(N);
+  for (unsigned i = 0, e = DVs.size(); i != e; ++i) {
+    if (DVs[i]->isInvalidated())
+      continue;
+    unsigned DVOrder = DVs[i]->getOrder();
+    if (!Order || DVOrder == ++Order) {
+      MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], VRBaseMap);
+      if (DbgMI) {
+        Orders.push_back(std::make_pair(DVOrder, DbgMI));
+        BB->insert(InsertPos, DbgMI);
+      }
+      DVs[i]->setIsInvalidated();
+    }
+  }
+}
+
+// ProcessSourceNode - Process nodes with source order numbers. These are added
+// to a vector which EmitSchedule uses to determine how to insert dbg_value
+// instructions in the right order.
+static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG,
+                           InstrEmitter &Emitter,
+                           DenseMap<SDValue, unsigned> &VRBaseMap,
+                    SmallVector<std::pair<unsigned, MachineInstr*>, 32> &Orders,
+                           SmallSet<unsigned, 8> &Seen) {
+  unsigned Order = DAG->GetOrdering(N);
+  if (!Order || !Seen.insert(Order)) {
+    // Process any valid SDDbgValues even if node does not have any order
+    // assigned.
+    ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, 0);
+    return;
+  }
+
+  MachineBasicBlock *BB = Emitter.getBlock();
+  if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI()) {
+    // Did not insert any instruction.
+    Orders.push_back(std::make_pair(Order, (MachineInstr*)0));
+    return;
+  }
+
+  Orders.push_back(std::make_pair(Order, prior(Emitter.getInsertPos())));
+  ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, Order);
+}
+
+void ScheduleDAGSDNodes::
+EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap,
+                MachineBasicBlock::iterator InsertPos) {
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl()) continue;  // ignore chain preds
+    if (I->getSUnit()->CopyDstRC) {
+      // Copy to physical register.
+      DenseMap<SUnit*, unsigned>::iterator VRI = VRBaseMap.find(I->getSUnit());
+      assert(VRI != VRBaseMap.end() && "Node emitted out of order - late");
+      // Find the destination physical register.
+      unsigned Reg = 0;
+      for (SUnit::const_succ_iterator II = SU->Succs.begin(),
+             EE = SU->Succs.end(); II != EE; ++II) {
+        if (II->isCtrl()) continue;  // ignore chain preds
+        if (II->getReg()) {
+          Reg = II->getReg();
+          break;
+        }
+      }
+      BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg)
+        .addReg(VRI->second);
+    } else {
+      // Copy from physical register.
+      assert(I->getReg() && "Unknown physical register!");
+      unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC);
+      bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second;
+      (void)isNew; // Silence compiler warning.
+      assert(isNew && "Node emitted out of order - early");
+      BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase)
+        .addReg(I->getReg());
+    }
+    break;
+  }
+}
+
+/// EmitSchedule - Emit the machine code in scheduled order. Return the new
+/// InsertPos and MachineBasicBlock that contains this insertion
+/// point. ScheduleDAGSDNodes holds a BB pointer for convenience, but this does
+/// not necessarily refer to returned BB. The emitter may split blocks.
+MachineBasicBlock *ScheduleDAGSDNodes::
+EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
+  InstrEmitter Emitter(BB, InsertPos);
+  DenseMap<SDValue, unsigned> VRBaseMap;
+  DenseMap<SUnit*, unsigned> CopyVRBaseMap;
+  SmallVector<std::pair<unsigned, MachineInstr*>, 32> Orders;
+  SmallSet<unsigned, 8> Seen;
+  bool HasDbg = DAG->hasDebugValues();
+
+  // If this is the first BB, emit byval parameter dbg_value's.
+  if (HasDbg && BB->getParent()->begin() == MachineFunction::iterator(BB)) {
+    SDDbgInfo::DbgIterator PDI = DAG->ByvalParmDbgBegin();
+    SDDbgInfo::DbgIterator PDE = DAG->ByvalParmDbgEnd();
+    for (; PDI != PDE; ++PDI) {
+      MachineInstr *DbgMI= Emitter.EmitDbgValue(*PDI, VRBaseMap);
+      if (DbgMI)
+        BB->insert(InsertPos, DbgMI);
+    }
+  }
+
+  for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+    SUnit *SU = Sequence[i];
+    if (!SU) {
+      // Null SUnit* is a noop.
+      TII->insertNoop(*Emitter.getBlock(), InsertPos);
+      continue;
+    }
+
+    // For pre-regalloc scheduling, create instructions corresponding to the
+    // SDNode and any glued SDNodes and append them to the block.
+    if (!SU->getNode()) {
+      // Emit a copy.
+      EmitPhysRegCopy(SU, CopyVRBaseMap, InsertPos);
+      continue;
+    }
+
+    SmallVector<SDNode *, 4> GluedNodes;
+    for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode())
+      GluedNodes.push_back(N);
+    while (!GluedNodes.empty()) {
+      SDNode *N = GluedNodes.back();
+      Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned,
+                       VRBaseMap);
+      // Remember the source order of the inserted instruction.
+      if (HasDbg)
+        ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen);
+      GluedNodes.pop_back();
+    }
+    Emitter.EmitNode(SU->getNode(), SU->OrigNode != SU, SU->isCloned,
+                     VRBaseMap);
+    // Remember the source order of the inserted instruction.
+    if (HasDbg)
+      ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders,
+                        Seen);
+  }
+
+  // Insert all the dbg_values which have not already been inserted in source
+  // order sequence.
+  if (HasDbg) {
+    MachineBasicBlock::iterator BBBegin = BB->getFirstNonPHI();
+
+    // Sort the source order instructions and use the order to insert debug
+    // values.
+    std::sort(Orders.begin(), Orders.end(), OrderSorter());
+
+    SDDbgInfo::DbgIterator DI = DAG->DbgBegin();
+    SDDbgInfo::DbgIterator DE = DAG->DbgEnd();
+    // Now emit the rest according to source order.
+    unsigned LastOrder = 0;
+    for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) {
+      unsigned Order = Orders[i].first;
+      MachineInstr *MI = Orders[i].second;
+      // Insert all SDDbgValue's whose order(s) are before "Order".
+      if (!MI)
+        continue;
+      for (; DI != DE &&
+             (*DI)->getOrder() >= LastOrder && (*DI)->getOrder() < Order; ++DI) {
+        if ((*DI)->isInvalidated())
+          continue;
+        MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap);
+        if (DbgMI) {
+          if (!LastOrder)
+            // Insert to start of the BB (after PHIs).
+            BB->insert(BBBegin, DbgMI);
+          else {
+            // Insert at the instruction, which may be in a different
+            // block, if the block was split by a custom inserter.
+            MachineBasicBlock::iterator Pos = MI;
+            MI->getParent()->insert(llvm::next(Pos), DbgMI);
+          }
+        }
+      }
+      LastOrder = Order;
+    }
+    // Add trailing DbgValue's before the terminator. FIXME: May want to add
+    // some of them before one or more conditional branches?
+    SmallVector<MachineInstr*, 8> DbgMIs;
+    while (DI != DE) {
+      if (!(*DI)->isInvalidated())
+        if (MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap))
+          DbgMIs.push_back(DbgMI);
+      ++DI;
+    }
+
+    MachineBasicBlock *InsertBB = Emitter.getBlock();
+    MachineBasicBlock::iterator Pos = InsertBB->getFirstTerminator();
+    InsertBB->insert(Pos, DbgMIs.begin(), DbgMIs.end());
+  }
+
+  InsertPos = Emitter.getInsertPos();
+  return Emitter.getBlock();
+}
+
+/// Return the basic block label.
+std::string ScheduleDAGSDNodes::getDAGName() const {
+  return "sunit-dag." + BB->getFullName();
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
new file mode 100644
index 000000000000..2ff37e0a15e1
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -0,0 +1,185 @@
+//===---- ScheduleDAGSDNodes.h - SDNode Scheduling --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScheduleDAGSDNodes class, which implements
+// scheduling for an SDNode-based dependency graph.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCHEDULEDAGSDNODES_H
+#define SCHEDULEDAGSDNODES_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+namespace llvm {
+  /// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs.
+  ///
+  /// Edges between SUnits are initially based on edges in the SelectionDAG,
+  /// and additional edges can be added by the schedulers as heuristics.
+  /// SDNodes such as Constants, Registers, and a few others that are not
+  /// interesting to schedulers are not allocated SUnits.
+  ///
+  /// SDNodes with MVT::Glue operands are grouped along with the flagged
+  /// nodes into a single SUnit so that they are scheduled together.
+  ///
+  /// SDNode-based scheduling graphs do not use SDep::Anti or SDep::Output
+  /// edges.  Physical register dependence information is not carried in
+  /// the DAG and must be handled explicitly by schedulers.
+  ///
+  class ScheduleDAGSDNodes : public ScheduleDAG {
+  public:
+    MachineBasicBlock *BB;
+    SelectionDAG *DAG;                    // DAG of the current basic block
+    const InstrItineraryData *InstrItins;
+
+    /// The schedule. Null SUnit*'s represent noop instructions.
+    std::vector<SUnit*> Sequence;
+
+    explicit ScheduleDAGSDNodes(MachineFunction &mf);
+
+    virtual ~ScheduleDAGSDNodes() {}
+
+    /// Run - perform scheduling.
+    ///
+    void Run(SelectionDAG *dag, MachineBasicBlock *bb);
+
+    /// isPassiveNode - Return true if the node is a non-scheduled leaf.
+    ///
+    static bool isPassiveNode(SDNode *Node) {
+      if (isa<ConstantSDNode>(Node))       return true;
+      if (isa<ConstantFPSDNode>(Node))     return true;
+      if (isa<RegisterSDNode>(Node))       return true;
+      if (isa<RegisterMaskSDNode>(Node))   return true;
+      if (isa<GlobalAddressSDNode>(Node))  return true;
+      if (isa<BasicBlockSDNode>(Node))     return true;
+      if (isa<FrameIndexSDNode>(Node))     return true;
+      if (isa<ConstantPoolSDNode>(Node))   return true;
+      if (isa<TargetIndexSDNode>(Node))    return true;
+      if (isa<JumpTableSDNode>(Node))      return true;
+      if (isa<ExternalSymbolSDNode>(Node)) return true;
+      if (isa<BlockAddressSDNode>(Node))   return true;
+      if (Node->getOpcode() == ISD::EntryToken ||
+          isa<MDNodeSDNode>(Node)) return true;
+      return false;
+    }
+
+    /// NewSUnit - Creates a new SUnit and return a ptr to it.
+    ///
+    SUnit *newSUnit(SDNode *N);
+
+    /// Clone - Creates a clone of the specified SUnit. It does not copy the
+    /// predecessors / successors info nor the temporary scheduling states.
+    ///
+    SUnit *Clone(SUnit *N);
+
+    /// BuildSchedGraph - Build the SUnit graph from the selection dag that we
+    /// are input.  This SUnit graph is similar to the SelectionDAG, but
+    /// excludes nodes that aren't interesting to scheduling, and represents
+    /// flagged together nodes with a single SUnit.
+    void BuildSchedGraph(AliasAnalysis *AA);
+
+    /// InitVRegCycleFlag - Set isVRegCycle if this node's single use is
+    /// CopyToReg and its only active data operands are CopyFromReg within a
+    /// single block loop.
+    ///
+    void InitVRegCycleFlag(SUnit *SU);
+
+    /// InitNumRegDefsLeft - Determine the # of regs defined by this node.
+    ///
+    void InitNumRegDefsLeft(SUnit *SU);
+
+    /// computeLatency - Compute node latency.
+    ///
+    virtual void computeLatency(SUnit *SU);
+
+    virtual void computeOperandLatency(SDNode *Def, SDNode *Use,
+                                       unsigned OpIdx, SDep& dep) const;
+
+    /// Schedule - Order nodes according to selected style, filling
+    /// in the Sequence member.
+    ///
+    virtual void Schedule() = 0;
+
+    /// VerifyScheduledSequence - Verify that all SUnits are scheduled and
+    /// consistent with the Sequence of scheduled instructions.
+    void VerifyScheduledSequence(bool isBottomUp);
+
+    /// EmitSchedule - Insert MachineInstrs into the MachineBasicBlock
+    /// according to the order specified in Sequence.
+    ///
+    virtual MachineBasicBlock*
+    EmitSchedule(MachineBasicBlock::iterator &InsertPos);
+
+    virtual void dumpNode(const SUnit *SU) const;
+
+    void dumpSchedule() const;
+
+    virtual std::string getGraphNodeLabel(const SUnit *SU) const;
+
+    virtual std::string getDAGName() const;
+
+    virtual void getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const;
+
+    /// RegDefIter - In place iteration over the values defined by an
+    /// SUnit. This does not need copies of the iterator or any other STLisms.
+    /// The iterator creates itself, rather than being provided by the SchedDAG.
+    class RegDefIter {
+      const ScheduleDAGSDNodes *SchedDAG;
+      const SDNode *Node;
+      unsigned DefIdx;
+      unsigned NodeNumDefs;
+      MVT ValueType;
+    public:
+      RegDefIter(const SUnit *SU, const ScheduleDAGSDNodes *SD);
+
+      bool IsValid() const { return Node != NULL; }
+
+      MVT GetValue() const {
+        assert(IsValid() && "bad iterator");
+        return ValueType;
+      }
+
+      const SDNode *GetNode() const {
+        return Node;
+      }
+
+      unsigned GetIdx() const {
+        return DefIdx-1;
+      }
+
+      void Advance();
+    private:
+      void InitNodeNumDefs();
+    };
+
+  protected:
+    /// ForceUnitLatencies - Return true if all scheduling edges should be given
+    /// a latency value of one.  The default is to return false; schedulers may
+    /// override this as needed.
+    virtual bool forceUnitLatencies() const { return false; }
+
+  private:
+    /// ClusterNeighboringLoads - Cluster loads from "near" addresses into
+    /// combined SUnits.
+    void ClusterNeighboringLoads(SDNode *Node);
+    /// ClusterNodes - Cluster certain nodes which should be scheduled together.
+    ///
+    void ClusterNodes();
+
+    /// BuildSchedUnits, AddSchedEdges - Helper functions for BuildSchedGraph.
+    void BuildSchedUnits();
+    void AddSchedEdges();
+
+    void EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap,
+                         MachineBasicBlock::iterator InsertPos);
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
new file mode 100644
index 000000000000..58aa1fe0ebbe
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -0,0 +1,278 @@
+//===- ScheduleDAGVLIW.cpp - SelectionDAG list scheduler for VLIW -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <climits>
+using namespace llvm;
+
+STATISTIC(NumNoops , "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+
+static RegisterScheduler
+  VLIWScheduler("vliw-td", "VLIW scheduler",
+                createVLIWDAGScheduler);
+
+namespace {
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGVLIW - The actual DFA list scheduler implementation.  This
+/// supports / top-down scheduling.
+///
+class ScheduleDAGVLIW : public ScheduleDAGSDNodes {
+private:
+  /// AvailableQueue - The priority queue to use for the available SUnits.
+  ///
+  SchedulingPriorityQueue *AvailableQueue;
+
+  /// PendingQueue - This contains all of the instructions whose operands have
+  /// been issued, but their results are not ready yet (due to the latency of
+  /// the operation).  Once the operands become available, the instruction is
+  /// added to the AvailableQueue.
+  std::vector<SUnit*> PendingQueue;
+
+  /// HazardRec - The hazard recognizer to use.
+  ScheduleHazardRecognizer *HazardRec;
+
+  /// AA - AliasAnalysis for making memory reference queries.
+  AliasAnalysis *AA;
+
+public:
+  ScheduleDAGVLIW(MachineFunction &mf,
+                  AliasAnalysis *aa,
+                  SchedulingPriorityQueue *availqueue)
+    : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) {
+
+    const TargetMachine &tm = mf.getTarget();
+    HazardRec = tm.getInstrInfo()->CreateTargetHazardRecognizer(&tm, this);
+  }
+
+  ~ScheduleDAGVLIW() {
+    delete HazardRec;
+    delete AvailableQueue;
+  }
+
+  void Schedule();
+
+private:
+  void releaseSucc(SUnit *SU, const SDep &D);
+  void releaseSuccessors(SUnit *SU);
+  void scheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+  void listScheduleTopDown();
+};
+}  // end anonymous namespace
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGVLIW::Schedule() {
+  DEBUG(dbgs()
+        << "********** List Scheduling BB#" << BB->getNumber()
+        << " '" << BB->getName() << "' **********\n");
+
+  // Build the scheduling graph.
+  BuildSchedGraph(AA);
+
+  AvailableQueue->initNodes(SUnits);
+
+  listScheduleTopDown();
+
+  AvailableQueue->releaseState();
+}
+
+//===----------------------------------------------------------------------===//
+//  Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// releaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGVLIW::releaseSucc(SUnit *SU, const SDep &D) {
+  SUnit *SuccSU = D.getSUnit();
+
+#ifndef NDEBUG
+  if (SuccSU->NumPredsLeft == 0) {
+    dbgs() << "*** Scheduling failed! ***\n";
+    SuccSU->dump(this);
+    dbgs() << " has been released too many times!\n";
+    llvm_unreachable(0);
+  }
+#endif
+  assert(!D.isWeak() && "unexpected artificial DAG edge");
+
+  --SuccSU->NumPredsLeft;
+
+  SuccSU->setDepthToAtLeast(SU->getDepth() + D.getLatency());
+
+  // If all the node's predecessors are scheduled, this node is ready
+  // to be scheduled. Ignore the special ExitSU node.
+  if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) {
+    PendingQueue.push_back(SuccSU);
+  }
+}
+
+void ScheduleDAGVLIW::releaseSuccessors(SUnit *SU) {
+  // Top down: release successors.
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    assert(!I->isAssignedRegDep() &&
+           "The list-td scheduler doesn't yet support physreg dependencies!");
+
+    releaseSucc(SU, *I);
+  }
+}
+
+/// scheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGVLIW::scheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+  DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+  DEBUG(SU->dump(this));
+
+  Sequence.push_back(SU);
+  assert(CurCycle >= SU->getDepth() && "Node scheduled above its depth!");
+  SU->setDepthToAtLeast(CurCycle);
+
+  releaseSuccessors(SU);
+  SU->isScheduled = true;
+  AvailableQueue->scheduledNode(SU);
+}
+
+/// listScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void ScheduleDAGVLIW::listScheduleTopDown() {
+  unsigned CurCycle = 0;
+
+  // Release any successors of the special Entry node.
+  releaseSuccessors(&EntrySU);
+
+  // All leaves to AvailableQueue.
+  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+    // It is available if it has no predecessors.
+    if (SUnits[i].Preds.empty()) {
+      AvailableQueue->push(&SUnits[i]);
+      SUnits[i].isAvailable = true;
+    }
+  }
+
+  // While AvailableQueue is not empty, grab the node with the highest
+  // priority. If it is not ready put it back.  Schedule the node.
+  std::vector<SUnit*> NotReady;
+  Sequence.reserve(SUnits.size());
+  while (!AvailableQueue->empty() || !PendingQueue.empty()) {
+    // Check to see if any of the pending instructions are ready to issue.  If
+    // so, add them to the available queue.
+    for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+      if (PendingQueue[i]->getDepth() == CurCycle) {
+        AvailableQueue->push(PendingQueue[i]);
+        PendingQueue[i]->isAvailable = true;
+        PendingQueue[i] = PendingQueue.back();
+        PendingQueue.pop_back();
+        --i; --e;
+      }
+      else {
+        assert(PendingQueue[i]->getDepth() > CurCycle && "Negative latency?");
+      }
+    }
+
+    // If there are no instructions available, don't try to issue anything, and
+    // don't advance the hazard recognizer.
+    if (AvailableQueue->empty()) {
+      // Reset DFA state.
+      AvailableQueue->scheduledNode(0);
+      ++CurCycle;
+      continue;
+    }
+
+    SUnit *FoundSUnit = 0;
+
+    bool HasNoopHazards = false;
+    while (!AvailableQueue->empty()) {
+      SUnit *CurSUnit = AvailableQueue->pop();
+
+      ScheduleHazardRecognizer::HazardType HT =
+        HazardRec->getHazardType(CurSUnit, 0/*no stalls*/);
+      if (HT == ScheduleHazardRecognizer::NoHazard) {
+        FoundSUnit = CurSUnit;
+        break;
+      }
+
+      // Remember if this is a noop hazard.
+      HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+
+      NotReady.push_back(CurSUnit);
+    }
+
+    // Add the nodes that aren't ready back onto the available list.
+    if (!NotReady.empty()) {
+      AvailableQueue->push_all(NotReady);
+      NotReady.clear();
+    }
+
+    // If we found a node to schedule, do it now.
+    if (FoundSUnit) {
+      scheduleNodeTopDown(FoundSUnit, CurCycle);
+      HazardRec->EmitInstruction(FoundSUnit);
+
+      // If this is a pseudo-op node, we don't want to increment the current
+      // cycle.
+      if (FoundSUnit->Latency)  // Don't increment CurCycle for pseudo-ops!
+        ++CurCycle;
+    } else if (!HasNoopHazards) {
+      // Otherwise, we have a pipeline stall, but no other problem, just advance
+      // the current cycle and try again.
+      DEBUG(dbgs() << "*** Advancing cycle, no work to do\n");
+      HazardRec->AdvanceCycle();
+      ++NumStalls;
+      ++CurCycle;
+    } else {
+      // Otherwise, we have no instructions to issue and we have instructions
+      // that will fault if we don't do this right.  This is the case for
+      // processors without pipeline interlocks and other cases.
+      DEBUG(dbgs() << "*** Emitting noop\n");
+      HazardRec->EmitNoop();
+      Sequence.push_back(0);   // NULL here means noop
+      ++NumNoops;
+      ++CurCycle;
+    }
+  }
+
+#ifndef NDEBUG
+  VerifyScheduledSequence(/*isBottomUp=*/false);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+/// createVLIWDAGScheduler - This creates a top-down list scheduler.
+ScheduleDAGSDNodes *
+llvm::createVLIWDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+  return new ScheduleDAGVLIW(*IS->MF, IS->AA, new ResourcePriorityQueue(IS));
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
new file mode 100644
index 000000000000..64244313a326
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -0,0 +1,6382 @@
+//===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "SDNodeDbgValue.h"
+#include "SDNodeOrdering.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+/// makeVTList - Return an instance of the SDVTList struct initialized with the
+/// specified members.
+static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
+  SDVTList Res = {VTs, NumVTs};
+  return Res;
+}
+
+// Default null implementations of the callbacks.
+void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {}
+void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
+
+//===----------------------------------------------------------------------===//
+//                              ConstantFPSDNode Class
+//===----------------------------------------------------------------------===//
+
+/// isExactlyValue - We don't rely on operator== working on double values, as
+/// it returns true for things that are clearly not equal, like -0.0 and 0.0.
+/// As such, this method can be used to do an exact bit-for-bit comparison of
+/// two floating point values.
+bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
+  return getValueAPF().bitwiseIsEqual(V);
+}
+
+bool ConstantFPSDNode::isValueValidForType(EVT VT,
+                                           const APFloat& Val) {
+  assert(VT.isFloatingPoint() && "Can only convert between FP types");
+
+  // convert modifies in place, so make a copy.
+  APFloat Val2 = APFloat(Val);
+  bool losesInfo;
+  (void) Val2.convert(SelectionDAG::EVTToAPFloatSemantics(VT),
+                      APFloat::rmNearestTiesToEven,
+                      &losesInfo);
+  return !losesInfo;
+}
+
+//===----------------------------------------------------------------------===//
+//                              ISD Namespace
+//===----------------------------------------------------------------------===//
+
+/// isBuildVectorAllOnes - Return true if the specified node is a
+/// BUILD_VECTOR where all of the elements are ~0 or undef.
+bool ISD::isBuildVectorAllOnes(const SDNode *N) {
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0).getNode();
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+  unsigned i = 0, e = N->getNumOperands();
+
+  // Skip over all of the undef values.
+  while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF)
+    ++i;
+
+  // Do not accept an all-undef vector.
+  if (i == e) return false;
+
+  // Do not accept build_vectors that aren't all constants or which have non-~0
+  // elements. We have to be a bit careful here, as the type of the constant
+  // may not be the same as the type of the vector elements due to type
+  // legalization (the elements are promoted to a legal type for the target and
+  // a vector of a type may be legal when the base element type is not).
+  // We only want to check enough bits to cover the vector elements, because
+  // we care if the resultant vector is all ones, not whether the individual
+  // constants are.
+  SDValue NotZero = N->getOperand(i);
+  unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) {
+    if (CN->getAPIntValue().countTrailingOnes() < EltSize)
+      return false;
+  } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(NotZero)) {
+    if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize)
+      return false;
+  } else
+    return false;
+
+  // Okay, we have at least one ~0 value, check to see if the rest match or are
+  // undefs. Even with the above element type twiddling, this should be OK, as
+  // the same type legalization should have applied to all the elements.
+  for (++i; i != e; ++i)
+    if (N->getOperand(i) != NotZero &&
+        N->getOperand(i).getOpcode() != ISD::UNDEF)
+      return false;
+  return true;
+}
+
+
+/// isBuildVectorAllZeros - Return true if the specified node is a
+/// BUILD_VECTOR where all of the elements are 0 or undef.
+bool ISD::isBuildVectorAllZeros(const SDNode *N) {
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0).getNode();
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+  unsigned i = 0, e = N->getNumOperands();
+
+  // Skip over all of the undef values.
+  while (i != e && N->getOperand(i).getOpcode() == ISD::UNDEF)
+    ++i;
+
+  // Do not accept an all-undef vector.
+  if (i == e) return false;
+
+  // Do not accept build_vectors that aren't all constants or which have non-0
+  // elements.
+  SDValue Zero = N->getOperand(i);
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Zero)) {
+    if (!CN->isNullValue())
+      return false;
+  } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Zero)) {
+    if (!CFPN->getValueAPF().isPosZero())
+      return false;
+  } else
+    return false;
+
+  // Okay, we have at least one 0 value, check to see if the rest match or are
+  // undefs.
+  for (++i; i != e; ++i)
+    if (N->getOperand(i) != Zero &&
+        N->getOperand(i).getOpcode() != ISD::UNDEF)
+      return false;
+  return true;
+}
+
+/// isScalarToVector - Return true if the specified node is a
+/// ISD::SCALAR_TO_VECTOR node or a BUILD_VECTOR node where only the low
+/// element is not an undef.
+bool ISD::isScalarToVector(const SDNode *N) {
+  if (N->getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return true;
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+  if (N->getOperand(0).getOpcode() == ISD::UNDEF)
+    return false;
+  unsigned NumElems = N->getNumOperands();
+  if (NumElems == 1)
+    return false;
+  for (unsigned i = 1; i < NumElems; ++i) {
+    SDValue V = N->getOperand(i);
+    if (V.getOpcode() != ISD::UNDEF)
+      return false;
+  }
+  return true;
+}
+
+/// allOperandsUndef - Return true if the node has at least one operand
+/// and all operands of the specified node are ISD::UNDEF.
+bool ISD::allOperandsUndef(const SDNode *N) {
+  // Return false if the node has no operands.
+  // This is "logically inconsistent" with the definition of "all" but
+  // is probably the desired behavior.
+  if (N->getNumOperands() == 0)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e ; ++i)
+    if (N->getOperand(i).getOpcode() != ISD::UNDEF)
+      return false;
+
+  return true;
+}
+
+/// getSetCCSwappedOperands - Return the operation corresponding to (Y op X)
+/// when given the operation for (X op Y).
+ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
+  // To perform this operation, we just need to swap the L and G bits of the
+  // operation.
+  unsigned OldL = (Operation >> 2) & 1;
+  unsigned OldG = (Operation >> 1) & 1;
+  return ISD::CondCode((Operation & ~6) |  // Keep the N, U, E bits
+                       (OldL << 1) |       // New G bit
+                       (OldG << 2));       // New L bit.
+}
+
+/// getSetCCInverse - Return the operation corresponding to !(X op Y), where
+/// 'op' is a valid SetCC operation.
+ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
+  unsigned Operation = Op;
+  if (isInteger)
+    Operation ^= 7;   // Flip L, G, E bits, but not U.
+  else
+    Operation ^= 15;  // Flip all of the condition bits.
+
+  if (Operation > ISD::SETTRUE2)
+    Operation &= ~8;  // Don't let N and U bits get set.
+
+  return ISD::CondCode(Operation);
+}
+
+
+/// isSignedOp - For an integer comparison, return 1 if the comparison is a
+/// signed operation and 2 if the result is an unsigned comparison.  Return zero
+/// if the operation does not depend on the sign of the input (setne and seteq).
+static int isSignedOp(ISD::CondCode Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Illegal integer setcc operation!");
+  case ISD::SETEQ:
+  case ISD::SETNE: return 0;
+  case ISD::SETLT:
+  case ISD::SETLE:
+  case ISD::SETGT:
+  case ISD::SETGE: return 1;
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUGT:
+  case ISD::SETUGE: return 2;
+  }
+}
+
+/// getSetCCOrOperation - Return the result of a logical OR between different
+/// comparisons of identical values: ((X op1 Y) | (X op2 Y)).  This function
+/// returns SETCC_INVALID if it is not possible to represent the resultant
+/// comparison.
+ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+                                       bool isInteger) {
+  if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+    // Cannot fold a signed integer setcc with an unsigned integer setcc.
+    return ISD::SETCC_INVALID;
+
+  unsigned Op = Op1 | Op2;  // Combine all of the condition bits.
+
+  // If the N and U bits get set then the resultant comparison DOES suddenly
+  // care about orderedness, and is true when ordered.
+  if (Op > ISD::SETTRUE2)
+    Op &= ~16;     // Clear the U bit if the N bit is set.
+
+  // Canonicalize illegal integer setcc's.
+  if (isInteger && Op == ISD::SETUNE)  // e.g. SETUGT | SETULT
+    Op = ISD::SETNE;
+
+  return ISD::CondCode(Op);
+}
+
+/// getSetCCAndOperation - Return the result of a logical AND between different
+/// comparisons of identical values: ((X op1 Y) & (X op2 Y)).  This
+/// function returns zero if it is not possible to represent the resultant
+/// comparison.
+ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+                                        bool isInteger) {
+  if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+    // Cannot fold a signed setcc with an unsigned setcc.
+    return ISD::SETCC_INVALID;
+
+  // Combine all of the condition bits.
+  ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
+
+  // Canonicalize illegal integer setcc's.
+  if (isInteger) {
+    switch (Result) {
+    default: break;
+    case ISD::SETUO : Result = ISD::SETFALSE; break;  // SETUGT & SETULT
+    case ISD::SETOEQ:                                 // SETEQ  & SETU[LG]E
+    case ISD::SETUEQ: Result = ISD::SETEQ   ; break;  // SETUGE & SETULE
+    case ISD::SETOLT: Result = ISD::SETULT  ; break;  // SETULT & SETNE
+    case ISD::SETOGT: Result = ISD::SETUGT  ; break;  // SETUGT & SETNE
+    }
+  }
+
+  return Result;
+}
+
+//===----------------------------------------------------------------------===//
+//                           SDNode Profile Support
+//===----------------------------------------------------------------------===//
+
+/// AddNodeIDOpcode - Add the node opcode to the NodeID data.
+///
+static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC)  {
+  ID.AddInteger(OpC);
+}
+
+/// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them
+/// solely with their pointer.
+static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
+  ID.AddPointer(VTList.VTs);
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+///
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+                              const SDValue *Ops, unsigned NumOps) {
+  for (; NumOps; --NumOps, ++Ops) {
+    ID.AddPointer(Ops->getNode());
+    ID.AddInteger(Ops->getResNo());
+  }
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+///
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+                              const SDUse *Ops, unsigned NumOps) {
+  for (; NumOps; --NumOps, ++Ops) {
+    ID.AddPointer(Ops->getNode());
+    ID.AddInteger(Ops->getResNo());
+  }
+}
+
+static void AddNodeIDNode(FoldingSetNodeID &ID,
+                          unsigned short OpC, SDVTList VTList,
+                          const SDValue *OpList, unsigned N) {
+  AddNodeIDOpcode(ID, OpC);
+  AddNodeIDValueTypes(ID, VTList);
+  AddNodeIDOperands(ID, OpList, N);
+}
+
+/// AddNodeIDCustom - If this is an SDNode with special info, add this info to
+/// the NodeID data.
+static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
+  switch (N->getOpcode()) {
+  case ISD::TargetExternalSymbol:
+  case ISD::ExternalSymbol:
+    llvm_unreachable("Should only be used on nodes with operands");
+  default: break;  // Normal nodes don't need extra info.
+  case ISD::TargetConstant:
+  case ISD::Constant:
+    ID.AddPointer(cast<ConstantSDNode>(N)->getConstantIntValue());
+    break;
+  case ISD::TargetConstantFP:
+  case ISD::ConstantFP: {
+    ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
+    break;
+  }
+  case ISD::TargetGlobalAddress:
+  case ISD::GlobalAddress:
+  case ISD::TargetGlobalTLSAddress:
+  case ISD::GlobalTLSAddress: {
+    const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+    ID.AddPointer(GA->getGlobal());
+    ID.AddInteger(GA->getOffset());
+    ID.AddInteger(GA->getTargetFlags());
+    ID.AddInteger(GA->getAddressSpace());
+    break;
+  }
+  case ISD::BasicBlock:
+    ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
+    break;
+  case ISD::Register:
+    ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
+    break;
+  case ISD::RegisterMask:
+    ID.AddPointer(cast<RegisterMaskSDNode>(N)->getRegMask());
+    break;
+  case ISD::SRCVALUE:
+    ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
+    break;
+  case ISD::FrameIndex:
+  case ISD::TargetFrameIndex:
+    ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
+    break;
+  case ISD::JumpTable:
+  case ISD::TargetJumpTable:
+    ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
+    ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
+    break;
+  case ISD::ConstantPool:
+  case ISD::TargetConstantPool: {
+    const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
+    ID.AddInteger(CP->getAlignment());
+    ID.AddInteger(CP->getOffset());
+    if (CP->isMachineConstantPoolEntry())
+      CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
+    else
+      ID.AddPointer(CP->getConstVal());
+    ID.AddInteger(CP->getTargetFlags());
+    break;
+  }
+  case ISD::TargetIndex: {
+    const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
+    ID.AddInteger(TI->getIndex());
+    ID.AddInteger(TI->getOffset());
+    ID.AddInteger(TI->getTargetFlags());
+    break;
+  }
+  case ISD::LOAD: {
+    const LoadSDNode *LD = cast<LoadSDNode>(N);
+    ID.AddInteger(LD->getMemoryVT().getRawBits());
+    ID.AddInteger(LD->getRawSubclassData());
+    ID.AddInteger(LD->getPointerInfo().getAddrSpace());
+    break;
+  }
+  case ISD::STORE: {
+    const StoreSDNode *ST = cast<StoreSDNode>(N);
+    ID.AddInteger(ST->getMemoryVT().getRawBits());
+    ID.AddInteger(ST->getRawSubclassData());
+    ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+    break;
+  }
+  case ISD::ATOMIC_CMP_SWAP:
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE: {
+    const AtomicSDNode *AT = cast<AtomicSDNode>(N);
+    ID.AddInteger(AT->getMemoryVT().getRawBits());
+    ID.AddInteger(AT->getRawSubclassData());
+    ID.AddInteger(AT->getPointerInfo().getAddrSpace());
+    break;
+  }
+  case ISD::PREFETCH: {
+    const MemSDNode *PF = cast<MemSDNode>(N);
+    ID.AddInteger(PF->getPointerInfo().getAddrSpace());
+    break;
+  }
+  case ISD::VECTOR_SHUFFLE: {
+    const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+    for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
+         i != e; ++i)
+      ID.AddInteger(SVN->getMaskElt(i));
+    break;
+  }
+  case ISD::TargetBlockAddress:
+  case ISD::BlockAddress: {
+    const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
+    ID.AddPointer(BA->getBlockAddress());
+    ID.AddInteger(BA->getOffset());
+    ID.AddInteger(BA->getTargetFlags());
+    break;
+  }
+  } // end switch (N->getOpcode())
+
+  // Target specific memory nodes could also have address spaces to check.
+  if (N->isTargetMemoryOpcode())
+    ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
+}
+
+/// AddNodeIDNode - Generic routine for adding a nodes info to the NodeID
+/// data.
+static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
+  AddNodeIDOpcode(ID, N->getOpcode());
+  // Add the return value info.
+  AddNodeIDValueTypes(ID, N->getVTList());
+  // Add the operand info.
+  AddNodeIDOperands(ID, N->op_begin(), N->getNumOperands());
+
+  // Handle SDNode leafs with special info.
+  AddNodeIDCustom(ID, N);
+}
+
+/// encodeMemSDNodeFlags - Generic routine for computing a value for use in
+/// the CSE map that carries volatility, temporalness, indexing mode, and
+/// extension/truncation information.
+///
+static inline unsigned
+encodeMemSDNodeFlags(int ConvType, ISD::MemIndexedMode AM, bool isVolatile,
+                     bool isNonTemporal, bool isInvariant) {
+  assert((ConvType & 3) == ConvType &&
+         "ConvType may not require more than 2 bits!");
+  assert((AM & 7) == AM &&
+         "AM may not require more than 3 bits!");
+  return ConvType |
+         (AM << 2) |
+         (isVolatile << 5) |
+         (isNonTemporal << 6) |
+         (isInvariant << 7);
+}
+
+//===----------------------------------------------------------------------===//
+//                              SelectionDAG Class
+//===----------------------------------------------------------------------===//
+
+/// doNotCSE - Return true if CSE should not be performed for this node.
+static bool doNotCSE(SDNode *N) {
+  if (N->getValueType(0) == MVT::Glue)
+    return true; // Never CSE anything that produces a flag.
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::HANDLENODE:
+  case ISD::EH_LABEL:
+    return true;   // Never CSE these nodes.
+  }
+
+  // Check that remaining values produced are not flags.
+  for (unsigned i = 1, e = N->getNumValues(); i != e; ++i)
+    if (N->getValueType(i) == MVT::Glue)
+      return true; // Never CSE anything that produces a flag.
+
+  return false;
+}
+
+/// RemoveDeadNodes - This method deletes all unreachable nodes in the
+/// SelectionDAG.
+void SelectionDAG::RemoveDeadNodes() {
+  // Create a dummy node (which is not added to allnodes), that adds a reference
+  // to the root node, preventing it from being deleted.
+  HandleSDNode Dummy(getRoot());
+
+  SmallVector<SDNode*, 128> DeadNodes;
+
+  // Add all obviously-dead nodes to the DeadNodes worklist.
+  for (allnodes_iterator I = allnodes_begin(), E = allnodes_end(); I != E; ++I)
+    if (I->use_empty())
+      DeadNodes.push_back(I);
+
+  RemoveDeadNodes(DeadNodes);
+
+  // If the root changed (e.g. it was a dead load, update the root).
+  setRoot(Dummy.getValue());
+}
+
+/// RemoveDeadNodes - This method deletes the unreachable nodes in the
+/// given list, and any nodes that become unreachable as a result.
+void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {
+
+  // Process the worklist, deleting the nodes and adding their uses to the
+  // worklist.
+  while (!DeadNodes.empty()) {
+    SDNode *N = DeadNodes.pop_back_val();
+
+    for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+      DUL->NodeDeleted(N, 0);
+
+    // Take the node out of the appropriate CSE map.
+    RemoveNodeFromCSEMaps(N);
+
+    // Next, brutally remove the operand list.  This is safe to do, as there are
+    // no cycles in the graph.
+    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+      SDUse &Use = *I++;
+      SDNode *Operand = Use.getNode();
+      Use.set(SDValue());
+
+      // Now that we removed this operand, see if there are no uses of it left.
+      if (Operand->use_empty())
+        DeadNodes.push_back(Operand);
+    }
+
+    DeallocateNode(N);
+  }
+}
+
+void SelectionDAG::RemoveDeadNode(SDNode *N){
+  SmallVector<SDNode*, 16> DeadNodes(1, N);
+
+  // Create a dummy node that adds a reference to the root node, preventing
+  // it from being deleted.  (This matters if the root is an operand of the
+  // dead node.)
+  HandleSDNode Dummy(getRoot());
+
+  RemoveDeadNodes(DeadNodes);
+}
+
+void SelectionDAG::DeleteNode(SDNode *N) {
+  // First take this out of the appropriate CSE map.
+  RemoveNodeFromCSEMaps(N);
+
+  // Finally, remove uses due to operands of this node, remove from the
+  // AllNodes list, and delete the node.
+  DeleteNodeNotInCSEMaps(N);
+}
+
+void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
+  assert(N != AllNodes.begin() && "Cannot delete the entry node!");
+  assert(N->use_empty() && "Cannot delete a node that is not dead!");
+
+  // Drop all of the operands and decrement used node's use counts.
+  N->DropOperands();
+
+  DeallocateNode(N);
+}
+
+void SelectionDAG::DeallocateNode(SDNode *N) {
+  if (N->OperandsNeedDelete)
+    delete[] N->OperandList;
+
+  // Set the opcode to DELETED_NODE to help catch bugs when node
+  // memory is reallocated.
+  N->NodeType = ISD::DELETED_NODE;
+
+  NodeAllocator.Deallocate(AllNodes.remove(N));
+
+  // Remove the ordering of this node.
+  Ordering->remove(N);
+
+  // If any of the SDDbgValue nodes refer to this SDNode, invalidate them.
+  ArrayRef<SDDbgValue*> DbgVals = DbgInfo->getSDDbgValues(N);
+  for (unsigned i = 0, e = DbgVals.size(); i != e; ++i)
+    DbgVals[i]->setIsInvalidated();
+}
+
+/// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
+/// correspond to it.  This is useful when we're about to delete or repurpose
+/// the node.  We don't want future request for structurally identical nodes
+/// to return N anymore.
+bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
+  bool Erased = false;
+  switch (N->getOpcode()) {
+  case ISD::HANDLENODE: return false;  // noop.
+  case ISD::CONDCODE:
+    assert(CondCodeNodes[cast<CondCodeSDNode>(N)->get()] &&
+           "Cond code doesn't exist!");
+    Erased = CondCodeNodes[cast<CondCodeSDNode>(N)->get()] != 0;
+    CondCodeNodes[cast<CondCodeSDNode>(N)->get()] = 0;
+    break;
+  case ISD::ExternalSymbol:
+    Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
+    break;
+  case ISD::TargetExternalSymbol: {
+    ExternalSymbolSDNode *ESN = cast<ExternalSymbolSDNode>(N);
+    Erased = TargetExternalSymbols.erase(
+               std::pair<std::string,unsigned char>(ESN->getSymbol(),
+                                                    ESN->getTargetFlags()));
+    break;
+  }
+  case ISD::VALUETYPE: {
+    EVT VT = cast<VTSDNode>(N)->getVT();
+    if (VT.isExtended()) {
+      Erased = ExtendedValueTypeNodes.erase(VT);
+    } else {
+      Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != 0;
+      ValueTypeNodes[VT.getSimpleVT().SimpleTy] = 0;
+    }
+    break;
+  }
+  default:
+    // Remove it from the CSE Map.
+    assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!");
+    assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!");
+    Erased = CSEMap.RemoveNode(N);
+    break;
+  }
+#ifndef NDEBUG
+  // Verify that the node was actually in one of the CSE maps, unless it has a
+  // flag result (which cannot be CSE'd) or is one of the special cases that are
+  // not subject to CSE.
+  if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue &&
+      !N->isMachineOpcode() && !doNotCSE(N)) {
+    N->dump(this);
+    dbgs() << "\n";
+    llvm_unreachable("Node is not in map!");
+  }
+#endif
+  return Erased;
+}
+
+/// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE
+/// maps and modified in place. Add it back to the CSE maps, unless an identical
+/// node already exists, in which case transfer all its users to the existing
+/// node. This transfer can potentially trigger recursive merging.
+///
+void
+SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
+  // For node types that aren't CSE'd, just act as if no identical node
+  // already exists.
+  if (!doNotCSE(N)) {
+    SDNode *Existing = CSEMap.GetOrInsertNode(N);
+    if (Existing != N) {
+      // If there was already an existing matching node, use ReplaceAllUsesWith
+      // to replace the dead one with the existing one.  This can cause
+      // recursive merging of other unrelated nodes down the line.
+      ReplaceAllUsesWith(N, Existing);
+
+      // N is now dead. Inform the listeners and delete it.
+      for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+        DUL->NodeDeleted(N, Existing);
+      DeleteNodeNotInCSEMaps(N);
+      return;
+    }
+  }
+
+  // If the node doesn't already exist, we updated it.  Inform listeners.
+  for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+    DUL->NodeUpdated(N);
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified.  If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take.  If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
+                                           void *&InsertPos) {
+  if (doNotCSE(N))
+    return 0;
+
+  SDValue Ops[] = { Op };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 1);
+  AddNodeIDCustom(ID, N);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  return Node;
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified.  If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take.  If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
+                                           SDValue Op1, SDValue Op2,
+                                           void *&InsertPos) {
+  if (doNotCSE(N))
+    return 0;
+
+  SDValue Ops[] = { Op1, Op2 };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, 2);
+  AddNodeIDCustom(ID, N);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  return Node;
+}
+
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified.  If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take.  If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
+                                           const SDValue *Ops,unsigned NumOps,
+                                           void *&InsertPos) {
+  if (doNotCSE(N))
+    return 0;
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops, NumOps);
+  AddNodeIDCustom(ID, N);
+  SDNode *Node = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+  return Node;
+}
+
+#ifndef NDEBUG
+/// VerifyNodeCommon - Sanity check the given node.  Aborts if it is invalid.
+static void VerifyNodeCommon(SDNode *N) {
+  switch (N->getOpcode()) {
+  default:
+    break;
+  case ISD::BUILD_PAIR: {
+    EVT VT = N->getValueType(0);
+    assert(N->getNumValues() == 1 && "Too many results!");
+    assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
+           "Wrong return type!");
+    assert(N->getNumOperands() == 2 && "Wrong number of operands!");
+    assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
+           "Mismatched operand types!");
+    assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
+           "Wrong operand type!");
+    assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
+           "Wrong return type size");
+    break;
+  }
+  case ISD::BUILD_VECTOR: {
+    assert(N->getNumValues() == 1 && "Too many results!");
+    assert(N->getValueType(0).isVector() && "Wrong return type!");
+    assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
+           "Wrong number of operands!");
+    EVT EltVT = N->getValueType(0).getVectorElementType();
+    for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
+      assert((I->getValueType() == EltVT ||
+             (EltVT.isInteger() && I->getValueType().isInteger() &&
+              EltVT.bitsLE(I->getValueType()))) &&
+            "Wrong operand type!");
+      assert(I->getValueType() == N->getOperand(0).getValueType() &&
+             "Operands must all have the same type");
+    }
+    break;
+  }
+  }
+}
+
+/// VerifySDNode - Sanity check the given SDNode.  Aborts if it is invalid.
+static void VerifySDNode(SDNode *N) {
+  // The SDNode allocators cannot be used to allocate nodes with fields that are
+  // not present in an SDNode!
+  assert(!isa<MemSDNode>(N) && "Bad MemSDNode!");
+  assert(!isa<ShuffleVectorSDNode>(N) && "Bad ShuffleVectorSDNode!");
+  assert(!isa<ConstantSDNode>(N) && "Bad ConstantSDNode!");
+  assert(!isa<ConstantFPSDNode>(N) && "Bad ConstantFPSDNode!");
+  assert(!isa<GlobalAddressSDNode>(N) && "Bad GlobalAddressSDNode!");
+  assert(!isa<FrameIndexSDNode>(N) && "Bad FrameIndexSDNode!");
+  assert(!isa<JumpTableSDNode>(N) && "Bad JumpTableSDNode!");
+  assert(!isa<ConstantPoolSDNode>(N) && "Bad ConstantPoolSDNode!");
+  assert(!isa<BasicBlockSDNode>(N) && "Bad BasicBlockSDNode!");
+  assert(!isa<SrcValueSDNode>(N) && "Bad SrcValueSDNode!");
+  assert(!isa<MDNodeSDNode>(N) && "Bad MDNodeSDNode!");
+  assert(!isa<RegisterSDNode>(N) && "Bad RegisterSDNode!");
+  assert(!isa<BlockAddressSDNode>(N) && "Bad BlockAddressSDNode!");
+  assert(!isa<EHLabelSDNode>(N) && "Bad EHLabelSDNode!");
+  assert(!isa<ExternalSymbolSDNode>(N) && "Bad ExternalSymbolSDNode!");
+  assert(!isa<CondCodeSDNode>(N) && "Bad CondCodeSDNode!");
+  assert(!isa<CvtRndSatSDNode>(N) && "Bad CvtRndSatSDNode!");
+  assert(!isa<VTSDNode>(N) && "Bad VTSDNode!");
+  assert(!isa<MachineSDNode>(N) && "Bad MachineSDNode!");
+
+  VerifyNodeCommon(N);
+}
+
+/// VerifyMachineNode - Sanity check the given MachineNode.  Aborts if it is
+/// invalid.
+static void VerifyMachineNode(SDNode *N) {
+  // The MachineNode allocators cannot be used to allocate nodes with fields
+  // that are not present in a MachineNode!
+  // Currently there are no such nodes.
+
+  VerifyNodeCommon(N);
+}
+#endif // NDEBUG
+
+/// getEVTAlignment - Compute the default alignment value for the
+/// given type.
+///
+unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
+  Type *Ty = VT == MVT::iPTR ?
+                   PointerType::get(Type::getInt8Ty(*getContext()), 0) :
+                   VT.getTypeForEVT(*getContext());
+
+  return TLI.getDataLayout()->getABITypeAlignment(Ty);
+}
+
+// EntryNode could meaningfully have debug info if we can find it...
+SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
+  : TM(tm), TLI(*tm.getTargetLowering()), TSI(*tm.getSelectionDAGInfo()),
+    TTI(0), OptLevel(OL), EntryNode(ISD::EntryToken, DebugLoc(),
+                                    getVTList(MVT::Other)),
+    Root(getEntryNode()), Ordering(0), UpdateListeners(0) {
+  AllNodes.push_back(&EntryNode);
+  Ordering = new SDNodeOrdering();
+  DbgInfo = new SDDbgInfo();
+}
+
+void SelectionDAG::init(MachineFunction &mf, const TargetTransformInfo *tti) {
+  MF = &mf;
+  TTI = tti;
+  Context = &mf.getFunction()->getContext();
+}
+
+SelectionDAG::~SelectionDAG() {
+  assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
+  allnodes_clear();
+  delete Ordering;
+  delete DbgInfo;
+}
+
+void SelectionDAG::allnodes_clear() {
+  assert(&*AllNodes.begin() == &EntryNode);
+  AllNodes.remove(AllNodes.begin());
+  while (!AllNodes.empty())
+    DeallocateNode(AllNodes.begin());
+}
+
+void SelectionDAG::clear() {
+  allnodes_clear();
+  OperandAllocator.Reset();
+  CSEMap.clear();
+
+  ExtendedValueTypeNodes.clear();
+  ExternalSymbols.clear();
+  TargetExternalSymbols.clear();
+  std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
+            static_cast<CondCodeSDNode*>(0));
+  std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
+            static_cast<SDNode*>(0));
+
+  EntryNode.UseList = 0;
+  AllNodes.push_back(&EntryNode);
+  Root = getEntryNode();
+  Ordering->clear();
+  DbgInfo->clear();
+}
+
+SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
+  return VT.bitsGT(Op.getValueType()) ?
+    getNode(ISD::ANY_EXTEND, DL, VT, Op) :
+    getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
+  return VT.bitsGT(Op.getValueType()) ?
+    getNode(ISD::SIGN_EXTEND, DL, VT, Op) :
+    getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
+  return VT.bitsGT(Op.getValueType()) ?
+    getNode(ISD::ZERO_EXTEND, DL, VT, Op) :
+    getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, DebugLoc DL, EVT VT) {
+  assert(!VT.isVector() &&
+         "getZeroExtendInReg should use the vector element type instead of "
+         "the vector type!");
+  if (Op.getValueType() == VT) return Op;
+  unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+  APInt Imm = APInt::getLowBitsSet(BitWidth,
+                                   VT.getSizeInBits());
+  return getNode(ISD::AND, DL, Op.getValueType(), Op,
+                 getConstant(Imm, Op.getValueType()));
+}
+
+/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
+///
+SDValue SelectionDAG::getNOT(DebugLoc DL, SDValue Val, EVT VT) {
+  EVT EltVT = VT.getScalarType();
+  SDValue NegOne =
+    getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
+  return getNode(ISD::XOR, DL, VT, Val, NegOne);
+}
+
+SDValue SelectionDAG::getConstant(uint64_t Val, EVT VT, bool isT) {
+  EVT EltVT = VT.getScalarType();
+  assert((EltVT.getSizeInBits() >= 64 ||
+         (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) &&
+         "getConstant with a uint64_t value that doesn't fit in the type!");
+  return getConstant(APInt(EltVT.getSizeInBits(), Val), VT, isT);
+}
+
+SDValue SelectionDAG::getConstant(const APInt &Val, EVT VT, bool isT) {
+  return getConstant(*ConstantInt::get(*Context, Val), VT, isT);
+}
+
+SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
+  assert(VT.isInteger() && "Cannot create FP integer constant!");
+
+  EVT EltVT = VT.getScalarType();
+  const ConstantInt *Elt = &Val;
+
+  // In some cases the vector type is legal but the element type is illegal and
+  // needs to be promoted, for example v8i8 on ARM.  In this case, promote the
+  // inserted value (the type does not need to match the vector element type).
+  // Any extra bits introduced will be truncated away.
+  if (VT.isVector() && TLI.getTypeAction(*getContext(), EltVT) ==
+      TargetLowering::TypePromoteInteger) {
+   EltVT = TLI.getTypeToTransformTo(*getContext(), EltVT);
+   APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits());
+   Elt = ConstantInt::get(*getContext(), NewVal);
+  }
+
+  assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
+         "APInt size does not match type size!");
+  unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(EltVT), 0, 0);
+  ID.AddPointer(Elt);
+  void *IP = 0;
+  SDNode *N = NULL;
+  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+    if (!VT.isVector())
+      return SDValue(N, 0);
+
+  if (!N) {
+    N = new (NodeAllocator) ConstantSDNode(isT, Elt, EltVT);
+    CSEMap.InsertNode(N, IP);
+    AllNodes.push_back(N);
+  }
+
+  SDValue Result(N, 0);
+  if (VT.isVector()) {
+    SmallVector<SDValue, 8> Ops;
+    Ops.assign(VT.getVectorNumElements(), Result);
+    Result = getNode(ISD::BUILD_VECTOR, DebugLoc(), VT, &Ops[0], Ops.size());
+  }
+  return Result;
+}
+
+SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, bool isTarget) {
+  return getConstant(Val, TLI.getPointerTy(), isTarget);
+}
+
+
+SDValue SelectionDAG::getConstantFP(const APFloat& V, EVT VT, bool isTarget) {
+  return getConstantFP(*ConstantFP::get(*getContext(), V), VT, isTarget);
+}
+
+SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT, bool isTarget){
+  assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");
+
+  EVT EltVT = VT.getScalarType();
+
+  // Do the map lookup using the actual bit pattern for the floating point
+  // value, so that we don't have problems with 0.0 comparing equal to -0.0, and
+  // we don't have issues with SNANs.
+  unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(EltVT), 0, 0);
+  ID.AddPointer(&V);
+  void *IP = 0;
+  SDNode *N = NULL;
+  if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
+    if (!VT.isVector())
+      return SDValue(N, 0);
+
+  if (!N) {
+    N = new (NodeAllocator) ConstantFPSDNode(isTarget, &V, EltVT);
+    CSEMap.InsertNode(N, IP);
+    AllNodes.push_back(N);
+  }
+
+  SDValue Result(N, 0);
+  if (VT.isVector()) {
+    SmallVector<SDValue, 8> Ops;
+    Ops.assign(VT.getVectorNumElements(), Result);
+    // FIXME DebugLoc info might be appropriate here
+    Result = getNode(ISD::BUILD_VECTOR, DebugLoc(), VT, &Ops[0], Ops.size());
+  }
+  return Result;
+}
+
+SDValue SelectionDAG::getConstantFP(double Val, EVT VT, bool isTarget) {
+  EVT EltVT = VT.getScalarType();
+  if (EltVT==MVT::f32)
+    return getConstantFP(APFloat((float)Val), VT, isTarget);
+  else if (EltVT==MVT::f64)
+    return getConstantFP(APFloat(Val), VT, isTarget);
+  else if (EltVT==MVT::f80 || EltVT==MVT::f128 || EltVT==MVT::ppcf128 ||
+           EltVT==MVT::f16) {
+    bool ignored;
+    APFloat apf = APFloat(Val);
+    apf.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
+                &ignored);
+    return getConstantFP(apf, VT, isTarget);
+  } else
+    llvm_unreachable("Unsupported type in getConstantFP");
+}
+
+SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, DebugLoc DL,
+                                       EVT VT, int64_t Offset,
+                                       bool isTargetGA,
+                                       unsigned char TargetFlags) {
+  assert((TargetFlags == 0 || isTargetGA) &&
+         "Cannot set target flags on target-independent globals");
+
+  // Truncate (with sign-extension) the offset value to the pointer size.
+  unsigned BitWidth = TLI.getPointerTy().getSizeInBits();
+  if (BitWidth < 64)
+    Offset = SignExtend64(Offset, BitWidth);
+
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar) {
+    // If GV is an alias then use the aliasee for determining thread-localness.
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
+  }
+
+  unsigned Opc;
+  if (GVar && GVar->isThreadLocal())
+    Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress;
+  else
+    Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddPointer(GV);
+  ID.AddInteger(Offset);
+  ID.AddInteger(TargetFlags);
+  ID.AddInteger(GV->getType()->getAddressSpace());
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, DL, GV, VT,
+                                                      Offset, TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
+  unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddInteger(FI);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) FrameIndexSDNode(FI, VT, isTarget);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
+                                   unsigned char TargetFlags) {
+  assert((TargetFlags == 0 || isTarget) &&
+         "Cannot set target flags on target-independent jump tables");
+  unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddInteger(JTI);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) JumpTableSDNode(JTI, VT, isTarget,
+                                                  TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
+                                      unsigned Alignment, int Offset,
+                                      bool isTarget,
+                                      unsigned char TargetFlags) {
+  assert((TargetFlags == 0 || isTarget) &&
+         "Cannot set target flags on target-independent globals");
+  if (Alignment == 0)
+    Alignment = TLI.getDataLayout()->getPrefTypeAlignment(C->getType());
+  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddInteger(Alignment);
+  ID.AddInteger(Offset);
+  ID.AddPointer(C);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset,
+                                                     Alignment, TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+
+SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
+                                      unsigned Alignment, int Offset,
+                                      bool isTarget,
+                                      unsigned char TargetFlags) {
+  assert((TargetFlags == 0 || isTarget) &&
+         "Cannot set target flags on target-independent globals");
+  if (Alignment == 0)
+    Alignment = TLI.getDataLayout()->getPrefTypeAlignment(C->getType());
+  unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddInteger(Alignment);
+  ID.AddInteger(Offset);
+  C->addSelectionDAGCSEId(ID);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) ConstantPoolSDNode(isTarget, C, VT, Offset,
+                                                     Alignment, TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
+                                     unsigned char TargetFlags) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), 0, 0);
+  ID.AddInteger(Index);
+  ID.AddInteger(Offset);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset,
+                                                    TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), 0, 0);
+  ID.AddPointer(MBB);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) BasicBlockSDNode(MBB);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getValueType(EVT VT) {
+  if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >=
+      ValueTypeNodes.size())
+    ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1);
+
+  SDNode *&N = VT.isExtended() ?
+    ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy];
+
+  if (N) return SDValue(N, 0);
+  N = new (NodeAllocator) VTSDNode(VT);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
+  SDNode *&N = ExternalSymbols[Sym];
+  if (N) return SDValue(N, 0);
+  N = new (NodeAllocator) ExternalSymbolSDNode(false, Sym, 0, VT);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
+                                              unsigned char TargetFlags) {
+  SDNode *&N =
+    TargetExternalSymbols[std::pair<std::string,unsigned char>(Sym,
+                                                               TargetFlags)];
+  if (N) return SDValue(N, 0);
+  N = new (NodeAllocator) ExternalSymbolSDNode(true, Sym, TargetFlags, VT);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
+  if ((unsigned)Cond >= CondCodeNodes.size())
+    CondCodeNodes.resize(Cond+1);
+
+  if (CondCodeNodes[Cond] == 0) {
+    CondCodeSDNode *N = new (NodeAllocator) CondCodeSDNode(Cond);
+    CondCodeNodes[Cond] = N;
+    AllNodes.push_back(N);
+  }
+
+  return SDValue(CondCodeNodes[Cond], 0);
+}
+
+// commuteShuffle - swaps the values of N1 and N2, and swaps all indices in
+// the shuffle mask M that point at N1 to point at N2, and indices that point
+// N2 to point at N1.
+static void commuteShuffle(SDValue &N1, SDValue &N2, SmallVectorImpl<int> &M) {
+  std::swap(N1, N2);
+  int NElts = M.size();
+  for (int i = 0; i != NElts; ++i) {
+    if (M[i] >= NElts)
+      M[i] -= NElts;
+    else if (M[i] >= 0)
+      M[i] += NElts;
+  }
+}
+
+SDValue SelectionDAG::getVectorShuffle(EVT VT, DebugLoc dl, SDValue N1,
+                                       SDValue N2, const int *Mask) {
+  assert(N1.getValueType() == N2.getValueType() && "Invalid VECTOR_SHUFFLE");
+  assert(VT.isVector() && N1.getValueType().isVector() &&
+         "Vector Shuffle VTs must be a vectors");
+  assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType()
+         && "Vector Shuffle VTs must have same element type");
+
+  // Canonicalize shuffle undef, undef -> undef
+  if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF)
+    return getUNDEF(VT);
+
+  // Validate that all indices in Mask are within the range of the elements
+  // input to the shuffle.
+  unsigned NElts = VT.getVectorNumElements();
+  SmallVector<int, 8> MaskVec;
+  for (unsigned i = 0; i != NElts; ++i) {
+    assert(Mask[i] < (int)(NElts * 2) && "Index out of range");
+    MaskVec.push_back(Mask[i]);
+  }
+
+  // Canonicalize shuffle v, v -> v, undef
+  if (N1 == N2) {
+    N2 = getUNDEF(VT);
+    for (unsigned i = 0; i != NElts; ++i)
+      if (MaskVec[i] >= (int)NElts) MaskVec[i] -= NElts;
+  }
+
+  // Canonicalize shuffle undef, v -> v, undef.  Commute the shuffle mask.
+  if (N1.getOpcode() == ISD::UNDEF)
+    commuteShuffle(N1, N2, MaskVec);
+
+  // Canonicalize all index into lhs, -> shuffle lhs, undef
+  // Canonicalize all index into rhs, -> shuffle rhs, undef
+  bool AllLHS = true, AllRHS = true;
+  bool N2Undef = N2.getOpcode() == ISD::UNDEF;
+  for (unsigned i = 0; i != NElts; ++i) {
+    if (MaskVec[i] >= (int)NElts) {
+      if (N2Undef)
+        MaskVec[i] = -1;
+      else
+        AllLHS = false;
+    } else if (MaskVec[i] >= 0) {
+      AllRHS = false;
+    }
+  }
+  if (AllLHS && AllRHS)
+    return getUNDEF(VT);
+  if (AllLHS && !N2Undef)
+    N2 = getUNDEF(VT);
+  if (AllRHS) {
+    N1 = getUNDEF(VT);
+    commuteShuffle(N1, N2, MaskVec);
+  }
+
+  // If Identity shuffle, or all shuffle in to undef, return that node.
+  bool AllUndef = true;
+  bool Identity = true;
+  for (unsigned i = 0; i != NElts; ++i) {
+    if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false;
+    if (MaskVec[i] >= 0) AllUndef = false;
+  }
+  if (Identity && NElts == N1.getValueType().getVectorNumElements())
+    return N1;
+  if (AllUndef)
+    return getUNDEF(VT);
+
+  FoldingSetNodeID ID;
+  SDValue Ops[2] = { N1, N2 };
+  AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops, 2);
+  for (unsigned i = 0; i != NElts; ++i)
+    ID.AddInteger(MaskVec[i]);
+
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  // Allocate the mask array for the node out of the BumpPtrAllocator, since
+  // SDNode doesn't have access to it.  This memory will be "leaked" when
+  // the node is deallocated, but recovered when the NodeAllocator is released.
+  int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
+  memcpy(MaskAlloc, &MaskVec[0], NElts * sizeof(int));
+
+  ShuffleVectorSDNode *N =
+    new (NodeAllocator) ShuffleVectorSDNode(VT, dl, N1, N2, MaskAlloc);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getConvertRndSat(EVT VT, DebugLoc dl,
+                                       SDValue Val, SDValue DTy,
+                                       SDValue STy, SDValue Rnd, SDValue Sat,
+                                       ISD::CvtCode Code) {
+  // If the src and dest types are the same and the conversion is between
+  // integer types of the same sign or two floats, no conversion is necessary.
+  if (DTy == STy &&
+      (Code == ISD::CVT_UU || Code == ISD::CVT_SS || Code == ISD::CVT_FF))
+    return Val;
+
+  FoldingSetNodeID ID;
+  SDValue Ops[] = { Val, DTy, STy, Rnd, Sat };
+  AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), &Ops[0], 5);
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl, Ops, 5,
+                                                           Code);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::Register, getVTList(VT), 0, 0);
+  ID.AddInteger(RegNo);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) RegisterSDNode(RegNo, VT);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), 0, 0);
+  ID.AddPointer(RegMask);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) RegisterMaskSDNode(RegMask);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getEHLabel(DebugLoc dl, SDValue Root, MCSymbol *Label) {
+  FoldingSetNodeID ID;
+  SDValue Ops[] = { Root };
+  AddNodeIDNode(ID, ISD::EH_LABEL, getVTList(MVT::Other), &Ops[0], 1);
+  ID.AddPointer(Label);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) EHLabelSDNode(dl, Root, Label);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+
+SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
+                                      int64_t Offset,
+                                      bool isTarget,
+                                      unsigned char TargetFlags) {
+  unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
+  ID.AddPointer(BA);
+  ID.AddInteger(Offset);
+  ID.AddInteger(TargetFlags);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, Offset,
+                                                     TargetFlags);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getSrcValue(const Value *V) {
+  assert((!V || V->getType()->isPointerTy()) &&
+         "SrcValue is not a pointer?");
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), 0, 0);
+  ID.AddPointer(V);
+
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) SrcValueSDNode(V);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+/// getMDNode - Return an MDNodeSDNode which holds an MDNode.
+SDValue SelectionDAG::getMDNode(const MDNode *MD) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), 0, 0);
+  ID.AddPointer(MD);
+
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) MDNodeSDNode(MD);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+
+/// getShiftAmountOperand - Return the specified value casted to
+/// the target's desired shift amount type.
+SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
+  EVT OpTy = Op.getValueType();
+  EVT ShTy = TLI.getShiftAmountTy(LHSTy);
+  if (OpTy == ShTy || OpTy.isVector()) return Op;
+
+  ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ?  ISD::TRUNCATE : ISD::ZERO_EXTEND;
+  return getNode(Opcode, Op.getDebugLoc(), ShTy, Op);
+}
+
+/// CreateStackTemporary - Create a stack temporary, suitable for holding the
+/// specified value type.
+SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
+  MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
+  unsigned ByteSize = VT.getStoreSize();
+  Type *Ty = VT.getTypeForEVT(*getContext());
+  unsigned StackAlign =
+  std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty), minAlign);
+
+  int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
+  return getFrameIndex(FrameIdx, TLI.getPointerTy());
+}
+
+/// CreateStackTemporary - Create a stack temporary suitable for holding
+/// either of the specified value types.
+SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
+  unsigned Bytes = std::max(VT1.getStoreSizeInBits(),
+                            VT2.getStoreSizeInBits())/8;
+  Type *Ty1 = VT1.getTypeForEVT(*getContext());
+  Type *Ty2 = VT2.getTypeForEVT(*getContext());
+  const DataLayout *TD = TLI.getDataLayout();
+  unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1),
+                            TD->getPrefTypeAlignment(Ty2));
+
+  MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(Bytes, Align, false);
+  return getFrameIndex(FrameIdx, TLI.getPointerTy());
+}
+
+SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
+                                SDValue N2, ISD::CondCode Cond, DebugLoc dl) {
+  // These setcc operations always fold.
+  switch (Cond) {
+  default: break;
+  case ISD::SETFALSE:
+  case ISD::SETFALSE2: return getConstant(0, VT);
+  case ISD::SETTRUE:
+  case ISD::SETTRUE2:  return getConstant(1, VT);
+
+  case ISD::SETOEQ:
+  case ISD::SETOGT:
+  case ISD::SETOGE:
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETONE:
+  case ISD::SETO:
+  case ISD::SETUO:
+  case ISD::SETUEQ:
+  case ISD::SETUNE:
+    assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!");
+    break;
+  }
+
+  if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode())) {
+    const APInt &C2 = N2C->getAPIntValue();
+    if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+      const APInt &C1 = N1C->getAPIntValue();
+
+      switch (Cond) {
+      default: llvm_unreachable("Unknown integer setcc!");
+      case ISD::SETEQ:  return getConstant(C1 == C2, VT);
+      case ISD::SETNE:  return getConstant(C1 != C2, VT);
+      case ISD::SETULT: return getConstant(C1.ult(C2), VT);
+      case ISD::SETUGT: return getConstant(C1.ugt(C2), VT);
+      case ISD::SETULE: return getConstant(C1.ule(C2), VT);
+      case ISD::SETUGE: return getConstant(C1.uge(C2), VT);
+      case ISD::SETLT:  return getConstant(C1.slt(C2), VT);
+      case ISD::SETGT:  return getConstant(C1.sgt(C2), VT);
+      case ISD::SETLE:  return getConstant(C1.sle(C2), VT);
+      case ISD::SETGE:  return getConstant(C1.sge(C2), VT);
+      }
+    }
+  }
+  if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1.getNode())) {
+    if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2.getNode())) {
+      APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF());
+      switch (Cond) {
+      default: break;
+      case ISD::SETEQ:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOEQ: return getConstant(R==APFloat::cmpEqual, VT);
+      case ISD::SETNE:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETONE: return getConstant(R==APFloat::cmpGreaterThan ||
+                                           R==APFloat::cmpLessThan, VT);
+      case ISD::SETLT:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOLT: return getConstant(R==APFloat::cmpLessThan, VT);
+      case ISD::SETGT:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOGT: return getConstant(R==APFloat::cmpGreaterThan, VT);
+      case ISD::SETLE:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOLE: return getConstant(R==APFloat::cmpLessThan ||
+                                           R==APFloat::cmpEqual, VT);
+      case ISD::SETGE:  if (R==APFloat::cmpUnordered)
+                          return getUNDEF(VT);
+                        // fall through
+      case ISD::SETOGE: return getConstant(R==APFloat::cmpGreaterThan ||
+                                           R==APFloat::cmpEqual, VT);
+      case ISD::SETO:   return getConstant(R!=APFloat::cmpUnordered, VT);
+      case ISD::SETUO:  return getConstant(R==APFloat::cmpUnordered, VT);
+      case ISD::SETUEQ: return getConstant(R==APFloat::cmpUnordered ||
+                                           R==APFloat::cmpEqual, VT);
+      case ISD::SETUNE: return getConstant(R!=APFloat::cmpEqual, VT);
+      case ISD::SETULT: return getConstant(R==APFloat::cmpUnordered ||
+                                           R==APFloat::cmpLessThan, VT);
+      case ISD::SETUGT: return getConstant(R==APFloat::cmpGreaterThan ||
+                                           R==APFloat::cmpUnordered, VT);
+      case ISD::SETULE: return getConstant(R!=APFloat::cmpGreaterThan, VT);
+      case ISD::SETUGE: return getConstant(R!=APFloat::cmpLessThan, VT);
+      }
+    } else {
+      // Ensure that the constant occurs on the RHS.
+      return getSetCC(dl, VT, N2, N1, ISD::getSetCCSwappedOperands(Cond));
+    }
+  }
+
+  // Could not fold it.
+  return SDValue();
+}
+
+/// SignBitIsZero - Return true if the sign bit of Op is known to be zero.  We
+/// use this predicate to simplify operations downstream.
+bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
+  // This predicate is not safe for vector operations.
+  if (Op.getValueType().isVector())
+    return false;
+
+  unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+  return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth);
+}
+
+/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero.  We use
+/// this predicate to simplify operations downstream.  Mask is known to be zero
+/// for bits that V cannot have.
+bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
+                                     unsigned Depth) const {
+  APInt KnownZero, KnownOne;
+  ComputeMaskedBits(Op, KnownZero, KnownOne, Depth);
+  assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+  return (KnownZero & Mask) == Mask;
+}
+
+/// ComputeMaskedBits - Determine which of the bits specified in Mask are
+/// known to be either zero or one and return them in the KnownZero/KnownOne
+/// bitsets.  This code only analyzes bits in Mask, in order to short-circuit
+/// processing.
+void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
+                                     APInt &KnownOne, unsigned Depth) const {
+  unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+
+  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
+  if (Depth == 6)
+    return;  // Limit search depth.
+
+  APInt KnownZero2, KnownOne2;
+
+  switch (Op.getOpcode()) {
+  case ISD::Constant:
+    // We know all of the bits for a constant!
+    KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue();
+    KnownZero = ~KnownOne;
+    return;
+  case ISD::AND:
+    // If either the LHS or the RHS are Zero, the result is zero.
+    ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Output known-1 bits are only known if set in both the LHS & RHS.
+    KnownOne &= KnownOne2;
+    // Output known-0 are known to be clear if zero in either the LHS | RHS.
+    KnownZero |= KnownZero2;
+    return;
+  case ISD::OR:
+    ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Output known-0 bits are only known if clear in both the LHS & RHS.
+    KnownZero &= KnownZero2;
+    // Output known-1 are known to be set if set in either the LHS | RHS.
+    KnownOne |= KnownOne2;
+    return;
+  case ISD::XOR: {
+    ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Output known-0 bits are known if clear or set in both the LHS & RHS.
+    APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+    // Output known-1 are known to be set if set in only one of the LHS, RHS.
+    KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+    KnownZero = KnownZeroOut;
+    return;
+  }
+  case ISD::MUL: {
+    ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // If low bits are zero in either operand, output low known-0 bits.
+    // Also compute a conserative estimate for high known-0 bits.
+    // More trickiness is possible, but this is sufficient for the
+    // interesting case of alignment computation.
+    KnownOne.clearAllBits();
+    unsigned TrailZ = KnownZero.countTrailingOnes() +
+                      KnownZero2.countTrailingOnes();
+    unsigned LeadZ =  std::max(KnownZero.countLeadingOnes() +
+                               KnownZero2.countLeadingOnes(),
+                               BitWidth) - BitWidth;
+
+    TrailZ = std::min(TrailZ, BitWidth);
+    LeadZ = std::min(LeadZ, BitWidth);
+    KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
+                APInt::getHighBitsSet(BitWidth, LeadZ);
+    return;
+  }
+  case ISD::UDIV: {
+    // For the purposes of computing leading zeros we can conservatively
+    // treat a udiv as a logical right shift by the power of 2 known to
+    // be less than the denominator.
+    ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+    unsigned LeadZ = KnownZero2.countLeadingOnes();
+
+    KnownOne2.clearAllBits();
+    KnownZero2.clearAllBits();
+    ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+    unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
+    if (RHSUnknownLeadingOnes != BitWidth)
+      LeadZ = std::min(BitWidth,
+                       LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+
+    KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ);
+    return;
+  }
+  case ISD::SELECT:
+    ComputeMaskedBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    return;
+  case ISD::SELECT_CC:
+    ComputeMaskedBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    return;
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+    if (Op.getResNo() != 1)
+      return;
+    // The boolean result conforms to getBooleanContents.  Fall through.
+  case ISD::SETCC:
+    // If we know the result of a setcc has the top bits zero, use this info.
+    if (TLI.getBooleanContents(Op.getValueType().isVector()) ==
+        TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1)
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    return;
+  case ISD::SHL:
+    // (shl X, C1) & C2 == 0   iff   (X & C2 >>u C1) == 0
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned ShAmt = SA->getZExtValue();
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        return;
+
+      ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero <<= ShAmt;
+      KnownOne  <<= ShAmt;
+      // low bits known zero.
+      KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt);
+    }
+    return;
+  case ISD::SRL:
+    // (ushr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned ShAmt = SA->getZExtValue();
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        return;
+
+      ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero = KnownZero.lshr(ShAmt);
+      KnownOne  = KnownOne.lshr(ShAmt);
+
+      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+      KnownZero |= HighBits;  // High bits known zero.
+    }
+    return;
+  case ISD::SRA:
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned ShAmt = SA->getZExtValue();
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        return;
+
+      // If any of the demanded bits are produced by the sign extension, we also
+      // demand the input sign bit.
+      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+
+      ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero = KnownZero.lshr(ShAmt);
+      KnownOne  = KnownOne.lshr(ShAmt);
+
+      // Handle the sign bits.
+      APInt SignBit = APInt::getSignBit(BitWidth);
+      SignBit = SignBit.lshr(ShAmt);  // Adjust to where it is now in the mask.
+
+      if (KnownZero.intersects(SignBit)) {
+        KnownZero |= HighBits;  // New bits are known zero.
+      } else if (KnownOne.intersects(SignBit)) {
+        KnownOne  |= HighBits;  // New bits are known one.
+      }
+    }
+    return;
+  case ISD::SIGN_EXTEND_INREG: {
+    EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    unsigned EBits = EVT.getScalarType().getSizeInBits();
+
+    // Sign extension.  Compute the demanded bits in the result that are not
+    // present in the input.
+    APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits);
+
+    APInt InSignBit = APInt::getSignBit(EBits);
+    APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits);
+
+    // If the sign extended bits are demanded, we know that the sign
+    // bit is demanded.
+    InSignBit = InSignBit.zext(BitWidth);
+    if (NewBits.getBoolValue())
+      InputDemandedBits |= InSignBit;
+
+    ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    KnownOne &= InputDemandedBits;
+    KnownZero &= InputDemandedBits;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+    // If the sign bit of the input is known set or clear, then we know the
+    // top bits of the result.
+    if (KnownZero.intersects(InSignBit)) {         // Input sign bit known clear
+      KnownZero |= NewBits;
+      KnownOne  &= ~NewBits;
+    } else if (KnownOne.intersects(InSignBit)) {   // Input sign bit known set
+      KnownOne  |= NewBits;
+      KnownZero &= ~NewBits;
+    } else {                              // Input sign bit unknown
+      KnownZero &= ~NewBits;
+      KnownOne  &= ~NewBits;
+    }
+    return;
+  }
+  case ISD::CTTZ:
+  case ISD::CTTZ_ZERO_UNDEF:
+  case ISD::CTLZ:
+  case ISD::CTLZ_ZERO_UNDEF:
+  case ISD::CTPOP: {
+    unsigned LowBits = Log2_32(BitWidth)+1;
+    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+    KnownOne.clearAllBits();
+    return;
+  }
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    // If this is a ZEXTLoad and we are looking at the loaded value.
+    if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
+      EVT VT = LD->getMemoryVT();
+      unsigned MemBits = VT.getScalarType().getSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+    } else if (const MDNode *Ranges = LD->getRanges()) {
+      computeMaskedBitsLoad(*Ranges, KnownZero);
+    }
+    return;
+  }
+  case ISD::ZERO_EXTEND: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
+    KnownZero = KnownZero.trunc(InBits);
+    KnownOne = KnownOne.trunc(InBits);
+    ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+    KnownZero |= NewBits;
+    return;
+  }
+  case ISD::SIGN_EXTEND: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    APInt InSignBit = APInt::getSignBit(InBits);
+    APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
+
+    KnownZero = KnownZero.trunc(InBits);
+    KnownOne = KnownOne.trunc(InBits);
+    ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+
+    // Note if the sign bit is known to be zero or one.
+    bool SignBitKnownZero = KnownZero.isNegative();
+    bool SignBitKnownOne  = KnownOne.isNegative();
+    assert(!(SignBitKnownZero && SignBitKnownOne) &&
+           "Sign bit can't be known to be both zero and one!");
+
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+
+    // If the sign bit is known zero or one, the top bits match.
+    if (SignBitKnownZero)
+      KnownZero |= NewBits;
+    else if (SignBitKnownOne)
+      KnownOne  |= NewBits;
+    return;
+  }
+  case ISD::ANY_EXTEND: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    KnownZero = KnownZero.trunc(InBits);
+    KnownOne = KnownOne.trunc(InBits);
+    ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+    return;
+  }
+  case ISD::TRUNCATE: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    KnownZero = KnownZero.zext(InBits);
+    KnownOne = KnownOne.zext(InBits);
+    ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    KnownZero = KnownZero.trunc(BitWidth);
+    KnownOne = KnownOne.trunc(BitWidth);
+    break;
+  }
+  case ISD::AssertZext: {
+    EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
+    ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    KnownZero |= (~InMask);
+    KnownOne  &= (~KnownZero);
+    return;
+  }
+  case ISD::FGETSIGN:
+    // All bits are zero except the low bit.
+    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    return;
+
+  case ISD::SUB: {
+    if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0))) {
+      // We know that the top bits of C-X are clear if X contains less bits
+      // than C (i.e. no wrap-around can happen).  For example, 20-X is
+      // positive if we can prove that X is >= 0 and < 16.
+      if (CLHS->getAPIntValue().isNonNegative()) {
+        unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
+        // NLZ can't be BitWidth with no sign bit
+        APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
+        ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+
+        // If all of the MaskV bits are known to be zero, then we know the
+        // output top bits are zero, because we now know that the output is
+        // from [0-C].
+        if ((KnownZero2 & MaskV) == MaskV) {
+          unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
+          // Top bits known zero.
+          KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2);
+        }
+      }
+    }
+  }
+  // fall through
+  case ISD::ADD:
+  case ISD::ADDE: {
+    // Output known-0 bits are known if clear or set in both the low clear bits
+    // common to both LHS & RHS.  For example, 8+(X<<3) is known to have the
+    // low 3 bits clear.
+    ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+    unsigned KnownZeroOut = KnownZero2.countTrailingOnes();
+
+    ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+    KnownZeroOut = std::min(KnownZeroOut,
+                            KnownZero2.countTrailingOnes());
+
+    if (Op.getOpcode() == ISD::ADD) {
+      KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroOut);
+      return;
+    }
+
+    // With ADDE, a carry bit may be added in, so we can only use this
+    // information if we know (at least) that the low two bits are clear.  We
+    // then return to the caller that the low bit is unknown but that other bits
+    // are known zero.
+    if (KnownZeroOut >= 2) // ADDE
+      KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroOut);
+    return;
+  }
+  case ISD::SREM:
+    if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      const APInt &RA = Rem->getAPIntValue().abs();
+      if (RA.isPowerOf2()) {
+        APInt LowBits = RA - 1;
+        APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
+        ComputeMaskedBits(Op.getOperand(0), KnownZero2,KnownOne2,Depth+1);
+
+        // The low bits of the first operand are unchanged by the srem.
+        KnownZero = KnownZero2 & LowBits;
+        KnownOne = KnownOne2 & LowBits;
+
+        // If the first operand is non-negative or has all low bits zero, then
+        // the upper bits are all zero.
+        if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
+          KnownZero |= ~LowBits;
+
+        // If the first operand is negative and not all low bits are zero, then
+        // the upper bits are all one.
+        if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0))
+          KnownOne |= ~LowBits;
+        assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
+      }
+    }
+    return;
+  case ISD::UREM: {
+    if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      const APInt &RA = Rem->getAPIntValue();
+      if (RA.isPowerOf2()) {
+        APInt LowBits = (RA - 1);
+        KnownZero |= ~LowBits;
+        ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne,Depth+1);
+        assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
+        break;
+      }
+    }
+
+    // Since the result is less than or equal to either operand, any leading
+    // zero bits in either operand must also exist in the result.
+    ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    ComputeMaskedBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+
+    uint32_t Leaders = std::max(KnownZero.countLeadingOnes(),
+                                KnownZero2.countLeadingOnes());
+    KnownOne.clearAllBits();
+    KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
+    return;
+  }
+  case ISD::FrameIndex:
+  case ISD::TargetFrameIndex:
+    if (unsigned Align = InferPtrAlignment(Op)) {
+      // The low bits are known zero if the pointer is aligned.
+      KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align));
+      return;
+    }
+    break;
+
+  default:
+    if (Op.getOpcode() < ISD::BUILTIN_OP_END)
+      break;
+    // Fallthrough
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_W_CHAIN:
+  case ISD::INTRINSIC_VOID:
+    // Allow the target to implement this method for its nodes.
+    TLI.computeMaskedBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth);
+    return;
+  }
+}
+
+/// ComputeNumSignBits - Return the number of times the sign bit of the
+/// register is replicated into the other bits.  We know that at least 1 bit
+/// is always equal to the sign bit (itself), but other cases can give us
+/// information.  For example, immediately after an "SRA X, 2", we know that
+/// the top 3 bits are all equal to each other, so we return 3.
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
+  EVT VT = Op.getValueType();
+  assert(VT.isInteger() && "Invalid VT!");
+  unsigned VTBits = VT.getScalarType().getSizeInBits();
+  unsigned Tmp, Tmp2;
+  unsigned FirstAnswer = 1;
+
+  if (Depth == 6)
+    return 1;  // Limit search depth.
+
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::AssertSext:
+    Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+    return VTBits-Tmp+1;
+  case ISD::AssertZext:
+    Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+    return VTBits-Tmp;
+
+  case ISD::Constant: {
+    const APInt &Val = cast<ConstantSDNode>(Op)->getAPIntValue();
+    return Val.getNumSignBits();
+  }
+
+  case ISD::SIGN_EXTEND:
+    Tmp = VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
+
+  case ISD::SIGN_EXTEND_INREG:
+    // Max of the input and what this extends.
+    Tmp =
+      cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarType().getSizeInBits();
+    Tmp = VTBits-Tmp+1;
+
+    Tmp2 = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    return std::max(Tmp, Tmp2);
+
+  case ISD::SRA:
+    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    // SRA X, C   -> adds C sign bits.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      Tmp += C->getZExtValue();
+      if (Tmp > VTBits) Tmp = VTBits;
+    }
+    return Tmp;
+  case ISD::SHL:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      // shl destroys sign bits.
+      Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+      if (C->getZExtValue() >= VTBits ||      // Bad shift.
+          C->getZExtValue() >= Tmp) break;    // Shifted all sign bits out.
+      return Tmp - C->getZExtValue();
+    }
+    break;
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:    // NOT is handled here.
+    // Logical binary ops preserve the number of sign bits at the worst.
+    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    if (Tmp != 1) {
+      Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+      FirstAnswer = std::min(Tmp, Tmp2);
+      // We computed what we know about the sign bits as our first
+      // answer. Now proceed to the generic code that uses
+      // ComputeMaskedBits, and pick whichever answer is better.
+    }
+    break;
+
+  case ISD::SELECT:
+    Tmp = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+    if (Tmp == 1) return 1;  // Early out.
+    Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1);
+    return std::min(Tmp, Tmp2);
+
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+    if (Op.getResNo() != 1)
+      break;
+    // The boolean result conforms to getBooleanContents.  Fall through.
+  case ISD::SETCC:
+    // If setcc returns 0/-1, all bits are sign bits.
+    if (TLI.getBooleanContents(Op.getValueType().isVector()) ==
+        TargetLowering::ZeroOrNegativeOneBooleanContent)
+      return VTBits;
+    break;
+  case ISD::ROTL:
+  case ISD::ROTR:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned RotAmt = C->getZExtValue() & (VTBits-1);
+
+      // Handle rotate right by N like a rotate left by 32-N.
+      if (Op.getOpcode() == ISD::ROTR)
+        RotAmt = (VTBits-RotAmt) & (VTBits-1);
+
+      // If we aren't rotating out all of the known-in sign bits, return the
+      // number that are left.  This handles rotl(sext(x), 1) for example.
+      Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+      if (Tmp > RotAmt+1) return Tmp-RotAmt;
+    }
+    break;
+  case ISD::ADD:
+    // Add can have at most one carry bit.  Thus we know that the output
+    // is, at worst, one more bit than the inputs.
+    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    if (Tmp == 1) return 1;  // Early out.
+
+    // Special case decrementing a value (ADD X, -1):
+    if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+      if (CRHS->isAllOnesValue()) {
+        APInt KnownZero, KnownOne;
+        ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+
+        // If the input is known to be 0 or 1, the output is 0/-1, which is all
+        // sign bits set.
+        if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue())
+          return VTBits;
+
+        // If we are subtracting one from a positive number, there is no carry
+        // out of the result.
+        if (KnownZero.isNegative())
+          return Tmp;
+      }
+
+    Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+    if (Tmp2 == 1) return 1;
+    return std::min(Tmp, Tmp2)-1;
+
+  case ISD::SUB:
+    Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+    if (Tmp2 == 1) return 1;
+
+    // Handle NEG.
+    if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0)))
+      if (CLHS->isNullValue()) {
+        APInt KnownZero, KnownOne;
+        ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+        // If the input is known to be 0 or 1, the output is 0/-1, which is all
+        // sign bits set.
+        if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue())
+          return VTBits;
+
+        // If the input is known to be positive (the sign bit is known clear),
+        // the output of the NEG has the same number of sign bits as the input.
+        if (KnownZero.isNegative())
+          return Tmp2;
+
+        // Otherwise, we treat this like a SUB.
+      }
+
+    // Sub can have at most one carry bit.  Thus we know that the output
+    // is, at worst, one more bit than the inputs.
+    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    if (Tmp == 1) return 1;  // Early out.
+    return std::min(Tmp, Tmp2)-1;
+  case ISD::TRUNCATE:
+    // FIXME: it's tricky to do anything useful for this, but it is an important
+    // case for targets like X86.
+    break;
+  }
+
+  // If we are looking at the loaded value of the SDNode.
+  if (Op.getResNo() == 0) {
+    // Handle LOADX separately here. EXTLOAD case will fallthrough.
+    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
+      unsigned ExtType = LD->getExtensionType();
+      switch (ExtType) {
+        default: break;
+        case ISD::SEXTLOAD:    // '17' bits known
+          Tmp = LD->getMemoryVT().getScalarType().getSizeInBits();
+          return VTBits-Tmp+1;
+        case ISD::ZEXTLOAD:    // '16' bits known
+          Tmp = LD->getMemoryVT().getScalarType().getSizeInBits();
+          return VTBits-Tmp;
+      }
+    }
+  }
+
+  // Allow the target to implement this method for its nodes.
+  if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+      Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+      Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+      Op.getOpcode() == ISD::INTRINSIC_VOID) {
+    unsigned NumBits = TLI.ComputeNumSignBitsForTargetNode(Op, Depth);
+    if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits);
+  }
+
+  // Finally, if we can prove that the top bits of the result are 0's or 1's,
+  // use this information.
+  APInt KnownZero, KnownOne;
+  ComputeMaskedBits(Op, KnownZero, KnownOne, Depth);
+
+  APInt Mask;
+  if (KnownZero.isNegative()) {        // sign bit is 0
+    Mask = KnownZero;
+  } else if (KnownOne.isNegative()) {  // sign bit is 1;
+    Mask = KnownOne;
+  } else {
+    // Nothing known.
+    return FirstAnswer;
+  }
+
+  // Okay, we know that the sign bit in Mask is set.  Use CLZ to determine
+  // the number of identical bits in the top of the input value.
+  Mask = ~Mask;
+  Mask <<= Mask.getBitWidth()-VTBits;
+  // Return # leading zeros.  We use 'min' here in case Val was zero before
+  // shifting.  We don't want to return '64' as for an i32 "0".
+  return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
+}
+
+/// isBaseWithConstantOffset - Return true if the specified operand is an
+/// ISD::ADD with a ConstantSDNode on the right-hand side, or if it is an
+/// ISD::OR with a ConstantSDNode that is guaranteed to have the same
+/// semantics as an ADD.  This handles the equivalence:
+///     X|Cst == X+Cst iff X&Cst = 0.
+bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
+  if ((Op.getOpcode() != ISD::ADD && Op.getOpcode() != ISD::OR) ||
+      !isa<ConstantSDNode>(Op.getOperand(1)))
+    return false;
+
+  if (Op.getOpcode() == ISD::OR &&
+      !MaskedValueIsZero(Op.getOperand(0),
+                     cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue()))
+    return false;
+
+  return true;
+}
+
+
+bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
+  // If we're told that NaNs won't happen, assume they won't.
+  if (getTarget().Options.NoNaNsFPMath)
+    return true;
+
+  // If the value is a constant, we can obviously see if it is a NaN or not.
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+    return !C->getValueAPF().isNaN();
+
+  // TODO: Recognize more cases here.
+
+  return false;
+}
+
+bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
+  // If the value is a constant, we can obviously see if it is a zero or not.
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+    return !C->isZero();
+
+  // TODO: Recognize more cases here.
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::OR:
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+      return !C->isNullValue();
+    break;
+  }
+
+  return false;
+}
+
+bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
+  // Check the obvious case.
+  if (A == B) return true;
+
+  // For for negative and positive zero.
+  if (const ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A))
+    if (const ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B))
+      if (CA->isZero() && CB->isZero()) return true;
+
+  // Otherwise they may not be equal.
+  return false;
+}
+
+/// getNode - Gets or creates the specified node.
+///
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) {
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, Opcode, getVTList(VT), 0, 0);
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) SDNode(Opcode, DL, getVTList(VT));
+  CSEMap.InsertNode(N, IP);
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifySDNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+                              EVT VT, SDValue Operand) {
+  // Constant fold unary operations with an integer constant operand.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand.getNode())) {
+    const APInt &Val = C->getAPIntValue();
+    switch (Opcode) {
+    default: break;
+    case ISD::SIGN_EXTEND:
+      return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), VT);
+    case ISD::ANY_EXTEND:
+    case ISD::ZERO_EXTEND:
+    case ISD::TRUNCATE:
+      return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), VT);
+    case ISD::UINT_TO_FP:
+    case ISD::SINT_TO_FP: {
+      APFloat apf(EVTToAPFloatSemantics(VT),
+                  APInt::getNullValue(VT.getSizeInBits()));
+      (void)apf.convertFromAPInt(Val,
+                                 Opcode==ISD::SINT_TO_FP,
+                                 APFloat::rmNearestTiesToEven);
+      return getConstantFP(apf, VT);
+    }
+    case ISD::BITCAST:
+      if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
+        return getConstantFP(APFloat(APFloat::IEEEsingle, Val), VT);
+      else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
+        return getConstantFP(APFloat(APFloat::IEEEdouble, Val), VT);
+      break;
+    case ISD::BSWAP:
+      return getConstant(Val.byteSwap(), VT);
+    case ISD::CTPOP:
+      return getConstant(Val.countPopulation(), VT);
+    case ISD::CTLZ:
+    case ISD::CTLZ_ZERO_UNDEF:
+      return getConstant(Val.countLeadingZeros(), VT);
+    case ISD::CTTZ:
+    case ISD::CTTZ_ZERO_UNDEF:
+      return getConstant(Val.countTrailingZeros(), VT);
+    }
+  }
+
+  // Constant fold unary operations with a floating point constant operand.
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand.getNode())) {
+    APFloat V = C->getValueAPF();    // make copy
+    switch (Opcode) {
+    case ISD::FNEG:
+      V.changeSign();
+      return getConstantFP(V, VT);
+    case ISD::FABS:
+      V.clearSign();
+      return getConstantFP(V, VT);
+    case ISD::FCEIL: {
+      APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
+      if (fs == APFloat::opOK || fs == APFloat::opInexact)
+        return getConstantFP(V, VT);
+      break;
+    }
+    case ISD::FTRUNC: {
+      APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
+      if (fs == APFloat::opOK || fs == APFloat::opInexact)
+        return getConstantFP(V, VT);
+      break;
+    }
+    case ISD::FFLOOR: {
+      APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
+      if (fs == APFloat::opOK || fs == APFloat::opInexact)
+        return getConstantFP(V, VT);
+      break;
+    }
+    case ISD::FP_EXTEND: {
+      bool ignored;
+      // This can return overflow, underflow, or inexact; we don't care.
+      // FIXME need to be more flexible about rounding mode.
+      (void)V.convert(EVTToAPFloatSemantics(VT),
+                      APFloat::rmNearestTiesToEven, &ignored);
+      return getConstantFP(V, VT);
+    }
+    case ISD::FP_TO_SINT:
+    case ISD::FP_TO_UINT: {
+      integerPart x[2];
+      bool ignored;
+      assert(integerPartWidth >= 64);
+      // FIXME need to be more flexible about rounding mode.
+      APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
+                            Opcode==ISD::FP_TO_SINT,
+                            APFloat::rmTowardZero, &ignored);
+      if (s==APFloat::opInvalidOp)     // inexact is OK, in fact usual
+        break;
+      APInt api(VT.getSizeInBits(), x);
+      return getConstant(api, VT);
+    }
+    case ISD::BITCAST:
+      if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
+        return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), VT);
+      else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
+        return getConstant(V.bitcastToAPInt().getZExtValue(), VT);
+      break;
+    }
+  }
+
+  unsigned OpOpcode = Operand.getNode()->getOpcode();
+  switch (Opcode) {
+  case ISD::TokenFactor:
+  case ISD::MERGE_VALUES:
+  case ISD::CONCAT_VECTORS:
+    return Operand;         // Factor, merge or concat of one node?  No need.
+  case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
+  case ISD::FP_EXTEND:
+    assert(VT.isFloatingPoint() &&
+           Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
+    if (Operand.getValueType() == VT) return Operand;  // noop conversion.
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+    if (Operand.getOpcode() == ISD::UNDEF)
+      return getUNDEF(VT);
+    break;
+  case ISD::SIGN_EXTEND:
+    assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+           "Invalid SIGN_EXTEND!");
+    if (Operand.getValueType() == VT) return Operand;   // noop extension
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid sext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+    if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
+      return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+    else if (OpOpcode == ISD::UNDEF)
+      // sext(undef) = 0, because the top bits will all be the same.
+      return getConstant(0, VT);
+    break;
+  case ISD::ZERO_EXTEND:
+    assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+           "Invalid ZERO_EXTEND!");
+    if (Operand.getValueType() == VT) return Operand;   // noop extension
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid zext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+    if (OpOpcode == ISD::ZERO_EXTEND)   // (zext (zext x)) -> (zext x)
+      return getNode(ISD::ZERO_EXTEND, DL, VT,
+                     Operand.getNode()->getOperand(0));
+    else if (OpOpcode == ISD::UNDEF)
+      // zext(undef) = 0, because the top bits will be zero.
+      return getConstant(0, VT);
+    break;
+  case ISD::ANY_EXTEND:
+    assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+           "Invalid ANY_EXTEND!");
+    if (Operand.getValueType() == VT) return Operand;   // noop extension
+    assert(Operand.getValueType().getScalarType().bitsLT(VT.getScalarType()) &&
+           "Invalid anyext node, dst < src!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+
+    if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+        OpOpcode == ISD::ANY_EXTEND)
+      // (ext (zext x)) -> (zext x)  and  (ext (sext x)) -> (sext x)
+      return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+    else if (OpOpcode == ISD::UNDEF)
+      return getUNDEF(VT);
+
+    // (ext (trunx x)) -> x
+    if (OpOpcode == ISD::TRUNCATE) {
+      SDValue OpOp = Operand.getNode()->getOperand(0);
+      if (OpOp.getValueType() == VT)
+        return OpOp;
+    }
+    break;
+  case ISD::TRUNCATE:
+    assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+           "Invalid TRUNCATE!");
+    if (Operand.getValueType() == VT) return Operand;   // noop truncate
+    assert(Operand.getValueType().getScalarType().bitsGT(VT.getScalarType()) &&
+           "Invalid truncate node, src < dst!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() ==
+            Operand.getValueType().getVectorNumElements()) &&
+           "Vector element count mismatch!");
+    if (OpOpcode == ISD::TRUNCATE)
+      return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+    if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+        OpOpcode == ISD::ANY_EXTEND) {
+      // If the source is smaller than the dest, we still need an extend.
+      if (Operand.getNode()->getOperand(0).getValueType().getScalarType()
+            .bitsLT(VT.getScalarType()))
+        return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+      if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
+        return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+      return Operand.getNode()->getOperand(0);
+    }
+    if (OpOpcode == ISD::UNDEF)
+      return getUNDEF(VT);
+    break;
+  case ISD::BITCAST:
+    // Basic sanity checking.
+    assert(VT.getSizeInBits() == Operand.getValueType().getSizeInBits()
+           && "Cannot BITCAST between types of different sizes!");
+    if (VT == Operand.getValueType()) return Operand;  // noop conversion.
+    if (OpOpcode == ISD::BITCAST)  // bitconv(bitconv(x)) -> bitconv(x)
+      return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0));
+    if (OpOpcode == ISD::UNDEF)
+      return getUNDEF(VT);
+    break;
+  case ISD::SCALAR_TO_VECTOR:
+    assert(VT.isVector() && !Operand.getValueType().isVector() &&
+           (VT.getVectorElementType() == Operand.getValueType() ||
+            (VT.getVectorElementType().isInteger() &&
+             Operand.getValueType().isInteger() &&
+             VT.getVectorElementType().bitsLE(Operand.getValueType()))) &&
+           "Illegal SCALAR_TO_VECTOR node!");
+    if (OpOpcode == ISD::UNDEF)
+      return getUNDEF(VT);
+    // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
+    if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
+        isa<ConstantSDNode>(Operand.getOperand(1)) &&
+        Operand.getConstantOperandVal(1) == 0 &&
+        Operand.getOperand(0).getValueType() == VT)
+      return Operand.getOperand(0);
+    break;
+  case ISD::FNEG:
+    // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
+    if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB)
+      return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1),
+                     Operand.getNode()->getOperand(0));
+    if (OpOpcode == ISD::FNEG)  // --X -> X
+      return Operand.getNode()->getOperand(0);
+    break;
+  case ISD::FABS:
+    if (OpOpcode == ISD::FNEG)  // abs(-X) -> abs(X)
+      return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0));
+    break;
+  }
+
+  SDNode *N;
+  SDVTList VTs = getVTList(VT);
+  if (VT != MVT::Glue) { // Don't CSE flag producing nodes
+    FoldingSetNodeID ID;
+    SDValue Ops[1] = { Operand };
+    AddNodeIDNode(ID, Opcode, VTs, Ops, 1);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTs, Operand);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTs, Operand);
+  }
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifySDNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, EVT VT,
+                                             SDNode *Cst1, SDNode *Cst2) {
+  SmallVector<std::pair<ConstantSDNode *, ConstantSDNode *>, 4> Inputs;
+  SmallVector<SDValue, 4> Outputs;
+  EVT SVT = VT.getScalarType();
+
+  ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1);
+  ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2);
+  if (Scalar1 && Scalar2) {
+    // Scalar instruction.
+    Inputs.push_back(std::make_pair(Scalar1, Scalar2));
+  } else {
+    // For vectors extract each constant element into Inputs so we can constant
+    // fold them individually.
+    BuildVectorSDNode *BV1 = dyn_cast<BuildVectorSDNode>(Cst1);
+    BuildVectorSDNode *BV2 = dyn_cast<BuildVectorSDNode>(Cst2);
+    if (!BV1 || !BV2)
+      return SDValue();
+
+    assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!");
+
+    for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) {
+      ConstantSDNode *V1 = dyn_cast<ConstantSDNode>(BV1->getOperand(I));
+      ConstantSDNode *V2 = dyn_cast<ConstantSDNode>(BV2->getOperand(I));
+      if (!V1 || !V2) // Not a constant, bail.
+        return SDValue();
+
+      // Avoid BUILD_VECTOR nodes that perform implicit truncation.
+      // FIXME: This is valid and could be handled by truncating the APInts.
+      if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT)
+        return SDValue();
+
+      Inputs.push_back(std::make_pair(V1, V2));
+    }
+  }
+
+  // We have a number of constant values, constant fold them element by element.
+  for (unsigned I = 0, E = Inputs.size(); I != E; ++I) {
+    const APInt &C1 = Inputs[I].first->getAPIntValue();
+    const APInt &C2 = Inputs[I].second->getAPIntValue();
+
+    switch (Opcode) {
+    case ISD::ADD:
+      Outputs.push_back(getConstant(C1 + C2, SVT));
+      break;
+    case ISD::SUB:
+      Outputs.push_back(getConstant(C1 - C2, SVT));
+      break;
+    case ISD::MUL:
+      Outputs.push_back(getConstant(C1 * C2, SVT));
+      break;
+    case ISD::UDIV:
+      if (!C2.getBoolValue())
+        return SDValue();
+      Outputs.push_back(getConstant(C1.udiv(C2), SVT));
+      break;
+    case ISD::UREM:
+      if (!C2.getBoolValue())
+        return SDValue();
+      Outputs.push_back(getConstant(C1.urem(C2), SVT));
+      break;
+    case ISD::SDIV:
+      if (!C2.getBoolValue())
+        return SDValue();
+      Outputs.push_back(getConstant(C1.sdiv(C2), SVT));
+      break;
+    case ISD::SREM:
+      if (!C2.getBoolValue())
+        return SDValue();
+      Outputs.push_back(getConstant(C1.srem(C2), SVT));
+      break;
+    case ISD::AND:
+      Outputs.push_back(getConstant(C1 & C2, SVT));
+      break;
+    case ISD::OR:
+      Outputs.push_back(getConstant(C1 | C2, SVT));
+      break;
+    case ISD::XOR:
+      Outputs.push_back(getConstant(C1 ^ C2, SVT));
+      break;
+    case ISD::SHL:
+      Outputs.push_back(getConstant(C1 << C2, SVT));
+      break;
+    case ISD::SRL:
+      Outputs.push_back(getConstant(C1.lshr(C2), SVT));
+      break;
+    case ISD::SRA:
+      Outputs.push_back(getConstant(C1.ashr(C2), SVT));
+      break;
+    case ISD::ROTL:
+      Outputs.push_back(getConstant(C1.rotl(C2), SVT));
+      break;
+    case ISD::ROTR:
+      Outputs.push_back(getConstant(C1.rotr(C2), SVT));
+      break;
+    default:
+      return SDValue();
+    }
+  }
+
+  // Handle the scalar case first.
+  if (Outputs.size() == 1)
+    return Outputs.back();
+
+  // Otherwise build a big vector out of the scalar elements we generated.
+  return getNode(ISD::BUILD_VECTOR, DebugLoc(), VT, Outputs.data(),
+                 Outputs.size());
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, SDValue N1,
+                              SDValue N2) {
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+  switch (Opcode) {
+  default: break;
+  case ISD::TokenFactor:
+    assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
+           N2.getValueType() == MVT::Other && "Invalid token factor!");
+    // Fold trivial token factors.
+    if (N1.getOpcode() == ISD::EntryToken) return N2;
+    if (N2.getOpcode() == ISD::EntryToken) return N1;
+    if (N1 == N2) return N1;
+    break;
+  case ISD::CONCAT_VECTORS:
+    // Concat of UNDEFs is UNDEF.
+    if (N1.getOpcode() == ISD::UNDEF &&
+        N2.getOpcode() == ISD::UNDEF)
+      return getUNDEF(VT);
+
+    // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to
+    // one big BUILD_VECTOR.
+    if (N1.getOpcode() == ISD::BUILD_VECTOR &&
+        N2.getOpcode() == ISD::BUILD_VECTOR) {
+      SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(),
+                                    N1.getNode()->op_end());
+      Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
+      return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size());
+    }
+    break;
+  case ISD::AND:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    // (X & 0) -> 0.  This commonly occurs when legalizing i64 values, so it's
+    // worth handling here.
+    if (N2C && N2C->isNullValue())
+      return N2;
+    if (N2C && N2C->isAllOnesValue())  // X & -1 -> X
+      return N1;
+    break;
+  case ISD::OR:
+  case ISD::XOR:
+  case ISD::ADD:
+  case ISD::SUB:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    // (X ^|+- 0) -> X.  This commonly occurs when legalizing i64 values, so
+    // it's worth handling here.
+    if (N2C && N2C->isNullValue())
+      return N1;
+    break;
+  case ISD::UDIV:
+  case ISD::UREM:
+  case ISD::MULHU:
+  case ISD::MULHS:
+  case ISD::MUL:
+  case ISD::SDIV:
+  case ISD::SREM:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    break;
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM:
+    if (getTarget().Options.UnsafeFPMath) {
+      if (Opcode == ISD::FADD) {
+        // 0+x --> x
+        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1))
+          if (CFP->getValueAPF().isZero())
+            return N2;
+        // x+0 --> x
+        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2))
+          if (CFP->getValueAPF().isZero())
+            return N1;
+      } else if (Opcode == ISD::FSUB) {
+        // x-0 --> x
+        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2))
+          if (CFP->getValueAPF().isZero())
+            return N1;
+      } else if (Opcode == ISD::FMUL) {
+        ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1);
+        SDValue V = N2;
+
+        // If the first operand isn't the constant, try the second
+        if (!CFP) {
+          CFP = dyn_cast<ConstantFPSDNode>(N2);
+          V = N1;
+        }
+
+        if (CFP) {
+          // 0*x --> 0
+          if (CFP->isZero())
+            return SDValue(CFP,0);
+          // 1*x --> x
+          if (CFP->isExactlyValue(1.0))
+            return V;
+        }
+      }
+    }
+    assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    break;
+  case ISD::FCOPYSIGN:   // N1 and result must match.  N1/N2 need not match.
+    assert(N1.getValueType() == VT &&
+           N1.getValueType().isFloatingPoint() &&
+           N2.getValueType().isFloatingPoint() &&
+           "Invalid FCOPYSIGN!");
+    break;
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ROTL:
+  case ISD::ROTR:
+    assert(VT == N1.getValueType() &&
+           "Shift operators return type must be the same as their first arg");
+    assert(VT.isInteger() && N2.getValueType().isInteger() &&
+           "Shifts only work on integers");
+    assert((!VT.isVector() || VT == N2.getValueType()) &&
+           "Vector shift amounts must be in the same as their first arg");
+    // Verify that the shift amount VT is bit enough to hold valid shift
+    // amounts.  This catches things like trying to shift an i1024 value by an
+    // i8, which is easy to fall into in generic code that uses
+    // TLI.getShiftAmount().
+    assert(N2.getValueType().getSizeInBits() >=
+                   Log2_32_Ceil(N1.getValueType().getSizeInBits()) &&
+           "Invalid use of small shift amount with oversized value!");
+
+    // Always fold shifts of i1 values so the code generator doesn't need to
+    // handle them.  Since we know the size of the shift has to be less than the
+    // size of the value, the shift/rotate count is guaranteed to be zero.
+    if (VT == MVT::i1)
+      return N1;
+    if (N2C && N2C->isNullValue())
+      return N1;
+    break;
+  case ISD::FP_ROUND_INREG: {
+    EVT EVT = cast<VTSDNode>(N2)->getVT();
+    assert(VT == N1.getValueType() && "Not an inreg round!");
+    assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
+           "Cannot FP_ROUND_INREG integer types");
+    assert(EVT.isVector() == VT.isVector() &&
+           "FP_ROUND_INREG type should be vector iff the operand "
+           "type is vector!");
+    assert((!EVT.isVector() ||
+            EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
+           "Vector element counts must match in FP_ROUND_INREG");
+    assert(EVT.bitsLE(VT) && "Not rounding down!");
+    (void)EVT;
+    if (cast<VTSDNode>(N2)->getVT() == VT) return N1;  // Not actually rounding.
+    break;
+  }
+  case ISD::FP_ROUND:
+    assert(VT.isFloatingPoint() &&
+           N1.getValueType().isFloatingPoint() &&
+           VT.bitsLE(N1.getValueType()) &&
+           isa<ConstantSDNode>(N2) && "Invalid FP_ROUND!");
+    if (N1.getValueType() == VT) return N1;  // noop conversion.
+    break;
+  case ISD::AssertSext:
+  case ISD::AssertZext: {
+    EVT EVT = cast<VTSDNode>(N2)->getVT();
+    assert(VT == N1.getValueType() && "Not an inreg extend!");
+    assert(VT.isInteger() && EVT.isInteger() &&
+           "Cannot *_EXTEND_INREG FP types");
+    assert(!EVT.isVector() &&
+           "AssertSExt/AssertZExt type should be the vector element type "
+           "rather than the vector type!");
+    assert(EVT.bitsLE(VT) && "Not extending!");
+    if (VT == EVT) return N1; // noop assertion.
+    break;
+  }
+  case ISD::SIGN_EXTEND_INREG: {
+    EVT EVT = cast<VTSDNode>(N2)->getVT();
+    assert(VT == N1.getValueType() && "Not an inreg extend!");
+    assert(VT.isInteger() && EVT.isInteger() &&
+           "Cannot *_EXTEND_INREG FP types");
+    assert(EVT.isVector() == VT.isVector() &&
+           "SIGN_EXTEND_INREG type should be vector iff the operand "
+           "type is vector!");
+    assert((!EVT.isVector() ||
+            EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
+           "Vector element counts must match in SIGN_EXTEND_INREG");
+    assert(EVT.bitsLE(VT) && "Not extending!");
+    if (EVT == VT) return N1;  // Not actually extending
+
+    if (N1C) {
+      APInt Val = N1C->getAPIntValue();
+      unsigned FromBits = EVT.getScalarType().getSizeInBits();
+      Val <<= Val.getBitWidth()-FromBits;
+      Val = Val.ashr(Val.getBitWidth()-FromBits);
+      return getConstant(Val, VT);
+    }
+    break;
+  }
+  case ISD::EXTRACT_VECTOR_ELT:
+    // EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
+    if (N1.getOpcode() == ISD::UNDEF)
+      return getUNDEF(VT);
+
+    // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
+    // expanding copies of large vectors from registers.
+    if (N2C &&
+        N1.getOpcode() == ISD::CONCAT_VECTORS &&
+        N1.getNumOperands() > 0) {
+      unsigned Factor =
+        N1.getOperand(0).getValueType().getVectorNumElements();
+      return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+                     N1.getOperand(N2C->getZExtValue() / Factor),
+                     getConstant(N2C->getZExtValue() % Factor,
+                                 N2.getValueType()));
+    }
+
+    // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
+    // expanding large vector constants.
+    if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
+      SDValue Elt = N1.getOperand(N2C->getZExtValue());
+
+      if (VT != Elt.getValueType())
+        // If the vector element type is not legal, the BUILD_VECTOR operands
+        // are promoted and implicitly truncated, and the result implicitly
+        // extended. Make that explicit here.
+        Elt = getAnyExtOrTrunc(Elt, DL, VT);
+
+      return Elt;
+    }
+
+    // EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
+    // operations are lowered to scalars.
+    if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+      // If the indices are the same, return the inserted element else
+      // if the indices are known different, extract the element from
+      // the original vector.
+      SDValue N1Op2 = N1.getOperand(2);
+      ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2.getNode());
+
+      if (N1Op2C && N2C) {
+        if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
+          if (VT == N1.getOperand(1).getValueType())
+            return N1.getOperand(1);
+          else
+            return getSExtOrTrunc(N1.getOperand(1), DL, VT);
+        }
+
+        return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
+      }
+    }
+    break;
+  case ISD::EXTRACT_ELEMENT:
+    assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
+    assert(!N1.getValueType().isVector() && !VT.isVector() &&
+           (N1.getValueType().isInteger() == VT.isInteger()) &&
+           N1.getValueType() != VT &&
+           "Wrong types for EXTRACT_ELEMENT!");
+
+    // EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
+    // 64-bit integers into 32-bit parts.  Instead of building the extract of
+    // the BUILD_PAIR, only to have legalize rip it apart, just do it now.
+    if (N1.getOpcode() == ISD::BUILD_PAIR)
+      return N1.getOperand(N2C->getZExtValue());
+
+    // EXTRACT_ELEMENT of a constant int is also very common.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+      unsigned ElementSize = VT.getSizeInBits();
+      unsigned Shift = ElementSize * N2C->getZExtValue();
+      APInt ShiftedVal = C->getAPIntValue().lshr(Shift);
+      return getConstant(ShiftedVal.trunc(ElementSize), VT);
+    }
+    break;
+  case ISD::EXTRACT_SUBVECTOR: {
+    SDValue Index = N2;
+    if (VT.isSimple() && N1.getValueType().isSimple()) {
+      assert(VT.isVector() && N1.getValueType().isVector() &&
+             "Extract subvector VTs must be a vectors!");
+      assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType() &&
+             "Extract subvector VTs must have the same element type!");
+      assert(VT.getSimpleVT() <= N1.getValueType().getSimpleVT() &&
+             "Extract subvector must be from larger vector to smaller vector!");
+
+      if (isa<ConstantSDNode>(Index.getNode())) {
+        assert((VT.getVectorNumElements() +
+                cast<ConstantSDNode>(Index.getNode())->getZExtValue()
+                <= N1.getValueType().getVectorNumElements())
+               && "Extract subvector overflow!");
+      }
+
+      // Trivial extraction.
+      if (VT.getSimpleVT() == N1.getValueType().getSimpleVT())
+        return N1;
+    }
+    break;
+  }
+  }
+
+  // Perform trivial constant folding.
+  SDValue SV = FoldConstantArithmetic(Opcode, VT, N1.getNode(), N2.getNode());
+  if (SV.getNode()) return SV;
+
+  // Canonicalize constant to RHS if commutative.
+  if (N1C && !N2C && isCommutativeBinOp(Opcode)) {
+    std::swap(N1C, N2C);
+    std::swap(N1, N2);
+  }
+
+  // Constant fold FP operations.
+  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1.getNode());
+  ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode());
+  if (N1CFP) {
+    if (!N2CFP && isCommutativeBinOp(Opcode)) {
+      // Canonicalize constant to RHS if commutative.
+      std::swap(N1CFP, N2CFP);
+      std::swap(N1, N2);
+    } else if (N2CFP) {
+      APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF();
+      APFloat::opStatus s;
+      switch (Opcode) {
+      case ISD::FADD:
+        s = V1.add(V2, APFloat::rmNearestTiesToEven);
+        if (s != APFloat::opInvalidOp)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FSUB:
+        s = V1.subtract(V2, APFloat::rmNearestTiesToEven);
+        if (s!=APFloat::opInvalidOp)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FMUL:
+        s = V1.multiply(V2, APFloat::rmNearestTiesToEven);
+        if (s!=APFloat::opInvalidOp)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FDIV:
+        s = V1.divide(V2, APFloat::rmNearestTiesToEven);
+        if (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FREM :
+        s = V1.mod(V2, APFloat::rmNearestTiesToEven);
+        if (s!=APFloat::opInvalidOp && s!=APFloat::opDivByZero)
+          return getConstantFP(V1, VT);
+        break;
+      case ISD::FCOPYSIGN:
+        V1.copySign(V2);
+        return getConstantFP(V1, VT);
+      default: break;
+      }
+    }
+
+    if (Opcode == ISD::FP_ROUND) {
+      APFloat V = N1CFP->getValueAPF();    // make copy
+      bool ignored;
+      // This can return overflow, underflow, or inexact; we don't care.
+      // FIXME need to be more flexible about rounding mode.
+      (void)V.convert(EVTToAPFloatSemantics(VT),
+                      APFloat::rmNearestTiesToEven, &ignored);
+      return getConstantFP(V, VT);
+    }
+  }
+
+  // Canonicalize an UNDEF to the RHS, even over a constant.
+  if (N1.getOpcode() == ISD::UNDEF) {
+    if (isCommutativeBinOp(Opcode)) {
+      std::swap(N1, N2);
+    } else {
+      switch (Opcode) {
+      case ISD::FP_ROUND_INREG:
+      case ISD::SIGN_EXTEND_INREG:
+      case ISD::SUB:
+      case ISD::FSUB:
+      case ISD::FDIV:
+      case ISD::FREM:
+      case ISD::SRA:
+        return N1;     // fold op(undef, arg2) -> undef
+      case ISD::UDIV:
+      case ISD::SDIV:
+      case ISD::UREM:
+      case ISD::SREM:
+      case ISD::SRL:
+      case ISD::SHL:
+        if (!VT.isVector())
+          return getConstant(0, VT);    // fold op(undef, arg2) -> 0
+        // For vectors, we can't easily build an all zero vector, just return
+        // the LHS.
+        return N2;
+      }
+    }
+  }
+
+  // Fold a bunch of operators when the RHS is undef.
+  if (N2.getOpcode() == ISD::UNDEF) {
+    switch (Opcode) {
+    case ISD::XOR:
+      if (N1.getOpcode() == ISD::UNDEF)
+        // Handle undef ^ undef -> 0 special case. This is a common
+        // idiom (misuse).
+        return getConstant(0, VT);
+      // fallthrough
+    case ISD::ADD:
+    case ISD::ADDC:
+    case ISD::ADDE:
+    case ISD::SUB:
+    case ISD::UDIV:
+    case ISD::SDIV:
+    case ISD::UREM:
+    case ISD::SREM:
+      return N2;       // fold op(arg1, undef) -> undef
+    case ISD::FADD:
+    case ISD::FSUB:
+    case ISD::FMUL:
+    case ISD::FDIV:
+    case ISD::FREM:
+      if (getTarget().Options.UnsafeFPMath)
+        return N2;
+      break;
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::SRL:
+    case ISD::SHL:
+      if (!VT.isVector())
+        return getConstant(0, VT);  // fold op(arg1, undef) -> 0
+      // For vectors, we can't easily build an all zero vector, just return
+      // the LHS.
+      return N1;
+    case ISD::OR:
+      if (!VT.isVector())
+        return getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+      // For vectors, we can't easily build an all one vector, just return
+      // the LHS.
+      return N1;
+    case ISD::SRA:
+      return N1;
+    }
+  }
+
+  // Memoize this node if possible.
+  SDNode *N;
+  SDVTList VTs = getVTList(VT);
+  if (VT != MVT::Glue) {
+    SDValue Ops[] = { N1, N2 };
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTs, N1, N2);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTs, N1, N2);
+  }
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifySDNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              SDValue N1, SDValue N2, SDValue N3) {
+  // Perform various simplifications.
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+  switch (Opcode) {
+  case ISD::CONCAT_VECTORS:
+    // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to
+    // one big BUILD_VECTOR.
+    if (N1.getOpcode() == ISD::BUILD_VECTOR &&
+        N2.getOpcode() == ISD::BUILD_VECTOR &&
+        N3.getOpcode() == ISD::BUILD_VECTOR) {
+      SmallVector<SDValue, 16> Elts(N1.getNode()->op_begin(),
+                                    N1.getNode()->op_end());
+      Elts.append(N2.getNode()->op_begin(), N2.getNode()->op_end());
+      Elts.append(N3.getNode()->op_begin(), N3.getNode()->op_end());
+      return getNode(ISD::BUILD_VECTOR, DL, VT, &Elts[0], Elts.size());
+    }
+    break;
+  case ISD::SETCC: {
+    // Use FoldSetCC to simplify SETCC's.
+    SDValue Simp = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL);
+    if (Simp.getNode()) return Simp;
+    break;
+  }
+  case ISD::SELECT:
+    if (N1C) {
+     if (N1C->getZExtValue())
+       return N2;             // select true, X, Y -> X
+     return N3;             // select false, X, Y -> Y
+    }
+
+    if (N2 == N3) return N2;   // select C, X, X -> X
+    break;
+  case ISD::VECTOR_SHUFFLE:
+    llvm_unreachable("should use getVectorShuffle constructor!");
+  case ISD::INSERT_SUBVECTOR: {
+    SDValue Index = N3;
+    if (VT.isSimple() && N1.getValueType().isSimple()
+        && N2.getValueType().isSimple()) {
+      assert(VT.isVector() && N1.getValueType().isVector() &&
+             N2.getValueType().isVector() &&
+             "Insert subvector VTs must be a vectors");
+      assert(VT == N1.getValueType() &&
+             "Dest and insert subvector source types must match!");
+      assert(N2.getValueType().getSimpleVT() <= N1.getValueType().getSimpleVT() &&
+             "Insert subvector must be from smaller vector to larger vector!");
+      if (isa<ConstantSDNode>(Index.getNode())) {
+        assert((N2.getValueType().getVectorNumElements() +
+                cast<ConstantSDNode>(Index.getNode())->getZExtValue()
+                <= VT.getVectorNumElements())
+               && "Insert subvector overflow!");
+      }
+
+      // Trivial insertion.
+      if (VT.getSimpleVT() == N2.getValueType().getSimpleVT())
+        return N2;
+    }
+    break;
+  }
+  case ISD::BITCAST:
+    // Fold bit_convert nodes from a type to themselves.
+    if (N1.getValueType() == VT)
+      return N1;
+    break;
+  }
+
+  // Memoize node if it doesn't produce a flag.
+  SDNode *N;
+  SDVTList VTs = getVTList(VT);
+  if (VT != MVT::Glue) {
+    SDValue Ops[] = { N1, N2, N3 };
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
+  }
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifySDNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              SDValue N1, SDValue N2, SDValue N3,
+                              SDValue N4) {
+  SDValue Ops[] = { N1, N2, N3, N4 };
+  return getNode(Opcode, DL, VT, Ops, 4);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              SDValue N1, SDValue N2, SDValue N3,
+                              SDValue N4, SDValue N5) {
+  SDValue Ops[] = { N1, N2, N3, N4, N5 };
+  return getNode(Opcode, DL, VT, Ops, 5);
+}
+
+/// getStackArgumentTokenFactor - Compute a TokenFactor to force all
+/// the incoming stack arguments to be loaded from the stack.
+SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
+  SmallVector<SDValue, 8> ArgChains;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each stack argument.
+  for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(),
+       UE = getEntryNode().getNode()->use_end(); U != UE; ++U)
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+        if (FI->getIndex() < 0)
+          ArgChains.push_back(SDValue(L, 1));
+
+  // Build a tokenfactor for all the chains.
+  return getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other,
+                 &ArgChains[0], ArgChains.size());
+}
+
+/// getMemsetValue - Vectorized representation of the memset value
+/// operand.
+static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
+                              DebugLoc dl) {
+  assert(Value.getOpcode() != ISD::UNDEF);
+
+  unsigned NumBits = VT.getScalarType().getSizeInBits();
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
+    assert(C->getAPIntValue().getBitWidth() == 8);
+    APInt Val = APInt::getSplat(NumBits, C->getAPIntValue());
+    if (VT.isInteger())
+      return DAG.getConstant(Val, VT);
+    return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), VT);
+  }
+
+  Value = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Value);
+  if (NumBits > 8) {
+    // Use a multiplication with 0x010101... to extend the input to the
+    // required length.
+    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
+    Value = DAG.getNode(ISD::MUL, dl, VT, Value, DAG.getConstant(Magic, VT));
+  }
+
+  return Value;
+}
+
+/// getMemsetStringVal - Similar to getMemsetValue. Except this is only
+/// used when a memcpy is turned into a memset when the source is a constant
+/// string ptr.
+static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
+                                  const TargetLowering &TLI, StringRef Str) {
+  // Handle vector with all elements zero.
+  if (Str.empty()) {
+    if (VT.isInteger())
+      return DAG.getConstant(0, VT);
+    else if (VT == MVT::f32 || VT == MVT::f64)
+      return DAG.getConstantFP(0.0, VT);
+    else if (VT.isVector()) {
+      unsigned NumElts = VT.getVectorNumElements();
+      MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+      return DAG.getNode(ISD::BITCAST, dl, VT,
+                         DAG.getConstant(0, EVT::getVectorVT(*DAG.getContext(),
+                                                             EltVT, NumElts)));
+    } else
+      llvm_unreachable("Expected type!");
+  }
+
+  assert(!VT.isVector() && "Can't handle vector type here!");
+  unsigned NumVTBits = VT.getSizeInBits();
+  unsigned NumVTBytes = NumVTBits / 8;
+  unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));
+
+  APInt Val(NumVTBits, 0);
+  if (TLI.isLittleEndian()) {
+    for (unsigned i = 0; i != NumBytes; ++i)
+      Val |= (uint64_t)(unsigned char)Str[i] << i*8;
+  } else {
+    for (unsigned i = 0; i != NumBytes; ++i)
+      Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8;
+  }
+
+  // If the "cost" of materializing the integer immediate is 1 or free, then
+  // it is cost effective to turn the load into the immediate.
+  const TargetTransformInfo *TTI = DAG.getTargetTransformInfo();
+  if (TTI->getIntImmCost(Val, VT.getTypeForEVT(*DAG.getContext())) < 2)
+    return DAG.getConstant(Val, VT);
+  return SDValue(0, 0);
+}
+
+/// getMemBasePlusOffset - Returns base and offset node for the
+///
+static SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset,
+                                      SelectionDAG &DAG) {
+  EVT VT = Base.getValueType();
+  return DAG.getNode(ISD::ADD, Base.getDebugLoc(),
+                     VT, Base, DAG.getConstant(Offset, VT));
+}
+
+/// isMemSrcFromString - Returns true if memcpy source is a string constant.
+///
+static bool isMemSrcFromString(SDValue Src, StringRef &Str) {
+  unsigned SrcDelta = 0;
+  GlobalAddressSDNode *G = NULL;
+  if (Src.getOpcode() == ISD::GlobalAddress)
+    G = cast<GlobalAddressSDNode>(Src);
+  else if (Src.getOpcode() == ISD::ADD &&
+           Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
+           Src.getOperand(1).getOpcode() == ISD::Constant) {
+    G = cast<GlobalAddressSDNode>(Src.getOperand(0));
+    SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
+  }
+  if (!G)
+    return false;
+
+  return getConstantStringInfo(G->getGlobal(), Str, SrcDelta, false);
+}
+
+/// FindOptimalMemOpLowering - Determines the optimial series memory ops
+/// to replace the memset / memcpy. Return true if the number of memory ops
+/// is below the threshold. It returns the types of the sequence of
+/// memory ops to perform memset / memcpy by reference.
+static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
+                                     unsigned Limit, uint64_t Size,
+                                     unsigned DstAlign, unsigned SrcAlign,
+                                     bool IsMemset,
+                                     bool ZeroMemset,
+                                     bool MemcpyStrSrc,
+                                     bool AllowOverlap,
+                                     SelectionDAG &DAG,
+                                     const TargetLowering &TLI) {
+  assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
+         "Expecting memcpy / memset source to meet alignment requirement!");
+  // If 'SrcAlign' is zero, that means the memory operation does not need to
+  // load the value, i.e. memset or memcpy from constant string. Otherwise,
+  // it's the inferred alignment of the source. 'DstAlign', on the other hand,
+  // is the specified alignment of the memory operation. If it is zero, that
+  // means it's possible to change the alignment of the destination.
+  // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
+  // not need to be loaded.
+  EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign,
+                                   IsMemset, ZeroMemset, MemcpyStrSrc,
+                                   DAG.getMachineFunction());
+
+  if (VT == MVT::Other) {
+    if (DstAlign >= TLI.getDataLayout()->getPointerPrefAlignment() ||
+        TLI.allowsUnalignedMemoryAccesses(VT)) {
+      VT = TLI.getPointerTy();
+    } else {
+      switch (DstAlign & 7) {
+      case 0:  VT = MVT::i64; break;
+      case 4:  VT = MVT::i32; break;
+      case 2:  VT = MVT::i16; break;
+      default: VT = MVT::i8;  break;
+      }
+    }
+
+    MVT LVT = MVT::i64;
+    while (!TLI.isTypeLegal(LVT))
+      LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
+    assert(LVT.isInteger());
+
+    if (VT.bitsGT(LVT))
+      VT = LVT;
+  }
+
+  unsigned NumMemOps = 0;
+  while (Size != 0) {
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    while (VTSize > Size) {
+      // For now, only use non-vector load / store's for the left-over pieces.
+      EVT NewVT = VT;
+      unsigned NewVTSize;
+
+      bool Found = false;
+      if (VT.isVector() || VT.isFloatingPoint()) {
+        NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
+        if (TLI.isOperationLegalOrCustom(ISD::STORE, NewVT) &&
+            TLI.isSafeMemOpType(NewVT.getSimpleVT()))
+          Found = true;
+        else if (NewVT == MVT::i64 &&
+                 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64) &&
+                 TLI.isSafeMemOpType(MVT::f64)) {
+          // i64 is usually not legal on 32-bit targets, but f64 may be.
+          NewVT = MVT::f64;
+          Found = true;
+        }
+      }
+
+      if (!Found) {
+        do {
+          NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
+          if (NewVT == MVT::i8)
+            break;
+        } while (!TLI.isSafeMemOpType(NewVT.getSimpleVT()));
+      }
+      NewVTSize = NewVT.getSizeInBits() / 8;
+
+      // If the new VT cannot cover all of the remaining bits, then consider
+      // issuing a (or a pair of) unaligned and overlapping load / store.
+      // FIXME: Only does this for 64-bit or more since we don't have proper
+      // cost model for unaligned load / store.
+      bool Fast;
+      if (NumMemOps && AllowOverlap &&
+          VTSize >= 8 && NewVTSize < Size &&
+          TLI.allowsUnalignedMemoryAccesses(VT, &Fast) && Fast)
+        VTSize = Size;
+      else {
+        VT = NewVT;
+        VTSize = NewVTSize;
+      }
+    }
+
+    if (++NumMemOps > Limit)
+      return false;
+
+    MemOps.push_back(VT);
+    Size -= VTSize;
+  }
+
+  return true;
+}
+
+static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
+                                       SDValue Chain, SDValue Dst,
+                                       SDValue Src, uint64_t Size,
+                                       unsigned Align, bool isVol,
+                                       bool AlwaysInline,
+                                       MachinePointerInfo DstPtrInfo,
+                                       MachinePointerInfo SrcPtrInfo) {
+  // Turn a memcpy of undef to nop.
+  if (Src.getOpcode() == ISD::UNDEF)
+    return Chain;
+
+  // Expand memcpy to a series of load and store ops if the size operand falls
+  // below a certain threshold.
+  // TODO: In the AlwaysInline case, if the size is big then generate a loop
+  // rather than maybe a humongous number of loads and stores.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  std::vector<EVT> MemOps;
+  bool DstAlignCanChange = false;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool OptSize =
+    MF.getFunction()->getAttributes().
+      hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+    DstAlignCanChange = true;
+  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+  if (Align > SrcAlign)
+    SrcAlign = Align;
+  StringRef Str;
+  bool CopyFromStr = isMemSrcFromString(Src, Str);
+  bool isZeroStr = CopyFromStr && Str.empty();
+  unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
+
+  if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
+                                (DstAlignCanChange ? 0 : Align),
+                                (isZeroStr ? 0 : SrcAlign),
+                                false, false, CopyFromStr, true, DAG, TLI))
+    return SDValue();
+
+  if (DstAlignCanChange) {
+    Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+    unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
+
+    // Don't promote to an alignment that would require dynamic stack
+    // realignment.  
+    const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+    if (!TRI->needsStackRealignment(MF))
+       while (NewAlign > Align &&
+             TLI.getDataLayout()->exceedsNaturalStackAlignment(NewAlign))
+          NewAlign /= 2;
+
+    if (NewAlign > Align) {
+      // Give the stack frame object a larger alignment if needed.
+      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+      Align = NewAlign;
+    }
+  }
+
+  SmallVector<SDValue, 8> OutChains;
+  unsigned NumMemOps = MemOps.size();
+  uint64_t SrcOff = 0, DstOff = 0;
+  for (unsigned i = 0; i != NumMemOps; ++i) {
+    EVT VT = MemOps[i];
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    SDValue Value, Store;
+
+    if (VTSize > Size) {
+      // Issuing an unaligned load / store pair  that overlaps with the previous
+      // pair. Adjust the offset accordingly.
+      assert(i == NumMemOps-1 && i != 0);
+      SrcOff -= VTSize - Size;
+      DstOff -= VTSize - Size;
+    }
+
+    if (CopyFromStr &&
+        (isZeroStr || (VT.isInteger() && !VT.isVector()))) {
+      // It's unlikely a store of a vector immediate can be done in a single
+      // instruction. It would require a load from a constantpool first.
+      // We only handle zero vectors here.
+      // FIXME: Handle other cases where store of vector immediate is done in
+      // a single instruction.
+      Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
+      if (Value.getNode())
+        Store = DAG.getStore(Chain, dl, Value,
+                             getMemBasePlusOffset(Dst, DstOff, DAG),
+                             DstPtrInfo.getWithOffset(DstOff), isVol,
+                             false, Align);
+    }
+
+    if (!Store.getNode()) {
+      // The type might not be legal for the target.  This should only happen
+      // if the type is smaller than a legal type, as on PPC, so the right
+      // thing to do is generate a LoadExt/StoreTrunc pair.  These simplify
+      // to Load/Store if NVT==VT.
+      // FIXME does the case above also need this?
+      EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+      assert(NVT.bitsGE(VT));
+      Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
+                             getMemBasePlusOffset(Src, SrcOff, DAG),
+                             SrcPtrInfo.getWithOffset(SrcOff), VT, isVol, false,
+                             MinAlign(SrcAlign, SrcOff));
+      Store = DAG.getTruncStore(Chain, dl, Value,
+                                getMemBasePlusOffset(Dst, DstOff, DAG),
+                                DstPtrInfo.getWithOffset(DstOff), VT, isVol,
+                                false, Align);
+    }
+    OutChains.push_back(Store);
+    SrcOff += VTSize;
+    DstOff += VTSize;
+    Size -= VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &OutChains[0], OutChains.size());
+}
+
+static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
+                                        SDValue Chain, SDValue Dst,
+                                        SDValue Src, uint64_t Size,
+                                        unsigned Align,  bool isVol,
+                                        bool AlwaysInline,
+                                        MachinePointerInfo DstPtrInfo,
+                                        MachinePointerInfo SrcPtrInfo) {
+  // Turn a memmove of undef to nop.
+  if (Src.getOpcode() == ISD::UNDEF)
+    return Chain;
+
+  // Expand memmove to a series of load and store ops if the size operand falls
+  // below a certain threshold.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  std::vector<EVT> MemOps;
+  bool DstAlignCanChange = false;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool OptSize = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+    DstAlignCanChange = true;
+  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+  if (Align > SrcAlign)
+    SrcAlign = Align;
+  unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
+
+  if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
+                                (DstAlignCanChange ? 0 : Align), SrcAlign,
+                                false, false, false, false, DAG, TLI))
+    return SDValue();
+
+  if (DstAlignCanChange) {
+    Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+    unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
+    if (NewAlign > Align) {
+      // Give the stack frame object a larger alignment if needed.
+      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+      Align = NewAlign;
+    }
+  }
+
+  uint64_t SrcOff = 0, DstOff = 0;
+  SmallVector<SDValue, 8> LoadValues;
+  SmallVector<SDValue, 8> LoadChains;
+  SmallVector<SDValue, 8> OutChains;
+  unsigned NumMemOps = MemOps.size();
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    EVT VT = MemOps[i];
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    SDValue Value, Store;
+
+    Value = DAG.getLoad(VT, dl, Chain,
+                        getMemBasePlusOffset(Src, SrcOff, DAG),
+                        SrcPtrInfo.getWithOffset(SrcOff), isVol,
+                        false, false, SrcAlign);
+    LoadValues.push_back(Value);
+    LoadChains.push_back(Value.getValue(1));
+    SrcOff += VTSize;
+  }
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                      &LoadChains[0], LoadChains.size());
+  OutChains.clear();
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    EVT VT = MemOps[i];
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    SDValue Value, Store;
+
+    Store = DAG.getStore(Chain, dl, LoadValues[i],
+                         getMemBasePlusOffset(Dst, DstOff, DAG),
+                         DstPtrInfo.getWithOffset(DstOff), isVol, false, Align);
+    OutChains.push_back(Store);
+    DstOff += VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &OutChains[0], OutChains.size());
+}
+
+static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
+                               SDValue Chain, SDValue Dst,
+                               SDValue Src, uint64_t Size,
+                               unsigned Align, bool isVol,
+                               MachinePointerInfo DstPtrInfo) {
+  // Turn a memset of undef to nop.
+  if (Src.getOpcode() == ISD::UNDEF)
+    return Chain;
+
+  // Expand memset to a series of load/store ops if the size operand
+  // falls below a certain threshold.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  std::vector<EVT> MemOps;
+  bool DstAlignCanChange = false;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool OptSize = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+  if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+    DstAlignCanChange = true;
+  bool IsZeroVal =
+    isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
+  if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
+                                Size, (DstAlignCanChange ? 0 : Align), 0,
+                                true, IsZeroVal, false, true, DAG, TLI))
+    return SDValue();
+
+  if (DstAlignCanChange) {
+    Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+    unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
+    if (NewAlign > Align) {
+      // Give the stack frame object a larger alignment if needed.
+      if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+        MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+      Align = NewAlign;
+    }
+  }
+
+  SmallVector<SDValue, 8> OutChains;
+  uint64_t DstOff = 0;
+  unsigned NumMemOps = MemOps.size();
+
+  // Find the largest store and generate the bit pattern for it.
+  EVT LargestVT = MemOps[0];
+  for (unsigned i = 1; i < NumMemOps; i++)
+    if (MemOps[i].bitsGT(LargestVT))
+      LargestVT = MemOps[i];
+  SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl);
+
+  for (unsigned i = 0; i < NumMemOps; i++) {
+    EVT VT = MemOps[i];
+    unsigned VTSize = VT.getSizeInBits() / 8;
+    if (VTSize > Size) {
+      // Issuing an unaligned load / store pair  that overlaps with the previous
+      // pair. Adjust the offset accordingly.
+      assert(i == NumMemOps-1 && i != 0);
+      DstOff -= VTSize - Size;
+    }
+
+    // If this store is smaller than the largest store see whether we can get
+    // the smaller value for free with a truncate.
+    SDValue Value = MemSetValue;
+    if (VT.bitsLT(LargestVT)) {
+      if (!LargestVT.isVector() && !VT.isVector() &&
+          TLI.isTruncateFree(LargestVT, VT))
+        Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
+      else
+        Value = getMemsetValue(Src, VT, DAG, dl);
+    }
+    assert(Value.getValueType() == VT && "Value with wrong type.");
+    SDValue Store = DAG.getStore(Chain, dl, Value,
+                                 getMemBasePlusOffset(Dst, DstOff, DAG),
+                                 DstPtrInfo.getWithOffset(DstOff),
+                                 isVol, false, Align);
+    OutChains.push_back(Store);
+    DstOff += VT.getSizeInBits() / 8;
+    Size -= VTSize;
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &OutChains[0], OutChains.size());
+}
+
+SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst,
+                                SDValue Src, SDValue Size,
+                                unsigned Align, bool isVol, bool AlwaysInline,
+                                MachinePointerInfo DstPtrInfo,
+                                MachinePointerInfo SrcPtrInfo) {
+  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
+
+  // Check to see if we should lower the memcpy to loads and stores first.
+  // For cases within the target-specified limits, this is the best choice.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (ConstantSize) {
+    // Memcpy with size zero? Just return the original chain.
+    if (ConstantSize->isNullValue())
+      return Chain;
+
+    SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+                                             ConstantSize->getZExtValue(),Align,
+                                isVol, false, DstPtrInfo, SrcPtrInfo);
+    if (Result.getNode())
+      return Result;
+  }
+
+  // Then check to see if we should lower the memcpy with target-specific
+  // code. If the target chooses to do this, this is the next best.
+  SDValue Result =
+    TSI.EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align,
+                                isVol, AlwaysInline,
+                                DstPtrInfo, SrcPtrInfo);
+  if (Result.getNode())
+    return Result;
+
+  // If we really need inline code and the target declined to provide it,
+  // use a (potentially long) sequence of loads and stores.
+  if (AlwaysInline) {
+    assert(ConstantSize && "AlwaysInline requires a constant size!");
+    return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+                                   ConstantSize->getZExtValue(), Align, isVol,
+                                   true, DstPtrInfo, SrcPtrInfo);
+  }
+
+  // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc
+  // memcpy is not guaranteed to be safe. libc memcpys aren't required to
+  // respect volatile, so they may do things like read or write memory
+  // beyond the given memory regions. But fixing this isn't easy, and most
+  // people don't care.
+
+  // Emit a library call.
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext());
+  Entry.Node = Dst; Args.push_back(Entry);
+  Entry.Node = Src; Args.push_back(Entry);
+  Entry.Node = Size; Args.push_back(Entry);
+  // FIXME: pass in DebugLoc
+  TargetLowering::
+  CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()),
+                    false, false, false, false, 0,
+                    TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                    /*isTailCall=*/false,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/false,
+                    getExternalSymbol(TLI.getLibcallName(RTLIB::MEMCPY),
+                                      TLI.getPointerTy()),
+                    Args, *this, dl);
+  std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+  return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst,
+                                 SDValue Src, SDValue Size,
+                                 unsigned Align, bool isVol,
+                                 MachinePointerInfo DstPtrInfo,
+                                 MachinePointerInfo SrcPtrInfo) {
+  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
+
+  // Check to see if we should lower the memmove to loads and stores first.
+  // For cases within the target-specified limits, this is the best choice.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (ConstantSize) {
+    // Memmove with size zero? Just return the original chain.
+    if (ConstantSize->isNullValue())
+      return Chain;
+
+    SDValue Result =
+      getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src,
+                               ConstantSize->getZExtValue(), Align, isVol,
+                               false, DstPtrInfo, SrcPtrInfo);
+    if (Result.getNode())
+      return Result;
+  }
+
+  // Then check to see if we should lower the memmove with target-specific
+  // code. If the target chooses to do this, this is the next best.
+  SDValue Result =
+    TSI.EmitTargetCodeForMemmove(*this, dl, Chain, Dst, Src, Size, Align, isVol,
+                                 DstPtrInfo, SrcPtrInfo);
+  if (Result.getNode())
+    return Result;
+
+  // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
+  // not be safe.  See memcpy above for more details.
+
+  // Emit a library call.
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext());
+  Entry.Node = Dst; Args.push_back(Entry);
+  Entry.Node = Src; Args.push_back(Entry);
+  Entry.Node = Size; Args.push_back(Entry);
+  // FIXME:  pass in DebugLoc
+  TargetLowering::
+  CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()),
+                    false, false, false, false, 0,
+                    TLI.getLibcallCallingConv(RTLIB::MEMMOVE),
+                    /*isTailCall=*/false,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/false,
+                    getExternalSymbol(TLI.getLibcallName(RTLIB::MEMMOVE),
+                                      TLI.getPointerTy()),
+                    Args, *this, dl);
+  std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+  return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
+                                SDValue Src, SDValue Size,
+                                unsigned Align, bool isVol,
+                                MachinePointerInfo DstPtrInfo) {
+  assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
+
+  // Check to see if we should lower the memset to stores first.
+  // For cases within the target-specified limits, this is the best choice.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (ConstantSize) {
+    // Memset with size zero? Just return the original chain.
+    if (ConstantSize->isNullValue())
+      return Chain;
+
+    SDValue Result =
+      getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
+                      Align, isVol, DstPtrInfo);
+
+    if (Result.getNode())
+      return Result;
+  }
+
+  // Then check to see if we should lower the memset with target-specific
+  // code. If the target chooses to do this, this is the next best.
+  SDValue Result =
+    TSI.EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src, Size, Align, isVol,
+                                DstPtrInfo);
+  if (Result.getNode())
+    return Result;
+
+  // Emit a library call.
+  Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Node = Dst; Entry.Ty = IntPtrTy;
+  Args.push_back(Entry);
+  // Extend or truncate the argument to be an i32 value for the call.
+  if (Src.getValueType().bitsGT(MVT::i32))
+    Src = getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+  else
+    Src = getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+  Entry.Node = Src;
+  Entry.Ty = Type::getInt32Ty(*getContext());
+  Entry.isSExt = true;
+  Args.push_back(Entry);
+  Entry.Node = Size;
+  Entry.Ty = IntPtrTy;
+  Entry.isSExt = false;
+  Args.push_back(Entry);
+  // FIXME: pass in DebugLoc
+  TargetLowering::
+  CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()),
+                    false, false, false, false, 0,
+                    TLI.getLibcallCallingConv(RTLIB::MEMSET),
+                    /*isTailCall=*/false,
+                    /*doesNotReturn*/false, /*isReturnValueUsed=*/false,
+                    getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
+                                      TLI.getPointerTy()),
+                    Args, *this, dl);
+  std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+  return CallResult.second;
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                SDValue Chain, SDValue Ptr, SDValue Cmp,
+                                SDValue Swp, MachinePointerInfo PtrInfo,
+                                unsigned Alignment,
+                                AtomicOrdering Ordering,
+                                SynchronizationScope SynchScope) {
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(MemVT);
+
+  MachineFunction &MF = getMachineFunction();
+
+  // All atomics are load and store, except for ATMOIC_LOAD and ATOMIC_STORE.
+  // For now, atomics are considered to be volatile always.
+  // FIXME: Volatile isn't really correct; we should keep track of atomic
+  // orderings in the memoperand.
+  unsigned Flags = MachineMemOperand::MOVolatile;
+  if (Opcode != ISD::ATOMIC_STORE)
+    Flags |= MachineMemOperand::MOLoad;
+  if (Opcode != ISD::ATOMIC_LOAD)
+    Flags |= MachineMemOperand::MOStore;
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment);
+
+  return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Cmp, Swp, MMO,
+                   Ordering, SynchScope);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                SDValue Chain,
+                                SDValue Ptr, SDValue Cmp,
+                                SDValue Swp, MachineMemOperand *MMO,
+                                AtomicOrdering Ordering,
+                                SynchronizationScope SynchScope) {
+  assert(Opcode == ISD::ATOMIC_CMP_SWAP && "Invalid Atomic Op");
+  assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
+
+  EVT VT = Cmp.getValueType();
+
+  SDVTList VTs = getVTList(VT, MVT::Other);
+  FoldingSetNodeID ID;
+  ID.AddInteger(MemVT.getRawBits());
+  SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
+  AddNodeIDNode(ID, Opcode, VTs, Ops, 4);
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
+                                               Ptr, Cmp, Swp, MMO, Ordering,
+                                               SynchScope);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                SDValue Chain,
+                                SDValue Ptr, SDValue Val,
+                                const Value* PtrVal,
+                                unsigned Alignment,
+                                AtomicOrdering Ordering,
+                                SynchronizationScope SynchScope) {
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(MemVT);
+
+  MachineFunction &MF = getMachineFunction();
+  // An atomic store does not load. An atomic load does not store.
+  // (An atomicrmw obviously both loads and stores.)
+  // For now, atomics are considered to be volatile always, and they are
+  // chained as such.
+  // FIXME: Volatile isn't really correct; we should keep track of atomic
+  // orderings in the memoperand.
+  unsigned Flags = MachineMemOperand::MOVolatile;
+  if (Opcode != ISD::ATOMIC_STORE)
+    Flags |= MachineMemOperand::MOLoad;
+  if (Opcode != ISD::ATOMIC_LOAD)
+    Flags |= MachineMemOperand::MOStore;
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
+                            MemVT.getStoreSize(), Alignment);
+
+  return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO,
+                   Ordering, SynchScope);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                SDValue Chain,
+                                SDValue Ptr, SDValue Val,
+                                MachineMemOperand *MMO,
+                                AtomicOrdering Ordering,
+                                SynchronizationScope SynchScope) {
+  assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
+          Opcode == ISD::ATOMIC_LOAD_SUB ||
+          Opcode == ISD::ATOMIC_LOAD_AND ||
+          Opcode == ISD::ATOMIC_LOAD_OR ||
+          Opcode == ISD::ATOMIC_LOAD_XOR ||
+          Opcode == ISD::ATOMIC_LOAD_NAND ||
+          Opcode == ISD::ATOMIC_LOAD_MIN ||
+          Opcode == ISD::ATOMIC_LOAD_MAX ||
+          Opcode == ISD::ATOMIC_LOAD_UMIN ||
+          Opcode == ISD::ATOMIC_LOAD_UMAX ||
+          Opcode == ISD::ATOMIC_SWAP ||
+          Opcode == ISD::ATOMIC_STORE) &&
+         "Invalid Atomic Op");
+
+  EVT VT = Val.getValueType();
+
+  SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
+                                               getVTList(VT, MVT::Other);
+  FoldingSetNodeID ID;
+  ID.AddInteger(MemVT.getRawBits());
+  SDValue Ops[] = {Chain, Ptr, Val};
+  AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
+                                               Ptr, Val, MMO,
+                                               Ordering, SynchScope);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                EVT VT, SDValue Chain,
+                                SDValue Ptr,
+                                const Value* PtrVal,
+                                unsigned Alignment,
+                                AtomicOrdering Ordering,
+                                SynchronizationScope SynchScope) {
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(MemVT);
+
+  MachineFunction &MF = getMachineFunction();
+  // An atomic store does not load. An atomic load does not store.
+  // (An atomicrmw obviously both loads and stores.)
+  // For now, atomics are considered to be volatile always, and they are
+  // chained as such.
+  // FIXME: Volatile isn't really correct; we should keep track of atomic
+  // orderings in the memoperand.
+  unsigned Flags = MachineMemOperand::MOVolatile;
+  if (Opcode != ISD::ATOMIC_STORE)
+    Flags |= MachineMemOperand::MOLoad;
+  if (Opcode != ISD::ATOMIC_LOAD)
+    Flags |= MachineMemOperand::MOStore;
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
+                            MemVT.getStoreSize(), Alignment);
+
+  return getAtomic(Opcode, dl, MemVT, VT, Chain, Ptr, MMO,
+                   Ordering, SynchScope);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+                                EVT VT, SDValue Chain,
+                                SDValue Ptr,
+                                MachineMemOperand *MMO,
+                                AtomicOrdering Ordering,
+                                SynchronizationScope SynchScope) {
+  assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
+
+  SDVTList VTs = getVTList(VT, MVT::Other);
+  FoldingSetNodeID ID;
+  ID.AddInteger(MemVT.getRawBits());
+  SDValue Ops[] = {Chain, Ptr};
+  AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
+                                               Ptr, MMO, Ordering, SynchScope);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+/// getMergeValues - Create a MERGE_VALUES node from the given operands.
+SDValue SelectionDAG::getMergeValues(const SDValue *Ops, unsigned NumOps,
+                                     DebugLoc dl) {
+  if (NumOps == 1)
+    return Ops[0];
+
+  SmallVector<EVT, 4> VTs;
+  VTs.reserve(NumOps);
+  for (unsigned i = 0; i < NumOps; ++i)
+    VTs.push_back(Ops[i].getValueType());
+  return getNode(ISD::MERGE_VALUES, dl, getVTList(&VTs[0], NumOps),
+                 Ops, NumOps);
+}
+
+SDValue
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl,
+                                  const EVT *VTs, unsigned NumVTs,
+                                  const SDValue *Ops, unsigned NumOps,
+                                  EVT MemVT, MachinePointerInfo PtrInfo,
+                                  unsigned Align, bool Vol,
+                                  bool ReadMem, bool WriteMem) {
+  return getMemIntrinsicNode(Opcode, dl, makeVTList(VTs, NumVTs), Ops, NumOps,
+                             MemVT, PtrInfo, Align, Vol,
+                             ReadMem, WriteMem);
+}
+
+SDValue
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
+                                  const SDValue *Ops, unsigned NumOps,
+                                  EVT MemVT, MachinePointerInfo PtrInfo,
+                                  unsigned Align, bool Vol,
+                                  bool ReadMem, bool WriteMem) {
+  if (Align == 0)  // Ensure that codegen never sees alignment 0
+    Align = getEVTAlignment(MemVT);
+
+  MachineFunction &MF = getMachineFunction();
+  unsigned Flags = 0;
+  if (WriteMem)
+    Flags |= MachineMemOperand::MOStore;
+  if (ReadMem)
+    Flags |= MachineMemOperand::MOLoad;
+  if (Vol)
+    Flags |= MachineMemOperand::MOVolatile;
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Align);
+
+  return getMemIntrinsicNode(Opcode, dl, VTList, Ops, NumOps, MemVT, MMO);
+}
+
+SDValue
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
+                                  const SDValue *Ops, unsigned NumOps,
+                                  EVT MemVT, MachineMemOperand *MMO) {
+  assert((Opcode == ISD::INTRINSIC_VOID ||
+          Opcode == ISD::INTRINSIC_W_CHAIN ||
+          Opcode == ISD::PREFETCH ||
+          Opcode == ISD::LIFETIME_START ||
+          Opcode == ISD::LIFETIME_END ||
+          (Opcode <= INT_MAX &&
+           (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
+         "Opcode is not a memory-accessing opcode!");
+
+  // Memoize the node unless it returns a flag.
+  MemIntrinsicSDNode *N;
+  if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+    ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
+      return SDValue(E, 0);
+    }
+
+    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps,
+                                               MemVT, MMO);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps,
+                                               MemVT, MMO);
+  }
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
+/// MachinePointerInfo record from it.  This is particularly useful because the
+/// code generator has many cases where it doesn't bother passing in a
+/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
+static MachinePointerInfo InferPointerInfo(SDValue Ptr, int64_t Offset = 0) {
+  // If this is FI+Offset, we can model it.
+  if (const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr))
+    return MachinePointerInfo::getFixedStack(FI->getIndex(), Offset);
+
+  // If this is (FI+Offset1)+Offset2, we can model it.
+  if (Ptr.getOpcode() != ISD::ADD ||
+      !isa<ConstantSDNode>(Ptr.getOperand(1)) ||
+      !isa<FrameIndexSDNode>(Ptr.getOperand(0)))
+    return MachinePointerInfo();
+
+  int FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+  return MachinePointerInfo::getFixedStack(FI, Offset+
+                       cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue());
+}
+
+/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
+/// MachinePointerInfo record from it.  This is particularly useful because the
+/// code generator has many cases where it doesn't bother passing in a
+/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
+static MachinePointerInfo InferPointerInfo(SDValue Ptr, SDValue OffsetOp) {
+  // If the 'Offset' value isn't a constant, we can't handle this.
+  if (ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(OffsetOp))
+    return InferPointerInfo(Ptr, OffsetNode->getSExtValue());
+  if (OffsetOp.getOpcode() == ISD::UNDEF)
+    return InferPointerInfo(Ptr);
+  return MachinePointerInfo();
+}
+
+
+SDValue
+SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+                      EVT VT, DebugLoc dl, SDValue Chain,
+                      SDValue Ptr, SDValue Offset,
+                      MachinePointerInfo PtrInfo, EVT MemVT,
+                      bool isVolatile, bool isNonTemporal, bool isInvariant,
+                      unsigned Alignment, const MDNode *TBAAInfo,
+                      const MDNode *Ranges) {
+  assert(Chain.getValueType() == MVT::Other &&
+        "Invalid chain type");
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(VT);
+
+  unsigned Flags = MachineMemOperand::MOLoad;
+  if (isVolatile)
+    Flags |= MachineMemOperand::MOVolatile;
+  if (isNonTemporal)
+    Flags |= MachineMemOperand::MONonTemporal;
+  if (isInvariant)
+    Flags |= MachineMemOperand::MOInvariant;
+
+  // If we don't have a PtrInfo, infer the trivial frame index case to simplify
+  // clients.
+  if (PtrInfo.V == 0)
+    PtrInfo = InferPointerInfo(Ptr, Offset);
+
+  MachineFunction &MF = getMachineFunction();
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment,
+                            TBAAInfo, Ranges);
+  return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
+}
+
+SDValue
+SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+                      EVT VT, DebugLoc dl, SDValue Chain,
+                      SDValue Ptr, SDValue Offset, EVT MemVT,
+                      MachineMemOperand *MMO) {
+  if (VT == MemVT) {
+    ExtType = ISD::NON_EXTLOAD;
+  } else if (ExtType == ISD::NON_EXTLOAD) {
+    assert(VT == MemVT && "Non-extending load from different memory type!");
+  } else {
+    // Extending load.
+    assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
+           "Should only be an extending load, not truncating!");
+    assert(VT.isInteger() == MemVT.isInteger() &&
+           "Cannot convert from FP to Int or Int -> FP!");
+    assert(VT.isVector() == MemVT.isVector() &&
+           "Cannot use trunc store to convert to or from a vector!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
+           "Cannot use trunc store to change the number of vector elements!");
+  }
+
+  bool Indexed = AM != ISD::UNINDEXED;
+  assert((Indexed || Offset.getOpcode() == ISD::UNDEF) &&
+         "Unindexed load with an offset!");
+
+  SDVTList VTs = Indexed ?
+    getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
+  SDValue Ops[] = { Chain, Ptr, Offset };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::LOAD, VTs, Ops, 3);
+  ID.AddInteger(MemVT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(),
+                                     MMO->isNonTemporal(),
+                                     MMO->isInvariant()));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<LoadSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl, VTs, AM, ExtType,
+                                             MemVT, MMO);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getLoad(EVT VT, DebugLoc dl,
+                              SDValue Chain, SDValue Ptr,
+                              MachinePointerInfo PtrInfo,
+                              bool isVolatile, bool isNonTemporal,
+                              bool isInvariant, unsigned Alignment,
+                              const MDNode *TBAAInfo,
+                              const MDNode *Ranges) {
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+                 PtrInfo, VT, isVolatile, isNonTemporal, isInvariant, Alignment,
+                 TBAAInfo, Ranges);
+}
+
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, DebugLoc dl, EVT VT,
+                                 SDValue Chain, SDValue Ptr,
+                                 MachinePointerInfo PtrInfo, EVT MemVT,
+                                 bool isVolatile, bool isNonTemporal,
+                                 unsigned Alignment, const MDNode *TBAAInfo) {
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
+                 PtrInfo, MemVT, isVolatile, isNonTemporal, false, Alignment,
+                 TBAAInfo);
+}
+
+
+SDValue
+SelectionDAG::getIndexedLoad(SDValue OrigLoad, DebugLoc dl, SDValue Base,
+                             SDValue Offset, ISD::MemIndexedMode AM) {
+  LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
+  assert(LD->getOffset().getOpcode() == ISD::UNDEF &&
+         "Load is already a indexed load!");
+  return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
+                 LD->getChain(), Base, Offset, LD->getPointerInfo(),
+                 LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(),
+                 false, LD->getAlignment());
+}
+
+SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
+                               SDValue Ptr, MachinePointerInfo PtrInfo,
+                               bool isVolatile, bool isNonTemporal,
+                               unsigned Alignment, const MDNode *TBAAInfo) {
+  assert(Chain.getValueType() == MVT::Other &&
+        "Invalid chain type");
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(Val.getValueType());
+
+  unsigned Flags = MachineMemOperand::MOStore;
+  if (isVolatile)
+    Flags |= MachineMemOperand::MOVolatile;
+  if (isNonTemporal)
+    Flags |= MachineMemOperand::MONonTemporal;
+
+  if (PtrInfo.V == 0)
+    PtrInfo = InferPointerInfo(Ptr);
+
+  MachineFunction &MF = getMachineFunction();
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PtrInfo, Flags,
+                            Val.getValueType().getStoreSize(), Alignment,
+                            TBAAInfo);
+
+  return getStore(Chain, dl, Val, Ptr, MMO);
+}
+
+SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
+                               SDValue Ptr, MachineMemOperand *MMO) {
+  assert(Chain.getValueType() == MVT::Other &&
+        "Invalid chain type");
+  EVT VT = Val.getValueType();
+  SDVTList VTs = getVTList(MVT::Other);
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  SDValue Ops[] = { Chain, Val, Ptr, Undef };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
+  ID.AddInteger(VT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
+                                     MMO->isNonTemporal(), MMO->isInvariant()));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<StoreSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED,
+                                              false, VT, MMO);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
+                                    SDValue Ptr, MachinePointerInfo PtrInfo,
+                                    EVT SVT,bool isVolatile, bool isNonTemporal,
+                                    unsigned Alignment,
+                                    const MDNode *TBAAInfo) {
+  assert(Chain.getValueType() == MVT::Other &&
+        "Invalid chain type");
+  if (Alignment == 0)  // Ensure that codegen never sees alignment 0
+    Alignment = getEVTAlignment(SVT);
+
+  unsigned Flags = MachineMemOperand::MOStore;
+  if (isVolatile)
+    Flags |= MachineMemOperand::MOVolatile;
+  if (isNonTemporal)
+    Flags |= MachineMemOperand::MONonTemporal;
+
+  if (PtrInfo.V == 0)
+    PtrInfo = InferPointerInfo(Ptr);
+
+  MachineFunction &MF = getMachineFunction();
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PtrInfo, Flags, SVT.getStoreSize(), Alignment,
+                            TBAAInfo);
+
+  return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
+}
+
+SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
+                                    SDValue Ptr, EVT SVT,
+                                    MachineMemOperand *MMO) {
+  EVT VT = Val.getValueType();
+
+  assert(Chain.getValueType() == MVT::Other &&
+        "Invalid chain type");
+  if (VT == SVT)
+    return getStore(Chain, dl, Val, Ptr, MMO);
+
+  assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+         "Should only be a truncating store, not extending!");
+  assert(VT.isInteger() == SVT.isInteger() &&
+         "Can't do FP-INT conversion!");
+  assert(VT.isVector() == SVT.isVector() &&
+         "Cannot use trunc store to convert to or from a vector!");
+  assert((!VT.isVector() ||
+          VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
+         "Cannot use trunc store to change the number of vector elements!");
+
+  SDVTList VTs = getVTList(MVT::Other);
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  SDValue Ops[] = { Chain, Val, Ptr, Undef };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
+  ID.AddInteger(SVT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(true, ISD::UNINDEXED, MMO->isVolatile(),
+                                     MMO->isNonTemporal(), MMO->isInvariant()));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<StoreSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED,
+                                              true, SVT, MMO);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue
+SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base,
+                              SDValue Offset, ISD::MemIndexedMode AM) {
+  StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
+  assert(ST->getOffset().getOpcode() == ISD::UNDEF &&
+         "Store is already a indexed store!");
+  SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
+  SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4);
+  ID.AddInteger(ST->getMemoryVT().getRawBits());
+  ID.AddInteger(ST->getRawSubclassData());
+  ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl, VTs, AM,
+                                              ST->isTruncatingStore(),
+                                              ST->getMemoryVT(),
+                                              ST->getMemOperand());
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getVAArg(EVT VT, DebugLoc dl,
+                               SDValue Chain, SDValue Ptr,
+                               SDValue SV,
+                               unsigned Align) {
+  SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, MVT::i32) };
+  return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops, 4);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              const SDUse *Ops, unsigned NumOps) {
+  switch (NumOps) {
+  case 0: return getNode(Opcode, DL, VT);
+  case 1: return getNode(Opcode, DL, VT, Ops[0]);
+  case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
+  case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+  default: break;
+  }
+
+  // Copy from an SDUse array into an SDValue array for use with
+  // the regular getNode logic.
+  SmallVector<SDValue, 8> NewOps(Ops, Ops + NumOps);
+  return getNode(Opcode, DL, VT, &NewOps[0], NumOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+                              const SDValue *Ops, unsigned NumOps) {
+  switch (NumOps) {
+  case 0: return getNode(Opcode, DL, VT);
+  case 1: return getNode(Opcode, DL, VT, Ops[0]);
+  case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
+  case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+  default: break;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case ISD::SELECT_CC: {
+    assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
+    assert(Ops[0].getValueType() == Ops[1].getValueType() &&
+           "LHS and RHS of condition must have same type!");
+    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+           "True and False arms of SelectCC must have same type!");
+    assert(Ops[2].getValueType() == VT &&
+           "select_cc node must be of same type as true and false value!");
+    break;
+  }
+  case ISD::BR_CC: {
+    assert(NumOps == 5 && "BR_CC takes 5 operands!");
+    assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+           "LHS/RHS of comparison should match types!");
+    break;
+  }
+  }
+
+  // Memoize nodes.
+  SDNode *N;
+  SDVTList VTs = getVTList(VT);
+
+  if (VT != MVT::Glue) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTs, Ops, NumOps);
+    void *IP = 0;
+
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    N = new (NodeAllocator) SDNode(Opcode, DL, VTs, Ops, NumOps);
+    CSEMap.InsertNode(N, IP);
+  } else {
+    N = new (NodeAllocator) SDNode(Opcode, DL, VTs, Ops, NumOps);
+  }
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifySDNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+                              ArrayRef<EVT> ResultTys,
+                              const SDValue *Ops, unsigned NumOps) {
+  return getNode(Opcode, DL, getVTList(&ResultTys[0], ResultTys.size()),
+                 Ops, NumOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+                              const EVT *VTs, unsigned NumVTs,
+                              const SDValue *Ops, unsigned NumOps) {
+  if (NumVTs == 1)
+    return getNode(Opcode, DL, VTs[0], Ops, NumOps);
+  return getNode(Opcode, DL, makeVTList(VTs, NumVTs), Ops, NumOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              const SDValue *Ops, unsigned NumOps) {
+  if (VTList.NumVTs == 1)
+    return getNode(Opcode, DL, VTList.VTs[0], Ops, NumOps);
+
+#if 0
+  switch (Opcode) {
+  // FIXME: figure out how to safely handle things like
+  // int foo(int x) { return 1 << (x & 255); }
+  // int bar() { return foo(256); }
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:
+  case ISD::SHL_PARTS:
+    if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+        cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
+      return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+    else if (N3.getOpcode() == ISD::AND)
+      if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
+        // If the and is only masking out bits that cannot effect the shift,
+        // eliminate the and.
+        unsigned NumBits = VT.getScalarType().getSizeInBits()*2;
+        if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
+          return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+      }
+    break;
+  }
+#endif
+
+  // Memoize the node unless it returns a flag.
+  SDNode *N;
+  if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return SDValue(E, 0);
+
+    if (NumOps == 1) {
+      N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTList, Ops[0]);
+    } else if (NumOps == 2) {
+      N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTList, Ops[0], Ops[1]);
+    } else if (NumOps == 3) {
+      N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTList, Ops[0], Ops[1],
+                                            Ops[2]);
+    } else {
+      N = new (NodeAllocator) SDNode(Opcode, DL, VTList, Ops, NumOps);
+    }
+    CSEMap.InsertNode(N, IP);
+  } else {
+    if (NumOps == 1) {
+      N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTList, Ops[0]);
+    } else if (NumOps == 2) {
+      N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTList, Ops[0], Ops[1]);
+    } else if (NumOps == 3) {
+      N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTList, Ops[0], Ops[1],
+                                            Ops[2]);
+    } else {
+      N = new (NodeAllocator) SDNode(Opcode, DL, VTList, Ops, NumOps);
+    }
+  }
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifySDNode(N);
+#endif
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList) {
+  return getNode(Opcode, DL, VTList, 0, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1) {
+  SDValue Ops[] = { N1 };
+  return getNode(Opcode, DL, VTList, Ops, 1);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1, SDValue N2) {
+  SDValue Ops[] = { N1, N2 };
+  return getNode(Opcode, DL, VTList, Ops, 2);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1, SDValue N2, SDValue N3) {
+  SDValue Ops[] = { N1, N2, N3 };
+  return getNode(Opcode, DL, VTList, Ops, 3);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1, SDValue N2, SDValue N3,
+                              SDValue N4) {
+  SDValue Ops[] = { N1, N2, N3, N4 };
+  return getNode(Opcode, DL, VTList, Ops, 4);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+                              SDValue N1, SDValue N2, SDValue N3,
+                              SDValue N4, SDValue N5) {
+  SDValue Ops[] = { N1, N2, N3, N4, N5 };
+  return getNode(Opcode, DL, VTList, Ops, 5);
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT) {
+  return makeVTList(SDNode::getValueTypeList(VT), 1);
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
+  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+       E = VTList.rend(); I != E; ++I)
+    if (I->NumVTs == 2 && I->VTs[0] == VT1 && I->VTs[1] == VT2)
+      return *I;
+
+  EVT *Array = Allocator.Allocate<EVT>(2);
+  Array[0] = VT1;
+  Array[1] = VT2;
+  SDVTList Result = makeVTList(Array, 2);
+  VTList.push_back(Result);
+  return Result;
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
+  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+       E = VTList.rend(); I != E; ++I)
+    if (I->NumVTs == 3 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
+                          I->VTs[2] == VT3)
+      return *I;
+
+  EVT *Array = Allocator.Allocate<EVT>(3);
+  Array[0] = VT1;
+  Array[1] = VT2;
+  Array[2] = VT3;
+  SDVTList Result = makeVTList(Array, 3);
+  VTList.push_back(Result);
+  return Result;
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
+  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+       E = VTList.rend(); I != E; ++I)
+    if (I->NumVTs == 4 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
+                          I->VTs[2] == VT3 && I->VTs[3] == VT4)
+      return *I;
+
+  EVT *Array = Allocator.Allocate<EVT>(4);
+  Array[0] = VT1;
+  Array[1] = VT2;
+  Array[2] = VT3;
+  Array[3] = VT4;
+  SDVTList Result = makeVTList(Array, 4);
+  VTList.push_back(Result);
+  return Result;
+}
+
+SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) {
+  switch (NumVTs) {
+    case 0: llvm_unreachable("Cannot have nodes without results!");
+    case 1: return getVTList(VTs[0]);
+    case 2: return getVTList(VTs[0], VTs[1]);
+    case 3: return getVTList(VTs[0], VTs[1], VTs[2]);
+    case 4: return getVTList(VTs[0], VTs[1], VTs[2], VTs[3]);
+    default: break;
+  }
+
+  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
+       E = VTList.rend(); I != E; ++I) {
+    if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1])
+      continue;
+
+    if (std::equal(&VTs[2], &VTs[NumVTs], &I->VTs[2]))
+      return *I;
+  }
+
+  EVT *Array = Allocator.Allocate<EVT>(NumVTs);
+  std::copy(VTs, VTs+NumVTs, Array);
+  SDVTList Result = makeVTList(Array, NumVTs);
+  VTList.push_back(Result);
+  return Result;
+}
+
+
+/// UpdateNodeOperands - *Mutate* the specified node in-place to have the
+/// specified operands.  If the resultant node already exists in the DAG,
+/// this does not modify the specified node, instead it returns the node that
+/// already exists.  If the resultant node does not exist in the DAG, the
+/// input node is returned.  As a degenerate case, if you specify the same
+/// input operands as the node already has, the input node is returned.
+SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
+  assert(N->getNumOperands() == 1 && "Update with wrong number of operands");
+
+  // Check to see if there is no change.
+  if (Op == N->getOperand(0)) return N;
+
+  // See if the modified node already exists.
+  void *InsertPos = 0;
+  if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos))
+    return Existing;
+
+  // Nope it doesn't.  Remove the node from its current place in the maps.
+  if (InsertPos)
+    if (!RemoveNodeFromCSEMaps(N))
+      InsertPos = 0;
+
+  // Now we update the operands.
+  N->OperandList[0].set(Op);
+
+  // If this gets put into a CSE map, add it.
+  if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+  return N;
+}
+
+SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
+  assert(N->getNumOperands() == 2 && "Update with wrong number of operands");
+
+  // Check to see if there is no change.
+  if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1))
+    return N;   // No operands changed, just return the input node.
+
+  // See if the modified node already exists.
+  void *InsertPos = 0;
+  if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos))
+    return Existing;
+
+  // Nope it doesn't.  Remove the node from its current place in the maps.
+  if (InsertPos)
+    if (!RemoveNodeFromCSEMaps(N))
+      InsertPos = 0;
+
+  // Now we update the operands.
+  if (N->OperandList[0] != Op1)
+    N->OperandList[0].set(Op1);
+  if (N->OperandList[1] != Op2)
+    N->OperandList[1].set(Op2);
+
+  // If this gets put into a CSE map, add it.
+  if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+  return N;
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) {
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return UpdateNodeOperands(N, Ops, 3);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                   SDValue Op3, SDValue Op4) {
+  SDValue Ops[] = { Op1, Op2, Op3, Op4 };
+  return UpdateNodeOperands(N, Ops, 4);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+                   SDValue Op3, SDValue Op4, SDValue Op5) {
+  SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 };
+  return UpdateNodeOperands(N, Ops, 5);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, const SDValue *Ops, unsigned NumOps) {
+  assert(N->getNumOperands() == NumOps &&
+         "Update with wrong number of operands");
+
+  // Check to see if there is no change.
+  bool AnyChange = false;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    if (Ops[i] != N->getOperand(i)) {
+      AnyChange = true;
+      break;
+    }
+  }
+
+  // No operands changed, just return the input node.
+  if (!AnyChange) return N;
+
+  // See if the modified node already exists.
+  void *InsertPos = 0;
+  if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, NumOps, InsertPos))
+    return Existing;
+
+  // Nope it doesn't.  Remove the node from its current place in the maps.
+  if (InsertPos)
+    if (!RemoveNodeFromCSEMaps(N))
+      InsertPos = 0;
+
+  // Now we update the operands.
+  for (unsigned i = 0; i != NumOps; ++i)
+    if (N->OperandList[i] != Ops[i])
+      N->OperandList[i].set(Ops[i]);
+
+  // If this gets put into a CSE map, add it.
+  if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+  return N;
+}
+
+/// DropOperands - Release the operands and set this node to have
+/// zero operands.
+void SDNode::DropOperands() {
+  // Unlike the code in MorphNodeTo that does this, we don't need to
+  // watch for dead nodes here.
+  for (op_iterator I = op_begin(), E = op_end(); I != E; ) {
+    SDUse &Use = *I++;
+    Use.set(SDValue());
+  }
+}
+
+/// SelectNodeTo - These are wrappers around MorphNodeTo that accept a
+/// machine opcode.
+///
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT) {
+  SDVTList VTs = getVTList(VT);
+  return SelectNodeTo(N, MachineOpc, VTs, 0, 0);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT, SDValue Op1) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 1);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT, SDValue Op1,
+                                   SDValue Op2) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1, Op2 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 2);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT, SDValue Op1,
+                                   SDValue Op2, SDValue Op3) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT, const SDValue *Ops,
+                                   unsigned NumOps) {
+  SDVTList VTs = getVTList(VT);
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2, const SDValue *Ops,
+                                   unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  return SelectNodeTo(N, MachineOpc, VTs, (SDValue *)0, 0);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2, EVT VT3,
+                                   const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2, EVT VT3, EVT VT4,
+                                   const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, NumOps);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2,
+                                   SDValue Op1) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 1);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2,
+                                   SDValue Op1, SDValue Op2) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1, Op2 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 2);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2,
+                                   SDValue Op1, SDValue Op2,
+                                   SDValue Op3) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   EVT VT1, EVT VT2, EVT VT3,
+                                   SDValue Op1, SDValue Op2,
+                                   SDValue Op3) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return SelectNodeTo(N, MachineOpc, VTs, Ops, 3);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+                                   SDVTList VTs, const SDValue *Ops,
+                                   unsigned NumOps) {
+  N = MorphNodeTo(N, ~MachineOpc, VTs, Ops, NumOps);
+  // Reset the NodeID to -1.
+  N->setNodeId(-1);
+  return N;
+}
+
+/// UpdadeDebugLocOnMergedSDNode - If the opt level is -O0 then it throws away
+/// the line number information on the merged node since it is not possible to
+/// preserve the information that operation is associated with multiple lines.
+/// This will make the debugger working better at -O0, were there is a higher
+/// probability having other instructions associated with that line.
+///
+SDNode *SelectionDAG::UpdadeDebugLocOnMergedSDNode(SDNode *N, DebugLoc OLoc) {
+  DebugLoc NLoc = N->getDebugLoc();
+  if (!(NLoc.isUnknown()) && (OptLevel == CodeGenOpt::None) && (OLoc != NLoc)) {
+    N->setDebugLoc(DebugLoc());
+  }
+  return N;
+}
+
+/// MorphNodeTo - This *mutates* the specified node to have the specified
+/// return type, opcode, and operands.
+///
+/// Note that MorphNodeTo returns the resultant node.  If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one.  Note that the DebugLoc need not be the same.
+///
+/// Using MorphNodeTo is faster than creating a new node and swapping it in
+/// with ReplaceAllUsesWith both because it often avoids allocating a new
+/// node, and because it doesn't require CSE recalculation for any of
+/// the node's users.
+///
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+                                  SDVTList VTs, const SDValue *Ops,
+                                  unsigned NumOps) {
+  // If an identical node already exists, use it.
+  void *IP = 0;
+  if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opc, VTs, Ops, NumOps);
+    if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return UpdadeDebugLocOnMergedSDNode(ON, N->getDebugLoc());
+  }
+
+  if (!RemoveNodeFromCSEMaps(N))
+    IP = 0;
+
+  // Start the morphing.
+  N->NodeType = Opc;
+  N->ValueList = VTs.VTs;
+  N->NumValues = VTs.NumVTs;
+
+  // Clear the operands list, updating used nodes to remove this from their
+  // use list.  Keep track of any operands that become dead as a result.
+  SmallPtrSet<SDNode*, 16> DeadNodeSet;
+  for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+    SDUse &Use = *I++;
+    SDNode *Used = Use.getNode();
+    Use.set(SDValue());
+    if (Used->use_empty())
+      DeadNodeSet.insert(Used);
+  }
+
+  if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N)) {
+    // Initialize the memory references information.
+    MN->setMemRefs(0, 0);
+    // If NumOps is larger than the # of operands we can have in a
+    // MachineSDNode, reallocate the operand list.
+    if (NumOps > MN->NumOperands || !MN->OperandsNeedDelete) {
+      if (MN->OperandsNeedDelete)
+        delete[] MN->OperandList;
+      if (NumOps > array_lengthof(MN->LocalOperands))
+        // We're creating a final node that will live unmorphed for the
+        // remainder of the current SelectionDAG iteration, so we can allocate
+        // the operands directly out of a pool with no recycling metadata.
+        MN->InitOperands(OperandAllocator.Allocate<SDUse>(NumOps),
+                         Ops, NumOps);
+      else
+        MN->InitOperands(MN->LocalOperands, Ops, NumOps);
+      MN->OperandsNeedDelete = false;
+    } else
+      MN->InitOperands(MN->OperandList, Ops, NumOps);
+  } else {
+    // If NumOps is larger than the # of operands we currently have, reallocate
+    // the operand list.
+    if (NumOps > N->NumOperands) {
+      if (N->OperandsNeedDelete)
+        delete[] N->OperandList;
+      N->InitOperands(new SDUse[NumOps], Ops, NumOps);
+      N->OperandsNeedDelete = true;
+    } else
+      N->InitOperands(N->OperandList, Ops, NumOps);
+  }
+
+  // Delete any nodes that are still dead after adding the uses for the
+  // new operands.
+  if (!DeadNodeSet.empty()) {
+    SmallVector<SDNode *, 16> DeadNodes;
+    for (SmallPtrSet<SDNode *, 16>::iterator I = DeadNodeSet.begin(),
+         E = DeadNodeSet.end(); I != E; ++I)
+      if ((*I)->use_empty())
+        DeadNodes.push_back(*I);
+    RemoveDeadNodes(DeadNodes);
+  }
+
+  if (IP)
+    CSEMap.InsertNode(N, IP);   // Memoize the new node.
+  return N;
+}
+
+
+/// getMachineNode - These are used for target selectors to create a new node
+/// with specified return type(s), MachineInstr opcode, and operands.
+///
+/// Note that getMachineNode returns the resultant node.  If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one.
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT) {
+  SDVTList VTs = getVTList(VT);
+  return getMachineNode(Opcode, dl, VTs, 0, 0);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT, SDValue Op1) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
+                             SDValue Op1, SDValue Op2) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1, Op2 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
+                             SDValue Op1, SDValue Op2, SDValue Op3) {
+  SDVTList VTs = getVTList(VT);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT);
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT1, EVT VT2) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  return getMachineNode(Opcode, dl, VTs, 0, 0);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, SDValue Op1) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1, Op2 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, SDValue Op1,
+                             SDValue Op2, SDValue Op3) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2);
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, EVT VT3,
+                             SDValue Op1, SDValue Op2) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  SDValue Ops[] = { Op1, Op2 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, EVT VT3,
+                             SDValue Op1, SDValue Op2, SDValue Op3) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  SDValue Ops[] = { Op1, Op2, Op3 };
+  return getMachineNode(Opcode, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             EVT VT1, EVT VT2, EVT VT3,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3);
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT1,
+                             EVT VT2, EVT VT3, EVT VT4,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+                             ArrayRef<EVT> ResultTys,
+                             const SDValue *Ops, unsigned NumOps) {
+  SDVTList VTs = getVTList(&ResultTys[0], ResultTys.size());
+  return getMachineNode(Opcode, dl, VTs, Ops, NumOps);
+}
+
+MachineSDNode *
+SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc DL, SDVTList VTs,
+                             const SDValue *Ops, unsigned NumOps) {
+  bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue;
+  MachineSDNode *N;
+  void *IP = 0;
+
+  if (DoCSE) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, ~Opcode, VTs, Ops, NumOps);
+    IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+      return cast<MachineSDNode>(UpdadeDebugLocOnMergedSDNode(E, DL));
+    }
+  }
+
+  // Allocate a new MachineSDNode.
+  N = new (NodeAllocator) MachineSDNode(~Opcode, DL, VTs);
+
+  // Initialize the operands list.
+  if (NumOps > array_lengthof(N->LocalOperands))
+    // We're creating a final node that will live unmorphed for the
+    // remainder of the current SelectionDAG iteration, so we can allocate
+    // the operands directly out of a pool with no recycling metadata.
+    N->InitOperands(OperandAllocator.Allocate<SDUse>(NumOps),
+                    Ops, NumOps);
+  else
+    N->InitOperands(N->LocalOperands, Ops, NumOps);
+  N->OperandsNeedDelete = false;
+
+  if (DoCSE)
+    CSEMap.InsertNode(N, IP);
+
+  AllNodes.push_back(N);
+#ifndef NDEBUG
+  VerifyMachineNode(N);
+#endif
+  return N;
+}
+
+/// getTargetExtractSubreg - A convenience function for creating
+/// TargetOpcode::EXTRACT_SUBREG nodes.
+SDValue
+SelectionDAG::getTargetExtractSubreg(int SRIdx, DebugLoc DL, EVT VT,
+                                     SDValue Operand) {
+  SDValue SRIdxVal = getTargetConstant(SRIdx, MVT::i32);
+  SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+                                  VT, Operand, SRIdxVal);
+  return SDValue(Subreg, 0);
+}
+
+/// getTargetInsertSubreg - A convenience function for creating
+/// TargetOpcode::INSERT_SUBREG nodes.
+SDValue
+SelectionDAG::getTargetInsertSubreg(int SRIdx, DebugLoc DL, EVT VT,
+                                    SDValue Operand, SDValue Subreg) {
+  SDValue SRIdxVal = getTargetConstant(SRIdx, MVT::i32);
+  SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
+                                  VT, Operand, Subreg, SRIdxVal);
+  return SDValue(Result, 0);
+}
+
+/// getNodeIfExists - Get the specified node if it's already available, or
+/// else return NULL.
+SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
+                                      const SDValue *Ops, unsigned NumOps) {
+  if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+    FoldingSetNodeID ID;
+    AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+    void *IP = 0;
+    if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+      return E;
+  }
+  return NULL;
+}
+
+/// getDbgValue - Creates a SDDbgValue node.
+///
+SDDbgValue *
+SelectionDAG::getDbgValue(MDNode *MDPtr, SDNode *N, unsigned R, uint64_t Off,
+                          DebugLoc DL, unsigned O) {
+  return new (Allocator) SDDbgValue(MDPtr, N, R, Off, DL, O);
+}
+
+SDDbgValue *
+SelectionDAG::getDbgValue(MDNode *MDPtr, const Value *C, uint64_t Off,
+                          DebugLoc DL, unsigned O) {
+  return new (Allocator) SDDbgValue(MDPtr, C, Off, DL, O);
+}
+
+SDDbgValue *
+SelectionDAG::getDbgValue(MDNode *MDPtr, unsigned FI, uint64_t Off,
+                          DebugLoc DL, unsigned O) {
+  return new (Allocator) SDDbgValue(MDPtr, FI, Off, DL, O);
+}
+
+namespace {
+
+/// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node
+/// pointed to by a use iterator is deleted, increment the use iterator
+/// so that it doesn't dangle.
+///
+class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
+  SDNode::use_iterator &UI;
+  SDNode::use_iterator &UE;
+
+  virtual void NodeDeleted(SDNode *N, SDNode *E) {
+    // Increment the iterator as needed.
+    while (UI != UE && N == *UI)
+      ++UI;
+  }
+
+public:
+  RAUWUpdateListener(SelectionDAG &d,
+                     SDNode::use_iterator &ui,
+                     SDNode::use_iterator &ue)
+    : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
+};
+
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes From has a single result value.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
+  SDNode *From = FromN.getNode();
+  assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
+         "Cannot replace with this method!");
+  assert(From != To.getNode() && "Cannot replace uses of with self");
+
+  // Iterate over all the existing uses of From. New uses will be added
+  // to the beginning of the use list, which we avoid visiting.
+  // This specifically avoids visiting uses of From that arise while the
+  // replacement is happening, because any such uses would be the result
+  // of CSE: If an existing node looks like From after one of its operands
+  // is replaced by To, we don't want to replace of all its users with To
+  // too. See PR3018 for more info.
+  SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+  RAUWUpdateListener Listener(*this, UI, UE);
+  while (UI != UE) {
+    SDNode *User = *UI;
+
+    // This node is about to morph, remove its old self from the CSE maps.
+    RemoveNodeFromCSEMaps(User);
+
+    // A user can appear in a use list multiple times, and when this
+    // happens the uses are usually next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      SDUse &Use = UI.getUse();
+      ++UI;
+      Use.set(To);
+    } while (UI != UE && *UI == User);
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User);
+  }
+
+  // If we just RAUW'd the root, take note.
+  if (FromN == getRoot())
+    setRoot(To);
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes that for each value of From, there is a
+/// corresponding value in To in the same position with the same type.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) {
+#ifndef NDEBUG
+  for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+    assert((!From->hasAnyUseOfValue(i) ||
+            From->getValueType(i) == To->getValueType(i)) &&
+           "Cannot use this version of ReplaceAllUsesWith!");
+#endif
+
+  // Handle the trivial case.
+  if (From == To)
+    return;
+
+  // Iterate over just the existing users of From. See the comments in
+  // the ReplaceAllUsesWith above.
+  SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+  RAUWUpdateListener Listener(*this, UI, UE);
+  while (UI != UE) {
+    SDNode *User = *UI;
+
+    // This node is about to morph, remove its old self from the CSE maps.
+    RemoveNodeFromCSEMaps(User);
+
+    // A user can appear in a use list multiple times, and when this
+    // happens the uses are usually next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      SDUse &Use = UI.getUse();
+      ++UI;
+      Use.setNode(To);
+    } while (UI != UE && *UI == User);
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User);
+  }
+
+  // If we just RAUW'd the root, take note.
+  if (From == getRoot().getNode())
+    setRoot(SDValue(To, getRoot().getResNo()));
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version can replace From with any result values.  To must match the
+/// number and types of values returned by From.
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) {
+  if (From->getNumValues() == 1)  // Handle the simple case efficiently.
+    return ReplaceAllUsesWith(SDValue(From, 0), To[0]);
+
+  // Iterate over just the existing users of From. See the comments in
+  // the ReplaceAllUsesWith above.
+  SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+  RAUWUpdateListener Listener(*this, UI, UE);
+  while (UI != UE) {
+    SDNode *User = *UI;
+
+    // This node is about to morph, remove its old self from the CSE maps.
+    RemoveNodeFromCSEMaps(User);
+
+    // A user can appear in a use list multiple times, and when this
+    // happens the uses are usually next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      SDUse &Use = UI.getUse();
+      const SDValue &ToOp = To[Use.getResNo()];
+      ++UI;
+      Use.set(ToOp);
+    } while (UI != UE && *UI == User);
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User);
+  }
+
+  // If we just RAUW'd the root, take note.
+  if (From == getRoot().getNode())
+    setRoot(SDValue(To[getRoot().getResNo()]));
+}
+
+/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone.  The Deleted
+/// vector is handled the same way as for ReplaceAllUsesWith.
+void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
+  // Handle the really simple, really trivial case efficiently.
+  if (From == To) return;
+
+  // Handle the simple, trivial, case efficiently.
+  if (From.getNode()->getNumValues() == 1) {
+    ReplaceAllUsesWith(From, To);
+    return;
+  }
+
+  // Iterate over just the existing users of From. See the comments in
+  // the ReplaceAllUsesWith above.
+  SDNode::use_iterator UI = From.getNode()->use_begin(),
+                       UE = From.getNode()->use_end();
+  RAUWUpdateListener Listener(*this, UI, UE);
+  while (UI != UE) {
+    SDNode *User = *UI;
+    bool UserRemovedFromCSEMaps = false;
+
+    // A user can appear in a use list multiple times, and when this
+    // happens the uses are usually next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      SDUse &Use = UI.getUse();
+
+      // Skip uses of different values from the same node.
+      if (Use.getResNo() != From.getResNo()) {
+        ++UI;
+        continue;
+      }
+
+      // If this node hasn't been modified yet, it's still in the CSE maps,
+      // so remove its old self from the CSE maps.
+      if (!UserRemovedFromCSEMaps) {
+        RemoveNodeFromCSEMaps(User);
+        UserRemovedFromCSEMaps = true;
+      }
+
+      ++UI;
+      Use.set(To);
+    } while (UI != UE && *UI == User);
+
+    // We are iterating over all uses of the From node, so if a use
+    // doesn't use the specific value, no changes are made.
+    if (!UserRemovedFromCSEMaps)
+      continue;
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User);
+  }
+
+  // If we just RAUW'd the root, take note.
+  if (From == getRoot())
+    setRoot(To);
+}
+
+namespace {
+  /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith
+  /// to record information about a use.
+  struct UseMemo {
+    SDNode *User;
+    unsigned Index;
+    SDUse *Use;
+  };
+
+  /// operator< - Sort Memos by User.
+  bool operator<(const UseMemo &L, const UseMemo &R) {
+    return (intptr_t)L.User < (intptr_t)R.User;
+  }
+}
+
+/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone.  The same value
+/// may appear in both the From and To list.  The Deleted vector is
+/// handled the same way as for ReplaceAllUsesWith.
+void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
+                                              const SDValue *To,
+                                              unsigned Num){
+  // Handle the simple, trivial case efficiently.
+  if (Num == 1)
+    return ReplaceAllUsesOfValueWith(*From, *To);
+
+  // Read up all the uses and make records of them. This helps
+  // processing new uses that are introduced during the
+  // replacement process.
+  SmallVector<UseMemo, 4> Uses;
+  for (unsigned i = 0; i != Num; ++i) {
+    unsigned FromResNo = From[i].getResNo();
+    SDNode *FromNode = From[i].getNode();
+    for (SDNode::use_iterator UI = FromNode->use_begin(),
+         E = FromNode->use_end(); UI != E; ++UI) {
+      SDUse &Use = UI.getUse();
+      if (Use.getResNo() == FromResNo) {
+        UseMemo Memo = { *UI, i, &Use };
+        Uses.push_back(Memo);
+      }
+    }
+  }
+
+  // Sort the uses, so that all the uses from a given User are together.
+  std::sort(Uses.begin(), Uses.end());
+
+  for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
+       UseIndex != UseIndexEnd; ) {
+    // We know that this user uses some value of From.  If it is the right
+    // value, update it.
+    SDNode *User = Uses[UseIndex].User;
+
+    // This node is about to morph, remove its old self from the CSE maps.
+    RemoveNodeFromCSEMaps(User);
+
+    // The Uses array is sorted, so all the uses for a given User
+    // are next to each other in the list.
+    // To help reduce the number of CSE recomputations, process all
+    // the uses of this user that we can find this way.
+    do {
+      unsigned i = Uses[UseIndex].Index;
+      SDUse &Use = *Uses[UseIndex].Use;
+      ++UseIndex;
+
+      Use.set(To[i]);
+    } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);
+
+    // Now that we have modified User, add it back to the CSE maps.  If it
+    // already exists there, recursively merge the results together.
+    AddModifiedNodeToCSEMaps(User);
+  }
+}
+
+/// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
+/// based on their topological order. It returns the maximum id and a vector
+/// of the SDNodes* in assigned order by reference.
+unsigned SelectionDAG::AssignTopologicalOrder() {
+
+  unsigned DAGSize = 0;
+
+  // SortedPos tracks the progress of the algorithm. Nodes before it are
+  // sorted, nodes after it are unsorted. When the algorithm completes
+  // it is at the end of the list.
+  allnodes_iterator SortedPos = allnodes_begin();
+
+  // Visit all the nodes. Move nodes with no operands to the front of
+  // the list immediately. Annotate nodes that do have operands with their
+  // operand count. Before we do this, the Node Id fields of the nodes
+  // may contain arbitrary values. After, the Node Id fields for nodes
+  // before SortedPos will contain the topological sort index, and the
+  // Node Id fields for nodes At SortedPos and after will contain the
+  // count of outstanding operands.
+  for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
+    SDNode *N = I++;
+    checkForCycles(N);
+    unsigned Degree = N->getNumOperands();
+    if (Degree == 0) {
+      // A node with no uses, add it to the result array immediately.
+      N->setNodeId(DAGSize++);
+      allnodes_iterator Q = N;
+      if (Q != SortedPos)
+        SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
+      assert(SortedPos != AllNodes.end() && "Overran node list");
+      ++SortedPos;
+    } else {
+      // Temporarily use the Node Id as scratch space for the degree count.
+      N->setNodeId(Degree);
+    }
+  }
+
+  // Visit all the nodes. As we iterate, move nodes into sorted order,
+  // such that by the time the end is reached all nodes will be sorted.
+  for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ++I) {
+    SDNode *N = I;
+    checkForCycles(N);
+    // N is in sorted position, so all its uses have one less operand
+    // that needs to be sorted.
+    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+         UI != UE; ++UI) {
+      SDNode *P = *UI;
+      unsigned Degree = P->getNodeId();
+      assert(Degree != 0 && "Invalid node degree");
+      --Degree;
+      if (Degree == 0) {
+        // All of P's operands are sorted, so P may sorted now.
+        P->setNodeId(DAGSize++);
+        if (P != SortedPos)
+          SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
+        assert(SortedPos != AllNodes.end() && "Overran node list");
+        ++SortedPos;
+      } else {
+        // Update P's outstanding operand count.
+        P->setNodeId(Degree);
+      }
+    }
+    if (I == SortedPos) {
+#ifndef NDEBUG
+      SDNode *S = ++I;
+      dbgs() << "Overran sorted position:\n";
+      S->dumprFull();
+#endif
+      llvm_unreachable(0);
+    }
+  }
+
+  assert(SortedPos == AllNodes.end() &&
+         "Topological sort incomplete!");
+  assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
+         "First node in topological sort is not the entry token!");
+  assert(AllNodes.front().getNodeId() == 0 &&
+         "First node in topological sort has non-zero id!");
+  assert(AllNodes.front().getNumOperands() == 0 &&
+         "First node in topological sort has operands!");
+  assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
+         "Last node in topologic sort has unexpected id!");
+  assert(AllNodes.back().use_empty() &&
+         "Last node in topologic sort has users!");
+  assert(DAGSize == allnodes_size() && "Node count mismatch!");
+  return DAGSize;
+}
+
+/// AssignOrdering - Assign an order to the SDNode.
+void SelectionDAG::AssignOrdering(const SDNode *SD, unsigned Order) {
+  assert(SD && "Trying to assign an order to a null node!");
+  Ordering->add(SD, Order);
+}
+
+/// GetOrdering - Get the order for the SDNode.
+unsigned SelectionDAG::GetOrdering(const SDNode *SD) const {
+  assert(SD && "Trying to get the order of a null node!");
+  return Ordering->getOrder(SD);
+}
+
+/// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
+/// value is produced by SD.
+void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
+  DbgInfo->add(DB, SD, isParameter);
+  if (SD)
+    SD->setHasDebugValue(true);
+}
+
+/// TransferDbgValues - Transfer SDDbgValues.
+void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) {
+  if (From == To || !From.getNode()->getHasDebugValue())
+    return;
+  SDNode *FromNode = From.getNode();
+  SDNode *ToNode = To.getNode();
+  ArrayRef<SDDbgValue *> DVs = GetDbgValues(FromNode);
+  SmallVector<SDDbgValue *, 2> ClonedDVs;
+  for (ArrayRef<SDDbgValue *>::iterator I = DVs.begin(), E = DVs.end();
+       I != E; ++I) {
+    SDDbgValue *Dbg = *I;
+    if (Dbg->getKind() == SDDbgValue::SDNODE) {
+      SDDbgValue *Clone = getDbgValue(Dbg->getMDPtr(), ToNode, To.getResNo(),
+                                      Dbg->getOffset(), Dbg->getDebugLoc(),
+                                      Dbg->getOrder());
+      ClonedDVs.push_back(Clone);
+    }
+  }
+  for (SmallVector<SDDbgValue *, 2>::iterator I = ClonedDVs.begin(),
+         E = ClonedDVs.end(); I != E; ++I)
+    AddDbgValue(*I, ToNode, false);
+}
+
+//===----------------------------------------------------------------------===//
+//                              SDNode Class
+//===----------------------------------------------------------------------===//
+
+HandleSDNode::~HandleSDNode() {
+  DropOperands();
+}
+
+GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, DebugLoc DL,
+                                         const GlobalValue *GA,
+                                         EVT VT, int64_t o, unsigned char TF)
+  : SDNode(Opc, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
+  TheGlobal = GA;
+}
+
+MemSDNode::MemSDNode(unsigned Opc, DebugLoc dl, SDVTList VTs, EVT memvt,
+                     MachineMemOperand *mmo)
+ : SDNode(Opc, dl, VTs), MemoryVT(memvt), MMO(mmo) {
+  SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
+                                      MMO->isNonTemporal(), MMO->isInvariant());
+  assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
+  assert(isNonTemporal() == MMO->isNonTemporal() &&
+         "Non-temporal encoding error!");
+  assert(memvt.getStoreSize() == MMO->getSize() && "Size mismatch!");
+}
+
+MemSDNode::MemSDNode(unsigned Opc, DebugLoc dl, SDVTList VTs,
+                     const SDValue *Ops, unsigned NumOps, EVT memvt,
+                     MachineMemOperand *mmo)
+   : SDNode(Opc, dl, VTs, Ops, NumOps),
+     MemoryVT(memvt), MMO(mmo) {
+  SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
+                                      MMO->isNonTemporal(), MMO->isInvariant());
+  assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
+  assert(memvt.getStoreSize() == MMO->getSize() && "Size mismatch!");
+}
+
+/// Profile - Gather unique data for the node.
+///
+void SDNode::Profile(FoldingSetNodeID &ID) const {
+  AddNodeIDNode(ID, this);
+}
+
+namespace {
+  struct EVTArray {
+    std::vector<EVT> VTs;
+
+    EVTArray() {
+      VTs.reserve(MVT::LAST_VALUETYPE);
+      for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i)
+        VTs.push_back(MVT((MVT::SimpleValueType)i));
+    }
+  };
+}
+
+static ManagedStatic<std::set<EVT, EVT::compareRawBits> > EVTs;
+static ManagedStatic<EVTArray> SimpleVTArray;
+static ManagedStatic<sys::SmartMutex<true> > VTMutex;
+
+/// getValueTypeList - Return a pointer to the specified value type.
+///
+const EVT *SDNode::getValueTypeList(EVT VT) {
+  if (VT.isExtended()) {
+    sys::SmartScopedLock<true> Lock(*VTMutex);
+    return &(*EVTs->insert(VT).first);
+  } else {
+    assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE &&
+           "Value type out of range!");
+    return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy];
+  }
+}
+
+/// hasNUsesOfValue - Return true if there are exactly NUSES uses of the
+/// indicated value.  This method ignores uses of other values defined by this
+/// operation.
+bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
+  assert(Value < getNumValues() && "Bad value!");
+
+  // TODO: Only iterate over uses of a given value of the node
+  for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
+    if (UI.getUse().getResNo() == Value) {
+      if (NUses == 0)
+        return false;
+      --NUses;
+    }
+  }
+
+  // Found exactly the right number of uses?
+  return NUses == 0;
+}
+
+
+/// hasAnyUseOfValue - Return true if there are any use of the indicated
+/// value. This method ignores uses of other values defined by this operation.
+bool SDNode::hasAnyUseOfValue(unsigned Value) const {
+  assert(Value < getNumValues() && "Bad value!");
+
+  for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI)
+    if (UI.getUse().getResNo() == Value)
+      return true;
+
+  return false;
+}
+
+
+/// isOnlyUserOf - Return true if this node is the only use of N.
+///
+bool SDNode::isOnlyUserOf(SDNode *N) const {
+  bool Seen = false;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDNode *User = *I;
+    if (User == this)
+      Seen = true;
+    else
+      return false;
+  }
+
+  return Seen;
+}
+
+/// isOperand - Return true if this node is an operand of N.
+///
+bool SDValue::isOperandOf(SDNode *N) const {
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    if (*this == N->getOperand(i))
+      return true;
+  return false;
+}
+
+bool SDNode::isOperandOf(SDNode *N) const {
+  for (unsigned i = 0, e = N->NumOperands; i != e; ++i)
+    if (this == N->OperandList[i].getNode())
+      return true;
+  return false;
+}
+
+/// reachesChainWithoutSideEffects - Return true if this operand (which must
+/// be a chain) reaches the specified operand without crossing any
+/// side-effecting instructions on any chain path.  In practice, this looks
+/// through token factors and non-volatile loads.  In order to remain efficient,
+/// this only looks a couple of nodes in, it does not do an exhaustive search.
+bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
+                                               unsigned Depth) const {
+  if (*this == Dest) return true;
+
+  // Don't search too deeply, we just want to be able to see through
+  // TokenFactor's etc.
+  if (Depth == 0) return false;
+
+  // If this is a token factor, all inputs to the TF happen in parallel.  If any
+  // of the operands of the TF does not reach dest, then we cannot do the xform.
+  if (getOpcode() == ISD::TokenFactor) {
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+      if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1))
+        return false;
+    return true;
+  }
+
+  // Loads don't have side effects, look through them.
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
+    if (!Ld->isVolatile())
+      return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
+  }
+  return false;
+}
+
+/// hasPredecessor - Return true if N is a predecessor of this node.
+/// N is either an operand of this node, or can be reached by recursively
+/// traversing up the operands.
+/// NOTE: This is an expensive method. Use it carefully.
+bool SDNode::hasPredecessor(const SDNode *N) const {
+  SmallPtrSet<const SDNode *, 32> Visited;
+  SmallVector<const SDNode *, 16> Worklist;
+  return hasPredecessorHelper(N, Visited, Worklist);
+}
+
+bool SDNode::hasPredecessorHelper(const SDNode *N,
+                                  SmallPtrSet<const SDNode *, 32> &Visited,
+                                  SmallVector<const SDNode *, 16> &Worklist) const {
+  if (Visited.empty()) {
+    Worklist.push_back(this);
+  } else {
+    // Take a look in the visited set. If we've already encountered this node
+    // we needn't search further.
+    if (Visited.count(N))
+      return true;
+  }
+
+  // Haven't visited N yet. Continue the search.
+  while (!Worklist.empty()) {
+    const SDNode *M = Worklist.pop_back_val();
+    for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) {
+      SDNode *Op = M->getOperand(i).getNode();
+      if (Visited.insert(Op))
+        Worklist.push_back(Op);
+      if (Op == N)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
+  assert(Num < NumOperands && "Invalid child # of SDNode!");
+  return cast<ConstantSDNode>(OperandList[Num])->getZExtValue();
+}
+
+SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
+  assert(N->getNumValues() == 1 &&
+         "Can't unroll a vector with multiple results!");
+
+  EVT VT = N->getValueType(0);
+  unsigned NE = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Scalars;
+  SmallVector<SDValue, 4> Operands(N->getNumOperands());
+
+  // If ResNE is 0, fully unroll the vector op.
+  if (ResNE == 0)
+    ResNE = NE;
+  else if (NE > ResNE)
+    NE = ResNE;
+
+  unsigned i;
+  for (i= 0; i != NE; ++i) {
+    for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) {
+      SDValue Operand = N->getOperand(j);
+      EVT OperandVT = Operand.getValueType();
+      if (OperandVT.isVector()) {
+        // A vector operand; extract a single element.
+        EVT OperandEltVT = OperandVT.getVectorElementType();
+        Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              OperandEltVT,
+                              Operand,
+                              getConstant(i, TLI.getPointerTy()));
+      } else {
+        // A scalar operand; just use it as is.
+        Operands[j] = Operand;
+      }
+    }
+
+    switch (N->getOpcode()) {
+    default:
+      Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
+                                &Operands[0], Operands.size()));
+      break;
+    case ISD::VSELECT:
+      Scalars.push_back(getNode(ISD::SELECT, dl, EltVT,
+                                &Operands[0], Operands.size()));
+      break;
+    case ISD::SHL:
+    case ISD::SRA:
+    case ISD::SRL:
+    case ISD::ROTL:
+    case ISD::ROTR:
+      Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
+                                getShiftAmountOperand(Operands[0].getValueType(),
+                                                      Operands[1])));
+      break;
+    case ISD::SIGN_EXTEND_INREG:
+    case ISD::FP_ROUND_INREG: {
+      EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
+      Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
+                                Operands[0],
+                                getValueType(ExtVT)));
+    }
+    }
+  }
+
+  for (; i < ResNE; ++i)
+    Scalars.push_back(getUNDEF(EltVT));
+
+  return getNode(ISD::BUILD_VECTOR, dl,
+                 EVT::getVectorVT(*getContext(), EltVT, ResNE),
+                 &Scalars[0], Scalars.size());
+}
+
+
+/// isConsecutiveLoad - Return true if LD is loading 'Bytes' bytes from a
+/// location that is 'Dist' units away from the location that the 'Base' load
+/// is loading from.
+bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base,
+                                     unsigned Bytes, int Dist) const {
+  if (LD->getChain() != Base->getChain())
+    return false;
+  EVT VT = LD->getValueType(0);
+  if (VT.getSizeInBits() / 8 != Bytes)
+    return false;
+
+  SDValue Loc = LD->getOperand(1);
+  SDValue BaseLoc = Base->getOperand(1);
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    const MachineFrameInfo *MFI = getMachineFunction().getFrameInfo();
+    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS  = MFI->getObjectSize(FI);
+    int BFS = MFI->getObjectSize(BFI);
+    if (FS != BFS || FS != (int)Bytes) return false;
+    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+  }
+
+  // Handle X+C
+  if (isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
+      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
+    return true;
+
+  const GlobalValue *GV1 = NULL;
+  const GlobalValue *GV2 = NULL;
+  int64_t Offset1 = 0;
+  int64_t Offset2 = 0;
+  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+  if (isGA1 && isGA2 && GV1 == GV2)
+    return Offset1 == (Offset2 + Dist*Bytes);
+  return false;
+}
+
+
+/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
+/// it cannot be inferred.
+unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
+  // If this is a GlobalAddress + cst, return the alignment.
+  const GlobalValue *GV;
+  int64_t GVOffset = 0;
+  if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
+    unsigned PtrWidth = TLI.getPointerTy().getSizeInBits();
+    APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0);
+    llvm::ComputeMaskedBits(const_cast<GlobalValue*>(GV), KnownZero, KnownOne,
+                            TLI.getDataLayout());
+    unsigned AlignBits = KnownZero.countTrailingOnes();
+    unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
+    if (Align)
+      return MinAlign(Align, GVOffset);
+  }
+
+  // If this is a direct reference to a stack slot, use information about the
+  // stack slot's alignment.
+  int FrameIdx = 1 << 31;
+  int64_t FrameOffset = 0;
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
+    FrameIdx = FI->getIndex();
+  } else if (isBaseWithConstantOffset(Ptr) &&
+             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+    // Handle FI+Cst
+    FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+    FrameOffset = Ptr.getConstantOperandVal(1);
+  }
+
+  if (FrameIdx != (1 << 31)) {
+    const MachineFrameInfo &MFI = *getMachineFunction().getFrameInfo();
+    unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
+                                    FrameOffset);
+    return FIInfoAlign;
+  }
+
+  return 0;
+}
+
+// getAddressSpace - Return the address space this GlobalAddress belongs to.
+unsigned GlobalAddressSDNode::getAddressSpace() const {
+  return getGlobal()->getType()->getAddressSpace();
+}
+
+
+Type *ConstantPoolSDNode::getType() const {
+  if (isMachineConstantPoolEntry())
+    return Val.MachineCPVal->getType();
+  return Val.ConstVal->getType();
+}
+
+bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
+                                        APInt &SplatUndef,
+                                        unsigned &SplatBitSize,
+                                        bool &HasAnyUndefs,
+                                        unsigned MinSplatBits,
+                                        bool isBigEndian) {
+  EVT VT = getValueType(0);
+  assert(VT.isVector() && "Expected a vector type");
+  unsigned sz = VT.getSizeInBits();
+  if (MinSplatBits > sz)
+    return false;
+
+  SplatValue = APInt(sz, 0);
+  SplatUndef = APInt(sz, 0);
+
+  // Get the bits.  Bits with undefined values (when the corresponding element
+  // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
+  // in SplatValue.  If any of the values are not constant, give up and return
+  // false.
+  unsigned int nOps = getNumOperands();
+  assert(nOps > 0 && "isConstantSplat has 0-size build vector");
+  unsigned EltBitSize = VT.getVectorElementType().getSizeInBits();
+
+  for (unsigned j = 0; j < nOps; ++j) {
+    unsigned i = isBigEndian ? nOps-1-j : j;
+    SDValue OpVal = getOperand(i);
+    unsigned BitPos = j * EltBitSize;
+
+    if (OpVal.getOpcode() == ISD::UNDEF)
+      SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize);
+    else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal))
+      SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize).
+                    zextOrTrunc(sz) << BitPos;
+    else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal))
+      SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos;
+     else
+      return false;
+  }
+
+  // The build_vector is all constants or undefs.  Find the smallest element
+  // size that splats the vector.
+
+  HasAnyUndefs = (SplatUndef != 0);
+  while (sz > 8) {
+
+    unsigned HalfSize = sz / 2;
+    APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize);
+    APInt LowValue = SplatValue.trunc(HalfSize);
+    APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize);
+    APInt LowUndef = SplatUndef.trunc(HalfSize);
+
+    // If the two halves do not match (ignoring undef bits), stop here.
+    if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) ||
+        MinSplatBits > HalfSize)
+      break;
+
+    SplatValue = HighValue | LowValue;
+    SplatUndef = HighUndef & LowUndef;
+
+    sz = HalfSize;
+  }
+
+  SplatBitSize = sz;
+  return true;
+}
+
+bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
+  // Find the first non-undef value in the shuffle mask.
+  unsigned i, e;
+  for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i)
+    /* search */;
+
+  assert(i != e && "VECTOR_SHUFFLE node with all undef indices!");
+
+  // Make sure all remaining elements are either undef or the same as the first
+  // non-undef value.
+  for (int Idx = Mask[i]; i != e; ++i)
+    if (Mask[i] >= 0 && Mask[i] != Idx)
+      return false;
+  return true;
+}
+
+#ifdef XDEBUG
+static void checkForCyclesHelper(const SDNode *N,
+                                 SmallPtrSet<const SDNode*, 32> &Visited,
+                                 SmallPtrSet<const SDNode*, 32> &Checked) {
+  // If this node has already been checked, don't check it again.
+  if (Checked.count(N))
+    return;
+
+  // If a node has already been visited on this depth-first walk, reject it as
+  // a cycle.
+  if (!Visited.insert(N)) {
+    dbgs() << "Offending node:\n";
+    N->dumprFull();
+    errs() << "Detected cycle in SelectionDAG\n";
+    abort();
+  }
+
+  for(unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    checkForCyclesHelper(N->getOperand(i).getNode(), Visited, Checked);
+
+  Checked.insert(N);
+  Visited.erase(N);
+}
+#endif
+
+void llvm::checkForCycles(const llvm::SDNode *N) {
+#ifdef XDEBUG
+  assert(N && "Checking nonexistant SDNode");
+  SmallPtrSet<const SDNode*, 32> visited;
+  SmallPtrSet<const SDNode*, 32> checked;
+  checkForCyclesHelper(N, visited, checked);
+#endif
+}
+
+void llvm::checkForCycles(const llvm::SelectionDAG *DAG) {
+  checkForCycles(DAG->getRoot().getNode());
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
new file mode 100644
index 000000000000..ce40cd6a0c9c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -0,0 +1,6873 @@
+//===-- SelectionDAGBuilder.cpp - Selection-DAG building ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "isel"
+#include "SelectionDAGBuilder.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/IntegersSubsetMapping.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+using namespace llvm;
+
+/// LimitFloatPrecision - Generate low-precision inline sequences for
+/// some float libcalls (6, 8 or 12 bits).
+static unsigned LimitFloatPrecision;
+
+static cl::opt<unsigned, true>
+LimitFPPrecision("limit-float-precision",
+                 cl::desc("Generate low-precision inline sequences "
+                          "for some float libcalls"),
+                 cl::location(LimitFloatPrecision),
+                 cl::init(0));
+
+// Limit the width of DAG chains. This is important in general to prevent
+// prevent DAG-based analysis from blowing up. For example, alias analysis and
+// load clustering may not complete in reasonable time. It is difficult to
+// recognize and avoid this situation within each individual analysis, and
+// future analyses are likely to have the same behavior. Limiting DAG width is
+// the safe approach, and will be especially important with global DAGs.
+//
+// MaxParallelChains default is arbitrarily high to avoid affecting
+// optimization, but could be lowered to improve compile time. Any ld-ld-st-st
+// sequence over this should have been converted to llvm.memcpy by the
+// frontend. It easy to induce this behavior with .ll code such as:
+// %buffer = alloca [4096 x i8]
+// %data = load [4096 x i8]* %argPtr
+// store [4096 x i8] %data, [4096 x i8]* %buffer
+static const unsigned MaxParallelChains = 64;
+
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
+                                      const SDValue *Parts, unsigned NumParts,
+                                      MVT PartVT, EVT ValueVT, const Value *V);
+
+/// getCopyFromParts - Create a value that contains the specified legal parts
+/// combined into the value they represent.  If the parts combine to a type
+/// larger then ValueVT then AssertOp can be used to specify whether the extra
+/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
+/// (ISD::AssertSext).
+static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc DL,
+                                const SDValue *Parts,
+                                unsigned NumParts, MVT PartVT, EVT ValueVT,
+                                const Value *V,
+                                ISD::NodeType AssertOp = ISD::DELETED_NODE) {
+  if (ValueVT.isVector())
+    return getCopyFromPartsVector(DAG, DL, Parts, NumParts,
+                                  PartVT, ValueVT, V);
+
+  assert(NumParts > 0 && "No parts to assemble!");
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Val = Parts[0];
+
+  if (NumParts > 1) {
+    // Assemble the value from multiple parts.
+    if (ValueVT.isInteger()) {
+      unsigned PartBits = PartVT.getSizeInBits();
+      unsigned ValueBits = ValueVT.getSizeInBits();
+
+      // Assemble the power of 2 part.
+      unsigned RoundParts = NumParts & (NumParts - 1) ?
+        1 << Log2_32(NumParts) : NumParts;
+      unsigned RoundBits = PartBits * RoundParts;
+      EVT RoundVT = RoundBits == ValueBits ?
+        ValueVT : EVT::getIntegerVT(*DAG.getContext(), RoundBits);
+      SDValue Lo, Hi;
+
+      EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2);
+
+      if (RoundParts > 2) {
+        Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2,
+                              PartVT, HalfVT, V);
+        Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2,
+                              RoundParts / 2, PartVT, HalfVT, V);
+      } else {
+        Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]);
+        Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
+      }
+
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+
+      Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi);
+
+      if (RoundParts < NumParts) {
+        // Assemble the trailing non-power-of-2 part.
+        unsigned OddParts = NumParts - RoundParts;
+        EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
+        Hi = getCopyFromParts(DAG, DL,
+                              Parts + RoundParts, OddParts, PartVT, OddVT, V);
+
+        // Combine the round and odd parts.
+        Lo = Val;
+        if (TLI.isBigEndian())
+          std::swap(Lo, Hi);
+        EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+        Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi);
+        Hi = DAG.getNode(ISD::SHL, DL, TotalVT, Hi,
+                         DAG.getConstant(Lo.getValueType().getSizeInBits(),
+                                         TLI.getPointerTy()));
+        Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo);
+        Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi);
+      }
+    } else if (PartVT.isFloatingPoint()) {
+      // FP split into multiple FP parts (for ppcf128)
+      assert(ValueVT == EVT(MVT::ppcf128) && PartVT == MVT::f64 &&
+             "Unexpected split");
+      SDValue Lo, Hi;
+      Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]);
+      Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]);
+      if (TLI.isBigEndian())
+        std::swap(Lo, Hi);
+      Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
+    } else {
+      // FP split into integer parts (soft fp)
+      assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
+             !PartVT.isVector() && "Unexpected split");
+      EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
+      Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V);
+    }
+  }
+
+  // There is now one part, held in Val.  Correct it to match ValueVT.
+  EVT PartEVT = Val.getValueType();
+
+  if (PartEVT == ValueVT)
+    return Val;
+
+  if (PartEVT.isInteger() && ValueVT.isInteger()) {
+    if (ValueVT.bitsLT(PartEVT)) {
+      // For a truncate, see if we have any information to
+      // indicate whether the truncated bits will always be
+      // zero or sign-extension.
+      if (AssertOp != ISD::DELETED_NODE)
+        Val = DAG.getNode(AssertOp, DL, PartEVT, Val,
+                          DAG.getValueType(ValueVT));
+      return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+    }
+    return DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val);
+  }
+
+  if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+    // FP_ROUND's are always exact here.
+    if (ValueVT.bitsLT(Val.getValueType()))
+      return DAG.getNode(ISD::FP_ROUND, DL, ValueVT, Val,
+                         DAG.getTargetConstant(1, TLI.getPointerTy()));
+
+    return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
+  }
+
+  if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits())
+    return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+  llvm_unreachable("Unknown mismatch!");
+}
+
+/// getCopyFromPartsVector - Create a value that contains the specified legal
+/// parts combined into the value they represent.  If the parts combine to a
+/// type larger then ValueVT then AssertOp can be used to specify whether the
+/// extra bits are known to be zero (ISD::AssertZext) or sign extended from
+/// ValueVT (ISD::AssertSext).
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
+                                      const SDValue *Parts, unsigned NumParts,
+                                      MVT PartVT, EVT ValueVT, const Value *V) {
+  assert(ValueVT.isVector() && "Not a vector value");
+  assert(NumParts > 0 && "No parts to assemble!");
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Val = Parts[0];
+
+  // Handle a multi-element vector.
+  if (NumParts > 1) {
+    EVT IntermediateVT;
+    MVT RegisterVT;
+    unsigned NumIntermediates;
+    unsigned NumRegs =
+    TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
+                               NumIntermediates, RegisterVT);
+    assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
+    NumParts = NumRegs; // Silence a compiler warning.
+    assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+    assert(RegisterVT == Parts[0].getSimpleValueType() &&
+           "Part type doesn't match part!");
+
+    // Assemble the parts into intermediate operands.
+    SmallVector<SDValue, 8> Ops(NumIntermediates);
+    if (NumIntermediates == NumParts) {
+      // If the register was not expanded, truncate or copy the value,
+      // as appropriate.
+      for (unsigned i = 0; i != NumParts; ++i)
+        Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
+                                  PartVT, IntermediateVT, V);
+    } else if (NumParts > 0) {
+      // If the intermediate type was expanded, build the intermediate
+      // operands from the parts.
+      assert(NumParts % NumIntermediates == 0 &&
+             "Must expand into a divisible number of parts!");
+      unsigned Factor = NumParts / NumIntermediates;
+      for (unsigned i = 0; i != NumIntermediates; ++i)
+        Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
+                                  PartVT, IntermediateVT, V);
+    }
+
+    // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
+    // intermediate operands.
+    Val = DAG.getNode(IntermediateVT.isVector() ?
+                      ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL,
+                      ValueVT, &Ops[0], NumIntermediates);
+  }
+
+  // There is now one part, held in Val.  Correct it to match ValueVT.
+  EVT PartEVT = Val.getValueType();
+
+  if (PartEVT == ValueVT)
+    return Val;
+
+  if (PartEVT.isVector()) {
+    // If the element type of the source/dest vectors are the same, but the
+    // parts vector has more elements than the value vector, then we have a
+    // vector widening case (e.g. <2 x float> -> <4 x float>).  Extract the
+    // elements we want.
+    if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) {
+      assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() &&
+             "Cannot narrow, it would be a lossy transformation");
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
+                         DAG.getIntPtrConstant(0));
+    }
+
+    // Vector/Vector bitcast.
+    if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
+      return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+    assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() &&
+      "Cannot handle this kind of promotion");
+    // Promoted vector extract
+    bool Smaller = ValueVT.bitsLE(PartEVT);
+    return DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
+                       DL, ValueVT, Val);
+
+  }
+
+  // Trivial bitcast if the types are the same size and the destination
+  // vector type is legal.
+  if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits() &&
+      TLI.isTypeLegal(ValueVT))
+    return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+  // Handle cases such as i8 -> <1 x i1>
+  if (ValueVT.getVectorNumElements() != 1) {
+    LLVMContext &Ctx = *DAG.getContext();
+    Twine ErrMsg("non-trivial scalar-to-vector conversion");
+    if (const Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+      if (const CallInst *CI = dyn_cast<CallInst>(I))
+        if (isa<InlineAsm>(CI->getCalledValue()))
+          ErrMsg = ErrMsg + ", possible invalid constraint for vector type";
+      Ctx.emitError(I, ErrMsg);
+    } else {
+      Ctx.emitError(ErrMsg);
+    }
+    report_fatal_error("Cannot handle scalar-to-vector conversion!");
+  }
+
+  if (ValueVT.getVectorNumElements() == 1 &&
+      ValueVT.getVectorElementType() != PartEVT) {
+    bool Smaller = ValueVT.bitsLE(PartEVT);
+    Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
+                       DL, ValueVT.getScalarType(), Val);
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val);
+}
+
+static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc dl,
+                                 SDValue Val, SDValue *Parts, unsigned NumParts,
+                                 MVT PartVT, const Value *V);
+
+/// getCopyToParts - Create a series of nodes that contain the specified value
+/// split into legal parts.  If the parts contain more bits than Val, then, for
+/// integers, ExtendKind can be used to specify how to generate the extra bits.
+static void getCopyToParts(SelectionDAG &DAG, DebugLoc DL,
+                           SDValue Val, SDValue *Parts, unsigned NumParts,
+                           MVT PartVT, const Value *V,
+                           ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
+  EVT ValueVT = Val.getValueType();
+
+  // Handle the vector case separately.
+  if (ValueVT.isVector())
+    return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned PartBits = PartVT.getSizeInBits();
+  unsigned OrigNumParts = NumParts;
+  assert(TLI.isTypeLegal(PartVT) && "Copying to an illegal type!");
+
+  if (NumParts == 0)
+    return;
+
+  assert(!ValueVT.isVector() && "Vector case handled elsewhere");
+  EVT PartEVT = PartVT;
+  if (PartEVT == ValueVT) {
+    assert(NumParts == 1 && "No-op copy with multiple parts!");
+    Parts[0] = Val;
+    return;
+  }
+
+  if (NumParts * PartBits > ValueVT.getSizeInBits()) {
+    // If the parts cover more bits than the value has, promote the value.
+    if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+      assert(NumParts == 1 && "Do not know what to promote to!");
+      Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val);
+    } else {
+      assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
+             ValueVT.isInteger() &&
+             "Unknown mismatch!");
+      ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+      Val = DAG.getNode(ExtendKind, DL, ValueVT, Val);
+      if (PartVT == MVT::x86mmx)
+        Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+    }
+  } else if (PartBits == ValueVT.getSizeInBits()) {
+    // Different types of the same size.
+    assert(NumParts == 1 && PartEVT != ValueVT);
+    Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+  } else if (NumParts * PartBits < ValueVT.getSizeInBits()) {
+    // If the parts cover less bits than value has, truncate the value.
+    assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
+           ValueVT.isInteger() &&
+           "Unknown mismatch!");
+    ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+    Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+    if (PartVT == MVT::x86mmx)
+      Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+  }
+
+  // The value may have changed - recompute ValueVT.
+  ValueVT = Val.getValueType();
+  assert(NumParts * PartBits == ValueVT.getSizeInBits() &&
+         "Failed to tile the value with PartVT!");
+
+  if (NumParts == 1) {
+    if (PartEVT != ValueVT) {
+      LLVMContext &Ctx = *DAG.getContext();
+      Twine ErrMsg("scalar-to-vector conversion failed");
+      if (const Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+        if (const CallInst *CI = dyn_cast<CallInst>(I))
+          if (isa<InlineAsm>(CI->getCalledValue()))
+            ErrMsg = ErrMsg + ", possible invalid constraint for vector type";
+        Ctx.emitError(I, ErrMsg);
+      } else {
+        Ctx.emitError(ErrMsg);
+      }
+    }
+
+    Parts[0] = Val;
+    return;
+  }
+
+  // Expand the value into multiple parts.
+  if (NumParts & (NumParts - 1)) {
+    // The number of parts is not a power of 2.  Split off and copy the tail.
+    assert(PartVT.isInteger() && ValueVT.isInteger() &&
+           "Do not know what to expand to!");
+    unsigned RoundParts = 1 << Log2_32(NumParts);
+    unsigned RoundBits = RoundParts * PartBits;
+    unsigned OddParts = NumParts - RoundParts;
+    SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val,
+                                 DAG.getIntPtrConstant(RoundBits));
+    getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V);
+
+    if (TLI.isBigEndian())
+      // The odd parts were reversed by getCopyToParts - unreverse them.
+      std::reverse(Parts + RoundParts, Parts + NumParts);
+
+    NumParts = RoundParts;
+    ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+    Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+  }
+
+  // The number of parts is a power of 2.  Repeatedly bisect the value using
+  // EXTRACT_ELEMENT.
+  Parts[0] = DAG.getNode(ISD::BITCAST, DL,
+                         EVT::getIntegerVT(*DAG.getContext(),
+                                           ValueVT.getSizeInBits()),
+                         Val);
+
+  for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) {
+    for (unsigned i = 0; i < NumParts; i += StepSize) {
+      unsigned ThisBits = StepSize * PartBits / 2;
+      EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits);
+      SDValue &Part0 = Parts[i];
+      SDValue &Part1 = Parts[i+StepSize/2];
+
+      Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
+                          ThisVT, Part0, DAG.getIntPtrConstant(1));
+      Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
+                          ThisVT, Part0, DAG.getIntPtrConstant(0));
+
+      if (ThisBits == PartBits && ThisVT != PartVT) {
+        Part0 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part0);
+        Part1 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part1);
+      }
+    }
+  }
+
+  if (TLI.isBigEndian())
+    std::reverse(Parts, Parts + OrigNumParts);
+}
+
+
+/// getCopyToPartsVector - Create a series of nodes that contain the specified
+/// value split into legal parts.
+static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
+                                 SDValue Val, SDValue *Parts, unsigned NumParts,
+                                 MVT PartVT, const Value *V) {
+  EVT ValueVT = Val.getValueType();
+  assert(ValueVT.isVector() && "Not a vector");
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (NumParts == 1) {
+    EVT PartEVT = PartVT;
+    if (PartEVT == ValueVT) {
+      // Nothing to do.
+    } else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
+      // Bitconvert vector->vector case.
+      Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+    } else if (PartVT.isVector() &&
+               PartEVT.getVectorElementType() == ValueVT.getVectorElementType() &&
+               PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements()) {
+      EVT ElementVT = PartVT.getVectorElementType();
+      // Vector widening case, e.g. <2 x float> -> <4 x float>.  Shuffle in
+      // undef elements.
+      SmallVector<SDValue, 16> Ops;
+      for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i)
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                  ElementVT, Val, DAG.getIntPtrConstant(i)));
+
+      for (unsigned i = ValueVT.getVectorNumElements(),
+           e = PartVT.getVectorNumElements(); i != e; ++i)
+        Ops.push_back(DAG.getUNDEF(ElementVT));
+
+      Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, &Ops[0], Ops.size());
+
+      // FIXME: Use CONCAT for 2x -> 4x.
+
+      //SDValue UndefElts = DAG.getUNDEF(VectorTy);
+      //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts);
+    } else if (PartVT.isVector() &&
+               PartEVT.getVectorElementType().bitsGE(
+                 ValueVT.getVectorElementType()) &&
+               PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) {
+
+      // Promoted vector extract
+      bool Smaller = PartEVT.bitsLE(ValueVT);
+      Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
+                        DL, PartVT, Val);
+    } else{
+      // Vector -> scalar conversion.
+      assert(ValueVT.getVectorNumElements() == 1 &&
+             "Only trivial vector-to-scalar conversions should get here!");
+      Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                        PartVT, Val, DAG.getIntPtrConstant(0));
+
+      bool Smaller = ValueVT.bitsLE(PartVT);
+      Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
+                         DL, PartVT, Val);
+    }
+
+    Parts[0] = Val;
+    return;
+  }
+
+  // Handle a multi-element vector.
+  EVT IntermediateVT;
+  MVT RegisterVT;
+  unsigned NumIntermediates;
+  unsigned NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT,
+                                                IntermediateVT,
+                                                NumIntermediates, RegisterVT);
+  unsigned NumElements = ValueVT.getVectorNumElements();
+
+  assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
+  NumParts = NumRegs; // Silence a compiler warning.
+  assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+
+  // Split the vector into intermediate operands.
+  SmallVector<SDValue, 8> Ops(NumIntermediates);
+  for (unsigned i = 0; i != NumIntermediates; ++i) {
+    if (IntermediateVT.isVector())
+      Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
+                           IntermediateVT, Val,
+                   DAG.getIntPtrConstant(i * (NumElements / NumIntermediates)));
+    else
+      Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                           IntermediateVT, Val, DAG.getIntPtrConstant(i));
+  }
+
+  // Split the intermediate operands into legal parts.
+  if (NumParts == NumIntermediates) {
+    // If the register was not expanded, promote or copy the value,
+    // as appropriate.
+    for (unsigned i = 0; i != NumParts; ++i)
+      getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT, V);
+  } else if (NumParts > 0) {
+    // If the intermediate type was expanded, split each the value into
+    // legal parts.
+    assert(NumParts % NumIntermediates == 0 &&
+           "Must expand into a divisible number of parts!");
+    unsigned Factor = NumParts / NumIntermediates;
+    for (unsigned i = 0; i != NumIntermediates; ++i)
+      getCopyToParts(DAG, DL, Ops[i], &Parts[i*Factor], Factor, PartVT, V);
+  }
+}
+
+namespace {
+  /// RegsForValue - This struct represents the registers (physical or virtual)
+  /// that a particular set of values is assigned, and the type information
+  /// about the value. The most common situation is to represent one value at a
+  /// time, but struct or array values are handled element-wise as multiple
+  /// values.  The splitting of aggregates is performed recursively, so that we
+  /// never have aggregate-typed registers. The values at this point do not
+  /// necessarily have legal types, so each value may require one or more
+  /// registers of some legal type.
+  ///
+  struct RegsForValue {
+    /// ValueVTs - The value types of the values, which may not be legal, and
+    /// may need be promoted or synthesized from one or more registers.
+    ///
+    SmallVector<EVT, 4> ValueVTs;
+
+    /// RegVTs - The value types of the registers. This is the same size as
+    /// ValueVTs and it records, for each value, what the type of the assigned
+    /// register or registers are. (Individual values are never synthesized
+    /// from more than one type of register.)
+    ///
+    /// With virtual registers, the contents of RegVTs is redundant with TLI's
+    /// getRegisterType member function, however when with physical registers
+    /// it is necessary to have a separate record of the types.
+    ///
+    SmallVector<MVT, 4> RegVTs;
+
+    /// Regs - This list holds the registers assigned to the values.
+    /// Each legal or promoted value requires one register, and each
+    /// expanded value requires multiple registers.
+    ///
+    SmallVector<unsigned, 4> Regs;
+
+    RegsForValue() {}
+
+    RegsForValue(const SmallVector<unsigned, 4> &regs,
+                 MVT regvt, EVT valuevt)
+      : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
+
+    RegsForValue(LLVMContext &Context, const TargetLowering &tli,
+                 unsigned Reg, Type *Ty) {
+      ComputeValueVTs(tli, Ty, ValueVTs);
+
+      for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+        EVT ValueVT = ValueVTs[Value];
+        unsigned NumRegs = tli.getNumRegisters(Context, ValueVT);
+        MVT RegisterVT = tli.getRegisterType(Context, ValueVT);
+        for (unsigned i = 0; i != NumRegs; ++i)
+          Regs.push_back(Reg + i);
+        RegVTs.push_back(RegisterVT);
+        Reg += NumRegs;
+      }
+    }
+
+    /// areValueTypesLegal - Return true if types of all the values are legal.
+    bool areValueTypesLegal(const TargetLowering &TLI) {
+      for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+        MVT RegisterVT = RegVTs[Value];
+        if (!TLI.isTypeLegal(RegisterVT))
+          return false;
+      }
+      return true;
+    }
+
+    /// append - Add the specified values to this one.
+    void append(const RegsForValue &RHS) {
+      ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
+      RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
+      Regs.append(RHS.Regs.begin(), RHS.Regs.end());
+    }
+
+    /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
+    /// this value and returns the result as a ValueVTs value.  This uses
+    /// Chain/Flag as the input and updates them for the output Chain/Flag.
+    /// If the Flag pointer is NULL, no flag is used.
+    SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
+                            DebugLoc dl,
+                            SDValue &Chain, SDValue *Flag,
+                            const Value *V = 0) const;
+
+    /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
+    /// specified value into the registers specified by this object.  This uses
+    /// Chain/Flag as the input and updates them for the output Chain/Flag.
+    /// If the Flag pointer is NULL, no flag is used.
+    void getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+                       SDValue &Chain, SDValue *Flag, const Value *V) const;
+
+    /// AddInlineAsmOperands - Add this value to the specified inlineasm node
+    /// operand list.  This adds the code marker, matching input operand index
+    /// (if applicable), and includes the number of values added into it.
+    void AddInlineAsmOperands(unsigned Kind,
+                              bool HasMatching, unsigned MatchingIdx,
+                              SelectionDAG &DAG,
+                              std::vector<SDValue> &Ops) const;
+  };
+}
+
+/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
+/// this value and returns the result as a ValueVT value.  This uses
+/// Chain/Flag as the input and updates them for the output Chain/Flag.
+/// If the Flag pointer is NULL, no flag is used.
+SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
+                                      FunctionLoweringInfo &FuncInfo,
+                                      DebugLoc dl,
+                                      SDValue &Chain, SDValue *Flag,
+                                      const Value *V) const {
+  // A Value with type {} or [0 x %t] needs no registers.
+  if (ValueVTs.empty())
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Assemble the legal parts into the final values.
+  SmallVector<SDValue, 4> Values(ValueVTs.size());
+  SmallVector<SDValue, 8> Parts;
+  for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    // Copy the legal parts from the registers.
+    EVT ValueVT = ValueVTs[Value];
+    unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
+    MVT RegisterVT = RegVTs[Value];
+
+    Parts.resize(NumRegs);
+    for (unsigned i = 0; i != NumRegs; ++i) {
+      SDValue P;
+      if (Flag == 0) {
+        P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT);
+      } else {
+        P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag);
+        *Flag = P.getValue(2);
+      }
+
+      Chain = P.getValue(1);
+      Parts[i] = P;
+
+      // If the source register was virtual and if we know something about it,
+      // add an assert node.
+      if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) ||
+          !RegisterVT.isInteger() || RegisterVT.isVector())
+        continue;
+
+      const FunctionLoweringInfo::LiveOutInfo *LOI =
+        FuncInfo.GetLiveOutRegInfo(Regs[Part+i]);
+      if (!LOI)
+        continue;
+
+      unsigned RegSize = RegisterVT.getSizeInBits();
+      unsigned NumSignBits = LOI->NumSignBits;
+      unsigned NumZeroBits = LOI->KnownZero.countLeadingOnes();
+
+      // FIXME: We capture more information than the dag can represent.  For
+      // now, just use the tightest assertzext/assertsext possible.
+      bool isSExt = true;
+      EVT FromVT(MVT::Other);
+      if (NumSignBits == RegSize)
+        isSExt = true, FromVT = MVT::i1;   // ASSERT SEXT 1
+      else if (NumZeroBits >= RegSize-1)
+        isSExt = false, FromVT = MVT::i1;  // ASSERT ZEXT 1
+      else if (NumSignBits > RegSize-8)
+        isSExt = true, FromVT = MVT::i8;   // ASSERT SEXT 8
+      else if (NumZeroBits >= RegSize-8)
+        isSExt = false, FromVT = MVT::i8;  // ASSERT ZEXT 8
+      else if (NumSignBits > RegSize-16)
+        isSExt = true, FromVT = MVT::i16;  // ASSERT SEXT 16
+      else if (NumZeroBits >= RegSize-16)
+        isSExt = false, FromVT = MVT::i16; // ASSERT ZEXT 16
+      else if (NumSignBits > RegSize-32)
+        isSExt = true, FromVT = MVT::i32;  // ASSERT SEXT 32
+      else if (NumZeroBits >= RegSize-32)
+        isSExt = false, FromVT = MVT::i32; // ASSERT ZEXT 32
+      else
+        continue;
+
+      // Add an assertion node.
+      assert(FromVT != MVT::Other);
+      Parts[i] = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl,
+                             RegisterVT, P, DAG.getValueType(FromVT));
+    }
+
+    Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(),
+                                     NumRegs, RegisterVT, ValueVT, V);
+    Part += NumRegs;
+    Parts.clear();
+  }
+
+  return DAG.getNode(ISD::MERGE_VALUES, dl,
+                     DAG.getVTList(&ValueVTs[0], ValueVTs.size()),
+                     &Values[0], ValueVTs.size());
+}
+
+/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
+/// specified value into the registers specified by this object.  This uses
+/// Chain/Flag as the input and updates them for the output Chain/Flag.
+/// If the Flag pointer is NULL, no flag is used.
+void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+                                 SDValue &Chain, SDValue *Flag,
+                                 const Value *V) const {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Get the list of the values's legal parts.
+  unsigned NumRegs = Regs.size();
+  SmallVector<SDValue, 8> Parts(NumRegs);
+  for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    EVT ValueVT = ValueVTs[Value];
+    unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
+    MVT RegisterVT = RegVTs[Value];
+    ISD::NodeType ExtendKind =
+      TLI.isZExtFree(Val, RegisterVT)? ISD::ZERO_EXTEND: ISD::ANY_EXTEND;
+
+    getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value),
+                   &Parts[Part], NumParts, RegisterVT, V, ExtendKind);
+    Part += NumParts;
+  }
+
+  // Copy the parts into the registers.
+  SmallVector<SDValue, 8> Chains(NumRegs);
+  for (unsigned i = 0; i != NumRegs; ++i) {
+    SDValue Part;
+    if (Flag == 0) {
+      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]);
+    } else {
+      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag);
+      *Flag = Part.getValue(1);
+    }
+
+    Chains[i] = Part.getValue(0);
+  }
+
+  if (NumRegs == 1 || Flag)
+    // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is
+    // flagged to it. That is the CopyToReg nodes and the user are considered
+    // a single scheduling unit. If we create a TokenFactor and return it as
+    // chain, then the TokenFactor is both a predecessor (operand) of the
+    // user as well as a successor (the TF operands are flagged to the user).
+    // c1, f1 = CopyToReg
+    // c2, f2 = CopyToReg
+    // c3     = TokenFactor c1, c2
+    // ...
+    //        = op c3, ..., f2
+    Chain = Chains[NumRegs-1];
+  else
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0], NumRegs);
+}
+
+/// AddInlineAsmOperands - Add this value to the specified inlineasm node
+/// operand list.  This adds the code marker and includes the number of
+/// values added into it.
+void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
+                                        unsigned MatchingIdx,
+                                        SelectionDAG &DAG,
+                                        std::vector<SDValue> &Ops) const {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
+  if (HasMatching)
+    Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
+  else if (!Regs.empty() &&
+           TargetRegisterInfo::isVirtualRegister(Regs.front())) {
+    // Put the register class of the virtual registers in the flag word.  That
+    // way, later passes can recompute register class constraints for inline
+    // assembly as well as normal instructions.
+    // Don't do this for tied operands that can use the regclass information
+    // from the def.
+    const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+    const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
+    Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
+  }
+
+  SDValue Res = DAG.getTargetConstant(Flag, MVT::i32);
+  Ops.push_back(Res);
+
+  for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
+    unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]);
+    MVT RegisterVT = RegVTs[Value];
+    for (unsigned i = 0; i != NumRegs; ++i) {
+      assert(Reg < Regs.size() && "Mismatch in # registers expected");
+      Ops.push_back(DAG.getRegister(Regs[Reg++], RegisterVT));
+    }
+  }
+}
+
+void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+                               const TargetLibraryInfo *li) {
+  AA = &aa;
+  GFI = gfi;
+  LibInfo = li;
+  TD = DAG.getTarget().getDataLayout();
+  Context = DAG.getContext();
+  LPadToCallSiteMap.clear();
+}
+
+/// clear - Clear out the current SelectionDAG and the associated
+/// state and prepare this SelectionDAGBuilder object to be used
+/// for a new block. This doesn't clear out information about
+/// additional blocks that are needed to complete switch lowering
+/// or PHI node updating; that information is cleared out as it is
+/// consumed.
+void SelectionDAGBuilder::clear() {
+  NodeMap.clear();
+  UnusedArgNodeMap.clear();
+  PendingLoads.clear();
+  PendingExports.clear();
+  CurDebugLoc = DebugLoc();
+  HasTailCall = false;
+}
+
+/// clearDanglingDebugInfo - Clear the dangling debug information
+/// map. This function is separated from the clear so that debug
+/// information that is dangling in a basic block can be properly
+/// resolved in a different basic block. This allows the
+/// SelectionDAG to resolve dangling debug information attached
+/// to PHI nodes.
+void SelectionDAGBuilder::clearDanglingDebugInfo() {
+  DanglingDebugInfoMap.clear();
+}
+
+/// getRoot - Return the current virtual root of the Selection DAG,
+/// flushing any PendingLoad items. This must be done before emitting
+/// a store or any other node that may need to be ordered after any
+/// prior load instructions.
+///
+SDValue SelectionDAGBuilder::getRoot() {
+  if (PendingLoads.empty())
+    return DAG.getRoot();
+
+  if (PendingLoads.size() == 1) {
+    SDValue Root = PendingLoads[0];
+    DAG.setRoot(Root);
+    PendingLoads.clear();
+    return Root;
+  }
+
+  // Otherwise, we have to make a token factor node.
+  SDValue Root = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+                               &PendingLoads[0], PendingLoads.size());
+  PendingLoads.clear();
+  DAG.setRoot(Root);
+  return Root;
+}
+
+/// getControlRoot - Similar to getRoot, but instead of flushing all the
+/// PendingLoad items, flush all the PendingExports items. It is necessary
+/// to do this before emitting a terminator instruction.
+///
+SDValue SelectionDAGBuilder::getControlRoot() {
+  SDValue Root = DAG.getRoot();
+
+  if (PendingExports.empty())
+    return Root;
+
+  // Turn all of the CopyToReg chains into one factored node.
+  if (Root.getOpcode() != ISD::EntryToken) {
+    unsigned i = 0, e = PendingExports.size();
+    for (; i != e; ++i) {
+      assert(PendingExports[i].getNode()->getNumOperands() > 1);
+      if (PendingExports[i].getNode()->getOperand(0) == Root)
+        break;  // Don't add the root if we already indirectly depend on it.
+    }
+
+    if (i == e)
+      PendingExports.push_back(Root);
+  }
+
+  Root = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+                     &PendingExports[0],
+                     PendingExports.size());
+  PendingExports.clear();
+  DAG.setRoot(Root);
+  return Root;
+}
+
+void SelectionDAGBuilder::AssignOrderingToNode(const SDNode *Node) {
+  if (DAG.GetOrdering(Node) != 0) return; // Already has ordering.
+  DAG.AssignOrdering(Node, SDNodeOrder);
+
+  for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I)
+    AssignOrderingToNode(Node->getOperand(I).getNode());
+}
+
+void SelectionDAGBuilder::visit(const Instruction &I) {
+  // Set up outgoing PHI node register values before emitting the terminator.
+  if (isa<TerminatorInst>(&I))
+    HandlePHINodesInSuccessorBlocks(I.getParent());
+
+  CurDebugLoc = I.getDebugLoc();
+
+  visit(I.getOpcode(), I);
+
+  if (!isa<TerminatorInst>(&I) && !HasTailCall)
+    CopyToExportRegsIfNeeded(&I);
+
+  CurDebugLoc = DebugLoc();
+}
+
+void SelectionDAGBuilder::visitPHI(const PHINode &) {
+  llvm_unreachable("SelectionDAGBuilder shouldn't visit PHI nodes!");
+}
+
+void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
+  // Note: this doesn't use InstVisitor, because it has to work with
+  // ConstantExpr's in addition to instructions.
+  switch (Opcode) {
+  default: llvm_unreachable("Unknown instruction type encountered!");
+    // Build the switch statement using the Instruction.def file.
+#define HANDLE_INST(NUM, OPCODE, CLASS) \
+    case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break;
+#include "llvm/IR/Instruction.def"
+  }
+
+  // Assign the ordering to the freshly created DAG nodes.
+  if (NodeMap.count(&I)) {
+    ++SDNodeOrder;
+    AssignOrderingToNode(getValue(&I).getNode());
+  }
+}
+
+// resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
+// generate the debug data structures now that we've seen its definition.
+void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
+                                                   SDValue Val) {
+  DanglingDebugInfo &DDI = DanglingDebugInfoMap[V];
+  if (DDI.getDI()) {
+    const DbgValueInst *DI = DDI.getDI();
+    DebugLoc dl = DDI.getdl();
+    unsigned DbgSDNodeOrder = DDI.getSDNodeOrder();
+    MDNode *Variable = DI->getVariable();
+    uint64_t Offset = DI->getOffset();
+    SDDbgValue *SDV;
+    if (Val.getNode()) {
+      if (!EmitFuncArgumentDbgValue(V, Variable, Offset, Val)) {
+        SDV = DAG.getDbgValue(Variable, Val.getNode(),
+                              Val.getResNo(), Offset, dl, DbgSDNodeOrder);
+        DAG.AddDbgValue(SDV, Val.getNode(), false);
+      }
+    } else
+      DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+    DanglingDebugInfoMap[V] = DanglingDebugInfo();
+  }
+}
+
+/// getValue - Return an SDValue for the given Value.
+SDValue SelectionDAGBuilder::getValue(const Value *V) {
+  // If we already have an SDValue for this value, use it. It's important
+  // to do this first, so that we don't create a CopyFromReg if we already
+  // have a regular SDValue.
+  SDValue &N = NodeMap[V];
+  if (N.getNode()) return N;
+
+  // If there's a virtual register allocated and initialized for this
+  // value, use it.
+  DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V);
+  if (It != FuncInfo.ValueMap.end()) {
+    unsigned InReg = It->second;
+    RegsForValue RFV(*DAG.getContext(), TLI, InReg, V->getType());
+    SDValue Chain = DAG.getEntryNode();
+    N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL, V);
+    resolveDanglingDebugInfo(V, N);
+    return N;
+  }
+
+  // Otherwise create a new SDValue and remember it.
+  SDValue Val = getValueImpl(V);
+  NodeMap[V] = Val;
+  resolveDanglingDebugInfo(V, Val);
+  return Val;
+}
+
+/// getNonRegisterValue - Return an SDValue for the given Value, but
+/// don't look in FuncInfo.ValueMap for a virtual register.
+SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
+  // If we already have an SDValue for this value, use it.
+  SDValue &N = NodeMap[V];
+  if (N.getNode()) return N;
+
+  // Otherwise create a new SDValue and remember it.
+  SDValue Val = getValueImpl(V);
+  NodeMap[V] = Val;
+  resolveDanglingDebugInfo(V, Val);
+  return Val;
+}
+
+/// getValueImpl - Helper function for getValue and getNonRegisterValue.
+/// Create an SDValue for the given value.
+SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
+  if (const Constant *C = dyn_cast<Constant>(V)) {
+    EVT VT = TLI.getValueType(V->getType(), true);
+
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
+      return DAG.getConstant(*CI, VT);
+
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+      return DAG.getGlobalAddress(GV, getCurDebugLoc(), VT);
+
+    if (isa<ConstantPointerNull>(C))
+      return DAG.getConstant(0, TLI.getPointerTy());
+
+    if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+      return DAG.getConstantFP(*CFP, VT);
+
+    if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
+      return DAG.getUNDEF(VT);
+
+    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+      visit(CE->getOpcode(), *CE);
+      SDValue N1 = NodeMap[V];
+      assert(N1.getNode() && "visit didn't populate the NodeMap!");
+      return N1;
+    }
+
+    if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
+      SmallVector<SDValue, 4> Constants;
+      for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
+           OI != OE; ++OI) {
+        SDNode *Val = getValue(*OI).getNode();
+        // If the operand is an empty aggregate, there are no values.
+        if (!Val) continue;
+        // Add each leaf value from the operand to the Constants list
+        // to form a flattened list of all the values.
+        for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
+          Constants.push_back(SDValue(Val, i));
+      }
+
+      return DAG.getMergeValues(&Constants[0], Constants.size(),
+                                getCurDebugLoc());
+    }
+    
+    if (const ConstantDataSequential *CDS =
+          dyn_cast<ConstantDataSequential>(C)) {
+      SmallVector<SDValue, 4> Ops;
+      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+        SDNode *Val = getValue(CDS->getElementAsConstant(i)).getNode();
+        // Add each leaf value from the operand to the Constants list
+        // to form a flattened list of all the values.
+        for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
+          Ops.push_back(SDValue(Val, i));
+      }
+
+      if (isa<ArrayType>(CDS->getType()))
+        return DAG.getMergeValues(&Ops[0], Ops.size(), getCurDebugLoc());
+      return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+                                      VT, &Ops[0], Ops.size());
+    }
+
+    if (C->getType()->isStructTy() || C->getType()->isArrayTy()) {
+      assert((isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) &&
+             "Unknown struct or array constant!");
+
+      SmallVector<EVT, 4> ValueVTs;
+      ComputeValueVTs(TLI, C->getType(), ValueVTs);
+      unsigned NumElts = ValueVTs.size();
+      if (NumElts == 0)
+        return SDValue(); // empty struct
+      SmallVector<SDValue, 4> Constants(NumElts);
+      for (unsigned i = 0; i != NumElts; ++i) {
+        EVT EltVT = ValueVTs[i];
+        if (isa<UndefValue>(C))
+          Constants[i] = DAG.getUNDEF(EltVT);
+        else if (EltVT.isFloatingPoint())
+          Constants[i] = DAG.getConstantFP(0, EltVT);
+        else
+          Constants[i] = DAG.getConstant(0, EltVT);
+      }
+
+      return DAG.getMergeValues(&Constants[0], NumElts,
+                                getCurDebugLoc());
+    }
+
+    if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
+      return DAG.getBlockAddress(BA, VT);
+
+    VectorType *VecTy = cast<VectorType>(V->getType());
+    unsigned NumElements = VecTy->getNumElements();
+
+    // Now that we know the number and type of the elements, get that number of
+    // elements into the Ops array based on what kind of constant it is.
+    SmallVector<SDValue, 16> Ops;
+    if (const ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
+      for (unsigned i = 0; i != NumElements; ++i)
+        Ops.push_back(getValue(CV->getOperand(i)));
+    } else {
+      assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
+      EVT EltVT = TLI.getValueType(VecTy->getElementType());
+
+      SDValue Op;
+      if (EltVT.isFloatingPoint())
+        Op = DAG.getConstantFP(0, EltVT);
+      else
+        Op = DAG.getConstant(0, EltVT);
+      Ops.assign(NumElements, Op);
+    }
+
+    // Create a BUILD_VECTOR node.
+    return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+                                    VT, &Ops[0], Ops.size());
+  }
+
+  // If this is a static alloca, generate it as the frameindex instead of
+  // computation.
+  if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+    DenseMap<const AllocaInst*, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(AI);
+    if (SI != FuncInfo.StaticAllocaMap.end())
+      return DAG.getFrameIndex(SI->second, TLI.getPointerTy());
+  }
+
+  // If this is an instruction which fast-isel has deferred, select it now.
+  if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+    unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
+    RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType());
+    SDValue Chain = DAG.getEntryNode();
+    return RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL, V);
+  }
+
+  llvm_unreachable("Can't get register for value!");
+}
+
+void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
+  SDValue Chain = getControlRoot();
+  SmallVector<ISD::OutputArg, 8> Outs;
+  SmallVector<SDValue, 8> OutVals;
+
+  if (!FuncInfo.CanLowerReturn) {
+    unsigned DemoteReg = FuncInfo.DemoteRegister;
+    const Function *F = I.getParent()->getParent();
+
+    // Emit a store of the return value through the virtual register.
+    // Leave Outs empty so that LowerReturn won't try to load return
+    // registers the usual way.
+    SmallVector<EVT, 1> PtrValueVTs;
+    ComputeValueVTs(TLI, PointerType::getUnqual(F->getReturnType()),
+                    PtrValueVTs);
+
+    SDValue RetPtr = DAG.getRegister(DemoteReg, PtrValueVTs[0]);
+    SDValue RetOp = getValue(I.getOperand(0));
+
+    SmallVector<EVT, 4> ValueVTs;
+    SmallVector<uint64_t, 4> Offsets;
+    ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets);
+    unsigned NumValues = ValueVTs.size();
+
+    SmallVector<SDValue, 4> Chains(NumValues);
+    for (unsigned i = 0; i != NumValues; ++i) {
+      SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+                                RetPtr.getValueType(), RetPtr,
+                                DAG.getIntPtrConstant(Offsets[i]));
+      Chains[i] =
+        DAG.getStore(Chain, getCurDebugLoc(),
+                     SDValue(RetOp.getNode(), RetOp.getResNo() + i),
+                     // FIXME: better loc info would be nice.
+                     Add, MachinePointerInfo(), false, false, 0);
+    }
+
+    Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                        MVT::Other, &Chains[0], NumValues);
+  } else if (I.getNumOperands() != 0) {
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs);
+    unsigned NumValues = ValueVTs.size();
+    if (NumValues) {
+      SDValue RetOp = getValue(I.getOperand(0));
+      for (unsigned j = 0, f = NumValues; j != f; ++j) {
+        EVT VT = ValueVTs[j];
+
+        ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+        const Function *F = I.getParent()->getParent();
+        if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                            Attribute::SExt))
+          ExtendKind = ISD::SIGN_EXTEND;
+        else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                                 Attribute::ZExt))
+          ExtendKind = ISD::ZERO_EXTEND;
+
+        if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
+          VT = TLI.getTypeForExtArgOrReturn(VT.getSimpleVT(), ExtendKind);
+
+        unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), VT);
+        MVT PartVT = TLI.getRegisterType(*DAG.getContext(), VT);
+        SmallVector<SDValue, 4> Parts(NumParts);
+        getCopyToParts(DAG, getCurDebugLoc(),
+                       SDValue(RetOp.getNode(), RetOp.getResNo() + j),
+                       &Parts[0], NumParts, PartVT, &I, ExtendKind);
+
+        // 'inreg' on function refers to return value
+        ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+        if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                            Attribute::InReg))
+          Flags.setInReg();
+
+        // Propagate extension type if any
+        if (ExtendKind == ISD::SIGN_EXTEND)
+          Flags.setSExt();
+        else if (ExtendKind == ISD::ZERO_EXTEND)
+          Flags.setZExt();
+
+        for (unsigned i = 0; i < NumParts; ++i) {
+          Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
+                                        /*isfixed=*/true, 0, 0));
+          OutVals.push_back(Parts[i]);
+        }
+      }
+    }
+  }
+
+  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+  CallingConv::ID CallConv =
+    DAG.getMachineFunction().getFunction()->getCallingConv();
+  Chain = TLI.LowerReturn(Chain, CallConv, isVarArg,
+                          Outs, OutVals, getCurDebugLoc(), DAG);
+
+  // Verify that the target's LowerReturn behaved as expected.
+  assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
+         "LowerReturn didn't return a valid chain!");
+
+  // Update the DAG with the new chain value resulting from return lowering.
+  DAG.setRoot(Chain);
+}
+
+/// CopyToExportRegsIfNeeded - If the given value has virtual registers
+/// created for it, emit nodes to copy the value into the virtual
+/// registers.
+void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
+  // Skip empty types
+  if (V->getType()->isEmptyTy())
+    return;
+
+  DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
+  if (VMI != FuncInfo.ValueMap.end()) {
+    assert(!V->use_empty() && "Unused value assigned virtual registers!");
+    CopyValueToVirtualRegister(V, VMI->second);
+  }
+}
+
+/// ExportFromCurrentBlock - If this condition isn't known to be exported from
+/// the current basic block, add it to ValueMap now so that we'll get a
+/// CopyTo/FromReg.
+void SelectionDAGBuilder::ExportFromCurrentBlock(const Value *V) {
+  // No need to export constants.
+  if (!isa<Instruction>(V) && !isa<Argument>(V)) return;
+
+  // Already exported?
+  if (FuncInfo.isExportedInst(V)) return;
+
+  unsigned Reg = FuncInfo.InitializeRegForValue(V);
+  CopyValueToVirtualRegister(V, Reg);
+}
+
+bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V,
+                                                     const BasicBlock *FromBB) {
+  // The operands of the setcc have to be in this block.  We don't know
+  // how to export them from some other block.
+  if (const Instruction *VI = dyn_cast<Instruction>(V)) {
+    // Can export from current BB.
+    if (VI->getParent() == FromBB)
+      return true;
+
+    // Is already exported, noop.
+    return FuncInfo.isExportedInst(V);
+  }
+
+  // If this is an argument, we can export it if the BB is the entry block or
+  // if it is already exported.
+  if (isa<Argument>(V)) {
+    if (FromBB == &FromBB->getParent()->getEntryBlock())
+      return true;
+
+    // Otherwise, can only export this if it is already exported.
+    return FuncInfo.isExportedInst(V);
+  }
+
+  // Otherwise, constants can always be exported.
+  return true;
+}
+
+/// Return branch probability calculated by BranchProbabilityInfo for IR blocks.
+uint32_t SelectionDAGBuilder::getEdgeWeight(const MachineBasicBlock *Src,
+                                            const MachineBasicBlock *Dst) const {
+  BranchProbabilityInfo *BPI = FuncInfo.BPI;
+  if (!BPI)
+    return 0;
+  const BasicBlock *SrcBB = Src->getBasicBlock();
+  const BasicBlock *DstBB = Dst->getBasicBlock();
+  return BPI->getEdgeWeight(SrcBB, DstBB);
+}
+
+void SelectionDAGBuilder::
+addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst,
+                       uint32_t Weight /* = 0 */) {
+  if (!Weight)
+    Weight = getEdgeWeight(Src, Dst);
+  Src->addSuccessor(Dst, Weight);
+}
+
+
+static bool InBlock(const Value *V, const BasicBlock *BB) {
+  if (const Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() == BB;
+  return true;
+}
+
+/// EmitBranchForMergedCondition - Helper method for FindMergedConditions.
+/// This function emits a branch and is used at the leaves of an OR or an
+/// AND operator tree.
+///
+void
+SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
+                                                  MachineBasicBlock *TBB,
+                                                  MachineBasicBlock *FBB,
+                                                  MachineBasicBlock *CurBB,
+                                                  MachineBasicBlock *SwitchBB) {
+  const BasicBlock *BB = CurBB->getBasicBlock();
+
+  // If the leaf of the tree is a comparison, merge the condition into
+  // the caseblock.
+  if (const CmpInst *BOp = dyn_cast<CmpInst>(Cond)) {
+    // The operands of the cmp have to be in this block.  We don't know
+    // how to export them from some other block.  If this is the first block
+    // of the sequence, no exporting is needed.
+    if (CurBB == SwitchBB ||
+        (isExportableFromCurrentBlock(BOp->getOperand(0), BB) &&
+         isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
+      ISD::CondCode Condition;
+      if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
+        Condition = getICmpCondCode(IC->getPredicate());
+      } else if (const FCmpInst *FC = dyn_cast<FCmpInst>(Cond)) {
+        Condition = getFCmpCondCode(FC->getPredicate());
+        if (TM.Options.NoNaNsFPMath)
+          Condition = getFCmpCodeWithoutNaN(Condition);
+      } else {
+        Condition = ISD::SETEQ; // silence warning.
+        llvm_unreachable("Unknown compare instruction");
+      }
+
+      CaseBlock CB(Condition, BOp->getOperand(0),
+                   BOp->getOperand(1), NULL, TBB, FBB, CurBB);
+      SwitchCases.push_back(CB);
+      return;
+    }
+  }
+
+  // Create a CaseBlock record representing this branch.
+  CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()),
+               NULL, TBB, FBB, CurBB);
+  SwitchCases.push_back(CB);
+}
+
+/// FindMergedConditions - If Cond is an expression like
+void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
+                                               MachineBasicBlock *TBB,
+                                               MachineBasicBlock *FBB,
+                                               MachineBasicBlock *CurBB,
+                                               MachineBasicBlock *SwitchBB,
+                                               unsigned Opc) {
+  // If this node is not part of the or/and tree, emit it as a branch.
+  const Instruction *BOp = dyn_cast<Instruction>(Cond);
+  if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) ||
+      (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() ||
+      BOp->getParent() != CurBB->getBasicBlock() ||
+      !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) ||
+      !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {
+    EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB);
+    return;
+  }
+
+  //  Create TmpBB after CurBB.
+  MachineFunction::iterator BBI = CurBB;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock());
+  CurBB->getParent()->insert(++BBI, TmpBB);
+
+  if (Opc == Instruction::Or) {
+    // Codegen X | Y as:
+    //   jmp_if_X TBB
+    //   jmp TmpBB
+    // TmpBB:
+    //   jmp_if_Y TBB
+    //   jmp FBB
+    //
+
+    // Emit the LHS condition.
+    FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc);
+
+    // Emit the RHS condition into TmpBB.
+    FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc);
+  } else {
+    assert(Opc == Instruction::And && "Unknown merge op!");
+    // Codegen X & Y as:
+    //   jmp_if_X TmpBB
+    //   jmp FBB
+    // TmpBB:
+    //   jmp_if_Y TBB
+    //   jmp FBB
+    //
+    //  This requires creation of TmpBB after CurBB.
+
+    // Emit the LHS condition.
+    FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc);
+
+    // Emit the RHS condition into TmpBB.
+    FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc);
+  }
+}
+
+/// If the set of cases should be emitted as a series of branches, return true.
+/// If we should emit this as a bunch of and/or'd together conditions, return
+/// false.
+bool
+SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases){
+  if (Cases.size() != 2) return true;
+
+  // If this is two comparisons of the same values or'd or and'd together, they
+  // will get folded into a single comparison, so don't emit two blocks.
+  if ((Cases[0].CmpLHS == Cases[1].CmpLHS &&
+       Cases[0].CmpRHS == Cases[1].CmpRHS) ||
+      (Cases[0].CmpRHS == Cases[1].CmpLHS &&
+       Cases[0].CmpLHS == Cases[1].CmpRHS)) {
+    return false;
+  }
+
+  // Handle: (X != null) | (Y != null) --> (X|Y) != 0
+  // Handle: (X == null) & (Y == null) --> (X|Y) == 0
+  if (Cases[0].CmpRHS == Cases[1].CmpRHS &&
+      Cases[0].CC == Cases[1].CC &&
+      isa<Constant>(Cases[0].CmpRHS) &&
+      cast<Constant>(Cases[0].CmpRHS)->isNullValue()) {
+    if (Cases[0].CC == ISD::SETEQ && Cases[0].TrueBB == Cases[1].ThisBB)
+      return false;
+    if (Cases[0].CC == ISD::SETNE && Cases[0].FalseBB == Cases[1].ThisBB)
+      return false;
+  }
+
+  return true;
+}
+
+void SelectionDAGBuilder::visitBr(const BranchInst &I) {
+  MachineBasicBlock *BrMBB = FuncInfo.MBB;
+
+  // Update machine-CFG edges.
+  MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)];
+
+  // Figure out which block is immediately after the current one.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = BrMBB;
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  if (I.isUnconditional()) {
+    // Update machine-CFG edges.
+    BrMBB->addSuccessor(Succ0MBB);
+
+    // If this is not a fall-through branch, emit the branch.
+    if (Succ0MBB != NextBlock)
+      DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+                              MVT::Other, getControlRoot(),
+                              DAG.getBasicBlock(Succ0MBB)));
+
+    return;
+  }
+
+  // If this condition is one of the special cases we handle, do special stuff
+  // now.
+  const Value *CondVal = I.getCondition();
+  MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)];
+
+  // If this is a series of conditions that are or'd or and'd together, emit
+  // this as a sequence of branches instead of setcc's with and/or operations.
+  // As long as jumps are not expensive, this should improve performance.
+  // For example, instead of something like:
+  //     cmp A, B
+  //     C = seteq
+  //     cmp D, E
+  //     F = setle
+  //     or C, F
+  //     jnz foo
+  // Emit:
+  //     cmp A, B
+  //     je foo
+  //     cmp D, E
+  //     jle foo
+  //
+  if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
+    if (!TLI.isJumpExpensive() &&
+        BOp->hasOneUse() &&
+        (BOp->getOpcode() == Instruction::And ||
+         BOp->getOpcode() == Instruction::Or)) {
+      FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
+                           BOp->getOpcode());
+      // If the compares in later blocks need to use values not currently
+      // exported from this block, export them now.  This block should always
+      // be the first entry.
+      assert(SwitchCases[0].ThisBB == BrMBB && "Unexpected lowering!");
+
+      // Allow some cases to be rejected.
+      if (ShouldEmitAsBranches(SwitchCases)) {
+        for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i) {
+          ExportFromCurrentBlock(SwitchCases[i].CmpLHS);
+          ExportFromCurrentBlock(SwitchCases[i].CmpRHS);
+        }
+
+        // Emit the branch for this block.
+        visitSwitchCase(SwitchCases[0], BrMBB);
+        SwitchCases.erase(SwitchCases.begin());
+        return;
+      }
+
+      // Okay, we decided not to do this, remove any inserted MBB's and clear
+      // SwitchCases.
+      for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i)
+        FuncInfo.MF->erase(SwitchCases[i].ThisBB);
+
+      SwitchCases.clear();
+    }
+  }
+
+  // Create a CaseBlock record representing this branch.
+  CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()),
+               NULL, Succ0MBB, Succ1MBB, BrMBB);
+
+  // Use visitSwitchCase to actually insert the fast branch sequence for this
+  // cond branch.
+  visitSwitchCase(CB, BrMBB);
+}
+
+/// visitSwitchCase - Emits the necessary code to represent a single node in
+/// the binary search tree resulting from lowering a switch instruction.
+void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
+                                          MachineBasicBlock *SwitchBB) {
+  SDValue Cond;
+  SDValue CondLHS = getValue(CB.CmpLHS);
+  DebugLoc dl = getCurDebugLoc();
+
+  // Build the setcc now.
+  if (CB.CmpMHS == NULL) {
+    // Fold "(X == true)" to X and "(X == false)" to !X to
+    // handle common cases produced by branch lowering.
+    if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) &&
+        CB.CC == ISD::SETEQ)
+      Cond = CondLHS;
+    else if (CB.CmpRHS == ConstantInt::getFalse(*DAG.getContext()) &&
+             CB.CC == ISD::SETEQ) {
+      SDValue True = DAG.getConstant(1, CondLHS.getValueType());
+      Cond = DAG.getNode(ISD::XOR, dl, CondLHS.getValueType(), CondLHS, True);
+    } else
+      Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC);
+  } else {
+    assert(CB.CC == ISD::SETCC_INVALID &&
+           "Condition is undefined for to-the-range belonging check.");
+
+    const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
+    const APInt& High  = cast<ConstantInt>(CB.CmpRHS)->getValue();
+
+    SDValue CmpOp = getValue(CB.CmpMHS);
+    EVT VT = CmpOp.getValueType();
+    
+    if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(false)) {
+      Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT),
+                          ISD::SETULE);
+    } else {
+      SDValue SUB = DAG.getNode(ISD::SUB, dl,
+                                VT, CmpOp, DAG.getConstant(Low, VT));
+      Cond = DAG.getSetCC(dl, MVT::i1, SUB,
+                          DAG.getConstant(High-Low, VT), ISD::SETULE);
+    }
+  }
+
+  // Update successor info
+  addSuccessorWithWeight(SwitchBB, CB.TrueBB, CB.TrueWeight);
+  // TrueBB and FalseBB are always different unless the incoming IR is
+  // degenerate. This only happens when running llc on weird IR.
+  if (CB.TrueBB != CB.FalseBB)
+    addSuccessorWithWeight(SwitchBB, CB.FalseBB, CB.FalseWeight);
+
+  // Set NextBlock to be the MBB immediately after the current one, if any.
+  // This is used to avoid emitting unnecessary branches to the next block.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = SwitchBB;
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  // If the lhs block is the next block, invert the condition so that we can
+  // fall through to the lhs instead of the rhs block.
+  if (CB.TrueBB == NextBlock) {
+    std::swap(CB.TrueBB, CB.FalseBB);
+    SDValue True = DAG.getConstant(1, Cond.getValueType());
+    Cond = DAG.getNode(ISD::XOR, dl, Cond.getValueType(), Cond, True);
+  }
+
+  SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
+                               MVT::Other, getControlRoot(), Cond,
+                               DAG.getBasicBlock(CB.TrueBB));
+
+  // Insert the false branch. Do this even if it's a fall through branch,
+  // this makes it easier to do DAG optimizations which require inverting
+  // the branch condition.
+  BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond,
+                       DAG.getBasicBlock(CB.FalseBB));
+
+  DAG.setRoot(BrCond);
+}
+
+/// visitJumpTable - Emit JumpTable node in the current MBB
+void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) {
+  // Emit the code for the jump table
+  assert(JT.Reg != -1U && "Should lower JT Header first!");
+  EVT PTy = TLI.getPointerTy();
+  SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(),
+                                     JT.Reg, PTy);
+  SDValue Table = DAG.getJumpTable(JT.JTI, PTy);
+  SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, getCurDebugLoc(),
+                                    MVT::Other, Index.getValue(1),
+                                    Table, Index);
+  DAG.setRoot(BrJumpTable);
+}
+
+/// visitJumpTableHeader - This function emits necessary code to produce index
+/// in the JumpTable from switch case.
+void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
+                                               JumpTableHeader &JTH,
+                                               MachineBasicBlock *SwitchBB) {
+  // Subtract the lowest switch case value from the value being switched on and
+  // conditional branch to default mbb if the result is greater than the
+  // difference between smallest and largest cases.
+  SDValue SwitchOp = getValue(JTH.SValue);
+  EVT VT = SwitchOp.getValueType();
+  SDValue Sub = DAG.getNode(ISD::SUB, getCurDebugLoc(), VT, SwitchOp,
+                            DAG.getConstant(JTH.First, VT));
+
+  // The SDNode we just created, which holds the value being switched on minus
+  // the smallest case value, needs to be copied to a virtual register so it
+  // can be used as an index into the jump table in a subsequent basic block.
+  // This value may be smaller or larger than the target's pointer type, and
+  // therefore require extension or truncating.
+  SwitchOp = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), TLI.getPointerTy());
+
+  unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy());
+  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(),
+                                    JumpTableReg, SwitchOp);
+  JT.Reg = JumpTableReg;
+
+  // Emit the range check for the jump table, and branch to the default block
+  // for the switch statement if the value being switched on exceeds the largest
+  // case in the switch.
+  SDValue CMP = DAG.getSetCC(getCurDebugLoc(),
+                             TLI.getSetCCResultType(Sub.getValueType()), Sub,
+                             DAG.getConstant(JTH.Last-JTH.First,VT),
+                             ISD::SETUGT);
+
+  // Set NextBlock to be the MBB immediately after the current one, if any.
+  // This is used to avoid emitting unnecessary branches to the next block.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = SwitchBB;
+
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+                               MVT::Other, CopyTo, CMP,
+                               DAG.getBasicBlock(JT.Default));
+
+  if (JT.MBB != NextBlock)
+    BrCond = DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrCond,
+                         DAG.getBasicBlock(JT.MBB));
+
+  DAG.setRoot(BrCond);
+}
+
+/// visitBitTestHeader - This function emits necessary code to produce value
+/// suitable for "bit tests"
+void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
+                                             MachineBasicBlock *SwitchBB) {
+  // Subtract the minimum value
+  SDValue SwitchOp = getValue(B.SValue);
+  EVT VT = SwitchOp.getValueType();
+  SDValue Sub = DAG.getNode(ISD::SUB, getCurDebugLoc(), VT, SwitchOp,
+                            DAG.getConstant(B.First, VT));
+
+  // Check range
+  SDValue RangeCmp = DAG.getSetCC(getCurDebugLoc(),
+                                  TLI.getSetCCResultType(Sub.getValueType()),
+                                  Sub, DAG.getConstant(B.Range, VT),
+                                  ISD::SETUGT);
+
+  // Determine the type of the test operands.
+  bool UsePtrType = false;
+  if (!TLI.isTypeLegal(VT))
+    UsePtrType = true;
+  else {
+    for (unsigned i = 0, e = B.Cases.size(); i != e; ++i)
+      if (!isUIntN(VT.getSizeInBits(), B.Cases[i].Mask)) {
+        // Switch table case range are encoded into series of masks.
+        // Just use pointer type, it's guaranteed to fit.
+        UsePtrType = true;
+        break;
+      }
+  }
+  if (UsePtrType) {
+    VT = TLI.getPointerTy();
+    Sub = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), VT);
+  }
+
+  B.RegVT = VT.getSimpleVT();
+  B.Reg = FuncInfo.CreateReg(B.RegVT);
+  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(),
+                                    B.Reg, Sub);
+
+  // Set NextBlock to be the MBB immediately after the current one, if any.
+  // This is used to avoid emitting unnecessary branches to the next block.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = SwitchBB;
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  MachineBasicBlock* MBB = B.Cases[0].ThisBB;
+
+  addSuccessorWithWeight(SwitchBB, B.Default);
+  addSuccessorWithWeight(SwitchBB, MBB);
+
+  SDValue BrRange = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+                                MVT::Other, CopyTo, RangeCmp,
+                                DAG.getBasicBlock(B.Default));
+
+  if (MBB != NextBlock)
+    BrRange = DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, CopyTo,
+                          DAG.getBasicBlock(MBB));
+
+  DAG.setRoot(BrRange);
+}
+
+/// visitBitTestCase - this function produces one "bit test"
+void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
+                                           MachineBasicBlock* NextMBB,
+                                           uint32_t BranchWeightToNext,
+                                           unsigned Reg,
+                                           BitTestCase &B,
+                                           MachineBasicBlock *SwitchBB) {
+  MVT VT = BB.RegVT;
+  SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(),
+                                       Reg, VT);
+  SDValue Cmp;
+  unsigned PopCount = CountPopulation_64(B.Mask);
+  if (PopCount == 1) {
+    // Testing for a single bit; just compare the shift count with what it
+    // would need to be to shift a 1 bit in that position.
+    Cmp = DAG.getSetCC(getCurDebugLoc(),
+                       TLI.getSetCCResultType(VT),
+                       ShiftOp,
+                       DAG.getConstant(CountTrailingZeros_64(B.Mask), VT),
+                       ISD::SETEQ);
+  } else if (PopCount == BB.Range) {
+    // There is only one zero bit in the range, test for it directly.
+    Cmp = DAG.getSetCC(getCurDebugLoc(),
+                       TLI.getSetCCResultType(VT),
+                       ShiftOp,
+                       DAG.getConstant(CountTrailingOnes_64(B.Mask), VT),
+                       ISD::SETNE);
+  } else {
+    // Make desired shift
+    SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurDebugLoc(), VT,
+                                    DAG.getConstant(1, VT), ShiftOp);
+
+    // Emit bit tests and jumps
+    SDValue AndOp = DAG.getNode(ISD::AND, getCurDebugLoc(),
+                                VT, SwitchVal, DAG.getConstant(B.Mask, VT));
+    Cmp = DAG.getSetCC(getCurDebugLoc(),
+                       TLI.getSetCCResultType(VT),
+                       AndOp, DAG.getConstant(0, VT),
+                       ISD::SETNE);
+  }
+
+  // The branch weight from SwitchBB to B.TargetBB is B.ExtraWeight.
+  addSuccessorWithWeight(SwitchBB, B.TargetBB, B.ExtraWeight);
+  // The branch weight from SwitchBB to NextMBB is BranchWeightToNext.
+  addSuccessorWithWeight(SwitchBB, NextMBB, BranchWeightToNext);
+
+  SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+                              MVT::Other, getControlRoot(),
+                              Cmp, DAG.getBasicBlock(B.TargetBB));
+
+  // Set NextBlock to be the MBB immediately after the current one, if any.
+  // This is used to avoid emitting unnecessary branches to the next block.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = SwitchBB;
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  if (NextMBB != NextBlock)
+    BrAnd = DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrAnd,
+                        DAG.getBasicBlock(NextMBB));
+
+  DAG.setRoot(BrAnd);
+}
+
+void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
+  MachineBasicBlock *InvokeMBB = FuncInfo.MBB;
+
+  // Retrieve successors.
+  MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)];
+  MachineBasicBlock *LandingPad = FuncInfo.MBBMap[I.getSuccessor(1)];
+
+  const Value *Callee(I.getCalledValue());
+  const Function *Fn = dyn_cast<Function>(Callee);
+  if (isa<InlineAsm>(Callee))
+    visitInlineAsm(&I);
+  else if (Fn && Fn->isIntrinsic()) {
+    assert(Fn->getIntrinsicID() == Intrinsic::donothing);
+    // Ignore invokes to @llvm.donothing: jump directly to the next BB.
+  } else
+    LowerCallTo(&I, getValue(Callee), false, LandingPad);
+
+  // If the value of the invoke is used outside of its defining block, make it
+  // available as a virtual register.
+  CopyToExportRegsIfNeeded(&I);
+
+  // Update successor info
+  addSuccessorWithWeight(InvokeMBB, Return);
+  addSuccessorWithWeight(InvokeMBB, LandingPad);
+
+  // Drop into normal successor.
+  DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+                          MVT::Other, getControlRoot(),
+                          DAG.getBasicBlock(Return)));
+}
+
+void SelectionDAGBuilder::visitResume(const ResumeInst &RI) {
+  llvm_unreachable("SelectionDAGBuilder shouldn't visit resume instructions!");
+}
+
+void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
+  assert(FuncInfo.MBB->isLandingPad() &&
+         "Call to landingpad not in landing pad!");
+
+  MachineBasicBlock *MBB = FuncInfo.MBB;
+  MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+  AddLandingPadInfo(LP, MMI, MBB);
+
+  // If there aren't registers to copy the values into (e.g., during SjLj
+  // exceptions), then don't bother to create these DAG nodes.
+  if (TLI.getExceptionPointerRegister() == 0 &&
+      TLI.getExceptionSelectorRegister() == 0)
+    return;
+
+  SmallVector<EVT, 2> ValueVTs;
+  ComputeValueVTs(TLI, LP.getType(), ValueVTs);
+
+  // Insert the EXCEPTIONADDR instruction.
+  assert(FuncInfo.MBB->isLandingPad() &&
+         "Call to eh.exception not in landing pad!");
+  SDVTList VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other);
+  SDValue Ops[2];
+  Ops[0] = DAG.getRoot();
+  SDValue Op1 = DAG.getNode(ISD::EXCEPTIONADDR, getCurDebugLoc(), VTs, Ops, 1);
+  SDValue Chain = Op1.getValue(1);
+
+  // Insert the EHSELECTION instruction.
+  VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other);
+  Ops[0] = Op1;
+  Ops[1] = Chain;
+  SDValue Op2 = DAG.getNode(ISD::EHSELECTION, getCurDebugLoc(), VTs, Ops, 2);
+  Chain = Op2.getValue(1);
+  Op2 = DAG.getSExtOrTrunc(Op2, getCurDebugLoc(), MVT::i32);
+
+  Ops[0] = Op1;
+  Ops[1] = Op2;
+  SDValue Res = DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                            DAG.getVTList(&ValueVTs[0], ValueVTs.size()),
+                            &Ops[0], 2);
+
+  std::pair<SDValue, SDValue> RetPair = std::make_pair(Res, Chain);
+  setValue(&LP, RetPair.first);
+  DAG.setRoot(RetPair.second);
+}
+
+/// handleSmallSwitchCaseRange - Emit a series of specific tests (suitable for
+/// small case ranges).
+bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
+                                                 CaseRecVector& WorkList,
+                                                 const Value* SV,
+                                                 MachineBasicBlock *Default,
+                                                 MachineBasicBlock *SwitchBB) {
+  // Size is the number of Cases represented by this range.
+  size_t Size = CR.Range.second - CR.Range.first;
+  if (Size > 3)
+    return false;
+
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // Figure out which block is immediately after the current one.
+  MachineBasicBlock *NextBlock = 0;
+  MachineFunction::iterator BBI = CR.CaseBB;
+
+  if (++BBI != FuncInfo.MF->end())
+    NextBlock = BBI;
+
+  BranchProbabilityInfo *BPI = FuncInfo.BPI;
+  // If any two of the cases has the same destination, and if one value
+  // is the same as the other, but has one bit unset that the other has set,
+  // use bit manipulation to do two compares at once.  For example:
+  // "if (X == 6 || X == 4)" -> "if ((X|2) == 6)"
+  // TODO: This could be extended to merge any 2 cases in switches with 3 cases.
+  // TODO: Handle cases where CR.CaseBB != SwitchBB.
+  if (Size == 2 && CR.CaseBB == SwitchBB) {
+    Case &Small = *CR.Range.first;
+    Case &Big = *(CR.Range.second-1);
+
+    if (Small.Low == Small.High && Big.Low == Big.High && Small.BB == Big.BB) {
+      const APInt& SmallValue = cast<ConstantInt>(Small.Low)->getValue();
+      const APInt& BigValue = cast<ConstantInt>(Big.Low)->getValue();
+
+      // Check that there is only one bit different.
+      if (BigValue.countPopulation() == SmallValue.countPopulation() + 1 &&
+          (SmallValue | BigValue) == BigValue) {
+        // Isolate the common bit.
+        APInt CommonBit = BigValue & ~SmallValue;
+        assert((SmallValue | CommonBit) == BigValue &&
+               CommonBit.countPopulation() == 1 && "Not a common bit?");
+
+        SDValue CondLHS = getValue(SV);
+        EVT VT = CondLHS.getValueType();
+        DebugLoc DL = getCurDebugLoc();
+
+        SDValue Or = DAG.getNode(ISD::OR, DL, VT, CondLHS,
+                                 DAG.getConstant(CommonBit, VT));
+        SDValue Cond = DAG.getSetCC(DL, MVT::i1,
+                                    Or, DAG.getConstant(BigValue, VT),
+                                    ISD::SETEQ);
+
+        // Update successor info.
+        // Both Small and Big will jump to Small.BB, so we sum up the weights.
+        addSuccessorWithWeight(SwitchBB, Small.BB,
+                               Small.ExtraWeight + Big.ExtraWeight);
+        addSuccessorWithWeight(SwitchBB, Default,
+          // The default destination is the first successor in IR.
+          BPI ? BPI->getEdgeWeight(SwitchBB->getBasicBlock(), (unsigned)0) : 0);
+
+        // Insert the true branch.
+        SDValue BrCond = DAG.getNode(ISD::BRCOND, DL, MVT::Other,
+                                     getControlRoot(), Cond,
+                                     DAG.getBasicBlock(Small.BB));
+
+        // Insert the false branch.
+        BrCond = DAG.getNode(ISD::BR, DL, MVT::Other, BrCond,
+                             DAG.getBasicBlock(Default));
+
+        DAG.setRoot(BrCond);
+        return true;
+      }
+    }
+  }
+
+  // Order cases by weight so the most likely case will be checked first.
+  uint32_t UnhandledWeights = 0;
+  if (BPI) {
+    for (CaseItr I = CR.Range.first, IE = CR.Range.second; I != IE; ++I) {
+      uint32_t IWeight = I->ExtraWeight;
+      UnhandledWeights += IWeight;
+      for (CaseItr J = CR.Range.first; J < I; ++J) {
+        uint32_t JWeight = J->ExtraWeight;
+        if (IWeight > JWeight)
+          std::swap(*I, *J);
+      }
+    }
+  }
+  // Rearrange the case blocks so that the last one falls through if possible.
+  Case &BackCase = *(CR.Range.second-1);
+  if (Size > 1 &&
+      NextBlock && Default != NextBlock && BackCase.BB != NextBlock) {
+    // The last case block won't fall through into 'NextBlock' if we emit the
+    // branches in this order.  See if rearranging a case value would help.
+    // We start at the bottom as it's the case with the least weight.
+    for (Case *I = &*(CR.Range.second-2), *E = &*CR.Range.first-1; I != E; --I){
+      if (I->BB == NextBlock) {
+        std::swap(*I, BackCase);
+        break;
+      }
+    }
+  }
+
+  // Create a CaseBlock record representing a conditional branch to
+  // the Case's target mbb if the value being switched on SV is equal
+  // to C.
+  MachineBasicBlock *CurBlock = CR.CaseBB;
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
+    MachineBasicBlock *FallThrough;
+    if (I != E-1) {
+      FallThrough = CurMF->CreateMachineBasicBlock(CurBlock->getBasicBlock());
+      CurMF->insert(BBI, FallThrough);
+
+      // Put SV in a virtual register to make it available from the new blocks.
+      ExportFromCurrentBlock(SV);
+    } else {
+      // If the last case doesn't match, go to the default block.
+      FallThrough = Default;
+    }
+
+    const Value *RHS, *LHS, *MHS;
+    ISD::CondCode CC;
+    if (I->High == I->Low) {
+      // This is just small small case range :) containing exactly 1 case
+      CC = ISD::SETEQ;
+      LHS = SV; RHS = I->High; MHS = NULL;
+    } else {
+      CC = ISD::SETCC_INVALID; 
+      LHS = I->Low; MHS = SV; RHS = I->High;
+    }
+
+    // The false weight should be sum of all un-handled cases.
+    UnhandledWeights -= I->ExtraWeight;
+    CaseBlock CB(CC, LHS, RHS, MHS, /* truebb */ I->BB, /* falsebb */ FallThrough,
+                 /* me */ CurBlock,
+                 /* trueweight */ I->ExtraWeight,
+                 /* falseweight */ UnhandledWeights);
+
+    // If emitting the first comparison, just call visitSwitchCase to emit the
+    // code into the current block.  Otherwise, push the CaseBlock onto the
+    // vector to be later processed by SDISel, and insert the node's MBB
+    // before the next MBB.
+    if (CurBlock == SwitchBB)
+      visitSwitchCase(CB, SwitchBB);
+    else
+      SwitchCases.push_back(CB);
+
+    CurBlock = FallThrough;
+  }
+
+  return true;
+}
+
+static inline bool areJTsAllowed(const TargetLowering &TLI) {
+  return TLI.supportJumpTables() &&
+          (TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+           TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
+}
+
+static APInt ComputeRange(const APInt &First, const APInt &Last) {
+  uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1;
+  APInt LastExt = Last.zext(BitWidth), FirstExt = First.zext(BitWidth);
+  return (LastExt - FirstExt + 1ULL);
+}
+
+/// handleJTSwitchCase - Emit jumptable for current switch case range
+bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
+                                             CaseRecVector &WorkList,
+                                             const Value *SV,
+                                             MachineBasicBlock *Default,
+                                             MachineBasicBlock *SwitchBB) {
+  Case& FrontCase = *CR.Range.first;
+  Case& BackCase  = *(CR.Range.second-1);
+
+  const APInt &First = cast<ConstantInt>(FrontCase.Low)->getValue();
+  const APInt &Last  = cast<ConstantInt>(BackCase.High)->getValue();
+
+  APInt TSize(First.getBitWidth(), 0);
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I)
+    TSize += I->size();
+
+  if (!areJTsAllowed(TLI) || TSize.ult(TLI.getMinimumJumpTableEntries()))
+    return false;
+
+  APInt Range = ComputeRange(First, Last);
+  // The density is TSize / Range. Require at least 40%.
+  // It should not be possible for IntTSize to saturate for sane code, but make
+  // sure we handle Range saturation correctly.
+  uint64_t IntRange = Range.getLimitedValue(UINT64_MAX/10);
+  uint64_t IntTSize = TSize.getLimitedValue(UINT64_MAX/10);
+  if (IntTSize * 10 < IntRange * 4)
+    return false;
+
+  DEBUG(dbgs() << "Lowering jump table\n"
+               << "First entry: " << First << ". Last entry: " << Last << '\n'
+               << "Range: " << Range << ". Size: " << TSize << ".\n\n");
+
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // Figure out which block is immediately after the current one.
+  MachineFunction::iterator BBI = CR.CaseBB;
+  ++BBI;
+
+  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
+
+  // Create a new basic block to hold the code for loading the address
+  // of the jump table, and jumping to it.  Update successor information;
+  // we will either branch to the default case for the switch, or the jump
+  // table.
+  MachineBasicBlock *JumpTableBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+  CurMF->insert(BBI, JumpTableBB);
+
+  addSuccessorWithWeight(CR.CaseBB, Default);
+  addSuccessorWithWeight(CR.CaseBB, JumpTableBB);
+
+  // Build a vector of destination BBs, corresponding to each target
+  // of the jump table. If the value of the jump table slot corresponds to
+  // a case statement, push the case's BB onto the vector, otherwise, push
+  // the default BB.
+  std::vector<MachineBasicBlock*> DestBBs;
+  APInt TEI = First;
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++TEI) {
+    const APInt &Low = cast<ConstantInt>(I->Low)->getValue();
+    const APInt &High = cast<ConstantInt>(I->High)->getValue();
+
+    if (Low.ule(TEI) && TEI.ule(High)) {
+      DestBBs.push_back(I->BB);
+      if (TEI==High)
+        ++I;
+    } else {
+      DestBBs.push_back(Default);
+    }
+  }
+
+  // Calculate weight for each unique destination in CR.
+  DenseMap<MachineBasicBlock*, uint32_t> DestWeights;
+  if (FuncInfo.BPI)
+    for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
+      DenseMap<MachineBasicBlock*, uint32_t>::iterator Itr =
+          DestWeights.find(I->BB);
+      if (Itr != DestWeights.end()) 
+        Itr->second += I->ExtraWeight;
+      else
+        DestWeights[I->BB] = I->ExtraWeight;
+    }
+
+  // Update successor info. Add one edge to each unique successor.
+  BitVector SuccsHandled(CR.CaseBB->getParent()->getNumBlockIDs());
+  for (std::vector<MachineBasicBlock*>::iterator I = DestBBs.begin(),
+         E = DestBBs.end(); I != E; ++I) {
+    if (!SuccsHandled[(*I)->getNumber()]) {
+      SuccsHandled[(*I)->getNumber()] = true;
+      DenseMap<MachineBasicBlock*, uint32_t>::iterator Itr =
+          DestWeights.find(*I);
+      addSuccessorWithWeight(JumpTableBB, *I,
+                             Itr != DestWeights.end() ? Itr->second : 0);
+    }
+  }
+
+  // Create a jump table index for this jump table.
+  unsigned JTEncoding = TLI.getJumpTableEncoding();
+  unsigned JTI = CurMF->getOrCreateJumpTableInfo(JTEncoding)
+                       ->createJumpTableIndex(DestBBs);
+
+  // Set the jump table information so that we can codegen it as a second
+  // MachineBasicBlock
+  JumpTable JT(-1U, JTI, JumpTableBB, Default);
+  JumpTableHeader JTH(First, Last, SV, CR.CaseBB, (CR.CaseBB == SwitchBB));
+  if (CR.CaseBB == SwitchBB)
+    visitJumpTableHeader(JT, JTH, SwitchBB);
+
+  JTCases.push_back(JumpTableBlock(JTH, JT));
+  return true;
+}
+
+/// handleBTSplitSwitchCase - emit comparison and split binary search tree into
+/// 2 subtrees.
+bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
+                                                  CaseRecVector& WorkList,
+                                                  const Value* SV,
+                                                  MachineBasicBlock *Default,
+                                                  MachineBasicBlock *SwitchBB) {
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // Figure out which block is immediately after the current one.
+  MachineFunction::iterator BBI = CR.CaseBB;
+  ++BBI;
+
+  Case& FrontCase = *CR.Range.first;
+  Case& BackCase  = *(CR.Range.second-1);
+  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
+
+  // Size is the number of Cases represented by this range.
+  unsigned Size = CR.Range.second - CR.Range.first;
+
+  const APInt &First = cast<ConstantInt>(FrontCase.Low)->getValue();
+  const APInt &Last  = cast<ConstantInt>(BackCase.High)->getValue();
+  double FMetric = 0;
+  CaseItr Pivot = CR.Range.first + Size/2;
+
+  // Select optimal pivot, maximizing sum density of LHS and RHS. This will
+  // (heuristically) allow us to emit JumpTable's later.
+  APInt TSize(First.getBitWidth(), 0);
+  for (CaseItr I = CR.Range.first, E = CR.Range.second;
+       I!=E; ++I)
+    TSize += I->size();
+
+  APInt LSize = FrontCase.size();
+  APInt RSize = TSize-LSize;
+  DEBUG(dbgs() << "Selecting best pivot: \n"
+               << "First: " << First << ", Last: " << Last <<'\n'
+               << "LSize: " << LSize << ", RSize: " << RSize << '\n');
+  for (CaseItr I = CR.Range.first, J=I+1, E = CR.Range.second;
+       J!=E; ++I, ++J) {
+    const APInt &LEnd = cast<ConstantInt>(I->High)->getValue();
+    const APInt &RBegin = cast<ConstantInt>(J->Low)->getValue();
+    APInt Range = ComputeRange(LEnd, RBegin);
+    assert((Range - 2ULL).isNonNegative() &&
+           "Invalid case distance");
+    // Use volatile double here to avoid excess precision issues on some hosts,
+    // e.g. that use 80-bit X87 registers.
+    volatile double LDensity =
+       (double)LSize.roundToDouble() /
+                           (LEnd - First + 1ULL).roundToDouble();
+    volatile double RDensity =
+      (double)RSize.roundToDouble() /
+                           (Last - RBegin + 1ULL).roundToDouble();
+    double Metric = Range.logBase2()*(LDensity+RDensity);
+    // Should always split in some non-trivial place
+    DEBUG(dbgs() <<"=>Step\n"
+                 << "LEnd: " << LEnd << ", RBegin: " << RBegin << '\n'
+                 << "LDensity: " << LDensity
+                 << ", RDensity: " << RDensity << '\n'
+                 << "Metric: " << Metric << '\n');
+    if (FMetric < Metric) {
+      Pivot = J;
+      FMetric = Metric;
+      DEBUG(dbgs() << "Current metric set to: " << FMetric << '\n');
+    }
+
+    LSize += J->size();
+    RSize -= J->size();
+  }
+  if (areJTsAllowed(TLI)) {
+    // If our case is dense we *really* should handle it earlier!
+    assert((FMetric > 0) && "Should handle dense range earlier!");
+  } else {
+    Pivot = CR.Range.first + Size/2;
+  }
+
+  CaseRange LHSR(CR.Range.first, Pivot);
+  CaseRange RHSR(Pivot, CR.Range.second);
+  const Constant *C = Pivot->Low;
+  MachineBasicBlock *FalseBB = 0, *TrueBB = 0;
+
+  // We know that we branch to the LHS if the Value being switched on is
+  // less than the Pivot value, C.  We use this to optimize our binary
+  // tree a bit, by recognizing that if SV is greater than or equal to the
+  // LHS's Case Value, and that Case Value is exactly one less than the
+  // Pivot's Value, then we can branch directly to the LHS's Target,
+  // rather than creating a leaf node for it.
+  if ((LHSR.second - LHSR.first) == 1 &&
+      LHSR.first->High == CR.GE &&
+      cast<ConstantInt>(C)->getValue() ==
+      (cast<ConstantInt>(CR.GE)->getValue() + 1LL)) {
+    TrueBB = LHSR.first->BB;
+  } else {
+    TrueBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+    CurMF->insert(BBI, TrueBB);
+    WorkList.push_back(CaseRec(TrueBB, C, CR.GE, LHSR));
+
+    // Put SV in a virtual register to make it available from the new blocks.
+    ExportFromCurrentBlock(SV);
+  }
+
+  // Similar to the optimization above, if the Value being switched on is
+  // known to be less than the Constant CR.LT, and the current Case Value
+  // is CR.LT - 1, then we can branch directly to the target block for
+  // the current Case Value, rather than emitting a RHS leaf node for it.
+  if ((RHSR.second - RHSR.first) == 1 && CR.LT &&
+      cast<ConstantInt>(RHSR.first->Low)->getValue() ==
+      (cast<ConstantInt>(CR.LT)->getValue() - 1LL)) {
+    FalseBB = RHSR.first->BB;
+  } else {
+    FalseBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+    CurMF->insert(BBI, FalseBB);
+    WorkList.push_back(CaseRec(FalseBB,CR.LT,C,RHSR));
+
+    // Put SV in a virtual register to make it available from the new blocks.
+    ExportFromCurrentBlock(SV);
+  }
+
+  // Create a CaseBlock record representing a conditional branch to
+  // the LHS node if the value being switched on SV is less than C.
+  // Otherwise, branch to LHS.
+  CaseBlock CB(ISD::SETULT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
+
+  if (CR.CaseBB == SwitchBB)
+    visitSwitchCase(CB, SwitchBB);
+  else
+    SwitchCases.push_back(CB);
+
+  return true;
+}
+
+/// handleBitTestsSwitchCase - if current case range has few destination and
+/// range span less, than machine word bitwidth, encode case range into series
+/// of masks and emit bit tests with these masks.
+bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
+                                                   CaseRecVector& WorkList,
+                                                   const Value* SV,
+                                                   MachineBasicBlock* Default,
+                                                   MachineBasicBlock *SwitchBB){
+  EVT PTy = TLI.getPointerTy();
+  unsigned IntPtrBits = PTy.getSizeInBits();
+
+  Case& FrontCase = *CR.Range.first;
+  Case& BackCase  = *(CR.Range.second-1);
+
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // If target does not have legal shift left, do not emit bit tests at all.
+  if (!TLI.isOperationLegal(ISD::SHL, TLI.getPointerTy()))
+    return false;
+
+  size_t numCmps = 0;
+  for (CaseItr I = CR.Range.first, E = CR.Range.second;
+       I!=E; ++I) {
+    // Single case counts one, case range - two.
+    numCmps += (I->Low == I->High ? 1 : 2);
+  }
+
+  // Count unique destinations
+  SmallSet<MachineBasicBlock*, 4> Dests;
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) {
+    Dests.insert(I->BB);
+    if (Dests.size() > 3)
+      // Don't bother the code below, if there are too much unique destinations
+      return false;
+  }
+  DEBUG(dbgs() << "Total number of unique destinations: "
+        << Dests.size() << '\n'
+        << "Total number of comparisons: " << numCmps << '\n');
+
+  // Compute span of values.
+  const APInt& minValue = cast<ConstantInt>(FrontCase.Low)->getValue();
+  const APInt& maxValue = cast<ConstantInt>(BackCase.High)->getValue();
+  APInt cmpRange = maxValue - minValue;
+
+  DEBUG(dbgs() << "Compare range: " << cmpRange << '\n'
+               << "Low bound: " << minValue << '\n'
+               << "High bound: " << maxValue << '\n');
+
+  if (cmpRange.uge(IntPtrBits) ||
+      (!(Dests.size() == 1 && numCmps >= 3) &&
+       !(Dests.size() == 2 && numCmps >= 5) &&
+       !(Dests.size() >= 3 && numCmps >= 6)))
+    return false;
+
+  DEBUG(dbgs() << "Emitting bit tests\n");
+  APInt lowBound = APInt::getNullValue(cmpRange.getBitWidth());
+
+  // Optimize the case where all the case values fit in a
+  // word without having to subtract minValue. In this case,
+  // we can optimize away the subtraction.
+  if (maxValue.ult(IntPtrBits)) {
+    cmpRange = maxValue;
+  } else {
+    lowBound = minValue;
+  }
+
+  CaseBitsVector CasesBits;
+  unsigned i, count = 0;
+
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) {
+    MachineBasicBlock* Dest = I->BB;
+    for (i = 0; i < count; ++i)
+      if (Dest == CasesBits[i].BB)
+        break;
+
+    if (i == count) {
+      assert((count < 3) && "Too much destinations to test!");
+      CasesBits.push_back(CaseBits(0, Dest, 0, 0/*Weight*/));
+      count++;
+    }
+
+    const APInt& lowValue = cast<ConstantInt>(I->Low)->getValue();
+    const APInt& highValue = cast<ConstantInt>(I->High)->getValue();
+
+    uint64_t lo = (lowValue - lowBound).getZExtValue();
+    uint64_t hi = (highValue - lowBound).getZExtValue();
+    CasesBits[i].ExtraWeight += I->ExtraWeight;
+
+    for (uint64_t j = lo; j <= hi; j++) {
+      CasesBits[i].Mask |=  1ULL << j;
+      CasesBits[i].Bits++;
+    }
+
+  }
+  std::sort(CasesBits.begin(), CasesBits.end(), CaseBitsCmp());
+
+  BitTestInfo BTC;
+
+  // Figure out which block is immediately after the current one.
+  MachineFunction::iterator BBI = CR.CaseBB;
+  ++BBI;
+
+  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
+
+  DEBUG(dbgs() << "Cases:\n");
+  for (unsigned i = 0, e = CasesBits.size(); i!=e; ++i) {
+    DEBUG(dbgs() << "Mask: " << CasesBits[i].Mask
+                 << ", Bits: " << CasesBits[i].Bits
+                 << ", BB: " << CasesBits[i].BB << '\n');
+
+    MachineBasicBlock *CaseBB = CurMF->CreateMachineBasicBlock(LLVMBB);
+    CurMF->insert(BBI, CaseBB);
+    BTC.push_back(BitTestCase(CasesBits[i].Mask,
+                              CaseBB,
+                              CasesBits[i].BB, CasesBits[i].ExtraWeight));
+
+    // Put SV in a virtual register to make it available from the new blocks.
+    ExportFromCurrentBlock(SV);
+  }
+
+  BitTestBlock BTB(lowBound, cmpRange, SV,
+                   -1U, MVT::Other, (CR.CaseBB == SwitchBB),
+                   CR.CaseBB, Default, BTC);
+
+  if (CR.CaseBB == SwitchBB)
+    visitBitTestHeader(BTB, SwitchBB);
+
+  BitTestCases.push_back(BTB);
+
+  return true;
+}
+
+/// Clusterify - Transform simple list of Cases into list of CaseRange's
+size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
+                                       const SwitchInst& SI) {
+  
+  /// Use a shorter form of declaration, and also
+  /// show the we want to use CRSBuilder as Clusterifier.
+  typedef IntegersSubsetMapping<MachineBasicBlock> Clusterifier;
+  
+  Clusterifier TheClusterifier;
+
+  BranchProbabilityInfo *BPI = FuncInfo.BPI;
+  // Start with "simple" cases
+  for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
+       i != e; ++i) {
+    const BasicBlock *SuccBB = i.getCaseSuccessor();
+    MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
+
+    TheClusterifier.add(i.getCaseValueEx(), SMBB, 
+        BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0);
+  }
+  
+  TheClusterifier.optimize();
+  
+  size_t numCmps = 0;
+  for (Clusterifier::RangeIterator i = TheClusterifier.begin(),
+       e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
+    Clusterifier::Cluster &C = *i;
+    // Update edge weight for the cluster.
+    unsigned W = C.first.Weight;
+
+    // FIXME: Currently work with ConstantInt based numbers.
+    // Changing it to APInt based is a pretty heavy for this commit.
+    Cases.push_back(Case(C.first.getLow().toConstantInt(),
+                         C.first.getHigh().toConstantInt(), C.second, W));
+    
+    if (C.first.getLow() != C.first.getHigh())
+    // A range counts double, since it requires two compares.
+    ++numCmps;
+  }
+
+  return numCmps;
+}
+
+void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First,
+                                           MachineBasicBlock *Last) {
+  // Update JTCases.
+  for (unsigned i = 0, e = JTCases.size(); i != e; ++i)
+    if (JTCases[i].first.HeaderBB == First)
+      JTCases[i].first.HeaderBB = Last;
+
+  // Update BitTestCases.
+  for (unsigned i = 0, e = BitTestCases.size(); i != e; ++i)
+    if (BitTestCases[i].Parent == First)
+      BitTestCases[i].Parent = Last;
+}
+
+void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
+  MachineBasicBlock *SwitchMBB = FuncInfo.MBB;
+
+  // Figure out which block is immediately after the current one.
+  MachineBasicBlock *NextBlock = 0;
+  MachineBasicBlock *Default = FuncInfo.MBBMap[SI.getDefaultDest()];
+
+  // If there is only the default destination, branch to it if it is not the
+  // next basic block.  Otherwise, just fall through.
+  if (!SI.getNumCases()) {
+    // Update machine-CFG edges.
+
+    // If this is not a fall-through branch, emit the branch.
+    SwitchMBB->addSuccessor(Default);
+    if (Default != NextBlock)
+      DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+                              MVT::Other, getControlRoot(),
+                              DAG.getBasicBlock(Default)));
+
+    return;
+  }
+
+  // If there are any non-default case statements, create a vector of Cases
+  // representing each one, and sort the vector so that we can efficiently
+  // create a binary search tree from them.
+  CaseVector Cases;
+  size_t numCmps = Clusterify(Cases, SI);
+  DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+               << ". Total compares: " << numCmps << '\n');
+  (void)numCmps;
+
+  // Get the Value to be switched on and default basic blocks, which will be
+  // inserted into CaseBlock records, representing basic blocks in the binary
+  // search tree.
+  const Value *SV = SI.getCondition();
+
+  // Push the initial CaseRec onto the worklist
+  CaseRecVector WorkList;
+  WorkList.push_back(CaseRec(SwitchMBB,0,0,
+                             CaseRange(Cases.begin(),Cases.end())));
+
+  while (!WorkList.empty()) {
+    // Grab a record representing a case range to process off the worklist
+    CaseRec CR = WorkList.back();
+    WorkList.pop_back();
+
+    if (handleBitTestsSwitchCase(CR, WorkList, SV, Default, SwitchMBB))
+      continue;
+
+    // If the range has few cases (two or less) emit a series of specific
+    // tests.
+    if (handleSmallSwitchRange(CR, WorkList, SV, Default, SwitchMBB))
+      continue;
+
+    // If the switch has more than N blocks, and is at least 40% dense, and the
+    // target supports indirect branches, then emit a jump table rather than
+    // lowering the switch to a binary tree of conditional branches.
+    // N defaults to 4 and is controlled via TLS.getMinimumJumpTableEntries().
+    if (handleJTSwitchCase(CR, WorkList, SV, Default, SwitchMBB))
+      continue;
+
+    // Emit binary tree. We need to pick a pivot, and push left and right ranges
+    // onto the worklist. Leafs are handled via handleSmallSwitchRange() call.
+    handleBTSplitSwitchCase(CR, WorkList, SV, Default, SwitchMBB);
+  }
+}
+
+void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
+  MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB;
+
+  // Update machine-CFG edges with unique successors.
+  SmallSet<BasicBlock*, 32> Done;
+  for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) {
+    BasicBlock *BB = I.getSuccessor(i);
+    bool Inserted = Done.insert(BB);
+    if (!Inserted)
+        continue;
+
+    MachineBasicBlock *Succ = FuncInfo.MBBMap[BB];
+    addSuccessorWithWeight(IndirectBrMBB, Succ);
+  }
+
+  DAG.setRoot(DAG.getNode(ISD::BRIND, getCurDebugLoc(),
+                          MVT::Other, getControlRoot(),
+                          getValue(I.getAddress())));
+}
+
+void SelectionDAGBuilder::visitFSub(const User &I) {
+  // -0.0 - X --> fneg
+  Type *Ty = I.getType();
+  if (isa<Constant>(I.getOperand(0)) &&
+      I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) {
+    SDValue Op2 = getValue(I.getOperand(1));
+    setValue(&I, DAG.getNode(ISD::FNEG, getCurDebugLoc(),
+                             Op2.getValueType(), Op2));
+    return;
+  }
+
+  visitBinary(I, ISD::FSUB);
+}
+
+void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
+  SDValue Op1 = getValue(I.getOperand(0));
+  SDValue Op2 = getValue(I.getOperand(1));
+  setValue(&I, DAG.getNode(OpCode, getCurDebugLoc(),
+                           Op1.getValueType(), Op1, Op2));
+}
+
+void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
+  SDValue Op1 = getValue(I.getOperand(0));
+  SDValue Op2 = getValue(I.getOperand(1));
+
+  EVT ShiftTy = TLI.getShiftAmountTy(Op2.getValueType());
+
+  // Coerce the shift amount to the right type if we can.
+  if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
+    unsigned ShiftSize = ShiftTy.getSizeInBits();
+    unsigned Op2Size = Op2.getValueType().getSizeInBits();
+    DebugLoc DL = getCurDebugLoc();
+
+    // If the operand is smaller than the shift count type, promote it.
+    if (ShiftSize > Op2Size)
+      Op2 = DAG.getNode(ISD::ZERO_EXTEND, DL, ShiftTy, Op2);
+
+    // If the operand is larger than the shift count type but the shift
+    // count type has enough bits to represent any shift value, truncate
+    // it now. This is a common case and it exposes the truncate to
+    // optimization early.
+    else if (ShiftSize >= Log2_32_Ceil(Op2.getValueType().getSizeInBits()))
+      Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2);
+    // Otherwise we'll need to temporarily settle for some other convenient
+    // type.  Type legalization will make adjustments once the shiftee is split.
+    else
+      Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32);
+  }
+
+  setValue(&I, DAG.getNode(Opcode, getCurDebugLoc(),
+                           Op1.getValueType(), Op1, Op2));
+}
+
+void SelectionDAGBuilder::visitSDiv(const User &I) {
+  SDValue Op1 = getValue(I.getOperand(0));
+  SDValue Op2 = getValue(I.getOperand(1));
+
+  // Turn exact SDivs into multiplications.
+  // FIXME: This should be in DAGCombiner, but it doesn't have access to the
+  // exact bit.
+  if (isa<BinaryOperator>(&I) && cast<BinaryOperator>(&I)->isExact() &&
+      !isa<ConstantSDNode>(Op1) &&
+      isa<ConstantSDNode>(Op2) && !cast<ConstantSDNode>(Op2)->isNullValue())
+    setValue(&I, TLI.BuildExactSDIV(Op1, Op2, getCurDebugLoc(), DAG));
+  else
+    setValue(&I, DAG.getNode(ISD::SDIV, getCurDebugLoc(), Op1.getValueType(),
+                             Op1, Op2));
+}
+
+void SelectionDAGBuilder::visitICmp(const User &I) {
+  ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
+  if (const ICmpInst *IC = dyn_cast<ICmpInst>(&I))
+    predicate = IC->getPredicate();
+  else if (const ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
+    predicate = ICmpInst::Predicate(IC->getPredicate());
+  SDValue Op1 = getValue(I.getOperand(0));
+  SDValue Op2 = getValue(I.getOperand(1));
+  ISD::CondCode Opcode = getICmpCondCode(predicate);
+
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getSetCC(getCurDebugLoc(), DestVT, Op1, Op2, Opcode));
+}
+
+void SelectionDAGBuilder::visitFCmp(const User &I) {
+  FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
+  if (const FCmpInst *FC = dyn_cast<FCmpInst>(&I))
+    predicate = FC->getPredicate();
+  else if (const ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
+    predicate = FCmpInst::Predicate(FC->getPredicate());
+  SDValue Op1 = getValue(I.getOperand(0));
+  SDValue Op2 = getValue(I.getOperand(1));
+  ISD::CondCode Condition = getFCmpCondCode(predicate);
+  if (TM.Options.NoNaNsFPMath)
+    Condition = getFCmpCodeWithoutNaN(Condition);
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getSetCC(getCurDebugLoc(), DestVT, Op1, Op2, Condition));
+}
+
+void SelectionDAGBuilder::visitSelect(const User &I) {
+  SmallVector<EVT, 4> ValueVTs;
+  ComputeValueVTs(TLI, I.getType(), ValueVTs);
+  unsigned NumValues = ValueVTs.size();
+  if (NumValues == 0) return;
+
+  SmallVector<SDValue, 4> Values(NumValues);
+  SDValue Cond     = getValue(I.getOperand(0));
+  SDValue TrueVal  = getValue(I.getOperand(1));
+  SDValue FalseVal = getValue(I.getOperand(2));
+  ISD::NodeType OpCode = Cond.getValueType().isVector() ?
+    ISD::VSELECT : ISD::SELECT;
+
+  for (unsigned i = 0; i != NumValues; ++i)
+    Values[i] = DAG.getNode(OpCode, getCurDebugLoc(),
+                            TrueVal.getNode()->getValueType(TrueVal.getResNo()+i),
+                            Cond,
+                            SDValue(TrueVal.getNode(),
+                                    TrueVal.getResNo() + i),
+                            SDValue(FalseVal.getNode(),
+                                    FalseVal.getResNo() + i));
+
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                           DAG.getVTList(&ValueVTs[0], NumValues),
+                           &Values[0], NumValues));
+}
+
+void SelectionDAGBuilder::visitTrunc(const User &I) {
+  // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitZExt(const User &I) {
+  // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+  // ZExt also can't be a cast to bool for same reason. So, nothing much to do
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitSExt(const User &I) {
+  // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+  // SExt also can't be a cast to bool for same reason. So, nothing much to do
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPTrunc(const User &I) {
+  // FPTrunc is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurDebugLoc(),
+                           DestVT, N,
+                           DAG.getTargetConstant(0, TLI.getPointerTy())));
+}
+
+void SelectionDAGBuilder::visitFPExt(const User &I){
+  // FPExt is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPToUI(const User &I) {
+  // FPToUI is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPToSI(const User &I) {
+  // FPToSI is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitUIToFP(const User &I) {
+  // UIToFP is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitSIToFP(const User &I){
+  // SIToFP is never a no-op cast, no need to check
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurDebugLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitPtrToInt(const User &I) {
+  // What to do depends on the size of the integer and the size of the pointer.
+  // We can either truncate, zero extend, or no-op, accordingly.
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT));
+}
+
+void SelectionDAGBuilder::visitIntToPtr(const User &I) {
+  // What to do depends on the size of the integer and the size of the pointer.
+  // We can either truncate, zero extend, or no-op, accordingly.
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+  setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT));
+}
+
+void SelectionDAGBuilder::visitBitCast(const User &I) {
+  SDValue N = getValue(I.getOperand(0));
+  EVT DestVT = TLI.getValueType(I.getType());
+
+  // BitCast assures us that source and destination are the same size so this is
+  // either a BITCAST or a no-op.
+  if (DestVT != N.getValueType())
+    setValue(&I, DAG.getNode(ISD::BITCAST, getCurDebugLoc(),
+                             DestVT, N)); // convert types.
+  else
+    setValue(&I, N);            // noop cast.
+}
+
+void SelectionDAGBuilder::visitInsertElement(const User &I) {
+  SDValue InVec = getValue(I.getOperand(0));
+  SDValue InVal = getValue(I.getOperand(1));
+  SDValue InIdx = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+                              TLI.getPointerTy(),
+                              getValue(I.getOperand(2)));
+  setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurDebugLoc(),
+                           TLI.getValueType(I.getType()),
+                           InVec, InVal, InIdx));
+}
+
+void SelectionDAGBuilder::visitExtractElement(const User &I) {
+  SDValue InVec = getValue(I.getOperand(0));
+  SDValue InIdx = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
+                              TLI.getPointerTy(),
+                              getValue(I.getOperand(1)));
+  setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
+                           TLI.getValueType(I.getType()), InVec, InIdx));
+}
+
+// Utility for visitShuffleVector - Return true if every element in Mask,
+// beginning from position Pos and ending in Pos+Size, falls within the
+// specified sequential range [L, L+Pos). or is undef.
+static bool isSequentialInRange(const SmallVectorImpl<int> &Mask,
+                                unsigned Pos, unsigned Size, int Low) {
+  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+    if (Mask[i] >= 0 && Mask[i] != Low)
+      return false;
+  return true;
+}
+
+void SelectionDAGBuilder::visitShuffleVector(const User &I) {
+  SDValue Src1 = getValue(I.getOperand(0));
+  SDValue Src2 = getValue(I.getOperand(1));
+
+  SmallVector<int, 8> Mask;
+  ShuffleVectorInst::getShuffleMask(cast<Constant>(I.getOperand(2)), Mask);
+  unsigned MaskNumElts = Mask.size();
+  
+  EVT VT = TLI.getValueType(I.getType());
+  EVT SrcVT = Src1.getValueType();
+  unsigned SrcNumElts = SrcVT.getVectorNumElements();
+
+  if (SrcNumElts == MaskNumElts) {
+    setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+                                      &Mask[0]));
+    return;
+  }
+
+  // Normalize the shuffle vector since mask and vector length don't match.
+  if (SrcNumElts < MaskNumElts && MaskNumElts % SrcNumElts == 0) {
+    // Mask is longer than the source vectors and is a multiple of the source
+    // vectors.  We can use concatenate vector to make the mask and vectors
+    // lengths match.
+    if (SrcNumElts*2 == MaskNumElts) {
+      // First check for Src1 in low and Src2 in high
+      if (isSequentialInRange(Mask, 0, SrcNumElts, 0) &&
+          isSequentialInRange(Mask, SrcNumElts, SrcNumElts, SrcNumElts)) {
+        // The shuffle is concatenating two vectors together.
+        setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurDebugLoc(),
+                                 VT, Src1, Src2));
+        return;
+      }
+      // Then check for Src2 in low and Src1 in high
+      if (isSequentialInRange(Mask, 0, SrcNumElts, SrcNumElts) &&
+          isSequentialInRange(Mask, SrcNumElts, SrcNumElts, 0)) {
+        // The shuffle is concatenating two vectors together.
+        setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurDebugLoc(),
+                                 VT, Src2, Src1));
+        return;
+      }
+    }
+
+    // Pad both vectors with undefs to make them the same length as the mask.
+    unsigned NumConcat = MaskNumElts / SrcNumElts;
+    bool Src1U = Src1.getOpcode() == ISD::UNDEF;
+    bool Src2U = Src2.getOpcode() == ISD::UNDEF;
+    SDValue UndefVal = DAG.getUNDEF(SrcVT);
+
+    SmallVector<SDValue, 8> MOps1(NumConcat, UndefVal);
+    SmallVector<SDValue, 8> MOps2(NumConcat, UndefVal);
+    MOps1[0] = Src1;
+    MOps2[0] = Src2;
+
+    Src1 = Src1U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
+                                                  getCurDebugLoc(), VT,
+                                                  &MOps1[0], NumConcat);
+    Src2 = Src2U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
+                                                  getCurDebugLoc(), VT,
+                                                  &MOps2[0], NumConcat);
+
+    // Readjust mask for new input vector length.
+    SmallVector<int, 8> MappedOps;
+    for (unsigned i = 0; i != MaskNumElts; ++i) {
+      int Idx = Mask[i];
+      if (Idx >= (int)SrcNumElts)
+        Idx -= SrcNumElts - MaskNumElts;
+      MappedOps.push_back(Idx);
+    }
+
+    setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+                                      &MappedOps[0]));
+    return;
+  }
+
+  if (SrcNumElts > MaskNumElts) {
+    // Analyze the access pattern of the vector to see if we can extract
+    // two subvectors and do the shuffle. The analysis is done by calculating
+    // the range of elements the mask access on both vectors.
+    int MinRange[2] = { static_cast<int>(SrcNumElts),
+                        static_cast<int>(SrcNumElts)};
+    int MaxRange[2] = {-1, -1};
+
+    for (unsigned i = 0; i != MaskNumElts; ++i) {
+      int Idx = Mask[i];
+      unsigned Input = 0;
+      if (Idx < 0)
+        continue;
+
+      if (Idx >= (int)SrcNumElts) {
+        Input = 1;
+        Idx -= SrcNumElts;
+      }
+      if (Idx > MaxRange[Input])
+        MaxRange[Input] = Idx;
+      if (Idx < MinRange[Input])
+        MinRange[Input] = Idx;
+    }
+
+    // Check if the access is smaller than the vector size and can we find
+    // a reasonable extract index.
+    int RangeUse[2] = { -1, -1 };  // 0 = Unused, 1 = Extract, -1 = Can not
+                                   // Extract.
+    int StartIdx[2];  // StartIdx to extract from
+    for (unsigned Input = 0; Input < 2; ++Input) {
+      if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) {
+        RangeUse[Input] = 0; // Unused
+        StartIdx[Input] = 0;
+        continue;
+      }
+
+      // Find a good start index that is a multiple of the mask length. Then
+      // see if the rest of the elements are in range.
+      StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts;
+      if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts &&
+          StartIdx[Input] + MaskNumElts <= SrcNumElts)
+        RangeUse[Input] = 1; // Extract from a multiple of the mask length.
+    }
+
+    if (RangeUse[0] == 0 && RangeUse[1] == 0) {
+      setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
+      return;
+    }
+    if (RangeUse[0] >= 0 && RangeUse[1] >= 0) {
+      // Extract appropriate subvector and generate a vector shuffle
+      for (unsigned Input = 0; Input < 2; ++Input) {
+        SDValue &Src = Input == 0 ? Src1 : Src2;
+        if (RangeUse[Input] == 0)
+          Src = DAG.getUNDEF(VT);
+        else
+          Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, getCurDebugLoc(), VT,
+                            Src, DAG.getIntPtrConstant(StartIdx[Input]));
+      }
+
+      // Calculate new mask.
+      SmallVector<int, 8> MappedOps;
+      for (unsigned i = 0; i != MaskNumElts; ++i) {
+        int Idx = Mask[i];
+        if (Idx >= 0) {
+          if (Idx < (int)SrcNumElts)
+            Idx -= StartIdx[0];
+          else
+            Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
+        }
+        MappedOps.push_back(Idx);
+      }
+
+      setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+                                        &MappedOps[0]));
+      return;
+    }
+  }
+
+  // We can't use either concat vectors or extract subvectors so fall back to
+  // replacing the shuffle with extract and build vector.
+  // to insert and build vector.
+  EVT EltVT = VT.getVectorElementType();
+  EVT PtrVT = TLI.getPointerTy();
+  SmallVector<SDValue,8> Ops;
+  for (unsigned i = 0; i != MaskNumElts; ++i) {
+    int Idx = Mask[i];
+    SDValue Res;
+
+    if (Idx < 0) {
+      Res = DAG.getUNDEF(EltVT);
+    } else {
+      SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2;
+      if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts;
+
+      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
+                        EltVT, Src, DAG.getConstant(Idx, PtrVT));
+    }
+
+    Ops.push_back(Res);
+  }
+
+  setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+                           VT, &Ops[0], Ops.size()));
+}
+
+void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
+  const Value *Op0 = I.getOperand(0);
+  const Value *Op1 = I.getOperand(1);
+  Type *AggTy = I.getType();
+  Type *ValTy = Op1->getType();
+  bool IntoUndef = isa<UndefValue>(Op0);
+  bool FromUndef = isa<UndefValue>(Op1);
+
+  unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
+
+  SmallVector<EVT, 4> AggValueVTs;
+  ComputeValueVTs(TLI, AggTy, AggValueVTs);
+  SmallVector<EVT, 4> ValValueVTs;
+  ComputeValueVTs(TLI, ValTy, ValValueVTs);
+
+  unsigned NumAggValues = AggValueVTs.size();
+  unsigned NumValValues = ValValueVTs.size();
+  SmallVector<SDValue, 4> Values(NumAggValues);
+
+  SDValue Agg = getValue(Op0);
+  unsigned i = 0;
+  // Copy the beginning value(s) from the original aggregate.
+  for (; i != LinearIndex; ++i)
+    Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+                SDValue(Agg.getNode(), Agg.getResNo() + i);
+  // Copy values from the inserted value(s).
+  if (NumValValues) {
+    SDValue Val = getValue(Op1);
+    for (; i != LinearIndex + NumValValues; ++i)
+      Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+                  SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+  }
+  // Copy remaining value(s) from the original aggregate.
+  for (; i != NumAggValues; ++i)
+    Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+                SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                           DAG.getVTList(&AggValueVTs[0], NumAggValues),
+                           &Values[0], NumAggValues));
+}
+
+void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
+  const Value *Op0 = I.getOperand(0);
+  Type *AggTy = Op0->getType();
+  Type *ValTy = I.getType();
+  bool OutOfUndef = isa<UndefValue>(Op0);
+
+  unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
+
+  SmallVector<EVT, 4> ValValueVTs;
+  ComputeValueVTs(TLI, ValTy, ValValueVTs);
+
+  unsigned NumValValues = ValValueVTs.size();
+
+  // Ignore a extractvalue that produces an empty object
+  if (!NumValValues) {
+    setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
+    return;
+  }
+
+  SmallVector<SDValue, 4> Values(NumValValues);
+
+  SDValue Agg = getValue(Op0);
+  // Copy out the selected value(s).
+  for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i)
+    Values[i - LinearIndex] =
+      OutOfUndef ?
+        DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) :
+        SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                           DAG.getVTList(&ValValueVTs[0], NumValValues),
+                           &Values[0], NumValValues));
+}
+
+void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
+  SDValue N = getValue(I.getOperand(0));
+  // Note that the pointer operand may be a vector of pointers. Take the scalar
+  // element which holds a pointer.
+  Type *Ty = I.getOperand(0)->getType()->getScalarType();
+
+  for (GetElementPtrInst::const_op_iterator OI = I.op_begin()+1, E = I.op_end();
+       OI != E; ++OI) {
+    const Value *Idx = *OI;
+    if (StructType *StTy = dyn_cast<StructType>(Ty)) {
+      unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue();
+      if (Field) {
+        // N = N + Offset
+        uint64_t Offset = TD->getStructLayout(StTy)->getElementOffset(Field);
+        N = DAG.getNode(ISD::ADD, getCurDebugLoc(), N.getValueType(), N,
+                        DAG.getConstant(Offset, N.getValueType()));
+      }
+
+      Ty = StTy->getElementType(Field);
+    } else {
+      Ty = cast<SequentialType>(Ty)->getElementType();
+
+      // If this is a constant subscript, handle it quickly.
+      if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
+        if (CI->isZero()) continue;
+        uint64_t Offs =
+            TD->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
+        SDValue OffsVal;
+        EVT PTy = TLI.getPointerTy();
+        unsigned PtrBits = PTy.getSizeInBits();
+        if (PtrBits < 64)
+          OffsVal = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
+                                TLI.getPointerTy(),
+                                DAG.getConstant(Offs, MVT::i64));
+        else
+          OffsVal = DAG.getIntPtrConstant(Offs);
+
+        N = DAG.getNode(ISD::ADD, getCurDebugLoc(), N.getValueType(), N,
+                        OffsVal);
+        continue;
+      }
+
+      // N = N + Idx * ElementSize;
+      APInt ElementSize = APInt(TLI.getPointerTy().getSizeInBits(),
+                                TD->getTypeAllocSize(Ty));
+      SDValue IdxN = getValue(Idx);
+
+      // If the index is smaller or larger than intptr_t, truncate or extend
+      // it.
+      IdxN = DAG.getSExtOrTrunc(IdxN, getCurDebugLoc(), N.getValueType());
+
+      // If this is a multiply by a power of two, turn it into a shl
+      // immediately.  This is a very common case.
+      if (ElementSize != 1) {
+        if (ElementSize.isPowerOf2()) {
+          unsigned Amt = ElementSize.logBase2();
+          IdxN = DAG.getNode(ISD::SHL, getCurDebugLoc(),
+                             N.getValueType(), IdxN,
+                             DAG.getConstant(Amt, IdxN.getValueType()));
+        } else {
+          SDValue Scale = DAG.getConstant(ElementSize, IdxN.getValueType());
+          IdxN = DAG.getNode(ISD::MUL, getCurDebugLoc(),
+                             N.getValueType(), IdxN, Scale);
+        }
+      }
+
+      N = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+                      N.getValueType(), N, IdxN);
+    }
+  }
+
+  setValue(&I, N);
+}
+
+void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
+  // If this is a fixed sized alloca in the entry block of the function,
+  // allocate it statically on the stack.
+  if (FuncInfo.StaticAllocaMap.count(&I))
+    return;   // getValue will auto-populate this.
+
+  Type *Ty = I.getAllocatedType();
+  uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
+  unsigned Align =
+    std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty),
+             I.getAlignment());
+
+  SDValue AllocSize = getValue(I.getArraySize());
+
+  EVT IntPtr = TLI.getPointerTy();
+  if (AllocSize.getValueType() != IntPtr)
+    AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurDebugLoc(), IntPtr);
+
+  AllocSize = DAG.getNode(ISD::MUL, getCurDebugLoc(), IntPtr,
+                          AllocSize,
+                          DAG.getConstant(TySize, IntPtr));
+
+  // Handle alignment.  If the requested alignment is less than or equal to
+  // the stack alignment, ignore it.  If the size is greater than or equal to
+  // the stack alignment, we note this in the DYNAMIC_STACKALLOC node.
+  unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+  if (Align <= StackAlign)
+    Align = 0;
+
+  // Round the size of the allocation up to the stack alignment size
+  // by add SA-1 to the size.
+  AllocSize = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+                          AllocSize.getValueType(), AllocSize,
+                          DAG.getIntPtrConstant(StackAlign-1));
+
+  // Mask out the low bits for alignment purposes.
+  AllocSize = DAG.getNode(ISD::AND, getCurDebugLoc(),
+                          AllocSize.getValueType(), AllocSize,
+                          DAG.getIntPtrConstant(~(uint64_t)(StackAlign-1)));
+
+  SDValue Ops[] = { getRoot(), AllocSize, DAG.getIntPtrConstant(Align) };
+  SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
+  SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, getCurDebugLoc(),
+                            VTs, Ops, 3);
+  setValue(&I, DSA);
+  DAG.setRoot(DSA.getValue(1));
+
+  // Inform the Frame Information that we have just allocated a variable-sized
+  // object.
+  FuncInfo.MF->getFrameInfo()->CreateVariableSizedObject(Align ? Align : 1);
+}
+
+void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
+  if (I.isAtomic())
+    return visitAtomicLoad(I);
+
+  const Value *SV = I.getOperand(0);
+  SDValue Ptr = getValue(SV);
+
+  Type *Ty = I.getType();
+
+  bool isVolatile = I.isVolatile();
+  bool isNonTemporal = I.getMetadata("nontemporal") != 0;
+  bool isInvariant = I.getMetadata("invariant.load") != 0;
+  unsigned Alignment = I.getAlignment();
+  const MDNode *TBAAInfo = I.getMetadata(LLVMContext::MD_tbaa);
+  const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
+
+  SmallVector<EVT, 4> ValueVTs;
+  SmallVector<uint64_t, 4> Offsets;
+  ComputeValueVTs(TLI, Ty, ValueVTs, &Offsets);
+  unsigned NumValues = ValueVTs.size();
+  if (NumValues == 0)
+    return;
+
+  SDValue Root;
+  bool ConstantMemory = false;
+  if (I.isVolatile() || NumValues > MaxParallelChains)
+    // Serialize volatile loads with other side effects.
+    Root = getRoot();
+  else if (AA->pointsToConstantMemory(
+             AliasAnalysis::Location(SV, AA->getTypeStoreSize(Ty), TBAAInfo))) {
+    // Do not serialize (non-volatile) loads of constant memory with anything.
+    Root = DAG.getEntryNode();
+    ConstantMemory = true;
+  } else {
+    // Do not serialize non-volatile loads against each other.
+    Root = DAG.getRoot();
+  }
+
+  SmallVector<SDValue, 4> Values(NumValues);
+  SmallVector<SDValue, 4> Chains(std::min(unsigned(MaxParallelChains),
+                                          NumValues));
+  EVT PtrVT = Ptr.getValueType();
+  unsigned ChainI = 0;
+  for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
+    // Serializing loads here may result in excessive register pressure, and
+    // TokenFactor places arbitrary choke points on the scheduler. SD scheduling
+    // could recover a bit by hoisting nodes upward in the chain by recognizing
+    // they are side-effect free or do not alias. The optimizer should really
+    // avoid this case by converting large object/array copies to llvm.memcpy
+    // (MaxParallelChains should always remain as failsafe).
+    if (ChainI == MaxParallelChains) {
+      assert(PendingLoads.empty() && "PendingLoads must be serialized first");
+      SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                                  MVT::Other, &Chains[0], ChainI);
+      Root = Chain;
+      ChainI = 0;
+    }
+    SDValue A = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+                            PtrVT, Ptr,
+                            DAG.getConstant(Offsets[i], PtrVT));
+    SDValue L = DAG.getLoad(ValueVTs[i], getCurDebugLoc(), Root,
+                            A, MachinePointerInfo(SV, Offsets[i]), isVolatile,
+                            isNonTemporal, isInvariant, Alignment, TBAAInfo,
+                            Ranges);
+
+    Values[i] = L;
+    Chains[ChainI] = L.getValue(1);
+  }
+
+  if (!ConstantMemory) {
+    SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                                MVT::Other, &Chains[0], ChainI);
+    if (isVolatile)
+      DAG.setRoot(Chain);
+    else
+      PendingLoads.push_back(Chain);
+  }
+
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                           DAG.getVTList(&ValueVTs[0], NumValues),
+                           &Values[0], NumValues));
+}
+
+void SelectionDAGBuilder::visitStore(const StoreInst &I) {
+  if (I.isAtomic())
+    return visitAtomicStore(I);
+
+  const Value *SrcV = I.getOperand(0);
+  const Value *PtrV = I.getOperand(1);
+
+  SmallVector<EVT, 4> ValueVTs;
+  SmallVector<uint64_t, 4> Offsets;
+  ComputeValueVTs(TLI, SrcV->getType(), ValueVTs, &Offsets);
+  unsigned NumValues = ValueVTs.size();
+  if (NumValues == 0)
+    return;
+
+  // Get the lowered operands. Note that we do this after
+  // checking if NumResults is zero, because with zero results
+  // the operands won't have values in the map.
+  SDValue Src = getValue(SrcV);
+  SDValue Ptr = getValue(PtrV);
+
+  SDValue Root = getRoot();
+  SmallVector<SDValue, 4> Chains(std::min(unsigned(MaxParallelChains),
+                                          NumValues));
+  EVT PtrVT = Ptr.getValueType();
+  bool isVolatile = I.isVolatile();
+  bool isNonTemporal = I.getMetadata("nontemporal") != 0;
+  unsigned Alignment = I.getAlignment();
+  const MDNode *TBAAInfo = I.getMetadata(LLVMContext::MD_tbaa);
+
+  unsigned ChainI = 0;
+  for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
+    // See visitLoad comments.
+    if (ChainI == MaxParallelChains) {
+      SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                                  MVT::Other, &Chains[0], ChainI);
+      Root = Chain;
+      ChainI = 0;
+    }
+    SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT, Ptr,
+                              DAG.getConstant(Offsets[i], PtrVT));
+    SDValue St = DAG.getStore(Root, getCurDebugLoc(),
+                              SDValue(Src.getNode(), Src.getResNo() + i),
+                              Add, MachinePointerInfo(PtrV, Offsets[i]),
+                              isVolatile, isNonTemporal, Alignment, TBAAInfo);
+    Chains[ChainI] = St;
+  }
+
+  SDValue StoreNode = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                                  MVT::Other, &Chains[0], ChainI);
+  ++SDNodeOrder;
+  AssignOrderingToNode(StoreNode.getNode());
+  DAG.setRoot(StoreNode);
+}
+
+static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order,
+                                    SynchronizationScope Scope,
+                                    bool Before, DebugLoc dl,
+                                    SelectionDAG &DAG,
+                                    const TargetLowering &TLI) {
+  // Fence, if necessary
+  if (Before) {
+    if (Order == AcquireRelease || Order == SequentiallyConsistent)
+      Order = Release;
+    else if (Order == Acquire || Order == Monotonic)
+      return Chain;
+  } else {
+    if (Order == AcquireRelease)
+      Order = Acquire;
+    else if (Order == Release || Order == Monotonic)
+      return Chain;
+  }
+  SDValue Ops[3];
+  Ops[0] = Chain;
+  Ops[1] = DAG.getConstant(Order, TLI.getPointerTy());
+  Ops[2] = DAG.getConstant(Scope, TLI.getPointerTy());
+  return DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops, 3);
+}
+
+void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
+  DebugLoc dl = getCurDebugLoc();
+  AtomicOrdering Order = I.getOrdering();
+  SynchronizationScope Scope = I.getSynchScope();
+
+  SDValue InChain = getRoot();
+
+  if (TLI.getInsertFencesForAtomic())
+    InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
+                                   DAG, TLI);
+
+  SDValue L =
+    DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl,
+                  getValue(I.getCompareOperand()).getValueType().getSimpleVT(),
+                  InChain,
+                  getValue(I.getPointerOperand()),
+                  getValue(I.getCompareOperand()),
+                  getValue(I.getNewValOperand()),
+                  MachinePointerInfo(I.getPointerOperand()), 0 /* Alignment */,
+                  TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+                  Scope);
+
+  SDValue OutChain = L.getValue(1);
+
+  if (TLI.getInsertFencesForAtomic())
+    OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
+                                    DAG, TLI);
+
+  setValue(&I, L);
+  DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
+  DebugLoc dl = getCurDebugLoc();
+  ISD::NodeType NT;
+  switch (I.getOperation()) {
+  default: llvm_unreachable("Unknown atomicrmw operation");
+  case AtomicRMWInst::Xchg: NT = ISD::ATOMIC_SWAP; break;
+  case AtomicRMWInst::Add:  NT = ISD::ATOMIC_LOAD_ADD; break;
+  case AtomicRMWInst::Sub:  NT = ISD::ATOMIC_LOAD_SUB; break;
+  case AtomicRMWInst::And:  NT = ISD::ATOMIC_LOAD_AND; break;
+  case AtomicRMWInst::Nand: NT = ISD::ATOMIC_LOAD_NAND; break;
+  case AtomicRMWInst::Or:   NT = ISD::ATOMIC_LOAD_OR; break;
+  case AtomicRMWInst::Xor:  NT = ISD::ATOMIC_LOAD_XOR; break;
+  case AtomicRMWInst::Max:  NT = ISD::ATOMIC_LOAD_MAX; break;
+  case AtomicRMWInst::Min:  NT = ISD::ATOMIC_LOAD_MIN; break;
+  case AtomicRMWInst::UMax: NT = ISD::ATOMIC_LOAD_UMAX; break;
+  case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break;
+  }
+  AtomicOrdering Order = I.getOrdering();
+  SynchronizationScope Scope = I.getSynchScope();
+
+  SDValue InChain = getRoot();
+
+  if (TLI.getInsertFencesForAtomic())
+    InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
+                                   DAG, TLI);
+
+  SDValue L =
+    DAG.getAtomic(NT, dl,
+                  getValue(I.getValOperand()).getValueType().getSimpleVT(),
+                  InChain,
+                  getValue(I.getPointerOperand()),
+                  getValue(I.getValOperand()),
+                  I.getPointerOperand(), 0 /* Alignment */,
+                  TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+                  Scope);
+
+  SDValue OutChain = L.getValue(1);
+
+  if (TLI.getInsertFencesForAtomic())
+    OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
+                                    DAG, TLI);
+
+  setValue(&I, L);
+  DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitFence(const FenceInst &I) {
+  DebugLoc dl = getCurDebugLoc();
+  SDValue Ops[3];
+  Ops[0] = getRoot();
+  Ops[1] = DAG.getConstant(I.getOrdering(), TLI.getPointerTy());
+  Ops[2] = DAG.getConstant(I.getSynchScope(), TLI.getPointerTy());
+  DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops, 3));
+}
+
+void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
+  DebugLoc dl = getCurDebugLoc();
+  AtomicOrdering Order = I.getOrdering();
+  SynchronizationScope Scope = I.getSynchScope();
+
+  SDValue InChain = getRoot();
+
+  EVT VT = TLI.getValueType(I.getType());
+
+  if (I.getAlignment() < VT.getSizeInBits() / 8)
+    report_fatal_error("Cannot generate unaligned atomic load");
+
+  SDValue L =
+    DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain,
+                  getValue(I.getPointerOperand()),
+                  I.getPointerOperand(), I.getAlignment(),
+                  TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+                  Scope);
+
+  SDValue OutChain = L.getValue(1);
+
+  if (TLI.getInsertFencesForAtomic())
+    OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
+                                    DAG, TLI);
+
+  setValue(&I, L);
+  DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
+  DebugLoc dl = getCurDebugLoc();
+
+  AtomicOrdering Order = I.getOrdering();
+  SynchronizationScope Scope = I.getSynchScope();
+
+  SDValue InChain = getRoot();
+
+  EVT VT = TLI.getValueType(I.getValueOperand()->getType());
+
+  if (I.getAlignment() < VT.getSizeInBits() / 8)
+    report_fatal_error("Cannot generate unaligned atomic store");
+
+  if (TLI.getInsertFencesForAtomic())
+    InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
+                                   DAG, TLI);
+
+  SDValue OutChain =
+    DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT,
+                  InChain,
+                  getValue(I.getPointerOperand()),
+                  getValue(I.getValueOperand()),
+                  I.getPointerOperand(), I.getAlignment(),
+                  TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+                  Scope);
+
+  if (TLI.getInsertFencesForAtomic())
+    OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
+                                    DAG, TLI);
+
+  DAG.setRoot(OutChain);
+}
+
+/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
+/// node.
+void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
+                                               unsigned Intrinsic) {
+  bool HasChain = !I.doesNotAccessMemory();
+  bool OnlyLoad = HasChain && I.onlyReadsMemory();
+
+  // Build the operand list.
+  SmallVector<SDValue, 8> Ops;
+  if (HasChain) {  // If this intrinsic has side-effects, chainify it.
+    if (OnlyLoad) {
+      // We don't need to serialize loads against other loads.
+      Ops.push_back(DAG.getRoot());
+    } else {
+      Ops.push_back(getRoot());
+    }
+  }
+
+  // Info is set by getTgtMemInstrinsic
+  TargetLowering::IntrinsicInfo Info;
+  bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, Intrinsic);
+
+  // Add the intrinsic ID as an integer operand if it's not a target intrinsic.
+  if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
+      Info.opc == ISD::INTRINSIC_W_CHAIN)
+    Ops.push_back(DAG.getTargetConstant(Intrinsic, TLI.getPointerTy()));
+
+  // Add all operands of the call to the operand list.
+  for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+    SDValue Op = getValue(I.getArgOperand(i));
+    Ops.push_back(Op);
+  }
+
+  SmallVector<EVT, 4> ValueVTs;
+  ComputeValueVTs(TLI, I.getType(), ValueVTs);
+
+  if (HasChain)
+    ValueVTs.push_back(MVT::Other);
+
+  SDVTList VTs = DAG.getVTList(ValueVTs.data(), ValueVTs.size());
+
+  // Create the node.
+  SDValue Result;
+  if (IsTgtIntrinsic) {
+    // This is target intrinsic that touches memory
+    Result = DAG.getMemIntrinsicNode(Info.opc, getCurDebugLoc(),
+                                     VTs, &Ops[0], Ops.size(),
+                                     Info.memVT,
+                                   MachinePointerInfo(Info.ptrVal, Info.offset),
+                                     Info.align, Info.vol,
+                                     Info.readMem, Info.writeMem);
+  } else if (!HasChain) {
+    Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurDebugLoc(),
+                         VTs, &Ops[0], Ops.size());
+  } else if (!I.getType()->isVoidTy()) {
+    Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurDebugLoc(),
+                         VTs, &Ops[0], Ops.size());
+  } else {
+    Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurDebugLoc(),
+                         VTs, &Ops[0], Ops.size());
+  }
+
+  if (HasChain) {
+    SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
+    if (OnlyLoad)
+      PendingLoads.push_back(Chain);
+    else
+      DAG.setRoot(Chain);
+  }
+
+  if (!I.getType()->isVoidTy()) {
+    if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
+      EVT VT = TLI.getValueType(PTy);
+      Result = DAG.getNode(ISD::BITCAST, getCurDebugLoc(), VT, Result);
+    }
+
+    setValue(&I, Result);
+  } else {
+    // Assign order to result here. If the intrinsic does not produce a result,
+    // it won't be mapped to a SDNode and visit() will not assign it an order
+    // number.
+    ++SDNodeOrder;
+    AssignOrderingToNode(Result.getNode());
+  }
+}
+
+/// GetSignificand - Get the significand and build it into a floating-point
+/// number with exponent of 1:
+///
+///   Op = (Op & 0x007fffff) | 0x3f800000;
+///
+/// where Op is the hexadecimal representation of floating point value.
+static SDValue
+GetSignificand(SelectionDAG &DAG, SDValue Op, DebugLoc dl) {
+  SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+                           DAG.getConstant(0x007fffff, MVT::i32));
+  SDValue t2 = DAG.getNode(ISD::OR, dl, MVT::i32, t1,
+                           DAG.getConstant(0x3f800000, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, MVT::f32, t2);
+}
+
+/// GetExponent - Get the exponent:
+///
+///   (float)(int)(((Op & 0x7f800000) >> 23) - 127);
+///
+/// where Op is the hexadecimal representation of floating point value.
+static SDValue
+GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI,
+            DebugLoc dl) {
+  SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+                           DAG.getConstant(0x7f800000, MVT::i32));
+  SDValue t1 = DAG.getNode(ISD::SRL, dl, MVT::i32, t0,
+                           DAG.getConstant(23, TLI.getPointerTy()));
+  SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1,
+                           DAG.getConstant(127, MVT::i32));
+  return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2);
+}
+
+/// getF32Constant - Get 32-bit floating point constant.
+static SDValue
+getF32Constant(SelectionDAG &DAG, unsigned Flt) {
+  return DAG.getConstantFP(APFloat(APFloat::IEEEsingle, APInt(32, Flt)),
+                           MVT::f32);
+}
+
+/// expandExp - Lower an exp intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandExp(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+                         const TargetLowering &TLI) {
+  if (Op.getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+
+    // Put the exponent in the right bit position for later addition to the
+    // final result:
+    //
+    //   #define LOG2OFe 1.4426950f
+    //   IntegerPartOfX = ((int32_t)(X * LOG2OFe));
+    SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op,
+                             getF32Constant(DAG, 0x3fb8aa3b));
+    SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);
+
+    //   FractionalPartOfX = (X * LOG2OFe) - (float)IntegerPartOfX;
+    SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+    SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);
+
+    //   IntegerPartOfX <<= 23;
+    IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+                                 DAG.getConstant(23, TLI.getPointerTy()));
+
+    SDValue TwoToFracPartOfX;
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.997535578f +
+      //       (0.735607626f + 0.252464424f * x) * x;
+      //
+      // error 0.0144103317, which is 6 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3e814304));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f3c50c8));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      TwoToFracPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                                     getF32Constant(DAG, 0x3f7f5e7e));
+    } else if (LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999892986f +
+      //       (0.696457318f +
+      //         (0.224338339f + 0.792043434e-1f * x) * x) * x;
+      //
+      // 0.000107046256 error, which is 13 to 14 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3da235e3));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3e65b8f3));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f324b07));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      TwoToFracPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                                     getF32Constant(DAG, 0x3f7ff8fd));
+    } else { // LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999999982f +
+      //       (0.693148872f +
+      //         (0.240227044f +
+      //           (0.554906021e-1f +
+      //             (0.961591928e-2f +
+      //               (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+      //
+      // error 2.47208000*10^(-7), which is better than 18 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3924b03e));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3ab24b87));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3c1d8c17));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3d634a1d));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x3e75fe14));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+                                getF32Constant(DAG, 0x3f317234));
+      SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+      TwoToFracPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+                                     getF32Constant(DAG, 0x3f800000));
+    }
+
+    // Add the exponent into the result in integer domain.
+    SDValue t13 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, TwoToFracPartOfX);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                       DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                   t13, IntegerPartOfX));
+  }
+
+  // No special expansion.
+  return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op);
+}
+
+/// expandLog - Lower a log intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandLog(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+                         const TargetLowering &TLI) {
+  if (Op.getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+
+    // Scale the exponent by log(2) [0.69314718f].
+    SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+    SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+                                        getF32Constant(DAG, 0x3f317218));
+
+    // Get the significand and build it into a floating-point number with
+    // exponent of 1.
+    SDValue X = GetSignificand(DAG, Op1, dl);
+
+    SDValue LogOfMantissa;
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   LogofMantissa =
+      //     -1.1609546f +
+      //       (1.4034025f - 0.23903021f * x) * x;
+      //
+      // error 0.0034276066, which is better than 8 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbe74c456));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3fb3a2b1));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                                  getF32Constant(DAG, 0x3f949a29));
+    } else if (LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   LogOfMantissa =
+      //     -1.7417939f +
+      //       (2.8212026f +
+      //         (-1.4699568f +
+      //           (0.44717955f - 0.56570851e-1f * x) * x) * x) * x;
+      //
+      // error 0.000061011436, which is 14 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbd67b6d6));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3ee4f4b8));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3fbc278b));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x40348e95));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+                                  getF32Constant(DAG, 0x3fdef31a));
+    } else { // LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   LogOfMantissa =
+      //     -2.1072184f +
+      //       (4.2372794f +
+      //         (-3.7029485f +
+      //           (2.2781945f +
+      //             (-0.87823314f +
+      //               (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x)*x;
+      //
+      // error 0.0000023660568, which is better than 18 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbc91e5ac));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3e4350aa));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f60d3e3));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x4011cdf0));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x406cfd1c));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x408797cb));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+                                  getF32Constant(DAG, 0x4006dcab));
+    }
+
+    return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, LogOfMantissa);
+  }
+
+  // No special expansion.
+  return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op);
+}
+
+/// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandLog2(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+                          const TargetLowering &TLI) {
+  if (Op.getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+
+    // Get the exponent.
+    SDValue LogOfExponent = GetExponent(DAG, Op1, TLI, dl);
+
+    // Get the significand and build it into a floating-point number with
+    // exponent of 1.
+    SDValue X = GetSignificand(DAG, Op1, dl);
+
+    // Different possible minimax approximations of significand in
+    // floating-point for various degrees of accuracy over [1,2].
+    SDValue Log2ofMantissa;
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x;
+      //
+      // error 0.0049451742, which is more than 7 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbeb08fe0));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x40019463));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                                   getF32Constant(DAG, 0x3fd6633d));
+    } else if (LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   Log2ofMantissa =
+      //     -2.51285454f +
+      //       (4.07009056f +
+      //         (-2.12067489f +
+      //           (.645142248f - 0.816157886e-1f * x) * x) * x) * x;
+      //
+      // error 0.0000876136000, which is better than 13 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbda7262e));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3f25280b));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x4007b923));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x40823e2f));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+                                   getF32Constant(DAG, 0x4020d29c));
+    } else { // LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   Log2ofMantissa =
+      //     -3.0400495f +
+      //       (6.1129976f +
+      //         (-5.3420409f +
+      //           (3.2865683f +
+      //             (-1.2669343f +
+      //               (0.27515199f -
+      //                 0.25691327e-1f * x) * x) * x) * x) * x) * x;
+      //
+      // error 0.0000018516, which is better than 18 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbcd2769e));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3e8ce0b9));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3fa22ae7));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x40525723));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x40aaf200));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x40c39dad));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+                                   getF32Constant(DAG, 0x4042902c));
+    }
+
+    return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log2ofMantissa);
+  }
+
+  // No special expansion.
+  return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op);
+}
+
+/// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandLog10(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+                           const TargetLowering &TLI) {
+  if (Op.getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+
+    // Scale the exponent by log10(2) [0.30102999f].
+    SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+    SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+                                        getF32Constant(DAG, 0x3e9a209a));
+
+    // Get the significand and build it into a floating-point number with
+    // exponent of 1.
+    SDValue X = GetSignificand(DAG, Op1, dl);
+
+    SDValue Log10ofMantissa;
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   Log10ofMantissa =
+      //     -0.50419619f +
+      //       (0.60948995f - 0.10380950f * x) * x;
+      //
+      // error 0.0014886165, which is 6 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0xbdd49a13));
+      SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3f1c0789));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+                                    getF32Constant(DAG, 0x3f011300));
+    } else if (LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   Log10ofMantissa =
+      //     -0.64831180f +
+      //       (0.91751397f +
+      //         (-0.31664806f + 0.47637168e-1f * x) * x) * x;
+      //
+      // error 0.00019228036, which is better than 12 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3d431f31));
+      SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3ea21fb2));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f6ae232));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+                                    getF32Constant(DAG, 0x3f25f7c3));
+    } else { // LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   Log10ofMantissa =
+      //     -0.84299375f +
+      //       (1.5327582f +
+      //         (-1.0688956f +
+      //           (0.49102474f +
+      //             (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x;
+      //
+      // error 0.0000037995730, which is better than 18 bits
+      SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3c5d51ce));
+      SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+                               getF32Constant(DAG, 0x3e00685a));
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3efb6798));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f88d192));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3fc4316c));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t8,
+                                    getF32Constant(DAG, 0x3f57ce70));
+    }
+
+    return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log10ofMantissa);
+  }
+
+  // No special expansion.
+  return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op);
+}
+
+/// expandExp2 - Lower an exp2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandExp2(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+                          const TargetLowering &TLI) {
+  if (Op.getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Op);
+
+    //   FractionalPartOfX = x - (float)IntegerPartOfX;
+    SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+    SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, Op, t1);
+
+    //   IntegerPartOfX <<= 23;
+    IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+                                 DAG.getConstant(23, TLI.getPointerTy()));
+
+    SDValue TwoToFractionalPartOfX;
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.997535578f +
+      //       (0.735607626f + 0.252464424f * x) * x;
+      //
+      // error 0.0144103317, which is 6 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3e814304));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f3c50c8));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                                           getF32Constant(DAG, 0x3f7f5e7e));
+    } else if (LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999892986f +
+      //       (0.696457318f +
+      //         (0.224338339f + 0.792043434e-1f * x) * x) * x;
+      //
+      // error 0.000107046256, which is 13 to 14 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3da235e3));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3e65b8f3));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f324b07));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                                           getF32Constant(DAG, 0x3f7ff8fd));
+    } else { // LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999999982f +
+      //       (0.693148872f +
+      //         (0.240227044f +
+      //           (0.554906021e-1f +
+      //             (0.961591928e-2f +
+      //               (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+      // error 2.47208000*10^(-7), which is better than 18 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3924b03e));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3ab24b87));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3c1d8c17));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3d634a1d));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x3e75fe14));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+                                getF32Constant(DAG, 0x3f317234));
+      SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+      TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+                                           getF32Constant(DAG, 0x3f800000));
+    }
+
+    // Add the exponent into the result in integer domain.
+    SDValue t13 = DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+                              TwoToFractionalPartOfX);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                       DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                   t13, IntegerPartOfX));
+  }
+
+  // No special expansion.
+  return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op);
+}
+
+/// visitPow - Lower a pow intrinsic. Handles the special sequences for
+/// limited-precision mode with x == 10.0f.
+static SDValue expandPow(DebugLoc dl, SDValue LHS, SDValue RHS,
+                         SelectionDAG &DAG, const TargetLowering &TLI) {
+  bool IsExp10 = false;
+  if (LHS.getValueType() == MVT::f32 && LHS.getValueType() == MVT::f32 &&
+      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+    if (ConstantFPSDNode *LHSC = dyn_cast<ConstantFPSDNode>(LHS)) {
+      APFloat Ten(10.0f);
+      IsExp10 = LHSC->isExactlyValue(Ten);
+    }
+  }
+
+  if (IsExp10) {
+    // Put the exponent in the right bit position for later addition to the
+    // final result:
+    //
+    //   #define LOG2OF10 3.3219281f
+    //   IntegerPartOfX = (int32_t)(x * LOG2OF10);
+    SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, RHS,
+                             getF32Constant(DAG, 0x40549a78));
+    SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);
+
+    //   FractionalPartOfX = x - (float)IntegerPartOfX;
+    SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+    SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);
+
+    //   IntegerPartOfX <<= 23;
+    IntegerPartOfX = DAG.getNode(ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+                                 DAG.getConstant(23, TLI.getPointerTy()));
+
+    SDValue TwoToFractionalPartOfX;
+    if (LimitFloatPrecision <= 6) {
+      // For floating-point precision of 6:
+      //
+      //   twoToFractionalPartOfX =
+      //     0.997535578f +
+      //       (0.735607626f + 0.252464424f * x) * x;
+      //
+      // error 0.0144103317, which is 6 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3e814304));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3f3c50c8));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                                           getF32Constant(DAG, 0x3f7f5e7e));
+    } else if (LimitFloatPrecision <= 12) {
+      // For floating-point precision of 12:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999892986f +
+      //       (0.696457318f +
+      //         (0.224338339f + 0.792043434e-1f * x) * x) * x;
+      //
+      // error 0.000107046256, which is 13 to 14 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3da235e3));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3e65b8f3));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3f324b07));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                                           getF32Constant(DAG, 0x3f7ff8fd));
+    } else { // LimitFloatPrecision <= 18
+      // For floating-point precision of 18:
+      //
+      //   TwoToFractionalPartOfX =
+      //     0.999999982f +
+      //       (0.693148872f +
+      //         (0.240227044f +
+      //           (0.554906021e-1f +
+      //             (0.961591928e-2f +
+      //               (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+      // error 2.47208000*10^(-7), which is better than 18 bits
+      SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+                               getF32Constant(DAG, 0x3924b03e));
+      SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+                               getF32Constant(DAG, 0x3ab24b87));
+      SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+      SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+                               getF32Constant(DAG, 0x3c1d8c17));
+      SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+      SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+                               getF32Constant(DAG, 0x3d634a1d));
+      SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+      SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+                               getF32Constant(DAG, 0x3e75fe14));
+      SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+      SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+                                getF32Constant(DAG, 0x3f317234));
+      SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+      TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+                                           getF32Constant(DAG, 0x3f800000));
+    }
+
+    SDValue t13 = DAG.getNode(ISD::BITCAST, dl,MVT::i32,TwoToFractionalPartOfX);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                       DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                   t13, IntegerPartOfX));
+  }
+
+  // No special expansion.
+  return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS);
+}
+
+
+/// ExpandPowI - Expand a llvm.powi intrinsic.
+static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS,
+                          SelectionDAG &DAG) {
+  // If RHS is a constant, we can expand this out to a multiplication tree,
+  // otherwise we end up lowering to a call to __powidf2 (for example).  When
+  // optimizing for size, we only want to do this if the expansion would produce
+  // a small number of multiplies, otherwise we do the full expansion.
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+    // Get the exponent as a positive value.
+    unsigned Val = RHSC->getSExtValue();
+    if ((int)Val < 0) Val = -Val;
+
+    // powi(x, 0) -> 1.0
+    if (Val == 0)
+      return DAG.getConstantFP(1.0, LHS.getValueType());
+
+    const Function *F = DAG.getMachineFunction().getFunction();
+    if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                         Attribute::OptimizeForSize) ||
+        // If optimizing for size, don't insert too many multiplies.  This
+        // inserts up to 5 multiplies.
+        CountPopulation_32(Val)+Log2_32(Val) < 7) {
+      // We use the simple binary decomposition method to generate the multiply
+      // sequence.  There are more optimal ways to do this (for example,
+      // powi(x,15) generates one more multiply than it should), but this has
+      // the benefit of being both really simple and much better than a libcall.
+      SDValue Res;  // Logically starts equal to 1.0
+      SDValue CurSquare = LHS;
+      while (Val) {
+        if (Val & 1) {
+          if (Res.getNode())
+            Res = DAG.getNode(ISD::FMUL, DL,Res.getValueType(), Res, CurSquare);
+          else
+            Res = CurSquare;  // 1.0*CurSquare.
+        }
+
+        CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
+                                CurSquare, CurSquare);
+        Val >>= 1;
+      }
+
+      // If the original was negative, invert the result, producing 1/(x*x*x).
+      if (RHSC->getSExtValue() < 0)
+        Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
+                          DAG.getConstantFP(1.0, LHS.getValueType()), Res);
+      return Res;
+    }
+  }
+
+  // Otherwise, expand to a libcall.
+  return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
+}
+
+// getTruncatedArgReg - Find underlying register used for an truncated
+// argument.
+static unsigned getTruncatedArgReg(const SDValue &N) {
+  if (N.getOpcode() != ISD::TRUNCATE)
+    return 0;
+
+  const SDValue &Ext = N.getOperand(0);
+  if (Ext.getOpcode() == ISD::AssertZext || Ext.getOpcode() == ISD::AssertSext){
+    const SDValue &CFR = Ext.getOperand(0);
+    if (CFR.getOpcode() == ISD::CopyFromReg)
+      return cast<RegisterSDNode>(CFR.getOperand(1))->getReg();
+    if (CFR.getOpcode() == ISD::TRUNCATE)
+      return getTruncatedArgReg(CFR);
+  }
+  return 0;
+}
+
+/// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function
+/// argument, create the corresponding DBG_VALUE machine instruction for it now.
+/// At the end of instruction selection, they will be inserted to the entry BB.
+bool
+SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
+                                              int64_t Offset,
+                                              const SDValue &N) {
+  const Argument *Arg = dyn_cast<Argument>(V);
+  if (!Arg)
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetInstrInfo *TII = DAG.getTarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
+
+  // Ignore inlined function arguments here.
+  DIVariable DV(Variable);
+  if (DV.isInlinedFnArgument(MF.getFunction()))
+    return false;
+
+  unsigned Reg = 0;
+  // Some arguments' frame index is recorded during argument lowering.
+  Offset = FuncInfo.getArgumentFrameIndex(Arg);
+  if (Offset)
+    Reg = TRI->getFrameRegister(MF);
+
+  if (!Reg && N.getNode()) {
+    if (N.getOpcode() == ISD::CopyFromReg)
+      Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg();
+    else
+      Reg = getTruncatedArgReg(N);
+    if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) {
+      MachineRegisterInfo &RegInfo = MF.getRegInfo();
+      unsigned PR = RegInfo.getLiveInPhysReg(Reg);
+      if (PR)
+        Reg = PR;
+    }
+  }
+
+  if (!Reg) {
+    // Check if ValueMap has reg number.
+    DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
+    if (VMI != FuncInfo.ValueMap.end())
+      Reg = VMI->second;
+  }
+
+  if (!Reg && N.getNode()) {
+    // Check if frame index is available.
+    if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(N.getNode()))
+      if (FrameIndexSDNode *FINode =
+          dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode())) {
+        Reg = TRI->getFrameRegister(MF);
+        Offset = FINode->getIndex();
+      }
+  }
+
+  if (!Reg)
+    return false;
+
+  MachineInstrBuilder MIB = BuildMI(MF, getCurDebugLoc(),
+                                    TII->get(TargetOpcode::DBG_VALUE))
+    .addReg(Reg, RegState::Debug).addImm(Offset).addMetadata(Variable);
+  FuncInfo.ArgDbgValues.push_back(&*MIB);
+  return true;
+}
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                         !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+/// visitIntrinsicCall - Lower the call to the specified intrinsic function.  If
+/// we want to emit this as a call to a named external function, return the name
+/// otherwise lower it and return null.
+const char *
+SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
+  DebugLoc dl = getCurDebugLoc();
+  SDValue Res;
+
+  switch (Intrinsic) {
+  default:
+    // By default, turn this into a target intrinsic node.
+    visitTargetIntrinsic(I, Intrinsic);
+    return 0;
+  case Intrinsic::vastart:  visitVAStart(I); return 0;
+  case Intrinsic::vaend:    visitVAEnd(I); return 0;
+  case Intrinsic::vacopy:   visitVACopy(I); return 0;
+  case Intrinsic::returnaddress:
+    setValue(&I, DAG.getNode(ISD::RETURNADDR, dl, TLI.getPointerTy(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::frameaddress:
+    setValue(&I, DAG.getNode(ISD::FRAMEADDR, dl, TLI.getPointerTy(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::setjmp:
+    return &"_setjmp"[!TLI.usesUnderscoreSetJmp()];
+  case Intrinsic::longjmp:
+    return &"_longjmp"[!TLI.usesUnderscoreLongJmp()];
+  case Intrinsic::memcpy: {
+    // Assert for address < 256 since we support only user defined address
+    // spaces.
+    assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
+           < 256 &&
+           cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace()
+           < 256 &&
+           "Unknown address space");
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    SDValue Op3 = getValue(I.getArgOperand(2));
+    unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+    if (!Align)
+      Align = 1; // @llvm.memcpy defines 0 and 1 to both mean no alignment.
+    bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+    DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, isVol, false,
+                              MachinePointerInfo(I.getArgOperand(0)),
+                              MachinePointerInfo(I.getArgOperand(1))));
+    return 0;
+  }
+  case Intrinsic::memset: {
+    // Assert for address < 256 since we support only user defined address
+    // spaces.
+    assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
+           < 256 &&
+           "Unknown address space");
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    SDValue Op3 = getValue(I.getArgOperand(2));
+    unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+    if (!Align)
+      Align = 1; // @llvm.memset defines 0 and 1 to both mean no alignment.
+    bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+    DAG.setRoot(DAG.getMemset(getRoot(), dl, Op1, Op2, Op3, Align, isVol,
+                              MachinePointerInfo(I.getArgOperand(0))));
+    return 0;
+  }
+  case Intrinsic::memmove: {
+    // Assert for address < 256 since we support only user defined address
+    // spaces.
+    assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
+           < 256 &&
+           cast<PointerType>(I.getArgOperand(1)->getType())->getAddressSpace()
+           < 256 &&
+           "Unknown address space");
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    SDValue Op3 = getValue(I.getArgOperand(2));
+    unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+    if (!Align)
+      Align = 1; // @llvm.memmove defines 0 and 1 to both mean no alignment.
+    bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+    DAG.setRoot(DAG.getMemmove(getRoot(), dl, Op1, Op2, Op3, Align, isVol,
+                               MachinePointerInfo(I.getArgOperand(0)),
+                               MachinePointerInfo(I.getArgOperand(1))));
+    return 0;
+  }
+  case Intrinsic::dbg_declare: {
+    const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
+    MDNode *Variable = DI.getVariable();
+    const Value *Address = DI.getAddress();
+    if (!Address || !DIVariable(Variable).Verify()) {
+      DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+      return 0;
+    }
+
+    // Build an entry in DbgOrdering.  Debug info input nodes get an SDNodeOrder
+    // but do not always have a corresponding SDNode built.  The SDNodeOrder
+    // absolute, but not relative, values are different depending on whether
+    // debug info exists.
+    ++SDNodeOrder;
+
+    // Check if address has undef value.
+    if (isa<UndefValue>(Address) ||
+        (Address->use_empty() && !isa<Argument>(Address))) {
+      DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+      return 0;
+    }
+
+    SDValue &N = NodeMap[Address];
+    if (!N.getNode() && isa<Argument>(Address))
+      // Check unused arguments map.
+      N = UnusedArgNodeMap[Address];
+    SDDbgValue *SDV;
+    if (N.getNode()) {
+      if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
+        Address = BCI->getOperand(0);
+      // Parameters are handled specially.
+      bool isParameter =
+        (DIVariable(Variable).getTag() == dwarf::DW_TAG_arg_variable ||
+         isa<Argument>(Address));
+
+      const AllocaInst *AI = dyn_cast<AllocaInst>(Address);
+
+      if (isParameter && !AI) {
+        FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
+        if (FINode)
+          // Byval parameter.  We have a frame index at this point.
+          SDV = DAG.getDbgValue(Variable, FINode->getIndex(),
+                                0, dl, SDNodeOrder);
+        else {
+          // Address is an argument, so try to emit its dbg value using
+          // virtual register info from the FuncInfo.ValueMap.
+          EmitFuncArgumentDbgValue(Address, Variable, 0, N);
+          return 0;
+        }
+      } else if (AI)
+        SDV = DAG.getDbgValue(Variable, N.getNode(), N.getResNo(),
+                              0, dl, SDNodeOrder);
+      else {
+        // Can't do anything with other non-AI cases yet.
+        DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+        DEBUG(dbgs() << "non-AllocaInst issue for Address: \n\t");
+        DEBUG(Address->dump());
+        return 0;
+      }
+      DAG.AddDbgValue(SDV, N.getNode(), isParameter);
+    } else {
+      // If Address is an argument then try to emit its dbg value using
+      // virtual register info from the FuncInfo.ValueMap.
+      if (!EmitFuncArgumentDbgValue(Address, Variable, 0, N)) {
+        // If variable is pinned by a alloca in dominating bb then
+        // use StaticAllocaMap.
+        if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
+          if (AI->getParent() != DI.getParent()) {
+            DenseMap<const AllocaInst*, int>::iterator SI =
+              FuncInfo.StaticAllocaMap.find(AI);
+            if (SI != FuncInfo.StaticAllocaMap.end()) {
+              SDV = DAG.getDbgValue(Variable, SI->second,
+                                    0, dl, SDNodeOrder);
+              DAG.AddDbgValue(SDV, 0, false);
+              return 0;
+            }
+          }
+        }
+        DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+      }
+    }
+    return 0;
+  }
+  case Intrinsic::dbg_value: {
+    const DbgValueInst &DI = cast<DbgValueInst>(I);
+    if (!DIVariable(DI.getVariable()).Verify())
+      return 0;
+
+    MDNode *Variable = DI.getVariable();
+    uint64_t Offset = DI.getOffset();
+    const Value *V = DI.getValue();
+    if (!V)
+      return 0;
+
+    // Build an entry in DbgOrdering.  Debug info input nodes get an SDNodeOrder
+    // but do not always have a corresponding SDNode built.  The SDNodeOrder
+    // absolute, but not relative, values are different depending on whether
+    // debug info exists.
+    ++SDNodeOrder;
+    SDDbgValue *SDV;
+    if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V)) {
+      SDV = DAG.getDbgValue(Variable, V, Offset, dl, SDNodeOrder);
+      DAG.AddDbgValue(SDV, 0, false);
+    } else {
+      // Do not use getValue() in here; we don't want to generate code at
+      // this point if it hasn't been done yet.
+      SDValue N = NodeMap[V];
+      if (!N.getNode() && isa<Argument>(V))
+        // Check unused arguments map.
+        N = UnusedArgNodeMap[V];
+      if (N.getNode()) {
+        if (!EmitFuncArgumentDbgValue(V, Variable, Offset, N)) {
+          SDV = DAG.getDbgValue(Variable, N.getNode(),
+                                N.getResNo(), Offset, dl, SDNodeOrder);
+          DAG.AddDbgValue(SDV, N.getNode(), false);
+        }
+      } else if (!V->use_empty() ) {
+        // Do not call getValue(V) yet, as we don't want to generate code.
+        // Remember it for later.
+        DanglingDebugInfo DDI(&DI, dl, SDNodeOrder);
+        DanglingDebugInfoMap[V] = DDI;
+      } else {
+        // We may expand this to cover more cases.  One case where we have no
+        // data available is an unreferenced parameter.
+        DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+      }
+    }
+
+    // Build a debug info table entry.
+    if (const BitCastInst *BCI = dyn_cast<BitCastInst>(V))
+      V = BCI->getOperand(0);
+    const AllocaInst *AI = dyn_cast<AllocaInst>(V);
+    // Don't handle byval struct arguments or VLAs, for example.
+    if (!AI) {
+      DEBUG(dbgs() << "Dropping debug location info for:\n  " << DI << "\n");
+      DEBUG(dbgs() << "  Last seen at:\n    " << *V << "\n");
+      return 0;
+    }
+    DenseMap<const AllocaInst*, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(AI);
+    if (SI == FuncInfo.StaticAllocaMap.end())
+      return 0; // VLAs.
+    int FI = SI->second;
+
+    MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+    if (!DI.getDebugLoc().isUnknown() && MMI.hasDebugInfo())
+      MMI.setVariableDbgInfo(Variable, FI, DI.getDebugLoc());
+    return 0;
+  }
+
+  case Intrinsic::eh_typeid_for: {
+    // Find the type id for the given typeinfo.
+    GlobalVariable *GV = ExtractTypeInfo(I.getArgOperand(0));
+    unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(GV);
+    Res = DAG.getConstant(TypeID, MVT::i32);
+    setValue(&I, Res);
+    return 0;
+  }
+
+  case Intrinsic::eh_return_i32:
+  case Intrinsic::eh_return_i64:
+    DAG.getMachineFunction().getMMI().setCallsEHReturn(true);
+    DAG.setRoot(DAG.getNode(ISD::EH_RETURN, dl,
+                            MVT::Other,
+                            getControlRoot(),
+                            getValue(I.getArgOperand(0)),
+                            getValue(I.getArgOperand(1))));
+    return 0;
+  case Intrinsic::eh_unwind_init:
+    DAG.getMachineFunction().getMMI().setCallsUnwindInit(true);
+    return 0;
+  case Intrinsic::eh_dwarf_cfa: {
+    SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), dl,
+                                        TLI.getPointerTy());
+    SDValue Offset = DAG.getNode(ISD::ADD, dl,
+                                 TLI.getPointerTy(),
+                                 DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, dl,
+                                             TLI.getPointerTy()),
+                                 CfaArg);
+    SDValue FA = DAG.getNode(ISD::FRAMEADDR, dl,
+                             TLI.getPointerTy(),
+                             DAG.getConstant(0, TLI.getPointerTy()));
+    setValue(&I, DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+                             FA, Offset));
+    return 0;
+  }
+  case Intrinsic::eh_sjlj_callsite: {
+    MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+    ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(0));
+    assert(CI && "Non-constant call site value in eh.sjlj.callsite!");
+    assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!");
+
+    MMI.setCurrentCallSite(CI->getZExtValue());
+    return 0;
+  }
+  case Intrinsic::eh_sjlj_functioncontext: {
+    // Get and store the index of the function context.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    AllocaInst *FnCtx =
+      cast<AllocaInst>(I.getArgOperand(0)->stripPointerCasts());
+    int FI = FuncInfo.StaticAllocaMap[FnCtx];
+    MFI->setFunctionContextIndex(FI);
+    return 0;
+  }
+  case Intrinsic::eh_sjlj_setjmp: {
+    SDValue Ops[2];
+    Ops[0] = getRoot();
+    Ops[1] = getValue(I.getArgOperand(0));
+    SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, dl,
+                             DAG.getVTList(MVT::i32, MVT::Other),
+                             Ops, 2);
+    setValue(&I, Op.getValue(0));
+    DAG.setRoot(Op.getValue(1));
+    return 0;
+  }
+  case Intrinsic::eh_sjlj_longjmp: {
+    DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, dl, MVT::Other,
+                            getRoot(), getValue(I.getArgOperand(0))));
+    return 0;
+  }
+
+  case Intrinsic::x86_mmx_pslli_w:
+  case Intrinsic::x86_mmx_pslli_d:
+  case Intrinsic::x86_mmx_pslli_q:
+  case Intrinsic::x86_mmx_psrli_w:
+  case Intrinsic::x86_mmx_psrli_d:
+  case Intrinsic::x86_mmx_psrli_q:
+  case Intrinsic::x86_mmx_psrai_w:
+  case Intrinsic::x86_mmx_psrai_d: {
+    SDValue ShAmt = getValue(I.getArgOperand(1));
+    if (isa<ConstantSDNode>(ShAmt)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return 0;
+    }
+    unsigned NewIntrinsic = 0;
+    EVT ShAmtVT = MVT::v2i32;
+    switch (Intrinsic) {
+    case Intrinsic::x86_mmx_pslli_w:
+      NewIntrinsic = Intrinsic::x86_mmx_psll_w;
+      break;
+    case Intrinsic::x86_mmx_pslli_d:
+      NewIntrinsic = Intrinsic::x86_mmx_psll_d;
+      break;
+    case Intrinsic::x86_mmx_pslli_q:
+      NewIntrinsic = Intrinsic::x86_mmx_psll_q;
+      break;
+    case Intrinsic::x86_mmx_psrli_w:
+      NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
+      break;
+    case Intrinsic::x86_mmx_psrli_d:
+      NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
+      break;
+    case Intrinsic::x86_mmx_psrli_q:
+      NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
+      break;
+    case Intrinsic::x86_mmx_psrai_w:
+      NewIntrinsic = Intrinsic::x86_mmx_psra_w;
+      break;
+    case Intrinsic::x86_mmx_psrai_d:
+      NewIntrinsic = Intrinsic::x86_mmx_psra_d;
+      break;
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    }
+
+    // The vector shift intrinsics with scalars uses 32b shift amounts but
+    // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
+    // to be zero.
+    // We must do this early because v2i32 is not a legal type.
+    SDValue ShOps[2];
+    ShOps[0] = ShAmt;
+    ShOps[1] = DAG.getConstant(0, MVT::i32);
+    ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
+    EVT DestVT = TLI.getValueType(I.getType());
+    ShAmt = DAG.getNode(ISD::BITCAST, dl, DestVT, ShAmt);
+    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                       DAG.getConstant(NewIntrinsic, MVT::i32),
+                       getValue(I.getArgOperand(0)), ShAmt);
+    setValue(&I, Res);
+    return 0;
+  }
+  case Intrinsic::x86_avx_vinsertf128_pd_256:
+  case Intrinsic::x86_avx_vinsertf128_ps_256:
+  case Intrinsic::x86_avx_vinsertf128_si_256:
+  case Intrinsic::x86_avx2_vinserti128: {
+    EVT DestVT = TLI.getValueType(I.getType());
+    EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType());
+    uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue() & 1) *
+                   ElVT.getVectorNumElements();
+    Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, DestVT,
+                      getValue(I.getArgOperand(0)),
+                      getValue(I.getArgOperand(1)),
+                      DAG.getIntPtrConstant(Idx));
+    setValue(&I, Res);
+    return 0;
+  }
+  case Intrinsic::x86_avx_vextractf128_pd_256:
+  case Intrinsic::x86_avx_vextractf128_ps_256:
+  case Intrinsic::x86_avx_vextractf128_si_256:
+  case Intrinsic::x86_avx2_vextracti128: {
+    EVT DestVT = TLI.getValueType(I.getType());
+    uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(1))->getZExtValue() & 1) *
+                   DestVT.getVectorNumElements();
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
+                      getValue(I.getArgOperand(0)),
+                      DAG.getIntPtrConstant(Idx));
+    setValue(&I, Res);
+    return 0;
+  }
+  case Intrinsic::convertff:
+  case Intrinsic::convertfsi:
+  case Intrinsic::convertfui:
+  case Intrinsic::convertsif:
+  case Intrinsic::convertuif:
+  case Intrinsic::convertss:
+  case Intrinsic::convertsu:
+  case Intrinsic::convertus:
+  case Intrinsic::convertuu: {
+    ISD::CvtCode Code = ISD::CVT_INVALID;
+    switch (Intrinsic) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::convertff:  Code = ISD::CVT_FF; break;
+    case Intrinsic::convertfsi: Code = ISD::CVT_FS; break;
+    case Intrinsic::convertfui: Code = ISD::CVT_FU; break;
+    case Intrinsic::convertsif: Code = ISD::CVT_SF; break;
+    case Intrinsic::convertuif: Code = ISD::CVT_UF; break;
+    case Intrinsic::convertss:  Code = ISD::CVT_SS; break;
+    case Intrinsic::convertsu:  Code = ISD::CVT_SU; break;
+    case Intrinsic::convertus:  Code = ISD::CVT_US; break;
+    case Intrinsic::convertuu:  Code = ISD::CVT_UU; break;
+    }
+    EVT DestVT = TLI.getValueType(I.getType());
+    const Value *Op1 = I.getArgOperand(0);
+    Res = DAG.getConvertRndSat(DestVT, dl, getValue(Op1),
+                               DAG.getValueType(DestVT),
+                               DAG.getValueType(getValue(Op1).getValueType()),
+                               getValue(I.getArgOperand(1)),
+                               getValue(I.getArgOperand(2)),
+                               Code);
+    setValue(&I, Res);
+    return 0;
+  }
+  case Intrinsic::powi:
+    setValue(&I, ExpandPowI(dl, getValue(I.getArgOperand(0)),
+                            getValue(I.getArgOperand(1)), DAG));
+    return 0;
+  case Intrinsic::log:
+    setValue(&I, expandLog(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    return 0;
+  case Intrinsic::log2:
+    setValue(&I, expandLog2(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    return 0;
+  case Intrinsic::log10:
+    setValue(&I, expandLog10(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    return 0;
+  case Intrinsic::exp:
+    setValue(&I, expandExp(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    return 0;
+  case Intrinsic::exp2:
+    setValue(&I, expandExp2(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    return 0;
+  case Intrinsic::pow:
+    setValue(&I, expandPow(dl, getValue(I.getArgOperand(0)),
+                           getValue(I.getArgOperand(1)), DAG, TLI));
+    return 0;
+  case Intrinsic::sqrt:
+  case Intrinsic::fabs:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint: {
+    unsigned Opcode;
+    switch (Intrinsic) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
+    case Intrinsic::fabs:      Opcode = ISD::FABS;       break;
+    case Intrinsic::sin:       Opcode = ISD::FSIN;       break;
+    case Intrinsic::cos:       Opcode = ISD::FCOS;       break;
+    case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
+    case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
+    case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
+    case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
+    case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+    }
+
+    setValue(&I, DAG.getNode(Opcode, dl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  }
+  case Intrinsic::fma:
+    setValue(&I, DAG.getNode(ISD::FMA, dl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1)),
+                             getValue(I.getArgOperand(2))));
+    return 0;
+  case Intrinsic::fmuladd: {
+    EVT VT = TLI.getValueType(I.getType());
+    if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
+        TLI.isFMAFasterThanMulAndAdd(VT)){
+      setValue(&I, DAG.getNode(ISD::FMA, dl,
+                               getValue(I.getArgOperand(0)).getValueType(),
+                               getValue(I.getArgOperand(0)),
+                               getValue(I.getArgOperand(1)),
+                               getValue(I.getArgOperand(2))));
+    } else {
+      SDValue Mul = DAG.getNode(ISD::FMUL, dl,
+                                getValue(I.getArgOperand(0)).getValueType(),
+                                getValue(I.getArgOperand(0)),
+                                getValue(I.getArgOperand(1)));
+      SDValue Add = DAG.getNode(ISD::FADD, dl,
+                                getValue(I.getArgOperand(0)).getValueType(),
+                                Mul,
+                                getValue(I.getArgOperand(2)));
+      setValue(&I, Add);
+    }
+    return 0;
+  }
+  case Intrinsic::convert_to_fp16:
+    setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, dl,
+                             MVT::i16, getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::convert_from_fp16:
+    setValue(&I, DAG.getNode(ISD::FP16_TO_FP32, dl,
+                             MVT::f32, getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::pcmarker: {
+    SDValue Tmp = getValue(I.getArgOperand(0));
+    DAG.setRoot(DAG.getNode(ISD::PCMARKER, dl, MVT::Other, getRoot(), Tmp));
+    return 0;
+  }
+  case Intrinsic::readcyclecounter: {
+    SDValue Op = getRoot();
+    Res = DAG.getNode(ISD::READCYCLECOUNTER, dl,
+                      DAG.getVTList(MVT::i64, MVT::Other),
+                      &Op, 1);
+    setValue(&I, Res);
+    DAG.setRoot(Res.getValue(1));
+    return 0;
+  }
+  case Intrinsic::bswap:
+    setValue(&I, DAG.getNode(ISD::BSWAP, dl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  case Intrinsic::cttz: {
+    SDValue Arg = getValue(I.getArgOperand(0));
+    ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
+    EVT Ty = Arg.getValueType();
+    setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF,
+                             dl, Ty, Arg));
+    return 0;
+  }
+  case Intrinsic::ctlz: {
+    SDValue Arg = getValue(I.getArgOperand(0));
+    ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
+    EVT Ty = Arg.getValueType();
+    setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF,
+                             dl, Ty, Arg));
+    return 0;
+  }
+  case Intrinsic::ctpop: {
+    SDValue Arg = getValue(I.getArgOperand(0));
+    EVT Ty = Arg.getValueType();
+    setValue(&I, DAG.getNode(ISD::CTPOP, dl, Ty, Arg));
+    return 0;
+  }
+  case Intrinsic::stacksave: {
+    SDValue Op = getRoot();
+    Res = DAG.getNode(ISD::STACKSAVE, dl,
+                      DAG.getVTList(TLI.getPointerTy(), MVT::Other), &Op, 1);
+    setValue(&I, Res);
+    DAG.setRoot(Res.getValue(1));
+    return 0;
+  }
+  case Intrinsic::stackrestore: {
+    Res = getValue(I.getArgOperand(0));
+    DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, dl, MVT::Other, getRoot(), Res));
+    return 0;
+  }
+  case Intrinsic::stackprotector: {
+    // Emit code into the DAG to store the stack guard onto the stack.
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    EVT PtrTy = TLI.getPointerTy();
+
+    SDValue Src = getValue(I.getArgOperand(0));   // The guard's value.
+    AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
+
+    int FI = FuncInfo.StaticAllocaMap[Slot];
+    MFI->setStackProtectorIndex(FI);
+
+    SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+
+    // Store the stack protector onto the stack.
+    Res = DAG.getStore(getRoot(), dl, Src, FIN,
+                       MachinePointerInfo::getFixedStack(FI),
+                       true, false, 0);
+    setValue(&I, Res);
+    DAG.setRoot(Res);
+    return 0;
+  }
+  case Intrinsic::objectsize: {
+    // If we don't know by now, we're never going to know.
+    ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1));
+
+    assert(CI && "Non-constant type in __builtin_object_size?");
+
+    SDValue Arg = getValue(I.getCalledValue());
+    EVT Ty = Arg.getValueType();
+
+    if (CI->isZero())
+      Res = DAG.getConstant(-1ULL, Ty);
+    else
+      Res = DAG.getConstant(0, Ty);
+
+    setValue(&I, Res);
+    return 0;
+  }
+  case Intrinsic::var_annotation:
+    // Discard annotate attributes
+    return 0;
+
+  case Intrinsic::init_trampoline: {
+    const Function *F = cast<Function>(I.getArgOperand(1)->stripPointerCasts());
+
+    SDValue Ops[6];
+    Ops[0] = getRoot();
+    Ops[1] = getValue(I.getArgOperand(0));
+    Ops[2] = getValue(I.getArgOperand(1));
+    Ops[3] = getValue(I.getArgOperand(2));
+    Ops[4] = DAG.getSrcValue(I.getArgOperand(0));
+    Ops[5] = DAG.getSrcValue(F);
+
+    Res = DAG.getNode(ISD::INIT_TRAMPOLINE, dl, MVT::Other, Ops, 6);
+
+    DAG.setRoot(Res);
+    return 0;
+  }
+  case Intrinsic::adjust_trampoline: {
+    setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, dl,
+                             TLI.getPointerTy(),
+                             getValue(I.getArgOperand(0))));
+    return 0;
+  }
+  case Intrinsic::gcroot:
+    if (GFI) {
+      const Value *Alloca = I.getArgOperand(0)->stripPointerCasts();
+      const Constant *TypeMap = cast<Constant>(I.getArgOperand(1));
+
+      FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode());
+      GFI->addStackRoot(FI->getIndex(), TypeMap);
+    }
+    return 0;
+  case Intrinsic::gcread:
+  case Intrinsic::gcwrite:
+    llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!");
+  case Intrinsic::flt_rounds:
+    setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, dl, MVT::i32));
+    return 0;
+
+  case Intrinsic::expect: {
+    // Just replace __builtin_expect(exp, c) with EXP.
+    setValue(&I, getValue(I.getArgOperand(0)));
+    return 0;
+  }
+
+  case Intrinsic::debugtrap:
+  case Intrinsic::trap: {
+    StringRef TrapFuncName = TM.Options.getTrapFunctionName();
+    if (TrapFuncName.empty()) {
+      ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? 
+        ISD::TRAP : ISD::DEBUGTRAP;
+      DAG.setRoot(DAG.getNode(Op, dl,MVT::Other, getRoot()));
+      return 0;
+    }
+    TargetLowering::ArgListTy Args;
+    TargetLowering::
+    CallLoweringInfo CLI(getRoot(), I.getType(),
+                 false, false, false, false, 0, CallingConv::C,
+                 /*isTailCall=*/false,
+                 /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
+                 DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()),
+                 Args, DAG, dl);
+    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+    DAG.setRoot(Result.second);
+    return 0;
+  }
+
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::umul_with_overflow:
+  case Intrinsic::smul_with_overflow: {
+    ISD::NodeType Op;
+    switch (Intrinsic) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::uadd_with_overflow: Op = ISD::UADDO; break;
+    case Intrinsic::sadd_with_overflow: Op = ISD::SADDO; break;
+    case Intrinsic::usub_with_overflow: Op = ISD::USUBO; break;
+    case Intrinsic::ssub_with_overflow: Op = ISD::SSUBO; break;
+    case Intrinsic::umul_with_overflow: Op = ISD::UMULO; break;
+    case Intrinsic::smul_with_overflow: Op = ISD::SMULO; break;
+    }
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+
+    SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
+    setValue(&I, DAG.getNode(Op, dl, VTs, Op1, Op2));
+    return 0;
+  }
+  case Intrinsic::prefetch: {
+    SDValue Ops[5];
+    unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+    Ops[0] = getRoot();
+    Ops[1] = getValue(I.getArgOperand(0));
+    Ops[2] = getValue(I.getArgOperand(1));
+    Ops[3] = getValue(I.getArgOperand(2));
+    Ops[4] = getValue(I.getArgOperand(3));
+    DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, dl,
+                                        DAG.getVTList(MVT::Other),
+                                        &Ops[0], 5,
+                                        EVT::getIntegerVT(*Context, 8),
+                                        MachinePointerInfo(I.getArgOperand(0)),
+                                        0, /* align */
+                                        false, /* volatile */
+                                        rw==0, /* read */
+                                        rw==1)); /* write */
+    return 0;
+  }
+  case Intrinsic::lifetime_start:
+  case Intrinsic::lifetime_end: {
+    bool IsStart = (Intrinsic == Intrinsic::lifetime_start);
+    // Stack coloring is not enabled in O0, discard region information.
+    if (TM.getOptLevel() == CodeGenOpt::None)
+      return 0;
+
+    SmallVector<Value *, 4> Allocas;
+    GetUnderlyingObjects(I.getArgOperand(1), Allocas, TD);
+
+    for (SmallVector<Value*, 4>::iterator Object = Allocas.begin(),
+         E = Allocas.end(); Object != E; ++Object) {
+      AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(*Object);
+
+      // Could not find an Alloca.
+      if (!LifetimeObject)
+        continue;
+
+      int FI = FuncInfo.StaticAllocaMap[LifetimeObject];
+
+      SDValue Ops[2];
+      Ops[0] = getRoot();
+      Ops[1] = DAG.getFrameIndex(FI, TLI.getPointerTy(), true);
+      unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END);
+
+      Res = DAG.getNode(Opcode, dl, MVT::Other, Ops, 2);
+      DAG.setRoot(Res);
+    }
+    return 0;
+  }
+  case Intrinsic::invariant_start:
+    // Discard region information.
+    setValue(&I, DAG.getUNDEF(TLI.getPointerTy()));
+    return 0;
+  case Intrinsic::invariant_end:
+    // Discard region information.
+    return 0;
+  case Intrinsic::donothing:
+    // ignore
+    return 0;
+  }
+}
+
+void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
+                                      bool isTailCall,
+                                      MachineBasicBlock *LandingPad) {
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  Type *RetTy = FTy->getReturnType();
+  MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+  MCSymbol *BeginLabel = 0;
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Args.reserve(CS.arg_size());
+
+  // Check whether the function can return without sret-demotion.
+  SmallVector<ISD::OutputArg, 4> Outs;
+  GetReturnInfo(RetTy, CS.getAttributes(), Outs, TLI);
+
+  bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
+                                           DAG.getMachineFunction(),
+                                           FTy->isVarArg(), Outs,
+                                           FTy->getContext());
+
+  SDValue DemoteStackSlot;
+  int DemoteStackIdx = -100;
+
+  if (!CanLowerReturn) {
+    uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(
+                      FTy->getReturnType());
+    unsigned Align  = TLI.getDataLayout()->getPrefTypeAlignment(
+                      FTy->getReturnType());
+    MachineFunction &MF = DAG.getMachineFunction();
+    DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
+    Type *StackSlotPtrType = PointerType::getUnqual(FTy->getReturnType());
+
+    DemoteStackSlot = DAG.getFrameIndex(DemoteStackIdx, TLI.getPointerTy());
+    Entry.Node = DemoteStackSlot;
+    Entry.Ty = StackSlotPtrType;
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    Entry.isInReg = false;
+    Entry.isSRet = true;
+    Entry.isNest = false;
+    Entry.isByVal = false;
+    Entry.Alignment = Align;
+    Args.push_back(Entry);
+    RetTy = Type::getVoidTy(FTy->getContext());
+  }
+
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    const Value *V = *i;
+
+    // Skip empty types
+    if (V->getType()->isEmptyTy())
+      continue;
+
+    SDValue ArgNode = getValue(V);
+    Entry.Node = ArgNode; Entry.Ty = V->getType();
+
+    unsigned attrInd = i - CS.arg_begin() + 1;
+    Entry.isSExt  = CS.paramHasAttr(attrInd, Attribute::SExt);
+    Entry.isZExt  = CS.paramHasAttr(attrInd, Attribute::ZExt);
+    Entry.isInReg = CS.paramHasAttr(attrInd, Attribute::InReg);
+    Entry.isSRet  = CS.paramHasAttr(attrInd, Attribute::StructRet);
+    Entry.isNest  = CS.paramHasAttr(attrInd, Attribute::Nest);
+    Entry.isByVal = CS.paramHasAttr(attrInd, Attribute::ByVal);
+    Entry.Alignment = CS.getParamAlignment(attrInd);
+    Args.push_back(Entry);
+  }
+
+  if (LandingPad) {
+    // Insert a label before the invoke call to mark the try range.  This can be
+    // used to detect deletion of the invoke via the MachineModuleInfo.
+    BeginLabel = MMI.getContext().CreateTempSymbol();
+
+    // For SjLj, keep track of which landing pads go with which invokes
+    // so as to maintain the ordering of pads in the LSDA.
+    unsigned CallSiteIndex = MMI.getCurrentCallSite();
+    if (CallSiteIndex) {
+      MMI.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
+      LPadToCallSiteMap[LandingPad].push_back(CallSiteIndex);
+
+      // Now that the call site is handled, stop tracking it.
+      MMI.setCurrentCallSite(0);
+    }
+
+    // Both PendingLoads and PendingExports must be flushed here;
+    // this call might not return.
+    (void)getRoot();
+    DAG.setRoot(DAG.getEHLabel(getCurDebugLoc(), getControlRoot(), BeginLabel));
+  }
+
+  // Check if target-independent constraints permit a tail call here.
+  // Target-dependent constraints are checked within TLI.LowerCallTo.
+  if (isTailCall && !isInTailCallPosition(CS, TLI))
+    isTailCall = false;
+
+  TargetLowering::
+  CallLoweringInfo CLI(getRoot(), RetTy, FTy, isTailCall, Callee, Args, DAG,
+                       getCurDebugLoc(), CS);
+  std::pair<SDValue,SDValue> Result = TLI.LowerCallTo(CLI);
+  assert((isTailCall || Result.second.getNode()) &&
+         "Non-null chain expected with non-tail call!");
+  assert((Result.second.getNode() || !Result.first.getNode()) &&
+         "Null value expected with tail call!");
+  if (Result.first.getNode()) {
+    setValue(CS.getInstruction(), Result.first);
+  } else if (!CanLowerReturn && Result.second.getNode()) {
+    // The instruction result is the result of loading from the
+    // hidden sret parameter.
+    SmallVector<EVT, 1> PVTs;
+    Type *PtrRetTy = PointerType::getUnqual(FTy->getReturnType());
+
+    ComputeValueVTs(TLI, PtrRetTy, PVTs);
+    assert(PVTs.size() == 1 && "Pointers should fit in one register");
+    EVT PtrVT = PVTs[0];
+
+    SmallVector<EVT, 4> RetTys;
+    SmallVector<uint64_t, 4> Offsets;
+    RetTy = FTy->getReturnType();
+    ComputeValueVTs(TLI, RetTy, RetTys, &Offsets);
+
+    unsigned NumValues = RetTys.size();
+    SmallVector<SDValue, 4> Values(NumValues);
+    SmallVector<SDValue, 4> Chains(NumValues);
+
+    for (unsigned i = 0; i < NumValues; ++i) {
+      SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT,
+                                DemoteStackSlot,
+                                DAG.getConstant(Offsets[i], PtrVT));
+      SDValue L = DAG.getLoad(RetTys[i], getCurDebugLoc(), Result.second, Add,
+                  MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]),
+                              false, false, false, 1);
+      Values[i] = L;
+      Chains[i] = L.getValue(1);
+    }
+
+    SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+                                MVT::Other, &Chains[0], NumValues);
+    PendingLoads.push_back(Chain);
+
+    setValue(CS.getInstruction(),
+             DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+                         DAG.getVTList(&RetTys[0], RetTys.size()),
+                         &Values[0], Values.size()));
+  }
+
+  // Assign order to nodes here. If the call does not produce a result, it won't
+  // be mapped to a SDNode and visit() will not assign it an order number.
+  if (!Result.second.getNode()) {
+    // As a special case, a null chain means that a tail call has been emitted and
+    // the DAG root is already updated.
+    HasTailCall = true;
+    ++SDNodeOrder;
+    AssignOrderingToNode(DAG.getRoot().getNode());
+  } else {
+    DAG.setRoot(Result.second);
+    ++SDNodeOrder;
+    AssignOrderingToNode(Result.second.getNode());
+  }
+
+  if (LandingPad) {
+    // Insert a label at the end of the invoke call to mark the try range.  This
+    // can be used to detect deletion of the invoke via the MachineModuleInfo.
+    MCSymbol *EndLabel = MMI.getContext().CreateTempSymbol();
+    DAG.setRoot(DAG.getEHLabel(getCurDebugLoc(), getRoot(), EndLabel));
+
+    // Inform MachineModuleInfo of range.
+    MMI.addInvoke(LandingPad, BeginLabel, EndLabel);
+  }
+}
+
+/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
+/// value is equal or not-equal to zero.
+static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    if (const ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+      if (IC->isEquality())
+        if (const Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+          if (C->isNullValue())
+            continue;
+    // Unknown instruction.
+    return false;
+  }
+  return true;
+}
+
+static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
+                             Type *LoadTy,
+                             SelectionDAGBuilder &Builder) {
+
+  // Check to see if this load can be trivially constant folded, e.g. if the
+  // input is from a string literal.
+  if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
+    // Cast pointer to the type we really want to load.
+    LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
+                                         PointerType::getUnqual(LoadTy));
+
+    if (const Constant *LoadCst =
+          ConstantFoldLoadFromConstPtr(const_cast<Constant *>(LoadInput),
+                                       Builder.TD))
+      return Builder.getValue(LoadCst);
+  }
+
+  // Otherwise, we have to emit the load.  If the pointer is to unfoldable but
+  // still constant memory, the input chain can be the entry node.
+  SDValue Root;
+  bool ConstantMemory = false;
+
+  // Do not serialize (non-volatile) loads of constant memory with anything.
+  if (Builder.AA->pointsToConstantMemory(PtrVal)) {
+    Root = Builder.DAG.getEntryNode();
+    ConstantMemory = true;
+  } else {
+    // Do not serialize non-volatile loads against each other.
+    Root = Builder.DAG.getRoot();
+  }
+
+  SDValue Ptr = Builder.getValue(PtrVal);
+  SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurDebugLoc(), Root,
+                                        Ptr, MachinePointerInfo(PtrVal),
+                                        false /*volatile*/,
+                                        false /*nontemporal*/, 
+                                        false /*isinvariant*/, 1 /* align=1 */);
+
+  if (!ConstantMemory)
+    Builder.PendingLoads.push_back(LoadVal.getValue(1));
+  return LoadVal;
+}
+
+
+/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
+/// If so, return true and lower it, otherwise return false and it will be
+/// lowered like a normal call.
+bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  int memcmp(void*,void*,size_t)
+  if (I.getNumArgOperands() != 3)
+    return false;
+
+  const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
+  if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() ||
+      !I.getArgOperand(2)->getType()->isIntegerTy() ||
+      !I.getType()->isIntegerTy())
+    return false;
+
+  const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2));
+
+  // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS)  != 0
+  // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS)  != 0
+  if (Size && IsOnlyUsedInZeroEqualityComparison(&I)) {
+    bool ActuallyDoIt = true;
+    MVT LoadVT;
+    Type *LoadTy;
+    switch (Size->getZExtValue()) {
+    default:
+      LoadVT = MVT::Other;
+      LoadTy = 0;
+      ActuallyDoIt = false;
+      break;
+    case 2:
+      LoadVT = MVT::i16;
+      LoadTy = Type::getInt16Ty(Size->getContext());
+      break;
+    case 4:
+      LoadVT = MVT::i32;
+      LoadTy = Type::getInt32Ty(Size->getContext());
+      break;
+    case 8:
+      LoadVT = MVT::i64;
+      LoadTy = Type::getInt64Ty(Size->getContext());
+      break;
+        /*
+    case 16:
+      LoadVT = MVT::v4i32;
+      LoadTy = Type::getInt32Ty(Size->getContext());
+      LoadTy = VectorType::get(LoadTy, 4);
+      break;
+         */
+    }
+
+    // This turns into unaligned loads.  We only do this if the target natively
+    // supports the MVT we'll be loading or if it is small enough (<= 4) that
+    // we'll only produce a small number of byte loads.
+
+    // Require that we can find a legal MVT, and only do this if the target
+    // supports unaligned loads of that type.  Expanding into byte loads would
+    // bloat the code.
+    if (ActuallyDoIt && Size->getZExtValue() > 4) {
+      // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+      // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+      if (!TLI.isTypeLegal(LoadVT) ||!TLI.allowsUnalignedMemoryAccesses(LoadVT))
+        ActuallyDoIt = false;
+    }
+
+    if (ActuallyDoIt) {
+      SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
+      SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
+
+      SDValue Res = DAG.getSetCC(getCurDebugLoc(), MVT::i1, LHSVal, RHSVal,
+                                 ISD::SETNE);
+      EVT CallVT = TLI.getValueType(I.getType(), true);
+      setValue(&I, DAG.getZExtOrTrunc(Res, getCurDebugLoc(), CallVT));
+      return true;
+    }
+  }
+
+
+  return false;
+}
+
+/// visitUnaryFloatCall - If a call instruction is a unary floating-point
+/// operation (as expected), translate it to an SDNode with the specified opcode
+/// and return true.
+bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
+                                              unsigned Opcode) {
+  // Sanity check that it really is a unary floating-point call.
+  if (I.getNumArgOperands() != 1 ||
+      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+      I.getType() != I.getArgOperand(0)->getType() ||
+      !I.onlyReadsMemory())
+    return false;
+
+  SDValue Tmp = getValue(I.getArgOperand(0));
+  setValue(&I, DAG.getNode(Opcode, getCurDebugLoc(), Tmp.getValueType(), Tmp));
+  return true;
+}
+
+void SelectionDAGBuilder::visitCall(const CallInst &I) {
+  // Handle inline assembly differently.
+  if (isa<InlineAsm>(I.getCalledValue())) {
+    visitInlineAsm(&I);
+    return;
+  }
+
+  MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+  ComputeUsesVAFloatArgument(I, &MMI);
+
+  const char *RenameFn = 0;
+  if (Function *F = I.getCalledFunction()) {
+    if (F->isDeclaration()) {
+      if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) {
+        if (unsigned IID = II->getIntrinsicID(F)) {
+          RenameFn = visitIntrinsicCall(I, IID);
+          if (!RenameFn)
+            return;
+        }
+      }
+      if (unsigned IID = F->getIntrinsicID()) {
+        RenameFn = visitIntrinsicCall(I, IID);
+        if (!RenameFn)
+          return;
+      }
+    }
+
+    // Check for well-known libc/libm calls.  If the function is internal, it
+    // can't be a library call.
+    LibFunc::Func Func;
+    if (!F->hasLocalLinkage() && F->hasName() &&
+        LibInfo->getLibFunc(F->getName(), Func) &&
+        LibInfo->hasOptimizedCodeGen(Func)) {
+      switch (Func) {
+      default: break;
+      case LibFunc::copysign:
+      case LibFunc::copysignf:
+      case LibFunc::copysignl:
+        if (I.getNumArgOperands() == 2 &&   // Basic sanity checks.
+            I.getArgOperand(0)->getType()->isFloatingPointTy() &&
+            I.getType() == I.getArgOperand(0)->getType() &&
+            I.getType() == I.getArgOperand(1)->getType() &&
+            I.onlyReadsMemory()) {
+          SDValue LHS = getValue(I.getArgOperand(0));
+          SDValue RHS = getValue(I.getArgOperand(1));
+          setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurDebugLoc(),
+                                   LHS.getValueType(), LHS, RHS));
+          return;
+        }
+        break;
+      case LibFunc::fabs:
+      case LibFunc::fabsf:
+      case LibFunc::fabsl:
+        if (visitUnaryFloatCall(I, ISD::FABS))
+          return;
+        break;
+      case LibFunc::sin:
+      case LibFunc::sinf:
+      case LibFunc::sinl:
+        if (visitUnaryFloatCall(I, ISD::FSIN))
+          return;
+        break;
+      case LibFunc::cos:
+      case LibFunc::cosf:
+      case LibFunc::cosl:
+        if (visitUnaryFloatCall(I, ISD::FCOS))
+          return;
+        break;
+      case LibFunc::sqrt:
+      case LibFunc::sqrtf:
+      case LibFunc::sqrtl:
+        if (visitUnaryFloatCall(I, ISD::FSQRT))
+          return;
+        break;
+      case LibFunc::floor:
+      case LibFunc::floorf:
+      case LibFunc::floorl:
+        if (visitUnaryFloatCall(I, ISD::FFLOOR))
+          return;
+        break;
+      case LibFunc::nearbyint:
+      case LibFunc::nearbyintf:
+      case LibFunc::nearbyintl:
+        if (visitUnaryFloatCall(I, ISD::FNEARBYINT))
+          return;
+        break;
+      case LibFunc::ceil:
+      case LibFunc::ceilf:
+      case LibFunc::ceill:
+        if (visitUnaryFloatCall(I, ISD::FCEIL))
+          return;
+        break;
+      case LibFunc::rint:
+      case LibFunc::rintf:
+      case LibFunc::rintl:
+        if (visitUnaryFloatCall(I, ISD::FRINT))
+          return;
+        break;
+      case LibFunc::trunc:
+      case LibFunc::truncf:
+      case LibFunc::truncl:
+        if (visitUnaryFloatCall(I, ISD::FTRUNC))
+          return;
+        break;
+      case LibFunc::log2:
+      case LibFunc::log2f:
+      case LibFunc::log2l:
+        if (visitUnaryFloatCall(I, ISD::FLOG2))
+          return;
+        break;
+      case LibFunc::exp2:
+      case LibFunc::exp2f:
+      case LibFunc::exp2l:
+        if (visitUnaryFloatCall(I, ISD::FEXP2))
+          return;
+        break;
+      case LibFunc::memcmp:
+        if (visitMemCmpCall(I))
+          return;
+        break;
+      }
+    }
+  }
+
+  SDValue Callee;
+  if (!RenameFn)
+    Callee = getValue(I.getCalledValue());
+  else
+    Callee = DAG.getExternalSymbol(RenameFn, TLI.getPointerTy());
+
+  // Check if we can potentially perform a tail call. More detailed checking is
+  // be done within LowerCallTo, after more information about the call is known.
+  LowerCallTo(&I, Callee, I.isTailCall());
+}
+
+namespace {
+
+/// AsmOperandInfo - This contains information for each constraint that we are
+/// lowering.
+class SDISelAsmOperandInfo : public TargetLowering::AsmOperandInfo {
+public:
+  /// CallOperand - If this is the result output operand or a clobber
+  /// this is null, otherwise it is the incoming operand to the CallInst.
+  /// This gets modified as the asm is processed.
+  SDValue CallOperand;
+
+  /// AssignedRegs - If this is a register or register class operand, this
+  /// contains the set of register corresponding to the operand.
+  RegsForValue AssignedRegs;
+
+  explicit SDISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &info)
+    : TargetLowering::AsmOperandInfo(info), CallOperand(0,0) {
+  }
+
+  /// getCallOperandValEVT - Return the EVT of the Value* that this operand
+  /// corresponds to.  If there is no Value* for this operand, it returns
+  /// MVT::Other.
+  EVT getCallOperandValEVT(LLVMContext &Context,
+                           const TargetLowering &TLI,
+                           const DataLayout *TD) const {
+    if (CallOperandVal == 0) return MVT::Other;
+
+    if (isa<BasicBlock>(CallOperandVal))
+      return TLI.getPointerTy();
+
+    llvm::Type *OpTy = CallOperandVal->getType();
+
+    // FIXME: code duplicated from TargetLowering::ParseConstraints().
+    // If this is an indirect operand, the operand is a pointer to the
+    // accessed type.
+    if (isIndirect) {
+      llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
+      if (!PtrTy)
+        report_fatal_error("Indirect operand for inline asm not a pointer!");
+      OpTy = PtrTy->getElementType();
+    }
+
+    // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
+    if (StructType *STy = dyn_cast<StructType>(OpTy))
+      if (STy->getNumElements() == 1)
+        OpTy = STy->getElementType(0);
+
+    // If OpTy is not a single value, it may be a struct/union that we
+    // can tile with integers.
+    if (!OpTy->isSingleValueType() && OpTy->isSized()) {
+      unsigned BitSize = TD->getTypeSizeInBits(OpTy);
+      switch (BitSize) {
+      default: break;
+      case 1:
+      case 8:
+      case 16:
+      case 32:
+      case 64:
+      case 128:
+        OpTy = IntegerType::get(Context, BitSize);
+        break;
+      }
+    }
+
+    return TLI.getValueType(OpTy, true);
+  }
+};
+
+typedef SmallVector<SDISelAsmOperandInfo,16> SDISelAsmOperandInfoVector;
+
+} // end anonymous namespace
+
+/// GetRegistersForValue - Assign registers (virtual or physical) for the
+/// specified operand.  We prefer to assign virtual registers, to allow the
+/// register allocator to handle the assignment process.  However, if the asm
+/// uses features that we can't model on machineinstrs, we have SDISel do the
+/// allocation.  This produces generally horrible, but correct, code.
+///
+///   OpInfo describes the operand.
+///
+static void GetRegistersForValue(SelectionDAG &DAG,
+                                 const TargetLowering &TLI,
+                                 DebugLoc DL,
+                                 SDISelAsmOperandInfo &OpInfo) {
+  LLVMContext &Context = *DAG.getContext();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  SmallVector<unsigned, 4> Regs;
+
+  // If this is a constraint for a single physreg, or a constraint for a
+  // register class, find it.
+  std::pair<unsigned, const TargetRegisterClass*> PhysReg =
+    TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+                                     OpInfo.ConstraintVT);
+
+  unsigned NumRegs = 1;
+  if (OpInfo.ConstraintVT != MVT::Other) {
+    // If this is a FP input in an integer register (or visa versa) insert a bit
+    // cast of the input value.  More generally, handle any case where the input
+    // value disagrees with the register class we plan to stick this in.
+    if (OpInfo.Type == InlineAsm::isInput &&
+        PhysReg.second && !PhysReg.second->hasType(OpInfo.ConstraintVT)) {
+      // Try to convert to the first EVT that the reg class contains.  If the
+      // types are identical size, use a bitcast to convert (e.g. two differing
+      // vector types).
+      MVT RegVT = *PhysReg.second->vt_begin();
+      if (RegVT.getSizeInBits() == OpInfo.ConstraintVT.getSizeInBits()) {
+        OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL,
+                                         RegVT, OpInfo.CallOperand);
+        OpInfo.ConstraintVT = RegVT;
+      } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) {
+        // If the input is a FP value and we want it in FP registers, do a
+        // bitcast to the corresponding integer type.  This turns an f64 value
+        // into i64, which can be passed with two i32 values on a 32-bit
+        // machine.
+        RegVT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits());
+        OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL,
+                                         RegVT, OpInfo.CallOperand);
+        OpInfo.ConstraintVT = RegVT;
+      }
+    }
+
+    NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT);
+  }
+
+  MVT RegVT;
+  EVT ValueVT = OpInfo.ConstraintVT;
+
+  // If this is a constraint for a specific physical register, like {r17},
+  // assign it now.
+  if (unsigned AssignedReg = PhysReg.first) {
+    const TargetRegisterClass *RC = PhysReg.second;
+    if (OpInfo.ConstraintVT == MVT::Other)
+      ValueVT = *RC->vt_begin();
+
+    // Get the actual register value type.  This is important, because the user
+    // may have asked for (e.g.) the AX register in i32 type.  We need to
+    // remember that AX is actually i16 to get the right extension.
+    RegVT = *RC->vt_begin();
+
+    // This is a explicit reference to a physical register.
+    Regs.push_back(AssignedReg);
+
+    // If this is an expanded reference, add the rest of the regs to Regs.
+    if (NumRegs != 1) {
+      TargetRegisterClass::iterator I = RC->begin();
+      for (; *I != AssignedReg; ++I)
+        assert(I != RC->end() && "Didn't find reg!");
+
+      // Already added the first reg.
+      --NumRegs; ++I;
+      for (; NumRegs; --NumRegs, ++I) {
+        assert(I != RC->end() && "Ran out of registers to allocate!");
+        Regs.push_back(*I);
+      }
+    }
+
+    OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
+    return;
+  }
+
+  // Otherwise, if this was a reference to an LLVM register class, create vregs
+  // for this reference.
+  if (const TargetRegisterClass *RC = PhysReg.second) {
+    RegVT = *RC->vt_begin();
+    if (OpInfo.ConstraintVT == MVT::Other)
+      ValueVT = RegVT;
+
+    // Create the appropriate number of virtual registers.
+    MachineRegisterInfo &RegInfo = MF.getRegInfo();
+    for (; NumRegs; --NumRegs)
+      Regs.push_back(RegInfo.createVirtualRegister(RC));
+
+    OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
+    return;
+  }
+
+  // Otherwise, we couldn't allocate enough registers for this.
+}
+
+/// visitInlineAsm - Handle a call to an InlineAsm object.
+///
+void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
+  const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+
+  /// ConstraintOperands - Information about all of the constraints.
+  SDISelAsmOperandInfoVector ConstraintOperands;
+
+  TargetLowering::AsmOperandInfoVector
+    TargetConstraints = TLI.ParseConstraints(CS);
+
+  bool hasMemory = false;
+
+  unsigned ArgNo = 0;   // ArgNo - The argument of the CallInst.
+  unsigned ResNo = 0;   // ResNo - The result number of the next output.
+  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+    ConstraintOperands.push_back(SDISelAsmOperandInfo(TargetConstraints[i]));
+    SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+    MVT OpVT = MVT::Other;
+
+    // Compute the value type for each operand.
+    switch (OpInfo.Type) {
+    case InlineAsm::isOutput:
+      // Indirect outputs just consume an argument.
+      if (OpInfo.isIndirect) {
+        OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+        break;
+      }
+
+      // The return value of the call is this value.  As such, there is no
+      // corresponding argument.
+      assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
+      if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
+        OpVT = TLI.getSimpleValueType(STy->getElementType(ResNo));
+      } else {
+        assert(ResNo == 0 && "Asm only has one result!");
+        OpVT = TLI.getSimpleValueType(CS.getType());
+      }
+      ++ResNo;
+      break;
+    case InlineAsm::isInput:
+      OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+      break;
+    case InlineAsm::isClobber:
+      // Nothing to do.
+      break;
+    }
+
+    // If this is an input or an indirect output, process the call argument.
+    // BasicBlocks are labels, currently appearing only in asm's.
+    if (OpInfo.CallOperandVal) {
+      if (const BasicBlock *BB = dyn_cast<BasicBlock>(OpInfo.CallOperandVal)) {
+        OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]);
+      } else {
+        OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
+      }
+
+      OpVT = OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI, TD).
+        getSimpleVT();
+    }
+
+    OpInfo.ConstraintVT = OpVT;
+
+    // Indirect operand accesses access memory.
+    if (OpInfo.isIndirect)
+      hasMemory = true;
+    else {
+      for (unsigned j = 0, ee = OpInfo.Codes.size(); j != ee; ++j) {
+        TargetLowering::ConstraintType
+          CType = TLI.getConstraintType(OpInfo.Codes[j]);
+        if (CType == TargetLowering::C_Memory) {
+          hasMemory = true;
+          break;
+        }
+      }
+    }
+  }
+
+  SDValue Chain, Flag;
+
+  // We won't need to flush pending loads if this asm doesn't touch
+  // memory and is nonvolatile.
+  if (hasMemory || IA->hasSideEffects())
+    Chain = getRoot();
+  else
+    Chain = DAG.getRoot();
+
+  // Second pass over the constraints: compute which constraint option to use
+  // and assign registers to constraints that want a specific physreg.
+  for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
+    SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];
+
+    // If this is an output operand with a matching input operand, look up the
+    // matching input. If their types mismatch, e.g. one is an integer, the
+    // other is floating point, or their sizes are different, flag it as an
+    // error.
+    if (OpInfo.hasMatchingInput()) {
+      SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+
+      if (OpInfo.ConstraintVT != Input.ConstraintVT) {
+        std::pair<unsigned, const TargetRegisterClass*> MatchRC =
+          TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+                                           OpInfo.ConstraintVT);
+        std::pair<unsigned, const TargetRegisterClass*> InputRC =
+          TLI.getRegForInlineAsmConstraint(Input.ConstraintCode,
+                                           Input.ConstraintVT);
+        if ((OpInfo.ConstraintVT.isInteger() !=
+             Input.ConstraintVT.isInteger()) ||
+            (MatchRC.second != InputRC.second)) {
+          report_fatal_error("Unsupported asm: input constraint"
+                             " with a matching output constraint of"
+                             " incompatible type!");
+        }
+        Input.ConstraintVT = OpInfo.ConstraintVT;
+      }
+    }
+
+    // Compute the constraint code and ConstraintType to use.
+    TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);
+
+    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+        OpInfo.Type == InlineAsm::isClobber)
+      continue;
+
+    // If this is a memory input, and if the operand is not indirect, do what we
+    // need to to provide an address for the memory input.
+    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+        !OpInfo.isIndirect) {
+      assert((OpInfo.isMultipleAlternative ||
+              (OpInfo.Type == InlineAsm::isInput)) &&
+             "Can only indirectify direct input operands!");
+
+      // Memory operands really want the address of the value.  If we don't have
+      // an indirect input, put it in the constpool if we can, otherwise spill
+      // it to a stack slot.
+      // TODO: This isn't quite right. We need to handle these according to
+      // the addressing mode that the constraint wants. Also, this may take
+      // an additional register for the computation and we don't want that
+      // either.
+
+      // If the operand is a float, integer, or vector constant, spill to a
+      // constant pool entry to get its address.
+      const Value *OpVal = OpInfo.CallOperandVal;
+      if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) ||
+          isa<ConstantVector>(OpVal) || isa<ConstantDataVector>(OpVal)) {
+        OpInfo.CallOperand = DAG.getConstantPool(cast<Constant>(OpVal),
+                                                 TLI.getPointerTy());
+      } else {
+        // Otherwise, create a stack slot and emit a store to it before the
+        // asm.
+        Type *Ty = OpVal->getType();
+        uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
+        unsigned Align  = TLI.getDataLayout()->getPrefTypeAlignment(Ty);
+        MachineFunction &MF = DAG.getMachineFunction();
+        int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
+        SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy());
+        Chain = DAG.getStore(Chain, getCurDebugLoc(),
+                             OpInfo.CallOperand, StackSlot,
+                             MachinePointerInfo::getFixedStack(SSFI),
+                             false, false, 0);
+        OpInfo.CallOperand = StackSlot;
+      }
+
+      // There is no longer a Value* corresponding to this operand.
+      OpInfo.CallOperandVal = 0;
+
+      // It is now an indirect operand.
+      OpInfo.isIndirect = true;
+    }
+
+    // If this constraint is for a specific register, allocate it before
+    // anything else.
+    if (OpInfo.ConstraintType == TargetLowering::C_Register)
+      GetRegistersForValue(DAG, TLI, getCurDebugLoc(), OpInfo);
+  }
+
+  // Second pass - Loop over all of the operands, assigning virtual or physregs
+  // to register class operands.
+  for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
+    SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];
+
+    // C_Register operands have already been allocated, Other/Memory don't need
+    // to be.
+    if (OpInfo.ConstraintType == TargetLowering::C_RegisterClass)
+      GetRegistersForValue(DAG, TLI, getCurDebugLoc(), OpInfo);
+  }
+
+  // AsmNodeOperands - The operands for the ISD::INLINEASM node.
+  std::vector<SDValue> AsmNodeOperands;
+  AsmNodeOperands.push_back(SDValue());  // reserve space for input chain
+  AsmNodeOperands.push_back(
+          DAG.getTargetExternalSymbol(IA->getAsmString().c_str(),
+                                      TLI.getPointerTy()));
+
+  // If we have a !srcloc metadata node associated with it, we want to attach
+  // this to the ultimately generated inline asm machineinstr.  To do this, we
+  // pass in the third operand as this (potentially null) inline asm MDNode.
+  const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc");
+  AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc));
+
+  // Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore
+  // bits as operand 3.
+  unsigned ExtraInfo = 0;
+  if (IA->hasSideEffects())
+    ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+  if (IA->isAlignStack())
+    ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+  // Set the asm dialect.
+  ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
+
+  // Determine if this InlineAsm MayLoad or MayStore based on the constraints.
+  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+
+    // Compute the constraint code and ConstraintType to use.
+    TLI.ComputeConstraintToUse(OpInfo, SDValue());
+
+    // Ideally, we would only check against memory constraints.  However, the
+    // meaning of an other constraint can be target-specific and we can't easily
+    // reason about it.  Therefore, be conservative and set MayLoad/MayStore
+    // for other constriants as well.
+    if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
+        OpInfo.ConstraintType == TargetLowering::C_Other) {
+      if (OpInfo.Type == InlineAsm::isInput)
+        ExtraInfo |= InlineAsm::Extra_MayLoad;
+      else if (OpInfo.Type == InlineAsm::isOutput)
+        ExtraInfo |= InlineAsm::Extra_MayStore;
+      else if (OpInfo.Type == InlineAsm::isClobber)
+        ExtraInfo |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore);
+    }
+  }
+
+  AsmNodeOperands.push_back(DAG.getTargetConstant(ExtraInfo,
+                                                  TLI.getPointerTy()));
+
+  // Loop over all of the inputs, copying the operand values into the
+  // appropriate registers and processing the output regs.
+  RegsForValue RetValRegs;
+
+  // IndirectStoresToEmit - The set of stores to emit after the inline asm node.
+  std::vector<std::pair<RegsForValue, Value*> > IndirectStoresToEmit;
+
+  for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
+    SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];
+
+    switch (OpInfo.Type) {
+    case InlineAsm::isOutput: {
+      if (OpInfo.ConstraintType != TargetLowering::C_RegisterClass &&
+          OpInfo.ConstraintType != TargetLowering::C_Register) {
+        // Memory output, or 'other' output (e.g. 'X' constraint).
+        assert(OpInfo.isIndirect && "Memory output must be indirect operand");
+
+        // Add information to the INLINEASM node to know about this output.
+        unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+        AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags,
+                                                        TLI.getPointerTy()));
+        AsmNodeOperands.push_back(OpInfo.CallOperand);
+        break;
+      }
+
+      // Otherwise, this is a register or register class output.
+
+      // Copy the output from the appropriate register.  Find a register that
+      // we can use.
+      if (OpInfo.AssignedRegs.Regs.empty()) {
+        LLVMContext &Ctx = *DAG.getContext();
+        Ctx.emitError(CS.getInstruction(),  
+                      "couldn't allocate output register for constraint '" +
+                           Twine(OpInfo.ConstraintCode) + "'");
+        break;
+      }
+
+      // If this is an indirect operand, store through the pointer after the
+      // asm.
+      if (OpInfo.isIndirect) {
+        IndirectStoresToEmit.push_back(std::make_pair(OpInfo.AssignedRegs,
+                                                      OpInfo.CallOperandVal));
+      } else {
+        // This is the result value of the call.
+        assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
+        // Concatenate this output onto the outputs list.
+        RetValRegs.append(OpInfo.AssignedRegs);
+      }
+
+      // Add information to the INLINEASM node to know that this register is
+      // set.
+      OpInfo.AssignedRegs.AddInlineAsmOperands(OpInfo.isEarlyClobber ?
+                                           InlineAsm::Kind_RegDefEarlyClobber :
+                                               InlineAsm::Kind_RegDef,
+                                               false,
+                                               0,
+                                               DAG,
+                                               AsmNodeOperands);
+      break;
+    }
+    case InlineAsm::isInput: {
+      SDValue InOperandVal = OpInfo.CallOperand;
+
+      if (OpInfo.isMatchingInputConstraint()) {   // Matching constraint?
+        // If this is required to match an output register we have already set,
+        // just use its register.
+        unsigned OperandNo = OpInfo.getMatchedOperand();
+
+        // Scan until we find the definition we already emitted of this operand.
+        // When we find it, create a RegsForValue operand.
+        unsigned CurOp = InlineAsm::Op_FirstOperand;
+        for (; OperandNo; --OperandNo) {
+          // Advance to the next operand.
+          unsigned OpFlag =
+            cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
+          assert((InlineAsm::isRegDefKind(OpFlag) ||
+                  InlineAsm::isRegDefEarlyClobberKind(OpFlag) ||
+                  InlineAsm::isMemKind(OpFlag)) && "Skipped past definitions?");
+          CurOp += InlineAsm::getNumOperandRegisters(OpFlag)+1;
+        }
+
+        unsigned OpFlag =
+          cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
+        if (InlineAsm::isRegDefKind(OpFlag) ||
+            InlineAsm::isRegDefEarlyClobberKind(OpFlag)) {
+          // Add (OpFlag&0xffff)>>3 registers to MatchedRegs.
+          if (OpInfo.isIndirect) {
+            // This happens on gcc/testsuite/gcc.dg/pr8788-1.c
+            LLVMContext &Ctx = *DAG.getContext();
+            Ctx.emitError(CS.getInstruction(),  "inline asm not supported yet:"
+                          " don't know how to handle tied "
+                          "indirect register inputs");
+            report_fatal_error("Cannot handle indirect register inputs!");
+          }
+
+          RegsForValue MatchedRegs;
+          MatchedRegs.ValueVTs.push_back(InOperandVal.getValueType());
+          MVT RegVT = AsmNodeOperands[CurOp+1].getSimpleValueType();
+          MatchedRegs.RegVTs.push_back(RegVT);
+          MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
+          for (unsigned i = 0, e = InlineAsm::getNumOperandRegisters(OpFlag);
+               i != e; ++i)
+            MatchedRegs.Regs.push_back
+              (RegInfo.createVirtualRegister(TLI.getRegClassFor(RegVT)));
+
+          // Use the produced MatchedRegs object to
+          MatchedRegs.getCopyToRegs(InOperandVal, DAG, getCurDebugLoc(),
+                                    Chain, &Flag, CS.getInstruction());
+          MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
+                                           true, OpInfo.getMatchedOperand(),
+                                           DAG, AsmNodeOperands);
+          break;
+        }
+
+        assert(InlineAsm::isMemKind(OpFlag) && "Unknown matching constraint!");
+        assert(InlineAsm::getNumOperandRegisters(OpFlag) == 1 &&
+               "Unexpected number of operands");
+        // Add information to the INLINEASM node to know about this input.
+        // See InlineAsm.h isUseOperandTiedToDef.
+        OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag,
+                                                    OpInfo.getMatchedOperand());
+        AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlag,
+                                                        TLI.getPointerTy()));
+        AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
+        break;
+      }
+
+      // Treat indirect 'X' constraint as memory.
+      if (OpInfo.ConstraintType == TargetLowering::C_Other &&
+          OpInfo.isIndirect)
+        OpInfo.ConstraintType = TargetLowering::C_Memory;
+
+      if (OpInfo.ConstraintType == TargetLowering::C_Other) {
+        std::vector<SDValue> Ops;
+        TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
+                                         Ops, DAG);
+        if (Ops.empty()) {
+          LLVMContext &Ctx = *DAG.getContext();
+          Ctx.emitError(CS.getInstruction(),
+                        "invalid operand for inline asm constraint '" +
+                        Twine(OpInfo.ConstraintCode) + "'");
+          break;
+        }
+
+        // Add information to the INLINEASM node to know about this input.
+        unsigned ResOpType =
+          InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
+        AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
+                                                        TLI.getPointerTy()));
+        AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end());
+        break;
+      }
+
+      if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
+        assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
+        assert(InOperandVal.getValueType() == TLI.getPointerTy() &&
+               "Memory operands expect pointer values");
+
+        // Add information to the INLINEASM node to know about this input.
+        unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+        AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
+                                                        TLI.getPointerTy()));
+        AsmNodeOperands.push_back(InOperandVal);
+        break;
+      }
+
+      assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
+              OpInfo.ConstraintType == TargetLowering::C_Register) &&
+             "Unknown constraint type!");
+
+      // TODO: Support this.
+      if (OpInfo.isIndirect) {
+        LLVMContext &Ctx = *DAG.getContext();
+        Ctx.emitError(CS.getInstruction(),
+                      "Don't know how to handle indirect register inputs yet "
+                      "for constraint '" + Twine(OpInfo.ConstraintCode) + "'");
+        break;
+      }
+
+      // Copy the input into the appropriate registers.
+      if (OpInfo.AssignedRegs.Regs.empty()) {
+        LLVMContext &Ctx = *DAG.getContext();
+        Ctx.emitError(CS.getInstruction(), 
+                      "couldn't allocate input reg for constraint '" +
+                           Twine(OpInfo.ConstraintCode) + "'");
+        break;
+      }
+
+      OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, getCurDebugLoc(),
+                                        Chain, &Flag, CS.getInstruction());
+
+      OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0,
+                                               DAG, AsmNodeOperands);
+      break;
+    }
+    case InlineAsm::isClobber: {
+      // Add the clobbered value to the operand list, so that the register
+      // allocator is aware that the physreg got clobbered.
+      if (!OpInfo.AssignedRegs.Regs.empty())
+        OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_Clobber,
+                                                 false, 0, DAG,
+                                                 AsmNodeOperands);
+      break;
+    }
+    }
+  }
+
+  // Finish up input operands.  Set the input chain and add the flag last.
+  AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+  if (Flag.getNode()) AsmNodeOperands.push_back(Flag);
+
+  Chain = DAG.getNode(ISD::INLINEASM, getCurDebugLoc(),
+                      DAG.getVTList(MVT::Other, MVT::Glue),
+                      &AsmNodeOperands[0], AsmNodeOperands.size());
+  Flag = Chain.getValue(1);
+
+  // If this asm returns a register value, copy the result from that register
+  // and set it as the value of the call.
+  if (!RetValRegs.Regs.empty()) {
+    SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(),
+                                             Chain, &Flag, CS.getInstruction());
+
+    // FIXME: Why don't we do this for inline asms with MRVs?
+    if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
+      EVT ResultType = TLI.getValueType(CS.getType());
+
+      // If any of the results of the inline asm is a vector, it may have the
+      // wrong width/num elts.  This can happen for register classes that can
+      // contain multiple different value types.  The preg or vreg allocated may
+      // not have the same VT as was expected.  Convert it to the right type
+      // with bit_convert.
+      if (ResultType != Val.getValueType() && Val.getValueType().isVector()) {
+        Val = DAG.getNode(ISD::BITCAST, getCurDebugLoc(),
+                          ResultType, Val);
+
+      } else if (ResultType != Val.getValueType() &&
+                 ResultType.isInteger() && Val.getValueType().isInteger()) {
+        // If a result value was tied to an input value, the computed result may
+        // have a wider width than the expected result.  Extract the relevant
+        // portion.
+        Val = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), ResultType, Val);
+      }
+
+      assert(ResultType == Val.getValueType() && "Asm result value mismatch!");
+    }
+
+    setValue(CS.getInstruction(), Val);
+    // Don't need to use this as a chain in this case.
+    if (!IA->hasSideEffects() && !hasMemory && IndirectStoresToEmit.empty())
+      return;
+  }
+
+  std::vector<std::pair<SDValue, const Value *> > StoresToEmit;
+
+  // Process indirect outputs, first output all of the flagged copies out of
+  // physregs.
+  for (unsigned i = 0, e = IndirectStoresToEmit.size(); i != e; ++i) {
+    RegsForValue &OutRegs = IndirectStoresToEmit[i].first;
+    const Value *Ptr = IndirectStoresToEmit[i].second;
+    SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(),
+                                             Chain, &Flag, IA);
+    StoresToEmit.push_back(std::make_pair(OutVal, Ptr));
+  }
+
+  // Emit the non-flagged stores from the physregs.
+  SmallVector<SDValue, 8> OutChains;
+  for (unsigned i = 0, e = StoresToEmit.size(); i != e; ++i) {
+    SDValue Val = DAG.getStore(Chain, getCurDebugLoc(),
+                               StoresToEmit[i].first,
+                               getValue(StoresToEmit[i].second),
+                               MachinePointerInfo(StoresToEmit[i].second),
+                               false, false, 0);
+    OutChains.push_back(Val);
+  }
+
+  if (!OutChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+                        &OutChains[0], OutChains.size());
+
+  DAG.setRoot(Chain);
+}
+
+void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
+  DAG.setRoot(DAG.getNode(ISD::VASTART, getCurDebugLoc(),
+                          MVT::Other, getRoot(),
+                          getValue(I.getArgOperand(0)),
+                          DAG.getSrcValue(I.getArgOperand(0))));
+}
+
+void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
+  const DataLayout &TD = *TLI.getDataLayout();
+  SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurDebugLoc(),
+                           getRoot(), getValue(I.getOperand(0)),
+                           DAG.getSrcValue(I.getOperand(0)),
+                           TD.getABITypeAlignment(I.getType()));
+  setValue(&I, V);
+  DAG.setRoot(V.getValue(1));
+}
+
+void SelectionDAGBuilder::visitVAEnd(const CallInst &I) {
+  DAG.setRoot(DAG.getNode(ISD::VAEND, getCurDebugLoc(),
+                          MVT::Other, getRoot(),
+                          getValue(I.getArgOperand(0)),
+                          DAG.getSrcValue(I.getArgOperand(0))));
+}
+
+void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
+  DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurDebugLoc(),
+                          MVT::Other, getRoot(),
+                          getValue(I.getArgOperand(0)),
+                          getValue(I.getArgOperand(1)),
+                          DAG.getSrcValue(I.getArgOperand(0)),
+                          DAG.getSrcValue(I.getArgOperand(1))));
+}
+
+/// TargetLowering::LowerCallTo - This is the default LowerCallTo
+/// implementation, which just calls LowerCall.
+/// FIXME: When all targets are
+/// migrated to using LowerCall, this hook should be integrated into SDISel.
+std::pair<SDValue, SDValue>
+TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
+  // Handle all of the outgoing arguments.
+  CLI.Outs.clear();
+  CLI.OutVals.clear();
+  ArgListTy &Args = CLI.Args;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(*this, Args[i].Ty, ValueVTs);
+    for (unsigned Value = 0, NumValues = ValueVTs.size();
+         Value != NumValues; ++Value) {
+      EVT VT = ValueVTs[Value];
+      Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext());
+      SDValue Op = SDValue(Args[i].Node.getNode(),
+                           Args[i].Node.getResNo() + Value);
+      ISD::ArgFlagsTy Flags;
+      unsigned OriginalAlignment =
+        getDataLayout()->getABITypeAlignment(ArgTy);
+
+      if (Args[i].isZExt)
+        Flags.setZExt();
+      if (Args[i].isSExt)
+        Flags.setSExt();
+      if (Args[i].isInReg)
+        Flags.setInReg();
+      if (Args[i].isSRet)
+        Flags.setSRet();
+      if (Args[i].isByVal) {
+        Flags.setByVal();
+        PointerType *Ty = cast<PointerType>(Args[i].Ty);
+        Type *ElementTy = Ty->getElementType();
+        Flags.setByValSize(getDataLayout()->getTypeAllocSize(ElementTy));
+        // For ByVal, alignment should come from FE.  BE will guess if this
+        // info is not there but there are cases it cannot get right.
+        unsigned FrameAlign;
+        if (Args[i].Alignment)
+          FrameAlign = Args[i].Alignment;
+        else
+          FrameAlign = getByValTypeAlignment(ElementTy);
+        Flags.setByValAlign(FrameAlign);
+      }
+      if (Args[i].isNest)
+        Flags.setNest();
+      Flags.setOrigAlign(OriginalAlignment);
+
+      MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
+      unsigned NumParts = getNumRegisters(CLI.RetTy->getContext(), VT);
+      SmallVector<SDValue, 4> Parts(NumParts);
+      ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+      if (Args[i].isSExt)
+        ExtendKind = ISD::SIGN_EXTEND;
+      else if (Args[i].isZExt)
+        ExtendKind = ISD::ZERO_EXTEND;
+
+      getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts,
+                     PartVT, CLI.CS ? CLI.CS->getInstruction() : 0, ExtendKind);
+
+      for (unsigned j = 0; j != NumParts; ++j) {
+        // if it isn't first piece, alignment must be 1
+        ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(),
+                               i < CLI.NumFixedArgs,
+                               i, j*Parts[j].getValueType().getStoreSize());
+        if (NumParts > 1 && j == 0)
+          MyFlags.Flags.setSplit();
+        else if (j != 0)
+          MyFlags.Flags.setOrigAlign(1);
+
+        CLI.Outs.push_back(MyFlags);
+        CLI.OutVals.push_back(Parts[j]);
+      }
+    }
+  }
+
+  // Handle the incoming return values from the call.
+  CLI.Ins.clear();
+  SmallVector<EVT, 4> RetTys;
+  ComputeValueVTs(*this, CLI.RetTy, RetTys);
+  for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+    EVT VT = RetTys[I];
+    MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
+    unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+    for (unsigned i = 0; i != NumRegs; ++i) {
+      ISD::InputArg MyFlags;
+      MyFlags.VT = RegisterVT;
+      MyFlags.Used = CLI.IsReturnValueUsed;
+      if (CLI.RetSExt)
+        MyFlags.Flags.setSExt();
+      if (CLI.RetZExt)
+        MyFlags.Flags.setZExt();
+      if (CLI.IsInReg)
+        MyFlags.Flags.setInReg();
+      CLI.Ins.push_back(MyFlags);
+    }
+  }
+
+  SmallVector<SDValue, 4> InVals;
+  CLI.Chain = LowerCall(CLI, InVals);
+
+  // Verify that the target's LowerCall behaved as expected.
+  assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other &&
+         "LowerCall didn't return a valid chain!");
+  assert((!CLI.IsTailCall || InVals.empty()) &&
+         "LowerCall emitted a return value for a tail call!");
+  assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) &&
+         "LowerCall didn't emit the correct number of values!");
+
+  // For a tail call, the return value is merely live-out and there aren't
+  // any nodes in the DAG representing it. Return a special value to
+  // indicate that a tail call has been emitted and no more Instructions
+  // should be processed in the current block.
+  if (CLI.IsTailCall) {
+    CLI.DAG.setRoot(CLI.Chain);
+    return std::make_pair(SDValue(), SDValue());
+  }
+
+  DEBUG(for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) {
+          assert(InVals[i].getNode() &&
+                 "LowerCall emitted a null value!");
+          assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() &&
+                 "LowerCall emitted a value with the wrong type!");
+        });
+
+  // Collect the legal value parts into potentially illegal values
+  // that correspond to the original function's return values.
+  ISD::NodeType AssertOp = ISD::DELETED_NODE;
+  if (CLI.RetSExt)
+    AssertOp = ISD::AssertSext;
+  else if (CLI.RetZExt)
+    AssertOp = ISD::AssertZext;
+  SmallVector<SDValue, 4> ReturnValues;
+  unsigned CurReg = 0;
+  for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+    EVT VT = RetTys[I];
+    MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
+    unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+
+    ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
+                                            NumRegs, RegisterVT, VT, NULL,
+                                            AssertOp));
+    CurReg += NumRegs;
+  }
+
+  // For a function returning void, there is no return value. We can't create
+  // such a node, so we just return a null return value in that case. In
+  // that case, nothing will actually look at the value.
+  if (ReturnValues.empty())
+    return std::make_pair(SDValue(), CLI.Chain);
+
+  SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL,
+                                CLI.DAG.getVTList(&RetTys[0], RetTys.size()),
+                            &ReturnValues[0], ReturnValues.size());
+  return std::make_pair(Res, CLI.Chain);
+}
+
+void TargetLowering::LowerOperationWrapper(SDNode *N,
+                                           SmallVectorImpl<SDValue> &Results,
+                                           SelectionDAG &DAG) const {
+  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+  if (Res.getNode())
+    Results.push_back(Res);
+}
+
+SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  llvm_unreachable("LowerOperation not implemented for this target!");
+}
+
+void
+SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
+  SDValue Op = getNonRegisterValue(V);
+  assert((Op.getOpcode() != ISD::CopyFromReg ||
+          cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
+         "Copy from a reg to the same reg!");
+  assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
+
+  RegsForValue RFV(V->getContext(), TLI, Reg, V->getType());
+  SDValue Chain = DAG.getEntryNode();
+  RFV.getCopyToRegs(Op, DAG, getCurDebugLoc(), Chain, 0, V);
+  PendingExports.push_back(Chain);
+}
+
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
+/// entry block, return true.  This includes arguments used by switches, since
+/// the switch may expand into multiple basic blocks.
+static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
+  // With FastISel active, we may be splitting blocks, so force creation
+  // of virtual registers for all non-dead arguments.
+  if (FastISel)
+    return A->use_empty();
+
+  const BasicBlock *Entry = A->getParent()->begin();
+  for (Value::const_use_iterator UI = A->use_begin(), E = A->use_end();
+       UI != E; ++UI) {
+    const User *U = *UI;
+    if (cast<Instruction>(U)->getParent() != Entry || isa<SwitchInst>(U))
+      return false;  // Use not in entry block.
+  }
+  return true;
+}
+
+void SelectionDAGISel::LowerArguments(const Function &F) {
+  SelectionDAG &DAG = SDB->DAG;
+  DebugLoc dl = SDB->getCurDebugLoc();
+  const DataLayout *TD = TLI.getDataLayout();
+  SmallVector<ISD::InputArg, 16> Ins;
+
+  if (!FuncInfo->CanLowerReturn) {
+    // Put in an sret pointer parameter before all the other parameters.
+    SmallVector<EVT, 1> ValueVTs;
+    ComputeValueVTs(TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
+
+    // NOTE: Assuming that a pointer will never break down to more than one VT
+    // or one register.
+    ISD::ArgFlagsTy Flags;
+    Flags.setSRet();
+    MVT RegisterVT = TLI.getRegisterType(*DAG.getContext(), ValueVTs[0]);
+    ISD::InputArg RetArg(Flags, RegisterVT, true, 0, 0);
+    Ins.push_back(RetArg);
+  }
+
+  // Set up the incoming argument description vector.
+  unsigned Idx = 1;
+  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
+       I != E; ++I, ++Idx) {
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(TLI, I->getType(), ValueVTs);
+    bool isArgValueUsed = !I->use_empty();
+    for (unsigned Value = 0, NumValues = ValueVTs.size();
+         Value != NumValues; ++Value) {
+      EVT VT = ValueVTs[Value];
+      Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+      ISD::ArgFlagsTy Flags;
+      unsigned OriginalAlignment =
+        TD->getABITypeAlignment(ArgTy);
+
+      if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
+        Flags.setZExt();
+      if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
+        Flags.setSExt();
+      if (F.getAttributes().hasAttribute(Idx, Attribute::InReg))
+        Flags.setInReg();
+      if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet))
+        Flags.setSRet();
+      if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal)) {
+        Flags.setByVal();
+        PointerType *Ty = cast<PointerType>(I->getType());
+        Type *ElementTy = Ty->getElementType();
+        Flags.setByValSize(TD->getTypeAllocSize(ElementTy));
+        // For ByVal, alignment should be passed from FE.  BE will guess if
+        // this info is not there but there are cases it cannot get right.
+        unsigned FrameAlign;
+        if (F.getParamAlignment(Idx))
+          FrameAlign = F.getParamAlignment(Idx);
+        else
+          FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+        Flags.setByValAlign(FrameAlign);
+      }
+      if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
+        Flags.setNest();
+      Flags.setOrigAlign(OriginalAlignment);
+
+      MVT RegisterVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
+      unsigned NumRegs = TLI.getNumRegisters(*CurDAG->getContext(), VT);
+      for (unsigned i = 0; i != NumRegs; ++i) {
+        ISD::InputArg MyFlags(Flags, RegisterVT, isArgValueUsed,
+                              Idx-1, i*RegisterVT.getStoreSize());
+        if (NumRegs > 1 && i == 0)
+          MyFlags.Flags.setSplit();
+        // if it isn't first piece, alignment must be 1
+        else if (i > 0)
+          MyFlags.Flags.setOrigAlign(1);
+        Ins.push_back(MyFlags);
+      }
+    }
+  }
+
+  // Call the target to set up the argument values.
+  SmallVector<SDValue, 8> InVals;
+  SDValue NewRoot = TLI.LowerFormalArguments(DAG.getRoot(), F.getCallingConv(),
+                                             F.isVarArg(), Ins,
+                                             dl, DAG, InVals);
+
+  // Verify that the target's LowerFormalArguments behaved as expected.
+  assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
+         "LowerFormalArguments didn't return a valid chain!");
+  assert(InVals.size() == Ins.size() &&
+         "LowerFormalArguments didn't emit the correct number of values!");
+  DEBUG({
+      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+        assert(InVals[i].getNode() &&
+               "LowerFormalArguments emitted a null value!");
+        assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
+               "LowerFormalArguments emitted a value with the wrong type!");
+      }
+    });
+
+  // Update the DAG with the new chain value resulting from argument lowering.
+  DAG.setRoot(NewRoot);
+
+  // Set up the argument values.
+  unsigned i = 0;
+  Idx = 1;
+  if (!FuncInfo->CanLowerReturn) {
+    // Create a virtual register for the sret pointer, and put in a copy
+    // from the sret argument into it.
+    SmallVector<EVT, 1> ValueVTs;
+    ComputeValueVTs(TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
+    MVT VT = ValueVTs[0].getSimpleVT();
+    MVT RegVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
+    ISD::NodeType AssertOp = ISD::DELETED_NODE;
+    SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1,
+                                        RegVT, VT, NULL, AssertOp);
+
+    MachineFunction& MF = SDB->DAG.getMachineFunction();
+    MachineRegisterInfo& RegInfo = MF.getRegInfo();
+    unsigned SRetReg = RegInfo.createVirtualRegister(TLI.getRegClassFor(RegVT));
+    FuncInfo->DemoteRegister = SRetReg;
+    NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurDebugLoc(),
+                                    SRetReg, ArgValue);
+    DAG.setRoot(NewRoot);
+
+    // i indexes lowered arguments.  Bump it past the hidden sret argument.
+    // Idx indexes LLVM arguments.  Don't touch it.
+    ++i;
+  }
+
+  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
+      ++I, ++Idx) {
+    SmallVector<SDValue, 4> ArgValues;
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(TLI, I->getType(), ValueVTs);
+    unsigned NumValues = ValueVTs.size();
+
+    // If this argument is unused then remember its value. It is used to generate
+    // debugging information.
+    if (I->use_empty() && NumValues)
+      SDB->setUnusedArgValue(I, InVals[i]);
+
+    for (unsigned Val = 0; Val != NumValues; ++Val) {
+      EVT VT = ValueVTs[Val];
+      MVT PartVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
+      unsigned NumParts = TLI.getNumRegisters(*CurDAG->getContext(), VT);
+
+      if (!I->use_empty()) {
+        ISD::NodeType AssertOp = ISD::DELETED_NODE;
+        if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
+          AssertOp = ISD::AssertSext;
+        else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
+          AssertOp = ISD::AssertZext;
+
+        ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i],
+                                             NumParts, PartVT, VT,
+                                             NULL, AssertOp));
+      }
+
+      i += NumParts;
+    }
+
+    // We don't need to do anything else for unused arguments.
+    if (ArgValues.empty())
+      continue;
+
+    // Note down frame index.
+    if (FrameIndexSDNode *FI =
+        dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
+      FuncInfo->setArgumentFrameIndex(I, FI->getIndex());
+
+    SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues,
+                                     SDB->getCurDebugLoc());
+
+    SDB->setValue(I, Res);
+    if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
+      if (LoadSDNode *LNode = 
+          dyn_cast<LoadSDNode>(Res.getOperand(0).getNode()))
+        if (FrameIndexSDNode *FI =
+            dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
+        FuncInfo->setArgumentFrameIndex(I, FI->getIndex());
+    }
+
+    // If this argument is live outside of the entry block, insert a copy from
+    // wherever we got it to the vreg that other BB's will reference it as.
+    if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::CopyFromReg) {
+      // If we can, though, try to skip creating an unnecessary vreg.
+      // FIXME: This isn't very clean... it would be nice to make this more
+      // general.  It's also subtly incompatible with the hacks FastISel
+      // uses with vregs.
+      unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        FuncInfo->ValueMap[I] = Reg;
+        continue;
+      }
+    }
+    if (!isOnlyUsedInEntryBlock(I, TM.Options.EnableFastISel)) {
+      FuncInfo->InitializeRegForValue(I);
+      SDB->CopyToExportRegsIfNeeded(I);
+    }
+  }
+
+  assert(i == InVals.size() && "Argument register count mismatch!");
+
+  // Finally, if the target has anything special to do, allow it to do so.
+  // FIXME: this should insert code into the DAG!
+  EmitFunctionEntryCode();
+}
+
+/// Handle PHI nodes in successor blocks.  Emit code into the SelectionDAG to
+/// ensure constants are generated when needed.  Remember the virtual registers
+/// that need to be added to the Machine PHI nodes as input.  We cannot just
+/// directly add them, because expansion might result in multiple MBB's for one
+/// BB.  As such, the start of the BB might correspond to a different MBB than
+/// the end.
+///
+void
+SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
+  const TerminatorInst *TI = LLVMBB->getTerminator();
+
+  SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
+
+  // Check successor nodes' PHI nodes that expect a constant to be available
+  // from this block.
+  for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
+    const BasicBlock *SuccBB = TI->getSuccessor(succ);
+    if (!isa<PHINode>(SuccBB->begin())) continue;
+    MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
+
+    // If this terminator has multiple identical successors (common for
+    // switches), only handle each succ once.
+    if (!SuccsHandled.insert(SuccMBB)) continue;
+
+    MachineBasicBlock::iterator MBBI = SuccMBB->begin();
+
+    // At this point we know that there is a 1-1 correspondence between LLVM PHI
+    // nodes and Machine PHI nodes, but the incoming operands have not been
+    // emitted yet.
+    for (BasicBlock::const_iterator I = SuccBB->begin();
+         const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+      // Ignore dead phi's.
+      if (PN->use_empty()) continue;
+
+      // Skip empty types
+      if (PN->getType()->isEmptyTy())
+        continue;
+
+      unsigned Reg;
+      const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
+
+      if (const Constant *C = dyn_cast<Constant>(PHIOp)) {
+        unsigned &RegOut = ConstantsOut[C];
+        if (RegOut == 0) {
+          RegOut = FuncInfo.CreateRegs(C->getType());
+          CopyValueToVirtualRegister(C, RegOut);
+        }
+        Reg = RegOut;
+      } else {
+        DenseMap<const Value *, unsigned>::iterator I =
+          FuncInfo.ValueMap.find(PHIOp);
+        if (I != FuncInfo.ValueMap.end())
+          Reg = I->second;
+        else {
+          assert(isa<AllocaInst>(PHIOp) &&
+                 FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
+                 "Didn't codegen value into a register!??");
+          Reg = FuncInfo.CreateRegs(PHIOp->getType());
+          CopyValueToVirtualRegister(PHIOp, Reg);
+        }
+      }
+
+      // Remember that this register needs to added to the machine PHI node as
+      // the input for this MBB.
+      SmallVector<EVT, 4> ValueVTs;
+      ComputeValueVTs(TLI, PN->getType(), ValueVTs);
+      for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
+        EVT VT = ValueVTs[vti];
+        unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
+        for (unsigned i = 0, e = NumRegisters; i != e; ++i)
+          FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg+i));
+        Reg += NumRegisters;
+      }
+    }
+  }
+  ConstantsOut.clear();
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
new file mode 100644
index 000000000000..9188945bd906
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -0,0 +1,559 @@
+//===-- SelectionDAGBuilder.h - Selection-DAG building --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SELECTIONDAGBUILDER_H
+#define SELECTIONDAGBUILDER_H
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <vector>
+
+namespace llvm {
+
+class AliasAnalysis;
+class AllocaInst;
+class BasicBlock;
+class BitCastInst;
+class BranchInst;
+class CallInst;
+class DbgValueInst;
+class ExtractElementInst;
+class ExtractValueInst;
+class FCmpInst;
+class FPExtInst;
+class FPToSIInst;
+class FPToUIInst;
+class FPTruncInst;
+class Function;
+class FunctionLoweringInfo;
+class GetElementPtrInst;
+class GCFunctionInfo;
+class ICmpInst;
+class IntToPtrInst;
+class IndirectBrInst;
+class InvokeInst;
+class InsertElementInst;
+class InsertValueInst;
+class Instruction;
+class LoadInst;
+class MachineBasicBlock;
+class MachineInstr;
+class MachineRegisterInfo;
+class MDNode;
+class PHINode;
+class PtrToIntInst;
+class ReturnInst;
+class SDDbgValue;
+class SExtInst;
+class SelectInst;
+class ShuffleVectorInst;
+class SIToFPInst;
+class StoreInst;
+class SwitchInst;
+class DataLayout;
+class TargetLibraryInfo;
+class TargetLowering;
+class TruncInst;
+class UIToFPInst;
+class UnreachableInst;
+class VAArgInst;
+class ZExtInst;
+
+//===----------------------------------------------------------------------===//
+/// SelectionDAGBuilder - This is the common target-independent lowering
+/// implementation that is parameterized by a TargetLowering object.
+///
+class SelectionDAGBuilder {
+  /// CurDebugLoc - current file + line number.  Changes as we build the DAG.
+  DebugLoc CurDebugLoc;
+
+  DenseMap<const Value*, SDValue> NodeMap;
+  
+  /// UnusedArgNodeMap - Maps argument value for unused arguments. This is used
+  /// to preserve debug information for incoming arguments.
+  DenseMap<const Value*, SDValue> UnusedArgNodeMap;
+
+  /// DanglingDebugInfo - Helper type for DanglingDebugInfoMap.
+  class DanglingDebugInfo {
+    const DbgValueInst* DI;
+    DebugLoc dl;
+    unsigned SDNodeOrder;
+  public:
+    DanglingDebugInfo() : DI(0), dl(DebugLoc()), SDNodeOrder(0) { }
+    DanglingDebugInfo(const DbgValueInst *di, DebugLoc DL, unsigned SDNO) :
+      DI(di), dl(DL), SDNodeOrder(SDNO) { }
+    const DbgValueInst* getDI() { return DI; }
+    DebugLoc getdl() { return dl; }
+    unsigned getSDNodeOrder() { return SDNodeOrder; }
+  };
+
+  /// DanglingDebugInfoMap - Keeps track of dbg_values for which we have not
+  /// yet seen the referent.  We defer handling these until we do see it.
+  DenseMap<const Value*, DanglingDebugInfo> DanglingDebugInfoMap;
+
+public:
+  /// PendingLoads - Loads are not emitted to the program immediately.  We bunch
+  /// them up and then emit token factor nodes when possible.  This allows us to
+  /// get simple disambiguation between loads without worrying about alias
+  /// analysis.
+  SmallVector<SDValue, 8> PendingLoads;
+private:
+
+  /// PendingExports - CopyToReg nodes that copy values to virtual registers
+  /// for export to other blocks need to be emitted before any terminator
+  /// instruction, but they have no other ordering requirements. We bunch them
+  /// up and the emit a single tokenfactor for them just before terminator
+  /// instructions.
+  SmallVector<SDValue, 8> PendingExports;
+
+  /// SDNodeOrder - A unique monotonically increasing number used to order the
+  /// SDNodes we create.
+  unsigned SDNodeOrder;
+
+  /// Case - A struct to record the Value for a switch case, and the
+  /// case's target basic block.
+  struct Case {
+    const Constant *Low;
+    const Constant *High;
+    MachineBasicBlock* BB;
+    uint32_t ExtraWeight;
+
+    Case() : Low(0), High(0), BB(0), ExtraWeight(0) { }
+    Case(const Constant *low, const Constant *high, MachineBasicBlock *bb,
+         uint32_t extraweight) : Low(low), High(high), BB(bb),
+         ExtraWeight(extraweight) { }
+
+    APInt size() const {
+      const APInt &rHigh = cast<ConstantInt>(High)->getValue();
+      const APInt &rLow  = cast<ConstantInt>(Low)->getValue();
+      return (rHigh - rLow + 1ULL);
+    }
+  };
+
+  struct CaseBits {
+    uint64_t Mask;
+    MachineBasicBlock* BB;
+    unsigned Bits;
+    uint32_t ExtraWeight;
+
+    CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits,
+             uint32_t Weight):
+      Mask(mask), BB(bb), Bits(bits), ExtraWeight(Weight) { }
+  };
+
+  typedef std::vector<Case>           CaseVector;
+  typedef std::vector<CaseBits>       CaseBitsVector;
+  typedef CaseVector::iterator        CaseItr;
+  typedef std::pair<CaseItr, CaseItr> CaseRange;
+
+  /// CaseRec - A struct with ctor used in lowering switches to a binary tree
+  /// of conditional branches.
+  struct CaseRec {
+    CaseRec(MachineBasicBlock *bb, const Constant *lt, const Constant *ge,
+            CaseRange r) :
+    CaseBB(bb), LT(lt), GE(ge), Range(r) {}
+
+    /// CaseBB - The MBB in which to emit the compare and branch
+    MachineBasicBlock *CaseBB;
+    /// LT, GE - If nonzero, we know the current case value must be less-than or
+    /// greater-than-or-equal-to these Constants.
+    const Constant *LT;
+    const Constant *GE;
+    /// Range - A pair of iterators representing the range of case values to be
+    /// processed at this point in the binary search tree.
+    CaseRange Range;
+  };
+
+  typedef std::vector<CaseRec> CaseRecVector;
+
+  struct CaseBitsCmp {
+    bool operator()(const CaseBits &C1, const CaseBits &C2) {
+      return C1.Bits > C2.Bits;
+    }
+  };
+
+  size_t Clusterify(CaseVector &Cases, const SwitchInst &SI);
+
+  /// CaseBlock - This structure is used to communicate between
+  /// SelectionDAGBuilder and SDISel for the code generation of additional basic
+  /// blocks needed by multi-case switch statements.
+  struct CaseBlock {
+    CaseBlock(ISD::CondCode cc, const Value *cmplhs, const Value *cmprhs,
+              const Value *cmpmiddle,
+              MachineBasicBlock *truebb, MachineBasicBlock *falsebb,
+              MachineBasicBlock *me,
+              uint32_t trueweight = 0, uint32_t falseweight = 0)
+      : CC(cc), CmpLHS(cmplhs), CmpMHS(cmpmiddle), CmpRHS(cmprhs),
+        TrueBB(truebb), FalseBB(falsebb), ThisBB(me),
+        TrueWeight(trueweight), FalseWeight(falseweight) { }
+
+    // CC - the condition code to use for the case block's setcc node
+    ISD::CondCode CC;
+
+    // CmpLHS/CmpRHS/CmpMHS - The LHS/MHS/RHS of the comparison to emit.
+    // Emit by default LHS op RHS. MHS is used for range comparisons:
+    // If MHS is not null: (LHS <= MHS) and (MHS <= RHS).
+    const Value *CmpLHS, *CmpMHS, *CmpRHS;
+
+    // TrueBB/FalseBB - the block to branch to if the setcc is true/false.
+    MachineBasicBlock *TrueBB, *FalseBB;
+
+    // ThisBB - the block into which to emit the code for the setcc and branches
+    MachineBasicBlock *ThisBB;
+
+    // TrueWeight/FalseWeight - branch weights.
+    uint32_t TrueWeight, FalseWeight;
+  };
+
+  struct JumpTable {
+    JumpTable(unsigned R, unsigned J, MachineBasicBlock *M,
+              MachineBasicBlock *D): Reg(R), JTI(J), MBB(M), Default(D) {}
+  
+    /// Reg - the virtual register containing the index of the jump table entry
+    //. to jump to.
+    unsigned Reg;
+    /// JTI - the JumpTableIndex for this jump table in the function.
+    unsigned JTI;
+    /// MBB - the MBB into which to emit the code for the indirect jump.
+    MachineBasicBlock *MBB;
+    /// Default - the MBB of the default bb, which is a successor of the range
+    /// check MBB.  This is when updating PHI nodes in successors.
+    MachineBasicBlock *Default;
+  };
+  struct JumpTableHeader {
+    JumpTableHeader(APInt F, APInt L, const Value *SV, MachineBasicBlock *H,
+                    bool E = false):
+      First(F), Last(L), SValue(SV), HeaderBB(H), Emitted(E) {}
+    APInt First;
+    APInt Last;
+    const Value *SValue;
+    MachineBasicBlock *HeaderBB;
+    bool Emitted;
+  };
+  typedef std::pair<JumpTableHeader, JumpTable> JumpTableBlock;
+
+  struct BitTestCase {
+    BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr,
+                uint32_t Weight):
+      Mask(M), ThisBB(T), TargetBB(Tr), ExtraWeight(Weight) { }
+    uint64_t Mask;
+    MachineBasicBlock *ThisBB;
+    MachineBasicBlock *TargetBB;
+    uint32_t ExtraWeight;
+  };
+
+  typedef SmallVector<BitTestCase, 3> BitTestInfo;
+
+  struct BitTestBlock {
+    BitTestBlock(APInt F, APInt R, const Value* SV,
+                 unsigned Rg, MVT RgVT, bool E,
+                 MachineBasicBlock* P, MachineBasicBlock* D,
+                 const BitTestInfo& C):
+      First(F), Range(R), SValue(SV), Reg(Rg), RegVT(RgVT), Emitted(E),
+      Parent(P), Default(D), Cases(C) { }
+    APInt First;
+    APInt Range;
+    const Value *SValue;
+    unsigned Reg;
+    MVT RegVT;
+    bool Emitted;
+    MachineBasicBlock *Parent;
+    MachineBasicBlock *Default;
+    BitTestInfo Cases;
+  };
+
+public:
+  // TLI - This is information that describes the available target features we
+  // need for lowering.  This indicates when operations are unavailable,
+  // implemented with a libcall, etc.
+  const TargetMachine &TM;
+  const TargetLowering &TLI;
+  SelectionDAG &DAG;
+  const DataLayout *TD;
+  AliasAnalysis *AA;
+  const TargetLibraryInfo *LibInfo;
+
+  /// SwitchCases - Vector of CaseBlock structures used to communicate
+  /// SwitchInst code generation information.
+  std::vector<CaseBlock> SwitchCases;
+  /// JTCases - Vector of JumpTable structures used to communicate
+  /// SwitchInst code generation information.
+  std::vector<JumpTableBlock> JTCases;
+  /// BitTestCases - Vector of BitTestBlock structures used to communicate
+  /// SwitchInst code generation information.
+  std::vector<BitTestBlock> BitTestCases;
+
+  // Emit PHI-node-operand constants only once even if used by multiple
+  // PHI nodes.
+  DenseMap<const Constant *, unsigned> ConstantsOut;
+
+  /// FuncInfo - Information about the function as a whole.
+  ///
+  FunctionLoweringInfo &FuncInfo;
+
+  /// OptLevel - What optimization level we're generating code for.
+  /// 
+  CodeGenOpt::Level OptLevel;
+  
+  /// GFI - Garbage collection metadata for the function.
+  GCFunctionInfo *GFI;
+
+  /// LPadToCallSiteMap - Map a landing pad to the call site indexes.
+  DenseMap<MachineBasicBlock*, SmallVector<unsigned, 4> > LPadToCallSiteMap;
+
+  /// HasTailCall - This is set to true if a call in the current
+  /// block has been translated as a tail call. In this case,
+  /// no subsequent DAG nodes should be created.
+  ///
+  bool HasTailCall;
+
+  LLVMContext *Context;
+
+  SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
+                      CodeGenOpt::Level ol)
+    : SDNodeOrder(0), TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
+      DAG(dag), FuncInfo(funcinfo), OptLevel(ol),
+      HasTailCall(false) {
+  }
+
+  void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+            const TargetLibraryInfo *li);
+
+  /// clear - Clear out the current SelectionDAG and the associated
+  /// state and prepare this SelectionDAGBuilder object to be used
+  /// for a new block. This doesn't clear out information about
+  /// additional blocks that are needed to complete switch lowering
+  /// or PHI node updating; that information is cleared out as it is
+  /// consumed.
+  void clear();
+
+  /// clearDanglingDebugInfo - Clear the dangling debug information
+  /// map. This function is separated from the clear so that debug
+  /// information that is dangling in a basic block can be properly
+  /// resolved in a different basic block. This allows the
+  /// SelectionDAG to resolve dangling debug information attached
+  /// to PHI nodes.
+  void clearDanglingDebugInfo();
+
+  /// getRoot - Return the current virtual root of the Selection DAG,
+  /// flushing any PendingLoad items. This must be done before emitting
+  /// a store or any other node that may need to be ordered after any
+  /// prior load instructions.
+  ///
+  SDValue getRoot();
+
+  /// getControlRoot - Similar to getRoot, but instead of flushing all the
+  /// PendingLoad items, flush all the PendingExports items. It is necessary
+  /// to do this before emitting a terminator instruction.
+  ///
+  SDValue getControlRoot();
+
+  DebugLoc getCurDebugLoc() const { return CurDebugLoc; }
+
+  unsigned getSDNodeOrder() const { return SDNodeOrder; }
+
+  void CopyValueToVirtualRegister(const Value *V, unsigned Reg);
+
+  /// AssignOrderingToNode - Assign an ordering to the node. The order is gotten
+  /// from how the code appeared in the source. The ordering is used by the
+  /// scheduler to effectively turn off scheduling.
+  void AssignOrderingToNode(const SDNode *Node);
+
+  void visit(const Instruction &I);
+
+  void visit(unsigned Opcode, const User &I);
+
+  // resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
+  // generate the debug data structures now that we've seen its definition.
+  void resolveDanglingDebugInfo(const Value *V, SDValue Val);
+  SDValue getValue(const Value *V);
+  SDValue getNonRegisterValue(const Value *V);
+  SDValue getValueImpl(const Value *V);
+
+  void setValue(const Value *V, SDValue NewN) {
+    SDValue &N = NodeMap[V];
+    assert(N.getNode() == 0 && "Already set a value for this node!");
+    N = NewN;
+  }
+  
+  void setUnusedArgValue(const Value *V, SDValue NewN) {
+    SDValue &N = UnusedArgNodeMap[V];
+    assert(N.getNode() == 0 && "Already set a value for this node!");
+    N = NewN;
+  }
+
+  void FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB,
+                            MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
+                            MachineBasicBlock *SwitchBB, unsigned Opc);
+  void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB,
+                                    MachineBasicBlock *FBB,
+                                    MachineBasicBlock *CurBB,
+                                    MachineBasicBlock *SwitchBB);
+  bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases);
+  bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB);
+  void CopyToExportRegsIfNeeded(const Value *V);
+  void ExportFromCurrentBlock(const Value *V);
+  void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall,
+                   MachineBasicBlock *LandingPad = NULL);
+
+  /// UpdateSplitBlock - When an MBB was split during scheduling, update the
+  /// references that ned to refer to the last resulting block.
+  void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
+
+private:
+  // Terminator instructions.
+  void visitRet(const ReturnInst &I);
+  void visitBr(const BranchInst &I);
+  void visitSwitch(const SwitchInst &I);
+  void visitIndirectBr(const IndirectBrInst &I);
+  void visitUnreachable(const UnreachableInst &I) { /* noop */ }
+
+  // Helpers for visitSwitch
+  bool handleSmallSwitchRange(CaseRec& CR,
+                              CaseRecVector& WorkList,
+                              const Value* SV,
+                              MachineBasicBlock* Default,
+                              MachineBasicBlock *SwitchBB);
+  bool handleJTSwitchCase(CaseRec& CR,
+                          CaseRecVector& WorkList,
+                          const Value* SV,
+                          MachineBasicBlock* Default,
+                          MachineBasicBlock *SwitchBB);
+  bool handleBTSplitSwitchCase(CaseRec& CR,
+                               CaseRecVector& WorkList,
+                               const Value* SV,
+                               MachineBasicBlock* Default,
+                               MachineBasicBlock *SwitchBB);
+  bool handleBitTestsSwitchCase(CaseRec& CR,
+                                CaseRecVector& WorkList,
+                                const Value* SV,
+                                MachineBasicBlock* Default,
+                                MachineBasicBlock *SwitchBB);
+
+  uint32_t getEdgeWeight(const MachineBasicBlock *Src,
+                         const MachineBasicBlock *Dst) const;
+  void addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst,
+                              uint32_t Weight = 0);
+public:
+  void visitSwitchCase(CaseBlock &CB,
+                       MachineBasicBlock *SwitchBB);
+  void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB);
+  void visitBitTestCase(BitTestBlock &BB,
+                        MachineBasicBlock* NextMBB,
+                        uint32_t BranchWeightToNext,
+                        unsigned Reg,
+                        BitTestCase &B,
+                        MachineBasicBlock *SwitchBB);
+  void visitJumpTable(JumpTable &JT);
+  void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
+                            MachineBasicBlock *SwitchBB);
+  
+private:
+  // These all get lowered before this pass.
+  void visitInvoke(const InvokeInst &I);
+  void visitResume(const ResumeInst &I);
+
+  void visitBinary(const User &I, unsigned OpCode);
+  void visitShift(const User &I, unsigned Opcode);
+  void visitAdd(const User &I)  { visitBinary(I, ISD::ADD); }
+  void visitFAdd(const User &I) { visitBinary(I, ISD::FADD); }
+  void visitSub(const User &I)  { visitBinary(I, ISD::SUB); }
+  void visitFSub(const User &I);
+  void visitMul(const User &I)  { visitBinary(I, ISD::MUL); }
+  void visitFMul(const User &I) { visitBinary(I, ISD::FMUL); }
+  void visitURem(const User &I) { visitBinary(I, ISD::UREM); }
+  void visitSRem(const User &I) { visitBinary(I, ISD::SREM); }
+  void visitFRem(const User &I) { visitBinary(I, ISD::FREM); }
+  void visitUDiv(const User &I) { visitBinary(I, ISD::UDIV); }
+  void visitSDiv(const User &I);
+  void visitFDiv(const User &I) { visitBinary(I, ISD::FDIV); }
+  void visitAnd (const User &I) { visitBinary(I, ISD::AND); }
+  void visitOr  (const User &I) { visitBinary(I, ISD::OR); }
+  void visitXor (const User &I) { visitBinary(I, ISD::XOR); }
+  void visitShl (const User &I) { visitShift(I, ISD::SHL); }
+  void visitLShr(const User &I) { visitShift(I, ISD::SRL); }
+  void visitAShr(const User &I) { visitShift(I, ISD::SRA); }
+  void visitICmp(const User &I);
+  void visitFCmp(const User &I);
+  // Visit the conversion instructions
+  void visitTrunc(const User &I);
+  void visitZExt(const User &I);
+  void visitSExt(const User &I);
+  void visitFPTrunc(const User &I);
+  void visitFPExt(const User &I);
+  void visitFPToUI(const User &I);
+  void visitFPToSI(const User &I);
+  void visitUIToFP(const User &I);
+  void visitSIToFP(const User &I);
+  void visitPtrToInt(const User &I);
+  void visitIntToPtr(const User &I);
+  void visitBitCast(const User &I);
+
+  void visitExtractElement(const User &I);
+  void visitInsertElement(const User &I);
+  void visitShuffleVector(const User &I);
+
+  void visitExtractValue(const ExtractValueInst &I);
+  void visitInsertValue(const InsertValueInst &I);
+  void visitLandingPad(const LandingPadInst &I);
+
+  void visitGetElementPtr(const User &I);
+  void visitSelect(const User &I);
+
+  void visitAlloca(const AllocaInst &I);
+  void visitLoad(const LoadInst &I);
+  void visitStore(const StoreInst &I);
+  void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
+  void visitAtomicRMW(const AtomicRMWInst &I);
+  void visitFence(const FenceInst &I);
+  void visitPHI(const PHINode &I);
+  void visitCall(const CallInst &I);
+  bool visitMemCmpCall(const CallInst &I);
+  bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode);
+  void visitAtomicLoad(const LoadInst &I);
+  void visitAtomicStore(const StoreInst &I);
+
+  void visitInlineAsm(ImmutableCallSite CS);
+  const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
+  void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
+
+  void visitVAStart(const CallInst &I);
+  void visitVAArg(const VAArgInst &I);
+  void visitVAEnd(const CallInst &I);
+  void visitVACopy(const CallInst &I);
+
+  void visitUserOp1(const Instruction &I) {
+    llvm_unreachable("UserOp1 should not exist at instruction selection time!");
+  }
+  void visitUserOp2(const Instruction &I) {
+    llvm_unreachable("UserOp2 should not exist at instruction selection time!");
+  }
+
+  void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
+
+  /// EmitFuncArgumentDbgValue - If V is an function argument then create
+  /// corresponding DBG_VALUE machine instruction for it now. At the end of 
+  /// instruction selection, they will be inserted to the entry BB.
+  bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
+                                int64_t Offset, const SDValue &N);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
new file mode 100644
index 000000000000..3b5823bfb277
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -0,0 +1,645 @@
+//===-- SelectionDAGDumper.cpp - Implement SelectionDAG::dump() -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG::dump method and friends.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+std::string SDNode::getOperationName(const SelectionDAG *G) const {
+  switch (getOpcode()) {
+  default:
+    if (getOpcode() < ISD::BUILTIN_OP_END)
+      return "<<Unknown DAG Node>>";
+    if (isMachineOpcode()) {
+      if (G)
+        if (const TargetInstrInfo *TII = G->getTarget().getInstrInfo())
+          if (getMachineOpcode() < TII->getNumOpcodes())
+            return TII->getName(getMachineOpcode());
+      return "<<Unknown Machine Node #" + utostr(getOpcode()) + ">>";
+    }
+    if (G) {
+      const TargetLowering &TLI = G->getTargetLoweringInfo();
+      const char *Name = TLI.getTargetNodeName(getOpcode());
+      if (Name) return Name;
+      return "<<Unknown Target Node #" + utostr(getOpcode()) + ">>";
+    }
+    return "<<Unknown Node #" + utostr(getOpcode()) + ">>";
+
+#ifndef NDEBUG
+  case ISD::DELETED_NODE:               return "<<Deleted Node!>>";
+#endif
+  case ISD::PREFETCH:                   return "Prefetch";
+  case ISD::MEMBARRIER:                 return "MemBarrier";
+  case ISD::ATOMIC_FENCE:               return "AtomicFence";
+  case ISD::ATOMIC_CMP_SWAP:            return "AtomicCmpSwap";
+  case ISD::ATOMIC_SWAP:                return "AtomicSwap";
+  case ISD::ATOMIC_LOAD_ADD:            return "AtomicLoadAdd";
+  case ISD::ATOMIC_LOAD_SUB:            return "AtomicLoadSub";
+  case ISD::ATOMIC_LOAD_AND:            return "AtomicLoadAnd";
+  case ISD::ATOMIC_LOAD_OR:             return "AtomicLoadOr";
+  case ISD::ATOMIC_LOAD_XOR:            return "AtomicLoadXor";
+  case ISD::ATOMIC_LOAD_NAND:           return "AtomicLoadNand";
+  case ISD::ATOMIC_LOAD_MIN:            return "AtomicLoadMin";
+  case ISD::ATOMIC_LOAD_MAX:            return "AtomicLoadMax";
+  case ISD::ATOMIC_LOAD_UMIN:           return "AtomicLoadUMin";
+  case ISD::ATOMIC_LOAD_UMAX:           return "AtomicLoadUMax";
+  case ISD::ATOMIC_LOAD:                return "AtomicLoad";
+  case ISD::ATOMIC_STORE:               return "AtomicStore";
+  case ISD::PCMARKER:                   return "PCMarker";
+  case ISD::READCYCLECOUNTER:           return "ReadCycleCounter";
+  case ISD::SRCVALUE:                   return "SrcValue";
+  case ISD::MDNODE_SDNODE:              return "MDNode";
+  case ISD::EntryToken:                 return "EntryToken";
+  case ISD::TokenFactor:                return "TokenFactor";
+  case ISD::AssertSext:                 return "AssertSext";
+  case ISD::AssertZext:                 return "AssertZext";
+
+  case ISD::BasicBlock:                 return "BasicBlock";
+  case ISD::VALUETYPE:                  return "ValueType";
+  case ISD::Register:                   return "Register";
+  case ISD::RegisterMask:               return "RegisterMask";
+  case ISD::Constant:                   return "Constant";
+  case ISD::ConstantFP:                 return "ConstantFP";
+  case ISD::GlobalAddress:              return "GlobalAddress";
+  case ISD::GlobalTLSAddress:           return "GlobalTLSAddress";
+  case ISD::FrameIndex:                 return "FrameIndex";
+  case ISD::JumpTable:                  return "JumpTable";
+  case ISD::GLOBAL_OFFSET_TABLE:        return "GLOBAL_OFFSET_TABLE";
+  case ISD::RETURNADDR:                 return "RETURNADDR";
+  case ISD::FRAMEADDR:                  return "FRAMEADDR";
+  case ISD::FRAME_TO_ARGS_OFFSET:       return "FRAME_TO_ARGS_OFFSET";
+  case ISD::EXCEPTIONADDR:              return "EXCEPTIONADDR";
+  case ISD::LSDAADDR:                   return "LSDAADDR";
+  case ISD::EHSELECTION:                return "EHSELECTION";
+  case ISD::EH_RETURN:                  return "EH_RETURN";
+  case ISD::EH_SJLJ_SETJMP:             return "EH_SJLJ_SETJMP";
+  case ISD::EH_SJLJ_LONGJMP:            return "EH_SJLJ_LONGJMP";
+  case ISD::ConstantPool:               return "ConstantPool";
+  case ISD::TargetIndex:                return "TargetIndex";
+  case ISD::ExternalSymbol:             return "ExternalSymbol";
+  case ISD::BlockAddress:               return "BlockAddress";
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned OpNo = getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 0 : 1;
+    unsigned IID = cast<ConstantSDNode>(getOperand(OpNo))->getZExtValue();
+    if (IID < Intrinsic::num_intrinsics)
+      return Intrinsic::getName((Intrinsic::ID)IID);
+    else if (const TargetIntrinsicInfo *TII = G->getTarget().getIntrinsicInfo())
+      return TII->getName(IID);
+    llvm_unreachable("Invalid intrinsic ID");
+  }
+
+  case ISD::BUILD_VECTOR:               return "BUILD_VECTOR";
+  case ISD::TargetConstant:             return "TargetConstant";
+  case ISD::TargetConstantFP:           return "TargetConstantFP";
+  case ISD::TargetGlobalAddress:        return "TargetGlobalAddress";
+  case ISD::TargetGlobalTLSAddress:     return "TargetGlobalTLSAddress";
+  case ISD::TargetFrameIndex:           return "TargetFrameIndex";
+  case ISD::TargetJumpTable:            return "TargetJumpTable";
+  case ISD::TargetConstantPool:         return "TargetConstantPool";
+  case ISD::TargetExternalSymbol:       return "TargetExternalSymbol";
+  case ISD::TargetBlockAddress:         return "TargetBlockAddress";
+
+  case ISD::CopyToReg:                  return "CopyToReg";
+  case ISD::CopyFromReg:                return "CopyFromReg";
+  case ISD::UNDEF:                      return "undef";
+  case ISD::MERGE_VALUES:               return "merge_values";
+  case ISD::INLINEASM:                  return "inlineasm";
+  case ISD::EH_LABEL:                   return "eh_label";
+  case ISD::HANDLENODE:                 return "handlenode";
+
+  // Unary operators
+  case ISD::FABS:                       return "fabs";
+  case ISD::FNEG:                       return "fneg";
+  case ISD::FSQRT:                      return "fsqrt";
+  case ISD::FSIN:                       return "fsin";
+  case ISD::FCOS:                       return "fcos";
+  case ISD::FSINCOS:                    return "fsincos";
+  case ISD::FTRUNC:                     return "ftrunc";
+  case ISD::FFLOOR:                     return "ffloor";
+  case ISD::FCEIL:                      return "fceil";
+  case ISD::FRINT:                      return "frint";
+  case ISD::FNEARBYINT:                 return "fnearbyint";
+  case ISD::FEXP:                       return "fexp";
+  case ISD::FEXP2:                      return "fexp2";
+  case ISD::FLOG:                       return "flog";
+  case ISD::FLOG2:                      return "flog2";
+  case ISD::FLOG10:                     return "flog10";
+
+  // Binary operators
+  case ISD::ADD:                        return "add";
+  case ISD::SUB:                        return "sub";
+  case ISD::MUL:                        return "mul";
+  case ISD::MULHU:                      return "mulhu";
+  case ISD::MULHS:                      return "mulhs";
+  case ISD::SDIV:                       return "sdiv";
+  case ISD::UDIV:                       return "udiv";
+  case ISD::SREM:                       return "srem";
+  case ISD::UREM:                       return "urem";
+  case ISD::SMUL_LOHI:                  return "smul_lohi";
+  case ISD::UMUL_LOHI:                  return "umul_lohi";
+  case ISD::SDIVREM:                    return "sdivrem";
+  case ISD::UDIVREM:                    return "udivrem";
+  case ISD::AND:                        return "and";
+  case ISD::OR:                         return "or";
+  case ISD::XOR:                        return "xor";
+  case ISD::SHL:                        return "shl";
+  case ISD::SRA:                        return "sra";
+  case ISD::SRL:                        return "srl";
+  case ISD::ROTL:                       return "rotl";
+  case ISD::ROTR:                       return "rotr";
+  case ISD::FADD:                       return "fadd";
+  case ISD::FSUB:                       return "fsub";
+  case ISD::FMUL:                       return "fmul";
+  case ISD::FDIV:                       return "fdiv";
+  case ISD::FMA:                        return "fma";
+  case ISD::FREM:                       return "frem";
+  case ISD::FCOPYSIGN:                  return "fcopysign";
+  case ISD::FGETSIGN:                   return "fgetsign";
+  case ISD::FPOW:                       return "fpow";
+
+  case ISD::FPOWI:                      return "fpowi";
+  case ISD::SETCC:                      return "setcc";
+  case ISD::SELECT:                     return "select";
+  case ISD::VSELECT:                    return "vselect";
+  case ISD::SELECT_CC:                  return "select_cc";
+  case ISD::INSERT_VECTOR_ELT:          return "insert_vector_elt";
+  case ISD::EXTRACT_VECTOR_ELT:         return "extract_vector_elt";
+  case ISD::CONCAT_VECTORS:             return "concat_vectors";
+  case ISD::INSERT_SUBVECTOR:           return "insert_subvector";
+  case ISD::EXTRACT_SUBVECTOR:          return "extract_subvector";
+  case ISD::SCALAR_TO_VECTOR:           return "scalar_to_vector";
+  case ISD::VECTOR_SHUFFLE:             return "vector_shuffle";
+  case ISD::CARRY_FALSE:                return "carry_false";
+  case ISD::ADDC:                       return "addc";
+  case ISD::ADDE:                       return "adde";
+  case ISD::SADDO:                      return "saddo";
+  case ISD::UADDO:                      return "uaddo";
+  case ISD::SSUBO:                      return "ssubo";
+  case ISD::USUBO:                      return "usubo";
+  case ISD::SMULO:                      return "smulo";
+  case ISD::UMULO:                      return "umulo";
+  case ISD::SUBC:                       return "subc";
+  case ISD::SUBE:                       return "sube";
+  case ISD::SHL_PARTS:                  return "shl_parts";
+  case ISD::SRA_PARTS:                  return "sra_parts";
+  case ISD::SRL_PARTS:                  return "srl_parts";
+
+  // Conversion operators.
+  case ISD::SIGN_EXTEND:                return "sign_extend";
+  case ISD::ZERO_EXTEND:                return "zero_extend";
+  case ISD::ANY_EXTEND:                 return "any_extend";
+  case ISD::SIGN_EXTEND_INREG:          return "sign_extend_inreg";
+  case ISD::TRUNCATE:                   return "truncate";
+  case ISD::FP_ROUND:                   return "fp_round";
+  case ISD::FLT_ROUNDS_:                return "flt_rounds";
+  case ISD::FP_ROUND_INREG:             return "fp_round_inreg";
+  case ISD::FP_EXTEND:                  return "fp_extend";
+
+  case ISD::SINT_TO_FP:                 return "sint_to_fp";
+  case ISD::UINT_TO_FP:                 return "uint_to_fp";
+  case ISD::FP_TO_SINT:                 return "fp_to_sint";
+  case ISD::FP_TO_UINT:                 return "fp_to_uint";
+  case ISD::BITCAST:                    return "bitcast";
+  case ISD::FP16_TO_FP32:               return "fp16_to_fp32";
+  case ISD::FP32_TO_FP16:               return "fp32_to_fp16";
+
+  case ISD::CONVERT_RNDSAT: {
+    switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) {
+    default: llvm_unreachable("Unknown cvt code!");
+    case ISD::CVT_FF:                   return "cvt_ff";
+    case ISD::CVT_FS:                   return "cvt_fs";
+    case ISD::CVT_FU:                   return "cvt_fu";
+    case ISD::CVT_SF:                   return "cvt_sf";
+    case ISD::CVT_UF:                   return "cvt_uf";
+    case ISD::CVT_SS:                   return "cvt_ss";
+    case ISD::CVT_SU:                   return "cvt_su";
+    case ISD::CVT_US:                   return "cvt_us";
+    case ISD::CVT_UU:                   return "cvt_uu";
+    }
+  }
+
+    // Control flow instructions
+  case ISD::BR:                         return "br";
+  case ISD::BRIND:                      return "brind";
+  case ISD::BR_JT:                      return "br_jt";
+  case ISD::BRCOND:                     return "brcond";
+  case ISD::BR_CC:                      return "br_cc";
+  case ISD::CALLSEQ_START:              return "callseq_start";
+  case ISD::CALLSEQ_END:                return "callseq_end";
+
+    // Other operators
+  case ISD::LOAD:                       return "load";
+  case ISD::STORE:                      return "store";
+  case ISD::VAARG:                      return "vaarg";
+  case ISD::VACOPY:                     return "vacopy";
+  case ISD::VAEND:                      return "vaend";
+  case ISD::VASTART:                    return "vastart";
+  case ISD::DYNAMIC_STACKALLOC:         return "dynamic_stackalloc";
+  case ISD::EXTRACT_ELEMENT:            return "extract_element";
+  case ISD::BUILD_PAIR:                 return "build_pair";
+  case ISD::STACKSAVE:                  return "stacksave";
+  case ISD::STACKRESTORE:               return "stackrestore";
+  case ISD::TRAP:                       return "trap";
+  case ISD::DEBUGTRAP:                  return "debugtrap";
+  case ISD::LIFETIME_START:             return "lifetime.start";
+  case ISD::LIFETIME_END:               return "lifetime.end";
+
+  // Bit manipulation
+  case ISD::BSWAP:                      return "bswap";
+  case ISD::CTPOP:                      return "ctpop";
+  case ISD::CTTZ:                       return "cttz";
+  case ISD::CTTZ_ZERO_UNDEF:            return "cttz_zero_undef";
+  case ISD::CTLZ:                       return "ctlz";
+  case ISD::CTLZ_ZERO_UNDEF:            return "ctlz_zero_undef";
+
+  // Trampolines
+  case ISD::INIT_TRAMPOLINE:            return "init_trampoline";
+  case ISD::ADJUST_TRAMPOLINE:          return "adjust_trampoline";
+
+  case ISD::CONDCODE:
+    switch (cast<CondCodeSDNode>(this)->get()) {
+    default: llvm_unreachable("Unknown setcc condition!");
+    case ISD::SETOEQ:                   return "setoeq";
+    case ISD::SETOGT:                   return "setogt";
+    case ISD::SETOGE:                   return "setoge";
+    case ISD::SETOLT:                   return "setolt";
+    case ISD::SETOLE:                   return "setole";
+    case ISD::SETONE:                   return "setone";
+
+    case ISD::SETO:                     return "seto";
+    case ISD::SETUO:                    return "setuo";
+    case ISD::SETUEQ:                   return "setue";
+    case ISD::SETUGT:                   return "setugt";
+    case ISD::SETUGE:                   return "setuge";
+    case ISD::SETULT:                   return "setult";
+    case ISD::SETULE:                   return "setule";
+    case ISD::SETUNE:                   return "setune";
+
+    case ISD::SETEQ:                    return "seteq";
+    case ISD::SETGT:                    return "setgt";
+    case ISD::SETGE:                    return "setge";
+    case ISD::SETLT:                    return "setlt";
+    case ISD::SETLE:                    return "setle";
+    case ISD::SETNE:                    return "setne";
+
+    case ISD::SETTRUE:                  return "settrue";
+    case ISD::SETTRUE2:                 return "settrue2";
+    case ISD::SETFALSE:                 return "setfalse";
+    case ISD::SETFALSE2:                return "setfalse2";
+    }
+  }
+}
+
+const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) {
+  switch (AM) {
+  default:              return "";
+  case ISD::PRE_INC:    return "<pre-inc>";
+  case ISD::PRE_DEC:    return "<pre-dec>";
+  case ISD::POST_INC:   return "<post-inc>";
+  case ISD::POST_DEC:   return "<post-dec>";
+  }
+}
+
+void SDNode::dump() const { dump(0); }
+void SDNode::dump(const SelectionDAG *G) const {
+  print(dbgs(), G);
+  dbgs() << '\n';
+}
+
+void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {
+  OS << (const void*)this << ": ";
+
+  for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
+    if (i) OS << ",";
+    if (getValueType(i) == MVT::Other)
+      OS << "ch";
+    else
+      OS << getValueType(i).getEVTString();
+  }
+  OS << " = " << getOperationName(G);
+}
+
+void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
+  if (const MachineSDNode *MN = dyn_cast<MachineSDNode>(this)) {
+    if (!MN->memoperands_empty()) {
+      OS << "<";
+      OS << "Mem:";
+      for (MachineSDNode::mmo_iterator i = MN->memoperands_begin(),
+           e = MN->memoperands_end(); i != e; ++i) {
+        OS << **i;
+        if (llvm::next(i) != e)
+          OS << " ";
+      }
+      OS << ">";
+    }
+  } else if (const ShuffleVectorSDNode *SVN =
+               dyn_cast<ShuffleVectorSDNode>(this)) {
+    OS << "<";
+    for (unsigned i = 0, e = ValueList[0].getVectorNumElements(); i != e; ++i) {
+      int Idx = SVN->getMaskElt(i);
+      if (i) OS << ",";
+      if (Idx < 0)
+        OS << "u";
+      else
+        OS << Idx;
+    }
+    OS << ">";
+  } else if (const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(this)) {
+    OS << '<' << CSDN->getAPIntValue() << '>';
+  } else if (const ConstantFPSDNode *CSDN = dyn_cast<ConstantFPSDNode>(this)) {
+    if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEsingle)
+      OS << '<' << CSDN->getValueAPF().convertToFloat() << '>';
+    else if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEdouble)
+      OS << '<' << CSDN->getValueAPF().convertToDouble() << '>';
+    else {
+      OS << "<APFloat(";
+      CSDN->getValueAPF().bitcastToAPInt().dump();
+      OS << ")>";
+    }
+  } else if (const GlobalAddressSDNode *GADN =
+             dyn_cast<GlobalAddressSDNode>(this)) {
+    int64_t offset = GADN->getOffset();
+    OS << '<';
+    WriteAsOperand(OS, GADN->getGlobal());
+    OS << '>';
+    if (offset > 0)
+      OS << " + " << offset;
+    else
+      OS << " " << offset;
+    if (unsigned int TF = GADN->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  } else if (const FrameIndexSDNode *FIDN = dyn_cast<FrameIndexSDNode>(this)) {
+    OS << "<" << FIDN->getIndex() << ">";
+  } else if (const JumpTableSDNode *JTDN = dyn_cast<JumpTableSDNode>(this)) {
+    OS << "<" << JTDN->getIndex() << ">";
+    if (unsigned int TF = JTDN->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  } else if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(this)){
+    int offset = CP->getOffset();
+    if (CP->isMachineConstantPoolEntry())
+      OS << "<" << *CP->getMachineCPVal() << ">";
+    else
+      OS << "<" << *CP->getConstVal() << ">";
+    if (offset > 0)
+      OS << " + " << offset;
+    else
+      OS << " " << offset;
+    if (unsigned int TF = CP->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  } else if (const TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(this)) {
+    OS << "<" << TI->getIndex() << '+' << TI->getOffset() << ">";
+    if (unsigned TF = TI->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  } else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(this)) {
+    OS << "<";
+    const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock();
+    if (LBB)
+      OS << LBB->getName() << " ";
+    OS << (const void*)BBDN->getBasicBlock() << ">";
+  } else if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(this)) {
+    OS << ' ' << PrintReg(R->getReg(), G ? G->getTarget().getRegisterInfo() :0);
+  } else if (const ExternalSymbolSDNode *ES =
+             dyn_cast<ExternalSymbolSDNode>(this)) {
+    OS << "'" << ES->getSymbol() << "'";
+    if (unsigned int TF = ES->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  } else if (const SrcValueSDNode *M = dyn_cast<SrcValueSDNode>(this)) {
+    if (M->getValue())
+      OS << "<" << M->getValue() << ">";
+    else
+      OS << "<null>";
+  } else if (const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(this)) {
+    if (MD->getMD())
+      OS << "<" << MD->getMD() << ">";
+    else
+      OS << "<null>";
+  } else if (const VTSDNode *N = dyn_cast<VTSDNode>(this)) {
+    OS << ":" << N->getVT().getEVTString();
+  }
+  else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(this)) {
+    OS << "<" << *LD->getMemOperand();
+
+    bool doExt = true;
+    switch (LD->getExtensionType()) {
+    default: doExt = false; break;
+    case ISD::EXTLOAD:  OS << ", anyext"; break;
+    case ISD::SEXTLOAD: OS << ", sext"; break;
+    case ISD::ZEXTLOAD: OS << ", zext"; break;
+    }
+    if (doExt)
+      OS << " from " << LD->getMemoryVT().getEVTString();
+
+    const char *AM = getIndexedModeName(LD->getAddressingMode());
+    if (*AM)
+      OS << ", " << AM;
+
+    OS << ">";
+  } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(this)) {
+    OS << "<" << *ST->getMemOperand();
+
+    if (ST->isTruncatingStore())
+      OS << ", trunc to " << ST->getMemoryVT().getEVTString();
+
+    const char *AM = getIndexedModeName(ST->getAddressingMode());
+    if (*AM)
+      OS << ", " << AM;
+
+    OS << ">";
+  } else if (const MemSDNode* M = dyn_cast<MemSDNode>(this)) {
+    OS << "<" << *M->getMemOperand() << ">";
+  } else if (const BlockAddressSDNode *BA =
+               dyn_cast<BlockAddressSDNode>(this)) {
+    int64_t offset = BA->getOffset();
+    OS << "<";
+    WriteAsOperand(OS, BA->getBlockAddress()->getFunction(), false);
+    OS << ", ";
+    WriteAsOperand(OS, BA->getBlockAddress()->getBasicBlock(), false);
+    OS << ">";
+    if (offset > 0)
+      OS << " + " << offset;
+    else
+      OS << " " << offset;
+    if (unsigned int TF = BA->getTargetFlags())
+      OS << " [TF=" << TF << ']';
+  }
+
+  if (G)
+    if (unsigned Order = G->GetOrdering(this))
+      OS << " [ORD=" << Order << ']';
+
+  if (getNodeId() != -1)
+    OS << " [ID=" << getNodeId() << ']';
+
+  DebugLoc dl = getDebugLoc();
+  if (G && !dl.isUnknown()) {
+    DIScope
+      Scope(dl.getScope(G->getMachineFunction().getFunction()->getContext()));
+    OS << " dbg:";
+    // Omit the directory, since it's usually long and uninteresting.
+    if (Scope.Verify())
+      OS << Scope.getFilename();
+    else
+      OS << "<unknown>";
+    OS << ':' << dl.getLine();
+    if (dl.getCol() != 0)
+      OS << ':' << dl.getCol();
+  }
+}
+
+static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    if (N->getOperand(i).getNode()->hasOneUse())
+      DumpNodes(N->getOperand(i).getNode(), indent+2, G);
+    else
+      dbgs() << "\n" << std::string(indent+2, ' ')
+             << (void*)N->getOperand(i).getNode() << ": <multiple use>";
+
+  dbgs() << '\n';
+  dbgs().indent(indent);
+  N->dump(G);
+}
+
+void SelectionDAG::dump() const {
+  dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:";
+
+  for (allnodes_const_iterator I = allnodes_begin(), E = allnodes_end();
+       I != E; ++I) {
+    const SDNode *N = I;
+    if (!N->hasOneUse() && N != getRoot().getNode())
+      DumpNodes(N, 2, this);
+  }
+
+  if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this);
+  dbgs() << "\n\n";
+}
+
+void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {
+  print_types(OS, G);
+  print_details(OS, G);
+}
+
+typedef SmallPtrSet<const SDNode *, 128> VisitedSDNodeSet;
+static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
+                       const SelectionDAG *G, VisitedSDNodeSet &once) {
+  if (!once.insert(N))          // If we've been here before, return now.
+    return;
+
+  // Dump the current SDNode, but don't end the line yet.
+  OS.indent(indent);
+  N->printr(OS, G);
+
+  // Having printed this SDNode, walk the children:
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    const SDNode *child = N->getOperand(i).getNode();
+
+    if (i) OS << ",";
+    OS << " ";
+
+    if (child->getNumOperands() == 0) {
+      // This child has no grandchildren; print it inline right here.
+      child->printr(OS, G);
+      once.insert(child);
+    } else {         // Just the address. FIXME: also print the child's opcode.
+      OS << (const void*)child;
+      if (unsigned RN = N->getOperand(i).getResNo())
+        OS << ":" << RN;
+    }
+  }
+
+  OS << "\n";
+
+  // Dump children that have grandchildren on their own line(s).
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    const SDNode *child = N->getOperand(i).getNode();
+    DumpNodesr(OS, child, indent+2, G, once);
+  }
+}
+
+void SDNode::dumpr() const {
+  VisitedSDNodeSet once;
+  DumpNodesr(dbgs(), this, 0, 0, once);
+}
+
+void SDNode::dumpr(const SelectionDAG *G) const {
+  VisitedSDNodeSet once;
+  DumpNodesr(dbgs(), this, 0, G, once);
+}
+
+static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N,
+                                  const SelectionDAG *G, unsigned depth,
+                                  unsigned indent) {
+  if (depth == 0)
+    return;
+
+  OS.indent(indent);
+
+  N->print(OS, G);
+
+  if (depth < 1)
+    return;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    // Don't follow chain operands.
+    if (N->getOperand(i).getValueType() == MVT::Other)
+      continue;
+    OS << '\n';
+    printrWithDepthHelper(OS, N->getOperand(i).getNode(), G, depth-1, indent+2);
+  }
+}
+
+void SDNode::printrWithDepth(raw_ostream &OS, const SelectionDAG *G,
+                            unsigned depth) const {
+  printrWithDepthHelper(OS, this, G, depth, 0);
+}
+
+void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const {
+  // Don't print impossibly deep things.
+  printrWithDepth(OS, G, 10);
+}
+
+void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const {
+  printrWithDepth(dbgs(), G, depth);
+}
+
+void SDNode::dumprFull(const SelectionDAG *G) const {
+  // Don't print impossibly deep things.
+  dumprWithDepth(G, 10);
+}
+
+void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {
+  print_types(OS, G);
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    if (i) OS << ", "; else OS << " ";
+    OS << (void*)getOperand(i).getNode();
+    if (unsigned RN = getOperand(i).getResNo())
+      OS << ":" << RN;
+  }
+  print_details(OS, G);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
new file mode 100644
index 000000000000..eeea9e4cfcff
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -0,0 +1,3019 @@
+//===-- SelectionDAGISel.cpp - Implement the SelectionDAGISel class -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAGISel class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "isel"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "ScheduleDAGSDNodes.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on");
+STATISTIC(NumFastIselSuccess, "Number of instructions fast isel selected");
+STATISTIC(NumFastIselBlocks, "Number of blocks selected entirely by fast isel");
+STATISTIC(NumDAGBlocks, "Number of blocks selected using DAG");
+STATISTIC(NumDAGIselRetries,"Number of times dag isel has to try another path");
+
+#ifndef NDEBUG
+static cl::opt<bool>
+EnableFastISelVerbose2("fast-isel-verbose2", cl::Hidden,
+          cl::desc("Enable extra verbose messages in the \"fast\" "
+                   "instruction selector"));
+  // Terminators
+STATISTIC(NumFastIselFailRet,"Fast isel fails on Ret");
+STATISTIC(NumFastIselFailBr,"Fast isel fails on Br");
+STATISTIC(NumFastIselFailSwitch,"Fast isel fails on Switch");
+STATISTIC(NumFastIselFailIndirectBr,"Fast isel fails on IndirectBr");
+STATISTIC(NumFastIselFailInvoke,"Fast isel fails on Invoke");
+STATISTIC(NumFastIselFailResume,"Fast isel fails on Resume");
+STATISTIC(NumFastIselFailUnreachable,"Fast isel fails on Unreachable");
+
+  // Standard binary operators...
+STATISTIC(NumFastIselFailAdd,"Fast isel fails on Add");
+STATISTIC(NumFastIselFailFAdd,"Fast isel fails on FAdd");
+STATISTIC(NumFastIselFailSub,"Fast isel fails on Sub");
+STATISTIC(NumFastIselFailFSub,"Fast isel fails on FSub");
+STATISTIC(NumFastIselFailMul,"Fast isel fails on Mul");
+STATISTIC(NumFastIselFailFMul,"Fast isel fails on FMul");
+STATISTIC(NumFastIselFailUDiv,"Fast isel fails on UDiv");
+STATISTIC(NumFastIselFailSDiv,"Fast isel fails on SDiv");
+STATISTIC(NumFastIselFailFDiv,"Fast isel fails on FDiv");
+STATISTIC(NumFastIselFailURem,"Fast isel fails on URem");
+STATISTIC(NumFastIselFailSRem,"Fast isel fails on SRem");
+STATISTIC(NumFastIselFailFRem,"Fast isel fails on FRem");
+
+  // Logical operators...
+STATISTIC(NumFastIselFailAnd,"Fast isel fails on And");
+STATISTIC(NumFastIselFailOr,"Fast isel fails on Or");
+STATISTIC(NumFastIselFailXor,"Fast isel fails on Xor");
+
+  // Memory instructions...
+STATISTIC(NumFastIselFailAlloca,"Fast isel fails on Alloca");
+STATISTIC(NumFastIselFailLoad,"Fast isel fails on Load");
+STATISTIC(NumFastIselFailStore,"Fast isel fails on Store");
+STATISTIC(NumFastIselFailAtomicCmpXchg,"Fast isel fails on AtomicCmpXchg");
+STATISTIC(NumFastIselFailAtomicRMW,"Fast isel fails on AtomicRWM");
+STATISTIC(NumFastIselFailFence,"Fast isel fails on Frence");
+STATISTIC(NumFastIselFailGetElementPtr,"Fast isel fails on GetElementPtr");
+
+  // Convert instructions...
+STATISTIC(NumFastIselFailTrunc,"Fast isel fails on Trunc");
+STATISTIC(NumFastIselFailZExt,"Fast isel fails on ZExt");
+STATISTIC(NumFastIselFailSExt,"Fast isel fails on SExt");
+STATISTIC(NumFastIselFailFPTrunc,"Fast isel fails on FPTrunc");
+STATISTIC(NumFastIselFailFPExt,"Fast isel fails on FPExt");
+STATISTIC(NumFastIselFailFPToUI,"Fast isel fails on FPToUI");
+STATISTIC(NumFastIselFailFPToSI,"Fast isel fails on FPToSI");
+STATISTIC(NumFastIselFailUIToFP,"Fast isel fails on UIToFP");
+STATISTIC(NumFastIselFailSIToFP,"Fast isel fails on SIToFP");
+STATISTIC(NumFastIselFailIntToPtr,"Fast isel fails on IntToPtr");
+STATISTIC(NumFastIselFailPtrToInt,"Fast isel fails on PtrToInt");
+STATISTIC(NumFastIselFailBitCast,"Fast isel fails on BitCast");
+
+  // Other instructions...
+STATISTIC(NumFastIselFailICmp,"Fast isel fails on ICmp");
+STATISTIC(NumFastIselFailFCmp,"Fast isel fails on FCmp");
+STATISTIC(NumFastIselFailPHI,"Fast isel fails on PHI");
+STATISTIC(NumFastIselFailSelect,"Fast isel fails on Select");
+STATISTIC(NumFastIselFailCall,"Fast isel fails on Call");
+STATISTIC(NumFastIselFailShl,"Fast isel fails on Shl");
+STATISTIC(NumFastIselFailLShr,"Fast isel fails on LShr");
+STATISTIC(NumFastIselFailAShr,"Fast isel fails on AShr");
+STATISTIC(NumFastIselFailVAArg,"Fast isel fails on VAArg");
+STATISTIC(NumFastIselFailExtractElement,"Fast isel fails on ExtractElement");
+STATISTIC(NumFastIselFailInsertElement,"Fast isel fails on InsertElement");
+STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector");
+STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue");
+STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue");
+STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad");
+#endif
+
+static cl::opt<bool>
+EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
+          cl::desc("Enable verbose messages in the \"fast\" "
+                   "instruction selector"));
+static cl::opt<bool>
+EnableFastISelAbort("fast-isel-abort", cl::Hidden,
+          cl::desc("Enable abort calls when \"fast\" instruction selection "
+                   "fails to lower an instruction"));
+static cl::opt<bool>
+EnableFastISelAbortArgs("fast-isel-abort-args", cl::Hidden,
+          cl::desc("Enable abort calls when \"fast\" instruction selection "
+                   "fails to lower a formal argument"));
+
+static cl::opt<bool>
+UseMBPI("use-mbpi",
+        cl::desc("use Machine Branch Probability Info"),
+        cl::init(true), cl::Hidden);
+
+#ifndef NDEBUG
+static cl::opt<bool>
+ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before the first "
+                   "dag combine pass"));
+static cl::opt<bool>
+ViewLegalizeTypesDAGs("view-legalize-types-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before legalize types"));
+static cl::opt<bool>
+ViewLegalizeDAGs("view-legalize-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before legalize"));
+static cl::opt<bool>
+ViewDAGCombine2("view-dag-combine2-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before the second "
+                   "dag combine pass"));
+static cl::opt<bool>
+ViewDAGCombineLT("view-dag-combine-lt-dags", cl::Hidden,
+          cl::desc("Pop up a window to show dags before the post legalize types"
+                   " dag combine pass"));
+static cl::opt<bool>
+ViewISelDAGs("view-isel-dags", cl::Hidden,
+          cl::desc("Pop up a window to show isel dags as they are selected"));
+static cl::opt<bool>
+ViewSchedDAGs("view-sched-dags", cl::Hidden,
+          cl::desc("Pop up a window to show sched dags as they are processed"));
+static cl::opt<bool>
+ViewSUnitDAGs("view-sunit-dags", cl::Hidden,
+      cl::desc("Pop up a window to show SUnit dags after they are processed"));
+#else
+static const bool ViewDAGCombine1 = false,
+                  ViewLegalizeTypesDAGs = false, ViewLegalizeDAGs = false,
+                  ViewDAGCombine2 = false,
+                  ViewDAGCombineLT = false,
+                  ViewISelDAGs = false, ViewSchedDAGs = false,
+                  ViewSUnitDAGs = false;
+#endif
+
+//===---------------------------------------------------------------------===//
+///
+/// RegisterScheduler class - Track the registration of instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+MachinePassRegistry RegisterScheduler::Registry;
+
+//===---------------------------------------------------------------------===//
+///
+/// ISHeuristic command line option for instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+static cl::opt<RegisterScheduler::FunctionPassCtor, false,
+               RegisterPassParser<RegisterScheduler> >
+ISHeuristic("pre-RA-sched",
+            cl::init(&createDefaultScheduler),
+            cl::desc("Instruction schedulers available (before register"
+                     " allocation):"));
+
+static RegisterScheduler
+defaultListDAGScheduler("default", "Best scheduler for the target",
+                        createDefaultScheduler);
+
+namespace llvm {
+  //===--------------------------------------------------------------------===//
+  /// createDefaultScheduler - This creates an instruction scheduler appropriate
+  /// for the target.
+  ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
+                                             CodeGenOpt::Level OptLevel) {
+    const TargetLowering &TLI = IS->getTargetLowering();
+    const TargetSubtargetInfo &ST = IS->TM.getSubtarget<TargetSubtargetInfo>();
+
+    if (OptLevel == CodeGenOpt::None || ST.enableMachineScheduler() ||
+        TLI.getSchedulingPreference() == Sched::Source)
+      return createSourceListDAGScheduler(IS, OptLevel);
+    if (TLI.getSchedulingPreference() == Sched::RegPressure)
+      return createBURRListDAGScheduler(IS, OptLevel);
+    if (TLI.getSchedulingPreference() == Sched::Hybrid)
+      return createHybridListDAGScheduler(IS, OptLevel);
+    if (TLI.getSchedulingPreference() == Sched::VLIW)
+      return createVLIWDAGScheduler(IS, OptLevel);
+    assert(TLI.getSchedulingPreference() == Sched::ILP &&
+           "Unknown sched type!");
+    return createILPListDAGScheduler(IS, OptLevel);
+  }
+}
+
+// EmitInstrWithCustomInserter - This method should be implemented by targets
+// that mark instructions with the 'usesCustomInserter' flag.  These
+// instructions are special in various ways, which require special support to
+// insert.  The specified MachineInstr is created but not inserted into any
+// basic blocks, and this method is called to expand it into a sequence of
+// instructions, potentially also creating new basic blocks and control flow.
+// When new basic blocks are inserted and the edges from MBB to its successors
+// are modified, the method should insert pairs of <OldSucc, NewSucc> into the
+// DenseMap.
+MachineBasicBlock *
+TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                            MachineBasicBlock *MBB) const {
+#ifndef NDEBUG
+  dbgs() << "If a target marks an instruction with "
+          "'usesCustomInserter', it must implement "
+          "TargetLowering::EmitInstrWithCustomInserter!";
+#endif
+  llvm_unreachable(0);
+}
+
+void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                                   SDNode *Node) const {
+  assert(!MI->hasPostISelHook() &&
+         "If a target marks an instruction with 'hasPostISelHook', "
+         "it must implement TargetLowering::AdjustInstrPostInstrSelection!");
+}
+
+//===----------------------------------------------------------------------===//
+// SelectionDAGISel code
+//===----------------------------------------------------------------------===//
+
+SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm,
+                                   CodeGenOpt::Level OL) :
+  MachineFunctionPass(ID), TM(tm), TLI(*tm.getTargetLowering()),
+  FuncInfo(new FunctionLoweringInfo(TLI)),
+  CurDAG(new SelectionDAG(tm, OL)),
+  SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
+  GFI(),
+  OptLevel(OL),
+  DAGSize(0) {
+    initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
+    initializeAliasAnalysisAnalysisGroup(*PassRegistry::getPassRegistry());
+    initializeBranchProbabilityInfoPass(*PassRegistry::getPassRegistry());
+    initializeTargetLibraryInfoPass(*PassRegistry::getPassRegistry());
+  }
+
+SelectionDAGISel::~SelectionDAGISel() {
+  delete SDB;
+  delete CurDAG;
+  delete FuncInfo;
+}
+
+void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AliasAnalysis>();
+  AU.addPreserved<AliasAnalysis>();
+  AU.addRequired<GCModuleInfo>();
+  AU.addPreserved<GCModuleInfo>();
+  AU.addRequired<TargetLibraryInfo>();
+  if (UseMBPI && OptLevel != CodeGenOpt::None)
+    AU.addRequired<BranchProbabilityInfo>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// SplitCriticalSideEffectEdges - Look for critical edges with a PHI value that
+/// may trap on it.  In this case we have to split the edge so that the path
+/// through the predecessor block that doesn't go to the phi block doesn't
+/// execute the possibly trapping instruction.
+///
+/// This is required for correctness, so it must be done at -O0.
+///
+static void SplitCriticalSideEffectEdges(Function &Fn, Pass *SDISel) {
+  // Loop for blocks with phi nodes.
+  for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    PHINode *PN = dyn_cast<PHINode>(BB->begin());
+    if (PN == 0) continue;
+
+  ReprocessBlock:
+    // For each block with a PHI node, check to see if any of the input values
+    // are potentially trapping constant expressions.  Constant expressions are
+    // the only potentially trapping value that can occur as the argument to a
+    // PHI.
+    for (BasicBlock::iterator I = BB->begin(); (PN = dyn_cast<PHINode>(I)); ++I)
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        ConstantExpr *CE = dyn_cast<ConstantExpr>(PN->getIncomingValue(i));
+        if (CE == 0 || !CE->canTrap()) continue;
+
+        // The only case we have to worry about is when the edge is critical.
+        // Since this block has a PHI Node, we assume it has multiple input
+        // edges: check to see if the pred has multiple successors.
+        BasicBlock *Pred = PN->getIncomingBlock(i);
+        if (Pred->getTerminator()->getNumSuccessors() == 1)
+          continue;
+
+        // Okay, we have to split this edge.
+        SplitCriticalEdge(Pred->getTerminator(),
+                          GetSuccessorNumber(Pred, BB), SDISel, true);
+        goto ReprocessBlock;
+      }
+  }
+}
+
+bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
+  // Do some sanity-checking on the command-line options.
+  assert((!EnableFastISelVerbose || TM.Options.EnableFastISel) &&
+         "-fast-isel-verbose requires -fast-isel");
+  assert((!EnableFastISelAbort || TM.Options.EnableFastISel) &&
+         "-fast-isel-abort requires -fast-isel");
+
+  const Function &Fn = *mf.getFunction();
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+
+  MF = &mf;
+  RegInfo = &MF->getRegInfo();
+  AA = &getAnalysis<AliasAnalysis>();
+  LibInfo = &getAnalysis<TargetLibraryInfo>();
+  TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+  GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : 0;
+
+  TargetSubtargetInfo &ST =
+    const_cast<TargetSubtargetInfo&>(TM.getSubtarget<TargetSubtargetInfo>());
+  ST.resetSubtargetFeatures(MF);
+  TM.resetTargetOptions(MF);
+
+  DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
+
+  SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), this);
+
+  CurDAG->init(*MF, TTI);
+  FuncInfo->set(Fn, *MF);
+
+  if (UseMBPI && OptLevel != CodeGenOpt::None)
+    FuncInfo->BPI = &getAnalysis<BranchProbabilityInfo>();
+  else
+    FuncInfo->BPI = 0;
+
+  SDB->init(GFI, *AA, LibInfo);
+
+  MF->setHasMSInlineAsm(false);
+  SelectAllBasicBlocks(Fn);
+
+  // If the first basic block in the function has live ins that need to be
+  // copied into vregs, emit the copies into the top of the block before
+  // emitting the code for the block.
+  MachineBasicBlock *EntryMBB = MF->begin();
+  RegInfo->EmitLiveInCopies(EntryMBB, TRI, TII);
+
+  DenseMap<unsigned, unsigned> LiveInMap;
+  if (!FuncInfo->ArgDbgValues.empty())
+    for (MachineRegisterInfo::livein_iterator LI = RegInfo->livein_begin(),
+           E = RegInfo->livein_end(); LI != E; ++LI)
+      if (LI->second)
+        LiveInMap.insert(std::make_pair(LI->first, LI->second));
+
+  // Insert DBG_VALUE instructions for function arguments to the entry block.
+  for (unsigned i = 0, e = FuncInfo->ArgDbgValues.size(); i != e; ++i) {
+    MachineInstr *MI = FuncInfo->ArgDbgValues[e-i-1];
+    unsigned Reg = MI->getOperand(0).getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      EntryMBB->insert(EntryMBB->begin(), MI);
+    else {
+      MachineInstr *Def = RegInfo->getVRegDef(Reg);
+      MachineBasicBlock::iterator InsertPos = Def;
+      // FIXME: VR def may not be in entry block.
+      Def->getParent()->insert(llvm::next(InsertPos), MI);
+    }
+
+    // If Reg is live-in then update debug info to track its copy in a vreg.
+    DenseMap<unsigned, unsigned>::iterator LDI = LiveInMap.find(Reg);
+    if (LDI != LiveInMap.end()) {
+      MachineInstr *Def = RegInfo->getVRegDef(LDI->second);
+      MachineBasicBlock::iterator InsertPos = Def;
+      const MDNode *Variable =
+        MI->getOperand(MI->getNumOperands()-1).getMetadata();
+      unsigned Offset = MI->getOperand(1).getImm();
+      // Def is never a terminator here, so it is ok to increment InsertPos.
+      BuildMI(*EntryMBB, ++InsertPos, MI->getDebugLoc(),
+              TII.get(TargetOpcode::DBG_VALUE))
+        .addReg(LDI->second, RegState::Debug)
+        .addImm(Offset).addMetadata(Variable);
+
+      // If this vreg is directly copied into an exported register then
+      // that COPY instructions also need DBG_VALUE, if it is the only
+      // user of LDI->second.
+      MachineInstr *CopyUseMI = NULL;
+      for (MachineRegisterInfo::use_iterator
+             UI = RegInfo->use_begin(LDI->second);
+           MachineInstr *UseMI = UI.skipInstruction();) {
+        if (UseMI->isDebugValue()) continue;
+        if (UseMI->isCopy() && !CopyUseMI && UseMI->getParent() == EntryMBB) {
+          CopyUseMI = UseMI; continue;
+        }
+        // Otherwise this is another use or second copy use.
+        CopyUseMI = NULL; break;
+      }
+      if (CopyUseMI) {
+        MachineInstr *NewMI =
+          BuildMI(*MF, CopyUseMI->getDebugLoc(),
+                  TII.get(TargetOpcode::DBG_VALUE))
+          .addReg(CopyUseMI->getOperand(0).getReg(), RegState::Debug)
+          .addImm(Offset).addMetadata(Variable);
+        MachineBasicBlock::iterator Pos = CopyUseMI;
+        EntryMBB->insertAfter(Pos, NewMI);
+      }
+    }
+  }
+
+  // Determine if there are any calls in this machine function.
+  MachineFrameInfo *MFI = MF->getFrameInfo();
+  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E;
+       ++I) {
+
+    if (MFI->hasCalls() && MF->hasMSInlineAsm())
+      break;
+
+    const MachineBasicBlock *MBB = I;
+    for (MachineBasicBlock::const_iterator II = MBB->begin(), IE = MBB->end();
+         II != IE; ++II) {
+      const MCInstrDesc &MCID = TM.getInstrInfo()->get(II->getOpcode());
+      if ((MCID.isCall() && !MCID.isReturn()) ||
+          II->isStackAligningInlineAsm()) {
+        MFI->setHasCalls(true);
+      }
+      if (II->isMSInlineAsm()) {
+        MF->setHasMSInlineAsm(true);
+      }
+    }
+  }
+
+  // Determine if there is a call to setjmp in the machine function.
+  MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice());
+
+  // Replace forward-declared registers with the registers containing
+  // the desired value.
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  for (DenseMap<unsigned, unsigned>::iterator
+       I = FuncInfo->RegFixups.begin(), E = FuncInfo->RegFixups.end();
+       I != E; ++I) {
+    unsigned From = I->first;
+    unsigned To = I->second;
+    // If To is also scheduled to be replaced, find what its ultimate
+    // replacement is.
+    for (;;) {
+      DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
+      if (J == E) break;
+      To = J->second;
+    }
+    // Replace it.
+    MRI.replaceRegWith(From, To);
+  }
+
+  // Freeze the set of reserved registers now that MachineFrameInfo has been
+  // set up. All the information required by getReservedRegs() should be
+  // available now.
+  MRI.freezeReservedRegs(*MF);
+
+  // Release function-specific state. SDB and CurDAG are already cleared
+  // at this point.
+  FuncInfo->clear();
+
+  return true;
+}
+
+void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
+                                        BasicBlock::const_iterator End,
+                                        bool &HadTailCall) {
+  // Lower all of the non-terminator instructions. If a call is emitted
+  // as a tail call, cease emitting nodes for this block. Terminators
+  // are handled below.
+  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I)
+    SDB->visit(*I);
+
+  // Make sure the root of the DAG is up-to-date.
+  CurDAG->setRoot(SDB->getControlRoot());
+  HadTailCall = SDB->HasTailCall;
+  SDB->clear();
+
+  // Final step, emit the lowered DAG as machine code.
+  CodeGenAndEmitDAG();
+}
+
+void SelectionDAGISel::ComputeLiveOutVRegInfo() {
+  SmallPtrSet<SDNode*, 128> VisitedNodes;
+  SmallVector<SDNode*, 128> Worklist;
+
+  Worklist.push_back(CurDAG->getRoot().getNode());
+
+  APInt KnownZero;
+  APInt KnownOne;
+
+  do {
+    SDNode *N = Worklist.pop_back_val();
+
+    // If we've already seen this node, ignore it.
+    if (!VisitedNodes.insert(N))
+      continue;
+
+    // Otherwise, add all chain operands to the worklist.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+      if (N->getOperand(i).getValueType() == MVT::Other)
+        Worklist.push_back(N->getOperand(i).getNode());
+
+    // If this is a CopyToReg with a vreg dest, process it.
+    if (N->getOpcode() != ISD::CopyToReg)
+      continue;
+
+    unsigned DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(DestReg))
+      continue;
+
+    // Ignore non-scalar or non-integer values.
+    SDValue Src = N->getOperand(2);
+    EVT SrcVT = Src.getValueType();
+    if (!SrcVT.isInteger() || SrcVT.isVector())
+      continue;
+
+    unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src);
+    CurDAG->ComputeMaskedBits(Src, KnownZero, KnownOne);
+    FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, KnownZero, KnownOne);
+  } while (!Worklist.empty());
+}
+
+void SelectionDAGISel::CodeGenAndEmitDAG() {
+  std::string GroupName;
+  if (TimePassesIsEnabled)
+    GroupName = "Instruction Selection and Scheduling";
+  std::string BlockName;
+  int BlockNumber = -1;
+  (void)BlockNumber;
+#ifdef NDEBUG
+  if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs ||
+      ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs ||
+      ViewSUnitDAGs)
+#endif
+  {
+    BlockNumber = FuncInfo->MBB->getNumber();
+    BlockName = MF->getName().str() + ":" +
+                FuncInfo->MBB->getBasicBlock()->getName().str();
+  }
+  DEBUG(dbgs() << "Initial selection DAG: BB#" << BlockNumber
+        << " '" << BlockName << "'\n"; CurDAG->dump());
+
+  if (ViewDAGCombine1) CurDAG->viewGraph("dag-combine1 input for " + BlockName);
+
+  // Run the DAG combiner in pre-legalize mode.
+  {
+    NamedRegionTimer T("DAG Combining 1", GroupName, TimePassesIsEnabled);
+    CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel);
+  }
+
+  DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber
+        << " '" << BlockName << "'\n"; CurDAG->dump());
+
+  // Second step, hack on the DAG until it only uses operations and types that
+  // the target supports.
+  if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
+                                               BlockName);
+
+  bool Changed;
+  {
+    NamedRegionTimer T("Type Legalization", GroupName, TimePassesIsEnabled);
+    Changed = CurDAG->LegalizeTypes();
+  }
+
+  DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber
+        << " '" << BlockName << "'\n"; CurDAG->dump());
+
+  if (Changed) {
+    if (ViewDAGCombineLT)
+      CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
+
+    // Run the DAG combiner in post-type-legalize mode.
+    {
+      NamedRegionTimer T("DAG Combining after legalize types", GroupName,
+                         TimePassesIsEnabled);
+      CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel);
+    }
+
+    DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber
+          << " '" << BlockName << "'\n"; CurDAG->dump());
+  }
+
+  {
+    NamedRegionTimer T("Vector Legalization", GroupName, TimePassesIsEnabled);
+    Changed = CurDAG->LegalizeVectors();
+  }
+
+  if (Changed) {
+    {
+      NamedRegionTimer T("Type Legalization 2", GroupName, TimePassesIsEnabled);
+      CurDAG->LegalizeTypes();
+    }
+
+    if (ViewDAGCombineLT)
+      CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
+
+    // Run the DAG combiner in post-type-legalize mode.
+    {
+      NamedRegionTimer T("DAG Combining after legalize vectors", GroupName,
+                         TimePassesIsEnabled);
+      CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel);
+    }
+
+    DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#"
+          << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump());
+  }
+
+  if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName);
+
+  {
+    NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled);
+    CurDAG->Legalize();
+  }
+
+  DEBUG(dbgs() << "Legalized selection DAG: BB#" << BlockNumber
+        << " '" << BlockName << "'\n"; CurDAG->dump());
+
+  if (ViewDAGCombine2) CurDAG->viewGraph("dag-combine2 input for " + BlockName);
+
+  // Run the DAG combiner in post-legalize mode.
+  {
+    NamedRegionTimer T("DAG Combining 2", GroupName, TimePassesIsEnabled);
+    CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel);
+  }
+
+  DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber
+        << " '" << BlockName << "'\n"; CurDAG->dump());
+
+  if (OptLevel != CodeGenOpt::None)
+    ComputeLiveOutVRegInfo();
+
+  if (ViewISelDAGs) CurDAG->viewGraph("isel input for " + BlockName);
+
+  // Third, instruction select all of the operations to machine code, adding the
+  // code to the MachineBasicBlock.
+  {
+    NamedRegionTimer T("Instruction Selection", GroupName, TimePassesIsEnabled);
+    DoInstructionSelection();
+  }
+
+  DEBUG(dbgs() << "Selected selection DAG: BB#" << BlockNumber
+        << " '" << BlockName << "'\n"; CurDAG->dump());
+
+  if (ViewSchedDAGs) CurDAG->viewGraph("scheduler input for " + BlockName);
+
+  // Schedule machine code.
+  ScheduleDAGSDNodes *Scheduler = CreateScheduler();
+  {
+    NamedRegionTimer T("Instruction Scheduling", GroupName,
+                       TimePassesIsEnabled);
+    Scheduler->Run(CurDAG, FuncInfo->MBB);
+  }
+
+  if (ViewSUnitDAGs) Scheduler->viewGraph();
+
+  // Emit machine code to BB.  This can change 'BB' to the last block being
+  // inserted into.
+  MachineBasicBlock *FirstMBB = FuncInfo->MBB, *LastMBB;
+  {
+    NamedRegionTimer T("Instruction Creation", GroupName, TimePassesIsEnabled);
+
+    // FuncInfo->InsertPt is passed by reference and set to the end of the
+    // scheduled instructions.
+    LastMBB = FuncInfo->MBB = Scheduler->EmitSchedule(FuncInfo->InsertPt);
+  }
+
+  // If the block was split, make sure we update any references that are used to
+  // update PHI nodes later on.
+  if (FirstMBB != LastMBB)
+    SDB->UpdateSplitBlock(FirstMBB, LastMBB);
+
+  // Free the scheduler state.
+  {
+    NamedRegionTimer T("Instruction Scheduling Cleanup", GroupName,
+                       TimePassesIsEnabled);
+    delete Scheduler;
+  }
+
+  // Free the SelectionDAG state, now that we're finished with it.
+  CurDAG->clear();
+}
+
+namespace {
+/// ISelUpdater - helper class to handle updates of the instruction selection
+/// graph.
+class ISelUpdater : public SelectionDAG::DAGUpdateListener {
+  SelectionDAG::allnodes_iterator &ISelPosition;
+public:
+  ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp)
+    : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {}
+
+  /// NodeDeleted - Handle nodes deleted from the graph. If the node being
+  /// deleted is the current ISelPosition node, update ISelPosition.
+  ///
+  virtual void NodeDeleted(SDNode *N, SDNode *E) {
+    if (ISelPosition == SelectionDAG::allnodes_iterator(N))
+      ++ISelPosition;
+  }
+};
+} // end anonymous namespace
+
+void SelectionDAGISel::DoInstructionSelection() {
+  DEBUG(errs() << "===== Instruction selection begins: BB#"
+        << FuncInfo->MBB->getNumber()
+        << " '" << FuncInfo->MBB->getName() << "'\n");
+
+  PreprocessISelDAG();
+
+  // Select target instructions for the DAG.
+  {
+    // Number all nodes with a topological order and set DAGSize.
+    DAGSize = CurDAG->AssignTopologicalOrder();
+
+    // Create a dummy node (which is not added to allnodes), that adds
+    // a reference to the root node, preventing it from being deleted,
+    // and tracking any changes of the root.
+    HandleSDNode Dummy(CurDAG->getRoot());
+    SelectionDAG::allnodes_iterator ISelPosition (CurDAG->getRoot().getNode());
+    ++ISelPosition;
+
+    // Make sure that ISelPosition gets properly updated when nodes are deleted
+    // in calls made from this function.
+    ISelUpdater ISU(*CurDAG, ISelPosition);
+
+    // The AllNodes list is now topological-sorted. Visit the
+    // nodes by starting at the end of the list (the root of the
+    // graph) and preceding back toward the beginning (the entry
+    // node).
+    while (ISelPosition != CurDAG->allnodes_begin()) {
+      SDNode *Node = --ISelPosition;
+      // Skip dead nodes. DAGCombiner is expected to eliminate all dead nodes,
+      // but there are currently some corner cases that it misses. Also, this
+      // makes it theoretically possible to disable the DAGCombiner.
+      if (Node->use_empty())
+        continue;
+
+      SDNode *ResNode = Select(Node);
+
+      // FIXME: This is pretty gross.  'Select' should be changed to not return
+      // anything at all and this code should be nuked with a tactical strike.
+
+      // If node should not be replaced, continue with the next one.
+      if (ResNode == Node || Node->getOpcode() == ISD::DELETED_NODE)
+        continue;
+      // Replace node.
+      if (ResNode) {
+        // Propagate ordering
+        CurDAG->AssignOrdering(ResNode, CurDAG->GetOrdering(Node));
+
+        ReplaceUses(Node, ResNode);
+      }
+
+      // If after the replacement this node is not used any more,
+      // remove this dead node.
+      if (Node->use_empty()) // Don't delete EntryToken, etc.
+        CurDAG->RemoveDeadNode(Node);
+    }
+
+    CurDAG->setRoot(Dummy.getValue());
+  }
+
+  DEBUG(errs() << "===== Instruction selection ends:\n");
+
+  PostprocessISelDAG();
+}
+
+/// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and
+/// do other setup for EH landing-pad blocks.
+void SelectionDAGISel::PrepareEHLandingPad() {
+  MachineBasicBlock *MBB = FuncInfo->MBB;
+
+  // Add a label to mark the beginning of the landing pad.  Deletion of the
+  // landing pad can thus be detected via the MachineModuleInfo.
+  MCSymbol *Label = MF->getMMI().addLandingPad(MBB);
+
+  // Assign the call site to the landing pad's begin label.
+  MF->getMMI().setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
+
+  const MCInstrDesc &II = TM.getInstrInfo()->get(TargetOpcode::EH_LABEL);
+  BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
+    .addSym(Label);
+
+  // Mark exception register as live in.
+  unsigned Reg = TLI.getExceptionPointerRegister();
+  if (Reg) MBB->addLiveIn(Reg);
+
+  // Mark exception selector register as live in.
+  Reg = TLI.getExceptionSelectorRegister();
+  if (Reg) MBB->addLiveIn(Reg);
+}
+
+/// TryToFoldFastISelLoad - We're checking to see if we can fold the specified
+/// load into the specified FoldInst.  Note that we could have a sequence where
+/// multiple LLVM IR instructions are folded into the same machineinstr.  For
+/// example we could have:
+///   A: x = load i32 *P
+///   B: y = icmp A, 42
+///   C: br y, ...
+///
+/// In this scenario, LI is "A", and FoldInst is "C".  We know about "B" (and
+/// any other folded instructions) because it is between A and C.
+///
+/// If we succeed in folding the load into the operation, return true.
+///
+bool SelectionDAGISel::TryToFoldFastISelLoad(const LoadInst *LI,
+                                             const Instruction *FoldInst,
+                                             FastISel *FastIS) {
+  // We know that the load has a single use, but don't know what it is.  If it
+  // isn't one of the folded instructions, then we can't succeed here.  Handle
+  // this by scanning the single-use users of the load until we get to FoldInst.
+  unsigned MaxUsers = 6;  // Don't scan down huge single-use chains of instrs.
+
+  const Instruction *TheUser = LI->use_back();
+  while (TheUser != FoldInst &&   // Scan up until we find FoldInst.
+         // Stay in the right block.
+         TheUser->getParent() == FoldInst->getParent() &&
+         --MaxUsers) {  // Don't scan too far.
+    // If there are multiple or no uses of this instruction, then bail out.
+    if (!TheUser->hasOneUse())
+      return false;
+
+    TheUser = TheUser->use_back();
+  }
+
+  // If we didn't find the fold instruction, then we failed to collapse the
+  // sequence.
+  if (TheUser != FoldInst)
+    return false;
+
+  // Don't try to fold volatile loads.  Target has to deal with alignment
+  // constraints.
+  if (LI->isVolatile()) return false;
+
+  // Figure out which vreg this is going into.  If there is no assigned vreg yet
+  // then there actually was no reference to it.  Perhaps the load is referenced
+  // by a dead instruction.
+  unsigned LoadReg = FastIS->getRegForValue(LI);
+  if (LoadReg == 0)
+    return false;
+
+  // Check to see what the uses of this vreg are.  If it has no uses, or more
+  // than one use (at the machine instr level) then we can't fold it.
+  MachineRegisterInfo::reg_iterator RI = RegInfo->reg_begin(LoadReg);
+  if (RI == RegInfo->reg_end())
+    return false;
+
+  // See if there is exactly one use of the vreg.  If there are multiple uses,
+  // then the instruction got lowered to multiple machine instructions or the
+  // use of the loaded value ended up being multiple operands of the result, in
+  // either case, we can't fold this.
+  MachineRegisterInfo::reg_iterator PostRI = RI; ++PostRI;
+  if (PostRI != RegInfo->reg_end())
+    return false;
+
+  assert(RI.getOperand().isUse() &&
+         "The only use of the vreg must be a use, we haven't emitted the def!");
+
+  MachineInstr *User = &*RI;
+
+  // Set the insertion point properly.  Folding the load can cause generation of
+  // other random instructions (like sign extends) for addressing modes, make
+  // sure they get inserted in a logical place before the new instruction.
+  FuncInfo->InsertPt = User;
+  FuncInfo->MBB = User->getParent();
+
+  // Ask the target to try folding the load.
+  return FastIS->TryToFoldLoad(User, RI.getOperandNo(), LI);
+}
+
+/// isFoldedOrDeadInstruction - Return true if the specified instruction is
+/// side-effect free and is either dead or folded into a generated instruction.
+/// Return false if it needs to be emitted.
+static bool isFoldedOrDeadInstruction(const Instruction *I,
+                                      FunctionLoweringInfo *FuncInfo) {
+  return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded.
+         !isa<TerminatorInst>(I) && // Terminators aren't folded.
+         !isa<DbgInfoIntrinsic>(I) &&  // Debug instructions aren't folded.
+         !isa<LandingPadInst>(I) &&    // Landingpad instructions aren't folded.
+         !FuncInfo->isExportedInst(I); // Exported instrs must be computed.
+}
+
+#ifndef NDEBUG
+// Collect per Instruction statistics for fast-isel misses.  Only those
+// instructions that cause the bail are accounted for.  It does not account for
+// instructions higher in the block.  Thus, summing the per instructions stats
+// will not add up to what is reported by NumFastIselFailures.
+static void collectFailStats(const Instruction *I) {
+  switch (I->getOpcode()) {
+  default: assert (0 && "<Invalid operator> ");
+
+  // Terminators
+  case Instruction::Ret:         NumFastIselFailRet++; return;
+  case Instruction::Br:          NumFastIselFailBr++; return;
+  case Instruction::Switch:      NumFastIselFailSwitch++; return;
+  case Instruction::IndirectBr:  NumFastIselFailIndirectBr++; return;
+  case Instruction::Invoke:      NumFastIselFailInvoke++; return;
+  case Instruction::Resume:      NumFastIselFailResume++; return;
+  case Instruction::Unreachable: NumFastIselFailUnreachable++; return;
+
+  // Standard binary operators...
+  case Instruction::Add:  NumFastIselFailAdd++; return;
+  case Instruction::FAdd: NumFastIselFailFAdd++; return;
+  case Instruction::Sub:  NumFastIselFailSub++; return;
+  case Instruction::FSub: NumFastIselFailFSub++; return;
+  case Instruction::Mul:  NumFastIselFailMul++; return;
+  case Instruction::FMul: NumFastIselFailFMul++; return;
+  case Instruction::UDiv: NumFastIselFailUDiv++; return;
+  case Instruction::SDiv: NumFastIselFailSDiv++; return;
+  case Instruction::FDiv: NumFastIselFailFDiv++; return;
+  case Instruction::URem: NumFastIselFailURem++; return;
+  case Instruction::SRem: NumFastIselFailSRem++; return;
+  case Instruction::FRem: NumFastIselFailFRem++; return;
+
+  // Logical operators...
+  case Instruction::And: NumFastIselFailAnd++; return;
+  case Instruction::Or:  NumFastIselFailOr++; return;
+  case Instruction::Xor: NumFastIselFailXor++; return;
+
+  // Memory instructions...
+  case Instruction::Alloca:        NumFastIselFailAlloca++; return;
+  case Instruction::Load:          NumFastIselFailLoad++; return;
+  case Instruction::Store:         NumFastIselFailStore++; return;
+  case Instruction::AtomicCmpXchg: NumFastIselFailAtomicCmpXchg++; return;
+  case Instruction::AtomicRMW:     NumFastIselFailAtomicRMW++; return;
+  case Instruction::Fence:         NumFastIselFailFence++; return;
+  case Instruction::GetElementPtr: NumFastIselFailGetElementPtr++; return;
+
+  // Convert instructions...
+  case Instruction::Trunc:    NumFastIselFailTrunc++; return;
+  case Instruction::ZExt:     NumFastIselFailZExt++; return;
+  case Instruction::SExt:     NumFastIselFailSExt++; return;
+  case Instruction::FPTrunc:  NumFastIselFailFPTrunc++; return;
+  case Instruction::FPExt:    NumFastIselFailFPExt++; return;
+  case Instruction::FPToUI:   NumFastIselFailFPToUI++; return;
+  case Instruction::FPToSI:   NumFastIselFailFPToSI++; return;
+  case Instruction::UIToFP:   NumFastIselFailUIToFP++; return;
+  case Instruction::SIToFP:   NumFastIselFailSIToFP++; return;
+  case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return;
+  case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return;
+  case Instruction::BitCast:  NumFastIselFailBitCast++; return;
+
+  // Other instructions...
+  case Instruction::ICmp:           NumFastIselFailICmp++; return;
+  case Instruction::FCmp:           NumFastIselFailFCmp++; return;
+  case Instruction::PHI:            NumFastIselFailPHI++; return;
+  case Instruction::Select:         NumFastIselFailSelect++; return;
+  case Instruction::Call:           NumFastIselFailCall++; return;
+  case Instruction::Shl:            NumFastIselFailShl++; return;
+  case Instruction::LShr:           NumFastIselFailLShr++; return;
+  case Instruction::AShr:           NumFastIselFailAShr++; return;
+  case Instruction::VAArg:          NumFastIselFailVAArg++; return;
+  case Instruction::ExtractElement: NumFastIselFailExtractElement++; return;
+  case Instruction::InsertElement:  NumFastIselFailInsertElement++; return;
+  case Instruction::ShuffleVector:  NumFastIselFailShuffleVector++; return;
+  case Instruction::ExtractValue:   NumFastIselFailExtractValue++; return;
+  case Instruction::InsertValue:    NumFastIselFailInsertValue++; return;
+  case Instruction::LandingPad:     NumFastIselFailLandingPad++; return;
+  }
+}
+#endif
+
+void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
+  // Initialize the Fast-ISel state, if needed.
+  FastISel *FastIS = 0;
+  if (TM.Options.EnableFastISel)
+    FastIS = TLI.createFastISel(*FuncInfo, LibInfo);
+
+  // Iterate over all basic blocks in the function.
+  ReversePostOrderTraversal<const Function*> RPOT(&Fn);
+  for (ReversePostOrderTraversal<const Function*>::rpo_iterator
+       I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
+    const BasicBlock *LLVMBB = *I;
+
+    if (OptLevel != CodeGenOpt::None) {
+      bool AllPredsVisited = true;
+      for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB);
+           PI != PE; ++PI) {
+        if (!FuncInfo->VisitedBBs.count(*PI)) {
+          AllPredsVisited = false;
+          break;
+        }
+      }
+
+      if (AllPredsVisited) {
+        for (BasicBlock::const_iterator I = LLVMBB->begin();
+             const PHINode *PN = dyn_cast<PHINode>(I); ++I)
+          FuncInfo->ComputePHILiveOutRegInfo(PN);
+      } else {
+        for (BasicBlock::const_iterator I = LLVMBB->begin();
+             const PHINode *PN = dyn_cast<PHINode>(I); ++I)
+          FuncInfo->InvalidatePHILiveOutRegInfo(PN);
+      }
+
+      FuncInfo->VisitedBBs.insert(LLVMBB);
+    }
+
+    BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHI();
+    BasicBlock::const_iterator const End = LLVMBB->end();
+    BasicBlock::const_iterator BI = End;
+
+    FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB];
+    FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI();
+
+    // Setup an EH landing-pad block.
+    if (FuncInfo->MBB->isLandingPad())
+      PrepareEHLandingPad();
+
+    // Before doing SelectionDAG ISel, see if FastISel has been requested.
+    if (FastIS) {
+      FastIS->startNewBlock();
+
+      // Emit code for any incoming arguments. This must happen before
+      // beginning FastISel on the entry block.
+      if (LLVMBB == &Fn.getEntryBlock()) {
+        // Lower any arguments needed in this block if this is the entry block.
+        if (!FastIS->LowerArguments()) {
+          // Fast isel failed to lower these arguments
+          if (EnableFastISelAbortArgs)
+            llvm_unreachable("FastISel didn't lower all arguments");
+
+          // Use SelectionDAG argument lowering
+          LowerArguments(Fn);
+          CurDAG->setRoot(SDB->getControlRoot());
+          SDB->clear();
+          CodeGenAndEmitDAG();
+        }
+
+        // If we inserted any instructions at the beginning, make a note of
+        // where they are, so we can be sure to emit subsequent instructions
+        // after them.
+        if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
+          FastIS->setLastLocalValue(llvm::prior(FuncInfo->InsertPt));
+        else
+          FastIS->setLastLocalValue(0);
+      }
+
+      unsigned NumFastIselRemaining = std::distance(Begin, End);
+      // Do FastISel on as many instructions as possible.
+      for (; BI != Begin; --BI) {
+        const Instruction *Inst = llvm::prior(BI);
+
+        // If we no longer require this instruction, skip it.
+        if (isFoldedOrDeadInstruction(Inst, FuncInfo)) {
+          --NumFastIselRemaining;
+          continue;
+        }
+
+        // Bottom-up: reset the insert pos at the top, after any local-value
+        // instructions.
+        FastIS->recomputeInsertPt();
+
+        // Try to select the instruction with FastISel.
+        if (FastIS->SelectInstruction(Inst)) {
+          --NumFastIselRemaining;
+          ++NumFastIselSuccess;
+          // If fast isel succeeded, skip over all the folded instructions, and
+          // then see if there is a load right before the selected instructions.
+          // Try to fold the load if so.
+          const Instruction *BeforeInst = Inst;
+          while (BeforeInst != Begin) {
+            BeforeInst = llvm::prior(BasicBlock::const_iterator(BeforeInst));
+            if (!isFoldedOrDeadInstruction(BeforeInst, FuncInfo))
+              break;
+          }
+          if (BeforeInst != Inst && isa<LoadInst>(BeforeInst) &&
+              BeforeInst->hasOneUse() &&
+              TryToFoldFastISelLoad(cast<LoadInst>(BeforeInst), Inst, FastIS)) {
+            // If we succeeded, don't re-select the load.
+            BI = llvm::next(BasicBlock::const_iterator(BeforeInst));
+            --NumFastIselRemaining;
+            ++NumFastIselSuccess;
+          }
+          continue;
+        }
+
+#ifndef NDEBUG
+        if (EnableFastISelVerbose2)
+          collectFailStats(Inst);
+#endif
+
+        // Then handle certain instructions as single-LLVM-Instruction blocks.
+        if (isa<CallInst>(Inst)) {
+
+          if (EnableFastISelVerbose || EnableFastISelAbort) {
+            dbgs() << "FastISel missed call: ";
+            Inst->dump();
+          }
+
+          if (!Inst->getType()->isVoidTy() && !Inst->use_empty()) {
+            unsigned &R = FuncInfo->ValueMap[Inst];
+            if (!R)
+              R = FuncInfo->CreateRegs(Inst->getType());
+          }
+
+          bool HadTailCall = false;
+          MachineBasicBlock::iterator SavedInsertPt = FuncInfo->InsertPt;
+          SelectBasicBlock(Inst, BI, HadTailCall);
+
+          // If the call was emitted as a tail call, we're done with the block.
+          // We also need to delete any previously emitted instructions.
+          if (HadTailCall) {
+            FastIS->removeDeadCode(SavedInsertPt, FuncInfo->MBB->end());
+            --BI;
+            break;
+          }
+
+          // Recompute NumFastIselRemaining as Selection DAG instruction
+          // selection may have handled the call, input args, etc.
+          unsigned RemainingNow = std::distance(Begin, BI);
+          NumFastIselFailures += NumFastIselRemaining - RemainingNow;
+          NumFastIselRemaining = RemainingNow;
+          continue;
+        }
+
+        if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) {
+          // Don't abort, and use a different message for terminator misses.
+          NumFastIselFailures += NumFastIselRemaining;
+          if (EnableFastISelVerbose || EnableFastISelAbort) {
+            dbgs() << "FastISel missed terminator: ";
+            Inst->dump();
+          }
+        } else {
+          NumFastIselFailures += NumFastIselRemaining;
+          if (EnableFastISelVerbose || EnableFastISelAbort) {
+            dbgs() << "FastISel miss: ";
+            Inst->dump();
+          }
+          if (EnableFastISelAbort)
+            // The "fast" selector couldn't handle something and bailed.
+            // For the purpose of debugging, just abort.
+            llvm_unreachable("FastISel didn't select the entire block");
+        }
+        break;
+      }
+
+      FastIS->recomputeInsertPt();
+    } else {
+      // Lower any arguments needed in this block if this is the entry block.
+      if (LLVMBB == &Fn.getEntryBlock())
+        LowerArguments(Fn);
+    }
+
+    if (Begin != BI)
+      ++NumDAGBlocks;
+    else
+      ++NumFastIselBlocks;
+
+    if (Begin != BI) {
+      // Run SelectionDAG instruction selection on the remainder of the block
+      // not handled by FastISel. If FastISel is not run, this is the entire
+      // block.
+      bool HadTailCall;
+      SelectBasicBlock(Begin, BI, HadTailCall);
+    }
+
+    FinishBasicBlock();
+    FuncInfo->PHINodesToUpdate.clear();
+  }
+
+  delete FastIS;
+  SDB->clearDanglingDebugInfo();
+}
+
+void
+SelectionDAGISel::FinishBasicBlock() {
+
+  DEBUG(dbgs() << "Total amount of phi nodes to update: "
+               << FuncInfo->PHINodesToUpdate.size() << "\n";
+        for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i)
+          dbgs() << "Node " << i << " : ("
+                 << FuncInfo->PHINodesToUpdate[i].first
+                 << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
+
+  // Next, now that we know what the last MBB the LLVM BB expanded is, update
+  // PHI nodes in successors.
+  if (SDB->SwitchCases.empty() &&
+      SDB->JTCases.empty() &&
+      SDB->BitTestCases.empty()) {
+    for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
+      MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
+      assert(PHI->isPHI() &&
+             "This is not a machine PHI node that we are updating!");
+      if (!FuncInfo->MBB->isSuccessor(PHI->getParent()))
+        continue;
+      PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
+    }
+    return;
+  }
+
+  for (unsigned i = 0, e = SDB->BitTestCases.size(); i != e; ++i) {
+    // Lower header first, if it wasn't already lowered
+    if (!SDB->BitTestCases[i].Emitted) {
+      // Set the current basic block to the mbb we wish to insert the code into
+      FuncInfo->MBB = SDB->BitTestCases[i].Parent;
+      FuncInfo->InsertPt = FuncInfo->MBB->end();
+      // Emit the code
+      SDB->visitBitTestHeader(SDB->BitTestCases[i], FuncInfo->MBB);
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+    }
+
+    uint32_t UnhandledWeight = 0;
+    for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j)
+      UnhandledWeight += SDB->BitTestCases[i].Cases[j].ExtraWeight;
+
+    for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) {
+      UnhandledWeight -= SDB->BitTestCases[i].Cases[j].ExtraWeight;
+      // Set the current basic block to the mbb we wish to insert the code into
+      FuncInfo->MBB = SDB->BitTestCases[i].Cases[j].ThisBB;
+      FuncInfo->InsertPt = FuncInfo->MBB->end();
+      // Emit the code
+      if (j+1 != ej)
+        SDB->visitBitTestCase(SDB->BitTestCases[i],
+                              SDB->BitTestCases[i].Cases[j+1].ThisBB,
+                              UnhandledWeight,
+                              SDB->BitTestCases[i].Reg,
+                              SDB->BitTestCases[i].Cases[j],
+                              FuncInfo->MBB);
+      else
+        SDB->visitBitTestCase(SDB->BitTestCases[i],
+                              SDB->BitTestCases[i].Default,
+                              UnhandledWeight,
+                              SDB->BitTestCases[i].Reg,
+                              SDB->BitTestCases[i].Cases[j],
+                              FuncInfo->MBB);
+
+
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+    }
+
+    // Update PHI Nodes
+    for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size();
+         pi != pe; ++pi) {
+      MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[pi].first);
+      MachineBasicBlock *PHIBB = PHI->getParent();
+      assert(PHI->isPHI() &&
+             "This is not a machine PHI node that we are updating!");
+      // This is "default" BB. We have two jumps to it. From "header" BB and
+      // from last "case" BB.
+      if (PHIBB == SDB->BitTestCases[i].Default)
+        PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second)
+           .addMBB(SDB->BitTestCases[i].Parent)
+           .addReg(FuncInfo->PHINodesToUpdate[pi].second)
+           .addMBB(SDB->BitTestCases[i].Cases.back().ThisBB);
+      // One of "cases" BB.
+      for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size();
+           j != ej; ++j) {
+        MachineBasicBlock* cBB = SDB->BitTestCases[i].Cases[j].ThisBB;
+        if (cBB->isSuccessor(PHIBB))
+          PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(cBB);
+      }
+    }
+  }
+  SDB->BitTestCases.clear();
+
+  // If the JumpTable record is filled in, then we need to emit a jump table.
+  // Updating the PHI nodes is tricky in this case, since we need to determine
+  // whether the PHI is a successor of the range check MBB or the jump table MBB
+  for (unsigned i = 0, e = SDB->JTCases.size(); i != e; ++i) {
+    // Lower header first, if it wasn't already lowered
+    if (!SDB->JTCases[i].first.Emitted) {
+      // Set the current basic block to the mbb we wish to insert the code into
+      FuncInfo->MBB = SDB->JTCases[i].first.HeaderBB;
+      FuncInfo->InsertPt = FuncInfo->MBB->end();
+      // Emit the code
+      SDB->visitJumpTableHeader(SDB->JTCases[i].second, SDB->JTCases[i].first,
+                                FuncInfo->MBB);
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+    }
+
+    // Set the current basic block to the mbb we wish to insert the code into
+    FuncInfo->MBB = SDB->JTCases[i].second.MBB;
+    FuncInfo->InsertPt = FuncInfo->MBB->end();
+    // Emit the code
+    SDB->visitJumpTable(SDB->JTCases[i].second);
+    CurDAG->setRoot(SDB->getRoot());
+    SDB->clear();
+    CodeGenAndEmitDAG();
+
+    // Update PHI Nodes
+    for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size();
+         pi != pe; ++pi) {
+      MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[pi].first);
+      MachineBasicBlock *PHIBB = PHI->getParent();
+      assert(PHI->isPHI() &&
+             "This is not a machine PHI node that we are updating!");
+      // "default" BB. We can go there only from header BB.
+      if (PHIBB == SDB->JTCases[i].second.Default)
+        PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second)
+           .addMBB(SDB->JTCases[i].first.HeaderBB);
+      // JT BB. Just iterate over successors here
+      if (FuncInfo->MBB->isSuccessor(PHIBB))
+        PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(FuncInfo->MBB);
+    }
+  }
+  SDB->JTCases.clear();
+
+  // If the switch block involved a branch to one of the actual successors, we
+  // need to update PHI nodes in that block.
+  for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
+    MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
+    assert(PHI->isPHI() &&
+           "This is not a machine PHI node that we are updating!");
+    if (FuncInfo->MBB->isSuccessor(PHI->getParent()))
+      PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
+  }
+
+  // If we generated any switch lowering information, build and codegen any
+  // additional DAGs necessary.
+  for (unsigned i = 0, e = SDB->SwitchCases.size(); i != e; ++i) {
+    // Set the current basic block to the mbb we wish to insert the code into
+    FuncInfo->MBB = SDB->SwitchCases[i].ThisBB;
+    FuncInfo->InsertPt = FuncInfo->MBB->end();
+
+    // Determine the unique successors.
+    SmallVector<MachineBasicBlock *, 2> Succs;
+    Succs.push_back(SDB->SwitchCases[i].TrueBB);
+    if (SDB->SwitchCases[i].TrueBB != SDB->SwitchCases[i].FalseBB)
+      Succs.push_back(SDB->SwitchCases[i].FalseBB);
+
+    // Emit the code. Note that this could result in FuncInfo->MBB being split.
+    SDB->visitSwitchCase(SDB->SwitchCases[i], FuncInfo->MBB);
+    CurDAG->setRoot(SDB->getRoot());
+    SDB->clear();
+    CodeGenAndEmitDAG();
+
+    // Remember the last block, now that any splitting is done, for use in
+    // populating PHI nodes in successors.
+    MachineBasicBlock *ThisBB = FuncInfo->MBB;
+
+    // Handle any PHI nodes in successors of this chunk, as if we were coming
+    // from the original BB before switch expansion.  Note that PHI nodes can
+    // occur multiple times in PHINodesToUpdate.  We have to be very careful to
+    // handle them the right number of times.
+    for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
+      FuncInfo->MBB = Succs[i];
+      FuncInfo->InsertPt = FuncInfo->MBB->end();
+      // FuncInfo->MBB may have been removed from the CFG if a branch was
+      // constant folded.
+      if (ThisBB->isSuccessor(FuncInfo->MBB)) {
+        for (MachineBasicBlock::iterator
+             MBBI = FuncInfo->MBB->begin(), MBBE = FuncInfo->MBB->end();
+             MBBI != MBBE && MBBI->isPHI(); ++MBBI) {
+          MachineInstrBuilder PHI(*MF, MBBI);
+          // This value for this PHI node is recorded in PHINodesToUpdate.
+          for (unsigned pn = 0; ; ++pn) {
+            assert(pn != FuncInfo->PHINodesToUpdate.size() &&
+                   "Didn't find PHI entry!");
+            if (FuncInfo->PHINodesToUpdate[pn].first == PHI) {
+              PHI.addReg(FuncInfo->PHINodesToUpdate[pn].second).addMBB(ThisBB);
+              break;
+            }
+          }
+        }
+      }
+    }
+  }
+  SDB->SwitchCases.clear();
+}
+
+
+/// Create the scheduler. If a specific scheduler was specified
+/// via the SchedulerRegistry, use it, otherwise select the
+/// one preferred by the target.
+///
+ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() {
+  RegisterScheduler::FunctionPassCtor Ctor = RegisterScheduler::getDefault();
+
+  if (!Ctor) {
+    Ctor = ISHeuristic;
+    RegisterScheduler::setDefault(Ctor);
+  }
+
+  return Ctor(this, OptLevel);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions used by the generated instruction selector.
+//===----------------------------------------------------------------------===//
+// Calls to these methods are generated by tblgen.
+
+/// CheckAndMask - The isel is trying to match something like (and X, 255).  If
+/// the dag combiner simplified the 255, we still want to match.  RHS is the
+/// actual value in the DAG on the RHS of an AND, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
+bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS,
+                                    int64_t DesiredMaskS) const {
+  const APInt &ActualMask = RHS->getAPIntValue();
+  const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+  // If the actual mask exactly matches, success!
+  if (ActualMask == DesiredMask)
+    return true;
+
+  // If the actual AND mask is allowing unallowed bits, this doesn't match.
+  if (ActualMask.intersects(~DesiredMask))
+    return false;
+
+  // Otherwise, the DAG Combiner may have proven that the value coming in is
+  // either already zero or is not demanded.  Check for known zero input bits.
+  APInt NeededMask = DesiredMask & ~ActualMask;
+  if (CurDAG->MaskedValueIsZero(LHS, NeededMask))
+    return true;
+
+  // TODO: check to see if missing bits are just not demanded.
+
+  // Otherwise, this pattern doesn't match.
+  return false;
+}
+
+/// CheckOrMask - The isel is trying to match something like (or X, 255).  If
+/// the dag combiner simplified the 255, we still want to match.  RHS is the
+/// actual value in the DAG on the RHS of an OR, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
+bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS,
+                                   int64_t DesiredMaskS) const {
+  const APInt &ActualMask = RHS->getAPIntValue();
+  const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+  // If the actual mask exactly matches, success!
+  if (ActualMask == DesiredMask)
+    return true;
+
+  // If the actual AND mask is allowing unallowed bits, this doesn't match.
+  if (ActualMask.intersects(~DesiredMask))
+    return false;
+
+  // Otherwise, the DAG Combiner may have proven that the value coming in is
+  // either already zero or is not demanded.  Check for known zero input bits.
+  APInt NeededMask = DesiredMask & ~ActualMask;
+
+  APInt KnownZero, KnownOne;
+  CurDAG->ComputeMaskedBits(LHS, KnownZero, KnownOne);
+
+  // If all the missing bits in the or are already known to be set, match!
+  if ((NeededMask & KnownOne) == NeededMask)
+    return true;
+
+  // TODO: check to see if missing bits are just not demanded.
+
+  // Otherwise, this pattern doesn't match.
+  return false;
+}
+
+
+/// SelectInlineAsmMemoryOperands - Calls to this are automatically generated
+/// by tblgen.  Others should not call it.
+void SelectionDAGISel::
+SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops) {
+  std::vector<SDValue> InOps;
+  std::swap(InOps, Ops);
+
+  Ops.push_back(InOps[InlineAsm::Op_InputChain]); // 0
+  Ops.push_back(InOps[InlineAsm::Op_AsmString]);  // 1
+  Ops.push_back(InOps[InlineAsm::Op_MDNode]);     // 2, !srcloc
+  Ops.push_back(InOps[InlineAsm::Op_ExtraInfo]);  // 3 (SideEffect, AlignStack)
+
+  unsigned i = InlineAsm::Op_FirstOperand, e = InOps.size();
+  if (InOps[e-1].getValueType() == MVT::Glue)
+    --e;  // Don't process a glue operand if it is here.
+
+  while (i != e) {
+    unsigned Flags = cast<ConstantSDNode>(InOps[i])->getZExtValue();
+    if (!InlineAsm::isMemKind(Flags)) {
+      // Just skip over this operand, copying the operands verbatim.
+      Ops.insert(Ops.end(), InOps.begin()+i,
+                 InOps.begin()+i+InlineAsm::getNumOperandRegisters(Flags) + 1);
+      i += InlineAsm::getNumOperandRegisters(Flags) + 1;
+    } else {
+      assert(InlineAsm::getNumOperandRegisters(Flags) == 1 &&
+             "Memory operand with multiple values?");
+      // Otherwise, this is a memory operand.  Ask the target to select it.
+      std::vector<SDValue> SelOps;
+      if (SelectInlineAsmMemoryOperand(InOps[i+1], 'm', SelOps))
+        report_fatal_error("Could not match memory address.  Inline asm"
+                           " failure!");
+
+      // Add this to the output node.
+      unsigned NewFlags =
+        InlineAsm::getFlagWord(InlineAsm::Kind_Mem, SelOps.size());
+      Ops.push_back(CurDAG->getTargetConstant(NewFlags, MVT::i32));
+      Ops.insert(Ops.end(), SelOps.begin(), SelOps.end());
+      i += 2;
+    }
+  }
+
+  // Add the glue input back if present.
+  if (e != InOps.size())
+    Ops.push_back(InOps.back());
+}
+
+/// findGlueUse - Return use of MVT::Glue value produced by the specified
+/// SDNode.
+///
+static SDNode *findGlueUse(SDNode *N) {
+  unsigned FlagResNo = N->getNumValues()-1;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDUse &Use = I.getUse();
+    if (Use.getResNo() == FlagResNo)
+      return Use.getUser();
+  }
+  return NULL;
+}
+
+/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
+/// This function recursively traverses up the operand chain, ignoring
+/// certain nodes.
+static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+                          SDNode *Root, SmallPtrSet<SDNode*, 16> &Visited,
+                          bool IgnoreChains) {
+  // The NodeID's are given uniques ID's where a node ID is guaranteed to be
+  // greater than all of its (recursive) operands.  If we scan to a point where
+  // 'use' is smaller than the node we're scanning for, then we know we will
+  // never find it.
+  //
+  // The Use may be -1 (unassigned) if it is a newly allocated node.  This can
+  // happen because we scan down to newly selected nodes in the case of glue
+  // uses.
+  if ((Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1))
+    return false;
+
+  // Don't revisit nodes if we already scanned it and didn't fail, we know we
+  // won't fail if we scan it again.
+  if (!Visited.insert(Use))
+    return false;
+
+  for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
+    // Ignore chain uses, they are validated by HandleMergeInputChains.
+    if (Use->getOperand(i).getValueType() == MVT::Other && IgnoreChains)
+      continue;
+
+    SDNode *N = Use->getOperand(i).getNode();
+    if (N == Def) {
+      if (Use == ImmedUse || Use == Root)
+        continue;  // We are not looking for immediate use.
+      assert(N != Root);
+      return true;
+    }
+
+    // Traverse up the operand chain.
+    if (findNonImmUse(N, Def, ImmedUse, Root, Visited, IgnoreChains))
+      return true;
+  }
+  return false;
+}
+
+/// IsProfitableToFold - Returns true if it's profitable to fold the specific
+/// operand node N of U during instruction selection that starts at Root.
+bool SelectionDAGISel::IsProfitableToFold(SDValue N, SDNode *U,
+                                          SDNode *Root) const {
+  if (OptLevel == CodeGenOpt::None) return false;
+  return N.hasOneUse();
+}
+
+/// IsLegalToFold - Returns true if the specific operand node N of
+/// U can be folded during instruction selection that starts at Root.
+bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
+                                     CodeGenOpt::Level OptLevel,
+                                     bool IgnoreChains) {
+  if (OptLevel == CodeGenOpt::None) return false;
+
+  // If Root use can somehow reach N through a path that that doesn't contain
+  // U then folding N would create a cycle. e.g. In the following
+  // diagram, Root can reach N through X. If N is folded into into Root, then
+  // X is both a predecessor and a successor of U.
+  //
+  //          [N*]           //
+  //         ^   ^           //
+  //        /     \          //
+  //      [U*]    [X]?       //
+  //        ^     ^          //
+  //         \   /           //
+  //          \ /            //
+  //         [Root*]         //
+  //
+  // * indicates nodes to be folded together.
+  //
+  // If Root produces glue, then it gets (even more) interesting. Since it
+  // will be "glued" together with its glue use in the scheduler, we need to
+  // check if it might reach N.
+  //
+  //          [N*]           //
+  //         ^   ^           //
+  //        /     \          //
+  //      [U*]    [X]?       //
+  //        ^       ^        //
+  //         \       \       //
+  //          \      |       //
+  //         [Root*] |       //
+  //          ^      |       //
+  //          f      |       //
+  //          |      /       //
+  //         [Y]    /        //
+  //           ^   /         //
+  //           f  /          //
+  //           | /           //
+  //          [GU]           //
+  //
+  // If GU (glue use) indirectly reaches N (the load), and Root folds N
+  // (call it Fold), then X is a predecessor of GU and a successor of
+  // Fold. But since Fold and GU are glued together, this will create
+  // a cycle in the scheduling graph.
+
+  // If the node has glue, walk down the graph to the "lowest" node in the
+  // glueged set.
+  EVT VT = Root->getValueType(Root->getNumValues()-1);
+  while (VT == MVT::Glue) {
+    SDNode *GU = findGlueUse(Root);
+    if (GU == NULL)
+      break;
+    Root = GU;
+    VT = Root->getValueType(Root->getNumValues()-1);
+
+    // If our query node has a glue result with a use, we've walked up it.  If
+    // the user (which has already been selected) has a chain or indirectly uses
+    // the chain, our WalkChainUsers predicate will not consider it.  Because of
+    // this, we cannot ignore chains in this predicate.
+    IgnoreChains = false;
+  }
+
+
+  SmallPtrSet<SDNode*, 16> Visited;
+  return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
+}
+
+SDNode *SelectionDAGISel::Select_INLINEASM(SDNode *N) {
+  std::vector<SDValue> Ops(N->op_begin(), N->op_end());
+  SelectInlineAsmMemoryOperands(Ops);
+
+  EVT VTs[] = { MVT::Other, MVT::Glue };
+  SDValue New = CurDAG->getNode(ISD::INLINEASM, N->getDebugLoc(),
+                                VTs, &Ops[0], Ops.size());
+  New->setNodeId(-1);
+  return New.getNode();
+}
+
+SDNode *SelectionDAGISel::Select_UNDEF(SDNode *N) {
+  return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF,N->getValueType(0));
+}
+
+/// GetVBR - decode a vbr encoding whose top bit is set.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t
+GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
+  assert(Val >= 128 && "Not a VBR");
+  Val &= 127;  // Remove first vbr bit.
+
+  unsigned Shift = 7;
+  uint64_t NextBits;
+  do {
+    NextBits = MatcherTable[Idx++];
+    Val |= (NextBits&127) << Shift;
+    Shift += 7;
+  } while (NextBits & 128);
+
+  return Val;
+}
+
+
+/// UpdateChainsAndGlue - When a match is complete, this method updates uses of
+/// interior glue and chain results to use the new glue and chain results.
+void SelectionDAGISel::
+UpdateChainsAndGlue(SDNode *NodeToMatch, SDValue InputChain,
+                    const SmallVectorImpl<SDNode*> &ChainNodesMatched,
+                    SDValue InputGlue,
+                    const SmallVectorImpl<SDNode*> &GlueResultNodesMatched,
+                    bool isMorphNodeTo) {
+  SmallVector<SDNode*, 4> NowDeadNodes;
+
+  // Now that all the normal results are replaced, we replace the chain and
+  // glue results if present.
+  if (!ChainNodesMatched.empty()) {
+    assert(InputChain.getNode() != 0 &&
+           "Matched input chains but didn't produce a chain");
+    // Loop over all of the nodes we matched that produced a chain result.
+    // Replace all the chain results with the final chain we ended up with.
+    for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+      SDNode *ChainNode = ChainNodesMatched[i];
+
+      // If this node was already deleted, don't look at it.
+      if (ChainNode->getOpcode() == ISD::DELETED_NODE)
+        continue;
+
+      // Don't replace the results of the root node if we're doing a
+      // MorphNodeTo.
+      if (ChainNode == NodeToMatch && isMorphNodeTo)
+        continue;
+
+      SDValue ChainVal = SDValue(ChainNode, ChainNode->getNumValues()-1);
+      if (ChainVal.getValueType() == MVT::Glue)
+        ChainVal = ChainVal.getValue(ChainVal->getNumValues()-2);
+      assert(ChainVal.getValueType() == MVT::Other && "Not a chain?");
+      CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain);
+
+      // If the node became dead and we haven't already seen it, delete it.
+      if (ChainNode->use_empty() &&
+          !std::count(NowDeadNodes.begin(), NowDeadNodes.end(), ChainNode))
+        NowDeadNodes.push_back(ChainNode);
+    }
+  }
+
+  // If the result produces glue, update any glue results in the matched
+  // pattern with the glue result.
+  if (InputGlue.getNode() != 0) {
+    // Handle any interior nodes explicitly marked.
+    for (unsigned i = 0, e = GlueResultNodesMatched.size(); i != e; ++i) {
+      SDNode *FRN = GlueResultNodesMatched[i];
+
+      // If this node was already deleted, don't look at it.
+      if (FRN->getOpcode() == ISD::DELETED_NODE)
+        continue;
+
+      assert(FRN->getValueType(FRN->getNumValues()-1) == MVT::Glue &&
+             "Doesn't have a glue result");
+      CurDAG->ReplaceAllUsesOfValueWith(SDValue(FRN, FRN->getNumValues()-1),
+                                        InputGlue);
+
+      // If the node became dead and we haven't already seen it, delete it.
+      if (FRN->use_empty() &&
+          !std::count(NowDeadNodes.begin(), NowDeadNodes.end(), FRN))
+        NowDeadNodes.push_back(FRN);
+    }
+  }
+
+  if (!NowDeadNodes.empty())
+    CurDAG->RemoveDeadNodes(NowDeadNodes);
+
+  DEBUG(errs() << "ISEL: Match complete!\n");
+}
+
+enum ChainResult {
+  CR_Simple,
+  CR_InducesCycle,
+  CR_LeadsToInteriorNode
+};
+
+/// WalkChainUsers - Walk down the users of the specified chained node that is
+/// part of the pattern we're matching, looking at all of the users we find.
+/// This determines whether something is an interior node, whether we have a
+/// non-pattern node in between two pattern nodes (which prevent folding because
+/// it would induce a cycle) and whether we have a TokenFactor node sandwiched
+/// between pattern nodes (in which case the TF becomes part of the pattern).
+///
+/// The walk we do here is guaranteed to be small because we quickly get down to
+/// already selected nodes "below" us.
+static ChainResult
+WalkChainUsers(const SDNode *ChainedNode,
+               SmallVectorImpl<SDNode*> &ChainedNodesInPattern,
+               SmallVectorImpl<SDNode*> &InteriorChainedNodes) {
+  ChainResult Result = CR_Simple;
+
+  for (SDNode::use_iterator UI = ChainedNode->use_begin(),
+         E = ChainedNode->use_end(); UI != E; ++UI) {
+    // Make sure the use is of the chain, not some other value we produce.
+    if (UI.getUse().getValueType() != MVT::Other) continue;
+
+    SDNode *User = *UI;
+
+    // If we see an already-selected machine node, then we've gone beyond the
+    // pattern that we're selecting down into the already selected chunk of the
+    // DAG.
+    if (User->isMachineOpcode() ||
+        User->getOpcode() == ISD::HANDLENODE)  // Root of the graph.
+      continue;
+
+    unsigned UserOpcode = User->getOpcode();
+    if (UserOpcode == ISD::CopyToReg ||
+        UserOpcode == ISD::CopyFromReg ||
+        UserOpcode == ISD::INLINEASM ||
+        UserOpcode == ISD::EH_LABEL ||
+        UserOpcode == ISD::LIFETIME_START ||
+        UserOpcode == ISD::LIFETIME_END) {
+      // If their node ID got reset to -1 then they've already been selected.
+      // Treat them like a MachineOpcode.
+      if (User->getNodeId() == -1)
+        continue;
+    }
+
+    // If we have a TokenFactor, we handle it specially.
+    if (User->getOpcode() != ISD::TokenFactor) {
+      // If the node isn't a token factor and isn't part of our pattern, then it
+      // must be a random chained node in between two nodes we're selecting.
+      // This happens when we have something like:
+      //   x = load ptr
+      //   call
+      //   y = x+4
+      //   store y -> ptr
+      // Because we structurally match the load/store as a read/modify/write,
+      // but the call is chained between them.  We cannot fold in this case
+      // because it would induce a cycle in the graph.
+      if (!std::count(ChainedNodesInPattern.begin(),
+                      ChainedNodesInPattern.end(), User))
+        return CR_InducesCycle;
+
+      // Otherwise we found a node that is part of our pattern.  For example in:
+      //   x = load ptr
+      //   y = x+4
+      //   store y -> ptr
+      // This would happen when we're scanning down from the load and see the
+      // store as a user.  Record that there is a use of ChainedNode that is
+      // part of the pattern and keep scanning uses.
+      Result = CR_LeadsToInteriorNode;
+      InteriorChainedNodes.push_back(User);
+      continue;
+    }
+
+    // If we found a TokenFactor, there are two cases to consider: first if the
+    // TokenFactor is just hanging "below" the pattern we're matching (i.e. no
+    // uses of the TF are in our pattern) we just want to ignore it.  Second,
+    // the TokenFactor can be sandwiched in between two chained nodes, like so:
+    //     [Load chain]
+    //         ^
+    //         |
+    //       [Load]
+    //       ^    ^
+    //       |    \                    DAG's like cheese
+    //      /       \                       do you?
+    //     /         |
+    // [TokenFactor] [Op]
+    //     ^          ^
+    //     |          |
+    //      \        /
+    //       \      /
+    //       [Store]
+    //
+    // In this case, the TokenFactor becomes part of our match and we rewrite it
+    // as a new TokenFactor.
+    //
+    // To distinguish these two cases, do a recursive walk down the uses.
+    switch (WalkChainUsers(User, ChainedNodesInPattern, InteriorChainedNodes)) {
+    case CR_Simple:
+      // If the uses of the TokenFactor are just already-selected nodes, ignore
+      // it, it is "below" our pattern.
+      continue;
+    case CR_InducesCycle:
+      // If the uses of the TokenFactor lead to nodes that are not part of our
+      // pattern that are not selected, folding would turn this into a cycle,
+      // bail out now.
+      return CR_InducesCycle;
+    case CR_LeadsToInteriorNode:
+      break;  // Otherwise, keep processing.
+    }
+
+    // Okay, we know we're in the interesting interior case.  The TokenFactor
+    // is now going to be considered part of the pattern so that we rewrite its
+    // uses (it may have uses that are not part of the pattern) with the
+    // ultimate chain result of the generated code.  We will also add its chain
+    // inputs as inputs to the ultimate TokenFactor we create.
+    Result = CR_LeadsToInteriorNode;
+    ChainedNodesInPattern.push_back(User);
+    InteriorChainedNodes.push_back(User);
+    continue;
+  }
+
+  return Result;
+}
+
+/// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains
+/// operation for when the pattern matched at least one node with a chains.  The
+/// input vector contains a list of all of the chained nodes that we match.  We
+/// must determine if this is a valid thing to cover (i.e. matching it won't
+/// induce cycles in the DAG) and if so, creating a TokenFactor node. that will
+/// be used as the input node chain for the generated nodes.
+static SDValue
+HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
+                       SelectionDAG *CurDAG) {
+  // Walk all of the chained nodes we've matched, recursively scanning down the
+  // users of the chain result. This adds any TokenFactor nodes that are caught
+  // in between chained nodes to the chained and interior nodes list.
+  SmallVector<SDNode*, 3> InteriorChainedNodes;
+  for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+    if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched,
+                       InteriorChainedNodes) == CR_InducesCycle)
+      return SDValue(); // Would induce a cycle.
+  }
+
+  // Okay, we have walked all the matched nodes and collected TokenFactor nodes
+  // that we are interested in.  Form our input TokenFactor node.
+  SmallVector<SDValue, 3> InputChains;
+  for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+    // Add the input chain of this node to the InputChains list (which will be
+    // the operands of the generated TokenFactor) if it's not an interior node.
+    SDNode *N = ChainNodesMatched[i];
+    if (N->getOpcode() != ISD::TokenFactor) {
+      if (std::count(InteriorChainedNodes.begin(),InteriorChainedNodes.end(),N))
+        continue;
+
+      // Otherwise, add the input chain.
+      SDValue InChain = ChainNodesMatched[i]->getOperand(0);
+      assert(InChain.getValueType() == MVT::Other && "Not a chain");
+      InputChains.push_back(InChain);
+      continue;
+    }
+
+    // If we have a token factor, we want to add all inputs of the token factor
+    // that are not part of the pattern we're matching.
+    for (unsigned op = 0, e = N->getNumOperands(); op != e; ++op) {
+      if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(),
+                      N->getOperand(op).getNode()))
+        InputChains.push_back(N->getOperand(op));
+    }
+  }
+
+  SDValue Res;
+  if (InputChains.size() == 1)
+    return InputChains[0];
+  return CurDAG->getNode(ISD::TokenFactor, ChainNodesMatched[0]->getDebugLoc(),
+                         MVT::Other, &InputChains[0], InputChains.size());
+}
+
+/// MorphNode - Handle morphing a node in place for the selector.
+SDNode *SelectionDAGISel::
+MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
+          const SDValue *Ops, unsigned NumOps, unsigned EmitNodeInfo) {
+  // It is possible we're using MorphNodeTo to replace a node with no
+  // normal results with one that has a normal result (or we could be
+  // adding a chain) and the input could have glue and chains as well.
+  // In this case we need to shift the operands down.
+  // FIXME: This is a horrible hack and broken in obscure cases, no worse
+  // than the old isel though.
+  int OldGlueResultNo = -1, OldChainResultNo = -1;
+
+  unsigned NTMNumResults = Node->getNumValues();
+  if (Node->getValueType(NTMNumResults-1) == MVT::Glue) {
+    OldGlueResultNo = NTMNumResults-1;
+    if (NTMNumResults != 1 &&
+        Node->getValueType(NTMNumResults-2) == MVT::Other)
+      OldChainResultNo = NTMNumResults-2;
+  } else if (Node->getValueType(NTMNumResults-1) == MVT::Other)
+    OldChainResultNo = NTMNumResults-1;
+
+  // Call the underlying SelectionDAG routine to do the transmogrification. Note
+  // that this deletes operands of the old node that become dead.
+  SDNode *Res = CurDAG->MorphNodeTo(Node, ~TargetOpc, VTList, Ops, NumOps);
+
+  // MorphNodeTo can operate in two ways: if an existing node with the
+  // specified operands exists, it can just return it.  Otherwise, it
+  // updates the node in place to have the requested operands.
+  if (Res == Node) {
+    // If we updated the node in place, reset the node ID.  To the isel,
+    // this should be just like a newly allocated machine node.
+    Res->setNodeId(-1);
+  }
+
+  unsigned ResNumResults = Res->getNumValues();
+  // Move the glue if needed.
+  if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 &&
+      (unsigned)OldGlueResultNo != ResNumResults-1)
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldGlueResultNo),
+                                      SDValue(Res, ResNumResults-1));
+
+  if ((EmitNodeInfo & OPFL_GlueOutput) != 0)
+    --ResNumResults;
+
+  // Move the chain reference if needed.
+  if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
+      (unsigned)OldChainResultNo != ResNumResults-1)
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo),
+                                      SDValue(Res, ResNumResults-1));
+
+  // Otherwise, no replacement happened because the node already exists. Replace
+  // Uses of the old node with the new one.
+  if (Res != Node)
+    CurDAG->ReplaceAllUsesWith(Node, Res);
+
+  return Res;
+}
+
+/// CheckSame - Implements OP_CheckSame.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+          SDValue N,
+          const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+  // Accept if it is exactly the same as a previously recorded node.
+  unsigned RecNo = MatcherTable[MatcherIndex++];
+  assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+  return N == RecordedNodes[RecNo].first;
+}
+
+/// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+                      const SelectionDAGISel &SDISel) {
+  return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]);
+}
+
+/// CheckNodePredicate - Implements OP_CheckNodePredicate.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+                   const SelectionDAGISel &SDISel, SDNode *N) {
+  return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+            SDNode *N) {
+  uint16_t Opc = MatcherTable[MatcherIndex++];
+  Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+  return N->getOpcode() == Opc;
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+          SDValue N, const TargetLowering &TLI) {
+  MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+  if (N.getValueType() == VT) return true;
+
+  // Handle the case when VT is iPTR.
+  return VT == MVT::iPTR && N.getValueType() == TLI.getPointerTy();
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+               SDValue N, const TargetLowering &TLI,
+               unsigned ChildNo) {
+  if (ChildNo >= N.getNumOperands())
+    return false;  // Match fails if out of range child #.
+  return ::CheckType(MatcherTable, MatcherIndex, N.getOperand(ChildNo), TLI);
+}
+
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+              SDValue N) {
+  return cast<CondCodeSDNode>(N)->get() ==
+      (ISD::CondCode)MatcherTable[MatcherIndex++];
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+               SDValue N, const TargetLowering &TLI) {
+  MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+  if (cast<VTSDNode>(N)->getVT() == VT)
+    return true;
+
+  // Handle the case when VT is iPTR.
+  return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI.getPointerTy();
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+             SDValue N) {
+  int64_t Val = MatcherTable[MatcherIndex++];
+  if (Val & 128)
+    Val = GetVBR(Val, MatcherTable, MatcherIndex);
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
+  return C != 0 && C->getSExtValue() == Val;
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+            SDValue N, const SelectionDAGISel &SDISel) {
+  int64_t Val = MatcherTable[MatcherIndex++];
+  if (Val & 128)
+    Val = GetVBR(Val, MatcherTable, MatcherIndex);
+
+  if (N->getOpcode() != ISD::AND) return false;
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  return C != 0 && SDISel.CheckAndMask(N.getOperand(0), C, Val);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+           SDValue N, const SelectionDAGISel &SDISel) {
+  int64_t Val = MatcherTable[MatcherIndex++];
+  if (Val & 128)
+    Val = GetVBR(Val, MatcherTable, MatcherIndex);
+
+  if (N->getOpcode() != ISD::OR) return false;
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  return C != 0 && SDISel.CheckOrMask(N.getOperand(0), C, Val);
+}
+
+/// IsPredicateKnownToFail - If we know how and can do so without pushing a
+/// scope, evaluate the current node.  If the current predicate is known to
+/// fail, set Result=true and return anything.  If the current predicate is
+/// known to pass, set Result=false and return the MatcherIndex to continue
+/// with.  If the current predicate is unknown, set Result=false and return the
+/// MatcherIndex to continue with.
+static unsigned IsPredicateKnownToFail(const unsigned char *Table,
+                                       unsigned Index, SDValue N,
+                                       bool &Result,
+                                       const SelectionDAGISel &SDISel,
+                 SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+  switch (Table[Index++]) {
+  default:
+    Result = false;
+    return Index-1;  // Could not evaluate this predicate.
+  case SelectionDAGISel::OPC_CheckSame:
+    Result = !::CheckSame(Table, Index, N, RecordedNodes);
+    return Index;
+  case SelectionDAGISel::OPC_CheckPatternPredicate:
+    Result = !::CheckPatternPredicate(Table, Index, SDISel);
+    return Index;
+  case SelectionDAGISel::OPC_CheckPredicate:
+    Result = !::CheckNodePredicate(Table, Index, SDISel, N.getNode());
+    return Index;
+  case SelectionDAGISel::OPC_CheckOpcode:
+    Result = !::CheckOpcode(Table, Index, N.getNode());
+    return Index;
+  case SelectionDAGISel::OPC_CheckType:
+    Result = !::CheckType(Table, Index, N, SDISel.TLI);
+    return Index;
+  case SelectionDAGISel::OPC_CheckChild0Type:
+  case SelectionDAGISel::OPC_CheckChild1Type:
+  case SelectionDAGISel::OPC_CheckChild2Type:
+  case SelectionDAGISel::OPC_CheckChild3Type:
+  case SelectionDAGISel::OPC_CheckChild4Type:
+  case SelectionDAGISel::OPC_CheckChild5Type:
+  case SelectionDAGISel::OPC_CheckChild6Type:
+  case SelectionDAGISel::OPC_CheckChild7Type:
+    Result = !::CheckChildType(Table, Index, N, SDISel.TLI,
+                        Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Type);
+    return Index;
+  case SelectionDAGISel::OPC_CheckCondCode:
+    Result = !::CheckCondCode(Table, Index, N);
+    return Index;
+  case SelectionDAGISel::OPC_CheckValueType:
+    Result = !::CheckValueType(Table, Index, N, SDISel.TLI);
+    return Index;
+  case SelectionDAGISel::OPC_CheckInteger:
+    Result = !::CheckInteger(Table, Index, N);
+    return Index;
+  case SelectionDAGISel::OPC_CheckAndImm:
+    Result = !::CheckAndImm(Table, Index, N, SDISel);
+    return Index;
+  case SelectionDAGISel::OPC_CheckOrImm:
+    Result = !::CheckOrImm(Table, Index, N, SDISel);
+    return Index;
+  }
+}
+
+namespace {
+
+struct MatchScope {
+  /// FailIndex - If this match fails, this is the index to continue with.
+  unsigned FailIndex;
+
+  /// NodeStack - The node stack when the scope was formed.
+  SmallVector<SDValue, 4> NodeStack;
+
+  /// NumRecordedNodes - The number of recorded nodes when the scope was formed.
+  unsigned NumRecordedNodes;
+
+  /// NumMatchedMemRefs - The number of matched memref entries.
+  unsigned NumMatchedMemRefs;
+
+  /// InputChain/InputGlue - The current chain/glue
+  SDValue InputChain, InputGlue;
+
+  /// HasChainNodesMatched - True if the ChainNodesMatched list is non-empty.
+  bool HasChainNodesMatched, HasGlueResultNodesMatched;
+};
+
+}
+
+SDNode *SelectionDAGISel::
+SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
+                 unsigned TableSize) {
+  // FIXME: Should these even be selected?  Handle these cases in the caller?
+  switch (NodeToMatch->getOpcode()) {
+  default:
+    break;
+  case ISD::EntryToken:       // These nodes remain the same.
+  case ISD::BasicBlock:
+  case ISD::Register:
+  case ISD::RegisterMask:
+  //case ISD::VALUETYPE:
+  //case ISD::CONDCODE:
+  case ISD::HANDLENODE:
+  case ISD::MDNODE_SDNODE:
+  case ISD::TargetConstant:
+  case ISD::TargetConstantFP:
+  case ISD::TargetConstantPool:
+  case ISD::TargetFrameIndex:
+  case ISD::TargetExternalSymbol:
+  case ISD::TargetBlockAddress:
+  case ISD::TargetJumpTable:
+  case ISD::TargetGlobalTLSAddress:
+  case ISD::TargetGlobalAddress:
+  case ISD::TokenFactor:
+  case ISD::CopyFromReg:
+  case ISD::CopyToReg:
+  case ISD::EH_LABEL:
+  case ISD::LIFETIME_START:
+  case ISD::LIFETIME_END:
+    NodeToMatch->setNodeId(-1); // Mark selected.
+    return 0;
+  case ISD::AssertSext:
+  case ISD::AssertZext:
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0),
+                                      NodeToMatch->getOperand(0));
+    return 0;
+  case ISD::INLINEASM: return Select_INLINEASM(NodeToMatch);
+  case ISD::UNDEF:     return Select_UNDEF(NodeToMatch);
+  }
+
+  assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
+
+  // Set up the node stack with NodeToMatch as the only node on the stack.
+  SmallVector<SDValue, 8> NodeStack;
+  SDValue N = SDValue(NodeToMatch, 0);
+  NodeStack.push_back(N);
+
+  // MatchScopes - Scopes used when matching, if a match failure happens, this
+  // indicates where to continue checking.
+  SmallVector<MatchScope, 8> MatchScopes;
+
+  // RecordedNodes - This is the set of nodes that have been recorded by the
+  // state machine.  The second value is the parent of the node, or null if the
+  // root is recorded.
+  SmallVector<std::pair<SDValue, SDNode*>, 8> RecordedNodes;
+
+  // MatchedMemRefs - This is the set of MemRef's we've seen in the input
+  // pattern.
+  SmallVector<MachineMemOperand*, 2> MatchedMemRefs;
+
+  // These are the current input chain and glue for use when generating nodes.
+  // Various Emit operations change these.  For example, emitting a copytoreg
+  // uses and updates these.
+  SDValue InputChain, InputGlue;
+
+  // ChainNodesMatched - If a pattern matches nodes that have input/output
+  // chains, the OPC_EmitMergeInputChains operation is emitted which indicates
+  // which ones they are.  The result is captured into this list so that we can
+  // update the chain results when the pattern is complete.
+  SmallVector<SDNode*, 3> ChainNodesMatched;
+  SmallVector<SDNode*, 3> GlueResultNodesMatched;
+
+  DEBUG(errs() << "ISEL: Starting pattern match on root node: ";
+        NodeToMatch->dump(CurDAG);
+        errs() << '\n');
+
+  // Determine where to start the interpreter.  Normally we start at opcode #0,
+  // but if the state machine starts with an OPC_SwitchOpcode, then we
+  // accelerate the first lookup (which is guaranteed to be hot) with the
+  // OpcodeOffset table.
+  unsigned MatcherIndex = 0;
+
+  if (!OpcodeOffset.empty()) {
+    // Already computed the OpcodeOffset table, just index into it.
+    if (N.getOpcode() < OpcodeOffset.size())
+      MatcherIndex = OpcodeOffset[N.getOpcode()];
+    DEBUG(errs() << "  Initial Opcode index to " << MatcherIndex << "\n");
+
+  } else if (MatcherTable[0] == OPC_SwitchOpcode) {
+    // Otherwise, the table isn't computed, but the state machine does start
+    // with an OPC_SwitchOpcode instruction.  Populate the table now, since this
+    // is the first time we're selecting an instruction.
+    unsigned Idx = 1;
+    while (1) {
+      // Get the size of this case.
+      unsigned CaseSize = MatcherTable[Idx++];
+      if (CaseSize & 128)
+        CaseSize = GetVBR(CaseSize, MatcherTable, Idx);
+      if (CaseSize == 0) break;
+
+      // Get the opcode, add the index to the table.
+      uint16_t Opc = MatcherTable[Idx++];
+      Opc |= (unsigned short)MatcherTable[Idx++] << 8;
+      if (Opc >= OpcodeOffset.size())
+        OpcodeOffset.resize((Opc+1)*2);
+      OpcodeOffset[Opc] = Idx;
+      Idx += CaseSize;
+    }
+
+    // Okay, do the lookup for the first opcode.
+    if (N.getOpcode() < OpcodeOffset.size())
+      MatcherIndex = OpcodeOffset[N.getOpcode()];
+  }
+
+  while (1) {
+    assert(MatcherIndex < TableSize && "Invalid index");
+#ifndef NDEBUG
+    unsigned CurrentOpcodeIndex = MatcherIndex;
+#endif
+    BuiltinOpcodes Opcode = (BuiltinOpcodes)MatcherTable[MatcherIndex++];
+    switch (Opcode) {
+    case OPC_Scope: {
+      // Okay, the semantics of this operation are that we should push a scope
+      // then evaluate the first child.  However, pushing a scope only to have
+      // the first check fail (which then pops it) is inefficient.  If we can
+      // determine immediately that the first check (or first several) will
+      // immediately fail, don't even bother pushing a scope for them.
+      unsigned FailIndex;
+
+      while (1) {
+        unsigned NumToSkip = MatcherTable[MatcherIndex++];
+        if (NumToSkip & 128)
+          NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
+        // Found the end of the scope with no match.
+        if (NumToSkip == 0) {
+          FailIndex = 0;
+          break;
+        }
+
+        FailIndex = MatcherIndex+NumToSkip;
+
+        unsigned MatcherIndexOfPredicate = MatcherIndex;
+        (void)MatcherIndexOfPredicate; // silence warning.
+
+        // If we can't evaluate this predicate without pushing a scope (e.g. if
+        // it is a 'MoveParent') or if the predicate succeeds on this node, we
+        // push the scope and evaluate the full predicate chain.
+        bool Result;
+        MatcherIndex = IsPredicateKnownToFail(MatcherTable, MatcherIndex, N,
+                                              Result, *this, RecordedNodes);
+        if (!Result)
+          break;
+
+        DEBUG(errs() << "  Skipped scope entry (due to false predicate) at "
+                     << "index " << MatcherIndexOfPredicate
+                     << ", continuing at " << FailIndex << "\n");
+        ++NumDAGIselRetries;
+
+        // Otherwise, we know that this case of the Scope is guaranteed to fail,
+        // move to the next case.
+        MatcherIndex = FailIndex;
+      }
+
+      // If the whole scope failed to match, bail.
+      if (FailIndex == 0) break;
+
+      // Push a MatchScope which indicates where to go if the first child fails
+      // to match.
+      MatchScope NewEntry;
+      NewEntry.FailIndex = FailIndex;
+      NewEntry.NodeStack.append(NodeStack.begin(), NodeStack.end());
+      NewEntry.NumRecordedNodes = RecordedNodes.size();
+      NewEntry.NumMatchedMemRefs = MatchedMemRefs.size();
+      NewEntry.InputChain = InputChain;
+      NewEntry.InputGlue = InputGlue;
+      NewEntry.HasChainNodesMatched = !ChainNodesMatched.empty();
+      NewEntry.HasGlueResultNodesMatched = !GlueResultNodesMatched.empty();
+      MatchScopes.push_back(NewEntry);
+      continue;
+    }
+    case OPC_RecordNode: {
+      // Remember this node, it may end up being an operand in the pattern.
+      SDNode *Parent = 0;
+      if (NodeStack.size() > 1)
+        Parent = NodeStack[NodeStack.size()-2].getNode();
+      RecordedNodes.push_back(std::make_pair(N, Parent));
+      continue;
+    }
+
+    case OPC_RecordChild0: case OPC_RecordChild1:
+    case OPC_RecordChild2: case OPC_RecordChild3:
+    case OPC_RecordChild4: case OPC_RecordChild5:
+    case OPC_RecordChild6: case OPC_RecordChild7: {
+      unsigned ChildNo = Opcode-OPC_RecordChild0;
+      if (ChildNo >= N.getNumOperands())
+        break;  // Match fails if out of range child #.
+
+      RecordedNodes.push_back(std::make_pair(N->getOperand(ChildNo),
+                                             N.getNode()));
+      continue;
+    }
+    case OPC_RecordMemRef:
+      MatchedMemRefs.push_back(cast<MemSDNode>(N)->getMemOperand());
+      continue;
+
+    case OPC_CaptureGlueInput:
+      // If the current node has an input glue, capture it in InputGlue.
+      if (N->getNumOperands() != 0 &&
+          N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue)
+        InputGlue = N->getOperand(N->getNumOperands()-1);
+      continue;
+
+    case OPC_MoveChild: {
+      unsigned ChildNo = MatcherTable[MatcherIndex++];
+      if (ChildNo >= N.getNumOperands())
+        break;  // Match fails if out of range child #.
+      N = N.getOperand(ChildNo);
+      NodeStack.push_back(N);
+      continue;
+    }
+
+    case OPC_MoveParent:
+      // Pop the current node off the NodeStack.
+      NodeStack.pop_back();
+      assert(!NodeStack.empty() && "Node stack imbalance!");
+      N = NodeStack.back();
+      continue;
+
+    case OPC_CheckSame:
+      if (!::CheckSame(MatcherTable, MatcherIndex, N, RecordedNodes)) break;
+      continue;
+    case OPC_CheckPatternPredicate:
+      if (!::CheckPatternPredicate(MatcherTable, MatcherIndex, *this)) break;
+      continue;
+    case OPC_CheckPredicate:
+      if (!::CheckNodePredicate(MatcherTable, MatcherIndex, *this,
+                                N.getNode()))
+        break;
+      continue;
+    case OPC_CheckComplexPat: {
+      unsigned CPNum = MatcherTable[MatcherIndex++];
+      unsigned RecNo = MatcherTable[MatcherIndex++];
+      assert(RecNo < RecordedNodes.size() && "Invalid CheckComplexPat");
+      if (!CheckComplexPattern(NodeToMatch, RecordedNodes[RecNo].second,
+                               RecordedNodes[RecNo].first, CPNum,
+                               RecordedNodes))
+        break;
+      continue;
+    }
+    case OPC_CheckOpcode:
+      if (!::CheckOpcode(MatcherTable, MatcherIndex, N.getNode())) break;
+      continue;
+
+    case OPC_CheckType:
+      if (!::CheckType(MatcherTable, MatcherIndex, N, TLI)) break;
+      continue;
+
+    case OPC_SwitchOpcode: {
+      unsigned CurNodeOpcode = N.getOpcode();
+      unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
+      unsigned CaseSize;
+      while (1) {
+        // Get the size of this case.
+        CaseSize = MatcherTable[MatcherIndex++];
+        if (CaseSize & 128)
+          CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
+        if (CaseSize == 0) break;
+
+        uint16_t Opc = MatcherTable[MatcherIndex++];
+        Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+
+        // If the opcode matches, then we will execute this case.
+        if (CurNodeOpcode == Opc)
+          break;
+
+        // Otherwise, skip over this case.
+        MatcherIndex += CaseSize;
+      }
+
+      // If no cases matched, bail out.
+      if (CaseSize == 0) break;
+
+      // Otherwise, execute the case we found.
+      DEBUG(errs() << "  OpcodeSwitch from " << SwitchStart
+                   << " to " << MatcherIndex << "\n");
+      continue;
+    }
+
+    case OPC_SwitchType: {
+      MVT CurNodeVT = N.getValueType().getSimpleVT();
+      unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
+      unsigned CaseSize;
+      while (1) {
+        // Get the size of this case.
+        CaseSize = MatcherTable[MatcherIndex++];
+        if (CaseSize & 128)
+          CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
+        if (CaseSize == 0) break;
+
+        MVT CaseVT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+        if (CaseVT == MVT::iPTR)
+          CaseVT = TLI.getPointerTy();
+
+        // If the VT matches, then we will execute this case.
+        if (CurNodeVT == CaseVT)
+          break;
+
+        // Otherwise, skip over this case.
+        MatcherIndex += CaseSize;
+      }
+
+      // If no cases matched, bail out.
+      if (CaseSize == 0) break;
+
+      // Otherwise, execute the case we found.
+      DEBUG(errs() << "  TypeSwitch[" << EVT(CurNodeVT).getEVTString()
+                   << "] from " << SwitchStart << " to " << MatcherIndex<<'\n');
+      continue;
+    }
+    case OPC_CheckChild0Type: case OPC_CheckChild1Type:
+    case OPC_CheckChild2Type: case OPC_CheckChild3Type:
+    case OPC_CheckChild4Type: case OPC_CheckChild5Type:
+    case OPC_CheckChild6Type: case OPC_CheckChild7Type:
+      if (!::CheckChildType(MatcherTable, MatcherIndex, N, TLI,
+                            Opcode-OPC_CheckChild0Type))
+        break;
+      continue;
+    case OPC_CheckCondCode:
+      if (!::CheckCondCode(MatcherTable, MatcherIndex, N)) break;
+      continue;
+    case OPC_CheckValueType:
+      if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI)) break;
+      continue;
+    case OPC_CheckInteger:
+      if (!::CheckInteger(MatcherTable, MatcherIndex, N)) break;
+      continue;
+    case OPC_CheckAndImm:
+      if (!::CheckAndImm(MatcherTable, MatcherIndex, N, *this)) break;
+      continue;
+    case OPC_CheckOrImm:
+      if (!::CheckOrImm(MatcherTable, MatcherIndex, N, *this)) break;
+      continue;
+
+    case OPC_CheckFoldableChainNode: {
+      assert(NodeStack.size() != 1 && "No parent node");
+      // Verify that all intermediate nodes between the root and this one have
+      // a single use.
+      bool HasMultipleUses = false;
+      for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i)
+        if (!NodeStack[i].hasOneUse()) {
+          HasMultipleUses = true;
+          break;
+        }
+      if (HasMultipleUses) break;
+
+      // Check to see that the target thinks this is profitable to fold and that
+      // we can fold it without inducing cycles in the graph.
+      if (!IsProfitableToFold(N, NodeStack[NodeStack.size()-2].getNode(),
+                              NodeToMatch) ||
+          !IsLegalToFold(N, NodeStack[NodeStack.size()-2].getNode(),
+                         NodeToMatch, OptLevel,
+                         true/*We validate our own chains*/))
+        break;
+
+      continue;
+    }
+    case OPC_EmitInteger: {
+      MVT::SimpleValueType VT =
+        (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+      int64_t Val = MatcherTable[MatcherIndex++];
+      if (Val & 128)
+        Val = GetVBR(Val, MatcherTable, MatcherIndex);
+      RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
+                              CurDAG->getTargetConstant(Val, VT), (SDNode*)0));
+      continue;
+    }
+    case OPC_EmitRegister: {
+      MVT::SimpleValueType VT =
+        (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+      unsigned RegNo = MatcherTable[MatcherIndex++];
+      RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
+                              CurDAG->getRegister(RegNo, VT), (SDNode*)0));
+      continue;
+    }
+    case OPC_EmitRegister2: {
+      // For targets w/ more than 256 register names, the register enum
+      // values are stored in two bytes in the matcher table (just like
+      // opcodes).
+      MVT::SimpleValueType VT =
+        (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+      unsigned RegNo = MatcherTable[MatcherIndex++];
+      RegNo |= MatcherTable[MatcherIndex++] << 8;
+      RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
+                              CurDAG->getRegister(RegNo, VT), (SDNode*)0));
+      continue;
+    }
+
+    case OPC_EmitConvertToTarget:  {
+      // Convert from IMM/FPIMM to target version.
+      unsigned RecNo = MatcherTable[MatcherIndex++];
+      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      SDValue Imm = RecordedNodes[RecNo].first;
+
+      if (Imm->getOpcode() == ISD::Constant) {
+        const ConstantInt *Val=cast<ConstantSDNode>(Imm)->getConstantIntValue();
+        Imm = CurDAG->getConstant(*Val, Imm.getValueType(), true);
+      } else if (Imm->getOpcode() == ISD::ConstantFP) {
+        const ConstantFP *Val=cast<ConstantFPSDNode>(Imm)->getConstantFPValue();
+        Imm = CurDAG->getConstantFP(*Val, Imm.getValueType(), true);
+      }
+
+      RecordedNodes.push_back(std::make_pair(Imm, RecordedNodes[RecNo].second));
+      continue;
+    }
+
+    case OPC_EmitMergeInputChains1_0:    // OPC_EmitMergeInputChains, 1, 0
+    case OPC_EmitMergeInputChains1_1: {  // OPC_EmitMergeInputChains, 1, 1
+      // These are space-optimized forms of OPC_EmitMergeInputChains.
+      assert(InputChain.getNode() == 0 &&
+             "EmitMergeInputChains should be the first chain producing node");
+      assert(ChainNodesMatched.empty() &&
+             "Should only have one EmitMergeInputChains per match");
+
+      // Read all of the chained nodes.
+      unsigned RecNo = Opcode == OPC_EmitMergeInputChains1_1;
+      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
+
+      // FIXME: What if other value results of the node have uses not matched
+      // by this pattern?
+      if (ChainNodesMatched.back() != NodeToMatch &&
+          !RecordedNodes[RecNo].first.hasOneUse()) {
+        ChainNodesMatched.clear();
+        break;
+      }
+
+      // Merge the input chains if they are not intra-pattern references.
+      InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);
+
+      if (InputChain.getNode() == 0)
+        break;  // Failed to merge.
+      continue;
+    }
+
+    case OPC_EmitMergeInputChains: {
+      assert(InputChain.getNode() == 0 &&
+             "EmitMergeInputChains should be the first chain producing node");
+      // This node gets a list of nodes we matched in the input that have
+      // chains.  We want to token factor all of the input chains to these nodes
+      // together.  However, if any of the input chains is actually one of the
+      // nodes matched in this pattern, then we have an intra-match reference.
+      // Ignore these because the newly token factored chain should not refer to
+      // the old nodes.
+      unsigned NumChains = MatcherTable[MatcherIndex++];
+      assert(NumChains != 0 && "Can't TF zero chains");
+
+      assert(ChainNodesMatched.empty() &&
+             "Should only have one EmitMergeInputChains per match");
+
+      // Read all of the chained nodes.
+      for (unsigned i = 0; i != NumChains; ++i) {
+        unsigned RecNo = MatcherTable[MatcherIndex++];
+        assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+        ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
+
+        // FIXME: What if other value results of the node have uses not matched
+        // by this pattern?
+        if (ChainNodesMatched.back() != NodeToMatch &&
+            !RecordedNodes[RecNo].first.hasOneUse()) {
+          ChainNodesMatched.clear();
+          break;
+        }
+      }
+
+      // If the inner loop broke out, the match fails.
+      if (ChainNodesMatched.empty())
+        break;
+
+      // Merge the input chains if they are not intra-pattern references.
+      InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);
+
+      if (InputChain.getNode() == 0)
+        break;  // Failed to merge.
+
+      continue;
+    }
+
+    case OPC_EmitCopyToReg: {
+      unsigned RecNo = MatcherTable[MatcherIndex++];
+      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      unsigned DestPhysReg = MatcherTable[MatcherIndex++];
+
+      if (InputChain.getNode() == 0)
+        InputChain = CurDAG->getEntryNode();
+
+      InputChain = CurDAG->getCopyToReg(InputChain, NodeToMatch->getDebugLoc(),
+                                        DestPhysReg, RecordedNodes[RecNo].first,
+                                        InputGlue);
+
+      InputGlue = InputChain.getValue(1);
+      continue;
+    }
+
+    case OPC_EmitNodeXForm: {
+      unsigned XFormNo = MatcherTable[MatcherIndex++];
+      unsigned RecNo = MatcherTable[MatcherIndex++];
+      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      SDValue Res = RunSDNodeXForm(RecordedNodes[RecNo].first, XFormNo);
+      RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, (SDNode*) 0));
+      continue;
+    }
+
+    case OPC_EmitNode:
+    case OPC_MorphNodeTo: {
+      uint16_t TargetOpc = MatcherTable[MatcherIndex++];
+      TargetOpc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+      unsigned EmitNodeInfo = MatcherTable[MatcherIndex++];
+      // Get the result VT list.
+      unsigned NumVTs = MatcherTable[MatcherIndex++];
+      SmallVector<EVT, 4> VTs;
+      for (unsigned i = 0; i != NumVTs; ++i) {
+        MVT::SimpleValueType VT =
+          (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+        if (VT == MVT::iPTR) VT = TLI.getPointerTy().SimpleTy;
+        VTs.push_back(VT);
+      }
+
+      if (EmitNodeInfo & OPFL_Chain)
+        VTs.push_back(MVT::Other);
+      if (EmitNodeInfo & OPFL_GlueOutput)
+        VTs.push_back(MVT::Glue);
+
+      // This is hot code, so optimize the two most common cases of 1 and 2
+      // results.
+      SDVTList VTList;
+      if (VTs.size() == 1)
+        VTList = CurDAG->getVTList(VTs[0]);
+      else if (VTs.size() == 2)
+        VTList = CurDAG->getVTList(VTs[0], VTs[1]);
+      else
+        VTList = CurDAG->getVTList(VTs.data(), VTs.size());
+
+      // Get the operand list.
+      unsigned NumOps = MatcherTable[MatcherIndex++];
+      SmallVector<SDValue, 8> Ops;
+      for (unsigned i = 0; i != NumOps; ++i) {
+        unsigned RecNo = MatcherTable[MatcherIndex++];
+        if (RecNo & 128)
+          RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
+
+        assert(RecNo < RecordedNodes.size() && "Invalid EmitNode");
+        Ops.push_back(RecordedNodes[RecNo].first);
+      }
+
+      // If there are variadic operands to add, handle them now.
+      if (EmitNodeInfo & OPFL_VariadicInfo) {
+        // Determine the start index to copy from.
+        unsigned FirstOpToCopy = getNumFixedFromVariadicInfo(EmitNodeInfo);
+        FirstOpToCopy += (EmitNodeInfo & OPFL_Chain) ? 1 : 0;
+        assert(NodeToMatch->getNumOperands() >= FirstOpToCopy &&
+               "Invalid variadic node");
+        // Copy all of the variadic operands, not including a potential glue
+        // input.
+        for (unsigned i = FirstOpToCopy, e = NodeToMatch->getNumOperands();
+             i != e; ++i) {
+          SDValue V = NodeToMatch->getOperand(i);
+          if (V.getValueType() == MVT::Glue) break;
+          Ops.push_back(V);
+        }
+      }
+
+      // If this has chain/glue inputs, add them.
+      if (EmitNodeInfo & OPFL_Chain)
+        Ops.push_back(InputChain);
+      if ((EmitNodeInfo & OPFL_GlueInput) && InputGlue.getNode() != 0)
+        Ops.push_back(InputGlue);
+
+      // Create the node.
+      SDNode *Res = 0;
+      if (Opcode != OPC_MorphNodeTo) {
+        // If this is a normal EmitNode command, just create the new node and
+        // add the results to the RecordedNodes list.
+        Res = CurDAG->getMachineNode(TargetOpc, NodeToMatch->getDebugLoc(),
+                                     VTList, Ops.data(), Ops.size());
+
+        // Add all the non-glue/non-chain results to the RecordedNodes list.
+        for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+          if (VTs[i] == MVT::Other || VTs[i] == MVT::Glue) break;
+          RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i),
+                                                             (SDNode*) 0));
+        }
+
+      } else if (NodeToMatch->getOpcode() != ISD::DELETED_NODE) {
+        Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops.data(), Ops.size(),
+                        EmitNodeInfo);
+      } else {
+        // NodeToMatch was eliminated by CSE when the target changed the DAG.
+        // We will visit the equivalent node later.
+        DEBUG(dbgs() << "Node was eliminated by CSE\n");
+        return 0;
+      }
+
+      // If the node had chain/glue results, update our notion of the current
+      // chain and glue.
+      if (EmitNodeInfo & OPFL_GlueOutput) {
+        InputGlue = SDValue(Res, VTs.size()-1);
+        if (EmitNodeInfo & OPFL_Chain)
+          InputChain = SDValue(Res, VTs.size()-2);
+      } else if (EmitNodeInfo & OPFL_Chain)
+        InputChain = SDValue(Res, VTs.size()-1);
+
+      // If the OPFL_MemRefs glue is set on this node, slap all of the
+      // accumulated memrefs onto it.
+      //
+      // FIXME: This is vastly incorrect for patterns with multiple outputs
+      // instructions that access memory and for ComplexPatterns that match
+      // loads.
+      if (EmitNodeInfo & OPFL_MemRefs) {
+        // Only attach load or store memory operands if the generated
+        // instruction may load or store.
+        const MCInstrDesc &MCID = TM.getInstrInfo()->get(TargetOpc);
+        bool mayLoad = MCID.mayLoad();
+        bool mayStore = MCID.mayStore();
+
+        unsigned NumMemRefs = 0;
+        for (SmallVector<MachineMemOperand*, 2>::const_iterator I =
+             MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+          if ((*I)->isLoad()) {
+            if (mayLoad)
+              ++NumMemRefs;
+          } else if ((*I)->isStore()) {
+            if (mayStore)
+              ++NumMemRefs;
+          } else {
+            ++NumMemRefs;
+          }
+        }
+
+        MachineSDNode::mmo_iterator MemRefs =
+          MF->allocateMemRefsArray(NumMemRefs);
+
+        MachineSDNode::mmo_iterator MemRefsPos = MemRefs;
+        for (SmallVector<MachineMemOperand*, 2>::const_iterator I =
+             MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+          if ((*I)->isLoad()) {
+            if (mayLoad)
+              *MemRefsPos++ = *I;
+          } else if ((*I)->isStore()) {
+            if (mayStore)
+              *MemRefsPos++ = *I;
+          } else {
+            *MemRefsPos++ = *I;
+          }
+        }
+
+        cast<MachineSDNode>(Res)
+          ->setMemRefs(MemRefs, MemRefs + NumMemRefs);
+      }
+
+      DEBUG(errs() << "  "
+                   << (Opcode == OPC_MorphNodeTo ? "Morphed" : "Created")
+                   << " node: "; Res->dump(CurDAG); errs() << "\n");
+
+      // If this was a MorphNodeTo then we're completely done!
+      if (Opcode == OPC_MorphNodeTo) {
+        // Update chain and glue uses.
+        UpdateChainsAndGlue(NodeToMatch, InputChain, ChainNodesMatched,
+                            InputGlue, GlueResultNodesMatched, true);
+        return Res;
+      }
+
+      continue;
+    }
+
+    case OPC_MarkGlueResults: {
+      unsigned NumNodes = MatcherTable[MatcherIndex++];
+
+      // Read and remember all the glue-result nodes.
+      for (unsigned i = 0; i != NumNodes; ++i) {
+        unsigned RecNo = MatcherTable[MatcherIndex++];
+        if (RecNo & 128)
+          RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
+
+        assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+        GlueResultNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
+      }
+      continue;
+    }
+
+    case OPC_CompleteMatch: {
+      // The match has been completed, and any new nodes (if any) have been
+      // created.  Patch up references to the matched dag to use the newly
+      // created nodes.
+      unsigned NumResults = MatcherTable[MatcherIndex++];
+
+      for (unsigned i = 0; i != NumResults; ++i) {
+        unsigned ResSlot = MatcherTable[MatcherIndex++];
+        if (ResSlot & 128)
+          ResSlot = GetVBR(ResSlot, MatcherTable, MatcherIndex);
+
+        assert(ResSlot < RecordedNodes.size() && "Invalid CheckSame");
+        SDValue Res = RecordedNodes[ResSlot].first;
+
+        assert(i < NodeToMatch->getNumValues() &&
+               NodeToMatch->getValueType(i) != MVT::Other &&
+               NodeToMatch->getValueType(i) != MVT::Glue &&
+               "Invalid number of results to complete!");
+        assert((NodeToMatch->getValueType(i) == Res.getValueType() ||
+                NodeToMatch->getValueType(i) == MVT::iPTR ||
+                Res.getValueType() == MVT::iPTR ||
+                NodeToMatch->getValueType(i).getSizeInBits() ==
+                    Res.getValueType().getSizeInBits()) &&
+               "invalid replacement");
+        CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res);
+      }
+
+      // If the root node defines glue, add it to the glue nodes to update list.
+      if (NodeToMatch->getValueType(NodeToMatch->getNumValues()-1) == MVT::Glue)
+        GlueResultNodesMatched.push_back(NodeToMatch);
+
+      // Update chain and glue uses.
+      UpdateChainsAndGlue(NodeToMatch, InputChain, ChainNodesMatched,
+                          InputGlue, GlueResultNodesMatched, false);
+
+      assert(NodeToMatch->use_empty() &&
+             "Didn't replace all uses of the node?");
+
+      // FIXME: We just return here, which interacts correctly with SelectRoot
+      // above.  We should fix this to not return an SDNode* anymore.
+      return 0;
+    }
+    }
+
+    // If the code reached this point, then the match failed.  See if there is
+    // another child to try in the current 'Scope', otherwise pop it until we
+    // find a case to check.
+    DEBUG(errs() << "  Match failed at index " << CurrentOpcodeIndex << "\n");
+    ++NumDAGIselRetries;
+    while (1) {
+      if (MatchScopes.empty()) {
+        CannotYetSelect(NodeToMatch);
+        return 0;
+      }
+
+      // Restore the interpreter state back to the point where the scope was
+      // formed.
+      MatchScope &LastScope = MatchScopes.back();
+      RecordedNodes.resize(LastScope.NumRecordedNodes);
+      NodeStack.clear();
+      NodeStack.append(LastScope.NodeStack.begin(), LastScope.NodeStack.end());
+      N = NodeStack.back();
+
+      if (LastScope.NumMatchedMemRefs != MatchedMemRefs.size())
+        MatchedMemRefs.resize(LastScope.NumMatchedMemRefs);
+      MatcherIndex = LastScope.FailIndex;
+
+      DEBUG(errs() << "  Continuing at " << MatcherIndex << "\n");
+
+      InputChain = LastScope.InputChain;
+      InputGlue = LastScope.InputGlue;
+      if (!LastScope.HasChainNodesMatched)
+        ChainNodesMatched.clear();
+      if (!LastScope.HasGlueResultNodesMatched)
+        GlueResultNodesMatched.clear();
+
+      // Check to see what the offset is at the new MatcherIndex.  If it is zero
+      // we have reached the end of this scope, otherwise we have another child
+      // in the current scope to try.
+      unsigned NumToSkip = MatcherTable[MatcherIndex++];
+      if (NumToSkip & 128)
+        NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
+
+      // If we have another child in this scope to match, update FailIndex and
+      // try it.
+      if (NumToSkip != 0) {
+        LastScope.FailIndex = MatcherIndex+NumToSkip;
+        break;
+      }
+
+      // End of this scope, pop it and try the next child in the containing
+      // scope.
+      MatchScopes.pop_back();
+    }
+  }
+}
+
+
+
+void SelectionDAGISel::CannotYetSelect(SDNode *N) {
+  std::string msg;
+  raw_string_ostream Msg(msg);
+  Msg << "Cannot select: ";
+
+  if (N->getOpcode() != ISD::INTRINSIC_W_CHAIN &&
+      N->getOpcode() != ISD::INTRINSIC_WO_CHAIN &&
+      N->getOpcode() != ISD::INTRINSIC_VOID) {
+    N->printrFull(Msg, CurDAG);
+    Msg << "\nIn function: " << MF->getName();
+  } else {
+    bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other;
+    unsigned iid =
+      cast<ConstantSDNode>(N->getOperand(HasInputChain))->getZExtValue();
+    if (iid < Intrinsic::num_intrinsics)
+      Msg << "intrinsic %" << Intrinsic::getName((Intrinsic::ID)iid);
+    else if (const TargetIntrinsicInfo *TII = TM.getIntrinsicInfo())
+      Msg << "target intrinsic %" << TII->getName(iid);
+    else
+      Msg << "unknown intrinsic #" << iid;
+  }
+  report_fatal_error(Msg.str());
+}
+
+char SelectionDAGISel::ID = 0;
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
new file mode 100644
index 000000000000..b752b482e3a1
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -0,0 +1,299 @@
+//===-- SelectionDAGPrinter.cpp - Implement SelectionDAG::viewGraph() -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+namespace llvm {
+  template<>
+  struct DOTGraphTraits<SelectionDAG*> : public DefaultDOTGraphTraits {
+
+    explicit DOTGraphTraits(bool isSimple=false) :
+      DefaultDOTGraphTraits(isSimple) {}
+
+    static bool hasEdgeDestLabels() {
+      return true;
+    }
+
+    static unsigned numEdgeDestLabels(const void *Node) {
+      return ((const SDNode *) Node)->getNumValues();
+    }
+
+    static std::string getEdgeDestLabel(const void *Node, unsigned i) {
+      return ((const SDNode *) Node)->getValueType(i).getEVTString();
+    }
+
+    template<typename EdgeIter>
+    static std::string getEdgeSourceLabel(const void *Node, EdgeIter I) {
+      return itostr(I - SDNodeIterator::begin((const SDNode *) Node));
+    }
+
+    /// edgeTargetsEdgeSource - This method returns true if this outgoing edge
+    /// should actually target another edge source, not a node.  If this method
+    /// is implemented, getEdgeTarget should be implemented.
+    template<typename EdgeIter>
+    static bool edgeTargetsEdgeSource(const void *Node, EdgeIter I) {
+      return true;
+    }
+
+    /// getEdgeTarget - If edgeTargetsEdgeSource returns true, this method is
+    /// called to determine which outgoing edge of Node is the target of this
+    /// edge.
+    template<typename EdgeIter>
+    static EdgeIter getEdgeTarget(const void *Node, EdgeIter I) {
+      SDNode *TargetNode = *I;
+      SDNodeIterator NI = SDNodeIterator::begin(TargetNode);
+      std::advance(NI, I.getNode()->getOperand(I.getOperand()).getResNo());
+      return NI;
+    }
+
+    static std::string getGraphName(const SelectionDAG *G) {
+      return G->getMachineFunction().getName();
+    }
+
+    static bool renderGraphFromBottomUp() {
+      return true;
+    }
+
+    static bool hasNodeAddressLabel(const SDNode *Node,
+                                    const SelectionDAG *Graph) {
+      return true;
+    }
+
+    /// If you want to override the dot attributes printed for a particular
+    /// edge, override this method.
+    template<typename EdgeIter>
+    static std::string getEdgeAttributes(const void *Node, EdgeIter EI,
+                                         const SelectionDAG *Graph) {
+      SDValue Op = EI.getNode()->getOperand(EI.getOperand());
+      EVT VT = Op.getValueType();
+      if (VT == MVT::Glue)
+        return "color=red,style=bold";
+      else if (VT == MVT::Other)
+        return "color=blue,style=dashed";
+      return "";
+    }
+
+
+    static std::string getSimpleNodeLabel(const SDNode *Node,
+                                          const SelectionDAG *G) {
+      std::string Result = Node->getOperationName(G);
+      {
+        raw_string_ostream OS(Result);
+        Node->print_details(OS, G);
+      }
+      return Result;
+    }
+    std::string getNodeLabel(const SDNode *Node, const SelectionDAG *Graph);
+    static std::string getNodeAttributes(const SDNode *N,
+                                         const SelectionDAG *Graph) {
+#ifndef NDEBUG
+      const std::string &Attrs = Graph->getGraphAttrs(N);
+      if (!Attrs.empty()) {
+        if (Attrs.find("shape=") == std::string::npos)
+          return std::string("shape=Mrecord,") + Attrs;
+        else
+          return Attrs;
+      }
+#endif
+      return "shape=Mrecord";
+    }
+
+    static void addCustomGraphFeatures(SelectionDAG *G,
+                                       GraphWriter<SelectionDAG*> &GW) {
+      GW.emitSimpleNode(0, "plaintext=circle", "GraphRoot");
+      if (G->getRoot().getNode())
+        GW.emitEdge(0, -1, G->getRoot().getNode(), G->getRoot().getResNo(),
+                    "color=blue,style=dashed");
+    }
+  };
+}
+
+std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
+                                                        const SelectionDAG *G) {
+  return DOTGraphTraits<SelectionDAG*>::getSimpleNodeLabel(Node, G);
+}
+
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void SelectionDAG::viewGraph(const std::string &Title) {
+// This code is only for debugging!
+#ifndef NDEBUG
+  ViewGraph(this, "dag." + getMachineFunction().getName(),
+            false, Title);
+#else
+  errs() << "SelectionDAG::viewGraph is only available in debug builds on "
+         << "systems with Graphviz or gv!\n";
+#endif  // NDEBUG
+}
+
+// This overload is defined out-of-line here instead of just using a
+// default parameter because this is easiest for gdb to call.
+void SelectionDAG::viewGraph() {
+  viewGraph("");
+}
+
+/// clearGraphAttrs - Clear all previously defined node graph attributes.
+/// Intended to be used from a debugging tool (eg. gdb).
+void SelectionDAG::clearGraphAttrs() {
+#ifndef NDEBUG
+  NodeGraphAttrs.clear();
+#else
+  errs() << "SelectionDAG::clearGraphAttrs is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// setGraphAttrs - Set graph attributes for a node. (eg. "color=red".)
+///
+void SelectionDAG::setGraphAttrs(const SDNode *N, const char *Attrs) {
+#ifndef NDEBUG
+  NodeGraphAttrs[N] = Attrs;
+#else
+  errs() << "SelectionDAG::setGraphAttrs is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// getGraphAttrs - Get graph attributes for a node. (eg. "color=red".)
+/// Used from getNodeAttributes.
+const std::string SelectionDAG::getGraphAttrs(const SDNode *N) const {
+#ifndef NDEBUG
+  std::map<const SDNode *, std::string>::const_iterator I =
+    NodeGraphAttrs.find(N);
+
+  if (I != NodeGraphAttrs.end())
+    return I->second;
+  else
+    return "";
+#else
+  errs() << "SelectionDAG::getGraphAttrs is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+  return std::string();
+#endif
+}
+
+/// setGraphColor - Convenience for setting node color attribute.
+///
+void SelectionDAG::setGraphColor(const SDNode *N, const char *Color) {
+#ifndef NDEBUG
+  NodeGraphAttrs[N] = std::string("color=") + Color;
+#else
+  errs() << "SelectionDAG::setGraphColor is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+/// setSubgraphColorHelper - Implement setSubgraphColor.  Return
+/// whether we truncated the search.
+///
+bool SelectionDAG::setSubgraphColorHelper(SDNode *N, const char *Color, DenseSet<SDNode *> &visited,
+                                          int level, bool &printed) {
+  bool hit_limit = false;
+
+#ifndef NDEBUG
+  if (level >= 20) {
+    if (!printed) {
+      printed = true;
+      DEBUG(dbgs() << "setSubgraphColor hit max level\n");
+    }
+    return true;
+  }
+
+  unsigned oldSize = visited.size();
+  visited.insert(N);
+  if (visited.size() != oldSize) {
+    setGraphColor(N, Color);
+    for(SDNodeIterator i = SDNodeIterator::begin(N), iend = SDNodeIterator::end(N);
+        i != iend;
+        ++i) {
+      hit_limit = setSubgraphColorHelper(*i, Color, visited, level+1, printed) || hit_limit;
+    }
+  }
+#else
+  errs() << "SelectionDAG::setSubgraphColor is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+  return hit_limit;
+}
+
+/// setSubgraphColor - Convenience for setting subgraph color attribute.
+///
+void SelectionDAG::setSubgraphColor(SDNode *N, const char *Color) {
+#ifndef NDEBUG
+  DenseSet<SDNode *> visited;
+  bool printed = false;
+  if (setSubgraphColorHelper(N, Color, visited, 0, printed)) {
+    // Visually mark that we hit the limit
+    if (strcmp(Color, "red") == 0) {
+      setSubgraphColorHelper(N, "blue", visited, 0, printed);
+    } else if (strcmp(Color, "yellow") == 0) {
+      setSubgraphColorHelper(N, "green", visited, 0, printed);
+    }
+  }
+
+#else
+  errs() << "SelectionDAG::setSubgraphColor is only available in debug builds"
+         << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+std::string ScheduleDAGSDNodes::getGraphNodeLabel(const SUnit *SU) const {
+  std::string s;
+  raw_string_ostream O(s);
+  O << "SU(" << SU->NodeNum << "): ";
+  if (SU->getNode()) {
+    SmallVector<SDNode *, 4> GluedNodes;
+    for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
+      GluedNodes.push_back(N);
+    while (!GluedNodes.empty()) {
+      O << DOTGraphTraits<SelectionDAG*>
+        ::getSimpleNodeLabel(GluedNodes.back(), DAG);
+      GluedNodes.pop_back();
+      if (!GluedNodes.empty())
+        O << "\n    ";
+    }
+  } else {
+    O << "CROSS RC COPY";
+  }
+  return O.str();
+}
+
+void ScheduleDAGSDNodes::getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const {
+  if (DAG) {
+    // Draw a special "GraphRoot" node to indicate the root of the graph.
+    GW.emitSimpleNode(0, "plaintext=circle", "GraphRoot");
+    const SDNode *N = DAG->getRoot().getNode();
+    if (N && N->getNodeId() != -1)
+      GW.emitEdge(0, -1, &SUnits[N->getNodeId()], -1,
+                  "color=blue,style=dashed");
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
new file mode 100644
index 000000000000..f5fc66c4d3da
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -0,0 +1,2593 @@
+//===-- TargetLowering.cpp - Implement the TargetLowering class -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cctype>
+using namespace llvm;
+
+/// NOTE: The constructor takes ownership of TLOF.
+TargetLowering::TargetLowering(const TargetMachine &tm,
+                               const TargetLoweringObjectFile *tlof)
+  : TargetLoweringBase(tm, tlof) {}
+
+const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  return NULL;
+}
+
+/// Check whether a given call node is in tail position within its function. If
+/// so, it sets Chain to the input chain of the tail call.
+bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
+                                          SDValue &Chain) const {
+  const Function *F = DAG.getMachineFunction().getFunction();
+
+  // Conservatively require the attributes of the call to match those of
+  // the return. Ignore noalias because it doesn't affect the call sequence.
+  AttributeSet CallerAttrs = F->getAttributes();
+  if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex)
+      .removeAttribute(Attribute::NoAlias).hasAttributes())
+    return false;
+
+  // It's not safe to eliminate the sign / zero extension of the return value.
+  if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
+      CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+    return false;
+
+  // Check if the only use is a function return node.
+  return isUsedByReturnOnly(Node, Chain);
+}
+
+
+/// Generate a libcall taking the given operands as arguments and returning a
+/// result of type RetVT.
+SDValue TargetLowering::makeLibCall(SelectionDAG &DAG,
+                                    RTLIB::Libcall LC, EVT RetVT,
+                                    const SDValue *Ops, unsigned NumOps,
+                                    bool isSigned, DebugLoc dl) const {
+  TargetLowering::ArgListTy Args;
+  Args.reserve(NumOps);
+
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    Entry.Node = Ops[i];
+    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy());
+
+  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+  TargetLowering::
+  CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
+                    false, 0, getLibcallCallingConv(LC),
+                    /*isTailCall=*/false,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, dl);
+  std::pair<SDValue,SDValue> CallInfo = LowerCallTo(CLI);
+
+  return CallInfo.first;
+}
+
+
+/// SoftenSetCCOperands - Soften the operands of a comparison.  This code is
+/// shared among BR_CC, SELECT_CC, and SETCC handlers.
+void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
+                                         SDValue &NewLHS, SDValue &NewRHS,
+                                         ISD::CondCode &CCCode,
+                                         DebugLoc dl) const {
+  assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
+         && "Unsupported setcc type!");
+
+  // Expand into one or more soft-fp libcall(s).
+  RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL;
+  switch (CCCode) {
+  case ISD::SETEQ:
+  case ISD::SETOEQ:
+    LC1 = (VT == MVT::f32) ? RTLIB::OEQ_F32 :
+          (VT == MVT::f64) ? RTLIB::OEQ_F64 : RTLIB::OEQ_F128;
+    break;
+  case ISD::SETNE:
+  case ISD::SETUNE:
+    LC1 = (VT == MVT::f32) ? RTLIB::UNE_F32 :
+          (VT == MVT::f64) ? RTLIB::UNE_F64 : RTLIB::UNE_F128;
+    break;
+  case ISD::SETGE:
+  case ISD::SETOGE:
+    LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 :
+          (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128;
+    break;
+  case ISD::SETLT:
+  case ISD::SETOLT:
+    LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
+          (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128;
+    break;
+  case ISD::SETLE:
+  case ISD::SETOLE:
+    LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 :
+          (VT == MVT::f64) ? RTLIB::OLE_F64 : RTLIB::OLE_F128;
+    break;
+  case ISD::SETGT:
+  case ISD::SETOGT:
+    LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
+          (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128;
+    break;
+  case ISD::SETUO:
+    LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 :
+          (VT == MVT::f64) ? RTLIB::UO_F64 : RTLIB::UO_F128;
+    break;
+  case ISD::SETO:
+    LC1 = (VT == MVT::f32) ? RTLIB::O_F32 :
+          (VT == MVT::f64) ? RTLIB::O_F64 : RTLIB::O_F128;
+    break;
+  default:
+    LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 :
+          (VT == MVT::f64) ? RTLIB::UO_F64 : RTLIB::UO_F128;
+    switch (CCCode) {
+    case ISD::SETONE:
+      // SETONE = SETOLT | SETOGT
+      LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
+            (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128;
+      // Fallthrough
+    case ISD::SETUGT:
+      LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
+            (VT == MVT::f64) ? RTLIB::OGT_F64 : RTLIB::OGT_F128;
+      break;
+    case ISD::SETUGE:
+      LC2 = (VT == MVT::f32) ? RTLIB::OGE_F32 :
+            (VT == MVT::f64) ? RTLIB::OGE_F64 : RTLIB::OGE_F128;
+      break;
+    case ISD::SETULT:
+      LC2 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
+            (VT == MVT::f64) ? RTLIB::OLT_F64 : RTLIB::OLT_F128;
+      break;
+    case ISD::SETULE:
+      LC2 = (VT == MVT::f32) ? RTLIB::OLE_F32 :
+            (VT == MVT::f64) ? RTLIB::OLE_F64 : RTLIB::OLE_F128;
+      break;
+    case ISD::SETUEQ:
+      LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 :
+            (VT == MVT::f64) ? RTLIB::OEQ_F64 : RTLIB::OEQ_F128;
+      break;
+    default: llvm_unreachable("Do not know how to soften this setcc!");
+    }
+  }
+
+  // Use the target specific return value for comparions lib calls.
+  EVT RetVT = getCmpLibcallReturnType();
+  SDValue Ops[2] = { NewLHS, NewRHS };
+  NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+  NewRHS = DAG.getConstant(0, RetVT);
+  CCCode = getCmpLibcallCC(LC1);
+  if (LC2 != RTLIB::UNKNOWN_LIBCALL) {
+    SDValue Tmp = DAG.getNode(ISD::SETCC, dl, getSetCCResultType(RetVT),
+                              NewLHS, NewRHS, DAG.getCondCode(CCCode));
+    NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+    NewLHS = DAG.getNode(ISD::SETCC, dl, getSetCCResultType(RetVT), NewLHS,
+                         NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2)));
+    NewLHS = DAG.getNode(ISD::OR, dl, Tmp.getValueType(), Tmp, NewLHS);
+    NewRHS = SDValue();
+  }
+}
+
+/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// current function.  The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned TargetLowering::getJumpTableEncoding() const {
+  // In non-pic modes, just use the address of a block.
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+    return MachineJumpTableInfo::EK_BlockAddress;
+
+  // In PIC mode, if the target supports a GPRel32 directive, use it.
+  if (getTargetMachine().getMCAsmInfo()->getGPRel32Directive() != 0)
+    return MachineJumpTableInfo::EK_GPRel32BlockAddress;
+
+  // Otherwise, use a label difference.
+  return MachineJumpTableInfo::EK_LabelDifference32;
+}
+
+SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                 SelectionDAG &DAG) const {
+  // If our PIC model is GP relative, use the global offset table as the base.
+  unsigned JTEncoding = getJumpTableEncoding();
+
+  if ((JTEncoding == MachineJumpTableInfo::EK_GPRel64BlockAddress) ||
+      (JTEncoding == MachineJumpTableInfo::EK_GPRel32BlockAddress))
+    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy(0));
+
+  return Table;
+}
+
+/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+/// MCExpr.
+const MCExpr *
+TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                             unsigned JTI,MCContext &Ctx) const{
+  // The normal PIC reloc base is the label at the start of the jump table.
+  return MCSymbolRefExpr::Create(MF->getJTISymbol(JTI, Ctx), Ctx);
+}
+
+bool
+TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // Assume that everything is safe in static mode.
+  if (getTargetMachine().getRelocationModel() == Reloc::Static)
+    return true;
+
+  // In dynamic-no-pic mode, assume that known defined values are safe.
+  if (getTargetMachine().getRelocationModel() == Reloc::DynamicNoPIC &&
+      GA &&
+      !GA->getGlobal()->isDeclaration() &&
+      !GA->getGlobal()->isWeakForLinker())
+    return true;
+
+  // Otherwise assume nothing is safe.
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//  Optimization Methods
+//===----------------------------------------------------------------------===//
+
+/// ShrinkDemandedConstant - Check to see if the specified operand of the
+/// specified instruction is a constant integer.  If so, check to see if there
+/// are any bits set in the constant that are not demanded.  If so, shrink the
+/// constant and return true.
+bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
+                                                        const APInt &Demanded) {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // FIXME: ISD::SELECT, ISD::SELECT_CC
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::XOR:
+  case ISD::AND:
+  case ISD::OR: {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!C) return false;
+
+    if (Op.getOpcode() == ISD::XOR &&
+        (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+      return false;
+
+    // if we can expand it to have all bits set, do it
+    if (C->getAPIntValue().intersects(~Demanded)) {
+      EVT VT = Op.getValueType();
+      SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0),
+                                DAG.getConstant(Demanded &
+                                                C->getAPIntValue(),
+                                                VT));
+      return CombineTo(Op, New);
+    }
+
+    break;
+  }
+  }
+
+  return false;
+}
+
+/// ShrinkDemandedOp - Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the
+/// casts are free.  This uses isZExtFree and ZERO_EXTEND for the widening
+/// cast, but it could be generalized for targets with other types of
+/// implicit widening casts.
+bool
+TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
+                                                    unsigned BitWidth,
+                                                    const APInt &Demanded,
+                                                    DebugLoc dl) {
+  assert(Op.getNumOperands() == 2 &&
+         "ShrinkDemandedOp only supports binary operators!");
+  assert(Op.getNode()->getNumValues() == 1 &&
+         "ShrinkDemandedOp only supports nodes with one result!");
+
+  // Don't do this if the node has another user, which may require the
+  // full value.
+  if (!Op.getNode()->hasOneUse())
+    return false;
+
+  // Search for the smallest integer type with free casts to and from
+  // Op's type. For expedience, just check power-of-2 integer types.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned DemandedSize = BitWidth - Demanded.countLeadingZeros();
+  unsigned SmallVTBits = DemandedSize;
+  if (!isPowerOf2_32(SmallVTBits))
+    SmallVTBits = NextPowerOf2(SmallVTBits);
+  for (; SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+    EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), SmallVTBits);
+    if (TLI.isTruncateFree(Op.getValueType(), SmallVT) &&
+        TLI.isZExtFree(SmallVT, Op.getValueType())) {
+      // We found a type with free casts.
+      SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT,
+                              DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+                                          Op.getNode()->getOperand(0)),
+                              DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+                                          Op.getNode()->getOperand(1)));
+      bool NeedZext = DemandedSize > SmallVTBits;
+      SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND,
+                              dl, Op.getValueType(), X);
+      return CombineTo(Op, Z);
+    }
+  }
+  return false;
+}
+
+/// SimplifyDemandedBits - Look at Op.  At this point, we know that only the
+/// DemandedMask bits of the result of Op are ever used downstream.  If we can
+/// use this information to simplify Op, create a new simplified DAG node and
+/// return true, returning the original and new nodes in Old and New. Otherwise,
+/// analyze the expression and return a mask of KnownOne and KnownZero bits for
+/// the expression (used to simplify the caller).  The KnownZero/One bits may
+/// only be accurate for those bits in the DemandedMask.
+bool TargetLowering::SimplifyDemandedBits(SDValue Op,
+                                          const APInt &DemandedMask,
+                                          APInt &KnownZero,
+                                          APInt &KnownOne,
+                                          TargetLoweringOpt &TLO,
+                                          unsigned Depth) const {
+  unsigned BitWidth = DemandedMask.getBitWidth();
+  assert(Op.getValueType().getScalarType().getSizeInBits() == BitWidth &&
+         "Mask size mismatches value type size!");
+  APInt NewMask = DemandedMask;
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Don't know anything.
+  KnownZero = KnownOne = APInt(BitWidth, 0);
+
+  // Other users may use these bits.
+  if (!Op.getNode()->hasOneUse()) {
+    if (Depth != 0) {
+      // If not at the root, Just compute the KnownZero/KnownOne bits to
+      // simplify things downstream.
+      TLO.DAG.ComputeMaskedBits(Op, KnownZero, KnownOne, Depth);
+      return false;
+    }
+    // If this is the root being simplified, allow it to have multiple uses,
+    // just set the NewMask to all bits.
+    NewMask = APInt::getAllOnesValue(BitWidth);
+  } else if (DemandedMask == 0) {
+    // Not demanding any bits from Op.
+    if (Op.getOpcode() != ISD::UNDEF)
+      return TLO.CombineTo(Op, TLO.DAG.getUNDEF(Op.getValueType()));
+    return false;
+  } else if (Depth == 6) {        // Limit search depth.
+    return false;
+  }
+
+  APInt KnownZero2, KnownOne2, KnownZeroOut, KnownOneOut;
+  switch (Op.getOpcode()) {
+  case ISD::Constant:
+    // We know all of the bits for a constant!
+    KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue();
+    KnownZero = ~KnownOne;
+    return false;   // Don't fall through, will infinitely loop.
+  case ISD::AND:
+    // If the RHS is a constant, check to see if the LHS would be zero without
+    // using the bits from the RHS.  Below, we use knowledge about the RHS to
+    // simplify the LHS, here we're using information from the LHS to simplify
+    // the RHS.
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      APInt LHSZero, LHSOne;
+      // Do not increment Depth here; that can cause an infinite loop.
+      TLO.DAG.ComputeMaskedBits(Op.getOperand(0), LHSZero, LHSOne, Depth);
+      // If the LHS already has zeros where RHSC does, this and is dead.
+      if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask))
+        return TLO.CombineTo(Op, Op.getOperand(0));
+      // If any of the set bits in the RHS are known zero on the LHS, shrink
+      // the constant.
+      if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask))
+        return true;
+    }
+
+    if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+                             KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    if (SimplifyDemandedBits(Op.getOperand(0), ~KnownZero & NewMask,
+                             KnownZero2, KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // If all of the demanded bits are known one on one side, return the other.
+    // These bits cannot contribute to the result of the 'and'.
+    if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(0));
+    if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(1));
+    // If all of the demanded bits in the inputs are known zeros, return zero.
+    if ((NewMask & (KnownZero|KnownZero2)) == NewMask)
+      return TLO.CombineTo(Op, TLO.DAG.getConstant(0, Op.getValueType()));
+    // If the RHS is a constant, see if we can simplify it.
+    if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask))
+      return true;
+    // If the operation can be done in a smaller type, do so.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+
+    // Output known-1 bits are only known if set in both the LHS & RHS.
+    KnownOne &= KnownOne2;
+    // Output known-0 are known to be clear if zero in either the LHS | RHS.
+    KnownZero |= KnownZero2;
+    break;
+  case ISD::OR:
+    if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+                             KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    if (SimplifyDemandedBits(Op.getOperand(0), ~KnownOne & NewMask,
+                             KnownZero2, KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'or'.
+    if ((NewMask & ~KnownOne2 & KnownZero) == (~KnownOne2 & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(0));
+    if ((NewMask & ~KnownOne & KnownZero2) == (~KnownOne & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(1));
+    // If all of the potentially set bits on one side are known to be set on
+    // the other side, just use the 'other' side.
+    if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(0));
+    if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+      return TLO.CombineTo(Op, Op.getOperand(1));
+    // If the RHS is a constant, see if we can simplify it.
+    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+      return true;
+    // If the operation can be done in a smaller type, do so.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+
+    // Output known-0 bits are only known if clear in both the LHS & RHS.
+    KnownZero &= KnownZero2;
+    // Output known-1 are known to be set if set in either the LHS | RHS.
+    KnownOne |= KnownOne2;
+    break;
+  case ISD::XOR:
+    if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+                             KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    if (SimplifyDemandedBits(Op.getOperand(0), NewMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'xor'.
+    if ((KnownZero & NewMask) == NewMask)
+      return TLO.CombineTo(Op, Op.getOperand(0));
+    if ((KnownZero2 & NewMask) == NewMask)
+      return TLO.CombineTo(Op, Op.getOperand(1));
+    // If the operation can be done in a smaller type, do so.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+
+    // If all of the unknown bits are known to be zero on one side or the other
+    // (but not both) turn this into an *inclusive* or.
+    //    e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+    if ((NewMask & ~KnownZero & ~KnownZero2) == 0)
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, Op.getValueType(),
+                                               Op.getOperand(0),
+                                               Op.getOperand(1)));
+
+    // Output known-0 bits are known if clear or set in both the LHS & RHS.
+    KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+    // Output known-1 are known to be set if set in only one of the LHS, RHS.
+    KnownOneOut = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+
+    // If all of the demanded bits on one side are known, and all of the set
+    // bits on that side are also known to be set on the other side, turn this
+    // into an AND, as we know the bits will be cleared.
+    //    e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+    // NB: it is okay if more bits are known than are requested
+    if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known on one side 
+      if (KnownOne == KnownOne2) { // set bits are the same on both sides
+        EVT VT = Op.getValueType();
+        SDValue ANDC = TLO.DAG.getConstant(~KnownOne & NewMask, VT);
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT,
+                                                 Op.getOperand(0), ANDC));
+      }
+    }
+
+    // If the RHS is a constant, see if we can simplify it.
+    // for XOR, we prefer to force bits to 1 if they will make a -1.
+    // if we can't force bits, try to shrink constant
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      APInt Expanded = C->getAPIntValue() | (~NewMask);
+      // if we can expand it to have all bits set, do it
+      if (Expanded.isAllOnesValue()) {
+        if (Expanded != C->getAPIntValue()) {
+          EVT VT = Op.getValueType();
+          SDValue New = TLO.DAG.getNode(Op.getOpcode(), dl,VT, Op.getOperand(0),
+                                          TLO.DAG.getConstant(Expanded, VT));
+          return TLO.CombineTo(Op, New);
+        }
+        // if it already has all the bits set, nothing to change
+        // but don't shrink either!
+      } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) {
+        return true;
+      }
+    }
+
+    KnownZero = KnownZeroOut;
+    KnownOne  = KnownOneOut;
+    break;
+  case ISD::SELECT:
+    if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero,
+                             KnownOne, TLO, Depth+1))
+      return true;
+    if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // If the operands are constants, see if we can simplify them.
+    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+      return true;
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  case ISD::SELECT_CC:
+    if (SimplifyDemandedBits(Op.getOperand(3), NewMask, KnownZero,
+                             KnownOne, TLO, Depth+1))
+      return true;
+    if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // If the operands are constants, see if we can simplify them.
+    if (TLO.ShrinkDemandedConstant(Op, NewMask))
+      return true;
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  case ISD::SHL:
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      unsigned ShAmt = SA->getZExtValue();
+      SDValue InOp = Op.getOperand(0);
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        break;
+
+      // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+      // single shift.  We can do this if the bottom bits (which are shifted
+      // out) are never demanded.
+      if (InOp.getOpcode() == ISD::SRL &&
+          isa<ConstantSDNode>(InOp.getOperand(1))) {
+        if (ShAmt && (NewMask & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
+          unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
+          unsigned Opc = ISD::SHL;
+          int Diff = ShAmt-C1;
+          if (Diff < 0) {
+            Diff = -Diff;
+            Opc = ISD::SRL;
+          }
+
+          SDValue NewSA =
+            TLO.DAG.getConstant(Diff, Op.getOperand(1).getValueType());
+          EVT VT = Op.getValueType();
+          return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+                                                   InOp.getOperand(0), NewSA));
+        }
+      }
+
+      if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt),
+                               KnownZero, KnownOne, TLO, Depth+1))
+        return true;
+
+      // Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
+      // are not demanded. This will likely allow the anyext to be folded away.
+      if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) {
+        SDValue InnerOp = InOp.getNode()->getOperand(0);
+        EVT InnerVT = InnerOp.getValueType();
+        unsigned InnerBits = InnerVT.getSizeInBits();
+        if (ShAmt < InnerBits && NewMask.lshr(InnerBits) == 0 &&
+            isTypeDesirableForOp(ISD::SHL, InnerVT)) {
+          EVT ShTy = getShiftAmountTy(InnerVT);
+          if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits()))
+            ShTy = InnerVT;
+          SDValue NarrowShl =
+            TLO.DAG.getNode(ISD::SHL, dl, InnerVT, InnerOp,
+                            TLO.DAG.getConstant(ShAmt, ShTy));
+          return
+            TLO.CombineTo(Op,
+                          TLO.DAG.getNode(ISD::ANY_EXTEND, dl, Op.getValueType(),
+                                          NarrowShl));
+        }
+      }
+
+      KnownZero <<= SA->getZExtValue();
+      KnownOne  <<= SA->getZExtValue();
+      // low bits known zero.
+      KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue());
+    }
+    break;
+  case ISD::SRL:
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      EVT VT = Op.getValueType();
+      unsigned ShAmt = SA->getZExtValue();
+      unsigned VTSize = VT.getSizeInBits();
+      SDValue InOp = Op.getOperand(0);
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        break;
+
+      // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
+      // single shift.  We can do this if the top bits (which are shifted out)
+      // are never demanded.
+      if (InOp.getOpcode() == ISD::SHL &&
+          isa<ConstantSDNode>(InOp.getOperand(1))) {
+        if (ShAmt && (NewMask & APInt::getHighBitsSet(VTSize, ShAmt)) == 0) {
+          unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
+          unsigned Opc = ISD::SRL;
+          int Diff = ShAmt-C1;
+          if (Diff < 0) {
+            Diff = -Diff;
+            Opc = ISD::SHL;
+          }
+
+          SDValue NewSA =
+            TLO.DAG.getConstant(Diff, Op.getOperand(1).getValueType());
+          return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+                                                   InOp.getOperand(0), NewSA));
+        }
+      }
+
+      // Compute the new bits that are at the top now.
+      if (SimplifyDemandedBits(InOp, (NewMask << ShAmt),
+                               KnownZero, KnownOne, TLO, Depth+1))
+        return true;
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero = KnownZero.lshr(ShAmt);
+      KnownOne  = KnownOne.lshr(ShAmt);
+
+      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+      KnownZero |= HighBits;  // High bits known zero.
+    }
+    break;
+  case ISD::SRA:
+    // If this is an arithmetic shift right and only the low-bit is set, we can
+    // always convert this into a logical shr, even if the shift amount is
+    // variable.  The low bit of the shift cannot be an input sign bit unless
+    // the shift amount is >= the size of the datatype, which is undefined.
+    if (NewMask == 1)
+      return TLO.CombineTo(Op,
+                           TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(),
+                                           Op.getOperand(0), Op.getOperand(1)));
+
+    if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      EVT VT = Op.getValueType();
+      unsigned ShAmt = SA->getZExtValue();
+
+      // If the shift count is an invalid immediate, don't do anything.
+      if (ShAmt >= BitWidth)
+        break;
+
+      APInt InDemandedMask = (NewMask << ShAmt);
+
+      // If any of the demanded bits are produced by the sign extension, we also
+      // demand the input sign bit.
+      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+      if (HighBits.intersects(NewMask))
+        InDemandedMask |= APInt::getSignBit(VT.getScalarType().getSizeInBits());
+
+      if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask,
+                               KnownZero, KnownOne, TLO, Depth+1))
+        return true;
+      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+      KnownZero = KnownZero.lshr(ShAmt);
+      KnownOne  = KnownOne.lshr(ShAmt);
+
+      // Handle the sign bit, adjusted to where it is now in the mask.
+      APInt SignBit = APInt::getSignBit(BitWidth).lshr(ShAmt);
+
+      // If the input sign bit is known to be zero, or if none of the top bits
+      // are demanded, turn this into an unsigned shift right.
+      if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) {
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT,
+                                                 Op.getOperand(0),
+                                                 Op.getOperand(1)));
+      } else if (KnownOne.intersects(SignBit)) { // New bits are known one.
+        KnownOne |= HighBits;
+      }
+    }
+    break;
+  case ISD::SIGN_EXTEND_INREG: {
+    EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+
+    APInt MsbMask = APInt::getHighBitsSet(BitWidth, 1);
+    // If we only care about the highest bit, don't bother shifting right.
+    if (MsbMask == DemandedMask) {
+      unsigned ShAmt = ExVT.getScalarType().getSizeInBits();
+      SDValue InOp = Op.getOperand(0);
+
+      // Compute the correct shift amount type, which must be getShiftAmountTy
+      // for scalar types after legalization.
+      EVT ShiftAmtTy = Op.getValueType();
+      if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
+        ShiftAmtTy = getShiftAmountTy(ShiftAmtTy);
+
+      SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ShAmt, ShiftAmtTy);
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
+                                            Op.getValueType(), InOp, ShiftAmt));
+    }
+
+    // Sign extension.  Compute the demanded bits in the result that are not
+    // present in the input.
+    APInt NewBits =
+      APInt::getHighBitsSet(BitWidth,
+                            BitWidth - ExVT.getScalarType().getSizeInBits());
+
+    // If none of the extended bits are demanded, eliminate the sextinreg.
+    if ((NewBits & NewMask) == 0)
+      return TLO.CombineTo(Op, Op.getOperand(0));
+
+    APInt InSignBit =
+      APInt::getSignBit(ExVT.getScalarType().getSizeInBits()).zext(BitWidth);
+    APInt InputDemandedBits =
+      APInt::getLowBitsSet(BitWidth,
+                           ExVT.getScalarType().getSizeInBits()) &
+      NewMask;
+
+    // Since the sign extended bits are demanded, we know that the sign
+    // bit is demanded.
+    InputDemandedBits |= InSignBit;
+
+    if (SimplifyDemandedBits(Op.getOperand(0), InputDemandedBits,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+    // If the sign bit of the input is known set or clear, then we know the
+    // top bits of the result.
+
+    // If the input sign bit is known zero, convert this into a zero extension.
+    if (KnownZero.intersects(InSignBit))
+      return TLO.CombineTo(Op,
+                          TLO.DAG.getZeroExtendInReg(Op.getOperand(0),dl,ExVT));
+
+    if (KnownOne.intersects(InSignBit)) {    // Input sign bit known set
+      KnownOne |= NewBits;
+      KnownZero &= ~NewBits;
+    } else {                       // Input sign bit unknown
+      KnownZero &= ~NewBits;
+      KnownOne &= ~NewBits;
+    }
+    break;
+  }
+  case ISD::ZERO_EXTEND: {
+    unsigned OperandBitWidth =
+      Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    APInt InMask = NewMask.trunc(OperandBitWidth);
+
+    // If none of the top bits are demanded, convert this into an any_extend.
+    APInt NewBits =
+      APInt::getHighBitsSet(BitWidth, BitWidth - OperandBitWidth) & NewMask;
+    if (!NewBits.intersects(NewMask))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
+                                               Op.getValueType(),
+                                               Op.getOperand(0)));
+
+    if (SimplifyDemandedBits(Op.getOperand(0), InMask,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+    KnownZero |= NewBits;
+    break;
+  }
+  case ISD::SIGN_EXTEND: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarType().getSizeInBits();
+    APInt InMask    = APInt::getLowBitsSet(BitWidth, InBits);
+    APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits);
+    APInt NewBits   = ~InMask & NewMask;
+
+    // If none of the top bits are demanded, convert this into an any_extend.
+    if (NewBits == 0)
+      return TLO.CombineTo(Op,TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
+                                              Op.getValueType(),
+                                              Op.getOperand(0)));
+
+    // Since some of the sign extended bits are demanded, we know that the sign
+    // bit is demanded.
+    APInt InDemandedBits = InMask & NewMask;
+    InDemandedBits |= InSignBit;
+    InDemandedBits = InDemandedBits.trunc(InBits);
+
+    if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, KnownZero,
+                             KnownOne, TLO, Depth+1))
+      return true;
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+
+    // If the sign bit is known zero, convert this to a zero extend.
+    if (KnownZero.intersects(InSignBit))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl,
+                                               Op.getValueType(),
+                                               Op.getOperand(0)));
+
+    // If the sign bit is known one, the top bits match.
+    if (KnownOne.intersects(InSignBit)) {
+      KnownOne |= NewBits;
+      assert((KnownZero & NewBits) == 0);
+    } else {   // Otherwise, top bits aren't known.
+      assert((KnownOne & NewBits) == 0);
+      assert((KnownZero & NewBits) == 0);
+    }
+    break;
+  }
+  case ISD::ANY_EXTEND: {
+    unsigned OperandBitWidth =
+      Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    APInt InMask = NewMask.trunc(OperandBitWidth);
+    if (SimplifyDemandedBits(Op.getOperand(0), InMask,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+    break;
+  }
+  case ISD::TRUNCATE: {
+    // Simplify the input, using demanded bit information, and compute the known
+    // zero/one bits live out.
+    unsigned OperandBitWidth =
+      Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    APInt TruncMask = NewMask.zext(OperandBitWidth);
+    if (SimplifyDemandedBits(Op.getOperand(0), TruncMask,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    KnownZero = KnownZero.trunc(BitWidth);
+    KnownOne = KnownOne.trunc(BitWidth);
+
+    // If the input is only used by this truncate, see if we can shrink it based
+    // on the known demanded bits.
+    if (Op.getOperand(0).getNode()->hasOneUse()) {
+      SDValue In = Op.getOperand(0);
+      switch (In.getOpcode()) {
+      default: break;
+      case ISD::SRL:
+        // Shrink SRL by a constant if none of the high bits shifted in are
+        // demanded.
+        if (TLO.LegalTypes() &&
+            !isTypeDesirableForOp(ISD::SRL, Op.getValueType()))
+          // Do not turn (vt1 truncate (vt2 srl)) into (vt1 srl) if vt1 is
+          // undesirable.
+          break;
+        ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
+        if (!ShAmt)
+          break;
+        SDValue Shift = In.getOperand(1);
+        if (TLO.LegalTypes()) {
+          uint64_t ShVal = ShAmt->getZExtValue();
+          Shift =
+            TLO.DAG.getConstant(ShVal, getShiftAmountTy(Op.getValueType()));
+        }
+
+        APInt HighBits = APInt::getHighBitsSet(OperandBitWidth,
+                                               OperandBitWidth - BitWidth);
+        HighBits = HighBits.lshr(ShAmt->getZExtValue()).trunc(BitWidth);
+
+        if (ShAmt->getZExtValue() < BitWidth && !(HighBits & NewMask)) {
+          // None of the shifted in bits are needed.  Add a truncate of the
+          // shift input, then shift it.
+          SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl,
+                                             Op.getValueType(),
+                                             In.getOperand(0));
+          return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl,
+                                                   Op.getValueType(),
+                                                   NewTrunc,
+                                                   Shift));
+        }
+        break;
+      }
+    }
+
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    break;
+  }
+  case ISD::AssertZext: {
+    // AssertZext demands all of the high bits, plus any of the low bits
+    // demanded by its users.
+    EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+    APInt InMask = APInt::getLowBitsSet(BitWidth,
+                                        VT.getSizeInBits());
+    if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | NewMask,
+                             KnownZero, KnownOne, TLO, Depth+1))
+      return true;
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+    KnownZero |= ~InMask & NewMask;
+    break;
+  }
+  case ISD::BITCAST:
+    // If this is an FP->Int bitcast and if the sign bit is the only
+    // thing demanded, turn this into a FGETSIGN.
+    if (!TLO.LegalOperations() &&
+        !Op.getValueType().isVector() &&
+        !Op.getOperand(0).getValueType().isVector() &&
+        NewMask == APInt::getSignBit(Op.getValueType().getSizeInBits()) &&
+        Op.getOperand(0).getValueType().isFloatingPoint()) {
+      bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType());
+      bool i32Legal  = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
+      if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple()) {
+        EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32;
+        // Make a FGETSIGN + SHL to move the sign bit into the appropriate
+        // place.  We expect the SHL to be eliminated by other optimizations.
+        SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Op.getOperand(0));
+        unsigned OpVTSizeInBits = Op.getValueType().getSizeInBits();
+        if (!OpVTLegal && OpVTSizeInBits > 32)
+          Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), Sign);
+        unsigned ShVal = Op.getValueType().getSizeInBits()-1;
+        SDValue ShAmt = TLO.DAG.getConstant(ShVal, Op.getValueType());
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
+                                                 Op.getValueType(),
+                                                 Sign, ShAmt));
+      }
+    }
+    break;
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::SUB: {
+    // Add, Sub, and Mul don't demand any bits in positions beyond that
+    // of the highest bit demanded of them.
+    APInt LoMask = APInt::getLowBitsSet(BitWidth,
+                                        BitWidth - NewMask.countLeadingZeros());
+    if (SimplifyDemandedBits(Op.getOperand(0), LoMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    if (SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    // See if the operation should be performed at a smaller bit width.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+  }
+  // FALL THROUGH
+  default:
+    // Just use ComputeMaskedBits to compute output bits.
+    TLO.DAG.ComputeMaskedBits(Op, KnownZero, KnownOne, Depth);
+    break;
+  }
+
+  // If we know the value of all of the demanded bits, return this as a
+  // constant.
+  if ((NewMask & (KnownZero|KnownOne)) == NewMask)
+    return TLO.CombineTo(Op, TLO.DAG.getConstant(KnownOne, Op.getValueType()));
+
+  return false;
+}
+
+/// computeMaskedBitsForTargetNode - Determine which of the bits specified
+/// in Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets.
+void TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                    APInt &KnownZero,
+                                                    APInt &KnownOne,
+                                                    const SelectionDAG &DAG,
+                                                    unsigned Depth) const {
+  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0);
+}
+
+/// ComputeNumSignBitsForTargetNode - This method can be implemented by
+/// targets that want to expose additional information about sign bits to the
+/// DAG Combiner.
+unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                         unsigned Depth) const {
+  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+         "Should use ComputeNumSignBits if you don't know whether Op"
+         " is a target node!");
+  return 1;
+}
+
+/// ValueHasExactlyOneBitSet - Test if the given value is known to have exactly
+/// one bit set. This differs from ComputeMaskedBits in that it doesn't need to
+/// determine which bit is set.
+///
+static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) {
+  // A left-shift of a constant one will have exactly one bit set, because
+  // shifting the bit off the end is undefined.
+  if (Val.getOpcode() == ISD::SHL)
+    if (ConstantSDNode *C =
+         dyn_cast<ConstantSDNode>(Val.getNode()->getOperand(0)))
+      if (C->getAPIntValue() == 1)
+        return true;
+
+  // Similarly, a right-shift of a constant sign-bit will have exactly
+  // one bit set.
+  if (Val.getOpcode() == ISD::SRL)
+    if (ConstantSDNode *C =
+         dyn_cast<ConstantSDNode>(Val.getNode()->getOperand(0)))
+      if (C->getAPIntValue().isSignBit())
+        return true;
+
+  // More could be done here, though the above checks are enough
+  // to handle some common cases.
+
+  // Fall back to ComputeMaskedBits to catch other known cases.
+  EVT OpVT = Val.getValueType();
+  unsigned BitWidth = OpVT.getScalarType().getSizeInBits();
+  APInt KnownZero, KnownOne;
+  DAG.ComputeMaskedBits(Val, KnownZero, KnownOne);
+  return (KnownZero.countPopulation() == BitWidth - 1) &&
+         (KnownOne.countPopulation() == 1);
+}
+
+/// SimplifySetCC - Try to simplify a setcc built with the specified operands
+/// and cc. If it is unable to simplify it, return a null SDValue.
+SDValue
+TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
+                              ISD::CondCode Cond, bool foldBooleans,
+                              DAGCombinerInfo &DCI, DebugLoc dl) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  // These setcc operations always fold.
+  switch (Cond) {
+  default: break;
+  case ISD::SETFALSE:
+  case ISD::SETFALSE2: return DAG.getConstant(0, VT);
+  case ISD::SETTRUE:
+  case ISD::SETTRUE2:  return DAG.getConstant(1, VT);
+  }
+
+  // Ensure that the constant occurs on the RHS, and fold constant
+  // comparisons.
+  if (isa<ConstantSDNode>(N0.getNode()))
+    return DAG.getSetCC(dl, VT, N1, N0, ISD::getSetCCSwappedOperands(Cond));
+
+  if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+    const APInt &C1 = N1C->getAPIntValue();
+
+    // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
+    // equality comparison, then we're just comparing whether X itself is
+    // zero.
+    if (N0.getOpcode() == ISD::SRL && (C1 == 0 || C1 == 1) &&
+        N0.getOperand(0).getOpcode() == ISD::CTLZ &&
+        N0.getOperand(1).getOpcode() == ISD::Constant) {
+      const APInt &ShAmt
+        = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+      if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+          ShAmt == Log2_32(N0.getValueType().getSizeInBits())) {
+        if ((C1 == 0) == (Cond == ISD::SETEQ)) {
+          // (srl (ctlz x), 5) == 0  -> X != 0
+          // (srl (ctlz x), 5) != 1  -> X != 0
+          Cond = ISD::SETNE;
+        } else {
+          // (srl (ctlz x), 5) != 0  -> X == 0
+          // (srl (ctlz x), 5) == 1  -> X == 0
+          Cond = ISD::SETEQ;
+        }
+        SDValue Zero = DAG.getConstant(0, N0.getValueType());
+        return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0),
+                            Zero, Cond);
+      }
+    }
+
+    SDValue CTPOP = N0;
+    // Look through truncs that don't change the value of a ctpop.
+    if (N0.hasOneUse() && N0.getOpcode() == ISD::TRUNCATE)
+      CTPOP = N0.getOperand(0);
+
+    if (CTPOP.hasOneUse() && CTPOP.getOpcode() == ISD::CTPOP &&
+        (N0 == CTPOP || N0.getValueType().getSizeInBits() >
+                        Log2_32_Ceil(CTPOP.getValueType().getSizeInBits()))) {
+      EVT CTVT = CTPOP.getValueType();
+      SDValue CTOp = CTPOP.getOperand(0);
+
+      // (ctpop x) u< 2 -> (x & x-1) == 0
+      // (ctpop x) u> 1 -> (x & x-1) != 0
+      if ((Cond == ISD::SETULT && C1 == 2) || (Cond == ISD::SETUGT && C1 == 1)){
+        SDValue Sub = DAG.getNode(ISD::SUB, dl, CTVT, CTOp,
+                                  DAG.getConstant(1, CTVT));
+        SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Sub);
+        ISD::CondCode CC = Cond == ISD::SETULT ? ISD::SETEQ : ISD::SETNE;
+        return DAG.getSetCC(dl, VT, And, DAG.getConstant(0, CTVT), CC);
+      }
+
+      // TODO: (ctpop x) == 1 -> x && (x & x-1) == 0 iff ctpop is illegal.
+    }
+
+    // (zext x) == C --> x == (trunc C)
+    if (DCI.isBeforeLegalize() && N0->hasOneUse() &&
+        (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+      unsigned MinBits = N0.getValueSizeInBits();
+      SDValue PreZExt;
+      if (N0->getOpcode() == ISD::ZERO_EXTEND) {
+        // ZExt
+        MinBits = N0->getOperand(0).getValueSizeInBits();
+        PreZExt = N0->getOperand(0);
+      } else if (N0->getOpcode() == ISD::AND) {
+        // DAGCombine turns costly ZExts into ANDs
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0->getOperand(1)))
+          if ((C->getAPIntValue()+1).isPowerOf2()) {
+            MinBits = C->getAPIntValue().countTrailingOnes();
+            PreZExt = N0->getOperand(0);
+          }
+      } else if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0)) {
+        // ZEXTLOAD
+        if (LN0->getExtensionType() == ISD::ZEXTLOAD) {
+          MinBits = LN0->getMemoryVT().getSizeInBits();
+          PreZExt = N0;
+        }
+      }
+
+      // Make sure we're not losing bits from the constant.
+      if (MinBits < C1.getBitWidth() && MinBits > C1.getActiveBits()) {
+        EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
+        if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
+          // Will get folded away.
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreZExt);
+          SDValue C = DAG.getConstant(C1.trunc(MinBits), MinVT);
+          return DAG.getSetCC(dl, VT, Trunc, C, Cond);
+        }
+      }
+    }
+
+    // If the LHS is '(and load, const)', the RHS is 0,
+    // the test is for equality or unsigned, and all 1 bits of the const are
+    // in the same partial word, see if we can shorten the load.
+    if (DCI.isBeforeLegalize() &&
+        N0.getOpcode() == ISD::AND && C1 == 0 &&
+        N0.getNode()->hasOneUse() &&
+        isa<LoadSDNode>(N0.getOperand(0)) &&
+        N0.getOperand(0).getNode()->hasOneUse() &&
+        isa<ConstantSDNode>(N0.getOperand(1))) {
+      LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
+      APInt bestMask;
+      unsigned bestWidth = 0, bestOffset = 0;
+      if (!Lod->isVolatile() && Lod->isUnindexed()) {
+        unsigned origWidth = N0.getValueType().getSizeInBits();
+        unsigned maskWidth = origWidth;
+        // We can narrow (e.g.) 16-bit extending loads on 32-bit target to
+        // 8 bits, but have to be careful...
+        if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
+          origWidth = Lod->getMemoryVT().getSizeInBits();
+        const APInt &Mask =
+          cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+        for (unsigned width = origWidth / 2; width>=8; width /= 2) {
+          APInt newMask = APInt::getLowBitsSet(maskWidth, width);
+          for (unsigned offset=0; offset<origWidth/width; offset++) {
+            if ((newMask & Mask) == Mask) {
+              if (!getDataLayout()->isLittleEndian())
+                bestOffset = (origWidth/width - offset - 1) * (width/8);
+              else
+                bestOffset = (uint64_t)offset * (width/8);
+              bestMask = Mask.lshr(offset * (width/8) * 8);
+              bestWidth = width;
+              break;
+            }
+            newMask = newMask << width;
+          }
+        }
+      }
+      if (bestWidth) {
+        EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
+        if (newVT.isRound()) {
+          EVT PtrType = Lod->getOperand(1).getValueType();
+          SDValue Ptr = Lod->getBasePtr();
+          if (bestOffset != 0)
+            Ptr = DAG.getNode(ISD::ADD, dl, PtrType, Lod->getBasePtr(),
+                              DAG.getConstant(bestOffset, PtrType));
+          unsigned NewAlign = MinAlign(Lod->getAlignment(), bestOffset);
+          SDValue NewLoad = DAG.getLoad(newVT, dl, Lod->getChain(), Ptr,
+                                Lod->getPointerInfo().getWithOffset(bestOffset),
+                                        false, false, false, NewAlign);
+          return DAG.getSetCC(dl, VT,
+                              DAG.getNode(ISD::AND, dl, newVT, NewLoad,
+                                      DAG.getConstant(bestMask.trunc(bestWidth),
+                                                      newVT)),
+                              DAG.getConstant(0LL, newVT), Cond);
+        }
+      }
+    }
+
+    // If the LHS is a ZERO_EXTEND, perform the comparison on the input.
+    if (N0.getOpcode() == ISD::ZERO_EXTEND) {
+      unsigned InSize = N0.getOperand(0).getValueType().getSizeInBits();
+
+      // If the comparison constant has bits in the upper part, the
+      // zero-extended value could never match.
+      if (C1.intersects(APInt::getHighBitsSet(C1.getBitWidth(),
+                                              C1.getBitWidth() - InSize))) {
+        switch (Cond) {
+        case ISD::SETUGT:
+        case ISD::SETUGE:
+        case ISD::SETEQ: return DAG.getConstant(0, VT);
+        case ISD::SETULT:
+        case ISD::SETULE:
+        case ISD::SETNE: return DAG.getConstant(1, VT);
+        case ISD::SETGT:
+        case ISD::SETGE:
+          // True if the sign bit of C1 is set.
+          return DAG.getConstant(C1.isNegative(), VT);
+        case ISD::SETLT:
+        case ISD::SETLE:
+          // True if the sign bit of C1 isn't set.
+          return DAG.getConstant(C1.isNonNegative(), VT);
+        default:
+          break;
+        }
+      }
+
+      // Otherwise, we can perform the comparison with the low bits.
+      switch (Cond) {
+      case ISD::SETEQ:
+      case ISD::SETNE:
+      case ISD::SETUGT:
+      case ISD::SETUGE:
+      case ISD::SETULT:
+      case ISD::SETULE: {
+        EVT newVT = N0.getOperand(0).getValueType();
+        if (DCI.isBeforeLegalizeOps() ||
+            (isOperationLegal(ISD::SETCC, newVT) &&
+             getCondCodeAction(Cond, newVT.getSimpleVT())==Legal))
+          return DAG.getSetCC(dl, VT, N0.getOperand(0),
+                              DAG.getConstant(C1.trunc(InSize), newVT),
+                              Cond);
+        break;
+      }
+      default:
+        break;   // todo, be more careful with signed comparisons
+      }
+    } else if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+               (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+      EVT ExtSrcTy = cast<VTSDNode>(N0.getOperand(1))->getVT();
+      unsigned ExtSrcTyBits = ExtSrcTy.getSizeInBits();
+      EVT ExtDstTy = N0.getValueType();
+      unsigned ExtDstTyBits = ExtDstTy.getSizeInBits();
+
+      // If the constant doesn't fit into the number of bits for the source of
+      // the sign extension, it is impossible for both sides to be equal.
+      if (C1.getMinSignedBits() > ExtSrcTyBits)
+        return DAG.getConstant(Cond == ISD::SETNE, VT);
+
+      SDValue ZextOp;
+      EVT Op0Ty = N0.getOperand(0).getValueType();
+      if (Op0Ty == ExtSrcTy) {
+        ZextOp = N0.getOperand(0);
+      } else {
+        APInt Imm = APInt::getLowBitsSet(ExtDstTyBits, ExtSrcTyBits);
+        ZextOp = DAG.getNode(ISD::AND, dl, Op0Ty, N0.getOperand(0),
+                              DAG.getConstant(Imm, Op0Ty));
+      }
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(ZextOp.getNode());
+      // Otherwise, make this a use of a zext.
+      return DAG.getSetCC(dl, VT, ZextOp,
+                          DAG.getConstant(C1 & APInt::getLowBitsSet(
+                                                              ExtDstTyBits,
+                                                              ExtSrcTyBits),
+                                          ExtDstTy),
+                          Cond);
+    } else if ((N1C->isNullValue() || N1C->getAPIntValue() == 1) &&
+                (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+      // SETCC (SETCC), [0|1], [EQ|NE]  -> SETCC
+      if (N0.getOpcode() == ISD::SETCC &&
+          isTypeLegal(VT) && VT.bitsLE(N0.getValueType())) {
+        bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (N1C->getAPIntValue() != 1);
+        if (TrueWhenTrue)
+          return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
+        // Invert the condition.
+        ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+        CC = ISD::getSetCCInverse(CC,
+                                  N0.getOperand(0).getValueType().isInteger());
+        return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+      }
+
+      if ((N0.getOpcode() == ISD::XOR ||
+           (N0.getOpcode() == ISD::AND &&
+            N0.getOperand(0).getOpcode() == ISD::XOR &&
+            N0.getOperand(1) == N0.getOperand(0).getOperand(1))) &&
+          isa<ConstantSDNode>(N0.getOperand(1)) &&
+          cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue() == 1) {
+        // If this is (X^1) == 0/1, swap the RHS and eliminate the xor.  We
+        // can only do this if the top bits are known zero.
+        unsigned BitWidth = N0.getValueSizeInBits();
+        if (DAG.MaskedValueIsZero(N0,
+                                  APInt::getHighBitsSet(BitWidth,
+                                                        BitWidth-1))) {
+          // Okay, get the un-inverted input value.
+          SDValue Val;
+          if (N0.getOpcode() == ISD::XOR)
+            Val = N0.getOperand(0);
+          else {
+            assert(N0.getOpcode() == ISD::AND &&
+                    N0.getOperand(0).getOpcode() == ISD::XOR);
+            // ((X^1)&1)^1 -> X & 1
+            Val = DAG.getNode(ISD::AND, dl, N0.getValueType(),
+                              N0.getOperand(0).getOperand(0),
+                              N0.getOperand(1));
+          }
+
+          return DAG.getSetCC(dl, VT, Val, N1,
+                              Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+        }
+      } else if (N1C->getAPIntValue() == 1 &&
+                 (VT == MVT::i1 ||
+                  getBooleanContents(false) == ZeroOrOneBooleanContent)) {
+        SDValue Op0 = N0;
+        if (Op0.getOpcode() == ISD::TRUNCATE)
+          Op0 = Op0.getOperand(0);
+
+        if ((Op0.getOpcode() == ISD::XOR) &&
+            Op0.getOperand(0).getOpcode() == ISD::SETCC &&
+            Op0.getOperand(1).getOpcode() == ISD::SETCC) {
+          // (xor (setcc), (setcc)) == / != 1 -> (setcc) != / == (setcc)
+          Cond = (Cond == ISD::SETEQ) ? ISD::SETNE : ISD::SETEQ;
+          return DAG.getSetCC(dl, VT, Op0.getOperand(0), Op0.getOperand(1),
+                              Cond);
+        }
+        if (Op0.getOpcode() == ISD::AND &&
+            isa<ConstantSDNode>(Op0.getOperand(1)) &&
+            cast<ConstantSDNode>(Op0.getOperand(1))->getAPIntValue() == 1) {
+          // If this is (X&1) == / != 1, normalize it to (X&1) != / == 0.
+          if (Op0.getValueType().bitsGT(VT))
+            Op0 = DAG.getNode(ISD::AND, dl, VT,
+                          DAG.getNode(ISD::TRUNCATE, dl, VT, Op0.getOperand(0)),
+                          DAG.getConstant(1, VT));
+          else if (Op0.getValueType().bitsLT(VT))
+            Op0 = DAG.getNode(ISD::AND, dl, VT,
+                        DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op0.getOperand(0)),
+                        DAG.getConstant(1, VT));
+
+          return DAG.getSetCC(dl, VT, Op0,
+                              DAG.getConstant(0, Op0.getValueType()),
+                              Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+        }
+        if (Op0.getOpcode() == ISD::AssertZext &&
+            cast<VTSDNode>(Op0.getOperand(1))->getVT() == MVT::i1)
+          return DAG.getSetCC(dl, VT, Op0,
+                              DAG.getConstant(0, Op0.getValueType()),
+                              Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+      }
+    }
+
+    APInt MinVal, MaxVal;
+    unsigned OperandBitSize = N1C->getValueType(0).getSizeInBits();
+    if (ISD::isSignedIntSetCC(Cond)) {
+      MinVal = APInt::getSignedMinValue(OperandBitSize);
+      MaxVal = APInt::getSignedMaxValue(OperandBitSize);
+    } else {
+      MinVal = APInt::getMinValue(OperandBitSize);
+      MaxVal = APInt::getMaxValue(OperandBitSize);
+    }
+
+    // Canonicalize GE/LE comparisons to use GT/LT comparisons.
+    if (Cond == ISD::SETGE || Cond == ISD::SETUGE) {
+      if (C1 == MinVal) return DAG.getConstant(1, VT);   // X >= MIN --> true
+      // X >= C0 --> X > (C0-1)
+      return DAG.getSetCC(dl, VT, N0,
+                          DAG.getConstant(C1-1, N1.getValueType()),
+                          (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT);
+    }
+
+    if (Cond == ISD::SETLE || Cond == ISD::SETULE) {
+      if (C1 == MaxVal) return DAG.getConstant(1, VT);   // X <= MAX --> true
+      // X <= C0 --> X < (C0+1)
+      return DAG.getSetCC(dl, VT, N0,
+                          DAG.getConstant(C1+1, N1.getValueType()),
+                          (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT);
+    }
+
+    if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal)
+      return DAG.getConstant(0, VT);      // X < MIN --> false
+    if ((Cond == ISD::SETGE || Cond == ISD::SETUGE) && C1 == MinVal)
+      return DAG.getConstant(1, VT);      // X >= MIN --> true
+    if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal)
+      return DAG.getConstant(0, VT);      // X > MAX --> false
+    if ((Cond == ISD::SETLE || Cond == ISD::SETULE) && C1 == MaxVal)
+      return DAG.getConstant(1, VT);      // X <= MAX --> true
+
+    // Canonicalize setgt X, Min --> setne X, Min
+    if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MinVal)
+      return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+    // Canonicalize setlt X, Max --> setne X, Max
+    if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MaxVal)
+      return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+
+    // If we have setult X, 1, turn it into seteq X, 0
+    if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal+1)
+      return DAG.getSetCC(dl, VT, N0,
+                          DAG.getConstant(MinVal, N0.getValueType()),
+                          ISD::SETEQ);
+    // If we have setugt X, Max-1, turn it into seteq X, Max
+    if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal-1)
+      return DAG.getSetCC(dl, VT, N0,
+                          DAG.getConstant(MaxVal, N0.getValueType()),
+                          ISD::SETEQ);
+
+    // If we have "setcc X, C0", check to see if we can shrink the immediate
+    // by changing cc.
+
+    // SETUGT X, SINTMAX  -> SETLT X, 0
+    if (Cond == ISD::SETUGT &&
+        C1 == APInt::getSignedMaxValue(OperandBitSize))
+      return DAG.getSetCC(dl, VT, N0,
+                          DAG.getConstant(0, N1.getValueType()),
+                          ISD::SETLT);
+
+    // SETULT X, SINTMIN  -> SETGT X, -1
+    if (Cond == ISD::SETULT &&
+        C1 == APInt::getSignedMinValue(OperandBitSize)) {
+      SDValue ConstMinusOne =
+          DAG.getConstant(APInt::getAllOnesValue(OperandBitSize),
+                          N1.getValueType());
+      return DAG.getSetCC(dl, VT, N0, ConstMinusOne, ISD::SETGT);
+    }
+
+    // Fold bit comparisons when we can.
+    if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+        (VT == N0.getValueType() ||
+         (isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) &&
+        N0.getOpcode() == ISD::AND)
+      if (ConstantSDNode *AndRHS =
+                  dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+        EVT ShiftTy = DCI.isBeforeLegalizeOps() ?
+          getPointerTy() : getShiftAmountTy(N0.getValueType());
+        if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0  -->  (X & 8) >> 3
+          // Perform the xform if the AND RHS is a single bit.
+          if (AndRHS->getAPIntValue().isPowerOf2()) {
+            return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                              DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
+                   DAG.getConstant(AndRHS->getAPIntValue().logBase2(), ShiftTy)));
+          }
+        } else if (Cond == ISD::SETEQ && C1 == AndRHS->getAPIntValue()) {
+          // (X & 8) == 8  -->  (X & 8) >> 3
+          // Perform the xform if C1 is a single bit.
+          if (C1.isPowerOf2()) {
+            return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                               DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
+                                      DAG.getConstant(C1.logBase2(), ShiftTy)));
+          }
+        }
+      }
+
+    if (C1.getMinSignedBits() <= 64 &&
+        !isLegalICmpImmediate(C1.getSExtValue())) {
+      // (X & -256) == 256 -> (X >> 8) == 1
+      if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+          N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+        if (ConstantSDNode *AndRHS =
+            dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+          const APInt &AndRHSC = AndRHS->getAPIntValue();
+          if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) {
+            unsigned ShiftBits = AndRHSC.countTrailingZeros();
+            EVT ShiftTy = DCI.isBeforeLegalizeOps() ?
+              getPointerTy() : getShiftAmountTy(N0.getValueType());
+            EVT CmpTy = N0.getValueType();
+            SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0),
+                                        DAG.getConstant(ShiftBits, ShiftTy));
+            SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), CmpTy);
+            return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond);
+          }
+        }
+      } else if (Cond == ISD::SETULT || Cond == ISD::SETUGE ||
+                 Cond == ISD::SETULE || Cond == ISD::SETUGT) {
+        bool AdjOne = (Cond == ISD::SETULE || Cond == ISD::SETUGT);
+        // X <  0x100000000 -> (X >> 32) <  1
+        // X >= 0x100000000 -> (X >> 32) >= 1
+        // X <= 0x0ffffffff -> (X >> 32) <  1
+        // X >  0x0ffffffff -> (X >> 32) >= 1
+        unsigned ShiftBits;
+        APInt NewC = C1;
+        ISD::CondCode NewCond = Cond;
+        if (AdjOne) {
+          ShiftBits = C1.countTrailingOnes();
+          NewC = NewC + 1;
+          NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+        } else {
+          ShiftBits = C1.countTrailingZeros();
+        }
+        NewC = NewC.lshr(ShiftBits);
+        if (ShiftBits && isLegalICmpImmediate(NewC.getSExtValue())) {
+          EVT ShiftTy = DCI.isBeforeLegalizeOps() ?
+            getPointerTy() : getShiftAmountTy(N0.getValueType());
+          EVT CmpTy = N0.getValueType();
+          SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0,
+                                      DAG.getConstant(ShiftBits, ShiftTy));
+          SDValue CmpRHS = DAG.getConstant(NewC, CmpTy);
+          return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond);
+        }
+      }
+    }
+  }
+
+  if (isa<ConstantFPSDNode>(N0.getNode())) {
+    // Constant fold or commute setcc.
+    SDValue O = DAG.FoldSetCC(VT, N0, N1, Cond, dl);
+    if (O.getNode()) return O;
+  } else if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1.getNode())) {
+    // If the RHS of an FP comparison is a constant, simplify it away in
+    // some cases.
+    if (CFP->getValueAPF().isNaN()) {
+      // If an operand is known to be a nan, we can fold it.
+      switch (ISD::getUnorderedFlavor(Cond)) {
+      default: llvm_unreachable("Unknown flavor!");
+      case 0:  // Known false.
+        return DAG.getConstant(0, VT);
+      case 1:  // Known true.
+        return DAG.getConstant(1, VT);
+      case 2:  // Undefined.
+        return DAG.getUNDEF(VT);
+      }
+    }
+
+    // Otherwise, we know the RHS is not a NaN.  Simplify the node to drop the
+    // constant if knowing that the operand is non-nan is enough.  We prefer to
+    // have SETO(x,x) instead of SETO(x, 0.0) because this avoids having to
+    // materialize 0.0.
+    if (Cond == ISD::SETO || Cond == ISD::SETUO)
+      return DAG.getSetCC(dl, VT, N0, N0, Cond);
+
+    // If the condition is not legal, see if we can find an equivalent one
+    // which is legal.
+    if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) {
+      // If the comparison was an awkward floating-point == or != and one of
+      // the comparison operands is infinity or negative infinity, convert the
+      // condition to a less-awkward <= or >=.
+      if (CFP->getValueAPF().isInfinity()) {
+        if (CFP->getValueAPF().isNegative()) {
+          if (Cond == ISD::SETOEQ &&
+              isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLE);
+          if (Cond == ISD::SETUEQ &&
+              isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULE);
+          if (Cond == ISD::SETUNE &&
+              isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGT);
+          if (Cond == ISD::SETONE &&
+              isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGT);
+        } else {
+          if (Cond == ISD::SETOEQ &&
+              isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGE);
+          if (Cond == ISD::SETUEQ &&
+              isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGE);
+          if (Cond == ISD::SETUNE &&
+              isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULT);
+          if (Cond == ISD::SETONE &&
+              isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
+            return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLT);
+        }
+      }
+    }
+  }
+
+  if (N0 == N1) {
+    // The sext(setcc()) => setcc() optimization relies on the appropriate
+    // constant being emitted.
+    uint64_t EqVal = 0;
+    switch (getBooleanContents(N0.getValueType().isVector())) {
+    case UndefinedBooleanContent:
+    case ZeroOrOneBooleanContent:
+      EqVal = ISD::isTrueWhenEqual(Cond);
+      break;
+    case ZeroOrNegativeOneBooleanContent:
+      EqVal = ISD::isTrueWhenEqual(Cond) ? -1 : 0;
+      break;
+    }
+
+    // We can always fold X == X for integer setcc's.
+    if (N0.getValueType().isInteger()) {
+      return DAG.getConstant(EqVal, VT);
+    }
+    unsigned UOF = ISD::getUnorderedFlavor(Cond);
+    if (UOF == 2)   // FP operators that are undefined on NaNs.
+      return DAG.getConstant(EqVal, VT);
+    if (UOF == unsigned(ISD::isTrueWhenEqual(Cond)))
+      return DAG.getConstant(EqVal, VT);
+    // Otherwise, we can't fold it.  However, we can simplify it to SETUO/SETO
+    // if it is not already.
+    ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO;
+    if (NewCond != Cond && (DCI.isBeforeLegalizeOps() ||
+          getCondCodeAction(NewCond, N0.getSimpleValueType()) == Legal))
+      return DAG.getSetCC(dl, VT, N0, N1, NewCond);
+  }
+
+  if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+      N0.getValueType().isInteger()) {
+    if (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB ||
+        N0.getOpcode() == ISD::XOR) {
+      // Simplify (X+Y) == (X+Z) -->  Y == Z
+      if (N0.getOpcode() == N1.getOpcode()) {
+        if (N0.getOperand(0) == N1.getOperand(0))
+          return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(1), Cond);
+        if (N0.getOperand(1) == N1.getOperand(1))
+          return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(0), Cond);
+        if (DAG.isCommutativeBinOp(N0.getOpcode())) {
+          // If X op Y == Y op X, try other combinations.
+          if (N0.getOperand(0) == N1.getOperand(1))
+            return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(0),
+                                Cond);
+          if (N0.getOperand(1) == N1.getOperand(0))
+            return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(1),
+                                Cond);
+        }
+      }
+
+      // If RHS is a legal immediate value for a compare instruction, we need
+      // to be careful about increasing register pressure needlessly.
+      bool LegalRHSImm = false;
+
+      if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(N1)) {
+        if (ConstantSDNode *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+          // Turn (X+C1) == C2 --> X == C2-C1
+          if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) {
+            return DAG.getSetCC(dl, VT, N0.getOperand(0),
+                                DAG.getConstant(RHSC->getAPIntValue()-
+                                                LHSR->getAPIntValue(),
+                                N0.getValueType()), Cond);
+          }
+
+          // Turn (X^C1) == C2 into X == C1^C2 iff X&~C1 = 0.
+          if (N0.getOpcode() == ISD::XOR)
+            // If we know that all of the inverted bits are zero, don't bother
+            // performing the inversion.
+            if (DAG.MaskedValueIsZero(N0.getOperand(0), ~LHSR->getAPIntValue()))
+              return
+                DAG.getSetCC(dl, VT, N0.getOperand(0),
+                             DAG.getConstant(LHSR->getAPIntValue() ^
+                                               RHSC->getAPIntValue(),
+                                             N0.getValueType()),
+                             Cond);
+        }
+
+        // Turn (C1-X) == C2 --> X == C1-C2
+        if (ConstantSDNode *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+          if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) {
+            return
+              DAG.getSetCC(dl, VT, N0.getOperand(1),
+                           DAG.getConstant(SUBC->getAPIntValue() -
+                                             RHSC->getAPIntValue(),
+                                           N0.getValueType()),
+                           Cond);
+          }
+        }
+
+        // Could RHSC fold directly into a compare?
+        if (RHSC->getValueType(0).getSizeInBits() <= 64)
+          LegalRHSImm = isLegalICmpImmediate(RHSC->getSExtValue());
+      }
+
+      // Simplify (X+Z) == X -->  Z == 0
+      // Don't do this if X is an immediate that can fold into a cmp
+      // instruction and X+Z has other uses. It could be an induction variable
+      // chain, and the transform would increase register pressure.
+      if (!LegalRHSImm || N0.getNode()->hasOneUse()) {
+        if (N0.getOperand(0) == N1)
+          return DAG.getSetCC(dl, VT, N0.getOperand(1),
+                              DAG.getConstant(0, N0.getValueType()), Cond);
+        if (N0.getOperand(1) == N1) {
+          if (DAG.isCommutativeBinOp(N0.getOpcode()))
+            return DAG.getSetCC(dl, VT, N0.getOperand(0),
+                                DAG.getConstant(0, N0.getValueType()), Cond);
+          if (N0.getNode()->hasOneUse()) {
+            assert(N0.getOpcode() == ISD::SUB && "Unexpected operation!");
+            // (Z-X) == X  --> Z == X<<1
+            SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), N1,
+                       DAG.getConstant(1, getShiftAmountTy(N1.getValueType())));
+            if (!DCI.isCalledByLegalizer())
+              DCI.AddToWorklist(SH.getNode());
+            return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond);
+          }
+        }
+      }
+    }
+
+    if (N1.getOpcode() == ISD::ADD || N1.getOpcode() == ISD::SUB ||
+        N1.getOpcode() == ISD::XOR) {
+      // Simplify  X == (X+Z) -->  Z == 0
+      if (N1.getOperand(0) == N0)
+        return DAG.getSetCC(dl, VT, N1.getOperand(1),
+                        DAG.getConstant(0, N1.getValueType()), Cond);
+      if (N1.getOperand(1) == N0) {
+        if (DAG.isCommutativeBinOp(N1.getOpcode()))
+          return DAG.getSetCC(dl, VT, N1.getOperand(0),
+                          DAG.getConstant(0, N1.getValueType()), Cond);
+        if (N1.getNode()->hasOneUse()) {
+          assert(N1.getOpcode() == ISD::SUB && "Unexpected operation!");
+          // X == (Z-X)  --> X<<1 == Z
+          SDValue SH = DAG.getNode(ISD::SHL, dl, N1.getValueType(), N0,
+                       DAG.getConstant(1, getShiftAmountTy(N0.getValueType())));
+          if (!DCI.isCalledByLegalizer())
+            DCI.AddToWorklist(SH.getNode());
+          return DAG.getSetCC(dl, VT, SH, N1.getOperand(0), Cond);
+        }
+      }
+    }
+
+    // Simplify x&y == y to x&y != 0 if y has exactly one bit set.
+    // Note that where y is variable and is known to have at most
+    // one bit set (for example, if it is z&1) we cannot do this;
+    // the expressions are not equivalent when y==0.
+    if (N0.getOpcode() == ISD::AND)
+      if (N0.getOperand(0) == N1 || N0.getOperand(1) == N1) {
+        if (ValueHasExactlyOneBitSet(N1, DAG)) {
+          Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+          SDValue Zero = DAG.getConstant(0, N1.getValueType());
+          return DAG.getSetCC(dl, VT, N0, Zero, Cond);
+        }
+      }
+    if (N1.getOpcode() == ISD::AND)
+      if (N1.getOperand(0) == N0 || N1.getOperand(1) == N0) {
+        if (ValueHasExactlyOneBitSet(N0, DAG)) {
+          Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+          SDValue Zero = DAG.getConstant(0, N0.getValueType());
+          return DAG.getSetCC(dl, VT, N1, Zero, Cond);
+        }
+      }
+  }
+
+  // Fold away ALL boolean setcc's.
+  SDValue Temp;
+  if (N0.getValueType() == MVT::i1 && foldBooleans) {
+    switch (Cond) {
+    default: llvm_unreachable("Unknown integer setcc!");
+    case ISD::SETEQ:  // X == Y  -> ~(X^Y)
+      Temp = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
+      N0 = DAG.getNOT(dl, Temp, MVT::i1);
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(Temp.getNode());
+      break;
+    case ISD::SETNE:  // X != Y   -->  (X^Y)
+      N0 = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
+      break;
+    case ISD::SETGT:  // X >s Y   -->  X == 0 & Y == 1  -->  ~X & Y
+    case ISD::SETULT: // X <u Y   -->  X == 0 & Y == 1  -->  ~X & Y
+      Temp = DAG.getNOT(dl, N0, MVT::i1);
+      N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N1, Temp);
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(Temp.getNode());
+      break;
+    case ISD::SETLT:  // X <s Y   --> X == 1 & Y == 0  -->  ~Y & X
+    case ISD::SETUGT: // X >u Y   --> X == 1 & Y == 0  -->  ~Y & X
+      Temp = DAG.getNOT(dl, N1, MVT::i1);
+      N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N0, Temp);
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(Temp.getNode());
+      break;
+    case ISD::SETULE: // X <=u Y  --> X == 0 | Y == 1  -->  ~X | Y
+    case ISD::SETGE:  // X >=s Y  --> X == 0 | Y == 1  -->  ~X | Y
+      Temp = DAG.getNOT(dl, N0, MVT::i1);
+      N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N1, Temp);
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(Temp.getNode());
+      break;
+    case ISD::SETUGE: // X >=u Y  --> X == 1 | Y == 0  -->  ~Y | X
+    case ISD::SETLE:  // X <=s Y  --> X == 1 | Y == 0  -->  ~Y | X
+      Temp = DAG.getNOT(dl, N1, MVT::i1);
+      N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N0, Temp);
+      break;
+    }
+    if (VT != MVT::i1) {
+      if (!DCI.isCalledByLegalizer())
+        DCI.AddToWorklist(N0.getNode());
+      // FIXME: If running after legalize, we probably can't do this.
+      N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0);
+    }
+    return N0;
+  }
+
+  // Could not fold it.
+  return SDValue();
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA,
+                                    int64_t &Offset) const {
+  if (isa<GlobalAddressSDNode>(N)) {
+    GlobalAddressSDNode *GASD = cast<GlobalAddressSDNode>(N);
+    GA = GASD->getGlobal();
+    Offset += GASD->getOffset();
+    return true;
+  }
+
+  if (N->getOpcode() == ISD::ADD) {
+    SDValue N1 = N->getOperand(0);
+    SDValue N2 = N->getOperand(1);
+    if (isGAPlusOffset(N1.getNode(), GA, Offset)) {
+      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2);
+      if (V) {
+        Offset += V->getSExtValue();
+        return true;
+      }
+    } else if (isGAPlusOffset(N2.getNode(), GA, Offset)) {
+      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1);
+      if (V) {
+        Offset += V->getSExtValue();
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+
+SDValue TargetLowering::
+PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+  // Default implementation: no optimization.
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Inline Assembler Implementation Methods
+//===----------------------------------------------------------------------===//
+
+
+TargetLowering::ConstraintType
+TargetLowering::getConstraintType(const std::string &Constraint) const {
+  unsigned S = Constraint.size();
+
+  if (S == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'r': return C_RegisterClass;
+    case 'm':    // memory
+    case 'o':    // offsetable
+    case 'V':    // not offsetable
+      return C_Memory;
+    case 'i':    // Simple Integer or Relocatable Constant
+    case 'n':    // Simple Integer
+    case 'E':    // Floating Point Constant
+    case 'F':    // Floating Point Constant
+    case 's':    // Relocatable Constant
+    case 'p':    // Address.
+    case 'X':    // Allow ANY value.
+    case 'I':    // Target registers.
+    case 'J':
+    case 'K':
+    case 'L':
+    case 'M':
+    case 'N':
+    case 'O':
+    case 'P':
+    case '<':
+    case '>':
+      return C_Other;
+    }
+  }
+
+  if (S > 1 && Constraint[0] == '{' && Constraint[S-1] == '}') {
+    if (S == 8 && !Constraint.compare(1, 6, "memory", 6))  // "{memory}"
+      return C_Memory;
+    return C_Register;
+  }
+  return C_Unknown;
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{
+  if (ConstraintVT.isInteger())
+    return "r";
+  if (ConstraintVT.isFloatingPoint())
+    return "f";      // works for many targets
+  return 0;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                  std::string &Constraint,
+                                                  std::vector<SDValue> &Ops,
+                                                  SelectionDAG &DAG) const {
+
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'X':     // Allows any operand; labels (basic block) use this.
+    if (Op.getOpcode() == ISD::BasicBlock) {
+      Ops.push_back(Op);
+      return;
+    }
+    // fall through
+  case 'i':    // Simple Integer or Relocatable Constant
+  case 'n':    // Simple Integer
+  case 's': {  // Relocatable Constant
+    // These operands are interested in values of the form (GV+C), where C may
+    // be folded in as an offset of GV, or it may be explicitly added.  Also, it
+    // is possible and fine if either GV or C are missing.
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+
+    // If we have "(add GV, C)", pull out GV/C
+    if (Op.getOpcode() == ISD::ADD) {
+      C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+      GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
+      if (C == 0 || GA == 0) {
+        C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(1));
+      }
+      if (C == 0 || GA == 0)
+        C = 0, GA = 0;
+    }
+
+    // If we find a valid operand, map to the TargetXXX version so that the
+    // value itself doesn't get selected.
+    if (GA) {   // Either &GV   or   &GV+C
+      if (ConstraintLetter != 'n') {
+        int64_t Offs = GA->getOffset();
+        if (C) Offs += C->getZExtValue();
+        Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(),
+                                                 C ? C->getDebugLoc() : DebugLoc(),
+                                                 Op.getValueType(), Offs));
+        return;
+      }
+    }
+    if (C) {   // just C, no GV.
+      // Simple constants are not allowed for 's'.
+      if (ConstraintLetter != 's') {
+        // gcc prints these as sign extended.  Sign extend value to 64 bits
+        // now; without this it would get ZExt'd later in
+        // ScheduleDAGSDNodes::EmitNode, which is very generic.
+        Ops.push_back(DAG.getTargetConstant(C->getAPIntValue().getSExtValue(),
+                                            MVT::i64));
+        return;
+      }
+    }
+    break;
+  }
+  }
+}
+
+std::pair<unsigned, const TargetRegisterClass*> TargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+                             EVT VT) const {
+  if (Constraint[0] != '{')
+    return std::make_pair(0u, static_cast<TargetRegisterClass*>(0));
+  assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?");
+
+  // Remove the braces from around the name.
+  StringRef RegName(Constraint.data()+1, Constraint.size()-2);
+
+  std::pair<unsigned, const TargetRegisterClass*> R =
+    std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
+
+  // Figure out which register class contains this reg.
+  const TargetRegisterInfo *RI = getTargetMachine().getRegisterInfo();
+  for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(),
+       E = RI->regclass_end(); RCI != E; ++RCI) {
+    const TargetRegisterClass *RC = *RCI;
+
+    // If none of the value types for this register class are valid, we
+    // can't use it.  For example, 64-bit reg classes on 32-bit targets.
+    if (!isLegalRC(RC))
+      continue;
+
+    for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+         I != E; ++I) {
+      if (RegName.equals_lower(RI->getName(*I))) {
+        std::pair<unsigned, const TargetRegisterClass*> S =
+          std::make_pair(*I, RC);
+
+        // If this register class has the requested value type, return it,
+        // otherwise keep searching and return the first class found
+        // if no other is found which explicitly has the requested type.
+        if (RC->hasType(VT))
+          return S;
+        else if (!R.second)
+          R = S;
+      }
+    }
+  }
+
+  return R;
+}
+
+//===----------------------------------------------------------------------===//
+// Constraint Selection.
+
+/// isMatchingInputConstraint - Return true of this is an input operand that is
+/// a matching constraint like "4".
+bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const {
+  assert(!ConstraintCode.empty() && "No known constraint!");
+  return isdigit(static_cast<unsigned char>(ConstraintCode[0]));
+}
+
+/// getMatchedOperand - If this is an input matching constraint, this method
+/// returns the output operand it matches.
+unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
+  assert(!ConstraintCode.empty() && "No known constraint!");
+  return atoi(ConstraintCode.c_str());
+}
+
+
+/// ParseConstraints - Split up the constraint string from the inline
+/// assembly value into the specific constraints and their prefixes,
+/// and also tie in the associated operand values.
+/// If this returns an empty vector, and if the constraint string itself
+/// isn't empty, there was an error parsing.
+TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
+    ImmutableCallSite CS) const {
+  /// ConstraintOperands - Information about all of the constraints.
+  AsmOperandInfoVector ConstraintOperands;
+  const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+  unsigned maCount = 0; // Largest number of multiple alternative constraints.
+
+  // Do a prepass over the constraints, canonicalizing them, and building up the
+  // ConstraintOperands list.
+  InlineAsm::ConstraintInfoVector
+    ConstraintInfos = IA->ParseConstraints();
+
+  unsigned ArgNo = 0;   // ArgNo - The argument of the CallInst.
+  unsigned ResNo = 0;   // ResNo - The result number of the next output.
+
+  for (unsigned i = 0, e = ConstraintInfos.size(); i != e; ++i) {
+    ConstraintOperands.push_back(AsmOperandInfo(ConstraintInfos[i]));
+    AsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+    // Update multiple alternative constraint count.
+    if (OpInfo.multipleAlternatives.size() > maCount)
+      maCount = OpInfo.multipleAlternatives.size();
+
+    OpInfo.ConstraintVT = MVT::Other;
+
+    // Compute the value type for each operand.
+    switch (OpInfo.Type) {
+    case InlineAsm::isOutput:
+      // Indirect outputs just consume an argument.
+      if (OpInfo.isIndirect) {
+        OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+        break;
+      }
+
+      // The return value of the call is this value.  As such, there is no
+      // corresponding argument.
+      assert(!CS.getType()->isVoidTy() &&
+             "Bad inline asm!");
+      if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
+        OpInfo.ConstraintVT = getSimpleValueType(STy->getElementType(ResNo));
+      } else {
+        assert(ResNo == 0 && "Asm only has one result!");
+        OpInfo.ConstraintVT = getSimpleValueType(CS.getType());
+      }
+      ++ResNo;
+      break;
+    case InlineAsm::isInput:
+      OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+      break;
+    case InlineAsm::isClobber:
+      // Nothing to do.
+      break;
+    }
+
+    if (OpInfo.CallOperandVal) {
+      llvm::Type *OpTy = OpInfo.CallOperandVal->getType();
+      if (OpInfo.isIndirect) {
+        llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
+        if (!PtrTy)
+          report_fatal_error("Indirect operand for inline asm not a pointer!");
+        OpTy = PtrTy->getElementType();
+      }
+
+      // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
+      if (StructType *STy = dyn_cast<StructType>(OpTy))
+        if (STy->getNumElements() == 1)
+          OpTy = STy->getElementType(0);
+
+      // If OpTy is not a single value, it may be a struct/union that we
+      // can tile with integers.
+      if (!OpTy->isSingleValueType() && OpTy->isSized()) {
+        unsigned BitSize = getDataLayout()->getTypeSizeInBits(OpTy);
+        switch (BitSize) {
+        default: break;
+        case 1:
+        case 8:
+        case 16:
+        case 32:
+        case 64:
+        case 128:
+          OpInfo.ConstraintVT =
+            MVT::getVT(IntegerType::get(OpTy->getContext(), BitSize), true);
+          break;
+        }
+      } else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) {
+        OpInfo.ConstraintVT = MVT::getIntegerVT(
+            8*getDataLayout()->getPointerSize(PT->getAddressSpace()));
+      } else {
+        OpInfo.ConstraintVT = MVT::getVT(OpTy, true);
+      }
+    }
+  }
+
+  // If we have multiple alternative constraints, select the best alternative.
+  if (ConstraintInfos.size()) {
+    if (maCount) {
+      unsigned bestMAIndex = 0;
+      int bestWeight = -1;
+      // weight:  -1 = invalid match, and 0 = so-so match to 5 = good match.
+      int weight = -1;
+      unsigned maIndex;
+      // Compute the sums of the weights for each alternative, keeping track
+      // of the best (highest weight) one so far.
+      for (maIndex = 0; maIndex < maCount; ++maIndex) {
+        int weightSum = 0;
+        for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
+            cIndex != eIndex; ++cIndex) {
+          AsmOperandInfo& OpInfo = ConstraintOperands[cIndex];
+          if (OpInfo.Type == InlineAsm::isClobber)
+            continue;
+
+          // If this is an output operand with a matching input operand,
+          // look up the matching input. If their types mismatch, e.g. one
+          // is an integer, the other is floating point, or their sizes are
+          // different, flag it as an maCantMatch.
+          if (OpInfo.hasMatchingInput()) {
+            AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+            if (OpInfo.ConstraintVT != Input.ConstraintVT) {
+              if ((OpInfo.ConstraintVT.isInteger() !=
+                   Input.ConstraintVT.isInteger()) ||
+                  (OpInfo.ConstraintVT.getSizeInBits() !=
+                   Input.ConstraintVT.getSizeInBits())) {
+                weightSum = -1;  // Can't match.
+                break;
+              }
+            }
+          }
+          weight = getMultipleConstraintMatchWeight(OpInfo, maIndex);
+          if (weight == -1) {
+            weightSum = -1;
+            break;
+          }
+          weightSum += weight;
+        }
+        // Update best.
+        if (weightSum > bestWeight) {
+          bestWeight = weightSum;
+          bestMAIndex = maIndex;
+        }
+      }
+
+      // Now select chosen alternative in each constraint.
+      for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
+          cIndex != eIndex; ++cIndex) {
+        AsmOperandInfo& cInfo = ConstraintOperands[cIndex];
+        if (cInfo.Type == InlineAsm::isClobber)
+          continue;
+        cInfo.selectAlternative(bestMAIndex);
+      }
+    }
+  }
+
+  // Check and hook up tied operands, choose constraint code to use.
+  for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
+      cIndex != eIndex; ++cIndex) {
+    AsmOperandInfo& OpInfo = ConstraintOperands[cIndex];
+
+    // If this is an output operand with a matching input operand, look up the
+    // matching input. If their types mismatch, e.g. one is an integer, the
+    // other is floating point, or their sizes are different, flag it as an
+    // error.
+    if (OpInfo.hasMatchingInput()) {
+      AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+
+      if (OpInfo.ConstraintVT != Input.ConstraintVT) {
+        std::pair<unsigned, const TargetRegisterClass*> MatchRC =
+          getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+                                       OpInfo.ConstraintVT);
+        std::pair<unsigned, const TargetRegisterClass*> InputRC =
+          getRegForInlineAsmConstraint(Input.ConstraintCode,
+                                       Input.ConstraintVT);
+        if ((OpInfo.ConstraintVT.isInteger() !=
+             Input.ConstraintVT.isInteger()) ||
+            (MatchRC.second != InputRC.second)) {
+          report_fatal_error("Unsupported asm: input constraint"
+                             " with a matching output constraint of"
+                             " incompatible type!");
+        }
+      }
+
+    }
+  }
+
+  return ConstraintOperands;
+}
+
+
+/// getConstraintGenerality - Return an integer indicating how general CT
+/// is.
+static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
+  switch (CT) {
+  case TargetLowering::C_Other:
+  case TargetLowering::C_Unknown:
+    return 0;
+  case TargetLowering::C_Register:
+    return 1;
+  case TargetLowering::C_RegisterClass:
+    return 2;
+  case TargetLowering::C_Memory:
+    return 3;
+  }
+  llvm_unreachable("Invalid constraint type");
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+  TargetLowering::getMultipleConstraintMatchWeight(
+    AsmOperandInfo &info, int maIndex) const {
+  InlineAsm::ConstraintCodeVector *rCodes;
+  if (maIndex >= (int)info.multipleAlternatives.size())
+    rCodes = &info.Codes;
+  else
+    rCodes = &info.multipleAlternatives[maIndex].Codes;
+  ConstraintWeight BestWeight = CW_Invalid;
+
+  // Loop over the options, keeping track of the most general one.
+  for (unsigned i = 0, e = rCodes->size(); i != e; ++i) {
+    ConstraintWeight weight =
+      getSingleConstraintMatchWeight(info, (*rCodes)[i].c_str());
+    if (weight > BestWeight)
+      BestWeight = weight;
+  }
+
+  return BestWeight;
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+  TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  // Look at the constraint type.
+  switch (*constraint) {
+    case 'i': // immediate integer.
+    case 'n': // immediate integer with a known value.
+      if (isa<ConstantInt>(CallOperandVal))
+        weight = CW_Constant;
+      break;
+    case 's': // non-explicit intregal immediate.
+      if (isa<GlobalValue>(CallOperandVal))
+        weight = CW_Constant;
+      break;
+    case 'E': // immediate float if host format.
+    case 'F': // immediate float.
+      if (isa<ConstantFP>(CallOperandVal))
+        weight = CW_Constant;
+      break;
+    case '<': // memory operand with autodecrement.
+    case '>': // memory operand with autoincrement.
+    case 'm': // memory operand.
+    case 'o': // offsettable memory operand
+    case 'V': // non-offsettable memory operand
+      weight = CW_Memory;
+      break;
+    case 'r': // general register.
+    case 'g': // general register, memory operand or immediate integer.
+              // note: Clang converts "g" to "imr".
+      if (CallOperandVal->getType()->isIntegerTy())
+        weight = CW_Register;
+      break;
+    case 'X': // any operand.
+    default:
+      weight = CW_Default;
+      break;
+  }
+  return weight;
+}
+
+/// ChooseConstraint - If there are multiple different constraints that we
+/// could pick for this operand (e.g. "imr") try to pick the 'best' one.
+/// This is somewhat tricky: constraints fall into four classes:
+///    Other         -> immediates and magic values
+///    Register      -> one specific register
+///    RegisterClass -> a group of regs
+///    Memory        -> memory
+/// Ideally, we would pick the most specific constraint possible: if we have
+/// something that fits into a register, we would pick it.  The problem here
+/// is that if we have something that could either be in a register or in
+/// memory that use of the register could cause selection of *other*
+/// operands to fail: they might only succeed if we pick memory.  Because of
+/// this the heuristic we use is:
+///
+///  1) If there is an 'other' constraint, and if the operand is valid for
+///     that constraint, use it.  This makes us take advantage of 'i'
+///     constraints when available.
+///  2) Otherwise, pick the most general constraint present.  This prefers
+///     'm' over 'r', for example.
+///
+static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
+                             const TargetLowering &TLI,
+                             SDValue Op, SelectionDAG *DAG) {
+  assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options");
+  unsigned BestIdx = 0;
+  TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown;
+  int BestGenerality = -1;
+
+  // Loop over the options, keeping track of the most general one.
+  for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) {
+    TargetLowering::ConstraintType CType =
+      TLI.getConstraintType(OpInfo.Codes[i]);
+
+    // If this is an 'other' constraint, see if the operand is valid for it.
+    // For example, on X86 we might have an 'rI' constraint.  If the operand
+    // is an integer in the range [0..31] we want to use I (saving a load
+    // of a register), otherwise we must use 'r'.
+    if (CType == TargetLowering::C_Other && Op.getNode()) {
+      assert(OpInfo.Codes[i].size() == 1 &&
+             "Unhandled multi-letter 'other' constraint");
+      std::vector<SDValue> ResultOps;
+      TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i],
+                                       ResultOps, *DAG);
+      if (!ResultOps.empty()) {
+        BestType = CType;
+        BestIdx = i;
+        break;
+      }
+    }
+
+    // Things with matching constraints can only be registers, per gcc
+    // documentation.  This mainly affects "g" constraints.
+    if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput())
+      continue;
+
+    // This constraint letter is more general than the previous one, use it.
+    int Generality = getConstraintGenerality(CType);
+    if (Generality > BestGenerality) {
+      BestType = CType;
+      BestIdx = i;
+      BestGenerality = Generality;
+    }
+  }
+
+  OpInfo.ConstraintCode = OpInfo.Codes[BestIdx];
+  OpInfo.ConstraintType = BestType;
+}
+
+/// ComputeConstraintToUse - Determines the constraint code and constraint
+/// type to use for the specific AsmOperandInfo, setting
+/// OpInfo.ConstraintCode and OpInfo.ConstraintType.
+void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
+                                            SDValue Op,
+                                            SelectionDAG *DAG) const {
+  assert(!OpInfo.Codes.empty() && "Must have at least one constraint");
+
+  // Single-letter constraints ('r') are very common.
+  if (OpInfo.Codes.size() == 1) {
+    OpInfo.ConstraintCode = OpInfo.Codes[0];
+    OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+  } else {
+    ChooseConstraint(OpInfo, *this, Op, DAG);
+  }
+
+  // 'X' matches anything.
+  if (OpInfo.ConstraintCode == "X" && OpInfo.CallOperandVal) {
+    // Labels and constants are handled elsewhere ('X' is the only thing
+    // that matches labels).  For Functions, the type here is the type of
+    // the result, which is not what we want to look at; leave them alone.
+    Value *v = OpInfo.CallOperandVal;
+    if (isa<BasicBlock>(v) || isa<ConstantInt>(v) || isa<Function>(v)) {
+      OpInfo.CallOperandVal = v;
+      return;
+    }
+
+    // Otherwise, try to resolve it to something we know about by looking at
+    // the actual operand type.
+    if (const char *Repl = LowerXConstraint(OpInfo.ConstraintVT)) {
+      OpInfo.ConstraintCode = Repl;
+      OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+    }
+  }
+}
+
+/// BuildExactDiv - Given an exact SDIV by a constant, create a multiplication
+/// with the multiplicative inverse of the constant.
+SDValue TargetLowering::BuildExactSDIV(SDValue Op1, SDValue Op2, DebugLoc dl,
+                                       SelectionDAG &DAG) const {
+  ConstantSDNode *C = cast<ConstantSDNode>(Op2);
+  APInt d = C->getAPIntValue();
+  assert(d != 0 && "Division by zero!");
+
+  // Shift the value upfront if it is even, so the LSB is one.
+  unsigned ShAmt = d.countTrailingZeros();
+  if (ShAmt) {
+    // TODO: For UDIV use SRL instead of SRA.
+    SDValue Amt = DAG.getConstant(ShAmt, getShiftAmountTy(Op1.getValueType()));
+    Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt);
+    d = d.ashr(ShAmt);
+  }
+
+  // Calculate the multiplicative inverse, using Newton's method.
+  APInt t, xn = d;
+  while ((t = d*xn) != 1)
+    xn *= APInt(d.getBitWidth(), 2) - t;
+
+  Op2 = DAG.getConstant(xn, Op1.getValueType());
+  return DAG.getNode(ISD::MUL, dl, Op1.getValueType(), Op1, Op2);
+}
+
+/// BuildSDIVSequence - Given an ISD::SDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.  See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+SDValue TargetLowering::
+BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
+          std::vector<SDNode*> *Created) const {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl= N->getDebugLoc();
+
+  // Check to see if we can do this.
+  // FIXME: We should be more aggressive here.
+  if (!isTypeLegal(VT))
+    return SDValue();
+
+  APInt d = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+  APInt::ms magics = d.magic();
+
+  // Multiply the numerator (operand 0) by the magic value
+  // FIXME: We should support doing a MUL in a wider type
+  SDValue Q;
+  if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT) :
+                            isOperationLegalOrCustom(ISD::MULHS, VT))
+    Q = DAG.getNode(ISD::MULHS, dl, VT, N->getOperand(0),
+                    DAG.getConstant(magics.m, VT));
+  else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT) :
+                                 isOperationLegalOrCustom(ISD::SMUL_LOHI, VT))
+    Q = SDValue(DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT),
+                              N->getOperand(0),
+                              DAG.getConstant(magics.m, VT)).getNode(), 1);
+  else
+    return SDValue();       // No mulhs or equvialent
+  // If d > 0 and m < 0, add the numerator
+  if (d.isStrictlyPositive() && magics.m.isNegative()) {
+    Q = DAG.getNode(ISD::ADD, dl, VT, Q, N->getOperand(0));
+    if (Created)
+      Created->push_back(Q.getNode());
+  }
+  // If d < 0 and m > 0, subtract the numerator.
+  if (d.isNegative() && magics.m.isStrictlyPositive()) {
+    Q = DAG.getNode(ISD::SUB, dl, VT, Q, N->getOperand(0));
+    if (Created)
+      Created->push_back(Q.getNode());
+  }
+  // Shift right algebraic if shift value is nonzero
+  if (magics.s > 0) {
+    Q = DAG.getNode(ISD::SRA, dl, VT, Q,
+                 DAG.getConstant(magics.s, getShiftAmountTy(Q.getValueType())));
+    if (Created)
+      Created->push_back(Q.getNode());
+  }
+  // Extract the sign bit and add it to the quotient
+  SDValue T =
+    DAG.getNode(ISD::SRL, dl, VT, Q, DAG.getConstant(VT.getSizeInBits()-1,
+                                           getShiftAmountTy(Q.getValueType())));
+  if (Created)
+    Created->push_back(T.getNode());
+  return DAG.getNode(ISD::ADD, dl, VT, Q, T);
+}
+
+/// BuildUDIVSequence - Given an ISD::UDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.  See:
+/// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
+SDValue TargetLowering::
+BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
+          std::vector<SDNode*> *Created) const {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // Check to see if we can do this.
+  // FIXME: We should be more aggressive here.
+  if (!isTypeLegal(VT))
+    return SDValue();
+
+  // FIXME: We should use a narrower constant when the upper
+  // bits are known to be zero.
+  const APInt &N1C = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+  APInt::mu magics = N1C.magicu();
+
+  SDValue Q = N->getOperand(0);
+
+  // If the divisor is even, we can avoid using the expensive fixup by shifting
+  // the divided value upfront.
+  if (magics.a != 0 && !N1C[0]) {
+    unsigned Shift = N1C.countTrailingZeros();
+    Q = DAG.getNode(ISD::SRL, dl, VT, Q,
+                    DAG.getConstant(Shift, getShiftAmountTy(Q.getValueType())));
+    if (Created)
+      Created->push_back(Q.getNode());
+
+    // Get magic number for the shifted divisor.
+    magics = N1C.lshr(Shift).magicu(Shift);
+    assert(magics.a == 0 && "Should use cheap fixup now");
+  }
+
+  // Multiply the numerator (operand 0) by the magic value
+  // FIXME: We should support doing a MUL in a wider type
+  if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT) :
+                            isOperationLegalOrCustom(ISD::MULHU, VT))
+    Q = DAG.getNode(ISD::MULHU, dl, VT, Q, DAG.getConstant(magics.m, VT));
+  else if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT) :
+                                 isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
+    Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), Q,
+                            DAG.getConstant(magics.m, VT)).getNode(), 1);
+  else
+    return SDValue();       // No mulhu or equvialent
+  if (Created)
+    Created->push_back(Q.getNode());
+
+  if (magics.a == 0) {
+    assert(magics.s < N1C.getBitWidth() &&
+           "We shouldn't generate an undefined shift!");
+    return DAG.getNode(ISD::SRL, dl, VT, Q,
+                 DAG.getConstant(magics.s, getShiftAmountTy(Q.getValueType())));
+  } else {
+    SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
+    if (Created)
+      Created->push_back(NPQ.getNode());
+    NPQ = DAG.getNode(ISD::SRL, dl, VT, NPQ,
+                      DAG.getConstant(1, getShiftAmountTy(NPQ.getValueType())));
+    if (Created)
+      Created->push_back(NPQ.getNode());
+    NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
+    if (Created)
+      Created->push_back(NPQ.getNode());
+    return DAG.getNode(ISD::SRL, dl, VT, NPQ,
+             DAG.getConstant(magics.s-1, getShiftAmountTy(NPQ.getValueType())));
+  }
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..f769b44efbb3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- TargetSelectionDAGInfo.cpp - SelectionDAG Info --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+TargetSelectionDAGInfo::TargetSelectionDAGInfo(const TargetMachine &TM)
+  : TD(TM.getDataLayout()) {
+}
+
+TargetSelectionDAGInfo::~TargetSelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/CodeGen/ShadowStackGC.cpp b/contrib/llvm/lib/CodeGen/ShadowStackGC.cpp
new file mode 100644
index 000000000000..10f64c709c7a
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ShadowStackGC.cpp
@@ -0,0 +1,452 @@
+//===-- ShadowStackGC.cpp - GC support for uncooperative targets ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements lowering for the llvm.gc* intrinsics for targets that do
+// not natively support them (which includes the C backend). Note that the code
+// generated is not quite as efficient as algorithms which generate stack maps
+// to identify roots.
+//
+// This pass implements the code transformation described in this paper:
+//   "Accurate Garbage Collection in an Uncooperative Environment"
+//   Fergus Henderson, ISMM, 2002
+//
+// In runtime/GC/SemiSpace.cpp is a prototype runtime which is compatible with
+// ShadowStackGC.
+//
+// In order to support this particular transformation, all stack roots are
+// coallocated in the stack. This allows a fully target-independent stack map
+// while introducing only minor runtime overhead.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "shadowstackgc"
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CallSite.h"
+
+using namespace llvm;
+
+namespace {
+
+  class ShadowStackGC : public GCStrategy {
+    /// RootChain - This is the global linked-list that contains the chain of GC
+    /// roots.
+    GlobalVariable *Head;
+
+    /// StackEntryTy - Abstract type of a link in the shadow stack.
+    ///
+    StructType *StackEntryTy;
+    StructType *FrameMapTy;
+
+    /// Roots - GC roots in the current function. Each is a pair of the
+    /// intrinsic call and its corresponding alloca.
+    std::vector<std::pair<CallInst*,AllocaInst*> > Roots;
+
+  public:
+    ShadowStackGC();
+
+    bool initializeCustomLowering(Module &M);
+    bool performCustomLowering(Function &F);
+
+  private:
+    bool IsNullValue(Value *V);
+    Constant *GetFrameMap(Function &F);
+    Type* GetConcreteStackEntryType(Function &F);
+    void CollectRoots(Function &F);
+    static GetElementPtrInst *CreateGEP(LLVMContext &Context, 
+                                        IRBuilder<> &B, Value *BasePtr,
+                                        int Idx1, const char *Name);
+    static GetElementPtrInst *CreateGEP(LLVMContext &Context,
+                                        IRBuilder<> &B, Value *BasePtr,
+                                        int Idx1, int Idx2, const char *Name);
+  };
+
+}
+
+static GCRegistry::Add<ShadowStackGC>
+X("shadow-stack", "Very portable GC for uncooperative code generators");
+
+namespace {
+  /// EscapeEnumerator - This is a little algorithm to find all escape points
+  /// from a function so that "finally"-style code can be inserted. In addition
+  /// to finding the existing return and unwind instructions, it also (if
+  /// necessary) transforms any call instructions into invokes and sends them to
+  /// a landing pad.
+  ///
+  /// It's wrapped up in a state machine using the same transform C# uses for
+  /// 'yield return' enumerators, This transform allows it to be non-allocating.
+  class EscapeEnumerator {
+    Function &F;
+    const char *CleanupBBName;
+
+    // State.
+    int State;
+    Function::iterator StateBB, StateE;
+    IRBuilder<> Builder;
+
+  public:
+    EscapeEnumerator(Function &F, const char *N = "cleanup")
+      : F(F), CleanupBBName(N), State(0), Builder(F.getContext()) {}
+
+    IRBuilder<> *Next() {
+      switch (State) {
+      default:
+        return 0;
+
+      case 0:
+        StateBB = F.begin();
+        StateE = F.end();
+        State = 1;
+
+      case 1:
+        // Find all 'return', 'resume', and 'unwind' instructions.
+        while (StateBB != StateE) {
+          BasicBlock *CurBB = StateBB++;
+
+          // Branches and invokes do not escape, only unwind, resume, and return
+          // do.
+          TerminatorInst *TI = CurBB->getTerminator();
+          if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
+            continue;
+
+          Builder.SetInsertPoint(TI->getParent(), TI);
+          return &Builder;
+        }
+
+        State = 2;
+
+        // Find all 'call' instructions.
+        SmallVector<Instruction*,16> Calls;
+        for (Function::iterator BB = F.begin(),
+                                E = F.end(); BB != E; ++BB)
+          for (BasicBlock::iterator II = BB->begin(),
+                                    EE = BB->end(); II != EE; ++II)
+            if (CallInst *CI = dyn_cast<CallInst>(II))
+              if (!CI->getCalledFunction() ||
+                  !CI->getCalledFunction()->getIntrinsicID())
+                Calls.push_back(CI);
+
+        if (Calls.empty())
+          return 0;
+
+        // Create a cleanup block.
+        LLVMContext &C = F.getContext();
+        BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
+        Type *ExnTy = StructType::get(Type::getInt8PtrTy(C),
+                                      Type::getInt32Ty(C), NULL);
+        Constant *PersFn =
+          F.getParent()->
+          getOrInsertFunction("__gcc_personality_v0",
+                              FunctionType::get(Type::getInt32Ty(C), true));
+        LandingPadInst *LPad = LandingPadInst::Create(ExnTy, PersFn, 1,
+                                                      "cleanup.lpad",
+                                                      CleanupBB);
+        LPad->setCleanup(true);
+        ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
+
+        // Transform the 'call' instructions into 'invoke's branching to the
+        // cleanup block. Go in reverse order to make prettier BB names.
+        SmallVector<Value*,16> Args;
+        for (unsigned I = Calls.size(); I != 0; ) {
+          CallInst *CI = cast<CallInst>(Calls[--I]);
+
+          // Split the basic block containing the function call.
+          BasicBlock *CallBB = CI->getParent();
+          BasicBlock *NewBB =
+            CallBB->splitBasicBlock(CI, CallBB->getName() + ".cont");
+
+          // Remove the unconditional branch inserted at the end of CallBB.
+          CallBB->getInstList().pop_back();
+          NewBB->getInstList().remove(CI);
+
+          // Create a new invoke instruction.
+          Args.clear();
+          CallSite CS(CI);
+          Args.append(CS.arg_begin(), CS.arg_end());
+
+          InvokeInst *II = InvokeInst::Create(CI->getCalledValue(),
+                                              NewBB, CleanupBB,
+                                              Args, CI->getName(), CallBB);
+          II->setCallingConv(CI->getCallingConv());
+          II->setAttributes(CI->getAttributes());
+          CI->replaceAllUsesWith(II);
+          delete CI;
+        }
+
+        Builder.SetInsertPoint(RI->getParent(), RI);
+        return &Builder;
+      }
+    }
+  };
+}
+
+// -----------------------------------------------------------------------------
+
+void llvm::linkShadowStackGC() { }
+
+ShadowStackGC::ShadowStackGC() : Head(0), StackEntryTy(0) {
+  InitRoots = true;
+  CustomRoots = true;
+}
+
+Constant *ShadowStackGC::GetFrameMap(Function &F) {
+  // doInitialization creates the abstract type of this value.
+  Type *VoidPtr = Type::getInt8PtrTy(F.getContext());
+
+  // Truncate the ShadowStackDescriptor if some metadata is null.
+  unsigned NumMeta = 0;
+  SmallVector<Constant*, 16> Metadata;
+  for (unsigned I = 0; I != Roots.size(); ++I) {
+    Constant *C = cast<Constant>(Roots[I].first->getArgOperand(1));
+    if (!C->isNullValue())
+      NumMeta = I + 1;
+    Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr));
+  }
+  Metadata.resize(NumMeta);
+
+  Type *Int32Ty = Type::getInt32Ty(F.getContext());
+  
+  Constant *BaseElts[] = {
+    ConstantInt::get(Int32Ty, Roots.size(), false),
+    ConstantInt::get(Int32Ty, NumMeta, false),
+  };
+
+  Constant *DescriptorElts[] = {
+    ConstantStruct::get(FrameMapTy, BaseElts),
+    ConstantArray::get(ArrayType::get(VoidPtr, NumMeta), Metadata)
+  };
+
+  Type *EltTys[] = { DescriptorElts[0]->getType(),DescriptorElts[1]->getType()};
+  StructType *STy = StructType::create(EltTys, "gc_map."+utostr(NumMeta));
+  
+  Constant *FrameMap = ConstantStruct::get(STy, DescriptorElts);
+
+  // FIXME: Is this actually dangerous as WritingAnLLVMPass.html claims? Seems
+  //        that, short of multithreaded LLVM, it should be safe; all that is
+  //        necessary is that a simple Module::iterator loop not be invalidated.
+  //        Appending to the GlobalVariable list is safe in that sense.
+  //
+  //        All of the output passes emit globals last. The ExecutionEngine
+  //        explicitly supports adding globals to the module after
+  //        initialization.
+  //
+  //        Still, if it isn't deemed acceptable, then this transformation needs
+  //        to be a ModulePass (which means it cannot be in the 'llc' pipeline
+  //        (which uses a FunctionPassManager (which segfaults (not asserts) if
+  //        provided a ModulePass))).
+  Constant *GV = new GlobalVariable(*F.getParent(), FrameMap->getType(), true,
+                                    GlobalVariable::InternalLinkage,
+                                    FrameMap, "__gc_" + F.getName());
+
+  Constant *GEPIndices[2] = {
+                          ConstantInt::get(Type::getInt32Ty(F.getContext()), 0),
+                          ConstantInt::get(Type::getInt32Ty(F.getContext()), 0)
+                          };
+  return ConstantExpr::getGetElementPtr(GV, GEPIndices);
+}
+
+Type* ShadowStackGC::GetConcreteStackEntryType(Function &F) {
+  // doInitialization creates the generic version of this type.
+  std::vector<Type*> EltTys;
+  EltTys.push_back(StackEntryTy);
+  for (size_t I = 0; I != Roots.size(); I++)
+    EltTys.push_back(Roots[I].second->getAllocatedType());
+  
+  return StructType::create(EltTys, "gc_stackentry."+F.getName().str());
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now. If
+/// not, exit fast.
+bool ShadowStackGC::initializeCustomLowering(Module &M) {
+  // struct FrameMap {
+  //   int32_t NumRoots; // Number of roots in stack frame.
+  //   int32_t NumMeta;  // Number of metadata descriptors. May be < NumRoots.
+  //   void *Meta[];     // May be absent for roots without metadata.
+  // };
+  std::vector<Type*> EltTys;
+  // 32 bits is ok up to a 32GB stack frame. :)
+  EltTys.push_back(Type::getInt32Ty(M.getContext()));
+  // Specifies length of variable length array. 
+  EltTys.push_back(Type::getInt32Ty(M.getContext()));
+  FrameMapTy = StructType::create(EltTys, "gc_map");
+  PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy);
+
+  // struct StackEntry {
+  //   ShadowStackEntry *Next; // Caller's stack entry.
+  //   FrameMap *Map;          // Pointer to constant FrameMap.
+  //   void *Roots[];          // Stack roots (in-place array, so we pretend).
+  // };
+  
+  StackEntryTy = StructType::create(M.getContext(), "gc_stackentry");
+  
+  EltTys.clear();
+  EltTys.push_back(PointerType::getUnqual(StackEntryTy));
+  EltTys.push_back(FrameMapPtrTy);
+  StackEntryTy->setBody(EltTys);
+  PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
+
+  // Get the root chain if it already exists.
+  Head = M.getGlobalVariable("llvm_gc_root_chain");
+  if (!Head) {
+    // If the root chain does not exist, insert a new one with linkonce
+    // linkage!
+    Head = new GlobalVariable(M, StackEntryPtrTy, false,
+                              GlobalValue::LinkOnceAnyLinkage,
+                              Constant::getNullValue(StackEntryPtrTy),
+                              "llvm_gc_root_chain");
+  } else if (Head->hasExternalLinkage() && Head->isDeclaration()) {
+    Head->setInitializer(Constant::getNullValue(StackEntryPtrTy));
+    Head->setLinkage(GlobalValue::LinkOnceAnyLinkage);
+  }
+
+  return true;
+}
+
+bool ShadowStackGC::IsNullValue(Value *V) {
+  if (Constant *C = dyn_cast<Constant>(V))
+    return C->isNullValue();
+  return false;
+}
+
+void ShadowStackGC::CollectRoots(Function &F) {
+  // FIXME: Account for original alignment. Could fragment the root array.
+  //   Approach 1: Null initialize empty slots at runtime. Yuck.
+  //   Approach 2: Emit a map of the array instead of just a count.
+
+  assert(Roots.empty() && "Not cleaned up?");
+
+  SmallVector<std::pair<CallInst*, AllocaInst*>, 16> MetaRoots;
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;)
+      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++))
+        if (Function *F = CI->getCalledFunction())
+          if (F->getIntrinsicID() == Intrinsic::gcroot) {
+            std::pair<CallInst*, AllocaInst*> Pair = std::make_pair(
+              CI, cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+            if (IsNullValue(CI->getArgOperand(1)))
+              Roots.push_back(Pair);
+            else
+              MetaRoots.push_back(Pair);
+          }
+
+  // Number roots with metadata (usually empty) at the beginning, so that the
+  // FrameMap::Meta array can be elided.
+  Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end());
+}
+
+GetElementPtrInst *
+ShadowStackGC::CreateGEP(LLVMContext &Context, IRBuilder<> &B, Value *BasePtr,
+                         int Idx, int Idx2, const char *Name) {
+  Value *Indices[] = { ConstantInt::get(Type::getInt32Ty(Context), 0),
+                       ConstantInt::get(Type::getInt32Ty(Context), Idx),
+                       ConstantInt::get(Type::getInt32Ty(Context), Idx2) };
+  Value* Val = B.CreateGEP(BasePtr, Indices, Name);
+
+  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+  return dyn_cast<GetElementPtrInst>(Val);
+}
+
+GetElementPtrInst *
+ShadowStackGC::CreateGEP(LLVMContext &Context, IRBuilder<> &B, Value *BasePtr,
+                         int Idx, const char *Name) {
+  Value *Indices[] = { ConstantInt::get(Type::getInt32Ty(Context), 0),
+                       ConstantInt::get(Type::getInt32Ty(Context), Idx) };
+  Value *Val = B.CreateGEP(BasePtr, Indices, Name);
+
+  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+  return dyn_cast<GetElementPtrInst>(Val);
+}
+
+/// runOnFunction - Insert code to maintain the shadow stack.
+bool ShadowStackGC::performCustomLowering(Function &F) {
+  LLVMContext &Context = F.getContext();
+  
+  // Find calls to llvm.gcroot.
+  CollectRoots(F);
+
+  // If there are no roots in this function, then there is no need to add a
+  // stack map entry for it.
+  if (Roots.empty())
+    return false;
+
+  // Build the constant map and figure the type of the shadow stack entry.
+  Value *FrameMap = GetFrameMap(F);
+  Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
+
+  // Build the shadow stack entry at the very start of the function.
+  BasicBlock::iterator IP = F.getEntryBlock().begin();
+  IRBuilder<> AtEntry(IP->getParent(), IP);
+
+  Instruction *StackEntry   = AtEntry.CreateAlloca(ConcreteStackEntryTy, 0,
+                                                   "gc_frame");
+
+  while (isa<AllocaInst>(IP)) ++IP;
+  AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+  // Initialize the map pointer and load the current head of the shadow stack.
+  Instruction *CurrentHead  = AtEntry.CreateLoad(Head, "gc_currhead");
+  Instruction *EntryMapPtr  = CreateGEP(Context, AtEntry, StackEntry,
+                                        0,1,"gc_frame.map");
+  AtEntry.CreateStore(FrameMap, EntryMapPtr);
+
+  // After all the allocas...
+  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+    // For each root, find the corresponding slot in the aggregate...
+    Value *SlotPtr = CreateGEP(Context, AtEntry, StackEntry, 1 + I, "gc_root");
+
+    // And use it in lieu of the alloca.
+    AllocaInst *OriginalAlloca = Roots[I].second;
+    SlotPtr->takeName(OriginalAlloca);
+    OriginalAlloca->replaceAllUsesWith(SlotPtr);
+  }
+
+  // Move past the original stores inserted by GCStrategy::InitRoots. This isn't
+  // really necessary (the collector would never see the intermediate state at
+  // runtime), but it's nicer not to push the half-initialized entry onto the
+  // shadow stack.
+  while (isa<StoreInst>(IP)) ++IP;
+  AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+  // Push the entry onto the shadow stack.
+  Instruction *EntryNextPtr = CreateGEP(Context, AtEntry,
+                                        StackEntry,0,0,"gc_frame.next");
+  Instruction *NewHeadVal   = CreateGEP(Context, AtEntry, 
+                                        StackEntry, 0, "gc_newhead");
+  AtEntry.CreateStore(CurrentHead, EntryNextPtr);
+  AtEntry.CreateStore(NewHeadVal, Head);
+
+  // For each instruction that escapes...
+  EscapeEnumerator EE(F, "gc_cleanup");
+  while (IRBuilder<> *AtExit = EE.Next()) {
+    // Pop the entry from the shadow stack. Don't reuse CurrentHead from
+    // AtEntry, since that would make the value live for the entire function.
+    Instruction *EntryNextPtr2 = CreateGEP(Context, *AtExit, StackEntry, 0, 0,
+                                           "gc_frame.next");
+    Value *SavedHead = AtExit->CreateLoad(EntryNextPtr2, "gc_savedhead");
+                       AtExit->CreateStore(SavedHead, Head);
+  }
+
+  // Delete the original allocas (which are no longer used) and the intrinsic
+  // calls (which are no longer valid). Doing this last avoids invalidating
+  // iterators.
+  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+    Roots[I].first->eraseFromParent();
+    Roots[I].second->eraseFromParent();
+  }
+
+  Roots.clear();
+  return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/ShrinkWrapping.cpp b/contrib/llvm/lib/CodeGen/ShrinkWrapping.cpp
new file mode 100644
index 000000000000..9ab491808fe5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ShrinkWrapping.cpp
@@ -0,0 +1,1152 @@
+//===-- ShrinkWrapping.cpp - Reduce spills/restores of callee-saved regs --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a shrink wrapping variant of prolog/epilog insertion:
+// - Spills and restores of callee-saved registers (CSRs) are placed in the
+//   machine CFG to tightly surround their uses so that execution paths that
+//   do not use CSRs do not pay the spill/restore penalty.
+//
+// - Avoiding placment of spills/restores in loops: if a CSR is used inside a
+//   loop the spills are placed in the loop preheader, and restores are
+//   placed in the loop exit nodes (the successors of loop _exiting_ nodes).
+//
+// - Covering paths without CSR uses:
+//   If a region in a CFG uses CSRs and has multiple entry and/or exit points,
+//   the use info for the CSRs inside the region is propagated outward in the
+//   CFG to ensure validity of the spill/restore placements. This decreases
+//   the effectiveness of shrink wrapping but does not require edge splitting
+//   in the machine CFG.
+//
+// This shrink wrapping implementation uses an iterative analysis to determine
+// which basic blocks require spills and restores for CSRs.
+//
+// This pass uses MachineDominators and MachineLoopInfo. Loop information
+// is used to prevent placement of callee-saved register spills/restores
+// in the bodies of loops.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "shrink-wrap"
+
+#include "PrologEpilogInserter.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <sstream>
+
+using namespace llvm;
+
+STATISTIC(numSRReduced, "Number of CSR spills+restores reduced.");
+
+// Shrink Wrapping:
+static cl::opt<bool>
+ShrinkWrapping("shrink-wrap",
+               cl::desc("Shrink wrap callee-saved register spills/restores"));
+
+// Shrink wrap only the specified function, a debugging aid.
+static cl::opt<std::string>
+ShrinkWrapFunc("shrink-wrap-func", cl::Hidden,
+               cl::desc("Shrink wrap the specified function"),
+               cl::value_desc("funcname"),
+               cl::init(""));
+
+// Debugging level for shrink wrapping.
+enum ShrinkWrapDebugLevel {
+  None, BasicInfo, Iterations, Details
+};
+
+static cl::opt<enum ShrinkWrapDebugLevel>
+ShrinkWrapDebugging("shrink-wrap-dbg", cl::Hidden,
+  cl::desc("Print shrink wrapping debugging information"),
+  cl::values(
+    clEnumVal(None      , "disable debug output"),
+    clEnumVal(BasicInfo , "print basic DF sets"),
+    clEnumVal(Iterations, "print SR sets for each iteration"),
+    clEnumVal(Details   , "print all DF sets"),
+    clEnumValEnd));
+
+
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  if (ShrinkWrapping || ShrinkWrapFunc != "") {
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<MachineDominatorTree>();
+  }
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<TargetPassConfig>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+//===----------------------------------------------------------------------===//
+//  ShrinkWrapping implementation
+//===----------------------------------------------------------------------===//
+
+// Convienences for dealing with machine loops.
+MachineBasicBlock* PEI::getTopLevelLoopPreheader(MachineLoop* LP) {
+  assert(LP && "Machine loop is NULL.");
+  MachineBasicBlock* PHDR = LP->getLoopPreheader();
+  MachineLoop* PLP = LP->getParentLoop();
+  while (PLP) {
+    PHDR = PLP->getLoopPreheader();
+    PLP = PLP->getParentLoop();
+  }
+  return PHDR;
+}
+
+MachineLoop* PEI::getTopLevelLoopParent(MachineLoop *LP) {
+  if (LP == 0)
+    return 0;
+  MachineLoop* PLP = LP->getParentLoop();
+  while (PLP) {
+    LP = PLP;
+    PLP = PLP->getParentLoop();
+  }
+  return LP;
+}
+
+bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
+  return (MBB && !MBB->empty() && MBB->back().isReturn());
+}
+
+// Initialize shrink wrapping DFA sets, called before iterations.
+void PEI::clearAnticAvailSets() {
+  AnticIn.clear();
+  AnticOut.clear();
+  AvailIn.clear();
+  AvailOut.clear();
+}
+
+// Clear all sets constructed by shrink wrapping.
+void PEI::clearAllSets() {
+  ReturnBlocks.clear();
+  clearAnticAvailSets();
+  UsedCSRegs.clear();
+  CSRUsed.clear();
+  TLLoops.clear();
+  CSRSave.clear();
+  CSRRestore.clear();
+}
+
+// Initialize all shrink wrapping data.
+void PEI::initShrinkWrappingInfo() {
+  clearAllSets();
+  EntryBlock = 0;
+#ifndef NDEBUG
+  HasFastExitPath = false;
+#endif
+  ShrinkWrapThisFunction = ShrinkWrapping;
+  // DEBUG: enable or disable shrink wrapping for the current function
+  // via --shrink-wrap-func=<funcname>.
+#ifndef NDEBUG
+  if (ShrinkWrapFunc != "") {
+    std::string MFName = MF->getName().str();
+    ShrinkWrapThisFunction = (MFName == ShrinkWrapFunc);
+  }
+#endif
+}
+
+
+/// placeCSRSpillsAndRestores - determine which MBBs of the function
+/// need save, restore code for callee-saved registers by doing a DF analysis
+/// similar to the one used in code motion (GVNPRE). This produces maps of MBBs
+/// to sets of registers (CSRs) for saves and restores. MachineLoopInfo
+/// is used to ensure that CSR save/restore code is not placed inside loops.
+/// This function computes the maps of MBBs -> CSRs to spill and restore
+/// in CSRSave, CSRRestore.
+///
+/// If shrink wrapping is not being performed, place all spills in
+/// the entry block, all restores in return blocks. In this case,
+/// CSRSave has a single mapping, CSRRestore has mappings for each
+/// return block.
+///
+void PEI::placeCSRSpillsAndRestores(MachineFunction &Fn) {
+
+  DEBUG(MF = &Fn);
+
+  initShrinkWrappingInfo();
+
+  DEBUG(if (ShrinkWrapThisFunction) {
+      dbgs() << "Place CSR spills/restores for "
+             << MF->getName() << "\n";
+    });
+
+  if (calculateSets(Fn))
+    placeSpillsAndRestores(Fn);
+}
+
+/// calcAnticInOut - calculate the anticipated in/out reg sets
+/// for the given MBB by looking forward in the MCFG at MBB's
+/// successors.
+///
+bool PEI::calcAnticInOut(MachineBasicBlock* MBB) {
+  bool changed = false;
+
+  // AnticOut[MBB] = INTERSECT(AnticIn[S] for S in SUCCESSORS(MBB))
+  SmallVector<MachineBasicBlock*, 4> successors;
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock* SUCC = *SI;
+    if (SUCC != MBB)
+      successors.push_back(SUCC);
+  }
+
+  unsigned i = 0, e = successors.size();
+  if (i != e) {
+    CSRegSet prevAnticOut = AnticOut[MBB];
+    MachineBasicBlock* SUCC = successors[i];
+
+    AnticOut[MBB] = AnticIn[SUCC];
+    for (++i; i != e; ++i) {
+      SUCC = successors[i];
+      AnticOut[MBB] &= AnticIn[SUCC];
+    }
+    if (prevAnticOut != AnticOut[MBB])
+      changed = true;
+  }
+
+  // AnticIn[MBB] = UNION(CSRUsed[MBB], AnticOut[MBB]);
+  CSRegSet prevAnticIn = AnticIn[MBB];
+  AnticIn[MBB] = CSRUsed[MBB] | AnticOut[MBB];
+  if (prevAnticIn != AnticIn[MBB])
+    changed = true;
+  return changed;
+}
+
+/// calcAvailInOut - calculate the available in/out reg sets
+/// for the given MBB by looking backward in the MCFG at MBB's
+/// predecessors.
+///
+bool PEI::calcAvailInOut(MachineBasicBlock* MBB) {
+  bool changed = false;
+
+  // AvailIn[MBB] = INTERSECT(AvailOut[P] for P in PREDECESSORS(MBB))
+  SmallVector<MachineBasicBlock*, 4> predecessors;
+  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PE = MBB->pred_end(); PI != PE; ++PI) {
+    MachineBasicBlock* PRED = *PI;
+    if (PRED != MBB)
+      predecessors.push_back(PRED);
+  }
+
+  unsigned i = 0, e = predecessors.size();
+  if (i != e) {
+    CSRegSet prevAvailIn = AvailIn[MBB];
+    MachineBasicBlock* PRED = predecessors[i];
+
+    AvailIn[MBB] = AvailOut[PRED];
+    for (++i; i != e; ++i) {
+      PRED = predecessors[i];
+      AvailIn[MBB] &= AvailOut[PRED];
+    }
+    if (prevAvailIn != AvailIn[MBB])
+      changed = true;
+  }
+
+  // AvailOut[MBB] = UNION(CSRUsed[MBB], AvailIn[MBB]);
+  CSRegSet prevAvailOut = AvailOut[MBB];
+  AvailOut[MBB] = CSRUsed[MBB] | AvailIn[MBB];
+  if (prevAvailOut != AvailOut[MBB])
+    changed = true;
+  return changed;
+}
+
+/// calculateAnticAvail - build the sets anticipated and available
+/// registers in the MCFG of the current function iteratively,
+/// doing a combined forward and backward analysis.
+///
+void PEI::calculateAnticAvail(MachineFunction &Fn) {
+  // Initialize data flow sets.
+  clearAnticAvailSets();
+
+  // Calculate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG.
+  bool changed = true;
+  unsigned iterations = 0;
+  while (changed) {
+    changed = false;
+    ++iterations;
+    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+
+      // Calculate anticipated in, out regs at MBB from
+      // anticipated at successors of MBB.
+      changed |= calcAnticInOut(MBB);
+
+      // Calculate available in, out regs at MBB from
+      // available at predecessors of MBB.
+      changed |= calcAvailInOut(MBB);
+    }
+  }
+
+  DEBUG({
+      if (ShrinkWrapDebugging >= Details) {
+        dbgs()
+          << "-----------------------------------------------------------\n"
+          << " Antic/Avail Sets:\n"
+          << "-----------------------------------------------------------\n"
+          << "iterations = " << iterations << "\n"
+          << "-----------------------------------------------------------\n"
+          << "MBB | USED | ANTIC_IN | ANTIC_OUT | AVAIL_IN | AVAIL_OUT\n"
+          << "-----------------------------------------------------------\n";
+
+        for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+             MBBI != MBBE; ++MBBI) {
+          MachineBasicBlock* MBB = MBBI;
+          dumpSets(MBB);
+        }
+
+        dbgs()
+          << "-----------------------------------------------------------\n";
+      }
+    });
+}
+
+/// propagateUsesAroundLoop - copy used register info from MBB to all blocks
+/// of the loop given by LP and its parent loops. This prevents spills/restores
+/// from being placed in the bodies of loops.
+///
+void PEI::propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP) {
+  if (! MBB || !LP)
+    return;
+
+  std::vector<MachineBasicBlock*> loopBlocks = LP->getBlocks();
+  for (unsigned i = 0, e = loopBlocks.size(); i != e; ++i) {
+    MachineBasicBlock* LBB = loopBlocks[i];
+    if (LBB == MBB)
+      continue;
+    if (CSRUsed[LBB].contains(CSRUsed[MBB]))
+      continue;
+    CSRUsed[LBB] |= CSRUsed[MBB];
+  }
+}
+
+/// calculateSets - collect the CSRs used in this function, compute
+/// the DF sets that describe the initial minimal regions in the
+/// Machine CFG around which CSR spills and restores must be placed.
+///
+/// Additionally, this function decides if shrink wrapping should
+/// be disabled for the current function, checking the following:
+///  1. the current function has more than 500 MBBs: heuristic limit
+///     on function size to reduce compile time impact of the current
+///     iterative algorithm.
+///  2. all CSRs are used in the entry block.
+///  3. all CSRs are used in all immediate successors of the entry block.
+///  4. all CSRs are used in a subset of blocks, each of which dominates
+///     all return blocks. These blocks, taken as a subgraph of the MCFG,
+///     are equivalent to the entry block since all execution paths pass
+///     through them.
+///
+bool PEI::calculateSets(MachineFunction &Fn) {
+  // Sets used to compute spill, restore placement sets.
+  const std::vector<CalleeSavedInfo> CSI =
+    Fn.getFrameInfo()->getCalleeSavedInfo();
+
+  // If no CSRs used, we are done.
+  if (CSI.empty()) {
+    DEBUG(if (ShrinkWrapThisFunction)
+            dbgs() << "DISABLED: " << Fn.getName()
+                   << ": uses no callee-saved registers\n");
+    return false;
+  }
+
+  // Save refs to entry and return blocks.
+  EntryBlock = Fn.begin();
+  for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
+       MBB != E; ++MBB)
+    if (isReturnBlock(MBB))
+      ReturnBlocks.push_back(MBB);
+
+  // Determine if this function has fast exit paths.
+  DEBUG(if (ShrinkWrapThisFunction)
+          findFastExitPath());
+
+  // Limit shrink wrapping via the current iterative bit vector
+  // implementation to functions with <= 500 MBBs.
+  if (Fn.size() > 500) {
+    DEBUG(if (ShrinkWrapThisFunction)
+            dbgs() << "DISABLED: " << Fn.getName()
+                   << ": too large (" << Fn.size() << " MBBs)\n");
+    ShrinkWrapThisFunction = false;
+  }
+
+  // Return now if not shrink wrapping.
+  if (! ShrinkWrapThisFunction)
+    return false;
+
+  // Collect set of used CSRs.
+  for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
+    UsedCSRegs.set(inx);
+  }
+
+  // Walk instructions in all MBBs, create CSRUsed[] sets, choose
+  // whether or not to shrink wrap this function.
+  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
+  MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
+  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
+
+  bool allCSRUsesInEntryBlock = true;
+  for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+       MBBI != MBBE; ++MBBI) {
+    MachineBasicBlock* MBB = MBBI;
+    for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end(); ++I) {
+      for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
+        unsigned Reg = CSI[inx].getReg();
+        // If instruction I reads or modifies Reg, add it to UsedCSRegs,
+        // CSRUsed map for the current block.
+        for (unsigned opInx = 0, opEnd = I->getNumOperands();
+             opInx != opEnd; ++opInx) {
+          const MachineOperand &MO = I->getOperand(opInx);
+          if (! (MO.isReg() && (MO.isUse() || MO.isDef())))
+            continue;
+          unsigned MOReg = MO.getReg();
+          if (!MOReg)
+            continue;
+          if (MOReg == Reg ||
+              (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+               TargetRegisterInfo::isPhysicalRegister(Reg) &&
+               TRI->isSubRegister(Reg, MOReg))) {
+            // CSR Reg is defined/used in block MBB.
+            CSRUsed[MBB].set(inx);
+            // Check for uses in EntryBlock.
+            if (MBB != EntryBlock)
+              allCSRUsesInEntryBlock = false;
+          }
+        }
+      }
+    }
+
+    if (CSRUsed[MBB].empty())
+      continue;
+
+    // Propagate CSRUsed[MBB] in loops
+    if (MachineLoop* LP = LI.getLoopFor(MBB)) {
+      // Add top level loop to work list.
+      MachineBasicBlock* HDR = getTopLevelLoopPreheader(LP);
+      MachineLoop* PLP = getTopLevelLoopParent(LP);
+
+      if (! HDR) {
+        HDR = PLP->getHeader();
+        assert(HDR->pred_size() > 0 && "Loop header has no predecessors?");
+        MachineBasicBlock::pred_iterator PI = HDR->pred_begin();
+        HDR = *PI;
+      }
+      TLLoops[HDR] = PLP;
+
+      // Push uses from inside loop to its parent loops,
+      // or to all other MBBs in its loop.
+      if (LP->getLoopDepth() > 1) {
+        for (MachineLoop* PLP = LP->getParentLoop(); PLP;
+             PLP = PLP->getParentLoop()) {
+          propagateUsesAroundLoop(MBB, PLP);
+        }
+      } else {
+        propagateUsesAroundLoop(MBB, LP);
+      }
+    }
+  }
+
+  if (allCSRUsesInEntryBlock) {
+    DEBUG(dbgs() << "DISABLED: " << Fn.getName()
+                 << ": all CSRs used in EntryBlock\n");
+    ShrinkWrapThisFunction = false;
+  } else {
+    bool allCSRsUsedInEntryFanout = true;
+    for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
+           SE = EntryBlock->succ_end(); SI != SE; ++SI) {
+      MachineBasicBlock* SUCC = *SI;
+      if (CSRUsed[SUCC] != UsedCSRegs)
+        allCSRsUsedInEntryFanout = false;
+    }
+    if (allCSRsUsedInEntryFanout) {
+      DEBUG(dbgs() << "DISABLED: " << Fn.getName()
+                   << ": all CSRs used in imm successors of EntryBlock\n");
+      ShrinkWrapThisFunction = false;
+    }
+  }
+
+  if (ShrinkWrapThisFunction) {
+    // Check if MBB uses CSRs and dominates all exit nodes.
+    // Such nodes are equiv. to the entry node w.r.t.
+    // CSR uses: every path through the function must
+    // pass through this node. If each CSR is used at least
+    // once by these nodes, shrink wrapping is disabled.
+    CSRegSet CSRUsedInChokePoints;
+    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+      if (MBB == EntryBlock || CSRUsed[MBB].empty() || MBB->succ_size() < 1)
+        continue;
+      bool dominatesExitNodes = true;
+      for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
+        if (! DT.dominates(MBB, ReturnBlocks[ri])) {
+          dominatesExitNodes = false;
+          break;
+        }
+      if (dominatesExitNodes) {
+        CSRUsedInChokePoints |= CSRUsed[MBB];
+        if (CSRUsedInChokePoints == UsedCSRegs) {
+          DEBUG(dbgs() << "DISABLED: " << Fn.getName()
+                       << ": all CSRs used in choke point(s) at "
+                       << getBasicBlockName(MBB) << "\n");
+          ShrinkWrapThisFunction = false;
+          break;
+        }
+      }
+    }
+  }
+
+  // Return now if we have decided not to apply shrink wrapping
+  // to the current function.
+  if (! ShrinkWrapThisFunction)
+    return false;
+
+  DEBUG({
+      dbgs() << "ENABLED: " << Fn.getName();
+      if (HasFastExitPath)
+        dbgs() << " (fast exit path)";
+      dbgs() << "\n";
+      if (ShrinkWrapDebugging >= BasicInfo) {
+        dbgs() << "------------------------------"
+             << "-----------------------------\n";
+        dbgs() << "UsedCSRegs = " << stringifyCSRegSet(UsedCSRegs) << "\n";
+        if (ShrinkWrapDebugging >= Details) {
+          dbgs() << "------------------------------"
+               << "-----------------------------\n";
+          dumpAllUsed();
+        }
+      }
+    });
+
+  // Build initial DF sets to determine minimal regions in the
+  // Machine CFG around which CSRs must be spilled and restored.
+  calculateAnticAvail(Fn);
+
+  return true;
+}
+
+/// addUsesForMEMERegion - add uses of CSRs spilled or restored in
+/// multi-entry, multi-exit (MEME) regions so spill and restore
+/// placement will not break code that enters or leaves a
+/// shrink-wrapped region by inducing spills with no matching
+/// restores or restores with no matching spills. A MEME region
+/// is a subgraph of the MCFG with multiple entry edges, multiple
+/// exit edges, or both. This code propagates use information
+/// through the MCFG until all paths requiring spills and restores
+/// _outside_ the computed minimal placement regions have been covered.
+///
+bool PEI::addUsesForMEMERegion(MachineBasicBlock* MBB,
+                               SmallVector<MachineBasicBlock*, 4>& blks) {
+  if (MBB->succ_size() < 2 && MBB->pred_size() < 2) {
+    bool processThisBlock = false;
+    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+           SE = MBB->succ_end(); SI != SE; ++SI) {
+      MachineBasicBlock* SUCC = *SI;
+      if (SUCC->pred_size() > 1) {
+        processThisBlock = true;
+        break;
+      }
+    }
+    if (!CSRRestore[MBB].empty() && MBB->succ_size() > 0) {
+      for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+             PE = MBB->pred_end(); PI != PE; ++PI) {
+        MachineBasicBlock* PRED = *PI;
+        if (PRED->succ_size() > 1) {
+          processThisBlock = true;
+          break;
+        }
+      }
+    }
+    if (! processThisBlock)
+      return false;
+  }
+
+  CSRegSet prop;
+  if (!CSRSave[MBB].empty())
+    prop = CSRSave[MBB];
+  else if (!CSRRestore[MBB].empty())
+    prop = CSRRestore[MBB];
+  else
+    prop = CSRUsed[MBB];
+  if (prop.empty())
+    return false;
+
+  // Propagate selected bits to successors, predecessors of MBB.
+  bool addedUses = false;
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock* SUCC = *SI;
+    // Self-loop
+    if (SUCC == MBB)
+      continue;
+    if (! CSRUsed[SUCC].contains(prop)) {
+      CSRUsed[SUCC] |= prop;
+      addedUses = true;
+      blks.push_back(SUCC);
+      DEBUG(if (ShrinkWrapDebugging >= Iterations)
+              dbgs() << getBasicBlockName(MBB)
+                   << "(" << stringifyCSRegSet(prop) << ")->"
+                   << "successor " << getBasicBlockName(SUCC) << "\n");
+    }
+  }
+  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PE = MBB->pred_end(); PI != PE; ++PI) {
+    MachineBasicBlock* PRED = *PI;
+    // Self-loop
+    if (PRED == MBB)
+      continue;
+    if (! CSRUsed[PRED].contains(prop)) {
+      CSRUsed[PRED] |= prop;
+      addedUses = true;
+      blks.push_back(PRED);
+      DEBUG(if (ShrinkWrapDebugging >= Iterations)
+              dbgs() << getBasicBlockName(MBB)
+                   << "(" << stringifyCSRegSet(prop) << ")->"
+                   << "predecessor " << getBasicBlockName(PRED) << "\n");
+    }
+  }
+  return addedUses;
+}
+
+/// addUsesForTopLevelLoops - add uses for CSRs used inside top
+/// level loops to the exit blocks of those loops.
+///
+bool PEI::addUsesForTopLevelLoops(SmallVector<MachineBasicBlock*, 4>& blks) {
+  bool addedUses = false;
+
+  // Place restores for top level loops where needed.
+  for (DenseMap<MachineBasicBlock*, MachineLoop*>::iterator
+         I = TLLoops.begin(), E = TLLoops.end(); I != E; ++I) {
+    MachineBasicBlock* MBB = I->first;
+    MachineLoop* LP = I->second;
+    MachineBasicBlock* HDR = LP->getHeader();
+    SmallVector<MachineBasicBlock*, 4> exitBlocks;
+    CSRegSet loopSpills;
+
+    loopSpills = CSRSave[MBB];
+    if (CSRSave[MBB].empty()) {
+      loopSpills = CSRUsed[HDR];
+      assert(!loopSpills.empty() && "No CSRs used in loop?");
+    } else if (CSRRestore[MBB].contains(CSRSave[MBB]))
+      continue;
+
+    LP->getExitBlocks(exitBlocks);
+    assert(exitBlocks.size() > 0 && "Loop has no top level exit blocks?");
+    for (unsigned i = 0, e = exitBlocks.size(); i != e; ++i) {
+      MachineBasicBlock* EXB = exitBlocks[i];
+      if (! CSRUsed[EXB].contains(loopSpills)) {
+        CSRUsed[EXB] |= loopSpills;
+        addedUses = true;
+        DEBUG(if (ShrinkWrapDebugging >= Iterations)
+                dbgs() << "LOOP " << getBasicBlockName(MBB)
+                     << "(" << stringifyCSRegSet(loopSpills) << ")->"
+                     << getBasicBlockName(EXB) << "\n");
+        if (EXB->succ_size() > 1 || EXB->pred_size() > 1)
+          blks.push_back(EXB);
+      }
+    }
+  }
+  return addedUses;
+}
+
+/// calcSpillPlacements - determine which CSRs should be spilled
+/// in MBB using AnticIn sets of MBB's predecessors, keeping track
+/// of changes to spilled reg sets. Add MBB to the set of blocks
+/// that need to be processed for propagating use info to cover
+/// multi-entry/exit regions.
+///
+bool PEI::calcSpillPlacements(MachineBasicBlock* MBB,
+                              SmallVector<MachineBasicBlock*, 4> &blks,
+                              CSRegBlockMap &prevSpills) {
+  bool placedSpills = false;
+  // Intersect (CSRegs - AnticIn[P]) for P in Predecessors(MBB)
+  CSRegSet anticInPreds;
+  SmallVector<MachineBasicBlock*, 4> predecessors;
+  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PE = MBB->pred_end(); PI != PE; ++PI) {
+    MachineBasicBlock* PRED = *PI;
+    if (PRED != MBB)
+      predecessors.push_back(PRED);
+  }
+  unsigned i = 0, e = predecessors.size();
+  if (i != e) {
+    MachineBasicBlock* PRED = predecessors[i];
+    anticInPreds = UsedCSRegs - AnticIn[PRED];
+    for (++i; i != e; ++i) {
+      PRED = predecessors[i];
+      anticInPreds &= (UsedCSRegs - AnticIn[PRED]);
+    }
+  } else {
+    // Handle uses in entry blocks (which have no predecessors).
+    // This is necessary because the DFA formulation assumes the
+    // entry and (multiple) exit nodes cannot have CSR uses, which
+    // is not the case in the real world.
+    anticInPreds = UsedCSRegs;
+  }
+  // Compute spills required at MBB:
+  CSRSave[MBB] |= (AnticIn[MBB] - AvailIn[MBB]) & anticInPreds;
+
+  if (! CSRSave[MBB].empty()) {
+    if (MBB == EntryBlock) {
+      for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
+        CSRRestore[ReturnBlocks[ri]] |= CSRSave[MBB];
+    } else {
+      // Reset all regs spilled in MBB that are also spilled in EntryBlock.
+      if (CSRSave[EntryBlock].intersects(CSRSave[MBB])) {
+        CSRSave[MBB] = CSRSave[MBB] - CSRSave[EntryBlock];
+      }
+    }
+  }
+  placedSpills = (CSRSave[MBB] != prevSpills[MBB]);
+  prevSpills[MBB] = CSRSave[MBB];
+  // Remember this block for adding restores to successor
+  // blocks for multi-entry region.
+  if (placedSpills)
+    blks.push_back(MBB);
+
+  DEBUG(if (! CSRSave[MBB].empty() && ShrinkWrapDebugging >= Iterations)
+          dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
+               << stringifyCSRegSet(CSRSave[MBB]) << "\n");
+
+  return placedSpills;
+}
+
+/// calcRestorePlacements - determine which CSRs should be restored
+/// in MBB using AvailOut sets of MBB's succcessors, keeping track
+/// of changes to restored reg sets. Add MBB to the set of blocks
+/// that need to be processed for propagating use info to cover
+/// multi-entry/exit regions.
+///
+bool PEI::calcRestorePlacements(MachineBasicBlock* MBB,
+                                SmallVector<MachineBasicBlock*, 4> &blks,
+                                CSRegBlockMap &prevRestores) {
+  bool placedRestores = false;
+  // Intersect (CSRegs - AvailOut[S]) for S in Successors(MBB)
+  CSRegSet availOutSucc;
+  SmallVector<MachineBasicBlock*, 4> successors;
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock* SUCC = *SI;
+    if (SUCC != MBB)
+      successors.push_back(SUCC);
+  }
+  unsigned i = 0, e = successors.size();
+  if (i != e) {
+    MachineBasicBlock* SUCC = successors[i];
+    availOutSucc = UsedCSRegs - AvailOut[SUCC];
+    for (++i; i != e; ++i) {
+      SUCC = successors[i];
+      availOutSucc &= (UsedCSRegs - AvailOut[SUCC]);
+    }
+  } else {
+    if (! CSRUsed[MBB].empty() || ! AvailOut[MBB].empty()) {
+      // Handle uses in return blocks (which have no successors).
+      // This is necessary because the DFA formulation assumes the
+      // entry and (multiple) exit nodes cannot have CSR uses, which
+      // is not the case in the real world.
+      availOutSucc = UsedCSRegs;
+    }
+  }
+  // Compute restores required at MBB:
+  CSRRestore[MBB] |= (AvailOut[MBB] - AnticOut[MBB]) & availOutSucc;
+
+  // Postprocess restore placements at MBB.
+  // Remove the CSRs that are restored in the return blocks.
+  // Lest this be confusing, note that:
+  // CSRSave[EntryBlock] == CSRRestore[B] for all B in ReturnBlocks.
+  if (MBB->succ_size() && ! CSRRestore[MBB].empty()) {
+    if (! CSRSave[EntryBlock].empty())
+      CSRRestore[MBB] = CSRRestore[MBB] - CSRSave[EntryBlock];
+  }
+  placedRestores = (CSRRestore[MBB] != prevRestores[MBB]);
+  prevRestores[MBB] = CSRRestore[MBB];
+  // Remember this block for adding saves to predecessor
+  // blocks for multi-entry region.
+  if (placedRestores)
+    blks.push_back(MBB);
+
+  DEBUG(if (! CSRRestore[MBB].empty() && ShrinkWrapDebugging >= Iterations)
+          dbgs() << "RESTORE[" << getBasicBlockName(MBB) << "] = "
+               << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
+
+  return placedRestores;
+}
+
+/// placeSpillsAndRestores - place spills and restores of CSRs
+/// used in MBBs in minimal regions that contain the uses.
+///
+void PEI::placeSpillsAndRestores(MachineFunction &Fn) {
+  CSRegBlockMap prevCSRSave;
+  CSRegBlockMap prevCSRRestore;
+  SmallVector<MachineBasicBlock*, 4> cvBlocks, ncvBlocks;
+  bool changed = true;
+  unsigned iterations = 0;
+
+  // Iterate computation of spill and restore placements in the MCFG until:
+  //   1. CSR use info has been fully propagated around the MCFG, and
+  //   2. computation of CSRSave[], CSRRestore[] reach fixed points.
+  while (changed) {
+    changed = false;
+    ++iterations;
+
+    DEBUG(if (ShrinkWrapDebugging >= Iterations)
+            dbgs() << "iter " << iterations
+                 << " --------------------------------------------------\n");
+
+    // Calculate CSR{Save,Restore} sets using Antic, Avail on the MCFG,
+    // which determines the placements of spills and restores.
+    // Keep track of changes to spills, restores in each iteration to
+    // minimize the total iterations.
+    bool SRChanged = false;
+    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+
+      // Place spills for CSRs in MBB.
+      SRChanged |= calcSpillPlacements(MBB, cvBlocks, prevCSRSave);
+
+      // Place restores for CSRs in MBB.
+      SRChanged |= calcRestorePlacements(MBB, cvBlocks, prevCSRRestore);
+    }
+
+    // Add uses of CSRs used inside loops where needed.
+    changed |= addUsesForTopLevelLoops(cvBlocks);
+
+    // Add uses for CSRs spilled or restored at branch, join points.
+    if (changed || SRChanged) {
+      while (! cvBlocks.empty()) {
+        MachineBasicBlock* MBB = cvBlocks.pop_back_val();
+        changed |= addUsesForMEMERegion(MBB, ncvBlocks);
+      }
+      if (! ncvBlocks.empty()) {
+        cvBlocks = ncvBlocks;
+        ncvBlocks.clear();
+      }
+    }
+
+    if (changed) {
+      calculateAnticAvail(Fn);
+      CSRSave.clear();
+      CSRRestore.clear();
+    }
+  }
+
+  // Check for effectiveness:
+  //  SR0 = {r | r in CSRSave[EntryBlock], CSRRestore[RB], RB in ReturnBlocks}
+  //  numSRReduced = |(UsedCSRegs - SR0)|, approx. SR0 by CSRSave[EntryBlock]
+  // Gives a measure of how many CSR spills have been moved from EntryBlock
+  // to minimal regions enclosing their uses.
+  CSRegSet notSpilledInEntryBlock = (UsedCSRegs - CSRSave[EntryBlock]);
+  unsigned numSRReducedThisFunc = notSpilledInEntryBlock.count();
+  numSRReduced += numSRReducedThisFunc;
+  DEBUG(if (ShrinkWrapDebugging >= BasicInfo) {
+      dbgs() << "-----------------------------------------------------------\n";
+      dbgs() << "total iterations = " << iterations << " ( "
+           << Fn.getName()
+           << " " << numSRReducedThisFunc
+           << " " << Fn.size()
+           << " )\n";
+      dbgs() << "-----------------------------------------------------------\n";
+      dumpSRSets();
+      dbgs() << "-----------------------------------------------------------\n";
+      if (numSRReducedThisFunc)
+        verifySpillRestorePlacement();
+    });
+}
+
+// Debugging methods.
+#ifndef NDEBUG
+/// findFastExitPath - debugging method used to detect functions
+/// with at least one path from the entry block to a return block
+/// directly or which has a very small number of edges.
+///
+void PEI::findFastExitPath() {
+  if (! EntryBlock)
+    return;
+  // Fina a path from EntryBlock to any return block that does not branch:
+  //        Entry
+  //          |     ...
+  //          v      |
+  //         B1<-----+
+  //          |
+  //          v
+  //       Return
+  for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
+         SE = EntryBlock->succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock* SUCC = *SI;
+
+    // Assume positive, disprove existence of fast path.
+    HasFastExitPath = true;
+
+    // Check the immediate successors.
+    if (isReturnBlock(SUCC)) {
+      if (ShrinkWrapDebugging >= BasicInfo)
+        dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
+             << "->" << getBasicBlockName(SUCC) << "\n";
+      break;
+    }
+    // Traverse df from SUCC, look for a branch block.
+    std::string exitPath = getBasicBlockName(SUCC);
+    for (df_iterator<MachineBasicBlock*> BI = df_begin(SUCC),
+           BE = df_end(SUCC); BI != BE; ++BI) {
+      MachineBasicBlock* SBB = *BI;
+      // Reject paths with branch nodes.
+      if (SBB->succ_size() > 1) {
+        HasFastExitPath = false;
+        break;
+      }
+      exitPath += "->" + getBasicBlockName(SBB);
+    }
+    if (HasFastExitPath) {
+      if (ShrinkWrapDebugging >= BasicInfo)
+        dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
+             << "->" << exitPath << "\n";
+      break;
+    }
+  }
+}
+
+/// verifySpillRestorePlacement - check the current spill/restore
+/// sets for safety. Attempt to find spills without restores or
+/// restores without spills.
+/// Spills: walk df from each MBB in spill set ensuring that
+///         all CSRs spilled at MMBB are restored on all paths
+///         from MBB to all exit blocks.
+/// Restores: walk idf from each MBB in restore set ensuring that
+///           all CSRs restored at MBB are spilled on all paths
+///           reaching MBB.
+///
+void PEI::verifySpillRestorePlacement() {
+  unsigned numReturnBlocks = 0;
+  for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+       MBBI != MBBE; ++MBBI) {
+    MachineBasicBlock* MBB = MBBI;
+    if (isReturnBlock(MBB) || MBB->succ_size() == 0)
+      ++numReturnBlocks;
+  }
+  for (CSRegBlockMap::iterator BI = CSRSave.begin(),
+         BE = CSRSave.end(); BI != BE; ++BI) {
+    MachineBasicBlock* MBB = BI->first;
+    CSRegSet spilled = BI->second;
+    CSRegSet restored;
+
+    if (spilled.empty())
+      continue;
+
+    DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(spilled)
+                 << "  RESTORE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
+
+    if (CSRRestore[MBB].intersects(spilled)) {
+      restored |= (CSRRestore[MBB] & spilled);
+    }
+
+    // Walk depth first from MBB to find restores of all CSRs spilled at MBB:
+    // we must find restores for all spills w/no intervening spills on all
+    // paths from MBB to all return blocks.
+    for (df_iterator<MachineBasicBlock*> BI = df_begin(MBB),
+           BE = df_end(MBB); BI != BE; ++BI) {
+      MachineBasicBlock* SBB = *BI;
+      if (SBB == MBB)
+        continue;
+      // Stop when we encounter spills of any CSRs spilled at MBB that
+      // have not yet been seen to be restored.
+      if (CSRSave[SBB].intersects(spilled) &&
+          !restored.contains(CSRSave[SBB] & spilled))
+        break;
+      // Collect the CSRs spilled at MBB that are restored
+      // at this DF successor of MBB.
+      if (CSRRestore[SBB].intersects(spilled))
+        restored |= (CSRRestore[SBB] & spilled);
+      // If we are at a retun block, check that the restores
+      // we have seen so far exhaust the spills at MBB, then
+      // reset the restores.
+      if (isReturnBlock(SBB) || SBB->succ_size() == 0) {
+        if (restored != spilled) {
+          CSRegSet notRestored = (spilled - restored);
+          DEBUG(dbgs() << MF->getName() << ": "
+                       << stringifyCSRegSet(notRestored)
+                       << " spilled at " << getBasicBlockName(MBB)
+                       << " are never restored on path to return "
+                       << getBasicBlockName(SBB) << "\n");
+        }
+        restored.clear();
+      }
+    }
+  }
+
+  // Check restore placements.
+  for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
+         BE = CSRRestore.end(); BI != BE; ++BI) {
+    MachineBasicBlock* MBB = BI->first;
+    CSRegSet restored = BI->second;
+    CSRegSet spilled;
+
+    if (restored.empty())
+      continue;
+
+    DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(CSRSave[MBB])
+                 << "  RESTORE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(restored) << "\n");
+
+    if (CSRSave[MBB].intersects(restored)) {
+      spilled |= (CSRSave[MBB] & restored);
+    }
+    // Walk inverse depth first from MBB to find spills of all
+    // CSRs restored at MBB:
+    for (idf_iterator<MachineBasicBlock*> BI = idf_begin(MBB),
+           BE = idf_end(MBB); BI != BE; ++BI) {
+      MachineBasicBlock* PBB = *BI;
+      if (PBB == MBB)
+        continue;
+      // Stop when we encounter restores of any CSRs restored at MBB that
+      // have not yet been seen to be spilled.
+      if (CSRRestore[PBB].intersects(restored) &&
+          !spilled.contains(CSRRestore[PBB] & restored))
+        break;
+      // Collect the CSRs restored at MBB that are spilled
+      // at this DF predecessor of MBB.
+      if (CSRSave[PBB].intersects(restored))
+        spilled |= (CSRSave[PBB] & restored);
+    }
+    if (spilled != restored) {
+      CSRegSet notSpilled = (restored - spilled);
+      DEBUG(dbgs() << MF->getName() << ": "
+                   << stringifyCSRegSet(notSpilled)
+                   << " restored at " << getBasicBlockName(MBB)
+                   << " are never spilled\n");
+    }
+  }
+}
+
+// Debugging print methods.
+std::string PEI::getBasicBlockName(const MachineBasicBlock* MBB) {
+  if (!MBB)
+    return "";
+
+  if (MBB->getBasicBlock())
+    return MBB->getBasicBlock()->getName().str();
+
+  std::ostringstream name;
+  name << "_MBB_" << MBB->getNumber();
+  return name.str();
+}
+
+std::string PEI::stringifyCSRegSet(const CSRegSet& s) {
+  const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
+  const std::vector<CalleeSavedInfo> CSI =
+    MF->getFrameInfo()->getCalleeSavedInfo();
+
+  std::ostringstream srep;
+  if (CSI.size() == 0) {
+    srep << "[]";
+    return srep.str();
+  }
+  srep << "[";
+  CSRegSet::iterator I = s.begin(), E = s.end();
+  if (I != E) {
+    unsigned reg = CSI[*I].getReg();
+    srep << TRI->getName(reg);
+    for (++I; I != E; ++I) {
+      reg = CSI[*I].getReg();
+      srep << ",";
+      srep << TRI->getName(reg);
+    }
+  }
+  srep << "]";
+  return srep.str();
+}
+
+void PEI::dumpSet(const CSRegSet& s) {
+  DEBUG(dbgs() << stringifyCSRegSet(s) << "\n");
+}
+
+void PEI::dumpUsed(MachineBasicBlock* MBB) {
+  DEBUG({
+      if (MBB)
+        dbgs() << "CSRUsed[" << getBasicBlockName(MBB) << "] = "
+               << stringifyCSRegSet(CSRUsed[MBB])  << "\n";
+    });
+}
+
+void PEI::dumpAllUsed() {
+    for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+      dumpUsed(MBB);
+    }
+}
+
+void PEI::dumpSets(MachineBasicBlock* MBB) {
+  DEBUG({
+      if (MBB)
+        dbgs() << getBasicBlockName(MBB)           << " | "
+               << stringifyCSRegSet(CSRUsed[MBB])  << " | "
+               << stringifyCSRegSet(AnticIn[MBB])  << " | "
+               << stringifyCSRegSet(AnticOut[MBB]) << " | "
+               << stringifyCSRegSet(AvailIn[MBB])  << " | "
+               << stringifyCSRegSet(AvailOut[MBB]) << "\n";
+    });
+}
+
+void PEI::dumpSets1(MachineBasicBlock* MBB) {
+  DEBUG({
+      if (MBB)
+        dbgs() << getBasicBlockName(MBB)             << " | "
+               << stringifyCSRegSet(CSRUsed[MBB])    << " | "
+               << stringifyCSRegSet(AnticIn[MBB])    << " | "
+               << stringifyCSRegSet(AnticOut[MBB])   << " | "
+               << stringifyCSRegSet(AvailIn[MBB])    << " | "
+               << stringifyCSRegSet(AvailOut[MBB])   << " | "
+               << stringifyCSRegSet(CSRSave[MBB])    << " | "
+               << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
+    });
+}
+
+void PEI::dumpAllSets() {
+    for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+         MBBI != MBBE; ++MBBI) {
+      MachineBasicBlock* MBB = MBBI;
+      dumpSets1(MBB);
+    }
+}
+
+void PEI::dumpSRSets() {
+  DEBUG({
+      for (MachineFunction::iterator MBB = MF->begin(), E = MF->end();
+           MBB != E; ++MBB) {
+        if (!CSRSave[MBB].empty()) {
+          dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(CSRSave[MBB]);
+          if (CSRRestore[MBB].empty())
+            dbgs() << '\n';
+        }
+
+        if (!CSRRestore[MBB].empty() && !CSRSave[MBB].empty())
+          dbgs() << "    "
+                 << "RESTORE[" << getBasicBlockName(MBB) << "] = "
+                 << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
+      }
+    });
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
new file mode 100644
index 000000000000..3903743878b4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -0,0 +1,502 @@
+//===- SjLjEHPrepare.cpp - Eliminate Invoke & Unwind instructions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which use SjLj
+// based exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sjljehprepare"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+STATISTIC(NumSpilled, "Number of registers live across unwind edges");
+
+namespace {
+  class SjLjEHPrepare : public FunctionPass {
+    const TargetLoweringBase *TLI;
+    Type *FunctionContextTy;
+    Constant *RegisterFn;
+    Constant *UnregisterFn;
+    Constant *BuiltinSetjmpFn;
+    Constant *FrameAddrFn;
+    Constant *StackAddrFn;
+    Constant *StackRestoreFn;
+    Constant *LSDAAddrFn;
+    Value *PersonalityFn;
+    Constant *CallSiteFn;
+    Constant *FuncCtxFn;
+    AllocaInst *FuncCtx;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit SjLjEHPrepare(const TargetLoweringBase *tli = NULL)
+      : FunctionPass(ID), TLI(tli) { }
+    bool doInitialization(Module &M);
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
+    const char *getPassName() const {
+      return "SJLJ Exception Handling preparation";
+    }
+
+  private:
+    bool setupEntryBlockAndCallSites(Function &F);
+    void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
+                              Value *SelVal);
+    Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads);
+    void lowerIncomingArguments(Function &F);
+    void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst*> Invokes);
+    void insertCallSiteStore(Instruction *I, int Number);
+  };
+} // end anonymous namespace
+
+char SjLjEHPrepare::ID = 0;
+
+// Public Interface To the SjLjEHPrepare pass.
+FunctionPass *llvm::createSjLjEHPreparePass(const TargetLoweringBase *TLI) {
+  return new SjLjEHPrepare(TLI);
+}
+// doInitialization - Set up decalarations and types needed to process
+// exceptions.
+bool SjLjEHPrepare::doInitialization(Module &M) {
+  // Build the function context structure.
+  // builtin_setjmp uses a five word jbuf
+  Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
+  Type *Int32Ty = Type::getInt32Ty(M.getContext());
+  FunctionContextTy =
+    StructType::get(VoidPtrTy,                        // __prev
+                    Int32Ty,                          // call_site
+                    ArrayType::get(Int32Ty, 4),       // __data
+                    VoidPtrTy,                        // __personality
+                    VoidPtrTy,                        // __lsda
+                    ArrayType::get(VoidPtrTy, 5),     // __jbuf
+                    NULL);
+  RegisterFn = M.getOrInsertFunction("_Unwind_SjLj_Register",
+                                     Type::getVoidTy(M.getContext()),
+                                     PointerType::getUnqual(FunctionContextTy),
+                                     (Type *)0);
+  UnregisterFn =
+    M.getOrInsertFunction("_Unwind_SjLj_Unregister",
+                          Type::getVoidTy(M.getContext()),
+                          PointerType::getUnqual(FunctionContextTy),
+                          (Type *)0);
+  FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
+  StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
+  StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
+  BuiltinSetjmpFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setjmp);
+  LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda);
+  CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite);
+  FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
+  PersonalityFn = 0;
+
+  return true;
+}
+
+/// insertCallSiteStore - Insert a store of the call-site value to the
+/// function context
+void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) {
+  IRBuilder<> Builder(I);
+
+  // Get a reference to the call_site field.
+  Type *Int32Ty = Type::getInt32Ty(I->getContext());
+  Value *Zero = ConstantInt::get(Int32Ty, 0);
+  Value *One = ConstantInt::get(Int32Ty, 1);
+  Value *Idxs[2] = { Zero, One };
+  Value *CallSite = Builder.CreateGEP(FuncCtx, Idxs, "call_site");
+
+  // Insert a store of the call-site number
+  ConstantInt *CallSiteNoC = ConstantInt::get(Type::getInt32Ty(I->getContext()),
+                                              Number);
+  Builder.CreateStore(CallSiteNoC, CallSite, true/*volatile*/);
+}
+
+/// MarkBlocksLiveIn - Insert BB and all of its predescessors into LiveBBs until
+/// we reach blocks we've already seen.
+static void MarkBlocksLiveIn(BasicBlock *BB,
+                             SmallPtrSet<BasicBlock*, 64> &LiveBBs) {
+  if (!LiveBBs.insert(BB)) return; // already been here.
+
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+    MarkBlocksLiveIn(*PI, LiveBBs);
+}
+
+/// substituteLPadValues - Substitute the values returned by the landingpad
+/// instruction with those returned by the personality function.
+void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
+                                         Value *SelVal) {
+  SmallVector<Value*, 8> UseWorkList(LPI->use_begin(), LPI->use_end());
+  while (!UseWorkList.empty()) {
+    Value *Val = UseWorkList.pop_back_val();
+    ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Val);
+    if (!EVI) continue;
+    if (EVI->getNumIndices() != 1) continue;
+    if (*EVI->idx_begin() == 0)
+      EVI->replaceAllUsesWith(ExnVal);
+    else if (*EVI->idx_begin() == 1)
+      EVI->replaceAllUsesWith(SelVal);
+    if (EVI->getNumUses() == 0)
+      EVI->eraseFromParent();
+  }
+
+  if (LPI->getNumUses() == 0)  return;
+
+  // There are still some uses of LPI. Construct an aggregate with the exception
+  // values and replace the LPI with that aggregate.
+  Type *LPadType = LPI->getType();
+  Value *LPadVal = UndefValue::get(LPadType);
+  IRBuilder<>
+    Builder(llvm::next(BasicBlock::iterator(cast<Instruction>(SelVal))));
+  LPadVal = Builder.CreateInsertValue(LPadVal, ExnVal, 0, "lpad.val");
+  LPadVal = Builder.CreateInsertValue(LPadVal, SelVal, 1, "lpad.val");
+
+  LPI->replaceAllUsesWith(LPadVal);
+}
+
+/// setupFunctionContext - Allocate the function context on the stack and fill
+/// it with all of the data that we know at this point.
+Value *SjLjEHPrepare::
+setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
+  BasicBlock *EntryBB = F.begin();
+
+  // Create an alloca for the incoming jump buffer ptr and the new jump buffer
+  // that needs to be restored on all exits from the function. This is an alloca
+  // because the value needs to be added to the global context list.
+  unsigned Align =
+    TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
+  FuncCtx =
+    new AllocaInst(FunctionContextTy, 0, Align, "fn_context", EntryBB->begin());
+
+  // Fill in the function context structure.
+  for (unsigned I = 0, E = LPads.size(); I != E; ++I) {
+    LandingPadInst *LPI = LPads[I];
+    IRBuilder<> Builder(LPI->getParent()->getFirstInsertionPt());
+
+    // Reference the __data field.
+    Value *FCData = Builder.CreateConstGEP2_32(FuncCtx, 0, 2, "__data");
+
+    // The exception values come back in context->__data[0].
+    Value *ExceptionAddr = Builder.CreateConstGEP2_32(FCData, 0, 0,
+                                                      "exception_gep");
+    Value *ExnVal = Builder.CreateLoad(ExceptionAddr, true, "exn_val");
+    ExnVal = Builder.CreateIntToPtr(ExnVal, Builder.getInt8PtrTy());
+
+    Value *SelectorAddr = Builder.CreateConstGEP2_32(FCData, 0, 1,
+                                                     "exn_selector_gep");
+    Value *SelVal = Builder.CreateLoad(SelectorAddr, true, "exn_selector_val");
+
+    substituteLPadValues(LPI, ExnVal, SelVal);
+  }
+
+  // Personality function
+  IRBuilder<> Builder(EntryBB->getTerminator());
+  if (!PersonalityFn)
+    PersonalityFn = LPads[0]->getPersonalityFn();
+  Value *PersonalityFieldPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 3,
+                                                          "pers_fn_gep");
+  Builder.CreateStore(PersonalityFn, PersonalityFieldPtr, /*isVolatile=*/true);
+
+  // LSDA address
+  Value *LSDA = Builder.CreateCall(LSDAAddrFn, "lsda_addr");
+  Value *LSDAFieldPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 4, "lsda_gep");
+  Builder.CreateStore(LSDA, LSDAFieldPtr, /*isVolatile=*/true);
+
+  return FuncCtx;
+}
+
+/// lowerIncomingArguments - To avoid having to handle incoming arguments
+/// specially, we lower each arg to a copy instruction in the entry block. This
+/// ensures that the argument value itself cannot be live out of the entry
+/// block.
+void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
+  BasicBlock::iterator AfterAllocaInsPt = F.begin()->begin();
+  while (isa<AllocaInst>(AfterAllocaInsPt) &&
+         isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsPt)->getArraySize()))
+    ++AfterAllocaInsPt;
+
+  for (Function::arg_iterator
+         AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) {
+    Type *Ty = AI->getType();
+
+    // Aggregate types can't be cast, but are legal argument types, so we have
+    // to handle them differently. We use an extract/insert pair as a
+    // lightweight method to achieve the same goal.
+    if (isa<StructType>(Ty) || isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
+      Instruction *EI = ExtractValueInst::Create(AI, 0, "", AfterAllocaInsPt);
+      Instruction *NI = InsertValueInst::Create(AI, EI, 0);
+      NI->insertAfter(EI);
+      AI->replaceAllUsesWith(NI);
+
+      // Set the operand of the instructions back to the AllocaInst.
+      EI->setOperand(0, AI);
+      NI->setOperand(0, AI);
+    } else {
+      // This is always a no-op cast because we're casting AI to AI->getType()
+      // so src and destination types are identical. BitCast is the only
+      // possibility.
+      CastInst *NC =
+        new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
+                        AfterAllocaInsPt);
+      AI->replaceAllUsesWith(NC);
+
+      // Set the operand of the cast instruction back to the AllocaInst.
+      // Normally it's forbidden to replace a CastInst's operand because it
+      // could cause the opcode to reflect an illegal conversion. However, we're
+      // replacing it here with the same value it was constructed with.  We do
+      // this because the above replaceAllUsesWith() clobbered the operand, but
+      // we want this one to remain.
+      NC->setOperand(0, AI);
+    }
+  }
+}
+
+/// lowerAcrossUnwindEdges - Find all variables which are alive across an unwind
+/// edge and spill them.
+void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
+                                           ArrayRef<InvokeInst*> Invokes) {
+  // Finally, scan the code looking for instructions with bad live ranges.
+  for (Function::iterator
+         BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
+    for (BasicBlock::iterator
+           II = BB->begin(), IIE = BB->end(); II != IIE; ++II) {
+      // Ignore obvious cases we don't have to handle. In particular, most
+      // instructions either have no uses or only have a single use inside the
+      // current block. Ignore them quickly.
+      Instruction *Inst = II;
+      if (Inst->use_empty()) continue;
+      if (Inst->hasOneUse() &&
+          cast<Instruction>(Inst->use_back())->getParent() == BB &&
+          !isa<PHINode>(Inst->use_back())) continue;
+
+      // If this is an alloca in the entry block, it's not a real register
+      // value.
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+        if (isa<ConstantInt>(AI->getArraySize()) && BB == F.begin())
+          continue;
+
+      // Avoid iterator invalidation by copying users to a temporary vector.
+      SmallVector<Instruction*, 16> Users;
+      for (Value::use_iterator
+             UI = Inst->use_begin(), E = Inst->use_end(); UI != E; ++UI) {
+        Instruction *User = cast<Instruction>(*UI);
+        if (User->getParent() != BB || isa<PHINode>(User))
+          Users.push_back(User);
+      }
+
+      // Find all of the blocks that this value is live in.
+      SmallPtrSet<BasicBlock*, 64> LiveBBs;
+      LiveBBs.insert(Inst->getParent());
+      while (!Users.empty()) {
+        Instruction *U = Users.back();
+        Users.pop_back();
+
+        if (!isa<PHINode>(U)) {
+          MarkBlocksLiveIn(U->getParent(), LiveBBs);
+        } else {
+          // Uses for a PHI node occur in their predecessor block.
+          PHINode *PN = cast<PHINode>(U);
+          for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+            if (PN->getIncomingValue(i) == Inst)
+              MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs);
+        }
+      }
+
+      // Now that we know all of the blocks that this thing is live in, see if
+      // it includes any of the unwind locations.
+      bool NeedsSpill = false;
+      for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+        BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
+        if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
+          DEBUG(dbgs() << "SJLJ Spill: " << *Inst << " around "
+                << UnwindBlock->getName() << "\n");
+          NeedsSpill = true;
+          break;
+        }
+      }
+
+      // If we decided we need a spill, do it.
+      // FIXME: Spilling this way is overkill, as it forces all uses of
+      // the value to be reloaded from the stack slot, even those that aren't
+      // in the unwind blocks. We should be more selective.
+      if (NeedsSpill) {
+        DemoteRegToStack(*Inst, true);
+        ++NumSpilled;
+      }
+    }
+  }
+
+  // Go through the landing pads and remove any PHIs there.
+  for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+    BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
+    LandingPadInst *LPI = UnwindBlock->getLandingPadInst();
+
+    // Place PHIs into a set to avoid invalidating the iterator.
+    SmallPtrSet<PHINode*, 8> PHIsToDemote;
+    for (BasicBlock::iterator
+           PN = UnwindBlock->begin(); isa<PHINode>(PN); ++PN)
+      PHIsToDemote.insert(cast<PHINode>(PN));
+    if (PHIsToDemote.empty()) continue;
+
+    // Demote the PHIs to the stack.
+    for (SmallPtrSet<PHINode*, 8>::iterator
+           I = PHIsToDemote.begin(), E = PHIsToDemote.end(); I != E; ++I)
+      DemotePHIToStack(*I);
+
+    // Move the landingpad instruction back to the top of the landing pad block.
+    LPI->moveBefore(UnwindBlock->begin());
+  }
+}
+
+/// setupEntryBlockAndCallSites - Setup the entry block by creating and filling
+/// the function context and marking the call sites with the appropriate
+/// values. These values are used by the DWARF EH emitter.
+bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
+  SmallVector<ReturnInst*, 16> Returns;
+  SmallVector<InvokeInst*, 16> Invokes;
+  SmallSetVector<LandingPadInst*, 16> LPads;
+
+  // Look through the terminators of the basic blocks to find invokes.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      if (Function *Callee = II->getCalledFunction())
+        if (Callee->isIntrinsic() &&
+            Callee->getIntrinsicID() == Intrinsic::donothing) {
+          // Remove the NOP invoke.
+          BranchInst::Create(II->getNormalDest(), II);
+          II->eraseFromParent();
+          continue;
+        }
+
+      Invokes.push_back(II);
+      LPads.insert(II->getUnwindDest()->getLandingPadInst());
+    } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      Returns.push_back(RI);
+    }
+
+  if (Invokes.empty()) return false;
+
+  NumInvokes += Invokes.size();
+
+  lowerIncomingArguments(F);
+  lowerAcrossUnwindEdges(F, Invokes);
+
+  Value *FuncCtx =
+    setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
+  BasicBlock *EntryBB = F.begin();
+  IRBuilder<> Builder(EntryBB->getTerminator());
+
+  // Get a reference to the jump buffer.
+  Value *JBufPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 5, "jbuf_gep");
+
+  // Save the frame pointer.
+  Value *FramePtr = Builder.CreateConstGEP2_32(JBufPtr, 0, 0, "jbuf_fp_gep");
+
+  Value *Val = Builder.CreateCall(FrameAddrFn, Builder.getInt32(0), "fp");
+  Builder.CreateStore(Val, FramePtr, /*isVolatile=*/true);
+
+  // Save the stack pointer.
+  Value *StackPtr = Builder.CreateConstGEP2_32(JBufPtr, 0, 2, "jbuf_sp_gep");
+
+  Val = Builder.CreateCall(StackAddrFn, "sp");
+  Builder.CreateStore(Val, StackPtr, /*isVolatile=*/true);
+
+  // Call the setjmp instrinsic. It fills in the rest of the jmpbuf.
+  Value *SetjmpArg = Builder.CreateBitCast(JBufPtr, Builder.getInt8PtrTy());
+  Builder.CreateCall(BuiltinSetjmpFn, SetjmpArg);
+
+  // Store a pointer to the function context so that the back-end will know
+  // where to look for it.
+  Value *FuncCtxArg = Builder.CreateBitCast(FuncCtx, Builder.getInt8PtrTy());
+  Builder.CreateCall(FuncCtxFn, FuncCtxArg);
+
+  // At this point, we are all set up, update the invoke instructions to mark
+  // their call_site values.
+  for (unsigned I = 0, E = Invokes.size(); I != E; ++I) {
+    insertCallSiteStore(Invokes[I], I + 1);
+
+    ConstantInt *CallSiteNum =
+      ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
+
+    // Record the call site value for the back end so it stays associated with
+    // the invoke.
+    CallInst::Create(CallSiteFn, CallSiteNum, "", Invokes[I]);
+  }
+
+  // Mark call instructions that aren't nounwind as no-action (call_site ==
+  // -1). Skip the entry block, as prior to then, no function context has been
+  // created for this function and any unexpected exceptions thrown will go
+  // directly to the caller's context, which is what we want anyway, so no need
+  // to do anything here.
+  for (Function::iterator BB = F.begin(), E = F.end(); ++BB != E;)
+    for (BasicBlock::iterator I = BB->begin(), end = BB->end(); I != end; ++I)
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        if (!CI->doesNotThrow())
+          insertCallSiteStore(CI, -1);
+      } else if (ResumeInst *RI = dyn_cast<ResumeInst>(I)) {
+        insertCallSiteStore(RI, -1);
+      }
+
+  // Register the function context and make sure it's known to not throw
+  CallInst *Register = CallInst::Create(RegisterFn, FuncCtx, "",
+                                        EntryBB->getTerminator());
+  Register->setDoesNotThrow();
+
+  // Following any allocas not in the entry block, update the saved SP in the
+  // jmpbuf to the new value.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (BB == F.begin())
+      continue;
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        if (CI->getCalledFunction() != StackRestoreFn)
+          continue;
+      } else if (!isa<AllocaInst>(I)) {
+        continue;
+      }
+      Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp");
+      StackAddr->insertAfter(I);
+      Instruction *StoreStackAddr = new StoreInst(StackAddr, StackPtr, true);
+      StoreStackAddr->insertAfter(StackAddr);
+    }
+  }
+
+  // Finally, for any returns from this function, if this function contains an
+  // invoke, add a call to unregister the function context.
+  for (unsigned I = 0, E = Returns.size(); I != E; ++I)
+    CallInst::Create(UnregisterFn, FuncCtx, "", Returns[I]);
+
+  return true;
+}
+
+bool SjLjEHPrepare::runOnFunction(Function &F) {
+  bool Res = setupEntryBlockAndCallSites(F);
+  return Res;
+}
diff --git a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
new file mode 100644
index 000000000000..20049a89d15d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -0,0 +1,250 @@
+//===-- SlotIndexes.cpp - Slot Indexes Pass  ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "slotindexes"
+
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+char SlotIndexes::ID = 0;
+INITIALIZE_PASS(SlotIndexes, "slotindexes",
+                "Slot index numbering", false, false)
+
+STATISTIC(NumLocalRenum,  "Number of local renumberings");
+STATISTIC(NumGlobalRenum, "Number of global renumberings");
+
+void SlotIndexes::getAnalysisUsage(AnalysisUsage &au) const {
+  au.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(au);
+}
+
+void SlotIndexes::releaseMemory() {
+  mi2iMap.clear();
+  MBBRanges.clear();
+  idx2MBBMap.clear();
+  indexList.clear();
+  ileAllocator.Reset();
+}
+
+bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
+
+  // Compute numbering as follows:
+  // Grab an iterator to the start of the index list.
+  // Iterate over all MBBs, and within each MBB all MIs, keeping the MI
+  // iterator in lock-step (though skipping it over indexes which have
+  // null pointers in the instruction field).
+  // At each iteration assert that the instruction pointed to in the index
+  // is the same one pointed to by the MI iterator. This
+
+  // FIXME: This can be simplified. The mi2iMap_, Idx2MBBMap, etc. should
+  // only need to be set up once after the first numbering is computed.
+
+  mf = &fn;
+
+  // Check that the list contains only the sentinal.
+  assert(indexList.empty() && "Index list non-empty at initial numbering?");
+  assert(idx2MBBMap.empty() &&
+         "Index -> MBB mapping non-empty at initial numbering?");
+  assert(MBBRanges.empty() &&
+         "MBB -> Index mapping non-empty at initial numbering?");
+  assert(mi2iMap.empty() &&
+         "MachineInstr -> Index mapping non-empty at initial numbering?");
+
+  unsigned index = 0;
+  MBBRanges.resize(mf->getNumBlockIDs());
+  idx2MBBMap.reserve(mf->size());
+
+  indexList.push_back(createEntry(0, index));
+
+  // Iterate over the function.
+  for (MachineFunction::iterator mbbItr = mf->begin(), mbbEnd = mf->end();
+       mbbItr != mbbEnd; ++mbbItr) {
+    MachineBasicBlock *mbb = &*mbbItr;
+
+    // Insert an index for the MBB start.
+    SlotIndex blockStartIndex(&indexList.back(), SlotIndex::Slot_Block);
+
+    for (MachineBasicBlock::iterator miItr = mbb->begin(), miEnd = mbb->end();
+         miItr != miEnd; ++miItr) {
+      MachineInstr *mi = miItr;
+      if (mi->isDebugValue())
+        continue;
+
+      // Insert a store index for the instr.
+      indexList.push_back(createEntry(mi, index += SlotIndex::InstrDist));
+
+      // Save this base index in the maps.
+      mi2iMap.insert(std::make_pair(mi, SlotIndex(&indexList.back(),
+                                                  SlotIndex::Slot_Block)));
+    }
+
+    // We insert one blank instructions between basic blocks.
+    indexList.push_back(createEntry(0, index += SlotIndex::InstrDist));
+
+    MBBRanges[mbb->getNumber()].first = blockStartIndex;
+    MBBRanges[mbb->getNumber()].second = SlotIndex(&indexList.back(),
+                                                   SlotIndex::Slot_Block);
+    idx2MBBMap.push_back(IdxMBBPair(blockStartIndex, mbb));
+  }
+
+  // Sort the Idx2MBBMap
+  std::sort(idx2MBBMap.begin(), idx2MBBMap.end(), Idx2MBBCompare());
+
+  DEBUG(mf->print(dbgs(), this));
+
+  // And we're done!
+  return false;
+}
+
+void SlotIndexes::renumberIndexes() {
+  // Renumber updates the index of every element of the index list.
+  DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n");
+  ++NumGlobalRenum;
+
+  unsigned index = 0;
+
+  for (IndexList::iterator I = indexList.begin(), E = indexList.end();
+       I != E; ++I) {
+    I->setIndex(index);
+    index += SlotIndex::InstrDist;
+  }
+}
+
+// Renumber indexes locally after curItr was inserted, but failed to get a new
+// index.
+void SlotIndexes::renumberIndexes(IndexList::iterator curItr) {
+  // Number indexes with half the default spacing so we can catch up quickly.
+  const unsigned Space = SlotIndex::InstrDist/2;
+  assert((Space & 3) == 0 && "InstrDist must be a multiple of 2*NUM");
+
+  IndexList::iterator startItr = prior(curItr);
+  unsigned index = startItr->getIndex();
+  do {
+    curItr->setIndex(index += Space);
+    ++curItr;
+    // If the next index is bigger, we have caught up.
+  } while (curItr != indexList.end() && curItr->getIndex() <= index);
+
+  DEBUG(dbgs() << "\n*** Renumbered SlotIndexes " << startItr->getIndex() << '-'
+               << index << " ***\n");
+  ++NumLocalRenum;
+}
+
+// Repair indexes after adding and removing instructions.
+void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator Begin,
+                                       MachineBasicBlock::iterator End) {
+  // FIXME: Is this really necessary? The only caller repairIntervalsForRange()
+  // does the same thing.
+  // Find anchor points, which are at the beginning/end of blocks or at
+  // instructions that already have indexes.
+  while (Begin != MBB->begin() && !hasIndex(Begin))
+    --Begin;
+  while (End != MBB->end() && !hasIndex(End))
+    ++End;
+
+  bool includeStart = (Begin == MBB->begin());
+  SlotIndex startIdx;
+  if (includeStart)
+    startIdx = getMBBStartIdx(MBB);
+  else
+    startIdx = getInstructionIndex(Begin);
+
+  SlotIndex endIdx;
+  if (End == MBB->end())
+    endIdx = getMBBEndIdx(MBB);
+  else
+    endIdx = getInstructionIndex(End);
+
+  // FIXME: Conceptually, this code is implementing an iterator on MBB that
+  // optionally includes an additional position prior to MBB->begin(), indicated
+  // by the includeStart flag. This is done so that we can iterate MIs in a MBB
+  // in parallel with SlotIndexes, but there should be a better way to do this.
+  IndexList::iterator ListB = startIdx.listEntry();
+  IndexList::iterator ListI = endIdx.listEntry();
+  MachineBasicBlock::iterator MBBI = End;
+  bool pastStart = false;
+  while (ListI != ListB || MBBI != Begin || (includeStart && !pastStart)) {
+    assert(ListI->getIndex() >= startIdx.getIndex() &&
+           (includeStart || !pastStart) &&
+           "Decremented past the beginning of region to repair.");
+
+    MachineInstr *SlotMI = ListI->getInstr();
+    MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? MBBI : 0;
+    bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart);
+
+    if (SlotMI == MI && !MBBIAtBegin) {
+      --ListI;
+      if (MBBI != Begin)
+        --MBBI;
+      else
+        pastStart = true;
+    } else if (MI && mi2iMap.find(MI) == mi2iMap.end()) {
+      if (MBBI != Begin)
+        --MBBI;
+      else
+        pastStart = true;
+    } else {
+      --ListI;
+      if (SlotMI)
+        removeMachineInstrFromMaps(SlotMI);
+    }
+  }
+
+  // In theory this could be combined with the previous loop, but it is tricky
+  // to update the IndexList while we are iterating it.
+  for (MachineBasicBlock::iterator I = End; I != Begin;) {
+    --I;
+    MachineInstr *MI = I;
+    if (!MI->isDebugValue() && mi2iMap.find(MI) == mi2iMap.end())
+      insertMachineInstrInMaps(MI);
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void SlotIndexes::dump() const {
+  for (IndexList::const_iterator itr = indexList.begin();
+       itr != indexList.end(); ++itr) {
+    dbgs() << itr->getIndex() << " ";
+
+    if (itr->getInstr() != 0) {
+      dbgs() << *itr->getInstr();
+    } else {
+      dbgs() << "\n";
+    }
+  }
+
+  for (unsigned i = 0, e = MBBRanges.size(); i != e; ++i)
+    dbgs() << "BB#" << i << "\t[" << MBBRanges[i].first << ';'
+           << MBBRanges[i].second << ")\n";
+}
+#endif
+
+// Print a SlotIndex to a raw_ostream.
+void SlotIndex::print(raw_ostream &os) const {
+  if (isValid())
+    os << listEntry()->getIndex() << "Berd"[getSlot()];
+  else
+    os << "invalid";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+// Dump a SlotIndex to stderr.
+void SlotIndex::dump() const {
+  print(dbgs());
+  dbgs() << "\n";
+}
+#endif
+
diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
new file mode 100644
index 000000000000..c5bbba3ffccc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -0,0 +1,381 @@
+//===-- SpillPlacement.cpp - Optimal Spill Code Placement -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the spill code placement analysis.
+//
+// Each edge bundle corresponds to a node in a Hopfield network. Constraints on
+// basic blocks are weighted by the block frequency and added to become the node
+// bias.
+//
+// Transparent basic blocks have the variable live through, but don't care if it
+// is spilled or in a register. These blocks become connections in the Hopfield
+// network, again weighted by block frequency.
+//
+// The Hopfield network minimizes (possibly locally) its energy function:
+//
+//   E = -sum_n V_n * ( B_n + sum_{n, m linked by b} V_m * F_b )
+//
+// The energy function represents the expected spill code execution frequency,
+// or the cost of spilling. This is a Lyapunov function which never increases
+// when a node is updated. It is guaranteed to converge to a local minimum.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "spillplacement"
+#include "SpillPlacement.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+
+char SpillPlacement::ID = 0;
+INITIALIZE_PASS_BEGIN(SpillPlacement, "spill-code-placement",
+                      "Spill Code Placement Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(SpillPlacement, "spill-code-placement",
+                    "Spill Code Placement Analysis", true, true)
+
+char &llvm::SpillPlacementID = SpillPlacement::ID;
+
+void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequiredTransitive<EdgeBundles>();
+  AU.addRequiredTransitive<MachineLoopInfo>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Node - Each edge bundle corresponds to a Hopfield node.
+///
+/// The node contains precomputed frequency data that only depends on the CFG,
+/// but Bias and Links are computed each time placeSpills is called.
+///
+/// The node Value is positive when the variable should be in a register. The
+/// value can change when linked nodes change, but convergence is very fast
+/// because all weights are positive.
+///
+struct SpillPlacement::Node {
+  /// Scale - Inverse block frequency feeding into[0] or out of[1] the bundle.
+  /// Ideally, these two numbers should be identical, but inaccuracies in the
+  /// block frequency estimates means that we need to normalize ingoing and
+  /// outgoing frequencies separately so they are commensurate.
+  float Scale[2];
+
+  /// Bias - Normalized contributions from non-transparent blocks.
+  /// A bundle connected to a MustSpill block has a huge negative bias,
+  /// otherwise it is a number in the range [-2;2].
+  float Bias;
+
+  /// Value - Output value of this node computed from the Bias and links.
+  /// This is always in the range [-1;1]. A positive number means the variable
+  /// should go in a register through this bundle.
+  float Value;
+
+  typedef SmallVector<std::pair<float, unsigned>, 4> LinkVector;
+
+  /// Links - (Weight, BundleNo) for all transparent blocks connecting to other
+  /// bundles. The weights are all positive and add up to at most 2, weights
+  /// from ingoing and outgoing nodes separately add up to a most 1. The weight
+  /// sum can be less than 2 when the variable is not live into / out of some
+  /// connected basic blocks.
+  LinkVector Links;
+
+  /// preferReg - Return true when this node prefers to be in a register.
+  bool preferReg() const {
+    // Undecided nodes (Value==0) go on the stack.
+    return Value > 0;
+  }
+
+  /// mustSpill - Return True if this node is so biased that it must spill.
+  bool mustSpill() const {
+    // Actually, we must spill if Bias < sum(weights).
+    // It may be worth it to compute the weight sum here?
+    return Bias < -2.0f;
+  }
+
+  /// Node - Create a blank Node.
+  Node() {
+    Scale[0] = Scale[1] = 0;
+  }
+
+  /// clear - Reset per-query data, but preserve frequencies that only depend on
+  // the CFG.
+  void clear() {
+    Bias = Value = 0;
+    Links.clear();
+  }
+
+  /// addLink - Add a link to bundle b with weight w.
+  /// out=0 for an ingoing link, and 1 for an outgoing link.
+  void addLink(unsigned b, float w, bool out) {
+    // Normalize w relative to all connected blocks from that direction.
+    w *= Scale[out];
+
+    // There can be multiple links to the same bundle, add them up.
+    for (LinkVector::iterator I = Links.begin(), E = Links.end(); I != E; ++I)
+      if (I->second == b) {
+        I->first += w;
+        return;
+      }
+    // This must be the first link to b.
+    Links.push_back(std::make_pair(w, b));
+  }
+
+  /// addBias - Bias this node from an ingoing[0] or outgoing[1] link.
+  /// Return the change to the total number of positive biases.
+  void addBias(float w, bool out) {
+    // Normalize w relative to all connected blocks from that direction.
+    w *= Scale[out];
+    Bias += w;
+  }
+
+  /// update - Recompute Value from Bias and Links. Return true when node
+  /// preference changes.
+  bool update(const Node nodes[]) {
+    // Compute the weighted sum of inputs.
+    float Sum = Bias;
+    for (LinkVector::iterator I = Links.begin(), E = Links.end(); I != E; ++I)
+      Sum += I->first * nodes[I->second].Value;
+
+    // The weighted sum is going to be in the range [-2;2]. Ideally, we should
+    // simply set Value = sign(Sum), but we will add a dead zone around 0 for
+    // two reasons:
+    //  1. It avoids arbitrary bias when all links are 0 as is possible during
+    //     initial iterations.
+    //  2. It helps tame rounding errors when the links nominally sum to 0.
+    const float Thres = 1e-4f;
+    bool Before = preferReg();
+    if (Sum < -Thres)
+      Value = -1;
+    else if (Sum > Thres)
+      Value = 1;
+    else
+      Value = 0;
+    return Before != preferReg();
+  }
+};
+
+bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  bundles = &getAnalysis<EdgeBundles>();
+  loops = &getAnalysis<MachineLoopInfo>();
+
+  assert(!nodes && "Leaking node array");
+  nodes = new Node[bundles->getNumBundles()];
+
+  // Compute total ingoing and outgoing block frequencies for all bundles.
+  BlockFrequency.resize(mf.getNumBlockIDs());
+  for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) {
+    float Freq = LiveIntervals::getSpillWeight(true, false,
+                                               loops->getLoopDepth(I));
+    unsigned Num = I->getNumber();
+    BlockFrequency[Num] = Freq;
+    nodes[bundles->getBundle(Num, 1)].Scale[0] += Freq;
+    nodes[bundles->getBundle(Num, 0)].Scale[1] += Freq;
+  }
+
+  // Scales are reciprocal frequencies.
+  for (unsigned i = 0, e = bundles->getNumBundles(); i != e; ++i)
+    for (unsigned d = 0; d != 2; ++d)
+      if (nodes[i].Scale[d] > 0)
+        nodes[i].Scale[d] = 1 / nodes[i].Scale[d];
+
+  // We never change the function.
+  return false;
+}
+
+void SpillPlacement::releaseMemory() {
+  delete[] nodes;
+  nodes = 0;
+}
+
+/// activate - mark node n as active if it wasn't already.
+void SpillPlacement::activate(unsigned n) {
+  if (ActiveNodes->test(n))
+    return;
+  ActiveNodes->set(n);
+  nodes[n].clear();
+
+  // Very large bundles usually come from big switches, indirect branches,
+  // landing pads, or loops with many 'continue' statements. It is difficult to
+  // allocate registers when so many different blocks are involved.
+  //
+  // Give a small negative bias to large bundles such that 1/32 of the
+  // connected blocks need to be interested before we consider expanding the
+  // region through the bundle. This helps compile time by limiting the number
+  // of blocks visited and the number of links in the Hopfield network.
+  if (bundles->getBlocks(n).size() > 100)
+    nodes[n].Bias = -0.0625f;
+}
+
+
+/// addConstraints - Compute node biases and weights from a set of constraints.
+/// Set a bit in NodeMask for each active node.
+void SpillPlacement::addConstraints(ArrayRef<BlockConstraint> LiveBlocks) {
+  for (ArrayRef<BlockConstraint>::iterator I = LiveBlocks.begin(),
+       E = LiveBlocks.end(); I != E; ++I) {
+    float Freq = getBlockFrequency(I->Number);
+    const float Bias[] = {
+      0,           // DontCare,
+      1,           // PrefReg,
+      -1,          // PrefSpill
+      0,           // PrefBoth
+      -HUGE_VALF   // MustSpill
+    };
+
+    // Live-in to block?
+    if (I->Entry != DontCare) {
+      unsigned ib = bundles->getBundle(I->Number, 0);
+      activate(ib);
+      nodes[ib].addBias(Freq * Bias[I->Entry], 1);
+    }
+
+    // Live-out from block?
+    if (I->Exit != DontCare) {
+      unsigned ob = bundles->getBundle(I->Number, 1);
+      activate(ob);
+      nodes[ob].addBias(Freq * Bias[I->Exit], 0);
+    }
+  }
+}
+
+/// addPrefSpill - Same as addConstraints(PrefSpill)
+void SpillPlacement::addPrefSpill(ArrayRef<unsigned> Blocks, bool Strong) {
+  for (ArrayRef<unsigned>::iterator I = Blocks.begin(), E = Blocks.end();
+       I != E; ++I) {
+    float Freq = getBlockFrequency(*I);
+    if (Strong)
+      Freq += Freq;
+    unsigned ib = bundles->getBundle(*I, 0);
+    unsigned ob = bundles->getBundle(*I, 1);
+    activate(ib);
+    activate(ob);
+    nodes[ib].addBias(-Freq, 1);
+    nodes[ob].addBias(-Freq, 0);
+  }
+}
+
+void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
+  for (ArrayRef<unsigned>::iterator I = Links.begin(), E = Links.end(); I != E;
+       ++I) {
+    unsigned Number = *I;
+    unsigned ib = bundles->getBundle(Number, 0);
+    unsigned ob = bundles->getBundle(Number, 1);
+
+    // Ignore self-loops.
+    if (ib == ob)
+      continue;
+    activate(ib);
+    activate(ob);
+    if (nodes[ib].Links.empty() && !nodes[ib].mustSpill())
+      Linked.push_back(ib);
+    if (nodes[ob].Links.empty() && !nodes[ob].mustSpill())
+      Linked.push_back(ob);
+    float Freq = getBlockFrequency(Number);
+    nodes[ib].addLink(ob, Freq, 1);
+    nodes[ob].addLink(ib, Freq, 0);
+  }
+}
+
+bool SpillPlacement::scanActiveBundles() {
+  Linked.clear();
+  RecentPositive.clear();
+  for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) {
+    nodes[n].update(nodes);
+    // A node that must spill, or a node without any links is not going to
+    // change its value ever again, so exclude it from iterations.
+    if (nodes[n].mustSpill())
+      continue;
+    if (!nodes[n].Links.empty())
+      Linked.push_back(n);
+    if (nodes[n].preferReg())
+      RecentPositive.push_back(n);
+  }
+  return !RecentPositive.empty();
+}
+
+/// iterate - Repeatedly update the Hopfield nodes until stability or the
+/// maximum number of iterations is reached.
+/// @param Linked - Numbers of linked nodes that need updating.
+void SpillPlacement::iterate() {
+  // First update the recently positive nodes. They have likely received new
+  // negative bias that will turn them off.
+  while (!RecentPositive.empty())
+    nodes[RecentPositive.pop_back_val()].update(nodes);
+
+  if (Linked.empty())
+    return;
+
+  // Run up to 10 iterations. The edge bundle numbering is closely related to
+  // basic block numbering, so there is a strong tendency towards chains of
+  // linked nodes with sequential numbers. By scanning the linked nodes
+  // backwards and forwards, we make it very likely that a single node can
+  // affect the entire network in a single iteration. That means very fast
+  // convergence, usually in a single iteration.
+  for (unsigned iteration = 0; iteration != 10; ++iteration) {
+    // Scan backwards, skipping the last node which was just updated.
+    bool Changed = false;
+    for (SmallVectorImpl<unsigned>::const_reverse_iterator I =
+           llvm::next(Linked.rbegin()), E = Linked.rend(); I != E; ++I) {
+      unsigned n = *I;
+      if (nodes[n].update(nodes)) {
+        Changed = true;
+        if (nodes[n].preferReg())
+          RecentPositive.push_back(n);
+      }
+    }
+    if (!Changed || !RecentPositive.empty())
+      return;
+
+    // Scan forwards, skipping the first node which was just updated.
+    Changed = false;
+    for (SmallVectorImpl<unsigned>::const_iterator I =
+           llvm::next(Linked.begin()), E = Linked.end(); I != E; ++I) {
+      unsigned n = *I;
+      if (nodes[n].update(nodes)) {
+        Changed = true;
+        if (nodes[n].preferReg())
+          RecentPositive.push_back(n);
+      }
+    }
+    if (!Changed || !RecentPositive.empty())
+      return;
+  }
+}
+
+void SpillPlacement::prepare(BitVector &RegBundles) {
+  Linked.clear();
+  RecentPositive.clear();
+  // Reuse RegBundles as our ActiveNodes vector.
+  ActiveNodes = &RegBundles;
+  ActiveNodes->clear();
+  ActiveNodes->resize(bundles->getNumBundles());
+}
+
+bool
+SpillPlacement::finish() {
+  assert(ActiveNodes && "Call prepare() first");
+
+  // Write preferences back to ActiveNodes.
+  bool Perfect = true;
+  for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n))
+    if (!nodes[n].preferReg()) {
+      ActiveNodes->reset(n);
+      Perfect = false;
+    }
+  ActiveNodes = 0;
+  return Perfect;
+}
diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.h b/contrib/llvm/lib/CodeGen/SpillPlacement.h
new file mode 100644
index 000000000000..fc412f817cdb
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SpillPlacement.h
@@ -0,0 +1,156 @@
+//===-- SpillPlacement.h - Optimal Spill Code Placement --------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This analysis computes the optimal spill code placement between basic blocks.
+//
+// The runOnMachineFunction() method only precomputes some profiling information
+// about the CFG. The real work is done by prepare(), addConstraints(), and
+// finish() which are called by the register allocator.
+//
+// Given a variable that is live across multiple basic blocks, and given
+// constraints on the basic blocks where the variable is live, determine which
+// edge bundles should have the variable in a register and which edge bundles
+// should have the variable in a stack slot.
+//
+// The returned bit vector can be used to place optimal spill code at basic
+// block entries and exits. Spill code placement inside a basic block is not
+// considered.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SPILLPLACEMENT_H
+#define LLVM_CODEGEN_SPILLPLACEMENT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+namespace llvm {
+
+class BitVector;
+class EdgeBundles;
+class MachineBasicBlock;
+class MachineLoopInfo;
+
+class SpillPlacement  : public MachineFunctionPass {
+  struct Node;
+  const MachineFunction *MF;
+  const EdgeBundles *bundles;
+  const MachineLoopInfo *loops;
+  Node *nodes;
+
+  // Nodes that are active in the current computation. Owned by the prepare()
+  // caller.
+  BitVector *ActiveNodes;
+
+  // Nodes with active links. Populated by scanActiveBundles.
+  SmallVector<unsigned, 8> Linked;
+
+  // Nodes that went positive during the last call to scanActiveBundles or
+  // iterate.
+  SmallVector<unsigned, 8> RecentPositive;
+
+  // Block frequencies are computed once. Indexed by block number.
+  SmallVector<float, 4> BlockFrequency;
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+
+  SpillPlacement() : MachineFunctionPass(ID), nodes(0) {}
+  ~SpillPlacement() { releaseMemory(); }
+
+  /// BorderConstraint - A basic block has separate constraints for entry and
+  /// exit.
+  enum BorderConstraint {
+    DontCare,  ///< Block doesn't care / variable not live.
+    PrefReg,   ///< Block entry/exit prefers a register.
+    PrefSpill, ///< Block entry/exit prefers a stack slot.
+    PrefBoth,  ///< Block entry prefers both register and stack.
+    MustSpill  ///< A register is impossible, variable must be spilled.
+  };
+
+  /// BlockConstraint - Entry and exit constraints for a basic block.
+  struct BlockConstraint {
+    unsigned Number;            ///< Basic block number (from MBB::getNumber()).
+    BorderConstraint Entry : 8; ///< Constraint on block entry.
+    BorderConstraint Exit : 8;  ///< Constraint on block exit.
+
+    /// True when this block changes the value of the live range. This means
+    /// the block has a non-PHI def.  When this is false, a live-in value on
+    /// the stack can be live-out on the stack without inserting a spill.
+    bool ChangesValue;
+  };
+
+  /// prepare - Reset state and prepare for a new spill placement computation.
+  /// @param RegBundles Bit vector to receive the edge bundles where the
+  ///                   variable should be kept in a register. Each bit
+  ///                   corresponds to an edge bundle, a set bit means the
+  ///                   variable should be kept in a register through the
+  ///                   bundle. A clear bit means the variable should be
+  ///                   spilled. This vector is retained.
+  void prepare(BitVector &RegBundles);
+
+  /// addConstraints - Add constraints and biases. This method may be called
+  /// more than once to accumulate constraints.
+  /// @param LiveBlocks Constraints for blocks that have the variable live in or
+  ///                   live out.
+  void addConstraints(ArrayRef<BlockConstraint> LiveBlocks);
+
+  /// addPrefSpill - Add PrefSpill constraints to all blocks listed.  This is
+  /// equivalent to calling addConstraint with identical BlockConstraints with
+  /// Entry = Exit = PrefSpill, and ChangesValue = false.
+  ///
+  /// @param Blocks Array of block numbers that prefer to spill in and out.
+  /// @param Strong When true, double the negative bias for these blocks.
+  void addPrefSpill(ArrayRef<unsigned> Blocks, bool Strong);
+
+  /// addLinks - Add transparent blocks with the given numbers.
+  void addLinks(ArrayRef<unsigned> Links);
+
+  /// scanActiveBundles - Perform an initial scan of all bundles activated by
+  /// addConstraints and addLinks, updating their state. Add all the bundles
+  /// that now prefer a register to RecentPositive.
+  /// Prepare internal data structures for iterate.
+  /// Return true is there are any positive nodes.
+  bool scanActiveBundles();
+
+  /// iterate - Update the network iteratively until convergence, or new bundles
+  /// are found.
+  void iterate();
+
+  /// getRecentPositive - Return an array of bundles that became positive during
+  /// the previous call to scanActiveBundles or iterate.
+  ArrayRef<unsigned> getRecentPositive() { return RecentPositive; }
+
+  /// finish - Compute the optimal spill code placement given the
+  /// constraints. No MustSpill constraints will be violated, and the smallest
+  /// possible number of PrefX constraints will be violated, weighted by
+  /// expected execution frequencies.
+  /// The selected bundles are returned in the bitvector passed to prepare().
+  /// @return True if a perfect solution was found, allowing the variable to be
+  ///         in a register through all relevant bundles.
+  bool finish();
+
+  /// getBlockFrequency - Return the estimated block execution frequency per
+  /// function invocation.
+  float getBlockFrequency(unsigned Number) const {
+    return BlockFrequency[Number];
+  }
+
+private:
+  virtual bool runOnMachineFunction(MachineFunction&);
+  virtual void getAnalysisUsage(AnalysisUsage&) const;
+  virtual void releaseMemory();
+
+  void activate(unsigned);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/Spiller.cpp b/contrib/llvm/lib/CodeGen/Spiller.cpp
new file mode 100644
index 000000000000..209792fd407b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/Spiller.cpp
@@ -0,0 +1,194 @@
+//===-- llvm/CodeGen/Spiller.cpp -  Spiller -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "spiller"
+
+#include "Spiller.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+  enum SpillerName { trivial, inline_ };
+}
+
+static cl::opt<SpillerName>
+spillerOpt("spiller",
+           cl::desc("Spiller to use: (default: standard)"),
+           cl::Prefix,
+           cl::values(clEnumVal(trivial,   "trivial spiller"),
+                      clEnumValN(inline_,  "inline", "inline spiller"),
+                      clEnumValEnd),
+           cl::init(trivial));
+
+// Spiller virtual destructor implementation.
+Spiller::~Spiller() {}
+
+namespace {
+
+/// Utility class for spillers.
+class SpillerBase : public Spiller {
+protected:
+  MachineFunctionPass *pass;
+  MachineFunction *mf;
+  VirtRegMap *vrm;
+  LiveIntervals *lis;
+  MachineFrameInfo *mfi;
+  MachineRegisterInfo *mri;
+  const TargetInstrInfo *tii;
+  const TargetRegisterInfo *tri;
+
+  /// Construct a spiller base.
+  SpillerBase(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm)
+    : pass(&pass), mf(&mf), vrm(&vrm)
+  {
+    lis = &pass.getAnalysis<LiveIntervals>();
+    mfi = mf.getFrameInfo();
+    mri = &mf.getRegInfo();
+    tii = mf.getTarget().getInstrInfo();
+    tri = mf.getTarget().getRegisterInfo();
+  }
+
+  /// Add spill ranges for every use/def of the live interval, inserting loads
+  /// immediately before each use, and stores after each def. No folding or
+  /// remat is attempted.
+  void trivialSpillEverywhere(LiveRangeEdit& LRE) {
+    LiveInterval* li = &LRE.getParent();
+
+    DEBUG(dbgs() << "Spilling everywhere " << *li << "\n");
+
+    assert(li->weight != HUGE_VALF &&
+           "Attempting to spill already spilled value.");
+
+    assert(!TargetRegisterInfo::isStackSlot(li->reg) &&
+           "Trying to spill a stack slot.");
+
+    DEBUG(dbgs() << "Trivial spill everywhere of reg" << li->reg << "\n");
+
+    const TargetRegisterClass *trc = mri->getRegClass(li->reg);
+    unsigned ss = vrm->assignVirt2StackSlot(li->reg);
+
+    // Iterate over reg uses/defs.
+    for (MachineRegisterInfo::reg_iterator
+         regItr = mri->reg_begin(li->reg); regItr != mri->reg_end();) {
+
+      // Grab the use/def instr.
+      MachineInstr *mi = &*regItr;
+
+      DEBUG(dbgs() << "  Processing " << *mi);
+
+      // Step regItr to the next use/def instr.
+      do {
+        ++regItr;
+      } while (regItr != mri->reg_end() && (&*regItr == mi));
+
+      // Collect uses & defs for this instr.
+      SmallVector<unsigned, 2> indices;
+      bool hasUse = false;
+      bool hasDef = false;
+      for (unsigned i = 0; i != mi->getNumOperands(); ++i) {
+        MachineOperand &op = mi->getOperand(i);
+        if (!op.isReg() || op.getReg() != li->reg)
+          continue;
+        hasUse |= mi->getOperand(i).isUse();
+        hasDef |= mi->getOperand(i).isDef();
+        indices.push_back(i);
+      }
+
+      // Create a new vreg & interval for this instr.
+      LiveInterval *newLI = &LRE.create();
+      newLI->weight = HUGE_VALF;
+
+      // Update the reg operands & kill flags.
+      for (unsigned i = 0; i < indices.size(); ++i) {
+        unsigned mopIdx = indices[i];
+        MachineOperand &mop = mi->getOperand(mopIdx);
+        mop.setReg(newLI->reg);
+        if (mop.isUse() && !mi->isRegTiedToDefOperand(mopIdx)) {
+          mop.setIsKill(true);
+        }
+      }
+      assert(hasUse || hasDef);
+
+      // Insert reload if necessary.
+      MachineBasicBlock::iterator miItr(mi);
+      if (hasUse) {
+        tii->loadRegFromStackSlot(*mi->getParent(), miItr, newLI->reg, ss, trc,
+                                  tri);
+        MachineInstr *loadInstr(prior(miItr));
+        SlotIndex loadIndex =
+          lis->InsertMachineInstrInMaps(loadInstr).getRegSlot();
+        SlotIndex endIndex = loadIndex.getNextIndex();
+        VNInfo *loadVNI =
+          newLI->getNextValue(loadIndex, lis->getVNInfoAllocator());
+        newLI->addRange(LiveRange(loadIndex, endIndex, loadVNI));
+      }
+
+      // Insert store if necessary.
+      if (hasDef) {
+        tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr),newLI->reg,
+                                 true, ss, trc, tri);
+        MachineInstr *storeInstr(llvm::next(miItr));
+        SlotIndex storeIndex =
+          lis->InsertMachineInstrInMaps(storeInstr).getRegSlot();
+        SlotIndex beginIndex = storeIndex.getPrevIndex();
+        VNInfo *storeVNI =
+          newLI->getNextValue(beginIndex, lis->getVNInfoAllocator());
+        newLI->addRange(LiveRange(beginIndex, storeIndex, storeVNI));
+      }
+    }
+  }
+};
+
+} // end anonymous namespace
+
+namespace {
+
+/// Spills any live range using the spill-everywhere method with no attempt at
+/// folding.
+class TrivialSpiller : public SpillerBase {
+public:
+
+  TrivialSpiller(MachineFunctionPass &pass, MachineFunction &mf,
+                 VirtRegMap &vrm)
+    : SpillerBase(pass, mf, vrm) {}
+
+  void spill(LiveRangeEdit &LRE) {
+    // Ignore spillIs - we don't use it.
+    trivialSpillEverywhere(LRE);
+  }
+};
+
+} // end anonymous namespace
+
+void Spiller::anchor() { }
+
+llvm::Spiller* llvm::createSpiller(MachineFunctionPass &pass,
+                                   MachineFunction &mf,
+                                   VirtRegMap &vrm) {
+  switch (spillerOpt) {
+  case trivial: return new TrivialSpiller(pass, mf, vrm);
+  case inline_: return createInlineSpiller(pass, mf, vrm);
+  }
+  llvm_unreachable("Invalid spiller optimization");
+}
diff --git a/contrib/llvm/lib/CodeGen/Spiller.h b/contrib/llvm/lib/CodeGen/Spiller.h
new file mode 100644
index 000000000000..b7d5beaab1b2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/Spiller.h
@@ -0,0 +1,47 @@
+//===-- llvm/CodeGen/Spiller.h - Spiller -*- C++ -*------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SPILLER_H
+#define LLVM_CODEGEN_SPILLER_H
+
+namespace llvm {
+
+  class LiveRangeEdit;
+  class MachineFunction;
+  class MachineFunctionPass;
+  class VirtRegMap;
+
+  /// Spiller interface.
+  ///
+  /// Implementations are utility classes which insert spill or remat code on
+  /// demand.
+  class Spiller {
+    virtual void anchor();
+  public:
+    virtual ~Spiller() = 0;
+
+    /// spill - Spill the LRE.getParent() live interval.
+    virtual void spill(LiveRangeEdit &LRE) = 0;
+
+  };
+
+  /// Create and return a spiller object, as specified on the command line.
+  Spiller* createSpiller(MachineFunctionPass &pass,
+                         MachineFunction &mf,
+                         VirtRegMap &vrm);
+
+  /// Create and return a spiller that will insert spill code directly instead
+  /// of deferring though VirtRegMap.
+  Spiller *createInlineSpiller(MachineFunctionPass &pass,
+                               MachineFunction &mf,
+                               VirtRegMap &vrm);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SplitKit.cpp b/contrib/llvm/lib/CodeGen/SplitKit.cpp
new file mode 100644
index 000000000000..0a3818e43ff9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SplitKit.cpp
@@ -0,0 +1,1432 @@
+//===---------- SplitKit.cpp - Toolkit for splitting live ranges ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SplitAnalysis class as well as mutator functions for
+// live range splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "SplitKit.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+STATISTIC(NumFinished, "Number of splits finished");
+STATISTIC(NumSimple,   "Number of splits that were simple");
+STATISTIC(NumCopies,   "Number of copies inserted for splitting");
+STATISTIC(NumRemats,   "Number of rematerialized defs for splitting");
+STATISTIC(NumRepairs,  "Number of invalid live ranges repaired");
+
+//===----------------------------------------------------------------------===//
+//                                 Split Analysis
+//===----------------------------------------------------------------------===//
+
+SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm,
+                             const LiveIntervals &lis,
+                             const MachineLoopInfo &mli)
+  : MF(vrm.getMachineFunction()),
+    VRM(vrm),
+    LIS(lis),
+    Loops(mli),
+    TII(*MF.getTarget().getInstrInfo()),
+    CurLI(0),
+    LastSplitPoint(MF.getNumBlockIDs()) {}
+
+void SplitAnalysis::clear() {
+  UseSlots.clear();
+  UseBlocks.clear();
+  ThroughBlocks.clear();
+  CurLI = 0;
+  DidRepairRange = false;
+}
+
+SlotIndex SplitAnalysis::computeLastSplitPoint(unsigned Num) {
+  const MachineBasicBlock *MBB = MF.getBlockNumbered(Num);
+  const MachineBasicBlock *LPad = MBB->getLandingPadSuccessor();
+  std::pair<SlotIndex, SlotIndex> &LSP = LastSplitPoint[Num];
+  SlotIndex MBBEnd = LIS.getMBBEndIdx(MBB);
+
+  // Compute split points on the first call. The pair is independent of the
+  // current live interval.
+  if (!LSP.first.isValid()) {
+    MachineBasicBlock::const_iterator FirstTerm = MBB->getFirstTerminator();
+    if (FirstTerm == MBB->end())
+      LSP.first = MBBEnd;
+    else
+      LSP.first = LIS.getInstructionIndex(FirstTerm);
+
+    // If there is a landing pad successor, also find the call instruction.
+    if (!LPad)
+      return LSP.first;
+    // There may not be a call instruction (?) in which case we ignore LPad.
+    LSP.second = LSP.first;
+    for (MachineBasicBlock::const_iterator I = MBB->end(), E = MBB->begin();
+         I != E;) {
+      --I;
+      if (I->isCall()) {
+        LSP.second = LIS.getInstructionIndex(I);
+        break;
+      }
+    }
+  }
+
+  // If CurLI is live into a landing pad successor, move the last split point
+  // back to the call that may throw.
+  if (!LPad || !LSP.second || !LIS.isLiveInToMBB(*CurLI, LPad))
+    return LSP.first;
+
+  // Find the value leaving MBB.
+  const VNInfo *VNI = CurLI->getVNInfoBefore(MBBEnd);
+  if (!VNI)
+    return LSP.first;
+
+  // If the value leaving MBB was defined after the call in MBB, it can't
+  // really be live-in to the landing pad.  This can happen if the landing pad
+  // has a PHI, and this register is undef on the exceptional edge.
+  // <rdar://problem/10664933>
+  if (!SlotIndex::isEarlierInstr(VNI->def, LSP.second) && VNI->def < MBBEnd)
+    return LSP.first;
+
+  // Value is properly live-in to the landing pad.
+  // Only allow splits before the call.
+  return LSP.second;
+}
+
+MachineBasicBlock::iterator
+SplitAnalysis::getLastSplitPointIter(MachineBasicBlock *MBB) {
+  SlotIndex LSP = getLastSplitPoint(MBB->getNumber());
+  if (LSP == LIS.getMBBEndIdx(MBB))
+    return MBB->end();
+  return LIS.getInstructionFromIndex(LSP);
+}
+
+/// analyzeUses - Count instructions, basic blocks, and loops using CurLI.
+void SplitAnalysis::analyzeUses() {
+  assert(UseSlots.empty() && "Call clear first");
+
+  // First get all the defs from the interval values. This provides the correct
+  // slots for early clobbers.
+  for (LiveInterval::const_vni_iterator I = CurLI->vni_begin(),
+       E = CurLI->vni_end(); I != E; ++I)
+    if (!(*I)->isPHIDef() && !(*I)->isUnused())
+      UseSlots.push_back((*I)->def);
+
+  // Get use slots form the use-def chain.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (MachineRegisterInfo::use_nodbg_iterator
+       I = MRI.use_nodbg_begin(CurLI->reg), E = MRI.use_nodbg_end(); I != E;
+       ++I)
+    if (!I.getOperand().isUndef())
+      UseSlots.push_back(LIS.getInstructionIndex(&*I).getRegSlot());
+
+  array_pod_sort(UseSlots.begin(), UseSlots.end());
+
+  // Remove duplicates, keeping the smaller slot for each instruction.
+  // That is what we want for early clobbers.
+  UseSlots.erase(std::unique(UseSlots.begin(), UseSlots.end(),
+                             SlotIndex::isSameInstr),
+                 UseSlots.end());
+
+  // Compute per-live block info.
+  if (!calcLiveBlockInfo()) {
+    // FIXME: calcLiveBlockInfo found inconsistencies in the live range.
+    // I am looking at you, RegisterCoalescer!
+    DidRepairRange = true;
+    ++NumRepairs;
+    DEBUG(dbgs() << "*** Fixing inconsistent live interval! ***\n");
+    const_cast<LiveIntervals&>(LIS)
+      .shrinkToUses(const_cast<LiveInterval*>(CurLI));
+    UseBlocks.clear();
+    ThroughBlocks.clear();
+    bool fixed = calcLiveBlockInfo();
+    (void)fixed;
+    assert(fixed && "Couldn't fix broken live interval");
+  }
+
+  DEBUG(dbgs() << "Analyze counted "
+               << UseSlots.size() << " instrs in "
+               << UseBlocks.size() << " blocks, through "
+               << NumThroughBlocks << " blocks.\n");
+}
+
+/// calcLiveBlockInfo - Fill the LiveBlocks array with information about blocks
+/// where CurLI is live.
+bool SplitAnalysis::calcLiveBlockInfo() {
+  ThroughBlocks.resize(MF.getNumBlockIDs());
+  NumThroughBlocks = NumGapBlocks = 0;
+  if (CurLI->empty())
+    return true;
+
+  LiveInterval::const_iterator LVI = CurLI->begin();
+  LiveInterval::const_iterator LVE = CurLI->end();
+
+  SmallVectorImpl<SlotIndex>::const_iterator UseI, UseE;
+  UseI = UseSlots.begin();
+  UseE = UseSlots.end();
+
+  // Loop over basic blocks where CurLI is live.
+  MachineFunction::iterator MFI = LIS.getMBBFromIndex(LVI->start);
+  for (;;) {
+    BlockInfo BI;
+    BI.MBB = MFI;
+    SlotIndex Start, Stop;
+    tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
+
+    // If the block contains no uses, the range must be live through. At one
+    // point, RegisterCoalescer could create dangling ranges that ended
+    // mid-block.
+    if (UseI == UseE || *UseI >= Stop) {
+      ++NumThroughBlocks;
+      ThroughBlocks.set(BI.MBB->getNumber());
+      // The range shouldn't end mid-block if there are no uses. This shouldn't
+      // happen.
+      if (LVI->end < Stop)
+        return false;
+    } else {
+      // This block has uses. Find the first and last uses in the block.
+      BI.FirstInstr = *UseI;
+      assert(BI.FirstInstr >= Start);
+      do ++UseI;
+      while (UseI != UseE && *UseI < Stop);
+      BI.LastInstr = UseI[-1];
+      assert(BI.LastInstr < Stop);
+
+      // LVI is the first live segment overlapping MBB.
+      BI.LiveIn = LVI->start <= Start;
+
+      // When not live in, the first use should be a def.
+      if (!BI.LiveIn) {
+        assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+        assert(LVI->start == BI.FirstInstr && "First instr should be a def");
+        BI.FirstDef = BI.FirstInstr;
+      }
+
+      // Look for gaps in the live range.
+      BI.LiveOut = true;
+      while (LVI->end < Stop) {
+        SlotIndex LastStop = LVI->end;
+        if (++LVI == LVE || LVI->start >= Stop) {
+          BI.LiveOut = false;
+          BI.LastInstr = LastStop;
+          break;
+        }
+
+        if (LastStop < LVI->start) {
+          // There is a gap in the live range. Create duplicate entries for the
+          // live-in snippet and the live-out snippet.
+          ++NumGapBlocks;
+
+          // Push the Live-in part.
+          BI.LiveOut = false;
+          UseBlocks.push_back(BI);
+          UseBlocks.back().LastInstr = LastStop;
+
+          // Set up BI for the live-out part.
+          BI.LiveIn = false;
+          BI.LiveOut = true;
+          BI.FirstInstr = BI.FirstDef = LVI->start;
+        }
+
+        // A LiveRange that starts in the middle of the block must be a def.
+        assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+        if (!BI.FirstDef)
+          BI.FirstDef = LVI->start;
+      }
+
+      UseBlocks.push_back(BI);
+
+      // LVI is now at LVE or LVI->end >= Stop.
+      if (LVI == LVE)
+        break;
+    }
+
+    // Live segment ends exactly at Stop. Move to the next segment.
+    if (LVI->end == Stop && ++LVI == LVE)
+      break;
+
+    // Pick the next basic block.
+    if (LVI->start < Stop)
+      ++MFI;
+    else
+      MFI = LIS.getMBBFromIndex(LVI->start);
+  }
+
+  assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count");
+  return true;
+}
+
+unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const {
+  if (cli->empty())
+    return 0;
+  LiveInterval *li = const_cast<LiveInterval*>(cli);
+  LiveInterval::iterator LVI = li->begin();
+  LiveInterval::iterator LVE = li->end();
+  unsigned Count = 0;
+
+  // Loop over basic blocks where li is live.
+  MachineFunction::const_iterator MFI = LIS.getMBBFromIndex(LVI->start);
+  SlotIndex Stop = LIS.getMBBEndIdx(MFI);
+  for (;;) {
+    ++Count;
+    LVI = li->advanceTo(LVI, Stop);
+    if (LVI == LVE)
+      return Count;
+    do {
+      ++MFI;
+      Stop = LIS.getMBBEndIdx(MFI);
+    } while (Stop <= LVI->start);
+  }
+}
+
+bool SplitAnalysis::isOriginalEndpoint(SlotIndex Idx) const {
+  unsigned OrigReg = VRM.getOriginal(CurLI->reg);
+  const LiveInterval &Orig = LIS.getInterval(OrigReg);
+  assert(!Orig.empty() && "Splitting empty interval?");
+  LiveInterval::const_iterator I = Orig.find(Idx);
+
+  // Range containing Idx should begin at Idx.
+  if (I != Orig.end() && I->start <= Idx)
+    return I->start == Idx;
+
+  // Range does not contain Idx, previous must end at Idx.
+  return I != Orig.begin() && (--I)->end == Idx;
+}
+
+void SplitAnalysis::analyze(const LiveInterval *li) {
+  clear();
+  CurLI = li;
+  analyzeUses();
+}
+
+
+//===----------------------------------------------------------------------===//
+//                               Split Editor
+//===----------------------------------------------------------------------===//
+
+/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
+SplitEditor::SplitEditor(SplitAnalysis &sa,
+                         LiveIntervals &lis,
+                         VirtRegMap &vrm,
+                         MachineDominatorTree &mdt)
+  : SA(sa), LIS(lis), VRM(vrm),
+    MRI(vrm.getMachineFunction().getRegInfo()),
+    MDT(mdt),
+    TII(*vrm.getMachineFunction().getTarget().getInstrInfo()),
+    TRI(*vrm.getMachineFunction().getTarget().getRegisterInfo()),
+    Edit(0),
+    OpenIdx(0),
+    SpillMode(SM_Partition),
+    RegAssign(Allocator)
+{}
+
+void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
+  Edit = &LRE;
+  SpillMode = SM;
+  OpenIdx = 0;
+  RegAssign.clear();
+  Values.clear();
+
+  // Reset the LiveRangeCalc instances needed for this spill mode.
+  LRCalc[0].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+                  &LIS.getVNInfoAllocator());
+  if (SpillMode)
+    LRCalc[1].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+                    &LIS.getVNInfoAllocator());
+
+  // We don't need an AliasAnalysis since we will only be performing
+  // cheap-as-a-copy remats anyway.
+  Edit->anyRematerializable(0);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void SplitEditor::dump() const {
+  if (RegAssign.empty()) {
+    dbgs() << " empty\n";
+    return;
+  }
+
+  for (RegAssignMap::const_iterator I = RegAssign.begin(); I.valid(); ++I)
+    dbgs() << " [" << I.start() << ';' << I.stop() << "):" << I.value();
+  dbgs() << '\n';
+}
+#endif
+
+VNInfo *SplitEditor::defValue(unsigned RegIdx,
+                              const VNInfo *ParentVNI,
+                              SlotIndex Idx) {
+  assert(ParentVNI && "Mapping  NULL value");
+  assert(Idx.isValid() && "Invalid SlotIndex");
+  assert(Edit->getParent().getVNInfoAt(Idx) == ParentVNI && "Bad Parent VNI");
+  LiveInterval *LI = Edit->get(RegIdx);
+
+  // Create a new value.
+  VNInfo *VNI = LI->getNextValue(Idx, LIS.getVNInfoAllocator());
+
+  // Use insert for lookup, so we can add missing values with a second lookup.
+  std::pair<ValueMap::iterator, bool> InsP =
+    Values.insert(std::make_pair(std::make_pair(RegIdx, ParentVNI->id),
+                                 ValueForcePair(VNI, false)));
+
+  // This was the first time (RegIdx, ParentVNI) was mapped.
+  // Keep it as a simple def without any liveness.
+  if (InsP.second)
+    return VNI;
+
+  // If the previous value was a simple mapping, add liveness for it now.
+  if (VNInfo *OldVNI = InsP.first->second.getPointer()) {
+    SlotIndex Def = OldVNI->def;
+    LI->addRange(LiveRange(Def, Def.getDeadSlot(), OldVNI));
+    // No longer a simple mapping.  Switch to a complex, non-forced mapping.
+    InsP.first->second = ValueForcePair();
+  }
+
+  // This is a complex mapping, add liveness for VNI
+  SlotIndex Def = VNI->def;
+  LI->addRange(LiveRange(Def, Def.getDeadSlot(), VNI));
+
+  return VNI;
+}
+
+void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
+  assert(ParentVNI && "Mapping  NULL value");
+  ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI->id)];
+  VNInfo *VNI = VFP.getPointer();
+
+  // ParentVNI was either unmapped or already complex mapped. Either way, just
+  // set the force bit.
+  if (!VNI) {
+    VFP.setInt(true);
+    return;
+  }
+
+  // This was previously a single mapping. Make sure the old def is represented
+  // by a trivial live range.
+  SlotIndex Def = VNI->def;
+  Edit->get(RegIdx)->addRange(LiveRange(Def, Def.getDeadSlot(), VNI));
+  // Mark as complex mapped, forced.
+  VFP = ValueForcePair(0, true);
+}
+
+VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
+                                   VNInfo *ParentVNI,
+                                   SlotIndex UseIdx,
+                                   MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I) {
+  MachineInstr *CopyMI = 0;
+  SlotIndex Def;
+  LiveInterval *LI = Edit->get(RegIdx);
+
+  // We may be trying to avoid interference that ends at a deleted instruction,
+  // so always begin RegIdx 0 early and all others late.
+  bool Late = RegIdx != 0;
+
+  // Attempt cheap-as-a-copy rematerialization.
+  LiveRangeEdit::Remat RM(ParentVNI);
+  if (Edit->canRematerializeAt(RM, UseIdx, true)) {
+    Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
+    ++NumRemats;
+  } else {
+    // Can't remat, just insert a copy from parent.
+    CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg)
+               .addReg(Edit->getReg());
+    Def = LIS.getSlotIndexes()->insertMachineInstrInMaps(CopyMI, Late)
+            .getRegSlot();
+    ++NumCopies;
+  }
+
+  // Define the value in Reg.
+  return defValue(RegIdx, ParentVNI, Def);
+}
+
+/// Create a new virtual register and live interval.
+unsigned SplitEditor::openIntv() {
+  // Create the complement as index 0.
+  if (Edit->empty())
+    Edit->create();
+
+  // Create the open interval.
+  OpenIdx = Edit->size();
+  Edit->create();
+  return OpenIdx;
+}
+
+void SplitEditor::selectIntv(unsigned Idx) {
+  assert(Idx != 0 && "Cannot select the complement interval");
+  assert(Idx < Edit->size() && "Can only select previously opened interval");
+  DEBUG(dbgs() << "    selectIntv " << OpenIdx << " -> " << Idx << '\n');
+  OpenIdx = Idx;
+}
+
+SlotIndex SplitEditor::enterIntvBefore(SlotIndex Idx) {
+  assert(OpenIdx && "openIntv not called before enterIntvBefore");
+  DEBUG(dbgs() << "    enterIntvBefore " << Idx);
+  Idx = Idx.getBaseIndex();
+  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+  if (!ParentVNI) {
+    DEBUG(dbgs() << ": not live\n");
+    return Idx;
+  }
+  DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+  MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+  assert(MI && "enterIntvBefore called with invalid index");
+
+  VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Idx, *MI->getParent(), MI);
+  return VNI->def;
+}
+
+SlotIndex SplitEditor::enterIntvAfter(SlotIndex Idx) {
+  assert(OpenIdx && "openIntv not called before enterIntvAfter");
+  DEBUG(dbgs() << "    enterIntvAfter " << Idx);
+  Idx = Idx.getBoundaryIndex();
+  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+  if (!ParentVNI) {
+    DEBUG(dbgs() << ": not live\n");
+    return Idx;
+  }
+  DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+  MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+  assert(MI && "enterIntvAfter called with invalid index");
+
+  VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Idx, *MI->getParent(),
+                              llvm::next(MachineBasicBlock::iterator(MI)));
+  return VNI->def;
+}
+
+SlotIndex SplitEditor::enterIntvAtEnd(MachineBasicBlock &MBB) {
+  assert(OpenIdx && "openIntv not called before enterIntvAtEnd");
+  SlotIndex End = LIS.getMBBEndIdx(&MBB);
+  SlotIndex Last = End.getPrevSlot();
+  DEBUG(dbgs() << "    enterIntvAtEnd BB#" << MBB.getNumber() << ", " << Last);
+  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Last);
+  if (!ParentVNI) {
+    DEBUG(dbgs() << ": not live\n");
+    return End;
+  }
+  DEBUG(dbgs() << ": valno " << ParentVNI->id);
+  VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Last, MBB,
+                              SA.getLastSplitPointIter(&MBB));
+  RegAssign.insert(VNI->def, End, OpenIdx);
+  DEBUG(dump());
+  return VNI->def;
+}
+
+/// useIntv - indicate that all instructions in MBB should use OpenLI.
+void SplitEditor::useIntv(const MachineBasicBlock &MBB) {
+  useIntv(LIS.getMBBStartIdx(&MBB), LIS.getMBBEndIdx(&MBB));
+}
+
+void SplitEditor::useIntv(SlotIndex Start, SlotIndex End) {
+  assert(OpenIdx && "openIntv not called before useIntv");
+  DEBUG(dbgs() << "    useIntv [" << Start << ';' << End << "):");
+  RegAssign.insert(Start, End, OpenIdx);
+  DEBUG(dump());
+}
+
+SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
+  assert(OpenIdx && "openIntv not called before leaveIntvAfter");
+  DEBUG(dbgs() << "    leaveIntvAfter " << Idx);
+
+  // The interval must be live beyond the instruction at Idx.
+  SlotIndex Boundary = Idx.getBoundaryIndex();
+  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Boundary);
+  if (!ParentVNI) {
+    DEBUG(dbgs() << ": not live\n");
+    return Boundary.getNextSlot();
+  }
+  DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+  MachineInstr *MI = LIS.getInstructionFromIndex(Boundary);
+  assert(MI && "No instruction at index");
+
+  // In spill mode, make live ranges as short as possible by inserting the copy
+  // before MI.  This is only possible if that instruction doesn't redefine the
+  // value.  The inserted COPY is not a kill, and we don't need to recompute
+  // the source live range.  The spiller also won't try to hoist this copy.
+  if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) &&
+      MI->readsVirtualRegister(Edit->getReg())) {
+    forceRecompute(0, ParentVNI);
+    defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
+    return Idx;
+  }
+
+  VNInfo *VNI = defFromParent(0, ParentVNI, Boundary, *MI->getParent(),
+                              llvm::next(MachineBasicBlock::iterator(MI)));
+  return VNI->def;
+}
+
+SlotIndex SplitEditor::leaveIntvBefore(SlotIndex Idx) {
+  assert(OpenIdx && "openIntv not called before leaveIntvBefore");
+  DEBUG(dbgs() << "    leaveIntvBefore " << Idx);
+
+  // The interval must be live into the instruction at Idx.
+  Idx = Idx.getBaseIndex();
+  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+  if (!ParentVNI) {
+    DEBUG(dbgs() << ": not live\n");
+    return Idx.getNextSlot();
+  }
+  DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+
+  MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+  assert(MI && "No instruction at index");
+  VNInfo *VNI = defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
+  return VNI->def;
+}
+
+SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
+  assert(OpenIdx && "openIntv not called before leaveIntvAtTop");
+  SlotIndex Start = LIS.getMBBStartIdx(&MBB);
+  DEBUG(dbgs() << "    leaveIntvAtTop BB#" << MBB.getNumber() << ", " << Start);
+
+  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Start);
+  if (!ParentVNI) {
+    DEBUG(dbgs() << ": not live\n");
+    return Start;
+  }
+
+  VNInfo *VNI = defFromParent(0, ParentVNI, Start, MBB,
+                              MBB.SkipPHIsAndLabels(MBB.begin()));
+  RegAssign.insert(Start, VNI->def, OpenIdx);
+  DEBUG(dump());
+  return VNI->def;
+}
+
+void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
+  assert(OpenIdx && "openIntv not called before overlapIntv");
+  const VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Start);
+  assert(ParentVNI == Edit->getParent().getVNInfoBefore(End) &&
+         "Parent changes value in extended range");
+  assert(LIS.getMBBFromIndex(Start) == LIS.getMBBFromIndex(End) &&
+         "Range cannot span basic blocks");
+
+  // The complement interval will be extended as needed by LRCalc.extend().
+  if (ParentVNI)
+    forceRecompute(0, ParentVNI);
+  DEBUG(dbgs() << "    overlapIntv [" << Start << ';' << End << "):");
+  RegAssign.insert(Start, End, OpenIdx);
+  DEBUG(dump());
+}
+
+//===----------------------------------------------------------------------===//
+//                                  Spill modes
+//===----------------------------------------------------------------------===//
+
+void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
+  LiveInterval *LI = Edit->get(0);
+  DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
+  RegAssignMap::iterator AssignI;
+  AssignI.setMap(RegAssign);
+
+  for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+    VNInfo *VNI = Copies[i];
+    SlotIndex Def = VNI->def;
+    MachineInstr *MI = LIS.getInstructionFromIndex(Def);
+    assert(MI && "No instruction for back-copy");
+
+    MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::iterator MBBI(MI);
+    bool AtBegin;
+    do AtBegin = MBBI == MBB->begin();
+    while (!AtBegin && (--MBBI)->isDebugValue());
+
+    DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
+    LI->removeValNo(VNI);
+    LIS.RemoveMachineInstrFromMaps(MI);
+    MI->eraseFromParent();
+
+    // Adjust RegAssign if a register assignment is killed at VNI->def.  We
+    // want to avoid calculating the live range of the source register if
+    // possible.
+    AssignI.find(Def.getPrevSlot());
+    if (!AssignI.valid() || AssignI.start() >= Def)
+      continue;
+    // If MI doesn't kill the assigned register, just leave it.
+    if (AssignI.stop() != Def)
+      continue;
+    unsigned RegIdx = AssignI.value();
+    if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg())) {
+      DEBUG(dbgs() << "  cannot find simple kill of RegIdx " << RegIdx << '\n');
+      forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
+    } else {
+      SlotIndex Kill = LIS.getInstructionIndex(MBBI).getRegSlot();
+      DEBUG(dbgs() << "  move kill to " << Kill << '\t' << *MBBI);
+      AssignI.setStop(Kill);
+    }
+  }
+}
+
+MachineBasicBlock*
+SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
+                                  MachineBasicBlock *DefMBB) {
+  if (MBB == DefMBB)
+    return MBB;
+  assert(MDT.dominates(DefMBB, MBB) && "MBB must be dominated by the def.");
+
+  const MachineLoopInfo &Loops = SA.Loops;
+  const MachineLoop *DefLoop = Loops.getLoopFor(DefMBB);
+  MachineDomTreeNode *DefDomNode = MDT[DefMBB];
+
+  // Best candidate so far.
+  MachineBasicBlock *BestMBB = MBB;
+  unsigned BestDepth = UINT_MAX;
+
+  for (;;) {
+    const MachineLoop *Loop = Loops.getLoopFor(MBB);
+
+    // MBB isn't in a loop, it doesn't get any better.  All dominators have a
+    // higher frequency by definition.
+    if (!Loop) {
+      DEBUG(dbgs() << "Def in BB#" << DefMBB->getNumber() << " dominates BB#"
+                   << MBB->getNumber() << " at depth 0\n");
+      return MBB;
+    }
+
+    // We'll never be able to exit the DefLoop.
+    if (Loop == DefLoop) {
+      DEBUG(dbgs() << "Def in BB#" << DefMBB->getNumber() << " dominates BB#"
+                   << MBB->getNumber() << " in the same loop\n");
+      return MBB;
+    }
+
+    // Least busy dominator seen so far.
+    unsigned Depth = Loop->getLoopDepth();
+    if (Depth < BestDepth) {
+      BestMBB = MBB;
+      BestDepth = Depth;
+      DEBUG(dbgs() << "Def in BB#" << DefMBB->getNumber() << " dominates BB#"
+                   << MBB->getNumber() << " at depth " << Depth << '\n');
+    }
+
+    // Leave loop by going to the immediate dominator of the loop header.
+    // This is a bigger stride than simply walking up the dominator tree.
+    MachineDomTreeNode *IDom = MDT[Loop->getHeader()]->getIDom();
+
+    // Too far up the dominator tree?
+    if (!IDom || !MDT.dominates(DefDomNode, IDom))
+      return BestMBB;
+
+    MBB = IDom->getBlock();
+  }
+}
+
+void SplitEditor::hoistCopiesForSize() {
+  // Get the complement interval, always RegIdx 0.
+  LiveInterval *LI = Edit->get(0);
+  LiveInterval *Parent = &Edit->getParent();
+
+  // Track the nearest common dominator for all back-copies for each ParentVNI,
+  // indexed by ParentVNI->id.
+  typedef std::pair<MachineBasicBlock*, SlotIndex> DomPair;
+  SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
+
+  // Find the nearest common dominator for parent values with multiple
+  // back-copies.  If a single back-copy dominates, put it in DomPair.second.
+  for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
+       VI != VE; ++VI) {
+    VNInfo *VNI = *VI;
+    if (VNI->isUnused())
+      continue;
+    VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+    assert(ParentVNI && "Parent not live at complement def");
+
+    // Don't hoist remats.  The complement is probably going to disappear
+    // completely anyway.
+    if (Edit->didRematerialize(ParentVNI))
+      continue;
+
+    MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def);
+    DomPair &Dom = NearestDom[ParentVNI->id];
+
+    // Keep directly defined parent values.  This is either a PHI or an
+    // instruction in the complement range.  All other copies of ParentVNI
+    // should be eliminated.
+    if (VNI->def == ParentVNI->def) {
+      DEBUG(dbgs() << "Direct complement def at " << VNI->def << '\n');
+      Dom = DomPair(ValMBB, VNI->def);
+      continue;
+    }
+    // Skip the singly mapped values.  There is nothing to gain from hoisting a
+    // single back-copy.
+    if (Values.lookup(std::make_pair(0, ParentVNI->id)).getPointer()) {
+      DEBUG(dbgs() << "Single complement def at " << VNI->def << '\n');
+      continue;
+    }
+
+    if (!Dom.first) {
+      // First time we see ParentVNI.  VNI dominates itself.
+      Dom = DomPair(ValMBB, VNI->def);
+    } else if (Dom.first == ValMBB) {
+      // Two defs in the same block.  Pick the earlier def.
+      if (!Dom.second.isValid() || VNI->def < Dom.second)
+        Dom.second = VNI->def;
+    } else {
+      // Different basic blocks. Check if one dominates.
+      MachineBasicBlock *Near =
+        MDT.findNearestCommonDominator(Dom.first, ValMBB);
+      if (Near == ValMBB)
+        // Def ValMBB dominates.
+        Dom = DomPair(ValMBB, VNI->def);
+      else if (Near != Dom.first)
+        // None dominate. Hoist to common dominator, need new def.
+        Dom = DomPair(Near, SlotIndex());
+    }
+
+    DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def
+                 << " for parent " << ParentVNI->id << '@' << ParentVNI->def
+                 << " hoist to BB#" << Dom.first->getNumber() << ' '
+                 << Dom.second << '\n');
+  }
+
+  // Insert the hoisted copies.
+  for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+    DomPair &Dom = NearestDom[i];
+    if (!Dom.first || Dom.second.isValid())
+      continue;
+    // This value needs a hoisted copy inserted at the end of Dom.first.
+    VNInfo *ParentVNI = Parent->getValNumInfo(i);
+    MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def);
+    // Get a less loopy dominator than Dom.first.
+    Dom.first = findShallowDominator(Dom.first, DefMBB);
+    SlotIndex Last = LIS.getMBBEndIdx(Dom.first).getPrevSlot();
+    Dom.second =
+      defFromParent(0, ParentVNI, Last, *Dom.first,
+                    SA.getLastSplitPointIter(Dom.first))->def;
+  }
+
+  // Remove redundant back-copies that are now known to be dominated by another
+  // def with the same value.
+  SmallVector<VNInfo*, 8> BackCopies;
+  for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
+       VI != VE; ++VI) {
+    VNInfo *VNI = *VI;
+    if (VNI->isUnused())
+      continue;
+    VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+    const DomPair &Dom = NearestDom[ParentVNI->id];
+    if (!Dom.first || Dom.second == VNI->def)
+      continue;
+    BackCopies.push_back(VNI);
+    forceRecompute(0, ParentVNI);
+  }
+  removeBackCopies(BackCopies);
+}
+
+
+/// transferValues - Transfer all possible values to the new live ranges.
+/// Values that were rematerialized are left alone, they need LRCalc.extend().
+bool SplitEditor::transferValues() {
+  bool Skipped = false;
+  RegAssignMap::const_iterator AssignI = RegAssign.begin();
+  for (LiveInterval::const_iterator ParentI = Edit->getParent().begin(),
+         ParentE = Edit->getParent().end(); ParentI != ParentE; ++ParentI) {
+    DEBUG(dbgs() << "  blit " << *ParentI << ':');
+    VNInfo *ParentVNI = ParentI->valno;
+    // RegAssign has holes where RegIdx 0 should be used.
+    SlotIndex Start = ParentI->start;
+    AssignI.advanceTo(Start);
+    do {
+      unsigned RegIdx;
+      SlotIndex End = ParentI->end;
+      if (!AssignI.valid()) {
+        RegIdx = 0;
+      } else if (AssignI.start() <= Start) {
+        RegIdx = AssignI.value();
+        if (AssignI.stop() < End) {
+          End = AssignI.stop();
+          ++AssignI;
+        }
+      } else {
+        RegIdx = 0;
+        End = std::min(End, AssignI.start());
+      }
+
+      // The interval [Start;End) is continuously mapped to RegIdx, ParentVNI.
+      DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx);
+      LiveInterval *LI = Edit->get(RegIdx);
+
+      // Check for a simply defined value that can be blitted directly.
+      ValueForcePair VFP = Values.lookup(std::make_pair(RegIdx, ParentVNI->id));
+      if (VNInfo *VNI = VFP.getPointer()) {
+        DEBUG(dbgs() << ':' << VNI->id);
+        LI->addRange(LiveRange(Start, End, VNI));
+        Start = End;
+        continue;
+      }
+
+      // Skip values with forced recomputation.
+      if (VFP.getInt()) {
+        DEBUG(dbgs() << "(recalc)");
+        Skipped = true;
+        Start = End;
+        continue;
+      }
+
+      LiveRangeCalc &LRC = getLRCalc(RegIdx);
+
+      // This value has multiple defs in RegIdx, but it wasn't rematerialized,
+      // so the live range is accurate. Add live-in blocks in [Start;End) to the
+      // LiveInBlocks.
+      MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start);
+      SlotIndex BlockStart, BlockEnd;
+      tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(MBB);
+
+      // The first block may be live-in, or it may have its own def.
+      if (Start != BlockStart) {
+        VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
+        assert(VNI && "Missing def for complex mapped value");
+        DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber());
+        // MBB has its own def. Is it also live-out?
+        if (BlockEnd <= End)
+          LRC.setLiveOutValue(MBB, VNI);
+
+        // Skip to the next block for live-in.
+        ++MBB;
+        BlockStart = BlockEnd;
+      }
+
+      // Handle the live-in blocks covered by [Start;End).
+      assert(Start <= BlockStart && "Expected live-in block");
+      while (BlockStart < End) {
+        DEBUG(dbgs() << ">BB#" << MBB->getNumber());
+        BlockEnd = LIS.getMBBEndIdx(MBB);
+        if (BlockStart == ParentVNI->def) {
+          // This block has the def of a parent PHI, so it isn't live-in.
+          assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?");
+          VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
+          assert(VNI && "Missing def for complex mapped parent PHI");
+          if (End >= BlockEnd)
+            LRC.setLiveOutValue(MBB, VNI); // Live-out as well.
+        } else {
+          // This block needs a live-in value.  The last block covered may not
+          // be live-out.
+          if (End < BlockEnd)
+            LRC.addLiveInBlock(LI, MDT[MBB], End);
+          else {
+            // Live-through, and we don't know the value.
+            LRC.addLiveInBlock(LI, MDT[MBB]);
+            LRC.setLiveOutValue(MBB, 0);
+          }
+        }
+        BlockStart = BlockEnd;
+        ++MBB;
+      }
+      Start = End;
+    } while (Start != ParentI->end);
+    DEBUG(dbgs() << '\n');
+  }
+
+  LRCalc[0].calculateValues();
+  if (SpillMode)
+    LRCalc[1].calculateValues();
+
+  return Skipped;
+}
+
+void SplitEditor::extendPHIKillRanges() {
+    // Extend live ranges to be live-out for successor PHI values.
+  for (LiveInterval::const_vni_iterator I = Edit->getParent().vni_begin(),
+       E = Edit->getParent().vni_end(); I != E; ++I) {
+    const VNInfo *PHIVNI = *I;
+    if (PHIVNI->isUnused() || !PHIVNI->isPHIDef())
+      continue;
+    unsigned RegIdx = RegAssign.lookup(PHIVNI->def);
+    LiveInterval *LI = Edit->get(RegIdx);
+    LiveRangeCalc &LRC = getLRCalc(RegIdx);
+    MachineBasicBlock *MBB = LIS.getMBBFromIndex(PHIVNI->def);
+    for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PE = MBB->pred_end(); PI != PE; ++PI) {
+      SlotIndex End = LIS.getMBBEndIdx(*PI);
+      SlotIndex LastUse = End.getPrevSlot();
+      // The predecessor may not have a live-out value. That is OK, like an
+      // undef PHI operand.
+      if (Edit->getParent().liveAt(LastUse)) {
+        assert(RegAssign.lookup(LastUse) == RegIdx &&
+               "Different register assignment in phi predecessor");
+        LRC.extend(LI, End);
+      }
+    }
+  }
+}
+
+/// rewriteAssigned - Rewrite all uses of Edit->getReg().
+void SplitEditor::rewriteAssigned(bool ExtendRanges) {
+  for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(Edit->getReg()),
+       RE = MRI.reg_end(); RI != RE;) {
+    MachineOperand &MO = RI.getOperand();
+    MachineInstr *MI = MO.getParent();
+    ++RI;
+    // LiveDebugVariables should have handled all DBG_VALUE instructions.
+    if (MI->isDebugValue()) {
+      DEBUG(dbgs() << "Zapping " << *MI);
+      MO.setReg(0);
+      continue;
+    }
+
+    // <undef> operands don't really read the register, so it doesn't matter
+    // which register we choose.  When the use operand is tied to a def, we must
+    // use the same register as the def, so just do that always.
+    SlotIndex Idx = LIS.getInstructionIndex(MI);
+    if (MO.isDef() || MO.isUndef())
+      Idx = Idx.getRegSlot(MO.isEarlyClobber());
+
+    // Rewrite to the mapped register at Idx.
+    unsigned RegIdx = RegAssign.lookup(Idx);
+    LiveInterval *LI = Edit->get(RegIdx);
+    MO.setReg(LI->reg);
+    DEBUG(dbgs() << "  rewr BB#" << MI->getParent()->getNumber() << '\t'
+                 << Idx << ':' << RegIdx << '\t' << *MI);
+
+    // Extend liveness to Idx if the instruction reads reg.
+    if (!ExtendRanges || MO.isUndef())
+      continue;
+
+    // Skip instructions that don't read Reg.
+    if (MO.isDef()) {
+      if (!MO.getSubReg() && !MO.isEarlyClobber())
+        continue;
+      // We may wan't to extend a live range for a partial redef, or for a use
+      // tied to an early clobber.
+      Idx = Idx.getPrevSlot();
+      if (!Edit->getParent().liveAt(Idx))
+        continue;
+    } else
+      Idx = Idx.getRegSlot(true);
+
+    getLRCalc(RegIdx).extend(LI, Idx.getNextSlot());
+  }
+}
+
+void SplitEditor::deleteRematVictims() {
+  SmallVector<MachineInstr*, 8> Dead;
+  for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I){
+    LiveInterval *LI = *I;
+    for (LiveInterval::const_iterator LII = LI->begin(), LIE = LI->end();
+           LII != LIE; ++LII) {
+      // Dead defs end at the dead slot.
+      if (LII->end != LII->valno->def.getDeadSlot())
+        continue;
+      MachineInstr *MI = LIS.getInstructionFromIndex(LII->valno->def);
+      assert(MI && "Missing instruction for dead def");
+      MI->addRegisterDead(LI->reg, &TRI);
+
+      if (!MI->allDefsAreDead())
+        continue;
+
+      DEBUG(dbgs() << "All defs dead: " << *MI);
+      Dead.push_back(MI);
+    }
+  }
+
+  if (Dead.empty())
+    return;
+
+  Edit->eliminateDeadDefs(Dead);
+}
+
+void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
+  ++NumFinished;
+
+  // At this point, the live intervals in Edit contain VNInfos corresponding to
+  // the inserted copies.
+
+  // Add the original defs from the parent interval.
+  for (LiveInterval::const_vni_iterator I = Edit->getParent().vni_begin(),
+         E = Edit->getParent().vni_end(); I != E; ++I) {
+    const VNInfo *ParentVNI = *I;
+    if (ParentVNI->isUnused())
+      continue;
+    unsigned RegIdx = RegAssign.lookup(ParentVNI->def);
+    defValue(RegIdx, ParentVNI, ParentVNI->def);
+
+    // Force rematted values to be recomputed everywhere.
+    // The new live ranges may be truncated.
+    if (Edit->didRematerialize(ParentVNI))
+      for (unsigned i = 0, e = Edit->size(); i != e; ++i)
+        forceRecompute(i, ParentVNI);
+  }
+
+  // Hoist back-copies to the complement interval when in spill mode.
+  switch (SpillMode) {
+  case SM_Partition:
+    // Leave all back-copies as is.
+    break;
+  case SM_Size:
+    hoistCopiesForSize();
+    break;
+  case SM_Speed:
+    llvm_unreachable("Spill mode 'speed' not implemented yet");
+  }
+
+  // Transfer the simply mapped values, check if any are skipped.
+  bool Skipped = transferValues();
+  if (Skipped)
+    extendPHIKillRanges();
+  else
+    ++NumSimple;
+
+  // Rewrite virtual registers, possibly extending ranges.
+  rewriteAssigned(Skipped);
+
+  // Delete defs that were rematted everywhere.
+  if (Skipped)
+    deleteRematVictims();
+
+  // Get rid of unused values and set phi-kill flags.
+  for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I)
+    (*I)->RenumberValues(LIS);
+
+  // Provide a reverse mapping from original indices to Edit ranges.
+  if (LRMap) {
+    LRMap->clear();
+    for (unsigned i = 0, e = Edit->size(); i != e; ++i)
+      LRMap->push_back(i);
+  }
+
+  // Now check if any registers were separated into multiple components.
+  ConnectedVNInfoEqClasses ConEQ(LIS);
+  for (unsigned i = 0, e = Edit->size(); i != e; ++i) {
+    // Don't use iterators, they are invalidated by create() below.
+    LiveInterval *li = Edit->get(i);
+    unsigned NumComp = ConEQ.Classify(li);
+    if (NumComp <= 1)
+      continue;
+    DEBUG(dbgs() << "  " << NumComp << " components: " << *li << '\n');
+    SmallVector<LiveInterval*, 8> dups;
+    dups.push_back(li);
+    for (unsigned j = 1; j != NumComp; ++j)
+      dups.push_back(&Edit->create());
+    ConEQ.Distribute(&dups[0], MRI);
+    // The new intervals all map back to i.
+    if (LRMap)
+      LRMap->resize(Edit->size(), i);
+  }
+
+  // Calculate spill weight and allocation hints for new intervals.
+  Edit->calculateRegClassAndHint(VRM.getMachineFunction(), SA.Loops);
+
+  assert(!LRMap || LRMap->size() == Edit->size());
+}
+
+
+//===----------------------------------------------------------------------===//
+//                            Single Block Splitting
+//===----------------------------------------------------------------------===//
+
+bool SplitAnalysis::shouldSplitSingleBlock(const BlockInfo &BI,
+                                           bool SingleInstrs) const {
+  // Always split for multiple instructions.
+  if (!BI.isOneInstr())
+    return true;
+  // Don't split for single instructions unless explicitly requested.
+  if (!SingleInstrs)
+    return false;
+  // Splitting a live-through range always makes progress.
+  if (BI.LiveIn && BI.LiveOut)
+    return true;
+  // No point in isolating a copy. It has no register class constraints.
+  if (LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike())
+    return false;
+  // Finally, don't isolate an end point that was created by earlier splits.
+  return isOriginalEndpoint(BI.FirstInstr);
+}
+
+void SplitEditor::splitSingleBlock(const SplitAnalysis::BlockInfo &BI) {
+  openIntv();
+  SlotIndex LastSplitPoint = SA.getLastSplitPoint(BI.MBB->getNumber());
+  SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstInstr,
+    LastSplitPoint));
+  if (!BI.LiveOut || BI.LastInstr < LastSplitPoint) {
+    useIntv(SegStart, leaveIntvAfter(BI.LastInstr));
+  } else {
+      // The last use is after the last valid split point.
+    SlotIndex SegStop = leaveIntvBefore(LastSplitPoint);
+    useIntv(SegStart, SegStop);
+    overlapIntv(SegStop, BI.LastInstr);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//                    Global Live Range Splitting Support
+//===----------------------------------------------------------------------===//
+
+// These methods support a method of global live range splitting that uses a
+// global algorithm to decide intervals for CFG edges. They will insert split
+// points and color intervals in basic blocks while avoiding interference.
+//
+// Note that splitSingleBlock is also useful for blocks where both CFG edges
+// are on the stack.
+
+void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
+                                        unsigned IntvIn, SlotIndex LeaveBefore,
+                                        unsigned IntvOut, SlotIndex EnterAfter){
+  SlotIndex Start, Stop;
+  tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(MBBNum);
+
+  DEBUG(dbgs() << "BB#" << MBBNum << " [" << Start << ';' << Stop
+               << ") intf " << LeaveBefore << '-' << EnterAfter
+               << ", live-through " << IntvIn << " -> " << IntvOut);
+
+  assert((IntvIn || IntvOut) && "Use splitSingleBlock for isolated blocks");
+
+  assert((!LeaveBefore || LeaveBefore < Stop) && "Interference after block");
+  assert((!IntvIn || !LeaveBefore || LeaveBefore > Start) && "Impossible intf");
+  assert((!EnterAfter || EnterAfter >= Start) && "Interference before block");
+
+  MachineBasicBlock *MBB = VRM.getMachineFunction().getBlockNumbered(MBBNum);
+
+  if (!IntvOut) {
+    DEBUG(dbgs() << ", spill on entry.\n");
+    //
+    //        <<<<<<<<<    Possible LeaveBefore interference.
+    //    |-----------|    Live through.
+    //    -____________    Spill on entry.
+    //
+    selectIntv(IntvIn);
+    SlotIndex Idx = leaveIntvAtTop(*MBB);
+    assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+    (void)Idx;
+    return;
+  }
+
+  if (!IntvIn) {
+    DEBUG(dbgs() << ", reload on exit.\n");
+    //
+    //    >>>>>>>          Possible EnterAfter interference.
+    //    |-----------|    Live through.
+    //    ___________--    Reload on exit.
+    //
+    selectIntv(IntvOut);
+    SlotIndex Idx = enterIntvAtEnd(*MBB);
+    assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+    (void)Idx;
+    return;
+  }
+
+  if (IntvIn == IntvOut && !LeaveBefore && !EnterAfter) {
+    DEBUG(dbgs() << ", straight through.\n");
+    //
+    //    |-----------|    Live through.
+    //    -------------    Straight through, same intv, no interference.
+    //
+    selectIntv(IntvOut);
+    useIntv(Start, Stop);
+    return;
+  }
+
+  // We cannot legally insert splits after LSP.
+  SlotIndex LSP = SA.getLastSplitPoint(MBBNum);
+  assert((!IntvOut || !EnterAfter || EnterAfter < LSP) && "Impossible intf");
+
+  if (IntvIn != IntvOut && (!LeaveBefore || !EnterAfter ||
+                  LeaveBefore.getBaseIndex() > EnterAfter.getBoundaryIndex())) {
+    DEBUG(dbgs() << ", switch avoiding interference.\n");
+    //
+    //    >>>>     <<<<    Non-overlapping EnterAfter/LeaveBefore interference.
+    //    |-----------|    Live through.
+    //    ------=======    Switch intervals between interference.
+    //
+    selectIntv(IntvOut);
+    SlotIndex Idx;
+    if (LeaveBefore && LeaveBefore < LSP) {
+      Idx = enterIntvBefore(LeaveBefore);
+      useIntv(Idx, Stop);
+    } else {
+      Idx = enterIntvAtEnd(*MBB);
+    }
+    selectIntv(IntvIn);
+    useIntv(Start, Idx);
+    assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+    assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+    return;
+  }
+
+  DEBUG(dbgs() << ", create local intv for interference.\n");
+  //
+  //    >>><><><><<<<    Overlapping EnterAfter/LeaveBefore interference.
+  //    |-----------|    Live through.
+  //    ==---------==    Switch intervals before/after interference.
+  //
+  assert(LeaveBefore <= EnterAfter && "Missed case");
+
+  selectIntv(IntvOut);
+  SlotIndex Idx = enterIntvAfter(EnterAfter);
+  useIntv(Idx, Stop);
+  assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+
+  selectIntv(IntvIn);
+  Idx = leaveIntvBefore(LeaveBefore);
+  useIntv(Start, Idx);
+  assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+}
+
+
+void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
+                                  unsigned IntvIn, SlotIndex LeaveBefore) {
+  SlotIndex Start, Stop;
+  tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
+
+  DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " [" << Start << ';' << Stop
+               << "), uses " << BI.FirstInstr << '-' << BI.LastInstr
+               << ", reg-in " << IntvIn << ", leave before " << LeaveBefore
+               << (BI.LiveOut ? ", stack-out" : ", killed in block"));
+
+  assert(IntvIn && "Must have register in");
+  assert(BI.LiveIn && "Must be live-in");
+  assert((!LeaveBefore || LeaveBefore > Start) && "Bad interference");
+
+  if (!BI.LiveOut && (!LeaveBefore || LeaveBefore >= BI.LastInstr)) {
+    DEBUG(dbgs() << " before interference.\n");
+    //
+    //               <<<    Interference after kill.
+    //     |---o---x   |    Killed in block.
+    //     =========        Use IntvIn everywhere.
+    //
+    selectIntv(IntvIn);
+    useIntv(Start, BI.LastInstr);
+    return;
+  }
+
+  SlotIndex LSP = SA.getLastSplitPoint(BI.MBB->getNumber());
+
+  if (!LeaveBefore || LeaveBefore > BI.LastInstr.getBoundaryIndex()) {
+    //
+    //               <<<    Possible interference after last use.
+    //     |---o---o---|    Live-out on stack.
+    //     =========____    Leave IntvIn after last use.
+    //
+    //                 <    Interference after last use.
+    //     |---o---o--o|    Live-out on stack, late last use.
+    //     ============     Copy to stack after LSP, overlap IntvIn.
+    //            \_____    Stack interval is live-out.
+    //
+    if (BI.LastInstr < LSP) {
+      DEBUG(dbgs() << ", spill after last use before interference.\n");
+      selectIntv(IntvIn);
+      SlotIndex Idx = leaveIntvAfter(BI.LastInstr);
+      useIntv(Start, Idx);
+      assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+    } else {
+      DEBUG(dbgs() << ", spill before last split point.\n");
+      selectIntv(IntvIn);
+      SlotIndex Idx = leaveIntvBefore(LSP);
+      overlapIntv(Idx, BI.LastInstr);
+      useIntv(Start, Idx);
+      assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+    }
+    return;
+  }
+
+  // The interference is overlapping somewhere we wanted to use IntvIn. That
+  // means we need to create a local interval that can be allocated a
+  // different register.
+  unsigned LocalIntv = openIntv();
+  (void)LocalIntv;
+  DEBUG(dbgs() << ", creating local interval " << LocalIntv << ".\n");
+
+  if (!BI.LiveOut || BI.LastInstr < LSP) {
+    //
+    //           <<<<<<<    Interference overlapping uses.
+    //     |---o---o---|    Live-out on stack.
+    //     =====----____    Leave IntvIn before interference, then spill.
+    //
+    SlotIndex To = leaveIntvAfter(BI.LastInstr);
+    SlotIndex From = enterIntvBefore(LeaveBefore);
+    useIntv(From, To);
+    selectIntv(IntvIn);
+    useIntv(Start, From);
+    assert((!LeaveBefore || From <= LeaveBefore) && "Interference");
+    return;
+  }
+
+  //           <<<<<<<    Interference overlapping uses.
+  //     |---o---o--o|    Live-out on stack, late last use.
+  //     =====-------     Copy to stack before LSP, overlap LocalIntv.
+  //            \_____    Stack interval is live-out.
+  //
+  SlotIndex To = leaveIntvBefore(LSP);
+  overlapIntv(To, BI.LastInstr);
+  SlotIndex From = enterIntvBefore(std::min(To, LeaveBefore));
+  useIntv(From, To);
+  selectIntv(IntvIn);
+  useIntv(Start, From);
+  assert((!LeaveBefore || From <= LeaveBefore) && "Interference");
+}
+
+void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
+                                   unsigned IntvOut, SlotIndex EnterAfter) {
+  SlotIndex Start, Stop;
+  tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
+
+  DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " [" << Start << ';' << Stop
+               << "), uses " << BI.FirstInstr << '-' << BI.LastInstr
+               << ", reg-out " << IntvOut << ", enter after " << EnterAfter
+               << (BI.LiveIn ? ", stack-in" : ", defined in block"));
+
+  SlotIndex LSP = SA.getLastSplitPoint(BI.MBB->getNumber());
+
+  assert(IntvOut && "Must have register out");
+  assert(BI.LiveOut && "Must be live-out");
+  assert((!EnterAfter || EnterAfter < LSP) && "Bad interference");
+
+  if (!BI.LiveIn && (!EnterAfter || EnterAfter <= BI.FirstInstr)) {
+    DEBUG(dbgs() << " after interference.\n");
+    //
+    //    >>>>             Interference before def.
+    //    |   o---o---|    Defined in block.
+    //        =========    Use IntvOut everywhere.
+    //
+    selectIntv(IntvOut);
+    useIntv(BI.FirstInstr, Stop);
+    return;
+  }
+
+  if (!EnterAfter || EnterAfter < BI.FirstInstr.getBaseIndex()) {
+    DEBUG(dbgs() << ", reload after interference.\n");
+    //
+    //    >>>>             Interference before def.
+    //    |---o---o---|    Live-through, stack-in.
+    //    ____=========    Enter IntvOut before first use.
+    //
+    selectIntv(IntvOut);
+    SlotIndex Idx = enterIntvBefore(std::min(LSP, BI.FirstInstr));
+    useIntv(Idx, Stop);
+    assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+    return;
+  }
+
+  // The interference is overlapping somewhere we wanted to use IntvOut. That
+  // means we need to create a local interval that can be allocated a
+  // different register.
+  DEBUG(dbgs() << ", interference overlaps uses.\n");
+  //
+  //    >>>>>>>          Interference overlapping uses.
+  //    |---o---o---|    Live-through, stack-in.
+  //    ____---======    Create local interval for interference range.
+  //
+  selectIntv(IntvOut);
+  SlotIndex Idx = enterIntvAfter(EnterAfter);
+  useIntv(Idx, Stop);
+  assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+
+  openIntv();
+  SlotIndex From = enterIntvBefore(std::min(Idx, BI.FirstInstr));
+  useIntv(From, Idx);
+}
diff --git a/contrib/llvm/lib/CodeGen/SplitKit.h b/contrib/llvm/lib/CodeGen/SplitKit.h
new file mode 100644
index 000000000000..4005a3d5cbbf
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SplitKit.h
@@ -0,0 +1,469 @@
+//===-------- SplitKit.h - Toolkit for splitting live ranges ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SplitAnalysis class as well as mutator functions for
+// live range splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SPLITKIT_H
+#define LLVM_CODEGEN_SPLITKIT_H
+
+#include "LiveRangeCalc.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+namespace llvm {
+
+class ConnectedVNInfoEqClasses;
+class LiveInterval;
+class LiveIntervals;
+class LiveRangeEdit;
+class MachineInstr;
+class MachineLoopInfo;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+class VirtRegMap;
+class VNInfo;
+class raw_ostream;
+
+/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
+/// opportunities.
+class SplitAnalysis {
+public:
+  const MachineFunction &MF;
+  const VirtRegMap &VRM;
+  const LiveIntervals &LIS;
+  const MachineLoopInfo &Loops;
+  const TargetInstrInfo &TII;
+
+  /// Additional information about basic blocks where the current variable is
+  /// live. Such a block will look like one of these templates:
+  ///
+  ///  1. |   o---x   | Internal to block. Variable is only live in this block.
+  ///  2. |---x       | Live-in, kill.
+  ///  3. |       o---| Def, live-out.
+  ///  4. |---x   o---| Live-in, kill, def, live-out. Counted by NumGapBlocks.
+  ///  5. |---o---o---| Live-through with uses or defs.
+  ///  6. |-----------| Live-through without uses. Counted by NumThroughBlocks.
+  ///
+  /// Two BlockInfo entries are created for template 4. One for the live-in
+  /// segment, and one for the live-out segment. These entries look as if the
+  /// block were split in the middle where the live range isn't live.
+  ///
+  /// Live-through blocks without any uses don't get BlockInfo entries. They
+  /// are simply listed in ThroughBlocks instead.
+  ///
+  struct BlockInfo {
+    MachineBasicBlock *MBB;
+    SlotIndex FirstInstr; ///< First instr accessing current reg.
+    SlotIndex LastInstr;  ///< Last instr accessing current reg.
+    SlotIndex FirstDef;   ///< First non-phi valno->def, or SlotIndex().
+    bool LiveIn;          ///< Current reg is live in.
+    bool LiveOut;         ///< Current reg is live out.
+
+    /// isOneInstr - Returns true when this BlockInfo describes a single
+    /// instruction.
+    bool isOneInstr() const {
+      return SlotIndex::isSameInstr(FirstInstr, LastInstr);
+    }
+  };
+
+private:
+  // Current live interval.
+  const LiveInterval *CurLI;
+
+  // Sorted slot indexes of using instructions.
+  SmallVector<SlotIndex, 8> UseSlots;
+
+  /// LastSplitPoint - Last legal split point in each basic block in the current
+  /// function. The first entry is the first terminator, the second entry is the
+  /// last valid split point for a variable that is live in to a landing pad
+  /// successor.
+  SmallVector<std::pair<SlotIndex, SlotIndex>, 8> LastSplitPoint;
+
+  /// UseBlocks - Blocks where CurLI has uses.
+  SmallVector<BlockInfo, 8> UseBlocks;
+
+  /// NumGapBlocks - Number of duplicate entries in UseBlocks for blocks where
+  /// the live range has a gap.
+  unsigned NumGapBlocks;
+
+  /// ThroughBlocks - Block numbers where CurLI is live through without uses.
+  BitVector ThroughBlocks;
+
+  /// NumThroughBlocks - Number of live-through blocks.
+  unsigned NumThroughBlocks;
+
+  /// DidRepairRange - analyze was forced to shrinkToUses().
+  bool DidRepairRange;
+
+  SlotIndex computeLastSplitPoint(unsigned Num);
+
+  // Sumarize statistics by counting instructions using CurLI.
+  void analyzeUses();
+
+  /// calcLiveBlockInfo - Compute per-block information about CurLI.
+  bool calcLiveBlockInfo();
+
+public:
+  SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
+                const MachineLoopInfo &mli);
+
+  /// analyze - set CurLI to the specified interval, and analyze how it may be
+  /// split.
+  void analyze(const LiveInterval *li);
+
+  /// didRepairRange() - Returns true if CurLI was invalid and has been repaired
+  /// by analyze(). This really shouldn't happen, but sometimes the coalescer
+  /// can create live ranges that end in mid-air.
+  bool didRepairRange() const { return DidRepairRange; }
+
+  /// clear - clear all data structures so SplitAnalysis is ready to analyze a
+  /// new interval.
+  void clear();
+
+  /// getParent - Return the last analyzed interval.
+  const LiveInterval &getParent() const { return *CurLI; }
+
+  /// getLastSplitPoint - Return the base index of the last valid split point
+  /// in the basic block numbered Num.
+  SlotIndex getLastSplitPoint(unsigned Num) {
+    // Inline the common simple case.
+    if (LastSplitPoint[Num].first.isValid() &&
+        !LastSplitPoint[Num].second.isValid())
+      return LastSplitPoint[Num].first;
+    return computeLastSplitPoint(Num);
+  }
+
+  /// getLastSplitPointIter - Returns the last split point as an iterator.
+  MachineBasicBlock::iterator getLastSplitPointIter(MachineBasicBlock*);
+
+  /// isOriginalEndpoint - Return true if the original live range was killed or
+  /// (re-)defined at Idx. Idx should be the 'def' slot for a normal kill/def,
+  /// and 'use' for an early-clobber def.
+  /// This can be used to recognize code inserted by earlier live range
+  /// splitting.
+  bool isOriginalEndpoint(SlotIndex Idx) const;
+
+  /// getUseSlots - Return an array of SlotIndexes of instructions using CurLI.
+  /// This include both use and def operands, at most one entry per instruction.
+  ArrayRef<SlotIndex> getUseSlots() const { return UseSlots; }
+
+  /// getUseBlocks - Return an array of BlockInfo objects for the basic blocks
+  /// where CurLI has uses.
+  ArrayRef<BlockInfo> getUseBlocks() const { return UseBlocks; }
+
+  /// getNumThroughBlocks - Return the number of through blocks.
+  unsigned getNumThroughBlocks() const { return NumThroughBlocks; }
+
+  /// isThroughBlock - Return true if CurLI is live through MBB without uses.
+  bool isThroughBlock(unsigned MBB) const { return ThroughBlocks.test(MBB); }
+
+  /// getThroughBlocks - Return the set of through blocks.
+  const BitVector &getThroughBlocks() const { return ThroughBlocks; }
+
+  /// getNumLiveBlocks - Return the number of blocks where CurLI is live.
+  unsigned getNumLiveBlocks() const {
+    return getUseBlocks().size() - NumGapBlocks + getNumThroughBlocks();
+  }
+
+  /// countLiveBlocks - Return the number of blocks where li is live. This is
+  /// guaranteed to return the same number as getNumLiveBlocks() after calling
+  /// analyze(li).
+  unsigned countLiveBlocks(const LiveInterval *li) const;
+
+  typedef SmallPtrSet<const MachineBasicBlock*, 16> BlockPtrSet;
+
+  /// shouldSplitSingleBlock - Returns true if it would help to create a local
+  /// live range for the instructions in BI. There is normally no benefit to
+  /// creating a live range for a single instruction, but it does enable
+  /// register class inflation if the instruction has a restricted register
+  /// class.
+  ///
+  /// @param BI           The block to be isolated.
+  /// @param SingleInstrs True when single instructions should be isolated.
+  bool shouldSplitSingleBlock(const BlockInfo &BI, bool SingleInstrs) const;
+};
+
+
+/// SplitEditor - Edit machine code and LiveIntervals for live range
+/// splitting.
+///
+/// - Create a SplitEditor from a SplitAnalysis.
+/// - Start a new live interval with openIntv.
+/// - Mark the places where the new interval is entered using enterIntv*
+/// - Mark the ranges where the new interval is used with useIntv* 
+/// - Mark the places where the interval is exited with exitIntv*.
+/// - Finish the current interval with closeIntv and repeat from 2.
+/// - Rewrite instructions with finish().
+///
+class SplitEditor {
+  SplitAnalysis &SA;
+  LiveIntervals &LIS;
+  VirtRegMap &VRM;
+  MachineRegisterInfo &MRI;
+  MachineDominatorTree &MDT;
+  const TargetInstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+
+public:
+
+  /// ComplementSpillMode - Select how the complement live range should be
+  /// created.  SplitEditor automatically creates interval 0 to contain
+  /// anything that isn't added to another interval.  This complement interval
+  /// can get quite complicated, and it can sometimes be an advantage to allow
+  /// it to overlap the other intervals.  If it is going to spill anyway, no
+  /// registers are wasted by keeping a value in two places at the same time.
+  enum ComplementSpillMode {
+    /// SM_Partition(Default) - Try to create the complement interval so it
+    /// doesn't overlap any other intervals, and the original interval is
+    /// partitioned.  This may require a large number of back copies and extra
+    /// PHI-defs.  Only segments marked with overlapIntv will be overlapping.
+    SM_Partition,
+
+    /// SM_Size - Overlap intervals to minimize the number of inserted COPY
+    /// instructions.  Copies to the complement interval are hoisted to their
+    /// common dominator, so only one COPY is required per value in the
+    /// complement interval.  This also means that no extra PHI-defs need to be
+    /// inserted in the complement interval.
+    SM_Size,
+
+    /// SM_Speed - Overlap intervals to minimize the expected execution
+    /// frequency of the inserted copies.  This is very similar to SM_Size, but
+    /// the complement interval may get some extra PHI-defs.
+    SM_Speed
+  };
+
+private:
+
+  /// Edit - The current parent register and new intervals created.
+  LiveRangeEdit *Edit;
+
+  /// Index into Edit of the currently open interval.
+  /// The index 0 is used for the complement, so the first interval started by
+  /// openIntv will be 1.
+  unsigned OpenIdx;
+
+  /// The current spill mode, selected by reset().
+  ComplementSpillMode SpillMode;
+
+  typedef IntervalMap<SlotIndex, unsigned> RegAssignMap;
+
+  /// Allocator for the interval map. This will eventually be shared with
+  /// SlotIndexes and LiveIntervals.
+  RegAssignMap::Allocator Allocator;
+
+  /// RegAssign - Map of the assigned register indexes.
+  /// Edit.get(RegAssign.lookup(Idx)) is the register that should be live at
+  /// Idx.
+  RegAssignMap RegAssign;
+
+  typedef PointerIntPair<VNInfo*, 1> ValueForcePair;
+  typedef DenseMap<std::pair<unsigned, unsigned>, ValueForcePair> ValueMap;
+
+  /// Values - keep track of the mapping from parent values to values in the new
+  /// intervals. Given a pair (RegIdx, ParentVNI->id), Values contains:
+  ///
+  /// 1. No entry - the value is not mapped to Edit.get(RegIdx).
+  /// 2. (Null, false) - the value is mapped to multiple values in
+  ///    Edit.get(RegIdx).  Each value is represented by a minimal live range at
+  ///    its def.  The full live range can be inferred exactly from the range
+  ///    of RegIdx in RegAssign.
+  /// 3. (Null, true).  As above, but the ranges in RegAssign are too large, and
+  ///    the live range must be recomputed using LiveRangeCalc::extend().
+  /// 4. (VNI, false) The value is mapped to a single new value.
+  ///    The new value has no live ranges anywhere.
+  ValueMap Values;
+
+  /// LRCalc - Cache for computing live ranges and SSA update.  Each instance
+  /// can only handle non-overlapping live ranges, so use a separate
+  /// LiveRangeCalc instance for the complement interval when in spill mode.
+  LiveRangeCalc LRCalc[2];
+
+  /// getLRCalc - Return the LRCalc to use for RegIdx.  In spill mode, the
+  /// complement interval can overlap the other intervals, so it gets its own
+  /// LRCalc instance.  When not in spill mode, all intervals can share one.
+  LiveRangeCalc &getLRCalc(unsigned RegIdx) {
+    return LRCalc[SpillMode != SM_Partition && RegIdx != 0];
+  }
+
+  /// defValue - define a value in RegIdx from ParentVNI at Idx.
+  /// Idx does not have to be ParentVNI->def, but it must be contained within
+  /// ParentVNI's live range in ParentLI. The new value is added to the value
+  /// map.
+  /// Return the new LI value.
+  VNInfo *defValue(unsigned RegIdx, const VNInfo *ParentVNI, SlotIndex Idx);
+
+  /// forceRecompute - Force the live range of ParentVNI in RegIdx to be
+  /// recomputed by LiveRangeCalc::extend regardless of the number of defs.
+  /// This is used for values whose live range doesn't match RegAssign exactly.
+  /// They could have rematerialized, or back-copies may have been moved.
+  void forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI);
+
+  /// defFromParent - Define Reg from ParentVNI at UseIdx using either
+  /// rematerialization or a COPY from parent. Return the new value.
+  VNInfo *defFromParent(unsigned RegIdx,
+                        VNInfo *ParentVNI,
+                        SlotIndex UseIdx,
+                        MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator I);
+
+  /// removeBackCopies - Remove the copy instructions that defines the values
+  /// in the vector in the complement interval.
+  void removeBackCopies(SmallVectorImpl<VNInfo*> &Copies);
+
+  /// getShallowDominator - Returns the least busy dominator of MBB that is
+  /// also dominated by DefMBB.  Busy is measured by loop depth.
+  MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
+                                          MachineBasicBlock *DefMBB);
+
+  /// hoistCopiesForSize - Hoist back-copies to the complement interval in a
+  /// way that minimizes code size. This implements the SM_Size spill mode.
+  void hoistCopiesForSize();
+
+  /// transferValues - Transfer values to the new ranges.
+  /// Return true if any ranges were skipped.
+  bool transferValues();
+
+  /// extendPHIKillRanges - Extend the ranges of all values killed by original
+  /// parent PHIDefs.
+  void extendPHIKillRanges();
+
+  /// rewriteAssigned - Rewrite all uses of Edit.getReg() to assigned registers.
+  void rewriteAssigned(bool ExtendRanges);
+
+  /// deleteRematVictims - Delete defs that are dead after rematerializing.
+  void deleteRematVictims();
+
+public:
+  /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
+  /// Newly created intervals will be appended to newIntervals.
+  SplitEditor(SplitAnalysis &SA, LiveIntervals&, VirtRegMap&,
+              MachineDominatorTree&);
+
+  /// reset - Prepare for a new split.
+  void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition);
+
+  /// Create a new virtual register and live interval.
+  /// Return the interval index, starting from 1. Interval index 0 is the
+  /// implicit complement interval.
+  unsigned openIntv();
+
+  /// currentIntv - Return the current interval index.
+  unsigned currentIntv() const { return OpenIdx; }
+
+  /// selectIntv - Select a previously opened interval index.
+  void selectIntv(unsigned Idx);
+
+  /// enterIntvBefore - Enter the open interval before the instruction at Idx.
+  /// If the parent interval is not live before Idx, a COPY is not inserted.
+  /// Return the beginning of the new live range.
+  SlotIndex enterIntvBefore(SlotIndex Idx);
+
+  /// enterIntvAfter - Enter the open interval after the instruction at Idx.
+  /// Return the beginning of the new live range.
+  SlotIndex enterIntvAfter(SlotIndex Idx);
+
+  /// enterIntvAtEnd - Enter the open interval at the end of MBB.
+  /// Use the open interval from he inserted copy to the MBB end.
+  /// Return the beginning of the new live range.
+  SlotIndex enterIntvAtEnd(MachineBasicBlock &MBB);
+
+  /// useIntv - indicate that all instructions in MBB should use OpenLI.
+  void useIntv(const MachineBasicBlock &MBB);
+
+  /// useIntv - indicate that all instructions in range should use OpenLI.
+  void useIntv(SlotIndex Start, SlotIndex End);
+
+  /// leaveIntvAfter - Leave the open interval after the instruction at Idx.
+  /// Return the end of the live range.
+  SlotIndex leaveIntvAfter(SlotIndex Idx);
+
+  /// leaveIntvBefore - Leave the open interval before the instruction at Idx.
+  /// Return the end of the live range.
+  SlotIndex leaveIntvBefore(SlotIndex Idx);
+
+  /// leaveIntvAtTop - Leave the interval at the top of MBB.
+  /// Add liveness from the MBB top to the copy.
+  /// Return the end of the live range.
+  SlotIndex leaveIntvAtTop(MachineBasicBlock &MBB);
+
+  /// overlapIntv - Indicate that all instructions in range should use the open
+  /// interval, but also let the complement interval be live.
+  ///
+  /// This doubles the register pressure, but is sometimes required to deal with
+  /// register uses after the last valid split point.
+  ///
+  /// The Start index should be a return value from a leaveIntv* call, and End
+  /// should be in the same basic block. The parent interval must have the same
+  /// value across the range.
+  ///
+  void overlapIntv(SlotIndex Start, SlotIndex End);
+
+  /// finish - after all the new live ranges have been created, compute the
+  /// remaining live range, and rewrite instructions to use the new registers.
+  /// @param LRMap When not null, this vector will map each live range in Edit
+  ///              back to the indices returned by openIntv.
+  ///              There may be extra indices created by dead code elimination.
+  void finish(SmallVectorImpl<unsigned> *LRMap = 0);
+
+  /// dump - print the current interval maping to dbgs().
+  void dump() const;
+
+  // ===--- High level methods ---===
+
+  /// splitSingleBlock - Split CurLI into a separate live interval around the
+  /// uses in a single block. This is intended to be used as part of a larger
+  /// split, and doesn't call finish().
+  void splitSingleBlock(const SplitAnalysis::BlockInfo &BI);
+
+  /// splitLiveThroughBlock - Split CurLI in the given block such that it
+  /// enters the block in IntvIn and leaves it in IntvOut. There may be uses in
+  /// the block, but they will be ignored when placing split points.
+  ///
+  /// @param MBBNum      Block number.
+  /// @param IntvIn      Interval index entering the block.
+  /// @param LeaveBefore When set, leave IntvIn before this point.
+  /// @param IntvOut     Interval index leaving the block.
+  /// @param EnterAfter  When set, enter IntvOut after this point.
+  void splitLiveThroughBlock(unsigned MBBNum,
+                             unsigned IntvIn, SlotIndex LeaveBefore,
+                             unsigned IntvOut, SlotIndex EnterAfter);
+
+  /// splitRegInBlock - Split CurLI in the given block such that it enters the
+  /// block in IntvIn and leaves it on the stack (or not at all). Split points
+  /// are placed in a way that avoids putting uses in the stack interval. This
+  /// may require creating a local interval when there is interference.
+  ///
+  /// @param BI          Block descriptor.
+  /// @param IntvIn      Interval index entering the block. Not 0.
+  /// @param LeaveBefore When set, leave IntvIn before this point.
+  void splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
+                       unsigned IntvIn, SlotIndex LeaveBefore);
+
+  /// splitRegOutBlock - Split CurLI in the given block such that it enters the
+  /// block on the stack (or isn't live-in at all) and leaves it in IntvOut.
+  /// Split points are placed to avoid interference and such that the uses are
+  /// not in the stack interval. This may require creating a local interval
+  /// when there is interference.
+  ///
+  /// @param BI          Block descriptor.
+  /// @param IntvOut     Interval index leaving the block.
+  /// @param EnterAfter  When set, enter IntvOut after this point.
+  void splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
+                        unsigned IntvOut, SlotIndex EnterAfter);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/StackColoring.cpp b/contrib/llvm/lib/CodeGen/StackColoring.cpp
new file mode 100644
index 000000000000..a789a2596dbf
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/StackColoring.cpp
@@ -0,0 +1,802 @@
+//===-- StackColoring.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the stack-coloring optimization that looks for
+// lifetime markers machine instructions (LIFESTART_BEGIN and LIFESTART_END),
+// which represent the possible lifetime of stack slots. It attempts to
+// merge disjoint stack slots and reduce the used stack space.
+// NOTE: This pass is not StackSlotColoring, which optimizes spill slots.
+//
+// TODO: In the future we plan to improve stack coloring in the following ways:
+// 1. Allow merging multiple small slots into a single larger slot at different
+//    offsets.
+// 2. Merge this pass with StackSlotColoring and allow merging of allocas with
+//    spill slots.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stackcoloring"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableColoring("no-stack-coloring",
+        cl::init(false), cl::Hidden,
+        cl::desc("Disable stack coloring"));
+
+/// The user may write code that uses allocas outside of the declared lifetime
+/// zone. This can happen when the user returns a reference to a local
+/// data-structure. We can detect these cases and decide not to optimize the
+/// code. If this flag is enabled, we try to save the user.
+static cl::opt<bool>
+ProtectFromEscapedAllocas("protect-from-escaped-allocas",
+                          cl::init(false), cl::Hidden,
+                          cl::desc("Do not optimize lifetime zones that "
+                                   "are broken"));
+
+STATISTIC(NumMarkerSeen,  "Number of lifetime markers found.");
+STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
+STATISTIC(StackSlotMerged, "Number of stack slot merged.");
+STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
+
+//===----------------------------------------------------------------------===//
+//                           StackColoring Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// StackColoring - A machine pass for merging disjoint stack allocations,
+/// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
+class StackColoring : public MachineFunctionPass {
+  MachineFrameInfo *MFI;
+  MachineFunction *MF;
+
+  /// A class representing liveness information for a single basic block.
+  /// Each bit in the BitVector represents the liveness property
+  /// for a different stack slot.
+  struct BlockLifetimeInfo {
+    /// Which slots BEGINs in each basic block.
+    BitVector Begin;
+    /// Which slots ENDs in each basic block.
+    BitVector End;
+    /// Which slots are marked as LIVE_IN, coming into each basic block.
+    BitVector LiveIn;
+    /// Which slots are marked as LIVE_OUT, coming out of each basic block.
+    BitVector LiveOut;
+  };
+
+  /// Maps active slots (per bit) for each basic block.
+  typedef DenseMap<const MachineBasicBlock*, BlockLifetimeInfo> LivenessMap;
+  LivenessMap BlockLiveness;
+
+  /// Maps serial numbers to basic blocks.
+  DenseMap<const MachineBasicBlock*, int> BasicBlocks;
+  /// Maps basic blocks to a serial number.
+  SmallVector<const MachineBasicBlock*, 8> BasicBlockNumbering;
+
+  /// Maps liveness intervals for each slot.
+  SmallVector<LiveInterval*, 16> Intervals;
+  /// VNInfo is used for the construction of LiveIntervals.
+  VNInfo::Allocator VNInfoAllocator;
+  /// SlotIndex analysis object.
+  SlotIndexes *Indexes;
+
+  /// The list of lifetime markers found. These markers are to be removed
+  /// once the coloring is done.
+  SmallVector<MachineInstr*, 8> Markers;
+
+  /// SlotSizeSorter - A Sort utility for arranging stack slots according
+  /// to their size.
+  struct SlotSizeSorter {
+    MachineFrameInfo *MFI;
+    SlotSizeSorter(MachineFrameInfo *mfi) : MFI(mfi) { }
+    bool operator()(int LHS, int RHS) {
+      // We use -1 to denote a uninteresting slot. Place these slots at the end.
+      if (LHS == -1) return false;
+      if (RHS == -1) return true;
+      // Sort according to size.
+      return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
+  }
+};
+
+public:
+  static char ID;
+  StackColoring() : MachineFunctionPass(ID) {
+    initializeStackColoringPass(*PassRegistry::getPassRegistry());
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnMachineFunction(MachineFunction &MF);
+
+private:
+  /// Debug.
+  void dump() const;
+
+  /// Removes all of the lifetime marker instructions from the function.
+  /// \returns true if any markers were removed.
+  bool removeAllMarkers();
+
+  /// Scan the machine function and find all of the lifetime markers.
+  /// Record the findings in the BEGIN and END vectors.
+  /// \returns the number of markers found.
+  unsigned collectMarkers(unsigned NumSlot);
+
+  /// Perform the dataflow calculation and calculate the lifetime for each of
+  /// the slots, based on the BEGIN/END vectors. Set the LifetimeLIVE_IN and
+  /// LifetimeLIVE_OUT maps that represent which stack slots are live coming
+  /// in and out blocks.
+  void calculateLocalLiveness();
+
+  /// Construct the LiveIntervals for the slots.
+  void calculateLiveIntervals(unsigned NumSlots);
+
+  /// Go over the machine function and change instructions which use stack
+  /// slots to use the joint slots.
+  void remapInstructions(DenseMap<int, int> &SlotRemap);
+
+  /// The input program may contain intructions which are not inside lifetime
+  /// markers. This can happen due to a bug in the compiler or due to a bug in
+  /// user code (for example, returning a reference to a local variable).
+  /// This procedure checks all of the instructions in the function and
+  /// invalidates lifetime ranges which do not contain all of the instructions
+  /// which access that frame slot.
+  void removeInvalidSlotRanges();
+
+  /// Map entries which point to other entries to their destination.
+  ///   A->B->C becomes A->C.
+   void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots);
+};
+} // end anonymous namespace
+
+char StackColoring::ID = 0;
+char &llvm::StackColoringID = StackColoring::ID;
+
+INITIALIZE_PASS_BEGIN(StackColoring,
+                   "stack-coloring", "Merge disjoint stack slots", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(StackColoring,
+                   "stack-coloring", "Merge disjoint stack slots", false, false)
+
+void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<SlotIndexes>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void StackColoring::dump() const {
+  for (df_iterator<MachineFunction*> FI = df_begin(MF), FE = df_end(MF);
+       FI != FE; ++FI) {
+    DEBUG(dbgs()<<"Inspecting block #"<<BasicBlocks.lookup(*FI)<<
+          " ["<<FI->getName()<<"]\n");
+
+    LivenessMap::const_iterator BI = BlockLiveness.find(*FI);
+    assert(BI != BlockLiveness.end() && "Block not found");
+    const BlockLifetimeInfo &BlockInfo = BI->second;
+
+    DEBUG(dbgs()<<"BEGIN  : {");
+    for (unsigned i=0; i < BlockInfo.Begin.size(); ++i)
+      DEBUG(dbgs()<<BlockInfo.Begin.test(i)<<" ");
+    DEBUG(dbgs()<<"}\n");
+
+    DEBUG(dbgs()<<"END    : {");
+    for (unsigned i=0; i < BlockInfo.End.size(); ++i)
+      DEBUG(dbgs()<<BlockInfo.End.test(i)<<" ");
+
+    DEBUG(dbgs()<<"}\n");
+
+    DEBUG(dbgs()<<"LIVE_IN: {");
+    for (unsigned i=0; i < BlockInfo.LiveIn.size(); ++i)
+      DEBUG(dbgs()<<BlockInfo.LiveIn.test(i)<<" ");
+
+    DEBUG(dbgs()<<"}\n");
+    DEBUG(dbgs()<<"LIVEOUT: {");
+    for (unsigned i=0; i < BlockInfo.LiveOut.size(); ++i)
+      DEBUG(dbgs()<<BlockInfo.LiveOut.test(i)<<" ");
+    DEBUG(dbgs()<<"}\n");
+  }
+}
+
+unsigned StackColoring::collectMarkers(unsigned NumSlot) {
+  unsigned MarkersFound = 0;
+  // Scan the function to find all lifetime markers.
+  // NOTE: We use the a reverse-post-order iteration to ensure that we obtain a
+  // deterministic numbering, and because we'll need a post-order iteration
+  // later for solving the liveness dataflow problem.
+  for (df_iterator<MachineFunction*> FI = df_begin(MF), FE = df_end(MF);
+       FI != FE; ++FI) {
+
+    // Assign a serial number to this basic block.
+    BasicBlocks[*FI] = BasicBlockNumbering.size();
+    BasicBlockNumbering.push_back(*FI);
+
+    // Keep a reference to avoid repeated lookups.
+    BlockLifetimeInfo &BlockInfo = BlockLiveness[*FI];
+
+    BlockInfo.Begin.resize(NumSlot);
+    BlockInfo.End.resize(NumSlot);
+
+    for (MachineBasicBlock::iterator BI = (*FI)->begin(), BE = (*FI)->end();
+         BI != BE; ++BI) {
+
+      if (BI->getOpcode() != TargetOpcode::LIFETIME_START &&
+          BI->getOpcode() != TargetOpcode::LIFETIME_END)
+        continue;
+
+      Markers.push_back(BI);
+
+      bool IsStart = BI->getOpcode() == TargetOpcode::LIFETIME_START;
+      const MachineOperand &MI = BI->getOperand(0);
+      unsigned Slot = MI.getIndex();
+
+      MarkersFound++;
+
+      const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
+      if (Allocation) {
+        DEBUG(dbgs()<<"Found a lifetime marker for slot #"<<Slot<<
+              " with allocation: "<< Allocation->getName()<<"\n");
+      }
+
+      if (IsStart) {
+        BlockInfo.Begin.set(Slot);
+      } else {
+        if (BlockInfo.Begin.test(Slot)) {
+          // Allocas that start and end within a single block are handled
+          // specially when computing the LiveIntervals to avoid pessimizing
+          // the liveness propagation.
+          BlockInfo.Begin.reset(Slot);
+        } else {
+          BlockInfo.End.set(Slot);
+        }
+      }
+    }
+  }
+
+  // Update statistics.
+  NumMarkerSeen += MarkersFound;
+  return MarkersFound;
+}
+
+void StackColoring::calculateLocalLiveness() {
+  // Perform a standard reverse dataflow computation to solve for
+  // global liveness.  The BEGIN set here is equivalent to KILL in the standard
+  // formulation, and END is equivalent to GEN.  The result of this computation
+  // is a map from blocks to bitvectors where the bitvectors represent which
+  // allocas are live in/out of that block.
+  SmallPtrSet<const MachineBasicBlock*, 8> BBSet(BasicBlockNumbering.begin(),
+                                                 BasicBlockNumbering.end());
+  unsigned NumSSMIters = 0;
+  bool changed = true;
+  while (changed) {
+    changed = false;
+    ++NumSSMIters;
+
+    SmallPtrSet<const MachineBasicBlock*, 8> NextBBSet;
+
+    for (SmallVector<const MachineBasicBlock*, 8>::iterator
+         PI = BasicBlockNumbering.begin(), PE = BasicBlockNumbering.end();
+         PI != PE; ++PI) {
+
+      const MachineBasicBlock *BB = *PI;
+      if (!BBSet.count(BB)) continue;
+
+      // Use an iterator to avoid repeated lookups.
+      LivenessMap::iterator BI = BlockLiveness.find(BB);
+      assert(BI != BlockLiveness.end() && "Block not found");
+      BlockLifetimeInfo &BlockInfo = BI->second;
+
+      BitVector LocalLiveIn;
+      BitVector LocalLiveOut;
+
+      // Forward propagation from begins to ends.
+      for (MachineBasicBlock::const_pred_iterator PI = BB->pred_begin(),
+           PE = BB->pred_end(); PI != PE; ++PI) {
+        LivenessMap::const_iterator I = BlockLiveness.find(*PI);
+        assert(I != BlockLiveness.end() && "Predecessor not found");
+        LocalLiveIn |= I->second.LiveOut;
+      }
+      LocalLiveIn |= BlockInfo.End;
+      LocalLiveIn.reset(BlockInfo.Begin);
+
+      // Reverse propagation from ends to begins.
+      for (MachineBasicBlock::const_succ_iterator SI = BB->succ_begin(),
+           SE = BB->succ_end(); SI != SE; ++SI) {
+        LivenessMap::const_iterator I = BlockLiveness.find(*SI);
+        assert(I != BlockLiveness.end() && "Successor not found");
+        LocalLiveOut |= I->second.LiveIn;
+      }
+      LocalLiveOut |= BlockInfo.Begin;
+      LocalLiveOut.reset(BlockInfo.End);
+
+      LocalLiveIn |= LocalLiveOut;
+      LocalLiveOut |= LocalLiveIn;
+
+      // After adopting the live bits, we need to turn-off the bits which
+      // are de-activated in this block.
+      LocalLiveOut.reset(BlockInfo.End);
+      LocalLiveIn.reset(BlockInfo.Begin);
+
+      // If we have both BEGIN and END markers in the same basic block then
+      // we know that the BEGIN marker comes after the END, because we already
+      // handle the case where the BEGIN comes before the END when collecting
+      // the markers (and building the BEGIN/END vectore).
+      // Want to enable the LIVE_IN and LIVE_OUT of slots that have both
+      // BEGIN and END because it means that the value lives before and after
+      // this basic block.
+      BitVector LocalEndBegin = BlockInfo.End;
+      LocalEndBegin &= BlockInfo.Begin;
+      LocalLiveIn |= LocalEndBegin;
+      LocalLiveOut |= LocalEndBegin;
+
+      if (LocalLiveIn.test(BlockInfo.LiveIn)) {
+        changed = true;
+        BlockInfo.LiveIn |= LocalLiveIn;
+
+        for (MachineBasicBlock::const_pred_iterator PI = BB->pred_begin(),
+             PE = BB->pred_end(); PI != PE; ++PI)
+          NextBBSet.insert(*PI);
+      }
+
+      if (LocalLiveOut.test(BlockInfo.LiveOut)) {
+        changed = true;
+        BlockInfo.LiveOut |= LocalLiveOut;
+
+        for (MachineBasicBlock::const_succ_iterator SI = BB->succ_begin(),
+             SE = BB->succ_end(); SI != SE; ++SI)
+          NextBBSet.insert(*SI);
+      }
+    }
+
+    BBSet = NextBBSet;
+  }// while changed.
+}
+
+void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
+  SmallVector<SlotIndex, 16> Starts;
+  SmallVector<SlotIndex, 16> Finishes;
+
+  // For each block, find which slots are active within this block
+  // and update the live intervals.
+  for (MachineFunction::iterator MBB = MF->begin(), MBBe = MF->end();
+       MBB != MBBe; ++MBB) {
+    Starts.clear();
+    Starts.resize(NumSlots);
+    Finishes.clear();
+    Finishes.resize(NumSlots);
+
+    // Create the interval for the basic blocks with lifetime markers in them.
+    for (SmallVectorImpl<MachineInstr*>::const_iterator it = Markers.begin(),
+         e = Markers.end(); it != e; ++it) {
+      const MachineInstr *MI = *it;
+      if (MI->getParent() != MBB)
+        continue;
+
+      assert((MI->getOpcode() == TargetOpcode::LIFETIME_START ||
+              MI->getOpcode() == TargetOpcode::LIFETIME_END) &&
+             "Invalid Lifetime marker");
+
+      bool IsStart = MI->getOpcode() == TargetOpcode::LIFETIME_START;
+      const MachineOperand &Mo = MI->getOperand(0);
+      int Slot = Mo.getIndex();
+      assert(Slot >= 0 && "Invalid slot");
+
+      SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
+
+      if (IsStart) {
+        if (!Starts[Slot].isValid() || Starts[Slot] > ThisIndex)
+          Starts[Slot] = ThisIndex;
+      } else {
+        if (!Finishes[Slot].isValid() || Finishes[Slot] < ThisIndex)
+          Finishes[Slot] = ThisIndex;
+      }
+    }
+
+    // Create the interval of the blocks that we previously found to be 'alive'.
+    BitVector Alive = BlockLiveness[MBB].LiveIn;
+    Alive |= BlockLiveness[MBB].LiveOut;
+
+    if (Alive.any()) {
+      for (int pos = Alive.find_first(); pos != -1;
+           pos = Alive.find_next(pos)) {
+        if (!Starts[pos].isValid())
+          Starts[pos] = Indexes->getMBBStartIdx(MBB);
+        if (!Finishes[pos].isValid())
+          Finishes[pos] = Indexes->getMBBEndIdx(MBB);
+      }
+    }
+
+    for (unsigned i = 0; i < NumSlots; ++i) {
+      assert(Starts[i].isValid() == Finishes[i].isValid() && "Unmatched range");
+      if (!Starts[i].isValid())
+        continue;
+
+      assert(Starts[i] && Finishes[i] && "Invalid interval");
+      VNInfo *ValNum = Intervals[i]->getValNumInfo(0);
+      SlotIndex S = Starts[i];
+      SlotIndex F = Finishes[i];
+      if (S < F) {
+        // We have a single consecutive region.
+        Intervals[i]->addRange(LiveRange(S, F, ValNum));
+      } else {
+        // We have two non consecutive regions. This happens when
+        // LIFETIME_START appears after the LIFETIME_END marker.
+        SlotIndex NewStart = Indexes->getMBBStartIdx(MBB);
+        SlotIndex NewFin = Indexes->getMBBEndIdx(MBB);
+        Intervals[i]->addRange(LiveRange(NewStart, F, ValNum));
+        Intervals[i]->addRange(LiveRange(S, NewFin, ValNum));
+      }
+    }
+  }
+}
+
+bool StackColoring::removeAllMarkers() {
+  unsigned Count = 0;
+  for (unsigned i = 0; i < Markers.size(); ++i) {
+    Markers[i]->eraseFromParent();
+    Count++;
+  }
+  Markers.clear();
+
+  DEBUG(dbgs()<<"Removed "<<Count<<" markers.\n");
+  return Count;
+}
+
+void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
+  unsigned FixedInstr = 0;
+  unsigned FixedMemOp = 0;
+  unsigned FixedDbg = 0;
+  MachineModuleInfo *MMI = &MF->getMMI();
+
+  // Remap debug information that refers to stack slots.
+  MachineModuleInfo::VariableDbgInfoMapTy &VMap = MMI->getVariableDbgInfo();
+  for (MachineModuleInfo::VariableDbgInfoMapTy::iterator VI = VMap.begin(),
+       VE = VMap.end(); VI != VE; ++VI) {
+    const MDNode *Var = VI->first;
+    if (!Var) continue;
+    std::pair<unsigned, DebugLoc> &VP = VI->second;
+    if (SlotRemap.count(VP.first)) {
+      DEBUG(dbgs()<<"Remapping debug info for ["<<Var->getName()<<"].\n");
+      VP.first = SlotRemap[VP.first];
+      FixedDbg++;
+    }
+  }
+
+  // Keep a list of *allocas* which need to be remapped.
+  DenseMap<const AllocaInst*, const AllocaInst*> Allocas;
+  for (DenseMap<int, int>::const_iterator it = SlotRemap.begin(),
+       e = SlotRemap.end(); it != e; ++it) {
+    const AllocaInst *From = MFI->getObjectAllocation(it->first);
+    const AllocaInst *To = MFI->getObjectAllocation(it->second);
+    assert(To && From && "Invalid allocation object");
+    Allocas[From] = To;
+  }
+
+  // Remap all instructions to the new stack slots.
+  MachineFunction::iterator BB, BBE;
+  MachineBasicBlock::iterator I, IE;
+  for (BB = MF->begin(), BBE = MF->end(); BB != BBE; ++BB)
+    for (I = BB->begin(), IE = BB->end(); I != IE; ++I) {
+
+      // Skip lifetime markers. We'll remove them soon.
+      if (I->getOpcode() == TargetOpcode::LIFETIME_START ||
+          I->getOpcode() == TargetOpcode::LIFETIME_END)
+        continue;
+
+      // Update the MachineMemOperand to use the new alloca.
+      for (MachineInstr::mmo_iterator MM = I->memoperands_begin(),
+           E = I->memoperands_end(); MM != E; ++MM) {
+        MachineMemOperand *MMO = *MM;
+
+        const Value *V = MMO->getValue();
+
+        if (!V)
+          continue;
+
+        // Climb up and find the original alloca.
+        V = GetUnderlyingObject(V);
+        // If we did not find one, or if the one that we found is not in our
+        // map, then move on.
+        if (!V || !isa<AllocaInst>(V)) {
+          // Clear mem operand since we don't know for sure that it doesn't
+          // alias a merged alloca.
+          MMO->setValue(0);
+          continue;
+        }
+        const AllocaInst *AI= cast<AllocaInst>(V);
+        if (!Allocas.count(AI))
+          continue;
+
+        MMO->setValue(Allocas[AI]);
+        FixedMemOp++;
+      }
+
+      // Update all of the machine instruction operands.
+      for (unsigned i = 0 ; i <  I->getNumOperands(); ++i) {
+        MachineOperand &MO = I->getOperand(i);
+
+        if (!MO.isFI())
+          continue;
+        int FromSlot = MO.getIndex();
+
+        // Don't touch arguments.
+        if (FromSlot<0)
+          continue;
+
+        // Only look at mapped slots.
+        if (!SlotRemap.count(FromSlot))
+          continue;
+
+        // In a debug build, check that the instruction that we are modifying is
+        // inside the expected live range. If the instruction is not inside
+        // the calculated range then it means that the alloca usage moved
+        // outside of the lifetime markers, or that the user has a bug.
+        // NOTE: Alloca address calculations which happen outside the lifetime
+        // zone are are okay, despite the fact that we don't have a good way
+        // for validating all of the usages of the calculation.
+#ifndef NDEBUG
+        bool TouchesMemory = I->mayLoad() || I->mayStore();
+        // If we *don't* protect the user from escaped allocas, don't bother
+        // validating the instructions.
+        if (!I->isDebugValue() && TouchesMemory && ProtectFromEscapedAllocas) {
+          SlotIndex Index = Indexes->getInstructionIndex(I);
+          LiveInterval *Interval = Intervals[FromSlot];
+          assert(Interval->find(Index) != Interval->end() &&
+                 "Found instruction usage outside of live range.");
+        }
+#endif
+
+        // Fix the machine instructions.
+        int ToSlot = SlotRemap[FromSlot];
+        MO.setIndex(ToSlot);
+        FixedInstr++;
+      }
+    }
+
+  DEBUG(dbgs()<<"Fixed "<<FixedMemOp<<" machine memory operands.\n");
+  DEBUG(dbgs()<<"Fixed "<<FixedDbg<<" debug locations.\n");
+  DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n");
+}
+
+void StackColoring::removeInvalidSlotRanges() {
+  MachineFunction::const_iterator BB, BBE;
+  MachineBasicBlock::const_iterator I, IE;
+  for (BB = MF->begin(), BBE = MF->end(); BB != BBE; ++BB)
+    for (I = BB->begin(), IE = BB->end(); I != IE; ++I) {
+
+      if (I->getOpcode() == TargetOpcode::LIFETIME_START ||
+          I->getOpcode() == TargetOpcode::LIFETIME_END || I->isDebugValue())
+        continue;
+
+      // Some intervals are suspicious! In some cases we find address
+      // calculations outside of the lifetime zone, but not actual memory
+      // read or write. Memory accesses outside of the lifetime zone are a clear
+      // violation, but address calculations are okay. This can happen when
+      // GEPs are hoisted outside of the lifetime zone.
+      // So, in here we only check instructions which can read or write memory.
+      if (!I->mayLoad() && !I->mayStore())
+        continue;
+
+      // Check all of the machine operands.
+      for (unsigned i = 0 ; i <  I->getNumOperands(); ++i) {
+        const MachineOperand &MO = I->getOperand(i);
+
+        if (!MO.isFI())
+          continue;
+
+        int Slot = MO.getIndex();
+
+        if (Slot<0)
+          continue;
+
+        if (Intervals[Slot]->empty())
+          continue;
+
+        // Check that the used slot is inside the calculated lifetime range.
+        // If it is not, warn about it and invalidate the range.
+        LiveInterval *Interval = Intervals[Slot];
+        SlotIndex Index = Indexes->getInstructionIndex(I);
+        if (Interval->find(Index) == Interval->end()) {
+          Intervals[Slot]->clear();
+          DEBUG(dbgs()<<"Invalidating range #"<<Slot<<"\n");
+          EscapedAllocas++;
+        }
+      }
+    }
+}
+
+void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
+                                   unsigned NumSlots) {
+  // Expunge slot remap map.
+  for (unsigned i=0; i < NumSlots; ++i) {
+    // If we are remapping i
+    if (SlotRemap.count(i)) {
+      int Target = SlotRemap[i];
+      // As long as our target is mapped to something else, follow it.
+      while (SlotRemap.count(Target)) {
+        Target = SlotRemap[Target];
+        SlotRemap[i] = Target;
+      }
+    }
+  }
+}
+
+bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
+  DEBUG(dbgs() << "********** Stack Coloring **********\n"
+               << "********** Function: "
+               << ((const Value*)Func.getFunction())->getName() << '\n');
+  MF = &Func;
+  MFI = MF->getFrameInfo();
+  Indexes = &getAnalysis<SlotIndexes>();
+  BlockLiveness.clear();
+  BasicBlocks.clear();
+  BasicBlockNumbering.clear();
+  Markers.clear();
+  Intervals.clear();
+  VNInfoAllocator.Reset();
+
+  unsigned NumSlots = MFI->getObjectIndexEnd();
+
+  // If there are no stack slots then there are no markers to remove.
+  if (!NumSlots)
+    return false;
+
+  SmallVector<int, 8> SortedSlots;
+
+  SortedSlots.reserve(NumSlots);
+  Intervals.reserve(NumSlots);
+
+  unsigned NumMarkers = collectMarkers(NumSlots);
+
+  unsigned TotalSize = 0;
+  DEBUG(dbgs()<<"Found "<<NumMarkers<<" markers and "<<NumSlots<<" slots\n");
+  DEBUG(dbgs()<<"Slot structure:\n");
+
+  for (int i=0; i < MFI->getObjectIndexEnd(); ++i) {
+    DEBUG(dbgs()<<"Slot #"<<i<<" - "<<MFI->getObjectSize(i)<<" bytes.\n");
+    TotalSize += MFI->getObjectSize(i);
+  }
+
+  DEBUG(dbgs()<<"Total Stack size: "<<TotalSize<<" bytes\n\n");
+
+  // Don't continue because there are not enough lifetime markers, or the
+  // stack is too small, or we are told not to optimize the slots.
+  if (NumMarkers < 2 || TotalSize < 16 || DisableColoring) {
+    DEBUG(dbgs()<<"Will not try to merge slots.\n");
+    return removeAllMarkers();
+  }
+
+  for (unsigned i=0; i < NumSlots; ++i) {
+    LiveInterval *LI = new LiveInterval(i, 0);
+    Intervals.push_back(LI);
+    LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
+    SortedSlots.push_back(i);
+  }
+
+  // Calculate the liveness of each block.
+  calculateLocalLiveness();
+
+  // Propagate the liveness information.
+  calculateLiveIntervals(NumSlots);
+
+  // Search for allocas which are used outside of the declared lifetime
+  // markers.
+  if (ProtectFromEscapedAllocas)
+    removeInvalidSlotRanges();
+
+  // Maps old slots to new slots.
+  DenseMap<int, int> SlotRemap;
+  unsigned RemovedSlots = 0;
+  unsigned ReducedSize = 0;
+
+  // Do not bother looking at empty intervals.
+  for (unsigned I = 0; I < NumSlots; ++I) {
+    if (Intervals[SortedSlots[I]]->empty())
+      SortedSlots[I] = -1;
+  }
+
+  // This is a simple greedy algorithm for merging allocas. First, sort the
+  // slots, placing the largest slots first. Next, perform an n^2 scan and look
+  // for disjoint slots. When you find disjoint slots, merge the samller one
+  // into the bigger one and update the live interval. Remove the small alloca
+  // and continue.
+
+  // Sort the slots according to their size. Place unused slots at the end.
+  // Use stable sort to guarantee deterministic code generation.
+  std::stable_sort(SortedSlots.begin(), SortedSlots.end(),
+                   SlotSizeSorter(MFI));
+
+  bool Changed = true;
+  while (Changed) {
+    Changed = false;
+    for (unsigned I = 0; I < NumSlots; ++I) {
+      if (SortedSlots[I] == -1)
+        continue;
+
+      for (unsigned J=I+1; J < NumSlots; ++J) {
+        if (SortedSlots[J] == -1)
+          continue;
+
+        int FirstSlot = SortedSlots[I];
+        int SecondSlot = SortedSlots[J];
+        LiveInterval *First = Intervals[FirstSlot];
+        LiveInterval *Second = Intervals[SecondSlot];
+        assert (!First->empty() && !Second->empty() && "Found an empty range");
+
+        // Merge disjoint slots.
+        if (!First->overlaps(*Second)) {
+          Changed = true;
+          First->MergeRangesInAsValue(*Second, First->getValNumInfo(0));
+          SlotRemap[SecondSlot] = FirstSlot;
+          SortedSlots[J] = -1;
+          DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
+                SecondSlot<<" together.\n");
+          unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot),
+                                           MFI->getObjectAlignment(SecondSlot));
+
+          assert(MFI->getObjectSize(FirstSlot) >=
+                 MFI->getObjectSize(SecondSlot) &&
+                 "Merging a small object into a larger one");
+
+          RemovedSlots+=1;
+          ReducedSize += MFI->getObjectSize(SecondSlot);
+          MFI->setObjectAlignment(FirstSlot, MaxAlignment);
+          MFI->RemoveStackObject(SecondSlot);
+        }
+      }
+    }
+  }// While changed.
+
+  // Record statistics.
+  StackSpaceSaved += ReducedSize;
+  StackSlotMerged += RemovedSlots;
+  DEBUG(dbgs()<<"Merge "<<RemovedSlots<<" slots. Saved "<<
+        ReducedSize<<" bytes\n");
+
+  // Scan the entire function and update all machine operands that use frame
+  // indices to use the remapped frame index.
+  expungeSlotMap(SlotRemap, NumSlots);
+  remapInstructions(SlotRemap);
+
+  // Release the intervals.
+  for (unsigned I = 0; I < NumSlots; ++I) {
+    delete Intervals[I];
+  }
+
+  return removeAllMarkers();
+}
diff --git a/contrib/llvm/lib/CodeGen/StackProtector.cpp b/contrib/llvm/lib/CodeGen/StackProtector.cpp
new file mode 100644
index 000000000000..fbef34772b08
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/StackProtector.cpp
@@ -0,0 +1,370 @@
+//===-- StackProtector.cpp - Stack Protector Insertion --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass inserts stack protectors into functions which need them. A variable
+// with a random value in it is stored onto the stack before the local variables
+// are allocated. Upon exiting the block, the stored value is checked. If it's
+// changed, then there was some sort of violation and the program aborts.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stack-protector"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+STATISTIC(NumFunProtected, "Number of functions protected");
+STATISTIC(NumAddrTaken, "Number of local variables that have their address"
+                        " taken.");
+
+namespace {
+  class StackProtector : public FunctionPass {
+    /// TLI - Keep a pointer of a TargetLowering to consult for determining
+    /// target type sizes.
+    const TargetLoweringBase *TLI;
+
+    Function *F;
+    Module *M;
+
+    DominatorTree *DT;
+
+    /// VisitedPHIs - The set of PHI nodes visited when determining
+    /// if a variable's reference has been taken.  This set 
+    /// is maintained to ensure we don't visit the same PHI node multiple
+    /// times.
+    SmallPtrSet<const PHINode*, 16> VisitedPHIs;
+
+    /// InsertStackProtectors - Insert code into the prologue and epilogue of
+    /// the function.
+    ///
+    ///  - The prologue code loads and stores the stack guard onto the stack.
+    ///  - The epilogue checks the value stored in the prologue against the
+    ///    original value. It calls __stack_chk_fail if they differ.
+    bool InsertStackProtectors();
+
+    /// CreateFailBB - Create a basic block to jump to when the stack protector
+    /// check fails.
+    BasicBlock *CreateFailBB();
+
+    /// ContainsProtectableArray - Check whether the type either is an array or
+    /// contains an array of sufficient size so that we need stack protectors
+    /// for it.
+    bool ContainsProtectableArray(Type *Ty, bool Strong = false,
+                                  bool InStruct = false) const;
+
+    /// \brief Check whether a stack allocation has its address taken.
+    bool HasAddressTaken(const Instruction *AI);
+
+    /// RequiresStackProtector - Check whether or not this function needs a
+    /// stack protector based upon the stack protector level.
+    bool RequiresStackProtector();
+  public:
+    static char ID;             // Pass identification, replacement for typeid.
+    StackProtector() : FunctionPass(ID), TLI(0) {
+      initializeStackProtectorPass(*PassRegistry::getPassRegistry());
+    }
+    StackProtector(const TargetLoweringBase *tli)
+      : FunctionPass(ID), TLI(tli) {
+      initializeStackProtectorPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addPreserved<DominatorTree>();
+    }
+
+    virtual bool runOnFunction(Function &Fn);
+  };
+} // end anonymous namespace
+
+char StackProtector::ID = 0;
+INITIALIZE_PASS(StackProtector, "stack-protector",
+                "Insert stack protectors", false, false)
+
+FunctionPass *llvm::createStackProtectorPass(const TargetLoweringBase *tli) {
+  return new StackProtector(tli);
+}
+
+bool StackProtector::runOnFunction(Function &Fn) {
+  F = &Fn;
+  M = F->getParent();
+  DT = getAnalysisIfAvailable<DominatorTree>();
+
+  if (!RequiresStackProtector()) return false;
+
+  ++NumFunProtected;
+  return InsertStackProtectors();
+}
+
+/// ContainsProtectableArray - Check whether the type either is an array or
+/// contains a char array of sufficient size so that we need stack protectors
+/// for it.
+bool StackProtector::ContainsProtectableArray(Type *Ty, bool Strong,
+                                              bool InStruct) const {
+  if (!Ty) return false;
+  if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+    // In strong mode any array, regardless of type and size, triggers a
+    // protector
+    if (Strong)
+      return true;
+    const TargetMachine &TM = TLI->getTargetMachine();
+    if (!AT->getElementType()->isIntegerTy(8)) {
+      Triple Trip(TM.getTargetTriple());
+
+      // If we're on a non-Darwin platform or we're inside of a structure, don't
+      // add stack protectors unless the array is a character array.
+      if (InStruct || !Trip.isOSDarwin())
+          return false;
+    }
+
+    // If an array has more than SSPBufferSize bytes of allocated space, then we
+    // emit stack protectors.
+    if (TM.Options.SSPBufferSize <= TLI->getDataLayout()->getTypeAllocSize(AT))
+      return true;
+  }
+
+  const StructType *ST = dyn_cast<StructType>(Ty);
+  if (!ST) return false;
+
+  for (StructType::element_iterator I = ST->element_begin(),
+         E = ST->element_end(); I != E; ++I)
+    if (ContainsProtectableArray(*I, Strong, true))
+      return true;
+
+  return false;
+}
+
+bool StackProtector::HasAddressTaken(const Instruction *AI) {
+  for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
+        UI != UE; ++UI) {
+    const User *U = *UI;
+    if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      if (AI == SI->getValueOperand())
+        return true;
+    } else if (const PtrToIntInst *SI = dyn_cast<PtrToIntInst>(U)) {
+      if (AI == SI->getOperand(0))
+        return true;
+    } else if (isa<CallInst>(U)) {
+      return true;
+    } else if (isa<InvokeInst>(U)) {
+      return true;
+    } else if (const SelectInst *SI = dyn_cast<SelectInst>(U)) {
+      if (HasAddressTaken(SI))
+        return true;
+    } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
+      // Keep track of what PHI nodes we have already visited to ensure
+      // they are only visited once.
+      if (VisitedPHIs.insert(PN))
+        if (HasAddressTaken(PN))
+          return true;
+    } else if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      if (HasAddressTaken(GEP))
+        return true;
+    } else if (const BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
+      if (HasAddressTaken(BI))
+        return true;
+    }
+  }
+  return false;
+}
+
+/// \brief Check whether or not this function needs a stack protector based
+/// upon the stack protector level.
+///
+/// We use two heuristics: a standard (ssp) and strong (sspstrong).
+/// The standard heuristic which will add a guard variable to functions that
+/// call alloca with a either a variable size or a size >= SSPBufferSize,
+/// functions with character buffers larger than SSPBufferSize, and functions
+/// with aggregates containing character buffers larger than SSPBufferSize. The
+/// strong heuristic will add a guard variables to functions that call alloca
+/// regardless of size, functions with any buffer regardless of type and size,
+/// functions with aggregates that contain any buffer regardless of type and
+/// size, and functions that contain stack-based variables that have had their
+/// address taken.
+bool StackProtector::RequiresStackProtector() {
+  bool Strong = false;
+  if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::StackProtectReq))
+    return true;
+  else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                           Attribute::StackProtectStrong))
+    Strong = true;
+  else if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                            Attribute::StackProtect))
+    return false;
+
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+    BasicBlock *BB = I;
+
+    for (BasicBlock::iterator
+           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+        if (AI->isArrayAllocation()) {
+          // SSP-Strong: Enable protectors for any call to alloca, regardless
+          // of size.
+          if (Strong)
+            return true;
+  
+          if (const ConstantInt *CI =
+               dyn_cast<ConstantInt>(AI->getArraySize())) {
+            unsigned BufferSize = TLI->getTargetMachine().Options.SSPBufferSize;
+            if (CI->getLimitedValue(BufferSize) >= BufferSize)
+              // A call to alloca with size >= SSPBufferSize requires
+              // stack protectors.
+              return true;
+          } else // A call to alloca with a variable size requires protectors.
+            return true;
+        }
+
+        if (ContainsProtectableArray(AI->getAllocatedType(), Strong))
+          return true;
+
+        if (Strong && HasAddressTaken(AI)) {
+          ++NumAddrTaken; 
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+/// InsertStackProtectors - Insert code into the prologue and epilogue of the
+/// function.
+///
+///  - The prologue code loads and stores the stack guard onto the stack.
+///  - The epilogue checks the value stored in the prologue against the original
+///    value. It calls __stack_chk_fail if they differ.
+bool StackProtector::InsertStackProtectors() {
+  BasicBlock *FailBB = 0;       // The basic block to jump to if check fails.
+  BasicBlock *FailBBDom = 0;    // FailBB's dominator.
+  AllocaInst *AI = 0;           // Place on stack that stores the stack guard.
+  Value *StackGuardVar = 0;  // The stack guard variable.
+
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ) {
+    BasicBlock *BB = I++;
+    ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
+    if (!RI) continue;
+
+    if (!FailBB) {
+      // Insert code into the entry block that stores the __stack_chk_guard
+      // variable onto the stack:
+      //
+      //   entry:
+      //     StackGuardSlot = alloca i8*
+      //     StackGuard = load __stack_chk_guard
+      //     call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
+      //
+      PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
+      unsigned AddressSpace, Offset;
+      if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
+        Constant *OffsetVal =
+          ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
+
+        StackGuardVar = ConstantExpr::getIntToPtr(OffsetVal,
+                                      PointerType::get(PtrTy, AddressSpace));
+      } else {
+        StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
+      }
+
+      BasicBlock &Entry = F->getEntryBlock();
+      Instruction *InsPt = &Entry.front();
+
+      AI = new AllocaInst(PtrTy, "StackGuardSlot", InsPt);
+      LoadInst *LI = new LoadInst(StackGuardVar, "StackGuard", false, InsPt);
+
+      Value *Args[] = { LI, AI };
+      CallInst::
+        Create(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
+               Args, "", InsPt);
+
+      // Create the basic block to jump to when the guard check fails.
+      FailBB = CreateFailBB();
+    }
+
+    // For each block with a return instruction, convert this:
+    //
+    //   return:
+    //     ...
+    //     ret ...
+    //
+    // into this:
+    //
+    //   return:
+    //     ...
+    //     %1 = load __stack_chk_guard
+    //     %2 = load StackGuardSlot
+    //     %3 = cmp i1 %1, %2
+    //     br i1 %3, label %SP_return, label %CallStackCheckFailBlk
+    //
+    //   SP_return:
+    //     ret ...
+    //
+    //   CallStackCheckFailBlk:
+    //     call void @__stack_chk_fail()
+    //     unreachable
+
+    // Split the basic block before the return instruction.
+    BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return");
+
+    if (DT && DT->isReachableFromEntry(BB)) {
+      DT->addNewBlock(NewBB, BB);
+      FailBBDom = FailBBDom ? DT->findNearestCommonDominator(FailBBDom, BB) :BB;
+    }
+
+    // Remove default branch instruction to the new BB.
+    BB->getTerminator()->eraseFromParent();
+
+    // Move the newly created basic block to the point right after the old basic
+    // block so that it's in the "fall through" position.
+    NewBB->moveAfter(BB);
+
+    // Generate the stack protector instructions in the old basic block.
+    LoadInst *LI1 = new LoadInst(StackGuardVar, "", false, BB);
+    LoadInst *LI2 = new LoadInst(AI, "", true, BB);
+    ICmpInst *Cmp = new ICmpInst(*BB, CmpInst::ICMP_EQ, LI1, LI2, "");
+    BranchInst::Create(NewBB, FailBB, Cmp, BB);
+  }
+
+  // Return if we didn't modify any basic blocks. I.e., there are no return
+  // statements in the function.
+  if (!FailBB) return false;
+
+  if (DT && FailBBDom)
+    DT->addNewBlock(FailBB, FailBBDom);
+
+  return true;
+}
+
+/// CreateFailBB - Create a basic block to jump to when the stack protector
+/// check fails.
+BasicBlock *StackProtector::CreateFailBB() {
+  BasicBlock *FailBB = BasicBlock::Create(F->getContext(),
+                                          "CallStackCheckFailBlk", F);
+  Constant *StackChkFail =
+    M->getOrInsertFunction("__stack_chk_fail",
+                           Type::getVoidTy(F->getContext()), NULL);
+  CallInst::Create(StackChkFail, "", FailBB);
+  new UnreachableInst(F->getContext(), FailBB);
+  return FailBB;
+}
diff --git a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
new file mode 100644
index 000000000000..f9515610d7e9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -0,0 +1,439 @@
+//===-- StackSlotColoring.cpp - Stack slot coloring pass. -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the stack slot coloring pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stackslotcoloring"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <vector>
+using namespace llvm;
+
+static cl::opt<bool>
+DisableSharing("no-stack-slot-sharing",
+             cl::init(false), cl::Hidden,
+             cl::desc("Suppress slot sharing during stack coloring"));
+
+static cl::opt<int> DCELimit("ssc-dce-limit", cl::init(-1), cl::Hidden);
+
+STATISTIC(NumEliminated, "Number of stack slots eliminated due to coloring");
+STATISTIC(NumDead,       "Number of trivially dead stack accesses eliminated");
+
+namespace {
+  class StackSlotColoring : public MachineFunctionPass {
+    LiveStacks* LS;
+    MachineFrameInfo *MFI;
+    const TargetInstrInfo  *TII;
+    const MachineLoopInfo *loopInfo;
+
+    // SSIntervals - Spill slot intervals.
+    std::vector<LiveInterval*> SSIntervals;
+
+    // SSRefs - Keep a list of frame index references for each spill slot.
+    SmallVector<SmallVector<MachineInstr*, 8>, 16> SSRefs;
+
+    // OrigAlignments - Alignments of stack objects before coloring.
+    SmallVector<unsigned, 16> OrigAlignments;
+
+    // OrigSizes - Sizess of stack objects before coloring.
+    SmallVector<unsigned, 16> OrigSizes;
+
+    // AllColors - If index is set, it's a spill slot, i.e. color.
+    // FIXME: This assumes PEI locate spill slot with smaller indices
+    // closest to stack pointer / frame pointer. Therefore, smaller
+    // index == better color.
+    BitVector AllColors;
+
+    // NextColor - Next "color" that's not yet used.
+    int NextColor;
+
+    // UsedColors - "Colors" that have been assigned.
+    BitVector UsedColors;
+
+    // Assignments - Color to intervals mapping.
+    SmallVector<SmallVector<LiveInterval*,4>, 16> Assignments;
+
+  public:
+    static char ID; // Pass identification
+    StackSlotColoring() :
+      MachineFunctionPass(ID), NextColor(-1) {
+        initializeStackSlotColoringPass(*PassRegistry::getPassRegistry());
+      }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<SlotIndexes>();
+      AU.addPreserved<SlotIndexes>();
+      AU.addRequired<LiveStacks>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  private:
+    void InitializeSlots();
+    void ScanForSpillSlotRefs(MachineFunction &MF);
+    bool OverlapWithAssignments(LiveInterval *li, int Color) const;
+    int ColorSlot(LiveInterval *li);
+    bool ColorSlots(MachineFunction &MF);
+    void RewriteInstruction(MachineInstr *MI, int OldFI, int NewFI,
+                            MachineFunction &MF);
+    bool RemoveDeadStores(MachineBasicBlock* MBB);
+  };
+} // end anonymous namespace
+
+char StackSlotColoring::ID = 0;
+char &llvm::StackSlotColoringID = StackSlotColoring::ID;
+
+INITIALIZE_PASS_BEGIN(StackSlotColoring, "stack-slot-coloring",
+                "Stack Slot Coloring", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(StackSlotColoring, "stack-slot-coloring",
+                "Stack Slot Coloring", false, false)
+
+namespace {
+  // IntervalSorter - Comparison predicate that sort live intervals by
+  // their weight.
+  struct IntervalSorter {
+    bool operator()(LiveInterval* LHS, LiveInterval* RHS) const {
+      return LHS->weight > RHS->weight;
+    }
+  };
+}
+
+/// ScanForSpillSlotRefs - Scan all the machine instructions for spill slot
+/// references and update spill slot weights.
+void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
+  SSRefs.resize(MFI->getObjectIndexEnd());
+
+  // FIXME: Need the equivalent of MachineRegisterInfo for frameindex operands.
+  for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock *MBB = &*MBBI;
+    unsigned loopDepth = loopInfo->getLoopDepth(MBB);
+    for (MachineBasicBlock::iterator MII = MBB->begin(), EE = MBB->end();
+         MII != EE; ++MII) {
+      MachineInstr *MI = &*MII;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (!MO.isFI())
+          continue;
+        int FI = MO.getIndex();
+        if (FI < 0)
+          continue;
+        if (!LS->hasInterval(FI))
+          continue;
+        LiveInterval &li = LS->getInterval(FI);
+        if (!MI->isDebugValue())
+          li.weight += LiveIntervals::getSpillWeight(false, true, loopDepth);
+        SSRefs[FI].push_back(MI);
+      }
+    }
+  }
+}
+
+/// InitializeSlots - Process all spill stack slot liveintervals and add them
+/// to a sorted (by weight) list.
+void StackSlotColoring::InitializeSlots() {
+  int LastFI = MFI->getObjectIndexEnd();
+  OrigAlignments.resize(LastFI);
+  OrigSizes.resize(LastFI);
+  AllColors.resize(LastFI);
+  UsedColors.resize(LastFI);
+  Assignments.resize(LastFI);
+
+  // Gather all spill slots into a list.
+  DEBUG(dbgs() << "Spill slot intervals:\n");
+  for (LiveStacks::iterator i = LS->begin(), e = LS->end(); i != e; ++i) {
+    LiveInterval &li = i->second;
+    DEBUG(li.dump());
+    int FI = TargetRegisterInfo::stackSlot2Index(li.reg);
+    if (MFI->isDeadObjectIndex(FI))
+      continue;
+    SSIntervals.push_back(&li);
+    OrigAlignments[FI] = MFI->getObjectAlignment(FI);
+    OrigSizes[FI]      = MFI->getObjectSize(FI);
+    AllColors.set(FI);
+  }
+  DEBUG(dbgs() << '\n');
+
+  // Sort them by weight.
+  std::stable_sort(SSIntervals.begin(), SSIntervals.end(), IntervalSorter());
+
+  // Get first "color".
+  NextColor = AllColors.find_first();
+}
+
+/// OverlapWithAssignments - Return true if LiveInterval overlaps with any
+/// LiveIntervals that have already been assigned to the specified color.
+bool
+StackSlotColoring::OverlapWithAssignments(LiveInterval *li, int Color) const {
+  const SmallVector<LiveInterval*,4> &OtherLIs = Assignments[Color];
+  for (unsigned i = 0, e = OtherLIs.size(); i != e; ++i) {
+    LiveInterval *OtherLI = OtherLIs[i];
+    if (OtherLI->overlaps(*li))
+      return true;
+  }
+  return false;
+}
+
+/// ColorSlot - Assign a "color" (stack slot) to the specified stack slot.
+///
+int StackSlotColoring::ColorSlot(LiveInterval *li) {
+  int Color = -1;
+  bool Share = false;
+  if (!DisableSharing) {
+    // Check if it's possible to reuse any of the used colors.
+    Color = UsedColors.find_first();
+    while (Color != -1) {
+      if (!OverlapWithAssignments(li, Color)) {
+        Share = true;
+        ++NumEliminated;
+        break;
+      }
+      Color = UsedColors.find_next(Color);
+    }
+  }
+
+  // Assign it to the first available color (assumed to be the best) if it's
+  // not possible to share a used color with other objects.
+  if (!Share) {
+    assert(NextColor != -1 && "No more spill slots?");
+    Color = NextColor;
+    UsedColors.set(Color);
+    NextColor = AllColors.find_next(NextColor);
+  }
+
+  // Record the assignment.
+  Assignments[Color].push_back(li);
+  int FI = TargetRegisterInfo::stackSlot2Index(li->reg);
+  DEBUG(dbgs() << "Assigning fi#" << FI << " to fi#" << Color << "\n");
+
+  // Change size and alignment of the allocated slot. If there are multiple
+  // objects sharing the same slot, then make sure the size and alignment
+  // are large enough for all.
+  unsigned Align = OrigAlignments[FI];
+  if (!Share || Align > MFI->getObjectAlignment(Color))
+    MFI->setObjectAlignment(Color, Align);
+  int64_t Size = OrigSizes[FI];
+  if (!Share || Size > MFI->getObjectSize(Color))
+    MFI->setObjectSize(Color, Size);
+  return Color;
+}
+
+/// Colorslots - Color all spill stack slots and rewrite all frameindex machine
+/// operands in the function.
+bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
+  unsigned NumObjs = MFI->getObjectIndexEnd();
+  SmallVector<int, 16> SlotMapping(NumObjs, -1);
+  SmallVector<float, 16> SlotWeights(NumObjs, 0.0);
+  SmallVector<SmallVector<int, 4>, 16> RevMap(NumObjs);
+  BitVector UsedColors(NumObjs);
+
+  DEBUG(dbgs() << "Color spill slot intervals:\n");
+  bool Changed = false;
+  for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+    LiveInterval *li = SSIntervals[i];
+    int SS = TargetRegisterInfo::stackSlot2Index(li->reg);
+    int NewSS = ColorSlot(li);
+    assert(NewSS >= 0 && "Stack coloring failed?");
+    SlotMapping[SS] = NewSS;
+    RevMap[NewSS].push_back(SS);
+    SlotWeights[NewSS] += li->weight;
+    UsedColors.set(NewSS);
+    Changed |= (SS != NewSS);
+  }
+
+  DEBUG(dbgs() << "\nSpill slots after coloring:\n");
+  for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+    LiveInterval *li = SSIntervals[i];
+    int SS = TargetRegisterInfo::stackSlot2Index(li->reg);
+    li->weight = SlotWeights[SS];
+  }
+  // Sort them by new weight.
+  std::stable_sort(SSIntervals.begin(), SSIntervals.end(), IntervalSorter());
+
+#ifndef NDEBUG
+  for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i)
+    DEBUG(SSIntervals[i]->dump());
+  DEBUG(dbgs() << '\n');
+#endif
+
+  if (!Changed)
+    return false;
+
+  // Rewrite all MO_FrameIndex operands.
+  SmallVector<SmallSet<unsigned, 4>, 4> NewDefs(MF.getNumBlockIDs());
+  for (unsigned SS = 0, SE = SSRefs.size(); SS != SE; ++SS) {
+    int NewFI = SlotMapping[SS];
+    if (NewFI == -1 || (NewFI == (int)SS))
+      continue;
+
+    SmallVector<MachineInstr*, 8> &RefMIs = SSRefs[SS];
+    for (unsigned i = 0, e = RefMIs.size(); i != e; ++i)
+      RewriteInstruction(RefMIs[i], SS, NewFI, MF);
+  }
+
+  // Delete unused stack slots.
+  while (NextColor != -1) {
+    DEBUG(dbgs() << "Removing unused stack object fi#" << NextColor << "\n");
+    MFI->RemoveStackObject(NextColor);
+    NextColor = AllColors.find_next(NextColor);
+  }
+
+  return true;
+}
+
+/// RewriteInstruction - Rewrite specified instruction by replacing references
+/// to old frame index with new one.
+void StackSlotColoring::RewriteInstruction(MachineInstr *MI, int OldFI,
+                                           int NewFI, MachineFunction &MF) {
+  // Update the operands.
+  for (unsigned i = 0, ee = MI->getNumOperands(); i != ee; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isFI())
+      continue;
+    int FI = MO.getIndex();
+    if (FI != OldFI)
+      continue;
+    MO.setIndex(NewFI);
+  }
+
+  // Update the memory references. This changes the MachineMemOperands
+  // directly. They may be in use by multiple instructions, however all
+  // instructions using OldFI are being rewritten to use NewFI.
+  const Value *OldSV = PseudoSourceValue::getFixedStack(OldFI);
+  const Value *NewSV = PseudoSourceValue::getFixedStack(NewFI);
+  for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+       E = MI->memoperands_end(); I != E; ++I)
+    if ((*I)->getValue() == OldSV)
+      (*I)->setValue(NewSV);
+}
+
+
+/// RemoveDeadStores - Scan through a basic block and look for loads followed
+/// by stores.  If they're both using the same stack slot, then the store is
+/// definitely dead.  This could obviously be much more aggressive (consider
+/// pairs with instructions between them), but such extensions might have a
+/// considerable compile time impact.
+bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
+  // FIXME: This could be much more aggressive, but we need to investigate
+  // the compile time impact of doing so.
+  bool changed = false;
+
+  SmallVector<MachineInstr*, 4> toErase;
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+       I != E; ++I) {
+    if (DCELimit != -1 && (int)NumDead >= DCELimit)
+      break;
+
+    MachineBasicBlock::iterator NextMI = llvm::next(I);
+    if (NextMI == MBB->end()) continue;
+
+    int FirstSS, SecondSS;
+    unsigned LoadReg = 0;
+    unsigned StoreReg = 0;
+    if (!(LoadReg = TII->isLoadFromStackSlot(I, FirstSS))) continue;
+    if (!(StoreReg = TII->isStoreToStackSlot(NextMI, SecondSS))) continue;
+    if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
+
+    ++NumDead;
+    changed = true;
+
+    if (NextMI->findRegisterUseOperandIdx(LoadReg, true, 0) != -1) {
+      ++NumDead;
+      toErase.push_back(I);
+    }
+
+    toErase.push_back(NextMI);
+    ++I;
+  }
+
+  for (SmallVector<MachineInstr*, 4>::iterator I = toErase.begin(),
+       E = toErase.end(); I != E; ++I)
+    (*I)->eraseFromParent();
+
+  return changed;
+}
+
+
+bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG({
+      dbgs() << "********** Stack Slot Coloring **********\n"
+             << "********** Function: " << MF.getName() << '\n';
+    });
+
+  MFI = MF.getFrameInfo();
+  TII = MF.getTarget().getInstrInfo();
+  LS = &getAnalysis<LiveStacks>();
+  loopInfo = &getAnalysis<MachineLoopInfo>();
+
+  bool Changed = false;
+
+  unsigned NumSlots = LS->getNumIntervals();
+  if (NumSlots == 0)
+    // Nothing to do!
+    return false;
+
+  // If there are calls to setjmp or sigsetjmp, don't perform stack slot
+  // coloring. The stack could be modified before the longjmp is executed,
+  // resulting in the wrong value being used afterwards. (See
+  // <rdar://problem/8007500>.)
+  if (MF.exposesReturnsTwice())
+    return false;
+
+  // Gather spill slot references
+  ScanForSpillSlotRefs(MF);
+  InitializeSlots();
+  Changed = ColorSlots(MF);
+
+  NextColor = -1;
+  SSIntervals.clear();
+  for (unsigned i = 0, e = SSRefs.size(); i != e; ++i)
+    SSRefs[i].clear();
+  SSRefs.clear();
+  OrigAlignments.clear();
+  OrigSizes.clear();
+  AllColors.clear();
+  UsedColors.clear();
+  for (unsigned i = 0, e = Assignments.size(); i != e; ++i)
+    Assignments[i].clear();
+  Assignments.clear();
+
+  if (Changed) {
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      Changed |= RemoveDeadStores(I);
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/StrongPHIElimination.cpp b/contrib/llvm/lib/CodeGen/StrongPHIElimination.cpp
new file mode 100644
index 000000000000..b337c5393343
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/StrongPHIElimination.cpp
@@ -0,0 +1,825 @@
+//===- StrongPHIElimination.cpp - Eliminate PHI nodes by inserting copies -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates PHI instructions by aggressively coalescing the copies
+// that would be inserted by a naive algorithm and only inserting the copies
+// that are necessary. The coalescing technique initially assumes that all
+// registers appearing in a PHI instruction do not interfere. It then eliminates
+// proven interferences, using dominators to only perform a linear number of
+// interference tests instead of the quadratic number of interference tests
+// that this would naively require. This is a technique derived from:
+// 
+//    Budimlic, et al. Fast copy coalescing and live-range identification.
+//    In Proceedings of the ACM SIGPLAN 2002 Conference on Programming Language
+//    Design and Implementation (Berlin, Germany, June 17 - 19, 2002).
+//    PLDI '02. ACM, New York, NY, 25-32.
+//
+// The original implementation constructs a data structure they call a dominance
+// forest for this purpose. The dominance forest was shown to be unnecessary,
+// as it is possible to emulate the creation and traversal of a dominance forest
+// by directly using the dominator tree, rather than actually constructing the
+// dominance forest.  This technique is explained in:
+//
+//   Boissinot, et al. Revisiting Out-of-SSA Translation for Correctness, Code
+//     Quality and Efficiency,
+//   In Proceedings of the 7th annual IEEE/ACM International Symposium on Code
+//   Generation and Optimization (Seattle, Washington, March 22 - 25, 2009).
+//   CGO '09. IEEE, Washington, DC, 114-125.
+//
+// Careful implementation allows for all of the dominator forest interference
+// checks to be performed at once in a single depth-first traversal of the
+// dominator tree, which is what is implemented here.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "strongphielim"
+#include "llvm/CodeGen/Passes.h"
+#include "PHIEliminationUtils.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+namespace {
+  class StrongPHIElimination : public MachineFunctionPass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    StrongPHIElimination() : MachineFunctionPass(ID) {
+      initializeStrongPHIEliminationPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage&) const;
+    bool runOnMachineFunction(MachineFunction&);
+
+  private:
+    /// This struct represents a single node in the union-find data structure
+    /// representing the variable congruence classes. There is one difference
+    /// from a normal union-find data structure. We steal two bits from the parent
+    /// pointer . One of these bits is used to represent whether the register
+    /// itself has been isolated, and the other is used to represent whether the
+    /// PHI with that register as its destination has been isolated.
+    ///
+    /// Note that this leads to the strange situation where the leader of a
+    /// congruence class may no longer logically be a member, due to being
+    /// isolated.
+    struct Node {
+      enum Flags {
+        kRegisterIsolatedFlag = 1,
+        kPHIIsolatedFlag = 2
+      };
+      Node(unsigned v) : value(v), rank(0) { parent.setPointer(this); }
+
+      Node *getLeader();
+
+      PointerIntPair<Node*, 2> parent;
+      unsigned value;
+      unsigned rank;
+    };
+
+    /// Add a register in a new congruence class containing only itself.
+    void addReg(unsigned);
+
+    /// Join the congruence classes of two registers. This function is biased
+    /// towards the left argument, i.e. after
+    ///
+    /// addReg(r2);
+    /// unionRegs(r1, r2);
+    ///
+    /// the leader of the unioned congruence class is the same as the leader of
+    /// r1's congruence class prior to the union. This is actually relied upon
+    /// in the copy insertion code.
+    void unionRegs(unsigned, unsigned);
+
+    /// Get the color of a register. The color is 0 if the register has been
+    /// isolated.
+    unsigned getRegColor(unsigned);
+
+    // Isolate a register.
+    void isolateReg(unsigned);
+
+    /// Get the color of a PHI. The color of a PHI is 0 if the PHI has been
+    /// isolated. Otherwise, it is the original color of its destination and
+    /// all of its operands (before they were isolated, if they were).
+    unsigned getPHIColor(MachineInstr*);
+
+    /// Isolate a PHI.
+    void isolatePHI(MachineInstr*);
+
+    /// Traverses a basic block, splitting any interferences found between
+    /// registers in the same congruence class. It takes two DenseMaps as
+    /// arguments that it also updates: CurrentDominatingParent, which maps
+    /// a color to the register in that congruence class whose definition was
+    /// most recently seen, and ImmediateDominatingParent, which maps a register
+    /// to the register in the same congruence class that most immediately
+    /// dominates it.
+    ///
+    /// This function assumes that it is being called in a depth-first traversal
+    /// of the dominator tree.
+    void SplitInterferencesForBasicBlock(
+      MachineBasicBlock&,
+      DenseMap<unsigned, unsigned> &CurrentDominatingParent,
+      DenseMap<unsigned, unsigned> &ImmediateDominatingParent);
+
+    // Lowers a PHI instruction, inserting copies of the source and destination
+    // registers as necessary.
+    void InsertCopiesForPHI(MachineInstr*, MachineBasicBlock*);
+
+    // Merges the live interval of Reg into NewReg and renames Reg to NewReg
+    // everywhere that Reg appears. Requires Reg and NewReg to have non-
+    // overlapping lifetimes.
+    void MergeLIsAndRename(unsigned Reg, unsigned NewReg);
+
+    MachineRegisterInfo *MRI;
+    const TargetInstrInfo *TII;
+    MachineDominatorTree *DT;
+    LiveIntervals *LI;
+
+    BumpPtrAllocator Allocator;
+
+    DenseMap<unsigned, Node*> RegNodeMap;
+
+    // Maps a basic block to a list of its defs of registers that appear as PHI
+    // sources.
+    DenseMap<MachineBasicBlock*, std::vector<MachineInstr*> > PHISrcDefs;
+
+    // Maps a color to a pair of a MachineInstr* and a virtual register, which
+    // is the operand of that PHI corresponding to the current basic block.
+    DenseMap<unsigned, std::pair<MachineInstr*, unsigned> > CurrentPHIForColor;
+
+    // FIXME: Can these two data structures be combined? Would a std::multimap
+    // be any better?
+
+    // Stores pairs of predecessor basic blocks and the source registers of
+    // inserted copy instructions.
+    typedef DenseSet<std::pair<MachineBasicBlock*, unsigned> > SrcCopySet;
+    SrcCopySet InsertedSrcCopySet;
+
+    // Maps pairs of predecessor basic blocks and colors to their defining copy
+    // instructions.
+    typedef DenseMap<std::pair<MachineBasicBlock*, unsigned>, MachineInstr*>
+      SrcCopyMap;
+    SrcCopyMap InsertedSrcCopyMap;
+
+    // Maps inserted destination copy registers to their defining copy
+    // instructions.
+    typedef DenseMap<unsigned, MachineInstr*> DestCopyMap;
+    DestCopyMap InsertedDestCopies;
+  };
+
+  struct MIIndexCompare {
+    MIIndexCompare(LiveIntervals *LiveIntervals) : LI(LiveIntervals) { }
+
+    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
+      return LI->getInstructionIndex(LHS) < LI->getInstructionIndex(RHS);
+    }
+
+    LiveIntervals *LI;
+  };
+} // namespace
+
+STATISTIC(NumPHIsLowered, "Number of PHIs lowered");
+STATISTIC(NumDestCopiesInserted, "Number of destination copies inserted");
+STATISTIC(NumSrcCopiesInserted, "Number of source copies inserted");
+
+char StrongPHIElimination::ID = 0;
+INITIALIZE_PASS_BEGIN(StrongPHIElimination, "strong-phi-node-elimination",
+  "Eliminate PHI nodes for register allocation, intelligently", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(StrongPHIElimination, "strong-phi-node-elimination",
+  "Eliminate PHI nodes for register allocation, intelligently", false, false)
+
+char &llvm::StrongPHIEliminationID = StrongPHIElimination::ID;
+
+void StrongPHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<SlotIndexes>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequired<LiveIntervals>();
+  AU.addPreserved<LiveIntervals>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static MachineOperand *findLastUse(MachineBasicBlock *MBB, unsigned Reg) {
+  // FIXME: This only needs to check from the first terminator, as only the
+  // first terminator can use a virtual register.
+  for (MachineBasicBlock::reverse_iterator RI = MBB->rbegin(); ; ++RI) {
+    assert (RI != MBB->rend());
+    MachineInstr *MI = &*RI;
+
+    for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+         OE = MI->operands_end(); OI != OE; ++OI) {
+      MachineOperand &MO = *OI;
+      if (MO.isReg() && MO.isUse() && MO.getReg() == Reg)
+        return &MO;
+    }
+  }
+}
+
+bool StrongPHIElimination::runOnMachineFunction(MachineFunction &MF) {
+  MRI = &MF.getRegInfo();
+  TII = MF.getTarget().getInstrInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+  LI = &getAnalysis<LiveIntervals>();
+
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
+         BBI != BBE && BBI->isPHI(); ++BBI) {
+      unsigned DestReg = BBI->getOperand(0).getReg();
+      addReg(DestReg);
+      PHISrcDefs[I].push_back(BBI);
+
+      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
+        MachineOperand &SrcMO = BBI->getOperand(i);
+        unsigned SrcReg = SrcMO.getReg();
+        addReg(SrcReg);
+        unionRegs(DestReg, SrcReg);
+
+        MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+        if (DefMI)
+          PHISrcDefs[DefMI->getParent()].push_back(DefMI);
+      }
+    }
+  }
+
+  // Perform a depth-first traversal of the dominator tree, splitting
+  // interferences amongst PHI-congruence classes.
+  DenseMap<unsigned, unsigned> CurrentDominatingParent;
+  DenseMap<unsigned, unsigned> ImmediateDominatingParent;
+  for (df_iterator<MachineDomTreeNode*> DI = df_begin(DT->getRootNode()),
+       DE = df_end(DT->getRootNode()); DI != DE; ++DI) {
+    SplitInterferencesForBasicBlock(*DI->getBlock(),
+                                    CurrentDominatingParent,
+                                    ImmediateDominatingParent);
+  }
+
+  // Insert copies for all PHI source and destination registers.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
+         BBI != BBE && BBI->isPHI(); ++BBI) {
+      InsertCopiesForPHI(BBI, I);
+    }
+  }
+
+  // FIXME: Preserve the equivalence classes during copy insertion and use
+  // the preversed equivalence classes instead of recomputing them.
+  RegNodeMap.clear();
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
+         BBI != BBE && BBI->isPHI(); ++BBI) {
+      unsigned DestReg = BBI->getOperand(0).getReg();
+      addReg(DestReg);
+
+      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
+        unsigned SrcReg = BBI->getOperand(i).getReg();
+        addReg(SrcReg);
+        unionRegs(DestReg, SrcReg);
+      }
+    }
+  }
+
+  DenseMap<unsigned, unsigned> RegRenamingMap;
+  bool Changed = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
+    while (BBI != BBE && BBI->isPHI()) {
+      MachineInstr *PHI = BBI;
+
+      assert(PHI->getNumOperands() > 0);
+
+      unsigned SrcReg = PHI->getOperand(1).getReg();
+      unsigned SrcColor = getRegColor(SrcReg);
+      unsigned NewReg = RegRenamingMap[SrcColor];
+      if (!NewReg) {
+        NewReg = SrcReg;
+        RegRenamingMap[SrcColor] = SrcReg;
+      }
+      MergeLIsAndRename(SrcReg, NewReg);
+
+      unsigned DestReg = PHI->getOperand(0).getReg();
+      if (!InsertedDestCopies.count(DestReg))
+        MergeLIsAndRename(DestReg, NewReg);
+
+      for (unsigned i = 3; i < PHI->getNumOperands(); i += 2) {
+        unsigned SrcReg = PHI->getOperand(i).getReg();
+        MergeLIsAndRename(SrcReg, NewReg);
+      }
+
+      ++BBI;
+      LI->RemoveMachineInstrFromMaps(PHI);
+      PHI->eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  // Due to the insertion of copies to split live ranges, the live intervals are
+  // guaranteed to not overlap, except in one case: an original PHI source and a
+  // PHI destination copy. In this case, they have the same value and thus don't
+  // truly intersect, so we merge them into the value live at that point.
+  // FIXME: Is there some better way we can handle this?
+  for (DestCopyMap::iterator I = InsertedDestCopies.begin(),
+       E = InsertedDestCopies.end(); I != E; ++I) {
+    unsigned DestReg = I->first;
+    unsigned DestColor = getRegColor(DestReg);
+    unsigned NewReg = RegRenamingMap[DestColor];
+
+    LiveInterval &DestLI = LI->getInterval(DestReg);
+    LiveInterval &NewLI = LI->getInterval(NewReg);
+
+    assert(DestLI.ranges.size() == 1
+           && "PHI destination copy's live interval should be a single live "
+               "range from the beginning of the BB to the copy instruction.");
+    LiveRange *DestLR = DestLI.begin();
+    VNInfo *NewVNI = NewLI.getVNInfoAt(DestLR->start);
+    if (!NewVNI) {
+      NewVNI = NewLI.createValueCopy(DestLR->valno, LI->getVNInfoAllocator());
+      MachineInstr *CopyInstr = I->second;
+      CopyInstr->getOperand(1).setIsKill(true);
+    }
+
+    LiveRange NewLR(DestLR->start, DestLR->end, NewVNI);
+    NewLI.addRange(NewLR);
+
+    LI->removeInterval(DestReg);
+    MRI->replaceRegWith(DestReg, NewReg);
+  }
+
+  // Adjust the live intervals of all PHI source registers to handle the case
+  // where the PHIs in successor blocks were the only later uses of the source
+  // register.
+  for (SrcCopySet::iterator I = InsertedSrcCopySet.begin(),
+       E = InsertedSrcCopySet.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I->first;
+    unsigned SrcReg = I->second;
+    if (unsigned RenamedRegister = RegRenamingMap[getRegColor(SrcReg)])
+      SrcReg = RenamedRegister;
+
+    LiveInterval &SrcLI = LI->getInterval(SrcReg);
+
+    bool isLiveOut = false;
+    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+      if (SrcLI.liveAt(LI->getMBBStartIdx(*SI))) {
+        isLiveOut = true;
+        break;
+      }
+    }
+
+    if (isLiveOut)
+      continue;
+
+    MachineOperand *LastUse = findLastUse(MBB, SrcReg);
+    assert(LastUse);
+    SlotIndex LastUseIndex = LI->getInstructionIndex(LastUse->getParent());
+    SrcLI.removeRange(LastUseIndex.getRegSlot(), LI->getMBBEndIdx(MBB));
+    LastUse->setIsKill(true);
+  }
+
+  Allocator.Reset();
+  RegNodeMap.clear();
+  PHISrcDefs.clear();
+  InsertedSrcCopySet.clear();
+  InsertedSrcCopyMap.clear();
+  InsertedDestCopies.clear();
+
+  return Changed;
+}
+
+void StrongPHIElimination::addReg(unsigned Reg) {
+  Node *&N = RegNodeMap[Reg];
+  if (!N)
+    N = new (Allocator) Node(Reg);
+}
+
+StrongPHIElimination::Node*
+StrongPHIElimination::Node::getLeader() {
+  Node *N = this;
+  Node *Parent = parent.getPointer();
+  Node *Grandparent = Parent->parent.getPointer();
+
+  while (Parent != Grandparent) {
+    N->parent.setPointer(Grandparent);
+    N = Grandparent;
+    Parent = Parent->parent.getPointer();
+    Grandparent = Parent->parent.getPointer();
+  }
+
+  return Parent;
+}
+
+unsigned StrongPHIElimination::getRegColor(unsigned Reg) {
+  DenseMap<unsigned, Node*>::iterator RI = RegNodeMap.find(Reg);
+  if (RI == RegNodeMap.end())
+    return 0;
+  Node *Node = RI->second;
+  if (Node->parent.getInt() & Node::kRegisterIsolatedFlag)
+    return 0;
+  return Node->getLeader()->value;
+}
+
+void StrongPHIElimination::unionRegs(unsigned Reg1, unsigned Reg2) {
+  Node *Node1 = RegNodeMap[Reg1]->getLeader();
+  Node *Node2 = RegNodeMap[Reg2]->getLeader();
+
+  if (Node1->rank > Node2->rank) {
+    Node2->parent.setPointer(Node1->getLeader());
+  } else if (Node1->rank < Node2->rank) {
+    Node1->parent.setPointer(Node2->getLeader());
+  } else if (Node1 != Node2) {
+    Node2->parent.setPointer(Node1->getLeader());
+    Node1->rank++;
+  }
+}
+
+void StrongPHIElimination::isolateReg(unsigned Reg) {
+  Node *Node = RegNodeMap[Reg];
+  Node->parent.setInt(Node->parent.getInt() | Node::kRegisterIsolatedFlag);
+}
+
+unsigned StrongPHIElimination::getPHIColor(MachineInstr *PHI) {
+  assert(PHI->isPHI());
+
+  unsigned DestReg = PHI->getOperand(0).getReg();
+  Node *DestNode = RegNodeMap[DestReg];
+  if (DestNode->parent.getInt() & Node::kPHIIsolatedFlag)
+    return 0;
+
+  for (unsigned i = 1; i < PHI->getNumOperands(); i += 2) {
+    unsigned SrcColor = getRegColor(PHI->getOperand(i).getReg());
+    if (SrcColor)
+      return SrcColor;
+  }
+  return 0;
+}
+
+void StrongPHIElimination::isolatePHI(MachineInstr *PHI) {
+  assert(PHI->isPHI());
+  Node *Node = RegNodeMap[PHI->getOperand(0).getReg()];
+  Node->parent.setInt(Node->parent.getInt() | Node::kPHIIsolatedFlag);
+}
+
+/// SplitInterferencesForBasicBlock - traverses a basic block, splitting any
+/// interferences found between registers in the same congruence class. It
+/// takes two DenseMaps as arguments that it also updates:
+///
+/// 1) CurrentDominatingParent, which maps a color to the register in that
+///    congruence class whose definition was most recently seen.
+///
+/// 2) ImmediateDominatingParent, which maps a register to the register in the
+///    same congruence class that most immediately dominates it.
+///
+/// This function assumes that it is being called in a depth-first traversal
+/// of the dominator tree.
+///
+/// The algorithm used here is a generalization of the dominance-based SSA test
+/// for two variables. If there are variables a_1, ..., a_n such that
+///
+///   def(a_1) dom ... dom def(a_n),
+///
+/// then we can test for an interference between any two a_i by only using O(n)
+/// interference tests between pairs of variables. If i < j and a_i and a_j
+/// interfere, then a_i is alive at def(a_j), so it is also alive at def(a_i+1).
+/// Thus, in order to test for an interference involving a_i, we need only check
+/// for a potential interference with a_i+1.
+///
+/// This method can be generalized to arbitrary sets of variables by performing
+/// a depth-first traversal of the dominator tree. As we traverse down a branch
+/// of the dominator tree, we keep track of the current dominating variable and
+/// only perform an interference test with that variable. However, when we go to
+/// another branch of the dominator tree, the definition of the current dominating
+/// variable may no longer dominate the current block. In order to correct this,
+/// we need to use a stack of past choices of the current dominating variable
+/// and pop from this stack until we find a variable whose definition actually
+/// dominates the current block.
+/// 
+/// There will be one push on this stack for each variable that has become the
+/// current dominating variable, so instead of using an explicit stack we can
+/// simply associate the previous choice for a current dominating variable with
+/// the new choice. This works better in our implementation, where we test for
+/// interference in multiple distinct sets at once.
+void
+StrongPHIElimination::SplitInterferencesForBasicBlock(
+    MachineBasicBlock &MBB,
+    DenseMap<unsigned, unsigned> &CurrentDominatingParent,
+    DenseMap<unsigned, unsigned> &ImmediateDominatingParent) {
+  // Sort defs by their order in the original basic block, as the code below
+  // assumes that it is processing definitions in dominance order.
+  std::vector<MachineInstr*> &DefInstrs = PHISrcDefs[&MBB];
+  std::sort(DefInstrs.begin(), DefInstrs.end(), MIIndexCompare(LI));
+
+  for (std::vector<MachineInstr*>::const_iterator BBI = DefInstrs.begin(),
+       BBE = DefInstrs.end(); BBI != BBE; ++BBI) {
+    for (MachineInstr::const_mop_iterator I = (*BBI)->operands_begin(),
+         E = (*BBI)->operands_end(); I != E; ++I) {
+      const MachineOperand &MO = *I;
+
+      // FIXME: This would be faster if it were possible to bail out of checking
+      // an instruction's operands after the explicit defs, but this is incorrect
+      // for variadic instructions, which may appear before register allocation
+      // in the future.
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+
+      unsigned DestReg = MO.getReg();
+      if (!DestReg || !TargetRegisterInfo::isVirtualRegister(DestReg))
+        continue;
+
+      // If the virtual register being defined is not used in any PHI or has
+      // already been isolated, then there are no more interferences to check.
+      unsigned DestColor = getRegColor(DestReg);
+      if (!DestColor)
+        continue;
+
+      // The input to this pass sometimes is not in SSA form in every basic
+      // block, as some virtual registers have redefinitions. We could eliminate
+      // this by fixing the passes that generate the non-SSA code, or we could
+      // handle it here by tracking defining machine instructions rather than
+      // virtual registers. For now, we just handle the situation conservatively
+      // in a way that will possibly lead to false interferences.
+      unsigned &CurrentParent = CurrentDominatingParent[DestColor];
+      unsigned NewParent = CurrentParent;
+      if (NewParent == DestReg)
+        continue;
+
+      // Pop registers from the stack represented by ImmediateDominatingParent
+      // until we find a parent that dominates the current instruction.
+      while (NewParent && (!DT->dominates(MRI->getVRegDef(NewParent), *BBI)
+                           || !getRegColor(NewParent)))
+        NewParent = ImmediateDominatingParent[NewParent];
+
+      // If NewParent is nonzero, then its definition dominates the current
+      // instruction, so it is only necessary to check for the liveness of
+      // NewParent in order to check for an interference.
+      if (NewParent
+          && LI->getInterval(NewParent).liveAt(LI->getInstructionIndex(*BBI))) {
+        // If there is an interference, always isolate the new register. This
+        // could be improved by using a heuristic that decides which of the two
+        // registers to isolate.
+        isolateReg(DestReg);
+        CurrentParent = NewParent;
+      } else {
+        // If there is no interference, update ImmediateDominatingParent and set
+        // the CurrentDominatingParent for this color to the current register.
+        ImmediateDominatingParent[DestReg] = NewParent;
+        CurrentParent = DestReg;
+      }
+    }
+  }
+
+  // We now walk the PHIs in successor blocks and check for interferences. This
+  // is necessary because the use of a PHI's operands are logically contained in
+  // the predecessor block. The def of a PHI's destination register is processed
+  // along with the other defs in a basic block.
+
+  CurrentPHIForColor.clear();
+
+  for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+       SE = MBB.succ_end(); SI != SE; ++SI) {
+    for (MachineBasicBlock::iterator BBI = (*SI)->begin(), BBE = (*SI)->end();
+         BBI != BBE && BBI->isPHI(); ++BBI) {
+      MachineInstr *PHI = BBI;
+
+      // If a PHI is already isolated, either by being isolated directly or
+      // having all of its operands isolated, ignore it.
+      unsigned Color = getPHIColor(PHI);
+      if (!Color)
+        continue;
+
+      // Find the index of the PHI operand that corresponds to this basic block.
+      unsigned PredIndex;
+      for (PredIndex = 1; PredIndex < PHI->getNumOperands(); PredIndex += 2) {
+        if (PHI->getOperand(PredIndex + 1).getMBB() == &MBB)
+          break;
+      }
+      assert(PredIndex < PHI->getNumOperands());
+      unsigned PredOperandReg = PHI->getOperand(PredIndex).getReg();
+
+      // Pop registers from the stack represented by ImmediateDominatingParent
+      // until we find a parent that dominates the current instruction.
+      unsigned &CurrentParent = CurrentDominatingParent[Color];
+      unsigned NewParent = CurrentParent;
+      while (NewParent
+             && (!DT->dominates(MRI->getVRegDef(NewParent)->getParent(), &MBB)
+                 || !getRegColor(NewParent)))
+        NewParent = ImmediateDominatingParent[NewParent];
+      CurrentParent = NewParent;
+
+      // If there is an interference with a register, always isolate the
+      // register rather than the PHI. It is also possible to isolate the
+      // PHI, but that introduces copies for all of the registers involved
+      // in that PHI.
+      if (NewParent && LI->isLiveOutOfMBB(LI->getInterval(NewParent), &MBB)
+                    && NewParent != PredOperandReg)
+        isolateReg(NewParent);
+
+      std::pair<MachineInstr*, unsigned>
+        &CurrentPHI = CurrentPHIForColor[Color];
+
+      // If two PHIs have the same operand from every shared predecessor, then
+      // they don't actually interfere. Otherwise, isolate the current PHI. This
+      // could possibly be improved, e.g. we could isolate the PHI with the
+      // fewest operands.
+      if (CurrentPHI.first && CurrentPHI.second != PredOperandReg)
+        isolatePHI(PHI);
+      else
+        CurrentPHI = std::make_pair(PHI, PredOperandReg);
+    }
+  }
+}
+
+void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI,
+                                              MachineBasicBlock *MBB) {
+  assert(PHI->isPHI());
+  ++NumPHIsLowered;
+  unsigned PHIColor = getPHIColor(PHI);
+
+  for (unsigned i = 1; i < PHI->getNumOperands(); i += 2) {
+    MachineOperand &SrcMO = PHI->getOperand(i);
+
+    // If a source is defined by an implicit def, there is no need to insert a
+    // copy in the predecessor.
+    if (SrcMO.isUndef())
+      continue;
+
+    unsigned SrcReg = SrcMO.getReg();
+    assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+           "Machine PHI Operands must all be virtual registers!");
+
+    MachineBasicBlock *PredBB = PHI->getOperand(i + 1).getMBB();
+    unsigned SrcColor = getRegColor(SrcReg);
+
+    // If neither the PHI nor the operand were isolated, then we only need to
+    // set the phi-kill flag on the VNInfo at this PHI.
+    if (PHIColor && SrcColor == PHIColor) {
+      LiveInterval &SrcInterval = LI->getInterval(SrcReg);
+      SlotIndex PredIndex = LI->getMBBEndIdx(PredBB);
+      VNInfo *SrcVNI = SrcInterval.getVNInfoBefore(PredIndex);
+      (void)SrcVNI;
+      assert(SrcVNI);
+      continue;
+    }
+
+    unsigned CopyReg = 0;
+    if (PHIColor) {
+      SrcCopyMap::const_iterator I
+        = InsertedSrcCopyMap.find(std::make_pair(PredBB, PHIColor));
+      CopyReg
+        = I != InsertedSrcCopyMap.end() ? I->second->getOperand(0).getReg() : 0;
+    }
+
+    if (!CopyReg) {
+      const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+      CopyReg = MRI->createVirtualRegister(RC);
+
+      MachineBasicBlock::iterator
+        CopyInsertPoint = findPHICopyInsertPoint(PredBB, MBB, SrcReg);
+      unsigned SrcSubReg = SrcMO.getSubReg();
+      MachineInstr *CopyInstr = BuildMI(*PredBB,
+                                        CopyInsertPoint,
+                                        PHI->getDebugLoc(),
+                                        TII->get(TargetOpcode::COPY),
+                                        CopyReg).addReg(SrcReg, 0, SrcSubReg);
+      LI->InsertMachineInstrInMaps(CopyInstr);
+      ++NumSrcCopiesInserted;
+
+      // addLiveRangeToEndOfBlock() also adds the phikill flag to the VNInfo for
+      // the newly added range.
+      LI->addLiveRangeToEndOfBlock(CopyReg, CopyInstr);
+      InsertedSrcCopySet.insert(std::make_pair(PredBB, SrcReg));
+
+      addReg(CopyReg);
+      if (PHIColor) {
+        unionRegs(PHIColor, CopyReg);
+        assert(getRegColor(CopyReg) != CopyReg);
+      } else {
+        PHIColor = CopyReg;
+        assert(getRegColor(CopyReg) == CopyReg);
+      }
+
+      // Insert into map if not already there.
+      InsertedSrcCopyMap.insert(std::make_pair(std::make_pair(PredBB, PHIColor),
+                                               CopyInstr));
+    }
+
+    SrcMO.setReg(CopyReg);
+
+    // If SrcReg is not live beyond the PHI, trim its interval so that it is no
+    // longer live-in to MBB. Note that SrcReg may appear in other PHIs that are
+    // processed later, but this is still correct to do at this point because we
+    // never rely on LiveIntervals being correct while inserting copies.
+    // FIXME: Should this just count uses at PHIs like the normal PHIElimination
+    // pass does?
+    LiveInterval &SrcLI = LI->getInterval(SrcReg);
+    SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
+    SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
+    SlotIndex NextInstrIndex = PHIIndex.getNextIndex();
+    if (SrcLI.liveAt(MBBStartIndex) && SrcLI.expiredAt(NextInstrIndex))
+      SrcLI.removeRange(MBBStartIndex, PHIIndex, true);
+  }
+
+  unsigned DestReg = PHI->getOperand(0).getReg();
+  unsigned DestColor = getRegColor(DestReg);
+
+  if (PHIColor && DestColor == PHIColor) {
+    LiveInterval &DestLI = LI->getInterval(DestReg);
+
+    // Set the phi-def flag for the VN at this PHI.
+    SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
+    VNInfo *DestVNI = DestLI.getVNInfoAt(PHIIndex.getRegSlot());
+    assert(DestVNI);
+  
+    // Prior to PHI elimination, the live ranges of PHIs begin at their defining
+    // instruction. After PHI elimination, PHI instructions are replaced by VNs
+    // with the phi-def flag set, and the live ranges of these VNs start at the
+    // beginning of the basic block.
+    SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
+    DestVNI->def = MBBStartIndex;
+    DestLI.addRange(LiveRange(MBBStartIndex,
+                              PHIIndex.getRegSlot(),
+                              DestVNI));
+    return;
+  }
+
+  const TargetRegisterClass *RC = MRI->getRegClass(DestReg);
+  unsigned CopyReg = MRI->createVirtualRegister(RC);
+
+  MachineInstr *CopyInstr = BuildMI(*MBB,
+                                    MBB->SkipPHIsAndLabels(MBB->begin()),
+                                    PHI->getDebugLoc(),
+                                    TII->get(TargetOpcode::COPY),
+                                    DestReg).addReg(CopyReg);
+  LI->InsertMachineInstrInMaps(CopyInstr);
+  PHI->getOperand(0).setReg(CopyReg);
+  ++NumDestCopiesInserted;
+
+  // Add the region from the beginning of MBB to the copy instruction to
+  // CopyReg's live interval, and give the VNInfo the phidef flag.
+  LiveInterval &CopyLI = LI->getOrCreateInterval(CopyReg);
+  SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
+  SlotIndex DestCopyIndex = LI->getInstructionIndex(CopyInstr);
+  VNInfo *CopyVNI = CopyLI.getNextValue(MBBStartIndex,
+                                        LI->getVNInfoAllocator());
+  CopyLI.addRange(LiveRange(MBBStartIndex,
+                            DestCopyIndex.getRegSlot(),
+                            CopyVNI));
+
+  // Adjust DestReg's live interval to adjust for its new definition at
+  // CopyInstr.
+  LiveInterval &DestLI = LI->getOrCreateInterval(DestReg);
+  SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
+  DestLI.removeRange(PHIIndex.getRegSlot(), DestCopyIndex.getRegSlot());
+
+  VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
+  assert(DestVNI);
+  DestVNI->def = DestCopyIndex.getRegSlot();
+
+  InsertedDestCopies[CopyReg] = CopyInstr;
+}
+
+void StrongPHIElimination::MergeLIsAndRename(unsigned Reg, unsigned NewReg) {
+  if (Reg == NewReg)
+    return;
+
+  LiveInterval &OldLI = LI->getInterval(Reg);
+  LiveInterval &NewLI = LI->getInterval(NewReg);
+
+  // Merge the live ranges of the two registers.
+  DenseMap<VNInfo*, VNInfo*> VNMap;
+  for (LiveInterval::iterator LRI = OldLI.begin(), LRE = OldLI.end();
+       LRI != LRE; ++LRI) {
+    LiveRange OldLR = *LRI;
+    VNInfo *OldVN = OldLR.valno;
+
+    VNInfo *&NewVN = VNMap[OldVN];
+    if (!NewVN) {
+      NewVN = NewLI.createValueCopy(OldVN, LI->getVNInfoAllocator());
+      VNMap[OldVN] = NewVN;
+    }
+
+    LiveRange LR(OldLR.start, OldLR.end, NewVN);
+    NewLI.addRange(LR);
+  }
+
+  // Remove the LiveInterval for the register being renamed and replace all
+  // of its defs and uses with the new register.
+  LI->removeInterval(Reg);
+  MRI->replaceRegWith(Reg, NewReg);
+}
diff --git a/contrib/llvm/lib/CodeGen/TailDuplication.cpp b/contrib/llvm/lib/CodeGen/TailDuplication.cpp
new file mode 100644
index 000000000000..1ec88172a0b0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TailDuplication.cpp
@@ -0,0 +1,970 @@
+//===-- TailDuplication.cpp - Duplicate blocks into predecessors' tails ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass duplicates basic blocks ending in unconditional branches into
+// the tails of their predecessors.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tailduplication"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+STATISTIC(NumTails     , "Number of tails duplicated");
+STATISTIC(NumTailDups  , "Number of tail duplicated blocks");
+STATISTIC(NumInstrDups , "Additional instructions due to tail duplication");
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumAddedPHIs , "Number of phis added");
+
+// Heuristic for tail duplication.
+static cl::opt<unsigned>
+TailDuplicateSize("tail-dup-size",
+                  cl::desc("Maximum instructions to consider tail duplicating"),
+                  cl::init(2), cl::Hidden);
+
+static cl::opt<bool>
+TailDupVerify("tail-dup-verify",
+              cl::desc("Verify sanity of PHI instructions during taildup"),
+              cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned>
+TailDupLimit("tail-dup-limit", cl::init(~0U), cl::Hidden);
+
+typedef std::vector<std::pair<MachineBasicBlock*,unsigned> > AvailableValsTy;
+
+namespace {
+  /// TailDuplicatePass - Perform tail duplication.
+  class TailDuplicatePass : public MachineFunctionPass {
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineModuleInfo *MMI;
+    MachineRegisterInfo *MRI;
+    OwningPtr<RegScavenger> RS;
+    bool PreRegAlloc;
+
+    // SSAUpdateVRs - A list of virtual registers for which to update SSA form.
+    SmallVector<unsigned, 16> SSAUpdateVRs;
+
+    // SSAUpdateVals - For each virtual register in SSAUpdateVals keep a list of
+    // source virtual registers.
+    DenseMap<unsigned, AvailableValsTy> SSAUpdateVals;
+
+  public:
+    static char ID;
+    explicit TailDuplicatePass() :
+      MachineFunctionPass(ID), PreRegAlloc(false) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  private:
+    void AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+                           MachineBasicBlock *BB);
+    void ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB,
+                    MachineBasicBlock *PredBB,
+                    DenseMap<unsigned, unsigned> &LocalVRMap,
+                    SmallVector<std::pair<unsigned,unsigned>, 4> &Copies,
+                    const DenseSet<unsigned> &UsedByPhi,
+                    bool Remove);
+    void DuplicateInstruction(MachineInstr *MI,
+                              MachineBasicBlock *TailBB,
+                              MachineBasicBlock *PredBB,
+                              MachineFunction &MF,
+                              DenseMap<unsigned, unsigned> &LocalVRMap,
+                              const DenseSet<unsigned> &UsedByPhi);
+    void UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
+                              SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                              SmallSetVector<MachineBasicBlock*, 8> &Succs);
+    bool TailDuplicateBlocks(MachineFunction &MF);
+    bool shouldTailDuplicate(const MachineFunction &MF,
+                             bool IsSimple, MachineBasicBlock &TailBB);
+    bool isSimpleBB(MachineBasicBlock *TailBB);
+    bool canCompletelyDuplicateBB(MachineBasicBlock &BB);
+    bool duplicateSimpleBB(MachineBasicBlock *TailBB,
+                           SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                           const DenseSet<unsigned> &RegsUsedByPhi,
+                           SmallVector<MachineInstr*, 16> &Copies);
+    bool TailDuplicate(MachineBasicBlock *TailBB,
+                       bool IsSimple,
+                       MachineFunction &MF,
+                       SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                       SmallVector<MachineInstr*, 16> &Copies);
+    bool TailDuplicateAndUpdate(MachineBasicBlock *MBB,
+                                bool IsSimple,
+                                MachineFunction &MF);
+
+    void RemoveDeadBlock(MachineBasicBlock *MBB);
+  };
+
+  char TailDuplicatePass::ID = 0;
+}
+
+char &llvm::TailDuplicateID = TailDuplicatePass::ID;
+
+INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication",
+                false, false)
+
+bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+  PreRegAlloc = MRI->isSSA();
+  RS.reset();
+  if (MRI->tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF))
+    RS.reset(new RegScavenger());
+
+  bool MadeChange = false;
+  while (TailDuplicateBlocks(MF))
+    MadeChange = true;
+
+  return MadeChange;
+}
+
+static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
+  for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I;
+    SmallSetVector<MachineBasicBlock*, 8> Preds(MBB->pred_begin(),
+                                                MBB->pred_end());
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != MBB->end()) {
+      if (!MI->isPHI())
+        break;
+      for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+             PE = Preds.end(); PI != PE; ++PI) {
+        MachineBasicBlock *PredBB = *PI;
+        bool Found = false;
+        for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+          MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
+          if (PHIBB == PredBB) {
+            Found = true;
+            break;
+          }
+        }
+        if (!Found) {
+          dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+          dbgs() << "  missing input from predecessor BB#"
+                 << PredBB->getNumber() << '\n';
+          llvm_unreachable(0);
+        }
+      }
+
+      for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+        MachineBasicBlock *PHIBB = MI->getOperand(i+1).getMBB();
+        if (CheckExtra && !Preds.count(PHIBB)) {
+          dbgs() << "Warning: malformed PHI in BB#" << MBB->getNumber()
+                 << ": " << *MI;
+          dbgs() << "  extra input from predecessor BB#"
+                 << PHIBB->getNumber() << '\n';
+          llvm_unreachable(0);
+        }
+        if (PHIBB->getNumber() < 0) {
+          dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+          dbgs() << "  non-existing BB#" << PHIBB->getNumber() << '\n';
+          llvm_unreachable(0);
+        }
+      }
+      ++MI;
+    }
+  }
+}
+
+/// TailDuplicateAndUpdate - Tail duplicate the block and cleanup.
+bool
+TailDuplicatePass::TailDuplicateAndUpdate(MachineBasicBlock *MBB,
+                                          bool IsSimple,
+                                          MachineFunction &MF) {
+  // Save the successors list.
+  SmallSetVector<MachineBasicBlock*, 8> Succs(MBB->succ_begin(),
+                                              MBB->succ_end());
+
+  SmallVector<MachineBasicBlock*, 8> TDBBs;
+  SmallVector<MachineInstr*, 16> Copies;
+  if (!TailDuplicate(MBB, IsSimple, MF, TDBBs, Copies))
+    return false;
+
+  ++NumTails;
+
+  SmallVector<MachineInstr*, 8> NewPHIs;
+  MachineSSAUpdater SSAUpdate(MF, &NewPHIs);
+
+  // TailBB's immediate successors are now successors of those predecessors
+  // which duplicated TailBB. Add the predecessors as sources to the PHI
+  // instructions.
+  bool isDead = MBB->pred_empty() && !MBB->hasAddressTaken();
+  if (PreRegAlloc)
+    UpdateSuccessorsPHIs(MBB, isDead, TDBBs, Succs);
+
+  // If it is dead, remove it.
+  if (isDead) {
+    NumInstrDups -= MBB->size();
+    RemoveDeadBlock(MBB);
+    ++NumDeadBlocks;
+  }
+
+  // Update SSA form.
+  if (!SSAUpdateVRs.empty()) {
+    for (unsigned i = 0, e = SSAUpdateVRs.size(); i != e; ++i) {
+      unsigned VReg = SSAUpdateVRs[i];
+      SSAUpdate.Initialize(VReg);
+
+      // If the original definition is still around, add it as an available
+      // value.
+      MachineInstr *DefMI = MRI->getVRegDef(VReg);
+      MachineBasicBlock *DefBB = 0;
+      if (DefMI) {
+        DefBB = DefMI->getParent();
+        SSAUpdate.AddAvailableValue(DefBB, VReg);
+      }
+
+      // Add the new vregs as available values.
+      DenseMap<unsigned, AvailableValsTy>::iterator LI =
+        SSAUpdateVals.find(VReg);
+      for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+        MachineBasicBlock *SrcBB = LI->second[j].first;
+        unsigned SrcReg = LI->second[j].second;
+        SSAUpdate.AddAvailableValue(SrcBB, SrcReg);
+      }
+
+      // Rewrite uses that are outside of the original def's block.
+      MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg);
+      while (UI != MRI->use_end()) {
+        MachineOperand &UseMO = UI.getOperand();
+        MachineInstr *UseMI = &*UI;
+        ++UI;
+        if (UseMI->isDebugValue()) {
+          // SSAUpdate can replace the use with an undef. That creates
+          // a debug instruction that is a kill.
+          // FIXME: Should it SSAUpdate job to delete debug instructions
+          // instead of replacing the use with undef?
+          UseMI->eraseFromParent();
+          continue;
+        }
+        if (UseMI->getParent() == DefBB && !UseMI->isPHI())
+          continue;
+        SSAUpdate.RewriteUse(UseMO);
+      }
+    }
+
+    SSAUpdateVRs.clear();
+    SSAUpdateVals.clear();
+  }
+
+  // Eliminate some of the copies inserted by tail duplication to maintain
+  // SSA form.
+  for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+    MachineInstr *Copy = Copies[i];
+    if (!Copy->isCopy())
+      continue;
+    unsigned Dst = Copy->getOperand(0).getReg();
+    unsigned Src = Copy->getOperand(1).getReg();
+    if (MRI->hasOneNonDBGUse(Src) &&
+        MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) {
+      // Copy is the only use. Do trivial copy propagation here.
+      MRI->replaceRegWith(Dst, Src);
+      Copy->eraseFromParent();
+    }
+  }
+
+  if (NewPHIs.size())
+    NumAddedPHIs += NewPHIs.size();
+
+  return true;
+}
+
+/// TailDuplicateBlocks - Look for small blocks that are unconditionally
+/// branched to and do not fall through. Tail-duplicate their instructions
+/// into their predecessors to eliminate (dynamic) branches.
+bool TailDuplicatePass::TailDuplicateBlocks(MachineFunction &MF) {
+  bool MadeChange = false;
+
+  if (PreRegAlloc && TailDupVerify) {
+    DEBUG(dbgs() << "\n*** Before tail-duplicating\n");
+    VerifyPHIs(MF, true);
+  }
+
+  for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ) {
+    MachineBasicBlock *MBB = I++;
+
+    if (NumTails == TailDupLimit)
+      break;
+
+    bool IsSimple = isSimpleBB(MBB);
+
+    if (!shouldTailDuplicate(MF, IsSimple, *MBB))
+      continue;
+
+    MadeChange |= TailDuplicateAndUpdate(MBB, IsSimple, MF);
+  }
+
+  if (PreRegAlloc && TailDupVerify)
+    VerifyPHIs(MF, false);
+
+  return MadeChange;
+}
+
+static bool isDefLiveOut(unsigned Reg, MachineBasicBlock *BB,
+                         const MachineRegisterInfo *MRI) {
+  for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
+         UE = MRI->use_end(); UI != UE; ++UI) {
+    MachineInstr *UseMI = &*UI;
+    if (UseMI->isDebugValue())
+      continue;
+    if (UseMI->getParent() != BB)
+      return true;
+  }
+  return false;
+}
+
+static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2)
+    if (MI->getOperand(i+1).getMBB() == SrcBB)
+      return i;
+  return 0;
+}
+
+
+// Remember which registers are used by phis in this block. This is
+// used to determine which registers are liveout while modifying the
+// block (which is why we need to copy the information).
+static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
+                              DenseSet<unsigned> *UsedByPhi) {
+  for(MachineBasicBlock::const_iterator I = BB.begin(), E = BB.end();
+      I != E; ++I) {
+    const MachineInstr &MI = *I;
+    if (!MI.isPHI())
+      break;
+    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+      unsigned SrcReg = MI.getOperand(i).getReg();
+      UsedByPhi->insert(SrcReg);
+    }
+  }
+}
+
+/// AddSSAUpdateEntry - Add a definition and source virtual registers pair for
+/// SSA update.
+void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+                                          MachineBasicBlock *BB) {
+  DenseMap<unsigned, AvailableValsTy>::iterator LI= SSAUpdateVals.find(OrigReg);
+  if (LI != SSAUpdateVals.end())
+    LI->second.push_back(std::make_pair(BB, NewReg));
+  else {
+    AvailableValsTy Vals;
+    Vals.push_back(std::make_pair(BB, NewReg));
+    SSAUpdateVals.insert(std::make_pair(OrigReg, Vals));
+    SSAUpdateVRs.push_back(OrigReg);
+  }
+}
+
+/// ProcessPHI - Process PHI node in TailBB by turning it into a copy in PredBB.
+/// Remember the source register that's contributed by PredBB and update SSA
+/// update map.
+void TailDuplicatePass::ProcessPHI(MachineInstr *MI,
+                                   MachineBasicBlock *TailBB,
+                                   MachineBasicBlock *PredBB,
+                                   DenseMap<unsigned, unsigned> &LocalVRMap,
+                           SmallVector<std::pair<unsigned,unsigned>, 4> &Copies,
+                                   const DenseSet<unsigned> &RegsUsedByPhi,
+                                   bool Remove) {
+  unsigned DefReg = MI->getOperand(0).getReg();
+  unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
+  assert(SrcOpIdx && "Unable to find matching PHI source?");
+  unsigned SrcReg = MI->getOperand(SrcOpIdx).getReg();
+  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+  LocalVRMap.insert(std::make_pair(DefReg, SrcReg));
+
+  // Insert a copy from source to the end of the block. The def register is the
+  // available value liveout of the block.
+  unsigned NewDef = MRI->createVirtualRegister(RC);
+  Copies.push_back(std::make_pair(NewDef, SrcReg));
+  if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg))
+    AddSSAUpdateEntry(DefReg, NewDef, PredBB);
+
+  if (!Remove)
+    return;
+
+  // Remove PredBB from the PHI node.
+  MI->RemoveOperand(SrcOpIdx+1);
+  MI->RemoveOperand(SrcOpIdx);
+  if (MI->getNumOperands() == 1)
+    MI->eraseFromParent();
+}
+
+/// DuplicateInstruction - Duplicate a TailBB instruction to PredBB and update
+/// the source operands due to earlier PHI translation.
+void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
+                                     MachineBasicBlock *TailBB,
+                                     MachineBasicBlock *PredBB,
+                                     MachineFunction &MF,
+                                     DenseMap<unsigned, unsigned> &LocalVRMap,
+                                     const DenseSet<unsigned> &UsedByPhi) {
+  MachineInstr *NewMI = TII->duplicate(MI, MF);
+  for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = NewMI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      continue;
+    if (MO.isDef()) {
+      const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+      unsigned NewReg = MRI->createVirtualRegister(RC);
+      MO.setReg(NewReg);
+      LocalVRMap.insert(std::make_pair(Reg, NewReg));
+      if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg))
+        AddSSAUpdateEntry(Reg, NewReg, PredBB);
+    } else {
+      DenseMap<unsigned, unsigned>::iterator VI = LocalVRMap.find(Reg);
+      if (VI != LocalVRMap.end()) {
+        MO.setReg(VI->second);
+        MRI->constrainRegClass(VI->second, MRI->getRegClass(Reg));
+      }
+    }
+  }
+  PredBB->insert(PredBB->instr_end(), NewMI);
+}
+
+/// UpdateSuccessorsPHIs - After FromBB is tail duplicated into its predecessor
+/// blocks, the successors have gained new predecessors. Update the PHI
+/// instructions in them accordingly.
+void
+TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
+                                  SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                  SmallSetVector<MachineBasicBlock*,8> &Succs) {
+  for (SmallSetVector<MachineBasicBlock*, 8>::iterator SI = Succs.begin(),
+         SE = Succs.end(); SI != SE; ++SI) {
+    MachineBasicBlock *SuccBB = *SI;
+    for (MachineBasicBlock::iterator II = SuccBB->begin(), EE = SuccBB->end();
+         II != EE; ++II) {
+      if (!II->isPHI())
+        break;
+      MachineInstrBuilder MIB(*FromBB->getParent(), II);
+      unsigned Idx = 0;
+      for (unsigned i = 1, e = II->getNumOperands(); i != e; i += 2) {
+        MachineOperand &MO = II->getOperand(i+1);
+        if (MO.getMBB() == FromBB) {
+          Idx = i;
+          break;
+        }
+      }
+
+      assert(Idx != 0);
+      MachineOperand &MO0 = II->getOperand(Idx);
+      unsigned Reg = MO0.getReg();
+      if (isDead) {
+        // Folded into the previous BB.
+        // There could be duplicate phi source entries. FIXME: Should sdisel
+        // or earlier pass fixed this?
+        for (unsigned i = II->getNumOperands()-2; i != Idx; i -= 2) {
+          MachineOperand &MO = II->getOperand(i+1);
+          if (MO.getMBB() == FromBB) {
+            II->RemoveOperand(i+1);
+            II->RemoveOperand(i);
+          }
+        }
+      } else
+        Idx = 0;
+
+      // If Idx is set, the operands at Idx and Idx+1 must be removed.
+      // We reuse the location to avoid expensive RemoveOperand calls.
+
+      DenseMap<unsigned,AvailableValsTy>::iterator LI=SSAUpdateVals.find(Reg);
+      if (LI != SSAUpdateVals.end()) {
+        // This register is defined in the tail block.
+        for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+          MachineBasicBlock *SrcBB = LI->second[j].first;
+          // If we didn't duplicate a bb into a particular predecessor, we
+          // might still have added an entry to SSAUpdateVals to correcly
+          // recompute SSA. If that case, avoid adding a dummy extra argument
+          // this PHI.
+          if (!SrcBB->isSuccessor(SuccBB))
+            continue;
+
+          unsigned SrcReg = LI->second[j].second;
+          if (Idx != 0) {
+            II->getOperand(Idx).setReg(SrcReg);
+            II->getOperand(Idx+1).setMBB(SrcBB);
+            Idx = 0;
+          } else {
+            MIB.addReg(SrcReg).addMBB(SrcBB);
+          }
+        }
+      } else {
+        // Live in tail block, must also be live in predecessors.
+        for (unsigned j = 0, ee = TDBBs.size(); j != ee; ++j) {
+          MachineBasicBlock *SrcBB = TDBBs[j];
+          if (Idx != 0) {
+            II->getOperand(Idx).setReg(Reg);
+            II->getOperand(Idx+1).setMBB(SrcBB);
+            Idx = 0;
+          } else {
+            MIB.addReg(Reg).addMBB(SrcBB);
+          }
+        }
+      }
+      if (Idx != 0) {
+        II->RemoveOperand(Idx+1);
+        II->RemoveOperand(Idx);
+      }
+    }
+  }
+}
+
+/// shouldTailDuplicate - Determine if it is profitable to duplicate this block.
+bool
+TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF,
+                                       bool IsSimple,
+                                       MachineBasicBlock &TailBB) {
+  // Only duplicate blocks that end with unconditional branches.
+  if (TailBB.canFallThrough())
+    return false;
+
+  // Don't try to tail-duplicate single-block loops.
+  if (TailBB.isSuccessor(&TailBB))
+    return false;
+
+  // Set the limit on the cost to duplicate. When optimizing for size,
+  // duplicate only one, because one branch instruction can be eliminated to
+  // compensate for the duplication.
+  unsigned MaxDuplicateCount;
+  if (TailDuplicateSize.getNumOccurrences() == 0 &&
+      MF.getFunction()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize))
+    MaxDuplicateCount = 1;
+  else
+    MaxDuplicateCount = TailDuplicateSize;
+
+  // If the target has hardware branch prediction that can handle indirect
+  // branches, duplicating them can often make them predictable when there
+  // are common paths through the code.  The limit needs to be high enough
+  // to allow undoing the effects of tail merging and other optimizations
+  // that rearrange the predecessors of the indirect branch.
+
+  bool HasIndirectbr = false;
+  if (!TailBB.empty())
+    HasIndirectbr = TailBB.back().isIndirectBranch();
+
+  if (HasIndirectbr && PreRegAlloc)
+    MaxDuplicateCount = 20;
+
+  // Check the instructions in the block to determine whether tail-duplication
+  // is invalid or unlikely to be profitable.
+  unsigned InstrCount = 0;
+  for (MachineBasicBlock::iterator I = TailBB.begin(); I != TailBB.end(); ++I) {
+    // Non-duplicable things shouldn't be tail-duplicated.
+    if (I->isNotDuplicable())
+      return false;
+
+    // Do not duplicate 'return' instructions if this is a pre-regalloc run.
+    // A return may expand into a lot more instructions (e.g. reload of callee
+    // saved registers) after PEI.
+    if (PreRegAlloc && I->isReturn())
+      return false;
+
+    // Avoid duplicating calls before register allocation. Calls presents a
+    // barrier to register allocation so duplicating them may end up increasing
+    // spills.
+    if (PreRegAlloc && I->isCall())
+      return false;
+
+    if (!I->isPHI() && !I->isDebugValue())
+      InstrCount += 1;
+
+    if (InstrCount > MaxDuplicateCount)
+      return false;
+  }
+
+  if (HasIndirectbr && PreRegAlloc)
+    return true;
+
+  if (IsSimple)
+    return true;
+
+  if (!PreRegAlloc)
+    return true;
+
+  return canCompletelyDuplicateBB(TailBB);
+}
+
+/// isSimpleBB - True if this BB has only one unconditional jump.
+bool
+TailDuplicatePass::isSimpleBB(MachineBasicBlock *TailBB) {
+  if (TailBB->succ_size() != 1)
+    return false;
+  if (TailBB->pred_empty())
+    return false;
+  MachineBasicBlock::iterator I = TailBB->begin();
+  MachineBasicBlock::iterator E = TailBB->end();
+  while (I != E && I->isDebugValue())
+    ++I;
+  if (I == E)
+    return true;
+  return I->isUnconditionalBranch();
+}
+
+static bool
+bothUsedInPHI(const MachineBasicBlock &A,
+              SmallPtrSet<MachineBasicBlock*, 8> SuccsB) {
+  for (MachineBasicBlock::const_succ_iterator SI = A.succ_begin(),
+         SE = A.succ_end(); SI != SE; ++SI) {
+    MachineBasicBlock *BB = *SI;
+    if (SuccsB.count(BB) && !BB->empty() && BB->begin()->isPHI())
+      return true;
+  }
+
+  return false;
+}
+
+bool
+TailDuplicatePass::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
+  SmallPtrSet<MachineBasicBlock*, 8> Succs(BB.succ_begin(), BB.succ_end());
+
+  for (MachineBasicBlock::pred_iterator PI = BB.pred_begin(),
+       PE = BB.pred_end(); PI != PE; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+
+    if (PredBB->succ_size() > 1)
+      return false;
+
+    MachineBasicBlock *PredTBB = NULL, *PredFBB = NULL;
+    SmallVector<MachineOperand, 4> PredCond;
+    if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+      return false;
+
+    if (!PredCond.empty())
+      return false;
+  }
+  return true;
+}
+
+bool
+TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB,
+                                     SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                     const DenseSet<unsigned> &UsedByPhi,
+                                     SmallVector<MachineInstr*, 16> &Copies) {
+  SmallPtrSet<MachineBasicBlock*, 8> Succs(TailBB->succ_begin(),
+                                           TailBB->succ_end());
+  SmallVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
+                                           TailBB->pred_end());
+  bool Changed = false;
+  for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+       PE = Preds.end(); PI != PE; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+
+    if (PredBB->getLandingPadSuccessor())
+      continue;
+
+    if (bothUsedInPHI(*PredBB, Succs))
+      continue;
+
+    MachineBasicBlock *PredTBB = NULL, *PredFBB = NULL;
+    SmallVector<MachineOperand, 4> PredCond;
+    if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+      continue;
+
+    Changed = true;
+    DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+                 << "From simple Succ: " << *TailBB);
+
+    MachineBasicBlock *NewTarget = *TailBB->succ_begin();
+    MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(PredBB));
+
+    // Make PredFBB explicit.
+    if (PredCond.empty())
+      PredFBB = PredTBB;
+
+    // Make fall through explicit.
+    if (!PredTBB)
+      PredTBB = NextBB;
+    if (!PredFBB)
+      PredFBB = NextBB;
+
+    // Redirect
+    if (PredFBB == TailBB)
+      PredFBB = NewTarget;
+    if (PredTBB == TailBB)
+      PredTBB = NewTarget;
+
+    // Make the branch unconditional if possible
+    if (PredTBB == PredFBB) {
+      PredCond.clear();
+      PredFBB = NULL;
+    }
+
+    // Avoid adding fall through branches.
+    if (PredFBB == NextBB)
+      PredFBB = NULL;
+    if (PredTBB == NextBB && PredFBB == NULL)
+      PredTBB = NULL;
+
+    TII->RemoveBranch(*PredBB);
+
+    if (PredTBB)
+      TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
+
+    PredBB->removeSuccessor(TailBB);
+    unsigned NumSuccessors = PredBB->succ_size();
+    assert(NumSuccessors <= 1);
+    if (NumSuccessors == 0 || *PredBB->succ_begin() != NewTarget)
+      PredBB->addSuccessor(NewTarget);
+
+    TDBBs.push_back(PredBB);
+  }
+  return Changed;
+}
+
+/// TailDuplicate - If it is profitable, duplicate TailBB's contents in each
+/// of its predecessors.
+bool
+TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
+                                 bool IsSimple,
+                                 MachineFunction &MF,
+                                 SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                 SmallVector<MachineInstr*, 16> &Copies) {
+  DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
+
+  DenseSet<unsigned> UsedByPhi;
+  getRegsUsedByPHIs(*TailBB, &UsedByPhi);
+
+  if (IsSimple)
+    return duplicateSimpleBB(TailBB, TDBBs, UsedByPhi, Copies);
+
+  // Iterate through all the unique predecessors and tail-duplicate this
+  // block into them, if possible. Copying the list ahead of time also
+  // avoids trouble with the predecessor list reallocating.
+  bool Changed = false;
+  SmallSetVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
+                                              TailBB->pred_end());
+  for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+       PE = Preds.end(); PI != PE; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+
+    assert(TailBB != PredBB &&
+           "Single-block loop should have been rejected earlier!");
+    // EH edges are ignored by AnalyzeBranch.
+    if (PredBB->succ_size() > 1)
+      continue;
+
+    MachineBasicBlock *PredTBB, *PredFBB;
+    SmallVector<MachineOperand, 4> PredCond;
+    if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+      continue;
+    if (!PredCond.empty())
+      continue;
+    // Don't duplicate into a fall-through predecessor (at least for now).
+    if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+      continue;
+
+    DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+                 << "From Succ: " << *TailBB);
+
+    TDBBs.push_back(PredBB);
+
+    // Remove PredBB's unconditional branch.
+    TII->RemoveBranch(*PredBB);
+
+    if (RS && !TailBB->livein_empty()) {
+      // Update PredBB livein.
+      RS->enterBasicBlock(PredBB);
+      if (!PredBB->empty())
+        RS->forward(prior(PredBB->end()));
+      BitVector RegsLiveAtExit(TRI->getNumRegs());
+      RS->getRegsUsed(RegsLiveAtExit, false);
+      for (MachineBasicBlock::livein_iterator I = TailBB->livein_begin(),
+             E = TailBB->livein_end(); I != E; ++I) {
+        if (!RegsLiveAtExit[*I])
+          // If a register is previously livein to the tail but it's not live
+          // at the end of predecessor BB, then it should be added to its
+          // livein list.
+          PredBB->addLiveIn(*I);
+      }
+    }
+
+    // Clone the contents of TailBB into PredBB.
+    DenseMap<unsigned, unsigned> LocalVRMap;
+    SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+    // Use instr_iterator here to properly handle bundles, e.g.
+    // ARM Thumb2 IT block.
+    MachineBasicBlock::instr_iterator I = TailBB->instr_begin();
+    while (I != TailBB->instr_end()) {
+      MachineInstr *MI = &*I;
+      ++I;
+      if (MI->isPHI()) {
+        // Replace the uses of the def of the PHI with the register coming
+        // from PredBB.
+        ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true);
+      } else {
+        // Replace def of virtual registers with new registers, and update
+        // uses with PHI source register or the new registers.
+        DuplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap, UsedByPhi);
+      }
+    }
+    MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
+    for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+      Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(),
+                               TII->get(TargetOpcode::COPY),
+                               CopyInfos[i].first).addReg(CopyInfos[i].second));
+    }
+
+    // Simplify
+    TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true);
+
+    NumInstrDups += TailBB->size() - 1; // subtract one for removed branch
+
+    // Update the CFG.
+    PredBB->removeSuccessor(PredBB->succ_begin());
+    assert(PredBB->succ_empty() &&
+           "TailDuplicate called on block with multiple successors!");
+    for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
+           E = TailBB->succ_end(); I != E; ++I)
+      PredBB->addSuccessor(*I);
+
+    Changed = true;
+    ++NumTailDups;
+  }
+
+  // If TailBB was duplicated into all its predecessors except for the prior
+  // block, which falls through unconditionally, move the contents of this
+  // block into the prior block.
+  MachineBasicBlock *PrevBB = prior(MachineFunction::iterator(TailBB));
+  MachineBasicBlock *PriorTBB = 0, *PriorFBB = 0;
+  SmallVector<MachineOperand, 4> PriorCond;
+  // This has to check PrevBB->succ_size() because EH edges are ignored by
+  // AnalyzeBranch.
+  if (PrevBB->succ_size() == 1 &&
+      !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) &&
+      PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 &&
+      !TailBB->hasAddressTaken()) {
+    DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
+          << "From MBB: " << *TailBB);
+    if (PreRegAlloc) {
+      DenseMap<unsigned, unsigned> LocalVRMap;
+      SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+      MachineBasicBlock::iterator I = TailBB->begin();
+      // Process PHI instructions first.
+      while (I != TailBB->end() && I->isPHI()) {
+        // Replace the uses of the def of the PHI with the register coming
+        // from PredBB.
+        MachineInstr *MI = &*I++;
+        ProcessPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi, true);
+        if (MI->getParent())
+          MI->eraseFromParent();
+      }
+
+      // Now copy the non-PHI instructions.
+      while (I != TailBB->end()) {
+        // Replace def of virtual registers with new registers, and update
+        // uses with PHI source register or the new registers.
+        MachineInstr *MI = &*I++;
+        assert(!MI->isBundle() && "Not expecting bundles before regalloc!");
+        DuplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap, UsedByPhi);
+        MI->eraseFromParent();
+      }
+      MachineBasicBlock::iterator Loc = PrevBB->getFirstTerminator();
+      for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+        Copies.push_back(BuildMI(*PrevBB, Loc, DebugLoc(),
+                                 TII->get(TargetOpcode::COPY),
+                                 CopyInfos[i].first)
+                           .addReg(CopyInfos[i].second));
+      }
+    } else {
+      // No PHIs to worry about, just splice the instructions over.
+      PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
+    }
+    PrevBB->removeSuccessor(PrevBB->succ_begin());
+    assert(PrevBB->succ_empty());
+    PrevBB->transferSuccessors(TailBB);
+    TDBBs.push_back(PrevBB);
+    Changed = true;
+  }
+
+  // If this is after register allocation, there are no phis to fix.
+  if (!PreRegAlloc)
+    return Changed;
+
+  // If we made no changes so far, we are safe.
+  if (!Changed)
+    return Changed;
+
+
+  // Handle the nasty case in that we duplicated a block that is part of a loop
+  // into some but not all of its predecessors. For example:
+  //    1 -> 2 <-> 3                 |
+  //          \                      |
+  //           \---> rest            |
+  // if we duplicate 2 into 1 but not into 3, we end up with
+  // 12 -> 3 <-> 2 -> rest           |
+  //   \             /               |
+  //    \----->-----/                |
+  // If there was a "var = phi(1, 3)" in 2, it has to be ultimately replaced
+  // with a phi in 3 (which now dominates 2).
+  // What we do here is introduce a copy in 3 of the register defined by the
+  // phi, just like when we are duplicating 2 into 3, but we don't copy any
+  // real instructions or remove the 3 -> 2 edge from the phi in 2.
+  for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+       PE = Preds.end(); PI != PE; ++PI) {
+    MachineBasicBlock *PredBB = *PI;
+    if (std::find(TDBBs.begin(), TDBBs.end(), PredBB) != TDBBs.end())
+      continue;
+
+    // EH edges
+    if (PredBB->succ_size() != 1)
+      continue;
+
+    DenseMap<unsigned, unsigned> LocalVRMap;
+    SmallVector<std::pair<unsigned,unsigned>, 4> CopyInfos;
+    MachineBasicBlock::iterator I = TailBB->begin();
+    // Process PHI instructions first.
+    while (I != TailBB->end() && I->isPHI()) {
+      // Replace the uses of the def of the PHI with the register coming
+      // from PredBB.
+      MachineInstr *MI = &*I++;
+      ProcessPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false);
+    }
+    MachineBasicBlock::iterator Loc = PredBB->getFirstTerminator();
+    for (unsigned i = 0, e = CopyInfos.size(); i != e; ++i) {
+      Copies.push_back(BuildMI(*PredBB, Loc, DebugLoc(),
+                               TII->get(TargetOpcode::COPY),
+                               CopyInfos[i].first).addReg(CopyInfos[i].second));
+    }
+  }
+
+  return Changed;
+}
+
+/// RemoveDeadBlock - Remove the specified dead machine basic block from the
+/// function, updating the CFG.
+void TailDuplicatePass::RemoveDeadBlock(MachineBasicBlock *MBB) {
+  assert(MBB->pred_empty() && "MBB must be dead!");
+  DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+
+  // Remove all successors.
+  while (!MBB->succ_empty())
+    MBB->removeSuccessor(MBB->succ_end()-1);
+
+  // Remove the block.
+  MBB->eraseFromParent();
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
new file mode 100644
index 000000000000..883e9d1846d9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -0,0 +1,44 @@
+//===----- TargetFrameLoweringImpl.cpp - Implement target frame interface --==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the layout of a stack frame on the target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cstdlib>
+using namespace llvm;
+
+TargetFrameLowering::~TargetFrameLowering() {
+}
+
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index. This is the default implementation
+/// which is overridden for some targets.
+int TargetFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                         int FI) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MFI->getObjectOffset(FI) + MFI->getStackSize() -
+    getOffsetOfLocalArea() + MFI->getOffsetAdjustment();
+}
+
+int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                             int FI, unsigned &FrameReg) const {
+  const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo();
+
+  // By default, assume all frame indices are referenced via whatever
+  // getFrameRegister() says. The target can override this if it's doing
+  // something different.
+  FrameReg = RI->getFrameRegister(MF);
+  return getFrameIndexOffset(MF, FI);
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
new file mode 100644
index 000000000000..20eb91879317
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -0,0 +1,739 @@
+//===-- TargetInstrInfo.cpp - Target Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cctype>
+using namespace llvm;
+
+static cl::opt<bool> DisableHazardRecognizer(
+  "disable-sched-hazard", cl::Hidden, cl::init(false),
+  cl::desc("Disable hazard detection during preRA scheduling"));
+
+TargetInstrInfo::~TargetInstrInfo() {
+}
+
+const TargetRegisterClass*
+TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
+                             const TargetRegisterInfo *TRI,
+                             const MachineFunction &MF) const {
+  if (OpNum >= MCID.getNumOperands())
+    return 0;
+
+  short RegClass = MCID.OpInfo[OpNum].RegClass;
+  if (MCID.OpInfo[OpNum].isLookupPtrRegClass())
+    return TRI->getPointerRegClass(MF, RegClass);
+
+  // Instructions like INSERT_SUBREG do not have fixed register classes.
+  if (RegClass < 0)
+    return 0;
+
+  // Otherwise just look it up normally.
+  return TRI->getRegClass(RegClass);
+}
+
+/// insertNoop - Insert a noop into the instruction stream at the specified
+/// point.
+void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI) const {
+  llvm_unreachable("Target didn't implement insertNoop!");
+}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+unsigned TargetInstrInfo::getInlineAsmLength(const char *Str,
+                                             const MCAsmInfo &MAI) const {
+
+
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0)
+      atInsnStart = true;
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+                               strlen(MAI.getCommentString())) == 0)
+      atInsnStart = false;
+  }
+
+  return Length;
+}
+
+/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
+/// after it, replacing it with an unconditional branch to NewDest.
+void
+TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+                                         MachineBasicBlock *NewDest) const {
+  MachineBasicBlock *MBB = Tail->getParent();
+
+  // Remove all the old successors of MBB from the CFG.
+  while (!MBB->succ_empty())
+    MBB->removeSuccessor(MBB->succ_begin());
+
+  // Remove all the dead instructions from the end of MBB.
+  MBB->erase(Tail, MBB->end());
+
+  // If MBB isn't immediately before MBB, insert a branch to it.
+  if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest))
+    InsertBranch(*MBB, NewDest, 0, SmallVector<MachineOperand, 0>(),
+                 Tail->getDebugLoc());
+  MBB->addSuccessor(NewDest);
+}
+
+// commuteInstruction - The default implementation of this method just exchanges
+// the two operands returned by findCommutedOpIndices.
+MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr *MI,
+                                                  bool NewMI) const {
+  const MCInstrDesc &MCID = MI->getDesc();
+  bool HasDef = MCID.getNumDefs();
+  if (HasDef && !MI->getOperand(0).isReg())
+    // No idea how to commute this instruction. Target should implement its own.
+    return 0;
+  unsigned Idx1, Idx2;
+  if (!findCommutedOpIndices(MI, Idx1, Idx2)) {
+    std::string msg;
+    raw_string_ostream Msg(msg);
+    Msg << "Don't know how to commute: " << *MI;
+    report_fatal_error(Msg.str());
+  }
+
+  assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() &&
+         "This only knows how to commute register operands so far");
+  unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
+  unsigned Reg1 = MI->getOperand(Idx1).getReg();
+  unsigned Reg2 = MI->getOperand(Idx2).getReg();
+  unsigned SubReg0 = HasDef ? MI->getOperand(0).getSubReg() : 0;
+  unsigned SubReg1 = MI->getOperand(Idx1).getSubReg();
+  unsigned SubReg2 = MI->getOperand(Idx2).getSubReg();
+  bool Reg1IsKill = MI->getOperand(Idx1).isKill();
+  bool Reg2IsKill = MI->getOperand(Idx2).isKill();
+  // If destination is tied to either of the commuted source register, then
+  // it must be updated.
+  if (HasDef && Reg0 == Reg1 &&
+      MI->getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO) == 0) {
+    Reg2IsKill = false;
+    Reg0 = Reg2;
+    SubReg0 = SubReg2;
+  } else if (HasDef && Reg0 == Reg2 &&
+             MI->getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO) == 0) {
+    Reg1IsKill = false;
+    Reg0 = Reg1;
+    SubReg0 = SubReg1;
+  }
+
+  if (NewMI) {
+    // Create a new instruction.
+    MachineFunction &MF = *MI->getParent()->getParent();
+    MI = MF.CloneMachineInstr(MI);
+  }
+
+  if (HasDef) {
+    MI->getOperand(0).setReg(Reg0);
+    MI->getOperand(0).setSubReg(SubReg0);
+  }
+  MI->getOperand(Idx2).setReg(Reg1);
+  MI->getOperand(Idx1).setReg(Reg2);
+  MI->getOperand(Idx2).setSubReg(SubReg1);
+  MI->getOperand(Idx1).setSubReg(SubReg2);
+  MI->getOperand(Idx2).setIsKill(Reg1IsKill);
+  MI->getOperand(Idx1).setIsKill(Reg2IsKill);
+  return MI;
+}
+
+/// findCommutedOpIndices - If specified MI is commutable, return the two
+/// operand indices that would swap value. Return true if the instruction
+/// is not in a form which this routine understands.
+bool TargetInstrInfo::findCommutedOpIndices(MachineInstr *MI,
+                                            unsigned &SrcOpIdx1,
+                                            unsigned &SrcOpIdx2) const {
+  assert(!MI->isBundle() &&
+         "TargetInstrInfo::findCommutedOpIndices() can't handle bundles");
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (!MCID.isCommutable())
+    return false;
+  // This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this
+  // is not true, then the target must implement this.
+  SrcOpIdx1 = MCID.getNumDefs();
+  SrcOpIdx2 = SrcOpIdx1 + 1;
+  if (!MI->getOperand(SrcOpIdx1).isReg() ||
+      !MI->getOperand(SrcOpIdx2).isReg())
+    // No idea.
+    return false;
+  return true;
+}
+
+
+bool
+TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  if (!MI->isTerminator()) return false;
+
+  // Conditional branch is a special case.
+  if (MI->isBranch() && !MI->isBarrier())
+    return true;
+  if (!MI->isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+
+bool TargetInstrInfo::PredicateInstruction(MachineInstr *MI,
+                            const SmallVectorImpl<MachineOperand> &Pred) const {
+  bool MadeChange = false;
+
+  assert(!MI->isBundle() &&
+         "TargetInstrInfo::PredicateInstruction() can't handle bundles");
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (!MI->isPredicable())
+    return false;
+
+  for (unsigned j = 0, i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    if (MCID.OpInfo[i].isPredicate()) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg()) {
+        MO.setReg(Pred[j].getReg());
+        MadeChange = true;
+      } else if (MO.isImm()) {
+        MO.setImm(Pred[j].getImm());
+        MadeChange = true;
+      } else if (MO.isMBB()) {
+        MO.setMBB(Pred[j].getMBB());
+        MadeChange = true;
+      }
+      ++j;
+    }
+  }
+  return MadeChange;
+}
+
+bool TargetInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
+                                           const MachineMemOperand *&MMO,
+                                           int &FrameIndex) const {
+  for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+         oe = MI->memoperands_end();
+       o != oe;
+       ++o) {
+    if ((*o)->isLoad() && (*o)->getValue())
+      if (const FixedStackPseudoSourceValue *Value =
+          dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+        FrameIndex = Value->getFrameIndex();
+        MMO = *o;
+        return true;
+      }
+  }
+  return false;
+}
+
+bool TargetInstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
+                                          const MachineMemOperand *&MMO,
+                                          int &FrameIndex) const {
+  for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+         oe = MI->memoperands_end();
+       o != oe;
+       ++o) {
+    if ((*o)->isStore() && (*o)->getValue())
+      if (const FixedStackPseudoSourceValue *Value =
+          dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+        FrameIndex = Value->getFrameIndex();
+        MMO = *o;
+        return true;
+      }
+  }
+  return false;
+}
+
+void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned DestReg,
+                                    unsigned SubIdx,
+                                    const MachineInstr *Orig,
+                                    const TargetRegisterInfo &TRI) const {
+  MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+  MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+  MBB.insert(I, MI);
+}
+
+bool
+TargetInstrInfo::produceSameValue(const MachineInstr *MI0,
+                                  const MachineInstr *MI1,
+                                  const MachineRegisterInfo *MRI) const {
+  return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+}
+
+MachineInstr *TargetInstrInfo::duplicate(MachineInstr *Orig,
+                                         MachineFunction &MF) const {
+  assert(!Orig->isNotDuplicable() &&
+         "Instruction cannot be duplicated");
+  return MF.CloneMachineInstr(Orig);
+}
+
+// If the COPY instruction in MI can be folded to a stack operation, return
+// the register class to use.
+static const TargetRegisterClass *canFoldCopy(const MachineInstr *MI,
+                                              unsigned FoldIdx) {
+  assert(MI->isCopy() && "MI must be a COPY instruction");
+  if (MI->getNumOperands() != 2)
+    return 0;
+  assert(FoldIdx<2 && "FoldIdx refers no nonexistent operand");
+
+  const MachineOperand &FoldOp = MI->getOperand(FoldIdx);
+  const MachineOperand &LiveOp = MI->getOperand(1-FoldIdx);
+
+  if (FoldOp.getSubReg() || LiveOp.getSubReg())
+    return 0;
+
+  unsigned FoldReg = FoldOp.getReg();
+  unsigned LiveReg = LiveOp.getReg();
+
+  assert(TargetRegisterInfo::isVirtualRegister(FoldReg) &&
+         "Cannot fold physregs");
+
+  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = MRI.getRegClass(FoldReg);
+
+  if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg()))
+    return RC->contains(LiveOp.getReg()) ? RC : 0;
+
+  if (RC->hasSubClassEq(MRI.getRegClass(LiveReg)))
+    return RC;
+
+  // FIXME: Allow folding when register classes are memory compatible.
+  return 0;
+}
+
+bool TargetInstrInfo::
+canFoldMemoryOperand(const MachineInstr *MI,
+                     const SmallVectorImpl<unsigned> &Ops) const {
+  return MI->isCopy() && Ops.size() == 1 && canFoldCopy(MI, Ops[0]);
+}
+
+/// foldMemoryOperand - Attempt to fold a load or store of the specified stack
+/// slot into the specified machine instruction for the specified operand(s).
+/// If this is possible, a new instruction is returned with the specified
+/// operand folded, otherwise NULL is returned. The client is responsible for
+/// removing the old instruction and adding the new one in the instruction
+/// stream.
+MachineInstr*
+TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
+                                   const SmallVectorImpl<unsigned> &Ops,
+                                   int FI) const {
+  unsigned Flags = 0;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    if (MI->getOperand(Ops[i]).isDef())
+      Flags |= MachineMemOperand::MOStore;
+    else
+      Flags |= MachineMemOperand::MOLoad;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  assert(MBB && "foldMemoryOperand needs an inserted instruction");
+  MachineFunction &MF = *MBB->getParent();
+
+  // Ask the target to do the actual folding.
+  if (MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FI)) {
+    // Add a memory operand, foldMemoryOperandImpl doesn't do that.
+    assert((!(Flags & MachineMemOperand::MOStore) ||
+            NewMI->mayStore()) &&
+           "Folded a def to a non-store!");
+    assert((!(Flags & MachineMemOperand::MOLoad) ||
+            NewMI->mayLoad()) &&
+           "Folded a use to a non-load!");
+    const MachineFrameInfo &MFI = *MF.getFrameInfo();
+    assert(MFI.getObjectOffset(FI) != -1);
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                              Flags, MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    NewMI->addMemOperand(MF, MMO);
+
+    // FIXME: change foldMemoryOperandImpl semantics to also insert NewMI.
+    return MBB->insert(MI, NewMI);
+  }
+
+  // Straight COPY may fold as load/store.
+  if (!MI->isCopy() || Ops.size() != 1)
+    return 0;
+
+  const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
+  if (!RC)
+    return 0;
+
+  const MachineOperand &MO = MI->getOperand(1-Ops[0]);
+  MachineBasicBlock::iterator Pos = MI;
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+
+  if (Flags == MachineMemOperand::MOStore)
+    storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
+  else
+    loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI);
+  return --Pos;
+}
+
+/// foldMemoryOperand - Same as the previous version except it allows folding
+/// of any load and store from / to any address, not just from a specific
+/// stack slot.
+MachineInstr*
+TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
+                                   const SmallVectorImpl<unsigned> &Ops,
+                                   MachineInstr* LoadMI) const {
+  assert(LoadMI->canFoldAsLoad() && "LoadMI isn't foldable!");
+#ifndef NDEBUG
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    assert(MI->getOperand(Ops[i]).isUse() && "Folding load into def!");
+#endif
+  MachineBasicBlock &MBB = *MI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+
+  // Ask the target to do the actual folding.
+  MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, LoadMI);
+  if (!NewMI) return 0;
+
+  NewMI = MBB.insert(MI, NewMI);
+
+  // Copy the memoperands from the load to the folded instruction.
+  NewMI->setMemRefs(LoadMI->memoperands_begin(),
+                    LoadMI->memoperands_end());
+
+  return NewMI;
+}
+
+bool TargetInstrInfo::
+isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
+                                         AliasAnalysis *AA) const {
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetMachine &TM = MF.getTarget();
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+
+  // Remat clients assume operand 0 is the defined register.
+  if (!MI->getNumOperands() || !MI->getOperand(0).isReg())
+    return false;
+  unsigned DefReg = MI->getOperand(0).getReg();
+
+  // A sub-register definition can only be rematerialized if the instruction
+  // doesn't read the other parts of the register.  Otherwise it is really a
+  // read-modify-write operation on the full virtual register which cannot be
+  // moved safely.
+  if (TargetRegisterInfo::isVirtualRegister(DefReg) &&
+      MI->getOperand(0).getSubReg() && MI->readsVirtualRegister(DefReg))
+    return false;
+
+  // A load from a fixed stack slot can be rematerialized. This may be
+  // redundant with subsequent checks, but it's target-independent,
+  // simple, and a common case.
+  int FrameIdx = 0;
+  if (TII.isLoadFromStackSlot(MI, FrameIdx) &&
+      MF.getFrameInfo()->isImmutableObjectIndex(FrameIdx))
+    return true;
+
+  // Avoid instructions obviously unsafe for remat.
+  if (MI->isNotDuplicable() || MI->mayStore() ||
+      MI->hasUnmodeledSideEffects())
+    return false;
+
+  // Don't remat inline asm. We have no idea how expensive it is
+  // even if it's side effect free.
+  if (MI->isInlineAsm())
+    return false;
+
+  // Avoid instructions which load from potentially varying memory.
+  if (MI->mayLoad() && !MI->isInvariantLoad(AA))
+    return false;
+
+  // If any of the registers accessed are non-constant, conservatively assume
+  // the instruction is not rematerializable.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0)
+      continue;
+
+    // Check for a well-behaved physical register.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (MO.isUse()) {
+        // If the physreg has no defs anywhere, it's just an ambient register
+        // and we can freely move its uses. Alternatively, if it's allocatable,
+        // it could get allocated to something with a def during allocation.
+        if (!MRI.isConstantPhysReg(Reg, MF))
+          return false;
+      } else {
+        // A physreg def. We can't remat it.
+        return false;
+      }
+      continue;
+    }
+
+    // Only allow one virtual-register def.  There may be multiple defs of the
+    // same virtual register, though.
+    if (MO.isDef() && Reg != DefReg)
+      return false;
+
+    // Don't allow any virtual-register uses. Rematting an instruction with
+    // virtual register uses would length the live ranges of the uses, which
+    // is not necessarily a good idea, certainly not "trivial".
+    if (MO.isUse())
+      return false;
+  }
+
+  // Everything checked out.
+  return true;
+}
+
+/// isSchedulingBoundary - Test if the given instruction should be
+/// considered a scheduling boundary. This primarily includes labels
+/// and terminators.
+bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+                                           const MachineBasicBlock *MBB,
+                                           const MachineFunction &MF) const {
+  // Terminators and labels can't be scheduled around.
+  if (MI->isTerminator() || MI->isLabel())
+    return true;
+
+  // Don't attempt to schedule around any instruction that defines
+  // a stack-oriented pointer, as it's unlikely to be profitable. This
+  // saves compile time, because it doesn't require every single
+  // stack slot reference to depend on the instruction that does the
+  // modification.
+  const TargetLowering &TLI = *MF.getTarget().getTargetLowering();
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  if (MI->modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI))
+    return true;
+
+  return false;
+}
+
+// Provide a global flag for disabling the PreRA hazard recognizer that targets
+// may choose to honor.
+bool TargetInstrInfo::usePreRAHazardRecognizer() const {
+  return !DisableHazardRecognizer;
+}
+
+// Default implementation of CreateTargetRAHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfo::
+CreateTargetHazardRecognizer(const TargetMachine *TM,
+                             const ScheduleDAG *DAG) const {
+  // Dummy hazard recognizer allows all instructions to issue.
+  return new ScheduleHazardRecognizer();
+}
+
+// Default implementation of CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfo::
+CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
+                               const ScheduleDAG *DAG) const {
+  return (ScheduleHazardRecognizer *)
+    new ScoreboardHazardRecognizer(II, DAG, "misched");
+}
+
+// Default implementation of CreateTargetPostRAHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfo::
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                   const ScheduleDAG *DAG) const {
+  return (ScheduleHazardRecognizer *)
+    new ScoreboardHazardRecognizer(II, DAG, "post-RA-sched");
+}
+
+//===----------------------------------------------------------------------===//
+//  SelectionDAG latency interface.
+//===----------------------------------------------------------------------===//
+
+int
+TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                   SDNode *DefNode, unsigned DefIdx,
+                                   SDNode *UseNode, unsigned UseIdx) const {
+  if (!ItinData || ItinData->isEmpty())
+    return -1;
+
+  if (!DefNode->isMachineOpcode())
+    return -1;
+
+  unsigned DefClass = get(DefNode->getMachineOpcode()).getSchedClass();
+  if (!UseNode->isMachineOpcode())
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+  unsigned UseClass = get(UseNode->getMachineOpcode()).getSchedClass();
+  return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                     SDNode *N) const {
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  if (!N->isMachineOpcode())
+    return 1;
+
+  return ItinData->getStageLatency(get(N->getMachineOpcode()).getSchedClass());
+}
+
+//===----------------------------------------------------------------------===//
+//  MachineInstr latency interface.
+//===----------------------------------------------------------------------===//
+
+unsigned
+TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+                                const MachineInstr *MI) const {
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  unsigned Class = MI->getDesc().getSchedClass();
+  int UOps = ItinData->Itineraries[Class].NumMicroOps;
+  if (UOps >= 0)
+    return UOps;
+
+  // The # of u-ops is dynamically determined. The specific target should
+  // override this function to return the right number.
+  return 1;
+}
+
+/// Return the default expected latency for a def based on it's opcode.
+unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel *SchedModel,
+                                            const MachineInstr *DefMI) const {
+  if (DefMI->isTransient())
+    return 0;
+  if (DefMI->mayLoad())
+    return SchedModel->LoadLatency;
+  if (isHighLatencyDef(DefMI->getOpcode()))
+    return SchedModel->HighLatency;
+  return 1;
+}
+
+unsigned TargetInstrInfo::
+getInstrLatency(const InstrItineraryData *ItinData,
+                const MachineInstr *MI,
+                unsigned *PredCost) const {
+  // Default to one cycle for no itinerary. However, an "empty" itinerary may
+  // still have a MinLatency property, which getStageLatency checks.
+  if (!ItinData)
+    return MI->mayLoad() ? 2 : 1;
+
+  return ItinData->getStageLatency(MI->getDesc().getSchedClass());
+}
+
+bool TargetInstrInfo::hasLowDefLatency(const InstrItineraryData *ItinData,
+                                       const MachineInstr *DefMI,
+                                       unsigned DefIdx) const {
+  if (!ItinData || ItinData->isEmpty())
+    return false;
+
+  unsigned DefClass = DefMI->getDesc().getSchedClass();
+  int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+  return (DefCycle != -1 && DefCycle <= 1);
+}
+
+/// Both DefMI and UseMI must be valid.  By default, call directly to the
+/// itinerary. This may be overriden by the target.
+int TargetInstrInfo::
+getOperandLatency(const InstrItineraryData *ItinData,
+                  const MachineInstr *DefMI, unsigned DefIdx,
+                  const MachineInstr *UseMI, unsigned UseIdx) const {
+  unsigned DefClass = DefMI->getDesc().getSchedClass();
+  unsigned UseClass = UseMI->getDesc().getSchedClass();
+  return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+/// If we can determine the operand latency from the def only, without itinerary
+/// lookup, do so. Otherwise return -1.
+int TargetInstrInfo::computeDefOperandLatency(
+  const InstrItineraryData *ItinData,
+  const MachineInstr *DefMI, bool FindMin) const {
+
+  // Let the target hook getInstrLatency handle missing itineraries.
+  if (!ItinData)
+    return getInstrLatency(ItinData, DefMI);
+
+  // Return a latency based on the itinerary properties and defining instruction
+  // if possible. Some common subtargets don't require per-operand latency,
+  // especially for minimum latencies.
+  if (FindMin) {
+    // If MinLatency is valid, call getInstrLatency. This uses Stage latency if
+    // it exists before defaulting to MinLatency.
+    if (ItinData->SchedModel->MinLatency >= 0)
+      return getInstrLatency(ItinData, DefMI);
+
+    // If MinLatency is invalid, OperandLatency is interpreted as MinLatency.
+    // For empty itineraries, short-cirtuit the check and default to one cycle.
+    if (ItinData->isEmpty())
+      return 1;
+  }
+  else if(ItinData->isEmpty())
+    return defaultDefLatency(ItinData->SchedModel, DefMI);
+
+  // ...operand lookup required
+  return -1;
+}
+
+/// computeOperandLatency - Compute and return the latency of the given data
+/// dependent def and use when the operand indices are already known. UseMI may
+/// be NULL for an unknown use.
+///
+/// FindMin may be set to get the minimum vs. expected latency. Minimum
+/// latency is used for scheduling groups, while expected latency is for
+/// instruction cost and critical path.
+///
+/// Depending on the subtarget's itinerary properties, this may or may not need
+/// to call getOperandLatency(). For most subtargets, we don't need DefIdx or
+/// UseIdx to compute min latency.
+unsigned TargetInstrInfo::
+computeOperandLatency(const InstrItineraryData *ItinData,
+                      const MachineInstr *DefMI, unsigned DefIdx,
+                      const MachineInstr *UseMI, unsigned UseIdx,
+                      bool FindMin) const {
+
+  int DefLatency = computeDefOperandLatency(ItinData, DefMI, FindMin);
+  if (DefLatency >= 0)
+    return DefLatency;
+
+  assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail");
+
+  int OperLatency = 0;
+  if (UseMI)
+    OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+  else {
+    unsigned DefClass = DefMI->getDesc().getSchedClass();
+    OperLatency = ItinData->getOperandCycle(DefClass, DefIdx);
+  }
+  if (OperLatency >= 0)
+    return OperLatency;
+
+  // No operand latency was found.
+  unsigned InstrLatency = getInstrLatency(ItinData, DefMI);
+
+  // Expected latency is the max of the stage latency and itinerary props.
+  if (!FindMin)
+    InstrLatency = std::max(InstrLatency,
+                            defaultDefLatency(ItinData->SchedModel, DefMI));
+  return InstrLatency;
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
new file mode 100644
index 000000000000..f42bdbd27643
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -0,0 +1,1305 @@
+//===-- TargetLoweringBase.cpp - Implement the TargetLoweringBase class ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetLoweringBase class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cctype>
+using namespace llvm;
+
+/// InitLibcallNames - Set default libcall names.
+///
+static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
+  Names[RTLIB::SHL_I16] = "__ashlhi3";
+  Names[RTLIB::SHL_I32] = "__ashlsi3";
+  Names[RTLIB::SHL_I64] = "__ashldi3";
+  Names[RTLIB::SHL_I128] = "__ashlti3";
+  Names[RTLIB::SRL_I16] = "__lshrhi3";
+  Names[RTLIB::SRL_I32] = "__lshrsi3";
+  Names[RTLIB::SRL_I64] = "__lshrdi3";
+  Names[RTLIB::SRL_I128] = "__lshrti3";
+  Names[RTLIB::SRA_I16] = "__ashrhi3";
+  Names[RTLIB::SRA_I32] = "__ashrsi3";
+  Names[RTLIB::SRA_I64] = "__ashrdi3";
+  Names[RTLIB::SRA_I128] = "__ashrti3";
+  Names[RTLIB::MUL_I8] = "__mulqi3";
+  Names[RTLIB::MUL_I16] = "__mulhi3";
+  Names[RTLIB::MUL_I32] = "__mulsi3";
+  Names[RTLIB::MUL_I64] = "__muldi3";
+  Names[RTLIB::MUL_I128] = "__multi3";
+  Names[RTLIB::MULO_I32] = "__mulosi4";
+  Names[RTLIB::MULO_I64] = "__mulodi4";
+  Names[RTLIB::MULO_I128] = "__muloti4";
+  Names[RTLIB::SDIV_I8] = "__divqi3";
+  Names[RTLIB::SDIV_I16] = "__divhi3";
+  Names[RTLIB::SDIV_I32] = "__divsi3";
+  Names[RTLIB::SDIV_I64] = "__divdi3";
+  Names[RTLIB::SDIV_I128] = "__divti3";
+  Names[RTLIB::UDIV_I8] = "__udivqi3";
+  Names[RTLIB::UDIV_I16] = "__udivhi3";
+  Names[RTLIB::UDIV_I32] = "__udivsi3";
+  Names[RTLIB::UDIV_I64] = "__udivdi3";
+  Names[RTLIB::UDIV_I128] = "__udivti3";
+  Names[RTLIB::SREM_I8] = "__modqi3";
+  Names[RTLIB::SREM_I16] = "__modhi3";
+  Names[RTLIB::SREM_I32] = "__modsi3";
+  Names[RTLIB::SREM_I64] = "__moddi3";
+  Names[RTLIB::SREM_I128] = "__modti3";
+  Names[RTLIB::UREM_I8] = "__umodqi3";
+  Names[RTLIB::UREM_I16] = "__umodhi3";
+  Names[RTLIB::UREM_I32] = "__umodsi3";
+  Names[RTLIB::UREM_I64] = "__umoddi3";
+  Names[RTLIB::UREM_I128] = "__umodti3";
+
+  // These are generally not available.
+  Names[RTLIB::SDIVREM_I8] = 0;
+  Names[RTLIB::SDIVREM_I16] = 0;
+  Names[RTLIB::SDIVREM_I32] = 0;
+  Names[RTLIB::SDIVREM_I64] = 0;
+  Names[RTLIB::SDIVREM_I128] = 0;
+  Names[RTLIB::UDIVREM_I8] = 0;
+  Names[RTLIB::UDIVREM_I16] = 0;
+  Names[RTLIB::UDIVREM_I32] = 0;
+  Names[RTLIB::UDIVREM_I64] = 0;
+  Names[RTLIB::UDIVREM_I128] = 0;
+
+  Names[RTLIB::NEG_I32] = "__negsi2";
+  Names[RTLIB::NEG_I64] = "__negdi2";
+  Names[RTLIB::ADD_F32] = "__addsf3";
+  Names[RTLIB::ADD_F64] = "__adddf3";
+  Names[RTLIB::ADD_F80] = "__addxf3";
+  Names[RTLIB::ADD_F128] = "__addtf3";
+  Names[RTLIB::ADD_PPCF128] = "__gcc_qadd";
+  Names[RTLIB::SUB_F32] = "__subsf3";
+  Names[RTLIB::SUB_F64] = "__subdf3";
+  Names[RTLIB::SUB_F80] = "__subxf3";
+  Names[RTLIB::SUB_F128] = "__subtf3";
+  Names[RTLIB::SUB_PPCF128] = "__gcc_qsub";
+  Names[RTLIB::MUL_F32] = "__mulsf3";
+  Names[RTLIB::MUL_F64] = "__muldf3";
+  Names[RTLIB::MUL_F80] = "__mulxf3";
+  Names[RTLIB::MUL_F128] = "__multf3";
+  Names[RTLIB::MUL_PPCF128] = "__gcc_qmul";
+  Names[RTLIB::DIV_F32] = "__divsf3";
+  Names[RTLIB::DIV_F64] = "__divdf3";
+  Names[RTLIB::DIV_F80] = "__divxf3";
+  Names[RTLIB::DIV_F128] = "__divtf3";
+  Names[RTLIB::DIV_PPCF128] = "__gcc_qdiv";
+  Names[RTLIB::REM_F32] = "fmodf";
+  Names[RTLIB::REM_F64] = "fmod";
+  Names[RTLIB::REM_F80] = "fmodl";
+  Names[RTLIB::REM_F128] = "fmodl";
+  Names[RTLIB::REM_PPCF128] = "fmodl";
+  Names[RTLIB::FMA_F32] = "fmaf";
+  Names[RTLIB::FMA_F64] = "fma";
+  Names[RTLIB::FMA_F80] = "fmal";
+  Names[RTLIB::FMA_F128] = "fmal";
+  Names[RTLIB::FMA_PPCF128] = "fmal";
+  Names[RTLIB::POWI_F32] = "__powisf2";
+  Names[RTLIB::POWI_F64] = "__powidf2";
+  Names[RTLIB::POWI_F80] = "__powixf2";
+  Names[RTLIB::POWI_F128] = "__powitf2";
+  Names[RTLIB::POWI_PPCF128] = "__powitf2";
+  Names[RTLIB::SQRT_F32] = "sqrtf";
+  Names[RTLIB::SQRT_F64] = "sqrt";
+  Names[RTLIB::SQRT_F80] = "sqrtl";
+  Names[RTLIB::SQRT_F128] = "sqrtl";
+  Names[RTLIB::SQRT_PPCF128] = "sqrtl";
+  Names[RTLIB::LOG_F32] = "logf";
+  Names[RTLIB::LOG_F64] = "log";
+  Names[RTLIB::LOG_F80] = "logl";
+  Names[RTLIB::LOG_F128] = "logl";
+  Names[RTLIB::LOG_PPCF128] = "logl";
+  Names[RTLIB::LOG2_F32] = "log2f";
+  Names[RTLIB::LOG2_F64] = "log2";
+  Names[RTLIB::LOG2_F80] = "log2l";
+  Names[RTLIB::LOG2_F128] = "log2l";
+  Names[RTLIB::LOG2_PPCF128] = "log2l";
+  Names[RTLIB::LOG10_F32] = "log10f";
+  Names[RTLIB::LOG10_F64] = "log10";
+  Names[RTLIB::LOG10_F80] = "log10l";
+  Names[RTLIB::LOG10_F128] = "log10l";
+  Names[RTLIB::LOG10_PPCF128] = "log10l";
+  Names[RTLIB::EXP_F32] = "expf";
+  Names[RTLIB::EXP_F64] = "exp";
+  Names[RTLIB::EXP_F80] = "expl";
+  Names[RTLIB::EXP_F128] = "expl";
+  Names[RTLIB::EXP_PPCF128] = "expl";
+  Names[RTLIB::EXP2_F32] = "exp2f";
+  Names[RTLIB::EXP2_F64] = "exp2";
+  Names[RTLIB::EXP2_F80] = "exp2l";
+  Names[RTLIB::EXP2_F128] = "exp2l";
+  Names[RTLIB::EXP2_PPCF128] = "exp2l";
+  Names[RTLIB::SIN_F32] = "sinf";
+  Names[RTLIB::SIN_F64] = "sin";
+  Names[RTLIB::SIN_F80] = "sinl";
+  Names[RTLIB::SIN_F128] = "sinl";
+  Names[RTLIB::SIN_PPCF128] = "sinl";
+  Names[RTLIB::COS_F32] = "cosf";
+  Names[RTLIB::COS_F64] = "cos";
+  Names[RTLIB::COS_F80] = "cosl";
+  Names[RTLIB::COS_F128] = "cosl";
+  Names[RTLIB::COS_PPCF128] = "cosl";
+  Names[RTLIB::POW_F32] = "powf";
+  Names[RTLIB::POW_F64] = "pow";
+  Names[RTLIB::POW_F80] = "powl";
+  Names[RTLIB::POW_F128] = "powl";
+  Names[RTLIB::POW_PPCF128] = "powl";
+  Names[RTLIB::CEIL_F32] = "ceilf";
+  Names[RTLIB::CEIL_F64] = "ceil";
+  Names[RTLIB::CEIL_F80] = "ceill";
+  Names[RTLIB::CEIL_F128] = "ceill";
+  Names[RTLIB::CEIL_PPCF128] = "ceill";
+  Names[RTLIB::TRUNC_F32] = "truncf";
+  Names[RTLIB::TRUNC_F64] = "trunc";
+  Names[RTLIB::TRUNC_F80] = "truncl";
+  Names[RTLIB::TRUNC_F128] = "truncl";
+  Names[RTLIB::TRUNC_PPCF128] = "truncl";
+  Names[RTLIB::RINT_F32] = "rintf";
+  Names[RTLIB::RINT_F64] = "rint";
+  Names[RTLIB::RINT_F80] = "rintl";
+  Names[RTLIB::RINT_F128] = "rintl";
+  Names[RTLIB::RINT_PPCF128] = "rintl";
+  Names[RTLIB::NEARBYINT_F32] = "nearbyintf";
+  Names[RTLIB::NEARBYINT_F64] = "nearbyint";
+  Names[RTLIB::NEARBYINT_F80] = "nearbyintl";
+  Names[RTLIB::NEARBYINT_F128] = "nearbyintl";
+  Names[RTLIB::NEARBYINT_PPCF128] = "nearbyintl";
+  Names[RTLIB::FLOOR_F32] = "floorf";
+  Names[RTLIB::FLOOR_F64] = "floor";
+  Names[RTLIB::FLOOR_F80] = "floorl";
+  Names[RTLIB::FLOOR_F128] = "floorl";
+  Names[RTLIB::FLOOR_PPCF128] = "floorl";
+  Names[RTLIB::COPYSIGN_F32] = "copysignf";
+  Names[RTLIB::COPYSIGN_F64] = "copysign";
+  Names[RTLIB::COPYSIGN_F80] = "copysignl";
+  Names[RTLIB::COPYSIGN_F128] = "copysignl";
+  Names[RTLIB::COPYSIGN_PPCF128] = "copysignl";
+  Names[RTLIB::FPEXT_F64_F128] = "__extenddftf2";
+  Names[RTLIB::FPEXT_F32_F128] = "__extendsftf2";
+  Names[RTLIB::FPEXT_F32_F64] = "__extendsfdf2";
+  Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee";
+  Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee";
+  Names[RTLIB::FPROUND_F64_F32] = "__truncdfsf2";
+  Names[RTLIB::FPROUND_F80_F32] = "__truncxfsf2";
+  Names[RTLIB::FPROUND_F128_F32] = "__trunctfsf2";
+  Names[RTLIB::FPROUND_PPCF128_F32] = "__trunctfsf2";
+  Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2";
+  Names[RTLIB::FPROUND_F128_F64] = "__trunctfdf2";
+  Names[RTLIB::FPROUND_PPCF128_F64] = "__trunctfdf2";
+  Names[RTLIB::FPTOSINT_F32_I8] = "__fixsfqi";
+  Names[RTLIB::FPTOSINT_F32_I16] = "__fixsfhi";
+  Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi";
+  Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi";
+  Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti";
+  Names[RTLIB::FPTOSINT_F64_I8] = "__fixdfqi";
+  Names[RTLIB::FPTOSINT_F64_I16] = "__fixdfhi";
+  Names[RTLIB::FPTOSINT_F64_I32] = "__fixdfsi";
+  Names[RTLIB::FPTOSINT_F64_I64] = "__fixdfdi";
+  Names[RTLIB::FPTOSINT_F64_I128] = "__fixdfti";
+  Names[RTLIB::FPTOSINT_F80_I32] = "__fixxfsi";
+  Names[RTLIB::FPTOSINT_F80_I64] = "__fixxfdi";
+  Names[RTLIB::FPTOSINT_F80_I128] = "__fixxfti";
+  Names[RTLIB::FPTOSINT_F128_I32] = "__fixtfsi";
+  Names[RTLIB::FPTOSINT_F128_I64] = "__fixtfdi";
+  Names[RTLIB::FPTOSINT_F128_I128] = "__fixtfti";
+  Names[RTLIB::FPTOSINT_PPCF128_I32] = "__fixtfsi";
+  Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi";
+  Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti";
+  Names[RTLIB::FPTOUINT_F32_I8] = "__fixunssfqi";
+  Names[RTLIB::FPTOUINT_F32_I16] = "__fixunssfhi";
+  Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi";
+  Names[RTLIB::FPTOUINT_F32_I64] = "__fixunssfdi";
+  Names[RTLIB::FPTOUINT_F32_I128] = "__fixunssfti";
+  Names[RTLIB::FPTOUINT_F64_I8] = "__fixunsdfqi";
+  Names[RTLIB::FPTOUINT_F64_I16] = "__fixunsdfhi";
+  Names[RTLIB::FPTOUINT_F64_I32] = "__fixunsdfsi";
+  Names[RTLIB::FPTOUINT_F64_I64] = "__fixunsdfdi";
+  Names[RTLIB::FPTOUINT_F64_I128] = "__fixunsdfti";
+  Names[RTLIB::FPTOUINT_F80_I32] = "__fixunsxfsi";
+  Names[RTLIB::FPTOUINT_F80_I64] = "__fixunsxfdi";
+  Names[RTLIB::FPTOUINT_F80_I128] = "__fixunsxfti";
+  Names[RTLIB::FPTOUINT_F128_I32] = "__fixunstfsi";
+  Names[RTLIB::FPTOUINT_F128_I64] = "__fixunstfdi";
+  Names[RTLIB::FPTOUINT_F128_I128] = "__fixunstfti";
+  Names[RTLIB::FPTOUINT_PPCF128_I32] = "__fixunstfsi";
+  Names[RTLIB::FPTOUINT_PPCF128_I64] = "__fixunstfdi";
+  Names[RTLIB::FPTOUINT_PPCF128_I128] = "__fixunstfti";
+  Names[RTLIB::SINTTOFP_I32_F32] = "__floatsisf";
+  Names[RTLIB::SINTTOFP_I32_F64] = "__floatsidf";
+  Names[RTLIB::SINTTOFP_I32_F80] = "__floatsixf";
+  Names[RTLIB::SINTTOFP_I32_F128] = "__floatsitf";
+  Names[RTLIB::SINTTOFP_I32_PPCF128] = "__floatsitf";
+  Names[RTLIB::SINTTOFP_I64_F32] = "__floatdisf";
+  Names[RTLIB::SINTTOFP_I64_F64] = "__floatdidf";
+  Names[RTLIB::SINTTOFP_I64_F80] = "__floatdixf";
+  Names[RTLIB::SINTTOFP_I64_F128] = "__floatditf";
+  Names[RTLIB::SINTTOFP_I64_PPCF128] = "__floatditf";
+  Names[RTLIB::SINTTOFP_I128_F32] = "__floattisf";
+  Names[RTLIB::SINTTOFP_I128_F64] = "__floattidf";
+  Names[RTLIB::SINTTOFP_I128_F80] = "__floattixf";
+  Names[RTLIB::SINTTOFP_I128_F128] = "__floattitf";
+  Names[RTLIB::SINTTOFP_I128_PPCF128] = "__floattitf";
+  Names[RTLIB::UINTTOFP_I32_F32] = "__floatunsisf";
+  Names[RTLIB::UINTTOFP_I32_F64] = "__floatunsidf";
+  Names[RTLIB::UINTTOFP_I32_F80] = "__floatunsixf";
+  Names[RTLIB::UINTTOFP_I32_F128] = "__floatunsitf";
+  Names[RTLIB::UINTTOFP_I32_PPCF128] = "__floatunsitf";
+  Names[RTLIB::UINTTOFP_I64_F32] = "__floatundisf";
+  Names[RTLIB::UINTTOFP_I64_F64] = "__floatundidf";
+  Names[RTLIB::UINTTOFP_I64_F80] = "__floatundixf";
+  Names[RTLIB::UINTTOFP_I64_F128] = "__floatunditf";
+  Names[RTLIB::UINTTOFP_I64_PPCF128] = "__floatunditf";
+  Names[RTLIB::UINTTOFP_I128_F32] = "__floatuntisf";
+  Names[RTLIB::UINTTOFP_I128_F64] = "__floatuntidf";
+  Names[RTLIB::UINTTOFP_I128_F80] = "__floatuntixf";
+  Names[RTLIB::UINTTOFP_I128_F128] = "__floatuntitf";
+  Names[RTLIB::UINTTOFP_I128_PPCF128] = "__floatuntitf";
+  Names[RTLIB::OEQ_F32] = "__eqsf2";
+  Names[RTLIB::OEQ_F64] = "__eqdf2";
+  Names[RTLIB::OEQ_F128] = "__eqtf2";
+  Names[RTLIB::UNE_F32] = "__nesf2";
+  Names[RTLIB::UNE_F64] = "__nedf2";
+  Names[RTLIB::UNE_F128] = "__netf2";
+  Names[RTLIB::OGE_F32] = "__gesf2";
+  Names[RTLIB::OGE_F64] = "__gedf2";
+  Names[RTLIB::OGE_F128] = "__getf2";
+  Names[RTLIB::OLT_F32] = "__ltsf2";
+  Names[RTLIB::OLT_F64] = "__ltdf2";
+  Names[RTLIB::OLT_F128] = "__lttf2";
+  Names[RTLIB::OLE_F32] = "__lesf2";
+  Names[RTLIB::OLE_F64] = "__ledf2";
+  Names[RTLIB::OLE_F128] = "__letf2";
+  Names[RTLIB::OGT_F32] = "__gtsf2";
+  Names[RTLIB::OGT_F64] = "__gtdf2";
+  Names[RTLIB::OGT_F128] = "__gttf2";
+  Names[RTLIB::UO_F32] = "__unordsf2";
+  Names[RTLIB::UO_F64] = "__unorddf2";
+  Names[RTLIB::UO_F128] = "__unordtf2";
+  Names[RTLIB::O_F32] = "__unordsf2";
+  Names[RTLIB::O_F64] = "__unorddf2";
+  Names[RTLIB::O_F128] = "__unordtf2";
+  Names[RTLIB::MEMCPY] = "memcpy";
+  Names[RTLIB::MEMMOVE] = "memmove";
+  Names[RTLIB::MEMSET] = "memset";
+  Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8";
+  Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1";
+  Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2";
+  Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4";
+  Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8";
+  Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1";
+  Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2";
+  Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4";
+  Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8";
+  
+  if (Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU) {
+    Names[RTLIB::SINCOS_F32] = "sincosf";
+    Names[RTLIB::SINCOS_F64] = "sincos";
+    Names[RTLIB::SINCOS_F80] = "sincosl";
+    Names[RTLIB::SINCOS_F128] = "sincosl";
+    Names[RTLIB::SINCOS_PPCF128] = "sincosl";
+  } else {
+    // These are generally not available.
+    Names[RTLIB::SINCOS_F32] = 0;
+    Names[RTLIB::SINCOS_F64] = 0;
+    Names[RTLIB::SINCOS_F80] = 0;
+    Names[RTLIB::SINCOS_F128] = 0;
+    Names[RTLIB::SINCOS_PPCF128] = 0;
+  }
+}
+
+/// InitLibcallCallingConvs - Set default libcall CallingConvs.
+///
+static void InitLibcallCallingConvs(CallingConv::ID *CCs) {
+  for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
+    CCs[i] = CallingConv::C;
+  }
+}
+
+/// getFPEXT - Return the FPEXT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::f32) {
+    if (RetVT == MVT::f64)
+      return FPEXT_F32_F64;
+    if (RetVT == MVT::f128)
+      return FPEXT_F32_F128;
+  } else if (OpVT == MVT::f64) {
+    if (RetVT == MVT::f128)
+      return FPEXT_F64_F128;
+  }
+
+  return UNKNOWN_LIBCALL;
+}
+
+/// getFPROUND - Return the FPROUND_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) {
+  if (RetVT == MVT::f32) {
+    if (OpVT == MVT::f64)
+      return FPROUND_F64_F32;
+    if (OpVT == MVT::f80)
+      return FPROUND_F80_F32;
+    if (OpVT == MVT::f128)
+      return FPROUND_F128_F32;
+    if (OpVT == MVT::ppcf128)
+      return FPROUND_PPCF128_F32;
+  } else if (RetVT == MVT::f64) {
+    if (OpVT == MVT::f80)
+      return FPROUND_F80_F64;
+    if (OpVT == MVT::f128)
+      return FPROUND_F128_F64;
+    if (OpVT == MVT::ppcf128)
+      return FPROUND_PPCF128_F64;
+  }
+
+  return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::f32) {
+    if (RetVT == MVT::i8)
+      return FPTOSINT_F32_I8;
+    if (RetVT == MVT::i16)
+      return FPTOSINT_F32_I16;
+    if (RetVT == MVT::i32)
+      return FPTOSINT_F32_I32;
+    if (RetVT == MVT::i64)
+      return FPTOSINT_F32_I64;
+    if (RetVT == MVT::i128)
+      return FPTOSINT_F32_I128;
+  } else if (OpVT == MVT::f64) {
+    if (RetVT == MVT::i8)
+      return FPTOSINT_F64_I8;
+    if (RetVT == MVT::i16)
+      return FPTOSINT_F64_I16;
+    if (RetVT == MVT::i32)
+      return FPTOSINT_F64_I32;
+    if (RetVT == MVT::i64)
+      return FPTOSINT_F64_I64;
+    if (RetVT == MVT::i128)
+      return FPTOSINT_F64_I128;
+  } else if (OpVT == MVT::f80) {
+    if (RetVT == MVT::i32)
+      return FPTOSINT_F80_I32;
+    if (RetVT == MVT::i64)
+      return FPTOSINT_F80_I64;
+    if (RetVT == MVT::i128)
+      return FPTOSINT_F80_I128;
+  } else if (OpVT == MVT::f128) {
+    if (RetVT == MVT::i32)
+      return FPTOSINT_F128_I32;
+    if (RetVT == MVT::i64)
+      return FPTOSINT_F128_I64;
+    if (RetVT == MVT::i128)
+      return FPTOSINT_F128_I128;
+  } else if (OpVT == MVT::ppcf128) {
+    if (RetVT == MVT::i32)
+      return FPTOSINT_PPCF128_I32;
+    if (RetVT == MVT::i64)
+      return FPTOSINT_PPCF128_I64;
+    if (RetVT == MVT::i128)
+      return FPTOSINT_PPCF128_I128;
+  }
+  return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::f32) {
+    if (RetVT == MVT::i8)
+      return FPTOUINT_F32_I8;
+    if (RetVT == MVT::i16)
+      return FPTOUINT_F32_I16;
+    if (RetVT == MVT::i32)
+      return FPTOUINT_F32_I32;
+    if (RetVT == MVT::i64)
+      return FPTOUINT_F32_I64;
+    if (RetVT == MVT::i128)
+      return FPTOUINT_F32_I128;
+  } else if (OpVT == MVT::f64) {
+    if (RetVT == MVT::i8)
+      return FPTOUINT_F64_I8;
+    if (RetVT == MVT::i16)
+      return FPTOUINT_F64_I16;
+    if (RetVT == MVT::i32)
+      return FPTOUINT_F64_I32;
+    if (RetVT == MVT::i64)
+      return FPTOUINT_F64_I64;
+    if (RetVT == MVT::i128)
+      return FPTOUINT_F64_I128;
+  } else if (OpVT == MVT::f80) {
+    if (RetVT == MVT::i32)
+      return FPTOUINT_F80_I32;
+    if (RetVT == MVT::i64)
+      return FPTOUINT_F80_I64;
+    if (RetVT == MVT::i128)
+      return FPTOUINT_F80_I128;
+  } else if (OpVT == MVT::f128) {
+    if (RetVT == MVT::i32)
+      return FPTOUINT_F128_I32;
+    if (RetVT == MVT::i64)
+      return FPTOUINT_F128_I64;
+    if (RetVT == MVT::i128)
+      return FPTOUINT_F128_I128;
+  } else if (OpVT == MVT::ppcf128) {
+    if (RetVT == MVT::i32)
+      return FPTOUINT_PPCF128_I32;
+    if (RetVT == MVT::i64)
+      return FPTOUINT_PPCF128_I64;
+    if (RetVT == MVT::i128)
+      return FPTOUINT_PPCF128_I128;
+  }
+  return UNKNOWN_LIBCALL;
+}
+
+/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::i32) {
+    if (RetVT == MVT::f32)
+      return SINTTOFP_I32_F32;
+    if (RetVT == MVT::f64)
+      return SINTTOFP_I32_F64;
+    if (RetVT == MVT::f80)
+      return SINTTOFP_I32_F80;
+    if (RetVT == MVT::f128)
+      return SINTTOFP_I32_F128;
+    if (RetVT == MVT::ppcf128)
+      return SINTTOFP_I32_PPCF128;
+  } else if (OpVT == MVT::i64) {
+    if (RetVT == MVT::f32)
+      return SINTTOFP_I64_F32;
+    if (RetVT == MVT::f64)
+      return SINTTOFP_I64_F64;
+    if (RetVT == MVT::f80)
+      return SINTTOFP_I64_F80;
+    if (RetVT == MVT::f128)
+      return SINTTOFP_I64_F128;
+    if (RetVT == MVT::ppcf128)
+      return SINTTOFP_I64_PPCF128;
+  } else if (OpVT == MVT::i128) {
+    if (RetVT == MVT::f32)
+      return SINTTOFP_I128_F32;
+    if (RetVT == MVT::f64)
+      return SINTTOFP_I128_F64;
+    if (RetVT == MVT::f80)
+      return SINTTOFP_I128_F80;
+    if (RetVT == MVT::f128)
+      return SINTTOFP_I128_F128;
+    if (RetVT == MVT::ppcf128)
+      return SINTTOFP_I128_PPCF128;
+  }
+  return UNKNOWN_LIBCALL;
+}
+
+/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
+  if (OpVT == MVT::i32) {
+    if (RetVT == MVT::f32)
+      return UINTTOFP_I32_F32;
+    if (RetVT == MVT::f64)
+      return UINTTOFP_I32_F64;
+    if (RetVT == MVT::f80)
+      return UINTTOFP_I32_F80;
+    if (RetVT == MVT::f128)
+      return UINTTOFP_I32_F128;
+    if (RetVT == MVT::ppcf128)
+      return UINTTOFP_I32_PPCF128;
+  } else if (OpVT == MVT::i64) {
+    if (RetVT == MVT::f32)
+      return UINTTOFP_I64_F32;
+    if (RetVT == MVT::f64)
+      return UINTTOFP_I64_F64;
+    if (RetVT == MVT::f80)
+      return UINTTOFP_I64_F80;
+    if (RetVT == MVT::f128)
+      return UINTTOFP_I64_F128;
+    if (RetVT == MVT::ppcf128)
+      return UINTTOFP_I64_PPCF128;
+  } else if (OpVT == MVT::i128) {
+    if (RetVT == MVT::f32)
+      return UINTTOFP_I128_F32;
+    if (RetVT == MVT::f64)
+      return UINTTOFP_I128_F64;
+    if (RetVT == MVT::f80)
+      return UINTTOFP_I128_F80;
+    if (RetVT == MVT::f128)
+      return UINTTOFP_I128_F128;
+    if (RetVT == MVT::ppcf128)
+      return UINTTOFP_I128_PPCF128;
+  }
+  return UNKNOWN_LIBCALL;
+}
+
+/// InitCmpLibcallCCs - Set default comparison libcall CC.
+///
+static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
+  memset(CCs, ISD::SETCC_INVALID, sizeof(ISD::CondCode)*RTLIB::UNKNOWN_LIBCALL);
+  CCs[RTLIB::OEQ_F32] = ISD::SETEQ;
+  CCs[RTLIB::OEQ_F64] = ISD::SETEQ;
+  CCs[RTLIB::OEQ_F128] = ISD::SETEQ;
+  CCs[RTLIB::UNE_F32] = ISD::SETNE;
+  CCs[RTLIB::UNE_F64] = ISD::SETNE;
+  CCs[RTLIB::UNE_F128] = ISD::SETNE;
+  CCs[RTLIB::OGE_F32] = ISD::SETGE;
+  CCs[RTLIB::OGE_F64] = ISD::SETGE;
+  CCs[RTLIB::OGE_F128] = ISD::SETGE;
+  CCs[RTLIB::OLT_F32] = ISD::SETLT;
+  CCs[RTLIB::OLT_F64] = ISD::SETLT;
+  CCs[RTLIB::OLT_F128] = ISD::SETLT;
+  CCs[RTLIB::OLE_F32] = ISD::SETLE;
+  CCs[RTLIB::OLE_F64] = ISD::SETLE;
+  CCs[RTLIB::OLE_F128] = ISD::SETLE;
+  CCs[RTLIB::OGT_F32] = ISD::SETGT;
+  CCs[RTLIB::OGT_F64] = ISD::SETGT;
+  CCs[RTLIB::OGT_F128] = ISD::SETGT;
+  CCs[RTLIB::UO_F32] = ISD::SETNE;
+  CCs[RTLIB::UO_F64] = ISD::SETNE;
+  CCs[RTLIB::UO_F128] = ISD::SETNE;
+  CCs[RTLIB::O_F32] = ISD::SETEQ;
+  CCs[RTLIB::O_F64] = ISD::SETEQ;
+  CCs[RTLIB::O_F128] = ISD::SETEQ;
+}
+
+/// NOTE: The constructor takes ownership of TLOF.
+TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm,
+                                       const TargetLoweringObjectFile *tlof)
+  : TM(tm), TD(TM.getDataLayout()), TLOF(*tlof) {
+  // All operations default to being supported.
+  memset(OpActions, 0, sizeof(OpActions));
+  memset(LoadExtActions, 0, sizeof(LoadExtActions));
+  memset(TruncStoreActions, 0, sizeof(TruncStoreActions));
+  memset(IndexedModeActions, 0, sizeof(IndexedModeActions));
+  memset(CondCodeActions, 0, sizeof(CondCodeActions));
+
+  // Set default actions for various operations.
+  for (unsigned VT = 0; VT != (unsigned)MVT::LAST_VALUETYPE; ++VT) {
+    // Default all indexed load / store to expand.
+    for (unsigned IM = (unsigned)ISD::PRE_INC;
+         IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
+      setIndexedLoadAction(IM, (MVT::SimpleValueType)VT, Expand);
+      setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand);
+    }
+
+    // These operations default to expand.
+    setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
+  }
+
+  // Most targets ignore the @llvm.prefetch intrinsic.
+  setOperationAction(ISD::PREFETCH, MVT::Other, Expand);
+
+  // ConstantFP nodes default to expand.  Targets can either change this to
+  // Legal, in which case all fp constants are legal, or use isFPImmLegal()
+  // to optimize expansions for certain constants.
+  setOperationAction(ISD::ConstantFP, MVT::f16, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f80, Expand);
+  setOperationAction(ISD::ConstantFP, MVT::f128, Expand);
+
+  // These library functions default to expand.
+  setOperationAction(ISD::FLOG ,  MVT::f16, Expand);
+  setOperationAction(ISD::FLOG2,  MVT::f16, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f16, Expand);
+  setOperationAction(ISD::FEXP ,  MVT::f16, Expand);
+  setOperationAction(ISD::FEXP2,  MVT::f16, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::f16, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::f16, Expand);
+  setOperationAction(ISD::FCEIL,  MVT::f16, Expand);
+  setOperationAction(ISD::FRINT,  MVT::f16, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::f16, Expand);
+  setOperationAction(ISD::FLOG ,  MVT::f32, Expand);
+  setOperationAction(ISD::FLOG2,  MVT::f32, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
+  setOperationAction(ISD::FEXP ,  MVT::f32, Expand);
+  setOperationAction(ISD::FEXP2,  MVT::f32, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::f32, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::f32, Expand);
+  setOperationAction(ISD::FCEIL,  MVT::f32, Expand);
+  setOperationAction(ISD::FRINT,  MVT::f32, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::f32, Expand);
+  setOperationAction(ISD::FLOG ,  MVT::f64, Expand);
+  setOperationAction(ISD::FLOG2,  MVT::f64, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f64, Expand);
+  setOperationAction(ISD::FEXP ,  MVT::f64, Expand);
+  setOperationAction(ISD::FEXP2,  MVT::f64, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
+  setOperationAction(ISD::FCEIL,  MVT::f64, Expand);
+  setOperationAction(ISD::FRINT,  MVT::f64, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
+  setOperationAction(ISD::FLOG ,  MVT::f128, Expand);
+  setOperationAction(ISD::FLOG2,  MVT::f128, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f128, Expand);
+  setOperationAction(ISD::FEXP ,  MVT::f128, Expand);
+  setOperationAction(ISD::FEXP2,  MVT::f128, Expand);
+  setOperationAction(ISD::FFLOOR, MVT::f128, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::f128, Expand);
+  setOperationAction(ISD::FCEIL,  MVT::f128, Expand);
+  setOperationAction(ISD::FRINT,  MVT::f128, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
+
+  // Default ISD::TRAP to expand (which turns it into abort).
+  setOperationAction(ISD::TRAP, MVT::Other, Expand);
+
+  // On most systems, DEBUGTRAP and TRAP have no difference. The "Expand"
+  // here is to inform DAG Legalizer to replace DEBUGTRAP with TRAP.
+  //
+  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
+
+  IsLittleEndian = TD->isLittleEndian();
+  PointerTy = MVT::getIntegerVT(8*TD->getPointerSize(0));
+  memset(RegClassForVT, 0,MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*));
+  memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray));
+  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
+  MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
+    = MaxStoresPerMemmoveOptSize = 4;
+  UseUnderscoreSetJmp = false;
+  UseUnderscoreLongJmp = false;
+  SelectIsExpensive = false;
+  IntDivIsCheap = false;
+  Pow2DivIsCheap = false;
+  JumpIsExpensive = false;
+  PredictableSelectIsExpensive = false;
+  StackPointerRegisterToSaveRestore = 0;
+  ExceptionPointerRegister = 0;
+  ExceptionSelectorRegister = 0;
+  BooleanContents = UndefinedBooleanContent;
+  BooleanVectorContents = UndefinedBooleanContent;
+  SchedPreferenceInfo = Sched::ILP;
+  JumpBufSize = 0;
+  JumpBufAlignment = 0;
+  MinFunctionAlignment = 0;
+  PrefFunctionAlignment = 0;
+  PrefLoopAlignment = 0;
+  MinStackArgumentAlignment = 1;
+  ShouldFoldAtomicFences = false;
+  InsertFencesForAtomic = false;
+  SupportJumpTables = true;
+  MinimumJumpTableEntries = 4;
+
+  InitLibcallNames(LibcallRoutineNames, TM);
+  InitCmpLibcallCCs(CmpLibcallCCs);
+  InitLibcallCallingConvs(LibcallCallingConvs);
+}
+
+TargetLoweringBase::~TargetLoweringBase() {
+  delete &TLOF;
+}
+
+MVT TargetLoweringBase::getScalarShiftAmountTy(EVT LHSTy) const {
+  return MVT::getIntegerVT(8*TD->getPointerSize(0));
+}
+
+EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy) const {
+  assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
+  if (LHSTy.isVector())
+    return LHSTy;
+  return getScalarShiftAmountTy(LHSTy);
+}
+
+/// canOpTrap - Returns true if the operation can trap for the value type.
+/// VT must be a legal type.
+bool TargetLoweringBase::canOpTrap(unsigned Op, EVT VT) const {
+  assert(isTypeLegal(VT));
+  switch (Op) {
+  default:
+    return false;
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+    return true;
+  }
+}
+
+
+static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
+                                          unsigned &NumIntermediates,
+                                          MVT &RegisterVT,
+                                          TargetLoweringBase *TLI) {
+  // Figure out the right, legal destination reg to copy into.
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType();
+
+  unsigned NumVectorRegs = 1;
+
+  // FIXME: We don't support non-power-of-2-sized vectors for now.  Ideally we
+  // could break down into LHS/RHS like LegalizeDAG does.
+  if (!isPowerOf2_32(NumElts)) {
+    NumVectorRegs = NumElts;
+    NumElts = 1;
+  }
+
+  // Divide the input until we get to a supported size.  This will always
+  // end with a scalar if the target doesn't support vectors.
+  while (NumElts > 1 && !TLI->isTypeLegal(MVT::getVectorVT(EltTy, NumElts))) {
+    NumElts >>= 1;
+    NumVectorRegs <<= 1;
+  }
+
+  NumIntermediates = NumVectorRegs;
+
+  MVT NewVT = MVT::getVectorVT(EltTy, NumElts);
+  if (!TLI->isTypeLegal(NewVT))
+    NewVT = EltTy;
+  IntermediateVT = NewVT;
+
+  unsigned NewVTSize = NewVT.getSizeInBits();
+
+  // Convert sizes such as i33 to i64.
+  if (!isPowerOf2_32(NewVTSize))
+    NewVTSize = NextPowerOf2(NewVTSize);
+
+  MVT DestVT = TLI->getRegisterType(NewVT);
+  RegisterVT = DestVT;
+  if (EVT(DestVT).bitsLT(NewVT))    // Value is expanded, e.g. i64 -> i16.
+    return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());
+
+  // Otherwise, promotion or legal types use the same number of registers as
+  // the vector decimated to the appropriate level.
+  return NumVectorRegs;
+}
+
+/// isLegalRC - Return true if the value types that can be represented by the
+/// specified register class are all legal.
+bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const {
+  for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
+       I != E; ++I) {
+    if (isTypeLegal(*I))
+      return true;
+  }
+  return false;
+}
+
+/// findRepresentativeClass - Return the largest legal super-reg register class
+/// of the register class for the specified type and its associated "cost".
+std::pair<const TargetRegisterClass*, uint8_t>
+TargetLoweringBase::findRepresentativeClass(MVT VT) const {
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy];
+  if (!RC)
+    return std::make_pair(RC, 0);
+
+  // Compute the set of all super-register classes.
+  BitVector SuperRegRC(TRI->getNumRegClasses());
+  for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI)
+    SuperRegRC.setBitsInMask(RCI.getMask());
+
+  // Find the first legal register class with the largest spill size.
+  const TargetRegisterClass *BestRC = RC;
+  for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) {
+    const TargetRegisterClass *SuperRC = TRI->getRegClass(i);
+    // We want the largest possible spill size.
+    if (SuperRC->getSize() <= BestRC->getSize())
+      continue;
+    if (!isLegalRC(SuperRC))
+      continue;
+    BestRC = SuperRC;
+  }
+  return std::make_pair(BestRC, 1);
+}
+
+/// computeRegisterProperties - Once all of the register classes are added,
+/// this allows us to compute derived properties we expose.
+void TargetLoweringBase::computeRegisterProperties() {
+  assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE &&
+         "Too many value types for ValueTypeActions to hold!");
+
+  // Everything defaults to needing one register.
+  for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
+    NumRegistersForVT[i] = 1;
+    RegisterTypeForVT[i] = TransformToType[i] = (MVT::SimpleValueType)i;
+  }
+  // ...except isVoid, which doesn't need any registers.
+  NumRegistersForVT[MVT::isVoid] = 0;
+
+  // Find the largest integer register class.
+  unsigned LargestIntReg = MVT::LAST_INTEGER_VALUETYPE;
+  for (; RegClassForVT[LargestIntReg] == 0; --LargestIntReg)
+    assert(LargestIntReg != MVT::i1 && "No integer registers defined!");
+
+  // Every integer value type larger than this largest register takes twice as
+  // many registers to represent as the previous ValueType.
+  for (unsigned ExpandedReg = LargestIntReg + 1;
+       ExpandedReg <= MVT::LAST_INTEGER_VALUETYPE; ++ExpandedReg) {
+    NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1];
+    RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg;
+    TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1);
+    ValueTypeActions.setTypeAction((MVT::SimpleValueType)ExpandedReg,
+                                   TypeExpandInteger);
+  }
+
+  // Inspect all of the ValueType's smaller than the largest integer
+  // register to see which ones need promotion.
+  unsigned LegalIntReg = LargestIntReg;
+  for (unsigned IntReg = LargestIntReg - 1;
+       IntReg >= (unsigned)MVT::i1; --IntReg) {
+    MVT IVT = (MVT::SimpleValueType)IntReg;
+    if (isTypeLegal(IVT)) {
+      LegalIntReg = IntReg;
+    } else {
+      RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
+        (const MVT::SimpleValueType)LegalIntReg;
+      ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
+    }
+  }
+
+  // ppcf128 type is really two f64's.
+  if (!isTypeLegal(MVT::ppcf128)) {
+    NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
+    RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
+    TransformToType[MVT::ppcf128] = MVT::f64;
+    ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat);
+  }
+
+  // Decide how to handle f128. If the target does not have native f128 support,
+  // expand it to i128 and we will be generating soft float library calls.
+  if (!isTypeLegal(MVT::f128)) {
+    NumRegistersForVT[MVT::f128] = NumRegistersForVT[MVT::i128];
+    RegisterTypeForVT[MVT::f128] = RegisterTypeForVT[MVT::i128];
+    TransformToType[MVT::f128] = MVT::i128;
+    ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+  }
+
+  // Decide how to handle f64. If the target does not have native f64 support,
+  // expand it to i64 and we will be generating soft float library calls.
+  if (!isTypeLegal(MVT::f64)) {
+    NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64];
+    RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64];
+    TransformToType[MVT::f64] = MVT::i64;
+    ValueTypeActions.setTypeAction(MVT::f64, TypeSoftenFloat);
+  }
+
+  // Decide how to handle f32. If the target does not have native support for
+  // f32, promote it to f64 if it is legal. Otherwise, expand it to i32.
+  if (!isTypeLegal(MVT::f32)) {
+    if (isTypeLegal(MVT::f64)) {
+      NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::f64];
+      RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::f64];
+      TransformToType[MVT::f32] = MVT::f64;
+      ValueTypeActions.setTypeAction(MVT::f32, TypePromoteInteger);
+    } else {
+      NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32];
+      RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32];
+      TransformToType[MVT::f32] = MVT::i32;
+      ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat);
+    }
+  }
+
+  // Loop over all of the vector value types to see which need transformations.
+  for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE;
+       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+    MVT VT = (MVT::SimpleValueType)i;
+    if (isTypeLegal(VT)) continue;
+
+    // Determine if there is a legal wider type.  If so, we should promote to
+    // that wider vector type.
+    MVT EltVT = VT.getVectorElementType();
+    unsigned NElts = VT.getVectorNumElements();
+    if (NElts != 1 && !shouldSplitVectorElementType(EltVT)) {
+      bool IsLegalWiderType = false;
+      // First try to promote the elements of integer vectors. If no legal
+      // promotion was found, fallback to the widen-vector method.
+      for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+        MVT SVT = (MVT::SimpleValueType)nVT;
+        // Promote vectors of integers to vectors with the same number
+        // of elements, with a wider element type.
+        if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits()
+            && SVT.getVectorNumElements() == NElts &&
+            isTypeLegal(SVT) && SVT.getScalarType().isInteger()) {
+          TransformToType[i] = SVT;
+          RegisterTypeForVT[i] = SVT;
+          NumRegistersForVT[i] = 1;
+          ValueTypeActions.setTypeAction(VT, TypePromoteInteger);
+          IsLegalWiderType = true;
+          break;
+        }
+      }
+
+      if (IsLegalWiderType) continue;
+
+      // Try to widen the vector.
+      for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+        MVT SVT = (MVT::SimpleValueType)nVT;
+        if (SVT.getVectorElementType() == EltVT &&
+            SVT.getVectorNumElements() > NElts &&
+            isTypeLegal(SVT)) {
+          TransformToType[i] = SVT;
+          RegisterTypeForVT[i] = SVT;
+          NumRegistersForVT[i] = 1;
+          ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+          IsLegalWiderType = true;
+          break;
+        }
+      }
+      if (IsLegalWiderType) continue;
+    }
+
+    MVT IntermediateVT;
+    MVT RegisterVT;
+    unsigned NumIntermediates;
+    NumRegistersForVT[i] =
+      getVectorTypeBreakdownMVT(VT, IntermediateVT, NumIntermediates,
+                                RegisterVT, this);
+    RegisterTypeForVT[i] = RegisterVT;
+
+    MVT NVT = VT.getPow2VectorType();
+    if (NVT == VT) {
+      // Type is already a power of 2.  The default action is to split.
+      TransformToType[i] = MVT::Other;
+      unsigned NumElts = VT.getVectorNumElements();
+      ValueTypeActions.setTypeAction(VT,
+            NumElts > 1 ? TypeSplitVector : TypeScalarizeVector);
+    } else {
+      TransformToType[i] = NVT;
+      ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+    }
+  }
+
+  // Determine the 'representative' register class for each value type.
+  // An representative register class is the largest (meaning one which is
+  // not a sub-register class / subreg register class) legal register class for
+  // a group of value types. For example, on i386, i8, i16, and i32
+  // representative would be GR32; while on x86_64 it's GR64.
+  for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
+    const TargetRegisterClass* RRC;
+    uint8_t Cost;
+    tie(RRC, Cost) =  findRepresentativeClass((MVT::SimpleValueType)i);
+    RepRegClassForVT[i] = RRC;
+    RepRegClassCostForVT[i] = Cost;
+  }
+}
+
+EVT TargetLoweringBase::getSetCCResultType(EVT VT) const {
+  assert(!VT.isVector() && "No default SetCC type for vectors!");
+  return getPointerTy(0).SimpleTy;
+}
+
+MVT::SimpleValueType TargetLoweringBase::getCmpLibcallReturnType() const {
+  return MVT::i32; // return the default value
+}
+
+/// getVectorTypeBreakdown - Vector types are broken down into some number of
+/// legal first class types.  For example, MVT::v8f32 maps to 2 MVT::v4f32
+/// with Altivec or SSE1, or 8 promoted MVT::f64 values with the X86 FP stack.
+/// Similarly, MVT::v2i64 turns into 4 MVT::i32 values with both PPC and X86.
+///
+/// This method returns the number of registers needed, and the VT for each
+/// register.  It also returns the VT and quantity of the intermediate values
+/// before they are promoted/expanded.
+///
+unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
+                                                EVT &IntermediateVT,
+                                                unsigned &NumIntermediates,
+                                                MVT &RegisterVT) const {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // If there is a wider vector type with the same element type as this one,
+  // or a promoted vector type that has the same number of elements which
+  // are wider, then we should convert to that legal vector type.
+  // This handles things like <2 x float> -> <4 x float> and
+  // <4 x i1> -> <4 x i32>.
+  LegalizeTypeAction TA = getTypeAction(Context, VT);
+  if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) {
+    EVT RegisterEVT = getTypeToTransformTo(Context, VT);
+    if (isTypeLegal(RegisterEVT)) {
+      IntermediateVT = RegisterEVT;
+      RegisterVT = RegisterEVT.getSimpleVT();
+      NumIntermediates = 1;
+      return 1;
+    }
+  }
+
+  // Figure out the right, legal destination reg to copy into.
+  EVT EltTy = VT.getVectorElementType();
+
+  unsigned NumVectorRegs = 1;
+
+  // FIXME: We don't support non-power-of-2-sized vectors for now.  Ideally we
+  // could break down into LHS/RHS like LegalizeDAG does.
+  if (!isPowerOf2_32(NumElts)) {
+    NumVectorRegs = NumElts;
+    NumElts = 1;
+  }
+
+  // Divide the input until we get to a supported size.  This will always
+  // end with a scalar if the target doesn't support vectors.
+  while (NumElts > 1 && !isTypeLegal(
+                                   EVT::getVectorVT(Context, EltTy, NumElts))) {
+    NumElts >>= 1;
+    NumVectorRegs <<= 1;
+  }
+
+  NumIntermediates = NumVectorRegs;
+
+  EVT NewVT = EVT::getVectorVT(Context, EltTy, NumElts);
+  if (!isTypeLegal(NewVT))
+    NewVT = EltTy;
+  IntermediateVT = NewVT;
+
+  MVT DestVT = getRegisterType(Context, NewVT);
+  RegisterVT = DestVT;
+  unsigned NewVTSize = NewVT.getSizeInBits();
+
+  // Convert sizes such as i33 to i64.
+  if (!isPowerOf2_32(NewVTSize))
+    NewVTSize = NextPowerOf2(NewVTSize);
+
+  if (EVT(DestVT).bitsLT(NewVT))   // Value is expanded, e.g. i64 -> i16.
+    return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());
+
+  // Otherwise, promotion or legal types use the same number of registers as
+  // the vector decimated to the appropriate level.
+  return NumVectorRegs;
+}
+
+/// Get the EVTs and ArgFlags collections that represent the legalized return
+/// type of the given function.  This does not require a DAG or a return value,
+/// and is suitable for use before any DAGs for the function are constructed.
+/// TODO: Move this out of TargetLowering.cpp.
+void llvm::GetReturnInfo(Type* ReturnType, AttributeSet attr,
+                         SmallVectorImpl<ISD::OutputArg> &Outs,
+                         const TargetLowering &TLI) {
+  SmallVector<EVT, 4> ValueVTs;
+  ComputeValueVTs(TLI, ReturnType, ValueVTs);
+  unsigned NumValues = ValueVTs.size();
+  if (NumValues == 0) return;
+
+  for (unsigned j = 0, f = NumValues; j != f; ++j) {
+    EVT VT = ValueVTs[j];
+    ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+    if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+      ExtendKind = ISD::SIGN_EXTEND;
+    else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+      ExtendKind = ISD::ZERO_EXTEND;
+
+    // FIXME: C calling convention requires the return type to be promoted to
+    // at least 32-bit. But this is not necessary for non-C calling
+    // conventions. The frontend should mark functions whose return values
+    // require promoting with signext or zeroext attributes.
+    if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) {
+      MVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32);
+      if (VT.bitsLT(MinVT))
+        VT = MinVT;
+    }
+
+    unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT);
+    MVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT);
+
+    // 'inreg' on function refers to return value
+    ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+    if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg))
+      Flags.setInReg();
+
+    // Propagate extension type if any
+    if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+      Flags.setSExt();
+    else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+      Flags.setZExt();
+
+    for (unsigned i = 0; i < NumParts; ++i)
+      Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true, 0, 0));
+  }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area.  This is the actual
+/// alignment, not its logarithm.
+unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty) const {
+  return TD->getCallFrameTypeAlignment(Ty);
+}
+
+//===----------------------------------------------------------------------===//
+//  TargetTransformInfo Helpers
+//===----------------------------------------------------------------------===//
+
+int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
+  enum InstructionOpcodes {
+#define HANDLE_INST(NUM, OPCODE, CLASS) OPCODE = NUM,
+#define LAST_OTHER_INST(NUM) InstructionOpcodesCount = NUM
+#include "llvm/IR/Instruction.def"
+  };
+  switch (static_cast<InstructionOpcodes>(Opcode)) {
+  case Ret:            return 0;
+  case Br:             return 0;
+  case Switch:         return 0;
+  case IndirectBr:     return 0;
+  case Invoke:         return 0;
+  case Resume:         return 0;
+  case Unreachable:    return 0;
+  case Add:            return ISD::ADD;
+  case FAdd:           return ISD::FADD;
+  case Sub:            return ISD::SUB;
+  case FSub:           return ISD::FSUB;
+  case Mul:            return ISD::MUL;
+  case FMul:           return ISD::FMUL;
+  case UDiv:           return ISD::UDIV;
+  case SDiv:           return ISD::UDIV;
+  case FDiv:           return ISD::FDIV;
+  case URem:           return ISD::UREM;
+  case SRem:           return ISD::SREM;
+  case FRem:           return ISD::FREM;
+  case Shl:            return ISD::SHL;
+  case LShr:           return ISD::SRL;
+  case AShr:           return ISD::SRA;
+  case And:            return ISD::AND;
+  case Or:             return ISD::OR;
+  case Xor:            return ISD::XOR;
+  case Alloca:         return 0;
+  case Load:           return ISD::LOAD;
+  case Store:          return ISD::STORE;
+  case GetElementPtr:  return 0;
+  case Fence:          return 0;
+  case AtomicCmpXchg:  return 0;
+  case AtomicRMW:      return 0;
+  case Trunc:          return ISD::TRUNCATE;
+  case ZExt:           return ISD::ZERO_EXTEND;
+  case SExt:           return ISD::SIGN_EXTEND;
+  case FPToUI:         return ISD::FP_TO_UINT;
+  case FPToSI:         return ISD::FP_TO_SINT;
+  case UIToFP:         return ISD::UINT_TO_FP;
+  case SIToFP:         return ISD::SINT_TO_FP;
+  case FPTrunc:        return ISD::FP_ROUND;
+  case FPExt:          return ISD::FP_EXTEND;
+  case PtrToInt:       return ISD::BITCAST;
+  case IntToPtr:       return ISD::BITCAST;
+  case BitCast:        return ISD::BITCAST;
+  case ICmp:           return ISD::SETCC;
+  case FCmp:           return ISD::SETCC;
+  case PHI:            return 0;
+  case Call:           return 0;
+  case Select:         return ISD::SELECT;
+  case UserOp1:        return 0;
+  case UserOp2:        return 0;
+  case VAArg:          return 0;
+  case ExtractElement: return ISD::EXTRACT_VECTOR_ELT;
+  case InsertElement:  return ISD::INSERT_VECTOR_ELT;
+  case ShuffleVector:  return ISD::VECTOR_SHUFFLE;
+  case ExtractValue:   return ISD::MERGE_VALUES;
+  case InsertValue:    return ISD::MERGE_VALUES;
+  case LandingPad:     return 0;
+  }
+
+  llvm_unreachable("Unknown instruction type encountered!");
+}
+
+std::pair<unsigned, MVT>
+TargetLoweringBase::getTypeLegalizationCost(Type *Ty) const {
+  LLVMContext &C = Ty->getContext();
+  EVT MTy = getValueType(Ty);
+
+  unsigned Cost = 1;
+  // We keep legalizing the type until we find a legal kind. We assume that
+  // the only operation that costs anything is the split. After splitting
+  // we need to handle two types.
+  while (true) {
+    LegalizeKind LK = getTypeConversion(C, MTy);
+
+    if (LK.first == TypeLegal)
+      return std::make_pair(Cost, MTy.getSimpleVT());
+
+    if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger)
+      Cost *= 2;
+
+    // Keep legalizing the type.
+    MTy = LK.second;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Loop Strength Reduction hooks
+//===----------------------------------------------------------------------===//
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool TargetLoweringBase::isLegalAddressingMode(const AddrMode &AM,
+                                           Type *Ty) const {
+  // The default implementation of this implements a conservative RISCy, r+r and
+  // r+i addr mode.
+
+  // Allows a sign-extended 16-bit immediate field.
+  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+    return false;
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // Only support r+r,
+  switch (AM.Scale) {
+  case 0:  // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
new file mode 100644
index 000000000000..3bdca4c64078
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -0,0 +1,784 @@
+//===-- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes used to handle lowerings specific to common
+// object file formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+using namespace dwarf;
+
+//===----------------------------------------------------------------------===//
+//                                  ELF
+//===----------------------------------------------------------------------===//
+
+MCSymbol *
+TargetLoweringObjectFileELF::getCFIPersonalitySymbol(const GlobalValue *GV,
+                                                     Mangler *Mang,
+                                                MachineModuleInfo *MMI) const {
+  unsigned Encoding = getPersonalityEncoding();
+  switch (Encoding & 0x70) {
+  default:
+    report_fatal_error("We do not support this DWARF encoding yet!");
+  case dwarf::DW_EH_PE_absptr:
+    return  Mang->getSymbol(GV);
+  case dwarf::DW_EH_PE_pcrel: {
+    return getContext().GetOrCreateSymbol(StringRef("DW.ref.") +
+                                          Mang->getSymbol(GV)->getName());
+  }
+  }
+}
+
+void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer,
+                                                       const TargetMachine &TM,
+                                                       const MCSymbol *Sym) const {
+  SmallString<64> NameData("DW.ref.");
+  NameData += Sym->getName();
+  MCSymbol *Label = getContext().GetOrCreateSymbol(NameData);
+  Streamer.EmitSymbolAttribute(Label, MCSA_Hidden);
+  Streamer.EmitSymbolAttribute(Label, MCSA_Weak);
+  StringRef Prefix = ".data.";
+  NameData.insert(NameData.begin(), Prefix.begin(), Prefix.end());
+  unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP;
+  const MCSection *Sec = getContext().getELFSection(NameData,
+                                                    ELF::SHT_PROGBITS,
+                                                    Flags,
+                                                    SectionKind::getDataRel(),
+                                                    0, Label->getName());
+  unsigned Size = TM.getDataLayout()->getPointerSize();
+  Streamer.SwitchSection(Sec);
+  Streamer.EmitValueToAlignment(TM.getDataLayout()->getPointerABIAlignment());
+  Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject);
+  const MCExpr *E = MCConstantExpr::Create(Size, getContext());
+  Streamer.EmitELFSize(Label, E);
+  Streamer.EmitLabel(Label);
+
+  Streamer.EmitSymbolValue(Sym, Size);
+}
+
+const MCExpr *TargetLoweringObjectFileELF::
+getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI, unsigned Encoding,
+                        MCStreamer &Streamer) const {
+
+  if (Encoding & dwarf::DW_EH_PE_indirect) {
+    MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    SmallString<128> Name;
+    Mang->getNameWithPrefix(Name, GV, true);
+    Name += ".DW.stub";
+
+    // Add information about the stub reference to ELFMMI so that the stub
+    // gets emitted by the asmprinter.
+    MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
+    MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
+    if (StubSym.getPointer() == 0) {
+      MCSymbol *Sym = Mang->getSymbol(GV);
+      StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+    }
+
+    return TargetLoweringObjectFile::
+      getTTypeReference(MCSymbolRefExpr::Create(SSym, getContext()),
+                        Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+  }
+
+  return TargetLoweringObjectFile::
+    getTTypeGlobalReference(GV, Mang, MMI, Encoding, Streamer);
+}
+
+static SectionKind
+getELFKindForNamedSection(StringRef Name, SectionKind K) {
+  // N.B.: The defaults used in here are no the same ones used in MC.
+  // We follow gcc, MC follows gas. For example, given ".section .eh_frame",
+  // both gas and MC will produce a section with no flags. Given
+  // section(".eh_frame") gcc will produce:
+  //
+  //   .section   .eh_frame,"a",@progbits
+  if (Name.empty() || Name[0] != '.') return K;
+
+  // Some lame default implementation based on some magic section names.
+  if (Name == ".bss" ||
+      Name.startswith(".bss.") ||
+      Name.startswith(".gnu.linkonce.b.") ||
+      Name.startswith(".llvm.linkonce.b.") ||
+      Name == ".sbss" ||
+      Name.startswith(".sbss.") ||
+      Name.startswith(".gnu.linkonce.sb.") ||
+      Name.startswith(".llvm.linkonce.sb."))
+    return SectionKind::getBSS();
+
+  if (Name == ".tdata" ||
+      Name.startswith(".tdata.") ||
+      Name.startswith(".gnu.linkonce.td.") ||
+      Name.startswith(".llvm.linkonce.td."))
+    return SectionKind::getThreadData();
+
+  if (Name == ".tbss" ||
+      Name.startswith(".tbss.") ||
+      Name.startswith(".gnu.linkonce.tb.") ||
+      Name.startswith(".llvm.linkonce.tb."))
+    return SectionKind::getThreadBSS();
+
+  return K;
+}
+
+
+static unsigned getELFSectionType(StringRef Name, SectionKind K) {
+
+  if (Name == ".init_array")
+    return ELF::SHT_INIT_ARRAY;
+
+  if (Name == ".fini_array")
+    return ELF::SHT_FINI_ARRAY;
+
+  if (Name == ".preinit_array")
+    return ELF::SHT_PREINIT_ARRAY;
+
+  if (K.isBSS() || K.isThreadBSS())
+    return ELF::SHT_NOBITS;
+
+  return ELF::SHT_PROGBITS;
+}
+
+
+static unsigned
+getELFSectionFlags(SectionKind K) {
+  unsigned Flags = 0;
+
+  if (!K.isMetadata())
+    Flags |= ELF::SHF_ALLOC;
+
+  if (K.isText())
+    Flags |= ELF::SHF_EXECINSTR;
+
+  if (K.isWriteable())
+    Flags |= ELF::SHF_WRITE;
+
+  if (K.isThreadLocal())
+    Flags |= ELF::SHF_TLS;
+
+  // K.isMergeableConst() is left out to honour PR4650
+  if (K.isMergeableCString() || K.isMergeableConst4() ||
+      K.isMergeableConst8() || K.isMergeableConst16())
+    Flags |= ELF::SHF_MERGE;
+
+  if (K.isMergeableCString())
+    Flags |= ELF::SHF_STRINGS;
+
+  return Flags;
+}
+
+
+const MCSection *TargetLoweringObjectFileELF::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+                         Mangler *Mang, const TargetMachine &TM) const {
+  StringRef SectionName = GV->getSection();
+
+  // Infer section flags from the section name if we can.
+  Kind = getELFKindForNamedSection(SectionName, Kind);
+
+  return getContext().getELFSection(SectionName,
+                                    getELFSectionType(SectionName, Kind),
+                                    getELFSectionFlags(Kind), Kind);
+}
+
+/// getSectionPrefixForGlobal - Return the section prefix name used by options
+/// FunctionsSections and DataSections.
+static const char *getSectionPrefixForGlobal(SectionKind Kind) {
+  if (Kind.isText())                 return ".text.";
+  if (Kind.isReadOnly())             return ".rodata.";
+  if (Kind.isBSS())                  return ".bss.";
+
+  if (Kind.isThreadData())           return ".tdata.";
+  if (Kind.isThreadBSS())            return ".tbss.";
+
+  if (Kind.isDataNoRel())            return ".data.";
+  if (Kind.isDataRelLocal())         return ".data.rel.local.";
+  if (Kind.isDataRel())              return ".data.rel.";
+  if (Kind.isReadOnlyWithRelLocal()) return ".data.rel.ro.local.";
+
+  assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+  return ".data.rel.ro.";
+}
+
+
+const MCSection *TargetLoweringObjectFileELF::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+  // If we have -ffunction-section or -fdata-section then we should emit the
+  // global value to a uniqued section specifically for it.
+  bool EmitUniquedSection;
+  if (Kind.isText())
+    EmitUniquedSection = TM.getFunctionSections();
+  else
+    EmitUniquedSection = TM.getDataSections();
+
+  // If this global is linkonce/weak and the target handles this by emitting it
+  // into a 'uniqued' section name, create and return the section now.
+  if ((GV->isWeakForLinker() || EmitUniquedSection) &&
+      !Kind.isCommon()) {
+    const char *Prefix;
+    Prefix = getSectionPrefixForGlobal(Kind);
+
+    SmallString<128> Name(Prefix, Prefix+strlen(Prefix));
+    MCSymbol *Sym = Mang->getSymbol(GV);
+    Name.append(Sym->getName().begin(), Sym->getName().end());
+    StringRef Group = "";
+    unsigned Flags = getELFSectionFlags(Kind);
+    if (GV->isWeakForLinker()) {
+      Group = Sym->getName();
+      Flags |= ELF::SHF_GROUP;
+    }
+
+    return getContext().getELFSection(Name.str(),
+                                      getELFSectionType(Name.str(), Kind),
+                                      Flags, Kind, 0, Group);
+  }
+
+  if (Kind.isText()) return TextSection;
+
+  if (Kind.isMergeable1ByteCString() ||
+      Kind.isMergeable2ByteCString() ||
+      Kind.isMergeable4ByteCString()) {
+
+    // We also need alignment here.
+    // FIXME: this is getting the alignment of the character, not the
+    // alignment of the global!
+    unsigned Align =
+      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV));
+
+    const char *SizeSpec = ".rodata.str1.";
+    if (Kind.isMergeable2ByteCString())
+      SizeSpec = ".rodata.str2.";
+    else if (Kind.isMergeable4ByteCString())
+      SizeSpec = ".rodata.str4.";
+    else
+      assert(Kind.isMergeable1ByteCString() && "unknown string width");
+
+
+    std::string Name = SizeSpec + utostr(Align);
+    return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
+                                      ELF::SHF_ALLOC |
+                                      ELF::SHF_MERGE |
+                                      ELF::SHF_STRINGS,
+                                      Kind);
+  }
+
+  if (Kind.isMergeableConst()) {
+    if (Kind.isMergeableConst4() && MergeableConst4Section)
+      return MergeableConst4Section;
+    if (Kind.isMergeableConst8() && MergeableConst8Section)
+      return MergeableConst8Section;
+    if (Kind.isMergeableConst16() && MergeableConst16Section)
+      return MergeableConst16Section;
+    return ReadOnlySection;  // .const
+  }
+
+  if (Kind.isReadOnly())             return ReadOnlySection;
+
+  if (Kind.isThreadData())           return TLSDataSection;
+  if (Kind.isThreadBSS())            return TLSBSSSection;
+
+  // Note: we claim that common symbols are put in BSSSection, but they are
+  // really emitted with the magic .comm directive, which creates a symbol table
+  // entry but not a section.
+  if (Kind.isBSS() || Kind.isCommon()) return BSSSection;
+
+  if (Kind.isDataNoRel())            return DataSection;
+  if (Kind.isDataRelLocal())         return DataRelLocalSection;
+  if (Kind.isDataRel())              return DataRelSection;
+  if (Kind.isReadOnlyWithRelLocal()) return DataRelROLocalSection;
+
+  assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+  return DataRelROSection;
+}
+
+/// getSectionForConstant - Given a mergeable constant with the
+/// specified size and relocation information, return a section that it
+/// should be placed in.
+const MCSection *TargetLoweringObjectFileELF::
+getSectionForConstant(SectionKind Kind) const {
+  if (Kind.isMergeableConst4() && MergeableConst4Section)
+    return MergeableConst4Section;
+  if (Kind.isMergeableConst8() && MergeableConst8Section)
+    return MergeableConst8Section;
+  if (Kind.isMergeableConst16() && MergeableConst16Section)
+    return MergeableConst16Section;
+  if (Kind.isReadOnly())
+    return ReadOnlySection;
+
+  if (Kind.isReadOnlyWithRelLocal()) return DataRelROLocalSection;
+  assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+  return DataRelROSection;
+}
+
+const MCSection *
+TargetLoweringObjectFileELF::getStaticCtorSection(unsigned Priority) const {
+  // The default scheme is .ctor / .dtor, so we have to invert the priority
+  // numbering.
+  if (Priority == 65535)
+    return StaticCtorSection;
+
+  if (UseInitArray) {
+    std::string Name = std::string(".init_array.") + utostr(Priority);
+    return getContext().getELFSection(Name, ELF::SHT_INIT_ARRAY,
+                                      ELF::SHF_ALLOC | ELF::SHF_WRITE,
+                                      SectionKind::getDataRel());
+  } else {
+    std::string Name = std::string(".ctors.") + utostr(65535 - Priority);
+    return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
+                                      ELF::SHF_ALLOC |ELF::SHF_WRITE,
+                                      SectionKind::getDataRel());
+  }
+}
+
+const MCSection *
+TargetLoweringObjectFileELF::getStaticDtorSection(unsigned Priority) const {
+  // The default scheme is .ctor / .dtor, so we have to invert the priority
+  // numbering.
+  if (Priority == 65535)
+    return StaticDtorSection;
+
+  if (UseInitArray) {
+    std::string Name = std::string(".fini_array.") + utostr(Priority);
+    return getContext().getELFSection(Name, ELF::SHT_FINI_ARRAY,
+                                      ELF::SHF_ALLOC | ELF::SHF_WRITE,
+                                      SectionKind::getDataRel());
+  } else {
+    std::string Name = std::string(".dtors.") + utostr(65535 - Priority);
+    return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
+                                      ELF::SHF_ALLOC |ELF::SHF_WRITE,
+                                      SectionKind::getDataRel());
+  }
+}
+
+void
+TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) {
+  UseInitArray = UseInitArray_;
+  if (!UseInitArray)
+    return;
+
+  StaticCtorSection =
+    getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY,
+                               ELF::SHF_WRITE |
+                               ELF::SHF_ALLOC,
+                               SectionKind::getDataRel());
+  StaticDtorSection =
+    getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY,
+                               ELF::SHF_WRITE |
+                               ELF::SHF_ALLOC,
+                               SectionKind::getDataRel());
+}
+
+//===----------------------------------------------------------------------===//
+//                                 MachO
+//===----------------------------------------------------------------------===//
+
+/// emitModuleFlags - Perform code emission for module flags.
+void TargetLoweringObjectFileMachO::
+emitModuleFlags(MCStreamer &Streamer,
+                ArrayRef<Module::ModuleFlagEntry> ModuleFlags,
+                Mangler *Mang, const TargetMachine &TM) const {
+  unsigned VersionVal = 0;
+  unsigned ImageInfoFlags = 0;
+  MDNode *LinkerOptions = 0;
+  StringRef SectionVal;
+
+  for (ArrayRef<Module::ModuleFlagEntry>::iterator
+         i = ModuleFlags.begin(), e = ModuleFlags.end(); i != e; ++i) {
+    const Module::ModuleFlagEntry &MFE = *i;
+
+    // Ignore flags with 'Require' behavior.
+    if (MFE.Behavior == Module::Require)
+      continue;
+
+    StringRef Key = MFE.Key->getString();
+    Value *Val = MFE.Val;
+
+    if (Key == "Objective-C Image Info Version") {
+      VersionVal = cast<ConstantInt>(Val)->getZExtValue();
+    } else if (Key == "Objective-C Garbage Collection" ||
+               Key == "Objective-C GC Only" ||
+               Key == "Objective-C Is Simulated") {
+      ImageInfoFlags |= cast<ConstantInt>(Val)->getZExtValue();
+    } else if (Key == "Objective-C Image Info Section") {
+      SectionVal = cast<MDString>(Val)->getString();
+    } else if (Key == "Linker Options") {
+      LinkerOptions = cast<MDNode>(Val);
+    }
+  }
+
+  // Emit the linker options if present.
+  if (LinkerOptions) {
+    for (unsigned i = 0, e = LinkerOptions->getNumOperands(); i != e; ++i) {
+      MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i));
+      SmallVector<std::string, 4> StrOptions;
+
+      // Convert to strings.
+      for (unsigned ii = 0, ie = MDOptions->getNumOperands(); ii != ie; ++ii) {
+        MDString *MDOption = cast<MDString>(MDOptions->getOperand(ii));
+        StrOptions.push_back(MDOption->getString());
+      }
+
+      Streamer.EmitLinkerOptions(StrOptions);
+    }
+  }
+
+  // The section is mandatory. If we don't have it, then we don't have GC info.
+  if (SectionVal.empty()) return;
+
+  StringRef Segment, Section;
+  unsigned TAA = 0, StubSize = 0;
+  bool TAAParsed;
+  std::string ErrorCode =
+    MCSectionMachO::ParseSectionSpecifier(SectionVal, Segment, Section,
+                                          TAA, TAAParsed, StubSize);
+  if (!ErrorCode.empty())
+    // If invalid, report the error with report_fatal_error.
+    report_fatal_error("Invalid section specifier '" + Section + "': " +
+                       ErrorCode + ".");
+
+  // Get the section.
+  const MCSectionMachO *S =
+    getContext().getMachOSection(Segment, Section, TAA, StubSize,
+                                 SectionKind::getDataNoRel());
+  Streamer.SwitchSection(S);
+  Streamer.EmitLabel(getContext().
+                     GetOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO")));
+  Streamer.EmitIntValue(VersionVal, 4);
+  Streamer.EmitIntValue(ImageInfoFlags, 4);
+  Streamer.AddBlankLine();
+}
+
+const MCSection *TargetLoweringObjectFileMachO::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+                         Mangler *Mang, const TargetMachine &TM) const {
+  // Parse the section specifier and create it if valid.
+  StringRef Segment, Section;
+  unsigned TAA = 0, StubSize = 0;
+  bool TAAParsed;
+  std::string ErrorCode =
+    MCSectionMachO::ParseSectionSpecifier(GV->getSection(), Segment, Section,
+                                          TAA, TAAParsed, StubSize);
+  if (!ErrorCode.empty()) {
+    // If invalid, report the error with report_fatal_error.
+    report_fatal_error("Global variable '" + GV->getName() +
+                       "' has an invalid section specifier '" +
+                       GV->getSection() + "': " + ErrorCode + ".");
+  }
+
+  // Get the section.
+  const MCSectionMachO *S =
+    getContext().getMachOSection(Segment, Section, TAA, StubSize, Kind);
+
+  // If TAA wasn't set by ParseSectionSpecifier() above,
+  // use the value returned by getMachOSection() as a default.
+  if (!TAAParsed)
+    TAA = S->getTypeAndAttributes();
+
+  // Okay, now that we got the section, verify that the TAA & StubSize agree.
+  // If the user declared multiple globals with different section flags, we need
+  // to reject it here.
+  if (S->getTypeAndAttributes() != TAA || S->getStubSize() != StubSize) {
+    // If invalid, report the error with report_fatal_error.
+    report_fatal_error("Global variable '" + GV->getName() +
+                       "' section type or attributes does not match previous"
+                       " section specifier");
+  }
+
+  return S;
+}
+
+const MCSection *TargetLoweringObjectFileMachO::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+
+  // Handle thread local data.
+  if (Kind.isThreadBSS()) return TLSBSSSection;
+  if (Kind.isThreadData()) return TLSDataSection;
+
+  if (Kind.isText())
+    return GV->isWeakForLinker() ? TextCoalSection : TextSection;
+
+  // If this is weak/linkonce, put this in a coalescable section, either in text
+  // or data depending on if it is writable.
+  if (GV->isWeakForLinker()) {
+    if (Kind.isReadOnly())
+      return ConstTextCoalSection;
+    return DataCoalSection;
+  }
+
+  // FIXME: Alignment check should be handled by section classifier.
+  if (Kind.isMergeable1ByteCString() &&
+      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
+    return CStringSection;
+
+  // Do not put 16-bit arrays in the UString section if they have an
+  // externally visible label, this runs into issues with certain linker
+  // versions.
+  if (Kind.isMergeable2ByteCString() && !GV->hasExternalLinkage() &&
+      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
+    return UStringSection;
+
+  if (Kind.isMergeableConst()) {
+    if (Kind.isMergeableConst4())
+      return FourByteConstantSection;
+    if (Kind.isMergeableConst8())
+      return EightByteConstantSection;
+    if (Kind.isMergeableConst16() && SixteenByteConstantSection)
+      return SixteenByteConstantSection;
+  }
+
+  // Otherwise, if it is readonly, but not something we can specially optimize,
+  // just drop it in .const.
+  if (Kind.isReadOnly())
+    return ReadOnlySection;
+
+  // If this is marked const, put it into a const section.  But if the dynamic
+  // linker needs to write to it, put it in the data segment.
+  if (Kind.isReadOnlyWithRel())
+    return ConstDataSection;
+
+  // Put zero initialized globals with strong external linkage in the
+  // DATA, __common section with the .zerofill directive.
+  if (Kind.isBSSExtern())
+    return DataCommonSection;
+
+  // Put zero initialized globals with local linkage in __DATA,__bss directive
+  // with the .zerofill directive (aka .lcomm).
+  if (Kind.isBSSLocal())
+    return DataBSSSection;
+
+  // Otherwise, just drop the variable in the normal data section.
+  return DataSection;
+}
+
+const MCSection *
+TargetLoweringObjectFileMachO::getSectionForConstant(SectionKind Kind) const {
+  // If this constant requires a relocation, we have to put it in the data
+  // segment, not in the text segment.
+  if (Kind.isDataRel() || Kind.isReadOnlyWithRel())
+    return ConstDataSection;
+
+  if (Kind.isMergeableConst4())
+    return FourByteConstantSection;
+  if (Kind.isMergeableConst8())
+    return EightByteConstantSection;
+  if (Kind.isMergeableConst16() && SixteenByteConstantSection)
+    return SixteenByteConstantSection;
+  return ReadOnlySection;  // .const
+}
+
+/// shouldEmitUsedDirectiveFor - This hook allows targets to selectively decide
+/// not to emit the UsedDirective for some symbols in llvm.used.
+// FIXME: REMOVE this (rdar://7071300)
+bool TargetLoweringObjectFileMachO::
+shouldEmitUsedDirectiveFor(const GlobalValue *GV, Mangler *Mang) const {
+  /// On Darwin, internally linked data beginning with "L" or "l" does not have
+  /// the directive emitted (this occurs in ObjC metadata).
+  if (!GV) return false;
+
+  // Check whether the mangled name has the "Private" or "LinkerPrivate" prefix.
+  if (GV->hasLocalLinkage() && !isa<Function>(GV)) {
+    // FIXME: ObjC metadata is currently emitted as internal symbols that have
+    // \1L and \0l prefixes on them.  Fix them to be Private/LinkerPrivate and
+    // this horrible hack can go away.
+    MCSymbol *Sym = Mang->getSymbol(GV);
+    if (Sym->getName()[0] == 'L' || Sym->getName()[0] == 'l')
+      return false;
+  }
+
+  return true;
+}
+
+const MCExpr *TargetLoweringObjectFileMachO::
+getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI, unsigned Encoding,
+                        MCStreamer &Streamer) const {
+  // The mach-o version of this method defaults to returning a stub reference.
+
+  if (Encoding & DW_EH_PE_indirect) {
+    MachineModuleInfoMachO &MachOMMI =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    SmallString<128> Name;
+    Mang->getNameWithPrefix(Name, GV, true);
+    Name += "$non_lazy_ptr";
+
+    // Add information about the stub reference to MachOMMI so that the stub
+    // gets emitted by the asmprinter.
+    MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      GV->hasHiddenVisibility() ? MachOMMI.getHiddenGVStubEntry(SSym) :
+                                  MachOMMI.getGVStubEntry(SSym);
+    if (StubSym.getPointer() == 0) {
+      MCSymbol *Sym = Mang->getSymbol(GV);
+      StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+    }
+
+    return TargetLoweringObjectFile::
+      getTTypeReference(MCSymbolRefExpr::Create(SSym, getContext()),
+                        Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+  }
+
+  return TargetLoweringObjectFile::
+    getTTypeGlobalReference(GV, Mang, MMI, Encoding, Streamer);
+}
+
+MCSymbol *TargetLoweringObjectFileMachO::
+getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI) const {
+  // The mach-o version of this method defaults to returning a stub reference.
+  MachineModuleInfoMachO &MachOMMI =
+    MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+  SmallString<128> Name;
+  Mang->getNameWithPrefix(Name, GV, true);
+  Name += "$non_lazy_ptr";
+
+  // Add information about the stub reference to MachOMMI so that the stub
+  // gets emitted by the asmprinter.
+  MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
+  MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
+  if (StubSym.getPointer() == 0) {
+    MCSymbol *Sym = Mang->getSymbol(GV);
+    StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+  }
+
+  return SSym;
+}
+
+//===----------------------------------------------------------------------===//
+//                                  COFF
+//===----------------------------------------------------------------------===//
+
+static unsigned
+getCOFFSectionFlags(SectionKind K) {
+  unsigned Flags = 0;
+
+  if (K.isMetadata())
+    Flags |=
+      COFF::IMAGE_SCN_MEM_DISCARDABLE;
+  else if (K.isText())
+    Flags |=
+      COFF::IMAGE_SCN_MEM_EXECUTE |
+      COFF::IMAGE_SCN_MEM_READ |
+      COFF::IMAGE_SCN_CNT_CODE;
+  else if (K.isBSS ())
+    Flags |=
+      COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
+      COFF::IMAGE_SCN_MEM_READ |
+      COFF::IMAGE_SCN_MEM_WRITE;
+  else if (K.isThreadLocal())
+    Flags |=
+      COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+      COFF::IMAGE_SCN_MEM_READ |
+      COFF::IMAGE_SCN_MEM_WRITE;
+  else if (K.isReadOnly())
+    Flags |=
+      COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+      COFF::IMAGE_SCN_MEM_READ;
+  else if (K.isWriteable())
+    Flags |=
+      COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+      COFF::IMAGE_SCN_MEM_READ |
+      COFF::IMAGE_SCN_MEM_WRITE;
+
+  return Flags;
+}
+
+const MCSection *TargetLoweringObjectFileCOFF::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+                         Mangler *Mang, const TargetMachine &TM) const {
+  int Selection = 0;
+  unsigned Characteristics = getCOFFSectionFlags(Kind);
+  SmallString<128> Name(GV->getSection().c_str());
+  if (GV->isWeakForLinker()) {
+    Selection = COFF::IMAGE_COMDAT_SELECT_ANY;
+    Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+    MCSymbol *Sym = Mang->getSymbol(GV);
+    Name.append("$");
+    Name.append(Sym->getName().begin() + 1, Sym->getName().end());
+  }
+  return getContext().getCOFFSection(Name,
+                                     Characteristics,
+                                     Selection,
+                                     Kind);
+}
+
+static const char *getCOFFSectionPrefixForUniqueGlobal(SectionKind Kind) {
+  if (Kind.isText())
+    return ".text$";
+  if (Kind.isBSS ())
+    return ".bss$";
+  if (Kind.isThreadLocal()) {
+    // 'LLVM' is just an arbitary string to ensure that the section name gets
+    // sorted in between '.tls$AAA' and '.tls$ZZZ' by the linker.
+    return ".tls$LLVM";
+  }
+  if (Kind.isWriteable())
+    return ".data$";
+  return ".rdata$";
+}
+
+
+const MCSection *TargetLoweringObjectFileCOFF::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+
+  // If this global is linkonce/weak and the target handles this by emitting it
+  // into a 'uniqued' section name, create and return the section now.
+  if (GV->isWeakForLinker()) {
+    const char *Prefix = getCOFFSectionPrefixForUniqueGlobal(Kind);
+    SmallString<128> Name(Prefix, Prefix+strlen(Prefix));
+    MCSymbol *Sym = Mang->getSymbol(GV);
+    Name.append(Sym->getName().begin() + 1, Sym->getName().end());
+
+    unsigned Characteristics = getCOFFSectionFlags(Kind);
+
+    Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+
+    return getContext().getCOFFSection(Name.str(), Characteristics,
+                          COFF::IMAGE_COMDAT_SELECT_ANY, Kind);
+  }
+
+  if (Kind.isText())
+    return getTextSection();
+
+  if (Kind.isThreadLocal())
+    return getTLSDataSection();
+
+  return getDataSection();
+}
+
diff --git a/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp
new file mode 100644
index 000000000000..0f59d0169e18
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp
@@ -0,0 +1,52 @@
+//===-- TargetOptionsImpl.cpp - Options that apply to all targets ----------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the methods in the TargetOptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+/// DisableFramePointerElim - This returns true if frame pointer elimination
+/// optimization should be disabled for the given machine function.
+bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
+  // Check to see if we should eliminate non-leaf frame pointers and then
+  // check to see if we should eliminate all frame pointers.
+  if (NoFramePointerElimNonLeaf && !NoFramePointerElim) {
+    const MachineFrameInfo *MFI = MF.getFrameInfo();
+    return MFI->hasCalls();
+  }
+
+  return NoFramePointerElim;
+}
+
+/// LessPreciseFPMAD - This flag return true when -enable-fp-mad option
+/// is specified on the command line.  When this flag is off(default), the
+/// code generator is not allowed to generate mad (multiply add) if the
+/// result is "less precise" than doing those operations individually.
+bool TargetOptions::LessPreciseFPMAD() const {
+  return UnsafeFPMath || LessPreciseFPMADOption;
+}
+
+/// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
+/// that the rounding mode of the FPU can change from its default.
+bool TargetOptions::HonorSignDependentRoundingFPMath() const {
+  return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
+}
+
+/// getTrapFunctionName - If this returns a non-empty string, this means isel
+/// should lower Intrinsic::trap to a call to the specified function name
+/// instead of an ISD::TRAP node.
+StringRef TargetOptions::getTrapFunctionName() const {
+  return TrapFuncName;
+}
+
diff --git a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
new file mode 100644
index 000000000000..84b4bfc33221
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -0,0 +1,285 @@
+//===- TargetRegisterInfo.cpp - Target Register Information Implementation ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetRegisterInfo interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
+                             regclass_iterator RCB, regclass_iterator RCE,
+                             const char *const *SRINames,
+                             const unsigned *SRILaneMasks)
+  : InfoDesc(ID), SubRegIndexNames(SRINames),
+    SubRegIndexLaneMasks(SRILaneMasks),
+    RegClassBegin(RCB), RegClassEnd(RCE) {
+}
+
+TargetRegisterInfo::~TargetRegisterInfo() {}
+
+void PrintReg::print(raw_ostream &OS) const {
+  if (!Reg)
+    OS << "%noreg";
+  else if (TargetRegisterInfo::isStackSlot(Reg))
+    OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg);
+  else if (TargetRegisterInfo::isVirtualRegister(Reg))
+    OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg);
+  else if (TRI && Reg < TRI->getNumRegs())
+    OS << '%' << TRI->getName(Reg);
+  else
+    OS << "%physreg" << Reg;
+  if (SubIdx) {
+    if (TRI)
+      OS << ':' << TRI->getSubRegIndexName(SubIdx);
+    else
+      OS << ":sub(" << SubIdx << ')';
+  }
+}
+
+void PrintRegUnit::print(raw_ostream &OS) const {
+  // Generic printout when TRI is missing.
+  if (!TRI) {
+    OS << "Unit~" << Unit;
+    return;
+  }
+
+  // Check for invalid register units.
+  if (Unit >= TRI->getNumRegUnits()) {
+    OS << "BadUnit~" << Unit;
+    return;
+  }
+
+  // Normal units have at least one root.
+  MCRegUnitRootIterator Roots(Unit, TRI);
+  assert(Roots.isValid() && "Unit has no roots.");
+  OS << TRI->getName(*Roots);
+  for (++Roots; Roots.isValid(); ++Roots)
+    OS << '~' << TRI->getName(*Roots);
+}
+
+/// getAllocatableClass - Return the maximal subclass of the given register
+/// class that is alloctable, or NULL.
+const TargetRegisterClass *
+TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const {
+  if (!RC || RC->isAllocatable())
+    return RC;
+
+  const unsigned *SubClass = RC->getSubClassMask();
+  for (unsigned Base = 0, BaseE = getNumRegClasses();
+       Base < BaseE; Base += 32) {
+    unsigned Idx = Base;
+    for (unsigned Mask = *SubClass++; Mask; Mask >>= 1) {
+      unsigned Offset = CountTrailingZeros_32(Mask);
+      const TargetRegisterClass *SubRC = getRegClass(Idx + Offset);
+      if (SubRC->isAllocatable())
+        return SubRC;
+      Mask >>= Offset;
+      Idx += Offset + 1;
+    }
+  }
+  return NULL;
+}
+
+/// getMinimalPhysRegClass - Returns the Register Class of a physical
+/// register of the given type, picking the most sub register class of
+/// the right type that contains this physreg.
+const TargetRegisterClass *
+TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, EVT VT) const {
+  assert(isPhysicalRegister(reg) && "reg must be a physical register");
+
+  // Pick the most sub register class of the right type that contains
+  // this physreg.
+  const TargetRegisterClass* BestRC = 0;
+  for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){
+    const TargetRegisterClass* RC = *I;
+    if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&
+        (!BestRC || BestRC->hasSubClass(RC)))
+      BestRC = RC;
+  }
+
+  assert(BestRC && "Couldn't find the register class");
+  return BestRC;
+}
+
+/// getAllocatableSetForRC - Toggle the bits that represent allocatable
+/// registers for the specific register class.
+static void getAllocatableSetForRC(const MachineFunction &MF,
+                                   const TargetRegisterClass *RC, BitVector &R){
+  assert(RC->isAllocatable() && "invalid for nonallocatable sets");
+  ArrayRef<uint16_t> Order = RC->getRawAllocationOrder(MF);
+  for (unsigned i = 0; i != Order.size(); ++i)
+    R.set(Order[i]);
+}
+
+BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
+                                          const TargetRegisterClass *RC) const {
+  BitVector Allocatable(getNumRegs());
+  if (RC) {
+    // A register class with no allocatable subclass returns an empty set.
+    const TargetRegisterClass *SubClass = getAllocatableClass(RC);
+    if (SubClass)
+      getAllocatableSetForRC(MF, SubClass, Allocatable);
+  } else {
+    for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
+         E = regclass_end(); I != E; ++I)
+      if ((*I)->isAllocatable())
+        getAllocatableSetForRC(MF, *I, Allocatable);
+  }
+
+  // Mask out the reserved registers
+  BitVector Reserved = getReservedRegs(MF);
+  Allocatable &= Reserved.flip();
+
+  return Allocatable;
+}
+
+static inline
+const TargetRegisterClass *firstCommonClass(const uint32_t *A,
+                                            const uint32_t *B,
+                                            const TargetRegisterInfo *TRI) {
+  for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32)
+    if (unsigned Common = *A++ & *B++)
+      return TRI->getRegClass(I + CountTrailingZeros_32(Common));
+  return 0;
+}
+
+const TargetRegisterClass *
+TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
+                                      const TargetRegisterClass *B) const {
+  // First take care of the trivial cases.
+  if (A == B)
+    return A;
+  if (!A || !B)
+    return 0;
+
+  // Register classes are ordered topologically, so the largest common
+  // sub-class it the common sub-class with the smallest ID.
+  return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this);
+}
+
+const TargetRegisterClass *
+TargetRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                             const TargetRegisterClass *B,
+                                             unsigned Idx) const {
+  assert(A && B && "Missing register class");
+  assert(Idx && "Bad sub-register index");
+
+  // Find Idx in the list of super-register indices.
+  for (SuperRegClassIterator RCI(B, this); RCI.isValid(); ++RCI)
+    if (RCI.getSubReg() == Idx)
+      // The bit mask contains all register classes that are projected into B
+      // by Idx. Find a class that is also a sub-class of A.
+      return firstCommonClass(RCI.getMask(), A->getSubClassMask(), this);
+  return 0;
+}
+
+const TargetRegisterClass *TargetRegisterInfo::
+getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
+                       const TargetRegisterClass *RCB, unsigned SubB,
+                       unsigned &PreA, unsigned &PreB) const {
+  assert(RCA && SubA && RCB && SubB && "Invalid arguments");
+
+  // Search all pairs of sub-register indices that project into RCA and RCB
+  // respectively. This is quadratic, but usually the sets are very small. On
+  // most targets like X86, there will only be a single sub-register index
+  // (e.g., sub_16bit projecting into GR16).
+  //
+  // The worst case is a register class like DPR on ARM.
+  // We have indices dsub_0..dsub_7 projecting into that class.
+  //
+  // It is very common that one register class is a sub-register of the other.
+  // Arrange for RCA to be the larger register so the answer will be found in
+  // the first iteration. This makes the search linear for the most common
+  // case.
+  const TargetRegisterClass *BestRC = 0;
+  unsigned *BestPreA = &PreA;
+  unsigned *BestPreB = &PreB;
+  if (RCA->getSize() < RCB->getSize()) {
+    std::swap(RCA, RCB);
+    std::swap(SubA, SubB);
+    std::swap(BestPreA, BestPreB);
+  }
+
+  // Also terminate the search one we have found a register class as small as
+  // RCA.
+  unsigned MinSize = RCA->getSize();
+
+  for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) {
+    unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA);
+    for (SuperRegClassIterator IB(RCB, this, true); IB.isValid(); ++IB) {
+      // Check if a common super-register class exists for this index pair.
+      const TargetRegisterClass *RC =
+        firstCommonClass(IA.getMask(), IB.getMask(), this);
+      if (!RC || RC->getSize() < MinSize)
+        continue;
+
+      // The indexes must compose identically: PreA+SubA == PreB+SubB.
+      unsigned FinalB = composeSubRegIndices(IB.getSubReg(), SubB);
+      if (FinalA != FinalB)
+        continue;
+
+      // Is RC a better candidate than BestRC?
+      if (BestRC && RC->getSize() >= BestRC->getSize())
+        continue;
+
+      // Yes, RC is the smallest super-register seen so far.
+      BestRC = RC;
+      *BestPreA = IA.getSubReg();
+      *BestPreB = IB.getSubReg();
+
+      // Bail early if we reached MinSize. We won't find a better candidate.
+      if (BestRC->getSize() == MinSize)
+        return BestRC;
+    }
+  }
+  return BestRC;
+}
+
+// Compute target-independent register allocator hints to help eliminate copies.
+void
+TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
+                                          ArrayRef<MCPhysReg> Order,
+                                          SmallVectorImpl<MCPhysReg> &Hints,
+                                          const MachineFunction &MF,
+                                          const VirtRegMap *VRM) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
+
+  // Hints with HintType != 0 were set by target-dependent code.
+  // Such targets must provide their own implementation of
+  // TRI::getRegAllocationHints to interpret those hint types.
+  assert(Hint.first == 0 && "Target must implement TRI::getRegAllocationHints");
+
+  // Target-independent hints are either a physical or a virtual register.
+  unsigned Phys = Hint.second;
+  if (VRM && isVirtualRegister(Phys))
+    Phys = VRM->getPhys(Phys);
+
+  // Check that Phys is a valid hint in VirtReg's register class.
+  if (!isPhysicalRegister(Phys))
+    return;
+  if (MRI.isReserved(Phys))
+    return;
+  // Check that Phys is in the allocation order. We shouldn't heed hints
+  // from VirtReg's register class if they aren't in the allocation order. The
+  // target probably has a reason for removing the register.
+  if (std::find(Order.begin(), Order.end(), Phys) == Order.end())
+    return;
+
+  // All clear, tell the register allocator to prefer this register.
+  Hints.push_back(Phys);
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
new file mode 100644
index 000000000000..783bfa1c1a18
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -0,0 +1,309 @@
+//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a wrapper around MCSchedModel that allows the interface
+// to benefit from information currently only available in TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableSchedModel("schedmodel", cl::Hidden, cl::init(true),
+  cl::desc("Use TargetSchedModel for latency lookup"));
+
+static cl::opt<bool> EnableSchedItins("scheditins", cl::Hidden, cl::init(true),
+  cl::desc("Use InstrItineraryData for latency lookup"));
+
+bool TargetSchedModel::hasInstrSchedModel() const {
+  return EnableSchedModel && SchedModel.hasInstrSchedModel();
+}
+
+bool TargetSchedModel::hasInstrItineraries() const {
+  return EnableSchedItins && !InstrItins.isEmpty();
+}
+
+static unsigned gcd(unsigned Dividend, unsigned Divisor) {
+  // Dividend and Divisor will be naturally swapped as needed.
+  while(Divisor) {
+    unsigned Rem = Dividend % Divisor;
+    Dividend = Divisor;
+    Divisor = Rem;
+  };
+  return Dividend;
+}
+static unsigned lcm(unsigned A, unsigned B) {
+  unsigned LCM = (uint64_t(A) * B) / gcd(A, B);
+  assert((LCM >= A && LCM >= B) && "LCM overflow");
+  return LCM;
+}
+
+void TargetSchedModel::init(const MCSchedModel &sm,
+                            const TargetSubtargetInfo *sti,
+                            const TargetInstrInfo *tii) {
+  SchedModel = sm;
+  STI = sti;
+  TII = tii;
+  STI->initInstrItins(InstrItins);
+
+  unsigned NumRes = SchedModel.getNumProcResourceKinds();
+  ResourceFactors.resize(NumRes);
+  ResourceLCM = SchedModel.IssueWidth;
+  for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+    unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+    if (NumUnits > 0)
+      ResourceLCM = lcm(ResourceLCM, NumUnits);
+  }
+  MicroOpFactor = ResourceLCM / SchedModel.IssueWidth;
+  for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+    unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+    ResourceFactors[Idx] = NumUnits ? (ResourceLCM / NumUnits) : 0;
+  }
+}
+
+unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
+                                          const MCSchedClassDesc *SC) const {
+  if (hasInstrItineraries()) {
+    int UOps = InstrItins.getNumMicroOps(MI->getDesc().getSchedClass());
+    return (UOps >= 0) ? UOps : TII->getNumMicroOps(&InstrItins, MI);
+  }
+  if (hasInstrSchedModel()) {
+    if (!SC)
+      SC = resolveSchedClass(MI);
+    if (SC->isValid())
+      return SC->NumMicroOps;
+  }
+  return MI->isTransient() ? 0 : 1;
+}
+
+// The machine model may explicitly specify an invalid latency, which
+// effectively means infinite latency. Since users of the TargetSchedule API
+// don't know how to handle this, we convert it to a very large latency that is
+// easy to distinguish when debugging the DAG but won't induce overflow.
+static unsigned convertLatency(int Cycles) {
+  return Cycles >= 0 ? Cycles : 1000;
+}
+
+/// If we can determine the operand latency from the def only, without machine
+/// model or itinerary lookup, do so. Otherwise return -1.
+int TargetSchedModel::getDefLatency(const MachineInstr *DefMI,
+                                    bool FindMin) const {
+
+  // Return a latency based on the itinerary properties and defining instruction
+  // if possible. Some common subtargets don't require per-operand latency,
+  // especially for minimum latencies.
+  if (FindMin) {
+    // If MinLatency is invalid, then use the itinerary for MinLatency. If no
+    // itinerary exists either, then use single cycle latency.
+    if (SchedModel.MinLatency < 0 && !hasInstrItineraries()) {
+      return 1;
+    }
+    return SchedModel.MinLatency;
+  }
+  else if (!hasInstrSchedModel() && !hasInstrItineraries()) {
+    return TII->defaultDefLatency(&SchedModel, DefMI);
+  }
+  // ...operand lookup required
+  return -1;
+}
+
+/// Return the MCSchedClassDesc for this instruction. Some SchedClasses require
+/// evaluation of predicates that depend on instruction operands or flags.
+const MCSchedClassDesc *TargetSchedModel::
+resolveSchedClass(const MachineInstr *MI) const {
+
+  // Get the definition's scheduling class descriptor from this machine model.
+  unsigned SchedClass = MI->getDesc().getSchedClass();
+  const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+
+#ifndef NDEBUG
+  unsigned NIter = 0;
+#endif
+  while (SCDesc->isVariant()) {
+    assert(++NIter < 6 && "Variants are nested deeper than the magic number");
+
+    SchedClass = STI->resolveSchedClass(SchedClass, MI, this);
+    SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+  }
+  return SCDesc;
+}
+
+/// Find the def index of this operand. This index maps to the machine model and
+/// is independent of use operands. Def operands may be reordered with uses or
+/// merged with uses without affecting the def index (e.g. before/after
+/// regalloc). However, an instruction's def operands must never be reordered
+/// with respect to each other.
+static unsigned findDefIdx(const MachineInstr *MI, unsigned DefOperIdx) {
+  unsigned DefIdx = 0;
+  for (unsigned i = 0; i != DefOperIdx; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef())
+      ++DefIdx;
+  }
+  return DefIdx;
+}
+
+/// Find the use index of this operand. This is independent of the instruction's
+/// def operands.
+///
+/// Note that uses are not determined by the operand's isUse property, which
+/// is simply the inverse of isDef. Here we consider any readsReg operand to be
+/// a "use". The machine model allows an operand to be both a Def and Use.
+static unsigned findUseIdx(const MachineInstr *MI, unsigned UseOperIdx) {
+  unsigned UseIdx = 0;
+  for (unsigned i = 0; i != UseOperIdx; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.readsReg())
+      ++UseIdx;
+  }
+  return UseIdx;
+}
+
+// Top-level API for clients that know the operand indices.
+unsigned TargetSchedModel::computeOperandLatency(
+  const MachineInstr *DefMI, unsigned DefOperIdx,
+  const MachineInstr *UseMI, unsigned UseOperIdx,
+  bool FindMin) const {
+
+  int DefLatency = getDefLatency(DefMI, FindMin);
+  if (DefLatency >= 0)
+    return DefLatency;
+
+  if (hasInstrItineraries()) {
+    int OperLatency = 0;
+    if (UseMI) {
+      OperLatency =
+        TII->getOperandLatency(&InstrItins, DefMI, DefOperIdx, UseMI, UseOperIdx);
+    }
+    else {
+      unsigned DefClass = DefMI->getDesc().getSchedClass();
+      OperLatency = InstrItins.getOperandCycle(DefClass, DefOperIdx);
+    }
+    if (OperLatency >= 0)
+      return OperLatency;
+
+    // No operand latency was found.
+    unsigned InstrLatency = TII->getInstrLatency(&InstrItins, DefMI);
+
+    // Expected latency is the max of the stage latency and itinerary props.
+    // Rather than directly querying InstrItins stage latency, we call a TII
+    // hook to allow subtargets to specialize latency. This hook is only
+    // applicable to the InstrItins model. InstrSchedModel should model all
+    // special cases without TII hooks.
+    if (!FindMin)
+      InstrLatency = std::max(InstrLatency,
+                              TII->defaultDefLatency(&SchedModel, DefMI));
+    return InstrLatency;
+  }
+  assert(!FindMin && hasInstrSchedModel() &&
+         "Expected a SchedModel for this cpu");
+  const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
+  unsigned DefIdx = findDefIdx(DefMI, DefOperIdx);
+  if (DefIdx < SCDesc->NumWriteLatencyEntries) {
+    // Lookup the definition's write latency in SubtargetInfo.
+    const MCWriteLatencyEntry *WLEntry =
+      STI->getWriteLatencyEntry(SCDesc, DefIdx);
+    unsigned WriteID = WLEntry->WriteResourceID;
+    unsigned Latency = convertLatency(WLEntry->Cycles);
+    if (!UseMI)
+      return Latency;
+
+    // Lookup the use's latency adjustment in SubtargetInfo.
+    const MCSchedClassDesc *UseDesc = resolveSchedClass(UseMI);
+    if (UseDesc->NumReadAdvanceEntries == 0)
+      return Latency;
+    unsigned UseIdx = findUseIdx(UseMI, UseOperIdx);
+    return Latency - STI->getReadAdvanceCycles(UseDesc, UseIdx, WriteID);
+  }
+  // If DefIdx does not exist in the model (e.g. implicit defs), then return
+  // unit latency (defaultDefLatency may be too conservative).
+#ifndef NDEBUG
+  if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit()
+      && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()) {
+    std::string Err;
+    raw_string_ostream ss(Err);
+    ss << "DefIdx " << DefIdx << " exceeds machine model writes for "
+       << *DefMI;
+    report_fatal_error(ss.str());
+  }
+#endif
+  // FIXME: Automatically giving all implicit defs defaultDefLatency is
+  // undesirable. We should only do it for defs that are known to the MC
+  // desc like flags. Truly implicit defs should get 1 cycle latency.
+  return DefMI->isTransient() ? 0 : TII->defaultDefLatency(&SchedModel, DefMI);
+}
+
+unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const {
+  // For the itinerary model, fall back to the old subtarget hook.
+  // Allow subtargets to compute Bundle latencies outside the machine model.
+  if (hasInstrItineraries() || MI->isBundle())
+    return TII->getInstrLatency(&InstrItins, MI);
+
+  if (hasInstrSchedModel()) {
+    const MCSchedClassDesc *SCDesc = resolveSchedClass(MI);
+    if (SCDesc->isValid()) {
+      unsigned Latency = 0;
+      for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
+           DefIdx != DefEnd; ++DefIdx) {
+        // Lookup the definition's write latency in SubtargetInfo.
+        const MCWriteLatencyEntry *WLEntry =
+          STI->getWriteLatencyEntry(SCDesc, DefIdx);
+        Latency = std::max(Latency, convertLatency(WLEntry->Cycles));
+      }
+      return Latency;
+    }
+  }
+  return TII->defaultDefLatency(&SchedModel, MI);
+}
+
+unsigned TargetSchedModel::
+computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
+                     const MachineInstr *DepMI) const {
+  // MinLatency == -1 is for in-order processors that always have unit
+  // MinLatency. MinLatency > 0 is for in-order processors with varying min
+  // latencies, but since this is not a RAW dep, we always use unit latency.
+  if (SchedModel.MinLatency != 0)
+    return 1;
+
+  // MinLatency == 0 indicates an out-of-order processor that can dispatch
+  // WAW dependencies in the same cycle.
+
+  // Treat predication as a data dependency for out-of-order cpus. In-order
+  // cpus do not need to treat predicated writes specially.
+  //
+  // TODO: The following hack exists because predication passes do not
+  // correctly append imp-use operands, and readsReg() strangely returns false
+  // for predicated defs.
+  unsigned Reg = DefMI->getOperand(DefOperIdx).getReg();
+  const MachineFunction &MF = *DefMI->getParent()->getParent();
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(DepMI))
+    return computeInstrLatency(DefMI);
+
+  // If we have a per operand scheduling model, check if this def is writing
+  // an unbuffered resource. If so, it treated like an in-order cpu.
+  if (hasInstrSchedModel()) {
+    const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
+    if (SCDesc->isValid()) {
+      for (const MCWriteProcResEntry *PRI = STI->getWriteProcResBegin(SCDesc),
+             *PRE = STI->getWriteProcResEnd(SCDesc); PRI != PRE; ++PRI) {
+        if (!SchedModel.getProcResource(PRI->ProcResourceIdx)->IsBuffered)
+          return 1;
+      }
+    }
+  }
+  return 0;
+}
diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
new file mode 100644
index 000000000000..e6dfe104c82f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -0,0 +1,1678 @@
+//===-- TwoAddressInstructionPass.cpp - Two-Address instruction pass ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TwoAddress instruction pass which is used
+// by most register allocators. Two-Address instructions are rewritten
+// from:
+//
+//     A = B op C
+//
+// to:
+//
+//     A = B
+//     A op= C
+//
+// Note that if a register allocator chooses to use this pass, that it
+// has to be capable of handling the non-SSA nature of these rewritten
+// virtual registers.
+//
+// It is also worth noting that the duplicate operand of the two
+// address instruction is removed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "twoaddrinstr"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions");
+STATISTIC(NumCommuted        , "Number of instructions commuted to coalesce");
+STATISTIC(NumAggrCommuted    , "Number of instructions aggressively commuted");
+STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address");
+STATISTIC(Num3AddrSunk,        "Number of 3-address instructions sunk");
+STATISTIC(NumReSchedUps,       "Number of instructions re-scheduled up");
+STATISTIC(NumReSchedDowns,     "Number of instructions re-scheduled down");
+
+namespace {
+class TwoAddressInstructionPass : public MachineFunctionPass {
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const InstrItineraryData *InstrItins;
+  MachineRegisterInfo *MRI;
+  LiveVariables *LV;
+  LiveIntervals *LIS;
+  AliasAnalysis *AA;
+  CodeGenOpt::Level OptLevel;
+
+  // The current basic block being processed.
+  MachineBasicBlock *MBB;
+
+  // DistanceMap - Keep track the distance of a MI from the start of the
+  // current basic block.
+  DenseMap<MachineInstr*, unsigned> DistanceMap;
+
+  // Set of already processed instructions in the current block.
+  SmallPtrSet<MachineInstr*, 8> Processed;
+
+  // SrcRegMap - A map from virtual registers to physical registers which are
+  // likely targets to be coalesced to due to copies from physical registers to
+  // virtual registers. e.g. v1024 = move r0.
+  DenseMap<unsigned, unsigned> SrcRegMap;
+
+  // DstRegMap - A map from virtual registers to physical registers which are
+  // likely targets to be coalesced to due to copies to physical registers from
+  // virtual registers. e.g. r1 = move v1024.
+  DenseMap<unsigned, unsigned> DstRegMap;
+
+  bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg,
+                            MachineBasicBlock::iterator OldPos);
+
+  bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef);
+
+  bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
+                             MachineInstr *MI, unsigned Dist);
+
+  bool commuteInstruction(MachineBasicBlock::iterator &mi,
+                          unsigned RegB, unsigned RegC, unsigned Dist);
+
+  bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB);
+
+  bool convertInstTo3Addr(MachineBasicBlock::iterator &mi,
+                          MachineBasicBlock::iterator &nmi,
+                          unsigned RegA, unsigned RegB, unsigned Dist);
+
+  bool isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI);
+
+  bool rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
+                             MachineBasicBlock::iterator &nmi,
+                             unsigned Reg);
+  bool rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
+                             MachineBasicBlock::iterator &nmi,
+                             unsigned Reg);
+
+  bool tryInstructionTransform(MachineBasicBlock::iterator &mi,
+                               MachineBasicBlock::iterator &nmi,
+                               unsigned SrcIdx, unsigned DstIdx,
+                               unsigned Dist, bool shouldOnlyCommute);
+
+  void scanUses(unsigned DstReg);
+
+  void processCopy(MachineInstr *MI);
+
+  typedef SmallVector<std::pair<unsigned, unsigned>, 4> TiedPairList;
+  typedef SmallDenseMap<unsigned, TiedPairList> TiedOperandMap;
+  bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&);
+  void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist);
+  void eliminateRegSequence(MachineBasicBlock::iterator&);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  TwoAddressInstructionPass() : MachineFunctionPass(ID) {
+    initializeTwoAddressInstructionPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesCFG();
+    AU.addRequired<AliasAnalysis>();
+    AU.addPreserved<LiveVariables>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addPreservedID(MachineLoopInfoID);
+    AU.addPreservedID(MachineDominatorsID);
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  /// runOnMachineFunction - Pass entry point.
+  bool runOnMachineFunction(MachineFunction&);
+};
+} // end anonymous namespace
+
+char TwoAddressInstructionPass::ID = 0;
+INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction",
+                "Two-Address instruction pass", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction",
+                "Two-Address instruction pass", false, false)
+
+char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID;
+
+static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS);
+
+/// sink3AddrInstruction - A two-address instruction has been converted to a
+/// three-address instruction to avoid clobbering a register. Try to sink it
+/// past the instruction that would kill the above mentioned register to reduce
+/// register pressure.
+bool TwoAddressInstructionPass::
+sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
+                     MachineBasicBlock::iterator OldPos) {
+  // FIXME: Shouldn't we be trying to do this before we three-addressify the
+  // instruction?  After this transformation is done, we no longer need
+  // the instruction to be in three-address form.
+
+  // Check if it's safe to move this instruction.
+  bool SeenStore = true; // Be conservative.
+  if (!MI->isSafeToMove(TII, AA, SeenStore))
+    return false;
+
+  unsigned DefReg = 0;
+  SmallSet<unsigned, 4> UseRegs;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned MOReg = MO.getReg();
+    if (!MOReg)
+      continue;
+    if (MO.isUse() && MOReg != SavedReg)
+      UseRegs.insert(MO.getReg());
+    if (!MO.isDef())
+      continue;
+    if (MO.isImplicit())
+      // Don't try to move it if it implicitly defines a register.
+      return false;
+    if (DefReg)
+      // For now, don't move any instructions that define multiple registers.
+      return false;
+    DefReg = MO.getReg();
+  }
+
+  // Find the instruction that kills SavedReg.
+  MachineInstr *KillMI = NULL;
+  if (LIS) {
+    LiveInterval &LI = LIS->getInterval(SavedReg);
+    assert(LI.end() != LI.begin() &&
+           "Reg should not have empty live interval.");
+
+    SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot();
+    LiveInterval::const_iterator I = LI.find(MBBEndIdx);
+    if (I != LI.end() && I->start < MBBEndIdx)
+      return false;
+
+    --I;
+    KillMI = LIS->getInstructionFromIndex(I->end);
+  }
+  if (!KillMI) {
+    for (MachineRegisterInfo::use_nodbg_iterator
+           UI = MRI->use_nodbg_begin(SavedReg),
+           UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
+      MachineOperand &UseMO = UI.getOperand();
+      if (!UseMO.isKill())
+        continue;
+      KillMI = UseMO.getParent();
+      break;
+    }
+  }
+
+  // If we find the instruction that kills SavedReg, and it is in an
+  // appropriate location, we can try to sink the current instruction
+  // past it.
+  if (!KillMI || KillMI->getParent() != MBB || KillMI == MI ||
+      KillMI == OldPos || KillMI->isTerminator())
+    return false;
+
+  // If any of the definitions are used by another instruction between the
+  // position and the kill use, then it's not safe to sink it.
+  //
+  // FIXME: This can be sped up if there is an easy way to query whether an
+  // instruction is before or after another instruction. Then we can use
+  // MachineRegisterInfo def / use instead.
+  MachineOperand *KillMO = NULL;
+  MachineBasicBlock::iterator KillPos = KillMI;
+  ++KillPos;
+
+  unsigned NumVisited = 0;
+  for (MachineBasicBlock::iterator I = llvm::next(OldPos); I != KillPos; ++I) {
+    MachineInstr *OtherMI = I;
+    // DBG_VALUE cannot be counted against the limit.
+    if (OtherMI->isDebugValue())
+      continue;
+    if (NumVisited > 30)  // FIXME: Arbitrary limit to reduce compile time cost.
+      return false;
+    ++NumVisited;
+    for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = OtherMI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned MOReg = MO.getReg();
+      if (!MOReg)
+        continue;
+      if (DefReg == MOReg)
+        return false;
+
+      if (MO.isKill() || (LIS && isPlainlyKilled(OtherMI, MOReg, LIS))) {
+        if (OtherMI == KillMI && MOReg == SavedReg)
+          // Save the operand that kills the register. We want to unset the kill
+          // marker if we can sink MI past it.
+          KillMO = &MO;
+        else if (UseRegs.count(MOReg))
+          // One of the uses is killed before the destination.
+          return false;
+      }
+    }
+  }
+  assert(KillMO && "Didn't find kill");
+
+  if (!LIS) {
+    // Update kill and LV information.
+    KillMO->setIsKill(false);
+    KillMO = MI->findRegisterUseOperand(SavedReg, false, TRI);
+    KillMO->setIsKill(true);
+
+    if (LV)
+      LV->replaceKillInstruction(SavedReg, KillMI, MI);
+  }
+
+  // Move instruction to its destination.
+  MBB->remove(MI);
+  MBB->insert(KillPos, MI);
+
+  if (LIS)
+    LIS->handleMove(MI);
+
+  ++Num3AddrSunk;
+  return true;
+}
+
+/// noUseAfterLastDef - Return true if there are no intervening uses between the
+/// last instruction in the MBB that defines the specified register and the
+/// two-address instruction which is being processed. It also returns the last
+/// def location by reference
+bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist,
+                                                  unsigned &LastDef) {
+  LastDef = 0;
+  unsigned LastUse = Dist;
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg),
+         E = MRI->reg_end(); I != E; ++I) {
+    MachineOperand &MO = I.getOperand();
+    MachineInstr *MI = MO.getParent();
+    if (MI->getParent() != MBB || MI->isDebugValue())
+      continue;
+    DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+    if (DI == DistanceMap.end())
+      continue;
+    if (MO.isUse() && DI->second < LastUse)
+      LastUse = DI->second;
+    if (MO.isDef() && DI->second > LastDef)
+      LastDef = DI->second;
+  }
+
+  return !(LastUse > LastDef && LastUse < Dist);
+}
+
+/// isCopyToReg - Return true if the specified MI is a copy instruction or
+/// a extract_subreg instruction. It also returns the source and destination
+/// registers and whether they are physical registers by reference.
+static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
+                        unsigned &SrcReg, unsigned &DstReg,
+                        bool &IsSrcPhys, bool &IsDstPhys) {
+  SrcReg = 0;
+  DstReg = 0;
+  if (MI.isCopy()) {
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+  } else if (MI.isInsertSubreg() || MI.isSubregToReg()) {
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(2).getReg();
+  } else
+    return false;
+
+  IsSrcPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
+  IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+  return true;
+}
+
+/// isPLainlyKilled - Test if the given register value, which is used by the
+// given instruction, is killed by the given instruction.
+static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg,
+                            LiveIntervals *LIS) {
+  if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) &&
+      !LIS->isNotInMIMap(MI)) {
+    // FIXME: Sometimes tryInstructionTransform() will add instructions and
+    // test whether they can be folded before keeping them. In this case it
+    // sets a kill before recursively calling tryInstructionTransform() again.
+    // If there is no interval available, we assume that this instruction is
+    // one of those. A kill flag is manually inserted on the operand so the
+    // check below will handle it.
+    LiveInterval &LI = LIS->getInterval(Reg);
+    // This is to match the kill flag version where undefs don't have kill
+    // flags.
+    if (!LI.hasAtLeastOneValue())
+      return false;
+
+    SlotIndex useIdx = LIS->getInstructionIndex(MI);
+    LiveInterval::const_iterator I = LI.find(useIdx);
+    assert(I != LI.end() && "Reg must be live-in to use.");
+    return !I->end.isBlock() && SlotIndex::isSameInstr(I->end, useIdx);
+  }
+
+  return MI->killsRegister(Reg);
+}
+
+/// isKilled - Test if the given register value, which is used by the given
+/// instruction, is killed by the given instruction. This looks through
+/// coalescable copies to see if the original value is potentially not killed.
+///
+/// For example, in this code:
+///
+///   %reg1034 = copy %reg1024
+///   %reg1035 = copy %reg1025<kill>
+///   %reg1036 = add %reg1034<kill>, %reg1035<kill>
+///
+/// %reg1034 is not considered to be killed, since it is copied from a
+/// register which is not killed. Treating it as not killed lets the
+/// normal heuristics commute the (two-address) add, which lets
+/// coalescing eliminate the extra copy.
+///
+/// If allowFalsePositives is true then likely kills are treated as kills even
+/// if it can't be proven that they are kills.
+static bool isKilled(MachineInstr &MI, unsigned Reg,
+                     const MachineRegisterInfo *MRI,
+                     const TargetInstrInfo *TII,
+                     LiveIntervals *LIS,
+                     bool allowFalsePositives) {
+  MachineInstr *DefMI = &MI;
+  for (;;) {
+    // All uses of physical registers are likely to be kills.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+        (allowFalsePositives || MRI->hasOneUse(Reg)))
+      return true;
+    if (!isPlainlyKilled(DefMI, Reg, LIS))
+      return false;
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      return true;
+    MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg);
+    // If there are multiple defs, we can't do a simple analysis, so just
+    // go with what the kill flag says.
+    if (llvm::next(Begin) != MRI->def_end())
+      return true;
+    DefMI = &*Begin;
+    bool IsSrcPhys, IsDstPhys;
+    unsigned SrcReg,  DstReg;
+    // If the def is something other than a copy, then it isn't going to
+    // be coalesced, so follow the kill flag.
+    if (!isCopyToReg(*DefMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+      return true;
+    Reg = SrcReg;
+  }
+}
+
+/// isTwoAddrUse - Return true if the specified MI uses the specified register
+/// as a two-address use. If so, return the destination register by reference.
+static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned NumOps = MI.isInlineAsm()
+    ? MI.getNumOperands() : MCID.getNumOperands();
+  for (unsigned i = 0; i != NumOps; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
+      continue;
+    unsigned ti;
+    if (MI.isRegTiedToDefOperand(i, &ti)) {
+      DstReg = MI.getOperand(ti).getReg();
+      return true;
+    }
+  }
+  return false;
+}
+
+/// findOnlyInterestingUse - Given a register, if has a single in-basic block
+/// use, return the use instruction if it's a copy or a two-address use.
+static
+MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
+                                     MachineRegisterInfo *MRI,
+                                     const TargetInstrInfo *TII,
+                                     bool &IsCopy,
+                                     unsigned &DstReg, bool &IsDstPhys) {
+  if (!MRI->hasOneNonDBGUse(Reg))
+    // None or more than one use.
+    return 0;
+  MachineInstr &UseMI = *MRI->use_nodbg_begin(Reg);
+  if (UseMI.getParent() != MBB)
+    return 0;
+  unsigned SrcReg;
+  bool IsSrcPhys;
+  if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) {
+    IsCopy = true;
+    return &UseMI;
+  }
+  IsDstPhys = false;
+  if (isTwoAddrUse(UseMI, Reg, DstReg)) {
+    IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+    return &UseMI;
+  }
+  return 0;
+}
+
+/// getMappedReg - Return the physical register the specified virtual register
+/// might be mapped to.
+static unsigned
+getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) {
+  while (TargetRegisterInfo::isVirtualRegister(Reg))  {
+    DenseMap<unsigned, unsigned>::iterator SI = RegMap.find(Reg);
+    if (SI == RegMap.end())
+      return 0;
+    Reg = SI->second;
+  }
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return Reg;
+  return 0;
+}
+
+/// regsAreCompatible - Return true if the two registers are equal or aliased.
+///
+static bool
+regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
+  if (RegA == RegB)
+    return true;
+  if (!RegA || !RegB)
+    return false;
+  return TRI->regsOverlap(RegA, RegB);
+}
+
+
+/// isProfitableToCommute - Return true if it's potentially profitable to commute
+/// the two-address instruction that's being processed.
+bool
+TwoAddressInstructionPass::
+isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
+                      MachineInstr *MI, unsigned Dist) {
+  if (OptLevel == CodeGenOpt::None)
+    return false;
+
+  // Determine if it's profitable to commute this two address instruction. In
+  // general, we want no uses between this instruction and the definition of
+  // the two-address register.
+  // e.g.
+  // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+  // %reg1029<def> = MOV8rr %reg1028
+  // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+  // insert => %reg1030<def> = MOV8rr %reg1028
+  // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+  // In this case, it might not be possible to coalesce the second MOV8rr
+  // instruction if the first one is coalesced. So it would be profitable to
+  // commute it:
+  // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+  // %reg1029<def> = MOV8rr %reg1028
+  // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+  // insert => %reg1030<def> = MOV8rr %reg1029
+  // %reg1030<def> = ADD8rr %reg1029<kill>, %reg1028<kill>, %EFLAGS<imp-def,dead>
+
+  if (!isPlainlyKilled(MI, regC, LIS))
+    return false;
+
+  // Ok, we have something like:
+  // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+  // let's see if it's worth commuting it.
+
+  // Look for situations like this:
+  // %reg1024<def> = MOV r1
+  // %reg1025<def> = MOV r0
+  // %reg1026<def> = ADD %reg1024, %reg1025
+  // r0            = MOV %reg1026
+  // Commute the ADD to hopefully eliminate an otherwise unavoidable copy.
+  unsigned ToRegA = getMappedReg(regA, DstRegMap);
+  if (ToRegA) {
+    unsigned FromRegB = getMappedReg(regB, SrcRegMap);
+    unsigned FromRegC = getMappedReg(regC, SrcRegMap);
+    bool BComp = !FromRegB || regsAreCompatible(FromRegB, ToRegA, TRI);
+    bool CComp = !FromRegC || regsAreCompatible(FromRegC, ToRegA, TRI);
+    if (BComp != CComp)
+      return !BComp && CComp;
+  }
+
+  // If there is a use of regC between its last def (could be livein) and this
+  // instruction, then bail.
+  unsigned LastDefC = 0;
+  if (!noUseAfterLastDef(regC, Dist, LastDefC))
+    return false;
+
+  // If there is a use of regB between its last def (could be livein) and this
+  // instruction, then go ahead and make this transformation.
+  unsigned LastDefB = 0;
+  if (!noUseAfterLastDef(regB, Dist, LastDefB))
+    return true;
+
+  // Since there are no intervening uses for both registers, then commute
+  // if the def of regC is closer. Its live interval is shorter.
+  return LastDefB && LastDefC && LastDefC > LastDefB;
+}
+
+/// commuteInstruction - Commute a two-address instruction and update the basic
+/// block, distance map, and live variables if needed. Return true if it is
+/// successful.
+bool TwoAddressInstructionPass::
+commuteInstruction(MachineBasicBlock::iterator &mi,
+                   unsigned RegB, unsigned RegC, unsigned Dist) {
+  MachineInstr *MI = mi;
+  DEBUG(dbgs() << "2addr: COMMUTING  : " << *MI);
+  MachineInstr *NewMI = TII->commuteInstruction(MI);
+
+  if (NewMI == 0) {
+    DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
+    return false;
+  }
+
+  DEBUG(dbgs() << "2addr: COMMUTED TO: " << *NewMI);
+  assert(NewMI == MI &&
+         "TargetInstrInfo::commuteInstruction() should not return a new "
+         "instruction unless it was requested.");
+
+  // Update source register map.
+  unsigned FromRegC = getMappedReg(RegC, SrcRegMap);
+  if (FromRegC) {
+    unsigned RegA = MI->getOperand(0).getReg();
+    SrcRegMap[RegA] = FromRegC;
+  }
+
+  return true;
+}
+
+/// isProfitableToConv3Addr - Return true if it is profitable to convert the
+/// given 2-address instruction to a 3-address one.
+bool
+TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){
+  // Look for situations like this:
+  // %reg1024<def> = MOV r1
+  // %reg1025<def> = MOV r0
+  // %reg1026<def> = ADD %reg1024, %reg1025
+  // r2            = MOV %reg1026
+  // Turn ADD into a 3-address instruction to avoid a copy.
+  unsigned FromRegB = getMappedReg(RegB, SrcRegMap);
+  if (!FromRegB)
+    return false;
+  unsigned ToRegA = getMappedReg(RegA, DstRegMap);
+  return (ToRegA && !regsAreCompatible(FromRegB, ToRegA, TRI));
+}
+
+/// convertInstTo3Addr - Convert the specified two-address instruction into a
+/// three address one. Return true if this transformation was successful.
+bool
+TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi,
+                                              MachineBasicBlock::iterator &nmi,
+                                              unsigned RegA, unsigned RegB,
+                                              unsigned Dist) {
+  // FIXME: Why does convertToThreeAddress() need an iterator reference?
+  MachineFunction::iterator MFI = MBB;
+  MachineInstr *NewMI = TII->convertToThreeAddress(MFI, mi, LV);
+  assert(MBB == MFI && "convertToThreeAddress changed iterator reference");
+  if (!NewMI)
+    return false;
+
+  DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+  DEBUG(dbgs() << "2addr:         TO 3-ADDR: " << *NewMI);
+  bool Sunk = false;
+
+  if (LIS)
+    LIS->ReplaceMachineInstrInMaps(mi, NewMI);
+
+  if (NewMI->findRegisterUseOperand(RegB, false, TRI))
+    // FIXME: Temporary workaround. If the new instruction doesn't
+    // uses RegB, convertToThreeAddress must have created more
+    // then one instruction.
+    Sunk = sink3AddrInstruction(NewMI, RegB, mi);
+
+  MBB->erase(mi); // Nuke the old inst.
+
+  if (!Sunk) {
+    DistanceMap.insert(std::make_pair(NewMI, Dist));
+    mi = NewMI;
+    nmi = llvm::next(mi);
+  }
+
+  // Update source and destination register maps.
+  SrcRegMap.erase(RegA);
+  DstRegMap.erase(RegB);
+  return true;
+}
+
+/// scanUses - Scan forward recursively for only uses, update maps if the use
+/// is a copy or a two-address instruction.
+void
+TwoAddressInstructionPass::scanUses(unsigned DstReg) {
+  SmallVector<unsigned, 4> VirtRegPairs;
+  bool IsDstPhys;
+  bool IsCopy = false;
+  unsigned NewReg = 0;
+  unsigned Reg = DstReg;
+  while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII,IsCopy,
+                                                      NewReg, IsDstPhys)) {
+    if (IsCopy && !Processed.insert(UseMI))
+      break;
+
+    DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI);
+    if (DI != DistanceMap.end())
+      // Earlier in the same MBB.Reached via a back edge.
+      break;
+
+    if (IsDstPhys) {
+      VirtRegPairs.push_back(NewReg);
+      break;
+    }
+    bool isNew = SrcRegMap.insert(std::make_pair(NewReg, Reg)).second;
+    if (!isNew)
+      assert(SrcRegMap[NewReg] == Reg && "Can't map to two src registers!");
+    VirtRegPairs.push_back(NewReg);
+    Reg = NewReg;
+  }
+
+  if (!VirtRegPairs.empty()) {
+    unsigned ToReg = VirtRegPairs.back();
+    VirtRegPairs.pop_back();
+    while (!VirtRegPairs.empty()) {
+      unsigned FromReg = VirtRegPairs.back();
+      VirtRegPairs.pop_back();
+      bool isNew = DstRegMap.insert(std::make_pair(FromReg, ToReg)).second;
+      if (!isNew)
+        assert(DstRegMap[FromReg] == ToReg &&"Can't map to two dst registers!");
+      ToReg = FromReg;
+    }
+    bool isNew = DstRegMap.insert(std::make_pair(DstReg, ToReg)).second;
+    if (!isNew)
+      assert(DstRegMap[DstReg] == ToReg && "Can't map to two dst registers!");
+  }
+}
+
+/// processCopy - If the specified instruction is not yet processed, process it
+/// if it's a copy. For a copy instruction, we find the physical registers the
+/// source and destination registers might be mapped to. These are kept in
+/// point-to maps used to determine future optimizations. e.g.
+/// v1024 = mov r0
+/// v1025 = mov r1
+/// v1026 = add v1024, v1025
+/// r1    = mov r1026
+/// If 'add' is a two-address instruction, v1024, v1026 are both potentially
+/// coalesced to r0 (from the input side). v1025 is mapped to r1. v1026 is
+/// potentially joined with r1 on the output side. It's worthwhile to commute
+/// 'add' to eliminate a copy.
+void TwoAddressInstructionPass::processCopy(MachineInstr *MI) {
+  if (Processed.count(MI))
+    return;
+
+  bool IsSrcPhys, IsDstPhys;
+  unsigned SrcReg, DstReg;
+  if (!isCopyToReg(*MI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+    return;
+
+  if (IsDstPhys && !IsSrcPhys)
+    DstRegMap.insert(std::make_pair(SrcReg, DstReg));
+  else if (!IsDstPhys && IsSrcPhys) {
+    bool isNew = SrcRegMap.insert(std::make_pair(DstReg, SrcReg)).second;
+    if (!isNew)
+      assert(SrcRegMap[DstReg] == SrcReg &&
+             "Can't map to two src physical registers!");
+
+    scanUses(DstReg);
+  }
+
+  Processed.insert(MI);
+  return;
+}
+
+/// rescheduleMIBelowKill - If there is one more local instruction that reads
+/// 'Reg' and it kills 'Reg, consider moving the instruction below the kill
+/// instruction in order to eliminate the need for the copy.
+bool TwoAddressInstructionPass::
+rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
+                      MachineBasicBlock::iterator &nmi,
+                      unsigned Reg) {
+  // Bail immediately if we don't have LV or LIS available. We use them to find
+  // kills efficiently.
+  if (!LV && !LIS)
+    return false;
+
+  MachineInstr *MI = &*mi;
+  DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+  if (DI == DistanceMap.end())
+    // Must be created from unfolded load. Don't waste time trying this.
+    return false;
+
+  MachineInstr *KillMI = 0;
+  if (LIS) {
+    LiveInterval &LI = LIS->getInterval(Reg);
+    assert(LI.end() != LI.begin() &&
+           "Reg should not have empty live interval.");
+
+    SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot();
+    LiveInterval::const_iterator I = LI.find(MBBEndIdx);
+    if (I != LI.end() && I->start < MBBEndIdx)
+      return false;
+
+    --I;
+    KillMI = LIS->getInstructionFromIndex(I->end);
+  } else {
+    KillMI = LV->getVarInfo(Reg).findKill(MBB);
+  }
+  if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike())
+    // Don't mess with copies, they may be coalesced later.
+    return false;
+
+  if (KillMI->hasUnmodeledSideEffects() || KillMI->isCall() ||
+      KillMI->isBranch() || KillMI->isTerminator())
+    // Don't move pass calls, etc.
+    return false;
+
+  unsigned DstReg;
+  if (isTwoAddrUse(*KillMI, Reg, DstReg))
+    return false;
+
+  bool SeenStore = true;
+  if (!MI->isSafeToMove(TII, AA, SeenStore))
+    return false;
+
+  if (TII->getInstrLatency(InstrItins, MI) > 1)
+    // FIXME: Needs more sophisticated heuristics.
+    return false;
+
+  SmallSet<unsigned, 2> Uses;
+  SmallSet<unsigned, 2> Kills;
+  SmallSet<unsigned, 2> Defs;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned MOReg = MO.getReg();
+    if (!MOReg)
+      continue;
+    if (MO.isDef())
+      Defs.insert(MOReg);
+    else {
+      Uses.insert(MOReg);
+      if (MOReg != Reg && (MO.isKill() ||
+                           (LIS && isPlainlyKilled(MI, MOReg, LIS))))
+        Kills.insert(MOReg);
+    }
+  }
+
+  // Move the copies connected to MI down as well.
+  MachineBasicBlock::iterator Begin = MI;
+  MachineBasicBlock::iterator AfterMI = llvm::next(Begin);
+
+  MachineBasicBlock::iterator End = AfterMI;
+  while (End->isCopy() && Defs.count(End->getOperand(1).getReg())) {
+    Defs.insert(End->getOperand(0).getReg());
+    ++End;
+  }
+
+  // Check if the reschedule will not break depedencies.
+  unsigned NumVisited = 0;
+  MachineBasicBlock::iterator KillPos = KillMI;
+  ++KillPos;
+  for (MachineBasicBlock::iterator I = End; I != KillPos; ++I) {
+    MachineInstr *OtherMI = I;
+    // DBG_VALUE cannot be counted against the limit.
+    if (OtherMI->isDebugValue())
+      continue;
+    if (NumVisited > 10)  // FIXME: Arbitrary limit to reduce compile time cost.
+      return false;
+    ++NumVisited;
+    if (OtherMI->hasUnmodeledSideEffects() || OtherMI->isCall() ||
+        OtherMI->isBranch() || OtherMI->isTerminator())
+      // Don't move pass calls, etc.
+      return false;
+    for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = OtherMI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned MOReg = MO.getReg();
+      if (!MOReg)
+        continue;
+      if (MO.isDef()) {
+        if (Uses.count(MOReg))
+          // Physical register use would be clobbered.
+          return false;
+        if (!MO.isDead() && Defs.count(MOReg))
+          // May clobber a physical register def.
+          // FIXME: This may be too conservative. It's ok if the instruction
+          // is sunken completely below the use.
+          return false;
+      } else {
+        if (Defs.count(MOReg))
+          return false;
+        bool isKill = MO.isKill() ||
+                      (LIS && isPlainlyKilled(OtherMI, MOReg, LIS));
+        if (MOReg != Reg &&
+            ((isKill && Uses.count(MOReg)) || Kills.count(MOReg)))
+          // Don't want to extend other live ranges and update kills.
+          return false;
+        if (MOReg == Reg && !isKill)
+          // We can't schedule across a use of the register in question.
+          return false;
+        // Ensure that if this is register in question, its the kill we expect.
+        assert((MOReg != Reg || OtherMI == KillMI) &&
+               "Found multiple kills of a register in a basic block");
+      }
+    }
+  }
+
+  // Move debug info as well.
+  while (Begin != MBB->begin() && llvm::prior(Begin)->isDebugValue())
+    --Begin;
+
+  nmi = End;
+  MachineBasicBlock::iterator InsertPos = KillPos;
+  if (LIS) {
+    // We have to move the copies first so that the MBB is still well-formed
+    // when calling handleMove().
+    for (MachineBasicBlock::iterator MBBI = AfterMI; MBBI != End;) {
+      MachineInstr *CopyMI = MBBI;
+      ++MBBI;
+      MBB->splice(InsertPos, MBB, CopyMI);
+      LIS->handleMove(CopyMI);
+      InsertPos = CopyMI;
+    }
+    End = llvm::next(MachineBasicBlock::iterator(MI));
+  }
+
+  // Copies following MI may have been moved as well.
+  MBB->splice(InsertPos, MBB, Begin, End);
+  DistanceMap.erase(DI);
+
+  // Update live variables
+  if (LIS) {
+    LIS->handleMove(MI);
+  } else {
+    LV->removeVirtualRegisterKilled(Reg, KillMI);
+    LV->addVirtualRegisterKilled(Reg, MI);
+  }
+
+  DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI);
+  return true;
+}
+
+/// isDefTooClose - Return true if the re-scheduling will put the given
+/// instruction too close to the defs of its register dependencies.
+bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist,
+                                              MachineInstr *MI) {
+  for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(Reg),
+         DE = MRI->def_end(); DI != DE; ++DI) {
+    MachineInstr *DefMI = &*DI;
+    if (DefMI->getParent() != MBB || DefMI->isCopy() || DefMI->isCopyLike())
+      continue;
+    if (DefMI == MI)
+      return true; // MI is defining something KillMI uses
+    DenseMap<MachineInstr*, unsigned>::iterator DDI = DistanceMap.find(DefMI);
+    if (DDI == DistanceMap.end())
+      return true;  // Below MI
+    unsigned DefDist = DDI->second;
+    assert(Dist > DefDist && "Visited def already?");
+    if (TII->getInstrLatency(InstrItins, DefMI) > (Dist - DefDist))
+      return true;
+  }
+  return false;
+}
+
+/// rescheduleKillAboveMI - If there is one more local instruction that reads
+/// 'Reg' and it kills 'Reg, consider moving the kill instruction above the
+/// current two-address instruction in order to eliminate the need for the
+/// copy.
+bool TwoAddressInstructionPass::
+rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
+                      MachineBasicBlock::iterator &nmi,
+                      unsigned Reg) {
+  // Bail immediately if we don't have LV or LIS available. We use them to find
+  // kills efficiently.
+  if (!LV && !LIS)
+    return false;
+
+  MachineInstr *MI = &*mi;
+  DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+  if (DI == DistanceMap.end())
+    // Must be created from unfolded load. Don't waste time trying this.
+    return false;
+
+  MachineInstr *KillMI = 0;
+  if (LIS) {
+    LiveInterval &LI = LIS->getInterval(Reg);
+    assert(LI.end() != LI.begin() &&
+           "Reg should not have empty live interval.");
+
+    SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot();
+    LiveInterval::const_iterator I = LI.find(MBBEndIdx);
+    if (I != LI.end() && I->start < MBBEndIdx)
+      return false;
+
+    --I;
+    KillMI = LIS->getInstructionFromIndex(I->end);
+  } else {
+    KillMI = LV->getVarInfo(Reg).findKill(MBB);
+  }
+  if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike())
+    // Don't mess with copies, they may be coalesced later.
+    return false;
+
+  unsigned DstReg;
+  if (isTwoAddrUse(*KillMI, Reg, DstReg))
+    return false;
+
+  bool SeenStore = true;
+  if (!KillMI->isSafeToMove(TII, AA, SeenStore))
+    return false;
+
+  SmallSet<unsigned, 2> Uses;
+  SmallSet<unsigned, 2> Kills;
+  SmallSet<unsigned, 2> Defs;
+  SmallSet<unsigned, 2> LiveDefs;
+  for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = KillMI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned MOReg = MO.getReg();
+    if (MO.isUse()) {
+      if (!MOReg)
+        continue;
+      if (isDefTooClose(MOReg, DI->second, MI))
+        return false;
+      bool isKill = MO.isKill() || (LIS && isPlainlyKilled(KillMI, MOReg, LIS));
+      if (MOReg == Reg && !isKill)
+        return false;
+      Uses.insert(MOReg);
+      if (isKill && MOReg != Reg)
+        Kills.insert(MOReg);
+    } else if (TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+      Defs.insert(MOReg);
+      if (!MO.isDead())
+        LiveDefs.insert(MOReg);
+    }
+  }
+
+  // Check if the reschedule will not break depedencies.
+  unsigned NumVisited = 0;
+  MachineBasicBlock::iterator KillPos = KillMI;
+  for (MachineBasicBlock::iterator I = mi; I != KillPos; ++I) {
+    MachineInstr *OtherMI = I;
+    // DBG_VALUE cannot be counted against the limit.
+    if (OtherMI->isDebugValue())
+      continue;
+    if (NumVisited > 10)  // FIXME: Arbitrary limit to reduce compile time cost.
+      return false;
+    ++NumVisited;
+    if (OtherMI->hasUnmodeledSideEffects() || OtherMI->isCall() ||
+        OtherMI->isBranch() || OtherMI->isTerminator())
+      // Don't move pass calls, etc.
+      return false;
+    SmallVector<unsigned, 2> OtherDefs;
+    for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = OtherMI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned MOReg = MO.getReg();
+      if (!MOReg)
+        continue;
+      if (MO.isUse()) {
+        if (Defs.count(MOReg))
+          // Moving KillMI can clobber the physical register if the def has
+          // not been seen.
+          return false;
+        if (Kills.count(MOReg))
+          // Don't want to extend other live ranges and update kills.
+          return false;
+        if (OtherMI != MI && MOReg == Reg &&
+            !(MO.isKill() || (LIS && isPlainlyKilled(OtherMI, MOReg, LIS))))
+          // We can't schedule across a use of the register in question.
+          return false;
+      } else {
+        OtherDefs.push_back(MOReg);
+      }
+    }
+
+    for (unsigned i = 0, e = OtherDefs.size(); i != e; ++i) {
+      unsigned MOReg = OtherDefs[i];
+      if (Uses.count(MOReg))
+        return false;
+      if (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+          LiveDefs.count(MOReg))
+        return false;
+      // Physical register def is seen.
+      Defs.erase(MOReg);
+    }
+  }
+
+  // Move the old kill above MI, don't forget to move debug info as well.
+  MachineBasicBlock::iterator InsertPos = mi;
+  while (InsertPos != MBB->begin() && llvm::prior(InsertPos)->isDebugValue())
+    --InsertPos;
+  MachineBasicBlock::iterator From = KillMI;
+  MachineBasicBlock::iterator To = llvm::next(From);
+  while (llvm::prior(From)->isDebugValue())
+    --From;
+  MBB->splice(InsertPos, MBB, From, To);
+
+  nmi = llvm::prior(InsertPos); // Backtrack so we process the moved instr.
+  DistanceMap.erase(DI);
+
+  // Update live variables
+  if (LIS) {
+    LIS->handleMove(KillMI);
+  } else {
+    LV->removeVirtualRegisterKilled(Reg, KillMI);
+    LV->addVirtualRegisterKilled(Reg, MI);
+  }
+
+  DEBUG(dbgs() << "\trescheduled kill: " << *KillMI);
+  return true;
+}
+
+/// tryInstructionTransform - For the case where an instruction has a single
+/// pair of tied register operands, attempt some transformations that may
+/// either eliminate the tied operands or improve the opportunities for
+/// coalescing away the register copy.  Returns true if no copy needs to be
+/// inserted to untie mi's operands (either because they were untied, or
+/// because mi was rescheduled, and will be visited again later). If the
+/// shouldOnlyCommute flag is true, only instruction commutation is attempted.
+bool TwoAddressInstructionPass::
+tryInstructionTransform(MachineBasicBlock::iterator &mi,
+                        MachineBasicBlock::iterator &nmi,
+                        unsigned SrcIdx, unsigned DstIdx,
+                        unsigned Dist, bool shouldOnlyCommute) {
+  if (OptLevel == CodeGenOpt::None)
+    return false;
+
+  MachineInstr &MI = *mi;
+  unsigned regA = MI.getOperand(DstIdx).getReg();
+  unsigned regB = MI.getOperand(SrcIdx).getReg();
+
+  assert(TargetRegisterInfo::isVirtualRegister(regB) &&
+         "cannot make instruction into two-address form");
+  bool regBKilled = isKilled(MI, regB, MRI, TII, LIS, true);
+
+  if (TargetRegisterInfo::isVirtualRegister(regA))
+    scanUses(regA);
+
+  // Check if it is profitable to commute the operands.
+  unsigned SrcOp1, SrcOp2;
+  unsigned regC = 0;
+  unsigned regCIdx = ~0U;
+  bool TryCommute = false;
+  bool AggressiveCommute = false;
+  if (MI.isCommutable() && MI.getNumOperands() >= 3 &&
+      TII->findCommutedOpIndices(&MI, SrcOp1, SrcOp2)) {
+    if (SrcIdx == SrcOp1)
+      regCIdx = SrcOp2;
+    else if (SrcIdx == SrcOp2)
+      regCIdx = SrcOp1;
+
+    if (regCIdx != ~0U) {
+      regC = MI.getOperand(regCIdx).getReg();
+      if (!regBKilled && isKilled(MI, regC, MRI, TII, LIS, false))
+        // If C dies but B does not, swap the B and C operands.
+        // This makes the live ranges of A and C joinable.
+        TryCommute = true;
+      else if (isProfitableToCommute(regA, regB, regC, &MI, Dist)) {
+        TryCommute = true;
+        AggressiveCommute = true;
+      }
+    }
+  }
+
+  // If it's profitable to commute, try to do so.
+  if (TryCommute && commuteInstruction(mi, regB, regC, Dist)) {
+    ++NumCommuted;
+    if (AggressiveCommute)
+      ++NumAggrCommuted;
+    return false;
+  }
+
+  if (shouldOnlyCommute)
+    return false;
+
+  // If there is one more use of regB later in the same MBB, consider
+  // re-schedule this MI below it.
+  if (rescheduleMIBelowKill(mi, nmi, regB)) {
+    ++NumReSchedDowns;
+    return true;
+  }
+
+  if (MI.isConvertibleTo3Addr()) {
+    // This instruction is potentially convertible to a true
+    // three-address instruction.  Check if it is profitable.
+    if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
+      // Try to convert it.
+      if (convertInstTo3Addr(mi, nmi, regA, regB, Dist)) {
+        ++NumConvertedTo3Addr;
+        return true; // Done with this instruction.
+      }
+    }
+  }
+
+  // If there is one more use of regB later in the same MBB, consider
+  // re-schedule it before this MI if it's legal.
+  if (rescheduleKillAboveMI(mi, nmi, regB)) {
+    ++NumReSchedUps;
+    return true;
+  }
+
+  // If this is an instruction with a load folded into it, try unfolding
+  // the load, e.g. avoid this:
+  //   movq %rdx, %rcx
+  //   addq (%rax), %rcx
+  // in favor of this:
+  //   movq (%rax), %rcx
+  //   addq %rdx, %rcx
+  // because it's preferable to schedule a load than a register copy.
+  if (MI.mayLoad() && !regBKilled) {
+    // Determine if a load can be unfolded.
+    unsigned LoadRegIndex;
+    unsigned NewOpc =
+      TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(),
+                                      /*UnfoldLoad=*/true,
+                                      /*UnfoldStore=*/false,
+                                      &LoadRegIndex);
+    if (NewOpc != 0) {
+      const MCInstrDesc &UnfoldMCID = TII->get(NewOpc);
+      if (UnfoldMCID.getNumDefs() == 1) {
+        // Unfold the load.
+        DEBUG(dbgs() << "2addr:   UNFOLDING: " << MI);
+        const TargetRegisterClass *RC =
+          TRI->getAllocatableClass(
+            TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF));
+        unsigned Reg = MRI->createVirtualRegister(RC);
+        SmallVector<MachineInstr *, 2> NewMIs;
+        if (!TII->unfoldMemoryOperand(*MF, &MI, Reg,
+                                      /*UnfoldLoad=*/true,/*UnfoldStore=*/false,
+                                      NewMIs)) {
+          DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+          return false;
+        }
+        assert(NewMIs.size() == 2 &&
+               "Unfolded a load into multiple instructions!");
+        // The load was previously folded, so this is the only use.
+        NewMIs[1]->addRegisterKilled(Reg, TRI);
+
+        // Tentatively insert the instructions into the block so that they
+        // look "normal" to the transformation logic.
+        MBB->insert(mi, NewMIs[0]);
+        MBB->insert(mi, NewMIs[1]);
+
+        DEBUG(dbgs() << "2addr:    NEW LOAD: " << *NewMIs[0]
+                     << "2addr:    NEW INST: " << *NewMIs[1]);
+
+        // Transform the instruction, now that it no longer has a load.
+        unsigned NewDstIdx = NewMIs[1]->findRegisterDefOperandIdx(regA);
+        unsigned NewSrcIdx = NewMIs[1]->findRegisterUseOperandIdx(regB);
+        MachineBasicBlock::iterator NewMI = NewMIs[1];
+        bool TransformResult =
+          tryInstructionTransform(NewMI, mi, NewSrcIdx, NewDstIdx, Dist, true);
+        (void)TransformResult;
+        assert(!TransformResult &&
+               "tryInstructionTransform() should return false.");
+        if (NewMIs[1]->getOperand(NewSrcIdx).isKill()) {
+          // Success, or at least we made an improvement. Keep the unfolded
+          // instructions and discard the original.
+          if (LV) {
+            for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+              MachineOperand &MO = MI.getOperand(i);
+              if (MO.isReg() &&
+                  TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+                if (MO.isUse()) {
+                  if (MO.isKill()) {
+                    if (NewMIs[0]->killsRegister(MO.getReg()))
+                      LV->replaceKillInstruction(MO.getReg(), &MI, NewMIs[0]);
+                    else {
+                      assert(NewMIs[1]->killsRegister(MO.getReg()) &&
+                             "Kill missing after load unfold!");
+                      LV->replaceKillInstruction(MO.getReg(), &MI, NewMIs[1]);
+                    }
+                  }
+                } else if (LV->removeVirtualRegisterDead(MO.getReg(), &MI)) {
+                  if (NewMIs[1]->registerDefIsDead(MO.getReg()))
+                    LV->addVirtualRegisterDead(MO.getReg(), NewMIs[1]);
+                  else {
+                    assert(NewMIs[0]->registerDefIsDead(MO.getReg()) &&
+                           "Dead flag missing after load unfold!");
+                    LV->addVirtualRegisterDead(MO.getReg(), NewMIs[0]);
+                  }
+                }
+              }
+            }
+            LV->addVirtualRegisterKilled(Reg, NewMIs[1]);
+          }
+
+          SmallVector<unsigned, 4> OrigRegs;
+          if (LIS) {
+            for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(),
+                 MOE = MI.operands_end(); MOI != MOE; ++MOI) {
+              if (MOI->isReg())
+                OrigRegs.push_back(MOI->getReg());
+            }
+          }
+
+          MI.eraseFromParent();
+
+          // Update LiveIntervals.
+          if (LIS) {
+            MachineBasicBlock::iterator Begin(NewMIs[0]);
+            MachineBasicBlock::iterator End(NewMIs[1]);
+            LIS->repairIntervalsInRange(MBB, Begin, End, OrigRegs);
+          }
+
+          mi = NewMIs[1];
+        } else {
+          // Transforming didn't eliminate the tie and didn't lead to an
+          // improvement. Clean up the unfolded instructions and keep the
+          // original.
+          DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+          NewMIs[0]->eraseFromParent();
+          NewMIs[1]->eraseFromParent();
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+// Collect tied operands of MI that need to be handled.
+// Rewrite trivial cases immediately.
+// Return true if any tied operands where found, including the trivial ones.
+bool TwoAddressInstructionPass::
+collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
+  const MCInstrDesc &MCID = MI->getDesc();
+  bool AnyOps = false;
+  unsigned NumOps = MI->getNumOperands();
+
+  for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) {
+    unsigned DstIdx = 0;
+    if (!MI->isRegTiedToDefOperand(SrcIdx, &DstIdx))
+      continue;
+    AnyOps = true;
+    MachineOperand &SrcMO = MI->getOperand(SrcIdx);
+    MachineOperand &DstMO = MI->getOperand(DstIdx);
+    unsigned SrcReg = SrcMO.getReg();
+    unsigned DstReg = DstMO.getReg();
+    // Tied constraint already satisfied?
+    if (SrcReg == DstReg)
+      continue;
+
+    assert(SrcReg && SrcMO.isUse() && "two address instruction invalid");
+
+    // Deal with <undef> uses immediately - simply rewrite the src operand.
+    if (SrcMO.isUndef()) {
+      // Constrain the DstReg register class if required.
+      if (TargetRegisterInfo::isVirtualRegister(DstReg))
+        if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx,
+                                                             TRI, *MF))
+          MRI->constrainRegClass(DstReg, RC);
+      SrcMO.setReg(DstReg);
+      DEBUG(dbgs() << "\t\trewrite undef:\t" << *MI);
+      continue;
+    }
+    TiedOperands[SrcReg].push_back(std::make_pair(SrcIdx, DstIdx));
+  }
+  return AnyOps;
+}
+
+// Process a list of tied MI operands that all use the same source register.
+// The tied pairs are of the form (SrcIdx, DstIdx).
+void
+TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
+                                            TiedPairList &TiedPairs,
+                                            unsigned &Dist) {
+  bool IsEarlyClobber = false;
+  for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
+    const MachineOperand &DstMO = MI->getOperand(TiedPairs[tpi].second);
+    IsEarlyClobber |= DstMO.isEarlyClobber();
+  }
+
+  bool RemovedKillFlag = false;
+  bool AllUsesCopied = true;
+  unsigned LastCopiedReg = 0;
+  SlotIndex LastCopyIdx;
+  unsigned RegB = 0;
+  for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
+    unsigned SrcIdx = TiedPairs[tpi].first;
+    unsigned DstIdx = TiedPairs[tpi].second;
+
+    const MachineOperand &DstMO = MI->getOperand(DstIdx);
+    unsigned RegA = DstMO.getReg();
+
+    // Grab RegB from the instruction because it may have changed if the
+    // instruction was commuted.
+    RegB = MI->getOperand(SrcIdx).getReg();
+
+    if (RegA == RegB) {
+      // The register is tied to multiple destinations (or else we would
+      // not have continued this far), but this use of the register
+      // already matches the tied destination.  Leave it.
+      AllUsesCopied = false;
+      continue;
+    }
+    LastCopiedReg = RegA;
+
+    assert(TargetRegisterInfo::isVirtualRegister(RegB) &&
+           "cannot make instruction into two-address form");
+
+#ifndef NDEBUG
+    // First, verify that we don't have a use of "a" in the instruction
+    // (a = b + a for example) because our transformation will not
+    // work. This should never occur because we are in SSA form.
+    for (unsigned i = 0; i != MI->getNumOperands(); ++i)
+      assert(i == DstIdx ||
+             !MI->getOperand(i).isReg() ||
+             MI->getOperand(i).getReg() != RegA);
+#endif
+
+    // Emit a copy.
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+            TII->get(TargetOpcode::COPY), RegA).addReg(RegB);
+
+    // Update DistanceMap.
+    MachineBasicBlock::iterator PrevMI = MI;
+    --PrevMI;
+    DistanceMap.insert(std::make_pair(PrevMI, Dist));
+    DistanceMap[MI] = ++Dist;
+
+    if (LIS) {
+      LastCopyIdx = LIS->InsertMachineInstrInMaps(PrevMI).getRegSlot();
+
+      if (TargetRegisterInfo::isVirtualRegister(RegA)) {
+        LiveInterval &LI = LIS->getInterval(RegA);
+        VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
+        SlotIndex endIdx =
+          LIS->getInstructionIndex(MI).getRegSlot(IsEarlyClobber);
+        LI.addRange(LiveRange(LastCopyIdx, endIdx, VNI));
+      }
+    }
+
+    DEBUG(dbgs() << "\t\tprepend:\t" << *PrevMI);
+
+    MachineOperand &MO = MI->getOperand(SrcIdx);
+    assert(MO.isReg() && MO.getReg() == RegB && MO.isUse() &&
+           "inconsistent operand info for 2-reg pass");
+    if (MO.isKill()) {
+      MO.setIsKill(false);
+      RemovedKillFlag = true;
+    }
+
+    // Make sure regA is a legal regclass for the SrcIdx operand.
+    if (TargetRegisterInfo::isVirtualRegister(RegA) &&
+        TargetRegisterInfo::isVirtualRegister(RegB))
+      MRI->constrainRegClass(RegA, MRI->getRegClass(RegB));
+
+    MO.setReg(RegA);
+
+    // Propagate SrcRegMap.
+    SrcRegMap[RegA] = RegB;
+  }
+
+
+  if (AllUsesCopied) {
+    if (!IsEarlyClobber) {
+      // Replace other (un-tied) uses of regB with LastCopiedReg.
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        MachineOperand &MO = MI->getOperand(i);
+        if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
+          if (MO.isKill()) {
+            MO.setIsKill(false);
+            RemovedKillFlag = true;
+          }
+          MO.setReg(LastCopiedReg);
+        }
+      }
+    }
+
+    // Update live variables for regB.
+    if (RemovedKillFlag && LV && LV->getVarInfo(RegB).removeKill(MI)) {
+      MachineBasicBlock::iterator PrevMI = MI;
+      --PrevMI;
+      LV->addVirtualRegisterKilled(RegB, PrevMI);
+    }
+
+    // Update LiveIntervals.
+    if (LIS) {
+      LiveInterval &LI = LIS->getInterval(RegB);
+      SlotIndex MIIdx = LIS->getInstructionIndex(MI);
+      LiveInterval::const_iterator I = LI.find(MIIdx);
+      assert(I != LI.end() && "RegB must be live-in to use.");
+
+      SlotIndex UseIdx = MIIdx.getRegSlot(IsEarlyClobber);
+      if (I->end == UseIdx)
+        LI.removeRange(LastCopyIdx, UseIdx);
+    }
+
+  } else if (RemovedKillFlag) {
+    // Some tied uses of regB matched their destination registers, so
+    // regB is still used in this instruction, but a kill flag was
+    // removed from a different tied use of regB, so now we need to add
+    // a kill flag to one of the remaining uses of regB.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
+        MO.setIsKill(true);
+        break;
+      }
+    }
+  }
+}
+
+/// runOnMachineFunction - Reduce two-address instructions to two operands.
+///
+bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
+  MF = &Func;
+  const TargetMachine &TM = MF->getTarget();
+  MRI = &MF->getRegInfo();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  InstrItins = TM.getInstrItineraryData();
+  LV = getAnalysisIfAvailable<LiveVariables>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+  AA = &getAnalysis<AliasAnalysis>();
+  OptLevel = TM.getOptLevel();
+
+  bool MadeChange = false;
+
+  DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
+  DEBUG(dbgs() << "********** Function: "
+        << MF->getName() << '\n');
+
+  // This pass takes the function out of SSA form.
+  MRI->leaveSSA();
+
+  TiedOperandMap TiedOperands;
+  for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+       MBBI != MBBE; ++MBBI) {
+    MBB = MBBI;
+    unsigned Dist = 0;
+    DistanceMap.clear();
+    SrcRegMap.clear();
+    DstRegMap.clear();
+    Processed.clear();
+    for (MachineBasicBlock::iterator mi = MBB->begin(), me = MBB->end();
+         mi != me; ) {
+      MachineBasicBlock::iterator nmi = llvm::next(mi);
+      if (mi->isDebugValue()) {
+        mi = nmi;
+        continue;
+      }
+
+      // Expand REG_SEQUENCE instructions. This will position mi at the first
+      // expanded instruction.
+      if (mi->isRegSequence())
+        eliminateRegSequence(mi);
+
+      DistanceMap.insert(std::make_pair(mi, ++Dist));
+
+      processCopy(&*mi);
+
+      // First scan through all the tied register uses in this instruction
+      // and record a list of pairs of tied operands for each register.
+      if (!collectTiedOperands(mi, TiedOperands)) {
+        mi = nmi;
+        continue;
+      }
+
+      ++NumTwoAddressInstrs;
+      MadeChange = true;
+      DEBUG(dbgs() << '\t' << *mi);
+
+      // If the instruction has a single pair of tied operands, try some
+      // transformations that may either eliminate the tied operands or
+      // improve the opportunities for coalescing away the register copy.
+      if (TiedOperands.size() == 1) {
+        SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs
+          = TiedOperands.begin()->second;
+        if (TiedPairs.size() == 1) {
+          unsigned SrcIdx = TiedPairs[0].first;
+          unsigned DstIdx = TiedPairs[0].second;
+          unsigned SrcReg = mi->getOperand(SrcIdx).getReg();
+          unsigned DstReg = mi->getOperand(DstIdx).getReg();
+          if (SrcReg != DstReg &&
+              tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist, false)) {
+            // The tied operands have been eliminated or shifted further down the
+            // block to ease elimination. Continue processing with 'nmi'.
+            TiedOperands.clear();
+            mi = nmi;
+            continue;
+          }
+        }
+      }
+
+      // Now iterate over the information collected above.
+      for (TiedOperandMap::iterator OI = TiedOperands.begin(),
+             OE = TiedOperands.end(); OI != OE; ++OI) {
+        processTiedPairs(mi, OI->second, Dist);
+        DEBUG(dbgs() << "\t\trewrite to:\t" << *mi);
+      }
+
+      // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form.
+      if (mi->isInsertSubreg()) {
+        // From %reg = INSERT_SUBREG %reg, %subreg, subidx
+        // To   %reg:subidx = COPY %subreg
+        unsigned SubIdx = mi->getOperand(3).getImm();
+        mi->RemoveOperand(3);
+        assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx");
+        mi->getOperand(0).setSubReg(SubIdx);
+        mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef());
+        mi->RemoveOperand(1);
+        mi->setDesc(TII->get(TargetOpcode::COPY));
+        DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
+      }
+
+      // Clear TiedOperands here instead of at the top of the loop
+      // since most instructions do not have tied operands.
+      TiedOperands.clear();
+      mi = nmi;
+    }
+  }
+
+  if (LIS)
+    MF->verify(this, "After two-address instruction pass");
+
+  return MadeChange;
+}
+
+/// Eliminate a REG_SEQUENCE instruction as part of the de-ssa process.
+///
+/// The instruction is turned into a sequence of sub-register copies:
+///
+///   %dst = REG_SEQUENCE %v1, ssub0, %v2, ssub1
+///
+/// Becomes:
+///
+///   %dst:ssub0<def,undef> = COPY %v1
+///   %dst:ssub1<def> = COPY %v2
+///
+void TwoAddressInstructionPass::
+eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = MBBI;
+  unsigned DstReg = MI->getOperand(0).getReg();
+  if (MI->getOperand(0).getSubReg() ||
+      TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+      !(MI->getNumOperands() & 1)) {
+    DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << *MI);
+    llvm_unreachable(0);
+  }
+
+  SmallVector<unsigned, 4> OrigRegs;
+  if (LIS) {
+    OrigRegs.push_back(MI->getOperand(0).getReg());
+    for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2)
+      OrigRegs.push_back(MI->getOperand(i).getReg());
+  }
+
+  bool DefEmitted = false;
+  for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) {
+    MachineOperand &UseMO = MI->getOperand(i);
+    unsigned SrcReg = UseMO.getReg();
+    unsigned SubIdx = MI->getOperand(i+1).getImm();
+    // Nothing needs to be inserted for <undef> operands.
+    if (UseMO.isUndef())
+      continue;
+
+    // Defer any kill flag to the last operand using SrcReg. Otherwise, we
+    // might insert a COPY that uses SrcReg after is was killed.
+    bool isKill = UseMO.isKill();
+    if (isKill)
+      for (unsigned j = i + 2; j < e; j += 2)
+        if (MI->getOperand(j).getReg() == SrcReg) {
+          MI->getOperand(j).setIsKill();
+          UseMO.setIsKill(false);
+          isKill = false;
+          break;
+        }
+
+    // Insert the sub-register copy.
+    MachineInstr *CopyMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY))
+      .addReg(DstReg, RegState::Define, SubIdx)
+      .addOperand(UseMO);
+
+    // The first def needs an <undef> flag because there is no live register
+    // before it.
+    if (!DefEmitted) {
+      CopyMI->getOperand(0).setIsUndef(true);
+      // Return an iterator pointing to the first inserted instr.
+      MBBI = CopyMI;
+    }
+    DefEmitted = true;
+
+    // Update LiveVariables' kill info.
+    if (LV && isKill && !TargetRegisterInfo::isPhysicalRegister(SrcReg))
+      LV->replaceKillInstruction(SrcReg, MI, CopyMI);
+
+    DEBUG(dbgs() << "Inserted: " << *CopyMI);
+  }
+
+  MachineBasicBlock::iterator EndMBBI =
+      llvm::next(MachineBasicBlock::iterator(MI));
+
+  if (!DefEmitted) {
+    DEBUG(dbgs() << "Turned: " << *MI << " into an IMPLICIT_DEF");
+    MI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+    for (int j = MI->getNumOperands() - 1, ee = 0; j > ee; --j)
+      MI->RemoveOperand(j);
+  } else {
+    DEBUG(dbgs() << "Eliminated: " << *MI);
+    MI->eraseFromParent();
+  }
+
+  // Udpate LiveIntervals.
+  if (LIS)
+    LIS->repairIntervalsInRange(MBB, MBBI, EndMBBI, OrigRegs);
+}
diff --git a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp
new file mode 100644
index 000000000000..a95ebcd16da8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp
@@ -0,0 +1,215 @@
+//===-- UnreachableBlockElim.cpp - Remove unreachable blocks for codegen --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is an extremely simple version of the SimplifyCFG pass.  Its sole
+// job is to delete LLVM basic blocks that are not reachable from the entry
+// node.  To do this, it performs a simple depth first traversal of the CFG,
+// then deletes any unvisited nodes.
+//
+// Note that this pass is really a hack.  In particular, the instruction
+// selectors for various targets should just not generate code for unreachable
+// blocks.  Until LLVM has a more systematic way of defining instruction
+// selectors, however, we cannot really expect them to handle additional
+// complexity.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+namespace {
+  class UnreachableBlockElim : public FunctionPass {
+    virtual bool runOnFunction(Function &F);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    UnreachableBlockElim() : FunctionPass(ID) {
+      initializeUnreachableBlockElimPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<ProfileInfo>();
+    }
+  };
+}
+char UnreachableBlockElim::ID = 0;
+INITIALIZE_PASS(UnreachableBlockElim, "unreachableblockelim",
+                "Remove unreachable blocks from the CFG", false, false)
+
+FunctionPass *llvm::createUnreachableBlockEliminationPass() {
+  return new UnreachableBlockElim();
+}
+
+bool UnreachableBlockElim::runOnFunction(Function &F) {
+  SmallPtrSet<BasicBlock*, 8> Reachable;
+
+  // Mark all reachable blocks.
+  for (df_ext_iterator<Function*, SmallPtrSet<BasicBlock*, 8> > I =
+       df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable); I != E; ++I)
+    /* Mark all reachable blocks */;
+
+  // Loop over all dead blocks, remembering them and deleting all instructions
+  // in them.
+  std::vector<BasicBlock*> DeadBlocks;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+    if (!Reachable.count(I)) {
+      BasicBlock *BB = I;
+      DeadBlocks.push_back(BB);
+      while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+        PN->replaceAllUsesWith(Constant::getNullValue(PN->getType()));
+        BB->getInstList().pop_front();
+      }
+      for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+        (*SI)->removePredecessor(BB);
+      BB->dropAllReferences();
+    }
+
+  // Actually remove the blocks now.
+  ProfileInfo *PI = getAnalysisIfAvailable<ProfileInfo>();
+  for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) {
+    if (PI) PI->removeBlock(DeadBlocks[i]);
+    DeadBlocks[i]->eraseFromParent();
+  }
+
+  return DeadBlocks.size();
+}
+
+
+namespace {
+  class UnreachableMachineBlockElim : public MachineFunctionPass {
+    virtual bool runOnMachineFunction(MachineFunction &F);
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    MachineModuleInfo *MMI;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    UnreachableMachineBlockElim() : MachineFunctionPass(ID) {}
+  };
+}
+char UnreachableMachineBlockElim::ID = 0;
+
+INITIALIZE_PASS(UnreachableMachineBlockElim, "unreachable-mbb-elimination",
+  "Remove unreachable machine basic blocks", false, false)
+
+char &llvm::UnreachableMachineBlockElimID = UnreachableMachineBlockElim::ID;
+
+void UnreachableMachineBlockElim::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addPreserved<MachineDominatorTree>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
+  SmallPtrSet<MachineBasicBlock*, 8> Reachable;
+  bool ModifiedPHI = false;
+
+  MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+  MachineDominatorTree *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+  MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+
+  // Mark all reachable blocks.
+  for (df_ext_iterator<MachineFunction*, SmallPtrSet<MachineBasicBlock*, 8> >
+       I = df_ext_begin(&F, Reachable), E = df_ext_end(&F, Reachable);
+       I != E; ++I)
+    /* Mark all reachable blocks */;
+
+  // Loop over all dead blocks, remembering them and deleting all instructions
+  // in them.
+  std::vector<MachineBasicBlock*> DeadBlocks;
+  for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    MachineBasicBlock *BB = I;
+
+    // Test for deadness.
+    if (!Reachable.count(BB)) {
+      DeadBlocks.push_back(BB);
+
+      // Update dominator and loop info.
+      if (MLI) MLI->removeBlock(BB);
+      if (MDT && MDT->getNode(BB)) MDT->eraseNode(BB);
+
+      while (BB->succ_begin() != BB->succ_end()) {
+        MachineBasicBlock* succ = *BB->succ_begin();
+
+        MachineBasicBlock::iterator start = succ->begin();
+        while (start != succ->end() && start->isPHI()) {
+          for (unsigned i = start->getNumOperands() - 1; i >= 2; i-=2)
+            if (start->getOperand(i).isMBB() &&
+                start->getOperand(i).getMBB() == BB) {
+              start->RemoveOperand(i);
+              start->RemoveOperand(i-1);
+            }
+
+          start++;
+        }
+
+        BB->removeSuccessor(BB->succ_begin());
+      }
+    }
+  }
+
+  // Actually remove the blocks now.
+  for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i)
+    DeadBlocks[i]->eraseFromParent();
+
+  // Cleanup PHI nodes.
+  for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    MachineBasicBlock *BB = I;
+    // Prune unneeded PHI entries.
+    SmallPtrSet<MachineBasicBlock*, 8> preds(BB->pred_begin(),
+                                             BB->pred_end());
+    MachineBasicBlock::iterator phi = BB->begin();
+    while (phi != BB->end() && phi->isPHI()) {
+      for (unsigned i = phi->getNumOperands() - 1; i >= 2; i-=2)
+        if (!preds.count(phi->getOperand(i).getMBB())) {
+          phi->RemoveOperand(i);
+          phi->RemoveOperand(i-1);
+          ModifiedPHI = true;
+        }
+
+      if (phi->getNumOperands() == 3) {
+        unsigned Input = phi->getOperand(1).getReg();
+        unsigned Output = phi->getOperand(0).getReg();
+
+        MachineInstr* temp = phi;
+        ++phi;
+        temp->eraseFromParent();
+        ModifiedPHI = true;
+
+        if (Input != Output) {
+          MachineRegisterInfo &MRI = F.getRegInfo();
+          MRI.constrainRegClass(Input, MRI.getRegClass(Output));
+          MRI.replaceRegWith(Output, Input);
+        }
+
+        continue;
+      }
+
+      ++phi;
+    }
+  }
+
+  F.RenumberBlocks();
+
+  return (DeadBlocks.size() || ModifiedPHI);
+}
diff --git a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
new file mode 100644
index 000000000000..cd012d297489
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -0,0 +1,359 @@
+//===-- llvm/CodeGen/VirtRegMap.cpp - Virtual Register Map ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VirtRegMap class.
+//
+// It also contains implementations of the Spiller interface, which, given a
+// virtual register map and a machine function, eliminates all virtual
+// references by replacing them with physical register references - adding spill
+// code as necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "LiveDebugVariables.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumSpillSlots, "Number of spill slots allocated");
+STATISTIC(NumIdCopies,   "Number of identity moves eliminated after rewriting");
+
+//===----------------------------------------------------------------------===//
+//  VirtRegMap implementation
+//===----------------------------------------------------------------------===//
+
+char VirtRegMap::ID = 0;
+
+INITIALIZE_PASS(VirtRegMap, "virtregmap", "Virtual Register Map", false, false)
+
+bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
+  MRI = &mf.getRegInfo();
+  TII = mf.getTarget().getInstrInfo();
+  TRI = mf.getTarget().getRegisterInfo();
+  MF = &mf;
+
+  Virt2PhysMap.clear();
+  Virt2StackSlotMap.clear();
+  Virt2SplitMap.clear();
+
+  grow();
+  return false;
+}
+
+void VirtRegMap::grow() {
+  unsigned NumRegs = MF->getRegInfo().getNumVirtRegs();
+  Virt2PhysMap.resize(NumRegs);
+  Virt2StackSlotMap.resize(NumRegs);
+  Virt2SplitMap.resize(NumRegs);
+}
+
+unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
+  int SS = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(),
+                                                      RC->getAlignment());
+  ++NumSpillSlots;
+  return SS;
+}
+
+bool VirtRegMap::hasPreferredPhys(unsigned VirtReg) {
+  unsigned Hint = MRI->getSimpleHint(VirtReg);
+  if (!Hint)
+    return 0;
+  if (TargetRegisterInfo::isVirtualRegister(Hint))
+    Hint = getPhys(Hint);
+  return getPhys(VirtReg) == Hint;
+}
+
+bool VirtRegMap::hasKnownPreference(unsigned VirtReg) {
+  std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(VirtReg);
+  if (TargetRegisterInfo::isPhysicalRegister(Hint.second))
+    return true;
+  if (TargetRegisterInfo::isVirtualRegister(Hint.second))
+    return hasPhys(Hint.second);
+  return false;
+}
+
+int VirtRegMap::assignVirt2StackSlot(unsigned virtReg) {
+  assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+  assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+         "attempt to assign stack slot to already spilled register");
+  const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg);
+  return Virt2StackSlotMap[virtReg] = createSpillSlot(RC);
+}
+
+void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) {
+  assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+  assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+         "attempt to assign stack slot to already spilled register");
+  assert((SS >= 0 ||
+          (SS >= MF->getFrameInfo()->getObjectIndexBegin())) &&
+         "illegal fixed frame index");
+  Virt2StackSlotMap[virtReg] = SS;
+}
+
+void VirtRegMap::print(raw_ostream &OS, const Module*) const {
+  OS << "********** REGISTER MAP **********\n";
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) {
+      OS << '[' << PrintReg(Reg, TRI) << " -> "
+         << PrintReg(Virt2PhysMap[Reg], TRI) << "] "
+         << MRI->getRegClass(Reg)->getName() << "\n";
+    }
+  }
+
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) {
+      OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg]
+         << "] " << MRI->getRegClass(Reg)->getName() << "\n";
+    }
+  }
+  OS << '\n';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VirtRegMap::dump() const {
+  print(dbgs());
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+//                              VirtRegRewriter
+//===----------------------------------------------------------------------===//
+//
+// The VirtRegRewriter is the last of the register allocator passes.
+// It rewrites virtual registers to physical registers as specified in the
+// VirtRegMap analysis. It also updates live-in information on basic blocks
+// according to LiveIntervals.
+//
+namespace {
+class VirtRegRewriter : public MachineFunctionPass {
+  MachineFunction *MF;
+  const TargetMachine *TM;
+  const TargetRegisterInfo *TRI;
+  const TargetInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  SlotIndexes *Indexes;
+  LiveIntervals *LIS;
+  VirtRegMap *VRM;
+
+  void rewrite();
+  void addMBBLiveIns();
+public:
+  static char ID;
+  VirtRegRewriter() : MachineFunctionPass(ID) {}
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  virtual bool runOnMachineFunction(MachineFunction&);
+};
+} // end anonymous namespace
+
+char &llvm::VirtRegRewriterID = VirtRegRewriter::ID;
+
+INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter",
+                      "Virtual Register Rewriter", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter",
+                    "Virtual Register Rewriter", false, false)
+
+char VirtRegRewriter::ID = 0;
+
+void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<LiveIntervals>();
+  AU.addRequired<SlotIndexes>();
+  AU.addPreserved<SlotIndexes>();
+  AU.addRequired<LiveDebugVariables>();
+  AU.addRequired<LiveStacks>();
+  AU.addPreserved<LiveStacks>();
+  AU.addRequired<VirtRegMap>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
+  MF = &fn;
+  TM = &MF->getTarget();
+  TRI = TM->getRegisterInfo();
+  TII = TM->getInstrInfo();
+  MRI = &MF->getRegInfo();
+  Indexes = &getAnalysis<SlotIndexes>();
+  LIS = &getAnalysis<LiveIntervals>();
+  VRM = &getAnalysis<VirtRegMap>();
+  DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
+               << "********** Function: "
+               << MF->getName() << '\n');
+  DEBUG(VRM->dump());
+
+  // Add kill flags while we still have virtual registers.
+  LIS->addKillFlags(VRM);
+
+  // Live-in lists on basic blocks are required for physregs.
+  addMBBLiveIns();
+
+  // Rewrite virtual registers.
+  rewrite();
+
+  // Write out new DBG_VALUE instructions.
+  getAnalysis<LiveDebugVariables>().emitDebugValues(VRM);
+
+  // All machine operands and other references to virtual registers have been
+  // replaced. Remove the virtual registers and release all the transient data.
+  VRM->clearAllVirt();
+  MRI->clearVirtRegs();
+  return true;
+}
+
+// Compute MBB live-in lists from virtual register live ranges and their
+// assignments.
+void VirtRegRewriter::addMBBLiveIns() {
+  SmallVector<MachineBasicBlock*, 16> LiveIn;
+  for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+    unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx);
+    if (MRI->reg_nodbg_empty(VirtReg))
+      continue;
+    LiveInterval &LI = LIS->getInterval(VirtReg);
+    if (LI.empty() || LIS->intervalIsInOneMBB(LI))
+      continue;
+    // This is a virtual register that is live across basic blocks. Its
+    // assigned PhysReg must be marked as live-in to those blocks.
+    unsigned PhysReg = VRM->getPhys(VirtReg);
+    assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register.");
+
+    // Scan the segments of LI.
+    for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I != E;
+         ++I) {
+      if (!Indexes->findLiveInMBBs(I->start, I->end, LiveIn))
+        continue;
+      for (unsigned i = 0, e = LiveIn.size(); i != e; ++i)
+        if (!LiveIn[i]->isLiveIn(PhysReg))
+          LiveIn[i]->addLiveIn(PhysReg);
+      LiveIn.clear();
+    }
+  }
+}
+
+void VirtRegRewriter::rewrite() {
+  SmallVector<unsigned, 8> SuperDeads;
+  SmallVector<unsigned, 8> SuperDefs;
+  SmallVector<unsigned, 8> SuperKills;
+
+  for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+       MBBI != MBBE; ++MBBI) {
+    DEBUG(MBBI->print(dbgs(), Indexes));
+    for (MachineBasicBlock::instr_iterator
+           MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
+      MachineInstr *MI = MII;
+      ++MII;
+
+      for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+           MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+        MachineOperand &MO = *MOI;
+
+        // Make sure MRI knows about registers clobbered by regmasks.
+        if (MO.isRegMask())
+          MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
+
+        if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+          continue;
+        unsigned VirtReg = MO.getReg();
+        unsigned PhysReg = VRM->getPhys(VirtReg);
+        assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
+               "Instruction uses unmapped VirtReg");
+        assert(!MRI->isReserved(PhysReg) && "Reserved register assignment");
+
+        // Preserve semantics of sub-register operands.
+        if (MO.getSubReg()) {
+          // A virtual register kill refers to the whole register, so we may
+          // have to add <imp-use,kill> operands for the super-register.  A
+          // partial redef always kills and redefines the super-register.
+          if (MO.readsReg() && (MO.isDef() || MO.isKill()))
+            SuperKills.push_back(PhysReg);
+
+          if (MO.isDef()) {
+            // The <def,undef> flag only makes sense for sub-register defs, and
+            // we are substituting a full physreg.  An <imp-use,kill> operand
+            // from the SuperKills list will represent the partial read of the
+            // super-register.
+            MO.setIsUndef(false);
+
+            // Also add implicit defs for the super-register.
+            if (MO.isDead())
+              SuperDeads.push_back(PhysReg);
+            else
+              SuperDefs.push_back(PhysReg);
+          }
+
+          // PhysReg operands cannot have subregister indexes.
+          PhysReg = TRI->getSubReg(PhysReg, MO.getSubReg());
+          assert(PhysReg && "Invalid SubReg for physical register");
+          MO.setSubReg(0);
+        }
+        // Rewrite. Note we could have used MachineOperand::substPhysReg(), but
+        // we need the inlining here.
+        MO.setReg(PhysReg);
+      }
+
+      // Add any missing super-register kills after rewriting the whole
+      // instruction.
+      while (!SuperKills.empty())
+        MI->addRegisterKilled(SuperKills.pop_back_val(), TRI, true);
+
+      while (!SuperDeads.empty())
+        MI->addRegisterDead(SuperDeads.pop_back_val(), TRI, true);
+
+      while (!SuperDefs.empty())
+        MI->addRegisterDefined(SuperDefs.pop_back_val(), TRI);
+
+      DEBUG(dbgs() << "> " << *MI);
+
+      // Finally, remove any identity copies.
+      if (MI->isIdentityCopy()) {
+        ++NumIdCopies;
+        if (MI->getNumOperands() == 2) {
+          DEBUG(dbgs() << "Deleting identity copy.\n");
+          if (Indexes)
+            Indexes->removeMachineInstrFromMaps(MI);
+          // It's safe to erase MI because MII has already been incremented.
+          MI->eraseFromParent();
+        } else {
+          // Transform identity copy to a KILL to deal with subregisters.
+          MI->setDesc(TII->get(TargetOpcode::KILL));
+          DEBUG(dbgs() << "Identity copy: " << *MI);
+        }
+      }
+    }
+  }
+
+  // Tell MRI about physical registers in use.
+  for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg)
+    if (!MRI->reg_nodbg_empty(Reg))
+      MRI->setPhysRegUsed(Reg);
+}
diff --git a/contrib/llvm/lib/DebugInfo/DIContext.cpp b/contrib/llvm/lib/DebugInfo/DIContext.cpp
new file mode 100644
index 000000000000..49a44097d3e2
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DIContext.cpp
@@ -0,0 +1,18 @@
+//===-- DIContext.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DIContext.h"
+#include "DWARFContext.h"
+using namespace llvm;
+
+DIContext::~DIContext() {}
+
+DIContext *DIContext::getDWARFContext(object::ObjectFile *Obj) {
+  return new DWARFContextInMemory(Obj);
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp b/contrib/llvm/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
new file mode 100644
index 000000000000..2de62ab9380d
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
@@ -0,0 +1,83 @@
+//===-- DWARFAbbreviationDeclaration.cpp ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFAbbreviationDeclaration.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace dwarf;
+
+bool
+DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr){
+  return extract(data, offset_ptr, data.getULEB128(offset_ptr));
+}
+
+bool
+DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr,
+                                      uint32_t code) {
+  Code = code;
+  Attribute.clear();
+  if (Code) {
+    Tag = data.getULEB128(offset_ptr);
+    HasChildren = data.getU8(offset_ptr);
+
+    while (data.isValidOffset(*offset_ptr)) {
+      uint16_t attr = data.getULEB128(offset_ptr);
+      uint16_t form = data.getULEB128(offset_ptr);
+
+      if (attr && form)
+        Attribute.push_back(DWARFAttribute(attr, form));
+      else
+        break;
+    }
+
+    return Tag != 0;
+  } else {
+    Tag = 0;
+    HasChildren = false;
+  }
+
+  return false;
+}
+
+void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
+  const char *tagString = TagString(getTag());
+  OS << '[' << getCode() << "] ";
+  if (tagString)
+    OS << tagString;
+  else
+    OS << format("DW_TAG_Unknown_%x", getTag());
+  OS << "\tDW_CHILDREN_" << (hasChildren() ? "yes" : "no") << '\n';
+  for (unsigned i = 0, e = Attribute.size(); i != e; ++i) {
+    OS << '\t';
+    const char *attrString = AttributeString(Attribute[i].getAttribute());
+    if (attrString)
+      OS << attrString;
+    else
+      OS << format("DW_AT_Unknown_%x", Attribute[i].getAttribute());
+    OS << '\t';
+    const char *formString = FormEncodingString(Attribute[i].getForm());
+    if (formString)
+      OS << formString;
+    else
+      OS << format("DW_FORM_Unknown_%x", Attribute[i].getForm());
+    OS << '\n';
+  }
+  OS << '\n';
+}
+
+uint32_t
+DWARFAbbreviationDeclaration::findAttributeIndex(uint16_t attr) const {
+  for (uint32_t i = 0, e = Attribute.size(); i != e; ++i) {
+    if (Attribute[i].getAttribute() == attr)
+      return i;
+  }
+  return -1U;
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFAbbreviationDeclaration.h b/contrib/llvm/lib/DebugInfo/DWARFAbbreviationDeclaration.h
new file mode 100644
index 000000000000..9a3fcd8a783c
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFAbbreviationDeclaration.h
@@ -0,0 +1,54 @@
+//===-- DWARFAbbreviationDeclaration.h --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
+#define LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
+
+#include "DWARFAttribute.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataExtractor.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFAbbreviationDeclaration {
+  uint32_t Code;
+  uint32_t Tag;
+  bool HasChildren;
+  SmallVector<DWARFAttribute, 8> Attribute;
+public:
+  enum { InvalidCode = 0 };
+  DWARFAbbreviationDeclaration()
+    : Code(InvalidCode), Tag(0), HasChildren(0) {}
+
+  uint32_t getCode() const { return Code; }
+  uint32_t getTag() const { return Tag; }
+  bool hasChildren() const { return HasChildren; }
+  uint32_t getNumAttributes() const { return Attribute.size(); }
+  uint16_t getAttrByIndex(uint32_t idx) const {
+    return Attribute.size() > idx ? Attribute[idx].getAttribute() : 0;
+  }
+  uint16_t getFormByIndex(uint32_t idx) const {
+    return Attribute.size() > idx ? Attribute[idx].getForm() : 0;
+  }
+
+  uint32_t findAttributeIndex(uint16_t attr) const;
+  bool extract(DataExtractor data, uint32_t* offset_ptr);
+  bool extract(DataExtractor data, uint32_t* offset_ptr, uint32_t code);
+  bool isValid() const { return Code != 0 && Tag != 0; }
+  void dump(raw_ostream &OS) const;
+  const SmallVectorImpl<DWARFAttribute> &getAttributes() const {
+    return Attribute;
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFAttribute.h b/contrib/llvm/lib/DebugInfo/DWARFAttribute.h
new file mode 100644
index 000000000000..6f49b637c6a6
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFAttribute.h
@@ -0,0 +1,30 @@
+//===-- DWARFAttribute.h ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFATTRIBUTE_H
+#define LLVM_DEBUGINFO_DWARFATTRIBUTE_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class DWARFAttribute {
+  uint16_t Attribute;
+  uint16_t Form;
+  public:
+  DWARFAttribute(uint16_t attr, uint16_t form)
+    : Attribute(attr), Form(form) {}
+
+  uint16_t getAttribute() const { return Attribute; }
+  uint16_t getForm() const { return Form; }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFCompileUnit.cpp b/contrib/llvm/lib/DebugInfo/DWARFCompileUnit.cpp
new file mode 100644
index 000000000000..e3e4ccd7d9e1
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFCompileUnit.cpp
@@ -0,0 +1,272 @@
+//===-- DWARFCompileUnit.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFCompileUnit.h"
+#include "DWARFContext.h"
+#include "DWARFFormValue.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace dwarf;
+
+DataExtractor DWARFCompileUnit::getDebugInfoExtractor() const {
+  return DataExtractor(InfoSection, isLittleEndian, AddrSize);
+}
+
+bool DWARFCompileUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
+  clear();
+
+  Offset = *offset_ptr;
+
+  if (debug_info.isValidOffset(*offset_ptr)) {
+    uint64_t abbrOffset;
+    Length = debug_info.getU32(offset_ptr);
+    Version = debug_info.getU16(offset_ptr);
+    abbrOffset = debug_info.getU32(offset_ptr);
+    AddrSize = debug_info.getU8(offset_ptr);
+
+    bool lengthOK = debug_info.isValidOffset(getNextCompileUnitOffset()-1);
+    bool versionOK = DWARFContext::isSupportedVersion(Version);
+    bool abbrOffsetOK = AbbrevSection.size() > abbrOffset;
+    bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
+
+    if (lengthOK && versionOK && addrSizeOK && abbrOffsetOK && Abbrev != NULL) {
+      Abbrevs = Abbrev->getAbbreviationDeclarationSet(abbrOffset);
+      return true;
+    }
+
+    // reset the offset to where we tried to parse from if anything went wrong
+    *offset_ptr = Offset;
+  }
+
+  return false;
+}
+
+uint32_t
+DWARFCompileUnit::extract(uint32_t offset, DataExtractor debug_info_data,
+                          const DWARFAbbreviationDeclarationSet *abbrevs) {
+  clear();
+
+  Offset = offset;
+
+  if (debug_info_data.isValidOffset(offset)) {
+    Length = debug_info_data.getU32(&offset);
+    Version = debug_info_data.getU16(&offset);
+    bool abbrevsOK = debug_info_data.getU32(&offset) == abbrevs->getOffset();
+    Abbrevs = abbrevs;
+    AddrSize = debug_info_data.getU8(&offset);
+
+    bool versionOK = DWARFContext::isSupportedVersion(Version);
+    bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
+
+    if (versionOK && addrSizeOK && abbrevsOK &&
+        debug_info_data.isValidOffset(offset))
+      return offset;
+  }
+  return 0;
+}
+
+bool DWARFCompileUnit::extractRangeList(uint32_t RangeListOffset,
+                                        DWARFDebugRangeList &RangeList) const {
+  // Require that compile unit is extracted.
+  assert(DieArray.size() > 0);
+  DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
+  return RangeList.extract(RangesData, &RangeListOffset);
+}
+
+void DWARFCompileUnit::clear() {
+  Offset = 0;
+  Length = 0;
+  Version = 0;
+  Abbrevs = 0;
+  AddrSize = 0;
+  BaseAddr = 0;
+  clearDIEs(false);
+}
+
+void DWARFCompileUnit::dump(raw_ostream &OS) {
+  OS << format("0x%08x", Offset) << ": Compile Unit:"
+     << " length = " << format("0x%08x", Length)
+     << " version = " << format("0x%04x", Version)
+     << " abbr_offset = " << format("0x%04x", Abbrevs->getOffset())
+     << " addr_size = " << format("0x%02x", AddrSize)
+     << " (next CU at " << format("0x%08x", getNextCompileUnitOffset())
+     << ")\n";
+
+  const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
+  assert(CU && "Null Compile Unit?");
+  CU->dump(OS, this, -1U);
+}
+
+const char *DWARFCompileUnit::getCompilationDir() {
+  extractDIEsIfNeeded(true);
+  if (DieArray.empty())
+    return 0;
+  return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
+}
+
+void DWARFCompileUnit::setDIERelations() {
+  if (DieArray.empty())
+    return;
+  DWARFDebugInfoEntryMinimal *die_array_begin = &DieArray.front();
+  DWARFDebugInfoEntryMinimal *die_array_end = &DieArray.back();
+  DWARFDebugInfoEntryMinimal *curr_die;
+  // We purposely are skipping the last element in the array in the loop below
+  // so that we can always have a valid next item
+  for (curr_die = die_array_begin; curr_die < die_array_end; ++curr_die) {
+    // Since our loop doesn't include the last element, we can always
+    // safely access the next die in the array.
+    DWARFDebugInfoEntryMinimal *next_die = curr_die + 1;
+
+    const DWARFAbbreviationDeclaration *curr_die_abbrev =
+      curr_die->getAbbreviationDeclarationPtr();
+
+    if (curr_die_abbrev) {
+      // Normal DIE
+      if (curr_die_abbrev->hasChildren())
+        next_die->setParent(curr_die);
+      else
+        curr_die->setSibling(next_die);
+    } else {
+      // NULL DIE that terminates a sibling chain
+      DWARFDebugInfoEntryMinimal *parent = curr_die->getParent();
+      if (parent)
+        parent->setSibling(next_die);
+    }
+  }
+
+  // Since we skipped the last element, we need to fix it up!
+  if (die_array_begin < die_array_end)
+    curr_die->setParent(die_array_begin);
+}
+
+size_t DWARFCompileUnit::extractDIEsIfNeeded(bool cu_die_only) {
+  const size_t initial_die_array_size = DieArray.size();
+  if ((cu_die_only && initial_die_array_size > 0) ||
+      initial_die_array_size > 1)
+    return 0; // Already parsed
+
+  // Set the offset to that of the first DIE and calculate the start of the
+  // next compilation unit header.
+  uint32_t offset = getFirstDIEOffset();
+  uint32_t next_cu_offset = getNextCompileUnitOffset();
+
+  DWARFDebugInfoEntryMinimal die;
+  // Keep a flat array of the DIE for binary lookup by DIE offset
+  uint32_t depth = 0;
+  // We are in our compile unit, parse starting at the offset
+  // we were told to parse
+
+  const uint8_t *fixed_form_sizes =
+    DWARFFormValue::getFixedFormSizesForAddressSize(getAddressByteSize());
+
+  while (offset < next_cu_offset &&
+         die.extractFast(this, fixed_form_sizes, &offset)) {
+
+    if (depth == 0) {
+      uint64_t base_addr =
+        die.getAttributeValueAsUnsigned(this, DW_AT_low_pc, -1U);
+      if (base_addr == -1U)
+        base_addr = die.getAttributeValueAsUnsigned(this, DW_AT_entry_pc, 0);
+      setBaseAddress(base_addr);
+    }
+
+    if (cu_die_only) {
+      addDIE(die);
+      return 1;
+    }
+    else if (depth == 0 && initial_die_array_size == 1)
+      // Don't append the CU die as we already did that
+      ;
+    else
+      addDIE(die);
+
+    const DWARFAbbreviationDeclaration *abbrDecl =
+      die.getAbbreviationDeclarationPtr();
+    if (abbrDecl) {
+      // Normal DIE
+      if (abbrDecl->hasChildren())
+        ++depth;
+    } else {
+      // NULL DIE.
+      if (depth > 0)
+        --depth;
+      if (depth == 0)
+        break;  // We are done with this compile unit!
+    }
+
+  }
+
+  // Give a little bit of info if we encounter corrupt DWARF (our offset
+  // should always terminate at or before the start of the next compilation
+  // unit header).
+  if (offset > next_cu_offset)
+    fprintf(stderr, "warning: DWARF compile unit extends beyond its "
+                    "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), offset);
+
+  setDIERelations();
+  return DieArray.size();
+}
+
+void DWARFCompileUnit::clearDIEs(bool keep_compile_unit_die) {
+  if (DieArray.size() > (unsigned)keep_compile_unit_die) {
+    // std::vectors never get any smaller when resized to a smaller size,
+    // or when clear() or erase() are called, the size will report that it
+    // is smaller, but the memory allocated remains intact (call capacity()
+    // to see this). So we need to create a temporary vector and swap the
+    // contents which will cause just the internal pointers to be swapped
+    // so that when "tmp_array" goes out of scope, it will destroy the
+    // contents.
+
+    // Save at least the compile unit DIE
+    std::vector<DWARFDebugInfoEntryMinimal> tmpArray;
+    DieArray.swap(tmpArray);
+    if (keep_compile_unit_die)
+      DieArray.push_back(tmpArray.front());
+  }
+}
+
+void
+DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+                                         bool clear_dies_if_already_not_parsed){
+  // This function is usually called if there in no .debug_aranges section
+  // in order to produce a compile unit level set of address ranges that
+  // is accurate. If the DIEs weren't parsed, then we don't want all dies for
+  // all compile units to stay loaded when they weren't needed. So we can end
+  // up parsing the DWARF and then throwing them all away to keep memory usage
+  // down.
+  const bool clear_dies = extractDIEsIfNeeded(false) > 1 &&
+                          clear_dies_if_already_not_parsed;
+  DieArray[0].buildAddressRangeTable(this, debug_aranges);
+
+  // Keep memory down by clearing DIEs if this generate function
+  // caused them to be parsed.
+  if (clear_dies)
+    clearDIEs(true);
+}
+
+DWARFDebugInfoEntryMinimal::InlinedChain
+DWARFCompileUnit::getInlinedChainForAddress(uint64_t Address) {
+  // First, find a subprogram that contains the given address (the root
+  // of inlined chain).
+  extractDIEsIfNeeded(false);
+  const DWARFDebugInfoEntryMinimal *SubprogramDIE = 0;
+  for (size_t i = 0, n = DieArray.size(); i != n; i++) {
+    if (DieArray[i].isSubprogramDIE() &&
+        DieArray[i].addressRangeContainsAddress(this, Address)) {
+      SubprogramDIE = &DieArray[i];
+      break;
+    }
+  }
+  // Get inlined chain rooted at this subprogram DIE.
+  if (!SubprogramDIE)
+    return DWARFDebugInfoEntryMinimal::InlinedChain();
+  return SubprogramDIE->getInlinedChainForAddress(this, Address);
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFCompileUnit.h b/contrib/llvm/lib/DebugInfo/DWARFCompileUnit.h
new file mode 100644
index 000000000000..2a74605fcb2d
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFCompileUnit.h
@@ -0,0 +1,143 @@
+//===-- DWARFCompileUnit.h --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
+#define LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
+
+#include "DWARFDebugAbbrev.h"
+#include "DWARFDebugInfoEntry.h"
+#include "DWARFDebugRangeList.h"
+#include "DWARFRelocMap.h"
+#include <vector>
+
+namespace llvm {
+
+class DWARFDebugAbbrev;
+class StringRef;
+class raw_ostream;
+
+class DWARFCompileUnit {
+  const DWARFDebugAbbrev *Abbrev;
+  StringRef InfoSection;
+  StringRef AbbrevSection;
+  StringRef RangeSection;
+  StringRef StringSection;
+  StringRef StringOffsetSection;
+  StringRef AddrOffsetSection;
+  const RelocAddrMap *RelocMap;
+  bool isLittleEndian;
+
+  uint32_t Offset;
+  uint32_t Length;
+  uint16_t Version;
+  const DWARFAbbreviationDeclarationSet *Abbrevs;
+  uint8_t AddrSize;
+  uint64_t BaseAddr;
+  // The compile unit debug information entry item.
+  std::vector<DWARFDebugInfoEntryMinimal> DieArray;
+public:
+
+  DWARFCompileUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+                   StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+                   const RelocAddrMap *M, bool LE) :
+    Abbrev(DA), InfoSection(IS), AbbrevSection(AS),
+    RangeSection(RS), StringSection(SS), StringOffsetSection(SOS),
+    AddrOffsetSection(AOS), RelocMap(M), isLittleEndian(LE) {
+    clear();
+  }
+
+  StringRef getStringSection() const { return StringSection; }
+  StringRef getStringOffsetSection() const { return StringOffsetSection; }
+  StringRef getAddrOffsetSection() const { return AddrOffsetSection; }
+  const RelocAddrMap *getRelocMap() const { return RelocMap; }
+  DataExtractor getDebugInfoExtractor() const;
+
+  bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
+  uint32_t extract(uint32_t offset, DataExtractor debug_info_data,
+                   const DWARFAbbreviationDeclarationSet *abbrevs);
+
+  /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
+  /// hasn't already been done. Returns the number of DIEs parsed at this call.
+  size_t extractDIEsIfNeeded(bool cu_die_only);
+  /// extractRangeList - extracts the range list referenced by this compile
+  /// unit from .debug_ranges section. Returns true on success.
+  /// Requires that compile unit is already extracted.
+  bool extractRangeList(uint32_t RangeListOffset,
+                        DWARFDebugRangeList &RangeList) const;
+  void clear();
+  void dump(raw_ostream &OS);
+  uint32_t getOffset() const { return Offset; }
+  /// Size in bytes of the compile unit header.
+  uint32_t getSize() const { return 11; }
+  bool containsDIEOffset(uint32_t die_offset) const {
+    return die_offset >= getFirstDIEOffset() &&
+      die_offset < getNextCompileUnitOffset();
+  }
+  uint32_t getFirstDIEOffset() const { return Offset + getSize(); }
+  uint32_t getNextCompileUnitOffset() const { return Offset + Length + 4; }
+  /// Size in bytes of the .debug_info data associated with this compile unit.
+  size_t getDebugInfoSize() const { return Length + 4 - getSize(); }
+  uint32_t getLength() const { return Length; }
+  uint16_t getVersion() const { return Version; }
+  const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
+    return Abbrevs;
+  }
+  uint8_t getAddressByteSize() const { return AddrSize; }
+  uint64_t getBaseAddress() const { return BaseAddr; }
+
+  void setBaseAddress(uint64_t base_addr) {
+    BaseAddr = base_addr;
+  }
+
+  const DWARFDebugInfoEntryMinimal *
+  getCompileUnitDIE(bool extract_cu_die_only = true) {
+    extractDIEsIfNeeded(extract_cu_die_only);
+    if (DieArray.empty())
+      return NULL;
+    return &DieArray[0];
+  }
+
+  const char *getCompilationDir();
+
+  /// setDIERelations - We read in all of the DIE entries into our flat list
+  /// of DIE entries and now we need to go back through all of them and set the
+  /// parent, sibling and child pointers for quick DIE navigation.
+  void setDIERelations();
+
+  void addDIE(DWARFDebugInfoEntryMinimal &die) {
+    // The average bytes per DIE entry has been seen to be
+    // around 14-20 so lets pre-reserve the needed memory for
+    // our DIE entries accordingly. Search forward for "Compute
+    // average bytes per DIE" to see #if'ed out code that does
+    // that determination.
+
+    // Only reserve the memory if we are adding children of
+    // the main compile unit DIE. The compile unit DIE is always
+    // the first entry, so if our size is 1, then we are adding
+    // the first compile unit child DIE and should reserve
+    // the memory.
+    if (DieArray.empty())
+      DieArray.reserve(getDebugInfoSize() / 14);
+    DieArray.push_back(die);
+  }
+
+  void clearDIEs(bool keep_compile_unit_die);
+
+  void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+                              bool clear_dies_if_already_not_parsed);
+
+  /// getInlinedChainForAddress - fetches inlined chain for a given address.
+  /// Returns empty chain if there is no subprogram containing address.
+  DWARFDebugInfoEntryMinimal::InlinedChain getInlinedChainForAddress(
+      uint64_t Address);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFContext.cpp b/contrib/llvm/lib/DebugInfo/DWARFContext.cpp
new file mode 100644
index 000000000000..9e19310a99c0
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFContext.cpp
@@ -0,0 +1,596 @@
+//===-- DWARFContext.cpp --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFContext.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+using namespace dwarf;
+
+typedef DWARFDebugLine::LineTable DWARFLineTable;
+
+void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
+  if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
+    OS << ".debug_abbrev contents:\n";
+    getDebugAbbrev()->dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Info) {
+    OS << "\n.debug_info contents:\n";
+    for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i)
+      getCompileUnitAtIndex(i)->dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Frames) {
+    OS << "\n.debug_frame contents:\n";
+    getDebugFrame()->dump(OS);
+  }
+
+  uint32_t offset = 0;
+  if (DumpType == DIDT_All || DumpType == DIDT_Aranges) {
+    OS << "\n.debug_aranges contents:\n";
+    DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
+    DWARFDebugArangeSet set;
+    while (set.extract(arangesData, &offset))
+      set.dump(OS);
+  }
+
+  uint8_t savedAddressByteSize = 0;
+  if (DumpType == DIDT_All || DumpType == DIDT_Line) {
+    OS << "\n.debug_line contents:\n";
+    for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i) {
+      DWARFCompileUnit *cu = getCompileUnitAtIndex(i);
+      savedAddressByteSize = cu->getAddressByteSize();
+      unsigned stmtOffset =
+        cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
+                                                             -1U);
+      if (stmtOffset != -1U) {
+        DataExtractor lineData(getLineSection(), isLittleEndian(),
+                               savedAddressByteSize);
+        DWARFDebugLine::DumpingState state(OS);
+        DWARFDebugLine::parseStatementTable(lineData, &lineRelocMap(), &stmtOffset, state);
+      }
+    }
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Str) {
+    OS << "\n.debug_str contents:\n";
+    DataExtractor strData(getStringSection(), isLittleEndian(), 0);
+    offset = 0;
+    uint32_t strOffset = 0;
+    while (const char *s = strData.getCStr(&offset)) {
+      OS << format("0x%8.8x: \"%s\"\n", strOffset, s);
+      strOffset = offset;
+    }
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Ranges) {
+    OS << "\n.debug_ranges contents:\n";
+    // In fact, different compile units may have different address byte
+    // sizes, but for simplicity we just use the address byte size of the last
+    // compile unit (there is no easy and fast way to associate address range
+    // list and the compile unit it describes).
+    DataExtractor rangesData(getRangeSection(), isLittleEndian(),
+                             savedAddressByteSize);
+    offset = 0;
+    DWARFDebugRangeList rangeList;
+    while (rangeList.extract(rangesData, &offset))
+      rangeList.dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Pubnames) {
+    OS << "\n.debug_pubnames contents:\n";
+    DataExtractor pubNames(getPubNamesSection(), isLittleEndian(), 0);
+    offset = 0;
+    OS << "Length:                " << pubNames.getU32(&offset) << "\n";
+    OS << "Version:               " << pubNames.getU16(&offset) << "\n";
+    OS << "Offset in .debug_info: " << pubNames.getU32(&offset) << "\n";
+    OS << "Size:                  " << pubNames.getU32(&offset) << "\n";
+    OS << "\n  Offset    Name\n";
+    while (offset < getPubNamesSection().size()) {
+      uint32_t n = pubNames.getU32(&offset);
+      if (n == 0)
+        break;
+      OS << format("%8x    ", n);
+      OS << pubNames.getCStr(&offset) << "\n";
+    }
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_AbbrevDwo) {
+    OS << "\n.debug_abbrev.dwo contents:\n";
+    getDebugAbbrevDWO()->dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_InfoDwo) {
+    OS << "\n.debug_info.dwo contents:\n";
+    for (unsigned i = 0, e = getNumDWOCompileUnits(); i != e; ++i)
+      getDWOCompileUnitAtIndex(i)->dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_StrDwo) {
+    OS << "\n.debug_str.dwo contents:\n";
+    DataExtractor strDWOData(getStringDWOSection(), isLittleEndian(), 0);
+    offset = 0;
+    uint32_t strDWOOffset = 0;
+    while (const char *s = strDWOData.getCStr(&offset)) {
+      OS << format("0x%8.8x: \"%s\"\n", strDWOOffset, s);
+      strDWOOffset = offset;
+    }
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_StrOffsetsDwo) {
+    OS << "\n.debug_str_offsets.dwo contents:\n";
+    DataExtractor strOffsetExt(getStringOffsetDWOSection(), isLittleEndian(), 0);
+    offset = 0;
+    while (offset < getStringOffsetDWOSection().size()) {
+      OS << format("0x%8.8x: ", offset);
+      OS << format("%8.8x\n", strOffsetExt.getU32(&offset));
+    }
+  }
+}
+
+const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() {
+  if (Abbrev)
+    return Abbrev.get();
+
+  DataExtractor abbrData(getAbbrevSection(), isLittleEndian(), 0);
+
+  Abbrev.reset(new DWARFDebugAbbrev());
+  Abbrev->parse(abbrData);
+  return Abbrev.get();
+}
+
+const DWARFDebugAbbrev *DWARFContext::getDebugAbbrevDWO() {
+  if (AbbrevDWO)
+    return AbbrevDWO.get();
+
+  DataExtractor abbrData(getAbbrevDWOSection(), isLittleEndian(), 0);
+  AbbrevDWO.reset(new DWARFDebugAbbrev());
+  AbbrevDWO->parse(abbrData);
+  return AbbrevDWO.get();
+}
+
+const DWARFDebugAranges *DWARFContext::getDebugAranges() {
+  if (Aranges)
+    return Aranges.get();
+
+  DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
+
+  Aranges.reset(new DWARFDebugAranges());
+  Aranges->extract(arangesData);
+  // Generate aranges from DIEs: even if .debug_aranges section is present,
+  // it may describe only a small subset of compilation units, so we need to
+  // manually build aranges for the rest of them.
+  Aranges->generate(this);
+  return Aranges.get();
+}
+
+const DWARFDebugFrame *DWARFContext::getDebugFrame() {
+  if (DebugFrame)
+    return DebugFrame.get();
+
+  // There's a "bug" in the DWARFv3 standard with respect to the target address
+  // size within debug frame sections. While DWARF is supposed to be independent
+  // of its container, FDEs have fields with size being "target address size",
+  // which isn't specified in DWARF in general. It's only specified for CUs, but
+  // .eh_frame can appear without a .debug_info section. Follow the example of
+  // other tools (libdwarf) and extract this from the container (ObjectFile
+  // provides this information). This problem is fixed in DWARFv4
+  // See this dwarf-discuss discussion for more details:
+  // http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html
+  DataExtractor debugFrameData(getDebugFrameSection(), isLittleEndian(),
+                               getAddressSize());
+  DebugFrame.reset(new DWARFDebugFrame());
+  DebugFrame->parse(debugFrameData);
+  return DebugFrame.get();
+}
+
+const DWARFLineTable *
+DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
+  if (!Line)
+    Line.reset(new DWARFDebugLine(&lineRelocMap()));
+
+  unsigned stmtOffset =
+    cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
+                                                         -1U);
+  if (stmtOffset == -1U)
+    return 0; // No line table for this compile unit.
+
+  // See if the line table is cached.
+  if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset))
+    return lt;
+
+  // We have to parse it first.
+  DataExtractor lineData(getLineSection(), isLittleEndian(),
+                         cu->getAddressByteSize());
+  return Line->getOrParseLineTable(lineData, stmtOffset);
+}
+
+void DWARFContext::parseCompileUnits() {
+  uint32_t offset = 0;
+  const DataExtractor &DIData = DataExtractor(getInfoSection(),
+                                              isLittleEndian(), 0);
+  while (DIData.isValidOffset(offset)) {
+    CUs.push_back(DWARFCompileUnit(getDebugAbbrev(), getInfoSection(),
+                                   getAbbrevSection(), getRangeSection(),
+                                   getStringSection(), StringRef(),
+                                   getAddrSection(),
+                                   &infoRelocMap(),
+                                   isLittleEndian()));
+    if (!CUs.back().extract(DIData, &offset)) {
+      CUs.pop_back();
+      break;
+    }
+
+    offset = CUs.back().getNextCompileUnitOffset();
+  }
+}
+
+void DWARFContext::parseDWOCompileUnits() {
+  uint32_t offset = 0;
+  const DataExtractor &DIData = DataExtractor(getInfoDWOSection(),
+                                              isLittleEndian(), 0);
+  while (DIData.isValidOffset(offset)) {
+    DWOCUs.push_back(DWARFCompileUnit(getDebugAbbrevDWO(), getInfoDWOSection(),
+                                      getAbbrevDWOSection(),
+                                      getRangeDWOSection(),
+                                      getStringDWOSection(),
+                                      getStringOffsetDWOSection(),
+                                      getAddrSection(),
+                                      &infoDWORelocMap(),
+                                      isLittleEndian()));
+    if (!DWOCUs.back().extract(DIData, &offset)) {
+      DWOCUs.pop_back();
+      break;
+    }
+
+    offset = DWOCUs.back().getNextCompileUnitOffset();
+  }
+}
+
+namespace {
+  struct OffsetComparator {
+    bool operator()(const DWARFCompileUnit &LHS,
+                    const DWARFCompileUnit &RHS) const {
+      return LHS.getOffset() < RHS.getOffset();
+    }
+    bool operator()(const DWARFCompileUnit &LHS, uint32_t RHS) const {
+      return LHS.getOffset() < RHS;
+    }
+    bool operator()(uint32_t LHS, const DWARFCompileUnit &RHS) const {
+      return LHS < RHS.getOffset();
+    }
+  };
+}
+
+DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
+  if (CUs.empty())
+    parseCompileUnits();
+
+  DWARFCompileUnit *CU = std::lower_bound(CUs.begin(), CUs.end(), Offset,
+                                          OffsetComparator());
+  if (CU != CUs.end())
+    return &*CU;
+  return 0;
+}
+
+DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
+  // First, get the offset of the compile unit.
+  uint32_t CUOffset = getDebugAranges()->findAddress(Address);
+  // Retrieve the compile unit.
+  return getCompileUnitForOffset(CUOffset);
+}
+
+static bool getFileNameForCompileUnit(DWARFCompileUnit *CU,
+                                      const DWARFLineTable *LineTable,
+                                      uint64_t FileIndex,
+                                      bool NeedsAbsoluteFilePath,
+                                      std::string &FileName) {
+  if (CU == 0 ||
+      LineTable == 0 ||
+      !LineTable->getFileNameByIndex(FileIndex, NeedsAbsoluteFilePath,
+                                     FileName))
+    return false;
+  if (NeedsAbsoluteFilePath && sys::path::is_relative(FileName)) {
+    // We may still need to append compilation directory of compile unit.
+    SmallString<16> AbsolutePath;
+    if (const char *CompilationDir = CU->getCompilationDir()) {
+      sys::path::append(AbsolutePath, CompilationDir);
+    }
+    sys::path::append(AbsolutePath, FileName);
+    FileName = AbsolutePath.str();
+  }
+  return true;
+}
+
+static bool getFileLineInfoForCompileUnit(DWARFCompileUnit *CU,
+                                          const DWARFLineTable *LineTable,
+                                          uint64_t Address,
+                                          bool NeedsAbsoluteFilePath,
+                                          std::string &FileName,
+                                          uint32_t &Line, uint32_t &Column) {
+  if (CU == 0 || LineTable == 0)
+    return false;
+  // Get the index of row we're looking for in the line table.
+  uint32_t RowIndex = LineTable->lookupAddress(Address);
+  if (RowIndex == -1U)
+    return false;
+  // Take file number and line/column from the row.
+  const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
+  if (!getFileNameForCompileUnit(CU, LineTable, Row.File,
+                                 NeedsAbsoluteFilePath, FileName))
+    return false;
+  Line = Row.Line;
+  Column = Row.Column;
+  return true;
+}
+
+DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
+    DILineInfoSpecifier Specifier) {
+  DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+  if (!CU)
+    return DILineInfo();
+  std::string FileName = "<invalid>";
+  std::string FunctionName = "<invalid>";
+  uint32_t Line = 0;
+  uint32_t Column = 0;
+  if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
+    // The address may correspond to instruction in some inlined function,
+    // so we have to build the chain of inlined functions and take the
+    // name of the topmost function in it.
+    const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+        CU->getInlinedChainForAddress(Address);
+    if (InlinedChain.size() > 0) {
+      const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain[0];
+      if (const char *Name = TopFunctionDIE.getSubroutineName(CU))
+        FunctionName = Name;
+    }
+  }
+  if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
+    const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU);
+    const bool NeedsAbsoluteFilePath =
+        Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath);
+    getFileLineInfoForCompileUnit(CU, LineTable, Address,
+                                  NeedsAbsoluteFilePath,
+                                  FileName, Line, Column);
+  }
+  return DILineInfo(StringRef(FileName), StringRef(FunctionName),
+                    Line, Column);
+}
+
+DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address,
+    uint64_t Size,
+    DILineInfoSpecifier Specifier) {
+  DILineInfoTable  Lines;
+  DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+  if (!CU)
+    return Lines;
+
+  std::string FunctionName = "<invalid>";
+  if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
+    // The address may correspond to instruction in some inlined function,
+    // so we have to build the chain of inlined functions and take the
+    // name of the topmost function in it.
+    const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+        CU->getInlinedChainForAddress(Address);
+    if (InlinedChain.size() > 0) {
+      const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain[0];
+      if (const char *Name = TopFunctionDIE.getSubroutineName(CU))
+        FunctionName = Name;
+    }
+  }
+
+  StringRef  FuncNameRef = StringRef(FunctionName);
+
+  // If the Specifier says we don't need FileLineInfo, just
+  // return the top-most function at the starting address.
+  if (!Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
+    Lines.push_back(std::make_pair(Address, 
+                                   DILineInfo(StringRef("<invalid>"), 
+                                              FuncNameRef, 0, 0)));
+    return Lines;
+  }
+
+  const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU);
+  const bool NeedsAbsoluteFilePath =
+      Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath);
+
+  // Get the index of row we're looking for in the line table.
+  std::vector<uint32_t> RowVector;
+  if (!LineTable->lookupAddressRange(Address, Size, RowVector))
+    return Lines;
+
+  uint32_t NumRows = RowVector.size();
+  for (uint32_t i = 0; i < NumRows; ++i) {
+    uint32_t RowIndex = RowVector[i];
+    // Take file number and line/column from the row.
+    const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
+    std::string FileName = "<invalid>";
+    getFileNameForCompileUnit(CU, LineTable, Row.File,
+                              NeedsAbsoluteFilePath, FileName);
+    Lines.push_back(std::make_pair(Row.Address, 
+                                   DILineInfo(StringRef(FileName),
+                                         FuncNameRef, Row.Line, Row.Column)));
+  }
+
+  return Lines;
+}
+
+DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address,
+    DILineInfoSpecifier Specifier) {
+  DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+  if (!CU)
+    return DIInliningInfo();
+
+  const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+      CU->getInlinedChainForAddress(Address);
+  if (InlinedChain.size() == 0)
+    return DIInliningInfo();
+
+  DIInliningInfo InliningInfo;
+  uint32_t CallFile = 0, CallLine = 0, CallColumn = 0;
+  const DWARFLineTable *LineTable = 0;
+  for (uint32_t i = 0, n = InlinedChain.size(); i != n; i++) {
+    const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain[i];
+    std::string FileName = "<invalid>";
+    std::string FunctionName = "<invalid>";
+    uint32_t Line = 0;
+    uint32_t Column = 0;
+    // Get function name if necessary.
+    if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
+      if (const char *Name = FunctionDIE.getSubroutineName(CU))
+        FunctionName = Name;
+    }
+    if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
+      const bool NeedsAbsoluteFilePath =
+          Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath);
+      if (i == 0) {
+        // For the topmost frame, initialize the line table of this
+        // compile unit and fetch file/line info from it.
+        LineTable = getLineTableForCompileUnit(CU);
+        // For the topmost routine, get file/line info from line table.
+        getFileLineInfoForCompileUnit(CU, LineTable, Address,
+                                      NeedsAbsoluteFilePath,
+                                      FileName, Line, Column);
+      } else {
+        // Otherwise, use call file, call line and call column from
+        // previous DIE in inlined chain.
+        getFileNameForCompileUnit(CU, LineTable, CallFile,
+                                  NeedsAbsoluteFilePath, FileName);
+        Line = CallLine;
+        Column = CallColumn;
+      }
+      // Get call file/line/column of a current DIE.
+      if (i + 1 < n) {
+        FunctionDIE.getCallerFrame(CU, CallFile, CallLine, CallColumn);
+      }
+    }
+    DILineInfo Frame(StringRef(FileName), StringRef(FunctionName),
+                     Line, Column);
+    InliningInfo.addFrame(Frame);
+  }
+  return InliningInfo;
+}
+
+DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) :
+  IsLittleEndian(Obj->isLittleEndian()),
+  AddressSize(Obj->getBytesInAddress()) {
+  error_code ec;
+  for (object::section_iterator i = Obj->begin_sections(),
+         e = Obj->end_sections();
+       i != e; i.increment(ec)) {
+    StringRef name;
+    i->getName(name);
+    StringRef data;
+    i->getContents(data);
+
+    name = name.substr(name.find_first_not_of("._")); // Skip . and _ prefixes.
+    if (name == "debug_info")
+      InfoSection = data;
+    else if (name == "debug_abbrev")
+      AbbrevSection = data;
+    else if (name == "debug_line")
+      LineSection = data;
+    else if (name == "debug_aranges")
+      ARangeSection = data;
+    else if (name == "debug_frame")
+      DebugFrameSection = data;
+    else if (name == "debug_str")
+      StringSection = data;
+    else if (name == "debug_ranges") {
+      // FIXME: Use the other dwo range section when we emit it.
+      RangeDWOSection = data;
+      RangeSection = data;
+    }
+    else if (name == "debug_pubnames")
+      PubNamesSection = data;
+    else if (name == "debug_info.dwo")
+      InfoDWOSection = data;
+    else if (name == "debug_abbrev.dwo")
+      AbbrevDWOSection = data;
+    else if (name == "debug_str.dwo")
+      StringDWOSection = data;
+    else if (name == "debug_str_offsets.dwo")
+      StringOffsetDWOSection = data;
+    else if (name == "debug_addr")
+      AddrSection = data;
+    // Any more debug info sections go here.
+    else
+      continue;
+
+    // TODO: Add support for relocations in other sections as needed.
+    // Record relocations for the debug_info and debug_line sections.
+    RelocAddrMap *Map;
+    if (name == "debug_info")
+      Map = &InfoRelocMap;
+    else if (name == "debug_info.dwo")
+      Map = &InfoDWORelocMap;
+    else if (name == "debug_line")
+      Map = &LineRelocMap;
+    else
+      continue;
+
+    if (i->begin_relocations() != i->end_relocations()) {
+      uint64_t SectionSize;
+      i->getSize(SectionSize);
+      for (object::relocation_iterator reloc_i = i->begin_relocations(),
+             reloc_e = i->end_relocations();
+           reloc_i != reloc_e; reloc_i.increment(ec)) {
+        uint64_t Address;
+        reloc_i->getAddress(Address);
+        uint64_t Type;
+        reloc_i->getType(Type);
+        uint64_t SymAddr = 0;
+        // ELF relocations may need the symbol address
+        if (Obj->isELF()) {
+          object::SymbolRef Sym;
+          reloc_i->getSymbol(Sym);
+          Sym.getAddress(SymAddr);
+        }
+
+        object::RelocVisitor V(Obj->getFileFormatName());
+        // The section address is always 0 for debug sections.
+        object::RelocToApply R(V.visit(Type, *reloc_i, 0, SymAddr));
+        if (V.error()) {
+          SmallString<32> Name;
+          error_code ec(reloc_i->getTypeName(Name));
+          if (ec) {
+            errs() << "Aaaaaa! Nameless relocation! Aaaaaa!\n";
+          }
+          errs() << "error: failed to compute relocation: "
+                 << Name << "\n";
+          continue;
+        }
+
+        if (Address + R.Width > SectionSize) {
+          errs() << "error: " << R.Width << "-byte relocation starting "
+                 << Address << " bytes into section " << name << " which is "
+                 << SectionSize << " bytes long.\n";
+          continue;
+        }
+        if (R.Width > 8) {
+          errs() << "error: can't handle a relocation of more than 8 bytes at "
+                    "a time.\n";
+          continue;
+        }
+        DEBUG(dbgs() << "Writing " << format("%p", R.Value)
+                     << " at " << format("%p", Address)
+                     << " with width " << format("%d", R.Width)
+                     << "\n");
+        Map->insert(std::make_pair(Address, std::make_pair(R.Width, R.Value)));
+      }
+    }
+  }
+}
+
+void DWARFContextInMemory::anchor() { }
diff --git a/contrib/llvm/lib/DebugInfo/DWARFContext.h b/contrib/llvm/lib/DebugInfo/DWARFContext.h
new file mode 100644
index 000000000000..37b272993f37
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFContext.h
@@ -0,0 +1,197 @@
+//===-- DWARFContext.h ------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===/
+
+#ifndef LLVM_DEBUGINFO_DWARFCONTEXT_H
+#define LLVM_DEBUGINFO_DWARFCONTEXT_H
+
+#include "DWARFCompileUnit.h"
+#include "DWARFDebugAranges.h"
+#include "DWARFDebugFrame.h"
+#include "DWARFDebugLine.h"
+#include "DWARFDebugRangeList.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/DIContext.h"
+
+namespace llvm {
+
+/// DWARFContext
+/// This data structure is the top level entity that deals with dwarf debug
+/// information parsing. The actual data is supplied through pure virtual
+/// methods that a concrete implementation provides.
+class DWARFContext : public DIContext {
+  SmallVector<DWARFCompileUnit, 1> CUs;
+  OwningPtr<DWARFDebugAbbrev> Abbrev;
+  OwningPtr<DWARFDebugAranges> Aranges;
+  OwningPtr<DWARFDebugLine> Line;
+  OwningPtr<DWARFDebugFrame> DebugFrame;
+
+  SmallVector<DWARFCompileUnit, 1> DWOCUs;
+  OwningPtr<DWARFDebugAbbrev> AbbrevDWO;
+
+  DWARFContext(DWARFContext &) LLVM_DELETED_FUNCTION;
+  DWARFContext &operator=(DWARFContext &) LLVM_DELETED_FUNCTION;
+
+  /// Read compile units from the debug_info section and store them in CUs.
+  void parseCompileUnits();
+
+  /// Read compile units from the debug_info.dwo section and store them in
+  /// DWOCUs.
+  void parseDWOCompileUnits();
+
+public:
+  DWARFContext() {}
+  virtual void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All);
+
+  /// Get the number of compile units in this context.
+  unsigned getNumCompileUnits() {
+    if (CUs.empty())
+      parseCompileUnits();
+    return CUs.size();
+  }
+
+  /// Get the number of compile units in the DWO context.
+  unsigned getNumDWOCompileUnits() {
+    if (DWOCUs.empty())
+      parseDWOCompileUnits();
+    return DWOCUs.size();
+  }
+
+  /// Get the compile unit at the specified index for this compile unit.
+  DWARFCompileUnit *getCompileUnitAtIndex(unsigned index) {
+    if (CUs.empty())
+      parseCompileUnits();
+    return &CUs[index];
+  }
+
+  /// Get the compile unit at the specified index for the DWO compile units.
+  DWARFCompileUnit *getDWOCompileUnitAtIndex(unsigned index) {
+    if (DWOCUs.empty())
+      parseDWOCompileUnits();
+    return &DWOCUs[index];
+  }
+
+  /// Get a pointer to the parsed DebugAbbrev object.
+  const DWARFDebugAbbrev *getDebugAbbrev();
+
+  /// Get a pointer to the parsed dwo abbreviations object.
+  const DWARFDebugAbbrev *getDebugAbbrevDWO();
+
+  /// Get a pointer to the parsed DebugAranges object.
+  const DWARFDebugAranges *getDebugAranges();
+
+  /// Get a pointer to the parsed frame information object.
+  const DWARFDebugFrame *getDebugFrame();
+
+  /// Get a pointer to a parsed line table corresponding to a compile unit.
+  const DWARFDebugLine::LineTable *
+  getLineTableForCompileUnit(DWARFCompileUnit *cu);
+
+  virtual DILineInfo getLineInfoForAddress(uint64_t Address,
+      DILineInfoSpecifier Specifier = DILineInfoSpecifier());
+  virtual DILineInfoTable getLineInfoForAddressRange(uint64_t Address,
+      uint64_t Size, DILineInfoSpecifier Specifier = DILineInfoSpecifier());
+  virtual DIInliningInfo getInliningInfoForAddress(uint64_t Address,
+      DILineInfoSpecifier Specifier = DILineInfoSpecifier());
+
+  virtual bool isLittleEndian() const = 0;
+  virtual uint8_t getAddressSize() const = 0;
+  virtual const RelocAddrMap &infoRelocMap() const = 0;
+  virtual const RelocAddrMap &lineRelocMap() const = 0;
+  virtual StringRef getInfoSection() = 0;
+  virtual StringRef getAbbrevSection() = 0;
+  virtual StringRef getARangeSection() = 0;
+  virtual StringRef getDebugFrameSection() = 0;
+  virtual StringRef getLineSection() = 0;
+  virtual StringRef getStringSection() = 0;
+  virtual StringRef getRangeSection() = 0;
+  virtual StringRef getPubNamesSection() = 0;
+
+  // Sections for DWARF5 split dwarf proposal.
+  virtual StringRef getInfoDWOSection() = 0;
+  virtual StringRef getAbbrevDWOSection() = 0;
+  virtual StringRef getStringDWOSection() = 0;
+  virtual StringRef getStringOffsetDWOSection() = 0;
+  virtual StringRef getRangeDWOSection() = 0;
+  virtual StringRef getAddrSection() = 0;
+  virtual const RelocAddrMap &infoDWORelocMap() const = 0;
+
+  static bool isSupportedVersion(unsigned version) {
+    return version == 2 || version == 3;
+  }
+private:
+  /// Return the compile unit that includes an offset (relative to .debug_info).
+  DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset);
+
+  /// Return the compile unit which contains instruction with provided
+  /// address.
+  DWARFCompileUnit *getCompileUnitForAddress(uint64_t Address);
+};
+
+/// DWARFContextInMemory is the simplest possible implementation of a
+/// DWARFContext. It assumes all content is available in memory and stores
+/// pointers to it.
+class DWARFContextInMemory : public DWARFContext {
+  virtual void anchor();
+  bool IsLittleEndian;
+  uint8_t AddressSize;
+  RelocAddrMap InfoRelocMap;
+  RelocAddrMap LineRelocMap;
+  StringRef InfoSection;
+  StringRef AbbrevSection;
+  StringRef ARangeSection;
+  StringRef DebugFrameSection;
+  StringRef LineSection;
+  StringRef StringSection;
+  StringRef RangeSection;
+  StringRef PubNamesSection;
+
+  // Sections for DWARF5 split dwarf proposal.
+  RelocAddrMap InfoDWORelocMap;
+  StringRef InfoDWOSection;
+  StringRef AbbrevDWOSection;
+  StringRef StringDWOSection;
+  StringRef StringOffsetDWOSection;
+  StringRef RangeDWOSection;
+  StringRef AddrSection;
+
+public:
+  DWARFContextInMemory(object::ObjectFile *);
+  virtual bool isLittleEndian() const { return IsLittleEndian; }
+  virtual uint8_t getAddressSize() const { return AddressSize; }
+  virtual const RelocAddrMap &infoRelocMap() const { return InfoRelocMap; }
+  virtual const RelocAddrMap &lineRelocMap() const { return LineRelocMap; }
+  virtual StringRef getInfoSection() { return InfoSection; }
+  virtual StringRef getAbbrevSection() { return AbbrevSection; }
+  virtual StringRef getARangeSection() { return ARangeSection; }
+  virtual StringRef getDebugFrameSection() { return DebugFrameSection; }
+  virtual StringRef getLineSection() { return LineSection; }
+  virtual StringRef getStringSection() { return StringSection; }
+  virtual StringRef getRangeSection() { return RangeSection; }
+  virtual StringRef getPubNamesSection() { return PubNamesSection; }
+
+  // Sections for DWARF5 split dwarf proposal.
+  virtual StringRef getInfoDWOSection() { return InfoDWOSection; }
+  virtual StringRef getAbbrevDWOSection() { return AbbrevDWOSection; }
+  virtual StringRef getStringDWOSection() { return StringDWOSection; }
+  virtual StringRef getStringOffsetDWOSection() {
+    return StringOffsetDWOSection;
+  }
+  virtual StringRef getRangeDWOSection() { return RangeDWOSection; }
+  virtual StringRef getAddrSection() {
+    return AddrSection;
+  }
+  virtual const RelocAddrMap &infoDWORelocMap() const {
+    return InfoDWORelocMap;
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugAbbrev.cpp b/contrib/llvm/lib/DebugInfo/DWARFDebugAbbrev.cpp
new file mode 100644
index 000000000000..6e6c37e30945
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugAbbrev.cpp
@@ -0,0 +1,106 @@
+//===-- DWARFDebugAbbrev.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugAbbrev.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+bool DWARFAbbreviationDeclarationSet::extract(DataExtractor data,
+                                              uint32_t* offset_ptr) {
+  const uint32_t beginOffset = *offset_ptr;
+  Offset = beginOffset;
+  clear();
+  DWARFAbbreviationDeclaration abbrevDeclaration;
+  uint32_t prevAbbrAode = 0;
+  while (abbrevDeclaration.extract(data, offset_ptr)) {
+    Decls.push_back(abbrevDeclaration);
+    if (IdxOffset == 0) {
+      IdxOffset = abbrevDeclaration.getCode();
+    } else {
+      if (prevAbbrAode + 1 != abbrevDeclaration.getCode())
+        IdxOffset = UINT32_MAX;// Out of order indexes, we can't do O(1) lookups
+    }
+    prevAbbrAode = abbrevDeclaration.getCode();
+  }
+  return beginOffset != *offset_ptr;
+}
+
+void DWARFAbbreviationDeclarationSet::dump(raw_ostream &OS) const {
+  for (unsigned i = 0, e = Decls.size(); i != e; ++i)
+    Decls[i].dump(OS);
+}
+
+const DWARFAbbreviationDeclaration*
+DWARFAbbreviationDeclarationSet::getAbbreviationDeclaration(uint32_t abbrCode)
+  const {
+  if (IdxOffset == UINT32_MAX) {
+    DWARFAbbreviationDeclarationCollConstIter pos;
+    DWARFAbbreviationDeclarationCollConstIter end = Decls.end();
+    for (pos = Decls.begin(); pos != end; ++pos) {
+      if (pos->getCode() == abbrCode)
+        return &(*pos);
+    }
+  } else {
+    uint32_t idx = abbrCode - IdxOffset;
+    if (idx < Decls.size())
+      return &Decls[idx];
+  }
+  return NULL;
+}
+
+DWARFDebugAbbrev::DWARFDebugAbbrev() :
+  AbbrevCollMap(),
+  PrevAbbrOffsetPos(AbbrevCollMap.end()) {}
+
+
+void DWARFDebugAbbrev::parse(DataExtractor data) {
+  uint32_t offset = 0;
+
+  while (data.isValidOffset(offset)) {
+    uint32_t initial_cu_offset = offset;
+    DWARFAbbreviationDeclarationSet abbrevDeclSet;
+
+    if (abbrevDeclSet.extract(data, &offset))
+      AbbrevCollMap[initial_cu_offset] = abbrevDeclSet;
+    else
+      break;
+  }
+  PrevAbbrOffsetPos = AbbrevCollMap.end();
+}
+
+void DWARFDebugAbbrev::dump(raw_ostream &OS) const {
+  if (AbbrevCollMap.empty()) {
+    OS << "< EMPTY >\n";
+    return;
+  }
+
+  DWARFAbbreviationDeclarationCollMapConstIter pos;
+  for (pos = AbbrevCollMap.begin(); pos != AbbrevCollMap.end(); ++pos) {
+    OS << format("Abbrev table for offset: 0x%8.8" PRIx64 "\n", pos->first);
+    pos->second.dump(OS);
+  }
+}
+
+const DWARFAbbreviationDeclarationSet*
+DWARFDebugAbbrev::getAbbreviationDeclarationSet(uint64_t cu_abbr_offset) const {
+  DWARFAbbreviationDeclarationCollMapConstIter end = AbbrevCollMap.end();
+  DWARFAbbreviationDeclarationCollMapConstIter pos;
+  if (PrevAbbrOffsetPos != end &&
+      PrevAbbrOffsetPos->first == cu_abbr_offset) {
+    return &(PrevAbbrOffsetPos->second);
+  } else {
+    pos = AbbrevCollMap.find(cu_abbr_offset);
+    PrevAbbrOffsetPos = pos;
+  }
+
+  if (pos != AbbrevCollMap.end())
+    return &(pos->second);
+  return NULL;
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugAbbrev.h b/contrib/llvm/lib/DebugInfo/DWARFDebugAbbrev.h
new file mode 100644
index 000000000000..c7c0436866c4
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugAbbrev.h
@@ -0,0 +1,73 @@
+//===-- DWARFDebugAbbrev.h --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGABBREV_H
+#define LLVM_DEBUGINFO_DWARFDEBUGABBREV_H
+
+#include "DWARFAbbreviationDeclaration.h"
+#include <list>
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+typedef std::vector<DWARFAbbreviationDeclaration>
+  DWARFAbbreviationDeclarationColl;
+typedef DWARFAbbreviationDeclarationColl::iterator
+  DWARFAbbreviationDeclarationCollIter;
+typedef DWARFAbbreviationDeclarationColl::const_iterator
+  DWARFAbbreviationDeclarationCollConstIter;
+
+class DWARFAbbreviationDeclarationSet {
+  uint32_t Offset;
+  uint32_t IdxOffset;
+  std::vector<DWARFAbbreviationDeclaration> Decls;
+  public:
+  DWARFAbbreviationDeclarationSet()
+    : Offset(0), IdxOffset(0) {}
+
+  DWARFAbbreviationDeclarationSet(uint32_t offset, uint32_t idxOffset)
+    : Offset(offset), IdxOffset(idxOffset) {}
+
+  void clear() {
+    IdxOffset = 0;
+    Decls.clear();
+  }
+  uint32_t getOffset() const { return Offset; }
+  void dump(raw_ostream &OS) const;
+  bool extract(DataExtractor data, uint32_t* offset_ptr);
+
+  const DWARFAbbreviationDeclaration *
+    getAbbreviationDeclaration(uint32_t abbrCode) const;
+};
+
+class DWARFDebugAbbrev {
+public:
+  typedef std::map<uint64_t, DWARFAbbreviationDeclarationSet>
+    DWARFAbbreviationDeclarationCollMap;
+  typedef DWARFAbbreviationDeclarationCollMap::iterator
+    DWARFAbbreviationDeclarationCollMapIter;
+  typedef DWARFAbbreviationDeclarationCollMap::const_iterator
+    DWARFAbbreviationDeclarationCollMapConstIter;
+
+private:
+  DWARFAbbreviationDeclarationCollMap AbbrevCollMap;
+  mutable DWARFAbbreviationDeclarationCollMapConstIter PrevAbbrOffsetPos;
+
+public:
+  DWARFDebugAbbrev();
+  const DWARFAbbreviationDeclarationSet *
+    getAbbreviationDeclarationSet(uint64_t cu_abbr_offset) const;
+  void dump(raw_ostream &OS) const;
+  void parse(DataExtractor data);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugArangeSet.cpp b/contrib/llvm/lib/DebugInfo/DWARFDebugArangeSet.cpp
new file mode 100644
index 000000000000..7dff9ff49a62
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugArangeSet.cpp
@@ -0,0 +1,151 @@
+//===-- DWARFDebugArangeSet.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugArangeSet.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+using namespace llvm;
+
+void DWARFDebugArangeSet::clear() {
+  Offset = -1U;
+  std::memset(&HeaderData, 0, sizeof(Header));
+  ArangeDescriptors.clear();
+}
+
+void DWARFDebugArangeSet::compact() {
+  if (ArangeDescriptors.empty())
+    return;
+
+  // Iterate through all arange descriptors and combine any ranges that
+  // overlap or have matching boundaries. The ArangeDescriptors are assumed
+  // to be in ascending order.
+  uint32_t i = 0;
+  while (i + 1 < ArangeDescriptors.size()) {
+    if (ArangeDescriptors[i].getEndAddress() >= ArangeDescriptors[i+1].Address){
+      // The current range ends at or exceeds the start of the next address
+      // range. Compute the max end address between the two and use that to
+      // make the new length.
+      const uint64_t max_end_addr =
+        std::max(ArangeDescriptors[i].getEndAddress(),
+                 ArangeDescriptors[i+1].getEndAddress());
+      ArangeDescriptors[i].Length = max_end_addr - ArangeDescriptors[i].Address;
+      // Now remove the next entry as it was just combined with the previous one
+      ArangeDescriptors.erase(ArangeDescriptors.begin()+i+1);
+    } else {
+      // Discontiguous address range, just proceed to the next one.
+      ++i;
+    }
+  }
+}
+
+bool
+DWARFDebugArangeSet::extract(DataExtractor data, uint32_t *offset_ptr) {
+  if (data.isValidOffset(*offset_ptr)) {
+    ArangeDescriptors.clear();
+    Offset = *offset_ptr;
+
+    // 7.20 Address Range Table
+    //
+    // Each set of entries in the table of address ranges contained in
+    // the .debug_aranges section begins with a header consisting of: a
+    // 4-byte length containing the length of the set of entries for this
+    // compilation unit, not including the length field itself; a 2-byte
+    // version identifier containing the value 2 for DWARF Version 2; a
+    // 4-byte offset into the.debug_infosection; a 1-byte unsigned integer
+    // containing the size in bytes of an address (or the offset portion of
+    // an address for segmented addressing) on the target system; and a
+    // 1-byte unsigned integer containing the size in bytes of a segment
+    // descriptor on the target system. This header is followed by a series
+    // of tuples. Each tuple consists of an address and a length, each in
+    // the size appropriate for an address on the target architecture.
+    HeaderData.Length = data.getU32(offset_ptr);
+    HeaderData.Version = data.getU16(offset_ptr);
+    HeaderData.CuOffset = data.getU32(offset_ptr);
+    HeaderData.AddrSize = data.getU8(offset_ptr);
+    HeaderData.SegSize = data.getU8(offset_ptr);
+
+    // Perform basic validation of the header fields.
+    if (!data.isValidOffsetForDataOfSize(Offset, HeaderData.Length) ||
+        (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)) {
+      clear();
+      return false;
+    }
+
+    // The first tuple following the header in each set begins at an offset
+    // that is a multiple of the size of a single tuple (that is, twice the
+    // size of an address). The header is padded, if necessary, to the
+    // appropriate boundary.
+    const uint32_t header_size = *offset_ptr - Offset;
+    const uint32_t tuple_size = HeaderData.AddrSize * 2;
+    uint32_t first_tuple_offset = 0;
+    while (first_tuple_offset < header_size)
+      first_tuple_offset += tuple_size;
+
+    *offset_ptr = Offset + first_tuple_offset;
+
+    Descriptor arangeDescriptor;
+
+    assert(sizeof(arangeDescriptor.Address) == sizeof(arangeDescriptor.Length));
+    assert(sizeof(arangeDescriptor.Address) >= HeaderData.AddrSize);
+
+    while (data.isValidOffset(*offset_ptr)) {
+      arangeDescriptor.Address = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
+      arangeDescriptor.Length = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
+
+      // Each set of tuples is terminated by a 0 for the address and 0
+      // for the length.
+      if (arangeDescriptor.Address || arangeDescriptor.Length)
+        ArangeDescriptors.push_back(arangeDescriptor);
+      else
+        break; // We are done if we get a zero address and length
+    }
+
+    return !ArangeDescriptors.empty();
+  }
+  return false;
+}
+
+void DWARFDebugArangeSet::dump(raw_ostream &OS) const {
+  OS << format("Address Range Header: length = 0x%8.8x, version = 0x%4.4x, ",
+               HeaderData.Length, HeaderData.Version)
+     << format("cu_offset = 0x%8.8x, addr_size = 0x%2.2x, seg_size = 0x%2.2x\n",
+               HeaderData.CuOffset, HeaderData.AddrSize, HeaderData.SegSize);
+
+  const uint32_t hex_width = HeaderData.AddrSize * 2;
+  for (DescriptorConstIter pos = ArangeDescriptors.begin(),
+       end = ArangeDescriptors.end(); pos != end; ++pos)
+    OS << format("[0x%*.*" PRIx64 " -", hex_width, hex_width, pos->Address)
+       << format(" 0x%*.*" PRIx64 ")\n",
+                 hex_width, hex_width, pos->getEndAddress());
+}
+
+
+namespace {
+  class DescriptorContainsAddress {
+    const uint64_t Address;
+  public:
+    DescriptorContainsAddress(uint64_t address) : Address(address) {}
+    bool operator()(const DWARFDebugArangeSet::Descriptor &desc) const {
+      return Address >= desc.Address && Address < (desc.Address + desc.Length);
+    }
+  };
+}
+
+uint32_t DWARFDebugArangeSet::findAddress(uint64_t address) const {
+  DescriptorConstIter end = ArangeDescriptors.end();
+  DescriptorConstIter pos =
+    std::find_if(ArangeDescriptors.begin(), end, // Range
+                 DescriptorContainsAddress(address)); // Predicate
+  if (pos != end)
+    return HeaderData.CuOffset;
+
+  return -1U;
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugArangeSet.h b/contrib/llvm/lib/DebugInfo/DWARFDebugArangeSet.h
new file mode 100644
index 000000000000..d76867615aa1
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugArangeSet.h
@@ -0,0 +1,75 @@
+//===-- DWARFDebugArangeSet.h -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGARANGESET_H
+#define LLVM_DEBUGINFO_DWARFDEBUGARANGESET_H
+
+#include "llvm/Support/DataExtractor.h"
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugArangeSet {
+public:
+  struct Header {
+    // The total length of the entries for that set, not including the length
+    // field itself.
+    uint32_t Length;
+    // The offset from the beginning of the .debug_info section of the
+    // compilation unit entry referenced by the table.
+    uint32_t CuOffset;
+    // The DWARF version number.
+    uint16_t Version;
+    // The size in bytes of an address on the target architecture. For segmented
+    // addressing, this is the size of the offset portion of the address.
+    uint8_t AddrSize;
+    // The size in bytes of a segment descriptor on the target architecture.
+    // If the target system uses a flat address space, this value is 0.
+    uint8_t SegSize;
+  };
+
+  struct Descriptor {
+    uint64_t Address;
+    uint64_t Length;
+    uint64_t getEndAddress() const { return Address + Length; }
+  };
+
+private:
+  typedef std::vector<Descriptor> DescriptorColl;
+  typedef DescriptorColl::iterator DescriptorIter;
+  typedef DescriptorColl::const_iterator DescriptorConstIter;
+
+  uint32_t Offset;
+  Header HeaderData;
+  DescriptorColl ArangeDescriptors;
+
+public:
+  DWARFDebugArangeSet() { clear(); }
+  void clear();
+  void compact();
+  bool extract(DataExtractor data, uint32_t *offset_ptr);
+  void dump(raw_ostream &OS) const;
+
+  uint32_t getCompileUnitDIEOffset() const { return HeaderData.CuOffset; }
+  uint32_t getOffsetOfNextEntry() const { return Offset + HeaderData.Length + 4; }
+  uint32_t findAddress(uint64_t address) const;
+  uint32_t getNumDescriptors() const { return ArangeDescriptors.size(); }
+  const struct Header &getHeader() const { return HeaderData; }
+  const Descriptor *getDescriptor(uint32_t i) const {
+    if (i < ArangeDescriptors.size())
+      return &ArangeDescriptors[i];
+    return NULL;
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugAranges.cpp b/contrib/llvm/lib/DebugInfo/DWARFDebugAranges.cpp
new file mode 100644
index 000000000000..f79862d606f5
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugAranges.cpp
@@ -0,0 +1,230 @@
+//===-- DWARFDebugAranges.cpp -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugAranges.h"
+#include "DWARFCompileUnit.h"
+#include "DWARFContext.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+using namespace llvm;
+
+// Compare function DWARFDebugAranges::Range structures
+static bool RangeLessThan(const DWARFDebugAranges::Range &range1,
+                          const DWARFDebugAranges::Range &range2) {
+  return range1.LoPC < range2.LoPC;
+}
+
+namespace {
+  class CountArangeDescriptors {
+  public:
+    CountArangeDescriptors(uint32_t &count_ref) : Count(count_ref) {}
+    void operator()(const DWARFDebugArangeSet &Set) {
+      Count += Set.getNumDescriptors();
+    }
+    uint32_t &Count;
+  };
+
+  class AddArangeDescriptors {
+  public:
+    AddArangeDescriptors(DWARFDebugAranges::RangeColl &Ranges,
+                         DWARFDebugAranges::ParsedCUOffsetColl &CUOffsets)
+      : RangeCollection(Ranges),
+        CUOffsetCollection(CUOffsets) {}
+    void operator()(const DWARFDebugArangeSet &Set) {
+      DWARFDebugAranges::Range Range;
+      Range.Offset = Set.getCompileUnitDIEOffset();
+      CUOffsetCollection.insert(Range.Offset);
+
+      for (uint32_t i = 0, n = Set.getNumDescriptors(); i < n; ++i) {
+        const DWARFDebugArangeSet::Descriptor *ArangeDescPtr =
+            Set.getDescriptor(i);
+        Range.LoPC = ArangeDescPtr->Address;
+        Range.Length = ArangeDescPtr->Length;
+
+        // Insert each item in increasing address order so binary searching
+        // can later be done!
+        DWARFDebugAranges::RangeColl::iterator InsertPos =
+          std::lower_bound(RangeCollection.begin(), RangeCollection.end(),
+                           Range, RangeLessThan);
+        RangeCollection.insert(InsertPos, Range);
+      }
+
+    }
+    DWARFDebugAranges::RangeColl &RangeCollection;
+    DWARFDebugAranges::ParsedCUOffsetColl &CUOffsetCollection;
+  };
+}
+
+bool DWARFDebugAranges::extract(DataExtractor debug_aranges_data) {
+  if (debug_aranges_data.isValidOffset(0)) {
+    uint32_t offset = 0;
+
+    typedef std::vector<DWARFDebugArangeSet> SetCollection;
+    SetCollection sets;
+
+    DWARFDebugArangeSet set;
+    Range range;
+    while (set.extract(debug_aranges_data, &offset))
+      sets.push_back(set);
+
+    uint32_t count = 0;
+
+    std::for_each(sets.begin(), sets.end(), CountArangeDescriptors(count));
+
+    if (count > 0) {
+      Aranges.reserve(count);
+      AddArangeDescriptors range_adder(Aranges, ParsedCUOffsets);
+      std::for_each(sets.begin(), sets.end(), range_adder);
+    }
+  }
+  return false;
+}
+
+bool DWARFDebugAranges::generate(DWARFContext *ctx) {
+  if (ctx) {
+    const uint32_t num_compile_units = ctx->getNumCompileUnits();
+    for (uint32_t cu_idx = 0; cu_idx < num_compile_units; ++cu_idx) {
+      if (DWARFCompileUnit *cu = ctx->getCompileUnitAtIndex(cu_idx)) {
+        uint32_t CUOffset = cu->getOffset();
+        if (ParsedCUOffsets.insert(CUOffset).second)
+          cu->buildAddressRangeTable(this, true);
+      }
+    }
+  }
+  sort(true, /* overlap size */ 0);
+  return !isEmpty();
+}
+
+void DWARFDebugAranges::dump(raw_ostream &OS) const {
+  const uint32_t num_ranges = getNumRanges();
+  for (uint32_t i = 0; i < num_ranges; ++i) {
+    const Range &range = Aranges[i];
+    OS << format("0x%8.8x: [0x%8.8" PRIx64 " - 0x%8.8" PRIx64 ")\n",
+                 range.Offset, (uint64_t)range.LoPC, (uint64_t)range.HiPC());
+  }
+}
+
+void DWARFDebugAranges::Range::dump(raw_ostream &OS) const {
+  OS << format("{0x%8.8x}: [0x%8.8" PRIx64 " - 0x%8.8" PRIx64 ")\n",
+               Offset, LoPC, HiPC());
+}
+
+void DWARFDebugAranges::appendRange(uint32_t offset, uint64_t low_pc,
+                                    uint64_t high_pc) {
+  if (!Aranges.empty()) {
+    if (Aranges.back().Offset == offset && Aranges.back().HiPC() == low_pc) {
+      Aranges.back().setHiPC(high_pc);
+      return;
+    }
+  }
+  Aranges.push_back(Range(low_pc, high_pc, offset));
+}
+
+void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
+  const size_t orig_arange_size = Aranges.size();
+  // Size of one? If so, no sorting is needed
+  if (orig_arange_size <= 1)
+    return;
+  // Sort our address range entries
+  std::stable_sort(Aranges.begin(), Aranges.end(), RangeLessThan);
+
+  if (!minimize)
+    return;
+
+  // Most address ranges are contiguous from function to function
+  // so our new ranges will likely be smaller. We calculate the size
+  // of the new ranges since although std::vector objects can be resized,
+  // the will never reduce their allocated block size and free any excesss
+  // memory, so we might as well start a brand new collection so it is as
+  // small as possible.
+
+  // First calculate the size of the new minimal arange vector
+  // so we don't have to do a bunch of re-allocations as we
+  // copy the new minimal stuff over to the new collection.
+  size_t minimal_size = 1;
+  for (size_t i = 1; i < orig_arange_size; ++i) {
+    if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i], n))
+      ++minimal_size;
+  }
+
+  // If the sizes are the same, then no consecutive aranges can be
+  // combined, we are done.
+  if (minimal_size == orig_arange_size)
+    return;
+
+  // Else, make a new RangeColl that _only_ contains what we need.
+  RangeColl minimal_aranges;
+  minimal_aranges.resize(minimal_size);
+  uint32_t j = 0;
+  minimal_aranges[j] = Aranges[0];
+  for (size_t i = 1; i < orig_arange_size; ++i) {
+    if(Range::SortedOverlapCheck (minimal_aranges[j], Aranges[i], n)) {
+      minimal_aranges[j].setHiPC (Aranges[i].HiPC());
+    } else {
+      // Only increment j if we aren't merging
+      minimal_aranges[++j] = Aranges[i];
+    }
+  }
+  assert (j+1 == minimal_size);
+
+  // Now swap our new minimal aranges into place. The local
+  // minimal_aranges will then contian the old big collection
+  // which will get freed.
+  minimal_aranges.swap(Aranges);
+}
+
+uint32_t DWARFDebugAranges::findAddress(uint64_t address) const {
+  if (!Aranges.empty()) {
+    Range range(address);
+    RangeCollIterator begin = Aranges.begin();
+    RangeCollIterator end = Aranges.end();
+    RangeCollIterator pos = std::lower_bound(begin, end, range, RangeLessThan);
+
+    if (pos != end && pos->LoPC <= address && address < pos->HiPC()) {
+      return pos->Offset;
+    } else if (pos != begin) {
+      --pos;
+      if (pos->LoPC <= address && address < pos->HiPC())
+        return (*pos).Offset;
+    }
+  }
+  return -1U;
+}
+
+bool
+DWARFDebugAranges::allRangesAreContiguous(uint64_t &LoPC, uint64_t &HiPC) const{
+  if (Aranges.empty())
+    return false;
+
+  uint64_t next_addr = 0;
+  RangeCollIterator begin = Aranges.begin();
+  for (RangeCollIterator pos = begin, end = Aranges.end(); pos != end;
+       ++pos) {
+    if (pos != begin && pos->LoPC != next_addr)
+      return false;
+    next_addr = pos->HiPC();
+  }
+  // We checked for empty at the start of function so front() will be valid.
+  LoPC = Aranges.front().LoPC;
+  // We checked for empty at the start of function so back() will be valid.
+  HiPC = Aranges.back().HiPC();
+  return true;
+}
+
+bool DWARFDebugAranges::getMaxRange(uint64_t &LoPC, uint64_t &HiPC) const {
+  if (Aranges.empty())
+    return false;
+  // We checked for empty at the start of function so front() will be valid.
+  LoPC = Aranges.front().LoPC;
+  // We checked for empty at the start of function so back() will be valid.
+  HiPC = Aranges.back().HiPC();
+  return true;
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugAranges.h b/contrib/llvm/lib/DebugInfo/DWARFDebugAranges.h
new file mode 100644
index 000000000000..1509ffad41f1
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugAranges.h
@@ -0,0 +1,104 @@
+//===-- DWARFDebugAranges.h -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGARANGES_H
+#define LLVM_DEBUGINFO_DWARFDEBUGARANGES_H
+
+#include "DWARFDebugArangeSet.h"
+#include "llvm/ADT/DenseSet.h"
+#include <list>
+
+namespace llvm {
+
+class DWARFContext;
+
+class DWARFDebugAranges {
+public:
+  struct Range {
+    explicit Range(uint64_t lo = -1ULL, uint64_t hi = -1ULL,
+                   uint32_t off = -1U)
+      : LoPC(lo), Length(hi-lo), Offset(off) {}
+
+    void clear() {
+      LoPC = -1ULL;
+      Length = 0;
+      Offset = -1U;
+    }
+
+    void setHiPC(uint64_t HiPC) {
+      if (HiPC == -1ULL || HiPC <= LoPC)
+        Length = 0;
+      else
+        Length = HiPC - LoPC;
+    }
+    uint64_t HiPC() const {
+      if (Length)
+        return LoPC + Length;
+      return -1ULL;
+    }
+    bool isValidRange() const { return Length > 0; }
+
+    static bool SortedOverlapCheck(const Range &curr_range,
+                                   const Range &next_range, uint32_t n) {
+      if (curr_range.Offset != next_range.Offset)
+        return false;
+      return curr_range.HiPC() + n >= next_range.LoPC;
+    }
+
+    bool contains(const Range &range) const {
+      return LoPC <= range.LoPC && range.HiPC() <= HiPC();
+    }
+
+    void dump(raw_ostream &OS) const;
+    uint64_t LoPC; // Start of address range
+    uint32_t Length; // End of address range (not including this address)
+    uint32_t Offset; // Offset of the compile unit or die
+  };
+
+  void clear() {
+    Aranges.clear();
+    ParsedCUOffsets.clear();
+  }
+  bool allRangesAreContiguous(uint64_t& LoPC, uint64_t& HiPC) const;
+  bool getMaxRange(uint64_t& LoPC, uint64_t& HiPC) const;
+  bool extract(DataExtractor debug_aranges_data);
+  bool generate(DWARFContext *ctx);
+
+  // Use append range multiple times and then call sort
+  void appendRange(uint32_t cu_offset, uint64_t low_pc, uint64_t high_pc);
+  void sort(bool minimize, uint32_t n);
+
+  const Range *rangeAtIndex(uint32_t idx) const {
+    if (idx < Aranges.size())
+      return &Aranges[idx];
+    return NULL;
+  }
+  void dump(raw_ostream &OS) const;
+  uint32_t findAddress(uint64_t address) const;
+  bool isEmpty() const { return Aranges.empty(); }
+  uint32_t getNumRanges() const { return Aranges.size(); }
+
+  uint32_t offsetAtIndex(uint32_t idx) const {
+    if (idx < Aranges.size())
+      return Aranges[idx].Offset;
+    return -1U;
+  }
+
+  typedef std::vector<Range>              RangeColl;
+  typedef RangeColl::const_iterator       RangeCollIterator;
+  typedef DenseSet<uint32_t>              ParsedCUOffsetColl;
+
+private:
+  RangeColl Aranges;
+  ParsedCUOffsetColl ParsedCUOffsets;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugFrame.cpp b/contrib/llvm/lib/DebugInfo/DWARFDebugFrame.cpp
new file mode 100644
index 000000000000..3efe6a1ebd30
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugFrame.cpp
@@ -0,0 +1,391 @@
+//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugFrame.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace dwarf;
+
+
+/// \brief Abstract frame entry defining the common interface concrete
+/// entries implement.
+class llvm::FrameEntry {
+public:
+  enum FrameKind {FK_CIE, FK_FDE};
+  FrameEntry(FrameKind K, DataExtractor D, uint64_t Offset, uint64_t Length)
+    : Kind(K), Data(D), Offset(Offset), Length(Length) {}
+
+  virtual ~FrameEntry() {
+  }
+
+  FrameKind getKind() const { return Kind; }
+  virtual uint64_t getOffset() const { return Offset; }
+
+  /// \brief Parse and store a sequence of CFI instructions from our data
+  /// stream, starting at *Offset and ending at EndOffset. If everything
+  /// goes well, *Offset should be equal to EndOffset when this method
+  /// returns. Otherwise, an error occurred.
+  virtual void parseInstructions(uint32_t *Offset, uint32_t EndOffset);
+
+  /// \brief Dump the entry header to the given output stream.
+  virtual void dumpHeader(raw_ostream &OS) const = 0;
+
+  /// \brief Dump the entry's instructions to the given output stream.
+  virtual void dumpInstructions(raw_ostream &OS) const;
+
+protected:
+  const FrameKind Kind;
+
+  /// \brief The data stream holding the section from which the entry was
+  /// parsed.
+  DataExtractor Data;
+
+  /// \brief Offset of this entry in the section.
+  uint64_t Offset;
+
+  /// \brief Entry length as specified in DWARF.
+  uint64_t Length;
+
+  /// An entry may contain CFI instructions. An instruction consists of an
+  /// opcode and an optional sequence of operands.
+  typedef std::vector<uint64_t> Operands;
+  struct Instruction {
+    Instruction(uint8_t Opcode)
+      : Opcode(Opcode)
+    {}
+
+    uint8_t Opcode;
+    Operands Ops;
+  };
+
+  std::vector<Instruction> Instructions;
+
+  /// Convenience methods to add a new instruction with the given opcode and
+  /// operands to the Instructions vector.
+  void addInstruction(uint8_t Opcode) {
+    Instructions.push_back(Instruction(Opcode));
+  }
+
+  void addInstruction(uint8_t Opcode, uint64_t Operand1) {
+    Instructions.push_back(Instruction(Opcode));
+    Instructions.back().Ops.push_back(Operand1);
+  }
+
+  void addInstruction(uint8_t Opcode, uint64_t Operand1, uint64_t Operand2) {
+    Instructions.push_back(Instruction(Opcode));
+    Instructions.back().Ops.push_back(Operand1);
+    Instructions.back().Ops.push_back(Operand2);
+  }
+};
+
+
+// See DWARF standard v3, section 7.23
+const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0;
+const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f;
+
+
+void FrameEntry::parseInstructions(uint32_t *Offset, uint32_t EndOffset) {
+  while (*Offset < EndOffset) {
+    uint8_t Opcode = Data.getU8(Offset);
+    // Some instructions have a primary opcode encoded in the top bits.
+    uint8_t Primary = Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK;
+
+    if (Primary) {
+      // If it's a primary opcode, the first operand is encoded in the bottom
+      // bits of the opcode itself.
+      uint64_t Op1 = Opcode & DWARF_CFI_PRIMARY_OPERAND_MASK;
+      switch (Primary) {
+        default: llvm_unreachable("Impossible primary CFI opcode");
+        case DW_CFA_advance_loc:
+        case DW_CFA_restore:
+          addInstruction(Primary, Op1);
+          break;
+        case DW_CFA_offset:
+          addInstruction(Primary, Op1, Data.getULEB128(Offset));
+          break;
+      }
+    } else {
+      // Extended opcode - its value is Opcode itself.
+      switch (Opcode) {
+        default: llvm_unreachable("Invalid extended CFI opcode");
+        case DW_CFA_nop:
+        case DW_CFA_remember_state:
+        case DW_CFA_restore_state:
+          // No operands
+          addInstruction(Opcode);
+          break;
+        case DW_CFA_set_loc:
+          // Operands: Address
+          addInstruction(Opcode, Data.getAddress(Offset));
+          break;
+        case DW_CFA_advance_loc1:
+          // Operands: 1-byte delta
+          addInstruction(Opcode, Data.getU8(Offset));
+          break;
+        case DW_CFA_advance_loc2:
+          // Operands: 2-byte delta
+          addInstruction(Opcode, Data.getU16(Offset));
+          break;
+        case DW_CFA_advance_loc4:
+          // Operands: 4-byte delta
+          addInstruction(Opcode, Data.getU32(Offset));
+          break;
+        case DW_CFA_restore_extended:
+        case DW_CFA_undefined:
+        case DW_CFA_same_value:
+        case DW_CFA_def_cfa_register:
+        case DW_CFA_def_cfa_offset:
+          // Operands: ULEB128
+          addInstruction(Opcode, Data.getULEB128(Offset));
+          break;
+        case DW_CFA_def_cfa_offset_sf:
+          // Operands: SLEB128
+          addInstruction(Opcode, Data.getSLEB128(Offset));
+          break;
+        case DW_CFA_offset_extended:
+        case DW_CFA_register:
+        case DW_CFA_def_cfa:
+        case DW_CFA_val_offset:
+          // Operands: ULEB128, ULEB128
+          addInstruction(Opcode, Data.getULEB128(Offset),
+                                 Data.getULEB128(Offset));
+          break;
+        case DW_CFA_offset_extended_sf:
+        case DW_CFA_def_cfa_sf:
+        case DW_CFA_val_offset_sf:
+          // Operands: ULEB128, SLEB128
+          addInstruction(Opcode, Data.getULEB128(Offset),
+                                 Data.getSLEB128(Offset));
+          break;
+        case DW_CFA_def_cfa_expression:
+        case DW_CFA_expression:
+        case DW_CFA_val_expression:
+          // TODO: implement this
+          report_fatal_error("Values with expressions not implemented yet!");
+      }
+    }
+  }
+}
+
+
+void FrameEntry::dumpInstructions(raw_ostream &OS) const {
+  // TODO: at the moment only instruction names are dumped. Expand this to
+  // dump operands as well.
+  for (std::vector<Instruction>::const_iterator I = Instructions.begin(),
+                                                E = Instructions.end();
+       I != E; ++I) {
+    uint8_t Opcode = I->Opcode;
+    if (Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK)
+      Opcode &= DWARF_CFI_PRIMARY_OPCODE_MASK;
+    OS << "  " << CallFrameString(Opcode) << ":\n";
+  }
+}
+
+
+namespace {
+/// \brief DWARF Common Information Entry (CIE)
+class CIE : public FrameEntry {
+public:
+  // CIEs (and FDEs) are simply container classes, so the only sensible way to
+  // create them is by providing the full parsed contents in the constructor.
+  CIE(DataExtractor D, uint64_t Offset, uint64_t Length, uint8_t Version,
+      SmallString<8> Augmentation, uint64_t CodeAlignmentFactor,
+      int64_t DataAlignmentFactor, uint64_t ReturnAddressRegister)
+   : FrameEntry(FK_CIE, D, Offset, Length), Version(Version),
+     Augmentation(Augmentation), CodeAlignmentFactor(CodeAlignmentFactor),
+     DataAlignmentFactor(DataAlignmentFactor),
+     ReturnAddressRegister(ReturnAddressRegister) {}
+
+  ~CIE() {
+  }
+
+  void dumpHeader(raw_ostream &OS) const {
+    OS << format("%08x %08x %08x CIE",
+                 (uint32_t)Offset, (uint32_t)Length, DW_CIE_ID)
+       << "\n";
+    OS << format("  Version:               %d\n", Version);
+    OS << "  Augmentation:          \"" << Augmentation << "\"\n";
+    OS << format("  Code alignment factor: %u\n",
+                 (uint32_t)CodeAlignmentFactor);
+    OS << format("  Data alignment factor: %d\n",
+                 (int32_t)DataAlignmentFactor);
+    OS << format("  Return address column: %d\n",
+                 (int32_t)ReturnAddressRegister);
+    OS << "\n";
+  }
+
+  static bool classof(const FrameEntry *FE) {
+    return FE->getKind() == FK_CIE;
+  } 
+
+private:
+  /// The following fields are defined in section 6.4.1 of the DWARF standard v3
+  uint8_t Version;
+  SmallString<8> Augmentation;
+  uint64_t CodeAlignmentFactor;
+  int64_t DataAlignmentFactor;
+  uint64_t ReturnAddressRegister;
+};
+
+
+/// \brief DWARF Frame Description Entry (FDE)
+class FDE : public FrameEntry {
+public:
+  // Each FDE has a CIE it's "linked to". Our FDE contains is constructed with
+  // an offset to the CIE (provided by parsing the FDE header). The CIE itself
+  // is obtained lazily once it's actually required.
+  FDE(DataExtractor D, uint64_t Offset, uint64_t Length,
+      int64_t LinkedCIEOffset, uint64_t InitialLocation, uint64_t AddressRange)
+   : FrameEntry(FK_FDE, D, Offset, Length), LinkedCIEOffset(LinkedCIEOffset),
+     InitialLocation(InitialLocation), AddressRange(AddressRange),
+     LinkedCIE(NULL) {}
+
+  ~FDE() {
+  }
+
+  void dumpHeader(raw_ostream &OS) const {
+    OS << format("%08x %08x %08x FDE ",
+                 (uint32_t)Offset, (uint32_t)Length, (int32_t)LinkedCIEOffset);
+    OS << format("cie=%08x pc=%08x...%08x\n",
+                 (int32_t)LinkedCIEOffset,
+                 (uint32_t)InitialLocation,
+                 (uint32_t)InitialLocation + (uint32_t)AddressRange);
+    if (LinkedCIE) {
+      OS << format("%p\n", LinkedCIE);
+    }
+  }
+
+  static bool classof(const FrameEntry *FE) {
+    return FE->getKind() == FK_FDE;
+  } 
+private:
+
+  /// The following fields are defined in section 6.4.1 of the DWARF standard v3
+  uint64_t LinkedCIEOffset;
+  uint64_t InitialLocation;
+  uint64_t AddressRange;
+  CIE *LinkedCIE;
+};
+} // end anonymous namespace
+
+
+DWARFDebugFrame::DWARFDebugFrame() {
+}
+
+
+DWARFDebugFrame::~DWARFDebugFrame() {
+  for (EntryVector::iterator I = Entries.begin(), E = Entries.end();
+       I != E; ++I) {
+    delete *I;
+  }
+}
+
+
+static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data,
+                                              uint32_t Offset, int Length) {
+  errs() << "DUMP: ";
+  for (int i = 0; i < Length; ++i) {
+    uint8_t c = Data.getU8(&Offset);
+    errs().write_hex(c); errs() << " ";
+  }
+  errs() << "\n";
+}
+
+
+void DWARFDebugFrame::parse(DataExtractor Data) {
+  uint32_t Offset = 0;
+
+  while (Data.isValidOffset(Offset)) {
+    uint32_t StartOffset = Offset;
+
+    bool IsDWARF64 = false;
+    uint64_t Length = Data.getU32(&Offset);
+    uint64_t Id;
+
+    if (Length == UINT32_MAX) {
+      // DWARF-64 is distinguished by the first 32 bits of the initial length
+      // field being 0xffffffff. Then, the next 64 bits are the actual entry
+      // length.
+      IsDWARF64 = true;
+      Length = Data.getU64(&Offset);
+    }
+
+    // At this point, Offset points to the next field after Length.
+    // Length is the structure size excluding itself. Compute an offset one
+    // past the end of the structure (needed to know how many instructions to
+    // read).
+    // TODO: For honest DWARF64 support, DataExtractor will have to treat
+    //       offset_ptr as uint64_t*
+    uint32_t EndStructureOffset = Offset + static_cast<uint32_t>(Length);
+
+    // The Id field's size depends on the DWARF format
+    Id = Data.getUnsigned(&Offset, IsDWARF64 ? 8 : 4);
+    bool IsCIE = ((IsDWARF64 && Id == DW64_CIE_ID) || Id == DW_CIE_ID);
+
+    FrameEntry *Entry = 0;
+    if (IsCIE) {
+      // Note: this is specifically DWARFv3 CIE header structure. It was
+      // changed in DWARFv4. We currently don't support reading DWARFv4
+      // here because LLVM itself does not emit it (and LLDB doesn't
+      // support it either).
+      uint8_t Version = Data.getU8(&Offset);
+      const char *Augmentation = Data.getCStr(&Offset);
+      uint64_t CodeAlignmentFactor = Data.getULEB128(&Offset);
+      int64_t DataAlignmentFactor = Data.getSLEB128(&Offset);
+      uint64_t ReturnAddressRegister = Data.getULEB128(&Offset);
+
+      Entry = new CIE(Data, StartOffset, Length, Version,
+                      StringRef(Augmentation), CodeAlignmentFactor,
+                      DataAlignmentFactor, ReturnAddressRegister);
+    } else {
+      // FDE
+      uint64_t CIEPointer = Id;
+      uint64_t InitialLocation = Data.getAddress(&Offset);
+      uint64_t AddressRange = Data.getAddress(&Offset);
+
+      Entry = new FDE(Data, StartOffset, Length, CIEPointer,
+                      InitialLocation, AddressRange);
+    }
+
+    assert(Entry && "Expected Entry to be populated with CIE or FDE");
+    Entry->parseInstructions(&Offset, EndStructureOffset);
+
+    if (Offset == EndStructureOffset) {
+      // Entry instrucitons parsed successfully.
+      Entries.push_back(Entry);
+    } else {
+      std::string Str;
+      raw_string_ostream OS(Str);
+      OS << format("Parsing entry instructions at %lx failed",
+                   Entry->getOffset());
+      report_fatal_error(Str);
+    }
+  }
+}
+
+
+void DWARFDebugFrame::dump(raw_ostream &OS) const {
+  OS << "\n";
+  for (EntryVector::const_iterator I = Entries.begin(), E = Entries.end();
+       I != E; ++I) {
+    FrameEntry *Entry = *I;
+    Entry->dumpHeader(OS);
+    Entry->dumpInstructions(OS);
+    OS << "\n";
+  }
+}
+
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugFrame.h b/contrib/llvm/lib/DebugInfo/DWARFDebugFrame.h
new file mode 100644
index 000000000000..48b8d63a5a64
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugFrame.h
@@ -0,0 +1,46 @@
+//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGFRAME_H
+#define LLVM_DEBUGINFO_DWARFDEBUGFRAME_H
+
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+
+namespace llvm {
+
+class FrameEntry;
+
+
+/// \brief A parsed .debug_frame section
+///
+class DWARFDebugFrame {
+public:
+  DWARFDebugFrame();
+  ~DWARFDebugFrame();
+
+  /// \brief Dump the section data into the given stream.
+  void dump(raw_ostream &OS) const;
+
+  /// \brief Parse the section from raw data.
+  /// data is assumed to be pointing to the beginning of the section.
+  void parse(DataExtractor Data);
+
+private:
+  typedef std::vector<FrameEntry *> EntryVector;
+  EntryVector Entries;
+};
+
+
+} // namespace llvm
+
+#endif 
+
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/contrib/llvm/lib/DebugInfo/DWARFDebugInfoEntry.cpp
new file mode 100644
index 000000000000..02b15d69043f
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -0,0 +1,589 @@
+//===-- DWARFDebugInfoEntry.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugInfoEntry.h"
+#include "DWARFCompileUnit.h"
+#include "DWARFContext.h"
+#include "DWARFDebugAbbrev.h"
+#include "DWARFFormValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace dwarf;
+
+void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
+                                      const DWARFCompileUnit *cu,
+                                      unsigned recurseDepth,
+                                      unsigned indent) const {
+  DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+  uint32_t offset = Offset;
+
+  if (debug_info_data.isValidOffset(offset)) {
+    uint32_t abbrCode = debug_info_data.getULEB128(&offset);
+
+    OS << format("\n0x%8.8x: ", Offset);
+    if (abbrCode) {
+      if (AbbrevDecl) {
+        const char *tagString = TagString(getTag());
+        if (tagString)
+          OS.indent(indent) << tagString;
+        else
+          OS.indent(indent) << format("DW_TAG_Unknown_%x", getTag());
+        OS << format(" [%u] %c\n", abbrCode,
+                     AbbrevDecl->hasChildren() ? '*' : ' ');
+
+        // Dump all data in the DIE for the attributes.
+        const uint32_t numAttributes = AbbrevDecl->getNumAttributes();
+        for (uint32_t i = 0; i != numAttributes; ++i) {
+          uint16_t attr = AbbrevDecl->getAttrByIndex(i);
+          uint16_t form = AbbrevDecl->getFormByIndex(i);
+          dumpAttribute(OS, cu, &offset, attr, form, indent);
+        }
+
+        const DWARFDebugInfoEntryMinimal *child = getFirstChild();
+        if (recurseDepth > 0 && child) {
+          while (child) {
+            child->dump(OS, cu, recurseDepth-1, indent+2);
+            child = child->getSibling();
+          }
+        }
+      } else {
+        OS << "Abbreviation code not found in 'debug_abbrev' class for code: "
+           << abbrCode << '\n';
+      }
+    } else {
+      OS.indent(indent) << "NULL\n";
+    }
+  }
+}
+
+void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
+                                               const DWARFCompileUnit *cu,
+                                               uint32_t* offset_ptr,
+                                               uint16_t attr,
+                                               uint16_t form,
+                                               unsigned indent) const {
+  OS << format("0x%8.8x: ", *offset_ptr);
+  OS.indent(indent+2);
+  const char *attrString = AttributeString(attr);
+  if (attrString)
+    OS << attrString;
+  else
+    OS << format("DW_AT_Unknown_%x", attr);
+  const char *formString = FormEncodingString(form);
+  if (formString)
+    OS << " [" << formString << ']';
+  else
+    OS << format(" [DW_FORM_Unknown_%x]", form);
+
+  DWARFFormValue formValue(form);
+
+  if (!formValue.extractValue(cu->getDebugInfoExtractor(), offset_ptr, cu))
+    return;
+
+  OS << "\t(";
+  formValue.dump(OS, cu);
+  OS << ")\n";
+}
+
+bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *cu,
+                                             const uint8_t *fixed_form_sizes,
+                                             uint32_t *offset_ptr) {
+  Offset = *offset_ptr;
+
+  DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+  uint64_t abbrCode = debug_info_data.getULEB128(offset_ptr);
+
+  assert(fixed_form_sizes); // For best performance this should be specified!
+
+  if (abbrCode) {
+    uint32_t offset = *offset_ptr;
+
+    AbbrevDecl = cu->getAbbreviations()->getAbbreviationDeclaration(abbrCode);
+
+    // Skip all data in the .debug_info for the attributes
+    const uint32_t numAttributes = AbbrevDecl->getNumAttributes();
+    uint32_t i;
+    uint16_t form;
+    for (i=0; i<numAttributes; ++i) {
+
+      form = AbbrevDecl->getFormByIndex(i);
+
+      // FIXME: Currently we're checking if this is less than the last
+      // entry in the fixed_form_sizes table, but this should be changed
+      // to use dynamic dispatch.
+      const uint8_t fixed_skip_size = (form < DW_FORM_ref_sig8) ?
+                                       fixed_form_sizes[form] : 0;
+      if (fixed_skip_size)
+        offset += fixed_skip_size;
+      else {
+        bool form_is_indirect = false;
+        do {
+          form_is_indirect = false;
+          uint32_t form_size = 0;
+          switch (form) {
+          // Blocks if inlined data that have a length field and the data bytes
+          // inlined in the .debug_info.
+          case DW_FORM_exprloc:
+          case DW_FORM_block:
+            form_size = debug_info_data.getULEB128(&offset);
+            break;
+          case DW_FORM_block1:
+            form_size = debug_info_data.getU8(&offset);
+            break;
+          case DW_FORM_block2:
+            form_size = debug_info_data.getU16(&offset);
+            break;
+          case DW_FORM_block4:
+            form_size = debug_info_data.getU32(&offset);
+            break;
+
+          // Inlined NULL terminated C-strings
+          case DW_FORM_string:
+            debug_info_data.getCStr(&offset);
+            break;
+
+          // Compile unit address sized values
+          case DW_FORM_addr:
+          case DW_FORM_ref_addr:
+            form_size = cu->getAddressByteSize();
+            break;
+
+          // 0 sized form.
+          case DW_FORM_flag_present:
+            form_size = 0;
+            break;
+
+          // 1 byte values
+          case DW_FORM_data1:
+          case DW_FORM_flag:
+          case DW_FORM_ref1:
+            form_size = 1;
+            break;
+
+          // 2 byte values
+          case DW_FORM_data2:
+          case DW_FORM_ref2:
+            form_size = 2;
+            break;
+
+          // 4 byte values
+          case DW_FORM_strp:
+          case DW_FORM_data4:
+          case DW_FORM_ref4:
+            form_size = 4;
+            break;
+
+          // 8 byte values
+          case DW_FORM_data8:
+          case DW_FORM_ref8:
+          case DW_FORM_ref_sig8:
+            form_size = 8;
+            break;
+
+          // signed or unsigned LEB 128 values
+          case DW_FORM_sdata:
+          case DW_FORM_udata:
+          case DW_FORM_ref_udata:
+          case DW_FORM_GNU_str_index:
+          case DW_FORM_GNU_addr_index:
+            debug_info_data.getULEB128(&offset);
+            break;
+
+          case DW_FORM_indirect:
+            form_is_indirect = true;
+            form = debug_info_data.getULEB128(&offset);
+            break;
+
+            // FIXME: 64-bit for DWARF64
+          case DW_FORM_sec_offset:
+            debug_info_data.getU32(offset_ptr);
+            break;
+
+          default:
+            *offset_ptr = Offset;
+            return false;
+          }
+          offset += form_size;
+        } while (form_is_indirect);
+      }
+    }
+    *offset_ptr = offset;
+    return true;
+  } else {
+    AbbrevDecl = NULL;
+    return true; // NULL debug tag entry
+  }
+}
+
+bool
+DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *cu,
+                                    uint32_t *offset_ptr) {
+  DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+  const uint32_t cu_end_offset = cu->getNextCompileUnitOffset();
+  const uint8_t cu_addr_size = cu->getAddressByteSize();
+  uint32_t offset = *offset_ptr;
+  if ((offset < cu_end_offset) && debug_info_data.isValidOffset(offset)) {
+    Offset = offset;
+
+    uint64_t abbrCode = debug_info_data.getULEB128(&offset);
+
+    if (abbrCode) {
+      AbbrevDecl = cu->getAbbreviations()->getAbbreviationDeclaration(abbrCode);
+
+      if (AbbrevDecl) {
+        uint16_t tag = AbbrevDecl->getTag();
+
+        bool isCompileUnitTag = tag == DW_TAG_compile_unit;
+        if(cu && isCompileUnitTag)
+          const_cast<DWARFCompileUnit*>(cu)->setBaseAddress(0);
+
+        // Skip all data in the .debug_info for the attributes
+        const uint32_t numAttributes = AbbrevDecl->getNumAttributes();
+        for (uint32_t i = 0; i != numAttributes; ++i) {
+          uint16_t attr = AbbrevDecl->getAttrByIndex(i);
+          uint16_t form = AbbrevDecl->getFormByIndex(i);
+
+          if (isCompileUnitTag &&
+              ((attr == DW_AT_entry_pc) || (attr == DW_AT_low_pc))) {
+            DWARFFormValue form_value(form);
+            if (form_value.extractValue(debug_info_data, &offset, cu)) {
+              if (attr == DW_AT_low_pc || attr == DW_AT_entry_pc)
+                const_cast<DWARFCompileUnit*>(cu)
+                  ->setBaseAddress(form_value.getUnsigned());
+            }
+          } else {
+            bool form_is_indirect = false;
+            do {
+              form_is_indirect = false;
+              register uint32_t form_size = 0;
+              switch (form) {
+              // Blocks if inlined data that have a length field and the data
+              // bytes // inlined in the .debug_info
+              case DW_FORM_exprloc:
+              case DW_FORM_block:
+                form_size = debug_info_data.getULEB128(&offset);
+                break;
+              case DW_FORM_block1:
+                form_size = debug_info_data.getU8(&offset);
+                break;
+              case DW_FORM_block2:
+                form_size = debug_info_data.getU16(&offset);
+                break;
+              case DW_FORM_block4:
+                form_size = debug_info_data.getU32(&offset);
+                break;
+
+              // Inlined NULL terminated C-strings
+              case DW_FORM_string:
+                debug_info_data.getCStr(&offset);
+                break;
+
+              // Compile unit address sized values
+              case DW_FORM_addr:
+              case DW_FORM_ref_addr:
+                form_size = cu_addr_size;
+                break;
+
+              // 0 byte value
+              case DW_FORM_flag_present:
+                form_size = 0;
+                break;
+
+              // 1 byte values
+              case DW_FORM_data1:
+              case DW_FORM_flag:
+              case DW_FORM_ref1:
+                form_size = 1;
+                break;
+
+              // 2 byte values
+              case DW_FORM_data2:
+              case DW_FORM_ref2:
+                form_size = 2;
+                break;
+
+                // 4 byte values
+              case DW_FORM_strp:
+                form_size = 4;
+                break;
+
+              case DW_FORM_data4:
+              case DW_FORM_ref4:
+                form_size = 4;
+                break;
+
+              // 8 byte values
+              case DW_FORM_data8:
+              case DW_FORM_ref8:
+              case DW_FORM_ref_sig8:
+                form_size = 8;
+                break;
+
+              // signed or unsigned LEB 128 values
+              case DW_FORM_sdata:
+              case DW_FORM_udata:
+              case DW_FORM_ref_udata:
+              case DW_FORM_GNU_str_index:
+              case DW_FORM_GNU_addr_index:
+                debug_info_data.getULEB128(&offset);
+                break;
+
+              case DW_FORM_indirect:
+                form = debug_info_data.getULEB128(&offset);
+                form_is_indirect = true;
+                break;
+
+                // FIXME: 64-bit for DWARF64.
+              case DW_FORM_sec_offset:
+                debug_info_data.getU32(offset_ptr);
+                break;
+
+              default:
+                *offset_ptr = offset;
+                return false;
+              }
+
+              offset += form_size;
+            } while (form_is_indirect);
+          }
+        }
+        *offset_ptr = offset;
+        return true;
+      }
+    } else {
+      AbbrevDecl = NULL;
+      *offset_ptr = offset;
+      return true;    // NULL debug tag entry
+    }
+  }
+
+  return false;
+}
+
+bool DWARFDebugInfoEntryMinimal::isSubprogramDIE() const {
+  return getTag() == DW_TAG_subprogram;
+}
+
+bool DWARFDebugInfoEntryMinimal::isSubroutineDIE() const {
+  uint32_t Tag = getTag();
+  return Tag == DW_TAG_subprogram ||
+         Tag == DW_TAG_inlined_subroutine;
+}
+
+uint32_t
+DWARFDebugInfoEntryMinimal::getAttributeValue(const DWARFCompileUnit *cu,
+                                              const uint16_t attr,
+                                              DWARFFormValue &form_value,
+                                              uint32_t *end_attr_offset_ptr)
+                                              const {
+  if (AbbrevDecl) {
+    uint32_t attr_idx = AbbrevDecl->findAttributeIndex(attr);
+
+    if (attr_idx != -1U) {
+      uint32_t offset = getOffset();
+
+      DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+
+      // Skip the abbreviation code so we are at the data for the attributes
+      debug_info_data.getULEB128(&offset);
+
+      uint32_t idx = 0;
+      while (idx < attr_idx)
+        DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(idx++),
+                                  debug_info_data, &offset, cu);
+
+      const uint32_t attr_offset = offset;
+      form_value = DWARFFormValue(AbbrevDecl->getFormByIndex(idx));
+      if (form_value.extractValue(debug_info_data, &offset, cu)) {
+        if (end_attr_offset_ptr)
+          *end_attr_offset_ptr = offset;
+        return attr_offset;
+      }
+    }
+  }
+
+  return 0;
+}
+
+const char*
+DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
+                                                     const DWARFCompileUnit* cu,
+                                                     const uint16_t attr,
+                                                     const char* fail_value)
+                                                     const {
+  DWARFFormValue form_value;
+  if (getAttributeValue(cu, attr, form_value)) {
+    DataExtractor stringExtractor(cu->getStringSection(), false, 0);
+    return form_value.getAsCString(&stringExtractor);
+  }
+  return fail_value;
+}
+
+uint64_t
+DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsigned(
+                                                    const DWARFCompileUnit* cu,
+                                                    const uint16_t attr,
+                                                    uint64_t fail_value) const {
+  DWARFFormValue form_value;
+  if (getAttributeValue(cu, attr, form_value))
+      return form_value.getUnsigned();
+  return fail_value;
+}
+
+int64_t
+DWARFDebugInfoEntryMinimal::getAttributeValueAsSigned(
+                                                     const DWARFCompileUnit* cu,
+                                                     const uint16_t attr,
+                                                     int64_t fail_value) const {
+  DWARFFormValue form_value;
+  if (getAttributeValue(cu, attr, form_value))
+      return form_value.getSigned();
+  return fail_value;
+}
+
+uint64_t
+DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
+                                                     const DWARFCompileUnit* cu,
+                                                     const uint16_t attr,
+                                                     uint64_t fail_value)
+                                                     const {
+  DWARFFormValue form_value;
+  if (getAttributeValue(cu, attr, form_value))
+      return form_value.getReference(cu);
+  return fail_value;
+}
+
+bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFCompileUnit *CU,
+                                                 uint64_t &LowPC,
+                                                 uint64_t &HighPC) const {
+  HighPC = -1ULL;
+  LowPC = getAttributeValueAsUnsigned(CU, DW_AT_low_pc, -1ULL);
+  if (LowPC != -1ULL)
+    HighPC = getAttributeValueAsUnsigned(CU, DW_AT_high_pc, -1ULL);
+  return (HighPC != -1ULL);
+}
+
+void
+DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *CU,
+                                               DWARFDebugAranges *DebugAranges)
+                                                   const {
+  if (AbbrevDecl) {
+    if (isSubprogramDIE()) {
+      uint64_t LowPC, HighPC;
+      if (getLowAndHighPC(CU, LowPC, HighPC)) {
+        DebugAranges->appendRange(CU->getOffset(), LowPC, HighPC);
+      }
+      // FIXME: try to append ranges from .debug_ranges section.
+    }
+
+    const DWARFDebugInfoEntryMinimal *child = getFirstChild();
+    while (child) {
+      child->buildAddressRangeTable(CU, DebugAranges);
+      child = child->getSibling();
+    }
+  }
+}
+
+bool
+DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
+                                                     const DWARFCompileUnit *CU,
+                                                     const uint64_t Address)
+                                                     const {
+  if (isNULL())
+    return false;
+  uint64_t LowPC, HighPC;
+  if (getLowAndHighPC(CU, LowPC, HighPC))
+    return (LowPC <= Address && Address <= HighPC);
+  // Try to get address ranges from .debug_ranges section.
+  uint32_t RangesOffset = getAttributeValueAsReference(CU, DW_AT_ranges, -1U);
+  if (RangesOffset != -1U) {
+    DWARFDebugRangeList RangeList;
+    if (CU->extractRangeList(RangesOffset, RangeList))
+      return RangeList.containsAddress(CU->getBaseAddress(), Address);
+  }
+  return false;
+}
+
+const char*
+DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFCompileUnit *CU)
+                                                                         const {
+  if (!isSubroutineDIE())
+    return 0;
+  // Try to get mangled name if possible.
+  if (const char *name =
+      getAttributeValueAsString(CU, DW_AT_MIPS_linkage_name, 0))
+    return name;
+  if (const char *name = getAttributeValueAsString(CU, DW_AT_linkage_name, 0))
+    return name;
+  if (const char *name = getAttributeValueAsString(CU, DW_AT_name, 0))
+    return name;
+  // Try to get name from specification DIE.
+  uint32_t spec_ref =
+      getAttributeValueAsReference(CU, DW_AT_specification, -1U);
+  if (spec_ref != -1U) {
+    DWARFDebugInfoEntryMinimal spec_die;
+    if (spec_die.extract(CU, &spec_ref)) {
+      if (const char *name = spec_die.getSubroutineName(CU))
+        return name;
+    }
+  }
+  // Try to get name from abstract origin DIE.
+  uint32_t abs_origin_ref =
+      getAttributeValueAsReference(CU, DW_AT_abstract_origin, -1U);
+  if (abs_origin_ref != -1U) {
+    DWARFDebugInfoEntryMinimal abs_origin_die;
+    if (abs_origin_die.extract(CU, &abs_origin_ref)) {
+      if (const char *name = abs_origin_die.getSubroutineName(CU))
+        return name;
+    }
+  }
+  return 0;
+}
+
+void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFCompileUnit *CU,
+                                                uint32_t &CallFile,
+                                                uint32_t &CallLine,
+                                                uint32_t &CallColumn) const {
+  CallFile = getAttributeValueAsUnsigned(CU, DW_AT_call_file, 0);
+  CallLine = getAttributeValueAsUnsigned(CU, DW_AT_call_line, 0);
+  CallColumn = getAttributeValueAsUnsigned(CU, DW_AT_call_column, 0);
+}
+
+DWARFDebugInfoEntryMinimal::InlinedChain
+DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
+                                                     const DWARFCompileUnit *CU,
+                                                     const uint64_t Address)
+                                                     const {
+  DWARFDebugInfoEntryMinimal::InlinedChain InlinedChain;
+  if (isNULL())
+    return InlinedChain;
+  for (const DWARFDebugInfoEntryMinimal *DIE = this; DIE; ) {
+    // Append current DIE to inlined chain only if it has correct tag
+    // (e.g. it is not a lexical block).
+    if (DIE->isSubroutineDIE()) {
+      InlinedChain.push_back(*DIE);
+    }
+    // Try to get child which also contains provided address.
+    const DWARFDebugInfoEntryMinimal *Child = DIE->getFirstChild();
+    while (Child) {
+      if (Child->addressRangeContainsAddress(CU, Address)) {
+        // Assume there is only one such child.
+        break;
+      }
+      Child = Child->getSibling();
+    }
+    DIE = Child;
+  }
+  // Reverse the obtained chain to make the root of inlined chain last.
+  std::reverse(InlinedChain.begin(), InlinedChain.end());
+  return InlinedChain;
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugInfoEntry.h b/contrib/llvm/lib/DebugInfo/DWARFDebugInfoEntry.h
new file mode 100644
index 000000000000..9c1b2be0a71f
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugInfoEntry.h
@@ -0,0 +1,176 @@
+//===-- DWARFDebugInfoEntry.h -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+#define LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+
+#include "DWARFAbbreviationDeclaration.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class DWARFDebugAranges;
+class DWARFCompileUnit;
+class DWARFContext;
+class DWARFFormValue;
+class DWARFInlinedSubroutineChain;
+
+/// DWARFDebugInfoEntryMinimal - A DIE with only the minimum required data.
+class DWARFDebugInfoEntryMinimal {
+  /// Offset within the .debug_info of the start of this entry.
+  uint32_t Offset;
+
+  /// How many to subtract from "this" to get the parent.
+  /// If zero this die has no parent.
+  uint32_t ParentIdx;
+
+  /// How many to add to "this" to get the sibling.
+  uint32_t SiblingIdx;
+
+  const DWARFAbbreviationDeclaration *AbbrevDecl;
+public:
+  DWARFDebugInfoEntryMinimal()
+    : Offset(0), ParentIdx(0), SiblingIdx(0), AbbrevDecl(0) {}
+
+  void dump(raw_ostream &OS, const DWARFCompileUnit *cu,
+            unsigned recurseDepth, unsigned indent = 0) const;
+  void dumpAttribute(raw_ostream &OS, const DWARFCompileUnit *cu,
+                     uint32_t *offset_ptr, uint16_t attr, uint16_t form,
+                     unsigned indent = 0) const;
+
+  bool extractFast(const DWARFCompileUnit *cu, const uint8_t *fixed_form_sizes,
+                   uint32_t *offset_ptr);
+
+  /// Extract a debug info entry for a given compile unit from the
+  /// .debug_info and .debug_abbrev data starting at the given offset.
+  bool extract(const DWARFCompileUnit *cu, uint32_t *offset_ptr);
+
+  uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; }
+  bool isNULL() const { return AbbrevDecl == 0; }
+
+  /// Returns true if DIE represents a subprogram (not inlined).
+  bool isSubprogramDIE() const;
+  /// Returns true if DIE represents a subprogram or an inlined
+  /// subroutine.
+  bool isSubroutineDIE() const;
+
+  uint32_t getOffset() const { return Offset; }
+  uint32_t getNumAttributes() const {
+    return !isNULL() ? AbbrevDecl->getNumAttributes() : 0;
+  }
+  bool hasChildren() const { return !isNULL() && AbbrevDecl->hasChildren(); }
+
+  // We know we are kept in a vector of contiguous entries, so we know
+  // our parent will be some index behind "this".
+  DWARFDebugInfoEntryMinimal *getParent() {
+    return ParentIdx > 0 ? this - ParentIdx : 0;
+  }
+  const DWARFDebugInfoEntryMinimal *getParent() const {
+    return ParentIdx > 0 ? this - ParentIdx : 0;
+  }
+  // We know we are kept in a vector of contiguous entries, so we know
+  // our sibling will be some index after "this".
+  DWARFDebugInfoEntryMinimal *getSibling() {
+    return SiblingIdx > 0 ? this + SiblingIdx : 0;
+  }
+  const DWARFDebugInfoEntryMinimal *getSibling() const {
+    return SiblingIdx > 0 ? this + SiblingIdx : 0;
+  }
+  // We know we are kept in a vector of contiguous entries, so we know
+  // we don't need to store our child pointer, if we have a child it will
+  // be the next entry in the list...
+  DWARFDebugInfoEntryMinimal *getFirstChild() {
+    return hasChildren() ? this + 1 : 0;
+  }
+  const DWARFDebugInfoEntryMinimal *getFirstChild() const {
+    return hasChildren() ? this + 1 : 0;
+  }
+
+  void setParent(DWARFDebugInfoEntryMinimal *parent) {
+    if (parent) {
+      // We know we are kept in a vector of contiguous entries, so we know
+      // our parent will be some index behind "this".
+      ParentIdx = this - parent;
+    } else
+      ParentIdx = 0;
+  }
+  void setSibling(DWARFDebugInfoEntryMinimal *sibling) {
+    if (sibling) {
+      // We know we are kept in a vector of contiguous entries, so we know
+      // our sibling will be some index after "this".
+      SiblingIdx = sibling - this;
+      sibling->setParent(getParent());
+    } else
+      SiblingIdx = 0;
+  }
+
+  const DWARFAbbreviationDeclaration *getAbbreviationDeclarationPtr() const {
+    return AbbrevDecl;
+  }
+
+  uint32_t getAttributeValue(const DWARFCompileUnit *cu,
+                             const uint16_t attr, DWARFFormValue &formValue,
+                             uint32_t *end_attr_offset_ptr = 0) const;
+
+  const char* getAttributeValueAsString(const DWARFCompileUnit* cu,
+                                        const uint16_t attr,
+                                        const char *fail_value) const;
+
+  uint64_t getAttributeValueAsUnsigned(const DWARFCompileUnit *cu,
+                                       const uint16_t attr,
+                                       uint64_t fail_value) const;
+
+  uint64_t getAttributeValueAsReference(const DWARFCompileUnit *cu,
+                                        const uint16_t attr,
+                                        uint64_t fail_value) const;
+
+  int64_t getAttributeValueAsSigned(const DWARFCompileUnit* cu,
+                                    const uint16_t attr,
+                                    int64_t fail_value) const;
+
+  /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
+  /// Returns true if both attributes are present.
+  bool getLowAndHighPC(const DWARFCompileUnit *CU,
+                       uint64_t &LowPC, uint64_t &HighPC) const;
+
+  void buildAddressRangeTable(const DWARFCompileUnit *CU,
+                              DWARFDebugAranges *DebugAranges) const;
+
+  bool addressRangeContainsAddress(const DWARFCompileUnit *CU,
+                                   const uint64_t Address) const;
+
+  /// If a DIE represents a subprogram (or inlined subroutine),
+  /// returns its mangled name (or short name, if mangled is missing).
+  /// This name may be fetched from specification or abstract origin
+  /// for this subprogram. Returns null if no name is found.
+  const char* getSubroutineName(const DWARFCompileUnit *CU) const;
+
+  /// Retrieves values of DW_AT_call_file, DW_AT_call_line and
+  /// DW_AT_call_column from DIE (or zeroes if they are missing).
+  void getCallerFrame(const DWARFCompileUnit *CU, uint32_t &CallFile,
+                      uint32_t &CallLine, uint32_t &CallColumn) const;
+
+  /// InlinedChain - represents a chain of inlined_subroutine
+  /// DIEs, (possibly ending with subprogram DIE), all of which are contained
+  /// in some concrete inlined instance tree. Address range for each DIE
+  /// (except the last DIE) in this chain is contained in address
+  /// range for next DIE in the chain.
+  typedef SmallVector<DWARFDebugInfoEntryMinimal, 4> InlinedChain;
+
+  /// Get inlined chain for a given address, rooted at the current DIE.
+  /// Returns empty chain if address is not contained in address range
+  /// of current DIE.
+  InlinedChain getInlinedChainForAddress(const DWARFCompileUnit *CU,
+                                         const uint64_t Address) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugLine.cpp b/contrib/llvm/lib/DebugInfo/DWARFDebugLine.cpp
new file mode 100644
index 000000000000..192381c6f7c6
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugLine.cpp
@@ -0,0 +1,628 @@
+//===-- DWARFDebugLine.cpp ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugLine.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+using namespace dwarf;
+
+void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
+  OS << "Line table prologue:\n"
+     << format("   total_length: 0x%8.8x\n", TotalLength)
+     << format("        version: %u\n", Version)
+     << format("prologue_length: 0x%8.8x\n", PrologueLength)
+     << format("min_inst_length: %u\n", MinInstLength)
+     << format("default_is_stmt: %u\n", DefaultIsStmt)
+     << format("      line_base: %i\n", LineBase)
+     << format("     line_range: %u\n", LineRange)
+     << format("    opcode_base: %u\n", OpcodeBase);
+
+  for (uint32_t i = 0; i < StandardOpcodeLengths.size(); ++i)
+    OS << format("standard_opcode_lengths[%s] = %u\n", LNStandardString(i+1),
+                 StandardOpcodeLengths[i]);
+
+  if (!IncludeDirectories.empty())
+    for (uint32_t i = 0; i < IncludeDirectories.size(); ++i)
+      OS << format("include_directories[%3u] = '", i+1)
+         << IncludeDirectories[i] << "'\n";
+
+  if (!FileNames.empty()) {
+    OS << "                Dir  Mod Time   File Len   File Name\n"
+       << "                ---- ---------- ---------- -----------"
+          "----------------\n";
+    for (uint32_t i = 0; i < FileNames.size(); ++i) {
+      const FileNameEntry& fileEntry = FileNames[i];
+      OS << format("file_names[%3u] %4" PRIu64 " ", i+1, fileEntry.DirIdx)
+         << format("0x%8.8" PRIx64 " 0x%8.8" PRIx64 " ",
+                   fileEntry.ModTime, fileEntry.Length)
+         << fileEntry.Name << '\n';
+    }
+  }
+}
+
+void DWARFDebugLine::Row::postAppend() {
+  BasicBlock = false;
+  PrologueEnd = false;
+  EpilogueBegin = false;
+}
+
+void DWARFDebugLine::Row::reset(bool default_is_stmt) {
+  Address = 0;
+  Line = 1;
+  Column = 0;
+  File = 1;
+  Isa = 0;
+  IsStmt = default_is_stmt;
+  BasicBlock = false;
+  EndSequence = false;
+  PrologueEnd = false;
+  EpilogueBegin = false;
+}
+
+void DWARFDebugLine::Row::dump(raw_ostream &OS) const {
+  OS << format("0x%16.16" PRIx64 " %6u %6u", Address, Line, Column)
+     << format(" %6u %3u ", File, Isa)
+     << (IsStmt ? " is_stmt" : "")
+     << (BasicBlock ? " basic_block" : "")
+     << (PrologueEnd ? " prologue_end" : "")
+     << (EpilogueBegin ? " epilogue_begin" : "")
+     << (EndSequence ? " end_sequence" : "")
+     << '\n';
+}
+
+void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const {
+  Prologue.dump(OS);
+  OS << '\n';
+
+  if (!Rows.empty()) {
+    OS << "Address            Line   Column File   ISA Flags\n"
+       << "------------------ ------ ------ ------ --- -------------\n";
+    for (std::vector<Row>::const_iterator pos = Rows.begin(),
+         end = Rows.end(); pos != end; ++pos)
+      pos->dump(OS);
+  }
+}
+
+DWARFDebugLine::State::~State() {}
+
+void DWARFDebugLine::State::appendRowToMatrix(uint32_t offset) {
+  if (Sequence::Empty) {
+    // Record the beginning of instruction sequence.
+    Sequence::Empty = false;
+    Sequence::LowPC = Address;
+    Sequence::FirstRowIndex = row;
+  }
+  ++row;  // Increase the row number.
+  LineTable::appendRow(*this);
+  if (EndSequence) {
+    // Record the end of instruction sequence.
+    Sequence::HighPC = Address;
+    Sequence::LastRowIndex = row;
+    if (Sequence::isValid())
+      LineTable::appendSequence(*this);
+    Sequence::reset();
+  }
+  Row::postAppend();
+}
+
+void DWARFDebugLine::State::finalize() {
+  row = DoneParsingLineTable;
+  if (!Sequence::Empty) {
+    fprintf(stderr, "warning: last sequence in debug line table is not"
+                    "terminated!\n");
+  }
+  // Sort all sequences so that address lookup will work faster.
+  if (!Sequences.empty()) {
+    std::sort(Sequences.begin(), Sequences.end(), Sequence::orderByLowPC);
+    // Note: actually, instruction address ranges of sequences should not
+    // overlap (in shared objects and executables). If they do, the address
+    // lookup would still work, though, but result would be ambiguous.
+    // We don't report warning in this case. For example,
+    // sometimes .so compiled from multiple object files contains a few
+    // rudimentary sequences for address ranges [0x0, 0xsomething).
+  }
+}
+
+DWARFDebugLine::DumpingState::~DumpingState() {}
+
+void DWARFDebugLine::DumpingState::finalize() {
+  LineTable::dump(OS);
+}
+
+const DWARFDebugLine::LineTable *
+DWARFDebugLine::getLineTable(uint32_t offset) const {
+  LineTableConstIter pos = LineTableMap.find(offset);
+  if (pos != LineTableMap.end())
+    return &pos->second;
+  return 0;
+}
+
+const DWARFDebugLine::LineTable *
+DWARFDebugLine::getOrParseLineTable(DataExtractor debug_line_data,
+                                    uint32_t offset) {
+  std::pair<LineTableIter, bool> pos =
+    LineTableMap.insert(LineTableMapTy::value_type(offset, LineTable()));
+  if (pos.second) {
+    // Parse and cache the line table for at this offset.
+    State state;
+    if (!parseStatementTable(debug_line_data, RelocMap, &offset, state))
+      return 0;
+    pos.first->second = state;
+  }
+  return &pos.first->second;
+}
+
+bool
+DWARFDebugLine::parsePrologue(DataExtractor debug_line_data,
+                              uint32_t *offset_ptr, Prologue *prologue) {
+  const uint32_t prologue_offset = *offset_ptr;
+
+  prologue->clear();
+  prologue->TotalLength = debug_line_data.getU32(offset_ptr);
+  prologue->Version = debug_line_data.getU16(offset_ptr);
+  if (prologue->Version != 2)
+    return false;
+
+  prologue->PrologueLength = debug_line_data.getU32(offset_ptr);
+  const uint32_t end_prologue_offset = prologue->PrologueLength + *offset_ptr;
+  prologue->MinInstLength = debug_line_data.getU8(offset_ptr);
+  prologue->DefaultIsStmt = debug_line_data.getU8(offset_ptr);
+  prologue->LineBase = debug_line_data.getU8(offset_ptr);
+  prologue->LineRange = debug_line_data.getU8(offset_ptr);
+  prologue->OpcodeBase = debug_line_data.getU8(offset_ptr);
+
+  prologue->StandardOpcodeLengths.reserve(prologue->OpcodeBase-1);
+  for (uint32_t i = 1; i < prologue->OpcodeBase; ++i) {
+    uint8_t op_len = debug_line_data.getU8(offset_ptr);
+    prologue->StandardOpcodeLengths.push_back(op_len);
+  }
+
+  while (*offset_ptr < end_prologue_offset) {
+    const char *s = debug_line_data.getCStr(offset_ptr);
+    if (s && s[0])
+      prologue->IncludeDirectories.push_back(s);
+    else
+      break;
+  }
+
+  while (*offset_ptr < end_prologue_offset) {
+    const char *name = debug_line_data.getCStr(offset_ptr);
+    if (name && name[0]) {
+      FileNameEntry fileEntry;
+      fileEntry.Name = name;
+      fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
+      fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
+      fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
+      prologue->FileNames.push_back(fileEntry);
+    } else {
+      break;
+    }
+  }
+
+  if (*offset_ptr != end_prologue_offset) {
+    fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should"
+                    " have ended at 0x%8.8x but it ended ad 0x%8.8x\n",
+            prologue_offset, end_prologue_offset, *offset_ptr);
+    return false;
+  }
+  return true;
+}
+
+bool
+DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data, 
+                                    const RelocAddrMap *RMap,
+                                    uint32_t *offset_ptr, State &state) {
+  const uint32_t debug_line_offset = *offset_ptr;
+
+  Prologue *prologue = &state.Prologue;
+
+  if (!parsePrologue(debug_line_data, offset_ptr, prologue)) {
+    // Restore our offset and return false to indicate failure!
+    *offset_ptr = debug_line_offset;
+    return false;
+  }
+
+  const uint32_t end_offset = debug_line_offset + prologue->TotalLength +
+                              sizeof(prologue->TotalLength);
+
+  state.reset();
+
+  while (*offset_ptr < end_offset) {
+    uint8_t opcode = debug_line_data.getU8(offset_ptr);
+
+    if (opcode == 0) {
+      // Extended Opcodes always start with a zero opcode followed by
+      // a uleb128 length so you can skip ones you don't know about
+      uint32_t ext_offset = *offset_ptr;
+      uint64_t len = debug_line_data.getULEB128(offset_ptr);
+      uint32_t arg_size = len - (*offset_ptr - ext_offset);
+
+      uint8_t sub_opcode = debug_line_data.getU8(offset_ptr);
+      switch (sub_opcode) {
+      case DW_LNE_end_sequence:
+        // Set the end_sequence register of the state machine to true and
+        // append a row to the matrix using the current values of the
+        // state-machine registers. Then reset the registers to the initial
+        // values specified above. Every statement program sequence must end
+        // with a DW_LNE_end_sequence instruction which creates a row whose
+        // address is that of the byte after the last target machine instruction
+        // of the sequence.
+        state.EndSequence = true;
+        state.appendRowToMatrix(*offset_ptr);
+        state.reset();
+        break;
+
+      case DW_LNE_set_address:
+        // Takes a single relocatable address as an operand. The size of the
+        // operand is the size appropriate to hold an address on the target
+        // machine. Set the address register to the value given by the
+        // relocatable address. All of the other statement program opcodes
+        // that affect the address register add a delta to it. This instruction
+        // stores a relocatable value into it instead.
+        {
+          // If this address is in our relocation map, apply the relocation.
+          RelocAddrMap::const_iterator AI = RMap->find(*offset_ptr);
+          if (AI != RMap->end()) {
+             const std::pair<uint8_t, int64_t> &R = AI->second;
+             state.Address = debug_line_data.getAddress(offset_ptr) + R.second;
+          } else
+            state.Address = debug_line_data.getAddress(offset_ptr);
+        }
+        break;
+
+      case DW_LNE_define_file:
+        // Takes 4 arguments. The first is a null terminated string containing
+        // a source file name. The second is an unsigned LEB128 number
+        // representing the directory index of the directory in which the file
+        // was found. The third is an unsigned LEB128 number representing the
+        // time of last modification of the file. The fourth is an unsigned
+        // LEB128 number representing the length in bytes of the file. The time
+        // and length fields may contain LEB128(0) if the information is not
+        // available.
+        //
+        // The directory index represents an entry in the include_directories
+        // section of the statement program prologue. The index is LEB128(0)
+        // if the file was found in the current directory of the compilation,
+        // LEB128(1) if it was found in the first directory in the
+        // include_directories section, and so on. The directory index is
+        // ignored for file names that represent full path names.
+        //
+        // The files are numbered, starting at 1, in the order in which they
+        // appear; the names in the prologue come before names defined by
+        // the DW_LNE_define_file instruction. These numbers are used in the
+        // the file register of the state machine.
+        {
+          FileNameEntry fileEntry;
+          fileEntry.Name = debug_line_data.getCStr(offset_ptr);
+          fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
+          fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
+          fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
+          prologue->FileNames.push_back(fileEntry);
+        }
+        break;
+
+      default:
+        // Length doesn't include the zero opcode byte or the length itself, but
+        // it does include the sub_opcode, so we have to adjust for that below
+        (*offset_ptr) += arg_size;
+        break;
+      }
+    } else if (opcode < prologue->OpcodeBase) {
+      switch (opcode) {
+      // Standard Opcodes
+      case DW_LNS_copy:
+        // Takes no arguments. Append a row to the matrix using the
+        // current values of the state-machine registers. Then set
+        // the basic_block register to false.
+        state.appendRowToMatrix(*offset_ptr);
+        break;
+
+      case DW_LNS_advance_pc:
+        // Takes a single unsigned LEB128 operand, multiplies it by the
+        // min_inst_length field of the prologue, and adds the
+        // result to the address register of the state machine.
+        state.Address += debug_line_data.getULEB128(offset_ptr) *
+                         prologue->MinInstLength;
+        break;
+
+      case DW_LNS_advance_line:
+        // Takes a single signed LEB128 operand and adds that value to
+        // the line register of the state machine.
+        state.Line += debug_line_data.getSLEB128(offset_ptr);
+        break;
+
+      case DW_LNS_set_file:
+        // Takes a single unsigned LEB128 operand and stores it in the file
+        // register of the state machine.
+        state.File = debug_line_data.getULEB128(offset_ptr);
+        break;
+
+      case DW_LNS_set_column:
+        // Takes a single unsigned LEB128 operand and stores it in the
+        // column register of the state machine.
+        state.Column = debug_line_data.getULEB128(offset_ptr);
+        break;
+
+      case DW_LNS_negate_stmt:
+        // Takes no arguments. Set the is_stmt register of the state
+        // machine to the logical negation of its current value.
+        state.IsStmt = !state.IsStmt;
+        break;
+
+      case DW_LNS_set_basic_block:
+        // Takes no arguments. Set the basic_block register of the
+        // state machine to true
+        state.BasicBlock = true;
+        break;
+
+      case DW_LNS_const_add_pc:
+        // Takes no arguments. Add to the address register of the state
+        // machine the address increment value corresponding to special
+        // opcode 255. The motivation for DW_LNS_const_add_pc is this:
+        // when the statement program needs to advance the address by a
+        // small amount, it can use a single special opcode, which occupies
+        // a single byte. When it needs to advance the address by up to
+        // twice the range of the last special opcode, it can use
+        // DW_LNS_const_add_pc followed by a special opcode, for a total
+        // of two bytes. Only if it needs to advance the address by more
+        // than twice that range will it need to use both DW_LNS_advance_pc
+        // and a special opcode, requiring three or more bytes.
+        {
+          uint8_t adjust_opcode = 255 - prologue->OpcodeBase;
+          uint64_t addr_offset = (adjust_opcode / prologue->LineRange) *
+                                 prologue->MinInstLength;
+          state.Address += addr_offset;
+        }
+        break;
+
+      case DW_LNS_fixed_advance_pc:
+        // Takes a single uhalf operand. Add to the address register of
+        // the state machine the value of the (unencoded) operand. This
+        // is the only extended opcode that takes an argument that is not
+        // a variable length number. The motivation for DW_LNS_fixed_advance_pc
+        // is this: existing assemblers cannot emit DW_LNS_advance_pc or
+        // special opcodes because they cannot encode LEB128 numbers or
+        // judge when the computation of a special opcode overflows and
+        // requires the use of DW_LNS_advance_pc. Such assemblers, however,
+        // can use DW_LNS_fixed_advance_pc instead, sacrificing compression.
+        state.Address += debug_line_data.getU16(offset_ptr);
+        break;
+
+      case DW_LNS_set_prologue_end:
+        // Takes no arguments. Set the prologue_end register of the
+        // state machine to true
+        state.PrologueEnd = true;
+        break;
+
+      case DW_LNS_set_epilogue_begin:
+        // Takes no arguments. Set the basic_block register of the
+        // state machine to true
+        state.EpilogueBegin = true;
+        break;
+
+      case DW_LNS_set_isa:
+        // Takes a single unsigned LEB128 operand and stores it in the
+        // column register of the state machine.
+        state.Isa = debug_line_data.getULEB128(offset_ptr);
+        break;
+
+      default:
+        // Handle any unknown standard opcodes here. We know the lengths
+        // of such opcodes because they are specified in the prologue
+        // as a multiple of LEB128 operands for each opcode.
+        {
+          assert(opcode - 1U < prologue->StandardOpcodeLengths.size());
+          uint8_t opcode_length = prologue->StandardOpcodeLengths[opcode - 1];
+          for (uint8_t i=0; i<opcode_length; ++i)
+            debug_line_data.getULEB128(offset_ptr);
+        }
+        break;
+      }
+    } else {
+      // Special Opcodes
+
+      // A special opcode value is chosen based on the amount that needs
+      // to be added to the line and address registers. The maximum line
+      // increment for a special opcode is the value of the line_base
+      // field in the header, plus the value of the line_range field,
+      // minus 1 (line base + line range - 1). If the desired line
+      // increment is greater than the maximum line increment, a standard
+      // opcode must be used instead of a special opcode. The "address
+      // advance" is calculated by dividing the desired address increment
+      // by the minimum_instruction_length field from the header. The
+      // special opcode is then calculated using the following formula:
+      //
+      //  opcode = (desired line increment - line_base) +
+      //           (line_range * address advance) + opcode_base
+      //
+      // If the resulting opcode is greater than 255, a standard opcode
+      // must be used instead.
+      //
+      // To decode a special opcode, subtract the opcode_base from the
+      // opcode itself to give the adjusted opcode. The amount to
+      // increment the address register is the result of the adjusted
+      // opcode divided by the line_range multiplied by the
+      // minimum_instruction_length field from the header. That is:
+      //
+      //  address increment = (adjusted opcode / line_range) *
+      //                      minimum_instruction_length
+      //
+      // The amount to increment the line register is the line_base plus
+      // the result of the adjusted opcode modulo the line_range. That is:
+      //
+      // line increment = line_base + (adjusted opcode % line_range)
+
+      uint8_t adjust_opcode = opcode - prologue->OpcodeBase;
+      uint64_t addr_offset = (adjust_opcode / prologue->LineRange) *
+                             prologue->MinInstLength;
+      int32_t line_offset = prologue->LineBase +
+                            (adjust_opcode % prologue->LineRange);
+      state.Line += line_offset;
+      state.Address += addr_offset;
+      state.appendRowToMatrix(*offset_ptr);
+    }
+  }
+
+  state.finalize();
+
+  return end_offset;
+}
+
+uint32_t
+DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const {
+  uint32_t unknown_index = UINT32_MAX;
+  if (Sequences.empty())
+    return unknown_index;
+  // First, find an instruction sequence containing the given address.
+  DWARFDebugLine::Sequence sequence;
+  sequence.LowPC = address;
+  SequenceIter first_seq = Sequences.begin();
+  SequenceIter last_seq = Sequences.end();
+  SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence,
+      DWARFDebugLine::Sequence::orderByLowPC);
+  DWARFDebugLine::Sequence found_seq;
+  if (seq_pos == last_seq) {
+    found_seq = Sequences.back();
+  } else if (seq_pos->LowPC == address) {
+    found_seq = *seq_pos;
+  } else {
+    if (seq_pos == first_seq)
+      return unknown_index;
+    found_seq = *(seq_pos - 1);
+  }
+  if (!found_seq.containsPC(address))
+    return unknown_index;
+  // Search for instruction address in the rows describing the sequence.
+  // Rows are stored in a vector, so we may use arithmetical operations with
+  // iterators.
+  DWARFDebugLine::Row row;
+  row.Address = address;
+  RowIter first_row = Rows.begin() + found_seq.FirstRowIndex;
+  RowIter last_row = Rows.begin() + found_seq.LastRowIndex;
+  RowIter row_pos = std::lower_bound(first_row, last_row, row,
+      DWARFDebugLine::Row::orderByAddress);
+  if (row_pos == last_row) {
+    return found_seq.LastRowIndex - 1;
+  }
+  uint32_t index = found_seq.FirstRowIndex + (row_pos - first_row);
+  if (row_pos->Address > address) {
+    if (row_pos == first_row)
+      return unknown_index;
+    else
+      index--;
+  }
+  return index;
+}
+
+bool
+DWARFDebugLine::LineTable::lookupAddressRange(uint64_t address,
+                                       uint64_t size, 
+                                       std::vector<uint32_t>& result) const {
+  if (Sequences.empty())
+    return false;
+  uint64_t end_addr = address + size;
+  // First, find an instruction sequence containing the given address.
+  DWARFDebugLine::Sequence sequence;
+  sequence.LowPC = address;
+  SequenceIter first_seq = Sequences.begin();
+  SequenceIter last_seq = Sequences.end();
+  SequenceIter seq_pos = std::lower_bound(first_seq, last_seq, sequence,
+      DWARFDebugLine::Sequence::orderByLowPC);
+  if (seq_pos == last_seq || seq_pos->LowPC != address) {
+    if (seq_pos == first_seq)
+      return false;
+    seq_pos--;
+  }
+  if (!seq_pos->containsPC(address))
+    return false;
+
+  SequenceIter start_pos = seq_pos;
+
+  // Add the rows from the first sequence to the vector, starting with the
+  // index we just calculated
+
+  while (seq_pos != last_seq && seq_pos->LowPC < end_addr) {
+    DWARFDebugLine::Sequence cur_seq = *seq_pos;
+    uint32_t first_row_index;
+    uint32_t last_row_index;
+    if (seq_pos == start_pos) {
+      // For the first sequence, we need to find which row in the sequence is the
+      // first in our range. Rows are stored in a vector, so we may use
+      // arithmetical operations with iterators.
+      DWARFDebugLine::Row row;
+      row.Address = address;
+      RowIter first_row = Rows.begin() + cur_seq.FirstRowIndex;
+      RowIter last_row = Rows.begin() + cur_seq.LastRowIndex;
+      RowIter row_pos = std::upper_bound(first_row, last_row, row,
+                                         DWARFDebugLine::Row::orderByAddress);
+      // The 'row_pos' iterator references the first row that is greater than
+      // our start address. Unless that's the first row, we want to start at
+      // the row before that.
+      first_row_index = cur_seq.FirstRowIndex + (row_pos - first_row);
+      if (row_pos != first_row)
+        --first_row_index;
+    } else
+      first_row_index = cur_seq.FirstRowIndex;
+
+    // For the last sequence in our range, we need to figure out the last row in
+    // range.  For all other sequences we can go to the end of the sequence.
+    if (cur_seq.HighPC > end_addr) {
+      DWARFDebugLine::Row row;
+      row.Address = end_addr;
+      RowIter first_row = Rows.begin() + cur_seq.FirstRowIndex;
+      RowIter last_row = Rows.begin() + cur_seq.LastRowIndex;
+      RowIter row_pos = std::upper_bound(first_row, last_row, row,
+                                         DWARFDebugLine::Row::orderByAddress);
+      // The 'row_pos' iterator references the first row that is greater than
+      // our end address.  The row before that is the last row we want.
+      last_row_index = cur_seq.FirstRowIndex + (row_pos - first_row) - 1;
+    } else
+      // Contrary to what you might expect, DWARFDebugLine::SequenceLastRowIndex
+      // isn't a valid index within the current sequence.  It's that plus one.
+      last_row_index = cur_seq.LastRowIndex - 1;
+
+    for (uint32_t i = first_row_index; i <= last_row_index; ++i) {
+      result.push_back(i);
+    }
+
+    ++seq_pos;
+  }
+
+  return true;
+}
+
+bool
+DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
+                                              bool NeedsAbsoluteFilePath,
+                                              std::string &Result) const {
+  if (FileIndex == 0 || FileIndex > Prologue.FileNames.size())
+    return false;
+  const FileNameEntry &Entry = Prologue.FileNames[FileIndex - 1];
+  const char *FileName = Entry.Name;
+  if (!NeedsAbsoluteFilePath ||
+      sys::path::is_absolute(FileName)) {
+    Result = FileName;
+    return true;
+  }
+  SmallString<16> FilePath;
+  uint64_t IncludeDirIndex = Entry.DirIdx;
+  // Be defensive about the contents of Entry.
+  if (IncludeDirIndex > 0 &&
+      IncludeDirIndex <= Prologue.IncludeDirectories.size()) {
+    const char *IncludeDir = Prologue.IncludeDirectories[IncludeDirIndex - 1];
+    sys::path::append(FilePath, IncludeDir);
+  }
+  sys::path::append(FilePath, FileName);
+  Result = FilePath.str();
+  return true;
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugLine.h b/contrib/llvm/lib/DebugInfo/DWARFDebugLine.h
new file mode 100644
index 000000000000..2990756bd7c9
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugLine.h
@@ -0,0 +1,254 @@
+//===-- DWARFDebugLine.h ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGLINE_H
+#define LLVM_DEBUGINFO_DWARFDEBUGLINE_H
+
+#include "DWARFRelocMap.h"
+#include "llvm/Support/DataExtractor.h"
+#include <map>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugLine {
+public:
+  DWARFDebugLine(const RelocAddrMap* LineInfoRelocMap) : RelocMap(LineInfoRelocMap) {}
+  struct FileNameEntry {
+    FileNameEntry() : Name(0), DirIdx(0), ModTime(0), Length(0) {}
+
+    const char *Name;
+    uint64_t DirIdx;
+    uint64_t ModTime;
+    uint64_t Length;
+  };
+
+  struct Prologue {
+    Prologue()
+      : TotalLength(0), Version(0), PrologueLength(0), MinInstLength(0),
+        DefaultIsStmt(0), LineBase(0), LineRange(0), OpcodeBase(0) {}
+
+    // The size in bytes of the statement information for this compilation unit
+    // (not including the total_length field itself).
+    uint32_t TotalLength;
+    // Version identifier for the statement information format.
+    uint16_t Version;
+    // The number of bytes following the prologue_length field to the beginning
+    // of the first byte of the statement program itself.
+    uint32_t PrologueLength;
+    // The size in bytes of the smallest target machine instruction. Statement
+    // program opcodes that alter the address register first multiply their
+    // operands by this value.
+    uint8_t MinInstLength;
+    // The initial value of theis_stmtregister.
+    uint8_t DefaultIsStmt;
+    // This parameter affects the meaning of the special opcodes. See below.
+    int8_t LineBase;
+    // This parameter affects the meaning of the special opcodes. See below.
+    uint8_t LineRange;
+    // The number assigned to the first special opcode.
+    uint8_t OpcodeBase;
+    std::vector<uint8_t> StandardOpcodeLengths;
+    std::vector<const char*> IncludeDirectories;
+    std::vector<FileNameEntry> FileNames;
+
+    // Length of the prologue in bytes.
+    uint32_t getLength() const {
+      return PrologueLength + sizeof(TotalLength) + sizeof(Version) +
+             sizeof(PrologueLength);
+    }
+    // Length of the line table data in bytes (not including the prologue).
+    uint32_t getStatementTableLength() const {
+      return TotalLength + sizeof(TotalLength) - getLength();
+    }
+    int32_t getMaxLineIncrementForSpecialOpcode() const {
+      return LineBase + (int8_t)LineRange - 1;
+    }
+    void dump(raw_ostream &OS) const;
+    void clear() {
+      TotalLength = Version = PrologueLength = 0;
+      MinInstLength = LineBase = LineRange = OpcodeBase = 0;
+      StandardOpcodeLengths.clear();
+      IncludeDirectories.clear();
+      FileNames.clear();
+    }
+  };
+
+  // Standard .debug_line state machine structure.
+  struct Row {
+    Row(bool default_is_stmt = false) { reset(default_is_stmt); }
+    /// Called after a row is appended to the matrix.
+    void postAppend();
+    void reset(bool default_is_stmt);
+    void dump(raw_ostream &OS) const;
+
+    static bool orderByAddress(const Row& LHS, const Row& RHS) {
+      return LHS.Address < RHS.Address;
+    }
+
+    // The program-counter value corresponding to a machine instruction
+    // generated by the compiler.
+    uint64_t Address;
+    // An unsigned integer indicating a source line number. Lines are numbered
+    // beginning at 1. The compiler may emit the value 0 in cases where an
+    // instruction cannot be attributed to any source line.
+    uint32_t Line;
+    // An unsigned integer indicating a column number within a source line.
+    // Columns are numbered beginning at 1. The value 0 is reserved to indicate
+    // that a statement begins at the 'left edge' of the line.
+    uint16_t Column;
+    // An unsigned integer indicating the identity of the source file
+    // corresponding to a machine instruction.
+    uint16_t File;
+    // An unsigned integer whose value encodes the applicable instruction set
+    // architecture for the current instruction.
+    uint8_t Isa;
+    // A boolean indicating that the current instruction is the beginning of a
+    // statement.
+    uint8_t IsStmt:1,
+            // A boolean indicating that the current instruction is the
+            // beginning of a basic block.
+            BasicBlock:1,
+            // A boolean indicating that the current address is that of the
+            // first byte after the end of a sequence of target machine
+            // instructions.
+            EndSequence:1,
+            // A boolean indicating that the current address is one (of possibly
+            // many) where execution should be suspended for an entry breakpoint
+            // of a function.
+            PrologueEnd:1,
+            // A boolean indicating that the current address is one (of possibly
+            // many) where execution should be suspended for an exit breakpoint
+            // of a function.
+            EpilogueBegin:1;
+  };
+
+  // Represents a series of contiguous machine instructions. Line table for each
+  // compilation unit may consist of multiple sequences, which are not
+  // guaranteed to be in the order of ascending instruction address.
+  struct Sequence {
+    // Sequence describes instructions at address range [LowPC, HighPC)
+    // and is described by line table rows [FirstRowIndex, LastRowIndex).
+    uint64_t LowPC;
+    uint64_t HighPC;
+    unsigned FirstRowIndex;
+    unsigned LastRowIndex;
+    bool Empty;
+
+    Sequence() { reset(); }
+    void reset() {
+      LowPC = 0;
+      HighPC = 0;
+      FirstRowIndex = 0;
+      LastRowIndex = 0;
+      Empty = true;
+    }
+    static bool orderByLowPC(const Sequence& LHS, const Sequence& RHS) {
+      return LHS.LowPC < RHS.LowPC;
+    }
+    bool isValid() const {
+      return !Empty && (LowPC < HighPC) && (FirstRowIndex < LastRowIndex);
+    }
+    bool containsPC(uint64_t pc) const {
+      return (LowPC <= pc && pc < HighPC);
+    }
+  };
+
+  struct LineTable {
+    void appendRow(const DWARFDebugLine::Row &state) { Rows.push_back(state); }
+    void appendSequence(const DWARFDebugLine::Sequence &sequence) {
+      Sequences.push_back(sequence);
+    }
+    void clear() {
+      Prologue.clear();
+      Rows.clear();
+      Sequences.clear();
+    }
+
+    // Returns the index of the row with file/line info for a given address,
+    // or -1 if there is no such row.
+    uint32_t lookupAddress(uint64_t address) const;
+
+    bool lookupAddressRange(uint64_t address,
+                            uint64_t size, 
+                            std::vector<uint32_t>& result) const;
+
+    // Extracts filename by its index in filename table in prologue.
+    // Returns true on success.
+    bool getFileNameByIndex(uint64_t FileIndex,
+                            bool NeedsAbsoluteFilePath,
+                            std::string &Result) const;
+
+    void dump(raw_ostream &OS) const;
+
+    struct Prologue Prologue;
+    typedef std::vector<Row> RowVector;
+    typedef RowVector::const_iterator RowIter;
+    typedef std::vector<Sequence> SequenceVector;
+    typedef SequenceVector::const_iterator SequenceIter;
+    RowVector Rows;
+    SequenceVector Sequences;
+  };
+
+  struct State : public Row, public Sequence, public LineTable {
+    // Special row codes.
+    enum {
+      StartParsingLineTable = 0,
+      DoneParsingLineTable = -1
+    };
+
+    State() : row(StartParsingLineTable) {}
+    virtual ~State();
+
+    virtual void appendRowToMatrix(uint32_t offset);
+    virtual void finalize();
+    virtual void reset() {
+      Row::reset(Prologue.DefaultIsStmt);
+      Sequence::reset();
+    }
+
+    // The row number that starts at zero for the prologue, and increases for
+    // each row added to the matrix.
+    unsigned row;
+  };
+
+  struct DumpingState : public State {
+    DumpingState(raw_ostream &OS) : OS(OS) {}
+    virtual ~DumpingState();
+    virtual void finalize();
+  private:
+    raw_ostream &OS;
+  };
+
+  static bool parsePrologue(DataExtractor debug_line_data, uint32_t *offset_ptr,
+                            Prologue *prologue);
+  /// Parse a single line table (prologue and all rows).
+  static bool parseStatementTable(DataExtractor debug_line_data,
+                                  const RelocAddrMap *RMap,
+                                  uint32_t *offset_ptr, State &state);
+
+  const LineTable *getLineTable(uint32_t offset) const;
+  const LineTable *getOrParseLineTable(DataExtractor debug_line_data,
+                                       uint32_t offset);
+
+private:
+  typedef std::map<uint32_t, LineTable> LineTableMapTy;
+  typedef LineTableMapTy::iterator LineTableIter;
+  typedef LineTableMapTy::const_iterator LineTableConstIter;
+
+  const RelocAddrMap *RelocMap;
+  LineTableMapTy LineTableMap;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugRangeList.cpp b/contrib/llvm/lib/DebugInfo/DWARFDebugRangeList.cpp
new file mode 100644
index 000000000000..1806beee7285
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugRangeList.cpp
@@ -0,0 +1,67 @@
+//===-- DWARFDebugRangesList.cpp ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugRangeList.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void DWARFDebugRangeList::clear() {
+  Offset = -1U;
+  AddressSize = 0;
+  Entries.clear();
+}
+
+bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr) {
+  clear();
+  if (!data.isValidOffset(*offset_ptr))
+    return false;
+  AddressSize = data.getAddressSize();
+  if (AddressSize != 4 && AddressSize != 8)
+    return false;
+  Offset = *offset_ptr;
+  while (true) {
+    RangeListEntry entry;
+    uint32_t prev_offset = *offset_ptr;
+    entry.StartAddress = data.getAddress(offset_ptr);
+    entry.EndAddress = data.getAddress(offset_ptr);
+    // Check that both values were extracted correctly.
+    if (*offset_ptr != prev_offset + 2 * AddressSize) {
+      clear();
+      return false;
+    }
+    if (entry.isEndOfListEntry())
+      break;
+    Entries.push_back(entry);
+  }
+  return true;
+}
+
+void DWARFDebugRangeList::dump(raw_ostream &OS) const {
+  for (int i = 0, n = Entries.size(); i != n; ++i) {
+    const char *format_str = (AddressSize == 4
+                              ? "%08x %08"  PRIx64 " %08"  PRIx64 "\n"
+                              : "%08x %016" PRIx64 " %016" PRIx64 "\n");
+    OS << format(format_str, Offset, Entries[i].StartAddress,
+                                     Entries[i].EndAddress);
+  }
+  OS << format("%08x <End of list>\n", Offset);
+}
+
+bool DWARFDebugRangeList::containsAddress(uint64_t BaseAddress,
+                                          uint64_t Address) const {
+  for (int i = 0, n = Entries.size(); i != n; ++i) {
+    if (Entries[i].isBaseAddressSelectionEntry(AddressSize))
+      BaseAddress = Entries[i].EndAddress;
+    else if (Entries[i].containsAddress(BaseAddress, Address))
+      return true;
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFDebugRangeList.h b/contrib/llvm/lib/DebugInfo/DWARFDebugRangeList.h
new file mode 100644
index 000000000000..4e34a916f4a3
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFDebugRangeList.h
@@ -0,0 +1,78 @@
+//===-- DWARFDebugRangeList.h -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
+#define LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
+
+#include "llvm/Support/DataExtractor.h"
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugRangeList {
+public:
+  struct RangeListEntry {
+    // A beginning address offset. This address offset has the size of an
+    // address and is relative to the applicable base address of the
+    // compilation unit referencing this range list. It marks the beginning
+    // of an address range.
+    uint64_t StartAddress;
+    // An ending address offset. This address offset again has the size of
+    // an address and is relative to the applicable base address of the
+    // compilation unit referencing this range list. It marks the first
+    // address past the end of the address range. The ending address must
+    // be greater than or equal to the beginning address.
+    uint64_t EndAddress;
+    // The end of any given range list is marked by an end of list entry,
+    // which consists of a 0 for the beginning address offset
+    // and a 0 for the ending address offset.
+    bool isEndOfListEntry() const {
+      return (StartAddress == 0) && (EndAddress == 0);
+    }
+    // A base address selection entry consists of:
+    // 1. The value of the largest representable address offset
+    // (for example, 0xffffffff when the size of an address is 32 bits).
+    // 2. An address, which defines the appropriate base address for
+    // use in interpreting the beginning and ending address offsets of
+    // subsequent entries of the location list.
+    bool isBaseAddressSelectionEntry(uint8_t AddressSize) const {
+      assert(AddressSize == 4 || AddressSize == 8);
+      if (AddressSize == 4)
+        return StartAddress == -1U;
+      else
+        return StartAddress == -1ULL;
+    }
+    bool containsAddress(uint64_t BaseAddress, uint64_t Address) const {
+      return (BaseAddress + StartAddress <= Address) &&
+             (Address < BaseAddress + EndAddress);
+    }
+  };
+
+private:
+  // Offset in .debug_ranges section.
+  uint32_t Offset;
+  uint8_t AddressSize;
+  std::vector<RangeListEntry> Entries;
+
+public:
+  DWARFDebugRangeList() { clear(); }
+  void clear();
+  void dump(raw_ostream &OS) const;
+  bool extract(DataExtractor data, uint32_t *offset_ptr);
+  /// containsAddress - Returns true if range list contains the given
+  /// address. Has to be passed base address of the compile unit that
+  /// references this range list.
+  bool containsAddress(uint64_t BaseAddress, uint64_t Address) const;
+};
+
+}  // namespace llvm
+
+#endif  // LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
diff --git a/contrib/llvm/lib/DebugInfo/DWARFFormValue.cpp b/contrib/llvm/lib/DebugInfo/DWARFFormValue.cpp
new file mode 100644
index 000000000000..9f807aac5fd4
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFFormValue.cpp
@@ -0,0 +1,534 @@
+//===-- DWARFFormValue.cpp ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFFormValue.h"
+#include "DWARFCompileUnit.h"
+#include "DWARFContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+using namespace llvm;
+using namespace dwarf;
+
+static const uint8_t form_sizes_addr4[] = {
+  0, // 0x00 unused
+  4, // 0x01 DW_FORM_addr
+  0, // 0x02 unused
+  0, // 0x03 DW_FORM_block2
+  0, // 0x04 DW_FORM_block4
+  2, // 0x05 DW_FORM_data2
+  4, // 0x06 DW_FORM_data4
+  8, // 0x07 DW_FORM_data8
+  0, // 0x08 DW_FORM_string
+  0, // 0x09 DW_FORM_block
+  0, // 0x0a DW_FORM_block1
+  1, // 0x0b DW_FORM_data1
+  1, // 0x0c DW_FORM_flag
+  0, // 0x0d DW_FORM_sdata
+  4, // 0x0e DW_FORM_strp
+  0, // 0x0f DW_FORM_udata
+  4, // 0x10 DW_FORM_ref_addr
+  1, // 0x11 DW_FORM_ref1
+  2, // 0x12 DW_FORM_ref2
+  4, // 0x13 DW_FORM_ref4
+  8, // 0x14 DW_FORM_ref8
+  0, // 0x15 DW_FORM_ref_udata
+  0, // 0x16 DW_FORM_indirect
+  4, // 0x17 DW_FORM_sec_offset
+  0, // 0x18 DW_FORM_exprloc
+  0, // 0x19 DW_FORM_flag_present
+  8, // 0x20 DW_FORM_ref_sig8
+};
+
+static const uint8_t form_sizes_addr8[] = {
+  0, // 0x00 unused
+  8, // 0x01 DW_FORM_addr
+  0, // 0x02 unused
+  0, // 0x03 DW_FORM_block2
+  0, // 0x04 DW_FORM_block4
+  2, // 0x05 DW_FORM_data2
+  4, // 0x06 DW_FORM_data4
+  8, // 0x07 DW_FORM_data8
+  0, // 0x08 DW_FORM_string
+  0, // 0x09 DW_FORM_block
+  0, // 0x0a DW_FORM_block1
+  1, // 0x0b DW_FORM_data1
+  1, // 0x0c DW_FORM_flag
+  0, // 0x0d DW_FORM_sdata
+  4, // 0x0e DW_FORM_strp
+  0, // 0x0f DW_FORM_udata
+  8, // 0x10 DW_FORM_ref_addr
+  1, // 0x11 DW_FORM_ref1
+  2, // 0x12 DW_FORM_ref2
+  4, // 0x13 DW_FORM_ref4
+  8, // 0x14 DW_FORM_ref8
+  0, // 0x15 DW_FORM_ref_udata
+  0, // 0x16 DW_FORM_indirect
+  4, // 0x17 DW_FORM_sec_offset
+  0, // 0x18 DW_FORM_exprloc
+  0, // 0x19 DW_FORM_flag_present
+  8, // 0x20 DW_FORM_ref_sig8
+};
+
+const uint8_t *
+DWARFFormValue::getFixedFormSizesForAddressSize(uint8_t addr_size) {
+  switch (addr_size) {
+  case 4: return form_sizes_addr4;
+  case 8: return form_sizes_addr8;
+  }
+  return NULL;
+}
+
+bool
+DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
+                             const DWARFCompileUnit *cu) {
+  bool indirect = false;
+  bool is_block = false;
+  Value.data = NULL;
+  // Read the value for the form into value and follow and DW_FORM_indirect
+  // instances we run into
+  do {
+    indirect = false;
+    switch (Form) {
+    case DW_FORM_addr:
+    case DW_FORM_ref_addr: {
+      RelocAddrMap::const_iterator AI
+        = cu->getRelocMap()->find(*offset_ptr);
+      if (AI != cu->getRelocMap()->end()) {
+        const std::pair<uint8_t, int64_t> &R = AI->second;
+        Value.uval = data.getUnsigned(offset_ptr, cu->getAddressByteSize()) +
+                     R.second;
+      } else
+        Value.uval = data.getUnsigned(offset_ptr, cu->getAddressByteSize());
+      break;
+    }
+    case DW_FORM_exprloc:
+    case DW_FORM_block:
+      Value.uval = data.getULEB128(offset_ptr);
+      is_block = true;
+      break;
+    case DW_FORM_block1:
+      Value.uval = data.getU8(offset_ptr);
+      is_block = true;
+      break;
+    case DW_FORM_block2:
+      Value.uval = data.getU16(offset_ptr);
+      is_block = true;
+      break;
+    case DW_FORM_block4:
+      Value.uval = data.getU32(offset_ptr);
+      is_block = true;
+      break;
+    case DW_FORM_data1:
+    case DW_FORM_ref1:
+    case DW_FORM_flag:
+      Value.uval = data.getU8(offset_ptr);
+      break;
+    case DW_FORM_data2:
+    case DW_FORM_ref2:
+      Value.uval = data.getU16(offset_ptr);
+      break;
+    case DW_FORM_data4:
+    case DW_FORM_ref4:
+      Value.uval = data.getU32(offset_ptr);
+      break;
+    case DW_FORM_data8:
+    case DW_FORM_ref8:
+      Value.uval = data.getU64(offset_ptr);
+      break;
+    case DW_FORM_sdata:
+      Value.sval = data.getSLEB128(offset_ptr);
+      break;
+    case DW_FORM_strp: {
+      RelocAddrMap::const_iterator AI
+        = cu->getRelocMap()->find(*offset_ptr);
+      if (AI != cu->getRelocMap()->end()) {
+        const std::pair<uint8_t, int64_t> &R = AI->second;
+        Value.uval = data.getU32(offset_ptr) + R.second;
+      } else
+        Value.uval = data.getU32(offset_ptr);
+      break;
+    }
+    case DW_FORM_udata:
+    case DW_FORM_ref_udata:
+      Value.uval = data.getULEB128(offset_ptr);
+      break;
+    case DW_FORM_string:
+      Value.cstr = data.getCStr(offset_ptr);
+      // Set the string value to also be the data for inlined cstr form
+      // values only so we can tell the differnence between DW_FORM_string
+      // and DW_FORM_strp form values
+      Value.data = (const uint8_t*)Value.cstr;
+      break;
+    case DW_FORM_indirect:
+      Form = data.getULEB128(offset_ptr);
+      indirect = true;
+      break;
+    case DW_FORM_sec_offset:
+      // FIXME: This is 64-bit for DWARF64.
+      Value.uval = data.getU32(offset_ptr);
+      break;
+    case DW_FORM_flag_present:
+      Value.uval = 1;
+      break;
+    case DW_FORM_ref_sig8:
+      Value.uval = data.getU64(offset_ptr);
+      break;
+    case DW_FORM_GNU_addr_index:
+      Value.uval = data.getULEB128(offset_ptr);
+      break;
+    case DW_FORM_GNU_str_index:
+      Value.uval = data.getULEB128(offset_ptr);
+      break;
+    default:
+      return false;
+    }
+  } while (indirect);
+
+  if (is_block) {
+    StringRef str = data.getData().substr(*offset_ptr, Value.uval);
+    Value.data = NULL;
+    if (!str.empty()) {
+      Value.data = reinterpret_cast<const uint8_t *>(str.data());
+      *offset_ptr += Value.uval;
+    }
+  }
+
+  return true;
+}
+
+bool
+DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr,
+                          const DWARFCompileUnit *cu) const {
+  return DWARFFormValue::skipValue(Form, debug_info_data, offset_ptr, cu);
+}
+
+bool
+DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
+                          uint32_t *offset_ptr, const DWARFCompileUnit *cu) {
+  bool indirect = false;
+  do {
+    indirect = false;
+    switch (form) {
+    // Blocks if inlined data that have a length field and the data bytes
+    // inlined in the .debug_info
+    case DW_FORM_exprloc:
+    case DW_FORM_block: {
+      uint64_t size = debug_info_data.getULEB128(offset_ptr);
+      *offset_ptr += size;
+      return true;
+    }
+    case DW_FORM_block1: {
+      uint8_t size = debug_info_data.getU8(offset_ptr);
+      *offset_ptr += size;
+      return true;
+    }
+    case DW_FORM_block2: {
+      uint16_t size = debug_info_data.getU16(offset_ptr);
+      *offset_ptr += size;
+      return true;
+    }
+    case DW_FORM_block4: {
+      uint32_t size = debug_info_data.getU32(offset_ptr);
+      *offset_ptr += size;
+      return true;
+    }
+
+    // Inlined NULL terminated C-strings
+    case DW_FORM_string:
+      debug_info_data.getCStr(offset_ptr);
+      return true;
+
+    // Compile unit address sized values
+    case DW_FORM_addr:
+    case DW_FORM_ref_addr:
+      *offset_ptr += cu->getAddressByteSize();
+      return true;
+
+    // 0 byte values - implied from the form.
+    case DW_FORM_flag_present:
+      return true;
+
+    // 1 byte values
+    case DW_FORM_data1:
+    case DW_FORM_flag:
+    case DW_FORM_ref1:
+      *offset_ptr += 1;
+      return true;
+
+    // 2 byte values
+    case DW_FORM_data2:
+    case DW_FORM_ref2:
+      *offset_ptr += 2;
+      return true;
+
+    // 4 byte values
+    case DW_FORM_strp:
+    case DW_FORM_data4:
+    case DW_FORM_ref4:
+      *offset_ptr += 4;
+      return true;
+
+    // 8 byte values
+    case DW_FORM_data8:
+    case DW_FORM_ref8:
+    case DW_FORM_ref_sig8:
+      *offset_ptr += 8;
+      return true;
+
+    // signed or unsigned LEB 128 values
+    //  case DW_FORM_APPLE_db_str:
+    case DW_FORM_sdata:
+    case DW_FORM_udata:
+    case DW_FORM_ref_udata:
+    case DW_FORM_GNU_str_index:
+    case DW_FORM_GNU_addr_index:
+      debug_info_data.getULEB128(offset_ptr);
+      return true;
+
+    case DW_FORM_indirect:
+      indirect = true;
+      form = debug_info_data.getULEB128(offset_ptr);
+      break;
+
+    // FIXME: 4 for DWARF32, 8 for DWARF64.
+    case DW_FORM_sec_offset:
+      *offset_ptr += 4;
+      return true;
+
+    default:
+      return false;
+    }
+  } while (indirect);
+  return true;
+}
+
+void
+DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
+  DataExtractor debug_str_data(cu->getStringSection(), true, 0);
+  DataExtractor debug_str_offset_data(cu->getStringOffsetSection(), true, 0);
+  uint64_t uvalue = getUnsigned();
+  bool cu_relative_offset = false;
+
+  switch (Form) {
+  case DW_FORM_addr:      OS << format("0x%016" PRIx64, uvalue); break;
+  case DW_FORM_GNU_addr_index: {
+    StringRef AddrOffsetSec = cu->getAddrOffsetSection();
+    OS << format(" indexed (%8.8x) address = ", (uint32_t)uvalue);
+    if (AddrOffsetSec.size() != 0) {
+      DataExtractor DA(AddrOffsetSec, true, cu->getAddressByteSize());
+      OS << format("0x%016" PRIx64, getIndirectAddress(&DA, cu));
+    } else
+      OS << "<no .debug_addr section>";
+    break;
+  }
+  case DW_FORM_flag_present: OS << "true"; break;
+  case DW_FORM_flag:
+  case DW_FORM_data1:     OS << format("0x%02x", (uint8_t)uvalue); break;
+  case DW_FORM_data2:     OS << format("0x%04x", (uint16_t)uvalue); break;
+  case DW_FORM_data4:     OS << format("0x%08x", (uint32_t)uvalue); break;
+  case DW_FORM_ref_sig8:
+  case DW_FORM_data8:     OS << format("0x%016" PRIx64, uvalue); break;
+  case DW_FORM_string:
+    OS << '"';
+    OS.write_escaped(getAsCString(NULL));
+    OS << '"';
+    break;
+  case DW_FORM_exprloc:
+  case DW_FORM_block:
+  case DW_FORM_block1:
+  case DW_FORM_block2:
+  case DW_FORM_block4:
+    if (uvalue > 0) {
+      switch (Form) {
+      case DW_FORM_exprloc:
+      case DW_FORM_block:  OS << format("<0x%" PRIx64 "> ", uvalue);     break;
+      case DW_FORM_block1: OS << format("<0x%2.2x> ", (uint8_t)uvalue);  break;
+      case DW_FORM_block2: OS << format("<0x%4.4x> ", (uint16_t)uvalue); break;
+      case DW_FORM_block4: OS << format("<0x%8.8x> ", (uint32_t)uvalue); break;
+      default: break;
+      }
+
+      const uint8_t* data_ptr = Value.data;
+      if (data_ptr) {
+        // uvalue contains size of block
+        const uint8_t* end_data_ptr = data_ptr + uvalue;
+        while (data_ptr < end_data_ptr) {
+          OS << format("%2.2x ", *data_ptr);
+          ++data_ptr;
+        }
+      }
+      else
+        OS << "NULL";
+    }
+    break;
+
+  case DW_FORM_sdata:     OS << getSigned();   break;
+  case DW_FORM_udata:     OS << getUnsigned(); break;
+  case DW_FORM_strp: {
+    OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
+    const char* dbg_str = getAsCString(&debug_str_data);
+    if (dbg_str) {
+      OS << '"';
+      OS.write_escaped(dbg_str);
+      OS << '"';
+    }
+    break;
+  }
+  case DW_FORM_GNU_str_index: {
+    OS << format(" indexed (%8.8x) string = ", (uint32_t)uvalue);
+    const char *dbg_str = getIndirectCString(&debug_str_data,
+                                             &debug_str_offset_data);
+    if (dbg_str) {
+      OS << '"';
+      OS.write_escaped(dbg_str);
+      OS << '"';
+    }
+    break;
+  }
+  case DW_FORM_ref_addr:
+    OS << format("0x%016" PRIx64, uvalue);
+    break;
+  case DW_FORM_ref1:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%2.2x", (uint8_t)uvalue);
+    break;
+  case DW_FORM_ref2:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%4.4x", (uint16_t)uvalue);
+    break;
+  case DW_FORM_ref4:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%4.4x", (uint32_t)uvalue);
+    break;
+  case DW_FORM_ref8:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%8.8" PRIx64, uvalue);
+    break;
+  case DW_FORM_ref_udata:
+    cu_relative_offset = true;
+    OS << format("cu + 0x%" PRIx64, uvalue);
+    break;
+
+    // All DW_FORM_indirect attributes should be resolved prior to calling
+    // this function
+  case DW_FORM_indirect:
+    OS << "DW_FORM_indirect";
+    break;
+
+    // Should be formatted to 64-bit for DWARF64.
+  case DW_FORM_sec_offset:
+    OS << format("0x%08x", (uint32_t)uvalue);
+    break;
+
+  default:
+    OS << format("DW_FORM(0x%4.4x)", Form);
+    break;
+  }
+
+  if (cu_relative_offset)
+    OS << format(" => {0x%8.8" PRIx64 "}", uvalue + (cu ? cu->getOffset() : 0));
+}
+
+const char*
+DWARFFormValue::getAsCString(const DataExtractor *debug_str_data_ptr) const {
+  if (isInlinedCStr()) {
+    return Value.cstr;
+  } else if (debug_str_data_ptr) {
+    uint32_t offset = Value.uval;
+    return debug_str_data_ptr->getCStr(&offset);
+  }
+  return NULL;
+}
+
+const char*
+DWARFFormValue::getIndirectCString(const DataExtractor *DS,
+                                   const DataExtractor *DSO) const {
+  if (!DS || !DSO) return NULL;
+
+  uint32_t offset = Value.uval * 4;
+  uint32_t soffset = DSO->getU32(&offset);
+  return DS->getCStr(&soffset);
+}
+
+uint64_t
+DWARFFormValue::getIndirectAddress(const DataExtractor *DA,
+                                   const DWARFCompileUnit *cu) const {
+  if (!DA) return 0;
+
+  uint32_t offset = Value.uval * cu->getAddressByteSize();
+  return DA->getAddress(&offset);
+}
+
+uint64_t DWARFFormValue::getReference(const DWARFCompileUnit *cu) const {
+  uint64_t die_offset = Value.uval;
+  switch (Form) {
+  case DW_FORM_ref1:
+  case DW_FORM_ref2:
+  case DW_FORM_ref4:
+  case DW_FORM_ref8:
+  case DW_FORM_ref_udata:
+      die_offset += (cu ? cu->getOffset() : 0);
+      break;
+  default:
+      break;
+  }
+
+  return die_offset;
+}
+
+bool
+DWARFFormValue::resolveCompileUnitReferences(const DWARFCompileUnit *cu) {
+  switch (Form) {
+  case DW_FORM_ref1:
+  case DW_FORM_ref2:
+  case DW_FORM_ref4:
+  case DW_FORM_ref8:
+  case DW_FORM_ref_udata:
+    Value.uval += cu->getOffset();
+    Form = DW_FORM_ref_addr;
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+
+const uint8_t *DWARFFormValue::BlockData() const {
+  if (!isInlinedCStr())
+    return Value.data;
+  return NULL;
+}
+
+bool DWARFFormValue::isBlockForm(uint16_t form) {
+  switch (form) {
+  case DW_FORM_exprloc:
+  case DW_FORM_block:
+  case DW_FORM_block1:
+  case DW_FORM_block2:
+  case DW_FORM_block4:
+    return true;
+  }
+  return false;
+}
+
+bool DWARFFormValue::isDataForm(uint16_t form) {
+  switch (form) {
+  case DW_FORM_sdata:
+  case DW_FORM_udata:
+  case DW_FORM_data1:
+  case DW_FORM_data2:
+  case DW_FORM_data4:
+  case DW_FORM_data8:
+    return true;
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/DebugInfo/DWARFFormValue.h b/contrib/llvm/lib/DebugInfo/DWARFFormValue.h
new file mode 100644
index 000000000000..b863001e4af8
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFFormValue.h
@@ -0,0 +1,82 @@
+//===-- DWARFFormValue.h ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFFORMVALUE_H
+#define LLVM_DEBUGINFO_DWARFFORMVALUE_H
+
+#include "llvm/Support/DataExtractor.h"
+
+namespace llvm {
+
+class DWARFCompileUnit;
+class raw_ostream;
+
+class DWARFFormValue {
+public:
+  struct ValueType {
+    ValueType() : data(NULL) {
+      uval = 0;
+    }
+
+    union {
+      uint64_t uval;
+      int64_t sval;
+      const char* cstr;
+    };
+    const uint8_t* data;
+  };
+
+  enum {
+    eValueTypeInvalid = 0,
+    eValueTypeUnsigned,
+    eValueTypeSigned,
+    eValueTypeCStr,
+    eValueTypeBlock
+  };
+
+private:
+  uint16_t Form;   // Form for this value.
+  ValueType Value; // Contains all data for the form.
+
+public:
+  DWARFFormValue(uint16_t form = 0) : Form(form) {}
+  uint16_t getForm() const { return Form; }
+  const ValueType& value() const { return Value; }
+  void dump(raw_ostream &OS, const DWARFCompileUnit* cu) const;
+  bool extractValue(DataExtractor data, uint32_t *offset_ptr,
+                    const DWARFCompileUnit *cu);
+  bool isInlinedCStr() const {
+    return Value.data != NULL && Value.data == (const uint8_t*)Value.cstr;
+  }
+  const uint8_t *BlockData() const;
+  uint64_t getReference(const DWARFCompileUnit* cu) const;
+
+  /// Resolve any compile unit specific references so that we don't need
+  /// the compile unit at a later time in order to work with the form
+  /// value.
+  bool resolveCompileUnitReferences(const DWARFCompileUnit* cu);
+  uint64_t getUnsigned() const { return Value.uval; }
+  int64_t getSigned() const { return Value.sval; }
+  const char *getAsCString(const DataExtractor *debug_str_data_ptr) const;
+  const char *getIndirectCString(const DataExtractor *,
+                                 const DataExtractor *) const;
+  uint64_t getIndirectAddress(const DataExtractor *,
+                              const DWARFCompileUnit *) const;
+  bool skipValue(DataExtractor debug_info_data, uint32_t *offset_ptr,
+                 const DWARFCompileUnit *cu) const;
+  static bool skipValue(uint16_t form, DataExtractor debug_info_data,
+                        uint32_t *offset_ptr, const DWARFCompileUnit *cu);
+  static bool isBlockForm(uint16_t form);
+  static bool isDataForm(uint16_t form);
+  static const uint8_t *getFixedFormSizesForAddressSize(uint8_t addr_size);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/DebugInfo/DWARFRelocMap.h b/contrib/llvm/lib/DebugInfo/DWARFRelocMap.h
new file mode 100644
index 000000000000..6929e367b84c
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARFRelocMap.h
@@ -0,0 +1,22 @@
+//===-- DWARFRelocMap.h -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFRELOCMAP_H
+#define LLVM_DEBUGINFO_DWARFRELOCMAP_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+
+typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t> > RelocAddrMap;
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARFRELOCMAP_H
+
diff --git a/contrib/llvm/lib/ExecutionEngine/EventListenerCommon.h b/contrib/llvm/lib/ExecutionEngine/EventListenerCommon.h
new file mode 100644
index 000000000000..314db8bd84c2
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/EventListenerCommon.h
@@ -0,0 +1,67 @@
+//===-- JIT.h - Abstract Execution Engine Interface -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common functionality for JITEventListener implementations
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef EVENT_LISTENER_COMMON_H
+#define EVENT_LISTENER_COMMON_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/ValueHandle.h"
+
+namespace llvm {
+
+namespace jitprofiling {
+
+class FilenameCache {
+  // Holds the filename of each Scope, so that we can pass a null-terminated
+  // string into oprofile.  Use an AssertingVH rather than a ValueMap because we
+  // shouldn't be modifying any MDNodes while this map is alive.
+  DenseMap<AssertingVH<MDNode>, std::string> Filenames;
+  DenseMap<AssertingVH<MDNode>, std::string> Paths;
+
+ public:
+  const char *getFilename(MDNode *Scope) {
+    std::string &Filename = Filenames[Scope];
+    if (Filename.empty()) {
+      DIScope DIScope(Scope);
+      Filename = DIScope.getFilename();
+    }
+    return Filename.c_str();
+  }
+
+  const char *getFullPath(MDNode *Scope) {
+    std::string &P = Paths[Scope];
+    if (P.empty()) {
+      DIScope DIScope(Scope);
+      StringRef DirName = DIScope.getDirectory();
+      StringRef FileName = DIScope.getFilename();
+      SmallString<256> FullPath;
+      if (DirName != "." && DirName != "") {
+        FullPath = DirName;
+      }
+      if (FileName != "") {
+        sys::path::append(FullPath, FileName);
+      }
+      P = FullPath.str();
+    }
+    return P.c_str();
+  }
+};
+
+} // namespace jitprofiling
+
+} // namespace llvm
+
+#endif //EVENT_LISTENER_COMMON_H
diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
new file mode 100644
index 000000000000..906a3a3fda7f
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -0,0 +1,1308 @@
+//===-- ExecutionEngine.cpp - Common Implementation shared by EEs ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common interface used by the various execution engine
+// subclasses.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cmath>
+#include <cstring>
+using namespace llvm;
+
+STATISTIC(NumInitBytes, "Number of bytes of global vars initialized");
+STATISTIC(NumGlobals  , "Number of global vars initialized");
+
+ExecutionEngine *(*ExecutionEngine::JITCtor)(
+  Module *M,
+  std::string *ErrorStr,
+  JITMemoryManager *JMM,
+  bool GVsWithCode,
+  TargetMachine *TM) = 0;
+ExecutionEngine *(*ExecutionEngine::MCJITCtor)(
+  Module *M,
+  std::string *ErrorStr,
+  JITMemoryManager *JMM,
+  bool GVsWithCode,
+  TargetMachine *TM) = 0;
+ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M,
+                                                std::string *ErrorStr) = 0;
+
+ExecutionEngine::ExecutionEngine(Module *M)
+  : EEState(*this),
+    LazyFunctionCreator(0),
+    ExceptionTableRegister(0),
+    ExceptionTableDeregister(0) {
+  CompilingLazily         = false;
+  GVCompilationDisabled   = false;
+  SymbolSearchingDisabled = false;
+  Modules.push_back(M);
+  assert(M && "Module is null?");
+}
+
+ExecutionEngine::~ExecutionEngine() {
+  clearAllGlobalMappings();
+  for (unsigned i = 0, e = Modules.size(); i != e; ++i)
+    delete Modules[i];
+}
+
+void ExecutionEngine::DeregisterAllTables() {
+  if (ExceptionTableDeregister) {
+    DenseMap<const Function*, void*>::iterator it = AllExceptionTables.begin();
+    DenseMap<const Function*, void*>::iterator ite = AllExceptionTables.end();
+    for (; it != ite; ++it)
+      ExceptionTableDeregister(it->second);
+    AllExceptionTables.clear();
+  }
+}
+
+namespace {
+/// \brief Helper class which uses a value handler to automatically deletes the
+/// memory block when the GlobalVariable is destroyed.
+class GVMemoryBlock : public CallbackVH {
+  GVMemoryBlock(const GlobalVariable *GV)
+    : CallbackVH(const_cast<GlobalVariable*>(GV)) {}
+
+public:
+  /// \brief Returns the address the GlobalVariable should be written into.  The
+  /// GVMemoryBlock object prefixes that.
+  static char *Create(const GlobalVariable *GV, const DataLayout& TD) {
+    Type *ElTy = GV->getType()->getElementType();
+    size_t GVSize = (size_t)TD.getTypeAllocSize(ElTy);
+    void *RawMemory = ::operator new(
+      DataLayout::RoundUpAlignment(sizeof(GVMemoryBlock),
+                                   TD.getPreferredAlignment(GV))
+      + GVSize);
+    new(RawMemory) GVMemoryBlock(GV);
+    return static_cast<char*>(RawMemory) + sizeof(GVMemoryBlock);
+  }
+
+  virtual void deleted() {
+    // We allocated with operator new and with some extra memory hanging off the
+    // end, so don't just delete this.  I'm not sure if this is actually
+    // required.
+    this->~GVMemoryBlock();
+    ::operator delete(this);
+  }
+};
+}  // anonymous namespace
+
+char *ExecutionEngine::getMemoryForGV(const GlobalVariable *GV) {
+  return GVMemoryBlock::Create(GV, *getDataLayout());
+}
+
+bool ExecutionEngine::removeModule(Module *M) {
+  for(SmallVector<Module *, 1>::iterator I = Modules.begin(),
+        E = Modules.end(); I != E; ++I) {
+    Module *Found = *I;
+    if (Found == M) {
+      Modules.erase(I);
+      clearGlobalMappingsFromModule(M);
+      return true;
+    }
+  }
+  return false;
+}
+
+Function *ExecutionEngine::FindFunctionNamed(const char *FnName) {
+  for (unsigned i = 0, e = Modules.size(); i != e; ++i) {
+    if (Function *F = Modules[i]->getFunction(FnName))
+      return F;
+  }
+  return 0;
+}
+
+
+void *ExecutionEngineState::RemoveMapping(const MutexGuard &,
+                                          const GlobalValue *ToUnmap) {
+  GlobalAddressMapTy::iterator I = GlobalAddressMap.find(ToUnmap);
+  void *OldVal;
+
+  // FIXME: This is silly, we shouldn't end up with a mapping -> 0 in the
+  // GlobalAddressMap.
+  if (I == GlobalAddressMap.end())
+    OldVal = 0;
+  else {
+    OldVal = I->second;
+    GlobalAddressMap.erase(I);
+  }
+
+  GlobalAddressReverseMap.erase(OldVal);
+  return OldVal;
+}
+
+void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) {
+  MutexGuard locked(lock);
+
+  DEBUG(dbgs() << "JIT: Map \'" << GV->getName()
+        << "\' to [" << Addr << "]\n";);
+  void *&CurVal = EEState.getGlobalAddressMap(locked)[GV];
+  assert((CurVal == 0 || Addr == 0) && "GlobalMapping already established!");
+  CurVal = Addr;
+
+  // If we are using the reverse mapping, add it too.
+  if (!EEState.getGlobalAddressReverseMap(locked).empty()) {
+    AssertingVH<const GlobalValue> &V =
+      EEState.getGlobalAddressReverseMap(locked)[Addr];
+    assert((V == 0 || GV == 0) && "GlobalMapping already established!");
+    V = GV;
+  }
+}
+
+void ExecutionEngine::clearAllGlobalMappings() {
+  MutexGuard locked(lock);
+
+  EEState.getGlobalAddressMap(locked).clear();
+  EEState.getGlobalAddressReverseMap(locked).clear();
+}
+
+void ExecutionEngine::clearGlobalMappingsFromModule(Module *M) {
+  MutexGuard locked(lock);
+
+  for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI)
+    EEState.RemoveMapping(locked, FI);
+  for (Module::global_iterator GI = M->global_begin(), GE = M->global_end();
+       GI != GE; ++GI)
+    EEState.RemoveMapping(locked, GI);
+}
+
+void *ExecutionEngine::updateGlobalMapping(const GlobalValue *GV, void *Addr) {
+  MutexGuard locked(lock);
+
+  ExecutionEngineState::GlobalAddressMapTy &Map =
+    EEState.getGlobalAddressMap(locked);
+
+  // Deleting from the mapping?
+  if (Addr == 0)
+    return EEState.RemoveMapping(locked, GV);
+
+  void *&CurVal = Map[GV];
+  void *OldVal = CurVal;
+
+  if (CurVal && !EEState.getGlobalAddressReverseMap(locked).empty())
+    EEState.getGlobalAddressReverseMap(locked).erase(CurVal);
+  CurVal = Addr;
+
+  // If we are using the reverse mapping, add it too.
+  if (!EEState.getGlobalAddressReverseMap(locked).empty()) {
+    AssertingVH<const GlobalValue> &V =
+      EEState.getGlobalAddressReverseMap(locked)[Addr];
+    assert((V == 0 || GV == 0) && "GlobalMapping already established!");
+    V = GV;
+  }
+  return OldVal;
+}
+
+void *ExecutionEngine::getPointerToGlobalIfAvailable(const GlobalValue *GV) {
+  MutexGuard locked(lock);
+
+  ExecutionEngineState::GlobalAddressMapTy::iterator I =
+    EEState.getGlobalAddressMap(locked).find(GV);
+  return I != EEState.getGlobalAddressMap(locked).end() ? I->second : 0;
+}
+
+const GlobalValue *ExecutionEngine::getGlobalValueAtAddress(void *Addr) {
+  MutexGuard locked(lock);
+
+  // If we haven't computed the reverse mapping yet, do so first.
+  if (EEState.getGlobalAddressReverseMap(locked).empty()) {
+    for (ExecutionEngineState::GlobalAddressMapTy::iterator
+         I = EEState.getGlobalAddressMap(locked).begin(),
+         E = EEState.getGlobalAddressMap(locked).end(); I != E; ++I)
+      EEState.getGlobalAddressReverseMap(locked).insert(std::make_pair(
+                                                          I->second, I->first));
+  }
+
+  std::map<void *, AssertingVH<const GlobalValue> >::iterator I =
+    EEState.getGlobalAddressReverseMap(locked).find(Addr);
+  return I != EEState.getGlobalAddressReverseMap(locked).end() ? I->second : 0;
+}
+
+namespace {
+class ArgvArray {
+  char *Array;
+  std::vector<char*> Values;
+public:
+  ArgvArray() : Array(NULL) {}
+  ~ArgvArray() { clear(); }
+  void clear() {
+    delete[] Array;
+    Array = NULL;
+    for (size_t I = 0, E = Values.size(); I != E; ++I) {
+      delete[] Values[I];
+    }
+    Values.clear();
+  }
+  /// Turn a vector of strings into a nice argv style array of pointers to null
+  /// terminated strings.
+  void *reset(LLVMContext &C, ExecutionEngine *EE,
+              const std::vector<std::string> &InputArgv);
+};
+}  // anonymous namespace
+void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE,
+                       const std::vector<std::string> &InputArgv) {
+  clear();  // Free the old contents.
+  unsigned PtrSize = EE->getDataLayout()->getPointerSize();
+  Array = new char[(InputArgv.size()+1)*PtrSize];
+
+  DEBUG(dbgs() << "JIT: ARGV = " << (void*)Array << "\n");
+  Type *SBytePtr = Type::getInt8PtrTy(C);
+
+  for (unsigned i = 0; i != InputArgv.size(); ++i) {
+    unsigned Size = InputArgv[i].size()+1;
+    char *Dest = new char[Size];
+    Values.push_back(Dest);
+    DEBUG(dbgs() << "JIT: ARGV[" << i << "] = " << (void*)Dest << "\n");
+
+    std::copy(InputArgv[i].begin(), InputArgv[i].end(), Dest);
+    Dest[Size-1] = 0;
+
+    // Endian safe: Array[i] = (PointerTy)Dest;
+    EE->StoreValueToMemory(PTOGV(Dest), (GenericValue*)(Array+i*PtrSize),
+                           SBytePtr);
+  }
+
+  // Null terminate it
+  EE->StoreValueToMemory(PTOGV(0),
+                         (GenericValue*)(Array+InputArgv.size()*PtrSize),
+                         SBytePtr);
+  return Array;
+}
+
+void ExecutionEngine::runStaticConstructorsDestructors(Module *module,
+                                                       bool isDtors) {
+  const char *Name = isDtors ? "llvm.global_dtors" : "llvm.global_ctors";
+  GlobalVariable *GV = module->getNamedGlobal(Name);
+
+  // If this global has internal linkage, or if it has a use, then it must be
+  // an old-style (llvmgcc3) static ctor with __main linked in and in use.  If
+  // this is the case, don't execute any of the global ctors, __main will do
+  // it.
+  if (!GV || GV->isDeclaration() || GV->hasLocalLinkage()) return;
+
+  // Should be an array of '{ i32, void ()* }' structs.  The first value is
+  // the init priority, which we ignore.
+  ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (InitList == 0)
+    return;
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+    ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i));
+    if (CS == 0) continue;
+
+    Constant *FP = CS->getOperand(1);
+    if (FP->isNullValue())
+      continue;  // Found a sentinal value, ignore.
+
+    // Strip off constant expression casts.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP))
+      if (CE->isCast())
+        FP = CE->getOperand(0);
+
+    // Execute the ctor/dtor function!
+    if (Function *F = dyn_cast<Function>(FP))
+      runFunction(F, std::vector<GenericValue>());
+
+    // FIXME: It is marginally lame that we just do nothing here if we see an
+    // entry we don't recognize. It might not be unreasonable for the verifier
+    // to not even allow this and just assert here.
+  }
+}
+
+void ExecutionEngine::runStaticConstructorsDestructors(bool isDtors) {
+  // Execute global ctors/dtors for each module in the program.
+  for (unsigned i = 0, e = Modules.size(); i != e; ++i)
+    runStaticConstructorsDestructors(Modules[i], isDtors);
+}
+
+#ifndef NDEBUG
+/// isTargetNullPtr - Return whether the target pointer stored at Loc is null.
+static bool isTargetNullPtr(ExecutionEngine *EE, void *Loc) {
+  unsigned PtrSize = EE->getDataLayout()->getPointerSize();
+  for (unsigned i = 0; i < PtrSize; ++i)
+    if (*(i + (uint8_t*)Loc))
+      return false;
+  return true;
+}
+#endif
+
+int ExecutionEngine::runFunctionAsMain(Function *Fn,
+                                       const std::vector<std::string> &argv,
+                                       const char * const * envp) {
+  std::vector<GenericValue> GVArgs;
+  GenericValue GVArgc;
+  GVArgc.IntVal = APInt(32, argv.size());
+
+  // Check main() type
+  unsigned NumArgs = Fn->getFunctionType()->getNumParams();
+  FunctionType *FTy = Fn->getFunctionType();
+  Type* PPInt8Ty = Type::getInt8PtrTy(Fn->getContext())->getPointerTo();
+
+  // Check the argument types.
+  if (NumArgs > 3)
+    report_fatal_error("Invalid number of arguments of main() supplied");
+  if (NumArgs >= 3 && FTy->getParamType(2) != PPInt8Ty)
+    report_fatal_error("Invalid type for third argument of main() supplied");
+  if (NumArgs >= 2 && FTy->getParamType(1) != PPInt8Ty)
+    report_fatal_error("Invalid type for second argument of main() supplied");
+  if (NumArgs >= 1 && !FTy->getParamType(0)->isIntegerTy(32))
+    report_fatal_error("Invalid type for first argument of main() supplied");
+  if (!FTy->getReturnType()->isIntegerTy() &&
+      !FTy->getReturnType()->isVoidTy())
+    report_fatal_error("Invalid return type of main() supplied");
+
+  ArgvArray CArgv;
+  ArgvArray CEnv;
+  if (NumArgs) {
+    GVArgs.push_back(GVArgc); // Arg #0 = argc.
+    if (NumArgs > 1) {
+      // Arg #1 = argv.
+      GVArgs.push_back(PTOGV(CArgv.reset(Fn->getContext(), this, argv)));
+      assert(!isTargetNullPtr(this, GVTOP(GVArgs[1])) &&
+             "argv[0] was null after CreateArgv");
+      if (NumArgs > 2) {
+        std::vector<std::string> EnvVars;
+        for (unsigned i = 0; envp[i]; ++i)
+          EnvVars.push_back(envp[i]);
+        // Arg #2 = envp.
+        GVArgs.push_back(PTOGV(CEnv.reset(Fn->getContext(), this, EnvVars)));
+      }
+    }
+  }
+
+  return runFunction(Fn, GVArgs).IntVal.getZExtValue();
+}
+
+ExecutionEngine *ExecutionEngine::create(Module *M,
+                                         bool ForceInterpreter,
+                                         std::string *ErrorStr,
+                                         CodeGenOpt::Level OptLevel,
+                                         bool GVsWithCode) {
+  EngineBuilder EB =  EngineBuilder(M)
+      .setEngineKind(ForceInterpreter
+                     ? EngineKind::Interpreter
+                     : EngineKind::JIT)
+      .setErrorStr(ErrorStr)
+      .setOptLevel(OptLevel)
+      .setAllocateGVsWithCode(GVsWithCode);
+
+  return EB.create();
+}
+
+/// createJIT - This is the factory method for creating a JIT for the current
+/// machine, it does not fall back to the interpreter.  This takes ownership
+/// of the module.
+ExecutionEngine *ExecutionEngine::createJIT(Module *M,
+                                            std::string *ErrorStr,
+                                            JITMemoryManager *JMM,
+                                            CodeGenOpt::Level OL,
+                                            bool GVsWithCode,
+                                            Reloc::Model RM,
+                                            CodeModel::Model CMM) {
+  if (ExecutionEngine::JITCtor == 0) {
+    if (ErrorStr)
+      *ErrorStr = "JIT has not been linked in.";
+    return 0;
+  }
+
+  // Use the defaults for extra parameters.  Users can use EngineBuilder to
+  // set them.
+  EngineBuilder EB(M);
+  EB.setEngineKind(EngineKind::JIT);
+  EB.setErrorStr(ErrorStr);
+  EB.setRelocationModel(RM);
+  EB.setCodeModel(CMM);
+  EB.setAllocateGVsWithCode(GVsWithCode);
+  EB.setOptLevel(OL);
+  EB.setJITMemoryManager(JMM);
+
+  // TODO: permit custom TargetOptions here
+  TargetMachine *TM = EB.selectTarget();
+  if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0;
+
+  return ExecutionEngine::JITCtor(M, ErrorStr, JMM, GVsWithCode, TM);
+}
+
+ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
+  OwningPtr<TargetMachine> TheTM(TM); // Take ownership.
+
+  // Make sure we can resolve symbols in the program as well. The zero arg
+  // to the function tells DynamicLibrary to load the program, not a library.
+  if (sys::DynamicLibrary::LoadLibraryPermanently(0, ErrorStr))
+    return 0;
+
+  // If the user specified a memory manager but didn't specify which engine to
+  // create, we assume they only want the JIT, and we fail if they only want
+  // the interpreter.
+  if (JMM) {
+    if (WhichEngine & EngineKind::JIT)
+      WhichEngine = EngineKind::JIT;
+    else {
+      if (ErrorStr)
+        *ErrorStr = "Cannot create an interpreter with a memory manager.";
+      return 0;
+    }
+  }
+
+  // Unless the interpreter was explicitly selected or the JIT is not linked,
+  // try making a JIT.
+  if ((WhichEngine & EngineKind::JIT) && TheTM) {
+    Triple TT(M->getTargetTriple());
+    if (!TM->getTarget().hasJIT()) {
+      errs() << "WARNING: This target JIT is not designed for the host"
+             << " you are running.  If bad things happen, please choose"
+             << " a different -march switch.\n";
+    }
+
+    if (UseMCJIT && ExecutionEngine::MCJITCtor) {
+      ExecutionEngine *EE =
+        ExecutionEngine::MCJITCtor(M, ErrorStr, JMM,
+                                   AllocateGVsWithCode, TheTM.take());
+      if (EE) return EE;
+    } else if (ExecutionEngine::JITCtor) {
+      ExecutionEngine *EE =
+        ExecutionEngine::JITCtor(M, ErrorStr, JMM,
+                                 AllocateGVsWithCode, TheTM.take());
+      if (EE) return EE;
+    }
+  }
+
+  // If we can't make a JIT and we didn't request one specifically, try making
+  // an interpreter instead.
+  if (WhichEngine & EngineKind::Interpreter) {
+    if (ExecutionEngine::InterpCtor)
+      return ExecutionEngine::InterpCtor(M, ErrorStr);
+    if (ErrorStr)
+      *ErrorStr = "Interpreter has not been linked in.";
+    return 0;
+  }
+
+  if ((WhichEngine & EngineKind::JIT) && ExecutionEngine::JITCtor == 0 &&
+      ExecutionEngine::MCJITCtor == 0) {
+    if (ErrorStr)
+      *ErrorStr = "JIT has not been linked in.";
+  }
+
+  return 0;
+}
+
+void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) {
+  if (Function *F = const_cast<Function*>(dyn_cast<Function>(GV)))
+    return getPointerToFunction(F);
+
+  MutexGuard locked(lock);
+  if (void *P = EEState.getGlobalAddressMap(locked)[GV])
+    return P;
+
+  // Global variable might have been added since interpreter started.
+  if (GlobalVariable *GVar =
+          const_cast<GlobalVariable *>(dyn_cast<GlobalVariable>(GV)))
+    EmitGlobalVariable(GVar);
+  else
+    llvm_unreachable("Global hasn't had an address allocated yet!");
+
+  return EEState.getGlobalAddressMap(locked)[GV];
+}
+
+/// \brief Converts a Constant* into a GenericValue, including handling of
+/// ConstantExpr values.
+GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
+  // If its undefined, return the garbage.
+  if (isa<UndefValue>(C)) {
+    GenericValue Result;
+    switch (C->getType()->getTypeID()) {
+    default:
+      break;
+    case Type::IntegerTyID:
+    case Type::X86_FP80TyID:
+    case Type::FP128TyID:
+    case Type::PPC_FP128TyID:
+      // Although the value is undefined, we still have to construct an APInt
+      // with the correct bit width.
+      Result.IntVal = APInt(C->getType()->getPrimitiveSizeInBits(), 0);
+      break;
+    case Type::VectorTyID:
+      // if the whole vector is 'undef' just reserve memory for the value.
+      const VectorType* VTy = dyn_cast<VectorType>(C->getType());
+      const Type *ElemTy = VTy->getElementType();
+      unsigned int elemNum = VTy->getNumElements();
+      Result.AggregateVal.resize(elemNum);
+      if (ElemTy->isIntegerTy())
+        for (unsigned int i = 0; i < elemNum; ++i)
+          Result.AggregateVal[i].IntVal = 
+            APInt(ElemTy->getPrimitiveSizeInBits(), 0);
+      break;
+    }
+    return Result;
+  }
+
+  // Otherwise, if the value is a ConstantExpr...
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+    Constant *Op0 = CE->getOperand(0);
+    switch (CE->getOpcode()) {
+    case Instruction::GetElementPtr: {
+      // Compute the index
+      GenericValue Result = getConstantValue(Op0);
+      APInt Offset(TD->getPointerSizeInBits(), 0);
+      cast<GEPOperator>(CE)->accumulateConstantOffset(*TD, Offset);
+
+      char* tmp = (char*) Result.PointerVal;
+      Result = PTOGV(tmp + Offset.getSExtValue());
+      return Result;
+    }
+    case Instruction::Trunc: {
+      GenericValue GV = getConstantValue(Op0);
+      uint32_t BitWidth = cast<IntegerType>(CE->getType())->getBitWidth();
+      GV.IntVal = GV.IntVal.trunc(BitWidth);
+      return GV;
+    }
+    case Instruction::ZExt: {
+      GenericValue GV = getConstantValue(Op0);
+      uint32_t BitWidth = cast<IntegerType>(CE->getType())->getBitWidth();
+      GV.IntVal = GV.IntVal.zext(BitWidth);
+      return GV;
+    }
+    case Instruction::SExt: {
+      GenericValue GV = getConstantValue(Op0);
+      uint32_t BitWidth = cast<IntegerType>(CE->getType())->getBitWidth();
+      GV.IntVal = GV.IntVal.sext(BitWidth);
+      return GV;
+    }
+    case Instruction::FPTrunc: {
+      // FIXME long double
+      GenericValue GV = getConstantValue(Op0);
+      GV.FloatVal = float(GV.DoubleVal);
+      return GV;
+    }
+    case Instruction::FPExt:{
+      // FIXME long double
+      GenericValue GV = getConstantValue(Op0);
+      GV.DoubleVal = double(GV.FloatVal);
+      return GV;
+    }
+    case Instruction::UIToFP: {
+      GenericValue GV = getConstantValue(Op0);
+      if (CE->getType()->isFloatTy())
+        GV.FloatVal = float(GV.IntVal.roundToDouble());
+      else if (CE->getType()->isDoubleTy())
+        GV.DoubleVal = GV.IntVal.roundToDouble();
+      else if (CE->getType()->isX86_FP80Ty()) {
+        APFloat apf = APFloat::getZero(APFloat::x87DoubleExtended);
+        (void)apf.convertFromAPInt(GV.IntVal,
+                                   false,
+                                   APFloat::rmNearestTiesToEven);
+        GV.IntVal = apf.bitcastToAPInt();
+      }
+      return GV;
+    }
+    case Instruction::SIToFP: {
+      GenericValue GV = getConstantValue(Op0);
+      if (CE->getType()->isFloatTy())
+        GV.FloatVal = float(GV.IntVal.signedRoundToDouble());
+      else if (CE->getType()->isDoubleTy())
+        GV.DoubleVal = GV.IntVal.signedRoundToDouble();
+      else if (CE->getType()->isX86_FP80Ty()) {
+        APFloat apf = APFloat::getZero(APFloat::x87DoubleExtended);
+        (void)apf.convertFromAPInt(GV.IntVal,
+                                   true,
+                                   APFloat::rmNearestTiesToEven);
+        GV.IntVal = apf.bitcastToAPInt();
+      }
+      return GV;
+    }
+    case Instruction::FPToUI: // double->APInt conversion handles sign
+    case Instruction::FPToSI: {
+      GenericValue GV = getConstantValue(Op0);
+      uint32_t BitWidth = cast<IntegerType>(CE->getType())->getBitWidth();
+      if (Op0->getType()->isFloatTy())
+        GV.IntVal = APIntOps::RoundFloatToAPInt(GV.FloatVal, BitWidth);
+      else if (Op0->getType()->isDoubleTy())
+        GV.IntVal = APIntOps::RoundDoubleToAPInt(GV.DoubleVal, BitWidth);
+      else if (Op0->getType()->isX86_FP80Ty()) {
+        APFloat apf = APFloat(APFloat::x87DoubleExtended, GV.IntVal);
+        uint64_t v;
+        bool ignored;
+        (void)apf.convertToInteger(&v, BitWidth,
+                                   CE->getOpcode()==Instruction::FPToSI,
+                                   APFloat::rmTowardZero, &ignored);
+        GV.IntVal = v; // endian?
+      }
+      return GV;
+    }
+    case Instruction::PtrToInt: {
+      GenericValue GV = getConstantValue(Op0);
+      uint32_t PtrWidth = TD->getTypeSizeInBits(Op0->getType());
+      assert(PtrWidth <= 64 && "Bad pointer width");
+      GV.IntVal = APInt(PtrWidth, uintptr_t(GV.PointerVal));
+      uint32_t IntWidth = TD->getTypeSizeInBits(CE->getType());
+      GV.IntVal = GV.IntVal.zextOrTrunc(IntWidth);
+      return GV;
+    }
+    case Instruction::IntToPtr: {
+      GenericValue GV = getConstantValue(Op0);
+      uint32_t PtrWidth = TD->getTypeSizeInBits(CE->getType());
+      GV.IntVal = GV.IntVal.zextOrTrunc(PtrWidth);
+      assert(GV.IntVal.getBitWidth() <= 64 && "Bad pointer width");
+      GV.PointerVal = PointerTy(uintptr_t(GV.IntVal.getZExtValue()));
+      return GV;
+    }
+    case Instruction::BitCast: {
+      GenericValue GV = getConstantValue(Op0);
+      Type* DestTy = CE->getType();
+      switch (Op0->getType()->getTypeID()) {
+        default: llvm_unreachable("Invalid bitcast operand");
+        case Type::IntegerTyID:
+          assert(DestTy->isFloatingPointTy() && "invalid bitcast");
+          if (DestTy->isFloatTy())
+            GV.FloatVal = GV.IntVal.bitsToFloat();
+          else if (DestTy->isDoubleTy())
+            GV.DoubleVal = GV.IntVal.bitsToDouble();
+          break;
+        case Type::FloatTyID:
+          assert(DestTy->isIntegerTy(32) && "Invalid bitcast");
+          GV.IntVal = APInt::floatToBits(GV.FloatVal);
+          break;
+        case Type::DoubleTyID:
+          assert(DestTy->isIntegerTy(64) && "Invalid bitcast");
+          GV.IntVal = APInt::doubleToBits(GV.DoubleVal);
+          break;
+        case Type::PointerTyID:
+          assert(DestTy->isPointerTy() && "Invalid bitcast");
+          break; // getConstantValue(Op0)  above already converted it
+      }
+      return GV;
+    }
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      GenericValue LHS = getConstantValue(Op0);
+      GenericValue RHS = getConstantValue(CE->getOperand(1));
+      GenericValue GV;
+      switch (CE->getOperand(0)->getType()->getTypeID()) {
+      default: llvm_unreachable("Bad add type!");
+      case Type::IntegerTyID:
+        switch (CE->getOpcode()) {
+          default: llvm_unreachable("Invalid integer opcode");
+          case Instruction::Add: GV.IntVal = LHS.IntVal + RHS.IntVal; break;
+          case Instruction::Sub: GV.IntVal = LHS.IntVal - RHS.IntVal; break;
+          case Instruction::Mul: GV.IntVal = LHS.IntVal * RHS.IntVal; break;
+          case Instruction::UDiv:GV.IntVal = LHS.IntVal.udiv(RHS.IntVal); break;
+          case Instruction::SDiv:GV.IntVal = LHS.IntVal.sdiv(RHS.IntVal); break;
+          case Instruction::URem:GV.IntVal = LHS.IntVal.urem(RHS.IntVal); break;
+          case Instruction::SRem:GV.IntVal = LHS.IntVal.srem(RHS.IntVal); break;
+          case Instruction::And: GV.IntVal = LHS.IntVal & RHS.IntVal; break;
+          case Instruction::Or:  GV.IntVal = LHS.IntVal | RHS.IntVal; break;
+          case Instruction::Xor: GV.IntVal = LHS.IntVal ^ RHS.IntVal; break;
+        }
+        break;
+      case Type::FloatTyID:
+        switch (CE->getOpcode()) {
+          default: llvm_unreachable("Invalid float opcode");
+          case Instruction::FAdd:
+            GV.FloatVal = LHS.FloatVal + RHS.FloatVal; break;
+          case Instruction::FSub:
+            GV.FloatVal = LHS.FloatVal - RHS.FloatVal; break;
+          case Instruction::FMul:
+            GV.FloatVal = LHS.FloatVal * RHS.FloatVal; break;
+          case Instruction::FDiv:
+            GV.FloatVal = LHS.FloatVal / RHS.FloatVal; break;
+          case Instruction::FRem:
+            GV.FloatVal = std::fmod(LHS.FloatVal,RHS.FloatVal); break;
+        }
+        break;
+      case Type::DoubleTyID:
+        switch (CE->getOpcode()) {
+          default: llvm_unreachable("Invalid double opcode");
+          case Instruction::FAdd:
+            GV.DoubleVal = LHS.DoubleVal + RHS.DoubleVal; break;
+          case Instruction::FSub:
+            GV.DoubleVal = LHS.DoubleVal - RHS.DoubleVal; break;
+          case Instruction::FMul:
+            GV.DoubleVal = LHS.DoubleVal * RHS.DoubleVal; break;
+          case Instruction::FDiv:
+            GV.DoubleVal = LHS.DoubleVal / RHS.DoubleVal; break;
+          case Instruction::FRem:
+            GV.DoubleVal = std::fmod(LHS.DoubleVal,RHS.DoubleVal); break;
+        }
+        break;
+      case Type::X86_FP80TyID:
+      case Type::PPC_FP128TyID:
+      case Type::FP128TyID: {
+        const fltSemantics &Sem = CE->getOperand(0)->getType()->getFltSemantics();
+        APFloat apfLHS = APFloat(Sem, LHS.IntVal);
+        switch (CE->getOpcode()) {
+          default: llvm_unreachable("Invalid long double opcode");
+          case Instruction::FAdd:
+            apfLHS.add(APFloat(Sem, RHS.IntVal), APFloat::rmNearestTiesToEven);
+            GV.IntVal = apfLHS.bitcastToAPInt();
+            break;
+          case Instruction::FSub:
+            apfLHS.subtract(APFloat(Sem, RHS.IntVal),
+                            APFloat::rmNearestTiesToEven);
+            GV.IntVal = apfLHS.bitcastToAPInt();
+            break;
+          case Instruction::FMul:
+            apfLHS.multiply(APFloat(Sem, RHS.IntVal),
+                            APFloat::rmNearestTiesToEven);
+            GV.IntVal = apfLHS.bitcastToAPInt();
+            break;
+          case Instruction::FDiv:
+            apfLHS.divide(APFloat(Sem, RHS.IntVal),
+                          APFloat::rmNearestTiesToEven);
+            GV.IntVal = apfLHS.bitcastToAPInt();
+            break;
+          case Instruction::FRem:
+            apfLHS.mod(APFloat(Sem, RHS.IntVal),
+                       APFloat::rmNearestTiesToEven);
+            GV.IntVal = apfLHS.bitcastToAPInt();
+            break;
+          }
+        }
+        break;
+      }
+      return GV;
+    }
+    default:
+      break;
+    }
+
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "ConstantExpr not handled: " << *CE;
+    report_fatal_error(OS.str());
+  }
+
+  // Otherwise, we have a simple constant.
+  GenericValue Result;
+  switch (C->getType()->getTypeID()) {
+  case Type::FloatTyID:
+    Result.FloatVal = cast<ConstantFP>(C)->getValueAPF().convertToFloat();
+    break;
+  case Type::DoubleTyID:
+    Result.DoubleVal = cast<ConstantFP>(C)->getValueAPF().convertToDouble();
+    break;
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+    Result.IntVal = cast <ConstantFP>(C)->getValueAPF().bitcastToAPInt();
+    break;
+  case Type::IntegerTyID:
+    Result.IntVal = cast<ConstantInt>(C)->getValue();
+    break;
+  case Type::PointerTyID:
+    if (isa<ConstantPointerNull>(C))
+      Result.PointerVal = 0;
+    else if (const Function *F = dyn_cast<Function>(C))
+      Result = PTOGV(getPointerToFunctionOrStub(const_cast<Function*>(F)));
+    else if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+      Result = PTOGV(getOrEmitGlobalVariable(const_cast<GlobalVariable*>(GV)));
+    else if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
+      Result = PTOGV(getPointerToBasicBlock(const_cast<BasicBlock*>(
+                                                        BA->getBasicBlock())));
+    else
+      llvm_unreachable("Unknown constant pointer type!");
+    break;
+  case Type::VectorTyID: {
+    unsigned elemNum;
+    Type* ElemTy;
+    const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(C);
+    const ConstantVector *CV = dyn_cast<ConstantVector>(C);
+    const ConstantAggregateZero *CAZ = dyn_cast<ConstantAggregateZero>(C);
+
+    if (CDV) {
+        elemNum = CDV->getNumElements();
+        ElemTy = CDV->getElementType();
+    } else if (CV || CAZ) {
+        VectorType* VTy = dyn_cast<VectorType>(C->getType());
+        elemNum = VTy->getNumElements();
+        ElemTy = VTy->getElementType();
+    } else {
+        llvm_unreachable("Unknown constant vector type!");
+    }
+
+    Result.AggregateVal.resize(elemNum);
+    // Check if vector holds floats.
+    if(ElemTy->isFloatTy()) {
+      if (CAZ) {
+        GenericValue floatZero;
+        floatZero.FloatVal = 0.f;
+        std::fill(Result.AggregateVal.begin(), Result.AggregateVal.end(),
+                  floatZero);
+        break;
+      }
+      if(CV) {
+        for (unsigned i = 0; i < elemNum; ++i)
+          if (!isa<UndefValue>(CV->getOperand(i)))
+            Result.AggregateVal[i].FloatVal = cast<ConstantFP>(
+              CV->getOperand(i))->getValueAPF().convertToFloat();
+        break;
+      }
+      if(CDV)
+        for (unsigned i = 0; i < elemNum; ++i)
+          Result.AggregateVal[i].FloatVal = CDV->getElementAsFloat(i);
+
+      break;
+    }
+    // Check if vector holds doubles.
+    if (ElemTy->isDoubleTy()) {
+      if (CAZ) {
+        GenericValue doubleZero;
+        doubleZero.DoubleVal = 0.0;
+        std::fill(Result.AggregateVal.begin(), Result.AggregateVal.end(),
+                  doubleZero);
+        break;
+      }
+      if(CV) {
+        for (unsigned i = 0; i < elemNum; ++i)
+          if (!isa<UndefValue>(CV->getOperand(i)))
+            Result.AggregateVal[i].DoubleVal = cast<ConstantFP>(
+              CV->getOperand(i))->getValueAPF().convertToDouble();
+        break;
+      }
+      if(CDV)
+        for (unsigned i = 0; i < elemNum; ++i)
+          Result.AggregateVal[i].DoubleVal = CDV->getElementAsDouble(i);
+
+      break;
+    }
+    // Check if vector holds integers.
+    if (ElemTy->isIntegerTy()) {
+      if (CAZ) {
+        GenericValue intZero;     
+        intZero.IntVal = APInt(ElemTy->getScalarSizeInBits(), 0ull);
+        std::fill(Result.AggregateVal.begin(), Result.AggregateVal.end(),
+                  intZero);
+        break;
+      }
+      if(CV) {
+        for (unsigned i = 0; i < elemNum; ++i)
+          if (!isa<UndefValue>(CV->getOperand(i)))
+            Result.AggregateVal[i].IntVal = cast<ConstantInt>(
+                                            CV->getOperand(i))->getValue();
+          else {
+            Result.AggregateVal[i].IntVal =
+              APInt(CV->getOperand(i)->getType()->getPrimitiveSizeInBits(), 0);
+          }
+        break;
+      }
+      if(CDV)
+        for (unsigned i = 0; i < elemNum; ++i)
+          Result.AggregateVal[i].IntVal = APInt(
+            CDV->getElementType()->getPrimitiveSizeInBits(),
+            CDV->getElementAsInteger(i));
+
+      break;
+    }
+    llvm_unreachable("Unknown constant pointer type!");
+  }
+  break;
+
+  default:
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "ERROR: Constant unimplemented for type: " << *C->getType();
+    report_fatal_error(OS.str());
+  }
+
+  return Result;
+}
+
+/// StoreIntToMemory - Fills the StoreBytes bytes of memory starting from Dst
+/// with the integer held in IntVal.
+static void StoreIntToMemory(const APInt &IntVal, uint8_t *Dst,
+                             unsigned StoreBytes) {
+  assert((IntVal.getBitWidth()+7)/8 >= StoreBytes && "Integer too small!");
+  const uint8_t *Src = (const uint8_t *)IntVal.getRawData();
+
+  if (sys::isLittleEndianHost()) {
+    // Little-endian host - the source is ordered from LSB to MSB.  Order the
+    // destination from LSB to MSB: Do a straight copy.
+    memcpy(Dst, Src, StoreBytes);
+  } else {
+    // Big-endian host - the source is an array of 64 bit words ordered from
+    // LSW to MSW.  Each word is ordered from MSB to LSB.  Order the destination
+    // from MSB to LSB: Reverse the word order, but not the bytes in a word.
+    while (StoreBytes > sizeof(uint64_t)) {
+      StoreBytes -= sizeof(uint64_t);
+      // May not be aligned so use memcpy.
+      memcpy(Dst + StoreBytes, Src, sizeof(uint64_t));
+      Src += sizeof(uint64_t);
+    }
+
+    memcpy(Dst, Src + sizeof(uint64_t) - StoreBytes, StoreBytes);
+  }
+}
+
+void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
+                                         GenericValue *Ptr, Type *Ty) {
+  const unsigned StoreBytes = getDataLayout()->getTypeStoreSize(Ty);
+
+  switch (Ty->getTypeID()) {
+  default:
+    dbgs() << "Cannot store value of type " << *Ty << "!\n";
+    break;
+  case Type::IntegerTyID:
+    StoreIntToMemory(Val.IntVal, (uint8_t*)Ptr, StoreBytes);
+    break;
+  case Type::FloatTyID:
+    *((float*)Ptr) = Val.FloatVal;
+    break;
+  case Type::DoubleTyID:
+    *((double*)Ptr) = Val.DoubleVal;
+    break;
+  case Type::X86_FP80TyID:
+    memcpy(Ptr, Val.IntVal.getRawData(), 10);
+    break;
+  case Type::PointerTyID:
+    // Ensure 64 bit target pointers are fully initialized on 32 bit hosts.
+    if (StoreBytes != sizeof(PointerTy))
+      memset(&(Ptr->PointerVal), 0, StoreBytes);
+
+    *((PointerTy*)Ptr) = Val.PointerVal;
+    break;
+  case Type::VectorTyID:
+    for (unsigned i = 0; i < Val.AggregateVal.size(); ++i) {
+      if (cast<VectorType>(Ty)->getElementType()->isDoubleTy())
+        *(((double*)Ptr)+i) = Val.AggregateVal[i].DoubleVal;
+      if (cast<VectorType>(Ty)->getElementType()->isFloatTy())
+        *(((float*)Ptr)+i) = Val.AggregateVal[i].FloatVal;
+      if (cast<VectorType>(Ty)->getElementType()->isIntegerTy()) {
+        unsigned numOfBytes =(Val.AggregateVal[i].IntVal.getBitWidth()+7)/8;
+        StoreIntToMemory(Val.AggregateVal[i].IntVal, 
+          (uint8_t*)Ptr + numOfBytes*i, numOfBytes);
+      }
+    }
+    break;
+  }
+
+  if (sys::isLittleEndianHost() != getDataLayout()->isLittleEndian())
+    // Host and target are different endian - reverse the stored bytes.
+    std::reverse((uint8_t*)Ptr, StoreBytes + (uint8_t*)Ptr);
+}
+
+/// LoadIntFromMemory - Loads the integer stored in the LoadBytes bytes starting
+/// from Src into IntVal, which is assumed to be wide enough and to hold zero.
+static void LoadIntFromMemory(APInt &IntVal, uint8_t *Src, unsigned LoadBytes) {
+  assert((IntVal.getBitWidth()+7)/8 >= LoadBytes && "Integer too small!");
+  uint8_t *Dst = reinterpret_cast<uint8_t *>(
+                   const_cast<uint64_t *>(IntVal.getRawData()));
+
+  if (sys::isLittleEndianHost())
+    // Little-endian host - the destination must be ordered from LSB to MSB.
+    // The source is ordered from LSB to MSB: Do a straight copy.
+    memcpy(Dst, Src, LoadBytes);
+  else {
+    // Big-endian - the destination is an array of 64 bit words ordered from
+    // LSW to MSW.  Each word must be ordered from MSB to LSB.  The source is
+    // ordered from MSB to LSB: Reverse the word order, but not the bytes in
+    // a word.
+    while (LoadBytes > sizeof(uint64_t)) {
+      LoadBytes -= sizeof(uint64_t);
+      // May not be aligned so use memcpy.
+      memcpy(Dst, Src + LoadBytes, sizeof(uint64_t));
+      Dst += sizeof(uint64_t);
+    }
+
+    memcpy(Dst + sizeof(uint64_t) - LoadBytes, Src, LoadBytes);
+  }
+}
+
+/// FIXME: document
+///
+void ExecutionEngine::LoadValueFromMemory(GenericValue &Result,
+                                          GenericValue *Ptr,
+                                          Type *Ty) {
+  const unsigned LoadBytes = getDataLayout()->getTypeStoreSize(Ty);
+
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID:
+    // An APInt with all words initially zero.
+    Result.IntVal = APInt(cast<IntegerType>(Ty)->getBitWidth(), 0);
+    LoadIntFromMemory(Result.IntVal, (uint8_t*)Ptr, LoadBytes);
+    break;
+  case Type::FloatTyID:
+    Result.FloatVal = *((float*)Ptr);
+    break;
+  case Type::DoubleTyID:
+    Result.DoubleVal = *((double*)Ptr);
+    break;
+  case Type::PointerTyID:
+    Result.PointerVal = *((PointerTy*)Ptr);
+    break;
+  case Type::X86_FP80TyID: {
+    // This is endian dependent, but it will only work on x86 anyway.
+    // FIXME: Will not trap if loading a signaling NaN.
+    uint64_t y[2];
+    memcpy(y, Ptr, 10);
+    Result.IntVal = APInt(80, y);
+    break;
+  }
+  case Type::VectorTyID: {
+    const VectorType *VT = cast<VectorType>(Ty);
+    const Type *ElemT = VT->getElementType();
+    const unsigned numElems = VT->getNumElements();
+    if (ElemT->isFloatTy()) {
+      Result.AggregateVal.resize(numElems);
+      for (unsigned i = 0; i < numElems; ++i)
+        Result.AggregateVal[i].FloatVal = *((float*)Ptr+i);
+    }
+    if (ElemT->isDoubleTy()) {
+      Result.AggregateVal.resize(numElems);
+      for (unsigned i = 0; i < numElems; ++i)
+        Result.AggregateVal[i].DoubleVal = *((double*)Ptr+i);
+    }
+    if (ElemT->isIntegerTy()) {
+      GenericValue intZero;
+      const unsigned elemBitWidth = cast<IntegerType>(ElemT)->getBitWidth();
+      intZero.IntVal = APInt(elemBitWidth, 0);
+      Result.AggregateVal.resize(numElems, intZero);
+      for (unsigned i = 0; i < numElems; ++i)
+        LoadIntFromMemory(Result.AggregateVal[i].IntVal,
+          (uint8_t*)Ptr+((elemBitWidth+7)/8)*i, (elemBitWidth+7)/8);
+    }
+  break;
+  }
+  default:
+    SmallString<256> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "Cannot load value of type " << *Ty << "!";
+    report_fatal_error(OS.str());
+  }
+}
+
+void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
+  DEBUG(dbgs() << "JIT: Initializing " << Addr << " ");
+  DEBUG(Init->dump());
+  if (isa<UndefValue>(Init))
+    return;
+  
+  if (const ConstantVector *CP = dyn_cast<ConstantVector>(Init)) {
+    unsigned ElementSize =
+      getDataLayout()->getTypeAllocSize(CP->getType()->getElementType());
+    for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i)
+      InitializeMemory(CP->getOperand(i), (char*)Addr+i*ElementSize);
+    return;
+  }
+  
+  if (isa<ConstantAggregateZero>(Init)) {
+    memset(Addr, 0, (size_t)getDataLayout()->getTypeAllocSize(Init->getType()));
+    return;
+  }
+  
+  if (const ConstantArray *CPA = dyn_cast<ConstantArray>(Init)) {
+    unsigned ElementSize =
+      getDataLayout()->getTypeAllocSize(CPA->getType()->getElementType());
+    for (unsigned i = 0, e = CPA->getNumOperands(); i != e; ++i)
+      InitializeMemory(CPA->getOperand(i), (char*)Addr+i*ElementSize);
+    return;
+  }
+  
+  if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(Init)) {
+    const StructLayout *SL =
+      getDataLayout()->getStructLayout(cast<StructType>(CPS->getType()));
+    for (unsigned i = 0, e = CPS->getNumOperands(); i != e; ++i)
+      InitializeMemory(CPS->getOperand(i), (char*)Addr+SL->getElementOffset(i));
+    return;
+  }
+
+  if (const ConstantDataSequential *CDS =
+               dyn_cast<ConstantDataSequential>(Init)) {
+    // CDS is already laid out in host memory order.
+    StringRef Data = CDS->getRawDataValues();
+    memcpy(Addr, Data.data(), Data.size());
+    return;
+  }
+
+  if (Init->getType()->isFirstClassType()) {
+    GenericValue Val = getConstantValue(Init);
+    StoreValueToMemory(Val, (GenericValue*)Addr, Init->getType());
+    return;
+  }
+
+  DEBUG(dbgs() << "Bad Type: " << *Init->getType() << "\n");
+  llvm_unreachable("Unknown constant type to initialize memory with!");
+}
+
+/// EmitGlobals - Emit all of the global variables to memory, storing their
+/// addresses into GlobalAddress.  This must make sure to copy the contents of
+/// their initializers into the memory.
+void ExecutionEngine::emitGlobals() {
+  // Loop over all of the global variables in the program, allocating the memory
+  // to hold them.  If there is more than one module, do a prepass over globals
+  // to figure out how the different modules should link together.
+  std::map<std::pair<std::string, Type*>,
+           const GlobalValue*> LinkedGlobalsMap;
+
+  if (Modules.size() != 1) {
+    for (unsigned m = 0, e = Modules.size(); m != e; ++m) {
+      Module &M = *Modules[m];
+      for (Module::const_global_iterator I = M.global_begin(),
+           E = M.global_end(); I != E; ++I) {
+        const GlobalValue *GV = I;
+        if (GV->hasLocalLinkage() || GV->isDeclaration() ||
+            GV->hasAppendingLinkage() || !GV->hasName())
+          continue;// Ignore external globals and globals with internal linkage.
+
+        const GlobalValue *&GVEntry =
+          LinkedGlobalsMap[std::make_pair(GV->getName(), GV->getType())];
+
+        // If this is the first time we've seen this global, it is the canonical
+        // version.
+        if (!GVEntry) {
+          GVEntry = GV;
+          continue;
+        }
+
+        // If the existing global is strong, never replace it.
+        if (GVEntry->hasExternalLinkage() ||
+            GVEntry->hasDLLImportLinkage() ||
+            GVEntry->hasDLLExportLinkage())
+          continue;
+
+        // Otherwise, we know it's linkonce/weak, replace it if this is a strong
+        // symbol.  FIXME is this right for common?
+        if (GV->hasExternalLinkage() || GVEntry->hasExternalWeakLinkage())
+          GVEntry = GV;
+      }
+    }
+  }
+
+  std::vector<const GlobalValue*> NonCanonicalGlobals;
+  for (unsigned m = 0, e = Modules.size(); m != e; ++m) {
+    Module &M = *Modules[m];
+    for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+         I != E; ++I) {
+      // In the multi-module case, see what this global maps to.
+      if (!LinkedGlobalsMap.empty()) {
+        if (const GlobalValue *GVEntry =
+              LinkedGlobalsMap[std::make_pair(I->getName(), I->getType())]) {
+          // If something else is the canonical global, ignore this one.
+          if (GVEntry != &*I) {
+            NonCanonicalGlobals.push_back(I);
+            continue;
+          }
+        }
+      }
+
+      if (!I->isDeclaration()) {
+        addGlobalMapping(I, getMemoryForGV(I));
+      } else {
+        // External variable reference. Try to use the dynamic loader to
+        // get a pointer to it.
+        if (void *SymAddr =
+            sys::DynamicLibrary::SearchForAddressOfSymbol(I->getName()))
+          addGlobalMapping(I, SymAddr);
+        else {
+          report_fatal_error("Could not resolve external global address: "
+                            +I->getName());
+        }
+      }
+    }
+
+    // If there are multiple modules, map the non-canonical globals to their
+    // canonical location.
+    if (!NonCanonicalGlobals.empty()) {
+      for (unsigned i = 0, e = NonCanonicalGlobals.size(); i != e; ++i) {
+        const GlobalValue *GV = NonCanonicalGlobals[i];
+        const GlobalValue *CGV =
+          LinkedGlobalsMap[std::make_pair(GV->getName(), GV->getType())];
+        void *Ptr = getPointerToGlobalIfAvailable(CGV);
+        assert(Ptr && "Canonical global wasn't codegen'd!");
+        addGlobalMapping(GV, Ptr);
+      }
+    }
+
+    // Now that all of the globals are set up in memory, loop through them all
+    // and initialize their contents.
+    for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+         I != E; ++I) {
+      if (!I->isDeclaration()) {
+        if (!LinkedGlobalsMap.empty()) {
+          if (const GlobalValue *GVEntry =
+                LinkedGlobalsMap[std::make_pair(I->getName(), I->getType())])
+            if (GVEntry != &*I)  // Not the canonical variable.
+              continue;
+        }
+        EmitGlobalVariable(I);
+      }
+    }
+  }
+}
+
+// EmitGlobalVariable - This method emits the specified global variable to the
+// address specified in GlobalAddresses, or allocates new memory if it's not
+// already in the map.
+void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) {
+  void *GA = getPointerToGlobalIfAvailable(GV);
+
+  if (GA == 0) {
+    // If it's not already specified, allocate memory for the global.
+    GA = getMemoryForGV(GV);
+    addGlobalMapping(GV, GA);
+  }
+
+  // Don't initialize if it's thread local, let the client do it.
+  if (!GV->isThreadLocal())
+    InitializeMemory(GV->getInitializer(), GA);
+
+  Type *ElTy = GV->getType()->getElementType();
+  size_t GVSize = (size_t)getDataLayout()->getTypeAllocSize(ElTy);
+  NumInitBytes += (unsigned)GVSize;
+  ++NumGlobals;
+}
+
+ExecutionEngineState::ExecutionEngineState(ExecutionEngine &EE)
+  : EE(EE), GlobalAddressMap(this) {
+}
+
+sys::Mutex *
+ExecutionEngineState::AddressMapConfig::getMutex(ExecutionEngineState *EES) {
+  return &EES->EE.lock;
+}
+
+void ExecutionEngineState::AddressMapConfig::onDelete(ExecutionEngineState *EES,
+                                                      const GlobalValue *Old) {
+  void *OldVal = EES->GlobalAddressMap.lookup(Old);
+  EES->GlobalAddressReverseMap.erase(OldVal);
+}
+
+void ExecutionEngineState::AddressMapConfig::onRAUW(ExecutionEngineState *,
+                                                    const GlobalValue *,
+                                                    const GlobalValue *) {
+  llvm_unreachable("The ExecutionEngine doesn't know how to handle a"
+                   " RAUW on a value it has a global mapping for.");
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
new file mode 100644
index 000000000000..f4e8246476a5
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -0,0 +1,252 @@
+//===-- ExecutionEngineBindings.cpp - C bindings for EEs ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the C bindings for the ExecutionEngine library.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "llvm-c/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstring>
+
+using namespace llvm;
+
+/*===-- Operations on generic values --------------------------------------===*/
+
+LLVMGenericValueRef LLVMCreateGenericValueOfInt(LLVMTypeRef Ty,
+                                                unsigned long long N,
+                                                LLVMBool IsSigned) {
+  GenericValue *GenVal = new GenericValue();
+  GenVal->IntVal = APInt(unwrap<IntegerType>(Ty)->getBitWidth(), N, IsSigned);
+  return wrap(GenVal);
+}
+
+LLVMGenericValueRef LLVMCreateGenericValueOfPointer(void *P) {
+  GenericValue *GenVal = new GenericValue();
+  GenVal->PointerVal = P;
+  return wrap(GenVal);
+}
+
+LLVMGenericValueRef LLVMCreateGenericValueOfFloat(LLVMTypeRef TyRef, double N) {
+  GenericValue *GenVal = new GenericValue();
+  switch (unwrap(TyRef)->getTypeID()) {
+  case Type::FloatTyID:
+    GenVal->FloatVal = N;
+    break;
+  case Type::DoubleTyID:
+    GenVal->DoubleVal = N;
+    break;
+  default:
+    llvm_unreachable("LLVMGenericValueToFloat supports only float and double.");
+  }
+  return wrap(GenVal);
+}
+
+unsigned LLVMGenericValueIntWidth(LLVMGenericValueRef GenValRef) {
+  return unwrap(GenValRef)->IntVal.getBitWidth();
+}
+
+unsigned long long LLVMGenericValueToInt(LLVMGenericValueRef GenValRef,
+                                         LLVMBool IsSigned) {
+  GenericValue *GenVal = unwrap(GenValRef);
+  if (IsSigned)
+    return GenVal->IntVal.getSExtValue();
+  else
+    return GenVal->IntVal.getZExtValue();
+}
+
+void *LLVMGenericValueToPointer(LLVMGenericValueRef GenVal) {
+  return unwrap(GenVal)->PointerVal;
+}
+
+double LLVMGenericValueToFloat(LLVMTypeRef TyRef, LLVMGenericValueRef GenVal) {
+  switch (unwrap(TyRef)->getTypeID()) {
+  case Type::FloatTyID:
+    return unwrap(GenVal)->FloatVal;
+  case Type::DoubleTyID:
+    return unwrap(GenVal)->DoubleVal;
+  default:
+    llvm_unreachable("LLVMGenericValueToFloat supports only float and double.");
+  }
+}
+
+void LLVMDisposeGenericValue(LLVMGenericValueRef GenVal) {
+  delete unwrap(GenVal);
+}
+
+/*===-- Operations on execution engines -----------------------------------===*/
+
+LLVMBool LLVMCreateExecutionEngineForModule(LLVMExecutionEngineRef *OutEE,
+                                            LLVMModuleRef M,
+                                            char **OutError) {
+  std::string Error;
+  EngineBuilder builder(unwrap(M));
+  builder.setEngineKind(EngineKind::Either)
+         .setErrorStr(&Error);
+  if (ExecutionEngine *EE = builder.create()){
+    *OutEE = wrap(EE);
+    return 0;
+  }
+  *OutError = strdup(Error.c_str());
+  return 1;
+}
+
+LLVMBool LLVMCreateInterpreterForModule(LLVMExecutionEngineRef *OutInterp,
+                                        LLVMModuleRef M,
+                                        char **OutError) {
+  std::string Error;
+  EngineBuilder builder(unwrap(M));
+  builder.setEngineKind(EngineKind::Interpreter)
+         .setErrorStr(&Error);
+  if (ExecutionEngine *Interp = builder.create()) {
+    *OutInterp = wrap(Interp);
+    return 0;
+  }
+  *OutError = strdup(Error.c_str());
+  return 1;
+}
+
+LLVMBool LLVMCreateJITCompilerForModule(LLVMExecutionEngineRef *OutJIT,
+                                        LLVMModuleRef M,
+                                        unsigned OptLevel,
+                                        char **OutError) {
+  std::string Error;
+  EngineBuilder builder(unwrap(M));
+  builder.setEngineKind(EngineKind::JIT)
+         .setErrorStr(&Error)
+         .setOptLevel((CodeGenOpt::Level)OptLevel);
+  if (ExecutionEngine *JIT = builder.create()) {
+    *OutJIT = wrap(JIT);
+    return 0;
+  }
+  *OutError = strdup(Error.c_str());
+  return 1;
+}
+
+LLVMBool LLVMCreateExecutionEngine(LLVMExecutionEngineRef *OutEE,
+                                   LLVMModuleProviderRef MP,
+                                   char **OutError) {
+  /* The module provider is now actually a module. */
+  return LLVMCreateExecutionEngineForModule(OutEE,
+                                            reinterpret_cast<LLVMModuleRef>(MP),
+                                            OutError);
+}
+
+LLVMBool LLVMCreateInterpreter(LLVMExecutionEngineRef *OutInterp,
+                               LLVMModuleProviderRef MP,
+                               char **OutError) {
+  /* The module provider is now actually a module. */
+  return LLVMCreateInterpreterForModule(OutInterp,
+                                        reinterpret_cast<LLVMModuleRef>(MP),
+                                        OutError);
+}
+
+LLVMBool LLVMCreateJITCompiler(LLVMExecutionEngineRef *OutJIT,
+                               LLVMModuleProviderRef MP,
+                               unsigned OptLevel,
+                               char **OutError) {
+  /* The module provider is now actually a module. */
+  return LLVMCreateJITCompilerForModule(OutJIT,
+                                        reinterpret_cast<LLVMModuleRef>(MP),
+                                        OptLevel, OutError);
+}
+
+
+void LLVMDisposeExecutionEngine(LLVMExecutionEngineRef EE) {
+  delete unwrap(EE);
+}
+
+void LLVMRunStaticConstructors(LLVMExecutionEngineRef EE) {
+  unwrap(EE)->runStaticConstructorsDestructors(false);
+}
+
+void LLVMRunStaticDestructors(LLVMExecutionEngineRef EE) {
+  unwrap(EE)->runStaticConstructorsDestructors(true);
+}
+
+int LLVMRunFunctionAsMain(LLVMExecutionEngineRef EE, LLVMValueRef F,
+                          unsigned ArgC, const char * const *ArgV,
+                          const char * const *EnvP) {
+  std::vector<std::string> ArgVec;
+  for (unsigned I = 0; I != ArgC; ++I)
+    ArgVec.push_back(ArgV[I]);
+  
+  return unwrap(EE)->runFunctionAsMain(unwrap<Function>(F), ArgVec, EnvP);
+}
+
+LLVMGenericValueRef LLVMRunFunction(LLVMExecutionEngineRef EE, LLVMValueRef F,
+                                    unsigned NumArgs,
+                                    LLVMGenericValueRef *Args) {
+  std::vector<GenericValue> ArgVec;
+  ArgVec.reserve(NumArgs);
+  for (unsigned I = 0; I != NumArgs; ++I)
+    ArgVec.push_back(*unwrap(Args[I]));
+  
+  GenericValue *Result = new GenericValue();
+  *Result = unwrap(EE)->runFunction(unwrap<Function>(F), ArgVec);
+  return wrap(Result);
+}
+
+void LLVMFreeMachineCodeForFunction(LLVMExecutionEngineRef EE, LLVMValueRef F) {
+  unwrap(EE)->freeMachineCodeForFunction(unwrap<Function>(F));
+}
+
+void LLVMAddModule(LLVMExecutionEngineRef EE, LLVMModuleRef M){
+  unwrap(EE)->addModule(unwrap(M));
+}
+
+void LLVMAddModuleProvider(LLVMExecutionEngineRef EE, LLVMModuleProviderRef MP){
+  /* The module provider is now actually a module. */
+  LLVMAddModule(EE, reinterpret_cast<LLVMModuleRef>(MP));
+}
+
+LLVMBool LLVMRemoveModule(LLVMExecutionEngineRef EE, LLVMModuleRef M,
+                          LLVMModuleRef *OutMod, char **OutError) {
+  Module *Mod = unwrap(M);
+  unwrap(EE)->removeModule(Mod);
+  *OutMod = wrap(Mod);
+  return 0;
+}
+
+LLVMBool LLVMRemoveModuleProvider(LLVMExecutionEngineRef EE,
+                                  LLVMModuleProviderRef MP,
+                                  LLVMModuleRef *OutMod, char **OutError) {
+  /* The module provider is now actually a module. */
+  return LLVMRemoveModule(EE, reinterpret_cast<LLVMModuleRef>(MP), OutMod,
+                          OutError);
+}
+
+LLVMBool LLVMFindFunction(LLVMExecutionEngineRef EE, const char *Name,
+                          LLVMValueRef *OutFn) {
+  if (Function *F = unwrap(EE)->FindFunctionNamed(Name)) {
+    *OutFn = wrap(F);
+    return 0;
+  }
+  return 1;
+}
+
+void *LLVMRecompileAndRelinkFunction(LLVMExecutionEngineRef EE, LLVMValueRef Fn) {
+  return unwrap(EE)->recompileAndRelinkFunction(unwrap<Function>(Fn));
+}
+
+LLVMTargetDataRef LLVMGetExecutionEngineTargetData(LLVMExecutionEngineRef EE) {
+  return wrap(unwrap(EE)->getDataLayout());
+}
+
+void LLVMAddGlobalMapping(LLVMExecutionEngineRef EE, LLVMValueRef Global,
+                          void* Addr) {
+  unwrap(EE)->addGlobalMapping(unwrap<GlobalValue>(Global), Addr);
+}
+
+void *LLVMGetPointerToGlobal(LLVMExecutionEngineRef EE, LLVMValueRef Global) {
+  return unwrap(EE)->getPointerToGlobal(unwrap<GlobalValue>(Global));
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
new file mode 100644
index 000000000000..7dc295fcbf73
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -0,0 +1,302 @@
+//===-- IntelJITEventListener.cpp - Tell Intel profiler about JITed code --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a JITEventListener object to tell Intel(R) VTune(TM)
+// Amplifier XE 2011 about JITted functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
+
+#define DEBUG_TYPE "amplifier-jit-event-listener"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Errno.h"
+#include "llvm/Support/ValueHandle.h"
+#include "EventListenerCommon.h"
+#include "IntelJITEventsWrapper.h"
+
+using namespace llvm;
+using namespace llvm::jitprofiling;
+
+namespace {
+
+class IntelJITEventListener : public JITEventListener {
+  typedef DenseMap<void*, unsigned int> MethodIDMap;
+
+  OwningPtr<IntelJITEventsWrapper> Wrapper;
+  MethodIDMap MethodIDs;
+  FilenameCache Filenames;
+
+  typedef SmallVector<const void *, 64> MethodAddressVector;
+  typedef DenseMap<const void *, MethodAddressVector>  ObjectMap;
+
+  ObjectMap  LoadedObjectMap;
+
+public:
+  IntelJITEventListener(IntelJITEventsWrapper* libraryWrapper) {
+      Wrapper.reset(libraryWrapper);
+  }
+
+  ~IntelJITEventListener() {
+  }
+
+  virtual void NotifyFunctionEmitted(const Function &F,
+                                     void *FnStart, size_t FnSize,
+                                     const EmittedFunctionDetails &Details);
+
+  virtual void NotifyFreeingMachineCode(void *OldPtr);
+
+  virtual void NotifyObjectEmitted(const ObjectImage &Obj);
+
+  virtual void NotifyFreeingObject(const ObjectImage &Obj);
+};
+
+static LineNumberInfo LineStartToIntelJITFormat(
+    uintptr_t StartAddress,
+    uintptr_t Address,
+    DebugLoc Loc) {
+  LineNumberInfo Result;
+
+  Result.Offset = Address - StartAddress;
+  Result.LineNumber = Loc.getLine();
+
+  return Result;
+}
+
+static LineNumberInfo DILineInfoToIntelJITFormat(uintptr_t StartAddress,
+                                                 uintptr_t Address,
+                                                 DILineInfo Line) {
+  LineNumberInfo Result;
+
+  Result.Offset = Address - StartAddress;
+  Result.LineNumber = Line.getLine();
+
+  return Result;
+}
+
+static iJIT_Method_Load FunctionDescToIntelJITFormat(
+    IntelJITEventsWrapper& Wrapper,
+    const char* FnName,
+    uintptr_t FnStart,
+    size_t FnSize) {
+  iJIT_Method_Load Result;
+  memset(&Result, 0, sizeof(iJIT_Method_Load));
+
+  Result.method_id = Wrapper.iJIT_GetNewMethodID();
+  Result.method_name = const_cast<char*>(FnName);
+  Result.method_load_address = reinterpret_cast<void*>(FnStart);
+  Result.method_size = FnSize;
+
+  Result.class_id = 0;
+  Result.class_file_name = NULL;
+  Result.user_data = NULL;
+  Result.user_data_size = 0;
+  Result.env = iJDE_JittingAPI;
+
+  return Result;
+}
+
+// Adds the just-emitted function to the symbol table.
+void IntelJITEventListener::NotifyFunctionEmitted(
+    const Function &F, void *FnStart, size_t FnSize,
+    const EmittedFunctionDetails &Details) {
+  iJIT_Method_Load FunctionMessage = FunctionDescToIntelJITFormat(*Wrapper,
+                                      F.getName().data(),
+                                      reinterpret_cast<uint64_t>(FnStart),
+                                      FnSize);
+
+  std::vector<LineNumberInfo> LineInfo;
+
+  if (!Details.LineStarts.empty()) {
+    // Now convert the line number information from the address/DebugLoc
+    // format in Details to the offset/lineno in Intel JIT API format.
+
+    LineInfo.reserve(Details.LineStarts.size() + 1);
+
+    DebugLoc FirstLoc = Details.LineStarts[0].Loc;
+    assert(!FirstLoc.isUnknown()
+           && "LineStarts should not contain unknown DebugLocs");
+
+    MDNode *FirstLocScope = FirstLoc.getScope(F.getContext());
+    DISubprogram FunctionDI = getDISubprogram(FirstLocScope);
+    if (FunctionDI.Verify()) {
+      FunctionMessage.source_file_name = const_cast<char*>(
+                                          Filenames.getFullPath(FirstLocScope));
+
+      LineNumberInfo FirstLine;
+      FirstLine.Offset = 0;
+      FirstLine.LineNumber = FunctionDI.getLineNumber();
+      LineInfo.push_back(FirstLine);
+    }
+
+    for (std::vector<EmittedFunctionDetails::LineStart>::const_iterator I =
+          Details.LineStarts.begin(), E = Details.LineStarts.end();
+          I != E; ++I) {
+      // This implementation ignores the DebugLoc filename because the Intel
+      // JIT API does not support multiple source files associated with a single
+      // JIT function
+      LineInfo.push_back(LineStartToIntelJITFormat(
+                          reinterpret_cast<uintptr_t>(FnStart),
+                          I->Address,
+                          I->Loc));
+
+      // If we have no file name yet for the function, use the filename from
+      // the first instruction that has one
+      if (FunctionMessage.source_file_name == 0) {
+        MDNode *scope = I->Loc.getScope(
+          Details.MF->getFunction()->getContext());
+        FunctionMessage.source_file_name = const_cast<char*>(
+                                                  Filenames.getFullPath(scope));
+      }
+    }
+
+    FunctionMessage.line_number_size = LineInfo.size();
+    FunctionMessage.line_number_table = &*LineInfo.begin();
+  } else {
+    FunctionMessage.line_number_size = 0;
+    FunctionMessage.line_number_table = 0;
+  }
+
+  Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
+                            &FunctionMessage);
+  MethodIDs[FnStart] = FunctionMessage.method_id;
+}
+
+void IntelJITEventListener::NotifyFreeingMachineCode(void *FnStart) {
+  MethodIDMap::iterator I = MethodIDs.find(FnStart);
+  if (I != MethodIDs.end()) {
+    Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_UNLOAD_START, &I->second);
+    MethodIDs.erase(I);
+  }
+}
+
+void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
+  // Get the address of the object image for use as a unique identifier
+  const void* ObjData = Obj.getData().data();
+  DIContext* Context = DIContext::getDWARFContext(Obj.getObjectFile());
+  MethodAddressVector Functions;
+
+  // Use symbol info to iterate functions in the object.
+  error_code ec;
+  for (object::symbol_iterator I = Obj.begin_symbols(),
+                               E = Obj.end_symbols();
+                        I != E && !ec;
+                        I.increment(ec)) {
+    std::vector<LineNumberInfo> LineInfo;
+    std::string SourceFileName;
+
+    object::SymbolRef::Type SymType;
+    if (I->getType(SymType)) continue;
+    if (SymType == object::SymbolRef::ST_Function) {
+      StringRef  Name;
+      uint64_t   Addr;
+      uint64_t   Size;
+      if (I->getName(Name)) continue;
+      if (I->getAddress(Addr)) continue;
+      if (I->getSize(Size)) continue;
+
+      // Record this address in a local vector
+      Functions.push_back((void*)Addr);
+
+      // Build the function loaded notification message
+      iJIT_Method_Load FunctionMessage = FunctionDescToIntelJITFormat(*Wrapper,
+                                           Name.data(),
+                                           Addr,
+                                           Size);
+      if (Context) {
+        DILineInfoTable  Lines = Context->getLineInfoForAddressRange(Addr, Size);
+        DILineInfoTable::iterator  Begin = Lines.begin();
+        DILineInfoTable::iterator  End = Lines.end();
+        for (DILineInfoTable::iterator It = Begin; It != End; ++It) {
+          LineInfo.push_back(DILineInfoToIntelJITFormat((uintptr_t)Addr,
+                                                        It->first,
+                                                        It->second));
+        }
+        if (LineInfo.size() == 0) {
+          FunctionMessage.source_file_name = 0;
+          FunctionMessage.line_number_size = 0;
+          FunctionMessage.line_number_table = 0;
+        } else {
+          SourceFileName = Lines.front().second.getFileName();
+          FunctionMessage.source_file_name = (char *)SourceFileName.c_str();
+          FunctionMessage.line_number_size = LineInfo.size();
+          FunctionMessage.line_number_table = &*LineInfo.begin();
+        }
+      } else {
+        FunctionMessage.source_file_name = 0;
+        FunctionMessage.line_number_size = 0;
+        FunctionMessage.line_number_table = 0;
+      }
+
+      Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
+                                &FunctionMessage);
+      MethodIDs[(void*)Addr] = FunctionMessage.method_id;
+    }
+  }
+
+  // To support object unload notification, we need to keep a list of
+  // registered function addresses for each loaded object.  We will
+  // use the MethodIDs map to get the registered ID for each function.
+  LoadedObjectMap[ObjData] = Functions;
+}
+
+void IntelJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) {
+  // Get the address of the object image for use as a unique identifier
+  const void* ObjData = Obj.getData().data();
+
+  // Get the object's function list from LoadedObjectMap
+  ObjectMap::iterator OI = LoadedObjectMap.find(ObjData);
+  if (OI == LoadedObjectMap.end())
+    return;
+  MethodAddressVector& Functions = OI->second;
+
+  // Walk the function list, unregistering each function
+  for (MethodAddressVector::iterator FI = Functions.begin(),
+                                     FE = Functions.end();
+       FI != FE;
+       ++FI) {
+    void* FnStart = const_cast<void*>(*FI);
+    MethodIDMap::iterator MI = MethodIDs.find(FnStart);
+    if (MI != MethodIDs.end()) {
+      Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_UNLOAD_START,
+                                &MI->second);
+      MethodIDs.erase(MI);
+    }
+  }
+
+  // Erase the object from LoadedObjectMap
+  LoadedObjectMap.erase(OI);
+}
+
+}  // anonymous namespace.
+
+namespace llvm {
+JITEventListener *JITEventListener::createIntelJITEventListener() {
+  return new IntelJITEventListener(new IntelJITEventsWrapper);
+}
+
+// for testing
+JITEventListener *JITEventListener::createIntelJITEventListener(
+                                      IntelJITEventsWrapper* TestImpl) {
+  return new IntelJITEventListener(TestImpl);
+}
+
+} // namespace llvm
+
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
new file mode 100644
index 000000000000..3d9ff5351610
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
@@ -0,0 +1,96 @@
+//===-- IntelJITEventsWrapper.h - Intel JIT Events API Wrapper --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a wrapper for the Intel JIT Events API. It allows for the
+// implementation of the jitprofiling library to be swapped with an alternative
+// implementation (for testing). To include this file, you must have the
+// jitprofiling.h header available; it is available in Intel(R) VTune(TM)
+// Amplifier XE 2011.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef INTEL_JIT_EVENTS_WRAPPER_H
+#define INTEL_JIT_EVENTS_WRAPPER_H
+
+#include "jitprofiling.h"
+
+namespace llvm {
+
+class IntelJITEventsWrapper {
+  // Function pointer types for testing implementation of Intel jitprofiling
+  // library
+  typedef int (*NotifyEventPtr)(iJIT_JVM_EVENT, void*);
+  typedef void (*RegisterCallbackExPtr)(void *, iJIT_ModeChangedEx );
+  typedef iJIT_IsProfilingActiveFlags (*IsProfilingActivePtr)(void);
+  typedef void (*FinalizeThreadPtr)(void);
+  typedef void (*FinalizeProcessPtr)(void);
+  typedef unsigned int (*GetNewMethodIDPtr)(void);
+
+  NotifyEventPtr NotifyEventFunc;
+  RegisterCallbackExPtr RegisterCallbackExFunc;
+  IsProfilingActivePtr IsProfilingActiveFunc;
+  GetNewMethodIDPtr GetNewMethodIDFunc;
+
+public:
+  bool isAmplifierRunning() {
+    return iJIT_IsProfilingActive() == iJIT_SAMPLING_ON;
+  }
+
+  IntelJITEventsWrapper()
+  : NotifyEventFunc(::iJIT_NotifyEvent),
+    RegisterCallbackExFunc(::iJIT_RegisterCallbackEx),
+    IsProfilingActiveFunc(::iJIT_IsProfilingActive),
+    GetNewMethodIDFunc(::iJIT_GetNewMethodID) {
+  }
+
+  IntelJITEventsWrapper(NotifyEventPtr NotifyEventImpl,
+                   RegisterCallbackExPtr RegisterCallbackExImpl,
+                   IsProfilingActivePtr IsProfilingActiveImpl,
+                   FinalizeThreadPtr FinalizeThreadImpl,
+                   FinalizeProcessPtr FinalizeProcessImpl,
+                   GetNewMethodIDPtr GetNewMethodIDImpl)
+  : NotifyEventFunc(NotifyEventImpl),
+    RegisterCallbackExFunc(RegisterCallbackExImpl),
+    IsProfilingActiveFunc(IsProfilingActiveImpl),
+    GetNewMethodIDFunc(GetNewMethodIDImpl) {
+  }
+
+  // Sends an event anncouncing that a function has been emitted
+  //   return values are event-specific.  See Intel documentation for details.
+  int  iJIT_NotifyEvent(iJIT_JVM_EVENT EventType, void *EventSpecificData) {
+    if (!NotifyEventFunc)
+      return -1;
+    return NotifyEventFunc(EventType, EventSpecificData);
+  }
+
+  // Registers a callback function to receive notice of profiling state changes
+  void iJIT_RegisterCallbackEx(void *UserData,
+                               iJIT_ModeChangedEx NewModeCallBackFuncEx) {
+    if (RegisterCallbackExFunc)
+      RegisterCallbackExFunc(UserData, NewModeCallBackFuncEx);
+  }
+
+  // Returns the current profiler mode
+  iJIT_IsProfilingActiveFlags iJIT_IsProfilingActive(void) {
+    if (!IsProfilingActiveFunc)
+      return iJIT_NOTHING_RUNNING;
+    return IsProfilingActiveFunc();
+  }
+
+  // Generates a locally unique method ID for use in code registration
+  unsigned int iJIT_GetNewMethodID(void) {
+    if (!GetNewMethodIDFunc)
+      return -1;
+    return GetNewMethodIDFunc();
+  }
+};
+
+} //namespace llvm
+
+#endif //INTEL_JIT_EVENTS_WRAPPER_H
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/ittnotify_config.h b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/ittnotify_config.h
new file mode 100644
index 000000000000..1f029fb1c45b
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/ittnotify_config.h
@@ -0,0 +1,454 @@
+/*===-- ittnotify_config.h - JIT Profiling API internal config-----*- C -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file provides Intel(R) Performance Analyzer JIT (Just-In-Time) 
+ * Profiling API internal config.
+ *
+ * NOTE: This file comes in a style different from the rest of LLVM
+ * source base since  this is a piece of code shared from Intel(R)
+ * products.  Please do not reformat / re-style this code to make
+ * subsequent merges and contributions from the original source base eaiser.
+ *
+ *===----------------------------------------------------------------------===*/
+#ifndef _ITTNOTIFY_CONFIG_H_
+#define _ITTNOTIFY_CONFIG_H_
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+#  define ITT_OS_WIN   1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+#  define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+#  define ITT_OS_MAC   3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS
+#  if defined WIN32 || defined _WIN32
+#    define ITT_OS ITT_OS_WIN
+#  elif defined( __APPLE__ ) && defined( __MACH__ )
+#    define ITT_OS ITT_OS_MAC
+#  else
+#    define ITT_OS ITT_OS_LINUX
+#  endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+#  define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+#  define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM
+#  if ITT_OS==ITT_OS_WIN
+#    define ITT_PLATFORM ITT_PLATFORM_WIN
+#  else
+#    define ITT_PLATFORM ITT_PLATFORM_POSIX
+#  endif /* _WIN32 */
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef CDECL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define CDECL __cdecl
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+#      define CDECL /* not actual on x86_64 platform */
+#    else  /* _M_X64 || _M_AMD64 || __x86_64__ */
+#      define CDECL __attribute__ ((cdecl))
+#    endif /* _M_X64 || _M_AMD64 || __x86_64__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* CDECL */
+
+#ifndef STDCALL
+#  if ITT_PLATFORM==ITT_PLATFORM_WIN
+#    define STDCALL __stdcall
+#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#    if defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+#      define STDCALL /* not supported on x86_64 platform */
+#    else  /* _M_X64 || _M_AMD64 || __x86_64__ */
+#      define STDCALL __attribute__ ((stdcall))
+#    endif /* _M_X64 || _M_AMD64 || __x86_64__ */
+#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI    CDECL
+#define LIBITTAPI CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL    CDECL
+#define LIBITTAPI_CALL CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE           __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE           static
+#else  /* __STRICT_ANSI__ */
+#define ITT_INLINE           static inline
+#endif /* __STRICT_ANSI__ */
+#define ITT_INLINE_ATTRIBUTE __attribute__ ((always_inline))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+#ifndef ITT_ARCH_IA32
+#  define ITT_ARCH_IA32  1
+#endif /* ITT_ARCH_IA32 */
+
+#ifndef ITT_ARCH_IA32E
+#  define ITT_ARCH_IA32E 2
+#endif /* ITT_ARCH_IA32E */
+
+#ifndef ITT_ARCH_IA64
+#  define ITT_ARCH_IA64  3
+#endif /* ITT_ARCH_IA64 */
+
+#ifndef ITT_ARCH
+#  if defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+#    define ITT_ARCH ITT_ARCH_IA32E
+#  elif defined _M_IA64 || defined __ia64
+#    define ITT_ARCH ITT_ARCH_IA64
+#  else
+#    define ITT_ARCH ITT_ARCH_IA32
+#  endif
+#endif
+
+#ifdef __cplusplus
+#  define ITT_EXTERN_C extern "C"
+#else
+#  define ITT_EXTERN_C /* nothing */
+#endif /* __cplusplus */
+
+#define ITT_TO_STR_AUX(x) #x
+#define ITT_TO_STR(x)     ITT_TO_STR_AUX(x)
+
+#define __ITT_BUILD_ASSERT(expr, suffix) do { \
+    static char __itt_build_check_##suffix[(expr) ? 1 : -1]; \
+    __itt_build_check_##suffix[0] = 0; \
+} while(0)
+#define _ITT_BUILD_ASSERT(expr, suffix)  __ITT_BUILD_ASSERT((expr), suffix)
+#define ITT_BUILD_ASSERT(expr)           _ITT_BUILD_ASSERT((expr), __LINE__)
+
+#define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 }
+
+/* Replace with snapshot date YYYYMMDD for promotion build. */
+#define API_VERSION_BUILD    20111111
+
+#ifndef API_VERSION_NUM
+#define API_VERSION_NUM 0.0.0
+#endif /* API_VERSION_NUM */
+
+#define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \
+                                " (" ITT_TO_STR(API_VERSION_BUILD) ")"
+
+/* OS communication functions */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <windows.h>
+typedef HMODULE           lib_t;
+typedef DWORD             TIDT;
+typedef CRITICAL_SECTION  mutex_t;
+#define MUTEX_INITIALIZER { 0 }
+#define strong_alias(name, aliasname) /* empty for Windows */
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <dlfcn.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1 /* need for PTHREAD_MUTEX_RECURSIVE */
+#endif /* _GNU_SOURCE */
+#include <pthread.h>
+typedef void*             lib_t;
+typedef pthread_t         TIDT;
+typedef pthread_mutex_t   mutex_t;
+#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+#define _strong_alias(name, aliasname) \
+            extern __typeof (name) aliasname __attribute__ ((alias (#name)));
+#define strong_alias(name, aliasname) _strong_alias(name, aliasname)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_get_proc(lib, name) GetProcAddress(lib, name)
+#define __itt_mutex_init(mutex)   InitializeCriticalSection(mutex)
+#define __itt_mutex_lock(mutex)   EnterCriticalSection(mutex)
+#define __itt_mutex_unlock(mutex) LeaveCriticalSection(mutex)
+#define __itt_load_lib(name)      LoadLibraryA(name)
+#define __itt_unload_lib(handle)  FreeLibrary(handle)
+#define __itt_system_error()      (int)GetLastError()
+#define __itt_fstrcmp(s1, s2)     lstrcmpA(s1, s2)
+#define __itt_fstrlen(s)          lstrlenA(s)
+#define __itt_fstrcpyn(s1, s2, l) lstrcpynA(s1, s2, l)
+#define __itt_fstrdup(s)          _strdup(s)
+#define __itt_thread_id()         GetCurrentThreadId()
+#define __itt_thread_yield()      SwitchToThread()
+#ifndef ITT_SIMPLE_INIT
+ITT_INLINE long 
+__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{
+    return InterlockedIncrement(ptr);
+}
+#endif /* ITT_SIMPLE_INIT */
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+#define __itt_get_proc(lib, name) dlsym(lib, name)
+#define __itt_mutex_init(mutex)   {\
+    pthread_mutexattr_t mutex_attr;                                         \
+    int error_code = pthread_mutexattr_init(&mutex_attr);                   \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutexattr_init",    \
+                           error_code);                                     \
+    error_code = pthread_mutexattr_settype(&mutex_attr,                     \
+                                           PTHREAD_MUTEX_RECURSIVE);        \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutexattr_settype", \
+                           error_code);                                     \
+    error_code = pthread_mutex_init(mutex, &mutex_attr);                    \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutex_init",        \
+                           error_code);                                     \
+    error_code = pthread_mutexattr_destroy(&mutex_attr);                    \
+    if (error_code)                                                         \
+        __itt_report_error(__itt_error_system, "pthread_mutexattr_destroy", \
+                           error_code);                                     \
+}
+#define __itt_mutex_lock(mutex)   pthread_mutex_lock(mutex)
+#define __itt_mutex_unlock(mutex) pthread_mutex_unlock(mutex)
+#define __itt_load_lib(name)      dlopen(name, RTLD_LAZY)
+#define __itt_unload_lib(handle)  dlclose(handle)
+#define __itt_system_error()      errno
+#define __itt_fstrcmp(s1, s2)     strcmp(s1, s2)
+#define __itt_fstrlen(s)          strlen(s)
+#define __itt_fstrcpyn(s1, s2, l) strncpy(s1, s2, l)
+#define __itt_fstrdup(s)          strdup(s)
+#define __itt_thread_id()         pthread_self()
+#define __itt_thread_yield()      sched_yield()
+#if ITT_ARCH==ITT_ARCH_IA64
+#ifdef __INTEL_COMPILER
+#define __TBB_machine_fetchadd4(addr, val) __fetchadd4_acq((void *)addr, val)
+#else  /* __INTEL_COMPILER */
+/* TODO: Add Support for not Intel compilers for IA64 */
+#endif /* __INTEL_COMPILER */
+#else /* ITT_ARCH!=ITT_ARCH_IA64 */
+ITT_INLINE long
+__TBB_machine_fetchadd4(volatile void* ptr, long addend) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend)
+{
+    long result;
+    __asm__ __volatile__("lock\nxadd %0,%1"
+                          : "=r"(result),"=m"(*(long*)ptr)
+                          : "0"(addend), "m"(*(long*)ptr)
+                          : "memory");
+    return result;
+}
+#endif /* ITT_ARCH==ITT_ARCH_IA64 */
+#ifndef ITT_SIMPLE_INIT
+ITT_INLINE long 
+__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{
+    return __TBB_machine_fetchadd4(ptr, 1) + 1L;
+}
+#endif /* ITT_SIMPLE_INIT */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+typedef enum {
+    __itt_collection_normal = 0,
+    __itt_collection_paused = 1
+} __itt_collection_state;
+
+typedef enum {
+    __itt_thread_normal  = 0,
+    __itt_thread_ignored = 1
+} __itt_thread_state;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_thread_info
+{
+    const char* nameA; /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+    const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else  /* UNICODE || _UNICODE */
+    void* nameW;
+#endif /* UNICODE || _UNICODE */
+    TIDT               tid;
+    __itt_thread_state state;   /*!< Thread state (paused or normal) */
+    int                extra1;  /*!< Reserved to the runtime */
+    void*              extra2;  /*!< Reserved to the runtime */
+    struct ___itt_thread_info* next;
+} __itt_thread_info;
+
+#include "ittnotify_types.h" /* For __itt_group_id definition */
+
+typedef struct ___itt_api_info_20101001
+{
+    const char*    name;
+    void**         func_ptr;
+    void*          init_func;
+    __itt_group_id group;
+}  __itt_api_info_20101001;
+
+typedef struct ___itt_api_info
+{
+    const char*    name;
+    void**         func_ptr;
+    void*          init_func;
+    void*          null_func;
+    __itt_group_id group;
+}  __itt_api_info;
+
+struct ___itt_domain;
+struct ___itt_string_handle;
+
+typedef struct ___itt_global
+{
+    unsigned char          magic[8];
+    unsigned long          version_major;
+    unsigned long          version_minor;
+    unsigned long          version_build;
+    volatile long          api_initialized;
+    volatile long          mutex_initialized;
+    volatile long          atomic_counter;
+    mutex_t                mutex;
+    lib_t                  lib;
+    void*                  error_handler;
+    const char**           dll_path_ptr;
+    __itt_api_info*        api_list_ptr;
+    struct ___itt_global*  next;
+    /* Joinable structures below */
+    __itt_thread_info*     thread_list;
+    struct ___itt_domain*  domain_list;
+    struct ___itt_string_handle* string_list;
+    __itt_collection_state state;
+} __itt_global;
+
+#pragma pack(pop)
+
+#define NEW_THREAD_INFO_W(gptr,h,h_tail,t,s,n) { \
+    h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+    if (h != NULL) { \
+        h->tid    = t; \
+        h->nameA  = NULL; \
+        h->nameW  = n ? _wcsdup(n) : NULL; \
+        h->state  = s; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->thread_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_THREAD_INFO_A(gptr,h,h_tail,t,s,n) { \
+    h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+    if (h != NULL) { \
+        h->tid    = t; \
+        h->nameA  = n ? __itt_fstrdup(n) : NULL; \
+        h->nameW  = NULL; \
+        h->state  = s; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->thread_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_DOMAIN_W(gptr,h,h_tail,name) { \
+    h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+    if (h != NULL) { \
+        h->flags  = 0;    /* domain is disabled by default */ \
+        h->nameA  = NULL; \
+        h->nameW  = name ? _wcsdup(name) : NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->domain_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_DOMAIN_A(gptr,h,h_tail,name) { \
+    h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+    if (h != NULL) { \
+        h->flags  = 0;    /* domain is disabled by default */ \
+        h->nameA  = name ? __itt_fstrdup(name) : NULL; \
+        h->nameW  = NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->domain_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_STRING_HANDLE_W(gptr,h,h_tail,name) { \
+    h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+    if (h != NULL) { \
+        h->strA   = NULL; \
+        h->strW   = name ? _wcsdup(name) : NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->string_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#define NEW_STRING_HANDLE_A(gptr,h,h_tail,name) { \
+    h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+    if (h != NULL) { \
+        h->strA   = name ? __itt_fstrdup(name) : NULL; \
+        h->strW   = NULL; \
+        h->extra1 = 0;    /* reserved */ \
+        h->extra2 = NULL; /* reserved */ \
+        h->next   = NULL; \
+        if (h_tail == NULL) \
+            (gptr)->string_list = h; \
+        else \
+            h_tail->next = h; \
+    } \
+}
+
+#endif /* _ITTNOTIFY_CONFIG_H_ */
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/ittnotify_types.h b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/ittnotify_types.h
new file mode 100644
index 000000000000..5df752f66f10
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/ittnotify_types.h
@@ -0,0 +1,70 @@
+/*===-- ittnotify_types.h - JIT Profiling API internal types--------*- C -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * NOTE: This file comes in a style different from the rest of LLVM
+ * source base since  this is a piece of code shared from Intel(R)
+ * products.  Please do not reformat / re-style this code to make
+ * subsequent merges and contributions from the original source base eaiser.
+ *
+ *===----------------------------------------------------------------------===*/
+#ifndef _ITTNOTIFY_TYPES_H_
+#define _ITTNOTIFY_TYPES_H_
+
+typedef enum ___itt_group_id
+{
+    __itt_group_none      = 0,
+    __itt_group_legacy    = 1<<0,
+    __itt_group_control   = 1<<1,
+    __itt_group_thread    = 1<<2,
+    __itt_group_mark      = 1<<3,
+    __itt_group_sync      = 1<<4,
+    __itt_group_fsync     = 1<<5,
+    __itt_group_jit       = 1<<6,
+    __itt_group_model     = 1<<7,
+    __itt_group_splitter_min = 1<<7,
+    __itt_group_counter   = 1<<8,
+    __itt_group_frame     = 1<<9,
+    __itt_group_stitch    = 1<<10,
+    __itt_group_heap      = 1<<11,
+    __itt_group_splitter_max = 1<<12,
+    __itt_group_structure = 1<<12,
+    __itt_group_suppress = 1<<13,
+    __itt_group_all       = -1
+} __itt_group_id;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_group_list
+{
+    __itt_group_id id;
+    const char*    name;
+} __itt_group_list;
+
+#pragma pack(pop)
+
+#define ITT_GROUP_LIST(varname) \
+    static __itt_group_list varname[] = {       \
+        { __itt_group_all,       "all"       }, \
+        { __itt_group_control,   "control"   }, \
+        { __itt_group_thread,    "thread"    }, \
+        { __itt_group_mark,      "mark"      }, \
+        { __itt_group_sync,      "sync"      }, \
+        { __itt_group_fsync,     "fsync"     }, \
+        { __itt_group_jit,       "jit"       }, \
+        { __itt_group_model,     "model"     }, \
+        { __itt_group_counter,   "counter"   }, \
+        { __itt_group_frame,     "frame"     }, \
+        { __itt_group_stitch,    "stitch"    }, \
+        { __itt_group_heap,      "heap"      }, \
+        { __itt_group_structure, "structure" }, \
+        { __itt_group_suppress,  "suppress"  }, \
+        { __itt_group_none,      NULL        }  \
+    }
+
+#endif /* _ITTNOTIFY_TYPES_H_ */
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
new file mode 100644
index 000000000000..7b507de864cd
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
@@ -0,0 +1,481 @@
+/*===-- jitprofiling.c - JIT (Just-In-Time) Profiling API----------*- C -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file provides Intel(R) Performance Analyzer JIT (Just-In-Time) 
+ * Profiling API implementation. 
+ *
+ * NOTE: This file comes in a style different from the rest of LLVM
+ * source base since  this is a piece of code shared from Intel(R)
+ * products.  Please do not reformat / re-style this code to make
+ * subsequent merges and contributions from the original source base eaiser.
+ *
+ *===----------------------------------------------------------------------===*/
+#include "ittnotify_config.h"
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <windows.h>
+#pragma optimize("", off)
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <pthread.h>
+#include <dlfcn.h>
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "jitprofiling.h"
+
+static const char rcsid[] = "\n@(#) $Revision: 243501 $\n";
+
+#define DLL_ENVIRONMENT_VAR             "VS_PROFILER"
+
+#ifndef NEW_DLL_ENVIRONMENT_VAR
+#if ITT_ARCH==ITT_ARCH_IA32
+#define NEW_DLL_ENVIRONMENT_VAR	        "INTEL_JIT_PROFILER32"
+#else
+#define NEW_DLL_ENVIRONMENT_VAR	        "INTEL_JIT_PROFILER64"
+#endif
+#endif /* NEW_DLL_ENVIRONMENT_VAR */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define DEFAULT_DLLNAME                 "JitPI.dll"
+HINSTANCE m_libHandle = NULL;
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define DEFAULT_DLLNAME                 "libJitPI.so"
+void* m_libHandle = NULL;
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/* default location of JIT profiling agent on Android */
+#define ANDROID_JIT_AGENT_PATH  "/data/intel/libittnotify.so"
+
+/* the function pointers */
+typedef unsigned int(*TPInitialize)(void);
+static TPInitialize FUNC_Initialize=NULL;
+
+typedef unsigned int(*TPNotify)(unsigned int, void*);
+static TPNotify FUNC_NotifyEvent=NULL;
+
+static iJIT_IsProfilingActiveFlags executionMode = iJIT_NOTHING_RUNNING;
+
+/* end collector dll part. */
+
+/* loadiJIT_Funcs() : this function is called just in the beginning 
+ *  and is responsible to load the functions from BistroJavaCollector.dll
+ * result:
+ *  on success: the functions loads, iJIT_DLL_is_missing=0, return value = 1
+ *  on failure: the functions are NULL, iJIT_DLL_is_missing=1, return value = 0
+ */ 
+static int loadiJIT_Funcs(void);
+
+/* global representing whether the BistroJavaCollector can't be loaded */
+static int iJIT_DLL_is_missing = 0;
+
+/* Virtual stack - the struct is used as a virtual stack for each thread.
+ * Every thread initializes with a stack of size INIT_TOP_STACK.
+ * Every method entry decreases from the current stack point,
+ * and when a thread stack reaches its top of stack (return from the global 
+ * function), the top of stack and the current stack increase. Notice that 
+ * when returning from a function the stack pointer is the address of 
+ * the function return.
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static DWORD threadLocalStorageHandle = 0;
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static pthread_key_t threadLocalStorageHandle = (pthread_key_t)0;
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#define INIT_TOP_Stack 10000
+
+typedef struct 
+{
+    unsigned int TopStack;
+    unsigned int CurrentStack;
+} ThreadStack, *pThreadStack;
+
+/* end of virtual stack. */
+
+/*
+ * The function for reporting virtual-machine related events to VTune.
+ * Note: when reporting iJVM_EVENT_TYPE_ENTER_NIDS, there is no need to fill 
+ * in the stack_id field in the iJIT_Method_NIDS structure, as VTune fills it.
+ * The return value in iJVM_EVENT_TYPE_ENTER_NIDS && 
+ * iJVM_EVENT_TYPE_LEAVE_NIDS events will be 0 in case of failure.
+ * in iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED event 
+ * it will be -1 if EventSpecificData == 0 otherwise it will be 0.
+*/
+
+ITT_EXTERN_C int JITAPI 
+iJIT_NotifyEvent(iJIT_JVM_EVENT event_type, void *EventSpecificData)
+{
+    int ReturnValue;
+
+    /*
+     * This section is for debugging outside of VTune. 
+     * It creates the environment variables that indicates call graph mode.
+     * If running outside of VTune remove the remark.
+     *
+     *
+     * static int firstTime = 1;
+     * char DoCallGraph[12] = "DoCallGraph";
+     * if (firstTime)
+     * {
+     * firstTime = 0;
+     * SetEnvironmentVariable( "BISTRO_COLLECTORS_DO_CALLGRAPH", DoCallGraph);
+     * }
+     *
+     * end of section.
+    */
+
+    /* initialization part - the functions have not been loaded yet. This part
+     *        will load the functions, and check if we are in Call Graph mode. 
+     *        (for special treatment).
+     */
+    if (!FUNC_NotifyEvent) 
+    {
+        if (iJIT_DLL_is_missing) 
+            return 0;
+
+        /* load the Function from the DLL */
+        if (!loadiJIT_Funcs()) 
+            return 0;
+
+        /* Call Graph initialization. */
+    }
+
+    /* If the event is method entry/exit, check that in the current mode 
+     * VTune is allowed to receive it
+     */
+    if ((event_type == iJVM_EVENT_TYPE_ENTER_NIDS || 
+         event_type == iJVM_EVENT_TYPE_LEAVE_NIDS) &&
+        (executionMode != iJIT_CALLGRAPH_ON))
+    {
+        return 0;
+    }
+    /* This section is performed when method enter event occurs.
+     * It updates the virtual stack, or creates it if this is the first 
+     * method entry in the thread. The stack pointer is decreased.
+     */
+    if (event_type == iJVM_EVENT_TYPE_ENTER_NIDS)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        pThreadStack threadStack = 
+            (pThreadStack)TlsGetValue (threadLocalStorageHandle);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        pThreadStack threadStack = 
+            (pThreadStack)pthread_getspecific(threadLocalStorageHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+        /* check for use of reserved method IDs */
+        if ( ((piJIT_Method_NIDS) EventSpecificData)->method_id <= 999 )
+            return 0;
+
+        if (!threadStack)
+        {
+            /* initialize the stack. */
+            threadStack = (pThreadStack) calloc (sizeof(ThreadStack), 1);
+            threadStack->TopStack = INIT_TOP_Stack;
+            threadStack->CurrentStack = INIT_TOP_Stack;
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+            TlsSetValue(threadLocalStorageHandle,(void*)threadStack);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+            pthread_setspecific(threadLocalStorageHandle,(void*)threadStack);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        }
+
+        /* decrease the stack. */
+        ((piJIT_Method_NIDS) EventSpecificData)->stack_id = 
+            (threadStack->CurrentStack)--;
+    }
+
+    /* This section is performed when method leave event occurs
+     * It updates the virtual stack.
+     *    Increases the stack pointer.
+     *    If the stack pointer reached the top (left the global function)
+     *        increase the pointer and the top pointer.
+     */
+    if (event_type == iJVM_EVENT_TYPE_LEAVE_NIDS)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        pThreadStack threadStack = 
+           (pThreadStack)TlsGetValue (threadLocalStorageHandle);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        pThreadStack threadStack = 
+            (pThreadStack)pthread_getspecific(threadLocalStorageHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+        /* check for use of reserved method IDs */
+        if ( ((piJIT_Method_NIDS) EventSpecificData)->method_id <= 999 )
+            return 0;
+
+        if (!threadStack)
+        {
+            /* Error: first report in this thread is method exit */
+            exit (1);
+        }
+
+        ((piJIT_Method_NIDS) EventSpecificData)->stack_id = 
+            ++(threadStack->CurrentStack) + 1;
+
+        if (((piJIT_Method_NIDS) EventSpecificData)->stack_id 
+               > threadStack->TopStack)
+            ((piJIT_Method_NIDS) EventSpecificData)->stack_id = 
+                (unsigned int)-1;
+    }
+
+    if (event_type == iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED)
+    {
+        /* check for use of reserved method IDs */
+        if ( ((piJIT_Method_Load) EventSpecificData)->method_id <= 999 )
+            return 0;
+    }
+
+    ReturnValue = (int)FUNC_NotifyEvent(event_type, EventSpecificData);   
+
+    return ReturnValue;
+}
+
+/* The new mode call back routine */
+ITT_EXTERN_C void JITAPI 
+iJIT_RegisterCallbackEx(void *userdata, iJIT_ModeChangedEx 
+                        NewModeCallBackFuncEx) 
+{
+    /* is it already missing... or the load of functions from the DLL failed */
+    if (iJIT_DLL_is_missing || !loadiJIT_Funcs())
+    {
+        /* then do not bother with notifications */
+        NewModeCallBackFuncEx(userdata, iJIT_NO_NOTIFICATIONS);  
+        /* Error: could not load JIT functions. */
+        return;
+    }
+    /* nothing to do with the callback */
+}
+
+/*
+ * This function allows the user to query in which mode, if at all, 
+ *VTune is running
+ */
+ITT_EXTERN_C iJIT_IsProfilingActiveFlags JITAPI iJIT_IsProfilingActive()
+{
+    if (!iJIT_DLL_is_missing)
+    {
+        loadiJIT_Funcs();
+    }
+
+    return executionMode;
+}
+
+/* this function loads the collector dll (BistroJavaCollector) 
+ * and the relevant functions.
+ * on success: all functions load,     iJIT_DLL_is_missing = 0, return value = 1
+ * on failure: all functions are NULL, iJIT_DLL_is_missing = 1, return value = 0
+ */ 
+static int loadiJIT_Funcs()
+{
+    static int bDllWasLoaded = 0;
+    char *dllName = (char*)rcsid; /* !! Just to avoid unused code elimination */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+    DWORD dNameLength = 0;
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+    if(bDllWasLoaded)
+    {
+        /* dll was already loaded, no need to do it for the second time */
+        return 1;
+    }
+
+    /* Assumes that the DLL will not be found */
+    iJIT_DLL_is_missing = 1;
+    FUNC_NotifyEvent = NULL;
+
+    if (m_libHandle) 
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        FreeLibrary(m_libHandle);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        dlclose(m_libHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        m_libHandle = NULL;
+    }
+
+    /* Try to get the dll name from the environment */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+    dNameLength = GetEnvironmentVariableA(NEW_DLL_ENVIRONMENT_VAR, NULL, 0);
+    if (dNameLength)
+    {
+        DWORD envret = 0;
+        dllName = (char*)malloc(sizeof(char) * (dNameLength + 1));
+        envret = GetEnvironmentVariableA(NEW_DLL_ENVIRONMENT_VAR, 
+                                         dllName, dNameLength);
+        if (envret)
+        {
+            /* Try to load the dll from the PATH... */
+            m_libHandle = LoadLibraryExA(dllName, 
+                                         NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
+        }
+        free(dllName);
+    } else {
+        /* Try to use old VS_PROFILER variable */
+        dNameLength = GetEnvironmentVariableA(DLL_ENVIRONMENT_VAR, NULL, 0);
+        if (dNameLength)
+        {
+            DWORD envret = 0;
+            dllName = (char*)malloc(sizeof(char) * (dNameLength + 1));
+            envret = GetEnvironmentVariableA(DLL_ENVIRONMENT_VAR, 
+                                             dllName, dNameLength);
+            if (envret)
+            {
+                /* Try to load the dll from the PATH... */
+                m_libHandle = LoadLibraryA(dllName);
+            }
+            free(dllName);
+        }
+    }
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    dllName = getenv(NEW_DLL_ENVIRONMENT_VAR);
+    if (!dllName)
+        dllName = getenv(DLL_ENVIRONMENT_VAR);
+#ifdef ANDROID
+    if (!dllName)
+        dllName = ANDROID_JIT_AGENT_PATH;
+#endif
+    if (dllName)
+    {
+        /* Try to load the dll from the PATH... */
+        m_libHandle = dlopen(dllName, RTLD_LAZY);
+    }
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+    if (!m_libHandle)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        m_libHandle = LoadLibraryA(DEFAULT_DLLNAME);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        m_libHandle = dlopen(DEFAULT_DLLNAME, RTLD_LAZY);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    }
+
+    /* if the dll wasn't loaded - exit. */
+    if (!m_libHandle)
+    {
+        iJIT_DLL_is_missing = 1; /* don't try to initialize 
+                                  * JIT agent the second time 
+                                  */
+        return 0;
+    }
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+    FUNC_NotifyEvent = (TPNotify)GetProcAddress(m_libHandle, "NotifyEvent");
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    FUNC_NotifyEvent = (TPNotify)dlsym(m_libHandle, "NotifyEvent");
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    if (!FUNC_NotifyEvent) 
+    {
+        FUNC_Initialize = NULL;
+        return 0;
+    }
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+    FUNC_Initialize = (TPInitialize)GetProcAddress(m_libHandle, "Initialize");
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    FUNC_Initialize = (TPInitialize)dlsym(m_libHandle, "Initialize");
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    if (!FUNC_Initialize) 
+    {
+        FUNC_NotifyEvent = NULL;
+        return 0;
+    }
+
+    executionMode = (iJIT_IsProfilingActiveFlags)FUNC_Initialize();
+
+    bDllWasLoaded = 1;
+    iJIT_DLL_is_missing = 0; /* DLL is ok. */
+
+    /*
+     * Call Graph mode: init the thread local storage
+     * (need to store the virtual stack there).
+     */
+    if ( executionMode == iJIT_CALLGRAPH_ON )
+    {
+        /* Allocate a thread local storage slot for the thread "stack" */
+        if (!threadLocalStorageHandle)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+            threadLocalStorageHandle = TlsAlloc();
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        pthread_key_create(&threadLocalStorageHandle, NULL);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    }
+
+    return 1;
+}
+
+/*
+ * This function should be called by the user whenever a thread ends, 
+ * to free the thread "virtual stack" storage
+ */
+ITT_EXTERN_C void JITAPI FinalizeThread()
+{
+    if (threadLocalStorageHandle)
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        pThreadStack threadStack = 
+            (pThreadStack)TlsGetValue (threadLocalStorageHandle);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        pThreadStack threadStack = 
+            (pThreadStack)pthread_getspecific(threadLocalStorageHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        if (threadStack)
+        {
+            free (threadStack);
+            threadStack = NULL;
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+            TlsSetValue (threadLocalStorageHandle, threadStack);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+            pthread_setspecific(threadLocalStorageHandle, threadStack);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        }
+    }
+}
+
+/*
+ * This function should be called by the user when the process ends, 
+ * to free the local storage index
+*/
+ITT_EXTERN_C void JITAPI FinalizeProcess()
+{
+    if (m_libHandle) 
+    {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        FreeLibrary(m_libHandle);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        dlclose(m_libHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+        m_libHandle = NULL;
+    }
+
+    if (threadLocalStorageHandle)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+        TlsFree (threadLocalStorageHandle);
+#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+    pthread_key_delete(threadLocalStorageHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+}
+
+/*
+ * This function should be called by the user for any method once.
+ * The function will return a unique method ID, the user should maintain 
+ * the ID for each method
+ */
+ITT_EXTERN_C unsigned int JITAPI iJIT_GetNewMethodID()
+{
+    static unsigned int methodID = 0x100000;
+
+    if (methodID == 0)
+        return 0;  /* ERROR : this is not a valid value */
+
+    return methodID++;
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.h b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.h
new file mode 100644
index 000000000000..f08e2870dcef
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.h
@@ -0,0 +1,259 @@
+/*===-- jitprofiling.h - JIT Profiling API-------------------------*- C -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file provides Intel(R) Performance Analyzer JIT (Just-In-Time) 
+ * Profiling API declaration.
+ *
+ * NOTE: This file comes in a style different from the rest of LLVM
+ * source base since  this is a piece of code shared from Intel(R)
+ * products.  Please do not reformat / re-style this code to make
+ * subsequent merges and contributions from the original source base eaiser.
+ *
+ *===----------------------------------------------------------------------===*/
+#ifndef __JITPROFILING_H__
+#define __JITPROFILING_H__
+
+/*
+ * Various constants used by functions
+ */
+
+/* event notification */
+typedef enum iJIT_jvm_event
+{
+
+    /* shutdown  */
+    
+    /* 
+     * Program exiting EventSpecificData NA
+     */
+    iJVM_EVENT_TYPE_SHUTDOWN = 2, 
+
+    /* JIT profiling  */
+    
+    /* 
+     * issued after method code jitted into memory but before code is executed
+     * EventSpecificData is an iJIT_Method_Load
+     */
+    iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED=13,     
+
+    /* issued before unload. Method code will no longer be executed, but code 
+     * and info are still in memory. The VTune profiler may capture method 
+     * code only at this point EventSpecificData is iJIT_Method_Id
+     */
+    iJVM_EVENT_TYPE_METHOD_UNLOAD_START,         
+
+    /* Method Profiling */
+
+    /* method name, Id and stack is supplied 
+     * issued when a method is about to be entered EventSpecificData is 
+     * iJIT_Method_NIDS
+     */
+    iJVM_EVENT_TYPE_ENTER_NIDS = 19, 
+
+    /* method name, Id and stack is supplied 
+     * issued when a method is about to be left EventSpecificData is 
+     * iJIT_Method_NIDS
+     */
+    iJVM_EVENT_TYPE_LEAVE_NIDS               
+} iJIT_JVM_EVENT;
+
+typedef enum _iJIT_ModeFlags
+{
+    /* No need to Notify VTune, since VTune is not running */
+    iJIT_NO_NOTIFICATIONS          = 0x0000,     
+
+    /* when turned on the jit must call 
+     * iJIT_NotifyEvent
+     * (
+     *     iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
+     * )
+     * for all the method already jitted
+     */
+    iJIT_BE_NOTIFY_ON_LOAD         = 0x0001,     
+
+    /* when turned on the jit must call
+     * iJIT_NotifyEvent
+     * (
+     *     iJVM_EVENT_TYPE_METHOD_UNLOAD_FINISHED,
+     *  ) for all the method that are unloaded
+     */
+    iJIT_BE_NOTIFY_ON_UNLOAD       = 0x0002,     
+
+    /* when turned on the jit must instrument all
+     * the currently jited code with calls on
+     * method entries
+     */
+    iJIT_BE_NOTIFY_ON_METHOD_ENTRY = 0x0004,     
+
+    /* when turned on the jit must instrument all
+     * the currently jited code with calls
+     * on method exit
+     */
+    iJIT_BE_NOTIFY_ON_METHOD_EXIT  = 0x0008      
+
+} iJIT_ModeFlags;
+
+
+ /* Flags used by iJIT_IsProfilingActive() */
+typedef enum _iJIT_IsProfilingActiveFlags
+{
+    /* No profiler is running. Currently not used */
+    iJIT_NOTHING_RUNNING           = 0x0000,     
+
+    /* Sampling is running. This is the default value
+     * returned by iJIT_IsProfilingActive()
+     */
+    iJIT_SAMPLING_ON               = 0x0001,     
+    
+      /* Call Graph is running */
+    iJIT_CALLGRAPH_ON              = 0x0002
+
+} iJIT_IsProfilingActiveFlags;
+
+/* Enumerator for the environment of methods*/
+typedef enum _iJDEnvironmentType
+{
+    iJDE_JittingAPI = 2
+} iJDEnvironmentType;
+
+/**********************************
+ * Data structures for the events *
+ **********************************/
+
+/* structure for the events:
+ * iJVM_EVENT_TYPE_METHOD_UNLOAD_START
+ */
+
+typedef struct _iJIT_Method_Id
+{
+   /* Id of the method (same as the one passed in
+   * the iJIT_Method_Load struct
+   */
+    unsigned int       method_id;              
+
+} *piJIT_Method_Id, iJIT_Method_Id;
+
+
+/* structure for the events:
+ * iJVM_EVENT_TYPE_ENTER_NIDS,
+ * iJVM_EVENT_TYPE_LEAVE_NIDS,
+ * iJVM_EVENT_TYPE_EXCEPTION_OCCURRED_NIDS
+ */
+
+typedef struct _iJIT_Method_NIDS
+{
+    /* unique method ID */
+    unsigned int       method_id;              
+
+    /* NOTE: no need to fill this field, it's filled by VTune */
+    unsigned int       stack_id;               
+
+    /* method name (just the method, without the class) */
+    char*              method_name;            
+} *piJIT_Method_NIDS, iJIT_Method_NIDS;
+
+/* structures for the events:
+ * iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED
+ */
+
+typedef struct _LineNumberInfo
+{
+    /* x86 Offset from the begining of the method*/
+    unsigned int        Offset;                 
+    
+    /* source line number from the begining of the source file */
+    unsigned int        LineNumber;             
+
+} *pLineNumberInfo, LineNumberInfo;
+
+typedef struct _iJIT_Method_Load
+{
+    /* unique method ID - can be any unique value, (except 0 - 999) */
+    unsigned int        method_id;              
+
+    /* method name (can be with or without the class and signature, in any case
+     * the class name will be added to it)
+     */
+    char*               method_name;            
+
+    /* virtual address of that method - This determines the method range for the
+     * iJVM_EVENT_TYPE_ENTER/LEAVE_METHOD_ADDR events
+     */
+    void*               method_load_address;    
+
+    /* Size in memory - Must be exact */
+    unsigned int        method_size;            
+
+    /* Line Table size in number of entries - Zero if none */
+    unsigned int        line_number_size;       
+    
+    /* Pointer to the begining of the line numbers info array */
+    pLineNumberInfo     line_number_table;      
+
+    /* unique class ID */
+    unsigned int        class_id;               
+    
+    /* class file name */
+    char*               class_file_name;        
+
+    /* source file name */
+    char*               source_file_name;       
+
+    /* bits supplied by the user for saving in the JIT file */
+    void*               user_data;              
+
+    /* the size of the user data buffer */
+    unsigned int        user_data_size;         
+
+    /* NOTE: no need to fill this field, it's filled by VTune */
+    iJDEnvironmentType  env;                    
+
+} *piJIT_Method_Load, iJIT_Method_Load;
+
+/* API Functions */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef CDECL
+#  if defined WIN32 || defined _WIN32
+#    define CDECL __cdecl
+#  else /* defined WIN32 || defined _WIN32 */
+#    if defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+#      define CDECL /* not actual on x86_64 platform */
+#    else  /* _M_X64 || _M_AMD64 || __x86_64__ */
+#      define CDECL __attribute__ ((cdecl))
+#    endif /* _M_X64 || _M_AMD64 || __x86_64__ */
+#  endif /* defined WIN32 || defined _WIN32 */
+#endif /* CDECL */
+
+#define JITAPI CDECL
+
+/* called when the settings are changed with new settings */
+typedef void (*iJIT_ModeChangedEx)(void *UserData, iJIT_ModeFlags Flags);
+
+int JITAPI iJIT_NotifyEvent(iJIT_JVM_EVENT event_type, void *EventSpecificData);
+
+/* The new mode call back routine */
+void JITAPI iJIT_RegisterCallbackEx(void *userdata, 
+                                    iJIT_ModeChangedEx NewModeCallBackFuncEx);
+
+iJIT_IsProfilingActiveFlags JITAPI iJIT_IsProfilingActive(void);
+
+void JITAPI FinalizeThread(void);
+
+void JITAPI FinalizeProcess(void);
+
+unsigned int JITAPI iJIT_GetNewMethodID(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __JITPROFILING_H__ */
diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
new file mode 100644
index 000000000000..526c04e082d2
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -0,0 +1,1396 @@
+//===-- Execution.cpp - Implement code to simulate the program ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file contains the actual instruction interpreter.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "interpreter"
+#include "Interpreter.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+STATISTIC(NumDynamicInsts, "Number of dynamic instructions executed");
+
+static cl::opt<bool> PrintVolatile("interpreter-print-volatile", cl::Hidden,
+          cl::desc("make the interpreter print every volatile load and store"));
+
+//===----------------------------------------------------------------------===//
+//                     Various Helper Functions
+//===----------------------------------------------------------------------===//
+
+static void SetValue(Value *V, GenericValue Val, ExecutionContext &SF) {
+  SF.Values[V] = Val;
+}
+
+//===----------------------------------------------------------------------===//
+//                    Binary Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+#define IMPLEMENT_BINARY_OPERATOR(OP, TY) \
+   case Type::TY##TyID: \
+     Dest.TY##Val = Src1.TY##Val OP Src2.TY##Val; \
+     break
+
+static void executeFAddInst(GenericValue &Dest, GenericValue Src1,
+                            GenericValue Src2, Type *Ty) {
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_BINARY_OPERATOR(+, Float);
+    IMPLEMENT_BINARY_OPERATOR(+, Double);
+  default:
+    dbgs() << "Unhandled type for FAdd instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+}
+
+static void executeFSubInst(GenericValue &Dest, GenericValue Src1,
+                            GenericValue Src2, Type *Ty) {
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_BINARY_OPERATOR(-, Float);
+    IMPLEMENT_BINARY_OPERATOR(-, Double);
+  default:
+    dbgs() << "Unhandled type for FSub instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+}
+
+static void executeFMulInst(GenericValue &Dest, GenericValue Src1,
+                            GenericValue Src2, Type *Ty) {
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_BINARY_OPERATOR(*, Float);
+    IMPLEMENT_BINARY_OPERATOR(*, Double);
+  default:
+    dbgs() << "Unhandled type for FMul instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+}
+
+static void executeFDivInst(GenericValue &Dest, GenericValue Src1, 
+                            GenericValue Src2, Type *Ty) {
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_BINARY_OPERATOR(/, Float);
+    IMPLEMENT_BINARY_OPERATOR(/, Double);
+  default:
+    dbgs() << "Unhandled type for FDiv instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+}
+
+static void executeFRemInst(GenericValue &Dest, GenericValue Src1, 
+                            GenericValue Src2, Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+    Dest.FloatVal = fmod(Src1.FloatVal, Src2.FloatVal);
+    break;
+  case Type::DoubleTyID:
+    Dest.DoubleVal = fmod(Src1.DoubleVal, Src2.DoubleVal);
+    break;
+  default:
+    dbgs() << "Unhandled type for Rem instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+}
+
+#define IMPLEMENT_INTEGER_ICMP(OP, TY) \
+   case Type::IntegerTyID:  \
+      Dest.IntVal = APInt(1,Src1.IntVal.OP(Src2.IntVal)); \
+      break;
+
+// Handle pointers specially because they must be compared with only as much
+// width as the host has.  We _do not_ want to be comparing 64 bit values when
+// running on a 32-bit target, otherwise the upper 32 bits might mess up
+// comparisons if they contain garbage.
+#define IMPLEMENT_POINTER_ICMP(OP) \
+   case Type::PointerTyID: \
+      Dest.IntVal = APInt(1,(void*)(intptr_t)Src1.PointerVal OP \
+                            (void*)(intptr_t)Src2.PointerVal); \
+      break;
+
+static GenericValue executeICMP_EQ(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(eq,Ty);
+    IMPLEMENT_POINTER_ICMP(==);
+  default:
+    dbgs() << "Unhandled type for ICMP_EQ predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeICMP_NE(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(ne,Ty);
+    IMPLEMENT_POINTER_ICMP(!=);
+  default:
+    dbgs() << "Unhandled type for ICMP_NE predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeICMP_ULT(GenericValue Src1, GenericValue Src2,
+                                    Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(ult,Ty);
+    IMPLEMENT_POINTER_ICMP(<);
+  default:
+    dbgs() << "Unhandled type for ICMP_ULT predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeICMP_SLT(GenericValue Src1, GenericValue Src2,
+                                    Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(slt,Ty);
+    IMPLEMENT_POINTER_ICMP(<);
+  default:
+    dbgs() << "Unhandled type for ICMP_SLT predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeICMP_UGT(GenericValue Src1, GenericValue Src2,
+                                    Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(ugt,Ty);
+    IMPLEMENT_POINTER_ICMP(>);
+  default:
+    dbgs() << "Unhandled type for ICMP_UGT predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeICMP_SGT(GenericValue Src1, GenericValue Src2,
+                                    Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(sgt,Ty);
+    IMPLEMENT_POINTER_ICMP(>);
+  default:
+    dbgs() << "Unhandled type for ICMP_SGT predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeICMP_ULE(GenericValue Src1, GenericValue Src2,
+                                    Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(ule,Ty);
+    IMPLEMENT_POINTER_ICMP(<=);
+  default:
+    dbgs() << "Unhandled type for ICMP_ULE predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeICMP_SLE(GenericValue Src1, GenericValue Src2,
+                                    Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(sle,Ty);
+    IMPLEMENT_POINTER_ICMP(<=);
+  default:
+    dbgs() << "Unhandled type for ICMP_SLE predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeICMP_UGE(GenericValue Src1, GenericValue Src2,
+                                    Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(uge,Ty);
+    IMPLEMENT_POINTER_ICMP(>=);
+  default:
+    dbgs() << "Unhandled type for ICMP_UGE predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeICMP_SGE(GenericValue Src1, GenericValue Src2,
+                                    Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_INTEGER_ICMP(sge,Ty);
+    IMPLEMENT_POINTER_ICMP(>=);
+  default:
+    dbgs() << "Unhandled type for ICMP_SGE predicate: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+void Interpreter::visitICmpInst(ICmpInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  Type *Ty    = I.getOperand(0)->getType();
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue R;   // Result
+  
+  switch (I.getPredicate()) {
+  case ICmpInst::ICMP_EQ:  R = executeICMP_EQ(Src1,  Src2, Ty); break;
+  case ICmpInst::ICMP_NE:  R = executeICMP_NE(Src1,  Src2, Ty); break;
+  case ICmpInst::ICMP_ULT: R = executeICMP_ULT(Src1, Src2, Ty); break;
+  case ICmpInst::ICMP_SLT: R = executeICMP_SLT(Src1, Src2, Ty); break;
+  case ICmpInst::ICMP_UGT: R = executeICMP_UGT(Src1, Src2, Ty); break;
+  case ICmpInst::ICMP_SGT: R = executeICMP_SGT(Src1, Src2, Ty); break;
+  case ICmpInst::ICMP_ULE: R = executeICMP_ULE(Src1, Src2, Ty); break;
+  case ICmpInst::ICMP_SLE: R = executeICMP_SLE(Src1, Src2, Ty); break;
+  case ICmpInst::ICMP_UGE: R = executeICMP_UGE(Src1, Src2, Ty); break;
+  case ICmpInst::ICMP_SGE: R = executeICMP_SGE(Src1, Src2, Ty); break;
+  default:
+    dbgs() << "Don't know how to handle this ICmp predicate!\n-->" << I;
+    llvm_unreachable(0);
+  }
+ 
+  SetValue(&I, R, SF);
+}
+
+#define IMPLEMENT_FCMP(OP, TY) \
+   case Type::TY##TyID: \
+     Dest.IntVal = APInt(1,Src1.TY##Val OP Src2.TY##Val); \
+     break
+
+static GenericValue executeFCMP_OEQ(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_FCMP(==, Float);
+    IMPLEMENT_FCMP(==, Double);
+  default:
+    dbgs() << "Unhandled type for FCmp EQ instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeFCMP_ONE(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_FCMP(!=, Float);
+    IMPLEMENT_FCMP(!=, Double);
+
+  default:
+    dbgs() << "Unhandled type for FCmp NE instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeFCMP_OLE(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_FCMP(<=, Float);
+    IMPLEMENT_FCMP(<=, Double);
+  default:
+    dbgs() << "Unhandled type for FCmp LE instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeFCMP_OGE(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_FCMP(>=, Float);
+    IMPLEMENT_FCMP(>=, Double);
+  default:
+    dbgs() << "Unhandled type for FCmp GE instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeFCMP_OLT(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_FCMP(<, Float);
+    IMPLEMENT_FCMP(<, Double);
+  default:
+    dbgs() << "Unhandled type for FCmp LT instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+static GenericValue executeFCMP_OGT(GenericValue Src1, GenericValue Src2,
+                                     Type *Ty) {
+  GenericValue Dest;
+  switch (Ty->getTypeID()) {
+    IMPLEMENT_FCMP(>, Float);
+    IMPLEMENT_FCMP(>, Double);
+  default:
+    dbgs() << "Unhandled type for FCmp GT instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+  return Dest;
+}
+
+#define IMPLEMENT_UNORDERED(TY, X,Y)                                     \
+  if (TY->isFloatTy()) {                                                 \
+    if (X.FloatVal != X.FloatVal || Y.FloatVal != Y.FloatVal) {          \
+      Dest.IntVal = APInt(1,true);                                       \
+      return Dest;                                                       \
+    }                                                                    \
+  } else if (X.DoubleVal != X.DoubleVal || Y.DoubleVal != Y.DoubleVal) { \
+    Dest.IntVal = APInt(1,true);                                         \
+    return Dest;                                                         \
+  }
+
+
+static GenericValue executeFCMP_UEQ(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+  return executeFCMP_OEQ(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_UNE(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+  return executeFCMP_ONE(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_ULE(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+  return executeFCMP_OLE(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_UGE(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+  return executeFCMP_OGE(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_ULT(GenericValue Src1, GenericValue Src2,
+                                   Type *Ty) {
+  GenericValue Dest;
+  IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+  return executeFCMP_OLT(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_UGT(GenericValue Src1, GenericValue Src2,
+                                     Type *Ty) {
+  GenericValue Dest;
+  IMPLEMENT_UNORDERED(Ty, Src1, Src2)
+  return executeFCMP_OGT(Src1, Src2, Ty);
+}
+
+static GenericValue executeFCMP_ORD(GenericValue Src1, GenericValue Src2,
+                                     Type *Ty) {
+  GenericValue Dest;
+  if (Ty->isFloatTy())
+    Dest.IntVal = APInt(1,(Src1.FloatVal == Src1.FloatVal && 
+                           Src2.FloatVal == Src2.FloatVal));
+  else
+    Dest.IntVal = APInt(1,(Src1.DoubleVal == Src1.DoubleVal && 
+                           Src2.DoubleVal == Src2.DoubleVal));
+  return Dest;
+}
+
+static GenericValue executeFCMP_UNO(GenericValue Src1, GenericValue Src2,
+                                     Type *Ty) {
+  GenericValue Dest;
+  if (Ty->isFloatTy())
+    Dest.IntVal = APInt(1,(Src1.FloatVal != Src1.FloatVal || 
+                           Src2.FloatVal != Src2.FloatVal));
+  else
+    Dest.IntVal = APInt(1,(Src1.DoubleVal != Src1.DoubleVal || 
+                           Src2.DoubleVal != Src2.DoubleVal));
+  return Dest;
+}
+
+void Interpreter::visitFCmpInst(FCmpInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  Type *Ty    = I.getOperand(0)->getType();
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue R;   // Result
+  
+  switch (I.getPredicate()) {
+  case FCmpInst::FCMP_FALSE: R.IntVal = APInt(1,false); break;
+  case FCmpInst::FCMP_TRUE:  R.IntVal = APInt(1,true); break;
+  case FCmpInst::FCMP_ORD:   R = executeFCMP_ORD(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_UNO:   R = executeFCMP_UNO(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_UEQ:   R = executeFCMP_UEQ(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_OEQ:   R = executeFCMP_OEQ(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_UNE:   R = executeFCMP_UNE(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_ONE:   R = executeFCMP_ONE(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_ULT:   R = executeFCMP_ULT(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_OLT:   R = executeFCMP_OLT(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_UGT:   R = executeFCMP_UGT(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_OGT:   R = executeFCMP_OGT(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_ULE:   R = executeFCMP_ULE(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_OLE:   R = executeFCMP_OLE(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_UGE:   R = executeFCMP_UGE(Src1, Src2, Ty); break;
+  case FCmpInst::FCMP_OGE:   R = executeFCMP_OGE(Src1, Src2, Ty); break;
+  default:
+    dbgs() << "Don't know how to handle this FCmp predicate!\n-->" << I;
+    llvm_unreachable(0);
+  }
+ 
+  SetValue(&I, R, SF);
+}
+
+static GenericValue executeCmpInst(unsigned predicate, GenericValue Src1, 
+                                   GenericValue Src2, Type *Ty) {
+  GenericValue Result;
+  switch (predicate) {
+  case ICmpInst::ICMP_EQ:    return executeICMP_EQ(Src1, Src2, Ty);
+  case ICmpInst::ICMP_NE:    return executeICMP_NE(Src1, Src2, Ty);
+  case ICmpInst::ICMP_UGT:   return executeICMP_UGT(Src1, Src2, Ty);
+  case ICmpInst::ICMP_SGT:   return executeICMP_SGT(Src1, Src2, Ty);
+  case ICmpInst::ICMP_ULT:   return executeICMP_ULT(Src1, Src2, Ty);
+  case ICmpInst::ICMP_SLT:   return executeICMP_SLT(Src1, Src2, Ty);
+  case ICmpInst::ICMP_UGE:   return executeICMP_UGE(Src1, Src2, Ty);
+  case ICmpInst::ICMP_SGE:   return executeICMP_SGE(Src1, Src2, Ty);
+  case ICmpInst::ICMP_ULE:   return executeICMP_ULE(Src1, Src2, Ty);
+  case ICmpInst::ICMP_SLE:   return executeICMP_SLE(Src1, Src2, Ty);
+  case FCmpInst::FCMP_ORD:   return executeFCMP_ORD(Src1, Src2, Ty);
+  case FCmpInst::FCMP_UNO:   return executeFCMP_UNO(Src1, Src2, Ty);
+  case FCmpInst::FCMP_OEQ:   return executeFCMP_OEQ(Src1, Src2, Ty);
+  case FCmpInst::FCMP_UEQ:   return executeFCMP_UEQ(Src1, Src2, Ty);
+  case FCmpInst::FCMP_ONE:   return executeFCMP_ONE(Src1, Src2, Ty);
+  case FCmpInst::FCMP_UNE:   return executeFCMP_UNE(Src1, Src2, Ty);
+  case FCmpInst::FCMP_OLT:   return executeFCMP_OLT(Src1, Src2, Ty);
+  case FCmpInst::FCMP_ULT:   return executeFCMP_ULT(Src1, Src2, Ty);
+  case FCmpInst::FCMP_OGT:   return executeFCMP_OGT(Src1, Src2, Ty);
+  case FCmpInst::FCMP_UGT:   return executeFCMP_UGT(Src1, Src2, Ty);
+  case FCmpInst::FCMP_OLE:   return executeFCMP_OLE(Src1, Src2, Ty);
+  case FCmpInst::FCMP_ULE:   return executeFCMP_ULE(Src1, Src2, Ty);
+  case FCmpInst::FCMP_OGE:   return executeFCMP_OGE(Src1, Src2, Ty);
+  case FCmpInst::FCMP_UGE:   return executeFCMP_UGE(Src1, Src2, Ty);
+  case FCmpInst::FCMP_FALSE: { 
+    GenericValue Result;
+    Result.IntVal = APInt(1, false);
+    return Result;
+  }
+  case FCmpInst::FCMP_TRUE: {
+    GenericValue Result;
+    Result.IntVal = APInt(1, true);
+    return Result;
+  }
+  default:
+    dbgs() << "Unhandled Cmp predicate\n";
+    llvm_unreachable(0);
+  }
+}
+
+void Interpreter::visitBinaryOperator(BinaryOperator &I) {
+  ExecutionContext &SF = ECStack.back();
+  Type *Ty    = I.getOperand(0)->getType();
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue R;   // Result
+
+  switch (I.getOpcode()) {
+  case Instruction::Add:   R.IntVal = Src1.IntVal + Src2.IntVal; break;
+  case Instruction::Sub:   R.IntVal = Src1.IntVal - Src2.IntVal; break;
+  case Instruction::Mul:   R.IntVal = Src1.IntVal * Src2.IntVal; break;
+  case Instruction::FAdd:  executeFAddInst(R, Src1, Src2, Ty); break;
+  case Instruction::FSub:  executeFSubInst(R, Src1, Src2, Ty); break;
+  case Instruction::FMul:  executeFMulInst(R, Src1, Src2, Ty); break;
+  case Instruction::FDiv:  executeFDivInst(R, Src1, Src2, Ty); break;
+  case Instruction::FRem:  executeFRemInst(R, Src1, Src2, Ty); break;
+  case Instruction::UDiv:  R.IntVal = Src1.IntVal.udiv(Src2.IntVal); break;
+  case Instruction::SDiv:  R.IntVal = Src1.IntVal.sdiv(Src2.IntVal); break;
+  case Instruction::URem:  R.IntVal = Src1.IntVal.urem(Src2.IntVal); break;
+  case Instruction::SRem:  R.IntVal = Src1.IntVal.srem(Src2.IntVal); break;
+  case Instruction::And:   R.IntVal = Src1.IntVal & Src2.IntVal; break;
+  case Instruction::Or:    R.IntVal = Src1.IntVal | Src2.IntVal; break;
+  case Instruction::Xor:   R.IntVal = Src1.IntVal ^ Src2.IntVal; break;
+  default:
+    dbgs() << "Don't know how to handle this binary operator!\n-->" << I;
+    llvm_unreachable(0);
+  }
+
+  SetValue(&I, R, SF);
+}
+
+static GenericValue executeSelectInst(GenericValue Src1, GenericValue Src2,
+                                      GenericValue Src3) {
+  return Src1.IntVal == 0 ? Src3 : Src2;
+}
+
+void Interpreter::visitSelectInst(SelectInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
+  GenericValue R = executeSelectInst(Src1, Src2, Src3);
+  SetValue(&I, R, SF);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                     Terminator Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+void Interpreter::exitCalled(GenericValue GV) {
+  // runAtExitHandlers() assumes there are no stack frames, but
+  // if exit() was called, then it had a stack frame. Blow away
+  // the stack before interpreting atexit handlers.
+  ECStack.clear();
+  runAtExitHandlers();
+  exit(GV.IntVal.zextOrTrunc(32).getZExtValue());
+}
+
+/// Pop the last stack frame off of ECStack and then copy the result
+/// back into the result variable if we are not returning void. The
+/// result variable may be the ExitValue, or the Value of the calling
+/// CallInst if there was a previous stack frame. This method may
+/// invalidate any ECStack iterators you have. This method also takes
+/// care of switching to the normal destination BB, if we are returning
+/// from an invoke.
+///
+void Interpreter::popStackAndReturnValueToCaller(Type *RetTy,
+                                                 GenericValue Result) {
+  // Pop the current stack frame.
+  ECStack.pop_back();
+
+  if (ECStack.empty()) {  // Finished main.  Put result into exit code...
+    if (RetTy && !RetTy->isVoidTy()) {          // Nonvoid return type?
+      ExitValue = Result;   // Capture the exit value of the program
+    } else {
+      memset(&ExitValue.Untyped, 0, sizeof(ExitValue.Untyped));
+    }
+  } else {
+    // If we have a previous stack frame, and we have a previous call,
+    // fill in the return value...
+    ExecutionContext &CallingSF = ECStack.back();
+    if (Instruction *I = CallingSF.Caller.getInstruction()) {
+      // Save result...
+      if (!CallingSF.Caller.getType()->isVoidTy())
+        SetValue(I, Result, CallingSF);
+      if (InvokeInst *II = dyn_cast<InvokeInst> (I))
+        SwitchToNewBasicBlock (II->getNormalDest (), CallingSF);
+      CallingSF.Caller = CallSite();          // We returned from the call...
+    }
+  }
+}
+
+void Interpreter::visitReturnInst(ReturnInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  Type *RetTy = Type::getVoidTy(I.getContext());
+  GenericValue Result;
+
+  // Save away the return value... (if we are not 'ret void')
+  if (I.getNumOperands()) {
+    RetTy  = I.getReturnValue()->getType();
+    Result = getOperandValue(I.getReturnValue(), SF);
+  }
+
+  popStackAndReturnValueToCaller(RetTy, Result);
+}
+
+void Interpreter::visitUnreachableInst(UnreachableInst &I) {
+  report_fatal_error("Program executed an 'unreachable' instruction!");
+}
+
+void Interpreter::visitBranchInst(BranchInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  BasicBlock *Dest;
+
+  Dest = I.getSuccessor(0);          // Uncond branches have a fixed dest...
+  if (!I.isUnconditional()) {
+    Value *Cond = I.getCondition();
+    if (getOperandValue(Cond, SF).IntVal == 0) // If false cond...
+      Dest = I.getSuccessor(1);
+  }
+  SwitchToNewBasicBlock(Dest, SF);
+}
+
+void Interpreter::visitSwitchInst(SwitchInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  Value* Cond = I.getCondition();
+  Type *ElTy = Cond->getType();
+  GenericValue CondVal = getOperandValue(Cond, SF);
+
+  // Check to see if any of the cases match...
+  BasicBlock *Dest = 0;
+  for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) {
+    IntegersSubset& Case = i.getCaseValueEx();
+    if (Case.isSingleNumber()) {
+      // FIXME: Currently work with ConstantInt based numbers.
+      const ConstantInt *CI = Case.getSingleNumber(0).toConstantInt();
+      GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
+      if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
+        Dest = cast<BasicBlock>(i.getCaseSuccessor());
+        break;        
+      }
+    }
+    if (Case.isSingleNumbersOnly()) {
+      for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
+        // FIXME: Currently work with ConstantInt based numbers.
+        const ConstantInt *CI = Case.getSingleNumber(n).toConstantInt();
+        GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
+        if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
+          Dest = cast<BasicBlock>(i.getCaseSuccessor());
+          break;        
+        }
+      }      
+    } else
+      for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
+        IntegersSubset::Range r = Case.getItem(n);
+        // FIXME: Currently work with ConstantInt based numbers.
+        const ConstantInt *LowCI = r.getLow().toConstantInt();
+        const ConstantInt *HighCI = r.getHigh().toConstantInt();
+        GenericValue Low = getOperandValue(const_cast<ConstantInt*>(LowCI), SF);
+        GenericValue High = getOperandValue(const_cast<ConstantInt*>(HighCI), SF);
+        if (executeICMP_ULE(Low, CondVal, ElTy).IntVal != 0 &&
+            executeICMP_ULE(CondVal, High, ElTy).IntVal != 0) {
+          Dest = cast<BasicBlock>(i.getCaseSuccessor());
+          break;        
+        }
+      }
+  }
+  if (!Dest) Dest = I.getDefaultDest();   // No cases matched: use default
+  SwitchToNewBasicBlock(Dest, SF);
+}
+
+void Interpreter::visitIndirectBrInst(IndirectBrInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  void *Dest = GVTOP(getOperandValue(I.getAddress(), SF));
+  SwitchToNewBasicBlock((BasicBlock*)Dest, SF);
+}
+
+
+// SwitchToNewBasicBlock - This method is used to jump to a new basic block.
+// This function handles the actual updating of block and instruction iterators
+// as well as execution of all of the PHI nodes in the destination block.
+//
+// This method does this because all of the PHI nodes must be executed
+// atomically, reading their inputs before any of the results are updated.  Not
+// doing this can cause problems if the PHI nodes depend on other PHI nodes for
+// their inputs.  If the input PHI node is updated before it is read, incorrect
+// results can happen.  Thus we use a two phase approach.
+//
+void Interpreter::SwitchToNewBasicBlock(BasicBlock *Dest, ExecutionContext &SF){
+  BasicBlock *PrevBB = SF.CurBB;      // Remember where we came from...
+  SF.CurBB   = Dest;                  // Update CurBB to branch destination
+  SF.CurInst = SF.CurBB->begin();     // Update new instruction ptr...
+
+  if (!isa<PHINode>(SF.CurInst)) return;  // Nothing fancy to do
+
+  // Loop over all of the PHI nodes in the current block, reading their inputs.
+  std::vector<GenericValue> ResultValues;
+
+  for (; PHINode *PN = dyn_cast<PHINode>(SF.CurInst); ++SF.CurInst) {
+    // Search for the value corresponding to this previous bb...
+    int i = PN->getBasicBlockIndex(PrevBB);
+    assert(i != -1 && "PHINode doesn't contain entry for predecessor??");
+    Value *IncomingValue = PN->getIncomingValue(i);
+
+    // Save the incoming value for this PHI node...
+    ResultValues.push_back(getOperandValue(IncomingValue, SF));
+  }
+
+  // Now loop over all of the PHI nodes setting their values...
+  SF.CurInst = SF.CurBB->begin();
+  for (unsigned i = 0; isa<PHINode>(SF.CurInst); ++SF.CurInst, ++i) {
+    PHINode *PN = cast<PHINode>(SF.CurInst);
+    SetValue(PN, ResultValues[i], SF);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                     Memory Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+void Interpreter::visitAllocaInst(AllocaInst &I) {
+  ExecutionContext &SF = ECStack.back();
+
+  Type *Ty = I.getType()->getElementType();  // Type to be allocated
+
+  // Get the number of elements being allocated by the array...
+  unsigned NumElements = 
+    getOperandValue(I.getOperand(0), SF).IntVal.getZExtValue();
+
+  unsigned TypeSize = (size_t)TD.getTypeAllocSize(Ty);
+
+  // Avoid malloc-ing zero bytes, use max()...
+  unsigned MemToAlloc = std::max(1U, NumElements * TypeSize);
+
+  // Allocate enough memory to hold the type...
+  void *Memory = malloc(MemToAlloc);
+
+  DEBUG(dbgs() << "Allocated Type: " << *Ty << " (" << TypeSize << " bytes) x " 
+               << NumElements << " (Total: " << MemToAlloc << ") at "
+               << uintptr_t(Memory) << '\n');
+
+  GenericValue Result = PTOGV(Memory);
+  assert(Result.PointerVal != 0 && "Null pointer returned by malloc!");
+  SetValue(&I, Result, SF);
+
+  if (I.getOpcode() == Instruction::Alloca)
+    ECStack.back().Allocas.add(Memory);
+}
+
+// getElementOffset - The workhorse for getelementptr.
+//
+GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I,
+                                              gep_type_iterator E,
+                                              ExecutionContext &SF) {
+  assert(Ptr->getType()->isPointerTy() &&
+         "Cannot getElementOffset of a nonpointer type!");
+
+  uint64_t Total = 0;
+
+  for (; I != E; ++I) {
+    if (StructType *STy = dyn_cast<StructType>(*I)) {
+      const StructLayout *SLO = TD.getStructLayout(STy);
+
+      const ConstantInt *CPU = cast<ConstantInt>(I.getOperand());
+      unsigned Index = unsigned(CPU->getZExtValue());
+
+      Total += SLO->getElementOffset(Index);
+    } else {
+      SequentialType *ST = cast<SequentialType>(*I);
+      // Get the index number for the array... which must be long type...
+      GenericValue IdxGV = getOperandValue(I.getOperand(), SF);
+
+      int64_t Idx;
+      unsigned BitWidth = 
+        cast<IntegerType>(I.getOperand()->getType())->getBitWidth();
+      if (BitWidth == 32)
+        Idx = (int64_t)(int32_t)IdxGV.IntVal.getZExtValue();
+      else {
+        assert(BitWidth == 64 && "Invalid index type for getelementptr");
+        Idx = (int64_t)IdxGV.IntVal.getZExtValue();
+      }
+      Total += TD.getTypeAllocSize(ST->getElementType())*Idx;
+    }
+  }
+
+  GenericValue Result;
+  Result.PointerVal = ((char*)getOperandValue(Ptr, SF).PointerVal) + Total;
+  DEBUG(dbgs() << "GEP Index " << Total << " bytes.\n");
+  return Result;
+}
+
+void Interpreter::visitGetElementPtrInst(GetElementPtrInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeGEPOperation(I.getPointerOperand(),
+                                   gep_type_begin(I), gep_type_end(I), SF), SF);
+}
+
+void Interpreter::visitLoadInst(LoadInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  GenericValue SRC = getOperandValue(I.getPointerOperand(), SF);
+  GenericValue *Ptr = (GenericValue*)GVTOP(SRC);
+  GenericValue Result;
+  LoadValueFromMemory(Result, Ptr, I.getType());
+  SetValue(&I, Result, SF);
+  if (I.isVolatile() && PrintVolatile)
+    dbgs() << "Volatile load " << I;
+}
+
+void Interpreter::visitStoreInst(StoreInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  GenericValue Val = getOperandValue(I.getOperand(0), SF);
+  GenericValue SRC = getOperandValue(I.getPointerOperand(), SF);
+  StoreValueToMemory(Val, (GenericValue *)GVTOP(SRC),
+                     I.getOperand(0)->getType());
+  if (I.isVolatile() && PrintVolatile)
+    dbgs() << "Volatile store: " << I;
+}
+
+//===----------------------------------------------------------------------===//
+//                 Miscellaneous Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+void Interpreter::visitCallSite(CallSite CS) {
+  ExecutionContext &SF = ECStack.back();
+
+  // Check to see if this is an intrinsic function call...
+  Function *F = CS.getCalledFunction();
+  if (F && F->isDeclaration())
+    switch (F->getIntrinsicID()) {
+    case Intrinsic::not_intrinsic:
+      break;
+    case Intrinsic::vastart: { // va_start
+      GenericValue ArgIndex;
+      ArgIndex.UIntPairVal.first = ECStack.size() - 1;
+      ArgIndex.UIntPairVal.second = 0;
+      SetValue(CS.getInstruction(), ArgIndex, SF);
+      return;
+    }
+    case Intrinsic::vaend:    // va_end is a noop for the interpreter
+      return;
+    case Intrinsic::vacopy:   // va_copy: dest = src
+      SetValue(CS.getInstruction(), getOperandValue(*CS.arg_begin(), SF), SF);
+      return;
+    default:
+      // If it is an unknown intrinsic function, use the intrinsic lowering
+      // class to transform it into hopefully tasty LLVM code.
+      //
+      BasicBlock::iterator me(CS.getInstruction());
+      BasicBlock *Parent = CS.getInstruction()->getParent();
+      bool atBegin(Parent->begin() == me);
+      if (!atBegin)
+        --me;
+      IL->LowerIntrinsicCall(cast<CallInst>(CS.getInstruction()));
+
+      // Restore the CurInst pointer to the first instruction newly inserted, if
+      // any.
+      if (atBegin) {
+        SF.CurInst = Parent->begin();
+      } else {
+        SF.CurInst = me;
+        ++SF.CurInst;
+      }
+      return;
+    }
+
+
+  SF.Caller = CS;
+  std::vector<GenericValue> ArgVals;
+  const unsigned NumArgs = SF.Caller.arg_size();
+  ArgVals.reserve(NumArgs);
+  uint16_t pNum = 1;
+  for (CallSite::arg_iterator i = SF.Caller.arg_begin(),
+         e = SF.Caller.arg_end(); i != e; ++i, ++pNum) {
+    Value *V = *i;
+    ArgVals.push_back(getOperandValue(V, SF));
+  }
+
+  // To handle indirect calls, we must get the pointer value from the argument
+  // and treat it as a function pointer.
+  GenericValue SRC = getOperandValue(SF.Caller.getCalledValue(), SF);
+  callFunction((Function*)GVTOP(SRC), ArgVals);
+}
+
+void Interpreter::visitShl(BinaryOperator &I) {
+  ExecutionContext &SF = ECStack.back();
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Dest;
+  if (Src2.IntVal.getZExtValue() < Src1.IntVal.getBitWidth())
+    Dest.IntVal = Src1.IntVal.shl(Src2.IntVal.getZExtValue());
+  else
+    Dest.IntVal = Src1.IntVal;
+  
+  SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitLShr(BinaryOperator &I) {
+  ExecutionContext &SF = ECStack.back();
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Dest;
+  if (Src2.IntVal.getZExtValue() < Src1.IntVal.getBitWidth())
+    Dest.IntVal = Src1.IntVal.lshr(Src2.IntVal.getZExtValue());
+  else
+    Dest.IntVal = Src1.IntVal;
+  
+  SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitAShr(BinaryOperator &I) {
+  ExecutionContext &SF = ECStack.back();
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Dest;
+  if (Src2.IntVal.getZExtValue() < Src1.IntVal.getBitWidth())
+    Dest.IntVal = Src1.IntVal.ashr(Src2.IntVal.getZExtValue());
+  else
+    Dest.IntVal = Src1.IntVal;
+  
+  SetValue(&I, Dest, SF);
+}
+
+GenericValue Interpreter::executeTruncInst(Value *SrcVal, Type *DstTy,
+                                           ExecutionContext &SF) {
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  IntegerType *DITy = cast<IntegerType>(DstTy);
+  unsigned DBitWidth = DITy->getBitWidth();
+  Dest.IntVal = Src.IntVal.trunc(DBitWidth);
+  return Dest;
+}
+
+GenericValue Interpreter::executeSExtInst(Value *SrcVal, Type *DstTy,
+                                          ExecutionContext &SF) {
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  IntegerType *DITy = cast<IntegerType>(DstTy);
+  unsigned DBitWidth = DITy->getBitWidth();
+  Dest.IntVal = Src.IntVal.sext(DBitWidth);
+  return Dest;
+}
+
+GenericValue Interpreter::executeZExtInst(Value *SrcVal, Type *DstTy,
+                                          ExecutionContext &SF) {
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  IntegerType *DITy = cast<IntegerType>(DstTy);
+  unsigned DBitWidth = DITy->getBitWidth();
+  Dest.IntVal = Src.IntVal.zext(DBitWidth);
+  return Dest;
+}
+
+GenericValue Interpreter::executeFPTruncInst(Value *SrcVal, Type *DstTy,
+                                             ExecutionContext &SF) {
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  assert(SrcVal->getType()->isDoubleTy() && DstTy->isFloatTy() &&
+         "Invalid FPTrunc instruction");
+  Dest.FloatVal = (float) Src.DoubleVal;
+  return Dest;
+}
+
+GenericValue Interpreter::executeFPExtInst(Value *SrcVal, Type *DstTy,
+                                           ExecutionContext &SF) {
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  assert(SrcVal->getType()->isFloatTy() && DstTy->isDoubleTy() &&
+         "Invalid FPTrunc instruction");
+  Dest.DoubleVal = (double) Src.FloatVal;
+  return Dest;
+}
+
+GenericValue Interpreter::executeFPToUIInst(Value *SrcVal, Type *DstTy,
+                                            ExecutionContext &SF) {
+  Type *SrcTy = SrcVal->getType();
+  uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  assert(SrcTy->isFloatingPointTy() && "Invalid FPToUI instruction");
+
+  if (SrcTy->getTypeID() == Type::FloatTyID)
+    Dest.IntVal = APIntOps::RoundFloatToAPInt(Src.FloatVal, DBitWidth);
+  else
+    Dest.IntVal = APIntOps::RoundDoubleToAPInt(Src.DoubleVal, DBitWidth);
+  return Dest;
+}
+
+GenericValue Interpreter::executeFPToSIInst(Value *SrcVal, Type *DstTy,
+                                            ExecutionContext &SF) {
+  Type *SrcTy = SrcVal->getType();
+  uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  assert(SrcTy->isFloatingPointTy() && "Invalid FPToSI instruction");
+
+  if (SrcTy->getTypeID() == Type::FloatTyID)
+    Dest.IntVal = APIntOps::RoundFloatToAPInt(Src.FloatVal, DBitWidth);
+  else
+    Dest.IntVal = APIntOps::RoundDoubleToAPInt(Src.DoubleVal, DBitWidth);
+  return Dest;
+}
+
+GenericValue Interpreter::executeUIToFPInst(Value *SrcVal, Type *DstTy,
+                                            ExecutionContext &SF) {
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  assert(DstTy->isFloatingPointTy() && "Invalid UIToFP instruction");
+
+  if (DstTy->getTypeID() == Type::FloatTyID)
+    Dest.FloatVal = APIntOps::RoundAPIntToFloat(Src.IntVal);
+  else
+    Dest.DoubleVal = APIntOps::RoundAPIntToDouble(Src.IntVal);
+  return Dest;
+}
+
+GenericValue Interpreter::executeSIToFPInst(Value *SrcVal, Type *DstTy,
+                                            ExecutionContext &SF) {
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  assert(DstTy->isFloatingPointTy() && "Invalid SIToFP instruction");
+
+  if (DstTy->getTypeID() == Type::FloatTyID)
+    Dest.FloatVal = APIntOps::RoundSignedAPIntToFloat(Src.IntVal);
+  else
+    Dest.DoubleVal = APIntOps::RoundSignedAPIntToDouble(Src.IntVal);
+  return Dest;
+
+}
+
+GenericValue Interpreter::executePtrToIntInst(Value *SrcVal, Type *DstTy,
+                                              ExecutionContext &SF) {
+  uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  assert(SrcVal->getType()->isPointerTy() && "Invalid PtrToInt instruction");
+
+  Dest.IntVal = APInt(DBitWidth, (intptr_t) Src.PointerVal);
+  return Dest;
+}
+
+GenericValue Interpreter::executeIntToPtrInst(Value *SrcVal, Type *DstTy,
+                                              ExecutionContext &SF) {
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  assert(DstTy->isPointerTy() && "Invalid PtrToInt instruction");
+
+  uint32_t PtrSize = TD.getPointerSizeInBits();
+  if (PtrSize != Src.IntVal.getBitWidth())
+    Src.IntVal = Src.IntVal.zextOrTrunc(PtrSize);
+
+  Dest.PointerVal = PointerTy(intptr_t(Src.IntVal.getZExtValue()));
+  return Dest;
+}
+
+GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
+                                             ExecutionContext &SF) {
+  
+  Type *SrcTy = SrcVal->getType();
+  GenericValue Dest, Src = getOperandValue(SrcVal, SF);
+  if (DstTy->isPointerTy()) {
+    assert(SrcTy->isPointerTy() && "Invalid BitCast");
+    Dest.PointerVal = Src.PointerVal;
+  } else if (DstTy->isIntegerTy()) {
+    if (SrcTy->isFloatTy()) {
+      Dest.IntVal = APInt::floatToBits(Src.FloatVal);
+    } else if (SrcTy->isDoubleTy()) {
+      Dest.IntVal = APInt::doubleToBits(Src.DoubleVal);
+    } else if (SrcTy->isIntegerTy()) {
+      Dest.IntVal = Src.IntVal;
+    } else 
+      llvm_unreachable("Invalid BitCast");
+  } else if (DstTy->isFloatTy()) {
+    if (SrcTy->isIntegerTy())
+      Dest.FloatVal = Src.IntVal.bitsToFloat();
+    else
+      Dest.FloatVal = Src.FloatVal;
+  } else if (DstTy->isDoubleTy()) {
+    if (SrcTy->isIntegerTy())
+      Dest.DoubleVal = Src.IntVal.bitsToDouble();
+    else
+      Dest.DoubleVal = Src.DoubleVal;
+  } else
+    llvm_unreachable("Invalid Bitcast");
+
+  return Dest;
+}
+
+void Interpreter::visitTruncInst(TruncInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeTruncInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitSExtInst(SExtInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeSExtInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitZExtInst(ZExtInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeZExtInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitFPTruncInst(FPTruncInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeFPTruncInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitFPExtInst(FPExtInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeFPExtInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitUIToFPInst(UIToFPInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeUIToFPInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitSIToFPInst(SIToFPInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeSIToFPInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitFPToUIInst(FPToUIInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeFPToUIInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitFPToSIInst(FPToSIInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeFPToSIInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitPtrToIntInst(PtrToIntInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executePtrToIntInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitIntToPtrInst(IntToPtrInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeIntToPtrInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+void Interpreter::visitBitCastInst(BitCastInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  SetValue(&I, executeBitCastInst(I.getOperand(0), I.getType(), SF), SF);
+}
+
+#define IMPLEMENT_VAARG(TY) \
+   case Type::TY##TyID: Dest.TY##Val = Src.TY##Val; break
+
+void Interpreter::visitVAArgInst(VAArgInst &I) {
+  ExecutionContext &SF = ECStack.back();
+
+  // Get the incoming valist parameter.  LLI treats the valist as a
+  // (ec-stack-depth var-arg-index) pair.
+  GenericValue VAList = getOperandValue(I.getOperand(0), SF);
+  GenericValue Dest;
+  GenericValue Src = ECStack[VAList.UIntPairVal.first]
+                      .VarArgs[VAList.UIntPairVal.second];
+  Type *Ty = I.getType();
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID:
+    Dest.IntVal = Src.IntVal;
+    break;
+  IMPLEMENT_VAARG(Pointer);
+  IMPLEMENT_VAARG(Float);
+  IMPLEMENT_VAARG(Double);
+  default:
+    dbgs() << "Unhandled dest type for vaarg instruction: " << *Ty << "\n";
+    llvm_unreachable(0);
+  }
+
+  // Set the Value of this Instruction.
+  SetValue(&I, Dest, SF);
+
+  // Move the pointer to the next vararg.
+  ++VAList.UIntPairVal.second;
+}
+
+void Interpreter::visitExtractElementInst(ExtractElementInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Dest;
+
+  Type *Ty = I.getType();
+  const unsigned indx = unsigned(Src2.IntVal.getZExtValue());
+
+  if(Src1.AggregateVal.size() > indx) {
+    switch (Ty->getTypeID()) {
+    default:
+      dbgs() << "Unhandled destination type for extractelement instruction: "
+      << *Ty << "\n";
+      llvm_unreachable(0);
+      break;
+    case Type::IntegerTyID:
+      Dest.IntVal = Src1.AggregateVal[indx].IntVal;
+      break;
+    case Type::FloatTyID:
+      Dest.FloatVal = Src1.AggregateVal[indx].FloatVal;
+      break;
+    case Type::DoubleTyID:
+      Dest.DoubleVal = Src1.AggregateVal[indx].DoubleVal;
+      break;
+    }
+  } else {
+    dbgs() << "Invalid index in extractelement instruction\n";
+  }
+
+  SetValue(&I, Dest, SF);
+}
+
+GenericValue Interpreter::getConstantExprValue (ConstantExpr *CE,
+                                                ExecutionContext &SF) {
+  switch (CE->getOpcode()) {
+  case Instruction::Trunc:   
+      return executeTruncInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::ZExt:
+      return executeZExtInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::SExt:
+      return executeSExtInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::FPTrunc:
+      return executeFPTruncInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::FPExt:
+      return executeFPExtInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::UIToFP:
+      return executeUIToFPInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::SIToFP:
+      return executeSIToFPInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::FPToUI:
+      return executeFPToUIInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::FPToSI:
+      return executeFPToSIInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::PtrToInt:
+      return executePtrToIntInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::IntToPtr:
+      return executeIntToPtrInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::BitCast:
+      return executeBitCastInst(CE->getOperand(0), CE->getType(), SF);
+  case Instruction::GetElementPtr:
+    return executeGEPOperation(CE->getOperand(0), gep_type_begin(CE),
+                               gep_type_end(CE), SF);
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+    return executeCmpInst(CE->getPredicate(),
+                          getOperandValue(CE->getOperand(0), SF),
+                          getOperandValue(CE->getOperand(1), SF),
+                          CE->getOperand(0)->getType());
+  case Instruction::Select:
+    return executeSelectInst(getOperandValue(CE->getOperand(0), SF),
+                             getOperandValue(CE->getOperand(1), SF),
+                             getOperandValue(CE->getOperand(2), SF));
+  default :
+    break;
+  }
+
+  // The cases below here require a GenericValue parameter for the result
+  // so we initialize one, compute it and then return it.
+  GenericValue Op0 = getOperandValue(CE->getOperand(0), SF);
+  GenericValue Op1 = getOperandValue(CE->getOperand(1), SF);
+  GenericValue Dest;
+  Type * Ty = CE->getOperand(0)->getType();
+  switch (CE->getOpcode()) {
+  case Instruction::Add:  Dest.IntVal = Op0.IntVal + Op1.IntVal; break;
+  case Instruction::Sub:  Dest.IntVal = Op0.IntVal - Op1.IntVal; break;
+  case Instruction::Mul:  Dest.IntVal = Op0.IntVal * Op1.IntVal; break;
+  case Instruction::FAdd: executeFAddInst(Dest, Op0, Op1, Ty); break;
+  case Instruction::FSub: executeFSubInst(Dest, Op0, Op1, Ty); break;
+  case Instruction::FMul: executeFMulInst(Dest, Op0, Op1, Ty); break;
+  case Instruction::FDiv: executeFDivInst(Dest, Op0, Op1, Ty); break;
+  case Instruction::FRem: executeFRemInst(Dest, Op0, Op1, Ty); break;
+  case Instruction::SDiv: Dest.IntVal = Op0.IntVal.sdiv(Op1.IntVal); break;
+  case Instruction::UDiv: Dest.IntVal = Op0.IntVal.udiv(Op1.IntVal); break;
+  case Instruction::URem: Dest.IntVal = Op0.IntVal.urem(Op1.IntVal); break;
+  case Instruction::SRem: Dest.IntVal = Op0.IntVal.srem(Op1.IntVal); break;
+  case Instruction::And:  Dest.IntVal = Op0.IntVal & Op1.IntVal; break;
+  case Instruction::Or:   Dest.IntVal = Op0.IntVal | Op1.IntVal; break;
+  case Instruction::Xor:  Dest.IntVal = Op0.IntVal ^ Op1.IntVal; break;
+  case Instruction::Shl:  
+    Dest.IntVal = Op0.IntVal.shl(Op1.IntVal.getZExtValue());
+    break;
+  case Instruction::LShr: 
+    Dest.IntVal = Op0.IntVal.lshr(Op1.IntVal.getZExtValue());
+    break;
+  case Instruction::AShr: 
+    Dest.IntVal = Op0.IntVal.ashr(Op1.IntVal.getZExtValue());
+    break;
+  default:
+    dbgs() << "Unhandled ConstantExpr: " << *CE << "\n";
+    llvm_unreachable("Unhandled ConstantExpr");
+  }
+  return Dest;
+}
+
+GenericValue Interpreter::getOperandValue(Value *V, ExecutionContext &SF) {
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    return getConstantExprValue(CE, SF);
+  } else if (Constant *CPV = dyn_cast<Constant>(V)) {
+    return getConstantValue(CPV);
+  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    return PTOGV(getPointerToGlobal(GV));
+  } else {
+    return SF.Values[V];
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                        Dispatch and Execution Code
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// callFunction - Execute the specified function...
+//
+void Interpreter::callFunction(Function *F,
+                               const std::vector<GenericValue> &ArgVals) {
+  assert((ECStack.empty() || ECStack.back().Caller.getInstruction() == 0 ||
+          ECStack.back().Caller.arg_size() == ArgVals.size()) &&
+         "Incorrect number of arguments passed into function call!");
+  // Make a new stack frame... and fill it in.
+  ECStack.push_back(ExecutionContext());
+  ExecutionContext &StackFrame = ECStack.back();
+  StackFrame.CurFunction = F;
+
+  // Special handling for external functions.
+  if (F->isDeclaration()) {
+    GenericValue Result = callExternalFunction (F, ArgVals);
+    // Simulate a 'ret' instruction of the appropriate type.
+    popStackAndReturnValueToCaller (F->getReturnType (), Result);
+    return;
+  }
+
+  // Get pointers to first LLVM BB & Instruction in function.
+  StackFrame.CurBB     = F->begin();
+  StackFrame.CurInst   = StackFrame.CurBB->begin();
+
+  // Run through the function arguments and initialize their values...
+  assert((ArgVals.size() == F->arg_size() ||
+         (ArgVals.size() > F->arg_size() && F->getFunctionType()->isVarArg()))&&
+         "Invalid number of values passed to function invocation!");
+
+  // Handle non-varargs arguments...
+  unsigned i = 0;
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); 
+       AI != E; ++AI, ++i)
+    SetValue(AI, ArgVals[i], StackFrame);
+
+  // Handle varargs arguments...
+  StackFrame.VarArgs.assign(ArgVals.begin()+i, ArgVals.end());
+}
+
+
+void Interpreter::run() {
+  while (!ECStack.empty()) {
+    // Interpret a single instruction & increment the "PC".
+    ExecutionContext &SF = ECStack.back();  // Current stack frame
+    Instruction &I = *SF.CurInst++;         // Increment before execute
+
+    // Track the number of dynamic instructions executed.
+    ++NumDynamicInsts;
+
+    DEBUG(dbgs() << "About to interpret: " << I);
+    visit(I);   // Dispatch to one of the visit* methods...
+#if 0
+    // This is not safe, as visiting the instruction could lower it and free I.
+DEBUG(
+    if (!isa<CallInst>(I) && !isa<InvokeInst>(I) && 
+        I.getType() != Type::VoidTy) {
+      dbgs() << "  --> ";
+      const GenericValue &Val = SF.Values[&I];
+      switch (I.getType()->getTypeID()) {
+      default: llvm_unreachable("Invalid GenericValue Type");
+      case Type::VoidTyID:    dbgs() << "void"; break;
+      case Type::FloatTyID:   dbgs() << "float " << Val.FloatVal; break;
+      case Type::DoubleTyID:  dbgs() << "double " << Val.DoubleVal; break;
+      case Type::PointerTyID: dbgs() << "void* " << intptr_t(Val.PointerVal);
+        break;
+      case Type::IntegerTyID: 
+        dbgs() << "i" << Val.IntVal.getBitWidth() << " "
+               << Val.IntVal.toStringUnsigned(10)
+               << " (0x" << Val.IntVal.toStringUnsigned(16) << ")\n";
+        break;
+      }
+    });
+#endif
+  }
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
new file mode 100644
index 000000000000..bef4bbf66023
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -0,0 +1,484 @@
+//===-- ExternalFunctions.cpp - Implement External Functions --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file contains both code to deal with invoking "external" functions, but
+//  also contains code that implements "exported" external functions.
+//
+//  There are currently two mechanisms for handling external functions in the
+//  Interpreter.  The first is to implement lle_* wrapper functions that are
+//  specific to well-known library functions which manually translate the
+//  arguments from GenericValues and make the call.  If such a wrapper does
+//  not exist, and libffi is available, then the Interpreter will attempt to
+//  invoke the function using libffi, after finding its address.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Interpreter.h"
+#include "llvm/Config/config.h"     // Detect libffi
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include <cmath>
+#include <csignal>
+#include <cstdio>
+#include <cstring>
+#include <map>
+
+#ifdef HAVE_FFI_CALL
+#ifdef HAVE_FFI_H
+#include <ffi.h>
+#define USE_LIBFFI
+#elif HAVE_FFI_FFI_H
+#include <ffi/ffi.h>
+#define USE_LIBFFI
+#endif
+#endif
+
+using namespace llvm;
+
+static ManagedStatic<sys::Mutex> FunctionsLock;
+
+typedef GenericValue (*ExFunc)(FunctionType *,
+                               const std::vector<GenericValue> &);
+static ManagedStatic<std::map<const Function *, ExFunc> > ExportedFunctions;
+static std::map<std::string, ExFunc> FuncNames;
+
+#ifdef USE_LIBFFI
+typedef void (*RawFunc)();
+static ManagedStatic<std::map<const Function *, RawFunc> > RawFunctions;
+#endif
+
+static Interpreter *TheInterpreter;
+
+static char getTypeID(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::VoidTyID:    return 'V';
+  case Type::IntegerTyID:
+    switch (cast<IntegerType>(Ty)->getBitWidth()) {
+      case 1:  return 'o';
+      case 8:  return 'B';
+      case 16: return 'S';
+      case 32: return 'I';
+      case 64: return 'L';
+      default: return 'N';
+    }
+  case Type::FloatTyID:   return 'F';
+  case Type::DoubleTyID:  return 'D';
+  case Type::PointerTyID: return 'P';
+  case Type::FunctionTyID:return 'M';
+  case Type::StructTyID:  return 'T';
+  case Type::ArrayTyID:   return 'A';
+  default: return 'U';
+  }
+}
+
+// Try to find address of external function given a Function object.
+// Please note, that interpreter doesn't know how to assemble a
+// real call in general case (this is JIT job), that's why it assumes,
+// that all external functions has the same (and pretty "general") signature.
+// The typical example of such functions are "lle_X_" ones.
+static ExFunc lookupFunction(const Function *F) {
+  // Function not found, look it up... start by figuring out what the
+  // composite function name should be.
+  std::string ExtName = "lle_";
+  FunctionType *FT = F->getFunctionType();
+  for (unsigned i = 0, e = FT->getNumContainedTypes(); i != e; ++i)
+    ExtName += getTypeID(FT->getContainedType(i));
+  ExtName += "_" + F->getName().str();
+
+  sys::ScopedLock Writer(*FunctionsLock);
+  ExFunc FnPtr = FuncNames[ExtName];
+  if (FnPtr == 0)
+    FnPtr = FuncNames["lle_X_" + F->getName().str()];
+  if (FnPtr == 0)  // Try calling a generic function... if it exists...
+    FnPtr = (ExFunc)(intptr_t)
+      sys::DynamicLibrary::SearchForAddressOfSymbol("lle_X_" +
+                                                    F->getName().str());
+  if (FnPtr != 0)
+    ExportedFunctions->insert(std::make_pair(F, FnPtr));  // Cache for later
+  return FnPtr;
+}
+
+#ifdef USE_LIBFFI
+static ffi_type *ffiTypeFor(Type *Ty) {
+  switch (Ty->getTypeID()) {
+    case Type::VoidTyID: return &ffi_type_void;
+    case Type::IntegerTyID:
+      switch (cast<IntegerType>(Ty)->getBitWidth()) {
+        case 8:  return &ffi_type_sint8;
+        case 16: return &ffi_type_sint16;
+        case 32: return &ffi_type_sint32;
+        case 64: return &ffi_type_sint64;
+      }
+    case Type::FloatTyID:   return &ffi_type_float;
+    case Type::DoubleTyID:  return &ffi_type_double;
+    case Type::PointerTyID: return &ffi_type_pointer;
+    default: break;
+  }
+  // TODO: Support other types such as StructTyID, ArrayTyID, OpaqueTyID, etc.
+  report_fatal_error("Type could not be mapped for use with libffi.");
+  return NULL;
+}
+
+static void *ffiValueFor(Type *Ty, const GenericValue &AV,
+                         void *ArgDataPtr) {
+  switch (Ty->getTypeID()) {
+    case Type::IntegerTyID:
+      switch (cast<IntegerType>(Ty)->getBitWidth()) {
+        case 8: {
+          int8_t *I8Ptr = (int8_t *) ArgDataPtr;
+          *I8Ptr = (int8_t) AV.IntVal.getZExtValue();
+          return ArgDataPtr;
+        }
+        case 16: {
+          int16_t *I16Ptr = (int16_t *) ArgDataPtr;
+          *I16Ptr = (int16_t) AV.IntVal.getZExtValue();
+          return ArgDataPtr;
+        }
+        case 32: {
+          int32_t *I32Ptr = (int32_t *) ArgDataPtr;
+          *I32Ptr = (int32_t) AV.IntVal.getZExtValue();
+          return ArgDataPtr;
+        }
+        case 64: {
+          int64_t *I64Ptr = (int64_t *) ArgDataPtr;
+          *I64Ptr = (int64_t) AV.IntVal.getZExtValue();
+          return ArgDataPtr;
+        }
+      }
+    case Type::FloatTyID: {
+      float *FloatPtr = (float *) ArgDataPtr;
+      *FloatPtr = AV.FloatVal;
+      return ArgDataPtr;
+    }
+    case Type::DoubleTyID: {
+      double *DoublePtr = (double *) ArgDataPtr;
+      *DoublePtr = AV.DoubleVal;
+      return ArgDataPtr;
+    }
+    case Type::PointerTyID: {
+      void **PtrPtr = (void **) ArgDataPtr;
+      *PtrPtr = GVTOP(AV);
+      return ArgDataPtr;
+    }
+    default: break;
+  }
+  // TODO: Support other types such as StructTyID, ArrayTyID, OpaqueTyID, etc.
+  report_fatal_error("Type value could not be mapped for use with libffi.");
+  return NULL;
+}
+
+static bool ffiInvoke(RawFunc Fn, Function *F,
+                      const std::vector<GenericValue> &ArgVals,
+                      const DataLayout *TD, GenericValue &Result) {
+  ffi_cif cif;
+  FunctionType *FTy = F->getFunctionType();
+  const unsigned NumArgs = F->arg_size();
+
+  // TODO: We don't have type information about the remaining arguments, because
+  // this information is never passed into ExecutionEngine::runFunction().
+  if (ArgVals.size() > NumArgs && F->isVarArg()) {
+    report_fatal_error("Calling external var arg function '" + F->getName()
+                      + "' is not supported by the Interpreter.");
+  }
+
+  unsigned ArgBytes = 0;
+
+  std::vector<ffi_type*> args(NumArgs);
+  for (Function::const_arg_iterator A = F->arg_begin(), E = F->arg_end();
+       A != E; ++A) {
+    const unsigned ArgNo = A->getArgNo();
+    Type *ArgTy = FTy->getParamType(ArgNo);
+    args[ArgNo] = ffiTypeFor(ArgTy);
+    ArgBytes += TD->getTypeStoreSize(ArgTy);
+  }
+
+  SmallVector<uint8_t, 128> ArgData;
+  ArgData.resize(ArgBytes);
+  uint8_t *ArgDataPtr = ArgData.data();
+  SmallVector<void*, 16> values(NumArgs);
+  for (Function::const_arg_iterator A = F->arg_begin(), E = F->arg_end();
+       A != E; ++A) {
+    const unsigned ArgNo = A->getArgNo();
+    Type *ArgTy = FTy->getParamType(ArgNo);
+    values[ArgNo] = ffiValueFor(ArgTy, ArgVals[ArgNo], ArgDataPtr);
+    ArgDataPtr += TD->getTypeStoreSize(ArgTy);
+  }
+
+  Type *RetTy = FTy->getReturnType();
+  ffi_type *rtype = ffiTypeFor(RetTy);
+
+  if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, NumArgs, rtype, &args[0]) == FFI_OK) {
+    SmallVector<uint8_t, 128> ret;
+    if (RetTy->getTypeID() != Type::VoidTyID)
+      ret.resize(TD->getTypeStoreSize(RetTy));
+    ffi_call(&cif, Fn, ret.data(), values.data());
+    switch (RetTy->getTypeID()) {
+      case Type::IntegerTyID:
+        switch (cast<IntegerType>(RetTy)->getBitWidth()) {
+          case 8:  Result.IntVal = APInt(8 , *(int8_t *) ret.data()); break;
+          case 16: Result.IntVal = APInt(16, *(int16_t*) ret.data()); break;
+          case 32: Result.IntVal = APInt(32, *(int32_t*) ret.data()); break;
+          case 64: Result.IntVal = APInt(64, *(int64_t*) ret.data()); break;
+        }
+        break;
+      case Type::FloatTyID:   Result.FloatVal   = *(float *) ret.data(); break;
+      case Type::DoubleTyID:  Result.DoubleVal  = *(double*) ret.data(); break;
+      case Type::PointerTyID: Result.PointerVal = *(void **) ret.data(); break;
+      default: break;
+    }
+    return true;
+  }
+
+  return false;
+}
+#endif // USE_LIBFFI
+
+GenericValue Interpreter::callExternalFunction(Function *F,
+                                     const std::vector<GenericValue> &ArgVals) {
+  TheInterpreter = this;
+
+  FunctionsLock->acquire();
+
+  // Do a lookup to see if the function is in our cache... this should just be a
+  // deferred annotation!
+  std::map<const Function *, ExFunc>::iterator FI = ExportedFunctions->find(F);
+  if (ExFunc Fn = (FI == ExportedFunctions->end()) ? lookupFunction(F)
+                                                   : FI->second) {
+    FunctionsLock->release();
+    return Fn(F->getFunctionType(), ArgVals);
+  }
+
+#ifdef USE_LIBFFI
+  std::map<const Function *, RawFunc>::iterator RF = RawFunctions->find(F);
+  RawFunc RawFn;
+  if (RF == RawFunctions->end()) {
+    RawFn = (RawFunc)(intptr_t)
+      sys::DynamicLibrary::SearchForAddressOfSymbol(F->getName());
+    if (!RawFn)
+      RawFn = (RawFunc)(intptr_t)getPointerToGlobalIfAvailable(F);
+    if (RawFn != 0)
+      RawFunctions->insert(std::make_pair(F, RawFn));  // Cache for later
+  } else {
+    RawFn = RF->second;
+  }
+
+  FunctionsLock->release();
+
+  GenericValue Result;
+  if (RawFn != 0 && ffiInvoke(RawFn, F, ArgVals, getDataLayout(), Result))
+    return Result;
+#endif // USE_LIBFFI
+
+  if (F->getName() == "__main")
+    errs() << "Tried to execute an unknown external function: "
+      << *F->getType() << " __main\n";
+  else
+    report_fatal_error("Tried to execute an unknown external function: " +
+                       F->getName());
+#ifndef USE_LIBFFI
+  errs() << "Recompiling LLVM with --enable-libffi might help.\n";
+#endif
+  return GenericValue();
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Functions "exported" to the running application...
+//
+
+// void atexit(Function*)
+static
+GenericValue lle_X_atexit(FunctionType *FT,
+                          const std::vector<GenericValue> &Args) {
+  assert(Args.size() == 1);
+  TheInterpreter->addAtExitHandler((Function*)GVTOP(Args[0]));
+  GenericValue GV;
+  GV.IntVal = 0;
+  return GV;
+}
+
+// void exit(int)
+static
+GenericValue lle_X_exit(FunctionType *FT,
+                        const std::vector<GenericValue> &Args) {
+  TheInterpreter->exitCalled(Args[0]);
+  return GenericValue();
+}
+
+// void abort(void)
+static
+GenericValue lle_X_abort(FunctionType *FT,
+                         const std::vector<GenericValue> &Args) {
+  //FIXME: should we report or raise here?
+  //report_fatal_error("Interpreted program raised SIGABRT");
+  raise (SIGABRT);
+  return GenericValue();
+}
+
+// int sprintf(char *, const char *, ...) - a very rough implementation to make
+// output useful.
+static
+GenericValue lle_X_sprintf(FunctionType *FT,
+                           const std::vector<GenericValue> &Args) {
+  char *OutputBuffer = (char *)GVTOP(Args[0]);
+  const char *FmtStr = (const char *)GVTOP(Args[1]);
+  unsigned ArgNo = 2;
+
+  // printf should return # chars printed.  This is completely incorrect, but
+  // close enough for now.
+  GenericValue GV;
+  GV.IntVal = APInt(32, strlen(FmtStr));
+  while (1) {
+    switch (*FmtStr) {
+    case 0: return GV;             // Null terminator...
+    default:                       // Normal nonspecial character
+      sprintf(OutputBuffer++, "%c", *FmtStr++);
+      break;
+    case '\\': {                   // Handle escape codes
+      sprintf(OutputBuffer, "%c%c", *FmtStr, *(FmtStr+1));
+      FmtStr += 2; OutputBuffer += 2;
+      break;
+    }
+    case '%': {                    // Handle format specifiers
+      char FmtBuf[100] = "", Buffer[1000] = "";
+      char *FB = FmtBuf;
+      *FB++ = *FmtStr++;
+      char Last = *FB++ = *FmtStr++;
+      unsigned HowLong = 0;
+      while (Last != 'c' && Last != 'd' && Last != 'i' && Last != 'u' &&
+             Last != 'o' && Last != 'x' && Last != 'X' && Last != 'e' &&
+             Last != 'E' && Last != 'g' && Last != 'G' && Last != 'f' &&
+             Last != 'p' && Last != 's' && Last != '%') {
+        if (Last == 'l' || Last == 'L') HowLong++;  // Keep track of l's
+        Last = *FB++ = *FmtStr++;
+      }
+      *FB = 0;
+
+      switch (Last) {
+      case '%':
+        memcpy(Buffer, "%", 2); break;
+      case 'c':
+        sprintf(Buffer, FmtBuf, uint32_t(Args[ArgNo++].IntVal.getZExtValue()));
+        break;
+      case 'd': case 'i':
+      case 'u': case 'o':
+      case 'x': case 'X':
+        if (HowLong >= 1) {
+          if (HowLong == 1 &&
+              TheInterpreter->getDataLayout()->getPointerSizeInBits() == 64 &&
+              sizeof(long) < sizeof(int64_t)) {
+            // Make sure we use %lld with a 64 bit argument because we might be
+            // compiling LLI on a 32 bit compiler.
+            unsigned Size = strlen(FmtBuf);
+            FmtBuf[Size] = FmtBuf[Size-1];
+            FmtBuf[Size+1] = 0;
+            FmtBuf[Size-1] = 'l';
+          }
+          sprintf(Buffer, FmtBuf, Args[ArgNo++].IntVal.getZExtValue());
+        } else
+          sprintf(Buffer, FmtBuf,uint32_t(Args[ArgNo++].IntVal.getZExtValue()));
+        break;
+      case 'e': case 'E': case 'g': case 'G': case 'f':
+        sprintf(Buffer, FmtBuf, Args[ArgNo++].DoubleVal); break;
+      case 'p':
+        sprintf(Buffer, FmtBuf, (void*)GVTOP(Args[ArgNo++])); break;
+      case 's':
+        sprintf(Buffer, FmtBuf, (char*)GVTOP(Args[ArgNo++])); break;
+      default:
+        errs() << "<unknown printf code '" << *FmtStr << "'!>";
+        ArgNo++; break;
+      }
+      size_t Len = strlen(Buffer);
+      memcpy(OutputBuffer, Buffer, Len + 1);
+      OutputBuffer += Len;
+      }
+      break;
+    }
+  }
+}
+
+// int printf(const char *, ...) - a very rough implementation to make output
+// useful.
+static
+GenericValue lle_X_printf(FunctionType *FT,
+                          const std::vector<GenericValue> &Args) {
+  char Buffer[10000];
+  std::vector<GenericValue> NewArgs;
+  NewArgs.push_back(PTOGV((void*)&Buffer[0]));
+  NewArgs.insert(NewArgs.end(), Args.begin(), Args.end());
+  GenericValue GV = lle_X_sprintf(FT, NewArgs);
+  outs() << Buffer;
+  return GV;
+}
+
+// int sscanf(const char *format, ...);
+static
+GenericValue lle_X_sscanf(FunctionType *FT,
+                          const std::vector<GenericValue> &args) {
+  assert(args.size() < 10 && "Only handle up to 10 args to sscanf right now!");
+
+  char *Args[10];
+  for (unsigned i = 0; i < args.size(); ++i)
+    Args[i] = (char*)GVTOP(args[i]);
+
+  GenericValue GV;
+  GV.IntVal = APInt(32, sscanf(Args[0], Args[1], Args[2], Args[3], Args[4],
+                        Args[5], Args[6], Args[7], Args[8], Args[9]));
+  return GV;
+}
+
+// int scanf(const char *format, ...);
+static
+GenericValue lle_X_scanf(FunctionType *FT,
+                         const std::vector<GenericValue> &args) {
+  assert(args.size() < 10 && "Only handle up to 10 args to scanf right now!");
+
+  char *Args[10];
+  for (unsigned i = 0; i < args.size(); ++i)
+    Args[i] = (char*)GVTOP(args[i]);
+
+  GenericValue GV;
+  GV.IntVal = APInt(32, scanf( Args[0], Args[1], Args[2], Args[3], Args[4],
+                        Args[5], Args[6], Args[7], Args[8], Args[9]));
+  return GV;
+}
+
+// int fprintf(FILE *, const char *, ...) - a very rough implementation to make
+// output useful.
+static
+GenericValue lle_X_fprintf(FunctionType *FT,
+                           const std::vector<GenericValue> &Args) {
+  assert(Args.size() >= 2);
+  char Buffer[10000];
+  std::vector<GenericValue> NewArgs;
+  NewArgs.push_back(PTOGV(Buffer));
+  NewArgs.insert(NewArgs.end(), Args.begin()+1, Args.end());
+  GenericValue GV = lle_X_sprintf(FT, NewArgs);
+
+  fputs(Buffer, (FILE *) GVTOP(Args[0]));
+  return GV;
+}
+
+void Interpreter::initializeExternalFunctions() {
+  sys::ScopedLock Writer(*FunctionsLock);
+  FuncNames["lle_X_atexit"]       = lle_X_atexit;
+  FuncNames["lle_X_exit"]         = lle_X_exit;
+  FuncNames["lle_X_abort"]        = lle_X_abort;
+
+  FuncNames["lle_X_printf"]       = lle_X_printf;
+  FuncNames["lle_X_sprintf"]      = lle_X_sprintf;
+  FuncNames["lle_X_sscanf"]       = lle_X_sscanf;
+  FuncNames["lle_X_scanf"]        = lle_X_scanf;
+  FuncNames["lle_X_fprintf"]      = lle_X_fprintf;
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp
new file mode 100644
index 000000000000..9ee9d9456d1d
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.cpp
@@ -0,0 +1,98 @@
+//===- Interpreter.cpp - Top-Level LLVM Interpreter Implementation --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the top-level functionality for the LLVM interpreter.
+// This interpreter is designed to be a very simple, portable, inefficient
+// interpreter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Interpreter.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include <cstring>
+using namespace llvm;
+
+namespace {
+
+static struct RegisterInterp {
+  RegisterInterp() { Interpreter::Register(); }
+} InterpRegistrator;
+
+}
+
+extern "C" void LLVMLinkInInterpreter() { }
+
+/// create - Create a new interpreter object.  This can never fail.
+///
+ExecutionEngine *Interpreter::create(Module *M, std::string* ErrStr) {
+  // Tell this Module to materialize everything and release the GVMaterializer.
+  if (M->MaterializeAllPermanently(ErrStr))
+    // We got an error, just return 0
+    return 0;
+
+  return new Interpreter(M);
+}
+
+//===----------------------------------------------------------------------===//
+// Interpreter ctor - Initialize stuff
+//
+Interpreter::Interpreter(Module *M)
+  : ExecutionEngine(M), TD(M) {
+      
+  memset(&ExitValue.Untyped, 0, sizeof(ExitValue.Untyped));
+  setDataLayout(&TD);
+  // Initialize the "backend"
+  initializeExecutionEngine();
+  initializeExternalFunctions();
+  emitGlobals();
+
+  IL = new IntrinsicLowering(TD);
+}
+
+Interpreter::~Interpreter() {
+  delete IL;
+}
+
+void Interpreter::runAtExitHandlers () {
+  while (!AtExitHandlers.empty()) {
+    callFunction(AtExitHandlers.back(), std::vector<GenericValue>());
+    AtExitHandlers.pop_back();
+    run();
+  }
+}
+
+/// run - Start execution with the specified function and arguments.
+///
+GenericValue
+Interpreter::runFunction(Function *F,
+                         const std::vector<GenericValue> &ArgValues) {
+  assert (F && "Function *F was null at entry to run()");
+
+  // Try extra hard not to pass extra args to a function that isn't
+  // expecting them.  C programmers frequently bend the rules and
+  // declare main() with fewer parameters than it actually gets
+  // passed, and the interpreter barfs if you pass a function more
+  // parameters than it is declared to take. This does not attempt to
+  // take into account gratuitous differences in declared types,
+  // though.
+  std::vector<GenericValue> ActualArgs;
+  const unsigned ArgCount = F->getFunctionType()->getNumParams();
+  for (unsigned i = 0; i < ArgCount; ++i)
+    ActualArgs.push_back(ArgValues[i]);
+
+  // Set up the function call.
+  callFunction(F, ActualArgs);
+
+  // Start executing the function.
+  run();
+
+  return ExitValue;
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h
new file mode 100644
index 000000000000..2952d7eabe2b
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h
@@ -0,0 +1,248 @@
+//===-- Interpreter.h ------------------------------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines the interpreter structure
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLI_INTERPRETER_H
+#define LLI_INTERPRETER_H
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+namespace llvm {
+
+class IntrinsicLowering;
+struct FunctionInfo;
+template<typename T> class generic_gep_type_iterator;
+class ConstantExpr;
+typedef generic_gep_type_iterator<User::const_op_iterator> gep_type_iterator;
+
+
+// AllocaHolder - Object to track all of the blocks of memory allocated by
+// alloca.  When the function returns, this object is popped off the execution
+// stack, which causes the dtor to be run, which frees all the alloca'd memory.
+//
+class AllocaHolder {
+  friend class AllocaHolderHandle;
+  std::vector<void*> Allocations;
+  unsigned RefCnt;
+public:
+  AllocaHolder() : RefCnt(0) {}
+  void add(void *mem) { Allocations.push_back(mem); }
+  ~AllocaHolder() {
+    for (unsigned i = 0; i < Allocations.size(); ++i)
+      free(Allocations[i]);
+  }
+};
+
+// AllocaHolderHandle gives AllocaHolder value semantics so we can stick it into
+// a vector...
+//
+class AllocaHolderHandle {
+  AllocaHolder *H;
+public:
+  AllocaHolderHandle() : H(new AllocaHolder()) { H->RefCnt++; }
+  AllocaHolderHandle(const AllocaHolderHandle &AH) : H(AH.H) { H->RefCnt++; }
+  ~AllocaHolderHandle() { if (--H->RefCnt == 0) delete H; }
+
+  void add(void *mem) { H->add(mem); }
+};
+
+typedef std::vector<GenericValue> ValuePlaneTy;
+
+// ExecutionContext struct - This struct represents one stack frame currently
+// executing.
+//
+struct ExecutionContext {
+  Function             *CurFunction;// The currently executing function
+  BasicBlock           *CurBB;      // The currently executing BB
+  BasicBlock::iterator  CurInst;    // The next instruction to execute
+  std::map<Value *, GenericValue> Values; // LLVM values used in this invocation
+  std::vector<GenericValue>  VarArgs; // Values passed through an ellipsis
+  CallSite             Caller;     // Holds the call that called subframes.
+                                   // NULL if main func or debugger invoked fn
+  AllocaHolderHandle    Allocas;    // Track memory allocated by alloca
+};
+
+// Interpreter - This class represents the entirety of the interpreter.
+//
+class Interpreter : public ExecutionEngine, public InstVisitor<Interpreter> {
+  GenericValue ExitValue;          // The return value of the called function
+  DataLayout TD;
+  IntrinsicLowering *IL;
+
+  // The runtime stack of executing code.  The top of the stack is the current
+  // function record.
+  std::vector<ExecutionContext> ECStack;
+
+  // AtExitHandlers - List of functions to call when the program exits,
+  // registered with the atexit() library function.
+  std::vector<Function*> AtExitHandlers;
+
+public:
+  explicit Interpreter(Module *M);
+  ~Interpreter();
+
+  /// runAtExitHandlers - Run any functions registered by the program's calls to
+  /// atexit(3), which we intercept and store in AtExitHandlers.
+  ///
+  void runAtExitHandlers();
+
+  static void Register() {
+    InterpCtor = create;
+  }
+  
+  /// create - Create an interpreter ExecutionEngine. This can never fail.
+  ///
+  static ExecutionEngine *create(Module *M, std::string *ErrorStr = 0);
+
+  /// run - Start execution with the specified function and arguments.
+  ///
+  virtual GenericValue runFunction(Function *F,
+                                   const std::vector<GenericValue> &ArgValues);
+
+  virtual void *getPointerToNamedFunction(const std::string &Name,
+                                          bool AbortOnFailure = true) {
+    // FIXME: not implemented.
+    return 0;
+  }
+
+  /// recompileAndRelinkFunction - For the interpreter, functions are always
+  /// up-to-date.
+  ///
+  virtual void *recompileAndRelinkFunction(Function *F) {
+    return getPointerToFunction(F);
+  }
+
+  /// freeMachineCodeForFunction - The interpreter does not generate any code.
+  ///
+  void freeMachineCodeForFunction(Function *F) { }
+
+  // Methods used to execute code:
+  // Place a call on the stack
+  void callFunction(Function *F, const std::vector<GenericValue> &ArgVals);
+  void run();                // Execute instructions until nothing left to do
+
+  // Opcode Implementations
+  void visitReturnInst(ReturnInst &I);
+  void visitBranchInst(BranchInst &I);
+  void visitSwitchInst(SwitchInst &I);
+  void visitIndirectBrInst(IndirectBrInst &I);
+
+  void visitBinaryOperator(BinaryOperator &I);
+  void visitICmpInst(ICmpInst &I);
+  void visitFCmpInst(FCmpInst &I);
+  void visitAllocaInst(AllocaInst &I);
+  void visitLoadInst(LoadInst &I);
+  void visitStoreInst(StoreInst &I);
+  void visitGetElementPtrInst(GetElementPtrInst &I);
+  void visitPHINode(PHINode &PN) { 
+    llvm_unreachable("PHI nodes already handled!"); 
+  }
+  void visitTruncInst(TruncInst &I);
+  void visitZExtInst(ZExtInst &I);
+  void visitSExtInst(SExtInst &I);
+  void visitFPTruncInst(FPTruncInst &I);
+  void visitFPExtInst(FPExtInst &I);
+  void visitUIToFPInst(UIToFPInst &I);
+  void visitSIToFPInst(SIToFPInst &I);
+  void visitFPToUIInst(FPToUIInst &I);
+  void visitFPToSIInst(FPToSIInst &I);
+  void visitPtrToIntInst(PtrToIntInst &I);
+  void visitIntToPtrInst(IntToPtrInst &I);
+  void visitBitCastInst(BitCastInst &I);
+  void visitSelectInst(SelectInst &I);
+
+
+  void visitCallSite(CallSite CS);
+  void visitCallInst(CallInst &I) { visitCallSite (CallSite (&I)); }
+  void visitInvokeInst(InvokeInst &I) { visitCallSite (CallSite (&I)); }
+  void visitUnreachableInst(UnreachableInst &I);
+
+  void visitShl(BinaryOperator &I);
+  void visitLShr(BinaryOperator &I);
+  void visitAShr(BinaryOperator &I);
+
+  void visitVAArgInst(VAArgInst &I);
+  void visitExtractElementInst(ExtractElementInst &I);
+  void visitInstruction(Instruction &I) {
+    errs() << I << "\n";
+    llvm_unreachable("Instruction not interpretable yet!");
+  }
+
+  GenericValue callExternalFunction(Function *F,
+                                    const std::vector<GenericValue> &ArgVals);
+  void exitCalled(GenericValue GV);
+
+  void addAtExitHandler(Function *F) {
+    AtExitHandlers.push_back(F);
+  }
+
+  GenericValue *getFirstVarArg () {
+    return &(ECStack.back ().VarArgs[0]);
+  }
+
+private:  // Helper functions
+  GenericValue executeGEPOperation(Value *Ptr, gep_type_iterator I,
+                                   gep_type_iterator E, ExecutionContext &SF);
+
+  // SwitchToNewBasicBlock - Start execution in a new basic block and run any
+  // PHI nodes in the top of the block.  This is used for intraprocedural
+  // control flow.
+  //
+  void SwitchToNewBasicBlock(BasicBlock *Dest, ExecutionContext &SF);
+
+  void *getPointerToFunction(Function *F) { return (void*)F; }
+  void *getPointerToBasicBlock(BasicBlock *BB) { return (void*)BB; }
+
+  void initializeExecutionEngine() { }
+  void initializeExternalFunctions();
+  GenericValue getConstantExprValue(ConstantExpr *CE, ExecutionContext &SF);
+  GenericValue getOperandValue(Value *V, ExecutionContext &SF);
+  GenericValue executeTruncInst(Value *SrcVal, Type *DstTy,
+                                ExecutionContext &SF);
+  GenericValue executeSExtInst(Value *SrcVal, Type *DstTy,
+                               ExecutionContext &SF);
+  GenericValue executeZExtInst(Value *SrcVal, Type *DstTy,
+                               ExecutionContext &SF);
+  GenericValue executeFPTruncInst(Value *SrcVal, Type *DstTy,
+                                  ExecutionContext &SF);
+  GenericValue executeFPExtInst(Value *SrcVal, Type *DstTy,
+                                ExecutionContext &SF);
+  GenericValue executeFPToUIInst(Value *SrcVal, Type *DstTy,
+                                 ExecutionContext &SF);
+  GenericValue executeFPToSIInst(Value *SrcVal, Type *DstTy,
+                                 ExecutionContext &SF);
+  GenericValue executeUIToFPInst(Value *SrcVal, Type *DstTy,
+                                 ExecutionContext &SF);
+  GenericValue executeSIToFPInst(Value *SrcVal, Type *DstTy,
+                                 ExecutionContext &SF);
+  GenericValue executePtrToIntInst(Value *SrcVal, Type *DstTy,
+                                   ExecutionContext &SF);
+  GenericValue executeIntToPtrInst(Value *SrcVal, Type *DstTy,
+                                   ExecutionContext &SF);
+  GenericValue executeBitCastInst(Value *SrcVal, Type *DstTy,
+                                  ExecutionContext &SF);
+  GenericValue executeCastOperation(Instruction::CastOps opcode, Value *SrcVal, 
+                                    Type *Ty, ExecutionContext &SF);
+  void popStackAndReturnValueToCaller(Type *RetTy, GenericValue Result);
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/ExecutionEngine/JIT/JIT.cpp b/contrib/llvm/lib/ExecutionEngine/JIT/JIT.cpp
new file mode 100644
index 000000000000..53ea0a260087
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/JIT/JIT.cpp
@@ -0,0 +1,848 @@
+//===-- JIT.cpp - LLVM Just in Time Compiler ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This tool implements a just-in-time compiler for LLVM, allowing direct
+// execution of LLVM bitcode in an efficient manner.
+//
+//===----------------------------------------------------------------------===//
+
+#include "JIT.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineCodeInfo.h"
+#include "llvm/Config/config.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#ifdef __APPLE__
+// Apple gcc defaults to -fuse-cxa-atexit (i.e. calls __cxa_atexit instead
+// of atexit). It passes the address of linker generated symbol __dso_handle
+// to the function.
+// This configuration change happened at version 5330.
+# include <AvailabilityMacros.h>
+# if defined(MAC_OS_X_VERSION_10_4) && \
+     ((MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_4) || \
+      (MAC_OS_X_VERSION_MIN_REQUIRED == MAC_OS_X_VERSION_10_4 && \
+       __APPLE_CC__ >= 5330))
+#  ifndef HAVE___DSO_HANDLE
+#   define HAVE___DSO_HANDLE 1
+#  endif
+# endif
+#endif
+
+#if HAVE___DSO_HANDLE
+extern void *__dso_handle __attribute__ ((__visibility__ ("hidden")));
+#endif
+
+namespace {
+
+static struct RegisterJIT {
+  RegisterJIT() { JIT::Register(); }
+} JITRegistrator;
+
+}
+
+extern "C" void LLVMLinkInJIT() {
+}
+
+// Determine whether we can register EH tables.
+#if (defined(__GNUC__) && !defined(__ARM_EABI__) && \
+     !defined(__USING_SJLJ_EXCEPTIONS__))
+#define HAVE_EHTABLE_SUPPORT 1
+#else
+#define HAVE_EHTABLE_SUPPORT 0
+#endif
+
+#if HAVE_EHTABLE_SUPPORT
+
+// libgcc defines the __register_frame function to dynamically register new
+// dwarf frames for exception handling. This functionality is not portable
+// across compilers and is only provided by GCC. We use the __register_frame
+// function here so that code generated by the JIT cooperates with the unwinding
+// runtime of libgcc. When JITting with exception handling enable, LLVM
+// generates dwarf frames and registers it to libgcc with __register_frame.
+//
+// The __register_frame function works with Linux.
+//
+// Unfortunately, this functionality seems to be in libgcc after the unwinding
+// library of libgcc for darwin was written. The code for darwin overwrites the
+// value updated by __register_frame with a value fetched with "keymgr".
+// "keymgr" is an obsolete functionality, which should be rewritten some day.
+// In the meantime, since "keymgr" is on all libgccs shipped with apple-gcc, we
+// need a workaround in LLVM which uses the "keymgr" to dynamically modify the
+// values of an opaque key, used by libgcc to find dwarf tables.
+
+extern "C" void __register_frame(void*);
+extern "C" void __deregister_frame(void*);
+
+#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED <= 1050
+# define USE_KEYMGR 1
+#else
+# define USE_KEYMGR 0
+#endif
+
+#if USE_KEYMGR
+
+namespace {
+
+// LibgccObject - This is the structure defined in libgcc. There is no #include
+// provided for this structure, so we also define it here. libgcc calls it
+// "struct object". The structure is undocumented in libgcc.
+struct LibgccObject {
+  void *unused1;
+  void *unused2;
+  void *unused3;
+
+  /// frame - Pointer to the exception table.
+  void *frame;
+
+  /// encoding -  The encoding of the object?
+  union {
+    struct {
+      unsigned long sorted : 1;
+      unsigned long from_array : 1;
+      unsigned long mixed_encoding : 1;
+      unsigned long encoding : 8;
+      unsigned long count : 21;
+    } b;
+    size_t i;
+  } encoding;
+
+  /// fde_end - libgcc defines this field only if some macro is defined. We
+  /// include this field even if it may not there, to make libgcc happy.
+  char *fde_end;
+
+  /// next - At least we know it's a chained list!
+  struct LibgccObject *next;
+};
+
+// "kemgr" stuff. Apparently, all frame tables are stored there.
+extern "C" void _keymgr_set_and_unlock_processwide_ptr(int, void *);
+extern "C" void *_keymgr_get_and_lock_processwide_ptr(int);
+#define KEYMGR_GCC3_DW2_OBJ_LIST        302     /* Dwarf2 object list  */
+
+/// LibgccObjectInfo - libgcc defines this struct as km_object_info. It
+/// probably contains all dwarf tables that are loaded.
+struct LibgccObjectInfo {
+
+  /// seenObjects - LibgccObjects already parsed by the unwinding runtime.
+  ///
+  struct LibgccObject* seenObjects;
+
+  /// unseenObjects - LibgccObjects not parsed yet by the unwinding runtime.
+  ///
+  struct LibgccObject* unseenObjects;
+
+  unsigned unused[2];
+};
+
+/// darwin_register_frame - Since __register_frame does not work with darwin's
+/// libgcc,we provide our own function, which "tricks" libgcc by modifying the
+/// "Dwarf2 object list" key.
+void DarwinRegisterFrame(void* FrameBegin) {
+  // Get the key.
+  LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
+    _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
+  assert(LOI && "This should be preallocated by the runtime");
+
+  // Allocate a new LibgccObject to represent this frame. Deallocation of this
+  // object may be impossible: since darwin code in libgcc was written after
+  // the ability to dynamically register frames, things may crash if we
+  // deallocate it.
+  struct LibgccObject* ob = (struct LibgccObject*)
+    malloc(sizeof(struct LibgccObject));
+
+  // Do like libgcc for the values of the field.
+  ob->unused1 = (void *)-1;
+  ob->unused2 = 0;
+  ob->unused3 = 0;
+  ob->frame = FrameBegin;
+  ob->encoding.i = 0;
+  ob->encoding.b.encoding = llvm::dwarf::DW_EH_PE_omit;
+
+  // Put the info on both places, as libgcc uses the first or the second
+  // field. Note that we rely on having two pointers here. If fde_end was a
+  // char, things would get complicated.
+  ob->fde_end = (char*)LOI->unseenObjects;
+  ob->next = LOI->unseenObjects;
+
+  // Update the key's unseenObjects list.
+  LOI->unseenObjects = ob;
+
+  // Finally update the "key". Apparently, libgcc requires it.
+  _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST,
+                                         LOI);
+
+}
+
+}
+#endif // __APPLE__
+#endif // HAVE_EHTABLE_SUPPORT
+
+/// createJIT - This is the factory method for creating a JIT for the current
+/// machine, it does not fall back to the interpreter.  This takes ownership
+/// of the module.
+ExecutionEngine *JIT::createJIT(Module *M,
+                                std::string *ErrorStr,
+                                JITMemoryManager *JMM,
+                                bool GVsWithCode,
+                                TargetMachine *TM) {
+  // Try to register the program as a source of symbols to resolve against.
+  //
+  // FIXME: Don't do this here.
+  sys::DynamicLibrary::LoadLibraryPermanently(0, NULL);
+
+  // If the target supports JIT code generation, create the JIT.
+  if (TargetJITInfo *TJ = TM->getJITInfo()) {
+    return new JIT(M, *TM, *TJ, JMM, GVsWithCode);
+  } else {
+    if (ErrorStr)
+      *ErrorStr = "target does not support JIT code generation";
+    return 0;
+  }
+}
+
+namespace {
+/// This class supports the global getPointerToNamedFunction(), which allows
+/// bugpoint or gdb users to search for a function by name without any context.
+class JitPool {
+  SmallPtrSet<JIT*, 1> JITs;  // Optimize for process containing just 1 JIT.
+  mutable sys::Mutex Lock;
+public:
+  void Add(JIT *jit) {
+    MutexGuard guard(Lock);
+    JITs.insert(jit);
+  }
+  void Remove(JIT *jit) {
+    MutexGuard guard(Lock);
+    JITs.erase(jit);
+  }
+  void *getPointerToNamedFunction(const char *Name) const {
+    MutexGuard guard(Lock);
+    assert(JITs.size() != 0 && "No Jit registered");
+    //search function in every instance of JIT
+    for (SmallPtrSet<JIT*, 1>::const_iterator Jit = JITs.begin(),
+           end = JITs.end();
+         Jit != end; ++Jit) {
+      if (Function *F = (*Jit)->FindFunctionNamed(Name))
+        return (*Jit)->getPointerToFunction(F);
+    }
+    // The function is not available : fallback on the first created (will
+    // search in symbol of the current program/library)
+    return (*JITs.begin())->getPointerToNamedFunction(Name);
+  }
+};
+ManagedStatic<JitPool> AllJits;
+}
+extern "C" {
+  // getPointerToNamedFunction - This function is used as a global wrapper to
+  // JIT::getPointerToNamedFunction for the purpose of resolving symbols when
+  // bugpoint is debugging the JIT. In that scenario, we are loading an .so and
+  // need to resolve function(s) that are being mis-codegenerated, so we need to
+  // resolve their addresses at runtime, and this is the way to do it.
+  void *getPointerToNamedFunction(const char *Name) {
+    return AllJits->getPointerToNamedFunction(Name);
+  }
+}
+
+JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
+         JITMemoryManager *jmm, bool GVsWithCode)
+  : ExecutionEngine(M), TM(tm), TJI(tji),
+    JMM(jmm ? jmm : JITMemoryManager::CreateDefaultMemManager()),
+    AllocateGVsWithCode(GVsWithCode), isAlreadyCodeGenerating(false) {
+  setDataLayout(TM.getDataLayout());
+
+  jitstate = new JITState(M);
+
+  // Initialize JCE
+  JCE = createEmitter(*this, JMM, TM);
+
+  // Register in global list of all JITs.
+  AllJits->Add(this);
+
+  // Add target data
+  MutexGuard locked(lock);
+  FunctionPassManager &PM = jitstate->getPM(locked);
+  PM.add(new DataLayout(*TM.getDataLayout()));
+
+  // Turn the machine code intermediate representation into bytes in memory that
+  // may be executed.
+  if (TM.addPassesToEmitMachineCode(PM, *JCE)) {
+    report_fatal_error("Target does not support machine code emission!");
+  }
+
+  // Register routine for informing unwinding runtime about new EH frames
+#if HAVE_EHTABLE_SUPPORT
+#if USE_KEYMGR
+  struct LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
+    _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
+
+  // The key is created on demand, and libgcc creates it the first time an
+  // exception occurs. Since we need the key to register frames, we create
+  // it now.
+  if (!LOI)
+    LOI = (LibgccObjectInfo*)calloc(sizeof(struct LibgccObjectInfo), 1);
+  _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, LOI);
+  InstallExceptionTableRegister(DarwinRegisterFrame);
+  // Not sure about how to deregister on Darwin.
+#else
+  InstallExceptionTableRegister(__register_frame);
+  InstallExceptionTableDeregister(__deregister_frame);
+#endif // __APPLE__
+#endif // HAVE_EHTABLE_SUPPORT
+
+  // Initialize passes.
+  PM.doInitialization();
+}
+
+JIT::~JIT() {
+  // Unregister all exception tables registered by this JIT.
+  DeregisterAllTables();
+  // Cleanup.
+  AllJits->Remove(this);
+  delete jitstate;
+  delete JCE;
+  // JMM is a ownership of JCE, so we no need delete JMM here.
+  delete &TM;
+}
+
+/// addModule - Add a new Module to the JIT.  If we previously removed the last
+/// Module, we need re-initialize jitstate with a valid Module.
+void JIT::addModule(Module *M) {
+  MutexGuard locked(lock);
+
+  if (Modules.empty()) {
+    assert(!jitstate && "jitstate should be NULL if Modules vector is empty!");
+
+    jitstate = new JITState(M);
+
+    FunctionPassManager &PM = jitstate->getPM(locked);
+    PM.add(new DataLayout(*TM.getDataLayout()));
+
+    // Turn the machine code intermediate representation into bytes in memory
+    // that may be executed.
+    if (TM.addPassesToEmitMachineCode(PM, *JCE)) {
+      report_fatal_error("Target does not support machine code emission!");
+    }
+
+    // Initialize passes.
+    PM.doInitialization();
+  }
+
+  ExecutionEngine::addModule(M);
+}
+
+/// removeModule - If we are removing the last Module, invalidate the jitstate
+/// since the PassManager it contains references a released Module.
+bool JIT::removeModule(Module *M) {
+  bool result = ExecutionEngine::removeModule(M);
+
+  MutexGuard locked(lock);
+
+  if (jitstate && jitstate->getModule() == M) {
+    delete jitstate;
+    jitstate = 0;
+  }
+
+  if (!jitstate && !Modules.empty()) {
+    jitstate = new JITState(Modules[0]);
+
+    FunctionPassManager &PM = jitstate->getPM(locked);
+    PM.add(new DataLayout(*TM.getDataLayout()));
+
+    // Turn the machine code intermediate representation into bytes in memory
+    // that may be executed.
+    if (TM.addPassesToEmitMachineCode(PM, *JCE)) {
+      report_fatal_error("Target does not support machine code emission!");
+    }
+
+    // Initialize passes.
+    PM.doInitialization();
+  }
+  return result;
+}
+
+/// run - Start execution with the specified function and arguments.
+///
+GenericValue JIT::runFunction(Function *F,
+                              const std::vector<GenericValue> &ArgValues) {
+  assert(F && "Function *F was null at entry to run()");
+
+  void *FPtr = getPointerToFunction(F);
+  assert(FPtr && "Pointer to fn's code was null after getPointerToFunction");
+  FunctionType *FTy = F->getFunctionType();
+  Type *RetTy = FTy->getReturnType();
+
+  assert((FTy->getNumParams() == ArgValues.size() ||
+          (FTy->isVarArg() && FTy->getNumParams() <= ArgValues.size())) &&
+         "Wrong number of arguments passed into function!");
+  assert(FTy->getNumParams() == ArgValues.size() &&
+         "This doesn't support passing arguments through varargs (yet)!");
+
+  // Handle some common cases first.  These cases correspond to common `main'
+  // prototypes.
+  if (RetTy->isIntegerTy(32) || RetTy->isVoidTy()) {
+    switch (ArgValues.size()) {
+    case 3:
+      if (FTy->getParamType(0)->isIntegerTy(32) &&
+          FTy->getParamType(1)->isPointerTy() &&
+          FTy->getParamType(2)->isPointerTy()) {
+        int (*PF)(int, char **, const char **) =
+          (int(*)(int, char **, const char **))(intptr_t)FPtr;
+
+        // Call the function.
+        GenericValue rv;
+        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
+                                 (char **)GVTOP(ArgValues[1]),
+                                 (const char **)GVTOP(ArgValues[2])));
+        return rv;
+      }
+      break;
+    case 2:
+      if (FTy->getParamType(0)->isIntegerTy(32) &&
+          FTy->getParamType(1)->isPointerTy()) {
+        int (*PF)(int, char **) = (int(*)(int, char **))(intptr_t)FPtr;
+
+        // Call the function.
+        GenericValue rv;
+        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
+                                 (char **)GVTOP(ArgValues[1])));
+        return rv;
+      }
+      break;
+    case 1:
+      if (FTy->getParamType(0)->isIntegerTy(32)) {
+        GenericValue rv;
+        int (*PF)(int) = (int(*)(int))(intptr_t)FPtr;
+        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue()));
+        return rv;
+      }
+      if (FTy->getParamType(0)->isPointerTy()) {
+        GenericValue rv;
+        int (*PF)(char *) = (int(*)(char *))(intptr_t)FPtr;
+        rv.IntVal = APInt(32, PF((char*)GVTOP(ArgValues[0])));
+        return rv;
+      }
+      break;
+    }
+  }
+
+  // Handle cases where no arguments are passed first.
+  if (ArgValues.empty()) {
+    GenericValue rv;
+    switch (RetTy->getTypeID()) {
+    default: llvm_unreachable("Unknown return type for function call!");
+    case Type::IntegerTyID: {
+      unsigned BitWidth = cast<IntegerType>(RetTy)->getBitWidth();
+      if (BitWidth == 1)
+        rv.IntVal = APInt(BitWidth, ((bool(*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 8)
+        rv.IntVal = APInt(BitWidth, ((char(*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 16)
+        rv.IntVal = APInt(BitWidth, ((short(*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 32)
+        rv.IntVal = APInt(BitWidth, ((int(*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 64)
+        rv.IntVal = APInt(BitWidth, ((int64_t(*)())(intptr_t)FPtr)());
+      else
+        llvm_unreachable("Integer types > 64 bits not supported");
+      return rv;
+    }
+    case Type::VoidTyID:
+      rv.IntVal = APInt(32, ((int(*)())(intptr_t)FPtr)());
+      return rv;
+    case Type::FloatTyID:
+      rv.FloatVal = ((float(*)())(intptr_t)FPtr)();
+      return rv;
+    case Type::DoubleTyID:
+      rv.DoubleVal = ((double(*)())(intptr_t)FPtr)();
+      return rv;
+    case Type::X86_FP80TyID:
+    case Type::FP128TyID:
+    case Type::PPC_FP128TyID:
+      llvm_unreachable("long double not supported yet");
+    case Type::PointerTyID:
+      return PTOGV(((void*(*)())(intptr_t)FPtr)());
+    }
+  }
+
+  // Okay, this is not one of our quick and easy cases.  Because we don't have a
+  // full FFI, we have to codegen a nullary stub function that just calls the
+  // function we are interested in, passing in constants for all of the
+  // arguments.  Make this function and return.
+
+  // First, create the function.
+  FunctionType *STy=FunctionType::get(RetTy, false);
+  Function *Stub = Function::Create(STy, Function::InternalLinkage, "",
+                                    F->getParent());
+
+  // Insert a basic block.
+  BasicBlock *StubBB = BasicBlock::Create(F->getContext(), "", Stub);
+
+  // Convert all of the GenericValue arguments over to constants.  Note that we
+  // currently don't support varargs.
+  SmallVector<Value*, 8> Args;
+  for (unsigned i = 0, e = ArgValues.size(); i != e; ++i) {
+    Constant *C = 0;
+    Type *ArgTy = FTy->getParamType(i);
+    const GenericValue &AV = ArgValues[i];
+    switch (ArgTy->getTypeID()) {
+    default: llvm_unreachable("Unknown argument type for function call!");
+    case Type::IntegerTyID:
+        C = ConstantInt::get(F->getContext(), AV.IntVal);
+        break;
+    case Type::FloatTyID:
+        C = ConstantFP::get(F->getContext(), APFloat(AV.FloatVal));
+        break;
+    case Type::DoubleTyID:
+        C = ConstantFP::get(F->getContext(), APFloat(AV.DoubleVal));
+        break;
+    case Type::PPC_FP128TyID:
+    case Type::X86_FP80TyID:
+    case Type::FP128TyID:
+        C = ConstantFP::get(F->getContext(), APFloat(ArgTy->getFltSemantics(),
+                                                     AV.IntVal));
+        break;
+    case Type::PointerTyID:
+      void *ArgPtr = GVTOP(AV);
+      if (sizeof(void*) == 4)
+        C = ConstantInt::get(Type::getInt32Ty(F->getContext()),
+                             (int)(intptr_t)ArgPtr);
+      else
+        C = ConstantInt::get(Type::getInt64Ty(F->getContext()),
+                             (intptr_t)ArgPtr);
+      // Cast the integer to pointer
+      C = ConstantExpr::getIntToPtr(C, ArgTy);
+      break;
+    }
+    Args.push_back(C);
+  }
+
+  CallInst *TheCall = CallInst::Create(F, Args, "", StubBB);
+  TheCall->setCallingConv(F->getCallingConv());
+  TheCall->setTailCall();
+  if (!TheCall->getType()->isVoidTy())
+    // Return result of the call.
+    ReturnInst::Create(F->getContext(), TheCall, StubBB);
+  else
+    ReturnInst::Create(F->getContext(), StubBB);           // Just return void.
+
+  // Finally, call our nullary stub function.
+  GenericValue Result = runFunction(Stub, std::vector<GenericValue>());
+  // Erase it, since no other function can have a reference to it.
+  Stub->eraseFromParent();
+  // And return the result.
+  return Result;
+}
+
+void JIT::RegisterJITEventListener(JITEventListener *L) {
+  if (L == NULL)
+    return;
+  MutexGuard locked(lock);
+  EventListeners.push_back(L);
+}
+void JIT::UnregisterJITEventListener(JITEventListener *L) {
+  if (L == NULL)
+    return;
+  MutexGuard locked(lock);
+  std::vector<JITEventListener*>::reverse_iterator I=
+      std::find(EventListeners.rbegin(), EventListeners.rend(), L);
+  if (I != EventListeners.rend()) {
+    std::swap(*I, EventListeners.back());
+    EventListeners.pop_back();
+  }
+}
+void JIT::NotifyFunctionEmitted(
+    const Function &F,
+    void *Code, size_t Size,
+    const JITEvent_EmittedFunctionDetails &Details) {
+  MutexGuard locked(lock);
+  for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
+    EventListeners[I]->NotifyFunctionEmitted(F, Code, Size, Details);
+  }
+}
+
+void JIT::NotifyFreeingMachineCode(void *OldPtr) {
+  MutexGuard locked(lock);
+  for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
+    EventListeners[I]->NotifyFreeingMachineCode(OldPtr);
+  }
+}
+
+/// runJITOnFunction - Run the FunctionPassManager full of
+/// just-in-time compilation passes on F, hopefully filling in
+/// GlobalAddress[F] with the address of F's machine code.
+///
+void JIT::runJITOnFunction(Function *F, MachineCodeInfo *MCI) {
+  MutexGuard locked(lock);
+
+  class MCIListener : public JITEventListener {
+    MachineCodeInfo *const MCI;
+   public:
+    MCIListener(MachineCodeInfo *mci) : MCI(mci) {}
+    virtual void NotifyFunctionEmitted(const Function &,
+                                       void *Code, size_t Size,
+                                       const EmittedFunctionDetails &) {
+      MCI->setAddress(Code);
+      MCI->setSize(Size);
+    }
+  };
+  MCIListener MCIL(MCI);
+  if (MCI)
+    RegisterJITEventListener(&MCIL);
+
+  runJITOnFunctionUnlocked(F, locked);
+
+  if (MCI)
+    UnregisterJITEventListener(&MCIL);
+}
+
+void JIT::runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked) {
+  assert(!isAlreadyCodeGenerating && "Error: Recursive compilation detected!");
+
+  jitTheFunction(F, locked);
+
+  // If the function referred to another function that had not yet been
+  // read from bitcode, and we are jitting non-lazily, emit it now.
+  while (!jitstate->getPendingFunctions(locked).empty()) {
+    Function *PF = jitstate->getPendingFunctions(locked).back();
+    jitstate->getPendingFunctions(locked).pop_back();
+
+    assert(!PF->hasAvailableExternallyLinkage() &&
+           "Externally-defined function should not be in pending list.");
+
+    jitTheFunction(PF, locked);
+
+    // Now that the function has been jitted, ask the JITEmitter to rewrite
+    // the stub with real address of the function.
+    updateFunctionStub(PF);
+  }
+}
+
+void JIT::jitTheFunction(Function *F, const MutexGuard &locked) {
+  isAlreadyCodeGenerating = true;
+  jitstate->getPM(locked).run(*F);
+  isAlreadyCodeGenerating = false;
+
+  // clear basic block addresses after this function is done
+  getBasicBlockAddressMap(locked).clear();
+}
+
+/// getPointerToFunction - This method is used to get the address of the
+/// specified function, compiling it if necessary.
+///
+void *JIT::getPointerToFunction(Function *F) {
+
+  if (void *Addr = getPointerToGlobalIfAvailable(F))
+    return Addr;   // Check if function already code gen'd
+
+  MutexGuard locked(lock);
+
+  // Now that this thread owns the lock, make sure we read in the function if it
+  // exists in this Module.
+  std::string ErrorMsg;
+  if (F->Materialize(&ErrorMsg)) {
+    report_fatal_error("Error reading function '" + F->getName()+
+                      "' from bitcode file: " + ErrorMsg);
+  }
+
+  // ... and check if another thread has already code gen'd the function.
+  if (void *Addr = getPointerToGlobalIfAvailable(F))
+    return Addr;
+
+  if (F->isDeclaration() || F->hasAvailableExternallyLinkage()) {
+    bool AbortOnFailure = !F->hasExternalWeakLinkage();
+    void *Addr = getPointerToNamedFunction(F->getName(), AbortOnFailure);
+    addGlobalMapping(F, Addr);
+    return Addr;
+  }
+
+  runJITOnFunctionUnlocked(F, locked);
+
+  void *Addr = getPointerToGlobalIfAvailable(F);
+  assert(Addr && "Code generation didn't add function to GlobalAddress table!");
+  return Addr;
+}
+
+void JIT::addPointerToBasicBlock(const BasicBlock *BB, void *Addr) {
+  MutexGuard locked(lock);
+
+  BasicBlockAddressMapTy::iterator I =
+    getBasicBlockAddressMap(locked).find(BB);
+  if (I == getBasicBlockAddressMap(locked).end()) {
+    getBasicBlockAddressMap(locked)[BB] = Addr;
+  } else {
+    // ignore repeats: some BBs can be split into few MBBs?
+  }
+}
+
+void JIT::clearPointerToBasicBlock(const BasicBlock *BB) {
+  MutexGuard locked(lock);
+  getBasicBlockAddressMap(locked).erase(BB);
+}
+
+void *JIT::getPointerToBasicBlock(BasicBlock *BB) {
+  // make sure it's function is compiled by JIT
+  (void)getPointerToFunction(BB->getParent());
+
+  // resolve basic block address
+  MutexGuard locked(lock);
+
+  BasicBlockAddressMapTy::iterator I =
+    getBasicBlockAddressMap(locked).find(BB);
+  if (I != getBasicBlockAddressMap(locked).end()) {
+    return I->second;
+  } else {
+    llvm_unreachable("JIT does not have BB address for address-of-label, was"
+                     " it eliminated by optimizer?");
+  }
+}
+
+void *JIT::getPointerToNamedFunction(const std::string &Name,
+                                     bool AbortOnFailure){
+  if (!isSymbolSearchingDisabled()) {
+    void *ptr = JMM->getPointerToNamedFunction(Name, false);
+    if (ptr)
+      return ptr;
+  }
+
+  /// If a LazyFunctionCreator is installed, use it to get/create the function.
+  if (LazyFunctionCreator)
+    if (void *RP = LazyFunctionCreator(Name))
+      return RP;
+
+  if (AbortOnFailure) {
+    report_fatal_error("Program used external function '"+Name+
+                      "' which could not be resolved!");
+  }
+  return 0;
+}
+
+
+/// getOrEmitGlobalVariable - Return the address of the specified global
+/// variable, possibly emitting it to memory if needed.  This is used by the
+/// Emitter.
+void *JIT::getOrEmitGlobalVariable(const GlobalVariable *GV) {
+  MutexGuard locked(lock);
+
+  void *Ptr = getPointerToGlobalIfAvailable(GV);
+  if (Ptr) return Ptr;
+
+  // If the global is external, just remember the address.
+  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage()) {
+#if HAVE___DSO_HANDLE
+    if (GV->getName() == "__dso_handle")
+      return (void*)&__dso_handle;
+#endif
+    Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(GV->getName());
+    if (Ptr == 0) {
+      report_fatal_error("Could not resolve external global address: "
+                        +GV->getName());
+    }
+    addGlobalMapping(GV, Ptr);
+  } else {
+    // If the global hasn't been emitted to memory yet, allocate space and
+    // emit it into memory.
+    Ptr = getMemoryForGV(GV);
+    addGlobalMapping(GV, Ptr);
+    EmitGlobalVariable(GV);  // Initialize the variable.
+  }
+  return Ptr;
+}
+
+/// recompileAndRelinkFunction - This method is used to force a function
+/// which has already been compiled, to be compiled again, possibly
+/// after it has been modified. Then the entry to the old copy is overwritten
+/// with a branch to the new copy. If there was no old copy, this acts
+/// just like JIT::getPointerToFunction().
+///
+void *JIT::recompileAndRelinkFunction(Function *F) {
+  void *OldAddr = getPointerToGlobalIfAvailable(F);
+
+  // If it's not already compiled there is no reason to patch it up.
+  if (OldAddr == 0) { return getPointerToFunction(F); }
+
+  // Delete the old function mapping.
+  addGlobalMapping(F, 0);
+
+  // Recodegen the function
+  runJITOnFunction(F);
+
+  // Update state, forward the old function to the new function.
+  void *Addr = getPointerToGlobalIfAvailable(F);
+  assert(Addr && "Code generation didn't add function to GlobalAddress table!");
+  TJI.replaceMachineCodeForFunction(OldAddr, Addr);
+  return Addr;
+}
+
+/// getMemoryForGV - This method abstracts memory allocation of global
+/// variable so that the JIT can allocate thread local variables depending
+/// on the target.
+///
+char* JIT::getMemoryForGV(const GlobalVariable* GV) {
+  char *Ptr;
+
+  // GlobalVariable's which are not "constant" will cause trouble in a server
+  // situation. It's returned in the same block of memory as code which may
+  // not be writable.
+  if (isGVCompilationDisabled() && !GV->isConstant()) {
+    report_fatal_error("Compilation of non-internal GlobalValue is disabled!");
+  }
+
+  // Some applications require globals and code to live together, so they may
+  // be allocated into the same buffer, but in general globals are allocated
+  // through the memory manager which puts them near the code but not in the
+  // same buffer.
+  Type *GlobalType = GV->getType()->getElementType();
+  size_t S = getDataLayout()->getTypeAllocSize(GlobalType);
+  size_t A = getDataLayout()->getPreferredAlignment(GV);
+  if (GV->isThreadLocal()) {
+    MutexGuard locked(lock);
+    Ptr = TJI.allocateThreadLocalMemory(S);
+  } else if (TJI.allocateSeparateGVMemory()) {
+    if (A <= 8) {
+      Ptr = (char*)malloc(S);
+    } else {
+      // Allocate S+A bytes of memory, then use an aligned pointer within that
+      // space.
+      Ptr = (char*)malloc(S+A);
+      unsigned MisAligned = ((intptr_t)Ptr & (A-1));
+      Ptr = Ptr + (MisAligned ? (A-MisAligned) : 0);
+    }
+  } else if (AllocateGVsWithCode) {
+    Ptr = (char*)JCE->allocateSpace(S, A);
+  } else {
+    Ptr = (char*)JCE->allocateGlobal(S, A);
+  }
+  return Ptr;
+}
+
+void JIT::addPendingFunction(Function *F) {
+  MutexGuard locked(lock);
+  jitstate->getPendingFunctions(locked).push_back(F);
+}
+
+
+JITEventListener::~JITEventListener() {}
diff --git a/contrib/llvm/lib/ExecutionEngine/JIT/JIT.h b/contrib/llvm/lib/ExecutionEngine/JIT/JIT.h
new file mode 100644
index 000000000000..2ae155bebf40
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/JIT/JIT.h
@@ -0,0 +1,226 @@
+//===-- JIT.h - Class definition for the JIT --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the top-level JIT data structure.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef JIT_H
+#define JIT_H
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/ValueHandle.h"
+
+namespace llvm {
+
+class Function;
+struct JITEvent_EmittedFunctionDetails;
+class MachineCodeEmitter;
+class MachineCodeInfo;
+class TargetJITInfo;
+class TargetMachine;
+
+class JITState {
+private:
+  FunctionPassManager PM;  // Passes to compile a function
+  Module *M;               // Module used to create the PM
+
+  /// PendingFunctions - Functions which have not been code generated yet, but
+  /// were called from a function being code generated.
+  std::vector<AssertingVH<Function> > PendingFunctions;
+
+public:
+  explicit JITState(Module *M) : PM(M), M(M) {}
+
+  FunctionPassManager &getPM(const MutexGuard &L) {
+    return PM;
+  }
+
+  Module *getModule() const { return M; }
+  std::vector<AssertingVH<Function> > &getPendingFunctions(const MutexGuard &L){
+    return PendingFunctions;
+  }
+};
+
+
+class JIT : public ExecutionEngine {
+  /// types
+  typedef ValueMap<const BasicBlock *, void *>
+      BasicBlockAddressMapTy;
+  /// data
+  TargetMachine &TM;       // The current target we are compiling to
+  TargetJITInfo &TJI;      // The JITInfo for the target we are compiling to
+  JITCodeEmitter *JCE;     // JCE object
+  JITMemoryManager *JMM;
+  std::vector<JITEventListener*> EventListeners;
+
+  /// AllocateGVsWithCode - Some applications require that global variables and
+  /// code be allocated into the same region of memory, in which case this flag
+  /// should be set to true.  Doing so breaks freeMachineCodeForFunction.
+  bool AllocateGVsWithCode;
+
+  /// True while the JIT is generating code.  Used to assert against recursive
+  /// entry.
+  bool isAlreadyCodeGenerating;
+
+  JITState *jitstate;
+
+  /// BasicBlockAddressMap - A mapping between LLVM basic blocks and their
+  /// actualized version, only filled for basic blocks that have their address
+  /// taken.
+  BasicBlockAddressMapTy BasicBlockAddressMap;
+
+
+  JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
+      JITMemoryManager *JMM, bool AllocateGVsWithCode);
+public:
+  ~JIT();
+
+  static void Register() {
+    JITCtor = createJIT;
+  }
+
+  /// getJITInfo - Return the target JIT information structure.
+  ///
+  TargetJITInfo &getJITInfo() const { return TJI; }
+
+  /// create - Create an return a new JIT compiler if there is one available
+  /// for the current target.  Otherwise, return null.
+  ///
+  static ExecutionEngine *create(Module *M,
+                                 std::string *Err,
+                                 JITMemoryManager *JMM,
+                                 CodeGenOpt::Level OptLevel =
+                                   CodeGenOpt::Default,
+                                 bool GVsWithCode = true,
+                                 Reloc::Model RM = Reloc::Default,
+                                 CodeModel::Model CMM = CodeModel::JITDefault) {
+    return ExecutionEngine::createJIT(M, Err, JMM, OptLevel, GVsWithCode,
+                                      RM, CMM);
+  }
+
+  virtual void addModule(Module *M);
+
+  /// removeModule - Remove a Module from the list of modules.  Returns true if
+  /// M is found.
+  virtual bool removeModule(Module *M);
+
+  /// runFunction - Start execution with the specified function and arguments.
+  ///
+  virtual GenericValue runFunction(Function *F,
+                                   const std::vector<GenericValue> &ArgValues);
+
+  /// getPointerToNamedFunction - This method returns the address of the
+  /// specified function by using the MemoryManager. As such it is only
+  /// useful for resolving library symbols, not code generated symbols.
+  ///
+  /// If AbortOnFailure is false and no function with the given name is
+  /// found, this function silently returns a null pointer. Otherwise,
+  /// it prints a message to stderr and aborts.
+  ///
+  virtual void *getPointerToNamedFunction(const std::string &Name,
+                                          bool AbortOnFailure = true);
+
+  // CompilationCallback - Invoked the first time that a call site is found,
+  // which causes lazy compilation of the target function.
+  //
+  static void CompilationCallback();
+
+  /// getPointerToFunction - This returns the address of the specified function,
+  /// compiling it if necessary.
+  ///
+  void *getPointerToFunction(Function *F);
+
+  /// addPointerToBasicBlock - Adds address of the specific basic block.
+  void addPointerToBasicBlock(const BasicBlock *BB, void *Addr);
+
+  /// clearPointerToBasicBlock - Removes address of specific basic block.
+  void clearPointerToBasicBlock(const BasicBlock *BB);
+
+  /// getPointerToBasicBlock - This returns the address of the specified basic
+  /// block, assuming function is compiled.
+  void *getPointerToBasicBlock(BasicBlock *BB);
+
+  /// getOrEmitGlobalVariable - Return the address of the specified global
+  /// variable, possibly emitting it to memory if needed.  This is used by the
+  /// Emitter.
+  void *getOrEmitGlobalVariable(const GlobalVariable *GV);
+
+  /// getPointerToFunctionOrStub - If the specified function has been
+  /// code-gen'd, return a pointer to the function.  If not, compile it, or use
+  /// a stub to implement lazy compilation if available.
+  ///
+  void *getPointerToFunctionOrStub(Function *F);
+
+  /// recompileAndRelinkFunction - This method is used to force a function
+  /// which has already been compiled, to be compiled again, possibly
+  /// after it has been modified. Then the entry to the old copy is overwritten
+  /// with a branch to the new copy. If there was no old copy, this acts
+  /// just like JIT::getPointerToFunction().
+  ///
+  void *recompileAndRelinkFunction(Function *F);
+
+  /// freeMachineCodeForFunction - deallocate memory used to code-generate this
+  /// Function.
+  ///
+  void freeMachineCodeForFunction(Function *F);
+
+  /// addPendingFunction - while jitting non-lazily, a called but non-codegen'd
+  /// function was encountered.  Add it to a pending list to be processed after
+  /// the current function.
+  ///
+  void addPendingFunction(Function *F);
+
+  /// getCodeEmitter - Return the code emitter this JIT is emitting into.
+  ///
+  JITCodeEmitter *getCodeEmitter() const { return JCE; }
+
+  static ExecutionEngine *createJIT(Module *M,
+                                    std::string *ErrorStr,
+                                    JITMemoryManager *JMM,
+                                    bool GVsWithCode,
+                                    TargetMachine *TM);
+
+  // Run the JIT on F and return information about the generated code
+  void runJITOnFunction(Function *F, MachineCodeInfo *MCI = 0);
+
+  virtual void RegisterJITEventListener(JITEventListener *L);
+  virtual void UnregisterJITEventListener(JITEventListener *L);
+  /// These functions correspond to the methods on JITEventListener.  They
+  /// iterate over the registered listeners and call the corresponding method on
+  /// each.
+  void NotifyFunctionEmitted(
+      const Function &F, void *Code, size_t Size,
+      const JITEvent_EmittedFunctionDetails &Details);
+  void NotifyFreeingMachineCode(void *OldPtr);
+
+  BasicBlockAddressMapTy &
+  getBasicBlockAddressMap(const MutexGuard &) {
+    return BasicBlockAddressMap;
+  }
+
+
+private:
+  static JITCodeEmitter *createEmitter(JIT &J, JITMemoryManager *JMM,
+                                       TargetMachine &tm);
+  void runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked);
+  void updateFunctionStub(Function *F);
+  void jitTheFunction(Function *F, const MutexGuard &locked);
+
+protected:
+
+  /// getMemoryforGV - Allocate memory for a global variable.
+  virtual char* getMemoryForGV(const GlobalVariable* GV);
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp b/contrib/llvm/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
new file mode 100644
index 000000000000..35d2b8b1e9f2
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
@@ -0,0 +1,596 @@
+//===----- JITDwarfEmitter.cpp - Write dwarf tables into memory -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a JITDwarfEmitter object that is used by the JIT to
+// write dwarf tables to memory.
+//
+//===----------------------------------------------------------------------===//
+
+#include "JITDwarfEmitter.h"
+#include "JIT.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+JITDwarfEmitter::JITDwarfEmitter(JIT& theJit) : MMI(0), Jit(theJit) {}
+
+
+unsigned char* JITDwarfEmitter::EmitDwarfTable(MachineFunction& F,
+                                               JITCodeEmitter& jce,
+                                               unsigned char* StartFunction,
+                                               unsigned char* EndFunction,
+                                               unsigned char* &EHFramePtr) {
+  assert(MMI && "MachineModuleInfo not registered!");
+
+  const TargetMachine& TM = F.getTarget();
+  TD = TM.getDataLayout();
+  stackGrowthDirection = TM.getFrameLowering()->getStackGrowthDirection();
+  RI = TM.getRegisterInfo();
+  MAI = TM.getMCAsmInfo();
+  JCE = &jce;
+
+  unsigned char* ExceptionTable = EmitExceptionTable(&F, StartFunction,
+                                                     EndFunction);
+
+  unsigned char* Result = 0;
+
+  const std::vector<const Function *> Personalities = MMI->getPersonalities();
+  EHFramePtr = EmitCommonEHFrame(Personalities[MMI->getPersonalityIndex()]);
+
+  Result = EmitEHFrame(Personalities[MMI->getPersonalityIndex()], EHFramePtr,
+                       StartFunction, EndFunction, ExceptionTable);
+
+  return Result;
+}
+
+
+void
+JITDwarfEmitter::EmitFrameMoves(intptr_t BaseLabelPtr,
+                                const std::vector<MachineMove> &Moves) const {
+  unsigned PointerSize = TD->getPointerSize();
+  int stackGrowth = stackGrowthDirection == TargetFrameLowering::StackGrowsUp ?
+          PointerSize : -PointerSize;
+  MCSymbol *BaseLabel = 0;
+
+  for (unsigned i = 0, N = Moves.size(); i < N; ++i) {
+    const MachineMove &Move = Moves[i];
+    MCSymbol *Label = Move.getLabel();
+
+    // Throw out move if the label is invalid.
+    if (Label && (*JCE->getLabelLocations())[Label] == 0)
+      continue;
+
+    intptr_t LabelPtr = 0;
+    if (Label) LabelPtr = JCE->getLabelAddress(Label);
+
+    const MachineLocation &Dst = Move.getDestination();
+    const MachineLocation &Src = Move.getSource();
+
+    // Advance row if new location.
+    if (BaseLabelPtr && Label && BaseLabel != Label) {
+      JCE->emitByte(dwarf::DW_CFA_advance_loc4);
+      JCE->emitInt32(LabelPtr - BaseLabelPtr);
+
+      BaseLabel = Label;
+      BaseLabelPtr = LabelPtr;
+    }
+
+    // If advancing cfa.
+    if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
+      if (!Src.isReg()) {
+        if (Src.getReg() == MachineLocation::VirtualFP) {
+          JCE->emitByte(dwarf::DW_CFA_def_cfa_offset);
+        } else {
+          JCE->emitByte(dwarf::DW_CFA_def_cfa);
+          JCE->emitULEB128Bytes(RI->getDwarfRegNum(Src.getReg(), true));
+        }
+
+        JCE->emitULEB128Bytes(-Src.getOffset());
+      } else {
+        llvm_unreachable("Machine move not supported yet.");
+      }
+    } else if (Src.isReg() &&
+      Src.getReg() == MachineLocation::VirtualFP) {
+      if (Dst.isReg()) {
+        JCE->emitByte(dwarf::DW_CFA_def_cfa_register);
+        JCE->emitULEB128Bytes(RI->getDwarfRegNum(Dst.getReg(), true));
+      } else {
+        llvm_unreachable("Machine move not supported yet.");
+      }
+    } else {
+      unsigned Reg = RI->getDwarfRegNum(Src.getReg(), true);
+      int Offset = Dst.getOffset() / stackGrowth;
+
+      if (Offset < 0) {
+        JCE->emitByte(dwarf::DW_CFA_offset_extended_sf);
+        JCE->emitULEB128Bytes(Reg);
+        JCE->emitSLEB128Bytes(Offset);
+      } else if (Reg < 64) {
+        JCE->emitByte(dwarf::DW_CFA_offset + Reg);
+        JCE->emitULEB128Bytes(Offset);
+      } else {
+        JCE->emitByte(dwarf::DW_CFA_offset_extended);
+        JCE->emitULEB128Bytes(Reg);
+        JCE->emitULEB128Bytes(Offset);
+      }
+    }
+  }
+}
+
+/// SharedTypeIds - How many leading type ids two landing pads have in common.
+static unsigned SharedTypeIds(const LandingPadInfo *L,
+                              const LandingPadInfo *R) {
+  const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+  unsigned LSize = LIds.size(), RSize = RIds.size();
+  unsigned MinSize = LSize < RSize ? LSize : RSize;
+  unsigned Count = 0;
+
+  for (; Count != MinSize; ++Count)
+    if (LIds[Count] != RIds[Count])
+      return Count;
+
+  return Count;
+}
+
+
+/// PadLT - Order landing pads lexicographically by type id.
+static bool PadLT(const LandingPadInfo *L, const LandingPadInfo *R) {
+  const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+  unsigned LSize = LIds.size(), RSize = RIds.size();
+  unsigned MinSize = LSize < RSize ? LSize : RSize;
+
+  for (unsigned i = 0; i != MinSize; ++i)
+    if (LIds[i] != RIds[i])
+      return LIds[i] < RIds[i];
+
+  return LSize < RSize;
+}
+
+namespace {
+
+/// ActionEntry - Structure describing an entry in the actions table.
+struct ActionEntry {
+  int ValueForTypeID; // The value to write - may not be equal to the type id.
+  int NextAction;
+  struct ActionEntry *Previous;
+};
+
+/// PadRange - Structure holding a try-range and the associated landing pad.
+struct PadRange {
+  // The index of the landing pad.
+  unsigned PadIndex;
+  // The index of the begin and end labels in the landing pad's label lists.
+  unsigned RangeIndex;
+};
+
+typedef DenseMap<MCSymbol*, PadRange> RangeMapType;
+
+/// CallSiteEntry - Structure describing an entry in the call-site table.
+struct CallSiteEntry {
+  MCSymbol *BeginLabel; // zero indicates the start of the function.
+  MCSymbol *EndLabel;   // zero indicates the end of the function.
+  MCSymbol *PadLabel;   // zero indicates that there is no landing pad.
+  unsigned Action;
+};
+
+}
+
+unsigned char* JITDwarfEmitter::EmitExceptionTable(MachineFunction* MF,
+                                         unsigned char* StartFunction,
+                                         unsigned char* EndFunction) const {
+  assert(MMI && "MachineModuleInfo not registered!");
+
+  // Map all labels and get rid of any dead landing pads.
+  MMI->TidyLandingPads(JCE->getLabelLocations());
+
+  const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
+  const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+  const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
+  if (PadInfos.empty()) return 0;
+
+  // Sort the landing pads in order of their type ids.  This is used to fold
+  // duplicate actions.
+  SmallVector<const LandingPadInfo *, 64> LandingPads;
+  LandingPads.reserve(PadInfos.size());
+  for (unsigned i = 0, N = PadInfos.size(); i != N; ++i)
+    LandingPads.push_back(&PadInfos[i]);
+  std::sort(LandingPads.begin(), LandingPads.end(), PadLT);
+
+  // Negative type ids index into FilterIds, positive type ids index into
+  // TypeInfos.  The value written for a positive type id is just the type
+  // id itself.  For a negative type id, however, the value written is the
+  // (negative) byte offset of the corresponding FilterIds entry.  The byte
+  // offset is usually equal to the type id, because the FilterIds entries
+  // are written using a variable width encoding which outputs one byte per
+  // entry as long as the value written is not too large, but can differ.
+  // This kind of complication does not occur for positive type ids because
+  // type infos are output using a fixed width encoding.
+  // FilterOffsets[i] holds the byte offset corresponding to FilterIds[i].
+  SmallVector<int, 16> FilterOffsets;
+  FilterOffsets.reserve(FilterIds.size());
+  int Offset = -1;
+  for(std::vector<unsigned>::const_iterator I = FilterIds.begin(),
+    E = FilterIds.end(); I != E; ++I) {
+    FilterOffsets.push_back(Offset);
+    Offset -= MCAsmInfo::getULEB128Size(*I);
+  }
+
+  // Compute the actions table and gather the first action index for each
+  // landing pad site.
+  SmallVector<ActionEntry, 32> Actions;
+  SmallVector<unsigned, 64> FirstActions;
+  FirstActions.reserve(LandingPads.size());
+
+  int FirstAction = 0;
+  unsigned SizeActions = 0;
+  for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+    const LandingPadInfo *LP = LandingPads[i];
+    const std::vector<int> &TypeIds = LP->TypeIds;
+    const unsigned NumShared = i ? SharedTypeIds(LP, LandingPads[i-1]) : 0;
+    unsigned SizeSiteActions = 0;
+
+    if (NumShared < TypeIds.size()) {
+      unsigned SizeAction = 0;
+      ActionEntry *PrevAction = 0;
+
+      if (NumShared) {
+        const unsigned SizePrevIds = LandingPads[i-1]->TypeIds.size();
+        assert(Actions.size());
+        PrevAction = &Actions.back();
+        SizeAction = MCAsmInfo::getSLEB128Size(PrevAction->NextAction) +
+          MCAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
+        for (unsigned j = NumShared; j != SizePrevIds; ++j) {
+          SizeAction -= MCAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
+          SizeAction += -PrevAction->NextAction;
+          PrevAction = PrevAction->Previous;
+        }
+      }
+
+      // Compute the actions.
+      for (unsigned I = NumShared, M = TypeIds.size(); I != M; ++I) {
+        int TypeID = TypeIds[I];
+        assert(-1-TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
+        int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID;
+        unsigned SizeTypeID = MCAsmInfo::getSLEB128Size(ValueForTypeID);
+
+        int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0;
+        SizeAction = SizeTypeID + MCAsmInfo::getSLEB128Size(NextAction);
+        SizeSiteActions += SizeAction;
+
+        ActionEntry Action = {ValueForTypeID, NextAction, PrevAction};
+        Actions.push_back(Action);
+
+        PrevAction = &Actions.back();
+      }
+
+      // Record the first action of the landing pad site.
+      FirstAction = SizeActions + SizeSiteActions - SizeAction + 1;
+    } // else identical - re-use previous FirstAction
+
+    FirstActions.push_back(FirstAction);
+
+    // Compute this sites contribution to size.
+    SizeActions += SizeSiteActions;
+  }
+
+  // Compute the call-site table.  Entries must be ordered by address.
+  SmallVector<CallSiteEntry, 64> CallSites;
+
+  RangeMapType PadMap;
+  for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+    const LandingPadInfo *LandingPad = LandingPads[i];
+    for (unsigned j=0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
+      MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
+      assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
+      PadRange P = { i, j };
+      PadMap[BeginLabel] = P;
+    }
+  }
+
+  bool MayThrow = false;
+  MCSymbol *LastLabel = 0;
+  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+        I != E; ++I) {
+    for (MachineBasicBlock::const_iterator MI = I->begin(), E = I->end();
+          MI != E; ++MI) {
+      if (!MI->isLabel()) {
+        MayThrow |= MI->isCall();
+        continue;
+      }
+
+      MCSymbol *BeginLabel = MI->getOperand(0).getMCSymbol();
+      assert(BeginLabel && "Invalid label!");
+
+      if (BeginLabel == LastLabel)
+        MayThrow = false;
+
+      RangeMapType::iterator L = PadMap.find(BeginLabel);
+
+      if (L == PadMap.end())
+        continue;
+
+      PadRange P = L->second;
+      const LandingPadInfo *LandingPad = LandingPads[P.PadIndex];
+
+      assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] &&
+              "Inconsistent landing pad map!");
+
+      // If some instruction between the previous try-range and this one may
+      // throw, create a call-site entry with no landing pad for the region
+      // between the try-ranges.
+      if (MayThrow) {
+        CallSiteEntry Site = {LastLabel, BeginLabel, 0, 0};
+        CallSites.push_back(Site);
+      }
+
+      LastLabel = LandingPad->EndLabels[P.RangeIndex];
+      CallSiteEntry Site = {BeginLabel, LastLabel,
+        LandingPad->LandingPadLabel, FirstActions[P.PadIndex]};
+
+      assert(Site.BeginLabel && Site.EndLabel && Site.PadLabel &&
+              "Invalid landing pad!");
+
+      // Try to merge with the previous call-site.
+      if (CallSites.size()) {
+        CallSiteEntry &Prev = CallSites.back();
+        if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
+          // Extend the range of the previous entry.
+          Prev.EndLabel = Site.EndLabel;
+          continue;
+        }
+      }
+
+      // Otherwise, create a new call-site.
+      CallSites.push_back(Site);
+    }
+  }
+  // If some instruction between the previous try-range and the end of the
+  // function may throw, create a call-site entry with no landing pad for the
+  // region following the try-range.
+  if (MayThrow) {
+    CallSiteEntry Site = {LastLabel, 0, 0, 0};
+    CallSites.push_back(Site);
+  }
+
+  // Final tallies.
+  unsigned SizeSites = CallSites.size() * (sizeof(int32_t) + // Site start.
+                                            sizeof(int32_t) + // Site length.
+                                            sizeof(int32_t)); // Landing pad.
+  for (unsigned i = 0, e = CallSites.size(); i < e; ++i)
+    SizeSites += MCAsmInfo::getULEB128Size(CallSites[i].Action);
+
+  unsigned SizeTypes = TypeInfos.size() * TD->getPointerSize();
+
+  unsigned TypeOffset = sizeof(int8_t) + // Call site format
+                        // Call-site table length
+                        MCAsmInfo::getULEB128Size(SizeSites) +
+                        SizeSites + SizeActions + SizeTypes;
+
+  // Begin the exception table.
+  JCE->emitAlignmentWithFill(4, 0);
+  // Asm->EOL("Padding");
+
+  unsigned char* DwarfExceptionTable = (unsigned char*)JCE->getCurrentPCValue();
+
+  // Emit the header.
+  JCE->emitByte(dwarf::DW_EH_PE_omit);
+  // Asm->EOL("LPStart format (DW_EH_PE_omit)");
+  JCE->emitByte(dwarf::DW_EH_PE_absptr);
+  // Asm->EOL("TType format (DW_EH_PE_absptr)");
+  JCE->emitULEB128Bytes(TypeOffset);
+  // Asm->EOL("TType base offset");
+  JCE->emitByte(dwarf::DW_EH_PE_udata4);
+  // Asm->EOL("Call site format (DW_EH_PE_udata4)");
+  JCE->emitULEB128Bytes(SizeSites);
+  // Asm->EOL("Call-site table length");
+
+  // Emit the landing pad site information.
+  for (unsigned i = 0; i < CallSites.size(); ++i) {
+    CallSiteEntry &S = CallSites[i];
+    intptr_t BeginLabelPtr = 0;
+    intptr_t EndLabelPtr = 0;
+
+    if (!S.BeginLabel) {
+      BeginLabelPtr = (intptr_t)StartFunction;
+      JCE->emitInt32(0);
+    } else {
+      BeginLabelPtr = JCE->getLabelAddress(S.BeginLabel);
+      JCE->emitInt32(BeginLabelPtr - (intptr_t)StartFunction);
+    }
+
+    // Asm->EOL("Region start");
+
+    if (!S.EndLabel)
+      EndLabelPtr = (intptr_t)EndFunction;
+    else
+      EndLabelPtr = JCE->getLabelAddress(S.EndLabel);
+
+    JCE->emitInt32(EndLabelPtr - BeginLabelPtr);
+    //Asm->EOL("Region length");
+
+    if (!S.PadLabel) {
+      JCE->emitInt32(0);
+    } else {
+      unsigned PadLabelPtr = JCE->getLabelAddress(S.PadLabel);
+      JCE->emitInt32(PadLabelPtr - (intptr_t)StartFunction);
+    }
+    // Asm->EOL("Landing pad");
+
+    JCE->emitULEB128Bytes(S.Action);
+    // Asm->EOL("Action");
+  }
+
+  // Emit the actions.
+  for (unsigned I = 0, N = Actions.size(); I != N; ++I) {
+    ActionEntry &Action = Actions[I];
+
+    JCE->emitSLEB128Bytes(Action.ValueForTypeID);
+    //Asm->EOL("TypeInfo index");
+    JCE->emitSLEB128Bytes(Action.NextAction);
+    //Asm->EOL("Next action");
+  }
+
+  // Emit the type ids.
+  for (unsigned M = TypeInfos.size(); M; --M) {
+    const GlobalVariable *GV = TypeInfos[M - 1];
+
+    if (GV) {
+      if (TD->getPointerSize() == sizeof(int32_t))
+        JCE->emitInt32((intptr_t)Jit.getOrEmitGlobalVariable(GV));
+      else
+        JCE->emitInt64((intptr_t)Jit.getOrEmitGlobalVariable(GV));
+    } else {
+      if (TD->getPointerSize() == sizeof(int32_t))
+        JCE->emitInt32(0);
+      else
+        JCE->emitInt64(0);
+    }
+    // Asm->EOL("TypeInfo");
+  }
+
+  // Emit the filter typeids.
+  for (unsigned j = 0, M = FilterIds.size(); j < M; ++j) {
+    unsigned TypeID = FilterIds[j];
+    JCE->emitULEB128Bytes(TypeID);
+    //Asm->EOL("Filter TypeInfo index");
+  }
+
+  JCE->emitAlignmentWithFill(4, 0);
+
+  return DwarfExceptionTable;
+}
+
+unsigned char*
+JITDwarfEmitter::EmitCommonEHFrame(const Function* Personality) const {
+  unsigned PointerSize = TD->getPointerSize();
+  int stackGrowth = stackGrowthDirection == TargetFrameLowering::StackGrowsUp ?
+          PointerSize : -PointerSize;
+
+  unsigned char* StartCommonPtr = (unsigned char*)JCE->getCurrentPCValue();
+  // EH Common Frame header
+  JCE->allocateSpace(4, 0);
+  unsigned char* FrameCommonBeginPtr = (unsigned char*)JCE->getCurrentPCValue();
+  JCE->emitInt32((int)0);
+  JCE->emitByte(dwarf::DW_CIE_VERSION);
+  JCE->emitString(Personality ? "zPLR" : "zR");
+  JCE->emitULEB128Bytes(1);
+  JCE->emitSLEB128Bytes(stackGrowth);
+  JCE->emitByte(RI->getDwarfRegNum(RI->getRARegister(), true));
+
+  if (Personality) {
+    // Augmentation Size: 3 small ULEBs of one byte each, and the personality
+    // function which size is PointerSize.
+    JCE->emitULEB128Bytes(3 + PointerSize);
+
+    // We set the encoding of the personality as direct encoding because we use
+    // the function pointer. The encoding is not relative because the current
+    // PC value may be bigger than the personality function pointer.
+    if (PointerSize == 4) {
+      JCE->emitByte(dwarf::DW_EH_PE_sdata4);
+      JCE->emitInt32(((intptr_t)Jit.getPointerToGlobal(Personality)));
+    } else {
+      JCE->emitByte(dwarf::DW_EH_PE_sdata8);
+      JCE->emitInt64(((intptr_t)Jit.getPointerToGlobal(Personality)));
+    }
+
+    // LSDA encoding: This must match the encoding used in EmitEHFrame ()
+    if (PointerSize == 4)
+      JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+    else
+      JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8);
+    JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+  } else {
+    JCE->emitULEB128Bytes(1);
+    JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
+  }
+
+  EmitFrameMoves(0, MAI->getInitialFrameState());
+
+  JCE->emitAlignmentWithFill(PointerSize, dwarf::DW_CFA_nop);
+
+  JCE->emitInt32At((uintptr_t*)StartCommonPtr,
+                   (uintptr_t)((unsigned char*)JCE->getCurrentPCValue() -
+                               FrameCommonBeginPtr));
+
+  return StartCommonPtr;
+}
+
+
+unsigned char*
+JITDwarfEmitter::EmitEHFrame(const Function* Personality,
+                             unsigned char* StartCommonPtr,
+                             unsigned char* StartFunction,
+                             unsigned char* EndFunction,
+                             unsigned char* ExceptionTable) const {
+  unsigned PointerSize = TD->getPointerSize();
+
+  // EH frame header.
+  unsigned char* StartEHPtr = (unsigned char*)JCE->getCurrentPCValue();
+  JCE->allocateSpace(4, 0);
+  unsigned char* FrameBeginPtr = (unsigned char*)JCE->getCurrentPCValue();
+  // FDE CIE Offset
+  JCE->emitInt32(FrameBeginPtr - StartCommonPtr);
+  JCE->emitInt32(StartFunction - (unsigned char*)JCE->getCurrentPCValue());
+  JCE->emitInt32(EndFunction - StartFunction);
+
+  // If there is a personality and landing pads then point to the language
+  // specific data area in the exception table.
+  if (Personality) {
+    JCE->emitULEB128Bytes(PointerSize == 4 ? 4 : 8);
+
+    if (PointerSize == 4) {
+      if (!MMI->getLandingPads().empty())
+        JCE->emitInt32(ExceptionTable-(unsigned char*)JCE->getCurrentPCValue());
+      else
+        JCE->emitInt32((int)0);
+    } else {
+      if (!MMI->getLandingPads().empty())
+        JCE->emitInt64(ExceptionTable-(unsigned char*)JCE->getCurrentPCValue());
+      else
+        JCE->emitInt64((int)0);
+    }
+  } else {
+    JCE->emitULEB128Bytes(0);
+  }
+
+  // Indicate locations of function specific  callee saved registers in
+  // frame.
+  EmitFrameMoves((intptr_t)StartFunction, MMI->getFrameMoves());
+
+  JCE->emitAlignmentWithFill(PointerSize, dwarf::DW_CFA_nop);
+
+  // Indicate the size of the table
+  JCE->emitInt32At((uintptr_t*)StartEHPtr,
+                   (uintptr_t)((unsigned char*)JCE->getCurrentPCValue() -
+                               StartEHPtr));
+
+  // Double zeroes for the unwind runtime
+  if (PointerSize == 8) {
+    JCE->emitInt64(0);
+    JCE->emitInt64(0);
+  } else {
+    JCE->emitInt32(0);
+    JCE->emitInt32(0);
+  }
+
+  return StartEHPtr;
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/JIT/JITDwarfEmitter.h b/contrib/llvm/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
new file mode 100644
index 000000000000..98ac34049176
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
@@ -0,0 +1,77 @@
+//===------ JITDwarfEmitter.h - Write dwarf tables into memory ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a JITDwarfEmitter object that is used by the JIT to
+// write dwarf tables to memory.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTION_ENGINE_JIT_DWARFEMITTER_H
+#define LLVM_EXECUTION_ENGINE_JIT_DWARFEMITTER_H
+
+#include "llvm/Support/DataTypes.h"
+#include <vector>
+
+namespace llvm {
+
+class Function;
+class JIT;
+class JITCodeEmitter;
+class MachineFunction;
+class MachineModuleInfo;
+class MachineMove;
+class MCAsmInfo;
+class DataLayout;
+class TargetMachine;
+class TargetRegisterInfo;
+
+class JITDwarfEmitter {
+  const DataLayout* TD;
+  JITCodeEmitter* JCE;
+  const TargetRegisterInfo* RI;
+  const MCAsmInfo *MAI;
+  MachineModuleInfo* MMI;
+  JIT& Jit;
+  bool stackGrowthDirection;
+
+  unsigned char* EmitExceptionTable(MachineFunction* MF,
+                                    unsigned char* StartFunction,
+                                    unsigned char* EndFunction) const;
+
+  void EmitFrameMoves(intptr_t BaseLabelPtr,
+                      const std::vector<MachineMove> &Moves) const;
+
+  unsigned char* EmitCommonEHFrame(const Function* Personality) const;
+
+  unsigned char* EmitEHFrame(const Function* Personality,
+                             unsigned char* StartBufferPtr,
+                             unsigned char* StartFunction,
+                             unsigned char* EndFunction,
+                             unsigned char* ExceptionTable) const;
+
+public:
+
+  JITDwarfEmitter(JIT& jit);
+
+  unsigned char* EmitDwarfTable(MachineFunction& F,
+                                JITCodeEmitter& JCE,
+                                unsigned char* StartFunction,
+                                unsigned char* EndFunction,
+                                unsigned char* &EHFramePtr);
+
+
+  void setModuleInfo(MachineModuleInfo* Info) {
+    MMI = Info;
+  }
+};
+
+
+} // end namespace llvm
+
+#endif // LLVM_EXECUTION_ENGINE_JIT_DWARFEMITTER_H
diff --git a/contrib/llvm/lib/ExecutionEngine/JIT/JITEmitter.cpp b/contrib/llvm/lib/ExecutionEngine/JIT/JITEmitter.cpp
new file mode 100644
index 000000000000..c27387699ab6
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -0,0 +1,1301 @@
+//===-- JITEmitter.cpp - Write machine code to executable memory ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a MachineCodeEmitter object that is used by the JIT to
+// write machine code to memory and remember where relocatable values are.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "JIT.h"
+#include "JITDwarfEmitter.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineCodeInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRelocation.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Disassembler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+using namespace llvm;
+
+STATISTIC(NumBytes, "Number of bytes of machine code compiled");
+STATISTIC(NumRelos, "Number of relocations applied");
+STATISTIC(NumRetries, "Number of retries with more memory");
+
+
+// A declaration may stop being a declaration once it's fully read from bitcode.
+// This function returns true if F is fully read and is still a declaration.
+static bool isNonGhostDeclaration(const Function *F) {
+  return F->isDeclaration() && !F->isMaterializable();
+}
+
+//===----------------------------------------------------------------------===//
+// JIT lazy compilation code.
+//
+namespace {
+  class JITEmitter;
+  class JITResolverState;
+
+  template<typename ValueTy>
+  struct NoRAUWValueMapConfig : public ValueMapConfig<ValueTy> {
+    typedef JITResolverState *ExtraData;
+    static void onRAUW(JITResolverState *, Value *Old, Value *New) {
+      llvm_unreachable("The JIT doesn't know how to handle a"
+                       " RAUW on a value it has emitted.");
+    }
+  };
+
+  struct CallSiteValueMapConfig : public NoRAUWValueMapConfig<Function*> {
+    typedef JITResolverState *ExtraData;
+    static void onDelete(JITResolverState *JRS, Function *F);
+  };
+
+  class JITResolverState {
+  public:
+    typedef ValueMap<Function*, void*, NoRAUWValueMapConfig<Function*> >
+      FunctionToLazyStubMapTy;
+    typedef std::map<void*, AssertingVH<Function> > CallSiteToFunctionMapTy;
+    typedef ValueMap<Function *, SmallPtrSet<void*, 1>,
+                     CallSiteValueMapConfig> FunctionToCallSitesMapTy;
+    typedef std::map<AssertingVH<GlobalValue>, void*> GlobalToIndirectSymMapTy;
+  private:
+    /// FunctionToLazyStubMap - Keep track of the lazy stub created for a
+    /// particular function so that we can reuse them if necessary.
+    FunctionToLazyStubMapTy FunctionToLazyStubMap;
+
+    /// CallSiteToFunctionMap - Keep track of the function that each lazy call
+    /// site corresponds to, and vice versa.
+    CallSiteToFunctionMapTy CallSiteToFunctionMap;
+    FunctionToCallSitesMapTy FunctionToCallSitesMap;
+
+    /// GlobalToIndirectSymMap - Keep track of the indirect symbol created for a
+    /// particular GlobalVariable so that we can reuse them if necessary.
+    GlobalToIndirectSymMapTy GlobalToIndirectSymMap;
+
+#ifndef NDEBUG
+    /// Instance of the JIT this ResolverState serves.
+    JIT *TheJIT;
+#endif
+
+  public:
+    JITResolverState(JIT *jit) : FunctionToLazyStubMap(this),
+                                 FunctionToCallSitesMap(this) {
+#ifndef NDEBUG
+      TheJIT = jit;
+#endif
+    }
+
+    FunctionToLazyStubMapTy& getFunctionToLazyStubMap(
+      const MutexGuard& locked) {
+      assert(locked.holds(TheJIT->lock));
+      return FunctionToLazyStubMap;
+    }
+
+    GlobalToIndirectSymMapTy& getGlobalToIndirectSymMap(const MutexGuard& lck) {
+      assert(lck.holds(TheJIT->lock));
+      return GlobalToIndirectSymMap;
+    }
+
+    std::pair<void *, Function *> LookupFunctionFromCallSite(
+        const MutexGuard &locked, void *CallSite) const {
+      assert(locked.holds(TheJIT->lock));
+
+      // The address given to us for the stub may not be exactly right, it
+      // might be a little bit after the stub.  As such, use upper_bound to
+      // find it.
+      CallSiteToFunctionMapTy::const_iterator I =
+        CallSiteToFunctionMap.upper_bound(CallSite);
+      assert(I != CallSiteToFunctionMap.begin() &&
+             "This is not a known call site!");
+      --I;
+      return *I;
+    }
+
+    void AddCallSite(const MutexGuard &locked, void *CallSite, Function *F) {
+      assert(locked.holds(TheJIT->lock));
+
+      bool Inserted = CallSiteToFunctionMap.insert(
+          std::make_pair(CallSite, F)).second;
+      (void)Inserted;
+      assert(Inserted && "Pair was already in CallSiteToFunctionMap");
+      FunctionToCallSitesMap[F].insert(CallSite);
+    }
+
+    void EraseAllCallSitesForPrelocked(Function *F);
+
+    // Erases _all_ call sites regardless of their function.  This is used to
+    // unregister the stub addresses from the StubToResolverMap in
+    // ~JITResolver().
+    void EraseAllCallSitesPrelocked();
+  };
+
+  /// JITResolver - Keep track of, and resolve, call sites for functions that
+  /// have not yet been compiled.
+  class JITResolver {
+    typedef JITResolverState::FunctionToLazyStubMapTy FunctionToLazyStubMapTy;
+    typedef JITResolverState::CallSiteToFunctionMapTy CallSiteToFunctionMapTy;
+    typedef JITResolverState::GlobalToIndirectSymMapTy GlobalToIndirectSymMapTy;
+
+    /// LazyResolverFn - The target lazy resolver function that we actually
+    /// rewrite instructions to use.
+    TargetJITInfo::LazyResolverFn LazyResolverFn;
+
+    JITResolverState state;
+
+    /// ExternalFnToStubMap - This is the equivalent of FunctionToLazyStubMap
+    /// for external functions.  TODO: Of course, external functions don't need
+    /// a lazy stub.  It's actually here to make it more likely that far calls
+    /// succeed, but no single stub can guarantee that.  I'll remove this in a
+    /// subsequent checkin when I actually fix far calls.
+    std::map<void*, void*> ExternalFnToStubMap;
+
+    /// revGOTMap - map addresses to indexes in the GOT
+    std::map<void*, unsigned> revGOTMap;
+    unsigned nextGOTIndex;
+
+    JITEmitter &JE;
+
+    /// Instance of JIT corresponding to this Resolver.
+    JIT *TheJIT;
+
+  public:
+    explicit JITResolver(JIT &jit, JITEmitter &je)
+      : state(&jit), nextGOTIndex(0), JE(je), TheJIT(&jit) {
+      LazyResolverFn = jit.getJITInfo().getLazyResolverFunction(JITCompilerFn);
+    }
+
+    ~JITResolver();
+
+    /// getLazyFunctionStubIfAvailable - This returns a pointer to a function's
+    /// lazy-compilation stub if it has already been created.
+    void *getLazyFunctionStubIfAvailable(Function *F);
+
+    /// getLazyFunctionStub - This returns a pointer to a function's
+    /// lazy-compilation stub, creating one on demand as needed.
+    void *getLazyFunctionStub(Function *F);
+
+    /// getExternalFunctionStub - Return a stub for the function at the
+    /// specified address, created lazily on demand.
+    void *getExternalFunctionStub(void *FnAddr);
+
+    /// getGlobalValueIndirectSym - Return an indirect symbol containing the
+    /// specified GV address.
+    void *getGlobalValueIndirectSym(GlobalValue *V, void *GVAddress);
+
+    /// getGOTIndexForAddress - Return a new or existing index in the GOT for
+    /// an address.  This function only manages slots, it does not manage the
+    /// contents of the slots or the memory associated with the GOT.
+    unsigned getGOTIndexForAddr(void *addr);
+
+    /// JITCompilerFn - This function is called to resolve a stub to a compiled
+    /// address.  If the LLVM Function corresponding to the stub has not yet
+    /// been compiled, this function compiles it first.
+    static void *JITCompilerFn(void *Stub);
+  };
+
+  class StubToResolverMapTy {
+    /// Map a stub address to a specific instance of a JITResolver so that
+    /// lazily-compiled functions can find the right resolver to use.
+    ///
+    /// Guarded by Lock.
+    std::map<void*, JITResolver*> Map;
+
+    /// Guards Map from concurrent accesses.
+    mutable sys::Mutex Lock;
+
+  public:
+    /// Registers a Stub to be resolved by Resolver.
+    void RegisterStubResolver(void *Stub, JITResolver *Resolver) {
+      MutexGuard guard(Lock);
+      Map.insert(std::make_pair(Stub, Resolver));
+    }
+    /// Unregisters the Stub when it's invalidated.
+    void UnregisterStubResolver(void *Stub) {
+      MutexGuard guard(Lock);
+      Map.erase(Stub);
+    }
+    /// Returns the JITResolver instance that owns the Stub.
+    JITResolver *getResolverFromStub(void *Stub) const {
+      MutexGuard guard(Lock);
+      // The address given to us for the stub may not be exactly right, it might
+      // be a little bit after the stub.  As such, use upper_bound to find it.
+      // This is the same trick as in LookupFunctionFromCallSite from
+      // JITResolverState.
+      std::map<void*, JITResolver*>::const_iterator I = Map.upper_bound(Stub);
+      assert(I != Map.begin() && "This is not a known stub!");
+      --I;
+      return I->second;
+    }
+    /// True if any stubs refer to the given resolver. Only used in an assert().
+    /// O(N)
+    bool ResolverHasStubs(JITResolver* Resolver) const {
+      MutexGuard guard(Lock);
+      for (std::map<void*, JITResolver*>::const_iterator I = Map.begin(),
+             E = Map.end(); I != E; ++I) {
+        if (I->second == Resolver)
+          return true;
+      }
+      return false;
+    }
+  };
+  /// This needs to be static so that a lazy call stub can access it with no
+  /// context except the address of the stub.
+  ManagedStatic<StubToResolverMapTy> StubToResolverMap;
+
+  /// JITEmitter - The JIT implementation of the MachineCodeEmitter, which is
+  /// used to output functions to memory for execution.
+  class JITEmitter : public JITCodeEmitter {
+    JITMemoryManager *MemMgr;
+
+    // When outputting a function stub in the context of some other function, we
+    // save BufferBegin/BufferEnd/CurBufferPtr here.
+    uint8_t *SavedBufferBegin, *SavedBufferEnd, *SavedCurBufferPtr;
+
+    // When reattempting to JIT a function after running out of space, we store
+    // the estimated size of the function we're trying to JIT here, so we can
+    // ask the memory manager for at least this much space.  When we
+    // successfully emit the function, we reset this back to zero.
+    uintptr_t SizeEstimate;
+
+    /// Relocations - These are the relocations that the function needs, as
+    /// emitted.
+    std::vector<MachineRelocation> Relocations;
+
+    /// MBBLocations - This vector is a mapping from MBB ID's to their address.
+    /// It is filled in by the StartMachineBasicBlock callback and queried by
+    /// the getMachineBasicBlockAddress callback.
+    std::vector<uintptr_t> MBBLocations;
+
+    /// ConstantPool - The constant pool for the current function.
+    ///
+    MachineConstantPool *ConstantPool;
+
+    /// ConstantPoolBase - A pointer to the first entry in the constant pool.
+    ///
+    void *ConstantPoolBase;
+
+    /// ConstPoolAddresses - Addresses of individual constant pool entries.
+    ///
+    SmallVector<uintptr_t, 8> ConstPoolAddresses;
+
+    /// JumpTable - The jump tables for the current function.
+    ///
+    MachineJumpTableInfo *JumpTable;
+
+    /// JumpTableBase - A pointer to the first entry in the jump table.
+    ///
+    void *JumpTableBase;
+
+    /// Resolver - This contains info about the currently resolved functions.
+    JITResolver Resolver;
+
+    /// DE - The dwarf emitter for the jit.
+    OwningPtr<JITDwarfEmitter> DE;
+
+    /// LabelLocations - This vector is a mapping from Label ID's to their
+    /// address.
+    DenseMap<MCSymbol*, uintptr_t> LabelLocations;
+
+    /// MMI - Machine module info for exception informations
+    MachineModuleInfo* MMI;
+
+    // CurFn - The llvm function being emitted.  Only valid during
+    // finishFunction().
+    const Function *CurFn;
+
+    /// Information about emitted code, which is passed to the
+    /// JITEventListeners.  This is reset in startFunction and used in
+    /// finishFunction.
+    JITEvent_EmittedFunctionDetails EmissionDetails;
+
+    struct EmittedCode {
+      void *FunctionBody;  // Beginning of the function's allocation.
+      void *Code;  // The address the function's code actually starts at.
+      void *ExceptionTable;
+      EmittedCode() : FunctionBody(0), Code(0), ExceptionTable(0) {}
+    };
+    struct EmittedFunctionConfig : public ValueMapConfig<const Function*> {
+      typedef JITEmitter *ExtraData;
+      static void onDelete(JITEmitter *, const Function*);
+      static void onRAUW(JITEmitter *, const Function*, const Function*);
+    };
+    ValueMap<const Function *, EmittedCode,
+             EmittedFunctionConfig> EmittedFunctions;
+
+    DebugLoc PrevDL;
+
+    /// Instance of the JIT
+    JIT *TheJIT;
+
+    bool JITExceptionHandling;
+
+  public:
+    JITEmitter(JIT &jit, JITMemoryManager *JMM, TargetMachine &TM)
+      : SizeEstimate(0), Resolver(jit, *this), MMI(0), CurFn(0),
+        EmittedFunctions(this), TheJIT(&jit),
+        JITExceptionHandling(TM.Options.JITExceptionHandling) {
+      MemMgr = JMM ? JMM : JITMemoryManager::CreateDefaultMemManager();
+      if (jit.getJITInfo().needsGOT()) {
+        MemMgr->AllocateGOT();
+        DEBUG(dbgs() << "JIT is managing a GOT\n");
+      }
+
+      if (JITExceptionHandling) {
+        DE.reset(new JITDwarfEmitter(jit));
+      }
+    }
+    ~JITEmitter() {
+      delete MemMgr;
+    }
+
+    JITResolver &getJITResolver() { return Resolver; }
+
+    virtual void startFunction(MachineFunction &F);
+    virtual bool finishFunction(MachineFunction &F);
+
+    void emitConstantPool(MachineConstantPool *MCP);
+    void initJumpTableInfo(MachineJumpTableInfo *MJTI);
+    void emitJumpTableInfo(MachineJumpTableInfo *MJTI);
+
+    void startGVStub(const GlobalValue* GV,
+                     unsigned StubSize, unsigned Alignment = 1);
+    void startGVStub(void *Buffer, unsigned StubSize);
+    void finishGVStub();
+    virtual void *allocIndirectGV(const GlobalValue *GV,
+                                  const uint8_t *Buffer, size_t Size,
+                                  unsigned Alignment);
+
+    /// allocateSpace - Reserves space in the current block if any, or
+    /// allocate a new one of the given size.
+    virtual void *allocateSpace(uintptr_t Size, unsigned Alignment);
+
+    /// allocateGlobal - Allocate memory for a global.  Unlike allocateSpace,
+    /// this method does not allocate memory in the current output buffer,
+    /// because a global may live longer than the current function.
+    virtual void *allocateGlobal(uintptr_t Size, unsigned Alignment);
+
+    virtual void addRelocation(const MachineRelocation &MR) {
+      Relocations.push_back(MR);
+    }
+
+    virtual void StartMachineBasicBlock(MachineBasicBlock *MBB) {
+      if (MBBLocations.size() <= (unsigned)MBB->getNumber())
+        MBBLocations.resize((MBB->getNumber()+1)*2);
+      MBBLocations[MBB->getNumber()] = getCurrentPCValue();
+      if (MBB->hasAddressTaken())
+        TheJIT->addPointerToBasicBlock(MBB->getBasicBlock(),
+                                       (void*)getCurrentPCValue());
+      DEBUG(dbgs() << "JIT: Emitting BB" << MBB->getNumber() << " at ["
+                   << (void*) getCurrentPCValue() << "]\n");
+    }
+
+    virtual uintptr_t getConstantPoolEntryAddress(unsigned Entry) const;
+    virtual uintptr_t getJumpTableEntryAddress(unsigned Entry) const;
+
+    virtual uintptr_t getMachineBasicBlockAddress(MachineBasicBlock *MBB) const{
+      assert(MBBLocations.size() > (unsigned)MBB->getNumber() &&
+             MBBLocations[MBB->getNumber()] && "MBB not emitted!");
+      return MBBLocations[MBB->getNumber()];
+    }
+
+    /// retryWithMoreMemory - Log a retry and deallocate all memory for the
+    /// given function.  Increase the minimum allocation size so that we get
+    /// more memory next time.
+    void retryWithMoreMemory(MachineFunction &F);
+
+    /// deallocateMemForFunction - Deallocate all memory for the specified
+    /// function body.
+    void deallocateMemForFunction(const Function *F);
+
+    virtual void processDebugLoc(DebugLoc DL, bool BeforePrintingInsn);
+
+    virtual void emitLabel(MCSymbol *Label) {
+      LabelLocations[Label] = getCurrentPCValue();
+    }
+
+    virtual DenseMap<MCSymbol*, uintptr_t> *getLabelLocations() {
+      return &LabelLocations;
+    }
+
+    virtual uintptr_t getLabelAddress(MCSymbol *Label) const {
+      assert(LabelLocations.count(Label) && "Label not emitted!");
+      return LabelLocations.find(Label)->second;
+    }
+
+    virtual void setModuleInfo(MachineModuleInfo* Info) {
+      MMI = Info;
+      if (DE.get()) DE->setModuleInfo(Info);
+    }
+
+  private:
+    void *getPointerToGlobal(GlobalValue *GV, void *Reference,
+                             bool MayNeedFarStub);
+    void *getPointerToGVIndirectSym(GlobalValue *V, void *Reference);
+  };
+}
+
+void CallSiteValueMapConfig::onDelete(JITResolverState *JRS, Function *F) {
+  JRS->EraseAllCallSitesForPrelocked(F);
+}
+
+void JITResolverState::EraseAllCallSitesForPrelocked(Function *F) {
+  FunctionToCallSitesMapTy::iterator F2C = FunctionToCallSitesMap.find(F);
+  if (F2C == FunctionToCallSitesMap.end())
+    return;
+  StubToResolverMapTy &S2RMap = *StubToResolverMap;
+  for (SmallPtrSet<void*, 1>::const_iterator I = F2C->second.begin(),
+         E = F2C->second.end(); I != E; ++I) {
+    S2RMap.UnregisterStubResolver(*I);
+    bool Erased = CallSiteToFunctionMap.erase(*I);
+    (void)Erased;
+    assert(Erased && "Missing call site->function mapping");
+  }
+  FunctionToCallSitesMap.erase(F2C);
+}
+
+void JITResolverState::EraseAllCallSitesPrelocked() {
+  StubToResolverMapTy &S2RMap = *StubToResolverMap;
+  for (CallSiteToFunctionMapTy::const_iterator
+         I = CallSiteToFunctionMap.begin(),
+         E = CallSiteToFunctionMap.end(); I != E; ++I) {
+    S2RMap.UnregisterStubResolver(I->first);
+  }
+  CallSiteToFunctionMap.clear();
+  FunctionToCallSitesMap.clear();
+}
+
+JITResolver::~JITResolver() {
+  // No need to lock because we're in the destructor, and state isn't shared.
+  state.EraseAllCallSitesPrelocked();
+  assert(!StubToResolverMap->ResolverHasStubs(this) &&
+         "Resolver destroyed with stubs still alive.");
+}
+
+/// getLazyFunctionStubIfAvailable - This returns a pointer to a function stub
+/// if it has already been created.
+void *JITResolver::getLazyFunctionStubIfAvailable(Function *F) {
+  MutexGuard locked(TheJIT->lock);
+
+  // If we already have a stub for this function, recycle it.
+  return state.getFunctionToLazyStubMap(locked).lookup(F);
+}
+
+/// getFunctionStub - This returns a pointer to a function stub, creating
+/// one on demand as needed.
+void *JITResolver::getLazyFunctionStub(Function *F) {
+  MutexGuard locked(TheJIT->lock);
+
+  // If we already have a lazy stub for this function, recycle it.
+  void *&Stub = state.getFunctionToLazyStubMap(locked)[F];
+  if (Stub) return Stub;
+
+  // Call the lazy resolver function if we are JIT'ing lazily.  Otherwise we
+  // must resolve the symbol now.
+  void *Actual = TheJIT->isCompilingLazily()
+    ? (void *)(intptr_t)LazyResolverFn : (void *)0;
+
+  // If this is an external declaration, attempt to resolve the address now
+  // to place in the stub.
+  if (isNonGhostDeclaration(F) || F->hasAvailableExternallyLinkage()) {
+    Actual = TheJIT->getPointerToFunction(F);
+
+    // If we resolved the symbol to a null address (eg. a weak external)
+    // don't emit a stub. Return a null pointer to the application.
+    if (!Actual) return 0;
+  }
+
+  TargetJITInfo::StubLayout SL = TheJIT->getJITInfo().getStubLayout();
+  JE.startGVStub(F, SL.Size, SL.Alignment);
+  // Codegen a new stub, calling the lazy resolver or the actual address of the
+  // external function, if it was resolved.
+  Stub = TheJIT->getJITInfo().emitFunctionStub(F, Actual, JE);
+  JE.finishGVStub();
+
+  if (Actual != (void*)(intptr_t)LazyResolverFn) {
+    // If we are getting the stub for an external function, we really want the
+    // address of the stub in the GlobalAddressMap for the JIT, not the address
+    // of the external function.
+    TheJIT->updateGlobalMapping(F, Stub);
+  }
+
+  DEBUG(dbgs() << "JIT: Lazy stub emitted at [" << Stub << "] for function '"
+        << F->getName() << "'\n");
+
+  if (TheJIT->isCompilingLazily()) {
+    // Register this JITResolver as the one corresponding to this call site so
+    // JITCompilerFn will be able to find it.
+    StubToResolverMap->RegisterStubResolver(Stub, this);
+
+    // Finally, keep track of the stub-to-Function mapping so that the
+    // JITCompilerFn knows which function to compile!
+    state.AddCallSite(locked, Stub, F);
+  } else if (!Actual) {
+    // If we are JIT'ing non-lazily but need to call a function that does not
+    // exist yet, add it to the JIT's work list so that we can fill in the
+    // stub address later.
+    assert(!isNonGhostDeclaration(F) && !F->hasAvailableExternallyLinkage() &&
+           "'Actual' should have been set above.");
+    TheJIT->addPendingFunction(F);
+  }
+
+  return Stub;
+}
+
+/// getGlobalValueIndirectSym - Return a lazy pointer containing the specified
+/// GV address.
+void *JITResolver::getGlobalValueIndirectSym(GlobalValue *GV, void *GVAddress) {
+  MutexGuard locked(TheJIT->lock);
+
+  // If we already have a stub for this global variable, recycle it.
+  void *&IndirectSym = state.getGlobalToIndirectSymMap(locked)[GV];
+  if (IndirectSym) return IndirectSym;
+
+  // Otherwise, codegen a new indirect symbol.
+  IndirectSym = TheJIT->getJITInfo().emitGlobalValueIndirectSym(GV, GVAddress,
+                                                                JE);
+
+  DEBUG(dbgs() << "JIT: Indirect symbol emitted at [" << IndirectSym
+        << "] for GV '" << GV->getName() << "'\n");
+
+  return IndirectSym;
+}
+
+/// getExternalFunctionStub - Return a stub for the function at the
+/// specified address, created lazily on demand.
+void *JITResolver::getExternalFunctionStub(void *FnAddr) {
+  // If we already have a stub for this function, recycle it.
+  void *&Stub = ExternalFnToStubMap[FnAddr];
+  if (Stub) return Stub;
+
+  TargetJITInfo::StubLayout SL = TheJIT->getJITInfo().getStubLayout();
+  JE.startGVStub(0, SL.Size, SL.Alignment);
+  Stub = TheJIT->getJITInfo().emitFunctionStub(0, FnAddr, JE);
+  JE.finishGVStub();
+
+  DEBUG(dbgs() << "JIT: Stub emitted at [" << Stub
+               << "] for external function at '" << FnAddr << "'\n");
+  return Stub;
+}
+
+unsigned JITResolver::getGOTIndexForAddr(void* addr) {
+  unsigned idx = revGOTMap[addr];
+  if (!idx) {
+    idx = ++nextGOTIndex;
+    revGOTMap[addr] = idx;
+    DEBUG(dbgs() << "JIT: Adding GOT entry " << idx << " for addr ["
+                 << addr << "]\n");
+  }
+  return idx;
+}
+
+/// JITCompilerFn - This function is called when a lazy compilation stub has
+/// been entered.  It looks up which function this stub corresponds to, compiles
+/// it if necessary, then returns the resultant function pointer.
+void *JITResolver::JITCompilerFn(void *Stub) {
+  JITResolver *JR = StubToResolverMap->getResolverFromStub(Stub);
+  assert(JR && "Unable to find the corresponding JITResolver to the call site");
+
+  Function* F = 0;
+  void* ActualPtr = 0;
+
+  {
+    // Only lock for getting the Function. The call getPointerToFunction made
+    // in this function might trigger function materializing, which requires
+    // JIT lock to be unlocked.
+    MutexGuard locked(JR->TheJIT->lock);
+
+    // The address given to us for the stub may not be exactly right, it might
+    // be a little bit after the stub.  As such, use upper_bound to find it.
+    std::pair<void*, Function*> I =
+      JR->state.LookupFunctionFromCallSite(locked, Stub);
+    F = I.second;
+    ActualPtr = I.first;
+  }
+
+  // If we have already code generated the function, just return the address.
+  void *Result = JR->TheJIT->getPointerToGlobalIfAvailable(F);
+
+  if (!Result) {
+    // Otherwise we don't have it, do lazy compilation now.
+
+    // If lazy compilation is disabled, emit a useful error message and abort.
+    if (!JR->TheJIT->isCompilingLazily()) {
+      report_fatal_error("LLVM JIT requested to do lazy compilation of"
+                         " function '"
+                        + F->getName() + "' when lazy compiles are disabled!");
+    }
+
+    DEBUG(dbgs() << "JIT: Lazily resolving function '" << F->getName()
+          << "' In stub ptr = " << Stub << " actual ptr = "
+          << ActualPtr << "\n");
+    (void)ActualPtr;
+
+    Result = JR->TheJIT->getPointerToFunction(F);
+  }
+
+  // Reacquire the lock to update the GOT map.
+  MutexGuard locked(JR->TheJIT->lock);
+
+  // We might like to remove the call site from the CallSiteToFunction map, but
+  // we can't do that! Multiple threads could be stuck, waiting to acquire the
+  // lock above. As soon as the 1st function finishes compiling the function,
+  // the next one will be released, and needs to be able to find the function it
+  // needs to call.
+
+  // FIXME: We could rewrite all references to this stub if we knew them.
+
+  // What we will do is set the compiled function address to map to the
+  // same GOT entry as the stub so that later clients may update the GOT
+  // if they see it still using the stub address.
+  // Note: this is done so the Resolver doesn't have to manage GOT memory
+  // Do this without allocating map space if the target isn't using a GOT
+  if(JR->revGOTMap.find(Stub) != JR->revGOTMap.end())
+    JR->revGOTMap[Result] = JR->revGOTMap[Stub];
+
+  return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// JITEmitter code.
+//
+void *JITEmitter::getPointerToGlobal(GlobalValue *V, void *Reference,
+                                     bool MayNeedFarStub) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return TheJIT->getOrEmitGlobalVariable(GV);
+
+  if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
+    return TheJIT->getPointerToGlobal(GA->resolveAliasedGlobal(false));
+
+  // If we have already compiled the function, return a pointer to its body.
+  Function *F = cast<Function>(V);
+
+  void *FnStub = Resolver.getLazyFunctionStubIfAvailable(F);
+  if (FnStub) {
+    // Return the function stub if it's already created.  We do this first so
+    // that we're returning the same address for the function as any previous
+    // call.  TODO: Yes, this is wrong. The lazy stub isn't guaranteed to be
+    // close enough to call.
+    return FnStub;
+  }
+
+  // If we know the target can handle arbitrary-distance calls, try to
+  // return a direct pointer.
+  if (!MayNeedFarStub) {
+    // If we have code, go ahead and return that.
+    void *ResultPtr = TheJIT->getPointerToGlobalIfAvailable(F);
+    if (ResultPtr) return ResultPtr;
+
+    // If this is an external function pointer, we can force the JIT to
+    // 'compile' it, which really just adds it to the map.
+    if (isNonGhostDeclaration(F) || F->hasAvailableExternallyLinkage())
+      return TheJIT->getPointerToFunction(F);
+  }
+
+  // Otherwise, we may need a to emit a stub, and, conservatively, we always do
+  // so.  Note that it's possible to return null from getLazyFunctionStub in the
+  // case of a weak extern that fails to resolve.
+  return Resolver.getLazyFunctionStub(F);
+}
+
+void *JITEmitter::getPointerToGVIndirectSym(GlobalValue *V, void *Reference) {
+  // Make sure GV is emitted first, and create a stub containing the fully
+  // resolved address.
+  void *GVAddress = getPointerToGlobal(V, Reference, false);
+  void *StubAddr = Resolver.getGlobalValueIndirectSym(V, GVAddress);
+  return StubAddr;
+}
+
+void JITEmitter::processDebugLoc(DebugLoc DL, bool BeforePrintingInsn) {
+  if (DL.isUnknown()) return;
+  if (!BeforePrintingInsn) return;
+
+  const LLVMContext &Context = EmissionDetails.MF->getFunction()->getContext();
+
+  if (DL.getScope(Context) != 0 && PrevDL != DL) {
+    JITEvent_EmittedFunctionDetails::LineStart NextLine;
+    NextLine.Address = getCurrentPCValue();
+    NextLine.Loc = DL;
+    EmissionDetails.LineStarts.push_back(NextLine);
+  }
+
+  PrevDL = DL;
+}
+
+static unsigned GetConstantPoolSizeInBytes(MachineConstantPool *MCP,
+                                           const DataLayout *TD) {
+  const std::vector<MachineConstantPoolEntry> &Constants = MCP->getConstants();
+  if (Constants.empty()) return 0;
+
+  unsigned Size = 0;
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    MachineConstantPoolEntry CPE = Constants[i];
+    unsigned AlignMask = CPE.getAlignment() - 1;
+    Size = (Size + AlignMask) & ~AlignMask;
+    Type *Ty = CPE.getType();
+    Size += TD->getTypeAllocSize(Ty);
+  }
+  return Size;
+}
+
+void JITEmitter::startFunction(MachineFunction &F) {
+  DEBUG(dbgs() << "JIT: Starting CodeGen of Function "
+        << F.getName() << "\n");
+
+  uintptr_t ActualSize = 0;
+  // Set the memory writable, if it's not already
+  MemMgr->setMemoryWritable();
+
+  if (SizeEstimate > 0) {
+    // SizeEstimate will be non-zero on reallocation attempts.
+    ActualSize = SizeEstimate;
+  }
+
+  BufferBegin = CurBufferPtr = MemMgr->startFunctionBody(F.getFunction(),
+                                                         ActualSize);
+  BufferEnd = BufferBegin+ActualSize;
+  EmittedFunctions[F.getFunction()].FunctionBody = BufferBegin;
+
+  // Ensure the constant pool/jump table info is at least 4-byte aligned.
+  emitAlignment(16);
+
+  emitConstantPool(F.getConstantPool());
+  if (MachineJumpTableInfo *MJTI = F.getJumpTableInfo())
+    initJumpTableInfo(MJTI);
+
+  // About to start emitting the machine code for the function.
+  emitAlignment(std::max(F.getFunction()->getAlignment(), 8U));
+  TheJIT->updateGlobalMapping(F.getFunction(), CurBufferPtr);
+  EmittedFunctions[F.getFunction()].Code = CurBufferPtr;
+
+  MBBLocations.clear();
+
+  EmissionDetails.MF = &F;
+  EmissionDetails.LineStarts.clear();
+}
+
+bool JITEmitter::finishFunction(MachineFunction &F) {
+  if (CurBufferPtr == BufferEnd) {
+    // We must call endFunctionBody before retrying, because
+    // deallocateMemForFunction requires it.
+    MemMgr->endFunctionBody(F.getFunction(), BufferBegin, CurBufferPtr);
+    retryWithMoreMemory(F);
+    return true;
+  }
+
+  if (MachineJumpTableInfo *MJTI = F.getJumpTableInfo())
+    emitJumpTableInfo(MJTI);
+
+  // FnStart is the start of the text, not the start of the constant pool and
+  // other per-function data.
+  uint8_t *FnStart =
+    (uint8_t *)TheJIT->getPointerToGlobalIfAvailable(F.getFunction());
+
+  // FnEnd is the end of the function's machine code.
+  uint8_t *FnEnd = CurBufferPtr;
+
+  if (!Relocations.empty()) {
+    CurFn = F.getFunction();
+    NumRelos += Relocations.size();
+
+    // Resolve the relocations to concrete pointers.
+    for (unsigned i = 0, e = Relocations.size(); i != e; ++i) {
+      MachineRelocation &MR = Relocations[i];
+      void *ResultPtr = 0;
+      if (!MR.letTargetResolve()) {
+        if (MR.isExternalSymbol()) {
+          ResultPtr = TheJIT->getPointerToNamedFunction(MR.getExternalSymbol(),
+                                                        false);
+          DEBUG(dbgs() << "JIT: Map \'" << MR.getExternalSymbol() << "\' to ["
+                       << ResultPtr << "]\n");
+
+          // If the target REALLY wants a stub for this function, emit it now.
+          if (MR.mayNeedFarStub()) {
+            ResultPtr = Resolver.getExternalFunctionStub(ResultPtr);
+          }
+        } else if (MR.isGlobalValue()) {
+          ResultPtr = getPointerToGlobal(MR.getGlobalValue(),
+                                         BufferBegin+MR.getMachineCodeOffset(),
+                                         MR.mayNeedFarStub());
+        } else if (MR.isIndirectSymbol()) {
+          ResultPtr = getPointerToGVIndirectSym(
+              MR.getGlobalValue(), BufferBegin+MR.getMachineCodeOffset());
+        } else if (MR.isBasicBlock()) {
+          ResultPtr = (void*)getMachineBasicBlockAddress(MR.getBasicBlock());
+        } else if (MR.isConstantPoolIndex()) {
+          ResultPtr =
+            (void*)getConstantPoolEntryAddress(MR.getConstantPoolIndex());
+        } else {
+          assert(MR.isJumpTableIndex());
+          ResultPtr=(void*)getJumpTableEntryAddress(MR.getJumpTableIndex());
+        }
+
+        MR.setResultPointer(ResultPtr);
+      }
+
+      // if we are managing the GOT and the relocation wants an index,
+      // give it one
+      if (MR.isGOTRelative() && MemMgr->isManagingGOT()) {
+        unsigned idx = Resolver.getGOTIndexForAddr(ResultPtr);
+        MR.setGOTIndex(idx);
+        if (((void**)MemMgr->getGOTBase())[idx] != ResultPtr) {
+          DEBUG(dbgs() << "JIT: GOT was out of date for " << ResultPtr
+                       << " pointing at " << ((void**)MemMgr->getGOTBase())[idx]
+                       << "\n");
+          ((void**)MemMgr->getGOTBase())[idx] = ResultPtr;
+        }
+      }
+    }
+
+    CurFn = 0;
+    TheJIT->getJITInfo().relocate(BufferBegin, &Relocations[0],
+                                  Relocations.size(), MemMgr->getGOTBase());
+  }
+
+  // Update the GOT entry for F to point to the new code.
+  if (MemMgr->isManagingGOT()) {
+    unsigned idx = Resolver.getGOTIndexForAddr((void*)BufferBegin);
+    if (((void**)MemMgr->getGOTBase())[idx] != (void*)BufferBegin) {
+      DEBUG(dbgs() << "JIT: GOT was out of date for " << (void*)BufferBegin
+                   << " pointing at " << ((void**)MemMgr->getGOTBase())[idx]
+                   << "\n");
+      ((void**)MemMgr->getGOTBase())[idx] = (void*)BufferBegin;
+    }
+  }
+
+  // CurBufferPtr may have moved beyond FnEnd, due to memory allocation for
+  // global variables that were referenced in the relocations.
+  MemMgr->endFunctionBody(F.getFunction(), BufferBegin, CurBufferPtr);
+
+  if (CurBufferPtr == BufferEnd) {
+    retryWithMoreMemory(F);
+    return true;
+  } else {
+    // Now that we've succeeded in emitting the function, reset the
+    // SizeEstimate back down to zero.
+    SizeEstimate = 0;
+  }
+
+  BufferBegin = CurBufferPtr = 0;
+  NumBytes += FnEnd-FnStart;
+
+  // Invalidate the icache if necessary.
+  sys::Memory::InvalidateInstructionCache(FnStart, FnEnd-FnStart);
+
+  TheJIT->NotifyFunctionEmitted(*F.getFunction(), FnStart, FnEnd-FnStart,
+                                EmissionDetails);
+
+  // Reset the previous debug location.
+  PrevDL = DebugLoc();
+
+  DEBUG(dbgs() << "JIT: Finished CodeGen of [" << (void*)FnStart
+        << "] Function: " << F.getName()
+        << ": " << (FnEnd-FnStart) << " bytes of text, "
+        << Relocations.size() << " relocations\n");
+
+  Relocations.clear();
+  ConstPoolAddresses.clear();
+
+  // Mark code region readable and executable if it's not so already.
+  MemMgr->setMemoryExecutable();
+
+  DEBUG({
+      if (sys::hasDisassembler()) {
+        dbgs() << "JIT: Disassembled code:\n";
+        dbgs() << sys::disassembleBuffer(FnStart, FnEnd-FnStart,
+                                         (uintptr_t)FnStart);
+      } else {
+        dbgs() << "JIT: Binary code:\n";
+        uint8_t* q = FnStart;
+        for (int i = 0; q < FnEnd; q += 4, ++i) {
+          if (i == 4)
+            i = 0;
+          if (i == 0)
+            dbgs() << "JIT: " << (long)(q - FnStart) << ": ";
+          bool Done = false;
+          for (int j = 3; j >= 0; --j) {
+            if (q + j >= FnEnd)
+              Done = true;
+            else
+              dbgs() << (unsigned short)q[j];
+          }
+          if (Done)
+            break;
+          dbgs() << ' ';
+          if (i == 3)
+            dbgs() << '\n';
+        }
+        dbgs()<< '\n';
+      }
+    });
+
+  if (JITExceptionHandling) {
+    uintptr_t ActualSize = 0;
+    SavedBufferBegin = BufferBegin;
+    SavedBufferEnd = BufferEnd;
+    SavedCurBufferPtr = CurBufferPtr;
+    uint8_t *FrameRegister;
+
+    while (true) {
+      BufferBegin = CurBufferPtr = MemMgr->startExceptionTable(F.getFunction(),
+                                                               ActualSize);
+      BufferEnd = BufferBegin+ActualSize;
+      EmittedFunctions[F.getFunction()].ExceptionTable = BufferBegin;
+      uint8_t *EhStart;
+      FrameRegister = DE->EmitDwarfTable(F, *this, FnStart, FnEnd, EhStart);
+
+      // If the buffer was large enough to hold the table then we are done.
+      if (CurBufferPtr != BufferEnd)
+        break;
+
+      // Try again with twice as much space.
+      ActualSize = (CurBufferPtr - BufferBegin) * 2;
+      MemMgr->deallocateExceptionTable(BufferBegin);
+    }
+    MemMgr->endExceptionTable(F.getFunction(), BufferBegin, CurBufferPtr,
+                              FrameRegister);
+    BufferBegin = SavedBufferBegin;
+    BufferEnd = SavedBufferEnd;
+    CurBufferPtr = SavedCurBufferPtr;
+
+    if (JITExceptionHandling) {
+      TheJIT->RegisterTable(F.getFunction(), FrameRegister);
+    }
+  }
+
+  if (MMI)
+    MMI->EndFunction();
+
+  return false;
+}
+
+void JITEmitter::retryWithMoreMemory(MachineFunction &F) {
+  DEBUG(dbgs() << "JIT: Ran out of space for native code.  Reattempting.\n");
+  Relocations.clear();  // Clear the old relocations or we'll reapply them.
+  ConstPoolAddresses.clear();
+  ++NumRetries;
+  deallocateMemForFunction(F.getFunction());
+  // Try again with at least twice as much free space.
+  SizeEstimate = (uintptr_t)(2 * (BufferEnd - BufferBegin));
+
+  for (MachineFunction::iterator MBB = F.begin(), E = F.end(); MBB != E; ++MBB){
+    if (MBB->hasAddressTaken())
+      TheJIT->clearPointerToBasicBlock(MBB->getBasicBlock());
+  }
+}
+
+/// deallocateMemForFunction - Deallocate all memory for the specified
+/// function body.  Also drop any references the function has to stubs.
+/// May be called while the Function is being destroyed inside ~Value().
+void JITEmitter::deallocateMemForFunction(const Function *F) {
+  ValueMap<const Function *, EmittedCode, EmittedFunctionConfig>::iterator
+    Emitted = EmittedFunctions.find(F);
+  if (Emitted != EmittedFunctions.end()) {
+    MemMgr->deallocateFunctionBody(Emitted->second.FunctionBody);
+    MemMgr->deallocateExceptionTable(Emitted->second.ExceptionTable);
+    TheJIT->NotifyFreeingMachineCode(Emitted->second.Code);
+
+    EmittedFunctions.erase(Emitted);
+  }
+
+  if (JITExceptionHandling) {
+    TheJIT->DeregisterTable(F);
+  }
+}
+
+
+void *JITEmitter::allocateSpace(uintptr_t Size, unsigned Alignment) {
+  if (BufferBegin)
+    return JITCodeEmitter::allocateSpace(Size, Alignment);
+
+  // create a new memory block if there is no active one.
+  // care must be taken so that BufferBegin is invalidated when a
+  // block is trimmed
+  BufferBegin = CurBufferPtr = MemMgr->allocateSpace(Size, Alignment);
+  BufferEnd = BufferBegin+Size;
+  return CurBufferPtr;
+}
+
+void *JITEmitter::allocateGlobal(uintptr_t Size, unsigned Alignment) {
+  // Delegate this call through the memory manager.
+  return MemMgr->allocateGlobal(Size, Alignment);
+}
+
+void JITEmitter::emitConstantPool(MachineConstantPool *MCP) {
+  if (TheJIT->getJITInfo().hasCustomConstantPool())
+    return;
+
+  const std::vector<MachineConstantPoolEntry> &Constants = MCP->getConstants();
+  if (Constants.empty()) return;
+
+  unsigned Size = GetConstantPoolSizeInBytes(MCP, TheJIT->getDataLayout());
+  unsigned Align = MCP->getConstantPoolAlignment();
+  ConstantPoolBase = allocateSpace(Size, Align);
+  ConstantPool = MCP;
+
+  if (ConstantPoolBase == 0) return;  // Buffer overflow.
+
+  DEBUG(dbgs() << "JIT: Emitted constant pool at [" << ConstantPoolBase
+               << "] (size: " << Size << ", alignment: " << Align << ")\n");
+
+  // Initialize the memory for all of the constant pool entries.
+  unsigned Offset = 0;
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    MachineConstantPoolEntry CPE = Constants[i];
+    unsigned AlignMask = CPE.getAlignment() - 1;
+    Offset = (Offset + AlignMask) & ~AlignMask;
+
+    uintptr_t CAddr = (uintptr_t)ConstantPoolBase + Offset;
+    ConstPoolAddresses.push_back(CAddr);
+    if (CPE.isMachineConstantPoolEntry()) {
+      // FIXME: add support to lower machine constant pool values into bytes!
+      report_fatal_error("Initialize memory with machine specific constant pool"
+                        "entry has not been implemented!");
+    }
+    TheJIT->InitializeMemory(CPE.Val.ConstVal, (void*)CAddr);
+    DEBUG(dbgs() << "JIT:   CP" << i << " at [0x";
+          dbgs().write_hex(CAddr) << "]\n");
+
+    Type *Ty = CPE.Val.ConstVal->getType();
+    Offset += TheJIT->getDataLayout()->getTypeAllocSize(Ty);
+  }
+}
+
+void JITEmitter::initJumpTableInfo(MachineJumpTableInfo *MJTI) {
+  if (TheJIT->getJITInfo().hasCustomJumpTables())
+    return;
+  if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline)
+    return;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  if (JT.empty()) return;
+
+  unsigned NumEntries = 0;
+  for (unsigned i = 0, e = JT.size(); i != e; ++i)
+    NumEntries += JT[i].MBBs.size();
+
+  unsigned EntrySize = MJTI->getEntrySize(*TheJIT->getDataLayout());
+
+  // Just allocate space for all the jump tables now.  We will fix up the actual
+  // MBB entries in the tables after we emit the code for each block, since then
+  // we will know the final locations of the MBBs in memory.
+  JumpTable = MJTI;
+  JumpTableBase = allocateSpace(NumEntries * EntrySize,
+                             MJTI->getEntryAlignment(*TheJIT->getDataLayout()));
+}
+
+void JITEmitter::emitJumpTableInfo(MachineJumpTableInfo *MJTI) {
+  if (TheJIT->getJITInfo().hasCustomJumpTables())
+    return;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  if (JT.empty() || JumpTableBase == 0) return;
+
+
+  switch (MJTI->getEntryKind()) {
+  case MachineJumpTableInfo::EK_Inline:
+    return;
+  case MachineJumpTableInfo::EK_BlockAddress: {
+    // EK_BlockAddress - Each entry is a plain address of block, e.g.:
+    //     .word LBB123
+    assert(MJTI->getEntrySize(*TheJIT->getDataLayout()) == sizeof(void*) &&
+           "Cross JIT'ing?");
+
+    // For each jump table, map each target in the jump table to the address of
+    // an emitted MachineBasicBlock.
+    intptr_t *SlotPtr = (intptr_t*)JumpTableBase;
+
+    for (unsigned i = 0, e = JT.size(); i != e; ++i) {
+      const std::vector<MachineBasicBlock*> &MBBs = JT[i].MBBs;
+      // Store the address of the basic block for this jump table slot in the
+      // memory we allocated for the jump table in 'initJumpTableInfo'
+      for (unsigned mi = 0, me = MBBs.size(); mi != me; ++mi)
+        *SlotPtr++ = getMachineBasicBlockAddress(MBBs[mi]);
+    }
+    break;
+  }
+
+  case MachineJumpTableInfo::EK_Custom32:
+  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
+  case MachineJumpTableInfo::EK_LabelDifference32: {
+    assert(MJTI->getEntrySize(*TheJIT->getDataLayout()) == 4&&"Cross JIT'ing?");
+    // For each jump table, place the offset from the beginning of the table
+    // to the target address.
+    int *SlotPtr = (int*)JumpTableBase;
+
+    for (unsigned i = 0, e = JT.size(); i != e; ++i) {
+      const std::vector<MachineBasicBlock*> &MBBs = JT[i].MBBs;
+      // Store the offset of the basic block for this jump table slot in the
+      // memory we allocated for the jump table in 'initJumpTableInfo'
+      uintptr_t Base = (uintptr_t)SlotPtr;
+      for (unsigned mi = 0, me = MBBs.size(); mi != me; ++mi) {
+        uintptr_t MBBAddr = getMachineBasicBlockAddress(MBBs[mi]);
+        /// FIXME: USe EntryKind instead of magic "getPICJumpTableEntry" hook.
+        *SlotPtr++ = TheJIT->getJITInfo().getPICJumpTableEntry(MBBAddr, Base);
+      }
+    }
+    break;
+  }
+  case MachineJumpTableInfo::EK_GPRel64BlockAddress:
+    llvm_unreachable(
+           "JT Info emission not implemented for GPRel64BlockAddress yet.");
+  }
+}
+
+void JITEmitter::startGVStub(const GlobalValue* GV,
+                             unsigned StubSize, unsigned Alignment) {
+  SavedBufferBegin = BufferBegin;
+  SavedBufferEnd = BufferEnd;
+  SavedCurBufferPtr = CurBufferPtr;
+
+  BufferBegin = CurBufferPtr = MemMgr->allocateStub(GV, StubSize, Alignment);
+  BufferEnd = BufferBegin+StubSize+1;
+}
+
+void JITEmitter::startGVStub(void *Buffer, unsigned StubSize) {
+  SavedBufferBegin = BufferBegin;
+  SavedBufferEnd = BufferEnd;
+  SavedCurBufferPtr = CurBufferPtr;
+
+  BufferBegin = CurBufferPtr = (uint8_t *)Buffer;
+  BufferEnd = BufferBegin+StubSize+1;
+}
+
+void JITEmitter::finishGVStub() {
+  assert(CurBufferPtr != BufferEnd && "Stub overflowed allocated space.");
+  NumBytes += getCurrentPCOffset();
+  BufferBegin = SavedBufferBegin;
+  BufferEnd = SavedBufferEnd;
+  CurBufferPtr = SavedCurBufferPtr;
+}
+
+void *JITEmitter::allocIndirectGV(const GlobalValue *GV,
+                                  const uint8_t *Buffer, size_t Size,
+                                  unsigned Alignment) {
+  uint8_t *IndGV = MemMgr->allocateStub(GV, Size, Alignment);
+  memcpy(IndGV, Buffer, Size);
+  return IndGV;
+}
+
+// getConstantPoolEntryAddress - Return the address of the 'ConstantNum' entry
+// in the constant pool that was last emitted with the 'emitConstantPool'
+// method.
+//
+uintptr_t JITEmitter::getConstantPoolEntryAddress(unsigned ConstantNum) const {
+  assert(ConstantNum < ConstantPool->getConstants().size() &&
+         "Invalid ConstantPoolIndex!");
+  return ConstPoolAddresses[ConstantNum];
+}
+
+// getJumpTableEntryAddress - Return the address of the JumpTable with index
+// 'Index' in the jumpp table that was last initialized with 'initJumpTableInfo'
+//
+uintptr_t JITEmitter::getJumpTableEntryAddress(unsigned Index) const {
+  const std::vector<MachineJumpTableEntry> &JT = JumpTable->getJumpTables();
+  assert(Index < JT.size() && "Invalid jump table index!");
+
+  unsigned EntrySize = JumpTable->getEntrySize(*TheJIT->getDataLayout());
+
+  unsigned Offset = 0;
+  for (unsigned i = 0; i < Index; ++i)
+    Offset += JT[i].MBBs.size();
+
+   Offset *= EntrySize;
+
+  return (uintptr_t)((char *)JumpTableBase + Offset);
+}
+
+void JITEmitter::EmittedFunctionConfig::onDelete(
+  JITEmitter *Emitter, const Function *F) {
+  Emitter->deallocateMemForFunction(F);
+}
+void JITEmitter::EmittedFunctionConfig::onRAUW(
+  JITEmitter *, const Function*, const Function*) {
+  llvm_unreachable("The JIT doesn't know how to handle a"
+                   " RAUW on a value it has emitted.");
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Public interface to this file
+//===----------------------------------------------------------------------===//
+
+JITCodeEmitter *JIT::createEmitter(JIT &jit, JITMemoryManager *JMM,
+                                   TargetMachine &tm) {
+  return new JITEmitter(jit, JMM, tm);
+}
+
+// getPointerToFunctionOrStub - If the specified function has been
+// code-gen'd, return a pointer to the function.  If not, compile it, or use
+// a stub to implement lazy compilation if available.
+//
+void *JIT::getPointerToFunctionOrStub(Function *F) {
+  // If we have already code generated the function, just return the address.
+  if (void *Addr = getPointerToGlobalIfAvailable(F))
+    return Addr;
+
+  // Get a stub if the target supports it.
+  JITEmitter *JE = static_cast<JITEmitter*>(getCodeEmitter());
+  return JE->getJITResolver().getLazyFunctionStub(F);
+}
+
+void JIT::updateFunctionStub(Function *F) {
+  // Get the empty stub we generated earlier.
+  JITEmitter *JE = static_cast<JITEmitter*>(getCodeEmitter());
+  void *Stub = JE->getJITResolver().getLazyFunctionStub(F);
+  void *Addr = getPointerToGlobalIfAvailable(F);
+  assert(Addr != Stub && "Function must have non-stub address to be updated.");
+
+  // Tell the target jit info to rewrite the stub at the specified address,
+  // rather than creating a new one.
+  TargetJITInfo::StubLayout layout = getJITInfo().getStubLayout();
+  JE->startGVStub(Stub, layout.Size);
+  getJITInfo().emitFunctionStub(F, Addr, *getCodeEmitter());
+  JE->finishGVStub();
+}
+
+/// freeMachineCodeForFunction - release machine code memory for given Function.
+///
+void JIT::freeMachineCodeForFunction(Function *F) {
+  // Delete translation for this from the ExecutionEngine, so it will get
+  // retranslated next time it is used.
+  updateGlobalMapping(F, 0);
+
+  // Free the actual memory for the function body and related stuff.
+  static_cast<JITEmitter*>(JCE)->deallocateMemForFunction(F);
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/contrib/llvm/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
new file mode 100644
index 000000000000..66aeb772ddc3
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
@@ -0,0 +1,932 @@
+//===-- JITMemoryManager.cpp - Memory Allocator for JIT'd code ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DefaultJITMemoryManager class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Config/config.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <climits>
+#include <cstring>
+#include <vector>
+
+#if defined(__linux__)
+#if defined(HAVE_SYS_STAT_H)
+#include <sys/stat.h>
+#endif
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+using namespace llvm;
+
+STATISTIC(NumSlabs, "Number of slabs of memory allocated by the JIT");
+
+JITMemoryManager::~JITMemoryManager() {}
+
+//===----------------------------------------------------------------------===//
+// Memory Block Implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// MemoryRangeHeader - For a range of memory, this is the header that we put
+  /// on the block of memory.  It is carefully crafted to be one word of memory.
+  /// Allocated blocks have just this header, free'd blocks have FreeRangeHeader
+  /// which starts with this.
+  struct FreeRangeHeader;
+  struct MemoryRangeHeader {
+    /// ThisAllocated - This is true if this block is currently allocated.  If
+    /// not, this can be converted to a FreeRangeHeader.
+    unsigned ThisAllocated : 1;
+
+    /// PrevAllocated - Keep track of whether the block immediately before us is
+    /// allocated.  If not, the word immediately before this header is the size
+    /// of the previous block.
+    unsigned PrevAllocated : 1;
+
+    /// BlockSize - This is the size in bytes of this memory block,
+    /// including this header.
+    uintptr_t BlockSize : (sizeof(intptr_t)*CHAR_BIT - 2);
+
+
+    /// getBlockAfter - Return the memory block immediately after this one.
+    ///
+    MemoryRangeHeader &getBlockAfter() const {
+      return *reinterpret_cast<MemoryRangeHeader *>(
+                reinterpret_cast<char*>(
+                  const_cast<MemoryRangeHeader *>(this))+BlockSize);
+    }
+
+    /// getFreeBlockBefore - If the block before this one is free, return it,
+    /// otherwise return null.
+    FreeRangeHeader *getFreeBlockBefore() const {
+      if (PrevAllocated) return 0;
+      intptr_t PrevSize = reinterpret_cast<intptr_t *>(
+                            const_cast<MemoryRangeHeader *>(this))[-1];
+      return reinterpret_cast<FreeRangeHeader *>(
+               reinterpret_cast<char*>(
+                 const_cast<MemoryRangeHeader *>(this))-PrevSize);
+    }
+
+    /// FreeBlock - Turn an allocated block into a free block, adjusting
+    /// bits in the object headers, and adding an end of region memory block.
+    FreeRangeHeader *FreeBlock(FreeRangeHeader *FreeList);
+
+    /// TrimAllocationToSize - If this allocated block is significantly larger
+    /// than NewSize, split it into two pieces (where the former is NewSize
+    /// bytes, including the header), and add the new block to the free list.
+    FreeRangeHeader *TrimAllocationToSize(FreeRangeHeader *FreeList,
+                                          uint64_t NewSize);
+  };
+
+  /// FreeRangeHeader - For a memory block that isn't already allocated, this
+  /// keeps track of the current block and has a pointer to the next free block.
+  /// Free blocks are kept on a circularly linked list.
+  struct FreeRangeHeader : public MemoryRangeHeader {
+    FreeRangeHeader *Prev;
+    FreeRangeHeader *Next;
+
+    /// getMinBlockSize - Get the minimum size for a memory block.  Blocks
+    /// smaller than this size cannot be created.
+    static unsigned getMinBlockSize() {
+      return sizeof(FreeRangeHeader)+sizeof(intptr_t);
+    }
+
+    /// SetEndOfBlockSizeMarker - The word at the end of every free block is
+    /// known to be the size of the free block.  Set it for this block.
+    void SetEndOfBlockSizeMarker() {
+      void *EndOfBlock = (char*)this + BlockSize;
+      ((intptr_t *)EndOfBlock)[-1] = BlockSize;
+    }
+
+    FreeRangeHeader *RemoveFromFreeList() {
+      assert(Next->Prev == this && Prev->Next == this && "Freelist broken!");
+      Next->Prev = Prev;
+      return Prev->Next = Next;
+    }
+
+    void AddToFreeList(FreeRangeHeader *FreeList) {
+      Next = FreeList;
+      Prev = FreeList->Prev;
+      Prev->Next = this;
+      Next->Prev = this;
+    }
+
+    /// GrowBlock - The block after this block just got deallocated.  Merge it
+    /// into the current block.
+    void GrowBlock(uintptr_t NewSize);
+
+    /// AllocateBlock - Mark this entire block allocated, updating freelists
+    /// etc.  This returns a pointer to the circular free-list.
+    FreeRangeHeader *AllocateBlock();
+  };
+}
+
+
+/// AllocateBlock - Mark this entire block allocated, updating freelists
+/// etc.  This returns a pointer to the circular free-list.
+FreeRangeHeader *FreeRangeHeader::AllocateBlock() {
+  assert(!ThisAllocated && !getBlockAfter().PrevAllocated &&
+         "Cannot allocate an allocated block!");
+  // Mark this block allocated.
+  ThisAllocated = 1;
+  getBlockAfter().PrevAllocated = 1;
+
+  // Remove it from the free list.
+  return RemoveFromFreeList();
+}
+
+/// FreeBlock - Turn an allocated block into a free block, adjusting
+/// bits in the object headers, and adding an end of region memory block.
+/// If possible, coalesce this block with neighboring blocks.  Return the
+/// FreeRangeHeader to allocate from.
+FreeRangeHeader *MemoryRangeHeader::FreeBlock(FreeRangeHeader *FreeList) {
+  MemoryRangeHeader *FollowingBlock = &getBlockAfter();
+  assert(ThisAllocated && "This block is already free!");
+  assert(FollowingBlock->PrevAllocated && "Flags out of sync!");
+
+  FreeRangeHeader *FreeListToReturn = FreeList;
+
+  // If the block after this one is free, merge it into this block.
+  if (!FollowingBlock->ThisAllocated) {
+    FreeRangeHeader &FollowingFreeBlock = *(FreeRangeHeader *)FollowingBlock;
+    // "FreeList" always needs to be a valid free block.  If we're about to
+    // coalesce with it, update our notion of what the free list is.
+    if (&FollowingFreeBlock == FreeList) {
+      FreeList = FollowingFreeBlock.Next;
+      FreeListToReturn = 0;
+      assert(&FollowingFreeBlock != FreeList && "No tombstone block?");
+    }
+    FollowingFreeBlock.RemoveFromFreeList();
+
+    // Include the following block into this one.
+    BlockSize += FollowingFreeBlock.BlockSize;
+    FollowingBlock = &FollowingFreeBlock.getBlockAfter();
+
+    // Tell the block after the block we are coalescing that this block is
+    // allocated.
+    FollowingBlock->PrevAllocated = 1;
+  }
+
+  assert(FollowingBlock->ThisAllocated && "Missed coalescing?");
+
+  if (FreeRangeHeader *PrevFreeBlock = getFreeBlockBefore()) {
+    PrevFreeBlock->GrowBlock(PrevFreeBlock->BlockSize + BlockSize);
+    return FreeListToReturn ? FreeListToReturn : PrevFreeBlock;
+  }
+
+  // Otherwise, mark this block free.
+  FreeRangeHeader &FreeBlock = *(FreeRangeHeader*)this;
+  FollowingBlock->PrevAllocated = 0;
+  FreeBlock.ThisAllocated = 0;
+
+  // Link this into the linked list of free blocks.
+  FreeBlock.AddToFreeList(FreeList);
+
+  // Add a marker at the end of the block, indicating the size of this free
+  // block.
+  FreeBlock.SetEndOfBlockSizeMarker();
+  return FreeListToReturn ? FreeListToReturn : &FreeBlock;
+}
+
+/// GrowBlock - The block after this block just got deallocated.  Merge it
+/// into the current block.
+void FreeRangeHeader::GrowBlock(uintptr_t NewSize) {
+  assert(NewSize > BlockSize && "Not growing block?");
+  BlockSize = NewSize;
+  SetEndOfBlockSizeMarker();
+  getBlockAfter().PrevAllocated = 0;
+}
+
+/// TrimAllocationToSize - If this allocated block is significantly larger
+/// than NewSize, split it into two pieces (where the former is NewSize
+/// bytes, including the header), and add the new block to the free list.
+FreeRangeHeader *MemoryRangeHeader::
+TrimAllocationToSize(FreeRangeHeader *FreeList, uint64_t NewSize) {
+  assert(ThisAllocated && getBlockAfter().PrevAllocated &&
+         "Cannot deallocate part of an allocated block!");
+
+  // Don't allow blocks to be trimmed below minimum required size
+  NewSize = std::max<uint64_t>(FreeRangeHeader::getMinBlockSize(), NewSize);
+
+  // Round up size for alignment of header.
+  unsigned HeaderAlign = __alignof(FreeRangeHeader);
+  NewSize = (NewSize+ (HeaderAlign-1)) & ~(HeaderAlign-1);
+
+  // Size is now the size of the block we will remove from the start of the
+  // current block.
+  assert(NewSize <= BlockSize &&
+         "Allocating more space from this block than exists!");
+
+  // If splitting this block will cause the remainder to be too small, do not
+  // split the block.
+  if (BlockSize <= NewSize+FreeRangeHeader::getMinBlockSize())
+    return FreeList;
+
+  // Otherwise, we splice the required number of bytes out of this block, form
+  // a new block immediately after it, then mark this block allocated.
+  MemoryRangeHeader &FormerNextBlock = getBlockAfter();
+
+  // Change the size of this block.
+  BlockSize = NewSize;
+
+  // Get the new block we just sliced out and turn it into a free block.
+  FreeRangeHeader &NewNextBlock = (FreeRangeHeader &)getBlockAfter();
+  NewNextBlock.BlockSize = (char*)&FormerNextBlock - (char*)&NewNextBlock;
+  NewNextBlock.ThisAllocated = 0;
+  NewNextBlock.PrevAllocated = 1;
+  NewNextBlock.SetEndOfBlockSizeMarker();
+  FormerNextBlock.PrevAllocated = 0;
+  NewNextBlock.AddToFreeList(FreeList);
+  return &NewNextBlock;
+}
+
+//===----------------------------------------------------------------------===//
+// Memory Block Implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+  class DefaultJITMemoryManager;
+
+  class JITSlabAllocator : public SlabAllocator {
+    DefaultJITMemoryManager &JMM;
+  public:
+    JITSlabAllocator(DefaultJITMemoryManager &jmm) : JMM(jmm) { }
+    virtual ~JITSlabAllocator() { }
+    virtual MemSlab *Allocate(size_t Size);
+    virtual void Deallocate(MemSlab *Slab);
+  };
+
+  /// DefaultJITMemoryManager - Manage memory for the JIT code generation.
+  /// This splits a large block of MAP_NORESERVE'd memory into two
+  /// sections, one for function stubs, one for the functions themselves.  We
+  /// have to do this because we may need to emit a function stub while in the
+  /// middle of emitting a function, and we don't know how large the function we
+  /// are emitting is.
+  class DefaultJITMemoryManager : public JITMemoryManager {
+
+    // Whether to poison freed memory.
+    bool PoisonMemory;
+
+    /// LastSlab - This points to the last slab allocated and is used as the
+    /// NearBlock parameter to AllocateRWX so that we can attempt to lay out all
+    /// stubs, data, and code contiguously in memory.  In general, however, this
+    /// is not possible because the NearBlock parameter is ignored on Windows
+    /// platforms and even on Unix it works on a best-effort pasis.
+    sys::MemoryBlock LastSlab;
+
+    // Memory slabs allocated by the JIT.  We refer to them as slabs so we don't
+    // confuse them with the blocks of memory described above.
+    std::vector<sys::MemoryBlock> CodeSlabs;
+    JITSlabAllocator BumpSlabAllocator;
+    BumpPtrAllocator StubAllocator;
+    BumpPtrAllocator DataAllocator;
+
+    // Circular list of free blocks.
+    FreeRangeHeader *FreeMemoryList;
+
+    // When emitting code into a memory block, this is the block.
+    MemoryRangeHeader *CurBlock;
+
+    uint8_t *GOTBase;     // Target Specific reserved memory
+  public:
+    DefaultJITMemoryManager();
+    ~DefaultJITMemoryManager();
+
+    /// allocateNewSlab - Allocates a new MemoryBlock and remembers it as the
+    /// last slab it allocated, so that subsequent allocations follow it.
+    sys::MemoryBlock allocateNewSlab(size_t size);
+
+    /// DefaultCodeSlabSize - When we have to go map more memory, we allocate at
+    /// least this much unless more is requested.
+    static const size_t DefaultCodeSlabSize;
+
+    /// DefaultSlabSize - Allocate data into slabs of this size unless we get
+    /// an allocation above SizeThreshold.
+    static const size_t DefaultSlabSize;
+
+    /// DefaultSizeThreshold - For any allocation larger than this threshold, we
+    /// should allocate a separate slab.
+    static const size_t DefaultSizeThreshold;
+
+    /// getPointerToNamedFunction - This method returns the address of the
+    /// specified function by using the dlsym function call.
+    virtual void *getPointerToNamedFunction(const std::string &Name,
+                                            bool AbortOnFailure = true);
+
+    void AllocateGOT();
+
+    // Testing methods.
+    virtual bool CheckInvariants(std::string &ErrorStr);
+    size_t GetDefaultCodeSlabSize() { return DefaultCodeSlabSize; }
+    size_t GetDefaultDataSlabSize() { return DefaultSlabSize; }
+    size_t GetDefaultStubSlabSize() { return DefaultSlabSize; }
+    unsigned GetNumCodeSlabs() { return CodeSlabs.size(); }
+    unsigned GetNumDataSlabs() { return DataAllocator.GetNumSlabs(); }
+    unsigned GetNumStubSlabs() { return StubAllocator.GetNumSlabs(); }
+
+    /// startFunctionBody - When a function starts, allocate a block of free
+    /// executable memory, returning a pointer to it and its actual size.
+    uint8_t *startFunctionBody(const Function *F, uintptr_t &ActualSize) {
+
+      FreeRangeHeader* candidateBlock = FreeMemoryList;
+      FreeRangeHeader* head = FreeMemoryList;
+      FreeRangeHeader* iter = head->Next;
+
+      uintptr_t largest = candidateBlock->BlockSize;
+
+      // Search for the largest free block
+      while (iter != head) {
+        if (iter->BlockSize > largest) {
+          largest = iter->BlockSize;
+          candidateBlock = iter;
+        }
+        iter = iter->Next;
+      }
+
+      largest = largest - sizeof(MemoryRangeHeader);
+
+      // If this block isn't big enough for the allocation desired, allocate
+      // another block of memory and add it to the free list.
+      if (largest < ActualSize ||
+          largest <= FreeRangeHeader::getMinBlockSize()) {
+        DEBUG(dbgs() << "JIT: Allocating another slab of memory for function.");
+        candidateBlock = allocateNewCodeSlab((size_t)ActualSize);
+      }
+
+      // Select this candidate block for allocation
+      CurBlock = candidateBlock;
+
+      // Allocate the entire memory block.
+      FreeMemoryList = candidateBlock->AllocateBlock();
+      ActualSize = CurBlock->BlockSize - sizeof(MemoryRangeHeader);
+      return (uint8_t *)(CurBlock + 1);
+    }
+
+    /// allocateNewCodeSlab - Helper method to allocate a new slab of code
+    /// memory from the OS and add it to the free list.  Returns the new
+    /// FreeRangeHeader at the base of the slab.
+    FreeRangeHeader *allocateNewCodeSlab(size_t MinSize) {
+      // If the user needs at least MinSize free memory, then we account for
+      // two MemoryRangeHeaders: the one in the user's block, and the one at the
+      // end of the slab.
+      size_t PaddedMin = MinSize + 2 * sizeof(MemoryRangeHeader);
+      size_t SlabSize = std::max(DefaultCodeSlabSize, PaddedMin);
+      sys::MemoryBlock B = allocateNewSlab(SlabSize);
+      CodeSlabs.push_back(B);
+      char *MemBase = (char*)(B.base());
+
+      // Put a tiny allocated block at the end of the memory chunk, so when
+      // FreeBlock calls getBlockAfter it doesn't fall off the end.
+      MemoryRangeHeader *EndBlock =
+          (MemoryRangeHeader*)(MemBase + B.size()) - 1;
+      EndBlock->ThisAllocated = 1;
+      EndBlock->PrevAllocated = 0;
+      EndBlock->BlockSize = sizeof(MemoryRangeHeader);
+
+      // Start out with a vast new block of free memory.
+      FreeRangeHeader *NewBlock = (FreeRangeHeader*)MemBase;
+      NewBlock->ThisAllocated = 0;
+      // Make sure getFreeBlockBefore doesn't look into unmapped memory.
+      NewBlock->PrevAllocated = 1;
+      NewBlock->BlockSize = (uintptr_t)EndBlock - (uintptr_t)NewBlock;
+      NewBlock->SetEndOfBlockSizeMarker();
+      NewBlock->AddToFreeList(FreeMemoryList);
+
+      assert(NewBlock->BlockSize - sizeof(MemoryRangeHeader) >= MinSize &&
+             "The block was too small!");
+      return NewBlock;
+    }
+
+    /// endFunctionBody - The function F is now allocated, and takes the memory
+    /// in the range [FunctionStart,FunctionEnd).
+    void endFunctionBody(const Function *F, uint8_t *FunctionStart,
+                         uint8_t *FunctionEnd) {
+      assert(FunctionEnd > FunctionStart);
+      assert(FunctionStart == (uint8_t *)(CurBlock+1) &&
+             "Mismatched function start/end!");
+
+      uintptr_t BlockSize = FunctionEnd - (uint8_t *)CurBlock;
+
+      // Release the memory at the end of this block that isn't needed.
+      FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize);
+    }
+
+    /// allocateSpace - Allocate a memory block of the given size.  This method
+    /// cannot be called between calls to startFunctionBody and endFunctionBody.
+    uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) {
+      CurBlock = FreeMemoryList;
+      FreeMemoryList = FreeMemoryList->AllocateBlock();
+
+      uint8_t *result = (uint8_t *)(CurBlock + 1);
+
+      if (Alignment == 0) Alignment = 1;
+      result = (uint8_t*)(((intptr_t)result+Alignment-1) &
+               ~(intptr_t)(Alignment-1));
+
+      uintptr_t BlockSize = result + Size - (uint8_t *)CurBlock;
+      FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize);
+
+      return result;
+    }
+
+    /// allocateStub - Allocate memory for a function stub.
+    uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
+                          unsigned Alignment) {
+      return (uint8_t*)StubAllocator.Allocate(StubSize, Alignment);
+    }
+
+    /// allocateGlobal - Allocate memory for a global.
+    uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment) {
+      return (uint8_t*)DataAllocator.Allocate(Size, Alignment);
+    }
+
+    /// allocateCodeSection - Allocate memory for a code section.
+    uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+                                 unsigned SectionID) {
+      // Grow the required block size to account for the block header
+      Size += sizeof(*CurBlock);
+
+      // FIXME: Alignement handling.
+      FreeRangeHeader* candidateBlock = FreeMemoryList;
+      FreeRangeHeader* head = FreeMemoryList;
+      FreeRangeHeader* iter = head->Next;
+
+      uintptr_t largest = candidateBlock->BlockSize;
+
+      // Search for the largest free block.
+      while (iter != head) {
+        if (iter->BlockSize > largest) {
+          largest = iter->BlockSize;
+          candidateBlock = iter;
+        }
+        iter = iter->Next;
+      }
+
+      largest = largest - sizeof(MemoryRangeHeader);
+
+      // If this block isn't big enough for the allocation desired, allocate
+      // another block of memory and add it to the free list.
+      if (largest < Size || largest <= FreeRangeHeader::getMinBlockSize()) {
+        DEBUG(dbgs() << "JIT: Allocating another slab of memory for function.");
+        candidateBlock = allocateNewCodeSlab((size_t)Size);
+      }
+
+      // Select this candidate block for allocation
+      CurBlock = candidateBlock;
+
+      // Allocate the entire memory block.
+      FreeMemoryList = candidateBlock->AllocateBlock();
+      // Release the memory at the end of this block that isn't needed.
+      FreeMemoryList = CurBlock->TrimAllocationToSize(FreeMemoryList, Size);
+      return (uint8_t *)(CurBlock + 1);
+    }
+
+    /// allocateDataSection - Allocate memory for a data section.
+    uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                                 unsigned SectionID, bool IsReadOnly) {
+      return (uint8_t*)DataAllocator.Allocate(Size, Alignment);
+    }
+
+    bool applyPermissions(std::string *ErrMsg) {
+      return false;
+    }
+
+    /// startExceptionTable - Use startFunctionBody to allocate memory for the
+    /// function's exception table.
+    uint8_t* startExceptionTable(const Function* F, uintptr_t &ActualSize) {
+      return startFunctionBody(F, ActualSize);
+    }
+
+    /// endExceptionTable - The exception table of F is now allocated,
+    /// and takes the memory in the range [TableStart,TableEnd).
+    void endExceptionTable(const Function *F, uint8_t *TableStart,
+                           uint8_t *TableEnd, uint8_t* FrameRegister) {
+      assert(TableEnd > TableStart);
+      assert(TableStart == (uint8_t *)(CurBlock+1) &&
+             "Mismatched table start/end!");
+
+      uintptr_t BlockSize = TableEnd - (uint8_t *)CurBlock;
+
+      // Release the memory at the end of this block that isn't needed.
+      FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize);
+    }
+
+    uint8_t *getGOTBase() const {
+      return GOTBase;
+    }
+
+    void deallocateBlock(void *Block) {
+      // Find the block that is allocated for this function.
+      MemoryRangeHeader *MemRange = static_cast<MemoryRangeHeader*>(Block) - 1;
+      assert(MemRange->ThisAllocated && "Block isn't allocated!");
+
+      // Fill the buffer with garbage!
+      if (PoisonMemory) {
+        memset(MemRange+1, 0xCD, MemRange->BlockSize-sizeof(*MemRange));
+      }
+
+      // Free the memory.
+      FreeMemoryList = MemRange->FreeBlock(FreeMemoryList);
+    }
+
+    /// deallocateFunctionBody - Deallocate all memory for the specified
+    /// function body.
+    void deallocateFunctionBody(void *Body) {
+      if (Body) deallocateBlock(Body);
+    }
+
+    /// deallocateExceptionTable - Deallocate memory for the specified
+    /// exception table.
+    void deallocateExceptionTable(void *ET) {
+      if (ET) deallocateBlock(ET);
+    }
+
+    /// setMemoryWritable - When code generation is in progress,
+    /// the code pages may need permissions changed.
+    void setMemoryWritable()
+    {
+      for (unsigned i = 0, e = CodeSlabs.size(); i != e; ++i)
+        sys::Memory::setWritable(CodeSlabs[i]);
+    }
+    /// setMemoryExecutable - When code generation is done and we're ready to
+    /// start execution, the code pages may need permissions changed.
+    void setMemoryExecutable()
+    {
+      for (unsigned i = 0, e = CodeSlabs.size(); i != e; ++i)
+        sys::Memory::setExecutable(CodeSlabs[i]);
+    }
+
+    /// setPoisonMemory - Controls whether we write garbage over freed memory.
+    ///
+    void setPoisonMemory(bool poison) {
+      PoisonMemory = poison;
+    }
+  };
+}
+
+MemSlab *JITSlabAllocator::Allocate(size_t Size) {
+  sys::MemoryBlock B = JMM.allocateNewSlab(Size);
+  MemSlab *Slab = (MemSlab*)B.base();
+  Slab->Size = B.size();
+  Slab->NextPtr = 0;
+  return Slab;
+}
+
+void JITSlabAllocator::Deallocate(MemSlab *Slab) {
+  sys::MemoryBlock B(Slab, Slab->Size);
+  sys::Memory::ReleaseRWX(B);
+}
+
+DefaultJITMemoryManager::DefaultJITMemoryManager()
+  :
+#ifdef NDEBUG
+    PoisonMemory(false),
+#else
+    PoisonMemory(true),
+#endif
+    LastSlab(0, 0),
+    BumpSlabAllocator(*this),
+    StubAllocator(DefaultSlabSize, DefaultSizeThreshold, BumpSlabAllocator),
+    DataAllocator(DefaultSlabSize, DefaultSizeThreshold, BumpSlabAllocator) {
+
+  // Allocate space for code.
+  sys::MemoryBlock MemBlock = allocateNewSlab(DefaultCodeSlabSize);
+  CodeSlabs.push_back(MemBlock);
+  uint8_t *MemBase = (uint8_t*)MemBlock.base();
+
+  // We set up the memory chunk with 4 mem regions, like this:
+  //  [ START
+  //    [ Free      #0 ] -> Large space to allocate functions from.
+  //    [ Allocated #1 ] -> Tiny space to separate regions.
+  //    [ Free      #2 ] -> Tiny space so there is always at least 1 free block.
+  //    [ Allocated #3 ] -> Tiny space to prevent looking past end of block.
+  //  END ]
+  //
+  // The last three blocks are never deallocated or touched.
+
+  // Add MemoryRangeHeader to the end of the memory region, indicating that
+  // the space after the block of memory is allocated.  This is block #3.
+  MemoryRangeHeader *Mem3 = (MemoryRangeHeader*)(MemBase+MemBlock.size())-1;
+  Mem3->ThisAllocated = 1;
+  Mem3->PrevAllocated = 0;
+  Mem3->BlockSize     = sizeof(MemoryRangeHeader);
+
+  /// Add a tiny free region so that the free list always has one entry.
+  FreeRangeHeader *Mem2 =
+    (FreeRangeHeader *)(((char*)Mem3)-FreeRangeHeader::getMinBlockSize());
+  Mem2->ThisAllocated = 0;
+  Mem2->PrevAllocated = 1;
+  Mem2->BlockSize     = FreeRangeHeader::getMinBlockSize();
+  Mem2->SetEndOfBlockSizeMarker();
+  Mem2->Prev = Mem2;   // Mem2 *is* the free list for now.
+  Mem2->Next = Mem2;
+
+  /// Add a tiny allocated region so that Mem2 is never coalesced away.
+  MemoryRangeHeader *Mem1 = (MemoryRangeHeader*)Mem2-1;
+  Mem1->ThisAllocated = 1;
+  Mem1->PrevAllocated = 0;
+  Mem1->BlockSize     = sizeof(MemoryRangeHeader);
+
+  // Add a FreeRangeHeader to the start of the function body region, indicating
+  // that the space is free.  Mark the previous block allocated so we never look
+  // at it.
+  FreeRangeHeader *Mem0 = (FreeRangeHeader*)MemBase;
+  Mem0->ThisAllocated = 0;
+  Mem0->PrevAllocated = 1;
+  Mem0->BlockSize = (char*)Mem1-(char*)Mem0;
+  Mem0->SetEndOfBlockSizeMarker();
+  Mem0->AddToFreeList(Mem2);
+
+  // Start out with the freelist pointing to Mem0.
+  FreeMemoryList = Mem0;
+
+  GOTBase = NULL;
+}
+
+void DefaultJITMemoryManager::AllocateGOT() {
+  assert(GOTBase == 0 && "Cannot allocate the got multiple times");
+  GOTBase = new uint8_t[sizeof(void*) * 8192];
+  HasGOT = true;
+}
+
+DefaultJITMemoryManager::~DefaultJITMemoryManager() {
+  for (unsigned i = 0, e = CodeSlabs.size(); i != e; ++i)
+    sys::Memory::ReleaseRWX(CodeSlabs[i]);
+
+  delete[] GOTBase;
+}
+
+sys::MemoryBlock DefaultJITMemoryManager::allocateNewSlab(size_t size) {
+  // Allocate a new block close to the last one.
+  std::string ErrMsg;
+  sys::MemoryBlock *LastSlabPtr = LastSlab.base() ? &LastSlab : 0;
+  sys::MemoryBlock B = sys::Memory::AllocateRWX(size, LastSlabPtr, &ErrMsg);
+  if (B.base() == 0) {
+    report_fatal_error("Allocation failed when allocating new memory in the"
+                       " JIT\n" + Twine(ErrMsg));
+  }
+  LastSlab = B;
+  ++NumSlabs;
+  // Initialize the slab to garbage when debugging.
+  if (PoisonMemory) {
+    memset(B.base(), 0xCD, B.size());
+  }
+  return B;
+}
+
+/// CheckInvariants - For testing only.  Return "" if all internal invariants
+/// are preserved, and a helpful error message otherwise.  For free and
+/// allocated blocks, make sure that adding BlockSize gives a valid block.
+/// For free blocks, make sure they're in the free list and that their end of
+/// block size marker is correct.  This function should return an error before
+/// accessing bad memory.  This function is defined here instead of in
+/// JITMemoryManagerTest.cpp so that we don't have to expose all of the
+/// implementation details of DefaultJITMemoryManager.
+bool DefaultJITMemoryManager::CheckInvariants(std::string &ErrorStr) {
+  raw_string_ostream Err(ErrorStr);
+
+  // Construct a the set of FreeRangeHeader pointers so we can query it
+  // efficiently.
+  llvm::SmallPtrSet<MemoryRangeHeader*, 16> FreeHdrSet;
+  FreeRangeHeader* FreeHead = FreeMemoryList;
+  FreeRangeHeader* FreeRange = FreeHead;
+
+  do {
+    // Check that the free range pointer is in the blocks we've allocated.
+    bool Found = false;
+    for (std::vector<sys::MemoryBlock>::iterator I = CodeSlabs.begin(),
+         E = CodeSlabs.end(); I != E && !Found; ++I) {
+      char *Start = (char*)I->base();
+      char *End = Start + I->size();
+      Found = (Start <= (char*)FreeRange && (char*)FreeRange < End);
+    }
+    if (!Found) {
+      Err << "Corrupt free list; points to " << FreeRange;
+      return false;
+    }
+
+    if (FreeRange->Next->Prev != FreeRange) {
+      Err << "Next and Prev pointers do not match.";
+      return false;
+    }
+
+    // Otherwise, add it to the set.
+    FreeHdrSet.insert(FreeRange);
+    FreeRange = FreeRange->Next;
+  } while (FreeRange != FreeHead);
+
+  // Go over each block, and look at each MemoryRangeHeader.
+  for (std::vector<sys::MemoryBlock>::iterator I = CodeSlabs.begin(),
+       E = CodeSlabs.end(); I != E; ++I) {
+    char *Start = (char*)I->base();
+    char *End = Start + I->size();
+
+    // Check each memory range.
+    for (MemoryRangeHeader *Hdr = (MemoryRangeHeader*)Start, *LastHdr = NULL;
+         Start <= (char*)Hdr && (char*)Hdr < End;
+         Hdr = &Hdr->getBlockAfter()) {
+      if (Hdr->ThisAllocated == 0) {
+        // Check that this range is in the free list.
+        if (!FreeHdrSet.count(Hdr)) {
+          Err << "Found free header at " << Hdr << " that is not in free list.";
+          return false;
+        }
+
+        // Now make sure the size marker at the end of the block is correct.
+        uintptr_t *Marker = ((uintptr_t*)&Hdr->getBlockAfter()) - 1;
+        if (!(Start <= (char*)Marker && (char*)Marker < End)) {
+          Err << "Block size in header points out of current MemoryBlock.";
+          return false;
+        }
+        if (Hdr->BlockSize != *Marker) {
+          Err << "End of block size marker (" << *Marker << ") "
+              << "and BlockSize (" << Hdr->BlockSize << ") don't match.";
+          return false;
+        }
+      }
+
+      if (LastHdr && LastHdr->ThisAllocated != Hdr->PrevAllocated) {
+        Err << "Hdr->PrevAllocated (" << Hdr->PrevAllocated << ") != "
+            << "LastHdr->ThisAllocated (" << LastHdr->ThisAllocated << ")";
+        return false;
+      } else if (!LastHdr && !Hdr->PrevAllocated) {
+        Err << "The first header should have PrevAllocated true.";
+        return false;
+      }
+
+      // Remember the last header.
+      LastHdr = Hdr;
+    }
+  }
+
+  // All invariants are preserved.
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// getPointerToNamedFunction() implementation.
+//===----------------------------------------------------------------------===//
+
+// AtExitHandlers - List of functions to call when the program exits,
+// registered with the atexit() library function.
+static std::vector<void (*)()> AtExitHandlers;
+
+/// runAtExitHandlers - Run any functions registered by the program's
+/// calls to atexit(3), which we intercept and store in
+/// AtExitHandlers.
+///
+static void runAtExitHandlers() {
+  while (!AtExitHandlers.empty()) {
+    void (*Fn)() = AtExitHandlers.back();
+    AtExitHandlers.pop_back();
+    Fn();
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Function stubs that are invoked instead of certain library calls
+//
+// Force the following functions to be linked in to anything that uses the
+// JIT. This is a hack designed to work around the all-too-clever Glibc
+// strategy of making these functions work differently when inlined vs. when
+// not inlined, and hiding their real definitions in a separate archive file
+// that the dynamic linker can't see. For more info, search for
+// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
+#if defined(__linux__)
+/* stat functions are redirecting to __xstat with a version number.  On x86-64
+ * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat'
+ * available as an exported symbol, so we have to add it explicitly.
+ */
+namespace {
+class StatSymbols {
+public:
+  StatSymbols() {
+    sys::DynamicLibrary::AddSymbol("stat", (void*)(intptr_t)stat);
+    sys::DynamicLibrary::AddSymbol("fstat", (void*)(intptr_t)fstat);
+    sys::DynamicLibrary::AddSymbol("lstat", (void*)(intptr_t)lstat);
+    sys::DynamicLibrary::AddSymbol("stat64", (void*)(intptr_t)stat64);
+    sys::DynamicLibrary::AddSymbol("\x1stat64", (void*)(intptr_t)stat64);
+    sys::DynamicLibrary::AddSymbol("\x1open64", (void*)(intptr_t)open64);
+    sys::DynamicLibrary::AddSymbol("\x1lseek64", (void*)(intptr_t)lseek64);
+    sys::DynamicLibrary::AddSymbol("fstat64", (void*)(intptr_t)fstat64);
+    sys::DynamicLibrary::AddSymbol("lstat64", (void*)(intptr_t)lstat64);
+    sys::DynamicLibrary::AddSymbol("atexit", (void*)(intptr_t)atexit);
+    sys::DynamicLibrary::AddSymbol("mknod", (void*)(intptr_t)mknod);
+  }
+};
+}
+static StatSymbols initStatSymbols;
+#endif // __linux__
+
+// jit_exit - Used to intercept the "exit" library call.
+static void jit_exit(int Status) {
+  runAtExitHandlers();   // Run atexit handlers...
+  exit(Status);
+}
+
+// jit_atexit - Used to intercept the "atexit" library call.
+static int jit_atexit(void (*Fn)()) {
+  AtExitHandlers.push_back(Fn);    // Take note of atexit handler...
+  return 0;  // Always successful
+}
+
+static int jit_noop() {
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+/// getPointerToNamedFunction - This method returns the address of the specified
+/// function by using the dynamic loader interface.  As such it is only useful
+/// for resolving library symbols, not code generated symbols.
+///
+void *DefaultJITMemoryManager::getPointerToNamedFunction(const std::string &Name,
+                                                         bool AbortOnFailure) {
+  // Check to see if this is one of the functions we want to intercept.  Note,
+  // we cast to intptr_t here to silence a -pedantic warning that complains
+  // about casting a function pointer to a normal pointer.
+  if (Name == "exit") return (void*)(intptr_t)&jit_exit;
+  if (Name == "atexit") return (void*)(intptr_t)&jit_atexit;
+
+  // We should not invoke parent's ctors/dtors from generated main()!
+  // On Mingw and Cygwin, the symbol __main is resolved to
+  // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
+  // (and register wrong callee's dtors with atexit(3)).
+  // We expect ExecutionEngine::runStaticConstructorsDestructors()
+  // is called before ExecutionEngine::runFunctionAsMain() is called.
+  if (Name == "__main") return (void*)(intptr_t)&jit_noop;
+
+  const char *NameStr = Name.c_str();
+  // If this is an asm specifier, skip the sentinal.
+  if (NameStr[0] == 1) ++NameStr;
+
+  // If it's an external function, look it up in the process image...
+  void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
+  if (Ptr) return Ptr;
+
+  // If it wasn't found and if it starts with an underscore ('_') character,
+  // try again without the underscore.
+  if (NameStr[0] == '_') {
+    Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
+    if (Ptr) return Ptr;
+  }
+
+  // Darwin/PPC adds $LDBLStub suffixes to various symbols like printf.  These
+  // are references to hidden visibility symbols that dlsym cannot resolve.
+  // If we have one of these, strip off $LDBLStub and try again.
+#if defined(__APPLE__) && defined(__ppc__)
+  if (Name.size() > 9 && Name[Name.size()-9] == '$' &&
+      memcmp(&Name[Name.size()-8], "LDBLStub", 8) == 0) {
+    // First try turning $LDBLStub into $LDBL128. If that fails, strip it off.
+    // This mirrors logic in libSystemStubs.a.
+    std::string Prefix = std::string(Name.begin(), Name.end()-9);
+    if (void *Ptr = getPointerToNamedFunction(Prefix+"$LDBL128", false))
+      return Ptr;
+    if (void *Ptr = getPointerToNamedFunction(Prefix, false))
+      return Ptr;
+  }
+#endif
+
+  if (AbortOnFailure) {
+    report_fatal_error("Program used external function '"+Name+
+                      "' which could not be resolved!");
+  }
+  return 0;
+}
+
+
+
+JITMemoryManager *JITMemoryManager::CreateDefaultMemManager() {
+  return new DefaultJITMemoryManager();
+}
+
+// Allocate memory for code in 512K slabs.
+const size_t DefaultJITMemoryManager::DefaultCodeSlabSize = 512 * 1024;
+
+// Allocate globals and stubs in slabs of 64K.  (probably 16 pages)
+const size_t DefaultJITMemoryManager::DefaultSlabSize = 64 * 1024;
+
+// Waste at most 16K at the end of each bump slab.  (probably 4 pages)
+const size_t DefaultJITMemoryManager::DefaultSizeThreshold = 16 * 1024;
diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
new file mode 100644
index 000000000000..fee10e194355
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -0,0 +1,336 @@
+//===-- MCJIT.cpp - MC-based Just-in-Time Compiler ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCJIT.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MutexGuard.h"
+
+using namespace llvm;
+
+namespace {
+
+static struct RegisterJIT {
+  RegisterJIT() { MCJIT::Register(); }
+} JITRegistrator;
+
+}
+
+extern "C" void LLVMLinkInMCJIT() {
+}
+
+ExecutionEngine *MCJIT::createJIT(Module *M,
+                                  std::string *ErrorStr,
+                                  JITMemoryManager *JMM,
+                                  bool GVsWithCode,
+                                  TargetMachine *TM) {
+  // Try to register the program as a source of symbols to resolve against.
+  //
+  // FIXME: Don't do this here.
+  sys::DynamicLibrary::LoadLibraryPermanently(0, NULL);
+
+  return new MCJIT(M, TM, JMM, GVsWithCode);
+}
+
+MCJIT::MCJIT(Module *m, TargetMachine *tm, RTDyldMemoryManager *MM,
+             bool AllocateGVsWithCode)
+  : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(MM), Dyld(MM),
+    isCompiled(false), M(m)  {
+
+  setDataLayout(TM->getDataLayout());
+}
+
+MCJIT::~MCJIT() {
+  if (LoadedObject)
+    NotifyFreeingObject(*LoadedObject.get());
+  delete MemMgr;
+  delete TM;
+}
+
+void MCJIT::emitObject(Module *m) {
+  /// Currently, MCJIT only supports a single module and the module passed to
+  /// this function call is expected to be the contained module.  The module
+  /// is passed as a parameter here to prepare for multiple module support in
+  /// the future.
+  assert(M == m);
+
+  // Get a thread lock to make sure we aren't trying to compile multiple times
+  MutexGuard locked(lock);
+
+  // FIXME: Track compilation state on a per-module basis when multiple modules
+  //        are supported.
+  // Re-compilation is not supported
+  if (isCompiled)
+    return;
+
+  PassManager PM;
+
+  PM.add(new DataLayout(*TM->getDataLayout()));
+
+  // The RuntimeDyld will take ownership of this shortly
+  OwningPtr<ObjectBufferStream> Buffer(new ObjectBufferStream());
+
+  // Turn the machine code intermediate representation into bytes in memory
+  // that may be executed.
+  if (TM->addPassesToEmitMC(PM, Ctx, Buffer->getOStream(), false)) {
+    report_fatal_error("Target does not support MC emission!");
+  }
+
+  // Initialize passes.
+  PM.run(*m);
+  // Flush the output buffer to get the generated code into memory
+  Buffer->flush();
+
+  // Load the object into the dynamic linker.
+  // handing off ownership of the buffer
+  LoadedObject.reset(Dyld.loadObject(Buffer.take()));
+  if (!LoadedObject)
+    report_fatal_error(Dyld.getErrorString());
+
+  // Resolve any relocations.
+  Dyld.resolveRelocations();
+
+  // FIXME: Make this optional, maybe even move it to a JIT event listener
+  LoadedObject->registerWithDebugger();
+
+  NotifyObjectEmitted(*LoadedObject);
+
+  // FIXME: Add support for per-module compilation state
+  isCompiled = true;
+}
+
+// FIXME: Add a parameter to identify which object is being finalized when
+// MCJIT supports multiple modules.
+// FIXME: Provide a way to separate code emission, relocations and page 
+// protection in the interface.
+void MCJIT::finalizeObject() {
+  // If the module hasn't been compiled, just do that.
+  if (!isCompiled) {
+    // If the call to Dyld.resolveRelocations() is removed from emitObject()
+    // we'll need to do that here.
+    emitObject(M);
+
+    // Set page permissions.
+    MemMgr->applyPermissions();
+
+    return;
+  }
+
+  // Resolve any relocations.
+  Dyld.resolveRelocations();
+
+  // Set page permissions.
+  MemMgr->applyPermissions();
+}
+
+void *MCJIT::getPointerToBasicBlock(BasicBlock *BB) {
+  report_fatal_error("not yet implemented");
+}
+
+void *MCJIT::getPointerToFunction(Function *F) {
+  // FIXME: This should really return a uint64_t since it's a pointer in the
+  // target address space, not our local address space. That's part of the
+  // ExecutionEngine interface, though. Fix that when the old JIT finally
+  // dies.
+
+  // FIXME: Add support for per-module compilation state
+  if (!isCompiled)
+    emitObject(M);
+
+  if (F->isDeclaration() || F->hasAvailableExternallyLinkage()) {
+    bool AbortOnFailure = !F->hasExternalWeakLinkage();
+    void *Addr = getPointerToNamedFunction(F->getName(), AbortOnFailure);
+    addGlobalMapping(F, Addr);
+    return Addr;
+  }
+
+  // FIXME: Should the Dyld be retaining module information? Probably not.
+  // FIXME: Should we be using the mangler for this? Probably.
+  //
+  // This is the accessor for the target address, so make sure to check the
+  // load address of the symbol, not the local address.
+  StringRef BaseName = F->getName();
+  if (BaseName[0] == '\1')
+    return (void*)Dyld.getSymbolLoadAddress(BaseName.substr(1));
+  return (void*)Dyld.getSymbolLoadAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+                                       + BaseName).str());
+}
+
+void *MCJIT::recompileAndRelinkFunction(Function *F) {
+  report_fatal_error("not yet implemented");
+}
+
+void MCJIT::freeMachineCodeForFunction(Function *F) {
+  report_fatal_error("not yet implemented");
+}
+
+GenericValue MCJIT::runFunction(Function *F,
+                                const std::vector<GenericValue> &ArgValues) {
+  assert(F && "Function *F was null at entry to run()");
+
+  void *FPtr = getPointerToFunction(F);
+  assert(FPtr && "Pointer to fn's code was null after getPointerToFunction");
+  FunctionType *FTy = F->getFunctionType();
+  Type *RetTy = FTy->getReturnType();
+
+  assert((FTy->getNumParams() == ArgValues.size() ||
+          (FTy->isVarArg() && FTy->getNumParams() <= ArgValues.size())) &&
+         "Wrong number of arguments passed into function!");
+  assert(FTy->getNumParams() == ArgValues.size() &&
+         "This doesn't support passing arguments through varargs (yet)!");
+
+  // Handle some common cases first.  These cases correspond to common `main'
+  // prototypes.
+  if (RetTy->isIntegerTy(32) || RetTy->isVoidTy()) {
+    switch (ArgValues.size()) {
+    case 3:
+      if (FTy->getParamType(0)->isIntegerTy(32) &&
+          FTy->getParamType(1)->isPointerTy() &&
+          FTy->getParamType(2)->isPointerTy()) {
+        int (*PF)(int, char **, const char **) =
+          (int(*)(int, char **, const char **))(intptr_t)FPtr;
+
+        // Call the function.
+        GenericValue rv;
+        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
+                                 (char **)GVTOP(ArgValues[1]),
+                                 (const char **)GVTOP(ArgValues[2])));
+        return rv;
+      }
+      break;
+    case 2:
+      if (FTy->getParamType(0)->isIntegerTy(32) &&
+          FTy->getParamType(1)->isPointerTy()) {
+        int (*PF)(int, char **) = (int(*)(int, char **))(intptr_t)FPtr;
+
+        // Call the function.
+        GenericValue rv;
+        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue(),
+                                 (char **)GVTOP(ArgValues[1])));
+        return rv;
+      }
+      break;
+    case 1:
+      if (FTy->getNumParams() == 1 &&
+          FTy->getParamType(0)->isIntegerTy(32)) {
+        GenericValue rv;
+        int (*PF)(int) = (int(*)(int))(intptr_t)FPtr;
+        rv.IntVal = APInt(32, PF(ArgValues[0].IntVal.getZExtValue()));
+        return rv;
+      }
+      break;
+    }
+  }
+
+  // Handle cases where no arguments are passed first.
+  if (ArgValues.empty()) {
+    GenericValue rv;
+    switch (RetTy->getTypeID()) {
+    default: llvm_unreachable("Unknown return type for function call!");
+    case Type::IntegerTyID: {
+      unsigned BitWidth = cast<IntegerType>(RetTy)->getBitWidth();
+      if (BitWidth == 1)
+        rv.IntVal = APInt(BitWidth, ((bool(*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 8)
+        rv.IntVal = APInt(BitWidth, ((char(*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 16)
+        rv.IntVal = APInt(BitWidth, ((short(*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 32)
+        rv.IntVal = APInt(BitWidth, ((int(*)())(intptr_t)FPtr)());
+      else if (BitWidth <= 64)
+        rv.IntVal = APInt(BitWidth, ((int64_t(*)())(intptr_t)FPtr)());
+      else
+        llvm_unreachable("Integer types > 64 bits not supported");
+      return rv;
+    }
+    case Type::VoidTyID:
+      rv.IntVal = APInt(32, ((int(*)())(intptr_t)FPtr)());
+      return rv;
+    case Type::FloatTyID:
+      rv.FloatVal = ((float(*)())(intptr_t)FPtr)();
+      return rv;
+    case Type::DoubleTyID:
+      rv.DoubleVal = ((double(*)())(intptr_t)FPtr)();
+      return rv;
+    case Type::X86_FP80TyID:
+    case Type::FP128TyID:
+    case Type::PPC_FP128TyID:
+      llvm_unreachable("long double not supported yet");
+    case Type::PointerTyID:
+      return PTOGV(((void*(*)())(intptr_t)FPtr)());
+    }
+  }
+
+  llvm_unreachable("Full-featured argument passing not supported yet!");
+}
+
+void *MCJIT::getPointerToNamedFunction(const std::string &Name,
+                                       bool AbortOnFailure) {
+  // FIXME: Add support for per-module compilation state
+  if (!isCompiled)
+    emitObject(M);
+
+  if (!isSymbolSearchingDisabled() && MemMgr) {
+    void *ptr = MemMgr->getPointerToNamedFunction(Name, false);
+    if (ptr)
+      return ptr;
+  }
+
+  /// If a LazyFunctionCreator is installed, use it to get/create the function.
+  if (LazyFunctionCreator)
+    if (void *RP = LazyFunctionCreator(Name))
+      return RP;
+
+  if (AbortOnFailure) {
+    report_fatal_error("Program used external function '"+Name+
+                       "' which could not be resolved!");
+  }
+  return 0;
+}
+
+void MCJIT::RegisterJITEventListener(JITEventListener *L) {
+  if (L == NULL)
+    return;
+  MutexGuard locked(lock);
+  EventListeners.push_back(L);
+}
+void MCJIT::UnregisterJITEventListener(JITEventListener *L) {
+  if (L == NULL)
+    return;
+  MutexGuard locked(lock);
+  SmallVector<JITEventListener*, 2>::reverse_iterator I=
+      std::find(EventListeners.rbegin(), EventListeners.rend(), L);
+  if (I != EventListeners.rend()) {
+    std::swap(*I, EventListeners.back());
+    EventListeners.pop_back();
+  }
+}
+void MCJIT::NotifyObjectEmitted(const ObjectImage& Obj) {
+  MutexGuard locked(lock);
+  for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
+    EventListeners[I]->NotifyObjectEmitted(Obj);
+  }
+}
+void MCJIT::NotifyFreeingObject(const ObjectImage& Obj) {
+  MutexGuard locked(lock);
+  for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
+    EventListeners[I]->NotifyFreeingObject(Obj);
+  }
+}
diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h
new file mode 100644
index 000000000000..283a8e528118
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -0,0 +1,113 @@
+//===-- MCJIT.h - Class definition for the MCJIT ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_EXECUTIONENGINE_MCJIT_H
+#define LLVM_LIB_EXECUTIONENGINE_MCJIT_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/PassManager.h"
+
+namespace llvm {
+
+class ObjectImage;
+
+// FIXME: This makes all kinds of horrible assumptions for the time being,
+// like only having one module, not needing to worry about multi-threading,
+// blah blah. Purely in get-it-up-and-limping mode for now.
+
+class MCJIT : public ExecutionEngine {
+  MCJIT(Module *M, TargetMachine *tm, RTDyldMemoryManager *MemMgr,
+        bool AllocateGVsWithCode);
+
+  TargetMachine *TM;
+  MCContext *Ctx;
+  RTDyldMemoryManager *MemMgr;
+  RuntimeDyld Dyld;
+  SmallVector<JITEventListener*, 2> EventListeners;
+
+  // FIXME: Add support for multiple modules
+  bool isCompiled;
+  Module *M;
+  OwningPtr<ObjectImage> LoadedObject;
+
+public:
+  ~MCJIT();
+
+  /// @name ExecutionEngine interface implementation
+  /// @{
+
+  virtual void finalizeObject();
+
+  virtual void *getPointerToBasicBlock(BasicBlock *BB);
+
+  virtual void *getPointerToFunction(Function *F);
+
+  virtual void *recompileAndRelinkFunction(Function *F);
+
+  virtual void freeMachineCodeForFunction(Function *F);
+
+  virtual GenericValue runFunction(Function *F,
+                                   const std::vector<GenericValue> &ArgValues);
+
+  /// getPointerToNamedFunction - This method returns the address of the
+  /// specified function by using the dlsym function call.  As such it is only
+  /// useful for resolving library symbols, not code generated symbols.
+  ///
+  /// If AbortOnFailure is false and no function with the given name is
+  /// found, this function silently returns a null pointer. Otherwise,
+  /// it prints a message to stderr and aborts.
+  ///
+  virtual void *getPointerToNamedFunction(const std::string &Name,
+                                          bool AbortOnFailure = true);
+
+  /// mapSectionAddress - map a section to its target address space value.
+  /// Map the address of a JIT section as returned from the memory manager
+  /// to the address in the target process as the running code will see it.
+  /// This is the address which will be used for relocation resolution.
+  virtual void mapSectionAddress(const void *LocalAddress,
+                                 uint64_t TargetAddress) {
+    Dyld.mapSectionAddress(LocalAddress, TargetAddress);
+  }
+
+  virtual void RegisterJITEventListener(JITEventListener *L);
+  virtual void UnregisterJITEventListener(JITEventListener *L);
+
+  /// @}
+  /// @name (Private) Registration Interfaces
+  /// @{
+
+  static void Register() {
+    MCJITCtor = createJIT;
+  }
+
+  static ExecutionEngine *createJIT(Module *M,
+                                    std::string *ErrorStr,
+                                    JITMemoryManager *JMM,
+                                    bool GVsWithCode,
+                                    TargetMachine *TM);
+
+  // @}
+
+protected:
+  /// emitObject -- Generate a JITed object in memory from the specified module
+  /// Currently, MCJIT only supports a single module and the module passed to
+  /// this function call is expected to be the contained module.  The module
+  /// is passed as a parameter here to prepare for multiple module support in 
+  /// the future.
+  void emitObject(Module *M);
+
+  void NotifyObjectEmitted(const ObjectImage& Obj);
+  void NotifyFreeingObject(const ObjectImage& Obj);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp b/contrib/llvm/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
new file mode 100644
index 000000000000..fa35acd389ae
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
@@ -0,0 +1,226 @@
+//===- SectionMemoryManager.cpp - Memory manager for MCJIT/RtDyld *- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the section-based memory manager used by the MCJIT
+// execution engine and RuntimeDyld
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/MathExtras.h"
+
+#ifdef __linux__
+  // These includes used by SectionMemoryManager::getPointerToNamedFunction()
+  // for Glibc trickery. See comments in this function for more information.
+  #ifdef HAVE_SYS_STAT_H
+    #include <sys/stat.h>
+  #endif
+  #include <fcntl.h>
+  #include <unistd.h>
+#endif
+
+namespace llvm {
+
+uint8_t *SectionMemoryManager::allocateDataSection(uintptr_t Size,
+                                                    unsigned Alignment,
+                                                    unsigned SectionID,
+                                                    bool IsReadOnly) {
+  if (IsReadOnly)
+    return allocateSection(RODataMem, Size, Alignment);
+  return allocateSection(RWDataMem, Size, Alignment);
+}
+
+uint8_t *SectionMemoryManager::allocateCodeSection(uintptr_t Size,
+                                                   unsigned Alignment,
+                                                   unsigned SectionID) {
+  return allocateSection(CodeMem, Size, Alignment);
+}
+
+uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup,
+                                               uintptr_t Size,
+                                               unsigned Alignment) {
+  if (!Alignment)
+    Alignment = 16;
+
+  assert(!(Alignment & (Alignment - 1)) && "Alignment must be a power of two.");
+
+  uintptr_t RequiredSize = Alignment * ((Size + Alignment - 1)/Alignment + 1);
+  uintptr_t Addr = 0;
+
+  // Look in the list of free memory regions and use a block there if one
+  // is available.
+  for (int i = 0, e = MemGroup.FreeMem.size(); i != e; ++i) {
+    sys::MemoryBlock &MB = MemGroup.FreeMem[i];
+    if (MB.size() >= RequiredSize) {
+      Addr = (uintptr_t)MB.base();
+      uintptr_t EndOfBlock = Addr + MB.size();
+      // Align the address.
+      Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1);
+      // Store cutted free memory block.
+      MemGroup.FreeMem[i] = sys::MemoryBlock((void*)(Addr + Size),
+                                             EndOfBlock - Addr - Size);
+      return (uint8_t*)Addr;
+    }
+  }
+
+  // No pre-allocated free block was large enough. Allocate a new memory region.
+  // Note that all sections get allocated as read-write.  The permissions will
+  // be updated later based on memory group.
+  //
+  // FIXME: It would be useful to define a default allocation size (or add
+  // it as a constructor parameter) to minimize the number of allocations.
+  //
+  // FIXME: Initialize the Near member for each memory group to avoid
+  // interleaving.
+  error_code ec;
+  sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(RequiredSize,
+                                                          &MemGroup.Near,
+                                                          sys::Memory::MF_READ |
+                                                            sys::Memory::MF_WRITE,
+                                                          ec);
+  if (ec) {
+    // FIXME: Add error propogation to the interface.
+    return NULL;
+  }
+
+  // Save this address as the basis for our next request
+  MemGroup.Near = MB;
+
+  MemGroup.AllocatedMem.push_back(MB);
+  Addr = (uintptr_t)MB.base();
+  uintptr_t EndOfBlock = Addr + MB.size();
+
+  // Align the address.
+  Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1);
+
+  // The allocateMappedMemory may allocate much more memory than we need. In
+  // this case, we store the unused memory as a free memory block.
+  unsigned FreeSize = EndOfBlock-Addr-Size;
+  if (FreeSize > 16)
+    MemGroup.FreeMem.push_back(sys::MemoryBlock((void*)(Addr + Size), FreeSize));
+
+  // Return aligned address
+  return (uint8_t*)Addr;
+}
+
+bool SectionMemoryManager::applyPermissions(std::string *ErrMsg)
+{
+  // FIXME: Should in-progress permissions be reverted if an error occurs?
+  error_code ec;
+
+  // Make code memory executable.
+  ec = applyMemoryGroupPermissions(CodeMem,
+                                   sys::Memory::MF_READ | sys::Memory::MF_EXEC);
+  if (ec) {
+    if (ErrMsg) {
+      *ErrMsg = ec.message();
+    }
+    return true;
+  }
+
+  // Make read-only data memory read-only.
+  ec = applyMemoryGroupPermissions(RODataMem,
+                                   sys::Memory::MF_READ | sys::Memory::MF_EXEC);
+  if (ec) {
+    if (ErrMsg) {
+      *ErrMsg = ec.message();
+    }
+    return true;
+  }
+
+  // Read-write data memory already has the correct permissions
+
+  return false;
+}
+
+error_code SectionMemoryManager::applyMemoryGroupPermissions(MemoryGroup &MemGroup,
+                                                             unsigned Permissions) {
+
+  for (int i = 0, e = MemGroup.AllocatedMem.size(); i != e; ++i) {
+      error_code ec;
+      ec = sys::Memory::protectMappedMemory(MemGroup.AllocatedMem[i],
+                                            Permissions);
+      if (ec) {
+        return ec;
+      }
+  }
+
+  return error_code::success();
+}
+
+void SectionMemoryManager::invalidateInstructionCache() {
+  for (int i = 0, e = CodeMem.AllocatedMem.size(); i != e; ++i)
+    sys::Memory::InvalidateInstructionCache(CodeMem.AllocatedMem[i].base(),
+                                            CodeMem.AllocatedMem[i].size());
+}
+
+static int jit_noop() {
+  return 0;
+}
+
+void *SectionMemoryManager::getPointerToNamedFunction(const std::string &Name,
+                                                       bool AbortOnFailure) {
+#if defined(__linux__)
+  //===--------------------------------------------------------------------===//
+  // Function stubs that are invoked instead of certain library calls
+  //
+  // Force the following functions to be linked in to anything that uses the
+  // JIT. This is a hack designed to work around the all-too-clever Glibc
+  // strategy of making these functions work differently when inlined vs. when
+  // not inlined, and hiding their real definitions in a separate archive file
+  // that the dynamic linker can't see. For more info, search for
+  // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
+  if (Name == "stat") return (void*)(intptr_t)&stat;
+  if (Name == "fstat") return (void*)(intptr_t)&fstat;
+  if (Name == "lstat") return (void*)(intptr_t)&lstat;
+  if (Name == "stat64") return (void*)(intptr_t)&stat64;
+  if (Name == "fstat64") return (void*)(intptr_t)&fstat64;
+  if (Name == "lstat64") return (void*)(intptr_t)&lstat64;
+  if (Name == "atexit") return (void*)(intptr_t)&atexit;
+  if (Name == "mknod") return (void*)(intptr_t)&mknod;
+#endif // __linux__
+
+  // We should not invoke parent's ctors/dtors from generated main()!
+  // On Mingw and Cygwin, the symbol __main is resolved to
+  // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
+  // (and register wrong callee's dtors with atexit(3)).
+  // We expect ExecutionEngine::runStaticConstructorsDestructors()
+  // is called before ExecutionEngine::runFunctionAsMain() is called.
+  if (Name == "__main") return (void*)(intptr_t)&jit_noop;
+
+  const char *NameStr = Name.c_str();
+  void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
+  if (Ptr) return Ptr;
+
+  // If it wasn't found and if it starts with an underscore ('_') character,
+  // try again without the underscore.
+  if (NameStr[0] == '_') {
+    Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
+    if (Ptr) return Ptr;
+  }
+
+  if (AbortOnFailure)
+    report_fatal_error("Program used external function '" + Name +
+                      "' which could not be resolved!");
+  return 0;
+}
+
+SectionMemoryManager::~SectionMemoryManager() {
+  for (unsigned i = 0, e = CodeMem.AllocatedMem.size(); i != e; ++i)
+    sys::Memory::releaseMappedMemory(CodeMem.AllocatedMem[i]);
+  for (unsigned i = 0, e = RWDataMem.AllocatedMem.size(); i != e; ++i)
+    sys::Memory::releaseMappedMemory(RWDataMem.AllocatedMem[i]);
+  for (unsigned i = 0, e = RODataMem.AllocatedMem.size(); i != e; ++i)
+    sys::Memory::releaseMappedMemory(RODataMem.AllocatedMem[i]);
+}
+
+} // namespace llvm
+
diff --git a/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
new file mode 100644
index 000000000000..38867ecca591
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -0,0 +1,177 @@
+//===-- OProfileJITEventListener.cpp - Tell OProfile about JITted code ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a JITEventListener object that uses OProfileWrapper to tell
+// oprofile about JITted functions, including source line information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
+
+#define DEBUG_TYPE "oprofile-jit-event-listener"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ExecutionEngine/OProfileWrapper.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Errno.h"
+#include "EventListenerCommon.h"
+
+#include <dirent.h>
+#include <fcntl.h>
+
+using namespace llvm;
+using namespace llvm::jitprofiling;
+
+namespace {
+
+class OProfileJITEventListener : public JITEventListener {
+  OProfileWrapper& Wrapper;
+
+  void initialize();
+
+public:
+  OProfileJITEventListener(OProfileWrapper& LibraryWrapper)
+  : Wrapper(LibraryWrapper) {
+    initialize();
+  }
+
+  ~OProfileJITEventListener();
+
+  virtual void NotifyFunctionEmitted(const Function &F,
+                                void *FnStart, size_t FnSize,
+                                const JITEvent_EmittedFunctionDetails &Details);
+
+  virtual void NotifyFreeingMachineCode(void *OldPtr);
+};
+
+void OProfileJITEventListener::initialize() {
+  if (!Wrapper.op_open_agent()) {
+    const std::string err_str = sys::StrError();
+    DEBUG(dbgs() << "Failed to connect to OProfile agent: " << err_str << "\n");
+  } else {
+    DEBUG(dbgs() << "Connected to OProfile agent.\n");
+  }
+}
+
+OProfileJITEventListener::~OProfileJITEventListener() {
+  if (Wrapper.isAgentAvailable()) {
+    if (Wrapper.op_close_agent() == -1) {
+      const std::string err_str = sys::StrError();
+      DEBUG(dbgs() << "Failed to disconnect from OProfile agent: "
+                   << err_str << "\n");
+    } else {
+      DEBUG(dbgs() << "Disconnected from OProfile agent.\n");
+    }
+  }
+}
+
+static debug_line_info LineStartToOProfileFormat(
+    const MachineFunction &MF, FilenameCache &Filenames,
+    uintptr_t Address, DebugLoc Loc) {
+  debug_line_info Result;
+  Result.vma = Address;
+  Result.lineno = Loc.getLine();
+  Result.filename = Filenames.getFilename(
+    Loc.getScope(MF.getFunction()->getContext()));
+  DEBUG(dbgs() << "Mapping " << reinterpret_cast<void*>(Result.vma) << " to "
+               << Result.filename << ":" << Result.lineno << "\n");
+  return Result;
+}
+
+// Adds the just-emitted function to the symbol table.
+void OProfileJITEventListener::NotifyFunctionEmitted(
+    const Function &F, void *FnStart, size_t FnSize,
+    const JITEvent_EmittedFunctionDetails &Details) {
+  assert(F.hasName() && FnStart != 0 && "Bad symbol to add");
+  if (Wrapper.op_write_native_code(F.getName().data(),
+                           reinterpret_cast<uint64_t>(FnStart),
+                           FnStart, FnSize) == -1) {
+    DEBUG(dbgs() << "Failed to tell OProfile about native function "
+          << F.getName() << " at ["
+          << FnStart << "-" << ((char*)FnStart + FnSize) << "]\n");
+    return;
+  }
+
+  if (!Details.LineStarts.empty()) {
+    // Now we convert the line number information from the address/DebugLoc
+    // format in Details to the address/filename/lineno format that OProfile
+    // expects.  Note that OProfile 0.9.4 has a bug that causes it to ignore
+    // line numbers for addresses above 4G.
+    FilenameCache Filenames;
+    std::vector<debug_line_info> LineInfo;
+    LineInfo.reserve(1 + Details.LineStarts.size());
+
+    DebugLoc FirstLoc = Details.LineStarts[0].Loc;
+    assert(!FirstLoc.isUnknown()
+           && "LineStarts should not contain unknown DebugLocs");
+    MDNode *FirstLocScope = FirstLoc.getScope(F.getContext());
+    DISubprogram FunctionDI = getDISubprogram(FirstLocScope);
+    if (FunctionDI.Verify()) {
+      // If we have debug info for the function itself, use that as the line
+      // number of the first several instructions.  Otherwise, after filling
+      // LineInfo, we'll adjust the address of the first line number to point at
+      // the start of the function.
+      debug_line_info line_info;
+      line_info.vma = reinterpret_cast<uintptr_t>(FnStart);
+      line_info.lineno = FunctionDI.getLineNumber();
+      line_info.filename = Filenames.getFilename(FirstLocScope);
+      LineInfo.push_back(line_info);
+    }
+
+    for (std::vector<EmittedFunctionDetails::LineStart>::const_iterator
+           I = Details.LineStarts.begin(), E = Details.LineStarts.end();
+         I != E; ++I) {
+      LineInfo.push_back(LineStartToOProfileFormat(
+                           *Details.MF, Filenames, I->Address, I->Loc));
+    }
+
+    // In case the function didn't have line info of its own, adjust the first
+    // line info's address to include the start of the function.
+    LineInfo[0].vma = reinterpret_cast<uintptr_t>(FnStart);
+
+    if (Wrapper.op_write_debug_line_info(FnStart, LineInfo.size(),
+                                      &*LineInfo.begin()) == -1) {
+      DEBUG(dbgs()
+            << "Failed to tell OProfile about line numbers for native function "
+            << F.getName() << " at ["
+            << FnStart << "-" << ((char*)FnStart + FnSize) << "]\n");
+    }
+  }
+}
+
+// Removes the being-deleted function from the symbol table.
+void OProfileJITEventListener::NotifyFreeingMachineCode(void *FnStart) {
+  assert(FnStart && "Invalid function pointer");
+  if (Wrapper.op_unload_native_code(reinterpret_cast<uint64_t>(FnStart)) == -1) {
+    DEBUG(dbgs()
+          << "Failed to tell OProfile about unload of native function at "
+          << FnStart << "\n");
+  }
+}
+
+}  // anonymous namespace.
+
+namespace llvm {
+JITEventListener *JITEventListener::createOProfileJITEventListener() {
+  static OwningPtr<OProfileWrapper> JITProfilingWrapper(new OProfileWrapper);
+  return new OProfileJITEventListener(*JITProfilingWrapper);
+}
+
+// for testing
+JITEventListener *JITEventListener::createOProfileJITEventListener(
+                                      OProfileWrapper* TestImpl) {
+  return new OProfileJITEventListener(*TestImpl);
+}
+
+} // namespace llvm
+
diff --git a/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
new file mode 100644
index 000000000000..7c0d39518595
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
@@ -0,0 +1,264 @@
+//===-- OProfileWrapper.cpp - OProfile JIT API Wrapper implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interface in OProfileWrapper.h. It is responsible
+// for loading the opagent dynamic library when the first call to an op_
+// function occurs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/OProfileWrapper.h"
+
+#define DEBUG_TYPE "oprofile-wrapper"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/ADT/SmallString.h"
+
+#include <sstream>
+#include <cstring>
+#include <stddef.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+namespace {
+
+// Global mutex to ensure a single thread initializes oprofile agent.
+llvm::sys::Mutex OProfileInitializationMutex;
+
+} // anonymous namespace
+
+namespace llvm {
+
+OProfileWrapper::OProfileWrapper()
+: Agent(0),
+  OpenAgentFunc(0),
+  CloseAgentFunc(0),
+  WriteNativeCodeFunc(0),
+  WriteDebugLineInfoFunc(0),
+  UnloadNativeCodeFunc(0),
+  MajorVersionFunc(0),
+  MinorVersionFunc(0),
+  IsOProfileRunningFunc(0),
+  Initialized(false) {
+}
+
+bool OProfileWrapper::initialize() {
+  using namespace llvm;
+  using namespace llvm::sys;
+
+  MutexGuard Guard(OProfileInitializationMutex);
+
+  if (Initialized)
+    return OpenAgentFunc != 0;
+
+  Initialized = true;
+
+  // If the oprofile daemon is not running, don't load the opagent library
+  if (!isOProfileRunning()) {
+    DEBUG(dbgs() << "OProfile daemon is not detected.\n");
+    return false;
+  }
+
+  std::string error;
+  if(!DynamicLibrary::LoadLibraryPermanently("libopagent.so", &error)) {
+    DEBUG(dbgs()
+            << "OProfile connector library libopagent.so could not be loaded: "
+            << error << "\n");
+  }
+
+  // Get the addresses of the opagent functions
+  OpenAgentFunc = (op_open_agent_ptr_t)(intptr_t)
+          DynamicLibrary::SearchForAddressOfSymbol("op_open_agent");
+  CloseAgentFunc = (op_close_agent_ptr_t)(intptr_t)
+          DynamicLibrary::SearchForAddressOfSymbol("op_close_agent");
+  WriteNativeCodeFunc = (op_write_native_code_ptr_t)(intptr_t)
+          DynamicLibrary::SearchForAddressOfSymbol("op_write_native_code");
+  WriteDebugLineInfoFunc = (op_write_debug_line_info_ptr_t)(intptr_t)
+          DynamicLibrary::SearchForAddressOfSymbol("op_write_debug_line_info");
+  UnloadNativeCodeFunc = (op_unload_native_code_ptr_t)(intptr_t)
+          DynamicLibrary::SearchForAddressOfSymbol("op_unload_native_code");
+  MajorVersionFunc = (op_major_version_ptr_t)(intptr_t)
+          DynamicLibrary::SearchForAddressOfSymbol("op_major_version");
+  MinorVersionFunc = (op_major_version_ptr_t)(intptr_t)
+          DynamicLibrary::SearchForAddressOfSymbol("op_minor_version");
+
+  // With missing functions, we can do nothing
+  if (!OpenAgentFunc
+      || !CloseAgentFunc
+      || !WriteNativeCodeFunc
+      || !WriteDebugLineInfoFunc
+      || !UnloadNativeCodeFunc) {
+    OpenAgentFunc = 0;
+    CloseAgentFunc = 0;
+    WriteNativeCodeFunc = 0;
+    WriteDebugLineInfoFunc = 0;
+    UnloadNativeCodeFunc = 0;
+    return false;
+  }
+
+  return true;
+}
+
+bool OProfileWrapper::isOProfileRunning() {
+  if (IsOProfileRunningFunc != 0)
+    return IsOProfileRunningFunc();
+  return checkForOProfileProcEntry();
+}
+
+bool OProfileWrapper::checkForOProfileProcEntry() {
+  DIR* ProcDir;
+
+  ProcDir = opendir("/proc");
+  if (!ProcDir)
+    return false;
+
+  // Walk the /proc tree looking for the oprofile daemon
+  struct dirent* Entry;
+  while (0 != (Entry = readdir(ProcDir))) {
+    if (Entry->d_type == DT_DIR) {
+      // Build a path from the current entry name
+      SmallString<256> CmdLineFName;
+      raw_svector_ostream(CmdLineFName) << "/proc/" << Entry->d_name
+                                        << "/cmdline";
+
+      // Open the cmdline file
+      int CmdLineFD = open(CmdLineFName.c_str(), S_IRUSR);
+      if (CmdLineFD != -1) {
+        char    ExeName[PATH_MAX+1];
+        char*   BaseName = 0;
+
+        // Read the cmdline file
+        ssize_t NumRead = read(CmdLineFD, ExeName, PATH_MAX+1);
+        close(CmdLineFD);
+        ssize_t Idx = 0;
+
+        // Find the terminator for the first string
+        while (Idx < NumRead-1 && ExeName[Idx] != 0) {
+          Idx++;
+        }
+
+        // Go back to the last non-null character
+        Idx--;
+
+        // Find the last path separator in the first string
+        while (Idx > 0) {
+          if (ExeName[Idx] == '/') {
+            BaseName = ExeName + Idx + 1;
+            break;
+          }
+          Idx--;
+        }
+
+        // Test this to see if it is the oprofile daemon
+        if (BaseName != 0 && !strcmp("oprofiled", BaseName)) {
+          // If it is, we're done
+          closedir(ProcDir);
+          return true;
+        }
+      }
+    }
+  }
+
+  // We've looked through all the files and didn't find the daemon
+  closedir(ProcDir);
+  return false;
+}
+
+bool OProfileWrapper::op_open_agent() {
+  if (!Initialized)
+    initialize();
+
+  if (OpenAgentFunc != 0) {
+    Agent = OpenAgentFunc();
+    return Agent != 0;
+  }
+
+  return false;
+}
+
+int OProfileWrapper::op_close_agent() {
+  if (!Initialized)
+    initialize();
+
+  int ret = -1;
+  if (Agent && CloseAgentFunc) {
+    ret = CloseAgentFunc(Agent);
+    if (ret == 0) {
+      Agent = 0;
+    }
+  }
+  return ret;
+}
+
+bool OProfileWrapper::isAgentAvailable() {
+  return Agent != 0;
+}
+
+int OProfileWrapper::op_write_native_code(const char* Name,
+                                          uint64_t Addr,
+                                          void const* Code,
+                                          const unsigned int Size) {
+  if (!Initialized)
+    initialize();
+
+  if (Agent && WriteNativeCodeFunc)
+    return WriteNativeCodeFunc(Agent, Name, Addr, Code, Size);
+
+  return -1;
+}
+
+int OProfileWrapper::op_write_debug_line_info(
+  void const* Code,
+  size_t NumEntries,
+  struct debug_line_info const* Info) {
+  if (!Initialized)
+    initialize();
+
+  if (Agent && WriteDebugLineInfoFunc)
+    return WriteDebugLineInfoFunc(Agent, Code, NumEntries, Info);
+
+  return -1;
+}
+
+int OProfileWrapper::op_major_version() {
+  if (!Initialized)
+    initialize();
+
+  if (Agent && MajorVersionFunc)
+    return MajorVersionFunc();
+
+  return -1;
+}
+
+int OProfileWrapper::op_minor_version() {
+  if (!Initialized)
+    initialize();
+
+  if (Agent && MinorVersionFunc)
+    return MinorVersionFunc();
+
+  return -1;
+}
+
+int  OProfileWrapper::op_unload_native_code(uint64_t Addr) {
+  if (!Initialized)
+    initialize();
+
+  if (Agent && UnloadNativeCodeFunc)
+    return UnloadNativeCodeFunc(Agent, Addr);
+
+  return -1;
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp
new file mode 100644
index 000000000000..603c526d06e3
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp
@@ -0,0 +1,214 @@
+//===-- GDBRegistrar.cpp - Registers objects with GDB ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "JITRegistrar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/MutexGuard.h"
+
+using namespace llvm;
+
+// This must be kept in sync with gdb/gdb/jit.h .
+extern "C" {
+
+  typedef enum {
+    JIT_NOACTION = 0,
+    JIT_REGISTER_FN,
+    JIT_UNREGISTER_FN
+  } jit_actions_t;
+
+  struct jit_code_entry {
+    struct jit_code_entry *next_entry;
+    struct jit_code_entry *prev_entry;
+    const char *symfile_addr;
+    uint64_t symfile_size;
+  };
+
+  struct jit_descriptor {
+    uint32_t version;
+    // This should be jit_actions_t, but we want to be specific about the
+    // bit-width.
+    uint32_t action_flag;
+    struct jit_code_entry *relevant_entry;
+    struct jit_code_entry *first_entry;
+  };
+
+  // We put information about the JITed function in this global, which the
+  // debugger reads.  Make sure to specify the version statically, because the
+  // debugger checks the version before we can set it during runtime.
+  struct jit_descriptor __jit_debug_descriptor = { 1, 0, 0, 0 };
+
+  // Debuggers puts a breakpoint in this function.
+  LLVM_ATTRIBUTE_NOINLINE void __jit_debug_register_code() { }
+
+}
+
+namespace {
+
+// Buffer for an in-memory object file in executable memory
+typedef llvm::DenseMap< const char*,
+                        std::pair<std::size_t, jit_code_entry*> >
+  RegisteredObjectBufferMap;
+
+/// Global access point for the JIT debugging interface designed for use with a
+/// singleton toolbox. Handles thread-safe registration and deregistration of
+/// object files that are in executable memory managed by the client of this
+/// class.
+class GDBJITRegistrar : public JITRegistrar {
+  /// A map of in-memory object files that have been registered with the
+  /// JIT interface.
+  RegisteredObjectBufferMap ObjectBufferMap;
+
+public:
+  /// Instantiates the JIT service.
+  GDBJITRegistrar() : ObjectBufferMap() {}
+
+  /// Unregisters each object that was previously registered and releases all
+  /// internal resources.
+  virtual ~GDBJITRegistrar();
+
+  /// Creates an entry in the JIT registry for the buffer @p Object,
+  /// which must contain an object file in executable memory with any
+  /// debug information for the debugger.
+  void registerObject(const ObjectBuffer &Object);
+
+  /// Removes the internal registration of @p Object, and
+  /// frees associated resources.
+  /// Returns true if @p Object was found in ObjectBufferMap.
+  bool deregisterObject(const ObjectBuffer &Object);
+
+private:
+  /// Deregister the debug info for the given object file from the debugger
+  /// and delete any temporary copies.  This private method does not remove
+  /// the function from Map so that it can be called while iterating over Map.
+  void deregisterObjectInternal(RegisteredObjectBufferMap::iterator I);
+};
+
+/// Lock used to serialize all jit registration events, since they
+/// modify global variables.
+llvm::sys::Mutex JITDebugLock;
+
+/// Acquire the lock and do the registration.
+void NotifyDebugger(jit_code_entry* JITCodeEntry) {
+  llvm::MutexGuard locked(JITDebugLock);
+  __jit_debug_descriptor.action_flag = JIT_REGISTER_FN;
+
+  // Insert this entry at the head of the list.
+  JITCodeEntry->prev_entry = NULL;
+  jit_code_entry* NextEntry = __jit_debug_descriptor.first_entry;
+  JITCodeEntry->next_entry = NextEntry;
+  if (NextEntry != NULL) {
+    NextEntry->prev_entry = JITCodeEntry;
+  }
+  __jit_debug_descriptor.first_entry = JITCodeEntry;
+  __jit_debug_descriptor.relevant_entry = JITCodeEntry;
+  __jit_debug_register_code();
+}
+
+GDBJITRegistrar::~GDBJITRegistrar() {
+  // Free all registered object files.
+ for (RegisteredObjectBufferMap::iterator I = ObjectBufferMap.begin(), E = ObjectBufferMap.end();
+       I != E; ++I) {
+    // Call the private method that doesn't update the map so our iterator
+    // doesn't break.
+    deregisterObjectInternal(I);
+  }
+  ObjectBufferMap.clear();
+}
+
+void GDBJITRegistrar::registerObject(const ObjectBuffer &Object) {
+
+  const char *Buffer = Object.getBufferStart();
+  size_t      Size = Object.getBufferSize();
+
+  assert(Buffer && "Attempt to register a null object with a debugger.");
+  assert(ObjectBufferMap.find(Buffer) == ObjectBufferMap.end() &&
+         "Second attempt to perform debug registration.");
+  jit_code_entry* JITCodeEntry = new jit_code_entry();
+
+  if (JITCodeEntry == 0) {
+    llvm::report_fatal_error(
+      "Allocation failed when registering a JIT entry!\n");
+  }
+  else {
+    JITCodeEntry->symfile_addr = Buffer;
+    JITCodeEntry->symfile_size = Size;
+
+    ObjectBufferMap[Buffer] = std::make_pair(Size, JITCodeEntry);
+    NotifyDebugger(JITCodeEntry);
+  }
+}
+
+bool GDBJITRegistrar::deregisterObject(const ObjectBuffer& Object) {
+  const char *Buffer = Object.getBufferStart();
+  RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(Buffer);
+
+  if (I != ObjectBufferMap.end()) {
+    deregisterObjectInternal(I);
+    ObjectBufferMap.erase(I);
+    return true;
+  }
+  return false;
+}
+
+void GDBJITRegistrar::deregisterObjectInternal(
+    RegisteredObjectBufferMap::iterator I) {
+
+  jit_code_entry*& JITCodeEntry = I->second.second;
+
+  // Acquire the lock and do the unregistration.
+  {
+    llvm::MutexGuard locked(JITDebugLock);
+    __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN;
+
+    // Remove the jit_code_entry from the linked list.
+    jit_code_entry* PrevEntry = JITCodeEntry->prev_entry;
+    jit_code_entry* NextEntry = JITCodeEntry->next_entry;
+
+    if (NextEntry) {
+      NextEntry->prev_entry = PrevEntry;
+    }
+    if (PrevEntry) {
+      PrevEntry->next_entry = NextEntry;
+    }
+    else {
+      assert(__jit_debug_descriptor.first_entry == JITCodeEntry);
+      __jit_debug_descriptor.first_entry = NextEntry;
+    }
+
+    // Tell the debugger which entry we removed, and unregister the code.
+    __jit_debug_descriptor.relevant_entry = JITCodeEntry;
+    __jit_debug_register_code();
+  }
+
+  delete JITCodeEntry;
+  JITCodeEntry = NULL;
+}
+
+} // end namespace
+
+namespace llvm {
+
+JITRegistrar& JITRegistrar::getGDBRegistrar() {
+  static GDBJITRegistrar* sRegistrar = NULL;
+  if (sRegistrar == NULL) {
+    // The mutex is here so that it won't slow down access once the registrar
+    // is instantiated
+    llvm::MutexGuard locked(JITDebugLock);
+    // Check again to be sure another thread didn't create this while we waited
+    if (sRegistrar == NULL) {
+      sRegistrar = new GDBJITRegistrar;
+    }
+  }
+  return *sRegistrar;
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
new file mode 100644
index 000000000000..69e9dbe490d6
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
@@ -0,0 +1,43 @@
+//===-- JITRegistrar.h - Registers objects with a debugger ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
+#define LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
+
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
+
+namespace llvm {
+
+/// Global access point for the JIT debugging interface.
+class JITRegistrar {
+public:
+  /// Instantiates the JIT service.
+  JITRegistrar() {}
+
+  /// Unregisters each object that was previously registered and releases all
+  /// internal resources.
+  virtual ~JITRegistrar() {}
+
+  /// Creates an entry in the JIT registry for the buffer @p Object,
+  /// which must contain an object file in executable memory with any
+  /// debug information for the debugger.
+  virtual void registerObject(const ObjectBuffer &Object) = 0;
+
+  /// Removes the internal registration of @p Object, and
+  /// frees associated resources.
+  /// Returns true if @p Object was previously registered.
+  virtual bool deregisterObject(const ObjectBuffer &Object) = 0;
+
+  /// Returns a reference to a GDB JIT registrar singleton
+  static JITRegistrar& getGDBRegistrar();
+};
+
+} // end namespace llvm
+
+#endif // LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
new file mode 100644
index 000000000000..89350cc5b621
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
@@ -0,0 +1,78 @@
+//===-- ObjectImageCommon.h - Format independent executuable object image -===//
+//
+//		       The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a file format independent ObjectImage class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
+#define LLVM_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
+
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/Object/ObjectFile.h"
+
+namespace llvm {
+
+class ObjectImageCommon : public ObjectImage {
+  ObjectImageCommon(); // = delete
+  ObjectImageCommon(const ObjectImageCommon &other); // = delete
+
+protected:
+  object::ObjectFile *ObjFile;
+
+  // This form of the constructor allows subclasses to use
+  // format-specific subclasses of ObjectFile directly
+  ObjectImageCommon(ObjectBuffer *Input, object::ObjectFile *Obj)
+  : ObjectImage(Input), // saves Input as Buffer and takes ownership
+    ObjFile(Obj)
+  {
+  }
+
+public:
+  ObjectImageCommon(ObjectBuffer* Input)
+  : ObjectImage(Input) // saves Input as Buffer and takes ownership
+  {
+    ObjFile = object::ObjectFile::createObjectFile(Buffer->getMemBuffer());
+  }
+  virtual ~ObjectImageCommon() { delete ObjFile; }
+
+  virtual object::symbol_iterator begin_symbols() const
+	      { return ObjFile->begin_symbols(); }
+  virtual object::symbol_iterator end_symbols() const
+	      { return ObjFile->end_symbols(); }
+
+  virtual object::section_iterator begin_sections() const
+	      { return ObjFile->begin_sections(); }
+  virtual object::section_iterator end_sections() const
+	      { return ObjFile->end_sections(); }
+
+  virtual /* Triple::ArchType */ unsigned getArch() const
+	      { return ObjFile->getArch(); }
+
+  virtual StringRef getData() const { return ObjFile->getData(); }
+
+  virtual object::ObjectFile* getObjectFile() const { return ObjFile; }
+
+  // Subclasses can override these methods to update the image with loaded
+  // addresses for sections and common symbols
+  virtual void updateSectionAddress(const object::SectionRef &Sec,
+				    uint64_t Addr) {}
+  virtual void updateSymbolAddress(const object::SymbolRef &Sym, uint64_t Addr)
+	      {}
+
+  // Subclasses can override these methods to provide JIT debugging support
+  virtual void registerWithDebugger() {}
+  virtual void deregisterWithDebugger() {}
+};
+
+} // end namespace llvm
+
+#endif // LLVM_RUNTIMEDYLD_OBJECT_IMAGE_H
+
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
new file mode 100644
index 000000000000..409b25fef3af
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -0,0 +1,537 @@
+//===-- RuntimeDyld.cpp - Run-time dynamic linker for MC-JIT ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the MC-JIT runtime dynamic linker.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dyld"
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "ObjectImageCommon.h"
+#include "RuntimeDyldELF.h"
+#include "RuntimeDyldImpl.h"
+#include "RuntimeDyldMachO.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+// Empty out-of-line virtual destructor as the key function.
+RTDyldMemoryManager::~RTDyldMemoryManager() {}
+RuntimeDyldImpl::~RuntimeDyldImpl() {}
+
+namespace llvm {
+
+// Resolve the relocations for all symbols we currently know about.
+void RuntimeDyldImpl::resolveRelocations() {
+  // First, resolve relocations associated with external symbols.
+  resolveExternalSymbols();
+
+  // Just iterate over the sections we have and resolve all the relocations
+  // in them. Gross overkill, but it gets the job done.
+  for (int i = 0, e = Sections.size(); i != e; ++i) {
+    uint64_t Addr = Sections[i].LoadAddress;
+    DEBUG(dbgs() << "Resolving relocations Section #" << i
+            << "\t" << format("%p", (uint8_t *)Addr)
+            << "\n");
+    resolveRelocationList(Relocations[i], Addr);
+  }
+}
+
+void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
+                                        uint64_t TargetAddress) {
+  for (unsigned i = 0, e = Sections.size(); i != e; ++i) {
+    if (Sections[i].Address == LocalAddress) {
+      reassignSectionAddress(i, TargetAddress);
+      return;
+    }
+  }
+  llvm_unreachable("Attempting to remap address of unknown section!");
+}
+
+// Subclasses can implement this method to create specialized image instances.
+// The caller owns the pointer that is returned.
+ObjectImage *RuntimeDyldImpl::createObjectImage(ObjectBuffer *InputBuffer) {
+  return new ObjectImageCommon(InputBuffer);
+}
+
+ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
+  OwningPtr<ObjectImage> obj(createObjectImage(InputBuffer));
+  if (!obj)
+    report_fatal_error("Unable to create object image from memory buffer!");
+
+  Arch = (Triple::ArchType)obj->getArch();
+
+  // Symbols found in this object
+  StringMap<SymbolLoc> LocalSymbols;
+  // Used sections from the object file
+  ObjSectionToIDMap LocalSections;
+
+  // Common symbols requiring allocation, with their sizes and alignments
+  CommonSymbolMap CommonSymbols;
+  // Maximum required total memory to allocate all common symbols
+  uint64_t CommonSize = 0;
+
+  error_code err;
+  // Parse symbols
+  DEBUG(dbgs() << "Parse symbols:\n");
+  for (symbol_iterator i = obj->begin_symbols(), e = obj->end_symbols();
+       i != e; i.increment(err)) {
+    Check(err);
+    object::SymbolRef::Type SymType;
+    StringRef Name;
+    Check(i->getType(SymType));
+    Check(i->getName(Name));
+
+    uint32_t flags;
+    Check(i->getFlags(flags));
+
+    bool isCommon = flags & SymbolRef::SF_Common;
+    if (isCommon) {
+      // Add the common symbols to a list.  We'll allocate them all below.
+      uint64_t Align = getCommonSymbolAlignment(*i);
+      uint64_t Size = 0;
+      Check(i->getSize(Size));
+      CommonSize += Size + Align;
+      CommonSymbols[*i] = CommonSymbolInfo(Size, Align);
+    } else {
+      if (SymType == object::SymbolRef::ST_Function ||
+          SymType == object::SymbolRef::ST_Data ||
+          SymType == object::SymbolRef::ST_Unknown) {
+        uint64_t FileOffset;
+        StringRef SectionData;
+        bool IsCode;
+        section_iterator si = obj->end_sections();
+        Check(i->getFileOffset(FileOffset));
+        Check(i->getSection(si));
+        if (si == obj->end_sections()) continue;
+        Check(si->getContents(SectionData));
+        Check(si->isText(IsCode));
+        const uint8_t* SymPtr = (const uint8_t*)InputBuffer->getBufferStart() +
+                                (uintptr_t)FileOffset;
+        uintptr_t SectOffset = (uintptr_t)(SymPtr -
+                                           (const uint8_t*)SectionData.begin());
+        unsigned SectionID = findOrEmitSection(*obj, *si, IsCode, LocalSections);
+        LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset);
+        DEBUG(dbgs() << "\tFileOffset: " << format("%p", (uintptr_t)FileOffset)
+                     << " flags: " << flags
+                     << " SID: " << SectionID
+                     << " Offset: " << format("%p", SectOffset));
+        GlobalSymbolTable[Name] = SymbolLoc(SectionID, SectOffset);
+      }
+    }
+    DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name << "\n");
+  }
+
+  // Allocate common symbols
+  if (CommonSize != 0)
+    emitCommonSymbols(*obj, CommonSymbols, CommonSize, LocalSymbols);
+
+  // Parse and process relocations
+  DEBUG(dbgs() << "Parse relocations:\n");
+  for (section_iterator si = obj->begin_sections(),
+       se = obj->end_sections(); si != se; si.increment(err)) {
+    Check(err);
+    bool isFirstRelocation = true;
+    unsigned SectionID = 0;
+    StubMap Stubs;
+
+    for (relocation_iterator i = si->begin_relocations(),
+         e = si->end_relocations(); i != e; i.increment(err)) {
+      Check(err);
+
+      // If it's the first relocation in this section, find its SectionID
+      if (isFirstRelocation) {
+        SectionID = findOrEmitSection(*obj, *si, true, LocalSections);
+        DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n");
+        isFirstRelocation = false;
+      }
+
+      ObjRelocationInfo RI;
+      RI.SectionID = SectionID;
+      Check(i->getAdditionalInfo(RI.AdditionalInfo));
+      Check(i->getOffset(RI.Offset));
+      Check(i->getSymbol(RI.Symbol));
+      Check(i->getType(RI.Type));
+
+      DEBUG(dbgs() << "\t\tAddend: " << RI.AdditionalInfo
+                   << " Offset: " << format("%p", (uintptr_t)RI.Offset)
+                   << " Type: " << (uint32_t)(RI.Type & 0xffffffffL)
+                   << "\n");
+      processRelocationRef(RI, *obj, LocalSections, LocalSymbols, Stubs);
+    }
+  }
+
+  return obj.take();
+}
+
+void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
+                                        const CommonSymbolMap &CommonSymbols,
+                                        uint64_t TotalSize,
+                                        SymbolTableMap &SymbolTable) {
+  // Allocate memory for the section
+  unsigned SectionID = Sections.size();
+  uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, sizeof(void*),
+                                              SectionID, false);
+  if (!Addr)
+    report_fatal_error("Unable to allocate memory for common symbols!");
+  uint64_t Offset = 0;
+  Sections.push_back(SectionEntry(StringRef(), Addr, TotalSize, TotalSize, 0));
+  memset(Addr, 0, TotalSize);
+
+  DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID
+               << " new addr: " << format("%p", Addr)
+               << " DataSize: " << TotalSize
+               << "\n");
+
+  // Assign the address of each symbol
+  for (CommonSymbolMap::const_iterator it = CommonSymbols.begin(),
+       itEnd = CommonSymbols.end(); it != itEnd; it++) {
+    uint64_t Size = it->second.first;
+    uint64_t Align = it->second.second;
+    StringRef Name;
+    it->first.getName(Name);
+    if (Align) {
+      // This symbol has an alignment requirement.
+      uint64_t AlignOffset = OffsetToAlignment((uint64_t)Addr, Align);
+      Addr += AlignOffset;
+      Offset += AlignOffset;
+      DEBUG(dbgs() << "Allocating common symbol " << Name << " address " <<
+                      format("%p\n", Addr));
+    }
+    Obj.updateSymbolAddress(it->first, (uint64_t)Addr);
+    SymbolTable[Name.data()] = SymbolLoc(SectionID, Offset);
+    Offset += Size;
+    Addr += Size;
+  }
+}
+
+unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
+                                      const SectionRef &Section,
+                                      bool IsCode) {
+
+  unsigned StubBufSize = 0,
+           StubSize = getMaxStubSize();
+  error_code err;
+  if (StubSize > 0) {
+    for (relocation_iterator i = Section.begin_relocations(),
+         e = Section.end_relocations(); i != e; i.increment(err), Check(err))
+      StubBufSize += StubSize;
+  }
+  StringRef data;
+  uint64_t Alignment64;
+  Check(Section.getContents(data));
+  Check(Section.getAlignment(Alignment64));
+
+  unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
+  bool IsRequired;
+  bool IsVirtual;
+  bool IsZeroInit;
+  bool IsReadOnly;
+  uint64_t DataSize;
+  StringRef Name;
+  Check(Section.isRequiredForExecution(IsRequired));
+  Check(Section.isVirtual(IsVirtual));
+  Check(Section.isZeroInit(IsZeroInit));
+  Check(Section.isReadOnlyData(IsReadOnly));
+  Check(Section.getSize(DataSize));
+  Check(Section.getName(Name));
+
+  unsigned Allocate;
+  unsigned SectionID = Sections.size();
+  uint8_t *Addr;
+  const char *pData = 0;
+
+  // Some sections, such as debug info, don't need to be loaded for execution.
+  // Leave those where they are.
+  if (IsRequired) {
+    Allocate = DataSize + StubBufSize;
+    Addr = IsCode
+      ? MemMgr->allocateCodeSection(Allocate, Alignment, SectionID)
+      : MemMgr->allocateDataSection(Allocate, Alignment, SectionID, IsReadOnly);
+    if (!Addr)
+      report_fatal_error("Unable to allocate section memory!");
+
+    // Virtual sections have no data in the object image, so leave pData = 0
+    if (!IsVirtual)
+      pData = data.data();
+
+    // Zero-initialize or copy the data from the image
+    if (IsZeroInit || IsVirtual)
+      memset(Addr, 0, DataSize);
+    else
+      memcpy(Addr, pData, DataSize);
+
+    DEBUG(dbgs() << "emitSection SectionID: " << SectionID
+                 << " Name: " << Name
+                 << " obj addr: " << format("%p", pData)
+                 << " new addr: " << format("%p", Addr)
+                 << " DataSize: " << DataSize
+                 << " StubBufSize: " << StubBufSize
+                 << " Allocate: " << Allocate
+                 << "\n");
+    Obj.updateSectionAddress(Section, (uint64_t)Addr);
+  }
+  else {
+    // Even if we didn't load the section, we need to record an entry for it
+    // to handle later processing (and by 'handle' I mean don't do anything
+    // with these sections).
+    Allocate = 0;
+    Addr = 0;
+    DEBUG(dbgs() << "emitSection SectionID: " << SectionID
+                 << " Name: " << Name
+                 << " obj addr: " << format("%p", data.data())
+                 << " new addr: 0"
+                 << " DataSize: " << DataSize
+                 << " StubBufSize: " << StubBufSize
+                 << " Allocate: " << Allocate
+                 << "\n");
+  }
+
+  Sections.push_back(SectionEntry(Name, Addr, Allocate, DataSize,
+				  (uintptr_t)pData));
+  return SectionID;
+}
+
+unsigned RuntimeDyldImpl::findOrEmitSection(ObjectImage &Obj,
+                                            const SectionRef &Section,
+                                            bool IsCode,
+                                            ObjSectionToIDMap &LocalSections) {
+
+  unsigned SectionID = 0;
+  ObjSectionToIDMap::iterator i = LocalSections.find(Section);
+  if (i != LocalSections.end())
+    SectionID = i->second;
+  else {
+    SectionID = emitSection(Obj, Section, IsCode);
+    LocalSections[Section] = SectionID;
+  }
+  return SectionID;
+}
+
+void RuntimeDyldImpl::addRelocationForSection(const RelocationEntry &RE,
+                                              unsigned SectionID) {
+  Relocations[SectionID].push_back(RE);
+}
+
+void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE,
+                                             StringRef SymbolName) {
+  // Relocation by symbol.  If the symbol is found in the global symbol table,
+  // create an appropriate section relocation.  Otherwise, add it to
+  // ExternalSymbolRelocations.
+  SymbolTableMap::const_iterator Loc =
+      GlobalSymbolTable.find(SymbolName);
+  if (Loc == GlobalSymbolTable.end()) {
+    ExternalSymbolRelocations[SymbolName].push_back(RE);
+  } else {
+    // Copy the RE since we want to modify its addend.
+    RelocationEntry RECopy = RE;
+    RECopy.Addend += Loc->second.second;
+    Relocations[Loc->second.first].push_back(RECopy);
+  }
+}
+
+uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
+  if (Arch == Triple::arm) {
+    // TODO: There is only ARM far stub now. We should add the Thumb stub,
+    // and stubs for branches Thumb - ARM and ARM - Thumb.
+    uint32_t *StubAddr = (uint32_t*)Addr;
+    *StubAddr = 0xe51ff004; // ldr pc,<label>
+    return (uint8_t*)++StubAddr;
+  } else if (Arch == Triple::mipsel || Arch == Triple::mips) {
+    uint32_t *StubAddr = (uint32_t*)Addr;
+    // 0:   3c190000        lui     t9,%hi(addr).
+    // 4:   27390000        addiu   t9,t9,%lo(addr).
+    // 8:   03200008        jr      t9.
+    // c:   00000000        nop.
+    const unsigned LuiT9Instr = 0x3c190000, AdduiT9Instr = 0x27390000;
+    const unsigned JrT9Instr = 0x03200008, NopInstr = 0x0;
+
+    *StubAddr = LuiT9Instr;
+    StubAddr++;
+    *StubAddr = AdduiT9Instr;
+    StubAddr++;
+    *StubAddr = JrT9Instr;
+    StubAddr++;
+    *StubAddr = NopInstr;
+    return Addr;
+  } else if (Arch == Triple::ppc64) {
+    // PowerPC64 stub: the address points to a function descriptor
+    // instead of the function itself. Load the function address
+    // on r11 and sets it to control register. Also loads the function
+    // TOC in r2 and environment pointer to r11.
+    writeInt32BE(Addr,    0x3D800000); // lis   r12, highest(addr)
+    writeInt32BE(Addr+4,  0x618C0000); // ori   r12, higher(addr)
+    writeInt32BE(Addr+8,  0x798C07C6); // sldi  r12, r12, 32
+    writeInt32BE(Addr+12, 0x658C0000); // oris  r12, r12, h(addr)
+    writeInt32BE(Addr+16, 0x618C0000); // ori   r12, r12, l(addr)
+    writeInt32BE(Addr+20, 0xF8410028); // std   r2,  40(r1)
+    writeInt32BE(Addr+24, 0xE96C0000); // ld    r11, 0(r12)
+    writeInt32BE(Addr+28, 0xE84C0008); // ld    r2,  0(r12)
+    writeInt32BE(Addr+32, 0x7D6903A6); // mtctr r11
+    writeInt32BE(Addr+36, 0xE96C0010); // ld    r11, 16(r2)
+    writeInt32BE(Addr+40, 0x4E800420); // bctr
+
+    return Addr;
+  }
+  return Addr;
+}
+
+// Assign an address to a symbol name and resolve all the relocations
+// associated with it.
+void RuntimeDyldImpl::reassignSectionAddress(unsigned SectionID,
+                                             uint64_t Addr) {
+  // The address to use for relocation resolution is not
+  // the address of the local section buffer. We must be doing
+  // a remote execution environment of some sort. Relocations can't
+  // be applied until all the sections have been moved.  The client must
+  // trigger this with a call to MCJIT::finalize() or
+  // RuntimeDyld::resolveRelocations().
+  //
+  // Addr is a uint64_t because we can't assume the pointer width
+  // of the target is the same as that of the host. Just use a generic
+  // "big enough" type.
+  Sections[SectionID].LoadAddress = Addr;
+}
+
+void RuntimeDyldImpl::resolveRelocationEntry(const RelocationEntry &RE,
+                                             uint64_t Value) {
+  // Ignore relocations for sections that were not loaded
+  if (Sections[RE.SectionID].Address != 0) {
+    DEBUG(dbgs() << "\tSectionID: " << RE.SectionID
+          << " + " << RE.Offset << " ("
+          << format("%p", Sections[RE.SectionID].Address + RE.Offset) << ")"
+          << " RelType: " << RE.RelType
+          << " Addend: " << RE.Addend
+          << "\n");
+
+    resolveRelocation(Sections[RE.SectionID], RE.Offset,
+                      Value, RE.RelType, RE.Addend);
+  }
+}
+
+void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs,
+                                            uint64_t Value) {
+  for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
+    resolveRelocationEntry(Relocs[i], Value);
+  }
+}
+
+void RuntimeDyldImpl::resolveExternalSymbols() {
+  StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin(),
+                                      e = ExternalSymbolRelocations.end();
+  for (; i != e; i++) {
+    StringRef Name = i->first();
+    RelocationList &Relocs = i->second;
+    SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name);
+    if (Loc == GlobalSymbolTable.end()) {
+      if (Name.size() == 0) {
+        // This is an absolute symbol, use an address of zero.
+        DEBUG(dbgs() << "Resolving absolute relocations." << "\n");
+        resolveRelocationList(Relocs, 0);
+      } else {
+        // This is an external symbol, try to get its address from
+        // MemoryManager.
+        uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(),
+                                                                   true);
+        DEBUG(dbgs() << "Resolving relocations Name: " << Name
+                << "\t" << format("%p", Addr)
+                << "\n");
+        resolveRelocationList(Relocs, (uintptr_t)Addr);
+      }
+    } else {
+      report_fatal_error("Expected external symbol");
+    }
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// RuntimeDyld class implementation
+RuntimeDyld::RuntimeDyld(RTDyldMemoryManager *mm) {
+  // FIXME: There's a potential issue lurking here if a single instance of
+  // RuntimeDyld is used to load multiple objects.  The current implementation
+  // associates a single memory manager with a RuntimeDyld instance.  Even
+  // though the public class spawns a new 'impl' instance for each load,
+  // they share a single memory manager.  This can become a problem when page
+  // permissions are applied.
+  Dyld = 0;
+  MM = mm;
+}
+
+RuntimeDyld::~RuntimeDyld() {
+  delete Dyld;
+}
+
+ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) {
+  if (!Dyld) {
+    sys::LLVMFileType type = sys::IdentifyFileType(
+            InputBuffer->getBufferStart(),
+            static_cast<unsigned>(InputBuffer->getBufferSize()));
+    switch (type) {
+      case sys::ELF_Relocatable_FileType:
+      case sys::ELF_Executable_FileType:
+      case sys::ELF_SharedObject_FileType:
+      case sys::ELF_Core_FileType:
+        Dyld = new RuntimeDyldELF(MM);
+        break;
+      case sys::Mach_O_Object_FileType:
+      case sys::Mach_O_Executable_FileType:
+      case sys::Mach_O_FixedVirtualMemorySharedLib_FileType:
+      case sys::Mach_O_Core_FileType:
+      case sys::Mach_O_PreloadExecutable_FileType:
+      case sys::Mach_O_DynamicallyLinkedSharedLib_FileType:
+      case sys::Mach_O_DynamicLinker_FileType:
+      case sys::Mach_O_Bundle_FileType:
+      case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType:
+      case sys::Mach_O_DSYMCompanion_FileType:
+        Dyld = new RuntimeDyldMachO(MM);
+        break;
+      case sys::Unknown_FileType:
+      case sys::Bitcode_FileType:
+      case sys::Archive_FileType:
+      case sys::COFF_FileType:
+        report_fatal_error("Incompatible object format!");
+    }
+  } else {
+    if (!Dyld->isCompatibleFormat(InputBuffer))
+      report_fatal_error("Incompatible object format!");
+  }
+
+  return Dyld->loadObject(InputBuffer);
+}
+
+void *RuntimeDyld::getSymbolAddress(StringRef Name) {
+  return Dyld->getSymbolAddress(Name);
+}
+
+uint64_t RuntimeDyld::getSymbolLoadAddress(StringRef Name) {
+  return Dyld->getSymbolLoadAddress(Name);
+}
+
+void RuntimeDyld::resolveRelocations() {
+  Dyld->resolveRelocations();
+}
+
+void RuntimeDyld::reassignSectionAddress(unsigned SectionID,
+                                         uint64_t Addr) {
+  Dyld->reassignSectionAddress(SectionID, Addr);
+}
+
+void RuntimeDyld::mapSectionAddress(const void *LocalAddress,
+                                    uint64_t TargetAddress) {
+  Dyld->mapSectionAddress(LocalAddress, TargetAddress);
+}
+
+StringRef RuntimeDyld::getErrorString() {
+  return Dyld->getErrorString();
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
new file mode 100644
index 000000000000..b8537b1f2f9c
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -0,0 +1,851 @@
+//===-- RuntimeDyldELF.cpp - Run-time dynamic linker for MC-JIT -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of ELF support for the MC-JIT runtime dynamic linker.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dyld"
+#include "RuntimeDyldELF.h"
+#include "JITRegistrar.h"
+#include "ObjectImageCommon.h"
+#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/ELF.h"
+using namespace llvm;
+using namespace llvm::object;
+
+namespace {
+
+static inline
+error_code check(error_code Err) {
+  if (Err) {
+    report_fatal_error(Err.message());
+  }
+  return Err;
+}
+
+template<class ELFT>
+class DyldELFObject
+  : public ELFObjectFile<ELFT> {
+  LLVM_ELF_IMPORT_TYPES(ELFT)
+
+  typedef Elf_Shdr_Impl<ELFT> Elf_Shdr;
+  typedef Elf_Sym_Impl<ELFT> Elf_Sym;
+  typedef
+    Elf_Rel_Impl<ELFT, false> Elf_Rel;
+  typedef
+    Elf_Rel_Impl<ELFT, true> Elf_Rela;
+
+  typedef Elf_Ehdr_Impl<ELFT> Elf_Ehdr;
+
+  typedef typename ELFDataTypeTypedefHelper<
+          ELFT>::value_type addr_type;
+
+public:
+  DyldELFObject(MemoryBuffer *Wrapper, error_code &ec);
+
+  void updateSectionAddress(const SectionRef &Sec, uint64_t Addr);
+  void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr);
+
+  // Methods for type inquiry through isa, cast and dyn_cast
+  static inline bool classof(const Binary *v) {
+    return (isa<ELFObjectFile<ELFT> >(v)
+            && classof(cast<ELFObjectFile
+                <ELFT> >(v)));
+  }
+  static inline bool classof(
+      const ELFObjectFile<ELFT> *v) {
+    return v->isDyldType();
+  }
+};
+
+template<class ELFT>
+class ELFObjectImage : public ObjectImageCommon {
+  protected:
+    DyldELFObject<ELFT> *DyldObj;
+    bool Registered;
+
+  public:
+    ELFObjectImage(ObjectBuffer *Input,
+                 DyldELFObject<ELFT> *Obj)
+    : ObjectImageCommon(Input, Obj),
+      DyldObj(Obj),
+      Registered(false) {}
+
+    virtual ~ELFObjectImage() {
+      if (Registered)
+        deregisterWithDebugger();
+    }
+
+    // Subclasses can override these methods to update the image with loaded
+    // addresses for sections and common symbols
+    virtual void updateSectionAddress(const SectionRef &Sec, uint64_t Addr)
+    {
+      DyldObj->updateSectionAddress(Sec, Addr);
+    }
+
+    virtual void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr)
+    {
+      DyldObj->updateSymbolAddress(Sym, Addr);
+    }
+
+    virtual void registerWithDebugger()
+    {
+      JITRegistrar::getGDBRegistrar().registerObject(*Buffer);
+      Registered = true;
+    }
+    virtual void deregisterWithDebugger()
+    {
+      JITRegistrar::getGDBRegistrar().deregisterObject(*Buffer);
+    }
+};
+
+// The MemoryBuffer passed into this constructor is just a wrapper around the
+// actual memory.  Ultimately, the Binary parent class will take ownership of
+// this MemoryBuffer object but not the underlying memory.
+template<class ELFT>
+DyldELFObject<ELFT>::DyldELFObject(MemoryBuffer *Wrapper, error_code &ec)
+  : ELFObjectFile<ELFT>(Wrapper, ec) {
+  this->isDyldELFObject = true;
+}
+
+template<class ELFT>
+void DyldELFObject<ELFT>::updateSectionAddress(const SectionRef &Sec,
+                                               uint64_t Addr) {
+  DataRefImpl ShdrRef = Sec.getRawDataRefImpl();
+  Elf_Shdr *shdr = const_cast<Elf_Shdr*>(
+                          reinterpret_cast<const Elf_Shdr *>(ShdrRef.p));
+
+  // This assumes the address passed in matches the target address bitness
+  // The template-based type cast handles everything else.
+  shdr->sh_addr = static_cast<addr_type>(Addr);
+}
+
+template<class ELFT>
+void DyldELFObject<ELFT>::updateSymbolAddress(const SymbolRef &SymRef,
+                                              uint64_t Addr) {
+
+  Elf_Sym *sym = const_cast<Elf_Sym*>(
+    ELFObjectFile<ELFT>::getSymbol(SymRef.getRawDataRefImpl()));
+
+  // This assumes the address passed in matches the target address bitness
+  // The template-based type cast handles everything else.
+  sym->st_value = static_cast<addr_type>(Addr);
+}
+
+} // namespace
+
+namespace llvm {
+
+ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) {
+  if (Buffer->getBufferSize() < ELF::EI_NIDENT)
+    llvm_unreachable("Unexpected ELF object size");
+  std::pair<unsigned char, unsigned char> Ident = std::make_pair(
+                         (uint8_t)Buffer->getBufferStart()[ELF::EI_CLASS],
+                         (uint8_t)Buffer->getBufferStart()[ELF::EI_DATA]);
+  error_code ec;
+
+  if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) {
+    DyldELFObject<ELFType<support::little, 4, false> > *Obj =
+      new DyldELFObject<ELFType<support::little, 4, false> >(
+        Buffer->getMemBuffer(), ec);
+    return new ELFObjectImage<ELFType<support::little, 4, false> >(Buffer, Obj);
+  }
+  else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) {
+    DyldELFObject<ELFType<support::big, 4, false> > *Obj =
+      new DyldELFObject<ELFType<support::big, 4, false> >(
+        Buffer->getMemBuffer(), ec);
+    return new ELFObjectImage<ELFType<support::big, 4, false> >(Buffer, Obj);
+  }
+  else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) {
+    DyldELFObject<ELFType<support::big, 8, true> > *Obj =
+      new DyldELFObject<ELFType<support::big, 8, true> >(
+        Buffer->getMemBuffer(), ec);
+    return new ELFObjectImage<ELFType<support::big, 8, true> >(Buffer, Obj);
+  }
+  else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) {
+    DyldELFObject<ELFType<support::little, 8, true> > *Obj =
+      new DyldELFObject<ELFType<support::little, 8, true> >(
+        Buffer->getMemBuffer(), ec);
+    return new ELFObjectImage<ELFType<support::little, 8, true> >(Buffer, Obj);
+  }
+  else
+    llvm_unreachable("Unexpected ELF format");
+}
+
+RuntimeDyldELF::~RuntimeDyldELF() {
+}
+
+void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
+                                             uint64_t Offset,
+                                             uint64_t Value,
+                                             uint32_t Type,
+                                             int64_t Addend) {
+  switch (Type) {
+  default:
+    llvm_unreachable("Relocation type not implemented yet!");
+  break;
+  case ELF::R_X86_64_64: {
+    uint64_t *Target = reinterpret_cast<uint64_t*>(Section.Address + Offset);
+    *Target = Value + Addend;
+    DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend))
+                 << " at " << format("%p\n",Target));
+    break;
+  }
+  case ELF::R_X86_64_32:
+  case ELF::R_X86_64_32S: {
+    Value += Addend;
+    assert((Type == ELF::R_X86_64_32 && (Value <= UINT32_MAX)) ||
+           (Type == ELF::R_X86_64_32S &&
+             ((int64_t)Value <= INT32_MAX && (int64_t)Value >= INT32_MIN)));
+    uint32_t TruncatedAddr = (Value & 0xFFFFFFFF);
+    uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+    *Target = TruncatedAddr;
+    DEBUG(dbgs() << "Writing " << format("%p", TruncatedAddr)
+                 << " at " << format("%p\n",Target));
+    break;
+  }
+  case ELF::R_X86_64_PC32: {
+    // Get the placeholder value from the generated object since
+    // a previous relocation attempt may have overwritten the loaded version
+    uint32_t *Placeholder = reinterpret_cast<uint32_t*>(Section.ObjAddress
+                                                                   + Offset);
+    uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+    uint64_t  FinalAddress = Section.LoadAddress + Offset;
+    int64_t RealOffset = *Placeholder + Value + Addend - FinalAddress;
+    assert(RealOffset <= INT32_MAX && RealOffset >= INT32_MIN);
+    int32_t TruncOffset = (RealOffset & 0xFFFFFFFF);
+    *Target = TruncOffset;
+    break;
+  }
+  }
+}
+
+void RuntimeDyldELF::resolveX86Relocation(const SectionEntry &Section,
+                                          uint64_t Offset,
+                                          uint32_t Value,
+                                          uint32_t Type,
+                                          int32_t Addend) {
+  switch (Type) {
+  case ELF::R_386_32: {
+    // Get the placeholder value from the generated object since
+    // a previous relocation attempt may have overwritten the loaded version
+    uint32_t *Placeholder = reinterpret_cast<uint32_t*>(Section.ObjAddress
+                                                                   + Offset);
+    uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+    *Target = *Placeholder + Value + Addend;
+    break;
+  }
+  case ELF::R_386_PC32: {
+    // Get the placeholder value from the generated object since
+    // a previous relocation attempt may have overwritten the loaded version
+    uint32_t *Placeholder = reinterpret_cast<uint32_t*>(Section.ObjAddress
+                                                                   + Offset);
+    uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+    uint32_t  FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF);
+    uint32_t RealOffset = *Placeholder + Value + Addend - FinalAddress;
+    *Target = RealOffset;
+    break;
+    }
+    default:
+      // There are other relocation types, but it appears these are the
+      // only ones currently used by the LLVM ELF object writer
+      llvm_unreachable("Relocation type not implemented yet!");
+      break;
+  }
+}
+
+void RuntimeDyldELF::resolveARMRelocation(const SectionEntry &Section,
+                                          uint64_t Offset,
+                                          uint32_t Value,
+                                          uint32_t Type,
+                                          int32_t Addend) {
+  // TODO: Add Thumb relocations.
+  uint32_t* TargetPtr = (uint32_t*)(Section.Address + Offset);
+  uint32_t FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF);
+  Value += Addend;
+
+  DEBUG(dbgs() << "resolveARMRelocation, LocalAddress: "
+               << Section.Address + Offset
+               << " FinalAddress: " << format("%p",FinalAddress)
+               << " Value: " << format("%x",Value)
+               << " Type: " << format("%x",Type)
+               << " Addend: " << format("%x",Addend)
+               << "\n");
+
+  switch(Type) {
+  default:
+    llvm_unreachable("Not implemented relocation type!");
+
+  // Write a 32bit value to relocation address, taking into account the
+  // implicit addend encoded in the target.
+  case ELF::R_ARM_TARGET1 :
+  case ELF::R_ARM_ABS32 :
+    *TargetPtr += Value;
+    break;
+
+  // Write first 16 bit of 32 bit value to the mov instruction.
+  // Last 4 bit should be shifted.
+  case ELF::R_ARM_MOVW_ABS_NC :
+    // We are not expecting any other addend in the relocation address.
+    // Using 0x000F0FFF because MOVW has its 16 bit immediate split into 2
+    // non-contiguous fields.
+    assert((*TargetPtr & 0x000F0FFF) == 0);
+    Value = Value & 0xFFFF;
+    *TargetPtr |= Value & 0xFFF;
+    *TargetPtr |= ((Value >> 12) & 0xF) << 16;
+    break;
+
+  // Write last 16 bit of 32 bit value to the mov instruction.
+  // Last 4 bit should be shifted.
+  case ELF::R_ARM_MOVT_ABS :
+    // We are not expecting any other addend in the relocation address.
+    // Use 0x000F0FFF for the same reason as R_ARM_MOVW_ABS_NC.
+    assert((*TargetPtr & 0x000F0FFF) == 0);
+    Value = (Value >> 16) & 0xFFFF;
+    *TargetPtr |= Value & 0xFFF;
+    *TargetPtr |= ((Value >> 12) & 0xF) << 16;
+    break;
+
+  // Write 24 bit relative value to the branch instruction.
+  case ELF::R_ARM_PC24 :    // Fall through.
+  case ELF::R_ARM_CALL :    // Fall through.
+  case ELF::R_ARM_JUMP24 :
+    int32_t RelValue = static_cast<int32_t>(Value - FinalAddress - 8);
+    RelValue = (RelValue & 0x03FFFFFC) >> 2;
+    *TargetPtr &= 0xFF000000;
+    *TargetPtr |= RelValue;
+    break;
+  }
+}
+
+void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section,
+                                           uint64_t Offset,
+                                           uint32_t Value,
+                                           uint32_t Type,
+                                           int32_t Addend) {
+  uint32_t* TargetPtr = (uint32_t*)(Section.Address + Offset);
+  Value += Addend;
+
+  DEBUG(dbgs() << "resolveMipselocation, LocalAddress: "
+               << Section.Address + Offset
+               << " FinalAddress: "
+               << format("%p",Section.LoadAddress + Offset)
+               << " Value: " << format("%x",Value)
+               << " Type: " << format("%x",Type)
+               << " Addend: " << format("%x",Addend)
+               << "\n");
+
+  switch(Type) {
+  default:
+    llvm_unreachable("Not implemented relocation type!");
+    break;
+  case ELF::R_MIPS_32:
+    *TargetPtr = Value + (*TargetPtr);
+    break;
+  case ELF::R_MIPS_26:
+    *TargetPtr = ((*TargetPtr) & 0xfc000000) | (( Value & 0x0fffffff) >> 2);
+    break;
+  case ELF::R_MIPS_HI16:
+    // Get the higher 16-bits. Also add 1 if bit 15 is 1.
+    Value += ((*TargetPtr) & 0x0000ffff) << 16;
+    *TargetPtr = ((*TargetPtr) & 0xffff0000) |
+                 (((Value + 0x8000) >> 16) & 0xffff);
+    break;
+   case ELF::R_MIPS_LO16:
+    Value += ((*TargetPtr) & 0x0000ffff);
+    *TargetPtr = ((*TargetPtr) & 0xffff0000) | (Value & 0xffff);
+    break;
+   }
+}
+
+// Return the .TOC. section address to R_PPC64_TOC relocations.
+uint64_t RuntimeDyldELF::findPPC64TOC() const {
+  // The TOC consists of sections .got, .toc, .tocbss, .plt in that
+  // order. The TOC starts where the first of these sections starts.
+  SectionList::const_iterator it = Sections.begin();
+  SectionList::const_iterator ite = Sections.end();
+  for (; it != ite; ++it) {
+    if (it->Name == ".got" ||
+        it->Name == ".toc" ||
+        it->Name == ".tocbss" ||
+        it->Name == ".plt")
+      break;
+  }
+  if (it == ite) {
+    // This may happen for
+    // * references to TOC base base (sym@toc, .odp relocation) without
+    // a .toc directive.
+    // In this case just use the first section (which is usually
+    // the .odp) since the code won't reference the .toc base
+    // directly.
+    it = Sections.begin();
+  }
+  assert (it != ite);
+  // Per the ppc64-elf-linux ABI, The TOC base is TOC value plus 0x8000
+  // thus permitting a full 64 Kbytes segment.
+  return it->LoadAddress + 0x8000;
+}
+
+// Returns the sections and offset associated with the ODP entry referenced
+// by Symbol.
+void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
+                                         ObjSectionToIDMap &LocalSections,
+                                         RelocationValueRef &Rel) {
+  // Get the ELF symbol value (st_value) to compare with Relocation offset in
+  // .opd entries
+
+  error_code err;
+  for (section_iterator si = Obj.begin_sections(),
+     se = Obj.end_sections(); si != se; si.increment(err)) {
+    StringRef SectionName;
+    check(si->getName(SectionName));
+    if (SectionName != ".opd")
+      continue;
+
+    for (relocation_iterator i = si->begin_relocations(),
+         e = si->end_relocations(); i != e;) {
+      check(err);
+
+      // The R_PPC64_ADDR64 relocation indicates the first field
+      // of a .opd entry
+      uint64_t TypeFunc;
+      check(i->getType(TypeFunc));
+      if (TypeFunc != ELF::R_PPC64_ADDR64) {
+        i.increment(err);
+        continue;
+      }
+
+      SymbolRef TargetSymbol;
+      uint64_t TargetSymbolOffset;
+      int64_t TargetAdditionalInfo;
+      check(i->getSymbol(TargetSymbol));
+      check(i->getOffset(TargetSymbolOffset));
+      check(i->getAdditionalInfo(TargetAdditionalInfo));
+
+      i = i.increment(err);
+      if (i == e)
+        break;
+      check(err);
+
+      // Just check if following relocation is a R_PPC64_TOC
+      uint64_t TypeTOC;
+      check(i->getType(TypeTOC));
+      if (TypeTOC != ELF::R_PPC64_TOC)
+        continue;
+
+      // Finally compares the Symbol value and the target symbol offset
+      // to check if this .opd entry refers to the symbol the relocation
+      // points to.
+      if (Rel.Addend != (intptr_t)TargetSymbolOffset)
+        continue;
+
+      section_iterator tsi(Obj.end_sections());
+      check(TargetSymbol.getSection(tsi));
+      Rel.SectionID = findOrEmitSection(Obj, (*tsi), true, LocalSections);
+      Rel.Addend = (intptr_t)TargetAdditionalInfo;
+      return;
+    }
+  }
+  llvm_unreachable("Attempting to get address of ODP entry!");
+}
+
+// Relocation masks following the #lo(value), #hi(value), #higher(value),
+// and #highest(value) macros defined in section 4.5.1. Relocation Types
+// in PPC-elf64abi document.
+//
+static inline
+uint16_t applyPPClo (uint64_t value)
+{
+  return value & 0xffff;
+}
+
+static inline
+uint16_t applyPPChi (uint64_t value)
+{
+  return (value >> 16) & 0xffff;
+}
+
+static inline
+uint16_t applyPPChigher (uint64_t value)
+{
+  return (value >> 32) & 0xffff;
+}
+
+static inline
+uint16_t applyPPChighest (uint64_t value)
+{
+  return (value >> 48) & 0xffff;
+}
+
+void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section,
+                                            uint64_t Offset,
+                                            uint64_t Value,
+                                            uint32_t Type,
+                                            int64_t Addend) {
+  uint8_t* LocalAddress = Section.Address + Offset;
+  switch (Type) {
+  default:
+    llvm_unreachable("Relocation type not implemented yet!");
+  break;
+  case ELF::R_PPC64_ADDR16_LO :
+    writeInt16BE(LocalAddress, applyPPClo (Value + Addend));
+    break;
+  case ELF::R_PPC64_ADDR16_HI :
+    writeInt16BE(LocalAddress, applyPPChi (Value + Addend));
+    break;
+  case ELF::R_PPC64_ADDR16_HIGHER :
+    writeInt16BE(LocalAddress, applyPPChigher (Value + Addend));
+    break;
+  case ELF::R_PPC64_ADDR16_HIGHEST :
+    writeInt16BE(LocalAddress, applyPPChighest (Value + Addend));
+    break;
+  case ELF::R_PPC64_ADDR14 : {
+    assert(((Value + Addend) & 3) == 0);
+    // Preserve the AA/LK bits in the branch instruction
+    uint8_t aalk = *(LocalAddress+3);
+    writeInt16BE(LocalAddress + 2, (aalk & 3) | ((Value + Addend) & 0xfffc));
+  } break;
+  case ELF::R_PPC64_ADDR32 : {
+    int32_t Result = static_cast<int32_t>(Value + Addend);
+    if (SignExtend32<32>(Result) != Result)
+      llvm_unreachable("Relocation R_PPC64_ADDR32 overflow");
+    writeInt32BE(LocalAddress, Result);
+  } break;
+  case ELF::R_PPC64_REL24 : {
+    uint64_t FinalAddress = (Section.LoadAddress + Offset);
+    int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend);
+    if (SignExtend32<24>(delta) != delta)
+      llvm_unreachable("Relocation R_PPC64_REL24 overflow");
+    // Generates a 'bl <address>' instruction
+    writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC));
+  } break;
+  case ELF::R_PPC64_REL32 : {
+    uint64_t FinalAddress = (Section.LoadAddress + Offset);
+    int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend);
+    if (SignExtend32<32>(delta) != delta)
+      llvm_unreachable("Relocation R_PPC64_REL32 overflow");
+    writeInt32BE(LocalAddress, delta);
+  } break;
+  case ELF::R_PPC64_ADDR64 :
+    writeInt64BE(LocalAddress, Value + Addend);
+    break;
+  case ELF::R_PPC64_TOC :
+    writeInt64BE(LocalAddress, findPPC64TOC());
+    break;
+  case ELF::R_PPC64_TOC16 : {
+    uint64_t TOCStart = findPPC64TOC();
+    Value = applyPPClo((Value + Addend) - TOCStart);
+    writeInt16BE(LocalAddress, applyPPClo(Value));
+  } break;
+  case ELF::R_PPC64_TOC16_DS : {
+    uint64_t TOCStart = findPPC64TOC();
+    Value = ((Value + Addend) - TOCStart);
+    writeInt16BE(LocalAddress, applyPPClo(Value));
+  } break;
+  }
+}
+
+void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section,
+                                       uint64_t Offset,
+                                       uint64_t Value,
+                                       uint32_t Type,
+                                       int64_t Addend) {
+  switch (Arch) {
+  case Triple::x86_64:
+    resolveX86_64Relocation(Section, Offset, Value, Type, Addend);
+    break;
+  case Triple::x86:
+    resolveX86Relocation(Section, Offset,
+                         (uint32_t)(Value & 0xffffffffL), Type,
+                         (uint32_t)(Addend & 0xffffffffL));
+    break;
+  case Triple::arm:    // Fall through.
+  case Triple::thumb:
+    resolveARMRelocation(Section, Offset,
+                         (uint32_t)(Value & 0xffffffffL), Type,
+                         (uint32_t)(Addend & 0xffffffffL));
+    break;
+  case Triple::mips:    // Fall through.
+  case Triple::mipsel:
+    resolveMIPSRelocation(Section, Offset,
+                          (uint32_t)(Value & 0xffffffffL), Type,
+                          (uint32_t)(Addend & 0xffffffffL));
+    break;
+  case Triple::ppc64:
+    resolvePPC64Relocation(Section, Offset, Value, Type, Addend);
+    break;
+  default: llvm_unreachable("Unsupported CPU type!");
+  }
+}
+
+void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
+                                          ObjectImage &Obj,
+                                          ObjSectionToIDMap &ObjSectionToID,
+                                          const SymbolTableMap &Symbols,
+                                          StubMap &Stubs) {
+
+  uint32_t RelType = (uint32_t)(Rel.Type & 0xffffffffL);
+  intptr_t Addend = (intptr_t)Rel.AdditionalInfo;
+  const SymbolRef &Symbol = Rel.Symbol;
+
+  // Obtain the symbol name which is referenced in the relocation
+  StringRef TargetName;
+  Symbol.getName(TargetName);
+  DEBUG(dbgs() << "\t\tRelType: " << RelType
+               << " Addend: " << Addend
+               << " TargetName: " << TargetName
+               << "\n");
+  RelocationValueRef Value;
+  // First search for the symbol in the local symbol table
+  SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data());
+  SymbolRef::Type SymType;
+  Symbol.getType(SymType);
+  if (lsi != Symbols.end()) {
+    Value.SectionID = lsi->second.first;
+    Value.Addend = lsi->second.second;
+  } else {
+    // Search for the symbol in the global symbol table
+    SymbolTableMap::const_iterator gsi =
+        GlobalSymbolTable.find(TargetName.data());
+    if (gsi != GlobalSymbolTable.end()) {
+      Value.SectionID = gsi->second.first;
+      Value.Addend = gsi->second.second;
+    } else {
+      switch (SymType) {
+        case SymbolRef::ST_Debug: {
+          // TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION, it's not obviously
+          // and can be changed by another developers. Maybe best way is add
+          // a new symbol type ST_Section to SymbolRef and use it.
+          section_iterator si(Obj.end_sections());
+          Symbol.getSection(si);
+          if (si == Obj.end_sections())
+            llvm_unreachable("Symbol section not found, bad object file format!");
+          DEBUG(dbgs() << "\t\tThis is section symbol\n");
+          // Default to 'true' in case isText fails (though it never does).
+          bool isCode = true;
+          si->isText(isCode);
+          Value.SectionID = findOrEmitSection(Obj,
+                                              (*si),
+                                              isCode,
+                                              ObjSectionToID);
+          Value.Addend = Addend;
+          break;
+        }
+        case SymbolRef::ST_Unknown: {
+          Value.SymbolName = TargetName.data();
+          Value.Addend = Addend;
+          break;
+        }
+        default:
+          llvm_unreachable("Unresolved symbol type!");
+          break;
+      }
+    }
+  }
+  DEBUG(dbgs() << "\t\tRel.SectionID: " << Rel.SectionID
+               << " Rel.Offset: " << Rel.Offset
+               << "\n");
+  if (Arch == Triple::arm &&
+      (RelType == ELF::R_ARM_PC24 ||
+       RelType == ELF::R_ARM_CALL ||
+       RelType == ELF::R_ARM_JUMP24)) {
+    // This is an ARM branch relocation, need to use a stub function.
+    DEBUG(dbgs() << "\t\tThis is an ARM branch relocation.");
+    SectionEntry &Section = Sections[Rel.SectionID];
+
+    // Look for an existing stub.
+    StubMap::const_iterator i = Stubs.find(Value);
+    if (i != Stubs.end()) {
+        resolveRelocation(Section, Rel.Offset,
+                          (uint64_t)Section.Address + i->second, RelType, 0);
+      DEBUG(dbgs() << " Stub function found\n");
+    } else {
+      // Create a new stub function.
+      DEBUG(dbgs() << " Create a new stub function\n");
+      Stubs[Value] = Section.StubOffset;
+      uint8_t *StubTargetAddr = createStubFunction(Section.Address +
+                                                   Section.StubOffset);
+      RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address,
+                         ELF::R_ARM_ABS32, Value.Addend);
+      if (Value.SymbolName)
+        addRelocationForSymbol(RE, Value.SymbolName);
+      else
+        addRelocationForSection(RE, Value.SectionID);
+
+      resolveRelocation(Section, Rel.Offset,
+                        (uint64_t)Section.Address + Section.StubOffset,
+                        RelType, 0);
+      Section.StubOffset += getMaxStubSize();
+    }
+  } else if ((Arch == Triple::mipsel || Arch == Triple::mips) &&
+             RelType == ELF::R_MIPS_26) {
+    // This is an Mips branch relocation, need to use a stub function.
+    DEBUG(dbgs() << "\t\tThis is a Mips branch relocation.");
+    SectionEntry &Section = Sections[Rel.SectionID];
+    uint8_t *Target = Section.Address + Rel.Offset;
+    uint32_t *TargetAddress = (uint32_t *)Target;
+
+    // Extract the addend from the instruction.
+    uint32_t Addend = ((*TargetAddress) & 0x03ffffff) << 2;
+
+    Value.Addend += Addend;
+
+    //  Look up for existing stub.
+    StubMap::const_iterator i = Stubs.find(Value);
+    if (i != Stubs.end()) {
+      resolveRelocation(Section, Rel.Offset,
+                        (uint64_t)Section.Address + i->second, RelType, 0);
+      DEBUG(dbgs() << " Stub function found\n");
+    } else {
+      // Create a new stub function.
+      DEBUG(dbgs() << " Create a new stub function\n");
+      Stubs[Value] = Section.StubOffset;
+      uint8_t *StubTargetAddr = createStubFunction(Section.Address +
+                                                   Section.StubOffset);
+
+      // Creating Hi and Lo relocations for the filled stub instructions.
+      RelocationEntry REHi(Rel.SectionID,
+                           StubTargetAddr - Section.Address,
+                           ELF::R_MIPS_HI16, Value.Addend);
+      RelocationEntry RELo(Rel.SectionID,
+                           StubTargetAddr - Section.Address + 4,
+                           ELF::R_MIPS_LO16, Value.Addend);
+
+      if (Value.SymbolName) {
+        addRelocationForSymbol(REHi, Value.SymbolName);
+        addRelocationForSymbol(RELo, Value.SymbolName);
+      } else {
+        addRelocationForSection(REHi, Value.SectionID);
+        addRelocationForSection(RELo, Value.SectionID);
+      }
+
+      resolveRelocation(Section, Rel.Offset,
+                        (uint64_t)Section.Address + Section.StubOffset,
+                        RelType, 0);
+      Section.StubOffset += getMaxStubSize();
+    }
+  } else if (Arch == Triple::ppc64) {
+    if (RelType == ELF::R_PPC64_REL24) {
+      // A PPC branch relocation will need a stub function if the target is
+      // an external symbol (Symbol::ST_Unknown) or if the target address
+      // is not within the signed 24-bits branch address.
+      SectionEntry &Section = Sections[Rel.SectionID];
+      uint8_t *Target = Section.Address + Rel.Offset;
+      bool RangeOverflow = false;
+      if (SymType != SymbolRef::ST_Unknown) {
+        // A function call may points to the .opd entry, so the final symbol value
+        // in calculated based in the relocation values in .opd section.
+        findOPDEntrySection(Obj, ObjSectionToID, Value);
+        uint8_t *RelocTarget = Sections[Value.SectionID].Address + Value.Addend;
+        int32_t delta = static_cast<int32_t>(Target - RelocTarget);
+        // If it is within 24-bits branch range, just set the branch target
+        if (SignExtend32<24>(delta) == delta) {
+          RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend);
+          if (Value.SymbolName)
+            addRelocationForSymbol(RE, Value.SymbolName);
+          else
+            addRelocationForSection(RE, Value.SectionID);
+        } else {
+          RangeOverflow = true;
+        }
+      }
+      if (SymType == SymbolRef::ST_Unknown || RangeOverflow == true) {
+        // It is an external symbol (SymbolRef::ST_Unknown) or within a range
+        // larger than 24-bits.
+        StubMap::const_iterator i = Stubs.find(Value);
+        if (i != Stubs.end()) {
+          // Symbol function stub already created, just relocate to it
+          resolveRelocation(Section, Rel.Offset,
+                            (uint64_t)Section.Address + i->second, RelType, 0);
+          DEBUG(dbgs() << " Stub function found\n");
+        } else {
+          // Create a new stub function.
+          DEBUG(dbgs() << " Create a new stub function\n");
+          Stubs[Value] = Section.StubOffset;
+          uint8_t *StubTargetAddr = createStubFunction(Section.Address +
+                                                       Section.StubOffset);
+          RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address,
+                             ELF::R_PPC64_ADDR64, Value.Addend);
+
+          // Generates the 64-bits address loads as exemplified in section
+          // 4.5.1 in PPC64 ELF ABI.
+          RelocationEntry REhst(Rel.SectionID,
+                                StubTargetAddr - Section.Address + 2,
+                                ELF::R_PPC64_ADDR16_HIGHEST, Value.Addend);
+          RelocationEntry REhr(Rel.SectionID,
+                               StubTargetAddr - Section.Address + 6,
+                               ELF::R_PPC64_ADDR16_HIGHER, Value.Addend);
+          RelocationEntry REh(Rel.SectionID,
+                              StubTargetAddr - Section.Address + 14,
+                              ELF::R_PPC64_ADDR16_HI, Value.Addend);
+          RelocationEntry REl(Rel.SectionID,
+                              StubTargetAddr - Section.Address + 18,
+                              ELF::R_PPC64_ADDR16_LO, Value.Addend);
+
+          if (Value.SymbolName) {
+            addRelocationForSymbol(REhst, Value.SymbolName);
+            addRelocationForSymbol(REhr,  Value.SymbolName);
+            addRelocationForSymbol(REh,   Value.SymbolName);
+            addRelocationForSymbol(REl,   Value.SymbolName);
+          } else {
+            addRelocationForSection(REhst, Value.SectionID);
+            addRelocationForSection(REhr,  Value.SectionID);
+            addRelocationForSection(REh,   Value.SectionID);
+            addRelocationForSection(REl,   Value.SectionID);
+          }
+
+          resolveRelocation(Section, Rel.Offset,
+                            (uint64_t)Section.Address + Section.StubOffset,
+                            RelType, 0);
+          if (SymType == SymbolRef::ST_Unknown)
+            // Restore the TOC for external calls
+            writeInt32BE(Target+4, 0xE8410028); // ld r2,40(r1)
+          Section.StubOffset += getMaxStubSize();
+        }
+      }
+    } else {
+      RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend);
+      // Extra check to avoid relocation againt empty symbols (usually
+      // the R_PPC64_TOC).
+      if (Value.SymbolName && !TargetName.empty())
+        addRelocationForSymbol(RE, Value.SymbolName);
+      else
+        addRelocationForSection(RE, Value.SectionID);
+    }
+  } else {
+    RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend);
+    if (Value.SymbolName)
+      addRelocationForSymbol(RE, Value.SymbolName);
+    else
+      addRelocationForSection(RE, Value.SectionID);
+  }
+}
+
+unsigned RuntimeDyldELF::getCommonSymbolAlignment(const SymbolRef &Sym) {
+  // In ELF, the value of an SHN_COMMON symbol is its alignment requirement.
+  uint64_t Align;
+  Check(Sym.getValue(Align));
+  return Align;
+}
+
+bool RuntimeDyldELF::isCompatibleFormat(const ObjectBuffer *Buffer) const {
+  if (Buffer->getBufferSize() < strlen(ELF::ElfMagic))
+    return false;
+  return (memcmp(Buffer->getBufferStart(), ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0;
+}
+} // namespace llvm
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
new file mode 100644
index 000000000000..07e704b45930
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -0,0 +1,97 @@
+//===-- RuntimeDyldELF.h - Run-time dynamic linker for MC-JIT ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// ELF support for MC-JIT runtime dynamic linker.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_RUNTIME_DYLD_ELF_H
+#define LLVM_RUNTIME_DYLD_ELF_H
+
+#include "RuntimeDyldImpl.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+namespace {
+  // Helper for extensive error checking in debug builds.
+  error_code Check(error_code Err) {
+    if (Err) {
+      report_fatal_error(Err.message());
+    }
+    return Err;
+  }
+} // end anonymous namespace
+
+class RuntimeDyldELF : public RuntimeDyldImpl {
+protected:
+  void resolveX86_64Relocation(const SectionEntry &Section,
+                               uint64_t Offset,
+                               uint64_t Value,
+                               uint32_t Type,
+                               int64_t Addend);
+
+  void resolveX86Relocation(const SectionEntry &Section,
+                            uint64_t Offset,
+                            uint32_t Value,
+                            uint32_t Type,
+                            int32_t Addend);
+
+  void resolveARMRelocation(const SectionEntry &Section,
+                            uint64_t Offset,
+                            uint32_t Value,
+                            uint32_t Type,
+                            int32_t Addend);
+
+  void resolveMIPSRelocation(const SectionEntry &Section,
+                             uint64_t Offset,
+                             uint32_t Value,
+                             uint32_t Type,
+                             int32_t Addend);
+
+  void resolvePPC64Relocation(const SectionEntry &Section,
+                              uint64_t Offset,
+                              uint64_t Value,
+                              uint32_t Type,
+                              int64_t Addend);
+
+  virtual void resolveRelocation(const SectionEntry &Section,
+                                 uint64_t Offset,
+                                 uint64_t Value,
+                                 uint32_t Type,
+                                 int64_t Addend);
+
+  virtual void processRelocationRef(const ObjRelocationInfo &Rel,
+                                    ObjectImage &Obj,
+                                    ObjSectionToIDMap &ObjSectionToID,
+                                    const SymbolTableMap &Symbols,
+                                    StubMap &Stubs);
+
+  unsigned getCommonSymbolAlignment(const SymbolRef &Sym);
+
+  virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
+
+  uint64_t findPPC64TOC() const;
+  void findOPDEntrySection(ObjectImage &Obj,
+                           ObjSectionToIDMap &LocalSections,
+                           RelocationValueRef &Rel);
+
+public:
+  RuntimeDyldELF(RTDyldMemoryManager *mm)
+      : RuntimeDyldImpl(mm) {}
+
+  virtual ~RuntimeDyldELF();
+
+  bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
new file mode 100644
index 000000000000..f1009945775c
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -0,0 +1,344 @@
+//===-- RuntimeDyldImpl.h - Run-time dynamic linker for MC-JIT --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface for the implementations of runtime dynamic linker facilities.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_RUNTIME_DYLD_IMPL_H
+#define LLVM_RUNTIME_DYLD_IMPL_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <map>
+
+using namespace llvm;
+using namespace llvm::object;
+
+namespace llvm {
+
+class ObjectBuffer;
+class Twine;
+
+
+/// SectionEntry - represents a section emitted into memory by the dynamic
+/// linker.
+class SectionEntry {
+public:
+  /// Name - section name.
+  StringRef Name;
+
+  /// Address - address in the linker's memory where the section resides.
+  uint8_t *Address;
+
+  /// Size - section size.
+  size_t Size;
+
+  /// LoadAddress - the address of the section in the target process's memory.
+  /// Used for situations in which JIT-ed code is being executed in the address
+  /// space of a separate process.  If the code executes in the same address
+  /// space where it was JIT-ed, this just equals Address.
+  uint64_t LoadAddress;
+
+  /// StubOffset - used for architectures with stub functions for far
+  /// relocations (like ARM).
+  uintptr_t StubOffset;
+
+  /// ObjAddress - address of the section in the in-memory object file.  Used
+  /// for calculating relocations in some object formats (like MachO).
+  uintptr_t ObjAddress;
+
+  SectionEntry(StringRef name, uint8_t *address, size_t size,
+	       uintptr_t stubOffset, uintptr_t objAddress)
+    : Name(name), Address(address), Size(size), LoadAddress((uintptr_t)address),
+      StubOffset(stubOffset), ObjAddress(objAddress) {}
+};
+
+/// RelocationEntry - used to represent relocations internally in the dynamic
+/// linker.
+class RelocationEntry {
+public:
+  /// SectionID - the section this relocation points to.
+  unsigned SectionID;
+
+  /// Offset - offset into the section.
+  uintptr_t Offset;
+
+  /// RelType - relocation type.
+  uint32_t RelType;
+
+  /// Addend - the relocation addend encoded in the instruction itself.  Also
+  /// used to make a relocation section relative instead of symbol relative.
+  intptr_t Addend;
+
+  RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend)
+    : SectionID(id), Offset(offset), RelType(type), Addend(addend) {}
+};
+
+/// ObjRelocationInfo - relocation information as read from the object file.
+/// Used to pass around data taken from object::RelocationRef, together with
+/// the section to which the relocation points (represented by a SectionID).
+class ObjRelocationInfo {
+public:
+  unsigned  SectionID;
+  uint64_t  Offset;
+  SymbolRef Symbol;
+  uint64_t  Type;
+  int64_t   AdditionalInfo;
+};
+
+class RelocationValueRef {
+public:
+  unsigned  SectionID;
+  intptr_t  Addend;
+  const char *SymbolName;
+  RelocationValueRef(): SectionID(0), Addend(0), SymbolName(0) {}
+
+  inline bool operator==(const RelocationValueRef &Other) const {
+    return std::memcmp(this, &Other, sizeof(RelocationValueRef)) == 0;
+  }
+  inline bool operator <(const RelocationValueRef &Other) const {
+    return std::memcmp(this, &Other, sizeof(RelocationValueRef)) < 0;
+  }
+};
+
+class RuntimeDyldImpl {
+protected:
+  // The MemoryManager to load objects into.
+  RTDyldMemoryManager *MemMgr;
+
+  // A list of all sections emitted by the dynamic linker.  These sections are
+  // referenced in the code by means of their index in this list - SectionID.
+  typedef SmallVector<SectionEntry, 64> SectionList;
+  SectionList Sections;
+
+  // Keep a map of sections from object file to the SectionID which
+  // references it.
+  typedef std::map<SectionRef, unsigned> ObjSectionToIDMap;
+
+  // A global symbol table for symbols from all loaded modules.  Maps the
+  // symbol name to a (SectionID, offset in section) pair.
+  typedef std::pair<unsigned, uintptr_t> SymbolLoc;
+  typedef StringMap<SymbolLoc> SymbolTableMap;
+  SymbolTableMap GlobalSymbolTable;
+
+  // Pair representing the size and alignment requirement for a common symbol.
+  typedef std::pair<unsigned, unsigned> CommonSymbolInfo;
+  // Keep a map of common symbols to their info pairs
+  typedef std::map<SymbolRef, CommonSymbolInfo> CommonSymbolMap;
+
+  // For each symbol, keep a list of relocations based on it. Anytime
+  // its address is reassigned (the JIT re-compiled the function, e.g.),
+  // the relocations get re-resolved.
+  // The symbol (or section) the relocation is sourced from is the Key
+  // in the relocation list where it's stored.
+  typedef SmallVector<RelocationEntry, 64> RelocationList;
+  // Relocations to sections already loaded. Indexed by SectionID which is the
+  // source of the address. The target where the address will be written is
+  // SectionID/Offset in the relocation itself.
+  DenseMap<unsigned, RelocationList> Relocations;
+
+  // Relocations to external symbols that are not yet resolved.  Symbols are
+  // external when they aren't found in the global symbol table of all loaded
+  // modules.  This map is indexed by symbol name.
+  StringMap<RelocationList> ExternalSymbolRelocations;
+
+  typedef std::map<RelocationValueRef, uintptr_t> StubMap;
+
+  Triple::ArchType Arch;
+
+  inline unsigned getMaxStubSize() {
+    if (Arch == Triple::arm || Arch == Triple::thumb)
+      return 8; // 32-bit instruction and 32-bit address
+    else if (Arch == Triple::mipsel || Arch == Triple::mips)
+      return 16;
+    else if (Arch == Triple::ppc64)
+      return 44;
+    else
+      return 0;
+  }
+
+  bool HasError;
+  std::string ErrorStr;
+
+  // Set the error state and record an error string.
+  bool Error(const Twine &Msg) {
+    ErrorStr = Msg.str();
+    HasError = true;
+    return true;
+  }
+
+  uint64_t getSectionLoadAddress(unsigned SectionID) {
+    return Sections[SectionID].LoadAddress;
+  }
+
+  uint8_t *getSectionAddress(unsigned SectionID) {
+    return (uint8_t*)Sections[SectionID].Address;
+  }
+
+  // Subclasses can override this method to get the alignment requirement of
+  // a common symbol. Returns no alignment requirement if not implemented.
+  virtual unsigned getCommonSymbolAlignment(const SymbolRef &Sym) {
+    return 0;
+  }
+
+
+  void writeInt16BE(uint8_t *Addr, uint16_t Value) {
+    if (sys::isLittleEndianHost())
+      Value = sys::SwapByteOrder(Value);
+    *Addr     = (Value >> 8) & 0xFF;
+    *(Addr+1) = Value & 0xFF;
+  }
+
+  void writeInt32BE(uint8_t *Addr, uint32_t Value) {
+    if (sys::isLittleEndianHost())
+      Value = sys::SwapByteOrder(Value);
+    *Addr     = (Value >> 24) & 0xFF;
+    *(Addr+1) = (Value >> 16) & 0xFF;
+    *(Addr+2) = (Value >> 8) & 0xFF;
+    *(Addr+3) = Value & 0xFF;
+  }
+
+  void writeInt64BE(uint8_t *Addr, uint64_t Value) {
+    if (sys::isLittleEndianHost())
+      Value = sys::SwapByteOrder(Value);
+    *Addr     = (Value >> 56) & 0xFF;
+    *(Addr+1) = (Value >> 48) & 0xFF;
+    *(Addr+2) = (Value >> 40) & 0xFF;
+    *(Addr+3) = (Value >> 32) & 0xFF;
+    *(Addr+4) = (Value >> 24) & 0xFF;
+    *(Addr+5) = (Value >> 16) & 0xFF;
+    *(Addr+6) = (Value >> 8) & 0xFF;
+    *(Addr+7) = Value & 0xFF;
+  }
+
+  /// \brief Given the common symbols discovered in the object file, emit a
+  /// new section for them and update the symbol mappings in the object and
+  /// symbol table.
+  void emitCommonSymbols(ObjectImage &Obj,
+                         const CommonSymbolMap &CommonSymbols,
+                         uint64_t TotalSize,
+                         SymbolTableMap &SymbolTable);
+
+  /// \brief Emits section data from the object file to the MemoryManager.
+  /// \param IsCode if it's true then allocateCodeSection() will be
+  ///        used for emits, else allocateDataSection() will be used.
+  /// \return SectionID.
+  unsigned emitSection(ObjectImage &Obj,
+                       const SectionRef &Section,
+                       bool IsCode);
+
+  /// \brief Find Section in LocalSections. If the secton is not found - emit
+  ///        it and store in LocalSections.
+  /// \param IsCode if it's true then allocateCodeSection() will be
+  ///        used for emmits, else allocateDataSection() will be used.
+  /// \return SectionID.
+  unsigned findOrEmitSection(ObjectImage &Obj,
+                             const SectionRef &Section,
+                             bool IsCode,
+                             ObjSectionToIDMap &LocalSections);
+
+  // \brief Add a relocation entry that uses the given section.
+  void addRelocationForSection(const RelocationEntry &RE, unsigned SectionID);
+
+  // \brief Add a relocation entry that uses the given symbol.  This symbol may
+  // be found in the global symbol table, or it may be external.
+  void addRelocationForSymbol(const RelocationEntry &RE, StringRef SymbolName);
+
+  /// \brief Emits long jump instruction to Addr.
+  /// \return Pointer to the memory area for emitting target address.
+  uint8_t* createStubFunction(uint8_t *Addr);
+
+  /// \brief Resolves relocations from Relocs list with address from Value.
+  void resolveRelocationList(const RelocationList &Relocs, uint64_t Value);
+  void resolveRelocationEntry(const RelocationEntry &RE, uint64_t Value);
+
+  /// \brief A object file specific relocation resolver
+  /// \param Section The section where the relocation is being applied
+  /// \param Offset The offset into the section for this relocation
+  /// \param Value Target symbol address to apply the relocation action
+  /// \param Type object file specific relocation type
+  /// \param Addend A constant addend used to compute the value to be stored
+  ///        into the relocatable field
+  virtual void resolveRelocation(const SectionEntry &Section,
+                                 uint64_t Offset,
+                                 uint64_t Value,
+                                 uint32_t Type,
+                                 int64_t Addend) = 0;
+
+  /// \brief Parses the object file relocation and stores it to Relocations
+  ///        or SymbolRelocations (this depends on the object file type).
+  virtual void processRelocationRef(const ObjRelocationInfo &Rel,
+                                    ObjectImage &Obj,
+                                    ObjSectionToIDMap &ObjSectionToID,
+                                    const SymbolTableMap &Symbols,
+                                    StubMap &Stubs) = 0;
+
+  /// \brief Resolve relocations to external symbols.
+  void resolveExternalSymbols();
+  virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
+public:
+  RuntimeDyldImpl(RTDyldMemoryManager *mm) : MemMgr(mm), HasError(false) {}
+
+  virtual ~RuntimeDyldImpl();
+
+  ObjectImage *loadObject(ObjectBuffer *InputBuffer);
+
+  void *getSymbolAddress(StringRef Name) {
+    // FIXME: Just look up as a function for now. Overly simple of course.
+    // Work in progress.
+    if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+      return 0;
+    SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+    return getSectionAddress(Loc.first) + Loc.second;
+  }
+
+  uint64_t getSymbolLoadAddress(StringRef Name) {
+    // FIXME: Just look up as a function for now. Overly simple of course.
+    // Work in progress.
+    if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+      return 0;
+    SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+    return getSectionLoadAddress(Loc.first) + Loc.second;
+  }
+
+  void resolveRelocations();
+
+  void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
+
+  void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress);
+
+  // Is the linker in an error state?
+  bool hasError() { return HasError; }
+
+  // Mark the error condition as handled and continue.
+  void clearError() { HasError = false; }
+
+  // Get the error message.
+  StringRef getErrorString() { return ErrorStr; }
+
+  virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const = 0;
+};
+
+} // end namespace llvm
+
+
+#endif
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
new file mode 100644
index 000000000000..bcc3df1b4e7c
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -0,0 +1,311 @@
+//===-- RuntimeDyldMachO.cpp - Run-time dynamic linker for MC-JIT -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the MC-JIT runtime dynamic linker.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dyld"
+#include "RuntimeDyldMachO.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+using namespace llvm;
+using namespace llvm::object;
+
+namespace llvm {
+
+void RuntimeDyldMachO::resolveRelocation(const SectionEntry &Section,
+                                         uint64_t Offset,
+                                         uint64_t Value,
+                                         uint32_t Type,
+                                         int64_t Addend) {
+  uint8_t *LocalAddress = Section.Address + Offset;
+  uint64_t FinalAddress = Section.LoadAddress + Offset;
+  bool isPCRel = (Type >> 24) & 1;
+  unsigned MachoType = (Type >> 28) & 0xf;
+  unsigned Size = 1 << ((Type >> 25) & 3);
+
+  DEBUG(dbgs() << "resolveRelocation LocalAddress: " 
+        << format("%p", LocalAddress)
+        << " FinalAddress: " << format("%p", FinalAddress)
+        << " Value: " << format("%p", Value)
+        << " Addend: " << Addend
+        << " isPCRel: " << isPCRel
+        << " MachoType: " << MachoType
+        << " Size: " << Size
+        << "\n");
+
+  // This just dispatches to the proper target specific routine.
+  switch (Arch) {
+  default: llvm_unreachable("Unsupported CPU type!");
+  case Triple::x86_64:
+    resolveX86_64Relocation(LocalAddress,
+                            FinalAddress,
+                            (uintptr_t)Value,
+                            isPCRel,
+                            MachoType,
+                            Size,
+                            Addend);
+    break;
+  case Triple::x86:
+    resolveI386Relocation(LocalAddress,
+                          FinalAddress,
+                          (uintptr_t)Value,
+                          isPCRel,
+                          MachoType,
+                          Size,
+                          Addend);
+    break;
+  case Triple::arm:    // Fall through.
+  case Triple::thumb:
+    resolveARMRelocation(LocalAddress,
+                         FinalAddress,
+                         (uintptr_t)Value,
+                         isPCRel,
+                         MachoType,
+                         Size,
+                         Addend);
+    break;
+  }
+}
+
+bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress,
+                                             uint64_t FinalAddress,
+                                             uint64_t Value,
+                                             bool isPCRel,
+                                             unsigned Type,
+                                             unsigned Size,
+                                             int64_t Addend) {
+  if (isPCRel)
+    Value -= FinalAddress + 4; // see resolveX86_64Relocation
+
+  switch (Type) {
+  default:
+    llvm_unreachable("Invalid relocation type!");
+  case macho::RIT_Vanilla: {
+    uint8_t *p = LocalAddress;
+    uint64_t ValueToWrite = Value + Addend;
+    for (unsigned i = 0; i < Size; ++i) {
+      *p++ = (uint8_t)(ValueToWrite & 0xff);
+      ValueToWrite >>= 8;
+    }
+    return false;
+  }
+  case macho::RIT_Difference:
+  case macho::RIT_Generic_LocalDifference:
+  case macho::RIT_Generic_PreboundLazyPointer:
+    return Error("Relocation type not implemented yet!");
+  }
+}
+
+bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress,
+                                               uint64_t FinalAddress,
+                                               uint64_t Value,
+                                               bool isPCRel,
+                                               unsigned Type,
+                                               unsigned Size,
+                                               int64_t Addend) {
+  // If the relocation is PC-relative, the value to be encoded is the
+  // pointer difference.
+  if (isPCRel)
+    // FIXME: It seems this value needs to be adjusted by 4 for an effective PC
+    // address. Is that expected? Only for branches, perhaps?
+    Value -= FinalAddress + 4;
+
+  switch(Type) {
+  default:
+    llvm_unreachable("Invalid relocation type!");
+  case macho::RIT_X86_64_Signed1:
+  case macho::RIT_X86_64_Signed2:
+  case macho::RIT_X86_64_Signed4:
+  case macho::RIT_X86_64_Signed:
+  case macho::RIT_X86_64_Unsigned:
+  case macho::RIT_X86_64_Branch: {
+    Value += Addend;
+    // Mask in the target value a byte at a time (we don't have an alignment
+    // guarantee for the target address, so this is safest).
+    uint8_t *p = (uint8_t*)LocalAddress;
+    for (unsigned i = 0; i < Size; ++i) {
+      *p++ = (uint8_t)Value;
+      Value >>= 8;
+    }
+    return false;
+  }
+  case macho::RIT_X86_64_GOTLoad:
+  case macho::RIT_X86_64_GOT:
+  case macho::RIT_X86_64_Subtractor:
+  case macho::RIT_X86_64_TLV:
+    return Error("Relocation type not implemented yet!");
+  }
+}
+
+bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
+                                            uint64_t FinalAddress,
+                                            uint64_t Value,
+                                            bool isPCRel,
+                                            unsigned Type,
+                                            unsigned Size,
+                                            int64_t Addend) {
+  // If the relocation is PC-relative, the value to be encoded is the
+  // pointer difference.
+  if (isPCRel) {
+    Value -= FinalAddress;
+    // ARM PCRel relocations have an effective-PC offset of two instructions
+    // (four bytes in Thumb mode, 8 bytes in ARM mode).
+    // FIXME: For now, assume ARM mode.
+    Value -= 8;
+  }
+
+  switch(Type) {
+  default:
+    llvm_unreachable("Invalid relocation type!");
+  case macho::RIT_Vanilla: {
+    // Mask in the target value a byte at a time (we don't have an alignment
+    // guarantee for the target address, so this is safest).
+    uint8_t *p = (uint8_t*)LocalAddress;
+    for (unsigned i = 0; i < Size; ++i) {
+      *p++ = (uint8_t)Value;
+      Value >>= 8;
+    }
+    break;
+  }
+  case macho::RIT_ARM_Branch24Bit: {
+    // Mask the value into the target address. We know instructions are
+    // 32-bit aligned, so we can do it all at once.
+    uint32_t *p = (uint32_t*)LocalAddress;
+    // The low two bits of the value are not encoded.
+    Value >>= 2;
+    // Mask the value to 24 bits.
+    Value &= 0xffffff;
+    // FIXME: If the destination is a Thumb function (and the instruction
+    // is a non-predicated BL instruction), we need to change it to a BLX
+    // instruction instead.
+
+    // Insert the value into the instruction.
+    *p = (*p & ~0xffffff) | Value;
+    break;
+  }
+  case macho::RIT_ARM_ThumbBranch22Bit:
+  case macho::RIT_ARM_ThumbBranch32Bit:
+  case macho::RIT_ARM_Half:
+  case macho::RIT_ARM_HalfDifference:
+  case macho::RIT_Pair:
+  case macho::RIT_Difference:
+  case macho::RIT_ARM_LocalDifference:
+  case macho::RIT_ARM_PreboundLazyPointer:
+    return Error("Relocation type not implemented yet!");
+  }
+  return false;
+}
+
+void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
+                                            ObjectImage &Obj,
+                                            ObjSectionToIDMap &ObjSectionToID,
+                                            const SymbolTableMap &Symbols,
+                                            StubMap &Stubs) {
+
+  uint32_t RelType = (uint32_t) (Rel.Type & 0xffffffffL);
+  RelocationValueRef Value;
+  SectionEntry &Section = Sections[Rel.SectionID];
+
+  bool isExtern = (RelType >> 27) & 1;
+  if (isExtern) {
+    // Obtain the symbol name which is referenced in the relocation
+    StringRef TargetName;
+    const SymbolRef &Symbol = Rel.Symbol;
+    Symbol.getName(TargetName);
+    // First search for the symbol in the local symbol table
+    SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data());
+    if (lsi != Symbols.end()) {
+      Value.SectionID = lsi->second.first;
+      Value.Addend = lsi->second.second;
+    } else {
+      // Search for the symbol in the global symbol table
+      SymbolTableMap::const_iterator gsi = GlobalSymbolTable.find(TargetName.data());
+      if (gsi != GlobalSymbolTable.end()) {
+        Value.SectionID = gsi->second.first;
+        Value.Addend = gsi->second.second;
+      } else
+        Value.SymbolName = TargetName.data();
+    }
+  } else {
+    error_code err;
+    uint8_t sectionIndex = static_cast<uint8_t>(RelType & 0xFF);
+    section_iterator si = Obj.begin_sections(),
+                     se = Obj.end_sections();
+    for (uint8_t i = 1; i < sectionIndex; i++) {
+      error_code err;
+      si.increment(err);
+      if (si == se)
+        break;
+    }
+    assert(si != se && "No section containing relocation!");
+    Value.SectionID = findOrEmitSection(Obj, *si, true, ObjSectionToID);
+    Value.Addend = 0;
+    // FIXME: The size and type of the relocation determines if we can
+    // encode an Addend in the target location itself, and if so, how many
+    // bytes we should read in order to get it. We don't yet support doing
+    // that, and just assuming it's sizeof(intptr_t) is blatantly wrong.
+    //Value.Addend = *(const intptr_t *)Target;
+    if (Value.Addend) {
+      // The MachO addend is an offset from the current section.  We need it
+      // to be an offset from the destination section
+      Value.Addend += Section.ObjAddress - Sections[Value.SectionID].ObjAddress;
+    }
+  }
+
+  if (Arch == Triple::arm && (RelType & 0xf) == macho::RIT_ARM_Branch24Bit) {
+    // This is an ARM branch relocation, need to use a stub function.
+
+    //  Look up for existing stub.
+    StubMap::const_iterator i = Stubs.find(Value);
+    if (i != Stubs.end())
+      resolveRelocation(Section, Rel.Offset,
+                        (uint64_t)Section.Address + i->second,
+                        RelType, 0);
+    else {
+      // Create a new stub function.
+      Stubs[Value] = Section.StubOffset;
+      uint8_t *StubTargetAddr = createStubFunction(Section.Address +
+                                                   Section.StubOffset);
+      RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address,
+                         macho::RIT_Vanilla, Value.Addend);
+      if (Value.SymbolName)
+        addRelocationForSymbol(RE, Value.SymbolName);
+      else
+        addRelocationForSection(RE, Value.SectionID);
+      resolveRelocation(Section, Rel.Offset,
+                        (uint64_t)Section.Address + Section.StubOffset,
+                        RelType, 0);
+      Section.StubOffset += getMaxStubSize();
+    }
+  } else {
+    RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend);
+    if (Value.SymbolName)
+      addRelocationForSymbol(RE, Value.SymbolName);
+    else
+      addRelocationForSection(RE, Value.SectionID);
+  }
+}
+
+
+bool RuntimeDyldMachO::isCompatibleFormat(
+        const ObjectBuffer *InputBuffer) const {
+  if (InputBuffer->getBufferSize() < 4)
+    return false;
+  StringRef Magic(InputBuffer->getBufferStart(), 4);
+  if (Magic == "\xFE\xED\xFA\xCE") return true;
+  if (Magic == "\xCE\xFA\xED\xFE") return true;
+  if (Magic == "\xFE\xED\xFA\xCF") return true;
+  if (Magic == "\xCF\xFA\xED\xFE") return true;
+  return false;
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
new file mode 100644
index 000000000000..62d84870780c
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -0,0 +1,71 @@
+//===-- RuntimeDyldMachO.h - Run-time dynamic linker for MC-JIT ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachO support for MC-JIT runtime dynamic linker.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_RUNTIME_DYLD_MACHO_H
+#define LLVM_RUNTIME_DYLD_MACHO_H
+
+#include "RuntimeDyldImpl.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/Object/MachOObject.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+
+namespace llvm {
+class RuntimeDyldMachO : public RuntimeDyldImpl {
+protected:
+  bool resolveI386Relocation(uint8_t *LocalAddress,
+                             uint64_t FinalAddress,
+                             uint64_t Value,
+                             bool isPCRel,
+                             unsigned Type,
+                             unsigned Size,
+                             int64_t Addend);
+  bool resolveX86_64Relocation(uint8_t *LocalAddress,
+                               uint64_t FinalAddress,
+                               uint64_t Value,
+                               bool isPCRel,
+                               unsigned Type,
+                               unsigned Size,
+                               int64_t Addend);
+  bool resolveARMRelocation(uint8_t *LocalAddress,
+                            uint64_t FinalAddress,
+                            uint64_t Value,
+                            bool isPCRel,
+                            unsigned Type,
+                            unsigned Size,
+                            int64_t Addend);
+
+  virtual void processRelocationRef(const ObjRelocationInfo &Rel,
+                                    ObjectImage &Obj,
+                                    ObjSectionToIDMap &ObjSectionToID,
+                                    const SymbolTableMap &Symbols,
+                                    StubMap &Stubs);
+
+public:
+  virtual void resolveRelocation(const SectionEntry &Section,
+                                 uint64_t Offset,
+                                 uint64_t Value,
+                                 uint32_t Type,
+                                 int64_t Addend);
+
+  RuntimeDyldMachO(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
+
+  bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/ExecutionEngine/TargetSelect.cpp b/contrib/llvm/lib/ExecutionEngine/TargetSelect.cpp
new file mode 100644
index 000000000000..ca4330fa22b0
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/TargetSelect.cpp
@@ -0,0 +1,99 @@
+//===-- TargetSelect.cpp - Target Chooser Code ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This just asks the TargetRegistry for the appropriate target to use, and
+// allows the user to specify a specific one on the commandline with -march=x,
+// -mcpu=y, and -mattr=a,-b,+c. Clients should initialize targets prior to
+// calling selectTarget().
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+TargetMachine *EngineBuilder::selectTarget() {
+  Triple TT;
+
+  // MCJIT can generate code for remote targets, but the old JIT and Interpreter
+  // must use the host architecture.
+  if (UseMCJIT && WhichEngine != EngineKind::Interpreter && M)
+    TT.setTriple(M->getTargetTriple());
+
+  return selectTarget(TT, MArch, MCPU, MAttrs);
+}
+
+/// selectTarget - Pick a target either via -march or by guessing the native
+/// arch.  Add any CPU features specified via -mcpu or -mattr.
+TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple,
+                              StringRef MArch,
+                              StringRef MCPU,
+                              const SmallVectorImpl<std::string>& MAttrs) {
+  Triple TheTriple(TargetTriple);
+  if (TheTriple.getTriple().empty())
+    TheTriple.setTriple(sys::getProcessTriple());
+
+  // Adjust the triple to match what the user requested.
+  const Target *TheTarget = 0;
+  if (!MArch.empty()) {
+    for (TargetRegistry::iterator it = TargetRegistry::begin(),
+           ie = TargetRegistry::end(); it != ie; ++it) {
+      if (MArch == it->getName()) {
+        TheTarget = &*it;
+        break;
+      }
+    }
+
+    if (!TheTarget) {
+      if (ErrorStr)
+        *ErrorStr = "No available targets are compatible with this -march, "
+                    "see -version for the available targets.\n";
+      return 0;
+    }
+
+    // Adjust the triple to match (if known), otherwise stick with the
+    // requested/host triple.
+    Triple::ArchType Type = Triple::getArchTypeForLLVMName(MArch);
+    if (Type != Triple::UnknownArch)
+      TheTriple.setArch(Type);
+  } else {
+    std::string Error;
+    TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), Error);
+    if (TheTarget == 0) {
+      if (ErrorStr)
+        *ErrorStr = Error;
+      return 0;
+    }
+  }
+
+  // Package up features to be passed to target/subtarget
+  std::string FeaturesStr;
+  if (!MAttrs.empty()) {
+    SubtargetFeatures Features;
+    for (unsigned i = 0; i != MAttrs.size(); ++i)
+      Features.AddFeature(MAttrs[i]);
+    FeaturesStr = Features.getString();
+  }
+
+  // Allocate a target...
+  TargetMachine *Target = TheTarget->createTargetMachine(TheTriple.getTriple(),
+                                                         MCPU, FeaturesStr,
+                                                         Options,
+                                                         RelocModel, CMModel,
+                                                         OptLevel);
+  assert(Target && "Could not allocate target machine!");
+  return Target;
+}
diff --git a/contrib/llvm/lib/IR/AsmWriter.cpp b/contrib/llvm/lib/IR/AsmWriter.cpp
new file mode 100644
index 000000000000..fb591a891dae
--- /dev/null
+++ b/contrib/llvm/lib/IR/AsmWriter.cpp
@@ -0,0 +1,2236 @@
+//===-- AsmWriter.cpp - Printing LLVM as an assembly file -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library implements the functionality defined in llvm/Assembly/Writer.h
+//
+// Note that these routines must be extremely tolerant of various errors in the
+// LLVM code, because it can be used for debugging transformations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Assembly/Writer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Assembly/AssemblyAnnotationWriter.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+// Make virtual table appear in this compilation unit.
+AssemblyAnnotationWriter::~AssemblyAnnotationWriter() {}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+static const Module *getModuleFromVal(const Value *V) {
+  if (const Argument *MA = dyn_cast<Argument>(V))
+    return MA->getParent() ? MA->getParent()->getParent() : 0;
+
+  if (const BasicBlock *BB = dyn_cast<BasicBlock>(V))
+    return BB->getParent() ? BB->getParent()->getParent() : 0;
+
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    const Function *M = I->getParent() ? I->getParent()->getParent() : 0;
+    return M ? M->getParent() : 0;
+  }
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+    return GV->getParent();
+  return 0;
+}
+
+static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
+  switch (cc) {
+  default:                         Out << "cc" << cc; break;
+  case CallingConv::Fast:          Out << "fastcc"; break;
+  case CallingConv::Cold:          Out << "coldcc"; break;
+  case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
+  case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
+  case CallingConv::X86_ThisCall:  Out << "x86_thiscallcc"; break;
+  case CallingConv::Intel_OCL_BI:  Out << "intel_ocl_bicc"; break;
+  case CallingConv::ARM_APCS:      Out << "arm_apcscc"; break;
+  case CallingConv::ARM_AAPCS:     Out << "arm_aapcscc"; break;
+  case CallingConv::ARM_AAPCS_VFP: Out << "arm_aapcs_vfpcc"; break;
+  case CallingConv::MSP430_INTR:   Out << "msp430_intrcc"; break;
+  case CallingConv::PTX_Kernel:    Out << "ptx_kernel"; break;
+  case CallingConv::PTX_Device:    Out << "ptx_device"; break;
+  }
+}
+
+// PrintEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
+static void PrintEscapedString(StringRef Name, raw_ostream &Out) {
+  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+    unsigned char C = Name[i];
+    if (isprint(C) && C != '\\' && C != '"')
+      Out << C;
+    else
+      Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F);
+  }
+}
+
+enum PrefixType {
+  GlobalPrefix,
+  LabelPrefix,
+  LocalPrefix,
+  NoPrefix
+};
+
+/// PrintLLVMName - Turn the specified name into an 'LLVM name', which is either
+/// prefixed with % (if the string only contains simple characters) or is
+/// surrounded with ""'s (if it has special chars in it).  Print it out.
+static void PrintLLVMName(raw_ostream &OS, StringRef Name, PrefixType Prefix) {
+  assert(!Name.empty() && "Cannot get empty name!");
+  switch (Prefix) {
+  case NoPrefix: break;
+  case GlobalPrefix: OS << '@'; break;
+  case LabelPrefix:  break;
+  case LocalPrefix:  OS << '%'; break;
+  }
+
+  // Scan the name to see if it needs quotes first.
+  bool NeedsQuotes = isdigit(static_cast<unsigned char>(Name[0]));
+  if (!NeedsQuotes) {
+    for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+      // By making this unsigned, the value passed in to isalnum will always be
+      // in the range 0-255.  This is important when building with MSVC because
+      // its implementation will assert.  This situation can arise when dealing
+      // with UTF-8 multibyte characters.
+      unsigned char C = Name[i];
+      if (!isalnum(static_cast<unsigned char>(C)) && C != '-' && C != '.' &&
+          C != '_') {
+        NeedsQuotes = true;
+        break;
+      }
+    }
+  }
+
+  // If we didn't need any quotes, just write out the name in one blast.
+  if (!NeedsQuotes) {
+    OS << Name;
+    return;
+  }
+
+  // Okay, we need quotes.  Output the quotes and escape any scary characters as
+  // needed.
+  OS << '"';
+  PrintEscapedString(Name, OS);
+  OS << '"';
+}
+
+/// PrintLLVMName - Turn the specified name into an 'LLVM name', which is either
+/// prefixed with % (if the string only contains simple characters) or is
+/// surrounded with ""'s (if it has special chars in it).  Print it out.
+static void PrintLLVMName(raw_ostream &OS, const Value *V) {
+  PrintLLVMName(OS, V->getName(),
+                isa<GlobalValue>(V) ? GlobalPrefix : LocalPrefix);
+}
+
+//===----------------------------------------------------------------------===//
+// TypePrinting Class: Type printing machinery
+//===----------------------------------------------------------------------===//
+
+/// TypePrinting - Type printing machinery.
+namespace {
+class TypePrinting {
+  TypePrinting(const TypePrinting &) LLVM_DELETED_FUNCTION;
+  void operator=(const TypePrinting&) LLVM_DELETED_FUNCTION;
+public:
+
+  /// NamedTypes - The named types that are used by the current module.
+  TypeFinder NamedTypes;
+
+  /// NumberedTypes - The numbered types, along with their value.
+  DenseMap<StructType*, unsigned> NumberedTypes;
+
+
+  TypePrinting() {}
+  ~TypePrinting() {}
+
+  void incorporateTypes(const Module &M);
+
+  void print(Type *Ty, raw_ostream &OS);
+
+  void printStructBody(StructType *Ty, raw_ostream &OS);
+};
+} // end anonymous namespace.
+
+
+void TypePrinting::incorporateTypes(const Module &M) {
+  NamedTypes.run(M, false);
+
+  // The list of struct types we got back includes all the struct types, split
+  // the unnamed ones out to a numbering and remove the anonymous structs.
+  unsigned NextNumber = 0;
+
+  std::vector<StructType*>::iterator NextToUse = NamedTypes.begin(), I, E;
+  for (I = NamedTypes.begin(), E = NamedTypes.end(); I != E; ++I) {
+    StructType *STy = *I;
+
+    // Ignore anonymous types.
+    if (STy->isLiteral())
+      continue;
+
+    if (STy->getName().empty())
+      NumberedTypes[STy] = NextNumber++;
+    else
+      *NextToUse++ = STy;
+  }
+
+  NamedTypes.erase(NextToUse, NamedTypes.end());
+}
+
+
+/// CalcTypeName - Write the specified type to the specified raw_ostream, making
+/// use of type names or up references to shorten the type name where possible.
+void TypePrinting::print(Type *Ty, raw_ostream &OS) {
+  switch (Ty->getTypeID()) {
+  case Type::VoidTyID:      OS << "void"; break;
+  case Type::HalfTyID:      OS << "half"; break;
+  case Type::FloatTyID:     OS << "float"; break;
+  case Type::DoubleTyID:    OS << "double"; break;
+  case Type::X86_FP80TyID:  OS << "x86_fp80"; break;
+  case Type::FP128TyID:     OS << "fp128"; break;
+  case Type::PPC_FP128TyID: OS << "ppc_fp128"; break;
+  case Type::LabelTyID:     OS << "label"; break;
+  case Type::MetadataTyID:  OS << "metadata"; break;
+  case Type::X86_MMXTyID:   OS << "x86_mmx"; break;
+  case Type::IntegerTyID:
+    OS << 'i' << cast<IntegerType>(Ty)->getBitWidth();
+    return;
+
+  case Type::FunctionTyID: {
+    FunctionType *FTy = cast<FunctionType>(Ty);
+    print(FTy->getReturnType(), OS);
+    OS << " (";
+    for (FunctionType::param_iterator I = FTy->param_begin(),
+         E = FTy->param_end(); I != E; ++I) {
+      if (I != FTy->param_begin())
+        OS << ", ";
+      print(*I, OS);
+    }
+    if (FTy->isVarArg()) {
+      if (FTy->getNumParams()) OS << ", ";
+      OS << "...";
+    }
+    OS << ')';
+    return;
+  }
+  case Type::StructTyID: {
+    StructType *STy = cast<StructType>(Ty);
+
+    if (STy->isLiteral())
+      return printStructBody(STy, OS);
+
+    if (!STy->getName().empty())
+      return PrintLLVMName(OS, STy->getName(), LocalPrefix);
+
+    DenseMap<StructType*, unsigned>::iterator I = NumberedTypes.find(STy);
+    if (I != NumberedTypes.end())
+      OS << '%' << I->second;
+    else  // Not enumerated, print the hex address.
+      OS << "%\"type " << STy << '\"';
+    return;
+  }
+  case Type::PointerTyID: {
+    PointerType *PTy = cast<PointerType>(Ty);
+    print(PTy->getElementType(), OS);
+    if (unsigned AddressSpace = PTy->getAddressSpace())
+      OS << " addrspace(" << AddressSpace << ')';
+    OS << '*';
+    return;
+  }
+  case Type::ArrayTyID: {
+    ArrayType *ATy = cast<ArrayType>(Ty);
+    OS << '[' << ATy->getNumElements() << " x ";
+    print(ATy->getElementType(), OS);
+    OS << ']';
+    return;
+  }
+  case Type::VectorTyID: {
+    VectorType *PTy = cast<VectorType>(Ty);
+    OS << "<" << PTy->getNumElements() << " x ";
+    print(PTy->getElementType(), OS);
+    OS << '>';
+    return;
+  }
+  default:
+    OS << "<unrecognized-type>";
+    return;
+  }
+}
+
+void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
+  if (STy->isOpaque()) {
+    OS << "opaque";
+    return;
+  }
+
+  if (STy->isPacked())
+    OS << '<';
+
+  if (STy->getNumElements() == 0) {
+    OS << "{}";
+  } else {
+    StructType::element_iterator I = STy->element_begin();
+    OS << "{ ";
+    print(*I++, OS);
+    for (StructType::element_iterator E = STy->element_end(); I != E; ++I) {
+      OS << ", ";
+      print(*I, OS);
+    }
+
+    OS << " }";
+  }
+  if (STy->isPacked())
+    OS << '>';
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// SlotTracker Class: Enumerate slot numbers for unnamed values
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// This class provides computation of slot numbers for LLVM Assembly writing.
+///
+class SlotTracker {
+public:
+  /// ValueMap - A mapping of Values to slot numbers.
+  typedef DenseMap<const Value*, unsigned> ValueMap;
+
+private:
+  /// TheModule - The module for which we are holding slot numbers.
+  const Module* TheModule;
+
+  /// TheFunction - The function for which we are holding slot numbers.
+  const Function* TheFunction;
+  bool FunctionProcessed;
+
+  /// mMap - The slot map for the module level data.
+  ValueMap mMap;
+  unsigned mNext;
+
+  /// fMap - The slot map for the function level data.
+  ValueMap fMap;
+  unsigned fNext;
+
+  /// mdnMap - Map for MDNodes.
+  DenseMap<const MDNode*, unsigned> mdnMap;
+  unsigned mdnNext;
+
+  /// asMap - The slot map for attribute sets.
+  DenseMap<AttributeSet, unsigned> asMap;
+  unsigned asNext;
+public:
+  /// Construct from a module
+  explicit SlotTracker(const Module *M);
+  /// Construct from a function, starting out in incorp state.
+  explicit SlotTracker(const Function *F);
+
+  /// Return the slot number of the specified value in it's type
+  /// plane.  If something is not in the SlotTracker, return -1.
+  int getLocalSlot(const Value *V);
+  int getGlobalSlot(const GlobalValue *V);
+  int getMetadataSlot(const MDNode *N);
+  int getAttributeGroupSlot(AttributeSet AS);
+
+  /// If you'd like to deal with a function instead of just a module, use
+  /// this method to get its data into the SlotTracker.
+  void incorporateFunction(const Function *F) {
+    TheFunction = F;
+    FunctionProcessed = false;
+  }
+
+  /// After calling incorporateFunction, use this method to remove the
+  /// most recently incorporated function from the SlotTracker. This
+  /// will reset the state of the machine back to just the module contents.
+  void purgeFunction();
+
+  /// MDNode map iterators.
+  typedef DenseMap<const MDNode*, unsigned>::iterator mdn_iterator;
+  mdn_iterator mdn_begin() { return mdnMap.begin(); }
+  mdn_iterator mdn_end() { return mdnMap.end(); }
+  unsigned mdn_size() const { return mdnMap.size(); }
+  bool mdn_empty() const { return mdnMap.empty(); }
+
+  /// AttributeSet map iterators.
+  typedef DenseMap<AttributeSet, unsigned>::iterator as_iterator;
+  as_iterator as_begin()   { return asMap.begin(); }
+  as_iterator as_end()     { return asMap.end(); }
+  unsigned as_size() const { return asMap.size(); }
+  bool as_empty() const    { return asMap.empty(); }
+
+  /// This function does the actual initialization.
+  inline void initialize();
+
+  // Implementation Details
+private:
+  /// CreateModuleSlot - Insert the specified GlobalValue* into the slot table.
+  void CreateModuleSlot(const GlobalValue *V);
+
+  /// CreateMetadataSlot - Insert the specified MDNode* into the slot table.
+  void CreateMetadataSlot(const MDNode *N);
+
+  /// CreateFunctionSlot - Insert the specified Value* into the slot table.
+  void CreateFunctionSlot(const Value *V);
+
+  /// \brief Insert the specified AttributeSet into the slot table.
+  void CreateAttributeSetSlot(AttributeSet AS);
+
+  /// Add all of the module level global variables (and their initializers)
+  /// and function declarations, but not the contents of those functions.
+  void processModule();
+
+  /// Add all of the functions arguments, basic blocks, and instructions.
+  void processFunction();
+
+  SlotTracker(const SlotTracker &) LLVM_DELETED_FUNCTION;
+  void operator=(const SlotTracker &) LLVM_DELETED_FUNCTION;
+};
+
+}  // end anonymous namespace
+
+
+static SlotTracker *createSlotTracker(const Value *V) {
+  if (const Argument *FA = dyn_cast<Argument>(V))
+    return new SlotTracker(FA->getParent());
+
+  if (const Instruction *I = dyn_cast<Instruction>(V))
+    if (I->getParent())
+      return new SlotTracker(I->getParent()->getParent());
+
+  if (const BasicBlock *BB = dyn_cast<BasicBlock>(V))
+    return new SlotTracker(BB->getParent());
+
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return new SlotTracker(GV->getParent());
+
+  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V))
+    return new SlotTracker(GA->getParent());
+
+  if (const Function *Func = dyn_cast<Function>(V))
+    return new SlotTracker(Func);
+
+  if (const MDNode *MD = dyn_cast<MDNode>(V)) {
+    if (!MD->isFunctionLocal())
+      return new SlotTracker(MD->getFunction());
+
+    return new SlotTracker((Function *)0);
+  }
+
+  return 0;
+}
+
+#if 0
+#define ST_DEBUG(X) dbgs() << X
+#else
+#define ST_DEBUG(X)
+#endif
+
+// Module level constructor. Causes the contents of the Module (sans functions)
+// to be added to the slot table.
+SlotTracker::SlotTracker(const Module *M)
+  : TheModule(M), TheFunction(0), FunctionProcessed(false),
+    mNext(0), fNext(0),  mdnNext(0), asNext(0) {
+}
+
+// Function level constructor. Causes the contents of the Module and the one
+// function provided to be added to the slot table.
+SlotTracker::SlotTracker(const Function *F)
+  : TheModule(F ? F->getParent() : 0), TheFunction(F), FunctionProcessed(false),
+    mNext(0), fNext(0), mdnNext(0), asNext(0) {
+}
+
+inline void SlotTracker::initialize() {
+  if (TheModule) {
+    processModule();
+    TheModule = 0; ///< Prevent re-processing next time we're called.
+  }
+
+  if (TheFunction && !FunctionProcessed)
+    processFunction();
+}
+
+// Iterate through all the global variables, functions, and global
+// variable initializers and create slots for them.
+void SlotTracker::processModule() {
+  ST_DEBUG("begin processModule!\n");
+
+  // Add all of the unnamed global variables to the value table.
+  for (Module::const_global_iterator I = TheModule->global_begin(),
+         E = TheModule->global_end(); I != E; ++I) {
+    if (!I->hasName())
+      CreateModuleSlot(I);
+  }
+
+  // Add metadata used by named metadata.
+  for (Module::const_named_metadata_iterator
+         I = TheModule->named_metadata_begin(),
+         E = TheModule->named_metadata_end(); I != E; ++I) {
+    const NamedMDNode *NMD = I;
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
+      CreateMetadataSlot(NMD->getOperand(i));
+  }
+
+  for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+       I != E; ++I) {
+    if (!I->hasName())
+      // Add all the unnamed functions to the table.
+      CreateModuleSlot(I);
+
+    // Add all the function attributes to the table.
+    // FIXME: Add attributes of other objects?
+    AttributeSet FnAttrs = I->getAttributes().getFnAttributes();
+    if (FnAttrs.hasAttributes(AttributeSet::FunctionIndex))
+      CreateAttributeSetSlot(FnAttrs);
+  }
+
+  ST_DEBUG("end processModule!\n");
+}
+
+// Process the arguments, basic blocks, and instructions  of a function.
+void SlotTracker::processFunction() {
+  ST_DEBUG("begin processFunction!\n");
+  fNext = 0;
+
+  // Add all the function arguments with no names.
+  for(Function::const_arg_iterator AI = TheFunction->arg_begin(),
+      AE = TheFunction->arg_end(); AI != AE; ++AI)
+    if (!AI->hasName())
+      CreateFunctionSlot(AI);
+
+  ST_DEBUG("Inserting Instructions:\n");
+
+  SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst;
+
+  // Add all of the basic blocks and instructions with no names.
+  for (Function::const_iterator BB = TheFunction->begin(),
+       E = TheFunction->end(); BB != E; ++BB) {
+    if (!BB->hasName())
+      CreateFunctionSlot(BB);
+
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E;
+         ++I) {
+      if (!I->getType()->isVoidTy() && !I->hasName())
+        CreateFunctionSlot(I);
+
+      // Intrinsics can directly use metadata.  We allow direct calls to any
+      // llvm.foo function here, because the target may not be linked into the
+      // optimizer.
+      if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+        if (Function *F = CI->getCalledFunction())
+          if (F->getName().startswith("llvm."))
+            for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+              if (MDNode *N = dyn_cast_or_null<MDNode>(I->getOperand(i)))
+                CreateMetadataSlot(N);
+
+        // Add all the call attributes to the table.
+        AttributeSet Attrs = CI->getAttributes().getFnAttributes();
+        if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+          CreateAttributeSetSlot(Attrs);
+      } else if (const InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+        // Add all the call attributes to the table.
+        AttributeSet Attrs = II->getAttributes().getFnAttributes();
+        if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+          CreateAttributeSetSlot(Attrs);
+      }
+
+      // Process metadata attached with this instruction.
+      I->getAllMetadata(MDForInst);
+      for (unsigned i = 0, e = MDForInst.size(); i != e; ++i)
+        CreateMetadataSlot(MDForInst[i].second);
+      MDForInst.clear();
+    }
+  }
+
+  FunctionProcessed = true;
+
+  ST_DEBUG("end processFunction!\n");
+}
+
+/// Clean up after incorporating a function. This is the only way to get out of
+/// the function incorporation state that affects get*Slot/Create*Slot. Function
+/// incorporation state is indicated by TheFunction != 0.
+void SlotTracker::purgeFunction() {
+  ST_DEBUG("begin purgeFunction!\n");
+  fMap.clear(); // Simply discard the function level map
+  TheFunction = 0;
+  FunctionProcessed = false;
+  ST_DEBUG("end purgeFunction!\n");
+}
+
+/// getGlobalSlot - Get the slot number of a global value.
+int SlotTracker::getGlobalSlot(const GlobalValue *V) {
+  // Check for uninitialized state and do lazy initialization.
+  initialize();
+
+  // Find the value in the module map
+  ValueMap::iterator MI = mMap.find(V);
+  return MI == mMap.end() ? -1 : (int)MI->second;
+}
+
+/// getMetadataSlot - Get the slot number of a MDNode.
+int SlotTracker::getMetadataSlot(const MDNode *N) {
+  // Check for uninitialized state and do lazy initialization.
+  initialize();
+
+  // Find the MDNode in the module map
+  mdn_iterator MI = mdnMap.find(N);
+  return MI == mdnMap.end() ? -1 : (int)MI->second;
+}
+
+
+/// getLocalSlot - Get the slot number for a value that is local to a function.
+int SlotTracker::getLocalSlot(const Value *V) {
+  assert(!isa<Constant>(V) && "Can't get a constant or global slot with this!");
+
+  // Check for uninitialized state and do lazy initialization.
+  initialize();
+
+  ValueMap::iterator FI = fMap.find(V);
+  return FI == fMap.end() ? -1 : (int)FI->second;
+}
+
+int SlotTracker::getAttributeGroupSlot(AttributeSet AS) {
+  // Check for uninitialized state and do lazy initialization.
+  initialize();
+
+  // Find the AttributeSet in the module map.
+  as_iterator AI = asMap.find(AS);
+  return AI == asMap.end() ? -1 : (int)AI->second;
+}
+
+/// CreateModuleSlot - Insert the specified GlobalValue* into the slot table.
+void SlotTracker::CreateModuleSlot(const GlobalValue *V) {
+  assert(V && "Can't insert a null Value into SlotTracker!");
+  assert(!V->getType()->isVoidTy() && "Doesn't need a slot!");
+  assert(!V->hasName() && "Doesn't need a slot!");
+
+  unsigned DestSlot = mNext++;
+  mMap[V] = DestSlot;
+
+  ST_DEBUG("  Inserting value [" << V->getType() << "] = " << V << " slot=" <<
+           DestSlot << " [");
+  // G = Global, F = Function, A = Alias, o = other
+  ST_DEBUG((isa<GlobalVariable>(V) ? 'G' :
+            (isa<Function>(V) ? 'F' :
+             (isa<GlobalAlias>(V) ? 'A' : 'o'))) << "]\n");
+}
+
+/// CreateSlot - Create a new slot for the specified value if it has no name.
+void SlotTracker::CreateFunctionSlot(const Value *V) {
+  assert(!V->getType()->isVoidTy() && !V->hasName() && "Doesn't need a slot!");
+
+  unsigned DestSlot = fNext++;
+  fMap[V] = DestSlot;
+
+  // G = Global, F = Function, o = other
+  ST_DEBUG("  Inserting value [" << V->getType() << "] = " << V << " slot=" <<
+           DestSlot << " [o]\n");
+}
+
+/// CreateModuleSlot - Insert the specified MDNode* into the slot table.
+void SlotTracker::CreateMetadataSlot(const MDNode *N) {
+  assert(N && "Can't insert a null Value into SlotTracker!");
+
+  // Don't insert if N is a function-local metadata, these are always printed
+  // inline.
+  if (!N->isFunctionLocal()) {
+    mdn_iterator I = mdnMap.find(N);
+    if (I != mdnMap.end())
+      return;
+
+    unsigned DestSlot = mdnNext++;
+    mdnMap[N] = DestSlot;
+  }
+
+  // Recursively add any MDNodes referenced by operands.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    if (const MDNode *Op = dyn_cast_or_null<MDNode>(N->getOperand(i)))
+      CreateMetadataSlot(Op);
+}
+
+void SlotTracker::CreateAttributeSetSlot(AttributeSet AS) {
+  assert(AS.hasAttributes(AttributeSet::FunctionIndex) &&
+         "Doesn't need a slot!");
+
+  as_iterator I = asMap.find(AS);
+  if (I != asMap.end())
+    return;
+
+  unsigned DestSlot = asNext++;
+  asMap[AS] = DestSlot;
+}
+
+//===----------------------------------------------------------------------===//
+// AsmWriter Implementation
+//===----------------------------------------------------------------------===//
+
+static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
+                                   TypePrinting *TypePrinter,
+                                   SlotTracker *Machine,
+                                   const Module *Context);
+
+
+
+static const char *getPredicateText(unsigned predicate) {
+  const char * pred = "unknown";
+  switch (predicate) {
+  case FCmpInst::FCMP_FALSE: pred = "false"; break;
+  case FCmpInst::FCMP_OEQ:   pred = "oeq"; break;
+  case FCmpInst::FCMP_OGT:   pred = "ogt"; break;
+  case FCmpInst::FCMP_OGE:   pred = "oge"; break;
+  case FCmpInst::FCMP_OLT:   pred = "olt"; break;
+  case FCmpInst::FCMP_OLE:   pred = "ole"; break;
+  case FCmpInst::FCMP_ONE:   pred = "one"; break;
+  case FCmpInst::FCMP_ORD:   pred = "ord"; break;
+  case FCmpInst::FCMP_UNO:   pred = "uno"; break;
+  case FCmpInst::FCMP_UEQ:   pred = "ueq"; break;
+  case FCmpInst::FCMP_UGT:   pred = "ugt"; break;
+  case FCmpInst::FCMP_UGE:   pred = "uge"; break;
+  case FCmpInst::FCMP_ULT:   pred = "ult"; break;
+  case FCmpInst::FCMP_ULE:   pred = "ule"; break;
+  case FCmpInst::FCMP_UNE:   pred = "une"; break;
+  case FCmpInst::FCMP_TRUE:  pred = "true"; break;
+  case ICmpInst::ICMP_EQ:    pred = "eq"; break;
+  case ICmpInst::ICMP_NE:    pred = "ne"; break;
+  case ICmpInst::ICMP_SGT:   pred = "sgt"; break;
+  case ICmpInst::ICMP_SGE:   pred = "sge"; break;
+  case ICmpInst::ICMP_SLT:   pred = "slt"; break;
+  case ICmpInst::ICMP_SLE:   pred = "sle"; break;
+  case ICmpInst::ICMP_UGT:   pred = "ugt"; break;
+  case ICmpInst::ICMP_UGE:   pred = "uge"; break;
+  case ICmpInst::ICMP_ULT:   pred = "ult"; break;
+  case ICmpInst::ICMP_ULE:   pred = "ule"; break;
+  }
+  return pred;
+}
+
+static void writeAtomicRMWOperation(raw_ostream &Out,
+                                    AtomicRMWInst::BinOp Op) {
+  switch (Op) {
+  default: Out << " <unknown operation " << Op << ">"; break;
+  case AtomicRMWInst::Xchg: Out << " xchg"; break;
+  case AtomicRMWInst::Add:  Out << " add"; break;
+  case AtomicRMWInst::Sub:  Out << " sub"; break;
+  case AtomicRMWInst::And:  Out << " and"; break;
+  case AtomicRMWInst::Nand: Out << " nand"; break;
+  case AtomicRMWInst::Or:   Out << " or"; break;
+  case AtomicRMWInst::Xor:  Out << " xor"; break;
+  case AtomicRMWInst::Max:  Out << " max"; break;
+  case AtomicRMWInst::Min:  Out << " min"; break;
+  case AtomicRMWInst::UMax: Out << " umax"; break;
+  case AtomicRMWInst::UMin: Out << " umin"; break;
+  }
+}
+
+static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
+  if (const FPMathOperator *FPO = dyn_cast<const FPMathOperator>(U)) {
+    // Unsafe algebra implies all the others, no need to write them all out
+    if (FPO->hasUnsafeAlgebra())
+      Out << " fast";
+    else {
+      if (FPO->hasNoNaNs())
+        Out << " nnan";
+      if (FPO->hasNoInfs())
+        Out << " ninf";
+      if (FPO->hasNoSignedZeros())
+        Out << " nsz";
+      if (FPO->hasAllowReciprocal())
+        Out << " arcp";
+    }
+  }
+
+  if (const OverflowingBinaryOperator *OBO =
+        dyn_cast<OverflowingBinaryOperator>(U)) {
+    if (OBO->hasNoUnsignedWrap())
+      Out << " nuw";
+    if (OBO->hasNoSignedWrap())
+      Out << " nsw";
+  } else if (const PossiblyExactOperator *Div =
+               dyn_cast<PossiblyExactOperator>(U)) {
+    if (Div->isExact())
+      Out << " exact";
+  } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(U)) {
+    if (GEP->isInBounds())
+      Out << " inbounds";
+  }
+}
+
+static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
+                                  TypePrinting &TypePrinter,
+                                  SlotTracker *Machine,
+                                  const Module *Context) {
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+    if (CI->getType()->isIntegerTy(1)) {
+      Out << (CI->getZExtValue() ? "true" : "false");
+      return;
+    }
+    Out << CI->getValue();
+    return;
+  }
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+    if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle ||
+        &CFP->getValueAPF().getSemantics() == &APFloat::IEEEdouble) {
+      // We would like to output the FP constant value in exponential notation,
+      // but we cannot do this if doing so will lose precision.  Check here to
+      // make sure that we only output it in exponential format if we can parse
+      // the value back and get the same value.
+      //
+      bool ignored;
+      bool isHalf = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEhalf;
+      bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble;
+      bool isInf = CFP->getValueAPF().isInfinity();
+      bool isNaN = CFP->getValueAPF().isNaN();
+      if (!isHalf && !isInf && !isNaN) {
+        double Val = isDouble ? CFP->getValueAPF().convertToDouble() :
+                                CFP->getValueAPF().convertToFloat();
+        SmallString<128> StrVal;
+        raw_svector_ostream(StrVal) << Val;
+
+        // Check to make sure that the stringized number is not some string like
+        // "Inf" or NaN, that atof will accept, but the lexer will not.  Check
+        // that the string matches the "[-+]?[0-9]" regex.
+        //
+        if ((StrVal[0] >= '0' && StrVal[0] <= '9') ||
+            ((StrVal[0] == '-' || StrVal[0] == '+') &&
+             (StrVal[1] >= '0' && StrVal[1] <= '9'))) {
+          // Reparse stringized version!
+          if (APFloat(APFloat::IEEEdouble, StrVal).convertToDouble() == Val) {
+            Out << StrVal.str();
+            return;
+          }
+        }
+      }
+      // Otherwise we could not reparse it to exactly the same value, so we must
+      // output the string in hexadecimal format!  Note that loading and storing
+      // floating point types changes the bits of NaNs on some hosts, notably
+      // x86, so we must not use these types.
+      assert(sizeof(double) == sizeof(uint64_t) &&
+             "assuming that double is 64 bits!");
+      char Buffer[40];
+      APFloat apf = CFP->getValueAPF();
+      // Halves and floats are represented in ASCII IR as double, convert.
+      if (!isDouble)
+        apf.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+                          &ignored);
+      Out << "0x" <<
+              utohex_buffer(uint64_t(apf.bitcastToAPInt().getZExtValue()),
+                            Buffer+40);
+      return;
+    }
+
+    // Either half, or some form of long double.
+    // These appear as a magic letter identifying the type, then a
+    // fixed number of hex digits.
+    Out << "0x";
+    // Bit position, in the current word, of the next nibble to print.
+    int shiftcount;
+
+    if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended) {
+      Out << 'K';
+      // api needed to prevent premature destruction
+      APInt api = CFP->getValueAPF().bitcastToAPInt();
+      const uint64_t* p = api.getRawData();
+      uint64_t word = p[1];
+      shiftcount = 12;
+      int width = api.getBitWidth();
+      for (int j=0; j<width; j+=4, shiftcount-=4) {
+        unsigned int nibble = (word>>shiftcount) & 15;
+        if (nibble < 10)
+          Out << (unsigned char)(nibble + '0');
+        else
+          Out << (unsigned char)(nibble - 10 + 'A');
+        if (shiftcount == 0 && j+4 < width) {
+          word = *p;
+          shiftcount = 64;
+          if (width-j-4 < 64)
+            shiftcount = width-j-4;
+        }
+      }
+      return;
+    } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad) {
+      shiftcount = 60;
+      Out << 'L';
+    } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble) {
+      shiftcount = 60;
+      Out << 'M';
+    } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf) {
+      shiftcount = 12;
+      Out << 'H';
+    } else
+      llvm_unreachable("Unsupported floating point type");
+    // api needed to prevent premature destruction
+    APInt api = CFP->getValueAPF().bitcastToAPInt();
+    const uint64_t* p = api.getRawData();
+    uint64_t word = *p;
+    int width = api.getBitWidth();
+    for (int j=0; j<width; j+=4, shiftcount-=4) {
+      unsigned int nibble = (word>>shiftcount) & 15;
+      if (nibble < 10)
+        Out << (unsigned char)(nibble + '0');
+      else
+        Out << (unsigned char)(nibble - 10 + 'A');
+      if (shiftcount == 0 && j+4 < width) {
+        word = *(++p);
+        shiftcount = 64;
+        if (width-j-4 < 64)
+          shiftcount = width-j-4;
+      }
+    }
+    return;
+  }
+
+  if (isa<ConstantAggregateZero>(CV)) {
+    Out << "zeroinitializer";
+    return;
+  }
+
+  if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) {
+    Out << "blockaddress(";
+    WriteAsOperandInternal(Out, BA->getFunction(), &TypePrinter, Machine,
+                           Context);
+    Out << ", ";
+    WriteAsOperandInternal(Out, BA->getBasicBlock(), &TypePrinter, Machine,
+                           Context);
+    Out << ")";
+    return;
+  }
+
+  if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
+    Type *ETy = CA->getType()->getElementType();
+    Out << '[';
+    TypePrinter.print(ETy, Out);
+    Out << ' ';
+    WriteAsOperandInternal(Out, CA->getOperand(0),
+                           &TypePrinter, Machine,
+                           Context);
+    for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i) {
+      Out << ", ";
+      TypePrinter.print(ETy, Out);
+      Out << ' ';
+      WriteAsOperandInternal(Out, CA->getOperand(i), &TypePrinter, Machine,
+                             Context);
+    }
+    Out << ']';
+    return;
+  }
+
+  if (const ConstantDataArray *CA = dyn_cast<ConstantDataArray>(CV)) {
+    // As a special case, print the array as a string if it is an array of
+    // i8 with ConstantInt values.
+    if (CA->isString()) {
+      Out << "c\"";
+      PrintEscapedString(CA->getAsString(), Out);
+      Out << '"';
+      return;
+    }
+
+    Type *ETy = CA->getType()->getElementType();
+    Out << '[';
+    TypePrinter.print(ETy, Out);
+    Out << ' ';
+    WriteAsOperandInternal(Out, CA->getElementAsConstant(0),
+                           &TypePrinter, Machine,
+                           Context);
+    for (unsigned i = 1, e = CA->getNumElements(); i != e; ++i) {
+      Out << ", ";
+      TypePrinter.print(ETy, Out);
+      Out << ' ';
+      WriteAsOperandInternal(Out, CA->getElementAsConstant(i), &TypePrinter,
+                             Machine, Context);
+    }
+    Out << ']';
+    return;
+  }
+
+
+  if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) {
+    if (CS->getType()->isPacked())
+      Out << '<';
+    Out << '{';
+    unsigned N = CS->getNumOperands();
+    if (N) {
+      Out << ' ';
+      TypePrinter.print(CS->getOperand(0)->getType(), Out);
+      Out << ' ';
+
+      WriteAsOperandInternal(Out, CS->getOperand(0), &TypePrinter, Machine,
+                             Context);
+
+      for (unsigned i = 1; i < N; i++) {
+        Out << ", ";
+        TypePrinter.print(CS->getOperand(i)->getType(), Out);
+        Out << ' ';
+
+        WriteAsOperandInternal(Out, CS->getOperand(i), &TypePrinter, Machine,
+                               Context);
+      }
+      Out << ' ';
+    }
+
+    Out << '}';
+    if (CS->getType()->isPacked())
+      Out << '>';
+    return;
+  }
+
+  if (isa<ConstantVector>(CV) || isa<ConstantDataVector>(CV)) {
+    Type *ETy = CV->getType()->getVectorElementType();
+    Out << '<';
+    TypePrinter.print(ETy, Out);
+    Out << ' ';
+    WriteAsOperandInternal(Out, CV->getAggregateElement(0U), &TypePrinter,
+                           Machine, Context);
+    for (unsigned i = 1, e = CV->getType()->getVectorNumElements(); i != e;++i){
+      Out << ", ";
+      TypePrinter.print(ETy, Out);
+      Out << ' ';
+      WriteAsOperandInternal(Out, CV->getAggregateElement(i), &TypePrinter,
+                             Machine, Context);
+    }
+    Out << '>';
+    return;
+  }
+
+  if (isa<ConstantPointerNull>(CV)) {
+    Out << "null";
+    return;
+  }
+
+  if (isa<UndefValue>(CV)) {
+    Out << "undef";
+    return;
+  }
+
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+    Out << CE->getOpcodeName();
+    WriteOptimizationInfo(Out, CE);
+    if (CE->isCompare())
+      Out << ' ' << getPredicateText(CE->getPredicate());
+    Out << " (";
+
+    for (User::const_op_iterator OI=CE->op_begin(); OI != CE->op_end(); ++OI) {
+      TypePrinter.print((*OI)->getType(), Out);
+      Out << ' ';
+      WriteAsOperandInternal(Out, *OI, &TypePrinter, Machine, Context);
+      if (OI+1 != CE->op_end())
+        Out << ", ";
+    }
+
+    if (CE->hasIndices()) {
+      ArrayRef<unsigned> Indices = CE->getIndices();
+      for (unsigned i = 0, e = Indices.size(); i != e; ++i)
+        Out << ", " << Indices[i];
+    }
+
+    if (CE->isCast()) {
+      Out << " to ";
+      TypePrinter.print(CE->getType(), Out);
+    }
+
+    Out << ')';
+    return;
+  }
+
+  Out << "<placeholder or erroneous Constant>";
+}
+
+static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
+                                    TypePrinting *TypePrinter,
+                                    SlotTracker *Machine,
+                                    const Module *Context) {
+  Out << "!{";
+  for (unsigned mi = 0, me = Node->getNumOperands(); mi != me; ++mi) {
+    const Value *V = Node->getOperand(mi);
+    if (V == 0)
+      Out << "null";
+    else {
+      TypePrinter->print(V->getType(), Out);
+      Out << ' ';
+      WriteAsOperandInternal(Out, Node->getOperand(mi),
+                             TypePrinter, Machine, Context);
+    }
+    if (mi + 1 != me)
+      Out << ", ";
+  }
+
+  Out << "}";
+}
+
+
+/// WriteAsOperand - Write the name of the specified value out to the specified
+/// ostream.  This can be useful when you just want to print int %reg126, not
+/// the whole instruction that generated it.
+///
+static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
+                                   TypePrinting *TypePrinter,
+                                   SlotTracker *Machine,
+                                   const Module *Context) {
+  if (V->hasName()) {
+    PrintLLVMName(Out, V);
+    return;
+  }
+
+  const Constant *CV = dyn_cast<Constant>(V);
+  if (CV && !isa<GlobalValue>(CV)) {
+    assert(TypePrinter && "Constants require TypePrinting!");
+    WriteConstantInternal(Out, CV, *TypePrinter, Machine, Context);
+    return;
+  }
+
+  if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+    Out << "asm ";
+    if (IA->hasSideEffects())
+      Out << "sideeffect ";
+    if (IA->isAlignStack())
+      Out << "alignstack ";
+    // We don't emit the AD_ATT dialect as it's the assumed default.
+    if (IA->getDialect() == InlineAsm::AD_Intel)
+      Out << "inteldialect ";
+    Out << '"';
+    PrintEscapedString(IA->getAsmString(), Out);
+    Out << "\", \"";
+    PrintEscapedString(IA->getConstraintString(), Out);
+    Out << '"';
+    return;
+  }
+
+  if (const MDNode *N = dyn_cast<MDNode>(V)) {
+    if (N->isFunctionLocal()) {
+      // Print metadata inline, not via slot reference number.
+      WriteMDNodeBodyInternal(Out, N, TypePrinter, Machine, Context);
+      return;
+    }
+
+    if (!Machine) {
+      if (N->isFunctionLocal())
+        Machine = new SlotTracker(N->getFunction());
+      else
+        Machine = new SlotTracker(Context);
+    }
+    int Slot = Machine->getMetadataSlot(N);
+    if (Slot == -1)
+      Out << "<badref>";
+    else
+      Out << '!' << Slot;
+    return;
+  }
+
+  if (const MDString *MDS = dyn_cast<MDString>(V)) {
+    Out << "!\"";
+    PrintEscapedString(MDS->getString(), Out);
+    Out << '"';
+    return;
+  }
+
+  if (V->getValueID() == Value::PseudoSourceValueVal ||
+      V->getValueID() == Value::FixedStackPseudoSourceValueVal) {
+    V->print(Out);
+    return;
+  }
+
+  char Prefix = '%';
+  int Slot;
+  // If we have a SlotTracker, use it.
+  if (Machine) {
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+      Slot = Machine->getGlobalSlot(GV);
+      Prefix = '@';
+    } else {
+      Slot = Machine->getLocalSlot(V);
+
+      // If the local value didn't succeed, then we may be referring to a value
+      // from a different function.  Translate it, as this can happen when using
+      // address of blocks.
+      if (Slot == -1)
+        if ((Machine = createSlotTracker(V))) {
+          Slot = Machine->getLocalSlot(V);
+          delete Machine;
+        }
+    }
+  } else if ((Machine = createSlotTracker(V))) {
+    // Otherwise, create one to get the # and then destroy it.
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+      Slot = Machine->getGlobalSlot(GV);
+      Prefix = '@';
+    } else {
+      Slot = Machine->getLocalSlot(V);
+    }
+    delete Machine;
+    Machine = 0;
+  } else {
+    Slot = -1;
+  }
+
+  if (Slot != -1)
+    Out << Prefix << Slot;
+  else
+    Out << "<badref>";
+}
+
+void llvm::WriteAsOperand(raw_ostream &Out, const Value *V,
+                          bool PrintType, const Module *Context) {
+
+  // Fast path: Don't construct and populate a TypePrinting object if we
+  // won't be needing any types printed.
+  if (!PrintType &&
+      ((!isa<Constant>(V) && !isa<MDNode>(V)) ||
+       V->hasName() || isa<GlobalValue>(V))) {
+    WriteAsOperandInternal(Out, V, 0, 0, Context);
+    return;
+  }
+
+  if (Context == 0) Context = getModuleFromVal(V);
+
+  TypePrinting TypePrinter;
+  if (Context)
+    TypePrinter.incorporateTypes(*Context);
+  if (PrintType) {
+    TypePrinter.print(V->getType(), Out);
+    Out << ' ';
+  }
+
+  WriteAsOperandInternal(Out, V, &TypePrinter, 0, Context);
+}
+
+namespace {
+
+class AssemblyWriter {
+  formatted_raw_ostream &Out;
+  SlotTracker &Machine;
+  const Module *TheModule;
+  TypePrinting TypePrinter;
+  AssemblyAnnotationWriter *AnnotationWriter;
+
+public:
+  inline AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
+                        const Module *M,
+                        AssemblyAnnotationWriter *AAW)
+    : Out(o), Machine(Mac), TheModule(M), AnnotationWriter(AAW) {
+    if (M)
+      TypePrinter.incorporateTypes(*M);
+  }
+
+  void printMDNodeBody(const MDNode *MD);
+  void printNamedMDNode(const NamedMDNode *NMD);
+
+  void printModule(const Module *M);
+
+  void writeOperand(const Value *Op, bool PrintType);
+  void writeParamOperand(const Value *Operand, AttributeSet Attrs,unsigned Idx);
+  void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope);
+
+  void writeAllMDNodes();
+  void writeAllAttributeGroups();
+
+  void printTypeIdentities();
+  void printGlobal(const GlobalVariable *GV);
+  void printAlias(const GlobalAlias *GV);
+  void printFunction(const Function *F);
+  void printArgument(const Argument *FA, AttributeSet Attrs, unsigned Idx);
+  void printBasicBlock(const BasicBlock *BB);
+  void printInstruction(const Instruction &I);
+
+private:
+  // printInfoComment - Print a little comment after the instruction indicating
+  // which slot it occupies.
+  void printInfoComment(const Value &V);
+};
+}  // end of anonymous namespace
+
+void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
+  if (Operand == 0) {
+    Out << "<null operand!>";
+    return;
+  }
+  if (PrintType) {
+    TypePrinter.print(Operand->getType(), Out);
+    Out << ' ';
+  }
+  WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
+}
+
+void AssemblyWriter::writeAtomic(AtomicOrdering Ordering,
+                                 SynchronizationScope SynchScope) {
+  if (Ordering == NotAtomic)
+    return;
+
+  switch (SynchScope) {
+  case SingleThread: Out << " singlethread"; break;
+  case CrossThread: break;
+  }
+
+  switch (Ordering) {
+  default: Out << " <bad ordering " << int(Ordering) << ">"; break;
+  case Unordered: Out << " unordered"; break;
+  case Monotonic: Out << " monotonic"; break;
+  case Acquire: Out << " acquire"; break;
+  case Release: Out << " release"; break;
+  case AcquireRelease: Out << " acq_rel"; break;
+  case SequentiallyConsistent: Out << " seq_cst"; break;
+  }
+}
+
+void AssemblyWriter::writeParamOperand(const Value *Operand,
+                                       AttributeSet Attrs, unsigned Idx) {
+  if (Operand == 0) {
+    Out << "<null operand!>";
+    return;
+  }
+
+  // Print the type
+  TypePrinter.print(Operand->getType(), Out);
+  // Print parameter attributes list
+  if (Attrs.hasAttributes(Idx))
+    Out << ' ' << Attrs.getAsString(Idx);
+  Out << ' ';
+  // Print the operand
+  WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
+}
+
+void AssemblyWriter::printModule(const Module *M) {
+  Machine.initialize();
+
+  if (!M->getModuleIdentifier().empty() &&
+      // Don't print the ID if it will start a new line (which would
+      // require a comment char before it).
+      M->getModuleIdentifier().find('\n') == std::string::npos)
+    Out << "; ModuleID = '" << M->getModuleIdentifier() << "'\n";
+
+  if (!M->getDataLayout().empty())
+    Out << "target datalayout = \"" << M->getDataLayout() << "\"\n";
+  if (!M->getTargetTriple().empty())
+    Out << "target triple = \"" << M->getTargetTriple() << "\"\n";
+
+  if (!M->getModuleInlineAsm().empty()) {
+    // Split the string into lines, to make it easier to read the .ll file.
+    std::string Asm = M->getModuleInlineAsm();
+    size_t CurPos = 0;
+    size_t NewLine = Asm.find_first_of('\n', CurPos);
+    Out << '\n';
+    while (NewLine != std::string::npos) {
+      // We found a newline, print the portion of the asm string from the
+      // last newline up to this newline.
+      Out << "module asm \"";
+      PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine),
+                         Out);
+      Out << "\"\n";
+      CurPos = NewLine+1;
+      NewLine = Asm.find_first_of('\n', CurPos);
+    }
+    std::string rest(Asm.begin()+CurPos, Asm.end());
+    if (!rest.empty()) {
+      Out << "module asm \"";
+      PrintEscapedString(rest, Out);
+      Out << "\"\n";
+    }
+  }
+
+  printTypeIdentities();
+
+  // Output all globals.
+  if (!M->global_empty()) Out << '\n';
+  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+       I != E; ++I) {
+    printGlobal(I); Out << '\n';
+  }
+
+  // Output all aliases.
+  if (!M->alias_empty()) Out << "\n";
+  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+       I != E; ++I)
+    printAlias(I);
+
+  // Output all of the functions.
+  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
+    printFunction(I);
+
+  // Output all attribute groups.
+  if (!Machine.as_empty()) {
+    Out << '\n';
+    writeAllAttributeGroups();
+  }
+
+  // Output named metadata.
+  if (!M->named_metadata_empty()) Out << '\n';
+
+  for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
+       E = M->named_metadata_end(); I != E; ++I)
+    printNamedMDNode(I);
+
+  // Output metadata.
+  if (!Machine.mdn_empty()) {
+    Out << '\n';
+    writeAllMDNodes();
+  }
+}
+
+void AssemblyWriter::printNamedMDNode(const NamedMDNode *NMD) {
+  Out << '!';
+  StringRef Name = NMD->getName();
+  if (Name.empty()) {
+    Out << "<empty name> ";
+  } else {
+    if (isalpha(static_cast<unsigned char>(Name[0])) ||
+        Name[0] == '-' || Name[0] == '$' ||
+        Name[0] == '.' || Name[0] == '_')
+      Out << Name[0];
+    else
+      Out << '\\' << hexdigit(Name[0] >> 4) << hexdigit(Name[0] & 0x0F);
+    for (unsigned i = 1, e = Name.size(); i != e; ++i) {
+      unsigned char C = Name[i];
+      if (isalnum(static_cast<unsigned char>(C)) || C == '-' || C == '$' ||
+          C == '.' || C == '_')
+        Out << C;
+      else
+        Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F);
+    }
+  }
+  Out << " = !{";
+  for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+    if (i) Out << ", ";
+    int Slot = Machine.getMetadataSlot(NMD->getOperand(i));
+    if (Slot == -1)
+      Out << "<badref>";
+    else
+      Out << '!' << Slot;
+  }
+  Out << "}\n";
+}
+
+
+static void PrintLinkage(GlobalValue::LinkageTypes LT,
+                         formatted_raw_ostream &Out) {
+  switch (LT) {
+  case GlobalValue::ExternalLinkage: break;
+  case GlobalValue::PrivateLinkage:       Out << "private ";        break;
+  case GlobalValue::LinkerPrivateLinkage: Out << "linker_private "; break;
+  case GlobalValue::LinkerPrivateWeakLinkage:
+    Out << "linker_private_weak ";
+    break;
+  case GlobalValue::InternalLinkage:      Out << "internal ";       break;
+  case GlobalValue::LinkOnceAnyLinkage:   Out << "linkonce ";       break;
+  case GlobalValue::LinkOnceODRLinkage:   Out << "linkonce_odr ";   break;
+  case GlobalValue::LinkOnceODRAutoHideLinkage:
+    Out << "linkonce_odr_auto_hide ";
+    break;
+  case GlobalValue::WeakAnyLinkage:       Out << "weak ";           break;
+  case GlobalValue::WeakODRLinkage:       Out << "weak_odr ";       break;
+  case GlobalValue::CommonLinkage:        Out << "common ";         break;
+  case GlobalValue::AppendingLinkage:     Out << "appending ";      break;
+  case GlobalValue::DLLImportLinkage:     Out << "dllimport ";      break;
+  case GlobalValue::DLLExportLinkage:     Out << "dllexport ";      break;
+  case GlobalValue::ExternalWeakLinkage:  Out << "extern_weak ";    break;
+  case GlobalValue::AvailableExternallyLinkage:
+    Out << "available_externally ";
+    break;
+  }
+}
+
+
+static void PrintVisibility(GlobalValue::VisibilityTypes Vis,
+                            formatted_raw_ostream &Out) {
+  switch (Vis) {
+  case GlobalValue::DefaultVisibility: break;
+  case GlobalValue::HiddenVisibility:    Out << "hidden "; break;
+  case GlobalValue::ProtectedVisibility: Out << "protected "; break;
+  }
+}
+
+static void PrintThreadLocalModel(GlobalVariable::ThreadLocalMode TLM,
+                                  formatted_raw_ostream &Out) {
+  switch (TLM) {
+    case GlobalVariable::NotThreadLocal:
+      break;
+    case GlobalVariable::GeneralDynamicTLSModel:
+      Out << "thread_local ";
+      break;
+    case GlobalVariable::LocalDynamicTLSModel:
+      Out << "thread_local(localdynamic) ";
+      break;
+    case GlobalVariable::InitialExecTLSModel:
+      Out << "thread_local(initialexec) ";
+      break;
+    case GlobalVariable::LocalExecTLSModel:
+      Out << "thread_local(localexec) ";
+      break;
+  }
+}
+
+void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
+  if (GV->isMaterializable())
+    Out << "; Materializable\n";
+
+  WriteAsOperandInternal(Out, GV, &TypePrinter, &Machine, GV->getParent());
+  Out << " = ";
+
+  if (!GV->hasInitializer() && GV->hasExternalLinkage())
+    Out << "external ";
+
+  PrintLinkage(GV->getLinkage(), Out);
+  PrintVisibility(GV->getVisibility(), Out);
+  PrintThreadLocalModel(GV->getThreadLocalMode(), Out);
+
+  if (unsigned AddressSpace = GV->getType()->getAddressSpace())
+    Out << "addrspace(" << AddressSpace << ") ";
+  if (GV->hasUnnamedAddr()) Out << "unnamed_addr ";
+  if (GV->isExternallyInitialized()) Out << "externally_initialized ";
+  Out << (GV->isConstant() ? "constant " : "global ");
+  TypePrinter.print(GV->getType()->getElementType(), Out);
+
+  if (GV->hasInitializer()) {
+    Out << ' ';
+    writeOperand(GV->getInitializer(), false);
+  }
+
+  if (GV->hasSection()) {
+    Out << ", section \"";
+    PrintEscapedString(GV->getSection(), Out);
+    Out << '"';
+  }
+  if (GV->getAlignment())
+    Out << ", align " << GV->getAlignment();
+
+  printInfoComment(*GV);
+}
+
+void AssemblyWriter::printAlias(const GlobalAlias *GA) {
+  if (GA->isMaterializable())
+    Out << "; Materializable\n";
+
+  // Don't crash when dumping partially built GA
+  if (!GA->hasName())
+    Out << "<<nameless>> = ";
+  else {
+    PrintLLVMName(Out, GA);
+    Out << " = ";
+  }
+  PrintVisibility(GA->getVisibility(), Out);
+
+  Out << "alias ";
+
+  PrintLinkage(GA->getLinkage(), Out);
+
+  const Constant *Aliasee = GA->getAliasee();
+
+  if (Aliasee == 0) {
+    TypePrinter.print(GA->getType(), Out);
+    Out << " <<NULL ALIASEE>>";
+  } else {
+    writeOperand(Aliasee, !isa<ConstantExpr>(Aliasee));
+  }
+
+  printInfoComment(*GA);
+  Out << '\n';
+}
+
+void AssemblyWriter::printTypeIdentities() {
+  if (TypePrinter.NumberedTypes.empty() &&
+      TypePrinter.NamedTypes.empty())
+    return;
+
+  Out << '\n';
+
+  // We know all the numbers that each type is used and we know that it is a
+  // dense assignment.  Convert the map to an index table.
+  std::vector<StructType*> NumberedTypes(TypePrinter.NumberedTypes.size());
+  for (DenseMap<StructType*, unsigned>::iterator I =
+       TypePrinter.NumberedTypes.begin(), E = TypePrinter.NumberedTypes.end();
+       I != E; ++I) {
+    assert(I->second < NumberedTypes.size() && "Didn't get a dense numbering?");
+    NumberedTypes[I->second] = I->first;
+  }
+
+  // Emit all numbered types.
+  for (unsigned i = 0, e = NumberedTypes.size(); i != e; ++i) {
+    Out << '%' << i << " = type ";
+
+    // Make sure we print out at least one level of the type structure, so
+    // that we do not get %2 = type %2
+    TypePrinter.printStructBody(NumberedTypes[i], Out);
+    Out << '\n';
+  }
+
+  for (unsigned i = 0, e = TypePrinter.NamedTypes.size(); i != e; ++i) {
+    PrintLLVMName(Out, TypePrinter.NamedTypes[i]->getName(), LocalPrefix);
+    Out << " = type ";
+
+    // Make sure we print out at least one level of the type structure, so
+    // that we do not get %FILE = type %FILE
+    TypePrinter.printStructBody(TypePrinter.NamedTypes[i], Out);
+    Out << '\n';
+  }
+}
+
+/// printFunction - Print all aspects of a function.
+///
+void AssemblyWriter::printFunction(const Function *F) {
+  // Print out the return type and name.
+  Out << '\n';
+
+  if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out);
+
+  if (F->isMaterializable())
+    Out << "; Materializable\n";
+
+  if (F->isDeclaration())
+    Out << "declare ";
+  else
+    Out << "define ";
+
+  PrintLinkage(F->getLinkage(), Out);
+  PrintVisibility(F->getVisibility(), Out);
+
+  // Print the calling convention.
+  if (F->getCallingConv() != CallingConv::C) {
+    PrintCallingConv(F->getCallingConv(), Out);
+    Out << " ";
+  }
+
+  FunctionType *FT = F->getFunctionType();
+  const AttributeSet &Attrs = F->getAttributes();
+  if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
+    Out <<  Attrs.getAsString(AttributeSet::ReturnIndex) << ' ';
+  TypePrinter.print(F->getReturnType(), Out);
+  Out << ' ';
+  WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
+  Out << '(';
+  Machine.incorporateFunction(F);
+
+  // Loop over the arguments, printing them...
+
+  unsigned Idx = 1;
+  if (!F->isDeclaration()) {
+    // If this isn't a declaration, print the argument names as well.
+    for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+         I != E; ++I) {
+      // Insert commas as we go... the first arg doesn't get a comma
+      if (I != F->arg_begin()) Out << ", ";
+      printArgument(I, Attrs, Idx);
+      Idx++;
+    }
+  } else {
+    // Otherwise, print the types from the function type.
+    for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
+      // Insert commas as we go... the first arg doesn't get a comma
+      if (i) Out << ", ";
+
+      // Output type...
+      TypePrinter.print(FT->getParamType(i), Out);
+
+      if (Attrs.hasAttributes(i+1))
+        Out << ' ' << Attrs.getAsString(i+1);
+    }
+  }
+
+  // Finish printing arguments...
+  if (FT->isVarArg()) {
+    if (FT->getNumParams()) Out << ", ";
+    Out << "...";  // Output varargs portion of signature!
+  }
+  Out << ')';
+  if (F->hasUnnamedAddr())
+    Out << " unnamed_addr";
+  if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+    Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttributes());
+  if (F->hasSection()) {
+    Out << " section \"";
+    PrintEscapedString(F->getSection(), Out);
+    Out << '"';
+  }
+  if (F->getAlignment())
+    Out << " align " << F->getAlignment();
+  if (F->hasGC())
+    Out << " gc \"" << F->getGC() << '"';
+  if (F->isDeclaration()) {
+    Out << '\n';
+  } else {
+    Out << " {";
+    // Output all of the function's basic blocks.
+    for (Function::const_iterator I = F->begin(), E = F->end(); I != E; ++I)
+      printBasicBlock(I);
+
+    Out << "}\n";
+  }
+
+  Machine.purgeFunction();
+}
+
+/// printArgument - This member is called for every argument that is passed into
+/// the function.  Simply print it out
+///
+void AssemblyWriter::printArgument(const Argument *Arg,
+                                   AttributeSet Attrs, unsigned Idx) {
+  // Output type...
+  TypePrinter.print(Arg->getType(), Out);
+
+  // Output parameter attributes list
+  if (Attrs.hasAttributes(Idx))
+    Out << ' ' << Attrs.getAsString(Idx);
+
+  // Output name, if available...
+  if (Arg->hasName()) {
+    Out << ' ';
+    PrintLLVMName(Out, Arg);
+  }
+}
+
+/// printBasicBlock - This member is called for each basic block in a method.
+///
+void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
+  if (BB->hasName()) {              // Print out the label if it exists...
+    Out << "\n";
+    PrintLLVMName(Out, BB->getName(), LabelPrefix);
+    Out << ':';
+  } else if (!BB->use_empty()) {      // Don't print block # of no uses...
+    Out << "\n; <label>:";
+    int Slot = Machine.getLocalSlot(BB);
+    if (Slot != -1)
+      Out << Slot;
+    else
+      Out << "<badref>";
+  }
+
+  if (BB->getParent() == 0) {
+    Out.PadToColumn(50);
+    Out << "; Error: Block without parent!";
+  } else if (BB != &BB->getParent()->getEntryBlock()) {  // Not the entry block?
+    // Output predecessors for the block.
+    Out.PadToColumn(50);
+    Out << ";";
+    const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+    if (PI == PE) {
+      Out << " No predecessors!";
+    } else {
+      Out << " preds = ";
+      writeOperand(*PI, false);
+      for (++PI; PI != PE; ++PI) {
+        Out << ", ";
+        writeOperand(*PI, false);
+      }
+    }
+  }
+
+  Out << "\n";
+
+  if (AnnotationWriter) AnnotationWriter->emitBasicBlockStartAnnot(BB, Out);
+
+  // Output all of the instructions in the basic block...
+  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    printInstruction(*I);
+    Out << '\n';
+  }
+
+  if (AnnotationWriter) AnnotationWriter->emitBasicBlockEndAnnot(BB, Out);
+}
+
+/// printInfoComment - Print a little comment after the instruction indicating
+/// which slot it occupies.
+///
+void AssemblyWriter::printInfoComment(const Value &V) {
+  if (AnnotationWriter) {
+    AnnotationWriter->printInfoComment(V, Out);
+    return;
+  }
+}
+
+// This member is called for each Instruction in a function..
+void AssemblyWriter::printInstruction(const Instruction &I) {
+  if (AnnotationWriter) AnnotationWriter->emitInstructionAnnot(&I, Out);
+
+  // Print out indentation for an instruction.
+  Out << "  ";
+
+  // Print out name if it exists...
+  if (I.hasName()) {
+    PrintLLVMName(Out, &I);
+    Out << " = ";
+  } else if (!I.getType()->isVoidTy()) {
+    // Print out the def slot taken.
+    int SlotNum = Machine.getLocalSlot(&I);
+    if (SlotNum == -1)
+      Out << "<badref> = ";
+    else
+      Out << '%' << SlotNum << " = ";
+  }
+
+  if (isa<CallInst>(I) && cast<CallInst>(I).isTailCall())
+    Out << "tail ";
+
+  // Print out the opcode...
+  Out << I.getOpcodeName();
+
+  // If this is an atomic load or store, print out the atomic marker.
+  if ((isa<LoadInst>(I)  && cast<LoadInst>(I).isAtomic()) ||
+      (isa<StoreInst>(I) && cast<StoreInst>(I).isAtomic()))
+    Out << " atomic";
+
+  // If this is a volatile operation, print out the volatile marker.
+  if ((isa<LoadInst>(I)  && cast<LoadInst>(I).isVolatile()) ||
+      (isa<StoreInst>(I) && cast<StoreInst>(I).isVolatile()) ||
+      (isa<AtomicCmpXchgInst>(I) && cast<AtomicCmpXchgInst>(I).isVolatile()) ||
+      (isa<AtomicRMWInst>(I) && cast<AtomicRMWInst>(I).isVolatile()))
+    Out << " volatile";
+
+  // Print out optimization information.
+  WriteOptimizationInfo(Out, &I);
+
+  // Print out the compare instruction predicates
+  if (const CmpInst *CI = dyn_cast<CmpInst>(&I))
+    Out << ' ' << getPredicateText(CI->getPredicate());
+
+  // Print out the atomicrmw operation
+  if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&I))
+    writeAtomicRMWOperation(Out, RMWI->getOperation());
+
+  // Print out the type of the operands...
+  const Value *Operand = I.getNumOperands() ? I.getOperand(0) : 0;
+
+  // Special case conditional branches to swizzle the condition out to the front
+  if (isa<BranchInst>(I) && cast<BranchInst>(I).isConditional()) {
+    const BranchInst &BI(cast<BranchInst>(I));
+    Out << ' ';
+    writeOperand(BI.getCondition(), true);
+    Out << ", ";
+    writeOperand(BI.getSuccessor(0), true);
+    Out << ", ";
+    writeOperand(BI.getSuccessor(1), true);
+
+  } else if (isa<SwitchInst>(I)) {
+    const SwitchInst& SI(cast<SwitchInst>(I));
+    // Special case switch instruction to get formatting nice and correct.
+    Out << ' ';
+    writeOperand(SI.getCondition(), true);
+    Out << ", ";
+    writeOperand(SI.getDefaultDest(), true);
+    Out << " [";
+    for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
+         i != e; ++i) {
+      Out << "\n    ";
+      writeOperand(i.getCaseValue(), true);
+      Out << ", ";
+      writeOperand(i.getCaseSuccessor(), true);
+    }
+    Out << "\n  ]";
+  } else if (isa<IndirectBrInst>(I)) {
+    // Special case indirectbr instruction to get formatting nice and correct.
+    Out << ' ';
+    writeOperand(Operand, true);
+    Out << ", [";
+
+    for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i) {
+      if (i != 1)
+        Out << ", ";
+      writeOperand(I.getOperand(i), true);
+    }
+    Out << ']';
+  } else if (const PHINode *PN = dyn_cast<PHINode>(&I)) {
+    Out << ' ';
+    TypePrinter.print(I.getType(), Out);
+    Out << ' ';
+
+    for (unsigned op = 0, Eop = PN->getNumIncomingValues(); op < Eop; ++op) {
+      if (op) Out << ", ";
+      Out << "[ ";
+      writeOperand(PN->getIncomingValue(op), false); Out << ", ";
+      writeOperand(PN->getIncomingBlock(op), false); Out << " ]";
+    }
+  } else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(&I)) {
+    Out << ' ';
+    writeOperand(I.getOperand(0), true);
+    for (const unsigned *i = EVI->idx_begin(), *e = EVI->idx_end(); i != e; ++i)
+      Out << ", " << *i;
+  } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(&I)) {
+    Out << ' ';
+    writeOperand(I.getOperand(0), true); Out << ", ";
+    writeOperand(I.getOperand(1), true);
+    for (const unsigned *i = IVI->idx_begin(), *e = IVI->idx_end(); i != e; ++i)
+      Out << ", " << *i;
+  } else if (const LandingPadInst *LPI = dyn_cast<LandingPadInst>(&I)) {
+    Out << ' ';
+    TypePrinter.print(I.getType(), Out);
+    Out << " personality ";
+    writeOperand(I.getOperand(0), true); Out << '\n';
+
+    if (LPI->isCleanup())
+      Out << "          cleanup";
+
+    for (unsigned i = 0, e = LPI->getNumClauses(); i != e; ++i) {
+      if (i != 0 || LPI->isCleanup()) Out << "\n";
+      if (LPI->isCatch(i))
+        Out << "          catch ";
+      else
+        Out << "          filter ";
+
+      writeOperand(LPI->getClause(i), true);
+    }
+  } else if (isa<ReturnInst>(I) && !Operand) {
+    Out << " void";
+  } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+    // Print the calling convention being used.
+    if (CI->getCallingConv() != CallingConv::C) {
+      Out << " ";
+      PrintCallingConv(CI->getCallingConv(), Out);
+    }
+
+    Operand = CI->getCalledValue();
+    PointerType *PTy = cast<PointerType>(Operand->getType());
+    FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+    Type *RetTy = FTy->getReturnType();
+    const AttributeSet &PAL = CI->getAttributes();
+
+    if (PAL.hasAttributes(AttributeSet::ReturnIndex))
+      Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex);
+
+    // If possible, print out the short form of the call instruction.  We can
+    // only do this if the first argument is a pointer to a nonvararg function,
+    // and if the return type is not a pointer to a function.
+    //
+    Out << ' ';
+    if (!FTy->isVarArg() &&
+        (!RetTy->isPointerTy() ||
+         !cast<PointerType>(RetTy)->getElementType()->isFunctionTy())) {
+      TypePrinter.print(RetTy, Out);
+      Out << ' ';
+      writeOperand(Operand, false);
+    } else {
+      writeOperand(Operand, true);
+    }
+    Out << '(';
+    for (unsigned op = 0, Eop = CI->getNumArgOperands(); op < Eop; ++op) {
+      if (op > 0)
+        Out << ", ";
+      writeParamOperand(CI->getArgOperand(op), PAL, op + 1);
+    }
+    Out << ')';
+    if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+      Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
+  } else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+    Operand = II->getCalledValue();
+    PointerType *PTy = cast<PointerType>(Operand->getType());
+    FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+    Type *RetTy = FTy->getReturnType();
+    const AttributeSet &PAL = II->getAttributes();
+
+    // Print the calling convention being used.
+    if (II->getCallingConv() != CallingConv::C) {
+      Out << " ";
+      PrintCallingConv(II->getCallingConv(), Out);
+    }
+
+    if (PAL.hasAttributes(AttributeSet::ReturnIndex))
+      Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex);
+
+    // If possible, print out the short form of the invoke instruction. We can
+    // only do this if the first argument is a pointer to a nonvararg function,
+    // and if the return type is not a pointer to a function.
+    //
+    Out << ' ';
+    if (!FTy->isVarArg() &&
+        (!RetTy->isPointerTy() ||
+         !cast<PointerType>(RetTy)->getElementType()->isFunctionTy())) {
+      TypePrinter.print(RetTy, Out);
+      Out << ' ';
+      writeOperand(Operand, false);
+    } else {
+      writeOperand(Operand, true);
+    }
+    Out << '(';
+    for (unsigned op = 0, Eop = II->getNumArgOperands(); op < Eop; ++op) {
+      if (op)
+        Out << ", ";
+      writeParamOperand(II->getArgOperand(op), PAL, op + 1);
+    }
+
+    Out << ')';
+    if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+      Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
+
+    Out << "\n          to ";
+    writeOperand(II->getNormalDest(), true);
+    Out << " unwind ";
+    writeOperand(II->getUnwindDest(), true);
+
+  } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+    Out << ' ';
+    TypePrinter.print(AI->getAllocatedType(), Out);
+    if (!AI->getArraySize() || AI->isArrayAllocation()) {
+      Out << ", ";
+      writeOperand(AI->getArraySize(), true);
+    }
+    if (AI->getAlignment()) {
+      Out << ", align " << AI->getAlignment();
+    }
+  } else if (isa<CastInst>(I)) {
+    if (Operand) {
+      Out << ' ';
+      writeOperand(Operand, true);   // Work with broken code
+    }
+    Out << " to ";
+    TypePrinter.print(I.getType(), Out);
+  } else if (isa<VAArgInst>(I)) {
+    if (Operand) {
+      Out << ' ';
+      writeOperand(Operand, true);   // Work with broken code
+    }
+    Out << ", ";
+    TypePrinter.print(I.getType(), Out);
+  } else if (Operand) {   // Print the normal way.
+
+    // PrintAllTypes - Instructions who have operands of all the same type
+    // omit the type from all but the first operand.  If the instruction has
+    // different type operands (for example br), then they are all printed.
+    bool PrintAllTypes = false;
+    Type *TheType = Operand->getType();
+
+    // Select, Store and ShuffleVector always print all types.
+    if (isa<SelectInst>(I) || isa<StoreInst>(I) || isa<ShuffleVectorInst>(I)
+        || isa<ReturnInst>(I)) {
+      PrintAllTypes = true;
+    } else {
+      for (unsigned i = 1, E = I.getNumOperands(); i != E; ++i) {
+        Operand = I.getOperand(i);
+        // note that Operand shouldn't be null, but the test helps make dump()
+        // more tolerant of malformed IR
+        if (Operand && Operand->getType() != TheType) {
+          PrintAllTypes = true;    // We have differing types!  Print them all!
+          break;
+        }
+      }
+    }
+
+    if (!PrintAllTypes) {
+      Out << ' ';
+      TypePrinter.print(TheType, Out);
+    }
+
+    Out << ' ';
+    for (unsigned i = 0, E = I.getNumOperands(); i != E; ++i) {
+      if (i) Out << ", ";
+      writeOperand(I.getOperand(i), PrintAllTypes);
+    }
+  }
+
+  // Print atomic ordering/alignment for memory operations
+  if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+    if (LI->isAtomic())
+      writeAtomic(LI->getOrdering(), LI->getSynchScope());
+    if (LI->getAlignment())
+      Out << ", align " << LI->getAlignment();
+  } else if (const StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+    if (SI->isAtomic())
+      writeAtomic(SI->getOrdering(), SI->getSynchScope());
+    if (SI->getAlignment())
+      Out << ", align " << SI->getAlignment();
+  } else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+    writeAtomic(CXI->getOrdering(), CXI->getSynchScope());
+  } else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&I)) {
+    writeAtomic(RMWI->getOrdering(), RMWI->getSynchScope());
+  } else if (const FenceInst *FI = dyn_cast<FenceInst>(&I)) {
+    writeAtomic(FI->getOrdering(), FI->getSynchScope());
+  }
+
+  // Print Metadata info.
+  SmallVector<std::pair<unsigned, MDNode*>, 4> InstMD;
+  I.getAllMetadata(InstMD);
+  if (!InstMD.empty()) {
+    SmallVector<StringRef, 8> MDNames;
+    I.getType()->getContext().getMDKindNames(MDNames);
+    for (unsigned i = 0, e = InstMD.size(); i != e; ++i) {
+      unsigned Kind = InstMD[i].first;
+       if (Kind < MDNames.size()) {
+         Out << ", !" << MDNames[Kind];
+      } else {
+        Out << ", !<unknown kind #" << Kind << ">";
+      }
+      Out << ' ';
+      WriteAsOperandInternal(Out, InstMD[i].second, &TypePrinter, &Machine,
+                             TheModule);
+    }
+  }
+  printInfoComment(I);
+}
+
+static void WriteMDNodeComment(const MDNode *Node,
+                               formatted_raw_ostream &Out) {
+  if (Node->getNumOperands() < 1)
+    return;
+
+  Value *Op = Node->getOperand(0);
+  if (!Op || !isa<ConstantInt>(Op) || cast<ConstantInt>(Op)->getBitWidth() < 32)
+    return;
+
+  DIDescriptor Desc(Node);
+  if (!Desc.Verify())
+    return;
+
+  unsigned Tag = Desc.getTag();
+  Out.PadToColumn(50);
+  if (dwarf::TagString(Tag)) {
+    Out << "; ";
+    Desc.print(Out);
+  } else if (Tag == dwarf::DW_TAG_user_base) {
+    Out << "; [ DW_TAG_user_base ]";
+  }
+}
+
+void AssemblyWriter::writeAllMDNodes() {
+  SmallVector<const MDNode *, 16> Nodes;
+  Nodes.resize(Machine.mdn_size());
+  for (SlotTracker::mdn_iterator I = Machine.mdn_begin(), E = Machine.mdn_end();
+       I != E; ++I)
+    Nodes[I->second] = cast<MDNode>(I->first);
+
+  for (unsigned i = 0, e = Nodes.size(); i != e; ++i) {
+    Out << '!' << i << " = metadata ";
+    printMDNodeBody(Nodes[i]);
+  }
+}
+
+void AssemblyWriter::printMDNodeBody(const MDNode *Node) {
+  WriteMDNodeBodyInternal(Out, Node, &TypePrinter, &Machine, TheModule);
+  WriteMDNodeComment(Node, Out);
+  Out << "\n";
+}
+
+void AssemblyWriter::writeAllAttributeGroups() {
+  std::vector<std::pair<AttributeSet, unsigned> > asVec;
+  asVec.resize(Machine.as_size());
+
+  for (SlotTracker::as_iterator I = Machine.as_begin(), E = Machine.as_end();
+       I != E; ++I)
+    asVec[I->second] = *I;
+
+  for (std::vector<std::pair<AttributeSet, unsigned> >::iterator
+         I = asVec.begin(), E = asVec.end(); I != E; ++I)
+    Out << "attributes #" << I->second << " = { "
+        << I->first.getAsString(AttributeSet::FunctionIndex, true) << " }\n";
+}
+
+//===----------------------------------------------------------------------===//
+//                       External Interface declarations
+//===----------------------------------------------------------------------===//
+
+void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const {
+  SlotTracker SlotTable(this);
+  formatted_raw_ostream OS(ROS);
+  AssemblyWriter W(OS, SlotTable, this, AAW);
+  W.printModule(this);
+}
+
+void NamedMDNode::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const {
+  SlotTracker SlotTable(getParent());
+  formatted_raw_ostream OS(ROS);
+  AssemblyWriter W(OS, SlotTable, getParent(), AAW);
+  W.printNamedMDNode(this);
+}
+
+void Type::print(raw_ostream &OS) const {
+  if (this == 0) {
+    OS << "<null Type>";
+    return;
+  }
+  TypePrinting TP;
+  TP.print(const_cast<Type*>(this), OS);
+
+  // If the type is a named struct type, print the body as well.
+  if (StructType *STy = dyn_cast<StructType>(const_cast<Type*>(this)))
+    if (!STy->isLiteral()) {
+      OS << " = type ";
+      TP.printStructBody(STy, OS);
+    }
+}
+
+void Value::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW) const {
+  if (this == 0) {
+    ROS << "printing a <null> value\n";
+    return;
+  }
+  formatted_raw_ostream OS(ROS);
+  if (const Instruction *I = dyn_cast<Instruction>(this)) {
+    const Function *F = I->getParent() ? I->getParent()->getParent() : 0;
+    SlotTracker SlotTable(F);
+    AssemblyWriter W(OS, SlotTable, getModuleFromVal(I), AAW);
+    W.printInstruction(*I);
+  } else if (const BasicBlock *BB = dyn_cast<BasicBlock>(this)) {
+    SlotTracker SlotTable(BB->getParent());
+    AssemblyWriter W(OS, SlotTable, getModuleFromVal(BB), AAW);
+    W.printBasicBlock(BB);
+  } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(this)) {
+    SlotTracker SlotTable(GV->getParent());
+    AssemblyWriter W(OS, SlotTable, GV->getParent(), AAW);
+    if (const GlobalVariable *V = dyn_cast<GlobalVariable>(GV))
+      W.printGlobal(V);
+    else if (const Function *F = dyn_cast<Function>(GV))
+      W.printFunction(F);
+    else
+      W.printAlias(cast<GlobalAlias>(GV));
+  } else if (const MDNode *N = dyn_cast<MDNode>(this)) {
+    const Function *F = N->getFunction();
+    SlotTracker SlotTable(F);
+    AssemblyWriter W(OS, SlotTable, F ? F->getParent() : 0, AAW);
+    W.printMDNodeBody(N);
+  } else if (const Constant *C = dyn_cast<Constant>(this)) {
+    TypePrinting TypePrinter;
+    TypePrinter.print(C->getType(), OS);
+    OS << ' ';
+    WriteConstantInternal(OS, C, TypePrinter, 0, 0);
+  } else if (isa<InlineAsm>(this) || isa<MDString>(this) ||
+             isa<Argument>(this)) {
+    WriteAsOperand(OS, this, true, 0);
+  } else {
+    // Otherwise we don't know what it is. Call the virtual function to
+    // allow a subclass to print itself.
+    printCustom(OS);
+  }
+}
+
+// Value::printCustom - subclasses should override this to implement printing.
+void Value::printCustom(raw_ostream &OS) const {
+  llvm_unreachable("Unknown value to print out!");
+}
+
+// Value::dump - allow easy printing of Values from the debugger.
+void Value::dump() const { print(dbgs()); dbgs() << '\n'; }
+
+// Type::dump - allow easy printing of Types from the debugger.
+void Type::dump() const { print(dbgs()); }
+
+// Module::dump() - Allow printing of Modules from the debugger.
+void Module::dump() const { print(dbgs(), 0); }
+
+// NamedMDNode::dump() - Allow printing of NamedMDNodes from the debugger.
+void NamedMDNode::dump() const { print(dbgs(), 0); }
diff --git a/contrib/llvm/lib/IR/AttributeImpl.h b/contrib/llvm/lib/IR/AttributeImpl.h
new file mode 100644
index 000000000000..ad2670dade12
--- /dev/null
+++ b/contrib/llvm/lib/IR/AttributeImpl.h
@@ -0,0 +1,278 @@
+//===-- AttributeImpl.h - Attribute Internals -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file defines various helper methods and classes used by
+/// LLVMContextImpl for creating and managing attributes.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ATTRIBUTESIMPL_H
+#define LLVM_ATTRIBUTESIMPL_H
+
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/IR/Attributes.h"
+#include <string>
+
+namespace llvm {
+
+class Constant;
+class LLVMContext;
+
+//===----------------------------------------------------------------------===//
+/// \class
+/// \brief A set of classes that contain the kind and (optional) value of the
+/// attribute object. There are three main categories: enum attribute entries,
+/// represented by Attribute::AttrKind; alignment attribute entries; and string
+/// attribute enties, which are for target-dependent attributes.
+class AttributeEntry {
+  unsigned char KindID;
+protected:
+  enum AttrEntryKind {
+    EnumAttrEntry,
+    AlignAttrEntry,
+    StringAttrEntry
+  };
+public:
+  AttributeEntry(AttrEntryKind Kind)
+    : KindID(Kind) {}
+  virtual ~AttributeEntry() {}
+
+  unsigned getKindID() const { return KindID; }
+
+  static inline bool classof(const AttributeEntry *) { return true; }
+};
+
+class EnumAttributeEntry : public AttributeEntry {
+  Attribute::AttrKind Kind;
+public:
+  EnumAttributeEntry(Attribute::AttrKind Kind)
+    : AttributeEntry(EnumAttrEntry), Kind(Kind) {}
+
+  Attribute::AttrKind getEnumKind() const { return Kind; }
+
+  static inline bool classof(const AttributeEntry *AE) {
+    return AE->getKindID() == EnumAttrEntry;
+  }
+  static inline bool classof(const EnumAttributeEntry *) { return true; }
+};
+
+class AlignAttributeEntry : public AttributeEntry {
+  Attribute::AttrKind Kind;
+  unsigned Align;
+public:
+  AlignAttributeEntry(Attribute::AttrKind Kind, unsigned Align)
+    : AttributeEntry(AlignAttrEntry), Kind(Kind), Align(Align) {}
+
+  Attribute::AttrKind getEnumKind() const { return Kind; }
+  unsigned getAlignment() const { return Align; }
+
+  static inline bool classof(const AttributeEntry *AE) {
+    return AE->getKindID() == AlignAttrEntry;
+  }
+  static inline bool classof(const AlignAttributeEntry *) { return true; }
+};
+
+class StringAttributeEntry : public AttributeEntry {
+  std::string Kind;
+  std::string Val;
+public:
+  StringAttributeEntry(StringRef Kind, StringRef Val = StringRef())
+    : AttributeEntry(StringAttrEntry), Kind(Kind), Val(Val) {}
+
+  StringRef getStringKind() const { return Kind; }
+  StringRef getStringValue() const { return Val; }
+
+  static inline bool classof(const AttributeEntry *AE) {
+    return AE->getKindID() == StringAttrEntry;
+  }
+  static inline bool classof(const StringAttributeEntry *) { return true; }
+};
+
+//===----------------------------------------------------------------------===//
+/// \class
+/// \brief This class represents a single, uniqued attribute. That attribute
+/// could be a single enum, a tuple, or a string.
+class AttributeImpl : public FoldingSetNode {
+  LLVMContext &Context;  ///< Global context for uniquing objects
+
+  AttributeEntry *Entry; ///< Holds the kind and value of the attribute
+
+  // AttributesImpl is uniqued, these should not be publicly available.
+  void operator=(const AttributeImpl &) LLVM_DELETED_FUNCTION;
+  AttributeImpl(const AttributeImpl &) LLVM_DELETED_FUNCTION;
+public:
+  AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind);
+  AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind, unsigned Align);
+  AttributeImpl(LLVMContext &C, StringRef Kind, StringRef Val = StringRef());
+  ~AttributeImpl();
+
+  LLVMContext &getContext() { return Context; }
+
+  bool isEnumAttribute() const;
+  bool isAlignAttribute() const;
+  bool isStringAttribute() const;
+
+  bool hasAttribute(Attribute::AttrKind A) const;
+  bool hasAttribute(StringRef Kind) const;
+
+  Attribute::AttrKind getKindAsEnum() const;
+  uint64_t getValueAsInt() const;
+
+  StringRef getKindAsString() const;
+  StringRef getValueAsString() const;
+
+  /// \brief Used when sorting the attributes.
+  bool operator<(const AttributeImpl &AI) const;
+
+  void Profile(FoldingSetNodeID &ID) const {
+    if (isEnumAttribute())
+      Profile(ID, getKindAsEnum(), 0);
+    else if (isAlignAttribute())
+      Profile(ID, getKindAsEnum(), getValueAsInt());
+    else
+      Profile(ID, getKindAsString(), getValueAsString());
+  }
+  static void Profile(FoldingSetNodeID &ID, Attribute::AttrKind Kind,
+                      uint64_t Val) {
+    ID.AddInteger(Kind);
+    if (Val) ID.AddInteger(Val);
+  }
+  static void Profile(FoldingSetNodeID &ID, StringRef Kind, StringRef Values) {
+    ID.AddString(Kind);
+    if (!Values.empty()) ID.AddString(Values);
+  }
+
+  // FIXME: Remove this!
+  static uint64_t getAttrMask(Attribute::AttrKind Val);
+};
+
+//===----------------------------------------------------------------------===//
+/// \class
+/// \brief This class represents a group of attributes that apply to one
+/// element: function, return type, or parameter.
+class AttributeSetNode : public FoldingSetNode {
+  SmallVector<Attribute, 4> AttrList;
+
+  AttributeSetNode(ArrayRef<Attribute> Attrs)
+    : AttrList(Attrs.begin(), Attrs.end()) {}
+
+  // AttributesSetNode is uniqued, these should not be publicly available.
+  void operator=(const AttributeSetNode &) LLVM_DELETED_FUNCTION;
+  AttributeSetNode(const AttributeSetNode &) LLVM_DELETED_FUNCTION;
+public:
+  static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
+
+  bool hasAttribute(Attribute::AttrKind Kind) const;
+  bool hasAttribute(StringRef Kind) const;
+  bool hasAttributes() const { return !AttrList.empty(); }
+
+  Attribute getAttribute(Attribute::AttrKind Kind) const;
+  Attribute getAttribute(StringRef Kind) const;
+
+  unsigned getAlignment() const;
+  unsigned getStackAlignment() const;
+  std::string getAsString(bool InAttrGrp) const;
+
+  typedef SmallVectorImpl<Attribute>::iterator       iterator;
+  typedef SmallVectorImpl<Attribute>::const_iterator const_iterator;
+
+  iterator begin() { return AttrList.begin(); }
+  iterator end()   { return AttrList.end(); }
+
+  const_iterator begin() const { return AttrList.begin(); }
+  const_iterator end() const   { return AttrList.end(); }
+
+  void Profile(FoldingSetNodeID &ID) const {
+    Profile(ID, AttrList);
+  }
+  static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
+    for (unsigned I = 0, E = AttrList.size(); I != E; ++I)
+      AttrList[I].Profile(ID);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+/// \class
+/// \brief This class represents a set of attributes that apply to the function,
+/// return type, and parameters.
+class AttributeSetImpl : public FoldingSetNode {
+  friend class AttributeSet;
+
+  LLVMContext &Context;
+
+  typedef std::pair<unsigned, AttributeSetNode*> IndexAttrPair;
+  SmallVector<IndexAttrPair, 4> AttrNodes;
+
+  // AttributesSet is uniqued, these should not be publicly available.
+  void operator=(const AttributeSetImpl &) LLVM_DELETED_FUNCTION;
+  AttributeSetImpl(const AttributeSetImpl &) LLVM_DELETED_FUNCTION;
+public:
+  AttributeSetImpl(LLVMContext &C,
+                   ArrayRef<std::pair<unsigned, AttributeSetNode*> > attrs)
+    : Context(C), AttrNodes(attrs.begin(), attrs.end()) {}
+
+  /// \brief Get the context that created this AttributeSetImpl.
+  LLVMContext &getContext() { return Context; }
+
+  /// \brief Return the number of attributes this AttributeSet contains.
+  unsigned getNumAttributes() const { return AttrNodes.size(); }
+
+  /// \brief Get the index of the given "slot" in the AttrNodes list. This index
+  /// is the index of the return, parameter, or function object that the
+  /// attributes are applied to, not the index into the AttrNodes list where the
+  /// attributes reside.
+  uint64_t getSlotIndex(unsigned Slot) const {
+    return AttrNodes[Slot].first;
+  }
+
+  /// \brief Retrieve the attributes for the given "slot" in the AttrNode list.
+  /// \p Slot is an index into the AttrNodes list, not the index of the return /
+  /// parameter/ function which the attributes apply to.
+  AttributeSet getSlotAttributes(unsigned Slot) const {
+    return AttributeSet::get(Context, AttrNodes[Slot]);
+  }
+
+  /// \brief Retrieve the attribute set node for the given "slot" in the
+  /// AttrNode list.
+  AttributeSetNode *getSlotNode(unsigned Slot) const {
+    return AttrNodes[Slot].second;
+  }
+
+  typedef AttributeSetNode::iterator       iterator;
+  typedef AttributeSetNode::const_iterator const_iterator;
+
+  iterator begin(unsigned Idx)
+    { return AttrNodes[Idx].second->begin(); }
+  iterator end(unsigned Idx)
+    { return AttrNodes[Idx].second->end(); }
+
+  const_iterator begin(unsigned Idx) const
+    { return AttrNodes[Idx].second->begin(); }
+  const_iterator end(unsigned Idx) const
+    { return AttrNodes[Idx].second->end(); }
+
+  void Profile(FoldingSetNodeID &ID) const {
+    Profile(ID, AttrNodes);
+  }
+  static void Profile(FoldingSetNodeID &ID,
+                      ArrayRef<std::pair<unsigned, AttributeSetNode*> > Nodes) {
+    for (unsigned i = 0, e = Nodes.size(); i != e; ++i) {
+      ID.AddInteger(Nodes[i].first);
+      ID.AddPointer(Nodes[i].second);
+    }
+  }
+
+  // FIXME: This atrocity is temporary.
+  uint64_t Raw(uint64_t Index) const;
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/IR/Attributes.cpp b/contrib/llvm/lib/IR/Attributes.cpp
new file mode 100644
index 000000000000..2d828914cdca
--- /dev/null
+++ b/contrib/llvm/lib/IR/Attributes.cpp
@@ -0,0 +1,1180 @@
+//===-- Attributes.cpp - Implement AttributesList -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief This file implements the Attribute, AttributeImpl, AttrBuilder,
+// AttributeSetImpl, and AttributeSet classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Attributes.h"
+#include "AttributeImpl.h"
+#include "LLVMContextImpl.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Atomic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Attribute Construction Methods
+//===----------------------------------------------------------------------===//
+
+Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind,
+                         uint64_t Val) {
+  LLVMContextImpl *pImpl = Context.pImpl;
+  FoldingSetNodeID ID;
+  ID.AddInteger(Kind);
+  if (Val) ID.AddInteger(Val);
+
+  void *InsertPoint;
+  AttributeImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint);
+
+  if (!PA) {
+    // If we didn't find any existing attributes of the same shape then create a
+    // new one and insert it.
+    PA = !Val ?
+      new AttributeImpl(Context, Kind) :
+      new AttributeImpl(Context, Kind, Val);
+    pImpl->AttrsSet.InsertNode(PA, InsertPoint);
+  }
+
+  // Return the Attribute that we found or created.
+  return Attribute(PA);
+}
+
+Attribute Attribute::get(LLVMContext &Context, StringRef Kind, StringRef Val) {
+  LLVMContextImpl *pImpl = Context.pImpl;
+  FoldingSetNodeID ID;
+  ID.AddString(Kind);
+  if (!Val.empty()) ID.AddString(Val);
+
+  void *InsertPoint;
+  AttributeImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint);
+
+  if (!PA) {
+    // If we didn't find any existing attributes of the same shape then create a
+    // new one and insert it.
+    PA = new AttributeImpl(Context, Kind, Val);
+    pImpl->AttrsSet.InsertNode(PA, InsertPoint);
+  }
+
+  // Return the Attribute that we found or created.
+  return Attribute(PA);
+}
+
+Attribute Attribute::getWithAlignment(LLVMContext &Context, uint64_t Align) {
+  assert(isPowerOf2_32(Align) && "Alignment must be a power of two.");
+  assert(Align <= 0x40000000 && "Alignment too large.");
+  return get(Context, Alignment, Align);
+}
+
+Attribute Attribute::getWithStackAlignment(LLVMContext &Context,
+                                           uint64_t Align) {
+  assert(isPowerOf2_32(Align) && "Alignment must be a power of two.");
+  assert(Align <= 0x100 && "Alignment too large.");
+  return get(Context, StackAlignment, Align);
+}
+
+//===----------------------------------------------------------------------===//
+// Attribute Accessor Methods
+//===----------------------------------------------------------------------===//
+
+bool Attribute::isEnumAttribute() const {
+  return pImpl && pImpl->isEnumAttribute();
+}
+
+bool Attribute::isAlignAttribute() const {
+  return pImpl && pImpl->isAlignAttribute();
+}
+
+bool Attribute::isStringAttribute() const {
+  return pImpl && pImpl->isStringAttribute();
+}
+
+Attribute::AttrKind Attribute::getKindAsEnum() const {
+  assert((isEnumAttribute() || isAlignAttribute()) &&
+         "Invalid attribute type to get the kind as an enum!");
+  return pImpl ? pImpl->getKindAsEnum() : None;
+}
+
+uint64_t Attribute::getValueAsInt() const {
+  assert(isAlignAttribute() &&
+         "Expected the attribute to be an alignment attribute!");
+  return pImpl ? pImpl->getValueAsInt() : 0;
+}
+
+StringRef Attribute::getKindAsString() const {
+  assert(isStringAttribute() &&
+         "Invalid attribute type to get the kind as a string!");
+  return pImpl ? pImpl->getKindAsString() : StringRef();
+}
+
+StringRef Attribute::getValueAsString() const {
+  assert(isStringAttribute() &&
+         "Invalid attribute type to get the value as a string!");
+  return pImpl ? pImpl->getValueAsString() : StringRef();
+}
+
+bool Attribute::hasAttribute(AttrKind Kind) const {
+  return (pImpl && pImpl->hasAttribute(Kind)) || (!pImpl && Kind == None);
+}
+
+bool Attribute::hasAttribute(StringRef Kind) const {
+  if (!isStringAttribute()) return false;
+  return pImpl && pImpl->hasAttribute(Kind);
+}
+
+/// This returns the alignment field of an attribute as a byte alignment value.
+unsigned Attribute::getAlignment() const {
+  assert(hasAttribute(Attribute::Alignment) &&
+         "Trying to get alignment from non-alignment attribute!");
+  return pImpl->getValueAsInt();
+}
+
+/// This returns the stack alignment field of an attribute as a byte alignment
+/// value.
+unsigned Attribute::getStackAlignment() const {
+  assert(hasAttribute(Attribute::StackAlignment) &&
+         "Trying to get alignment from non-alignment attribute!");
+  return pImpl->getValueAsInt();
+}
+
+std::string Attribute::getAsString(bool InAttrGrp) const {
+  if (!pImpl) return "";
+
+  if (hasAttribute(Attribute::SanitizeAddress))
+    return "sanitize_address";
+  if (hasAttribute(Attribute::AlwaysInline))
+    return "alwaysinline";
+  if (hasAttribute(Attribute::ByVal))
+    return "byval";
+  if (hasAttribute(Attribute::InlineHint))
+    return "inlinehint";
+  if (hasAttribute(Attribute::InReg))
+    return "inreg";
+  if (hasAttribute(Attribute::MinSize))
+    return "minsize";
+  if (hasAttribute(Attribute::Naked))
+    return "naked";
+  if (hasAttribute(Attribute::Nest))
+    return "nest";
+  if (hasAttribute(Attribute::NoAlias))
+    return "noalias";
+  if (hasAttribute(Attribute::NoBuiltin))
+    return "nobuiltin";
+  if (hasAttribute(Attribute::NoCapture))
+    return "nocapture";
+  if (hasAttribute(Attribute::NoDuplicate))
+    return "noduplicate";
+  if (hasAttribute(Attribute::NoImplicitFloat))
+    return "noimplicitfloat";
+  if (hasAttribute(Attribute::NoInline))
+    return "noinline";
+  if (hasAttribute(Attribute::NonLazyBind))
+    return "nonlazybind";
+  if (hasAttribute(Attribute::NoRedZone))
+    return "noredzone";
+  if (hasAttribute(Attribute::NoReturn))
+    return "noreturn";
+  if (hasAttribute(Attribute::NoUnwind))
+    return "nounwind";
+  if (hasAttribute(Attribute::OptimizeForSize))
+    return "optsize";
+  if (hasAttribute(Attribute::ReadNone))
+    return "readnone";
+  if (hasAttribute(Attribute::ReadOnly))
+    return "readonly";
+  if (hasAttribute(Attribute::ReturnsTwice))
+    return "returns_twice";
+  if (hasAttribute(Attribute::SExt))
+    return "signext";
+  if (hasAttribute(Attribute::StackProtect))
+    return "ssp";
+  if (hasAttribute(Attribute::StackProtectReq))
+    return "sspreq";
+  if (hasAttribute(Attribute::StackProtectStrong))
+    return "sspstrong";
+  if (hasAttribute(Attribute::StructRet))
+    return "sret";
+  if (hasAttribute(Attribute::SanitizeThread))
+    return "sanitize_thread";
+  if (hasAttribute(Attribute::SanitizeMemory))
+    return "sanitize_memory";
+  if (hasAttribute(Attribute::UWTable))
+    return "uwtable";
+  if (hasAttribute(Attribute::ZExt))
+    return "zeroext";
+
+  // FIXME: These should be output like this:
+  //
+  //   align=4
+  //   alignstack=8
+  //
+  if (hasAttribute(Attribute::Alignment)) {
+    std::string Result;
+    Result += "align";
+    Result += (InAttrGrp) ? "=" : " ";
+    Result += utostr(getValueAsInt());
+    return Result;
+  }
+
+  if (hasAttribute(Attribute::StackAlignment)) {
+    std::string Result;
+    Result += "alignstack";
+    if (InAttrGrp) {
+      Result += "=";
+      Result += utostr(getValueAsInt());
+    } else {
+      Result += "(";
+      Result += utostr(getValueAsInt());
+      Result += ")";
+    }
+    return Result;
+  }
+
+  // Convert target-dependent attributes to strings of the form:
+  //
+  //   "kind"
+  //   "kind" = "value"
+  //
+  if (isStringAttribute()) {
+    std::string Result;
+    Result += '\"' + getKindAsString().str() + '"';
+
+    StringRef Val = pImpl->getValueAsString();
+    if (Val.empty()) return Result;
+
+    Result += "=\"" + Val.str() + '"';
+    return Result;
+  }
+
+  llvm_unreachable("Unknown attribute");
+}
+
+bool Attribute::operator<(Attribute A) const {
+  if (!pImpl && !A.pImpl) return false;
+  if (!pImpl) return true;
+  if (!A.pImpl) return false;
+  return *pImpl < *A.pImpl;
+}
+
+//===----------------------------------------------------------------------===//
+// AttributeImpl Definition
+//===----------------------------------------------------------------------===//
+
+AttributeImpl::AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind)
+  : Context(C), Entry(new EnumAttributeEntry(Kind)) {}
+
+AttributeImpl::AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind,
+                             unsigned Align)
+  : Context(C) {
+  assert((Kind == Attribute::Alignment || Kind == Attribute::StackAlignment) &&
+         "Wrong kind for alignment attribute!");
+  Entry = new AlignAttributeEntry(Kind, Align);
+}
+
+AttributeImpl::AttributeImpl(LLVMContext &C, StringRef Kind, StringRef Val)
+  : Context(C), Entry(new StringAttributeEntry(Kind, Val)) {}
+
+AttributeImpl::~AttributeImpl() {
+  delete Entry;
+}
+
+bool AttributeImpl::isEnumAttribute() const {
+  return isa<EnumAttributeEntry>(Entry);
+}
+
+bool AttributeImpl::isAlignAttribute() const {
+  return isa<AlignAttributeEntry>(Entry);
+}
+
+bool AttributeImpl::isStringAttribute() const {
+  return isa<StringAttributeEntry>(Entry);
+}
+
+bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
+  if (isStringAttribute()) return false;
+  return getKindAsEnum() == A;
+}
+
+bool AttributeImpl::hasAttribute(StringRef Kind) const {
+  if (!isStringAttribute()) return false;
+  return getKindAsString() == Kind;
+}
+
+Attribute::AttrKind AttributeImpl::getKindAsEnum() const {
+  if (EnumAttributeEntry *E = dyn_cast<EnumAttributeEntry>(Entry))
+    return E->getEnumKind();
+  return cast<AlignAttributeEntry>(Entry)->getEnumKind();
+}
+
+uint64_t AttributeImpl::getValueAsInt() const {
+  return cast<AlignAttributeEntry>(Entry)->getAlignment();
+}
+
+StringRef AttributeImpl::getKindAsString() const {
+  return cast<StringAttributeEntry>(Entry)->getStringKind();
+}
+
+StringRef AttributeImpl::getValueAsString() const {
+  return cast<StringAttributeEntry>(Entry)->getStringValue();
+}
+
+bool AttributeImpl::operator<(const AttributeImpl &AI) const {
+  // This sorts the attributes with Attribute::AttrKinds coming first (sorted
+  // relative to their enum value) and then strings.
+  if (isEnumAttribute()) {
+    if (AI.isEnumAttribute()) return getKindAsEnum() < AI.getKindAsEnum();
+    if (AI.isAlignAttribute()) return true;
+    if (AI.isStringAttribute()) return true;
+  }
+
+  if (isAlignAttribute()) {
+    if (AI.isEnumAttribute()) return false;
+    if (AI.isAlignAttribute()) return getValueAsInt() < AI.getValueAsInt();
+    if (AI.isStringAttribute()) return true;
+  }
+
+  if (AI.isEnumAttribute()) return false;
+  if (AI.isAlignAttribute()) return false;
+  if (getKindAsString() == AI.getKindAsString())
+    return getValueAsString() < AI.getValueAsString();
+  return getKindAsString() < AI.getKindAsString();
+}
+
+uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
+  // FIXME: Remove this.
+  switch (Val) {
+  case Attribute::EndAttrKinds:
+    llvm_unreachable("Synthetic enumerators which should never get here");
+
+  case Attribute::None:            return 0;
+  case Attribute::ZExt:            return 1 << 0;
+  case Attribute::SExt:            return 1 << 1;
+  case Attribute::NoReturn:        return 1 << 2;
+  case Attribute::InReg:           return 1 << 3;
+  case Attribute::StructRet:       return 1 << 4;
+  case Attribute::NoUnwind:        return 1 << 5;
+  case Attribute::NoAlias:         return 1 << 6;
+  case Attribute::ByVal:           return 1 << 7;
+  case Attribute::Nest:            return 1 << 8;
+  case Attribute::ReadNone:        return 1 << 9;
+  case Attribute::ReadOnly:        return 1 << 10;
+  case Attribute::NoInline:        return 1 << 11;
+  case Attribute::AlwaysInline:    return 1 << 12;
+  case Attribute::OptimizeForSize: return 1 << 13;
+  case Attribute::StackProtect:    return 1 << 14;
+  case Attribute::StackProtectReq: return 1 << 15;
+  case Attribute::Alignment:       return 31 << 16;
+  case Attribute::NoCapture:       return 1 << 21;
+  case Attribute::NoRedZone:       return 1 << 22;
+  case Attribute::NoImplicitFloat: return 1 << 23;
+  case Attribute::Naked:           return 1 << 24;
+  case Attribute::InlineHint:      return 1 << 25;
+  case Attribute::StackAlignment:  return 7 << 26;
+  case Attribute::ReturnsTwice:    return 1 << 29;
+  case Attribute::UWTable:         return 1 << 30;
+  case Attribute::NonLazyBind:     return 1U << 31;
+  case Attribute::SanitizeAddress: return 1ULL << 32;
+  case Attribute::MinSize:         return 1ULL << 33;
+  case Attribute::NoDuplicate:     return 1ULL << 34;
+  case Attribute::StackProtectStrong: return 1ULL << 35;
+  case Attribute::SanitizeThread:  return 1ULL << 36;
+  case Attribute::SanitizeMemory:  return 1ULL << 37;
+  case Attribute::NoBuiltin:       return 1ULL << 38;
+  }
+  llvm_unreachable("Unsupported attribute type");
+}
+
+//===----------------------------------------------------------------------===//
+// AttributeSetNode Definition
+//===----------------------------------------------------------------------===//
+
+AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
+                                        ArrayRef<Attribute> Attrs) {
+  if (Attrs.empty())
+    return 0;
+
+  // Otherwise, build a key to look up the existing attributes.
+  LLVMContextImpl *pImpl = C.pImpl;
+  FoldingSetNodeID ID;
+
+  SmallVector<Attribute, 8> SortedAttrs(Attrs.begin(), Attrs.end());
+  array_pod_sort(SortedAttrs.begin(), SortedAttrs.end());
+
+  for (SmallVectorImpl<Attribute>::iterator I = SortedAttrs.begin(),
+         E = SortedAttrs.end(); I != E; ++I)
+    I->Profile(ID);
+
+  void *InsertPoint;
+  AttributeSetNode *PA =
+    pImpl->AttrsSetNodes.FindNodeOrInsertPos(ID, InsertPoint);
+
+  // If we didn't find any existing attributes of the same shape then create a
+  // new one and insert it.
+  if (!PA) {
+    PA = new AttributeSetNode(SortedAttrs);
+    pImpl->AttrsSetNodes.InsertNode(PA, InsertPoint);
+  }
+
+  // Return the AttributesListNode that we found or created.
+  return PA;
+}
+
+bool AttributeSetNode::hasAttribute(Attribute::AttrKind Kind) const {
+  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
+         E = AttrList.end(); I != E; ++I)
+    if (I->hasAttribute(Kind))
+      return true;
+  return false;
+}
+
+bool AttributeSetNode::hasAttribute(StringRef Kind) const {
+  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
+         E = AttrList.end(); I != E; ++I)
+    if (I->hasAttribute(Kind))
+      return true;
+  return false;
+}
+
+Attribute AttributeSetNode::getAttribute(Attribute::AttrKind Kind) const {
+  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
+         E = AttrList.end(); I != E; ++I)
+    if (I->hasAttribute(Kind))
+      return *I;
+  return Attribute();
+}
+
+Attribute AttributeSetNode::getAttribute(StringRef Kind) const {
+  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
+         E = AttrList.end(); I != E; ++I)
+    if (I->hasAttribute(Kind))
+      return *I;
+  return Attribute();
+}
+
+unsigned AttributeSetNode::getAlignment() const {
+  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
+         E = AttrList.end(); I != E; ++I)
+    if (I->hasAttribute(Attribute::Alignment))
+      return I->getAlignment();
+  return 0;
+}
+
+unsigned AttributeSetNode::getStackAlignment() const {
+  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
+         E = AttrList.end(); I != E; ++I)
+    if (I->hasAttribute(Attribute::StackAlignment))
+      return I->getStackAlignment();
+  return 0;
+}
+
+std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
+  std::string Str = "";
+  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
+         E = AttrList.end(); I != E; ) {
+    Str += I->getAsString(InAttrGrp);
+    if (++I != E) Str += " ";
+  }
+  return Str;
+}
+
+//===----------------------------------------------------------------------===//
+// AttributeSetImpl Definition
+//===----------------------------------------------------------------------===//
+
+uint64_t AttributeSetImpl::Raw(uint64_t Index) const {
+  for (unsigned I = 0, E = getNumAttributes(); I != E; ++I) {
+    if (getSlotIndex(I) != Index) continue;
+    const AttributeSetNode *ASN = AttrNodes[I].second;
+    uint64_t Mask = 0;
+
+    for (AttributeSetNode::const_iterator II = ASN->begin(),
+           IE = ASN->end(); II != IE; ++II) {
+      Attribute Attr = *II;
+
+      // This cannot handle string attributes.
+      if (Attr.isStringAttribute()) continue;
+
+      Attribute::AttrKind Kind = Attr.getKindAsEnum();
+
+      if (Kind == Attribute::Alignment)
+        Mask |= (Log2_32(ASN->getAlignment()) + 1) << 16;
+      else if (Kind == Attribute::StackAlignment)
+        Mask |= (Log2_32(ASN->getStackAlignment()) + 1) << 26;
+      else
+        Mask |= AttributeImpl::getAttrMask(Kind);
+    }
+
+    return Mask;
+  }
+
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// AttributeSet Construction and Mutation Methods
+//===----------------------------------------------------------------------===//
+
+AttributeSet
+AttributeSet::getImpl(LLVMContext &C,
+                      ArrayRef<std::pair<unsigned, AttributeSetNode*> > Attrs) {
+  LLVMContextImpl *pImpl = C.pImpl;
+  FoldingSetNodeID ID;
+  AttributeSetImpl::Profile(ID, Attrs);
+
+  void *InsertPoint;
+  AttributeSetImpl *PA = pImpl->AttrsLists.FindNodeOrInsertPos(ID, InsertPoint);
+
+  // If we didn't find any existing attributes of the same shape then
+  // create a new one and insert it.
+  if (!PA) {
+    PA = new AttributeSetImpl(C, Attrs);
+    pImpl->AttrsLists.InsertNode(PA, InsertPoint);
+  }
+
+  // Return the AttributesList that we found or created.
+  return AttributeSet(PA);
+}
+
+AttributeSet AttributeSet::get(LLVMContext &C,
+                               ArrayRef<std::pair<unsigned, Attribute> > Attrs){
+  // If there are no attributes then return a null AttributesList pointer.
+  if (Attrs.empty())
+    return AttributeSet();
+
+#ifndef NDEBUG
+  for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
+    assert((!i || Attrs[i-1].first <= Attrs[i].first) &&
+           "Misordered Attributes list!");
+    assert(!Attrs[i].second.hasAttribute(Attribute::None) &&
+           "Pointless attribute!");
+  }
+#endif
+
+  // Create a vector if (unsigned, AttributeSetNode*) pairs from the attributes
+  // list.
+  SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrPairVec;
+  for (ArrayRef<std::pair<unsigned, Attribute> >::iterator I = Attrs.begin(),
+         E = Attrs.end(); I != E; ) {
+    unsigned Index = I->first;
+    SmallVector<Attribute, 4> AttrVec;
+    while (I != E && I->first == Index) {
+      AttrVec.push_back(I->second);
+      ++I;
+    }
+
+    AttrPairVec.push_back(std::make_pair(Index,
+                                         AttributeSetNode::get(C, AttrVec)));
+  }
+
+  return getImpl(C, AttrPairVec);
+}
+
+AttributeSet AttributeSet::get(LLVMContext &C,
+                               ArrayRef<std::pair<unsigned,
+                                                  AttributeSetNode*> > Attrs) {
+  // If there are no attributes then return a null AttributesList pointer.
+  if (Attrs.empty())
+    return AttributeSet();
+
+  return getImpl(C, Attrs);
+}
+
+AttributeSet AttributeSet::get(LLVMContext &C, unsigned Idx, AttrBuilder &B) {
+  if (!B.hasAttributes())
+    return AttributeSet();
+
+  // Add target-independent attributes.
+  SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
+  for (Attribute::AttrKind Kind = Attribute::None;
+       Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) {
+    if (!B.contains(Kind))
+      continue;
+
+    if (Kind == Attribute::Alignment)
+      Attrs.push_back(std::make_pair(Idx, Attribute::
+                                     getWithAlignment(C, B.getAlignment())));
+    else if (Kind == Attribute::StackAlignment)
+      Attrs.push_back(std::make_pair(Idx, Attribute::
+                              getWithStackAlignment(C, B.getStackAlignment())));
+    else
+      Attrs.push_back(std::make_pair(Idx, Attribute::get(C, Kind)));
+  }
+
+  // Add target-dependent (string) attributes.
+  for (AttrBuilder::td_iterator I = B.td_begin(), E = B.td_end();
+       I != E; ++I)
+    Attrs.push_back(std::make_pair(Idx, Attribute::get(C, I->first,I->second)));
+
+  return get(C, Attrs);
+}
+
+AttributeSet AttributeSet::get(LLVMContext &C, unsigned Idx,
+                               ArrayRef<Attribute::AttrKind> Kind) {
+  SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
+  for (ArrayRef<Attribute::AttrKind>::iterator I = Kind.begin(),
+         E = Kind.end(); I != E; ++I)
+    Attrs.push_back(std::make_pair(Idx, Attribute::get(C, *I)));
+  return get(C, Attrs);
+}
+
+AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
+  if (Attrs.empty()) return AttributeSet();
+
+  SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrNodeVec;
+  for (unsigned I = 0, E = Attrs.size(); I != E; ++I) {
+    AttributeSet AS = Attrs[I];
+    if (!AS.pImpl) continue;
+    AttrNodeVec.append(AS.pImpl->AttrNodes.begin(), AS.pImpl->AttrNodes.end());
+  }
+
+  return getImpl(C, AttrNodeVec);
+}
+
+AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Idx,
+                                        Attribute::AttrKind Attr) const {
+  if (hasAttribute(Idx, Attr)) return *this;
+  return addAttributes(C, Idx, AttributeSet::get(C, Idx, Attr));
+}
+
+AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Idx,
+                                        StringRef Kind) const {
+  llvm::AttrBuilder B;
+  B.addAttribute(Kind);
+  return addAttributes(C, Idx, AttributeSet::get(C, Idx, B));
+}
+
+AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Idx,
+                                         AttributeSet Attrs) const {
+  if (!pImpl) return Attrs;
+  if (!Attrs.pImpl) return *this;
+
+#ifndef NDEBUG
+  // FIXME it is not obvious how this should work for alignment. For now, say
+  // we can't change a known alignment.
+  unsigned OldAlign = getParamAlignment(Idx);
+  unsigned NewAlign = Attrs.getParamAlignment(Idx);
+  assert((!OldAlign || !NewAlign || OldAlign == NewAlign) &&
+         "Attempt to change alignment!");
+#endif
+
+  // Add the attribute slots before the one we're trying to add.
+  SmallVector<AttributeSet, 4> AttrSet;
+  uint64_t NumAttrs = pImpl->getNumAttributes();
+  AttributeSet AS;
+  uint64_t LastIndex = 0;
+  for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
+    if (getSlotIndex(I) >= Idx) {
+      if (getSlotIndex(I) == Idx) AS = getSlotAttributes(LastIndex++);
+      break;
+    }
+    LastIndex = I + 1;
+    AttrSet.push_back(getSlotAttributes(I));
+  }
+
+  // Now add the attribute into the correct slot. There may already be an
+  // AttributeSet there.
+  AttrBuilder B(AS, Idx);
+
+  for (unsigned I = 0, E = Attrs.pImpl->getNumAttributes(); I != E; ++I)
+    if (Attrs.getSlotIndex(I) == Idx) {
+      for (AttributeSetImpl::const_iterator II = Attrs.pImpl->begin(I),
+             IE = Attrs.pImpl->end(I); II != IE; ++II)
+        B.addAttribute(*II);
+      break;
+    }
+
+  AttrSet.push_back(AttributeSet::get(C, Idx, B));
+
+  // Add the remaining attribute slots.
+  for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
+    AttrSet.push_back(getSlotAttributes(I));
+
+  return get(C, AttrSet);
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Idx,
+                                           Attribute::AttrKind Attr) const {
+  if (!hasAttribute(Idx, Attr)) return *this;
+  return removeAttributes(C, Idx, AttributeSet::get(C, Idx, Attr));
+}
+
+AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Idx,
+                                            AttributeSet Attrs) const {
+  if (!pImpl) return AttributeSet();
+  if (!Attrs.pImpl) return *this;
+
+#ifndef NDEBUG
+  // FIXME it is not obvious how this should work for alignment.
+  // For now, say we can't pass in alignment, which no current use does.
+  assert(!Attrs.hasAttribute(Idx, Attribute::Alignment) &&
+         "Attempt to change alignment!");
+#endif
+
+  // Add the attribute slots before the one we're trying to add.
+  SmallVector<AttributeSet, 4> AttrSet;
+  uint64_t NumAttrs = pImpl->getNumAttributes();
+  AttributeSet AS;
+  uint64_t LastIndex = 0;
+  for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
+    if (getSlotIndex(I) >= Idx) {
+      if (getSlotIndex(I) == Idx) AS = getSlotAttributes(LastIndex++);
+      break;
+    }
+    LastIndex = I + 1;
+    AttrSet.push_back(getSlotAttributes(I));
+  }
+
+  // Now remove the attribute from the correct slot. There may already be an
+  // AttributeSet there.
+  AttrBuilder B(AS, Idx);
+
+  for (unsigned I = 0, E = Attrs.pImpl->getNumAttributes(); I != E; ++I)
+    if (Attrs.getSlotIndex(I) == Idx) {
+      B.removeAttributes(Attrs.pImpl->getSlotAttributes(I), Idx);
+      break;
+    }
+
+  AttrSet.push_back(AttributeSet::get(C, Idx, B));
+
+  // Add the remaining attribute slots.
+  for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
+    AttrSet.push_back(getSlotAttributes(I));
+
+  return get(C, AttrSet);
+}
+
+//===----------------------------------------------------------------------===//
+// AttributeSet Accessor Methods
+//===----------------------------------------------------------------------===//
+
+LLVMContext &AttributeSet::getContext() const {
+  return pImpl->getContext();
+}
+
+AttributeSet AttributeSet::getParamAttributes(unsigned Idx) const {
+  return pImpl && hasAttributes(Idx) ?
+    AttributeSet::get(pImpl->getContext(),
+                      ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
+                        std::make_pair(Idx, getAttributes(Idx)))) :
+    AttributeSet();
+}
+
+AttributeSet AttributeSet::getRetAttributes() const {
+  return pImpl && hasAttributes(ReturnIndex) ?
+    AttributeSet::get(pImpl->getContext(),
+                      ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
+                        std::make_pair(ReturnIndex,
+                                       getAttributes(ReturnIndex)))) :
+    AttributeSet();
+}
+
+AttributeSet AttributeSet::getFnAttributes() const {
+  return pImpl && hasAttributes(FunctionIndex) ?
+    AttributeSet::get(pImpl->getContext(),
+                      ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
+                        std::make_pair(FunctionIndex,
+                                       getAttributes(FunctionIndex)))) :
+    AttributeSet();
+}
+
+bool AttributeSet::hasAttribute(unsigned Index, Attribute::AttrKind Kind) const{
+  AttributeSetNode *ASN = getAttributes(Index);
+  return ASN ? ASN->hasAttribute(Kind) : false;
+}
+
+bool AttributeSet::hasAttribute(unsigned Index, StringRef Kind) const {
+  AttributeSetNode *ASN = getAttributes(Index);
+  return ASN ? ASN->hasAttribute(Kind) : false;
+}
+
+bool AttributeSet::hasAttributes(unsigned Index) const {
+  AttributeSetNode *ASN = getAttributes(Index);
+  return ASN ? ASN->hasAttributes() : false;
+}
+
+/// \brief Return true if the specified attribute is set for at least one
+/// parameter or for the return value.
+bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr) const {
+  if (pImpl == 0) return false;
+
+  for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I)
+    for (AttributeSetImpl::const_iterator II = pImpl->begin(I),
+           IE = pImpl->end(I); II != IE; ++II)
+      if (II->hasAttribute(Attr))
+        return true;
+
+  return false;
+}
+
+Attribute AttributeSet::getAttribute(unsigned Index,
+                                     Attribute::AttrKind Kind) const {
+  AttributeSetNode *ASN = getAttributes(Index);
+  return ASN ? ASN->getAttribute(Kind) : Attribute();
+}
+
+Attribute AttributeSet::getAttribute(unsigned Index,
+                                     StringRef Kind) const {
+  AttributeSetNode *ASN = getAttributes(Index);
+  return ASN ? ASN->getAttribute(Kind) : Attribute();
+}
+
+unsigned AttributeSet::getParamAlignment(unsigned Index) const {
+  AttributeSetNode *ASN = getAttributes(Index);
+  return ASN ? ASN->getAlignment() : 0;
+}
+
+unsigned AttributeSet::getStackAlignment(unsigned Index) const {
+  AttributeSetNode *ASN = getAttributes(Index);
+  return ASN ? ASN->getStackAlignment() : 0;
+}
+
+std::string AttributeSet::getAsString(unsigned Index,
+                                      bool InAttrGrp) const {
+  AttributeSetNode *ASN = getAttributes(Index);
+  return ASN ? ASN->getAsString(InAttrGrp) : std::string("");
+}
+
+/// \brief The attributes for the specified index are returned.
+AttributeSetNode *AttributeSet::getAttributes(unsigned Idx) const {
+  if (!pImpl) return 0;
+
+  // Loop through to find the attribute node we want.
+  for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I)
+    if (pImpl->getSlotIndex(I) == Idx)
+      return pImpl->getSlotNode(I);
+
+  return 0;
+}
+
+AttributeSet::iterator AttributeSet::begin(unsigned Idx) const {
+  if (!pImpl)
+    return ArrayRef<Attribute>().begin();
+  return pImpl->begin(Idx);
+}
+
+AttributeSet::iterator AttributeSet::end(unsigned Idx) const {
+  if (!pImpl)
+    return ArrayRef<Attribute>().end();
+  return pImpl->end(Idx);
+}
+
+//===----------------------------------------------------------------------===//
+// AttributeSet Introspection Methods
+//===----------------------------------------------------------------------===//
+
+/// \brief Return the number of slots used in this attribute list.  This is the
+/// number of arguments that have an attribute set on them (including the
+/// function itself).
+unsigned AttributeSet::getNumSlots() const {
+  return pImpl ? pImpl->getNumAttributes() : 0;
+}
+
+uint64_t AttributeSet::getSlotIndex(unsigned Slot) const {
+  assert(pImpl && Slot < pImpl->getNumAttributes() &&
+         "Slot # out of range!");
+  return pImpl->getSlotIndex(Slot);
+}
+
+AttributeSet AttributeSet::getSlotAttributes(unsigned Slot) const {
+  assert(pImpl && Slot < pImpl->getNumAttributes() &&
+         "Slot # out of range!");
+  return pImpl->getSlotAttributes(Slot);
+}
+
+uint64_t AttributeSet::Raw(unsigned Index) const {
+  // FIXME: Remove this.
+  return pImpl ? pImpl->Raw(Index) : 0;
+}
+
+void AttributeSet::dump() const {
+  dbgs() << "PAL[\n";
+
+  for (unsigned i = 0, e = getNumSlots(); i < e; ++i) {
+    uint64_t Index = getSlotIndex(i);
+    dbgs() << "  { ";
+    if (Index == ~0U)
+      dbgs() << "~0U";
+    else
+      dbgs() << Index;
+    dbgs() << " => " << getAsString(Index) << " }\n";
+  }
+
+  dbgs() << "]\n";
+}
+
+//===----------------------------------------------------------------------===//
+// AttrBuilder Method Implementations
+//===----------------------------------------------------------------------===//
+
+AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Idx)
+  : Attrs(0), Alignment(0), StackAlignment(0) {
+  AttributeSetImpl *pImpl = AS.pImpl;
+  if (!pImpl) return;
+
+  for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I) {
+    if (pImpl->getSlotIndex(I) != Idx) continue;
+
+    for (AttributeSetImpl::const_iterator II = pImpl->begin(I),
+           IE = pImpl->end(I); II != IE; ++II)
+      addAttribute(*II);
+
+    break;
+  }
+}
+
+void AttrBuilder::clear() {
+  Attrs.reset();
+  Alignment = StackAlignment = 0;
+}
+
+AttrBuilder &AttrBuilder::addAttribute(Attribute::AttrKind Val) {
+  assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!");
+  assert(Val != Attribute::Alignment && Val != Attribute::StackAlignment &&
+         "Adding alignment attribute without adding alignment value!");
+  Attrs[Val] = true;
+  return *this;
+}
+
+AttrBuilder &AttrBuilder::addAttribute(Attribute Attr) {
+  if (Attr.isStringAttribute()) {
+    addAttribute(Attr.getKindAsString(), Attr.getValueAsString());
+    return *this;
+  }
+
+  Attribute::AttrKind Kind = Attr.getKindAsEnum();
+  Attrs[Kind] = true;
+
+  if (Kind == Attribute::Alignment)
+    Alignment = Attr.getAlignment();
+  else if (Kind == Attribute::StackAlignment)
+    StackAlignment = Attr.getStackAlignment();
+  return *this;
+}
+
+AttrBuilder &AttrBuilder::addAttribute(StringRef A, StringRef V) {
+  TargetDepAttrs[A] = V;
+  return *this;
+}
+
+AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) {
+  assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!");
+  Attrs[Val] = false;
+
+  if (Val == Attribute::Alignment)
+    Alignment = 0;
+  else if (Val == Attribute::StackAlignment)
+    StackAlignment = 0;
+
+  return *this;
+}
+
+AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) {
+  unsigned Idx = ~0U;
+  for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I)
+    if (A.getSlotIndex(I) == Index) {
+      Idx = I;
+      break;
+    }
+
+  assert(Idx != ~0U && "Couldn't find index in AttributeSet!");
+
+  for (AttributeSet::iterator I = A.begin(Idx), E = A.end(Idx); I != E; ++I) {
+    Attribute Attr = *I;
+    if (Attr.isEnumAttribute() || Attr.isAlignAttribute()) {
+      Attribute::AttrKind Kind = I->getKindAsEnum();
+      Attrs[Kind] = false;
+
+      if (Kind == Attribute::Alignment)
+        Alignment = 0;
+      else if (Kind == Attribute::StackAlignment)
+        StackAlignment = 0;
+    } else {
+      assert(Attr.isStringAttribute() && "Invalid attribute type!");
+      std::map<std::string, std::string>::iterator
+        Iter = TargetDepAttrs.find(Attr.getKindAsString());
+      if (Iter != TargetDepAttrs.end())
+        TargetDepAttrs.erase(Iter);
+    }
+  }
+
+  return *this;
+}
+
+AttrBuilder &AttrBuilder::removeAttribute(StringRef A) {
+  std::map<std::string, std::string>::iterator I = TargetDepAttrs.find(A);
+  if (I != TargetDepAttrs.end())
+    TargetDepAttrs.erase(I);
+  return *this;
+}
+
+AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) {
+  if (Align == 0) return *this;
+
+  assert(isPowerOf2_32(Align) && "Alignment must be a power of two.");
+  assert(Align <= 0x40000000 && "Alignment too large.");
+
+  Attrs[Attribute::Alignment] = true;
+  Alignment = Align;
+  return *this;
+}
+
+AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align) {
+  // Default alignment, allow the target to define how to align it.
+  if (Align == 0) return *this;
+
+  assert(isPowerOf2_32(Align) && "Alignment must be a power of two.");
+  assert(Align <= 0x100 && "Alignment too large.");
+
+  Attrs[Attribute::StackAlignment] = true;
+  StackAlignment = Align;
+  return *this;
+}
+
+AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
+  // FIXME: What if both have alignments, but they don't match?!
+  if (!Alignment)
+    Alignment = B.Alignment;
+
+  if (!StackAlignment)
+    StackAlignment = B.StackAlignment;
+
+  Attrs |= B.Attrs;
+
+  for (td_const_iterator I = B.TargetDepAttrs.begin(),
+         E = B.TargetDepAttrs.end(); I != E; ++I)
+    TargetDepAttrs[I->first] = I->second;
+
+  return *this;
+}
+
+bool AttrBuilder::contains(StringRef A) const {
+  return TargetDepAttrs.find(A) != TargetDepAttrs.end();
+}
+
+bool AttrBuilder::hasAttributes() const {
+  return !Attrs.none() || !TargetDepAttrs.empty();
+}
+
+bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const {
+  unsigned Idx = ~0U;
+  for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I)
+    if (A.getSlotIndex(I) == Index) {
+      Idx = I;
+      break;
+    }
+
+  assert(Idx != ~0U && "Couldn't find the index!");
+
+  for (AttributeSet::iterator I = A.begin(Idx), E = A.end(Idx);
+       I != E; ++I) {
+    Attribute Attr = *I;
+    if (Attr.isEnumAttribute() || Attr.isAlignAttribute()) {
+      if (Attrs[I->getKindAsEnum()])
+        return true;
+    } else {
+      assert(Attr.isStringAttribute() && "Invalid attribute kind!");
+      return TargetDepAttrs.find(Attr.getKindAsString())!=TargetDepAttrs.end();
+    }
+  }
+
+  return false;
+}
+
+bool AttrBuilder::hasAlignmentAttr() const {
+  return Alignment != 0;
+}
+
+bool AttrBuilder::operator==(const AttrBuilder &B) {
+  if (Attrs != B.Attrs)
+    return false;
+
+  for (td_const_iterator I = TargetDepAttrs.begin(),
+         E = TargetDepAttrs.end(); I != E; ++I)
+    if (B.TargetDepAttrs.find(I->first) == B.TargetDepAttrs.end())
+      return false;
+
+  return Alignment == B.Alignment && StackAlignment == B.StackAlignment;
+}
+
+void AttrBuilder::removeFunctionOnlyAttrs() {
+  removeAttribute(Attribute::NoReturn)
+    .removeAttribute(Attribute::NoUnwind)
+    .removeAttribute(Attribute::ReadNone)
+    .removeAttribute(Attribute::ReadOnly)
+    .removeAttribute(Attribute::NoInline)
+    .removeAttribute(Attribute::AlwaysInline)
+    .removeAttribute(Attribute::OptimizeForSize)
+    .removeAttribute(Attribute::StackProtect)
+    .removeAttribute(Attribute::StackProtectReq)
+    .removeAttribute(Attribute::StackProtectStrong)
+    .removeAttribute(Attribute::NoRedZone)
+    .removeAttribute(Attribute::NoImplicitFloat)
+    .removeAttribute(Attribute::Naked)
+    .removeAttribute(Attribute::InlineHint)
+    .removeAttribute(Attribute::StackAlignment)
+    .removeAttribute(Attribute::UWTable)
+    .removeAttribute(Attribute::NonLazyBind)
+    .removeAttribute(Attribute::ReturnsTwice)
+    .removeAttribute(Attribute::SanitizeAddress)
+    .removeAttribute(Attribute::SanitizeThread)
+    .removeAttribute(Attribute::SanitizeMemory)
+    .removeAttribute(Attribute::MinSize)
+    .removeAttribute(Attribute::NoDuplicate)
+    .removeAttribute(Attribute::NoBuiltin);
+}
+
+AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) {
+  // FIXME: Remove this in 4.0.
+  if (!Val) return *this;
+
+  for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds;
+       I = Attribute::AttrKind(I + 1)) {
+    if (uint64_t A = (Val & AttributeImpl::getAttrMask(I))) {
+      Attrs[I] = true;
+ 
+      if (I == Attribute::Alignment)
+        Alignment = 1ULL << ((A >> 16) - 1);
+      else if (I == Attribute::StackAlignment)
+        StackAlignment = 1ULL << ((A >> 26)-1);
+    }
+  }
+ 
+  return *this;
+}
+
+//===----------------------------------------------------------------------===//
+// AttributeFuncs Function Defintions
+//===----------------------------------------------------------------------===//
+
+/// \brief Which attributes cannot be applied to a type.
+AttributeSet AttributeFuncs::typeIncompatible(Type *Ty, uint64_t Index) {
+  AttrBuilder Incompatible;
+
+  if (!Ty->isIntegerTy())
+    // Attribute that only apply to integers.
+    Incompatible.addAttribute(Attribute::SExt)
+      .addAttribute(Attribute::ZExt);
+
+  if (!Ty->isPointerTy())
+    // Attribute that only apply to pointers.
+    Incompatible.addAttribute(Attribute::ByVal)
+      .addAttribute(Attribute::Nest)
+      .addAttribute(Attribute::NoAlias)
+      .addAttribute(Attribute::NoCapture)
+      .addAttribute(Attribute::StructRet);
+
+  return AttributeSet::get(Ty->getContext(), Index, Incompatible);
+}
diff --git a/contrib/llvm/lib/IR/AutoUpgrade.cpp b/contrib/llvm/lib/IR/AutoUpgrade.cpp
new file mode 100644
index 000000000000..f2375374e356
--- /dev/null
+++ b/contrib/llvm/lib/IR/AutoUpgrade.cpp
@@ -0,0 +1,393 @@
+//===-- AutoUpgrade.cpp - Implement auto-upgrade helper functions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the auto-upgrade helper functions 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/AutoUpgrade.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstring>
+using namespace llvm;
+
+// Upgrade the declarations of the SSE4.1 functions whose arguments have
+// changed their type from v4f32 to v2i64.
+static bool UpgradeSSE41Function(Function* F, Intrinsic::ID IID,
+                                 Function *&NewFn) {
+  // Check whether this is an old version of the function, which received
+  // v4f32 arguments.
+  Type *Arg0Type = F->getFunctionType()->getParamType(0);
+  if (Arg0Type != VectorType::get(Type::getFloatTy(F->getContext()), 4))
+    return false;
+
+  // Yes, it's old, replace it with new version.
+  F->setName(F->getName() + ".old");
+  NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
+  return true;
+}
+
+static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
+  assert(F && "Illegal to upgrade a non-existent Function.");
+
+  // Quickly eliminate it, if it's not a candidate.
+  StringRef Name = F->getName();
+  if (Name.size() <= 8 || !Name.startswith("llvm."))
+    return false;
+  Name = Name.substr(5); // Strip off "llvm."
+
+  switch (Name[0]) {
+  default: break;
+  case 'a': {
+    if (Name.startswith("arm.neon.vclz")) {
+      Type* args[2] = {
+        F->arg_begin()->getType(), 
+        Type::getInt1Ty(F->getContext())
+      };
+      // Can't use Intrinsic::getDeclaration here as it adds a ".i1" to
+      // the end of the name. Change name from llvm.arm.neon.vclz.* to
+      //  llvm.ctlz.*
+      FunctionType* fType = FunctionType::get(F->getReturnType(), args, false);
+      NewFn = Function::Create(fType, F->getLinkage(), 
+                               "llvm.ctlz." + Name.substr(14), F->getParent());
+      return true;
+    }
+    if (Name.startswith("arm.neon.vcnt")) {
+      NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
+                                        F->arg_begin()->getType());
+      return true;
+    }
+    break;
+  }
+  case 'c': {
+    if (Name.startswith("ctlz.") && F->arg_size() == 1) {
+      F->setName(Name + ".old");
+      NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+                                        F->arg_begin()->getType());
+      return true;
+    }
+    if (Name.startswith("cttz.") && F->arg_size() == 1) {
+      F->setName(Name + ".old");
+      NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::cttz,
+                                        F->arg_begin()->getType());
+      return true;
+    }
+    break;
+  }
+  case 'x': {
+    if (Name.startswith("x86.sse2.pcmpeq.") ||
+        Name.startswith("x86.sse2.pcmpgt.") ||
+        Name.startswith("x86.avx2.pcmpeq.") ||
+        Name.startswith("x86.avx2.pcmpgt.") ||
+        Name.startswith("x86.avx.vpermil.") ||
+        Name == "x86.avx.movnt.dq.256" ||
+        Name == "x86.avx.movnt.pd.256" ||
+        Name == "x86.avx.movnt.ps.256" ||
+        (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
+      NewFn = 0;
+      return true;
+    }
+    // SSE4.1 ptest functions may have an old signature.
+    if (Name.startswith("x86.sse41.ptest")) {
+      if (Name == "x86.sse41.ptestc")
+        return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestc, NewFn);
+      if (Name == "x86.sse41.ptestz")
+        return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestz, NewFn);
+      if (Name == "x86.sse41.ptestnzc")
+        return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
+    }
+    // frcz.ss/sd may need to have an argument dropped
+    if (Name.startswith("x86.xop.vfrcz.ss") && F->arg_size() == 2) {
+      F->setName(Name + ".old");
+      NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                        Intrinsic::x86_xop_vfrcz_ss);
+      return true;
+    }
+    if (Name.startswith("x86.xop.vfrcz.sd") && F->arg_size() == 2) {
+      F->setName(Name + ".old");
+      NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                        Intrinsic::x86_xop_vfrcz_sd);
+      return true;
+    }
+    // Fix the FMA4 intrinsics to remove the 4
+    if (Name.startswith("x86.fma4.")) {
+      F->setName("llvm.x86.fma" + Name.substr(8));
+      NewFn = F;
+      return true;
+    }
+    break;
+  }
+  }
+
+  //  This may not belong here. This function is effectively being overloaded
+  //  to both detect an intrinsic which needs upgrading, and to provide the
+  //  upgraded form of the intrinsic. We should perhaps have two separate
+  //  functions for this.
+  return false;
+}
+
+bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) {
+  NewFn = 0;
+  bool Upgraded = UpgradeIntrinsicFunction1(F, NewFn);
+
+  // Upgrade intrinsic attributes.  This does not change the function.
+  if (NewFn)
+    F = NewFn;
+  if (unsigned id = F->getIntrinsicID())
+    F->setAttributes(Intrinsic::getAttributes(F->getContext(),
+                                              (Intrinsic::ID)id));
+  return Upgraded;
+}
+
+bool llvm::UpgradeGlobalVariable(GlobalVariable *GV) {
+  // Nothing to do yet.
+  return false;
+}
+
+// UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the
+// upgraded intrinsic. All argument and return casting must be provided in
+// order to seamlessly integrate with existing context.
+void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
+  Function *F = CI->getCalledFunction();
+  LLVMContext &C = CI->getContext();
+  IRBuilder<> Builder(C);
+  Builder.SetInsertPoint(CI->getParent(), CI);
+
+  assert(F && "Intrinsic call is not direct?");
+
+  if (!NewFn) {
+    // Get the Function's name.
+    StringRef Name = F->getName();
+
+    Value *Rep;
+    // Upgrade packed integer vector compares intrinsics to compare instructions
+    if (Name.startswith("llvm.x86.sse2.pcmpeq.") ||
+        Name.startswith("llvm.x86.avx2.pcmpeq.")) {
+      Rep = Builder.CreateICmpEQ(CI->getArgOperand(0), CI->getArgOperand(1),
+                                 "pcmpeq");
+      // need to sign extend since icmp returns vector of i1
+      Rep = Builder.CreateSExt(Rep, CI->getType(), "");
+    } else if (Name.startswith("llvm.x86.sse2.pcmpgt.") ||
+               Name.startswith("llvm.x86.avx2.pcmpgt.")) {
+      Rep = Builder.CreateICmpSGT(CI->getArgOperand(0), CI->getArgOperand(1),
+                                  "pcmpgt");
+      // need to sign extend since icmp returns vector of i1
+      Rep = Builder.CreateSExt(Rep, CI->getType(), "");
+    } else if (Name == "llvm.x86.avx.movnt.dq.256" ||
+               Name == "llvm.x86.avx.movnt.ps.256" ||
+               Name == "llvm.x86.avx.movnt.pd.256") {
+      IRBuilder<> Builder(C);
+      Builder.SetInsertPoint(CI->getParent(), CI);
+
+      Module *M = F->getParent();
+      SmallVector<Value *, 1> Elts;
+      Elts.push_back(ConstantInt::get(Type::getInt32Ty(C), 1));
+      MDNode *Node = MDNode::get(C, Elts);
+
+      Value *Arg0 = CI->getArgOperand(0);
+      Value *Arg1 = CI->getArgOperand(1);
+
+      // Convert the type of the pointer to a pointer to the stored type.
+      Value *BC = Builder.CreateBitCast(Arg0,
+                                        PointerType::getUnqual(Arg1->getType()),
+                                        "cast");
+      StoreInst *SI = Builder.CreateStore(Arg1, BC);
+      SI->setMetadata(M->getMDKindID("nontemporal"), Node);
+      SI->setAlignment(16);
+
+      // Remove intrinsic.
+      CI->eraseFromParent();
+      return;
+    } else if (Name.startswith("llvm.x86.xop.vpcom")) {
+      Intrinsic::ID intID;
+      if (Name.endswith("ub"))
+        intID = Intrinsic::x86_xop_vpcomub;
+      else if (Name.endswith("uw"))
+        intID = Intrinsic::x86_xop_vpcomuw;
+      else if (Name.endswith("ud"))
+        intID = Intrinsic::x86_xop_vpcomud;
+      else if (Name.endswith("uq"))
+        intID = Intrinsic::x86_xop_vpcomuq;
+      else if (Name.endswith("b"))
+        intID = Intrinsic::x86_xop_vpcomb;
+      else if (Name.endswith("w"))
+        intID = Intrinsic::x86_xop_vpcomw;
+      else if (Name.endswith("d"))
+        intID = Intrinsic::x86_xop_vpcomd;
+      else if (Name.endswith("q"))
+        intID = Intrinsic::x86_xop_vpcomq;
+      else
+        llvm_unreachable("Unknown suffix");
+
+      Name = Name.substr(18); // strip off "llvm.x86.xop.vpcom"
+      unsigned Imm;
+      if (Name.startswith("lt"))
+        Imm = 0;
+      else if (Name.startswith("le"))
+        Imm = 1;
+      else if (Name.startswith("gt"))
+        Imm = 2;
+      else if (Name.startswith("ge"))
+        Imm = 3;
+      else if (Name.startswith("eq"))
+        Imm = 4;
+      else if (Name.startswith("ne"))
+        Imm = 5;
+      else if (Name.startswith("true"))
+        Imm = 6;
+      else if (Name.startswith("false"))
+        Imm = 7;
+      else
+        llvm_unreachable("Unknown condition");
+
+      Function *VPCOM = Intrinsic::getDeclaration(F->getParent(), intID);
+      Rep = Builder.CreateCall3(VPCOM, CI->getArgOperand(0),
+                                CI->getArgOperand(1), Builder.getInt8(Imm));
+    } else {
+      bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
+      if (Name == "llvm.x86.avx.vpermil.pd.256")
+        PD256 = true;
+      else if (Name == "llvm.x86.avx.vpermil.pd")
+        PD128 = true;
+      else if (Name == "llvm.x86.avx.vpermil.ps.256")
+        PS256 = true;
+      else if (Name == "llvm.x86.avx.vpermil.ps")
+        PS128 = true;
+
+      if (PD256 || PD128 || PS256 || PS128) {
+        Value *Op0 = CI->getArgOperand(0);
+        unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+        SmallVector<Constant*, 8> Idxs;
+
+        if (PD128)
+          for (unsigned i = 0; i != 2; ++i)
+            Idxs.push_back(Builder.getInt32((Imm >> i) & 0x1));
+        else if (PD256)
+          for (unsigned l = 0; l != 4; l+=2)
+            for (unsigned i = 0; i != 2; ++i)
+              Idxs.push_back(Builder.getInt32(((Imm >> (l+i)) & 0x1) + l));
+        else if (PS128)
+          for (unsigned i = 0; i != 4; ++i)
+            Idxs.push_back(Builder.getInt32((Imm >> (2 * i)) & 0x3));
+        else if (PS256)
+          for (unsigned l = 0; l != 8; l+=4)
+            for (unsigned i = 0; i != 4; ++i)
+              Idxs.push_back(Builder.getInt32(((Imm >> (2 * i)) & 0x3) + l));
+        else
+          llvm_unreachable("Unexpected function");
+
+        Rep = Builder.CreateShuffleVector(Op0, Op0, ConstantVector::get(Idxs));
+      } else {
+        llvm_unreachable("Unknown function for CallInst upgrade.");
+      }
+    }
+
+    CI->replaceAllUsesWith(Rep);
+    CI->eraseFromParent();
+    return;
+  }
+
+  std::string Name = CI->getName().str();
+  CI->setName(Name + ".old");
+
+  switch (NewFn->getIntrinsicID()) {
+  default:
+    llvm_unreachable("Unknown function for CallInst upgrade.");
+
+  case Intrinsic::ctlz:
+  case Intrinsic::cttz:
+    assert(CI->getNumArgOperands() == 1 &&
+           "Mismatch between function args and call args");
+    CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0),
+                                               Builder.getFalse(), Name));
+    CI->eraseFromParent();
+    return;
+
+  case Intrinsic::arm_neon_vclz: {
+    // Change name from llvm.arm.neon.vclz.* to llvm.ctlz.*
+    CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0),
+                                               Builder.getFalse(),
+                                               "llvm.ctlz." + Name.substr(14)));
+    CI->eraseFromParent();
+    return;
+  }
+  case Intrinsic::ctpop: {
+    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, CI->getArgOperand(0)));
+    CI->eraseFromParent();
+    return;
+  }
+
+  case Intrinsic::x86_xop_vfrcz_ss:
+  case Intrinsic::x86_xop_vfrcz_sd:
+    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, CI->getArgOperand(1),
+                                              Name));
+    CI->eraseFromParent();
+    return;
+
+  case Intrinsic::x86_sse41_ptestc:
+  case Intrinsic::x86_sse41_ptestz:
+  case Intrinsic::x86_sse41_ptestnzc: {
+    // The arguments for these intrinsics used to be v4f32, and changed
+    // to v2i64. This is purely a nop, since those are bitwise intrinsics.
+    // So, the only thing required is a bitcast for both arguments.
+    // First, check the arguments have the old type.
+    Value *Arg0 = CI->getArgOperand(0);
+    if (Arg0->getType() != VectorType::get(Type::getFloatTy(C), 4))
+      return;
+
+    // Old intrinsic, add bitcasts
+    Value *Arg1 = CI->getArgOperand(1);
+
+    Value *BC0 =
+      Builder.CreateBitCast(Arg0,
+                            VectorType::get(Type::getInt64Ty(C), 2),
+                            "cast");
+    Value *BC1 =
+      Builder.CreateBitCast(Arg1,
+                            VectorType::get(Type::getInt64Ty(C), 2),
+                            "cast");
+
+    CallInst* NewCall = Builder.CreateCall2(NewFn, BC0, BC1, Name);
+    CI->replaceAllUsesWith(NewCall);
+    CI->eraseFromParent();
+    return;
+  }
+  }
+}
+
+// This tests each Function to determine if it needs upgrading. When we find 
+// one we are interested in, we then upgrade all calls to reflect the new 
+// function.
+void llvm::UpgradeCallsToIntrinsic(Function* F) {
+  assert(F && "Illegal attempt to upgrade a non-existent intrinsic.");
+
+  // Upgrade the function and check if it is a totaly new function.
+  Function *NewFn;
+  if (UpgradeIntrinsicFunction(F, NewFn)) {
+    if (NewFn != F) {
+      // Replace all uses to the old function with the new one if necessary.
+      for (Value::use_iterator UI = F->use_begin(), UE = F->use_end();
+           UI != UE; ) {
+        if (CallInst *CI = dyn_cast<CallInst>(*UI++))
+          UpgradeIntrinsicCall(CI, NewFn);
+      }
+      // Remove old function, no longer used, from the module.
+      F->eraseFromParent();
+    }
+  }
+}
+
diff --git a/contrib/llvm/lib/IR/BasicBlock.cpp b/contrib/llvm/lib/IR/BasicBlock.cpp
new file mode 100644
index 000000000000..41e58ec5da2d
--- /dev/null
+++ b/contrib/llvm/lib/IR/BasicBlock.cpp
@@ -0,0 +1,371 @@
+//===-- BasicBlock.cpp - Implement BasicBlock related methods -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BasicBlock class for the IR library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/BasicBlock.h"
+#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/LeakDetector.h"
+#include <algorithm>
+using namespace llvm;
+
+ValueSymbolTable *BasicBlock::getValueSymbolTable() {
+  if (Function *F = getParent())
+    return &F->getValueSymbolTable();
+  return 0;
+}
+
+LLVMContext &BasicBlock::getContext() const {
+  return getType()->getContext();
+}
+
+// Explicit instantiation of SymbolTableListTraits since some of the methods
+// are not in the public header file...
+template class llvm::SymbolTableListTraits<Instruction, BasicBlock>;
+
+
+BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
+                       BasicBlock *InsertBefore)
+  : Value(Type::getLabelTy(C), Value::BasicBlockVal), Parent(0) {
+
+  // Make sure that we get added to a function
+  LeakDetector::addGarbageObject(this);
+
+  if (InsertBefore) {
+    assert(NewParent &&
+           "Cannot insert block before another block with no function!");
+    NewParent->getBasicBlockList().insert(InsertBefore, this);
+  } else if (NewParent) {
+    NewParent->getBasicBlockList().push_back(this);
+  }
+
+  setName(Name);
+}
+
+
+BasicBlock::~BasicBlock() {
+  // If the address of the block is taken and it is being deleted (e.g. because
+  // it is dead), this means that there is either a dangling constant expr
+  // hanging off the block, or an undefined use of the block (source code
+  // expecting the address of a label to keep the block alive even though there
+  // is no indirect branch).  Handle these cases by zapping the BlockAddress
+  // nodes.  There are no other possible uses at this point.
+  if (hasAddressTaken()) {
+    assert(!use_empty() && "There should be at least one blockaddress!");
+    Constant *Replacement =
+      ConstantInt::get(llvm::Type::getInt32Ty(getContext()), 1);
+    while (!use_empty()) {
+      BlockAddress *BA = cast<BlockAddress>(use_back());
+      BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(Replacement,
+                                                       BA->getType()));
+      BA->destroyConstant();
+    }
+  }
+
+  assert(getParent() == 0 && "BasicBlock still linked into the program!");
+  dropAllReferences();
+  InstList.clear();
+}
+
+void BasicBlock::setParent(Function *parent) {
+  if (getParent())
+    LeakDetector::addGarbageObject(this);
+
+  // Set Parent=parent, updating instruction symtab entries as appropriate.
+  InstList.setSymTabObject(&Parent, parent);
+
+  if (getParent())
+    LeakDetector::removeGarbageObject(this);
+}
+
+void BasicBlock::removeFromParent() {
+  getParent()->getBasicBlockList().remove(this);
+}
+
+void BasicBlock::eraseFromParent() {
+  getParent()->getBasicBlockList().erase(this);
+}
+
+/// moveBefore - Unlink this basic block from its current function and
+/// insert it into the function that MovePos lives in, right before MovePos.
+void BasicBlock::moveBefore(BasicBlock *MovePos) {
+  MovePos->getParent()->getBasicBlockList().splice(MovePos,
+                       getParent()->getBasicBlockList(), this);
+}
+
+/// moveAfter - Unlink this basic block from its current function and
+/// insert it into the function that MovePos lives in, right after MovePos.
+void BasicBlock::moveAfter(BasicBlock *MovePos) {
+  Function::iterator I = MovePos;
+  MovePos->getParent()->getBasicBlockList().splice(++I,
+                                       getParent()->getBasicBlockList(), this);
+}
+
+
+TerminatorInst *BasicBlock::getTerminator() {
+  if (InstList.empty()) return 0;
+  return dyn_cast<TerminatorInst>(&InstList.back());
+}
+
+const TerminatorInst *BasicBlock::getTerminator() const {
+  if (InstList.empty()) return 0;
+  return dyn_cast<TerminatorInst>(&InstList.back());
+}
+
+Instruction* BasicBlock::getFirstNonPHI() {
+  BasicBlock::iterator i = begin();
+  // All valid basic blocks should have a terminator,
+  // which is not a PHINode. If we have an invalid basic
+  // block we'll get an assertion failure when dereferencing
+  // a past-the-end iterator.
+  while (isa<PHINode>(i)) ++i;
+  return &*i;
+}
+
+Instruction* BasicBlock::getFirstNonPHIOrDbg() {
+  BasicBlock::iterator i = begin();
+  // All valid basic blocks should have a terminator,
+  // which is not a PHINode. If we have an invalid basic
+  // block we'll get an assertion failure when dereferencing
+  // a past-the-end iterator.
+  while (isa<PHINode>(i) || isa<DbgInfoIntrinsic>(i)) ++i;
+  return &*i;
+}
+
+Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() {
+  // All valid basic blocks should have a terminator,
+  // which is not a PHINode. If we have an invalid basic
+  // block we'll get an assertion failure when dereferencing
+  // a past-the-end iterator.
+  BasicBlock::iterator i = begin();
+  for (;; ++i) {
+    if (isa<PHINode>(i) || isa<DbgInfoIntrinsic>(i))
+      continue;
+
+    const IntrinsicInst *II = dyn_cast<IntrinsicInst>(i);
+    if (!II)
+      break;
+    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+        II->getIntrinsicID() != Intrinsic::lifetime_end)
+      break;
+  }
+  return &*i;
+}
+
+BasicBlock::iterator BasicBlock::getFirstInsertionPt() {
+  iterator InsertPt = getFirstNonPHI();
+  if (isa<LandingPadInst>(InsertPt)) ++InsertPt;
+  return InsertPt;
+}
+
+void BasicBlock::dropAllReferences() {
+  for(iterator I = begin(), E = end(); I != E; ++I)
+    I->dropAllReferences();
+}
+
+/// getSinglePredecessor - If this basic block has a single predecessor block,
+/// return the block, otherwise return a null pointer.
+BasicBlock *BasicBlock::getSinglePredecessor() {
+  pred_iterator PI = pred_begin(this), E = pred_end(this);
+  if (PI == E) return 0;         // No preds.
+  BasicBlock *ThePred = *PI;
+  ++PI;
+  return (PI == E) ? ThePred : 0 /*multiple preds*/;
+}
+
+/// getUniquePredecessor - If this basic block has a unique predecessor block,
+/// return the block, otherwise return a null pointer.
+/// Note that unique predecessor doesn't mean single edge, there can be
+/// multiple edges from the unique predecessor to this block (for example
+/// a switch statement with multiple cases having the same destination).
+BasicBlock *BasicBlock::getUniquePredecessor() {
+  pred_iterator PI = pred_begin(this), E = pred_end(this);
+  if (PI == E) return 0; // No preds.
+  BasicBlock *PredBB = *PI;
+  ++PI;
+  for (;PI != E; ++PI) {
+    if (*PI != PredBB)
+      return 0;
+    // The same predecessor appears multiple times in the predecessor list.
+    // This is OK.
+  }
+  return PredBB;
+}
+
+/// removePredecessor - This method is used to notify a BasicBlock that the
+/// specified Predecessor of the block is no longer able to reach it.  This is
+/// actually not used to update the Predecessor list, but is actually used to
+/// update the PHI nodes that reside in the block.  Note that this should be
+/// called while the predecessor still refers to this block.
+///
+void BasicBlock::removePredecessor(BasicBlock *Pred,
+                                   bool DontDeleteUselessPHIs) {
+  assert((hasNUsesOrMore(16)||// Reduce cost of this assertion for complex CFGs.
+          find(pred_begin(this), pred_end(this), Pred) != pred_end(this)) &&
+         "removePredecessor: BB is not a predecessor!");
+
+  if (InstList.empty()) return;
+  PHINode *APN = dyn_cast<PHINode>(&front());
+  if (!APN) return;   // Quick exit.
+
+  // If there are exactly two predecessors, then we want to nuke the PHI nodes
+  // altogether.  However, we cannot do this, if this in this case:
+  //
+  //  Loop:
+  //    %x = phi [X, Loop]
+  //    %x2 = add %x, 1         ;; This would become %x2 = add %x2, 1
+  //    br Loop                 ;; %x2 does not dominate all uses
+  //
+  // This is because the PHI node input is actually taken from the predecessor
+  // basic block.  The only case this can happen is with a self loop, so we
+  // check for this case explicitly now.
+  //
+  unsigned max_idx = APN->getNumIncomingValues();
+  assert(max_idx != 0 && "PHI Node in block with 0 predecessors!?!?!");
+  if (max_idx == 2) {
+    BasicBlock *Other = APN->getIncomingBlock(APN->getIncomingBlock(0) == Pred);
+
+    // Disable PHI elimination!
+    if (this == Other) max_idx = 3;
+  }
+
+  // <= Two predecessors BEFORE I remove one?
+  if (max_idx <= 2 && !DontDeleteUselessPHIs) {
+    // Yup, loop through and nuke the PHI nodes
+    while (PHINode *PN = dyn_cast<PHINode>(&front())) {
+      // Remove the predecessor first.
+      PN->removeIncomingValue(Pred, !DontDeleteUselessPHIs);
+
+      // If the PHI _HAD_ two uses, replace PHI node with its now *single* value
+      if (max_idx == 2) {
+        if (PN->getIncomingValue(0) != PN)
+          PN->replaceAllUsesWith(PN->getIncomingValue(0));
+        else
+          // We are left with an infinite loop with no entries: kill the PHI.
+          PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+        getInstList().pop_front();    // Remove the PHI node
+      }
+
+      // If the PHI node already only had one entry, it got deleted by
+      // removeIncomingValue.
+    }
+  } else {
+    // Okay, now we know that we need to remove predecessor #pred_idx from all
+    // PHI nodes.  Iterate over each PHI node fixing them up
+    PHINode *PN;
+    for (iterator II = begin(); (PN = dyn_cast<PHINode>(II)); ) {
+      ++II;
+      PN->removeIncomingValue(Pred, false);
+      // If all incoming values to the Phi are the same, we can replace the Phi
+      // with that value.
+      Value* PNV = 0;
+      if (!DontDeleteUselessPHIs && (PNV = PN->hasConstantValue()))
+        if (PNV != PN) {
+          PN->replaceAllUsesWith(PNV);
+          PN->eraseFromParent();
+        }
+    }
+  }
+}
+
+
+/// splitBasicBlock - This splits a basic block into two at the specified
+/// instruction.  Note that all instructions BEFORE the specified iterator stay
+/// as part of the original basic block, an unconditional branch is added to
+/// the new BB, and the rest of the instructions in the BB are moved to the new
+/// BB, including the old terminator.  This invalidates the iterator.
+///
+/// Note that this only works on well formed basic blocks (must have a
+/// terminator), and 'I' must not be the end of instruction list (which would
+/// cause a degenerate basic block to be formed, having a terminator inside of
+/// the basic block).
+///
+BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) {
+  assert(getTerminator() && "Can't use splitBasicBlock on degenerate BB!");
+  assert(I != InstList.end() &&
+         "Trying to get me to create degenerate basic block!");
+
+  BasicBlock *InsertBefore = llvm::next(Function::iterator(this))
+                               .getNodePtrUnchecked();
+  BasicBlock *New = BasicBlock::Create(getContext(), BBName,
+                                       getParent(), InsertBefore);
+
+  // Move all of the specified instructions from the original basic block into
+  // the new basic block.
+  New->getInstList().splice(New->end(), this->getInstList(), I, end());
+
+  // Add a branch instruction to the newly formed basic block.
+  BranchInst::Create(New, this);
+
+  // Now we must loop through all of the successors of the New block (which
+  // _were_ the successors of the 'this' block), and update any PHI nodes in
+  // successors.  If there were PHI nodes in the successors, then they need to
+  // know that incoming branches will be from New, not from Old.
+  //
+  for (succ_iterator I = succ_begin(New), E = succ_end(New); I != E; ++I) {
+    // Loop over any phi nodes in the basic block, updating the BB field of
+    // incoming values...
+    BasicBlock *Successor = *I;
+    PHINode *PN;
+    for (BasicBlock::iterator II = Successor->begin();
+         (PN = dyn_cast<PHINode>(II)); ++II) {
+      int IDX = PN->getBasicBlockIndex(this);
+      while (IDX != -1) {
+        PN->setIncomingBlock((unsigned)IDX, New);
+        IDX = PN->getBasicBlockIndex(this);
+      }
+    }
+  }
+  return New;
+}
+
+void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) {
+  TerminatorInst *TI = getTerminator();
+  if (!TI)
+    // Cope with being called on a BasicBlock that doesn't have a terminator
+    // yet. Clang's CodeGenFunction::EmitReturnBlock() likes to do this.
+    return;
+  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+    BasicBlock *Succ = TI->getSuccessor(i);
+    // N.B. Succ might not be a complete BasicBlock, so don't assume
+    // that it ends with a non-phi instruction.
+    for (iterator II = Succ->begin(), IE = Succ->end(); II != IE; ++II) {
+      PHINode *PN = dyn_cast<PHINode>(II);
+      if (!PN)
+        break;
+      int i;
+      while ((i = PN->getBasicBlockIndex(this)) >= 0)
+        PN->setIncomingBlock(i, New);
+    }
+  }
+}
+
+/// isLandingPad - Return true if this basic block is a landing pad. I.e., it's
+/// the destination of the 'unwind' edge of an invoke instruction.
+bool BasicBlock::isLandingPad() const {
+  return isa<LandingPadInst>(getFirstNonPHI());
+}
+
+/// getLandingPadInst() - Return the landingpad instruction associated with
+/// the landing pad.
+LandingPadInst *BasicBlock::getLandingPadInst() {
+  return dyn_cast<LandingPadInst>(getFirstNonPHI());
+}
+const LandingPadInst *BasicBlock::getLandingPadInst() const {
+  return dyn_cast<LandingPadInst>(getFirstNonPHI());
+}
diff --git a/contrib/llvm/lib/IR/ConstantFold.cpp b/contrib/llvm/lib/IR/ConstantFold.cpp
new file mode 100644
index 000000000000..bf93d4f95663
--- /dev/null
+++ b/contrib/llvm/lib/IR/ConstantFold.cpp
@@ -0,0 +1,2074 @@
+//===- ConstantFold.cpp - LLVM constant folder ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements folding of constants for LLVM.  This implements the
+// (internal) ConstantFold.h interface, which is used by the
+// ConstantExpr::get* methods to automatically fold constants when possible.
+//
+// The current constant folding implementation is implemented in two pieces: the
+// pieces that don't need DataLayout, and the pieces that do. This is to avoid
+// a dependence in IR on Target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ConstantFold.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include <limits>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                ConstantFold*Instruction Implementations
+//===----------------------------------------------------------------------===//
+
+/// BitCastConstantVector - Convert the specified vector Constant node to the
+/// specified vector type.  At this point, we know that the elements of the
+/// input vector constant are all simple integer or FP values.
+static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {
+
+  if (CV->isAllOnesValue()) return Constant::getAllOnesValue(DstTy);
+  if (CV->isNullValue()) return Constant::getNullValue(DstTy);
+
+  // If this cast changes element count then we can't handle it here:
+  // doing so requires endianness information.  This should be handled by
+  // Analysis/ConstantFolding.cpp
+  unsigned NumElts = DstTy->getNumElements();
+  if (NumElts != CV->getType()->getVectorNumElements())
+    return 0;
+  
+  Type *DstEltTy = DstTy->getElementType();
+
+  SmallVector<Constant*, 16> Result;
+  Type *Ty = IntegerType::get(CV->getContext(), 32);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    Constant *C =
+      ConstantExpr::getExtractElement(CV, ConstantInt::get(Ty, i));
+    C = ConstantExpr::getBitCast(C, DstEltTy);
+    Result.push_back(C);
+  }
+
+  return ConstantVector::get(Result);
+}
+
+/// This function determines which opcode to use to fold two constant cast 
+/// expressions together. It uses CastInst::isEliminableCastPair to determine
+/// the opcode. Consequently its just a wrapper around that function.
+/// @brief Determine if it is valid to fold a cast of a cast
+static unsigned
+foldConstantCastPair(
+  unsigned opc,          ///< opcode of the second cast constant expression
+  ConstantExpr *Op,      ///< the first cast constant expression
+  Type *DstTy      ///< desintation type of the first cast
+) {
+  assert(Op && Op->isCast() && "Can't fold cast of cast without a cast!");
+  assert(DstTy && DstTy->isFirstClassType() && "Invalid cast destination type");
+  assert(CastInst::isCast(opc) && "Invalid cast opcode");
+
+  // The the types and opcodes for the two Cast constant expressions
+  Type *SrcTy = Op->getOperand(0)->getType();
+  Type *MidTy = Op->getType();
+  Instruction::CastOps firstOp = Instruction::CastOps(Op->getOpcode());
+  Instruction::CastOps secondOp = Instruction::CastOps(opc);
+
+  // Assume that pointers are never more than 64 bits wide.
+  IntegerType *FakeIntPtrTy = Type::getInt64Ty(DstTy->getContext());
+
+  // Let CastInst::isEliminableCastPair do the heavy lifting.
+  return CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy,
+                                        FakeIntPtrTy, FakeIntPtrTy,
+                                        FakeIntPtrTy);
+}
+
+static Constant *FoldBitCast(Constant *V, Type *DestTy) {
+  Type *SrcTy = V->getType();
+  if (SrcTy == DestTy)
+    return V; // no-op cast
+
+  // Check to see if we are casting a pointer to an aggregate to a pointer to
+  // the first element.  If so, return the appropriate GEP instruction.
+  if (PointerType *PTy = dyn_cast<PointerType>(V->getType()))
+    if (PointerType *DPTy = dyn_cast<PointerType>(DestTy))
+      if (PTy->getAddressSpace() == DPTy->getAddressSpace()
+          && DPTy->getElementType()->isSized()) {
+        SmallVector<Value*, 8> IdxList;
+        Value *Zero =
+          Constant::getNullValue(Type::getInt32Ty(DPTy->getContext()));
+        IdxList.push_back(Zero);
+        Type *ElTy = PTy->getElementType();
+        while (ElTy != DPTy->getElementType()) {
+          if (StructType *STy = dyn_cast<StructType>(ElTy)) {
+            if (STy->getNumElements() == 0) break;
+            ElTy = STy->getElementType(0);
+            IdxList.push_back(Zero);
+          } else if (SequentialType *STy = 
+                     dyn_cast<SequentialType>(ElTy)) {
+            if (ElTy->isPointerTy()) break;  // Can't index into pointers!
+            ElTy = STy->getElementType();
+            IdxList.push_back(Zero);
+          } else {
+            break;
+          }
+        }
+
+        if (ElTy == DPTy->getElementType())
+          // This GEP is inbounds because all indices are zero.
+          return ConstantExpr::getInBoundsGetElementPtr(V, IdxList);
+      }
+
+  // Handle casts from one vector constant to another.  We know that the src 
+  // and dest type have the same size (otherwise its an illegal cast).
+  if (VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
+    if (VectorType *SrcTy = dyn_cast<VectorType>(V->getType())) {
+      assert(DestPTy->getBitWidth() == SrcTy->getBitWidth() &&
+             "Not cast between same sized vectors!");
+      SrcTy = NULL;
+      // First, check for null.  Undef is already handled.
+      if (isa<ConstantAggregateZero>(V))
+        return Constant::getNullValue(DestTy);
+
+      // Handle ConstantVector and ConstantAggregateVector.
+      return BitCastConstantVector(V, DestPTy);
+    }
+
+    // Canonicalize scalar-to-vector bitcasts into vector-to-vector bitcasts
+    // This allows for other simplifications (although some of them
+    // can only be handled by Analysis/ConstantFolding.cpp).
+    if (isa<ConstantInt>(V) || isa<ConstantFP>(V))
+      return ConstantExpr::getBitCast(ConstantVector::get(V), DestPTy);
+  }
+
+  // Finally, implement bitcast folding now.   The code below doesn't handle
+  // bitcast right.
+  if (isa<ConstantPointerNull>(V))  // ptr->ptr cast.
+    return ConstantPointerNull::get(cast<PointerType>(DestTy));
+
+  // Handle integral constant input.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    if (DestTy->isIntegerTy())
+      // Integral -> Integral. This is a no-op because the bit widths must
+      // be the same. Consequently, we just fold to V.
+      return V;
+
+    if (DestTy->isFloatingPointTy())
+      return ConstantFP::get(DestTy->getContext(),
+                             APFloat(DestTy->getFltSemantics(),
+                                     CI->getValue()));
+
+    // Otherwise, can't fold this (vector?)
+    return 0;
+  }
+
+  // Handle ConstantFP input: FP -> Integral.
+  if (ConstantFP *FP = dyn_cast<ConstantFP>(V))
+    return ConstantInt::get(FP->getContext(),
+                            FP->getValueAPF().bitcastToAPInt());
+
+  return 0;
+}
+
+
+/// ExtractConstantBytes - V is an integer constant which only has a subset of
+/// its bytes used.  The bytes used are indicated by ByteStart (which is the
+/// first byte used, counting from the least significant byte) and ByteSize,
+/// which is the number of bytes used.
+///
+/// This function analyzes the specified constant to see if the specified byte
+/// range can be returned as a simplified constant.  If so, the constant is
+/// returned, otherwise null is returned.
+/// 
+static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
+                                      unsigned ByteSize) {
+  assert(C->getType()->isIntegerTy() &&
+         (cast<IntegerType>(C->getType())->getBitWidth() & 7) == 0 &&
+         "Non-byte sized integer input");
+  unsigned CSize = cast<IntegerType>(C->getType())->getBitWidth()/8;
+  assert(ByteSize && "Must be accessing some piece");
+  assert(ByteStart+ByteSize <= CSize && "Extracting invalid piece from input");
+  assert(ByteSize != CSize && "Should not extract everything");
+  
+  // Constant Integers are simple.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+    APInt V = CI->getValue();
+    if (ByteStart)
+      V = V.lshr(ByteStart*8);
+    V = V.trunc(ByteSize*8);
+    return ConstantInt::get(CI->getContext(), V);
+  }
+  
+  // In the input is a constant expr, we might be able to recursively simplify.
+  // If not, we definitely can't do anything.
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+  if (CE == 0) return 0;
+  
+  switch (CE->getOpcode()) {
+  default: return 0;
+  case Instruction::Or: {
+    Constant *RHS = ExtractConstantBytes(CE->getOperand(1), ByteStart,ByteSize);
+    if (RHS == 0)
+      return 0;
+    
+    // X | -1 -> -1.
+    if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS))
+      if (RHSC->isAllOnesValue())
+        return RHSC;
+    
+    Constant *LHS = ExtractConstantBytes(CE->getOperand(0), ByteStart,ByteSize);
+    if (LHS == 0)
+      return 0;
+    return ConstantExpr::getOr(LHS, RHS);
+  }
+  case Instruction::And: {
+    Constant *RHS = ExtractConstantBytes(CE->getOperand(1), ByteStart,ByteSize);
+    if (RHS == 0)
+      return 0;
+    
+    // X & 0 -> 0.
+    if (RHS->isNullValue())
+      return RHS;
+    
+    Constant *LHS = ExtractConstantBytes(CE->getOperand(0), ByteStart,ByteSize);
+    if (LHS == 0)
+      return 0;
+    return ConstantExpr::getAnd(LHS, RHS);
+  }
+  case Instruction::LShr: {
+    ConstantInt *Amt = dyn_cast<ConstantInt>(CE->getOperand(1));
+    if (Amt == 0)
+      return 0;
+    unsigned ShAmt = Amt->getZExtValue();
+    // Cannot analyze non-byte shifts.
+    if ((ShAmt & 7) != 0)
+      return 0;
+    ShAmt >>= 3;
+    
+    // If the extract is known to be all zeros, return zero.
+    if (ByteStart >= CSize-ShAmt)
+      return Constant::getNullValue(IntegerType::get(CE->getContext(),
+                                                     ByteSize*8));
+    // If the extract is known to be fully in the input, extract it.
+    if (ByteStart+ByteSize+ShAmt <= CSize)
+      return ExtractConstantBytes(CE->getOperand(0), ByteStart+ShAmt, ByteSize);
+    
+    // TODO: Handle the 'partially zero' case.
+    return 0;
+  }
+    
+  case Instruction::Shl: {
+    ConstantInt *Amt = dyn_cast<ConstantInt>(CE->getOperand(1));
+    if (Amt == 0)
+      return 0;
+    unsigned ShAmt = Amt->getZExtValue();
+    // Cannot analyze non-byte shifts.
+    if ((ShAmt & 7) != 0)
+      return 0;
+    ShAmt >>= 3;
+    
+    // If the extract is known to be all zeros, return zero.
+    if (ByteStart+ByteSize <= ShAmt)
+      return Constant::getNullValue(IntegerType::get(CE->getContext(),
+                                                     ByteSize*8));
+    // If the extract is known to be fully in the input, extract it.
+    if (ByteStart >= ShAmt)
+      return ExtractConstantBytes(CE->getOperand(0), ByteStart-ShAmt, ByteSize);
+    
+    // TODO: Handle the 'partially zero' case.
+    return 0;
+  }
+      
+  case Instruction::ZExt: {
+    unsigned SrcBitSize =
+      cast<IntegerType>(CE->getOperand(0)->getType())->getBitWidth();
+    
+    // If extracting something that is completely zero, return 0.
+    if (ByteStart*8 >= SrcBitSize)
+      return Constant::getNullValue(IntegerType::get(CE->getContext(),
+                                                     ByteSize*8));
+
+    // If exactly extracting the input, return it.
+    if (ByteStart == 0 && ByteSize*8 == SrcBitSize)
+      return CE->getOperand(0);
+    
+    // If extracting something completely in the input, if if the input is a
+    // multiple of 8 bits, recurse.
+    if ((SrcBitSize&7) == 0 && (ByteStart+ByteSize)*8 <= SrcBitSize)
+      return ExtractConstantBytes(CE->getOperand(0), ByteStart, ByteSize);
+      
+    // Otherwise, if extracting a subset of the input, which is not multiple of
+    // 8 bits, do a shift and trunc to get the bits.
+    if ((ByteStart+ByteSize)*8 < SrcBitSize) {
+      assert((SrcBitSize&7) && "Shouldn't get byte sized case here");
+      Constant *Res = CE->getOperand(0);
+      if (ByteStart)
+        Res = ConstantExpr::getLShr(Res, 
+                                 ConstantInt::get(Res->getType(), ByteStart*8));
+      return ConstantExpr::getTrunc(Res, IntegerType::get(C->getContext(),
+                                                          ByteSize*8));
+    }
+    
+    // TODO: Handle the 'partially zero' case.
+    return 0;
+  }
+  }
+}
+
+/// getFoldedSizeOf - Return a ConstantExpr with type DestTy for sizeof
+/// on Ty, with any known factors factored out. If Folded is false,
+/// return null if no factoring was possible, to avoid endlessly
+/// bouncing an unfoldable expression back into the top-level folder.
+///
+static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy,
+                                 bool Folded) {
+  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    Constant *N = ConstantInt::get(DestTy, ATy->getNumElements());
+    Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true);
+    return ConstantExpr::getNUWMul(E, N);
+  }
+
+  if (StructType *STy = dyn_cast<StructType>(Ty))
+    if (!STy->isPacked()) {
+      unsigned NumElems = STy->getNumElements();
+      // An empty struct has size zero.
+      if (NumElems == 0)
+        return ConstantExpr::getNullValue(DestTy);
+      // Check for a struct with all members having the same size.
+      Constant *MemberSize =
+        getFoldedSizeOf(STy->getElementType(0), DestTy, true);
+      bool AllSame = true;
+      for (unsigned i = 1; i != NumElems; ++i)
+        if (MemberSize !=
+            getFoldedSizeOf(STy->getElementType(i), DestTy, true)) {
+          AllSame = false;
+          break;
+        }
+      if (AllSame) {
+        Constant *N = ConstantInt::get(DestTy, NumElems);
+        return ConstantExpr::getNUWMul(MemberSize, N);
+      }
+    }
+
+  // Pointer size doesn't depend on the pointee type, so canonicalize them
+  // to an arbitrary pointee.
+  if (PointerType *PTy = dyn_cast<PointerType>(Ty))
+    if (!PTy->getElementType()->isIntegerTy(1))
+      return
+        getFoldedSizeOf(PointerType::get(IntegerType::get(PTy->getContext(), 1),
+                                         PTy->getAddressSpace()),
+                        DestTy, true);
+
+  // If there's no interesting folding happening, bail so that we don't create
+  // a constant that looks like it needs folding but really doesn't.
+  if (!Folded)
+    return 0;
+
+  // Base case: Get a regular sizeof expression.
+  Constant *C = ConstantExpr::getSizeOf(Ty);
+  C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+                                                    DestTy, false),
+                            C, DestTy);
+  return C;
+}
+
+/// getFoldedAlignOf - Return a ConstantExpr with type DestTy for alignof
+/// on Ty, with any known factors factored out. If Folded is false,
+/// return null if no factoring was possible, to avoid endlessly
+/// bouncing an unfoldable expression back into the top-level folder.
+///
+static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy,
+                                  bool Folded) {
+  // The alignment of an array is equal to the alignment of the
+  // array element. Note that this is not always true for vectors.
+  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    Constant *C = ConstantExpr::getAlignOf(ATy->getElementType());
+    C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+                                                      DestTy,
+                                                      false),
+                              C, DestTy);
+    return C;
+  }
+
+  if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    // Packed structs always have an alignment of 1.
+    if (STy->isPacked())
+      return ConstantInt::get(DestTy, 1);
+
+    // Otherwise, struct alignment is the maximum alignment of any member.
+    // Without target data, we can't compare much, but we can check to see
+    // if all the members have the same alignment.
+    unsigned NumElems = STy->getNumElements();
+    // An empty struct has minimal alignment.
+    if (NumElems == 0)
+      return ConstantInt::get(DestTy, 1);
+    // Check for a struct with all members having the same alignment.
+    Constant *MemberAlign =
+      getFoldedAlignOf(STy->getElementType(0), DestTy, true);
+    bool AllSame = true;
+    for (unsigned i = 1; i != NumElems; ++i)
+      if (MemberAlign != getFoldedAlignOf(STy->getElementType(i), DestTy, true)) {
+        AllSame = false;
+        break;
+      }
+    if (AllSame)
+      return MemberAlign;
+  }
+
+  // Pointer alignment doesn't depend on the pointee type, so canonicalize them
+  // to an arbitrary pointee.
+  if (PointerType *PTy = dyn_cast<PointerType>(Ty))
+    if (!PTy->getElementType()->isIntegerTy(1))
+      return
+        getFoldedAlignOf(PointerType::get(IntegerType::get(PTy->getContext(),
+                                                           1),
+                                          PTy->getAddressSpace()),
+                         DestTy, true);
+
+  // If there's no interesting folding happening, bail so that we don't create
+  // a constant that looks like it needs folding but really doesn't.
+  if (!Folded)
+    return 0;
+
+  // Base case: Get a regular alignof expression.
+  Constant *C = ConstantExpr::getAlignOf(Ty);
+  C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+                                                    DestTy, false),
+                            C, DestTy);
+  return C;
+}
+
+/// getFoldedOffsetOf - Return a ConstantExpr with type DestTy for offsetof
+/// on Ty and FieldNo, with any known factors factored out. If Folded is false,
+/// return null if no factoring was possible, to avoid endlessly
+/// bouncing an unfoldable expression back into the top-level folder.
+///
+static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo,
+                                   Type *DestTy,
+                                   bool Folded) {
+  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo, false,
+                                                                DestTy, false),
+                                        FieldNo, DestTy);
+    Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true);
+    return ConstantExpr::getNUWMul(E, N);
+  }
+
+  if (StructType *STy = dyn_cast<StructType>(Ty))
+    if (!STy->isPacked()) {
+      unsigned NumElems = STy->getNumElements();
+      // An empty struct has no members.
+      if (NumElems == 0)
+        return 0;
+      // Check for a struct with all members having the same size.
+      Constant *MemberSize =
+        getFoldedSizeOf(STy->getElementType(0), DestTy, true);
+      bool AllSame = true;
+      for (unsigned i = 1; i != NumElems; ++i)
+        if (MemberSize !=
+            getFoldedSizeOf(STy->getElementType(i), DestTy, true)) {
+          AllSame = false;
+          break;
+        }
+      if (AllSame) {
+        Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo,
+                                                                    false,
+                                                                    DestTy,
+                                                                    false),
+                                            FieldNo, DestTy);
+        return ConstantExpr::getNUWMul(MemberSize, N);
+      }
+    }
+
+  // If there's no interesting folding happening, bail so that we don't create
+  // a constant that looks like it needs folding but really doesn't.
+  if (!Folded)
+    return 0;
+
+  // Base case: Get a regular offsetof expression.
+  Constant *C = ConstantExpr::getOffsetOf(Ty, FieldNo);
+  C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+                                                    DestTy, false),
+                            C, DestTy);
+  return C;
+}
+
+Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
+                                            Type *DestTy) {
+  if (isa<UndefValue>(V)) {
+    // zext(undef) = 0, because the top bits will be zero.
+    // sext(undef) = 0, because the top bits will all be the same.
+    // [us]itofp(undef) = 0, because the result value is bounded.
+    if (opc == Instruction::ZExt || opc == Instruction::SExt ||
+        opc == Instruction::UIToFP || opc == Instruction::SIToFP)
+      return Constant::getNullValue(DestTy);
+    return UndefValue::get(DestTy);
+  }
+
+  if (V->isNullValue() && !DestTy->isX86_MMXTy())
+    return Constant::getNullValue(DestTy);
+
+  // If the cast operand is a constant expression, there's a few things we can
+  // do to try to simplify it.
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+    if (CE->isCast()) {
+      // Try hard to fold cast of cast because they are often eliminable.
+      if (unsigned newOpc = foldConstantCastPair(opc, CE, DestTy))
+        return ConstantExpr::getCast(newOpc, CE->getOperand(0), DestTy);
+    } else if (CE->getOpcode() == Instruction::GetElementPtr) {
+      // If all of the indexes in the GEP are null values, there is no pointer
+      // adjustment going on.  We might as well cast the source pointer.
+      bool isAllNull = true;
+      for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
+        if (!CE->getOperand(i)->isNullValue()) {
+          isAllNull = false;
+          break;
+        }
+      if (isAllNull)
+        // This is casting one pointer type to another, always BitCast
+        return ConstantExpr::getPointerCast(CE->getOperand(0), DestTy);
+    }
+  }
+
+  // If the cast operand is a constant vector, perform the cast by
+  // operating on each element. In the cast of bitcasts, the element
+  // count may be mismatched; don't attempt to handle that here.
+  if ((isa<ConstantVector>(V) || isa<ConstantDataVector>(V)) &&
+      DestTy->isVectorTy() &&
+      DestTy->getVectorNumElements() == V->getType()->getVectorNumElements()) {
+    SmallVector<Constant*, 16> res;
+    VectorType *DestVecTy = cast<VectorType>(DestTy);
+    Type *DstEltTy = DestVecTy->getElementType();
+    Type *Ty = IntegerType::get(V->getContext(), 32);
+    for (unsigned i = 0, e = V->getType()->getVectorNumElements(); i != e; ++i) {
+      Constant *C =
+        ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i));
+      res.push_back(ConstantExpr::getCast(opc, C, DstEltTy));
+    }
+    return ConstantVector::get(res);
+  }
+
+  // We actually have to do a cast now. Perform the cast according to the
+  // opcode specified.
+  switch (opc) {
+  default:
+    llvm_unreachable("Failed to cast constant expression");
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+    if (ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
+      bool ignored;
+      APFloat Val = FPC->getValueAPF();
+      Val.convert(DestTy->isHalfTy() ? APFloat::IEEEhalf :
+                  DestTy->isFloatTy() ? APFloat::IEEEsingle :
+                  DestTy->isDoubleTy() ? APFloat::IEEEdouble :
+                  DestTy->isX86_FP80Ty() ? APFloat::x87DoubleExtended :
+                  DestTy->isFP128Ty() ? APFloat::IEEEquad :
+                  DestTy->isPPC_FP128Ty() ? APFloat::PPCDoubleDouble :
+                  APFloat::Bogus,
+                  APFloat::rmNearestTiesToEven, &ignored);
+      return ConstantFP::get(V->getContext(), Val);
+    }
+    return 0; // Can't fold.
+  case Instruction::FPToUI: 
+  case Instruction::FPToSI:
+    if (ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
+      const APFloat &V = FPC->getValueAPF();
+      bool ignored;
+      uint64_t x[2]; 
+      uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+      (void) V.convertToInteger(x, DestBitWidth, opc==Instruction::FPToSI,
+                                APFloat::rmTowardZero, &ignored);
+      APInt Val(DestBitWidth, x);
+      return ConstantInt::get(FPC->getContext(), Val);
+    }
+    return 0; // Can't fold.
+  case Instruction::IntToPtr:   //always treated as unsigned
+    if (V->isNullValue())       // Is it an integral null value?
+      return ConstantPointerNull::get(cast<PointerType>(DestTy));
+    return 0;                   // Other pointer types cannot be casted
+  case Instruction::PtrToInt:   // always treated as unsigned
+    // Is it a null pointer value?
+    if (V->isNullValue())
+      return ConstantInt::get(DestTy, 0);
+    // If this is a sizeof-like expression, pull out multiplications by
+    // known factors to expose them to subsequent folding. If it's an
+    // alignof-like expression, factor out known factors.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+      if (CE->getOpcode() == Instruction::GetElementPtr &&
+          CE->getOperand(0)->isNullValue()) {
+        Type *Ty =
+          cast<PointerType>(CE->getOperand(0)->getType())->getElementType();
+        if (CE->getNumOperands() == 2) {
+          // Handle a sizeof-like expression.
+          Constant *Idx = CE->getOperand(1);
+          bool isOne = isa<ConstantInt>(Idx) && cast<ConstantInt>(Idx)->isOne();
+          if (Constant *C = getFoldedSizeOf(Ty, DestTy, !isOne)) {
+            Idx = ConstantExpr::getCast(CastInst::getCastOpcode(Idx, true,
+                                                                DestTy, false),
+                                        Idx, DestTy);
+            return ConstantExpr::getMul(C, Idx);
+          }
+        } else if (CE->getNumOperands() == 3 &&
+                   CE->getOperand(1)->isNullValue()) {
+          // Handle an alignof-like expression.
+          if (StructType *STy = dyn_cast<StructType>(Ty))
+            if (!STy->isPacked()) {
+              ConstantInt *CI = cast<ConstantInt>(CE->getOperand(2));
+              if (CI->isOne() &&
+                  STy->getNumElements() == 2 &&
+                  STy->getElementType(0)->isIntegerTy(1)) {
+                return getFoldedAlignOf(STy->getElementType(1), DestTy, false);
+              }
+            }
+          // Handle an offsetof-like expression.
+          if (Ty->isStructTy() || Ty->isArrayTy()) {
+            if (Constant *C = getFoldedOffsetOf(Ty, CE->getOperand(2),
+                                                DestTy, false))
+              return C;
+          }
+        }
+      }
+    // Other pointer types cannot be casted
+    return 0;
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      APInt api = CI->getValue();
+      APFloat apf(DestTy->getFltSemantics(),
+                  APInt::getNullValue(DestTy->getPrimitiveSizeInBits()));
+      (void)apf.convertFromAPInt(api, 
+                                 opc==Instruction::SIToFP,
+                                 APFloat::rmNearestTiesToEven);
+      return ConstantFP::get(V->getContext(), apf);
+    }
+    return 0;
+  case Instruction::ZExt:
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      uint32_t BitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+      return ConstantInt::get(V->getContext(),
+                              CI->getValue().zext(BitWidth));
+    }
+    return 0;
+  case Instruction::SExt:
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      uint32_t BitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+      return ConstantInt::get(V->getContext(),
+                              CI->getValue().sext(BitWidth));
+    }
+    return 0;
+  case Instruction::Trunc: {
+    uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+      return ConstantInt::get(V->getContext(),
+                              CI->getValue().trunc(DestBitWidth));
+    }
+    
+    // The input must be a constantexpr.  See if we can simplify this based on
+    // the bytes we are demanding.  Only do this if the source and dest are an
+    // even multiple of a byte.
+    if ((DestBitWidth & 7) == 0 &&
+        (cast<IntegerType>(V->getType())->getBitWidth() & 7) == 0)
+      if (Constant *Res = ExtractConstantBytes(V, 0, DestBitWidth / 8))
+        return Res;
+      
+    return 0;
+  }
+  case Instruction::BitCast:
+    return FoldBitCast(V, DestTy);
+  }
+}
+
+Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
+                                              Constant *V1, Constant *V2) {
+  // Check for i1 and vector true/false conditions.
+  if (Cond->isNullValue()) return V2;
+  if (Cond->isAllOnesValue()) return V1;
+
+  // If the condition is a vector constant, fold the result elementwise.
+  if (ConstantVector *CondV = dyn_cast<ConstantVector>(Cond)) {
+    SmallVector<Constant*, 16> Result;
+    Type *Ty = IntegerType::get(CondV->getContext(), 32);
+    for (unsigned i = 0, e = V1->getType()->getVectorNumElements(); i != e;++i){
+      ConstantInt *Cond = dyn_cast<ConstantInt>(CondV->getOperand(i));
+      if (Cond == 0) break;
+      
+      Constant *V = Cond->isNullValue() ? V2 : V1;
+      Constant *Res = ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i));
+      Result.push_back(Res);
+    }
+    
+    // If we were able to build the vector, return it.
+    if (Result.size() == V1->getType()->getVectorNumElements())
+      return ConstantVector::get(Result);
+  }
+
+  if (isa<UndefValue>(Cond)) {
+    if (isa<UndefValue>(V1)) return V1;
+    return V2;
+  }
+  if (isa<UndefValue>(V1)) return V2;
+  if (isa<UndefValue>(V2)) return V1;
+  if (V1 == V2) return V1;
+
+  if (ConstantExpr *TrueVal = dyn_cast<ConstantExpr>(V1)) {
+    if (TrueVal->getOpcode() == Instruction::Select)
+      if (TrueVal->getOperand(0) == Cond)
+        return ConstantExpr::getSelect(Cond, TrueVal->getOperand(1), V2);
+  }
+  if (ConstantExpr *FalseVal = dyn_cast<ConstantExpr>(V2)) {
+    if (FalseVal->getOpcode() == Instruction::Select)
+      if (FalseVal->getOperand(0) == Cond)
+        return ConstantExpr::getSelect(Cond, V1, FalseVal->getOperand(2));
+  }
+
+  return 0;
+}
+
+Constant *llvm::ConstantFoldExtractElementInstruction(Constant *Val,
+                                                      Constant *Idx) {
+  if (isa<UndefValue>(Val))  // ee(undef, x) -> undef
+    return UndefValue::get(Val->getType()->getVectorElementType());
+  if (Val->isNullValue())  // ee(zero, x) -> zero
+    return Constant::getNullValue(Val->getType()->getVectorElementType());
+  // ee({w,x,y,z}, undef) -> undef
+  if (isa<UndefValue>(Idx))
+    return UndefValue::get(Val->getType()->getVectorElementType());
+
+  if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx)) {
+    uint64_t Index = CIdx->getZExtValue();
+    // ee({w,x,y,z}, wrong_value) -> undef
+    if (Index >= Val->getType()->getVectorNumElements())
+      return UndefValue::get(Val->getType()->getVectorElementType());
+    return Val->getAggregateElement(Index);
+  }
+  return 0;
+}
+
+Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val,
+                                                     Constant *Elt,
+                                                     Constant *Idx) {
+  ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx);
+  if (!CIdx) return 0;
+  const APInt &IdxVal = CIdx->getValue();
+  
+  SmallVector<Constant*, 16> Result;
+  Type *Ty = IntegerType::get(Val->getContext(), 32);
+  for (unsigned i = 0, e = Val->getType()->getVectorNumElements(); i != e; ++i){
+    if (i == IdxVal) {
+      Result.push_back(Elt);
+      continue;
+    }
+    
+    Constant *C =
+      ConstantExpr::getExtractElement(Val, ConstantInt::get(Ty, i));
+    Result.push_back(C);
+  }
+  
+  return ConstantVector::get(Result);
+}
+
+Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1,
+                                                     Constant *V2,
+                                                     Constant *Mask) {
+  unsigned MaskNumElts = Mask->getType()->getVectorNumElements();
+  Type *EltTy = V1->getType()->getVectorElementType();
+
+  // Undefined shuffle mask -> undefined value.
+  if (isa<UndefValue>(Mask))
+    return UndefValue::get(VectorType::get(EltTy, MaskNumElts));
+
+  // Don't break the bitcode reader hack.
+  if (isa<ConstantExpr>(Mask)) return 0;
+  
+  unsigned SrcNumElts = V1->getType()->getVectorNumElements();
+
+  // Loop over the shuffle mask, evaluating each element.
+  SmallVector<Constant*, 32> Result;
+  for (unsigned i = 0; i != MaskNumElts; ++i) {
+    int Elt = ShuffleVectorInst::getMaskValue(Mask, i);
+    if (Elt == -1) {
+      Result.push_back(UndefValue::get(EltTy));
+      continue;
+    }
+    Constant *InElt;
+    if (unsigned(Elt) >= SrcNumElts*2)
+      InElt = UndefValue::get(EltTy);
+    else if (unsigned(Elt) >= SrcNumElts) {
+      Type *Ty = IntegerType::get(V2->getContext(), 32);
+      InElt =
+        ConstantExpr::getExtractElement(V2,
+                                        ConstantInt::get(Ty, Elt - SrcNumElts));
+    } else {
+      Type *Ty = IntegerType::get(V1->getContext(), 32);
+      InElt = ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, Elt));
+    }
+    Result.push_back(InElt);
+  }
+
+  return ConstantVector::get(Result);
+}
+
+Constant *llvm::ConstantFoldExtractValueInstruction(Constant *Agg,
+                                                    ArrayRef<unsigned> Idxs) {
+  // Base case: no indices, so return the entire value.
+  if (Idxs.empty())
+    return Agg;
+
+  if (Constant *C = Agg->getAggregateElement(Idxs[0]))
+    return ConstantFoldExtractValueInstruction(C, Idxs.slice(1));
+
+  return 0;
+}
+
+Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
+                                                   Constant *Val,
+                                                   ArrayRef<unsigned> Idxs) {
+  // Base case: no indices, so replace the entire value.
+  if (Idxs.empty())
+    return Val;
+
+  unsigned NumElts;
+  if (StructType *ST = dyn_cast<StructType>(Agg->getType()))
+    NumElts = ST->getNumElements();
+  else if (ArrayType *AT = dyn_cast<ArrayType>(Agg->getType()))
+    NumElts = AT->getNumElements();
+  else
+    NumElts = Agg->getType()->getVectorNumElements();
+
+  SmallVector<Constant*, 32> Result;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    Constant *C = Agg->getAggregateElement(i);
+    if (C == 0) return 0;
+    
+    if (Idxs[0] == i)
+      C = ConstantFoldInsertValueInstruction(C, Val, Idxs.slice(1));
+    
+    Result.push_back(C);
+  }
+  
+  if (StructType *ST = dyn_cast<StructType>(Agg->getType()))
+    return ConstantStruct::get(ST, Result);
+  if (ArrayType *AT = dyn_cast<ArrayType>(Agg->getType()))
+    return ConstantArray::get(AT, Result);
+  return ConstantVector::get(Result);
+}
+
+
+Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
+                                              Constant *C1, Constant *C2) {
+  // Handle UndefValue up front.
+  if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) {
+    switch (Opcode) {
+    case Instruction::Xor:
+      if (isa<UndefValue>(C1) && isa<UndefValue>(C2))
+        // Handle undef ^ undef -> 0 special case. This is a common
+        // idiom (misuse).
+        return Constant::getNullValue(C1->getType());
+      // Fallthrough
+    case Instruction::Add:
+    case Instruction::Sub:
+      return UndefValue::get(C1->getType());
+    case Instruction::And:
+      if (isa<UndefValue>(C1) && isa<UndefValue>(C2)) // undef & undef -> undef
+        return C1;
+      return Constant::getNullValue(C1->getType());   // undef & X -> 0
+    case Instruction::Mul: {
+      ConstantInt *CI;
+      // X * undef -> undef   if X is odd or undef
+      if (((CI = dyn_cast<ConstantInt>(C1)) && CI->getValue()[0]) ||
+          ((CI = dyn_cast<ConstantInt>(C2)) && CI->getValue()[0]) ||
+          (isa<UndefValue>(C1) && isa<UndefValue>(C2)))
+        return UndefValue::get(C1->getType());
+
+      // X * undef -> 0       otherwise
+      return Constant::getNullValue(C1->getType());
+    }
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+      // undef / 1 -> undef
+      if (Opcode == Instruction::UDiv || Opcode == Instruction::SDiv)
+        if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2))
+          if (CI2->isOne())
+            return C1;
+      // FALL THROUGH
+    case Instruction::URem:
+    case Instruction::SRem:
+      if (!isa<UndefValue>(C2))                    // undef / X -> 0
+        return Constant::getNullValue(C1->getType());
+      return C2;                                   // X / undef -> undef
+    case Instruction::Or:                          // X | undef -> -1
+      if (isa<UndefValue>(C1) && isa<UndefValue>(C2)) // undef | undef -> undef
+        return C1;
+      return Constant::getAllOnesValue(C1->getType()); // undef | X -> ~0
+    case Instruction::LShr:
+      if (isa<UndefValue>(C2) && isa<UndefValue>(C1))
+        return C1;                                  // undef lshr undef -> undef
+      return Constant::getNullValue(C1->getType()); // X lshr undef -> 0
+                                                    // undef lshr X -> 0
+    case Instruction::AShr:
+      if (!isa<UndefValue>(C2))                     // undef ashr X --> all ones
+        return Constant::getAllOnesValue(C1->getType());
+      else if (isa<UndefValue>(C1)) 
+        return C1;                                  // undef ashr undef -> undef
+      else
+        return C1;                                  // X ashr undef --> X
+    case Instruction::Shl:
+      if (isa<UndefValue>(C2) && isa<UndefValue>(C1))
+        return C1;                                  // undef shl undef -> undef
+      // undef << X -> 0   or   X << undef -> 0
+      return Constant::getNullValue(C1->getType());
+    }
+  }
+
+  // Handle simplifications when the RHS is a constant int.
+  if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
+    switch (Opcode) {
+    case Instruction::Add:
+      if (CI2->equalsInt(0)) return C1;                         // X + 0 == X
+      break;
+    case Instruction::Sub:
+      if (CI2->equalsInt(0)) return C1;                         // X - 0 == X
+      break;
+    case Instruction::Mul:
+      if (CI2->equalsInt(0)) return C2;                         // X * 0 == 0
+      if (CI2->equalsInt(1))
+        return C1;                                              // X * 1 == X
+      break;
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+      if (CI2->equalsInt(1))
+        return C1;                                            // X / 1 == X
+      if (CI2->equalsInt(0))
+        return UndefValue::get(CI2->getType());               // X / 0 == undef
+      break;
+    case Instruction::URem:
+    case Instruction::SRem:
+      if (CI2->equalsInt(1))
+        return Constant::getNullValue(CI2->getType());        // X % 1 == 0
+      if (CI2->equalsInt(0))
+        return UndefValue::get(CI2->getType());               // X % 0 == undef
+      break;
+    case Instruction::And:
+      if (CI2->isZero()) return C2;                           // X & 0 == 0
+      if (CI2->isAllOnesValue())
+        return C1;                                            // X & -1 == X
+
+      if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
+        // (zext i32 to i64) & 4294967295 -> (zext i32 to i64)
+        if (CE1->getOpcode() == Instruction::ZExt) {
+          unsigned DstWidth = CI2->getType()->getBitWidth();
+          unsigned SrcWidth =
+            CE1->getOperand(0)->getType()->getPrimitiveSizeInBits();
+          APInt PossiblySetBits(APInt::getLowBitsSet(DstWidth, SrcWidth));
+          if ((PossiblySetBits & CI2->getValue()) == PossiblySetBits)
+            return C1;
+        }
+
+        // If and'ing the address of a global with a constant, fold it.
+        if (CE1->getOpcode() == Instruction::PtrToInt && 
+            isa<GlobalValue>(CE1->getOperand(0))) {
+          GlobalValue *GV = cast<GlobalValue>(CE1->getOperand(0));
+
+          // Functions are at least 4-byte aligned.
+          unsigned GVAlign = GV->getAlignment();
+          if (isa<Function>(GV))
+            GVAlign = std::max(GVAlign, 4U);
+
+          if (GVAlign > 1) {
+            unsigned DstWidth = CI2->getType()->getBitWidth();
+            unsigned SrcWidth = std::min(DstWidth, Log2_32(GVAlign));
+            APInt BitsNotSet(APInt::getLowBitsSet(DstWidth, SrcWidth));
+
+            // If checking bits we know are clear, return zero.
+            if ((CI2->getValue() & BitsNotSet) == CI2->getValue())
+              return Constant::getNullValue(CI2->getType());
+          }
+        }
+      }
+      break;
+    case Instruction::Or:
+      if (CI2->equalsInt(0)) return C1;    // X | 0 == X
+      if (CI2->isAllOnesValue())
+        return C2;                         // X | -1 == -1
+      break;
+    case Instruction::Xor:
+      if (CI2->equalsInt(0)) return C1;    // X ^ 0 == X
+
+      if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
+        switch (CE1->getOpcode()) {
+        default: break;
+        case Instruction::ICmp:
+        case Instruction::FCmp:
+          // cmp pred ^ true -> cmp !pred
+          assert(CI2->equalsInt(1));
+          CmpInst::Predicate pred = (CmpInst::Predicate)CE1->getPredicate();
+          pred = CmpInst::getInversePredicate(pred);
+          return ConstantExpr::getCompare(pred, CE1->getOperand(0),
+                                          CE1->getOperand(1));
+        }
+      }
+      break;
+    case Instruction::AShr:
+      // ashr (zext C to Ty), C2 -> lshr (zext C, CSA), C2
+      if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1))
+        if (CE1->getOpcode() == Instruction::ZExt)  // Top bits known zero.
+          return ConstantExpr::getLShr(C1, C2);
+      break;
+    }
+  } else if (isa<ConstantInt>(C1)) {
+    // If C1 is a ConstantInt and C2 is not, swap the operands.
+    if (Instruction::isCommutative(Opcode))
+      return ConstantExpr::get(Opcode, C2, C1);
+  }
+
+  // At this point we know neither constant is an UndefValue.
+  if (ConstantInt *CI1 = dyn_cast<ConstantInt>(C1)) {
+    if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
+      const APInt &C1V = CI1->getValue();
+      const APInt &C2V = CI2->getValue();
+      switch (Opcode) {
+      default:
+        break;
+      case Instruction::Add:     
+        return ConstantInt::get(CI1->getContext(), C1V + C2V);
+      case Instruction::Sub:     
+        return ConstantInt::get(CI1->getContext(), C1V - C2V);
+      case Instruction::Mul:     
+        return ConstantInt::get(CI1->getContext(), C1V * C2V);
+      case Instruction::UDiv:
+        assert(!CI2->isNullValue() && "Div by zero handled above");
+        return ConstantInt::get(CI1->getContext(), C1V.udiv(C2V));
+      case Instruction::SDiv:
+        assert(!CI2->isNullValue() && "Div by zero handled above");
+        if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
+          return UndefValue::get(CI1->getType());   // MIN_INT / -1 -> undef
+        return ConstantInt::get(CI1->getContext(), C1V.sdiv(C2V));
+      case Instruction::URem:
+        assert(!CI2->isNullValue() && "Div by zero handled above");
+        return ConstantInt::get(CI1->getContext(), C1V.urem(C2V));
+      case Instruction::SRem:
+        assert(!CI2->isNullValue() && "Div by zero handled above");
+        if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
+          return UndefValue::get(CI1->getType());   // MIN_INT % -1 -> undef
+        return ConstantInt::get(CI1->getContext(), C1V.srem(C2V));
+      case Instruction::And:
+        return ConstantInt::get(CI1->getContext(), C1V & C2V);
+      case Instruction::Or:
+        return ConstantInt::get(CI1->getContext(), C1V | C2V);
+      case Instruction::Xor:
+        return ConstantInt::get(CI1->getContext(), C1V ^ C2V);
+      case Instruction::Shl: {
+        uint32_t shiftAmt = C2V.getZExtValue();
+        if (shiftAmt < C1V.getBitWidth())
+          return ConstantInt::get(CI1->getContext(), C1V.shl(shiftAmt));
+        else
+          return UndefValue::get(C1->getType()); // too big shift is undef
+      }
+      case Instruction::LShr: {
+        uint32_t shiftAmt = C2V.getZExtValue();
+        if (shiftAmt < C1V.getBitWidth())
+          return ConstantInt::get(CI1->getContext(), C1V.lshr(shiftAmt));
+        else
+          return UndefValue::get(C1->getType()); // too big shift is undef
+      }
+      case Instruction::AShr: {
+        uint32_t shiftAmt = C2V.getZExtValue();
+        if (shiftAmt < C1V.getBitWidth())
+          return ConstantInt::get(CI1->getContext(), C1V.ashr(shiftAmt));
+        else
+          return UndefValue::get(C1->getType()); // too big shift is undef
+      }
+      }
+    }
+
+    switch (Opcode) {
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::Shl:
+      if (CI1->equalsInt(0)) return C1;
+      break;
+    default:
+      break;
+    }
+  } else if (ConstantFP *CFP1 = dyn_cast<ConstantFP>(C1)) {
+    if (ConstantFP *CFP2 = dyn_cast<ConstantFP>(C2)) {
+      APFloat C1V = CFP1->getValueAPF();
+      APFloat C2V = CFP2->getValueAPF();
+      APFloat C3V = C1V;  // copy for modification
+      switch (Opcode) {
+      default:                   
+        break;
+      case Instruction::FAdd:
+        (void)C3V.add(C2V, APFloat::rmNearestTiesToEven);
+        return ConstantFP::get(C1->getContext(), C3V);
+      case Instruction::FSub:
+        (void)C3V.subtract(C2V, APFloat::rmNearestTiesToEven);
+        return ConstantFP::get(C1->getContext(), C3V);
+      case Instruction::FMul:
+        (void)C3V.multiply(C2V, APFloat::rmNearestTiesToEven);
+        return ConstantFP::get(C1->getContext(), C3V);
+      case Instruction::FDiv:
+        (void)C3V.divide(C2V, APFloat::rmNearestTiesToEven);
+        return ConstantFP::get(C1->getContext(), C3V);
+      case Instruction::FRem:
+        (void)C3V.mod(C2V, APFloat::rmNearestTiesToEven);
+        return ConstantFP::get(C1->getContext(), C3V);
+      }
+    }
+  } else if (VectorType *VTy = dyn_cast<VectorType>(C1->getType())) {
+    // Perform elementwise folding.
+    SmallVector<Constant*, 16> Result;
+    Type *Ty = IntegerType::get(VTy->getContext(), 32);
+    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
+      Constant *LHS =
+        ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
+      Constant *RHS =
+        ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));
+      
+      Result.push_back(ConstantExpr::get(Opcode, LHS, RHS));
+    }
+    
+    return ConstantVector::get(Result);
+  }
+
+  if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
+    // There are many possible foldings we could do here.  We should probably
+    // at least fold add of a pointer with an integer into the appropriate
+    // getelementptr.  This will improve alias analysis a bit.
+
+    // Given ((a + b) + c), if (b + c) folds to something interesting, return
+    // (a + (b + c)).
+    if (Instruction::isAssociative(Opcode) && CE1->getOpcode() == Opcode) {
+      Constant *T = ConstantExpr::get(Opcode, CE1->getOperand(1), C2);
+      if (!isa<ConstantExpr>(T) || cast<ConstantExpr>(T)->getOpcode() != Opcode)
+        return ConstantExpr::get(Opcode, CE1->getOperand(0), T);
+    }
+  } else if (isa<ConstantExpr>(C2)) {
+    // If C2 is a constant expr and C1 isn't, flop them around and fold the
+    // other way if possible.
+    if (Instruction::isCommutative(Opcode))
+      return ConstantFoldBinaryInstruction(Opcode, C2, C1);
+  }
+
+  // i1 can be simplified in many cases.
+  if (C1->getType()->isIntegerTy(1)) {
+    switch (Opcode) {
+    case Instruction::Add:
+    case Instruction::Sub:
+      return ConstantExpr::getXor(C1, C2);
+    case Instruction::Mul:
+      return ConstantExpr::getAnd(C1, C2);
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+      // We can assume that C2 == 0.  If it were one the result would be
+      // undefined because the shift value is as large as the bitwidth.
+      return C1;
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+      // We can assume that C2 == 1.  If it were zero the result would be
+      // undefined through division by zero.
+      return C1;
+    case Instruction::URem:
+    case Instruction::SRem:
+      // We can assume that C2 == 1.  If it were zero the result would be
+      // undefined through division by zero.
+      return ConstantInt::getFalse(C1->getContext());
+    default:
+      break;
+    }
+  }
+
+  // We don't know how to fold this.
+  return 0;
+}
+
+/// isZeroSizedType - This type is zero sized if its an array or structure of
+/// zero sized types.  The only leaf zero sized type is an empty structure.
+static bool isMaybeZeroSizedType(Type *Ty) {
+  if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    if (STy->isOpaque()) return true;  // Can't say.
+
+    // If all of elements have zero size, this does too.
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+      if (!isMaybeZeroSizedType(STy->getElementType(i))) return false;
+    return true;
+
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    return isMaybeZeroSizedType(ATy->getElementType());
+  }
+  return false;
+}
+
+/// IdxCompare - Compare the two constants as though they were getelementptr
+/// indices.  This allows coersion of the types to be the same thing.
+///
+/// If the two constants are the "same" (after coersion), return 0.  If the
+/// first is less than the second, return -1, if the second is less than the
+/// first, return 1.  If the constants are not integral, return -2.
+///
+static int IdxCompare(Constant *C1, Constant *C2, Type *ElTy) {
+  if (C1 == C2) return 0;
+
+  // Ok, we found a different index.  If they are not ConstantInt, we can't do
+  // anything with them.
+  if (!isa<ConstantInt>(C1) || !isa<ConstantInt>(C2))
+    return -2; // don't know!
+
+  // Ok, we have two differing integer indices.  Sign extend them to be the same
+  // type.  Long is always big enough, so we use it.
+  if (!C1->getType()->isIntegerTy(64))
+    C1 = ConstantExpr::getSExt(C1, Type::getInt64Ty(C1->getContext()));
+
+  if (!C2->getType()->isIntegerTy(64))
+    C2 = ConstantExpr::getSExt(C2, Type::getInt64Ty(C1->getContext()));
+
+  if (C1 == C2) return 0;  // They are equal
+
+  // If the type being indexed over is really just a zero sized type, there is
+  // no pointer difference being made here.
+  if (isMaybeZeroSizedType(ElTy))
+    return -2; // dunno.
+
+  // If they are really different, now that they are the same type, then we
+  // found a difference!
+  if (cast<ConstantInt>(C1)->getSExtValue() < 
+      cast<ConstantInt>(C2)->getSExtValue())
+    return -1;
+  else
+    return 1;
+}
+
+/// evaluateFCmpRelation - This function determines if there is anything we can
+/// decide about the two constants provided.  This doesn't need to handle simple
+/// things like ConstantFP comparisons, but should instead handle ConstantExprs.
+/// If we can determine that the two constants have a particular relation to 
+/// each other, we should return the corresponding FCmpInst predicate, 
+/// otherwise return FCmpInst::BAD_FCMP_PREDICATE. This is used below in
+/// ConstantFoldCompareInstruction.
+///
+/// To simplify this code we canonicalize the relation so that the first
+/// operand is always the most "complex" of the two.  We consider ConstantFP
+/// to be the simplest, and ConstantExprs to be the most complex.
+static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) {
+  assert(V1->getType() == V2->getType() &&
+         "Cannot compare values of different types!");
+
+  // Handle degenerate case quickly
+  if (V1 == V2) return FCmpInst::FCMP_OEQ;
+
+  if (!isa<ConstantExpr>(V1)) {
+    if (!isa<ConstantExpr>(V2)) {
+      // We distilled thisUse the standard constant folder for a few cases
+      ConstantInt *R = 0;
+      R = dyn_cast<ConstantInt>(
+                      ConstantExpr::getFCmp(FCmpInst::FCMP_OEQ, V1, V2));
+      if (R && !R->isZero()) 
+        return FCmpInst::FCMP_OEQ;
+      R = dyn_cast<ConstantInt>(
+                      ConstantExpr::getFCmp(FCmpInst::FCMP_OLT, V1, V2));
+      if (R && !R->isZero()) 
+        return FCmpInst::FCMP_OLT;
+      R = dyn_cast<ConstantInt>(
+                      ConstantExpr::getFCmp(FCmpInst::FCMP_OGT, V1, V2));
+      if (R && !R->isZero()) 
+        return FCmpInst::FCMP_OGT;
+
+      // Nothing more we can do
+      return FCmpInst::BAD_FCMP_PREDICATE;
+    }
+
+    // If the first operand is simple and second is ConstantExpr, swap operands.
+    FCmpInst::Predicate SwappedRelation = evaluateFCmpRelation(V2, V1);
+    if (SwappedRelation != FCmpInst::BAD_FCMP_PREDICATE)
+      return FCmpInst::getSwappedPredicate(SwappedRelation);
+  } else {
+    // Ok, the LHS is known to be a constantexpr.  The RHS can be any of a
+    // constantexpr or a simple constant.
+    ConstantExpr *CE1 = cast<ConstantExpr>(V1);
+    switch (CE1->getOpcode()) {
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+      // We might be able to do something with these but we don't right now.
+      break;
+    default:
+      break;
+    }
+  }
+  // There are MANY other foldings that we could perform here.  They will
+  // probably be added on demand, as they seem needed.
+  return FCmpInst::BAD_FCMP_PREDICATE;
+}
+
+/// evaluateICmpRelation - This function determines if there is anything we can
+/// decide about the two constants provided.  This doesn't need to handle simple
+/// things like integer comparisons, but should instead handle ConstantExprs
+/// and GlobalValues.  If we can determine that the two constants have a
+/// particular relation to each other, we should return the corresponding ICmp
+/// predicate, otherwise return ICmpInst::BAD_ICMP_PREDICATE.
+///
+/// To simplify this code we canonicalize the relation so that the first
+/// operand is always the most "complex" of the two.  We consider simple
+/// constants (like ConstantInt) to be the simplest, followed by
+/// GlobalValues, followed by ConstantExpr's (the most complex).
+///
+static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
+                                                bool isSigned) {
+  assert(V1->getType() == V2->getType() &&
+         "Cannot compare different types of values!");
+  if (V1 == V2) return ICmpInst::ICMP_EQ;
+
+  if (!isa<ConstantExpr>(V1) && !isa<GlobalValue>(V1) &&
+      !isa<BlockAddress>(V1)) {
+    if (!isa<GlobalValue>(V2) && !isa<ConstantExpr>(V2) &&
+        !isa<BlockAddress>(V2)) {
+      // We distilled this down to a simple case, use the standard constant
+      // folder.
+      ConstantInt *R = 0;
+      ICmpInst::Predicate pred = ICmpInst::ICMP_EQ;
+      R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
+      if (R && !R->isZero()) 
+        return pred;
+      pred = isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+      R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
+      if (R && !R->isZero())
+        return pred;
+      pred = isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+      R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
+      if (R && !R->isZero())
+        return pred;
+
+      // If we couldn't figure it out, bail.
+      return ICmpInst::BAD_ICMP_PREDICATE;
+    }
+
+    // If the first operand is simple, swap operands.
+    ICmpInst::Predicate SwappedRelation = 
+      evaluateICmpRelation(V2, V1, isSigned);
+    if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
+      return ICmpInst::getSwappedPredicate(SwappedRelation);
+
+  } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(V1)) {
+    if (isa<ConstantExpr>(V2)) {  // Swap as necessary.
+      ICmpInst::Predicate SwappedRelation = 
+        evaluateICmpRelation(V2, V1, isSigned);
+      if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
+        return ICmpInst::getSwappedPredicate(SwappedRelation);
+      return ICmpInst::BAD_ICMP_PREDICATE;
+    }
+
+    // Now we know that the RHS is a GlobalValue, BlockAddress or simple
+    // constant (which, since the types must match, means that it's a
+    // ConstantPointerNull).
+    if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2)) {
+      // Don't try to decide equality of aliases.
+      if (!isa<GlobalAlias>(GV) && !isa<GlobalAlias>(GV2))
+        if (!GV->hasExternalWeakLinkage() || !GV2->hasExternalWeakLinkage())
+          return ICmpInst::ICMP_NE;
+    } else if (isa<BlockAddress>(V2)) {
+      return ICmpInst::ICMP_NE; // Globals never equal labels.
+    } else {
+      assert(isa<ConstantPointerNull>(V2) && "Canonicalization guarantee!");
+      // GlobalVals can never be null unless they have external weak linkage.
+      // We don't try to evaluate aliases here.
+      if (!GV->hasExternalWeakLinkage() && !isa<GlobalAlias>(GV))
+        return ICmpInst::ICMP_NE;
+    }
+  } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(V1)) {
+    if (isa<ConstantExpr>(V2)) {  // Swap as necessary.
+      ICmpInst::Predicate SwappedRelation = 
+        evaluateICmpRelation(V2, V1, isSigned);
+      if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
+        return ICmpInst::getSwappedPredicate(SwappedRelation);
+      return ICmpInst::BAD_ICMP_PREDICATE;
+    }
+    
+    // Now we know that the RHS is a GlobalValue, BlockAddress or simple
+    // constant (which, since the types must match, means that it is a
+    // ConstantPointerNull).
+    if (const BlockAddress *BA2 = dyn_cast<BlockAddress>(V2)) {
+      // Block address in another function can't equal this one, but block
+      // addresses in the current function might be the same if blocks are
+      // empty.
+      if (BA2->getFunction() != BA->getFunction())
+        return ICmpInst::ICMP_NE;
+    } else {
+      // Block addresses aren't null, don't equal the address of globals.
+      assert((isa<ConstantPointerNull>(V2) || isa<GlobalValue>(V2)) &&
+             "Canonicalization guarantee!");
+      return ICmpInst::ICMP_NE;
+    }
+  } else {
+    // Ok, the LHS is known to be a constantexpr.  The RHS can be any of a
+    // constantexpr, a global, block address, or a simple constant.
+    ConstantExpr *CE1 = cast<ConstantExpr>(V1);
+    Constant *CE1Op0 = CE1->getOperand(0);
+
+    switch (CE1->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+      break; // We can't evaluate floating point casts or truncations.
+
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::BitCast:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      // If the cast is not actually changing bits, and the second operand is a
+      // null pointer, do the comparison with the pre-casted value.
+      if (V2->isNullValue() &&
+          (CE1->getType()->isPointerTy() || CE1->getType()->isIntegerTy())) {
+        if (CE1->getOpcode() == Instruction::ZExt) isSigned = false;
+        if (CE1->getOpcode() == Instruction::SExt) isSigned = true;
+        return evaluateICmpRelation(CE1Op0,
+                                    Constant::getNullValue(CE1Op0->getType()), 
+                                    isSigned);
+      }
+      break;
+
+    case Instruction::GetElementPtr:
+      // Ok, since this is a getelementptr, we know that the constant has a
+      // pointer type.  Check the various cases.
+      if (isa<ConstantPointerNull>(V2)) {
+        // If we are comparing a GEP to a null pointer, check to see if the base
+        // of the GEP equals the null pointer.
+        if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) {
+          if (GV->hasExternalWeakLinkage())
+            // Weak linkage GVals could be zero or not. We're comparing that
+            // to null pointer so its greater-or-equal
+            return isSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
+          else 
+            // If its not weak linkage, the GVal must have a non-zero address
+            // so the result is greater-than
+            return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+        } else if (isa<ConstantPointerNull>(CE1Op0)) {
+          // If we are indexing from a null pointer, check to see if we have any
+          // non-zero indices.
+          for (unsigned i = 1, e = CE1->getNumOperands(); i != e; ++i)
+            if (!CE1->getOperand(i)->isNullValue())
+              // Offsetting from null, must not be equal.
+              return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+          // Only zero indexes from null, must still be zero.
+          return ICmpInst::ICMP_EQ;
+        }
+        // Otherwise, we can't really say if the first operand is null or not.
+      } else if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2)) {
+        if (isa<ConstantPointerNull>(CE1Op0)) {
+          if (GV2->hasExternalWeakLinkage())
+            // Weak linkage GVals could be zero or not. We're comparing it to
+            // a null pointer, so its less-or-equal
+            return isSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
+          else
+            // If its not weak linkage, the GVal must have a non-zero address
+            // so the result is less-than
+            return isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+        } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) {
+          if (GV == GV2) {
+            // If this is a getelementptr of the same global, then it must be
+            // different.  Because the types must match, the getelementptr could
+            // only have at most one index, and because we fold getelementptr's
+            // with a single zero index, it must be nonzero.
+            assert(CE1->getNumOperands() == 2 &&
+                   !CE1->getOperand(1)->isNullValue() &&
+                   "Surprising getelementptr!");
+            return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+          } else {
+            // If they are different globals, we don't know what the value is.
+            return ICmpInst::BAD_ICMP_PREDICATE;
+          }
+        }
+      } else {
+        ConstantExpr *CE2 = cast<ConstantExpr>(V2);
+        Constant *CE2Op0 = CE2->getOperand(0);
+
+        // There are MANY other foldings that we could perform here.  They will
+        // probably be added on demand, as they seem needed.
+        switch (CE2->getOpcode()) {
+        default: break;
+        case Instruction::GetElementPtr:
+          // By far the most common case to handle is when the base pointers are
+          // obviously to the same global.
+          if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) {
+            if (CE1Op0 != CE2Op0) // Don't know relative ordering.
+              return ICmpInst::BAD_ICMP_PREDICATE;
+            // Ok, we know that both getelementptr instructions are based on the
+            // same global.  From this, we can precisely determine the relative
+            // ordering of the resultant pointers.
+            unsigned i = 1;
+
+            // The logic below assumes that the result of the comparison
+            // can be determined by finding the first index that differs.
+            // This doesn't work if there is over-indexing in any
+            // subsequent indices, so check for that case first.
+            if (!CE1->isGEPWithNoNotionalOverIndexing() ||
+                !CE2->isGEPWithNoNotionalOverIndexing())
+               return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
+
+            // Compare all of the operands the GEP's have in common.
+            gep_type_iterator GTI = gep_type_begin(CE1);
+            for (;i != CE1->getNumOperands() && i != CE2->getNumOperands();
+                 ++i, ++GTI)
+              switch (IdxCompare(CE1->getOperand(i),
+                                 CE2->getOperand(i), GTI.getIndexedType())) {
+              case -1: return isSigned ? ICmpInst::ICMP_SLT:ICmpInst::ICMP_ULT;
+              case 1:  return isSigned ? ICmpInst::ICMP_SGT:ICmpInst::ICMP_UGT;
+              case -2: return ICmpInst::BAD_ICMP_PREDICATE;
+              }
+
+            // Ok, we ran out of things they have in common.  If any leftovers
+            // are non-zero then we have a difference, otherwise we are equal.
+            for (; i < CE1->getNumOperands(); ++i)
+              if (!CE1->getOperand(i)->isNullValue()) {
+                if (isa<ConstantInt>(CE1->getOperand(i)))
+                  return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+                else
+                  return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
+              }
+
+            for (; i < CE2->getNumOperands(); ++i)
+              if (!CE2->getOperand(i)->isNullValue()) {
+                if (isa<ConstantInt>(CE2->getOperand(i)))
+                  return isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+                else
+                  return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
+              }
+            return ICmpInst::ICMP_EQ;
+          }
+        }
+      }
+    default:
+      break;
+    }
+  }
+
+  return ICmpInst::BAD_ICMP_PREDICATE;
+}
+
+Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, 
+                                               Constant *C1, Constant *C2) {
+  Type *ResultTy;
+  if (VectorType *VT = dyn_cast<VectorType>(C1->getType()))
+    ResultTy = VectorType::get(Type::getInt1Ty(C1->getContext()),
+                               VT->getNumElements());
+  else
+    ResultTy = Type::getInt1Ty(C1->getContext());
+
+  // Fold FCMP_FALSE/FCMP_TRUE unconditionally.
+  if (pred == FCmpInst::FCMP_FALSE)
+    return Constant::getNullValue(ResultTy);
+
+  if (pred == FCmpInst::FCMP_TRUE)
+    return Constant::getAllOnesValue(ResultTy);
+
+  // Handle some degenerate cases first
+  if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) {
+    // For EQ and NE, we can always pick a value for the undef to make the
+    // predicate pass or fail, so we can return undef.
+    // Also, if both operands are undef, we can return undef.
+    if (ICmpInst::isEquality(ICmpInst::Predicate(pred)) ||
+        (isa<UndefValue>(C1) && isa<UndefValue>(C2)))
+      return UndefValue::get(ResultTy);
+    // Otherwise, pick the same value as the non-undef operand, and fold
+    // it to true or false.
+    return ConstantInt::get(ResultTy, CmpInst::isTrueWhenEqual(pred));
+  }
+
+  // icmp eq/ne(null,GV) -> false/true
+  if (C1->isNullValue()) {
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C2))
+      // Don't try to evaluate aliases.  External weak GV can be null.
+      if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage()) {
+        if (pred == ICmpInst::ICMP_EQ)
+          return ConstantInt::getFalse(C1->getContext());
+        else if (pred == ICmpInst::ICMP_NE)
+          return ConstantInt::getTrue(C1->getContext());
+      }
+  // icmp eq/ne(GV,null) -> false/true
+  } else if (C2->isNullValue()) {
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C1))
+      // Don't try to evaluate aliases.  External weak GV can be null.
+      if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage()) {
+        if (pred == ICmpInst::ICMP_EQ)
+          return ConstantInt::getFalse(C1->getContext());
+        else if (pred == ICmpInst::ICMP_NE)
+          return ConstantInt::getTrue(C1->getContext());
+      }
+  }
+
+  // If the comparison is a comparison between two i1's, simplify it.
+  if (C1->getType()->isIntegerTy(1)) {
+    switch(pred) {
+    case ICmpInst::ICMP_EQ:
+      if (isa<ConstantInt>(C2))
+        return ConstantExpr::getXor(C1, ConstantExpr::getNot(C2));
+      return ConstantExpr::getXor(ConstantExpr::getNot(C1), C2);
+    case ICmpInst::ICMP_NE:
+      return ConstantExpr::getXor(C1, C2);
+    default:
+      break;
+    }
+  }
+
+  if (isa<ConstantInt>(C1) && isa<ConstantInt>(C2)) {
+    APInt V1 = cast<ConstantInt>(C1)->getValue();
+    APInt V2 = cast<ConstantInt>(C2)->getValue();
+    switch (pred) {
+    default: llvm_unreachable("Invalid ICmp Predicate");
+    case ICmpInst::ICMP_EQ:  return ConstantInt::get(ResultTy, V1 == V2);
+    case ICmpInst::ICMP_NE:  return ConstantInt::get(ResultTy, V1 != V2);
+    case ICmpInst::ICMP_SLT: return ConstantInt::get(ResultTy, V1.slt(V2));
+    case ICmpInst::ICMP_SGT: return ConstantInt::get(ResultTy, V1.sgt(V2));
+    case ICmpInst::ICMP_SLE: return ConstantInt::get(ResultTy, V1.sle(V2));
+    case ICmpInst::ICMP_SGE: return ConstantInt::get(ResultTy, V1.sge(V2));
+    case ICmpInst::ICMP_ULT: return ConstantInt::get(ResultTy, V1.ult(V2));
+    case ICmpInst::ICMP_UGT: return ConstantInt::get(ResultTy, V1.ugt(V2));
+    case ICmpInst::ICMP_ULE: return ConstantInt::get(ResultTy, V1.ule(V2));
+    case ICmpInst::ICMP_UGE: return ConstantInt::get(ResultTy, V1.uge(V2));
+    }
+  } else if (isa<ConstantFP>(C1) && isa<ConstantFP>(C2)) {
+    APFloat C1V = cast<ConstantFP>(C1)->getValueAPF();
+    APFloat C2V = cast<ConstantFP>(C2)->getValueAPF();
+    APFloat::cmpResult R = C1V.compare(C2V);
+    switch (pred) {
+    default: llvm_unreachable("Invalid FCmp Predicate");
+    case FCmpInst::FCMP_FALSE: return Constant::getNullValue(ResultTy);
+    case FCmpInst::FCMP_TRUE:  return Constant::getAllOnesValue(ResultTy);
+    case FCmpInst::FCMP_UNO:
+      return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered);
+    case FCmpInst::FCMP_ORD:
+      return ConstantInt::get(ResultTy, R!=APFloat::cmpUnordered);
+    case FCmpInst::FCMP_UEQ:
+      return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered ||
+                                        R==APFloat::cmpEqual);
+    case FCmpInst::FCMP_OEQ:   
+      return ConstantInt::get(ResultTy, R==APFloat::cmpEqual);
+    case FCmpInst::FCMP_UNE:
+      return ConstantInt::get(ResultTy, R!=APFloat::cmpEqual);
+    case FCmpInst::FCMP_ONE:   
+      return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan ||
+                                        R==APFloat::cmpGreaterThan);
+    case FCmpInst::FCMP_ULT: 
+      return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered ||
+                                        R==APFloat::cmpLessThan);
+    case FCmpInst::FCMP_OLT:   
+      return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan);
+    case FCmpInst::FCMP_UGT:
+      return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered ||
+                                        R==APFloat::cmpGreaterThan);
+    case FCmpInst::FCMP_OGT:
+      return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan);
+    case FCmpInst::FCMP_ULE:
+      return ConstantInt::get(ResultTy, R!=APFloat::cmpGreaterThan);
+    case FCmpInst::FCMP_OLE: 
+      return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan ||
+                                        R==APFloat::cmpEqual);
+    case FCmpInst::FCMP_UGE:
+      return ConstantInt::get(ResultTy, R!=APFloat::cmpLessThan);
+    case FCmpInst::FCMP_OGE: 
+      return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan ||
+                                        R==APFloat::cmpEqual);
+    }
+  } else if (C1->getType()->isVectorTy()) {
+    // If we can constant fold the comparison of each element, constant fold
+    // the whole vector comparison.
+    SmallVector<Constant*, 4> ResElts;
+    Type *Ty = IntegerType::get(C1->getContext(), 32);
+    // Compare the elements, producing an i1 result or constant expr.
+    for (unsigned i = 0, e = C1->getType()->getVectorNumElements(); i != e;++i){
+      Constant *C1E =
+        ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
+      Constant *C2E =
+        ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));
+      
+      ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E));
+    }
+    
+    return ConstantVector::get(ResElts);
+  }
+
+  if (C1->getType()->isFloatingPointTy()) {
+    int Result = -1;  // -1 = unknown, 0 = known false, 1 = known true.
+    switch (evaluateFCmpRelation(C1, C2)) {
+    default: llvm_unreachable("Unknown relation!");
+    case FCmpInst::FCMP_UNO:
+    case FCmpInst::FCMP_ORD:
+    case FCmpInst::FCMP_UEQ:
+    case FCmpInst::FCMP_UNE:
+    case FCmpInst::FCMP_ULT:
+    case FCmpInst::FCMP_UGT:
+    case FCmpInst::FCMP_ULE:
+    case FCmpInst::FCMP_UGE:
+    case FCmpInst::FCMP_TRUE:
+    case FCmpInst::FCMP_FALSE:
+    case FCmpInst::BAD_FCMP_PREDICATE:
+      break; // Couldn't determine anything about these constants.
+    case FCmpInst::FCMP_OEQ: // We know that C1 == C2
+      Result = (pred == FCmpInst::FCMP_UEQ || pred == FCmpInst::FCMP_OEQ ||
+                pred == FCmpInst::FCMP_ULE || pred == FCmpInst::FCMP_OLE ||
+                pred == FCmpInst::FCMP_UGE || pred == FCmpInst::FCMP_OGE);
+      break;
+    case FCmpInst::FCMP_OLT: // We know that C1 < C2
+      Result = (pred == FCmpInst::FCMP_UNE || pred == FCmpInst::FCMP_ONE ||
+                pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT ||
+                pred == FCmpInst::FCMP_ULE || pred == FCmpInst::FCMP_OLE);
+      break;
+    case FCmpInst::FCMP_OGT: // We know that C1 > C2
+      Result = (pred == FCmpInst::FCMP_UNE || pred == FCmpInst::FCMP_ONE ||
+                pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT ||
+                pred == FCmpInst::FCMP_UGE || pred == FCmpInst::FCMP_OGE);
+      break;
+    case FCmpInst::FCMP_OLE: // We know that C1 <= C2
+      // We can only partially decide this relation.
+      if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT) 
+        Result = 0;
+      else if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT) 
+        Result = 1;
+      break;
+    case FCmpInst::FCMP_OGE: // We known that C1 >= C2
+      // We can only partially decide this relation.
+      if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT) 
+        Result = 0;
+      else if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT) 
+        Result = 1;
+      break;
+    case FCmpInst::FCMP_ONE: // We know that C1 != C2
+      // We can only partially decide this relation.
+      if (pred == FCmpInst::FCMP_OEQ || pred == FCmpInst::FCMP_UEQ) 
+        Result = 0;
+      else if (pred == FCmpInst::FCMP_ONE || pred == FCmpInst::FCMP_UNE) 
+        Result = 1;
+      break;
+    }
+
+    // If we evaluated the result, return it now.
+    if (Result != -1)
+      return ConstantInt::get(ResultTy, Result);
+
+  } else {
+    // Evaluate the relation between the two constants, per the predicate.
+    int Result = -1;  // -1 = unknown, 0 = known false, 1 = known true.
+    switch (evaluateICmpRelation(C1, C2, CmpInst::isSigned(pred))) {
+    default: llvm_unreachable("Unknown relational!");
+    case ICmpInst::BAD_ICMP_PREDICATE:
+      break;  // Couldn't determine anything about these constants.
+    case ICmpInst::ICMP_EQ:   // We know the constants are equal!
+      // If we know the constants are equal, we can decide the result of this
+      // computation precisely.
+      Result = ICmpInst::isTrueWhenEqual((ICmpInst::Predicate)pred);
+      break;
+    case ICmpInst::ICMP_ULT:
+      switch (pred) {
+      case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULE:
+        Result = 1; break;
+      case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGE:
+        Result = 0; break;
+      }
+      break;
+    case ICmpInst::ICMP_SLT:
+      switch (pred) {
+      case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_SLE:
+        Result = 1; break;
+      case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_SGE:
+        Result = 0; break;
+      }
+      break;
+    case ICmpInst::ICMP_UGT:
+      switch (pred) {
+      case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGE:
+        Result = 1; break;
+      case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE:
+        Result = 0; break;
+      }
+      break;
+    case ICmpInst::ICMP_SGT:
+      switch (pred) {
+      case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_SGE:
+        Result = 1; break;
+      case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_SLE:
+        Result = 0; break;
+      }
+      break;
+    case ICmpInst::ICMP_ULE:
+      if (pred == ICmpInst::ICMP_UGT) Result = 0;
+      if (pred == ICmpInst::ICMP_ULT || pred == ICmpInst::ICMP_ULE) Result = 1;
+      break;
+    case ICmpInst::ICMP_SLE:
+      if (pred == ICmpInst::ICMP_SGT) Result = 0;
+      if (pred == ICmpInst::ICMP_SLT || pred == ICmpInst::ICMP_SLE) Result = 1;
+      break;
+    case ICmpInst::ICMP_UGE:
+      if (pred == ICmpInst::ICMP_ULT) Result = 0;
+      if (pred == ICmpInst::ICMP_UGT || pred == ICmpInst::ICMP_UGE) Result = 1;
+      break;
+    case ICmpInst::ICMP_SGE:
+      if (pred == ICmpInst::ICMP_SLT) Result = 0;
+      if (pred == ICmpInst::ICMP_SGT || pred == ICmpInst::ICMP_SGE) Result = 1;
+      break;
+    case ICmpInst::ICMP_NE:
+      if (pred == ICmpInst::ICMP_EQ) Result = 0;
+      if (pred == ICmpInst::ICMP_NE) Result = 1;
+      break;
+    }
+
+    // If we evaluated the result, return it now.
+    if (Result != -1)
+      return ConstantInt::get(ResultTy, Result);
+
+    // If the right hand side is a bitcast, try using its inverse to simplify
+    // it by moving it to the left hand side.  We can't do this if it would turn
+    // a vector compare into a scalar compare or visa versa.
+    if (ConstantExpr *CE2 = dyn_cast<ConstantExpr>(C2)) {
+      Constant *CE2Op0 = CE2->getOperand(0);
+      if (CE2->getOpcode() == Instruction::BitCast &&
+          CE2->getType()->isVectorTy() == CE2Op0->getType()->isVectorTy()) {
+        Constant *Inverse = ConstantExpr::getBitCast(C1, CE2Op0->getType());
+        return ConstantExpr::getICmp(pred, Inverse, CE2Op0);
+      }
+    }
+
+    // If the left hand side is an extension, try eliminating it.
+    if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
+      if ((CE1->getOpcode() == Instruction::SExt && ICmpInst::isSigned(pred)) ||
+          (CE1->getOpcode() == Instruction::ZExt && !ICmpInst::isSigned(pred))){
+        Constant *CE1Op0 = CE1->getOperand(0);
+        Constant *CE1Inverse = ConstantExpr::getTrunc(CE1, CE1Op0->getType());
+        if (CE1Inverse == CE1Op0) {
+          // Check whether we can safely truncate the right hand side.
+          Constant *C2Inverse = ConstantExpr::getTrunc(C2, CE1Op0->getType());
+          if (ConstantExpr::getZExt(C2Inverse, C2->getType()) == C2) {
+            return ConstantExpr::getICmp(pred, CE1Inverse, C2Inverse);
+          }
+        }
+      }
+    }
+
+    if ((!isa<ConstantExpr>(C1) && isa<ConstantExpr>(C2)) ||
+        (C1->isNullValue() && !C2->isNullValue())) {
+      // If C2 is a constant expr and C1 isn't, flip them around and fold the
+      // other way if possible.
+      // Also, if C1 is null and C2 isn't, flip them around.
+      pred = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)pred);
+      return ConstantExpr::getICmp(pred, C2, C1);
+    }
+  }
+  return 0;
+}
+
+/// isInBoundsIndices - Test whether the given sequence of *normalized* indices
+/// is "inbounds".
+template<typename IndexTy>
+static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
+  // No indices means nothing that could be out of bounds.
+  if (Idxs.empty()) return true;
+
+  // If the first index is zero, it's in bounds.
+  if (cast<Constant>(Idxs[0])->isNullValue()) return true;
+
+  // If the first index is one and all the rest are zero, it's in bounds,
+  // by the one-past-the-end rule.
+  if (!cast<ConstantInt>(Idxs[0])->isOne())
+    return false;
+  for (unsigned i = 1, e = Idxs.size(); i != e; ++i)
+    if (!cast<Constant>(Idxs[i])->isNullValue())
+      return false;
+  return true;
+}
+
+template<typename IndexTy>
+static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
+                                               bool inBounds,
+                                               ArrayRef<IndexTy> Idxs) {
+  if (Idxs.empty()) return C;
+  Constant *Idx0 = cast<Constant>(Idxs[0]);
+  if ((Idxs.size() == 1 && Idx0->isNullValue()))
+    return C;
+
+  if (isa<UndefValue>(C)) {
+    PointerType *Ptr = cast<PointerType>(C->getType());
+    Type *Ty = GetElementPtrInst::getIndexedType(Ptr, Idxs);
+    assert(Ty != 0 && "Invalid indices for GEP!");
+    return UndefValue::get(PointerType::get(Ty, Ptr->getAddressSpace()));
+  }
+
+  if (C->isNullValue()) {
+    bool isNull = true;
+    for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
+      if (!cast<Constant>(Idxs[i])->isNullValue()) {
+        isNull = false;
+        break;
+      }
+    if (isNull) {
+      PointerType *Ptr = cast<PointerType>(C->getType());
+      Type *Ty = GetElementPtrInst::getIndexedType(Ptr, Idxs);
+      assert(Ty != 0 && "Invalid indices for GEP!");
+      return ConstantPointerNull::get(PointerType::get(Ty,
+                                                       Ptr->getAddressSpace()));
+    }
+  }
+
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+    // Combine Indices - If the source pointer to this getelementptr instruction
+    // is a getelementptr instruction, combine the indices of the two
+    // getelementptr instructions into a single instruction.
+    //
+    if (CE->getOpcode() == Instruction::GetElementPtr) {
+      Type *LastTy = 0;
+      for (gep_type_iterator I = gep_type_begin(CE), E = gep_type_end(CE);
+           I != E; ++I)
+        LastTy = *I;
+
+      if ((LastTy && isa<SequentialType>(LastTy)) || Idx0->isNullValue()) {
+        SmallVector<Value*, 16> NewIndices;
+        NewIndices.reserve(Idxs.size() + CE->getNumOperands());
+        for (unsigned i = 1, e = CE->getNumOperands()-1; i != e; ++i)
+          NewIndices.push_back(CE->getOperand(i));
+
+        // Add the last index of the source with the first index of the new GEP.
+        // Make sure to handle the case when they are actually different types.
+        Constant *Combined = CE->getOperand(CE->getNumOperands()-1);
+        // Otherwise it must be an array.
+        if (!Idx0->isNullValue()) {
+          Type *IdxTy = Combined->getType();
+          if (IdxTy != Idx0->getType()) {
+            Type *Int64Ty = Type::getInt64Ty(IdxTy->getContext());
+            Constant *C1 = ConstantExpr::getSExtOrBitCast(Idx0, Int64Ty);
+            Constant *C2 = ConstantExpr::getSExtOrBitCast(Combined, Int64Ty);
+            Combined = ConstantExpr::get(Instruction::Add, C1, C2);
+          } else {
+            Combined =
+              ConstantExpr::get(Instruction::Add, Idx0, Combined);
+          }
+        }
+
+        NewIndices.push_back(Combined);
+        NewIndices.append(Idxs.begin() + 1, Idxs.end());
+        return
+          ConstantExpr::getGetElementPtr(CE->getOperand(0), NewIndices,
+                                         inBounds &&
+                                           cast<GEPOperator>(CE)->isInBounds());
+      }
+    }
+
+    // Attempt to fold casts to the same type away.  For example, folding:
+    //
+    //   i32* getelementptr ([2 x i32]* bitcast ([3 x i32]* %X to [2 x i32]*),
+    //                       i64 0, i64 0)
+    // into:
+    //
+    //   i32* getelementptr ([3 x i32]* %X, i64 0, i64 0)
+    //
+    // Don't fold if the cast is changing address spaces.
+    if (CE->isCast() && Idxs.size() > 1 && Idx0->isNullValue()) {
+      PointerType *SrcPtrTy =
+        dyn_cast<PointerType>(CE->getOperand(0)->getType());
+      PointerType *DstPtrTy = dyn_cast<PointerType>(CE->getType());
+      if (SrcPtrTy && DstPtrTy) {
+        ArrayType *SrcArrayTy =
+          dyn_cast<ArrayType>(SrcPtrTy->getElementType());
+        ArrayType *DstArrayTy =
+          dyn_cast<ArrayType>(DstPtrTy->getElementType());
+        if (SrcArrayTy && DstArrayTy
+            && SrcArrayTy->getElementType() == DstArrayTy->getElementType()
+            && SrcPtrTy->getAddressSpace() == DstPtrTy->getAddressSpace())
+          return ConstantExpr::getGetElementPtr((Constant*)CE->getOperand(0),
+                                                Idxs, inBounds);
+      }
+    }
+  }
+
+  // Check to see if any array indices are not within the corresponding
+  // notional array bounds. If so, try to determine if they can be factored
+  // out into preceding dimensions.
+  bool Unknown = false;
+  SmallVector<Constant *, 8> NewIdxs;
+  Type *Ty = C->getType();
+  Type *Prev = 0;
+  for (unsigned i = 0, e = Idxs.size(); i != e;
+       Prev = Ty, Ty = cast<CompositeType>(Ty)->getTypeAtIndex(Idxs[i]), ++i) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Idxs[i])) {
+      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+        if (ATy->getNumElements() <= INT64_MAX &&
+            ATy->getNumElements() != 0 &&
+            CI->getSExtValue() >= (int64_t)ATy->getNumElements()) {
+          if (isa<SequentialType>(Prev)) {
+            // It's out of range, but we can factor it into the prior
+            // dimension.
+            NewIdxs.resize(Idxs.size());
+            ConstantInt *Factor = ConstantInt::get(CI->getType(),
+                                                   ATy->getNumElements());
+            NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
+
+            Constant *PrevIdx = cast<Constant>(Idxs[i-1]);
+            Constant *Div = ConstantExpr::getSDiv(CI, Factor);
+
+            // Before adding, extend both operands to i64 to avoid
+            // overflow trouble.
+            if (!PrevIdx->getType()->isIntegerTy(64))
+              PrevIdx = ConstantExpr::getSExt(PrevIdx,
+                                           Type::getInt64Ty(Div->getContext()));
+            if (!Div->getType()->isIntegerTy(64))
+              Div = ConstantExpr::getSExt(Div,
+                                          Type::getInt64Ty(Div->getContext()));
+
+            NewIdxs[i-1] = ConstantExpr::getAdd(PrevIdx, Div);
+          } else {
+            // It's out of range, but the prior dimension is a struct
+            // so we can't do anything about it.
+            Unknown = true;
+          }
+        }
+    } else {
+      // We don't know if it's in range or not.
+      Unknown = true;
+    }
+  }
+
+  // If we did any factoring, start over with the adjusted indices.
+  if (!NewIdxs.empty()) {
+    for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
+      if (!NewIdxs[i]) NewIdxs[i] = cast<Constant>(Idxs[i]);
+    return ConstantExpr::getGetElementPtr(C, NewIdxs, inBounds);
+  }
+
+  // If all indices are known integers and normalized, we can do a simple
+  // check for the "inbounds" property.
+  if (!Unknown && !inBounds &&
+      isa<GlobalVariable>(C) && isInBoundsIndices(Idxs))
+    return ConstantExpr::getInBoundsGetElementPtr(C, Idxs);
+
+  return 0;
+}
+
+Constant *llvm::ConstantFoldGetElementPtr(Constant *C,
+                                          bool inBounds,
+                                          ArrayRef<Constant *> Idxs) {
+  return ConstantFoldGetElementPtrImpl(C, inBounds, Idxs);
+}
+
+Constant *llvm::ConstantFoldGetElementPtr(Constant *C,
+                                          bool inBounds,
+                                          ArrayRef<Value *> Idxs) {
+  return ConstantFoldGetElementPtrImpl(C, inBounds, Idxs);
+}
diff --git a/contrib/llvm/lib/IR/ConstantFold.h b/contrib/llvm/lib/IR/ConstantFold.h
new file mode 100644
index 000000000000..e12f27a7cb1e
--- /dev/null
+++ b/contrib/llvm/lib/IR/ConstantFold.h
@@ -0,0 +1,56 @@
+//===-- ConstantFolding.h - Internal Constant Folding Interface -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the (internal) constant folding interfaces for LLVM.  These
+// interfaces are used by the ConstantExpr::get* methods to automatically fold
+// constants when possible.
+//
+// These operators may return a null object if they don't know how to perform
+// the specified operation on the specified constant types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CONSTANTFOLDING_H
+#define CONSTANTFOLDING_H
+
+#include "llvm/ADT/ArrayRef.h"
+
+namespace llvm {
+  class Value;
+  class Constant;
+  class Type;
+
+  // Constant fold various types of instruction...
+  Constant *ConstantFoldCastInstruction(
+    unsigned opcode,     ///< The opcode of the cast
+    Constant *V,         ///< The source constant
+    Type *DestTy   ///< The destination type
+  );
+  Constant *ConstantFoldSelectInstruction(Constant *Cond,
+                                          Constant *V1, Constant *V2);
+  Constant *ConstantFoldExtractElementInstruction(Constant *Val, Constant *Idx);
+  Constant *ConstantFoldInsertElementInstruction(Constant *Val, Constant *Elt,
+                                                 Constant *Idx);
+  Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2,
+                                                 Constant *Mask);
+  Constant *ConstantFoldExtractValueInstruction(Constant *Agg,
+                                                ArrayRef<unsigned> Idxs);
+  Constant *ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val,
+                                               ArrayRef<unsigned> Idxs);
+  Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1,
+                                          Constant *V2);
+  Constant *ConstantFoldCompareInstruction(unsigned short predicate, 
+                                           Constant *C1, Constant *C2);
+  Constant *ConstantFoldGetElementPtr(Constant *C, bool inBounds,
+                                      ArrayRef<Constant *> Idxs);
+  Constant *ConstantFoldGetElementPtr(Constant *C, bool inBounds,
+                                      ArrayRef<Value *> Idxs);
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/IR/Constants.cpp b/contrib/llvm/lib/IR/Constants.cpp
new file mode 100644
index 000000000000..1abb65643559
--- /dev/null
+++ b/contrib/llvm/lib/IR/Constants.cpp
@@ -0,0 +1,2779 @@
+//===-- Constants.cpp - Implement Constant nodes --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Constant* classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Constants.h"
+#include "ConstantFold.h"
+#include "LLVMContextImpl.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdarg>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                              Constant Class
+//===----------------------------------------------------------------------===//
+
+void Constant::anchor() { }
+
+bool Constant::isNegativeZeroValue() const {
+  // Floating point values have an explicit -0.0 value.
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
+    return CFP->isZero() && CFP->isNegative();
+
+  // Equivalent for a vector of -0.0's.
+  if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
+    if (ConstantFP *SplatCFP = dyn_cast_or_null<ConstantFP>(CV->getSplatValue()))
+      if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative())
+        return true;
+
+  // We've already handled true FP case; any other FP vectors can't represent -0.0.
+  if (getType()->isFPOrFPVectorTy())
+    return false;
+
+  // Otherwise, just use +0.0.
+  return isNullValue();
+}
+
+// Return true iff this constant is positive zero (floating point), negative
+// zero (floating point), or a null value.
+bool Constant::isZeroValue() const {
+  // Floating point values have an explicit -0.0 value.
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
+    return CFP->isZero();
+
+  // Otherwise, just use +0.0.
+  return isNullValue();
+}
+
+bool Constant::isNullValue() const {
+  // 0 is null.
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
+    return CI->isZero();
+
+  // +0.0 is null.
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
+    return CFP->isZero() && !CFP->isNegative();
+
+  // constant zero is zero for aggregates and cpnull is null for pointers.
+  return isa<ConstantAggregateZero>(this) || isa<ConstantPointerNull>(this);
+}
+
+bool Constant::isAllOnesValue() const {
+  // Check for -1 integers
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
+    return CI->isMinusOne();
+
+  // Check for FP which are bitcasted from -1 integers
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
+    return CFP->getValueAPF().bitcastToAPInt().isAllOnesValue();
+
+  // Check for constant vectors which are splats of -1 values.
+  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+    if (Constant *Splat = CV->getSplatValue())
+      return Splat->isAllOnesValue();
+
+  // Check for constant vectors which are splats of -1 values.
+  if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
+    if (Constant *Splat = CV->getSplatValue())
+      return Splat->isAllOnesValue();
+
+  return false;
+}
+
+// Constructor to create a '0' constant of arbitrary type...
+Constant *Constant::getNullValue(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID:
+    return ConstantInt::get(Ty, 0);
+  case Type::HalfTyID:
+    return ConstantFP::get(Ty->getContext(),
+                           APFloat::getZero(APFloat::IEEEhalf));
+  case Type::FloatTyID:
+    return ConstantFP::get(Ty->getContext(),
+                           APFloat::getZero(APFloat::IEEEsingle));
+  case Type::DoubleTyID:
+    return ConstantFP::get(Ty->getContext(),
+                           APFloat::getZero(APFloat::IEEEdouble));
+  case Type::X86_FP80TyID:
+    return ConstantFP::get(Ty->getContext(),
+                           APFloat::getZero(APFloat::x87DoubleExtended));
+  case Type::FP128TyID:
+    return ConstantFP::get(Ty->getContext(),
+                           APFloat::getZero(APFloat::IEEEquad));
+  case Type::PPC_FP128TyID:
+    return ConstantFP::get(Ty->getContext(),
+                           APFloat(APFloat::PPCDoubleDouble,
+                                   APInt::getNullValue(128)));
+  case Type::PointerTyID:
+    return ConstantPointerNull::get(cast<PointerType>(Ty));
+  case Type::StructTyID:
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+    return ConstantAggregateZero::get(Ty);
+  default:
+    // Function, Label, or Opaque type?
+    llvm_unreachable("Cannot create a null constant of that type!");
+  }
+}
+
+Constant *Constant::getIntegerValue(Type *Ty, const APInt &V) {
+  Type *ScalarTy = Ty->getScalarType();
+
+  // Create the base integer constant.
+  Constant *C = ConstantInt::get(Ty->getContext(), V);
+
+  // Convert an integer to a pointer, if necessary.
+  if (PointerType *PTy = dyn_cast<PointerType>(ScalarTy))
+    C = ConstantExpr::getIntToPtr(C, PTy);
+
+  // Broadcast a scalar to a vector, if necessary.
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    C = ConstantVector::getSplat(VTy->getNumElements(), C);
+
+  return C;
+}
+
+Constant *Constant::getAllOnesValue(Type *Ty) {
+  if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+    return ConstantInt::get(Ty->getContext(),
+                            APInt::getAllOnesValue(ITy->getBitWidth()));
+
+  if (Ty->isFloatingPointTy()) {
+    APFloat FL = APFloat::getAllOnesValue(Ty->getPrimitiveSizeInBits(),
+                                          !Ty->isPPC_FP128Ty());
+    return ConstantFP::get(Ty->getContext(), FL);
+  }
+
+  VectorType *VTy = cast<VectorType>(Ty);
+  return ConstantVector::getSplat(VTy->getNumElements(),
+                                  getAllOnesValue(VTy->getElementType()));
+}
+
+/// getAggregateElement - For aggregates (struct/array/vector) return the
+/// constant that corresponds to the specified element if possible, or null if
+/// not.  This can return null if the element index is a ConstantExpr, or if
+/// 'this' is a constant expr.
+Constant *Constant::getAggregateElement(unsigned Elt) const {
+  if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(this))
+    return Elt < CS->getNumOperands() ? CS->getOperand(Elt) : 0;
+
+  if (const ConstantArray *CA = dyn_cast<ConstantArray>(this))
+    return Elt < CA->getNumOperands() ? CA->getOperand(Elt) : 0;
+
+  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+    return Elt < CV->getNumOperands() ? CV->getOperand(Elt) : 0;
+
+  if (const ConstantAggregateZero *CAZ =dyn_cast<ConstantAggregateZero>(this))
+    return CAZ->getElementValue(Elt);
+
+  if (const UndefValue *UV = dyn_cast<UndefValue>(this))
+    return UV->getElementValue(Elt);
+
+  if (const ConstantDataSequential *CDS =dyn_cast<ConstantDataSequential>(this))
+    return Elt < CDS->getNumElements() ? CDS->getElementAsConstant(Elt) : 0;
+  return 0;
+}
+
+Constant *Constant::getAggregateElement(Constant *Elt) const {
+  assert(isa<IntegerType>(Elt->getType()) && "Index must be an integer");
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Elt))
+    return getAggregateElement(CI->getZExtValue());
+  return 0;
+}
+
+
+void Constant::destroyConstantImpl() {
+  // When a Constant is destroyed, there may be lingering
+  // references to the constant by other constants in the constant pool.  These
+  // constants are implicitly dependent on the module that is being deleted,
+  // but they don't know that.  Because we only find out when the CPV is
+  // deleted, we must now notify all of our users (that should only be
+  // Constants) that they are, in fact, invalid now and should be deleted.
+  //
+  while (!use_empty()) {
+    Value *V = use_back();
+#ifndef NDEBUG      // Only in -g mode...
+    if (!isa<Constant>(V)) {
+      dbgs() << "While deleting: " << *this
+             << "\n\nUse still stuck around after Def is destroyed: "
+             << *V << "\n\n";
+    }
+#endif
+    assert(isa<Constant>(V) && "References remain to Constant being destroyed");
+    cast<Constant>(V)->destroyConstant();
+
+    // The constant should remove itself from our use list...
+    assert((use_empty() || use_back() != V) && "Constant not removed!");
+  }
+
+  // Value has no outstanding references it is safe to delete it now...
+  delete this;
+}
+
+/// canTrap - Return true if evaluation of this constant could trap.  This is
+/// true for things like constant expressions that could divide by zero.
+bool Constant::canTrap() const {
+  assert(getType()->isFirstClassType() && "Cannot evaluate aggregate vals!");
+  // The only thing that could possibly trap are constant exprs.
+  const ConstantExpr *CE = dyn_cast<ConstantExpr>(this);
+  if (!CE) return false;
+
+  // ConstantExpr traps if any operands can trap.
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    if (CE->getOperand(i)->canTrap())
+      return true;
+
+  // Otherwise, only specific operations can trap.
+  switch (CE->getOpcode()) {
+  default:
+    return false;
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+    // Div and rem can trap if the RHS is not known to be non-zero.
+    if (!isa<ConstantInt>(CE->getOperand(1)) ||CE->getOperand(1)->isNullValue())
+      return true;
+    return false;
+  }
+}
+
+/// isThreadDependent - Return true if the value can vary between threads.
+bool Constant::isThreadDependent() const {
+  SmallPtrSet<const Constant*, 64> Visited;
+  SmallVector<const Constant*, 64> WorkList;
+  WorkList.push_back(this);
+  Visited.insert(this);
+
+  while (!WorkList.empty()) {
+    const Constant *C = WorkList.pop_back_val();
+
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+      if (GV->isThreadLocal())
+        return true;
+    }
+
+    for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I) {
+      const Constant *D = dyn_cast<Constant>(C->getOperand(I));
+      if (!D)
+        continue;
+      if (Visited.insert(D))
+        WorkList.push_back(D);
+    }
+  }
+
+  return false;
+}
+
+/// isConstantUsed - Return true if the constant has users other than constant
+/// exprs and other dangling things.
+bool Constant::isConstantUsed() const {
+  for (const_use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
+    const Constant *UC = dyn_cast<Constant>(*UI);
+    if (UC == 0 || isa<GlobalValue>(UC))
+      return true;
+
+    if (UC->isConstantUsed())
+      return true;
+  }
+  return false;
+}
+
+
+
+/// getRelocationInfo - This method classifies the entry according to
+/// whether or not it may generate a relocation entry.  This must be
+/// conservative, so if it might codegen to a relocatable entry, it should say
+/// so.  The return values are:
+/// 
+///  NoRelocation: This constant pool entry is guaranteed to never have a
+///     relocation applied to it (because it holds a simple constant like
+///     '4').
+///  LocalRelocation: This entry has relocations, but the entries are
+///     guaranteed to be resolvable by the static linker, so the dynamic
+///     linker will never see them.
+///  GlobalRelocations: This entry may have arbitrary relocations.
+///
+/// FIXME: This really should not be in IR.
+Constant::PossibleRelocationsTy Constant::getRelocationInfo() const {
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(this)) {
+    if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+      return LocalRelocation;  // Local to this file/library.
+    return GlobalRelocations;    // Global reference.
+  }
+  
+  if (const BlockAddress *BA = dyn_cast<BlockAddress>(this))
+    return BA->getFunction()->getRelocationInfo();
+  
+  // While raw uses of blockaddress need to be relocated, differences between
+  // two of them don't when they are for labels in the same function.  This is a
+  // common idiom when creating a table for the indirect goto extension, so we
+  // handle it efficiently here.
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(this))
+    if (CE->getOpcode() == Instruction::Sub) {
+      ConstantExpr *LHS = dyn_cast<ConstantExpr>(CE->getOperand(0));
+      ConstantExpr *RHS = dyn_cast<ConstantExpr>(CE->getOperand(1));
+      if (LHS && RHS &&
+          LHS->getOpcode() == Instruction::PtrToInt &&
+          RHS->getOpcode() == Instruction::PtrToInt &&
+          isa<BlockAddress>(LHS->getOperand(0)) &&
+          isa<BlockAddress>(RHS->getOperand(0)) &&
+          cast<BlockAddress>(LHS->getOperand(0))->getFunction() ==
+            cast<BlockAddress>(RHS->getOperand(0))->getFunction())
+        return NoRelocation;
+    }
+
+  PossibleRelocationsTy Result = NoRelocation;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    Result = std::max(Result,
+                      cast<Constant>(getOperand(i))->getRelocationInfo());
+
+  return Result;
+}
+
+/// removeDeadUsersOfConstant - If the specified constantexpr is dead, remove
+/// it.  This involves recursively eliminating any dead users of the
+/// constantexpr.
+static bool removeDeadUsersOfConstant(const Constant *C) {
+  if (isa<GlobalValue>(C)) return false; // Cannot remove this
+
+  while (!C->use_empty()) {
+    const Constant *User = dyn_cast<Constant>(C->use_back());
+    if (!User) return false; // Non-constant usage;
+    if (!removeDeadUsersOfConstant(User))
+      return false; // Constant wasn't dead
+  }
+
+  const_cast<Constant*>(C)->destroyConstant();
+  return true;
+}
+
+
+/// removeDeadConstantUsers - If there are any dead constant users dangling
+/// off of this constant, remove them.  This method is useful for clients
+/// that want to check to see if a global is unused, but don't want to deal
+/// with potentially dead constants hanging off of the globals.
+void Constant::removeDeadConstantUsers() const {
+  Value::const_use_iterator I = use_begin(), E = use_end();
+  Value::const_use_iterator LastNonDeadUser = E;
+  while (I != E) {
+    const Constant *User = dyn_cast<Constant>(*I);
+    if (User == 0) {
+      LastNonDeadUser = I;
+      ++I;
+      continue;
+    }
+
+    if (!removeDeadUsersOfConstant(User)) {
+      // If the constant wasn't dead, remember that this was the last live use
+      // and move on to the next constant.
+      LastNonDeadUser = I;
+      ++I;
+      continue;
+    }
+
+    // If the constant was dead, then the iterator is invalidated.
+    if (LastNonDeadUser == E) {
+      I = use_begin();
+      if (I == E) break;
+    } else {
+      I = LastNonDeadUser;
+      ++I;
+    }
+  }
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//                                ConstantInt
+//===----------------------------------------------------------------------===//
+
+void ConstantInt::anchor() { }
+
+ConstantInt::ConstantInt(IntegerType *Ty, const APInt& V)
+  : Constant(Ty, ConstantIntVal, 0, 0), Val(V) {
+  assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type");
+}
+
+ConstantInt *ConstantInt::getTrue(LLVMContext &Context) {
+  LLVMContextImpl *pImpl = Context.pImpl;
+  if (!pImpl->TheTrueVal)
+    pImpl->TheTrueVal = ConstantInt::get(Type::getInt1Ty(Context), 1);
+  return pImpl->TheTrueVal;
+}
+
+ConstantInt *ConstantInt::getFalse(LLVMContext &Context) {
+  LLVMContextImpl *pImpl = Context.pImpl;
+  if (!pImpl->TheFalseVal)
+    pImpl->TheFalseVal = ConstantInt::get(Type::getInt1Ty(Context), 0);
+  return pImpl->TheFalseVal;
+}
+
+Constant *ConstantInt::getTrue(Type *Ty) {
+  VectorType *VTy = dyn_cast<VectorType>(Ty);
+  if (!VTy) {
+    assert(Ty->isIntegerTy(1) && "True must be i1 or vector of i1.");
+    return ConstantInt::getTrue(Ty->getContext());
+  }
+  assert(VTy->getElementType()->isIntegerTy(1) &&
+         "True must be vector of i1 or i1.");
+  return ConstantVector::getSplat(VTy->getNumElements(),
+                                  ConstantInt::getTrue(Ty->getContext()));
+}
+
+Constant *ConstantInt::getFalse(Type *Ty) {
+  VectorType *VTy = dyn_cast<VectorType>(Ty);
+  if (!VTy) {
+    assert(Ty->isIntegerTy(1) && "False must be i1 or vector of i1.");
+    return ConstantInt::getFalse(Ty->getContext());
+  }
+  assert(VTy->getElementType()->isIntegerTy(1) &&
+         "False must be vector of i1 or i1.");
+  return ConstantVector::getSplat(VTy->getNumElements(),
+                                  ConstantInt::getFalse(Ty->getContext()));
+}
+
+
+// Get a ConstantInt from an APInt. Note that the value stored in the DenseMap 
+// as the key, is a DenseMapAPIntKeyInfo::KeyTy which has provided the
+// operator== and operator!= to ensure that the DenseMap doesn't attempt to
+// compare APInt's of different widths, which would violate an APInt class
+// invariant which generates an assertion.
+ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) {
+  // Get the corresponding integer type for the bit width of the value.
+  IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
+  // get an existing value or the insertion position
+  DenseMapAPIntKeyInfo::KeyTy Key(V, ITy);
+  ConstantInt *&Slot = Context.pImpl->IntConstants[Key]; 
+  if (!Slot) Slot = new ConstantInt(ITy, V);
+  return Slot;
+}
+
+Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) {
+  Constant *C = get(cast<IntegerType>(Ty->getScalarType()), V, isSigned);
+
+  // For vectors, broadcast the value.
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    return ConstantVector::getSplat(VTy->getNumElements(), C);
+
+  return C;
+}
+
+ConstantInt *ConstantInt::get(IntegerType *Ty, uint64_t V, 
+                              bool isSigned) {
+  return get(Ty->getContext(), APInt(Ty->getBitWidth(), V, isSigned));
+}
+
+ConstantInt *ConstantInt::getSigned(IntegerType *Ty, int64_t V) {
+  return get(Ty, V, true);
+}
+
+Constant *ConstantInt::getSigned(Type *Ty, int64_t V) {
+  return get(Ty, V, true);
+}
+
+Constant *ConstantInt::get(Type *Ty, const APInt& V) {
+  ConstantInt *C = get(Ty->getContext(), V);
+  assert(C->getType() == Ty->getScalarType() &&
+         "ConstantInt type doesn't match the type implied by its value!");
+
+  // For vectors, broadcast the value.
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    return ConstantVector::getSplat(VTy->getNumElements(), C);
+
+  return C;
+}
+
+ConstantInt *ConstantInt::get(IntegerType* Ty, StringRef Str,
+                              uint8_t radix) {
+  return get(Ty->getContext(), APInt(Ty->getBitWidth(), Str, radix));
+}
+
+//===----------------------------------------------------------------------===//
+//                                ConstantFP
+//===----------------------------------------------------------------------===//
+
+static const fltSemantics *TypeToFloatSemantics(Type *Ty) {
+  if (Ty->isHalfTy())
+    return &APFloat::IEEEhalf;
+  if (Ty->isFloatTy())
+    return &APFloat::IEEEsingle;
+  if (Ty->isDoubleTy())
+    return &APFloat::IEEEdouble;
+  if (Ty->isX86_FP80Ty())
+    return &APFloat::x87DoubleExtended;
+  else if (Ty->isFP128Ty())
+    return &APFloat::IEEEquad;
+
+  assert(Ty->isPPC_FP128Ty() && "Unknown FP format");
+  return &APFloat::PPCDoubleDouble;
+}
+
+void ConstantFP::anchor() { }
+
+/// get() - This returns a constant fp for the specified value in the
+/// specified type.  This should only be used for simple constant values like
+/// 2.0/1.0 etc, that are known-valid both as double and as the target format.
+Constant *ConstantFP::get(Type *Ty, double V) {
+  LLVMContext &Context = Ty->getContext();
+
+  APFloat FV(V);
+  bool ignored;
+  FV.convert(*TypeToFloatSemantics(Ty->getScalarType()),
+             APFloat::rmNearestTiesToEven, &ignored);
+  Constant *C = get(Context, FV);
+
+  // For vectors, broadcast the value.
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    return ConstantVector::getSplat(VTy->getNumElements(), C);
+
+  return C;
+}
+
+
+Constant *ConstantFP::get(Type *Ty, StringRef Str) {
+  LLVMContext &Context = Ty->getContext();
+
+  APFloat FV(*TypeToFloatSemantics(Ty->getScalarType()), Str);
+  Constant *C = get(Context, FV);
+
+  // For vectors, broadcast the value.
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    return ConstantVector::getSplat(VTy->getNumElements(), C);
+
+  return C; 
+}
+
+
+ConstantFP *ConstantFP::getNegativeZero(Type *Ty) {
+  LLVMContext &Context = Ty->getContext();
+  APFloat apf = cast<ConstantFP>(Constant::getNullValue(Ty))->getValueAPF();
+  apf.changeSign();
+  return get(Context, apf);
+}
+
+
+Constant *ConstantFP::getZeroValueForNegation(Type *Ty) {
+  Type *ScalarTy = Ty->getScalarType();
+  if (ScalarTy->isFloatingPointTy()) {
+    Constant *C = getNegativeZero(ScalarTy);
+    if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+      return ConstantVector::getSplat(VTy->getNumElements(), C);
+    return C;
+  }
+
+  return Constant::getNullValue(Ty);
+}
+
+
+// ConstantFP accessors.
+ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
+  DenseMapAPFloatKeyInfo::KeyTy Key(V);
+
+  LLVMContextImpl* pImpl = Context.pImpl;
+
+  ConstantFP *&Slot = pImpl->FPConstants[Key];
+
+  if (!Slot) {
+    Type *Ty;
+    if (&V.getSemantics() == &APFloat::IEEEhalf)
+      Ty = Type::getHalfTy(Context);
+    else if (&V.getSemantics() == &APFloat::IEEEsingle)
+      Ty = Type::getFloatTy(Context);
+    else if (&V.getSemantics() == &APFloat::IEEEdouble)
+      Ty = Type::getDoubleTy(Context);
+    else if (&V.getSemantics() == &APFloat::x87DoubleExtended)
+      Ty = Type::getX86_FP80Ty(Context);
+    else if (&V.getSemantics() == &APFloat::IEEEquad)
+      Ty = Type::getFP128Ty(Context);
+    else {
+      assert(&V.getSemantics() == &APFloat::PPCDoubleDouble && 
+             "Unknown FP format");
+      Ty = Type::getPPC_FP128Ty(Context);
+    }
+    Slot = new ConstantFP(Ty, V);
+  }
+
+  return Slot;
+}
+
+ConstantFP *ConstantFP::getInfinity(Type *Ty, bool Negative) {
+  const fltSemantics &Semantics = *TypeToFloatSemantics(Ty);
+  return ConstantFP::get(Ty->getContext(),
+                         APFloat::getInf(Semantics, Negative));
+}
+
+ConstantFP::ConstantFP(Type *Ty, const APFloat& V)
+  : Constant(Ty, ConstantFPVal, 0, 0), Val(V) {
+  assert(&V.getSemantics() == TypeToFloatSemantics(Ty) &&
+         "FP type Mismatch");
+}
+
+bool ConstantFP::isExactlyValue(const APFloat &V) const {
+  return Val.bitwiseIsEqual(V);
+}
+
+//===----------------------------------------------------------------------===//
+//                   ConstantAggregateZero Implementation
+//===----------------------------------------------------------------------===//
+
+/// getSequentialElement - If this CAZ has array or vector type, return a zero
+/// with the right element type.
+Constant *ConstantAggregateZero::getSequentialElement() const {
+  return Constant::getNullValue(getType()->getSequentialElementType());
+}
+
+/// getStructElement - If this CAZ has struct type, return a zero with the
+/// right element type for the specified element.
+Constant *ConstantAggregateZero::getStructElement(unsigned Elt) const {
+  return Constant::getNullValue(getType()->getStructElementType(Elt));
+}
+
+/// getElementValue - Return a zero of the right value for the specified GEP
+/// index if we can, otherwise return null (e.g. if C is a ConstantExpr).
+Constant *ConstantAggregateZero::getElementValue(Constant *C) const {
+  if (isa<SequentialType>(getType()))
+    return getSequentialElement();
+  return getStructElement(cast<ConstantInt>(C)->getZExtValue());
+}
+
+/// getElementValue - Return a zero of the right value for the specified GEP
+/// index.
+Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const {
+  if (isa<SequentialType>(getType()))
+    return getSequentialElement();
+  return getStructElement(Idx);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         UndefValue Implementation
+//===----------------------------------------------------------------------===//
+
+/// getSequentialElement - If this undef has array or vector type, return an
+/// undef with the right element type.
+UndefValue *UndefValue::getSequentialElement() const {
+  return UndefValue::get(getType()->getSequentialElementType());
+}
+
+/// getStructElement - If this undef has struct type, return a zero with the
+/// right element type for the specified element.
+UndefValue *UndefValue::getStructElement(unsigned Elt) const {
+  return UndefValue::get(getType()->getStructElementType(Elt));
+}
+
+/// getElementValue - Return an undef of the right value for the specified GEP
+/// index if we can, otherwise return null (e.g. if C is a ConstantExpr).
+UndefValue *UndefValue::getElementValue(Constant *C) const {
+  if (isa<SequentialType>(getType()))
+    return getSequentialElement();
+  return getStructElement(cast<ConstantInt>(C)->getZExtValue());
+}
+
+/// getElementValue - Return an undef of the right value for the specified GEP
+/// index.
+UndefValue *UndefValue::getElementValue(unsigned Idx) const {
+  if (isa<SequentialType>(getType()))
+    return getSequentialElement();
+  return getStructElement(Idx);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//                            ConstantXXX Classes
+//===----------------------------------------------------------------------===//
+
+template <typename ItTy, typename EltTy>
+static bool rangeOnlyContains(ItTy Start, ItTy End, EltTy Elt) {
+  for (; Start != End; ++Start)
+    if (*Start != Elt)
+      return false;
+  return true;
+}
+
+ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V)
+  : Constant(T, ConstantArrayVal,
+             OperandTraits<ConstantArray>::op_end(this) - V.size(),
+             V.size()) {
+  assert(V.size() == T->getNumElements() &&
+         "Invalid initializer vector for constant array");
+  for (unsigned i = 0, e = V.size(); i != e; ++i)
+    assert(V[i]->getType() == T->getElementType() &&
+           "Initializer for array element doesn't match array element type!");
+  std::copy(V.begin(), V.end(), op_begin());
+}
+
+Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) {
+  // Empty arrays are canonicalized to ConstantAggregateZero.
+  if (V.empty())
+    return ConstantAggregateZero::get(Ty);
+
+  for (unsigned i = 0, e = V.size(); i != e; ++i) {
+    assert(V[i]->getType() == Ty->getElementType() &&
+           "Wrong type in array element initializer");
+  }
+  LLVMContextImpl *pImpl = Ty->getContext().pImpl;
+
+  // If this is an all-zero array, return a ConstantAggregateZero object.  If
+  // all undef, return an UndefValue, if "all simple", then return a
+  // ConstantDataArray.
+  Constant *C = V[0];
+  if (isa<UndefValue>(C) && rangeOnlyContains(V.begin(), V.end(), C))
+    return UndefValue::get(Ty);
+
+  if (C->isNullValue() && rangeOnlyContains(V.begin(), V.end(), C))
+    return ConstantAggregateZero::get(Ty);
+
+  // Check to see if all of the elements are ConstantFP or ConstantInt and if
+  // the element type is compatible with ConstantDataVector.  If so, use it.
+  if (ConstantDataSequential::isElementTypeCompatible(C->getType())) {
+    // We speculatively build the elements here even if it turns out that there
+    // is a constantexpr or something else weird in the array, since it is so
+    // uncommon for that to happen.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+      if (CI->getType()->isIntegerTy(8)) {
+        SmallVector<uint8_t, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
+            Elts.push_back(CI->getZExtValue());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataArray::get(C->getContext(), Elts);
+      } else if (CI->getType()->isIntegerTy(16)) {
+        SmallVector<uint16_t, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
+            Elts.push_back(CI->getZExtValue());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataArray::get(C->getContext(), Elts);
+      } else if (CI->getType()->isIntegerTy(32)) {
+        SmallVector<uint32_t, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
+            Elts.push_back(CI->getZExtValue());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataArray::get(C->getContext(), Elts);
+      } else if (CI->getType()->isIntegerTy(64)) {
+        SmallVector<uint64_t, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
+            Elts.push_back(CI->getZExtValue());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataArray::get(C->getContext(), Elts);
+      }
+    }
+
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+      if (CFP->getType()->isFloatTy()) {
+        SmallVector<float, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
+            Elts.push_back(CFP->getValueAPF().convertToFloat());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataArray::get(C->getContext(), Elts);
+      } else if (CFP->getType()->isDoubleTy()) {
+        SmallVector<double, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
+            Elts.push_back(CFP->getValueAPF().convertToDouble());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataArray::get(C->getContext(), Elts);
+      }
+    }
+  }
+
+  // Otherwise, we really do want to create a ConstantArray.
+  return pImpl->ArrayConstants.getOrCreate(Ty, V);
+}
+
+/// getTypeForElements - Return an anonymous struct type to use for a constant
+/// with the specified set of elements.  The list must not be empty.
+StructType *ConstantStruct::getTypeForElements(LLVMContext &Context,
+                                               ArrayRef<Constant*> V,
+                                               bool Packed) {
+  unsigned VecSize = V.size();
+  SmallVector<Type*, 16> EltTypes(VecSize);
+  for (unsigned i = 0; i != VecSize; ++i)
+    EltTypes[i] = V[i]->getType();
+
+  return StructType::get(Context, EltTypes, Packed);
+}
+
+
+StructType *ConstantStruct::getTypeForElements(ArrayRef<Constant*> V,
+                                               bool Packed) {
+  assert(!V.empty() &&
+         "ConstantStruct::getTypeForElements cannot be called on empty list");
+  return getTypeForElements(V[0]->getContext(), V, Packed);
+}
+
+
+ConstantStruct::ConstantStruct(StructType *T, ArrayRef<Constant *> V)
+  : Constant(T, ConstantStructVal,
+             OperandTraits<ConstantStruct>::op_end(this) - V.size(),
+             V.size()) {
+  assert(V.size() == T->getNumElements() &&
+         "Invalid initializer vector for constant structure");
+  for (unsigned i = 0, e = V.size(); i != e; ++i)
+    assert((T->isOpaque() || V[i]->getType() == T->getElementType(i)) &&
+           "Initializer for struct element doesn't match struct element type!");
+  std::copy(V.begin(), V.end(), op_begin());
+}
+
+// ConstantStruct accessors.
+Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
+  assert((ST->isOpaque() || ST->getNumElements() == V.size()) &&
+         "Incorrect # elements specified to ConstantStruct::get");
+
+  // Create a ConstantAggregateZero value if all elements are zeros.
+  bool isZero = true;
+  bool isUndef = false;
+  
+  if (!V.empty()) {
+    isUndef = isa<UndefValue>(V[0]);
+    isZero = V[0]->isNullValue();
+    if (isUndef || isZero) {
+      for (unsigned i = 0, e = V.size(); i != e; ++i) {
+        if (!V[i]->isNullValue())
+          isZero = false;
+        if (!isa<UndefValue>(V[i]))
+          isUndef = false;
+      }
+    }
+  }
+  if (isZero)
+    return ConstantAggregateZero::get(ST);
+  if (isUndef)
+    return UndefValue::get(ST);
+
+  return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V);
+}
+
+Constant *ConstantStruct::get(StructType *T, ...) {
+  va_list ap;
+  SmallVector<Constant*, 8> Values;
+  va_start(ap, T);
+  while (Constant *Val = va_arg(ap, llvm::Constant*))
+    Values.push_back(Val);
+  va_end(ap);
+  return get(T, Values);
+}
+
+ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
+  : Constant(T, ConstantVectorVal,
+             OperandTraits<ConstantVector>::op_end(this) - V.size(),
+             V.size()) {
+  for (size_t i = 0, e = V.size(); i != e; i++)
+    assert(V[i]->getType() == T->getElementType() &&
+           "Initializer for vector element doesn't match vector element type!");
+  std::copy(V.begin(), V.end(), op_begin());
+}
+
+// ConstantVector accessors.
+Constant *ConstantVector::get(ArrayRef<Constant*> V) {
+  assert(!V.empty() && "Vectors can't be empty");
+  VectorType *T = VectorType::get(V.front()->getType(), V.size());
+  LLVMContextImpl *pImpl = T->getContext().pImpl;
+
+  // If this is an all-undef or all-zero vector, return a
+  // ConstantAggregateZero or UndefValue.
+  Constant *C = V[0];
+  bool isZero = C->isNullValue();
+  bool isUndef = isa<UndefValue>(C);
+
+  if (isZero || isUndef) {
+    for (unsigned i = 1, e = V.size(); i != e; ++i)
+      if (V[i] != C) {
+        isZero = isUndef = false;
+        break;
+      }
+  }
+
+  if (isZero)
+    return ConstantAggregateZero::get(T);
+  if (isUndef)
+    return UndefValue::get(T);
+
+  // Check to see if all of the elements are ConstantFP or ConstantInt and if
+  // the element type is compatible with ConstantDataVector.  If so, use it.
+  if (ConstantDataSequential::isElementTypeCompatible(C->getType())) {
+    // We speculatively build the elements here even if it turns out that there
+    // is a constantexpr or something else weird in the array, since it is so
+    // uncommon for that to happen.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+      if (CI->getType()->isIntegerTy(8)) {
+        SmallVector<uint8_t, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
+            Elts.push_back(CI->getZExtValue());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataVector::get(C->getContext(), Elts);
+      } else if (CI->getType()->isIntegerTy(16)) {
+        SmallVector<uint16_t, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
+            Elts.push_back(CI->getZExtValue());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataVector::get(C->getContext(), Elts);
+      } else if (CI->getType()->isIntegerTy(32)) {
+        SmallVector<uint32_t, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
+            Elts.push_back(CI->getZExtValue());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataVector::get(C->getContext(), Elts);
+      } else if (CI->getType()->isIntegerTy(64)) {
+        SmallVector<uint64_t, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(V[i]))
+            Elts.push_back(CI->getZExtValue());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataVector::get(C->getContext(), Elts);
+      }
+    }
+
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
+      if (CFP->getType()->isFloatTy()) {
+        SmallVector<float, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
+            Elts.push_back(CFP->getValueAPF().convertToFloat());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataVector::get(C->getContext(), Elts);
+      } else if (CFP->getType()->isDoubleTy()) {
+        SmallVector<double, 16> Elts;
+        for (unsigned i = 0, e = V.size(); i != e; ++i)
+          if (ConstantFP *CFP = dyn_cast<ConstantFP>(V[i]))
+            Elts.push_back(CFP->getValueAPF().convertToDouble());
+          else
+            break;
+        if (Elts.size() == V.size())
+          return ConstantDataVector::get(C->getContext(), Elts);
+      }
+    }
+  }
+
+  // Otherwise, the element type isn't compatible with ConstantDataVector, or
+  // the operand list constants a ConstantExpr or something else strange.
+  return pImpl->VectorConstants.getOrCreate(T, V);
+}
+
+Constant *ConstantVector::getSplat(unsigned NumElts, Constant *V) {
+  // If this splat is compatible with ConstantDataVector, use it instead of
+  // ConstantVector.
+  if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) &&
+      ConstantDataSequential::isElementTypeCompatible(V->getType()))
+    return ConstantDataVector::getSplat(NumElts, V);
+
+  SmallVector<Constant*, 32> Elts(NumElts, V);
+  return get(Elts);
+}
+
+
+// Utility function for determining if a ConstantExpr is a CastOp or not. This
+// can't be inline because we don't want to #include Instruction.h into
+// Constant.h
+bool ConstantExpr::isCast() const {
+  return Instruction::isCast(getOpcode());
+}
+
+bool ConstantExpr::isCompare() const {
+  return getOpcode() == Instruction::ICmp || getOpcode() == Instruction::FCmp;
+}
+
+bool ConstantExpr::isGEPWithNoNotionalOverIndexing() const {
+  if (getOpcode() != Instruction::GetElementPtr) return false;
+
+  gep_type_iterator GEPI = gep_type_begin(this), E = gep_type_end(this);
+  User::const_op_iterator OI = llvm::next(this->op_begin());
+
+  // Skip the first index, as it has no static limit.
+  ++GEPI;
+  ++OI;
+
+  // The remaining indices must be compile-time known integers within the
+  // bounds of the corresponding notional static array types.
+  for (; GEPI != E; ++GEPI, ++OI) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(*OI);
+    if (!CI) return false;
+    if (ArrayType *ATy = dyn_cast<ArrayType>(*GEPI))
+      if (CI->getValue().getActiveBits() > 64 ||
+          CI->getZExtValue() >= ATy->getNumElements())
+        return false;
+  }
+
+  // All the indices checked out.
+  return true;
+}
+
+bool ConstantExpr::hasIndices() const {
+  return getOpcode() == Instruction::ExtractValue ||
+         getOpcode() == Instruction::InsertValue;
+}
+
+ArrayRef<unsigned> ConstantExpr::getIndices() const {
+  if (const ExtractValueConstantExpr *EVCE =
+        dyn_cast<ExtractValueConstantExpr>(this))
+    return EVCE->Indices;
+
+  return cast<InsertValueConstantExpr>(this)->Indices;
+}
+
+unsigned ConstantExpr::getPredicate() const {
+  assert(isCompare());
+  return ((const CompareConstantExpr*)this)->predicate;
+}
+
+/// getWithOperandReplaced - Return a constant expression identical to this
+/// one, but with the specified operand set to the specified value.
+Constant *
+ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
+  assert(Op->getType() == getOperand(OpNo)->getType() &&
+         "Replacing operand with value of different type!");
+  if (getOperand(OpNo) == Op)
+    return const_cast<ConstantExpr*>(this);
+
+  SmallVector<Constant*, 8> NewOps;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    NewOps.push_back(i == OpNo ? Op : getOperand(i));
+
+  return getWithOperands(NewOps);
+}
+
+/// getWithOperands - This returns the current constant expression with the
+/// operands replaced with the specified values.  The specified array must
+/// have the same number of operands as our current one.
+Constant *ConstantExpr::
+getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const {
+  assert(Ops.size() == getNumOperands() && "Operand count mismatch!");
+  bool AnyChange = Ty != getType();
+  for (unsigned i = 0; i != Ops.size(); ++i)
+    AnyChange |= Ops[i] != getOperand(i);
+
+  if (!AnyChange)  // No operands changed, return self.
+    return const_cast<ConstantExpr*>(this);
+
+  switch (getOpcode()) {
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+    return ConstantExpr::getCast(getOpcode(), Ops[0], Ty);
+  case Instruction::Select:
+    return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
+  case Instruction::InsertElement:
+    return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]);
+  case Instruction::ExtractElement:
+    return ConstantExpr::getExtractElement(Ops[0], Ops[1]);
+  case Instruction::InsertValue:
+    return ConstantExpr::getInsertValue(Ops[0], Ops[1], getIndices());
+  case Instruction::ExtractValue:
+    return ConstantExpr::getExtractValue(Ops[0], getIndices());
+  case Instruction::ShuffleVector:
+    return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
+  case Instruction::GetElementPtr:
+    return ConstantExpr::getGetElementPtr(Ops[0], Ops.slice(1),
+                                      cast<GEPOperator>(this)->isInBounds());
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1]);
+  default:
+    assert(getNumOperands() == 2 && "Must be binary operator?");
+    return ConstantExpr::get(getOpcode(), Ops[0], Ops[1], SubclassOptionalData);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//                      isValueValidForType implementations
+
+bool ConstantInt::isValueValidForType(Type *Ty, uint64_t Val) {
+  unsigned NumBits = Ty->getIntegerBitWidth(); // assert okay
+  if (Ty->isIntegerTy(1))
+    return Val == 0 || Val == 1;
+  if (NumBits >= 64)
+    return true; // always true, has to fit in largest type
+  uint64_t Max = (1ll << NumBits) - 1;
+  return Val <= Max;
+}
+
+bool ConstantInt::isValueValidForType(Type *Ty, int64_t Val) {
+  unsigned NumBits = Ty->getIntegerBitWidth();
+  if (Ty->isIntegerTy(1))
+    return Val == 0 || Val == 1 || Val == -1;
+  if (NumBits >= 64)
+    return true; // always true, has to fit in largest type
+  int64_t Min = -(1ll << (NumBits-1));
+  int64_t Max = (1ll << (NumBits-1)) - 1;
+  return (Val >= Min && Val <= Max);
+}
+
+bool ConstantFP::isValueValidForType(Type *Ty, const APFloat& Val) {
+  // convert modifies in place, so make a copy.
+  APFloat Val2 = APFloat(Val);
+  bool losesInfo;
+  switch (Ty->getTypeID()) {
+  default:
+    return false;         // These can't be represented as floating point!
+
+  // FIXME rounding mode needs to be more flexible
+  case Type::HalfTyID: {
+    if (&Val2.getSemantics() == &APFloat::IEEEhalf)
+      return true;
+    Val2.convert(APFloat::IEEEhalf, APFloat::rmNearestTiesToEven, &losesInfo);
+    return !losesInfo;
+  }
+  case Type::FloatTyID: {
+    if (&Val2.getSemantics() == &APFloat::IEEEsingle)
+      return true;
+    Val2.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &losesInfo);
+    return !losesInfo;
+  }
+  case Type::DoubleTyID: {
+    if (&Val2.getSemantics() == &APFloat::IEEEhalf ||
+        &Val2.getSemantics() == &APFloat::IEEEsingle ||
+        &Val2.getSemantics() == &APFloat::IEEEdouble)
+      return true;
+    Val2.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &losesInfo);
+    return !losesInfo;
+  }
+  case Type::X86_FP80TyID:
+    return &Val2.getSemantics() == &APFloat::IEEEhalf ||
+           &Val2.getSemantics() == &APFloat::IEEEsingle || 
+           &Val2.getSemantics() == &APFloat::IEEEdouble ||
+           &Val2.getSemantics() == &APFloat::x87DoubleExtended;
+  case Type::FP128TyID:
+    return &Val2.getSemantics() == &APFloat::IEEEhalf ||
+           &Val2.getSemantics() == &APFloat::IEEEsingle || 
+           &Val2.getSemantics() == &APFloat::IEEEdouble ||
+           &Val2.getSemantics() == &APFloat::IEEEquad;
+  case Type::PPC_FP128TyID:
+    return &Val2.getSemantics() == &APFloat::IEEEhalf ||
+           &Val2.getSemantics() == &APFloat::IEEEsingle || 
+           &Val2.getSemantics() == &APFloat::IEEEdouble ||
+           &Val2.getSemantics() == &APFloat::PPCDoubleDouble;
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//                      Factory Function Implementation
+
+ConstantAggregateZero *ConstantAggregateZero::get(Type *Ty) {
+  assert((Ty->isStructTy() || Ty->isArrayTy() || Ty->isVectorTy()) &&
+         "Cannot create an aggregate zero of non-aggregate type!");
+  
+  ConstantAggregateZero *&Entry = Ty->getContext().pImpl->CAZConstants[Ty];
+  if (Entry == 0)
+    Entry = new ConstantAggregateZero(Ty);
+
+  return Entry;
+}
+
+/// destroyConstant - Remove the constant from the constant table.
+///
+void ConstantAggregateZero::destroyConstant() {
+  getContext().pImpl->CAZConstants.erase(getType());
+  destroyConstantImpl();
+}
+
+/// destroyConstant - Remove the constant from the constant table...
+///
+void ConstantArray::destroyConstant() {
+  getType()->getContext().pImpl->ArrayConstants.remove(this);
+  destroyConstantImpl();
+}
+
+
+//---- ConstantStruct::get() implementation...
+//
+
+// destroyConstant - Remove the constant from the constant table...
+//
+void ConstantStruct::destroyConstant() {
+  getType()->getContext().pImpl->StructConstants.remove(this);
+  destroyConstantImpl();
+}
+
+// destroyConstant - Remove the constant from the constant table...
+//
+void ConstantVector::destroyConstant() {
+  getType()->getContext().pImpl->VectorConstants.remove(this);
+  destroyConstantImpl();
+}
+
+/// getSplatValue - If this is a splat vector constant, meaning that all of
+/// the elements have the same value, return that value. Otherwise return 0.
+Constant *Constant::getSplatValue() const {
+  assert(this->getType()->isVectorTy() && "Only valid for vectors!");
+  if (isa<ConstantAggregateZero>(this))
+    return getNullValue(this->getType()->getVectorElementType());
+  if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
+    return CV->getSplatValue();
+  if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+    return CV->getSplatValue();
+  return 0;
+}
+
+/// getSplatValue - If this is a splat constant, where all of the
+/// elements have the same value, return that value. Otherwise return null.
+Constant *ConstantVector::getSplatValue() const {
+  // Check out first element.
+  Constant *Elt = getOperand(0);
+  // Then make sure all remaining elements point to the same value.
+  for (unsigned I = 1, E = getNumOperands(); I < E; ++I)
+    if (getOperand(I) != Elt)
+      return 0;
+  return Elt;
+}
+
+/// If C is a constant integer then return its value, otherwise C must be a
+/// vector of constant integers, all equal, and the common value is returned.
+const APInt &Constant::getUniqueInteger() const {
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
+    return CI->getValue();
+  assert(this->getSplatValue() && "Doesn't contain a unique integer!");
+  const Constant *C = this->getAggregateElement(0U);
+  assert(C && isa<ConstantInt>(C) && "Not a vector of numbers!");
+  return cast<ConstantInt>(C)->getValue();
+}
+
+
+//---- ConstantPointerNull::get() implementation.
+//
+
+ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) {
+  ConstantPointerNull *&Entry = Ty->getContext().pImpl->CPNConstants[Ty];
+  if (Entry == 0)
+    Entry = new ConstantPointerNull(Ty);
+
+  return Entry;
+}
+
+// destroyConstant - Remove the constant from the constant table...
+//
+void ConstantPointerNull::destroyConstant() {
+  getContext().pImpl->CPNConstants.erase(getType());
+  // Free the constant and any dangling references to it.
+  destroyConstantImpl();
+}
+
+
+//---- UndefValue::get() implementation.
+//
+
+UndefValue *UndefValue::get(Type *Ty) {
+  UndefValue *&Entry = Ty->getContext().pImpl->UVConstants[Ty];
+  if (Entry == 0)
+    Entry = new UndefValue(Ty);
+
+  return Entry;
+}
+
+// destroyConstant - Remove the constant from the constant table.
+//
+void UndefValue::destroyConstant() {
+  // Free the constant and any dangling references to it.
+  getContext().pImpl->UVConstants.erase(getType());
+  destroyConstantImpl();
+}
+
+//---- BlockAddress::get() implementation.
+//
+
+BlockAddress *BlockAddress::get(BasicBlock *BB) {
+  assert(BB->getParent() != 0 && "Block must have a parent");
+  return get(BB->getParent(), BB);
+}
+
+BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) {
+  BlockAddress *&BA =
+    F->getContext().pImpl->BlockAddresses[std::make_pair(F, BB)];
+  if (BA == 0)
+    BA = new BlockAddress(F, BB);
+
+  assert(BA->getFunction() == F && "Basic block moved between functions");
+  return BA;
+}
+
+BlockAddress::BlockAddress(Function *F, BasicBlock *BB)
+: Constant(Type::getInt8PtrTy(F->getContext()), Value::BlockAddressVal,
+           &Op<0>(), 2) {
+  setOperand(0, F);
+  setOperand(1, BB);
+  BB->AdjustBlockAddressRefCount(1);
+}
+
+
+// destroyConstant - Remove the constant from the constant table.
+//
+void BlockAddress::destroyConstant() {
+  getFunction()->getType()->getContext().pImpl
+    ->BlockAddresses.erase(std::make_pair(getFunction(), getBasicBlock()));
+  getBasicBlock()->AdjustBlockAddressRefCount(-1);
+  destroyConstantImpl();
+}
+
+void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) {
+  // This could be replacing either the Basic Block or the Function.  In either
+  // case, we have to remove the map entry.
+  Function *NewF = getFunction();
+  BasicBlock *NewBB = getBasicBlock();
+
+  if (U == &Op<0>())
+    NewF = cast<Function>(To);
+  else
+    NewBB = cast<BasicBlock>(To);
+
+  // See if the 'new' entry already exists, if not, just update this in place
+  // and return early.
+  BlockAddress *&NewBA =
+    getContext().pImpl->BlockAddresses[std::make_pair(NewF, NewBB)];
+  if (NewBA == 0) {
+    getBasicBlock()->AdjustBlockAddressRefCount(-1);
+
+    // Remove the old entry, this can't cause the map to rehash (just a
+    // tombstone will get added).
+    getContext().pImpl->BlockAddresses.erase(std::make_pair(getFunction(),
+                                                            getBasicBlock()));
+    NewBA = this;
+    setOperand(0, NewF);
+    setOperand(1, NewBB);
+    getBasicBlock()->AdjustBlockAddressRefCount(1);
+    return;
+  }
+
+  // Otherwise, I do need to replace this with an existing value.
+  assert(NewBA != this && "I didn't contain From!");
+
+  // Everyone using this now uses the replacement.
+  replaceAllUsesWith(NewBA);
+
+  destroyConstant();
+}
+
+//---- ConstantExpr::get() implementations.
+//
+
+/// This is a utility function to handle folding of casts and lookup of the
+/// cast in the ExprConstants map. It is used by the various get* methods below.
+static inline Constant *getFoldedCast(
+  Instruction::CastOps opc, Constant *C, Type *Ty) {
+  assert(Ty->isFirstClassType() && "Cannot cast to an aggregate type!");
+  // Fold a few common cases
+  if (Constant *FC = ConstantFoldCastInstruction(opc, C, Ty))
+    return FC;
+
+  LLVMContextImpl *pImpl = Ty->getContext().pImpl;
+
+  // Look up the constant in the table first to ensure uniqueness.
+  ExprMapKeyType Key(opc, C);
+
+  return pImpl->ExprConstants.getOrCreate(Ty, Key);
+}
+
+Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) {
+  Instruction::CastOps opc = Instruction::CastOps(oc);
+  assert(Instruction::isCast(opc) && "opcode out of range");
+  assert(C && Ty && "Null arguments to getCast");
+  assert(CastInst::castIsValid(opc, C, Ty) && "Invalid constantexpr cast!");
+
+  switch (opc) {
+  default:
+    llvm_unreachable("Invalid cast opcode");
+  case Instruction::Trunc:    return getTrunc(C, Ty);
+  case Instruction::ZExt:     return getZExt(C, Ty);
+  case Instruction::SExt:     return getSExt(C, Ty);
+  case Instruction::FPTrunc:  return getFPTrunc(C, Ty);
+  case Instruction::FPExt:    return getFPExtend(C, Ty);
+  case Instruction::UIToFP:   return getUIToFP(C, Ty);
+  case Instruction::SIToFP:   return getSIToFP(C, Ty);
+  case Instruction::FPToUI:   return getFPToUI(C, Ty);
+  case Instruction::FPToSI:   return getFPToSI(C, Ty);
+  case Instruction::PtrToInt: return getPtrToInt(C, Ty);
+  case Instruction::IntToPtr: return getIntToPtr(C, Ty);
+  case Instruction::BitCast:  return getBitCast(C, Ty);
+  }
+}
+
+Constant *ConstantExpr::getZExtOrBitCast(Constant *C, Type *Ty) {
+  if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
+    return getBitCast(C, Ty);
+  return getZExt(C, Ty);
+}
+
+Constant *ConstantExpr::getSExtOrBitCast(Constant *C, Type *Ty) {
+  if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
+    return getBitCast(C, Ty);
+  return getSExt(C, Ty);
+}
+
+Constant *ConstantExpr::getTruncOrBitCast(Constant *C, Type *Ty) {
+  if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
+    return getBitCast(C, Ty);
+  return getTrunc(C, Ty);
+}
+
+Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) {
+  assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
+  assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) &&
+          "Invalid cast");
+
+  if (Ty->isIntOrIntVectorTy())
+    return getPtrToInt(S, Ty);
+  return getBitCast(S, Ty);
+}
+
+Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty, 
+                                       bool isSigned) {
+  assert(C->getType()->isIntOrIntVectorTy() &&
+         Ty->isIntOrIntVectorTy() && "Invalid cast");
+  unsigned SrcBits = C->getType()->getScalarSizeInBits();
+  unsigned DstBits = Ty->getScalarSizeInBits();
+  Instruction::CastOps opcode =
+    (SrcBits == DstBits ? Instruction::BitCast :
+     (SrcBits > DstBits ? Instruction::Trunc :
+      (isSigned ? Instruction::SExt : Instruction::ZExt)));
+  return getCast(opcode, C, Ty);
+}
+
+Constant *ConstantExpr::getFPCast(Constant *C, Type *Ty) {
+  assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
+         "Invalid cast");
+  unsigned SrcBits = C->getType()->getScalarSizeInBits();
+  unsigned DstBits = Ty->getScalarSizeInBits();
+  if (SrcBits == DstBits)
+    return C; // Avoid a useless cast
+  Instruction::CastOps opcode =
+    (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt);
+  return getCast(opcode, C, Ty);
+}
+
+Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty) {
+#ifndef NDEBUG
+  bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+  bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
+  assert(C->getType()->isIntOrIntVectorTy() && "Trunc operand must be integer");
+  assert(Ty->isIntOrIntVectorTy() && "Trunc produces only integral");
+  assert(C->getType()->getScalarSizeInBits() > Ty->getScalarSizeInBits()&&
+         "SrcTy must be larger than DestTy for Trunc!");
+
+  return getFoldedCast(Instruction::Trunc, C, Ty);
+}
+
+Constant *ConstantExpr::getSExt(Constant *C, Type *Ty) {
+#ifndef NDEBUG
+  bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+  bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
+  assert(C->getType()->isIntOrIntVectorTy() && "SExt operand must be integral");
+  assert(Ty->isIntOrIntVectorTy() && "SExt produces only integer");
+  assert(C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&&
+         "SrcTy must be smaller than DestTy for SExt!");
+
+  return getFoldedCast(Instruction::SExt, C, Ty);
+}
+
+Constant *ConstantExpr::getZExt(Constant *C, Type *Ty) {
+#ifndef NDEBUG
+  bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+  bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
+  assert(C->getType()->isIntOrIntVectorTy() && "ZEXt operand must be integral");
+  assert(Ty->isIntOrIntVectorTy() && "ZExt produces only integer");
+  assert(C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&&
+         "SrcTy must be smaller than DestTy for ZExt!");
+
+  return getFoldedCast(Instruction::ZExt, C, Ty);
+}
+
+Constant *ConstantExpr::getFPTrunc(Constant *C, Type *Ty) {
+#ifndef NDEBUG
+  bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+  bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
+  assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
+         C->getType()->getScalarSizeInBits() > Ty->getScalarSizeInBits()&&
+         "This is an illegal floating point truncation!");
+  return getFoldedCast(Instruction::FPTrunc, C, Ty);
+}
+
+Constant *ConstantExpr::getFPExtend(Constant *C, Type *Ty) {
+#ifndef NDEBUG
+  bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+  bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
+  assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
+         C->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits()&&
+         "This is an illegal floating point extension!");
+  return getFoldedCast(Instruction::FPExt, C, Ty);
+}
+
+Constant *ConstantExpr::getUIToFP(Constant *C, Type *Ty) {
+#ifndef NDEBUG
+  bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+  bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
+  assert(C->getType()->isIntOrIntVectorTy() && Ty->isFPOrFPVectorTy() &&
+         "This is an illegal uint to floating point cast!");
+  return getFoldedCast(Instruction::UIToFP, C, Ty);
+}
+
+Constant *ConstantExpr::getSIToFP(Constant *C, Type *Ty) {
+#ifndef NDEBUG
+  bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+  bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
+  assert(C->getType()->isIntOrIntVectorTy() && Ty->isFPOrFPVectorTy() &&
+         "This is an illegal sint to floating point cast!");
+  return getFoldedCast(Instruction::SIToFP, C, Ty);
+}
+
+Constant *ConstantExpr::getFPToUI(Constant *C, Type *Ty) {
+#ifndef NDEBUG
+  bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+  bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
+  assert(C->getType()->isFPOrFPVectorTy() && Ty->isIntOrIntVectorTy() &&
+         "This is an illegal floating point to uint cast!");
+  return getFoldedCast(Instruction::FPToUI, C, Ty);
+}
+
+Constant *ConstantExpr::getFPToSI(Constant *C, Type *Ty) {
+#ifndef NDEBUG
+  bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
+  bool toVec = Ty->getTypeID() == Type::VectorTyID;
+#endif
+  assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
+  assert(C->getType()->isFPOrFPVectorTy() && Ty->isIntOrIntVectorTy() &&
+         "This is an illegal floating point to sint cast!");
+  return getFoldedCast(Instruction::FPToSI, C, Ty);
+}
+
+Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy) {
+  assert(C->getType()->getScalarType()->isPointerTy() &&
+         "PtrToInt source must be pointer or pointer vector");
+  assert(DstTy->getScalarType()->isIntegerTy() && 
+         "PtrToInt destination must be integer or integer vector");
+  assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy));
+  if (isa<VectorType>(C->getType()))
+    assert(C->getType()->getVectorNumElements()==DstTy->getVectorNumElements()&&
+           "Invalid cast between a different number of vector elements");
+  return getFoldedCast(Instruction::PtrToInt, C, DstTy);
+}
+
+Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy) {
+  assert(C->getType()->getScalarType()->isIntegerTy() &&
+         "IntToPtr source must be integer or integer vector");
+  assert(DstTy->getScalarType()->isPointerTy() &&
+         "IntToPtr destination must be a pointer or pointer vector");
+  assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy));
+  if (isa<VectorType>(C->getType()))
+    assert(C->getType()->getVectorNumElements()==DstTy->getVectorNumElements()&&
+           "Invalid cast between a different number of vector elements");
+  return getFoldedCast(Instruction::IntToPtr, C, DstTy);
+}
+
+Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy) {
+  assert(CastInst::castIsValid(Instruction::BitCast, C, DstTy) &&
+         "Invalid constantexpr bitcast!");
+
+  // It is common to ask for a bitcast of a value to its own type, handle this
+  // speedily.
+  if (C->getType() == DstTy) return C;
+
+  return getFoldedCast(Instruction::BitCast, C, DstTy);
+}
+
+Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
+                            unsigned Flags) {
+  // Check the operands for consistency first.
+  assert(Opcode >= Instruction::BinaryOpsBegin &&
+         Opcode <  Instruction::BinaryOpsEnd   &&
+         "Invalid opcode in binary constant expression");
+  assert(C1->getType() == C2->getType() &&
+         "Operand types in binary constant expression should match");
+
+#ifndef NDEBUG
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    assert(C1->getType() == C2->getType() && "Op types should be identical!");
+    assert(C1->getType()->isIntOrIntVectorTy() &&
+           "Tried to create an integer operation on a non-integer type!");
+    break;
+  case Instruction::FAdd:
+  case Instruction::FSub:
+  case Instruction::FMul:
+    assert(C1->getType() == C2->getType() && "Op types should be identical!");
+    assert(C1->getType()->isFPOrFPVectorTy() &&
+           "Tried to create a floating-point operation on a "
+           "non-floating-point type!");
+    break;
+  case Instruction::UDiv: 
+  case Instruction::SDiv: 
+    assert(C1->getType() == C2->getType() && "Op types should be identical!");
+    assert(C1->getType()->isIntOrIntVectorTy() &&
+           "Tried to create an arithmetic operation on a non-arithmetic type!");
+    break;
+  case Instruction::FDiv:
+    assert(C1->getType() == C2->getType() && "Op types should be identical!");
+    assert(C1->getType()->isFPOrFPVectorTy() &&
+           "Tried to create an arithmetic operation on a non-arithmetic type!");
+    break;
+  case Instruction::URem: 
+  case Instruction::SRem: 
+    assert(C1->getType() == C2->getType() && "Op types should be identical!");
+    assert(C1->getType()->isIntOrIntVectorTy() &&
+           "Tried to create an arithmetic operation on a non-arithmetic type!");
+    break;
+  case Instruction::FRem:
+    assert(C1->getType() == C2->getType() && "Op types should be identical!");
+    assert(C1->getType()->isFPOrFPVectorTy() &&
+           "Tried to create an arithmetic operation on a non-arithmetic type!");
+    break;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    assert(C1->getType() == C2->getType() && "Op types should be identical!");
+    assert(C1->getType()->isIntOrIntVectorTy() &&
+           "Tried to create a logical operation on a non-integral type!");
+    break;
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    assert(C1->getType() == C2->getType() && "Op types should be identical!");
+    assert(C1->getType()->isIntOrIntVectorTy() &&
+           "Tried to create a shift operation on a non-integer type!");
+    break;
+  default:
+    break;
+  }
+#endif
+
+  if (Constant *FC = ConstantFoldBinaryInstruction(Opcode, C1, C2))
+    return FC;          // Fold a few common cases.
+
+  Constant *ArgVec[] = { C1, C2 };
+  ExprMapKeyType Key(Opcode, ArgVec, 0, Flags);
+
+  LLVMContextImpl *pImpl = C1->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(C1->getType(), Key);
+}
+
+Constant *ConstantExpr::getSizeOf(Type* Ty) {
+  // sizeof is implemented as: (i64) gep (Ty*)null, 1
+  // Note that a non-inbounds gep is used, as null isn't within any object.
+  Constant *GEPIdx = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
+  Constant *GEP = getGetElementPtr(
+                 Constant::getNullValue(PointerType::getUnqual(Ty)), GEPIdx);
+  return getPtrToInt(GEP, 
+                     Type::getInt64Ty(Ty->getContext()));
+}
+
+Constant *ConstantExpr::getAlignOf(Type* Ty) {
+  // alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1
+  // Note that a non-inbounds gep is used, as null isn't within any object.
+  Type *AligningTy = 
+    StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, NULL);
+  Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo());
+  Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0);
+  Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
+  Constant *Indices[2] = { Zero, One };
+  Constant *GEP = getGetElementPtr(NullPtr, Indices);
+  return getPtrToInt(GEP,
+                     Type::getInt64Ty(Ty->getContext()));
+}
+
+Constant *ConstantExpr::getOffsetOf(StructType* STy, unsigned FieldNo) {
+  return getOffsetOf(STy, ConstantInt::get(Type::getInt32Ty(STy->getContext()),
+                                           FieldNo));
+}
+
+Constant *ConstantExpr::getOffsetOf(Type* Ty, Constant *FieldNo) {
+  // offsetof is implemented as: (i64) gep (Ty*)null, 0, FieldNo
+  // Note that a non-inbounds gep is used, as null isn't within any object.
+  Constant *GEPIdx[] = {
+    ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0),
+    FieldNo
+  };
+  Constant *GEP = getGetElementPtr(
+                Constant::getNullValue(PointerType::getUnqual(Ty)), GEPIdx);
+  return getPtrToInt(GEP,
+                     Type::getInt64Ty(Ty->getContext()));
+}
+
+Constant *ConstantExpr::getCompare(unsigned short Predicate, 
+                                   Constant *C1, Constant *C2) {
+  assert(C1->getType() == C2->getType() && "Op types should be identical!");
+
+  switch (Predicate) {
+  default: llvm_unreachable("Invalid CmpInst predicate");
+  case CmpInst::FCMP_FALSE: case CmpInst::FCMP_OEQ: case CmpInst::FCMP_OGT:
+  case CmpInst::FCMP_OGE:   case CmpInst::FCMP_OLT: case CmpInst::FCMP_OLE:
+  case CmpInst::FCMP_ONE:   case CmpInst::FCMP_ORD: case CmpInst::FCMP_UNO:
+  case CmpInst::FCMP_UEQ:   case CmpInst::FCMP_UGT: case CmpInst::FCMP_UGE:
+  case CmpInst::FCMP_ULT:   case CmpInst::FCMP_ULE: case CmpInst::FCMP_UNE:
+  case CmpInst::FCMP_TRUE:
+    return getFCmp(Predicate, C1, C2);
+
+  case CmpInst::ICMP_EQ:  case CmpInst::ICMP_NE:  case CmpInst::ICMP_UGT:
+  case CmpInst::ICMP_UGE: case CmpInst::ICMP_ULT: case CmpInst::ICMP_ULE:
+  case CmpInst::ICMP_SGT: case CmpInst::ICMP_SGE: case CmpInst::ICMP_SLT:
+  case CmpInst::ICMP_SLE:
+    return getICmp(Predicate, C1, C2);
+  }
+}
+
+Constant *ConstantExpr::getSelect(Constant *C, Constant *V1, Constant *V2) {
+  assert(!SelectInst::areInvalidOperands(C, V1, V2)&&"Invalid select operands");
+
+  if (Constant *SC = ConstantFoldSelectInstruction(C, V1, V2))
+    return SC;        // Fold common cases
+
+  Constant *ArgVec[] = { C, V1, V2 };
+  ExprMapKeyType Key(Instruction::Select, ArgVec);
+
+  LLVMContextImpl *pImpl = C->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(V1->getType(), Key);
+}
+
+Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef<Value *> Idxs,
+                                         bool InBounds) {
+  assert(C->getType()->isPtrOrPtrVectorTy() &&
+         "Non-pointer type for constant GetElementPtr expression");
+
+  if (Constant *FC = ConstantFoldGetElementPtr(C, InBounds, Idxs))
+    return FC;          // Fold a few common cases.
+
+  // Get the result type of the getelementptr!
+  Type *Ty = GetElementPtrInst::getIndexedType(C->getType(), Idxs);
+  assert(Ty && "GEP indices invalid!");
+  unsigned AS = C->getType()->getPointerAddressSpace();
+  Type *ReqTy = Ty->getPointerTo(AS);
+  if (VectorType *VecTy = dyn_cast<VectorType>(C->getType()))
+    ReqTy = VectorType::get(ReqTy, VecTy->getNumElements());
+
+  // Look up the constant in the table first to ensure uniqueness
+  std::vector<Constant*> ArgVec;
+  ArgVec.reserve(1 + Idxs.size());
+  ArgVec.push_back(C);
+  for (unsigned i = 0, e = Idxs.size(); i != e; ++i) {
+    assert(Idxs[i]->getType()->isVectorTy() == ReqTy->isVectorTy() &&
+           "getelementptr index type missmatch");
+    assert((!Idxs[i]->getType()->isVectorTy() ||
+            ReqTy->getVectorNumElements() ==
+            Idxs[i]->getType()->getVectorNumElements()) &&
+           "getelementptr index type missmatch");
+    ArgVec.push_back(cast<Constant>(Idxs[i]));
+  }
+  const ExprMapKeyType Key(Instruction::GetElementPtr, ArgVec, 0,
+                           InBounds ? GEPOperator::IsInBounds : 0);
+
+  LLVMContextImpl *pImpl = C->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
+}
+
+Constant *
+ConstantExpr::getICmp(unsigned short pred, Constant *LHS, Constant *RHS) {
+  assert(LHS->getType() == RHS->getType());
+  assert(pred >= ICmpInst::FIRST_ICMP_PREDICATE && 
+         pred <= ICmpInst::LAST_ICMP_PREDICATE && "Invalid ICmp Predicate");
+
+  if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
+    return FC;          // Fold a few common cases...
+
+  // Look up the constant in the table first to ensure uniqueness
+  Constant *ArgVec[] = { LHS, RHS };
+  // Get the key type with both the opcode and predicate
+  const ExprMapKeyType Key(Instruction::ICmp, ArgVec, pred);
+
+  Type *ResultTy = Type::getInt1Ty(LHS->getContext());
+  if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
+    ResultTy = VectorType::get(ResultTy, VT->getNumElements());
+
+  LLVMContextImpl *pImpl = LHS->getType()->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(ResultTy, Key);
+}
+
+Constant *
+ConstantExpr::getFCmp(unsigned short pred, Constant *LHS, Constant *RHS) {
+  assert(LHS->getType() == RHS->getType());
+  assert(pred <= FCmpInst::LAST_FCMP_PREDICATE && "Invalid FCmp Predicate");
+
+  if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
+    return FC;          // Fold a few common cases...
+
+  // Look up the constant in the table first to ensure uniqueness
+  Constant *ArgVec[] = { LHS, RHS };
+  // Get the key type with both the opcode and predicate
+  const ExprMapKeyType Key(Instruction::FCmp, ArgVec, pred);
+
+  Type *ResultTy = Type::getInt1Ty(LHS->getContext());
+  if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
+    ResultTy = VectorType::get(ResultTy, VT->getNumElements());
+
+  LLVMContextImpl *pImpl = LHS->getType()->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(ResultTy, Key);
+}
+
+Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx) {
+  assert(Val->getType()->isVectorTy() &&
+         "Tried to create extractelement operation on non-vector type!");
+  assert(Idx->getType()->isIntegerTy(32) &&
+         "Extractelement index must be i32 type!");
+
+  if (Constant *FC = ConstantFoldExtractElementInstruction(Val, Idx))
+    return FC;          // Fold a few common cases.
+
+  // Look up the constant in the table first to ensure uniqueness
+  Constant *ArgVec[] = { Val, Idx };
+  const ExprMapKeyType Key(Instruction::ExtractElement, ArgVec);
+
+  LLVMContextImpl *pImpl = Val->getContext().pImpl;
+  Type *ReqTy = Val->getType()->getVectorElementType();
+  return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
+}
+
+Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt, 
+                                         Constant *Idx) {
+  assert(Val->getType()->isVectorTy() &&
+         "Tried to create insertelement operation on non-vector type!");
+  assert(Elt->getType() == Val->getType()->getVectorElementType() &&
+         "Insertelement types must match!");
+  assert(Idx->getType()->isIntegerTy(32) &&
+         "Insertelement index must be i32 type!");
+
+  if (Constant *FC = ConstantFoldInsertElementInstruction(Val, Elt, Idx))
+    return FC;          // Fold a few common cases.
+  // Look up the constant in the table first to ensure uniqueness
+  Constant *ArgVec[] = { Val, Elt, Idx };
+  const ExprMapKeyType Key(Instruction::InsertElement, ArgVec);
+
+  LLVMContextImpl *pImpl = Val->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(Val->getType(), Key);
+}
+
+Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2, 
+                                         Constant *Mask) {
+  assert(ShuffleVectorInst::isValidOperands(V1, V2, Mask) &&
+         "Invalid shuffle vector constant expr operands!");
+
+  if (Constant *FC = ConstantFoldShuffleVectorInstruction(V1, V2, Mask))
+    return FC;          // Fold a few common cases.
+
+  unsigned NElts = Mask->getType()->getVectorNumElements();
+  Type *EltTy = V1->getType()->getVectorElementType();
+  Type *ShufTy = VectorType::get(EltTy, NElts);
+
+  // Look up the constant in the table first to ensure uniqueness
+  Constant *ArgVec[] = { V1, V2, Mask };
+  const ExprMapKeyType Key(Instruction::ShuffleVector, ArgVec);
+
+  LLVMContextImpl *pImpl = ShufTy->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(ShufTy, Key);
+}
+
+Constant *ConstantExpr::getInsertValue(Constant *Agg, Constant *Val,
+                                       ArrayRef<unsigned> Idxs) {
+  assert(ExtractValueInst::getIndexedType(Agg->getType(),
+                                          Idxs) == Val->getType() &&
+         "insertvalue indices invalid!");
+  assert(Agg->getType()->isFirstClassType() &&
+         "Non-first-class type for constant insertvalue expression");
+  Constant *FC = ConstantFoldInsertValueInstruction(Agg, Val, Idxs);
+  assert(FC && "insertvalue constant expr couldn't be folded!");
+  return FC;
+}
+
+Constant *ConstantExpr::getExtractValue(Constant *Agg,
+                                        ArrayRef<unsigned> Idxs) {
+  assert(Agg->getType()->isFirstClassType() &&
+         "Tried to create extractelement operation on non-first-class type!");
+
+  Type *ReqTy = ExtractValueInst::getIndexedType(Agg->getType(), Idxs);
+  (void)ReqTy;
+  assert(ReqTy && "extractvalue indices invalid!");
+
+  assert(Agg->getType()->isFirstClassType() &&
+         "Non-first-class type for constant extractvalue expression");
+  Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs);
+  assert(FC && "ExtractValue constant expr couldn't be folded!");
+  return FC;
+}
+
+Constant *ConstantExpr::getNeg(Constant *C, bool HasNUW, bool HasNSW) {
+  assert(C->getType()->isIntOrIntVectorTy() &&
+         "Cannot NEG a nonintegral value!");
+  return getSub(ConstantFP::getZeroValueForNegation(C->getType()),
+                C, HasNUW, HasNSW);
+}
+
+Constant *ConstantExpr::getFNeg(Constant *C) {
+  assert(C->getType()->isFPOrFPVectorTy() &&
+         "Cannot FNEG a non-floating-point value!");
+  return getFSub(ConstantFP::getZeroValueForNegation(C->getType()), C);
+}
+
+Constant *ConstantExpr::getNot(Constant *C) {
+  assert(C->getType()->isIntOrIntVectorTy() &&
+         "Cannot NOT a nonintegral value!");
+  return get(Instruction::Xor, C, Constant::getAllOnesValue(C->getType()));
+}
+
+Constant *ConstantExpr::getAdd(Constant *C1, Constant *C2,
+                               bool HasNUW, bool HasNSW) {
+  unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) |
+                   (HasNSW ? OverflowingBinaryOperator::NoSignedWrap   : 0);
+  return get(Instruction::Add, C1, C2, Flags);
+}
+
+Constant *ConstantExpr::getFAdd(Constant *C1, Constant *C2) {
+  return get(Instruction::FAdd, C1, C2);
+}
+
+Constant *ConstantExpr::getSub(Constant *C1, Constant *C2,
+                               bool HasNUW, bool HasNSW) {
+  unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) |
+                   (HasNSW ? OverflowingBinaryOperator::NoSignedWrap   : 0);
+  return get(Instruction::Sub, C1, C2, Flags);
+}
+
+Constant *ConstantExpr::getFSub(Constant *C1, Constant *C2) {
+  return get(Instruction::FSub, C1, C2);
+}
+
+Constant *ConstantExpr::getMul(Constant *C1, Constant *C2,
+                               bool HasNUW, bool HasNSW) {
+  unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) |
+                   (HasNSW ? OverflowingBinaryOperator::NoSignedWrap   : 0);
+  return get(Instruction::Mul, C1, C2, Flags);
+}
+
+Constant *ConstantExpr::getFMul(Constant *C1, Constant *C2) {
+  return get(Instruction::FMul, C1, C2);
+}
+
+Constant *ConstantExpr::getUDiv(Constant *C1, Constant *C2, bool isExact) {
+  return get(Instruction::UDiv, C1, C2,
+             isExact ? PossiblyExactOperator::IsExact : 0);
+}
+
+Constant *ConstantExpr::getSDiv(Constant *C1, Constant *C2, bool isExact) {
+  return get(Instruction::SDiv, C1, C2,
+             isExact ? PossiblyExactOperator::IsExact : 0);
+}
+
+Constant *ConstantExpr::getFDiv(Constant *C1, Constant *C2) {
+  return get(Instruction::FDiv, C1, C2);
+}
+
+Constant *ConstantExpr::getURem(Constant *C1, Constant *C2) {
+  return get(Instruction::URem, C1, C2);
+}
+
+Constant *ConstantExpr::getSRem(Constant *C1, Constant *C2) {
+  return get(Instruction::SRem, C1, C2);
+}
+
+Constant *ConstantExpr::getFRem(Constant *C1, Constant *C2) {
+  return get(Instruction::FRem, C1, C2);
+}
+
+Constant *ConstantExpr::getAnd(Constant *C1, Constant *C2) {
+  return get(Instruction::And, C1, C2);
+}
+
+Constant *ConstantExpr::getOr(Constant *C1, Constant *C2) {
+  return get(Instruction::Or, C1, C2);
+}
+
+Constant *ConstantExpr::getXor(Constant *C1, Constant *C2) {
+  return get(Instruction::Xor, C1, C2);
+}
+
+Constant *ConstantExpr::getShl(Constant *C1, Constant *C2,
+                               bool HasNUW, bool HasNSW) {
+  unsigned Flags = (HasNUW ? OverflowingBinaryOperator::NoUnsignedWrap : 0) |
+                   (HasNSW ? OverflowingBinaryOperator::NoSignedWrap   : 0);
+  return get(Instruction::Shl, C1, C2, Flags);
+}
+
+Constant *ConstantExpr::getLShr(Constant *C1, Constant *C2, bool isExact) {
+  return get(Instruction::LShr, C1, C2,
+             isExact ? PossiblyExactOperator::IsExact : 0);
+}
+
+Constant *ConstantExpr::getAShr(Constant *C1, Constant *C2, bool isExact) {
+  return get(Instruction::AShr, C1, C2,
+             isExact ? PossiblyExactOperator::IsExact : 0);
+}
+
+/// getBinOpIdentity - Return the identity for the given binary operation,
+/// i.e. a constant C such that X op C = X and C op X = X for every X.  It
+/// returns null if the operator doesn't have an identity.
+Constant *ConstantExpr::getBinOpIdentity(unsigned Opcode, Type *Ty) {
+  switch (Opcode) {
+  default:
+    // Doesn't have an identity.
+    return 0;
+
+  case Instruction::Add:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return Constant::getNullValue(Ty);
+
+  case Instruction::Mul:
+    return ConstantInt::get(Ty, 1);
+
+  case Instruction::And:
+    return Constant::getAllOnesValue(Ty);
+  }
+}
+
+/// getBinOpAbsorber - Return the absorbing element for the given binary
+/// operation, i.e. a constant C such that X op C = C and C op X = C for
+/// every X.  For example, this returns zero for integer multiplication.
+/// It returns null if the operator doesn't have an absorbing element.
+Constant *ConstantExpr::getBinOpAbsorber(unsigned Opcode, Type *Ty) {
+  switch (Opcode) {
+  default:
+    // Doesn't have an absorber.
+    return 0;
+
+  case Instruction::Or:
+    return Constant::getAllOnesValue(Ty);
+
+  case Instruction::And:
+  case Instruction::Mul:
+    return Constant::getNullValue(Ty);
+  }
+}
+
+// destroyConstant - Remove the constant from the constant table...
+//
+void ConstantExpr::destroyConstant() {
+  getType()->getContext().pImpl->ExprConstants.remove(this);
+  destroyConstantImpl();
+}
+
+const char *ConstantExpr::getOpcodeName() const {
+  return Instruction::getOpcodeName(getOpcode());
+}
+
+
+
+GetElementPtrConstantExpr::
+GetElementPtrConstantExpr(Constant *C, ArrayRef<Constant*> IdxList,
+                          Type *DestTy)
+  : ConstantExpr(DestTy, Instruction::GetElementPtr,
+                 OperandTraits<GetElementPtrConstantExpr>::op_end(this)
+                 - (IdxList.size()+1), IdxList.size()+1) {
+  OperandList[0] = C;
+  for (unsigned i = 0, E = IdxList.size(); i != E; ++i)
+    OperandList[i+1] = IdxList[i];
+}
+
+//===----------------------------------------------------------------------===//
+//                       ConstantData* implementations
+
+void ConstantDataArray::anchor() {}
+void ConstantDataVector::anchor() {}
+
+/// getElementType - Return the element type of the array/vector.
+Type *ConstantDataSequential::getElementType() const {
+  return getType()->getElementType();
+}
+
+StringRef ConstantDataSequential::getRawDataValues() const {
+  return StringRef(DataElements, getNumElements()*getElementByteSize());
+}
+
+/// isElementTypeCompatible - Return true if a ConstantDataSequential can be
+/// formed with a vector or array of the specified element type.
+/// ConstantDataArray only works with normal float and int types that are
+/// stored densely in memory, not with things like i42 or x86_f80.
+bool ConstantDataSequential::isElementTypeCompatible(const Type *Ty) {
+  if (Ty->isFloatTy() || Ty->isDoubleTy()) return true;
+  if (const IntegerType *IT = dyn_cast<IntegerType>(Ty)) {
+    switch (IT->getBitWidth()) {
+    case 8:
+    case 16:
+    case 32:
+    case 64:
+      return true;
+    default: break;
+    }
+  }
+  return false;
+}
+
+/// getNumElements - Return the number of elements in the array or vector.
+unsigned ConstantDataSequential::getNumElements() const {
+  if (ArrayType *AT = dyn_cast<ArrayType>(getType()))
+    return AT->getNumElements();
+  return getType()->getVectorNumElements();
+}
+
+
+/// getElementByteSize - Return the size in bytes of the elements in the data.
+uint64_t ConstantDataSequential::getElementByteSize() const {
+  return getElementType()->getPrimitiveSizeInBits()/8;
+}
+
+/// getElementPointer - Return the start of the specified element.
+const char *ConstantDataSequential::getElementPointer(unsigned Elt) const {
+  assert(Elt < getNumElements() && "Invalid Elt");
+  return DataElements+Elt*getElementByteSize();
+}
+
+
+/// isAllZeros - return true if the array is empty or all zeros.
+static bool isAllZeros(StringRef Arr) {
+  for (StringRef::iterator I = Arr.begin(), E = Arr.end(); I != E; ++I)
+    if (*I != 0)
+      return false;
+  return true;
+}
+
+/// getImpl - This is the underlying implementation of all of the
+/// ConstantDataSequential::get methods.  They all thunk down to here, providing
+/// the correct element type.  We take the bytes in as a StringRef because
+/// we *want* an underlying "char*" to avoid TBAA type punning violations.
+Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) {
+  assert(isElementTypeCompatible(Ty->getSequentialElementType()));
+  // If the elements are all zero or there are no elements, return a CAZ, which
+  // is more dense and canonical.
+  if (isAllZeros(Elements))
+    return ConstantAggregateZero::get(Ty);
+
+  // Do a lookup to see if we have already formed one of these.
+  StringMap<ConstantDataSequential*>::MapEntryTy &Slot =
+    Ty->getContext().pImpl->CDSConstants.GetOrCreateValue(Elements);
+
+  // The bucket can point to a linked list of different CDS's that have the same
+  // body but different types.  For example, 0,0,0,1 could be a 4 element array
+  // of i8, or a 1-element array of i32.  They'll both end up in the same
+  /// StringMap bucket, linked up by their Next pointers.  Walk the list.
+  ConstantDataSequential **Entry = &Slot.getValue();
+  for (ConstantDataSequential *Node = *Entry; Node != 0;
+       Entry = &Node->Next, Node = *Entry)
+    if (Node->getType() == Ty)
+      return Node;
+
+  // Okay, we didn't get a hit.  Create a node of the right class, link it in,
+  // and return it.
+  if (isa<ArrayType>(Ty))
+    return *Entry = new ConstantDataArray(Ty, Slot.getKeyData());
+
+  assert(isa<VectorType>(Ty));
+  return *Entry = new ConstantDataVector(Ty, Slot.getKeyData());
+}
+
+void ConstantDataSequential::destroyConstant() {
+  // Remove the constant from the StringMap.
+  StringMap<ConstantDataSequential*> &CDSConstants = 
+    getType()->getContext().pImpl->CDSConstants;
+
+  StringMap<ConstantDataSequential*>::iterator Slot =
+    CDSConstants.find(getRawDataValues());
+
+  assert(Slot != CDSConstants.end() && "CDS not found in uniquing table");
+
+  ConstantDataSequential **Entry = &Slot->getValue();
+
+  // Remove the entry from the hash table.
+  if ((*Entry)->Next == 0) {
+    // If there is only one value in the bucket (common case) it must be this
+    // entry, and removing the entry should remove the bucket completely.
+    assert((*Entry) == this && "Hash mismatch in ConstantDataSequential");
+    getContext().pImpl->CDSConstants.erase(Slot);
+  } else {
+    // Otherwise, there are multiple entries linked off the bucket, unlink the 
+    // node we care about but keep the bucket around.
+    for (ConstantDataSequential *Node = *Entry; ;
+         Entry = &Node->Next, Node = *Entry) {
+      assert(Node && "Didn't find entry in its uniquing hash table!");
+      // If we found our entry, unlink it from the list and we're done.
+      if (Node == this) {
+        *Entry = Node->Next;
+        break;
+      }
+    }
+  }
+
+  // If we were part of a list, make sure that we don't delete the list that is
+  // still owned by the uniquing map.
+  Next = 0;
+
+  // Finally, actually delete it.
+  destroyConstantImpl();
+}
+
+/// get() constructors - Return a constant with array type with an element
+/// count and element type matching the ArrayRef passed in.  Note that this
+/// can return a ConstantAggregateZero object.
+Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint8_t> Elts) {
+  Type *Ty = ArrayType::get(Type::getInt8Ty(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty);
+}
+Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){
+  Type *Ty = ArrayType::get(Type::getInt16Ty(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty);
+}
+Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){
+  Type *Ty = ArrayType::get(Type::getInt32Ty(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
+}
+Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){
+  Type *Ty = ArrayType::get(Type::getInt64Ty(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
+}
+Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<float> Elts) {
+  Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
+}
+Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) {
+  Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
+}
+
+/// getString - This method constructs a CDS and initializes it with a text
+/// string. The default behavior (AddNull==true) causes a null terminator to
+/// be placed at the end of the array (increasing the length of the string by
+/// one more than the StringRef would normally indicate.  Pass AddNull=false
+/// to disable this behavior.
+Constant *ConstantDataArray::getString(LLVMContext &Context,
+                                       StringRef Str, bool AddNull) {
+  if (!AddNull) {
+    const uint8_t *Data = reinterpret_cast<const uint8_t *>(Str.data());
+    return get(Context, ArrayRef<uint8_t>(const_cast<uint8_t *>(Data),
+               Str.size()));
+  }
+
+  SmallVector<uint8_t, 64> ElementVals;
+  ElementVals.append(Str.begin(), Str.end());
+  ElementVals.push_back(0);
+  return get(Context, ElementVals);
+}
+
+/// get() constructors - Return a constant with vector type with an element
+/// count and element type matching the ArrayRef passed in.  Note that this
+/// can return a ConstantAggregateZero object.
+Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint8_t> Elts){
+  Type *Ty = VectorType::get(Type::getInt8Ty(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty);
+}
+Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){
+  Type *Ty = VectorType::get(Type::getInt16Ty(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty);
+}
+Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){
+  Type *Ty = VectorType::get(Type::getInt32Ty(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
+}
+Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){
+  Type *Ty = VectorType::get(Type::getInt64Ty(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
+}
+Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<float> Elts) {
+  Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
+}
+Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<double> Elts) {
+  Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size());
+  const char *Data = reinterpret_cast<const char *>(Elts.data());
+  return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
+}
+
+Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) {
+  assert(isElementTypeCompatible(V->getType()) &&
+         "Element type not compatible with ConstantData");
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    if (CI->getType()->isIntegerTy(8)) {
+      SmallVector<uint8_t, 16> Elts(NumElts, CI->getZExtValue());
+      return get(V->getContext(), Elts);
+    }
+    if (CI->getType()->isIntegerTy(16)) {
+      SmallVector<uint16_t, 16> Elts(NumElts, CI->getZExtValue());
+      return get(V->getContext(), Elts);
+    }
+    if (CI->getType()->isIntegerTy(32)) {
+      SmallVector<uint32_t, 16> Elts(NumElts, CI->getZExtValue());
+      return get(V->getContext(), Elts);
+    }
+    assert(CI->getType()->isIntegerTy(64) && "Unsupported ConstantData type");
+    SmallVector<uint64_t, 16> Elts(NumElts, CI->getZExtValue());
+    return get(V->getContext(), Elts);
+  }
+
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+    if (CFP->getType()->isFloatTy()) {
+      SmallVector<float, 16> Elts(NumElts, CFP->getValueAPF().convertToFloat());
+      return get(V->getContext(), Elts);
+    }
+    if (CFP->getType()->isDoubleTy()) {
+      SmallVector<double, 16> Elts(NumElts,
+                                   CFP->getValueAPF().convertToDouble());
+      return get(V->getContext(), Elts);
+    }
+  }
+  return ConstantVector::getSplat(NumElts, V);
+}
+
+
+/// getElementAsInteger - If this is a sequential container of integers (of
+/// any size), return the specified element in the low bits of a uint64_t.
+uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const {
+  assert(isa<IntegerType>(getElementType()) &&
+         "Accessor can only be used when element is an integer");
+  const char *EltPtr = getElementPointer(Elt);
+
+  // The data is stored in host byte order, make sure to cast back to the right
+  // type to load with the right endianness.
+  switch (getElementType()->getIntegerBitWidth()) {
+  default: llvm_unreachable("Invalid bitwidth for CDS");
+  case 8:
+    return *const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(EltPtr));
+  case 16:
+    return *const_cast<uint16_t *>(reinterpret_cast<const uint16_t *>(EltPtr));
+  case 32:
+    return *const_cast<uint32_t *>(reinterpret_cast<const uint32_t *>(EltPtr));
+  case 64:
+    return *const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(EltPtr));
+  }
+}
+
+/// getElementAsAPFloat - If this is a sequential container of floating point
+/// type, return the specified element as an APFloat.
+APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const {
+  const char *EltPtr = getElementPointer(Elt);
+
+  switch (getElementType()->getTypeID()) {
+  default:
+    llvm_unreachable("Accessor can only be used when element is float/double!");
+  case Type::FloatTyID: {
+      const float *FloatPrt = reinterpret_cast<const float *>(EltPtr);
+      return APFloat(*const_cast<float *>(FloatPrt));
+    }
+  case Type::DoubleTyID: {
+      const double *DoublePtr = reinterpret_cast<const double *>(EltPtr);
+      return APFloat(*const_cast<double *>(DoublePtr));
+    }
+  }
+}
+
+/// getElementAsFloat - If this is an sequential container of floats, return
+/// the specified element as a float.
+float ConstantDataSequential::getElementAsFloat(unsigned Elt) const {
+  assert(getElementType()->isFloatTy() &&
+         "Accessor can only be used when element is a 'float'");
+  const float *EltPtr = reinterpret_cast<const float *>(getElementPointer(Elt));
+  return *const_cast<float *>(EltPtr);
+}
+
+/// getElementAsDouble - If this is an sequential container of doubles, return
+/// the specified element as a float.
+double ConstantDataSequential::getElementAsDouble(unsigned Elt) const {
+  assert(getElementType()->isDoubleTy() &&
+         "Accessor can only be used when element is a 'float'");
+  const double *EltPtr =
+      reinterpret_cast<const double *>(getElementPointer(Elt));
+  return *const_cast<double *>(EltPtr);
+}
+
+/// getElementAsConstant - Return a Constant for a specified index's element.
+/// Note that this has to compute a new constant to return, so it isn't as
+/// efficient as getElementAsInteger/Float/Double.
+Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const {
+  if (getElementType()->isFloatTy() || getElementType()->isDoubleTy())
+    return ConstantFP::get(getContext(), getElementAsAPFloat(Elt));
+
+  return ConstantInt::get(getElementType(), getElementAsInteger(Elt));
+}
+
+/// isString - This method returns true if this is an array of i8.
+bool ConstantDataSequential::isString() const {
+  return isa<ArrayType>(getType()) && getElementType()->isIntegerTy(8);
+}
+
+/// isCString - This method returns true if the array "isString", ends with a
+/// nul byte, and does not contains any other nul bytes.
+bool ConstantDataSequential::isCString() const {
+  if (!isString())
+    return false;
+
+  StringRef Str = getAsString();
+
+  // The last value must be nul.
+  if (Str.back() != 0) return false;
+
+  // Other elements must be non-nul.
+  return Str.drop_back().find(0) == StringRef::npos;
+}
+
+/// getSplatValue - If this is a splat constant, meaning that all of the
+/// elements have the same value, return that value. Otherwise return NULL.
+Constant *ConstantDataVector::getSplatValue() const {
+  const char *Base = getRawDataValues().data();
+
+  // Compare elements 1+ to the 0'th element.
+  unsigned EltSize = getElementByteSize();
+  for (unsigned i = 1, e = getNumElements(); i != e; ++i)
+    if (memcmp(Base, Base+i*EltSize, EltSize))
+      return 0;
+
+  // If they're all the same, return the 0th one as a representative.
+  return getElementAsConstant(0);
+}
+
+//===----------------------------------------------------------------------===//
+//                replaceUsesOfWithOnConstant implementations
+
+/// replaceUsesOfWithOnConstant - Update this constant array to change uses of
+/// 'From' to be uses of 'To'.  This must update the uniquing data structures
+/// etc.
+///
+/// Note that we intentionally replace all uses of From with To here.  Consider
+/// a large array that uses 'From' 1000 times.  By handling this case all here,
+/// ConstantArray::replaceUsesOfWithOnConstant is only invoked once, and that
+/// single invocation handles all 1000 uses.  Handling them one at a time would
+/// work, but would be really slow because it would have to unique each updated
+/// array instance.
+///
+void ConstantArray::replaceUsesOfWithOnConstant(Value *From, Value *To,
+                                                Use *U) {
+  assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
+  Constant *ToC = cast<Constant>(To);
+
+  LLVMContextImpl *pImpl = getType()->getContext().pImpl;
+
+  SmallVector<Constant*, 8> Values;
+  LLVMContextImpl::ArrayConstantsTy::LookupKey Lookup;
+  Lookup.first = cast<ArrayType>(getType());
+  Values.reserve(getNumOperands());  // Build replacement array.
+
+  // Fill values with the modified operands of the constant array.  Also,
+  // compute whether this turns into an all-zeros array.
+  unsigned NumUpdated = 0;
+
+  // Keep track of whether all the values in the array are "ToC".
+  bool AllSame = true;
+  for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
+    Constant *Val = cast<Constant>(O->get());
+    if (Val == From) {
+      Val = ToC;
+      ++NumUpdated;
+    }
+    Values.push_back(Val);
+    AllSame &= Val == ToC;
+  }
+
+  Constant *Replacement = 0;
+  if (AllSame && ToC->isNullValue()) {
+    Replacement = ConstantAggregateZero::get(getType());
+  } else if (AllSame && isa<UndefValue>(ToC)) {
+    Replacement = UndefValue::get(getType());
+  } else {
+    // Check to see if we have this array type already.
+    Lookup.second = makeArrayRef(Values);
+    LLVMContextImpl::ArrayConstantsTy::MapTy::iterator I =
+      pImpl->ArrayConstants.find(Lookup);
+
+    if (I != pImpl->ArrayConstants.map_end()) {
+      Replacement = I->first;
+    } else {
+      // Okay, the new shape doesn't exist in the system yet.  Instead of
+      // creating a new constant array, inserting it, replaceallusesof'ing the
+      // old with the new, then deleting the old... just update the current one
+      // in place!
+      pImpl->ArrayConstants.remove(this);
+
+      // Update to the new value.  Optimize for the case when we have a single
+      // operand that we're changing, but handle bulk updates efficiently.
+      if (NumUpdated == 1) {
+        unsigned OperandToUpdate = U - OperandList;
+        assert(getOperand(OperandToUpdate) == From &&
+               "ReplaceAllUsesWith broken!");
+        setOperand(OperandToUpdate, ToC);
+      } else {
+        for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+          if (getOperand(i) == From)
+            setOperand(i, ToC);
+      }
+      pImpl->ArrayConstants.insert(this);
+      return;
+    }
+  }
+
+  // Otherwise, I do need to replace this with an existing value.
+  assert(Replacement != this && "I didn't contain From!");
+
+  // Everyone using this now uses the replacement.
+  replaceAllUsesWith(Replacement);
+
+  // Delete the old constant!
+  destroyConstant();
+}
+
+void ConstantStruct::replaceUsesOfWithOnConstant(Value *From, Value *To,
+                                                 Use *U) {
+  assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
+  Constant *ToC = cast<Constant>(To);
+
+  unsigned OperandToUpdate = U-OperandList;
+  assert(getOperand(OperandToUpdate) == From && "ReplaceAllUsesWith broken!");
+
+  SmallVector<Constant*, 8> Values;
+  LLVMContextImpl::StructConstantsTy::LookupKey Lookup;
+  Lookup.first = cast<StructType>(getType());
+  Values.reserve(getNumOperands());  // Build replacement struct.
+
+  // Fill values with the modified operands of the constant struct.  Also,
+  // compute whether this turns into an all-zeros struct.
+  bool isAllZeros = false;
+  bool isAllUndef = false;
+  if (ToC->isNullValue()) {
+    isAllZeros = true;
+    for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
+      Constant *Val = cast<Constant>(O->get());
+      Values.push_back(Val);
+      if (isAllZeros) isAllZeros = Val->isNullValue();
+    }
+  } else if (isa<UndefValue>(ToC)) {
+    isAllUndef = true;
+    for (Use *O = OperandList, *E = OperandList+getNumOperands(); O != E; ++O) {
+      Constant *Val = cast<Constant>(O->get());
+      Values.push_back(Val);
+      if (isAllUndef) isAllUndef = isa<UndefValue>(Val);
+    }
+  } else {
+    for (Use *O = OperandList, *E = OperandList + getNumOperands(); O != E; ++O)
+      Values.push_back(cast<Constant>(O->get()));
+  }
+  Values[OperandToUpdate] = ToC;
+
+  LLVMContextImpl *pImpl = getContext().pImpl;
+
+  Constant *Replacement = 0;
+  if (isAllZeros) {
+    Replacement = ConstantAggregateZero::get(getType());
+  } else if (isAllUndef) {
+    Replacement = UndefValue::get(getType());
+  } else {
+    // Check to see if we have this struct type already.
+    Lookup.second = makeArrayRef(Values);
+    LLVMContextImpl::StructConstantsTy::MapTy::iterator I =
+      pImpl->StructConstants.find(Lookup);
+
+    if (I != pImpl->StructConstants.map_end()) {
+      Replacement = I->first;
+    } else {
+      // Okay, the new shape doesn't exist in the system yet.  Instead of
+      // creating a new constant struct, inserting it, replaceallusesof'ing the
+      // old with the new, then deleting the old... just update the current one
+      // in place!
+      pImpl->StructConstants.remove(this);
+
+      // Update to the new value.
+      setOperand(OperandToUpdate, ToC);
+      pImpl->StructConstants.insert(this);
+      return;
+    }
+  }
+
+  assert(Replacement != this && "I didn't contain From!");
+
+  // Everyone using this now uses the replacement.
+  replaceAllUsesWith(Replacement);
+
+  // Delete the old constant!
+  destroyConstant();
+}
+
+void ConstantVector::replaceUsesOfWithOnConstant(Value *From, Value *To,
+                                                 Use *U) {
+  assert(isa<Constant>(To) && "Cannot make Constant refer to non-constant!");
+
+  SmallVector<Constant*, 8> Values;
+  Values.reserve(getNumOperands());  // Build replacement array...
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    Constant *Val = getOperand(i);
+    if (Val == From) Val = cast<Constant>(To);
+    Values.push_back(Val);
+  }
+
+  Constant *Replacement = get(Values);
+  assert(Replacement != this && "I didn't contain From!");
+
+  // Everyone using this now uses the replacement.
+  replaceAllUsesWith(Replacement);
+
+  // Delete the old constant!
+  destroyConstant();
+}
+
+void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV,
+                                               Use *U) {
+  assert(isa<Constant>(ToV) && "Cannot make Constant refer to non-constant!");
+  Constant *To = cast<Constant>(ToV);
+
+  SmallVector<Constant*, 8> NewOps;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    Constant *Op = getOperand(i);
+    NewOps.push_back(Op == From ? To : Op);
+  }
+
+  Constant *Replacement = getWithOperands(NewOps);
+  assert(Replacement != this && "I didn't contain From!");
+
+  // Everyone using this now uses the replacement.
+  replaceAllUsesWith(Replacement);
+
+  // Delete the old constant!
+  destroyConstant();
+}
+
+Instruction *ConstantExpr::getAsInstruction() {
+  SmallVector<Value*,4> ValueOperands;
+  for (op_iterator I = op_begin(), E = op_end(); I != E; ++I)
+    ValueOperands.push_back(cast<Value>(I));
+
+  ArrayRef<Value*> Ops(ValueOperands);
+
+  switch (getOpcode()) {
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+    return CastInst::Create((Instruction::CastOps)getOpcode(),
+                            Ops[0], getType());
+  case Instruction::Select:
+    return SelectInst::Create(Ops[0], Ops[1], Ops[2]);
+  case Instruction::InsertElement:
+    return InsertElementInst::Create(Ops[0], Ops[1], Ops[2]);
+  case Instruction::ExtractElement:
+    return ExtractElementInst::Create(Ops[0], Ops[1]);
+  case Instruction::InsertValue:
+    return InsertValueInst::Create(Ops[0], Ops[1], getIndices());
+  case Instruction::ExtractValue:
+    return ExtractValueInst::Create(Ops[0], getIndices());
+  case Instruction::ShuffleVector:
+    return new ShuffleVectorInst(Ops[0], Ops[1], Ops[2]);
+
+  case Instruction::GetElementPtr:
+    if (cast<GEPOperator>(this)->isInBounds())
+      return GetElementPtrInst::CreateInBounds(Ops[0], Ops.slice(1));
+    else
+      return GetElementPtrInst::Create(Ops[0], Ops.slice(1));
+
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return CmpInst::Create((Instruction::OtherOps)getOpcode(),
+                           getPredicate(), Ops[0], Ops[1]);
+
+  default:
+    assert(getNumOperands() == 2 && "Must be binary operator?");
+    BinaryOperator *BO =
+      BinaryOperator::Create((Instruction::BinaryOps)getOpcode(),
+                             Ops[0], Ops[1]);
+    if (isa<OverflowingBinaryOperator>(BO)) {
+      BO->setHasNoUnsignedWrap(SubclassOptionalData &
+                               OverflowingBinaryOperator::NoUnsignedWrap);
+      BO->setHasNoSignedWrap(SubclassOptionalData &
+                             OverflowingBinaryOperator::NoSignedWrap);
+    }
+    if (isa<PossiblyExactOperator>(BO))
+      BO->setIsExact(SubclassOptionalData & PossiblyExactOperator::IsExact);
+    return BO;
+  }
+}
diff --git a/contrib/llvm/lib/IR/ConstantsContext.h b/contrib/llvm/lib/IR/ConstantsContext.h
new file mode 100644
index 000000000000..e9958589f53c
--- /dev/null
+++ b/contrib/llvm/lib/IR/ConstantsContext.h
@@ -0,0 +1,774 @@
+//===-- ConstantsContext.h - Constants-related Context Interals -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines various helper methods and classes used by
+// LLVMContextImpl for creating and managing constants.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CONSTANTSCONTEXT_H
+#define LLVM_CONSTANTSCONTEXT_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+namespace llvm {
+template<class ValType>
+struct ConstantTraits;
+
+/// UnaryConstantExpr - This class is private to Constants.cpp, and is used
+/// behind the scenes to implement unary constant exprs.
+class UnaryConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+public:
+  // allocate space for exactly one operand
+  void *operator new(size_t s) {
+    return User::operator new(s, 1);
+  }
+  UnaryConstantExpr(unsigned Opcode, Constant *C, Type *Ty)
+    : ConstantExpr(Ty, Opcode, &Op<0>(), 1) {
+    Op<0>() = C;
+  }
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// BinaryConstantExpr - This class is private to Constants.cpp, and is used
+/// behind the scenes to implement binary constant exprs.
+class BinaryConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+public:
+  // allocate space for exactly two operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+  BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2,
+                     unsigned Flags)
+    : ConstantExpr(C1->getType(), Opcode, &Op<0>(), 2) {
+    Op<0>() = C1;
+    Op<1>() = C2;
+    SubclassOptionalData = Flags;
+  }
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// SelectConstantExpr - This class is private to Constants.cpp, and is used
+/// behind the scenes to implement select constant exprs.
+class SelectConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+public:
+  // allocate space for exactly three operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 3);
+  }
+  SelectConstantExpr(Constant *C1, Constant *C2, Constant *C3)
+    : ConstantExpr(C2->getType(), Instruction::Select, &Op<0>(), 3) {
+    Op<0>() = C1;
+    Op<1>() = C2;
+    Op<2>() = C3;
+  }
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// ExtractElementConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// extractelement constant exprs.
+class ExtractElementConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+public:
+  // allocate space for exactly two operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+  ExtractElementConstantExpr(Constant *C1, Constant *C2)
+    : ConstantExpr(cast<VectorType>(C1->getType())->getElementType(), 
+                   Instruction::ExtractElement, &Op<0>(), 2) {
+    Op<0>() = C1;
+    Op<1>() = C2;
+  }
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// InsertElementConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// insertelement constant exprs.
+class InsertElementConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+public:
+  // allocate space for exactly three operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 3);
+  }
+  InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3)
+    : ConstantExpr(C1->getType(), Instruction::InsertElement, 
+                   &Op<0>(), 3) {
+    Op<0>() = C1;
+    Op<1>() = C2;
+    Op<2>() = C3;
+  }
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// ShuffleVectorConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// shufflevector constant exprs.
+class ShuffleVectorConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+public:
+  // allocate space for exactly three operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 3);
+  }
+  ShuffleVectorConstantExpr(Constant *C1, Constant *C2, Constant *C3)
+  : ConstantExpr(VectorType::get(
+                   cast<VectorType>(C1->getType())->getElementType(),
+                   cast<VectorType>(C3->getType())->getNumElements()),
+                 Instruction::ShuffleVector, 
+                 &Op<0>(), 3) {
+    Op<0>() = C1;
+    Op<1>() = C2;
+    Op<2>() = C3;
+  }
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// ExtractValueConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// extractvalue constant exprs.
+class ExtractValueConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+public:
+  // allocate space for exactly one operand
+  void *operator new(size_t s) {
+    return User::operator new(s, 1);
+  }
+  ExtractValueConstantExpr(Constant *Agg,
+                           const SmallVector<unsigned, 4> &IdxList,
+                           Type *DestTy)
+    : ConstantExpr(DestTy, Instruction::ExtractValue, &Op<0>(), 1),
+      Indices(IdxList) {
+    Op<0>() = Agg;
+  }
+
+  /// Indices - These identify which value to extract.
+  const SmallVector<unsigned, 4> Indices;
+
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+/// InsertValueConstantExpr - This class is private to
+/// Constants.cpp, and is used behind the scenes to implement
+/// insertvalue constant exprs.
+class InsertValueConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+public:
+  // allocate space for exactly one operand
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+  InsertValueConstantExpr(Constant *Agg, Constant *Val,
+                          const SmallVector<unsigned, 4> &IdxList,
+                          Type *DestTy)
+    : ConstantExpr(DestTy, Instruction::InsertValue, &Op<0>(), 2),
+      Indices(IdxList) {
+    Op<0>() = Agg;
+    Op<1>() = Val;
+  }
+
+  /// Indices - These identify the position for the insertion.
+  const SmallVector<unsigned, 4> Indices;
+
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+
+/// GetElementPtrConstantExpr - This class is private to Constants.cpp, and is
+/// used behind the scenes to implement getelementpr constant exprs.
+class GetElementPtrConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  GetElementPtrConstantExpr(Constant *C, ArrayRef<Constant*> IdxList,
+                            Type *DestTy);
+public:
+  static GetElementPtrConstantExpr *Create(Constant *C,
+                                           ArrayRef<Constant*> IdxList,
+                                           Type *DestTy,
+                                           unsigned Flags) {
+    GetElementPtrConstantExpr *Result =
+      new(IdxList.size() + 1) GetElementPtrConstantExpr(C, IdxList, DestTy);
+    Result->SubclassOptionalData = Flags;
+    return Result;
+  }
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+// CompareConstantExpr - This class is private to Constants.cpp, and is used
+// behind the scenes to implement ICmp and FCmp constant expressions. This is
+// needed in order to store the predicate value for these instructions.
+class CompareConstantExpr : public ConstantExpr {
+  virtual void anchor();
+  void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+public:
+  // allocate space for exactly two operands
+  void *operator new(size_t s) {
+    return User::operator new(s, 2);
+  }
+  unsigned short predicate;
+  CompareConstantExpr(Type *ty, Instruction::OtherOps opc,
+                      unsigned short pred,  Constant* LHS, Constant* RHS)
+    : ConstantExpr(ty, opc, &Op<0>(), 2), predicate(pred) {
+    Op<0>() = LHS;
+    Op<1>() = RHS;
+  }
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+};
+
+template <>
+struct OperandTraits<UnaryConstantExpr> :
+  public FixedNumOperandTraits<UnaryConstantExpr, 1> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(UnaryConstantExpr, Value)
+
+template <>
+struct OperandTraits<BinaryConstantExpr> :
+  public FixedNumOperandTraits<BinaryConstantExpr, 2> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BinaryConstantExpr, Value)
+
+template <>
+struct OperandTraits<SelectConstantExpr> :
+  public FixedNumOperandTraits<SelectConstantExpr, 3> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SelectConstantExpr, Value)
+
+template <>
+struct OperandTraits<ExtractElementConstantExpr> :
+  public FixedNumOperandTraits<ExtractElementConstantExpr, 2> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractElementConstantExpr, Value)
+
+template <>
+struct OperandTraits<InsertElementConstantExpr> :
+  public FixedNumOperandTraits<InsertElementConstantExpr, 3> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertElementConstantExpr, Value)
+
+template <>
+struct OperandTraits<ShuffleVectorConstantExpr> :
+    public FixedNumOperandTraits<ShuffleVectorConstantExpr, 3> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ShuffleVectorConstantExpr, Value)
+
+template <>
+struct OperandTraits<ExtractValueConstantExpr> :
+  public FixedNumOperandTraits<ExtractValueConstantExpr, 1> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractValueConstantExpr, Value)
+
+template <>
+struct OperandTraits<InsertValueConstantExpr> :
+  public FixedNumOperandTraits<InsertValueConstantExpr, 2> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueConstantExpr, Value)
+
+template <>
+struct OperandTraits<GetElementPtrConstantExpr> :
+  public VariadicOperandTraits<GetElementPtrConstantExpr, 1> {
+};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrConstantExpr, Value)
+
+
+template <>
+struct OperandTraits<CompareConstantExpr> :
+  public FixedNumOperandTraits<CompareConstantExpr, 2> {
+};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CompareConstantExpr, Value)
+
+struct ExprMapKeyType {
+  ExprMapKeyType(unsigned opc,
+      ArrayRef<Constant*> ops,
+      unsigned short flags = 0,
+      unsigned short optionalflags = 0,
+      ArrayRef<unsigned> inds = ArrayRef<unsigned>())
+        : opcode(opc), subclassoptionaldata(optionalflags), subclassdata(flags),
+        operands(ops.begin(), ops.end()), indices(inds.begin(), inds.end()) {}
+  uint8_t opcode;
+  uint8_t subclassoptionaldata;
+  uint16_t subclassdata;
+  std::vector<Constant*> operands;
+  SmallVector<unsigned, 4> indices;
+  bool operator==(const ExprMapKeyType& that) const {
+    return this->opcode == that.opcode &&
+           this->subclassdata == that.subclassdata &&
+           this->subclassoptionaldata == that.subclassoptionaldata &&
+           this->operands == that.operands &&
+           this->indices == that.indices;
+  }
+  bool operator<(const ExprMapKeyType & that) const {
+    if (this->opcode != that.opcode) return this->opcode < that.opcode;
+    if (this->operands != that.operands) return this->operands < that.operands;
+    if (this->subclassdata != that.subclassdata)
+      return this->subclassdata < that.subclassdata;
+    if (this->subclassoptionaldata != that.subclassoptionaldata)
+      return this->subclassoptionaldata < that.subclassoptionaldata;
+    if (this->indices != that.indices) return this->indices < that.indices;
+    return false;
+  }
+
+  bool operator!=(const ExprMapKeyType& that) const {
+    return !(*this == that);
+  }
+};
+
+struct InlineAsmKeyType {
+  InlineAsmKeyType(StringRef AsmString,
+                   StringRef Constraints, bool hasSideEffects,
+                   bool isAlignStack, InlineAsm::AsmDialect asmDialect)
+    : asm_string(AsmString), constraints(Constraints),
+      has_side_effects(hasSideEffects), is_align_stack(isAlignStack),
+      asm_dialect(asmDialect) {}
+  std::string asm_string;
+  std::string constraints;
+  bool has_side_effects;
+  bool is_align_stack;
+  InlineAsm::AsmDialect asm_dialect;
+  bool operator==(const InlineAsmKeyType& that) const {
+    return this->asm_string == that.asm_string &&
+           this->constraints == that.constraints &&
+           this->has_side_effects == that.has_side_effects &&
+           this->is_align_stack == that.is_align_stack &&
+           this->asm_dialect == that.asm_dialect;
+  }
+  bool operator<(const InlineAsmKeyType& that) const {
+    if (this->asm_string != that.asm_string)
+      return this->asm_string < that.asm_string;
+    if (this->constraints != that.constraints)
+      return this->constraints < that.constraints;
+    if (this->has_side_effects != that.has_side_effects)
+      return this->has_side_effects < that.has_side_effects;
+    if (this->is_align_stack != that.is_align_stack)
+      return this->is_align_stack < that.is_align_stack;
+    if (this->asm_dialect != that.asm_dialect)
+      return this->asm_dialect < that.asm_dialect;
+    return false;
+  }
+
+  bool operator!=(const InlineAsmKeyType& that) const {
+    return !(*this == that);
+  }
+};
+
+// The number of operands for each ConstantCreator::create method is
+// determined by the ConstantTraits template.
+// ConstantCreator - A class that is used to create constants by
+// ConstantUniqueMap*.  This class should be partially specialized if there is
+// something strange that needs to be done to interface to the ctor for the
+// constant.
+//
+template<typename T, typename Alloc>
+struct ConstantTraits< std::vector<T, Alloc> > {
+  static unsigned uses(const std::vector<T, Alloc>& v) {
+    return v.size();
+  }
+};
+
+template<>
+struct ConstantTraits<Constant *> {
+  static unsigned uses(Constant * const & v) {
+    return 1;
+  }
+};
+
+template<class ConstantClass, class TypeClass, class ValType>
+struct ConstantCreator {
+  static ConstantClass *create(TypeClass *Ty, const ValType &V) {
+    return new(ConstantTraits<ValType>::uses(V)) ConstantClass(Ty, V);
+  }
+};
+
+template<class ConstantClass, class TypeClass>
+struct ConstantArrayCreator {
+  static ConstantClass *create(TypeClass *Ty, ArrayRef<Constant*> V) {
+    return new(V.size()) ConstantClass(Ty, V);
+  }
+};
+
+template<class ConstantClass>
+struct ConstantKeyData {
+  typedef void ValType;
+  static ValType getValType(ConstantClass *C) {
+    llvm_unreachable("Unknown Constant type!");
+  }
+};
+
+template<>
+struct ConstantCreator<ConstantExpr, Type, ExprMapKeyType> {
+  static ConstantExpr *create(Type *Ty, const ExprMapKeyType &V,
+      unsigned short pred = 0) {
+    if (Instruction::isCast(V.opcode))
+      return new UnaryConstantExpr(V.opcode, V.operands[0], Ty);
+    if ((V.opcode >= Instruction::BinaryOpsBegin &&
+         V.opcode < Instruction::BinaryOpsEnd))
+      return new BinaryConstantExpr(V.opcode, V.operands[0], V.operands[1],
+                                    V.subclassoptionaldata);
+    if (V.opcode == Instruction::Select)
+      return new SelectConstantExpr(V.operands[0], V.operands[1], 
+                                    V.operands[2]);
+    if (V.opcode == Instruction::ExtractElement)
+      return new ExtractElementConstantExpr(V.operands[0], V.operands[1]);
+    if (V.opcode == Instruction::InsertElement)
+      return new InsertElementConstantExpr(V.operands[0], V.operands[1],
+                                           V.operands[2]);
+    if (V.opcode == Instruction::ShuffleVector)
+      return new ShuffleVectorConstantExpr(V.operands[0], V.operands[1],
+                                           V.operands[2]);
+    if (V.opcode == Instruction::InsertValue)
+      return new InsertValueConstantExpr(V.operands[0], V.operands[1],
+                                         V.indices, Ty);
+    if (V.opcode == Instruction::ExtractValue)
+      return new ExtractValueConstantExpr(V.operands[0], V.indices, Ty);
+    if (V.opcode == Instruction::GetElementPtr) {
+      std::vector<Constant*> IdxList(V.operands.begin()+1, V.operands.end());
+      return GetElementPtrConstantExpr::Create(V.operands[0], IdxList, Ty,
+                                               V.subclassoptionaldata);
+    }
+
+    // The compare instructions are weird. We have to encode the predicate
+    // value and it is combined with the instruction opcode by multiplying
+    // the opcode by one hundred. We must decode this to get the predicate.
+    if (V.opcode == Instruction::ICmp)
+      return new CompareConstantExpr(Ty, Instruction::ICmp, V.subclassdata,
+                                     V.operands[0], V.operands[1]);
+    if (V.opcode == Instruction::FCmp) 
+      return new CompareConstantExpr(Ty, Instruction::FCmp, V.subclassdata,
+                                     V.operands[0], V.operands[1]);
+    llvm_unreachable("Invalid ConstantExpr!");
+  }
+};
+
+template<>
+struct ConstantKeyData<ConstantExpr> {
+  typedef ExprMapKeyType ValType;
+  static ValType getValType(ConstantExpr *CE) {
+    std::vector<Constant*> Operands;
+    Operands.reserve(CE->getNumOperands());
+    for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i)
+      Operands.push_back(cast<Constant>(CE->getOperand(i)));
+    return ExprMapKeyType(CE->getOpcode(), Operands,
+        CE->isCompare() ? CE->getPredicate() : 0,
+        CE->getRawSubclassOptionalData(),
+        CE->hasIndices() ?
+          CE->getIndices() : ArrayRef<unsigned>());
+  }
+};
+
+template<>
+struct ConstantCreator<InlineAsm, PointerType, InlineAsmKeyType> {
+  static InlineAsm *create(PointerType *Ty, const InlineAsmKeyType &Key) {
+    return new InlineAsm(Ty, Key.asm_string, Key.constraints,
+                         Key.has_side_effects, Key.is_align_stack,
+                         Key.asm_dialect);
+  }
+};
+
+template<>
+struct ConstantKeyData<InlineAsm> {
+  typedef InlineAsmKeyType ValType;
+  static ValType getValType(InlineAsm *Asm) {
+    return InlineAsmKeyType(Asm->getAsmString(), Asm->getConstraintString(),
+                            Asm->hasSideEffects(), Asm->isAlignStack(),
+                            Asm->getDialect());
+  }
+};
+
+template<class ValType, class ValRefType, class TypeClass, class ConstantClass,
+         bool HasLargeKey = false /*true for arrays and structs*/ >
+class ConstantUniqueMap {
+public:
+  typedef std::pair<TypeClass*, ValType> MapKey;
+  typedef std::map<MapKey, ConstantClass *> MapTy;
+  typedef std::map<ConstantClass *, typename MapTy::iterator> InverseMapTy;
+private:
+  /// Map - This is the main map from the element descriptor to the Constants.
+  /// This is the primary way we avoid creating two of the same shape
+  /// constant.
+  MapTy Map;
+    
+  /// InverseMap - If "HasLargeKey" is true, this contains an inverse mapping
+  /// from the constants to their element in Map.  This is important for
+  /// removal of constants from the array, which would otherwise have to scan
+  /// through the map with very large keys.
+  InverseMapTy InverseMap;
+
+public:
+  typename MapTy::iterator map_begin() { return Map.begin(); }
+  typename MapTy::iterator map_end() { return Map.end(); }
+
+  void freeConstants() {
+    for (typename MapTy::iterator I=Map.begin(), E=Map.end();
+         I != E; ++I) {
+      // Asserts that use_empty().
+      delete I->second;
+    }
+  }
+    
+  /// InsertOrGetItem - Return an iterator for the specified element.
+  /// If the element exists in the map, the returned iterator points to the
+  /// entry and Exists=true.  If not, the iterator points to the newly
+  /// inserted entry and returns Exists=false.  Newly inserted entries have
+  /// I->second == 0, and should be filled in.
+  typename MapTy::iterator InsertOrGetItem(std::pair<MapKey, ConstantClass *>
+                                 &InsertVal,
+                                 bool &Exists) {
+    std::pair<typename MapTy::iterator, bool> IP = Map.insert(InsertVal);
+    Exists = !IP.second;
+    return IP.first;
+  }
+    
+private:
+  typename MapTy::iterator FindExistingElement(ConstantClass *CP) {
+    if (HasLargeKey) {
+      typename InverseMapTy::iterator IMI = InverseMap.find(CP);
+      assert(IMI != InverseMap.end() && IMI->second != Map.end() &&
+             IMI->second->second == CP &&
+             "InverseMap corrupt!");
+      return IMI->second;
+    }
+      
+    typename MapTy::iterator I =
+      Map.find(MapKey(static_cast<TypeClass*>(CP->getType()),
+                      ConstantKeyData<ConstantClass>::getValType(CP)));
+    if (I == Map.end() || I->second != CP) {
+      // FIXME: This should not use a linear scan.  If this gets to be a
+      // performance problem, someone should look at this.
+      for (I = Map.begin(); I != Map.end() && I->second != CP; ++I)
+        /* empty */;
+    }
+    return I;
+  }
+
+  ConstantClass *Create(TypeClass *Ty, ValRefType V,
+                        typename MapTy::iterator I) {
+    ConstantClass* Result =
+      ConstantCreator<ConstantClass,TypeClass,ValType>::create(Ty, V);
+
+    assert(Result->getType() == Ty && "Type specified is not correct!");
+    I = Map.insert(I, std::make_pair(MapKey(Ty, V), Result));
+
+    if (HasLargeKey)  // Remember the reverse mapping if needed.
+      InverseMap.insert(std::make_pair(Result, I));
+
+    return Result;
+  }
+public:
+    
+  /// getOrCreate - Return the specified constant from the map, creating it if
+  /// necessary.
+  ConstantClass *getOrCreate(TypeClass *Ty, ValRefType V) {
+    MapKey Lookup(Ty, V);
+    ConstantClass* Result = 0;
+    
+    typename MapTy::iterator I = Map.find(Lookup);
+    // Is it in the map?  
+    if (I != Map.end())
+      Result = I->second;
+        
+    if (!Result) {
+      // If no preexisting value, create one now...
+      Result = Create(Ty, V, I);
+    }
+        
+    return Result;
+  }
+
+  void remove(ConstantClass *CP) {
+    typename MapTy::iterator I = FindExistingElement(CP);
+    assert(I != Map.end() && "Constant not found in constant table!");
+    assert(I->second == CP && "Didn't find correct element?");
+
+    if (HasLargeKey)  // Remember the reverse mapping if needed.
+      InverseMap.erase(CP);
+
+    Map.erase(I);
+  }
+
+  /// MoveConstantToNewSlot - If we are about to change C to be the element
+  /// specified by I, update our internal data structures to reflect this
+  /// fact.
+  void MoveConstantToNewSlot(ConstantClass *C, typename MapTy::iterator I) {
+    // First, remove the old location of the specified constant in the map.
+    typename MapTy::iterator OldI = FindExistingElement(C);
+    assert(OldI != Map.end() && "Constant not found in constant table!");
+    assert(OldI->second == C && "Didn't find correct element?");
+      
+     // Remove the old entry from the map.
+    Map.erase(OldI);
+    
+    // Update the inverse map so that we know that this constant is now
+    // located at descriptor I.
+    if (HasLargeKey) {
+      assert(I->second == C && "Bad inversemap entry!");
+      InverseMap[C] = I;
+    }
+  }
+
+  void dump() const {
+    DEBUG(dbgs() << "Constant.cpp: ConstantUniqueMap\n");
+  }
+};
+
+// Unique map for aggregate constants
+template<class TypeClass, class ConstantClass>
+class ConstantAggrUniqueMap {
+public:
+  typedef ArrayRef<Constant*> Operands;
+  typedef std::pair<TypeClass*, Operands> LookupKey;
+private:
+  struct MapInfo {
+    typedef DenseMapInfo<ConstantClass*> ConstantClassInfo;
+    typedef DenseMapInfo<Constant*> ConstantInfo;
+    typedef DenseMapInfo<TypeClass*> TypeClassInfo;
+    static inline ConstantClass* getEmptyKey() {
+      return ConstantClassInfo::getEmptyKey();
+    }
+    static inline ConstantClass* getTombstoneKey() {
+      return ConstantClassInfo::getTombstoneKey();
+    }
+    static unsigned getHashValue(const ConstantClass *CP) {
+      SmallVector<Constant*, 8> CPOperands;
+      CPOperands.reserve(CP->getNumOperands());
+      for (unsigned I = 0, E = CP->getNumOperands(); I < E; ++I)
+        CPOperands.push_back(CP->getOperand(I));
+      return getHashValue(LookupKey(CP->getType(), CPOperands));
+    }
+    static bool isEqual(const ConstantClass *LHS, const ConstantClass *RHS) {
+      return LHS == RHS;
+    }
+    static unsigned getHashValue(const LookupKey &Val) {
+      return hash_combine(Val.first, hash_combine_range(Val.second.begin(),
+                                                        Val.second.end()));
+    }
+    static bool isEqual(const LookupKey &LHS, const ConstantClass *RHS) {
+      if (RHS == getEmptyKey() || RHS == getTombstoneKey())
+        return false;
+      if (LHS.first != RHS->getType()
+          || LHS.second.size() != RHS->getNumOperands())
+        return false;
+      for (unsigned I = 0, E = RHS->getNumOperands(); I < E; ++I) {
+        if (LHS.second[I] != RHS->getOperand(I))
+          return false;
+      }
+      return true;
+    }
+  };
+public:
+  typedef DenseMap<ConstantClass *, char, MapInfo> MapTy;
+
+private:
+  /// Map - This is the main map from the element descriptor to the Constants.
+  /// This is the primary way we avoid creating two of the same shape
+  /// constant.
+  MapTy Map;
+
+public:
+  typename MapTy::iterator map_begin() { return Map.begin(); }
+  typename MapTy::iterator map_end() { return Map.end(); }
+
+  void freeConstants() {
+    for (typename MapTy::iterator I=Map.begin(), E=Map.end();
+         I != E; ++I) {
+      // Asserts that use_empty().
+      delete I->first;
+    }
+  }
+
+private:
+  typename MapTy::iterator findExistingElement(ConstantClass *CP) {
+    return Map.find(CP);
+  }
+
+  ConstantClass *Create(TypeClass *Ty, Operands V, typename MapTy::iterator I) {
+    ConstantClass* Result =
+      ConstantArrayCreator<ConstantClass,TypeClass>::create(Ty, V);
+
+    assert(Result->getType() == Ty && "Type specified is not correct!");
+    Map[Result] = '\0';
+
+    return Result;
+  }
+public:
+
+  /// getOrCreate - Return the specified constant from the map, creating it if
+  /// necessary.
+  ConstantClass *getOrCreate(TypeClass *Ty, Operands V) {
+    LookupKey Lookup(Ty, V);
+    ConstantClass* Result = 0;
+
+    typename MapTy::iterator I = Map.find_as(Lookup);
+    // Is it in the map?
+    if (I != Map.end())
+      Result = I->first;
+
+    if (!Result) {
+      // If no preexisting value, create one now...
+      Result = Create(Ty, V, I);
+    }
+
+    return Result;
+  }
+
+  /// Find the constant by lookup key.
+  typename MapTy::iterator find(LookupKey Lookup) {
+    return Map.find_as(Lookup);
+  }
+
+  /// Insert the constant into its proper slot.
+  void insert(ConstantClass *CP) {
+    Map[CP] = '\0';
+  }
+
+  /// Remove this constant from the map
+  void remove(ConstantClass *CP) {
+    typename MapTy::iterator I = findExistingElement(CP);
+    assert(I != Map.end() && "Constant not found in constant table!");
+    assert(I->first == CP && "Didn't find correct element?");
+    Map.erase(I);
+  }
+
+  void dump() const {
+    DEBUG(dbgs() << "Constant.cpp: ConstantUniqueMap\n");
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/IR/Core.cpp b/contrib/llvm/lib/IR/Core.cpp
new file mode 100644
index 000000000000..983b49c628b4
--- /dev/null
+++ b/contrib/llvm/lib/IR/Core.cpp
@@ -0,0 +1,2458 @@
+//===-- Core.cpp ----------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common infrastructure (including the C bindings)
+// for libLLVMCore.a, which implements the LLVM intermediate representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Core.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Support/Threading.h"
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+
+using namespace llvm;
+
+void llvm::initializeCore(PassRegistry &Registry) {
+  initializeDominatorTreePass(Registry);
+  initializePrintModulePassPass(Registry);
+  initializePrintFunctionPassPass(Registry);
+  initializePrintBasicBlockPassPass(Registry);
+  initializeVerifierPass(Registry);
+  initializePreVerifierPass(Registry);
+}
+
+void LLVMInitializeCore(LLVMPassRegistryRef R) {
+  initializeCore(*unwrap(R));
+}
+
+void LLVMShutdown() {
+  llvm_shutdown();
+}
+
+/*===-- Error handling ----------------------------------------------------===*/
+
+void LLVMDisposeMessage(char *Message) {
+  free(Message);
+}
+
+
+/*===-- Operations on contexts --------------------------------------------===*/
+
+LLVMContextRef LLVMContextCreate() {
+  return wrap(new LLVMContext());
+}
+
+LLVMContextRef LLVMGetGlobalContext() {
+  return wrap(&getGlobalContext());
+}
+
+void LLVMContextDispose(LLVMContextRef C) {
+  delete unwrap(C);
+}
+
+unsigned LLVMGetMDKindIDInContext(LLVMContextRef C, const char* Name,
+                                  unsigned SLen) {
+  return unwrap(C)->getMDKindID(StringRef(Name, SLen));
+}
+
+unsigned LLVMGetMDKindID(const char* Name, unsigned SLen) {
+  return LLVMGetMDKindIDInContext(LLVMGetGlobalContext(), Name, SLen);
+}
+
+
+/*===-- Operations on modules ---------------------------------------------===*/
+
+LLVMModuleRef LLVMModuleCreateWithName(const char *ModuleID) {
+  return wrap(new Module(ModuleID, getGlobalContext()));
+}
+
+LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID, 
+                                                LLVMContextRef C) {
+  return wrap(new Module(ModuleID, *unwrap(C)));
+}
+
+void LLVMDisposeModule(LLVMModuleRef M) {
+  delete unwrap(M);
+}
+
+/*--.. Data layout .........................................................--*/
+const char * LLVMGetDataLayout(LLVMModuleRef M) {
+  return unwrap(M)->getDataLayout().c_str();
+}
+
+void LLVMSetDataLayout(LLVMModuleRef M, const char *Triple) {
+  unwrap(M)->setDataLayout(Triple);
+}
+
+/*--.. Target triple .......................................................--*/
+const char * LLVMGetTarget(LLVMModuleRef M) {
+  return unwrap(M)->getTargetTriple().c_str();
+}
+
+void LLVMSetTarget(LLVMModuleRef M, const char *Triple) {
+  unwrap(M)->setTargetTriple(Triple);
+}
+
+void LLVMDumpModule(LLVMModuleRef M) {
+  unwrap(M)->dump();
+}
+
+LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
+                               char **ErrorMessage) {
+  std::string error;
+  raw_fd_ostream dest(Filename, error);
+  if (!error.empty()) {
+    *ErrorMessage = strdup(error.c_str());
+    return true;
+  }
+
+  unwrap(M)->print(dest, NULL);
+
+  if (!error.empty()) {
+    *ErrorMessage = strdup(error.c_str());
+    return true;
+  }
+  dest.flush();
+  return false;
+}
+
+/*--.. Operations on inline assembler ......................................--*/
+void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) {
+  unwrap(M)->setModuleInlineAsm(StringRef(Asm));
+}
+
+
+/*--.. Operations on module contexts ......................................--*/
+LLVMContextRef LLVMGetModuleContext(LLVMModuleRef M) {
+  return wrap(&unwrap(M)->getContext());
+}
+
+
+/*===-- Operations on types -----------------------------------------------===*/
+
+/*--.. Operations on all types (mostly) ....................................--*/
+
+LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
+  switch (unwrap(Ty)->getTypeID()) {
+  default: llvm_unreachable("Unhandled TypeID.");
+  case Type::VoidTyID:
+    return LLVMVoidTypeKind;
+  case Type::HalfTyID:
+    return LLVMHalfTypeKind;
+  case Type::FloatTyID:
+    return LLVMFloatTypeKind;
+  case Type::DoubleTyID:
+    return LLVMDoubleTypeKind;
+  case Type::X86_FP80TyID:
+    return LLVMX86_FP80TypeKind;
+  case Type::FP128TyID:
+    return LLVMFP128TypeKind;
+  case Type::PPC_FP128TyID:
+    return LLVMPPC_FP128TypeKind;
+  case Type::LabelTyID:
+    return LLVMLabelTypeKind;
+  case Type::MetadataTyID:
+    return LLVMMetadataTypeKind;
+  case Type::IntegerTyID:
+    return LLVMIntegerTypeKind;
+  case Type::FunctionTyID:
+    return LLVMFunctionTypeKind;
+  case Type::StructTyID:
+    return LLVMStructTypeKind;
+  case Type::ArrayTyID:
+    return LLVMArrayTypeKind;
+  case Type::PointerTyID:
+    return LLVMPointerTypeKind;
+  case Type::VectorTyID:
+    return LLVMVectorTypeKind;
+  case Type::X86_MMXTyID:
+    return LLVMX86_MMXTypeKind;
+  }
+}
+
+LLVMBool LLVMTypeIsSized(LLVMTypeRef Ty)
+{
+    return unwrap(Ty)->isSized();
+}
+
+LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) {
+  return wrap(&unwrap(Ty)->getContext());
+}
+
+/*--.. Operations on integer types .........................................--*/
+
+LLVMTypeRef LLVMInt1TypeInContext(LLVMContextRef C)  {
+  return (LLVMTypeRef) Type::getInt1Ty(*unwrap(C));
+}
+LLVMTypeRef LLVMInt8TypeInContext(LLVMContextRef C)  {
+  return (LLVMTypeRef) Type::getInt8Ty(*unwrap(C));
+}
+LLVMTypeRef LLVMInt16TypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getInt16Ty(*unwrap(C));
+}
+LLVMTypeRef LLVMInt32TypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getInt32Ty(*unwrap(C));
+}
+LLVMTypeRef LLVMInt64TypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getInt64Ty(*unwrap(C));
+}
+LLVMTypeRef LLVMIntTypeInContext(LLVMContextRef C, unsigned NumBits) {
+  return wrap(IntegerType::get(*unwrap(C), NumBits));
+}
+
+LLVMTypeRef LLVMInt1Type(void)  {
+  return LLVMInt1TypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMInt8Type(void)  {
+  return LLVMInt8TypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMInt16Type(void) {
+  return LLVMInt16TypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMInt32Type(void) {
+  return LLVMInt32TypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMInt64Type(void) {
+  return LLVMInt64TypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMIntType(unsigned NumBits) {
+  return LLVMIntTypeInContext(LLVMGetGlobalContext(), NumBits);
+}
+
+unsigned LLVMGetIntTypeWidth(LLVMTypeRef IntegerTy) {
+  return unwrap<IntegerType>(IntegerTy)->getBitWidth();
+}
+
+/*--.. Operations on real types ............................................--*/
+
+LLVMTypeRef LLVMHalfTypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getHalfTy(*unwrap(C));
+}
+LLVMTypeRef LLVMFloatTypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getFloatTy(*unwrap(C));
+}
+LLVMTypeRef LLVMDoubleTypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getDoubleTy(*unwrap(C));
+}
+LLVMTypeRef LLVMX86FP80TypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getX86_FP80Ty(*unwrap(C));
+}
+LLVMTypeRef LLVMFP128TypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getFP128Ty(*unwrap(C));
+}
+LLVMTypeRef LLVMPPCFP128TypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getPPC_FP128Ty(*unwrap(C));
+}
+LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) {
+  return (LLVMTypeRef) Type::getX86_MMXTy(*unwrap(C));
+}
+
+LLVMTypeRef LLVMHalfType(void) {
+  return LLVMHalfTypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMFloatType(void) {
+  return LLVMFloatTypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMDoubleType(void) {
+  return LLVMDoubleTypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMX86FP80Type(void) {
+  return LLVMX86FP80TypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMFP128Type(void) {
+  return LLVMFP128TypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMPPCFP128Type(void) {
+  return LLVMPPCFP128TypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMX86MMXType(void) {
+  return LLVMX86MMXTypeInContext(LLVMGetGlobalContext());
+}
+
+/*--.. Operations on function types ........................................--*/
+
+LLVMTypeRef LLVMFunctionType(LLVMTypeRef ReturnType,
+                             LLVMTypeRef *ParamTypes, unsigned ParamCount,
+                             LLVMBool IsVarArg) {
+  ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
+  return wrap(FunctionType::get(unwrap(ReturnType), Tys, IsVarArg != 0));
+}
+
+LLVMBool LLVMIsFunctionVarArg(LLVMTypeRef FunctionTy) {
+  return unwrap<FunctionType>(FunctionTy)->isVarArg();
+}
+
+LLVMTypeRef LLVMGetReturnType(LLVMTypeRef FunctionTy) {
+  return wrap(unwrap<FunctionType>(FunctionTy)->getReturnType());
+}
+
+unsigned LLVMCountParamTypes(LLVMTypeRef FunctionTy) {
+  return unwrap<FunctionType>(FunctionTy)->getNumParams();
+}
+
+void LLVMGetParamTypes(LLVMTypeRef FunctionTy, LLVMTypeRef *Dest) {
+  FunctionType *Ty = unwrap<FunctionType>(FunctionTy);
+  for (FunctionType::param_iterator I = Ty->param_begin(),
+                                    E = Ty->param_end(); I != E; ++I)
+    *Dest++ = wrap(*I);
+}
+
+/*--.. Operations on struct types ..........................................--*/
+
+LLVMTypeRef LLVMStructTypeInContext(LLVMContextRef C, LLVMTypeRef *ElementTypes,
+                           unsigned ElementCount, LLVMBool Packed) {
+  ArrayRef<Type*> Tys(unwrap(ElementTypes), ElementCount);
+  return wrap(StructType::get(*unwrap(C), Tys, Packed != 0));
+}
+
+LLVMTypeRef LLVMStructType(LLVMTypeRef *ElementTypes,
+                           unsigned ElementCount, LLVMBool Packed) {
+  return LLVMStructTypeInContext(LLVMGetGlobalContext(), ElementTypes,
+                                 ElementCount, Packed);
+}
+
+LLVMTypeRef LLVMStructCreateNamed(LLVMContextRef C, const char *Name)
+{
+  return wrap(StructType::create(*unwrap(C), Name));
+}
+
+const char *LLVMGetStructName(LLVMTypeRef Ty)
+{
+  StructType *Type = unwrap<StructType>(Ty);
+  if (!Type->hasName())
+    return 0;
+  return Type->getName().data();
+}
+
+void LLVMStructSetBody(LLVMTypeRef StructTy, LLVMTypeRef *ElementTypes,
+                       unsigned ElementCount, LLVMBool Packed) {
+  ArrayRef<Type*> Tys(unwrap(ElementTypes), ElementCount);
+  unwrap<StructType>(StructTy)->setBody(Tys, Packed != 0);
+}
+
+unsigned LLVMCountStructElementTypes(LLVMTypeRef StructTy) {
+  return unwrap<StructType>(StructTy)->getNumElements();
+}
+
+void LLVMGetStructElementTypes(LLVMTypeRef StructTy, LLVMTypeRef *Dest) {
+  StructType *Ty = unwrap<StructType>(StructTy);
+  for (StructType::element_iterator I = Ty->element_begin(),
+                                    E = Ty->element_end(); I != E; ++I)
+    *Dest++ = wrap(*I);
+}
+
+LLVMBool LLVMIsPackedStruct(LLVMTypeRef StructTy) {
+  return unwrap<StructType>(StructTy)->isPacked();
+}
+
+LLVMBool LLVMIsOpaqueStruct(LLVMTypeRef StructTy) {
+  return unwrap<StructType>(StructTy)->isOpaque();
+}
+
+LLVMTypeRef LLVMGetTypeByName(LLVMModuleRef M, const char *Name) {
+  return wrap(unwrap(M)->getTypeByName(Name));
+}
+
+/*--.. Operations on array, pointer, and vector types (sequence types) .....--*/
+
+LLVMTypeRef LLVMArrayType(LLVMTypeRef ElementType, unsigned ElementCount) {
+  return wrap(ArrayType::get(unwrap(ElementType), ElementCount));
+}
+
+LLVMTypeRef LLVMPointerType(LLVMTypeRef ElementType, unsigned AddressSpace) {
+  return wrap(PointerType::get(unwrap(ElementType), AddressSpace));
+}
+
+LLVMTypeRef LLVMVectorType(LLVMTypeRef ElementType, unsigned ElementCount) {
+  return wrap(VectorType::get(unwrap(ElementType), ElementCount));
+}
+
+LLVMTypeRef LLVMGetElementType(LLVMTypeRef Ty) {
+  return wrap(unwrap<SequentialType>(Ty)->getElementType());
+}
+
+unsigned LLVMGetArrayLength(LLVMTypeRef ArrayTy) {
+  return unwrap<ArrayType>(ArrayTy)->getNumElements();
+}
+
+unsigned LLVMGetPointerAddressSpace(LLVMTypeRef PointerTy) {
+  return unwrap<PointerType>(PointerTy)->getAddressSpace();
+}
+
+unsigned LLVMGetVectorSize(LLVMTypeRef VectorTy) {
+  return unwrap<VectorType>(VectorTy)->getNumElements();
+}
+
+/*--.. Operations on other types ...........................................--*/
+
+LLVMTypeRef LLVMVoidTypeInContext(LLVMContextRef C)  {
+  return wrap(Type::getVoidTy(*unwrap(C)));
+}
+LLVMTypeRef LLVMLabelTypeInContext(LLVMContextRef C) {
+  return wrap(Type::getLabelTy(*unwrap(C)));
+}
+
+LLVMTypeRef LLVMVoidType(void)  {
+  return LLVMVoidTypeInContext(LLVMGetGlobalContext());
+}
+LLVMTypeRef LLVMLabelType(void) {
+  return LLVMLabelTypeInContext(LLVMGetGlobalContext());
+}
+
+/*===-- Operations on values ----------------------------------------------===*/
+
+/*--.. Operations on all values ............................................--*/
+
+LLVMTypeRef LLVMTypeOf(LLVMValueRef Val) {
+  return wrap(unwrap(Val)->getType());
+}
+
+const char *LLVMGetValueName(LLVMValueRef Val) {
+  return unwrap(Val)->getName().data();
+}
+
+void LLVMSetValueName(LLVMValueRef Val, const char *Name) {
+  unwrap(Val)->setName(Name);
+}
+
+void LLVMDumpValue(LLVMValueRef Val) {
+  unwrap(Val)->dump();
+}
+
+void LLVMReplaceAllUsesWith(LLVMValueRef OldVal, LLVMValueRef NewVal) {
+  unwrap(OldVal)->replaceAllUsesWith(unwrap(NewVal));
+}
+
+int LLVMHasMetadata(LLVMValueRef Inst) {
+  return unwrap<Instruction>(Inst)->hasMetadata();
+}
+
+LLVMValueRef LLVMGetMetadata(LLVMValueRef Inst, unsigned KindID) {
+  return wrap(unwrap<Instruction>(Inst)->getMetadata(KindID));
+}
+
+void LLVMSetMetadata(LLVMValueRef Inst, unsigned KindID, LLVMValueRef MD) {
+  unwrap<Instruction>(Inst)->setMetadata(KindID, MD? unwrap<MDNode>(MD) : NULL);
+}
+
+/*--.. Conversion functions ................................................--*/
+
+#define LLVM_DEFINE_VALUE_CAST(name)                                       \
+  LLVMValueRef LLVMIsA##name(LLVMValueRef Val) {                           \
+    return wrap(static_cast<Value*>(dyn_cast_or_null<name>(unwrap(Val)))); \
+  }
+
+LLVM_FOR_EACH_VALUE_SUBCLASS(LLVM_DEFINE_VALUE_CAST)
+
+/*--.. Operations on Uses ..................................................--*/
+LLVMUseRef LLVMGetFirstUse(LLVMValueRef Val) {
+  Value *V = unwrap(Val);
+  Value::use_iterator I = V->use_begin();
+  if (I == V->use_end())
+    return 0;
+  return wrap(&(I.getUse()));
+}
+
+LLVMUseRef LLVMGetNextUse(LLVMUseRef U) {
+  Use *Next = unwrap(U)->getNext();
+  if (Next)
+    return wrap(Next);
+  return 0;
+}
+
+LLVMValueRef LLVMGetUser(LLVMUseRef U) {
+  return wrap(unwrap(U)->getUser());
+}
+
+LLVMValueRef LLVMGetUsedValue(LLVMUseRef U) {
+  return wrap(unwrap(U)->get());
+}
+
+/*--.. Operations on Users .................................................--*/
+LLVMValueRef LLVMGetOperand(LLVMValueRef Val, unsigned Index) {
+  Value *V = unwrap(Val);
+  if (MDNode *MD = dyn_cast<MDNode>(V))
+      return wrap(MD->getOperand(Index));
+  return wrap(cast<User>(V)->getOperand(Index));
+}
+
+void LLVMSetOperand(LLVMValueRef Val, unsigned Index, LLVMValueRef Op) {
+  unwrap<User>(Val)->setOperand(Index, unwrap(Op));
+}
+
+int LLVMGetNumOperands(LLVMValueRef Val) {
+  Value *V = unwrap(Val);
+  if (MDNode *MD = dyn_cast<MDNode>(V))
+      return MD->getNumOperands();
+  return cast<User>(V)->getNumOperands();
+}
+
+/*--.. Operations on constants of any type .................................--*/
+
+LLVMValueRef LLVMConstNull(LLVMTypeRef Ty) {
+  return wrap(Constant::getNullValue(unwrap(Ty)));
+}
+
+LLVMValueRef LLVMConstAllOnes(LLVMTypeRef Ty) {
+  return wrap(Constant::getAllOnesValue(unwrap(Ty)));
+}
+
+LLVMValueRef LLVMGetUndef(LLVMTypeRef Ty) {
+  return wrap(UndefValue::get(unwrap(Ty)));
+}
+
+LLVMBool LLVMIsConstant(LLVMValueRef Ty) {
+  return isa<Constant>(unwrap(Ty));
+}
+
+LLVMBool LLVMIsNull(LLVMValueRef Val) {
+  if (Constant *C = dyn_cast<Constant>(unwrap(Val)))
+    return C->isNullValue();
+  return false;
+}
+
+LLVMBool LLVMIsUndef(LLVMValueRef Val) {
+  return isa<UndefValue>(unwrap(Val));
+}
+
+LLVMValueRef LLVMConstPointerNull(LLVMTypeRef Ty) {
+  return
+      wrap(ConstantPointerNull::get(unwrap<PointerType>(Ty)));
+}
+
+/*--.. Operations on metadata nodes ........................................--*/
+
+LLVMValueRef LLVMMDStringInContext(LLVMContextRef C, const char *Str,
+                                   unsigned SLen) {
+  return wrap(MDString::get(*unwrap(C), StringRef(Str, SLen)));
+}
+
+LLVMValueRef LLVMMDString(const char *Str, unsigned SLen) {
+  return LLVMMDStringInContext(LLVMGetGlobalContext(), Str, SLen);
+}
+
+LLVMValueRef LLVMMDNodeInContext(LLVMContextRef C, LLVMValueRef *Vals,
+                                 unsigned Count) {
+  return wrap(MDNode::get(*unwrap(C),
+                          makeArrayRef(unwrap<Value>(Vals, Count), Count)));
+}
+
+LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) {
+  return LLVMMDNodeInContext(LLVMGetGlobalContext(), Vals, Count);
+}
+
+const char *LLVMGetMDString(LLVMValueRef V, unsigned* Len) {
+  if (const MDString *S = dyn_cast<MDString>(unwrap(V))) {
+    *Len = S->getString().size();
+    return S->getString().data();
+  }
+  *Len = 0;
+  return 0;
+}
+
+unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V)
+{
+  return cast<MDNode>(unwrap(V))->getNumOperands();
+}
+
+void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest)
+{
+  const MDNode *N = cast<MDNode>(unwrap(V));
+  const unsigned numOperands = N->getNumOperands();
+  for (unsigned i = 0; i < numOperands; i++)
+    Dest[i] = wrap(N->getOperand(i));
+}
+
+unsigned LLVMGetNamedMetadataNumOperands(LLVMModuleRef M, const char* name)
+{
+  if (NamedMDNode *N = unwrap(M)->getNamedMetadata(name)) {
+    return N->getNumOperands();
+  }
+  return 0;
+}
+
+void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char* name, LLVMValueRef *Dest)
+{
+  NamedMDNode *N = unwrap(M)->getNamedMetadata(name);
+  if (!N)
+    return;
+  for (unsigned i=0;i<N->getNumOperands();i++)
+    Dest[i] = wrap(N->getOperand(i));
+}
+
+void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char* name,
+                                 LLVMValueRef Val)
+{
+  NamedMDNode *N = unwrap(M)->getOrInsertNamedMetadata(name);
+  if (!N)
+    return;
+  MDNode *Op = Val ? unwrap<MDNode>(Val) : NULL;
+  if (Op)
+    N->addOperand(Op);
+}
+
+/*--.. Operations on scalar constants ......................................--*/
+
+LLVMValueRef LLVMConstInt(LLVMTypeRef IntTy, unsigned long long N,
+                          LLVMBool SignExtend) {
+  return wrap(ConstantInt::get(unwrap<IntegerType>(IntTy), N, SignExtend != 0));
+}
+
+LLVMValueRef LLVMConstIntOfArbitraryPrecision(LLVMTypeRef IntTy,
+                                              unsigned NumWords,
+                                              const uint64_t Words[]) {
+    IntegerType *Ty = unwrap<IntegerType>(IntTy);
+    return wrap(ConstantInt::get(Ty->getContext(),
+                                 APInt(Ty->getBitWidth(),
+                                       makeArrayRef(Words, NumWords))));
+}
+
+LLVMValueRef LLVMConstIntOfString(LLVMTypeRef IntTy, const char Str[],
+                                  uint8_t Radix) {
+  return wrap(ConstantInt::get(unwrap<IntegerType>(IntTy), StringRef(Str),
+                               Radix));
+}
+
+LLVMValueRef LLVMConstIntOfStringAndSize(LLVMTypeRef IntTy, const char Str[],
+                                         unsigned SLen, uint8_t Radix) {
+  return wrap(ConstantInt::get(unwrap<IntegerType>(IntTy), StringRef(Str, SLen),
+                               Radix));
+}
+
+LLVMValueRef LLVMConstReal(LLVMTypeRef RealTy, double N) {
+  return wrap(ConstantFP::get(unwrap(RealTy), N));
+}
+
+LLVMValueRef LLVMConstRealOfString(LLVMTypeRef RealTy, const char *Text) {
+  return wrap(ConstantFP::get(unwrap(RealTy), StringRef(Text)));
+}
+
+LLVMValueRef LLVMConstRealOfStringAndSize(LLVMTypeRef RealTy, const char Str[],
+                                          unsigned SLen) {
+  return wrap(ConstantFP::get(unwrap(RealTy), StringRef(Str, SLen)));
+}
+
+unsigned long long LLVMConstIntGetZExtValue(LLVMValueRef ConstantVal) {
+  return unwrap<ConstantInt>(ConstantVal)->getZExtValue();
+}
+
+long long LLVMConstIntGetSExtValue(LLVMValueRef ConstantVal) {
+  return unwrap<ConstantInt>(ConstantVal)->getSExtValue();
+}
+
+/*--.. Operations on composite constants ...................................--*/
+
+LLVMValueRef LLVMConstStringInContext(LLVMContextRef C, const char *Str,
+                                      unsigned Length,
+                                      LLVMBool DontNullTerminate) {
+  /* Inverted the sense of AddNull because ', 0)' is a
+     better mnemonic for null termination than ', 1)'. */
+  return wrap(ConstantDataArray::getString(*unwrap(C), StringRef(Str, Length),
+                                           DontNullTerminate == 0));
+}
+LLVMValueRef LLVMConstStructInContext(LLVMContextRef C, 
+                                      LLVMValueRef *ConstantVals,
+                                      unsigned Count, LLVMBool Packed) {
+  Constant **Elements = unwrap<Constant>(ConstantVals, Count);
+  return wrap(ConstantStruct::getAnon(*unwrap(C), makeArrayRef(Elements, Count),
+                                      Packed != 0));
+}
+
+LLVMValueRef LLVMConstString(const char *Str, unsigned Length,
+                             LLVMBool DontNullTerminate) {
+  return LLVMConstStringInContext(LLVMGetGlobalContext(), Str, Length,
+                                  DontNullTerminate);
+}
+LLVMValueRef LLVMConstArray(LLVMTypeRef ElementTy,
+                            LLVMValueRef *ConstantVals, unsigned Length) {
+  ArrayRef<Constant*> V(unwrap<Constant>(ConstantVals, Length), Length);
+  return wrap(ConstantArray::get(ArrayType::get(unwrap(ElementTy), Length), V));
+}
+LLVMValueRef LLVMConstStruct(LLVMValueRef *ConstantVals, unsigned Count,
+                             LLVMBool Packed) {
+  return LLVMConstStructInContext(LLVMGetGlobalContext(), ConstantVals, Count,
+                                  Packed);
+}
+
+LLVMValueRef LLVMConstNamedStruct(LLVMTypeRef StructTy,
+                                  LLVMValueRef *ConstantVals,
+                                  unsigned Count) {
+  Constant **Elements = unwrap<Constant>(ConstantVals, Count);
+  StructType *Ty = cast<StructType>(unwrap(StructTy));
+
+  return wrap(ConstantStruct::get(Ty, makeArrayRef(Elements, Count)));
+}
+
+LLVMValueRef LLVMConstVector(LLVMValueRef *ScalarConstantVals, unsigned Size) {
+  return wrap(ConstantVector::get(makeArrayRef(
+                            unwrap<Constant>(ScalarConstantVals, Size), Size)));
+}
+
+/*-- Opcode mapping */
+
+static LLVMOpcode map_to_llvmopcode(int opcode)
+{
+    switch (opcode) {
+      default: llvm_unreachable("Unhandled Opcode.");
+#define HANDLE_INST(num, opc, clas) case num: return LLVM##opc;
+#include "llvm/IR/Instruction.def"
+#undef HANDLE_INST
+    }
+}
+
+static int map_from_llvmopcode(LLVMOpcode code)
+{
+    switch (code) {
+#define HANDLE_INST(num, opc, clas) case LLVM##opc: return num;
+#include "llvm/IR/Instruction.def"
+#undef HANDLE_INST
+    }
+    llvm_unreachable("Unhandled Opcode.");
+}
+
+/*--.. Constant expressions ................................................--*/
+
+LLVMOpcode LLVMGetConstOpcode(LLVMValueRef ConstantVal) {
+  return map_to_llvmopcode(unwrap<ConstantExpr>(ConstantVal)->getOpcode());
+}
+
+LLVMValueRef LLVMAlignOf(LLVMTypeRef Ty) {
+  return wrap(ConstantExpr::getAlignOf(unwrap(Ty)));
+}
+
+LLVMValueRef LLVMSizeOf(LLVMTypeRef Ty) {
+  return wrap(ConstantExpr::getSizeOf(unwrap(Ty)));
+}
+
+LLVMValueRef LLVMConstNeg(LLVMValueRef ConstantVal) {
+  return wrap(ConstantExpr::getNeg(unwrap<Constant>(ConstantVal)));
+}
+
+LLVMValueRef LLVMConstNSWNeg(LLVMValueRef ConstantVal) {
+  return wrap(ConstantExpr::getNSWNeg(unwrap<Constant>(ConstantVal)));
+}
+
+LLVMValueRef LLVMConstNUWNeg(LLVMValueRef ConstantVal) {
+  return wrap(ConstantExpr::getNUWNeg(unwrap<Constant>(ConstantVal)));
+}
+
+
+LLVMValueRef LLVMConstFNeg(LLVMValueRef ConstantVal) {
+  return wrap(ConstantExpr::getFNeg(unwrap<Constant>(ConstantVal)));
+}
+
+LLVMValueRef LLVMConstNot(LLVMValueRef ConstantVal) {
+  return wrap(ConstantExpr::getNot(unwrap<Constant>(ConstantVal)));
+}
+
+LLVMValueRef LLVMConstAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getAdd(unwrap<Constant>(LHSConstant),
+                                   unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstNSWAdd(LLVMValueRef LHSConstant,
+                             LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getNSWAdd(unwrap<Constant>(LHSConstant),
+                                      unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstNUWAdd(LLVMValueRef LHSConstant,
+                             LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getNUWAdd(unwrap<Constant>(LHSConstant),
+                                      unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstFAdd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getFAdd(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getSub(unwrap<Constant>(LHSConstant),
+                                   unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstNSWSub(LLVMValueRef LHSConstant,
+                             LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getNSWSub(unwrap<Constant>(LHSConstant),
+                                      unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstNUWSub(LLVMValueRef LHSConstant,
+                             LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getNUWSub(unwrap<Constant>(LHSConstant),
+                                      unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstFSub(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getFSub(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getMul(unwrap<Constant>(LHSConstant),
+                                   unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstNSWMul(LLVMValueRef LHSConstant,
+                             LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getNSWMul(unwrap<Constant>(LHSConstant),
+                                      unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstNUWMul(LLVMValueRef LHSConstant,
+                             LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getNUWMul(unwrap<Constant>(LHSConstant),
+                                      unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstFMul(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getFMul(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstUDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getUDiv(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstSDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getSDiv(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstExactSDiv(LLVMValueRef LHSConstant,
+                                LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getExactSDiv(unwrap<Constant>(LHSConstant),
+                                         unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstFDiv(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getFDiv(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstURem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getURem(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstSRem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getSRem(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstFRem(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getFRem(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstAnd(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getAnd(unwrap<Constant>(LHSConstant),
+                                   unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstOr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getOr(unwrap<Constant>(LHSConstant),
+                                  unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstXor(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getXor(unwrap<Constant>(LHSConstant),
+                                   unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstICmp(LLVMIntPredicate Predicate,
+                           LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getICmp(Predicate,
+                                    unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstFCmp(LLVMRealPredicate Predicate,
+                           LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getFCmp(Predicate,
+                                    unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstShl(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getShl(unwrap<Constant>(LHSConstant),
+                                   unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstLShr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getLShr(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstAShr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
+  return wrap(ConstantExpr::getAShr(unwrap<Constant>(LHSConstant),
+                                    unwrap<Constant>(RHSConstant)));
+}
+
+LLVMValueRef LLVMConstGEP(LLVMValueRef ConstantVal,
+                          LLVMValueRef *ConstantIndices, unsigned NumIndices) {
+  ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices),
+                               NumIndices);
+  return wrap(ConstantExpr::getGetElementPtr(unwrap<Constant>(ConstantVal),
+                                             IdxList));
+}
+
+LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal,
+                                  LLVMValueRef *ConstantIndices,
+                                  unsigned NumIndices) {
+  Constant* Val = unwrap<Constant>(ConstantVal);
+  ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices),
+                               NumIndices);
+  return wrap(ConstantExpr::getInBoundsGetElementPtr(Val, IdxList));
+}
+
+LLVMValueRef LLVMConstTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getTrunc(unwrap<Constant>(ConstantVal),
+                                     unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstSExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getSExt(unwrap<Constant>(ConstantVal),
+                                    unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstZExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getZExt(unwrap<Constant>(ConstantVal),
+                                    unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstFPTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getFPTrunc(unwrap<Constant>(ConstantVal),
+                                       unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstFPExt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getFPExtend(unwrap<Constant>(ConstantVal),
+                                        unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstUIToFP(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getUIToFP(unwrap<Constant>(ConstantVal),
+                                      unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstSIToFP(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getSIToFP(unwrap<Constant>(ConstantVal),
+                                      unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstFPToUI(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getFPToUI(unwrap<Constant>(ConstantVal),
+                                      unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstFPToSI(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getFPToSI(unwrap<Constant>(ConstantVal),
+                                      unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstPtrToInt(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getPtrToInt(unwrap<Constant>(ConstantVal),
+                                        unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstIntToPtr(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getIntToPtr(unwrap<Constant>(ConstantVal),
+                                        unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getBitCast(unwrap<Constant>(ConstantVal),
+                                       unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstZExtOrBitCast(LLVMValueRef ConstantVal,
+                                    LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getZExtOrBitCast(unwrap<Constant>(ConstantVal),
+                                             unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstSExtOrBitCast(LLVMValueRef ConstantVal,
+                                    LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getSExtOrBitCast(unwrap<Constant>(ConstantVal),
+                                             unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstTruncOrBitCast(LLVMValueRef ConstantVal,
+                                     LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getTruncOrBitCast(unwrap<Constant>(ConstantVal),
+                                              unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstPointerCast(LLVMValueRef ConstantVal,
+                                  LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getPointerCast(unwrap<Constant>(ConstantVal),
+                                           unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstIntCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType,
+                              LLVMBool isSigned) {
+  return wrap(ConstantExpr::getIntegerCast(unwrap<Constant>(ConstantVal),
+                                           unwrap(ToType), isSigned));
+}
+
+LLVMValueRef LLVMConstFPCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getFPCast(unwrap<Constant>(ConstantVal),
+                                      unwrap(ToType)));
+}
+
+LLVMValueRef LLVMConstSelect(LLVMValueRef ConstantCondition,
+                             LLVMValueRef ConstantIfTrue,
+                             LLVMValueRef ConstantIfFalse) {
+  return wrap(ConstantExpr::getSelect(unwrap<Constant>(ConstantCondition),
+                                      unwrap<Constant>(ConstantIfTrue),
+                                      unwrap<Constant>(ConstantIfFalse)));
+}
+
+LLVMValueRef LLVMConstExtractElement(LLVMValueRef VectorConstant,
+                                     LLVMValueRef IndexConstant) {
+  return wrap(ConstantExpr::getExtractElement(unwrap<Constant>(VectorConstant),
+                                              unwrap<Constant>(IndexConstant)));
+}
+
+LLVMValueRef LLVMConstInsertElement(LLVMValueRef VectorConstant,
+                                    LLVMValueRef ElementValueConstant,
+                                    LLVMValueRef IndexConstant) {
+  return wrap(ConstantExpr::getInsertElement(unwrap<Constant>(VectorConstant),
+                                         unwrap<Constant>(ElementValueConstant),
+                                             unwrap<Constant>(IndexConstant)));
+}
+
+LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant,
+                                    LLVMValueRef VectorBConstant,
+                                    LLVMValueRef MaskConstant) {
+  return wrap(ConstantExpr::getShuffleVector(unwrap<Constant>(VectorAConstant),
+                                             unwrap<Constant>(VectorBConstant),
+                                             unwrap<Constant>(MaskConstant)));
+}
+
+LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList,
+                                   unsigned NumIdx) {
+  return wrap(ConstantExpr::getExtractValue(unwrap<Constant>(AggConstant),
+                                            makeArrayRef(IdxList, NumIdx)));
+}
+
+LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant,
+                                  LLVMValueRef ElementValueConstant,
+                                  unsigned *IdxList, unsigned NumIdx) {
+  return wrap(ConstantExpr::getInsertValue(unwrap<Constant>(AggConstant),
+                                         unwrap<Constant>(ElementValueConstant),
+                                           makeArrayRef(IdxList, NumIdx)));
+}
+
+LLVMValueRef LLVMConstInlineAsm(LLVMTypeRef Ty, const char *AsmString,
+                                const char *Constraints,
+                                LLVMBool HasSideEffects,
+                                LLVMBool IsAlignStack) {
+  return wrap(InlineAsm::get(dyn_cast<FunctionType>(unwrap(Ty)), AsmString,
+                             Constraints, HasSideEffects, IsAlignStack));
+}
+
+LLVMValueRef LLVMBlockAddress(LLVMValueRef F, LLVMBasicBlockRef BB) {
+  return wrap(BlockAddress::get(unwrap<Function>(F), unwrap(BB)));
+}
+
+/*--.. Operations on global variables, functions, and aliases (globals) ....--*/
+
+LLVMModuleRef LLVMGetGlobalParent(LLVMValueRef Global) {
+  return wrap(unwrap<GlobalValue>(Global)->getParent());
+}
+
+LLVMBool LLVMIsDeclaration(LLVMValueRef Global) {
+  return unwrap<GlobalValue>(Global)->isDeclaration();
+}
+
+LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) {
+  switch (unwrap<GlobalValue>(Global)->getLinkage()) {
+  case GlobalValue::ExternalLinkage:
+    return LLVMExternalLinkage;
+  case GlobalValue::AvailableExternallyLinkage:
+    return LLVMAvailableExternallyLinkage;
+  case GlobalValue::LinkOnceAnyLinkage:
+    return LLVMLinkOnceAnyLinkage;
+  case GlobalValue::LinkOnceODRLinkage:
+    return LLVMLinkOnceODRLinkage;
+  case GlobalValue::LinkOnceODRAutoHideLinkage:
+    return LLVMLinkOnceODRAutoHideLinkage;
+  case GlobalValue::WeakAnyLinkage:
+    return LLVMWeakAnyLinkage;
+  case GlobalValue::WeakODRLinkage:
+    return LLVMWeakODRLinkage;
+  case GlobalValue::AppendingLinkage:
+    return LLVMAppendingLinkage;
+  case GlobalValue::InternalLinkage:
+    return LLVMInternalLinkage;
+  case GlobalValue::PrivateLinkage:
+    return LLVMPrivateLinkage;
+  case GlobalValue::LinkerPrivateLinkage:
+    return LLVMLinkerPrivateLinkage;
+  case GlobalValue::LinkerPrivateWeakLinkage:
+    return LLVMLinkerPrivateWeakLinkage;
+  case GlobalValue::DLLImportLinkage:
+    return LLVMDLLImportLinkage;
+  case GlobalValue::DLLExportLinkage:
+    return LLVMDLLExportLinkage;
+  case GlobalValue::ExternalWeakLinkage:
+    return LLVMExternalWeakLinkage;
+  case GlobalValue::CommonLinkage:
+    return LLVMCommonLinkage;
+  }
+
+  llvm_unreachable("Invalid GlobalValue linkage!");
+}
+
+void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
+  GlobalValue *GV = unwrap<GlobalValue>(Global);
+
+  switch (Linkage) {
+  case LLVMExternalLinkage:
+    GV->setLinkage(GlobalValue::ExternalLinkage);
+    break;
+  case LLVMAvailableExternallyLinkage:
+    GV->setLinkage(GlobalValue::AvailableExternallyLinkage);
+    break;
+  case LLVMLinkOnceAnyLinkage:
+    GV->setLinkage(GlobalValue::LinkOnceAnyLinkage);
+    break;
+  case LLVMLinkOnceODRLinkage:
+    GV->setLinkage(GlobalValue::LinkOnceODRLinkage);
+    break;
+  case LLVMLinkOnceODRAutoHideLinkage:
+    GV->setLinkage(GlobalValue::LinkOnceODRAutoHideLinkage);
+    break;
+  case LLVMWeakAnyLinkage:
+    GV->setLinkage(GlobalValue::WeakAnyLinkage);
+    break;
+  case LLVMWeakODRLinkage:
+    GV->setLinkage(GlobalValue::WeakODRLinkage);
+    break;
+  case LLVMAppendingLinkage:
+    GV->setLinkage(GlobalValue::AppendingLinkage);
+    break;
+  case LLVMInternalLinkage:
+    GV->setLinkage(GlobalValue::InternalLinkage);
+    break;
+  case LLVMPrivateLinkage:
+    GV->setLinkage(GlobalValue::PrivateLinkage);
+    break;
+  case LLVMLinkerPrivateLinkage:
+    GV->setLinkage(GlobalValue::LinkerPrivateLinkage);
+    break;
+  case LLVMLinkerPrivateWeakLinkage:
+    GV->setLinkage(GlobalValue::LinkerPrivateWeakLinkage);
+    break;
+  case LLVMDLLImportLinkage:
+    GV->setLinkage(GlobalValue::DLLImportLinkage);
+    break;
+  case LLVMDLLExportLinkage:
+    GV->setLinkage(GlobalValue::DLLExportLinkage);
+    break;
+  case LLVMExternalWeakLinkage:
+    GV->setLinkage(GlobalValue::ExternalWeakLinkage);
+    break;
+  case LLVMGhostLinkage:
+    DEBUG(errs()
+          << "LLVMSetLinkage(): LLVMGhostLinkage is no longer supported.");
+    break;
+  case LLVMCommonLinkage:
+    GV->setLinkage(GlobalValue::CommonLinkage);
+    break;
+  }
+}
+
+const char *LLVMGetSection(LLVMValueRef Global) {
+  return unwrap<GlobalValue>(Global)->getSection().c_str();
+}
+
+void LLVMSetSection(LLVMValueRef Global, const char *Section) {
+  unwrap<GlobalValue>(Global)->setSection(Section);
+}
+
+LLVMVisibility LLVMGetVisibility(LLVMValueRef Global) {
+  return static_cast<LLVMVisibility>(
+    unwrap<GlobalValue>(Global)->getVisibility());
+}
+
+void LLVMSetVisibility(LLVMValueRef Global, LLVMVisibility Viz) {
+  unwrap<GlobalValue>(Global)
+    ->setVisibility(static_cast<GlobalValue::VisibilityTypes>(Viz));
+}
+
+unsigned LLVMGetAlignment(LLVMValueRef Global) {
+  return unwrap<GlobalValue>(Global)->getAlignment();
+}
+
+void LLVMSetAlignment(LLVMValueRef Global, unsigned Bytes) {
+  unwrap<GlobalValue>(Global)->setAlignment(Bytes);
+}
+
+/*--.. Operations on global variables ......................................--*/
+
+LLVMValueRef LLVMAddGlobal(LLVMModuleRef M, LLVMTypeRef Ty, const char *Name) {
+  return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false,
+                                 GlobalValue::ExternalLinkage, 0, Name));
+}
+
+LLVMValueRef LLVMAddGlobalInAddressSpace(LLVMModuleRef M, LLVMTypeRef Ty,
+                                         const char *Name,
+                                         unsigned AddressSpace) {
+  return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false,
+                                 GlobalValue::ExternalLinkage, 0, Name, 0,
+                                 GlobalVariable::NotThreadLocal, AddressSpace));
+}
+
+LLVMValueRef LLVMGetNamedGlobal(LLVMModuleRef M, const char *Name) {
+  return wrap(unwrap(M)->getNamedGlobal(Name));
+}
+
+LLVMValueRef LLVMGetFirstGlobal(LLVMModuleRef M) {
+  Module *Mod = unwrap(M);
+  Module::global_iterator I = Mod->global_begin();
+  if (I == Mod->global_end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMValueRef LLVMGetLastGlobal(LLVMModuleRef M) {
+  Module *Mod = unwrap(M);
+  Module::global_iterator I = Mod->global_end();
+  if (I == Mod->global_begin())
+    return 0;
+  return wrap(--I);
+}
+
+LLVMValueRef LLVMGetNextGlobal(LLVMValueRef GlobalVar) {
+  GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar);
+  Module::global_iterator I = GV;
+  if (++I == GV->getParent()->global_end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMValueRef LLVMGetPreviousGlobal(LLVMValueRef GlobalVar) {
+  GlobalVariable *GV = unwrap<GlobalVariable>(GlobalVar);
+  Module::global_iterator I = GV;
+  if (I == GV->getParent()->global_begin())
+    return 0;
+  return wrap(--I);
+}
+
+void LLVMDeleteGlobal(LLVMValueRef GlobalVar) {
+  unwrap<GlobalVariable>(GlobalVar)->eraseFromParent();
+}
+
+LLVMValueRef LLVMGetInitializer(LLVMValueRef GlobalVar) {
+  GlobalVariable* GV = unwrap<GlobalVariable>(GlobalVar);
+  if ( !GV->hasInitializer() )
+    return 0;
+  return wrap(GV->getInitializer());
+}
+
+void LLVMSetInitializer(LLVMValueRef GlobalVar, LLVMValueRef ConstantVal) {
+  unwrap<GlobalVariable>(GlobalVar)
+    ->setInitializer(unwrap<Constant>(ConstantVal));
+}
+
+LLVMBool LLVMIsThreadLocal(LLVMValueRef GlobalVar) {
+  return unwrap<GlobalVariable>(GlobalVar)->isThreadLocal();
+}
+
+void LLVMSetThreadLocal(LLVMValueRef GlobalVar, LLVMBool IsThreadLocal) {
+  unwrap<GlobalVariable>(GlobalVar)->setThreadLocal(IsThreadLocal != 0);
+}
+
+LLVMBool LLVMIsGlobalConstant(LLVMValueRef GlobalVar) {
+  return unwrap<GlobalVariable>(GlobalVar)->isConstant();
+}
+
+void LLVMSetGlobalConstant(LLVMValueRef GlobalVar, LLVMBool IsConstant) {
+  unwrap<GlobalVariable>(GlobalVar)->setConstant(IsConstant != 0);
+}
+
+/*--.. Operations on aliases ......................................--*/
+
+LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee,
+                          const char *Name) {
+  return wrap(new GlobalAlias(unwrap(Ty), GlobalValue::ExternalLinkage, Name,
+                              unwrap<Constant>(Aliasee), unwrap (M)));
+}
+
+/*--.. Operations on functions .............................................--*/
+
+LLVMValueRef LLVMAddFunction(LLVMModuleRef M, const char *Name,
+                             LLVMTypeRef FunctionTy) {
+  return wrap(Function::Create(unwrap<FunctionType>(FunctionTy),
+                               GlobalValue::ExternalLinkage, Name, unwrap(M)));
+}
+
+LLVMValueRef LLVMGetNamedFunction(LLVMModuleRef M, const char *Name) {
+  return wrap(unwrap(M)->getFunction(Name));
+}
+
+LLVMValueRef LLVMGetFirstFunction(LLVMModuleRef M) {
+  Module *Mod = unwrap(M);
+  Module::iterator I = Mod->begin();
+  if (I == Mod->end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMValueRef LLVMGetLastFunction(LLVMModuleRef M) {
+  Module *Mod = unwrap(M);
+  Module::iterator I = Mod->end();
+  if (I == Mod->begin())
+    return 0;
+  return wrap(--I);
+}
+
+LLVMValueRef LLVMGetNextFunction(LLVMValueRef Fn) {
+  Function *Func = unwrap<Function>(Fn);
+  Module::iterator I = Func;
+  if (++I == Func->getParent()->end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMValueRef LLVMGetPreviousFunction(LLVMValueRef Fn) {
+  Function *Func = unwrap<Function>(Fn);
+  Module::iterator I = Func;
+  if (I == Func->getParent()->begin())
+    return 0;
+  return wrap(--I);
+}
+
+void LLVMDeleteFunction(LLVMValueRef Fn) {
+  unwrap<Function>(Fn)->eraseFromParent();
+}
+
+unsigned LLVMGetIntrinsicID(LLVMValueRef Fn) {
+  if (Function *F = dyn_cast<Function>(unwrap(Fn)))
+    return F->getIntrinsicID();
+  return 0;
+}
+
+unsigned LLVMGetFunctionCallConv(LLVMValueRef Fn) {
+  return unwrap<Function>(Fn)->getCallingConv();
+}
+
+void LLVMSetFunctionCallConv(LLVMValueRef Fn, unsigned CC) {
+  return unwrap<Function>(Fn)->setCallingConv(
+    static_cast<CallingConv::ID>(CC));
+}
+
+const char *LLVMGetGC(LLVMValueRef Fn) {
+  Function *F = unwrap<Function>(Fn);
+  return F->hasGC()? F->getGC() : 0;
+}
+
+void LLVMSetGC(LLVMValueRef Fn, const char *GC) {
+  Function *F = unwrap<Function>(Fn);
+  if (GC)
+    F->setGC(GC);
+  else
+    F->clearGC();
+}
+
+void LLVMAddFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA) {
+  Function *Func = unwrap<Function>(Fn);
+  const AttributeSet PAL = Func->getAttributes();
+  AttrBuilder B(PA);
+  const AttributeSet PALnew =
+    PAL.addAttributes(Func->getContext(), AttributeSet::FunctionIndex,
+                      AttributeSet::get(Func->getContext(),
+                                        AttributeSet::FunctionIndex, B));
+  Func->setAttributes(PALnew);
+}
+
+void LLVMRemoveFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA) {
+  Function *Func = unwrap<Function>(Fn);
+  const AttributeSet PAL = Func->getAttributes();
+  AttrBuilder B(PA);
+  const AttributeSet PALnew =
+    PAL.removeAttributes(Func->getContext(), AttributeSet::FunctionIndex,
+                         AttributeSet::get(Func->getContext(),
+                                           AttributeSet::FunctionIndex, B));
+  Func->setAttributes(PALnew);
+}
+
+LLVMAttribute LLVMGetFunctionAttr(LLVMValueRef Fn) {
+  Function *Func = unwrap<Function>(Fn);
+  const AttributeSet PAL = Func->getAttributes();
+  return (LLVMAttribute)PAL.Raw(AttributeSet::FunctionIndex);
+}
+
+/*--.. Operations on parameters ............................................--*/
+
+unsigned LLVMCountParams(LLVMValueRef FnRef) {
+  // This function is strictly redundant to
+  //   LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(FnRef)))
+  return unwrap<Function>(FnRef)->arg_size();
+}
+
+void LLVMGetParams(LLVMValueRef FnRef, LLVMValueRef *ParamRefs) {
+  Function *Fn = unwrap<Function>(FnRef);
+  for (Function::arg_iterator I = Fn->arg_begin(),
+                              E = Fn->arg_end(); I != E; I++)
+    *ParamRefs++ = wrap(I);
+}
+
+LLVMValueRef LLVMGetParam(LLVMValueRef FnRef, unsigned index) {
+  Function::arg_iterator AI = unwrap<Function>(FnRef)->arg_begin();
+  while (index --> 0)
+    AI++;
+  return wrap(AI);
+}
+
+LLVMValueRef LLVMGetParamParent(LLVMValueRef V) {
+  return wrap(unwrap<Argument>(V)->getParent());
+}
+
+LLVMValueRef LLVMGetFirstParam(LLVMValueRef Fn) {
+  Function *Func = unwrap<Function>(Fn);
+  Function::arg_iterator I = Func->arg_begin();
+  if (I == Func->arg_end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) {
+  Function *Func = unwrap<Function>(Fn);
+  Function::arg_iterator I = Func->arg_end();
+  if (I == Func->arg_begin())
+    return 0;
+  return wrap(--I);
+}
+
+LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) {
+  Argument *A = unwrap<Argument>(Arg);
+  Function::arg_iterator I = A;
+  if (++I == A->getParent()->arg_end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) {
+  Argument *A = unwrap<Argument>(Arg);
+  Function::arg_iterator I = A;
+  if (I == A->getParent()->arg_begin())
+    return 0;
+  return wrap(--I);
+}
+
+void LLVMAddAttribute(LLVMValueRef Arg, LLVMAttribute PA) {
+  Argument *A = unwrap<Argument>(Arg);
+  AttrBuilder B(PA);
+  A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1,  B));
+}
+
+void LLVMRemoveAttribute(LLVMValueRef Arg, LLVMAttribute PA) {
+  Argument *A = unwrap<Argument>(Arg);
+  AttrBuilder B(PA);
+  A->removeAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1,  B));
+}
+
+LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg) {
+  Argument *A = unwrap<Argument>(Arg);
+  return (LLVMAttribute)A->getParent()->getAttributes().
+    Raw(A->getArgNo()+1);
+}
+  
+
+void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
+  Argument *A = unwrap<Argument>(Arg);
+  AttrBuilder B;
+  B.addAlignmentAttr(align);
+  A->addAttr(AttributeSet::get(A->getContext(),A->getArgNo() + 1, B));
+}
+
+/*--.. Operations on basic blocks ..........................................--*/
+
+LLVMValueRef LLVMBasicBlockAsValue(LLVMBasicBlockRef BB) {
+  return wrap(static_cast<Value*>(unwrap(BB)));
+}
+
+LLVMBool LLVMValueIsBasicBlock(LLVMValueRef Val) {
+  return isa<BasicBlock>(unwrap(Val));
+}
+
+LLVMBasicBlockRef LLVMValueAsBasicBlock(LLVMValueRef Val) {
+  return wrap(unwrap<BasicBlock>(Val));
+}
+
+LLVMValueRef LLVMGetBasicBlockParent(LLVMBasicBlockRef BB) {
+  return wrap(unwrap(BB)->getParent());
+}
+
+LLVMValueRef LLVMGetBasicBlockTerminator(LLVMBasicBlockRef BB) {
+  return wrap(unwrap(BB)->getTerminator());
+}
+
+unsigned LLVMCountBasicBlocks(LLVMValueRef FnRef) {
+  return unwrap<Function>(FnRef)->size();
+}
+
+void LLVMGetBasicBlocks(LLVMValueRef FnRef, LLVMBasicBlockRef *BasicBlocksRefs){
+  Function *Fn = unwrap<Function>(FnRef);
+  for (Function::iterator I = Fn->begin(), E = Fn->end(); I != E; I++)
+    *BasicBlocksRefs++ = wrap(I);
+}
+
+LLVMBasicBlockRef LLVMGetEntryBasicBlock(LLVMValueRef Fn) {
+  return wrap(&unwrap<Function>(Fn)->getEntryBlock());
+}
+
+LLVMBasicBlockRef LLVMGetFirstBasicBlock(LLVMValueRef Fn) {
+  Function *Func = unwrap<Function>(Fn);
+  Function::iterator I = Func->begin();
+  if (I == Func->end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMBasicBlockRef LLVMGetLastBasicBlock(LLVMValueRef Fn) {
+  Function *Func = unwrap<Function>(Fn);
+  Function::iterator I = Func->end();
+  if (I == Func->begin())
+    return 0;
+  return wrap(--I);
+}
+
+LLVMBasicBlockRef LLVMGetNextBasicBlock(LLVMBasicBlockRef BB) {
+  BasicBlock *Block = unwrap(BB);
+  Function::iterator I = Block;
+  if (++I == Block->getParent()->end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMBasicBlockRef LLVMGetPreviousBasicBlock(LLVMBasicBlockRef BB) {
+  BasicBlock *Block = unwrap(BB);
+  Function::iterator I = Block;
+  if (I == Block->getParent()->begin())
+    return 0;
+  return wrap(--I);
+}
+
+LLVMBasicBlockRef LLVMAppendBasicBlockInContext(LLVMContextRef C,
+                                                LLVMValueRef FnRef,
+                                                const char *Name) {
+  return wrap(BasicBlock::Create(*unwrap(C), Name, unwrap<Function>(FnRef)));
+}
+
+LLVMBasicBlockRef LLVMAppendBasicBlock(LLVMValueRef FnRef, const char *Name) {
+  return LLVMAppendBasicBlockInContext(LLVMGetGlobalContext(), FnRef, Name);
+}
+
+LLVMBasicBlockRef LLVMInsertBasicBlockInContext(LLVMContextRef C,
+                                                LLVMBasicBlockRef BBRef,
+                                                const char *Name) {
+  BasicBlock *BB = unwrap(BBRef);
+  return wrap(BasicBlock::Create(*unwrap(C), Name, BB->getParent(), BB));
+}
+
+LLVMBasicBlockRef LLVMInsertBasicBlock(LLVMBasicBlockRef BBRef,
+                                       const char *Name) {
+  return LLVMInsertBasicBlockInContext(LLVMGetGlobalContext(), BBRef, Name);
+}
+
+void LLVMDeleteBasicBlock(LLVMBasicBlockRef BBRef) {
+  unwrap(BBRef)->eraseFromParent();
+}
+
+void LLVMRemoveBasicBlockFromParent(LLVMBasicBlockRef BBRef) {
+  unwrap(BBRef)->removeFromParent();
+}
+
+void LLVMMoveBasicBlockBefore(LLVMBasicBlockRef BB, LLVMBasicBlockRef MovePos) {
+  unwrap(BB)->moveBefore(unwrap(MovePos));
+}
+
+void LLVMMoveBasicBlockAfter(LLVMBasicBlockRef BB, LLVMBasicBlockRef MovePos) {
+  unwrap(BB)->moveAfter(unwrap(MovePos));
+}
+
+/*--.. Operations on instructions ..........................................--*/
+
+LLVMBasicBlockRef LLVMGetInstructionParent(LLVMValueRef Inst) {
+  return wrap(unwrap<Instruction>(Inst)->getParent());
+}
+
+LLVMValueRef LLVMGetFirstInstruction(LLVMBasicBlockRef BB) {
+  BasicBlock *Block = unwrap(BB);
+  BasicBlock::iterator I = Block->begin();
+  if (I == Block->end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMValueRef LLVMGetLastInstruction(LLVMBasicBlockRef BB) {
+  BasicBlock *Block = unwrap(BB);
+  BasicBlock::iterator I = Block->end();
+  if (I == Block->begin())
+    return 0;
+  return wrap(--I);
+}
+
+LLVMValueRef LLVMGetNextInstruction(LLVMValueRef Inst) {
+  Instruction *Instr = unwrap<Instruction>(Inst);
+  BasicBlock::iterator I = Instr;
+  if (++I == Instr->getParent()->end())
+    return 0;
+  return wrap(I);
+}
+
+LLVMValueRef LLVMGetPreviousInstruction(LLVMValueRef Inst) {
+  Instruction *Instr = unwrap<Instruction>(Inst);
+  BasicBlock::iterator I = Instr;
+  if (I == Instr->getParent()->begin())
+    return 0;
+  return wrap(--I);
+}
+
+void LLVMInstructionEraseFromParent(LLVMValueRef Inst) {
+  unwrap<Instruction>(Inst)->eraseFromParent();
+}
+
+LLVMIntPredicate LLVMGetICmpPredicate(LLVMValueRef Inst) {
+  if (ICmpInst *I = dyn_cast<ICmpInst>(unwrap(Inst)))
+    return (LLVMIntPredicate)I->getPredicate();
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(unwrap(Inst)))
+    if (CE->getOpcode() == Instruction::ICmp)
+      return (LLVMIntPredicate)CE->getPredicate();
+  return (LLVMIntPredicate)0;
+}
+
+LLVMOpcode LLVMGetInstructionOpcode(LLVMValueRef Inst) {
+  if (Instruction *C = dyn_cast<Instruction>(unwrap(Inst)))
+    return map_to_llvmopcode(C->getOpcode());
+  return (LLVMOpcode)0;
+}
+
+/*--.. Call and invoke instructions ........................................--*/
+
+unsigned LLVMGetInstructionCallConv(LLVMValueRef Instr) {
+  Value *V = unwrap(Instr);
+  if (CallInst *CI = dyn_cast<CallInst>(V))
+    return CI->getCallingConv();
+  if (InvokeInst *II = dyn_cast<InvokeInst>(V))
+    return II->getCallingConv();
+  llvm_unreachable("LLVMGetInstructionCallConv applies only to call and invoke!");
+}
+
+void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
+  Value *V = unwrap(Instr);
+  if (CallInst *CI = dyn_cast<CallInst>(V))
+    return CI->setCallingConv(static_cast<CallingConv::ID>(CC));
+  else if (InvokeInst *II = dyn_cast<InvokeInst>(V))
+    return II->setCallingConv(static_cast<CallingConv::ID>(CC));
+  llvm_unreachable("LLVMSetInstructionCallConv applies only to call and invoke!");
+}
+
+void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index, 
+                           LLVMAttribute PA) {
+  CallSite Call = CallSite(unwrap<Instruction>(Instr));
+  AttrBuilder B(PA);
+  Call.setAttributes(
+    Call.getAttributes().addAttributes(Call->getContext(), index,
+                                       AttributeSet::get(Call->getContext(),
+                                                         index, B)));
+}
+
+void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index, 
+                              LLVMAttribute PA) {
+  CallSite Call = CallSite(unwrap<Instruction>(Instr));
+  AttrBuilder B(PA);
+  Call.setAttributes(Call.getAttributes()
+                       .removeAttributes(Call->getContext(), index,
+                                         AttributeSet::get(Call->getContext(),
+                                                           index, B)));
+}
+
+void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index, 
+                                unsigned align) {
+  CallSite Call = CallSite(unwrap<Instruction>(Instr));
+  AttrBuilder B;
+  B.addAlignmentAttr(align);
+  Call.setAttributes(Call.getAttributes()
+                       .addAttributes(Call->getContext(), index,
+                                      AttributeSet::get(Call->getContext(),
+                                                        index, B)));
+}
+
+/*--.. Operations on call instructions (only) ..............................--*/
+
+LLVMBool LLVMIsTailCall(LLVMValueRef Call) {
+  return unwrap<CallInst>(Call)->isTailCall();
+}
+
+void LLVMSetTailCall(LLVMValueRef Call, LLVMBool isTailCall) {
+  unwrap<CallInst>(Call)->setTailCall(isTailCall);
+}
+
+/*--.. Operations on switch instructions (only) ............................--*/
+
+LLVMBasicBlockRef LLVMGetSwitchDefaultDest(LLVMValueRef Switch) {
+  return wrap(unwrap<SwitchInst>(Switch)->getDefaultDest());
+}
+
+/*--.. Operations on phi nodes .............................................--*/
+
+void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues,
+                     LLVMBasicBlockRef *IncomingBlocks, unsigned Count) {
+  PHINode *PhiVal = unwrap<PHINode>(PhiNode);
+  for (unsigned I = 0; I != Count; ++I)
+    PhiVal->addIncoming(unwrap(IncomingValues[I]), unwrap(IncomingBlocks[I]));
+}
+
+unsigned LLVMCountIncoming(LLVMValueRef PhiNode) {
+  return unwrap<PHINode>(PhiNode)->getNumIncomingValues();
+}
+
+LLVMValueRef LLVMGetIncomingValue(LLVMValueRef PhiNode, unsigned Index) {
+  return wrap(unwrap<PHINode>(PhiNode)->getIncomingValue(Index));
+}
+
+LLVMBasicBlockRef LLVMGetIncomingBlock(LLVMValueRef PhiNode, unsigned Index) {
+  return wrap(unwrap<PHINode>(PhiNode)->getIncomingBlock(Index));
+}
+
+
+/*===-- Instruction builders ----------------------------------------------===*/
+
+LLVMBuilderRef LLVMCreateBuilderInContext(LLVMContextRef C) {
+  return wrap(new IRBuilder<>(*unwrap(C)));
+}
+
+LLVMBuilderRef LLVMCreateBuilder(void) {
+  return LLVMCreateBuilderInContext(LLVMGetGlobalContext());
+}
+
+void LLVMPositionBuilder(LLVMBuilderRef Builder, LLVMBasicBlockRef Block,
+                         LLVMValueRef Instr) {
+  BasicBlock *BB = unwrap(Block);
+  Instruction *I = Instr? unwrap<Instruction>(Instr) : (Instruction*) BB->end();
+  unwrap(Builder)->SetInsertPoint(BB, I);
+}
+
+void LLVMPositionBuilderBefore(LLVMBuilderRef Builder, LLVMValueRef Instr) {
+  Instruction *I = unwrap<Instruction>(Instr);
+  unwrap(Builder)->SetInsertPoint(I->getParent(), I);
+}
+
+void LLVMPositionBuilderAtEnd(LLVMBuilderRef Builder, LLVMBasicBlockRef Block) {
+  BasicBlock *BB = unwrap(Block);
+  unwrap(Builder)->SetInsertPoint(BB);
+}
+
+LLVMBasicBlockRef LLVMGetInsertBlock(LLVMBuilderRef Builder) {
+   return wrap(unwrap(Builder)->GetInsertBlock());
+}
+
+void LLVMClearInsertionPosition(LLVMBuilderRef Builder) {
+  unwrap(Builder)->ClearInsertionPoint();
+}
+
+void LLVMInsertIntoBuilder(LLVMBuilderRef Builder, LLVMValueRef Instr) {
+  unwrap(Builder)->Insert(unwrap<Instruction>(Instr));
+}
+
+void LLVMInsertIntoBuilderWithName(LLVMBuilderRef Builder, LLVMValueRef Instr,
+                                   const char *Name) {
+  unwrap(Builder)->Insert(unwrap<Instruction>(Instr), Name);
+}
+
+void LLVMDisposeBuilder(LLVMBuilderRef Builder) {
+  delete unwrap(Builder);
+}
+
+/*--.. Metadata builders ...................................................--*/
+
+void LLVMSetCurrentDebugLocation(LLVMBuilderRef Builder, LLVMValueRef L) {
+  MDNode *Loc = L ? unwrap<MDNode>(L) : NULL;
+  unwrap(Builder)->SetCurrentDebugLocation(DebugLoc::getFromDILocation(Loc));
+}
+
+LLVMValueRef LLVMGetCurrentDebugLocation(LLVMBuilderRef Builder) {
+  return wrap(unwrap(Builder)->getCurrentDebugLocation()
+              .getAsMDNode(unwrap(Builder)->getContext()));
+}
+
+void LLVMSetInstDebugLocation(LLVMBuilderRef Builder, LLVMValueRef Inst) {
+  unwrap(Builder)->SetInstDebugLocation(unwrap<Instruction>(Inst));
+}
+
+
+/*--.. Instruction builders ................................................--*/
+
+LLVMValueRef LLVMBuildRetVoid(LLVMBuilderRef B) {
+  return wrap(unwrap(B)->CreateRetVoid());
+}
+
+LLVMValueRef LLVMBuildRet(LLVMBuilderRef B, LLVMValueRef V) {
+  return wrap(unwrap(B)->CreateRet(unwrap(V)));
+}
+
+LLVMValueRef LLVMBuildAggregateRet(LLVMBuilderRef B, LLVMValueRef *RetVals,
+                                   unsigned N) {
+  return wrap(unwrap(B)->CreateAggregateRet(unwrap(RetVals), N));
+}
+
+LLVMValueRef LLVMBuildBr(LLVMBuilderRef B, LLVMBasicBlockRef Dest) {
+  return wrap(unwrap(B)->CreateBr(unwrap(Dest)));
+}
+
+LLVMValueRef LLVMBuildCondBr(LLVMBuilderRef B, LLVMValueRef If,
+                             LLVMBasicBlockRef Then, LLVMBasicBlockRef Else) {
+  return wrap(unwrap(B)->CreateCondBr(unwrap(If), unwrap(Then), unwrap(Else)));
+}
+
+LLVMValueRef LLVMBuildSwitch(LLVMBuilderRef B, LLVMValueRef V,
+                             LLVMBasicBlockRef Else, unsigned NumCases) {
+  return wrap(unwrap(B)->CreateSwitch(unwrap(V), unwrap(Else), NumCases));
+}
+
+LLVMValueRef LLVMBuildIndirectBr(LLVMBuilderRef B, LLVMValueRef Addr,
+                                 unsigned NumDests) {
+  return wrap(unwrap(B)->CreateIndirectBr(unwrap(Addr), NumDests));
+}
+
+LLVMValueRef LLVMBuildInvoke(LLVMBuilderRef B, LLVMValueRef Fn,
+                             LLVMValueRef *Args, unsigned NumArgs,
+                             LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch,
+                             const char *Name) {
+  return wrap(unwrap(B)->CreateInvoke(unwrap(Fn), unwrap(Then), unwrap(Catch),
+                                      makeArrayRef(unwrap(Args), NumArgs),
+                                      Name));
+}
+
+LLVMValueRef LLVMBuildLandingPad(LLVMBuilderRef B, LLVMTypeRef Ty,
+                                 LLVMValueRef PersFn, unsigned NumClauses,
+                                 const char *Name) {
+  return wrap(unwrap(B)->CreateLandingPad(unwrap(Ty),
+                                          cast<Function>(unwrap(PersFn)),
+                                          NumClauses, Name));
+}
+
+LLVMValueRef LLVMBuildResume(LLVMBuilderRef B, LLVMValueRef Exn) {
+  return wrap(unwrap(B)->CreateResume(unwrap(Exn)));
+}
+
+LLVMValueRef LLVMBuildUnreachable(LLVMBuilderRef B) {
+  return wrap(unwrap(B)->CreateUnreachable());
+}
+
+void LLVMAddCase(LLVMValueRef Switch, LLVMValueRef OnVal,
+                 LLVMBasicBlockRef Dest) {
+  unwrap<SwitchInst>(Switch)->addCase(unwrap<ConstantInt>(OnVal), unwrap(Dest));
+}
+
+void LLVMAddDestination(LLVMValueRef IndirectBr, LLVMBasicBlockRef Dest) {
+  unwrap<IndirectBrInst>(IndirectBr)->addDestination(unwrap(Dest));
+}
+
+void LLVMAddClause(LLVMValueRef LandingPad, LLVMValueRef ClauseVal) {
+  unwrap<LandingPadInst>(LandingPad)->
+    addClause(cast<Constant>(unwrap(ClauseVal)));
+}
+
+void LLVMSetCleanup(LLVMValueRef LandingPad, LLVMBool Val) {
+  unwrap<LandingPadInst>(LandingPad)->setCleanup(Val);
+}
+
+/*--.. Arithmetic ..........................................................--*/
+
+LLVMValueRef LLVMBuildAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateAdd(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildNSWAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateNSWAdd(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildNUWAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateNUWAdd(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildFAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateFAdd(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateSub(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildNSWSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateNSWSub(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildNUWSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateNUWSub(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildFSub(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateFSub(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateMul(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildNSWMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateNSWMul(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildNUWMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateNUWMul(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildFMul(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateFMul(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildUDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateUDiv(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildSDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateSDiv(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildExactSDiv(LLVMBuilderRef B, LLVMValueRef LHS,
+                                LLVMValueRef RHS, const char *Name) {
+  return wrap(unwrap(B)->CreateExactSDiv(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildFDiv(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateFDiv(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildURem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateURem(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildSRem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateSRem(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildFRem(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateFRem(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildShl(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateShl(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildLShr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateLShr(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildAShr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateAShr(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildAnd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateAnd(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildOr(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                         const char *Name) {
+  return wrap(unwrap(B)->CreateOr(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildXor(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
+                          const char *Name) {
+  return wrap(unwrap(B)->CreateXor(unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildBinOp(LLVMBuilderRef B, LLVMOpcode Op,
+                            LLVMValueRef LHS, LLVMValueRef RHS,
+                            const char *Name) {
+  return wrap(unwrap(B)->CreateBinOp(Instruction::BinaryOps(map_from_llvmopcode(Op)), unwrap(LHS),
+                                     unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildNeg(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
+  return wrap(unwrap(B)->CreateNeg(unwrap(V), Name));
+}
+
+LLVMValueRef LLVMBuildNSWNeg(LLVMBuilderRef B, LLVMValueRef V,
+                             const char *Name) {
+  return wrap(unwrap(B)->CreateNSWNeg(unwrap(V), Name));
+}
+
+LLVMValueRef LLVMBuildNUWNeg(LLVMBuilderRef B, LLVMValueRef V,
+                             const char *Name) {
+  return wrap(unwrap(B)->CreateNUWNeg(unwrap(V), Name));
+}
+
+LLVMValueRef LLVMBuildFNeg(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
+  return wrap(unwrap(B)->CreateFNeg(unwrap(V), Name));
+}
+
+LLVMValueRef LLVMBuildNot(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
+  return wrap(unwrap(B)->CreateNot(unwrap(V), Name));
+}
+
+/*--.. Memory ..............................................................--*/
+
+LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
+                             const char *Name) {
+  Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
+  Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
+  AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
+  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), 
+                                               ITy, unwrap(Ty), AllocSize, 
+                                               0, 0, "");
+  return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
+}
+
+LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
+                                  LLVMValueRef Val, const char *Name) {
+  Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
+  Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
+  AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
+  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), 
+                                               ITy, unwrap(Ty), AllocSize, 
+                                               unwrap(Val), 0, "");
+  return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
+}
+
+LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef B, LLVMTypeRef Ty,
+                             const char *Name) {
+  return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), 0, Name));
+}
+
+LLVMValueRef LLVMBuildArrayAlloca(LLVMBuilderRef B, LLVMTypeRef Ty,
+                                  LLVMValueRef Val, const char *Name) {
+  return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), unwrap(Val), Name));
+}
+
+LLVMValueRef LLVMBuildFree(LLVMBuilderRef B, LLVMValueRef PointerVal) {
+  return wrap(unwrap(B)->Insert(
+     CallInst::CreateFree(unwrap(PointerVal), unwrap(B)->GetInsertBlock())));
+}
+
+
+LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateLoad(unwrap(PointerVal), Name));
+}
+
+LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val, 
+                            LLVMValueRef PointerVal) {
+  return wrap(unwrap(B)->CreateStore(unwrap(Val), unwrap(PointerVal)));
+}
+
+LLVMValueRef LLVMBuildGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
+                          LLVMValueRef *Indices, unsigned NumIndices,
+                          const char *Name) {
+  ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
+  return wrap(unwrap(B)->CreateGEP(unwrap(Pointer), IdxList, Name));
+}
+
+LLVMValueRef LLVMBuildInBoundsGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
+                                  LLVMValueRef *Indices, unsigned NumIndices,
+                                  const char *Name) {
+  ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
+  return wrap(unwrap(B)->CreateInBoundsGEP(unwrap(Pointer), IdxList, Name));
+}
+
+LLVMValueRef LLVMBuildStructGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
+                                unsigned Idx, const char *Name) {
+  return wrap(unwrap(B)->CreateStructGEP(unwrap(Pointer), Idx, Name));
+}
+
+LLVMValueRef LLVMBuildGlobalString(LLVMBuilderRef B, const char *Str,
+                                   const char *Name) {
+  return wrap(unwrap(B)->CreateGlobalString(Str, Name));
+}
+
+LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str,
+                                      const char *Name) {
+  return wrap(unwrap(B)->CreateGlobalStringPtr(Str, Name));
+}
+
+LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) {
+  Value *P = unwrap<Value>(MemAccessInst);
+  if (LoadInst *LI = dyn_cast<LoadInst>(P))
+    return LI->isVolatile();
+  return cast<StoreInst>(P)->isVolatile();
+}
+
+void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) {
+  Value *P = unwrap<Value>(MemAccessInst);
+  if (LoadInst *LI = dyn_cast<LoadInst>(P))
+    return LI->setVolatile(isVolatile);
+  return cast<StoreInst>(P)->setVolatile(isVolatile);
+}
+
+/*--.. Casts ...............................................................--*/
+
+LLVMValueRef LLVMBuildTrunc(LLVMBuilderRef B, LLVMValueRef Val,
+                            LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateTrunc(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildZExt(LLVMBuilderRef B, LLVMValueRef Val,
+                           LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateZExt(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildSExt(LLVMBuilderRef B, LLVMValueRef Val,
+                           LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateSExt(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildFPToUI(LLVMBuilderRef B, LLVMValueRef Val,
+                             LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateFPToUI(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildFPToSI(LLVMBuilderRef B, LLVMValueRef Val,
+                             LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateFPToSI(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildUIToFP(LLVMBuilderRef B, LLVMValueRef Val,
+                             LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateUIToFP(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildSIToFP(LLVMBuilderRef B, LLVMValueRef Val,
+                             LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateSIToFP(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildFPTrunc(LLVMBuilderRef B, LLVMValueRef Val,
+                              LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateFPTrunc(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildFPExt(LLVMBuilderRef B, LLVMValueRef Val,
+                            LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateFPExt(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildPtrToInt(LLVMBuilderRef B, LLVMValueRef Val,
+                               LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreatePtrToInt(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildIntToPtr(LLVMBuilderRef B, LLVMValueRef Val,
+                               LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateIntToPtr(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildBitCast(LLVMBuilderRef B, LLVMValueRef Val,
+                              LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateBitCast(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildZExtOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
+                                    LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateZExtOrBitCast(unwrap(Val), unwrap(DestTy),
+                                             Name));
+}
+
+LLVMValueRef LLVMBuildSExtOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
+                                    LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateSExtOrBitCast(unwrap(Val), unwrap(DestTy),
+                                             Name));
+}
+
+LLVMValueRef LLVMBuildTruncOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
+                                     LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateTruncOrBitCast(unwrap(Val), unwrap(DestTy),
+                                              Name));
+}
+
+LLVMValueRef LLVMBuildCast(LLVMBuilderRef B, LLVMOpcode Op, LLVMValueRef Val,
+                           LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateCast(Instruction::CastOps(map_from_llvmopcode(Op)), unwrap(Val),
+                                    unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildPointerCast(LLVMBuilderRef B, LLVMValueRef Val,
+                                  LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreatePointerCast(unwrap(Val), unwrap(DestTy), Name));
+}
+
+LLVMValueRef LLVMBuildIntCast(LLVMBuilderRef B, LLVMValueRef Val,
+                              LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateIntCast(unwrap(Val), unwrap(DestTy),
+                                       /*isSigned*/true, Name));
+}
+
+LLVMValueRef LLVMBuildFPCast(LLVMBuilderRef B, LLVMValueRef Val,
+                             LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateFPCast(unwrap(Val), unwrap(DestTy), Name));
+}
+
+/*--.. Comparisons .........................................................--*/
+
+LLVMValueRef LLVMBuildICmp(LLVMBuilderRef B, LLVMIntPredicate Op,
+                           LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateICmp(static_cast<ICmpInst::Predicate>(Op),
+                                    unwrap(LHS), unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildFCmp(LLVMBuilderRef B, LLVMRealPredicate Op,
+                           LLVMValueRef LHS, LLVMValueRef RHS,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateFCmp(static_cast<FCmpInst::Predicate>(Op),
+                                    unwrap(LHS), unwrap(RHS), Name));
+}
+
+/*--.. Miscellaneous instructions ..........................................--*/
+
+LLVMValueRef LLVMBuildPhi(LLVMBuilderRef B, LLVMTypeRef Ty, const char *Name) {
+  return wrap(unwrap(B)->CreatePHI(unwrap(Ty), 0, Name));
+}
+
+LLVMValueRef LLVMBuildCall(LLVMBuilderRef B, LLVMValueRef Fn,
+                           LLVMValueRef *Args, unsigned NumArgs,
+                           const char *Name) {
+  return wrap(unwrap(B)->CreateCall(unwrap(Fn),
+                                    makeArrayRef(unwrap(Args), NumArgs),
+                                    Name));
+}
+
+LLVMValueRef LLVMBuildSelect(LLVMBuilderRef B, LLVMValueRef If,
+                             LLVMValueRef Then, LLVMValueRef Else,
+                             const char *Name) {
+  return wrap(unwrap(B)->CreateSelect(unwrap(If), unwrap(Then), unwrap(Else),
+                                      Name));
+}
+
+LLVMValueRef LLVMBuildVAArg(LLVMBuilderRef B, LLVMValueRef List,
+                            LLVMTypeRef Ty, const char *Name) {
+  return wrap(unwrap(B)->CreateVAArg(unwrap(List), unwrap(Ty), Name));
+}
+
+LLVMValueRef LLVMBuildExtractElement(LLVMBuilderRef B, LLVMValueRef VecVal,
+                                      LLVMValueRef Index, const char *Name) {
+  return wrap(unwrap(B)->CreateExtractElement(unwrap(VecVal), unwrap(Index),
+                                              Name));
+}
+
+LLVMValueRef LLVMBuildInsertElement(LLVMBuilderRef B, LLVMValueRef VecVal,
+                                    LLVMValueRef EltVal, LLVMValueRef Index,
+                                    const char *Name) {
+  return wrap(unwrap(B)->CreateInsertElement(unwrap(VecVal), unwrap(EltVal),
+                                             unwrap(Index), Name));
+}
+
+LLVMValueRef LLVMBuildShuffleVector(LLVMBuilderRef B, LLVMValueRef V1,
+                                    LLVMValueRef V2, LLVMValueRef Mask,
+                                    const char *Name) {
+  return wrap(unwrap(B)->CreateShuffleVector(unwrap(V1), unwrap(V2),
+                                             unwrap(Mask), Name));
+}
+
+LLVMValueRef LLVMBuildExtractValue(LLVMBuilderRef B, LLVMValueRef AggVal,
+                                   unsigned Index, const char *Name) {
+  return wrap(unwrap(B)->CreateExtractValue(unwrap(AggVal), Index, Name));
+}
+
+LLVMValueRef LLVMBuildInsertValue(LLVMBuilderRef B, LLVMValueRef AggVal,
+                                  LLVMValueRef EltVal, unsigned Index,
+                                  const char *Name) {
+  return wrap(unwrap(B)->CreateInsertValue(unwrap(AggVal), unwrap(EltVal),
+                                           Index, Name));
+}
+
+LLVMValueRef LLVMBuildIsNull(LLVMBuilderRef B, LLVMValueRef Val,
+                             const char *Name) {
+  return wrap(unwrap(B)->CreateIsNull(unwrap(Val), Name));
+}
+
+LLVMValueRef LLVMBuildIsNotNull(LLVMBuilderRef B, LLVMValueRef Val,
+                                const char *Name) {
+  return wrap(unwrap(B)->CreateIsNotNull(unwrap(Val), Name));
+}
+
+LLVMValueRef LLVMBuildPtrDiff(LLVMBuilderRef B, LLVMValueRef LHS,
+                              LLVMValueRef RHS, const char *Name) {
+  return wrap(unwrap(B)->CreatePtrDiff(unwrap(LHS), unwrap(RHS), Name));
+}
+
+
+/*===-- Module providers --------------------------------------------------===*/
+
+LLVMModuleProviderRef
+LLVMCreateModuleProviderForExistingModule(LLVMModuleRef M) {
+  return reinterpret_cast<LLVMModuleProviderRef>(M);
+}
+
+void LLVMDisposeModuleProvider(LLVMModuleProviderRef MP) {
+  delete unwrap(MP);
+}
+
+
+/*===-- Memory buffers ----------------------------------------------------===*/
+
+LLVMBool LLVMCreateMemoryBufferWithContentsOfFile(
+    const char *Path,
+    LLVMMemoryBufferRef *OutMemBuf,
+    char **OutMessage) {
+
+  OwningPtr<MemoryBuffer> MB;
+  error_code ec;
+  if (!(ec = MemoryBuffer::getFile(Path, MB))) {
+    *OutMemBuf = wrap(MB.take());
+    return 0;
+  }
+
+  *OutMessage = strdup(ec.message().c_str());
+  return 1;
+}
+
+LLVMBool LLVMCreateMemoryBufferWithSTDIN(LLVMMemoryBufferRef *OutMemBuf,
+                                         char **OutMessage) {
+  OwningPtr<MemoryBuffer> MB;
+  error_code ec;
+  if (!(ec = MemoryBuffer::getSTDIN(MB))) {
+    *OutMemBuf = wrap(MB.take());
+    return 0;
+  }
+
+  *OutMessage = strdup(ec.message().c_str());
+  return 1;
+}
+
+LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRange(
+    const char *InputData,
+    size_t InputDataLength,
+    const char *BufferName,
+    LLVMBool RequiresNullTerminator) {
+
+  return wrap(MemoryBuffer::getMemBuffer(
+      StringRef(InputData, InputDataLength),
+      StringRef(BufferName),
+      RequiresNullTerminator));
+}
+
+LLVMMemoryBufferRef LLVMCreateMemoryBufferWithMemoryRangeCopy(
+    const char *InputData,
+    size_t InputDataLength,
+    const char *BufferName) {
+
+  return wrap(MemoryBuffer::getMemBufferCopy(
+      StringRef(InputData, InputDataLength),
+      StringRef(BufferName)));
+}
+
+
+void LLVMDisposeMemoryBuffer(LLVMMemoryBufferRef MemBuf) {
+  delete unwrap(MemBuf);
+}
+
+/*===-- Pass Registry -----------------------------------------------------===*/
+
+LLVMPassRegistryRef LLVMGetGlobalPassRegistry(void) {
+  return wrap(PassRegistry::getPassRegistry());
+}
+
+/*===-- Pass Manager ------------------------------------------------------===*/
+
+LLVMPassManagerRef LLVMCreatePassManager() {
+  return wrap(new PassManager());
+}
+
+LLVMPassManagerRef LLVMCreateFunctionPassManagerForModule(LLVMModuleRef M) {
+  return wrap(new FunctionPassManager(unwrap(M)));
+}
+
+LLVMPassManagerRef LLVMCreateFunctionPassManager(LLVMModuleProviderRef P) {
+  return LLVMCreateFunctionPassManagerForModule(
+                                            reinterpret_cast<LLVMModuleRef>(P));
+}
+
+LLVMBool LLVMRunPassManager(LLVMPassManagerRef PM, LLVMModuleRef M) {
+  return unwrap<PassManager>(PM)->run(*unwrap(M));
+}
+
+LLVMBool LLVMInitializeFunctionPassManager(LLVMPassManagerRef FPM) {
+  return unwrap<FunctionPassManager>(FPM)->doInitialization();
+}
+
+LLVMBool LLVMRunFunctionPassManager(LLVMPassManagerRef FPM, LLVMValueRef F) {
+  return unwrap<FunctionPassManager>(FPM)->run(*unwrap<Function>(F));
+}
+
+LLVMBool LLVMFinalizeFunctionPassManager(LLVMPassManagerRef FPM) {
+  return unwrap<FunctionPassManager>(FPM)->doFinalization();
+}
+
+void LLVMDisposePassManager(LLVMPassManagerRef PM) {
+  delete unwrap(PM);
+}
+
+/*===-- Threading ------------------------------------------------------===*/
+
+LLVMBool LLVMStartMultithreaded() {
+  return llvm_start_multithreaded();
+}
+
+void LLVMStopMultithreaded() {
+  llvm_stop_multithreaded();
+}
+
+LLVMBool LLVMIsMultithreaded() {
+  return llvm_is_multithreaded();
+}
diff --git a/contrib/llvm/lib/IR/DIBuilder.cpp b/contrib/llvm/lib/IR/DIBuilder.cpp
new file mode 100644
index 000000000000..9d6e84072912
--- /dev/null
+++ b/contrib/llvm/lib/IR/DIBuilder.cpp
@@ -0,0 +1,1101 @@
+//===--- DIBuilder.cpp - Debug Information Builder ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the DIBuilder.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DIBuilder.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+
+using namespace llvm;
+using namespace llvm::dwarf;
+
+static Constant *GetTagConstant(LLVMContext &VMContext, unsigned Tag) {
+  assert((Tag & LLVMDebugVersionMask) == 0 &&
+         "Tag too large for debug encoding!");
+  return ConstantInt::get(Type::getInt32Ty(VMContext), Tag | LLVMDebugVersion);
+}
+
+DIBuilder::DIBuilder(Module &m)
+  : M(m), VMContext(M.getContext()), TheCU(0), TempEnumTypes(0),
+    TempRetainTypes(0), TempSubprograms(0), TempGVs(0), DeclareFn(0),
+    ValueFn(0)
+{}
+
+/// finalize - Construct any deferred debug info descriptors.
+void DIBuilder::finalize() {
+  DIArray Enums = getOrCreateArray(AllEnumTypes);
+  DIType(TempEnumTypes).replaceAllUsesWith(Enums);
+
+  DIArray RetainTypes = getOrCreateArray(AllRetainTypes);
+  DIType(TempRetainTypes).replaceAllUsesWith(RetainTypes);
+
+  DIArray SPs = getOrCreateArray(AllSubprograms);
+  DIType(TempSubprograms).replaceAllUsesWith(SPs);
+  for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+    DISubprogram SP(SPs.getElement(i));
+    SmallVector<Value *, 4> Variables;
+    if (NamedMDNode *NMD = getFnSpecificMDNode(M, SP)) {
+      for (unsigned ii = 0, ee = NMD->getNumOperands(); ii != ee; ++ii)
+        Variables.push_back(NMD->getOperand(ii));
+      NMD->eraseFromParent();
+    }
+    if (MDNode *Temp = SP.getVariablesNodes()) {
+      DIArray AV = getOrCreateArray(Variables);
+      DIType(Temp).replaceAllUsesWith(AV);
+    }
+  }
+
+  DIArray GVs = getOrCreateArray(AllGVs);
+  DIType(TempGVs).replaceAllUsesWith(GVs);
+}
+
+/// getNonCompileUnitScope - If N is compile unit return NULL otherwise return
+/// N.
+static MDNode *getNonCompileUnitScope(MDNode *N) {
+  if (DIDescriptor(N).isCompileUnit())
+    return NULL;
+  return N;
+}
+
+static MDNode *createFilePathPair(LLVMContext &VMContext, StringRef Filename,
+                                  StringRef Directory) {
+  assert(!Filename.empty() && "Unable to create file without name");
+  Value *Pair[] = {
+    MDString::get(VMContext, Filename),
+    MDString::get(VMContext, Directory),
+  };
+  return MDNode::get(VMContext, Pair);
+}
+
+/// createCompileUnit - A CompileUnit provides an anchor for all debugging
+/// information generated during this instance of compilation.
+void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
+                                  StringRef Directory, StringRef Producer,
+                                  bool isOptimized, StringRef Flags,
+                                  unsigned RunTimeVer, StringRef SplitName) {
+  assert(((Lang <= dwarf::DW_LANG_Python && Lang >= dwarf::DW_LANG_C89) ||
+          (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
+         "Invalid Language tag");
+  assert(!Filename.empty() &&
+         "Unable to create compile unit without filename");
+  Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
+  TempEnumTypes = MDNode::getTemporary(VMContext, TElts);
+
+  TempRetainTypes = MDNode::getTemporary(VMContext, TElts);
+
+  TempSubprograms = MDNode::getTemporary(VMContext, TElts);
+
+  TempGVs = MDNode::getTemporary(VMContext, TElts);
+
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_compile_unit),
+    createFilePathPair(VMContext, Filename, Directory),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Lang),
+    MDString::get(VMContext, Producer),
+    ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
+    MDString::get(VMContext, Flags),
+    ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeVer),
+    TempEnumTypes,
+    TempRetainTypes,
+    TempSubprograms,
+    TempGVs,
+    MDString::get(VMContext, SplitName)
+  };
+  TheCU = DICompileUnit(MDNode::get(VMContext, Elts));
+
+  // Create a named metadata so that it is easier to find cu in a module.
+  NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+  NMD->addOperand(TheCU);
+}
+
+/// createFile - Create a file descriptor to hold debugging information
+/// for a file.
+DIFile DIBuilder::createFile(StringRef Filename, StringRef Directory) {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_file_type),
+    createFilePathPair(VMContext, Filename, Directory)
+  };
+  return DIFile(MDNode::get(VMContext, Elts));
+}
+
+/// createEnumerator - Create a single enumerator value.
+DIEnumerator DIBuilder::createEnumerator(StringRef Name, uint64_t Val) {
+  assert(!Name.empty() && "Unable to create enumerator without name");
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_enumerator),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt64Ty(VMContext), Val)
+  };
+  return DIEnumerator(MDNode::get(VMContext, Elts));
+}
+
+/// createNullPtrType - Create C++0x nullptr type.
+DIType DIBuilder::createNullPtrType(StringRef Name) {
+  assert(!Name.empty() && "Unable to create type without name");
+  // nullptr is encoded in DIBasicType format. Line number, filename,
+  // ,size, alignment, offset and flags are always empty here.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_type),
+    NULL, // Filename
+    NULL, //TheCU,
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags;
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0)  // Encoding
+  };
+  return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// createBasicType - Create debugging information entry for a basic
+/// type, e.g 'char'.
+DIBasicType
+DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
+                           uint64_t AlignInBits, unsigned Encoding) {
+  assert(!Name.empty() && "Unable to create type without name");
+  // Basic types are encoded in DIBasicType format. Line number, filename,
+  // offset and flags are always empty here.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_base_type),
+    NULL, // File/directory name
+    NULL, //TheCU,
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags;
+    ConstantInt::get(Type::getInt32Ty(VMContext), Encoding)
+  };
+  return DIBasicType(MDNode::get(VMContext, Elts));
+}
+
+/// createQualifiedType - Create debugging information entry for a qualified
+/// type, e.g. 'const int'.
+DIDerivedType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) {
+  // Qualified types are encoded in DIDerivedType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, Tag),
+    NULL, // Filename
+    NULL, //TheCU,
+    MDString::get(VMContext, StringRef()), // Empty name.
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    FromTy
+  };
+  return DIDerivedType(MDNode::get(VMContext, Elts));
+}
+
+/// createPointerType - Create debugging information entry for a pointer.
+DIDerivedType
+DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
+                             uint64_t AlignInBits, StringRef Name) {
+  // Pointer types are encoded in DIDerivedType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_pointer_type),
+    NULL, // Filename
+    NULL, //TheCU,
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    PointeeTy
+  };
+  return DIDerivedType(MDNode::get(VMContext, Elts));
+}
+
+DIDerivedType DIBuilder::createMemberPointerType(DIType PointeeTy, DIType Base) {
+  // Pointer types are encoded in DIDerivedType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_ptr_to_member_type),
+    NULL, // Filename
+    NULL, //TheCU,
+    NULL,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    PointeeTy,
+    Base
+  };
+  return DIDerivedType(MDNode::get(VMContext, Elts));
+}
+
+/// createReferenceType - Create debugging information entry for a reference
+/// type.
+DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) {
+  assert(RTy.Verify() && "Unable to create reference type");
+  // References are encoded in DIDerivedType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, Tag),
+    NULL, // Filename
+    NULL, // TheCU,
+    NULL, // Name
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    RTy
+  };
+  return DIDerivedType(MDNode::get(VMContext, Elts));
+}
+
+/// createTypedef - Create debugging information entry for a typedef.
+DIDerivedType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
+                                       unsigned LineNo, DIDescriptor Context) {
+  // typedefs are encoded in DIDerivedType format.
+  assert(Ty.Verify() && "Invalid typedef type!");
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_typedef),
+    File.getFileNode(),
+    getNonCompileUnitScope(Context),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    Ty
+  };
+  return DIDerivedType(MDNode::get(VMContext, Elts));
+}
+
+/// createFriend - Create debugging information entry for a 'friend'.
+DIType DIBuilder::createFriend(DIType Ty, DIType FriendTy) {
+  // typedefs are encoded in DIDerivedType format.
+  assert(Ty.Verify() && "Invalid type!");
+  assert(FriendTy.Verify() && "Invalid friend type!");
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_friend),
+    NULL,
+    Ty,
+    NULL, // Name
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    FriendTy
+  };
+  return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// createInheritance - Create debugging information entry to establish
+/// inheritance relationship between two types.
+DIDerivedType DIBuilder::createInheritance(
+    DIType Ty, DIType BaseTy, uint64_t BaseOffset, unsigned Flags) {
+  assert(Ty.Verify() && "Unable to create inheritance");
+  // TAG_inheritance is encoded in DIDerivedType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_inheritance),
+    NULL,
+    Ty,
+    NULL, // Name
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), BaseOffset),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    BaseTy
+  };
+  return DIDerivedType(MDNode::get(VMContext, Elts));
+}
+
+/// createMemberType - Create debugging information entry for a member.
+DIDerivedType DIBuilder::createMemberType(
+    DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
+    uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits,
+    unsigned Flags, DIType Ty) {
+  // TAG_member is encoded in DIDerivedType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_member),
+    File.getFileNode(),
+    getNonCompileUnitScope(Scope),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    Ty
+  };
+  return DIDerivedType(MDNode::get(VMContext, Elts));
+}
+
+/// createStaticMemberType - Create debugging information entry for a
+/// C++ static data member.
+DIType DIBuilder::createStaticMemberType(DIDescriptor Scope, StringRef Name,
+                                         DIFile File, unsigned LineNumber,
+                                         DIType Ty, unsigned Flags,
+                                         llvm::Value *Val) {
+  // TAG_member is encoded in DIDerivedType format.
+  Flags |= DIDescriptor::FlagStaticMember;
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_member),
+    File.getFileNode(),
+    getNonCompileUnitScope(Scope),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0/*SizeInBits*/),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0/*AlignInBits*/),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0/*OffsetInBits*/),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    Ty,
+    Val
+  };
+  return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// createObjCIVar - Create debugging information entry for Objective-C
+/// instance variable.
+DIType DIBuilder::createObjCIVar(StringRef Name,
+                                 DIFile File, unsigned LineNumber,
+                                 uint64_t SizeInBits, uint64_t AlignInBits,
+                                 uint64_t OffsetInBits, unsigned Flags,
+                                 DIType Ty, StringRef PropertyName,
+                                 StringRef GetterName, StringRef SetterName,
+                                 unsigned PropertyAttributes) {
+  // TAG_member is encoded in DIDerivedType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_member),
+    File.getFileNode(),
+    getNonCompileUnitScope(File),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    Ty,
+    MDString::get(VMContext, PropertyName),
+    MDString::get(VMContext, GetterName),
+    MDString::get(VMContext, SetterName),
+    ConstantInt::get(Type::getInt32Ty(VMContext), PropertyAttributes)
+  };
+  return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// createObjCIVar - Create debugging information entry for Objective-C
+/// instance variable.
+DIType DIBuilder::createObjCIVar(StringRef Name,
+                                 DIFile File, unsigned LineNumber,
+                                 uint64_t SizeInBits, uint64_t AlignInBits,
+                                 uint64_t OffsetInBits, unsigned Flags,
+                                 DIType Ty, MDNode *PropertyNode) {
+  // TAG_member is encoded in DIDerivedType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_member),
+    File.getFileNode(),
+    getNonCompileUnitScope(File),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    Ty,
+    PropertyNode
+  };
+  return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// createObjCProperty - Create debugging information entry for Objective-C
+/// property.
+DIObjCProperty DIBuilder::createObjCProperty(StringRef Name,
+                                             DIFile File, unsigned LineNumber,
+                                             StringRef GetterName,
+                                             StringRef SetterName, 
+                                             unsigned PropertyAttributes,
+                                             DIType Ty) {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_property),
+    MDString::get(VMContext, Name),
+    File,
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    MDString::get(VMContext, GetterName),
+    MDString::get(VMContext, SetterName),
+    ConstantInt::get(Type::getInt32Ty(VMContext), PropertyAttributes),
+    Ty
+  };
+  return DIObjCProperty(MDNode::get(VMContext, Elts));
+}
+
+/// createTemplateTypeParameter - Create debugging information for template
+/// type parameter.
+DITemplateTypeParameter
+DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
+                                       DIType Ty, MDNode *File, unsigned LineNo,
+                                       unsigned ColumnNo) {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_template_type_parameter),
+    getNonCompileUnitScope(Context),
+    MDString::get(VMContext, Name),
+    Ty,
+    File,
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
+    ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
+  };
+  return DITemplateTypeParameter(MDNode::get(VMContext, Elts));
+}
+
+/// createTemplateValueParameter - Create debugging information for template
+/// value parameter.
+DITemplateValueParameter
+DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
+                                        DIType Ty, uint64_t Val,
+                                        MDNode *File, unsigned LineNo,
+                                        unsigned ColumnNo) {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_template_value_parameter),
+    getNonCompileUnitScope(Context),
+    MDString::get(VMContext, Name),
+    Ty,
+    ConstantInt::get(Type::getInt64Ty(VMContext), Val),
+    File,
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
+    ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
+  };
+  return DITemplateValueParameter(MDNode::get(VMContext, Elts));
+}
+
+/// createClassType - Create debugging information entry for a class.
+DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
+                                           DIFile File, unsigned LineNumber,
+                                           uint64_t SizeInBits,
+                                           uint64_t AlignInBits,
+                                           uint64_t OffsetInBits,
+                                           unsigned Flags, DIType DerivedFrom,
+                                           DIArray Elements,
+                                           MDNode *VTableHolder,
+                                           MDNode *TemplateParams) {
+  assert((!Context || Context.Verify()) &&
+         "createClassType should be called with a valid Context");
+  // TAG_class_type is encoded in DICompositeType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_class_type),
+    File.getFileNode(),
+    getNonCompileUnitScope(Context),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    DerivedFrom,
+    Elements,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    VTableHolder,
+    TemplateParams
+  };
+  DICompositeType R(MDNode::get(VMContext, Elts));
+  assert(R.Verify() && "createClassType should return a verifiable DIType");
+  return R;
+}
+
+/// createStructType - Create debugging information entry for a struct.
+DICompositeType DIBuilder::createStructType(DIDescriptor Context,
+                                            StringRef Name, DIFile File,
+                                            unsigned LineNumber,
+                                            uint64_t SizeInBits,
+                                            uint64_t AlignInBits,
+                                            unsigned Flags, DIType DerivedFrom,
+                                            DIArray Elements,
+                                            unsigned RunTimeLang,
+                                            MDNode *VTableHolder) {
+ // TAG_structure_type is encoded in DICompositeType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_structure_type),
+    File.getFileNode(),
+    getNonCompileUnitScope(Context),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    DerivedFrom,
+    Elements,
+    ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
+    VTableHolder,
+    NULL,
+  };
+  DICompositeType R(MDNode::get(VMContext, Elts));
+  assert(R.Verify() && "createStructType should return a verifiable DIType");
+  return R;
+}
+
+/// createUnionType - Create debugging information entry for an union.
+DICompositeType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
+                                           DIFile File, unsigned LineNumber,
+                                           uint64_t SizeInBits,
+                                           uint64_t AlignInBits, unsigned Flags,
+                                           DIArray Elements,
+                                           unsigned RunTimeLang) {
+  // TAG_union_type is encoded in DICompositeType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_union_type),
+    File.getFileNode(),
+    getNonCompileUnitScope(Scope),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    NULL,
+    Elements,
+    ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
+    Constant::getNullValue(Type::getInt32Ty(VMContext)),
+    NULL
+  };
+  return DICompositeType(MDNode::get(VMContext, Elts));
+}
+
+/// createSubroutineType - Create subroutine type.
+DICompositeType
+DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
+  // TAG_subroutine_type is encoded in DICompositeType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type),
+    Constant::getNullValue(Type::getInt32Ty(VMContext)),
+    Constant::getNullValue(Type::getInt32Ty(VMContext)),
+    MDString::get(VMContext, ""),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    NULL,
+    ParameterTypes,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    Constant::getNullValue(Type::getInt32Ty(VMContext))
+  };
+  return DICompositeType(MDNode::get(VMContext, Elts));
+}
+
+/// createEnumerationType - Create debugging information entry for an
+/// enumeration.
+DICompositeType DIBuilder::createEnumerationType(
+    DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
+    uint64_t SizeInBits, uint64_t AlignInBits, DIArray Elements,
+    DIType ClassType) {
+  // TAG_enumeration_type is encoded in DICompositeType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type),
+    File.getFileNode(),
+    getNonCompileUnitScope(Scope),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ClassType,
+    Elements,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    Constant::getNullValue(Type::getInt32Ty(VMContext))
+  };
+  MDNode *Node = MDNode::get(VMContext, Elts);
+  AllEnumTypes.push_back(Node);
+  return DICompositeType(Node);
+}
+
+/// createArrayType - Create debugging information entry for an array.
+DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
+                                           DIType Ty, DIArray Subscripts) {
+  // TAG_array_type is encoded in DICompositeType format.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_array_type),
+    NULL, // Filename/Directory,
+    NULL, //TheCU,
+    MDString::get(VMContext, ""),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), Size),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    Ty,
+    Subscripts,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    Constant::getNullValue(Type::getInt32Ty(VMContext))
+  };
+  return DICompositeType(MDNode::get(VMContext, Elts));
+}
+
+/// createVectorType - Create debugging information entry for a vector.
+DIType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
+                                   DIType Ty, DIArray Subscripts) {
+
+  // A vector is an array type with the FlagVector flag applied.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_array_type),
+    NULL, // Filename/Directory,
+    NULL, //TheCU,
+    MDString::get(VMContext, ""),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), Size),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), DIType::FlagVector),
+    Ty,
+    Subscripts,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    Constant::getNullValue(Type::getInt32Ty(VMContext))
+  };
+  return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// createArtificialType - Create a new DIType with "artificial" flag set.
+DIType DIBuilder::createArtificialType(DIType Ty) {
+  if (Ty.isArtificial())
+    return Ty;
+
+  SmallVector<Value *, 9> Elts;
+  MDNode *N = Ty;
+  assert (N && "Unexpected input DIType!");
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (Value *V = N->getOperand(i))
+      Elts.push_back(V);
+    else
+      Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
+  }
+
+  unsigned CurFlags = Ty.getFlags();
+  CurFlags = CurFlags | DIType::FlagArtificial;
+
+  // Flags are stored at this slot.
+  Elts[8] =  ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
+
+  return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// createObjectPointerType - Create a new type with both the object pointer
+/// and artificial flags set.
+DIType DIBuilder::createObjectPointerType(DIType Ty) {
+  if (Ty.isObjectPointer())
+    return Ty;
+
+  SmallVector<Value *, 9> Elts;
+  MDNode *N = Ty;
+  assert (N && "Unexpected input DIType!");
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (Value *V = N->getOperand(i))
+      Elts.push_back(V);
+    else
+      Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
+  }
+
+  unsigned CurFlags = Ty.getFlags();
+  CurFlags = CurFlags | (DIType::FlagObjectPointer | DIType::FlagArtificial);
+
+  // Flags are stored at this slot.
+  Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
+
+  return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// retainType - Retain DIType in a module even if it is not referenced
+/// through debug info anchors.
+void DIBuilder::retainType(DIType T) {
+  AllRetainTypes.push_back(T);
+}
+
+/// createUnspecifiedParameter - Create unspeicified type descriptor
+/// for the subroutine type.
+DIDescriptor DIBuilder::createUnspecifiedParameter() {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_parameters)
+  };
+  return DIDescriptor(MDNode::get(VMContext, Elts));
+}
+
+/// createForwardDecl - Create a temporary forward-declared type that
+/// can be RAUW'd if the full type is seen.
+DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name,
+                                    DIDescriptor Scope, DIFile F,
+                                    unsigned Line, unsigned RuntimeLang,
+                                    uint64_t SizeInBits,
+                                    uint64_t AlignInBits) {
+  // Create a temporary MDNode.
+  Value *Elts[] = {
+    GetTagConstant(VMContext, Tag),
+    F.getFileNode(),
+    getNonCompileUnitScope(Scope),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Line),
+    ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+    ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext),
+                     DIDescriptor::FlagFwdDecl),
+    NULL,
+    DIArray(),
+    ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang)
+  };
+  MDNode *Node = MDNode::getTemporary(VMContext, Elts);
+  assert(DIType(Node).Verify() &&
+         "createForwardDecl result should be verifiable");
+  return DIType(Node);
+}
+
+/// getOrCreateArray - Get a DIArray, create one if required.
+DIArray DIBuilder::getOrCreateArray(ArrayRef<Value *> Elements) {
+  if (Elements.empty()) {
+    Value *Null = Constant::getNullValue(Type::getInt32Ty(VMContext));
+    return DIArray(MDNode::get(VMContext, Null));
+  }
+  return DIArray(MDNode::get(VMContext, Elements));
+}
+
+/// getOrCreateSubrange - Create a descriptor for a value range.  This
+/// implicitly uniques the values returned.
+DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Count) {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_subrange_type),
+    ConstantInt::get(Type::getInt64Ty(VMContext), Lo),
+    ConstantInt::get(Type::getInt64Ty(VMContext), Count)
+  };
+
+  return DISubrange(MDNode::get(VMContext, Elts));
+}
+
+/// \brief Create a new descriptor for the specified global.
+DIGlobalVariable DIBuilder::
+createGlobalVariable(StringRef Name, StringRef LinkageName, DIFile F,
+                     unsigned LineNumber, DIType Ty, bool isLocalToUnit,
+                     Value *Val) {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_variable),
+    Constant::getNullValue(Type::getInt32Ty(VMContext)),
+    NULL, // TheCU,
+    MDString::get(VMContext, Name),
+    MDString::get(VMContext, Name),
+    MDString::get(VMContext, LinkageName),
+    F,
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    Ty,
+    ConstantInt::get(Type::getInt32Ty(VMContext), isLocalToUnit),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/
+    Val,
+    DIDescriptor()
+  };
+  MDNode *Node = MDNode::get(VMContext, Elts);
+  AllGVs.push_back(Node);
+  return DIGlobalVariable(Node);
+}
+
+/// \brief Create a new descriptor for the specified global.
+DIGlobalVariable DIBuilder::
+createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
+                     DIType Ty, bool isLocalToUnit, Value *Val) {
+  return createGlobalVariable(Name, Name, F, LineNumber, Ty, isLocalToUnit,
+                              Val);
+}
+
+/// createStaticVariable - Create a new descriptor for the specified static
+/// variable.
+DIGlobalVariable DIBuilder::
+createStaticVariable(DIDescriptor Context, StringRef Name,
+                     StringRef LinkageName, DIFile F, unsigned LineNumber,
+                     DIType Ty, bool isLocalToUnit, Value *Val, MDNode *Decl) {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_variable),
+    Constant::getNullValue(Type::getInt32Ty(VMContext)),
+    getNonCompileUnitScope(Context),
+    MDString::get(VMContext, Name),
+    MDString::get(VMContext, Name),
+    MDString::get(VMContext, LinkageName),
+    F,
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
+    Ty,
+    ConstantInt::get(Type::getInt32Ty(VMContext), isLocalToUnit),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 1), /* isDefinition*/
+    Val,
+    DIDescriptor(Decl)
+  };
+  MDNode *Node = MDNode::get(VMContext, Elts);
+  AllGVs.push_back(Node);
+  return DIGlobalVariable(Node);
+}
+
+/// createVariable - Create a new descriptor for the specified variable.
+DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
+                                          StringRef Name, DIFile File,
+                                          unsigned LineNo, DIType Ty,
+                                          bool AlwaysPreserve, unsigned Flags,
+                                          unsigned ArgNo) {
+  DIDescriptor Context(getNonCompileUnitScope(Scope));
+  assert((!Context || Context.Verify()) &&
+         "createLocalVariable should be called with a valid Context");
+  assert(Ty.Verify() &&
+         "createLocalVariable should be called with a valid type");
+  Value *Elts[] = {
+    GetTagConstant(VMContext, Tag),
+    getNonCompileUnitScope(Scope),
+    MDString::get(VMContext, Name),
+    File,
+    ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24))),
+    Ty,
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    Constant::getNullValue(Type::getInt32Ty(VMContext))
+  };
+  MDNode *Node = MDNode::get(VMContext, Elts);
+  if (AlwaysPreserve) {
+    // The optimizer may remove local variable. If there is an interest
+    // to preserve variable info in such situation then stash it in a
+    // named mdnode.
+    DISubprogram Fn(getDISubprogram(Scope));
+    NamedMDNode *FnLocals = getOrInsertFnSpecificMDNode(M, Fn);
+    FnLocals->addOperand(Node);
+  }
+  assert(DIVariable(Node).Verify() &&
+         "createLocalVariable should return a verifiable DIVariable");
+  return DIVariable(Node);
+}
+
+/// createComplexVariable - Create a new descriptor for the specified variable
+/// which has a complex address expression for its address.
+DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
+                                            StringRef Name, DIFile F,
+                                            unsigned LineNo,
+                                            DIType Ty, ArrayRef<Value *> Addr,
+                                            unsigned ArgNo) {
+  SmallVector<Value *, 15> Elts;
+  Elts.push_back(GetTagConstant(VMContext, Tag));
+  Elts.push_back(getNonCompileUnitScope(Scope)),
+  Elts.push_back(MDString::get(VMContext, Name));
+  Elts.push_back(F);
+  Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext),
+                                  (LineNo | (ArgNo << 24))));
+  Elts.push_back(Ty);
+  Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
+  Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
+  Elts.append(Addr.begin(), Addr.end());
+
+  return DIVariable(MDNode::get(VMContext, Elts));
+}
+
+/// createFunction - Create a new descriptor for the specified function.
+DISubprogram DIBuilder::createFunction(DIDescriptor Context,
+                                       StringRef Name,
+                                       StringRef LinkageName,
+                                       DIFile File, unsigned LineNo,
+                                       DIType Ty,
+                                       bool isLocalToUnit, bool isDefinition,
+                                       unsigned ScopeLine,
+                                       unsigned Flags, bool isOptimized,
+                                       Function *Fn,
+                                       MDNode *TParams,
+                                       MDNode *Decl) {
+  Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
+    File.getFileNode(),
+    getNonCompileUnitScope(Context),
+    MDString::get(VMContext, Name),
+    MDString::get(VMContext, Name),
+    MDString::get(VMContext, LinkageName),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
+    Ty,
+    ConstantInt::get(Type::getInt1Ty(VMContext), isLocalToUnit),
+    ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    NULL,
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
+    Fn,
+    TParams,
+    Decl,
+    MDNode::getTemporary(VMContext, TElts),
+    ConstantInt::get(Type::getInt32Ty(VMContext), ScopeLine)
+  };
+  MDNode *Node = MDNode::get(VMContext, Elts);
+
+  // Create a named metadata so that we do not lose this mdnode.
+  if (isDefinition)
+    AllSubprograms.push_back(Node);
+  DISubprogram S(Node);
+  assert(S.Verify() && "createFunction should return a valid DISubprogram");
+  return S;
+}
+
+/// createMethod - Create a new descriptor for the specified C++ method.
+DISubprogram DIBuilder::createMethod(DIDescriptor Context,
+                                     StringRef Name,
+                                     StringRef LinkageName,
+                                     DIFile F,
+                                     unsigned LineNo, DIType Ty,
+                                     bool isLocalToUnit,
+                                     bool isDefinition,
+                                     unsigned VK, unsigned VIndex,
+                                     MDNode *VTableHolder,
+                                     unsigned Flags,
+                                     bool isOptimized,
+                                     Function *Fn,
+                                     MDNode *TParam) {
+  Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
+    F.getFileNode(),
+    getNonCompileUnitScope(Context),
+    MDString::get(VMContext, Name),
+    MDString::get(VMContext, Name),
+    MDString::get(VMContext, LinkageName),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
+    Ty,
+    ConstantInt::get(Type::getInt1Ty(VMContext), isLocalToUnit),
+    ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition),
+    ConstantInt::get(Type::getInt32Ty(VMContext), (unsigned)VK),
+    ConstantInt::get(Type::getInt32Ty(VMContext), VIndex),
+    VTableHolder,
+    ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+    ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
+    Fn,
+    TParam,
+    Constant::getNullValue(Type::getInt32Ty(VMContext)),
+    MDNode::getTemporary(VMContext, TElts),
+    // FIXME: Do we want to use different scope/lines?
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo)
+  };
+  MDNode *Node = MDNode::get(VMContext, Elts);
+  if (isDefinition)
+    AllSubprograms.push_back(Node);
+  DISubprogram S(Node);
+  assert(S.Verify() && "createMethod should return a valid DISubprogram");
+  return S;
+}
+
+/// createNameSpace - This creates new descriptor for a namespace
+/// with the specified parent scope.
+DINameSpace DIBuilder::createNameSpace(DIDescriptor Scope, StringRef Name,
+                                       DIFile File, unsigned LineNo) {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_namespace),
+    File.getFileNode(),
+    getNonCompileUnitScope(Scope),
+    MDString::get(VMContext, Name),
+    ConstantInt::get(Type::getInt32Ty(VMContext), LineNo)
+  };
+  DINameSpace R(MDNode::get(VMContext, Elts));
+  assert(R.Verify() &&
+         "createNameSpace should return a verifiable DINameSpace");
+  return R;
+}
+
+/// createLexicalBlockFile - This creates a new MDNode that encapsulates
+/// an existing scope with a new filename.
+DILexicalBlockFile DIBuilder::createLexicalBlockFile(DIDescriptor Scope,
+                                                     DIFile File) {
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_lexical_block),
+    File.getFileNode(),
+    Scope
+  };
+  DILexicalBlockFile R(MDNode::get(VMContext, Elts));
+  assert(
+      R.Verify() &&
+      "createLexicalBlockFile should return a verifiable DILexicalBlockFile");
+  return R;
+}
+
+DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File,
+                                             unsigned Line, unsigned Col) {
+  // Defeat MDNode uniqing for lexical blocks by using unique id.
+  static unsigned int unique_id = 0;
+  Value *Elts[] = {
+    GetTagConstant(VMContext, dwarf::DW_TAG_lexical_block),
+    File.getFileNode(),
+    getNonCompileUnitScope(Scope),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Line),
+    ConstantInt::get(Type::getInt32Ty(VMContext), Col),
+    ConstantInt::get(Type::getInt32Ty(VMContext), unique_id++)
+  };
+  DILexicalBlock R(MDNode::get(VMContext, Elts));
+  assert(R.Verify() &&
+         "createLexicalBlock should return a verifiable DILexicalBlock");
+  return R;
+}
+
+/// insertDeclare - Insert a new llvm.dbg.declare intrinsic call.
+Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
+                                      Instruction *InsertBefore) {
+  assert(Storage && "no storage passed to dbg.declare");
+  assert(VarInfo.Verify() && "empty DIVariable passed to dbg.declare");
+  if (!DeclareFn)
+    DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
+
+  Value *Args[] = { MDNode::get(Storage->getContext(), Storage), VarInfo };
+  return CallInst::Create(DeclareFn, Args, "", InsertBefore);
+}
+
+/// insertDeclare - Insert a new llvm.dbg.declare intrinsic call.
+Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
+                                      BasicBlock *InsertAtEnd) {
+  assert(Storage && "no storage passed to dbg.declare");
+  assert(VarInfo.Verify() && "invalid DIVariable passed to dbg.declare");
+  if (!DeclareFn)
+    DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
+
+  Value *Args[] = { MDNode::get(Storage->getContext(), Storage), VarInfo };
+
+  // If this block already has a terminator then insert this intrinsic
+  // before the terminator.
+  if (TerminatorInst *T = InsertAtEnd->getTerminator())
+    return CallInst::Create(DeclareFn, Args, "", T);
+  else
+    return CallInst::Create(DeclareFn, Args, "", InsertAtEnd);
+}
+
+/// insertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
+Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
+                                                DIVariable VarInfo,
+                                                Instruction *InsertBefore) {
+  assert(V && "no value passed to dbg.value");
+  assert(VarInfo.Verify() && "invalid DIVariable passed to dbg.value");
+  if (!ValueFn)
+    ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
+
+  Value *Args[] = { MDNode::get(V->getContext(), V),
+                    ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
+                    VarInfo };
+  return CallInst::Create(ValueFn, Args, "", InsertBefore);
+}
+
+/// insertDbgValueIntrinsic - Insert a new llvm.dbg.value intrinsic call.
+Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
+                                                DIVariable VarInfo,
+                                                BasicBlock *InsertAtEnd) {
+  assert(V && "no value passed to dbg.value");
+  assert(VarInfo.Verify() && "invalid DIVariable passed to dbg.value");
+  if (!ValueFn)
+    ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
+
+  Value *Args[] = { MDNode::get(V->getContext(), V),
+                    ConstantInt::get(Type::getInt64Ty(V->getContext()), Offset),
+                    VarInfo };
+  return CallInst::Create(ValueFn, Args, "", InsertAtEnd);
+}
diff --git a/contrib/llvm/lib/IR/DataLayout.cpp b/contrib/llvm/lib/IR/DataLayout.cpp
new file mode 100644
index 000000000000..ecd5216f20ac
--- /dev/null
+++ b/contrib/llvm/lib/IR/DataLayout.cpp
@@ -0,0 +1,697 @@
+//===-- DataLayout.cpp - Data size & alignment routines --------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines layout properties related to datatype size/offset/alignment
+// information.
+//
+// This structure should be created once, filled in if the defaults are not
+// correct and then passed around by const&.  None of the members functions
+// require modification to the object.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdlib>
+using namespace llvm;
+
+// Handle the Pass registration stuff necessary to use DataLayout's.
+
+// Register the default SparcV9 implementation...
+INITIALIZE_PASS(DataLayout, "datalayout", "Data Layout", false, true)
+char DataLayout::ID = 0;
+
+//===----------------------------------------------------------------------===//
+// Support for StructLayout
+//===----------------------------------------------------------------------===//
+
+StructLayout::StructLayout(StructType *ST, const DataLayout &TD) {
+  assert(!ST->isOpaque() && "Cannot get layout of opaque structs");
+  StructAlignment = 0;
+  StructSize = 0;
+  NumElements = ST->getNumElements();
+
+  // Loop over each of the elements, placing them in memory.
+  for (unsigned i = 0, e = NumElements; i != e; ++i) {
+    Type *Ty = ST->getElementType(i);
+    unsigned TyAlign = ST->isPacked() ? 1 : TD.getABITypeAlignment(Ty);
+
+    // Add padding if necessary to align the data element properly.
+    if ((StructSize & (TyAlign-1)) != 0)
+      StructSize = DataLayout::RoundUpAlignment(StructSize, TyAlign);
+
+    // Keep track of maximum alignment constraint.
+    StructAlignment = std::max(TyAlign, StructAlignment);
+
+    MemberOffsets[i] = StructSize;
+    StructSize += TD.getTypeAllocSize(Ty); // Consume space for this data item
+  }
+
+  // Empty structures have alignment of 1 byte.
+  if (StructAlignment == 0) StructAlignment = 1;
+
+  // Add padding to the end of the struct so that it could be put in an array
+  // and all array elements would be aligned correctly.
+  if ((StructSize & (StructAlignment-1)) != 0)
+    StructSize = DataLayout::RoundUpAlignment(StructSize, StructAlignment);
+}
+
+
+/// getElementContainingOffset - Given a valid offset into the structure,
+/// return the structure index that contains it.
+unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const {
+  const uint64_t *SI =
+    std::upper_bound(&MemberOffsets[0], &MemberOffsets[NumElements], Offset);
+  assert(SI != &MemberOffsets[0] && "Offset not in structure type!");
+  --SI;
+  assert(*SI <= Offset && "upper_bound didn't work");
+  assert((SI == &MemberOffsets[0] || *(SI-1) <= Offset) &&
+         (SI+1 == &MemberOffsets[NumElements] || *(SI+1) > Offset) &&
+         "Upper bound didn't work!");
+
+  // Multiple fields can have the same offset if any of them are zero sized.
+  // For example, in { i32, [0 x i32], i32 }, searching for offset 4 will stop
+  // at the i32 element, because it is the last element at that offset.  This is
+  // the right one to return, because anything after it will have a higher
+  // offset, implying that this element is non-empty.
+  return SI-&MemberOffsets[0];
+}
+
+//===----------------------------------------------------------------------===//
+// LayoutAlignElem, LayoutAlign support
+//===----------------------------------------------------------------------===//
+
+LayoutAlignElem
+LayoutAlignElem::get(AlignTypeEnum align_type, unsigned abi_align,
+                     unsigned pref_align, uint32_t bit_width) {
+  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+  LayoutAlignElem retval;
+  retval.AlignType = align_type;
+  retval.ABIAlign = abi_align;
+  retval.PrefAlign = pref_align;
+  retval.TypeBitWidth = bit_width;
+  return retval;
+}
+
+bool
+LayoutAlignElem::operator==(const LayoutAlignElem &rhs) const {
+  return (AlignType == rhs.AlignType
+          && ABIAlign == rhs.ABIAlign
+          && PrefAlign == rhs.PrefAlign
+          && TypeBitWidth == rhs.TypeBitWidth);
+}
+
+const LayoutAlignElem
+DataLayout::InvalidAlignmentElem = LayoutAlignElem::get(INVALID_ALIGN, 0, 0, 0);
+
+//===----------------------------------------------------------------------===//
+// PointerAlignElem, PointerAlign support
+//===----------------------------------------------------------------------===//
+
+PointerAlignElem
+PointerAlignElem::get(uint32_t addr_space, unsigned abi_align,
+                      unsigned pref_align, uint32_t bit_width) {
+  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+  PointerAlignElem retval;
+  retval.AddressSpace = addr_space;
+  retval.ABIAlign = abi_align;
+  retval.PrefAlign = pref_align;
+  retval.TypeBitWidth = bit_width;
+  return retval;
+}
+
+bool
+PointerAlignElem::operator==(const PointerAlignElem &rhs) const {
+  return (ABIAlign == rhs.ABIAlign
+          && AddressSpace == rhs.AddressSpace
+          && PrefAlign == rhs.PrefAlign
+          && TypeBitWidth == rhs.TypeBitWidth);
+}
+
+const PointerAlignElem
+DataLayout::InvalidPointerElem = PointerAlignElem::get(~0U, 0U, 0U, 0U);
+
+//===----------------------------------------------------------------------===//
+//                       DataLayout Class Implementation
+//===----------------------------------------------------------------------===//
+
+void DataLayout::init(StringRef Desc) {
+  initializeDataLayoutPass(*PassRegistry::getPassRegistry());
+
+  LayoutMap = 0;
+  LittleEndian = false;
+  StackNaturalAlign = 0;
+
+  // Default alignments
+  setAlignment(INTEGER_ALIGN,   1,  1, 1);   // i1
+  setAlignment(INTEGER_ALIGN,   1,  1, 8);   // i8
+  setAlignment(INTEGER_ALIGN,   2,  2, 16);  // i16
+  setAlignment(INTEGER_ALIGN,   4,  4, 32);  // i32
+  setAlignment(INTEGER_ALIGN,   4,  8, 64);  // i64
+  setAlignment(FLOAT_ALIGN,     2,  2, 16);  // half
+  setAlignment(FLOAT_ALIGN,     4,  4, 32);  // float
+  setAlignment(FLOAT_ALIGN,     8,  8, 64);  // double
+  setAlignment(FLOAT_ALIGN,    16, 16, 128); // ppcf128, quad, ...
+  setAlignment(VECTOR_ALIGN,    8,  8, 64);  // v2i32, v1i64, ...
+  setAlignment(VECTOR_ALIGN,   16, 16, 128); // v16i8, v8i16, v4i32, ...
+  setAlignment(AGGREGATE_ALIGN, 0,  8,  0);  // struct
+  setPointerAlignment(0, 8, 8, 8);
+
+  parseSpecifier(Desc);
+}
+
+/// Checked version of split, to ensure mandatory subparts.
+static std::pair<StringRef, StringRef> split(StringRef Str, char Separator) {
+  assert(!Str.empty() && "parse error, string can't be empty here");
+  std::pair<StringRef, StringRef> Split = Str.split(Separator);
+  assert((!Split.second.empty() || Split.first == Str) &&
+         "a trailing separator is not allowed");
+  return Split;
+}
+
+/// Get an unsinged integer, including error checks.
+static unsigned getInt(StringRef R) {
+  unsigned Result;
+  bool error = R.getAsInteger(10, Result); (void)error;
+  assert(!error && "not a number, or does not fit in an unsigned int");
+  return Result;
+}
+
+/// Convert bits into bytes. Assert if not a byte width multiple.
+static unsigned inBytes(unsigned Bits) {
+  assert(Bits % 8 == 0 && "number of bits must be a byte width multiple");
+  return Bits / 8;
+}
+
+void DataLayout::parseSpecifier(StringRef Desc) {
+
+  while (!Desc.empty()) {
+
+    // Split at '-'.
+    std::pair<StringRef, StringRef> Split = split(Desc, '-');
+    Desc = Split.second;
+
+    // Split at ':'.
+    Split = split(Split.first, ':');
+
+    // Aliases used below.
+    StringRef &Tok  = Split.first;  // Current token.
+    StringRef &Rest = Split.second; // The rest of the string.
+
+    char Specifier = Tok.front();
+    Tok = Tok.substr(1);
+
+    switch (Specifier) {
+    case 'E':
+      LittleEndian = false;
+      break;
+    case 'e':
+      LittleEndian = true;
+      break;
+    case 'p': {
+      // Address space.
+      unsigned AddrSpace = Tok.empty() ? 0 : getInt(Tok);
+      assert(AddrSpace < 1 << 24 &&
+             "Invalid address space, must be a 24bit integer");
+
+      // Size.
+      Split = split(Rest, ':');
+      unsigned PointerMemSize = inBytes(getInt(Tok));
+
+      // ABI alignment.
+      Split = split(Rest, ':');
+      unsigned PointerABIAlign = inBytes(getInt(Tok));
+
+      // Preferred alignment.
+      unsigned PointerPrefAlign = PointerABIAlign;
+      if (!Rest.empty()) {
+        Split = split(Rest, ':');
+        PointerPrefAlign = inBytes(getInt(Tok));
+      }
+
+      setPointerAlignment(AddrSpace, PointerABIAlign, PointerPrefAlign,
+                          PointerMemSize);
+      break;
+    }
+    case 'i':
+    case 'v':
+    case 'f':
+    case 'a':
+    case 's': {
+      AlignTypeEnum AlignType;
+      switch (Specifier) {
+      default:
+      case 'i': AlignType = INTEGER_ALIGN; break;
+      case 'v': AlignType = VECTOR_ALIGN; break;
+      case 'f': AlignType = FLOAT_ALIGN; break;
+      case 'a': AlignType = AGGREGATE_ALIGN; break;
+      case 's': AlignType = STACK_ALIGN; break;
+      }
+
+      // Bit size.
+      unsigned Size = Tok.empty() ? 0 : getInt(Tok);
+
+      // ABI alignment.
+      Split = split(Rest, ':');
+      unsigned ABIAlign = inBytes(getInt(Tok));
+
+      // Preferred alignment.
+      unsigned PrefAlign = ABIAlign;
+      if (!Rest.empty()) {
+        Split = split(Rest, ':');
+        PrefAlign = inBytes(getInt(Tok));
+      }
+
+      setAlignment(AlignType, ABIAlign, PrefAlign, Size);
+
+      break;
+    }
+    case 'n':  // Native integer types.
+      for (;;) {
+        unsigned Width = getInt(Tok);
+        assert(Width != 0 && "width must be non-zero");
+        LegalIntWidths.push_back(Width);
+        if (Rest.empty())
+          break;
+        Split = split(Rest, ':');
+      }
+      break;
+    case 'S': { // Stack natural alignment.
+      StackNaturalAlign = inBytes(getInt(Tok));
+      break;
+    }
+    default:
+      llvm_unreachable("Unknown specifier in datalayout string");
+      break;
+    }
+  }
+}
+
+/// Default ctor.
+///
+/// @note This has to exist, because this is a pass, but it should never be
+/// used.
+DataLayout::DataLayout() : ImmutablePass(ID) {
+  report_fatal_error("Bad DataLayout ctor used.  "
+                     "Tool did not specify a DataLayout to use?");
+}
+
+DataLayout::DataLayout(const Module *M)
+  : ImmutablePass(ID) {
+  init(M->getDataLayout());
+}
+
+void
+DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
+                         unsigned pref_align, uint32_t bit_width) {
+  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+  assert(pref_align < (1 << 16) && "Alignment doesn't fit in bitfield");
+  assert(bit_width < (1 << 24) && "Bit width doesn't fit in bitfield");
+  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
+    if (Alignments[i].AlignType == (unsigned)align_type &&
+        Alignments[i].TypeBitWidth == bit_width) {
+      // Update the abi, preferred alignments.
+      Alignments[i].ABIAlign = abi_align;
+      Alignments[i].PrefAlign = pref_align;
+      return;
+    }
+  }
+
+  Alignments.push_back(LayoutAlignElem::get(align_type, abi_align,
+                                            pref_align, bit_width));
+}
+
+void
+DataLayout::setPointerAlignment(uint32_t addr_space, unsigned abi_align,
+                         unsigned pref_align, uint32_t bit_width) {
+  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+  DenseMap<unsigned,PointerAlignElem>::iterator val = Pointers.find(addr_space);
+  if (val == Pointers.end()) {
+    Pointers[addr_space] = PointerAlignElem::get(addr_space,
+          abi_align, pref_align, bit_width);
+  } else {
+    val->second.ABIAlign = abi_align;
+    val->second.PrefAlign = pref_align;
+    val->second.TypeBitWidth = bit_width;
+  }
+}
+
+/// getAlignmentInfo - Return the alignment (either ABI if ABIInfo = true or
+/// preferred if ABIInfo = false) the layout wants for the specified datatype.
+unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType,
+                                      uint32_t BitWidth, bool ABIInfo,
+                                      Type *Ty) const {
+  // Check to see if we have an exact match and remember the best match we see.
+  int BestMatchIdx = -1;
+  int LargestInt = -1;
+  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
+    if (Alignments[i].AlignType == (unsigned)AlignType &&
+        Alignments[i].TypeBitWidth == BitWidth)
+      return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign;
+
+    // The best match so far depends on what we're looking for.
+     if (AlignType == INTEGER_ALIGN &&
+         Alignments[i].AlignType == INTEGER_ALIGN) {
+      // The "best match" for integers is the smallest size that is larger than
+      // the BitWidth requested.
+      if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 ||
+          Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth))
+        BestMatchIdx = i;
+      // However, if there isn't one that's larger, then we must use the
+      // largest one we have (see below)
+      if (LargestInt == -1 ||
+          Alignments[i].TypeBitWidth > Alignments[LargestInt].TypeBitWidth)
+        LargestInt = i;
+    }
+  }
+
+  // Okay, we didn't find an exact solution.  Fall back here depending on what
+  // is being looked for.
+  if (BestMatchIdx == -1) {
+    // If we didn't find an integer alignment, fall back on most conservative.
+    if (AlignType == INTEGER_ALIGN) {
+      BestMatchIdx = LargestInt;
+    } else {
+      assert(AlignType == VECTOR_ALIGN && "Unknown alignment type!");
+
+      // By default, use natural alignment for vector types. This is consistent
+      // with what clang and llvm-gcc do.
+      unsigned Align = getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
+      Align *= cast<VectorType>(Ty)->getNumElements();
+      // If the alignment is not a power of 2, round up to the next power of 2.
+      // This happens for non-power-of-2 length vectors.
+      if (Align & (Align-1))
+        Align = NextPowerOf2(Align);
+      return Align;
+    }
+  }
+
+  // Since we got a "best match" index, just return it.
+  return ABIInfo ? Alignments[BestMatchIdx].ABIAlign
+                 : Alignments[BestMatchIdx].PrefAlign;
+}
+
+namespace {
+
+class StructLayoutMap {
+  typedef DenseMap<StructType*, StructLayout*> LayoutInfoTy;
+  LayoutInfoTy LayoutInfo;
+
+public:
+  virtual ~StructLayoutMap() {
+    // Remove any layouts.
+    for (LayoutInfoTy::iterator I = LayoutInfo.begin(), E = LayoutInfo.end();
+         I != E; ++I) {
+      StructLayout *Value = I->second;
+      Value->~StructLayout();
+      free(Value);
+    }
+  }
+
+  StructLayout *&operator[](StructType *STy) {
+    return LayoutInfo[STy];
+  }
+
+  // for debugging...
+  virtual void dump() const {}
+};
+
+} // end anonymous namespace
+
+DataLayout::~DataLayout() {
+  delete static_cast<StructLayoutMap*>(LayoutMap);
+}
+
+bool DataLayout::doFinalization(Module &M) {
+  delete static_cast<StructLayoutMap*>(LayoutMap);
+  LayoutMap = 0;
+  return false;
+}
+
+const StructLayout *DataLayout::getStructLayout(StructType *Ty) const {
+  if (!LayoutMap)
+    LayoutMap = new StructLayoutMap();
+
+  StructLayoutMap *STM = static_cast<StructLayoutMap*>(LayoutMap);
+  StructLayout *&SL = (*STM)[Ty];
+  if (SL) return SL;
+
+  // Otherwise, create the struct layout.  Because it is variable length, we
+  // malloc it, then use placement new.
+  int NumElts = Ty->getNumElements();
+  StructLayout *L =
+    (StructLayout *)malloc(sizeof(StructLayout)+(NumElts-1) * sizeof(uint64_t));
+
+  // Set SL before calling StructLayout's ctor.  The ctor could cause other
+  // entries to be added to TheMap, invalidating our reference.
+  SL = L;
+
+  new (L) StructLayout(Ty, *this);
+
+  return L;
+}
+
+std::string DataLayout::getStringRepresentation() const {
+  std::string Result;
+  raw_string_ostream OS(Result);
+
+  OS << (LittleEndian ? "e" : "E");
+  SmallVector<unsigned, 8> addrSpaces;
+  // Lets get all of the known address spaces and sort them
+  // into increasing order so that we can emit the string
+  // in a cleaner format.
+  for (DenseMap<unsigned, PointerAlignElem>::const_iterator
+      pib = Pointers.begin(), pie = Pointers.end();
+      pib != pie; ++pib) {
+    addrSpaces.push_back(pib->first);
+  }
+  std::sort(addrSpaces.begin(), addrSpaces.end());
+  for (SmallVector<unsigned, 8>::iterator asb = addrSpaces.begin(),
+      ase = addrSpaces.end(); asb != ase; ++asb) {
+    const PointerAlignElem &PI = Pointers.find(*asb)->second;
+    OS << "-p";
+    if (PI.AddressSpace) {
+      OS << PI.AddressSpace;
+    }
+     OS << ":" << PI.TypeBitWidth*8 << ':' << PI.ABIAlign*8
+        << ':' << PI.PrefAlign*8;
+  }
+  OS << "-S" << StackNaturalAlign*8;
+
+  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
+    const LayoutAlignElem &AI = Alignments[i];
+    OS << '-' << (char)AI.AlignType << AI.TypeBitWidth << ':'
+       << AI.ABIAlign*8 << ':' << AI.PrefAlign*8;
+  }
+
+  if (!LegalIntWidths.empty()) {
+    OS << "-n" << (unsigned)LegalIntWidths[0];
+
+    for (unsigned i = 1, e = LegalIntWidths.size(); i != e; ++i)
+      OS << ':' << (unsigned)LegalIntWidths[i];
+  }
+  return OS.str();
+}
+
+
+/*!
+  \param abi_or_pref Flag that determines which alignment is returned. true
+  returns the ABI alignment, false returns the preferred alignment.
+  \param Ty The underlying type for which alignment is determined.
+
+  Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref
+  == false) for the requested type \a Ty.
+ */
+unsigned DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
+  int AlignType = -1;
+
+  assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
+  switch (Ty->getTypeID()) {
+  // Early escape for the non-numeric types.
+  case Type::LabelTyID:
+    return (abi_or_pref
+            ? getPointerABIAlignment(0)
+            : getPointerPrefAlignment(0));
+  case Type::PointerTyID: {
+    unsigned AS = dyn_cast<PointerType>(Ty)->getAddressSpace();
+    return (abi_or_pref
+            ? getPointerABIAlignment(AS)
+            : getPointerPrefAlignment(AS));
+    }
+  case Type::ArrayTyID:
+    return getAlignment(cast<ArrayType>(Ty)->getElementType(), abi_or_pref);
+
+  case Type::StructTyID: {
+    // Packed structure types always have an ABI alignment of one.
+    if (cast<StructType>(Ty)->isPacked() && abi_or_pref)
+      return 1;
+
+    // Get the layout annotation... which is lazily created on demand.
+    const StructLayout *Layout = getStructLayout(cast<StructType>(Ty));
+    unsigned Align = getAlignmentInfo(AGGREGATE_ALIGN, 0, abi_or_pref, Ty);
+    return std::max(Align, Layout->getAlignment());
+  }
+  case Type::IntegerTyID:
+    AlignType = INTEGER_ALIGN;
+    break;
+  case Type::HalfTyID:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+  // PPC_FP128TyID and FP128TyID have different data contents, but the
+  // same size and alignment, so they look the same here.
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID:
+  case Type::X86_FP80TyID:
+    AlignType = FLOAT_ALIGN;
+    break;
+  case Type::X86_MMXTyID:
+  case Type::VectorTyID:
+    AlignType = VECTOR_ALIGN;
+    break;
+  default:
+    llvm_unreachable("Bad type for getAlignment!!!");
+  }
+
+  return getAlignmentInfo((AlignTypeEnum)AlignType, getTypeSizeInBits(Ty),
+                          abi_or_pref, Ty);
+}
+
+unsigned DataLayout::getABITypeAlignment(Type *Ty) const {
+  return getAlignment(Ty, true);
+}
+
+/// getABIIntegerTypeAlignment - Return the minimum ABI-required alignment for
+/// an integer type of the specified bitwidth.
+unsigned DataLayout::getABIIntegerTypeAlignment(unsigned BitWidth) const {
+  return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, 0);
+}
+
+
+unsigned DataLayout::getCallFrameTypeAlignment(Type *Ty) const {
+  for (unsigned i = 0, e = Alignments.size(); i != e; ++i)
+    if (Alignments[i].AlignType == STACK_ALIGN)
+      return Alignments[i].ABIAlign;
+
+  return getABITypeAlignment(Ty);
+}
+
+unsigned DataLayout::getPrefTypeAlignment(Type *Ty) const {
+  return getAlignment(Ty, false);
+}
+
+unsigned DataLayout::getPreferredTypeAlignmentShift(Type *Ty) const {
+  unsigned Align = getPrefTypeAlignment(Ty);
+  assert(!(Align & (Align-1)) && "Alignment is not a power of two!");
+  return Log2_32(Align);
+}
+
+/// getIntPtrType - Return an integer type with size at least as big as that
+/// of a pointer in the given address space.
+IntegerType *DataLayout::getIntPtrType(LLVMContext &C,
+                                       unsigned AddressSpace) const {
+  return IntegerType::get(C, getPointerSizeInBits(AddressSpace));
+}
+
+/// getIntPtrType - Return an integer (vector of integer) type with size at
+/// least as big as that of a pointer of the given pointer (vector of pointer)
+/// type.
+Type *DataLayout::getIntPtrType(Type *Ty) const {
+  assert(Ty->isPtrOrPtrVectorTy() &&
+         "Expected a pointer or pointer vector type.");
+  unsigned NumBits = getTypeSizeInBits(Ty->getScalarType());
+  IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits);
+  if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+    return VectorType::get(IntTy, VecTy->getNumElements());
+  return IntTy;
+}
+
+Type *DataLayout::getSmallestLegalIntType(LLVMContext &C, unsigned Width) const {
+  for (unsigned i = 0, e = (unsigned)LegalIntWidths.size(); i != e; ++i)
+    if (Width <= LegalIntWidths[i])
+      return Type::getIntNTy(C, LegalIntWidths[i]);
+  return 0;
+}
+
+uint64_t DataLayout::getIndexedOffset(Type *ptrTy,
+                                      ArrayRef<Value *> Indices) const {
+  Type *Ty = ptrTy;
+  assert(Ty->isPointerTy() && "Illegal argument for getIndexedOffset()");
+  uint64_t Result = 0;
+
+  generic_gep_type_iterator<Value* const*>
+    TI = gep_type_begin(ptrTy, Indices);
+  for (unsigned CurIDX = 0, EndIDX = Indices.size(); CurIDX != EndIDX;
+       ++CurIDX, ++TI) {
+    if (StructType *STy = dyn_cast<StructType>(*TI)) {
+      assert(Indices[CurIDX]->getType() ==
+             Type::getInt32Ty(ptrTy->getContext()) &&
+             "Illegal struct idx");
+      unsigned FieldNo = cast<ConstantInt>(Indices[CurIDX])->getZExtValue();
+
+      // Get structure layout information...
+      const StructLayout *Layout = getStructLayout(STy);
+
+      // Add in the offset, as calculated by the structure layout info...
+      Result += Layout->getElementOffset(FieldNo);
+
+      // Update Ty to refer to current element
+      Ty = STy->getElementType(FieldNo);
+    } else {
+      // Update Ty to refer to current element
+      Ty = cast<SequentialType>(Ty)->getElementType();
+
+      // Get the array index and the size of each array element.
+      if (int64_t arrayIdx = cast<ConstantInt>(Indices[CurIDX])->getSExtValue())
+        Result += (uint64_t)arrayIdx * getTypeAllocSize(Ty);
+    }
+  }
+
+  return Result;
+}
+
+/// getPreferredAlignment - Return the preferred alignment of the specified
+/// global.  This includes an explicitly requested alignment (if the global
+/// has one).
+unsigned DataLayout::getPreferredAlignment(const GlobalVariable *GV) const {
+  Type *ElemType = GV->getType()->getElementType();
+  unsigned Alignment = getPrefTypeAlignment(ElemType);
+  unsigned GVAlignment = GV->getAlignment();
+  if (GVAlignment >= Alignment) {
+    Alignment = GVAlignment;
+  } else if (GVAlignment != 0) {
+    Alignment = std::max(GVAlignment, getABITypeAlignment(ElemType));
+  }
+
+  if (GV->hasInitializer() && GVAlignment == 0) {
+    if (Alignment < 16) {
+      // If the global is not external, see if it is large.  If so, give it a
+      // larger alignment.
+      if (getTypeSizeInBits(ElemType) > 128)
+        Alignment = 16;    // 16-byte alignment.
+    }
+  }
+  return Alignment;
+}
+
+/// getPreferredAlignmentLog - Return the preferred alignment of the
+/// specified global, returned in log form.  This includes an explicitly
+/// requested alignment (if the global has one).
+unsigned DataLayout::getPreferredAlignmentLog(const GlobalVariable *GV) const {
+  return Log2_32(getPreferredAlignment(GV));
+}
diff --git a/contrib/llvm/lib/IR/DebugInfo.cpp b/contrib/llvm/lib/IR/DebugInfo.cpp
new file mode 100644
index 000000000000..0ffe99d70454
--- /dev/null
+++ b/contrib/llvm/lib/IR/DebugInfo.cpp
@@ -0,0 +1,1209 @@
+//===--- DebugInfo.cpp - Debug Information Helper Classes -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the helper classes used to build and interpret debug
+// information in LLVM IR form.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace llvm::dwarf;
+
+//===----------------------------------------------------------------------===//
+// DIDescriptor
+//===----------------------------------------------------------------------===//
+
+DIDescriptor::DIDescriptor(const DIFile F) : DbgNode(F.DbgNode) {
+}
+
+DIDescriptor::DIDescriptor(const DISubprogram F) : DbgNode(F.DbgNode) {
+}
+
+DIDescriptor::DIDescriptor(const DILexicalBlockFile F) : DbgNode(F.DbgNode) {
+}
+
+DIDescriptor::DIDescriptor(const DILexicalBlock F) : DbgNode(F.DbgNode) {
+}
+
+DIDescriptor::DIDescriptor(const DIVariable F) : DbgNode(F.DbgNode) {
+}
+
+DIDescriptor::DIDescriptor(const DIType F) : DbgNode(F.DbgNode) {
+}
+
+bool DIDescriptor::Verify() const {
+  return DbgNode &&
+         (DIDerivedType(DbgNode).Verify() ||
+          DICompositeType(DbgNode).Verify() || DIBasicType(DbgNode).Verify() ||
+          DIVariable(DbgNode).Verify() || DISubprogram(DbgNode).Verify() ||
+          DIGlobalVariable(DbgNode).Verify() || DIFile(DbgNode).Verify() ||
+          DICompileUnit(DbgNode).Verify() || DINameSpace(DbgNode).Verify() ||
+          DILexicalBlock(DbgNode).Verify() ||
+          DILexicalBlockFile(DbgNode).Verify() ||
+          DISubrange(DbgNode).Verify() || DIEnumerator(DbgNode).Verify() ||
+          DIObjCProperty(DbgNode).Verify() ||
+          DITemplateTypeParameter(DbgNode).Verify() ||
+          DITemplateValueParameter(DbgNode).Verify());
+}
+
+static Value *getField(const MDNode *DbgNode, unsigned Elt) {
+  if (DbgNode == 0 || Elt >= DbgNode->getNumOperands())
+    return 0;
+  return DbgNode->getOperand(Elt);
+}
+
+static const MDNode *getNodeField(const MDNode *DbgNode, unsigned Elt) {
+  if (const MDNode *R = dyn_cast_or_null<MDNode>(getField(DbgNode, Elt)))
+    return R;
+  return 0;
+}
+
+static StringRef getStringField(const MDNode *DbgNode, unsigned Elt) {
+  if (MDString *MDS = dyn_cast_or_null<MDString>(getField(DbgNode, Elt)))
+    return MDS->getString();
+  return StringRef();
+}
+
+StringRef DIDescriptor::getStringField(unsigned Elt) const {
+  return ::getStringField(DbgNode, Elt);
+}
+
+uint64_t DIDescriptor::getUInt64Field(unsigned Elt) const {
+  if (DbgNode == 0)
+    return 0;
+
+  if (Elt < DbgNode->getNumOperands())
+    if (ConstantInt *CI
+        = dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
+      return CI->getZExtValue();
+
+  return 0;
+}
+
+int64_t DIDescriptor::getInt64Field(unsigned Elt) const {
+  if (DbgNode == 0)
+    return 0;
+
+  if (Elt < DbgNode->getNumOperands())
+    if (ConstantInt *CI
+        = dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
+      return CI->getSExtValue();
+
+  return 0;
+}
+
+DIDescriptor DIDescriptor::getDescriptorField(unsigned Elt) const {
+  if (DbgNode == 0)
+    return DIDescriptor();
+
+  if (Elt < DbgNode->getNumOperands())
+    return
+      DIDescriptor(dyn_cast_or_null<const MDNode>(DbgNode->getOperand(Elt)));
+  return DIDescriptor();
+}
+
+GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const {
+  if (DbgNode == 0)
+    return 0;
+
+  if (Elt < DbgNode->getNumOperands())
+      return dyn_cast_or_null<GlobalVariable>(DbgNode->getOperand(Elt));
+  return 0;
+}
+
+Constant *DIDescriptor::getConstantField(unsigned Elt) const {
+  if (DbgNode == 0)
+    return 0;
+
+  if (Elt < DbgNode->getNumOperands())
+      return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt));
+  return 0;
+}
+
+Function *DIDescriptor::getFunctionField(unsigned Elt) const {
+  if (DbgNode == 0)
+    return 0;
+
+  if (Elt < DbgNode->getNumOperands())
+      return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt));
+  return 0;
+}
+
+void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) {
+  if (DbgNode == 0)
+    return;
+
+  if (Elt < DbgNode->getNumOperands()) {
+    MDNode *Node = const_cast<MDNode*>(DbgNode);
+    Node->replaceOperandWith(Elt, F);
+  }
+}
+
+unsigned DIVariable::getNumAddrElements() const {
+  return DbgNode->getNumOperands()-8;
+}
+
+/// getInlinedAt - If this variable is inlined then return inline location.
+MDNode *DIVariable::getInlinedAt() const {
+  return dyn_cast_or_null<MDNode>(DbgNode->getOperand(7));
+}
+
+//===----------------------------------------------------------------------===//
+// Predicates
+//===----------------------------------------------------------------------===//
+
+/// isBasicType - Return true if the specified tag is legal for
+/// DIBasicType.
+bool DIDescriptor::isBasicType() const {
+  if (!DbgNode) return false;
+  switch (getTag()) {
+  case dwarf::DW_TAG_base_type:
+  case dwarf::DW_TAG_unspecified_type:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isDerivedType - Return true if the specified tag is legal for DIDerivedType.
+bool DIDescriptor::isDerivedType() const {
+  if (!DbgNode) return false;
+  switch (getTag()) {
+  case dwarf::DW_TAG_typedef:
+  case dwarf::DW_TAG_pointer_type:
+  case dwarf::DW_TAG_ptr_to_member_type:
+  case dwarf::DW_TAG_reference_type:
+  case dwarf::DW_TAG_rvalue_reference_type:
+  case dwarf::DW_TAG_const_type:
+  case dwarf::DW_TAG_volatile_type:
+  case dwarf::DW_TAG_restrict_type:
+  case dwarf::DW_TAG_member:
+  case dwarf::DW_TAG_inheritance:
+  case dwarf::DW_TAG_friend:
+    return true;
+  default:
+    // CompositeTypes are currently modelled as DerivedTypes.
+    return isCompositeType();
+  }
+}
+
+/// isCompositeType - Return true if the specified tag is legal for
+/// DICompositeType.
+bool DIDescriptor::isCompositeType() const {
+  if (!DbgNode) return false;
+  switch (getTag()) {
+  case dwarf::DW_TAG_array_type:
+  case dwarf::DW_TAG_structure_type:
+  case dwarf::DW_TAG_union_type:
+  case dwarf::DW_TAG_enumeration_type:
+  case dwarf::DW_TAG_subroutine_type:
+  case dwarf::DW_TAG_class_type:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isVariable - Return true if the specified tag is legal for DIVariable.
+bool DIDescriptor::isVariable() const {
+  if (!DbgNode) return false;
+  switch (getTag()) {
+  case dwarf::DW_TAG_auto_variable:
+  case dwarf::DW_TAG_arg_variable:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isType - Return true if the specified tag is legal for DIType.
+bool DIDescriptor::isType() const {
+  return isBasicType() || isCompositeType() || isDerivedType();
+}
+
+/// isSubprogram - Return true if the specified tag is legal for
+/// DISubprogram.
+bool DIDescriptor::isSubprogram() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_subprogram;
+}
+
+/// isGlobalVariable - Return true if the specified tag is legal for
+/// DIGlobalVariable.
+bool DIDescriptor::isGlobalVariable() const {
+  return DbgNode && (getTag() == dwarf::DW_TAG_variable ||
+                     getTag() == dwarf::DW_TAG_constant);
+}
+
+/// isGlobal - Return true if the specified tag is legal for DIGlobal.
+bool DIDescriptor::isGlobal() const {
+  return isGlobalVariable();
+}
+
+/// isUnspecifiedParmeter - Return true if the specified tag is
+/// DW_TAG_unspecified_parameters.
+bool DIDescriptor::isUnspecifiedParameter() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_unspecified_parameters;
+}
+
+/// isScope - Return true if the specified tag is one of the scope
+/// related tag.
+bool DIDescriptor::isScope() const {
+  if (!DbgNode) return false;
+  switch (getTag()) {
+  case dwarf::DW_TAG_compile_unit:
+  case dwarf::DW_TAG_lexical_block:
+  case dwarf::DW_TAG_subprogram:
+  case dwarf::DW_TAG_namespace:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+
+/// isTemplateTypeParameter - Return true if the specified tag is
+/// DW_TAG_template_type_parameter.
+bool DIDescriptor::isTemplateTypeParameter() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_template_type_parameter;
+}
+
+/// isTemplateValueParameter - Return true if the specified tag is
+/// DW_TAG_template_value_parameter.
+bool DIDescriptor::isTemplateValueParameter() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_template_value_parameter;
+}
+
+/// isCompileUnit - Return true if the specified tag is DW_TAG_compile_unit.
+bool DIDescriptor::isCompileUnit() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_compile_unit;
+}
+
+/// isFile - Return true if the specified tag is DW_TAG_file_type.
+bool DIDescriptor::isFile() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_file_type;
+}
+
+/// isNameSpace - Return true if the specified tag is DW_TAG_namespace.
+bool DIDescriptor::isNameSpace() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_namespace;
+}
+
+/// isLexicalBlockFile - Return true if the specified descriptor is a
+/// lexical block with an extra file.
+bool DIDescriptor::isLexicalBlockFile() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
+    (DbgNode->getNumOperands() == 3);
+}
+
+/// isLexicalBlock - Return true if the specified tag is DW_TAG_lexical_block.
+bool DIDescriptor::isLexicalBlock() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
+    (DbgNode->getNumOperands() > 3);
+}
+
+/// isSubrange - Return true if the specified tag is DW_TAG_subrange_type.
+bool DIDescriptor::isSubrange() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_subrange_type;
+}
+
+/// isEnumerator - Return true if the specified tag is DW_TAG_enumerator.
+bool DIDescriptor::isEnumerator() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_enumerator;
+}
+
+/// isObjCProperty - Return true if the specified tag is DW_TAG_APPLE_property.
+bool DIDescriptor::isObjCProperty() const {
+  return DbgNode && getTag() == dwarf::DW_TAG_APPLE_property;
+}
+//===----------------------------------------------------------------------===//
+// Simple Descriptor Constructors and other Methods
+//===----------------------------------------------------------------------===//
+
+DIType::DIType(const MDNode *N) : DIScope(N) {
+  if (!N) return;
+  if (!isBasicType() && !isDerivedType() && !isCompositeType()) {
+    DbgNode = 0;
+  }
+}
+
+unsigned DIArray::getNumElements() const {
+  if (!DbgNode)
+    return 0;
+  return DbgNode->getNumOperands();
+}
+
+/// replaceAllUsesWith - Replace all uses of debug info referenced by
+/// this descriptor.
+void DIType::replaceAllUsesWith(DIDescriptor &D) {
+  if (!DbgNode)
+    return;
+
+  // Since we use a TrackingVH for the node, its easy for clients to manufacture
+  // legitimate situations where they want to replaceAllUsesWith() on something
+  // which, due to uniquing, has merged with the source. We shield clients from
+  // this detail by allowing a value to be replaced with replaceAllUsesWith()
+  // itself.
+  if (DbgNode != D) {
+    MDNode *Node = const_cast<MDNode*>(DbgNode);
+    const MDNode *DN = D;
+    const Value *V = cast_or_null<Value>(DN);
+    Node->replaceAllUsesWith(const_cast<Value*>(V));
+    MDNode::deleteTemporary(Node);
+  }
+}
+
+/// replaceAllUsesWith - Replace all uses of debug info referenced by
+/// this descriptor.
+void DIType::replaceAllUsesWith(MDNode *D) {
+  if (!DbgNode)
+    return;
+
+  // Since we use a TrackingVH for the node, its easy for clients to manufacture
+  // legitimate situations where they want to replaceAllUsesWith() on something
+  // which, due to uniquing, has merged with the source. We shield clients from
+  // this detail by allowing a value to be replaced with replaceAllUsesWith()
+  // itself.
+  if (DbgNode != D) {
+    MDNode *Node = const_cast<MDNode*>(DbgNode);
+    const MDNode *DN = D;
+    const Value *V = cast_or_null<Value>(DN);
+    Node->replaceAllUsesWith(const_cast<Value*>(V));
+    MDNode::deleteTemporary(Node);
+  }
+}
+
+/// isUnsignedDIType - Return true if type encoding is unsigned.
+bool DIType::isUnsignedDIType() {
+  DIDerivedType DTy(DbgNode);
+  if (DTy.Verify())
+    return DTy.getTypeDerivedFrom().isUnsignedDIType();
+
+  DIBasicType BTy(DbgNode);
+  if (BTy.Verify()) {
+    unsigned Encoding = BTy.getEncoding();
+    if (Encoding == dwarf::DW_ATE_unsigned ||
+        Encoding == dwarf::DW_ATE_unsigned_char ||
+        Encoding == dwarf::DW_ATE_boolean)
+      return true;
+  }
+  return false;
+}
+
+/// Verify - Verify that a compile unit is well formed.
+bool DICompileUnit::Verify() const {
+  if (!isCompileUnit())
+    return false;
+  StringRef N = getFilename();
+  if (N.empty())
+    return false;
+  // It is possible that directory and produce string is empty.
+  return DbgNode->getNumOperands() == 12;
+}
+
+/// Verify - Verify that an ObjC property is well formed.
+bool DIObjCProperty::Verify() const {
+  if (!isObjCProperty())
+    return false;
+
+  DIType Ty = getType();
+  if (!Ty.Verify()) return false;
+
+  // Don't worry about the rest of the strings for now.
+  return DbgNode->getNumOperands() == 8;
+}
+
+/// Verify - Verify that a type descriptor is well formed.
+bool DIType::Verify() const {
+  if (!isType())
+    return false;
+  if (getContext() && !getContext().Verify())
+    return false;
+  unsigned Tag = getTag();
+  if (!isBasicType() && Tag != dwarf::DW_TAG_const_type &&
+      Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_pointer_type &&
+      Tag != dwarf::DW_TAG_ptr_to_member_type &&
+      Tag != dwarf::DW_TAG_reference_type &&
+      Tag != dwarf::DW_TAG_rvalue_reference_type &&
+      Tag != dwarf::DW_TAG_restrict_type &&
+      Tag != dwarf::DW_TAG_array_type &&
+      Tag != dwarf::DW_TAG_enumeration_type &&
+      Tag != dwarf::DW_TAG_subroutine_type &&
+      getFilename().empty())
+    return false;
+  return true;
+}
+
+/// Verify - Verify that a basic type descriptor is well formed.
+bool DIBasicType::Verify() const {
+  return isBasicType() && DbgNode->getNumOperands() == 10;
+}
+
+/// Verify - Verify that a derived type descriptor is well formed.
+bool DIDerivedType::Verify() const {
+  return isDerivedType() && DbgNode->getNumOperands() >= 10 &&
+         DbgNode->getNumOperands() <= 14;
+}
+
+/// Verify - Verify that a composite type descriptor is well formed.
+bool DICompositeType::Verify() const {
+  if (!isCompositeType())
+    return false;
+  if (getContext() && !getContext().Verify())
+    return false;
+
+  return DbgNode->getNumOperands() >= 10 && DbgNode->getNumOperands() <= 14;
+}
+
+/// Verify - Verify that a subprogram descriptor is well formed.
+bool DISubprogram::Verify() const {
+  if (!isSubprogram())
+    return false;
+
+  if (getContext() && !getContext().Verify())
+    return false;
+
+  DICompositeType Ty = getType();
+  if (!Ty.Verify())
+    return false;
+  return DbgNode->getNumOperands() == 20;
+}
+
+/// Verify - Verify that a global variable descriptor is well formed.
+bool DIGlobalVariable::Verify() const {
+  if (!isGlobalVariable())
+    return false;
+
+  if (getDisplayName().empty())
+    return false;
+
+  if (getContext() && !getContext().Verify())
+    return false;
+
+  DIType Ty = getType();
+  if (!Ty.Verify())
+    return false;
+
+  if (!getGlobal() && !getConstant())
+    return false;
+
+  return DbgNode->getNumOperands() == 13;
+}
+
+/// Verify - Verify that a variable descriptor is well formed.
+bool DIVariable::Verify() const {
+  if (!isVariable())
+    return false;
+
+  if (getContext() && !getContext().Verify())
+    return false;
+
+  DIType Ty = getType();
+  if (!Ty.Verify())
+    return false;
+
+  return DbgNode->getNumOperands() >= 8;
+}
+
+/// Verify - Verify that a location descriptor is well formed.
+bool DILocation::Verify() const {
+  if (!DbgNode)
+    return false;
+
+  return DbgNode->getNumOperands() == 4;
+}
+
+/// Verify - Verify that a namespace descriptor is well formed.
+bool DINameSpace::Verify() const {
+  if (!isNameSpace())
+    return false;
+  return DbgNode->getNumOperands() == 5;
+}
+
+/// \brief Retrieve the MDNode for the directory/file pair.
+MDNode *DIFile::getFileNode() const {
+  return const_cast<MDNode*>(getNodeField(DbgNode, 1));
+}
+
+/// \brief Verify that the file descriptor is well formed.
+bool DIFile::Verify() const {
+  return isFile() && DbgNode->getNumOperands() == 2;
+}
+
+/// \brief Verify that the enumerator descriptor is well formed.
+bool DIEnumerator::Verify() const {
+  return isEnumerator() && DbgNode->getNumOperands() == 3;
+}
+
+/// \brief Verify that the subrange descriptor is well formed.
+bool DISubrange::Verify() const {
+  return isSubrange() && DbgNode->getNumOperands() == 3;
+}
+
+/// \brief Verify that the lexical block descriptor is well formed.
+bool DILexicalBlock::Verify() const {
+  return isLexicalBlock() && DbgNode->getNumOperands() == 6;
+}
+
+/// \brief Verify that the file-scoped lexical block descriptor is well formed.
+bool DILexicalBlockFile::Verify() const {
+  return isLexicalBlockFile() && DbgNode->getNumOperands() == 3;
+}
+
+/// \brief Verify that the template type parameter descriptor is well formed.
+bool DITemplateTypeParameter::Verify() const {
+  return isTemplateTypeParameter() && DbgNode->getNumOperands() == 7;
+}
+
+/// \brief Verify that the template value parameter descriptor is well formed.
+bool DITemplateValueParameter::Verify() const {
+  return isTemplateValueParameter() && DbgNode->getNumOperands() == 8;
+}
+
+/// getOriginalTypeSize - If this type is derived from a base type then
+/// return base type size.
+uint64_t DIDerivedType::getOriginalTypeSize() const {
+  unsigned Tag = getTag();
+
+  if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
+      Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
+      Tag != dwarf::DW_TAG_restrict_type)
+    return getSizeInBits();
+
+  DIType BaseType = getTypeDerivedFrom();
+
+  // If this type is not derived from any type then take conservative approach.
+  if (!BaseType.isValid())
+    return getSizeInBits();
+
+  // If this is a derived type, go ahead and get the base type, unless it's a
+  // reference then it's just the size of the field. Pointer types have no need
+  // of this since they're a different type of qualification on the type.
+  if (BaseType.getTag() == dwarf::DW_TAG_reference_type ||
+      BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type)
+    return getSizeInBits();
+
+  if (BaseType.isDerivedType())
+    return DIDerivedType(BaseType).getOriginalTypeSize();
+
+  return BaseType.getSizeInBits();
+}
+
+/// getObjCProperty - Return property node, if this ivar is associated with one.
+MDNode *DIDerivedType::getObjCProperty() const {
+  if (DbgNode->getNumOperands() <= 10)
+    return NULL;
+  return dyn_cast_or_null<MDNode>(DbgNode->getOperand(10));
+}
+
+/// \brief Set the array of member DITypes.
+void DICompositeType::setTypeArray(DIArray Elements, DIArray TParams) {
+  assert((!TParams || DbgNode->getNumOperands() == 14) &&
+         "If you're setting the template parameters this should include a slot "
+         "for that!");
+  TrackingVH<MDNode> N(*this);
+  N->replaceOperandWith(10, Elements);
+  if (TParams)
+    N->replaceOperandWith(13, TParams);
+  DbgNode = N;
+}
+
+/// \brief Set the containing type.
+void DICompositeType::setContainingType(DICompositeType ContainingType) {
+  TrackingVH<MDNode> N(*this);
+  N->replaceOperandWith(12, ContainingType);
+  DbgNode = N;
+}
+
+/// isInlinedFnArgument - Return true if this variable provides debugging
+/// information for an inlined function arguments.
+bool DIVariable::isInlinedFnArgument(const Function *CurFn) {
+  assert(CurFn && "Invalid function");
+  if (!getContext().isSubprogram())
+    return false;
+  // This variable is not inlined function argument if its scope
+  // does not describe current function.
+  return !DISubprogram(getContext()).describes(CurFn);
+}
+
+/// describes - Return true if this subprogram provides debugging
+/// information for the function F.
+bool DISubprogram::describes(const Function *F) {
+  assert(F && "Invalid function");
+  if (F == getFunction())
+    return true;
+  StringRef Name = getLinkageName();
+  if (Name.empty())
+    Name = getName();
+  if (F->getName() == Name)
+    return true;
+  return false;
+}
+
+unsigned DISubprogram::isOptimized() const {
+  assert (DbgNode && "Invalid subprogram descriptor!");
+  if (DbgNode->getNumOperands() == 15)
+    return getUnsignedField(14);
+  return 0;
+}
+
+MDNode *DISubprogram::getVariablesNodes() const {
+  if (!DbgNode || DbgNode->getNumOperands() <= 18)
+    return NULL;
+  return dyn_cast_or_null<MDNode>(DbgNode->getOperand(18));
+}
+
+DIArray DISubprogram::getVariables() const {
+  if (!DbgNode || DbgNode->getNumOperands() <= 18)
+    return DIArray();
+  if (MDNode *T = dyn_cast_or_null<MDNode>(DbgNode->getOperand(18)))
+    return DIArray(T);
+  return DIArray();
+}
+
+StringRef DIScope::getFilename() const {
+  if (!DbgNode)
+    return StringRef();
+  return ::getStringField(getNodeField(DbgNode, 1), 0);
+}
+
+StringRef DIScope::getDirectory() const {
+  if (!DbgNode)
+    return StringRef();
+  return ::getStringField(getNodeField(DbgNode, 1), 1);
+}
+
+DIArray DICompileUnit::getEnumTypes() const {
+  if (!DbgNode || DbgNode->getNumOperands() < 12)
+    return DIArray();
+
+  if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(7)))
+    return DIArray(N);
+  return DIArray();
+}
+
+DIArray DICompileUnit::getRetainedTypes() const {
+  if (!DbgNode || DbgNode->getNumOperands() < 12)
+    return DIArray();
+
+  if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(8)))
+    return DIArray(N);
+  return DIArray();
+}
+
+DIArray DICompileUnit::getSubprograms() const {
+  if (!DbgNode || DbgNode->getNumOperands() < 12)
+    return DIArray();
+
+  if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(9)))
+    return DIArray(N);
+  return DIArray();
+}
+
+
+DIArray DICompileUnit::getGlobalVariables() const {
+  if (!DbgNode || DbgNode->getNumOperands() < 12)
+    return DIArray();
+
+  if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(10)))
+    return DIArray(N);
+  return DIArray();
+}
+
+/// fixupObjcLikeName - Replace contains special characters used
+/// in a typical Objective-C names with '.' in a given string.
+static void fixupObjcLikeName(StringRef Str, SmallVectorImpl<char> &Out) {
+  bool isObjCLike = false;
+  for (size_t i = 0, e = Str.size(); i < e; ++i) {
+    char C = Str[i];
+    if (C == '[')
+      isObjCLike = true;
+
+    if (isObjCLike && (C == '[' || C == ']' || C == ' ' || C == ':' ||
+                       C == '+' || C == '(' || C == ')'))
+      Out.push_back('.');
+    else
+      Out.push_back(C);
+  }
+}
+
+/// getFnSpecificMDNode - Return a NameMDNode, if available, that is
+/// suitable to hold function specific information.
+NamedMDNode *llvm::getFnSpecificMDNode(const Module &M, DISubprogram Fn) {
+  SmallString<32> Name = StringRef("llvm.dbg.lv.");
+  StringRef FName = "fn";
+  if (Fn.getFunction())
+    FName = Fn.getFunction()->getName();
+  else
+    FName = Fn.getName();
+  char One = '\1';
+  if (FName.startswith(StringRef(&One, 1)))
+    FName = FName.substr(1);
+  fixupObjcLikeName(FName, Name);
+  return M.getNamedMetadata(Name.str());
+}
+
+/// getOrInsertFnSpecificMDNode - Return a NameMDNode that is suitable
+/// to hold function specific information.
+NamedMDNode *llvm::getOrInsertFnSpecificMDNode(Module &M, DISubprogram Fn) {
+  SmallString<32> Name = StringRef("llvm.dbg.lv.");
+  StringRef FName = "fn";
+  if (Fn.getFunction())
+    FName = Fn.getFunction()->getName();
+  else
+    FName = Fn.getName();
+  char One = '\1';
+  if (FName.startswith(StringRef(&One, 1)))
+    FName = FName.substr(1);
+  fixupObjcLikeName(FName, Name);
+
+  return M.getOrInsertNamedMetadata(Name.str());
+}
+
+/// createInlinedVariable - Create a new inlined variable based on current
+/// variable.
+/// @param DV            Current Variable.
+/// @param InlinedScope  Location at current variable is inlined.
+DIVariable llvm::createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
+                                       LLVMContext &VMContext) {
+  SmallVector<Value *, 16> Elts;
+  // Insert inlined scope as 7th element.
+  for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
+    i == 7 ? Elts.push_back(InlinedScope) :
+             Elts.push_back(DV->getOperand(i));
+  return DIVariable(MDNode::get(VMContext, Elts));
+}
+
+/// cleanseInlinedVariable - Remove inlined scope from the variable.
+DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) {
+  SmallVector<Value *, 16> Elts;
+  // Insert inlined scope as 7th element.
+  for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
+    i == 7 ?
+      Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))):
+      Elts.push_back(DV->getOperand(i));
+  return DIVariable(MDNode::get(VMContext, Elts));
+}
+
+/// getDISubprogram - Find subprogram that is enclosing this scope.
+DISubprogram llvm::getDISubprogram(const MDNode *Scope) {
+  DIDescriptor D(Scope);
+  if (D.isSubprogram())
+    return DISubprogram(Scope);
+
+  if (D.isLexicalBlockFile())
+    return getDISubprogram(DILexicalBlockFile(Scope).getContext());
+
+  if (D.isLexicalBlock())
+    return getDISubprogram(DILexicalBlock(Scope).getContext());
+
+  return DISubprogram();
+}
+
+/// getDICompositeType - Find underlying composite type.
+DICompositeType llvm::getDICompositeType(DIType T) {
+  if (T.isCompositeType())
+    return DICompositeType(T);
+
+  if (T.isDerivedType())
+    return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom());
+
+  return DICompositeType();
+}
+
+/// isSubprogramContext - Return true if Context is either a subprogram
+/// or another context nested inside a subprogram.
+bool llvm::isSubprogramContext(const MDNode *Context) {
+  if (!Context)
+    return false;
+  DIDescriptor D(Context);
+  if (D.isSubprogram())
+    return true;
+  if (D.isType())
+    return isSubprogramContext(DIType(Context).getContext());
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// DebugInfoFinder implementations.
+//===----------------------------------------------------------------------===//
+
+/// processModule - Process entire module and collect debug info.
+void DebugInfoFinder::processModule(const Module &M) {
+  if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
+    for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+      DICompileUnit CU(CU_Nodes->getOperand(i));
+      addCompileUnit(CU);
+      DIArray GVs = CU.getGlobalVariables();
+      for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) {
+        DIGlobalVariable DIG(GVs.getElement(i));
+        if (addGlobalVariable(DIG))
+          processType(DIG.getType());
+      }
+      DIArray SPs = CU.getSubprograms();
+      for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i)
+        processSubprogram(DISubprogram(SPs.getElement(i)));
+      DIArray EnumTypes = CU.getEnumTypes();
+      for (unsigned i = 0, e = EnumTypes.getNumElements(); i != e; ++i)
+        processType(DIType(EnumTypes.getElement(i)));
+      DIArray RetainedTypes = CU.getRetainedTypes();
+      for (unsigned i = 0, e = RetainedTypes.getNumElements(); i != e; ++i)
+        processType(DIType(RetainedTypes.getElement(i)));
+      // FIXME: We really shouldn't be bailing out after visiting just one CU
+      return;
+    }
+  }
+}
+
+/// processLocation - Process DILocation.
+void DebugInfoFinder::processLocation(DILocation Loc) {
+  if (!Loc.Verify()) return;
+  DIDescriptor S(Loc.getScope());
+  if (S.isCompileUnit())
+    addCompileUnit(DICompileUnit(S));
+  else if (S.isSubprogram())
+    processSubprogram(DISubprogram(S));
+  else if (S.isLexicalBlock())
+    processLexicalBlock(DILexicalBlock(S));
+  else if (S.isLexicalBlockFile()) {
+    DILexicalBlockFile DBF = DILexicalBlockFile(S);
+    processLexicalBlock(DILexicalBlock(DBF.getScope()));
+  }
+  processLocation(Loc.getOrigLocation());
+}
+
+/// processType - Process DIType.
+void DebugInfoFinder::processType(DIType DT) {
+  if (!addType(DT))
+    return;
+  if (DT.isCompositeType()) {
+    DICompositeType DCT(DT);
+    processType(DCT.getTypeDerivedFrom());
+    DIArray DA = DCT.getTypeArray();
+    for (unsigned i = 0, e = DA.getNumElements(); i != e; ++i) {
+      DIDescriptor D = DA.getElement(i);
+      if (D.isType())
+        processType(DIType(D));
+      else if (D.isSubprogram())
+        processSubprogram(DISubprogram(D));
+    }
+  } else if (DT.isDerivedType()) {
+    DIDerivedType DDT(DT);
+    processType(DDT.getTypeDerivedFrom());
+  }
+}
+
+/// processLexicalBlock
+void DebugInfoFinder::processLexicalBlock(DILexicalBlock LB) {
+  DIScope Context = LB.getContext();
+  if (Context.isLexicalBlock())
+    return processLexicalBlock(DILexicalBlock(Context));
+  else if (Context.isLexicalBlockFile()) {
+    DILexicalBlockFile DBF = DILexicalBlockFile(Context);
+    return processLexicalBlock(DILexicalBlock(DBF.getScope()));
+  }
+  else
+    return processSubprogram(DISubprogram(Context));
+}
+
+/// processSubprogram - Process DISubprogram.
+void DebugInfoFinder::processSubprogram(DISubprogram SP) {
+  if (!addSubprogram(SP))
+    return;
+  processType(SP.getType());
+}
+
+/// processDeclare - Process DbgDeclareInst.
+void DebugInfoFinder::processDeclare(const DbgDeclareInst *DDI) {
+  MDNode *N = dyn_cast<MDNode>(DDI->getVariable());
+  if (!N) return;
+
+  DIDescriptor DV(N);
+  if (!DV.isVariable())
+    return;
+
+  if (!NodesSeen.insert(DV))
+    return;
+  processType(DIVariable(N).getType());
+}
+
+/// addType - Add type into Tys.
+bool DebugInfoFinder::addType(DIType DT) {
+  if (!DT.isValid())
+    return false;
+
+  if (!NodesSeen.insert(DT))
+    return false;
+
+  TYs.push_back(DT);
+  return true;
+}
+
+/// addCompileUnit - Add compile unit into CUs.
+bool DebugInfoFinder::addCompileUnit(DICompileUnit CU) {
+  if (!CU.Verify())
+    return false;
+
+  if (!NodesSeen.insert(CU))
+    return false;
+
+  CUs.push_back(CU);
+  return true;
+}
+
+/// addGlobalVariable - Add global variable into GVs.
+bool DebugInfoFinder::addGlobalVariable(DIGlobalVariable DIG) {
+  if (!DIDescriptor(DIG).isGlobalVariable())
+    return false;
+
+  if (!NodesSeen.insert(DIG))
+    return false;
+
+  GVs.push_back(DIG);
+  return true;
+}
+
+// addSubprogram - Add subprgoram into SPs.
+bool DebugInfoFinder::addSubprogram(DISubprogram SP) {
+  if (!DIDescriptor(SP).isSubprogram())
+    return false;
+
+  if (!NodesSeen.insert(SP))
+    return false;
+
+  SPs.push_back(SP);
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// DIDescriptor: dump routines for all descriptors.
+//===----------------------------------------------------------------------===//
+
+/// dump - Print descriptor to dbgs() with a newline.
+void DIDescriptor::dump() const {
+  print(dbgs()); dbgs() << '\n';
+}
+
+/// print - Print descriptor.
+void DIDescriptor::print(raw_ostream &OS) const {
+  if (!DbgNode) return;
+
+  if (const char *Tag = dwarf::TagString(getTag()))
+    OS << "[ " << Tag << " ]";
+
+  if (this->isSubrange()) {
+    DISubrange(DbgNode).printInternal(OS);
+  } else if (this->isCompileUnit()) {
+    DICompileUnit(DbgNode).printInternal(OS);
+  } else if (this->isFile()) {
+    DIFile(DbgNode).printInternal(OS);
+  } else if (this->isEnumerator()) {
+    DIEnumerator(DbgNode).printInternal(OS);
+  } else if (this->isBasicType()) {
+    DIType(DbgNode).printInternal(OS);
+  } else if (this->isDerivedType()) {
+    DIDerivedType(DbgNode).printInternal(OS);
+  } else if (this->isCompositeType()) {
+    DICompositeType(DbgNode).printInternal(OS);
+  } else if (this->isSubprogram()) {
+    DISubprogram(DbgNode).printInternal(OS);
+  } else if (this->isGlobalVariable()) {
+    DIGlobalVariable(DbgNode).printInternal(OS);
+  } else if (this->isVariable()) {
+    DIVariable(DbgNode).printInternal(OS);
+  } else if (this->isObjCProperty()) {
+    DIObjCProperty(DbgNode).printInternal(OS);
+  } else if (this->isNameSpace()) {
+    DINameSpace(DbgNode).printInternal(OS);
+  } else if (this->isScope()) {
+    DIScope(DbgNode).printInternal(OS);
+  }
+}
+
+void DISubrange::printInternal(raw_ostream &OS) const {
+  int64_t Count = getCount();
+  if (Count != -1)
+    OS << " [" << getLo() << ", " << Count - 1 << ']';
+  else
+    OS << " [unbounded]";
+}
+
+void DIScope::printInternal(raw_ostream &OS) const {
+  OS << " [" << getDirectory() << "/" << getFilename() << ']';
+}
+
+void DICompileUnit::printInternal(raw_ostream &OS) const {
+  DIScope::printInternal(OS);
+  if (const char *Lang = dwarf::LanguageString(getLanguage()))
+    OS << " [" << Lang << ']';
+}
+
+void DIEnumerator::printInternal(raw_ostream &OS) const {
+  OS << " [" << getName() << " :: " << getEnumValue() << ']';
+}
+
+void DIType::printInternal(raw_ostream &OS) const {
+  if (!DbgNode) return;
+
+  StringRef Res = getName();
+  if (!Res.empty())
+    OS << " [" << Res << "]";
+
+  // TODO: Print context?
+
+  OS << " [line " << getLineNumber()
+     << ", size " << getSizeInBits()
+     << ", align " << getAlignInBits()
+     << ", offset " << getOffsetInBits();
+  if (isBasicType())
+    if (const char *Enc =
+        dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding()))
+      OS << ", enc " << Enc;
+  OS << "]";
+
+  if (isPrivate())
+    OS << " [private]";
+  else if (isProtected())
+    OS << " [protected]";
+
+  if (isArtificial())
+    OS << " [artificial]";
+
+  if (isForwardDecl())
+    OS << " [fwd]";
+  if (isVector())
+    OS << " [vector]";
+  if (isStaticMember())
+    OS << " [static]";
+}
+
+void DIDerivedType::printInternal(raw_ostream &OS) const {
+  DIType::printInternal(OS);
+  OS << " [from " << getTypeDerivedFrom().getName() << ']';
+}
+
+void DICompositeType::printInternal(raw_ostream &OS) const {
+  DIType::printInternal(OS);
+  DIArray A = getTypeArray();
+  OS << " [" << A.getNumElements() << " elements]";
+}
+
+void DINameSpace::printInternal(raw_ostream &OS) const {
+  StringRef Name = getName();
+  if (!Name.empty())
+    OS << " [" << Name << ']';
+
+  OS << " [line " << getLineNumber() << ']';
+}
+
+void DISubprogram::printInternal(raw_ostream &OS) const {
+  // TODO : Print context
+  OS << " [line " << getLineNumber() << ']';
+
+  if (isLocalToUnit())
+    OS << " [local]";
+
+  if (isDefinition())
+    OS << " [def]";
+
+  if (getScopeLineNumber() != getLineNumber())
+    OS << " [scope " << getScopeLineNumber() << "]";
+
+  if (isPrivate())
+    OS << " [private]";
+  else if (isProtected())
+    OS << " [protected]";
+
+  StringRef Res = getName();
+  if (!Res.empty())
+    OS << " [" << Res << ']';
+}
+
+void DIGlobalVariable::printInternal(raw_ostream &OS) const {
+  StringRef Res = getName();
+  if (!Res.empty())
+    OS << " [" << Res << ']';
+
+  OS << " [line " << getLineNumber() << ']';
+
+  // TODO : Print context
+
+  if (isLocalToUnit())
+    OS << " [local]";
+
+  if (isDefinition())
+    OS << " [def]";
+}
+
+void DIVariable::printInternal(raw_ostream &OS) const {
+  StringRef Res = getName();
+  if (!Res.empty())
+    OS << " [" << Res << ']';
+
+  OS << " [line " << getLineNumber() << ']';
+}
+
+void DIObjCProperty::printInternal(raw_ostream &OS) const {
+  StringRef Name = getObjCPropertyName();
+  if (!Name.empty())
+    OS << " [" << Name << ']';
+
+  OS << " [line " << getLineNumber()
+     << ", properties " << getUnsignedField(6) << ']';
+}
+
+static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS,
+                          const LLVMContext &Ctx) {
+  if (!DL.isUnknown()) {          // Print source line info.
+    DIScope Scope(DL.getScope(Ctx));
+    // Omit the directory, because it's likely to be long and uninteresting.
+    if (Scope.Verify())
+      CommentOS << Scope.getFilename();
+    else
+      CommentOS << "<unknown>";
+    CommentOS << ':' << DL.getLine();
+    if (DL.getCol() != 0)
+      CommentOS << ':' << DL.getCol();
+    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx));
+    if (!InlinedAtDL.isUnknown()) {
+      CommentOS << " @[ ";
+      printDebugLoc(InlinedAtDL, CommentOS, Ctx);
+      CommentOS << " ]";
+    }
+  }
+}
+
+void DIVariable::printExtendedName(raw_ostream &OS) const {
+  const LLVMContext &Ctx = DbgNode->getContext();
+  StringRef Res = getName();
+  if (!Res.empty())
+    OS << Res << "," << getLineNumber();
+  if (MDNode *InlinedAt = getInlinedAt()) {
+    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt);
+    if (!InlinedAtDL.isUnknown()) {
+      OS << " @[";
+      printDebugLoc(InlinedAtDL, OS, Ctx);
+      OS << "]";
+    }
+  }
+}
diff --git a/contrib/llvm/lib/IR/DebugLoc.cpp b/contrib/llvm/lib/IR/DebugLoc.cpp
new file mode 100644
index 000000000000..c57b5a305303
--- /dev/null
+++ b/contrib/llvm/lib/IR/DebugLoc.cpp
@@ -0,0 +1,315 @@
+//===-- DebugLoc.cpp - Implement DebugLoc class ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DebugLoc.h"
+#include "LLVMContextImpl.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/DebugInfo.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// DebugLoc Implementation
+//===----------------------------------------------------------------------===//
+
+MDNode *DebugLoc::getScope(const LLVMContext &Ctx) const {
+  if (ScopeIdx == 0) return 0;
+  
+  if (ScopeIdx > 0) {
+    // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at
+    // position specified.
+    assert(unsigned(ScopeIdx) <= Ctx.pImpl->ScopeRecords.size() &&
+           "Invalid ScopeIdx!");
+    return Ctx.pImpl->ScopeRecords[ScopeIdx-1].get();
+  }
+  
+  // Otherwise, the index is in the ScopeInlinedAtRecords array.
+  assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() &&
+         "Invalid ScopeIdx");
+  return Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].first.get();
+}
+
+MDNode *DebugLoc::getInlinedAt(const LLVMContext &Ctx) const {
+  // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at
+  // position specified.  Zero is invalid.
+  if (ScopeIdx >= 0) return 0;
+  
+  // Otherwise, the index is in the ScopeInlinedAtRecords array.
+  assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() &&
+         "Invalid ScopeIdx");
+  return Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].second.get();
+}
+
+/// Return both the Scope and the InlinedAt values.
+void DebugLoc::getScopeAndInlinedAt(MDNode *&Scope, MDNode *&IA,
+                                    const LLVMContext &Ctx) const {
+  if (ScopeIdx == 0) {
+    Scope = IA = 0;
+    return;
+  }
+  
+  if (ScopeIdx > 0) {
+    // Positive ScopeIdx is an index into ScopeRecords, which has no inlined-at
+    // position specified.
+    assert(unsigned(ScopeIdx) <= Ctx.pImpl->ScopeRecords.size() &&
+           "Invalid ScopeIdx!");
+    Scope = Ctx.pImpl->ScopeRecords[ScopeIdx-1].get();
+    IA = 0;
+    return;
+  }
+  
+  // Otherwise, the index is in the ScopeInlinedAtRecords array.
+  assert(unsigned(-ScopeIdx) <= Ctx.pImpl->ScopeInlinedAtRecords.size() &&
+         "Invalid ScopeIdx");
+  Scope = Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].first.get();
+  IA    = Ctx.pImpl->ScopeInlinedAtRecords[-ScopeIdx-1].second.get();
+}
+
+
+DebugLoc DebugLoc::get(unsigned Line, unsigned Col,
+                       MDNode *Scope, MDNode *InlinedAt) {
+  DebugLoc Result;
+  
+  // If no scope is available, this is an unknown location.
+  if (Scope == 0) return Result;
+  
+  // Saturate line and col to "unknown".
+  if (Col > 255) Col = 0;
+  if (Line >= (1 << 24)) Line = 0;
+  Result.LineCol = Line | (Col << 24);
+  
+  LLVMContext &Ctx = Scope->getContext();
+  
+  // If there is no inlined-at location, use the ScopeRecords array.
+  if (InlinedAt == 0)
+    Result.ScopeIdx = Ctx.pImpl->getOrAddScopeRecordIdxEntry(Scope, 0);
+  else
+    Result.ScopeIdx = Ctx.pImpl->getOrAddScopeInlinedAtIdxEntry(Scope,
+                                                                InlinedAt, 0);
+
+  return Result;
+}
+
+/// getAsMDNode - This method converts the compressed DebugLoc node into a
+/// DILocation compatible MDNode.
+MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const {
+  if (isUnknown()) return 0;
+  
+  MDNode *Scope, *IA;
+  getScopeAndInlinedAt(Scope, IA, Ctx);
+  assert(Scope && "If scope is null, this should be isUnknown()");
+  
+  LLVMContext &Ctx2 = Scope->getContext();
+  Type *Int32 = Type::getInt32Ty(Ctx2);
+  Value *Elts[] = {
+    ConstantInt::get(Int32, getLine()), ConstantInt::get(Int32, getCol()),
+    Scope, IA
+  };
+  return MDNode::get(Ctx2, Elts);
+}
+
+/// getFromDILocation - Translate the DILocation quad into a DebugLoc.
+DebugLoc DebugLoc::getFromDILocation(MDNode *N) {
+  DILocation Loc(N);
+  MDNode *Scope = Loc.getScope();
+  if (Scope == 0) return DebugLoc();
+  return get(Loc.getLineNumber(), Loc.getColumnNumber(), Scope,
+             Loc.getOrigLocation());
+}
+
+/// getFromDILexicalBlock - Translate the DILexicalBlock into a DebugLoc.
+DebugLoc DebugLoc::getFromDILexicalBlock(MDNode *N) {
+  DILexicalBlock LexBlock(N);
+  MDNode *Scope = LexBlock.getContext();
+  if (Scope == 0) return DebugLoc();
+  return get(LexBlock.getLineNumber(), LexBlock.getColumnNumber(), Scope, NULL);
+}
+
+void DebugLoc::dump(const LLVMContext &Ctx) const {
+#ifndef NDEBUG
+  if (!isUnknown()) {
+    dbgs() << getLine();
+    if (getCol() != 0)
+      dbgs() << ',' << getCol();
+    DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(getInlinedAt(Ctx));
+    if (!InlinedAtDL.isUnknown()) {
+      dbgs() << " @ ";
+      InlinedAtDL.dump(Ctx);
+    } else
+      dbgs() << "\n";
+  }
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// DenseMap specialization
+//===----------------------------------------------------------------------===//
+
+unsigned DenseMapInfo<DebugLoc>::getHashValue(const DebugLoc &Key) {
+  return static_cast<unsigned>(hash_combine(Key.LineCol, Key.ScopeIdx));
+}
+
+//===----------------------------------------------------------------------===//
+// LLVMContextImpl Implementation
+//===----------------------------------------------------------------------===//
+
+int LLVMContextImpl::getOrAddScopeRecordIdxEntry(MDNode *Scope,
+                                                 int ExistingIdx) {
+  // If we already have an entry for this scope, return it.
+  int &Idx = ScopeRecordIdx[Scope];
+  if (Idx) return Idx;
+  
+  // If we don't have an entry, but ExistingIdx is specified, use it.
+  if (ExistingIdx)
+    return Idx = ExistingIdx;
+  
+  // Otherwise add a new entry.
+  
+  // Start out ScopeRecords with a minimal reasonable size to avoid
+  // excessive reallocation starting out.
+  if (ScopeRecords.empty())
+    ScopeRecords.reserve(128);
+  
+  // Index is biased by 1 for index.
+  Idx = ScopeRecords.size()+1;
+  ScopeRecords.push_back(DebugRecVH(Scope, this, Idx));
+  return Idx;
+}
+
+int LLVMContextImpl::getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,
+                                                    int ExistingIdx) {
+  // If we already have an entry, return it.
+  int &Idx = ScopeInlinedAtIdx[std::make_pair(Scope, IA)];
+  if (Idx) return Idx;
+  
+  // If we don't have an entry, but ExistingIdx is specified, use it.
+  if (ExistingIdx)
+    return Idx = ExistingIdx;
+  
+  // Start out ScopeInlinedAtRecords with a minimal reasonable size to avoid
+  // excessive reallocation starting out.
+  if (ScopeInlinedAtRecords.empty())
+    ScopeInlinedAtRecords.reserve(128);
+    
+  // Index is biased by 1 and negated.
+  Idx = -ScopeInlinedAtRecords.size()-1;
+  ScopeInlinedAtRecords.push_back(std::make_pair(DebugRecVH(Scope, this, Idx),
+                                                 DebugRecVH(IA, this, Idx)));
+  return Idx;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DebugRecVH Implementation
+//===----------------------------------------------------------------------===//
+
+/// deleted - The MDNode this is pointing to got deleted, so this pointer needs
+/// to drop to null and we need remove our entry from the DenseMap.
+void DebugRecVH::deleted() {
+  // If this is a non-canonical reference, just drop the value to null, we know
+  // it doesn't have a map entry.
+  if (Idx == 0) {
+    setValPtr(0);
+    return;
+  }
+    
+  MDNode *Cur = get();
+  
+  // If the index is positive, it is an entry in ScopeRecords.
+  if (Idx > 0) {
+    assert(Ctx->ScopeRecordIdx[Cur] == Idx && "Mapping out of date!");
+    Ctx->ScopeRecordIdx.erase(Cur);
+    // Reset this VH to null and we're done.
+    setValPtr(0);
+    Idx = 0;
+    return;
+  }
+  
+  // Otherwise, it is an entry in ScopeInlinedAtRecords, we don't know if it
+  // is the scope or the inlined-at record entry.
+  assert(unsigned(-Idx-1) < Ctx->ScopeInlinedAtRecords.size());
+  std::pair<DebugRecVH, DebugRecVH> &Entry = Ctx->ScopeInlinedAtRecords[-Idx-1];
+  assert((this == &Entry.first || this == &Entry.second) &&
+         "Mapping out of date!");
+  
+  MDNode *OldScope = Entry.first.get();
+  MDNode *OldInlinedAt = Entry.second.get();
+  assert(OldScope != 0 && OldInlinedAt != 0 &&
+         "Entry should be non-canonical if either val dropped to null");
+
+  // Otherwise, we do have an entry in it, nuke it and we're done.
+  assert(Ctx->ScopeInlinedAtIdx[std::make_pair(OldScope, OldInlinedAt)] == Idx&&
+         "Mapping out of date");
+  Ctx->ScopeInlinedAtIdx.erase(std::make_pair(OldScope, OldInlinedAt));
+  
+  // Reset this VH to null.  Drop both 'Idx' values to null to indicate that
+  // we're in non-canonical form now.
+  setValPtr(0);
+  Entry.first.Idx = Entry.second.Idx = 0;
+}
+
+void DebugRecVH::allUsesReplacedWith(Value *NewVa) {
+  // If being replaced with a non-mdnode value (e.g. undef) handle this as if
+  // the mdnode got deleted.
+  MDNode *NewVal = dyn_cast<MDNode>(NewVa);
+  if (NewVal == 0) return deleted();
+  
+  // If this is a non-canonical reference, just change it, we know it already
+  // doesn't have a map entry.
+  if (Idx == 0) {
+    setValPtr(NewVa);
+    return;
+  }
+  
+  MDNode *OldVal = get();
+  assert(OldVal != NewVa && "Node replaced with self?");
+  
+  // If the index is positive, it is an entry in ScopeRecords.
+  if (Idx > 0) {
+    assert(Ctx->ScopeRecordIdx[OldVal] == Idx && "Mapping out of date!");
+    Ctx->ScopeRecordIdx.erase(OldVal);
+    setValPtr(NewVal);
+
+    int NewEntry = Ctx->getOrAddScopeRecordIdxEntry(NewVal, Idx);
+    
+    // If NewVal already has an entry, this becomes a non-canonical reference,
+    // just drop Idx to 0 to signify this.
+    if (NewEntry != Idx)
+      Idx = 0;
+    return;
+  }
+  
+  // Otherwise, it is an entry in ScopeInlinedAtRecords, we don't know if it
+  // is the scope or the inlined-at record entry.
+  assert(unsigned(-Idx-1) < Ctx->ScopeInlinedAtRecords.size());
+  std::pair<DebugRecVH, DebugRecVH> &Entry = Ctx->ScopeInlinedAtRecords[-Idx-1];
+  assert((this == &Entry.first || this == &Entry.second) &&
+         "Mapping out of date!");
+  
+  MDNode *OldScope = Entry.first.get();
+  MDNode *OldInlinedAt = Entry.second.get();
+  assert(OldScope != 0 && OldInlinedAt != 0 &&
+         "Entry should be non-canonical if either val dropped to null");
+  
+  // Otherwise, we do have an entry in it, nuke it and we're done.
+  assert(Ctx->ScopeInlinedAtIdx[std::make_pair(OldScope, OldInlinedAt)] == Idx&&
+         "Mapping out of date");
+  Ctx->ScopeInlinedAtIdx.erase(std::make_pair(OldScope, OldInlinedAt));
+  
+  // Reset this VH to the new value.
+  setValPtr(NewVal);
+
+  int NewIdx = Ctx->getOrAddScopeInlinedAtIdxEntry(Entry.first.get(),
+                                                   Entry.second.get(), Idx);
+  // If NewVal already has an entry, this becomes a non-canonical reference,
+  // just drop Idx to 0 to signify this.
+  if (NewIdx != Idx) {
+    std::pair<DebugRecVH, DebugRecVH> &Entry=Ctx->ScopeInlinedAtRecords[-Idx-1];
+    Entry.first.Idx = Entry.second.Idx = 0;
+  }
+}
diff --git a/contrib/llvm/lib/IR/Dominators.cpp b/contrib/llvm/lib/IR/Dominators.cpp
new file mode 100644
index 000000000000..a1160cdc83b1
--- /dev/null
+++ b/contrib/llvm/lib/IR/Dominators.cpp
@@ -0,0 +1,302 @@
+//===- Dominators.cpp - Dominator Calculation -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// forward dominators.  Postdominators are available in libanalysis, but are not
+// included in libvmcore, because it's not needed.  Forward dominators are
+// needed to support the Verifier pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+// Always verify dominfo if expensive checking is enabled.
+#ifdef XDEBUG
+static bool VerifyDomInfo = true;
+#else
+static bool VerifyDomInfo = false;
+#endif
+static cl::opt<bool,true>
+VerifyDomInfoX("verify-dom-info", cl::location(VerifyDomInfo),
+               cl::desc("Verify dominator info (time consuming)"));
+
+bool BasicBlockEdge::isSingleEdge() const {
+  const TerminatorInst *TI = Start->getTerminator();
+  unsigned NumEdgesToEnd = 0;
+  for (unsigned int i = 0, n = TI->getNumSuccessors(); i < n; ++i) {
+    if (TI->getSuccessor(i) == End)
+      ++NumEdgesToEnd;
+    if (NumEdgesToEnd >= 2)
+      return false;
+  }
+  assert(NumEdgesToEnd == 1);
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//  DominatorTree Implementation
+//===----------------------------------------------------------------------===//
+//
+// Provide public access to DominatorTree information.  Implementation details
+// can be found in DominatorInternals.h.
+//
+//===----------------------------------------------------------------------===//
+
+TEMPLATE_INSTANTIATION(class llvm::DomTreeNodeBase<BasicBlock>);
+TEMPLATE_INSTANTIATION(class llvm::DominatorTreeBase<BasicBlock>);
+
+char DominatorTree::ID = 0;
+INITIALIZE_PASS(DominatorTree, "domtree",
+                "Dominator Tree Construction", true, true)
+
+bool DominatorTree::runOnFunction(Function &F) {
+  DT->recalculate(F);
+  return false;
+}
+
+void DominatorTree::verifyAnalysis() const {
+  if (!VerifyDomInfo) return;
+
+  Function &F = *getRoot()->getParent();
+
+  DominatorTree OtherDT;
+  OtherDT.getBase().recalculate(F);
+  if (compare(OtherDT)) {
+    errs() << "DominatorTree is not up to date!\nComputed:\n";
+    print(errs());
+    errs() << "\nActual:\n";
+    OtherDT.print(errs());
+    abort();
+  }
+}
+
+void DominatorTree::print(raw_ostream &OS, const Module *) const {
+  DT->print(OS);
+}
+
+// dominates - Return true if Def dominates a use in User. This performs
+// the special checks necessary if Def and User are in the same basic block.
+// Note that Def doesn't dominate a use in Def itself!
+bool DominatorTree::dominates(const Instruction *Def,
+                              const Instruction *User) const {
+  const BasicBlock *UseBB = User->getParent();
+  const BasicBlock *DefBB = Def->getParent();
+
+  // Any unreachable use is dominated, even if Def == User.
+  if (!isReachableFromEntry(UseBB))
+    return true;
+
+  // Unreachable definitions don't dominate anything.
+  if (!isReachableFromEntry(DefBB))
+    return false;
+
+  // An instruction doesn't dominate a use in itself.
+  if (Def == User)
+    return false;
+
+  // The value defined by an invoke dominates an instruction only if
+  // it dominates every instruction in UseBB.
+  // A PHI is dominated only if the instruction dominates every possible use
+  // in the UseBB.
+  if (isa<InvokeInst>(Def) || isa<PHINode>(User))
+    return dominates(Def, UseBB);
+
+  if (DefBB != UseBB)
+    return dominates(DefBB, UseBB);
+
+  // Loop through the basic block until we find Def or User.
+  BasicBlock::const_iterator I = DefBB->begin();
+  for (; &*I != Def && &*I != User; ++I)
+    /*empty*/;
+
+  return &*I == Def;
+}
+
+// true if Def would dominate a use in any instruction in UseBB.
+// note that dominates(Def, Def->getParent()) is false.
+bool DominatorTree::dominates(const Instruction *Def,
+                              const BasicBlock *UseBB) const {
+  const BasicBlock *DefBB = Def->getParent();
+
+  // Any unreachable use is dominated, even if DefBB == UseBB.
+  if (!isReachableFromEntry(UseBB))
+    return true;
+
+  // Unreachable definitions don't dominate anything.
+  if (!isReachableFromEntry(DefBB))
+    return false;
+
+  if (DefBB == UseBB)
+    return false;
+
+  const InvokeInst *II = dyn_cast<InvokeInst>(Def);
+  if (!II)
+    return dominates(DefBB, UseBB);
+
+  // Invoke results are only usable in the normal destination, not in the
+  // exceptional destination.
+  BasicBlock *NormalDest = II->getNormalDest();
+  BasicBlockEdge E(DefBB, NormalDest);
+  return dominates(E, UseBB);
+}
+
+bool DominatorTree::dominates(const BasicBlockEdge &BBE,
+                              const BasicBlock *UseBB) const {
+  // Assert that we have a single edge. We could handle them by simply
+  // returning false, but since isSingleEdge is linear on the number of
+  // edges, the callers can normally handle them more efficiently.
+  assert(BBE.isSingleEdge());
+
+  // If the BB the edge ends in doesn't dominate the use BB, then the
+  // edge also doesn't.
+  const BasicBlock *Start = BBE.getStart();
+  const BasicBlock *End = BBE.getEnd();
+  if (!dominates(End, UseBB))
+    return false;
+
+  // Simple case: if the end BB has a single predecessor, the fact that it
+  // dominates the use block implies that the edge also does.
+  if (End->getSinglePredecessor())
+    return true;
+
+  // The normal edge from the invoke is critical. Conceptually, what we would
+  // like to do is split it and check if the new block dominates the use.
+  // With X being the new block, the graph would look like:
+  //
+  //        DefBB
+  //          /\      .  .
+  //         /  \     .  .
+  //        /    \    .  .
+  //       /      \   |  |
+  //      A        X  B  C
+  //      |         \ | /
+  //      .          \|/
+  //      .      NormalDest
+  //      .
+  //
+  // Given the definition of dominance, NormalDest is dominated by X iff X
+  // dominates all of NormalDest's predecessors (X, B, C in the example). X
+  // trivially dominates itself, so we only have to find if it dominates the
+  // other predecessors. Since the only way out of X is via NormalDest, X can
+  // only properly dominate a node if NormalDest dominates that node too.
+  for (const_pred_iterator PI = pred_begin(End), E = pred_end(End);
+       PI != E; ++PI) {
+    const BasicBlock *BB = *PI;
+    if (BB == Start)
+      continue;
+
+    if (!dominates(End, BB))
+      return false;
+  }
+  return true;
+}
+
+bool DominatorTree::dominates(const BasicBlockEdge &BBE,
+                              const Use &U) const {
+  // Assert that we have a single edge. We could handle them by simply
+  // returning false, but since isSingleEdge is linear on the number of
+  // edges, the callers can normally handle them more efficiently.
+  assert(BBE.isSingleEdge());
+
+  Instruction *UserInst = cast<Instruction>(U.getUser());
+  // A PHI in the end of the edge is dominated by it.
+  PHINode *PN = dyn_cast<PHINode>(UserInst);
+  if (PN && PN->getParent() == BBE.getEnd() &&
+      PN->getIncomingBlock(U) == BBE.getStart())
+    return true;
+
+  // Otherwise use the edge-dominates-block query, which
+  // handles the crazy critical edge cases properly.
+  const BasicBlock *UseBB;
+  if (PN)
+    UseBB = PN->getIncomingBlock(U);
+  else
+    UseBB = UserInst->getParent();
+  return dominates(BBE, UseBB);
+}
+
+bool DominatorTree::dominates(const Instruction *Def,
+                              const Use &U) const {
+  Instruction *UserInst = cast<Instruction>(U.getUser());
+  const BasicBlock *DefBB = Def->getParent();
+
+  // Determine the block in which the use happens. PHI nodes use
+  // their operands on edges; simulate this by thinking of the use
+  // happening at the end of the predecessor block.
+  const BasicBlock *UseBB;
+  if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+    UseBB = PN->getIncomingBlock(U);
+  else
+    UseBB = UserInst->getParent();
+
+  // Any unreachable use is dominated, even if Def == User.
+  if (!isReachableFromEntry(UseBB))
+    return true;
+
+  // Unreachable definitions don't dominate anything.
+  if (!isReachableFromEntry(DefBB))
+    return false;
+
+  // Invoke instructions define their return values on the edges
+  // to their normal successors, so we have to handle them specially.
+  // Among other things, this means they don't dominate anything in
+  // their own block, except possibly a phi, so we don't need to
+  // walk the block in any case.
+  if (const InvokeInst *II = dyn_cast<InvokeInst>(Def)) {
+    BasicBlock *NormalDest = II->getNormalDest();
+    BasicBlockEdge E(DefBB, NormalDest);
+    return dominates(E, U);
+  }
+
+  // If the def and use are in different blocks, do a simple CFG dominator
+  // tree query.
+  if (DefBB != UseBB)
+    return dominates(DefBB, UseBB);
+
+  // Ok, def and use are in the same block. If the def is an invoke, it
+  // doesn't dominate anything in the block. If it's a PHI, it dominates
+  // everything in the block.
+  if (isa<PHINode>(UserInst))
+    return true;
+
+  // Otherwise, just loop through the basic block until we find Def or User.
+  BasicBlock::const_iterator I = DefBB->begin();
+  for (; &*I != Def && &*I != UserInst; ++I)
+    /*empty*/;
+
+  return &*I != UserInst;
+}
+
+bool DominatorTree::isReachableFromEntry(const Use &U) const {
+  Instruction *I = dyn_cast<Instruction>(U.getUser());
+
+  // ConstantExprs aren't really reachable from the entry block, but they
+  // don't need to be treated like unreachable code either.
+  if (!I) return true;
+
+  // PHI nodes use their operands on their incoming edges.
+  if (PHINode *PN = dyn_cast<PHINode>(I))
+    return isReachableFromEntry(PN->getIncomingBlock(U));
+
+  // Everything else uses their operands in their own block.
+  return isReachableFromEntry(I->getParent());
+}
diff --git a/contrib/llvm/lib/IR/Function.cpp b/contrib/llvm/lib/IR/Function.cpp
new file mode 100644
index 000000000000..1e72b90a13ce
--- /dev/null
+++ b/contrib/llvm/lib/IR/Function.cpp
@@ -0,0 +1,707 @@
+//===-- Function.cpp - Implement the Global object classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Function class for the IR library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "LLVMContextImpl.h"
+#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/RWMutex.h"
+#include "llvm/Support/StringPool.h"
+#include "llvm/Support/Threading.h"
+using namespace llvm;
+
+// Explicit instantiations of SymbolTableListTraits since some of the methods
+// are not in the public header file...
+template class llvm::SymbolTableListTraits<Argument, Function>;
+template class llvm::SymbolTableListTraits<BasicBlock, Function>;
+
+//===----------------------------------------------------------------------===//
+// Argument Implementation
+//===----------------------------------------------------------------------===//
+
+void Argument::anchor() { }
+
+Argument::Argument(Type *Ty, const Twine &Name, Function *Par)
+  : Value(Ty, Value::ArgumentVal) {
+  Parent = 0;
+
+  // Make sure that we get added to a function
+  LeakDetector::addGarbageObject(this);
+
+  if (Par)
+    Par->getArgumentList().push_back(this);
+  setName(Name);
+}
+
+void Argument::setParent(Function *parent) {
+  if (getParent())
+    LeakDetector::addGarbageObject(this);
+  Parent = parent;
+  if (getParent())
+    LeakDetector::removeGarbageObject(this);
+}
+
+/// getArgNo - Return the index of this formal argument in its containing
+/// function.  For example in "void foo(int a, float b)" a is 0 and b is 1.
+unsigned Argument::getArgNo() const {
+  const Function *F = getParent();
+  assert(F && "Argument is not in a function");
+
+  Function::const_arg_iterator AI = F->arg_begin();
+  unsigned ArgIdx = 0;
+  for (; &*AI != this; ++AI)
+    ++ArgIdx;
+
+  return ArgIdx;
+}
+
+/// hasByValAttr - Return true if this argument has the byval attribute on it
+/// in its containing function.
+bool Argument::hasByValAttr() const {
+  if (!getType()->isPointerTy()) return false;
+  return getParent()->getAttributes().
+    hasAttribute(getArgNo()+1, Attribute::ByVal);
+}
+
+unsigned Argument::getParamAlignment() const {
+  assert(getType()->isPointerTy() && "Only pointers have alignments");
+  return getParent()->getParamAlignment(getArgNo()+1);
+
+}
+
+/// hasNestAttr - Return true if this argument has the nest attribute on
+/// it in its containing function.
+bool Argument::hasNestAttr() const {
+  if (!getType()->isPointerTy()) return false;
+  return getParent()->getAttributes().
+    hasAttribute(getArgNo()+1, Attribute::Nest);
+}
+
+/// hasNoAliasAttr - Return true if this argument has the noalias attribute on
+/// it in its containing function.
+bool Argument::hasNoAliasAttr() const {
+  if (!getType()->isPointerTy()) return false;
+  return getParent()->getAttributes().
+    hasAttribute(getArgNo()+1, Attribute::NoAlias);
+}
+
+/// hasNoCaptureAttr - Return true if this argument has the nocapture attribute
+/// on it in its containing function.
+bool Argument::hasNoCaptureAttr() const {
+  if (!getType()->isPointerTy()) return false;
+  return getParent()->getAttributes().
+    hasAttribute(getArgNo()+1, Attribute::NoCapture);
+}
+
+/// hasSRetAttr - Return true if this argument has the sret attribute on
+/// it in its containing function.
+bool Argument::hasStructRetAttr() const {
+  if (!getType()->isPointerTy()) return false;
+  if (this != getParent()->arg_begin())
+    return false; // StructRet param must be first param
+  return getParent()->getAttributes().
+    hasAttribute(1, Attribute::StructRet);
+}
+
+/// addAttr - Add attributes to an argument.
+void Argument::addAttr(AttributeSet AS) {
+  assert(AS.getNumSlots() <= 1 &&
+         "Trying to add more than one attribute set to an argument!");
+  AttrBuilder B(AS, AS.getSlotIndex(0));
+  getParent()->addAttributes(getArgNo() + 1,
+                             AttributeSet::get(Parent->getContext(),
+                                               getArgNo() + 1, B));
+}
+
+/// removeAttr - Remove attributes from an argument.
+void Argument::removeAttr(AttributeSet AS) {
+  assert(AS.getNumSlots() <= 1 &&
+         "Trying to remove more than one attribute set from an argument!");
+  AttrBuilder B(AS, AS.getSlotIndex(0));
+  getParent()->removeAttributes(getArgNo() + 1,
+                                AttributeSet::get(Parent->getContext(),
+                                                  getArgNo() + 1, B));
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods in Function
+//===----------------------------------------------------------------------===//
+
+LLVMContext &Function::getContext() const {
+  return getType()->getContext();
+}
+
+FunctionType *Function::getFunctionType() const {
+  return cast<FunctionType>(getType()->getElementType());
+}
+
+bool Function::isVarArg() const {
+  return getFunctionType()->isVarArg();
+}
+
+Type *Function::getReturnType() const {
+  return getFunctionType()->getReturnType();
+}
+
+void Function::removeFromParent() {
+  getParent()->getFunctionList().remove(this);
+}
+
+void Function::eraseFromParent() {
+  getParent()->getFunctionList().erase(this);
+}
+
+//===----------------------------------------------------------------------===//
+// Function Implementation
+//===----------------------------------------------------------------------===//
+
+Function::Function(FunctionType *Ty, LinkageTypes Linkage,
+                   const Twine &name, Module *ParentModule)
+  : GlobalValue(PointerType::getUnqual(Ty),
+                Value::FunctionVal, 0, 0, Linkage, name) {
+  assert(FunctionType::isValidReturnType(getReturnType()) &&
+         "invalid return type");
+  SymTab = new ValueSymbolTable();
+
+  // If the function has arguments, mark them as lazily built.
+  if (Ty->getNumParams())
+    setValueSubclassData(1);   // Set the "has lazy arguments" bit.
+
+  // Make sure that we get added to a function
+  LeakDetector::addGarbageObject(this);
+
+  if (ParentModule)
+    ParentModule->getFunctionList().push_back(this);
+
+  // Ensure intrinsics have the right parameter attributes.
+  if (unsigned IID = getIntrinsicID())
+    setAttributes(Intrinsic::getAttributes(getContext(), Intrinsic::ID(IID)));
+
+}
+
+Function::~Function() {
+  dropAllReferences();    // After this it is safe to delete instructions.
+
+  // Delete all of the method arguments and unlink from symbol table...
+  ArgumentList.clear();
+  delete SymTab;
+
+  // Remove the function from the on-the-side GC table.
+  clearGC();
+
+  // Remove the intrinsicID from the Cache.
+  if (getValueName() && isIntrinsic())
+    getContext().pImpl->IntrinsicIDCache.erase(this);
+}
+
+void Function::BuildLazyArguments() const {
+  // Create the arguments vector, all arguments start out unnamed.
+  FunctionType *FT = getFunctionType();
+  for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
+    assert(!FT->getParamType(i)->isVoidTy() &&
+           "Cannot have void typed arguments!");
+    ArgumentList.push_back(new Argument(FT->getParamType(i)));
+  }
+
+  // Clear the lazy arguments bit.
+  unsigned SDC = getSubclassDataFromValue();
+  const_cast<Function*>(this)->setValueSubclassData(SDC &= ~1);
+}
+
+size_t Function::arg_size() const {
+  return getFunctionType()->getNumParams();
+}
+bool Function::arg_empty() const {
+  return getFunctionType()->getNumParams() == 0;
+}
+
+void Function::setParent(Module *parent) {
+  if (getParent())
+    LeakDetector::addGarbageObject(this);
+  Parent = parent;
+  if (getParent())
+    LeakDetector::removeGarbageObject(this);
+}
+
+// dropAllReferences() - This function causes all the subinstructions to "let
+// go" of all references that they are maintaining.  This allows one to
+// 'delete' a whole class at a time, even though there may be circular
+// references... first all references are dropped, and all use counts go to
+// zero.  Then everything is deleted for real.  Note that no operations are
+// valid on an object that has "dropped all references", except operator
+// delete.
+//
+void Function::dropAllReferences() {
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    I->dropAllReferences();
+
+  // Delete all basic blocks. They are now unused, except possibly by
+  // blockaddresses, but BasicBlock's destructor takes care of those.
+  while (!BasicBlocks.empty())
+    BasicBlocks.begin()->eraseFromParent();
+}
+
+void Function::addAttribute(unsigned i, Attribute::AttrKind attr) {
+  AttributeSet PAL = getAttributes();
+  PAL = PAL.addAttribute(getContext(), i, attr);
+  setAttributes(PAL);
+}
+
+void Function::addAttributes(unsigned i, AttributeSet attrs) {
+  AttributeSet PAL = getAttributes();
+  PAL = PAL.addAttributes(getContext(), i, attrs);
+  setAttributes(PAL);
+}
+
+void Function::removeAttributes(unsigned i, AttributeSet attrs) {
+  AttributeSet PAL = getAttributes();
+  PAL = PAL.removeAttributes(getContext(), i, attrs);
+  setAttributes(PAL);
+}
+
+// Maintain the GC name for each function in an on-the-side table. This saves
+// allocating an additional word in Function for programs which do not use GC
+// (i.e., most programs) at the cost of increased overhead for clients which do
+// use GC.
+static DenseMap<const Function*,PooledStringPtr> *GCNames;
+static StringPool *GCNamePool;
+static ManagedStatic<sys::SmartRWMutex<true> > GCLock;
+
+bool Function::hasGC() const {
+  sys::SmartScopedReader<true> Reader(*GCLock);
+  return GCNames && GCNames->count(this);
+}
+
+const char *Function::getGC() const {
+  assert(hasGC() && "Function has no collector");
+  sys::SmartScopedReader<true> Reader(*GCLock);
+  return *(*GCNames)[this];
+}
+
+void Function::setGC(const char *Str) {
+  sys::SmartScopedWriter<true> Writer(*GCLock);
+  if (!GCNamePool)
+    GCNamePool = new StringPool();
+  if (!GCNames)
+    GCNames = new DenseMap<const Function*,PooledStringPtr>();
+  (*GCNames)[this] = GCNamePool->intern(Str);
+}
+
+void Function::clearGC() {
+  sys::SmartScopedWriter<true> Writer(*GCLock);
+  if (GCNames) {
+    GCNames->erase(this);
+    if (GCNames->empty()) {
+      delete GCNames;
+      GCNames = 0;
+      if (GCNamePool->empty()) {
+        delete GCNamePool;
+        GCNamePool = 0;
+      }
+    }
+  }
+}
+
+/// copyAttributesFrom - copy all additional attributes (those not needed to
+/// create a Function) from the Function Src to this one.
+void Function::copyAttributesFrom(const GlobalValue *Src) {
+  assert(isa<Function>(Src) && "Expected a Function!");
+  GlobalValue::copyAttributesFrom(Src);
+  const Function *SrcF = cast<Function>(Src);
+  setCallingConv(SrcF->getCallingConv());
+  setAttributes(SrcF->getAttributes());
+  if (SrcF->hasGC())
+    setGC(SrcF->getGC());
+  else
+    clearGC();
+}
+
+/// getIntrinsicID - This method returns the ID number of the specified
+/// function, or Intrinsic::not_intrinsic if the function is not an
+/// intrinsic, or if the pointer is null.  This value is always defined to be
+/// zero to allow easy checking for whether a function is intrinsic or not.  The
+/// particular intrinsic functions which correspond to this value are defined in
+/// llvm/Intrinsics.h.  Results are cached in the LLVM context, subsequent
+/// requests for the same ID return results much faster from the cache.
+///
+unsigned Function::getIntrinsicID() const {
+  const ValueName *ValName = this->getValueName();
+  if (!ValName || !isIntrinsic())
+    return 0;
+
+  LLVMContextImpl::IntrinsicIDCacheTy &IntrinsicIDCache =
+    getContext().pImpl->IntrinsicIDCache;
+  if (!IntrinsicIDCache.count(this)) {
+    unsigned Id = lookupIntrinsicID();
+    IntrinsicIDCache[this]=Id;
+    return Id;
+  }
+  return IntrinsicIDCache[this];
+}
+
+/// This private method does the actual lookup of an intrinsic ID when the query
+/// could not be answered from the cache.
+unsigned Function::lookupIntrinsicID() const {
+  const ValueName *ValName = this->getValueName();
+  unsigned Len = ValName->getKeyLength();
+  const char *Name = ValName->getKeyData();
+
+#define GET_FUNCTION_RECOGNIZER
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_FUNCTION_RECOGNIZER
+
+  return 0;
+}
+
+std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
+  assert(id < num_intrinsics && "Invalid intrinsic ID!");
+  static const char * const Table[] = {
+    "not_intrinsic",
+#define GET_INTRINSIC_NAME_TABLE
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_INTRINSIC_NAME_TABLE
+  };
+  if (Tys.empty())
+    return Table[id];
+  std::string Result(Table[id]);
+  for (unsigned i = 0; i < Tys.size(); ++i) {
+    if (PointerType* PTyp = dyn_cast<PointerType>(Tys[i])) {
+      Result += ".p" + llvm::utostr(PTyp->getAddressSpace()) +
+                EVT::getEVT(PTyp->getElementType()).getEVTString();
+    }
+    else if (Tys[i])
+      Result += "." + EVT::getEVT(Tys[i]).getEVTString();
+  }
+  return Result;
+}
+
+
+/// IIT_Info - These are enumerators that describe the entries returned by the
+/// getIntrinsicInfoTableEntries function.
+///
+/// NOTE: This must be kept in synch with the copy in TblGen/IntrinsicEmitter!
+enum IIT_Info {
+  // Common values should be encoded with 0-15.
+  IIT_Done = 0,
+  IIT_I1   = 1,
+  IIT_I8   = 2,
+  IIT_I16  = 3,
+  IIT_I32  = 4,
+  IIT_I64  = 5,
+  IIT_F16  = 6,
+  IIT_F32  = 7,
+  IIT_F64  = 8,
+  IIT_V2   = 9,
+  IIT_V4   = 10,
+  IIT_V8   = 11,
+  IIT_V16  = 12,
+  IIT_V32  = 13,
+  IIT_PTR  = 14,
+  IIT_ARG  = 15,
+
+  // Values from 16+ are only encodable with the inefficient encoding.
+  IIT_MMX  = 16,
+  IIT_METADATA = 17,
+  IIT_EMPTYSTRUCT = 18,
+  IIT_STRUCT2 = 19,
+  IIT_STRUCT3 = 20,
+  IIT_STRUCT4 = 21,
+  IIT_STRUCT5 = 22,
+  IIT_EXTEND_VEC_ARG = 23,
+  IIT_TRUNC_VEC_ARG = 24,
+  IIT_ANYPTR = 25
+};
+
+
+static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
+                      SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) {
+  IIT_Info Info = IIT_Info(Infos[NextElt++]);
+  unsigned StructElts = 2;
+  using namespace Intrinsic;
+
+  switch (Info) {
+  case IIT_Done:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Void, 0));
+    return;
+  case IIT_MMX:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::MMX, 0));
+    return;
+  case IIT_METADATA:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Metadata, 0));
+    return;
+  case IIT_F16:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Half, 0));
+    return;
+  case IIT_F32:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Float, 0));
+    return;
+  case IIT_F64:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Double, 0));
+    return;
+  case IIT_I1:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 1));
+    return;
+  case IIT_I8:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 8));
+    return;
+  case IIT_I16:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer,16));
+    return;
+  case IIT_I32:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 32));
+    return;
+  case IIT_I64:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 64));
+    return;
+  case IIT_V2:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 2));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
+  case IIT_V4:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 4));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
+  case IIT_V8:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 8));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
+  case IIT_V16:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 16));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
+  case IIT_V32:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 32));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
+  case IIT_PTR:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
+  case IIT_ANYPTR: {  // [ANYPTR addrspace, subtype]
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer,
+                                             Infos[NextElt++]));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
+  }
+  case IIT_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Argument, ArgInfo));
+    return;
+  }
+  case IIT_EXTEND_VEC_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::ExtendVecArgument,
+                                             ArgInfo));
+    return;
+  }
+  case IIT_TRUNC_VEC_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::TruncVecArgument,
+                                             ArgInfo));
+    return;
+  }
+  case IIT_EMPTYSTRUCT:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0));
+    return;
+  case IIT_STRUCT5: ++StructElts; // FALL THROUGH.
+  case IIT_STRUCT4: ++StructElts; // FALL THROUGH.
+  case IIT_STRUCT3: ++StructElts; // FALL THROUGH.
+  case IIT_STRUCT2: {
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct,StructElts));
+
+    for (unsigned i = 0; i != StructElts; ++i)
+      DecodeIITType(NextElt, Infos, OutputTable);
+    return;
+  }
+  }
+  llvm_unreachable("unhandled");
+}
+
+
+#define GET_INTRINSIC_GENERATOR_GLOBAL
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_INTRINSIC_GENERATOR_GLOBAL
+
+void Intrinsic::getIntrinsicInfoTableEntries(ID id,
+                                             SmallVectorImpl<IITDescriptor> &T){
+  // Check to see if the intrinsic's type was expressible by the table.
+  unsigned TableVal = IIT_Table[id-1];
+
+  // Decode the TableVal into an array of IITValues.
+  SmallVector<unsigned char, 8> IITValues;
+  ArrayRef<unsigned char> IITEntries;
+  unsigned NextElt = 0;
+  if ((TableVal >> 31) != 0) {
+    // This is an offset into the IIT_LongEncodingTable.
+    IITEntries = IIT_LongEncodingTable;
+
+    // Strip sentinel bit.
+    NextElt = (TableVal << 1) >> 1;
+  } else {
+    // Decode the TableVal into an array of IITValues.  If the entry was encoded
+    // into a single word in the table itself, decode it now.
+    do {
+      IITValues.push_back(TableVal & 0xF);
+      TableVal >>= 4;
+    } while (TableVal);
+
+    IITEntries = IITValues;
+    NextElt = 0;
+  }
+
+  // Okay, decode the table into the output vector of IITDescriptors.
+  DecodeIITType(NextElt, IITEntries, T);
+  while (NextElt != IITEntries.size() && IITEntries[NextElt] != 0)
+    DecodeIITType(NextElt, IITEntries, T);
+}
+
+
+static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
+                             ArrayRef<Type*> Tys, LLVMContext &Context) {
+  using namespace Intrinsic;
+  IITDescriptor D = Infos.front();
+  Infos = Infos.slice(1);
+
+  switch (D.Kind) {
+  case IITDescriptor::Void: return Type::getVoidTy(Context);
+  case IITDescriptor::MMX: return Type::getX86_MMXTy(Context);
+  case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
+  case IITDescriptor::Half: return Type::getHalfTy(Context);
+  case IITDescriptor::Float: return Type::getFloatTy(Context);
+  case IITDescriptor::Double: return Type::getDoubleTy(Context);
+
+  case IITDescriptor::Integer:
+    return IntegerType::get(Context, D.Integer_Width);
+  case IITDescriptor::Vector:
+    return VectorType::get(DecodeFixedType(Infos, Tys, Context),D.Vector_Width);
+  case IITDescriptor::Pointer:
+    return PointerType::get(DecodeFixedType(Infos, Tys, Context),
+                            D.Pointer_AddressSpace);
+  case IITDescriptor::Struct: {
+    Type *Elts[5];
+    assert(D.Struct_NumElements <= 5 && "Can't handle this yet");
+    for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i)
+      Elts[i] = DecodeFixedType(Infos, Tys, Context);
+    return StructType::get(Context, ArrayRef<Type*>(Elts,D.Struct_NumElements));
+  }
+
+  case IITDescriptor::Argument:
+    return Tys[D.getArgumentNumber()];
+  case IITDescriptor::ExtendVecArgument:
+    return VectorType::getExtendedElementVectorType(cast<VectorType>(
+                                                  Tys[D.getArgumentNumber()]));
+
+  case IITDescriptor::TruncVecArgument:
+    return VectorType::getTruncatedElementVectorType(cast<VectorType>(
+                                                  Tys[D.getArgumentNumber()]));
+  }
+  llvm_unreachable("unhandled");
+}
+
+
+
+FunctionType *Intrinsic::getType(LLVMContext &Context,
+                                 ID id, ArrayRef<Type*> Tys) {
+  SmallVector<IITDescriptor, 8> Table;
+  getIntrinsicInfoTableEntries(id, Table);
+
+  ArrayRef<IITDescriptor> TableRef = Table;
+  Type *ResultTy = DecodeFixedType(TableRef, Tys, Context);
+
+  SmallVector<Type*, 8> ArgTys;
+  while (!TableRef.empty())
+    ArgTys.push_back(DecodeFixedType(TableRef, Tys, Context));
+
+  return FunctionType::get(ResultTy, ArgTys, false);
+}
+
+bool Intrinsic::isOverloaded(ID id) {
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+}
+
+/// This defines the "Intrinsic::getAttributes(ID id)" method.
+#define GET_INTRINSIC_ATTRIBUTES
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_INTRINSIC_ATTRIBUTES
+
+Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef<Type*> Tys) {
+  // There can never be multiple globals with the same name of different types,
+  // because intrinsics must be a specific type.
+  return
+    cast<Function>(M->getOrInsertFunction(getName(id, Tys),
+                                          getType(M->getContext(), id, Tys)));
+}
+
+// This defines the "Intrinsic::getIntrinsicForGCCBuiltin()" method.
+#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+#include "llvm/IR/Intrinsics.gen"
+#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+
+/// hasAddressTaken - returns true if there are any uses of this function
+/// other than direct calls or invokes to it.
+bool Function::hasAddressTaken(const User* *PutOffender) const {
+  for (Value::const_use_iterator I = use_begin(), E = use_end(); I != E; ++I) {
+    const User *U = *I;
+    if (isa<BlockAddress>(U))
+      continue;
+    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
+      return PutOffender ? (*PutOffender = U, true) : true;
+    ImmutableCallSite CS(cast<Instruction>(U));
+    if (!CS.isCallee(I))
+      return PutOffender ? (*PutOffender = U, true) : true;
+  }
+  return false;
+}
+
+bool Function::isDefTriviallyDead() const {
+  // Check the linkage
+  if (!hasLinkOnceLinkage() && !hasLocalLinkage() &&
+      !hasAvailableExternallyLinkage())
+    return false;
+
+  // Check if the function is used by anything other than a blockaddress.
+  for (Value::const_use_iterator I = use_begin(), E = use_end(); I != E; ++I)
+    if (!isa<BlockAddress>(*I))
+      return false;
+
+  return true;
+}
+
+/// callsFunctionThatReturnsTwice - Return true if the function has a call to
+/// setjmp or other function that gcc recognizes as "returning twice".
+bool Function::callsFunctionThatReturnsTwice() const {
+  for (const_inst_iterator
+         I = inst_begin(this), E = inst_end(this); I != E; ++I) {
+    const CallInst* callInst = dyn_cast<CallInst>(&*I);
+    if (!callInst)
+      continue;
+    if (callInst->canReturnTwice())
+      return true;
+  }
+
+  return false;
+}
+
diff --git a/contrib/llvm/lib/IR/GCOV.cpp b/contrib/llvm/lib/IR/GCOV.cpp
new file mode 100644
index 000000000000..ea2f0a6d556f
--- /dev/null
+++ b/contrib/llvm/lib/IR/GCOV.cpp
@@ -0,0 +1,283 @@
+//===- GCOVr.cpp - LLVM coverage tool -------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// GCOV implements the interface to read and write coverage files that use 
+// 'gcov' format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/GCOV.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/system_error.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// GCOVFile implementation.
+
+/// ~GCOVFile - Delete GCOVFile and its content.
+GCOVFile::~GCOVFile() {
+  DeleteContainerPointers(Functions);
+}
+
+/// isGCDAFile - Return true if Format identifies a .gcda file.
+static bool isGCDAFile(GCOV::GCOVFormat Format) {
+  return Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404;
+}
+
+/// isGCNOFile - Return true if Format identifies a .gcno file.
+static bool isGCNOFile(GCOV::GCOVFormat Format) {
+  return Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404;
+}
+
+/// read - Read GCOV buffer.
+bool GCOVFile::read(GCOVBuffer &Buffer) {
+  GCOV::GCOVFormat Format = Buffer.readGCOVFormat();
+  if (Format == GCOV::InvalidGCOV)
+    return false;
+
+  unsigned i = 0;
+  while (1) {
+    GCOVFunction *GFun = NULL;
+    if (isGCDAFile(Format)) {
+      // Use existing function while reading .gcda file.
+      assert(i < Functions.size() && ".gcda data does not match .gcno data");
+      GFun = Functions[i];
+    } else if (isGCNOFile(Format)){
+      GFun = new GCOVFunction();
+      Functions.push_back(GFun);
+    }
+    if (!GFun || !GFun->read(Buffer, Format))
+      break;
+    ++i;
+  }
+  return true;
+}
+
+/// dump - Dump GCOVFile content on standard out for debugging purposes.
+void GCOVFile::dump() {
+  for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(),
+         E = Functions.end(); I != E; ++I)
+    (*I)->dump();
+}
+
+/// collectLineCounts - Collect line counts. This must be used after
+/// reading .gcno and .gcda files.
+void GCOVFile::collectLineCounts(FileInfo &FI) {
+  for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(),
+         E = Functions.end(); I != E; ++I) 
+    (*I)->collectLineCounts(FI);
+  FI.print();
+}
+
+//===----------------------------------------------------------------------===//
+// GCOVFunction implementation.
+
+/// ~GCOVFunction - Delete GCOVFunction and its content.
+GCOVFunction::~GCOVFunction() {
+  DeleteContainerPointers(Blocks);
+}
+
+/// read - Read a aunction from the buffer. Return false if buffer cursor
+/// does not point to a function tag.
+bool GCOVFunction::read(GCOVBuffer &Buff, GCOV::GCOVFormat Format) {
+  if (!Buff.readFunctionTag())
+    return false;
+
+  Buff.readInt(); // Function header length
+  Ident = Buff.readInt(); 
+  Buff.readInt(); // Checksum #1
+  if (Format != GCOV::GCNO_402)
+    Buff.readInt(); // Checksum #2
+
+  Name = Buff.readString();
+  if (Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404)
+    Filename = Buff.readString();
+
+  if (Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404) {
+    Buff.readArcTag();
+    uint32_t Count = Buff.readInt() / 2;
+    for (unsigned i = 0, e = Count; i != e; ++i) {
+      Blocks[i]->addCount(Buff.readInt64());
+    }
+    return true;
+  }
+
+  LineNumber = Buff.readInt();
+
+  // read blocks.
+  bool BlockTagFound = Buff.readBlockTag();
+  (void)BlockTagFound;
+  assert(BlockTagFound && "Block Tag not found!");
+  uint32_t BlockCount = Buff.readInt();
+  for (int i = 0, e = BlockCount; i != e; ++i) {
+    Buff.readInt(); // Block flags;
+    Blocks.push_back(new GCOVBlock(i));
+  }
+
+  // read edges.
+  while (Buff.readEdgeTag()) {
+    uint32_t EdgeCount = (Buff.readInt() - 1) / 2;
+    uint32_t BlockNo = Buff.readInt();
+    assert(BlockNo < BlockCount && "Unexpected Block number!");
+    for (int i = 0, e = EdgeCount; i != e; ++i) {
+      Blocks[BlockNo]->addEdge(Buff.readInt());
+      Buff.readInt(); // Edge flag
+    }
+  }
+
+  // read line table.
+  while (Buff.readLineTag()) {
+    uint32_t LineTableLength = Buff.readInt();
+    uint32_t Size = Buff.getCursor() + LineTableLength*4;
+    uint32_t BlockNo = Buff.readInt();
+    assert(BlockNo < BlockCount && "Unexpected Block number!");
+    GCOVBlock *Block = Blocks[BlockNo];
+    Buff.readInt(); // flag
+    while (Buff.getCursor() != (Size - 4)) {
+      StringRef Filename = Buff.readString();
+      if (Buff.getCursor() == (Size - 4)) break;
+      while (uint32_t L = Buff.readInt())
+        Block->addLine(Filename, L);
+    }
+    Buff.readInt(); // flag
+  }
+  return true;
+}
+
+/// dump - Dump GCOVFunction content on standard out for debugging purposes.
+void GCOVFunction::dump() {
+  outs() <<  "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
+  for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(),
+         E = Blocks.end(); I != E; ++I)
+    (*I)->dump();
+}
+
+/// collectLineCounts - Collect line counts. This must be used after
+/// reading .gcno and .gcda files.
+void GCOVFunction::collectLineCounts(FileInfo &FI) {
+  for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(),
+         E = Blocks.end(); I != E; ++I)
+    (*I)->collectLineCounts(FI);
+}
+
+//===----------------------------------------------------------------------===//
+// GCOVBlock implementation.
+
+/// ~GCOVBlock - Delete GCOVBlock and its content.
+GCOVBlock::~GCOVBlock() {
+  Edges.clear();
+  DeleteContainerSeconds(Lines);
+}
+
+void GCOVBlock::addLine(StringRef Filename, uint32_t LineNo) {
+  GCOVLines *&LinesForFile = Lines[Filename];
+  if (!LinesForFile)
+    LinesForFile = new GCOVLines();
+  LinesForFile->add(LineNo);
+}
+
+/// collectLineCounts - Collect line counts. This must be used after
+/// reading .gcno and .gcda files.
+void GCOVBlock::collectLineCounts(FileInfo &FI) {
+  for (StringMap<GCOVLines *>::iterator I = Lines.begin(),
+         E = Lines.end(); I != E; ++I)
+    I->second->collectLineCounts(FI, I->first(), Counter);
+}
+
+/// dump - Dump GCOVBlock content on standard out for debugging purposes.
+void GCOVBlock::dump() {
+  outs() << "Block : " << Number << " Counter : " << Counter << "\n";
+  if (!Edges.empty()) {
+    outs() << "\tEdges : ";
+    for (SmallVector<uint32_t, 16>::iterator I = Edges.begin(), E = Edges.end();
+         I != E; ++I)
+      outs() << (*I) << ",";
+    outs() << "\n";
+  }
+  if (!Lines.empty()) {
+    outs() << "\tLines : ";
+    for (StringMap<GCOVLines *>::iterator LI = Lines.begin(),
+           LE = Lines.end(); LI != LE; ++LI) {
+      outs() << LI->first() << " -> ";
+      LI->second->dump();
+      outs() << "\n";
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// GCOVLines implementation.
+
+/// collectLineCounts - Collect line counts. This must be used after
+/// reading .gcno and .gcda files.
+void GCOVLines::collectLineCounts(FileInfo &FI, StringRef Filename, 
+                                  uint32_t Count) {
+  for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(),
+         E = Lines.end(); I != E; ++I)
+    FI.addLineCount(Filename, *I, Count);
+}
+
+/// dump - Dump GCOVLines content on standard out for debugging purposes.
+void GCOVLines::dump() {
+  for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(),
+         E = Lines.end(); I != E; ++I)
+    outs() << (*I) << ",";
+}
+
+//===----------------------------------------------------------------------===//
+// FileInfo implementation.
+
+/// addLineCount - Add line count for the given line number in a file.
+void FileInfo::addLineCount(StringRef Filename, uint32_t Line, uint32_t Count) {
+  if (LineInfo.find(Filename) == LineInfo.end()) {
+    OwningPtr<MemoryBuffer> Buff;
+    if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
+      errs() << Filename << ": " << ec.message() << "\n";
+      return;
+    }
+    StringRef AllLines = Buff.take()->getBuffer();
+    LineCounts L(AllLines.count('\n')+2);
+    L[Line-1] = Count;
+    LineInfo[Filename] = L;
+    return;
+  }
+  LineCounts &L = LineInfo[Filename];
+  L[Line-1] = Count;
+}
+
+/// print -  Print source files with collected line count information.
+void FileInfo::print() {
+  for (StringMap<LineCounts>::iterator I = LineInfo.begin(), E = LineInfo.end();
+       I != E; ++I) {
+    StringRef Filename = I->first();
+    outs() << Filename << "\n";
+    LineCounts &L = LineInfo[Filename];
+    OwningPtr<MemoryBuffer> Buff;
+    if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
+      errs() << Filename << ": " << ec.message() << "\n";
+      return;
+    }
+    StringRef AllLines = Buff.take()->getBuffer();
+    for (unsigned i = 0, e = L.size(); i != e; ++i) {
+      if (L[i])
+        outs() << L[i] << ":\t";
+      else
+        outs() << " :\t";
+      std::pair<StringRef, StringRef> P = AllLines.split('\n');
+      if (AllLines != P.first)
+        outs() << P.first;
+      outs() << "\n";
+      AllLines = P.second;
+    }
+  }
+}
+
+
diff --git a/contrib/llvm/lib/IR/GVMaterializer.cpp b/contrib/llvm/lib/IR/GVMaterializer.cpp
new file mode 100644
index 000000000000..f77a9c908d54
--- /dev/null
+++ b/contrib/llvm/lib/IR/GVMaterializer.cpp
@@ -0,0 +1,18 @@
+//===-- GVMaterializer.cpp - Base implementation for GV materializers -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Minimal implementation of the abstract interface for materializing
+// GlobalValues.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/GVMaterializer.h"
+using namespace llvm;
+
+GVMaterializer::~GVMaterializer() {}
diff --git a/contrib/llvm/lib/IR/Globals.cpp b/contrib/llvm/lib/IR/Globals.cpp
new file mode 100644
index 000000000000..6d547f3edf3f
--- /dev/null
+++ b/contrib/llvm/lib/IR/Globals.cpp
@@ -0,0 +1,269 @@
+//===-- Globals.cpp - Implement the GlobalValue & GlobalVariable class ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GlobalValue & GlobalVariable classes for the IR
+// library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LeakDetector.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                            GlobalValue Class
+//===----------------------------------------------------------------------===//
+
+bool GlobalValue::isMaterializable() const {
+  return getParent() && getParent()->isMaterializable(this);
+}
+bool GlobalValue::isDematerializable() const {
+  return getParent() && getParent()->isDematerializable(this);
+}
+bool GlobalValue::Materialize(std::string *ErrInfo) {
+  return getParent()->Materialize(this, ErrInfo);
+}
+void GlobalValue::Dematerialize() {
+  getParent()->Dematerialize(this);
+}
+
+/// Override destroyConstant to make sure it doesn't get called on
+/// GlobalValue's because they shouldn't be treated like other constants.
+void GlobalValue::destroyConstant() {
+  llvm_unreachable("You can't GV->destroyConstant()!");
+}
+
+/// copyAttributesFrom - copy all additional attributes (those not needed to
+/// create a GlobalValue) from the GlobalValue Src to this one.
+void GlobalValue::copyAttributesFrom(const GlobalValue *Src) {
+  setAlignment(Src->getAlignment());
+  setSection(Src->getSection());
+  setVisibility(Src->getVisibility());
+  setUnnamedAddr(Src->hasUnnamedAddr());
+}
+
+void GlobalValue::setAlignment(unsigned Align) {
+  assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
+  assert(Align <= MaximumAlignment &&
+         "Alignment is greater than MaximumAlignment!");
+  Alignment = Log2_32(Align) + 1;
+  assert(getAlignment() == Align && "Alignment representation error!");
+}
+
+bool GlobalValue::isDeclaration() const {
+  // Globals are definitions if they have an initializer.
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(this))
+    return GV->getNumOperands() == 0;
+
+  // Functions are definitions if they have a body.
+  if (const Function *F = dyn_cast<Function>(this))
+    return F->empty();
+
+  // Aliases are always definitions.
+  assert(isa<GlobalAlias>(this));
+  return false;
+}
+  
+//===----------------------------------------------------------------------===//
+// GlobalVariable Implementation
+//===----------------------------------------------------------------------===//
+
+GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
+                               Constant *InitVal,
+                               const Twine &Name, ThreadLocalMode TLMode,
+                               unsigned AddressSpace,
+                               bool isExternallyInitialized)
+  : GlobalValue(PointerType::get(Ty, AddressSpace),
+                Value::GlobalVariableVal,
+                OperandTraits<GlobalVariable>::op_begin(this),
+                InitVal != 0, Link, Name),
+    isConstantGlobal(constant), threadLocalMode(TLMode),
+    isExternallyInitializedConstant(isExternallyInitialized) {
+  if (InitVal) {
+    assert(InitVal->getType() == Ty &&
+           "Initializer should be the same type as the GlobalVariable!");
+    Op<0>() = InitVal;
+  }
+
+  LeakDetector::addGarbageObject(this);
+}
+
+GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant,
+                               LinkageTypes Link, Constant *InitVal,
+                               const Twine &Name,
+                               GlobalVariable *Before, ThreadLocalMode TLMode,
+                               unsigned AddressSpace,
+                               bool isExternallyInitialized)
+  : GlobalValue(PointerType::get(Ty, AddressSpace),
+                Value::GlobalVariableVal,
+                OperandTraits<GlobalVariable>::op_begin(this),
+                InitVal != 0, Link, Name),
+    isConstantGlobal(constant), threadLocalMode(TLMode),
+    isExternallyInitializedConstant(isExternallyInitialized) {
+  if (InitVal) {
+    assert(InitVal->getType() == Ty &&
+           "Initializer should be the same type as the GlobalVariable!");
+    Op<0>() = InitVal;
+  }
+  
+  LeakDetector::addGarbageObject(this);
+  
+  if (Before)
+    Before->getParent()->getGlobalList().insert(Before, this);
+  else
+    M.getGlobalList().push_back(this);
+}
+
+void GlobalVariable::setParent(Module *parent) {
+  if (getParent())
+    LeakDetector::addGarbageObject(this);
+  Parent = parent;
+  if (getParent())
+    LeakDetector::removeGarbageObject(this);
+}
+
+void GlobalVariable::removeFromParent() {
+  getParent()->getGlobalList().remove(this);
+}
+
+void GlobalVariable::eraseFromParent() {
+  getParent()->getGlobalList().erase(this);
+}
+
+void GlobalVariable::replaceUsesOfWithOnConstant(Value *From, Value *To,
+                                                 Use *U) {
+  // If you call this, then you better know this GVar has a constant
+  // initializer worth replacing. Enforce that here.
+  assert(getNumOperands() == 1 &&
+         "Attempt to replace uses of Constants on a GVar with no initializer");
+
+  // And, since you know it has an initializer, the From value better be
+  // the initializer :)
+  assert(getOperand(0) == From &&
+         "Attempt to replace wrong constant initializer in GVar");
+
+  // And, you better have a constant for the replacement value
+  assert(isa<Constant>(To) &&
+         "Attempt to replace GVar initializer with non-constant");
+
+  // Okay, preconditions out of the way, replace the constant initializer.
+  this->setOperand(0, cast<Constant>(To));
+}
+
+void GlobalVariable::setInitializer(Constant *InitVal) {
+  if (InitVal == 0) {
+    if (hasInitializer()) {
+      Op<0>().set(0);
+      NumOperands = 0;
+    }
+  } else {
+    assert(InitVal->getType() == getType()->getElementType() &&
+           "Initializer type must match GlobalVariable type");
+    if (!hasInitializer())
+      NumOperands = 1;
+    Op<0>().set(InitVal);
+  }
+}
+
+/// copyAttributesFrom - copy all additional attributes (those not needed to
+/// create a GlobalVariable) from the GlobalVariable Src to this one.
+void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
+  assert(isa<GlobalVariable>(Src) && "Expected a GlobalVariable!");
+  GlobalValue::copyAttributesFrom(Src);
+  const GlobalVariable *SrcVar = cast<GlobalVariable>(Src);
+  setThreadLocal(SrcVar->isThreadLocal());
+}
+
+
+//===----------------------------------------------------------------------===//
+// GlobalAlias Implementation
+//===----------------------------------------------------------------------===//
+
+GlobalAlias::GlobalAlias(Type *Ty, LinkageTypes Link,
+                         const Twine &Name, Constant* aliasee,
+                         Module *ParentModule)
+  : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name) {
+  LeakDetector::addGarbageObject(this);
+
+  if (aliasee)
+    assert(aliasee->getType() == Ty && "Alias and aliasee types should match!");
+  Op<0>() = aliasee;
+
+  if (ParentModule)
+    ParentModule->getAliasList().push_back(this);
+}
+
+void GlobalAlias::setParent(Module *parent) {
+  if (getParent())
+    LeakDetector::addGarbageObject(this);
+  Parent = parent;
+  if (getParent())
+    LeakDetector::removeGarbageObject(this);
+}
+
+void GlobalAlias::removeFromParent() {
+  getParent()->getAliasList().remove(this);
+}
+
+void GlobalAlias::eraseFromParent() {
+  getParent()->getAliasList().erase(this);
+}
+
+void GlobalAlias::setAliasee(Constant *Aliasee) {
+  assert((!Aliasee || Aliasee->getType() == getType()) &&
+         "Alias and aliasee types should match!");
+  
+  setOperand(0, Aliasee);
+}
+
+const GlobalValue *GlobalAlias::getAliasedGlobal() const {
+  const Constant *C = getAliasee();
+  if (C == 0) return 0;
+  
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return GV;
+
+  const ConstantExpr *CE = cast<ConstantExpr>(C);
+  assert((CE->getOpcode() == Instruction::BitCast || 
+          CE->getOpcode() == Instruction::GetElementPtr) &&
+         "Unsupported aliasee");
+  
+  return cast<GlobalValue>(CE->getOperand(0));
+}
+
+const GlobalValue *GlobalAlias::resolveAliasedGlobal(bool stopOnWeak) const {
+  SmallPtrSet<const GlobalValue*, 3> Visited;
+
+  // Check if we need to stop early.
+  if (stopOnWeak && mayBeOverridden())
+    return this;
+
+  const GlobalValue *GV = getAliasedGlobal();
+  Visited.insert(GV);
+
+  // Iterate over aliasing chain, stopping on weak alias if necessary.
+  while (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) {
+    if (stopOnWeak && GA->mayBeOverridden())
+      break;
+
+    GV = GA->getAliasedGlobal();
+
+    if (!Visited.insert(GV))
+      return 0;
+  }
+
+  return GV;
+}
diff --git a/contrib/llvm/lib/IR/IRBuilder.cpp b/contrib/llvm/lib/IR/IRBuilder.cpp
new file mode 100644
index 000000000000..435e54f0ea2a
--- /dev/null
+++ b/contrib/llvm/lib/IR/IRBuilder.cpp
@@ -0,0 +1,153 @@
+//===---- IRBuilder.cpp - Builder for LLVM Instrs -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IRBuilder class, which is used as a convenient way
+// to create LLVM instructions with a consistent and simplified interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+using namespace llvm;
+
+/// CreateGlobalString - Make a new global variable with an initializer that
+/// has array of i8 type filled in with the nul terminated string value
+/// specified.  If Name is specified, it is the name of the global variable
+/// created.
+Value *IRBuilderBase::CreateGlobalString(StringRef Str, const Twine &Name) {
+  Constant *StrConstant = ConstantDataArray::getString(Context, Str);
+  Module &M = *BB->getParent()->getParent();
+  GlobalVariable *GV = new GlobalVariable(M, StrConstant->getType(),
+                                          true, GlobalValue::PrivateLinkage,
+                                          StrConstant);
+  GV->setName(Name);
+  GV->setUnnamedAddr(true);
+  return GV;
+}
+
+Type *IRBuilderBase::getCurrentFunctionReturnType() const {
+  assert(BB && BB->getParent() && "No current function!");
+  return BB->getParent()->getReturnType();
+}
+
+Value *IRBuilderBase::getCastedInt8PtrValue(Value *Ptr) {
+  PointerType *PT = cast<PointerType>(Ptr->getType());
+  if (PT->getElementType()->isIntegerTy(8))
+    return Ptr;
+  
+  // Otherwise, we need to insert a bitcast.
+  PT = getInt8PtrTy(PT->getAddressSpace());
+  BitCastInst *BCI = new BitCastInst(Ptr, PT, "");
+  BB->getInstList().insert(InsertPt, BCI);
+  SetInstDebugLocation(BCI);
+  return BCI;
+}
+
+static CallInst *createCallHelper(Value *Callee, ArrayRef<Value *> Ops,
+                                  IRBuilderBase *Builder) {
+  CallInst *CI = CallInst::Create(Callee, Ops, "");
+  Builder->GetInsertBlock()->getInstList().insert(Builder->GetInsertPoint(),CI);
+  Builder->SetInstDebugLocation(CI);
+  return CI;  
+}
+
+CallInst *IRBuilderBase::
+CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
+             bool isVolatile, MDNode *TBAATag) {
+  Ptr = getCastedInt8PtrValue(Ptr);
+  Value *Ops[] = { Ptr, Val, Size, getInt32(Align), getInt1(isVolatile) };
+  Type *Tys[] = { Ptr->getType(), Size->getType() };
+  Module *M = BB->getParent()->getParent();
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
+  
+  CallInst *CI = createCallHelper(TheFn, Ops, this);
+  
+  // Set the TBAA info if present.
+  if (TBAATag)
+    CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+  
+  return CI;
+}
+
+CallInst *IRBuilderBase::
+CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
+             bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag) {
+  Dst = getCastedInt8PtrValue(Dst);
+  Src = getCastedInt8PtrValue(Src);
+
+  Value *Ops[] = { Dst, Src, Size, getInt32(Align), getInt1(isVolatile) };
+  Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
+  Module *M = BB->getParent()->getParent();
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys);
+  
+  CallInst *CI = createCallHelper(TheFn, Ops, this);
+  
+  // Set the TBAA info if present.
+  if (TBAATag)
+    CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+  // Set the TBAA Struct info if present.
+  if (TBAAStructTag)
+    CI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag);
+  
+  return CI;  
+}
+
+CallInst *IRBuilderBase::
+CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
+              bool isVolatile, MDNode *TBAATag) {
+  Dst = getCastedInt8PtrValue(Dst);
+  Src = getCastedInt8PtrValue(Src);
+  
+  Value *Ops[] = { Dst, Src, Size, getInt32(Align), getInt1(isVolatile) };
+  Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
+  Module *M = BB->getParent()->getParent();
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memmove, Tys);
+  
+  CallInst *CI = createCallHelper(TheFn, Ops, this);
+  
+  // Set the TBAA info if present.
+  if (TBAATag)
+    CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+  
+  return CI;  
+}
+
+CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
+  assert(isa<PointerType>(Ptr->getType()) &&
+         "lifetime.start only applies to pointers.");
+  Ptr = getCastedInt8PtrValue(Ptr);
+  if (!Size)
+    Size = getInt64(-1);
+  else
+    assert(Size->getType() == getInt64Ty() &&
+           "lifetime.start requires the size to be an i64");
+  Value *Ops[] = { Size, Ptr };
+  Module *M = BB->getParent()->getParent();
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start);
+  return createCallHelper(TheFn, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
+  assert(isa<PointerType>(Ptr->getType()) &&
+         "lifetime.end only applies to pointers.");
+  Ptr = getCastedInt8PtrValue(Ptr);
+  if (!Size)
+    Size = getInt64(-1);
+  else
+    assert(Size->getType() == getInt64Ty() &&
+           "lifetime.end requires the size to be an i64");
+  Value *Ops[] = { Size, Ptr };
+  Module *M = BB->getParent()->getParent();
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end);
+  return createCallHelper(TheFn, Ops, this);
+}
diff --git a/contrib/llvm/lib/IR/InlineAsm.cpp b/contrib/llvm/lib/IR/InlineAsm.cpp
new file mode 100644
index 000000000000..9f2a9fea4b93
--- /dev/null
+++ b/contrib/llvm/lib/IR/InlineAsm.cpp
@@ -0,0 +1,295 @@
+//===-- InlineAsm.cpp - Implement the InlineAsm class ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the InlineAsm class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/InlineAsm.h"
+#include "ConstantsContext.h"
+#include "LLVMContextImpl.h"
+#include "llvm/IR/DerivedTypes.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+// Implement the first virtual method in this class in this file so the
+// InlineAsm vtable is emitted here.
+InlineAsm::~InlineAsm() {
+}
+
+
+InlineAsm *InlineAsm::get(FunctionType *Ty, StringRef AsmString,
+                          StringRef Constraints, bool hasSideEffects,
+                          bool isAlignStack, AsmDialect asmDialect) {
+  InlineAsmKeyType Key(AsmString, Constraints, hasSideEffects, isAlignStack,
+                       asmDialect);
+  LLVMContextImpl *pImpl = Ty->getContext().pImpl;
+  return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(Ty), Key);
+}
+
+InlineAsm::InlineAsm(PointerType *Ty, const std::string &asmString,
+                     const std::string &constraints, bool hasSideEffects,
+                     bool isAlignStack, AsmDialect asmDialect)
+  : Value(Ty, Value::InlineAsmVal),
+    AsmString(asmString), Constraints(constraints),
+    HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack),
+    Dialect(asmDialect) {
+
+  // Do various checks on the constraint string and type.
+  assert(Verify(getFunctionType(), constraints) &&
+         "Function type not legal for constraints!");
+}
+
+void InlineAsm::destroyConstant() {
+  getType()->getContext().pImpl->InlineAsms.remove(this);
+  delete this;
+}
+
+FunctionType *InlineAsm::getFunctionType() const {
+  return cast<FunctionType>(getType()->getElementType());
+}
+    
+///Default constructor.
+InlineAsm::ConstraintInfo::ConstraintInfo() :
+  Type(isInput), isEarlyClobber(false),
+  MatchingInput(-1), isCommutative(false),
+  isIndirect(false), isMultipleAlternative(false),
+  currentAlternativeIndex(0) {
+}
+
+/// Copy constructor.
+InlineAsm::ConstraintInfo::ConstraintInfo(const ConstraintInfo &other) :
+  Type(other.Type), isEarlyClobber(other.isEarlyClobber),
+  MatchingInput(other.MatchingInput), isCommutative(other.isCommutative),
+  isIndirect(other.isIndirect), Codes(other.Codes),
+  isMultipleAlternative(other.isMultipleAlternative),
+  multipleAlternatives(other.multipleAlternatives),
+  currentAlternativeIndex(other.currentAlternativeIndex) {
+}
+
+/// Parse - Analyze the specified string (e.g. "==&{eax}") and fill in the
+/// fields in this structure.  If the constraint string is not understood,
+/// return true, otherwise return false.
+bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
+                     InlineAsm::ConstraintInfoVector &ConstraintsSoFar) {
+  StringRef::iterator I = Str.begin(), E = Str.end();
+  unsigned multipleAlternativeCount = Str.count('|') + 1;
+  unsigned multipleAlternativeIndex = 0;
+  ConstraintCodeVector *pCodes = &Codes;
+  
+  // Initialize
+  isMultipleAlternative = (multipleAlternativeCount > 1 ? true : false);
+  if (isMultipleAlternative) {
+    multipleAlternatives.resize(multipleAlternativeCount);
+    pCodes = &multipleAlternatives[0].Codes;
+  }
+  Type = isInput;
+  isEarlyClobber = false;
+  MatchingInput = -1;
+  isCommutative = false;
+  isIndirect = false;
+  currentAlternativeIndex = 0;
+  
+  // Parse prefixes.
+  if (*I == '~') {
+    Type = isClobber;
+    ++I;
+  } else if (*I == '=') {
+    ++I;
+    Type = isOutput;
+  }
+  
+  if (*I == '*') {
+    isIndirect = true;
+    ++I;
+  }
+  
+  if (I == E) return true;  // Just a prefix, like "==" or "~".
+  
+  // Parse the modifiers.
+  bool DoneWithModifiers = false;
+  while (!DoneWithModifiers) {
+    switch (*I) {
+    default:
+      DoneWithModifiers = true;
+      break;
+    case '&':     // Early clobber.
+      if (Type != isOutput ||      // Cannot early clobber anything but output.
+          isEarlyClobber)          // Reject &&&&&&
+        return true;
+      isEarlyClobber = true;
+      break;
+    case '%':     // Commutative.
+      if (Type == isClobber ||     // Cannot commute clobbers.
+          isCommutative)           // Reject %%%%%
+        return true;
+      isCommutative = true;
+      break;
+    case '#':     // Comment.
+    case '*':     // Register preferencing.
+      return true;     // Not supported.
+    }
+    
+    if (!DoneWithModifiers) {
+      ++I;
+      if (I == E) return true;   // Just prefixes and modifiers!
+    }
+  }
+  
+  // Parse the various constraints.
+  while (I != E) {
+    if (*I == '{') {   // Physical register reference.
+      // Find the end of the register name.
+      StringRef::iterator ConstraintEnd = std::find(I+1, E, '}');
+      if (ConstraintEnd == E) return true;  // "{foo"
+      pCodes->push_back(std::string(I, ConstraintEnd+1));
+      I = ConstraintEnd+1;
+    } else if (isdigit(static_cast<unsigned char>(*I))) { // Matching Constraint
+      // Maximal munch numbers.
+      StringRef::iterator NumStart = I;
+      while (I != E && isdigit(static_cast<unsigned char>(*I)))
+        ++I;
+      pCodes->push_back(std::string(NumStart, I));
+      unsigned N = atoi(pCodes->back().c_str());
+      // Check that this is a valid matching constraint!
+      if (N >= ConstraintsSoFar.size() || ConstraintsSoFar[N].Type != isOutput||
+          Type != isInput)
+        return true;  // Invalid constraint number.
+      
+      // If Operand N already has a matching input, reject this.  An output
+      // can't be constrained to the same value as multiple inputs.
+      if (isMultipleAlternative) {
+        InlineAsm::SubConstraintInfo &scInfo =
+          ConstraintsSoFar[N].multipleAlternatives[multipleAlternativeIndex];
+        if (scInfo.MatchingInput != -1)
+          return true;
+        // Note that operand #n has a matching input.
+        scInfo.MatchingInput = ConstraintsSoFar.size();
+      } else {
+        if (ConstraintsSoFar[N].hasMatchingInput())
+          return true;
+        // Note that operand #n has a matching input.
+        ConstraintsSoFar[N].MatchingInput = ConstraintsSoFar.size();
+        }
+    } else if (*I == '|') {
+      multipleAlternativeIndex++;
+      pCodes = &multipleAlternatives[multipleAlternativeIndex].Codes;
+      ++I;
+    } else if (*I == '^') {
+      // Multi-letter constraint
+      // FIXME: For now assuming these are 2-character constraints.
+      pCodes->push_back(std::string(I+1, I+3));
+      I += 3;
+    } else {
+      // Single letter constraint.
+      pCodes->push_back(std::string(I, I+1));
+      ++I;
+    }
+  }
+
+  return false;
+}
+
+/// selectAlternative - Point this constraint to the alternative constraint
+/// indicated by the index.
+void InlineAsm::ConstraintInfo::selectAlternative(unsigned index) {
+  if (index < multipleAlternatives.size()) {
+    currentAlternativeIndex = index;
+    InlineAsm::SubConstraintInfo &scInfo =
+      multipleAlternatives[currentAlternativeIndex];
+    MatchingInput = scInfo.MatchingInput;
+    Codes = scInfo.Codes;
+  }
+}
+
+InlineAsm::ConstraintInfoVector
+InlineAsm::ParseConstraints(StringRef Constraints) {
+  ConstraintInfoVector Result;
+  
+  // Scan the constraints string.
+  for (StringRef::iterator I = Constraints.begin(),
+         E = Constraints.end(); I != E; ) {
+    ConstraintInfo Info;
+
+    // Find the end of this constraint.
+    StringRef::iterator ConstraintEnd = std::find(I, E, ',');
+
+    if (ConstraintEnd == I ||  // Empty constraint like ",,"
+        Info.Parse(StringRef(I, ConstraintEnd-I), Result)) {
+      Result.clear();          // Erroneous constraint?
+      break;
+    }
+
+    Result.push_back(Info);
+    
+    // ConstraintEnd may be either the next comma or the end of the string.  In
+    // the former case, we skip the comma.
+    I = ConstraintEnd;
+    if (I != E) {
+      ++I;
+      if (I == E) { Result.clear(); break; }    // don't allow "xyz,"
+    }
+  }
+  
+  return Result;
+}
+
+/// Verify - Verify that the specified constraint string is reasonable for the
+/// specified function type, and otherwise validate the constraint string.
+bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) {
+  if (Ty->isVarArg()) return false;
+  
+  ConstraintInfoVector Constraints = ParseConstraints(ConstStr);
+  
+  // Error parsing constraints.
+  if (Constraints.empty() && !ConstStr.empty()) return false;
+  
+  unsigned NumOutputs = 0, NumInputs = 0, NumClobbers = 0;
+  unsigned NumIndirect = 0;
+  
+  for (unsigned i = 0, e = Constraints.size(); i != e; ++i) {
+    switch (Constraints[i].Type) {
+    case InlineAsm::isOutput:
+      if ((NumInputs-NumIndirect) != 0 || NumClobbers != 0)
+        return false;  // outputs before inputs and clobbers.
+      if (!Constraints[i].isIndirect) {
+        ++NumOutputs;
+        break;
+      }
+      ++NumIndirect;
+      // FALLTHROUGH for Indirect Outputs.
+    case InlineAsm::isInput:
+      if (NumClobbers) return false;               // inputs before clobbers.
+      ++NumInputs;
+      break;
+    case InlineAsm::isClobber:
+      ++NumClobbers;
+      break;
+    }
+  }
+  
+  switch (NumOutputs) {
+  case 0:
+    if (!Ty->getReturnType()->isVoidTy()) return false;
+    break;
+  case 1:
+    if (Ty->getReturnType()->isStructTy()) return false;
+    break;
+  default:
+    StructType *STy = dyn_cast<StructType>(Ty->getReturnType());
+    if (STy == 0 || STy->getNumElements() != NumOutputs)
+      return false;
+    break;
+  }      
+  
+  if (Ty->getNumParams() != NumInputs) return false;
+  return true;
+}
+
diff --git a/contrib/llvm/lib/IR/Instruction.cpp b/contrib/llvm/lib/IR/Instruction.cpp
new file mode 100644
index 000000000000..2b5a0b39c316
--- /dev/null
+++ b/contrib/llvm/lib/IR/Instruction.cpp
@@ -0,0 +1,555 @@
+//===-- Instruction.cpp - Implement the Instruction class -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Instruction class for the IR library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/LeakDetector.h"
+using namespace llvm;
+
+Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
+                         Instruction *InsertBefore)
+  : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(0) {
+  // Make sure that we get added to a basicblock
+  LeakDetector::addGarbageObject(this);
+
+  // If requested, insert this instruction into a basic block...
+  if (InsertBefore) {
+    assert(InsertBefore->getParent() &&
+           "Instruction to insert before is not in a basic block!");
+    InsertBefore->getParent()->getInstList().insert(InsertBefore, this);
+  }
+}
+
+Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
+                         BasicBlock *InsertAtEnd)
+  : User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(0) {
+  // Make sure that we get added to a basicblock
+  LeakDetector::addGarbageObject(this);
+
+  // append this instruction into the basic block
+  assert(InsertAtEnd && "Basic block to append to may not be NULL!");
+  InsertAtEnd->getInstList().push_back(this);
+}
+
+
+// Out of line virtual method, so the vtable, etc has a home.
+Instruction::~Instruction() {
+  assert(Parent == 0 && "Instruction still linked in the program!");
+  if (hasMetadataHashEntry())
+    clearMetadataHashEntries();
+}
+
+
+void Instruction::setParent(BasicBlock *P) {
+  if (getParent()) {
+    if (!P) LeakDetector::addGarbageObject(this);
+  } else {
+    if (P) LeakDetector::removeGarbageObject(this);
+  }
+
+  Parent = P;
+}
+
+void Instruction::removeFromParent() {
+  getParent()->getInstList().remove(this);
+}
+
+void Instruction::eraseFromParent() {
+  getParent()->getInstList().erase(this);
+}
+
+/// insertBefore - Insert an unlinked instructions into a basic block
+/// immediately before the specified instruction.
+void Instruction::insertBefore(Instruction *InsertPos) {
+  InsertPos->getParent()->getInstList().insert(InsertPos, this);
+}
+
+/// insertAfter - Insert an unlinked instructions into a basic block
+/// immediately after the specified instruction.
+void Instruction::insertAfter(Instruction *InsertPos) {
+  InsertPos->getParent()->getInstList().insertAfter(InsertPos, this);
+}
+
+/// moveBefore - Unlink this instruction from its current basic block and
+/// insert it into the basic block that MovePos lives in, right before
+/// MovePos.
+void Instruction::moveBefore(Instruction *MovePos) {
+  MovePos->getParent()->getInstList().splice(MovePos,getParent()->getInstList(),
+                                             this);
+}
+
+/// Set or clear the unsafe-algebra flag on this instruction, which must be an
+/// operator which supports this flag. See LangRef.html for the meaning of this
+/// flag.
+void Instruction::setHasUnsafeAlgebra(bool B) {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->setHasUnsafeAlgebra(B);
+}
+
+/// Set or clear the NoNaNs flag on this instruction, which must be an operator
+/// which supports this flag. See LangRef.html for the meaning of this flag.
+void Instruction::setHasNoNaNs(bool B) {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->setHasNoNaNs(B);
+}
+
+/// Set or clear the no-infs flag on this instruction, which must be an operator
+/// which supports this flag. See LangRef.html for the meaning of this flag.
+void Instruction::setHasNoInfs(bool B) {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->setHasNoInfs(B);
+}
+
+/// Set or clear the no-signed-zeros flag on this instruction, which must be an
+/// operator which supports this flag. See LangRef.html for the meaning of this
+/// flag.
+void Instruction::setHasNoSignedZeros(bool B) {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->setHasNoSignedZeros(B);
+}
+
+/// Set or clear the allow-reciprocal flag on this instruction, which must be an
+/// operator which supports this flag. See LangRef.html for the meaning of this
+/// flag.
+void Instruction::setHasAllowReciprocal(bool B) {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->setHasAllowReciprocal(B);
+}
+
+/// Convenience function for setting all the fast-math flags on this
+/// instruction, which must be an operator which supports these flags. See
+/// LangRef.html for the meaning of these flats.
+void Instruction::setFastMathFlags(FastMathFlags FMF) {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  cast<FPMathOperator>(this)->setFastMathFlags(FMF);
+}
+
+/// Determine whether the unsafe-algebra flag is set.
+bool Instruction::hasUnsafeAlgebra() const {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  return cast<FPMathOperator>(this)->hasUnsafeAlgebra();
+}
+
+/// Determine whether the no-NaNs flag is set.
+bool Instruction::hasNoNaNs() const {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  return cast<FPMathOperator>(this)->hasNoNaNs();
+}
+
+/// Determine whether the no-infs flag is set.
+bool Instruction::hasNoInfs() const {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  return cast<FPMathOperator>(this)->hasNoInfs();
+}
+
+/// Determine whether the no-signed-zeros flag is set.
+bool Instruction::hasNoSignedZeros() const {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  return cast<FPMathOperator>(this)->hasNoSignedZeros();
+}
+
+/// Determine whether the allow-reciprocal flag is set.
+bool Instruction::hasAllowReciprocal() const {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  return cast<FPMathOperator>(this)->hasAllowReciprocal();
+}
+
+/// Convenience function for getting all the fast-math flags, which must be an
+/// operator which supports these flags. See LangRef.html for the meaning of
+/// these flats.
+FastMathFlags Instruction::getFastMathFlags() const {
+  assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+  return cast<FPMathOperator>(this)->getFastMathFlags();
+}
+
+/// Copy I's fast-math flags
+void Instruction::copyFastMathFlags(const Instruction *I) {
+  setFastMathFlags(I->getFastMathFlags());
+}
+
+
+const char *Instruction::getOpcodeName(unsigned OpCode) {
+  switch (OpCode) {
+  // Terminators
+  case Ret:    return "ret";
+  case Br:     return "br";
+  case Switch: return "switch";
+  case IndirectBr: return "indirectbr";
+  case Invoke: return "invoke";
+  case Resume: return "resume";
+  case Unreachable: return "unreachable";
+
+  // Standard binary operators...
+  case Add: return "add";
+  case FAdd: return "fadd";
+  case Sub: return "sub";
+  case FSub: return "fsub";
+  case Mul: return "mul";
+  case FMul: return "fmul";
+  case UDiv: return "udiv";
+  case SDiv: return "sdiv";
+  case FDiv: return "fdiv";
+  case URem: return "urem";
+  case SRem: return "srem";
+  case FRem: return "frem";
+
+  // Logical operators...
+  case And: return "and";
+  case Or : return "or";
+  case Xor: return "xor";
+
+  // Memory instructions...
+  case Alloca:        return "alloca";
+  case Load:          return "load";
+  case Store:         return "store";
+  case AtomicCmpXchg: return "cmpxchg";
+  case AtomicRMW:     return "atomicrmw";
+  case Fence:         return "fence";
+  case GetElementPtr: return "getelementptr";
+
+  // Convert instructions...
+  case Trunc:     return "trunc";
+  case ZExt:      return "zext";
+  case SExt:      return "sext";
+  case FPTrunc:   return "fptrunc";
+  case FPExt:     return "fpext";
+  case FPToUI:    return "fptoui";
+  case FPToSI:    return "fptosi";
+  case UIToFP:    return "uitofp";
+  case SIToFP:    return "sitofp";
+  case IntToPtr:  return "inttoptr";
+  case PtrToInt:  return "ptrtoint";
+  case BitCast:   return "bitcast";
+
+  // Other instructions...
+  case ICmp:           return "icmp";
+  case FCmp:           return "fcmp";
+  case PHI:            return "phi";
+  case Select:         return "select";
+  case Call:           return "call";
+  case Shl:            return "shl";
+  case LShr:           return "lshr";
+  case AShr:           return "ashr";
+  case VAArg:          return "va_arg";
+  case ExtractElement: return "extractelement";
+  case InsertElement:  return "insertelement";
+  case ShuffleVector:  return "shufflevector";
+  case ExtractValue:   return "extractvalue";
+  case InsertValue:    return "insertvalue";
+  case LandingPad:     return "landingpad";
+
+  default: return "<Invalid operator> ";
+  }
+}
+
+/// isIdenticalTo - Return true if the specified instruction is exactly
+/// identical to the current one.  This means that all operands match and any
+/// extra information (e.g. load is volatile) agree.
+bool Instruction::isIdenticalTo(const Instruction *I) const {
+  return isIdenticalToWhenDefined(I) &&
+         SubclassOptionalData == I->SubclassOptionalData;
+}
+
+/// isIdenticalToWhenDefined - This is like isIdenticalTo, except that it
+/// ignores the SubclassOptionalData flags, which specify conditions
+/// under which the instruction's result is undefined.
+bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const {
+  if (getOpcode() != I->getOpcode() ||
+      getNumOperands() != I->getNumOperands() ||
+      getType() != I->getType())
+    return false;
+
+  // We have two instructions of identical opcode and #operands.  Check to see
+  // if all operands are the same.
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    if (getOperand(i) != I->getOperand(i))
+      return false;
+
+  // Check special state that is a part of some instructions.
+  if (const LoadInst *LI = dyn_cast<LoadInst>(this))
+    return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
+           LI->getAlignment() == cast<LoadInst>(I)->getAlignment() &&
+           LI->getOrdering() == cast<LoadInst>(I)->getOrdering() &&
+           LI->getSynchScope() == cast<LoadInst>(I)->getSynchScope();
+  if (const StoreInst *SI = dyn_cast<StoreInst>(this))
+    return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
+           SI->getAlignment() == cast<StoreInst>(I)->getAlignment() &&
+           SI->getOrdering() == cast<StoreInst>(I)->getOrdering() &&
+           SI->getSynchScope() == cast<StoreInst>(I)->getSynchScope();
+  if (const CmpInst *CI = dyn_cast<CmpInst>(this))
+    return CI->getPredicate() == cast<CmpInst>(I)->getPredicate();
+  if (const CallInst *CI = dyn_cast<CallInst>(this))
+    return CI->isTailCall() == cast<CallInst>(I)->isTailCall() &&
+           CI->getCallingConv() == cast<CallInst>(I)->getCallingConv() &&
+           CI->getAttributes() == cast<CallInst>(I)->getAttributes();
+  if (const InvokeInst *CI = dyn_cast<InvokeInst>(this))
+    return CI->getCallingConv() == cast<InvokeInst>(I)->getCallingConv() &&
+           CI->getAttributes() == cast<InvokeInst>(I)->getAttributes();
+  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(this))
+    return IVI->getIndices() == cast<InsertValueInst>(I)->getIndices();
+  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(this))
+    return EVI->getIndices() == cast<ExtractValueInst>(I)->getIndices();
+  if (const FenceInst *FI = dyn_cast<FenceInst>(this))
+    return FI->getOrdering() == cast<FenceInst>(FI)->getOrdering() &&
+           FI->getSynchScope() == cast<FenceInst>(FI)->getSynchScope();
+  if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(this))
+    return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I)->isVolatile() &&
+           CXI->getOrdering() == cast<AtomicCmpXchgInst>(I)->getOrdering() &&
+           CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I)->getSynchScope();
+  if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(this))
+    return RMWI->getOperation() == cast<AtomicRMWInst>(I)->getOperation() &&
+           RMWI->isVolatile() == cast<AtomicRMWInst>(I)->isVolatile() &&
+           RMWI->getOrdering() == cast<AtomicRMWInst>(I)->getOrdering() &&
+           RMWI->getSynchScope() == cast<AtomicRMWInst>(I)->getSynchScope();
+  if (const PHINode *thisPHI = dyn_cast<PHINode>(this)) {
+    const PHINode *otherPHI = cast<PHINode>(I);
+    for (unsigned i = 0, e = thisPHI->getNumOperands(); i != e; ++i) {
+      if (thisPHI->getIncomingBlock(i) != otherPHI->getIncomingBlock(i))
+        return false;
+    }
+    return true;
+  }
+  return true;
+}
+
+// isSameOperationAs
+// This should be kept in sync with isEquivalentOperation in
+// lib/Transforms/IPO/MergeFunctions.cpp.
+bool Instruction::isSameOperationAs(const Instruction *I,
+                                    unsigned flags) const {
+  bool IgnoreAlignment = flags & CompareIgnoringAlignment;
+  bool UseScalarTypes  = flags & CompareUsingScalarTypes;
+
+  if (getOpcode() != I->getOpcode() ||
+      getNumOperands() != I->getNumOperands() ||
+      (UseScalarTypes ?
+       getType()->getScalarType() != I->getType()->getScalarType() :
+       getType() != I->getType()))
+    return false;
+
+  // We have two instructions of identical opcode and #operands.  Check to see
+  // if all operands are the same type
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    if (UseScalarTypes ?
+        getOperand(i)->getType()->getScalarType() !=
+          I->getOperand(i)->getType()->getScalarType() :
+        getOperand(i)->getType() != I->getOperand(i)->getType())
+      return false;
+
+  // Check special state that is a part of some instructions.
+  if (const LoadInst *LI = dyn_cast<LoadInst>(this))
+    return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
+           (LI->getAlignment() == cast<LoadInst>(I)->getAlignment() ||
+            IgnoreAlignment) &&
+           LI->getOrdering() == cast<LoadInst>(I)->getOrdering() &&
+           LI->getSynchScope() == cast<LoadInst>(I)->getSynchScope();
+  if (const StoreInst *SI = dyn_cast<StoreInst>(this))
+    return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
+           (SI->getAlignment() == cast<StoreInst>(I)->getAlignment() ||
+            IgnoreAlignment) &&
+           SI->getOrdering() == cast<StoreInst>(I)->getOrdering() &&
+           SI->getSynchScope() == cast<StoreInst>(I)->getSynchScope();
+  if (const CmpInst *CI = dyn_cast<CmpInst>(this))
+    return CI->getPredicate() == cast<CmpInst>(I)->getPredicate();
+  if (const CallInst *CI = dyn_cast<CallInst>(this))
+    return CI->isTailCall() == cast<CallInst>(I)->isTailCall() &&
+           CI->getCallingConv() == cast<CallInst>(I)->getCallingConv() &&
+           CI->getAttributes() == cast<CallInst>(I)->getAttributes();
+  if (const InvokeInst *CI = dyn_cast<InvokeInst>(this))
+    return CI->getCallingConv() == cast<InvokeInst>(I)->getCallingConv() &&
+           CI->getAttributes() ==
+             cast<InvokeInst>(I)->getAttributes();
+  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(this))
+    return IVI->getIndices() == cast<InsertValueInst>(I)->getIndices();
+  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(this))
+    return EVI->getIndices() == cast<ExtractValueInst>(I)->getIndices();
+  if (const FenceInst *FI = dyn_cast<FenceInst>(this))
+    return FI->getOrdering() == cast<FenceInst>(I)->getOrdering() &&
+           FI->getSynchScope() == cast<FenceInst>(I)->getSynchScope();
+  if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(this))
+    return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I)->isVolatile() &&
+           CXI->getOrdering() == cast<AtomicCmpXchgInst>(I)->getOrdering() &&
+           CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I)->getSynchScope();
+  if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(this))
+    return RMWI->getOperation() == cast<AtomicRMWInst>(I)->getOperation() &&
+           RMWI->isVolatile() == cast<AtomicRMWInst>(I)->isVolatile() &&
+           RMWI->getOrdering() == cast<AtomicRMWInst>(I)->getOrdering() &&
+           RMWI->getSynchScope() == cast<AtomicRMWInst>(I)->getSynchScope();
+
+  return true;
+}
+
+/// isUsedOutsideOfBlock - Return true if there are any uses of I outside of the
+/// specified block.  Note that PHI nodes are considered to evaluate their
+/// operands in the corresponding predecessor block.
+bool Instruction::isUsedOutsideOfBlock(const BasicBlock *BB) const {
+  for (const_use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
+    // PHI nodes uses values in the corresponding predecessor block.  For other
+    // instructions, just check to see whether the parent of the use matches up.
+    const User *U = *UI;
+    const PHINode *PN = dyn_cast<PHINode>(U);
+    if (PN == 0) {
+      if (cast<Instruction>(U)->getParent() != BB)
+        return true;
+      continue;
+    }
+
+    if (PN->getIncomingBlock(UI) != BB)
+      return true;
+  }
+  return false;
+}
+
+/// mayReadFromMemory - Return true if this instruction may read memory.
+///
+bool Instruction::mayReadFromMemory() const {
+  switch (getOpcode()) {
+  default: return false;
+  case Instruction::VAArg:
+  case Instruction::Load:
+  case Instruction::Fence: // FIXME: refine definition of mayReadFromMemory
+  case Instruction::AtomicCmpXchg:
+  case Instruction::AtomicRMW:
+    return true;
+  case Instruction::Call:
+    return !cast<CallInst>(this)->doesNotAccessMemory();
+  case Instruction::Invoke:
+    return !cast<InvokeInst>(this)->doesNotAccessMemory();
+  case Instruction::Store:
+    return !cast<StoreInst>(this)->isUnordered();
+  }
+}
+
+/// mayWriteToMemory - Return true if this instruction may modify memory.
+///
+bool Instruction::mayWriteToMemory() const {
+  switch (getOpcode()) {
+  default: return false;
+  case Instruction::Fence: // FIXME: refine definition of mayWriteToMemory
+  case Instruction::Store:
+  case Instruction::VAArg:
+  case Instruction::AtomicCmpXchg:
+  case Instruction::AtomicRMW:
+    return true;
+  case Instruction::Call:
+    return !cast<CallInst>(this)->onlyReadsMemory();
+  case Instruction::Invoke:
+    return !cast<InvokeInst>(this)->onlyReadsMemory();
+  case Instruction::Load:
+    return !cast<LoadInst>(this)->isUnordered();
+  }
+}
+
+bool Instruction::mayThrow() const {
+  if (const CallInst *CI = dyn_cast<CallInst>(this))
+    return !CI->doesNotThrow();
+  return isa<ResumeInst>(this);
+}
+
+bool Instruction::mayReturn() const {
+  if (const CallInst *CI = dyn_cast<CallInst>(this))
+    return !CI->doesNotReturn();
+  return true;
+}
+
+/// isAssociative - Return true if the instruction is associative:
+///
+///   Associative operators satisfy:  x op (y op z) === (x op y) op z
+///
+/// In LLVM, the Add, Mul, And, Or, and Xor operators are associative.
+///
+bool Instruction::isAssociative(unsigned Opcode) {
+  return Opcode == And || Opcode == Or || Opcode == Xor ||
+         Opcode == Add || Opcode == Mul;
+}
+
+bool Instruction::isAssociative() const {
+  unsigned Opcode = getOpcode();
+  if (isAssociative(Opcode))
+    return true;
+
+  switch (Opcode) {
+  case FMul:
+  case FAdd:
+    return cast<FPMathOperator>(this)->hasUnsafeAlgebra();
+  default:
+    return false;
+  }
+}
+
+/// isCommutative - Return true if the instruction is commutative:
+///
+///   Commutative operators satisfy: (x op y) === (y op x)
+///
+/// In LLVM, these are the associative operators, plus SetEQ and SetNE, when
+/// applied to any type.
+///
+bool Instruction::isCommutative(unsigned op) {
+  switch (op) {
+  case Add:
+  case FAdd:
+  case Mul:
+  case FMul:
+  case And:
+  case Or:
+  case Xor:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isIdempotent - Return true if the instruction is idempotent:
+///
+///   Idempotent operators satisfy:  x op x === x
+///
+/// In LLVM, the And and Or operators are idempotent.
+///
+bool Instruction::isIdempotent(unsigned Opcode) {
+  return Opcode == And || Opcode == Or;
+}
+
+/// isNilpotent - Return true if the instruction is nilpotent:
+///
+///   Nilpotent operators satisfy:  x op x === Id,
+///
+///   where Id is the identity for the operator, i.e. a constant such that
+///     x op Id === x and Id op x === x for all x.
+///
+/// In LLVM, the Xor operator is nilpotent.
+///
+bool Instruction::isNilpotent(unsigned Opcode) {
+  return Opcode == Xor;
+}
+
+Instruction *Instruction::clone() const {
+  Instruction *New = clone_impl();
+  New->SubclassOptionalData = SubclassOptionalData;
+  if (!hasMetadata())
+    return New;
+
+  // Otherwise, enumerate and copy over metadata from the old instruction to the
+  // new one.
+  SmallVector<std::pair<unsigned, MDNode*>, 4> TheMDs;
+  getAllMetadataOtherThanDebugLoc(TheMDs);
+  for (unsigned i = 0, e = TheMDs.size(); i != e; ++i)
+    New->setMetadata(TheMDs[i].first, TheMDs[i].second);
+
+  New->setDebugLoc(getDebugLoc());
+  return New;
+}
diff --git a/contrib/llvm/lib/IR/Instructions.cpp b/contrib/llvm/lib/IR/Instructions.cpp
new file mode 100644
index 000000000000..d58877ef773a
--- /dev/null
+++ b/contrib/llvm/lib/IR/Instructions.cpp
@@ -0,0 +1,3553 @@
+//===-- Instructions.cpp - Implement the LLVM instructions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements all of the non-inline methods for the LLVM instruction
+// classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Instructions.h"
+#include "LLVMContextImpl.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                            CallSite Class
+//===----------------------------------------------------------------------===//
+
+User::op_iterator CallSite::getCallee() const {
+  Instruction *II(getInstruction());
+  return isCall()
+    ? cast<CallInst>(II)->op_end() - 1 // Skip Callee
+    : cast<InvokeInst>(II)->op_end() - 3; // Skip BB, BB, Callee
+}
+
+//===----------------------------------------------------------------------===//
+//                            TerminatorInst Class
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method, so the vtable, etc has a home.
+TerminatorInst::~TerminatorInst() {
+}
+
+//===----------------------------------------------------------------------===//
+//                           UnaryInstruction Class
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method, so the vtable, etc has a home.
+UnaryInstruction::~UnaryInstruction() {
+}
+
+//===----------------------------------------------------------------------===//
+//                              SelectInst Class
+//===----------------------------------------------------------------------===//
+
+/// areInvalidOperands - Return a string if the specified operands are invalid
+/// for a select operation, otherwise return null.
+const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
+  if (Op1->getType() != Op2->getType())
+    return "both values to select must have same type";
+  
+  if (VectorType *VT = dyn_cast<VectorType>(Op0->getType())) {
+    // Vector select.
+    if (VT->getElementType() != Type::getInt1Ty(Op0->getContext()))
+      return "vector select condition element type must be i1";
+    VectorType *ET = dyn_cast<VectorType>(Op1->getType());
+    if (ET == 0)
+      return "selected values for vector select must be vectors";
+    if (ET->getNumElements() != VT->getNumElements())
+      return "vector select requires selected vectors to have "
+                   "the same vector length as select condition";
+  } else if (Op0->getType() != Type::getInt1Ty(Op0->getContext())) {
+    return "select condition must be i1 or <n x i1>";
+  }
+  return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                               PHINode Class
+//===----------------------------------------------------------------------===//
+
+PHINode::PHINode(const PHINode &PN)
+  : Instruction(PN.getType(), Instruction::PHI,
+                allocHungoffUses(PN.getNumOperands()), PN.getNumOperands()),
+    ReservedSpace(PN.getNumOperands()) {
+  std::copy(PN.op_begin(), PN.op_end(), op_begin());
+  std::copy(PN.block_begin(), PN.block_end(), block_begin());
+  SubclassOptionalData = PN.SubclassOptionalData;
+}
+
+PHINode::~PHINode() {
+  dropHungoffUses();
+}
+
+Use *PHINode::allocHungoffUses(unsigned N) const {
+  // Allocate the array of Uses of the incoming values, followed by a pointer
+  // (with bottom bit set) to the User, followed by the array of pointers to
+  // the incoming basic blocks.
+  size_t size = N * sizeof(Use) + sizeof(Use::UserRef)
+    + N * sizeof(BasicBlock*);
+  Use *Begin = static_cast<Use*>(::operator new(size));
+  Use *End = Begin + N;
+  (void) new(End) Use::UserRef(const_cast<PHINode*>(this), 1);
+  return Use::initTags(Begin, End);
+}
+
+// removeIncomingValue - Remove an incoming value.  This is useful if a
+// predecessor basic block is deleted.
+Value *PHINode::removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty) {
+  Value *Removed = getIncomingValue(Idx);
+
+  // Move everything after this operand down.
+  //
+  // FIXME: we could just swap with the end of the list, then erase.  However,
+  // clients might not expect this to happen.  The code as it is thrashes the
+  // use/def lists, which is kinda lame.
+  std::copy(op_begin() + Idx + 1, op_end(), op_begin() + Idx);
+  std::copy(block_begin() + Idx + 1, block_end(), block_begin() + Idx);
+
+  // Nuke the last value.
+  Op<-1>().set(0);
+  --NumOperands;
+
+  // If the PHI node is dead, because it has zero entries, nuke it now.
+  if (getNumOperands() == 0 && DeletePHIIfEmpty) {
+    // If anyone is using this PHI, make them use a dummy value instead...
+    replaceAllUsesWith(UndefValue::get(getType()));
+    eraseFromParent();
+  }
+  return Removed;
+}
+
+/// growOperands - grow operands - This grows the operand list in response
+/// to a push_back style of operation.  This grows the number of ops by 1.5
+/// times.
+///
+void PHINode::growOperands() {
+  unsigned e = getNumOperands();
+  unsigned NumOps = e + e / 2;
+  if (NumOps < 2) NumOps = 2;      // 2 op PHI nodes are VERY common.
+
+  Use *OldOps = op_begin();
+  BasicBlock **OldBlocks = block_begin();
+
+  ReservedSpace = NumOps;
+  OperandList = allocHungoffUses(ReservedSpace);
+
+  std::copy(OldOps, OldOps + e, op_begin());
+  std::copy(OldBlocks, OldBlocks + e, block_begin());
+
+  Use::zap(OldOps, OldOps + e, true);
+}
+
+/// hasConstantValue - If the specified PHI node always merges together the same
+/// value, return the value, otherwise return null.
+Value *PHINode::hasConstantValue() const {
+  // Exploit the fact that phi nodes always have at least one entry.
+  Value *ConstantValue = getIncomingValue(0);
+  for (unsigned i = 1, e = getNumIncomingValues(); i != e; ++i)
+    if (getIncomingValue(i) != ConstantValue && getIncomingValue(i) != this) {
+      if (ConstantValue != this)
+        return 0; // Incoming values not all the same.
+       // The case where the first value is this PHI.
+      ConstantValue = getIncomingValue(i);
+    }
+  if (ConstantValue == this)
+    return UndefValue::get(getType());
+  return ConstantValue;
+}
+
+//===----------------------------------------------------------------------===//
+//                       LandingPadInst Implementation
+//===----------------------------------------------------------------------===//
+
+LandingPadInst::LandingPadInst(Type *RetTy, Value *PersonalityFn,
+                               unsigned NumReservedValues, const Twine &NameStr,
+                               Instruction *InsertBefore)
+  : Instruction(RetTy, Instruction::LandingPad, 0, 0, InsertBefore) {
+  init(PersonalityFn, 1 + NumReservedValues, NameStr);
+}
+
+LandingPadInst::LandingPadInst(Type *RetTy, Value *PersonalityFn,
+                               unsigned NumReservedValues, const Twine &NameStr,
+                               BasicBlock *InsertAtEnd)
+  : Instruction(RetTy, Instruction::LandingPad, 0, 0, InsertAtEnd) {
+  init(PersonalityFn, 1 + NumReservedValues, NameStr);
+}
+
+LandingPadInst::LandingPadInst(const LandingPadInst &LP)
+  : Instruction(LP.getType(), Instruction::LandingPad,
+                allocHungoffUses(LP.getNumOperands()), LP.getNumOperands()),
+    ReservedSpace(LP.getNumOperands()) {
+  Use *OL = OperandList, *InOL = LP.OperandList;
+  for (unsigned I = 0, E = ReservedSpace; I != E; ++I)
+    OL[I] = InOL[I];
+
+  setCleanup(LP.isCleanup());
+}
+
+LandingPadInst::~LandingPadInst() {
+  dropHungoffUses();
+}
+
+LandingPadInst *LandingPadInst::Create(Type *RetTy, Value *PersonalityFn,
+                                       unsigned NumReservedClauses,
+                                       const Twine &NameStr,
+                                       Instruction *InsertBefore) {
+  return new LandingPadInst(RetTy, PersonalityFn, NumReservedClauses, NameStr,
+                            InsertBefore);
+}
+
+LandingPadInst *LandingPadInst::Create(Type *RetTy, Value *PersonalityFn,
+                                       unsigned NumReservedClauses,
+                                       const Twine &NameStr,
+                                       BasicBlock *InsertAtEnd) {
+  return new LandingPadInst(RetTy, PersonalityFn, NumReservedClauses, NameStr,
+                            InsertAtEnd);
+}
+
+void LandingPadInst::init(Value *PersFn, unsigned NumReservedValues,
+                          const Twine &NameStr) {
+  ReservedSpace = NumReservedValues;
+  NumOperands = 1;
+  OperandList = allocHungoffUses(ReservedSpace);
+  OperandList[0] = PersFn;
+  setName(NameStr);
+  setCleanup(false);
+}
+
+/// growOperands - grow operands - This grows the operand list in response to a
+/// push_back style of operation. This grows the number of ops by 2 times.
+void LandingPadInst::growOperands(unsigned Size) {
+  unsigned e = getNumOperands();
+  if (ReservedSpace >= e + Size) return;
+  ReservedSpace = (e + Size / 2) * 2;
+
+  Use *NewOps = allocHungoffUses(ReservedSpace);
+  Use *OldOps = OperandList;
+  for (unsigned i = 0; i != e; ++i)
+      NewOps[i] = OldOps[i];
+
+  OperandList = NewOps;
+  Use::zap(OldOps, OldOps + e, true);
+}
+
+void LandingPadInst::addClause(Value *Val) {
+  unsigned OpNo = getNumOperands();
+  growOperands(1);
+  assert(OpNo < ReservedSpace && "Growing didn't work!");
+  ++NumOperands;
+  OperandList[OpNo] = Val;
+}
+
+//===----------------------------------------------------------------------===//
+//                        CallInst Implementation
+//===----------------------------------------------------------------------===//
+
+CallInst::~CallInst() {
+}
+
+void CallInst::init(Value *Func, ArrayRef<Value *> Args, const Twine &NameStr) {
+  assert(NumOperands == Args.size() + 1 && "NumOperands not set up?");
+  Op<-1>() = Func;
+
+#ifndef NDEBUG
+  FunctionType *FTy =
+    cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
+
+  assert((Args.size() == FTy->getNumParams() ||
+          (FTy->isVarArg() && Args.size() > FTy->getNumParams())) &&
+         "Calling a function with bad signature!");
+
+  for (unsigned i = 0; i != Args.size(); ++i)
+    assert((i >= FTy->getNumParams() || 
+            FTy->getParamType(i) == Args[i]->getType()) &&
+           "Calling a function with a bad signature!");
+#endif
+
+  std::copy(Args.begin(), Args.end(), op_begin());
+  setName(NameStr);
+}
+
+void CallInst::init(Value *Func, const Twine &NameStr) {
+  assert(NumOperands == 1 && "NumOperands not set up?");
+  Op<-1>() = Func;
+
+#ifndef NDEBUG
+  FunctionType *FTy =
+    cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
+
+  assert(FTy->getNumParams() == 0 && "Calling a function with bad signature");
+#endif
+
+  setName(NameStr);
+}
+
+CallInst::CallInst(Value *Func, const Twine &Name,
+                   Instruction *InsertBefore)
+  : Instruction(cast<FunctionType>(cast<PointerType>(Func->getType())
+                                   ->getElementType())->getReturnType(),
+                Instruction::Call,
+                OperandTraits<CallInst>::op_end(this) - 1,
+                1, InsertBefore) {
+  init(Func, Name);
+}
+
+CallInst::CallInst(Value *Func, const Twine &Name,
+                   BasicBlock *InsertAtEnd)
+  : Instruction(cast<FunctionType>(cast<PointerType>(Func->getType())
+                                   ->getElementType())->getReturnType(),
+                Instruction::Call,
+                OperandTraits<CallInst>::op_end(this) - 1,
+                1, InsertAtEnd) {
+  init(Func, Name);
+}
+
+CallInst::CallInst(const CallInst &CI)
+  : Instruction(CI.getType(), Instruction::Call,
+                OperandTraits<CallInst>::op_end(this) - CI.getNumOperands(),
+                CI.getNumOperands()) {
+  setAttributes(CI.getAttributes());
+  setTailCall(CI.isTailCall());
+  setCallingConv(CI.getCallingConv());
+    
+  std::copy(CI.op_begin(), CI.op_end(), op_begin());
+  SubclassOptionalData = CI.SubclassOptionalData;
+}
+
+void CallInst::addAttribute(unsigned i, Attribute::AttrKind attr) {
+  AttributeSet PAL = getAttributes();
+  PAL = PAL.addAttribute(getContext(), i, attr);
+  setAttributes(PAL);
+}
+
+void CallInst::removeAttribute(unsigned i, Attribute attr) {
+  AttributeSet PAL = getAttributes();
+  AttrBuilder B(attr);
+  LLVMContext &Context = getContext();
+  PAL = PAL.removeAttributes(Context, i,
+                             AttributeSet::get(Context, i, B));
+  setAttributes(PAL);
+}
+
+bool CallInst::hasFnAttr(Attribute::AttrKind A) const {
+  if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A))
+    return true;
+  if (const Function *F = getCalledFunction())
+    return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, A);
+  return false;
+}
+
+bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const {
+  if (AttributeList.hasAttribute(i, A))
+    return true;
+  if (const Function *F = getCalledFunction())
+    return F->getAttributes().hasAttribute(i, A);
+  return false;
+}
+
+/// IsConstantOne - Return true only if val is constant int 1
+static bool IsConstantOne(Value *val) {
+  assert(val && "IsConstantOne does not work with NULL val");
+  return isa<ConstantInt>(val) && cast<ConstantInt>(val)->isOne();
+}
+
+static Instruction *createMalloc(Instruction *InsertBefore,
+                                 BasicBlock *InsertAtEnd, Type *IntPtrTy,
+                                 Type *AllocTy, Value *AllocSize, 
+                                 Value *ArraySize, Function *MallocF,
+                                 const Twine &Name) {
+  assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) &&
+         "createMalloc needs either InsertBefore or InsertAtEnd");
+
+  // malloc(type) becomes: 
+  //       bitcast (i8* malloc(typeSize)) to type*
+  // malloc(type, arraySize) becomes:
+  //       bitcast (i8 *malloc(typeSize*arraySize)) to type*
+  if (!ArraySize)
+    ArraySize = ConstantInt::get(IntPtrTy, 1);
+  else if (ArraySize->getType() != IntPtrTy) {
+    if (InsertBefore)
+      ArraySize = CastInst::CreateIntegerCast(ArraySize, IntPtrTy, false,
+                                              "", InsertBefore);
+    else
+      ArraySize = CastInst::CreateIntegerCast(ArraySize, IntPtrTy, false,
+                                              "", InsertAtEnd);
+  }
+
+  if (!IsConstantOne(ArraySize)) {
+    if (IsConstantOne(AllocSize)) {
+      AllocSize = ArraySize;         // Operand * 1 = Operand
+    } else if (Constant *CO = dyn_cast<Constant>(ArraySize)) {
+      Constant *Scale = ConstantExpr::getIntegerCast(CO, IntPtrTy,
+                                                     false /*ZExt*/);
+      // Malloc arg is constant product of type size and array size
+      AllocSize = ConstantExpr::getMul(Scale, cast<Constant>(AllocSize));
+    } else {
+      // Multiply type size by the array size...
+      if (InsertBefore)
+        AllocSize = BinaryOperator::CreateMul(ArraySize, AllocSize,
+                                              "mallocsize", InsertBefore);
+      else
+        AllocSize = BinaryOperator::CreateMul(ArraySize, AllocSize,
+                                              "mallocsize", InsertAtEnd);
+    }
+  }
+
+  assert(AllocSize->getType() == IntPtrTy && "malloc arg is wrong size");
+  // Create the call to Malloc.
+  BasicBlock* BB = InsertBefore ? InsertBefore->getParent() : InsertAtEnd;
+  Module* M = BB->getParent()->getParent();
+  Type *BPTy = Type::getInt8PtrTy(BB->getContext());
+  Value *MallocFunc = MallocF;
+  if (!MallocFunc)
+    // prototype malloc as "void *malloc(size_t)"
+    MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy, NULL);
+  PointerType *AllocPtrType = PointerType::getUnqual(AllocTy);
+  CallInst *MCall = NULL;
+  Instruction *Result = NULL;
+  if (InsertBefore) {
+    MCall = CallInst::Create(MallocFunc, AllocSize, "malloccall", InsertBefore);
+    Result = MCall;
+    if (Result->getType() != AllocPtrType)
+      // Create a cast instruction to convert to the right type...
+      Result = new BitCastInst(MCall, AllocPtrType, Name, InsertBefore);
+  } else {
+    MCall = CallInst::Create(MallocFunc, AllocSize, "malloccall");
+    Result = MCall;
+    if (Result->getType() != AllocPtrType) {
+      InsertAtEnd->getInstList().push_back(MCall);
+      // Create a cast instruction to convert to the right type...
+      Result = new BitCastInst(MCall, AllocPtrType, Name);
+    }
+  }
+  MCall->setTailCall();
+  if (Function *F = dyn_cast<Function>(MallocFunc)) {
+    MCall->setCallingConv(F->getCallingConv());
+    if (!F->doesNotAlias(0)) F->setDoesNotAlias(0);
+  }
+  assert(!MCall->getType()->isVoidTy() && "Malloc has void return type");
+
+  return Result;
+}
+
+/// CreateMalloc - Generate the IR for a call to malloc:
+/// 1. Compute the malloc call's argument as the specified type's size,
+///    possibly multiplied by the array size if the array size is not
+///    constant 1.
+/// 2. Call malloc with that argument.
+/// 3. Bitcast the result of the malloc call to the specified type.
+Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
+                                    Type *IntPtrTy, Type *AllocTy,
+                                    Value *AllocSize, Value *ArraySize,
+                                    Function * MallocF,
+                                    const Twine &Name) {
+  return createMalloc(InsertBefore, NULL, IntPtrTy, AllocTy, AllocSize,
+                      ArraySize, MallocF, Name);
+}
+
+/// CreateMalloc - Generate the IR for a call to malloc:
+/// 1. Compute the malloc call's argument as the specified type's size,
+///    possibly multiplied by the array size if the array size is not
+///    constant 1.
+/// 2. Call malloc with that argument.
+/// 3. Bitcast the result of the malloc call to the specified type.
+/// Note: This function does not add the bitcast to the basic block, that is the
+/// responsibility of the caller.
+Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd,
+                                    Type *IntPtrTy, Type *AllocTy,
+                                    Value *AllocSize, Value *ArraySize, 
+                                    Function *MallocF, const Twine &Name) {
+  return createMalloc(NULL, InsertAtEnd, IntPtrTy, AllocTy, AllocSize,
+                      ArraySize, MallocF, Name);
+}
+
+static Instruction* createFree(Value* Source, Instruction *InsertBefore,
+                               BasicBlock *InsertAtEnd) {
+  assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) &&
+         "createFree needs either InsertBefore or InsertAtEnd");
+  assert(Source->getType()->isPointerTy() &&
+         "Can not free something of nonpointer type!");
+
+  BasicBlock* BB = InsertBefore ? InsertBefore->getParent() : InsertAtEnd;
+  Module* M = BB->getParent()->getParent();
+
+  Type *VoidTy = Type::getVoidTy(M->getContext());
+  Type *IntPtrTy = Type::getInt8PtrTy(M->getContext());
+  // prototype free as "void free(void*)"
+  Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy, NULL);
+  CallInst* Result = NULL;
+  Value *PtrCast = Source;
+  if (InsertBefore) {
+    if (Source->getType() != IntPtrTy)
+      PtrCast = new BitCastInst(Source, IntPtrTy, "", InsertBefore);
+    Result = CallInst::Create(FreeFunc, PtrCast, "", InsertBefore);
+  } else {
+    if (Source->getType() != IntPtrTy)
+      PtrCast = new BitCastInst(Source, IntPtrTy, "", InsertAtEnd);
+    Result = CallInst::Create(FreeFunc, PtrCast, "");
+  }
+  Result->setTailCall();
+  if (Function *F = dyn_cast<Function>(FreeFunc))
+    Result->setCallingConv(F->getCallingConv());
+
+  return Result;
+}
+
+/// CreateFree - Generate the IR for a call to the builtin free function.
+Instruction * CallInst::CreateFree(Value* Source, Instruction *InsertBefore) {
+  return createFree(Source, InsertBefore, NULL);
+}
+
+/// CreateFree - Generate the IR for a call to the builtin free function.
+/// Note: This function does not add the call to the basic block, that is the
+/// responsibility of the caller.
+Instruction* CallInst::CreateFree(Value* Source, BasicBlock *InsertAtEnd) {
+  Instruction* FreeCall = createFree(Source, NULL, InsertAtEnd);
+  assert(FreeCall && "CreateFree did not create a CallInst");
+  return FreeCall;
+}
+
+//===----------------------------------------------------------------------===//
+//                        InvokeInst Implementation
+//===----------------------------------------------------------------------===//
+
+void InvokeInst::init(Value *Fn, BasicBlock *IfNormal, BasicBlock *IfException,
+                      ArrayRef<Value *> Args, const Twine &NameStr) {
+  assert(NumOperands == 3 + Args.size() && "NumOperands not set up?");
+  Op<-3>() = Fn;
+  Op<-2>() = IfNormal;
+  Op<-1>() = IfException;
+
+#ifndef NDEBUG
+  FunctionType *FTy =
+    cast<FunctionType>(cast<PointerType>(Fn->getType())->getElementType());
+
+  assert(((Args.size() == FTy->getNumParams()) ||
+          (FTy->isVarArg() && Args.size() > FTy->getNumParams())) &&
+         "Invoking a function with bad signature");
+
+  for (unsigned i = 0, e = Args.size(); i != e; i++)
+    assert((i >= FTy->getNumParams() || 
+            FTy->getParamType(i) == Args[i]->getType()) &&
+           "Invoking a function with a bad signature!");
+#endif
+
+  std::copy(Args.begin(), Args.end(), op_begin());
+  setName(NameStr);
+}
+
+InvokeInst::InvokeInst(const InvokeInst &II)
+  : TerminatorInst(II.getType(), Instruction::Invoke,
+                   OperandTraits<InvokeInst>::op_end(this)
+                   - II.getNumOperands(),
+                   II.getNumOperands()) {
+  setAttributes(II.getAttributes());
+  setCallingConv(II.getCallingConv());
+  std::copy(II.op_begin(), II.op_end(), op_begin());
+  SubclassOptionalData = II.SubclassOptionalData;
+}
+
+BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const {
+  return getSuccessor(idx);
+}
+unsigned InvokeInst::getNumSuccessorsV() const {
+  return getNumSuccessors();
+}
+void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
+  return setSuccessor(idx, B);
+}
+
+bool InvokeInst::hasFnAttr(Attribute::AttrKind A) const {
+  if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A))
+    return true;
+  if (const Function *F = getCalledFunction())
+    return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, A);
+  return false;
+}
+
+bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind A) const {
+  if (AttributeList.hasAttribute(i, A))
+    return true;
+  if (const Function *F = getCalledFunction())
+    return F->getAttributes().hasAttribute(i, A);
+  return false;
+}
+
+void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind attr) {
+  AttributeSet PAL = getAttributes();
+  PAL = PAL.addAttribute(getContext(), i, attr);
+  setAttributes(PAL);
+}
+
+void InvokeInst::removeAttribute(unsigned i, Attribute attr) {
+  AttributeSet PAL = getAttributes();
+  AttrBuilder B(attr);
+  PAL = PAL.removeAttributes(getContext(), i,
+                             AttributeSet::get(getContext(), i, B));
+  setAttributes(PAL);
+}
+
+LandingPadInst *InvokeInst::getLandingPadInst() const {
+  return cast<LandingPadInst>(getUnwindDest()->getFirstNonPHI());
+}
+
+//===----------------------------------------------------------------------===//
+//                        ReturnInst Implementation
+//===----------------------------------------------------------------------===//
+
+ReturnInst::ReturnInst(const ReturnInst &RI)
+  : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Ret,
+                   OperandTraits<ReturnInst>::op_end(this) -
+                     RI.getNumOperands(),
+                   RI.getNumOperands()) {
+  if (RI.getNumOperands())
+    Op<0>() = RI.Op<0>();
+  SubclassOptionalData = RI.SubclassOptionalData;
+}
+
+ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore)
+  : TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
+                   OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+                   InsertBefore) {
+  if (retVal)
+    Op<0>() = retVal;
+}
+ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
+  : TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
+                   OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+                   InsertAtEnd) {
+  if (retVal)
+    Op<0>() = retVal;
+}
+ReturnInst::ReturnInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
+  : TerminatorInst(Type::getVoidTy(Context), Instruction::Ret,
+                   OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {
+}
+
+unsigned ReturnInst::getNumSuccessorsV() const {
+  return getNumSuccessors();
+}
+
+/// Out-of-line ReturnInst method, put here so the C++ compiler can choose to
+/// emit the vtable for the class in this translation unit.
+void ReturnInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
+  llvm_unreachable("ReturnInst has no successors!");
+}
+
+BasicBlock *ReturnInst::getSuccessorV(unsigned idx) const {
+  llvm_unreachable("ReturnInst has no successors!");
+}
+
+ReturnInst::~ReturnInst() {
+}
+
+//===----------------------------------------------------------------------===//
+//                        ResumeInst Implementation
+//===----------------------------------------------------------------------===//
+
+ResumeInst::ResumeInst(const ResumeInst &RI)
+  : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Resume,
+                   OperandTraits<ResumeInst>::op_begin(this), 1) {
+  Op<0>() = RI.Op<0>();
+}
+
+ResumeInst::ResumeInst(Value *Exn, Instruction *InsertBefore)
+  : TerminatorInst(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
+                   OperandTraits<ResumeInst>::op_begin(this), 1, InsertBefore) {
+  Op<0>() = Exn;
+}
+
+ResumeInst::ResumeInst(Value *Exn, BasicBlock *InsertAtEnd)
+  : TerminatorInst(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
+                   OperandTraits<ResumeInst>::op_begin(this), 1, InsertAtEnd) {
+  Op<0>() = Exn;
+}
+
+unsigned ResumeInst::getNumSuccessorsV() const {
+  return getNumSuccessors();
+}
+
+void ResumeInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
+  llvm_unreachable("ResumeInst has no successors!");
+}
+
+BasicBlock *ResumeInst::getSuccessorV(unsigned idx) const {
+  llvm_unreachable("ResumeInst has no successors!");
+}
+
+//===----------------------------------------------------------------------===//
+//                      UnreachableInst Implementation
+//===----------------------------------------------------------------------===//
+
+UnreachableInst::UnreachableInst(LLVMContext &Context, 
+                                 Instruction *InsertBefore)
+  : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
+                   0, 0, InsertBefore) {
+}
+UnreachableInst::UnreachableInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
+  : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
+                   0, 0, InsertAtEnd) {
+}
+
+unsigned UnreachableInst::getNumSuccessorsV() const {
+  return getNumSuccessors();
+}
+
+void UnreachableInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
+  llvm_unreachable("UnreachableInst has no successors!");
+}
+
+BasicBlock *UnreachableInst::getSuccessorV(unsigned idx) const {
+  llvm_unreachable("UnreachableInst has no successors!");
+}
+
+//===----------------------------------------------------------------------===//
+//                        BranchInst Implementation
+//===----------------------------------------------------------------------===//
+
+void BranchInst::AssertOK() {
+  if (isConditional())
+    assert(getCondition()->getType()->isIntegerTy(1) &&
+           "May only branch on boolean predicates!");
+}
+
+BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore)
+  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                   OperandTraits<BranchInst>::op_end(this) - 1,
+                   1, InsertBefore) {
+  assert(IfTrue != 0 && "Branch destination may not be null!");
+  Op<-1>() = IfTrue;
+}
+BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
+                       Instruction *InsertBefore)
+  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                   OperandTraits<BranchInst>::op_end(this) - 3,
+                   3, InsertBefore) {
+  Op<-1>() = IfTrue;
+  Op<-2>() = IfFalse;
+  Op<-3>() = Cond;
+#ifndef NDEBUG
+  AssertOK();
+#endif
+}
+
+BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *InsertAtEnd)
+  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                   OperandTraits<BranchInst>::op_end(this) - 1,
+                   1, InsertAtEnd) {
+  assert(IfTrue != 0 && "Branch destination may not be null!");
+  Op<-1>() = IfTrue;
+}
+
+BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
+           BasicBlock *InsertAtEnd)
+  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                   OperandTraits<BranchInst>::op_end(this) - 3,
+                   3, InsertAtEnd) {
+  Op<-1>() = IfTrue;
+  Op<-2>() = IfFalse;
+  Op<-3>() = Cond;
+#ifndef NDEBUG
+  AssertOK();
+#endif
+}
+
+
+BranchInst::BranchInst(const BranchInst &BI) :
+  TerminatorInst(Type::getVoidTy(BI.getContext()), Instruction::Br,
+                 OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
+                 BI.getNumOperands()) {
+  Op<-1>() = BI.Op<-1>();
+  if (BI.getNumOperands() != 1) {
+    assert(BI.getNumOperands() == 3 && "BR can have 1 or 3 operands!");
+    Op<-3>() = BI.Op<-3>();
+    Op<-2>() = BI.Op<-2>();
+  }
+  SubclassOptionalData = BI.SubclassOptionalData;
+}
+
+void BranchInst::swapSuccessors() {
+  assert(isConditional() &&
+         "Cannot swap successors of an unconditional branch");
+  Op<-1>().swap(Op<-2>());
+
+  // Update profile metadata if present and it matches our structural
+  // expectations.
+  MDNode *ProfileData = getMetadata(LLVMContext::MD_prof);
+  if (!ProfileData || ProfileData->getNumOperands() != 3)
+    return;
+
+  // The first operand is the name. Fetch them backwards and build a new one.
+  Value *Ops[] = {
+    ProfileData->getOperand(0),
+    ProfileData->getOperand(2),
+    ProfileData->getOperand(1)
+  };
+  setMetadata(LLVMContext::MD_prof,
+              MDNode::get(ProfileData->getContext(), Ops));
+}
+
+BasicBlock *BranchInst::getSuccessorV(unsigned idx) const {
+  return getSuccessor(idx);
+}
+unsigned BranchInst::getNumSuccessorsV() const {
+  return getNumSuccessors();
+}
+void BranchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
+  setSuccessor(idx, B);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                        AllocaInst Implementation
+//===----------------------------------------------------------------------===//
+
+static Value *getAISize(LLVMContext &Context, Value *Amt) {
+  if (!Amt)
+    Amt = ConstantInt::get(Type::getInt32Ty(Context), 1);
+  else {
+    assert(!isa<BasicBlock>(Amt) &&
+           "Passed basic block into allocation size parameter! Use other ctor");
+    assert(Amt->getType()->isIntegerTy() &&
+           "Allocation array size is not an integer!");
+  }
+  return Amt;
+}
+
+AllocaInst::AllocaInst(Type *Ty, Value *ArraySize,
+                       const Twine &Name, Instruction *InsertBefore)
+  : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
+                     getAISize(Ty->getContext(), ArraySize), InsertBefore) {
+  setAlignment(0);
+  assert(!Ty->isVoidTy() && "Cannot allocate void!");
+  setName(Name);
+}
+
+AllocaInst::AllocaInst(Type *Ty, Value *ArraySize,
+                       const Twine &Name, BasicBlock *InsertAtEnd)
+  : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
+                     getAISize(Ty->getContext(), ArraySize), InsertAtEnd) {
+  setAlignment(0);
+  assert(!Ty->isVoidTy() && "Cannot allocate void!");
+  setName(Name);
+}
+
+AllocaInst::AllocaInst(Type *Ty, const Twine &Name,
+                       Instruction *InsertBefore)
+  : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
+                     getAISize(Ty->getContext(), 0), InsertBefore) {
+  setAlignment(0);
+  assert(!Ty->isVoidTy() && "Cannot allocate void!");
+  setName(Name);
+}
+
+AllocaInst::AllocaInst(Type *Ty, const Twine &Name,
+                       BasicBlock *InsertAtEnd)
+  : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
+                     getAISize(Ty->getContext(), 0), InsertAtEnd) {
+  setAlignment(0);
+  assert(!Ty->isVoidTy() && "Cannot allocate void!");
+  setName(Name);
+}
+
+AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
+                       const Twine &Name, Instruction *InsertBefore)
+  : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
+                     getAISize(Ty->getContext(), ArraySize), InsertBefore) {
+  setAlignment(Align);
+  assert(!Ty->isVoidTy() && "Cannot allocate void!");
+  setName(Name);
+}
+
+AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
+                       const Twine &Name, BasicBlock *InsertAtEnd)
+  : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
+                     getAISize(Ty->getContext(), ArraySize), InsertAtEnd) {
+  setAlignment(Align);
+  assert(!Ty->isVoidTy() && "Cannot allocate void!");
+  setName(Name);
+}
+
+// Out of line virtual method, so the vtable, etc has a home.
+AllocaInst::~AllocaInst() {
+}
+
+void AllocaInst::setAlignment(unsigned Align) {
+  assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
+  assert(Align <= MaximumAlignment &&
+         "Alignment is greater than MaximumAlignment!");
+  setInstructionSubclassData(Log2_32(Align) + 1);
+  assert(getAlignment() == Align && "Alignment representation error!");
+}
+
+bool AllocaInst::isArrayAllocation() const {
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(getOperand(0)))
+    return !CI->isOne();
+  return true;
+}
+
+Type *AllocaInst::getAllocatedType() const {
+  return getType()->getElementType();
+}
+
+/// isStaticAlloca - Return true if this alloca is in the entry block of the
+/// function and is a constant size.  If so, the code generator will fold it
+/// into the prolog/epilog code, so it is basically free.
+bool AllocaInst::isStaticAlloca() const {
+  // Must be constant size.
+  if (!isa<ConstantInt>(getArraySize())) return false;
+  
+  // Must be in the entry block.
+  const BasicBlock *Parent = getParent();
+  return Parent == &Parent->getParent()->front();
+}
+
+//===----------------------------------------------------------------------===//
+//                           LoadInst Implementation
+//===----------------------------------------------------------------------===//
+
+void LoadInst::AssertOK() {
+  assert(getOperand(0)->getType()->isPointerTy() &&
+         "Ptr must have pointer type.");
+  assert(!(isAtomic() && getAlignment() == 0) &&
+         "Alignment required for atomic load");
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, Instruction *InsertBef)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertBef) {
+  setVolatile(false);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+  setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, BasicBlock *InsertAE)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertAE) {
+  setVolatile(false);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+  setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
+                   Instruction *InsertBef)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertBef) {
+  setVolatile(isVolatile);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+  setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
+                   BasicBlock *InsertAE)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertAE) {
+  setVolatile(isVolatile);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+  setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile, 
+                   unsigned Align, Instruction *InsertBef)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertBef) {
+  setVolatile(isVolatile);
+  setAlignment(Align);
+  setAtomic(NotAtomic);
+  AssertOK();
+  setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile, 
+                   unsigned Align, BasicBlock *InsertAE)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertAE) {
+  setVolatile(isVolatile);
+  setAlignment(Align);
+  setAtomic(NotAtomic);
+  AssertOK();
+  setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile, 
+                   unsigned Align, AtomicOrdering Order,
+                   SynchronizationScope SynchScope,
+                   Instruction *InsertBef)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertBef) {
+  setVolatile(isVolatile);
+  setAlignment(Align);
+  setAtomic(Order, SynchScope);
+  AssertOK();
+  setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile, 
+                   unsigned Align, AtomicOrdering Order,
+                   SynchronizationScope SynchScope,
+                   BasicBlock *InsertAE)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertAE) {
+  setVolatile(isVolatile);
+  setAlignment(Align);
+  setAtomic(Order, SynchScope);
+  AssertOK();
+  setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const char *Name, Instruction *InsertBef)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertBef) {
+  setVolatile(false);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+  if (Name && Name[0]) setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const char *Name, BasicBlock *InsertAE)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertAE) {
+  setVolatile(false);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+  if (Name && Name[0]) setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const char *Name, bool isVolatile,
+                   Instruction *InsertBef)
+: UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                   Load, Ptr, InsertBef) {
+  setVolatile(isVolatile);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+  if (Name && Name[0]) setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const char *Name, bool isVolatile,
+                   BasicBlock *InsertAE)
+  : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+                     Load, Ptr, InsertAE) {
+  setVolatile(isVolatile);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+  if (Name && Name[0]) setName(Name);
+}
+
+void LoadInst::setAlignment(unsigned Align) {
+  assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
+  assert(Align <= MaximumAlignment &&
+         "Alignment is greater than MaximumAlignment!");
+  setInstructionSubclassData((getSubclassDataFromInstruction() & ~(31 << 1)) |
+                             ((Log2_32(Align)+1)<<1));
+  assert(getAlignment() == Align && "Alignment representation error!");
+}
+
+//===----------------------------------------------------------------------===//
+//                           StoreInst Implementation
+//===----------------------------------------------------------------------===//
+
+void StoreInst::AssertOK() {
+  assert(getOperand(0) && getOperand(1) && "Both operands must be non-null!");
+  assert(getOperand(1)->getType()->isPointerTy() &&
+         "Ptr must have pointer type!");
+  assert(getOperand(0)->getType() ==
+                 cast<PointerType>(getOperand(1)->getType())->getElementType()
+         && "Ptr must be a pointer to Val type!");
+  assert(!(isAtomic() && getAlignment() == 0) &&
+         "Alignment required for atomic load");
+}
+
+
+StoreInst::StoreInst(Value *val, Value *addr, Instruction *InsertBefore)
+  : Instruction(Type::getVoidTy(val->getContext()), Store,
+                OperandTraits<StoreInst>::op_begin(this),
+                OperandTraits<StoreInst>::operands(this),
+                InsertBefore) {
+  Op<0>() = val;
+  Op<1>() = addr;
+  setVolatile(false);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, BasicBlock *InsertAtEnd)
+  : Instruction(Type::getVoidTy(val->getContext()), Store,
+                OperandTraits<StoreInst>::op_begin(this),
+                OperandTraits<StoreInst>::operands(this),
+                InsertAtEnd) {
+  Op<0>() = val;
+  Op<1>() = addr;
+  setVolatile(false);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+                     Instruction *InsertBefore)
+  : Instruction(Type::getVoidTy(val->getContext()), Store,
+                OperandTraits<StoreInst>::op_begin(this),
+                OperandTraits<StoreInst>::operands(this),
+                InsertBefore) {
+  Op<0>() = val;
+  Op<1>() = addr;
+  setVolatile(isVolatile);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+                     unsigned Align, Instruction *InsertBefore)
+  : Instruction(Type::getVoidTy(val->getContext()), Store,
+                OperandTraits<StoreInst>::op_begin(this),
+                OperandTraits<StoreInst>::operands(this),
+                InsertBefore) {
+  Op<0>() = val;
+  Op<1>() = addr;
+  setVolatile(isVolatile);
+  setAlignment(Align);
+  setAtomic(NotAtomic);
+  AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+                     unsigned Align, AtomicOrdering Order,
+                     SynchronizationScope SynchScope,
+                     Instruction *InsertBefore)
+  : Instruction(Type::getVoidTy(val->getContext()), Store,
+                OperandTraits<StoreInst>::op_begin(this),
+                OperandTraits<StoreInst>::operands(this),
+                InsertBefore) {
+  Op<0>() = val;
+  Op<1>() = addr;
+  setVolatile(isVolatile);
+  setAlignment(Align);
+  setAtomic(Order, SynchScope);
+  AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+                     BasicBlock *InsertAtEnd)
+  : Instruction(Type::getVoidTy(val->getContext()), Store,
+                OperandTraits<StoreInst>::op_begin(this),
+                OperandTraits<StoreInst>::operands(this),
+                InsertAtEnd) {
+  Op<0>() = val;
+  Op<1>() = addr;
+  setVolatile(isVolatile);
+  setAlignment(0);
+  setAtomic(NotAtomic);
+  AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+                     unsigned Align, BasicBlock *InsertAtEnd)
+  : Instruction(Type::getVoidTy(val->getContext()), Store,
+                OperandTraits<StoreInst>::op_begin(this),
+                OperandTraits<StoreInst>::operands(this),
+                InsertAtEnd) {
+  Op<0>() = val;
+  Op<1>() = addr;
+  setVolatile(isVolatile);
+  setAlignment(Align);
+  setAtomic(NotAtomic);
+  AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+                     unsigned Align, AtomicOrdering Order,
+                     SynchronizationScope SynchScope,
+                     BasicBlock *InsertAtEnd)
+  : Instruction(Type::getVoidTy(val->getContext()), Store,
+                OperandTraits<StoreInst>::op_begin(this),
+                OperandTraits<StoreInst>::operands(this),
+                InsertAtEnd) {
+  Op<0>() = val;
+  Op<1>() = addr;
+  setVolatile(isVolatile);
+  setAlignment(Align);
+  setAtomic(Order, SynchScope);
+  AssertOK();
+}
+
+void StoreInst::setAlignment(unsigned Align) {
+  assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
+  assert(Align <= MaximumAlignment &&
+         "Alignment is greater than MaximumAlignment!");
+  setInstructionSubclassData((getSubclassDataFromInstruction() & ~(31 << 1)) |
+                             ((Log2_32(Align)+1) << 1));
+  assert(getAlignment() == Align && "Alignment representation error!");
+}
+
+//===----------------------------------------------------------------------===//
+//                       AtomicCmpXchgInst Implementation
+//===----------------------------------------------------------------------===//
+
+void AtomicCmpXchgInst::Init(Value *Ptr, Value *Cmp, Value *NewVal,
+                             AtomicOrdering Ordering,
+                             SynchronizationScope SynchScope) {
+  Op<0>() = Ptr;
+  Op<1>() = Cmp;
+  Op<2>() = NewVal;
+  setOrdering(Ordering);
+  setSynchScope(SynchScope);
+
+  assert(getOperand(0) && getOperand(1) && getOperand(2) &&
+         "All operands must be non-null!");
+  assert(getOperand(0)->getType()->isPointerTy() &&
+         "Ptr must have pointer type!");
+  assert(getOperand(1)->getType() ==
+                 cast<PointerType>(getOperand(0)->getType())->getElementType()
+         && "Ptr must be a pointer to Cmp type!");
+  assert(getOperand(2)->getType() ==
+                 cast<PointerType>(getOperand(0)->getType())->getElementType()
+         && "Ptr must be a pointer to NewVal type!");
+  assert(Ordering != NotAtomic &&
+         "AtomicCmpXchg instructions must be atomic!");
+}
+
+AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
+                                     AtomicOrdering Ordering,
+                                     SynchronizationScope SynchScope,
+                                     Instruction *InsertBefore)
+  : Instruction(Cmp->getType(), AtomicCmpXchg,
+                OperandTraits<AtomicCmpXchgInst>::op_begin(this),
+                OperandTraits<AtomicCmpXchgInst>::operands(this),
+                InsertBefore) {
+  Init(Ptr, Cmp, NewVal, Ordering, SynchScope);
+}
+
+AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
+                                     AtomicOrdering Ordering,
+                                     SynchronizationScope SynchScope,
+                                     BasicBlock *InsertAtEnd)
+  : Instruction(Cmp->getType(), AtomicCmpXchg,
+                OperandTraits<AtomicCmpXchgInst>::op_begin(this),
+                OperandTraits<AtomicCmpXchgInst>::operands(this),
+                InsertAtEnd) {
+  Init(Ptr, Cmp, NewVal, Ordering, SynchScope);
+}
+ 
+//===----------------------------------------------------------------------===//
+//                       AtomicRMWInst Implementation
+//===----------------------------------------------------------------------===//
+
+void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
+                         AtomicOrdering Ordering,
+                         SynchronizationScope SynchScope) {
+  Op<0>() = Ptr;
+  Op<1>() = Val;
+  setOperation(Operation);
+  setOrdering(Ordering);
+  setSynchScope(SynchScope);
+
+  assert(getOperand(0) && getOperand(1) &&
+         "All operands must be non-null!");
+  assert(getOperand(0)->getType()->isPointerTy() &&
+         "Ptr must have pointer type!");
+  assert(getOperand(1)->getType() ==
+         cast<PointerType>(getOperand(0)->getType())->getElementType()
+         && "Ptr must be a pointer to Val type!");
+  assert(Ordering != NotAtomic &&
+         "AtomicRMW instructions must be atomic!");
+}
+
+AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
+                             AtomicOrdering Ordering,
+                             SynchronizationScope SynchScope,
+                             Instruction *InsertBefore)
+  : Instruction(Val->getType(), AtomicRMW,
+                OperandTraits<AtomicRMWInst>::op_begin(this),
+                OperandTraits<AtomicRMWInst>::operands(this),
+                InsertBefore) {
+  Init(Operation, Ptr, Val, Ordering, SynchScope);
+}
+
+AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
+                             AtomicOrdering Ordering,
+                             SynchronizationScope SynchScope,
+                             BasicBlock *InsertAtEnd)
+  : Instruction(Val->getType(), AtomicRMW,
+                OperandTraits<AtomicRMWInst>::op_begin(this),
+                OperandTraits<AtomicRMWInst>::operands(this),
+                InsertAtEnd) {
+  Init(Operation, Ptr, Val, Ordering, SynchScope);
+}
+
+//===----------------------------------------------------------------------===//
+//                       FenceInst Implementation
+//===----------------------------------------------------------------------===//
+
+FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, 
+                     SynchronizationScope SynchScope,
+                     Instruction *InsertBefore)
+  : Instruction(Type::getVoidTy(C), Fence, 0, 0, InsertBefore) {
+  setOrdering(Ordering);
+  setSynchScope(SynchScope);
+}
+
+FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, 
+                     SynchronizationScope SynchScope,
+                     BasicBlock *InsertAtEnd)
+  : Instruction(Type::getVoidTy(C), Fence, 0, 0, InsertAtEnd) {
+  setOrdering(Ordering);
+  setSynchScope(SynchScope);
+}
+
+//===----------------------------------------------------------------------===//
+//                       GetElementPtrInst Implementation
+//===----------------------------------------------------------------------===//
+
+void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList,
+                             const Twine &Name) {
+  assert(NumOperands == 1 + IdxList.size() && "NumOperands not initialized?");
+  OperandList[0] = Ptr;
+  std::copy(IdxList.begin(), IdxList.end(), op_begin() + 1);
+  setName(Name);
+}
+
+GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI)
+  : Instruction(GEPI.getType(), GetElementPtr,
+                OperandTraits<GetElementPtrInst>::op_end(this)
+                - GEPI.getNumOperands(),
+                GEPI.getNumOperands()) {
+  std::copy(GEPI.op_begin(), GEPI.op_end(), op_begin());
+  SubclassOptionalData = GEPI.SubclassOptionalData;
+}
+
+/// getIndexedType - Returns the type of the element that would be accessed with
+/// a gep instruction with the specified parameters.
+///
+/// The Idxs pointer should point to a continuous piece of memory containing the
+/// indices, either as Value* or uint64_t.
+///
+/// A null type is returned if the indices are invalid for the specified
+/// pointer type.
+///
+template <typename IndexTy>
+static Type *getIndexedTypeInternal(Type *Ptr, ArrayRef<IndexTy> IdxList) {
+  PointerType *PTy = dyn_cast<PointerType>(Ptr->getScalarType());
+  if (!PTy) return 0;   // Type isn't a pointer type!
+  Type *Agg = PTy->getElementType();
+
+  // Handle the special case of the empty set index set, which is always valid.
+  if (IdxList.empty())
+    return Agg;
+
+  // If there is at least one index, the top level type must be sized, otherwise
+  // it cannot be 'stepped over'.
+  if (!Agg->isSized())
+    return 0;
+
+  unsigned CurIdx = 1;
+  for (; CurIdx != IdxList.size(); ++CurIdx) {
+    CompositeType *CT = dyn_cast<CompositeType>(Agg);
+    if (!CT || CT->isPointerTy()) return 0;
+    IndexTy Index = IdxList[CurIdx];
+    if (!CT->indexValid(Index)) return 0;
+    Agg = CT->getTypeAtIndex(Index);
+  }
+  return CurIdx == IdxList.size() ? Agg : 0;
+}
+
+Type *GetElementPtrInst::getIndexedType(Type *Ptr, ArrayRef<Value *> IdxList) {
+  return getIndexedTypeInternal(Ptr, IdxList);
+}
+
+Type *GetElementPtrInst::getIndexedType(Type *Ptr,
+                                        ArrayRef<Constant *> IdxList) {
+  return getIndexedTypeInternal(Ptr, IdxList);
+}
+
+Type *GetElementPtrInst::getIndexedType(Type *Ptr, ArrayRef<uint64_t> IdxList) {
+  return getIndexedTypeInternal(Ptr, IdxList);
+}
+
+/// hasAllZeroIndices - Return true if all of the indices of this GEP are
+/// zeros.  If so, the result pointer and the first operand have the same
+/// value, just potentially different types.
+bool GetElementPtrInst::hasAllZeroIndices() const {
+  for (unsigned i = 1, e = getNumOperands(); i != e; ++i) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(getOperand(i))) {
+      if (!CI->isZero()) return false;
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+/// hasAllConstantIndices - Return true if all of the indices of this GEP are
+/// constant integers.  If so, the result pointer and the first operand have
+/// a constant offset between them.
+bool GetElementPtrInst::hasAllConstantIndices() const {
+  for (unsigned i = 1, e = getNumOperands(); i != e; ++i) {
+    if (!isa<ConstantInt>(getOperand(i)))
+      return false;
+  }
+  return true;
+}
+
+void GetElementPtrInst::setIsInBounds(bool B) {
+  cast<GEPOperator>(this)->setIsInBounds(B);
+}
+
+bool GetElementPtrInst::isInBounds() const {
+  return cast<GEPOperator>(this)->isInBounds();
+}
+
+bool GetElementPtrInst::accumulateConstantOffset(const DataLayout &DL,
+                                                 APInt &Offset) const {
+  // Delegate to the generic GEPOperator implementation.
+  return cast<GEPOperator>(this)->accumulateConstantOffset(DL, Offset);
+}
+
+//===----------------------------------------------------------------------===//
+//                           ExtractElementInst Implementation
+//===----------------------------------------------------------------------===//
+
+ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
+                                       const Twine &Name,
+                                       Instruction *InsertBef)
+  : Instruction(cast<VectorType>(Val->getType())->getElementType(),
+                ExtractElement,
+                OperandTraits<ExtractElementInst>::op_begin(this),
+                2, InsertBef) {
+  assert(isValidOperands(Val, Index) &&
+         "Invalid extractelement instruction operands!");
+  Op<0>() = Val;
+  Op<1>() = Index;
+  setName(Name);
+}
+
+ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
+                                       const Twine &Name,
+                                       BasicBlock *InsertAE)
+  : Instruction(cast<VectorType>(Val->getType())->getElementType(),
+                ExtractElement,
+                OperandTraits<ExtractElementInst>::op_begin(this),
+                2, InsertAE) {
+  assert(isValidOperands(Val, Index) &&
+         "Invalid extractelement instruction operands!");
+
+  Op<0>() = Val;
+  Op<1>() = Index;
+  setName(Name);
+}
+
+
+bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
+  if (!Val->getType()->isVectorTy() || !Index->getType()->isIntegerTy(32))
+    return false;
+  return true;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                           InsertElementInst Implementation
+//===----------------------------------------------------------------------===//
+
+InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index,
+                                     const Twine &Name,
+                                     Instruction *InsertBef)
+  : Instruction(Vec->getType(), InsertElement,
+                OperandTraits<InsertElementInst>::op_begin(this),
+                3, InsertBef) {
+  assert(isValidOperands(Vec, Elt, Index) &&
+         "Invalid insertelement instruction operands!");
+  Op<0>() = Vec;
+  Op<1>() = Elt;
+  Op<2>() = Index;
+  setName(Name);
+}
+
+InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index,
+                                     const Twine &Name,
+                                     BasicBlock *InsertAE)
+  : Instruction(Vec->getType(), InsertElement,
+                OperandTraits<InsertElementInst>::op_begin(this),
+                3, InsertAE) {
+  assert(isValidOperands(Vec, Elt, Index) &&
+         "Invalid insertelement instruction operands!");
+
+  Op<0>() = Vec;
+  Op<1>() = Elt;
+  Op<2>() = Index;
+  setName(Name);
+}
+
+bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt, 
+                                        const Value *Index) {
+  if (!Vec->getType()->isVectorTy())
+    return false;   // First operand of insertelement must be vector type.
+  
+  if (Elt->getType() != cast<VectorType>(Vec->getType())->getElementType())
+    return false;// Second operand of insertelement must be vector element type.
+    
+  if (!Index->getType()->isIntegerTy(32))
+    return false;  // Third operand of insertelement must be i32.
+  return true;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                      ShuffleVectorInst Implementation
+//===----------------------------------------------------------------------===//
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
+                                     const Twine &Name,
+                                     Instruction *InsertBefore)
+: Instruction(VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
+                cast<VectorType>(Mask->getType())->getNumElements()),
+              ShuffleVector,
+              OperandTraits<ShuffleVectorInst>::op_begin(this),
+              OperandTraits<ShuffleVectorInst>::operands(this),
+              InsertBefore) {
+  assert(isValidOperands(V1, V2, Mask) &&
+         "Invalid shuffle vector instruction operands!");
+  Op<0>() = V1;
+  Op<1>() = V2;
+  Op<2>() = Mask;
+  setName(Name);
+}
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
+                                     const Twine &Name,
+                                     BasicBlock *InsertAtEnd)
+: Instruction(VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
+                cast<VectorType>(Mask->getType())->getNumElements()),
+              ShuffleVector,
+              OperandTraits<ShuffleVectorInst>::op_begin(this),
+              OperandTraits<ShuffleVectorInst>::operands(this),
+              InsertAtEnd) {
+  assert(isValidOperands(V1, V2, Mask) &&
+         "Invalid shuffle vector instruction operands!");
+
+  Op<0>() = V1;
+  Op<1>() = V2;
+  Op<2>() = Mask;
+  setName(Name);
+}
+
+bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
+                                        const Value *Mask) {
+  // V1 and V2 must be vectors of the same type.
+  if (!V1->getType()->isVectorTy() || V1->getType() != V2->getType())
+    return false;
+  
+  // Mask must be vector of i32.
+  VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
+  if (MaskTy == 0 || !MaskTy->getElementType()->isIntegerTy(32))
+    return false;
+
+  // Check to see if Mask is valid.
+  if (isa<UndefValue>(Mask) || isa<ConstantAggregateZero>(Mask))
+    return true;
+
+  if (const ConstantVector *MV = dyn_cast<ConstantVector>(Mask)) {
+    unsigned V1Size = cast<VectorType>(V1->getType())->getNumElements();
+    for (unsigned i = 0, e = MV->getNumOperands(); i != e; ++i) {
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(MV->getOperand(i))) {
+        if (CI->uge(V1Size*2))
+          return false;
+      } else if (!isa<UndefValue>(MV->getOperand(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+  
+  if (const ConstantDataSequential *CDS =
+        dyn_cast<ConstantDataSequential>(Mask)) {
+    unsigned V1Size = cast<VectorType>(V1->getType())->getNumElements();
+    for (unsigned i = 0, e = MaskTy->getNumElements(); i != e; ++i)
+      if (CDS->getElementAsInteger(i) >= V1Size*2)
+        return false;
+    return true;
+  }
+  
+  // The bitcode reader can create a place holder for a forward reference
+  // used as the shuffle mask. When this occurs, the shuffle mask will
+  // fall into this case and fail. To avoid this error, do this bit of
+  // ugliness to allow such a mask pass.
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Mask))
+    if (CE->getOpcode() == Instruction::UserOp1)
+      return true;
+
+  return false;
+}
+
+/// getMaskValue - Return the index from the shuffle mask for the specified
+/// output result.  This is either -1 if the element is undef or a number less
+/// than 2*numelements.
+int ShuffleVectorInst::getMaskValue(Constant *Mask, unsigned i) {
+  assert(i < Mask->getType()->getVectorNumElements() && "Index out of range");
+  if (ConstantDataSequential *CDS =dyn_cast<ConstantDataSequential>(Mask))
+    return CDS->getElementAsInteger(i);
+  Constant *C = Mask->getAggregateElement(i);
+  if (isa<UndefValue>(C))
+    return -1;
+  return cast<ConstantInt>(C)->getZExtValue();
+}
+
+/// getShuffleMask - Return the full mask for this instruction, where each
+/// element is the element number and undef's are returned as -1.
+void ShuffleVectorInst::getShuffleMask(Constant *Mask,
+                                       SmallVectorImpl<int> &Result) {
+  unsigned NumElts = Mask->getType()->getVectorNumElements();
+  
+  if (ConstantDataSequential *CDS=dyn_cast<ConstantDataSequential>(Mask)) {
+    for (unsigned i = 0; i != NumElts; ++i)
+      Result.push_back(CDS->getElementAsInteger(i));
+    return;
+  }    
+  for (unsigned i = 0; i != NumElts; ++i) {
+    Constant *C = Mask->getAggregateElement(i);
+    Result.push_back(isa<UndefValue>(C) ? -1 :
+                     cast<ConstantInt>(C)->getZExtValue());
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//                             InsertValueInst Class
+//===----------------------------------------------------------------------===//
+
+void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, 
+                           const Twine &Name) {
+  assert(NumOperands == 2 && "NumOperands not initialized?");
+
+  // There's no fundamental reason why we require at least one index
+  // (other than weirdness with &*IdxBegin being invalid; see
+  // getelementptr's init routine for example). But there's no
+  // present need to support it.
+  assert(Idxs.size() > 0 && "InsertValueInst must have at least one index");
+
+  assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs) ==
+         Val->getType() && "Inserted value must match indexed type!");
+  Op<0>() = Agg;
+  Op<1>() = Val;
+
+  Indices.append(Idxs.begin(), Idxs.end());
+  setName(Name);
+}
+
+InsertValueInst::InsertValueInst(const InsertValueInst &IVI)
+  : Instruction(IVI.getType(), InsertValue,
+                OperandTraits<InsertValueInst>::op_begin(this), 2),
+    Indices(IVI.Indices) {
+  Op<0>() = IVI.getOperand(0);
+  Op<1>() = IVI.getOperand(1);
+  SubclassOptionalData = IVI.SubclassOptionalData;
+}
+
+//===----------------------------------------------------------------------===//
+//                             ExtractValueInst Class
+//===----------------------------------------------------------------------===//
+
+void ExtractValueInst::init(ArrayRef<unsigned> Idxs, const Twine &Name) {
+  assert(NumOperands == 1 && "NumOperands not initialized?");
+
+  // There's no fundamental reason why we require at least one index.
+  // But there's no present need to support it.
+  assert(Idxs.size() > 0 && "ExtractValueInst must have at least one index");
+
+  Indices.append(Idxs.begin(), Idxs.end());
+  setName(Name);
+}
+
+ExtractValueInst::ExtractValueInst(const ExtractValueInst &EVI)
+  : UnaryInstruction(EVI.getType(), ExtractValue, EVI.getOperand(0)),
+    Indices(EVI.Indices) {
+  SubclassOptionalData = EVI.SubclassOptionalData;
+}
+
+// getIndexedType - Returns the type of the element that would be extracted
+// with an extractvalue instruction with the specified parameters.
+//
+// A null type is returned if the indices are invalid for the specified
+// pointer type.
+//
+Type *ExtractValueInst::getIndexedType(Type *Agg,
+                                       ArrayRef<unsigned> Idxs) {
+  for (unsigned CurIdx = 0; CurIdx != Idxs.size(); ++CurIdx) {
+    unsigned Index = Idxs[CurIdx];
+    // We can't use CompositeType::indexValid(Index) here.
+    // indexValid() always returns true for arrays because getelementptr allows
+    // out-of-bounds indices. Since we don't allow those for extractvalue and
+    // insertvalue we need to check array indexing manually.
+    // Since the only other types we can index into are struct types it's just
+    // as easy to check those manually as well.
+    if (ArrayType *AT = dyn_cast<ArrayType>(Agg)) {
+      if (Index >= AT->getNumElements())
+        return 0;
+    } else if (StructType *ST = dyn_cast<StructType>(Agg)) {
+      if (Index >= ST->getNumElements())
+        return 0;
+    } else {
+      // Not a valid type to index into.
+      return 0;
+    }
+
+    Agg = cast<CompositeType>(Agg)->getTypeAtIndex(Index);
+  }
+  return const_cast<Type*>(Agg);
+}
+
+//===----------------------------------------------------------------------===//
+//                             BinaryOperator Class
+//===----------------------------------------------------------------------===//
+
+BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
+                               Type *Ty, const Twine &Name,
+                               Instruction *InsertBefore)
+  : Instruction(Ty, iType,
+                OperandTraits<BinaryOperator>::op_begin(this),
+                OperandTraits<BinaryOperator>::operands(this),
+                InsertBefore) {
+  Op<0>() = S1;
+  Op<1>() = S2;
+  init(iType);
+  setName(Name);
+}
+
+BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, 
+                               Type *Ty, const Twine &Name,
+                               BasicBlock *InsertAtEnd)
+  : Instruction(Ty, iType,
+                OperandTraits<BinaryOperator>::op_begin(this),
+                OperandTraits<BinaryOperator>::operands(this),
+                InsertAtEnd) {
+  Op<0>() = S1;
+  Op<1>() = S2;
+  init(iType);
+  setName(Name);
+}
+
+
+void BinaryOperator::init(BinaryOps iType) {
+  Value *LHS = getOperand(0), *RHS = getOperand(1);
+  (void)LHS; (void)RHS; // Silence warnings.
+  assert(LHS->getType() == RHS->getType() &&
+         "Binary operator operand types must match!");
+#ifndef NDEBUG
+  switch (iType) {
+  case Add: case Sub:
+  case Mul:
+    assert(getType() == LHS->getType() &&
+           "Arithmetic operation should return same type as operands!");
+    assert(getType()->isIntOrIntVectorTy() &&
+           "Tried to create an integer operation on a non-integer type!");
+    break;
+  case FAdd: case FSub:
+  case FMul:
+    assert(getType() == LHS->getType() &&
+           "Arithmetic operation should return same type as operands!");
+    assert(getType()->isFPOrFPVectorTy() &&
+           "Tried to create a floating-point operation on a "
+           "non-floating-point type!");
+    break;
+  case UDiv: 
+  case SDiv: 
+    assert(getType() == LHS->getType() &&
+           "Arithmetic operation should return same type as operands!");
+    assert((getType()->isIntegerTy() || (getType()->isVectorTy() && 
+            cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+           "Incorrect operand type (not integer) for S/UDIV");
+    break;
+  case FDiv:
+    assert(getType() == LHS->getType() &&
+           "Arithmetic operation should return same type as operands!");
+    assert(getType()->isFPOrFPVectorTy() &&
+           "Incorrect operand type (not floating point) for FDIV");
+    break;
+  case URem: 
+  case SRem: 
+    assert(getType() == LHS->getType() &&
+           "Arithmetic operation should return same type as operands!");
+    assert((getType()->isIntegerTy() || (getType()->isVectorTy() && 
+            cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+           "Incorrect operand type (not integer) for S/UREM");
+    break;
+  case FRem:
+    assert(getType() == LHS->getType() &&
+           "Arithmetic operation should return same type as operands!");
+    assert(getType()->isFPOrFPVectorTy() &&
+           "Incorrect operand type (not floating point) for FREM");
+    break;
+  case Shl:
+  case LShr:
+  case AShr:
+    assert(getType() == LHS->getType() &&
+           "Shift operation should return same type as operands!");
+    assert((getType()->isIntegerTy() ||
+            (getType()->isVectorTy() && 
+             cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+           "Tried to create a shift operation on a non-integral type!");
+    break;
+  case And: case Or:
+  case Xor:
+    assert(getType() == LHS->getType() &&
+           "Logical operation should return same type as operands!");
+    assert((getType()->isIntegerTy() ||
+            (getType()->isVectorTy() && 
+             cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+           "Tried to create a logical operation on a non-integral type!");
+    break;
+  default:
+    break;
+  }
+#endif
+}
+
+BinaryOperator *BinaryOperator::Create(BinaryOps Op, Value *S1, Value *S2,
+                                       const Twine &Name,
+                                       Instruction *InsertBefore) {
+  assert(S1->getType() == S2->getType() &&
+         "Cannot create binary operator with two operands of differing type!");
+  return new BinaryOperator(Op, S1, S2, S1->getType(), Name, InsertBefore);
+}
+
+BinaryOperator *BinaryOperator::Create(BinaryOps Op, Value *S1, Value *S2,
+                                       const Twine &Name,
+                                       BasicBlock *InsertAtEnd) {
+  BinaryOperator *Res = Create(Op, S1, S2, Name);
+  InsertAtEnd->getInstList().push_back(Res);
+  return Res;
+}
+
+BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const Twine &Name,
+                                          Instruction *InsertBefore) {
+  Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
+  return new BinaryOperator(Instruction::Sub,
+                            zero, Op,
+                            Op->getType(), Name, InsertBefore);
+}
+
+BinaryOperator *BinaryOperator::CreateNeg(Value *Op, const Twine &Name,
+                                          BasicBlock *InsertAtEnd) {
+  Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
+  return new BinaryOperator(Instruction::Sub,
+                            zero, Op,
+                            Op->getType(), Name, InsertAtEnd);
+}
+
+BinaryOperator *BinaryOperator::CreateNSWNeg(Value *Op, const Twine &Name,
+                                             Instruction *InsertBefore) {
+  Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
+  return BinaryOperator::CreateNSWSub(zero, Op, Name, InsertBefore);
+}
+
+BinaryOperator *BinaryOperator::CreateNSWNeg(Value *Op, const Twine &Name,
+                                             BasicBlock *InsertAtEnd) {
+  Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
+  return BinaryOperator::CreateNSWSub(zero, Op, Name, InsertAtEnd);
+}
+
+BinaryOperator *BinaryOperator::CreateNUWNeg(Value *Op, const Twine &Name,
+                                             Instruction *InsertBefore) {
+  Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
+  return BinaryOperator::CreateNUWSub(zero, Op, Name, InsertBefore);
+}
+
+BinaryOperator *BinaryOperator::CreateNUWNeg(Value *Op, const Twine &Name,
+                                             BasicBlock *InsertAtEnd) {
+  Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
+  return BinaryOperator::CreateNUWSub(zero, Op, Name, InsertAtEnd);
+}
+
+BinaryOperator *BinaryOperator::CreateFNeg(Value *Op, const Twine &Name,
+                                           Instruction *InsertBefore) {
+  Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
+  return new BinaryOperator(Instruction::FSub, zero, Op,
+                            Op->getType(), Name, InsertBefore);
+}
+
+BinaryOperator *BinaryOperator::CreateFNeg(Value *Op, const Twine &Name,
+                                           BasicBlock *InsertAtEnd) {
+  Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
+  return new BinaryOperator(Instruction::FSub, zero, Op,
+                            Op->getType(), Name, InsertAtEnd);
+}
+
+BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
+                                          Instruction *InsertBefore) {
+  Constant *C = Constant::getAllOnesValue(Op->getType());
+  return new BinaryOperator(Instruction::Xor, Op, C,
+                            Op->getType(), Name, InsertBefore);
+}
+
+BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
+                                          BasicBlock *InsertAtEnd) {
+  Constant *AllOnes = Constant::getAllOnesValue(Op->getType());
+  return new BinaryOperator(Instruction::Xor, Op, AllOnes,
+                            Op->getType(), Name, InsertAtEnd);
+}
+
+
+// isConstantAllOnes - Helper function for several functions below
+static inline bool isConstantAllOnes(const Value *V) {
+  if (const Constant *C = dyn_cast<Constant>(V))
+    return C->isAllOnesValue();
+  return false;
+}
+
+bool BinaryOperator::isNeg(const Value *V) {
+  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
+    if (Bop->getOpcode() == Instruction::Sub)
+      if (Constant* C = dyn_cast<Constant>(Bop->getOperand(0)))
+        return C->isNegativeZeroValue();
+  return false;
+}
+
+bool BinaryOperator::isFNeg(const Value *V, bool IgnoreZeroSign) {
+  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
+    if (Bop->getOpcode() == Instruction::FSub)
+      if (Constant* C = dyn_cast<Constant>(Bop->getOperand(0))) {
+        if (!IgnoreZeroSign)
+          IgnoreZeroSign = cast<Instruction>(V)->hasNoSignedZeros();
+        return !IgnoreZeroSign ? C->isNegativeZeroValue() : C->isZeroValue();
+      }
+  return false;
+}
+
+bool BinaryOperator::isNot(const Value *V) {
+  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
+    return (Bop->getOpcode() == Instruction::Xor &&
+            (isConstantAllOnes(Bop->getOperand(1)) ||
+             isConstantAllOnes(Bop->getOperand(0))));
+  return false;
+}
+
+Value *BinaryOperator::getNegArgument(Value *BinOp) {
+  return cast<BinaryOperator>(BinOp)->getOperand(1);
+}
+
+const Value *BinaryOperator::getNegArgument(const Value *BinOp) {
+  return getNegArgument(const_cast<Value*>(BinOp));
+}
+
+Value *BinaryOperator::getFNegArgument(Value *BinOp) {
+  return cast<BinaryOperator>(BinOp)->getOperand(1);
+}
+
+const Value *BinaryOperator::getFNegArgument(const Value *BinOp) {
+  return getFNegArgument(const_cast<Value*>(BinOp));
+}
+
+Value *BinaryOperator::getNotArgument(Value *BinOp) {
+  assert(isNot(BinOp) && "getNotArgument on non-'not' instruction!");
+  BinaryOperator *BO = cast<BinaryOperator>(BinOp);
+  Value *Op0 = BO->getOperand(0);
+  Value *Op1 = BO->getOperand(1);
+  if (isConstantAllOnes(Op0)) return Op1;
+
+  assert(isConstantAllOnes(Op1));
+  return Op0;
+}
+
+const Value *BinaryOperator::getNotArgument(const Value *BinOp) {
+  return getNotArgument(const_cast<Value*>(BinOp));
+}
+
+
+// swapOperands - Exchange the two operands to this instruction.  This
+// instruction is safe to use on any binary instruction and does not
+// modify the semantics of the instruction.  If the instruction is
+// order dependent (SetLT f.e.) the opcode is changed.
+//
+bool BinaryOperator::swapOperands() {
+  if (!isCommutative())
+    return true; // Can't commute operands
+  Op<0>().swap(Op<1>());
+  return false;
+}
+
+void BinaryOperator::setHasNoUnsignedWrap(bool b) {
+  cast<OverflowingBinaryOperator>(this)->setHasNoUnsignedWrap(b);
+}
+
+void BinaryOperator::setHasNoSignedWrap(bool b) {
+  cast<OverflowingBinaryOperator>(this)->setHasNoSignedWrap(b);
+}
+
+void BinaryOperator::setIsExact(bool b) {
+  cast<PossiblyExactOperator>(this)->setIsExact(b);
+}
+
+bool BinaryOperator::hasNoUnsignedWrap() const {
+  return cast<OverflowingBinaryOperator>(this)->hasNoUnsignedWrap();
+}
+
+bool BinaryOperator::hasNoSignedWrap() const {
+  return cast<OverflowingBinaryOperator>(this)->hasNoSignedWrap();
+}
+
+bool BinaryOperator::isExact() const {
+  return cast<PossiblyExactOperator>(this)->isExact();
+}
+
+//===----------------------------------------------------------------------===//
+//                             FPMathOperator Class
+//===----------------------------------------------------------------------===//
+
+/// getFPAccuracy - Get the maximum error permitted by this operation in ULPs.
+/// An accuracy of 0.0 means that the operation should be performed with the
+/// default precision.
+float FPMathOperator::getFPAccuracy() const {
+  const MDNode *MD =
+    cast<Instruction>(this)->getMetadata(LLVMContext::MD_fpmath);
+  if (!MD)
+    return 0.0;
+  ConstantFP *Accuracy = cast<ConstantFP>(MD->getOperand(0));
+  return Accuracy->getValueAPF().convertToFloat();
+}
+
+
+//===----------------------------------------------------------------------===//
+//                                CastInst Class
+//===----------------------------------------------------------------------===//
+
+void CastInst::anchor() {}
+
+// Just determine if this cast only deals with integral->integral conversion.
+bool CastInst::isIntegerCast() const {
+  switch (getOpcode()) {
+    default: return false;
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::Trunc:
+      return true;
+    case Instruction::BitCast:
+      return getOperand(0)->getType()->isIntegerTy() &&
+        getType()->isIntegerTy();
+  }
+}
+
+bool CastInst::isLosslessCast() const {
+  // Only BitCast can be lossless, exit fast if we're not BitCast
+  if (getOpcode() != Instruction::BitCast)
+    return false;
+
+  // Identity cast is always lossless
+  Type* SrcTy = getOperand(0)->getType();
+  Type* DstTy = getType();
+  if (SrcTy == DstTy)
+    return true;
+  
+  // Pointer to pointer is always lossless.
+  if (SrcTy->isPointerTy())
+    return DstTy->isPointerTy();
+  return false;  // Other types have no identity values
+}
+
+/// This function determines if the CastInst does not require any bits to be
+/// changed in order to effect the cast. Essentially, it identifies cases where
+/// no code gen is necessary for the cast, hence the name no-op cast.  For 
+/// example, the following are all no-op casts:
+/// # bitcast i32* %x to i8*
+/// # bitcast <2 x i32> %x to <4 x i16> 
+/// # ptrtoint i32* %x to i32     ; on 32-bit plaforms only
+/// @brief Determine if the described cast is a no-op.
+bool CastInst::isNoopCast(Instruction::CastOps Opcode,
+                          Type *SrcTy,
+                          Type *DestTy,
+                          Type *IntPtrTy) {
+  switch (Opcode) {
+    default: llvm_unreachable("Invalid CastOp");
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt: 
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+      return false; // These always modify bits
+    case Instruction::BitCast:
+      return true;  // BitCast never modifies bits.
+    case Instruction::PtrToInt:
+      return IntPtrTy->getScalarSizeInBits() ==
+             DestTy->getScalarSizeInBits();
+    case Instruction::IntToPtr:
+      return IntPtrTy->getScalarSizeInBits() ==
+             SrcTy->getScalarSizeInBits();
+  }
+}
+
+/// @brief Determine if a cast is a no-op.
+bool CastInst::isNoopCast(Type *IntPtrTy) const {
+  return isNoopCast(getOpcode(), getOperand(0)->getType(), getType(), IntPtrTy);
+}
+
+/// This function determines if a pair of casts can be eliminated and what 
+/// opcode should be used in the elimination. This assumes that there are two 
+/// instructions like this:
+/// *  %F = firstOpcode SrcTy %x to MidTy
+/// *  %S = secondOpcode MidTy %F to DstTy
+/// The function returns a resultOpcode so these two casts can be replaced with:
+/// *  %Replacement = resultOpcode %SrcTy %x to DstTy
+/// If no such cast is permited, the function returns 0.
+unsigned CastInst::isEliminableCastPair(
+  Instruction::CastOps firstOp, Instruction::CastOps secondOp,
+  Type *SrcTy, Type *MidTy, Type *DstTy, Type *SrcIntPtrTy, Type *MidIntPtrTy,
+  Type *DstIntPtrTy) {
+  // Define the 144 possibilities for these two cast instructions. The values
+  // in this matrix determine what to do in a given situation and select the
+  // case in the switch below.  The rows correspond to firstOp, the columns 
+  // correspond to secondOp.  In looking at the table below, keep in  mind
+  // the following cast properties:
+  //
+  //          Size Compare       Source               Destination
+  // Operator  Src ? Size   Type       Sign         Type       Sign
+  // -------- ------------ -------------------   ---------------------
+  // TRUNC         >       Integer      Any        Integral     Any
+  // ZEXT          <       Integral   Unsigned     Integer      Any
+  // SEXT          <       Integral    Signed      Integer      Any
+  // FPTOUI       n/a      FloatPt      n/a        Integral   Unsigned
+  // FPTOSI       n/a      FloatPt      n/a        Integral    Signed 
+  // UITOFP       n/a      Integral   Unsigned     FloatPt      n/a   
+  // SITOFP       n/a      Integral    Signed      FloatPt      n/a   
+  // FPTRUNC       >       FloatPt      n/a        FloatPt      n/a   
+  // FPEXT         <       FloatPt      n/a        FloatPt      n/a   
+  // PTRTOINT     n/a      Pointer      n/a        Integral   Unsigned
+  // INTTOPTR     n/a      Integral   Unsigned     Pointer      n/a
+  // BITCAST       =       FirstClass   n/a       FirstClass    n/a   
+  //
+  // NOTE: some transforms are safe, but we consider them to be non-profitable.
+  // For example, we could merge "fptoui double to i32" + "zext i32 to i64",
+  // into "fptoui double to i64", but this loses information about the range
+  // of the produced value (we no longer know the top-part is all zeros). 
+  // Further this conversion is often much more expensive for typical hardware,
+  // and causes issues when building libgcc.  We disallow fptosi+sext for the 
+  // same reason.
+  const unsigned numCastOps = 
+    Instruction::CastOpsEnd - Instruction::CastOpsBegin;
+  static const uint8_t CastResults[numCastOps][numCastOps] = {
+    // T        F  F  U  S  F  F  P  I  B   -+
+    // R  Z  S  P  P  I  I  T  P  2  N  T    |
+    // U  E  E  2  2  2  2  R  E  I  T  C    +- secondOp
+    // N  X  X  U  S  F  F  N  X  N  2  V    |
+    // C  T  T  I  I  P  P  C  T  T  P  T   -+
+    {  1, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // Trunc      -+
+    {  8, 1, 9,99,99, 2, 0,99,99,99, 2, 3 }, // ZExt        |
+    {  8, 0, 1,99,99, 0, 2,99,99,99, 0, 3 }, // SExt        |
+    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToUI      |
+    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToSI      |
+    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // UIToFP      +- firstOp
+    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // SIToFP      |
+    { 99,99,99, 0, 0,99,99, 1, 0,99,99, 4 }, // FPTrunc     |
+    { 99,99,99, 2, 2,99,99,10, 2,99,99, 4 }, // FPExt       |
+    {  1, 0, 0,99,99, 0, 0,99,99,99, 7, 3 }, // PtrToInt    |
+    { 99,99,99,99,99,99,99,99,99,13,99,12 }, // IntToPtr    |
+    {  5, 5, 5, 6, 6, 5, 5, 6, 6,11, 5, 1 }, // BitCast    -+
+  };
+  
+  // If either of the casts are a bitcast from scalar to vector, disallow the
+  // merging. However, bitcast of A->B->A are allowed.
+  bool isFirstBitcast  = (firstOp == Instruction::BitCast);
+  bool isSecondBitcast = (secondOp == Instruction::BitCast);
+  bool chainedBitcast  = (SrcTy == DstTy && isFirstBitcast && isSecondBitcast);
+
+  // Check if any of the bitcasts convert scalars<->vectors.
+  if ((isFirstBitcast  && isa<VectorType>(SrcTy) != isa<VectorType>(MidTy)) ||
+      (isSecondBitcast && isa<VectorType>(MidTy) != isa<VectorType>(DstTy)))
+    // Unless we are bitcasing to the original type, disallow optimizations.
+    if (!chainedBitcast) return 0;
+
+  int ElimCase = CastResults[firstOp-Instruction::CastOpsBegin]
+                            [secondOp-Instruction::CastOpsBegin];
+  switch (ElimCase) {
+    case 0: 
+      // categorically disallowed
+      return 0;
+    case 1: 
+      // allowed, use first cast's opcode
+      return firstOp;
+    case 2: 
+      // allowed, use second cast's opcode
+      return secondOp;
+    case 3: 
+      // no-op cast in second op implies firstOp as long as the DestTy 
+      // is integer and we are not converting between a vector and a
+      // non vector type.
+      if (!SrcTy->isVectorTy() && DstTy->isIntegerTy())
+        return firstOp;
+      return 0;
+    case 4:
+      // no-op cast in second op implies firstOp as long as the DestTy
+      // is floating point.
+      if (DstTy->isFloatingPointTy())
+        return firstOp;
+      return 0;
+    case 5: 
+      // no-op cast in first op implies secondOp as long as the SrcTy
+      // is an integer.
+      if (SrcTy->isIntegerTy())
+        return secondOp;
+      return 0;
+    case 6:
+      // no-op cast in first op implies secondOp as long as the SrcTy
+      // is a floating point.
+      if (SrcTy->isFloatingPointTy())
+        return secondOp;
+      return 0;
+    case 7: { 
+      // ptrtoint, inttoptr -> bitcast (ptr -> ptr) if int size is >= ptr size
+      if (!SrcIntPtrTy || DstIntPtrTy != SrcIntPtrTy)
+        return 0;
+      unsigned PtrSize = SrcIntPtrTy->getScalarSizeInBits();
+      unsigned MidSize = MidTy->getScalarSizeInBits();
+      if (MidSize >= PtrSize)
+        return Instruction::BitCast;
+      return 0;
+    }
+    case 8: {
+      // ext, trunc -> bitcast,    if the SrcTy and DstTy are same size
+      // ext, trunc -> ext,        if sizeof(SrcTy) < sizeof(DstTy)
+      // ext, trunc -> trunc,      if sizeof(SrcTy) > sizeof(DstTy)
+      unsigned SrcSize = SrcTy->getScalarSizeInBits();
+      unsigned DstSize = DstTy->getScalarSizeInBits();
+      if (SrcSize == DstSize)
+        return Instruction::BitCast;
+      else if (SrcSize < DstSize)
+        return firstOp;
+      return secondOp;
+    }
+    case 9: // zext, sext -> zext, because sext can't sign extend after zext
+      return Instruction::ZExt;
+    case 10:
+      // fpext followed by ftrunc is allowed if the bit size returned to is
+      // the same as the original, in which case its just a bitcast
+      if (SrcTy == DstTy)
+        return Instruction::BitCast;
+      return 0; // If the types are not the same we can't eliminate it.
+    case 11:
+      // bitcast followed by ptrtoint is allowed as long as the bitcast
+      // is a pointer to pointer cast.
+      if (SrcTy->isPointerTy() && MidTy->isPointerTy())
+        return secondOp;
+      return 0;
+    case 12:
+      // inttoptr, bitcast -> intptr  if bitcast is a ptr to ptr cast
+      if (MidTy->isPointerTy() && DstTy->isPointerTy())
+        return firstOp;
+      return 0;
+    case 13: {
+      // inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize
+      if (!MidIntPtrTy)
+        return 0;
+      unsigned PtrSize = MidIntPtrTy->getScalarSizeInBits();
+      unsigned SrcSize = SrcTy->getScalarSizeInBits();
+      unsigned DstSize = DstTy->getScalarSizeInBits();
+      if (SrcSize <= PtrSize && SrcSize == DstSize)
+        return Instruction::BitCast;
+      return 0;
+    }
+    case 99: 
+      // cast combination can't happen (error in input). This is for all cases
+      // where the MidTy is not the same for the two cast instructions.
+      llvm_unreachable("Invalid Cast Combination");
+    default:
+      llvm_unreachable("Error in CastResults table!!!");
+  }
+}
+
+CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty, 
+  const Twine &Name, Instruction *InsertBefore) {
+  assert(castIsValid(op, S, Ty) && "Invalid cast!");
+  // Construct and return the appropriate CastInst subclass
+  switch (op) {
+    case Trunc:    return new TruncInst    (S, Ty, Name, InsertBefore);
+    case ZExt:     return new ZExtInst     (S, Ty, Name, InsertBefore);
+    case SExt:     return new SExtInst     (S, Ty, Name, InsertBefore);
+    case FPTrunc:  return new FPTruncInst  (S, Ty, Name, InsertBefore);
+    case FPExt:    return new FPExtInst    (S, Ty, Name, InsertBefore);
+    case UIToFP:   return new UIToFPInst   (S, Ty, Name, InsertBefore);
+    case SIToFP:   return new SIToFPInst   (S, Ty, Name, InsertBefore);
+    case FPToUI:   return new FPToUIInst   (S, Ty, Name, InsertBefore);
+    case FPToSI:   return new FPToSIInst   (S, Ty, Name, InsertBefore);
+    case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore);
+    case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
+    case BitCast:  return new BitCastInst  (S, Ty, Name, InsertBefore);
+    default: llvm_unreachable("Invalid opcode provided");
+  }
+}
+
+CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
+  const Twine &Name, BasicBlock *InsertAtEnd) {
+  assert(castIsValid(op, S, Ty) && "Invalid cast!");
+  // Construct and return the appropriate CastInst subclass
+  switch (op) {
+    case Trunc:    return new TruncInst    (S, Ty, Name, InsertAtEnd);
+    case ZExt:     return new ZExtInst     (S, Ty, Name, InsertAtEnd);
+    case SExt:     return new SExtInst     (S, Ty, Name, InsertAtEnd);
+    case FPTrunc:  return new FPTruncInst  (S, Ty, Name, InsertAtEnd);
+    case FPExt:    return new FPExtInst    (S, Ty, Name, InsertAtEnd);
+    case UIToFP:   return new UIToFPInst   (S, Ty, Name, InsertAtEnd);
+    case SIToFP:   return new SIToFPInst   (S, Ty, Name, InsertAtEnd);
+    case FPToUI:   return new FPToUIInst   (S, Ty, Name, InsertAtEnd);
+    case FPToSI:   return new FPToSIInst   (S, Ty, Name, InsertAtEnd);
+    case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertAtEnd);
+    case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertAtEnd);
+    case BitCast:  return new BitCastInst  (S, Ty, Name, InsertAtEnd);
+    default: llvm_unreachable("Invalid opcode provided");
+  }
+}
+
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty, 
+                                        const Twine &Name,
+                                        Instruction *InsertBefore) {
+  if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
+    return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+  return Create(Instruction::ZExt, S, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty, 
+                                        const Twine &Name,
+                                        BasicBlock *InsertAtEnd) {
+  if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
+    return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+  return Create(Instruction::ZExt, S, Ty, Name, InsertAtEnd);
+}
+
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty, 
+                                        const Twine &Name,
+                                        Instruction *InsertBefore) {
+  if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
+    return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+  return Create(Instruction::SExt, S, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty, 
+                                        const Twine &Name,
+                                        BasicBlock *InsertAtEnd) {
+  if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
+    return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+  return Create(Instruction::SExt, S, Ty, Name, InsertAtEnd);
+}
+
+CastInst *CastInst::CreateTruncOrBitCast(Value *S, Type *Ty,
+                                         const Twine &Name,
+                                         Instruction *InsertBefore) {
+  if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
+    return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+  return Create(Instruction::Trunc, S, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateTruncOrBitCast(Value *S, Type *Ty,
+                                         const Twine &Name, 
+                                         BasicBlock *InsertAtEnd) {
+  if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
+    return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+  return Create(Instruction::Trunc, S, Ty, Name, InsertAtEnd);
+}
+
+CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
+                                      const Twine &Name,
+                                      BasicBlock *InsertAtEnd) {
+  assert(S->getType()->isPointerTy() && "Invalid cast");
+  assert((Ty->isIntegerTy() || Ty->isPointerTy()) &&
+         "Invalid cast");
+
+  if (Ty->isIntegerTy())
+    return Create(Instruction::PtrToInt, S, Ty, Name, InsertAtEnd);
+  return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
+}
+
+/// @brief Create a BitCast or a PtrToInt cast instruction
+CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, 
+                                      const Twine &Name, 
+                                      Instruction *InsertBefore) {
+  assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
+  assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) &&
+         "Invalid cast");
+
+  if (Ty->isIntOrIntVectorTy())
+    return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore);
+  return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, 
+                                      bool isSigned, const Twine &Name,
+                                      Instruction *InsertBefore) {
+  assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() &&
+         "Invalid integer cast");
+  unsigned SrcBits = C->getType()->getScalarSizeInBits();
+  unsigned DstBits = Ty->getScalarSizeInBits();
+  Instruction::CastOps opcode =
+    (SrcBits == DstBits ? Instruction::BitCast :
+     (SrcBits > DstBits ? Instruction::Trunc :
+      (isSigned ? Instruction::SExt : Instruction::ZExt)));
+  return Create(opcode, C, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty, 
+                                      bool isSigned, const Twine &Name,
+                                      BasicBlock *InsertAtEnd) {
+  assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() &&
+         "Invalid cast");
+  unsigned SrcBits = C->getType()->getScalarSizeInBits();
+  unsigned DstBits = Ty->getScalarSizeInBits();
+  Instruction::CastOps opcode =
+    (SrcBits == DstBits ? Instruction::BitCast :
+     (SrcBits > DstBits ? Instruction::Trunc :
+      (isSigned ? Instruction::SExt : Instruction::ZExt)));
+  return Create(opcode, C, Ty, Name, InsertAtEnd);
+}
+
+CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, 
+                                 const Twine &Name, 
+                                 Instruction *InsertBefore) {
+  assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
+         "Invalid cast");
+  unsigned SrcBits = C->getType()->getScalarSizeInBits();
+  unsigned DstBits = Ty->getScalarSizeInBits();
+  Instruction::CastOps opcode =
+    (SrcBits == DstBits ? Instruction::BitCast :
+     (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt));
+  return Create(opcode, C, Ty, Name, InsertBefore);
+}
+
+CastInst *CastInst::CreateFPCast(Value *C, Type *Ty, 
+                                 const Twine &Name, 
+                                 BasicBlock *InsertAtEnd) {
+  assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
+         "Invalid cast");
+  unsigned SrcBits = C->getType()->getScalarSizeInBits();
+  unsigned DstBits = Ty->getScalarSizeInBits();
+  Instruction::CastOps opcode =
+    (SrcBits == DstBits ? Instruction::BitCast :
+     (SrcBits > DstBits ? Instruction::FPTrunc : Instruction::FPExt));
+  return Create(opcode, C, Ty, Name, InsertAtEnd);
+}
+
+// Check whether it is valid to call getCastOpcode for these types.
+// This routine must be kept in sync with getCastOpcode.
+bool CastInst::isCastable(Type *SrcTy, Type *DestTy) {
+  if (!SrcTy->isFirstClassType() || !DestTy->isFirstClassType())
+    return false;
+
+  if (SrcTy == DestTy)
+    return true;
+
+  if (VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
+    if (VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
+      if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
+        // An element by element cast.  Valid if casting the elements is valid.
+        SrcTy = SrcVecTy->getElementType();
+        DestTy = DestVecTy->getElementType();
+      }
+
+  // Get the bit sizes, we'll need these
+  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();   // 0 for ptr
+  unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr
+
+  // Run through the possibilities ...
+  if (DestTy->isIntegerTy()) {               // Casting to integral
+    if (SrcTy->isIntegerTy()) {                // Casting from integral
+        return true;
+    } else if (SrcTy->isFloatingPointTy()) {   // Casting from floating pt
+      return true;
+    } else if (SrcTy->isVectorTy()) {          // Casting from vector
+      return DestBits == SrcBits;
+    } else {                                   // Casting from something else
+      return SrcTy->isPointerTy();
+    }
+  } else if (DestTy->isFloatingPointTy()) {  // Casting to floating pt
+    if (SrcTy->isIntegerTy()) {                // Casting from integral
+      return true;
+    } else if (SrcTy->isFloatingPointTy()) {   // Casting from floating pt
+      return true;
+    } else if (SrcTy->isVectorTy()) {          // Casting from vector
+      return DestBits == SrcBits;
+    } else {                                   // Casting from something else
+      return false;
+    }
+  } else if (DestTy->isVectorTy()) {         // Casting to vector
+    return DestBits == SrcBits;
+  } else if (DestTy->isPointerTy()) {        // Casting to pointer
+    if (SrcTy->isPointerTy()) {                // Casting from pointer
+      return true;
+    } else if (SrcTy->isIntegerTy()) {         // Casting from integral
+      return true;
+    } else {                                   // Casting from something else
+      return false;
+    }
+  } else if (DestTy->isX86_MMXTy()) {
+    if (SrcTy->isVectorTy()) {
+      return DestBits == SrcBits;       // 64-bit vector to MMX
+    } else {
+      return false;
+    }
+  } else {                                   // Casting to something else
+    return false;
+  }
+}
+
+// Provide a way to get a "cast" where the cast opcode is inferred from the 
+// types and size of the operand. This, basically, is a parallel of the 
+// logic in the castIsValid function below.  This axiom should hold:
+//   castIsValid( getCastOpcode(Val, Ty), Val, Ty)
+// should not assert in castIsValid. In other words, this produces a "correct"
+// casting opcode for the arguments passed to it.
+// This routine must be kept in sync with isCastable.
+Instruction::CastOps
+CastInst::getCastOpcode(
+  const Value *Src, bool SrcIsSigned, Type *DestTy, bool DestIsSigned) {
+  Type *SrcTy = Src->getType();
+
+  assert(SrcTy->isFirstClassType() && DestTy->isFirstClassType() &&
+         "Only first class types are castable!");
+
+  if (SrcTy == DestTy)
+    return BitCast;
+
+  if (VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
+    if (VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
+      if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
+        // An element by element cast.  Find the appropriate opcode based on the
+        // element types.
+        SrcTy = SrcVecTy->getElementType();
+        DestTy = DestVecTy->getElementType();
+      }
+
+  // Get the bit sizes, we'll need these
+  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();   // 0 for ptr
+  unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr
+
+  // Run through the possibilities ...
+  if (DestTy->isIntegerTy()) {                      // Casting to integral
+    if (SrcTy->isIntegerTy()) {                     // Casting from integral
+      if (DestBits < SrcBits)
+        return Trunc;                               // int -> smaller int
+      else if (DestBits > SrcBits) {                // its an extension
+        if (SrcIsSigned)
+          return SExt;                              // signed -> SEXT
+        else
+          return ZExt;                              // unsigned -> ZEXT
+      } else {
+        return BitCast;                             // Same size, No-op cast
+      }
+    } else if (SrcTy->isFloatingPointTy()) {        // Casting from floating pt
+      if (DestIsSigned) 
+        return FPToSI;                              // FP -> sint
+      else
+        return FPToUI;                              // FP -> uint 
+    } else if (SrcTy->isVectorTy()) {
+      assert(DestBits == SrcBits &&
+             "Casting vector to integer of different width");
+      return BitCast;                             // Same size, no-op cast
+    } else {
+      assert(SrcTy->isPointerTy() &&
+             "Casting from a value that is not first-class type");
+      return PtrToInt;                              // ptr -> int
+    }
+  } else if (DestTy->isFloatingPointTy()) {         // Casting to floating pt
+    if (SrcTy->isIntegerTy()) {                     // Casting from integral
+      if (SrcIsSigned)
+        return SIToFP;                              // sint -> FP
+      else
+        return UIToFP;                              // uint -> FP
+    } else if (SrcTy->isFloatingPointTy()) {        // Casting from floating pt
+      if (DestBits < SrcBits) {
+        return FPTrunc;                             // FP -> smaller FP
+      } else if (DestBits > SrcBits) {
+        return FPExt;                               // FP -> larger FP
+      } else  {
+        return BitCast;                             // same size, no-op cast
+      }
+    } else if (SrcTy->isVectorTy()) {
+      assert(DestBits == SrcBits &&
+             "Casting vector to floating point of different width");
+      return BitCast;                             // same size, no-op cast
+    }
+    llvm_unreachable("Casting pointer or non-first class to float");
+  } else if (DestTy->isVectorTy()) {
+    assert(DestBits == SrcBits &&
+           "Illegal cast to vector (wrong type or size)");
+    return BitCast;
+  } else if (DestTy->isPointerTy()) {
+    if (SrcTy->isPointerTy()) {
+      return BitCast;                               // ptr -> ptr
+    } else if (SrcTy->isIntegerTy()) {
+      return IntToPtr;                              // int -> ptr
+    }
+    llvm_unreachable("Casting pointer to other than pointer or int");
+  } else if (DestTy->isX86_MMXTy()) {
+    if (SrcTy->isVectorTy()) {
+      assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX");
+      return BitCast;                               // 64-bit vector to MMX
+    }
+    llvm_unreachable("Illegal cast to X86_MMX");
+  }
+  llvm_unreachable("Casting to type that is not first-class");
+}
+
+//===----------------------------------------------------------------------===//
+//                    CastInst SubClass Constructors
+//===----------------------------------------------------------------------===//
+
+/// Check that the construction parameters for a CastInst are correct. This
+/// could be broken out into the separate constructors but it is useful to have
+/// it in one place and to eliminate the redundant code for getting the sizes
+/// of the types involved.
+bool 
+CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
+
+  // Check for type sanity on the arguments
+  Type *SrcTy = S->getType();
+
+  // If this is a cast to the same type then it's trivially true.
+  if (SrcTy == DstTy)
+    return true;
+
+  if (!SrcTy->isFirstClassType() || !DstTy->isFirstClassType() ||
+      SrcTy->isAggregateType() || DstTy->isAggregateType())
+    return false;
+
+  // Get the size of the types in bits, we'll need this later
+  unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+  unsigned DstBitSize = DstTy->getScalarSizeInBits();
+
+  // If these are vector types, get the lengths of the vectors (using zero for
+  // scalar types means that checking that vector lengths match also checks that
+  // scalars are not being converted to vectors or vectors to scalars).
+  unsigned SrcLength = SrcTy->isVectorTy() ?
+    cast<VectorType>(SrcTy)->getNumElements() : 0;
+  unsigned DstLength = DstTy->isVectorTy() ?
+    cast<VectorType>(DstTy)->getNumElements() : 0;
+
+  // Switch on the opcode provided
+  switch (op) {
+  default: return false; // This is an input error
+  case Instruction::Trunc:
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength && SrcBitSize > DstBitSize;
+  case Instruction::ZExt:
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength && SrcBitSize < DstBitSize;
+  case Instruction::SExt: 
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength && SrcBitSize < DstBitSize;
+  case Instruction::FPTrunc:
+    return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() &&
+      SrcLength == DstLength && SrcBitSize > DstBitSize;
+  case Instruction::FPExt:
+    return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() &&
+      SrcLength == DstLength && SrcBitSize < DstBitSize;
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+    return SrcTy->isIntOrIntVectorTy() && DstTy->isFPOrFPVectorTy() &&
+      SrcLength == DstLength;
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+    return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() &&
+      SrcLength == DstLength;
+  case Instruction::PtrToInt:
+    if (isa<VectorType>(SrcTy) != isa<VectorType>(DstTy))
+      return false;
+    if (VectorType *VT = dyn_cast<VectorType>(SrcTy))
+      if (VT->getNumElements() != cast<VectorType>(DstTy)->getNumElements())
+        return false;
+    return SrcTy->getScalarType()->isPointerTy() &&
+           DstTy->getScalarType()->isIntegerTy();
+  case Instruction::IntToPtr:
+    if (isa<VectorType>(SrcTy) != isa<VectorType>(DstTy))
+      return false;
+    if (VectorType *VT = dyn_cast<VectorType>(SrcTy))
+      if (VT->getNumElements() != cast<VectorType>(DstTy)->getNumElements())
+        return false;
+    return SrcTy->getScalarType()->isIntegerTy() &&
+           DstTy->getScalarType()->isPointerTy();
+  case Instruction::BitCast:
+    // BitCast implies a no-op cast of type only. No bits change.
+    // However, you can't cast pointers to anything but pointers.
+    if (SrcTy->isPointerTy() != DstTy->isPointerTy())
+      return false;
+
+    // Now we know we're not dealing with a pointer/non-pointer mismatch. In all
+    // these cases, the cast is okay if the source and destination bit widths
+    // are identical.
+    return SrcTy->getPrimitiveSizeInBits() == DstTy->getPrimitiveSizeInBits();
+  }
+}
+
+TruncInst::TruncInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, Trunc, S, Name, InsertBefore) {
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc");
+}
+
+TruncInst::TruncInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, Trunc, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc");
+}
+
+ZExtInst::ZExtInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+)  : CastInst(Ty, ZExt, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
+}
+
+ZExtInst::ZExtInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+)  : CastInst(Ty, ZExt, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
+}
+SExtInst::SExtInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, SExt, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
+}
+
+SExtInst::SExtInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+)  : CastInst(Ty, SExt, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
+}
+
+FPTruncInst::FPTruncInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, FPTrunc, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
+}
+
+FPTruncInst::FPTruncInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, FPTrunc, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
+}
+
+FPExtInst::FPExtInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, FPExt, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
+}
+
+FPExtInst::FPExtInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, FPExt, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
+}
+
+UIToFPInst::UIToFPInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, UIToFP, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
+}
+
+UIToFPInst::UIToFPInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, UIToFP, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
+}
+
+SIToFPInst::SIToFPInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, SIToFP, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
+}
+
+SIToFPInst::SIToFPInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, SIToFP, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
+}
+
+FPToUIInst::FPToUIInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, FPToUI, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
+}
+
+FPToUIInst::FPToUIInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, FPToUI, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
+}
+
+FPToSIInst::FPToSIInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, FPToSI, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
+}
+
+FPToSIInst::FPToSIInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, FPToSI, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
+}
+
+PtrToIntInst::PtrToIntInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, PtrToInt, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
+}
+
+PtrToIntInst::PtrToIntInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, PtrToInt, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
+}
+
+IntToPtrInst::IntToPtrInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
+}
+
+IntToPtrInst::IntToPtrInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, IntToPtr, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
+}
+
+BitCastInst::BitCastInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, BitCast, S, Name, InsertBefore) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
+}
+
+BitCastInst::BitCastInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, BitCast, S, Name, InsertAtEnd) { 
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
+}
+
+//===----------------------------------------------------------------------===//
+//                               CmpInst Classes
+//===----------------------------------------------------------------------===//
+
+void CmpInst::anchor() {}
+
+CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate,
+                 Value *LHS, Value *RHS, const Twine &Name,
+                 Instruction *InsertBefore)
+  : Instruction(ty, op,
+                OperandTraits<CmpInst>::op_begin(this),
+                OperandTraits<CmpInst>::operands(this),
+                InsertBefore) {
+    Op<0>() = LHS;
+    Op<1>() = RHS;
+  setPredicate((Predicate)predicate);
+  setName(Name);
+}
+
+CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate,
+                 Value *LHS, Value *RHS, const Twine &Name,
+                 BasicBlock *InsertAtEnd)
+  : Instruction(ty, op,
+                OperandTraits<CmpInst>::op_begin(this),
+                OperandTraits<CmpInst>::operands(this),
+                InsertAtEnd) {
+  Op<0>() = LHS;
+  Op<1>() = RHS;
+  setPredicate((Predicate)predicate);
+  setName(Name);
+}
+
+CmpInst *
+CmpInst::Create(OtherOps Op, unsigned short predicate,
+                Value *S1, Value *S2, 
+                const Twine &Name, Instruction *InsertBefore) {
+  if (Op == Instruction::ICmp) {
+    if (InsertBefore)
+      return new ICmpInst(InsertBefore, CmpInst::Predicate(predicate),
+                          S1, S2, Name);
+    else
+      return new ICmpInst(CmpInst::Predicate(predicate),
+                          S1, S2, Name);
+  }
+  
+  if (InsertBefore)
+    return new FCmpInst(InsertBefore, CmpInst::Predicate(predicate),
+                        S1, S2, Name);
+  else
+    return new FCmpInst(CmpInst::Predicate(predicate),
+                        S1, S2, Name);
+}
+
+CmpInst *
+CmpInst::Create(OtherOps Op, unsigned short predicate, Value *S1, Value *S2, 
+                const Twine &Name, BasicBlock *InsertAtEnd) {
+  if (Op == Instruction::ICmp) {
+    return new ICmpInst(*InsertAtEnd, CmpInst::Predicate(predicate),
+                        S1, S2, Name);
+  }
+  return new FCmpInst(*InsertAtEnd, CmpInst::Predicate(predicate),
+                      S1, S2, Name);
+}
+
+void CmpInst::swapOperands() {
+  if (ICmpInst *IC = dyn_cast<ICmpInst>(this))
+    IC->swapOperands();
+  else
+    cast<FCmpInst>(this)->swapOperands();
+}
+
+bool CmpInst::isCommutative() const {
+  if (const ICmpInst *IC = dyn_cast<ICmpInst>(this))
+    return IC->isCommutative();
+  return cast<FCmpInst>(this)->isCommutative();
+}
+
+bool CmpInst::isEquality() const {
+  if (const ICmpInst *IC = dyn_cast<ICmpInst>(this))
+    return IC->isEquality();
+  return cast<FCmpInst>(this)->isEquality();
+}
+
+
+CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) {
+  switch (pred) {
+    default: llvm_unreachable("Unknown cmp predicate!");
+    case ICMP_EQ: return ICMP_NE;
+    case ICMP_NE: return ICMP_EQ;
+    case ICMP_UGT: return ICMP_ULE;
+    case ICMP_ULT: return ICMP_UGE;
+    case ICMP_UGE: return ICMP_ULT;
+    case ICMP_ULE: return ICMP_UGT;
+    case ICMP_SGT: return ICMP_SLE;
+    case ICMP_SLT: return ICMP_SGE;
+    case ICMP_SGE: return ICMP_SLT;
+    case ICMP_SLE: return ICMP_SGT;
+
+    case FCMP_OEQ: return FCMP_UNE;
+    case FCMP_ONE: return FCMP_UEQ;
+    case FCMP_OGT: return FCMP_ULE;
+    case FCMP_OLT: return FCMP_UGE;
+    case FCMP_OGE: return FCMP_ULT;
+    case FCMP_OLE: return FCMP_UGT;
+    case FCMP_UEQ: return FCMP_ONE;
+    case FCMP_UNE: return FCMP_OEQ;
+    case FCMP_UGT: return FCMP_OLE;
+    case FCMP_ULT: return FCMP_OGE;
+    case FCMP_UGE: return FCMP_OLT;
+    case FCMP_ULE: return FCMP_OGT;
+    case FCMP_ORD: return FCMP_UNO;
+    case FCMP_UNO: return FCMP_ORD;
+    case FCMP_TRUE: return FCMP_FALSE;
+    case FCMP_FALSE: return FCMP_TRUE;
+  }
+}
+
+ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) {
+  switch (pred) {
+    default: llvm_unreachable("Unknown icmp predicate!");
+    case ICMP_EQ: case ICMP_NE: 
+    case ICMP_SGT: case ICMP_SLT: case ICMP_SGE: case ICMP_SLE: 
+       return pred;
+    case ICMP_UGT: return ICMP_SGT;
+    case ICMP_ULT: return ICMP_SLT;
+    case ICMP_UGE: return ICMP_SGE;
+    case ICMP_ULE: return ICMP_SLE;
+  }
+}
+
+ICmpInst::Predicate ICmpInst::getUnsignedPredicate(Predicate pred) {
+  switch (pred) {
+    default: llvm_unreachable("Unknown icmp predicate!");
+    case ICMP_EQ: case ICMP_NE: 
+    case ICMP_UGT: case ICMP_ULT: case ICMP_UGE: case ICMP_ULE: 
+       return pred;
+    case ICMP_SGT: return ICMP_UGT;
+    case ICMP_SLT: return ICMP_ULT;
+    case ICMP_SGE: return ICMP_UGE;
+    case ICMP_SLE: return ICMP_ULE;
+  }
+}
+
+/// Initialize a set of values that all satisfy the condition with C.
+///
+ConstantRange 
+ICmpInst::makeConstantRange(Predicate pred, const APInt &C) {
+  APInt Lower(C);
+  APInt Upper(C);
+  uint32_t BitWidth = C.getBitWidth();
+  switch (pred) {
+  default: llvm_unreachable("Invalid ICmp opcode to ConstantRange ctor!");
+  case ICmpInst::ICMP_EQ: ++Upper; break;
+  case ICmpInst::ICMP_NE: ++Lower; break;
+  case ICmpInst::ICMP_ULT:
+    Lower = APInt::getMinValue(BitWidth);
+    // Check for an empty-set condition.
+    if (Lower == Upper)
+      return ConstantRange(BitWidth, /*isFullSet=*/false);
+    break;
+  case ICmpInst::ICMP_SLT:
+    Lower = APInt::getSignedMinValue(BitWidth);
+    // Check for an empty-set condition.
+    if (Lower == Upper)
+      return ConstantRange(BitWidth, /*isFullSet=*/false);
+    break;
+  case ICmpInst::ICMP_UGT: 
+    ++Lower; Upper = APInt::getMinValue(BitWidth);        // Min = Next(Max)
+    // Check for an empty-set condition.
+    if (Lower == Upper)
+      return ConstantRange(BitWidth, /*isFullSet=*/false);
+    break;
+  case ICmpInst::ICMP_SGT:
+    ++Lower; Upper = APInt::getSignedMinValue(BitWidth);  // Min = Next(Max)
+    // Check for an empty-set condition.
+    if (Lower == Upper)
+      return ConstantRange(BitWidth, /*isFullSet=*/false);
+    break;
+  case ICmpInst::ICMP_ULE: 
+    Lower = APInt::getMinValue(BitWidth); ++Upper; 
+    // Check for a full-set condition.
+    if (Lower == Upper)
+      return ConstantRange(BitWidth, /*isFullSet=*/true);
+    break;
+  case ICmpInst::ICMP_SLE: 
+    Lower = APInt::getSignedMinValue(BitWidth); ++Upper; 
+    // Check for a full-set condition.
+    if (Lower == Upper)
+      return ConstantRange(BitWidth, /*isFullSet=*/true);
+    break;
+  case ICmpInst::ICMP_UGE:
+    Upper = APInt::getMinValue(BitWidth);        // Min = Next(Max)
+    // Check for a full-set condition.
+    if (Lower == Upper)
+      return ConstantRange(BitWidth, /*isFullSet=*/true);
+    break;
+  case ICmpInst::ICMP_SGE:
+    Upper = APInt::getSignedMinValue(BitWidth);  // Min = Next(Max)
+    // Check for a full-set condition.
+    if (Lower == Upper)
+      return ConstantRange(BitWidth, /*isFullSet=*/true);
+    break;
+  }
+  return ConstantRange(Lower, Upper);
+}
+
+CmpInst::Predicate CmpInst::getSwappedPredicate(Predicate pred) {
+  switch (pred) {
+    default: llvm_unreachable("Unknown cmp predicate!");
+    case ICMP_EQ: case ICMP_NE:
+      return pred;
+    case ICMP_SGT: return ICMP_SLT;
+    case ICMP_SLT: return ICMP_SGT;
+    case ICMP_SGE: return ICMP_SLE;
+    case ICMP_SLE: return ICMP_SGE;
+    case ICMP_UGT: return ICMP_ULT;
+    case ICMP_ULT: return ICMP_UGT;
+    case ICMP_UGE: return ICMP_ULE;
+    case ICMP_ULE: return ICMP_UGE;
+  
+    case FCMP_FALSE: case FCMP_TRUE:
+    case FCMP_OEQ: case FCMP_ONE:
+    case FCMP_UEQ: case FCMP_UNE:
+    case FCMP_ORD: case FCMP_UNO:
+      return pred;
+    case FCMP_OGT: return FCMP_OLT;
+    case FCMP_OLT: return FCMP_OGT;
+    case FCMP_OGE: return FCMP_OLE;
+    case FCMP_OLE: return FCMP_OGE;
+    case FCMP_UGT: return FCMP_ULT;
+    case FCMP_ULT: return FCMP_UGT;
+    case FCMP_UGE: return FCMP_ULE;
+    case FCMP_ULE: return FCMP_UGE;
+  }
+}
+
+bool CmpInst::isUnsigned(unsigned short predicate) {
+  switch (predicate) {
+    default: return false;
+    case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_UGT: 
+    case ICmpInst::ICMP_UGE: return true;
+  }
+}
+
+bool CmpInst::isSigned(unsigned short predicate) {
+  switch (predicate) {
+    default: return false;
+    case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_SGT: 
+    case ICmpInst::ICMP_SGE: return true;
+  }
+}
+
+bool CmpInst::isOrdered(unsigned short predicate) {
+  switch (predicate) {
+    default: return false;
+    case FCmpInst::FCMP_OEQ: case FCmpInst::FCMP_ONE: case FCmpInst::FCMP_OGT: 
+    case FCmpInst::FCMP_OLT: case FCmpInst::FCMP_OGE: case FCmpInst::FCMP_OLE: 
+    case FCmpInst::FCMP_ORD: return true;
+  }
+}
+      
+bool CmpInst::isUnordered(unsigned short predicate) {
+  switch (predicate) {
+    default: return false;
+    case FCmpInst::FCMP_UEQ: case FCmpInst::FCMP_UNE: case FCmpInst::FCMP_UGT: 
+    case FCmpInst::FCMP_ULT: case FCmpInst::FCMP_UGE: case FCmpInst::FCMP_ULE: 
+    case FCmpInst::FCMP_UNO: return true;
+  }
+}
+
+bool CmpInst::isTrueWhenEqual(unsigned short predicate) {
+  switch(predicate) {
+    default: return false;
+    case ICMP_EQ:   case ICMP_UGE: case ICMP_ULE: case ICMP_SGE: case ICMP_SLE:
+    case FCMP_TRUE: case FCMP_UEQ: case FCMP_UGE: case FCMP_ULE: return true;
+  }
+}
+
+bool CmpInst::isFalseWhenEqual(unsigned short predicate) {
+  switch(predicate) {
+  case ICMP_NE:    case ICMP_UGT: case ICMP_ULT: case ICMP_SGT: case ICMP_SLT:
+  case FCMP_FALSE: case FCMP_ONE: case FCMP_OGT: case FCMP_OLT: return true;
+  default: return false;
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//                        SwitchInst Implementation
+//===----------------------------------------------------------------------===//
+
+void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumReserved) {
+  assert(Value && Default && NumReserved);
+  ReservedSpace = NumReserved;
+  NumOperands = 2;
+  OperandList = allocHungoffUses(ReservedSpace);
+
+  OperandList[0] = Value;
+  OperandList[1] = Default;
+}
+
+/// SwitchInst ctor - Create a new switch instruction, specifying a value to
+/// switch on and a default destination.  The number of additional cases can
+/// be specified here to make memory allocation more efficient.  This
+/// constructor can also autoinsert before another instruction.
+SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
+                       Instruction *InsertBefore)
+  : TerminatorInst(Type::getVoidTy(Value->getContext()), Instruction::Switch,
+                   0, 0, InsertBefore) {
+  init(Value, Default, 2+NumCases*2);
+}
+
+/// SwitchInst ctor - Create a new switch instruction, specifying a value to
+/// switch on and a default destination.  The number of additional cases can
+/// be specified here to make memory allocation more efficient.  This
+/// constructor also autoinserts at the end of the specified BasicBlock.
+SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
+                       BasicBlock *InsertAtEnd)
+  : TerminatorInst(Type::getVoidTy(Value->getContext()), Instruction::Switch,
+                   0, 0, InsertAtEnd) {
+  init(Value, Default, 2+NumCases*2);
+}
+
+SwitchInst::SwitchInst(const SwitchInst &SI)
+  : TerminatorInst(SI.getType(), Instruction::Switch, 0, 0) {
+  init(SI.getCondition(), SI.getDefaultDest(), SI.getNumOperands());
+  NumOperands = SI.getNumOperands();
+  Use *OL = OperandList, *InOL = SI.OperandList;
+  for (unsigned i = 2, E = SI.getNumOperands(); i != E; i += 2) {
+    OL[i] = InOL[i];
+    OL[i+1] = InOL[i+1];
+  }
+  TheSubsets = SI.TheSubsets;
+  SubclassOptionalData = SI.SubclassOptionalData;
+}
+
+SwitchInst::~SwitchInst() {
+  dropHungoffUses();
+}
+
+
+/// addCase - Add an entry to the switch instruction...
+///
+void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
+  IntegersSubsetToBB Mapping;
+  
+  // FIXME: Currently we work with ConstantInt based cases.
+  // So inititalize IntItem container directly from ConstantInt.
+  Mapping.add(IntItem::fromConstantInt(OnVal));
+  IntegersSubset CaseRanges = Mapping.getCase();
+  addCase(CaseRanges, Dest);
+}
+
+void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
+  unsigned NewCaseIdx = getNumCases(); 
+  unsigned OpNo = NumOperands;
+  if (OpNo+2 > ReservedSpace)
+    growOperands();  // Get more space!
+  // Initialize some new operands.
+  assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
+  NumOperands = OpNo+2;
+
+  SubsetsIt TheSubsetsIt = TheSubsets.insert(TheSubsets.end(), OnVal);
+  
+  CaseIt Case(this, NewCaseIdx, TheSubsetsIt);
+  Case.updateCaseValueOperand(OnVal);
+  Case.setSuccessor(Dest);
+}
+
+/// removeCase - This method removes the specified case and its successor
+/// from the switch instruction.
+void SwitchInst::removeCase(CaseIt& i) {
+  unsigned idx = i.getCaseIndex();
+  
+  assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
+
+  unsigned NumOps = getNumOperands();
+  Use *OL = OperandList;
+
+  // Overwrite this case with the end of the list.
+  if (2 + (idx + 1) * 2 != NumOps) {
+    OL[2 + idx * 2] = OL[NumOps - 2];
+    OL[2 + idx * 2 + 1] = OL[NumOps - 1];
+  }
+
+  // Nuke the last value.
+  OL[NumOps-2].set(0);
+  OL[NumOps-2+1].set(0);
+
+  // Do the same with TheCases collection:
+  if (i.SubsetIt != --TheSubsets.end()) {
+    *i.SubsetIt = TheSubsets.back();
+    TheSubsets.pop_back();
+  } else {
+    TheSubsets.pop_back();
+    i.SubsetIt = TheSubsets.end();
+  }
+  
+  NumOperands = NumOps-2;
+}
+
+/// growOperands - grow operands - This grows the operand list in response
+/// to a push_back style of operation.  This grows the number of ops by 3 times.
+///
+void SwitchInst::growOperands() {
+  unsigned e = getNumOperands();
+  unsigned NumOps = e*3;
+
+  ReservedSpace = NumOps;
+  Use *NewOps = allocHungoffUses(NumOps);
+  Use *OldOps = OperandList;
+  for (unsigned i = 0; i != e; ++i) {
+      NewOps[i] = OldOps[i];
+  }
+  OperandList = NewOps;
+  Use::zap(OldOps, OldOps + e, true);
+}
+
+
+BasicBlock *SwitchInst::getSuccessorV(unsigned idx) const {
+  return getSuccessor(idx);
+}
+unsigned SwitchInst::getNumSuccessorsV() const {
+  return getNumSuccessors();
+}
+void SwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
+  setSuccessor(idx, B);
+}
+
+//===----------------------------------------------------------------------===//
+//                        IndirectBrInst Implementation
+//===----------------------------------------------------------------------===//
+
+void IndirectBrInst::init(Value *Address, unsigned NumDests) {
+  assert(Address && Address->getType()->isPointerTy() &&
+         "Address of indirectbr must be a pointer");
+  ReservedSpace = 1+NumDests;
+  NumOperands = 1;
+  OperandList = allocHungoffUses(ReservedSpace);
+  
+  OperandList[0] = Address;
+}
+
+
+/// growOperands - grow operands - This grows the operand list in response
+/// to a push_back style of operation.  This grows the number of ops by 2 times.
+///
+void IndirectBrInst::growOperands() {
+  unsigned e = getNumOperands();
+  unsigned NumOps = e*2;
+  
+  ReservedSpace = NumOps;
+  Use *NewOps = allocHungoffUses(NumOps);
+  Use *OldOps = OperandList;
+  for (unsigned i = 0; i != e; ++i)
+    NewOps[i] = OldOps[i];
+  OperandList = NewOps;
+  Use::zap(OldOps, OldOps + e, true);
+}
+
+IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
+                               Instruction *InsertBefore)
+: TerminatorInst(Type::getVoidTy(Address->getContext()),Instruction::IndirectBr,
+                 0, 0, InsertBefore) {
+  init(Address, NumCases);
+}
+
+IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
+                               BasicBlock *InsertAtEnd)
+: TerminatorInst(Type::getVoidTy(Address->getContext()),Instruction::IndirectBr,
+                 0, 0, InsertAtEnd) {
+  init(Address, NumCases);
+}
+
+IndirectBrInst::IndirectBrInst(const IndirectBrInst &IBI)
+  : TerminatorInst(Type::getVoidTy(IBI.getContext()), Instruction::IndirectBr,
+                   allocHungoffUses(IBI.getNumOperands()),
+                   IBI.getNumOperands()) {
+  Use *OL = OperandList, *InOL = IBI.OperandList;
+  for (unsigned i = 0, E = IBI.getNumOperands(); i != E; ++i)
+    OL[i] = InOL[i];
+  SubclassOptionalData = IBI.SubclassOptionalData;
+}
+
+IndirectBrInst::~IndirectBrInst() {
+  dropHungoffUses();
+}
+
+/// addDestination - Add a destination.
+///
+void IndirectBrInst::addDestination(BasicBlock *DestBB) {
+  unsigned OpNo = NumOperands;
+  if (OpNo+1 > ReservedSpace)
+    growOperands();  // Get more space!
+  // Initialize some new operands.
+  assert(OpNo < ReservedSpace && "Growing didn't work!");
+  NumOperands = OpNo+1;
+  OperandList[OpNo] = DestBB;
+}
+
+/// removeDestination - This method removes the specified successor from the
+/// indirectbr instruction.
+void IndirectBrInst::removeDestination(unsigned idx) {
+  assert(idx < getNumOperands()-1 && "Successor index out of range!");
+  
+  unsigned NumOps = getNumOperands();
+  Use *OL = OperandList;
+
+  // Replace this value with the last one.
+  OL[idx+1] = OL[NumOps-1];
+  
+  // Nuke the last value.
+  OL[NumOps-1].set(0);
+  NumOperands = NumOps-1;
+}
+
+BasicBlock *IndirectBrInst::getSuccessorV(unsigned idx) const {
+  return getSuccessor(idx);
+}
+unsigned IndirectBrInst::getNumSuccessorsV() const {
+  return getNumSuccessors();
+}
+void IndirectBrInst::setSuccessorV(unsigned idx, BasicBlock *B) {
+  setSuccessor(idx, B);
+}
+
+//===----------------------------------------------------------------------===//
+//                           clone_impl() implementations
+//===----------------------------------------------------------------------===//
+
+// Define these methods here so vtables don't get emitted into every translation
+// unit that uses these classes.
+
+GetElementPtrInst *GetElementPtrInst::clone_impl() const {
+  return new (getNumOperands()) GetElementPtrInst(*this);
+}
+
+BinaryOperator *BinaryOperator::clone_impl() const {
+  return Create(getOpcode(), Op<0>(), Op<1>());
+}
+
+FCmpInst* FCmpInst::clone_impl() const {
+  return new FCmpInst(getPredicate(), Op<0>(), Op<1>());
+}
+
+ICmpInst* ICmpInst::clone_impl() const {
+  return new ICmpInst(getPredicate(), Op<0>(), Op<1>());
+}
+
+ExtractValueInst *ExtractValueInst::clone_impl() const {
+  return new ExtractValueInst(*this);
+}
+
+InsertValueInst *InsertValueInst::clone_impl() const {
+  return new InsertValueInst(*this);
+}
+
+AllocaInst *AllocaInst::clone_impl() const {
+  return new AllocaInst(getAllocatedType(),
+                        (Value*)getOperand(0),
+                        getAlignment());
+}
+
+LoadInst *LoadInst::clone_impl() const {
+  return new LoadInst(getOperand(0), Twine(), isVolatile(),
+                      getAlignment(), getOrdering(), getSynchScope());
+}
+
+StoreInst *StoreInst::clone_impl() const {
+  return new StoreInst(getOperand(0), getOperand(1), isVolatile(),
+                       getAlignment(), getOrdering(), getSynchScope());
+  
+}
+
+AtomicCmpXchgInst *AtomicCmpXchgInst::clone_impl() const {
+  AtomicCmpXchgInst *Result =
+    new AtomicCmpXchgInst(getOperand(0), getOperand(1), getOperand(2),
+                          getOrdering(), getSynchScope());
+  Result->setVolatile(isVolatile());
+  return Result;
+}
+
+AtomicRMWInst *AtomicRMWInst::clone_impl() const {
+  AtomicRMWInst *Result =
+    new AtomicRMWInst(getOperation(),getOperand(0), getOperand(1),
+                      getOrdering(), getSynchScope());
+  Result->setVolatile(isVolatile());
+  return Result;
+}
+
+FenceInst *FenceInst::clone_impl() const {
+  return new FenceInst(getContext(), getOrdering(), getSynchScope());
+}
+
+TruncInst *TruncInst::clone_impl() const {
+  return new TruncInst(getOperand(0), getType());
+}
+
+ZExtInst *ZExtInst::clone_impl() const {
+  return new ZExtInst(getOperand(0), getType());
+}
+
+SExtInst *SExtInst::clone_impl() const {
+  return new SExtInst(getOperand(0), getType());
+}
+
+FPTruncInst *FPTruncInst::clone_impl() const {
+  return new FPTruncInst(getOperand(0), getType());
+}
+
+FPExtInst *FPExtInst::clone_impl() const {
+  return new FPExtInst(getOperand(0), getType());
+}
+
+UIToFPInst *UIToFPInst::clone_impl() const {
+  return new UIToFPInst(getOperand(0), getType());
+}
+
+SIToFPInst *SIToFPInst::clone_impl() const {
+  return new SIToFPInst(getOperand(0), getType());
+}
+
+FPToUIInst *FPToUIInst::clone_impl() const {
+  return new FPToUIInst(getOperand(0), getType());
+}
+
+FPToSIInst *FPToSIInst::clone_impl() const {
+  return new FPToSIInst(getOperand(0), getType());
+}
+
+PtrToIntInst *PtrToIntInst::clone_impl() const {
+  return new PtrToIntInst(getOperand(0), getType());
+}
+
+IntToPtrInst *IntToPtrInst::clone_impl() const {
+  return new IntToPtrInst(getOperand(0), getType());
+}
+
+BitCastInst *BitCastInst::clone_impl() const {
+  return new BitCastInst(getOperand(0), getType());
+}
+
+CallInst *CallInst::clone_impl() const {
+  return  new(getNumOperands()) CallInst(*this);
+}
+
+SelectInst *SelectInst::clone_impl() const {
+  return SelectInst::Create(getOperand(0), getOperand(1), getOperand(2));
+}
+
+VAArgInst *VAArgInst::clone_impl() const {
+  return new VAArgInst(getOperand(0), getType());
+}
+
+ExtractElementInst *ExtractElementInst::clone_impl() const {
+  return ExtractElementInst::Create(getOperand(0), getOperand(1));
+}
+
+InsertElementInst *InsertElementInst::clone_impl() const {
+  return InsertElementInst::Create(getOperand(0), getOperand(1), getOperand(2));
+}
+
+ShuffleVectorInst *ShuffleVectorInst::clone_impl() const {
+  return new ShuffleVectorInst(getOperand(0), getOperand(1), getOperand(2));
+}
+
+PHINode *PHINode::clone_impl() const {
+  return new PHINode(*this);
+}
+
+LandingPadInst *LandingPadInst::clone_impl() const {
+  return new LandingPadInst(*this);
+}
+
+ReturnInst *ReturnInst::clone_impl() const {
+  return new(getNumOperands()) ReturnInst(*this);
+}
+
+BranchInst *BranchInst::clone_impl() const {
+  return new(getNumOperands()) BranchInst(*this);
+}
+
+SwitchInst *SwitchInst::clone_impl() const {
+  return new SwitchInst(*this);
+}
+
+IndirectBrInst *IndirectBrInst::clone_impl() const {
+  return new IndirectBrInst(*this);
+}
+
+
+InvokeInst *InvokeInst::clone_impl() const {
+  return new(getNumOperands()) InvokeInst(*this);
+}
+
+ResumeInst *ResumeInst::clone_impl() const {
+  return new(1) ResumeInst(*this);
+}
+
+UnreachableInst *UnreachableInst::clone_impl() const {
+  LLVMContext &Context = getContext();
+  return new UnreachableInst(Context);
+}
diff --git a/contrib/llvm/lib/IR/IntrinsicInst.cpp b/contrib/llvm/lib/IR/IntrinsicInst.cpp
new file mode 100644
index 000000000000..51f88d2e6fbd
--- /dev/null
+++ b/contrib/llvm/lib/IR/IntrinsicInst.cpp
@@ -0,0 +1,73 @@
+//===-- InstrinsicInst.cpp - Intrinsic Instruction Wrappers -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements methods that make it really easy to deal with intrinsic
+// functions.
+//
+// All intrinsic function calls are instances of the call instruction, so these
+// are all subclasses of the CallInst class.  Note that none of these classes
+// has state or virtual methods, which is an important part of this gross/neat
+// hack working.
+// 
+// In some cases, arguments to intrinsics need to be generic and are defined as
+// type pointer to empty struct { }*.  To access the real item of interest the
+// cast instruction needs to be stripped away. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Metadata.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+/// DbgInfoIntrinsic - This is the common base class for debug info intrinsics
+///
+
+static Value *CastOperand(Value *C) {
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+    if (CE->isCast())
+      return CE->getOperand(0);
+  return NULL;
+}
+
+Value *DbgInfoIntrinsic::StripCast(Value *C) {
+  if (Value *CO = CastOperand(C)) {
+    C = StripCast(CO);
+  } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+    if (GV->hasInitializer())
+      if (Value *CO = CastOperand(GV->getInitializer()))
+        C = StripCast(CO);
+  }
+  return dyn_cast<GlobalVariable>(C);
+}
+
+//===----------------------------------------------------------------------===//
+/// DbgDeclareInst - This represents the llvm.dbg.declare instruction.
+///
+
+Value *DbgDeclareInst::getAddress() const {
+  if (MDNode* MD = cast_or_null<MDNode>(getArgOperand(0)))
+    return MD->getOperand(0);
+  else
+    return NULL;
+}
+
+//===----------------------------------------------------------------------===//
+/// DbgValueInst - This represents the llvm.dbg.value instruction.
+///
+
+const Value *DbgValueInst::getValue() const {
+  return cast<MDNode>(getArgOperand(0))->getOperand(0);
+}
+
+Value *DbgValueInst::getValue() {
+  return cast<MDNode>(getArgOperand(0))->getOperand(0);
+}
diff --git a/contrib/llvm/lib/IR/LLVMContext.cpp b/contrib/llvm/lib/IR/LLVMContext.cpp
new file mode 100644
index 000000000000..883bb9878fa5
--- /dev/null
+++ b/contrib/llvm/lib/IR/LLVMContext.cpp
@@ -0,0 +1,168 @@
+//===-- LLVMContext.cpp - Implement LLVMContext ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements LLVMContext, as a wrapper around the opaque
+//  class LLVMContextImpl.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/LLVMContext.h"
+#include "LLVMContextImpl.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/SourceMgr.h"
+#include <cctype>
+using namespace llvm;
+
+static ManagedStatic<LLVMContext> GlobalContext;
+
+LLVMContext& llvm::getGlobalContext() {
+  return *GlobalContext;
+}
+
+LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
+  // Create the fixed metadata kinds. This is done in the same order as the
+  // MD_* enum values so that they correspond.
+
+  // Create the 'dbg' metadata kind.
+  unsigned DbgID = getMDKindID("dbg");
+  assert(DbgID == MD_dbg && "dbg kind id drifted"); (void)DbgID;
+
+  // Create the 'tbaa' metadata kind.
+  unsigned TBAAID = getMDKindID("tbaa");
+  assert(TBAAID == MD_tbaa && "tbaa kind id drifted"); (void)TBAAID;
+
+  // Create the 'prof' metadata kind.
+  unsigned ProfID = getMDKindID("prof");
+  assert(ProfID == MD_prof && "prof kind id drifted"); (void)ProfID;
+
+  // Create the 'fpmath' metadata kind.
+  unsigned FPAccuracyID = getMDKindID("fpmath");
+  assert(FPAccuracyID == MD_fpmath && "fpmath kind id drifted");
+  (void)FPAccuracyID;
+
+  // Create the 'range' metadata kind.
+  unsigned RangeID = getMDKindID("range");
+  assert(RangeID == MD_range && "range kind id drifted");
+  (void)RangeID;
+
+  // Create the 'tbaa.struct' metadata kind.
+  unsigned TBAAStructID = getMDKindID("tbaa.struct");
+  assert(TBAAStructID == MD_tbaa_struct && "tbaa.struct kind id drifted");
+  (void)TBAAStructID;
+
+  // Create the 'invariant.load' metadata kind.
+  unsigned InvariantLdId = getMDKindID("invariant.load");
+  assert(InvariantLdId == MD_invariant_load && "invariant.load kind id drifted");
+  (void)InvariantLdId;
+}
+LLVMContext::~LLVMContext() { delete pImpl; }
+
+void LLVMContext::addModule(Module *M) {
+  pImpl->OwnedModules.insert(M);
+}
+
+void LLVMContext::removeModule(Module *M) {
+  pImpl->OwnedModules.erase(M);
+}
+
+//===----------------------------------------------------------------------===//
+// Recoverable Backend Errors
+//===----------------------------------------------------------------------===//
+
+void LLVMContext::
+setInlineAsmDiagnosticHandler(InlineAsmDiagHandlerTy DiagHandler,
+                              void *DiagContext) {
+  pImpl->InlineAsmDiagHandler = DiagHandler;
+  pImpl->InlineAsmDiagContext = DiagContext;
+}
+
+/// getInlineAsmDiagnosticHandler - Return the diagnostic handler set by
+/// setInlineAsmDiagnosticHandler.
+LLVMContext::InlineAsmDiagHandlerTy
+LLVMContext::getInlineAsmDiagnosticHandler() const {
+  return pImpl->InlineAsmDiagHandler;
+}
+
+/// getInlineAsmDiagnosticContext - Return the diagnostic context set by
+/// setInlineAsmDiagnosticHandler.
+void *LLVMContext::getInlineAsmDiagnosticContext() const {
+  return pImpl->InlineAsmDiagContext;
+}
+
+void LLVMContext::emitError(const Twine &ErrorStr) {
+  emitError(0U, ErrorStr);
+}
+
+void LLVMContext::emitError(const Instruction *I, const Twine &ErrorStr) {
+  unsigned LocCookie = 0;
+  if (const MDNode *SrcLoc = I->getMetadata("srcloc")) {
+    if (SrcLoc->getNumOperands() != 0)
+      if (const ConstantInt *CI = dyn_cast<ConstantInt>(SrcLoc->getOperand(0)))
+        LocCookie = CI->getZExtValue();
+  }
+  return emitError(LocCookie, ErrorStr);
+}
+
+void LLVMContext::emitError(unsigned LocCookie, const Twine &ErrorStr) {
+  // If there is no error handler installed, just print the error and exit.
+  if (pImpl->InlineAsmDiagHandler == 0) {
+    errs() << "error: " << ErrorStr << "\n";
+    exit(1);
+  }
+
+  // If we do have an error handler, we can report the error and keep going.
+  SMDiagnostic Diag("", SourceMgr::DK_Error, ErrorStr.str());
+
+  pImpl->InlineAsmDiagHandler(Diag, pImpl->InlineAsmDiagContext, LocCookie);
+}
+
+//===----------------------------------------------------------------------===//
+// Metadata Kind Uniquing
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+/// isValidName - Return true if Name is a valid custom metadata handler name.
+static bool isValidName(StringRef MDName) {
+  if (MDName.empty())
+    return false;
+
+  if (!std::isalpha(static_cast<unsigned char>(MDName[0])))
+    return false;
+
+  for (StringRef::iterator I = MDName.begin() + 1, E = MDName.end(); I != E;
+       ++I) {
+    if (!std::isalnum(static_cast<unsigned char>(*I)) && *I != '_' &&
+        *I != '-' && *I != '.')
+      return false;
+  }
+  return true;
+}
+#endif
+
+/// getMDKindID - Return a unique non-zero ID for the specified metadata kind.
+unsigned LLVMContext::getMDKindID(StringRef Name) const {
+  assert(isValidName(Name) && "Invalid MDNode name");
+
+  // If this is new, assign it its ID.
+  return
+    pImpl->CustomMDKindNames.GetOrCreateValue(
+      Name, pImpl->CustomMDKindNames.size()).second;
+}
+
+/// getHandlerNames - Populate client supplied smallvector using custome
+/// metadata name and ID.
+void LLVMContext::getMDKindNames(SmallVectorImpl<StringRef> &Names) const {
+  Names.resize(pImpl->CustomMDKindNames.size());
+  for (StringMap<unsigned>::const_iterator I = pImpl->CustomMDKindNames.begin(),
+       E = pImpl->CustomMDKindNames.end(); I != E; ++I)
+    Names[I->second] = I->first();
+}
diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.cpp b/contrib/llvm/lib/IR/LLVMContextImpl.cpp
new file mode 100644
index 000000000000..6a6a4d6801f0
--- /dev/null
+++ b/contrib/llvm/lib/IR/LLVMContextImpl.cpp
@@ -0,0 +1,156 @@
+//===-- LLVMContextImpl.cpp - Implement LLVMContextImpl -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements the opaque LLVMContextImpl.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LLVMContextImpl.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Module.h"
+#include <algorithm>
+using namespace llvm;
+
+LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
+  : TheTrueVal(0), TheFalseVal(0),
+    VoidTy(C, Type::VoidTyID),
+    LabelTy(C, Type::LabelTyID),
+    HalfTy(C, Type::HalfTyID),
+    FloatTy(C, Type::FloatTyID),
+    DoubleTy(C, Type::DoubleTyID),
+    MetadataTy(C, Type::MetadataTyID),
+    X86_FP80Ty(C, Type::X86_FP80TyID),
+    FP128Ty(C, Type::FP128TyID),
+    PPC_FP128Ty(C, Type::PPC_FP128TyID),
+    X86_MMXTy(C, Type::X86_MMXTyID),
+    Int1Ty(C, 1),
+    Int8Ty(C, 8),
+    Int16Ty(C, 16),
+    Int32Ty(C, 32),
+    Int64Ty(C, 64) {
+  InlineAsmDiagHandler = 0;
+  InlineAsmDiagContext = 0;
+  NamedStructTypesUniqueID = 0;
+}
+
+namespace {
+struct DropReferences {
+  // Takes the value_type of a ConstantUniqueMap's internal map, whose 'second'
+  // is a Constant*.
+  template<typename PairT>
+  void operator()(const PairT &P) {
+    P.second->dropAllReferences();
+  }
+};
+
+// Temporary - drops pair.first instead of second.
+struct DropFirst {
+  // Takes the value_type of a ConstantUniqueMap's internal map, whose 'second'
+  // is a Constant*.
+  template<typename PairT>
+  void operator()(const PairT &P) {
+    P.first->dropAllReferences();
+  }
+};
+}
+
+LLVMContextImpl::~LLVMContextImpl() {
+  // NOTE: We need to delete the contents of OwnedModules, but we have to
+  // duplicate it into a temporary vector, because the destructor of Module
+  // will try to remove itself from OwnedModules set.  This would cause
+  // iterator invalidation if we iterated on the set directly.
+  std::vector<Module*> Modules(OwnedModules.begin(), OwnedModules.end());
+  DeleteContainerPointers(Modules);
+  
+  // Free the constants.  This is important to do here to ensure that they are
+  // freed before the LeakDetector is torn down.
+  std::for_each(ExprConstants.map_begin(), ExprConstants.map_end(),
+                DropReferences());
+  std::for_each(ArrayConstants.map_begin(), ArrayConstants.map_end(),
+                DropFirst());
+  std::for_each(StructConstants.map_begin(), StructConstants.map_end(),
+                DropFirst());
+  std::for_each(VectorConstants.map_begin(), VectorConstants.map_end(),
+                DropFirst());
+  ExprConstants.freeConstants();
+  ArrayConstants.freeConstants();
+  StructConstants.freeConstants();
+  VectorConstants.freeConstants();
+  DeleteContainerSeconds(CAZConstants);
+  DeleteContainerSeconds(CPNConstants);
+  DeleteContainerSeconds(UVConstants);
+  InlineAsms.freeConstants();
+  DeleteContainerSeconds(IntConstants);
+  DeleteContainerSeconds(FPConstants);
+  
+  for (StringMap<ConstantDataSequential*>::iterator I = CDSConstants.begin(),
+       E = CDSConstants.end(); I != E; ++I)
+    delete I->second;
+  CDSConstants.clear();
+
+  // Destroy attributes.
+  for (FoldingSetIterator<AttributeImpl> I = AttrsSet.begin(),
+         E = AttrsSet.end(); I != E; ) {
+    FoldingSetIterator<AttributeImpl> Elem = I++;
+    delete &*Elem;
+  }
+
+  // Destroy attribute lists.
+  for (FoldingSetIterator<AttributeSetImpl> I = AttrsLists.begin(),
+         E = AttrsLists.end(); I != E; ) {
+    FoldingSetIterator<AttributeSetImpl> Elem = I++;
+    delete &*Elem;
+  }
+
+  // Destroy attribute node lists.
+  for (FoldingSetIterator<AttributeSetNode> I = AttrsSetNodes.begin(),
+         E = AttrsSetNodes.end(); I != E; ) {
+    FoldingSetIterator<AttributeSetNode> Elem = I++;
+    delete &*Elem;
+  }
+
+  // Destroy MDNodes.  ~MDNode can move and remove nodes between the MDNodeSet
+  // and the NonUniquedMDNodes sets, so copy the values out first.
+  SmallVector<MDNode*, 8> MDNodes;
+  MDNodes.reserve(MDNodeSet.size() + NonUniquedMDNodes.size());
+  for (FoldingSetIterator<MDNode> I = MDNodeSet.begin(), E = MDNodeSet.end();
+       I != E; ++I)
+    MDNodes.push_back(&*I);
+  MDNodes.append(NonUniquedMDNodes.begin(), NonUniquedMDNodes.end());
+  for (SmallVectorImpl<MDNode *>::iterator I = MDNodes.begin(),
+         E = MDNodes.end(); I != E; ++I)
+    (*I)->destroy();
+  assert(MDNodeSet.empty() && NonUniquedMDNodes.empty() &&
+         "Destroying all MDNodes didn't empty the Context's sets.");
+
+  // Destroy MDStrings.
+  DeleteContainerSeconds(MDStringCache);
+}
+
+// ConstantsContext anchors
+void UnaryConstantExpr::anchor() { }
+
+void BinaryConstantExpr::anchor() { }
+
+void SelectConstantExpr::anchor() { }
+
+void ExtractElementConstantExpr::anchor() { }
+
+void InsertElementConstantExpr::anchor() { }
+
+void ShuffleVectorConstantExpr::anchor() { }
+
+void ExtractValueConstantExpr::anchor() { }
+
+void InsertValueConstantExpr::anchor() { }
+
+void GetElementPtrConstantExpr::anchor() { }
+
+void CompareConstantExpr::anchor() { }
diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.h b/contrib/llvm/lib/IR/LLVMContextImpl.h
new file mode 100644
index 000000000000..0c659b81b706
--- /dev/null
+++ b/contrib/llvm/lib/IR/LLVMContextImpl.h
@@ -0,0 +1,367 @@
+//===-- LLVMContextImpl.h - The LLVMContextImpl opaque class ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file declares LLVMContextImpl, the opaque implementation 
+//  of LLVMContext.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LLVMCONTEXT_IMPL_H
+#define LLVM_LLVMCONTEXT_IMPL_H
+
+#include "AttributeImpl.h"
+#include "ConstantsContext.h"
+#include "LeaksContext.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/ValueHandle.h"
+#include <vector>
+
+namespace llvm {
+
+class ConstantInt;
+class ConstantFP;
+class LLVMContext;
+class Type;
+class Value;
+
+struct DenseMapAPIntKeyInfo {
+  struct KeyTy {
+    APInt val;
+    Type* type;
+    KeyTy(const APInt& V, Type* Ty) : val(V), type(Ty) {}
+    bool operator==(const KeyTy& that) const {
+      return type == that.type && this->val == that.val;
+    }
+    bool operator!=(const KeyTy& that) const {
+      return !this->operator==(that);
+    }
+    friend hash_code hash_value(const KeyTy &Key) {
+      return hash_combine(Key.type, Key.val);
+    }
+  };
+  static inline KeyTy getEmptyKey() { return KeyTy(APInt(1,0), 0); }
+  static inline KeyTy getTombstoneKey() { return KeyTy(APInt(1,1), 0); }
+  static unsigned getHashValue(const KeyTy &Key) {
+    return static_cast<unsigned>(hash_value(Key));
+  }
+  static bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
+    return LHS == RHS;
+  }
+};
+
+struct DenseMapAPFloatKeyInfo {
+  struct KeyTy {
+    APFloat val;
+    KeyTy(const APFloat& V) : val(V){}
+    bool operator==(const KeyTy& that) const {
+      return this->val.bitwiseIsEqual(that.val);
+    }
+    bool operator!=(const KeyTy& that) const {
+      return !this->operator==(that);
+    }
+    friend hash_code hash_value(const KeyTy &Key) {
+      return hash_combine(Key.val);
+    }
+  };
+  static inline KeyTy getEmptyKey() { 
+    return KeyTy(APFloat(APFloat::Bogus,1));
+  }
+  static inline KeyTy getTombstoneKey() { 
+    return KeyTy(APFloat(APFloat::Bogus,2)); 
+  }
+  static unsigned getHashValue(const KeyTy &Key) {
+    return static_cast<unsigned>(hash_value(Key));
+  }
+  static bool isEqual(const KeyTy &LHS, const KeyTy &RHS) {
+    return LHS == RHS;
+  }
+};
+
+struct AnonStructTypeKeyInfo {
+  struct KeyTy {
+    ArrayRef<Type*> ETypes;
+    bool isPacked;
+    KeyTy(const ArrayRef<Type*>& E, bool P) :
+      ETypes(E), isPacked(P) {}
+    KeyTy(const StructType* ST) :
+      ETypes(ArrayRef<Type*>(ST->element_begin(), ST->element_end())),
+      isPacked(ST->isPacked()) {}
+    bool operator==(const KeyTy& that) const {
+      if (isPacked != that.isPacked)
+        return false;
+      if (ETypes != that.ETypes)
+        return false;
+      return true;
+    }
+    bool operator!=(const KeyTy& that) const {
+      return !this->operator==(that);
+    }
+  };
+  static inline StructType* getEmptyKey() {
+    return DenseMapInfo<StructType*>::getEmptyKey();
+  }
+  static inline StructType* getTombstoneKey() {
+    return DenseMapInfo<StructType*>::getTombstoneKey();
+  }
+  static unsigned getHashValue(const KeyTy& Key) {
+    return hash_combine(hash_combine_range(Key.ETypes.begin(),
+                                           Key.ETypes.end()),
+                        Key.isPacked);
+  }
+  static unsigned getHashValue(const StructType *ST) {
+    return getHashValue(KeyTy(ST));
+  }
+  static bool isEqual(const KeyTy& LHS, const StructType *RHS) {
+    if (RHS == getEmptyKey() || RHS == getTombstoneKey())
+      return false;
+    return LHS == KeyTy(RHS);
+  }
+  static bool isEqual(const StructType *LHS, const StructType *RHS) {
+    return LHS == RHS;
+  }
+};
+
+struct FunctionTypeKeyInfo {
+  struct KeyTy {
+    const Type *ReturnType;
+    ArrayRef<Type*> Params;
+    bool isVarArg;
+    KeyTy(const Type* R, const ArrayRef<Type*>& P, bool V) :
+      ReturnType(R), Params(P), isVarArg(V) {}
+    KeyTy(const FunctionType* FT) :
+      ReturnType(FT->getReturnType()),
+      Params(ArrayRef<Type*>(FT->param_begin(), FT->param_end())),
+      isVarArg(FT->isVarArg()) {}
+    bool operator==(const KeyTy& that) const {
+      if (ReturnType != that.ReturnType)
+        return false;
+      if (isVarArg != that.isVarArg)
+        return false;
+      if (Params != that.Params)
+        return false;
+      return true;
+    }
+    bool operator!=(const KeyTy& that) const {
+      return !this->operator==(that);
+    }
+  };
+  static inline FunctionType* getEmptyKey() {
+    return DenseMapInfo<FunctionType*>::getEmptyKey();
+  }
+  static inline FunctionType* getTombstoneKey() {
+    return DenseMapInfo<FunctionType*>::getTombstoneKey();
+  }
+  static unsigned getHashValue(const KeyTy& Key) {
+    return hash_combine(Key.ReturnType,
+                        hash_combine_range(Key.Params.begin(),
+                                           Key.Params.end()),
+                        Key.isVarArg);
+  }
+  static unsigned getHashValue(const FunctionType *FT) {
+    return getHashValue(KeyTy(FT));
+  }
+  static bool isEqual(const KeyTy& LHS, const FunctionType *RHS) {
+    if (RHS == getEmptyKey() || RHS == getTombstoneKey())
+      return false;
+    return LHS == KeyTy(RHS);
+  }
+  static bool isEqual(const FunctionType *LHS, const FunctionType *RHS) {
+    return LHS == RHS;
+  }
+};
+
+// Provide a FoldingSetTrait::Equals specialization for MDNode that can use a
+// shortcut to avoid comparing all operands.
+template<> struct FoldingSetTrait<MDNode> : DefaultFoldingSetTrait<MDNode> {
+  static bool Equals(const MDNode &X, const FoldingSetNodeID &ID,
+                     unsigned IDHash, FoldingSetNodeID &TempID) {
+    assert(!X.isNotUniqued() && "Non-uniqued MDNode in FoldingSet?");
+    // First, check if the cached hashes match.  If they don't we can skip the
+    // expensive operand walk.
+    if (X.Hash != IDHash)
+      return false;
+
+    // If they match we have to compare the operands.
+    X.Profile(TempID);
+    return TempID == ID;
+  }
+  static unsigned ComputeHash(const MDNode &X, FoldingSetNodeID &) {
+    return X.Hash; // Return cached hash.
+  }
+};
+
+/// DebugRecVH - This is a CallbackVH used to keep the Scope -> index maps
+/// up to date as MDNodes mutate.  This class is implemented in DebugLoc.cpp.
+class DebugRecVH : public CallbackVH {
+  /// Ctx - This is the LLVM Context being referenced.
+  LLVMContextImpl *Ctx;
+  
+  /// Idx - The index into either ScopeRecordIdx or ScopeInlinedAtRecords that
+  /// this reference lives in.  If this is zero, then it represents a
+  /// non-canonical entry that has no DenseMap value.  This can happen due to
+  /// RAUW.
+  int Idx;
+public:
+  DebugRecVH(MDNode *n, LLVMContextImpl *ctx, int idx)
+    : CallbackVH(n), Ctx(ctx), Idx(idx) {}
+  
+  MDNode *get() const {
+    return cast_or_null<MDNode>(getValPtr());
+  }
+  
+  virtual void deleted();
+  virtual void allUsesReplacedWith(Value *VNew);
+};
+  
+class LLVMContextImpl {
+public:
+  /// OwnedModules - The set of modules instantiated in this context, and which
+  /// will be automatically deleted if this context is deleted.
+  SmallPtrSet<Module*, 4> OwnedModules;
+  
+  LLVMContext::InlineAsmDiagHandlerTy InlineAsmDiagHandler;
+  void *InlineAsmDiagContext;
+  
+  typedef DenseMap<DenseMapAPIntKeyInfo::KeyTy, ConstantInt*, 
+                         DenseMapAPIntKeyInfo> IntMapTy;
+  IntMapTy IntConstants;
+  
+  typedef DenseMap<DenseMapAPFloatKeyInfo::KeyTy, ConstantFP*, 
+                         DenseMapAPFloatKeyInfo> FPMapTy;
+  FPMapTy FPConstants;
+
+  FoldingSet<AttributeImpl> AttrsSet;
+  FoldingSet<AttributeSetImpl> AttrsLists;
+  FoldingSet<AttributeSetNode> AttrsSetNodes;
+
+  StringMap<Value*> MDStringCache;
+
+  FoldingSet<MDNode> MDNodeSet;
+
+  // MDNodes may be uniqued or not uniqued.  When they're not uniqued, they
+  // aren't in the MDNodeSet, but they're still shared between objects, so no
+  // one object can destroy them.  This set allows us to at least destroy them
+  // on Context destruction.
+  SmallPtrSet<MDNode*, 1> NonUniquedMDNodes;
+  
+  DenseMap<Type*, ConstantAggregateZero*> CAZConstants;
+
+  typedef ConstantAggrUniqueMap<ArrayType, ConstantArray> ArrayConstantsTy;
+  ArrayConstantsTy ArrayConstants;
+  
+  typedef ConstantAggrUniqueMap<StructType, ConstantStruct> StructConstantsTy;
+  StructConstantsTy StructConstants;
+  
+  typedef ConstantAggrUniqueMap<VectorType, ConstantVector> VectorConstantsTy;
+  VectorConstantsTy VectorConstants;
+  
+  DenseMap<PointerType*, ConstantPointerNull*> CPNConstants;
+
+  DenseMap<Type*, UndefValue*> UVConstants;
+  
+  StringMap<ConstantDataSequential*> CDSConstants;
+
+  
+  DenseMap<std::pair<Function*, BasicBlock*> , BlockAddress*> BlockAddresses;
+  ConstantUniqueMap<ExprMapKeyType, const ExprMapKeyType&, Type, ConstantExpr>
+    ExprConstants;
+
+  ConstantUniqueMap<InlineAsmKeyType, const InlineAsmKeyType&, PointerType,
+                    InlineAsm> InlineAsms;
+  
+  ConstantInt *TheTrueVal;
+  ConstantInt *TheFalseVal;
+  
+  LeakDetectorImpl<Value> LLVMObjects;
+  
+  // Basic type instances.
+  Type VoidTy, LabelTy, HalfTy, FloatTy, DoubleTy, MetadataTy;
+  Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy;
+  IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty;
+
+  
+  /// TypeAllocator - All dynamically allocated types are allocated from this.
+  /// They live forever until the context is torn down.
+  BumpPtrAllocator TypeAllocator;
+  
+  DenseMap<unsigned, IntegerType*> IntegerTypes;
+  
+  typedef DenseMap<FunctionType*, bool, FunctionTypeKeyInfo> FunctionTypeMap;
+  FunctionTypeMap FunctionTypes;
+  typedef DenseMap<StructType*, bool, AnonStructTypeKeyInfo> StructTypeMap;
+  StructTypeMap AnonStructTypes;
+  StringMap<StructType*> NamedStructTypes;
+  unsigned NamedStructTypesUniqueID;
+    
+  DenseMap<std::pair<Type *, uint64_t>, ArrayType*> ArrayTypes;
+  DenseMap<std::pair<Type *, unsigned>, VectorType*> VectorTypes;
+  DenseMap<Type*, PointerType*> PointerTypes;  // Pointers in AddrSpace = 0
+  DenseMap<std::pair<Type*, unsigned>, PointerType*> ASPointerTypes;
+
+
+  /// ValueHandles - This map keeps track of all of the value handles that are
+  /// watching a Value*.  The Value::HasValueHandle bit is used to know
+  /// whether or not a value has an entry in this map.
+  typedef DenseMap<Value*, ValueHandleBase*> ValueHandlesTy;
+  ValueHandlesTy ValueHandles;
+  
+  /// CustomMDKindNames - Map to hold the metadata string to ID mapping.
+  StringMap<unsigned> CustomMDKindNames;
+  
+  typedef std::pair<unsigned, TrackingVH<MDNode> > MDPairTy;
+  typedef SmallVector<MDPairTy, 2> MDMapTy;
+
+  /// MetadataStore - Collection of per-instruction metadata used in this
+  /// context.
+  DenseMap<const Instruction *, MDMapTy> MetadataStore;
+  
+  /// ScopeRecordIdx - This is the index in ScopeRecords for an MDNode scope
+  /// entry with no "inlined at" element.
+  DenseMap<MDNode*, int> ScopeRecordIdx;
+  
+  /// ScopeRecords - These are the actual mdnodes (in a value handle) for an
+  /// index.  The ValueHandle ensures that ScopeRecordIdx stays up to date if
+  /// the MDNode is RAUW'd.
+  std::vector<DebugRecVH> ScopeRecords;
+  
+  /// ScopeInlinedAtIdx - This is the index in ScopeInlinedAtRecords for an
+  /// scope/inlined-at pair.
+  DenseMap<std::pair<MDNode*, MDNode*>, int> ScopeInlinedAtIdx;
+  
+  /// ScopeInlinedAtRecords - These are the actual mdnodes (in value handles)
+  /// for an index.  The ValueHandle ensures that ScopeINlinedAtIdx stays up
+  /// to date.
+  std::vector<std::pair<DebugRecVH, DebugRecVH> > ScopeInlinedAtRecords;
+  
+  /// IntrinsicIDCache - Cache of intrinsic name (string) to numeric ID mappings
+  /// requested in this context
+  typedef DenseMap<const Function*, unsigned> IntrinsicIDCacheTy;
+  IntrinsicIDCacheTy IntrinsicIDCache;
+
+  int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx);
+  int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx);
+  
+  LLVMContextImpl(LLVMContext &C);
+  ~LLVMContextImpl();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/IR/LeakDetector.cpp b/contrib/llvm/lib/IR/LeakDetector.cpp
new file mode 100644
index 000000000000..835e5e61cdf9
--- /dev/null
+++ b/contrib/llvm/lib/IR/LeakDetector.cpp
@@ -0,0 +1,69 @@
+//===-- LeakDetector.cpp - Implement LeakDetector interface ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LeakDetector class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/LeakDetector.h"
+#include "LLVMContextImpl.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/Threading.h"
+using namespace llvm;
+
+static ManagedStatic<sys::SmartMutex<true> > ObjectsLock;
+static ManagedStatic<LeakDetectorImpl<void> > Objects;
+
+static void clearGarbage(LLVMContext &Context) {
+  Objects->clear();
+  Context.pImpl->LLVMObjects.clear();
+}
+
+void LeakDetector::addGarbageObjectImpl(void *Object) {
+  sys::SmartScopedLock<true> Lock(*ObjectsLock);
+  Objects->addGarbage(Object);
+}
+
+void LeakDetector::addGarbageObjectImpl(const Value *Object) {
+  LLVMContextImpl *pImpl = Object->getContext().pImpl;
+  pImpl->LLVMObjects.addGarbage(Object);
+}
+
+void LeakDetector::removeGarbageObjectImpl(void *Object) {
+  sys::SmartScopedLock<true> Lock(*ObjectsLock);
+  Objects->removeGarbage(Object);
+}
+
+void LeakDetector::removeGarbageObjectImpl(const Value *Object) {
+  LLVMContextImpl *pImpl = Object->getContext().pImpl;
+  pImpl->LLVMObjects.removeGarbage(Object);
+}
+
+void LeakDetector::checkForGarbageImpl(LLVMContext &Context, 
+                                       const std::string &Message) {
+  LLVMContextImpl *pImpl = Context.pImpl;
+  sys::SmartScopedLock<true> Lock(*ObjectsLock);
+  
+  Objects->setName("GENERIC");
+  pImpl->LLVMObjects.setName("LLVM");
+  
+  // use non-short-circuit version so that both checks are performed
+  if (Objects->hasGarbage(Message) |
+      pImpl->LLVMObjects.hasGarbage(Message))
+    errs() << "\nThis is probably because you removed an object, but didn't "
+           << "delete it.  Please check your code for memory leaks.\n";
+
+  // Clear out results so we don't get duplicate warnings on
+  // next call...
+  clearGarbage(Context);
+}
diff --git a/contrib/llvm/lib/IR/LeaksContext.h b/contrib/llvm/lib/IR/LeaksContext.h
new file mode 100644
index 000000000000..5038dc9d6d6d
--- /dev/null
+++ b/contrib/llvm/lib/IR/LeaksContext.h
@@ -0,0 +1,92 @@
+//===- LeaksContext.h - LeadDetector Implementation ------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines various helper methods and classes used by
+// LLVMContextImpl for leaks detectors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Value.h"
+
+namespace llvm {
+
+template <class T>
+struct PrinterTrait {
+  static void print(const T* P) { errs() << P; }
+};
+
+template<>
+struct PrinterTrait<Value> {
+  static void print(const Value* P) { errs() << *P; }
+};
+
+template <typename T>
+struct LeakDetectorImpl {
+  explicit LeakDetectorImpl(const char* const name = "") : 
+    Cache(0), Name(name) { }
+
+  void clear() {
+    Cache = 0;
+    Ts.clear();
+  }
+    
+  void setName(const char* n) { 
+    Name = n;
+  }
+    
+  // Because the most common usage pattern, by far, is to add a
+  // garbage object, then remove it immediately, we optimize this
+  // case.  When an object is added, it is not added to the set
+  // immediately, it is added to the CachedValue Value.  If it is
+  // immediately removed, no set search need be performed.
+  void addGarbage(const T* o) {
+    assert(Ts.count(o) == 0 && "Object already in set!");
+    if (Cache) {
+      assert(Cache != o && "Object already in set!");
+      Ts.insert(Cache);
+    }
+    Cache = o;
+  }
+
+  void removeGarbage(const T* o) {
+    if (o == Cache)
+      Cache = 0; // Cache hit
+    else
+      Ts.erase(o);
+  }
+
+  bool hasGarbage(const std::string& Message) {
+    addGarbage(0); // Flush the Cache
+
+    assert(Cache == 0 && "No value should be cached anymore!");
+
+    if (!Ts.empty()) {
+      errs() << "Leaked " << Name << " objects found: " << Message << ":\n";
+      for (typename SmallPtrSet<const T*, 8>::iterator I = Ts.begin(),
+           E = Ts.end(); I != E; ++I) {
+        errs() << '\t';
+        PrinterTrait<T>::print(*I);
+        errs() << '\n';
+      }
+      errs() << '\n';
+
+      return true;
+    }
+    
+    return false;
+  }
+
+private:
+  SmallPtrSet<const T*, 8> Ts;
+  const T* Cache;
+  const char* Name;
+};
+
+}
diff --git a/contrib/llvm/lib/IR/Metadata.cpp b/contrib/llvm/lib/IR/Metadata.cpp
new file mode 100644
index 000000000000..0228aeb31f5d
--- /dev/null
+++ b/contrib/llvm/lib/IR/Metadata.cpp
@@ -0,0 +1,745 @@
+//===-- Metadata.cpp - Implement Metadata classes -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Metadata classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Metadata.h"
+#include "LLVMContextImpl.h"
+#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/ValueHandle.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MDString implementation.
+//
+
+void MDString::anchor() { }
+
+MDString::MDString(LLVMContext &C)
+  : Value(Type::getMetadataTy(C), Value::MDStringVal) {}
+
+MDString *MDString::get(LLVMContext &Context, StringRef Str) {
+  LLVMContextImpl *pImpl = Context.pImpl;
+  StringMapEntry<Value*> &Entry =
+    pImpl->MDStringCache.GetOrCreateValue(Str);
+  Value *&S = Entry.getValue();
+  if (!S) S = new MDString(Context);
+  S->setValueName(&Entry);
+  return cast<MDString>(S);
+}
+
+//===----------------------------------------------------------------------===//
+// MDNodeOperand implementation.
+//
+
+// Use CallbackVH to hold MDNode operands.
+namespace llvm {
+class MDNodeOperand : public CallbackVH {
+  MDNode *getParent() {
+    MDNodeOperand *Cur = this;
+
+    while (Cur->getValPtrInt() != 1)
+      --Cur;
+
+    assert(Cur->getValPtrInt() == 1 &&
+           "Couldn't find the beginning of the operand list!");
+    return reinterpret_cast<MDNode*>(Cur) - 1;
+  }
+
+public:
+  MDNodeOperand(Value *V) : CallbackVH(V) {}
+  ~MDNodeOperand() {}
+
+  void set(Value *V) {
+    unsigned IsFirst = this->getValPtrInt();
+    this->setValPtr(V);
+    this->setAsFirstOperand(IsFirst);
+  }
+
+  /// setAsFirstOperand - Accessor method to mark the operand as the first in
+  /// the list.
+  void setAsFirstOperand(unsigned V) { this->setValPtrInt(V); }
+
+  virtual void deleted();
+  virtual void allUsesReplacedWith(Value *NV);
+};
+} // end namespace llvm.
+
+
+void MDNodeOperand::deleted() {
+  getParent()->replaceOperand(this, 0);
+}
+
+void MDNodeOperand::allUsesReplacedWith(Value *NV) {
+  getParent()->replaceOperand(this, NV);
+}
+
+//===----------------------------------------------------------------------===//
+// MDNode implementation.
+//
+
+/// getOperandPtr - Helper function to get the MDNodeOperand's coallocated on
+/// the end of the MDNode.
+static MDNodeOperand *getOperandPtr(MDNode *N, unsigned Op) {
+  // Use <= instead of < to permit a one-past-the-end address.
+  assert(Op <= N->getNumOperands() && "Invalid operand number");
+  return reinterpret_cast<MDNodeOperand*>(N + 1) + Op;
+}
+
+void MDNode::replaceOperandWith(unsigned i, Value *Val) {
+  MDNodeOperand *Op = getOperandPtr(this, i);
+  replaceOperand(Op, Val);
+}
+
+MDNode::MDNode(LLVMContext &C, ArrayRef<Value*> Vals, bool isFunctionLocal)
+: Value(Type::getMetadataTy(C), Value::MDNodeVal) {
+  NumOperands = Vals.size();
+
+  if (isFunctionLocal)
+    setValueSubclassData(getSubclassDataFromValue() | FunctionLocalBit);
+
+  // Initialize the operand list, which is co-allocated on the end of the node.
+  unsigned i = 0;
+  for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op+NumOperands;
+       Op != E; ++Op, ++i) {
+    new (Op) MDNodeOperand(Vals[i]);
+
+    // Mark the first MDNodeOperand as being the first in the list of operands.
+    if (i == 0)
+      Op->setAsFirstOperand(1);
+  }
+}
+
+/// ~MDNode - Destroy MDNode.
+MDNode::~MDNode() {
+  assert((getSubclassDataFromValue() & DestroyFlag) != 0 &&
+         "Not being destroyed through destroy()?");
+  LLVMContextImpl *pImpl = getType()->getContext().pImpl;
+  if (isNotUniqued()) {
+    pImpl->NonUniquedMDNodes.erase(this);
+  } else {
+    pImpl->MDNodeSet.RemoveNode(this);
+  }
+
+  // Destroy the operands.
+  for (MDNodeOperand *Op = getOperandPtr(this, 0), *E = Op+NumOperands;
+       Op != E; ++Op)
+    Op->~MDNodeOperand();
+}
+
+static const Function *getFunctionForValue(Value *V) {
+  if (!V) return NULL;
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    BasicBlock *BB = I->getParent();
+    return BB ? BB->getParent() : 0;
+  }
+  if (Argument *A = dyn_cast<Argument>(V))
+    return A->getParent();
+  if (BasicBlock *BB = dyn_cast<BasicBlock>(V))
+    return BB->getParent();
+  if (MDNode *MD = dyn_cast<MDNode>(V))
+    return MD->getFunction();
+  return NULL;
+}
+
+#ifndef NDEBUG
+static const Function *assertLocalFunction(const MDNode *N) {
+  if (!N->isFunctionLocal()) return 0;
+
+  // FIXME: This does not handle cyclic function local metadata.
+  const Function *F = 0, *NewF = 0;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (Value *V = N->getOperand(i)) {
+      if (MDNode *MD = dyn_cast<MDNode>(V))
+        NewF = assertLocalFunction(MD);
+      else
+        NewF = getFunctionForValue(V);
+    }
+    if (F == 0)
+      F = NewF;
+    else 
+      assert((NewF == 0 || F == NewF) &&"inconsistent function-local metadata");
+  }
+  return F;
+}
+#endif
+
+// getFunction - If this metadata is function-local and recursively has a
+// function-local operand, return the first such operand's parent function.
+// Otherwise, return null. getFunction() should not be used for performance-
+// critical code because it recursively visits all the MDNode's operands.  
+const Function *MDNode::getFunction() const {
+#ifndef NDEBUG
+  return assertLocalFunction(this);
+#else
+  if (!isFunctionLocal()) return NULL;
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    if (const Function *F = getFunctionForValue(getOperand(i)))
+      return F;
+  return NULL;
+#endif
+}
+
+// destroy - Delete this node.  Only when there are no uses.
+void MDNode::destroy() {
+  setValueSubclassData(getSubclassDataFromValue() | DestroyFlag);
+  // Placement delete, then free the memory.
+  this->~MDNode();
+  free(this);
+}
+
+/// isFunctionLocalValue - Return true if this is a value that would require a
+/// function-local MDNode.
+static bool isFunctionLocalValue(Value *V) {
+  return isa<Instruction>(V) || isa<Argument>(V) || isa<BasicBlock>(V) ||
+         (isa<MDNode>(V) && cast<MDNode>(V)->isFunctionLocal());
+}
+
+MDNode *MDNode::getMDNode(LLVMContext &Context, ArrayRef<Value*> Vals,
+                          FunctionLocalness FL, bool Insert) {
+  LLVMContextImpl *pImpl = Context.pImpl;
+
+  // Add all the operand pointers. Note that we don't have to add the
+  // isFunctionLocal bit because that's implied by the operands.
+  // Note that if the operands are later nulled out, the node will be
+  // removed from the uniquing map.
+  FoldingSetNodeID ID;
+  for (unsigned i = 0; i != Vals.size(); ++i)
+    ID.AddPointer(Vals[i]);
+
+  void *InsertPoint;
+  MDNode *N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint);
+
+  if (N || !Insert)
+    return N;
+
+  bool isFunctionLocal = false;
+  switch (FL) {
+  case FL_Unknown:
+    for (unsigned i = 0; i != Vals.size(); ++i) {
+      Value *V = Vals[i];
+      if (!V) continue;
+      if (isFunctionLocalValue(V)) {
+        isFunctionLocal = true;
+        break;
+      }
+    }
+    break;
+  case FL_No:
+    isFunctionLocal = false;
+    break;
+  case FL_Yes:
+    isFunctionLocal = true;
+    break;
+  }
+
+  // Coallocate space for the node and Operands together, then placement new.
+  void *Ptr = malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand));
+  N = new (Ptr) MDNode(Context, Vals, isFunctionLocal);
+
+  // Cache the operand hash.
+  N->Hash = ID.ComputeHash();
+
+  // InsertPoint will have been set by the FindNodeOrInsertPos call.
+  pImpl->MDNodeSet.InsertNode(N, InsertPoint);
+
+  return N;
+}
+
+MDNode *MDNode::get(LLVMContext &Context, ArrayRef<Value*> Vals) {
+  return getMDNode(Context, Vals, FL_Unknown);
+}
+
+MDNode *MDNode::getWhenValsUnresolved(LLVMContext &Context,
+                                      ArrayRef<Value*> Vals,
+                                      bool isFunctionLocal) {
+  return getMDNode(Context, Vals, isFunctionLocal ? FL_Yes : FL_No);
+}
+
+MDNode *MDNode::getIfExists(LLVMContext &Context, ArrayRef<Value*> Vals) {
+  return getMDNode(Context, Vals, FL_Unknown, false);
+}
+
+MDNode *MDNode::getTemporary(LLVMContext &Context, ArrayRef<Value*> Vals) {
+  MDNode *N =
+    (MDNode *)malloc(sizeof(MDNode) + Vals.size() * sizeof(MDNodeOperand));
+  N = new (N) MDNode(Context, Vals, FL_No);
+  N->setValueSubclassData(N->getSubclassDataFromValue() |
+                          NotUniquedBit);
+  LeakDetector::addGarbageObject(N);
+  return N;
+}
+
+void MDNode::deleteTemporary(MDNode *N) {
+  assert(N->use_empty() && "Temporary MDNode has uses!");
+  assert(!N->getContext().pImpl->MDNodeSet.RemoveNode(N) &&
+         "Deleting a non-temporary uniqued node!");
+  assert(!N->getContext().pImpl->NonUniquedMDNodes.erase(N) &&
+         "Deleting a non-temporary non-uniqued node!");
+  assert((N->getSubclassDataFromValue() & NotUniquedBit) &&
+         "Temporary MDNode does not have NotUniquedBit set!");
+  assert((N->getSubclassDataFromValue() & DestroyFlag) == 0 &&
+         "Temporary MDNode has DestroyFlag set!");
+  LeakDetector::removeGarbageObject(N);
+  N->destroy();
+}
+
+/// getOperand - Return specified operand.
+Value *MDNode::getOperand(unsigned i) const {
+  assert(i < getNumOperands() && "Invalid operand number");
+  return *getOperandPtr(const_cast<MDNode*>(this), i);
+}
+
+void MDNode::Profile(FoldingSetNodeID &ID) const {
+  // Add all the operand pointers. Note that we don't have to add the
+  // isFunctionLocal bit because that's implied by the operands.
+  // Note that if the operands are later nulled out, the node will be
+  // removed from the uniquing map.
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+    ID.AddPointer(getOperand(i));
+}
+
+void MDNode::setIsNotUniqued() {
+  setValueSubclassData(getSubclassDataFromValue() | NotUniquedBit);
+  LLVMContextImpl *pImpl = getType()->getContext().pImpl;
+  pImpl->NonUniquedMDNodes.insert(this);
+}
+
+// Replace value from this node's operand list.
+void MDNode::replaceOperand(MDNodeOperand *Op, Value *To) {
+  Value *From = *Op;
+
+  // If is possible that someone did GV->RAUW(inst), replacing a global variable
+  // with an instruction or some other function-local object.  If this is a
+  // non-function-local MDNode, it can't point to a function-local object.
+  // Handle this case by implicitly dropping the MDNode reference to null.
+  // Likewise if the MDNode is function-local but for a different function.
+  if (To && isFunctionLocalValue(To)) {
+    if (!isFunctionLocal())
+      To = 0;
+    else {
+      const Function *F = getFunction();
+      const Function *FV = getFunctionForValue(To);
+      // Metadata can be function-local without having an associated function.
+      // So only consider functions to have changed if non-null.
+      if (F && FV && F != FV)
+        To = 0;
+    }
+  }
+  
+  if (From == To)
+    return;
+
+  // Update the operand.
+  Op->set(To);
+
+  // If this node is already not being uniqued (because one of the operands
+  // already went to null), then there is nothing else to do here.
+  if (isNotUniqued()) return;
+
+  LLVMContextImpl *pImpl = getType()->getContext().pImpl;
+
+  // Remove "this" from the context map.  FoldingSet doesn't have to reprofile
+  // this node to remove it, so we don't care what state the operands are in.
+  pImpl->MDNodeSet.RemoveNode(this);
+
+  // If we are dropping an argument to null, we choose to not unique the MDNode
+  // anymore.  This commonly occurs during destruction, and uniquing these
+  // brings little reuse.  Also, this means we don't need to include
+  // isFunctionLocal bits in FoldingSetNodeIDs for MDNodes.
+  if (To == 0) {
+    setIsNotUniqued();
+    return;
+  }
+
+  // Now that the node is out of the folding set, get ready to reinsert it.
+  // First, check to see if another node with the same operands already exists
+  // in the set.  If so, then this node is redundant.
+  FoldingSetNodeID ID;
+  Profile(ID);
+  void *InsertPoint;
+  if (MDNode *N = pImpl->MDNodeSet.FindNodeOrInsertPos(ID, InsertPoint)) {
+    replaceAllUsesWith(N);
+    destroy();
+    return;
+  }
+
+  // Cache the operand hash.
+  Hash = ID.ComputeHash();
+  // InsertPoint will have been set by the FindNodeOrInsertPos call.
+  pImpl->MDNodeSet.InsertNode(this, InsertPoint);
+
+  // If this MDValue was previously function-local but no longer is, clear
+  // its function-local flag.
+  if (isFunctionLocal() && !isFunctionLocalValue(To)) {
+    bool isStillFunctionLocal = false;
+    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+      Value *V = getOperand(i);
+      if (!V) continue;
+      if (isFunctionLocalValue(V)) {
+        isStillFunctionLocal = true;
+        break;
+      }
+    }
+    if (!isStillFunctionLocal)
+      setValueSubclassData(getSubclassDataFromValue() & ~FunctionLocalBit);
+  }
+}
+
+MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
+  if (!A || !B)
+    return NULL;
+
+  if (A == B)
+    return A;
+
+  SmallVector<MDNode *, 4> PathA;
+  MDNode *T = A;
+  while (T) {
+    PathA.push_back(T);
+    T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1)) : 0;
+  }
+
+  SmallVector<MDNode *, 4> PathB;
+  T = B;
+  while (T) {
+    PathB.push_back(T);
+    T = T->getNumOperands() >= 2 ? cast_or_null<MDNode>(T->getOperand(1)) : 0;
+  }
+
+  int IA = PathA.size() - 1;
+  int IB = PathB.size() - 1;
+
+  MDNode *Ret = 0;
+  while (IA >= 0 && IB >=0) {
+    if (PathA[IA] == PathB[IB])
+      Ret = PathA[IA];
+    else
+      break;
+    --IA;
+    --IB;
+  }
+  return Ret;
+}
+
+MDNode *MDNode::getMostGenericFPMath(MDNode *A, MDNode *B) {
+  if (!A || !B)
+    return NULL;
+
+  APFloat AVal = cast<ConstantFP>(A->getOperand(0))->getValueAPF();
+  APFloat BVal = cast<ConstantFP>(B->getOperand(0))->getValueAPF();
+  if (AVal.compare(BVal) == APFloat::cmpLessThan)
+    return A;
+  return B;
+}
+
+static bool isContiguous(const ConstantRange &A, const ConstantRange &B) {
+  return A.getUpper() == B.getLower() || A.getLower() == B.getUpper();
+}
+
+static bool canBeMerged(const ConstantRange &A, const ConstantRange &B) {
+  return !A.intersectWith(B).isEmptySet() || isContiguous(A, B);
+}
+
+static bool tryMergeRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+                          ConstantInt *High) {
+  ConstantRange NewRange(Low->getValue(), High->getValue());
+  unsigned Size = EndPoints.size();
+  APInt LB = cast<ConstantInt>(EndPoints[Size - 2])->getValue();
+  APInt LE = cast<ConstantInt>(EndPoints[Size - 1])->getValue();
+  ConstantRange LastRange(LB, LE);
+  if (canBeMerged(NewRange, LastRange)) {
+    ConstantRange Union = LastRange.unionWith(NewRange);
+    Type *Ty = High->getType();
+    EndPoints[Size - 2] = ConstantInt::get(Ty, Union.getLower());
+    EndPoints[Size - 1] = ConstantInt::get(Ty, Union.getUpper());
+    return true;
+  }
+  return false;
+}
+
+static void addRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+                     ConstantInt *High) {
+  if (!EndPoints.empty())
+    if (tryMergeRange(EndPoints, Low, High))
+      return;
+
+  EndPoints.push_back(Low);
+  EndPoints.push_back(High);
+}
+
+MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
+  // Given two ranges, we want to compute the union of the ranges. This
+  // is slightly complitade by having to combine the intervals and merge
+  // the ones that overlap.
+
+  if (!A || !B)
+    return NULL;
+
+  if (A == B)
+    return A;
+
+  // First, walk both lists in older of the lower boundary of each interval.
+  // At each step, try to merge the new interval to the last one we adedd.
+  SmallVector<Value*, 4> EndPoints;
+  int AI = 0;
+  int BI = 0;
+  int AN = A->getNumOperands() / 2;
+  int BN = B->getNumOperands() / 2;
+  while (AI < AN && BI < BN) {
+    ConstantInt *ALow = cast<ConstantInt>(A->getOperand(2 * AI));
+    ConstantInt *BLow = cast<ConstantInt>(B->getOperand(2 * BI));
+
+    if (ALow->getValue().slt(BLow->getValue())) {
+      addRange(EndPoints, ALow, cast<ConstantInt>(A->getOperand(2 * AI + 1)));
+      ++AI;
+    } else {
+      addRange(EndPoints, BLow, cast<ConstantInt>(B->getOperand(2 * BI + 1)));
+      ++BI;
+    }
+  }
+  while (AI < AN) {
+    addRange(EndPoints, cast<ConstantInt>(A->getOperand(2 * AI)),
+             cast<ConstantInt>(A->getOperand(2 * AI + 1)));
+    ++AI;
+  }
+  while (BI < BN) {
+    addRange(EndPoints, cast<ConstantInt>(B->getOperand(2 * BI)),
+             cast<ConstantInt>(B->getOperand(2 * BI + 1)));
+    ++BI;
+  }
+
+  // If we have more than 2 ranges (4 endpoints) we have to try to merge
+  // the last and first ones.
+  unsigned Size = EndPoints.size();
+  if (Size > 4) {
+    ConstantInt *FB = cast<ConstantInt>(EndPoints[0]);
+    ConstantInt *FE = cast<ConstantInt>(EndPoints[1]);
+    if (tryMergeRange(EndPoints, FB, FE)) {
+      for (unsigned i = 0; i < Size - 2; ++i) {
+        EndPoints[i] = EndPoints[i + 2];
+      }
+      EndPoints.resize(Size - 2);
+    }
+  }
+
+  // If in the end we have a single range, it is possible that it is now the
+  // full range. Just drop the metadata in that case.
+  if (EndPoints.size() == 2) {
+    ConstantRange Range(cast<ConstantInt>(EndPoints[0])->getValue(),
+                        cast<ConstantInt>(EndPoints[1])->getValue());
+    if (Range.isFullSet())
+      return NULL;
+  }
+
+  return MDNode::get(A->getContext(), EndPoints);
+}
+
+//===----------------------------------------------------------------------===//
+// NamedMDNode implementation.
+//
+
+static SmallVector<TrackingVH<MDNode>, 4> &getNMDOps(void *Operands) {
+  return *(SmallVector<TrackingVH<MDNode>, 4>*)Operands;
+}
+
+NamedMDNode::NamedMDNode(const Twine &N)
+  : Name(N.str()), Parent(0),
+    Operands(new SmallVector<TrackingVH<MDNode>, 4>()) {
+}
+
+NamedMDNode::~NamedMDNode() {
+  dropAllReferences();
+  delete &getNMDOps(Operands);
+}
+
+/// getNumOperands - Return number of NamedMDNode operands.
+unsigned NamedMDNode::getNumOperands() const {
+  return (unsigned)getNMDOps(Operands).size();
+}
+
+/// getOperand - Return specified operand.
+MDNode *NamedMDNode::getOperand(unsigned i) const {
+  assert(i < getNumOperands() && "Invalid Operand number!");
+  return dyn_cast<MDNode>(&*getNMDOps(Operands)[i]);
+}
+
+/// addOperand - Add metadata Operand.
+void NamedMDNode::addOperand(MDNode *M) {
+  assert(!M->isFunctionLocal() &&
+         "NamedMDNode operands must not be function-local!");
+  getNMDOps(Operands).push_back(TrackingVH<MDNode>(M));
+}
+
+/// eraseFromParent - Drop all references and remove the node from parent
+/// module.
+void NamedMDNode::eraseFromParent() {
+  getParent()->eraseNamedMetadata(this);
+}
+
+/// dropAllReferences - Remove all uses and clear node vector.
+void NamedMDNode::dropAllReferences() {
+  getNMDOps(Operands).clear();
+}
+
+/// getName - Return a constant reference to this named metadata's name.
+StringRef NamedMDNode::getName() const {
+  return StringRef(Name);
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Metadata method implementations.
+//
+
+void Instruction::setMetadata(StringRef Kind, MDNode *Node) {
+  if (Node == 0 && !hasMetadata()) return;
+  setMetadata(getContext().getMDKindID(Kind), Node);
+}
+
+MDNode *Instruction::getMetadataImpl(StringRef Kind) const {
+  return getMetadataImpl(getContext().getMDKindID(Kind));
+}
+
+/// setMetadata - Set the metadata of of the specified kind to the specified
+/// node.  This updates/replaces metadata if already present, or removes it if
+/// Node is null.
+void Instruction::setMetadata(unsigned KindID, MDNode *Node) {
+  if (Node == 0 && !hasMetadata()) return;
+
+  // Handle 'dbg' as a special case since it is not stored in the hash table.
+  if (KindID == LLVMContext::MD_dbg) {
+    DbgLoc = DebugLoc::getFromDILocation(Node);
+    return;
+  }
+  
+  // Handle the case when we're adding/updating metadata on an instruction.
+  if (Node) {
+    LLVMContextImpl::MDMapTy &Info = getContext().pImpl->MetadataStore[this];
+    assert(!Info.empty() == hasMetadataHashEntry() &&
+           "HasMetadata bit is wonked");
+    if (Info.empty()) {
+      setHasMetadataHashEntry(true);
+    } else {
+      // Handle replacement of an existing value.
+      for (unsigned i = 0, e = Info.size(); i != e; ++i)
+        if (Info[i].first == KindID) {
+          Info[i].second = Node;
+          return;
+        }
+    }
+
+    // No replacement, just add it to the list.
+    Info.push_back(std::make_pair(KindID, Node));
+    return;
+  }
+
+  // Otherwise, we're removing metadata from an instruction.
+  assert((hasMetadataHashEntry() ==
+          getContext().pImpl->MetadataStore.count(this)) &&
+         "HasMetadata bit out of date!");
+  if (!hasMetadataHashEntry())
+    return;  // Nothing to remove!
+  LLVMContextImpl::MDMapTy &Info = getContext().pImpl->MetadataStore[this];
+
+  // Common case is removing the only entry.
+  if (Info.size() == 1 && Info[0].first == KindID) {
+    getContext().pImpl->MetadataStore.erase(this);
+    setHasMetadataHashEntry(false);
+    return;
+  }
+
+  // Handle removal of an existing value.
+  for (unsigned i = 0, e = Info.size(); i != e; ++i)
+    if (Info[i].first == KindID) {
+      Info[i] = Info.back();
+      Info.pop_back();
+      assert(!Info.empty() && "Removing last entry should be handled above");
+      return;
+    }
+  // Otherwise, removing an entry that doesn't exist on the instruction.
+}
+
+MDNode *Instruction::getMetadataImpl(unsigned KindID) const {
+  // Handle 'dbg' as a special case since it is not stored in the hash table.
+  if (KindID == LLVMContext::MD_dbg)
+    return DbgLoc.getAsMDNode(getContext());
+  
+  if (!hasMetadataHashEntry()) return 0;
+  
+  LLVMContextImpl::MDMapTy &Info = getContext().pImpl->MetadataStore[this];
+  assert(!Info.empty() && "bit out of sync with hash table");
+
+  for (LLVMContextImpl::MDMapTy::iterator I = Info.begin(), E = Info.end();
+       I != E; ++I)
+    if (I->first == KindID)
+      return I->second;
+  return 0;
+}
+
+void Instruction::getAllMetadataImpl(SmallVectorImpl<std::pair<unsigned,
+                                       MDNode*> > &Result) const {
+  Result.clear();
+  
+  // Handle 'dbg' as a special case since it is not stored in the hash table.
+  if (!DbgLoc.isUnknown()) {
+    Result.push_back(std::make_pair((unsigned)LLVMContext::MD_dbg,
+                                    DbgLoc.getAsMDNode(getContext())));
+    if (!hasMetadataHashEntry()) return;
+  }
+  
+  assert(hasMetadataHashEntry() &&
+         getContext().pImpl->MetadataStore.count(this) &&
+         "Shouldn't have called this");
+  const LLVMContextImpl::MDMapTy &Info =
+    getContext().pImpl->MetadataStore.find(this)->second;
+  assert(!Info.empty() && "Shouldn't have called this");
+
+  Result.append(Info.begin(), Info.end());
+
+  // Sort the resulting array so it is stable.
+  if (Result.size() > 1)
+    array_pod_sort(Result.begin(), Result.end());
+}
+
+void Instruction::
+getAllMetadataOtherThanDebugLocImpl(SmallVectorImpl<std::pair<unsigned,
+                                    MDNode*> > &Result) const {
+  Result.clear();
+  assert(hasMetadataHashEntry() &&
+         getContext().pImpl->MetadataStore.count(this) &&
+         "Shouldn't have called this");
+  const LLVMContextImpl::MDMapTy &Info =
+    getContext().pImpl->MetadataStore.find(this)->second;
+  assert(!Info.empty() && "Shouldn't have called this");
+  Result.append(Info.begin(), Info.end());
+
+  // Sort the resulting array so it is stable.
+  if (Result.size() > 1)
+    array_pod_sort(Result.begin(), Result.end());
+}
+
+/// clearMetadataHashEntries - Clear all hashtable-based metadata from
+/// this instruction.
+void Instruction::clearMetadataHashEntries() {
+  assert(hasMetadataHashEntry() && "Caller should check");
+  getContext().pImpl->MetadataStore.erase(this);
+  setHasMetadataHashEntry(false);
+}
+
diff --git a/contrib/llvm/lib/IR/Module.cpp b/contrib/llvm/lib/IR/Module.cpp
new file mode 100644
index 000000000000..8affcc946960
--- /dev/null
+++ b/contrib/llvm/lib/IR/Module.cpp
@@ -0,0 +1,451 @@
+//===-- Module.cpp - Implement the Module class ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Module class for the IR library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Module.h"
+#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/GVMaterializer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/LeakDetector.h"
+#include <algorithm>
+#include <cstdarg>
+#include <cstdlib>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Methods to implement the globals and functions lists.
+//
+
+// Explicit instantiations of SymbolTableListTraits since some of the methods
+// are not in the public header file.
+template class llvm::SymbolTableListTraits<Function, Module>;
+template class llvm::SymbolTableListTraits<GlobalVariable, Module>;
+template class llvm::SymbolTableListTraits<GlobalAlias, Module>;
+
+//===----------------------------------------------------------------------===//
+// Primitive Module methods.
+//
+
+Module::Module(StringRef MID, LLVMContext& C)
+  : Context(C), Materializer(NULL), ModuleID(MID) {
+  ValSymTab = new ValueSymbolTable();
+  NamedMDSymTab = new StringMap<NamedMDNode *>();
+  Context.addModule(this);
+}
+
+Module::~Module() {
+  Context.removeModule(this);
+  dropAllReferences();
+  GlobalList.clear();
+  FunctionList.clear();
+  AliasList.clear();
+  NamedMDList.clear();
+  delete ValSymTab;
+  delete static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab);
+}
+
+/// Target endian information.
+Module::Endianness Module::getEndianness() const {
+  StringRef temp = DataLayout;
+  Module::Endianness ret = AnyEndianness;
+
+  while (!temp.empty()) {
+    std::pair<StringRef, StringRef> P = getToken(temp, "-");
+
+    StringRef token = P.first;
+    temp = P.second;
+
+    if (token[0] == 'e') {
+      ret = LittleEndian;
+    } else if (token[0] == 'E') {
+      ret = BigEndian;
+    }
+  }
+
+  return ret;
+}
+
+/// Target Pointer Size information.
+Module::PointerSize Module::getPointerSize() const {
+  StringRef temp = DataLayout;
+  Module::PointerSize ret = AnyPointerSize;
+
+  while (!temp.empty()) {
+    std::pair<StringRef, StringRef> TmpP = getToken(temp, "-");
+    temp = TmpP.second;
+    TmpP = getToken(TmpP.first, ":");
+    StringRef token = TmpP.second, signalToken = TmpP.first;
+
+    if (signalToken[0] == 'p') {
+      int size = 0;
+      getToken(token, ":").first.getAsInteger(10, size);
+      if (size == 32)
+        ret = Pointer32;
+      else if (size == 64)
+        ret = Pointer64;
+    }
+  }
+
+  return ret;
+}
+
+/// getNamedValue - Return the first global value in the module with
+/// the specified name, of arbitrary type.  This method returns null
+/// if a global with the specified name is not found.
+GlobalValue *Module::getNamedValue(StringRef Name) const {
+  return cast_or_null<GlobalValue>(getValueSymbolTable().lookup(Name));
+}
+
+/// getMDKindID - Return a unique non-zero ID for the specified metadata kind.
+/// This ID is uniqued across modules in the current LLVMContext.
+unsigned Module::getMDKindID(StringRef Name) const {
+  return Context.getMDKindID(Name);
+}
+
+/// getMDKindNames - Populate client supplied SmallVector with the name for
+/// custom metadata IDs registered in this LLVMContext.   ID #0 is not used,
+/// so it is filled in as an empty string.
+void Module::getMDKindNames(SmallVectorImpl<StringRef> &Result) const {
+  return Context.getMDKindNames(Result);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Methods for easy access to the functions in the module.
+//
+
+// getOrInsertFunction - Look up the specified function in the module symbol
+// table.  If it does not exist, add a prototype for the function and return
+// it.  This is nice because it allows most passes to get away with not handling
+// the symbol table directly for this common task.
+//
+Constant *Module::getOrInsertFunction(StringRef Name,
+                                      FunctionType *Ty,
+                                      AttributeSet AttributeList) {
+  // See if we have a definition for the specified function already.
+  GlobalValue *F = getNamedValue(Name);
+  if (F == 0) {
+    // Nope, add it
+    Function *New = Function::Create(Ty, GlobalVariable::ExternalLinkage, Name);
+    if (!New->isIntrinsic())       // Intrinsics get attrs set on construction
+      New->setAttributes(AttributeList);
+    FunctionList.push_back(New);
+    return New;                    // Return the new prototype.
+  }
+
+  // Okay, the function exists.  Does it have externally visible linkage?
+  if (F->hasLocalLinkage()) {
+    // Clear the function's name.
+    F->setName("");
+    // Retry, now there won't be a conflict.
+    Constant *NewF = getOrInsertFunction(Name, Ty);
+    F->setName(Name);
+    return NewF;
+  }
+
+  // If the function exists but has the wrong type, return a bitcast to the
+  // right type.
+  if (F->getType() != PointerType::getUnqual(Ty))
+    return ConstantExpr::getBitCast(F, PointerType::getUnqual(Ty));
+
+  // Otherwise, we just found the existing function or a prototype.
+  return F;
+}
+
+Constant *Module::getOrInsertTargetIntrinsic(StringRef Name,
+                                             FunctionType *Ty,
+                                             AttributeSet AttributeList) {
+  // See if we have a definition for the specified function already.
+  GlobalValue *F = getNamedValue(Name);
+  if (F == 0) {
+    // Nope, add it
+    Function *New = Function::Create(Ty, GlobalVariable::ExternalLinkage, Name);
+    New->setAttributes(AttributeList);
+    FunctionList.push_back(New);
+    return New; // Return the new prototype.
+  }
+
+  // Otherwise, we just found the existing function or a prototype.
+  return F;
+}
+
+Constant *Module::getOrInsertFunction(StringRef Name,
+                                      FunctionType *Ty) {
+  return getOrInsertFunction(Name, Ty, AttributeSet());
+}
+
+// getOrInsertFunction - Look up the specified function in the module symbol
+// table.  If it does not exist, add a prototype for the function and return it.
+// This version of the method takes a null terminated list of function
+// arguments, which makes it easier for clients to use.
+//
+Constant *Module::getOrInsertFunction(StringRef Name,
+                                      AttributeSet AttributeList,
+                                      Type *RetTy, ...) {
+  va_list Args;
+  va_start(Args, RetTy);
+
+  // Build the list of argument types...
+  std::vector<Type*> ArgTys;
+  while (Type *ArgTy = va_arg(Args, Type*))
+    ArgTys.push_back(ArgTy);
+
+  va_end(Args);
+
+  // Build the function type and chain to the other getOrInsertFunction...
+  return getOrInsertFunction(Name,
+                             FunctionType::get(RetTy, ArgTys, false),
+                             AttributeList);
+}
+
+Constant *Module::getOrInsertFunction(StringRef Name,
+                                      Type *RetTy, ...) {
+  va_list Args;
+  va_start(Args, RetTy);
+
+  // Build the list of argument types...
+  std::vector<Type*> ArgTys;
+  while (Type *ArgTy = va_arg(Args, Type*))
+    ArgTys.push_back(ArgTy);
+
+  va_end(Args);
+
+  // Build the function type and chain to the other getOrInsertFunction...
+  return getOrInsertFunction(Name,
+                             FunctionType::get(RetTy, ArgTys, false),
+                             AttributeSet());
+}
+
+// getFunction - Look up the specified function in the module symbol table.
+// If it does not exist, return null.
+//
+Function *Module::getFunction(StringRef Name) const {
+  return dyn_cast_or_null<Function>(getNamedValue(Name));
+}
+
+//===----------------------------------------------------------------------===//
+// Methods for easy access to the global variables in the module.
+//
+
+/// getGlobalVariable - Look up the specified global variable in the module
+/// symbol table.  If it does not exist, return null.  The type argument
+/// should be the underlying type of the global, i.e., it should not have
+/// the top-level PointerType, which represents the address of the global.
+/// If AllowLocal is set to true, this function will return types that
+/// have an local. By default, these types are not returned.
+///
+GlobalVariable *Module::getGlobalVariable(StringRef Name,
+                                          bool AllowLocal) const {
+  if (GlobalVariable *Result =
+      dyn_cast_or_null<GlobalVariable>(getNamedValue(Name)))
+    if (AllowLocal || !Result->hasLocalLinkage())
+      return Result;
+  return 0;
+}
+
+/// getOrInsertGlobal - Look up the specified global in the module symbol table.
+///   1. If it does not exist, add a declaration of the global and return it.
+///   2. Else, the global exists but has the wrong type: return the function
+///      with a constantexpr cast to the right type.
+///   3. Finally, if the existing global is the correct delclaration, return the
+///      existing global.
+Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
+  // See if we have a definition for the specified global already.
+  GlobalVariable *GV = dyn_cast_or_null<GlobalVariable>(getNamedValue(Name));
+  if (GV == 0) {
+    // Nope, add it
+    GlobalVariable *New =
+      new GlobalVariable(*this, Ty, false, GlobalVariable::ExternalLinkage,
+                         0, Name);
+     return New;                    // Return the new declaration.
+  }
+
+  // If the variable exists but has the wrong type, return a bitcast to the
+  // right type.
+  if (GV->getType() != PointerType::getUnqual(Ty))
+    return ConstantExpr::getBitCast(GV, PointerType::getUnqual(Ty));
+
+  // Otherwise, we just found the existing function or a prototype.
+  return GV;
+}
+
+//===----------------------------------------------------------------------===//
+// Methods for easy access to the global variables in the module.
+//
+
+// getNamedAlias - Look up the specified global in the module symbol table.
+// If it does not exist, return null.
+//
+GlobalAlias *Module::getNamedAlias(StringRef Name) const {
+  return dyn_cast_or_null<GlobalAlias>(getNamedValue(Name));
+}
+
+/// getNamedMetadata - Return the first NamedMDNode in the module with the
+/// specified name. This method returns null if a NamedMDNode with the
+/// specified name is not found.
+NamedMDNode *Module::getNamedMetadata(const Twine &Name) const {
+  SmallString<256> NameData;
+  StringRef NameRef = Name.toStringRef(NameData);
+  return static_cast<StringMap<NamedMDNode*> *>(NamedMDSymTab)->lookup(NameRef);
+}
+
+/// getOrInsertNamedMetadata - Return the first named MDNode in the module
+/// with the specified name. This method returns a new NamedMDNode if a
+/// NamedMDNode with the specified name is not found.
+NamedMDNode *Module::getOrInsertNamedMetadata(StringRef Name) {
+  NamedMDNode *&NMD =
+    (*static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab))[Name];
+  if (!NMD) {
+    NMD = new NamedMDNode(Name);
+    NMD->setParent(this);
+    NamedMDList.push_back(NMD);
+  }
+  return NMD;
+}
+
+/// eraseNamedMetadata - Remove the given NamedMDNode from this module and
+/// delete it.
+void Module::eraseNamedMetadata(NamedMDNode *NMD) {
+  static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab)->erase(NMD->getName());
+  NamedMDList.erase(NMD);
+}
+
+/// getModuleFlagsMetadata - Returns the module flags in the provided vector.
+void Module::
+getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const {
+  const NamedMDNode *ModFlags = getModuleFlagsMetadata();
+  if (!ModFlags) return;
+
+  for (unsigned i = 0, e = ModFlags->getNumOperands(); i != e; ++i) {
+    MDNode *Flag = ModFlags->getOperand(i);
+    ConstantInt *Behavior = cast<ConstantInt>(Flag->getOperand(0));
+    MDString *Key = cast<MDString>(Flag->getOperand(1));
+    Value *Val = Flag->getOperand(2);
+    Flags.push_back(ModuleFlagEntry(ModFlagBehavior(Behavior->getZExtValue()),
+                                    Key, Val));
+  }
+}
+
+/// getModuleFlagsMetadata - Returns the NamedMDNode in the module that
+/// represents module-level flags. This method returns null if there are no
+/// module-level flags.
+NamedMDNode *Module::getModuleFlagsMetadata() const {
+  return getNamedMetadata("llvm.module.flags");
+}
+
+/// getOrInsertModuleFlagsMetadata - Returns the NamedMDNode in the module that
+/// represents module-level flags. If module-level flags aren't found, it
+/// creates the named metadata that contains them.
+NamedMDNode *Module::getOrInsertModuleFlagsMetadata() {
+  return getOrInsertNamedMetadata("llvm.module.flags");
+}
+
+/// addModuleFlag - Add a module-level flag to the module-level flags
+/// metadata. It will create the module-level flags named metadata if it doesn't
+/// already exist.
+void Module::addModuleFlag(ModFlagBehavior Behavior, StringRef Key,
+                           Value *Val) {
+  Type *Int32Ty = Type::getInt32Ty(Context);
+  Value *Ops[3] = {
+    ConstantInt::get(Int32Ty, Behavior), MDString::get(Context, Key), Val
+  };
+  getOrInsertModuleFlagsMetadata()->addOperand(MDNode::get(Context, Ops));
+}
+void Module::addModuleFlag(ModFlagBehavior Behavior, StringRef Key,
+                           uint32_t Val) {
+  Type *Int32Ty = Type::getInt32Ty(Context);
+  addModuleFlag(Behavior, Key, ConstantInt::get(Int32Ty, Val));
+}
+void Module::addModuleFlag(MDNode *Node) {
+  assert(Node->getNumOperands() == 3 &&
+         "Invalid number of operands for module flag!");
+  assert(isa<ConstantInt>(Node->getOperand(0)) &&
+         isa<MDString>(Node->getOperand(1)) &&
+         "Invalid operand types for module flag!");
+  getOrInsertModuleFlagsMetadata()->addOperand(Node);
+}
+
+//===----------------------------------------------------------------------===//
+// Methods to control the materialization of GlobalValues in the Module.
+//
+void Module::setMaterializer(GVMaterializer *GVM) {
+  assert(!Materializer &&
+         "Module already has a GVMaterializer.  Call MaterializeAllPermanently"
+         " to clear it out before setting another one.");
+  Materializer.reset(GVM);
+}
+
+bool Module::isMaterializable(const GlobalValue *GV) const {
+  if (Materializer)
+    return Materializer->isMaterializable(GV);
+  return false;
+}
+
+bool Module::isDematerializable(const GlobalValue *GV) const {
+  if (Materializer)
+    return Materializer->isDematerializable(GV);
+  return false;
+}
+
+bool Module::Materialize(GlobalValue *GV, std::string *ErrInfo) {
+  if (Materializer)
+    return Materializer->Materialize(GV, ErrInfo);
+  return false;
+}
+
+void Module::Dematerialize(GlobalValue *GV) {
+  if (Materializer)
+    return Materializer->Dematerialize(GV);
+}
+
+bool Module::MaterializeAll(std::string *ErrInfo) {
+  if (!Materializer)
+    return false;
+  return Materializer->MaterializeModule(this, ErrInfo);
+}
+
+bool Module::MaterializeAllPermanently(std::string *ErrInfo) {
+  if (MaterializeAll(ErrInfo))
+    return true;
+  Materializer.reset();
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Other module related stuff.
+//
+
+
+// dropAllReferences() - This function causes all the subelements to "let go"
+// of all references that they are maintaining.  This allows one to 'delete' a
+// whole module at a time, even though there may be circular references... first
+// all references are dropped, and all use counts go to zero.  Then everything
+// is deleted for real.  Note that no operations are valid on an object that
+// has "dropped all references", except operator delete.
+//
+void Module::dropAllReferences() {
+  for(Module::iterator I = begin(), E = end(); I != E; ++I)
+    I->dropAllReferences();
+
+  for(Module::global_iterator I = global_begin(), E = global_end(); I != E; ++I)
+    I->dropAllReferences();
+
+  for(Module::alias_iterator I = alias_begin(), E = alias_end(); I != E; ++I)
+    I->dropAllReferences();
+}
diff --git a/contrib/llvm/lib/IR/Pass.cpp b/contrib/llvm/lib/IR/Pass.cpp
new file mode 100644
index 000000000000..7fc48282380b
--- /dev/null
+++ b/contrib/llvm/lib/IR/Pass.cpp
@@ -0,0 +1,276 @@
+//===- Pass.cpp - LLVM Pass Infrastructure Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM Pass infrastructure.  It is primarily
+// responsible with ensuring that passes are executed and batched together
+// optimally.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/PassNameParser.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Pass Implementation
+//
+
+// Force out-of-line virtual method.
+Pass::~Pass() {
+  delete Resolver;
+}
+
+// Force out-of-line virtual method.
+ModulePass::~ModulePass() { }
+
+Pass *ModulePass::createPrinterPass(raw_ostream &O,
+                                    const std::string &Banner) const {
+  return createPrintModulePass(&O, false, Banner);
+}
+
+PassManagerType ModulePass::getPotentialPassManagerType() const {
+  return PMT_ModulePassManager;
+}
+
+bool Pass::mustPreserveAnalysisID(char &AID) const {
+  return Resolver->getAnalysisIfAvailable(&AID, true) != 0;
+}
+
+// dumpPassStructure - Implement the -debug-pass=Structure option
+void Pass::dumpPassStructure(unsigned Offset) {
+  dbgs().indent(Offset*2) << getPassName() << "\n";
+}
+
+/// getPassName - Return a nice clean name for a pass.  This usually
+/// implemented in terms of the name that is registered by one of the
+/// Registration templates, but can be overloaded directly.
+///
+const char *Pass::getPassName() const {
+  AnalysisID AID =  getPassID();
+  const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(AID);
+  if (PI)
+    return PI->getPassName();
+  return "Unnamed pass: implement Pass::getPassName()";
+}
+
+void Pass::preparePassManager(PMStack &) {
+  // By default, don't do anything.
+}
+
+PassManagerType Pass::getPotentialPassManagerType() const {
+  // Default implementation.
+  return PMT_Unknown;
+}
+
+void Pass::getAnalysisUsage(AnalysisUsage &) const {
+  // By default, no analysis results are used, all are invalidated.
+}
+
+void Pass::releaseMemory() {
+  // By default, don't do anything.
+}
+
+void Pass::verifyAnalysis() const {
+  // By default, don't do anything.
+}
+
+void *Pass::getAdjustedAnalysisPointer(AnalysisID AID) {
+  return this;
+}
+
+ImmutablePass *Pass::getAsImmutablePass() {
+  return 0;
+}
+
+PMDataManager *Pass::getAsPMDataManager() {
+  return 0;
+}
+
+void Pass::setResolver(AnalysisResolver *AR) {
+  assert(!Resolver && "Resolver is already set");
+  Resolver = AR;
+}
+
+// print - Print out the internal state of the pass.  This is called by Analyze
+// to print out the contents of an analysis.  Otherwise it is not necessary to
+// implement this method.
+//
+void Pass::print(raw_ostream &O,const Module*) const {
+  O << "Pass::print not implemented for pass: '" << getPassName() << "'!\n";
+}
+
+// dump - call print(cerr);
+void Pass::dump() const {
+  print(dbgs(), 0);
+}
+
+//===----------------------------------------------------------------------===//
+// ImmutablePass Implementation
+//
+// Force out-of-line virtual method.
+ImmutablePass::~ImmutablePass() { }
+
+void ImmutablePass::initializePass() {
+  // By default, don't do anything.
+}
+
+//===----------------------------------------------------------------------===//
+// FunctionPass Implementation
+//
+
+Pass *FunctionPass::createPrinterPass(raw_ostream &O,
+                                      const std::string &Banner) const {
+  return createPrintFunctionPass(Banner, &O);
+}
+
+PassManagerType FunctionPass::getPotentialPassManagerType() const {
+  return PMT_FunctionPassManager;
+}
+
+//===----------------------------------------------------------------------===//
+// BasicBlockPass Implementation
+//
+
+Pass *BasicBlockPass::createPrinterPass(raw_ostream &O,
+                                        const std::string &Banner) const {
+  return createPrintBasicBlockPass(&O, false, Banner);
+}
+
+bool BasicBlockPass::doInitialization(Function &) {
+  // By default, don't do anything.
+  return false;
+}
+
+bool BasicBlockPass::doFinalization(Function &) {
+  // By default, don't do anything.
+  return false;
+}
+
+PassManagerType BasicBlockPass::getPotentialPassManagerType() const {
+  return PMT_BasicBlockPassManager;
+}
+
+const PassInfo *Pass::lookupPassInfo(const void *TI) {
+  return PassRegistry::getPassRegistry()->getPassInfo(TI);
+}
+
+const PassInfo *Pass::lookupPassInfo(StringRef Arg) {
+  return PassRegistry::getPassRegistry()->getPassInfo(Arg);
+}
+
+Pass *Pass::createPass(AnalysisID ID) {
+  const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(ID);
+  if (!PI)
+    return NULL;
+  return PI->createPass();
+}
+
+Pass *PassInfo::createPass() const {
+  assert((!isAnalysisGroup() || NormalCtor) &&
+         "No default implementation found for analysis group!");
+  assert(NormalCtor &&
+         "Cannot call createPass on PassInfo without default ctor!");
+  return NormalCtor();
+}
+
+//===----------------------------------------------------------------------===//
+//                  Analysis Group Implementation Code
+//===----------------------------------------------------------------------===//
+
+// RegisterAGBase implementation
+//
+RegisterAGBase::RegisterAGBase(const char *Name, const void *InterfaceID,
+                               const void *PassID, bool isDefault)
+    : PassInfo(Name, InterfaceID) {
+  PassRegistry::getPassRegistry()->registerAnalysisGroup(InterfaceID, PassID,
+                                                         *this, isDefault);
+}
+
+//===----------------------------------------------------------------------===//
+// PassRegistrationListener implementation
+//
+
+// PassRegistrationListener ctor - Add the current object to the list of
+// PassRegistrationListeners...
+PassRegistrationListener::PassRegistrationListener() {
+  PassRegistry::getPassRegistry()->addRegistrationListener(this);
+}
+
+// dtor - Remove object from list of listeners...
+PassRegistrationListener::~PassRegistrationListener() {
+  PassRegistry::getPassRegistry()->removeRegistrationListener(this);
+}
+
+// enumeratePasses - Iterate over the registered passes, calling the
+// passEnumerate callback on each PassInfo object.
+//
+void PassRegistrationListener::enumeratePasses() {
+  PassRegistry::getPassRegistry()->enumerateWith(this);
+}
+
+PassNameParser::~PassNameParser() {}
+
+//===----------------------------------------------------------------------===//
+//   AnalysisUsage Class Implementation
+//
+
+namespace {
+  struct GetCFGOnlyPasses : public PassRegistrationListener {
+    typedef AnalysisUsage::VectorType VectorType;
+    VectorType &CFGOnlyList;
+    GetCFGOnlyPasses(VectorType &L) : CFGOnlyList(L) {}
+
+    void passEnumerate(const PassInfo *P) {
+      if (P->isCFGOnlyPass())
+        CFGOnlyList.push_back(P->getTypeInfo());
+    }
+  };
+}
+
+// setPreservesCFG - This function should be called to by the pass, iff they do
+// not:
+//
+//  1. Add or remove basic blocks from the function
+//  2. Modify terminator instructions in any way.
+//
+// This function annotates the AnalysisUsage info object to say that analyses
+// that only depend on the CFG are preserved by this pass.
+//
+void AnalysisUsage::setPreservesCFG() {
+  // Since this transformation doesn't modify the CFG, it preserves all analyses
+  // that only depend on the CFG (like dominators, loop info, etc...)
+  GetCFGOnlyPasses(Preserved).enumeratePasses();
+}
+
+AnalysisUsage &AnalysisUsage::addPreserved(StringRef Arg) {
+  const PassInfo *PI = Pass::lookupPassInfo(Arg);
+  // If the pass exists, preserve it. Otherwise silently do nothing.
+  if (PI) Preserved.push_back(PI->getTypeInfo());
+  return *this;
+}
+
+AnalysisUsage &AnalysisUsage::addRequiredID(const void *ID) {
+  Required.push_back(ID);
+  return *this;
+}
+
+AnalysisUsage &AnalysisUsage::addRequiredID(char &ID) {
+  Required.push_back(&ID);
+  return *this;
+}
+
+AnalysisUsage &AnalysisUsage::addRequiredTransitiveID(char &ID) {
+  Required.push_back(&ID);
+  RequiredTransitive.push_back(&ID);
+  return *this;
+}
diff --git a/contrib/llvm/lib/IR/PassManager.cpp b/contrib/llvm/lib/IR/PassManager.cpp
new file mode 100644
index 000000000000..3c968aac164f
--- /dev/null
+++ b/contrib/llvm/lib/IR/PassManager.cpp
@@ -0,0 +1,1912 @@
+//===- PassManager.cpp - LLVM Pass Infrastructure Implementation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM Pass Manager infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/PassManagers.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/PassNameParser.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+// See PassManagers.h for Pass Manager infrastructure overview.
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// Pass debugging information.  Often it is useful to find out what pass is
+// running when a crash occurs in a utility.  When this library is compiled with
+// debugging on, a command line option (--debug-pass) is enabled that causes the
+// pass name to be printed before it executes.
+//
+
+// Different debug levels that can be enabled...
+enum PassDebugLevel {
+  None, Arguments, Structure, Executions, Details
+};
+
+static cl::opt<enum PassDebugLevel>
+PassDebugging("debug-pass", cl::Hidden,
+                  cl::desc("Print PassManager debugging information"),
+                  cl::values(
+  clEnumVal(None      , "disable debug output"),
+  clEnumVal(Arguments , "print pass arguments to pass to 'opt'"),
+  clEnumVal(Structure , "print pass structure before run()"),
+  clEnumVal(Executions, "print pass name before it is executed"),
+  clEnumVal(Details   , "print pass details when it is executed"),
+                             clEnumValEnd));
+
+typedef llvm::cl::list<const llvm::PassInfo *, bool, PassNameParser>
+PassOptionList;
+
+// Print IR out before/after specified passes.
+static PassOptionList
+PrintBefore("print-before",
+            llvm::cl::desc("Print IR before specified passes"),
+            cl::Hidden);
+
+static PassOptionList
+PrintAfter("print-after",
+           llvm::cl::desc("Print IR after specified passes"),
+           cl::Hidden);
+
+static cl::opt<bool>
+PrintBeforeAll("print-before-all",
+               llvm::cl::desc("Print IR before each pass"),
+               cl::init(false));
+static cl::opt<bool>
+PrintAfterAll("print-after-all",
+              llvm::cl::desc("Print IR after each pass"),
+              cl::init(false));
+
+/// This is a helper to determine whether to print IR before or
+/// after a pass.
+
+static bool ShouldPrintBeforeOrAfterPass(const PassInfo *PI,
+                                         PassOptionList &PassesToPrint) {
+  for (unsigned i = 0, ie = PassesToPrint.size(); i < ie; ++i) {
+    const llvm::PassInfo *PassInf = PassesToPrint[i];
+    if (PassInf)
+      if (PassInf->getPassArgument() == PI->getPassArgument()) {
+        return true;
+      }
+  }
+  return false;
+}
+
+/// This is a utility to check whether a pass should have IR dumped
+/// before it.
+static bool ShouldPrintBeforePass(const PassInfo *PI) {
+  return PrintBeforeAll || ShouldPrintBeforeOrAfterPass(PI, PrintBefore);
+}
+
+/// This is a utility to check whether a pass should have IR dumped
+/// after it.
+static bool ShouldPrintAfterPass(const PassInfo *PI) {
+  return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PI, PrintAfter);
+}
+
+} // End of llvm namespace
+
+/// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
+/// or higher is specified.
+bool PMDataManager::isPassDebuggingExecutionsOrMore() const {
+  return PassDebugging >= Executions;
+}
+
+
+
+
+void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
+  if (V == 0 && M == 0)
+    OS << "Releasing pass '";
+  else
+    OS << "Running pass '";
+
+  OS << P->getPassName() << "'";
+
+  if (M) {
+    OS << " on module '" << M->getModuleIdentifier() << "'.\n";
+    return;
+  }
+  if (V == 0) {
+    OS << '\n';
+    return;
+  }
+
+  OS << " on ";
+  if (isa<Function>(V))
+    OS << "function";
+  else if (isa<BasicBlock>(V))
+    OS << "basic block";
+  else
+    OS << "value";
+
+  OS << " '";
+  WriteAsOperand(OS, V, /*PrintTy=*/false, M);
+  OS << "'\n";
+}
+
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// BBPassManager
+//
+/// BBPassManager manages BasicBlockPass. It batches all the
+/// pass together and sequence them to process one basic block before
+/// processing next basic block.
+class BBPassManager : public PMDataManager, public FunctionPass {
+
+public:
+  static char ID;
+  explicit BBPassManager()
+    : PMDataManager(), FunctionPass(ID) {}
+
+  /// Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the function, and if so, return true.
+  bool runOnFunction(Function &F);
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  bool doInitialization(Module &M);
+  bool doInitialization(Function &F);
+  bool doFinalization(Module &M);
+  bool doFinalization(Function &F);
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+
+  virtual const char *getPassName() const {
+    return "BasicBlock Pass Manager";
+  }
+
+  // Print passes managed by this manager
+  void dumpPassStructure(unsigned Offset) {
+    llvm::dbgs().indent(Offset*2) << "BasicBlockPass Manager\n";
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      BasicBlockPass *BP = getContainedPass(Index);
+      BP->dumpPassStructure(Offset + 1);
+      dumpLastUses(BP, Offset+1);
+    }
+  }
+
+  BasicBlockPass *getContainedPass(unsigned N) {
+    assert(N < PassVector.size() && "Pass number out of range!");
+    BasicBlockPass *BP = static_cast<BasicBlockPass *>(PassVector[N]);
+    return BP;
+  }
+
+  virtual PassManagerType getPassManagerType() const {
+    return PMT_BasicBlockPassManager;
+  }
+};
+
+char BBPassManager::ID = 0;
+}
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl
+//
+/// FunctionPassManagerImpl manages FPPassManagers
+class FunctionPassManagerImpl : public Pass,
+                                public PMDataManager,
+                                public PMTopLevelManager {
+  virtual void anchor();
+private:
+  bool wasRun;
+public:
+  static char ID;
+  explicit FunctionPassManagerImpl() :
+    Pass(PT_PassManager, ID), PMDataManager(),
+    PMTopLevelManager(new FPPassManager()), wasRun(false) {}
+
+  /// add - Add a pass to the queue of passes to run.  This passes ownership of
+  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+  /// will be destroyed as well, so there is no need to delete the pass.  This
+  /// implies that all passes MUST be allocated with 'new'.
+  void add(Pass *P) {
+    schedulePass(P);
+  }
+
+  /// createPrinterPass - Get a function printer pass.
+  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+    return createPrintFunctionPass(Banner, &O);
+  }
+
+  // Prepare for running an on the fly pass, freeing memory if needed
+  // from a previous run.
+  void releaseMemoryOnTheFly();
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool run(Function &F);
+
+  /// doInitialization - Run all of the initializers for the function passes.
+  ///
+  bool doInitialization(Module &M);
+
+  /// doFinalization - Run all of the finalizers for the function passes.
+  ///
+  bool doFinalization(Module &M);
+
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+  virtual PassManagerType getTopLevelPassManagerType() {
+    return PMT_FunctionPassManager;
+  }
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  FPPassManager *getContainedManager(unsigned N) {
+    assert(N < PassManagers.size() && "Pass number out of range!");
+    FPPassManager *FP = static_cast<FPPassManager *>(PassManagers[N]);
+    return FP;
+  }
+};
+
+void FunctionPassManagerImpl::anchor() {}
+
+char FunctionPassManagerImpl::ID = 0;
+
+//===----------------------------------------------------------------------===//
+// MPPassManager
+//
+/// MPPassManager manages ModulePasses and function pass managers.
+/// It batches all Module passes and function pass managers together and
+/// sequences them to process one module.
+class MPPassManager : public Pass, public PMDataManager {
+public:
+  static char ID;
+  explicit MPPassManager() :
+    Pass(PT_PassManager, ID), PMDataManager() { }
+
+  // Delete on the fly managers.
+  virtual ~MPPassManager() {
+    for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+           I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+         I != E; ++I) {
+      FunctionPassManagerImpl *FPP = I->second;
+      delete FPP;
+    }
+  }
+
+  /// createPrinterPass - Get a module printer pass.
+  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+    return createPrintModulePass(&O, false, Banner);
+  }
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool runOnModule(Module &M);
+
+  using llvm::Pass::doInitialization;
+  using llvm::Pass::doFinalization;
+
+  /// doInitialization - Run all of the initializers for the module passes.
+  ///
+  bool doInitialization();
+
+  /// doFinalization - Run all of the finalizers for the module passes.
+  ///
+  bool doFinalization();
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  /// Add RequiredPass into list of lower level passes required by pass P.
+  /// RequiredPass is run on the fly by Pass Manager when P requests it
+  /// through getAnalysis interface.
+  virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
+
+  /// Return function pass corresponding to PassInfo PI, that is
+  /// required by module pass MP. Instantiate analysis pass, by using
+  /// its runOnFunction() for function F.
+  virtual Pass* getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F);
+
+  virtual const char *getPassName() const {
+    return "Module Pass Manager";
+  }
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+
+  // Print passes managed by this manager
+  void dumpPassStructure(unsigned Offset) {
+    llvm::dbgs().indent(Offset*2) << "ModulePass Manager\n";
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      ModulePass *MP = getContainedPass(Index);
+      MP->dumpPassStructure(Offset + 1);
+      std::map<Pass *, FunctionPassManagerImpl *>::const_iterator I =
+        OnTheFlyManagers.find(MP);
+      if (I != OnTheFlyManagers.end())
+        I->second->dumpPassStructure(Offset + 2);
+      dumpLastUses(MP, Offset+1);
+    }
+  }
+
+  ModulePass *getContainedPass(unsigned N) {
+    assert(N < PassVector.size() && "Pass number out of range!");
+    return static_cast<ModulePass *>(PassVector[N]);
+  }
+
+  virtual PassManagerType getPassManagerType() const {
+    return PMT_ModulePassManager;
+  }
+
+ private:
+  /// Collection of on the fly FPPassManagers. These managers manage
+  /// function passes that are required by module passes.
+  std::map<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
+};
+
+char MPPassManager::ID = 0;
+//===----------------------------------------------------------------------===//
+// PassManagerImpl
+//
+
+/// PassManagerImpl manages MPPassManagers
+class PassManagerImpl : public Pass,
+                        public PMDataManager,
+                        public PMTopLevelManager {
+  virtual void anchor();
+
+public:
+  static char ID;
+  explicit PassManagerImpl() :
+    Pass(PT_PassManager, ID), PMDataManager(),
+                              PMTopLevelManager(new MPPassManager()) {}
+
+  /// add - Add a pass to the queue of passes to run.  This passes ownership of
+  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+  /// will be destroyed as well, so there is no need to delete the pass.  This
+  /// implies that all passes MUST be allocated with 'new'.
+  void add(Pass *P) {
+    schedulePass(P);
+  }
+
+  /// createPrinterPass - Get a module printer pass.
+  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+    return createPrintModulePass(&O, false, Banner);
+  }
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool run(Module &M);
+
+  using llvm::Pass::doInitialization;
+  using llvm::Pass::doFinalization;
+
+  /// doInitialization - Run all of the initializers for the module passes.
+  ///
+  bool doInitialization();
+
+  /// doFinalization - Run all of the finalizers for the module passes.
+  ///
+  bool doFinalization();
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+  virtual PassManagerType getTopLevelPassManagerType() {
+    return PMT_ModulePassManager;
+  }
+
+  MPPassManager *getContainedManager(unsigned N) {
+    assert(N < PassManagers.size() && "Pass number out of range!");
+    MPPassManager *MP = static_cast<MPPassManager *>(PassManagers[N]);
+    return MP;
+  }
+};
+
+void PassManagerImpl::anchor() {}
+
+char PassManagerImpl::ID = 0;
+} // End of llvm namespace
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+/// TimingInfo Class - This class is used to calculate information about the
+/// amount of time each pass takes to execute.  This only happens when
+/// -time-passes is enabled on the command line.
+///
+
+static ManagedStatic<sys::SmartMutex<true> > TimingInfoMutex;
+
+class TimingInfo {
+  DenseMap<Pass*, Timer*> TimingData;
+  TimerGroup TG;
+public:
+  // Use 'create' member to get this.
+  TimingInfo() : TG("... Pass execution timing report ...") {}
+
+  // TimingDtor - Print out information about timing information
+  ~TimingInfo() {
+    // Delete all of the timers, which accumulate their info into the
+    // TimerGroup.
+    for (DenseMap<Pass*, Timer*>::iterator I = TimingData.begin(),
+         E = TimingData.end(); I != E; ++I)
+      delete I->second;
+    // TimerGroup is deleted next, printing the report.
+  }
+
+  // createTheTimeInfo - This method either initializes the TheTimeInfo pointer
+  // to a non null value (if the -time-passes option is enabled) or it leaves it
+  // null.  It may be called multiple times.
+  static void createTheTimeInfo();
+
+  /// getPassTimer - Return the timer for the specified pass if it exists.
+  Timer *getPassTimer(Pass *P) {
+    if (P->getAsPMDataManager())
+      return 0;
+
+    sys::SmartScopedLock<true> Lock(*TimingInfoMutex);
+    Timer *&T = TimingData[P];
+    if (T == 0)
+      T = new Timer(P->getPassName(), TG);
+    return T;
+  }
+};
+
+} // End of anon namespace
+
+static TimingInfo *TheTimeInfo;
+
+//===----------------------------------------------------------------------===//
+// PMTopLevelManager implementation
+
+/// Initialize top level manager. Create first pass manager.
+PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) {
+  PMDM->setTopLevelManager(this);
+  addPassManager(PMDM);
+  activeStack.push(PMDM);
+}
+
+/// Set pass P as the last user of the given analysis passes.
+void
+PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) {
+  unsigned PDepth = 0;
+  if (P->getResolver())
+    PDepth = P->getResolver()->getPMDataManager().getDepth();
+
+  for (SmallVectorImpl<Pass *>::const_iterator I = AnalysisPasses.begin(),
+         E = AnalysisPasses.end(); I != E; ++I) {
+    Pass *AP = *I;
+    LastUser[AP] = P;
+
+    if (P == AP)
+      continue;
+
+    // Update the last users of passes that are required transitive by AP.
+    AnalysisUsage *AnUsage = findAnalysisUsage(AP);
+    const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
+    SmallVector<Pass *, 12> LastUses;
+    SmallVector<Pass *, 12> LastPMUses;
+    for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
+         E = IDs.end(); I != E; ++I) {
+      Pass *AnalysisPass = findAnalysisPass(*I);
+      assert(AnalysisPass && "Expected analysis pass to exist.");
+      AnalysisResolver *AR = AnalysisPass->getResolver();
+      assert(AR && "Expected analysis resolver to exist.");
+      unsigned APDepth = AR->getPMDataManager().getDepth();
+
+      if (PDepth == APDepth)
+        LastUses.push_back(AnalysisPass);
+      else if (PDepth > APDepth)
+        LastPMUses.push_back(AnalysisPass);
+    }
+
+    setLastUser(LastUses, P);
+
+    // If this pass has a corresponding pass manager, push higher level
+    // analysis to this pass manager.
+    if (P->getResolver())
+      setLastUser(LastPMUses, P->getResolver()->getPMDataManager().getAsPass());
+
+
+    // If AP is the last user of other passes then make P last user of
+    // such passes.
+    for (DenseMap<Pass *, Pass *>::iterator LUI = LastUser.begin(),
+           LUE = LastUser.end(); LUI != LUE; ++LUI) {
+      if (LUI->second == AP)
+        // DenseMap iterator is not invalidated here because
+        // this is just updating existing entries.
+        LastUser[LUI->first] = P;
+    }
+  }
+}
+
+/// Collect passes whose last user is P
+void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses,
+                                        Pass *P) {
+  DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI =
+    InversedLastUser.find(P);
+  if (DMI == InversedLastUser.end())
+    return;
+
+  SmallPtrSet<Pass *, 8> &LU = DMI->second;
+  for (SmallPtrSet<Pass *, 8>::iterator I = LU.begin(),
+         E = LU.end(); I != E; ++I) {
+    LastUses.push_back(*I);
+  }
+
+}
+
+AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
+  AnalysisUsage *AnUsage = NULL;
+  DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
+  if (DMI != AnUsageMap.end())
+    AnUsage = DMI->second;
+  else {
+    AnUsage = new AnalysisUsage();
+    P->getAnalysisUsage(*AnUsage);
+    AnUsageMap[P] = AnUsage;
+  }
+  return AnUsage;
+}
+
+/// Schedule pass P for execution. Make sure that passes required by
+/// P are run before P is run. Update analysis info maintained by
+/// the manager. Remove dead passes. This is a recursive function.
+void PMTopLevelManager::schedulePass(Pass *P) {
+
+  // TODO : Allocate function manager for this pass, other wise required set
+  // may be inserted into previous function manager
+
+  // Give pass a chance to prepare the stage.
+  P->preparePassManager(activeStack);
+
+  // If P is an analysis pass and it is available then do not
+  // generate the analysis again. Stale analysis info should not be
+  // available at this point.
+  const PassInfo *PI =
+    PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
+  if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
+    delete P;
+    return;
+  }
+
+  AnalysisUsage *AnUsage = findAnalysisUsage(P);
+
+  bool checkAnalysis = true;
+  while (checkAnalysis) {
+    checkAnalysis = false;
+
+    const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+    for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
+           E = RequiredSet.end(); I != E; ++I) {
+
+      Pass *AnalysisPass = findAnalysisPass(*I);
+      if (!AnalysisPass) {
+        const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+
+        if (PI == NULL) {
+          // Pass P is not in the global PassRegistry
+          dbgs() << "Pass '"  << P->getPassName() << "' is not initialized." << "\n";
+          dbgs() << "Verify if there is a pass dependency cycle." << "\n";
+          dbgs() << "Required Passes:" << "\n";
+          for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(),
+                 E = RequiredSet.end(); I2 != E && I2 != I; ++I2) {
+            Pass *AnalysisPass2 = findAnalysisPass(*I2);
+            if (AnalysisPass2) {
+              dbgs() << "\t" << AnalysisPass2->getPassName() << "\n";
+            } else {
+              dbgs() << "\t"   << "Error: Required pass not found! Possible causes:"  << "\n";
+              dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)"    << "\n";
+              dbgs() << "\t\t" << "- Corruption of the global PassRegistry"           << "\n";
+            }
+          }
+        }
+
+        assert(PI && "Expected required passes to be initialized");
+        AnalysisPass = PI->createPass();
+        if (P->getPotentialPassManagerType () ==
+            AnalysisPass->getPotentialPassManagerType())
+          // Schedule analysis pass that is managed by the same pass manager.
+          schedulePass(AnalysisPass);
+        else if (P->getPotentialPassManagerType () >
+                 AnalysisPass->getPotentialPassManagerType()) {
+          // Schedule analysis pass that is managed by a new manager.
+          schedulePass(AnalysisPass);
+          // Recheck analysis passes to ensure that required analyses that
+          // are already checked are still available.
+          checkAnalysis = true;
+        } else
+          // Do not schedule this analysis. Lower level analsyis
+          // passes are run on the fly.
+          delete AnalysisPass;
+      }
+    }
+  }
+
+  // Now all required passes are available.
+  if (ImmutablePass *IP = P->getAsImmutablePass()) {
+    // P is a immutable pass and it will be managed by this
+    // top level manager. Set up analysis resolver to connect them.
+    PMDataManager *DM = getAsPMDataManager();
+    AnalysisResolver *AR = new AnalysisResolver(*DM);
+    P->setResolver(AR);
+    DM->initializeAnalysisImpl(P);
+    addImmutablePass(IP);
+    DM->recordAvailableAnalysis(IP);
+    return;
+  }
+
+  if (PI && !PI->isAnalysis() && ShouldPrintBeforePass(PI)) {
+    Pass *PP = P->createPrinterPass(
+      dbgs(), std::string("*** IR Dump Before ") + P->getPassName() + " ***");
+    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
+  }
+
+  // Add the requested pass to the best available pass manager.
+  P->assignPassManager(activeStack, getTopLevelPassManagerType());
+
+  if (PI && !PI->isAnalysis() && ShouldPrintAfterPass(PI)) {
+    Pass *PP = P->createPrinterPass(
+      dbgs(), std::string("*** IR Dump After ") + P->getPassName() + " ***");
+    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
+  }
+}
+
+/// Find the pass that implements Analysis AID. Search immutable
+/// passes and all pass managers. If desired pass is not found
+/// then return NULL.
+Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
+
+  // Check pass managers
+  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    if (Pass *P = (*I)->findAnalysisPass(AID, false))
+      return P;
+
+  // Check other pass managers
+  for (SmallVectorImpl<PMDataManager *>::iterator
+         I = IndirectPassManagers.begin(),
+         E = IndirectPassManagers.end(); I != E; ++I)
+    if (Pass *P = (*I)->findAnalysisPass(AID, false))
+      return P;
+
+  // Check the immutable passes. Iterate in reverse order so that we find
+  // the most recently registered passes first.
+  for (SmallVector<ImmutablePass *, 8>::reverse_iterator I =
+       ImmutablePasses.rbegin(), E = ImmutablePasses.rend(); I != E; ++I) {
+    AnalysisID PI = (*I)->getPassID();
+    if (PI == AID)
+      return *I;
+
+    // If Pass not found then check the interfaces implemented by Immutable Pass
+    const PassInfo *PassInf =
+      PassRegistry::getPassRegistry()->getPassInfo(PI);
+    assert(PassInf && "Expected all immutable passes to be initialized");
+    const std::vector<const PassInfo*> &ImmPI =
+      PassInf->getInterfacesImplemented();
+    for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
+         EE = ImmPI.end(); II != EE; ++II) {
+      if ((*II)->getTypeInfo() == AID)
+        return *I;
+    }
+  }
+
+  return 0;
+}
+
+// Print passes managed by this top level manager.
+void PMTopLevelManager::dumpPasses() const {
+
+  if (PassDebugging < Structure)
+    return;
+
+  // Print out the immutable passes
+  for (unsigned i = 0, e = ImmutablePasses.size(); i != e; ++i) {
+    ImmutablePasses[i]->dumpPassStructure(0);
+  }
+
+  // Every class that derives from PMDataManager also derives from Pass
+  // (sometimes indirectly), but there's no inheritance relationship
+  // between PMDataManager and Pass, so we have to getAsPass to get
+  // from a PMDataManager* to a Pass*.
+  for (SmallVector<PMDataManager *, 8>::const_iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    (*I)->getAsPass()->dumpPassStructure(1);
+}
+
+void PMTopLevelManager::dumpArguments() const {
+
+  if (PassDebugging < Arguments)
+    return;
+
+  dbgs() << "Pass Arguments: ";
+  for (SmallVector<ImmutablePass *, 8>::const_iterator I =
+       ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
+    if (const PassInfo *PI =
+        PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
+      assert(PI && "Expected all immutable passes to be initialized");
+      if (!PI->isAnalysisGroup())
+        dbgs() << " -" << PI->getPassArgument();
+    }
+  for (SmallVector<PMDataManager *, 8>::const_iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    (*I)->dumpPassArguments();
+  dbgs() << "\n";
+}
+
+void PMTopLevelManager::initializeAllAnalysisInfo() {
+  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    (*I)->initializeAnalysisInfo();
+
+  // Initailize other pass managers
+  for (SmallVectorImpl<PMDataManager *>::iterator
+       I = IndirectPassManagers.begin(), E = IndirectPassManagers.end();
+       I != E; ++I)
+    (*I)->initializeAnalysisInfo();
+
+  for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
+        DME = LastUser.end(); DMI != DME; ++DMI) {
+    DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI =
+      InversedLastUser.find(DMI->second);
+    if (InvDMI != InversedLastUser.end()) {
+      SmallPtrSet<Pass *, 8> &L = InvDMI->second;
+      L.insert(DMI->first);
+    } else {
+      SmallPtrSet<Pass *, 8> L; L.insert(DMI->first);
+      InversedLastUser[DMI->second] = L;
+    }
+  }
+}
+
+/// Destructor
+PMTopLevelManager::~PMTopLevelManager() {
+  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    delete *I;
+
+  for (SmallVectorImpl<ImmutablePass *>::iterator
+         I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
+    delete *I;
+
+  for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(),
+         DME = AnUsageMap.end(); DMI != DME; ++DMI)
+    delete DMI->second;
+}
+
+//===----------------------------------------------------------------------===//
+// PMDataManager implementation
+
+/// Augement AvailableAnalysis by adding analysis made available by pass P.
+void PMDataManager::recordAvailableAnalysis(Pass *P) {
+  AnalysisID PI = P->getPassID();
+
+  AvailableAnalysis[PI] = P;
+
+  assert(!AvailableAnalysis.empty());
+
+  // This pass is the current implementation of all of the interfaces it
+  // implements as well.
+  const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
+  if (PInf == 0) return;
+  const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
+  for (unsigned i = 0, e = II.size(); i != e; ++i)
+    AvailableAnalysis[II[i]->getTypeInfo()] = P;
+}
+
+// Return true if P preserves high level analysis used by other
+// passes managed by this manager
+bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  if (AnUsage->getPreservesAll())
+    return true;
+
+  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+  for (SmallVectorImpl<Pass *>::iterator I = HigherLevelAnalysis.begin(),
+         E = HigherLevelAnalysis.end(); I  != E; ++I) {
+    Pass *P1 = *I;
+    if (P1->getAsImmutablePass() == 0 &&
+        std::find(PreservedSet.begin(), PreservedSet.end(),
+                  P1->getPassID()) ==
+           PreservedSet.end())
+      return false;
+  }
+
+  return true;
+}
+
+/// verifyPreservedAnalysis -- Verify analysis preserved by pass P.
+void PMDataManager::verifyPreservedAnalysis(Pass *P) {
+  // Don't do this unless assertions are enabled.
+#ifdef NDEBUG
+  return;
+#endif
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+
+  // Verify preserved analysis
+  for (AnalysisUsage::VectorType::const_iterator I = PreservedSet.begin(),
+         E = PreservedSet.end(); I != E; ++I) {
+    AnalysisID AID = *I;
+    if (Pass *AP = findAnalysisPass(AID, true)) {
+      TimeRegion PassTimer(getPassTimer(AP));
+      AP->verifyAnalysis();
+    }
+  }
+}
+
+/// Remove Analysis not preserved by Pass P
+void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  if (AnUsage->getPreservesAll())
+    return;
+
+  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+  for (DenseMap<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(),
+         E = AvailableAnalysis.end(); I != E; ) {
+    DenseMap<AnalysisID, Pass*>::iterator Info = I++;
+    if (Info->second->getAsImmutablePass() == 0 &&
+        std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+        PreservedSet.end()) {
+      // Remove this analysis
+      if (PassDebugging >= Details) {
+        Pass *S = Info->second;
+        dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
+        dbgs() << S->getPassName() << "'\n";
+      }
+      AvailableAnalysis.erase(Info);
+    }
+  }
+
+  // Check inherited analysis also. If P is not preserving analysis
+  // provided by parent manager then remove it here.
+  for (unsigned Index = 0; Index < PMT_Last; ++Index) {
+
+    if (!InheritedAnalysis[Index])
+      continue;
+
+    for (DenseMap<AnalysisID, Pass*>::iterator
+           I = InheritedAnalysis[Index]->begin(),
+           E = InheritedAnalysis[Index]->end(); I != E; ) {
+      DenseMap<AnalysisID, Pass *>::iterator Info = I++;
+      if (Info->second->getAsImmutablePass() == 0 &&
+          std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+             PreservedSet.end()) {
+        // Remove this analysis
+        if (PassDebugging >= Details) {
+          Pass *S = Info->second;
+          dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
+          dbgs() << S->getPassName() << "'\n";
+        }
+        InheritedAnalysis[Index]->erase(Info);
+      }
+    }
+  }
+}
+
+/// Remove analysis passes that are not used any longer
+void PMDataManager::removeDeadPasses(Pass *P, StringRef Msg,
+                                     enum PassDebuggingString DBG_STR) {
+
+  SmallVector<Pass *, 12> DeadPasses;
+
+  // If this is a on the fly manager then it does not have TPM.
+  if (!TPM)
+    return;
+
+  TPM->collectLastUses(DeadPasses, P);
+
+  if (PassDebugging >= Details && !DeadPasses.empty()) {
+    dbgs() << " -*- '" <<  P->getPassName();
+    dbgs() << "' is the last user of following pass instances.";
+    dbgs() << " Free these instances\n";
+  }
+
+  for (SmallVectorImpl<Pass *>::iterator I = DeadPasses.begin(),
+         E = DeadPasses.end(); I != E; ++I)
+    freePass(*I, Msg, DBG_STR);
+}
+
+void PMDataManager::freePass(Pass *P, StringRef Msg,
+                             enum PassDebuggingString DBG_STR) {
+  dumpPassInfo(P, FREEING_MSG, DBG_STR, Msg);
+
+  {
+    // If the pass crashes releasing memory, remember this.
+    PassManagerPrettyStackEntry X(P);
+    TimeRegion PassTimer(getPassTimer(P));
+
+    P->releaseMemory();
+  }
+
+  AnalysisID PI = P->getPassID();
+  if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) {
+    // Remove the pass itself (if it is not already removed).
+    AvailableAnalysis.erase(PI);
+
+    // Remove all interfaces this pass implements, for which it is also
+    // listed as the available implementation.
+    const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
+    for (unsigned i = 0, e = II.size(); i != e; ++i) {
+      DenseMap<AnalysisID, Pass*>::iterator Pos =
+        AvailableAnalysis.find(II[i]->getTypeInfo());
+      if (Pos != AvailableAnalysis.end() && Pos->second == P)
+        AvailableAnalysis.erase(Pos);
+    }
+  }
+}
+
+/// Add pass P into the PassVector. Update
+/// AvailableAnalysis appropriately if ProcessAnalysis is true.
+void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
+  // This manager is going to manage pass P. Set up analysis resolver
+  // to connect them.
+  AnalysisResolver *AR = new AnalysisResolver(*this);
+  P->setResolver(AR);
+
+  // If a FunctionPass F is the last user of ModulePass info M
+  // then the F's manager, not F, records itself as a last user of M.
+  SmallVector<Pass *, 12> TransferLastUses;
+
+  if (!ProcessAnalysis) {
+    // Add pass
+    PassVector.push_back(P);
+    return;
+  }
+
+  // At the moment, this pass is the last user of all required passes.
+  SmallVector<Pass *, 12> LastUses;
+  SmallVector<Pass *, 8> RequiredPasses;
+  SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable;
+
+  unsigned PDepth = this->getDepth();
+
+  collectRequiredAnalysis(RequiredPasses,
+                          ReqAnalysisNotAvailable, P);
+  for (SmallVectorImpl<Pass *>::iterator I = RequiredPasses.begin(),
+         E = RequiredPasses.end(); I != E; ++I) {
+    Pass *PRequired = *I;
+    unsigned RDepth = 0;
+
+    assert(PRequired->getResolver() && "Analysis Resolver is not set");
+    PMDataManager &DM = PRequired->getResolver()->getPMDataManager();
+    RDepth = DM.getDepth();
+
+    if (PDepth == RDepth)
+      LastUses.push_back(PRequired);
+    else if (PDepth > RDepth) {
+      // Let the parent claim responsibility of last use
+      TransferLastUses.push_back(PRequired);
+      // Keep track of higher level analysis used by this manager.
+      HigherLevelAnalysis.push_back(PRequired);
+    } else
+      llvm_unreachable("Unable to accommodate Required Pass");
+  }
+
+  // Set P as P's last user until someone starts using P.
+  // However, if P is a Pass Manager then it does not need
+  // to record its last user.
+  if (P->getAsPMDataManager() == 0)
+    LastUses.push_back(P);
+  TPM->setLastUser(LastUses, P);
+
+  if (!TransferLastUses.empty()) {
+    Pass *My_PM = getAsPass();
+    TPM->setLastUser(TransferLastUses, My_PM);
+    TransferLastUses.clear();
+  }
+
+  // Now, take care of required analyses that are not available.
+  for (SmallVectorImpl<AnalysisID>::iterator
+         I = ReqAnalysisNotAvailable.begin(),
+         E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
+    const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+    Pass *AnalysisPass = PI->createPass();
+    this->addLowerLevelRequiredPass(P, AnalysisPass);
+  }
+
+  // Take a note of analysis required and made available by this pass.
+  // Remove the analysis not preserved by this pass
+  removeNotPreservedAnalysis(P);
+  recordAvailableAnalysis(P);
+
+  // Add pass
+  PassVector.push_back(P);
+}
+
+
+/// Populate RP with analysis pass that are required by
+/// pass P and are available. Populate RP_NotAvail with analysis
+/// pass that are required by pass P but are not available.
+void PMDataManager::collectRequiredAnalysis(SmallVectorImpl<Pass *> &RP,
+                                       SmallVectorImpl<AnalysisID> &RP_NotAvail,
+                                            Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+  for (AnalysisUsage::VectorType::const_iterator
+         I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) {
+    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+      RP.push_back(AnalysisPass);
+    else
+      RP_NotAvail.push_back(*I);
+  }
+
+  const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
+  for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
+         E = IDs.end(); I != E; ++I) {
+    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+      RP.push_back(AnalysisPass);
+    else
+      RP_NotAvail.push_back(*I);
+  }
+}
+
+// All Required analyses should be available to the pass as it runs!  Here
+// we fill in the AnalysisImpls member of the pass so that it can
+// successfully use the getAnalysis() method to retrieve the
+// implementations it needs.
+//
+void PMDataManager::initializeAnalysisImpl(Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+
+  for (AnalysisUsage::VectorType::const_iterator
+         I = AnUsage->getRequiredSet().begin(),
+         E = AnUsage->getRequiredSet().end(); I != E; ++I) {
+    Pass *Impl = findAnalysisPass(*I, true);
+    if (Impl == 0)
+      // This may be analysis pass that is initialized on the fly.
+      // If that is not the case then it will raise an assert when it is used.
+      continue;
+    AnalysisResolver *AR = P->getResolver();
+    assert(AR && "Analysis Resolver is not set");
+    AR->addAnalysisImplsPair(*I, Impl);
+  }
+}
+
+/// Find the pass that implements Analysis AID. If desired pass is not found
+/// then return NULL.
+Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
+
+  // Check if AvailableAnalysis map has one entry.
+  DenseMap<AnalysisID, Pass*>::const_iterator I =  AvailableAnalysis.find(AID);
+
+  if (I != AvailableAnalysis.end())
+    return I->second;
+
+  // Search Parents through TopLevelManager
+  if (SearchParent)
+    return TPM->findAnalysisPass(AID);
+
+  return NULL;
+}
+
+// Print list of passes that are last used by P.
+void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{
+
+  SmallVector<Pass *, 12> LUses;
+
+  // If this is a on the fly manager then it does not have TPM.
+  if (!TPM)
+    return;
+
+  TPM->collectLastUses(LUses, P);
+
+  for (SmallVectorImpl<Pass *>::iterator I = LUses.begin(),
+         E = LUses.end(); I != E; ++I) {
+    llvm::dbgs() << "--" << std::string(Offset*2, ' ');
+    (*I)->dumpPassStructure(0);
+  }
+}
+
+void PMDataManager::dumpPassArguments() const {
+  for (SmallVectorImpl<Pass *>::const_iterator I = PassVector.begin(),
+        E = PassVector.end(); I != E; ++I) {
+    if (PMDataManager *PMD = (*I)->getAsPMDataManager())
+      PMD->dumpPassArguments();
+    else
+      if (const PassInfo *PI =
+            PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
+        if (!PI->isAnalysisGroup())
+          dbgs() << " -" << PI->getPassArgument();
+  }
+}
+
+void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
+                                 enum PassDebuggingString S2,
+                                 StringRef Msg) {
+  if (PassDebugging < Executions)
+    return;
+  dbgs() << (void*)this << std::string(getDepth()*2+1, ' ');
+  switch (S1) {
+  case EXECUTION_MSG:
+    dbgs() << "Executing Pass '" << P->getPassName();
+    break;
+  case MODIFICATION_MSG:
+    dbgs() << "Made Modification '" << P->getPassName();
+    break;
+  case FREEING_MSG:
+    dbgs() << " Freeing Pass '" << P->getPassName();
+    break;
+  default:
+    break;
+  }
+  switch (S2) {
+  case ON_BASICBLOCK_MSG:
+    dbgs() << "' on BasicBlock '" << Msg << "'...\n";
+    break;
+  case ON_FUNCTION_MSG:
+    dbgs() << "' on Function '" << Msg << "'...\n";
+    break;
+  case ON_MODULE_MSG:
+    dbgs() << "' on Module '"  << Msg << "'...\n";
+    break;
+  case ON_REGION_MSG:
+    dbgs() << "' on Region '"  << Msg << "'...\n";
+    break;
+  case ON_LOOP_MSG:
+    dbgs() << "' on Loop '" << Msg << "'...\n";
+    break;
+  case ON_CG_MSG:
+    dbgs() << "' on Call Graph Nodes '" << Msg << "'...\n";
+    break;
+  default:
+    break;
+  }
+}
+
+void PMDataManager::dumpRequiredSet(const Pass *P) const {
+  if (PassDebugging < Details)
+    return;
+
+  AnalysisUsage analysisUsage;
+  P->getAnalysisUsage(analysisUsage);
+  dumpAnalysisUsage("Required", P, analysisUsage.getRequiredSet());
+}
+
+void PMDataManager::dumpPreservedSet(const Pass *P) const {
+  if (PassDebugging < Details)
+    return;
+
+  AnalysisUsage analysisUsage;
+  P->getAnalysisUsage(analysisUsage);
+  dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet());
+}
+
+void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
+                                   const AnalysisUsage::VectorType &Set) const {
+  assert(PassDebugging >= Details);
+  if (Set.empty())
+    return;
+  dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
+  for (unsigned i = 0; i != Set.size(); ++i) {
+    if (i) dbgs() << ',';
+    const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
+    if (!PInf) {
+      // Some preserved passes, such as AliasAnalysis, may not be initialized by
+      // all drivers.
+      dbgs() << " Uninitialized Pass";
+      continue;
+    }
+    dbgs() << ' ' << PInf->getPassName();
+  }
+  dbgs() << '\n';
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+/// This should be handled by specific pass manager.
+void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+  if (TPM) {
+    TPM->dumpArguments();
+    TPM->dumpPasses();
+  }
+
+  // Module Level pass may required Function Level analysis info
+  // (e.g. dominator info). Pass manager uses on the fly function pass manager
+  // to provide this on demand. In that case, in Pass manager terminology,
+  // module level pass is requiring lower level analysis info managed by
+  // lower level pass manager.
+
+  // When Pass manager is not able to order required analysis info, Pass manager
+  // checks whether any lower level manager will be able to provide this
+  // analysis info on demand or not.
+#ifndef NDEBUG
+  dbgs() << "Unable to schedule '" << RequiredPass->getPassName();
+  dbgs() << "' required by '" << P->getPassName() << "'\n";
+#endif
+  llvm_unreachable("Unable to schedule pass");
+}
+
+Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
+  llvm_unreachable("Unable to find on the fly pass");
+}
+
+// Destructor
+PMDataManager::~PMDataManager() {
+  for (SmallVectorImpl<Pass *>::iterator I = PassVector.begin(),
+         E = PassVector.end(); I != E; ++I)
+    delete *I;
+}
+
+//===----------------------------------------------------------------------===//
+// NOTE: Is this the right place to define this method ?
+// getAnalysisIfAvailable - Return analysis result or null if it doesn't exist.
+Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
+  return PM.findAnalysisPass(ID, dir);
+}
+
+Pass *AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI,
+                                     Function &F) {
+  return PM.getOnTheFlyPass(P, AnalysisPI, F);
+}
+
+//===----------------------------------------------------------------------===//
+// BBPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnBasicBlock method.  Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool BBPassManager::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  bool Changed = doInitialization(F);
+
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      BasicBlockPass *BP = getContainedPass(Index);
+      bool LocalChanged = false;
+
+      dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName());
+      dumpRequiredSet(BP);
+
+      initializeAnalysisImpl(BP);
+
+      {
+        // If the pass crashes, remember this.
+        PassManagerPrettyStackEntry X(BP, *I);
+        TimeRegion PassTimer(getPassTimer(BP));
+
+        LocalChanged |= BP->runOnBasicBlock(*I);
+      }
+
+      Changed |= LocalChanged;
+      if (LocalChanged)
+        dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
+                     I->getName());
+      dumpPreservedSet(BP);
+
+      verifyPreservedAnalysis(BP);
+      removeNotPreservedAnalysis(BP);
+      recordAvailableAnalysis(BP);
+      removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG);
+    }
+
+  return doFinalization(F) || Changed;
+}
+
+// Implement doInitialization and doFinalization
+bool BBPassManager::doInitialization(Module &M) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+    Changed |= getContainedPass(Index)->doInitialization(M);
+
+  return Changed;
+}
+
+bool BBPassManager::doFinalization(Module &M) {
+  bool Changed = false;
+
+  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+    Changed |= getContainedPass(Index)->doFinalization(M);
+
+  return Changed;
+}
+
+bool BBPassManager::doInitialization(Function &F) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    BasicBlockPass *BP = getContainedPass(Index);
+    Changed |= BP->doInitialization(F);
+  }
+
+  return Changed;
+}
+
+bool BBPassManager::doFinalization(Function &F) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    BasicBlockPass *BP = getContainedPass(Index);
+    Changed |= BP->doFinalization(F);
+  }
+
+  return Changed;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManager implementation
+
+/// Create new Function pass manager
+FunctionPassManager::FunctionPassManager(Module *m) : M(m) {
+  FPM = new FunctionPassManagerImpl();
+  // FPM is the top level manager.
+  FPM->setTopLevelManager(FPM);
+
+  AnalysisResolver *AR = new AnalysisResolver(*FPM);
+  FPM->setResolver(AR);
+}
+
+FunctionPassManager::~FunctionPassManager() {
+  delete FPM;
+}
+
+/// add - Add a pass to the queue of passes to run.  This passes
+/// ownership of the Pass to the PassManager.  When the
+/// PassManager_X is destroyed, the pass will be destroyed as well, so
+/// there is no need to delete the pass. (TODO delete passes.)
+/// This implies that all passes MUST be allocated with 'new'.
+void FunctionPassManager::add(Pass *P) {
+  FPM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution.  Keep
+/// track of whether any of the passes modifies the function, and if
+/// so, return true.
+///
+bool FunctionPassManager::run(Function &F) {
+  if (F.isMaterializable()) {
+    std::string errstr;
+    if (F.Materialize(&errstr))
+      report_fatal_error("Error reading bitcode file: " + Twine(errstr));
+  }
+  return FPM->run(F);
+}
+
+
+/// doInitialization - Run all of the initializers for the function passes.
+///
+bool FunctionPassManager::doInitialization() {
+  return FPM->doInitialization(*M);
+}
+
+/// doFinalization - Run all of the finalizers for the function passes.
+///
+bool FunctionPassManager::doFinalization() {
+  return FPM->doFinalization(*M);
+}
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl implementation
+//
+bool FunctionPassManagerImpl::doInitialization(Module &M) {
+  bool Changed = false;
+
+  dumpArguments();
+  dumpPasses();
+
+  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doInitialization(M);
+  }
+
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    Changed |= getContainedManager(Index)->doInitialization(M);
+
+  return Changed;
+}
+
+bool FunctionPassManagerImpl::doFinalization(Module &M) {
+  bool Changed = false;
+
+  for (int Index = getNumContainedManagers() - 1; Index >= 0; --Index)
+    Changed |= getContainedManager(Index)->doFinalization(M);
+
+  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doFinalization(M);
+  }
+
+  return Changed;
+}
+
+/// cleanup - After running all passes, clean up pass manager cache.
+void FPPassManager::cleanup() {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    FunctionPass *FP = getContainedPass(Index);
+    AnalysisResolver *AR = FP->getResolver();
+    assert(AR && "Analysis Resolver is not set");
+    AR->clearAnalysisImpls();
+ }
+}
+
+void FunctionPassManagerImpl::releaseMemoryOnTheFly() {
+  if (!wasRun)
+    return;
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
+    FPPassManager *FPPM = getContainedManager(Index);
+    for (unsigned Index = 0; Index < FPPM->getNumContainedPasses(); ++Index) {
+      FPPM->getContainedPass(Index)->releaseMemory();
+    }
+  }
+  wasRun = false;
+}
+
+// Execute all the passes managed by this top level manager.
+// Return true if any function is modified by a pass.
+bool FunctionPassManagerImpl::run(Function &F) {
+  bool Changed = false;
+  TimingInfo::createTheTimeInfo();
+
+  initializeAllAnalysisInfo();
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    Changed |= getContainedManager(Index)->runOnFunction(F);
+
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    getContainedManager(Index)->cleanup();
+
+  wasRun = true;
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// FPPassManager implementation
+
+char FPPassManager::ID = 0;
+/// Print passes managed by this manager
+void FPPassManager::dumpPassStructure(unsigned Offset) {
+  dbgs().indent(Offset*2) << "FunctionPass Manager\n";
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    FunctionPass *FP = getContainedPass(Index);
+    FP->dumpPassStructure(Offset + 1);
+    dumpLastUses(FP, Offset+1);
+  }
+}
+
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnFunction method.  Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool FPPassManager::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  bool Changed = false;
+
+  // Collect inherited analysis from Module level pass manager.
+  populateInheritedAnalysis(TPM->activeStack);
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    FunctionPass *FP = getContainedPass(Index);
+    bool LocalChanged = false;
+
+    dumpPassInfo(FP, EXECUTION_MSG, ON_FUNCTION_MSG, F.getName());
+    dumpRequiredSet(FP);
+
+    initializeAnalysisImpl(FP);
+
+    {
+      PassManagerPrettyStackEntry X(FP, F);
+      TimeRegion PassTimer(getPassTimer(FP));
+
+      LocalChanged |= FP->runOnFunction(F);
+    }
+
+    Changed |= LocalChanged;
+    if (LocalChanged)
+      dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getName());
+    dumpPreservedSet(FP);
+
+    verifyPreservedAnalysis(FP);
+    removeNotPreservedAnalysis(FP);
+    recordAvailableAnalysis(FP);
+    removeDeadPasses(FP, F.getName(), ON_FUNCTION_MSG);
+  }
+  return Changed;
+}
+
+bool FPPassManager::runOnModule(Module &M) {
+  bool Changed = false;
+
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    Changed |= runOnFunction(*I);
+
+  return Changed;
+}
+
+bool FPPassManager::doInitialization(Module &M) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+    Changed |= getContainedPass(Index)->doInitialization(M);
+  
+  return Changed;
+}
+
+bool FPPassManager::doFinalization(Module &M) {
+  bool Changed = false;
+
+  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+    Changed |= getContainedPass(Index)->doFinalization(M);
+  
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// MPPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnModule method.  Keep track of whether any of the passes modifies
+/// the module, and if so, return true.
+bool
+MPPassManager::runOnModule(Module &M) {
+  bool Changed = false;
+
+  // Initialize on-the-fly passes
+  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+       I != E; ++I) {
+    FunctionPassManagerImpl *FPP = I->second;
+    Changed |= FPP->doInitialization(M);
+  }
+
+  // Initialize module passes
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+    Changed |= getContainedPass(Index)->doInitialization(M);
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    ModulePass *MP = getContainedPass(Index);
+    bool LocalChanged = false;
+
+    dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier());
+    dumpRequiredSet(MP);
+
+    initializeAnalysisImpl(MP);
+
+    {
+      PassManagerPrettyStackEntry X(MP, M);
+      TimeRegion PassTimer(getPassTimer(MP));
+
+      LocalChanged |= MP->runOnModule(M);
+    }
+
+    Changed |= LocalChanged;
+    if (LocalChanged)
+      dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG,
+                   M.getModuleIdentifier());
+    dumpPreservedSet(MP);
+
+    verifyPreservedAnalysis(MP);
+    removeNotPreservedAnalysis(MP);
+    recordAvailableAnalysis(MP);
+    removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG);
+  }
+
+  // Finalize module passes
+  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+    Changed |= getContainedPass(Index)->doFinalization(M);
+
+  // Finalize on-the-fly passes
+  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+       I != E; ++I) {
+    FunctionPassManagerImpl *FPP = I->second;
+    // We don't know when is the last time an on-the-fly pass is run,
+    // so we need to releaseMemory / finalize here
+    FPP->releaseMemoryOnTheFly();
+    Changed |= FPP->doFinalization(M);
+  }
+  
+  return Changed;
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+  assert(P->getPotentialPassManagerType() == PMT_ModulePassManager &&
+         "Unable to handle Pass that requires lower level Analysis pass");
+  assert((P->getPotentialPassManagerType() <
+          RequiredPass->getPotentialPassManagerType()) &&
+         "Unable to handle Pass that requires lower level Analysis pass");
+
+  FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
+  if (!FPP) {
+    FPP = new FunctionPassManagerImpl();
+    // FPP is the top level manager.
+    FPP->setTopLevelManager(FPP);
+
+    OnTheFlyManagers[P] = FPP;
+  }
+  FPP->add(RequiredPass);
+
+  // Register P as the last user of RequiredPass.
+  if (RequiredPass) {
+    SmallVector<Pass *, 1> LU;
+    LU.push_back(RequiredPass);
+    FPP->setLastUser(LU,  P);
+  }
+}
+
+/// Return function pass corresponding to PassInfo PI, that is
+/// required by module pass MP. Instantiate analysis pass, by using
+/// its runOnFunction() for function F.
+Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){
+  FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
+  assert(FPP && "Unable to find on the fly pass");
+
+  FPP->releaseMemoryOnTheFly();
+  FPP->run(F);
+  return ((PMTopLevelManager*)FPP)->findAnalysisPass(PI);
+}
+
+
+//===----------------------------------------------------------------------===//
+// PassManagerImpl implementation
+
+//
+/// run - Execute all of the passes scheduled for execution.  Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManagerImpl::run(Module &M) {
+  bool Changed = false;
+  TimingInfo::createTheTimeInfo();
+
+  dumpArguments();
+  dumpPasses();
+
+  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doInitialization(M);
+  }
+
+  initializeAllAnalysisInfo();
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    Changed |= getContainedManager(Index)->runOnModule(M);
+
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doFinalization(M);
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// PassManager implementation
+
+/// Create new pass manager
+PassManager::PassManager() {
+  PM = new PassManagerImpl();
+  // PM is the top level manager
+  PM->setTopLevelManager(PM);
+}
+
+PassManager::~PassManager() {
+  delete PM;
+}
+
+/// add - Add a pass to the queue of passes to run.  This passes ownership of
+/// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+/// will be destroyed as well, so there is no need to delete the pass.  This
+/// implies that all passes MUST be allocated with 'new'.
+void PassManager::add(Pass *P) {
+  PM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution.  Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManager::run(Module &M) {
+  return PM->run(M);
+}
+
+//===----------------------------------------------------------------------===//
+// TimingInfo implementation
+
+bool llvm::TimePassesIsEnabled = false;
+static cl::opt<bool,true>
+EnableTiming("time-passes", cl::location(TimePassesIsEnabled),
+            cl::desc("Time each pass, printing elapsed time for each on exit"));
+
+// createTheTimeInfo - This method either initializes the TheTimeInfo pointer to
+// a non null value (if the -time-passes option is enabled) or it leaves it
+// null.  It may be called multiple times.
+void TimingInfo::createTheTimeInfo() {
+  if (!TimePassesIsEnabled || TheTimeInfo) return;
+
+  // Constructed the first time this is called, iff -time-passes is enabled.
+  // This guarantees that the object will be constructed before static globals,
+  // thus it will be destroyed before them.
+  static ManagedStatic<TimingInfo> TTI;
+  TheTimeInfo = &*TTI;
+}
+
+/// If TimingInfo is enabled then start pass timer.
+Timer *llvm::getPassTimer(Pass *P) {
+  if (TheTimeInfo)
+    return TheTimeInfo->getPassTimer(P);
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// PMStack implementation
+//
+
+// Pop Pass Manager from the stack and clear its analysis info.
+void PMStack::pop() {
+
+  PMDataManager *Top = this->top();
+  Top->initializeAnalysisInfo();
+
+  S.pop_back();
+}
+
+// Push PM on the stack and set its top level manager.
+void PMStack::push(PMDataManager *PM) {
+  assert(PM && "Unable to push. Pass Manager expected");
+  assert(PM->getDepth()==0 && "Pass Manager depth set too early");
+
+  if (!this->empty()) {
+    assert(PM->getPassManagerType() > this->top()->getPassManagerType()
+           && "pushing bad pass manager to PMStack");
+    PMTopLevelManager *TPM = this->top()->getTopLevelManager();
+
+    assert(TPM && "Unable to find top level manager");
+    TPM->addIndirectPassManager(PM);
+    PM->setTopLevelManager(TPM);
+    PM->setDepth(this->top()->getDepth()+1);
+  } else {
+    assert((PM->getPassManagerType() == PMT_ModulePassManager
+           || PM->getPassManagerType() == PMT_FunctionPassManager)
+           && "pushing bad pass manager to PMStack");
+    PM->setDepth(1);
+  }
+
+  S.push_back(PM);
+}
+
+// Dump content of the pass manager stack.
+void PMStack::dump() const {
+  for (std::vector<PMDataManager *>::const_iterator I = S.begin(),
+         E = S.end(); I != E; ++I)
+    dbgs() << (*I)->getAsPass()->getPassName() << ' ';
+
+  if (!S.empty())
+    dbgs() << '\n';
+}
+
+/// Find appropriate Module Pass Manager in the PM Stack and
+/// add self into that manager.
+void ModulePass::assignPassManager(PMStack &PMS,
+                                   PassManagerType PreferredType) {
+  // Find Module Pass Manager
+  while (!PMS.empty()) {
+    PassManagerType TopPMType = PMS.top()->getPassManagerType();
+    if (TopPMType == PreferredType)
+      break; // We found desired pass manager
+    else if (TopPMType > PMT_ModulePassManager)
+      PMS.pop();    // Pop children pass managers
+    else
+      break;
+  }
+  assert(!PMS.empty() && "Unable to find appropriate Pass Manager");
+  PMS.top()->add(this);
+}
+
+/// Find appropriate Function Pass Manager or Call Graph Pass Manager
+/// in the PM Stack and add self into that manager.
+void FunctionPass::assignPassManager(PMStack &PMS,
+                                     PassManagerType PreferredType) {
+
+  // Find Function Pass Manager
+  while (!PMS.empty()) {
+    if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager)
+      PMS.pop();
+    else
+      break;
+  }
+
+  // Create new Function Pass Manager if needed.
+  FPPassManager *FPP;
+  if (PMS.top()->getPassManagerType() == PMT_FunctionPassManager) {
+    FPP = (FPPassManager *)PMS.top();
+  } else {
+    assert(!PMS.empty() && "Unable to create Function Pass Manager");
+    PMDataManager *PMD = PMS.top();
+
+    // [1] Create new Function Pass Manager
+    FPP = new FPPassManager();
+    FPP->populateInheritedAnalysis(PMS);
+
+    // [2] Set up new manager's top level manager
+    PMTopLevelManager *TPM = PMD->getTopLevelManager();
+    TPM->addIndirectPassManager(FPP);
+
+    // [3] Assign manager to manage this new manager. This may create
+    // and push new managers into PMS
+    FPP->assignPassManager(PMS, PMD->getPassManagerType());
+
+    // [4] Push new manager into PMS
+    PMS.push(FPP);
+  }
+
+  // Assign FPP as the manager of this pass.
+  FPP->add(this);
+}
+
+/// Find appropriate Basic Pass Manager or Call Graph Pass Manager
+/// in the PM Stack and add self into that manager.
+void BasicBlockPass::assignPassManager(PMStack &PMS,
+                                       PassManagerType PreferredType) {
+  BBPassManager *BBP;
+
+  // Basic Pass Manager is a leaf pass manager. It does not handle
+  // any other pass manager.
+  if (!PMS.empty() &&
+      PMS.top()->getPassManagerType() == PMT_BasicBlockPassManager) {
+    BBP = (BBPassManager *)PMS.top();
+  } else {
+    // If leaf manager is not Basic Block Pass manager then create new
+    // basic Block Pass manager.
+    assert(!PMS.empty() && "Unable to create BasicBlock Pass Manager");
+    PMDataManager *PMD = PMS.top();
+
+    // [1] Create new Basic Block Manager
+    BBP = new BBPassManager();
+
+    // [2] Set up new manager's top level manager
+    // Basic Block Pass Manager does not live by itself
+    PMTopLevelManager *TPM = PMD->getTopLevelManager();
+    TPM->addIndirectPassManager(BBP);
+
+    // [3] Assign manager to manage this new manager. This may create
+    // and push new managers into PMS
+    BBP->assignPassManager(PMS, PreferredType);
+
+    // [4] Push new manager into PMS
+    PMS.push(BBP);
+  }
+
+  // Assign BBP as the manager of this pass.
+  BBP->add(this);
+}
+
+PassManagerBase::~PassManagerBase() {}
diff --git a/contrib/llvm/lib/IR/PassRegistry.cpp b/contrib/llvm/lib/IR/PassRegistry.cpp
new file mode 100644
index 000000000000..a0b64ed78f5f
--- /dev/null
+++ b/contrib/llvm/lib/IR/PassRegistry.cpp
@@ -0,0 +1,209 @@
+//===- PassRegistry.cpp - Pass Registration Implementation ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PassRegistry, with which passes are registered on
+// initialization, and supports the PassManager in dependency resolution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/PassRegistry.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Function.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include <vector>
+
+using namespace llvm;
+
+// FIXME: We use ManagedStatic to erase the pass registrar on shutdown.
+// Unfortunately, passes are registered with static ctors, and having
+// llvm_shutdown clear this map prevents successful resurrection after
+// llvm_shutdown is run.  Ideally we should find a solution so that we don't
+// leak the map, AND can still resurrect after shutdown.
+static ManagedStatic<PassRegistry> PassRegistryObj;
+PassRegistry *PassRegistry::getPassRegistry() {
+  return &*PassRegistryObj;
+}
+
+static ManagedStatic<sys::SmartMutex<true> > Lock;
+
+//===----------------------------------------------------------------------===//
+// PassRegistryImpl
+//
+
+namespace {
+struct PassRegistryImpl {
+  /// PassInfoMap - Keep track of the PassInfo object for each registered pass.
+  typedef DenseMap<const void*, const PassInfo*> MapType;
+  MapType PassInfoMap;
+  
+  typedef StringMap<const PassInfo*> StringMapType;
+  StringMapType PassInfoStringMap;
+  
+  /// AnalysisGroupInfo - Keep track of information for each analysis group.
+  struct AnalysisGroupInfo {
+    SmallPtrSet<const PassInfo *, 8> Implementations;
+  };
+  DenseMap<const PassInfo*, AnalysisGroupInfo> AnalysisGroupInfoMap;
+  
+  std::vector<const PassInfo*> ToFree;
+  std::vector<PassRegistrationListener*> Listeners;
+};
+} // end anonymous namespace
+
+void *PassRegistry::getImpl() const {
+  if (!pImpl)
+    pImpl = new PassRegistryImpl();
+  return pImpl;
+}
+
+//===----------------------------------------------------------------------===//
+// Accessors
+//
+
+PassRegistry::~PassRegistry() {
+  sys::SmartScopedLock<true> Guard(*Lock);
+  PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(pImpl);
+  
+  for (std::vector<const PassInfo*>::iterator I = Impl->ToFree.begin(),
+       E = Impl->ToFree.end(); I != E; ++I)
+    delete *I;
+  
+  delete Impl;
+  pImpl = 0;
+}
+
+const PassInfo *PassRegistry::getPassInfo(const void *TI) const {
+  sys::SmartScopedLock<true> Guard(*Lock);
+  PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+  PassRegistryImpl::MapType::const_iterator I = Impl->PassInfoMap.find(TI);
+  return I != Impl->PassInfoMap.end() ? I->second : 0;
+}
+
+const PassInfo *PassRegistry::getPassInfo(StringRef Arg) const {
+  sys::SmartScopedLock<true> Guard(*Lock);
+  PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+  PassRegistryImpl::StringMapType::const_iterator
+    I = Impl->PassInfoStringMap.find(Arg);
+  return I != Impl->PassInfoStringMap.end() ? I->second : 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Registration mechanism
+//
+
+void PassRegistry::registerPass(const PassInfo &PI, bool ShouldFree) {
+  sys::SmartScopedLock<true> Guard(*Lock);
+  PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+  bool Inserted =
+    Impl->PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second;
+  assert(Inserted && "Pass registered multiple times!");
+  (void)Inserted;
+  Impl->PassInfoStringMap[PI.getPassArgument()] = &PI;
+  
+  // Notify any listeners.
+  for (std::vector<PassRegistrationListener*>::iterator
+       I = Impl->Listeners.begin(), E = Impl->Listeners.end(); I != E; ++I)
+    (*I)->passRegistered(&PI);
+  
+  if (ShouldFree) Impl->ToFree.push_back(&PI);
+}
+
+void PassRegistry::unregisterPass(const PassInfo &PI) {
+  sys::SmartScopedLock<true> Guard(*Lock);
+  PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+  PassRegistryImpl::MapType::iterator I = 
+    Impl->PassInfoMap.find(PI.getTypeInfo());
+  assert(I != Impl->PassInfoMap.end() && "Pass registered but not in map!");
+  
+  // Remove pass from the map.
+  Impl->PassInfoMap.erase(I);
+  Impl->PassInfoStringMap.erase(PI.getPassArgument());
+}
+
+void PassRegistry::enumerateWith(PassRegistrationListener *L) {
+  sys::SmartScopedLock<true> Guard(*Lock);
+  PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+  for (PassRegistryImpl::MapType::const_iterator I = Impl->PassInfoMap.begin(),
+       E = Impl->PassInfoMap.end(); I != E; ++I)
+    L->passEnumerate(I->second);
+}
+
+
+/// Analysis Group Mechanisms.
+void PassRegistry::registerAnalysisGroup(const void *InterfaceID, 
+                                         const void *PassID,
+                                         PassInfo& Registeree,
+                                         bool isDefault,
+                                         bool ShouldFree) {
+  PassInfo *InterfaceInfo =  const_cast<PassInfo*>(getPassInfo(InterfaceID));
+  if (InterfaceInfo == 0) {
+    // First reference to Interface, register it now.
+    registerPass(Registeree);
+    InterfaceInfo = &Registeree;
+  }
+  assert(Registeree.isAnalysisGroup() && 
+         "Trying to join an analysis group that is a normal pass!");
+
+  if (PassID) {
+    PassInfo *ImplementationInfo = const_cast<PassInfo*>(getPassInfo(PassID));
+    assert(ImplementationInfo &&
+           "Must register pass before adding to AnalysisGroup!");
+
+    sys::SmartScopedLock<true> Guard(*Lock);
+    
+    // Make sure we keep track of the fact that the implementation implements
+    // the interface.
+    ImplementationInfo->addInterfaceImplemented(InterfaceInfo);
+
+    PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+    PassRegistryImpl::AnalysisGroupInfo &AGI =
+      Impl->AnalysisGroupInfoMap[InterfaceInfo];
+    assert(AGI.Implementations.count(ImplementationInfo) == 0 &&
+           "Cannot add a pass to the same analysis group more than once!");
+    AGI.Implementations.insert(ImplementationInfo);
+    if (isDefault) {
+      assert(InterfaceInfo->getNormalCtor() == 0 &&
+             "Default implementation for analysis group already specified!");
+      assert(ImplementationInfo->getNormalCtor() &&
+           "Cannot specify pass as default if it does not have a default ctor");
+      InterfaceInfo->setNormalCtor(ImplementationInfo->getNormalCtor());
+    }
+  }
+  
+  PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+  if (ShouldFree) Impl->ToFree.push_back(&Registeree);
+}
+
+void PassRegistry::addRegistrationListener(PassRegistrationListener *L) {
+  sys::SmartScopedLock<true> Guard(*Lock);
+  PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+  Impl->Listeners.push_back(L);
+}
+
+void PassRegistry::removeRegistrationListener(PassRegistrationListener *L) {
+  sys::SmartScopedLock<true> Guard(*Lock);
+  
+  // NOTE: This is necessary, because removeRegistrationListener() can be called
+  // as part of the llvm_shutdown sequence.  Since we have no control over the
+  // order of that sequence, we need to gracefully handle the case where the
+  // PassRegistry is destructed before the object that triggers this call.
+  if (!pImpl) return;
+  
+  PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
+  std::vector<PassRegistrationListener*>::iterator I =
+    std::find(Impl->Listeners.begin(), Impl->Listeners.end(), L);
+  assert(I != Impl->Listeners.end() &&
+         "PassRegistrationListener not registered!");
+  Impl->Listeners.erase(I);
+}
diff --git a/contrib/llvm/lib/IR/PrintModulePass.cpp b/contrib/llvm/lib/IR/PrintModulePass.cpp
new file mode 100644
index 000000000000..5026bc2d9840
--- /dev/null
+++ b/contrib/llvm/lib/IR/PrintModulePass.cpp
@@ -0,0 +1,136 @@
+//===--- IR/PrintModulePass.cpp - Module/Function Printer -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// PrintModulePass and PrintFunctionPass implementations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+
+  class PrintModulePass : public ModulePass {
+    std::string Banner;
+    raw_ostream *Out;       // raw_ostream to print on
+    bool DeleteStream;      // Delete the ostream in our dtor?
+  public:
+    static char ID;
+    PrintModulePass() : ModulePass(ID), Out(&dbgs()), 
+      DeleteStream(false) {}
+    PrintModulePass(const std::string &B, raw_ostream *o, bool DS)
+        : ModulePass(ID), Banner(B), Out(o), DeleteStream(DS) {}
+    
+    ~PrintModulePass() {
+      if (DeleteStream) delete Out;
+    }
+    
+    bool runOnModule(Module &M) {
+      (*Out) << Banner << M;
+      return false;
+    }
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+  
+  class PrintFunctionPass : public FunctionPass {
+    std::string Banner;     // String to print before each function
+    raw_ostream *Out;       // raw_ostream to print on
+    bool DeleteStream;      // Delete the ostream in our dtor?
+  public:
+    static char ID;
+    PrintFunctionPass() : FunctionPass(ID), Banner(""), Out(&dbgs()), 
+                          DeleteStream(false) {}
+    PrintFunctionPass(const std::string &B, raw_ostream *o, bool DS)
+      : FunctionPass(ID), Banner(B), Out(o), DeleteStream(DS) {}
+    
+    ~PrintFunctionPass() {
+      if (DeleteStream) delete Out;
+    }
+    
+    // runOnFunction - This pass just prints a banner followed by the
+    // function as it's processed.
+    //
+    bool runOnFunction(Function &F) {
+      (*Out) << Banner << static_cast<Value&>(F);
+      return false;
+    }
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+  
+  class PrintBasicBlockPass : public BasicBlockPass {
+    std::string Banner;
+    raw_ostream *Out;       // raw_ostream to print on
+    bool DeleteStream;      // Delete the ostream in our dtor?
+  public:
+    static char ID;
+    PrintBasicBlockPass() : BasicBlockPass(ID), Out(&dbgs()), 
+      DeleteStream(false) {}
+    PrintBasicBlockPass(const std::string &B, raw_ostream *o, bool DS)
+        : BasicBlockPass(ID), Banner(B), Out(o), DeleteStream(DS) {}
+    
+    ~PrintBasicBlockPass() {
+      if (DeleteStream) delete Out;
+    }
+    
+    bool runOnBasicBlock(BasicBlock &BB) {
+      (*Out) << Banner << BB;
+      return false;
+    }
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+}
+
+char PrintModulePass::ID = 0;
+INITIALIZE_PASS(PrintModulePass, "print-module",
+                "Print module to stderr", false, false)
+char PrintFunctionPass::ID = 0;
+INITIALIZE_PASS(PrintFunctionPass, "print-function",
+                "Print function to stderr", false, false)
+char PrintBasicBlockPass::ID = 0;
+INITIALIZE_PASS(PrintBasicBlockPass, "print-bb",
+                "Print BB to stderr", false, false)
+
+/// createPrintModulePass - Create and return a pass that writes the
+/// module to the specified raw_ostream.
+ModulePass *llvm::createPrintModulePass(llvm::raw_ostream *OS, 
+                                        bool DeleteStream,
+                                        const std::string &Banner) {
+  return new PrintModulePass(Banner, OS, DeleteStream);
+}
+
+/// createPrintFunctionPass - Create and return a pass that prints
+/// functions to the specified raw_ostream as they are processed.
+FunctionPass *llvm::createPrintFunctionPass(const std::string &Banner,
+                                            llvm::raw_ostream *OS, 
+                                            bool DeleteStream) {
+  return new PrintFunctionPass(Banner, OS, DeleteStream);
+}
+
+/// createPrintBasicBlockPass - Create and return a pass that writes the
+/// BB to the specified raw_ostream.
+BasicBlockPass *llvm::createPrintBasicBlockPass(llvm::raw_ostream *OS,
+                                        bool DeleteStream,
+                                        const std::string &Banner) {
+  return new PrintBasicBlockPass(Banner, OS, DeleteStream);
+}
+
diff --git a/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h b/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h
new file mode 100644
index 000000000000..5a383eee56c5
--- /dev/null
+++ b/contrib/llvm/lib/IR/SymbolTableListTraitsImpl.h
@@ -0,0 +1,118 @@
+//===-- llvm/SymbolTableListTraitsImpl.h - Implementation ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the stickier parts of the SymbolTableListTraits class,
+// and is explicitly instantiated where needed to avoid defining all this code
+// in a widely used header.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYMBOLTABLELISTTRAITS_IMPL_H
+#define LLVM_SYMBOLTABLELISTTRAITS_IMPL_H
+
+#include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/ValueSymbolTable.h"
+
+namespace llvm {
+
+/// setSymTabObject - This is called when (f.e.) the parent of a basic block
+/// changes.  This requires us to remove all the instruction symtab entries from
+/// the current function and reinsert them into the new function.
+template<typename ValueSubClass, typename ItemParentClass>
+template<typename TPtr>
+void SymbolTableListTraits<ValueSubClass,ItemParentClass>
+::setSymTabObject(TPtr *Dest, TPtr Src) {
+  // Get the old symtab and value list before doing the assignment.
+  ValueSymbolTable *OldST = TraitsClass::getSymTab(getListOwner());
+
+  // Do it.
+  *Dest = Src;
+  
+  // Get the new SymTab object.
+  ValueSymbolTable *NewST = TraitsClass::getSymTab(getListOwner());
+  
+  // If there is nothing to do, quick exit.
+  if (OldST == NewST) return;
+  
+  // Move all the elements from the old symtab to the new one.
+  iplist<ValueSubClass> &ItemList = TraitsClass::getList(getListOwner());
+  if (ItemList.empty()) return;
+  
+  if (OldST) {
+    // Remove all entries from the previous symtab.
+    for (typename iplist<ValueSubClass>::iterator I = ItemList.begin();
+         I != ItemList.end(); ++I)
+      if (I->hasName())
+        OldST->removeValueName(I->getValueName());
+  }
+
+  if (NewST) {
+    // Add all of the items to the new symtab.
+    for (typename iplist<ValueSubClass>::iterator I = ItemList.begin();
+         I != ItemList.end(); ++I)
+      if (I->hasName())
+        NewST->reinsertValue(I);
+  }
+  
+}
+
+template<typename ValueSubClass, typename ItemParentClass>
+void SymbolTableListTraits<ValueSubClass,ItemParentClass>
+::addNodeToList(ValueSubClass *V) {
+  assert(V->getParent() == 0 && "Value already in a container!!");
+  ItemParentClass *Owner = getListOwner();
+  V->setParent(Owner);
+  if (V->hasName())
+    if (ValueSymbolTable *ST = TraitsClass::getSymTab(Owner))
+      ST->reinsertValue(V);
+}
+
+template<typename ValueSubClass, typename ItemParentClass>
+void SymbolTableListTraits<ValueSubClass,ItemParentClass>
+::removeNodeFromList(ValueSubClass *V) {
+  V->setParent(0);
+  if (V->hasName())
+    if (ValueSymbolTable *ST = TraitsClass::getSymTab(getListOwner()))
+      ST->removeValueName(V->getValueName());
+}
+
+template<typename ValueSubClass, typename ItemParentClass>
+void SymbolTableListTraits<ValueSubClass,ItemParentClass>
+::transferNodesFromList(ilist_traits<ValueSubClass> &L2,
+                        ilist_iterator<ValueSubClass> first,
+                        ilist_iterator<ValueSubClass> last) {
+  // We only have to do work here if transferring instructions between BBs
+  ItemParentClass *NewIP = getListOwner(), *OldIP = L2.getListOwner();
+  if (NewIP == OldIP) return;  // No work to do at all...
+
+  // We only have to update symbol table entries if we are transferring the
+  // instructions to a different symtab object...
+  ValueSymbolTable *NewST = TraitsClass::getSymTab(NewIP);
+  ValueSymbolTable *OldST = TraitsClass::getSymTab(OldIP);
+  if (NewST != OldST) {
+    for (; first != last; ++first) {
+      ValueSubClass &V = *first;
+      bool HasName = V.hasName();
+      if (OldST && HasName)
+        OldST->removeValueName(V.getValueName());
+      V.setParent(NewIP);
+      if (NewST && HasName)
+        NewST->reinsertValue(&V);
+    }
+  } else {
+    // Just transferring between blocks in the same function, simply update the
+    // parent fields in the instructions...
+    for (; first != last; ++first)
+      first->setParent(NewIP);
+  }
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/IR/Type.cpp b/contrib/llvm/lib/IR/Type.cpp
new file mode 100644
index 000000000000..1e6a51ab108c
--- /dev/null
+++ b/contrib/llvm/lib/IR/Type.cpp
@@ -0,0 +1,767 @@
+//===-- Type.cpp - Implement the Type class -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Type class for the IR library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Type.h"
+#include "LLVMContextImpl.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Module.h"
+#include <algorithm>
+#include <cstdarg>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                         Type Class Implementation
+//===----------------------------------------------------------------------===//
+
+Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
+  switch (IDNumber) {
+  case VoidTyID      : return getVoidTy(C);
+  case HalfTyID      : return getHalfTy(C);
+  case FloatTyID     : return getFloatTy(C);
+  case DoubleTyID    : return getDoubleTy(C);
+  case X86_FP80TyID  : return getX86_FP80Ty(C);
+  case FP128TyID     : return getFP128Ty(C);
+  case PPC_FP128TyID : return getPPC_FP128Ty(C);
+  case LabelTyID     : return getLabelTy(C);
+  case MetadataTyID  : return getMetadataTy(C);
+  case X86_MMXTyID   : return getX86_MMXTy(C);
+  default:
+    return 0;
+  }
+}
+
+/// getScalarType - If this is a vector type, return the element type,
+/// otherwise return this.
+Type *Type::getScalarType() {
+  if (VectorType *VTy = dyn_cast<VectorType>(this))
+    return VTy->getElementType();
+  return this;
+}
+
+const Type *Type::getScalarType() const {
+  if (const VectorType *VTy = dyn_cast<VectorType>(this))
+    return VTy->getElementType();
+  return this;
+}
+
+/// isIntegerTy - Return true if this is an IntegerType of the specified width.
+bool Type::isIntegerTy(unsigned Bitwidth) const {
+  return isIntegerTy() && cast<IntegerType>(this)->getBitWidth() == Bitwidth;
+}
+
+// canLosslesslyBitCastTo - Return true if this type can be converted to
+// 'Ty' without any reinterpretation of bits.  For example, i8* to i32*.
+//
+bool Type::canLosslesslyBitCastTo(Type *Ty) const {
+  // Identity cast means no change so return true
+  if (this == Ty) 
+    return true;
+  
+  // They are not convertible unless they are at least first class types
+  if (!this->isFirstClassType() || !Ty->isFirstClassType())
+    return false;
+
+  // Vector -> Vector conversions are always lossless if the two vector types
+  // have the same size, otherwise not.  Also, 64-bit vector types can be
+  // converted to x86mmx.
+  if (const VectorType *thisPTy = dyn_cast<VectorType>(this)) {
+    if (const VectorType *thatPTy = dyn_cast<VectorType>(Ty))
+      return thisPTy->getBitWidth() == thatPTy->getBitWidth();
+    if (Ty->getTypeID() == Type::X86_MMXTyID &&
+        thisPTy->getBitWidth() == 64)
+      return true;
+  }
+
+  if (this->getTypeID() == Type::X86_MMXTyID)
+    if (const VectorType *thatPTy = dyn_cast<VectorType>(Ty))
+      if (thatPTy->getBitWidth() == 64)
+        return true;
+
+  // At this point we have only various mismatches of the first class types
+  // remaining and ptr->ptr. Just select the lossless conversions. Everything
+  // else is not lossless.
+  if (this->isPointerTy())
+    return Ty->isPointerTy();
+  return false;  // Other types have no identity values
+}
+
+bool Type::isEmptyTy() const {
+  const ArrayType *ATy = dyn_cast<ArrayType>(this);
+  if (ATy) {
+    unsigned NumElements = ATy->getNumElements();
+    return NumElements == 0 || ATy->getElementType()->isEmptyTy();
+  }
+
+  const StructType *STy = dyn_cast<StructType>(this);
+  if (STy) {
+    unsigned NumElements = STy->getNumElements();
+    for (unsigned i = 0; i < NumElements; ++i)
+      if (!STy->getElementType(i)->isEmptyTy())
+        return false;
+    return true;
+  }
+
+  return false;
+}
+
+unsigned Type::getPrimitiveSizeInBits() const {
+  switch (getTypeID()) {
+  case Type::HalfTyID: return 16;
+  case Type::FloatTyID: return 32;
+  case Type::DoubleTyID: return 64;
+  case Type::X86_FP80TyID: return 80;
+  case Type::FP128TyID: return 128;
+  case Type::PPC_FP128TyID: return 128;
+  case Type::X86_MMXTyID: return 64;
+  case Type::IntegerTyID: return cast<IntegerType>(this)->getBitWidth();
+  case Type::VectorTyID:  return cast<VectorType>(this)->getBitWidth();
+  default: return 0;
+  }
+}
+
+/// getScalarSizeInBits - If this is a vector type, return the
+/// getPrimitiveSizeInBits value for the element type. Otherwise return the
+/// getPrimitiveSizeInBits value for this type.
+unsigned Type::getScalarSizeInBits() {
+  return getScalarType()->getPrimitiveSizeInBits();
+}
+
+/// getFPMantissaWidth - Return the width of the mantissa of this type.  This
+/// is only valid on floating point types.  If the FP type does not
+/// have a stable mantissa (e.g. ppc long double), this method returns -1.
+int Type::getFPMantissaWidth() const {
+  if (const VectorType *VTy = dyn_cast<VectorType>(this))
+    return VTy->getElementType()->getFPMantissaWidth();
+  assert(isFloatingPointTy() && "Not a floating point type!");
+  if (getTypeID() == HalfTyID) return 11;
+  if (getTypeID() == FloatTyID) return 24;
+  if (getTypeID() == DoubleTyID) return 53;
+  if (getTypeID() == X86_FP80TyID) return 64;
+  if (getTypeID() == FP128TyID) return 113;
+  assert(getTypeID() == PPC_FP128TyID && "unknown fp type");
+  return -1;
+}
+
+/// isSizedDerivedType - Derived types like structures and arrays are sized
+/// iff all of the members of the type are sized as well.  Since asking for
+/// their size is relatively uncommon, move this operation out of line.
+bool Type::isSizedDerivedType() const {
+  if (this->isIntegerTy())
+    return true;
+
+  if (const ArrayType *ATy = dyn_cast<ArrayType>(this))
+    return ATy->getElementType()->isSized();
+
+  if (const VectorType *VTy = dyn_cast<VectorType>(this))
+    return VTy->getElementType()->isSized();
+
+  if (!this->isStructTy()) 
+    return false;
+
+  return cast<StructType>(this)->isSized();
+}
+
+//===----------------------------------------------------------------------===//
+//                         Subclass Helper Methods
+//===----------------------------------------------------------------------===//
+
+unsigned Type::getIntegerBitWidth() const {
+  return cast<IntegerType>(this)->getBitWidth();
+}
+
+bool Type::isFunctionVarArg() const {
+  return cast<FunctionType>(this)->isVarArg();
+}
+
+Type *Type::getFunctionParamType(unsigned i) const {
+  return cast<FunctionType>(this)->getParamType(i);
+}
+
+unsigned Type::getFunctionNumParams() const {
+  return cast<FunctionType>(this)->getNumParams();
+}
+
+StringRef Type::getStructName() const {
+  return cast<StructType>(this)->getName();
+}
+
+unsigned Type::getStructNumElements() const {
+  return cast<StructType>(this)->getNumElements();
+}
+
+Type *Type::getStructElementType(unsigned N) const {
+  return cast<StructType>(this)->getElementType(N);
+}
+
+Type *Type::getSequentialElementType() const {
+  return cast<SequentialType>(this)->getElementType();
+}
+
+uint64_t Type::getArrayNumElements() const {
+  return cast<ArrayType>(this)->getNumElements();
+}
+
+unsigned Type::getVectorNumElements() const {
+  return cast<VectorType>(this)->getNumElements();
+}
+
+unsigned Type::getPointerAddressSpace() const {
+  return cast<PointerType>(getScalarType())->getAddressSpace();
+}
+
+
+//===----------------------------------------------------------------------===//
+//                          Primitive 'Type' data
+//===----------------------------------------------------------------------===//
+
+Type *Type::getVoidTy(LLVMContext &C) { return &C.pImpl->VoidTy; }
+Type *Type::getLabelTy(LLVMContext &C) { return &C.pImpl->LabelTy; }
+Type *Type::getHalfTy(LLVMContext &C) { return &C.pImpl->HalfTy; }
+Type *Type::getFloatTy(LLVMContext &C) { return &C.pImpl->FloatTy; }
+Type *Type::getDoubleTy(LLVMContext &C) { return &C.pImpl->DoubleTy; }
+Type *Type::getMetadataTy(LLVMContext &C) { return &C.pImpl->MetadataTy; }
+Type *Type::getX86_FP80Ty(LLVMContext &C) { return &C.pImpl->X86_FP80Ty; }
+Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; }
+Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; }
+Type *Type::getX86_MMXTy(LLVMContext &C) { return &C.pImpl->X86_MMXTy; }
+
+IntegerType *Type::getInt1Ty(LLVMContext &C) { return &C.pImpl->Int1Ty; }
+IntegerType *Type::getInt8Ty(LLVMContext &C) { return &C.pImpl->Int8Ty; }
+IntegerType *Type::getInt16Ty(LLVMContext &C) { return &C.pImpl->Int16Ty; }
+IntegerType *Type::getInt32Ty(LLVMContext &C) { return &C.pImpl->Int32Ty; }
+IntegerType *Type::getInt64Ty(LLVMContext &C) { return &C.pImpl->Int64Ty; }
+
+IntegerType *Type::getIntNTy(LLVMContext &C, unsigned N) {
+  return IntegerType::get(C, N);
+}
+
+PointerType *Type::getHalfPtrTy(LLVMContext &C, unsigned AS) {
+  return getHalfTy(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getFloatPtrTy(LLVMContext &C, unsigned AS) {
+  return getFloatTy(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getDoublePtrTy(LLVMContext &C, unsigned AS) {
+  return getDoubleTy(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getX86_FP80PtrTy(LLVMContext &C, unsigned AS) {
+  return getX86_FP80Ty(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getFP128PtrTy(LLVMContext &C, unsigned AS) {
+  return getFP128Ty(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getPPC_FP128PtrTy(LLVMContext &C, unsigned AS) {
+  return getPPC_FP128Ty(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getX86_MMXPtrTy(LLVMContext &C, unsigned AS) {
+  return getX86_MMXTy(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getIntNPtrTy(LLVMContext &C, unsigned N, unsigned AS) {
+  return getIntNTy(C, N)->getPointerTo(AS);
+}
+
+PointerType *Type::getInt1PtrTy(LLVMContext &C, unsigned AS) {
+  return getInt1Ty(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getInt8PtrTy(LLVMContext &C, unsigned AS) {
+  return getInt8Ty(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getInt16PtrTy(LLVMContext &C, unsigned AS) {
+  return getInt16Ty(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getInt32PtrTy(LLVMContext &C, unsigned AS) {
+  return getInt32Ty(C)->getPointerTo(AS);
+}
+
+PointerType *Type::getInt64PtrTy(LLVMContext &C, unsigned AS) {
+  return getInt64Ty(C)->getPointerTo(AS);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                       IntegerType Implementation
+//===----------------------------------------------------------------------===//
+
+IntegerType *IntegerType::get(LLVMContext &C, unsigned NumBits) {
+  assert(NumBits >= MIN_INT_BITS && "bitwidth too small");
+  assert(NumBits <= MAX_INT_BITS && "bitwidth too large");
+  
+  // Check for the built-in integer types
+  switch (NumBits) {
+  case  1: return cast<IntegerType>(Type::getInt1Ty(C));
+  case  8: return cast<IntegerType>(Type::getInt8Ty(C));
+  case 16: return cast<IntegerType>(Type::getInt16Ty(C));
+  case 32: return cast<IntegerType>(Type::getInt32Ty(C));
+  case 64: return cast<IntegerType>(Type::getInt64Ty(C));
+  default: 
+    break;
+  }
+  
+  IntegerType *&Entry = C.pImpl->IntegerTypes[NumBits];
+  
+  if (Entry == 0)
+    Entry = new (C.pImpl->TypeAllocator) IntegerType(C, NumBits);
+  
+  return Entry;
+}
+
+bool IntegerType::isPowerOf2ByteWidth() const {
+  unsigned BitWidth = getBitWidth();
+  return (BitWidth > 7) && isPowerOf2_32(BitWidth);
+}
+
+APInt IntegerType::getMask() const {
+  return APInt::getAllOnesValue(getBitWidth());
+}
+
+//===----------------------------------------------------------------------===//
+//                       FunctionType Implementation
+//===----------------------------------------------------------------------===//
+
+FunctionType::FunctionType(Type *Result, ArrayRef<Type*> Params,
+                           bool IsVarArgs)
+  : Type(Result->getContext(), FunctionTyID) {
+  Type **SubTys = reinterpret_cast<Type**>(this+1);
+  assert(isValidReturnType(Result) && "invalid return type for function");
+  setSubclassData(IsVarArgs);
+
+  SubTys[0] = const_cast<Type*>(Result);
+
+  for (unsigned i = 0, e = Params.size(); i != e; ++i) {
+    assert(isValidArgumentType(Params[i]) &&
+           "Not a valid type for function argument!");
+    SubTys[i+1] = Params[i];
+  }
+
+  ContainedTys = SubTys;
+  NumContainedTys = Params.size() + 1; // + 1 for result type
+}
+
+// FunctionType::get - The factory function for the FunctionType class.
+FunctionType *FunctionType::get(Type *ReturnType,
+                                ArrayRef<Type*> Params, bool isVarArg) {
+  LLVMContextImpl *pImpl = ReturnType->getContext().pImpl;
+  FunctionTypeKeyInfo::KeyTy Key(ReturnType, Params, isVarArg);
+  LLVMContextImpl::FunctionTypeMap::iterator I =
+    pImpl->FunctionTypes.find_as(Key);
+  FunctionType *FT;
+
+  if (I == pImpl->FunctionTypes.end()) {
+    FT = (FunctionType*) pImpl->TypeAllocator.
+      Allocate(sizeof(FunctionType) + sizeof(Type*) * (Params.size() + 1),
+               AlignOf<FunctionType>::Alignment);
+    new (FT) FunctionType(ReturnType, Params, isVarArg);
+    pImpl->FunctionTypes[FT] = true;
+  } else {
+    FT = I->first;
+  }
+
+  return FT;
+}
+
+FunctionType *FunctionType::get(Type *Result, bool isVarArg) {
+  return get(Result, ArrayRef<Type *>(), isVarArg);
+}
+
+/// isValidReturnType - Return true if the specified type is valid as a return
+/// type.
+bool FunctionType::isValidReturnType(Type *RetTy) {
+  return !RetTy->isFunctionTy() && !RetTy->isLabelTy() &&
+  !RetTy->isMetadataTy();
+}
+
+/// isValidArgumentType - Return true if the specified type is valid as an
+/// argument type.
+bool FunctionType::isValidArgumentType(Type *ArgTy) {
+  return ArgTy->isFirstClassType();
+}
+
+//===----------------------------------------------------------------------===//
+//                       StructType Implementation
+//===----------------------------------------------------------------------===//
+
+// Primitive Constructors.
+
+StructType *StructType::get(LLVMContext &Context, ArrayRef<Type*> ETypes, 
+                            bool isPacked) {
+  LLVMContextImpl *pImpl = Context.pImpl;
+  AnonStructTypeKeyInfo::KeyTy Key(ETypes, isPacked);
+  LLVMContextImpl::StructTypeMap::iterator I =
+    pImpl->AnonStructTypes.find_as(Key);
+  StructType *ST;
+
+  if (I == pImpl->AnonStructTypes.end()) {
+    // Value not found.  Create a new type!
+    ST = new (Context.pImpl->TypeAllocator) StructType(Context);
+    ST->setSubclassData(SCDB_IsLiteral);  // Literal struct.
+    ST->setBody(ETypes, isPacked);
+    Context.pImpl->AnonStructTypes[ST] = true;
+  } else {
+    ST = I->first;
+  }
+
+  return ST;
+}
+
+void StructType::setBody(ArrayRef<Type*> Elements, bool isPacked) {
+  assert(isOpaque() && "Struct body already set!");
+  
+  setSubclassData(getSubclassData() | SCDB_HasBody);
+  if (isPacked)
+    setSubclassData(getSubclassData() | SCDB_Packed);
+
+  unsigned NumElements = Elements.size();
+  Type **Elts = getContext().pImpl->TypeAllocator.Allocate<Type*>(NumElements);
+  memcpy(Elts, Elements.data(), sizeof(Elements[0]) * NumElements);
+  
+  ContainedTys = Elts;
+  NumContainedTys = NumElements;
+}
+
+void StructType::setName(StringRef Name) {
+  if (Name == getName()) return;
+
+  StringMap<StructType *> &SymbolTable = getContext().pImpl->NamedStructTypes;
+  typedef StringMap<StructType *>::MapEntryTy EntryTy;
+
+  // If this struct already had a name, remove its symbol table entry. Don't
+  // delete the data yet because it may be part of the new name.
+  if (SymbolTableEntry)
+    SymbolTable.remove((EntryTy *)SymbolTableEntry);
+
+  // If this is just removing the name, we're done.
+  if (Name.empty()) {
+    if (SymbolTableEntry) {
+      // Delete the old string data.
+      ((EntryTy *)SymbolTableEntry)->Destroy(SymbolTable.getAllocator());
+      SymbolTableEntry = 0;
+    }
+    return;
+  }
+  
+  // Look up the entry for the name.
+  EntryTy *Entry = &getContext().pImpl->NamedStructTypes.GetOrCreateValue(Name);
+  
+  // While we have a name collision, try a random rename.
+  if (Entry->getValue()) {
+    SmallString<64> TempStr(Name);
+    TempStr.push_back('.');
+    raw_svector_ostream TmpStream(TempStr);
+    unsigned NameSize = Name.size();
+   
+    do {
+      TempStr.resize(NameSize + 1);
+      TmpStream.resync();
+      TmpStream << getContext().pImpl->NamedStructTypesUniqueID++;
+      
+      Entry = &getContext().pImpl->
+                 NamedStructTypes.GetOrCreateValue(TmpStream.str());
+    } while (Entry->getValue());
+  }
+
+  // Okay, we found an entry that isn't used.  It's us!
+  Entry->setValue(this);
+
+  // Delete the old string data.
+  if (SymbolTableEntry)
+    ((EntryTy *)SymbolTableEntry)->Destroy(SymbolTable.getAllocator());
+  SymbolTableEntry = Entry;
+}
+
+//===----------------------------------------------------------------------===//
+// StructType Helper functions.
+
+StructType *StructType::create(LLVMContext &Context, StringRef Name) {
+  StructType *ST = new (Context.pImpl->TypeAllocator) StructType(Context);
+  if (!Name.empty())
+    ST->setName(Name);
+  return ST;
+}
+
+StructType *StructType::get(LLVMContext &Context, bool isPacked) {
+  return get(Context, llvm::ArrayRef<Type*>(), isPacked);
+}
+
+StructType *StructType::get(Type *type, ...) {
+  assert(type != 0 && "Cannot create a struct type with no elements with this");
+  LLVMContext &Ctx = type->getContext();
+  va_list ap;
+  SmallVector<llvm::Type*, 8> StructFields;
+  va_start(ap, type);
+  while (type) {
+    StructFields.push_back(type);
+    type = va_arg(ap, llvm::Type*);
+  }
+  return llvm::StructType::get(Ctx, StructFields);
+}
+
+StructType *StructType::create(LLVMContext &Context, ArrayRef<Type*> Elements,
+                               StringRef Name, bool isPacked) {
+  StructType *ST = create(Context, Name);
+  ST->setBody(Elements, isPacked);
+  return ST;
+}
+
+StructType *StructType::create(LLVMContext &Context, ArrayRef<Type*> Elements) {
+  return create(Context, Elements, StringRef());
+}
+
+StructType *StructType::create(LLVMContext &Context) {
+  return create(Context, StringRef());
+}
+
+StructType *StructType::create(ArrayRef<Type*> Elements, StringRef Name,
+                               bool isPacked) {
+  assert(!Elements.empty() &&
+         "This method may not be invoked with an empty list");
+  return create(Elements[0]->getContext(), Elements, Name, isPacked);
+}
+
+StructType *StructType::create(ArrayRef<Type*> Elements) {
+  assert(!Elements.empty() &&
+         "This method may not be invoked with an empty list");
+  return create(Elements[0]->getContext(), Elements, StringRef());
+}
+
+StructType *StructType::create(StringRef Name, Type *type, ...) {
+  assert(type != 0 && "Cannot create a struct type with no elements with this");
+  LLVMContext &Ctx = type->getContext();
+  va_list ap;
+  SmallVector<llvm::Type*, 8> StructFields;
+  va_start(ap, type);
+  while (type) {
+    StructFields.push_back(type);
+    type = va_arg(ap, llvm::Type*);
+  }
+  return llvm::StructType::create(Ctx, StructFields, Name);
+}
+
+bool StructType::isSized() const {
+  if ((getSubclassData() & SCDB_IsSized) != 0)
+    return true;
+  if (isOpaque())
+    return false;
+
+  // Okay, our struct is sized if all of the elements are, but if one of the
+  // elements is opaque, the struct isn't sized *yet*, but may become sized in
+  // the future, so just bail out without caching.
+  for (element_iterator I = element_begin(), E = element_end(); I != E; ++I)
+    if (!(*I)->isSized())
+      return false;
+
+  // Here we cheat a bit and cast away const-ness. The goal is to memoize when
+  // we find a sized type, as types can only move from opaque to sized, not the
+  // other way.
+  const_cast<StructType*>(this)->setSubclassData(
+    getSubclassData() | SCDB_IsSized);
+  return true;
+}
+
+StringRef StructType::getName() const {
+  assert(!isLiteral() && "Literal structs never have names");
+  if (SymbolTableEntry == 0) return StringRef();
+  
+  return ((StringMapEntry<StructType*> *)SymbolTableEntry)->getKey();
+}
+
+void StructType::setBody(Type *type, ...) {
+  assert(type != 0 && "Cannot create a struct type with no elements with this");
+  va_list ap;
+  SmallVector<llvm::Type*, 8> StructFields;
+  va_start(ap, type);
+  while (type) {
+    StructFields.push_back(type);
+    type = va_arg(ap, llvm::Type*);
+  }
+  setBody(StructFields);
+}
+
+bool StructType::isValidElementType(Type *ElemTy) {
+  return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
+         !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy();
+}
+
+/// isLayoutIdentical - Return true if this is layout identical to the
+/// specified struct.
+bool StructType::isLayoutIdentical(StructType *Other) const {
+  if (this == Other) return true;
+  
+  if (isPacked() != Other->isPacked() ||
+      getNumElements() != Other->getNumElements())
+    return false;
+  
+  return std::equal(element_begin(), element_end(), Other->element_begin());
+}
+
+/// getTypeByName - Return the type with the specified name, or null if there
+/// is none by that name.
+StructType *Module::getTypeByName(StringRef Name) const {
+  StringMap<StructType*>::iterator I =
+    getContext().pImpl->NamedStructTypes.find(Name);
+  if (I != getContext().pImpl->NamedStructTypes.end())
+    return I->second;
+  return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                       CompositeType Implementation
+//===----------------------------------------------------------------------===//
+
+Type *CompositeType::getTypeAtIndex(const Value *V) {
+  if (StructType *STy = dyn_cast<StructType>(this)) {
+    unsigned Idx =
+      (unsigned)cast<Constant>(V)->getUniqueInteger().getZExtValue();
+    assert(indexValid(Idx) && "Invalid structure index!");
+    return STy->getElementType(Idx);
+  }
+
+  return cast<SequentialType>(this)->getElementType();
+}
+Type *CompositeType::getTypeAtIndex(unsigned Idx) {
+  if (StructType *STy = dyn_cast<StructType>(this)) {
+    assert(indexValid(Idx) && "Invalid structure index!");
+    return STy->getElementType(Idx);
+  }
+  
+  return cast<SequentialType>(this)->getElementType();
+}
+bool CompositeType::indexValid(const Value *V) const {
+  if (const StructType *STy = dyn_cast<StructType>(this)) {
+    // Structure indexes require (vectors of) 32-bit integer constants.  In the
+    // vector case all of the indices must be equal.
+    if (!V->getType()->getScalarType()->isIntegerTy(32))
+      return false;
+    const Constant *C = dyn_cast<Constant>(V);
+    if (C && V->getType()->isVectorTy())
+      C = C->getSplatValue();
+    const ConstantInt *CU = dyn_cast_or_null<ConstantInt>(C);
+    return CU && CU->getZExtValue() < STy->getNumElements();
+  }
+
+  // Sequential types can be indexed by any integer.
+  return V->getType()->isIntOrIntVectorTy();
+}
+
+bool CompositeType::indexValid(unsigned Idx) const {
+  if (const StructType *STy = dyn_cast<StructType>(this))
+    return Idx < STy->getNumElements();
+  // Sequential types can be indexed by any integer.
+  return true;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                           ArrayType Implementation
+//===----------------------------------------------------------------------===//
+
+ArrayType::ArrayType(Type *ElType, uint64_t NumEl)
+  : SequentialType(ArrayTyID, ElType) {
+  NumElements = NumEl;
+}
+
+ArrayType *ArrayType::get(Type *elementType, uint64_t NumElements) {
+  Type *ElementType = const_cast<Type*>(elementType);
+  assert(isValidElementType(ElementType) && "Invalid type for array element!");
+    
+  LLVMContextImpl *pImpl = ElementType->getContext().pImpl;
+  ArrayType *&Entry = 
+    pImpl->ArrayTypes[std::make_pair(ElementType, NumElements)];
+  
+  if (Entry == 0)
+    Entry = new (pImpl->TypeAllocator) ArrayType(ElementType, NumElements);
+  return Entry;
+}
+
+bool ArrayType::isValidElementType(Type *ElemTy) {
+  return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
+         !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy();
+}
+
+//===----------------------------------------------------------------------===//
+//                          VectorType Implementation
+//===----------------------------------------------------------------------===//
+
+VectorType::VectorType(Type *ElType, unsigned NumEl)
+  : SequentialType(VectorTyID, ElType) {
+  NumElements = NumEl;
+}
+
+VectorType *VectorType::get(Type *elementType, unsigned NumElements) {
+  Type *ElementType = const_cast<Type*>(elementType);
+  assert(NumElements > 0 && "#Elements of a VectorType must be greater than 0");
+  assert(isValidElementType(ElementType) &&
+         "Elements of a VectorType must be a primitive type");
+  
+  LLVMContextImpl *pImpl = ElementType->getContext().pImpl;
+  VectorType *&Entry = ElementType->getContext().pImpl
+    ->VectorTypes[std::make_pair(ElementType, NumElements)];
+  
+  if (Entry == 0)
+    Entry = new (pImpl->TypeAllocator) VectorType(ElementType, NumElements);
+  return Entry;
+}
+
+bool VectorType::isValidElementType(Type *ElemTy) {
+  return ElemTy->isIntegerTy() || ElemTy->isFloatingPointTy() ||
+    ElemTy->isPointerTy();
+}
+
+//===----------------------------------------------------------------------===//
+//                         PointerType Implementation
+//===----------------------------------------------------------------------===//
+
+PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) {
+  assert(EltTy && "Can't get a pointer to <null> type!");
+  assert(isValidElementType(EltTy) && "Invalid type for pointer element!");
+  
+  LLVMContextImpl *CImpl = EltTy->getContext().pImpl;
+  
+  // Since AddressSpace #0 is the common case, we special case it.
+  PointerType *&Entry = AddressSpace == 0 ? CImpl->PointerTypes[EltTy]
+     : CImpl->ASPointerTypes[std::make_pair(EltTy, AddressSpace)];
+
+  if (Entry == 0)
+    Entry = new (CImpl->TypeAllocator) PointerType(EltTy, AddressSpace);
+  return Entry;
+}
+
+
+PointerType::PointerType(Type *E, unsigned AddrSpace)
+  : SequentialType(PointerTyID, E) {
+#ifndef NDEBUG
+  const unsigned oldNCT = NumContainedTys;
+#endif
+  setSubclassData(AddrSpace);
+  // Check for miscompile. PR11652.
+  assert(oldNCT == NumContainedTys && "bitfield written out of bounds?");
+}
+
+PointerType *Type::getPointerTo(unsigned addrs) {
+  return PointerType::get(this, addrs);
+}
+
+bool PointerType::isValidElementType(Type *ElemTy) {
+  return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
+         !ElemTy->isMetadataTy();
+}
diff --git a/contrib/llvm/lib/IR/TypeFinder.cpp b/contrib/llvm/lib/IR/TypeFinder.cpp
new file mode 100644
index 000000000000..d5e620350705
--- /dev/null
+++ b/contrib/llvm/lib/IR/TypeFinder.cpp
@@ -0,0 +1,148 @@
+//===-- TypeFinder.cpp - Implement the TypeFinder class -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TypeFinder class for the IR library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+using namespace llvm;
+
+void TypeFinder::run(const Module &M, bool onlyNamed) {
+  OnlyNamed = onlyNamed;
+
+  // Get types from global variables.
+  for (Module::const_global_iterator I = M.global_begin(),
+         E = M.global_end(); I != E; ++I) {
+    incorporateType(I->getType());
+    if (I->hasInitializer())
+      incorporateValue(I->getInitializer());
+  }
+
+  // Get types from aliases.
+  for (Module::const_alias_iterator I = M.alias_begin(),
+         E = M.alias_end(); I != E; ++I) {
+    incorporateType(I->getType());
+    if (const Value *Aliasee = I->getAliasee())
+      incorporateValue(Aliasee);
+  }
+
+  // Get types from functions.
+  SmallVector<std::pair<unsigned, MDNode*>, 4> MDForInst;
+  for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
+    incorporateType(FI->getType());
+
+    // First incorporate the arguments.
+    for (Function::const_arg_iterator AI = FI->arg_begin(),
+           AE = FI->arg_end(); AI != AE; ++AI)
+      incorporateValue(AI);
+
+    for (Function::const_iterator BB = FI->begin(), E = FI->end();
+         BB != E;++BB)
+      for (BasicBlock::const_iterator II = BB->begin(),
+             E = BB->end(); II != E; ++II) {
+        const Instruction &I = *II;
+
+        // Incorporate the type of the instruction.
+        incorporateType(I.getType());
+
+        // Incorporate non-instruction operand types. (We are incorporating all
+        // instructions with this loop.)
+        for (User::const_op_iterator OI = I.op_begin(), OE = I.op_end();
+             OI != OE; ++OI)
+          if (!isa<Instruction>(OI))
+            incorporateValue(*OI);
+
+        // Incorporate types hiding in metadata.
+        I.getAllMetadataOtherThanDebugLoc(MDForInst);
+        for (unsigned i = 0, e = MDForInst.size(); i != e; ++i)
+          incorporateMDNode(MDForInst[i].second);
+
+        MDForInst.clear();
+      }
+  }
+
+  for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
+         E = M.named_metadata_end(); I != E; ++I) {
+    const NamedMDNode *NMD = I;
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
+      incorporateMDNode(NMD->getOperand(i));
+  }
+}
+
+void TypeFinder::clear() {
+  VisitedConstants.clear();
+  VisitedTypes.clear();
+  StructTypes.clear();
+}
+
+/// incorporateType - This method adds the type to the list of used structures
+/// if it's not in there already.
+void TypeFinder::incorporateType(Type *Ty) {
+  // Check to see if we're already visited this type.
+  if (!VisitedTypes.insert(Ty).second)
+    return;
+
+  // If this is a structure or opaque type, add a name for the type.
+  if (StructType *STy = dyn_cast<StructType>(Ty))
+    if (!OnlyNamed || STy->hasName())
+      StructTypes.push_back(STy);
+
+  // Recursively walk all contained types.
+  for (Type::subtype_iterator I = Ty->subtype_begin(),
+         E = Ty->subtype_end(); I != E; ++I)
+    incorporateType(*I);
+}
+
+/// incorporateValue - This method is used to walk operand lists finding types
+/// hiding in constant expressions and other operands that won't be walked in
+/// other ways.  GlobalValues, basic blocks, instructions, and inst operands are
+/// all explicitly enumerated.
+void TypeFinder::incorporateValue(const Value *V) {
+  if (const MDNode *M = dyn_cast<MDNode>(V))
+    return incorporateMDNode(M);
+
+  if (!isa<Constant>(V) || isa<GlobalValue>(V)) return;
+
+  // Already visited?
+  if (!VisitedConstants.insert(V).second)
+    return;
+
+  // Check this type.
+  incorporateType(V->getType());
+
+  // If this is an instruction, we incorporate it separately.
+  if (isa<Instruction>(V))
+    return;
+
+  // Look in operands for types.
+  const User *U = cast<User>(V);
+  for (Constant::const_op_iterator I = U->op_begin(),
+         E = U->op_end(); I != E;++I)
+    incorporateValue(*I);
+}
+
+/// incorporateMDNode - This method is used to walk the operands of an MDNode to
+/// find types hiding within.
+void TypeFinder::incorporateMDNode(const MDNode *V) {
+  // Already visited?
+  if (!VisitedConstants.insert(V).second)
+    return;
+
+  // Look in operands for types.
+  for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i)
+    if (Value *Op = V->getOperand(i))
+      incorporateValue(Op);
+}
diff --git a/contrib/llvm/lib/IR/Use.cpp b/contrib/llvm/lib/IR/Use.cpp
new file mode 100644
index 000000000000..1d343e803094
--- /dev/null
+++ b/contrib/llvm/lib/IR/Use.cpp
@@ -0,0 +1,145 @@
+//===-- Use.cpp - Implement the Use class ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the algorithm for finding the User of a Use.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Value.h"
+#include <new>
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+//                         Use swap Implementation
+//===----------------------------------------------------------------------===//
+
+void Use::swap(Use &RHS) {
+  Value *V1(Val);
+  Value *V2(RHS.Val);
+  if (V1 != V2) {
+    if (V1) {
+      removeFromList();
+    }
+
+    if (V2) {
+      RHS.removeFromList();
+      Val = V2;
+      V2->addUse(*this);
+    } else {
+      Val = 0;
+    }
+
+    if (V1) {
+      RHS.Val = V1;
+      V1->addUse(RHS);
+    } else {
+      RHS.Val = 0;
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                         Use getImpliedUser Implementation
+//===----------------------------------------------------------------------===//
+
+const Use *Use::getImpliedUser() const {
+  const Use *Current = this;
+
+  while (true) {
+    unsigned Tag = (Current++)->Prev.getInt();
+    switch (Tag) {
+      case zeroDigitTag:
+      case oneDigitTag:
+        continue;
+
+      case stopTag: {
+        ++Current;
+        ptrdiff_t Offset = 1;
+        while (true) {
+          unsigned Tag = Current->Prev.getInt();
+          switch (Tag) {
+            case zeroDigitTag:
+            case oneDigitTag:
+              ++Current;
+              Offset = (Offset << 1) + Tag;
+              continue;
+            default:
+              return Current + Offset;
+          }
+        }
+      }
+
+      case fullStopTag:
+        return Current;
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                         Use initTags Implementation
+//===----------------------------------------------------------------------===//
+
+Use *Use::initTags(Use * const Start, Use *Stop) {
+  ptrdiff_t Done = 0;
+  while (Done < 20) {
+    if (Start == Stop--)
+      return Start;
+    static const PrevPtrTag tags[20] = { fullStopTag, oneDigitTag, stopTag,
+                                         oneDigitTag, oneDigitTag, stopTag,
+                                         zeroDigitTag, oneDigitTag, oneDigitTag,
+                                         stopTag, zeroDigitTag, oneDigitTag,
+                                         zeroDigitTag, oneDigitTag, stopTag,
+                                         oneDigitTag, oneDigitTag, oneDigitTag,
+                                         oneDigitTag, stopTag
+                                       };
+    new(Stop) Use(tags[Done++]);
+  }
+
+  ptrdiff_t Count = Done;
+  while (Start != Stop) {
+    --Stop;
+    if (!Count) {
+      new(Stop) Use(stopTag);
+      ++Done;
+      Count = Done;
+    } else {
+      new(Stop) Use(PrevPtrTag(Count & 1));
+      Count >>= 1;
+      ++Done;
+    }
+  }
+
+  return Start;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Use zap Implementation
+//===----------------------------------------------------------------------===//
+
+void Use::zap(Use *Start, const Use *Stop, bool del) {
+  while (Start != Stop)
+    (--Stop)->~Use();
+  if (del)
+    ::operator delete(Start);
+}
+
+//===----------------------------------------------------------------------===//
+//                         Use getUser Implementation
+//===----------------------------------------------------------------------===//
+
+User *Use::getUser() const {
+  const Use *End = getImpliedUser();
+  const UserRef *ref = reinterpret_cast<const UserRef*>(End);
+  return ref->getInt()
+    ? ref->getPointer()
+    : reinterpret_cast<User*>(const_cast<Use*>(End));
+}
+
+} // End llvm namespace
diff --git a/contrib/llvm/lib/IR/User.cpp b/contrib/llvm/lib/IR/User.cpp
new file mode 100644
index 000000000000..940682826acc
--- /dev/null
+++ b/contrib/llvm/lib/IR/User.cpp
@@ -0,0 +1,90 @@
+//===-- User.cpp - Implement the User class -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/User.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Operator.h"
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+//                                 User Class
+//===----------------------------------------------------------------------===//
+
+void User::anchor() {}
+
+// replaceUsesOfWith - Replaces all references to the "From" definition with
+// references to the "To" definition.
+//
+void User::replaceUsesOfWith(Value *From, Value *To) {
+  if (From == To) return;   // Duh what?
+
+  assert((!isa<Constant>(this) || isa<GlobalValue>(this)) &&
+         "Cannot call User::replaceUsesOfWith on a constant!");
+
+  for (unsigned i = 0, E = getNumOperands(); i != E; ++i)
+    if (getOperand(i) == From) {  // Is This operand is pointing to oldval?
+      // The side effects of this setOperand call include linking to
+      // "To", adding "this" to the uses list of To, and
+      // most importantly, removing "this" from the use list of "From".
+      setOperand(i, To); // Fix it now...
+    }
+}
+
+//===----------------------------------------------------------------------===//
+//                         User allocHungoffUses Implementation
+//===----------------------------------------------------------------------===//
+
+Use *User::allocHungoffUses(unsigned N) const {
+  // Allocate the array of Uses, followed by a pointer (with bottom bit set) to
+  // the User.
+  size_t size = N * sizeof(Use) + sizeof(Use::UserRef);
+  Use *Begin = static_cast<Use*>(::operator new(size));
+  Use *End = Begin + N;
+  (void) new(End) Use::UserRef(const_cast<User*>(this), 1);
+  return Use::initTags(Begin, End);
+}
+
+//===----------------------------------------------------------------------===//
+//                         User operator new Implementations
+//===----------------------------------------------------------------------===//
+
+void *User::operator new(size_t s, unsigned Us) {
+  void *Storage = ::operator new(s + sizeof(Use) * Us);
+  Use *Start = static_cast<Use*>(Storage);
+  Use *End = Start + Us;
+  User *Obj = reinterpret_cast<User*>(End);
+  Obj->OperandList = Start;
+  Obj->NumOperands = Us;
+  Use::initTags(Start, End);
+  return Obj;
+}
+
+//===----------------------------------------------------------------------===//
+//                         User operator delete Implementation
+//===----------------------------------------------------------------------===//
+
+void User::operator delete(void *Usr) {
+  User *Start = static_cast<User*>(Usr);
+  Use *Storage = static_cast<Use*>(Usr) - Start->NumOperands;
+  // If there were hung-off uses, they will have been freed already and
+  // NumOperands reset to 0, so here we just free the User itself.
+  ::operator delete(Storage);
+}
+
+//===----------------------------------------------------------------------===//
+//                             Operator Class
+//===----------------------------------------------------------------------===//
+
+Operator::~Operator() {
+  llvm_unreachable("should never destroy an Operator");
+}
+
+} // End llvm namespace
diff --git a/contrib/llvm/lib/IR/Value.cpp b/contrib/llvm/lib/IR/Value.cpp
new file mode 100644
index 000000000000..adc702e05e68
--- /dev/null
+++ b/contrib/llvm/lib/IR/Value.cpp
@@ -0,0 +1,701 @@
+//===-- Value.cpp - Implement the Value class -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Value, ValueHandle, and User classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Value.h"
+#include "LLVMContextImpl.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/LeakDetector.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/ValueHandle.h"
+#include <algorithm>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                                Value Class
+//===----------------------------------------------------------------------===//
+
+static inline Type *checkType(Type *Ty) {
+  assert(Ty && "Value defined with a null type: Error!");
+  return const_cast<Type*>(Ty);
+}
+
+Value::Value(Type *ty, unsigned scid)
+  : SubclassID(scid), HasValueHandle(0),
+    SubclassOptionalData(0), SubclassData(0), VTy((Type*)checkType(ty)),
+    UseList(0), Name(0) {
+  // FIXME: Why isn't this in the subclass gunk??
+  // Note, we cannot call isa<CallInst> before the CallInst has been
+  // constructed.
+  if (SubclassID == Instruction::Call || SubclassID == Instruction::Invoke)
+    assert((VTy->isFirstClassType() || VTy->isVoidTy() || VTy->isStructTy()) &&
+           "invalid CallInst type!");
+  else if (SubclassID != BasicBlockVal &&
+           (SubclassID < ConstantFirstVal || SubclassID > ConstantLastVal))
+    assert((VTy->isFirstClassType() || VTy->isVoidTy()) &&
+           "Cannot create non-first-class values except for constants!");
+}
+
+Value::~Value() {
+  // Notify all ValueHandles (if present) that this value is going away.
+  if (HasValueHandle)
+    ValueHandleBase::ValueIsDeleted(this);
+
+#ifndef NDEBUG      // Only in -g mode...
+  // Check to make sure that there are no uses of this value that are still
+  // around when the value is destroyed.  If there are, then we have a dangling
+  // reference and something is wrong.  This code is here to print out what is
+  // still being referenced.  The value in question should be printed as
+  // a <badref>
+  //
+  if (!use_empty()) {
+    dbgs() << "While deleting: " << *VTy << " %" << getName() << "\n";
+    for (use_iterator I = use_begin(), E = use_end(); I != E; ++I)
+      dbgs() << "Use still stuck around after Def is destroyed:"
+           << **I << "\n";
+  }
+#endif
+  assert(use_empty() && "Uses remain when a value is destroyed!");
+
+  // If this value is named, destroy the name.  This should not be in a symtab
+  // at this point.
+  if (Name && SubclassID != MDStringVal)
+    Name->Destroy();
+
+  // There should be no uses of this object anymore, remove it.
+  LeakDetector::removeGarbageObject(this);
+}
+
+/// hasNUses - Return true if this Value has exactly N users.
+///
+bool Value::hasNUses(unsigned N) const {
+  const_use_iterator UI = use_begin(), E = use_end();
+
+  for (; N; --N, ++UI)
+    if (UI == E) return false;  // Too few.
+  return UI == E;
+}
+
+/// hasNUsesOrMore - Return true if this value has N users or more.  This is
+/// logically equivalent to getNumUses() >= N.
+///
+bool Value::hasNUsesOrMore(unsigned N) const {
+  const_use_iterator UI = use_begin(), E = use_end();
+
+  for (; N; --N, ++UI)
+    if (UI == E) return false;  // Too few.
+
+  return true;
+}
+
+/// isUsedInBasicBlock - Return true if this value is used in the specified
+/// basic block.
+bool Value::isUsedInBasicBlock(const BasicBlock *BB) const {
+  // Start by scanning over the instructions looking for a use before we start
+  // the expensive use iteration.
+  unsigned MaxBlockSize = 3;
+  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    if (std::find(I->op_begin(), I->op_end(), this) != I->op_end())
+      return true;
+    if (MaxBlockSize-- == 0) // If the block is larger fall back to use_iterator
+      break;
+  }
+
+  if (MaxBlockSize != 0) // We scanned the entire block and found no use.
+    return false;
+
+  for (const_use_iterator I = use_begin(), E = use_end(); I != E; ++I) {
+    const Instruction *User = dyn_cast<Instruction>(*I);
+    if (User && User->getParent() == BB)
+      return true;
+  }
+  return false;
+}
+
+
+/// getNumUses - This method computes the number of uses of this Value.  This
+/// is a linear time operation.  Use hasOneUse or hasNUses to check for specific
+/// values.
+unsigned Value::getNumUses() const {
+  return (unsigned)std::distance(use_begin(), use_end());
+}
+
+static bool getSymTab(Value *V, ValueSymbolTable *&ST) {
+  ST = 0;
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    if (BasicBlock *P = I->getParent())
+      if (Function *PP = P->getParent())
+        ST = &PP->getValueSymbolTable();
+  } else if (BasicBlock *BB = dyn_cast<BasicBlock>(V)) {
+    if (Function *P = BB->getParent())
+      ST = &P->getValueSymbolTable();
+  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    if (Module *P = GV->getParent())
+      ST = &P->getValueSymbolTable();
+  } else if (Argument *A = dyn_cast<Argument>(V)) {
+    if (Function *P = A->getParent())
+      ST = &P->getValueSymbolTable();
+  } else if (isa<MDString>(V))
+    return true;
+  else {
+    assert(isa<Constant>(V) && "Unknown value type!");
+    return true;  // no name is setable for this.
+  }
+  return false;
+}
+
+StringRef Value::getName() const {
+  // Make sure the empty string is still a C string. For historical reasons,
+  // some clients want to call .data() on the result and expect it to be null
+  // terminated.
+  if (!Name) return StringRef("", 0);
+  return Name->getKey();
+}
+
+void Value::setName(const Twine &NewName) {
+  assert(SubclassID != MDStringVal &&
+         "Cannot set the name of MDString with this method!");
+
+  // Fast path for common IRBuilder case of setName("") when there is no name.
+  if (NewName.isTriviallyEmpty() && !hasName())
+    return;
+
+  SmallString<256> NameData;
+  StringRef NameRef = NewName.toStringRef(NameData);
+
+  // Name isn't changing?
+  if (getName() == NameRef)
+    return;
+
+  assert(!getType()->isVoidTy() && "Cannot assign a name to void values!");
+
+  // Get the symbol table to update for this object.
+  ValueSymbolTable *ST;
+  if (getSymTab(this, ST))
+    return;  // Cannot set a name on this value (e.g. constant).
+
+  if (Function *F = dyn_cast<Function>(this))
+    getContext().pImpl->IntrinsicIDCache.erase(F);
+
+  if (!ST) { // No symbol table to update?  Just do the change.
+    if (NameRef.empty()) {
+      // Free the name for this value.
+      Name->Destroy();
+      Name = 0;
+      return;
+    }
+
+    if (Name)
+      Name->Destroy();
+
+    // NOTE: Could optimize for the case the name is shrinking to not deallocate
+    // then reallocated.
+
+    // Create the new name.
+    Name = ValueName::Create(NameRef.begin(), NameRef.end());
+    Name->setValue(this);
+    return;
+  }
+
+  // NOTE: Could optimize for the case the name is shrinking to not deallocate
+  // then reallocated.
+  if (hasName()) {
+    // Remove old name.
+    ST->removeValueName(Name);
+    Name->Destroy();
+    Name = 0;
+
+    if (NameRef.empty())
+      return;
+  }
+
+  // Name is changing to something new.
+  Name = ST->createValueName(NameRef, this);
+}
+
+
+/// takeName - transfer the name from V to this value, setting V's name to
+/// empty.  It is an error to call V->takeName(V).
+void Value::takeName(Value *V) {
+  assert(SubclassID != MDStringVal && "Cannot take the name of an MDString!");
+
+  ValueSymbolTable *ST = 0;
+  // If this value has a name, drop it.
+  if (hasName()) {
+    // Get the symtab this is in.
+    if (getSymTab(this, ST)) {
+      // We can't set a name on this value, but we need to clear V's name if
+      // it has one.
+      if (V->hasName()) V->setName("");
+      return;  // Cannot set a name on this value (e.g. constant).
+    }
+
+    // Remove old name.
+    if (ST)
+      ST->removeValueName(Name);
+    Name->Destroy();
+    Name = 0;
+  }
+
+  // Now we know that this has no name.
+
+  // If V has no name either, we're done.
+  if (!V->hasName()) return;
+
+  // Get this's symtab if we didn't before.
+  if (!ST) {
+    if (getSymTab(this, ST)) {
+      // Clear V's name.
+      V->setName("");
+      return;  // Cannot set a name on this value (e.g. constant).
+    }
+  }
+
+  // Get V's ST, this should always succed, because V has a name.
+  ValueSymbolTable *VST;
+  bool Failure = getSymTab(V, VST);
+  assert(!Failure && "V has a name, so it should have a ST!"); (void)Failure;
+
+  // If these values are both in the same symtab, we can do this very fast.
+  // This works even if both values have no symtab yet.
+  if (ST == VST) {
+    // Take the name!
+    Name = V->Name;
+    V->Name = 0;
+    Name->setValue(this);
+    return;
+  }
+
+  // Otherwise, things are slightly more complex.  Remove V's name from VST and
+  // then reinsert it into ST.
+
+  if (VST)
+    VST->removeValueName(V->Name);
+  Name = V->Name;
+  V->Name = 0;
+  Name->setValue(this);
+
+  if (ST)
+    ST->reinsertValue(this);
+}
+
+
+void Value::replaceAllUsesWith(Value *New) {
+  assert(New && "Value::replaceAllUsesWith(<null>) is invalid!");
+  assert(New != this && "this->replaceAllUsesWith(this) is NOT valid!");
+  assert(New->getType() == getType() &&
+         "replaceAllUses of value with new value of different type!");
+
+  // Notify all ValueHandles (if present) that this value is going away.
+  if (HasValueHandle)
+    ValueHandleBase::ValueIsRAUWd(this, New);
+
+  while (!use_empty()) {
+    Use &U = *UseList;
+    // Must handle Constants specially, we cannot call replaceUsesOfWith on a
+    // constant because they are uniqued.
+    if (Constant *C = dyn_cast<Constant>(U.getUser())) {
+      if (!isa<GlobalValue>(C)) {
+        C->replaceUsesOfWithOnConstant(this, New, &U);
+        continue;
+      }
+    }
+
+    U.set(New);
+  }
+
+  if (BasicBlock *BB = dyn_cast<BasicBlock>(this))
+    BB->replaceSuccessorsPhiUsesWith(cast<BasicBlock>(New));
+}
+
+namespace {
+// Various metrics for how much to strip off of pointers.
+enum PointerStripKind {
+  PSK_ZeroIndices,
+  PSK_InBoundsConstantIndices,
+  PSK_InBounds
+};
+
+template <PointerStripKind StripKind>
+static Value *stripPointerCastsAndOffsets(Value *V) {
+  if (!V->getType()->isPointerTy())
+    return V;
+
+  // Even though we don't look through PHI nodes, we could be called on an
+  // instruction in an unreachable block, which may be on a cycle.
+  SmallPtrSet<Value *, 4> Visited;
+
+  Visited.insert(V);
+  do {
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      switch (StripKind) {
+      case PSK_ZeroIndices:
+        if (!GEP->hasAllZeroIndices())
+          return V;
+        break;
+      case PSK_InBoundsConstantIndices:
+        if (!GEP->hasAllConstantIndices())
+          return V;
+        // fallthrough
+      case PSK_InBounds:
+        if (!GEP->isInBounds())
+          return V;
+        break;
+      }
+      V = GEP->getPointerOperand();
+    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+      V = cast<Operator>(V)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+      if (GA->mayBeOverridden())
+        return V;
+      V = GA->getAliasee();
+    } else {
+      return V;
+    }
+    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
+  } while (Visited.insert(V));
+
+  return V;
+}
+} // namespace
+
+Value *Value::stripPointerCasts() {
+  return stripPointerCastsAndOffsets<PSK_ZeroIndices>(this);
+}
+
+Value *Value::stripInBoundsConstantOffsets() {
+  return stripPointerCastsAndOffsets<PSK_InBoundsConstantIndices>(this);
+}
+
+Value *Value::stripInBoundsOffsets() {
+  return stripPointerCastsAndOffsets<PSK_InBounds>(this);
+}
+
+/// isDereferenceablePointer - Test if this value is always a pointer to
+/// allocated and suitably aligned memory for a simple load or store.
+static bool isDereferenceablePointer(const Value *V,
+                                     SmallPtrSet<const Value *, 32> &Visited) {
+  // Note that it is not safe to speculate into a malloc'd region because
+  // malloc may return null.
+  // It's also not always safe to follow a bitcast, for example:
+  //   bitcast i8* (alloca i8) to i32*
+  // would result in a 4-byte load from a 1-byte alloca. Some cases could
+  // be handled using DataLayout to check sizes and alignments though.
+
+  // These are obviously ok.
+  if (isa<AllocaInst>(V)) return true;
+
+  // Global variables which can't collapse to null are ok.
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return !GV->hasExternalWeakLinkage();
+
+  // byval arguments are ok.
+  if (const Argument *A = dyn_cast<Argument>(V))
+    return A->hasByValAttr();
+
+  // For GEPs, determine if the indexing lands within the allocated object.
+  if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+    // Conservatively require that the base pointer be fully dereferenceable.
+    if (!Visited.insert(GEP->getOperand(0)))
+      return false;
+    if (!isDereferenceablePointer(GEP->getOperand(0), Visited))
+      return false;
+    // Check the indices.
+    gep_type_iterator GTI = gep_type_begin(GEP);
+    for (User::const_op_iterator I = GEP->op_begin()+1,
+         E = GEP->op_end(); I != E; ++I) {
+      Value *Index = *I;
+      Type *Ty = *GTI++;
+      // Struct indices can't be out of bounds.
+      if (isa<StructType>(Ty))
+        continue;
+      ConstantInt *CI = dyn_cast<ConstantInt>(Index);
+      if (!CI)
+        return false;
+      // Zero is always ok.
+      if (CI->isZero())
+        continue;
+      // Check to see that it's within the bounds of an array.
+      ArrayType *ATy = dyn_cast<ArrayType>(Ty);
+      if (!ATy)
+        return false;
+      if (CI->getValue().getActiveBits() > 64)
+        return false;
+      if (CI->getZExtValue() >= ATy->getNumElements())
+        return false;
+    }
+    // Indices check out; this is dereferenceable.
+    return true;
+  }
+
+  // If we don't know, assume the worst.
+  return false;
+}
+
+/// isDereferenceablePointer - Test if this value is always a pointer to
+/// allocated and suitably aligned memory for a simple load or store.
+bool Value::isDereferenceablePointer() const {
+  SmallPtrSet<const Value *, 32> Visited;
+  return ::isDereferenceablePointer(this, Visited);
+}
+
+/// DoPHITranslation - If this value is a PHI node with CurBB as its parent,
+/// return the value in the PHI node corresponding to PredBB.  If not, return
+/// ourself.  This is useful if you want to know the value something has in a
+/// predecessor block.
+Value *Value::DoPHITranslation(const BasicBlock *CurBB,
+                               const BasicBlock *PredBB) {
+  PHINode *PN = dyn_cast<PHINode>(this);
+  if (PN && PN->getParent() == CurBB)
+    return PN->getIncomingValueForBlock(PredBB);
+  return this;
+}
+
+LLVMContext &Value::getContext() const { return VTy->getContext(); }
+
+//===----------------------------------------------------------------------===//
+//                             ValueHandleBase Class
+//===----------------------------------------------------------------------===//
+
+/// AddToExistingUseList - Add this ValueHandle to the use list for VP, where
+/// List is known to point into the existing use list.
+void ValueHandleBase::AddToExistingUseList(ValueHandleBase **List) {
+  assert(List && "Handle list is null?");
+
+  // Splice ourselves into the list.
+  Next = *List;
+  *List = this;
+  setPrevPtr(List);
+  if (Next) {
+    Next->setPrevPtr(&Next);
+    assert(VP.getPointer() == Next->VP.getPointer() && "Added to wrong list?");
+  }
+}
+
+void ValueHandleBase::AddToExistingUseListAfter(ValueHandleBase *List) {
+  assert(List && "Must insert after existing node");
+
+  Next = List->Next;
+  setPrevPtr(&List->Next);
+  List->Next = this;
+  if (Next)
+    Next->setPrevPtr(&Next);
+}
+
+/// AddToUseList - Add this ValueHandle to the use list for VP.
+void ValueHandleBase::AddToUseList() {
+  assert(VP.getPointer() && "Null pointer doesn't have a use list!");
+
+  LLVMContextImpl *pImpl = VP.getPointer()->getContext().pImpl;
+
+  if (VP.getPointer()->HasValueHandle) {
+    // If this value already has a ValueHandle, then it must be in the
+    // ValueHandles map already.
+    ValueHandleBase *&Entry = pImpl->ValueHandles[VP.getPointer()];
+    assert(Entry != 0 && "Value doesn't have any handles?");
+    AddToExistingUseList(&Entry);
+    return;
+  }
+
+  // Ok, it doesn't have any handles yet, so we must insert it into the
+  // DenseMap.  However, doing this insertion could cause the DenseMap to
+  // reallocate itself, which would invalidate all of the PrevP pointers that
+  // point into the old table.  Handle this by checking for reallocation and
+  // updating the stale pointers only if needed.
+  DenseMap<Value*, ValueHandleBase*> &Handles = pImpl->ValueHandles;
+  const void *OldBucketPtr = Handles.getPointerIntoBucketsArray();
+
+  ValueHandleBase *&Entry = Handles[VP.getPointer()];
+  assert(Entry == 0 && "Value really did already have handles?");
+  AddToExistingUseList(&Entry);
+  VP.getPointer()->HasValueHandle = true;
+
+  // If reallocation didn't happen or if this was the first insertion, don't
+  // walk the table.
+  if (Handles.isPointerIntoBucketsArray(OldBucketPtr) ||
+      Handles.size() == 1) {
+    return;
+  }
+
+  // Okay, reallocation did happen.  Fix the Prev Pointers.
+  for (DenseMap<Value*, ValueHandleBase*>::iterator I = Handles.begin(),
+       E = Handles.end(); I != E; ++I) {
+    assert(I->second && I->first == I->second->VP.getPointer() &&
+           "List invariant broken!");
+    I->second->setPrevPtr(&I->second);
+  }
+}
+
+/// RemoveFromUseList - Remove this ValueHandle from its current use list.
+void ValueHandleBase::RemoveFromUseList() {
+  assert(VP.getPointer() && VP.getPointer()->HasValueHandle &&
+         "Pointer doesn't have a use list!");
+
+  // Unlink this from its use list.
+  ValueHandleBase **PrevPtr = getPrevPtr();
+  assert(*PrevPtr == this && "List invariant broken");
+
+  *PrevPtr = Next;
+  if (Next) {
+    assert(Next->getPrevPtr() == &Next && "List invariant broken");
+    Next->setPrevPtr(PrevPtr);
+    return;
+  }
+
+  // If the Next pointer was null, then it is possible that this was the last
+  // ValueHandle watching VP.  If so, delete its entry from the ValueHandles
+  // map.
+  LLVMContextImpl *pImpl = VP.getPointer()->getContext().pImpl;
+  DenseMap<Value*, ValueHandleBase*> &Handles = pImpl->ValueHandles;
+  if (Handles.isPointerIntoBucketsArray(PrevPtr)) {
+    Handles.erase(VP.getPointer());
+    VP.getPointer()->HasValueHandle = false;
+  }
+}
+
+
+void ValueHandleBase::ValueIsDeleted(Value *V) {
+  assert(V->HasValueHandle && "Should only be called if ValueHandles present");
+
+  // Get the linked list base, which is guaranteed to exist since the
+  // HasValueHandle flag is set.
+  LLVMContextImpl *pImpl = V->getContext().pImpl;
+  ValueHandleBase *Entry = pImpl->ValueHandles[V];
+  assert(Entry && "Value bit set but no entries exist");
+
+  // We use a local ValueHandleBase as an iterator so that ValueHandles can add
+  // and remove themselves from the list without breaking our iteration.  This
+  // is not really an AssertingVH; we just have to give ValueHandleBase a kind.
+  // Note that we deliberately do not the support the case when dropping a value
+  // handle results in a new value handle being permanently added to the list
+  // (as might occur in theory for CallbackVH's): the new value handle will not
+  // be processed and the checking code will mete out righteous punishment if
+  // the handle is still present once we have finished processing all the other
+  // value handles (it is fine to momentarily add then remove a value handle).
+  for (ValueHandleBase Iterator(Assert, *Entry); Entry; Entry = Iterator.Next) {
+    Iterator.RemoveFromUseList();
+    Iterator.AddToExistingUseListAfter(Entry);
+    assert(Entry->Next == &Iterator && "Loop invariant broken.");
+
+    switch (Entry->getKind()) {
+    case Assert:
+      break;
+    case Tracking:
+      // Mark that this value has been deleted by setting it to an invalid Value
+      // pointer.
+      Entry->operator=(DenseMapInfo<Value *>::getTombstoneKey());
+      break;
+    case Weak:
+      // Weak just goes to null, which will unlink it from the list.
+      Entry->operator=(0);
+      break;
+    case Callback:
+      // Forward to the subclass's implementation.
+      static_cast<CallbackVH*>(Entry)->deleted();
+      break;
+    }
+  }
+
+  // All callbacks, weak references, and assertingVHs should be dropped by now.
+  if (V->HasValueHandle) {
+#ifndef NDEBUG      // Only in +Asserts mode...
+    dbgs() << "While deleting: " << *V->getType() << " %" << V->getName()
+           << "\n";
+    if (pImpl->ValueHandles[V]->getKind() == Assert)
+      llvm_unreachable("An asserting value handle still pointed to this"
+                       " value!");
+
+#endif
+    llvm_unreachable("All references to V were not removed?");
+  }
+}
+
+
+void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
+  assert(Old->HasValueHandle &&"Should only be called if ValueHandles present");
+  assert(Old != New && "Changing value into itself!");
+
+  // Get the linked list base, which is guaranteed to exist since the
+  // HasValueHandle flag is set.
+  LLVMContextImpl *pImpl = Old->getContext().pImpl;
+  ValueHandleBase *Entry = pImpl->ValueHandles[Old];
+
+  assert(Entry && "Value bit set but no entries exist");
+
+  // We use a local ValueHandleBase as an iterator so that
+  // ValueHandles can add and remove themselves from the list without
+  // breaking our iteration.  This is not really an AssertingVH; we
+  // just have to give ValueHandleBase some kind.
+  for (ValueHandleBase Iterator(Assert, *Entry); Entry; Entry = Iterator.Next) {
+    Iterator.RemoveFromUseList();
+    Iterator.AddToExistingUseListAfter(Entry);
+    assert(Entry->Next == &Iterator && "Loop invariant broken.");
+
+    switch (Entry->getKind()) {
+    case Assert:
+      // Asserting handle does not follow RAUW implicitly.
+      break;
+    case Tracking:
+      // Tracking goes to new value like a WeakVH. Note that this may make it
+      // something incompatible with its templated type. We don't want to have a
+      // virtual (or inline) interface to handle this though, so instead we make
+      // the TrackingVH accessors guarantee that a client never sees this value.
+
+      // FALLTHROUGH
+    case Weak:
+      // Weak goes to the new value, which will unlink it from Old's list.
+      Entry->operator=(New);
+      break;
+    case Callback:
+      // Forward to the subclass's implementation.
+      static_cast<CallbackVH*>(Entry)->allUsesReplacedWith(New);
+      break;
+    }
+  }
+
+#ifndef NDEBUG
+  // If any new tracking or weak value handles were added while processing the
+  // list, then complain about it now.
+  if (Old->HasValueHandle)
+    for (Entry = pImpl->ValueHandles[Old]; Entry; Entry = Entry->Next)
+      switch (Entry->getKind()) {
+      case Tracking:
+      case Weak:
+        dbgs() << "After RAUW from " << *Old->getType() << " %"
+               << Old->getName() << " to " << *New->getType() << " %"
+               << New->getName() << "\n";
+        llvm_unreachable("A tracking or weak value handle still pointed to the"
+                         " old value!\n");
+      default:
+        break;
+      }
+#endif
+}
+
+// Default implementation for CallbackVH.
+void CallbackVH::allUsesReplacedWith(Value *) {}
+
+void CallbackVH::deleted() {
+  setValPtr(NULL);
+}
diff --git a/contrib/llvm/lib/IR/ValueSymbolTable.cpp b/contrib/llvm/lib/IR/ValueSymbolTable.cpp
new file mode 100644
index 000000000000..fffacb377770
--- /dev/null
+++ b/contrib/llvm/lib/IR/ValueSymbolTable.cpp
@@ -0,0 +1,117 @@
+//===-- ValueSymbolTable.cpp - Implement the ValueSymbolTable class -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ValueSymbolTable class for the IR library.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "valuesymtab"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Class destructor
+ValueSymbolTable::~ValueSymbolTable() {
+#ifndef NDEBUG   // Only do this in -g mode...
+  for (iterator VI = vmap.begin(), VE = vmap.end(); VI != VE; ++VI)
+    dbgs() << "Value still in symbol table! Type = '"
+           << *VI->getValue()->getType() << "' Name = '"
+           << VI->getKeyData() << "'\n";
+  assert(vmap.empty() && "Values remain in symbol table!");
+#endif
+}
+
+// Insert a value into the symbol table with the specified name...
+//
+void ValueSymbolTable::reinsertValue(Value* V) {
+  assert(V->hasName() && "Can't insert nameless Value into symbol table");
+
+  // Try inserting the name, assuming it won't conflict.
+  if (vmap.insert(V->Name)) {
+    //DEBUG(dbgs() << " Inserted value: " << V->Name << ": " << *V << "\n");
+    return;
+  }
+  
+  // Otherwise, there is a naming conflict.  Rename this value.
+  SmallString<256> UniqueName(V->getName().begin(), V->getName().end());
+
+  // The name is too already used, just free it so we can allocate a new name.
+  V->Name->Destroy();
+  
+  unsigned BaseSize = UniqueName.size();
+  while (1) {
+    // Trim any suffix off and append the next number.
+    UniqueName.resize(BaseSize);
+    raw_svector_ostream(UniqueName) << ++LastUnique;
+
+    // Try insert the vmap entry with this suffix.
+    ValueName &NewName = vmap.GetOrCreateValue(UniqueName);
+    if (NewName.getValue() == 0) {
+      // Newly inserted name.  Success!
+      NewName.setValue(V);
+      V->Name = &NewName;
+     //DEBUG(dbgs() << " Inserted value: " << UniqueName << ": " << *V << "\n");
+      return;
+    }
+  }
+}
+
+void ValueSymbolTable::removeValueName(ValueName *V) {
+  //DEBUG(dbgs() << " Removing Value: " << V->getKeyData() << "\n");
+  // Remove the value from the symbol table.
+  vmap.remove(V);
+}
+
+/// createValueName - This method attempts to create a value name and insert
+/// it into the symbol table with the specified name.  If it conflicts, it
+/// auto-renames the name and returns that instead.
+ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) {
+  // In the common case, the name is not already in the symbol table.
+  ValueName &Entry = vmap.GetOrCreateValue(Name);
+  if (Entry.getValue() == 0) {
+    Entry.setValue(V);
+    //DEBUG(dbgs() << " Inserted value: " << Entry.getKeyData() << ": "
+    //           << *V << "\n");
+    return &Entry;
+  }
+  
+  // Otherwise, there is a naming conflict.  Rename this value.
+  SmallString<256> UniqueName(Name.begin(), Name.end());
+  
+  while (1) {
+    // Trim any suffix off and append the next number.
+    UniqueName.resize(Name.size());
+    raw_svector_ostream(UniqueName) << ++LastUnique;
+    
+    // Try insert the vmap entry with this suffix.
+    ValueName &NewName = vmap.GetOrCreateValue(UniqueName);
+    if (NewName.getValue() == 0) {
+      // Newly inserted name.  Success!
+      NewName.setValue(V);
+     //DEBUG(dbgs() << " Inserted value: " << UniqueName << ": " << *V << "\n");
+      return &NewName;
+    }
+  }
+}
+
+
+// dump - print out the symbol table
+//
+void ValueSymbolTable::dump() const {
+  //DEBUG(dbgs() << "ValueSymbolTable:\n");
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    //DEBUG(dbgs() << "  '" << I->getKeyData() << "' = ");
+    I->getValue()->dump();
+    //DEBUG(dbgs() << "\n");
+  }
+}
diff --git a/contrib/llvm/lib/IR/ValueTypes.cpp b/contrib/llvm/lib/IR/ValueTypes.cpp
new file mode 100644
index 000000000000..ba04d60c24a1
--- /dev/null
+++ b/contrib/llvm/lib/IR/ValueTypes.cpp
@@ -0,0 +1,277 @@
+//===----------- ValueTypes.cpp - Implementation of EVT methods -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements methods in the CodeGen/ValueTypes.h header.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+EVT EVT::changeExtendedVectorElementTypeToInteger() const {
+  LLVMContext &Context = LLVMTy->getContext();
+  EVT IntTy = getIntegerVT(Context, getVectorElementType().getSizeInBits());
+  return getVectorVT(Context, IntTy, getVectorNumElements());
+}
+
+EVT EVT::getExtendedIntegerVT(LLVMContext &Context, unsigned BitWidth) {
+  EVT VT;
+  VT.LLVMTy = IntegerType::get(Context, BitWidth);
+  assert(VT.isExtended() && "Type is not extended!");
+  return VT;
+}
+
+EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT,
+                             unsigned NumElements) {
+  EVT ResultVT;
+  ResultVT.LLVMTy = VectorType::get(VT.getTypeForEVT(Context), NumElements);
+  assert(ResultVT.isExtended() && "Type is not extended!");
+  return ResultVT;
+}
+
+bool EVT::isExtendedFloatingPoint() const {
+  assert(isExtended() && "Type is not extended!");
+  return LLVMTy->isFPOrFPVectorTy();
+}
+
+bool EVT::isExtendedInteger() const {
+  assert(isExtended() && "Type is not extended!");
+  return LLVMTy->isIntOrIntVectorTy();
+}
+
+bool EVT::isExtendedVector() const {
+  assert(isExtended() && "Type is not extended!");
+  return LLVMTy->isVectorTy();
+}
+
+bool EVT::isExtended16BitVector() const {
+  return isExtendedVector() && getExtendedSizeInBits() == 16;
+}
+
+bool EVT::isExtended32BitVector() const {
+  return isExtendedVector() && getExtendedSizeInBits() == 32;
+}
+
+bool EVT::isExtended64BitVector() const {
+  return isExtendedVector() && getExtendedSizeInBits() == 64;
+}
+
+bool EVT::isExtended128BitVector() const {
+  return isExtendedVector() && getExtendedSizeInBits() == 128;
+}
+
+bool EVT::isExtended256BitVector() const {
+  return isExtendedVector() && getExtendedSizeInBits() == 256;
+}
+
+bool EVT::isExtended512BitVector() const {
+  return isExtendedVector() && getExtendedSizeInBits() == 512;
+}
+
+bool EVT::isExtended1024BitVector() const {
+  return isExtendedVector() && getExtendedSizeInBits() == 1024;
+}
+
+EVT EVT::getExtendedVectorElementType() const {
+  assert(isExtended() && "Type is not extended!");
+  return EVT::getEVT(cast<VectorType>(LLVMTy)->getElementType());
+}
+
+unsigned EVT::getExtendedVectorNumElements() const {
+  assert(isExtended() && "Type is not extended!");
+  return cast<VectorType>(LLVMTy)->getNumElements();
+}
+
+unsigned EVT::getExtendedSizeInBits() const {
+  assert(isExtended() && "Type is not extended!");
+  if (IntegerType *ITy = dyn_cast<IntegerType>(LLVMTy))
+    return ITy->getBitWidth();
+  if (VectorType *VTy = dyn_cast<VectorType>(LLVMTy))
+    return VTy->getBitWidth();
+  llvm_unreachable("Unrecognized extended type!");
+}
+
+/// getEVTString - This function returns value type as a string, e.g. "i32".
+std::string EVT::getEVTString() const {
+  switch (V.SimpleTy) {
+  default:
+    if (isVector())
+      return "v" + utostr(getVectorNumElements()) +
+             getVectorElementType().getEVTString();
+    if (isInteger())
+      return "i" + utostr(getSizeInBits());
+    llvm_unreachable("Invalid EVT!");
+  case MVT::i1:      return "i1";
+  case MVT::i8:      return "i8";
+  case MVT::i16:     return "i16";
+  case MVT::i32:     return "i32";
+  case MVT::i64:     return "i64";
+  case MVT::i128:    return "i128";
+  case MVT::f16:     return "f16";
+  case MVT::f32:     return "f32";
+  case MVT::f64:     return "f64";
+  case MVT::f80:     return "f80";
+  case MVT::f128:    return "f128";
+  case MVT::ppcf128: return "ppcf128";
+  case MVT::isVoid:  return "isVoid";
+  case MVT::Other:   return "ch";
+  case MVT::Glue:    return "glue";
+  case MVT::x86mmx:  return "x86mmx";
+  case MVT::v2i1:    return "v2i1";
+  case MVT::v4i1:    return "v4i1";
+  case MVT::v8i1:    return "v8i1";
+  case MVT::v16i1:   return "v16i1";
+  case MVT::v32i1:   return "v32i1";
+  case MVT::v64i1:   return "v64i1";
+  case MVT::v2i8:    return "v2i8";
+  case MVT::v4i8:    return "v4i8";
+  case MVT::v8i8:    return "v8i8";
+  case MVT::v16i8:   return "v16i8";
+  case MVT::v32i8:   return "v32i8";
+  case MVT::v64i8:   return "v64i8";
+  case MVT::v1i16:   return "v1i16";
+  case MVT::v2i16:   return "v2i16";
+  case MVT::v4i16:   return "v4i16";
+  case MVT::v8i16:   return "v8i16";
+  case MVT::v16i16:  return "v16i16";
+  case MVT::v32i16:  return "v32i16";
+  case MVT::v1i32:   return "v1i32";
+  case MVT::v2i32:   return "v2i32";
+  case MVT::v4i32:   return "v4i32";
+  case MVT::v8i32:   return "v8i32";
+  case MVT::v16i32:  return "v16i32";
+  case MVT::v1i64:   return "v1i64";
+  case MVT::v2i64:   return "v2i64";
+  case MVT::v4i64:   return "v4i64";
+  case MVT::v8i64:   return "v8i64";
+  case MVT::v16i64:  return "v16i64";
+  case MVT::v2f32:   return "v2f32";
+  case MVT::v2f16:   return "v2f16";
+  case MVT::v4f32:   return "v4f32";
+  case MVT::v8f32:   return "v8f32";
+  case MVT::v16f32:  return "v16f32";
+  case MVT::v2f64:   return "v2f64";
+  case MVT::v4f64:   return "v4f64";
+  case MVT::v8f64:   return "v8f64";
+  case MVT::Metadata:return "Metadata";
+  case MVT::Untyped: return "Untyped";
+  }
+}
+
+/// getTypeForEVT - This method returns an LLVM type corresponding to the
+/// specified EVT.  For integer types, this returns an unsigned type.  Note
+/// that this will abort for types that cannot be represented.
+Type *EVT::getTypeForEVT(LLVMContext &Context) const {
+  switch (V.SimpleTy) {
+  default:
+    assert(isExtended() && "Type is not extended!");
+    return LLVMTy;
+  case MVT::isVoid:  return Type::getVoidTy(Context);
+  case MVT::i1:      return Type::getInt1Ty(Context);
+  case MVT::i8:      return Type::getInt8Ty(Context);
+  case MVT::i16:     return Type::getInt16Ty(Context);
+  case MVT::i32:     return Type::getInt32Ty(Context);
+  case MVT::i64:     return Type::getInt64Ty(Context);
+  case MVT::i128:    return IntegerType::get(Context, 128);
+  case MVT::f16:     return Type::getHalfTy(Context);
+  case MVT::f32:     return Type::getFloatTy(Context);
+  case MVT::f64:     return Type::getDoubleTy(Context);
+  case MVT::f80:     return Type::getX86_FP80Ty(Context);
+  case MVT::f128:    return Type::getFP128Ty(Context);
+  case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
+  case MVT::x86mmx:  return Type::getX86_MMXTy(Context);
+  case MVT::v2i1:    return VectorType::get(Type::getInt1Ty(Context), 2);
+  case MVT::v4i1:    return VectorType::get(Type::getInt1Ty(Context), 4);
+  case MVT::v8i1:    return VectorType::get(Type::getInt1Ty(Context), 8);
+  case MVT::v16i1:   return VectorType::get(Type::getInt1Ty(Context), 16);
+  case MVT::v32i1:   return VectorType::get(Type::getInt1Ty(Context), 32);
+  case MVT::v64i1:   return VectorType::get(Type::getInt1Ty(Context), 64);
+  case MVT::v2i8:    return VectorType::get(Type::getInt8Ty(Context), 2);
+  case MVT::v4i8:    return VectorType::get(Type::getInt8Ty(Context), 4);
+  case MVT::v8i8:    return VectorType::get(Type::getInt8Ty(Context), 8);
+  case MVT::v16i8:   return VectorType::get(Type::getInt8Ty(Context), 16);
+  case MVT::v32i8:   return VectorType::get(Type::getInt8Ty(Context), 32);
+  case MVT::v64i8:   return VectorType::get(Type::getInt8Ty(Context), 64);
+  case MVT::v1i16:   return VectorType::get(Type::getInt16Ty(Context), 1);
+  case MVT::v2i16:   return VectorType::get(Type::getInt16Ty(Context), 2);
+  case MVT::v4i16:   return VectorType::get(Type::getInt16Ty(Context), 4);
+  case MVT::v8i16:   return VectorType::get(Type::getInt16Ty(Context), 8);
+  case MVT::v16i16:  return VectorType::get(Type::getInt16Ty(Context), 16);
+  case MVT::v32i16:  return VectorType::get(Type::getInt16Ty(Context), 32);
+  case MVT::v1i32:   return VectorType::get(Type::getInt32Ty(Context), 1);
+  case MVT::v2i32:   return VectorType::get(Type::getInt32Ty(Context), 2);
+  case MVT::v4i32:   return VectorType::get(Type::getInt32Ty(Context), 4);
+  case MVT::v8i32:   return VectorType::get(Type::getInt32Ty(Context), 8);
+  case MVT::v16i32:  return VectorType::get(Type::getInt32Ty(Context), 16);
+  case MVT::v1i64:   return VectorType::get(Type::getInt64Ty(Context), 1);
+  case MVT::v2i64:   return VectorType::get(Type::getInt64Ty(Context), 2);
+  case MVT::v4i64:   return VectorType::get(Type::getInt64Ty(Context), 4);
+  case MVT::v8i64:   return VectorType::get(Type::getInt64Ty(Context), 8);
+  case MVT::v16i64:  return VectorType::get(Type::getInt64Ty(Context), 16);
+  case MVT::v2f16:   return VectorType::get(Type::getHalfTy(Context), 2);
+  case MVT::v2f32:   return VectorType::get(Type::getFloatTy(Context), 2);
+  case MVT::v4f32:   return VectorType::get(Type::getFloatTy(Context), 4);
+  case MVT::v8f32:   return VectorType::get(Type::getFloatTy(Context), 8);
+  case MVT::v16f32:   return VectorType::get(Type::getFloatTy(Context), 16);
+  case MVT::v2f64:   return VectorType::get(Type::getDoubleTy(Context), 2);
+  case MVT::v4f64:   return VectorType::get(Type::getDoubleTy(Context), 4); 
+  case MVT::v8f64:   return VectorType::get(Type::getDoubleTy(Context), 8); 
+  case MVT::Metadata: return Type::getMetadataTy(Context);
+ }
+}
+
+/// Return the value type corresponding to the specified type.  This returns all
+/// pointers as MVT::iPTR.  If HandleUnknown is true, unknown types are returned
+/// as Other, otherwise they are invalid.
+MVT MVT::getVT(Type *Ty, bool HandleUnknown){
+  switch (Ty->getTypeID()) {
+  default:
+    if (HandleUnknown) return MVT(MVT::Other);
+    llvm_unreachable("Unknown type!");
+  case Type::VoidTyID:
+    return MVT::isVoid;
+  case Type::IntegerTyID:
+    return getIntegerVT(cast<IntegerType>(Ty)->getBitWidth());
+  case Type::HalfTyID:      return MVT(MVT::f16);
+  case Type::FloatTyID:     return MVT(MVT::f32);
+  case Type::DoubleTyID:    return MVT(MVT::f64);
+  case Type::X86_FP80TyID:  return MVT(MVT::f80);
+  case Type::X86_MMXTyID:   return MVT(MVT::x86mmx);
+  case Type::FP128TyID:     return MVT(MVT::f128);
+  case Type::PPC_FP128TyID: return MVT(MVT::ppcf128);
+  case Type::PointerTyID:   return MVT(MVT::iPTR);
+  case Type::VectorTyID: {
+    VectorType *VTy = cast<VectorType>(Ty);
+    return getVectorVT(
+      getVT(VTy->getElementType(), false), VTy->getNumElements());
+  }
+  }
+}
+
+/// getEVT - Return the value type corresponding to the specified type.  This
+/// returns all pointers as MVT::iPTR.  If HandleUnknown is true, unknown types
+/// are returned as Other, otherwise they are invalid.
+EVT EVT::getEVT(Type *Ty, bool HandleUnknown){
+  switch (Ty->getTypeID()) {
+  default:
+    return MVT::getVT(Ty, HandleUnknown);
+  case Type::IntegerTyID:
+    return getIntegerVT(Ty->getContext(), cast<IntegerType>(Ty)->getBitWidth());
+  case Type::VectorTyID: {
+    VectorType *VTy = cast<VectorType>(Ty);
+    return getVectorVT(Ty->getContext(), getEVT(VTy->getElementType(), false),
+                       VTy->getNumElements());
+  }
+  }
+}
diff --git a/contrib/llvm/lib/IR/Verifier.cpp b/contrib/llvm/lib/IR/Verifier.cpp
new file mode 100644
index 000000000000..8bfbb322cf4c
--- /dev/null
+++ b/contrib/llvm/lib/IR/Verifier.cpp
@@ -0,0 +1,2144 @@
+//===-- Verifier.cpp - Implement the Module Verifier -----------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the function verifier interface, that can be used for some
+// sanity checking of input to the system.
+//
+// Note that this does not provide full `Java style' security and verifications,
+// instead it just tries to ensure that code is well-formed.
+//
+//  * Both of a binary operator's parameters are of the same type
+//  * Verify that the indices of mem access instructions match other operands
+//  * Verify that arithmetic and other things are only performed on first-class
+//    types.  Verify that shifts & logicals only happen on integrals f.e.
+//  * All of the constants in a switch statement are of the correct type
+//  * The code is in valid SSA form
+//  * It should be illegal to put a label into any other type (like a structure)
+//    or to return one. [except constant arrays!]
+//  * Only phi nodes can be self referential: 'add i32 %0, %0 ; <int>:0' is bad
+//  * PHI nodes must have an entry for each predecessor, with no extras.
+//  * PHI nodes must be the first thing in a basic block, all grouped together
+//  * PHI nodes must have at least one entry
+//  * All basic blocks should only end with terminator insts, not contain them
+//  * The entry node to a function must not have predecessors
+//  * All Instructions must be embedded into a basic block
+//  * Functions cannot take a void-typed parameter
+//  * Verify that a function's argument list agrees with it's declared type.
+//  * It is illegal to specify a name for a void value.
+//  * It is illegal to have a internal global value with no initializer
+//  * It is illegal to have a ret instruction that returns a value that does not
+//    agree with the function return value type.
+//  * Function call argument types match the function prototype
+//  * A landing pad is defined by a landingpad instruction, and can be jumped to
+//    only by the unwind edge of an invoke instruction.
+//  * A landingpad instruction must be the first non-PHI instruction in the
+//    block.
+//  * All landingpad instructions must use the same personality function with
+//    the same function.
+//  * All other things that are tested by asserts spread about the code...
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdarg>
+using namespace llvm;
+
+namespace {  // Anonymous namespace for class
+  struct PreVerifier : public FunctionPass {
+    static char ID; // Pass ID, replacement for typeid
+
+    PreVerifier() : FunctionPass(ID) {
+      initializePreVerifierPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+
+    // Check that the prerequisites for successful DominatorTree construction
+    // are satisfied.
+    bool runOnFunction(Function &F) {
+      bool Broken = false;
+
+      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+        if (I->empty() || !I->back().isTerminator()) {
+          dbgs() << "Basic Block in function '" << F.getName() 
+                 << "' does not have terminator!\n";
+          WriteAsOperand(dbgs(), I, true);
+          dbgs() << "\n";
+          Broken = true;
+        }
+      }
+
+      if (Broken)
+        report_fatal_error("Broken module, no Basic Block terminator!");
+
+      return false;
+    }
+  };
+}
+
+char PreVerifier::ID = 0;
+INITIALIZE_PASS(PreVerifier, "preverify", "Preliminary module verification", 
+                false, false)
+static char &PreVerifyID = PreVerifier::ID;
+
+namespace {
+  struct Verifier : public FunctionPass, public InstVisitor<Verifier> {
+    static char ID; // Pass ID, replacement for typeid
+    bool Broken;          // Is this module found to be broken?
+    VerifierFailureAction action;
+                          // What to do if verification fails.
+    Module *Mod;          // Module we are verifying right now
+    LLVMContext *Context; // Context within which we are verifying
+    DominatorTree *DT;    // Dominator Tree, caution can be null!
+
+    std::string Messages;
+    raw_string_ostream MessagesStr;
+
+    /// InstInThisBlock - when verifying a basic block, keep track of all of the
+    /// instructions we have seen so far.  This allows us to do efficient
+    /// dominance checks for the case when an instruction has an operand that is
+    /// an instruction in the same block.
+    SmallPtrSet<Instruction*, 16> InstsInThisBlock;
+
+    /// MDNodes - keep track of the metadata nodes that have been checked
+    /// already.
+    SmallPtrSet<MDNode *, 32> MDNodes;
+
+    /// PersonalityFn - The personality function referenced by the
+    /// LandingPadInsts. All LandingPadInsts within the same function must use
+    /// the same personality function.
+    const Value *PersonalityFn;
+
+    Verifier()
+      : FunctionPass(ID), Broken(false),
+        action(AbortProcessAction), Mod(0), Context(0), DT(0),
+        MessagesStr(Messages), PersonalityFn(0) {
+      initializeVerifierPass(*PassRegistry::getPassRegistry());
+    }
+    explicit Verifier(VerifierFailureAction ctn)
+      : FunctionPass(ID), Broken(false), action(ctn), Mod(0),
+        Context(0), DT(0), MessagesStr(Messages), PersonalityFn(0) {
+      initializeVerifierPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool doInitialization(Module &M) {
+      Mod = &M;
+      Context = &M.getContext();
+
+      // We must abort before returning back to the pass manager, or else the
+      // pass manager may try to run other passes on the broken module.
+      return abortIfBroken();
+    }
+
+    bool runOnFunction(Function &F) {
+      // Get dominator information if we are being run by PassManager
+      DT = &getAnalysis<DominatorTree>();
+
+      Mod = F.getParent();
+      if (!Context) Context = &F.getContext();
+
+      visit(F);
+      InstsInThisBlock.clear();
+      PersonalityFn = 0;
+
+      // We must abort before returning back to the pass manager, or else the
+      // pass manager may try to run other passes on the broken module.
+      return abortIfBroken();
+    }
+
+    bool doFinalization(Module &M) {
+      // Scan through, checking all of the external function's linkage now...
+      for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+        visitGlobalValue(*I);
+
+        // Check to make sure function prototypes are okay.
+        if (I->isDeclaration()) visitFunction(*I);
+      }
+
+      for (Module::global_iterator I = M.global_begin(), E = M.global_end(); 
+           I != E; ++I)
+        visitGlobalVariable(*I);
+
+      for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); 
+           I != E; ++I)
+        visitGlobalAlias(*I);
+
+      for (Module::named_metadata_iterator I = M.named_metadata_begin(),
+           E = M.named_metadata_end(); I != E; ++I)
+        visitNamedMDNode(*I);
+
+      visitModuleFlags(M);
+
+      // If the module is broken, abort at this time.
+      return abortIfBroken();
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequiredID(PreVerifyID);
+      AU.addRequired<DominatorTree>();
+    }
+
+    /// abortIfBroken - If the module is broken and we are supposed to abort on
+    /// this condition, do so.
+    ///
+    bool abortIfBroken() {
+      if (!Broken) return false;
+      MessagesStr << "Broken module found, ";
+      switch (action) {
+      case AbortProcessAction:
+        MessagesStr << "compilation aborted!\n";
+        dbgs() << MessagesStr.str();
+        // Client should choose different reaction if abort is not desired
+        abort();
+      case PrintMessageAction:
+        MessagesStr << "verification continues.\n";
+        dbgs() << MessagesStr.str();
+        return false;
+      case ReturnStatusAction:
+        MessagesStr << "compilation terminated.\n";
+        return true;
+      }
+      llvm_unreachable("Invalid action");
+    }
+
+
+    // Verification methods...
+    void visitGlobalValue(GlobalValue &GV);
+    void visitGlobalVariable(GlobalVariable &GV);
+    void visitGlobalAlias(GlobalAlias &GA);
+    void visitNamedMDNode(NamedMDNode &NMD);
+    void visitMDNode(MDNode &MD, Function *F);
+    void visitModuleFlags(Module &M);
+    void visitModuleFlag(MDNode *Op, DenseMap<MDString*, MDNode*> &SeenIDs,
+                         SmallVectorImpl<MDNode*> &Requirements);
+    void visitFunction(Function &F);
+    void visitBasicBlock(BasicBlock &BB);
+    using InstVisitor<Verifier>::visit;
+
+    void visit(Instruction &I);
+
+    void visitTruncInst(TruncInst &I);
+    void visitZExtInst(ZExtInst &I);
+    void visitSExtInst(SExtInst &I);
+    void visitFPTruncInst(FPTruncInst &I);
+    void visitFPExtInst(FPExtInst &I);
+    void visitFPToUIInst(FPToUIInst &I);
+    void visitFPToSIInst(FPToSIInst &I);
+    void visitUIToFPInst(UIToFPInst &I);
+    void visitSIToFPInst(SIToFPInst &I);
+    void visitIntToPtrInst(IntToPtrInst &I);
+    void visitPtrToIntInst(PtrToIntInst &I);
+    void visitBitCastInst(BitCastInst &I);
+    void visitPHINode(PHINode &PN);
+    void visitBinaryOperator(BinaryOperator &B);
+    void visitICmpInst(ICmpInst &IC);
+    void visitFCmpInst(FCmpInst &FC);
+    void visitExtractElementInst(ExtractElementInst &EI);
+    void visitInsertElementInst(InsertElementInst &EI);
+    void visitShuffleVectorInst(ShuffleVectorInst &EI);
+    void visitVAArgInst(VAArgInst &VAA) { visitInstruction(VAA); }
+    void visitCallInst(CallInst &CI);
+    void visitInvokeInst(InvokeInst &II);
+    void visitGetElementPtrInst(GetElementPtrInst &GEP);
+    void visitLoadInst(LoadInst &LI);
+    void visitStoreInst(StoreInst &SI);
+    void verifyDominatesUse(Instruction &I, unsigned i);
+    void visitInstruction(Instruction &I);
+    void visitTerminatorInst(TerminatorInst &I);
+    void visitBranchInst(BranchInst &BI);
+    void visitReturnInst(ReturnInst &RI);
+    void visitSwitchInst(SwitchInst &SI);
+    void visitIndirectBrInst(IndirectBrInst &BI);
+    void visitSelectInst(SelectInst &SI);
+    void visitUserOp1(Instruction &I);
+    void visitUserOp2(Instruction &I) { visitUserOp1(I); }
+    void visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI);
+    void visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI);
+    void visitAtomicRMWInst(AtomicRMWInst &RMWI);
+    void visitFenceInst(FenceInst &FI);
+    void visitAllocaInst(AllocaInst &AI);
+    void visitExtractValueInst(ExtractValueInst &EVI);
+    void visitInsertValueInst(InsertValueInst &IVI);
+    void visitLandingPadInst(LandingPadInst &LPI);
+
+    void VerifyCallSite(CallSite CS);
+    bool PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty,
+                          int VT, unsigned ArgNo, std::string &Suffix);
+    bool VerifyIntrinsicType(Type *Ty,
+                             ArrayRef<Intrinsic::IITDescriptor> &Infos,
+                             SmallVectorImpl<Type*> &ArgTys);
+    void VerifyParameterAttrs(AttributeSet Attrs, uint64_t Idx, Type *Ty,
+                              bool isReturnValue, const Value *V);
+    void VerifyFunctionAttrs(FunctionType *FT, const AttributeSet &Attrs,
+                             const Value *V);
+
+    void WriteValue(const Value *V) {
+      if (!V) return;
+      if (isa<Instruction>(V)) {
+        MessagesStr << *V << '\n';
+      } else {
+        WriteAsOperand(MessagesStr, V, true, Mod);
+        MessagesStr << '\n';
+      }
+    }
+
+    void WriteType(Type *T) {
+      if (!T) return;
+      MessagesStr << ' ' << *T;
+    }
+
+
+    // CheckFailed - A check failed, so print out the condition and the message
+    // that failed.  This provides a nice place to put a breakpoint if you want
+    // to see why something is not correct.
+    void CheckFailed(const Twine &Message,
+                     const Value *V1 = 0, const Value *V2 = 0,
+                     const Value *V3 = 0, const Value *V4 = 0) {
+      MessagesStr << Message.str() << "\n";
+      WriteValue(V1);
+      WriteValue(V2);
+      WriteValue(V3);
+      WriteValue(V4);
+      Broken = true;
+    }
+
+    void CheckFailed(const Twine &Message, const Value *V1,
+                     Type *T2, const Value *V3 = 0) {
+      MessagesStr << Message.str() << "\n";
+      WriteValue(V1);
+      WriteType(T2);
+      WriteValue(V3);
+      Broken = true;
+    }
+
+    void CheckFailed(const Twine &Message, Type *T1,
+                     Type *T2 = 0, Type *T3 = 0) {
+      MessagesStr << Message.str() << "\n";
+      WriteType(T1);
+      WriteType(T2);
+      WriteType(T3);
+      Broken = true;
+    }
+  };
+} // End anonymous namespace
+
+char Verifier::ID = 0;
+INITIALIZE_PASS_BEGIN(Verifier, "verify", "Module Verifier", false, false)
+INITIALIZE_PASS_DEPENDENCY(PreVerifier)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(Verifier, "verify", "Module Verifier", false, false)
+
+// Assert - We know that cond should be true, if not print an error message.
+#define Assert(C, M) \
+  do { if (!(C)) { CheckFailed(M); return; } } while (0)
+#define Assert1(C, M, V1) \
+  do { if (!(C)) { CheckFailed(M, V1); return; } } while (0)
+#define Assert2(C, M, V1, V2) \
+  do { if (!(C)) { CheckFailed(M, V1, V2); return; } } while (0)
+#define Assert3(C, M, V1, V2, V3) \
+  do { if (!(C)) { CheckFailed(M, V1, V2, V3); return; } } while (0)
+#define Assert4(C, M, V1, V2, V3, V4) \
+  do { if (!(C)) { CheckFailed(M, V1, V2, V3, V4); return; } } while (0)
+
+void Verifier::visit(Instruction &I) {
+  for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+    Assert1(I.getOperand(i) != 0, "Operand is null", &I);
+  InstVisitor<Verifier>::visit(I);
+}
+
+
+void Verifier::visitGlobalValue(GlobalValue &GV) {
+  Assert1(!GV.isDeclaration() ||
+          GV.isMaterializable() ||
+          GV.hasExternalLinkage() ||
+          GV.hasDLLImportLinkage() ||
+          GV.hasExternalWeakLinkage() ||
+          (isa<GlobalAlias>(GV) &&
+           (GV.hasLocalLinkage() || GV.hasWeakLinkage())),
+  "Global is external, but doesn't have external or dllimport or weak linkage!",
+          &GV);
+
+  Assert1(!GV.hasDLLImportLinkage() || GV.isDeclaration(),
+          "Global is marked as dllimport, but not external", &GV);
+
+  Assert1(!GV.hasAppendingLinkage() || isa<GlobalVariable>(GV),
+          "Only global variables can have appending linkage!", &GV);
+
+  if (GV.hasAppendingLinkage()) {
+    GlobalVariable *GVar = dyn_cast<GlobalVariable>(&GV);
+    Assert1(GVar && GVar->getType()->getElementType()->isArrayTy(),
+            "Only global arrays can have appending linkage!", GVar);
+  }
+
+  Assert1(!GV.hasLinkOnceODRAutoHideLinkage() || GV.hasDefaultVisibility(),
+          "linkonce_odr_auto_hide can only have default visibility!",
+          &GV);
+}
+
+void Verifier::visitGlobalVariable(GlobalVariable &GV) {
+  if (GV.hasInitializer()) {
+    Assert1(GV.getInitializer()->getType() == GV.getType()->getElementType(),
+            "Global variable initializer type does not match global "
+            "variable type!", &GV);
+
+    // If the global has common linkage, it must have a zero initializer and
+    // cannot be constant.
+    if (GV.hasCommonLinkage()) {
+      Assert1(GV.getInitializer()->isNullValue(),
+              "'common' global must have a zero initializer!", &GV);
+      Assert1(!GV.isConstant(), "'common' global may not be marked constant!",
+              &GV);
+    }
+  } else {
+    Assert1(GV.hasExternalLinkage() || GV.hasDLLImportLinkage() ||
+            GV.hasExternalWeakLinkage(),
+            "invalid linkage type for global declaration", &GV);
+  }
+
+  if (GV.hasName() && (GV.getName() == "llvm.global_ctors" ||
+                       GV.getName() == "llvm.global_dtors")) {
+    Assert1(!GV.hasInitializer() || GV.hasAppendingLinkage(),
+            "invalid linkage for intrinsic global variable", &GV);
+    // Don't worry about emitting an error for it not being an array,
+    // visitGlobalValue will complain on appending non-array.
+    if (ArrayType *ATy = dyn_cast<ArrayType>(GV.getType())) {
+      StructType *STy = dyn_cast<StructType>(ATy->getElementType());
+      PointerType *FuncPtrTy =
+          FunctionType::get(Type::getVoidTy(*Context), false)->getPointerTo();
+      Assert1(STy && STy->getNumElements() == 2 &&
+              STy->getTypeAtIndex(0u)->isIntegerTy(32) &&
+              STy->getTypeAtIndex(1) == FuncPtrTy,
+              "wrong type for intrinsic global variable", &GV);
+    }
+  }
+
+  visitGlobalValue(GV);
+}
+
+void Verifier::visitGlobalAlias(GlobalAlias &GA) {
+  Assert1(!GA.getName().empty(),
+          "Alias name cannot be empty!", &GA);
+  Assert1(GA.hasExternalLinkage() || GA.hasLocalLinkage() ||
+          GA.hasWeakLinkage(),
+          "Alias should have external or external weak linkage!", &GA);
+  Assert1(GA.getAliasee(),
+          "Aliasee cannot be NULL!", &GA);
+  Assert1(GA.getType() == GA.getAliasee()->getType(),
+          "Alias and aliasee types should match!", &GA);
+  Assert1(!GA.hasUnnamedAddr(), "Alias cannot have unnamed_addr!", &GA);
+
+  if (!isa<GlobalValue>(GA.getAliasee())) {
+    const ConstantExpr *CE = dyn_cast<ConstantExpr>(GA.getAliasee());
+    Assert1(CE && 
+            (CE->getOpcode() == Instruction::BitCast ||
+             CE->getOpcode() == Instruction::GetElementPtr) &&
+            isa<GlobalValue>(CE->getOperand(0)),
+            "Aliasee should be either GlobalValue or bitcast of GlobalValue",
+            &GA);
+  }
+
+  const GlobalValue* Aliasee = GA.resolveAliasedGlobal(/*stopOnWeak*/ false);
+  Assert1(Aliasee,
+          "Aliasing chain should end with function or global variable", &GA);
+
+  visitGlobalValue(GA);
+}
+
+void Verifier::visitNamedMDNode(NamedMDNode &NMD) {
+  for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) {
+    MDNode *MD = NMD.getOperand(i);
+    if (!MD)
+      continue;
+
+    Assert1(!MD->isFunctionLocal(),
+            "Named metadata operand cannot be function local!", MD);
+    visitMDNode(*MD, 0);
+  }
+}
+
+void Verifier::visitMDNode(MDNode &MD, Function *F) {
+  // Only visit each node once.  Metadata can be mutually recursive, so this
+  // avoids infinite recursion here, as well as being an optimization.
+  if (!MDNodes.insert(&MD))
+    return;
+
+  for (unsigned i = 0, e = MD.getNumOperands(); i != e; ++i) {
+    Value *Op = MD.getOperand(i);
+    if (!Op)
+      continue;
+    if (isa<Constant>(Op) || isa<MDString>(Op))
+      continue;
+    if (MDNode *N = dyn_cast<MDNode>(Op)) {
+      Assert2(MD.isFunctionLocal() || !N->isFunctionLocal(),
+              "Global metadata operand cannot be function local!", &MD, N);
+      visitMDNode(*N, F);
+      continue;
+    }
+    Assert2(MD.isFunctionLocal(), "Invalid operand for global metadata!", &MD, Op);
+
+    // If this was an instruction, bb, or argument, verify that it is in the
+    // function that we expect.
+    Function *ActualF = 0;
+    if (Instruction *I = dyn_cast<Instruction>(Op))
+      ActualF = I->getParent()->getParent();
+    else if (BasicBlock *BB = dyn_cast<BasicBlock>(Op))
+      ActualF = BB->getParent();
+    else if (Argument *A = dyn_cast<Argument>(Op))
+      ActualF = A->getParent();
+    assert(ActualF && "Unimplemented function local metadata case!");
+
+    Assert2(ActualF == F, "function-local metadata used in wrong function",
+            &MD, Op);
+  }
+}
+
+void Verifier::visitModuleFlags(Module &M) {
+  const NamedMDNode *Flags = M.getModuleFlagsMetadata();
+  if (!Flags) return;
+
+  // Scan each flag, and track the flags and requirements.
+  DenseMap<MDString*, MDNode*> SeenIDs;
+  SmallVector<MDNode*, 16> Requirements;
+  for (unsigned I = 0, E = Flags->getNumOperands(); I != E; ++I) {
+    visitModuleFlag(Flags->getOperand(I), SeenIDs, Requirements);
+  }
+
+  // Validate that the requirements in the module are valid.
+  for (unsigned I = 0, E = Requirements.size(); I != E; ++I) {
+    MDNode *Requirement = Requirements[I];
+    MDString *Flag = cast<MDString>(Requirement->getOperand(0));
+    Value *ReqValue = Requirement->getOperand(1);
+
+    MDNode *Op = SeenIDs.lookup(Flag);
+    if (!Op) {
+      CheckFailed("invalid requirement on flag, flag is not present in module",
+                  Flag);
+      continue;
+    }
+
+    if (Op->getOperand(2) != ReqValue) {
+      CheckFailed(("invalid requirement on flag, "
+                   "flag does not have the required value"),
+                  Flag);
+      continue;
+    }
+  }
+}
+
+void Verifier::visitModuleFlag(MDNode *Op, DenseMap<MDString*, MDNode*>&SeenIDs,
+                               SmallVectorImpl<MDNode*> &Requirements) {
+  // Each module flag should have three arguments, the merge behavior (a
+  // constant int), the flag ID (an MDString), and the value.
+  Assert1(Op->getNumOperands() == 3,
+          "incorrect number of operands in module flag", Op);
+  ConstantInt *Behavior = dyn_cast<ConstantInt>(Op->getOperand(0));
+  MDString *ID = dyn_cast<MDString>(Op->getOperand(1));
+  Assert1(Behavior,
+          "invalid behavior operand in module flag (expected constant integer)",
+          Op->getOperand(0));
+  unsigned BehaviorValue = Behavior->getZExtValue();
+  Assert1(ID,
+          "invalid ID operand in module flag (expected metadata string)",
+          Op->getOperand(1));
+
+  // Sanity check the values for behaviors with additional requirements.
+  switch (BehaviorValue) {
+  default:
+    Assert1(false,
+            "invalid behavior operand in module flag (unexpected constant)",
+            Op->getOperand(0));
+    break;
+
+  case Module::Error:
+  case Module::Warning:
+  case Module::Override:
+    // These behavior types accept any value.
+    break;
+
+  case Module::Require: {
+    // The value should itself be an MDNode with two operands, a flag ID (an
+    // MDString), and a value.
+    MDNode *Value = dyn_cast<MDNode>(Op->getOperand(2));
+    Assert1(Value && Value->getNumOperands() == 2,
+            "invalid value for 'require' module flag (expected metadata pair)",
+            Op->getOperand(2));
+    Assert1(isa<MDString>(Value->getOperand(0)),
+            ("invalid value for 'require' module flag "
+             "(first value operand should be a string)"),
+            Value->getOperand(0));
+
+    // Append it to the list of requirements, to check once all module flags are
+    // scanned.
+    Requirements.push_back(Value);
+    break;
+  }
+
+  case Module::Append:
+  case Module::AppendUnique: {
+    // These behavior types require the operand be an MDNode.
+    Assert1(isa<MDNode>(Op->getOperand(2)),
+            "invalid value for 'append'-type module flag "
+            "(expected a metadata node)", Op->getOperand(2));
+    break;
+  }
+  }
+
+  // Unless this is a "requires" flag, check the ID is unique.
+  if (BehaviorValue != Module::Require) {
+    bool Inserted = SeenIDs.insert(std::make_pair(ID, Op)).second;
+    Assert1(Inserted,
+            "module flag identifiers must be unique (or of 'require' type)",
+            ID);
+  }
+}
+
+// VerifyParameterAttrs - Check the given attributes for an argument or return
+// value of the specified type.  The value V is printed in error messages.
+void Verifier::VerifyParameterAttrs(AttributeSet Attrs, uint64_t Idx, Type *Ty,
+                                    bool isReturnValue, const Value *V) {
+  if (!Attrs.hasAttributes(Idx))
+    return;
+
+  Assert1(!Attrs.hasAttribute(Idx, Attribute::NoReturn) &&
+          !Attrs.hasAttribute(Idx, Attribute::NoUnwind) &&
+          !Attrs.hasAttribute(Idx, Attribute::ReadNone) &&
+          !Attrs.hasAttribute(Idx, Attribute::ReadOnly) &&
+          !Attrs.hasAttribute(Idx, Attribute::NoInline) &&
+          !Attrs.hasAttribute(Idx, Attribute::AlwaysInline) &&
+          !Attrs.hasAttribute(Idx, Attribute::OptimizeForSize) &&
+          !Attrs.hasAttribute(Idx, Attribute::StackProtect) &&
+          !Attrs.hasAttribute(Idx, Attribute::StackProtectReq) &&
+          !Attrs.hasAttribute(Idx, Attribute::NoRedZone) &&
+          !Attrs.hasAttribute(Idx, Attribute::NoImplicitFloat) &&
+          !Attrs.hasAttribute(Idx, Attribute::Naked) &&
+          !Attrs.hasAttribute(Idx, Attribute::InlineHint) &&
+          !Attrs.hasAttribute(Idx, Attribute::StackAlignment) &&
+          !Attrs.hasAttribute(Idx, Attribute::UWTable) &&
+          !Attrs.hasAttribute(Idx, Attribute::NonLazyBind) &&
+          !Attrs.hasAttribute(Idx, Attribute::ReturnsTwice) &&
+          !Attrs.hasAttribute(Idx, Attribute::SanitizeAddress) &&
+          !Attrs.hasAttribute(Idx, Attribute::SanitizeThread) &&
+          !Attrs.hasAttribute(Idx, Attribute::SanitizeMemory) &&
+          !Attrs.hasAttribute(Idx, Attribute::MinSize) &&
+          !Attrs.hasAttribute(Idx, Attribute::NoBuiltin),
+          "Some attributes in '" + Attrs.getAsString(Idx) +
+          "' only apply to functions!", V);
+
+  if (isReturnValue)
+    Assert1(!Attrs.hasAttribute(Idx, Attribute::ByVal) &&
+            !Attrs.hasAttribute(Idx, Attribute::Nest) &&
+            !Attrs.hasAttribute(Idx, Attribute::StructRet) &&
+            !Attrs.hasAttribute(Idx, Attribute::NoCapture),
+            "Attribute 'byval', 'nest', 'sret', and 'nocapture' "
+            "do not apply to return values!", V);
+
+  // Check for mutually incompatible attributes.
+  Assert1(!((Attrs.hasAttribute(Idx, Attribute::ByVal) &&
+             Attrs.hasAttribute(Idx, Attribute::Nest)) ||
+            (Attrs.hasAttribute(Idx, Attribute::ByVal) &&
+             Attrs.hasAttribute(Idx, Attribute::StructRet)) ||
+            (Attrs.hasAttribute(Idx, Attribute::Nest) &&
+             Attrs.hasAttribute(Idx, Attribute::StructRet))), "Attributes "
+          "'byval, nest, and sret' are incompatible!", V);
+
+  Assert1(!((Attrs.hasAttribute(Idx, Attribute::ByVal) &&
+             Attrs.hasAttribute(Idx, Attribute::Nest)) ||
+            (Attrs.hasAttribute(Idx, Attribute::ByVal) &&
+             Attrs.hasAttribute(Idx, Attribute::InReg)) ||
+            (Attrs.hasAttribute(Idx, Attribute::Nest) &&
+             Attrs.hasAttribute(Idx, Attribute::InReg))), "Attributes "
+          "'byval, nest, and inreg' are incompatible!", V);
+
+  Assert1(!(Attrs.hasAttribute(Idx, Attribute::ZExt) &&
+            Attrs.hasAttribute(Idx, Attribute::SExt)), "Attributes "
+          "'zeroext and signext' are incompatible!", V);
+
+  Assert1(!(Attrs.hasAttribute(Idx, Attribute::ReadNone) &&
+            Attrs.hasAttribute(Idx, Attribute::ReadOnly)), "Attributes "
+          "'readnone and readonly' are incompatible!", V);
+
+  Assert1(!(Attrs.hasAttribute(Idx, Attribute::NoInline) &&
+            Attrs.hasAttribute(Idx, Attribute::AlwaysInline)), "Attributes "
+          "'noinline and alwaysinline' are incompatible!", V);
+
+  Assert1(!AttrBuilder(Attrs, Idx).
+            hasAttributes(AttributeFuncs::typeIncompatible(Ty, Idx), Idx),
+          "Wrong types for attribute: " +
+          AttributeFuncs::typeIncompatible(Ty, Idx).getAsString(Idx), V);
+
+  if (PointerType *PTy = dyn_cast<PointerType>(Ty))
+    Assert1(!Attrs.hasAttribute(Idx, Attribute::ByVal) ||
+            PTy->getElementType()->isSized(),
+            "Attribute 'byval' does not support unsized types!", V);
+  else
+    Assert1(!Attrs.hasAttribute(Idx, Attribute::ByVal),
+            "Attribute 'byval' only applies to parameters with pointer type!",
+            V);
+}
+
+// VerifyFunctionAttrs - Check parameter attributes against a function type.
+// The value V is printed in error messages.
+void Verifier::VerifyFunctionAttrs(FunctionType *FT,
+                                   const AttributeSet &Attrs,
+                                   const Value *V) {
+  if (Attrs.isEmpty())
+    return;
+
+  bool SawNest = false;
+
+  for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
+    unsigned Index = Attrs.getSlotIndex(i);
+
+    Type *Ty;
+    if (Index == 0)
+      Ty = FT->getReturnType();
+    else if (Index-1 < FT->getNumParams())
+      Ty = FT->getParamType(Index-1);
+    else
+      break;  // VarArgs attributes, verified elsewhere.
+
+    VerifyParameterAttrs(Attrs, Index, Ty, Index == 0, V);
+
+    if (Attrs.hasAttribute(i, Attribute::Nest)) {
+      Assert1(!SawNest, "More than one parameter has attribute nest!", V);
+      SawNest = true;
+    }
+
+    if (Attrs.hasAttribute(Index, Attribute::StructRet))
+      Assert1(Index == 1, "Attribute sret is not on first parameter!", V);
+  }
+
+  if (!Attrs.hasAttributes(AttributeSet::FunctionIndex))
+    return;
+
+  AttrBuilder NotFn(Attrs, AttributeSet::FunctionIndex);
+  NotFn.removeFunctionOnlyAttrs();
+  Assert1(NotFn.empty(), "Attributes '" +
+          AttributeSet::get(V->getContext(),
+                            AttributeSet::FunctionIndex,
+                            NotFn).getAsString(AttributeSet::FunctionIndex) +
+          "' do not apply to the function!", V);
+
+  // Check for mutually incompatible attributes.
+  Assert1(!((Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::ByVal) &&
+             Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::Nest)) ||
+            (Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::ByVal) &&
+             Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::StructRet)) ||
+            (Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::Nest) &&
+             Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::StructRet))),
+          "Attributes 'byval, nest, and sret' are incompatible!", V);
+
+  Assert1(!((Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::ByVal) &&
+             Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::Nest)) ||
+            (Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::ByVal) &&
+             Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::InReg)) ||
+            (Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::Nest) &&
+             Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::InReg))),
+          "Attributes 'byval, nest, and inreg' are incompatible!", V);
+
+  Assert1(!(Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::ZExt) &&
+            Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::SExt)),
+          "Attributes 'zeroext and signext' are incompatible!", V);
+
+  Assert1(!(Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::ReadNone) &&
+            Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::ReadOnly)),
+          "Attributes 'readnone and readonly' are incompatible!", V);
+
+  Assert1(!(Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::NoInline) &&
+            Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::AlwaysInline)),
+          "Attributes 'noinline and alwaysinline' are incompatible!", V);
+}
+
+static bool VerifyAttributeCount(const AttributeSet &Attrs, unsigned Params) {
+  if (Attrs.getNumSlots() == 0)
+    return true;
+
+  unsigned LastSlot = Attrs.getNumSlots() - 1;
+  unsigned LastIndex = Attrs.getSlotIndex(LastSlot);
+  if (LastIndex <= Params
+      || (LastIndex == AttributeSet::FunctionIndex
+          && (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params)))
+    return true;
+ 
+  return false;
+}
+
+// visitFunction - Verify that a function is ok.
+//
+void Verifier::visitFunction(Function &F) {
+  // Check function arguments.
+  FunctionType *FT = F.getFunctionType();
+  unsigned NumArgs = F.arg_size();
+
+  Assert1(Context == &F.getContext(),
+          "Function context does not match Module context!", &F);
+
+  Assert1(!F.hasCommonLinkage(), "Functions may not have common linkage", &F);
+  Assert2(FT->getNumParams() == NumArgs,
+          "# formal arguments must match # of arguments for function type!",
+          &F, FT);
+  Assert1(F.getReturnType()->isFirstClassType() ||
+          F.getReturnType()->isVoidTy() || 
+          F.getReturnType()->isStructTy(),
+          "Functions cannot return aggregate values!", &F);
+
+  Assert1(!F.hasStructRetAttr() || F.getReturnType()->isVoidTy(),
+          "Invalid struct return type!", &F);
+
+  const AttributeSet &Attrs = F.getAttributes();
+
+  Assert1(VerifyAttributeCount(Attrs, FT->getNumParams()),
+          "Attribute after last parameter!", &F);
+
+  // Check function attributes.
+  VerifyFunctionAttrs(FT, Attrs, &F);
+
+  // Check that this function meets the restrictions on this calling convention.
+  switch (F.getCallingConv()) {
+  default:
+    break;
+  case CallingConv::C:
+    break;
+  case CallingConv::Fast:
+  case CallingConv::Cold:
+  case CallingConv::X86_FastCall:
+  case CallingConv::X86_ThisCall:
+  case CallingConv::Intel_OCL_BI:
+  case CallingConv::PTX_Kernel:
+  case CallingConv::PTX_Device:
+    Assert1(!F.isVarArg(),
+            "Varargs functions must have C calling conventions!", &F);
+    break;
+  }
+
+  bool isLLVMdotName = F.getName().size() >= 5 &&
+                       F.getName().substr(0, 5) == "llvm.";
+
+  // Check that the argument values match the function type for this function...
+  unsigned i = 0;
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end();
+       I != E; ++I, ++i) {
+    Assert2(I->getType() == FT->getParamType(i),
+            "Argument value does not match function argument type!",
+            I, FT->getParamType(i));
+    Assert1(I->getType()->isFirstClassType(),
+            "Function arguments must have first-class types!", I);
+    if (!isLLVMdotName)
+      Assert2(!I->getType()->isMetadataTy(),
+              "Function takes metadata but isn't an intrinsic", I, &F);
+  }
+
+  if (F.isMaterializable()) {
+    // Function has a body somewhere we can't see.
+  } else if (F.isDeclaration()) {
+    Assert1(F.hasExternalLinkage() || F.hasDLLImportLinkage() ||
+            F.hasExternalWeakLinkage(),
+            "invalid linkage type for function declaration", &F);
+  } else {
+    // Verify that this function (which has a body) is not named "llvm.*".  It
+    // is not legal to define intrinsics.
+    Assert1(!isLLVMdotName, "llvm intrinsics cannot be defined!", &F);
+    
+    // Check the entry node
+    BasicBlock *Entry = &F.getEntryBlock();
+    Assert1(pred_begin(Entry) == pred_end(Entry),
+            "Entry block to function must not have predecessors!", Entry);
+    
+    // The address of the entry block cannot be taken, unless it is dead.
+    if (Entry->hasAddressTaken()) {
+      Assert1(!BlockAddress::get(Entry)->isConstantUsed(),
+              "blockaddress may not be used with the entry block!", Entry);
+    }
+  }
+ 
+  // If this function is actually an intrinsic, verify that it is only used in
+  // direct call/invokes, never having its "address taken".
+  if (F.getIntrinsicID()) {
+    const User *U;
+    if (F.hasAddressTaken(&U))
+      Assert1(0, "Invalid user of intrinsic instruction!", U); 
+  }
+}
+
+// verifyBasicBlock - Verify that a basic block is well formed...
+//
+void Verifier::visitBasicBlock(BasicBlock &BB) {
+  InstsInThisBlock.clear();
+
+  // Ensure that basic blocks have terminators!
+  Assert1(BB.getTerminator(), "Basic Block does not have terminator!", &BB);
+
+  // Check constraints that this basic block imposes on all of the PHI nodes in
+  // it.
+  if (isa<PHINode>(BB.front())) {
+    SmallVector<BasicBlock*, 8> Preds(pred_begin(&BB), pred_end(&BB));
+    SmallVector<std::pair<BasicBlock*, Value*>, 8> Values;
+    std::sort(Preds.begin(), Preds.end());
+    PHINode *PN;
+    for (BasicBlock::iterator I = BB.begin(); (PN = dyn_cast<PHINode>(I));++I) {
+      // Ensure that PHI nodes have at least one entry!
+      Assert1(PN->getNumIncomingValues() != 0,
+              "PHI nodes must have at least one entry.  If the block is dead, "
+              "the PHI should be removed!", PN);
+      Assert1(PN->getNumIncomingValues() == Preds.size(),
+              "PHINode should have one entry for each predecessor of its "
+              "parent basic block!", PN);
+
+      // Get and sort all incoming values in the PHI node...
+      Values.clear();
+      Values.reserve(PN->getNumIncomingValues());
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        Values.push_back(std::make_pair(PN->getIncomingBlock(i),
+                                        PN->getIncomingValue(i)));
+      std::sort(Values.begin(), Values.end());
+
+      for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+        // Check to make sure that if there is more than one entry for a
+        // particular basic block in this PHI node, that the incoming values are
+        // all identical.
+        //
+        Assert4(i == 0 || Values[i].first  != Values[i-1].first ||
+                Values[i].second == Values[i-1].second,
+                "PHI node has multiple entries for the same basic block with "
+                "different incoming values!", PN, Values[i].first,
+                Values[i].second, Values[i-1].second);
+
+        // Check to make sure that the predecessors and PHI node entries are
+        // matched up.
+        Assert3(Values[i].first == Preds[i],
+                "PHI node entries do not match predecessors!", PN,
+                Values[i].first, Preds[i]);
+      }
+    }
+  }
+}
+
+void Verifier::visitTerminatorInst(TerminatorInst &I) {
+  // Ensure that terminators only exist at the end of the basic block.
+  Assert1(&I == I.getParent()->getTerminator(),
+          "Terminator found in the middle of a basic block!", I.getParent());
+  visitInstruction(I);
+}
+
+void Verifier::visitBranchInst(BranchInst &BI) {
+  if (BI.isConditional()) {
+    Assert2(BI.getCondition()->getType()->isIntegerTy(1),
+            "Branch condition is not 'i1' type!", &BI, BI.getCondition());
+  }
+  visitTerminatorInst(BI);
+}
+
+void Verifier::visitReturnInst(ReturnInst &RI) {
+  Function *F = RI.getParent()->getParent();
+  unsigned N = RI.getNumOperands();
+  if (F->getReturnType()->isVoidTy()) 
+    Assert2(N == 0,
+            "Found return instr that returns non-void in Function of void "
+            "return type!", &RI, F->getReturnType());
+  else
+    Assert2(N == 1 && F->getReturnType() == RI.getOperand(0)->getType(),
+            "Function return type does not match operand "
+            "type of return inst!", &RI, F->getReturnType());
+
+  // Check to make sure that the return value has necessary properties for
+  // terminators...
+  visitTerminatorInst(RI);
+}
+
+void Verifier::visitSwitchInst(SwitchInst &SI) {
+  // Check to make sure that all of the constants in the switch instruction
+  // have the same type as the switched-on value.
+  Type *SwitchTy = SI.getCondition()->getType();
+  IntegerType *IntTy = cast<IntegerType>(SwitchTy);
+  IntegersSubsetToBB Mapping;
+  std::map<IntegersSubset::Range, unsigned> RangeSetMap;
+  for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
+    IntegersSubset CaseRanges = i.getCaseValueEx();
+    for (unsigned ri = 0, rie = CaseRanges.getNumItems(); ri < rie; ++ri) {
+      IntegersSubset::Range r = CaseRanges.getItem(ri);
+      Assert1(((const APInt&)r.getLow()).getBitWidth() == IntTy->getBitWidth(),
+              "Switch constants must all be same type as switch value!", &SI);
+      Assert1(((const APInt&)r.getHigh()).getBitWidth() == IntTy->getBitWidth(),
+              "Switch constants must all be same type as switch value!", &SI);
+      Mapping.add(r);
+      RangeSetMap[r] = i.getCaseIndex();
+    }
+  }
+  
+  IntegersSubsetToBB::RangeIterator errItem;
+  if (!Mapping.verify(errItem)) {
+    unsigned CaseIndex = RangeSetMap[errItem->first];
+    SwitchInst::CaseIt i(&SI, CaseIndex);
+    Assert2(false, "Duplicate integer as switch case", &SI, i.getCaseValueEx());
+  }
+  
+  visitTerminatorInst(SI);
+}
+
+void Verifier::visitIndirectBrInst(IndirectBrInst &BI) {
+  Assert1(BI.getAddress()->getType()->isPointerTy(),
+          "Indirectbr operand must have pointer type!", &BI);
+  for (unsigned i = 0, e = BI.getNumDestinations(); i != e; ++i)
+    Assert1(BI.getDestination(i)->getType()->isLabelTy(),
+            "Indirectbr destinations must all have pointer type!", &BI);
+
+  visitTerminatorInst(BI);
+}
+
+void Verifier::visitSelectInst(SelectInst &SI) {
+  Assert1(!SelectInst::areInvalidOperands(SI.getOperand(0), SI.getOperand(1),
+                                          SI.getOperand(2)),
+          "Invalid operands for select instruction!", &SI);
+
+  Assert1(SI.getTrueValue()->getType() == SI.getType(),
+          "Select values must have same type as select instruction!", &SI);
+  visitInstruction(SI);
+}
+
+/// visitUserOp1 - User defined operators shouldn't live beyond the lifetime of
+/// a pass, if any exist, it's an error.
+///
+void Verifier::visitUserOp1(Instruction &I) {
+  Assert1(0, "User-defined operators should not live outside of a pass!", &I);
+}
+
+void Verifier::visitTruncInst(TruncInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  // Get the size of the types in bits, we'll need this later
+  unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+  unsigned DestBitSize = DestTy->getScalarSizeInBits();
+
+  Assert1(SrcTy->isIntOrIntVectorTy(), "Trunc only operates on integer", &I);
+  Assert1(DestTy->isIntOrIntVectorTy(), "Trunc only produces integer", &I);
+  Assert1(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+          "trunc source and destination must both be a vector or neither", &I);
+  Assert1(SrcBitSize > DestBitSize,"DestTy too big for Trunc", &I);
+
+  visitInstruction(I);
+}
+
+void Verifier::visitZExtInst(ZExtInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  // Get the size of the types in bits, we'll need this later
+  Assert1(SrcTy->isIntOrIntVectorTy(), "ZExt only operates on integer", &I);
+  Assert1(DestTy->isIntOrIntVectorTy(), "ZExt only produces an integer", &I);
+  Assert1(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+          "zext source and destination must both be a vector or neither", &I);
+  unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+  unsigned DestBitSize = DestTy->getScalarSizeInBits();
+
+  Assert1(SrcBitSize < DestBitSize,"Type too small for ZExt", &I);
+
+  visitInstruction(I);
+}
+
+void Verifier::visitSExtInst(SExtInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  // Get the size of the types in bits, we'll need this later
+  unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+  unsigned DestBitSize = DestTy->getScalarSizeInBits();
+
+  Assert1(SrcTy->isIntOrIntVectorTy(), "SExt only operates on integer", &I);
+  Assert1(DestTy->isIntOrIntVectorTy(), "SExt only produces an integer", &I);
+  Assert1(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+          "sext source and destination must both be a vector or neither", &I);
+  Assert1(SrcBitSize < DestBitSize,"Type too small for SExt", &I);
+
+  visitInstruction(I);
+}
+
+void Verifier::visitFPTruncInst(FPTruncInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+  // Get the size of the types in bits, we'll need this later
+  unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+  unsigned DestBitSize = DestTy->getScalarSizeInBits();
+
+  Assert1(SrcTy->isFPOrFPVectorTy(),"FPTrunc only operates on FP", &I);
+  Assert1(DestTy->isFPOrFPVectorTy(),"FPTrunc only produces an FP", &I);
+  Assert1(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+          "fptrunc source and destination must both be a vector or neither",&I);
+  Assert1(SrcBitSize > DestBitSize,"DestTy too big for FPTrunc", &I);
+
+  visitInstruction(I);
+}
+
+void Verifier::visitFPExtInst(FPExtInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  // Get the size of the types in bits, we'll need this later
+  unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+  unsigned DestBitSize = DestTy->getScalarSizeInBits();
+
+  Assert1(SrcTy->isFPOrFPVectorTy(),"FPExt only operates on FP", &I);
+  Assert1(DestTy->isFPOrFPVectorTy(),"FPExt only produces an FP", &I);
+  Assert1(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+          "fpext source and destination must both be a vector or neither", &I);
+  Assert1(SrcBitSize < DestBitSize,"DestTy too small for FPExt", &I);
+
+  visitInstruction(I);
+}
+
+void Verifier::visitUIToFPInst(UIToFPInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  bool SrcVec = SrcTy->isVectorTy();
+  bool DstVec = DestTy->isVectorTy();
+
+  Assert1(SrcVec == DstVec,
+          "UIToFP source and dest must both be vector or scalar", &I);
+  Assert1(SrcTy->isIntOrIntVectorTy(),
+          "UIToFP source must be integer or integer vector", &I);
+  Assert1(DestTy->isFPOrFPVectorTy(),
+          "UIToFP result must be FP or FP vector", &I);
+
+  if (SrcVec && DstVec)
+    Assert1(cast<VectorType>(SrcTy)->getNumElements() ==
+            cast<VectorType>(DestTy)->getNumElements(),
+            "UIToFP source and dest vector length mismatch", &I);
+
+  visitInstruction(I);
+}
+
+void Verifier::visitSIToFPInst(SIToFPInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  bool SrcVec = SrcTy->isVectorTy();
+  bool DstVec = DestTy->isVectorTy();
+
+  Assert1(SrcVec == DstVec,
+          "SIToFP source and dest must both be vector or scalar", &I);
+  Assert1(SrcTy->isIntOrIntVectorTy(),
+          "SIToFP source must be integer or integer vector", &I);
+  Assert1(DestTy->isFPOrFPVectorTy(),
+          "SIToFP result must be FP or FP vector", &I);
+
+  if (SrcVec && DstVec)
+    Assert1(cast<VectorType>(SrcTy)->getNumElements() ==
+            cast<VectorType>(DestTy)->getNumElements(),
+            "SIToFP source and dest vector length mismatch", &I);
+
+  visitInstruction(I);
+}
+
+void Verifier::visitFPToUIInst(FPToUIInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  bool SrcVec = SrcTy->isVectorTy();
+  bool DstVec = DestTy->isVectorTy();
+
+  Assert1(SrcVec == DstVec,
+          "FPToUI source and dest must both be vector or scalar", &I);
+  Assert1(SrcTy->isFPOrFPVectorTy(), "FPToUI source must be FP or FP vector",
+          &I);
+  Assert1(DestTy->isIntOrIntVectorTy(),
+          "FPToUI result must be integer or integer vector", &I);
+
+  if (SrcVec && DstVec)
+    Assert1(cast<VectorType>(SrcTy)->getNumElements() ==
+            cast<VectorType>(DestTy)->getNumElements(),
+            "FPToUI source and dest vector length mismatch", &I);
+
+  visitInstruction(I);
+}
+
+void Verifier::visitFPToSIInst(FPToSIInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  bool SrcVec = SrcTy->isVectorTy();
+  bool DstVec = DestTy->isVectorTy();
+
+  Assert1(SrcVec == DstVec,
+          "FPToSI source and dest must both be vector or scalar", &I);
+  Assert1(SrcTy->isFPOrFPVectorTy(),
+          "FPToSI source must be FP or FP vector", &I);
+  Assert1(DestTy->isIntOrIntVectorTy(),
+          "FPToSI result must be integer or integer vector", &I);
+
+  if (SrcVec && DstVec)
+    Assert1(cast<VectorType>(SrcTy)->getNumElements() ==
+            cast<VectorType>(DestTy)->getNumElements(),
+            "FPToSI source and dest vector length mismatch", &I);
+
+  visitInstruction(I);
+}
+
+void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  Assert1(SrcTy->getScalarType()->isPointerTy(),
+          "PtrToInt source must be pointer", &I);
+  Assert1(DestTy->getScalarType()->isIntegerTy(),
+          "PtrToInt result must be integral", &I);
+  Assert1(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+          "PtrToInt type mismatch", &I);
+
+  if (SrcTy->isVectorTy()) {
+    VectorType *VSrc = dyn_cast<VectorType>(SrcTy);
+    VectorType *VDest = dyn_cast<VectorType>(DestTy);
+    Assert1(VSrc->getNumElements() == VDest->getNumElements(),
+          "PtrToInt Vector width mismatch", &I);
+  }
+
+  visitInstruction(I);
+}
+
+void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  Assert1(SrcTy->getScalarType()->isIntegerTy(),
+          "IntToPtr source must be an integral", &I);
+  Assert1(DestTy->getScalarType()->isPointerTy(),
+          "IntToPtr result must be a pointer",&I);
+  Assert1(SrcTy->isVectorTy() == DestTy->isVectorTy(),
+          "IntToPtr type mismatch", &I);
+  if (SrcTy->isVectorTy()) {
+    VectorType *VSrc = dyn_cast<VectorType>(SrcTy);
+    VectorType *VDest = dyn_cast<VectorType>(DestTy);
+    Assert1(VSrc->getNumElements() == VDest->getNumElements(),
+          "IntToPtr Vector width mismatch", &I);
+  }
+  visitInstruction(I);
+}
+
+void Verifier::visitBitCastInst(BitCastInst &I) {
+  // Get the source and destination types
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
+
+  // Get the size of the types in bits, we'll need this later
+  unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+  unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+  // BitCast implies a no-op cast of type only. No bits change.
+  // However, you can't cast pointers to anything but pointers.
+  Assert1(SrcTy->isPointerTy() == DestTy->isPointerTy(),
+          "Bitcast requires both operands to be pointer or neither", &I);
+  Assert1(SrcBitSize == DestBitSize, "Bitcast requires types of same width",&I);
+
+  // Disallow aggregates.
+  Assert1(!SrcTy->isAggregateType(),
+          "Bitcast operand must not be aggregate", &I);
+  Assert1(!DestTy->isAggregateType(),
+          "Bitcast type must not be aggregate", &I);
+
+  visitInstruction(I);
+}
+
+/// visitPHINode - Ensure that a PHI node is well formed.
+///
+void Verifier::visitPHINode(PHINode &PN) {
+  // Ensure that the PHI nodes are all grouped together at the top of the block.
+  // This can be tested by checking whether the instruction before this is
+  // either nonexistent (because this is begin()) or is a PHI node.  If not,
+  // then there is some other instruction before a PHI.
+  Assert2(&PN == &PN.getParent()->front() || 
+          isa<PHINode>(--BasicBlock::iterator(&PN)),
+          "PHI nodes not grouped at top of basic block!",
+          &PN, PN.getParent());
+
+  // Check that all of the values of the PHI node have the same type as the
+  // result, and that the incoming blocks are really basic blocks.
+  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+    Assert1(PN.getType() == PN.getIncomingValue(i)->getType(),
+            "PHI node operands are not the same type as the result!", &PN);
+  }
+
+  // All other PHI node constraints are checked in the visitBasicBlock method.
+
+  visitInstruction(PN);
+}
+
+void Verifier::VerifyCallSite(CallSite CS) {
+  Instruction *I = CS.getInstruction();
+
+  Assert1(CS.getCalledValue()->getType()->isPointerTy(),
+          "Called function must be a pointer!", I);
+  PointerType *FPTy = cast<PointerType>(CS.getCalledValue()->getType());
+
+  Assert1(FPTy->getElementType()->isFunctionTy(),
+          "Called function is not pointer to function type!", I);
+  FunctionType *FTy = cast<FunctionType>(FPTy->getElementType());
+
+  // Verify that the correct number of arguments are being passed
+  if (FTy->isVarArg())
+    Assert1(CS.arg_size() >= FTy->getNumParams(),
+            "Called function requires more parameters than were provided!",I);
+  else
+    Assert1(CS.arg_size() == FTy->getNumParams(),
+            "Incorrect number of arguments passed to called function!", I);
+
+  // Verify that all arguments to the call match the function type.
+  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+    Assert3(CS.getArgument(i)->getType() == FTy->getParamType(i),
+            "Call parameter type does not match function signature!",
+            CS.getArgument(i), FTy->getParamType(i), I);
+
+  const AttributeSet &Attrs = CS.getAttributes();
+
+  Assert1(VerifyAttributeCount(Attrs, CS.arg_size()),
+          "Attribute after last parameter!", I);
+
+  // Verify call attributes.
+  VerifyFunctionAttrs(FTy, Attrs, I);
+
+  if (FTy->isVarArg())
+    // Check attributes on the varargs part.
+    for (unsigned Idx = 1 + FTy->getNumParams(); Idx <= CS.arg_size(); ++Idx) {
+      VerifyParameterAttrs(Attrs, Idx, CS.getArgument(Idx-1)->getType(),
+                           false, I);
+
+      Assert1(!Attrs.hasAttribute(Idx, Attribute::StructRet),
+              "Attribute 'sret' cannot be used for vararg call arguments!", I);
+    }
+
+  // Verify that there's no metadata unless it's a direct call to an intrinsic.
+  if (CS.getCalledFunction() == 0 ||
+      !CS.getCalledFunction()->getName().startswith("llvm.")) {
+    for (FunctionType::param_iterator PI = FTy->param_begin(),
+           PE = FTy->param_end(); PI != PE; ++PI)
+      Assert1(!(*PI)->isMetadataTy(),
+              "Function has metadata parameter but isn't an intrinsic", I);
+  }
+
+  visitInstruction(*I);
+}
+
+void Verifier::visitCallInst(CallInst &CI) {
+  VerifyCallSite(&CI);
+
+  if (Function *F = CI.getCalledFunction())
+    if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID())
+      visitIntrinsicFunctionCall(ID, CI);
+}
+
+void Verifier::visitInvokeInst(InvokeInst &II) {
+  VerifyCallSite(&II);
+
+  // Verify that there is a landingpad instruction as the first non-PHI
+  // instruction of the 'unwind' destination.
+  Assert1(II.getUnwindDest()->isLandingPad(),
+          "The unwind destination does not have a landingpad instruction!",&II);
+
+  visitTerminatorInst(II);
+}
+
+/// visitBinaryOperator - Check that both arguments to the binary operator are
+/// of the same type!
+///
+void Verifier::visitBinaryOperator(BinaryOperator &B) {
+  Assert1(B.getOperand(0)->getType() == B.getOperand(1)->getType(),
+          "Both operands to a binary operator are not of the same type!", &B);
+
+  switch (B.getOpcode()) {
+  // Check that integer arithmetic operators are only used with
+  // integral operands.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+  case Instruction::SRem:
+  case Instruction::URem:
+    Assert1(B.getType()->isIntOrIntVectorTy(),
+            "Integer arithmetic operators only work with integral types!", &B);
+    Assert1(B.getType() == B.getOperand(0)->getType(),
+            "Integer arithmetic operators must have same type "
+            "for operands and result!", &B);
+    break;
+  // Check that floating-point arithmetic operators are only used with
+  // floating-point operands.
+  case Instruction::FAdd:
+  case Instruction::FSub:
+  case Instruction::FMul:
+  case Instruction::FDiv:
+  case Instruction::FRem:
+    Assert1(B.getType()->isFPOrFPVectorTy(),
+            "Floating-point arithmetic operators only work with "
+            "floating-point types!", &B);
+    Assert1(B.getType() == B.getOperand(0)->getType(),
+            "Floating-point arithmetic operators must have same type "
+            "for operands and result!", &B);
+    break;
+  // Check that logical operators are only used with integral operands.
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    Assert1(B.getType()->isIntOrIntVectorTy(),
+            "Logical operators only work with integral types!", &B);
+    Assert1(B.getType() == B.getOperand(0)->getType(),
+            "Logical operators must have same type for operands and result!",
+            &B);
+    break;
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    Assert1(B.getType()->isIntOrIntVectorTy(),
+            "Shifts only work with integral types!", &B);
+    Assert1(B.getType() == B.getOperand(0)->getType(),
+            "Shift return type must be same as operands!", &B);
+    break;
+  default:
+    llvm_unreachable("Unknown BinaryOperator opcode!");
+  }
+
+  visitInstruction(B);
+}
+
+void Verifier::visitICmpInst(ICmpInst &IC) {
+  // Check that the operands are the same type
+  Type *Op0Ty = IC.getOperand(0)->getType();
+  Type *Op1Ty = IC.getOperand(1)->getType();
+  Assert1(Op0Ty == Op1Ty,
+          "Both operands to ICmp instruction are not of the same type!", &IC);
+  // Check that the operands are the right type
+  Assert1(Op0Ty->isIntOrIntVectorTy() || Op0Ty->getScalarType()->isPointerTy(),
+          "Invalid operand types for ICmp instruction", &IC);
+  // Check that the predicate is valid.
+  Assert1(IC.getPredicate() >= CmpInst::FIRST_ICMP_PREDICATE &&
+          IC.getPredicate() <= CmpInst::LAST_ICMP_PREDICATE,
+          "Invalid predicate in ICmp instruction!", &IC);
+
+  visitInstruction(IC);
+}
+
+void Verifier::visitFCmpInst(FCmpInst &FC) {
+  // Check that the operands are the same type
+  Type *Op0Ty = FC.getOperand(0)->getType();
+  Type *Op1Ty = FC.getOperand(1)->getType();
+  Assert1(Op0Ty == Op1Ty,
+          "Both operands to FCmp instruction are not of the same type!", &FC);
+  // Check that the operands are the right type
+  Assert1(Op0Ty->isFPOrFPVectorTy(),
+          "Invalid operand types for FCmp instruction", &FC);
+  // Check that the predicate is valid.
+  Assert1(FC.getPredicate() >= CmpInst::FIRST_FCMP_PREDICATE &&
+          FC.getPredicate() <= CmpInst::LAST_FCMP_PREDICATE,
+          "Invalid predicate in FCmp instruction!", &FC);
+
+  visitInstruction(FC);
+}
+
+void Verifier::visitExtractElementInst(ExtractElementInst &EI) {
+  Assert1(ExtractElementInst::isValidOperands(EI.getOperand(0),
+                                              EI.getOperand(1)),
+          "Invalid extractelement operands!", &EI);
+  visitInstruction(EI);
+}
+
+void Verifier::visitInsertElementInst(InsertElementInst &IE) {
+  Assert1(InsertElementInst::isValidOperands(IE.getOperand(0),
+                                             IE.getOperand(1),
+                                             IE.getOperand(2)),
+          "Invalid insertelement operands!", &IE);
+  visitInstruction(IE);
+}
+
+void Verifier::visitShuffleVectorInst(ShuffleVectorInst &SV) {
+  Assert1(ShuffleVectorInst::isValidOperands(SV.getOperand(0), SV.getOperand(1),
+                                             SV.getOperand(2)),
+          "Invalid shufflevector operands!", &SV);
+  visitInstruction(SV);
+}
+
+void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+  Type *TargetTy = GEP.getPointerOperandType()->getScalarType();
+
+  Assert1(isa<PointerType>(TargetTy),
+    "GEP base pointer is not a vector or a vector of pointers", &GEP);
+  Assert1(cast<PointerType>(TargetTy)->getElementType()->isSized(),
+          "GEP into unsized type!", &GEP);
+  Assert1(GEP.getPointerOperandType()->isVectorTy() ==
+          GEP.getType()->isVectorTy(), "Vector GEP must return a vector value",
+          &GEP);
+
+  SmallVector<Value*, 16> Idxs(GEP.idx_begin(), GEP.idx_end());
+  Type *ElTy =
+    GetElementPtrInst::getIndexedType(GEP.getPointerOperandType(), Idxs);
+  Assert1(ElTy, "Invalid indices for GEP pointer type!", &GEP);
+
+  Assert2(GEP.getType()->getScalarType()->isPointerTy() &&
+          cast<PointerType>(GEP.getType()->getScalarType())->getElementType()
+          == ElTy, "GEP is not of right type for indices!", &GEP, ElTy);
+
+  if (GEP.getPointerOperandType()->isVectorTy()) {
+    // Additional checks for vector GEPs.
+    unsigned GepWidth = GEP.getPointerOperandType()->getVectorNumElements();
+    Assert1(GepWidth == GEP.getType()->getVectorNumElements(),
+            "Vector GEP result width doesn't match operand's", &GEP);
+    for (unsigned i = 0, e = Idxs.size(); i != e; ++i) {
+      Type *IndexTy = Idxs[i]->getType();
+      Assert1(IndexTy->isVectorTy(),
+              "Vector GEP must have vector indices!", &GEP);
+      unsigned IndexWidth = IndexTy->getVectorNumElements();
+      Assert1(IndexWidth == GepWidth, "Invalid GEP index vector width", &GEP);
+    }
+  }
+  visitInstruction(GEP);
+}
+
+static bool isContiguous(const ConstantRange &A, const ConstantRange &B) {
+  return A.getUpper() == B.getLower() || A.getLower() == B.getUpper();
+}
+
+void Verifier::visitLoadInst(LoadInst &LI) {
+  PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType());
+  Assert1(PTy, "Load operand must be a pointer.", &LI);
+  Type *ElTy = PTy->getElementType();
+  Assert2(ElTy == LI.getType(),
+          "Load result type does not match pointer operand type!", &LI, ElTy);
+  if (LI.isAtomic()) {
+    Assert1(LI.getOrdering() != Release && LI.getOrdering() != AcquireRelease,
+            "Load cannot have Release ordering", &LI);
+    Assert1(LI.getAlignment() != 0,
+            "Atomic load must specify explicit alignment", &LI);
+    if (!ElTy->isPointerTy()) {
+      Assert2(ElTy->isIntegerTy(),
+              "atomic store operand must have integer type!",
+              &LI, ElTy);
+      unsigned Size = ElTy->getPrimitiveSizeInBits();
+      Assert2(Size >= 8 && !(Size & (Size - 1)),
+              "atomic store operand must be power-of-two byte-sized integer",
+              &LI, ElTy);
+    }
+  } else {
+    Assert1(LI.getSynchScope() == CrossThread,
+            "Non-atomic load cannot have SynchronizationScope specified", &LI);
+  }
+
+  if (MDNode *Range = LI.getMetadata(LLVMContext::MD_range)) {
+    unsigned NumOperands = Range->getNumOperands();
+    Assert1(NumOperands % 2 == 0, "Unfinished range!", Range);
+    unsigned NumRanges = NumOperands / 2;
+    Assert1(NumRanges >= 1, "It should have at least one range!", Range);
+
+    ConstantRange LastRange(1); // Dummy initial value
+    for (unsigned i = 0; i < NumRanges; ++i) {
+      ConstantInt *Low = dyn_cast<ConstantInt>(Range->getOperand(2*i));
+      Assert1(Low, "The lower limit must be an integer!", Low);
+      ConstantInt *High = dyn_cast<ConstantInt>(Range->getOperand(2*i + 1));
+      Assert1(High, "The upper limit must be an integer!", High);
+      Assert1(High->getType() == Low->getType() &&
+              High->getType() == ElTy, "Range types must match load type!",
+              &LI);
+
+      APInt HighV = High->getValue();
+      APInt LowV = Low->getValue();
+      ConstantRange CurRange(LowV, HighV);
+      Assert1(!CurRange.isEmptySet() && !CurRange.isFullSet(),
+              "Range must not be empty!", Range);
+      if (i != 0) {
+        Assert1(CurRange.intersectWith(LastRange).isEmptySet(),
+                "Intervals are overlapping", Range);
+        Assert1(LowV.sgt(LastRange.getLower()), "Intervals are not in order",
+                Range);
+        Assert1(!isContiguous(CurRange, LastRange), "Intervals are contiguous",
+                Range);
+      }
+      LastRange = ConstantRange(LowV, HighV);
+    }
+    if (NumRanges > 2) {
+      APInt FirstLow =
+        dyn_cast<ConstantInt>(Range->getOperand(0))->getValue();
+      APInt FirstHigh =
+        dyn_cast<ConstantInt>(Range->getOperand(1))->getValue();
+      ConstantRange FirstRange(FirstLow, FirstHigh);
+      Assert1(FirstRange.intersectWith(LastRange).isEmptySet(),
+              "Intervals are overlapping", Range);
+      Assert1(!isContiguous(FirstRange, LastRange), "Intervals are contiguous",
+              Range);
+    }
+
+
+  }
+
+  visitInstruction(LI);
+}
+
+void Verifier::visitStoreInst(StoreInst &SI) {
+  PointerType *PTy = dyn_cast<PointerType>(SI.getOperand(1)->getType());
+  Assert1(PTy, "Store operand must be a pointer.", &SI);
+  Type *ElTy = PTy->getElementType();
+  Assert2(ElTy == SI.getOperand(0)->getType(),
+          "Stored value type does not match pointer operand type!",
+          &SI, ElTy);
+  if (SI.isAtomic()) {
+    Assert1(SI.getOrdering() != Acquire && SI.getOrdering() != AcquireRelease,
+            "Store cannot have Acquire ordering", &SI);
+    Assert1(SI.getAlignment() != 0,
+            "Atomic store must specify explicit alignment", &SI);
+    if (!ElTy->isPointerTy()) {
+      Assert2(ElTy->isIntegerTy(),
+              "atomic store operand must have integer type!",
+              &SI, ElTy);
+      unsigned Size = ElTy->getPrimitiveSizeInBits();
+      Assert2(Size >= 8 && !(Size & (Size - 1)),
+              "atomic store operand must be power-of-two byte-sized integer",
+              &SI, ElTy);
+    }
+  } else {
+    Assert1(SI.getSynchScope() == CrossThread,
+            "Non-atomic store cannot have SynchronizationScope specified", &SI);
+  }
+  visitInstruction(SI);
+}
+
+void Verifier::visitAllocaInst(AllocaInst &AI) {
+  PointerType *PTy = AI.getType();
+  Assert1(PTy->getAddressSpace() == 0, 
+          "Allocation instruction pointer not in the generic address space!",
+          &AI);
+  Assert1(PTy->getElementType()->isSized(), "Cannot allocate unsized type",
+          &AI);
+  Assert1(AI.getArraySize()->getType()->isIntegerTy(),
+          "Alloca array size must have integer type", &AI);
+  visitInstruction(AI);
+}
+
+void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) {
+  Assert1(CXI.getOrdering() != NotAtomic,
+          "cmpxchg instructions must be atomic.", &CXI);
+  Assert1(CXI.getOrdering() != Unordered,
+          "cmpxchg instructions cannot be unordered.", &CXI);
+  PointerType *PTy = dyn_cast<PointerType>(CXI.getOperand(0)->getType());
+  Assert1(PTy, "First cmpxchg operand must be a pointer.", &CXI);
+  Type *ElTy = PTy->getElementType();
+  Assert2(ElTy->isIntegerTy(),
+          "cmpxchg operand must have integer type!",
+          &CXI, ElTy);
+  unsigned Size = ElTy->getPrimitiveSizeInBits();
+  Assert2(Size >= 8 && !(Size & (Size - 1)),
+          "cmpxchg operand must be power-of-two byte-sized integer",
+          &CXI, ElTy);
+  Assert2(ElTy == CXI.getOperand(1)->getType(),
+          "Expected value type does not match pointer operand type!",
+          &CXI, ElTy);
+  Assert2(ElTy == CXI.getOperand(2)->getType(),
+          "Stored value type does not match pointer operand type!",
+          &CXI, ElTy);
+  visitInstruction(CXI);
+}
+
+void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
+  Assert1(RMWI.getOrdering() != NotAtomic,
+          "atomicrmw instructions must be atomic.", &RMWI);
+  Assert1(RMWI.getOrdering() != Unordered,
+          "atomicrmw instructions cannot be unordered.", &RMWI);
+  PointerType *PTy = dyn_cast<PointerType>(RMWI.getOperand(0)->getType());
+  Assert1(PTy, "First atomicrmw operand must be a pointer.", &RMWI);
+  Type *ElTy = PTy->getElementType();
+  Assert2(ElTy->isIntegerTy(),
+          "atomicrmw operand must have integer type!",
+          &RMWI, ElTy);
+  unsigned Size = ElTy->getPrimitiveSizeInBits();
+  Assert2(Size >= 8 && !(Size & (Size - 1)),
+          "atomicrmw operand must be power-of-two byte-sized integer",
+          &RMWI, ElTy);
+  Assert2(ElTy == RMWI.getOperand(1)->getType(),
+          "Argument value type does not match pointer operand type!",
+          &RMWI, ElTy);
+  Assert1(AtomicRMWInst::FIRST_BINOP <= RMWI.getOperation() &&
+          RMWI.getOperation() <= AtomicRMWInst::LAST_BINOP,
+          "Invalid binary operation!", &RMWI);
+  visitInstruction(RMWI);
+}
+
+void Verifier::visitFenceInst(FenceInst &FI) {
+  const AtomicOrdering Ordering = FI.getOrdering();
+  Assert1(Ordering == Acquire || Ordering == Release ||
+          Ordering == AcquireRelease || Ordering == SequentiallyConsistent,
+          "fence instructions may only have "
+          "acquire, release, acq_rel, or seq_cst ordering.", &FI);
+  visitInstruction(FI);
+}
+
+void Verifier::visitExtractValueInst(ExtractValueInst &EVI) {
+  Assert1(ExtractValueInst::getIndexedType(EVI.getAggregateOperand()->getType(),
+                                           EVI.getIndices()) ==
+          EVI.getType(),
+          "Invalid ExtractValueInst operands!", &EVI);
+  
+  visitInstruction(EVI);
+}
+
+void Verifier::visitInsertValueInst(InsertValueInst &IVI) {
+  Assert1(ExtractValueInst::getIndexedType(IVI.getAggregateOperand()->getType(),
+                                           IVI.getIndices()) ==
+          IVI.getOperand(1)->getType(),
+          "Invalid InsertValueInst operands!", &IVI);
+  
+  visitInstruction(IVI);
+}
+
+void Verifier::visitLandingPadInst(LandingPadInst &LPI) {
+  BasicBlock *BB = LPI.getParent();
+
+  // The landingpad instruction is ill-formed if it doesn't have any clauses and
+  // isn't a cleanup.
+  Assert1(LPI.getNumClauses() > 0 || LPI.isCleanup(),
+          "LandingPadInst needs at least one clause or to be a cleanup.", &LPI);
+
+  // The landingpad instruction defines its parent as a landing pad block. The
+  // landing pad block may be branched to only by the unwind edge of an invoke.
+  for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+    const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator());
+    Assert1(II && II->getUnwindDest() == BB && II->getNormalDest() != BB,
+            "Block containing LandingPadInst must be jumped to "
+            "only by the unwind edge of an invoke.", &LPI);
+  }
+
+  // The landingpad instruction must be the first non-PHI instruction in the
+  // block.
+  Assert1(LPI.getParent()->getLandingPadInst() == &LPI,
+          "LandingPadInst not the first non-PHI instruction in the block.",
+          &LPI);
+
+  // The personality functions for all landingpad instructions within the same
+  // function should match.
+  if (PersonalityFn)
+    Assert1(LPI.getPersonalityFn() == PersonalityFn,
+            "Personality function doesn't match others in function", &LPI);
+  PersonalityFn = LPI.getPersonalityFn();
+
+  // All operands must be constants.
+  Assert1(isa<Constant>(PersonalityFn), "Personality function is not constant!",
+          &LPI);
+  for (unsigned i = 0, e = LPI.getNumClauses(); i < e; ++i) {
+    Value *Clause = LPI.getClause(i);
+    Assert1(isa<Constant>(Clause), "Clause is not constant!", &LPI);
+    if (LPI.isCatch(i)) {
+      Assert1(isa<PointerType>(Clause->getType()),
+              "Catch operand does not have pointer type!", &LPI);
+    } else {
+      Assert1(LPI.isFilter(i), "Clause is neither catch nor filter!", &LPI);
+      Assert1(isa<ConstantArray>(Clause) || isa<ConstantAggregateZero>(Clause),
+              "Filter operand is not an array of constants!", &LPI);
+    }
+  }
+
+  visitInstruction(LPI);
+}
+
+void Verifier::verifyDominatesUse(Instruction &I, unsigned i) {
+  Instruction *Op = cast<Instruction>(I.getOperand(i));
+  // If the we have an invalid invoke, don't try to compute the dominance.
+  // We already reject it in the invoke specific checks and the dominance
+  // computation doesn't handle multiple edges.
+  if (InvokeInst *II = dyn_cast<InvokeInst>(Op)) {
+    if (II->getNormalDest() == II->getUnwindDest())
+      return;
+  }
+
+  const Use &U = I.getOperandUse(i);
+  Assert2(InstsInThisBlock.count(Op) || DT->dominates(Op, U),
+          "Instruction does not dominate all uses!", Op, &I);
+}
+
+/// verifyInstruction - Verify that an instruction is well formed.
+///
+void Verifier::visitInstruction(Instruction &I) {
+  BasicBlock *BB = I.getParent();
+  Assert1(BB, "Instruction not embedded in basic block!", &I);
+
+  if (!isa<PHINode>(I)) {   // Check that non-phi nodes are not self referential
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+         UI != UE; ++UI)
+      Assert1(*UI != (User*)&I || !DT->isReachableFromEntry(BB),
+              "Only PHI nodes may reference their own value!", &I);
+  }
+
+  // Check that void typed values don't have names
+  Assert1(!I.getType()->isVoidTy() || !I.hasName(),
+          "Instruction has a name, but provides a void value!", &I);
+
+  // Check that the return value of the instruction is either void or a legal
+  // value type.
+  Assert1(I.getType()->isVoidTy() || 
+          I.getType()->isFirstClassType(),
+          "Instruction returns a non-scalar type!", &I);
+
+  // Check that the instruction doesn't produce metadata. Calls are already
+  // checked against the callee type.
+  Assert1(!I.getType()->isMetadataTy() ||
+          isa<CallInst>(I) || isa<InvokeInst>(I),
+          "Invalid use of metadata!", &I);
+
+  // Check that all uses of the instruction, if they are instructions
+  // themselves, actually have parent basic blocks.  If the use is not an
+  // instruction, it is an error!
+  for (User::use_iterator UI = I.use_begin(), UE = I.use_end();
+       UI != UE; ++UI) {
+    if (Instruction *Used = dyn_cast<Instruction>(*UI))
+      Assert2(Used->getParent() != 0, "Instruction referencing instruction not"
+              " embedded in a basic block!", &I, Used);
+    else {
+      CheckFailed("Use of instruction is not an instruction!", *UI);
+      return;
+    }
+  }
+
+  for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+    Assert1(I.getOperand(i) != 0, "Instruction has null operand!", &I);
+
+    // Check to make sure that only first-class-values are operands to
+    // instructions.
+    if (!I.getOperand(i)->getType()->isFirstClassType()) {
+      Assert1(0, "Instruction operands must be first-class values!", &I);
+    }
+
+    if (Function *F = dyn_cast<Function>(I.getOperand(i))) {
+      // Check to make sure that the "address of" an intrinsic function is never
+      // taken.
+      Assert1(!F->isIntrinsic() || i == (isa<CallInst>(I) ? e-1 : 0),
+              "Cannot take the address of an intrinsic!", &I);
+      Assert1(!F->isIntrinsic() || isa<CallInst>(I) ||
+              F->getIntrinsicID() == Intrinsic::donothing,
+              "Cannot invoke an intrinsinc other than donothing", &I);
+      Assert1(F->getParent() == Mod, "Referencing function in another module!",
+              &I);
+    } else if (BasicBlock *OpBB = dyn_cast<BasicBlock>(I.getOperand(i))) {
+      Assert1(OpBB->getParent() == BB->getParent(),
+              "Referring to a basic block in another function!", &I);
+    } else if (Argument *OpArg = dyn_cast<Argument>(I.getOperand(i))) {
+      Assert1(OpArg->getParent() == BB->getParent(),
+              "Referring to an argument in another function!", &I);
+    } else if (GlobalValue *GV = dyn_cast<GlobalValue>(I.getOperand(i))) {
+      Assert1(GV->getParent() == Mod, "Referencing global in another module!",
+              &I);
+    } else if (isa<Instruction>(I.getOperand(i))) {
+      verifyDominatesUse(I, i);
+    } else if (isa<InlineAsm>(I.getOperand(i))) {
+      Assert1((i + 1 == e && isa<CallInst>(I)) ||
+              (i + 3 == e && isa<InvokeInst>(I)),
+              "Cannot take the address of an inline asm!", &I);
+    }
+  }
+
+  if (MDNode *MD = I.getMetadata(LLVMContext::MD_fpmath)) {
+    Assert1(I.getType()->isFPOrFPVectorTy(),
+            "fpmath requires a floating point result!", &I);
+    Assert1(MD->getNumOperands() == 1, "fpmath takes one operand!", &I);
+    Value *Op0 = MD->getOperand(0);
+    if (ConstantFP *CFP0 = dyn_cast_or_null<ConstantFP>(Op0)) {
+      APFloat Accuracy = CFP0->getValueAPF();
+      Assert1(Accuracy.isNormal() && !Accuracy.isNegative(),
+              "fpmath accuracy not a positive number!", &I);
+    } else {
+      Assert1(false, "invalid fpmath accuracy!", &I);
+    }
+  }
+
+  MDNode *MD = I.getMetadata(LLVMContext::MD_range);
+  Assert1(!MD || isa<LoadInst>(I), "Ranges are only for loads!", &I);
+
+  InstsInThisBlock.insert(&I);
+}
+
+/// VerifyIntrinsicType - Verify that the specified type (which comes from an
+/// intrinsic argument or return value) matches the type constraints specified
+/// by the .td file (e.g. an "any integer" argument really is an integer).
+///
+/// This return true on error but does not print a message.
+bool Verifier::VerifyIntrinsicType(Type *Ty,
+                                   ArrayRef<Intrinsic::IITDescriptor> &Infos,
+                                   SmallVectorImpl<Type*> &ArgTys) {
+  using namespace Intrinsic;
+
+  // If we ran out of descriptors, there are too many arguments.
+  if (Infos.empty()) return true; 
+  IITDescriptor D = Infos.front();
+  Infos = Infos.slice(1);
+  
+  switch (D.Kind) {
+  case IITDescriptor::Void: return !Ty->isVoidTy();
+  case IITDescriptor::MMX:  return !Ty->isX86_MMXTy();
+  case IITDescriptor::Metadata: return !Ty->isMetadataTy();
+  case IITDescriptor::Half: return !Ty->isHalfTy();
+  case IITDescriptor::Float: return !Ty->isFloatTy();
+  case IITDescriptor::Double: return !Ty->isDoubleTy();
+  case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width);
+  case IITDescriptor::Vector: {
+    VectorType *VT = dyn_cast<VectorType>(Ty);
+    return VT == 0 || VT->getNumElements() != D.Vector_Width ||
+           VerifyIntrinsicType(VT->getElementType(), Infos, ArgTys);
+  }
+  case IITDescriptor::Pointer: {
+    PointerType *PT = dyn_cast<PointerType>(Ty);
+    return PT == 0 || PT->getAddressSpace() != D.Pointer_AddressSpace ||
+           VerifyIntrinsicType(PT->getElementType(), Infos, ArgTys);
+  }
+      
+  case IITDescriptor::Struct: {
+    StructType *ST = dyn_cast<StructType>(Ty);
+    if (ST == 0 || ST->getNumElements() != D.Struct_NumElements)
+      return true;
+    
+    for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i)
+      if (VerifyIntrinsicType(ST->getElementType(i), Infos, ArgTys))
+        return true;
+    return false;
+  }
+      
+  case IITDescriptor::Argument:
+    // Two cases here - If this is the second occurrence of an argument, verify
+    // that the later instance matches the previous instance. 
+    if (D.getArgumentNumber() < ArgTys.size())
+      return Ty != ArgTys[D.getArgumentNumber()];  
+      
+    // Otherwise, if this is the first instance of an argument, record it and
+    // verify the "Any" kind.
+    assert(D.getArgumentNumber() == ArgTys.size() && "Table consistency error");
+    ArgTys.push_back(Ty);
+      
+    switch (D.getArgumentKind()) {
+    case IITDescriptor::AK_AnyInteger: return !Ty->isIntOrIntVectorTy();
+    case IITDescriptor::AK_AnyFloat:   return !Ty->isFPOrFPVectorTy();
+    case IITDescriptor::AK_AnyVector:  return !isa<VectorType>(Ty);
+    case IITDescriptor::AK_AnyPointer: return !isa<PointerType>(Ty);
+    }
+    llvm_unreachable("all argument kinds not covered");
+      
+  case IITDescriptor::ExtendVecArgument:
+    // This may only be used when referring to a previous vector argument.
+    return D.getArgumentNumber() >= ArgTys.size() ||
+           !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+           VectorType::getExtendedElementVectorType(
+                       cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+
+  case IITDescriptor::TruncVecArgument:
+    // This may only be used when referring to a previous vector argument.
+    return D.getArgumentNumber() >= ArgTys.size() ||
+           !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+           VectorType::getTruncatedElementVectorType(
+                         cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+  }
+  llvm_unreachable("unhandled");
+}
+
+/// visitIntrinsicFunction - Allow intrinsics to be verified in different ways.
+///
+void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
+  Function *IF = CI.getCalledFunction();
+  Assert1(IF->isDeclaration(), "Intrinsic functions should never be defined!",
+          IF);
+
+  // Verify that the intrinsic prototype lines up with what the .td files
+  // describe.
+  FunctionType *IFTy = IF->getFunctionType();
+  Assert1(!IFTy->isVarArg(), "Intrinsic prototypes are not varargs", IF);
+  
+  SmallVector<Intrinsic::IITDescriptor, 8> Table;
+  getIntrinsicInfoTableEntries(ID, Table);
+  ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+
+  SmallVector<Type *, 4> ArgTys;
+  Assert1(!VerifyIntrinsicType(IFTy->getReturnType(), TableRef, ArgTys),
+          "Intrinsic has incorrect return type!", IF);
+  for (unsigned i = 0, e = IFTy->getNumParams(); i != e; ++i)
+    Assert1(!VerifyIntrinsicType(IFTy->getParamType(i), TableRef, ArgTys),
+            "Intrinsic has incorrect argument type!", IF);
+  Assert1(TableRef.empty(), "Intrinsic has too few arguments!", IF);
+
+  // Now that we have the intrinsic ID and the actual argument types (and we
+  // know they are legal for the intrinsic!) get the intrinsic name through the
+  // usual means.  This allows us to verify the mangling of argument types into
+  // the name.
+  Assert1(Intrinsic::getName(ID, ArgTys) == IF->getName(),
+          "Intrinsic name not mangled correctly for type arguments!", IF);
+  
+  // If the intrinsic takes MDNode arguments, verify that they are either global
+  // or are local to *this* function.
+  for (unsigned i = 0, e = CI.getNumArgOperands(); i != e; ++i)
+    if (MDNode *MD = dyn_cast<MDNode>(CI.getArgOperand(i)))
+      visitMDNode(*MD, CI.getParent()->getParent());
+
+  switch (ID) {
+  default:
+    break;
+  case Intrinsic::ctlz:  // llvm.ctlz
+  case Intrinsic::cttz:  // llvm.cttz
+    Assert1(isa<ConstantInt>(CI.getArgOperand(1)),
+            "is_zero_undef argument of bit counting intrinsics must be a "
+            "constant int", &CI);
+    break;
+  case Intrinsic::dbg_declare: {  // llvm.dbg.declare
+    Assert1(CI.getArgOperand(0) && isa<MDNode>(CI.getArgOperand(0)),
+                "invalid llvm.dbg.declare intrinsic call 1", &CI);
+    MDNode *MD = cast<MDNode>(CI.getArgOperand(0));
+    Assert1(MD->getNumOperands() == 1,
+                "invalid llvm.dbg.declare intrinsic call 2", &CI);
+  } break;
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove:
+  case Intrinsic::memset:
+    Assert1(isa<ConstantInt>(CI.getArgOperand(3)),
+            "alignment argument of memory intrinsics must be a constant int",
+            &CI);
+    Assert1(isa<ConstantInt>(CI.getArgOperand(4)),
+            "isvolatile argument of memory intrinsics must be a constant int",
+            &CI);
+    break;
+  case Intrinsic::gcroot:
+  case Intrinsic::gcwrite:
+  case Intrinsic::gcread:
+    if (ID == Intrinsic::gcroot) {
+      AllocaInst *AI =
+        dyn_cast<AllocaInst>(CI.getArgOperand(0)->stripPointerCasts());
+      Assert1(AI, "llvm.gcroot parameter #1 must be an alloca.", &CI);
+      Assert1(isa<Constant>(CI.getArgOperand(1)),
+              "llvm.gcroot parameter #2 must be a constant.", &CI);
+      if (!AI->getType()->getElementType()->isPointerTy()) {
+        Assert1(!isa<ConstantPointerNull>(CI.getArgOperand(1)),
+                "llvm.gcroot parameter #1 must either be a pointer alloca, "
+                "or argument #2 must be a non-null constant.", &CI);
+      }
+    }
+
+    Assert1(CI.getParent()->getParent()->hasGC(),
+            "Enclosing function does not use GC.", &CI);
+    break;
+  case Intrinsic::init_trampoline:
+    Assert1(isa<Function>(CI.getArgOperand(1)->stripPointerCasts()),
+            "llvm.init_trampoline parameter #2 must resolve to a function.",
+            &CI);
+    break;
+  case Intrinsic::prefetch:
+    Assert1(isa<ConstantInt>(CI.getArgOperand(1)) &&
+            isa<ConstantInt>(CI.getArgOperand(2)) &&
+            cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue() < 2 &&
+            cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue() < 4,
+            "invalid arguments to llvm.prefetch",
+            &CI);
+    break;
+  case Intrinsic::stackprotector:
+    Assert1(isa<AllocaInst>(CI.getArgOperand(1)->stripPointerCasts()),
+            "llvm.stackprotector parameter #2 must resolve to an alloca.",
+            &CI);
+    break;
+  case Intrinsic::lifetime_start:
+  case Intrinsic::lifetime_end:
+  case Intrinsic::invariant_start:
+    Assert1(isa<ConstantInt>(CI.getArgOperand(0)),
+            "size argument of memory use markers must be a constant integer",
+            &CI);
+    break;
+  case Intrinsic::invariant_end:
+    Assert1(isa<ConstantInt>(CI.getArgOperand(1)),
+            "llvm.invariant.end parameter #2 must be a constant integer", &CI);
+    break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Implement the public interfaces to this file...
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createVerifierPass(VerifierFailureAction action) {
+  return new Verifier(action);
+}
+
+
+/// verifyFunction - Check a function for errors, printing messages on stderr.
+/// Return true if the function is corrupt.
+///
+bool llvm::verifyFunction(const Function &f, VerifierFailureAction action) {
+  Function &F = const_cast<Function&>(f);
+  assert(!F.isDeclaration() && "Cannot verify external functions");
+
+  FunctionPassManager FPM(F.getParent());
+  Verifier *V = new Verifier(action);
+  FPM.add(V);
+  FPM.run(F);
+  return V->Broken;
+}
+
+/// verifyModule - Check a module for errors, printing messages on stderr.
+/// Return true if the module is corrupt.
+///
+bool llvm::verifyModule(const Module &M, VerifierFailureAction action,
+                        std::string *ErrorInfo) {
+  PassManager PM;
+  Verifier *V = new Verifier(action);
+  PM.add(V);
+  PM.run(const_cast<Module&>(M));
+
+  if (ErrorInfo && V->Broken)
+    *ErrorInfo = V->MessagesStr.str();
+  return V->Broken;
+}
diff --git a/contrib/llvm/lib/IRReader/IRReader.cpp b/contrib/llvm/lib/IRReader/IRReader.cpp
new file mode 100644
index 000000000000..eeec14e834c1
--- /dev/null
+++ b/contrib/llvm/lib/IRReader/IRReader.cpp
@@ -0,0 +1,89 @@
+//===---- IRReader.cpp - Reader for LLVM IR files -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Assembly/Parser.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Support/Timer.h"
+
+using namespace llvm;
+
+namespace llvm {
+  extern bool TimePassesIsEnabled;
+}
+
+static const char *TimeIRParsingGroupName = "LLVM IR Parsing";
+static const char *TimeIRParsingName = "Parse IR";
+
+
+Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err,
+                              LLVMContext &Context) {
+  if (isBitcode((const unsigned char *)Buffer->getBufferStart(),
+                (const unsigned char *)Buffer->getBufferEnd())) {
+    std::string ErrMsg;
+    Module *M = getLazyBitcodeModule(Buffer, Context, &ErrMsg);
+    if (M == 0) {
+      Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error,
+                         ErrMsg);
+      // ParseBitcodeFile does not take ownership of the Buffer in the
+      // case of an error.
+      delete Buffer;
+    }
+    return M;
+  }
+
+  return ParseAssembly(Buffer, 0, Err, Context);
+}
+
+Module *llvm::getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err,
+                                  LLVMContext &Context) {
+  OwningPtr<MemoryBuffer> File;
+  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename.c_str(), File)) {
+    Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
+                       "Could not open input file: " + ec.message());
+    return 0;
+  }
+
+  return getLazyIRModule(File.take(), Err, Context);
+}
+
+Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err,
+                      LLVMContext &Context) {
+  NamedRegionTimer T(TimeIRParsingName, TimeIRParsingGroupName,
+                     TimePassesIsEnabled);
+  if (isBitcode((const unsigned char *)Buffer->getBufferStart(),
+                (const unsigned char *)Buffer->getBufferEnd())) {
+    std::string ErrMsg;
+    Module *M = ParseBitcodeFile(Buffer, Context, &ErrMsg);
+    if (M == 0)
+      Err = SMDiagnostic(Buffer->getBufferIdentifier(), SourceMgr::DK_Error,
+                         ErrMsg);
+    // ParseBitcodeFile does not take ownership of the Buffer.
+    delete Buffer;
+    return M;
+  }
+
+  return ParseAssembly(Buffer, 0, Err, Context);
+}
+
+Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err,
+                          LLVMContext &Context) {
+  OwningPtr<MemoryBuffer> File;
+  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename.c_str(), File)) {
+    Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
+                       "Could not open input file: " + ec.message());
+    return 0;
+  }
+
+  return ParseIR(File.take(), Err, Context);
+}
diff --git a/contrib/llvm/lib/Linker/LinkModules.cpp b/contrib/llvm/lib/Linker/LinkModules.cpp
new file mode 100644
index 000000000000..74cbdadd61eb
--- /dev/null
+++ b/contrib/llvm/lib/Linker/LinkModules.cpp
@@ -0,0 +1,1322 @@
+//===- lib/Linker/LinkModules.cpp - Module Linker Implementation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM module linker.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Linker.h"
+#include "llvm-c/Linker.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cctype>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TypeMap implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+class TypeMapTy : public ValueMapTypeRemapper {
+  /// MappedTypes - This is a mapping from a source type to a destination type
+  /// to use.
+  DenseMap<Type*, Type*> MappedTypes;
+
+  /// SpeculativeTypes - When checking to see if two subgraphs are isomorphic,
+  /// we speculatively add types to MappedTypes, but keep track of them here in
+  /// case we need to roll back.
+  SmallVector<Type*, 16> SpeculativeTypes;
+  
+  /// SrcDefinitionsToResolve - This is a list of non-opaque structs in the
+  /// source module that are mapped to an opaque struct in the destination
+  /// module.
+  SmallVector<StructType*, 16> SrcDefinitionsToResolve;
+  
+  /// DstResolvedOpaqueTypes - This is the set of opaque types in the
+  /// destination modules who are getting a body from the source module.
+  SmallPtrSet<StructType*, 16> DstResolvedOpaqueTypes;
+
+public:
+  /// addTypeMapping - Indicate that the specified type in the destination
+  /// module is conceptually equivalent to the specified type in the source
+  /// module.
+  void addTypeMapping(Type *DstTy, Type *SrcTy);
+
+  /// linkDefinedTypeBodies - Produce a body for an opaque type in the dest
+  /// module from a type definition in the source module.
+  void linkDefinedTypeBodies();
+  
+  /// get - Return the mapped type to use for the specified input type from the
+  /// source module.
+  Type *get(Type *SrcTy);
+
+  FunctionType *get(FunctionType *T) {return cast<FunctionType>(get((Type*)T));}
+
+  /// dump - Dump out the type map for debugging purposes.
+  void dump() const {
+    for (DenseMap<Type*, Type*>::const_iterator
+           I = MappedTypes.begin(), E = MappedTypes.end(); I != E; ++I) {
+      dbgs() << "TypeMap: ";
+      I->first->dump();
+      dbgs() << " => ";
+      I->second->dump();
+      dbgs() << '\n';
+    }
+  }
+
+private:
+  Type *getImpl(Type *T);
+  /// remapType - Implement the ValueMapTypeRemapper interface.
+  Type *remapType(Type *SrcTy) {
+    return get(SrcTy);
+  }
+  
+  bool areTypesIsomorphic(Type *DstTy, Type *SrcTy);
+};
+}
+
+void TypeMapTy::addTypeMapping(Type *DstTy, Type *SrcTy) {
+  Type *&Entry = MappedTypes[SrcTy];
+  if (Entry) return;
+  
+  if (DstTy == SrcTy) {
+    Entry = DstTy;
+    return;
+  }
+  
+  // Check to see if these types are recursively isomorphic and establish a
+  // mapping between them if so.
+  if (!areTypesIsomorphic(DstTy, SrcTy)) {
+    // Oops, they aren't isomorphic.  Just discard this request by rolling out
+    // any speculative mappings we've established.
+    for (unsigned i = 0, e = SpeculativeTypes.size(); i != e; ++i)
+      MappedTypes.erase(SpeculativeTypes[i]);
+  }
+  SpeculativeTypes.clear();
+}
+
+/// areTypesIsomorphic - Recursively walk this pair of types, returning true
+/// if they are isomorphic, false if they are not.
+bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
+  // Two types with differing kinds are clearly not isomorphic.
+  if (DstTy->getTypeID() != SrcTy->getTypeID()) return false;
+
+  // If we have an entry in the MappedTypes table, then we have our answer.
+  Type *&Entry = MappedTypes[SrcTy];
+  if (Entry)
+    return Entry == DstTy;
+
+  // Two identical types are clearly isomorphic.  Remember this
+  // non-speculatively.
+  if (DstTy == SrcTy) {
+    Entry = DstTy;
+    return true;
+  }
+  
+  // Okay, we have two types with identical kinds that we haven't seen before.
+
+  // If this is an opaque struct type, special case it.
+  if (StructType *SSTy = dyn_cast<StructType>(SrcTy)) {
+    // Mapping an opaque type to any struct, just keep the dest struct.
+    if (SSTy->isOpaque()) {
+      Entry = DstTy;
+      SpeculativeTypes.push_back(SrcTy);
+      return true;
+    }
+
+    // Mapping a non-opaque source type to an opaque dest.  If this is the first
+    // type that we're mapping onto this destination type then we succeed.  Keep
+    // the dest, but fill it in later.  This doesn't need to be speculative.  If
+    // this is the second (different) type that we're trying to map onto the
+    // same opaque type then we fail.
+    if (cast<StructType>(DstTy)->isOpaque()) {
+      // We can only map one source type onto the opaque destination type.
+      if (!DstResolvedOpaqueTypes.insert(cast<StructType>(DstTy)))
+        return false;
+      SrcDefinitionsToResolve.push_back(SSTy);
+      Entry = DstTy;
+      return true;
+    }
+  }
+  
+  // If the number of subtypes disagree between the two types, then we fail.
+  if (SrcTy->getNumContainedTypes() != DstTy->getNumContainedTypes())
+    return false;
+  
+  // Fail if any of the extra properties (e.g. array size) of the type disagree.
+  if (isa<IntegerType>(DstTy))
+    return false;  // bitwidth disagrees.
+  if (PointerType *PT = dyn_cast<PointerType>(DstTy)) {
+    if (PT->getAddressSpace() != cast<PointerType>(SrcTy)->getAddressSpace())
+      return false;
+    
+  } else if (FunctionType *FT = dyn_cast<FunctionType>(DstTy)) {
+    if (FT->isVarArg() != cast<FunctionType>(SrcTy)->isVarArg())
+      return false;
+  } else if (StructType *DSTy = dyn_cast<StructType>(DstTy)) {
+    StructType *SSTy = cast<StructType>(SrcTy);
+    if (DSTy->isLiteral() != SSTy->isLiteral() ||
+        DSTy->isPacked() != SSTy->isPacked())
+      return false;
+  } else if (ArrayType *DATy = dyn_cast<ArrayType>(DstTy)) {
+    if (DATy->getNumElements() != cast<ArrayType>(SrcTy)->getNumElements())
+      return false;
+  } else if (VectorType *DVTy = dyn_cast<VectorType>(DstTy)) {
+    if (DVTy->getNumElements() != cast<VectorType>(SrcTy)->getNumElements())
+      return false;
+  }
+
+  // Otherwise, we speculate that these two types will line up and recursively
+  // check the subelements.
+  Entry = DstTy;
+  SpeculativeTypes.push_back(SrcTy);
+
+  for (unsigned i = 0, e = SrcTy->getNumContainedTypes(); i != e; ++i)
+    if (!areTypesIsomorphic(DstTy->getContainedType(i),
+                            SrcTy->getContainedType(i)))
+      return false;
+  
+  // If everything seems to have lined up, then everything is great.
+  return true;
+}
+
+/// linkDefinedTypeBodies - Produce a body for an opaque type in the dest
+/// module from a type definition in the source module.
+void TypeMapTy::linkDefinedTypeBodies() {
+  SmallVector<Type*, 16> Elements;
+  SmallString<16> TmpName;
+  
+  // Note that processing entries in this loop (calling 'get') can add new
+  // entries to the SrcDefinitionsToResolve vector.
+  while (!SrcDefinitionsToResolve.empty()) {
+    StructType *SrcSTy = SrcDefinitionsToResolve.pop_back_val();
+    StructType *DstSTy = cast<StructType>(MappedTypes[SrcSTy]);
+    
+    // TypeMap is a many-to-one mapping, if there were multiple types that
+    // provide a body for DstSTy then previous iterations of this loop may have
+    // already handled it.  Just ignore this case.
+    if (!DstSTy->isOpaque()) continue;
+    assert(!SrcSTy->isOpaque() && "Not resolving a definition?");
+    
+    // Map the body of the source type over to a new body for the dest type.
+    Elements.resize(SrcSTy->getNumElements());
+    for (unsigned i = 0, e = Elements.size(); i != e; ++i)
+      Elements[i] = getImpl(SrcSTy->getElementType(i));
+    
+    DstSTy->setBody(Elements, SrcSTy->isPacked());
+    
+    // If DstSTy has no name or has a longer name than STy, then viciously steal
+    // STy's name.
+    if (!SrcSTy->hasName()) continue;
+    StringRef SrcName = SrcSTy->getName();
+    
+    if (!DstSTy->hasName() || DstSTy->getName().size() > SrcName.size()) {
+      TmpName.insert(TmpName.end(), SrcName.begin(), SrcName.end());
+      SrcSTy->setName("");
+      DstSTy->setName(TmpName.str());
+      TmpName.clear();
+    }
+  }
+  
+  DstResolvedOpaqueTypes.clear();
+}
+
+/// get - Return the mapped type to use for the specified input type from the
+/// source module.
+Type *TypeMapTy::get(Type *Ty) {
+  Type *Result = getImpl(Ty);
+  
+  // If this caused a reference to any struct type, resolve it before returning.
+  if (!SrcDefinitionsToResolve.empty())
+    linkDefinedTypeBodies();
+  return Result;
+}
+
+/// getImpl - This is the recursive version of get().
+Type *TypeMapTy::getImpl(Type *Ty) {
+  // If we already have an entry for this type, return it.
+  Type **Entry = &MappedTypes[Ty];
+  if (*Entry) return *Entry;
+  
+  // If this is not a named struct type, then just map all of the elements and
+  // then rebuild the type from inside out.
+  if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isLiteral()) {
+    // If there are no element types to map, then the type is itself.  This is
+    // true for the anonymous {} struct, things like 'float', integers, etc.
+    if (Ty->getNumContainedTypes() == 0)
+      return *Entry = Ty;
+    
+    // Remap all of the elements, keeping track of whether any of them change.
+    bool AnyChange = false;
+    SmallVector<Type*, 4> ElementTypes;
+    ElementTypes.resize(Ty->getNumContainedTypes());
+    for (unsigned i = 0, e = Ty->getNumContainedTypes(); i != e; ++i) {
+      ElementTypes[i] = getImpl(Ty->getContainedType(i));
+      AnyChange |= ElementTypes[i] != Ty->getContainedType(i);
+    }
+    
+    // If we found our type while recursively processing stuff, just use it.
+    Entry = &MappedTypes[Ty];
+    if (*Entry) return *Entry;
+    
+    // If all of the element types mapped directly over, then the type is usable
+    // as-is.
+    if (!AnyChange)
+      return *Entry = Ty;
+    
+    // Otherwise, rebuild a modified type.
+    switch (Ty->getTypeID()) {
+    default: llvm_unreachable("unknown derived type to remap");
+    case Type::ArrayTyID:
+      return *Entry = ArrayType::get(ElementTypes[0],
+                                     cast<ArrayType>(Ty)->getNumElements());
+    case Type::VectorTyID: 
+      return *Entry = VectorType::get(ElementTypes[0],
+                                      cast<VectorType>(Ty)->getNumElements());
+    case Type::PointerTyID:
+      return *Entry = PointerType::get(ElementTypes[0],
+                                      cast<PointerType>(Ty)->getAddressSpace());
+    case Type::FunctionTyID:
+      return *Entry = FunctionType::get(ElementTypes[0],
+                                        makeArrayRef(ElementTypes).slice(1),
+                                        cast<FunctionType>(Ty)->isVarArg());
+    case Type::StructTyID:
+      // Note that this is only reached for anonymous structs.
+      return *Entry = StructType::get(Ty->getContext(), ElementTypes,
+                                      cast<StructType>(Ty)->isPacked());
+    }
+  }
+
+  // Otherwise, this is an unmapped named struct.  If the struct can be directly
+  // mapped over, just use it as-is.  This happens in a case when the linked-in
+  // module has something like:
+  //   %T = type {%T*, i32}
+  //   @GV = global %T* null
+  // where T does not exist at all in the destination module.
+  //
+  // The other case we watch for is when the type is not in the destination
+  // module, but that it has to be rebuilt because it refers to something that
+  // is already mapped.  For example, if the destination module has:
+  //  %A = type { i32 }
+  // and the source module has something like
+  //  %A' = type { i32 }
+  //  %B = type { %A'* }
+  //  @GV = global %B* null
+  // then we want to create a new type: "%B = type { %A*}" and have it take the
+  // pristine "%B" name from the source module.
+  //
+  // To determine which case this is, we have to recursively walk the type graph
+  // speculating that we'll be able to reuse it unmodified.  Only if this is
+  // safe would we map the entire thing over.  Because this is an optimization,
+  // and is not required for the prettiness of the linked module, we just skip
+  // it and always rebuild a type here.
+  StructType *STy = cast<StructType>(Ty);
+  
+  // If the type is opaque, we can just use it directly.
+  if (STy->isOpaque())
+    return *Entry = STy;
+  
+  // Otherwise we create a new type and resolve its body later.  This will be
+  // resolved by the top level of get().
+  SrcDefinitionsToResolve.push_back(STy);
+  StructType *DTy = StructType::create(STy->getContext());
+  DstResolvedOpaqueTypes.insert(DTy);
+  return *Entry = DTy;
+}
+
+//===----------------------------------------------------------------------===//
+// ModuleLinker implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// ModuleLinker - This is an implementation class for the LinkModules
+  /// function, which is the entrypoint for this file.
+  class ModuleLinker {
+    Module *DstM, *SrcM;
+    
+    TypeMapTy TypeMap; 
+
+    /// ValueMap - Mapping of values from what they used to be in Src, to what
+    /// they are now in DstM.  ValueToValueMapTy is a ValueMap, which involves
+    /// some overhead due to the use of Value handles which the Linker doesn't
+    /// actually need, but this allows us to reuse the ValueMapper code.
+    ValueToValueMapTy ValueMap;
+    
+    struct AppendingVarInfo {
+      GlobalVariable *NewGV;  // New aggregate global in dest module.
+      Constant *DstInit;      // Old initializer from dest module.
+      Constant *SrcInit;      // Old initializer from src module.
+    };
+    
+    std::vector<AppendingVarInfo> AppendingVars;
+    
+    unsigned Mode; // Mode to treat source module.
+    
+    // Set of items not to link in from source.
+    SmallPtrSet<const Value*, 16> DoNotLinkFromSource;
+    
+    // Vector of functions to lazily link in.
+    std::vector<Function*> LazilyLinkFunctions;
+    
+  public:
+    std::string ErrorMsg;
+    
+    ModuleLinker(Module *dstM, Module *srcM, unsigned mode)
+      : DstM(dstM), SrcM(srcM), Mode(mode) { }
+    
+    bool run();
+    
+  private:
+    /// emitError - Helper method for setting a message and returning an error
+    /// code.
+    bool emitError(const Twine &Message) {
+      ErrorMsg = Message.str();
+      return true;
+    }
+    
+    /// getLinkageResult - This analyzes the two global values and determines
+    /// what the result will look like in the destination module.
+    bool getLinkageResult(GlobalValue *Dest, const GlobalValue *Src,
+                          GlobalValue::LinkageTypes &LT,
+                          GlobalValue::VisibilityTypes &Vis,
+                          bool &LinkFromSrc);
+
+    /// getLinkedToGlobal - Given a global in the source module, return the
+    /// global in the destination module that is being linked to, if any.
+    GlobalValue *getLinkedToGlobal(GlobalValue *SrcGV) {
+      // If the source has no name it can't link.  If it has local linkage,
+      // there is no name match-up going on.
+      if (!SrcGV->hasName() || SrcGV->hasLocalLinkage())
+        return 0;
+      
+      // Otherwise see if we have a match in the destination module's symtab.
+      GlobalValue *DGV = DstM->getNamedValue(SrcGV->getName());
+      if (DGV == 0) return 0;
+        
+      // If we found a global with the same name in the dest module, but it has
+      // internal linkage, we are really not doing any linkage here.
+      if (DGV->hasLocalLinkage())
+        return 0;
+
+      // Otherwise, we do in fact link to the destination global.
+      return DGV;
+    }
+    
+    void computeTypeMapping();
+    
+    bool linkAppendingVarProto(GlobalVariable *DstGV, GlobalVariable *SrcGV);
+    bool linkGlobalProto(GlobalVariable *SrcGV);
+    bool linkFunctionProto(Function *SrcF);
+    bool linkAliasProto(GlobalAlias *SrcA);
+    bool linkModuleFlagsMetadata();
+    
+    void linkAppendingVarInit(const AppendingVarInfo &AVI);
+    void linkGlobalInits();
+    void linkFunctionBody(Function *Dst, Function *Src);
+    void linkAliasBodies();
+    void linkNamedMDNodes();
+  };
+}
+
+/// forceRenaming - The LLVM SymbolTable class autorenames globals that conflict
+/// in the symbol table.  This is good for all clients except for us.  Go
+/// through the trouble to force this back.
+static void forceRenaming(GlobalValue *GV, StringRef Name) {
+  // If the global doesn't force its name or if it already has the right name,
+  // there is nothing for us to do.
+  if (GV->hasLocalLinkage() || GV->getName() == Name)
+    return;
+
+  Module *M = GV->getParent();
+
+  // If there is a conflict, rename the conflict.
+  if (GlobalValue *ConflictGV = M->getNamedValue(Name)) {
+    GV->takeName(ConflictGV);
+    ConflictGV->setName(Name);    // This will cause ConflictGV to get renamed
+    assert(ConflictGV->getName() != Name && "forceRenaming didn't work");
+  } else {
+    GV->setName(Name);              // Force the name back
+  }
+}
+
+/// copyGVAttributes - copy additional attributes (those not needed to construct
+/// a GlobalValue) from the SrcGV to the DestGV.
+static void copyGVAttributes(GlobalValue *DestGV, const GlobalValue *SrcGV) {
+  // Use the maximum alignment, rather than just copying the alignment of SrcGV.
+  unsigned Alignment = std::max(DestGV->getAlignment(), SrcGV->getAlignment());
+  DestGV->copyAttributesFrom(SrcGV);
+  DestGV->setAlignment(Alignment);
+  
+  forceRenaming(DestGV, SrcGV->getName());
+}
+
+static bool isLessConstraining(GlobalValue::VisibilityTypes a,
+                               GlobalValue::VisibilityTypes b) {
+  if (a == GlobalValue::HiddenVisibility)
+    return false;
+  if (b == GlobalValue::HiddenVisibility)
+    return true;
+  if (a == GlobalValue::ProtectedVisibility)
+    return false;
+  if (b == GlobalValue::ProtectedVisibility)
+    return true;
+  return false;
+}
+
+/// getLinkageResult - This analyzes the two global values and determines what
+/// the result will look like in the destination module.  In particular, it
+/// computes the resultant linkage type and visibility, computes whether the
+/// global in the source should be copied over to the destination (replacing
+/// the existing one), and computes whether this linkage is an error or not.
+bool ModuleLinker::getLinkageResult(GlobalValue *Dest, const GlobalValue *Src,
+                                    GlobalValue::LinkageTypes &LT,
+                                    GlobalValue::VisibilityTypes &Vis,
+                                    bool &LinkFromSrc) {
+  assert(Dest && "Must have two globals being queried");
+  assert(!Src->hasLocalLinkage() &&
+         "If Src has internal linkage, Dest shouldn't be set!");
+  
+  bool SrcIsDeclaration = Src->isDeclaration() && !Src->isMaterializable();
+  bool DestIsDeclaration = Dest->isDeclaration();
+  
+  if (SrcIsDeclaration) {
+    // If Src is external or if both Src & Dest are external..  Just link the
+    // external globals, we aren't adding anything.
+    if (Src->hasDLLImportLinkage()) {
+      // If one of GVs has DLLImport linkage, result should be dllimport'ed.
+      if (DestIsDeclaration) {
+        LinkFromSrc = true;
+        LT = Src->getLinkage();
+      }
+    } else if (Dest->hasExternalWeakLinkage()) {
+      // If the Dest is weak, use the source linkage.
+      LinkFromSrc = true;
+      LT = Src->getLinkage();
+    } else {
+      LinkFromSrc = false;
+      LT = Dest->getLinkage();
+    }
+  } else if (DestIsDeclaration && !Dest->hasDLLImportLinkage()) {
+    // If Dest is external but Src is not:
+    LinkFromSrc = true;
+    LT = Src->getLinkage();
+  } else if (Src->isWeakForLinker()) {
+    // At this point we know that Dest has LinkOnce, External*, Weak, Common,
+    // or DLL* linkage.
+    if (Dest->hasExternalWeakLinkage() ||
+        Dest->hasAvailableExternallyLinkage() ||
+        (Dest->hasLinkOnceLinkage() &&
+         (Src->hasWeakLinkage() || Src->hasCommonLinkage()))) {
+      LinkFromSrc = true;
+      LT = Src->getLinkage();
+    } else {
+      LinkFromSrc = false;
+      LT = Dest->getLinkage();
+    }
+  } else if (Dest->isWeakForLinker()) {
+    // At this point we know that Src has External* or DLL* linkage.
+    if (Src->hasExternalWeakLinkage()) {
+      LinkFromSrc = false;
+      LT = Dest->getLinkage();
+    } else {
+      LinkFromSrc = true;
+      LT = GlobalValue::ExternalLinkage;
+    }
+  } else {
+    assert((Dest->hasExternalLinkage()  || Dest->hasDLLImportLinkage() ||
+            Dest->hasDLLExportLinkage() || Dest->hasExternalWeakLinkage()) &&
+           (Src->hasExternalLinkage()   || Src->hasDLLImportLinkage() ||
+            Src->hasDLLExportLinkage()  || Src->hasExternalWeakLinkage()) &&
+           "Unexpected linkage type!");
+    return emitError("Linking globals named '" + Src->getName() +
+                 "': symbol multiply defined!");
+  }
+
+  // Compute the visibility. We follow the rules in the System V Application
+  // Binary Interface.
+  Vis = isLessConstraining(Src->getVisibility(), Dest->getVisibility()) ?
+    Dest->getVisibility() : Src->getVisibility();
+  return false;
+}
+
+/// computeTypeMapping - Loop over all of the linked values to compute type
+/// mappings.  For example, if we link "extern Foo *x" and "Foo *x = NULL", then
+/// we have two struct types 'Foo' but one got renamed when the module was
+/// loaded into the same LLVMContext.
+void ModuleLinker::computeTypeMapping() {
+  // Incorporate globals.
+  for (Module::global_iterator I = SrcM->global_begin(),
+       E = SrcM->global_end(); I != E; ++I) {
+    GlobalValue *DGV = getLinkedToGlobal(I);
+    if (DGV == 0) continue;
+    
+    if (!DGV->hasAppendingLinkage() || !I->hasAppendingLinkage()) {
+      TypeMap.addTypeMapping(DGV->getType(), I->getType());
+      continue;      
+    }
+    
+    // Unify the element type of appending arrays.
+    ArrayType *DAT = cast<ArrayType>(DGV->getType()->getElementType());
+    ArrayType *SAT = cast<ArrayType>(I->getType()->getElementType());
+    TypeMap.addTypeMapping(DAT->getElementType(), SAT->getElementType());
+  }
+  
+  // Incorporate functions.
+  for (Module::iterator I = SrcM->begin(), E = SrcM->end(); I != E; ++I) {
+    if (GlobalValue *DGV = getLinkedToGlobal(I))
+      TypeMap.addTypeMapping(DGV->getType(), I->getType());
+  }
+
+  // Incorporate types by name, scanning all the types in the source module.
+  // At this point, the destination module may have a type "%foo = { i32 }" for
+  // example.  When the source module got loaded into the same LLVMContext, if
+  // it had the same type, it would have been renamed to "%foo.42 = { i32 }".
+  TypeFinder SrcStructTypes;
+  SrcStructTypes.run(*SrcM, true);
+  SmallPtrSet<StructType*, 32> SrcStructTypesSet(SrcStructTypes.begin(),
+                                                 SrcStructTypes.end());
+
+  TypeFinder DstStructTypes;
+  DstStructTypes.run(*DstM, true);
+  SmallPtrSet<StructType*, 32> DstStructTypesSet(DstStructTypes.begin(),
+                                                 DstStructTypes.end());
+
+  for (unsigned i = 0, e = SrcStructTypes.size(); i != e; ++i) {
+    StructType *ST = SrcStructTypes[i];
+    if (!ST->hasName()) continue;
+    
+    // Check to see if there is a dot in the name followed by a digit.
+    size_t DotPos = ST->getName().rfind('.');
+    if (DotPos == 0 || DotPos == StringRef::npos ||
+        ST->getName().back() == '.' ||
+        !isdigit(static_cast<unsigned char>(ST->getName()[DotPos+1])))
+      continue;
+    
+    // Check to see if the destination module has a struct with the prefix name.
+    if (StructType *DST = DstM->getTypeByName(ST->getName().substr(0, DotPos)))
+      // Don't use it if this actually came from the source module. They're in
+      // the same LLVMContext after all. Also don't use it unless the type is
+      // actually used in the destination module. This can happen in situations
+      // like this:
+      //
+      //      Module A                         Module B
+      //      --------                         --------
+      //   %Z = type { %A }                %B = type { %C.1 }
+      //   %A = type { %B.1, [7 x i8] }    %C.1 = type { i8* }
+      //   %B.1 = type { %C }              %A.2 = type { %B.3, [5 x i8] }
+      //   %C = type { i8* }               %B.3 = type { %C.1 }
+      //
+      // When we link Module B with Module A, the '%B' in Module B is
+      // used. However, that would then use '%C.1'. But when we process '%C.1',
+      // we prefer to take the '%C' version. So we are then left with both
+      // '%C.1' and '%C' being used for the same types. This leads to some
+      // variables using one type and some using the other.
+      if (!SrcStructTypesSet.count(DST) && DstStructTypesSet.count(DST))
+        TypeMap.addTypeMapping(DST, ST);
+  }
+
+  // Don't bother incorporating aliases, they aren't generally typed well.
+  
+  // Now that we have discovered all of the type equivalences, get a body for
+  // any 'opaque' types in the dest module that are now resolved. 
+  TypeMap.linkDefinedTypeBodies();
+}
+
+/// linkAppendingVarProto - If there were any appending global variables, link
+/// them together now.  Return true on error.
+bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
+                                         GlobalVariable *SrcGV) {
+ 
+  if (!SrcGV->hasAppendingLinkage() || !DstGV->hasAppendingLinkage())
+    return emitError("Linking globals named '" + SrcGV->getName() +
+           "': can only link appending global with another appending global!");
+  
+  ArrayType *DstTy = cast<ArrayType>(DstGV->getType()->getElementType());
+  ArrayType *SrcTy =
+    cast<ArrayType>(TypeMap.get(SrcGV->getType()->getElementType()));
+  Type *EltTy = DstTy->getElementType();
+  
+  // Check to see that they two arrays agree on type.
+  if (EltTy != SrcTy->getElementType())
+    return emitError("Appending variables with different element types!");
+  if (DstGV->isConstant() != SrcGV->isConstant())
+    return emitError("Appending variables linked with different const'ness!");
+  
+  if (DstGV->getAlignment() != SrcGV->getAlignment())
+    return emitError(
+             "Appending variables with different alignment need to be linked!");
+  
+  if (DstGV->getVisibility() != SrcGV->getVisibility())
+    return emitError(
+            "Appending variables with different visibility need to be linked!");
+  
+  if (DstGV->getSection() != SrcGV->getSection())
+    return emitError(
+          "Appending variables with different section name need to be linked!");
+  
+  uint64_t NewSize = DstTy->getNumElements() + SrcTy->getNumElements();
+  ArrayType *NewType = ArrayType::get(EltTy, NewSize);
+  
+  // Create the new global variable.
+  GlobalVariable *NG =
+    new GlobalVariable(*DstGV->getParent(), NewType, SrcGV->isConstant(),
+                       DstGV->getLinkage(), /*init*/0, /*name*/"", DstGV,
+                       DstGV->getThreadLocalMode(),
+                       DstGV->getType()->getAddressSpace());
+  
+  // Propagate alignment, visibility and section info.
+  copyGVAttributes(NG, DstGV);
+  
+  AppendingVarInfo AVI;
+  AVI.NewGV = NG;
+  AVI.DstInit = DstGV->getInitializer();
+  AVI.SrcInit = SrcGV->getInitializer();
+  AppendingVars.push_back(AVI);
+
+  // Replace any uses of the two global variables with uses of the new
+  // global.
+  ValueMap[SrcGV] = ConstantExpr::getBitCast(NG, TypeMap.get(SrcGV->getType()));
+
+  DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType()));
+  DstGV->eraseFromParent();
+  
+  // Track the source variable so we don't try to link it.
+  DoNotLinkFromSource.insert(SrcGV);
+  
+  return false;
+}
+
+/// linkGlobalProto - Loop through the global variables in the src module and
+/// merge them into the dest module.
+bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
+  GlobalValue *DGV = getLinkedToGlobal(SGV);
+  llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+
+  if (DGV) {
+    // Concatenation of appending linkage variables is magic and handled later.
+    if (DGV->hasAppendingLinkage() || SGV->hasAppendingLinkage())
+      return linkAppendingVarProto(cast<GlobalVariable>(DGV), SGV);
+    
+    // Determine whether linkage of these two globals follows the source
+    // module's definition or the destination module's definition.
+    GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
+    GlobalValue::VisibilityTypes NV;
+    bool LinkFromSrc = false;
+    if (getLinkageResult(DGV, SGV, NewLinkage, NV, LinkFromSrc))
+      return true;
+    NewVisibility = NV;
+
+    // If we're not linking from the source, then keep the definition that we
+    // have.
+    if (!LinkFromSrc) {
+      // Special case for const propagation.
+      if (GlobalVariable *DGVar = dyn_cast<GlobalVariable>(DGV))
+        if (DGVar->isDeclaration() && SGV->isConstant() && !DGVar->isConstant())
+          DGVar->setConstant(true);
+      
+      // Set calculated linkage and visibility.
+      DGV->setLinkage(NewLinkage);
+      DGV->setVisibility(*NewVisibility);
+
+      // Make sure to remember this mapping.
+      ValueMap[SGV] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGV->getType()));
+      
+      // Track the source global so that we don't attempt to copy it over when 
+      // processing global initializers.
+      DoNotLinkFromSource.insert(SGV);
+      
+      return false;
+    }
+  }
+  
+  // No linking to be performed or linking from the source: simply create an
+  // identical version of the symbol over in the dest module... the
+  // initializer will be filled in later by LinkGlobalInits.
+  GlobalVariable *NewDGV =
+    new GlobalVariable(*DstM, TypeMap.get(SGV->getType()->getElementType()),
+                       SGV->isConstant(), SGV->getLinkage(), /*init*/0,
+                       SGV->getName(), /*insertbefore*/0,
+                       SGV->getThreadLocalMode(),
+                       SGV->getType()->getAddressSpace());
+  // Propagate alignment, visibility and section info.
+  copyGVAttributes(NewDGV, SGV);
+  if (NewVisibility)
+    NewDGV->setVisibility(*NewVisibility);
+
+  if (DGV) {
+    DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDGV, DGV->getType()));
+    DGV->eraseFromParent();
+  }
+  
+  // Make sure to remember this mapping.
+  ValueMap[SGV] = NewDGV;
+  return false;
+}
+
+/// linkFunctionProto - Link the function in the source module into the
+/// destination module if needed, setting up mapping information.
+bool ModuleLinker::linkFunctionProto(Function *SF) {
+  GlobalValue *DGV = getLinkedToGlobal(SF);
+  llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+
+  if (DGV) {
+    GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
+    bool LinkFromSrc = false;
+    GlobalValue::VisibilityTypes NV;
+    if (getLinkageResult(DGV, SF, NewLinkage, NV, LinkFromSrc))
+      return true;
+    NewVisibility = NV;
+
+    if (!LinkFromSrc) {
+      // Set calculated linkage
+      DGV->setLinkage(NewLinkage);
+      DGV->setVisibility(*NewVisibility);
+
+      // Make sure to remember this mapping.
+      ValueMap[SF] = ConstantExpr::getBitCast(DGV, TypeMap.get(SF->getType()));
+      
+      // Track the function from the source module so we don't attempt to remap 
+      // it.
+      DoNotLinkFromSource.insert(SF);
+      
+      return false;
+    }
+  }
+  
+  // If there is no linkage to be performed or we are linking from the source,
+  // bring SF over.
+  Function *NewDF = Function::Create(TypeMap.get(SF->getFunctionType()),
+                                     SF->getLinkage(), SF->getName(), DstM);
+  copyGVAttributes(NewDF, SF);
+  if (NewVisibility)
+    NewDF->setVisibility(*NewVisibility);
+
+  if (DGV) {
+    // Any uses of DF need to change to NewDF, with cast.
+    DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDF, DGV->getType()));
+    DGV->eraseFromParent();
+  } else {
+    // Internal, LO_ODR, or LO linkage - stick in set to ignore and lazily link.
+    if (SF->hasLocalLinkage() || SF->hasLinkOnceLinkage() ||
+        SF->hasAvailableExternallyLinkage()) {
+      DoNotLinkFromSource.insert(SF);
+      LazilyLinkFunctions.push_back(SF);
+    }
+  }
+  
+  ValueMap[SF] = NewDF;
+  return false;
+}
+
+/// LinkAliasProto - Set up prototypes for any aliases that come over from the
+/// source module.
+bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) {
+  GlobalValue *DGV = getLinkedToGlobal(SGA);
+  llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+
+  if (DGV) {
+    GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
+    GlobalValue::VisibilityTypes NV;
+    bool LinkFromSrc = false;
+    if (getLinkageResult(DGV, SGA, NewLinkage, NV, LinkFromSrc))
+      return true;
+    NewVisibility = NV;
+
+    if (!LinkFromSrc) {
+      // Set calculated linkage.
+      DGV->setLinkage(NewLinkage);
+      DGV->setVisibility(*NewVisibility);
+
+      // Make sure to remember this mapping.
+      ValueMap[SGA] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGA->getType()));
+      
+      // Track the alias from the source module so we don't attempt to remap it.
+      DoNotLinkFromSource.insert(SGA);
+      
+      return false;
+    }
+  }
+  
+  // If there is no linkage to be performed or we're linking from the source,
+  // bring over SGA.
+  GlobalAlias *NewDA = new GlobalAlias(TypeMap.get(SGA->getType()),
+                                       SGA->getLinkage(), SGA->getName(),
+                                       /*aliasee*/0, DstM);
+  copyGVAttributes(NewDA, SGA);
+  if (NewVisibility)
+    NewDA->setVisibility(*NewVisibility);
+
+  if (DGV) {
+    // Any uses of DGV need to change to NewDA, with cast.
+    DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDA, DGV->getType()));
+    DGV->eraseFromParent();
+  }
+  
+  ValueMap[SGA] = NewDA;
+  return false;
+}
+
+static void getArrayElements(Constant *C, SmallVectorImpl<Constant*> &Dest) {
+  unsigned NumElements = cast<ArrayType>(C->getType())->getNumElements();
+
+  for (unsigned i = 0; i != NumElements; ++i)
+    Dest.push_back(C->getAggregateElement(i));
+}
+                             
+void ModuleLinker::linkAppendingVarInit(const AppendingVarInfo &AVI) {
+  // Merge the initializer.
+  SmallVector<Constant*, 16> Elements;
+  getArrayElements(AVI.DstInit, Elements);
+  
+  Constant *SrcInit = MapValue(AVI.SrcInit, ValueMap, RF_None, &TypeMap);
+  getArrayElements(SrcInit, Elements);
+  
+  ArrayType *NewType = cast<ArrayType>(AVI.NewGV->getType()->getElementType());
+  AVI.NewGV->setInitializer(ConstantArray::get(NewType, Elements));
+}
+
+/// linkGlobalInits - Update the initializers in the Dest module now that all
+/// globals that may be referenced are in Dest.
+void ModuleLinker::linkGlobalInits() {
+  // Loop over all of the globals in the src module, mapping them over as we go
+  for (Module::const_global_iterator I = SrcM->global_begin(),
+       E = SrcM->global_end(); I != E; ++I) {
+    
+    // Only process initialized GV's or ones not already in dest.
+    if (!I->hasInitializer() || DoNotLinkFromSource.count(I)) continue;          
+    
+    // Grab destination global variable.
+    GlobalVariable *DGV = cast<GlobalVariable>(ValueMap[I]);
+    // Figure out what the initializer looks like in the dest module.
+    DGV->setInitializer(MapValue(I->getInitializer(), ValueMap,
+                                 RF_None, &TypeMap));
+  }
+}
+
+/// linkFunctionBody - Copy the source function over into the dest function and
+/// fix up references to values.  At this point we know that Dest is an external
+/// function, and that Src is not.
+void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) {
+  assert(Src && Dst && Dst->isDeclaration() && !Src->isDeclaration());
+
+  // Go through and convert function arguments over, remembering the mapping.
+  Function::arg_iterator DI = Dst->arg_begin();
+  for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end();
+       I != E; ++I, ++DI) {
+    DI->setName(I->getName());  // Copy the name over.
+
+    // Add a mapping to our mapping.
+    ValueMap[I] = DI;
+  }
+
+  if (Mode == Linker::DestroySource) {
+    // Splice the body of the source function into the dest function.
+    Dst->getBasicBlockList().splice(Dst->end(), Src->getBasicBlockList());
+    
+    // At this point, all of the instructions and values of the function are now
+    // copied over.  The only problem is that they are still referencing values in
+    // the Source function as operands.  Loop through all of the operands of the
+    // functions and patch them up to point to the local versions.
+    for (Function::iterator BB = Dst->begin(), BE = Dst->end(); BB != BE; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        RemapInstruction(I, ValueMap, RF_IgnoreMissingEntries, &TypeMap);
+    
+  } else {
+    // Clone the body of the function into the dest function.
+    SmallVector<ReturnInst*, 8> Returns; // Ignore returns.
+    CloneFunctionInto(Dst, Src, ValueMap, false, Returns, "", NULL, &TypeMap);
+  }
+  
+  // There is no need to map the arguments anymore.
+  for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end();
+       I != E; ++I)
+    ValueMap.erase(I);
+  
+}
+
+/// linkAliasBodies - Insert all of the aliases in Src into the Dest module.
+void ModuleLinker::linkAliasBodies() {
+  for (Module::alias_iterator I = SrcM->alias_begin(), E = SrcM->alias_end();
+       I != E; ++I) {
+    if (DoNotLinkFromSource.count(I))
+      continue;
+    if (Constant *Aliasee = I->getAliasee()) {
+      GlobalAlias *DA = cast<GlobalAlias>(ValueMap[I]);
+      DA->setAliasee(MapValue(Aliasee, ValueMap, RF_None, &TypeMap));
+    }
+  }
+}
+
+/// linkNamedMDNodes - Insert all of the named MDNodes in Src into the Dest
+/// module.
+void ModuleLinker::linkNamedMDNodes() {
+  const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
+  for (Module::const_named_metadata_iterator I = SrcM->named_metadata_begin(),
+       E = SrcM->named_metadata_end(); I != E; ++I) {
+    // Don't link module flags here. Do them separately.
+    if (&*I == SrcModFlags) continue;
+    NamedMDNode *DestNMD = DstM->getOrInsertNamedMetadata(I->getName());
+    // Add Src elements into Dest node.
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+      DestNMD->addOperand(MapValue(I->getOperand(i), ValueMap,
+                                   RF_None, &TypeMap));
+  }
+}
+
+/// linkModuleFlagsMetadata - Merge the linker flags in Src into the Dest
+/// module.
+bool ModuleLinker::linkModuleFlagsMetadata() {
+  // If the source module has no module flags, we are done.
+  const NamedMDNode *SrcModFlags = SrcM->getModuleFlagsMetadata();
+  if (!SrcModFlags) return false;
+
+  // If the destination module doesn't have module flags yet, then just copy
+  // over the source module's flags.
+  NamedMDNode *DstModFlags = DstM->getOrInsertModuleFlagsMetadata();
+  if (DstModFlags->getNumOperands() == 0) {
+    for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I)
+      DstModFlags->addOperand(SrcModFlags->getOperand(I));
+
+    return false;
+  }
+
+  // First build a map of the existing module flags and requirements.
+  DenseMap<MDString*, MDNode*> Flags;
+  SmallSetVector<MDNode*, 16> Requirements;
+  for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) {
+    MDNode *Op = DstModFlags->getOperand(I);
+    ConstantInt *Behavior = cast<ConstantInt>(Op->getOperand(0));
+    MDString *ID = cast<MDString>(Op->getOperand(1));
+
+    if (Behavior->getZExtValue() == Module::Require) {
+      Requirements.insert(cast<MDNode>(Op->getOperand(2)));
+    } else {
+      Flags[ID] = Op;
+    }
+  }
+
+  // Merge in the flags from the source module, and also collect its set of
+  // requirements.
+  bool HasErr = false;
+  for (unsigned I = 0, E = SrcModFlags->getNumOperands(); I != E; ++I) {
+    MDNode *SrcOp = SrcModFlags->getOperand(I);
+    ConstantInt *SrcBehavior = cast<ConstantInt>(SrcOp->getOperand(0));
+    MDString *ID = cast<MDString>(SrcOp->getOperand(1));
+    MDNode *DstOp = Flags.lookup(ID);
+    unsigned SrcBehaviorValue = SrcBehavior->getZExtValue();
+
+    // If this is a requirement, add it and continue.
+    if (SrcBehaviorValue == Module::Require) {
+      // If the destination module does not already have this requirement, add
+      // it.
+      if (Requirements.insert(cast<MDNode>(SrcOp->getOperand(2)))) {
+        DstModFlags->addOperand(SrcOp);
+      }
+      continue;
+    }
+
+    // If there is no existing flag with this ID, just add it.
+    if (!DstOp) {
+      Flags[ID] = SrcOp;
+      DstModFlags->addOperand(SrcOp);
+      continue;
+    }
+
+    // Otherwise, perform a merge.
+    ConstantInt *DstBehavior = cast<ConstantInt>(DstOp->getOperand(0));
+    unsigned DstBehaviorValue = DstBehavior->getZExtValue();
+
+    // If either flag has override behavior, handle it first.
+    if (DstBehaviorValue == Module::Override) {
+      // Diagnose inconsistent flags which both have override behavior.
+      if (SrcBehaviorValue == Module::Override &&
+          SrcOp->getOperand(2) != DstOp->getOperand(2)) {
+        HasErr |= emitError("linking module flags '" + ID->getString() +
+                            "': IDs have conflicting override values");
+      }
+      continue;
+    } else if (SrcBehaviorValue == Module::Override) {
+      // Update the destination flag to that of the source.
+      DstOp->replaceOperandWith(0, SrcBehavior);
+      DstOp->replaceOperandWith(2, SrcOp->getOperand(2));
+      continue;
+    }
+
+    // Diagnose inconsistent merge behavior types.
+    if (SrcBehaviorValue != DstBehaviorValue) {
+      HasErr |= emitError("linking module flags '" + ID->getString() +
+                          "': IDs have conflicting behaviors");
+      continue;
+    }
+
+    // Perform the merge for standard behavior types.
+    switch (SrcBehaviorValue) {
+    case Module::Require:
+    case Module::Override: assert(0 && "not possible"); break;
+    case Module::Error: {
+      // Emit an error if the values differ.
+      if (SrcOp->getOperand(2) != DstOp->getOperand(2)) {
+        HasErr |= emitError("linking module flags '" + ID->getString() +
+                            "': IDs have conflicting values");
+      }
+      continue;
+    }
+    case Module::Warning: {
+      // Emit a warning if the values differ.
+      if (SrcOp->getOperand(2) != DstOp->getOperand(2)) {
+        errs() << "WARNING: linking module flags '" << ID->getString()
+               << "': IDs have conflicting values";
+      }
+      continue;
+    }
+    case Module::Append: {
+      MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2));
+      MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2));
+      unsigned NumOps = DstValue->getNumOperands() + SrcValue->getNumOperands();
+      Value **VP, **Values = VP = new Value*[NumOps];
+      for (unsigned i = 0, e = DstValue->getNumOperands(); i != e; ++i, ++VP)
+        *VP = DstValue->getOperand(i);
+      for (unsigned i = 0, e = SrcValue->getNumOperands(); i != e; ++i, ++VP)
+        *VP = SrcValue->getOperand(i);
+      DstOp->replaceOperandWith(2, MDNode::get(DstM->getContext(),
+                                               ArrayRef<Value*>(Values,
+                                                                NumOps)));
+      delete[] Values;
+      break;
+    }
+    case Module::AppendUnique: {
+      SmallSetVector<Value*, 16> Elts;
+      MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2));
+      MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2));
+      for (unsigned i = 0, e = DstValue->getNumOperands(); i != e; ++i)
+        Elts.insert(DstValue->getOperand(i));
+      for (unsigned i = 0, e = SrcValue->getNumOperands(); i != e; ++i)
+        Elts.insert(SrcValue->getOperand(i));
+      DstOp->replaceOperandWith(2, MDNode::get(DstM->getContext(),
+                                               ArrayRef<Value*>(Elts.begin(),
+                                                                Elts.end())));
+      break;
+    }
+    }
+  }
+
+  // Check all of the requirements.
+  for (unsigned I = 0, E = Requirements.size(); I != E; ++I) {
+    MDNode *Requirement = Requirements[I];
+    MDString *Flag = cast<MDString>(Requirement->getOperand(0));
+    Value *ReqValue = Requirement->getOperand(1);
+
+    MDNode *Op = Flags[Flag];
+    if (!Op || Op->getOperand(2) != ReqValue) {
+      HasErr |= emitError("linking module flags '" + Flag->getString() +
+                          "': does not have the required value");
+      continue;
+    }
+  }
+
+  return HasErr;
+}
+  
+bool ModuleLinker::run() {
+  assert(DstM && "Null destination module");
+  assert(SrcM && "Null source module");
+
+  // Inherit the target data from the source module if the destination module
+  // doesn't have one already.
+  if (DstM->getDataLayout().empty() && !SrcM->getDataLayout().empty())
+    DstM->setDataLayout(SrcM->getDataLayout());
+
+  // Copy the target triple from the source to dest if the dest's is empty.
+  if (DstM->getTargetTriple().empty() && !SrcM->getTargetTriple().empty())
+    DstM->setTargetTriple(SrcM->getTargetTriple());
+
+  if (!SrcM->getDataLayout().empty() && !DstM->getDataLayout().empty() &&
+      SrcM->getDataLayout() != DstM->getDataLayout())
+    errs() << "WARNING: Linking two modules of different data layouts!\n";
+  if (!SrcM->getTargetTriple().empty() &&
+      DstM->getTargetTriple() != SrcM->getTargetTriple()) {
+    errs() << "WARNING: Linking two modules of different target triples: ";
+    if (!SrcM->getModuleIdentifier().empty())
+      errs() << SrcM->getModuleIdentifier() << ": ";
+    errs() << "'" << SrcM->getTargetTriple() << "' and '" 
+           << DstM->getTargetTriple() << "'\n";
+  }
+
+  // Append the module inline asm string.
+  if (!SrcM->getModuleInlineAsm().empty()) {
+    if (DstM->getModuleInlineAsm().empty())
+      DstM->setModuleInlineAsm(SrcM->getModuleInlineAsm());
+    else
+      DstM->setModuleInlineAsm(DstM->getModuleInlineAsm()+"\n"+
+                               SrcM->getModuleInlineAsm());
+  }
+
+  // Loop over all of the linked values to compute type mappings.
+  computeTypeMapping();
+
+  // Insert all of the globals in src into the DstM module... without linking
+  // initializers (which could refer to functions not yet mapped over).
+  for (Module::global_iterator I = SrcM->global_begin(),
+       E = SrcM->global_end(); I != E; ++I)
+    if (linkGlobalProto(I))
+      return true;
+
+  // Link the functions together between the two modules, without doing function
+  // bodies... this just adds external function prototypes to the DstM
+  // function...  We do this so that when we begin processing function bodies,
+  // all of the global values that may be referenced are available in our
+  // ValueMap.
+  for (Module::iterator I = SrcM->begin(), E = SrcM->end(); I != E; ++I)
+    if (linkFunctionProto(I))
+      return true;
+
+  // If there were any aliases, link them now.
+  for (Module::alias_iterator I = SrcM->alias_begin(),
+       E = SrcM->alias_end(); I != E; ++I)
+    if (linkAliasProto(I))
+      return true;
+
+  for (unsigned i = 0, e = AppendingVars.size(); i != e; ++i)
+    linkAppendingVarInit(AppendingVars[i]);
+  
+  // Update the initializers in the DstM module now that all globals that may
+  // be referenced are in DstM.
+  linkGlobalInits();
+
+  // Link in the function bodies that are defined in the source module into
+  // DstM.
+  for (Module::iterator SF = SrcM->begin(), E = SrcM->end(); SF != E; ++SF) {
+    // Skip if not linking from source.
+    if (DoNotLinkFromSource.count(SF)) continue;
+    
+    // Skip if no body (function is external) or materialize.
+    if (SF->isDeclaration()) {
+      if (!SF->isMaterializable())
+        continue;
+      if (SF->Materialize(&ErrorMsg))
+        return true;
+    }
+    
+    linkFunctionBody(cast<Function>(ValueMap[SF]), SF);
+    SF->Dematerialize();
+  }
+
+  // Resolve all uses of aliases with aliasees.
+  linkAliasBodies();
+
+  // Remap all of the named MDNodes in Src into the DstM module. We do this
+  // after linking GlobalValues so that MDNodes that reference GlobalValues
+  // are properly remapped.
+  linkNamedMDNodes();
+
+  // Merge the module flags into the DstM module.
+  if (linkModuleFlagsMetadata())
+    return true;
+
+  // Process vector of lazily linked in functions.
+  bool LinkedInAnyFunctions;
+  do {
+    LinkedInAnyFunctions = false;
+    
+    for(std::vector<Function*>::iterator I = LazilyLinkFunctions.begin(),
+        E = LazilyLinkFunctions.end(); I != E; ++I) {
+      if (!*I)
+        continue;
+      
+      Function *SF = *I;
+      Function *DF = cast<Function>(ValueMap[SF]);
+      
+      if (!DF->use_empty()) {
+        
+        // Materialize if necessary.
+        if (SF->isDeclaration()) {
+          if (!SF->isMaterializable())
+            continue;
+          if (SF->Materialize(&ErrorMsg))
+            return true;
+        }
+        
+        // Link in function body.
+        linkFunctionBody(DF, SF);
+        SF->Dematerialize();
+
+        // "Remove" from vector by setting the element to 0.
+        *I = 0;
+        
+        // Set flag to indicate we may have more functions to lazily link in
+        // since we linked in a function.
+        LinkedInAnyFunctions = true;
+      }
+    }
+  } while (LinkedInAnyFunctions);
+  
+  // Remove any prototypes of functions that were not actually linked in.
+  for(std::vector<Function*>::iterator I = LazilyLinkFunctions.begin(),
+      E = LazilyLinkFunctions.end(); I != E; ++I) {
+    if (!*I)
+      continue;
+    
+    Function *SF = *I;
+    Function *DF = cast<Function>(ValueMap[SF]);
+    if (DF->use_empty())
+      DF->eraseFromParent();
+  }
+  
+  // Now that all of the types from the source are used, resolve any structs
+  // copied over to the dest that didn't exist there.
+  TypeMap.linkDefinedTypeBodies();
+  
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// LinkModules entrypoint.
+//===----------------------------------------------------------------------===//
+
+/// LinkModules - This function links two modules together, with the resulting
+/// Dest module modified to be the composite of the two input modules.  If an
+/// error occurs, true is returned and ErrorMsg (if not null) is set to indicate
+/// the problem.  Upon failure, the Dest module could be in a modified state,
+/// and shouldn't be relied on to be consistent.
+bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode, 
+                         std::string *ErrorMsg) {
+  ModuleLinker TheLinker(Dest, Src, Mode);
+  if (TheLinker.run()) {
+    if (ErrorMsg) *ErrorMsg = TheLinker.ErrorMsg;
+    return true;
+  }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// C API.
+//===----------------------------------------------------------------------===//
+
+LLVMBool LLVMLinkModules(LLVMModuleRef Dest, LLVMModuleRef Src,
+                         LLVMLinkerMode Mode, char **OutMessages) {
+  std::string Messages;
+  LLVMBool Result = Linker::LinkModules(unwrap(Dest), unwrap(Src),
+                                        Mode, OutMessages? &Messages : 0);
+  if (OutMessages)
+    *OutMessages = strdup(Messages.c_str());
+  return Result;
+}
diff --git a/contrib/llvm/lib/Linker/Linker.cpp b/contrib/llvm/lib/Linker/Linker.cpp
new file mode 100644
index 000000000000..74d24f278b77
--- /dev/null
+++ b/contrib/llvm/lib/Linker/Linker.cpp
@@ -0,0 +1,70 @@
+//===- lib/Linker/Linker.cpp - Basic Linker functionality  ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains basic Linker functionality that all usages will need.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Linker.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+using namespace llvm;
+
+Linker::Linker(StringRef progname, StringRef modname,
+               LLVMContext& C, unsigned flags):
+  Context(C),
+  Composite(new Module(modname, C)),
+  Flags(flags),
+  Error(),
+  ProgramName(progname) { }
+
+Linker::Linker(StringRef progname, Module* aModule, unsigned flags) :
+  Context(aModule->getContext()),
+  Composite(aModule),
+  Flags(flags),
+  Error(),
+  ProgramName(progname) { }
+
+Linker::~Linker() {
+  delete Composite;
+}
+
+bool
+Linker::error(StringRef message) {
+  Error = message;
+  if (!(Flags&QuietErrors))
+    errs() << ProgramName << ": error: " << message << "\n";
+  return true;
+}
+
+bool
+Linker::warning(StringRef message) {
+  Error = message;
+  if (!(Flags&QuietWarnings))
+    errs() << ProgramName << ": warning: " << message << "\n";
+  return false;
+}
+
+void
+Linker::verbose(StringRef message) {
+  if (Flags&Verbose)
+    errs() << "  " << message << "\n";
+}
+
+Module*
+Linker::releaseModule() {
+  Module* result = Composite;
+  Error.clear();
+  Composite = 0;
+  Flags = 0;
+  return result;
+}
diff --git a/contrib/llvm/lib/MC/ELFObjectWriter.cpp b/contrib/llvm/lib/MC/ELFObjectWriter.cpp
new file mode 100644
index 000000000000..3d995484e7c7
--- /dev/null
+++ b/contrib/llvm/lib/MC/ELFObjectWriter.cpp
@@ -0,0 +1,1594 @@
+//===- lib/MC/ELFObjectWriter.cpp - ELF File Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF object file writer information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <vector>
+using namespace llvm;
+
+#undef  DEBUG_TYPE
+#define DEBUG_TYPE "reloc-info"
+
+namespace {
+class ELFObjectWriter : public MCObjectWriter {
+  protected:
+
+    static bool isFixupKindPCRel(const MCAssembler &Asm, unsigned Kind);
+    static bool RelocNeedsGOT(MCSymbolRefExpr::VariantKind Variant);
+    static uint64_t SymbolValue(MCSymbolData &Data, const MCAsmLayout &Layout);
+    static bool isInSymtab(const MCAssembler &Asm, const MCSymbolData &Data,
+                           bool Used, bool Renamed);
+    static bool isLocal(const MCSymbolData &Data, bool isSignature,
+                        bool isUsedInReloc);
+    static bool IsELFMetaDataSection(const MCSectionData &SD);
+    static uint64_t DataSectionSize(const MCSectionData &SD);
+    static uint64_t GetSectionFileSize(const MCAsmLayout &Layout,
+                                       const MCSectionData &SD);
+    static uint64_t GetSectionAddressSize(const MCAsmLayout &Layout,
+                                          const MCSectionData &SD);
+
+    void WriteDataSectionData(MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCSectionELF &Section);
+
+    /*static bool isFixupKindX86RIPRel(unsigned Kind) {
+      return Kind == X86::reloc_riprel_4byte ||
+        Kind == X86::reloc_riprel_4byte_movq_load;
+    }*/
+
+    /// ELFSymbolData - Helper struct for containing some precomputed
+    /// information on symbols.
+    struct ELFSymbolData {
+      MCSymbolData *SymbolData;
+      uint64_t StringIndex;
+      uint32_t SectionIndex;
+
+      // Support lexicographic sorting.
+      bool operator<(const ELFSymbolData &RHS) const {
+        if (MCELF::GetType(*SymbolData) == ELF::STT_FILE)
+          return true;
+        if (MCELF::GetType(*RHS.SymbolData) == ELF::STT_FILE)
+          return false;
+        return SymbolData->getSymbol().getName() <
+               RHS.SymbolData->getSymbol().getName();
+      }
+    };
+
+    /// The target specific ELF writer instance.
+    llvm::OwningPtr<MCELFObjectTargetWriter> TargetObjectWriter;
+
+    SmallPtrSet<const MCSymbol *, 16> UsedInReloc;
+    SmallPtrSet<const MCSymbol *, 16> WeakrefUsedInReloc;
+    DenseMap<const MCSymbol *, const MCSymbol *> Renames;
+
+    llvm::DenseMap<const MCSectionData*,
+                   std::vector<ELFRelocationEntry> > Relocations;
+    DenseMap<const MCSection*, uint64_t> SectionStringTableIndex;
+
+    /// @}
+    /// @name Symbol Table Data
+    /// @{
+
+    SmallString<256> StringTable;
+    std::vector<ELFSymbolData> LocalSymbolData;
+    std::vector<ELFSymbolData> ExternalSymbolData;
+    std::vector<ELFSymbolData> UndefinedSymbolData;
+
+    /// @}
+
+    bool NeedsGOT;
+
+    bool NeedsSymtabShndx;
+
+    // This holds the symbol table index of the last local symbol.
+    unsigned LastLocalSymbolIndex;
+    // This holds the .strtab section index.
+    unsigned StringTableIndex;
+    // This holds the .symtab section index.
+    unsigned SymbolTableIndex;
+
+    unsigned ShstrtabIndex;
+
+
+    const MCSymbol *SymbolToReloc(const MCAssembler &Asm,
+                                  const MCValue &Target,
+                                  const MCFragment &F,
+                                  const MCFixup &Fixup,
+                                  bool IsPCRel) const;
+
+    // TargetObjectWriter wrappers.
+    const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
+                                   const MCValue &Target,
+                                   const MCFragment &F,
+                                   const MCFixup &Fixup,
+                                   bool IsPCRel) const {
+      return TargetObjectWriter->ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
+    }
+    const MCSymbol *undefinedExplicitRelSym(const MCValue &Target,
+                                            const MCFixup &Fixup,
+                                            bool IsPCRel) const {
+      return TargetObjectWriter->undefinedExplicitRelSym(Target, Fixup,
+                                                         IsPCRel);
+    }
+
+    bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
+    bool hasRelocationAddend() const {
+      return TargetObjectWriter->hasRelocationAddend();
+    }
+    unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                          bool IsPCRel, bool IsRelocWithSymbol,
+                          int64_t Addend) const {
+      return TargetObjectWriter->GetRelocType(Target, Fixup, IsPCRel,
+                                              IsRelocWithSymbol, Addend);
+    }
+
+  public:
+    ELFObjectWriter(MCELFObjectTargetWriter *MOTW,
+                    raw_ostream &_OS, bool IsLittleEndian)
+      : MCObjectWriter(_OS, IsLittleEndian),
+        TargetObjectWriter(MOTW),
+        NeedsGOT(false), NeedsSymtabShndx(false) {
+    }
+
+    virtual ~ELFObjectWriter();
+
+    void WriteWord(uint64_t W) {
+      if (is64Bit())
+        Write64(W);
+      else
+        Write32(W);
+    }
+
+    void StringLE16(char *buf, uint16_t Value) {
+      buf[0] = char(Value >> 0);
+      buf[1] = char(Value >> 8);
+    }
+
+    void StringLE32(char *buf, uint32_t Value) {
+      StringLE16(buf, uint16_t(Value >> 0));
+      StringLE16(buf + 2, uint16_t(Value >> 16));
+    }
+
+    void StringLE64(char *buf, uint64_t Value) {
+      StringLE32(buf, uint32_t(Value >> 0));
+      StringLE32(buf + 4, uint32_t(Value >> 32));
+    }
+
+    void StringBE16(char *buf ,uint16_t Value) {
+      buf[0] = char(Value >> 8);
+      buf[1] = char(Value >> 0);
+    }
+
+    void StringBE32(char *buf, uint32_t Value) {
+      StringBE16(buf, uint16_t(Value >> 16));
+      StringBE16(buf + 2, uint16_t(Value >> 0));
+    }
+
+    void StringBE64(char *buf, uint64_t Value) {
+      StringBE32(buf, uint32_t(Value >> 32));
+      StringBE32(buf + 4, uint32_t(Value >> 0));
+    }
+
+    void String8(MCDataFragment &F, uint8_t Value) {
+      char buf[1];
+      buf[0] = Value;
+      F.getContents().append(&buf[0], &buf[1]);
+    }
+
+    void String16(MCDataFragment &F, uint16_t Value) {
+      char buf[2];
+      if (isLittleEndian())
+        StringLE16(buf, Value);
+      else
+        StringBE16(buf, Value);
+      F.getContents().append(&buf[0], &buf[2]);
+    }
+
+    void String32(MCDataFragment &F, uint32_t Value) {
+      char buf[4];
+      if (isLittleEndian())
+        StringLE32(buf, Value);
+      else
+        StringBE32(buf, Value);
+      F.getContents().append(&buf[0], &buf[4]);
+    }
+
+    void String64(MCDataFragment &F, uint64_t Value) {
+      char buf[8];
+      if (isLittleEndian())
+        StringLE64(buf, Value);
+      else
+        StringBE64(buf, Value);
+      F.getContents().append(&buf[0], &buf[8]);
+    }
+
+    void WriteHeader(const MCAssembler &Asm,
+                     uint64_t SectionDataSize,
+                     unsigned NumberOfSections);
+
+    void WriteSymbolEntry(MCDataFragment *SymtabF,
+                          MCDataFragment *ShndxF,
+                          uint64_t name, uint8_t info,
+                          uint64_t value, uint64_t size,
+                          uint8_t other, uint32_t shndx,
+                          bool Reserved);
+
+    void WriteSymbol(MCDataFragment *SymtabF,  MCDataFragment *ShndxF,
+                     ELFSymbolData &MSD,
+                     const MCAsmLayout &Layout);
+
+    typedef DenseMap<const MCSectionELF*, uint32_t> SectionIndexMapTy;
+    void WriteSymbolTable(MCDataFragment *SymtabF,
+                          MCDataFragment *ShndxF,
+                          const MCAssembler &Asm,
+                          const MCAsmLayout &Layout,
+                          const SectionIndexMapTy &SectionIndexMap);
+
+    virtual void RecordRelocation(const MCAssembler &Asm,
+                                  const MCAsmLayout &Layout,
+                                  const MCFragment *Fragment,
+                                  const MCFixup &Fixup,
+                                  MCValue Target, uint64_t &FixedValue);
+
+    uint64_t getSymbolIndexInSymbolTable(const MCAssembler &Asm,
+                                         const MCSymbol *S);
+
+    // Map from a group section to the signature symbol
+    typedef DenseMap<const MCSectionELF*, const MCSymbol*> GroupMapTy;
+    // Map from a signature symbol to the group section
+    typedef DenseMap<const MCSymbol*, const MCSectionELF*> RevGroupMapTy;
+    // Map from a section to the section with the relocations
+    typedef DenseMap<const MCSectionELF*, const MCSectionELF*> RelMapTy;
+    // Map from a section to its offset
+    typedef DenseMap<const MCSectionELF*, uint64_t> SectionOffsetMapTy;
+
+    /// ComputeSymbolTable - Compute the symbol table data
+    ///
+    /// \param Asm - The assembler.
+    /// \param SectionIndexMap - Maps a section to its index.
+    /// \param RevGroupMap - Maps a signature symbol to the group section.
+    /// \param NumRegularSections - Number of non-relocation sections.
+    void ComputeSymbolTable(MCAssembler &Asm,
+                            const SectionIndexMapTy &SectionIndexMap,
+                            RevGroupMapTy RevGroupMap,
+                            unsigned NumRegularSections);
+
+    void ComputeIndexMap(MCAssembler &Asm,
+                         SectionIndexMapTy &SectionIndexMap,
+                         const RelMapTy &RelMap);
+
+    void CreateRelocationSections(MCAssembler &Asm, MCAsmLayout &Layout,
+                                  RelMapTy &RelMap);
+
+    void WriteRelocations(MCAssembler &Asm, MCAsmLayout &Layout,
+                          const RelMapTy &RelMap);
+
+    void CreateMetadataSections(MCAssembler &Asm, MCAsmLayout &Layout,
+                                SectionIndexMapTy &SectionIndexMap,
+                                const RelMapTy &RelMap);
+
+    // Create the sections that show up in the symbol table. Currently
+    // those are the .note.GNU-stack section and the group sections.
+    void CreateIndexedSections(MCAssembler &Asm, MCAsmLayout &Layout,
+                               GroupMapTy &GroupMap,
+                               RevGroupMapTy &RevGroupMap,
+                               SectionIndexMapTy &SectionIndexMap,
+                               const RelMapTy &RelMap);
+
+    virtual void ExecutePostLayoutBinding(MCAssembler &Asm,
+                                          const MCAsmLayout &Layout);
+
+    void WriteSectionHeader(MCAssembler &Asm, const GroupMapTy &GroupMap,
+                            const MCAsmLayout &Layout,
+                            const SectionIndexMapTy &SectionIndexMap,
+                            const SectionOffsetMapTy &SectionOffsetMap);
+
+    void ComputeSectionOrder(MCAssembler &Asm,
+                             std::vector<const MCSectionELF*> &Sections);
+
+    void WriteSecHdrEntry(uint32_t Name, uint32_t Type, uint64_t Flags,
+                          uint64_t Address, uint64_t Offset,
+                          uint64_t Size, uint32_t Link, uint32_t Info,
+                          uint64_t Alignment, uint64_t EntrySize);
+
+    void WriteRelocationsFragment(const MCAssembler &Asm,
+                                  MCDataFragment *F,
+                                  const MCSectionData *SD);
+
+    virtual bool
+    IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+                                           const MCSymbolData &DataA,
+                                           const MCFragment &FB,
+                                           bool InSet,
+                                           bool IsPCRel) const;
+
+    virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout);
+    void WriteSection(MCAssembler &Asm,
+                      const SectionIndexMapTy &SectionIndexMap,
+                      uint32_t GroupSymbolIndex,
+                      uint64_t Offset, uint64_t Size, uint64_t Alignment,
+                      const MCSectionELF &Section);
+  };
+}
+
+bool ELFObjectWriter::isFixupKindPCRel(const MCAssembler &Asm, unsigned Kind) {
+  const MCFixupKindInfo &FKI =
+    Asm.getBackend().getFixupKindInfo((MCFixupKind) Kind);
+
+  return FKI.Flags & MCFixupKindInfo::FKF_IsPCRel;
+}
+
+bool ELFObjectWriter::RelocNeedsGOT(MCSymbolRefExpr::VariantKind Variant) {
+  switch (Variant) {
+  default:
+    return false;
+  case MCSymbolRefExpr::VK_GOT:
+  case MCSymbolRefExpr::VK_PLT:
+  case MCSymbolRefExpr::VK_GOTPCREL:
+  case MCSymbolRefExpr::VK_GOTOFF:
+  case MCSymbolRefExpr::VK_TPOFF:
+  case MCSymbolRefExpr::VK_TLSGD:
+  case MCSymbolRefExpr::VK_GOTTPOFF:
+  case MCSymbolRefExpr::VK_INDNTPOFF:
+  case MCSymbolRefExpr::VK_NTPOFF:
+  case MCSymbolRefExpr::VK_GOTNTPOFF:
+  case MCSymbolRefExpr::VK_TLSLDM:
+  case MCSymbolRefExpr::VK_DTPOFF:
+  case MCSymbolRefExpr::VK_TLSLD:
+    return true;
+  }
+}
+
+ELFObjectWriter::~ELFObjectWriter()
+{}
+
+// Emit the ELF header.
+void ELFObjectWriter::WriteHeader(const MCAssembler &Asm,
+                                  uint64_t SectionDataSize,
+                                  unsigned NumberOfSections) {
+  // ELF Header
+  // ----------
+  //
+  // Note
+  // ----
+  // emitWord method behaves differently for ELF32 and ELF64, writing
+  // 4 bytes in the former and 8 in the latter.
+
+  Write8(0x7f); // e_ident[EI_MAG0]
+  Write8('E');  // e_ident[EI_MAG1]
+  Write8('L');  // e_ident[EI_MAG2]
+  Write8('F');  // e_ident[EI_MAG3]
+
+  Write8(is64Bit() ? ELF::ELFCLASS64 : ELF::ELFCLASS32); // e_ident[EI_CLASS]
+
+  // e_ident[EI_DATA]
+  Write8(isLittleEndian() ? ELF::ELFDATA2LSB : ELF::ELFDATA2MSB);
+
+  Write8(ELF::EV_CURRENT);        // e_ident[EI_VERSION]
+  // e_ident[EI_OSABI]
+  Write8(TargetObjectWriter->getOSABI());
+  Write8(0);                  // e_ident[EI_ABIVERSION]
+
+  WriteZeros(ELF::EI_NIDENT - ELF::EI_PAD);
+
+  Write16(ELF::ET_REL);             // e_type
+
+  Write16(TargetObjectWriter->getEMachine()); // e_machine = target
+
+  Write32(ELF::EV_CURRENT);         // e_version
+  WriteWord(0);                    // e_entry, no entry point in .o file
+  WriteWord(0);                    // e_phoff, no program header for .o
+  WriteWord(SectionDataSize + (is64Bit() ? sizeof(ELF::Elf64_Ehdr) :
+            sizeof(ELF::Elf32_Ehdr)));  // e_shoff = sec hdr table off in bytes
+
+  // e_flags = whatever the target wants
+  Write32(Asm.getELFHeaderEFlags());
+
+  // e_ehsize = ELF header size
+  Write16(is64Bit() ? sizeof(ELF::Elf64_Ehdr) : sizeof(ELF::Elf32_Ehdr));
+
+  Write16(0);                  // e_phentsize = prog header entry size
+  Write16(0);                  // e_phnum = # prog header entries = 0
+
+  // e_shentsize = Section header entry size
+  Write16(is64Bit() ? sizeof(ELF::Elf64_Shdr) : sizeof(ELF::Elf32_Shdr));
+
+  // e_shnum     = # of section header ents
+  if (NumberOfSections >= ELF::SHN_LORESERVE)
+    Write16(ELF::SHN_UNDEF);
+  else
+    Write16(NumberOfSections);
+
+  // e_shstrndx  = Section # of '.shstrtab'
+  if (ShstrtabIndex >= ELF::SHN_LORESERVE)
+    Write16(ELF::SHN_XINDEX);
+  else
+    Write16(ShstrtabIndex);
+}
+
+void ELFObjectWriter::WriteSymbolEntry(MCDataFragment *SymtabF,
+                                       MCDataFragment *ShndxF,
+                                       uint64_t name,
+                                       uint8_t info, uint64_t value,
+                                       uint64_t size, uint8_t other,
+                                       uint32_t shndx,
+                                       bool Reserved) {
+  if (ShndxF) {
+    if (shndx >= ELF::SHN_LORESERVE && !Reserved)
+      String32(*ShndxF, shndx);
+    else
+      String32(*ShndxF, 0);
+  }
+
+  uint16_t Index = (shndx >= ELF::SHN_LORESERVE && !Reserved) ?
+    uint16_t(ELF::SHN_XINDEX) : shndx;
+
+  if (is64Bit()) {
+    String32(*SymtabF, name);  // st_name
+    String8(*SymtabF, info);   // st_info
+    String8(*SymtabF, other);  // st_other
+    String16(*SymtabF, Index); // st_shndx
+    String64(*SymtabF, value); // st_value
+    String64(*SymtabF, size);  // st_size
+  } else {
+    String32(*SymtabF, name);  // st_name
+    String32(*SymtabF, value); // st_value
+    String32(*SymtabF, size);  // st_size
+    String8(*SymtabF, info);   // st_info
+    String8(*SymtabF, other);  // st_other
+    String16(*SymtabF, Index); // st_shndx
+  }
+}
+
+uint64_t ELFObjectWriter::SymbolValue(MCSymbolData &Data,
+                                      const MCAsmLayout &Layout) {
+  if (Data.isCommon() && Data.isExternal())
+    return Data.getCommonAlignment();
+
+  const MCSymbol &Symbol = Data.getSymbol();
+
+  if (Symbol.isAbsolute() && Symbol.isVariable()) {
+    if (const MCExpr *Value = Symbol.getVariableValue()) {
+      int64_t IntValue;
+      if (Value->EvaluateAsAbsolute(IntValue, Layout))
+        return (uint64_t)IntValue;
+    }
+  }
+
+  if (!Symbol.isInSection())
+    return 0;
+
+
+  if (Data.getFragment()) {
+    if (Data.getFlags() & ELF_Other_ThumbFunc)
+      return Layout.getSymbolOffset(&Data)+1;
+    else
+      return Layout.getSymbolOffset(&Data);
+  }
+
+  return 0;
+}
+
+void ELFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
+                                               const MCAsmLayout &Layout) {
+  // The presence of symbol versions causes undefined symbols and
+  // versions declared with @@@ to be renamed.
+
+  for (MCAssembler::symbol_iterator it = Asm.symbol_begin(),
+         ie = Asm.symbol_end(); it != ie; ++it) {
+    const MCSymbol &Alias = it->getSymbol();
+    const MCSymbol &Symbol = Alias.AliasedSymbol();
+    MCSymbolData &SD = Asm.getSymbolData(Symbol);
+
+    // Not an alias.
+    if (&Symbol == &Alias)
+      continue;
+
+    StringRef AliasName = Alias.getName();
+    size_t Pos = AliasName.find('@');
+    if (Pos == StringRef::npos)
+      continue;
+
+    // Aliases defined with .symvar copy the binding from the symbol they alias.
+    // This is the first place we are able to copy this information.
+    it->setExternal(SD.isExternal());
+    MCELF::SetBinding(*it, MCELF::GetBinding(SD));
+
+    StringRef Rest = AliasName.substr(Pos);
+    if (!Symbol.isUndefined() && !Rest.startswith("@@@"))
+      continue;
+
+    // FIXME: produce a better error message.
+    if (Symbol.isUndefined() && Rest.startswith("@@") &&
+        !Rest.startswith("@@@"))
+      report_fatal_error("A @@ version cannot be undefined");
+
+    Renames.insert(std::make_pair(&Symbol, &Alias));
+  }
+}
+
+void ELFObjectWriter::WriteSymbol(MCDataFragment *SymtabF,
+                                  MCDataFragment *ShndxF,
+                                  ELFSymbolData &MSD,
+                                  const MCAsmLayout &Layout) {
+  MCSymbolData &OrigData = *MSD.SymbolData;
+  MCSymbolData &Data =
+    Layout.getAssembler().getSymbolData(OrigData.getSymbol().AliasedSymbol());
+
+  bool IsReserved = Data.isCommon() || Data.getSymbol().isAbsolute() ||
+    Data.getSymbol().isVariable();
+
+  // Binding and Type share the same byte as upper and lower nibbles
+  uint8_t Binding = MCELF::GetBinding(OrigData);
+  uint8_t Type = MCELF::GetType(Data);
+  uint8_t Info = (Binding << ELF_STB_Shift) | (Type << ELF_STT_Shift);
+
+  // Other and Visibility share the same byte with Visability using the lower
+  // 2 bits
+  uint8_t Visibility = MCELF::GetVisibility(OrigData);
+  uint8_t Other = MCELF::getOther(OrigData) <<
+    (ELF_Other_Shift - ELF_STV_Shift);
+  Other |= Visibility;
+
+  uint64_t Value = SymbolValue(Data, Layout);
+  uint64_t Size = 0;
+
+  assert(!(Data.isCommon() && !Data.isExternal()));
+
+  const MCExpr *ESize = Data.getSize();
+  if (ESize) {
+    int64_t Res;
+    if (!ESize->EvaluateAsAbsolute(Res, Layout))
+      report_fatal_error("Size expression must be absolute.");
+    Size = Res;
+  }
+
+  // Write out the symbol table entry
+  WriteSymbolEntry(SymtabF, ShndxF, MSD.StringIndex, Info, Value,
+                   Size, Other, MSD.SectionIndex, IsReserved);
+}
+
+void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF,
+                                       MCDataFragment *ShndxF,
+                                       const MCAssembler &Asm,
+                                       const MCAsmLayout &Layout,
+                                    const SectionIndexMapTy &SectionIndexMap) {
+  // The string table must be emitted first because we need the index
+  // into the string table for all the symbol names.
+  assert(StringTable.size() && "Missing string table");
+
+  // FIXME: Make sure the start of the symbol table is aligned.
+
+  // The first entry is the undefined symbol entry.
+  WriteSymbolEntry(SymtabF, ShndxF, 0, 0, 0, 0, 0, 0, false);
+
+  // Write the symbol table entries.
+  LastLocalSymbolIndex = LocalSymbolData.size() + 1;
+  for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i) {
+    ELFSymbolData &MSD = LocalSymbolData[i];
+    WriteSymbol(SymtabF, ShndxF, MSD, Layout);
+  }
+
+  // Write out a symbol table entry for each regular section.
+  for (MCAssembler::const_iterator i = Asm.begin(), e = Asm.end(); i != e;
+       ++i) {
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF&>(i->getSection());
+    if (Section.getType() == ELF::SHT_RELA ||
+        Section.getType() == ELF::SHT_REL ||
+        Section.getType() == ELF::SHT_STRTAB ||
+        Section.getType() == ELF::SHT_SYMTAB ||
+        Section.getType() == ELF::SHT_SYMTAB_SHNDX)
+      continue;
+    WriteSymbolEntry(SymtabF, ShndxF, 0, ELF::STT_SECTION, 0, 0,
+                     ELF::STV_DEFAULT, SectionIndexMap.lookup(&Section),
+                     false);
+    LastLocalSymbolIndex++;
+  }
+
+  for (unsigned i = 0, e = ExternalSymbolData.size(); i != e; ++i) {
+    ELFSymbolData &MSD = ExternalSymbolData[i];
+    MCSymbolData &Data = *MSD.SymbolData;
+    assert(((Data.getFlags() & ELF_STB_Global) ||
+            (Data.getFlags() & ELF_STB_Weak)) &&
+           "External symbol requires STB_GLOBAL or STB_WEAK flag");
+    WriteSymbol(SymtabF, ShndxF, MSD, Layout);
+    if (MCELF::GetBinding(Data) == ELF::STB_LOCAL)
+      LastLocalSymbolIndex++;
+  }
+
+  for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i) {
+    ELFSymbolData &MSD = UndefinedSymbolData[i];
+    MCSymbolData &Data = *MSD.SymbolData;
+    WriteSymbol(SymtabF, ShndxF, MSD, Layout);
+    if (MCELF::GetBinding(Data) == ELF::STB_LOCAL)
+      LastLocalSymbolIndex++;
+  }
+}
+
+const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm,
+                                               const MCValue &Target,
+                                               const MCFragment &F,
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const {
+  const MCSymbol &Symbol = Target.getSymA()->getSymbol();
+  const MCSymbol &ASymbol = Symbol.AliasedSymbol();
+  const MCSymbol *Renamed = Renames.lookup(&Symbol);
+  const MCSymbolData &SD = Asm.getSymbolData(Symbol);
+
+  if (ASymbol.isUndefined()) {
+    if (Renamed)
+      return Renamed;
+    return undefinedExplicitRelSym(Target, Fixup, IsPCRel);
+  }
+
+  if (SD.isExternal()) {
+    if (Renamed)
+      return Renamed;
+    return &Symbol;
+  }
+
+  const MCSectionELF &Section =
+    static_cast<const MCSectionELF&>(ASymbol.getSection());
+  const SectionKind secKind = Section.getKind();
+
+  if (secKind.isBSS())
+    return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
+
+  if (secKind.isThreadLocal()) {
+    if (Renamed)
+      return Renamed;
+    return &Symbol;
+  }
+
+  MCSymbolRefExpr::VariantKind Kind = Target.getSymA()->getKind();
+  const MCSectionELF &Sec2 =
+    static_cast<const MCSectionELF&>(F.getParent()->getSection());
+
+  if (&Sec2 != &Section &&
+      (Kind == MCSymbolRefExpr::VK_PLT ||
+       Kind == MCSymbolRefExpr::VK_GOTPCREL ||
+       Kind == MCSymbolRefExpr::VK_GOTOFF)) {
+    if (Renamed)
+      return Renamed;
+    return &Symbol;
+  }
+
+  if (Section.getFlags() & ELF::SHF_MERGE) {
+    if (Target.getConstant() == 0)
+      return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
+    if (Renamed)
+      return Renamed;
+    return &Symbol;
+  }
+
+  return ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
+
+}
+
+
+void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm,
+                                       const MCAsmLayout &Layout,
+                                       const MCFragment *Fragment,
+                                       const MCFixup &Fixup,
+                                       MCValue Target,
+                                       uint64_t &FixedValue) {
+  int64_t Addend = 0;
+  int Index = 0;
+  int64_t Value = Target.getConstant();
+  const MCSymbol *RelocSymbol = NULL;
+
+  bool IsPCRel = isFixupKindPCRel(Asm, Fixup.getKind());
+  if (!Target.isAbsolute()) {
+    const MCSymbol &Symbol = Target.getSymA()->getSymbol();
+    const MCSymbol &ASymbol = Symbol.AliasedSymbol();
+    RelocSymbol = SymbolToReloc(Asm, Target, *Fragment, Fixup, IsPCRel);
+
+    if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
+      const MCSymbol &SymbolB = RefB->getSymbol();
+      MCSymbolData &SDB = Asm.getSymbolData(SymbolB);
+      IsPCRel = true;
+
+      // Offset of the symbol in the section
+      int64_t a = Layout.getSymbolOffset(&SDB);
+
+      // Offset of the relocation in the section
+      int64_t b = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+      Value += b - a;
+    }
+
+    if (!RelocSymbol) {
+      MCSymbolData &SD = Asm.getSymbolData(ASymbol);
+      MCFragment *F = SD.getFragment();
+
+      if (F) {
+        Index = F->getParent()->getOrdinal() + 1;
+        // Offset of the symbol in the section
+        Value += Layout.getSymbolOffset(&SD);
+      } else {
+        Index = 0;
+      }
+    } else {
+      if (Asm.getSymbolData(Symbol).getFlags() & ELF_Other_Weakref)
+        WeakrefUsedInReloc.insert(RelocSymbol);
+      else
+        UsedInReloc.insert(RelocSymbol);
+      Index = -1;
+    }
+    Addend = Value;
+    if (hasRelocationAddend())
+      Value = 0;
+  }
+
+  FixedValue = Value;
+  unsigned Type = GetRelocType(Target, Fixup, IsPCRel,
+                               (RelocSymbol != 0), Addend);
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+  if (RelocNeedsGOT(Modifier))
+    NeedsGOT = true;
+
+  uint64_t RelocOffset = Layout.getFragmentOffset(Fragment) +
+    Fixup.getOffset();
+
+  // FIXME: no tests cover this. Is adjustFixupOffset dead code?
+  TargetObjectWriter->adjustFixupOffset(Fixup, RelocOffset);
+
+  if (!hasRelocationAddend())
+    Addend = 0;
+
+  if (is64Bit())
+    assert(isInt<64>(Addend));
+  else
+    assert(isInt<32>(Addend));
+
+  ELFRelocationEntry ERE(RelocOffset, Index, Type, RelocSymbol, Addend, Fixup);
+  Relocations[Fragment->getParent()].push_back(ERE);
+}
+
+
+uint64_t
+ELFObjectWriter::getSymbolIndexInSymbolTable(const MCAssembler &Asm,
+                                             const MCSymbol *S) {
+  MCSymbolData &SD = Asm.getSymbolData(*S);
+  return SD.getIndex();
+}
+
+bool ELFObjectWriter::isInSymtab(const MCAssembler &Asm,
+                                 const MCSymbolData &Data,
+                                 bool Used, bool Renamed) {
+  if (Data.getFlags() & ELF_Other_Weakref)
+    return false;
+
+  if (Used)
+    return true;
+
+  if (Renamed)
+    return false;
+
+  const MCSymbol &Symbol = Data.getSymbol();
+
+  if (Symbol.getName() == "_GLOBAL_OFFSET_TABLE_")
+    return true;
+
+  const MCSymbol &A = Symbol.AliasedSymbol();
+  if (Symbol.isVariable() && !A.isVariable() && A.isUndefined())
+    return false;
+
+  bool IsGlobal = MCELF::GetBinding(Data) == ELF::STB_GLOBAL;
+  if (!Symbol.isVariable() && Symbol.isUndefined() && !IsGlobal)
+    return false;
+
+  if (!Asm.isSymbolLinkerVisible(Symbol) && !Symbol.isUndefined())
+    return false;
+
+  if (Symbol.isTemporary())
+    return false;
+
+  return true;
+}
+
+bool ELFObjectWriter::isLocal(const MCSymbolData &Data, bool isSignature,
+                              bool isUsedInReloc) {
+  if (Data.isExternal())
+    return false;
+
+  const MCSymbol &Symbol = Data.getSymbol();
+  const MCSymbol &RefSymbol = Symbol.AliasedSymbol();
+
+  if (RefSymbol.isUndefined() && !RefSymbol.isVariable()) {
+    if (isSignature && !isUsedInReloc)
+      return true;
+
+    return false;
+  }
+
+  return true;
+}
+
+void ELFObjectWriter::ComputeIndexMap(MCAssembler &Asm,
+                                      SectionIndexMapTy &SectionIndexMap,
+                                      const RelMapTy &RelMap) {
+  unsigned Index = 1;
+  for (MCAssembler::iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF &>(it->getSection());
+    if (Section.getType() != ELF::SHT_GROUP)
+      continue;
+    SectionIndexMap[&Section] = Index++;
+  }
+
+  for (MCAssembler::iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF &>(it->getSection());
+    if (Section.getType() == ELF::SHT_GROUP ||
+        Section.getType() == ELF::SHT_REL ||
+        Section.getType() == ELF::SHT_RELA)
+      continue;
+    SectionIndexMap[&Section] = Index++;
+    const MCSectionELF *RelSection = RelMap.lookup(&Section);
+    if (RelSection)
+      SectionIndexMap[RelSection] = Index++;
+  }
+}
+
+void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
+                                      const SectionIndexMapTy &SectionIndexMap,
+                                         RevGroupMapTy RevGroupMap,
+                                         unsigned NumRegularSections) {
+  // FIXME: Is this the correct place to do this?
+  // FIXME: Why is an undefined reference to _GLOBAL_OFFSET_TABLE_ needed?
+  if (NeedsGOT) {
+    StringRef Name = "_GLOBAL_OFFSET_TABLE_";
+    MCSymbol *Sym = Asm.getContext().GetOrCreateSymbol(Name);
+    MCSymbolData &Data = Asm.getOrCreateSymbolData(*Sym);
+    Data.setExternal(true);
+    MCELF::SetBinding(Data, ELF::STB_GLOBAL);
+  }
+
+  // Index 0 is always the empty string.
+  StringMap<uint64_t> StringIndexMap;
+  StringTable += '\x00';
+
+  // FIXME: We could optimize suffixes in strtab in the same way we
+  // optimize them in shstrtab.
+
+  // Add the data for the symbols.
+  for (MCAssembler::symbol_iterator it = Asm.symbol_begin(),
+         ie = Asm.symbol_end(); it != ie; ++it) {
+    const MCSymbol &Symbol = it->getSymbol();
+
+    bool Used = UsedInReloc.count(&Symbol);
+    bool WeakrefUsed = WeakrefUsedInReloc.count(&Symbol);
+    bool isSignature = RevGroupMap.count(&Symbol);
+
+    if (!isInSymtab(Asm, *it,
+                    Used || WeakrefUsed || isSignature,
+                    Renames.count(&Symbol)))
+      continue;
+
+    ELFSymbolData MSD;
+    MSD.SymbolData = it;
+    const MCSymbol &RefSymbol = Symbol.AliasedSymbol();
+
+    // Undefined symbols are global, but this is the first place we
+    // are able to set it.
+    bool Local = isLocal(*it, isSignature, Used);
+    if (!Local && MCELF::GetBinding(*it) == ELF::STB_LOCAL) {
+      MCSymbolData &SD = Asm.getSymbolData(RefSymbol);
+      MCELF::SetBinding(*it, ELF::STB_GLOBAL);
+      MCELF::SetBinding(SD, ELF::STB_GLOBAL);
+    }
+
+    if (RefSymbol.isUndefined() && !Used && WeakrefUsed)
+      MCELF::SetBinding(*it, ELF::STB_WEAK);
+
+    if (it->isCommon()) {
+      assert(!Local);
+      MSD.SectionIndex = ELF::SHN_COMMON;
+    } else if (Symbol.isAbsolute() || RefSymbol.isVariable()) {
+      MSD.SectionIndex = ELF::SHN_ABS;
+    } else if (RefSymbol.isUndefined()) {
+      if (isSignature && !Used)
+        MSD.SectionIndex = SectionIndexMap.lookup(RevGroupMap[&Symbol]);
+      else
+        MSD.SectionIndex = ELF::SHN_UNDEF;
+    } else {
+      const MCSectionELF &Section =
+        static_cast<const MCSectionELF&>(RefSymbol.getSection());
+      MSD.SectionIndex = SectionIndexMap.lookup(&Section);
+      if (MSD.SectionIndex >= ELF::SHN_LORESERVE)
+        NeedsSymtabShndx = true;
+      assert(MSD.SectionIndex && "Invalid section index!");
+    }
+
+    // The @@@ in symbol version is replaced with @ in undefined symbols and
+    // @@ in defined ones.
+    StringRef Name = Symbol.getName();
+    SmallString<32> Buf;
+
+    size_t Pos = Name.find("@@@");
+    if (Pos != StringRef::npos) {
+      Buf += Name.substr(0, Pos);
+      unsigned Skip = MSD.SectionIndex == ELF::SHN_UNDEF ? 2 : 1;
+      Buf += Name.substr(Pos + Skip);
+      Name = Buf;
+    }
+
+    uint64_t &Entry = StringIndexMap[Name];
+    if (!Entry) {
+      Entry = StringTable.size();
+      StringTable += Name;
+      StringTable += '\x00';
+    }
+    MSD.StringIndex = Entry;
+    if (MSD.SectionIndex == ELF::SHN_UNDEF)
+      UndefinedSymbolData.push_back(MSD);
+    else if (Local)
+      LocalSymbolData.push_back(MSD);
+    else
+      ExternalSymbolData.push_back(MSD);
+  }
+
+  // Symbols are required to be in lexicographic order.
+  array_pod_sort(LocalSymbolData.begin(), LocalSymbolData.end());
+  array_pod_sort(ExternalSymbolData.begin(), ExternalSymbolData.end());
+  array_pod_sort(UndefinedSymbolData.begin(), UndefinedSymbolData.end());
+
+  // Set the symbol indices. Local symbols must come before all other
+  // symbols with non-local bindings.
+  unsigned Index = 1;
+  for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i)
+    LocalSymbolData[i].SymbolData->setIndex(Index++);
+
+  Index += NumRegularSections;
+
+  for (unsigned i = 0, e = ExternalSymbolData.size(); i != e; ++i)
+    ExternalSymbolData[i].SymbolData->setIndex(Index++);
+  for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i)
+    UndefinedSymbolData[i].SymbolData->setIndex(Index++);
+
+  if (Index >= ELF::SHN_LORESERVE)
+    NeedsSymtabShndx = true;
+}
+
+void ELFObjectWriter::CreateRelocationSections(MCAssembler &Asm,
+                                               MCAsmLayout &Layout,
+                                               RelMapTy &RelMap) {
+  for (MCAssembler::const_iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    const MCSectionData &SD = *it;
+    if (Relocations[&SD].empty())
+      continue;
+
+    MCContext &Ctx = Asm.getContext();
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF&>(SD.getSection());
+
+    const StringRef SectionName = Section.getSectionName();
+    std::string RelaSectionName = hasRelocationAddend() ? ".rela" : ".rel";
+    RelaSectionName += SectionName;
+
+    unsigned EntrySize;
+    if (hasRelocationAddend())
+      EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rela) : sizeof(ELF::Elf32_Rela);
+    else
+      EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rel) : sizeof(ELF::Elf32_Rel);
+
+    const MCSectionELF *RelaSection =
+      Ctx.getELFSection(RelaSectionName, hasRelocationAddend() ?
+                        ELF::SHT_RELA : ELF::SHT_REL, 0,
+                        SectionKind::getReadOnly(),
+                        EntrySize, "");
+    RelMap[&Section] = RelaSection;
+    Asm.getOrCreateSectionData(*RelaSection);
+  }
+}
+
+void ELFObjectWriter::WriteRelocations(MCAssembler &Asm, MCAsmLayout &Layout,
+                                       const RelMapTy &RelMap) {
+  for (MCAssembler::const_iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    const MCSectionData &SD = *it;
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF&>(SD.getSection());
+
+    const MCSectionELF *RelaSection = RelMap.lookup(&Section);
+    if (!RelaSection)
+      continue;
+    MCSectionData &RelaSD = Asm.getOrCreateSectionData(*RelaSection);
+    RelaSD.setAlignment(is64Bit() ? 8 : 4);
+
+    MCDataFragment *F = new MCDataFragment(&RelaSD);
+    WriteRelocationsFragment(Asm, F, &*it);
+  }
+}
+
+void ELFObjectWriter::WriteSecHdrEntry(uint32_t Name, uint32_t Type,
+                                       uint64_t Flags, uint64_t Address,
+                                       uint64_t Offset, uint64_t Size,
+                                       uint32_t Link, uint32_t Info,
+                                       uint64_t Alignment,
+                                       uint64_t EntrySize) {
+  Write32(Name);        // sh_name: index into string table
+  Write32(Type);        // sh_type
+  WriteWord(Flags);     // sh_flags
+  WriteWord(Address);   // sh_addr
+  WriteWord(Offset);    // sh_offset
+  WriteWord(Size);      // sh_size
+  Write32(Link);        // sh_link
+  Write32(Info);        // sh_info
+  WriteWord(Alignment); // sh_addralign
+  WriteWord(EntrySize); // sh_entsize
+}
+
+void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm,
+                                               MCDataFragment *F,
+                                               const MCSectionData *SD) {
+  std::vector<ELFRelocationEntry> &Relocs = Relocations[SD];
+
+  // Sort the relocation entries. Most targets just sort by r_offset, but some
+  // (e.g., MIPS) have additional constraints.
+  TargetObjectWriter->sortRelocs(Asm, Relocs);
+
+  for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
+    ELFRelocationEntry entry = Relocs[e - i - 1];
+
+    if (!entry.Index)
+      ;
+    else if (entry.Index < 0)
+      entry.Index = getSymbolIndexInSymbolTable(Asm, entry.Symbol);
+    else
+      entry.Index += LocalSymbolData.size();
+    if (is64Bit()) {
+      String64(*F, entry.r_offset);
+      if (TargetObjectWriter->isN64()) {
+        String32(*F, entry.Index);
+
+        String8(*F, TargetObjectWriter->getRSsym(entry.Type));
+        String8(*F, TargetObjectWriter->getRType3(entry.Type));
+        String8(*F, TargetObjectWriter->getRType2(entry.Type));
+        String8(*F, TargetObjectWriter->getRType(entry.Type));
+      }
+      else {
+        struct ELF::Elf64_Rela ERE64;
+        ERE64.setSymbolAndType(entry.Index, entry.Type);
+        String64(*F, ERE64.r_info);
+      }
+      if (hasRelocationAddend())
+        String64(*F, entry.r_addend);
+    } else {
+      String32(*F, entry.r_offset);
+
+      struct ELF::Elf32_Rela ERE32;
+      ERE32.setSymbolAndType(entry.Index, entry.Type);
+      String32(*F, ERE32.r_info);
+
+      if (hasRelocationAddend())
+        String32(*F, entry.r_addend);
+    }
+  }
+}
+
+static int compareBySuffix(const void *a, const void *b) {
+  const MCSectionELF *secA = *static_cast<const MCSectionELF* const *>(a);
+  const MCSectionELF *secB = *static_cast<const MCSectionELF* const *>(b);
+  const StringRef &NameA = secA->getSectionName();
+  const StringRef &NameB = secB->getSectionName();
+  const unsigned sizeA = NameA.size();
+  const unsigned sizeB = NameB.size();
+  const unsigned len = std::min(sizeA, sizeB);
+  for (unsigned int i = 0; i < len; ++i) {
+    char ca = NameA[sizeA - i - 1];
+    char cb = NameB[sizeB - i - 1];
+    if (ca != cb)
+      return cb - ca;
+  }
+
+  return sizeB - sizeA;
+}
+
+void ELFObjectWriter::CreateMetadataSections(MCAssembler &Asm,
+                                             MCAsmLayout &Layout,
+                                             SectionIndexMapTy &SectionIndexMap,
+                                             const RelMapTy &RelMap) {
+  MCContext &Ctx = Asm.getContext();
+  MCDataFragment *F;
+
+  unsigned EntrySize = is64Bit() ? ELF::SYMENTRY_SIZE64 : ELF::SYMENTRY_SIZE32;
+
+  // We construct .shstrtab, .symtab and .strtab in this order to match gnu as.
+  const MCSectionELF *ShstrtabSection =
+    Ctx.getELFSection(".shstrtab", ELF::SHT_STRTAB, 0,
+                      SectionKind::getReadOnly());
+  MCSectionData &ShstrtabSD = Asm.getOrCreateSectionData(*ShstrtabSection);
+  ShstrtabSD.setAlignment(1);
+
+  const MCSectionELF *SymtabSection =
+    Ctx.getELFSection(".symtab", ELF::SHT_SYMTAB, 0,
+                      SectionKind::getReadOnly(),
+                      EntrySize, "");
+  MCSectionData &SymtabSD = Asm.getOrCreateSectionData(*SymtabSection);
+  SymtabSD.setAlignment(is64Bit() ? 8 : 4);
+
+  MCSectionData *SymtabShndxSD = NULL;
+
+  if (NeedsSymtabShndx) {
+    const MCSectionELF *SymtabShndxSection =
+      Ctx.getELFSection(".symtab_shndx", ELF::SHT_SYMTAB_SHNDX, 0,
+                        SectionKind::getReadOnly(), 4, "");
+    SymtabShndxSD = &Asm.getOrCreateSectionData(*SymtabShndxSection);
+    SymtabShndxSD->setAlignment(4);
+  }
+
+  const MCSectionELF *StrtabSection;
+  StrtabSection = Ctx.getELFSection(".strtab", ELF::SHT_STRTAB, 0,
+                                    SectionKind::getReadOnly());
+  MCSectionData &StrtabSD = Asm.getOrCreateSectionData(*StrtabSection);
+  StrtabSD.setAlignment(1);
+
+  ComputeIndexMap(Asm, SectionIndexMap, RelMap);
+
+  ShstrtabIndex = SectionIndexMap.lookup(ShstrtabSection);
+  SymbolTableIndex = SectionIndexMap.lookup(SymtabSection);
+  StringTableIndex = SectionIndexMap.lookup(StrtabSection);
+
+  // Symbol table
+  F = new MCDataFragment(&SymtabSD);
+  MCDataFragment *ShndxF = NULL;
+  if (NeedsSymtabShndx) {
+    ShndxF = new MCDataFragment(SymtabShndxSD);
+  }
+  WriteSymbolTable(F, ShndxF, Asm, Layout, SectionIndexMap);
+
+  F = new MCDataFragment(&StrtabSD);
+  F->getContents().append(StringTable.begin(), StringTable.end());
+
+  F = new MCDataFragment(&ShstrtabSD);
+
+  std::vector<const MCSectionELF*> Sections;
+  for (MCAssembler::const_iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF&>(it->getSection());
+    Sections.push_back(&Section);
+  }
+  array_pod_sort(Sections.begin(), Sections.end(), compareBySuffix);
+
+  // Section header string table.
+  //
+  // The first entry of a string table holds a null character so skip
+  // section 0.
+  uint64_t Index = 1;
+  F->getContents().push_back('\x00');
+
+  for (unsigned int I = 0, E = Sections.size(); I != E; ++I) {
+    const MCSectionELF &Section = *Sections[I];
+
+    StringRef Name = Section.getSectionName();
+    if (I != 0) {
+      StringRef PreviousName = Sections[I - 1]->getSectionName();
+      if (PreviousName.endswith(Name)) {
+        SectionStringTableIndex[&Section] = Index - Name.size() - 1;
+        continue;
+      }
+    }
+    // Remember the index into the string table so we can write it
+    // into the sh_name field of the section header table.
+    SectionStringTableIndex[&Section] = Index;
+
+    Index += Name.size() + 1;
+    F->getContents().append(Name.begin(), Name.end());
+    F->getContents().push_back('\x00');
+  }
+}
+
+void ELFObjectWriter::CreateIndexedSections(MCAssembler &Asm,
+                                            MCAsmLayout &Layout,
+                                            GroupMapTy &GroupMap,
+                                            RevGroupMapTy &RevGroupMap,
+                                            SectionIndexMapTy &SectionIndexMap,
+                                            const RelMapTy &RelMap) {
+  // Create the .note.GNU-stack section if needed.
+  MCContext &Ctx = Asm.getContext();
+  if (Asm.getNoExecStack()) {
+    const MCSectionELF *GnuStackSection =
+      Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, 0,
+                        SectionKind::getReadOnly());
+    Asm.getOrCreateSectionData(*GnuStackSection);
+  }
+
+  // Build the groups
+  for (MCAssembler::const_iterator it = Asm.begin(), ie = Asm.end();
+       it != ie; ++it) {
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF&>(it->getSection());
+    if (!(Section.getFlags() & ELF::SHF_GROUP))
+      continue;
+
+    const MCSymbol *SignatureSymbol = Section.getGroup();
+    Asm.getOrCreateSymbolData(*SignatureSymbol);
+    const MCSectionELF *&Group = RevGroupMap[SignatureSymbol];
+    if (!Group) {
+      Group = Ctx.CreateELFGroupSection();
+      MCSectionData &Data = Asm.getOrCreateSectionData(*Group);
+      Data.setAlignment(4);
+      MCDataFragment *F = new MCDataFragment(&Data);
+      String32(*F, ELF::GRP_COMDAT);
+    }
+    GroupMap[Group] = SignatureSymbol;
+  }
+
+  ComputeIndexMap(Asm, SectionIndexMap, RelMap);
+
+  // Add sections to the groups
+  for (MCAssembler::const_iterator it = Asm.begin(), ie = Asm.end();
+       it != ie; ++it) {
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF&>(it->getSection());
+    if (!(Section.getFlags() & ELF::SHF_GROUP))
+      continue;
+    const MCSectionELF *Group = RevGroupMap[Section.getGroup()];
+    MCSectionData &Data = Asm.getOrCreateSectionData(*Group);
+    // FIXME: we could use the previous fragment
+    MCDataFragment *F = new MCDataFragment(&Data);
+    unsigned Index = SectionIndexMap.lookup(&Section);
+    String32(*F, Index);
+  }
+}
+
+void ELFObjectWriter::WriteSection(MCAssembler &Asm,
+                                   const SectionIndexMapTy &SectionIndexMap,
+                                   uint32_t GroupSymbolIndex,
+                                   uint64_t Offset, uint64_t Size,
+                                   uint64_t Alignment,
+                                   const MCSectionELF &Section) {
+  uint64_t sh_link = 0;
+  uint64_t sh_info = 0;
+
+  switch(Section.getType()) {
+  case ELF::SHT_DYNAMIC:
+    sh_link = SectionStringTableIndex[&Section];
+    sh_info = 0;
+    break;
+
+  case ELF::SHT_REL:
+  case ELF::SHT_RELA: {
+    const MCSectionELF *SymtabSection;
+    const MCSectionELF *InfoSection;
+    SymtabSection = Asm.getContext().getELFSection(".symtab", ELF::SHT_SYMTAB,
+                                                   0,
+                                                   SectionKind::getReadOnly());
+    sh_link = SectionIndexMap.lookup(SymtabSection);
+    assert(sh_link && ".symtab not found");
+
+    // Remove ".rel" and ".rela" prefixes.
+    unsigned SecNameLen = (Section.getType() == ELF::SHT_REL) ? 4 : 5;
+    StringRef SectionName = Section.getSectionName().substr(SecNameLen);
+
+    InfoSection = Asm.getContext().getELFSection(SectionName,
+                                                 ELF::SHT_PROGBITS, 0,
+                                                 SectionKind::getReadOnly());
+    sh_info = SectionIndexMap.lookup(InfoSection);
+    break;
+  }
+
+  case ELF::SHT_SYMTAB:
+  case ELF::SHT_DYNSYM:
+    sh_link = StringTableIndex;
+    sh_info = LastLocalSymbolIndex;
+    break;
+
+  case ELF::SHT_SYMTAB_SHNDX:
+    sh_link = SymbolTableIndex;
+    break;
+
+  case ELF::SHT_PROGBITS:
+  case ELF::SHT_STRTAB:
+  case ELF::SHT_NOBITS:
+  case ELF::SHT_NOTE:
+  case ELF::SHT_NULL:
+  case ELF::SHT_ARM_ATTRIBUTES:
+  case ELF::SHT_INIT_ARRAY:
+  case ELF::SHT_FINI_ARRAY:
+  case ELF::SHT_PREINIT_ARRAY:
+  case ELF::SHT_X86_64_UNWIND:
+  case ELF::SHT_MIPS_REGINFO:
+  case ELF::SHT_MIPS_OPTIONS:
+    // Nothing to do.
+    break;
+
+  case ELF::SHT_GROUP:
+    sh_link = SymbolTableIndex;
+    sh_info = GroupSymbolIndex;
+    break;
+
+  default:
+    assert(0 && "FIXME: sh_type value not supported!");
+    break;
+  }
+
+  if (TargetObjectWriter->getEMachine() == ELF::EM_ARM &&
+      Section.getType() == ELF::SHT_ARM_EXIDX) {
+    StringRef SecName(Section.getSectionName());
+    if (SecName == ".ARM.exidx") {
+      sh_link = SectionIndexMap.lookup(
+        Asm.getContext().getELFSection(".text",
+                                       ELF::SHT_PROGBITS,
+                                       ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
+                                       SectionKind::getText()));
+    } else if (SecName.startswith(".ARM.exidx")) {
+      sh_link = SectionIndexMap.lookup(
+        Asm.getContext().getELFSection(SecName.substr(sizeof(".ARM.exidx") - 1),
+                                       ELF::SHT_PROGBITS,
+                                       ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
+                                       SectionKind::getText()));
+    }
+  }
+
+  WriteSecHdrEntry(SectionStringTableIndex[&Section], Section.getType(),
+                   Section.getFlags(), 0, Offset, Size, sh_link, sh_info,
+                   Alignment, Section.getEntrySize());
+}
+
+bool ELFObjectWriter::IsELFMetaDataSection(const MCSectionData &SD) {
+  return SD.getOrdinal() == ~UINT32_C(0) &&
+    !SD.getSection().isVirtualSection();
+}
+
+uint64_t ELFObjectWriter::DataSectionSize(const MCSectionData &SD) {
+  uint64_t Ret = 0;
+  for (MCSectionData::const_iterator i = SD.begin(), e = SD.end(); i != e;
+       ++i) {
+    const MCFragment &F = *i;
+    assert(F.getKind() == MCFragment::FT_Data);
+    Ret += cast<MCDataFragment>(F).getContents().size();
+  }
+  return Ret;
+}
+
+uint64_t ELFObjectWriter::GetSectionFileSize(const MCAsmLayout &Layout,
+                                             const MCSectionData &SD) {
+  if (IsELFMetaDataSection(SD))
+    return DataSectionSize(SD);
+  return Layout.getSectionFileSize(&SD);
+}
+
+uint64_t ELFObjectWriter::GetSectionAddressSize(const MCAsmLayout &Layout,
+                                                const MCSectionData &SD) {
+  if (IsELFMetaDataSection(SD))
+    return DataSectionSize(SD);
+  return Layout.getSectionAddressSize(&SD);
+}
+
+void ELFObjectWriter::WriteDataSectionData(MCAssembler &Asm,
+                                           const MCAsmLayout &Layout,
+                                           const MCSectionELF &Section) {
+  const MCSectionData &SD = Asm.getOrCreateSectionData(Section);
+
+  uint64_t Padding = OffsetToAlignment(OS.tell(), SD.getAlignment());
+  WriteZeros(Padding);
+
+  if (IsELFMetaDataSection(SD)) {
+    for (MCSectionData::const_iterator i = SD.begin(), e = SD.end(); i != e;
+         ++i) {
+      const MCFragment &F = *i;
+      assert(F.getKind() == MCFragment::FT_Data);
+      WriteBytes(cast<MCDataFragment>(F).getContents());
+    }
+  } else {
+    Asm.writeSectionData(&SD, Layout);
+  }
+}
+
+void ELFObjectWriter::WriteSectionHeader(MCAssembler &Asm,
+                                         const GroupMapTy &GroupMap,
+                                         const MCAsmLayout &Layout,
+                                      const SectionIndexMapTy &SectionIndexMap,
+                                   const SectionOffsetMapTy &SectionOffsetMap) {
+  const unsigned NumSections = Asm.size() + 1;
+
+  std::vector<const MCSectionELF*> Sections;
+  Sections.resize(NumSections - 1);
+
+  for (SectionIndexMapTy::const_iterator i=
+         SectionIndexMap.begin(), e = SectionIndexMap.end(); i != e; ++i) {
+    const std::pair<const MCSectionELF*, uint32_t> &p = *i;
+    Sections[p.second - 1] = p.first;
+  }
+
+  // Null section first.
+  uint64_t FirstSectionSize =
+    NumSections >= ELF::SHN_LORESERVE ? NumSections : 0;
+  uint32_t FirstSectionLink =
+    ShstrtabIndex >= ELF::SHN_LORESERVE ? ShstrtabIndex : 0;
+  WriteSecHdrEntry(0, 0, 0, 0, 0, FirstSectionSize, FirstSectionLink, 0, 0, 0);
+
+  for (unsigned i = 0; i < NumSections - 1; ++i) {
+    const MCSectionELF &Section = *Sections[i];
+    const MCSectionData &SD = Asm.getOrCreateSectionData(Section);
+    uint32_t GroupSymbolIndex;
+    if (Section.getType() != ELF::SHT_GROUP)
+      GroupSymbolIndex = 0;
+    else
+      GroupSymbolIndex = getSymbolIndexInSymbolTable(Asm,
+                                                     GroupMap.lookup(&Section));
+
+    uint64_t Size = GetSectionAddressSize(Layout, SD);
+
+    WriteSection(Asm, SectionIndexMap, GroupSymbolIndex,
+                 SectionOffsetMap.lookup(&Section), Size,
+                 SD.getAlignment(), Section);
+  }
+}
+
+void ELFObjectWriter::ComputeSectionOrder(MCAssembler &Asm,
+                                  std::vector<const MCSectionELF*> &Sections) {
+  for (MCAssembler::iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF &>(it->getSection());
+    if (Section.getType() == ELF::SHT_GROUP)
+      Sections.push_back(&Section);
+  }
+
+  for (MCAssembler::iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF &>(it->getSection());
+    if (Section.getType() != ELF::SHT_GROUP &&
+        Section.getType() != ELF::SHT_REL &&
+        Section.getType() != ELF::SHT_RELA)
+      Sections.push_back(&Section);
+  }
+
+  for (MCAssembler::iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    const MCSectionELF &Section =
+      static_cast<const MCSectionELF &>(it->getSection());
+    if (Section.getType() == ELF::SHT_REL ||
+        Section.getType() == ELF::SHT_RELA)
+      Sections.push_back(&Section);
+  }
+}
+
+void ELFObjectWriter::WriteObject(MCAssembler &Asm,
+                                  const MCAsmLayout &Layout) {
+  GroupMapTy GroupMap;
+  RevGroupMapTy RevGroupMap;
+  SectionIndexMapTy SectionIndexMap;
+
+  unsigned NumUserSections = Asm.size();
+
+  DenseMap<const MCSectionELF*, const MCSectionELF*> RelMap;
+  CreateRelocationSections(Asm, const_cast<MCAsmLayout&>(Layout), RelMap);
+
+  const unsigned NumUserAndRelocSections = Asm.size();
+  CreateIndexedSections(Asm, const_cast<MCAsmLayout&>(Layout), GroupMap,
+                        RevGroupMap, SectionIndexMap, RelMap);
+  const unsigned AllSections = Asm.size();
+  const unsigned NumIndexedSections = AllSections - NumUserAndRelocSections;
+
+  unsigned NumRegularSections = NumUserSections + NumIndexedSections;
+
+  // Compute symbol table information.
+  ComputeSymbolTable(Asm, SectionIndexMap, RevGroupMap, NumRegularSections);
+
+
+  WriteRelocations(Asm, const_cast<MCAsmLayout&>(Layout), RelMap);
+
+  CreateMetadataSections(const_cast<MCAssembler&>(Asm),
+                         const_cast<MCAsmLayout&>(Layout),
+                         SectionIndexMap,
+                         RelMap);
+
+  uint64_t NaturalAlignment = is64Bit() ? 8 : 4;
+  uint64_t HeaderSize = is64Bit() ? sizeof(ELF::Elf64_Ehdr) :
+                                    sizeof(ELF::Elf32_Ehdr);
+  uint64_t FileOff = HeaderSize;
+
+  std::vector<const MCSectionELF*> Sections;
+  ComputeSectionOrder(Asm, Sections);
+  unsigned NumSections = Sections.size();
+  SectionOffsetMapTy SectionOffsetMap;
+  for (unsigned i = 0; i < NumRegularSections + 1; ++i) {
+    const MCSectionELF &Section = *Sections[i];
+    const MCSectionData &SD = Asm.getOrCreateSectionData(Section);
+
+    FileOff = RoundUpToAlignment(FileOff, SD.getAlignment());
+
+    // Remember the offset into the file for this section.
+    SectionOffsetMap[&Section] = FileOff;
+
+    // Get the size of the section in the output file (including padding).
+    FileOff += GetSectionFileSize(Layout, SD);
+  }
+
+  FileOff = RoundUpToAlignment(FileOff, NaturalAlignment);
+
+  const unsigned SectionHeaderOffset = FileOff - HeaderSize;
+
+  uint64_t SectionHeaderEntrySize = is64Bit() ?
+    sizeof(ELF::Elf64_Shdr) : sizeof(ELF::Elf32_Shdr);
+  FileOff += (NumSections + 1) * SectionHeaderEntrySize;
+
+  for (unsigned i = NumRegularSections + 1; i < NumSections; ++i) {
+    const MCSectionELF &Section = *Sections[i];
+    const MCSectionData &SD = Asm.getOrCreateSectionData(Section);
+
+    FileOff = RoundUpToAlignment(FileOff, SD.getAlignment());
+
+    // Remember the offset into the file for this section.
+    SectionOffsetMap[&Section] = FileOff;
+
+    // Get the size of the section in the output file (including padding).
+    FileOff += GetSectionFileSize(Layout, SD);
+  }
+
+  // Write out the ELF header ...
+  WriteHeader(Asm, SectionHeaderOffset, NumSections + 1);
+
+  // ... then the regular sections ...
+  // + because of .shstrtab
+  for (unsigned i = 0; i < NumRegularSections + 1; ++i)
+    WriteDataSectionData(Asm, Layout, *Sections[i]);
+
+  uint64_t Padding = OffsetToAlignment(OS.tell(), NaturalAlignment);
+  WriteZeros(Padding);
+
+  // ... then the section header table ...
+  WriteSectionHeader(Asm, GroupMap, Layout, SectionIndexMap,
+                     SectionOffsetMap);
+
+  // ... and then the remaining sections ...
+  for (unsigned i = NumRegularSections + 1; i < NumSections; ++i)
+    WriteDataSectionData(Asm, Layout, *Sections[i]);
+}
+
+bool
+ELFObjectWriter::IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+                                                      const MCSymbolData &DataA,
+                                                      const MCFragment &FB,
+                                                      bool InSet,
+                                                      bool IsPCRel) const {
+  if (DataA.getFlags() & ELF_STB_Weak)
+    return false;
+  return MCObjectWriter::IsSymbolRefDifferenceFullyResolvedImpl(
+                                                 Asm, DataA, FB,InSet, IsPCRel);
+}
+
+MCObjectWriter *llvm::createELFObjectWriter(MCELFObjectTargetWriter *MOTW,
+                                            raw_ostream &OS,
+                                            bool IsLittleEndian) {
+  return new ELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/MC/MCAsmBackend.cpp b/contrib/llvm/lib/MC/MCAsmBackend.cpp
new file mode 100644
index 000000000000..53960e7980ae
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCAsmBackend.cpp
@@ -0,0 +1,43 @@
+//===-- MCAsmBackend.cpp - Target MC Assembly Backend ----------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+using namespace llvm;
+
+MCAsmBackend::MCAsmBackend()
+  : HasReliableSymbolDifference(false), HasDataInCodeSupport(false) {}
+
+MCAsmBackend::~MCAsmBackend() {}
+
+const MCFixupKindInfo &
+MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+  static const MCFixupKindInfo Builtins[] = {
+    { "FK_Data_1",  0,  8, 0 },
+    { "FK_Data_2",  0, 16, 0 },
+    { "FK_Data_4",  0, 32, 0 },
+    { "FK_Data_8",  0, 64, 0 },
+    { "FK_PCRel_1", 0,  8, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_PCRel_2", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_PCRel_4", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_PCRel_8", 0, 64, MCFixupKindInfo::FKF_IsPCRel },
+    { "FK_GPRel_1", 0,  8, 0 },
+    { "FK_GPRel_2", 0, 16, 0 },
+    { "FK_GPRel_4", 0, 32, 0 },
+    { "FK_GPRel_8", 0, 64, 0 },
+    { "FK_SecRel_1", 0,  8, 0 },
+    { "FK_SecRel_2", 0, 16, 0 },
+    { "FK_SecRel_4", 0, 32, 0 },
+    { "FK_SecRel_8", 0, 64, 0 }
+  };
+
+  assert((size_t)Kind <= sizeof(Builtins) / sizeof(Builtins[0]) &&
+         "Unknown fixup kind");
+  return Builtins[Kind];
+}
diff --git a/contrib/llvm/lib/MC/MCAsmInfo.cpp b/contrib/llvm/lib/MC/MCAsmInfo.cpp
new file mode 100644
index 000000000000..51bb4357102e
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCAsmInfo.cpp
@@ -0,0 +1,143 @@
+//===-- MCAsmInfo.cpp - Asm Info -------------------------------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related what form asm statements
+// should take.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Dwarf.h"
+#include <cctype>
+#include <cstring>
+using namespace llvm;
+
+MCAsmInfo::MCAsmInfo() {
+  PointerSize = 4;
+  CalleeSaveStackSlotSize = 4;
+
+  IsLittleEndian = true;
+  StackGrowsUp = false;
+  HasSubsectionsViaSymbols = false;
+  HasMachoZeroFillDirective = false;
+  HasMachoTBSSDirective = false;
+  HasStaticCtorDtorReferenceInStaticMode = false;
+  LinkerRequiresNonEmptyDwarfLines = false;
+  MaxInstLength = 4;
+  PCSymbol = "$";
+  SeparatorString = ";";
+  CommentColumn = 40;
+  CommentString = "#";
+  LabelSuffix = ":";
+  DebugLabelSuffix = ":";
+  GlobalPrefix = "";
+  PrivateGlobalPrefix = ".";
+  LinkerPrivateGlobalPrefix = "";
+  InlineAsmStart = "APP";
+  InlineAsmEnd = "NO_APP";
+  Code16Directive = ".code16";
+  Code32Directive = ".code32";
+  Code64Directive = ".code64";
+  AssemblerDialect = 0;
+  AllowQuotesInName = false;
+  AllowNameToStartWithDigit = false;
+  AllowPeriodsInName = true;
+  AllowUTF8 = true;
+  UseDataRegionDirectives = false;
+  ZeroDirective = "\t.zero\t";
+  AsciiDirective = "\t.ascii\t";
+  AscizDirective = "\t.asciz\t";
+  Data8bitsDirective = "\t.byte\t";
+  Data16bitsDirective = "\t.short\t";
+  Data32bitsDirective = "\t.long\t";
+  Data64bitsDirective = "\t.quad\t";
+  SunStyleELFSectionSwitchSyntax = false;
+  UsesELFSectionDirectiveForBSS = false;
+  AlignDirective = "\t.align\t";
+  AlignmentIsInBytes = true;
+  TextAlignFillValue = 0;
+  GPRel64Directive = 0;
+  GPRel32Directive = 0;
+  GlobalDirective = "\t.globl\t";
+  HasSetDirective = true;
+  HasAggressiveSymbolFolding = true;
+  COMMDirectiveAlignmentIsInBytes = true;
+  LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
+  HasDotTypeDotSizeDirective = true;
+  HasSingleParameterDotFile = true;
+  HasNoDeadStrip = false;
+  HasSymbolResolver = false;
+  WeakRefDirective = 0;
+  WeakDefDirective = 0;
+  LinkOnceDirective = 0;
+  HiddenVisibilityAttr = MCSA_Hidden;
+  HiddenDeclarationVisibilityAttr = MCSA_Hidden;
+  ProtectedVisibilityAttr = MCSA_Protected;
+  HasLEB128 = false;
+  SupportsDebugInformation = false;
+  ExceptionsType = ExceptionHandling::None;
+  DwarfUsesInlineInfoSection = false;
+  DwarfSectionOffsetDirective = 0;
+  DwarfUsesRelocationsAcrossSections = true;
+  DwarfRegNumForCFI = false;
+  HasMicrosoftFastStdCallMangling = false;
+}
+
+MCAsmInfo::~MCAsmInfo() {
+}
+
+
+unsigned MCAsmInfo::getULEB128Size(unsigned Value) {
+  unsigned Size = 0;
+  do {
+    Value >>= 7;
+    Size += sizeof(int8_t);
+  } while (Value);
+  return Size;
+}
+
+unsigned MCAsmInfo::getSLEB128Size(int Value) {
+  unsigned Size = 0;
+  int Sign = Value >> (8 * sizeof(Value) - 1);
+  bool IsMore;
+
+  do {
+    unsigned Byte = Value & 0x7f;
+    Value >>= 7;
+    IsMore = Value != Sign || ((Byte ^ Sign) & 0x40) != 0;
+    Size += sizeof(int8_t);
+  } while (IsMore);
+  return Size;
+}
+
+const MCExpr *
+MCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                       unsigned Encoding,
+                                       MCStreamer &Streamer) const {
+  return getExprForFDESymbol(Sym, Encoding, Streamer);
+}
+
+const MCExpr *
+MCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
+                               unsigned Encoding,
+                               MCStreamer &Streamer) const {
+  if (!(Encoding & dwarf::DW_EH_PE_pcrel))
+    return MCSymbolRefExpr::Create(Sym, Streamer.getContext());
+
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Res = MCSymbolRefExpr::Create(Sym, Context);
+  MCSymbol *PCSym = Context.CreateTempSymbol();
+  Streamer.EmitLabel(PCSym);
+  const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context);
+  return MCBinaryExpr::CreateSub(Res, PC, Context);
+}
diff --git a/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp
new file mode 100644
index 000000000000..fd79193073df
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp
@@ -0,0 +1,53 @@
+//===-- MCAsmInfoCOFF.cpp - COFF asm properties -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related what form asm statements
+// should take in general on COFF-based targets
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfoCOFF.h"
+using namespace llvm;
+
+void MCAsmInfoCOFF::anchor() { }
+
+MCAsmInfoCOFF::MCAsmInfoCOFF() {
+  GlobalPrefix = "_";
+  // MingW 4.5 and later support .comm with log2 alignment, but .lcomm uses byte
+  // alignment.
+  COMMDirectiveAlignmentIsInBytes = false;
+  LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
+  HasDotTypeDotSizeDirective = false;
+  HasSingleParameterDotFile = false;
+  PrivateGlobalPrefix = "L";  // Prefix for private global symbols
+  WeakRefDirective = "\t.weak\t";
+  LinkOnceDirective = "\t.linkonce discard\n";
+
+  // Doesn't support visibility:
+  HiddenVisibilityAttr = HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+  ProtectedVisibilityAttr = MCSA_Invalid;
+
+  // Set up DWARF directives
+  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+  SupportsDebugInformation = true;
+  DwarfSectionOffsetDirective = "\t.secrel32\t";
+  HasMicrosoftFastStdCallMangling = true;
+}
+
+void MCAsmInfoMicrosoft::anchor() { }
+
+MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() {
+  AllowQuotesInName = true;
+}
+
+void MCAsmInfoGNUCOFF::anchor() { }
+
+MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() {
+
+}
diff --git a/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp b/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp
new file mode 100644
index 000000000000..a0e3ebad5e2b
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp
@@ -0,0 +1,64 @@
+//===-- MCAsmInfoDarwin.cpp - Darwin asm properties -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related what form asm statements
+// should take in general on Darwin-based targets
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+void MCAsmInfoDarwin::anchor() { }
+
+MCAsmInfoDarwin::MCAsmInfoDarwin() {
+  // Common settings for all Darwin targets.
+  // Syntax:
+  GlobalPrefix = "_";
+  PrivateGlobalPrefix = "L";
+  LinkerPrivateGlobalPrefix = "l";
+  AllowQuotesInName = true;
+  HasSingleParameterDotFile = false;
+  HasSubsectionsViaSymbols = true;
+
+  AlignmentIsInBytes = false;
+  COMMDirectiveAlignmentIsInBytes = false;
+  LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
+  InlineAsmStart = " InlineAsm Start";
+  InlineAsmEnd = " InlineAsm End";
+
+  // Directives:
+  WeakDefDirective = "\t.weak_definition ";
+  WeakRefDirective = "\t.weak_reference ";
+  ZeroDirective = "\t.space\t";  // ".space N" emits N zeros.
+  HasMachoZeroFillDirective = true;  // Uses .zerofill
+  HasMachoTBSSDirective = true; // Uses .tbss
+  HasStaticCtorDtorReferenceInStaticMode = true;
+
+  // FIXME: Darwin 10 and newer don't need this.
+  LinkerRequiresNonEmptyDwarfLines = true;
+
+  // FIXME: Change this once MC is the system assembler.
+  HasAggressiveSymbolFolding = false;
+
+  HiddenVisibilityAttr = MCSA_PrivateExtern;
+  HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+
+  // Doesn't support protected visibility.
+  ProtectedVisibilityAttr = MCSA_Invalid;
+
+  HasDotTypeDotSizeDirective = false;
+  HasNoDeadStrip = true;
+  HasSymbolResolver = true;
+
+  DwarfUsesRelocationsAcrossSections = false;
+}
diff --git a/contrib/llvm/lib/MC/MCAsmStreamer.cpp b/contrib/llvm/lib/MC/MCAsmStreamer.cpp
new file mode 100644
index 000000000000..35613b411c24
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCAsmStreamer.cpp
@@ -0,0 +1,1440 @@
+//===- lib/MC/MCAsmStreamer.cpp - Text Assembly Output --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/PathV2.h"
+#include <cctype>
+using namespace llvm;
+
+namespace {
+
+class MCAsmStreamer : public MCStreamer {
+protected:
+  formatted_raw_ostream &OS;
+  const MCAsmInfo &MAI;
+private:
+  OwningPtr<MCInstPrinter> InstPrinter;
+  OwningPtr<MCCodeEmitter> Emitter;
+  OwningPtr<MCAsmBackend> AsmBackend;
+
+  SmallString<128> CommentToEmit;
+  raw_svector_ostream CommentStream;
+
+  unsigned IsVerboseAsm : 1;
+  unsigned ShowInst : 1;
+  unsigned UseLoc : 1;
+  unsigned UseCFI : 1;
+  unsigned UseDwarfDirectory : 1;
+
+  enum EHSymbolFlags { EHGlobal         = 1,
+                       EHWeakDefinition = 1 << 1,
+                       EHPrivateExtern  = 1 << 2 };
+  DenseMap<const MCSymbol*, unsigned> FlagMap;
+
+  bool needsSet(const MCExpr *Value);
+
+  void EmitRegisterName(int64_t Register);
+  virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame);
+  virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame);
+
+public:
+  MCAsmStreamer(MCContext &Context, formatted_raw_ostream &os,
+                bool isVerboseAsm, bool useLoc, bool useCFI,
+                bool useDwarfDirectory,
+                MCInstPrinter *printer, MCCodeEmitter *emitter,
+                MCAsmBackend *asmbackend,
+                bool showInst)
+    : MCStreamer(SK_AsmStreamer, Context), OS(os), MAI(Context.getAsmInfo()),
+      InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend),
+      CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
+      ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI),
+      UseDwarfDirectory(useDwarfDirectory) {
+    if (InstPrinter && IsVerboseAsm)
+      InstPrinter->setCommentStream(CommentStream);
+  }
+  ~MCAsmStreamer() {}
+
+  inline void EmitEOL() {
+    // If we don't have any comments, just emit a \n.
+    if (!IsVerboseAsm) {
+      OS << '\n';
+      return;
+    }
+    EmitCommentsAndEOL();
+  }
+  void EmitCommentsAndEOL();
+
+  /// isVerboseAsm - Return true if this streamer supports verbose assembly at
+  /// all.
+  virtual bool isVerboseAsm() const { return IsVerboseAsm; }
+
+  /// hasRawTextSupport - We support EmitRawText.
+  virtual bool hasRawTextSupport() const { return true; }
+
+  /// AddComment - Add a comment that can be emitted to the generated .s
+  /// file if applicable as a QoI issue to make the output of the compiler
+  /// more readable.  This only affects the MCAsmStreamer, and only when
+  /// verbose assembly output is enabled.
+  virtual void AddComment(const Twine &T);
+
+  /// AddEncodingComment - Add a comment showing the encoding of an instruction.
+  virtual void AddEncodingComment(const MCInst &Inst);
+
+  /// GetCommentOS - Return a raw_ostream that comments can be written to.
+  /// Unlike AddComment, you are required to terminate comments with \n if you
+  /// use this method.
+  virtual raw_ostream &GetCommentOS() {
+    if (!IsVerboseAsm)
+      return nulls();  // Discard comments unless in verbose asm mode.
+    return CommentStream;
+  }
+
+  /// AddBlankLine - Emit a blank line to a .s file to pretty it up.
+  virtual void AddBlankLine() {
+    EmitEOL();
+  }
+
+  /// @name MCStreamer Interface
+  /// @{
+
+  virtual void ChangeSection(const MCSection *Section);
+
+  virtual void InitSections() {
+    InitToTextSection();
+  }
+
+  virtual void InitToTextSection() {
+    // FIXME, this is MachO specific, but the testsuite
+    // expects this.
+    SwitchSection(getContext().getMachOSection(
+                                      "__TEXT", "__text",
+                                      MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                                      0, SectionKind::getText()));
+  }
+
+  virtual void EmitLabel(MCSymbol *Symbol);
+  virtual void EmitDebugLabel(MCSymbol *Symbol);
+
+  virtual void EmitEHSymAttributes(const MCSymbol *Symbol,
+                                   MCSymbol *EHSymbol);
+  virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
+  virtual void EmitLinkerOptions(ArrayRef<std::string> Options);
+  virtual void EmitDataRegion(MCDataRegionType Kind);
+  virtual void EmitThumbFunc(MCSymbol *Func);
+
+  virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value);
+  virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol);
+  virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+                                        const MCSymbol *LastLabel,
+                                        const MCSymbol *Label,
+                                        unsigned PointerSize);
+  virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
+                                         const MCSymbol *Label);
+
+  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+
+  virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
+  virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol);
+  virtual void EmitCOFFSymbolStorageClass(int StorageClass);
+  virtual void EmitCOFFSymbolType(int Type);
+  virtual void EndCOFFSymbolDef();
+  virtual void EmitCOFFSecRel32(MCSymbol const *Symbol);
+  virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value);
+  virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                unsigned ByteAlignment);
+
+  /// EmitLocalCommonSymbol - Emit a local common (.lcomm) symbol.
+  ///
+  /// @param Symbol - The common symbol to emit.
+  /// @param Size - The size of the common symbol.
+  /// @param ByteAlignment - The alignment of the common symbol in bytes.
+  virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                     unsigned ByteAlignment);
+
+  virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
+                            uint64_t Size = 0, unsigned ByteAlignment = 0);
+
+  virtual void EmitTBSSSymbol (const MCSection *Section, MCSymbol *Symbol,
+                               uint64_t Size, unsigned ByteAlignment = 0);
+
+  virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
+
+  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                             unsigned AddrSpace);
+  virtual void EmitIntValue(uint64_t Value, unsigned Size,
+                            unsigned AddrSpace = 0);
+
+  virtual void EmitULEB128Value(const MCExpr *Value);
+
+  virtual void EmitSLEB128Value(const MCExpr *Value);
+
+  virtual void EmitGPRel64Value(const MCExpr *Value);
+
+  virtual void EmitGPRel32Value(const MCExpr *Value);
+
+
+  virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue,
+                        unsigned AddrSpace);
+
+  virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
+                                    unsigned ValueSize = 1,
+                                    unsigned MaxBytesToEmit = 0);
+
+  virtual void EmitCodeAlignment(unsigned ByteAlignment,
+                                 unsigned MaxBytesToEmit = 0);
+
+  virtual bool EmitValueToOffset(const MCExpr *Offset,
+                                 unsigned char Value = 0);
+
+  virtual void EmitFileDirective(StringRef Filename);
+  virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
+                                      StringRef Filename, unsigned CUID = 0);
+  virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
+                                     unsigned Column, unsigned Flags,
+                                     unsigned Isa, unsigned Discriminator,
+                                     StringRef FileName);
+
+  virtual void EmitCFISections(bool EH, bool Debug);
+  virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
+  virtual void EmitCFIDefCfaOffset(int64_t Offset);
+  virtual void EmitCFIDefCfaRegister(int64_t Register);
+  virtual void EmitCFIOffset(int64_t Register, int64_t Offset);
+  virtual void EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding);
+  virtual void EmitCFILsda(const MCSymbol *Sym, unsigned Encoding);
+  virtual void EmitCFIRememberState();
+  virtual void EmitCFIRestoreState();
+  virtual void EmitCFISameValue(int64_t Register);
+  virtual void EmitCFIRelOffset(int64_t Register, int64_t Offset);
+  virtual void EmitCFIAdjustCfaOffset(int64_t Adjustment);
+  virtual void EmitCFISignalFrame();
+  virtual void EmitCFIUndefined(int64_t Register);
+  virtual void EmitCFIRegister(int64_t Register1, int64_t Register2);
+
+  virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
+  virtual void EmitWin64EHEndProc();
+  virtual void EmitWin64EHStartChained();
+  virtual void EmitWin64EHEndChained();
+  virtual void EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                  bool Except);
+  virtual void EmitWin64EHHandlerData();
+  virtual void EmitWin64EHPushReg(unsigned Register);
+  virtual void EmitWin64EHSetFrame(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHAllocStack(unsigned Size);
+  virtual void EmitWin64EHSaveReg(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHSaveXMM(unsigned Register, unsigned Offset);
+  virtual void EmitWin64EHPushFrame(bool Code);
+  virtual void EmitWin64EHEndProlog();
+
+  virtual void EmitFnStart();
+  virtual void EmitFnEnd();
+  virtual void EmitCantUnwind();
+  virtual void EmitPersonality(const MCSymbol *Personality);
+  virtual void EmitHandlerData();
+  virtual void EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
+  virtual void EmitPad(int64_t Offset);
+  virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool);
+
+  virtual void EmitTCEntry(const MCSymbol &S);
+
+  virtual void EmitInstruction(const MCInst &Inst);
+
+  virtual void EmitBundleAlignMode(unsigned AlignPow2);
+  virtual void EmitBundleLock(bool AlignToEnd);
+  virtual void EmitBundleUnlock();
+
+  /// EmitRawText - If this file is backed by an assembly streamer, this dumps
+  /// the specified string in the output .s file.  This capability is
+  /// indicated by the hasRawTextSupport() predicate.
+  virtual void EmitRawText(StringRef String);
+
+  virtual void FinishImpl();
+
+  /// @}
+
+  static bool classof(const MCStreamer *S) {
+    return S->getKind() == SK_AsmStreamer;
+  }
+};
+
+} // end anonymous namespace.
+
+/// AddComment - Add a comment that can be emitted to the generated .s
+/// file if applicable as a QoI issue to make the output of the compiler
+/// more readable.  This only affects the MCAsmStreamer, and only when
+/// verbose assembly output is enabled.
+void MCAsmStreamer::AddComment(const Twine &T) {
+  if (!IsVerboseAsm) return;
+
+  // Make sure that CommentStream is flushed.
+  CommentStream.flush();
+
+  T.toVector(CommentToEmit);
+  // Each comment goes on its own line.
+  CommentToEmit.push_back('\n');
+
+  // Tell the comment stream that the vector changed underneath it.
+  CommentStream.resync();
+}
+
+void MCAsmStreamer::EmitCommentsAndEOL() {
+  if (CommentToEmit.empty() && CommentStream.GetNumBytesInBuffer() == 0) {
+    OS << '\n';
+    return;
+  }
+
+  CommentStream.flush();
+  StringRef Comments = CommentToEmit.str();
+
+  assert(Comments.back() == '\n' &&
+         "Comment array not newline terminated");
+  do {
+    // Emit a line of comments.
+    OS.PadToColumn(MAI.getCommentColumn());
+    size_t Position = Comments.find('\n');
+    OS << MAI.getCommentString() << ' ' << Comments.substr(0, Position) << '\n';
+
+    Comments = Comments.substr(Position+1);
+  } while (!Comments.empty());
+
+  CommentToEmit.clear();
+  // Tell the comment stream that the vector changed underneath it.
+  CommentStream.resync();
+}
+
+static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
+  assert(Bytes && "Invalid size!");
+  return Value & ((uint64_t) (int64_t) -1 >> (64 - Bytes * 8));
+}
+
+void MCAsmStreamer::ChangeSection(const MCSection *Section) {
+  assert(Section && "Cannot switch to a null section!");
+  Section->PrintSwitchToSection(MAI, OS);
+}
+
+void MCAsmStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
+                                        MCSymbol *EHSymbol) {
+  if (UseCFI)
+    return;
+
+  unsigned Flags = FlagMap.lookup(Symbol);
+
+  if (Flags & EHGlobal)
+    EmitSymbolAttribute(EHSymbol, MCSA_Global);
+  if (Flags & EHWeakDefinition)
+    EmitSymbolAttribute(EHSymbol, MCSA_WeakDefinition);
+  if (Flags & EHPrivateExtern)
+    EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
+}
+
+void MCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+  MCStreamer::EmitLabel(Symbol);
+
+  OS << *Symbol << MAI.getLabelSuffix();
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitDebugLabel(MCSymbol *Symbol) {
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+  MCStreamer::EmitDebugLabel(Symbol);
+
+  OS << *Symbol << MAI.getDebugLabelSuffix();
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  switch (Flag) {
+  case MCAF_SyntaxUnified:         OS << "\t.syntax unified"; break;
+  case MCAF_SubsectionsViaSymbols: OS << ".subsections_via_symbols"; break;
+  case MCAF_Code16:                OS << '\t'<< MAI.getCode16Directive(); break;
+  case MCAF_Code32:                OS << '\t'<< MAI.getCode32Directive(); break;
+  case MCAF_Code64:                OS << '\t'<< MAI.getCode64Directive(); break;
+  }
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitLinkerOptions(ArrayRef<std::string> Options) {
+  assert(!Options.empty() && "At least one option is required!");
+  OS << "\t.linker_option \"" << Options[0] << '"';
+  for (ArrayRef<std::string>::iterator it = Options.begin() + 1,
+         ie = Options.end(); it != ie; ++it) {
+    OS << ", " << '"' << *it << '"';
+  }
+  OS << "\n";
+}
+
+void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) {
+  MCContext &Ctx = getContext();
+  const MCAsmInfo &MAI = Ctx.getAsmInfo();
+  if (!MAI.doesSupportDataRegionDirectives())
+    return;
+  switch (Kind) {
+  case MCDR_DataRegion:            OS << "\t.data_region"; break;
+  case MCDR_DataRegionJT8:         OS << "\t.data_region jt8"; break;
+  case MCDR_DataRegionJT16:        OS << "\t.data_region jt16"; break;
+  case MCDR_DataRegionJT32:        OS << "\t.data_region jt32"; break;
+  case MCDR_DataRegionEnd:         OS << "\t.end_data_region"; break;
+  }
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {
+  // This needs to emit to a temporary string to get properly quoted
+  // MCSymbols when they have spaces in them.
+  OS << "\t.thumb_func";
+  // Only Mach-O hasSubsectionsViaSymbols()
+  if (MAI.hasSubsectionsViaSymbols())
+    OS << '\t' << *Func;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+  OS << *Symbol << " = " << *Value;
+  EmitEOL();
+
+  // FIXME: Lift context changes into super class.
+  Symbol->setVariableValue(Value);
+}
+
+void MCAsmStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
+  OS << ".weakref " << *Alias << ", " << *Symbol;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+                                             const MCSymbol *LastLabel,
+                                             const MCSymbol *Label,
+                                             unsigned PointerSize) {
+  EmitDwarfSetLineAddr(LineDelta, Label, PointerSize);
+}
+
+void MCAsmStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
+                                              const MCSymbol *Label) {
+  EmitIntValue(dwarf::DW_CFA_advance_loc4, 1);
+  const MCExpr *AddrDelta = BuildSymbolDiff(getContext(), Label, LastLabel);
+  AddrDelta = ForceExpAbs(AddrDelta);
+  EmitValue(AddrDelta, 4);
+}
+
+
+void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+                                        MCSymbolAttr Attribute) {
+  switch (Attribute) {
+  case MCSA_Invalid: llvm_unreachable("Invalid symbol attribute");
+  case MCSA_ELF_TypeFunction:    /// .type _foo, STT_FUNC  # aka @function
+  case MCSA_ELF_TypeIndFunction: /// .type _foo, STT_GNU_IFUNC
+  case MCSA_ELF_TypeObject:      /// .type _foo, STT_OBJECT  # aka @object
+  case MCSA_ELF_TypeTLS:         /// .type _foo, STT_TLS     # aka @tls_object
+  case MCSA_ELF_TypeCommon:      /// .type _foo, STT_COMMON  # aka @common
+  case MCSA_ELF_TypeNoType:      /// .type _foo, STT_NOTYPE  # aka @notype
+  case MCSA_ELF_TypeGnuUniqueObject:  /// .type _foo, @gnu_unique_object
+    assert(MAI.hasDotTypeDotSizeDirective() && "Symbol Attr not supported");
+    OS << "\t.type\t" << *Symbol << ','
+       << ((MAI.getCommentString()[0] != '@') ? '@' : '%');
+    switch (Attribute) {
+    default: llvm_unreachable("Unknown ELF .type");
+    case MCSA_ELF_TypeFunction:    OS << "function"; break;
+    case MCSA_ELF_TypeIndFunction: OS << "gnu_indirect_function"; break;
+    case MCSA_ELF_TypeObject:      OS << "object"; break;
+    case MCSA_ELF_TypeTLS:         OS << "tls_object"; break;
+    case MCSA_ELF_TypeCommon:      OS << "common"; break;
+    case MCSA_ELF_TypeNoType:      OS << "no_type"; break;
+    case MCSA_ELF_TypeGnuUniqueObject: OS << "gnu_unique_object"; break;
+    }
+    EmitEOL();
+    return;
+  case MCSA_Global: // .globl/.global
+    OS << MAI.getGlobalDirective();
+    FlagMap[Symbol] |= EHGlobal;
+    break;
+  case MCSA_Hidden:         OS << "\t.hidden\t";          break;
+  case MCSA_IndirectSymbol: OS << "\t.indirect_symbol\t"; break;
+  case MCSA_Internal:       OS << "\t.internal\t";        break;
+  case MCSA_LazyReference:  OS << "\t.lazy_reference\t";  break;
+  case MCSA_Local:          OS << "\t.local\t";           break;
+  case MCSA_NoDeadStrip:    OS << "\t.no_dead_strip\t";   break;
+  case MCSA_SymbolResolver: OS << "\t.symbol_resolver\t"; break;
+  case MCSA_PrivateExtern:
+    OS << "\t.private_extern\t";
+    FlagMap[Symbol] |= EHPrivateExtern;
+    break;
+  case MCSA_Protected:      OS << "\t.protected\t";       break;
+  case MCSA_Reference:      OS << "\t.reference\t";       break;
+  case MCSA_Weak:           OS << "\t.weak\t";            break;
+  case MCSA_WeakDefinition:
+    OS << "\t.weak_definition\t";
+    FlagMap[Symbol] |= EHWeakDefinition;
+    break;
+      // .weak_reference
+  case MCSA_WeakReference:  OS << MAI.getWeakRefDirective(); break;
+  case MCSA_WeakDefAutoPrivate: OS << "\t.weak_def_can_be_hidden\t"; break;
+  }
+
+  OS << *Symbol;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+  OS << ".desc" << ' ' << *Symbol << ',' << DescValue;
+  EmitEOL();
+}
+
+void MCAsmStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {
+  OS << "\t.def\t " << *Symbol << ';';
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCOFFSymbolStorageClass (int StorageClass) {
+  OS << "\t.scl\t" << StorageClass << ';';
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCOFFSymbolType (int Type) {
+  OS << "\t.type\t" << Type << ';';
+  EmitEOL();
+}
+
+void MCAsmStreamer::EndCOFFSymbolDef() {
+  OS << "\t.endef";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) {
+  OS << "\t.secrel32\t" << *Symbol << '\n';
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
+  assert(MAI.hasDotTypeDotSizeDirective());
+  OS << "\t.size\t" << *Symbol << ", " << *Value << '\n';
+}
+
+void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                     unsigned ByteAlignment) {
+  OS << "\t.comm\t" << *Symbol << ',' << Size;
+  if (ByteAlignment != 0) {
+    if (MAI.getCOMMDirectiveAlignmentIsInBytes())
+      OS << ',' << ByteAlignment;
+    else
+      OS << ',' << Log2_32(ByteAlignment);
+  }
+  EmitEOL();
+}
+
+/// EmitLocalCommonSymbol - Emit a local common (.lcomm) symbol.
+///
+/// @param Symbol - The common symbol to emit.
+/// @param Size - The size of the common symbol.
+void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                          unsigned ByteAlign) {
+  OS << "\t.lcomm\t" << *Symbol << ',' << Size;
+  if (ByteAlign > 1) {
+    switch (MAI.getLCOMMDirectiveAlignmentType()) {
+    case LCOMM::NoAlignment:
+      llvm_unreachable("alignment not supported on .lcomm!");
+    case LCOMM::ByteAlignment:
+      OS << ',' << ByteAlign;
+      break;
+    case LCOMM::Log2Alignment:
+      assert(isPowerOf2_32(ByteAlign) && "alignment must be a power of 2");
+      OS << ',' << Log2_32(ByteAlign);
+      break;
+    }
+  }
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+                                 uint64_t Size, unsigned ByteAlignment) {
+  // Note: a .zerofill directive does not switch sections.
+  OS << ".zerofill ";
+
+  // This is a mach-o specific directive.
+  const MCSectionMachO *MOSection = ((const MCSectionMachO*)Section);
+  OS << MOSection->getSegmentName() << "," << MOSection->getSectionName();
+
+  if (Symbol != NULL) {
+    OS << ',' << *Symbol << ',' << Size;
+    if (ByteAlignment != 0)
+      OS << ',' << Log2_32(ByteAlignment);
+  }
+  EmitEOL();
+}
+
+// .tbss sym, size, align
+// This depends that the symbol has already been mangled from the original,
+// e.g. _a.
+void MCAsmStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                                   uint64_t Size, unsigned ByteAlignment) {
+  assert(Symbol != NULL && "Symbol shouldn't be NULL!");
+  // Instead of using the Section we'll just use the shortcut.
+  // This is a mach-o specific directive and section.
+  OS << ".tbss " << *Symbol << ", " << Size;
+
+  // Output align if we have it.  We default to 1 so don't bother printing
+  // that.
+  if (ByteAlignment > 1) OS << ", " << Log2_32(ByteAlignment);
+
+  EmitEOL();
+}
+
+static inline char toOctal(int X) { return (X&7)+'0'; }
+
+static void PrintQuotedString(StringRef Data, raw_ostream &OS) {
+  OS << '"';
+
+  for (unsigned i = 0, e = Data.size(); i != e; ++i) {
+    unsigned char C = Data[i];
+    if (C == '"' || C == '\\') {
+      OS << '\\' << (char)C;
+      continue;
+    }
+
+    if (isprint((unsigned char)C)) {
+      OS << (char)C;
+      continue;
+    }
+
+    switch (C) {
+      case '\b': OS << "\\b"; break;
+      case '\f': OS << "\\f"; break;
+      case '\n': OS << "\\n"; break;
+      case '\r': OS << "\\r"; break;
+      case '\t': OS << "\\t"; break;
+      default:
+        OS << '\\';
+        OS << toOctal(C >> 6);
+        OS << toOctal(C >> 3);
+        OS << toOctal(C >> 0);
+        break;
+    }
+  }
+
+  OS << '"';
+}
+
+
+void MCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
+  assert(getCurrentSection() && "Cannot emit contents before setting section!");
+  if (Data.empty()) return;
+
+  if (Data.size() == 1) {
+    OS << MAI.getData8bitsDirective(AddrSpace);
+    OS << (unsigned)(unsigned char)Data[0];
+    EmitEOL();
+    return;
+  }
+
+  // If the data ends with 0 and the target supports .asciz, use it, otherwise
+  // use .ascii
+  if (MAI.getAscizDirective() && Data.back() == 0) {
+    OS << MAI.getAscizDirective();
+    Data = Data.substr(0, Data.size()-1);
+  } else {
+    OS << MAI.getAsciiDirective();
+  }
+
+  OS << ' ';
+  PrintQuotedString(Data, OS);
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size,
+                                 unsigned AddrSpace) {
+  EmitValue(MCConstantExpr::Create(Value, getContext()), Size, AddrSpace);
+}
+
+void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+                                  unsigned AddrSpace) {
+  assert(getCurrentSection() && "Cannot emit contents before setting section!");
+  const char *Directive = 0;
+  switch (Size) {
+  default: break;
+  case 1: Directive = MAI.getData8bitsDirective(AddrSpace); break;
+  case 2: Directive = MAI.getData16bitsDirective(AddrSpace); break;
+  case 4: Directive = MAI.getData32bitsDirective(AddrSpace); break;
+  case 8:
+    Directive = MAI.getData64bitsDirective(AddrSpace);
+    // If the target doesn't support 64-bit data, emit as two 32-bit halves.
+    if (Directive) break;
+    int64_t IntValue;
+    if (!Value->EvaluateAsAbsolute(IntValue))
+      report_fatal_error("Don't know how to emit this value.");
+    if (getContext().getAsmInfo().isLittleEndian()) {
+      EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace);
+      EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace);
+    } else {
+      EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace);
+      EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace);
+    }
+    return;
+  }
+
+  assert(Directive && "Invalid size for machine code value!");
+  OS << Directive << *Value;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
+  int64_t IntValue;
+  if (Value->EvaluateAsAbsolute(IntValue)) {
+    EmitULEB128IntValue(IntValue);
+    return;
+  }
+  assert(MAI.hasLEB128() && "Cannot print a .uleb");
+  OS << ".uleb128 " << *Value;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) {
+  int64_t IntValue;
+  if (Value->EvaluateAsAbsolute(IntValue)) {
+    EmitSLEB128IntValue(IntValue);
+    return;
+  }
+  assert(MAI.hasLEB128() && "Cannot print a .sleb");
+  OS << ".sleb128 " << *Value;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitGPRel64Value(const MCExpr *Value) {
+  assert(MAI.getGPRel64Directive() != 0);
+  OS << MAI.getGPRel64Directive() << *Value;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) {
+  assert(MAI.getGPRel32Directive() != 0);
+  OS << MAI.getGPRel32Directive() << *Value;
+  EmitEOL();
+}
+
+
+/// EmitFill - Emit NumBytes bytes worth of the value specified by
+/// FillValue.  This implements directives such as '.space'.
+void MCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue,
+                             unsigned AddrSpace) {
+  if (NumBytes == 0) return;
+
+  if (AddrSpace == 0)
+    if (const char *ZeroDirective = MAI.getZeroDirective()) {
+      OS << ZeroDirective << NumBytes;
+      if (FillValue != 0)
+        OS << ',' << (int)FillValue;
+      EmitEOL();
+      return;
+    }
+
+  // Emit a byte at a time.
+  MCStreamer::EmitFill(NumBytes, FillValue, AddrSpace);
+}
+
+void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+                                         unsigned ValueSize,
+                                         unsigned MaxBytesToEmit) {
+  // Some assemblers don't support non-power of two alignments, so we always
+  // emit alignments as a power of two if possible.
+  if (isPowerOf2_32(ByteAlignment)) {
+    switch (ValueSize) {
+    default: llvm_unreachable("Invalid size for machine code value!");
+    case 1: OS << MAI.getAlignDirective(); break;
+    // FIXME: use MAI for this!
+    case 2: OS << ".p2alignw "; break;
+    case 4: OS << ".p2alignl "; break;
+    case 8: llvm_unreachable("Unsupported alignment size!");
+    }
+
+    if (MAI.getAlignmentIsInBytes())
+      OS << ByteAlignment;
+    else
+      OS << Log2_32(ByteAlignment);
+
+    if (Value || MaxBytesToEmit) {
+      OS << ", 0x";
+      OS.write_hex(truncateToSize(Value, ValueSize));
+
+      if (MaxBytesToEmit)
+        OS << ", " << MaxBytesToEmit;
+    }
+    EmitEOL();
+    return;
+  }
+
+  // Non-power of two alignment.  This is not widely supported by assemblers.
+  // FIXME: Parameterize this based on MAI.
+  switch (ValueSize) {
+  default: llvm_unreachable("Invalid size for machine code value!");
+  case 1: OS << ".balign";  break;
+  case 2: OS << ".balignw"; break;
+  case 4: OS << ".balignl"; break;
+  case 8: llvm_unreachable("Unsupported alignment size!");
+  }
+
+  OS << ' ' << ByteAlignment;
+  OS << ", " << truncateToSize(Value, ValueSize);
+  if (MaxBytesToEmit)
+    OS << ", " << MaxBytesToEmit;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCodeAlignment(unsigned ByteAlignment,
+                                      unsigned MaxBytesToEmit) {
+  // Emit with a text fill value.
+  EmitValueToAlignment(ByteAlignment, MAI.getTextAlignFillValue(),
+                       1, MaxBytesToEmit);
+}
+
+bool MCAsmStreamer::EmitValueToOffset(const MCExpr *Offset,
+                                      unsigned char Value) {
+  // FIXME: Verify that Offset is associated with the current section.
+  OS << ".org " << *Offset << ", " << (unsigned) Value;
+  EmitEOL();
+  return false;
+}
+
+
+void MCAsmStreamer::EmitFileDirective(StringRef Filename) {
+  assert(MAI.hasSingleParameterDotFile());
+  OS << "\t.file\t";
+  PrintQuotedString(Filename, OS);
+  EmitEOL();
+}
+
+bool MCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
+                                           StringRef Filename, unsigned CUID) {
+  if (!UseDwarfDirectory && !Directory.empty()) {
+    if (sys::path::is_absolute(Filename))
+      return EmitDwarfFileDirective(FileNo, "", Filename, CUID);
+
+    SmallString<128> FullPathName = Directory;
+    sys::path::append(FullPathName, Filename);
+    return EmitDwarfFileDirective(FileNo, "", FullPathName, CUID);
+  }
+
+  if (UseLoc) {
+    OS << "\t.file\t" << FileNo << ' ';
+    if (!Directory.empty()) {
+      PrintQuotedString(Directory, OS);
+      OS << ' ';
+    }
+    PrintQuotedString(Filename, OS);
+    EmitEOL();
+    // All .file will belong to a single CUID.
+    CUID = 0;
+  }
+  return this->MCStreamer::EmitDwarfFileDirective(FileNo, Directory, Filename,
+                                                  CUID);
+}
+
+void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
+                                          unsigned Column, unsigned Flags,
+                                          unsigned Isa,
+                                          unsigned Discriminator,
+                                          StringRef FileName) {
+  this->MCStreamer::EmitDwarfLocDirective(FileNo, Line, Column, Flags,
+                                          Isa, Discriminator, FileName);
+  if (!UseLoc)
+    return;
+
+  OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
+  if (Flags & DWARF2_FLAG_BASIC_BLOCK)
+    OS << " basic_block";
+  if (Flags & DWARF2_FLAG_PROLOGUE_END)
+    OS << " prologue_end";
+  if (Flags & DWARF2_FLAG_EPILOGUE_BEGIN)
+    OS << " epilogue_begin";
+
+  unsigned OldFlags = getContext().getCurrentDwarfLoc().getFlags();
+  if ((Flags & DWARF2_FLAG_IS_STMT) != (OldFlags & DWARF2_FLAG_IS_STMT)) {
+    OS << " is_stmt ";
+
+    if (Flags & DWARF2_FLAG_IS_STMT)
+      OS << "1";
+    else
+      OS << "0";
+  }
+
+  if (Isa)
+    OS << "isa " << Isa;
+  if (Discriminator)
+    OS << "discriminator " << Discriminator;
+
+  if (IsVerboseAsm) {
+    OS.PadToColumn(MAI.getCommentColumn());
+    OS << MAI.getCommentString() << ' ' << FileName << ':'
+       << Line << ':' << Column;
+  }
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) {
+  MCStreamer::EmitCFISections(EH, Debug);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_sections ";
+  if (EH) {
+    OS << ".eh_frame";
+    if (Debug)
+      OS << ", .debug_frame";
+  } else if (Debug) {
+    OS << ".debug_frame";
+  }
+
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+  if (!UseCFI) {
+    RecordProcStart(Frame);
+    return;
+  }
+
+  OS << "\t.cfi_startproc";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+  if (!UseCFI) {
+    RecordProcEnd(Frame);
+    return;
+  }
+
+  // Put a dummy non-null value in Frame.End to mark that this frame has been
+  // closed.
+  Frame.End = (MCSymbol *) 1;
+
+  OS << "\t.cfi_endproc";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitRegisterName(int64_t Register) {
+  if (InstPrinter && !MAI.useDwarfRegNumForCFI()) {
+    const MCRegisterInfo &MRI = getContext().getRegisterInfo();
+    unsigned LLVMRegister = MRI.getLLVMRegNum(Register, true);
+    InstPrinter->printRegName(OS, LLVMRegister);
+  } else {
+    OS << Register;
+  }
+}
+
+void MCAsmStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
+  MCStreamer::EmitCFIDefCfa(Register, Offset);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_def_cfa ";
+  EmitRegisterName(Register);
+  OS << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
+  MCStreamer::EmitCFIDefCfaOffset(Offset);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_def_cfa_offset " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) {
+  MCStreamer::EmitCFIDefCfaRegister(Register);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_def_cfa_register ";
+  EmitRegisterName(Register);
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
+  this->MCStreamer::EmitCFIOffset(Register, Offset);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_offset ";
+  EmitRegisterName(Register);
+  OS << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIPersonality(const MCSymbol *Sym,
+                                       unsigned Encoding) {
+  MCStreamer::EmitCFIPersonality(Sym, Encoding);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_personality " << Encoding << ", " << *Sym;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
+  MCStreamer::EmitCFILsda(Sym, Encoding);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_lsda " << Encoding << ", " << *Sym;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIRememberState() {
+  MCStreamer::EmitCFIRememberState();
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_remember_state";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIRestoreState() {
+  MCStreamer::EmitCFIRestoreState();
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_restore_state";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFISameValue(int64_t Register) {
+  MCStreamer::EmitCFISameValue(Register);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_same_value ";
+  EmitRegisterName(Register);
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) {
+  MCStreamer::EmitCFIRelOffset(Register, Offset);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_rel_offset ";
+  EmitRegisterName(Register);
+  OS << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) {
+  MCStreamer::EmitCFIAdjustCfaOffset(Adjustment);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_adjust_cfa_offset " << Adjustment;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFISignalFrame() {
+  MCStreamer::EmitCFISignalFrame();
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_signal_frame";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIUndefined(int64_t Register) {
+  MCStreamer::EmitCFIUndefined(Register);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_undefined " << Register;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
+  MCStreamer::EmitCFIRegister(Register1, Register2);
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_register " << Register1 << ", " << Register2;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
+  MCStreamer::EmitWin64EHStartProc(Symbol);
+
+  OS << ".seh_proc " << *Symbol;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHEndProc() {
+  MCStreamer::EmitWin64EHEndProc();
+
+  OS << "\t.seh_endproc";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHStartChained() {
+  MCStreamer::EmitWin64EHStartChained();
+
+  OS << "\t.seh_startchained";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHEndChained() {
+  MCStreamer::EmitWin64EHEndChained();
+
+  OS << "\t.seh_endchained";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                       bool Except) {
+  MCStreamer::EmitWin64EHHandler(Sym, Unwind, Except);
+
+  OS << "\t.seh_handler " << *Sym;
+  if (Unwind)
+    OS << ", @unwind";
+  if (Except)
+    OS << ", @except";
+  EmitEOL();
+}
+
+static const MCSection *getWin64EHTableSection(StringRef suffix,
+                                               MCContext &context) {
+  // FIXME: This doesn't belong in MCObjectFileInfo. However,
+  /// this duplicate code in MCWin64EH.cpp.
+  if (suffix == "")
+    return context.getObjectFileInfo()->getXDataSection();
+  return context.getCOFFSection((".xdata"+suffix).str(),
+                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_MEM_READ |
+                                COFF::IMAGE_SCN_MEM_WRITE,
+                                SectionKind::getDataRel());
+}
+
+void MCAsmStreamer::EmitWin64EHHandlerData() {
+  MCStreamer::EmitWin64EHHandlerData();
+
+  // Switch sections. Don't call SwitchSection directly, because that will
+  // cause the section switch to be visible in the emitted assembly.
+  // We only do this so the section switch that terminates the handler
+  // data block is visible.
+  MCWin64EHUnwindInfo *CurFrame = getCurrentW64UnwindInfo();
+  StringRef suffix=MCWin64EHUnwindEmitter::GetSectionSuffix(CurFrame->Function);
+  const MCSection *xdataSect = getWin64EHTableSection(suffix, getContext());
+  if (xdataSect)
+    SwitchSectionNoChange(xdataSect);
+
+  OS << "\t.seh_handlerdata";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHPushReg(unsigned Register) {
+  MCStreamer::EmitWin64EHPushReg(Register);
+
+  OS << "\t.seh_pushreg " << Register;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
+  MCStreamer::EmitWin64EHSetFrame(Register, Offset);
+
+  OS << "\t.seh_setframe " << Register << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHAllocStack(unsigned Size) {
+  MCStreamer::EmitWin64EHAllocStack(Size);
+
+  OS << "\t.seh_stackalloc " << Size;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) {
+  MCStreamer::EmitWin64EHSaveReg(Register, Offset);
+
+  OS << "\t.seh_savereg " << Register << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) {
+  MCStreamer::EmitWin64EHSaveXMM(Register, Offset);
+
+  OS << "\t.seh_savexmm " << Register << ", " << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHPushFrame(bool Code) {
+  MCStreamer::EmitWin64EHPushFrame(Code);
+
+  OS << "\t.seh_pushframe";
+  if (Code)
+    OS << " @code";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitWin64EHEndProlog(void) {
+  MCStreamer::EmitWin64EHEndProlog();
+
+  OS << "\t.seh_endprologue";
+  EmitEOL();
+}
+
+void MCAsmStreamer::AddEncodingComment(const MCInst &Inst) {
+  raw_ostream &OS = GetCommentOS();
+  SmallString<256> Code;
+  SmallVector<MCFixup, 4> Fixups;
+  raw_svector_ostream VecOS(Code);
+  Emitter->EncodeInstruction(Inst, VecOS, Fixups);
+  VecOS.flush();
+
+  // If we are showing fixups, create symbolic markers in the encoded
+  // representation. We do this by making a per-bit map to the fixup item index,
+  // then trying to display it as nicely as possible.
+  SmallVector<uint8_t, 64> FixupMap;
+  FixupMap.resize(Code.size() * 8);
+  for (unsigned i = 0, e = Code.size() * 8; i != e; ++i)
+    FixupMap[i] = 0;
+
+  for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
+    MCFixup &F = Fixups[i];
+    const MCFixupKindInfo &Info = AsmBackend->getFixupKindInfo(F.getKind());
+    for (unsigned j = 0; j != Info.TargetSize; ++j) {
+      unsigned Index = F.getOffset() * 8 + Info.TargetOffset + j;
+      assert(Index < Code.size() * 8 && "Invalid offset in fixup!");
+      FixupMap[Index] = 1 + i;
+    }
+  }
+
+  // FIXME: Note the fixup comments for Thumb2 are completely bogus since the
+  // high order halfword of a 32-bit Thumb2 instruction is emitted first.
+  OS << "encoding: [";
+  for (unsigned i = 0, e = Code.size(); i != e; ++i) {
+    if (i)
+      OS << ',';
+
+    // See if all bits are the same map entry.
+    uint8_t MapEntry = FixupMap[i * 8 + 0];
+    for (unsigned j = 1; j != 8; ++j) {
+      if (FixupMap[i * 8 + j] == MapEntry)
+        continue;
+
+      MapEntry = uint8_t(~0U);
+      break;
+    }
+
+    if (MapEntry != uint8_t(~0U)) {
+      if (MapEntry == 0) {
+        OS << format("0x%02x", uint8_t(Code[i]));
+      } else {
+        if (Code[i]) {
+          // FIXME: Some of the 8 bits require fix up.
+          OS << format("0x%02x", uint8_t(Code[i])) << '\''
+             << char('A' + MapEntry - 1) << '\'';
+        } else
+          OS << char('A' + MapEntry - 1);
+      }
+    } else {
+      // Otherwise, write out in binary.
+      OS << "0b";
+      for (unsigned j = 8; j--;) {
+        unsigned Bit = (Code[i] >> j) & 1;
+
+        unsigned FixupBit;
+        if (getContext().getAsmInfo().isLittleEndian())
+          FixupBit = i * 8 + j;
+        else
+          FixupBit = i * 8 + (7-j);
+
+        if (uint8_t MapEntry = FixupMap[FixupBit]) {
+          assert(Bit == 0 && "Encoder wrote into fixed up bit!");
+          OS << char('A' + MapEntry - 1);
+        } else
+          OS << Bit;
+      }
+    }
+  }
+  OS << "]\n";
+
+  for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
+    MCFixup &F = Fixups[i];
+    const MCFixupKindInfo &Info = AsmBackend->getFixupKindInfo(F.getKind());
+    OS << "  fixup " << char('A' + i) << " - " << "offset: " << F.getOffset()
+       << ", value: " << *F.getValue() << ", kind: " << Info.Name << "\n";
+  }
+}
+
+void MCAsmStreamer::EmitFnStart() {
+  OS << "\t.fnstart";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitFnEnd() {
+  OS << "\t.fnend";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitCantUnwind() {
+  OS << "\t.cantunwind";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitHandlerData() {
+  OS << "\t.handlerdata";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitPersonality(const MCSymbol *Personality) {
+  OS << "\t.personality " << Personality->getName();
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
+  OS << "\t.setfp\t";
+  InstPrinter->printRegName(OS, FpReg);
+  OS << ", ";
+  InstPrinter->printRegName(OS, SpReg);
+  if (Offset)
+    OS << ", #" << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitPad(int64_t Offset) {
+  OS << "\t.pad\t#" << Offset;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                bool isVector) {
+  assert(RegList.size() && "RegList should not be empty");
+  if (isVector)
+    OS << "\t.vsave\t{";
+  else
+    OS << "\t.save\t{";
+
+  InstPrinter->printRegName(OS, RegList[0]);
+
+  for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
+    OS << ", ";
+    InstPrinter->printRegName(OS, RegList[i]);
+  }
+
+  OS << "}";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitTCEntry(const MCSymbol &S) {
+  OS << "\t.tc ";
+  OS << S.getName();
+  OS << "[TC],";
+  OS << S.getName();
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitInstruction(const MCInst &Inst) {
+  assert(getCurrentSection() && "Cannot emit contents before setting section!");
+
+  // Show the encoding in a comment if we have a code emitter.
+  if (Emitter)
+    AddEncodingComment(Inst);
+
+  // Show the MCInst if enabled.
+  if (ShowInst) {
+    Inst.dump_pretty(GetCommentOS(), &MAI, InstPrinter.get(), "\n ");
+    GetCommentOS() << "\n";
+  }
+
+  // If we have an AsmPrinter, use that to print, otherwise print the MCInst.
+  if (InstPrinter)
+    InstPrinter->printInst(&Inst, OS, "");
+  else
+    Inst.print(OS, &MAI);
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitBundleAlignMode(unsigned AlignPow2) {
+  OS << "\t.bundle_align_mode " << AlignPow2;
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitBundleLock(bool AlignToEnd) {
+  OS << "\t.bundle_lock";
+  if (AlignToEnd)
+    OS << " align_to_end";
+  EmitEOL();
+}
+
+void MCAsmStreamer::EmitBundleUnlock() {
+  OS << "\t.bundle_unlock";
+  EmitEOL();
+}
+
+/// EmitRawText - If this file is backed by an assembly streamer, this dumps
+/// the specified string in the output .s file.  This capability is
+/// indicated by the hasRawTextSupport() predicate.
+void MCAsmStreamer::EmitRawText(StringRef String) {
+  if (!String.empty() && String.back() == '\n')
+    String = String.substr(0, String.size()-1);
+  OS << String;
+  EmitEOL();
+}
+
+void MCAsmStreamer::FinishImpl() {
+  // FIXME: This header is duplicated with MCObjectStreamer
+  // Dump out the dwarf file & directory tables and line tables.
+  const MCSymbol *LineSectionSymbol = NULL;
+  if (getContext().hasDwarfFiles() && !UseLoc)
+    LineSectionSymbol = MCDwarfFileTable::Emit(this);
+
+  // If we are generating dwarf for assembly source files dump out the sections.
+  if (getContext().getGenDwarfForAssembly())
+    MCGenDwarfInfo::Emit(this, LineSectionSymbol);
+
+  if (!UseCFI)
+    EmitFrames(false);
+}
+MCStreamer *llvm::createAsmStreamer(MCContext &Context,
+                                    formatted_raw_ostream &OS,
+                                    bool isVerboseAsm, bool useLoc,
+                                    bool useCFI, bool useDwarfDirectory,
+                                    MCInstPrinter *IP, MCCodeEmitter *CE,
+                                    MCAsmBackend *MAB, bool ShowInst) {
+  return new MCAsmStreamer(Context, OS, isVerboseAsm, useLoc, useCFI,
+                           useDwarfDirectory, IP, CE, MAB, ShowInst);
+}
diff --git a/contrib/llvm/lib/MC/MCAssembler.cpp b/contrib/llvm/lib/MC/MCAssembler.cpp
new file mode 100644
index 000000000000..1829266f96cb
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCAssembler.cpp
@@ -0,0 +1,1154 @@
+//===- lib/MC/MCAssembler.cpp - Assembler Backend Implementation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "assembler"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+namespace stats {
+STATISTIC(EmittedFragments, "Number of emitted assembler fragments - total");
+STATISTIC(EmittedRelaxableFragments,
+          "Number of emitted assembler fragments - relaxable");
+STATISTIC(EmittedDataFragments,
+          "Number of emitted assembler fragments - data");
+STATISTIC(EmittedCompactEncodedInstFragments,
+          "Number of emitted assembler fragments - compact encoded inst");
+STATISTIC(EmittedAlignFragments,
+          "Number of emitted assembler fragments - align");
+STATISTIC(EmittedFillFragments,
+          "Number of emitted assembler fragments - fill");
+STATISTIC(EmittedOrgFragments,
+          "Number of emitted assembler fragments - org");
+STATISTIC(evaluateFixup, "Number of evaluated fixups");
+STATISTIC(FragmentLayouts, "Number of fragment layouts");
+STATISTIC(ObjectBytes, "Number of emitted object file bytes");
+STATISTIC(RelaxationSteps, "Number of assembler layout and relaxation steps");
+STATISTIC(RelaxedInstructions, "Number of relaxed instructions");
+}
+}
+
+// FIXME FIXME FIXME: There are number of places in this file where we convert
+// what is a 64-bit assembler value used for computation into a value in the
+// object file, which may truncate it. We should detect that truncation where
+// invalid and report errors back.
+
+/* *** */
+
+MCAsmLayout::MCAsmLayout(MCAssembler &Asm)
+  : Assembler(Asm), LastValidFragment()
+ {
+  // Compute the section layout order. Virtual sections must go last.
+  for (MCAssembler::iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it)
+    if (!it->getSection().isVirtualSection())
+      SectionOrder.push_back(&*it);
+  for (MCAssembler::iterator it = Asm.begin(), ie = Asm.end(); it != ie; ++it)
+    if (it->getSection().isVirtualSection())
+      SectionOrder.push_back(&*it);
+}
+
+bool MCAsmLayout::isFragmentValid(const MCFragment *F) const {
+  const MCSectionData &SD = *F->getParent();
+  const MCFragment *LastValid = LastValidFragment.lookup(&SD);
+  if (!LastValid)
+    return false;
+  assert(LastValid->getParent() == F->getParent());
+  return F->getLayoutOrder() <= LastValid->getLayoutOrder();
+}
+
+void MCAsmLayout::invalidateFragmentsFrom(MCFragment *F) {
+  // If this fragment wasn't already valid, we don't need to do anything.
+  if (!isFragmentValid(F))
+    return;
+
+  // Otherwise, reset the last valid fragment to the previous fragment
+  // (if this is the first fragment, it will be NULL).
+  const MCSectionData &SD = *F->getParent();
+  LastValidFragment[&SD] = F->getPrevNode();
+}
+
+void MCAsmLayout::ensureValid(const MCFragment *F) const {
+  MCSectionData &SD = *F->getParent();
+
+  MCFragment *Cur = LastValidFragment[&SD];
+  if (!Cur)
+    Cur = &*SD.begin();
+  else
+    Cur = Cur->getNextNode();
+
+  // Advance the layout position until the fragment is valid.
+  while (!isFragmentValid(F)) {
+    assert(Cur && "Layout bookkeeping error");
+    const_cast<MCAsmLayout*>(this)->layoutFragment(Cur);
+    Cur = Cur->getNextNode();
+  }
+}
+
+uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const {
+  ensureValid(F);
+  assert(F->Offset != ~UINT64_C(0) && "Address not set!");
+  return F->Offset;
+}
+
+uint64_t MCAsmLayout::getSymbolOffset(const MCSymbolData *SD) const {
+  const MCSymbol &S = SD->getSymbol();
+
+  // If this is a variable, then recursively evaluate now.
+  if (S.isVariable()) {
+    MCValue Target;
+    if (!S.getVariableValue()->EvaluateAsRelocatable(Target, *this))
+      report_fatal_error("unable to evaluate offset for variable '" +
+                         S.getName() + "'");
+
+    // Verify that any used symbols are defined.
+    if (Target.getSymA() && Target.getSymA()->getSymbol().isUndefined())
+      report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                         Target.getSymA()->getSymbol().getName() + "'");
+    if (Target.getSymB() && Target.getSymB()->getSymbol().isUndefined())
+      report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                         Target.getSymB()->getSymbol().getName() + "'");
+
+    uint64_t Offset = Target.getConstant();
+    if (Target.getSymA())
+      Offset += getSymbolOffset(&Assembler.getSymbolData(
+                                  Target.getSymA()->getSymbol()));
+    if (Target.getSymB())
+      Offset -= getSymbolOffset(&Assembler.getSymbolData(
+                                  Target.getSymB()->getSymbol()));
+    return Offset;
+  }
+
+  assert(SD->getFragment() && "Invalid getOffset() on undefined symbol!");
+  return getFragmentOffset(SD->getFragment()) + SD->getOffset();
+}
+
+uint64_t MCAsmLayout::getSectionAddressSize(const MCSectionData *SD) const {
+  // The size is the last fragment's end offset.
+  const MCFragment &F = SD->getFragmentList().back();
+  return getFragmentOffset(&F) + getAssembler().computeFragmentSize(*this, F);
+}
+
+uint64_t MCAsmLayout::getSectionFileSize(const MCSectionData *SD) const {
+  // Virtual sections have no file size.
+  if (SD->getSection().isVirtualSection())
+    return 0;
+
+  // Otherwise, the file size is the same as the address space size.
+  return getSectionAddressSize(SD);
+}
+
+uint64_t MCAsmLayout::computeBundlePadding(const MCFragment *F,
+                                           uint64_t FOffset, uint64_t FSize) {
+  uint64_t BundleSize = Assembler.getBundleAlignSize();
+  assert(BundleSize > 0 &&
+         "computeBundlePadding should only be called if bundling is enabled");
+  uint64_t BundleMask = BundleSize - 1;
+  uint64_t OffsetInBundle = FOffset & BundleMask;
+  uint64_t EndOfFragment = OffsetInBundle + FSize;
+
+  // There are two kinds of bundling restrictions:
+  //
+  // 1) For alignToBundleEnd(), add padding to ensure that the fragment will
+  //    *end* on a bundle boundary.
+  // 2) Otherwise, check if the fragment would cross a bundle boundary. If it
+  //    would, add padding until the end of the bundle so that the fragment
+  //    will start in a new one.
+  if (F->alignToBundleEnd()) {
+    // Three possibilities here:
+    //
+    // A) The fragment just happens to end at a bundle boundary, so we're good.
+    // B) The fragment ends before the current bundle boundary: pad it just
+    //    enough to reach the boundary.
+    // C) The fragment ends after the current bundle boundary: pad it until it
+    //    reaches the end of the next bundle boundary.
+    //
+    // Note: this code could be made shorter with some modulo trickery, but it's
+    // intentionally kept in its more explicit form for simplicity.
+    if (EndOfFragment == BundleSize)
+      return 0;
+    else if (EndOfFragment < BundleSize)
+      return BundleSize - EndOfFragment;
+    else { // EndOfFragment > BundleSize
+      return 2 * BundleSize - EndOfFragment;
+    }
+  } else if (EndOfFragment > BundleSize)
+    return BundleSize - OffsetInBundle;
+  else
+    return 0;
+}
+
+/* *** */
+
+MCFragment::MCFragment() : Kind(FragmentType(~0)) {
+}
+
+MCFragment::~MCFragment() {
+}
+
+MCFragment::MCFragment(FragmentType _Kind, MCSectionData *_Parent)
+  : Kind(_Kind), Parent(_Parent), Atom(0), Offset(~UINT64_C(0))
+{
+  if (Parent)
+    Parent->getFragmentList().push_back(this);
+}
+
+/* *** */
+
+MCEncodedFragment::~MCEncodedFragment() {
+}
+
+/* *** */
+
+MCEncodedFragmentWithFixups::~MCEncodedFragmentWithFixups() {
+}
+
+/* *** */
+
+MCSectionData::MCSectionData() : Section(0) {}
+
+MCSectionData::MCSectionData(const MCSection &_Section, MCAssembler *A)
+  : Section(&_Section),
+    Ordinal(~UINT32_C(0)),
+    Alignment(1),
+    BundleLockState(NotBundleLocked), BundleGroupBeforeFirstInst(false),
+    HasInstructions(false)
+{
+  if (A)
+    A->getSectionList().push_back(this);
+}
+
+/* *** */
+
+MCSymbolData::MCSymbolData() : Symbol(0) {}
+
+MCSymbolData::MCSymbolData(const MCSymbol &_Symbol, MCFragment *_Fragment,
+                           uint64_t _Offset, MCAssembler *A)
+  : Symbol(&_Symbol), Fragment(_Fragment), Offset(_Offset),
+    IsExternal(false), IsPrivateExtern(false),
+    CommonSize(0), SymbolSize(0), CommonAlign(0),
+    Flags(0), Index(0)
+{
+  if (A)
+    A->getSymbolList().push_back(this);
+}
+
+/* *** */
+
+MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_,
+                         MCCodeEmitter &Emitter_, MCObjectWriter &Writer_,
+                         raw_ostream &OS_)
+  : Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(Writer_),
+    OS(OS_), BundleAlignSize(0), RelaxAll(false), NoExecStack(false),
+    SubsectionsViaSymbols(false), ELFHeaderEFlags(0) {
+}
+
+MCAssembler::~MCAssembler() {
+}
+
+void MCAssembler::reset() {
+  Sections.clear();
+  Symbols.clear();
+  SectionMap.clear();
+  SymbolMap.clear();
+  IndirectSymbols.clear();
+  DataRegions.clear();
+  ThumbFuncs.clear();
+  RelaxAll = false;
+  NoExecStack = false;
+  SubsectionsViaSymbols = false;
+  ELFHeaderEFlags = 0;
+
+  // reset objects owned by us
+  getBackend().reset();
+  getEmitter().reset();
+  getWriter().reset();
+}
+
+bool MCAssembler::isSymbolLinkerVisible(const MCSymbol &Symbol) const {
+  // Non-temporary labels should always be visible to the linker.
+  if (!Symbol.isTemporary())
+    return true;
+
+  // Absolute temporary labels are never visible.
+  if (!Symbol.isInSection())
+    return false;
+
+  // Otherwise, check if the section requires symbols even for temporary labels.
+  return getBackend().doesSectionRequireSymbols(Symbol.getSection());
+}
+
+const MCSymbolData *MCAssembler::getAtom(const MCSymbolData *SD) const {
+  // Linker visible symbols define atoms.
+  if (isSymbolLinkerVisible(SD->getSymbol()))
+    return SD;
+
+  // Absolute and undefined symbols have no defining atom.
+  if (!SD->getFragment())
+    return 0;
+
+  // Non-linker visible symbols in sections which can't be atomized have no
+  // defining atom.
+  if (!getBackend().isSectionAtomizable(
+        SD->getFragment()->getParent()->getSection()))
+    return 0;
+
+  // Otherwise, return the atom for the containing fragment.
+  return SD->getFragment()->getAtom();
+}
+
+bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
+                                const MCFixup &Fixup, const MCFragment *DF,
+                                MCValue &Target, uint64_t &Value) const {
+  ++stats::evaluateFixup;
+
+  if (!Fixup.getValue()->EvaluateAsRelocatable(Target, Layout))
+    getContext().FatalError(Fixup.getLoc(), "expected relocatable expression");
+
+  bool IsPCRel = Backend.getFixupKindInfo(
+    Fixup.getKind()).Flags & MCFixupKindInfo::FKF_IsPCRel;
+
+  bool IsResolved;
+  if (IsPCRel) {
+    if (Target.getSymB()) {
+      IsResolved = false;
+    } else if (!Target.getSymA()) {
+      IsResolved = false;
+    } else {
+      const MCSymbolRefExpr *A = Target.getSymA();
+      const MCSymbol &SA = A->getSymbol();
+      if (A->getKind() != MCSymbolRefExpr::VK_None ||
+          SA.AliasedSymbol().isUndefined()) {
+        IsResolved = false;
+      } else {
+        const MCSymbolData &DataA = getSymbolData(SA);
+        IsResolved =
+          getWriter().IsSymbolRefDifferenceFullyResolvedImpl(*this, DataA,
+                                                             *DF, false, true);
+      }
+    }
+  } else {
+    IsResolved = Target.isAbsolute();
+  }
+
+  Value = Target.getConstant();
+
+  if (const MCSymbolRefExpr *A = Target.getSymA()) {
+    const MCSymbol &Sym = A->getSymbol().AliasedSymbol();
+    if (Sym.isDefined())
+      Value += Layout.getSymbolOffset(&getSymbolData(Sym));
+  }
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    const MCSymbol &Sym = B->getSymbol().AliasedSymbol();
+    if (Sym.isDefined())
+      Value -= Layout.getSymbolOffset(&getSymbolData(Sym));
+  }
+
+
+  bool ShouldAlignPC = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
+                         MCFixupKindInfo::FKF_IsAlignedDownTo32Bits;
+  assert((ShouldAlignPC ? IsPCRel : true) &&
+    "FKF_IsAlignedDownTo32Bits is only allowed on PC-relative fixups!");
+
+  if (IsPCRel) {
+    uint32_t Offset = Layout.getFragmentOffset(DF) + Fixup.getOffset();
+
+    // A number of ARM fixups in Thumb mode require that the effective PC
+    // address be determined as the 32-bit aligned version of the actual offset.
+    if (ShouldAlignPC) Offset &= ~0x3;
+    Value -= Offset;
+  }
+
+  // Let the backend adjust the fixup value if necessary, including whether
+  // we need a relocation.
+  Backend.processFixupValue(*this, Layout, Fixup, DF, Target, Value,
+                            IsResolved);
+
+  return IsResolved;
+}
+
+uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
+                                          const MCFragment &F) const {
+  switch (F.getKind()) {
+  case MCFragment::FT_Data:
+  case MCFragment::FT_Relaxable:
+  case MCFragment::FT_CompactEncodedInst:
+    return cast<MCEncodedFragment>(F).getContents().size();
+  case MCFragment::FT_Fill:
+    return cast<MCFillFragment>(F).getSize();
+
+  case MCFragment::FT_LEB:
+    return cast<MCLEBFragment>(F).getContents().size();
+
+  case MCFragment::FT_Align: {
+    const MCAlignFragment &AF = cast<MCAlignFragment>(F);
+    unsigned Offset = Layout.getFragmentOffset(&AF);
+    unsigned Size = OffsetToAlignment(Offset, AF.getAlignment());
+    // If we are padding with nops, force the padding to be larger than the
+    // minimum nop size.
+    if (Size > 0 && AF.hasEmitNops()) {
+      while (Size % getBackend().getMinimumNopSize())
+        Size += AF.getAlignment();
+    }
+    if (Size > AF.getMaxBytesToEmit())
+      return 0;
+    return Size;
+  }
+
+  case MCFragment::FT_Org: {
+    const MCOrgFragment &OF = cast<MCOrgFragment>(F);
+    int64_t TargetLocation;
+    if (!OF.getOffset().EvaluateAsAbsolute(TargetLocation, Layout))
+      report_fatal_error("expected assembly-time absolute expression");
+
+    // FIXME: We need a way to communicate this error.
+    uint64_t FragmentOffset = Layout.getFragmentOffset(&OF);
+    int64_t Size = TargetLocation - FragmentOffset;
+    if (Size < 0 || Size >= 0x40000000)
+      report_fatal_error("invalid .org offset '" + Twine(TargetLocation) +
+                         "' (at offset '" + Twine(FragmentOffset) + "')");
+    return Size;
+  }
+
+  case MCFragment::FT_Dwarf:
+    return cast<MCDwarfLineAddrFragment>(F).getContents().size();
+  case MCFragment::FT_DwarfFrame:
+    return cast<MCDwarfCallFrameFragment>(F).getContents().size();
+  }
+
+  llvm_unreachable("invalid fragment kind");
+}
+
+void MCAsmLayout::layoutFragment(MCFragment *F) {
+  MCFragment *Prev = F->getPrevNode();
+
+  // We should never try to recompute something which is valid.
+  assert(!isFragmentValid(F) && "Attempt to recompute a valid fragment!");
+  // We should never try to compute the fragment layout if its predecessor
+  // isn't valid.
+  assert((!Prev || isFragmentValid(Prev)) &&
+         "Attempt to compute fragment before its predecessor!");
+
+  ++stats::FragmentLayouts;
+
+  // Compute fragment offset and size.
+  if (Prev)
+    F->Offset = Prev->Offset + getAssembler().computeFragmentSize(*this, *Prev);
+  else
+    F->Offset = 0;
+  LastValidFragment[F->getParent()] = F;
+
+  // If bundling is enabled and this fragment has instructions in it, it has to
+  // obey the bundling restrictions. With padding, we'll have:
+  //
+  //
+  //        BundlePadding
+  //             |||
+  // -------------------------------------
+  //   Prev  |##########|       F        |
+  // -------------------------------------
+  //                    ^
+  //                    |
+  //                    F->Offset
+  //
+  // The fragment's offset will point to after the padding, and its computed
+  // size won't include the padding.
+  //
+  if (Assembler.isBundlingEnabled() && F->hasInstructions()) {
+    assert(isa<MCEncodedFragment>(F) &&
+           "Only MCEncodedFragment implementations have instructions");
+    uint64_t FSize = Assembler.computeFragmentSize(*this, *F);
+
+    if (FSize > Assembler.getBundleAlignSize())
+      report_fatal_error("Fragment can't be larger than a bundle size");
+
+    uint64_t RequiredBundlePadding = computeBundlePadding(F, F->Offset, FSize);
+    if (RequiredBundlePadding > UINT8_MAX)
+      report_fatal_error("Padding cannot exceed 255 bytes");
+    F->setBundlePadding(static_cast<uint8_t>(RequiredBundlePadding));
+    F->Offset += RequiredBundlePadding;
+  }
+}
+
+/// \brief Write the contents of a fragment to the given object writer. Expects
+///        a MCEncodedFragment.
+static void writeFragmentContents(const MCFragment &F, MCObjectWriter *OW) {
+  const MCEncodedFragment &EF = cast<MCEncodedFragment>(F);
+  OW->WriteBytes(EF.getContents());
+}
+
+/// \brief Write the fragment \p F to the output file.
+static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                          const MCFragment &F) {
+  MCObjectWriter *OW = &Asm.getWriter();
+
+  // FIXME: Embed in fragments instead?
+  uint64_t FragmentSize = Asm.computeFragmentSize(Layout, F);
+
+  // Should NOP padding be written out before this fragment?
+  unsigned BundlePadding = F.getBundlePadding();
+  if (BundlePadding > 0) {
+    assert(Asm.isBundlingEnabled() &&
+           "Writing bundle padding with disabled bundling");
+    assert(F.hasInstructions() &&
+           "Writing bundle padding for a fragment without instructions");
+
+    unsigned TotalLength = BundlePadding + static_cast<unsigned>(FragmentSize);
+    if (F.alignToBundleEnd() && TotalLength > Asm.getBundleAlignSize()) {
+      // If the padding itself crosses a bundle boundary, it must be emitted
+      // in 2 pieces, since even nop instructions must not cross boundaries.
+      //             v--------------v   <- BundleAlignSize
+      //        v---------v             <- BundlePadding
+      // ----------------------------
+      // | Prev |####|####|    F    |
+      // ----------------------------
+      //        ^-------------------^   <- TotalLength
+      unsigned DistanceToBoundary = TotalLength - Asm.getBundleAlignSize();
+      if (!Asm.getBackend().writeNopData(DistanceToBoundary, OW))
+          report_fatal_error("unable to write NOP sequence of " +
+                             Twine(DistanceToBoundary) + " bytes");
+      BundlePadding -= DistanceToBoundary;
+    }
+    if (!Asm.getBackend().writeNopData(BundlePadding, OW))
+      report_fatal_error("unable to write NOP sequence of " +
+                         Twine(BundlePadding) + " bytes");
+  }
+
+  // This variable (and its dummy usage) is to participate in the assert at
+  // the end of the function.
+  uint64_t Start = OW->getStream().tell();
+  (void) Start;
+
+  ++stats::EmittedFragments;
+
+  switch (F.getKind()) {
+  case MCFragment::FT_Align: {
+    ++stats::EmittedAlignFragments;
+    const MCAlignFragment &AF = cast<MCAlignFragment>(F);
+    uint64_t Count = FragmentSize / AF.getValueSize();
+
+    assert(AF.getValueSize() && "Invalid virtual align in concrete fragment!");
+
+    // FIXME: This error shouldn't actually occur (the front end should emit
+    // multiple .align directives to enforce the semantics it wants), but is
+    // severe enough that we want to report it. How to handle this?
+    if (Count * AF.getValueSize() != FragmentSize)
+      report_fatal_error("undefined .align directive, value size '" +
+                        Twine(AF.getValueSize()) +
+                        "' is not a divisor of padding size '" +
+                        Twine(FragmentSize) + "'");
+
+    // See if we are aligning with nops, and if so do that first to try to fill
+    // the Count bytes.  Then if that did not fill any bytes or there are any
+    // bytes left to fill use the Value and ValueSize to fill the rest.
+    // If we are aligning with nops, ask that target to emit the right data.
+    if (AF.hasEmitNops()) {
+      if (!Asm.getBackend().writeNopData(Count, OW))
+        report_fatal_error("unable to write nop sequence of " +
+                          Twine(Count) + " bytes");
+      break;
+    }
+
+    // Otherwise, write out in multiples of the value size.
+    for (uint64_t i = 0; i != Count; ++i) {
+      switch (AF.getValueSize()) {
+      default: llvm_unreachable("Invalid size!");
+      case 1: OW->Write8 (uint8_t (AF.getValue())); break;
+      case 2: OW->Write16(uint16_t(AF.getValue())); break;
+      case 4: OW->Write32(uint32_t(AF.getValue())); break;
+      case 8: OW->Write64(uint64_t(AF.getValue())); break;
+      }
+    }
+    break;
+  }
+
+  case MCFragment::FT_Data: 
+    ++stats::EmittedDataFragments;
+    writeFragmentContents(F, OW);
+    break;
+
+  case MCFragment::FT_Relaxable:
+    ++stats::EmittedRelaxableFragments;
+    writeFragmentContents(F, OW);
+    break;
+
+  case MCFragment::FT_CompactEncodedInst:
+    ++stats::EmittedCompactEncodedInstFragments;
+    writeFragmentContents(F, OW);
+    break;
+
+  case MCFragment::FT_Fill: {
+    ++stats::EmittedFillFragments;
+    const MCFillFragment &FF = cast<MCFillFragment>(F);
+
+    assert(FF.getValueSize() && "Invalid virtual align in concrete fragment!");
+
+    for (uint64_t i = 0, e = FF.getSize() / FF.getValueSize(); i != e; ++i) {
+      switch (FF.getValueSize()) {
+      default: llvm_unreachable("Invalid size!");
+      case 1: OW->Write8 (uint8_t (FF.getValue())); break;
+      case 2: OW->Write16(uint16_t(FF.getValue())); break;
+      case 4: OW->Write32(uint32_t(FF.getValue())); break;
+      case 8: OW->Write64(uint64_t(FF.getValue())); break;
+      }
+    }
+    break;
+  }
+
+  case MCFragment::FT_LEB: {
+    const MCLEBFragment &LF = cast<MCLEBFragment>(F);
+    OW->WriteBytes(LF.getContents().str());
+    break;
+  }
+
+  case MCFragment::FT_Org: {
+    ++stats::EmittedOrgFragments;
+    const MCOrgFragment &OF = cast<MCOrgFragment>(F);
+
+    for (uint64_t i = 0, e = FragmentSize; i != e; ++i)
+      OW->Write8(uint8_t(OF.getValue()));
+
+    break;
+  }
+
+  case MCFragment::FT_Dwarf: {
+    const MCDwarfLineAddrFragment &OF = cast<MCDwarfLineAddrFragment>(F);
+    OW->WriteBytes(OF.getContents().str());
+    break;
+  }
+  case MCFragment::FT_DwarfFrame: {
+    const MCDwarfCallFrameFragment &CF = cast<MCDwarfCallFrameFragment>(F);
+    OW->WriteBytes(CF.getContents().str());
+    break;
+  }
+  }
+
+  assert(OW->getStream().tell() - Start == FragmentSize &&
+         "The stream should advance by fragment size");
+}
+
+void MCAssembler::writeSectionData(const MCSectionData *SD,
+                                   const MCAsmLayout &Layout) const {
+  // Ignore virtual sections.
+  if (SD->getSection().isVirtualSection()) {
+    assert(Layout.getSectionFileSize(SD) == 0 && "Invalid size for section!");
+
+    // Check that contents are only things legal inside a virtual section.
+    for (MCSectionData::const_iterator it = SD->begin(),
+           ie = SD->end(); it != ie; ++it) {
+      switch (it->getKind()) {
+      default: llvm_unreachable("Invalid fragment in virtual section!");
+      case MCFragment::FT_Data: {
+        // Check that we aren't trying to write a non-zero contents (or fixups)
+        // into a virtual section. This is to support clients which use standard
+        // directives to fill the contents of virtual sections.
+        const MCDataFragment &DF = cast<MCDataFragment>(*it);
+        assert(DF.fixup_begin() == DF.fixup_end() &&
+               "Cannot have fixups in virtual section!");
+        for (unsigned i = 0, e = DF.getContents().size(); i != e; ++i)
+          assert(DF.getContents()[i] == 0 &&
+                 "Invalid data value for virtual section!");
+        break;
+      }
+      case MCFragment::FT_Align:
+        // Check that we aren't trying to write a non-zero value into a virtual
+        // section.
+        assert((!cast<MCAlignFragment>(it)->getValueSize() ||
+                !cast<MCAlignFragment>(it)->getValue()) &&
+               "Invalid align in virtual section!");
+        break;
+      case MCFragment::FT_Fill:
+        assert(!cast<MCFillFragment>(it)->getValueSize() &&
+               "Invalid fill in virtual section!");
+        break;
+      }
+    }
+
+    return;
+  }
+
+  uint64_t Start = getWriter().getStream().tell();
+  (void)Start;
+
+  for (MCSectionData::const_iterator it = SD->begin(), ie = SD->end();
+       it != ie; ++it)
+    writeFragment(*this, Layout, *it);
+
+  assert(getWriter().getStream().tell() - Start ==
+         Layout.getSectionAddressSize(SD));
+}
+
+
+uint64_t MCAssembler::handleFixup(const MCAsmLayout &Layout,
+                                  MCFragment &F,
+                                  const MCFixup &Fixup) {
+   // Evaluate the fixup.
+   MCValue Target;
+   uint64_t FixedValue;
+   if (!evaluateFixup(Layout, Fixup, &F, Target, FixedValue)) {
+     // The fixup was unresolved, we need a relocation. Inform the object
+     // writer of the relocation, and give it an opportunity to adjust the
+     // fixup value if need be.
+     getWriter().RecordRelocation(*this, Layout, &F, Fixup, Target, FixedValue);
+   }
+   return FixedValue;
+ }
+
+void MCAssembler::Finish() {
+  DEBUG_WITH_TYPE("mc-dump", {
+      llvm::errs() << "assembler backend - pre-layout\n--\n";
+      dump(); });
+
+  // Create the layout object.
+  MCAsmLayout Layout(*this);
+
+  // Create dummy fragments and assign section ordinals.
+  unsigned SectionIndex = 0;
+  for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) {
+    // Create dummy fragments to eliminate any empty sections, this simplifies
+    // layout.
+    if (it->getFragmentList().empty())
+      new MCDataFragment(it);
+
+    it->setOrdinal(SectionIndex++);
+  }
+
+  // Assign layout order indices to sections and fragments.
+  for (unsigned i = 0, e = Layout.getSectionOrder().size(); i != e; ++i) {
+    MCSectionData *SD = Layout.getSectionOrder()[i];
+    SD->setLayoutOrder(i);
+
+    unsigned FragmentIndex = 0;
+    for (MCSectionData::iterator iFrag = SD->begin(), iFragEnd = SD->end();
+         iFrag != iFragEnd; ++iFrag)
+      iFrag->setLayoutOrder(FragmentIndex++);
+  }
+
+  // Layout until everything fits.
+  while (layoutOnce(Layout))
+    continue;
+
+  DEBUG_WITH_TYPE("mc-dump", {
+      llvm::errs() << "assembler backend - post-relaxation\n--\n";
+      dump(); });
+
+  // Finalize the layout, including fragment lowering.
+  finishLayout(Layout);
+
+  DEBUG_WITH_TYPE("mc-dump", {
+      llvm::errs() << "assembler backend - final-layout\n--\n";
+      dump(); });
+
+  uint64_t StartOffset = OS.tell();
+
+  // Allow the object writer a chance to perform post-layout binding (for
+  // example, to set the index fields in the symbol data).
+  getWriter().ExecutePostLayoutBinding(*this, Layout);
+
+  // Evaluate and apply the fixups, generating relocation entries as necessary.
+  for (MCAssembler::iterator it = begin(), ie = end(); it != ie; ++it) {
+    for (MCSectionData::iterator it2 = it->begin(),
+           ie2 = it->end(); it2 != ie2; ++it2) {
+      MCEncodedFragmentWithFixups *F =
+        dyn_cast<MCEncodedFragmentWithFixups>(it2);
+      if (F) {
+        for (MCEncodedFragmentWithFixups::fixup_iterator it3 = F->fixup_begin(),
+             ie3 = F->fixup_end(); it3 != ie3; ++it3) {
+          MCFixup &Fixup = *it3;
+          uint64_t FixedValue = handleFixup(Layout, *F, Fixup);
+          getBackend().applyFixup(Fixup, F->getContents().data(),
+                                  F->getContents().size(), FixedValue);
+        }
+      }
+    }
+  }
+
+  // Write the object file.
+  getWriter().WriteObject(*this, Layout);
+
+  stats::ObjectBytes += OS.tell() - StartOffset;
+}
+
+bool MCAssembler::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                       const MCRelaxableFragment *DF,
+                                       const MCAsmLayout &Layout) const {
+  // If we cannot resolve the fixup value, it requires relaxation.
+  MCValue Target;
+  uint64_t Value;
+  if (!evaluateFixup(Layout, Fixup, DF, Target, Value))
+    return true;
+
+  return getBackend().fixupNeedsRelaxation(Fixup, Value, DF, Layout);
+}
+
+bool MCAssembler::fragmentNeedsRelaxation(const MCRelaxableFragment *F,
+                                          const MCAsmLayout &Layout) const {
+  // If this inst doesn't ever need relaxation, ignore it. This occurs when we
+  // are intentionally pushing out inst fragments, or because we relaxed a
+  // previous instruction to one that doesn't need relaxation.
+  if (!getBackend().mayNeedRelaxation(F->getInst()))
+    return false;
+
+  for (MCRelaxableFragment::const_fixup_iterator it = F->fixup_begin(),
+       ie = F->fixup_end(); it != ie; ++it)
+    if (fixupNeedsRelaxation(*it, F, Layout))
+      return true;
+
+  return false;
+}
+
+bool MCAssembler::relaxInstruction(MCAsmLayout &Layout,
+                                   MCRelaxableFragment &F) {
+  if (!fragmentNeedsRelaxation(&F, Layout))
+    return false;
+
+  ++stats::RelaxedInstructions;
+
+  // FIXME-PERF: We could immediately lower out instructions if we can tell
+  // they are fully resolved, to avoid retesting on later passes.
+
+  // Relax the fragment.
+
+  MCInst Relaxed;
+  getBackend().relaxInstruction(F.getInst(), Relaxed);
+
+  // Encode the new instruction.
+  //
+  // FIXME-PERF: If it matters, we could let the target do this. It can
+  // probably do so more efficiently in many cases.
+  SmallVector<MCFixup, 4> Fixups;
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code);
+  getEmitter().EncodeInstruction(Relaxed, VecOS, Fixups);
+  VecOS.flush();
+
+  // Update the fragment.
+  F.setInst(Relaxed);
+  F.getContents() = Code;
+  F.getFixups() = Fixups;
+
+  return true;
+}
+
+bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) {
+  int64_t Value = 0;
+  uint64_t OldSize = LF.getContents().size();
+  bool IsAbs = LF.getValue().EvaluateAsAbsolute(Value, Layout);
+  (void)IsAbs;
+  assert(IsAbs);
+  SmallString<8> &Data = LF.getContents();
+  Data.clear();
+  raw_svector_ostream OSE(Data);
+  if (LF.isSigned())
+    encodeSLEB128(Value, OSE);
+  else
+    encodeULEB128(Value, OSE);
+  OSE.flush();
+  return OldSize != LF.getContents().size();
+}
+
+bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout,
+                                     MCDwarfLineAddrFragment &DF) {
+  int64_t AddrDelta = 0;
+  uint64_t OldSize = DF.getContents().size();
+  bool IsAbs = DF.getAddrDelta().EvaluateAsAbsolute(AddrDelta, Layout);
+  (void)IsAbs;
+  assert(IsAbs);
+  int64_t LineDelta;
+  LineDelta = DF.getLineDelta();
+  SmallString<8> &Data = DF.getContents();
+  Data.clear();
+  raw_svector_ostream OSE(Data);
+  MCDwarfLineAddr::Encode(LineDelta, AddrDelta, OSE);
+  OSE.flush();
+  return OldSize != Data.size();
+}
+
+bool MCAssembler::relaxDwarfCallFrameFragment(MCAsmLayout &Layout,
+                                              MCDwarfCallFrameFragment &DF) {
+  int64_t AddrDelta = 0;
+  uint64_t OldSize = DF.getContents().size();
+  bool IsAbs = DF.getAddrDelta().EvaluateAsAbsolute(AddrDelta, Layout);
+  (void)IsAbs;
+  assert(IsAbs);
+  SmallString<8> &Data = DF.getContents();
+  Data.clear();
+  raw_svector_ostream OSE(Data);
+  MCDwarfFrameEmitter::EncodeAdvanceLoc(AddrDelta, OSE);
+  OSE.flush();
+  return OldSize != Data.size();
+}
+
+bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSectionData &SD) {
+  // Holds the first fragment which needed relaxing during this layout. It will
+  // remain NULL if none were relaxed.
+  // When a fragment is relaxed, all the fragments following it should get
+  // invalidated because their offset is going to change.
+  MCFragment *FirstRelaxedFragment = NULL;
+
+  // Attempt to relax all the fragments in the section.
+  for (MCSectionData::iterator I = SD.begin(), IE = SD.end(); I != IE; ++I) {
+    // Check if this is a fragment that needs relaxation.
+    bool RelaxedFrag = false;
+    switch(I->getKind()) {
+    default:
+      break;
+    case MCFragment::FT_Relaxable:
+      assert(!getRelaxAll() &&
+             "Did not expect a MCRelaxableFragment in RelaxAll mode");
+      RelaxedFrag = relaxInstruction(Layout, *cast<MCRelaxableFragment>(I));
+      break;
+    case MCFragment::FT_Dwarf:
+      RelaxedFrag = relaxDwarfLineAddr(Layout,
+                                       *cast<MCDwarfLineAddrFragment>(I));
+      break;
+    case MCFragment::FT_DwarfFrame:
+      RelaxedFrag =
+        relaxDwarfCallFrameFragment(Layout,
+                                    *cast<MCDwarfCallFrameFragment>(I));
+      break;
+    case MCFragment::FT_LEB:
+      RelaxedFrag = relaxLEB(Layout, *cast<MCLEBFragment>(I));
+      break;
+    }
+    if (RelaxedFrag && !FirstRelaxedFragment)
+      FirstRelaxedFragment = I;
+  }
+  if (FirstRelaxedFragment) {
+    Layout.invalidateFragmentsFrom(FirstRelaxedFragment);
+    return true;
+  }
+  return false;
+}
+
+bool MCAssembler::layoutOnce(MCAsmLayout &Layout) {
+  ++stats::RelaxationSteps;
+
+  bool WasRelaxed = false;
+  for (iterator it = begin(), ie = end(); it != ie; ++it) {
+    MCSectionData &SD = *it;
+    while (layoutSectionOnce(Layout, SD))
+      WasRelaxed = true;
+  }
+
+  return WasRelaxed;
+}
+
+void MCAssembler::finishLayout(MCAsmLayout &Layout) {
+  // The layout is done. Mark every fragment as valid.
+  for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
+    Layout.getFragmentOffset(&*Layout.getSectionOrder()[i]->rbegin());
+  }
+}
+
+// Debugging methods
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
+  OS << "<MCFixup" << " Offset:" << AF.getOffset()
+     << " Value:" << *AF.getValue()
+     << " Kind:" << AF.getKind() << ">";
+  return OS;
+}
+
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCFragment::dump() {
+  raw_ostream &OS = llvm::errs();
+
+  OS << "<";
+  switch (getKind()) {
+  case MCFragment::FT_Align: OS << "MCAlignFragment"; break;
+  case MCFragment::FT_Data:  OS << "MCDataFragment"; break;
+  case MCFragment::FT_CompactEncodedInst:
+    OS << "MCCompactEncodedInstFragment"; break;
+  case MCFragment::FT_Fill:  OS << "MCFillFragment"; break;
+  case MCFragment::FT_Relaxable:  OS << "MCRelaxableFragment"; break;
+  case MCFragment::FT_Org:   OS << "MCOrgFragment"; break;
+  case MCFragment::FT_Dwarf: OS << "MCDwarfFragment"; break;
+  case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break;
+  case MCFragment::FT_LEB:   OS << "MCLEBFragment"; break;
+  }
+
+  OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder
+     << " Offset:" << Offset
+     << " HasInstructions:" << hasInstructions() 
+     << " BundlePadding:" << static_cast<unsigned>(getBundlePadding()) << ">";
+
+  switch (getKind()) {
+  case MCFragment::FT_Align: {
+    const MCAlignFragment *AF = cast<MCAlignFragment>(this);
+    if (AF->hasEmitNops())
+      OS << " (emit nops)";
+    OS << "\n       ";
+    OS << " Alignment:" << AF->getAlignment()
+       << " Value:" << AF->getValue() << " ValueSize:" << AF->getValueSize()
+       << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">";
+    break;
+  }
+  case MCFragment::FT_Data:  {
+    const MCDataFragment *DF = cast<MCDataFragment>(this);
+    OS << "\n       ";
+    OS << " Contents:[";
+    const SmallVectorImpl<char> &Contents = DF->getContents();
+    for (unsigned i = 0, e = Contents.size(); i != e; ++i) {
+      if (i) OS << ",";
+      OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF);
+    }
+    OS << "] (" << Contents.size() << " bytes)";
+
+    if (DF->fixup_begin() != DF->fixup_end()) {
+      OS << ",\n       ";
+      OS << " Fixups:[";
+      for (MCDataFragment::const_fixup_iterator it = DF->fixup_begin(),
+             ie = DF->fixup_end(); it != ie; ++it) {
+        if (it != DF->fixup_begin()) OS << ",\n                ";
+        OS << *it;
+      }
+      OS << "]";
+    }
+    break;
+  }
+  case MCFragment::FT_CompactEncodedInst: {
+    const MCCompactEncodedInstFragment *CEIF =
+      cast<MCCompactEncodedInstFragment>(this);
+    OS << "\n       ";
+    OS << " Contents:[";
+    const SmallVectorImpl<char> &Contents = CEIF->getContents();
+    for (unsigned i = 0, e = Contents.size(); i != e; ++i) {
+      if (i) OS << ",";
+      OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF);
+    }
+    OS << "] (" << Contents.size() << " bytes)";
+    break;
+  }
+  case MCFragment::FT_Fill:  {
+    const MCFillFragment *FF = cast<MCFillFragment>(this);
+    OS << " Value:" << FF->getValue() << " ValueSize:" << FF->getValueSize()
+       << " Size:" << FF->getSize();
+    break;
+  }
+  case MCFragment::FT_Relaxable:  {
+    const MCRelaxableFragment *F = cast<MCRelaxableFragment>(this);
+    OS << "\n       ";
+    OS << " Inst:";
+    F->getInst().dump_pretty(OS);
+    break;
+  }
+  case MCFragment::FT_Org:  {
+    const MCOrgFragment *OF = cast<MCOrgFragment>(this);
+    OS << "\n       ";
+    OS << " Offset:" << OF->getOffset() << " Value:" << OF->getValue();
+    break;
+  }
+  case MCFragment::FT_Dwarf:  {
+    const MCDwarfLineAddrFragment *OF = cast<MCDwarfLineAddrFragment>(this);
+    OS << "\n       ";
+    OS << " AddrDelta:" << OF->getAddrDelta()
+       << " LineDelta:" << OF->getLineDelta();
+    break;
+  }
+  case MCFragment::FT_DwarfFrame:  {
+    const MCDwarfCallFrameFragment *CF = cast<MCDwarfCallFrameFragment>(this);
+    OS << "\n       ";
+    OS << " AddrDelta:" << CF->getAddrDelta();
+    break;
+  }
+  case MCFragment::FT_LEB: {
+    const MCLEBFragment *LF = cast<MCLEBFragment>(this);
+    OS << "\n       ";
+    OS << " Value:" << LF->getValue() << " Signed:" << LF->isSigned();
+    break;
+  }
+  }
+  OS << ">";
+}
+
+void MCSectionData::dump() {
+  raw_ostream &OS = llvm::errs();
+
+  OS << "<MCSectionData";
+  OS << " Alignment:" << getAlignment()
+     << " Fragments:[\n      ";
+  for (iterator it = begin(), ie = end(); it != ie; ++it) {
+    if (it != begin()) OS << ",\n      ";
+    it->dump();
+  }
+  OS << "]>";
+}
+
+void MCSymbolData::dump() {
+  raw_ostream &OS = llvm::errs();
+
+  OS << "<MCSymbolData Symbol:" << getSymbol()
+     << " Fragment:" << getFragment() << " Offset:" << getOffset()
+     << " Flags:" << getFlags() << " Index:" << getIndex();
+  if (isCommon())
+    OS << " (common, size:" << getCommonSize()
+       << " align: " << getCommonAlignment() << ")";
+  if (isExternal())
+    OS << " (external)";
+  if (isPrivateExtern())
+    OS << " (private extern)";
+  OS << ">";
+}
+
+void MCAssembler::dump() {
+  raw_ostream &OS = llvm::errs();
+
+  OS << "<MCAssembler\n";
+  OS << "  Sections:[\n    ";
+  for (iterator it = begin(), ie = end(); it != ie; ++it) {
+    if (it != begin()) OS << ",\n    ";
+    it->dump();
+  }
+  OS << "],\n";
+  OS << "  Symbols:[";
+
+  for (symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) {
+    if (it != symbol_begin()) OS << ",\n           ";
+    it->dump();
+  }
+  OS << "]>\n";
+}
+#endif
+
+// anchors for MC*Fragment vtables
+void MCEncodedFragment::anchor() { }
+void MCEncodedFragmentWithFixups::anchor() { }
+void MCDataFragment::anchor() { }
+void MCCompactEncodedInstFragment::anchor() { }
+void MCRelaxableFragment::anchor() { }
+void MCAlignFragment::anchor() { }
+void MCFillFragment::anchor() { }
+void MCOrgFragment::anchor() { }
+void MCLEBFragment::anchor() { }
+void MCDwarfLineAddrFragment::anchor() { }
+void MCDwarfCallFrameFragment::anchor() { }
diff --git a/contrib/llvm/lib/MC/MCAtom.cpp b/contrib/llvm/lib/MC/MCAtom.cpp
new file mode 100644
index 000000000000..d71444324f3d
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCAtom.cpp
@@ -0,0 +1,97 @@
+//===- lib/MC/MCAtom.cpp - MCAtom implementation --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCModule.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+void MCAtom::addInst(const MCInst &I, uint64_t Address, unsigned Size) {
+  assert(Type == TextAtom && "Trying to add MCInst to a non-text atom!");
+
+  assert(Address < End+Size &&
+         "Instruction not contiguous with end of atom!");
+  if (Address > End)
+    Parent->remap(this, Begin, End+Size);
+
+  Text.push_back(std::make_pair(Address, I));
+}
+
+void MCAtom::addData(const MCData &D) {
+  assert(Type == DataAtom && "Trying to add MCData to a non-data atom!");
+  Parent->remap(this, Begin, End+1);
+
+  Data.push_back(D);
+}
+
+MCAtom *MCAtom::split(uint64_t SplitPt) {
+  assert((SplitPt > Begin && SplitPt <= End) &&
+         "Splitting at point not contained in atom!");
+
+  // Compute the new begin/end points.
+  uint64_t LeftBegin = Begin;
+  uint64_t LeftEnd = SplitPt - 1;
+  uint64_t RightBegin = SplitPt;
+  uint64_t RightEnd = End;
+
+  // Remap this atom to become the lower of the two new ones.
+  Parent->remap(this, LeftBegin, LeftEnd);
+
+  // Create a new atom for the higher atom.
+  MCAtom *RightAtom = Parent->createAtom(Type, RightBegin, RightEnd);
+
+  // Split the contents of the original atom between it and the new one.  The
+  // precise method depends on whether this is a data or a text atom.
+  if (isDataAtom()) {
+    std::vector<MCData>::iterator I = Data.begin() + (RightBegin - LeftBegin);
+
+    assert(I != Data.end() && "Split point not found in range!");
+
+    std::copy(I, Data.end(), RightAtom->Data.end());
+    Data.erase(I, Data.end());
+  } else if (isTextAtom()) {
+    std::vector<std::pair<uint64_t, MCInst> >::iterator I = Text.begin();
+
+    while (I != Text.end() && I->first < SplitPt) ++I;
+
+    assert(I != Text.end() && "Split point not found in disassembly!");
+    assert(I->first == SplitPt &&
+           "Split point does not fall on instruction boundary!");
+
+    std::copy(I, Text.end(), RightAtom->Text.end());
+    Text.erase(I, Text.end());
+  } else
+    llvm_unreachable("Unknown atom type!");
+
+  return RightAtom;
+}
+
+void MCAtom::truncate(uint64_t TruncPt) {
+  assert((TruncPt >= Begin && TruncPt < End) &&
+         "Truncation point not contained in atom!");
+
+  Parent->remap(this, Begin, TruncPt);
+
+  if (isDataAtom()) {
+    Data.resize(TruncPt - Begin + 1);
+  } else if (isTextAtom()) {
+    std::vector<std::pair<uint64_t, MCInst> >::iterator I = Text.begin();
+
+    while (I != Text.end() && I->first <= TruncPt) ++I;
+
+    assert(I != Text.end() && "Truncation point not found in disassembly!");
+    assert(I->first == TruncPt+1 &&
+           "Truncation point does not fall on instruction boundary");
+
+    Text.erase(I, Text.end());
+  } else
+    llvm_unreachable("Unknown atom type!");
+}
+
diff --git a/contrib/llvm/lib/MC/MCCodeEmitter.cpp b/contrib/llvm/lib/MC/MCCodeEmitter.cpp
new file mode 100644
index 000000000000..c122763b2fe5
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCCodeEmitter.cpp
@@ -0,0 +1,18 @@
+//===-- MCCodeEmitter.cpp - Instruction Encoding --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCCodeEmitter.h"
+
+using namespace llvm;
+
+MCCodeEmitter::MCCodeEmitter() {
+}
+
+MCCodeEmitter::~MCCodeEmitter() {
+}
diff --git a/contrib/llvm/lib/MC/MCCodeGenInfo.cpp b/contrib/llvm/lib/MC/MCCodeGenInfo.cpp
new file mode 100644
index 000000000000..d9dcfd0614bc
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCCodeGenInfo.cpp
@@ -0,0 +1,23 @@
+//===-- MCCodeGenInfo.cpp - Target CodeGen Info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file tracks information about the target which can affect codegen,
+// asm parsing, and asm printing. For example, relocation model.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCCodeGenInfo.h"
+using namespace llvm;
+
+void MCCodeGenInfo::InitMCCodeGenInfo(Reloc::Model RM, CodeModel::Model CM,
+                                      CodeGenOpt::Level OL) {
+  RelocationModel = RM;
+  CMModel = CM;
+  OptLevel = OL;
+}
diff --git a/contrib/llvm/lib/MC/MCContext.cpp b/contrib/llvm/lib/MC/MCContext.cpp
new file mode 100644
index 000000000000..9adcc02b71a4
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCContext.cpp
@@ -0,0 +1,390 @@
+//===- lib/MC/MCContext.cpp - Machine Code Context ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCContext.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCLabel.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
+using namespace llvm;
+
+typedef StringMap<const MCSectionMachO*> MachOUniqueMapTy;
+typedef StringMap<const MCSectionELF*> ELFUniqueMapTy;
+typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy;
+
+
+MCContext::MCContext(const MCAsmInfo &mai, const MCRegisterInfo &mri,
+                     const MCObjectFileInfo *mofi, const SourceMgr *mgr,
+                     bool DoAutoReset) :
+  SrcMgr(mgr), MAI(mai), MRI(mri), MOFI(mofi),
+  Allocator(), Symbols(Allocator), UsedNames(Allocator),
+  NextUniqueID(0),
+  CompilationDir(llvm::sys::Path::GetCurrentDirectory().str()),
+  CurrentDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0), 
+  DwarfLocSeen(false), GenDwarfForAssembly(false), GenDwarfFileNumber(0),
+  AllowTemporaryLabels(true), DwarfCompileUnitID(0), AutoReset(DoAutoReset) {
+
+  MachOUniquingMap = 0;
+  ELFUniquingMap = 0;
+  COFFUniquingMap = 0;
+
+  SecureLogFile = getenv("AS_SECURE_LOG_FILE");
+  SecureLog = 0;
+  SecureLogUsed = false;
+
+  if (SrcMgr && SrcMgr->getNumBuffers() > 0)
+    MainFileName = SrcMgr->getMemoryBuffer(0)->getBufferIdentifier();
+  else
+    MainFileName = "";
+}
+
+MCContext::~MCContext() {
+
+  if (AutoReset)
+    reset();
+
+  // NOTE: The symbols are all allocated out of a bump pointer allocator,
+  // we don't need to free them here.
+  
+  // If the stream for the .secure_log_unique directive was created free it.
+  delete (raw_ostream*)SecureLog;
+}
+
+//===----------------------------------------------------------------------===//
+// Module Lifetime Management
+//===----------------------------------------------------------------------===//
+
+void MCContext::reset() {
+  UsedNames.clear();
+  Symbols.clear();
+  Allocator.Reset();
+  Instances.clear();
+  MCDwarfFilesCUMap.clear();
+  MCDwarfDirsCUMap.clear();
+  MCGenDwarfLabelEntries.clear();
+  DwarfDebugFlags = StringRef();
+  MCLineSections.clear();
+  MCLineSectionOrder.clear();
+  DwarfCompileUnitID = 0;
+  MCLineTableSymbols.clear();
+  CurrentDwarfLoc = MCDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0);
+
+  // If we have the MachO uniquing map, free it.
+  delete (MachOUniqueMapTy*)MachOUniquingMap;
+  delete (ELFUniqueMapTy*)ELFUniquingMap;
+  delete (COFFUniqueMapTy*)COFFUniquingMap;
+  MachOUniquingMap = 0;
+  ELFUniquingMap = 0;
+  COFFUniquingMap = 0;
+
+  NextUniqueID = 0;
+  AllowTemporaryLabels = true;
+  DwarfLocSeen = false;
+  GenDwarfForAssembly = false;
+  GenDwarfFileNumber = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Symbol Manipulation
+//===----------------------------------------------------------------------===//
+
+MCSymbol *MCContext::GetOrCreateSymbol(StringRef Name) {
+  assert(!Name.empty() && "Normal symbols cannot be unnamed!");
+
+  // Do the lookup and get the entire StringMapEntry.  We want access to the
+  // key if we are creating the entry.
+  StringMapEntry<MCSymbol*> &Entry = Symbols.GetOrCreateValue(Name);
+  MCSymbol *Sym = Entry.getValue();
+
+  if (Sym)
+    return Sym;
+
+  Sym = CreateSymbol(Name);
+  Entry.setValue(Sym);
+  return Sym;
+}
+
+MCSymbol *MCContext::CreateSymbol(StringRef Name) {
+  // Determine whether this is an assembler temporary or normal label, if used.
+  bool isTemporary = false;
+  if (AllowTemporaryLabels)
+    isTemporary = Name.startswith(MAI.getPrivateGlobalPrefix());
+
+  StringMapEntry<bool> *NameEntry = &UsedNames.GetOrCreateValue(Name);
+  if (NameEntry->getValue()) {
+    assert(isTemporary && "Cannot rename non temporary symbols");
+    SmallString<128> NewName = Name;
+    do {
+      NewName.resize(Name.size());
+      raw_svector_ostream(NewName) << NextUniqueID++;
+      NameEntry = &UsedNames.GetOrCreateValue(NewName);
+    } while (NameEntry->getValue());
+  }
+  NameEntry->setValue(true);
+
+  // Ok, the entry doesn't already exist.  Have the MCSymbol object itself refer
+  // to the copy of the string that is embedded in the UsedNames entry.
+  MCSymbol *Result = new (*this) MCSymbol(NameEntry->getKey(), isTemporary);
+
+  return Result;
+}
+
+MCSymbol *MCContext::GetOrCreateSymbol(const Twine &Name) {
+  SmallString<128> NameSV;
+  Name.toVector(NameSV);
+  return GetOrCreateSymbol(NameSV.str());
+}
+
+MCSymbol *MCContext::CreateTempSymbol() {
+  SmallString<128> NameSV;
+  raw_svector_ostream(NameSV)
+    << MAI.getPrivateGlobalPrefix() << "tmp" << NextUniqueID++;
+  return CreateSymbol(NameSV);
+}
+
+unsigned MCContext::NextInstance(int64_t LocalLabelVal) {
+  MCLabel *&Label = Instances[LocalLabelVal];
+  if (!Label)
+    Label = new (*this) MCLabel(0);
+  return Label->incInstance();
+}
+
+unsigned MCContext::GetInstance(int64_t LocalLabelVal) {
+  MCLabel *&Label = Instances[LocalLabelVal];
+  if (!Label)
+    Label = new (*this) MCLabel(0);
+  return Label->getInstance();
+}
+
+MCSymbol *MCContext::CreateDirectionalLocalSymbol(int64_t LocalLabelVal) {
+  return GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) +
+                           Twine(LocalLabelVal) +
+                           "\2" +
+                           Twine(NextInstance(LocalLabelVal)));
+}
+MCSymbol *MCContext::GetDirectionalLocalSymbol(int64_t LocalLabelVal,
+                                               int bORf) {
+  return GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) +
+                           Twine(LocalLabelVal) +
+                           "\2" +
+                           Twine(GetInstance(LocalLabelVal) + bORf));
+}
+
+MCSymbol *MCContext::LookupSymbol(StringRef Name) const {
+  return Symbols.lookup(Name);
+}
+
+MCSymbol *MCContext::LookupSymbol(const Twine &Name) const {
+  SmallString<128> NameSV;
+  Name.toVector(NameSV);
+  return LookupSymbol(NameSV.str());
+}
+
+//===----------------------------------------------------------------------===//
+// Section Management
+//===----------------------------------------------------------------------===//
+
+const MCSectionMachO *MCContext::
+getMachOSection(StringRef Segment, StringRef Section,
+                unsigned TypeAndAttributes,
+                unsigned Reserved2, SectionKind Kind) {
+
+  // We unique sections by their segment/section pair.  The returned section
+  // may not have the same flags as the requested section, if so this should be
+  // diagnosed by the client as an error.
+
+  // Create the map if it doesn't already exist.
+  if (MachOUniquingMap == 0)
+    MachOUniquingMap = new MachOUniqueMapTy();
+  MachOUniqueMapTy &Map = *(MachOUniqueMapTy*)MachOUniquingMap;
+
+  // Form the name to look up.
+  SmallString<64> Name;
+  Name += Segment;
+  Name.push_back(',');
+  Name += Section;
+
+  // Do the lookup, if we have a hit, return it.
+  const MCSectionMachO *&Entry = Map[Name.str()];
+  if (Entry) return Entry;
+
+  // Otherwise, return a new section.
+  return Entry = new (*this) MCSectionMachO(Segment, Section, TypeAndAttributes,
+                                            Reserved2, Kind);
+}
+
+const MCSectionELF *MCContext::
+getELFSection(StringRef Section, unsigned Type, unsigned Flags,
+              SectionKind Kind) {
+  return getELFSection(Section, Type, Flags, Kind, 0, "");
+}
+
+const MCSectionELF *MCContext::
+getELFSection(StringRef Section, unsigned Type, unsigned Flags,
+              SectionKind Kind, unsigned EntrySize, StringRef Group) {
+  if (ELFUniquingMap == 0)
+    ELFUniquingMap = new ELFUniqueMapTy();
+  ELFUniqueMapTy &Map = *(ELFUniqueMapTy*)ELFUniquingMap;
+
+  // Do the lookup, if we have a hit, return it.
+  StringMapEntry<const MCSectionELF*> &Entry = Map.GetOrCreateValue(Section);
+  if (Entry.getValue()) return Entry.getValue();
+
+  // Possibly refine the entry size first.
+  if (!EntrySize) {
+    EntrySize = MCSectionELF::DetermineEntrySize(Kind);
+  }
+
+  MCSymbol *GroupSym = NULL;
+  if (!Group.empty())
+    GroupSym = GetOrCreateSymbol(Group);
+
+  MCSectionELF *Result = new (*this) MCSectionELF(Entry.getKey(), Type, Flags,
+                                                  Kind, EntrySize, GroupSym);
+  Entry.setValue(Result);
+  return Result;
+}
+
+const MCSectionELF *MCContext::CreateELFGroupSection() {
+  MCSectionELF *Result =
+    new (*this) MCSectionELF(".group", ELF::SHT_GROUP, 0,
+                             SectionKind::getReadOnly(), 4, NULL);
+  return Result;
+}
+
+const MCSection *MCContext::getCOFFSection(StringRef Section,
+                                           unsigned Characteristics,
+                                           int Selection,
+                                           SectionKind Kind) {
+  if (COFFUniquingMap == 0)
+    COFFUniquingMap = new COFFUniqueMapTy();
+  COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)COFFUniquingMap;
+
+  // Do the lookup, if we have a hit, return it.
+  StringMapEntry<const MCSectionCOFF*> &Entry = Map.GetOrCreateValue(Section);
+  if (Entry.getValue()) return Entry.getValue();
+
+  MCSectionCOFF *Result = new (*this) MCSectionCOFF(Entry.getKey(),
+                                                    Characteristics,
+                                                    Selection, Kind);
+
+  Entry.setValue(Result);
+  return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// Dwarf Management
+//===----------------------------------------------------------------------===//
+
+/// GetDwarfFile - takes a file name an number to place in the dwarf file and
+/// directory tables.  If the file number has already been allocated it is an
+/// error and zero is returned and the client reports the error, else the
+/// allocated file number is returned.  The file numbers may be in any order.
+unsigned MCContext::GetDwarfFile(StringRef Directory, StringRef FileName,
+                                 unsigned FileNumber, unsigned CUID) {
+  // TODO: a FileNumber of zero says to use the next available file number.
+  // Note: in GenericAsmParser::ParseDirectiveFile() FileNumber was checked
+  // to not be less than one.  This needs to be change to be not less than zero.
+
+  SmallVectorImpl<MCDwarfFile *>& MCDwarfFiles = MCDwarfFilesCUMap[CUID];
+  SmallVectorImpl<StringRef>& MCDwarfDirs = MCDwarfDirsCUMap[CUID];
+  // Make space for this FileNumber in the MCDwarfFiles vector if needed.
+  if (FileNumber >= MCDwarfFiles.size()) {
+    MCDwarfFiles.resize(FileNumber + 1);
+  } else {
+    MCDwarfFile *&ExistingFile = MCDwarfFiles[FileNumber];
+    if (ExistingFile)
+      // It is an error to use see the same number more than once.
+      return 0;
+  }
+
+  // Get the new MCDwarfFile slot for this FileNumber.
+  MCDwarfFile *&File = MCDwarfFiles[FileNumber];
+
+  if (Directory.empty()) {
+    // Separate the directory part from the basename of the FileName.
+    StringRef tFileName = sys::path::filename(FileName);
+    if (!tFileName.empty()) {
+      Directory = sys::path::parent_path(FileName);
+      if (!Directory.empty())
+        FileName = tFileName;
+    }
+  }
+
+  // Find or make a entry in the MCDwarfDirs vector for this Directory.
+  // Capture directory name.
+  unsigned DirIndex;
+  if (Directory.empty()) {
+    // For FileNames with no directories a DirIndex of 0 is used.
+    DirIndex = 0;
+  } else {
+    DirIndex = 0;
+    for (unsigned End = MCDwarfDirs.size(); DirIndex < End; DirIndex++) {
+      if (Directory == MCDwarfDirs[DirIndex])
+        break;
+    }
+    if (DirIndex >= MCDwarfDirs.size()) {
+      char *Buf = static_cast<char *>(Allocate(Directory.size()));
+      memcpy(Buf, Directory.data(), Directory.size());
+      MCDwarfDirs.push_back(StringRef(Buf, Directory.size()));
+    }
+    // The DirIndex is one based, as DirIndex of 0 is used for FileNames with
+    // no directories.  MCDwarfDirs[] is unlike MCDwarfFiles[] in that the
+    // directory names are stored at MCDwarfDirs[DirIndex-1] where FileNames
+    // are stored at MCDwarfFiles[FileNumber].Name .
+    DirIndex++;
+  }
+
+  // Now make the MCDwarfFile entry and place it in the slot in the MCDwarfFiles
+  // vector.
+  char *Buf = static_cast<char *>(Allocate(FileName.size()));
+  memcpy(Buf, FileName.data(), FileName.size());
+  File = new (*this) MCDwarfFile(StringRef(Buf, FileName.size()), DirIndex);
+
+  // return the allocated FileNumber.
+  return FileNumber;
+}
+
+/// isValidDwarfFileNumber - takes a dwarf file number and returns true if it
+/// currently is assigned and false otherwise.
+bool MCContext::isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID) {
+  SmallVectorImpl<MCDwarfFile *>& MCDwarfFiles = MCDwarfFilesCUMap[CUID];
+  if(FileNumber == 0 || FileNumber >= MCDwarfFiles.size())
+    return false;
+
+  return MCDwarfFiles[FileNumber] != 0;
+}
+
+void MCContext::FatalError(SMLoc Loc, const Twine &Msg) {
+  // If we have a source manager and a location, use it. Otherwise just
+  // use the generic report_fatal_error().
+  if (!SrcMgr || Loc == SMLoc())
+    report_fatal_error(Msg);
+
+  // Use the source manager to print the message.
+  SrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
+
+  // If we reached here, we are failing ungracefully. Run the interrupt handlers
+  // to make sure any special cleanups get done, in particular that we remove
+  // files registered with RemoveFileOnSignal.
+  sys::RunInterruptHandlers();
+  exit(1);
+}
diff --git a/contrib/llvm/lib/MC/MCDisassembler.cpp b/contrib/llvm/lib/MC/MCDisassembler.cpp
new file mode 100644
index 000000000000..08096906462f
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCDisassembler.cpp
@@ -0,0 +1,14 @@
+//===-- lib/MC/MCDisassembler.cpp - Disassembler interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCDisassembler.h"
+using namespace llvm;
+
+MCDisassembler::~MCDisassembler() {
+}
diff --git a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp
new file mode 100644
index 000000000000..4766b3747635
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -0,0 +1,223 @@
+//===-- lib/MC/Disassembler.cpp - Disassembler Public C Interface ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Disassembler.h"
+#include "llvm-c/Disassembler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+namespace llvm {
+class Target;
+} // namespace llvm
+using namespace llvm;
+
+// LLVMCreateDisasm() creates a disassembler for the TripleName.  Symbolic
+// disassembly is supported by passing a block of information in the DisInfo
+// parameter and specifying the TagType and callback functions as described in
+// the header llvm-c/Disassembler.h .  The pointer to the block and the 
+// functions can all be passed as NULL.  If successful, this returns a
+// disassembler context.  If not, it returns NULL.
+//
+LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
+                                         void *DisInfo, int TagType,
+                                         LLVMOpInfoCallback GetOpInfo,
+                                         LLVMSymbolLookupCallback SymbolLookUp){
+  // Get the target.
+  std::string Error;
+  const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
+  assert(TheTarget && "Unable to create target!");
+
+  // Get the assembler info needed to setup the MCContext.
+  const MCAsmInfo *MAI = TheTarget->createMCAsmInfo(Triple);
+  if (!MAI)
+    return 0;
+
+  const MCInstrInfo *MII = TheTarget->createMCInstrInfo();
+  if (!MII)
+    return 0;
+
+  const MCRegisterInfo *MRI = TheTarget->createMCRegInfo(Triple);
+  if (!MRI)
+    return 0;
+
+  // Package up features to be passed to target/subtarget
+  std::string FeaturesStr;
+
+  const MCSubtargetInfo *STI = TheTarget->createMCSubtargetInfo(Triple, CPU,
+                                                                FeaturesStr);
+  if (!STI)
+    return 0;
+
+  // Set up the MCContext for creating symbols and MCExpr's.
+  MCContext *Ctx = new MCContext(*MAI, *MRI, 0);
+  if (!Ctx)
+    return 0;
+
+  // Set up disassembler.
+  MCDisassembler *DisAsm = TheTarget->createMCDisassembler(*STI);
+  if (!DisAsm)
+    return 0;
+  DisAsm->setupForSymbolicDisassembly(GetOpInfo, SymbolLookUp, DisInfo, Ctx);
+
+  // Set up the instruction printer.
+  int AsmPrinterVariant = MAI->getAssemblerDialect();
+  MCInstPrinter *IP = TheTarget->createMCInstPrinter(AsmPrinterVariant,
+                                                     *MAI, *MII, *MRI, *STI);
+  if (!IP)
+    return 0;
+
+  LLVMDisasmContext *DC = new LLVMDisasmContext(Triple, DisInfo, TagType,
+                                                GetOpInfo, SymbolLookUp,
+                                                TheTarget, MAI, MRI,
+                                                STI, MII, Ctx, DisAsm, IP);
+  if (!DC)
+    return 0;
+
+  return DC;
+}
+
+LLVMDisasmContextRef LLVMCreateDisasm(const char *Triple, void *DisInfo,
+                                      int TagType, LLVMOpInfoCallback GetOpInfo,
+                                      LLVMSymbolLookupCallback SymbolLookUp) {
+  return LLVMCreateDisasmCPU(Triple, "", DisInfo, TagType, GetOpInfo,
+                             SymbolLookUp);
+}
+
+//
+// LLVMDisasmDispose() disposes of the disassembler specified by the context.
+//
+void LLVMDisasmDispose(LLVMDisasmContextRef DCR){
+  LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+  delete DC;
+}
+
+namespace {
+//
+// The memory object created by LLVMDisasmInstruction().
+//
+class DisasmMemoryObject : public MemoryObject {
+  uint8_t *Bytes;
+  uint64_t Size;
+  uint64_t BasePC;
+public:
+  DisasmMemoryObject(uint8_t *bytes, uint64_t size, uint64_t basePC) :
+                     Bytes(bytes), Size(size), BasePC(basePC) {}
+ 
+  uint64_t getBase() const { return BasePC; }
+  uint64_t getExtent() const { return Size; }
+
+  int readByte(uint64_t Addr, uint8_t *Byte) const {
+    if (Addr - BasePC >= Size)
+      return -1;
+    *Byte = Bytes[Addr - BasePC];
+    return 0;
+  }
+};
+} // end anonymous namespace
+
+//
+// LLVMDisasmInstruction() disassembles a single instruction using the
+// disassembler context specified in the parameter DC.  The bytes of the
+// instruction are specified in the parameter Bytes, and contains at least
+// BytesSize number of bytes.  The instruction is at the address specified by
+// the PC parameter.  If a valid instruction can be disassembled its string is
+// returned indirectly in OutString which whos size is specified in the
+// parameter OutStringSize.  This function returns the number of bytes in the
+// instruction or zero if there was no valid instruction.  If this function
+// returns zero the caller will have to pick how many bytes they want to step
+// over by printing a .byte, .long etc. to continue.
+//
+size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
+                             uint64_t BytesSize, uint64_t PC, char *OutString,
+                             size_t OutStringSize){
+  LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+  // Wrap the pointer to the Bytes, BytesSize and PC in a MemoryObject.
+  DisasmMemoryObject MemoryObject(Bytes, BytesSize, PC);
+
+  uint64_t Size;
+  MCInst Inst;
+  const MCDisassembler *DisAsm = DC->getDisAsm();
+  MCInstPrinter *IP = DC->getIP();
+  MCDisassembler::DecodeStatus S;
+  S = DisAsm->getInstruction(Inst, Size, MemoryObject, PC,
+                             /*REMOVE*/ nulls(), DC->CommentStream);
+  switch (S) {
+  case MCDisassembler::Fail:
+  case MCDisassembler::SoftFail:
+    // FIXME: Do something different for soft failure modes?
+    return 0;
+
+  case MCDisassembler::Success: {
+    DC->CommentStream.flush();
+    StringRef Comments = DC->CommentsToEmit.str();
+
+    SmallVector<char, 64> InsnStr;
+    raw_svector_ostream OS(InsnStr);
+    IP->printInst(&Inst, OS, Comments);
+    OS.flush();
+
+    // Tell the comment stream that the vector changed underneath it.
+    DC->CommentsToEmit.clear();
+    DC->CommentStream.resync();
+
+    assert(OutStringSize != 0 && "Output buffer cannot be zero size");
+    size_t OutputSize = std::min(OutStringSize-1, InsnStr.size());
+    std::memcpy(OutString, InsnStr.data(), OutputSize);
+    OutString[OutputSize] = '\0'; // Terminate string.
+
+    return Size;
+  }
+  }
+  llvm_unreachable("Invalid DecodeStatus!");
+}
+
+//
+// LLVMSetDisasmOptions() sets the disassembler's options.  It returns 1 if it
+// can set all the Options and 0 otherwise.
+//
+int LLVMSetDisasmOptions(LLVMDisasmContextRef DCR, uint64_t Options){
+  if (Options & LLVMDisassembler_Option_UseMarkup){
+      LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+      MCInstPrinter *IP = DC->getIP();
+      IP->setUseMarkup(1);
+      Options &= ~LLVMDisassembler_Option_UseMarkup;
+  }
+  if (Options & LLVMDisassembler_Option_PrintImmHex){
+      LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+      MCInstPrinter *IP = DC->getIP();
+      IP->setPrintImmHex(1);
+      Options &= ~LLVMDisassembler_Option_PrintImmHex;
+  }
+  if (Options & LLVMDisassembler_Option_AsmPrinterVariant){
+      LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+      // Try to set up the new instruction printer.
+      const MCAsmInfo *MAI = DC->getAsmInfo();
+      const MCInstrInfo *MII = DC->getInstrInfo();
+      const MCRegisterInfo *MRI = DC->getRegisterInfo();
+      const MCSubtargetInfo *STI = DC->getSubtargetInfo();
+      int AsmPrinterVariant = MAI->getAssemblerDialect();
+      AsmPrinterVariant = AsmPrinterVariant == 0 ? 1 : 0;
+      MCInstPrinter *IP = DC->getTarget()->createMCInstPrinter(
+          AsmPrinterVariant, *MAI, *MII, *MRI, *STI);
+      if (IP) {
+        DC->setIP(IP);
+        Options &= ~LLVMDisassembler_Option_AsmPrinterVariant;
+      }
+  }
+  return (Options == 0);
+}
diff --git a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.h b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.h
new file mode 100644
index 000000000000..6eb59d0c57be
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.h
@@ -0,0 +1,121 @@
+//===------------- Disassembler.h - LLVM Disassembler -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface for the Disassembly library's disassembler 
+// context.  The disassembler is responsible for producing strings for
+// individual instructions according to a given architecture and disassembly
+// syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_DISASSEMBLER_H
+#define LLVM_MC_DISASSEMBLER_H
+
+#include "llvm-c/Disassembler.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+namespace llvm {
+class MCContext;
+class MCAsmInfo;
+class MCDisassembler;
+class MCInstPrinter; 
+class MCInstrInfo;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+
+//
+// This is the disassembler context returned by LLVMCreateDisasm().
+//
+class LLVMDisasmContext {
+private:
+  //
+  // The passed parameters when the disassembler context is created.
+  //
+  // The TripleName for this disassembler.
+  std::string TripleName;
+  // The pointer to the caller's block of symbolic information.
+  void *DisInfo;
+  // The Triple specific symbolic information type returned by GetOpInfo.
+  int TagType;
+  // The function to get the symbolic information for operands.
+  LLVMOpInfoCallback GetOpInfo;
+  // The function to look up a symbol name.
+  LLVMSymbolLookupCallback SymbolLookUp;
+  //
+  // The objects created and saved by LLVMCreateDisasm() then used by
+  // LLVMDisasmInstruction().
+  //
+  // The LLVM target corresponding to the disassembler.
+  // FIXME: using llvm::OwningPtr<const llvm::Target> causes a malloc error
+  //        when this LLVMDisasmContext is deleted.
+  const Target *TheTarget;
+  // The assembly information for the target architecture.
+  llvm::OwningPtr<const llvm::MCAsmInfo> MAI;
+  // The register information for the target architecture.
+  llvm::OwningPtr<const llvm::MCRegisterInfo> MRI;
+  // The subtarget information for the target architecture.
+  llvm::OwningPtr<const llvm::MCSubtargetInfo> MSI;
+  // The instruction information for the target architecture.
+  llvm::OwningPtr<const llvm::MCInstrInfo> MII;
+  // The assembly context for creating symbols and MCExprs.
+  llvm::OwningPtr<const llvm::MCContext> Ctx;
+  // The disassembler for the target architecture.
+  llvm::OwningPtr<const llvm::MCDisassembler> DisAsm;
+  // The instruction printer for the target architecture.
+  llvm::OwningPtr<llvm::MCInstPrinter> IP;
+
+public:
+  // Comment stream and backing vector.
+  SmallString<128> CommentsToEmit;
+  raw_svector_ostream CommentStream;
+
+  LLVMDisasmContext(std::string tripleName, void *disInfo, int tagType,
+                    LLVMOpInfoCallback getOpInfo,
+                    LLVMSymbolLookupCallback symbolLookUp,
+                    const Target *theTarget, const MCAsmInfo *mAI,
+                    const MCRegisterInfo *mRI,
+                    const MCSubtargetInfo *mSI,
+                    const MCInstrInfo *mII,
+                    llvm::MCContext *ctx, const MCDisassembler *disAsm,
+                    MCInstPrinter *iP) : TripleName(tripleName),
+                    DisInfo(disInfo), TagType(tagType), GetOpInfo(getOpInfo),
+                    SymbolLookUp(symbolLookUp), TheTarget(theTarget),
+                    CommentStream(CommentsToEmit) {
+    MAI.reset(mAI);
+    MRI.reset(mRI);
+    MSI.reset(mSI);
+    MII.reset(mII);
+    Ctx.reset(ctx);
+    DisAsm.reset(disAsm);
+    IP.reset(iP);
+  }
+  const std::string &getTripleName() const { return TripleName; }
+  void *getDisInfo() const { return DisInfo; }
+  int getTagType() const { return TagType; }
+  LLVMOpInfoCallback getGetOpInfo() const { return GetOpInfo; }
+  LLVMSymbolLookupCallback getSymbolLookupCallback() const {
+    return SymbolLookUp;
+  }
+  const Target *getTarget() const { return TheTarget; }
+  const MCDisassembler *getDisAsm() const { return DisAsm.get(); }
+  const MCAsmInfo *getAsmInfo() const { return MAI.get(); }
+  const MCInstrInfo *getInstrInfo() const { return MII.get(); }
+  const MCRegisterInfo *getRegisterInfo() const { return MRI.get(); }
+  const MCSubtargetInfo *getSubtargetInfo() const { return MSI.get(); }
+  MCInstPrinter *getIP() { return IP.get(); }
+  void setIP(MCInstPrinter *NewIP) { IP.reset(NewIP); }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/MC/MCDwarf.cpp b/contrib/llvm/lib/MC/MCDwarf.cpp
new file mode 100644
index 000000000000..0f8f0741bd7c
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCDwarf.cpp
@@ -0,0 +1,1558 @@
+//===- lib/MC/MCDwarf.cpp - MCDwarf implementation ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Config/config.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Given a special op, return the address skip amount (in units of
+// DWARF2_LINE_MIN_INSN_LENGTH.
+#define SPECIAL_ADDR(op) (((op) - DWARF2_LINE_OPCODE_BASE)/DWARF2_LINE_RANGE)
+
+// The maximum address skip amount that can be encoded with a special op.
+#define MAX_SPECIAL_ADDR_DELTA         SPECIAL_ADDR(255)
+
+// First special line opcode - leave room for the standard opcodes.
+// Note: If you want to change this, you'll have to update the
+// "standard_opcode_lengths" table that is emitted in DwarfFileTable::Emit().
+#define DWARF2_LINE_OPCODE_BASE         13
+
+// Minimum line offset in a special line info. opcode.  This value
+// was chosen to give a reasonable range of values.
+#define DWARF2_LINE_BASE                -5
+
+// Range of line offsets in a special line info. opcode.
+#define DWARF2_LINE_RANGE               14
+
+// Define the architecture-dependent minimum instruction length (in bytes).
+// This value should be rather too small than too big.
+#define DWARF2_LINE_MIN_INSN_LENGTH     1
+
+// Note: when DWARF2_LINE_MIN_INSN_LENGTH == 1 which is the current setting,
+// this routine is a nop and will be optimized away.
+static inline uint64_t ScaleAddrDelta(uint64_t AddrDelta) {
+  if (DWARF2_LINE_MIN_INSN_LENGTH == 1)
+    return AddrDelta;
+  if (AddrDelta % DWARF2_LINE_MIN_INSN_LENGTH != 0) {
+    // TODO: report this error, but really only once.
+    ;
+  }
+  return AddrDelta / DWARF2_LINE_MIN_INSN_LENGTH;
+}
+
+//
+// This is called when an instruction is assembled into the specified section
+// and if there is information from the last .loc directive that has yet to have
+// a line entry made for it is made.
+//
+void MCLineEntry::Make(MCStreamer *MCOS, const MCSection *Section) {
+  if (!MCOS->getContext().getDwarfLocSeen())
+    return;
+
+  // Create a symbol at in the current section for use in the line entry.
+  MCSymbol *LineSym = MCOS->getContext().CreateTempSymbol();
+  // Set the value of the symbol to use for the MCLineEntry.
+  MCOS->EmitLabel(LineSym);
+
+  // Get the current .loc info saved in the context.
+  const MCDwarfLoc &DwarfLoc = MCOS->getContext().getCurrentDwarfLoc();
+
+  // Create a (local) line entry with the symbol and the current .loc info.
+  MCLineEntry LineEntry(LineSym, DwarfLoc);
+
+  // clear DwarfLocSeen saying the current .loc info is now used.
+  MCOS->getContext().ClearDwarfLocSeen();
+
+  // Get the MCLineSection for this section, if one does not exist for this
+  // section create it.
+  const DenseMap<const MCSection *, MCLineSection *> &MCLineSections =
+    MCOS->getContext().getMCLineSections();
+  MCLineSection *LineSection = MCLineSections.lookup(Section);
+  if (!LineSection) {
+    // Create a new MCLineSection.  This will be deleted after the dwarf line
+    // table is created using it by iterating through the MCLineSections
+    // DenseMap.
+    LineSection = new MCLineSection;
+    // Save a pointer to the new LineSection into the MCLineSections DenseMap.
+    MCOS->getContext().addMCLineSection(Section, LineSection);
+  }
+
+  // Add the line entry to this section's entries.
+  LineSection->addLineEntry(LineEntry,
+                            MCOS->getContext().getDwarfCompileUnitID());
+}
+
+//
+// This helper routine returns an expression of End - Start + IntVal .
+//
+static inline const MCExpr *MakeStartMinusEndExpr(const MCStreamer &MCOS,
+                                                  const MCSymbol &Start,
+                                                  const MCSymbol &End,
+                                                  int IntVal) {
+  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+  const MCExpr *Res =
+    MCSymbolRefExpr::Create(&End, Variant, MCOS.getContext());
+  const MCExpr *RHS =
+    MCSymbolRefExpr::Create(&Start, Variant, MCOS.getContext());
+  const MCExpr *Res1 =
+    MCBinaryExpr::Create(MCBinaryExpr::Sub, Res, RHS, MCOS.getContext());
+  const MCExpr *Res2 =
+    MCConstantExpr::Create(IntVal, MCOS.getContext());
+  const MCExpr *Res3 =
+    MCBinaryExpr::Create(MCBinaryExpr::Sub, Res1, Res2, MCOS.getContext());
+  return Res3;
+}
+
+//
+// This emits the Dwarf line table for the specified section from the entries
+// in the LineSection.
+//
+static inline void EmitDwarfLineTable(MCStreamer *MCOS,
+                                      const MCSection *Section,
+                                      const MCLineSection *LineSection,
+                                      unsigned CUID) {
+  // This LineSection does not contain any LineEntry for the given Compile Unit.
+  if (!LineSection->containEntriesForID(CUID))
+    return;
+
+  unsigned FileNum = 1;
+  unsigned LastLine = 1;
+  unsigned Column = 0;
+  unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
+  unsigned Isa = 0;
+  MCSymbol *LastLabel = NULL;
+
+  // Loop through each MCLineEntry and encode the dwarf line number table.
+  for (MCLineSection::const_iterator
+         it = LineSection->getMCLineEntries(CUID).begin(),
+         ie = LineSection->getMCLineEntries(CUID).end(); it != ie; ++it) {
+
+    if (FileNum != it->getFileNum()) {
+      FileNum = it->getFileNum();
+      MCOS->EmitIntValue(dwarf::DW_LNS_set_file, 1);
+      MCOS->EmitULEB128IntValue(FileNum);
+    }
+    if (Column != it->getColumn()) {
+      Column = it->getColumn();
+      MCOS->EmitIntValue(dwarf::DW_LNS_set_column, 1);
+      MCOS->EmitULEB128IntValue(Column);
+    }
+    if (Isa != it->getIsa()) {
+      Isa = it->getIsa();
+      MCOS->EmitIntValue(dwarf::DW_LNS_set_isa, 1);
+      MCOS->EmitULEB128IntValue(Isa);
+    }
+    if ((it->getFlags() ^ Flags) & DWARF2_FLAG_IS_STMT) {
+      Flags = it->getFlags();
+      MCOS->EmitIntValue(dwarf::DW_LNS_negate_stmt, 1);
+    }
+    if (it->getFlags() & DWARF2_FLAG_BASIC_BLOCK)
+      MCOS->EmitIntValue(dwarf::DW_LNS_set_basic_block, 1);
+    if (it->getFlags() & DWARF2_FLAG_PROLOGUE_END)
+      MCOS->EmitIntValue(dwarf::DW_LNS_set_prologue_end, 1);
+    if (it->getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN)
+      MCOS->EmitIntValue(dwarf::DW_LNS_set_epilogue_begin, 1);
+
+    int64_t LineDelta = static_cast<int64_t>(it->getLine()) - LastLine;
+    MCSymbol *Label = it->getLabel();
+
+    // At this point we want to emit/create the sequence to encode the delta in
+    // line numbers and the increment of the address from the previous Label
+    // and the current Label.
+    const MCAsmInfo &asmInfo = MCOS->getContext().getAsmInfo();
+    MCOS->EmitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label,
+                                   asmInfo.getPointerSize());
+
+    LastLine = it->getLine();
+    LastLabel = Label;
+  }
+
+  // Emit a DW_LNE_end_sequence for the end of the section.
+  // Using the pointer Section create a temporary label at the end of the
+  // section and use that and the LastLabel to compute the address delta
+  // and use INT64_MAX as the line delta which is the signal that this is
+  // actually a DW_LNE_end_sequence.
+
+  // Switch to the section to be able to create a symbol at its end.
+  MCOS->SwitchSection(Section);
+
+  MCContext &context = MCOS->getContext();
+  // Create a symbol at the end of the section.
+  MCSymbol *SectionEnd = context.CreateTempSymbol();
+  // Set the value of the symbol, as we are at the end of the section.
+  MCOS->EmitLabel(SectionEnd);
+
+  // Switch back the dwarf line section.
+  MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
+
+  const MCAsmInfo &asmInfo = MCOS->getContext().getAsmInfo();
+  MCOS->EmitDwarfAdvanceLineAddr(INT64_MAX, LastLabel, SectionEnd,
+                                 asmInfo.getPointerSize());
+}
+
+//
+// This emits the Dwarf file and the line tables.
+//
+const MCSymbol *MCDwarfFileTable::Emit(MCStreamer *MCOS) {
+  MCContext &context = MCOS->getContext();
+  // Switch to the section where the table will be emitted into.
+  MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
+
+  const DenseMap<unsigned, MCSymbol *> &MCLineTableSymbols =
+    MCOS->getContext().getMCLineTableSymbols();
+  // CUID and MCLineTableSymbols are set in DwarfDebug, when DwarfDebug does
+  // not exist, CUID will be 0 and MCLineTableSymbols will be empty.
+  // Handle Compile Unit 0, the line table start symbol is the section symbol.
+  const MCSymbol *LineStartSym = EmitCU(MCOS, 0);
+  // Handle the rest of the Compile Units.
+  for (unsigned Is = 1, Ie = MCLineTableSymbols.size(); Is < Ie; Is++)
+    EmitCU(MCOS, Is);
+
+  // Now delete the MCLineSections that were created in MCLineEntry::Make()
+  // and used to emit the line table.
+  const DenseMap<const MCSection *, MCLineSection *> &MCLineSections =
+    MCOS->getContext().getMCLineSections();
+  for (DenseMap<const MCSection *, MCLineSection *>::const_iterator it =
+       MCLineSections.begin(), ie = MCLineSections.end(); it != ie;
+       ++it)
+    delete it->second;
+
+  return LineStartSym;
+}
+
+const MCSymbol *MCDwarfFileTable::EmitCU(MCStreamer *MCOS, unsigned CUID) {
+  MCContext &context = MCOS->getContext();
+
+  // Create a symbol at the beginning of the line table.
+  MCSymbol *LineStartSym = MCOS->getContext().getMCLineTableSymbol(CUID);
+  if (!LineStartSym)
+    LineStartSym = context.CreateTempSymbol();
+  // Set the value of the symbol, as we are at the start of the line table.
+  MCOS->EmitLabel(LineStartSym);
+
+  // Create a symbol for the end of the section (to be set when we get there).
+  MCSymbol *LineEndSym = context.CreateTempSymbol();
+
+  // The first 4 bytes is the total length of the information for this
+  // compilation unit (not including these 4 bytes for the length).
+  MCOS->EmitAbsValue(MakeStartMinusEndExpr(*MCOS, *LineStartSym, *LineEndSym,4),
+                     4);
+
+  // Next 2 bytes is the Version, which is Dwarf 2.
+  MCOS->EmitIntValue(2, 2);
+
+  // Create a symbol for the end of the prologue (to be set when we get there).
+  MCSymbol *ProEndSym = context.CreateTempSymbol(); // Lprologue_end
+
+  // Length of the prologue, is the next 4 bytes.  Which is the start of the
+  // section to the end of the prologue.  Not including the 4 bytes for the
+  // total length, the 2 bytes for the version, and these 4 bytes for the
+  // length of the prologue.
+  MCOS->EmitAbsValue(MakeStartMinusEndExpr(*MCOS, *LineStartSym, *ProEndSym,
+                                           (4 + 2 + 4)), 4, 0);
+
+  // Parameters of the state machine, are next.
+  MCOS->EmitIntValue(DWARF2_LINE_MIN_INSN_LENGTH, 1);
+  MCOS->EmitIntValue(DWARF2_LINE_DEFAULT_IS_STMT, 1);
+  MCOS->EmitIntValue(DWARF2_LINE_BASE, 1);
+  MCOS->EmitIntValue(DWARF2_LINE_RANGE, 1);
+  MCOS->EmitIntValue(DWARF2_LINE_OPCODE_BASE, 1);
+
+  // Standard opcode lengths
+  MCOS->EmitIntValue(0, 1); // length of DW_LNS_copy
+  MCOS->EmitIntValue(1, 1); // length of DW_LNS_advance_pc
+  MCOS->EmitIntValue(1, 1); // length of DW_LNS_advance_line
+  MCOS->EmitIntValue(1, 1); // length of DW_LNS_set_file
+  MCOS->EmitIntValue(1, 1); // length of DW_LNS_set_column
+  MCOS->EmitIntValue(0, 1); // length of DW_LNS_negate_stmt
+  MCOS->EmitIntValue(0, 1); // length of DW_LNS_set_basic_block
+  MCOS->EmitIntValue(0, 1); // length of DW_LNS_const_add_pc
+  MCOS->EmitIntValue(1, 1); // length of DW_LNS_fixed_advance_pc
+  MCOS->EmitIntValue(0, 1); // length of DW_LNS_set_prologue_end
+  MCOS->EmitIntValue(0, 1); // length of DW_LNS_set_epilogue_begin
+  MCOS->EmitIntValue(1, 1); // DW_LNS_set_isa
+
+  // Put out the directory and file tables.
+
+  // First the directory table.
+  const SmallVectorImpl<StringRef> &MCDwarfDirs =
+    context.getMCDwarfDirs(CUID);
+  for (unsigned i = 0; i < MCDwarfDirs.size(); i++) {
+    MCOS->EmitBytes(MCDwarfDirs[i]); // the DirectoryName
+    MCOS->EmitBytes(StringRef("\0", 1)); // the null term. of the string
+  }
+  MCOS->EmitIntValue(0, 1); // Terminate the directory list
+
+  // Second the file table.
+  const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
+    MCOS->getContext().getMCDwarfFiles(CUID);
+  for (unsigned i = 1; i < MCDwarfFiles.size(); i++) {
+    MCOS->EmitBytes(MCDwarfFiles[i]->getName()); // FileName
+    MCOS->EmitBytes(StringRef("\0", 1)); // the null term. of the string
+    // the Directory num
+    MCOS->EmitULEB128IntValue(MCDwarfFiles[i]->getDirIndex());
+    MCOS->EmitIntValue(0, 1); // last modification timestamp (always 0)
+    MCOS->EmitIntValue(0, 1); // filesize (always 0)
+  }
+  MCOS->EmitIntValue(0, 1); // Terminate the file list
+
+  // This is the end of the prologue, so set the value of the symbol at the
+  // end of the prologue (that was used in a previous expression).
+  MCOS->EmitLabel(ProEndSym);
+
+  // Put out the line tables.
+  const DenseMap<const MCSection *, MCLineSection *> &MCLineSections =
+    MCOS->getContext().getMCLineSections();
+  const std::vector<const MCSection *> &MCLineSectionOrder =
+    MCOS->getContext().getMCLineSectionOrder();
+  for (std::vector<const MCSection*>::const_iterator it =
+         MCLineSectionOrder.begin(), ie = MCLineSectionOrder.end(); it != ie;
+       ++it) {
+    const MCSection *Sec = *it;
+    const MCLineSection *Line = MCLineSections.lookup(Sec);
+    EmitDwarfLineTable(MCOS, Sec, Line, CUID);
+  }
+
+  if (MCOS->getContext().getAsmInfo().getLinkerRequiresNonEmptyDwarfLines()
+      && MCLineSectionOrder.begin() == MCLineSectionOrder.end()) {
+    // The darwin9 linker has a bug (see PR8715). For for 32-bit architectures
+    // it requires:
+    // total_length >= prologue_length + 10
+    // We are 4 bytes short, since we have total_length = 51 and
+    // prologue_length = 45
+
+    // The regular end_sequence should be sufficient.
+    MCDwarfLineAddr::Emit(MCOS, INT64_MAX, 0);
+  }
+
+  // This is the end of the section, so set the value of the symbol at the end
+  // of this section (that was used in a previous expression).
+  MCOS->EmitLabel(LineEndSym);
+
+  return LineStartSym;
+}
+
+/// Utility function to write the encoding to an object writer.
+void MCDwarfLineAddr::Write(MCObjectWriter *OW, int64_t LineDelta,
+                            uint64_t AddrDelta) {
+  SmallString<256> Tmp;
+  raw_svector_ostream OS(Tmp);
+  MCDwarfLineAddr::Encode(LineDelta, AddrDelta, OS);
+  OW->WriteBytes(OS.str());
+}
+
+/// Utility function to emit the encoding to a streamer.
+void MCDwarfLineAddr::Emit(MCStreamer *MCOS, int64_t LineDelta,
+                           uint64_t AddrDelta) {
+  SmallString<256> Tmp;
+  raw_svector_ostream OS(Tmp);
+  MCDwarfLineAddr::Encode(LineDelta, AddrDelta, OS);
+  MCOS->EmitBytes(OS.str());
+}
+
+/// Utility function to encode a Dwarf pair of LineDelta and AddrDeltas.
+void MCDwarfLineAddr::Encode(int64_t LineDelta, uint64_t AddrDelta,
+                             raw_ostream &OS) {
+  uint64_t Temp, Opcode;
+  bool NeedCopy = false;
+
+  // Scale the address delta by the minimum instruction length.
+  AddrDelta = ScaleAddrDelta(AddrDelta);
+
+  // A LineDelta of INT64_MAX is a signal that this is actually a
+  // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the
+  // end_sequence to emit the matrix entry.
+  if (LineDelta == INT64_MAX) {
+    if (AddrDelta == MAX_SPECIAL_ADDR_DELTA)
+      OS << char(dwarf::DW_LNS_const_add_pc);
+    else {
+      OS << char(dwarf::DW_LNS_advance_pc);
+      encodeULEB128(AddrDelta, OS);
+    }
+    OS << char(dwarf::DW_LNS_extended_op);
+    OS << char(1);
+    OS << char(dwarf::DW_LNE_end_sequence);
+    return;
+  }
+
+  // Bias the line delta by the base.
+  Temp = LineDelta - DWARF2_LINE_BASE;
+
+  // If the line increment is out of range of a special opcode, we must encode
+  // it with DW_LNS_advance_line.
+  if (Temp >= DWARF2_LINE_RANGE) {
+    OS << char(dwarf::DW_LNS_advance_line);
+    encodeSLEB128(LineDelta, OS);
+
+    LineDelta = 0;
+    Temp = 0 - DWARF2_LINE_BASE;
+    NeedCopy = true;
+  }
+
+  // Use DW_LNS_copy instead of a "line +0, addr +0" special opcode.
+  if (LineDelta == 0 && AddrDelta == 0) {
+    OS << char(dwarf::DW_LNS_copy);
+    return;
+  }
+
+  // Bias the opcode by the special opcode base.
+  Temp += DWARF2_LINE_OPCODE_BASE;
+
+  // Avoid overflow when addr_delta is large.
+  if (AddrDelta < 256 + MAX_SPECIAL_ADDR_DELTA) {
+    // Try using a special opcode.
+    Opcode = Temp + AddrDelta * DWARF2_LINE_RANGE;
+    if (Opcode <= 255) {
+      OS << char(Opcode);
+      return;
+    }
+
+    // Try using DW_LNS_const_add_pc followed by special op.
+    Opcode = Temp + (AddrDelta - MAX_SPECIAL_ADDR_DELTA) * DWARF2_LINE_RANGE;
+    if (Opcode <= 255) {
+      OS << char(dwarf::DW_LNS_const_add_pc);
+      OS << char(Opcode);
+      return;
+    }
+  }
+
+  // Otherwise use DW_LNS_advance_pc.
+  OS << char(dwarf::DW_LNS_advance_pc);
+  encodeULEB128(AddrDelta, OS);
+
+  if (NeedCopy)
+    OS << char(dwarf::DW_LNS_copy);
+  else
+    OS << char(Temp);
+}
+
+void MCDwarfFile::print(raw_ostream &OS) const {
+  OS << '"' << getName() << '"';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCDwarfFile::dump() const {
+  print(dbgs());
+}
+#endif
+
+// Utility function to write a tuple for .debug_abbrev.
+static void EmitAbbrev(MCStreamer *MCOS, uint64_t Name, uint64_t Form) {
+  MCOS->EmitULEB128IntValue(Name);
+  MCOS->EmitULEB128IntValue(Form);
+}
+
+// When generating dwarf for assembly source files this emits
+// the data for .debug_abbrev section which contains three DIEs.
+static void EmitGenDwarfAbbrev(MCStreamer *MCOS) {
+  MCContext &context = MCOS->getContext();
+  MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfAbbrevSection());
+
+  // DW_TAG_compile_unit DIE abbrev (1).
+  MCOS->EmitULEB128IntValue(1);
+  MCOS->EmitULEB128IntValue(dwarf::DW_TAG_compile_unit);
+  MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1);
+  EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4);
+  EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
+  EmitAbbrev(MCOS, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr);
+  EmitAbbrev(MCOS, dwarf::DW_AT_name, dwarf::DW_FORM_string);
+  EmitAbbrev(MCOS, dwarf::DW_AT_comp_dir, dwarf::DW_FORM_string);
+  StringRef DwarfDebugFlags = context.getDwarfDebugFlags();
+  if (!DwarfDebugFlags.empty())
+    EmitAbbrev(MCOS, dwarf::DW_AT_APPLE_flags, dwarf::DW_FORM_string);
+  EmitAbbrev(MCOS, dwarf::DW_AT_producer, dwarf::DW_FORM_string);
+  EmitAbbrev(MCOS, dwarf::DW_AT_language, dwarf::DW_FORM_data2);
+  EmitAbbrev(MCOS, 0, 0);
+
+  // DW_TAG_label DIE abbrev (2).
+  MCOS->EmitULEB128IntValue(2);
+  MCOS->EmitULEB128IntValue(dwarf::DW_TAG_label);
+  MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1);
+  EmitAbbrev(MCOS, dwarf::DW_AT_name, dwarf::DW_FORM_string);
+  EmitAbbrev(MCOS, dwarf::DW_AT_decl_file, dwarf::DW_FORM_data4);
+  EmitAbbrev(MCOS, dwarf::DW_AT_decl_line, dwarf::DW_FORM_data4);
+  EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
+  EmitAbbrev(MCOS, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag);
+  EmitAbbrev(MCOS, 0, 0);
+
+  // DW_TAG_unspecified_parameters DIE abbrev (3).
+  MCOS->EmitULEB128IntValue(3);
+  MCOS->EmitULEB128IntValue(dwarf::DW_TAG_unspecified_parameters);
+  MCOS->EmitIntValue(dwarf::DW_CHILDREN_no, 1);
+  EmitAbbrev(MCOS, 0, 0);
+
+  // Terminate the abbreviations for this compilation unit.
+  MCOS->EmitIntValue(0, 1);
+}
+
+// When generating dwarf for assembly source files this emits the data for
+// .debug_aranges section.  Which contains a header and a table of pairs of
+// PointerSize'ed values for the address and size of section(s) with line table
+// entries (just the default .text in our case) and a terminating pair of zeros.
+static void EmitGenDwarfAranges(MCStreamer *MCOS,
+                                const MCSymbol *InfoSectionSymbol) {
+  MCContext &context = MCOS->getContext();
+
+  // Create a symbol at the end of the section that we are creating the dwarf
+  // debugging info to use later in here as part of the expression to calculate
+  // the size of the section for the table.
+  MCOS->SwitchSection(context.getGenDwarfSection());
+  MCSymbol *SectionEndSym = context.CreateTempSymbol();
+  MCOS->EmitLabel(SectionEndSym);
+  context.setGenDwarfSectionEndSym(SectionEndSym);
+
+  MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
+
+  // This will be the length of the .debug_aranges section, first account for
+  // the size of each item in the header (see below where we emit these items).
+  int Length = 4 + 2 + 4 + 1 + 1;
+
+  // Figure the padding after the header before the table of address and size
+  // pairs who's values are PointerSize'ed.
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  int AddrSize = asmInfo.getPointerSize();
+  int Pad = 2 * AddrSize - (Length & (2 * AddrSize - 1));
+  if (Pad == 2 * AddrSize)
+    Pad = 0;
+  Length += Pad;
+
+  // Add the size of the pair of PointerSize'ed values for the address and size
+  // of the one default .text section we have in the table.
+  Length += 2 * AddrSize;
+  // And the pair of terminating zeros.
+  Length += 2 * AddrSize;
+
+
+  // Emit the header for this section.
+  // The 4 byte length not including the 4 byte value for the length.
+  MCOS->EmitIntValue(Length - 4, 4);
+  // The 2 byte version, which is 2.
+  MCOS->EmitIntValue(2, 2);
+  // The 4 byte offset to the compile unit in the .debug_info from the start
+  // of the .debug_info.
+  if (InfoSectionSymbol)
+    MCOS->EmitSymbolValue(InfoSectionSymbol, 4);
+  else
+    MCOS->EmitIntValue(0, 4);
+  // The 1 byte size of an address.
+  MCOS->EmitIntValue(AddrSize, 1);
+  // The 1 byte size of a segment descriptor, we use a value of zero.
+  MCOS->EmitIntValue(0, 1);
+  // Align the header with the padding if needed, before we put out the table.
+  for(int i = 0; i < Pad; i++)
+    MCOS->EmitIntValue(0, 1);
+
+  // Now emit the table of pairs of PointerSize'ed values for the section(s)
+  // address and size, in our case just the one default .text section.
+  const MCExpr *Addr = MCSymbolRefExpr::Create(
+    context.getGenDwarfSectionStartSym(), MCSymbolRefExpr::VK_None, context);
+  const MCExpr *Size = MakeStartMinusEndExpr(*MCOS,
+    *context.getGenDwarfSectionStartSym(), *SectionEndSym, 0);
+  MCOS->EmitAbsValue(Addr, AddrSize);
+  MCOS->EmitAbsValue(Size, AddrSize);
+
+  // And finally the pair of terminating zeros.
+  MCOS->EmitIntValue(0, AddrSize);
+  MCOS->EmitIntValue(0, AddrSize);
+}
+
+// When generating dwarf for assembly source files this emits the data for
+// .debug_info section which contains three parts.  The header, the compile_unit
+// DIE and a list of label DIEs.
+static void EmitGenDwarfInfo(MCStreamer *MCOS,
+                             const MCSymbol *AbbrevSectionSymbol,
+                             const MCSymbol *LineSectionSymbol) {
+  MCContext &context = MCOS->getContext();
+
+  MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
+
+  // Create a symbol at the start and end of this section used in here for the
+  // expression to calculate the length in the header.
+  MCSymbol *InfoStart = context.CreateTempSymbol();
+  MCOS->EmitLabel(InfoStart);
+  MCSymbol *InfoEnd = context.CreateTempSymbol();
+
+  // First part: the header.
+
+  // The 4 byte total length of the information for this compilation unit, not
+  // including these 4 bytes.
+  const MCExpr *Length = MakeStartMinusEndExpr(*MCOS, *InfoStart, *InfoEnd, 4);
+  MCOS->EmitAbsValue(Length, 4);
+
+  // The 2 byte DWARF version, which is 2.
+  MCOS->EmitIntValue(2, 2);
+
+  // The 4 byte offset to the debug abbrevs from the start of the .debug_abbrev,
+  // it is at the start of that section so this is zero.
+  if (AbbrevSectionSymbol) {
+    MCOS->EmitSymbolValue(AbbrevSectionSymbol, 4);
+  } else {
+    MCOS->EmitIntValue(0, 4);
+  }
+
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  int AddrSize = asmInfo.getPointerSize();
+  // The 1 byte size of an address.
+  MCOS->EmitIntValue(AddrSize, 1);
+
+  // Second part: the compile_unit DIE.
+
+  // The DW_TAG_compile_unit DIE abbrev (1).
+  MCOS->EmitULEB128IntValue(1);
+
+  // DW_AT_stmt_list, a 4 byte offset from the start of the .debug_line section,
+  // which is at the start of that section so this is zero.
+  if (LineSectionSymbol) {
+    MCOS->EmitSymbolValue(LineSectionSymbol, 4);
+  } else {
+    MCOS->EmitIntValue(0, 4);
+  }
+
+  // AT_low_pc, the first address of the default .text section.
+  const MCExpr *Start = MCSymbolRefExpr::Create(
+    context.getGenDwarfSectionStartSym(), MCSymbolRefExpr::VK_None, context);
+  MCOS->EmitAbsValue(Start, AddrSize);
+
+  // AT_high_pc, the last address of the default .text section.
+  const MCExpr *End = MCSymbolRefExpr::Create(
+    context.getGenDwarfSectionEndSym(), MCSymbolRefExpr::VK_None, context);
+  MCOS->EmitAbsValue(End, AddrSize);
+
+  // AT_name, the name of the source file.  Reconstruct from the first directory
+  // and file table entries.
+  const SmallVectorImpl<StringRef> &MCDwarfDirs =
+    context.getMCDwarfDirs();
+  if (MCDwarfDirs.size() > 0) {
+    MCOS->EmitBytes(MCDwarfDirs[0]);
+    MCOS->EmitBytes("/");
+  }
+  const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
+    MCOS->getContext().getMCDwarfFiles();
+  MCOS->EmitBytes(MCDwarfFiles[1]->getName());
+  MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+
+  // AT_comp_dir, the working directory the assembly was done in.
+  MCOS->EmitBytes(context.getCompilationDir());
+  MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+
+  // AT_APPLE_flags, the command line arguments of the assembler tool.
+  StringRef DwarfDebugFlags = context.getDwarfDebugFlags();
+  if (!DwarfDebugFlags.empty()){
+    MCOS->EmitBytes(DwarfDebugFlags);
+    MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+  }
+
+  // AT_producer, the version of the assembler tool.
+  StringRef DwarfDebugProducer = context.getDwarfDebugProducer();
+  if (!DwarfDebugProducer.empty()){
+    MCOS->EmitBytes(DwarfDebugProducer);
+  }
+  else {
+    MCOS->EmitBytes(StringRef("llvm-mc (based on LLVM "));
+    MCOS->EmitBytes(StringRef(PACKAGE_VERSION));
+    MCOS->EmitBytes(StringRef(")"));
+  }
+  MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+
+  // AT_language, a 4 byte value.  We use DW_LANG_Mips_Assembler as the dwarf2
+  // draft has no standard code for assembler.
+  MCOS->EmitIntValue(dwarf::DW_LANG_Mips_Assembler, 2);
+
+  // Third part: the list of label DIEs.
+
+  // Loop on saved info for dwarf labels and create the DIEs for them.
+  const std::vector<const MCGenDwarfLabelEntry *> &Entries =
+    MCOS->getContext().getMCGenDwarfLabelEntries();
+  for (std::vector<const MCGenDwarfLabelEntry *>::const_iterator it =
+       Entries.begin(), ie = Entries.end(); it != ie;
+       ++it) {
+    const MCGenDwarfLabelEntry *Entry = *it;
+
+    // The DW_TAG_label DIE abbrev (2).
+    MCOS->EmitULEB128IntValue(2);
+
+    // AT_name, of the label without any leading underbar.
+    MCOS->EmitBytes(Entry->getName());
+    MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+
+    // AT_decl_file, index into the file table.
+    MCOS->EmitIntValue(Entry->getFileNumber(), 4);
+
+    // AT_decl_line, source line number.
+    MCOS->EmitIntValue(Entry->getLineNumber(), 4);
+
+    // AT_low_pc, start address of the label.
+    const MCExpr *AT_low_pc = MCSymbolRefExpr::Create(Entry->getLabel(),
+                                             MCSymbolRefExpr::VK_None, context);
+    MCOS->EmitAbsValue(AT_low_pc, AddrSize);
+
+    // DW_AT_prototyped, a one byte flag value of 0 saying we have no prototype.
+    MCOS->EmitIntValue(0, 1);
+
+    // The DW_TAG_unspecified_parameters DIE abbrev (3).
+    MCOS->EmitULEB128IntValue(3);
+
+    // Add the NULL DIE terminating the DW_TAG_unspecified_parameters DIE's.
+    MCOS->EmitIntValue(0, 1);
+  }
+  // Deallocate the MCGenDwarfLabelEntry classes that saved away the info
+  // for the dwarf labels.
+  for (std::vector<const MCGenDwarfLabelEntry *>::const_iterator it =
+       Entries.begin(), ie = Entries.end(); it != ie;
+       ++it) {
+    const MCGenDwarfLabelEntry *Entry = *it;
+    delete Entry;
+  }
+
+  // Add the NULL DIE terminating the Compile Unit DIE's.
+  MCOS->EmitIntValue(0, 1);
+
+  // Now set the value of the symbol at the end of the info section.
+  MCOS->EmitLabel(InfoEnd);
+}
+
+//
+// When generating dwarf for assembly source files this emits the Dwarf
+// sections.
+//
+void MCGenDwarfInfo::Emit(MCStreamer *MCOS, const MCSymbol *LineSectionSymbol) {
+  // Create the dwarf sections in this order (.debug_line already created).
+  MCContext &context = MCOS->getContext();
+  const MCAsmInfo &AsmInfo = context.getAsmInfo();
+  bool CreateDwarfSectionSymbols =
+      AsmInfo.doesDwarfUseRelocationsAcrossSections();
+  if (!CreateDwarfSectionSymbols)
+    LineSectionSymbol = NULL;
+  MCSymbol *AbbrevSectionSymbol = NULL;
+  MCSymbol *InfoSectionSymbol = NULL;
+  MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
+  if (CreateDwarfSectionSymbols) {
+    InfoSectionSymbol = context.CreateTempSymbol();
+    MCOS->EmitLabel(InfoSectionSymbol);
+  }
+  MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfAbbrevSection());
+  if (CreateDwarfSectionSymbols) {
+    AbbrevSectionSymbol = context.CreateTempSymbol();
+    MCOS->EmitLabel(AbbrevSectionSymbol);
+  }
+  MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
+
+  // If there are no line table entries then do not emit any section contents.
+  if (context.getMCLineSections().empty())
+    return;
+
+  // Output the data for .debug_aranges section.
+  EmitGenDwarfAranges(MCOS, InfoSectionSymbol);
+
+  // Output the data for .debug_abbrev section.
+  EmitGenDwarfAbbrev(MCOS);
+
+  // Output the data for .debug_info section.
+  EmitGenDwarfInfo(MCOS, AbbrevSectionSymbol, LineSectionSymbol);
+}
+
+//
+// When generating dwarf for assembly source files this is called when symbol
+// for a label is created.  If this symbol is not a temporary and is in the
+// section that dwarf is being generated for, save the needed info to create
+// a dwarf label.
+//
+void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS,
+                                     SourceMgr &SrcMgr, SMLoc &Loc) {
+  // We won't create dwarf labels for temporary symbols or symbols not in
+  // the default text.
+  if (Symbol->isTemporary())
+    return;
+  MCContext &context = MCOS->getContext();
+  if (context.getGenDwarfSection() != MCOS->getCurrentSection())
+    return;
+
+  // The dwarf label's name does not have the symbol name's leading
+  // underbar if any.
+  StringRef Name = Symbol->getName();
+  if (Name.startswith("_"))
+    Name = Name.substr(1, Name.size()-1);
+
+  // Get the dwarf file number to be used for the dwarf label.
+  unsigned FileNumber = context.getGenDwarfFileNumber();
+
+  // Finding the line number is the expensive part which is why we just don't
+  // pass it in as for some symbols we won't create a dwarf label.
+  int CurBuffer = SrcMgr.FindBufferContainingLoc(Loc);
+  unsigned LineNumber = SrcMgr.FindLineNumber(Loc, CurBuffer);
+
+  // We create a temporary symbol for use for the AT_high_pc and AT_low_pc
+  // values so that they don't have things like an ARM thumb bit from the
+  // original symbol. So when used they won't get a low bit set after
+  // relocation.
+  MCSymbol *Label = context.CreateTempSymbol();
+  MCOS->EmitLabel(Label);
+
+  // Create and entry for the info and add it to the other entries.
+  MCGenDwarfLabelEntry *Entry =
+    new MCGenDwarfLabelEntry(Name, FileNumber, LineNumber, Label);
+  MCOS->getContext().addMCGenDwarfLabelEntry(Entry);
+}
+
+static int getDataAlignmentFactor(MCStreamer &streamer) {
+  MCContext &context = streamer.getContext();
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  int size = asmInfo.getCalleeSaveStackSlotSize();
+  if (asmInfo.isStackGrowthDirectionUp())
+    return size;
+  else
+    return -size;
+}
+
+static unsigned getSizeForEncoding(MCStreamer &streamer,
+                                   unsigned symbolEncoding) {
+  MCContext &context = streamer.getContext();
+  unsigned format = symbolEncoding & 0x0f;
+  switch (format) {
+  default: llvm_unreachable("Unknown Encoding");
+  case dwarf::DW_EH_PE_absptr:
+  case dwarf::DW_EH_PE_signed:
+    return context.getAsmInfo().getPointerSize();
+  case dwarf::DW_EH_PE_udata2:
+  case dwarf::DW_EH_PE_sdata2:
+    return 2;
+  case dwarf::DW_EH_PE_udata4:
+  case dwarf::DW_EH_PE_sdata4:
+    return 4;
+  case dwarf::DW_EH_PE_udata8:
+  case dwarf::DW_EH_PE_sdata8:
+    return 8;
+  }
+}
+
+static void EmitSymbol(MCStreamer &streamer, const MCSymbol &symbol,
+                       unsigned symbolEncoding, const char *comment = 0) {
+  MCContext &context = streamer.getContext();
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  const MCExpr *v = asmInfo.getExprForFDESymbol(&symbol,
+                                                symbolEncoding,
+                                                streamer);
+  unsigned size = getSizeForEncoding(streamer, symbolEncoding);
+  if (streamer.isVerboseAsm() && comment) streamer.AddComment(comment);
+  streamer.EmitAbsValue(v, size);
+}
+
+static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
+                            unsigned symbolEncoding) {
+  MCContext &context = streamer.getContext();
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  const MCExpr *v = asmInfo.getExprForPersonalitySymbol(&symbol,
+                                                        symbolEncoding,
+                                                        streamer);
+  unsigned size = getSizeForEncoding(streamer, symbolEncoding);
+  streamer.EmitValue(v, size);
+}
+
+static const MachineLocation TranslateMachineLocation(
+                                                  const MCRegisterInfo &MRI,
+                                                  const MachineLocation &Loc) {
+  unsigned Reg = Loc.getReg() == MachineLocation::VirtualFP ?
+    MachineLocation::VirtualFP :
+    unsigned(MRI.getDwarfRegNum(Loc.getReg(), true));
+  const MachineLocation &NewLoc = Loc.isReg() ?
+    MachineLocation(Reg) : MachineLocation(Reg, Loc.getOffset());
+  return NewLoc;
+}
+
+namespace {
+  class FrameEmitterImpl {
+    int CFAOffset;
+    int CIENum;
+    bool UsingCFI;
+    bool IsEH;
+    const MCSymbol *SectionStart;
+  public:
+    FrameEmitterImpl(bool usingCFI, bool isEH)
+      : CFAOffset(0), CIENum(0), UsingCFI(usingCFI), IsEH(isEH),
+        SectionStart(0) {}
+
+    void setSectionStart(const MCSymbol *Label) { SectionStart = Label; }
+
+    /// EmitCompactUnwind - Emit the unwind information in a compact way. If
+    /// we're successful, return 'true'. Otherwise, return 'false' and it will
+    /// emit the normal CIE and FDE.
+    bool EmitCompactUnwind(MCStreamer &streamer,
+                           const MCDwarfFrameInfo &frame);
+
+    const MCSymbol &EmitCIE(MCStreamer &streamer,
+                            const MCSymbol *personality,
+                            unsigned personalityEncoding,
+                            const MCSymbol *lsda,
+                            bool IsSignalFrame,
+                            unsigned lsdaEncoding);
+    MCSymbol *EmitFDE(MCStreamer &streamer,
+                      const MCSymbol &cieStart,
+                      const MCDwarfFrameInfo &frame);
+    void EmitCFIInstructions(MCStreamer &streamer,
+                             const std::vector<MCCFIInstruction> &Instrs,
+                             MCSymbol *BaseLabel);
+    void EmitCFIInstruction(MCStreamer &Streamer,
+                            const MCCFIInstruction &Instr);
+  };
+
+} // end anonymous namespace
+
+static void EmitEncodingByte(MCStreamer &Streamer, unsigned Encoding,
+                             StringRef Prefix) {
+  if (Streamer.isVerboseAsm()) {
+    const char *EncStr;
+    switch (Encoding) {
+    default: EncStr = "<unknown encoding>"; break;
+    case dwarf::DW_EH_PE_absptr: EncStr = "absptr"; break;
+    case dwarf::DW_EH_PE_omit:   EncStr = "omit"; break;
+    case dwarf::DW_EH_PE_pcrel:  EncStr = "pcrel"; break;
+    case dwarf::DW_EH_PE_udata4: EncStr = "udata4"; break;
+    case dwarf::DW_EH_PE_udata8: EncStr = "udata8"; break;
+    case dwarf::DW_EH_PE_sdata4: EncStr = "sdata4"; break;
+    case dwarf::DW_EH_PE_sdata8: EncStr = "sdata8"; break;
+    case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata4:
+      EncStr = "pcrel udata4";
+      break;
+    case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4:
+      EncStr = "pcrel sdata4";
+      break;
+    case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8:
+      EncStr = "pcrel udata8";
+      break;
+    case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8:
+      EncStr = "screl sdata8";
+      break;
+    case dwarf::DW_EH_PE_indirect |dwarf::DW_EH_PE_pcrel|dwarf::DW_EH_PE_udata4:
+      EncStr = "indirect pcrel udata4";
+      break;
+    case dwarf::DW_EH_PE_indirect |dwarf::DW_EH_PE_pcrel|dwarf::DW_EH_PE_sdata4:
+      EncStr = "indirect pcrel sdata4";
+      break;
+    case dwarf::DW_EH_PE_indirect |dwarf::DW_EH_PE_pcrel|dwarf::DW_EH_PE_udata8:
+      EncStr = "indirect pcrel udata8";
+      break;
+    case dwarf::DW_EH_PE_indirect |dwarf::DW_EH_PE_pcrel|dwarf::DW_EH_PE_sdata8:
+      EncStr = "indirect pcrel sdata8";
+      break;
+    }
+
+    Streamer.AddComment(Twine(Prefix) + " = " + EncStr);
+  }
+
+  Streamer.EmitIntValue(Encoding, 1);
+}
+
+void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer,
+                                          const MCCFIInstruction &Instr) {
+  int dataAlignmentFactor = getDataAlignmentFactor(Streamer);
+  bool VerboseAsm = Streamer.isVerboseAsm();
+
+  switch (Instr.getOperation()) {
+  case MCCFIInstruction::OpRegister: {
+    unsigned Reg1 = Instr.getRegister();
+    unsigned Reg2 = Instr.getRegister2();
+    if (VerboseAsm) {
+      Streamer.AddComment("DW_CFA_register");
+      Streamer.AddComment(Twine("Reg1 ") + Twine(Reg1));
+      Streamer.AddComment(Twine("Reg2 ") + Twine(Reg2));
+    }
+    Streamer.EmitIntValue(dwarf::DW_CFA_register, 1);
+    Streamer.EmitULEB128IntValue(Reg1);
+    Streamer.EmitULEB128IntValue(Reg2);
+    return;
+  }
+  case MCCFIInstruction::OpUndefined: {
+    unsigned Reg = Instr.getRegister();
+    if (VerboseAsm) {
+      Streamer.AddComment("DW_CFA_undefined");
+      Streamer.AddComment(Twine("Reg ") + Twine(Reg));
+    }
+    Streamer.EmitIntValue(dwarf::DW_CFA_undefined, 1);
+    Streamer.EmitULEB128IntValue(Reg);
+    return;
+  }
+  case MCCFIInstruction::OpAdjustCfaOffset:
+  case MCCFIInstruction::OpDefCfaOffset: {
+    const bool IsRelative =
+      Instr.getOperation() == MCCFIInstruction::OpAdjustCfaOffset;
+
+    if (VerboseAsm)
+      Streamer.AddComment("DW_CFA_def_cfa_offset");
+    Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_offset, 1);
+
+    if (IsRelative)
+      CFAOffset += Instr.getOffset();
+    else
+      CFAOffset = -Instr.getOffset();
+
+    if (VerboseAsm)
+      Streamer.AddComment(Twine("Offset " + Twine(CFAOffset)));
+    Streamer.EmitULEB128IntValue(CFAOffset);
+
+    return;
+  }
+  case MCCFIInstruction::OpDefCfa: {
+    if (VerboseAsm)
+      Streamer.AddComment("DW_CFA_def_cfa");
+    Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa, 1);
+
+    if (VerboseAsm)
+      Streamer.AddComment(Twine("Reg ") + Twine(Instr.getRegister()));
+    Streamer.EmitULEB128IntValue(Instr.getRegister());
+
+    CFAOffset = -Instr.getOffset();
+
+    if (VerboseAsm)
+      Streamer.AddComment(Twine("Offset " + Twine(CFAOffset)));
+    Streamer.EmitULEB128IntValue(CFAOffset);
+
+    return;
+  }
+
+  case MCCFIInstruction::OpDefCfaRegister: {
+    if (VerboseAsm)
+      Streamer.AddComment("DW_CFA_def_cfa_register");
+    Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_register, 1);
+
+    if (VerboseAsm)
+      Streamer.AddComment(Twine("Reg ") + Twine(Instr.getRegister()));
+    Streamer.EmitULEB128IntValue(Instr.getRegister());
+
+    return;
+  }
+
+  case MCCFIInstruction::OpOffset:
+  case MCCFIInstruction::OpRelOffset: {
+    const bool IsRelative =
+      Instr.getOperation() == MCCFIInstruction::OpRelOffset;
+
+    unsigned Reg = Instr.getRegister();
+    int Offset = Instr.getOffset();
+    if (IsRelative)
+      Offset -= CFAOffset;
+    Offset = Offset / dataAlignmentFactor;
+
+    if (Offset < 0) {
+      if (VerboseAsm) Streamer.AddComment("DW_CFA_offset_extended_sf");
+      Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended_sf, 1);
+      if (VerboseAsm) Streamer.AddComment(Twine("Reg ") + Twine(Reg));
+      Streamer.EmitULEB128IntValue(Reg);
+      if (VerboseAsm) Streamer.AddComment(Twine("Offset ") + Twine(Offset));
+      Streamer.EmitSLEB128IntValue(Offset);
+    } else if (Reg < 64) {
+      if (VerboseAsm) Streamer.AddComment(Twine("DW_CFA_offset + Reg(") +
+                                          Twine(Reg) + ")");
+      Streamer.EmitIntValue(dwarf::DW_CFA_offset + Reg, 1);
+      if (VerboseAsm) Streamer.AddComment(Twine("Offset ") + Twine(Offset));
+      Streamer.EmitULEB128IntValue(Offset);
+    } else {
+      if (VerboseAsm) Streamer.AddComment("DW_CFA_offset_extended");
+      Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended, 1);
+      if (VerboseAsm) Streamer.AddComment(Twine("Reg ") + Twine(Reg));
+      Streamer.EmitULEB128IntValue(Reg);
+      if (VerboseAsm) Streamer.AddComment(Twine("Offset ") + Twine(Offset));
+      Streamer.EmitULEB128IntValue(Offset);
+    }
+    return;
+  }
+  case MCCFIInstruction::OpRememberState:
+    if (VerboseAsm) Streamer.AddComment("DW_CFA_remember_state");
+    Streamer.EmitIntValue(dwarf::DW_CFA_remember_state, 1);
+    return;
+  case MCCFIInstruction::OpRestoreState:
+    if (VerboseAsm) Streamer.AddComment("DW_CFA_restore_state");
+    Streamer.EmitIntValue(dwarf::DW_CFA_restore_state, 1);
+    return;
+  case MCCFIInstruction::OpSameValue: {
+    unsigned Reg = Instr.getRegister();
+    if (VerboseAsm) Streamer.AddComment("DW_CFA_same_value");
+    Streamer.EmitIntValue(dwarf::DW_CFA_same_value, 1);
+    if (VerboseAsm) Streamer.AddComment(Twine("Reg ") + Twine(Reg));
+    Streamer.EmitULEB128IntValue(Reg);
+    return;
+  }
+  case MCCFIInstruction::OpRestore: {
+    unsigned Reg = Instr.getRegister();
+    if (VerboseAsm) {
+      Streamer.AddComment("DW_CFA_restore");
+      Streamer.AddComment(Twine("Reg ") + Twine(Reg));
+    }
+    Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1);
+    return;
+  }
+  case MCCFIInstruction::OpEscape:
+    if (VerboseAsm) Streamer.AddComment("Escape bytes");
+    Streamer.EmitBytes(Instr.getValues());
+    return;
+  }
+  llvm_unreachable("Unhandled case in switch");
+}
+
+/// EmitFrameMoves - Emit frame instructions to describe the layout of the
+/// frame.
+void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer,
+                                    const std::vector<MCCFIInstruction> &Instrs,
+                                           MCSymbol *BaseLabel) {
+  for (unsigned i = 0, N = Instrs.size(); i < N; ++i) {
+    const MCCFIInstruction &Instr = Instrs[i];
+    MCSymbol *Label = Instr.getLabel();
+    // Throw out move if the label is invalid.
+    if (Label && !Label->isDefined()) continue; // Not emitted, in dead code.
+
+    // Advance row if new location.
+    if (BaseLabel && Label) {
+      MCSymbol *ThisSym = Label;
+      if (ThisSym != BaseLabel) {
+        if (streamer.isVerboseAsm()) streamer.AddComment("DW_CFA_advance_loc4");
+        streamer.EmitDwarfAdvanceFrameAddr(BaseLabel, ThisSym);
+        BaseLabel = ThisSym;
+      }
+    }
+
+    EmitCFIInstruction(streamer, Instr);
+  }
+}
+
+/// EmitCompactUnwind - Emit the unwind information in a compact way. If we're
+/// successful, return 'true'. Otherwise, return 'false' and it will emit the
+/// normal CIE and FDE.
+bool FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
+                                         const MCDwarfFrameInfo &Frame) {
+  MCContext &Context = Streamer.getContext();
+  const MCObjectFileInfo *MOFI = Context.getObjectFileInfo();
+  bool VerboseAsm = Streamer.isVerboseAsm();
+
+  // range-start range-length  compact-unwind-enc personality-func   lsda
+  //  _foo       LfooEnd-_foo  0x00000023          0                 0
+  //  _bar       LbarEnd-_bar  0x00000025         __gxx_personality  except_tab1
+  //
+  //   .section __LD,__compact_unwind,regular,debug
+  //
+  //   # compact unwind for _foo
+  //   .quad _foo
+  //   .set L1,LfooEnd-_foo
+  //   .long L1
+  //   .long 0x01010001
+  //   .quad 0
+  //   .quad 0
+  //
+  //   # compact unwind for _bar
+  //   .quad _bar
+  //   .set L2,LbarEnd-_bar
+  //   .long L2
+  //   .long 0x01020011
+  //   .quad __gxx_personality
+  //   .quad except_tab1
+
+  uint32_t Encoding = Frame.CompactUnwindEncoding;
+  if (!Encoding) return false;
+
+  // The encoding needs to know we have an LSDA.
+  if (Frame.Lsda)
+    Encoding |= 0x40000000;
+
+  Streamer.SwitchSection(MOFI->getCompactUnwindSection());
+
+  // Range Start
+  unsigned FDEEncoding = MOFI->getFDEEncoding(UsingCFI);
+  unsigned Size = getSizeForEncoding(Streamer, FDEEncoding);
+  if (VerboseAsm) Streamer.AddComment("Range Start");
+  Streamer.EmitSymbolValue(Frame.Function, Size);
+
+  // Range Length
+  const MCExpr *Range = MakeStartMinusEndExpr(Streamer, *Frame.Begin,
+                                              *Frame.End, 0);
+  if (VerboseAsm) Streamer.AddComment("Range Length");
+  Streamer.EmitAbsValue(Range, 4);
+
+  // Compact Encoding
+  Size = getSizeForEncoding(Streamer, dwarf::DW_EH_PE_udata4);
+  if (VerboseAsm) Streamer.AddComment("Compact Unwind Encoding: 0x" +
+                                      Twine::utohexstr(Encoding));
+  Streamer.EmitIntValue(Encoding, Size);
+
+
+  // Personality Function
+  Size = getSizeForEncoding(Streamer, dwarf::DW_EH_PE_absptr);
+  if (VerboseAsm) Streamer.AddComment("Personality Function");
+  if (Frame.Personality)
+    Streamer.EmitSymbolValue(Frame.Personality, Size);
+  else
+    Streamer.EmitIntValue(0, Size); // No personality fn
+
+  // LSDA
+  Size = getSizeForEncoding(Streamer, Frame.LsdaEncoding);
+  if (VerboseAsm) Streamer.AddComment("LSDA");
+  if (Frame.Lsda)
+    Streamer.EmitSymbolValue(Frame.Lsda, Size);
+  else
+    Streamer.EmitIntValue(0, Size); // No LSDA
+
+  return true;
+}
+
+const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
+                                          const MCSymbol *personality,
+                                          unsigned personalityEncoding,
+                                          const MCSymbol *lsda,
+                                          bool IsSignalFrame,
+                                          unsigned lsdaEncoding) {
+  MCContext &context = streamer.getContext();
+  const MCRegisterInfo &MRI = context.getRegisterInfo();
+  const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
+  bool verboseAsm = streamer.isVerboseAsm();
+
+  MCSymbol *sectionStart;
+  if (MOFI->isFunctionEHFrameSymbolPrivate() || !IsEH)
+    sectionStart = context.CreateTempSymbol();
+  else
+    sectionStart = context.GetOrCreateSymbol(Twine("EH_frame") + Twine(CIENum));
+
+  streamer.EmitLabel(sectionStart);
+  CIENum++;
+
+  MCSymbol *sectionEnd = context.CreateTempSymbol();
+
+  // Length
+  const MCExpr *Length = MakeStartMinusEndExpr(streamer, *sectionStart,
+                                               *sectionEnd, 4);
+  if (verboseAsm) streamer.AddComment("CIE Length");
+  streamer.EmitAbsValue(Length, 4);
+
+  // CIE ID
+  unsigned CIE_ID = IsEH ? 0 : -1;
+  if (verboseAsm) streamer.AddComment("CIE ID Tag");
+  streamer.EmitIntValue(CIE_ID, 4);
+
+  // Version
+  if (verboseAsm) streamer.AddComment("DW_CIE_VERSION");
+  streamer.EmitIntValue(dwarf::DW_CIE_VERSION, 1);
+
+  // Augmentation String
+  SmallString<8> Augmentation;
+  if (IsEH) {
+    if (verboseAsm) streamer.AddComment("CIE Augmentation");
+    Augmentation += "z";
+    if (personality)
+      Augmentation += "P";
+    if (lsda)
+      Augmentation += "L";
+    Augmentation += "R";
+    if (IsSignalFrame)
+      Augmentation += "S";
+    streamer.EmitBytes(Augmentation.str());
+  }
+  streamer.EmitIntValue(0, 1);
+
+  // Code Alignment Factor
+  if (verboseAsm) streamer.AddComment("CIE Code Alignment Factor");
+  streamer.EmitULEB128IntValue(1);
+
+  // Data Alignment Factor
+  if (verboseAsm) streamer.AddComment("CIE Data Alignment Factor");
+  streamer.EmitSLEB128IntValue(getDataAlignmentFactor(streamer));
+
+  // Return Address Register
+  if (verboseAsm) streamer.AddComment("CIE Return Address Column");
+  streamer.EmitULEB128IntValue(MRI.getDwarfRegNum(MRI.getRARegister(), true));
+
+  // Augmentation Data Length (optional)
+
+  unsigned augmentationLength = 0;
+  if (IsEH) {
+    if (personality) {
+      // Personality Encoding
+      augmentationLength += 1;
+      // Personality
+      augmentationLength += getSizeForEncoding(streamer, personalityEncoding);
+    }
+    if (lsda)
+      augmentationLength += 1;
+    // Encoding of the FDE pointers
+    augmentationLength += 1;
+
+    if (verboseAsm) streamer.AddComment("Augmentation Size");
+    streamer.EmitULEB128IntValue(augmentationLength);
+
+    // Augmentation Data (optional)
+    if (personality) {
+      // Personality Encoding
+      EmitEncodingByte(streamer, personalityEncoding,
+                       "Personality Encoding");
+      // Personality
+      if (verboseAsm) streamer.AddComment("Personality");
+      EmitPersonality(streamer, *personality, personalityEncoding);
+    }
+
+    if (lsda)
+      EmitEncodingByte(streamer, lsdaEncoding, "LSDA Encoding");
+
+    // Encoding of the FDE pointers
+    EmitEncodingByte(streamer, MOFI->getFDEEncoding(UsingCFI),
+                     "FDE Encoding");
+  }
+
+  // Initial Instructions
+
+  const MCAsmInfo &MAI = context.getAsmInfo();
+  const std::vector<MachineMove> &Moves = MAI.getInitialFrameState();
+  std::vector<MCCFIInstruction> Instructions;
+
+  for (int i = 0, n = Moves.size(); i != n; ++i) {
+    MCSymbol *Label = Moves[i].getLabel();
+    const MachineLocation &Dst =
+      TranslateMachineLocation(MRI, Moves[i].getDestination());
+    const MachineLocation &Src =
+      TranslateMachineLocation(MRI, Moves[i].getSource());
+
+    if (Dst.isReg()) {
+      assert(Dst.getReg() == MachineLocation::VirtualFP);
+      assert(!Src.isReg());
+      MCCFIInstruction Inst =
+        MCCFIInstruction::createDefCfa(Label, Src.getReg(), -Src.getOffset());
+      Instructions.push_back(Inst);
+    } else {
+      assert(Src.isReg());
+      unsigned Reg = Src.getReg();
+      int Offset = Dst.getOffset();
+      MCCFIInstruction Inst =
+        MCCFIInstruction::createOffset(Label, Reg, Offset);
+      Instructions.push_back(Inst);
+    }
+  }
+
+  EmitCFIInstructions(streamer, Instructions, NULL);
+
+  // Padding
+  streamer.EmitValueToAlignment(IsEH
+                                ? 4 : context.getAsmInfo().getPointerSize());
+
+  streamer.EmitLabel(sectionEnd);
+  return *sectionStart;
+}
+
+MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
+                                    const MCSymbol &cieStart,
+                                    const MCDwarfFrameInfo &frame) {
+  MCContext &context = streamer.getContext();
+  MCSymbol *fdeStart = context.CreateTempSymbol();
+  MCSymbol *fdeEnd = context.CreateTempSymbol();
+  const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
+  bool verboseAsm = streamer.isVerboseAsm();
+
+  if (IsEH && frame.Function && !MOFI->isFunctionEHFrameSymbolPrivate()) {
+    MCSymbol *EHSym =
+      context.GetOrCreateSymbol(frame.Function->getName() + Twine(".eh"));
+    streamer.EmitEHSymAttributes(frame.Function, EHSym);
+    streamer.EmitLabel(EHSym);
+  }
+
+  // Length
+  const MCExpr *Length = MakeStartMinusEndExpr(streamer, *fdeStart, *fdeEnd, 0);
+  if (verboseAsm) streamer.AddComment("FDE Length");
+  streamer.EmitAbsValue(Length, 4);
+
+  streamer.EmitLabel(fdeStart);
+
+  // CIE Pointer
+  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  if (IsEH) {
+    const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart,
+                                                 0);
+    if (verboseAsm) streamer.AddComment("FDE CIE Offset");
+    streamer.EmitAbsValue(offset, 4);
+  } else if (!asmInfo.doesDwarfUseRelocationsAcrossSections()) {
+    const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart,
+                                                 cieStart, 0);
+    streamer.EmitAbsValue(offset, 4);
+  } else {
+    streamer.EmitSymbolValue(&cieStart, 4);
+  }
+
+  // PC Begin
+  unsigned PCEncoding = IsEH ? MOFI->getFDEEncoding(UsingCFI)
+                             : (unsigned)dwarf::DW_EH_PE_absptr;
+  unsigned PCSize = getSizeForEncoding(streamer, PCEncoding);
+  EmitSymbol(streamer, *frame.Begin, PCEncoding, "FDE initial location");
+
+  // PC Range
+  const MCExpr *Range = MakeStartMinusEndExpr(streamer, *frame.Begin,
+                                              *frame.End, 0);
+  if (verboseAsm) streamer.AddComment("FDE address range");
+  streamer.EmitAbsValue(Range, PCSize);
+
+  if (IsEH) {
+    // Augmentation Data Length
+    unsigned augmentationLength = 0;
+
+    if (frame.Lsda)
+      augmentationLength += getSizeForEncoding(streamer, frame.LsdaEncoding);
+
+    if (verboseAsm) streamer.AddComment("Augmentation size");
+    streamer.EmitULEB128IntValue(augmentationLength);
+
+    // Augmentation Data
+    if (frame.Lsda)
+      EmitSymbol(streamer, *frame.Lsda, frame.LsdaEncoding,
+                 "Language Specific Data Area");
+  }
+
+  // Call Frame Instructions
+
+  EmitCFIInstructions(streamer, frame.Instructions, frame.Begin);
+
+  // Padding
+  streamer.EmitValueToAlignment(PCSize);
+
+  return fdeEnd;
+}
+
+namespace {
+  struct CIEKey {
+    static const CIEKey getEmptyKey() { return CIEKey(0, 0, -1, false); }
+    static const CIEKey getTombstoneKey() { return CIEKey(0, -1, 0, false); }
+
+    CIEKey(const MCSymbol* Personality_, unsigned PersonalityEncoding_,
+           unsigned LsdaEncoding_, bool IsSignalFrame_) :
+      Personality(Personality_), PersonalityEncoding(PersonalityEncoding_),
+      LsdaEncoding(LsdaEncoding_), IsSignalFrame(IsSignalFrame_) {
+    }
+    const MCSymbol* Personality;
+    unsigned PersonalityEncoding;
+    unsigned LsdaEncoding;
+    bool IsSignalFrame;
+  };
+}
+
+namespace llvm {
+  template <>
+  struct DenseMapInfo<CIEKey> {
+    static CIEKey getEmptyKey() {
+      return CIEKey::getEmptyKey();
+    }
+    static CIEKey getTombstoneKey() {
+      return CIEKey::getTombstoneKey();
+    }
+    static unsigned getHashValue(const CIEKey &Key) {
+      return static_cast<unsigned>(hash_combine(Key.Personality,
+                                                Key.PersonalityEncoding,
+                                                Key.LsdaEncoding,
+                                                Key.IsSignalFrame));
+    }
+    static bool isEqual(const CIEKey &LHS,
+                        const CIEKey &RHS) {
+      return LHS.Personality == RHS.Personality &&
+        LHS.PersonalityEncoding == RHS.PersonalityEncoding &&
+        LHS.LsdaEncoding == RHS.LsdaEncoding &&
+        LHS.IsSignalFrame == RHS.IsSignalFrame;
+    }
+  };
+}
+
+void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer,
+                               bool UsingCFI,
+                               bool IsEH) {
+  MCContext &Context = Streamer.getContext();
+  MCObjectFileInfo *MOFI =
+    const_cast<MCObjectFileInfo*>(Context.getObjectFileInfo());
+  FrameEmitterImpl Emitter(UsingCFI, IsEH);
+  ArrayRef<MCDwarfFrameInfo> FrameArray = Streamer.getFrameInfos();
+
+  // Emit the compact unwind info if available.
+  if (IsEH && MOFI->getCompactUnwindSection())
+    for (unsigned i = 0, n = Streamer.getNumFrameInfos(); i < n; ++i) {
+      const MCDwarfFrameInfo &Frame = Streamer.getFrameInfo(i);
+      if (Frame.CompactUnwindEncoding)
+        Emitter.EmitCompactUnwind(Streamer, Frame);
+    }
+
+  const MCSection &Section = IsEH ? *MOFI->getEHFrameSection() :
+                                    *MOFI->getDwarfFrameSection();
+  Streamer.SwitchSection(&Section);
+  MCSymbol *SectionStart = Context.CreateTempSymbol();
+  Streamer.EmitLabel(SectionStart);
+  Emitter.setSectionStart(SectionStart);
+
+  MCSymbol *FDEEnd = NULL;
+  DenseMap<CIEKey, const MCSymbol*> CIEStarts;
+
+  const MCSymbol *DummyDebugKey = NULL;
+  for (unsigned i = 0, n = FrameArray.size(); i < n; ++i) {
+    const MCDwarfFrameInfo &Frame = FrameArray[i];
+    CIEKey Key(Frame.Personality, Frame.PersonalityEncoding,
+               Frame.LsdaEncoding, Frame.IsSignalFrame);
+    const MCSymbol *&CIEStart = IsEH ? CIEStarts[Key] : DummyDebugKey;
+    if (!CIEStart)
+      CIEStart = &Emitter.EmitCIE(Streamer, Frame.Personality,
+                                  Frame.PersonalityEncoding, Frame.Lsda,
+                                  Frame.IsSignalFrame,
+                                  Frame.LsdaEncoding);
+
+    FDEEnd = Emitter.EmitFDE(Streamer, *CIEStart, Frame);
+
+    if (i != n - 1)
+      Streamer.EmitLabel(FDEEnd);
+  }
+
+  Streamer.EmitValueToAlignment(Context.getAsmInfo().getPointerSize());
+  if (FDEEnd)
+    Streamer.EmitLabel(FDEEnd);
+}
+
+void MCDwarfFrameEmitter::EmitAdvanceLoc(MCStreamer &Streamer,
+                                         uint64_t AddrDelta) {
+  SmallString<256> Tmp;
+  raw_svector_ostream OS(Tmp);
+  MCDwarfFrameEmitter::EncodeAdvanceLoc(AddrDelta, OS);
+  Streamer.EmitBytes(OS.str());
+}
+
+void MCDwarfFrameEmitter::EncodeAdvanceLoc(uint64_t AddrDelta,
+                                           raw_ostream &OS) {
+  // FIXME: Assumes the code alignment factor is 1.
+  if (AddrDelta == 0) {
+  } else if (isUIntN(6, AddrDelta)) {
+    uint8_t Opcode = dwarf::DW_CFA_advance_loc | AddrDelta;
+    OS << Opcode;
+  } else if (isUInt<8>(AddrDelta)) {
+    OS << uint8_t(dwarf::DW_CFA_advance_loc1);
+    OS << uint8_t(AddrDelta);
+  } else if (isUInt<16>(AddrDelta)) {
+    // FIXME: check what is the correct behavior on a big endian machine.
+    OS << uint8_t(dwarf::DW_CFA_advance_loc2);
+    OS << uint8_t( AddrDelta       & 0xff);
+    OS << uint8_t((AddrDelta >> 8) & 0xff);
+  } else {
+    // FIXME: check what is the correct behavior on a big endian machine.
+    assert(isUInt<32>(AddrDelta));
+    OS << uint8_t(dwarf::DW_CFA_advance_loc4);
+    OS << uint8_t( AddrDelta        & 0xff);
+    OS << uint8_t((AddrDelta >> 8)  & 0xff);
+    OS << uint8_t((AddrDelta >> 16) & 0xff);
+    OS << uint8_t((AddrDelta >> 24) & 0xff);
+
+  }
+}
diff --git a/contrib/llvm/lib/MC/MCELF.cpp b/contrib/llvm/lib/MC/MCELF.cpp
new file mode 100644
index 000000000000..560cdbc6abae
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCELF.cpp
@@ -0,0 +1,86 @@
+//===- lib/MC/MCELF.cpp - MC ELF ------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF object file writer information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/Support/ELF.h"
+
+namespace llvm {
+
+void MCELF::SetBinding(MCSymbolData &SD, unsigned Binding) {
+  assert(Binding == ELF::STB_LOCAL || Binding == ELF::STB_GLOBAL ||
+         Binding == ELF::STB_WEAK);
+  uint32_t OtherFlags = SD.getFlags() & ~(0xf << ELF_STB_Shift);
+  SD.setFlags(OtherFlags | (Binding << ELF_STB_Shift));
+}
+
+unsigned MCELF::GetBinding(const MCSymbolData &SD) {
+  uint32_t Binding = (SD.getFlags() & (0xf << ELF_STB_Shift)) >> ELF_STB_Shift;
+  assert(Binding == ELF::STB_LOCAL || Binding == ELF::STB_GLOBAL ||
+         Binding == ELF::STB_WEAK);
+  return Binding;
+}
+
+void MCELF::SetType(MCSymbolData &SD, unsigned Type) {
+  assert(Type == ELF::STT_NOTYPE || Type == ELF::STT_OBJECT ||
+         Type == ELF::STT_FUNC || Type == ELF::STT_SECTION ||
+         Type == ELF::STT_FILE || Type == ELF::STT_COMMON ||
+         Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
+
+  uint32_t OtherFlags = SD.getFlags() & ~(0xf << ELF_STT_Shift);
+  SD.setFlags(OtherFlags | (Type << ELF_STT_Shift));
+}
+
+unsigned MCELF::GetType(const MCSymbolData &SD) {
+  uint32_t Type = (SD.getFlags() & (0xf << ELF_STT_Shift)) >> ELF_STT_Shift;
+  assert(Type == ELF::STT_NOTYPE || Type == ELF::STT_OBJECT ||
+         Type == ELF::STT_FUNC || Type == ELF::STT_SECTION ||
+         Type == ELF::STT_FILE || Type == ELF::STT_COMMON ||
+         Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
+  return Type;
+}
+
+// Visibility is stored in the first two bits of st_other
+// st_other values are stored in the second byte of get/setFlags
+void MCELF::SetVisibility(MCSymbolData &SD, unsigned Visibility) {
+  assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL ||
+         Visibility == ELF::STV_HIDDEN || Visibility == ELF::STV_PROTECTED);
+
+  uint32_t OtherFlags = SD.getFlags() & ~(0x3 << ELF_STV_Shift);
+  SD.setFlags(OtherFlags | (Visibility << ELF_STV_Shift));
+}
+
+unsigned MCELF::GetVisibility(MCSymbolData &SD) {
+  unsigned Visibility =
+    (SD.getFlags() & (0x3 << ELF_STV_Shift)) >> ELF_STV_Shift;
+  assert(Visibility == ELF::STV_DEFAULT || Visibility == ELF::STV_INTERNAL ||
+         Visibility == ELF::STV_HIDDEN || Visibility == ELF::STV_PROTECTED);
+  return Visibility;
+}
+
+// Other is stored in the last six bits of st_other
+// st_other values are stored in the second byte of get/setFlags
+void MCELF::setOther(MCSymbolData &SD, unsigned Other) {
+  uint32_t OtherFlags = SD.getFlags() & ~(0x3f << ELF_Other_Shift);
+  SD.setFlags(OtherFlags | (Other << ELF_Other_Shift));
+}
+
+unsigned MCELF::getOther(MCSymbolData &SD) {
+  unsigned Other =
+    (SD.getFlags() & (0x3f << ELF_Other_Shift)) >> ELF_Other_Shift;
+  return Other;
+}
+
+}
diff --git a/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp b/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp
new file mode 100644
index 000000000000..4cac84d66609
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp
@@ -0,0 +1,51 @@
+//===-- MCELFObjectTargetWriter.cpp - ELF Target Writer Subclass ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+
+using namespace llvm;
+
+MCELFObjectTargetWriter::MCELFObjectTargetWriter(bool Is64Bit_,
+                                                 uint8_t OSABI_,
+                                                 uint16_t EMachine_,
+                                                 bool HasRelocationAddend_,
+                                                 bool IsN64_)
+  : OSABI(OSABI_), EMachine(EMachine_),
+    HasRelocationAddend(HasRelocationAddend_), Is64Bit(Is64Bit_),
+    IsN64(IsN64_){
+}
+
+const MCSymbol *MCELFObjectTargetWriter::ExplicitRelSym(const MCAssembler &Asm,
+                                                        const MCValue &Target,
+                                                        const MCFragment &F,
+                                                        const MCFixup &Fixup,
+                                                        bool IsPCRel) const {
+  return NULL;
+}
+
+const MCSymbol *MCELFObjectTargetWriter::undefinedExplicitRelSym(const MCValue &Target,
+                                                                 const MCFixup &Fixup,
+                                                                 bool IsPCRel) const {
+  const MCSymbol &Symbol = Target.getSymA()->getSymbol();
+  return &Symbol.AliasedSymbol();
+}
+
+void MCELFObjectTargetWriter::adjustFixupOffset(const MCFixup &Fixup,
+                                                uint64_t &RelocOffset) {
+}
+
+void
+MCELFObjectTargetWriter::sortRelocs(const MCAssembler &Asm,
+                                    std::vector<ELFRelocationEntry> &Relocs) {
+  // Sort by the r_offset, just like gnu as does.
+  array_pod_sort(Relocs.begin(), Relocs.end());
+}
diff --git a/contrib/llvm/lib/MC/MCELFStreamer.cpp b/contrib/llvm/lib/MC/MCELFStreamer.cpp
new file mode 100644
index 000000000000..7f5f1b63e5fe
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCELFStreamer.cpp
@@ -0,0 +1,552 @@
+//===- lib/MC/MCELFStreamer.cpp - ELF Object Output -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits ELF .o object files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+
+inline void MCELFStreamer::SetSection(StringRef Section, unsigned Type,
+                                      unsigned Flags, SectionKind Kind) {
+  SwitchSection(getContext().getELFSection(Section, Type, Flags, Kind));
+}
+
+inline void MCELFStreamer::SetSectionData() {
+  SetSection(".data",
+             ELF::SHT_PROGBITS,
+             ELF::SHF_WRITE | ELF::SHF_ALLOC,
+             SectionKind::getDataRel());
+  EmitCodeAlignment(4, 0);
+}
+
+inline void MCELFStreamer::SetSectionText() {
+  SetSection(".text",
+             ELF::SHT_PROGBITS,
+             ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
+             SectionKind::getText());
+  EmitCodeAlignment(4, 0);
+}
+
+inline void MCELFStreamer::SetSectionBss() {
+  SetSection(".bss",
+             ELF::SHT_NOBITS,
+             ELF::SHF_WRITE | ELF::SHF_ALLOC,
+             SectionKind::getBSS());
+  EmitCodeAlignment(4, 0);
+}
+
+MCELFStreamer::~MCELFStreamer() {
+}
+
+void MCELFStreamer::InitToTextSection() {
+  SetSectionText();
+}
+
+void MCELFStreamer::InitSections() {
+  // This emulates the same behavior of GNU as. This makes it easier
+  // to compare the output as the major sections are in the same order.
+  SetSectionText();
+  SetSectionData();
+  SetSectionBss();
+  SetSectionText();
+}
+
+void MCELFStreamer::EmitLabel(MCSymbol *Symbol) {
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+
+  MCObjectStreamer::EmitLabel(Symbol);
+
+  const MCSectionELF &Section =
+    static_cast<const MCSectionELF&>(Symbol->getSection());
+  MCSymbolData &SD = getAssembler().getSymbolData(*Symbol);
+  if (Section.getFlags() & ELF::SHF_TLS)
+    MCELF::SetType(SD, ELF::STT_TLS);
+}
+
+void MCELFStreamer::EmitDebugLabel(MCSymbol *Symbol) {
+  EmitLabel(Symbol);
+}
+
+void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  switch (Flag) {
+  case MCAF_SyntaxUnified: return; // no-op here.
+  case MCAF_Code16: return; // Change parsing mode; no-op here.
+  case MCAF_Code32: return; // Change parsing mode; no-op here.
+  case MCAF_Code64: return; // Change parsing mode; no-op here.
+  case MCAF_SubsectionsViaSymbols:
+    getAssembler().setSubsectionsViaSymbols(true);
+    return;
+  }
+
+  llvm_unreachable("invalid assembler flag!");
+}
+
+void MCELFStreamer::ChangeSection(const MCSection *Section) {
+  MCSectionData *CurSection = getCurrentSectionData();
+  if (CurSection && CurSection->isBundleLocked())
+    report_fatal_error("Unterminated .bundle_lock when changing a section");
+  const MCSymbol *Grp = static_cast<const MCSectionELF *>(Section)->getGroup();
+  if (Grp)
+    getAssembler().getOrCreateSymbolData(*Grp);
+  this->MCObjectStreamer::ChangeSection(Section);
+}
+
+void MCELFStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
+  getAssembler().getOrCreateSymbolData(*Symbol);
+  MCSymbolData &AliasSD = getAssembler().getOrCreateSymbolData(*Alias);
+  AliasSD.setFlags(AliasSD.getFlags() | ELF_Other_Weakref);
+  const MCExpr *Value = MCSymbolRefExpr::Create(Symbol, getContext());
+  Alias->setVariableValue(Value);
+}
+
+void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+                                          MCSymbolAttr Attribute) {
+  // Indirect symbols are handled differently, to match how 'as' handles
+  // them. This makes writing matching .o files easier.
+  if (Attribute == MCSA_IndirectSymbol) {
+    // Note that we intentionally cannot use the symbol data here; this is
+    // important for matching the string table that 'as' generates.
+    IndirectSymbolData ISD;
+    ISD.Symbol = Symbol;
+    ISD.SectionData = getCurrentSectionData();
+    getAssembler().getIndirectSymbols().push_back(ISD);
+    return;
+  }
+
+  // Adding a symbol attribute always introduces the symbol, note that an
+  // important side effect of calling getOrCreateSymbolData here is to register
+  // the symbol with the assembler.
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+
+  // The implementation of symbol attributes is designed to match 'as', but it
+  // leaves much to desired. It doesn't really make sense to arbitrarily add and
+  // remove flags, but 'as' allows this (in particular, see .desc).
+  //
+  // In the future it might be worth trying to make these operations more well
+  // defined.
+  switch (Attribute) {
+  case MCSA_LazyReference:
+  case MCSA_Reference:
+  case MCSA_SymbolResolver:
+  case MCSA_PrivateExtern:
+  case MCSA_WeakDefinition:
+  case MCSA_WeakDefAutoPrivate:
+  case MCSA_Invalid:
+  case MCSA_IndirectSymbol:
+    llvm_unreachable("Invalid symbol attribute for ELF!");
+
+  case MCSA_NoDeadStrip:
+  case MCSA_ELF_TypeGnuUniqueObject:
+    // Ignore for now.
+    break;
+
+  case MCSA_Global:
+    MCELF::SetBinding(SD, ELF::STB_GLOBAL);
+    SD.setExternal(true);
+    BindingExplicitlySet.insert(Symbol);
+    break;
+
+  case MCSA_WeakReference:
+  case MCSA_Weak:
+    MCELF::SetBinding(SD, ELF::STB_WEAK);
+    SD.setExternal(true);
+    BindingExplicitlySet.insert(Symbol);
+    break;
+
+  case MCSA_Local:
+    MCELF::SetBinding(SD, ELF::STB_LOCAL);
+    SD.setExternal(false);
+    BindingExplicitlySet.insert(Symbol);
+    break;
+
+  case MCSA_ELF_TypeFunction:
+    MCELF::SetType(SD, ELF::STT_FUNC);
+    break;
+
+  case MCSA_ELF_TypeIndFunction:
+    MCELF::SetType(SD, ELF::STT_GNU_IFUNC);
+    break;
+
+  case MCSA_ELF_TypeObject:
+    MCELF::SetType(SD, ELF::STT_OBJECT);
+    break;
+
+  case MCSA_ELF_TypeTLS:
+    MCELF::SetType(SD, ELF::STT_TLS);
+    break;
+
+  case MCSA_ELF_TypeCommon:
+    MCELF::SetType(SD, ELF::STT_COMMON);
+    break;
+
+  case MCSA_ELF_TypeNoType:
+    MCELF::SetType(SD, ELF::STT_NOTYPE);
+    break;
+
+  case MCSA_Protected:
+    MCELF::SetVisibility(SD, ELF::STV_PROTECTED);
+    break;
+
+  case MCSA_Hidden:
+    MCELF::SetVisibility(SD, ELF::STV_HIDDEN);
+    break;
+
+  case MCSA_Internal:
+    MCELF::SetVisibility(SD, ELF::STV_INTERNAL);
+    break;
+  }
+}
+
+void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                       unsigned ByteAlignment) {
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+
+  if (!BindingExplicitlySet.count(Symbol)) {
+    MCELF::SetBinding(SD, ELF::STB_GLOBAL);
+    SD.setExternal(true);
+  }
+
+  MCELF::SetType(SD, ELF::STT_OBJECT);
+
+  if (MCELF::GetBinding(SD) == ELF_STB_Local) {
+    const MCSection *Section = getAssembler().getContext().getELFSection(".bss",
+                                                         ELF::SHT_NOBITS,
+                                                         ELF::SHF_WRITE |
+                                                         ELF::SHF_ALLOC,
+                                                         SectionKind::getBSS());
+    Symbol->setSection(*Section);
+
+    struct LocalCommon L = {&SD, Size, ByteAlignment};
+    LocalCommons.push_back(L);
+  } else {
+    SD.setCommon(Size, ByteAlignment);
+  }
+
+  SD.setSize(MCConstantExpr::Create(Size, getContext()));
+}
+
+void MCELFStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+  SD.setSize(Value);
+}
+
+void MCELFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                          unsigned ByteAlignment) {
+  // FIXME: Should this be caught and done earlier?
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+  MCELF::SetBinding(SD, ELF::STB_LOCAL);
+  SD.setExternal(false);
+  BindingExplicitlySet.insert(Symbol);
+  EmitCommonSymbol(Symbol, Size, ByteAlignment);
+}
+
+void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+                                  unsigned AddrSpace) {
+  if (getCurrentSectionData()->isBundleLocked())
+    report_fatal_error("Emitting values inside a locked bundle is forbidden");
+  fixSymbolsInTLSFixups(Value);
+  MCObjectStreamer::EmitValueImpl(Value, Size, AddrSpace);
+}
+
+void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
+                                         int64_t Value,
+                                         unsigned ValueSize,
+                                         unsigned MaxBytesToEmit) {
+  if (getCurrentSectionData()->isBundleLocked())
+    report_fatal_error("Emitting values inside a locked bundle is forbidden");
+  MCObjectStreamer::EmitValueToAlignment(ByteAlignment, Value,
+                                         ValueSize, MaxBytesToEmit);
+}
+
+
+// Add a symbol for the file name of this module. This is the second
+// entry in the module's symbol table (the first being the null symbol).
+void MCELFStreamer::EmitFileDirective(StringRef Filename) {
+  MCSymbol *Symbol = getAssembler().getContext().GetOrCreateSymbol(Filename);
+  Symbol->setSection(*getCurrentSection());
+  Symbol->setAbsolute();
+
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+
+  SD.setFlags(ELF_STT_File | ELF_STB_Local | ELF_STV_Default);
+}
+
+void  MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
+  switch (expr->getKind()) {
+  case MCExpr::Target:
+    cast<MCTargetExpr>(expr)->fixELFSymbolsInTLSFixups(getAssembler());
+    break;
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *be = cast<MCBinaryExpr>(expr);
+    fixSymbolsInTLSFixups(be->getLHS());
+    fixSymbolsInTLSFixups(be->getRHS());
+    break;
+  }
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr &symRef = *cast<MCSymbolRefExpr>(expr);
+    switch (symRef.getKind()) {
+    default:
+      return;
+    case MCSymbolRefExpr::VK_GOTTPOFF:
+    case MCSymbolRefExpr::VK_INDNTPOFF:
+    case MCSymbolRefExpr::VK_NTPOFF:
+    case MCSymbolRefExpr::VK_GOTNTPOFF:
+    case MCSymbolRefExpr::VK_TLSGD:
+    case MCSymbolRefExpr::VK_TLSLD:
+    case MCSymbolRefExpr::VK_TLSLDM:
+    case MCSymbolRefExpr::VK_TPOFF:
+    case MCSymbolRefExpr::VK_DTPOFF:
+    case MCSymbolRefExpr::VK_ARM_TLSGD:
+    case MCSymbolRefExpr::VK_ARM_TPOFF:
+    case MCSymbolRefExpr::VK_ARM_GOTTPOFF:
+    case MCSymbolRefExpr::VK_Mips_TLSGD:
+    case MCSymbolRefExpr::VK_Mips_GOTTPREL:
+    case MCSymbolRefExpr::VK_Mips_TPREL_HI:
+    case MCSymbolRefExpr::VK_Mips_TPREL_LO:
+    case MCSymbolRefExpr::VK_PPC_TPREL16_HA:
+    case MCSymbolRefExpr::VK_PPC_TPREL16_LO:
+    case MCSymbolRefExpr::VK_PPC_DTPREL16_HA:
+    case MCSymbolRefExpr::VK_PPC_DTPREL16_LO:
+    case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_HA:
+    case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_LO:
+    case MCSymbolRefExpr::VK_PPC_TLS:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_HA:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_LO:
+    case MCSymbolRefExpr::VK_PPC_TLSGD:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO:
+    case MCSymbolRefExpr::VK_PPC_TLSLD:
+      break;
+    }
+    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(symRef.getSymbol());
+    MCELF::SetType(SD, ELF::STT_TLS);
+    break;
+  }
+
+  case MCExpr::Unary:
+    fixSymbolsInTLSFixups(cast<MCUnaryExpr>(expr)->getSubExpr());
+    break;
+  }
+}
+
+void MCELFStreamer::EmitInstToFragment(const MCInst &Inst) {
+  this->MCObjectStreamer::EmitInstToFragment(Inst);
+  MCRelaxableFragment &F = *cast<MCRelaxableFragment>(getCurrentFragment());
+
+  for (unsigned i = 0, e = F.getFixups().size(); i != e; ++i)
+    fixSymbolsInTLSFixups(F.getFixups()[i].getValue());
+}
+
+void MCELFStreamer::EmitInstToData(const MCInst &Inst) {
+  MCAssembler &Assembler = getAssembler();
+  SmallVector<MCFixup, 4> Fixups;
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code);
+  Assembler.getEmitter().EncodeInstruction(Inst, VecOS, Fixups);
+  VecOS.flush();
+
+  for (unsigned i = 0, e = Fixups.size(); i != e; ++i)
+    fixSymbolsInTLSFixups(Fixups[i].getValue());
+
+  // There are several possibilities here:
+  //
+  // If bundling is disabled, append the encoded instruction to the current data
+  // fragment (or create a new such fragment if the current fragment is not a
+  // data fragment).
+  //
+  // If bundling is enabled:
+  // - If we're not in a bundle-locked group, emit the instruction into a
+  //   fragment of its own. If there are no fixups registered for the
+  //   instruction, emit a MCCompactEncodedInstFragment. Otherwise, emit a
+  //   MCDataFragment.
+  // - If we're in a bundle-locked group, append the instruction to the current
+  //   data fragment because we want all the instructions in a group to get into
+  //   the same fragment. Be careful not to do that for the first instruction in
+  //   the group, though.
+  MCDataFragment *DF;
+
+  if (Assembler.isBundlingEnabled()) {
+    MCSectionData *SD = getCurrentSectionData();
+    if (SD->isBundleLocked() && !SD->isBundleGroupBeforeFirstInst())
+      // If we are bundle-locked, we re-use the current fragment.
+      // The bundle-locking directive ensures this is a new data fragment.
+      DF = cast<MCDataFragment>(getCurrentFragment());
+    else if (!SD->isBundleLocked() && Fixups.size() == 0) {
+      // Optimize memory usage by emitting the instruction to a
+      // MCCompactEncodedInstFragment when not in a bundle-locked group and
+      // there are no fixups registered.
+      MCCompactEncodedInstFragment *CEIF = new MCCompactEncodedInstFragment(SD);
+      CEIF->getContents().append(Code.begin(), Code.end());
+      return;
+    } else {
+      DF = new MCDataFragment(SD);
+      if (SD->getBundleLockState() == MCSectionData::BundleLockedAlignToEnd) {
+        // If this is a new fragment created for a bundle-locked group, and the
+        // group was marked as "align_to_end", set a flag in the fragment.
+        DF->setAlignToBundleEnd(true);
+      }
+    }
+
+    // We're now emitting an instruction in a bundle group, so this flag has
+    // to be turned off.
+    SD->setBundleGroupBeforeFirstInst(false);
+  } else {
+    DF = getOrCreateDataFragment();
+  }
+
+  // Add the fixups and data.
+  for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
+    Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
+    DF->getFixups().push_back(Fixups[i]);
+  }
+  DF->setHasInstructions(true);
+  DF->getContents().append(Code.begin(), Code.end());
+}
+
+void MCELFStreamer::EmitBundleAlignMode(unsigned AlignPow2) {
+  assert(AlignPow2 <= 30 && "Invalid bundle alignment");
+  MCAssembler &Assembler = getAssembler();
+  if (Assembler.getBundleAlignSize() == 0 && AlignPow2 > 0)
+    Assembler.setBundleAlignSize(1 << AlignPow2);
+  else
+    report_fatal_error(".bundle_align_mode should be only set once per file");
+}
+
+void MCELFStreamer::EmitBundleLock(bool AlignToEnd) {
+  MCSectionData *SD = getCurrentSectionData();
+
+  // Sanity checks
+  //
+  if (!getAssembler().isBundlingEnabled())
+    report_fatal_error(".bundle_lock forbidden when bundling is disabled");
+  else if (SD->isBundleLocked())
+    report_fatal_error("Nesting of .bundle_lock is forbidden");
+
+  SD->setBundleLockState(AlignToEnd ? MCSectionData::BundleLockedAlignToEnd :
+                                      MCSectionData::BundleLocked);
+  SD->setBundleGroupBeforeFirstInst(true);
+}
+
+void MCELFStreamer::EmitBundleUnlock() {
+  MCSectionData *SD = getCurrentSectionData();
+
+  // Sanity checks
+  if (!getAssembler().isBundlingEnabled())
+    report_fatal_error(".bundle_unlock forbidden when bundling is disabled");
+  else if (!SD->isBundleLocked())
+    report_fatal_error(".bundle_unlock without matching lock");
+  else if (SD->isBundleGroupBeforeFirstInst())
+    report_fatal_error("Empty bundle-locked group is forbidden");
+
+  SD->setBundleLockState(MCSectionData::NotBundleLocked);
+}
+
+void MCELFStreamer::FinishImpl() {
+  EmitFrames(true);
+
+  for (std::vector<LocalCommon>::const_iterator i = LocalCommons.begin(),
+                                                e = LocalCommons.end();
+       i != e; ++i) {
+    MCSymbolData *SD = i->SD;
+    uint64_t Size = i->Size;
+    unsigned ByteAlignment = i->ByteAlignment;
+    const MCSymbol &Symbol = SD->getSymbol();
+    const MCSection &Section = Symbol.getSection();
+
+    MCSectionData &SectData = getAssembler().getOrCreateSectionData(Section);
+    new MCAlignFragment(ByteAlignment, 0, 1, ByteAlignment, &SectData);
+
+    MCFragment *F = new MCFillFragment(0, 0, Size, &SectData);
+    SD->setFragment(F);
+
+    // Update the maximum alignment of the section if necessary.
+    if (ByteAlignment > SectData.getAlignment())
+      SectData.setAlignment(ByteAlignment);
+  }
+
+  this->MCObjectStreamer::FinishImpl();
+}
+void MCELFStreamer::EmitTCEntry(const MCSymbol &S) {
+  // Creates a R_PPC64_TOC relocation
+  MCObjectStreamer::EmitSymbolValue(&S, 8);
+}
+
+MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                    raw_ostream &OS, MCCodeEmitter *CE,
+                                    bool RelaxAll, bool NoExecStack) {
+  MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+  if (NoExecStack)
+    S->getAssembler().setNoExecStack(true);
+  return S;
+}
+
+void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) {
+  llvm_unreachable("Generic ELF doesn't support this directive");
+}
+
+MCSymbolData &MCELFStreamer::getOrCreateSymbolData(MCSymbol *Symbol) {
+  return getAssembler().getOrCreateSymbolData(*Symbol);
+}
+
+void MCELFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+  llvm_unreachable("ELF doesn't support this directive");
+}
+
+void MCELFStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {
+  llvm_unreachable("ELF doesn't support this directive");
+}
+
+void MCELFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {
+  llvm_unreachable("ELF doesn't support this directive");
+}
+
+void MCELFStreamer::EmitCOFFSymbolType(int Type) {
+  llvm_unreachable("ELF doesn't support this directive");
+}
+
+void MCELFStreamer::EndCOFFSymbolDef() {
+  llvm_unreachable("ELF doesn't support this directive");
+}
+
+void MCELFStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+                                 uint64_t Size, unsigned ByteAlignment) {
+  llvm_unreachable("ELF doesn't support this directive");
+}
+
+void MCELFStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                                   uint64_t Size, unsigned ByteAlignment) {
+  llvm_unreachable("ELF doesn't support this directive");
+}
diff --git a/contrib/llvm/lib/MC/MCExpr.cpp b/contrib/llvm/lib/MC/MCExpr.cpp
new file mode 100644
index 000000000000..cd4d144575b1
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCExpr.cpp
@@ -0,0 +1,658 @@
+//===- MCExpr.cpp - Assembly Level Expression Implementation --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mcexpr"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+namespace stats {
+STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations");
+}
+}
+
+void MCExpr::print(raw_ostream &OS) const {
+  switch (getKind()) {
+  case MCExpr::Target:
+    return cast<MCTargetExpr>(this)->PrintImpl(OS);
+  case MCExpr::Constant:
+    OS << cast<MCConstantExpr>(*this).getValue();
+    return;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr &SRE = cast<MCSymbolRefExpr>(*this);
+    const MCSymbol &Sym = SRE.getSymbol();
+    // Parenthesize names that start with $ so that they don't look like
+    // absolute names.
+    bool UseParens = Sym.getName()[0] == '$';
+
+    if (SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_HA16 ||
+        SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_LO16) {
+      OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
+      UseParens = true;
+    }
+
+    if (UseParens)
+      OS << '(' << Sym << ')';
+    else
+      OS << Sym;
+
+    if (SRE.getKind() == MCSymbolRefExpr::VK_ARM_NONE ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_PLT ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_TLSGD ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOT ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTOFF ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_TPOFF ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTTPOFF ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1 ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET2 ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_PREL31)
+      OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
+    else if (SRE.getKind() != MCSymbolRefExpr::VK_None &&
+             SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 &&
+             SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_LO16)
+      OS << '@' << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
+
+    return;
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr &UE = cast<MCUnaryExpr>(*this);
+    switch (UE.getOpcode()) {
+    case MCUnaryExpr::LNot:  OS << '!'; break;
+    case MCUnaryExpr::Minus: OS << '-'; break;
+    case MCUnaryExpr::Not:   OS << '~'; break;
+    case MCUnaryExpr::Plus:  OS << '+'; break;
+    }
+    OS << *UE.getSubExpr();
+    return;
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr &BE = cast<MCBinaryExpr>(*this);
+
+    // Only print parens around the LHS if it is non-trivial.
+    if (isa<MCConstantExpr>(BE.getLHS()) || isa<MCSymbolRefExpr>(BE.getLHS())) {
+      OS << *BE.getLHS();
+    } else {
+      OS << '(' << *BE.getLHS() << ')';
+    }
+
+    switch (BE.getOpcode()) {
+    case MCBinaryExpr::Add:
+      // Print "X-42" instead of "X+-42".
+      if (const MCConstantExpr *RHSC = dyn_cast<MCConstantExpr>(BE.getRHS())) {
+        if (RHSC->getValue() < 0) {
+          OS << RHSC->getValue();
+          return;
+        }
+      }
+
+      OS <<  '+';
+      break;
+    case MCBinaryExpr::And:  OS <<  '&'; break;
+    case MCBinaryExpr::Div:  OS <<  '/'; break;
+    case MCBinaryExpr::EQ:   OS << "=="; break;
+    case MCBinaryExpr::GT:   OS <<  '>'; break;
+    case MCBinaryExpr::GTE:  OS << ">="; break;
+    case MCBinaryExpr::LAnd: OS << "&&"; break;
+    case MCBinaryExpr::LOr:  OS << "||"; break;
+    case MCBinaryExpr::LT:   OS <<  '<'; break;
+    case MCBinaryExpr::LTE:  OS << "<="; break;
+    case MCBinaryExpr::Mod:  OS <<  '%'; break;
+    case MCBinaryExpr::Mul:  OS <<  '*'; break;
+    case MCBinaryExpr::NE:   OS << "!="; break;
+    case MCBinaryExpr::Or:   OS <<  '|'; break;
+    case MCBinaryExpr::Shl:  OS << "<<"; break;
+    case MCBinaryExpr::Shr:  OS << ">>"; break;
+    case MCBinaryExpr::Sub:  OS <<  '-'; break;
+    case MCBinaryExpr::Xor:  OS <<  '^'; break;
+    }
+
+    // Only print parens around the LHS if it is non-trivial.
+    if (isa<MCConstantExpr>(BE.getRHS()) || isa<MCSymbolRefExpr>(BE.getRHS())) {
+      OS << *BE.getRHS();
+    } else {
+      OS << '(' << *BE.getRHS() << ')';
+    }
+    return;
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCExpr::dump() const {
+  print(dbgs());
+  dbgs() << '\n';
+}
+#endif
+
+/* *** */
+
+const MCBinaryExpr *MCBinaryExpr::Create(Opcode Opc, const MCExpr *LHS,
+                                         const MCExpr *RHS, MCContext &Ctx) {
+  return new (Ctx) MCBinaryExpr(Opc, LHS, RHS);
+}
+
+const MCUnaryExpr *MCUnaryExpr::Create(Opcode Opc, const MCExpr *Expr,
+                                       MCContext &Ctx) {
+  return new (Ctx) MCUnaryExpr(Opc, Expr);
+}
+
+const MCConstantExpr *MCConstantExpr::Create(int64_t Value, MCContext &Ctx) {
+  return new (Ctx) MCConstantExpr(Value);
+}
+
+/* *** */
+
+const MCSymbolRefExpr *MCSymbolRefExpr::Create(const MCSymbol *Sym,
+                                               VariantKind Kind,
+                                               MCContext &Ctx) {
+  return new (Ctx) MCSymbolRefExpr(Sym, Kind);
+}
+
+const MCSymbolRefExpr *MCSymbolRefExpr::Create(StringRef Name, VariantKind Kind,
+                                               MCContext &Ctx) {
+  return Create(Ctx.GetOrCreateSymbol(Name), Kind, Ctx);
+}
+
+StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
+  switch (Kind) {
+  case VK_Invalid: return "<<invalid>>";
+  case VK_None: return "<<none>>";
+
+  case VK_GOT: return "GOT";
+  case VK_GOTOFF: return "GOTOFF";
+  case VK_GOTPCREL: return "GOTPCREL";
+  case VK_GOTTPOFF: return "GOTTPOFF";
+  case VK_INDNTPOFF: return "INDNTPOFF";
+  case VK_NTPOFF: return "NTPOFF";
+  case VK_GOTNTPOFF: return "GOTNTPOFF";
+  case VK_PLT: return "PLT";
+  case VK_TLSGD: return "TLSGD";
+  case VK_TLSLD: return "TLSLD";
+  case VK_TLSLDM: return "TLSLDM";
+  case VK_TPOFF: return "TPOFF";
+  case VK_DTPOFF: return "DTPOFF";
+  case VK_TLVP: return "TLVP";
+  case VK_SECREL: return "SECREL32";
+  case VK_ARM_NONE: return "(NONE)";
+  case VK_ARM_PLT: return "(PLT)";
+  case VK_ARM_GOT: return "(GOT)";
+  case VK_ARM_GOTOFF: return "(GOTOFF)";
+  case VK_ARM_TPOFF: return "(tpoff)";
+  case VK_ARM_GOTTPOFF: return "(gottpoff)";
+  case VK_ARM_TLSGD: return "(tlsgd)";
+  case VK_ARM_TARGET1: return "(target1)";
+  case VK_ARM_TARGET2: return "(target2)";
+  case VK_ARM_PREL31: return "(prel31)";
+  case VK_PPC_TOC: return "tocbase";
+  case VK_PPC_TOC_ENTRY: return "toc";
+  case VK_PPC_DARWIN_HA16: return "ha16";
+  case VK_PPC_DARWIN_LO16: return "lo16";
+  case VK_PPC_GAS_HA16: return "ha";
+  case VK_PPC_GAS_LO16: return "l";
+  case VK_PPC_TPREL16_HA: return "tprel@ha";
+  case VK_PPC_TPREL16_LO: return "tprel@l";
+  case VK_PPC_DTPREL16_HA: return "dtprel@ha";
+  case VK_PPC_DTPREL16_LO: return "dtprel@l";
+  case VK_PPC_TOC16_HA: return "toc@ha";
+  case VK_PPC_TOC16_LO: return "toc@l";
+  case VK_PPC_GOT_TPREL16_HA: return "got@tprel@ha";
+  case VK_PPC_GOT_TPREL16_LO: return "got@tprel@l";
+  case VK_PPC_TLS: return "tls";
+  case VK_PPC_GOT_TLSGD16_HA: return "got@tlsgd@ha";
+  case VK_PPC_GOT_TLSGD16_LO: return "got@tlsgd@l";
+  case VK_PPC_GOT_TLSLD16_HA: return "got@tlsld@ha";
+  case VK_PPC_GOT_TLSLD16_LO: return "got@tlsld@l";
+  case VK_PPC_TLSGD: return "tlsgd";
+  case VK_PPC_TLSLD: return "tlsld";
+  case VK_Mips_GPREL: return "GPREL";
+  case VK_Mips_GOT_CALL: return "GOT_CALL";
+  case VK_Mips_GOT16: return "GOT16";
+  case VK_Mips_GOT: return "GOT";
+  case VK_Mips_ABS_HI: return "ABS_HI";
+  case VK_Mips_ABS_LO: return "ABS_LO";
+  case VK_Mips_TLSGD: return "TLSGD";
+  case VK_Mips_TLSLDM: return "TLSLDM";
+  case VK_Mips_DTPREL_HI: return "DTPREL_HI";
+  case VK_Mips_DTPREL_LO: return "DTPREL_LO";
+  case VK_Mips_GOTTPREL: return "GOTTPREL";
+  case VK_Mips_TPREL_HI: return "TPREL_HI";
+  case VK_Mips_TPREL_LO: return "TPREL_LO";
+  case VK_Mips_GPOFF_HI: return "GPOFF_HI";
+  case VK_Mips_GPOFF_LO: return "GPOFF_LO";
+  case VK_Mips_GOT_DISP: return "GOT_DISP";
+  case VK_Mips_GOT_PAGE: return "GOT_PAGE";
+  case VK_Mips_GOT_OFST: return "GOT_OFST";
+  case VK_Mips_HIGHER:   return "HIGHER";
+  case VK_Mips_HIGHEST:  return "HIGHEST";
+  case VK_Mips_GOT_HI16: return "GOT_HI16";
+  case VK_Mips_GOT_LO16: return "GOT_LO16";
+  case VK_Mips_CALL_HI16: return "CALL_HI16";
+  case VK_Mips_CALL_LO16: return "CALL_LO16";
+  }
+  llvm_unreachable("Invalid variant kind");
+}
+
+MCSymbolRefExpr::VariantKind
+MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
+  return StringSwitch<VariantKind>(Name)
+    .Case("GOT", VK_GOT)
+    .Case("got", VK_GOT)
+    .Case("GOTOFF", VK_GOTOFF)
+    .Case("gotoff", VK_GOTOFF)
+    .Case("GOTPCREL", VK_GOTPCREL)
+    .Case("gotpcrel", VK_GOTPCREL)
+    .Case("GOTTPOFF", VK_GOTTPOFF)
+    .Case("gottpoff", VK_GOTTPOFF)
+    .Case("INDNTPOFF", VK_INDNTPOFF)
+    .Case("indntpoff", VK_INDNTPOFF)
+    .Case("NTPOFF", VK_NTPOFF)
+    .Case("ntpoff", VK_NTPOFF)
+    .Case("GOTNTPOFF", VK_GOTNTPOFF)
+    .Case("gotntpoff", VK_GOTNTPOFF)
+    .Case("PLT", VK_PLT)
+    .Case("plt", VK_PLT)
+    .Case("TLSGD", VK_TLSGD)
+    .Case("tlsgd", VK_TLSGD)
+    .Case("TLSLD", VK_TLSLD)
+    .Case("tlsld", VK_TLSLD)
+    .Case("TLSLDM", VK_TLSLDM)
+    .Case("tlsldm", VK_TLSLDM)
+    .Case("TPOFF", VK_TPOFF)
+    .Case("tpoff", VK_TPOFF)
+    .Case("DTPOFF", VK_DTPOFF)
+    .Case("dtpoff", VK_DTPOFF)
+    .Case("TLVP", VK_TLVP)
+    .Case("tlvp", VK_TLVP)
+    .Default(VK_Invalid);
+}
+
+/* *** */
+
+void MCTargetExpr::anchor() {}
+
+/* *** */
+
+bool MCExpr::EvaluateAsAbsolute(int64_t &Res) const {
+  return EvaluateAsAbsolute(Res, 0, 0, 0);
+}
+
+bool MCExpr::EvaluateAsAbsolute(int64_t &Res,
+                                const MCAsmLayout &Layout) const {
+  return EvaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, 0);
+}
+
+bool MCExpr::EvaluateAsAbsolute(int64_t &Res,
+                                const MCAsmLayout &Layout,
+                                const SectionAddrMap &Addrs) const {
+  return EvaluateAsAbsolute(Res, &Layout.getAssembler(), &Layout, &Addrs);
+}
+
+bool MCExpr::EvaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const {
+  return EvaluateAsAbsolute(Res, &Asm, 0, 0);
+}
+
+bool MCExpr::EvaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
+                                const MCAsmLayout *Layout,
+                                const SectionAddrMap *Addrs) const {
+  MCValue Value;
+
+  // Fast path constants.
+  if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(this)) {
+    Res = CE->getValue();
+    return true;
+  }
+
+  // FIXME: The use if InSet = Addrs is a hack. Setting InSet causes us
+  // absolutize differences across sections and that is what the MachO writer
+  // uses Addrs for.
+  bool IsRelocatable =
+    EvaluateAsRelocatableImpl(Value, Asm, Layout, Addrs, /*InSet*/ Addrs);
+
+  // Record the current value.
+  Res = Value.getConstant();
+
+  return IsRelocatable && Value.isAbsolute();
+}
+
+/// \brief Helper method for \see EvaluateSymbolAdd().
+static void AttemptToFoldSymbolOffsetDifference(const MCAssembler *Asm,
+                                                const MCAsmLayout *Layout,
+                                                const SectionAddrMap *Addrs,
+                                                bool InSet,
+                                                const MCSymbolRefExpr *&A,
+                                                const MCSymbolRefExpr *&B,
+                                                int64_t &Addend) {
+  if (!A || !B)
+    return;
+
+  const MCSymbol &SA = A->getSymbol();
+  const MCSymbol &SB = B->getSymbol();
+
+  if (SA.isUndefined() || SB.isUndefined())
+    return;
+
+  if (!Asm->getWriter().IsSymbolRefDifferenceFullyResolved(*Asm, A, B, InSet))
+    return;
+
+  MCSymbolData &AD = Asm->getSymbolData(SA);
+  MCSymbolData &BD = Asm->getSymbolData(SB);
+
+  if (AD.getFragment() == BD.getFragment()) {
+    Addend += (AD.getOffset() - BD.getOffset());
+
+    // Pointers to Thumb symbols need to have their low-bit set to allow
+    // for interworking.
+    if (Asm->isThumbFunc(&SA))
+      Addend |= 1;
+
+    // Clear the symbol expr pointers to indicate we have folded these
+    // operands.
+    A = B = 0;
+    return;
+  }
+
+  if (!Layout)
+    return;
+
+  const MCSectionData &SecA = *AD.getFragment()->getParent();
+  const MCSectionData &SecB = *BD.getFragment()->getParent();
+
+  if ((&SecA != &SecB) && !Addrs)
+    return;
+
+  // Eagerly evaluate.
+  Addend += (Layout->getSymbolOffset(&Asm->getSymbolData(A->getSymbol())) -
+             Layout->getSymbolOffset(&Asm->getSymbolData(B->getSymbol())));
+  if (Addrs && (&SecA != &SecB))
+    Addend += (Addrs->lookup(&SecA) - Addrs->lookup(&SecB));
+
+  // Pointers to Thumb symbols need to have their low-bit set to allow
+  // for interworking.
+  if (Asm->isThumbFunc(&SA))
+    Addend |= 1;
+
+  // Clear the symbol expr pointers to indicate we have folded these
+  // operands.
+  A = B = 0;
+}
+
+/// \brief Evaluate the result of an add between (conceptually) two MCValues.
+///
+/// This routine conceptually attempts to construct an MCValue:
+///   Result = (Result_A - Result_B + Result_Cst)
+/// from two MCValue's LHS and RHS where
+///   Result = LHS + RHS
+/// and
+///   Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst).
+///
+/// This routine attempts to aggresively fold the operands such that the result
+/// is representable in an MCValue, but may not always succeed.
+///
+/// \returns True on success, false if the result is not representable in an
+/// MCValue.
+
+/// NOTE: It is really important to have both the Asm and Layout arguments.
+/// They might look redundant, but this function can be used before layout
+/// is done (see the object streamer for example) and having the Asm argument
+/// lets us avoid relaxations early.
+static bool EvaluateSymbolicAdd(const MCAssembler *Asm,
+                                const MCAsmLayout *Layout,
+                                const SectionAddrMap *Addrs,
+                                bool InSet,
+                                const MCValue &LHS,const MCSymbolRefExpr *RHS_A,
+                                const MCSymbolRefExpr *RHS_B, int64_t RHS_Cst,
+                                MCValue &Res) {
+  // FIXME: This routine (and other evaluation parts) are *incredibly* sloppy
+  // about dealing with modifiers. This will ultimately bite us, one day.
+  const MCSymbolRefExpr *LHS_A = LHS.getSymA();
+  const MCSymbolRefExpr *LHS_B = LHS.getSymB();
+  int64_t LHS_Cst = LHS.getConstant();
+
+  // Fold the result constant immediately.
+  int64_t Result_Cst = LHS_Cst + RHS_Cst;
+
+  assert((!Layout || Asm) &&
+         "Must have an assembler object if layout is given!");
+
+  // If we have a layout, we can fold resolved differences.
+  if (Asm) {
+    // First, fold out any differences which are fully resolved. By
+    // reassociating terms in
+    //   Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst).
+    // we have the four possible differences:
+    //   (LHS_A - LHS_B),
+    //   (LHS_A - RHS_B),
+    //   (RHS_A - LHS_B),
+    //   (RHS_A - RHS_B).
+    // Since we are attempting to be as aggressive as possible about folding, we
+    // attempt to evaluate each possible alternative.
+    AttemptToFoldSymbolOffsetDifference(Asm, Layout, Addrs, InSet, LHS_A, LHS_B,
+                                        Result_Cst);
+    AttemptToFoldSymbolOffsetDifference(Asm, Layout, Addrs, InSet, LHS_A, RHS_B,
+                                        Result_Cst);
+    AttemptToFoldSymbolOffsetDifference(Asm, Layout, Addrs, InSet, RHS_A, LHS_B,
+                                        Result_Cst);
+    AttemptToFoldSymbolOffsetDifference(Asm, Layout, Addrs, InSet, RHS_A, RHS_B,
+                                        Result_Cst);
+  }
+
+  // We can't represent the addition or subtraction of two symbols.
+  if ((LHS_A && RHS_A) || (LHS_B && RHS_B))
+    return false;
+
+  // At this point, we have at most one additive symbol and one subtractive
+  // symbol -- find them.
+  const MCSymbolRefExpr *A = LHS_A ? LHS_A : RHS_A;
+  const MCSymbolRefExpr *B = LHS_B ? LHS_B : RHS_B;
+
+  // If we have a negated symbol, then we must have also have a non-negated
+  // symbol in order to encode the expression.
+  if (B && !A)
+    return false;
+
+  Res = MCValue::get(A, B, Result_Cst);
+  return true;
+}
+
+bool MCExpr::EvaluateAsRelocatable(MCValue &Res,
+                                   const MCAsmLayout &Layout) const {
+  return EvaluateAsRelocatableImpl(Res, &Layout.getAssembler(), &Layout,
+                                   0, false);
+}
+
+bool MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                       const MCAssembler *Asm,
+                                       const MCAsmLayout *Layout,
+                                       const SectionAddrMap *Addrs,
+                                       bool InSet) const {
+  ++stats::MCExprEvaluate;
+
+  switch (getKind()) {
+  case Target:
+    return cast<MCTargetExpr>(this)->EvaluateAsRelocatableImpl(Res, Layout);
+
+  case Constant:
+    Res = MCValue::get(cast<MCConstantExpr>(this)->getValue());
+    return true;
+
+  case SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(this);
+    const MCSymbol &Sym = SRE->getSymbol();
+
+    // Evaluate recursively if this is a variable.
+    if (Sym.isVariable() && SRE->getKind() == MCSymbolRefExpr::VK_None) {
+      bool Ret = Sym.getVariableValue()->EvaluateAsRelocatableImpl(Res, Asm,
+                                                                   Layout,
+                                                                   Addrs,
+                                                                   true);
+      // If we failed to simplify this to a constant, let the target
+      // handle it.
+      if (Ret && !Res.getSymA() && !Res.getSymB())
+        return true;
+    }
+
+    Res = MCValue::get(SRE, 0, 0);
+    return true;
+  }
+
+  case Unary: {
+    const MCUnaryExpr *AUE = cast<MCUnaryExpr>(this);
+    MCValue Value;
+
+    if (!AUE->getSubExpr()->EvaluateAsRelocatableImpl(Value, Asm, Layout,
+                                                      Addrs, InSet))
+      return false;
+
+    switch (AUE->getOpcode()) {
+    case MCUnaryExpr::LNot:
+      if (!Value.isAbsolute())
+        return false;
+      Res = MCValue::get(!Value.getConstant());
+      break;
+    case MCUnaryExpr::Minus:
+      /// -(a - b + const) ==> (b - a - const)
+      if (Value.getSymA() && !Value.getSymB())
+        return false;
+      Res = MCValue::get(Value.getSymB(), Value.getSymA(),
+                         -Value.getConstant());
+      break;
+    case MCUnaryExpr::Not:
+      if (!Value.isAbsolute())
+        return false;
+      Res = MCValue::get(~Value.getConstant());
+      break;
+    case MCUnaryExpr::Plus:
+      Res = Value;
+      break;
+    }
+
+    return true;
+  }
+
+  case Binary: {
+    const MCBinaryExpr *ABE = cast<MCBinaryExpr>(this);
+    MCValue LHSValue, RHSValue;
+
+    if (!ABE->getLHS()->EvaluateAsRelocatableImpl(LHSValue, Asm, Layout,
+                                                  Addrs, InSet) ||
+        !ABE->getRHS()->EvaluateAsRelocatableImpl(RHSValue, Asm, Layout,
+                                                  Addrs, InSet))
+      return false;
+
+    // We only support a few operations on non-constant expressions, handle
+    // those first.
+    if (!LHSValue.isAbsolute() || !RHSValue.isAbsolute()) {
+      switch (ABE->getOpcode()) {
+      default:
+        return false;
+      case MCBinaryExpr::Sub:
+        // Negate RHS and add.
+        return EvaluateSymbolicAdd(Asm, Layout, Addrs, InSet, LHSValue,
+                                   RHSValue.getSymB(), RHSValue.getSymA(),
+                                   -RHSValue.getConstant(),
+                                   Res);
+
+      case MCBinaryExpr::Add:
+        return EvaluateSymbolicAdd(Asm, Layout, Addrs, InSet, LHSValue,
+                                   RHSValue.getSymA(), RHSValue.getSymB(),
+                                   RHSValue.getConstant(),
+                                   Res);
+      }
+    }
+
+    // FIXME: We need target hooks for the evaluation. It may be limited in
+    // width, and gas defines the result of comparisons and right shifts
+    // differently from Apple as.
+    int64_t LHS = LHSValue.getConstant(), RHS = RHSValue.getConstant();
+    int64_t Result = 0;
+    switch (ABE->getOpcode()) {
+    case MCBinaryExpr::Add:  Result = LHS + RHS; break;
+    case MCBinaryExpr::And:  Result = LHS & RHS; break;
+    case MCBinaryExpr::Div:  Result = LHS / RHS; break;
+    case MCBinaryExpr::EQ:   Result = LHS == RHS; break;
+    case MCBinaryExpr::GT:   Result = LHS > RHS; break;
+    case MCBinaryExpr::GTE:  Result = LHS >= RHS; break;
+    case MCBinaryExpr::LAnd: Result = LHS && RHS; break;
+    case MCBinaryExpr::LOr:  Result = LHS || RHS; break;
+    case MCBinaryExpr::LT:   Result = LHS < RHS; break;
+    case MCBinaryExpr::LTE:  Result = LHS <= RHS; break;
+    case MCBinaryExpr::Mod:  Result = LHS % RHS; break;
+    case MCBinaryExpr::Mul:  Result = LHS * RHS; break;
+    case MCBinaryExpr::NE:   Result = LHS != RHS; break;
+    case MCBinaryExpr::Or:   Result = LHS | RHS; break;
+    case MCBinaryExpr::Shl:  Result = LHS << RHS; break;
+    case MCBinaryExpr::Shr:  Result = LHS >> RHS; break;
+    case MCBinaryExpr::Sub:  Result = LHS - RHS; break;
+    case MCBinaryExpr::Xor:  Result = LHS ^ RHS; break;
+    }
+
+    Res = MCValue::get(Result);
+    return true;
+  }
+  }
+
+  llvm_unreachable("Invalid assembly expression kind!");
+}
+
+const MCSection *MCExpr::FindAssociatedSection() const {
+  switch (getKind()) {
+  case Target:
+    // We never look through target specific expressions.
+    return cast<MCTargetExpr>(this)->FindAssociatedSection();
+
+  case Constant:
+    return MCSymbol::AbsolutePseudoSection;
+
+  case SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(this);
+    const MCSymbol &Sym = SRE->getSymbol();
+
+    if (Sym.isDefined())
+      return &Sym.getSection();
+
+    return 0;
+  }
+
+  case Unary:
+    return cast<MCUnaryExpr>(this)->getSubExpr()->FindAssociatedSection();
+
+  case Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(this);
+    const MCSection *LHS_S = BE->getLHS()->FindAssociatedSection();
+    const MCSection *RHS_S = BE->getRHS()->FindAssociatedSection();
+
+    // If either section is absolute, return the other.
+    if (LHS_S == MCSymbol::AbsolutePseudoSection)
+      return RHS_S;
+    if (RHS_S == MCSymbol::AbsolutePseudoSection)
+      return LHS_S;
+
+    // Otherwise, return the first non-null section.
+    return LHS_S ? LHS_S : RHS_S;
+  }
+  }
+
+  llvm_unreachable("Invalid assembly expression kind!");
+}
diff --git a/contrib/llvm/lib/MC/MCInst.cpp b/contrib/llvm/lib/MC/MCInst.cpp
new file mode 100644
index 000000000000..124cc149beb6
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCInst.cpp
@@ -0,0 +1,72 @@
+//===- lib/MC/MCInst.cpp - MCInst implementation --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void MCOperand::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  OS << "<MCOperand ";
+  if (!isValid())
+    OS << "INVALID";
+  else if (isReg())
+    OS << "Reg:" << getReg();
+  else if (isImm())
+    OS << "Imm:" << getImm();
+  else if (isExpr()) {
+    OS << "Expr:(" << *getExpr() << ")";
+  } else if (isInst()) {
+    OS << "Inst:(" << *getInst() << ")";
+  } else
+    OS << "UNDEFINED";
+  OS << ">";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCOperand::dump() const {
+  print(dbgs(), 0);
+  dbgs() << "\n";
+}
+#endif
+
+void MCInst::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  OS << "<MCInst " << getOpcode();
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    OS << " ";
+    getOperand(i).print(OS, MAI);
+  }
+  OS << ">";
+}
+
+void MCInst::dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI,
+                         const MCInstPrinter *Printer,
+                         StringRef Separator) const {
+  OS << "<MCInst #" << getOpcode();
+
+  // Show the instruction opcode name if we have access to a printer.
+  if (Printer)
+    OS << ' ' << Printer->getOpcodeName(getOpcode());
+
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    OS << Separator;
+    getOperand(i).print(OS, MAI);
+  }
+  OS << ">";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCInst::dump() const {
+  print(dbgs(), 0);
+  dbgs() << "\n";
+}
+#endif
diff --git a/contrib/llvm/lib/MC/MCInstPrinter.cpp b/contrib/llvm/lib/MC/MCInstPrinter.cpp
new file mode 100644
index 000000000000..73f30ffb52a0
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCInstPrinter.cpp
@@ -0,0 +1,61 @@
+//===-- MCInstPrinter.cpp - Convert an MCInst to target assembly syntax ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+MCInstPrinter::~MCInstPrinter() {
+}
+
+/// getOpcodeName - Return the name of the specified opcode enum (e.g.
+/// "MOV32ri") or empty if we can't resolve it.
+StringRef MCInstPrinter::getOpcodeName(unsigned Opcode) const {
+  return MII.getName(Opcode);
+}
+
+void MCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  llvm_unreachable("Target should implement this");
+}
+
+void MCInstPrinter::printAnnotation(raw_ostream &OS, StringRef Annot) {
+  if (!Annot.empty()) {
+    if (CommentStream)
+      (*CommentStream) << Annot;
+    else
+      OS << " " << MAI.getCommentString() << " " << Annot;
+  }
+}
+
+/// Utility functions to make adding mark ups simpler.
+StringRef MCInstPrinter::markup(StringRef s) const {
+  if (getUseMarkup())
+    return s;
+  else
+    return "";
+}
+StringRef MCInstPrinter::markup(StringRef a, StringRef b) const {
+  if (getUseMarkup())
+    return a;
+  else
+    return b;
+}
+
+/// Utility function to print immediates in decimal or hex.
+format_object1<int64_t> MCInstPrinter::formatImm(const int64_t Value) const {
+  if (getPrintImmHex())
+    return format("0x%" PRIx64, Value);
+  else
+    return format("%" PRId64, Value);
+}
diff --git a/contrib/llvm/lib/MC/MCInstrAnalysis.cpp b/contrib/llvm/lib/MC/MCInstrAnalysis.cpp
new file mode 100644
index 000000000000..7736702f35a4
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCInstrAnalysis.cpp
@@ -0,0 +1,21 @@
+//===-- MCInstrAnalysis.cpp - InstrDesc target hooks ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCInstrAnalysis.h"
+using namespace llvm;
+
+uint64_t MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                                         uint64_t Size) const {
+  if (Inst.getNumOperands() == 0 ||
+      Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
+    return -1ULL;
+
+  int64_t Imm = Inst.getOperand(0).getImm();
+  return Addr+Size+Imm;
+}
diff --git a/contrib/llvm/lib/MC/MCLabel.cpp b/contrib/llvm/lib/MC/MCLabel.cpp
new file mode 100644
index 000000000000..1d3022a93e86
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCLabel.cpp
@@ -0,0 +1,23 @@
+//===- lib/MC/MCLabel.cpp - MCLabel implementation ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCLabel.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+void MCLabel::print(raw_ostream &OS) const {
+  OS << '"' << getInstance() << '"';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCLabel::dump() const {
+  print(dbgs());
+}
+#endif
diff --git a/contrib/llvm/lib/MC/MCMachOStreamer.cpp b/contrib/llvm/lib/MC/MCMachOStreamer.cpp
new file mode 100644
index 000000000000..7d08d0ecd5e0
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCMachOStreamer.cpp
@@ -0,0 +1,436 @@
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCMachOSymbolFlags.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class MCMachOStreamer : public MCObjectStreamer {
+private:
+  virtual void EmitInstToData(const MCInst &Inst);
+
+  void EmitDataRegion(DataRegionData::KindTy Kind);
+  void EmitDataRegionEnd();
+public:
+  MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS,
+                  MCCodeEmitter *Emitter)
+      : MCObjectStreamer(SK_MachOStreamer, Context, MAB, OS, Emitter) {}
+
+  /// @name MCStreamer Interface
+  /// @{
+
+  virtual void InitSections();
+  virtual void InitToTextSection();
+  virtual void EmitLabel(MCSymbol *Symbol);
+  virtual void EmitDebugLabel(MCSymbol *Symbol);
+  virtual void EmitEHSymAttributes(const MCSymbol *Symbol,
+                                   MCSymbol *EHSymbol);
+  virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
+  virtual void EmitLinkerOptions(ArrayRef<std::string> Options);
+  virtual void EmitDataRegion(MCDataRegionType Kind);
+  virtual void EmitThumbFunc(MCSymbol *Func);
+  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+  virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
+  virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                unsigned ByteAlignment);
+  virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) {
+    llvm_unreachable("macho doesn't support this directive");
+  }
+  virtual void EmitCOFFSymbolStorageClass(int StorageClass) {
+    llvm_unreachable("macho doesn't support this directive");
+  }
+  virtual void EmitCOFFSymbolType(int Type) {
+    llvm_unreachable("macho doesn't support this directive");
+  }
+  virtual void EndCOFFSymbolDef() {
+    llvm_unreachable("macho doesn't support this directive");
+  }
+  virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
+    llvm_unreachable("macho doesn't support this directive");
+  }
+  virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                     unsigned ByteAlignment);
+  virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
+                            uint64_t Size = 0, unsigned ByteAlignment = 0);
+  virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                              uint64_t Size, unsigned ByteAlignment = 0);
+
+  virtual void EmitFileDirective(StringRef Filename) {
+    // FIXME: Just ignore the .file; it isn't important enough to fail the
+    // entire assembly.
+
+    //report_fatal_error("unsupported directive: '.file'");
+  }
+
+  virtual void FinishImpl();
+
+  /// @}
+
+  static bool classof(const MCStreamer *S) {
+    return S->getKind() == SK_MachOStreamer;
+  }
+};
+
+} // end anonymous namespace.
+
+void MCMachOStreamer::InitSections() {
+  InitToTextSection();
+}
+
+void MCMachOStreamer::InitToTextSection() {
+  SwitchSection(getContext().getMachOSection(
+                                    "__TEXT", "__text",
+                                    MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, 0,
+                                    SectionKind::getText()));
+}
+
+void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
+                                          MCSymbol *EHSymbol) {
+  MCSymbolData &SD =
+    getAssembler().getOrCreateSymbolData(*Symbol);
+  if (SD.isExternal())
+    EmitSymbolAttribute(EHSymbol, MCSA_Global);
+  if (SD.getFlags() & SF_WeakDefinition)
+    EmitSymbolAttribute(EHSymbol, MCSA_WeakDefinition);
+  if (SD.isPrivateExtern())
+    EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
+}
+
+void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+
+  // isSymbolLinkerVisible uses the section.
+  Symbol->setSection(*getCurrentSection());
+  // We have to create a new fragment if this is an atom defining symbol,
+  // fragments cannot span atoms.
+  if (getAssembler().isSymbolLinkerVisible(*Symbol))
+    new MCDataFragment(getCurrentSectionData());
+
+  MCObjectStreamer::EmitLabel(Symbol);
+
+  MCSymbolData &SD = getAssembler().getSymbolData(*Symbol);
+  // This causes the reference type flag to be cleared. Darwin 'as' was "trying"
+  // to clear the weak reference and weak definition bits too, but the
+  // implementation was buggy. For now we just try to match 'as', for
+  // diffability.
+  //
+  // FIXME: Cleanup this code, these bits should be emitted based on semantic
+  // properties, not on the order of definition, etc.
+  SD.setFlags(SD.getFlags() & ~SF_ReferenceTypeMask);
+}
+
+void MCMachOStreamer::EmitDebugLabel(MCSymbol *Symbol) {
+  EmitLabel(Symbol);
+}
+void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) {
+  if (!getAssembler().getBackend().hasDataInCodeSupport())
+    return;
+  // Create a temporary label to mark the start of the data region.
+  MCSymbol *Start = getContext().CreateTempSymbol();
+  EmitLabel(Start);
+  // Record the region for the object writer to use.
+  DataRegionData Data = { Kind, Start, NULL };
+  std::vector<DataRegionData> &Regions = getAssembler().getDataRegions();
+  Regions.push_back(Data);
+}
+
+void MCMachOStreamer::EmitDataRegionEnd() {
+  if (!getAssembler().getBackend().hasDataInCodeSupport())
+    return;
+  std::vector<DataRegionData> &Regions = getAssembler().getDataRegions();
+  assert(Regions.size() && "Mismatched .end_data_region!");
+  DataRegionData &Data = Regions.back();
+  assert(Data.End == NULL && "Mismatched .end_data_region!");
+  // Create a temporary label to mark the end of the data region.
+  Data.End = getContext().CreateTempSymbol();
+  EmitLabel(Data.End);
+}
+
+void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  // Let the target do whatever target specific stuff it needs to do.
+  getAssembler().getBackend().handleAssemblerFlag(Flag);
+  // Do any generic stuff we need to do.
+  switch (Flag) {
+  case MCAF_SyntaxUnified: return; // no-op here.
+  case MCAF_Code16: return; // Change parsing mode; no-op here.
+  case MCAF_Code32: return; // Change parsing mode; no-op here.
+  case MCAF_Code64: return; // Change parsing mode; no-op here.
+  case MCAF_SubsectionsViaSymbols:
+    getAssembler().setSubsectionsViaSymbols(true);
+    return;
+  }
+}
+
+void MCMachOStreamer::EmitLinkerOptions(ArrayRef<std::string> Options) {
+  getAssembler().getLinkerOptions().push_back(Options);
+}
+
+void MCMachOStreamer::EmitDataRegion(MCDataRegionType Kind) {
+  switch (Kind) {
+  case MCDR_DataRegion:
+    EmitDataRegion(DataRegionData::Data);
+    return;
+  case MCDR_DataRegionJT8:
+    EmitDataRegion(DataRegionData::JumpTable8);
+    return;
+  case MCDR_DataRegionJT16:
+    EmitDataRegion(DataRegionData::JumpTable16);
+    return;
+  case MCDR_DataRegionJT32:
+    EmitDataRegion(DataRegionData::JumpTable32);
+    return;
+  case MCDR_DataRegionEnd:
+    EmitDataRegionEnd();
+    return;
+  }
+}
+
+void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) {
+  // Remember that the function is a thumb function. Fixup and relocation
+  // values will need adjusted.
+  getAssembler().setIsThumbFunc(Symbol);
+
+  // Mark the thumb bit on the symbol.
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+  SD.setFlags(SD.getFlags() | SF_ThumbFunc);
+}
+
+void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+                                          MCSymbolAttr Attribute) {
+  // Indirect symbols are handled differently, to match how 'as' handles
+  // them. This makes writing matching .o files easier.
+  if (Attribute == MCSA_IndirectSymbol) {
+    // Note that we intentionally cannot use the symbol data here; this is
+    // important for matching the string table that 'as' generates.
+    IndirectSymbolData ISD;
+    ISD.Symbol = Symbol;
+    ISD.SectionData = getCurrentSectionData();
+    getAssembler().getIndirectSymbols().push_back(ISD);
+    return;
+  }
+
+  // Adding a symbol attribute always introduces the symbol, note that an
+  // important side effect of calling getOrCreateSymbolData here is to register
+  // the symbol with the assembler.
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+
+  // The implementation of symbol attributes is designed to match 'as', but it
+  // leaves much to desired. It doesn't really make sense to arbitrarily add and
+  // remove flags, but 'as' allows this (in particular, see .desc).
+  //
+  // In the future it might be worth trying to make these operations more well
+  // defined.
+  switch (Attribute) {
+  case MCSA_Invalid:
+  case MCSA_ELF_TypeFunction:
+  case MCSA_ELF_TypeIndFunction:
+  case MCSA_ELF_TypeObject:
+  case MCSA_ELF_TypeTLS:
+  case MCSA_ELF_TypeCommon:
+  case MCSA_ELF_TypeNoType:
+  case MCSA_ELF_TypeGnuUniqueObject:
+  case MCSA_Hidden:
+  case MCSA_IndirectSymbol:
+  case MCSA_Internal:
+  case MCSA_Protected:
+  case MCSA_Weak:
+  case MCSA_Local:
+    llvm_unreachable("Invalid symbol attribute for Mach-O!");
+
+  case MCSA_Global:
+    SD.setExternal(true);
+    // This effectively clears the undefined lazy bit, in Darwin 'as', although
+    // it isn't very consistent because it implements this as part of symbol
+    // lookup.
+    //
+    // FIXME: Cleanup this code, these bits should be emitted based on semantic
+    // properties, not on the order of definition, etc.
+    SD.setFlags(SD.getFlags() & ~SF_ReferenceTypeUndefinedLazy);
+    break;
+
+  case MCSA_LazyReference:
+    // FIXME: This requires -dynamic.
+    SD.setFlags(SD.getFlags() | SF_NoDeadStrip);
+    if (Symbol->isUndefined())
+      SD.setFlags(SD.getFlags() | SF_ReferenceTypeUndefinedLazy);
+    break;
+
+    // Since .reference sets the no dead strip bit, it is equivalent to
+    // .no_dead_strip in practice.
+  case MCSA_Reference:
+  case MCSA_NoDeadStrip:
+    SD.setFlags(SD.getFlags() | SF_NoDeadStrip);
+    break;
+
+  case MCSA_SymbolResolver:
+    SD.setFlags(SD.getFlags() | SF_SymbolResolver);
+    break;
+
+  case MCSA_PrivateExtern:
+    SD.setExternal(true);
+    SD.setPrivateExtern(true);
+    break;
+
+  case MCSA_WeakReference:
+    // FIXME: This requires -dynamic.
+    if (Symbol->isUndefined())
+      SD.setFlags(SD.getFlags() | SF_WeakReference);
+    break;
+
+  case MCSA_WeakDefinition:
+    // FIXME: 'as' enforces that this is defined and global. The manual claims
+    // it has to be in a coalesced section, but this isn't enforced.
+    SD.setFlags(SD.getFlags() | SF_WeakDefinition);
+    break;
+
+  case MCSA_WeakDefAutoPrivate:
+    SD.setFlags(SD.getFlags() | SF_WeakDefinition | SF_WeakReference);
+    break;
+  }
+}
+
+void MCMachOStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+  // Encode the 'desc' value into the lowest implementation defined bits.
+  assert(DescValue == (DescValue & SF_DescFlagsMask) &&
+         "Invalid .desc value!");
+  getAssembler().getOrCreateSymbolData(*Symbol).setFlags(
+    DescValue & SF_DescFlagsMask);
+}
+
+void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                       unsigned ByteAlignment) {
+  // FIXME: Darwin 'as' does appear to allow redef of a .comm by itself.
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+  SD.setExternal(true);
+  SD.setCommon(Size, ByteAlignment);
+}
+
+void MCMachOStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                            unsigned ByteAlignment) {
+  // '.lcomm' is equivalent to '.zerofill'.
+  return EmitZerofill(getContext().getMachOSection("__DATA", "__bss",
+                                                   MCSectionMachO::S_ZEROFILL,
+                                                   0, SectionKind::getBSS()),
+                      Symbol, Size, ByteAlignment);
+}
+
+void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+                                   uint64_t Size, unsigned ByteAlignment) {
+  MCSectionData &SectData = getAssembler().getOrCreateSectionData(*Section);
+
+  // The symbol may not be present, which only creates the section.
+  if (!Symbol)
+    return;
+
+  // FIXME: Assert that this section has the zerofill type.
+
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+
+  // Emit an align fragment if necessary.
+  if (ByteAlignment != 1)
+    new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, &SectData);
+
+  MCFragment *F = new MCFillFragment(0, 0, Size, &SectData);
+  SD.setFragment(F);
+
+  Symbol->setSection(*Section);
+
+  // Update the maximum alignment on the zero fill section if necessary.
+  if (ByteAlignment > SectData.getAlignment())
+    SectData.setAlignment(ByteAlignment);
+}
+
+// This should always be called with the thread local bss section.  Like the
+// .zerofill directive this doesn't actually switch sections on us.
+void MCMachOStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                                     uint64_t Size, unsigned ByteAlignment) {
+  EmitZerofill(Section, Symbol, Size, ByteAlignment);
+  return;
+}
+
+void MCMachOStreamer::EmitInstToData(const MCInst &Inst) {
+  MCDataFragment *DF = getOrCreateDataFragment();
+
+  SmallVector<MCFixup, 4> Fixups;
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code);
+  getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups);
+  VecOS.flush();
+
+  // Add the fixups and data.
+  for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
+    Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
+    DF->getFixups().push_back(Fixups[i]);
+  }
+  DF->getContents().append(Code.begin(), Code.end());
+}
+
+void MCMachOStreamer::FinishImpl() {
+  EmitFrames(true);
+
+  // We have to set the fragment atom associations so we can relax properly for
+  // Mach-O.
+
+  // First, scan the symbol table to build a lookup table from fragments to
+  // defining symbols.
+  DenseMap<const MCFragment*, MCSymbolData*> DefiningSymbolMap;
+  for (MCAssembler::symbol_iterator it = getAssembler().symbol_begin(),
+         ie = getAssembler().symbol_end(); it != ie; ++it) {
+    if (getAssembler().isSymbolLinkerVisible(it->getSymbol()) &&
+        it->getFragment()) {
+      // An atom defining symbol should never be internal to a fragment.
+      assert(it->getOffset() == 0 && "Invalid offset in atom defining symbol!");
+      DefiningSymbolMap[it->getFragment()] = it;
+    }
+  }
+
+  // Set the fragment atom associations by tracking the last seen atom defining
+  // symbol.
+  for (MCAssembler::iterator it = getAssembler().begin(),
+         ie = getAssembler().end(); it != ie; ++it) {
+    MCSymbolData *CurrentAtom = 0;
+    for (MCSectionData::iterator it2 = it->begin(),
+           ie2 = it->end(); it2 != ie2; ++it2) {
+      if (MCSymbolData *SD = DefiningSymbolMap.lookup(it2))
+        CurrentAtom = SD;
+      it2->setAtom(CurrentAtom);
+    }
+  }
+
+  this->MCObjectStreamer::FinishImpl();
+}
+
+MCStreamer *llvm::createMachOStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                      raw_ostream &OS, MCCodeEmitter *CE,
+                                      bool RelaxAll) {
+  MCMachOStreamer *S = new MCMachOStreamer(Context, MAB, OS, CE);
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+  return S;
+}
diff --git a/contrib/llvm/lib/MC/MCMachObjectTargetWriter.cpp b/contrib/llvm/lib/MC/MCMachObjectTargetWriter.cpp
new file mode 100644
index 000000000000..146cebf01a3a
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCMachObjectTargetWriter.cpp
@@ -0,0 +1,22 @@
+//===-- MCMachObjectTargetWriter.cpp - Mach-O Target Writer Subclass ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCMachObjectWriter.h"
+
+using namespace llvm;
+
+MCMachObjectTargetWriter::MCMachObjectTargetWriter(
+  bool Is64Bit_, uint32_t CPUType_, uint32_t CPUSubtype_,
+  bool UseAggressiveSymbolFolding_)
+  : Is64Bit(Is64Bit_), CPUType(CPUType_), CPUSubtype(CPUSubtype_),
+    UseAggressiveSymbolFolding(UseAggressiveSymbolFolding_) {
+}
+
+MCMachObjectTargetWriter::~MCMachObjectTargetWriter() {
+}
diff --git a/contrib/llvm/lib/MC/MCModule.cpp b/contrib/llvm/lib/MC/MCModule.cpp
new file mode 100644
index 000000000000..f5631608330c
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCModule.cpp
@@ -0,0 +1,45 @@
+//===- lib/MC/MCModule.cpp - MCModule implementation ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCModule.h"
+
+using namespace llvm;
+
+MCAtom *MCModule::createAtom(MCAtom::AtomType Type,
+                             uint64_t Begin, uint64_t End) {
+  assert(Begin < End && "Creating MCAtom with endpoints reversed?");
+
+  // Check for atoms already covering this range.
+  IntervalMap<uint64_t, MCAtom*>::iterator I = OffsetMap.find(Begin);
+  assert((!I.valid() || I.start() < End) && "Offset range already occupied!");
+
+  // Create the new atom and add it to our maps.
+  MCAtom *NewAtom = new MCAtom(Type, this, Begin, End);
+  AtomAllocationTracker.insert(NewAtom);
+  OffsetMap.insert(Begin, End, NewAtom);
+  return NewAtom;
+}
+
+// remap - Update the interval mapping for an atom.
+void MCModule::remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd) {
+  // Find and erase the old mapping.
+  IntervalMap<uint64_t, MCAtom*>::iterator I = OffsetMap.find(Atom->Begin);
+  assert(I.valid() && "Atom offset not found in module!");
+  assert(*I == Atom && "Previous atom mapping was invalid!");
+  I.erase();
+
+  // Insert the new mapping.
+  OffsetMap.insert(NewBegin, NewEnd, Atom);
+
+  // Update the atom internal bounds.
+  Atom->Begin = NewBegin;
+  Atom->End = NewEnd;
+}
+
diff --git a/contrib/llvm/lib/MC/MCNullStreamer.cpp b/contrib/llvm/lib/MC/MCNullStreamer.cpp
new file mode 100644
index 000000000000..c872b2203f87
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCNullStreamer.cpp
@@ -0,0 +1,123 @@
+//===- lib/MC/MCNullStreamer.cpp - Dummy Streamer Implementation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
+
+using namespace llvm;
+
+namespace {
+
+  class MCNullStreamer : public MCStreamer {
+  public:
+    MCNullStreamer(MCContext &Context) : MCStreamer(SK_NullStreamer, Context) {}
+
+    /// @name MCStreamer Interface
+    /// @{
+
+    virtual void InitToTextSection() {
+    }
+
+    virtual void InitSections() {
+    }
+
+    virtual void ChangeSection(const MCSection *Section) {
+    }
+
+    virtual void EmitLabel(MCSymbol *Symbol) {
+      assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+      assert(getCurrentSection() && "Cannot emit before setting section!");
+      Symbol->setSection(*getCurrentSection());
+    }
+    virtual void EmitDebugLabel(MCSymbol *Symbol) {
+      EmitLabel(Symbol);
+    }
+    virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {}
+    virtual void EmitThumbFunc(MCSymbol *Func) {}
+
+    virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {}
+    virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol){}
+    virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+                                          const MCSymbol *LastLabel,
+                                          const MCSymbol *Label,
+                                          unsigned PointerSize) {}
+
+    virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute){}
+
+    virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
+
+    virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
+    virtual void EmitCOFFSymbolStorageClass(int StorageClass) {}
+    virtual void EmitCOFFSymbolType(int Type) {}
+    virtual void EndCOFFSymbolDef() {}
+    virtual void EmitCOFFSecRel32(MCSymbol const *Symbol) {}
+
+    virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
+    virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                  unsigned ByteAlignment) {}
+    virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                       unsigned ByteAlignment) {}
+    virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
+                              uint64_t Size = 0, unsigned ByteAlignment = 0) {}
+    virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                                uint64_t Size, unsigned ByteAlignment) {}
+    virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {}
+
+    virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                               unsigned AddrSpace) {}
+    virtual void EmitULEB128Value(const MCExpr *Value) {}
+    virtual void EmitSLEB128Value(const MCExpr *Value) {}
+    virtual void EmitGPRel32Value(const MCExpr *Value) {}
+    virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
+                                      unsigned ValueSize = 1,
+                                      unsigned MaxBytesToEmit = 0) {}
+
+    virtual void EmitCodeAlignment(unsigned ByteAlignment,
+                                   unsigned MaxBytesToEmit = 0) {}
+
+    virtual bool EmitValueToOffset(const MCExpr *Offset,
+                                   unsigned char Value = 0) { return false; }
+
+    virtual void EmitFileDirective(StringRef Filename) {}
+    virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
+                                        StringRef Filename, unsigned CUID = 0) {
+      return false;
+    }
+    virtual void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
+                                       unsigned Column, unsigned Flags,
+                                       unsigned Isa, unsigned Discriminator,
+                                       StringRef FileName) {}
+    virtual void EmitInstruction(const MCInst &Inst) {}
+
+    virtual void EmitBundleAlignMode(unsigned AlignPow2) {}
+    virtual void EmitBundleLock(bool AlignToEnd) {}
+    virtual void EmitBundleUnlock() {}
+
+    virtual void FinishImpl() {}
+
+    virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+      RecordProcEnd(Frame);
+    }
+
+    /// @}
+
+    static bool classof(const MCStreamer *S) {
+      return S->getKind() == SK_NullStreamer;
+    }
+
+  };
+
+}
+
+MCStreamer *llvm::createNullStreamer(MCContext &Context) {
+  return new MCNullStreamer(Context);
+}
diff --git a/contrib/llvm/lib/MC/MCObjectFileInfo.cpp b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp
new file mode 100644
index 000000000000..d19e79ac64f9
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -0,0 +1,683 @@
+//===-- MObjectFileInfo.cpp - Object File Information ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+using namespace llvm;
+
+void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
+  // MachO
+  IsFunctionEHFrameSymbolPrivate = false;
+  SupportsWeakOmittedEHFrame = false;
+
+  PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel
+    | dwarf::DW_EH_PE_sdata4;
+  LSDAEncoding = FDEEncoding = FDECFIEncoding = dwarf::DW_EH_PE_pcrel;
+  TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+    dwarf::DW_EH_PE_sdata4;
+
+  // .comm doesn't support alignment before Leopard.
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 5))
+    CommDirectiveSupportsAlignment = false;
+
+  TextSection // .text
+    = Ctx->getMachOSection("__TEXT", "__text",
+                           MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                           SectionKind::getText());
+  DataSection // .data
+    = Ctx->getMachOSection("__DATA", "__data", 0,
+                           SectionKind::getDataRel());
+
+  TLSDataSection // .tdata
+    = Ctx->getMachOSection("__DATA", "__thread_data",
+                           MCSectionMachO::S_THREAD_LOCAL_REGULAR,
+                           SectionKind::getDataRel());
+  TLSBSSSection // .tbss
+    = Ctx->getMachOSection("__DATA", "__thread_bss",
+                           MCSectionMachO::S_THREAD_LOCAL_ZEROFILL,
+                           SectionKind::getThreadBSS());
+
+  // TODO: Verify datarel below.
+  TLSTLVSection // .tlv
+    = Ctx->getMachOSection("__DATA", "__thread_vars",
+                           MCSectionMachO::S_THREAD_LOCAL_VARIABLES,
+                           SectionKind::getDataRel());
+
+  TLSThreadInitSection
+    = Ctx->getMachOSection("__DATA", "__thread_init",
+                          MCSectionMachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS,
+                          SectionKind::getDataRel());
+
+  CStringSection // .cstring
+    = Ctx->getMachOSection("__TEXT", "__cstring",
+                           MCSectionMachO::S_CSTRING_LITERALS,
+                           SectionKind::getMergeable1ByteCString());
+  UStringSection
+    = Ctx->getMachOSection("__TEXT","__ustring", 0,
+                           SectionKind::getMergeable2ByteCString());
+  FourByteConstantSection // .literal4
+    = Ctx->getMachOSection("__TEXT", "__literal4",
+                           MCSectionMachO::S_4BYTE_LITERALS,
+                           SectionKind::getMergeableConst4());
+  EightByteConstantSection // .literal8
+    = Ctx->getMachOSection("__TEXT", "__literal8",
+                           MCSectionMachO::S_8BYTE_LITERALS,
+                           SectionKind::getMergeableConst8());
+
+  // ld_classic doesn't support .literal16 in 32-bit mode, and ld64 falls back
+  // to using it in -static mode.
+  SixteenByteConstantSection = 0;
+  if (RelocM != Reloc::Static &&
+      T.getArch() != Triple::x86_64 && T.getArch() != Triple::ppc64)
+    SixteenByteConstantSection =   // .literal16
+      Ctx->getMachOSection("__TEXT", "__literal16",
+                           MCSectionMachO::S_16BYTE_LITERALS,
+                           SectionKind::getMergeableConst16());
+
+  ReadOnlySection  // .const
+    = Ctx->getMachOSection("__TEXT", "__const", 0,
+                           SectionKind::getReadOnly());
+
+  TextCoalSection
+    = Ctx->getMachOSection("__TEXT", "__textcoal_nt",
+                           MCSectionMachO::S_COALESCED |
+                           MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                           SectionKind::getText());
+  ConstTextCoalSection
+    = Ctx->getMachOSection("__TEXT", "__const_coal",
+                           MCSectionMachO::S_COALESCED,
+                           SectionKind::getReadOnly());
+  ConstDataSection  // .const_data
+    = Ctx->getMachOSection("__DATA", "__const", 0,
+                           SectionKind::getReadOnlyWithRel());
+  DataCoalSection
+    = Ctx->getMachOSection("__DATA","__datacoal_nt",
+                           MCSectionMachO::S_COALESCED,
+                           SectionKind::getDataRel());
+  DataCommonSection
+    = Ctx->getMachOSection("__DATA","__common",
+                           MCSectionMachO::S_ZEROFILL,
+                           SectionKind::getBSS());
+  DataBSSSection
+    = Ctx->getMachOSection("__DATA","__bss", MCSectionMachO::S_ZEROFILL,
+                           SectionKind::getBSS());
+
+
+  LazySymbolPointerSection
+    = Ctx->getMachOSection("__DATA", "__la_symbol_ptr",
+                           MCSectionMachO::S_LAZY_SYMBOL_POINTERS,
+                           SectionKind::getMetadata());
+  NonLazySymbolPointerSection
+    = Ctx->getMachOSection("__DATA", "__nl_symbol_ptr",
+                           MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+                           SectionKind::getMetadata());
+
+  if (RelocM == Reloc::Static) {
+    StaticCtorSection
+      = Ctx->getMachOSection("__TEXT", "__constructor", 0,
+                             SectionKind::getDataRel());
+    StaticDtorSection
+      = Ctx->getMachOSection("__TEXT", "__destructor", 0,
+                             SectionKind::getDataRel());
+  } else {
+    StaticCtorSection
+      = Ctx->getMachOSection("__DATA", "__mod_init_func",
+                             MCSectionMachO::S_MOD_INIT_FUNC_POINTERS,
+                             SectionKind::getDataRel());
+    StaticDtorSection
+      = Ctx->getMachOSection("__DATA", "__mod_term_func",
+                             MCSectionMachO::S_MOD_TERM_FUNC_POINTERS,
+                             SectionKind::getDataRel());
+  }
+
+  // Exception Handling.
+  LSDASection = Ctx->getMachOSection("__TEXT", "__gcc_except_tab", 0,
+                                     SectionKind::getReadOnlyWithRel());
+
+  if (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6))
+    CompactUnwindSection =
+      Ctx->getMachOSection("__LD", "__compact_unwind",
+                           MCSectionMachO::S_ATTR_DEBUG,
+                           SectionKind::getReadOnly());
+
+  // Debug Information.
+  DwarfAccelNamesSection =
+    Ctx->getMachOSection("__DWARF", "__apple_names",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfAccelObjCSection =
+    Ctx->getMachOSection("__DWARF", "__apple_objc",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  // 16 character section limit...
+  DwarfAccelNamespaceSection =
+    Ctx->getMachOSection("__DWARF", "__apple_namespac",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfAccelTypesSection =
+    Ctx->getMachOSection("__DWARF", "__apple_types",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+
+  DwarfAbbrevSection =
+    Ctx->getMachOSection("__DWARF", "__debug_abbrev",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfInfoSection =
+    Ctx->getMachOSection("__DWARF", "__debug_info",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfLineSection =
+    Ctx->getMachOSection("__DWARF", "__debug_line",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfFrameSection =
+    Ctx->getMachOSection("__DWARF", "__debug_frame",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfPubNamesSection =
+    Ctx->getMachOSection("__DWARF", "__debug_pubnames",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfPubTypesSection =
+    Ctx->getMachOSection("__DWARF", "__debug_pubtypes",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfStrSection =
+    Ctx->getMachOSection("__DWARF", "__debug_str",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfLocSection =
+    Ctx->getMachOSection("__DWARF", "__debug_loc",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfARangesSection =
+    Ctx->getMachOSection("__DWARF", "__debug_aranges",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfRangesSection =
+    Ctx->getMachOSection("__DWARF", "__debug_ranges",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfMacroInfoSection =
+    Ctx->getMachOSection("__DWARF", "__debug_macinfo",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfDebugInlineSection =
+    Ctx->getMachOSection("__DWARF", "__debug_inlined",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+
+  TLSExtraDataSection = TLSTLVSection;
+}
+
+void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
+  if (T.getArch() == Triple::mips ||
+      T.getArch() == Triple::mipsel)
+    FDECFIEncoding = dwarf::DW_EH_PE_sdata4;
+  else if (T.getArch() == Triple::mips64 ||
+           T.getArch() == Triple::mips64el)
+    FDECFIEncoding = dwarf::DW_EH_PE_sdata8;
+  else
+    FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+
+  if (T.getArch() == Triple::x86) {
+    PersonalityEncoding = (RelocM == Reloc::PIC_)
+     ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+     : dwarf::DW_EH_PE_absptr;
+    LSDAEncoding = (RelocM == Reloc::PIC_)
+      ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+      : dwarf::DW_EH_PE_absptr;
+    FDEEncoding = (RelocM == Reloc::PIC_)
+      ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+      : dwarf::DW_EH_PE_absptr;
+    TTypeEncoding = (RelocM == Reloc::PIC_)
+     ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+     : dwarf::DW_EH_PE_absptr;
+  } else if (T.getArch() == Triple::x86_64) {
+    if (RelocM == Reloc::PIC_) {
+      PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+        ((CMModel == CodeModel::Small || CMModel == CodeModel::Medium)
+         ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
+      LSDAEncoding = dwarf::DW_EH_PE_pcrel |
+        (CMModel == CodeModel::Small
+         ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
+      FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+      TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+        ((CMModel == CodeModel::Small || CMModel == CodeModel::Medium)
+         ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
+    } else {
+      PersonalityEncoding =
+        (CMModel == CodeModel::Small || CMModel == CodeModel::Medium)
+        ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
+      LSDAEncoding = (CMModel == CodeModel::Small)
+        ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
+      FDEEncoding = dwarf::DW_EH_PE_udata4;
+      TTypeEncoding = (CMModel == CodeModel::Small)
+        ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
+    }
+  }  else if (T.getArch() ==  Triple::aarch64) {
+    // The small model guarantees static code/data size < 4GB, but not where it
+    // will be in memory. Most of these could end up >2GB away so even a signed
+    // pc-relative 32-bit address is insufficient, theoretically.
+    if (RelocM == Reloc::PIC_) {
+      PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+        dwarf::DW_EH_PE_sdata8;
+      LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8;
+      FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+      TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+        dwarf::DW_EH_PE_sdata8;
+    } else {
+      PersonalityEncoding = dwarf::DW_EH_PE_absptr;
+      LSDAEncoding = dwarf::DW_EH_PE_absptr;
+      FDEEncoding = dwarf::DW_EH_PE_udata4;
+      TTypeEncoding = dwarf::DW_EH_PE_absptr;
+    }
+  } else if (T.getArch() == Triple::ppc64) {
+    PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+      dwarf::DW_EH_PE_udata8;
+    LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8;
+    FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8;
+    TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+      dwarf::DW_EH_PE_udata8;
+  }
+
+  // Solaris requires different flags for .eh_frame to seemingly every other
+  // platform.
+  EHSectionType = ELF::SHT_PROGBITS;
+  EHSectionFlags = ELF::SHF_ALLOC;
+  if (T.getOS() == Triple::Solaris) {
+    if (T.getArch() == Triple::x86_64)
+      EHSectionType = ELF::SHT_X86_64_UNWIND;
+    else
+      EHSectionFlags |= ELF::SHF_WRITE;
+  }
+
+
+  // ELF
+  BSSSection =
+    Ctx->getELFSection(".bss", ELF::SHT_NOBITS,
+                       ELF::SHF_WRITE | ELF::SHF_ALLOC,
+                       SectionKind::getBSS());
+
+  TextSection =
+    Ctx->getELFSection(".text", ELF::SHT_PROGBITS,
+                       ELF::SHF_EXECINSTR |
+                       ELF::SHF_ALLOC,
+                       SectionKind::getText());
+
+  DataSection =
+    Ctx->getELFSection(".data", ELF::SHT_PROGBITS,
+                       ELF::SHF_WRITE |ELF::SHF_ALLOC,
+                       SectionKind::getDataRel());
+
+  ReadOnlySection =
+    Ctx->getELFSection(".rodata", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC,
+                       SectionKind::getReadOnly());
+
+  TLSDataSection =
+    Ctx->getELFSection(".tdata", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC | ELF::SHF_TLS |
+                       ELF::SHF_WRITE,
+                       SectionKind::getThreadData());
+
+  TLSBSSSection =
+    Ctx->getELFSection(".tbss", ELF::SHT_NOBITS,
+                       ELF::SHF_ALLOC | ELF::SHF_TLS |
+                       ELF::SHF_WRITE,
+                       SectionKind::getThreadBSS());
+
+  DataRelSection =
+    Ctx->getELFSection(".data.rel", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
+                       SectionKind::getDataRel());
+
+  DataRelLocalSection =
+    Ctx->getELFSection(".data.rel.local", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
+                       SectionKind::getDataRelLocal());
+
+  DataRelROSection =
+    Ctx->getELFSection(".data.rel.ro", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
+                       SectionKind::getReadOnlyWithRel());
+
+  DataRelROLocalSection =
+    Ctx->getELFSection(".data.rel.ro.local", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
+                       SectionKind::getReadOnlyWithRelLocal());
+
+  MergeableConst4Section =
+    Ctx->getELFSection(".rodata.cst4", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC |ELF::SHF_MERGE,
+                       SectionKind::getMergeableConst4());
+
+  MergeableConst8Section =
+    Ctx->getELFSection(".rodata.cst8", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC |ELF::SHF_MERGE,
+                       SectionKind::getMergeableConst8());
+
+  MergeableConst16Section =
+    Ctx->getELFSection(".rodata.cst16", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC |ELF::SHF_MERGE,
+                       SectionKind::getMergeableConst16());
+
+  StaticCtorSection =
+    Ctx->getELFSection(".ctors", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
+                       SectionKind::getDataRel());
+
+  StaticDtorSection =
+    Ctx->getELFSection(".dtors", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC |ELF::SHF_WRITE,
+                       SectionKind::getDataRel());
+
+  // Exception Handling Sections.
+
+  // FIXME: We're emitting LSDA info into a readonly section on ELF, even though
+  // it contains relocatable pointers.  In PIC mode, this is probably a big
+  // runtime hit for C++ apps.  Either the contents of the LSDA need to be
+  // adjusted or this should be a data section.
+  LSDASection =
+    Ctx->getELFSection(".gcc_except_table", ELF::SHT_PROGBITS,
+                       ELF::SHF_ALLOC,
+                       SectionKind::getReadOnly());
+
+  // Debug Info Sections.
+  DwarfAbbrevSection =
+    Ctx->getELFSection(".debug_abbrev", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfInfoSection =
+    Ctx->getELFSection(".debug_info", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfLineSection =
+    Ctx->getELFSection(".debug_line", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfFrameSection =
+    Ctx->getELFSection(".debug_frame", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfPubNamesSection =
+    Ctx->getELFSection(".debug_pubnames", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfPubTypesSection =
+    Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfStrSection =
+    Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS,
+                       ELF::SHF_MERGE | ELF::SHF_STRINGS,
+                       SectionKind::getMergeable1ByteCString());
+  DwarfLocSection =
+    Ctx->getELFSection(".debug_loc", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfARangesSection =
+    Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfRangesSection =
+    Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfMacroInfoSection =
+    Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+
+  // DWARF5 Experimental Debug Info
+
+  // Accelerator Tables
+  DwarfAccelNamesSection =
+    Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfAccelObjCSection =
+    Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfAccelNamespaceSection =
+    Ctx->getELFSection(".apple_namespaces", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfAccelTypesSection =
+    Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+
+  // Fission Sections
+  DwarfInfoDWOSection =
+    Ctx->getELFSection(".debug_info.dwo", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfAbbrevDWOSection =
+    Ctx->getELFSection(".debug_abbrev.dwo", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfStrDWOSection =
+    Ctx->getELFSection(".debug_str.dwo", ELF::SHT_PROGBITS,
+                       ELF::SHF_MERGE | ELF::SHF_STRINGS,
+                       SectionKind::getMergeable1ByteCString());
+  DwarfLineDWOSection =
+    Ctx->getELFSection(".debug_line.dwo", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfLocDWOSection =
+    Ctx->getELFSection(".debug_loc.dwo", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfStrOffDWOSection =
+    Ctx->getELFSection(".debug_str_offsets.dwo", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfAddrSection =
+    Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+}
+
+
+void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
+  // COFF
+  TextSection =
+    Ctx->getCOFFSection(".text",
+                        COFF::IMAGE_SCN_CNT_CODE |
+                        COFF::IMAGE_SCN_MEM_EXECUTE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getText());
+  DataSection =
+    Ctx->getCOFFSection(".data",
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ |
+                        COFF::IMAGE_SCN_MEM_WRITE,
+                        SectionKind::getDataRel());
+  ReadOnlySection =
+    Ctx->getCOFFSection(".rdata",
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getReadOnly());
+  if (T.getOS() == Triple::Win32) {
+    StaticCtorSection =
+      Ctx->getCOFFSection(".CRT$XCU",
+                          COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                          COFF::IMAGE_SCN_MEM_READ,
+                          SectionKind::getReadOnly());
+  } else {
+    StaticCtorSection =
+      Ctx->getCOFFSection(".ctors",
+                          COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                          COFF::IMAGE_SCN_MEM_READ |
+                          COFF::IMAGE_SCN_MEM_WRITE,
+                          SectionKind::getDataRel());
+  }
+
+
+  if (T.getOS() == Triple::Win32) {
+    StaticDtorSection =
+      Ctx->getCOFFSection(".CRT$XTX",
+                          COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                          COFF::IMAGE_SCN_MEM_READ,
+                          SectionKind::getReadOnly());
+  } else {
+    StaticDtorSection =
+      Ctx->getCOFFSection(".dtors",
+                          COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                          COFF::IMAGE_SCN_MEM_READ |
+                          COFF::IMAGE_SCN_MEM_WRITE,
+                          SectionKind::getDataRel());
+  }
+
+  // FIXME: We're emitting LSDA info into a readonly section on COFF, even
+  // though it contains relocatable pointers.  In PIC mode, this is probably a
+  // big runtime hit for C++ apps.  Either the contents of the LSDA need to be
+  // adjusted or this should be a data section.
+  LSDASection =
+    Ctx->getCOFFSection(".gcc_except_table",
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getReadOnly());
+
+  // Debug info.
+  DwarfAbbrevSection =
+    Ctx->getCOFFSection(".debug_abbrev",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfInfoSection =
+    Ctx->getCOFFSection(".debug_info",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfLineSection =
+    Ctx->getCOFFSection(".debug_line",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfFrameSection =
+    Ctx->getCOFFSection(".debug_frame",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfPubNamesSection =
+    Ctx->getCOFFSection(".debug_pubnames",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfPubTypesSection =
+    Ctx->getCOFFSection(".debug_pubtypes",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfStrSection =
+    Ctx->getCOFFSection(".debug_str",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfLocSection =
+    Ctx->getCOFFSection(".debug_loc",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfARangesSection =
+    Ctx->getCOFFSection(".debug_aranges",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfRangesSection =
+    Ctx->getCOFFSection(".debug_ranges",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfMacroInfoSection =
+    Ctx->getCOFFSection(".debug_macinfo",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+
+  DrectveSection =
+    Ctx->getCOFFSection(".drectve",
+                        COFF::IMAGE_SCN_LNK_INFO,
+                        SectionKind::getMetadata());
+
+  PDataSection =
+    Ctx->getCOFFSection(".pdata",
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getDataRel());
+
+  XDataSection =
+    Ctx->getCOFFSection(".xdata",
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getDataRel());
+  TLSDataSection =
+    Ctx->getCOFFSection(".tls$",
+                        COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ |
+                        COFF::IMAGE_SCN_MEM_WRITE,
+                        SectionKind::getDataRel());
+}
+
+void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm,
+                                            CodeModel::Model cm,
+                                            MCContext &ctx) {
+  RelocM = relocm;
+  CMModel = cm;
+  Ctx = &ctx;
+
+  // Common.
+  CommDirectiveSupportsAlignment = true;
+  SupportsWeakOmittedEHFrame = true;
+  IsFunctionEHFrameSymbolPrivate = true;
+
+  PersonalityEncoding = LSDAEncoding = FDEEncoding = FDECFIEncoding =
+    TTypeEncoding = dwarf::DW_EH_PE_absptr;
+
+  EHFrameSection = 0;             // Created on demand.
+  CompactUnwindSection = 0;       // Used only by selected targets.
+  DwarfAccelNamesSection = 0;     // Used only by selected targets.
+  DwarfAccelObjCSection = 0;      // Used only by selected targets.
+  DwarfAccelNamespaceSection = 0; // Used only by selected targets.
+  DwarfAccelTypesSection = 0;     // Used only by selected targets.
+
+  Triple T(TT);
+  Triple::ArchType Arch = T.getArch();
+  // FIXME: Checking for Arch here to filter out bogus triples such as
+  // cellspu-apple-darwin. Perhaps we should fix in Triple?
+  if ((Arch == Triple::x86 || Arch == Triple::x86_64 ||
+       Arch == Triple::arm || Arch == Triple::thumb ||
+       Arch == Triple::ppc || Arch == Triple::ppc64 ||
+       Arch == Triple::UnknownArch) &&
+      (T.isOSDarwin() || T.getEnvironment() == Triple::MachO)) {
+    Env = IsMachO;
+    InitMachOMCObjectFileInfo(T);
+  } else if ((Arch == Triple::x86 || Arch == Triple::x86_64) &&
+             (T.getEnvironment() != Triple::ELF) &&
+             (T.getOS() == Triple::MinGW32 || T.getOS() == Triple::Cygwin ||
+              T.getOS() == Triple::Win32)) {
+    Env = IsCOFF;
+    InitCOFFMCObjectFileInfo(T);
+  } else {
+    Env = IsELF;
+    InitELFMCObjectFileInfo(T);
+  }
+}
+
+void MCObjectFileInfo::InitEHFrameSection() {
+  if (Env == IsMachO)
+    EHFrameSection =
+      Ctx->getMachOSection("__TEXT", "__eh_frame",
+                           MCSectionMachO::S_COALESCED |
+                           MCSectionMachO::S_ATTR_NO_TOC |
+                           MCSectionMachO::S_ATTR_STRIP_STATIC_SYMS |
+                           MCSectionMachO::S_ATTR_LIVE_SUPPORT,
+                           SectionKind::getReadOnly());
+  else if (Env == IsELF)
+    EHFrameSection =
+      Ctx->getELFSection(".eh_frame", EHSectionType,
+                         EHSectionFlags,
+                         SectionKind::getDataRel());
+  else
+    EHFrameSection =
+      Ctx->getCOFFSection(".eh_frame",
+                          COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                          COFF::IMAGE_SCN_MEM_READ |
+                          COFF::IMAGE_SCN_MEM_WRITE,
+                          SectionKind::getDataRel());
+}
diff --git a/contrib/llvm/lib/MC/MCObjectStreamer.cpp b/contrib/llvm/lib/MC/MCObjectStreamer.cpp
new file mode 100644
index 000000000000..0d2ce83a8a10
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCObjectStreamer.cpp
@@ -0,0 +1,360 @@
+//===- lib/MC/MCObjectStreamer.cpp - Object File MCStreamer Interface -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context,
+                                   MCAsmBackend &TAB, raw_ostream &OS,
+                                   MCCodeEmitter *Emitter_)
+    : MCStreamer(Kind, Context),
+      Assembler(new MCAssembler(Context, TAB, *Emitter_,
+                                *TAB.createObjectWriter(OS), OS)),
+      CurSectionData(0) {}
+
+MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context,
+                                   MCAsmBackend &TAB, raw_ostream &OS,
+                                   MCCodeEmitter *Emitter_,
+                                   MCAssembler *_Assembler)
+    : MCStreamer(Kind, Context), Assembler(_Assembler), CurSectionData(0) {}
+
+MCObjectStreamer::~MCObjectStreamer() {
+  delete &Assembler->getBackend();
+  delete &Assembler->getEmitter();
+  delete &Assembler->getWriter();
+  delete Assembler;
+}
+
+void MCObjectStreamer::reset() {
+  if (Assembler)
+    Assembler->reset();
+  CurSectionData = 0;
+  MCStreamer::reset();
+}
+
+MCFragment *MCObjectStreamer::getCurrentFragment() const {
+  assert(getCurrentSectionData() && "No current section!");
+
+  if (!getCurrentSectionData()->empty())
+    return &getCurrentSectionData()->getFragmentList().back();
+
+  return 0;
+}
+
+MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() const {
+  MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
+  // When bundling is enabled, we don't want to add data to a fragment that
+  // already has instructions (see MCELFStreamer::EmitInstToData for details)
+  if (!F || (Assembler->isBundlingEnabled() && F->hasInstructions()))
+    F = new MCDataFragment(getCurrentSectionData());
+  return F;
+}
+
+const MCExpr *MCObjectStreamer::AddValueSymbols(const MCExpr *Value) {
+  switch (Value->getKind()) {
+  case MCExpr::Target:
+    cast<MCTargetExpr>(Value)->AddValueSymbols(Assembler);
+    break;
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+    AddValueSymbols(BE->getLHS());
+    AddValueSymbols(BE->getRHS());
+    break;
+  }
+
+  case MCExpr::SymbolRef:
+    Assembler->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+    break;
+
+  case MCExpr::Unary:
+    AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr());
+    break;
+  }
+
+  return Value;
+}
+
+void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+                                     unsigned AddrSpace) {
+  assert(AddrSpace == 0 && "Address space must be 0!");
+  MCDataFragment *DF = getOrCreateDataFragment();
+
+  // Avoid fixups when possible.
+  int64_t AbsValue;
+  if (AddValueSymbols(Value)->EvaluateAsAbsolute(AbsValue, getAssembler())) {
+    EmitIntValue(AbsValue, Size, AddrSpace);
+    return;
+  }
+  DF->getFixups().push_back(
+      MCFixup::Create(DF->getContents().size(), Value,
+                      MCFixup::getKindForSize(Size, false)));
+  DF->getContents().resize(DF->getContents().size() + Size, 0);
+}
+
+void MCObjectStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+  RecordProcStart(Frame);
+}
+
+void MCObjectStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+  RecordProcEnd(Frame);
+}
+
+void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
+  MCStreamer::EmitLabel(Symbol);
+
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+
+  // FIXME: This is wasteful, we don't necessarily need to create a data
+  // fragment. Instead, we should mark the symbol as pointing into the data
+  // fragment if it exists, otherwise we should just queue the label and set its
+  // fragment pointer when we emit the next fragment.
+  MCDataFragment *F = getOrCreateDataFragment();
+  assert(!SD.getFragment() && "Unexpected fragment on symbol data!");
+  SD.setFragment(F);
+  SD.setOffset(F->getContents().size());
+}
+
+void MCObjectStreamer::EmitDebugLabel(MCSymbol *Symbol) {
+  EmitLabel(Symbol);
+}
+
+void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
+  int64_t IntValue;
+  if (Value->EvaluateAsAbsolute(IntValue, getAssembler())) {
+    EmitULEB128IntValue(IntValue);
+    return;
+  }
+  Value = ForceExpAbs(Value);
+  new MCLEBFragment(*Value, false, getCurrentSectionData());
+}
+
+void MCObjectStreamer::EmitSLEB128Value(const MCExpr *Value) {
+  int64_t IntValue;
+  if (Value->EvaluateAsAbsolute(IntValue, getAssembler())) {
+    EmitSLEB128IntValue(IntValue);
+    return;
+  }
+  Value = ForceExpAbs(Value);
+  new MCLEBFragment(*Value, true, getCurrentSectionData());
+}
+
+void MCObjectStreamer::EmitWeakReference(MCSymbol *Alias,
+                                         const MCSymbol *Symbol) {
+  report_fatal_error("This file format doesn't support weak aliases.");
+}
+
+void MCObjectStreamer::ChangeSection(const MCSection *Section) {
+  assert(Section && "Cannot switch to a null section!");
+
+  CurSectionData = &getAssembler().getOrCreateSectionData(*Section);
+}
+
+void MCObjectStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+  getAssembler().getOrCreateSymbolData(*Symbol);
+  Symbol->setVariableValue(AddValueSymbols(Value));
+}
+
+void MCObjectStreamer::EmitInstruction(const MCInst &Inst) {
+  // Scan for values.
+  for (unsigned i = Inst.getNumOperands(); i--; )
+    if (Inst.getOperand(i).isExpr())
+      AddValueSymbols(Inst.getOperand(i).getExpr());
+
+  MCSectionData *SD = getCurrentSectionData();
+  SD->setHasInstructions(true);
+
+  // Now that a machine instruction has been assembled into this section, make
+  // a line entry for any .loc directive that has been seen.
+  MCLineEntry::Make(this, getCurrentSection());
+
+  // If this instruction doesn't need relaxation, just emit it as data.
+  MCAssembler &Assembler = getAssembler();
+  if (!Assembler.getBackend().mayNeedRelaxation(Inst)) {
+    EmitInstToData(Inst);
+    return;
+  }
+
+  // Otherwise, relax and emit it as data if either:
+  // - The RelaxAll flag was passed
+  // - Bundling is enabled and this instruction is inside a bundle-locked
+  //   group. We want to emit all such instructions into the same data
+  //   fragment.
+  if (Assembler.getRelaxAll() ||
+      (Assembler.isBundlingEnabled() && SD->isBundleLocked())) {
+    MCInst Relaxed;
+    getAssembler().getBackend().relaxInstruction(Inst, Relaxed);
+    while (getAssembler().getBackend().mayNeedRelaxation(Relaxed))
+      getAssembler().getBackend().relaxInstruction(Relaxed, Relaxed);
+    EmitInstToData(Relaxed);
+    return;
+  }
+
+  // Otherwise emit to a separate fragment.
+  EmitInstToFragment(Inst);
+}
+
+void MCObjectStreamer::EmitInstToFragment(const MCInst &Inst) {
+  // Always create a new, separate fragment here, because its size can change
+  // during relaxation.
+  MCRelaxableFragment *IF =
+    new MCRelaxableFragment(Inst, getCurrentSectionData());
+
+  SmallString<128> Code;
+  raw_svector_ostream VecOS(Code);
+  getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, IF->getFixups());
+  VecOS.flush();
+  IF->getContents().append(Code.begin(), Code.end());
+}
+
+#ifndef NDEBUG
+static const char *BundlingNotImplementedMsg =
+  "Aligned bundling is not implemented for this object format";
+#endif
+
+void MCObjectStreamer::EmitBundleAlignMode(unsigned AlignPow2) {
+  llvm_unreachable(BundlingNotImplementedMsg);
+}
+
+void MCObjectStreamer::EmitBundleLock(bool AlignToEnd) {
+  llvm_unreachable(BundlingNotImplementedMsg);
+}
+
+void MCObjectStreamer::EmitBundleUnlock() {
+  llvm_unreachable(BundlingNotImplementedMsg);
+}
+
+void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+                                                const MCSymbol *LastLabel,
+                                                const MCSymbol *Label,
+                                                unsigned PointerSize) {
+  if (!LastLabel) {
+    EmitDwarfSetLineAddr(LineDelta, Label, PointerSize);
+    return;
+  }
+  const MCExpr *AddrDelta = BuildSymbolDiff(getContext(), Label, LastLabel);
+  int64_t Res;
+  if (AddrDelta->EvaluateAsAbsolute(Res, getAssembler())) {
+    MCDwarfLineAddr::Emit(this, LineDelta, Res);
+    return;
+  }
+  AddrDelta = ForceExpAbs(AddrDelta);
+  new MCDwarfLineAddrFragment(LineDelta, *AddrDelta, getCurrentSectionData());
+}
+
+void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
+                                                 const MCSymbol *Label) {
+  const MCExpr *AddrDelta = BuildSymbolDiff(getContext(), Label, LastLabel);
+  int64_t Res;
+  if (AddrDelta->EvaluateAsAbsolute(Res, getAssembler())) {
+    MCDwarfFrameEmitter::EmitAdvanceLoc(*this, Res);
+    return;
+  }
+  AddrDelta = ForceExpAbs(AddrDelta);
+  new MCDwarfCallFrameFragment(*AddrDelta, getCurrentSectionData());
+}
+
+void MCObjectStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
+  assert(AddrSpace == 0 && "Address space must be 0!");
+  getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
+}
+
+void MCObjectStreamer::EmitValueToAlignment(unsigned ByteAlignment,
+                                            int64_t Value,
+                                            unsigned ValueSize,
+                                            unsigned MaxBytesToEmit) {
+  if (MaxBytesToEmit == 0)
+    MaxBytesToEmit = ByteAlignment;
+  new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit,
+                      getCurrentSectionData());
+
+  // Update the maximum alignment on the current section if necessary.
+  if (ByteAlignment > getCurrentSectionData()->getAlignment())
+    getCurrentSectionData()->setAlignment(ByteAlignment);
+}
+
+void MCObjectStreamer::EmitCodeAlignment(unsigned ByteAlignment,
+                                         unsigned MaxBytesToEmit) {
+  EmitValueToAlignment(ByteAlignment, 0, 1, MaxBytesToEmit);
+  cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true);
+}
+
+bool MCObjectStreamer::EmitValueToOffset(const MCExpr *Offset,
+                                         unsigned char Value) {
+  int64_t Res;
+  if (Offset->EvaluateAsAbsolute(Res, getAssembler())) {
+    new MCOrgFragment(*Offset, Value, getCurrentSectionData());
+    return false;
+  }
+
+  MCSymbol *CurrentPos = getContext().CreateTempSymbol();
+  EmitLabel(CurrentPos);
+  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+  const MCExpr *Ref =
+    MCSymbolRefExpr::Create(CurrentPos, Variant, getContext());
+  const MCExpr *Delta =
+    MCBinaryExpr::Create(MCBinaryExpr::Sub, Offset, Ref, getContext());
+
+  if (!Delta->EvaluateAsAbsolute(Res, getAssembler()))
+    return true;
+  EmitFill(Res, Value);
+  return false;
+}
+
+// Associate GPRel32 fixup with data and resize data area
+void MCObjectStreamer::EmitGPRel32Value(const MCExpr *Value) {
+  MCDataFragment *DF = getOrCreateDataFragment();
+
+  DF->getFixups().push_back(MCFixup::Create(DF->getContents().size(), 
+                                            Value, FK_GPRel_4));
+  DF->getContents().resize(DF->getContents().size() + 4, 0);
+}
+
+// Associate GPRel32 fixup with data and resize data area
+void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
+  MCDataFragment *DF = getOrCreateDataFragment();
+
+  DF->getFixups().push_back(MCFixup::Create(DF->getContents().size(), 
+                                            Value, FK_GPRel_4));
+  DF->getContents().resize(DF->getContents().size() + 8, 0);
+}
+
+void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue,
+                                unsigned AddrSpace) {
+  assert(AddrSpace == 0 && "Address space must be 0!");
+  // FIXME: A MCFillFragment would be more memory efficient but MCExpr has
+  //        problems evaluating expressions across multiple fragments.
+  getOrCreateDataFragment()->getContents().append(NumBytes, FillValue);
+}
+
+void MCObjectStreamer::FinishImpl() {
+  // Dump out the dwarf file & directory tables and line tables.
+  const MCSymbol *LineSectionSymbol = NULL;
+  if (getContext().hasDwarfFiles())
+    LineSectionSymbol = MCDwarfFileTable::Emit(this);
+
+  // If we are generating dwarf for assembly source files dump out the sections.
+  if (getContext().getGenDwarfForAssembly())
+    MCGenDwarfInfo::Emit(this, LineSectionSymbol);
+
+  getAssembler().Finish();
+}
diff --git a/contrib/llvm/lib/MC/MCObjectWriter.cpp b/contrib/llvm/lib/MC/MCObjectWriter.cpp
new file mode 100644
index 000000000000..94d7cd6fd4f3
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCObjectWriter.cpp
@@ -0,0 +1,56 @@
+//===- lib/MC/MCObjectWriter.cpp - MCObjectWriter implementation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSymbol.h"
+
+using namespace llvm;
+
+MCObjectWriter::~MCObjectWriter() {
+}
+
+bool
+MCObjectWriter::IsSymbolRefDifferenceFullyResolved(const MCAssembler &Asm,
+                                                   const MCSymbolRefExpr *A,
+                                                   const MCSymbolRefExpr *B,
+                                                   bool InSet) const {
+  // Modified symbol references cannot be resolved.
+  if (A->getKind() != MCSymbolRefExpr::VK_None ||
+      B->getKind() != MCSymbolRefExpr::VK_None)
+    return false;
+
+  const MCSymbol &SA = A->getSymbol();
+  const MCSymbol &SB = B->getSymbol();
+  if (SA.AliasedSymbol().isUndefined() || SB.AliasedSymbol().isUndefined())
+    return false;
+
+  const MCSymbolData &DataA = Asm.getSymbolData(SA);
+  const MCSymbolData &DataB = Asm.getSymbolData(SB);
+  if(!DataA.getFragment() || !DataB.getFragment())
+    return false;
+
+  return IsSymbolRefDifferenceFullyResolvedImpl(Asm, DataA,
+                                                *DataB.getFragment(),
+                                                InSet,
+                                                false);
+}
+
+bool
+MCObjectWriter::IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+                                                      const MCSymbolData &DataA,
+                                                      const MCFragment &FB,
+                                                      bool InSet,
+                                                      bool IsPCRel) const {
+  const MCSection &SecA = DataA.getSymbol().AliasedSymbol().getSection();
+  const MCSection &SecB = FB.getParent()->getSection();
+  // On ELF and COFF  A - B is absolute if A and B are in the same section.
+  return &SecA == &SecB;
+}
diff --git a/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp b/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp
new file mode 100644
index 000000000000..c1c594a74697
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -0,0 +1,516 @@
+//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements the lexer for assembly files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCParser/AsmLexer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
+#include <cctype>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+using namespace llvm;
+
+AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI)  {
+  CurBuf = NULL;
+  CurPtr = NULL;
+  isAtStartOfLine = true;
+}
+
+AsmLexer::~AsmLexer() {
+}
+
+void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
+  CurBuf = buf;
+
+  if (ptr)
+    CurPtr = ptr;
+  else
+    CurPtr = CurBuf->getBufferStart();
+
+  TokStart = 0;
+}
+
+/// ReturnError - Set the error to the specified string at the specified
+/// location.  This is defined to always return AsmToken::Error.
+AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
+  SetError(SMLoc::getFromPointer(Loc), Msg);
+
+  return AsmToken(AsmToken::Error, StringRef(Loc, 0));
+}
+
+int AsmLexer::getNextChar() {
+  char CurChar = *CurPtr++;
+  switch (CurChar) {
+  default:
+    return (unsigned char)CurChar;
+  case 0:
+    // A nul character in the stream is either the end of the current buffer or
+    // a random nul in the file.  Disambiguate that here.
+    if (CurPtr-1 != CurBuf->getBufferEnd())
+      return 0;  // Just whitespace.
+
+    // Otherwise, return end of file.
+    --CurPtr;  // Another call to lex will return EOF again.
+    return EOF;
+  }
+}
+
+/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
+///
+/// The leading integral digit sequence and dot should have already been
+/// consumed, some or all of the fractional digit sequence *can* have been
+/// consumed.
+AsmToken AsmLexer::LexFloatLiteral() {
+  // Skip the fractional digit sequence.
+  while (isdigit(*CurPtr))
+    ++CurPtr;
+
+  // Check for exponent; we intentionally accept a slighlty wider set of
+  // literals here and rely on the upstream client to reject invalid ones (e.g.,
+  // "1e+").
+  if (*CurPtr == 'e' || *CurPtr == 'E') {
+    ++CurPtr;
+    if (*CurPtr == '-' || *CurPtr == '+')
+      ++CurPtr;
+    while (isdigit(*CurPtr))
+      ++CurPtr;
+  }
+
+  return AsmToken(AsmToken::Real,
+                  StringRef(TokStart, CurPtr - TokStart));
+}
+
+/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
+static bool IsIdentifierChar(char c) {
+  return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@';
+}
+AsmToken AsmLexer::LexIdentifier() {
+  // Check for floating point literals.
+  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
+    // Disambiguate a .1243foo identifier from a floating literal.
+    while (isdigit(*CurPtr))
+      ++CurPtr;
+    if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr))
+      return LexFloatLiteral();
+  }
+
+  while (IsIdentifierChar(*CurPtr))
+    ++CurPtr;
+
+  // Handle . as a special case.
+  if (CurPtr == TokStart+1 && TokStart[0] == '.')
+    return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
+
+  return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
+}
+
+/// LexSlash: Slash: /
+///           C-Style Comment: /* ... */
+AsmToken AsmLexer::LexSlash() {
+  switch (*CurPtr) {
+  case '*': break; // C style comment.
+  case '/': return ++CurPtr, LexLineComment();
+  default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
+  }
+
+  // C Style comment.
+  ++CurPtr;  // skip the star.
+  while (1) {
+    int CurChar = getNextChar();
+    switch (CurChar) {
+    case EOF:
+      return ReturnError(TokStart, "unterminated comment");
+    case '*':
+      // End of the comment?
+      if (CurPtr[0] != '/') break;
+
+      ++CurPtr;   // End the */.
+      return LexToken();
+    }
+  }
+}
+
+/// LexLineComment: Comment: #[^\n]*
+///                        : //[^\n]*
+AsmToken AsmLexer::LexLineComment() {
+  // FIXME: This is broken if we happen to a comment at the end of a file, which
+  // was .included, and which doesn't end with a newline.
+  int CurChar = getNextChar();
+  while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
+    CurChar = getNextChar();
+
+  if (CurChar == EOF)
+    return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
+  return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
+}
+
+static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
+  // Skip ULL, UL, U, L and LL suffices.
+  if (CurPtr[0] == 'U')
+    ++CurPtr;
+  if (CurPtr[0] == 'L')
+    ++CurPtr;
+  if (CurPtr[0] == 'L')
+    ++CurPtr;
+}
+
+// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
+// integer as a hexadecimal, possibly with leading zeroes.
+static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
+  const char *FirstHex = 0;
+  const char *LookAhead = CurPtr;
+  while (1) {
+    if (isdigit(*LookAhead)) {
+      ++LookAhead;
+    } else if (isxdigit(*LookAhead)) {
+      if (!FirstHex)
+        FirstHex = LookAhead;
+      ++LookAhead;
+    } else {
+      break;
+    }
+  }
+  bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
+  CurPtr = isHex || !FirstHex ? LookAhead : FirstHex;
+  if (isHex)
+    return 16;
+  return DefaultRadix;
+}
+
+/// LexDigit: First character is [0-9].
+///   Local Label: [0-9][:]
+///   Forward/Backward Label: [0-9][fb]
+///   Binary integer: 0b[01]+
+///   Octal integer: 0[0-7]+
+///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
+///   Decimal integer: [1-9][0-9]*
+AsmToken AsmLexer::LexDigit() {
+  // Decimal integer: [1-9][0-9]*
+  if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
+    unsigned Radix = doLookAhead(CurPtr, 10);
+    bool isHex = Radix == 16;
+    // Check for floating point literals.
+    if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
+      ++CurPtr;
+      return LexFloatLiteral();
+    }
+
+    StringRef Result(TokStart, CurPtr - TokStart);
+
+    long long Value;
+    if (Result.getAsInteger(Radix, Value)) {
+      // Allow positive values that are too large to fit into a signed 64-bit
+      // integer, but that do fit in an unsigned one, we just convert them over.
+      unsigned long long UValue;
+      if (Result.getAsInteger(Radix, UValue))
+        return ReturnError(TokStart, !isHex ? "invalid decimal number" :
+                           "invalid hexdecimal number");
+      Value = (long long)UValue;
+    }
+
+    // Consume the [bB][hH].
+    if (Radix == 2 || Radix == 16)
+      ++CurPtr;
+
+    // The darwin/x86 (and x86-64) assembler accepts and ignores type
+    // suffices on integer literals.
+    SkipIgnoredIntegerSuffix(CurPtr);
+
+    return AsmToken(AsmToken::Integer, Result, Value);
+  }
+
+  if (*CurPtr == 'b') {
+    ++CurPtr;
+    // See if we actually have "0b" as part of something like "jmp 0b\n"
+    if (!isdigit(CurPtr[0])) {
+      --CurPtr;
+      StringRef Result(TokStart, CurPtr - TokStart);
+      return AsmToken(AsmToken::Integer, Result, 0);
+    }
+    const char *NumStart = CurPtr;
+    while (CurPtr[0] == '0' || CurPtr[0] == '1')
+      ++CurPtr;
+
+    // Requires at least one binary digit.
+    if (CurPtr == NumStart)
+      return ReturnError(TokStart, "invalid binary number");
+
+    StringRef Result(TokStart, CurPtr - TokStart);
+
+    long long Value;
+    if (Result.substr(2).getAsInteger(2, Value))
+      return ReturnError(TokStart, "invalid binary number");
+
+    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
+    // suffixes on integer literals.
+    SkipIgnoredIntegerSuffix(CurPtr);
+
+    return AsmToken(AsmToken::Integer, Result, Value);
+  }
+
+  if (*CurPtr == 'x') {
+    ++CurPtr;
+    const char *NumStart = CurPtr;
+    while (isxdigit(CurPtr[0]))
+      ++CurPtr;
+
+    // Requires at least one hex digit.
+    if (CurPtr == NumStart)
+      return ReturnError(CurPtr-2, "invalid hexadecimal number");
+
+    unsigned long long Result;
+    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
+      return ReturnError(TokStart, "invalid hexadecimal number");
+
+    // Consume the optional [hH].
+    if (*CurPtr == 'h' || *CurPtr == 'H')
+      ++CurPtr;
+
+    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
+    // suffixes on integer literals.
+    SkipIgnoredIntegerSuffix(CurPtr);
+
+    return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart),
+                    (int64_t)Result);
+  }
+
+  // Either octal or hexadecimal.
+  long long Value;
+  unsigned Radix = doLookAhead(CurPtr, 8);
+  bool isHex = Radix == 16;
+  StringRef Result(TokStart, CurPtr - TokStart);
+  if (Result.getAsInteger(Radix, Value))
+    return ReturnError(TokStart, !isHex ? "invalid octal number" :
+                       "invalid hexdecimal number");
+
+  // Consume the [hH].
+  if (Radix == 16)
+    ++CurPtr;
+
+  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
+  // suffixes on integer literals.
+  SkipIgnoredIntegerSuffix(CurPtr);
+
+  return AsmToken(AsmToken::Integer, Result, Value);
+}
+
+/// LexSingleQuote: Integer: 'b'
+AsmToken AsmLexer::LexSingleQuote() {
+  int CurChar = getNextChar();
+
+  if (CurChar == '\\')
+    CurChar = getNextChar();
+
+  if (CurChar == EOF)
+    return ReturnError(TokStart, "unterminated single quote");
+
+  CurChar = getNextChar();
+
+  if (CurChar != '\'')
+    return ReturnError(TokStart, "single quote way too long");
+
+  // The idea here being that 'c' is basically just an integral
+  // constant.
+  StringRef Res = StringRef(TokStart,CurPtr - TokStart);
+  long long Value;
+
+  if (Res.startswith("\'\\")) {
+    char theChar = Res[2];
+    switch (theChar) {
+      default: Value = theChar; break;
+      case '\'': Value = '\''; break;
+      case 't': Value = '\t'; break;
+      case 'n': Value = '\n'; break;
+      case 'b': Value = '\b'; break;
+    }
+  } else
+    Value = TokStart[1];
+
+  return AsmToken(AsmToken::Integer, Res, Value);
+}
+
+
+/// LexQuote: String: "..."
+AsmToken AsmLexer::LexQuote() {
+  int CurChar = getNextChar();
+  // TODO: does gas allow multiline string constants?
+  while (CurChar != '"') {
+    if (CurChar == '\\') {
+      // Allow \", etc.
+      CurChar = getNextChar();
+    }
+
+    if (CurChar == EOF)
+      return ReturnError(TokStart, "unterminated string constant");
+
+    CurChar = getNextChar();
+  }
+
+  return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
+}
+
+StringRef AsmLexer::LexUntilEndOfStatement() {
+  TokStart = CurPtr;
+
+  while (!isAtStartOfComment(*CurPtr) &&    // Start of line comment.
+         !isAtStatementSeparator(CurPtr) && // End of statement marker.
+         *CurPtr != '\n' &&
+         *CurPtr != '\r' &&
+         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
+    ++CurPtr;
+  }
+  return StringRef(TokStart, CurPtr-TokStart);
+}
+
+StringRef AsmLexer::LexUntilEndOfLine() {
+  TokStart = CurPtr;
+
+  while (*CurPtr != '\n' &&
+         *CurPtr != '\r' &&
+         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
+    ++CurPtr;
+  }
+  return StringRef(TokStart, CurPtr-TokStart);
+}
+
+bool AsmLexer::isAtStartOfComment(char Char) {
+  // FIXME: This won't work for multi-character comment indicators like "//".
+  return Char == *MAI.getCommentString();
+}
+
+bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
+  return strncmp(Ptr, MAI.getSeparatorString(),
+                 strlen(MAI.getSeparatorString())) == 0;
+}
+
+AsmToken AsmLexer::LexToken() {
+  TokStart = CurPtr;
+  // This always consumes at least one character.
+  int CurChar = getNextChar();
+
+  if (isAtStartOfComment(CurChar)) {
+    // If this comment starts with a '#', then return the Hash token and let
+    // the assembler parser see if it can be parsed as a cpp line filename
+    // comment. We do this only if we are at the start of a line.
+    if (CurChar == '#' && isAtStartOfLine)
+      return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
+    isAtStartOfLine = true;
+    return LexLineComment();
+  }
+  if (isAtStatementSeparator(TokStart)) {
+    CurPtr += strlen(MAI.getSeparatorString()) - 1;
+    return AsmToken(AsmToken::EndOfStatement,
+                    StringRef(TokStart, strlen(MAI.getSeparatorString())));
+  }
+
+  // If we're missing a newline at EOF, make sure we still get an
+  // EndOfStatement token before the Eof token.
+  if (CurChar == EOF && !isAtStartOfLine) {
+    isAtStartOfLine = true;
+    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
+  }
+
+  isAtStartOfLine = false;
+  switch (CurChar) {
+  default:
+    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
+    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
+      return LexIdentifier();
+
+    // Unknown character, emit an error.
+    return ReturnError(TokStart, "invalid character in input");
+  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
+  case 0:
+  case ' ':
+  case '\t':
+    if (SkipSpace) {
+      // Ignore whitespace.
+      return LexToken();
+    } else {
+      int len = 1;
+      while (*CurPtr==' ' || *CurPtr=='\t') {
+        CurPtr++;
+        len++;
+      }
+      return AsmToken(AsmToken::Space, StringRef(TokStart, len));
+    }
+  case '\n': // FALL THROUGH.
+  case '\r':
+    isAtStartOfLine = true;
+    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
+  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
+  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
+  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
+  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
+  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
+  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
+  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
+  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
+  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
+  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
+  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
+  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
+  case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
+  case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
+  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
+  case '=':
+    if (*CurPtr == '=')
+      return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
+    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
+  case '|':
+    if (*CurPtr == '|')
+      return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
+    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
+  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
+  case '&':
+    if (*CurPtr == '&')
+      return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
+    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
+  case '!':
+    if (*CurPtr == '=')
+      return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
+    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
+  case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
+  case '/': return LexSlash();
+  case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
+  case '\'': return LexSingleQuote();
+  case '"': return LexQuote();
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+    return LexDigit();
+  case '<':
+    switch (*CurPtr) {
+    case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
+                                        StringRef(TokStart, 2));
+    case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
+                                        StringRef(TokStart, 2));
+    case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
+                                        StringRef(TokStart, 2));
+    default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
+    }
+  case '>':
+    switch (*CurPtr) {
+    case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
+                                        StringRef(TokStart, 2));
+    case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
+                                        StringRef(TokStart, 2));
+    default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
+    }
+
+  // TODO: Quoted identifiers (objc methods etc)
+  // local labels: [0-9][:]
+  // Forward/backward labels: [0-9][fb]
+  // Integers, fp constants, character constants.
+  }
+}
diff --git a/contrib/llvm/lib/MC/MCParser/AsmParser.cpp b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp
new file mode 100644
index 000000000000..804734cea939
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -0,0 +1,4272 @@
+//===- AsmParser.cpp - Parser for Assembly Files --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements the parser for assembly files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/AsmCond.h"
+#include "llvm/MC/MCParser/AsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+#include <set>
+#include <string>
+#include <vector>
+using namespace llvm;
+
+static cl::opt<bool>
+FatalAssemblerWarnings("fatal-assembler-warnings",
+                       cl::desc("Consider warnings as error"));
+
+MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {}
+
+namespace {
+
+/// \brief Helper types for tracking macro definitions.
+typedef std::vector<AsmToken> MCAsmMacroArgument;
+typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;
+typedef std::pair<StringRef, MCAsmMacroArgument> MCAsmMacroParameter;
+typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
+
+struct MCAsmMacro {
+  StringRef Name;
+  StringRef Body;
+  MCAsmMacroParameters Parameters;
+
+public:
+  MCAsmMacro(StringRef N, StringRef B, const MCAsmMacroParameters &P) :
+    Name(N), Body(B), Parameters(P) {}
+
+  MCAsmMacro(const MCAsmMacro& Other)
+    : Name(Other.Name), Body(Other.Body), Parameters(Other.Parameters) {}
+};
+
+/// \brief Helper class for storing information about an active macro
+/// instantiation.
+struct MacroInstantiation {
+  /// The macro being instantiated.
+  const MCAsmMacro *TheMacro;
+
+  /// The macro instantiation with substitutions.
+  MemoryBuffer *Instantiation;
+
+  /// The location of the instantiation.
+  SMLoc InstantiationLoc;
+
+  /// The buffer where parsing should resume upon instantiation completion.
+  int ExitBuffer;
+
+  /// The location where parsing should resume upon instantiation completion.
+  SMLoc ExitLoc;
+
+public:
+  MacroInstantiation(const MCAsmMacro *M, SMLoc IL, int EB, SMLoc EL,
+                     MemoryBuffer *I);
+};
+
+struct ParseStatementInfo {
+  /// ParsedOperands - The parsed operands from the last parsed statement.
+  SmallVector<MCParsedAsmOperand*, 8> ParsedOperands;
+
+  /// Opcode - The opcode from the last parsed instruction.
+  unsigned Opcode;
+
+  /// Error - Was there an error parsing the inline assembly?
+  bool ParseError;
+
+  SmallVectorImpl<AsmRewrite> *AsmRewrites;
+
+  ParseStatementInfo() : Opcode(~0U), ParseError(false), AsmRewrites(0) {}
+  ParseStatementInfo(SmallVectorImpl<AsmRewrite> *rewrites)
+    : Opcode(~0), ParseError(false), AsmRewrites(rewrites) {}
+
+  ~ParseStatementInfo() {
+    // Free any parsed operands.
+    for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i)
+      delete ParsedOperands[i];
+    ParsedOperands.clear();
+  }
+};
+
+/// \brief The concrete assembly parser instance.
+class AsmParser : public MCAsmParser {
+  AsmParser(const AsmParser &) LLVM_DELETED_FUNCTION;
+  void operator=(const AsmParser &) LLVM_DELETED_FUNCTION;
+private:
+  AsmLexer Lexer;
+  MCContext &Ctx;
+  MCStreamer &Out;
+  const MCAsmInfo &MAI;
+  SourceMgr &SrcMgr;
+  SourceMgr::DiagHandlerTy SavedDiagHandler;
+  void *SavedDiagContext;
+  MCAsmParserExtension *PlatformParser;
+
+  /// This is the current buffer index we're lexing from as managed by the
+  /// SourceMgr object.
+  int CurBuffer;
+
+  AsmCond TheCondState;
+  std::vector<AsmCond> TheCondStack;
+
+  /// ExtensionDirectiveMap - maps directive names to handler methods in parser
+  /// extensions. Extensions register themselves in this map by calling
+  /// addDirectiveHandler.
+  StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
+
+  /// MacroMap - Map of currently defined macros.
+  StringMap<MCAsmMacro*> MacroMap;
+
+  /// ActiveMacros - Stack of active macro instantiations.
+  std::vector<MacroInstantiation*> ActiveMacros;
+
+  /// Boolean tracking whether macro substitution is enabled.
+  unsigned MacrosEnabledFlag : 1;
+
+  /// Flag tracking whether any errors have been encountered.
+  unsigned HadError : 1;
+
+  /// The values from the last parsed cpp hash file line comment if any.
+  StringRef CppHashFilename;
+  int64_t CppHashLineNumber;
+  SMLoc CppHashLoc;
+  int CppHashBuf;
+
+  /// AssemblerDialect. ~OU means unset value and use value provided by MAI.
+  unsigned AssemblerDialect;
+
+  /// IsDarwin - is Darwin compatibility enabled?
+  bool IsDarwin;
+
+  /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
+  bool ParsingInlineAsm;
+
+public:
+  AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
+            const MCAsmInfo &MAI);
+  virtual ~AsmParser();
+
+  virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false);
+
+  virtual void addDirectiveHandler(StringRef Directive,
+                                   ExtensionDirectiveHandler Handler) {
+    ExtensionDirectiveMap[Directive] = Handler;
+  }
+
+public:
+  /// @name MCAsmParser Interface
+  /// {
+
+  virtual SourceMgr &getSourceManager() { return SrcMgr; }
+  virtual MCAsmLexer &getLexer() { return Lexer; }
+  virtual MCContext &getContext() { return Ctx; }
+  virtual MCStreamer &getStreamer() { return Out; }
+  virtual unsigned getAssemblerDialect() {
+    if (AssemblerDialect == ~0U)
+      return MAI.getAssemblerDialect();
+    else
+      return AssemblerDialect;
+  }
+  virtual void setAssemblerDialect(unsigned i) {
+    AssemblerDialect = i;
+  }
+
+  virtual bool Warning(SMLoc L, const Twine &Msg,
+                       ArrayRef<SMRange> Ranges = ArrayRef<SMRange>());
+  virtual bool Error(SMLoc L, const Twine &Msg,
+                     ArrayRef<SMRange> Ranges = ArrayRef<SMRange>());
+
+  virtual const AsmToken &Lex();
+
+  void setParsingInlineAsm(bool V) { ParsingInlineAsm = V; }
+  bool isParsingInlineAsm() { return ParsingInlineAsm; }
+
+  bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
+                        unsigned &NumOutputs, unsigned &NumInputs,
+                        SmallVectorImpl<std::pair<void *,bool> > &OpDecls,
+                        SmallVectorImpl<std::string> &Constraints,
+                        SmallVectorImpl<std::string> &Clobbers,
+                        const MCInstrInfo *MII,
+                        const MCInstPrinter *IP,
+                        MCAsmParserSemaCallback &SI);
+
+  bool parseExpression(const MCExpr *&Res);
+  virtual bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc);
+  virtual bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc);
+  virtual bool parseAbsoluteExpression(int64_t &Res);
+
+  /// parseIdentifier - Parse an identifier or string (as a quoted identifier)
+  /// and set \p Res to the identifier contents.
+  virtual bool parseIdentifier(StringRef &Res);
+  virtual void eatToEndOfStatement();
+
+  virtual void checkForValidSection();
+  /// }
+
+private:
+
+  bool ParseStatement(ParseStatementInfo &Info);
+  void EatToEndOfLine();
+  bool ParseCppHashLineFilenameComment(const SMLoc &L);
+
+  void CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
+                        MCAsmMacroParameters Parameters);
+  bool expandMacro(raw_svector_ostream &OS, StringRef Body,
+                   const MCAsmMacroParameters &Parameters,
+                   const MCAsmMacroArguments &A,
+                   const SMLoc &L);
+
+  /// \brief Are macros enabled in the parser?
+  bool MacrosEnabled() {return MacrosEnabledFlag;}
+
+  /// \brief Control a flag in the parser that enables or disables macros.
+  void SetMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
+
+  /// \brief Lookup a previously defined macro.
+  /// \param Name Macro name.
+  /// \returns Pointer to macro. NULL if no such macro was defined.
+  const MCAsmMacro* LookupMacro(StringRef Name);
+
+  /// \brief Define a new macro with the given name and information.
+  void DefineMacro(StringRef Name, const MCAsmMacro& Macro);
+
+  /// \brief Undefine a macro. If no such macro was defined, it's a no-op.
+  void UndefineMacro(StringRef Name);
+
+  /// \brief Are we inside a macro instantiation?
+  bool InsideMacroInstantiation() {return !ActiveMacros.empty();}
+
+  /// \brief Handle entry to macro instantiation. 
+  ///
+  /// \param M The macro.
+  /// \param NameLoc Instantiation location.
+  bool HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
+
+  /// \brief Handle exit from macro instantiation.
+  void HandleMacroExit();
+
+  /// \brief Extract AsmTokens for a macro argument. If the argument delimiter
+  /// is initially unknown, set it to AsmToken::Eof. It will be set to the
+  /// correct delimiter by the method.
+  bool ParseMacroArgument(MCAsmMacroArgument &MA,
+                          AsmToken::TokenKind &ArgumentDelimiter);
+
+  /// \brief Parse all macro arguments for a given macro.
+  bool ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
+
+  void PrintMacroInstantiations();
+  void PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
+                    ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) const {
+    SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
+  }
+  static void DiagHandler(const SMDiagnostic &Diag, void *Context);
+
+  /// EnterIncludeFile - Enter the specified file. This returns true on failure.
+  bool EnterIncludeFile(const std::string &Filename);
+  /// ProcessIncbinFile - Process the specified file for the .incbin directive.
+  /// This returns true on failure.
+  bool ProcessIncbinFile(const std::string &Filename);
+
+  /// \brief Reset the current lexer position to that given by \p Loc. The
+  /// current token is not set; clients should ensure Lex() is called
+  /// subsequently.
+  ///
+  /// \param InBuffer If not -1, should be the known buffer id that contains the
+  /// location.
+  void JumpToLoc(SMLoc Loc, int InBuffer=-1);
+
+  /// \brief Parse up to the end of statement and a return the contents from the
+  /// current token until the end of the statement; the current token on exit
+  /// will be either the EndOfStatement or EOF.
+  virtual StringRef parseStringToEndOfStatement();
+
+  /// \brief Parse until the end of a statement or a comma is encountered,
+  /// return the contents from the current token up to the end or comma.
+  StringRef ParseStringToComma();
+
+  bool ParseAssignment(StringRef Name, bool allow_redef,
+                       bool NoDeadStrip = false);
+
+  bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc);
+  bool ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
+  bool ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
+  bool ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
+
+  bool ParseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);
+
+  // Generic (target and platform independent) directive parsing.
+  enum DirectiveKind {
+    DK_NO_DIRECTIVE, // Placeholder
+    DK_SET, DK_EQU, DK_EQUIV, DK_ASCII, DK_ASCIZ, DK_STRING, DK_BYTE, DK_SHORT,
+    DK_VALUE, DK_2BYTE, DK_LONG, DK_INT, DK_4BYTE, DK_QUAD, DK_8BYTE, DK_SINGLE,
+    DK_FLOAT, DK_DOUBLE, DK_ALIGN, DK_ALIGN32, DK_BALIGN, DK_BALIGNW,
+    DK_BALIGNL, DK_P2ALIGN, DK_P2ALIGNW, DK_P2ALIGNL, DK_ORG, DK_FILL, DK_ENDR,
+    DK_BUNDLE_ALIGN_MODE, DK_BUNDLE_LOCK, DK_BUNDLE_UNLOCK,
+    DK_ZERO, DK_EXTERN, DK_GLOBL, DK_GLOBAL, DK_INDIRECT_SYMBOL,
+    DK_LAZY_REFERENCE, DK_NO_DEAD_STRIP, DK_SYMBOL_RESOLVER, DK_PRIVATE_EXTERN,
+    DK_REFERENCE, DK_WEAK_DEFINITION, DK_WEAK_REFERENCE,
+    DK_WEAK_DEF_CAN_BE_HIDDEN, DK_COMM, DK_COMMON, DK_LCOMM, DK_ABORT,
+    DK_INCLUDE, DK_INCBIN, DK_CODE16, DK_CODE16GCC, DK_REPT, DK_IRP, DK_IRPC,
+    DK_IF, DK_IFB, DK_IFNB, DK_IFC, DK_IFNC, DK_IFDEF, DK_IFNDEF, DK_IFNOTDEF,
+    DK_ELSEIF, DK_ELSE, DK_ENDIF,
+    DK_SPACE, DK_SKIP, DK_FILE, DK_LINE, DK_LOC, DK_STABS,
+    DK_CFI_SECTIONS, DK_CFI_STARTPROC, DK_CFI_ENDPROC, DK_CFI_DEF_CFA,
+    DK_CFI_DEF_CFA_OFFSET, DK_CFI_ADJUST_CFA_OFFSET, DK_CFI_DEF_CFA_REGISTER,
+    DK_CFI_OFFSET, DK_CFI_REL_OFFSET, DK_CFI_PERSONALITY, DK_CFI_LSDA,
+    DK_CFI_REMEMBER_STATE, DK_CFI_RESTORE_STATE, DK_CFI_SAME_VALUE,
+    DK_CFI_RESTORE, DK_CFI_ESCAPE, DK_CFI_SIGNAL_FRAME, DK_CFI_UNDEFINED,
+    DK_CFI_REGISTER,
+    DK_MACROS_ON, DK_MACROS_OFF, DK_MACRO, DK_ENDM, DK_ENDMACRO, DK_PURGEM,
+    DK_SLEB128, DK_ULEB128
+  };
+
+  /// DirectiveKindMap - Maps directive name --> DirectiveKind enum, for
+  /// directives parsed by this class.
+  StringMap<DirectiveKind> DirectiveKindMap;
+
+  // ".ascii", ".asciz", ".string"
+  bool ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
+  bool ParseDirectiveValue(unsigned Size); // ".byte", ".long", ...
+  bool ParseDirectiveRealValue(const fltSemantics &); // ".single", ...
+  bool ParseDirectiveFill(); // ".fill"
+  bool ParseDirectiveZero(); // ".zero"
+  // ".set", ".equ", ".equiv"
+  bool ParseDirectiveSet(StringRef IDVal, bool allow_redef);
+  bool ParseDirectiveOrg(); // ".org"
+  // ".align{,32}", ".p2align{,w,l}"
+  bool ParseDirectiveAlign(bool IsPow2, unsigned ValueSize);
+
+  // ".file", ".line", ".loc", ".stabs"
+  bool ParseDirectiveFile(SMLoc DirectiveLoc);
+  bool ParseDirectiveLine();
+  bool ParseDirectiveLoc();
+  bool ParseDirectiveStabs();
+
+  // .cfi directives
+  bool ParseDirectiveCFIRegister(SMLoc DirectiveLoc);
+  bool ParseDirectiveCFISections();
+  bool ParseDirectiveCFIStartProc();
+  bool ParseDirectiveCFIEndProc();
+  bool ParseDirectiveCFIDefCfaOffset();
+  bool ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
+  bool ParseDirectiveCFIAdjustCfaOffset();
+  bool ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
+  bool ParseDirectiveCFIOffset(SMLoc DirectiveLoc);
+  bool ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
+  bool ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
+  bool ParseDirectiveCFIRememberState();
+  bool ParseDirectiveCFIRestoreState();
+  bool ParseDirectiveCFISameValue(SMLoc DirectiveLoc);
+  bool ParseDirectiveCFIRestore(SMLoc DirectiveLoc);
+  bool ParseDirectiveCFIEscape();
+  bool ParseDirectiveCFISignalFrame();
+  bool ParseDirectiveCFIUndefined(SMLoc DirectiveLoc);
+
+  // macro directives
+  bool ParseDirectivePurgeMacro(SMLoc DirectiveLoc);
+  bool ParseDirectiveEndMacro(StringRef Directive);
+  bool ParseDirectiveMacro(SMLoc DirectiveLoc);
+  bool ParseDirectiveMacrosOnOff(StringRef Directive);
+
+  // ".bundle_align_mode"
+  bool ParseDirectiveBundleAlignMode();
+  // ".bundle_lock"
+  bool ParseDirectiveBundleLock();
+  // ".bundle_unlock"
+  bool ParseDirectiveBundleUnlock();
+
+  // ".space", ".skip"
+  bool ParseDirectiveSpace(StringRef IDVal);
+
+  // .sleb128 (Signed=true) and .uleb128 (Signed=false)
+  bool ParseDirectiveLEB128(bool Signed);
+
+  /// ParseDirectiveSymbolAttribute - Parse a directive like ".globl" which
+  /// accepts a single symbol (which should be a label or an external).
+  bool ParseDirectiveSymbolAttribute(MCSymbolAttr Attr);
+
+  bool ParseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
+
+  bool ParseDirectiveAbort(); // ".abort"
+  bool ParseDirectiveInclude(); // ".include"
+  bool ParseDirectiveIncbin(); // ".incbin"
+
+  bool ParseDirectiveIf(SMLoc DirectiveLoc); // ".if"
+  // ".ifb" or ".ifnb", depending on ExpectBlank.
+  bool ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
+  // ".ifc" or ".ifnc", depending on ExpectEqual.
+  bool ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
+  // ".ifdef" or ".ifndef", depending on expect_defined
+  bool ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
+  bool ParseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
+  bool ParseDirectiveElse(SMLoc DirectiveLoc); // ".else"
+  bool ParseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
+  virtual bool parseEscapedString(std::string &Data);
+
+  const MCExpr *ApplyModifierToExpr(const MCExpr *E,
+                                    MCSymbolRefExpr::VariantKind Variant);
+
+  // Macro-like directives
+  MCAsmMacro *ParseMacroLikeBody(SMLoc DirectiveLoc);
+  void InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+                                raw_svector_ostream &OS);
+  bool ParseDirectiveRept(SMLoc DirectiveLoc); // ".rept"
+  bool ParseDirectiveIrp(SMLoc DirectiveLoc);  // ".irp"
+  bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
+  bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
+
+  // "_emit" or "__emit"
+  bool ParseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
+                            size_t Len);
+
+  // "align"
+  bool ParseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);
+
+  void initializeDirectiveKindMap();
+};
+}
+
+namespace llvm {
+
+extern MCAsmParserExtension *createDarwinAsmParser();
+extern MCAsmParserExtension *createELFAsmParser();
+extern MCAsmParserExtension *createCOFFAsmParser();
+
+}
+
+enum { DEFAULT_ADDRSPACE = 0 };
+
+AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx,
+                     MCStreamer &_Out, const MCAsmInfo &_MAI)
+  : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
+    PlatformParser(0),
+    CurBuffer(0), MacrosEnabledFlag(true), CppHashLineNumber(0),
+    AssemblerDialect(~0U), IsDarwin(false), ParsingInlineAsm(false) {
+  // Save the old handler.
+  SavedDiagHandler = SrcMgr.getDiagHandler();
+  SavedDiagContext = SrcMgr.getDiagContext();
+  // Set our own handler which calls the saved handler.
+  SrcMgr.setDiagHandler(DiagHandler, this);
+  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
+
+  // Initialize the platform / file format parser.
+  //
+  // FIXME: This is a hack, we need to (majorly) cleanup how these objects are
+  // created.
+  if (_MAI.hasMicrosoftFastStdCallMangling()) {
+    PlatformParser = createCOFFAsmParser();
+    PlatformParser->Initialize(*this);
+  } else if (_MAI.hasSubsectionsViaSymbols()) {
+    PlatformParser = createDarwinAsmParser();
+    PlatformParser->Initialize(*this);
+    IsDarwin = true;
+  } else {
+    PlatformParser = createELFAsmParser();
+    PlatformParser->Initialize(*this);
+  }
+
+  initializeDirectiveKindMap();
+}
+
+AsmParser::~AsmParser() {
+  assert(ActiveMacros.empty() && "Unexpected active macro instantiation!");
+
+  // Destroy any macros.
+  for (StringMap<MCAsmMacro*>::iterator it = MacroMap.begin(),
+         ie = MacroMap.end(); it != ie; ++it)
+    delete it->getValue();
+
+  delete PlatformParser;
+}
+
+void AsmParser::PrintMacroInstantiations() {
+  // Print the active macro instantiation stack.
+  for (std::vector<MacroInstantiation*>::const_reverse_iterator
+         it = ActiveMacros.rbegin(), ie = ActiveMacros.rend(); it != ie; ++it)
+    PrintMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
+                 "while in macro instantiation");
+}
+
+bool AsmParser::Warning(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
+  if (FatalAssemblerWarnings)
+    return Error(L, Msg, Ranges);
+  PrintMessage(L, SourceMgr::DK_Warning, Msg, Ranges);
+  PrintMacroInstantiations();
+  return false;
+}
+
+bool AsmParser::Error(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
+  HadError = true;
+  PrintMessage(L, SourceMgr::DK_Error, Msg, Ranges);
+  PrintMacroInstantiations();
+  return true;
+}
+
+bool AsmParser::EnterIncludeFile(const std::string &Filename) {
+  std::string IncludedFile;
+  int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
+  if (NewBuf == -1)
+    return true;
+
+  CurBuffer = NewBuf;
+
+  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
+
+  return false;
+}
+
+/// Process the specified .incbin file by seaching for it in the include paths
+/// then just emitting the byte contents of the file to the streamer. This
+/// returns true on failure.
+bool AsmParser::ProcessIncbinFile(const std::string &Filename) {
+  std::string IncludedFile;
+  int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
+  if (NewBuf == -1)
+    return true;
+
+  // Pick up the bytes from the file and emit them.
+  getStreamer().EmitBytes(SrcMgr.getMemoryBuffer(NewBuf)->getBuffer(),
+                          DEFAULT_ADDRSPACE);
+  return false;
+}
+
+void AsmParser::JumpToLoc(SMLoc Loc, int InBuffer) {
+  if (InBuffer != -1) {
+    CurBuffer = InBuffer;
+  } else {
+    CurBuffer = SrcMgr.FindBufferContainingLoc(Loc);
+  }
+  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer), Loc.getPointer());
+}
+
+const AsmToken &AsmParser::Lex() {
+  const AsmToken *tok = &Lexer.Lex();
+
+  if (tok->is(AsmToken::Eof)) {
+    // If this is the end of an included file, pop the parent file off the
+    // include stack.
+    SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
+    if (ParentIncludeLoc != SMLoc()) {
+      JumpToLoc(ParentIncludeLoc);
+      tok = &Lexer.Lex();
+    }
+  }
+
+  if (tok->is(AsmToken::Error))
+    Error(Lexer.getErrLoc(), Lexer.getErr());
+
+  return *tok;
+}
+
+bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
+  // Create the initial section, if requested.
+  if (!NoInitialTextSection)
+    Out.InitSections();
+
+  // Prime the lexer.
+  Lex();
+
+  HadError = false;
+  AsmCond StartingCondState = TheCondState;
+
+  // If we are generating dwarf for assembly source files save the initial text
+  // section and generate a .file directive.
+  if (getContext().getGenDwarfForAssembly()) {
+    getContext().setGenDwarfSection(getStreamer().getCurrentSection());
+    MCSymbol *SectionStartSym = getContext().CreateTempSymbol();
+    getStreamer().EmitLabel(SectionStartSym);
+    getContext().setGenDwarfSectionStartSym(SectionStartSym);
+    getStreamer().EmitDwarfFileDirective(getContext().nextGenDwarfFileNumber(),
+                                         StringRef(),
+                                         getContext().getMainFileName());
+  }
+
+  // While we have input, parse each statement.
+  while (Lexer.isNot(AsmToken::Eof)) {
+    ParseStatementInfo Info;
+    if (!ParseStatement(Info)) continue;
+
+    // We had an error, validate that one was emitted and recover by skipping to
+    // the next line.
+    assert(HadError && "Parse statement returned an error, but none emitted!");
+    eatToEndOfStatement();
+  }
+
+  if (TheCondState.TheCond != StartingCondState.TheCond ||
+      TheCondState.Ignore != StartingCondState.Ignore)
+    return TokError("unmatched .ifs or .elses");
+
+  // Check to see there are no empty DwarfFile slots.
+  const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
+    getContext().getMCDwarfFiles();
+  for (unsigned i = 1; i < MCDwarfFiles.size(); i++) {
+    if (!MCDwarfFiles[i])
+      TokError("unassigned file number: " + Twine(i) + " for .file directives");
+  }
+
+  // Check to see that all assembler local symbols were actually defined.
+  // Targets that don't do subsections via symbols may not want this, though,
+  // so conservatively exclude them. Only do this if we're finalizing, though,
+  // as otherwise we won't necessarilly have seen everything yet.
+  if (!NoFinalize && MAI.hasSubsectionsViaSymbols()) {
+    const MCContext::SymbolTable &Symbols = getContext().getSymbols();
+    for (MCContext::SymbolTable::const_iterator i = Symbols.begin(),
+         e = Symbols.end();
+         i != e; ++i) {
+      MCSymbol *Sym = i->getValue();
+      // Variable symbols may not be marked as defined, so check those
+      // explicitly. If we know it's a variable, we have a definition for
+      // the purposes of this check.
+      if (Sym->isTemporary() && !Sym->isVariable() && !Sym->isDefined())
+        // FIXME: We would really like to refer back to where the symbol was
+        // first referenced for a source location. We need to add something
+        // to track that. Currently, we just point to the end of the file.
+        PrintMessage(getLexer().getLoc(), SourceMgr::DK_Error,
+                     "assembler local symbol '" + Sym->getName() +
+                     "' not defined");
+    }
+  }
+
+
+  // Finalize the output stream if there are no errors and if the client wants
+  // us to.
+  if (!HadError && !NoFinalize)
+    Out.Finish();
+
+  return HadError;
+}
+
+void AsmParser::checkForValidSection() {
+  if (!ParsingInlineAsm && !getStreamer().getCurrentSection()) {
+    TokError("expected section directive before assembly directive");
+    Out.InitToTextSection();
+  }
+}
+
+/// eatToEndOfStatement - Throw away the rest of the line for testing purposes.
+void AsmParser::eatToEndOfStatement() {
+  while (Lexer.isNot(AsmToken::EndOfStatement) &&
+         Lexer.isNot(AsmToken::Eof))
+    Lex();
+
+  // Eat EOL.
+  if (Lexer.is(AsmToken::EndOfStatement))
+    Lex();
+}
+
+StringRef AsmParser::parseStringToEndOfStatement() {
+  const char *Start = getTok().getLoc().getPointer();
+
+  while (Lexer.isNot(AsmToken::EndOfStatement) &&
+         Lexer.isNot(AsmToken::Eof))
+    Lex();
+
+  const char *End = getTok().getLoc().getPointer();
+  return StringRef(Start, End - Start);
+}
+
+StringRef AsmParser::ParseStringToComma() {
+  const char *Start = getTok().getLoc().getPointer();
+
+  while (Lexer.isNot(AsmToken::EndOfStatement) &&
+         Lexer.isNot(AsmToken::Comma) &&
+         Lexer.isNot(AsmToken::Eof))
+    Lex();
+
+  const char *End = getTok().getLoc().getPointer();
+  return StringRef(Start, End - Start);
+}
+
+/// ParseParenExpr - Parse a paren expression and return it.
+/// NOTE: This assumes the leading '(' has already been consumed.
+///
+/// parenexpr ::= expr)
+///
+bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+  if (parseExpression(Res)) return true;
+  if (Lexer.isNot(AsmToken::RParen))
+    return TokError("expected ')' in parentheses expression");
+  EndLoc = Lexer.getTok().getEndLoc();
+  Lex();
+  return false;
+}
+
+/// ParseBracketExpr - Parse a bracket expression and return it.
+/// NOTE: This assumes the leading '[' has already been consumed.
+///
+/// bracketexpr ::= expr]
+///
+bool AsmParser::ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+  if (parseExpression(Res)) return true;
+  if (Lexer.isNot(AsmToken::RBrac))
+    return TokError("expected ']' in brackets expression");
+  EndLoc = Lexer.getTok().getEndLoc();
+  Lex();
+  return false;
+}
+
+/// ParsePrimaryExpr - Parse a primary expression and return it.
+///  primaryexpr ::= (parenexpr
+///  primaryexpr ::= symbol
+///  primaryexpr ::= number
+///  primaryexpr ::= '.'
+///  primaryexpr ::= ~,+,- primaryexpr
+bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+  SMLoc FirstTokenLoc = getLexer().getLoc();
+  AsmToken::TokenKind FirstTokenKind = Lexer.getKind();
+  switch (FirstTokenKind) {
+  default:
+    return TokError("unknown token in expression");
+  // If we have an error assume that we've already handled it.
+  case AsmToken::Error:
+    return true;
+  case AsmToken::Exclaim:
+    Lex(); // Eat the operator.
+    if (ParsePrimaryExpr(Res, EndLoc))
+      return true;
+    Res = MCUnaryExpr::CreateLNot(Res, getContext());
+    return false;
+  case AsmToken::Dollar:
+  case AsmToken::String:
+  case AsmToken::Identifier: {
+    StringRef Identifier;
+    if (parseIdentifier(Identifier)) {
+      if (FirstTokenKind == AsmToken::Dollar)
+        return Error(FirstTokenLoc, "invalid token in expression");
+      return true;
+    }
+
+    EndLoc = SMLoc::getFromPointer(Identifier.end());
+
+    // This is a symbol reference.
+    std::pair<StringRef, StringRef> Split = Identifier.split('@');
+    MCSymbol *Sym = getContext().GetOrCreateSymbol(Split.first);
+
+    // Lookup the symbol variant if used.
+    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+    if (Split.first.size() != Identifier.size()) {
+      Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
+      if (Variant == MCSymbolRefExpr::VK_Invalid) {
+        Variant = MCSymbolRefExpr::VK_None;
+        return TokError("invalid variant '" + Split.second + "'");
+      }
+    }
+
+    // If this is an absolute variable reference, substitute it now to preserve
+    // semantics in the face of reassignment.
+    if (Sym->isVariable() && isa<MCConstantExpr>(Sym->getVariableValue())) {
+      if (Variant)
+        return Error(EndLoc, "unexpected modifier on variable reference");
+
+      Res = Sym->getVariableValue();
+      return false;
+    }
+
+    // Otherwise create a symbol ref.
+    Res = MCSymbolRefExpr::Create(Sym, Variant, getContext());
+    return false;
+  }
+  case AsmToken::Integer: {
+    SMLoc Loc = getTok().getLoc();
+    int64_t IntVal = getTok().getIntVal();
+    Res = MCConstantExpr::Create(IntVal, getContext());
+    EndLoc = Lexer.getTok().getEndLoc();
+    Lex(); // Eat token.
+    // Look for 'b' or 'f' following an Integer as a directional label
+    if (Lexer.getKind() == AsmToken::Identifier) {
+      StringRef IDVal = getTok().getString();
+      if (IDVal == "f" || IDVal == "b"){
+        MCSymbol *Sym = Ctx.GetDirectionalLocalSymbol(IntVal,
+                                                      IDVal == "f" ? 1 : 0);
+        Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+                                      getContext());
+        if (IDVal == "b" && Sym->isUndefined())
+          return Error(Loc, "invalid reference to undefined symbol");
+        EndLoc = Lexer.getTok().getEndLoc();
+        Lex(); // Eat identifier.
+      }
+    }
+    return false;
+  }
+  case AsmToken::Real: {
+    APFloat RealVal(APFloat::IEEEdouble, getTok().getString());
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    Res = MCConstantExpr::Create(IntVal, getContext());
+    EndLoc = Lexer.getTok().getEndLoc();
+    Lex(); // Eat token.
+    return false;
+  }
+  case AsmToken::Dot: {
+    // This is a '.' reference, which references the current PC.  Emit a
+    // temporary label to the streamer and refer to it.
+    MCSymbol *Sym = Ctx.CreateTempSymbol();
+    Out.EmitLabel(Sym);
+    Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
+    EndLoc = Lexer.getTok().getEndLoc();
+    Lex(); // Eat identifier.
+    return false;
+  }
+  case AsmToken::LParen:
+    Lex(); // Eat the '('.
+    return ParseParenExpr(Res, EndLoc);
+  case AsmToken::LBrac:
+    if (!PlatformParser->HasBracketExpressions())
+      return TokError("brackets expression not supported on this target");
+    Lex(); // Eat the '['.
+    return ParseBracketExpr(Res, EndLoc);
+  case AsmToken::Minus:
+    Lex(); // Eat the operator.
+    if (ParsePrimaryExpr(Res, EndLoc))
+      return true;
+    Res = MCUnaryExpr::CreateMinus(Res, getContext());
+    return false;
+  case AsmToken::Plus:
+    Lex(); // Eat the operator.
+    if (ParsePrimaryExpr(Res, EndLoc))
+      return true;
+    Res = MCUnaryExpr::CreatePlus(Res, getContext());
+    return false;
+  case AsmToken::Tilde:
+    Lex(); // Eat the operator.
+    if (ParsePrimaryExpr(Res, EndLoc))
+      return true;
+    Res = MCUnaryExpr::CreateNot(Res, getContext());
+    return false;
+  }
+}
+
+bool AsmParser::parseExpression(const MCExpr *&Res) {
+  SMLoc EndLoc;
+  return parseExpression(Res, EndLoc);
+}
+
+const MCExpr *
+AsmParser::ApplyModifierToExpr(const MCExpr *E,
+                               MCSymbolRefExpr::VariantKind Variant) {
+  // Recurse over the given expression, rebuilding it to apply the given variant
+  // if there is exactly one symbol.
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return 0;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+
+    if (SRE->getKind() != MCSymbolRefExpr::VK_None) {
+      TokError("invalid variant on expression '" +
+               getTok().getIdentifier() + "' (already modified)");
+      return E;
+    }
+
+    return MCSymbolRefExpr::Create(&SRE->getSymbol(), Variant, getContext());
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+    const MCExpr *Sub = ApplyModifierToExpr(UE->getSubExpr(), Variant);
+    if (!Sub)
+      return 0;
+    return MCUnaryExpr::Create(UE->getOpcode(), Sub, getContext());
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    const MCExpr *LHS = ApplyModifierToExpr(BE->getLHS(), Variant);
+    const MCExpr *RHS = ApplyModifierToExpr(BE->getRHS(), Variant);
+
+    if (!LHS && !RHS)
+      return 0;
+
+    if (!LHS) LHS = BE->getLHS();
+    if (!RHS) RHS = BE->getRHS();
+
+    return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, getContext());
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// parseExpression - Parse an expression and return it.
+///
+///  expr ::= expr &&,|| expr               -> lowest.
+///  expr ::= expr |,^,&,! expr
+///  expr ::= expr ==,!=,<>,<,<=,>,>= expr
+///  expr ::= expr <<,>> expr
+///  expr ::= expr +,- expr
+///  expr ::= expr *,/,% expr               -> highest.
+///  expr ::= primaryexpr
+///
+bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
+  // Parse the expression.
+  Res = 0;
+  if (ParsePrimaryExpr(Res, EndLoc) || ParseBinOpRHS(1, Res, EndLoc))
+    return true;
+
+  // As a special case, we support 'a op b @ modifier' by rewriting the
+  // expression to include the modifier. This is inefficient, but in general we
+  // expect users to use 'a@modifier op b'.
+  if (Lexer.getKind() == AsmToken::At) {
+    Lex();
+
+    if (Lexer.isNot(AsmToken::Identifier))
+      return TokError("unexpected symbol modifier following '@'");
+
+    MCSymbolRefExpr::VariantKind Variant =
+      MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
+    if (Variant == MCSymbolRefExpr::VK_Invalid)
+      return TokError("invalid variant '" + getTok().getIdentifier() + "'");
+
+    const MCExpr *ModifiedRes = ApplyModifierToExpr(Res, Variant);
+    if (!ModifiedRes) {
+      return TokError("invalid modifier '" + getTok().getIdentifier() +
+                      "' (no symbols present)");
+    }
+
+    Res = ModifiedRes;
+    Lex();
+  }
+
+  // Try to constant fold it up front, if possible.
+  int64_t Value;
+  if (Res->EvaluateAsAbsolute(Value))
+    Res = MCConstantExpr::Create(Value, getContext());
+
+  return false;
+}
+
+bool AsmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) {
+  Res = 0;
+  return ParseParenExpr(Res, EndLoc) ||
+         ParseBinOpRHS(1, Res, EndLoc);
+}
+
+bool AsmParser::parseAbsoluteExpression(int64_t &Res) {
+  const MCExpr *Expr;
+
+  SMLoc StartLoc = Lexer.getLoc();
+  if (parseExpression(Expr))
+    return true;
+
+  if (!Expr->EvaluateAsAbsolute(Res))
+    return Error(StartLoc, "expected absolute expression");
+
+  return false;
+}
+
+static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
+                                   MCBinaryExpr::Opcode &Kind) {
+  switch (K) {
+  default:
+    return 0;    // not a binop.
+
+    // Lowest Precedence: &&, ||
+  case AsmToken::AmpAmp:
+    Kind = MCBinaryExpr::LAnd;
+    return 1;
+  case AsmToken::PipePipe:
+    Kind = MCBinaryExpr::LOr;
+    return 1;
+
+
+    // Low Precedence: |, &, ^
+    //
+    // FIXME: gas seems to support '!' as an infix operator?
+  case AsmToken::Pipe:
+    Kind = MCBinaryExpr::Or;
+    return 2;
+  case AsmToken::Caret:
+    Kind = MCBinaryExpr::Xor;
+    return 2;
+  case AsmToken::Amp:
+    Kind = MCBinaryExpr::And;
+    return 2;
+
+    // Low Intermediate Precedence: ==, !=, <>, <, <=, >, >=
+  case AsmToken::EqualEqual:
+    Kind = MCBinaryExpr::EQ;
+    return 3;
+  case AsmToken::ExclaimEqual:
+  case AsmToken::LessGreater:
+    Kind = MCBinaryExpr::NE;
+    return 3;
+  case AsmToken::Less:
+    Kind = MCBinaryExpr::LT;
+    return 3;
+  case AsmToken::LessEqual:
+    Kind = MCBinaryExpr::LTE;
+    return 3;
+  case AsmToken::Greater:
+    Kind = MCBinaryExpr::GT;
+    return 3;
+  case AsmToken::GreaterEqual:
+    Kind = MCBinaryExpr::GTE;
+    return 3;
+
+    // Intermediate Precedence: <<, >>
+  case AsmToken::LessLess:
+    Kind = MCBinaryExpr::Shl;
+    return 4;
+  case AsmToken::GreaterGreater:
+    Kind = MCBinaryExpr::Shr;
+    return 4;
+
+    // High Intermediate Precedence: +, -
+  case AsmToken::Plus:
+    Kind = MCBinaryExpr::Add;
+    return 5;
+  case AsmToken::Minus:
+    Kind = MCBinaryExpr::Sub;
+    return 5;
+
+    // Highest Precedence: *, /, %
+  case AsmToken::Star:
+    Kind = MCBinaryExpr::Mul;
+    return 6;
+  case AsmToken::Slash:
+    Kind = MCBinaryExpr::Div;
+    return 6;
+  case AsmToken::Percent:
+    Kind = MCBinaryExpr::Mod;
+    return 6;
+  }
+}
+
+
+/// ParseBinOpRHS - Parse all binary operators with precedence >= 'Precedence'.
+/// Res contains the LHS of the expression on input.
+bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
+                              SMLoc &EndLoc) {
+  while (1) {
+    MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
+    unsigned TokPrec = getBinOpPrecedence(Lexer.getKind(), Kind);
+
+    // If the next token is lower precedence than we are allowed to eat, return
+    // successfully with what we ate already.
+    if (TokPrec < Precedence)
+      return false;
+
+    Lex();
+
+    // Eat the next primary expression.
+    const MCExpr *RHS;
+    if (ParsePrimaryExpr(RHS, EndLoc)) return true;
+
+    // If BinOp binds less tightly with RHS than the operator after RHS, let
+    // the pending operator take RHS as its LHS.
+    MCBinaryExpr::Opcode Dummy;
+    unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy);
+    if (TokPrec < NextTokPrec) {
+      if (ParseBinOpRHS(Precedence+1, RHS, EndLoc)) return true;
+    }
+
+    // Merge LHS and RHS according to operator.
+    Res = MCBinaryExpr::Create(Kind, Res, RHS, getContext());
+  }
+}
+
+/// ParseStatement:
+///   ::= EndOfStatement
+///   ::= Label* Directive ...Operands... EndOfStatement
+///   ::= Label* Identifier OperandList* EndOfStatement
+bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
+  if (Lexer.is(AsmToken::EndOfStatement)) {
+    Out.AddBlankLine();
+    Lex();
+    return false;
+  }
+
+  // Statements always start with an identifier or are a full line comment.
+  AsmToken ID = getTok();
+  SMLoc IDLoc = ID.getLoc();
+  StringRef IDVal;
+  int64_t LocalLabelVal = -1;
+  // A full line comment is a '#' as the first token.
+  if (Lexer.is(AsmToken::Hash))
+    return ParseCppHashLineFilenameComment(IDLoc);
+
+  // Allow an integer followed by a ':' as a directional local label.
+  if (Lexer.is(AsmToken::Integer)) {
+    LocalLabelVal = getTok().getIntVal();
+    if (LocalLabelVal < 0) {
+      if (!TheCondState.Ignore)
+        return TokError("unexpected token at start of statement");
+      IDVal = "";
+    } else {
+      IDVal = getTok().getString();
+      Lex(); // Consume the integer token to be used as an identifier token.
+      if (Lexer.getKind() != AsmToken::Colon) {
+        if (!TheCondState.Ignore)
+          return TokError("unexpected token at start of statement");
+      }
+    }
+  } else if (Lexer.is(AsmToken::Dot)) {
+    // Treat '.' as a valid identifier in this context.
+    Lex();
+    IDVal = ".";
+  } else if (parseIdentifier(IDVal)) {
+    if (!TheCondState.Ignore)
+      return TokError("unexpected token at start of statement");
+    IDVal = "";
+  }
+
+  // Handle conditional assembly here before checking for skipping.  We
+  // have to do this so that .endif isn't skipped in a ".if 0" block for
+  // example.
+  StringMap<DirectiveKind>::const_iterator DirKindIt =
+    DirectiveKindMap.find(IDVal);
+  DirectiveKind DirKind =
+    (DirKindIt == DirectiveKindMap.end()) ? DK_NO_DIRECTIVE :
+                                            DirKindIt->getValue();
+  switch (DirKind) {
+    default:
+      break;
+    case DK_IF:
+      return ParseDirectiveIf(IDLoc);
+    case DK_IFB:
+      return ParseDirectiveIfb(IDLoc, true);
+    case DK_IFNB:
+      return ParseDirectiveIfb(IDLoc, false);
+    case DK_IFC:
+      return ParseDirectiveIfc(IDLoc, true);
+    case DK_IFNC:
+      return ParseDirectiveIfc(IDLoc, false);
+    case DK_IFDEF:
+      return ParseDirectiveIfdef(IDLoc, true);
+    case DK_IFNDEF:
+    case DK_IFNOTDEF:
+      return ParseDirectiveIfdef(IDLoc, false);
+    case DK_ELSEIF:
+      return ParseDirectiveElseIf(IDLoc);
+    case DK_ELSE:
+      return ParseDirectiveElse(IDLoc);
+    case DK_ENDIF:
+      return ParseDirectiveEndIf(IDLoc);
+  }
+
+  // Ignore the statement if in the middle of inactive conditional
+  // (e.g. ".if 0").
+  if (TheCondState.Ignore) {
+    eatToEndOfStatement();
+    return false;
+  }
+
+  // FIXME: Recurse on local labels?
+
+  // See what kind of statement we have.
+  switch (Lexer.getKind()) {
+  case AsmToken::Colon: {
+    checkForValidSection();
+
+    // identifier ':'   -> Label.
+    Lex();
+
+    // Diagnose attempt to use '.' as a label.
+    if (IDVal == ".")
+      return Error(IDLoc, "invalid use of pseudo-symbol '.' as a label");
+
+    // Diagnose attempt to use a variable as a label.
+    //
+    // FIXME: Diagnostics. Note the location of the definition as a label.
+    // FIXME: This doesn't diagnose assignment to a symbol which has been
+    // implicitly marked as external.
+    MCSymbol *Sym;
+    if (LocalLabelVal == -1)
+      Sym = getContext().GetOrCreateSymbol(IDVal);
+    else
+      Sym = Ctx.CreateDirectionalLocalSymbol(LocalLabelVal);
+    if (!Sym->isUndefined() || Sym->isVariable())
+      return Error(IDLoc, "invalid symbol redefinition");
+
+    // Emit the label.
+    if (!ParsingInlineAsm)
+      Out.EmitLabel(Sym);
+
+    // If we are generating dwarf for assembly source files then gather the
+    // info to make a dwarf label entry for this label if needed.
+    if (getContext().getGenDwarfForAssembly())
+      MCGenDwarfLabelEntry::Make(Sym, &getStreamer(), getSourceManager(),
+                                 IDLoc);
+
+    // Consume any end of statement token, if present, to avoid spurious
+    // AddBlankLine calls().
+    if (Lexer.is(AsmToken::EndOfStatement)) {
+      Lex();
+      if (Lexer.is(AsmToken::Eof))
+        return false;
+    }
+
+    return false;
+  }
+
+  case AsmToken::Equal:
+    // identifier '=' ... -> assignment statement
+    Lex();
+
+    return ParseAssignment(IDVal, true);
+
+  default: // Normal instruction or directive.
+    break;
+  }
+
+  // If macros are enabled, check to see if this is a macro instantiation.
+  if (MacrosEnabled())
+    if (const MCAsmMacro *M = LookupMacro(IDVal)) {
+      return HandleMacroEntry(M, IDLoc);
+    }
+
+  // Otherwise, we have a normal instruction or directive.
+  
+  // Directives start with "."
+  if (IDVal[0] == '.' && IDVal != ".") {
+    // There are several entities interested in parsing directives:
+    // 
+    // 1. The target-specific assembly parser. Some directives are target
+    //    specific or may potentially behave differently on certain targets.
+    // 2. Asm parser extensions. For example, platform-specific parsers
+    //    (like the ELF parser) register themselves as extensions.
+    // 3. The generic directive parser implemented by this class. These are
+    //    all the directives that behave in a target and platform independent
+    //    manner, or at least have a default behavior that's shared between
+    //    all targets and platforms.
+
+    // First query the target-specific parser. It will return 'true' if it
+    // isn't interested in this directive.
+    if (!getTargetParser().ParseDirective(ID))
+      return false;
+
+    // Next, check the extention directive map to see if any extension has
+    // registered itself to parse this directive.
+    std::pair<MCAsmParserExtension*, DirectiveHandler> Handler =
+      ExtensionDirectiveMap.lookup(IDVal);
+    if (Handler.first)
+      return (*Handler.second)(Handler.first, IDVal, IDLoc);
+
+    // Finally, if no one else is interested in this directive, it must be
+    // generic and familiar to this class.
+    switch (DirKind) {
+      default:
+        break;
+      case DK_SET:
+      case DK_EQU:
+        return ParseDirectiveSet(IDVal, true);
+      case DK_EQUIV:
+        return ParseDirectiveSet(IDVal, false);
+      case DK_ASCII:
+        return ParseDirectiveAscii(IDVal, false);
+      case DK_ASCIZ:
+      case DK_STRING:
+        return ParseDirectiveAscii(IDVal, true);
+      case DK_BYTE:
+        return ParseDirectiveValue(1);
+      case DK_SHORT:
+      case DK_VALUE:
+      case DK_2BYTE:
+        return ParseDirectiveValue(2);
+      case DK_LONG:
+      case DK_INT:
+      case DK_4BYTE:
+        return ParseDirectiveValue(4);
+      case DK_QUAD:
+      case DK_8BYTE:
+        return ParseDirectiveValue(8);
+      case DK_SINGLE:
+      case DK_FLOAT:
+        return ParseDirectiveRealValue(APFloat::IEEEsingle);
+      case DK_DOUBLE:
+        return ParseDirectiveRealValue(APFloat::IEEEdouble);
+      case DK_ALIGN: {
+        bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes();
+        return ParseDirectiveAlign(IsPow2, /*ExprSize=*/1);
+      }
+      case DK_ALIGN32: {
+        bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes();
+        return ParseDirectiveAlign(IsPow2, /*ExprSize=*/4);
+      }
+      case DK_BALIGN:
+        return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1);
+      case DK_BALIGNW:
+        return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2);
+      case DK_BALIGNL:
+        return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4);
+      case DK_P2ALIGN:
+        return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1);
+      case DK_P2ALIGNW:
+        return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2);
+      case DK_P2ALIGNL:
+        return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4);
+      case DK_ORG:
+        return ParseDirectiveOrg();
+      case DK_FILL:
+        return ParseDirectiveFill();
+      case DK_ZERO:
+        return ParseDirectiveZero();
+      case DK_EXTERN:
+        eatToEndOfStatement(); // .extern is the default, ignore it.
+        return false;
+      case DK_GLOBL:
+      case DK_GLOBAL:
+        return ParseDirectiveSymbolAttribute(MCSA_Global);
+      case DK_INDIRECT_SYMBOL:
+        return ParseDirectiveSymbolAttribute(MCSA_IndirectSymbol);
+      case DK_LAZY_REFERENCE:
+        return ParseDirectiveSymbolAttribute(MCSA_LazyReference);
+      case DK_NO_DEAD_STRIP:
+        return ParseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
+      case DK_SYMBOL_RESOLVER:
+        return ParseDirectiveSymbolAttribute(MCSA_SymbolResolver);
+      case DK_PRIVATE_EXTERN:
+        return ParseDirectiveSymbolAttribute(MCSA_PrivateExtern);
+      case DK_REFERENCE:
+        return ParseDirectiveSymbolAttribute(MCSA_Reference);
+      case DK_WEAK_DEFINITION:
+        return ParseDirectiveSymbolAttribute(MCSA_WeakDefinition);
+      case DK_WEAK_REFERENCE:
+        return ParseDirectiveSymbolAttribute(MCSA_WeakReference);
+      case DK_WEAK_DEF_CAN_BE_HIDDEN:
+        return ParseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
+      case DK_COMM:
+      case DK_COMMON:
+        return ParseDirectiveComm(/*IsLocal=*/false);
+      case DK_LCOMM:
+        return ParseDirectiveComm(/*IsLocal=*/true);
+      case DK_ABORT:
+        return ParseDirectiveAbort();
+      case DK_INCLUDE:
+        return ParseDirectiveInclude();
+      case DK_INCBIN:
+        return ParseDirectiveIncbin();
+      case DK_CODE16:
+      case DK_CODE16GCC:
+        return TokError(Twine(IDVal) + " not supported yet");
+      case DK_REPT:
+        return ParseDirectiveRept(IDLoc);
+      case DK_IRP:
+        return ParseDirectiveIrp(IDLoc);
+      case DK_IRPC:
+        return ParseDirectiveIrpc(IDLoc);
+      case DK_ENDR:
+        return ParseDirectiveEndr(IDLoc);
+      case DK_BUNDLE_ALIGN_MODE:
+        return ParseDirectiveBundleAlignMode();
+      case DK_BUNDLE_LOCK:
+        return ParseDirectiveBundleLock();
+      case DK_BUNDLE_UNLOCK:
+        return ParseDirectiveBundleUnlock();
+      case DK_SLEB128:
+        return ParseDirectiveLEB128(true);
+      case DK_ULEB128:
+        return ParseDirectiveLEB128(false);
+      case DK_SPACE:
+      case DK_SKIP:
+        return ParseDirectiveSpace(IDVal);
+      case DK_FILE:
+        return ParseDirectiveFile(IDLoc);
+      case DK_LINE:
+        return ParseDirectiveLine();
+      case DK_LOC:
+        return ParseDirectiveLoc();
+      case DK_STABS:
+        return ParseDirectiveStabs();
+      case DK_CFI_SECTIONS:
+        return ParseDirectiveCFISections();
+      case DK_CFI_STARTPROC:
+        return ParseDirectiveCFIStartProc();
+      case DK_CFI_ENDPROC:
+        return ParseDirectiveCFIEndProc();
+      case DK_CFI_DEF_CFA:
+        return ParseDirectiveCFIDefCfa(IDLoc);
+      case DK_CFI_DEF_CFA_OFFSET:
+        return ParseDirectiveCFIDefCfaOffset();
+      case DK_CFI_ADJUST_CFA_OFFSET:
+        return ParseDirectiveCFIAdjustCfaOffset();
+      case DK_CFI_DEF_CFA_REGISTER:
+        return ParseDirectiveCFIDefCfaRegister(IDLoc);
+      case DK_CFI_OFFSET:
+        return ParseDirectiveCFIOffset(IDLoc);
+      case DK_CFI_REL_OFFSET:
+        return ParseDirectiveCFIRelOffset(IDLoc);
+      case DK_CFI_PERSONALITY:
+        return ParseDirectiveCFIPersonalityOrLsda(true);
+      case DK_CFI_LSDA:
+        return ParseDirectiveCFIPersonalityOrLsda(false);
+      case DK_CFI_REMEMBER_STATE:
+        return ParseDirectiveCFIRememberState();
+      case DK_CFI_RESTORE_STATE:
+        return ParseDirectiveCFIRestoreState();
+      case DK_CFI_SAME_VALUE:
+        return ParseDirectiveCFISameValue(IDLoc);
+      case DK_CFI_RESTORE:
+        return ParseDirectiveCFIRestore(IDLoc);
+      case DK_CFI_ESCAPE:
+        return ParseDirectiveCFIEscape();
+      case DK_CFI_SIGNAL_FRAME:
+        return ParseDirectiveCFISignalFrame();
+      case DK_CFI_UNDEFINED:
+        return ParseDirectiveCFIUndefined(IDLoc);
+      case DK_CFI_REGISTER:
+        return ParseDirectiveCFIRegister(IDLoc);
+      case DK_MACROS_ON:
+      case DK_MACROS_OFF:
+        return ParseDirectiveMacrosOnOff(IDVal);
+      case DK_MACRO:
+        return ParseDirectiveMacro(IDLoc);
+      case DK_ENDM:
+      case DK_ENDMACRO:
+        return ParseDirectiveEndMacro(IDVal);
+      case DK_PURGEM:
+        return ParseDirectivePurgeMacro(IDLoc);
+    }
+
+    return Error(IDLoc, "unknown directive");
+  }
+
+  // __asm _emit or __asm __emit
+  if (ParsingInlineAsm && (IDVal == "_emit" || IDVal == "__emit" ||
+                           IDVal == "_EMIT" || IDVal == "__EMIT"))
+    return ParseDirectiveMSEmit(IDLoc, Info, IDVal.size());
+
+  // __asm align
+  if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
+    return ParseDirectiveMSAlign(IDLoc, Info);
+
+  checkForValidSection();
+
+  // Canonicalize the opcode to lower case.
+  std::string OpcodeStr = IDVal.lower();
+  ParseInstructionInfo IInfo(Info.AsmRewrites);
+  bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr,
+                                                     IDLoc, Info.ParsedOperands);
+  Info.ParseError = HadError;
+
+  // Dump the parsed representation, if requested.
+  if (getShowParsedOperands()) {
+    SmallString<256> Str;
+    raw_svector_ostream OS(Str);
+    OS << "parsed instruction: [";
+    for (unsigned i = 0; i != Info.ParsedOperands.size(); ++i) {
+      if (i != 0)
+        OS << ", ";
+      Info.ParsedOperands[i]->print(OS);
+    }
+    OS << "]";
+
+    PrintMessage(IDLoc, SourceMgr::DK_Note, OS.str());
+  }
+
+  // If we are generating dwarf for assembly source files and the current
+  // section is the initial text section then generate a .loc directive for
+  // the instruction.
+  if (!HadError && getContext().getGenDwarfForAssembly() &&
+      getContext().getGenDwarfSection() == getStreamer().getCurrentSection()) {
+
+    unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
+
+    // If we previously parsed a cpp hash file line comment then make sure the
+    // current Dwarf File is for the CppHashFilename if not then emit the
+    // Dwarf File table for it and adjust the line number for the .loc.
+    const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles = 
+      getContext().getMCDwarfFiles();
+    if (CppHashFilename.size() != 0) {
+      if (MCDwarfFiles[getContext().getGenDwarfFileNumber()]->getName() !=
+          CppHashFilename)
+        getStreamer().EmitDwarfFileDirective(
+          getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename);
+
+       unsigned CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc,CppHashBuf);
+       Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
+    }
+
+    getStreamer().EmitDwarfLocDirective(getContext().getGenDwarfFileNumber(),
+                                        Line, 0, DWARF2_LINE_DEFAULT_IS_STMT ?
+                                        DWARF2_FLAG_IS_STMT : 0, 0, 0,
+                                        StringRef());
+  }
+
+  // If parsing succeeded, match the instruction.
+  if (!HadError) {
+    unsigned ErrorInfo;
+    HadError = getTargetParser().MatchAndEmitInstruction(IDLoc, Info.Opcode,
+                                                         Info.ParsedOperands,
+                                                         Out, ErrorInfo,
+                                                         ParsingInlineAsm);
+  }
+
+  // Don't skip the rest of the line, the instruction parser is responsible for
+  // that.
+  return false;
+}
+
+/// EatToEndOfLine uses the Lexer to eat the characters to the end of the line
+/// since they may not be able to be tokenized to get to the end of line token.
+void AsmParser::EatToEndOfLine() {
+  if (!Lexer.is(AsmToken::EndOfStatement))
+    Lexer.LexUntilEndOfLine();
+ // Eat EOL.
+ Lex();
+}
+
+/// ParseCppHashLineFilenameComment as this:
+///   ::= # number "filename"
+/// or just as a full line comment if it doesn't have a number and a string.
+bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
+  Lex(); // Eat the hash token.
+
+  if (getLexer().isNot(AsmToken::Integer)) {
+    // Consume the line since in cases it is not a well-formed line directive,
+    // as if were simply a full line comment.
+    EatToEndOfLine();
+    return false;
+  }
+
+  int64_t LineNumber = getTok().getIntVal();
+  Lex();
+
+  if (getLexer().isNot(AsmToken::String)) {
+    EatToEndOfLine();
+    return false;
+  }
+
+  StringRef Filename = getTok().getString();
+  // Get rid of the enclosing quotes.
+  Filename = Filename.substr(1, Filename.size()-2);
+
+  // Save the SMLoc, Filename and LineNumber for later use by diagnostics.
+  CppHashLoc = L;
+  CppHashFilename = Filename;
+  CppHashLineNumber = LineNumber;
+  CppHashBuf = CurBuffer;
+
+  // Ignore any trailing characters, they're just comment.
+  EatToEndOfLine();
+  return false;
+}
+
+/// DiagHandler - will use the last parsed cpp hash line filename comment
+/// for the Filename and LineNo if any in the diagnostic.
+void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
+  const AsmParser *Parser = static_cast<const AsmParser*>(Context);
+  raw_ostream &OS = errs();
+
+  const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
+  const SMLoc &DiagLoc = Diag.getLoc();
+  int DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
+  int CppHashBuf = Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc);
+
+  // Like SourceMgr::PrintMessage() we need to print the include stack if any
+  // before printing the message.
+  int DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
+  if (!Parser->SavedDiagHandler && DiagCurBuffer > 0) {
+     SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
+     DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
+  }
+
+  // If we have not parsed a cpp hash line filename comment or the source
+  // manager changed or buffer changed (like in a nested include) then just
+  // print the normal diagnostic using its Filename and LineNo.
+  if (!Parser->CppHashLineNumber ||
+      &DiagSrcMgr != &Parser->SrcMgr ||
+      DiagBuf != CppHashBuf) {
+    if (Parser->SavedDiagHandler)
+      Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
+    else
+      Diag.print(0, OS);
+    return;
+  }
+
+  // Use the CppHashFilename and calculate a line number based on the
+  // CppHashLoc and CppHashLineNumber relative to this Diag's SMLoc for
+  // the diagnostic.
+  const std::string Filename = Parser->CppHashFilename;
+
+  int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
+  int CppHashLocLineNo =
+      Parser->SrcMgr.FindLineNumber(Parser->CppHashLoc, CppHashBuf);
+  int LineNo = Parser->CppHashLineNumber - 1 +
+               (DiagLocLineNo - CppHashLocLineNo);
+
+  SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(),
+                       Filename, LineNo, Diag.getColumnNo(),
+                       Diag.getKind(), Diag.getMessage(),
+                       Diag.getLineContents(), Diag.getRanges());
+
+  if (Parser->SavedDiagHandler)
+    Parser->SavedDiagHandler(NewDiag, Parser->SavedDiagContext);
+  else
+    NewDiag.print(0, OS);
+}
+
+// FIXME: This is mostly duplicated from the function in AsmLexer.cpp. The
+// difference being that that function accepts '@' as part of identifiers and
+// we can't do that. AsmLexer.cpp should probably be changed to handle
+// '@' as a special case when needed.
+static bool isIdentifierChar(char c) {
+  return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' ||
+         c == '.';
+}
+
+bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
+                            const MCAsmMacroParameters &Parameters,
+                            const MCAsmMacroArguments &A,
+                            const SMLoc &L) {
+  unsigned NParameters = Parameters.size();
+  if (NParameters != 0 && NParameters != A.size())
+    return Error(L, "Wrong number of arguments");
+
+  // A macro without parameters is handled differently on Darwin:
+  // gas accepts no arguments and does no substitutions
+  while (!Body.empty()) {
+    // Scan for the next substitution.
+    std::size_t End = Body.size(), Pos = 0;
+    for (; Pos != End; ++Pos) {
+      // Check for a substitution or escape.
+      if (!NParameters) {
+        // This macro has no parameters, look for $0, $1, etc.
+        if (Body[Pos] != '$' || Pos + 1 == End)
+          continue;
+
+        char Next = Body[Pos + 1];
+        if (Next == '$' || Next == 'n' ||
+            isdigit(static_cast<unsigned char>(Next)))
+          break;
+      } else {
+        // This macro has parameters, look for \foo, \bar, etc.
+        if (Body[Pos] == '\\' && Pos + 1 != End)
+          break;
+      }
+    }
+
+    // Add the prefix.
+    OS << Body.slice(0, Pos);
+
+    // Check if we reached the end.
+    if (Pos == End)
+      break;
+
+    if (!NParameters) {
+      switch (Body[Pos+1]) {
+        // $$ => $
+      case '$':
+        OS << '$';
+        break;
+
+        // $n => number of arguments
+      case 'n':
+        OS << A.size();
+        break;
+
+        // $[0-9] => argument
+      default: {
+        // Missing arguments are ignored.
+        unsigned Index = Body[Pos+1] - '0';
+        if (Index >= A.size())
+          break;
+
+        // Otherwise substitute with the token values, with spaces eliminated.
+        for (MCAsmMacroArgument::const_iterator it = A[Index].begin(),
+               ie = A[Index].end(); it != ie; ++it)
+          OS << it->getString();
+        break;
+      }
+      }
+      Pos += 2;
+    } else {
+      unsigned I = Pos + 1;
+      while (isIdentifierChar(Body[I]) && I + 1 != End)
+        ++I;
+
+      const char *Begin = Body.data() + Pos +1;
+      StringRef Argument(Begin, I - (Pos +1));
+      unsigned Index = 0;
+      for (; Index < NParameters; ++Index)
+        if (Parameters[Index].first == Argument)
+          break;
+
+      if (Index == NParameters) {
+          if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
+            Pos += 3;
+          else {
+            OS << '\\' << Argument;
+            Pos = I;
+          }
+      } else {
+        for (MCAsmMacroArgument::const_iterator it = A[Index].begin(),
+               ie = A[Index].end(); it != ie; ++it)
+          if (it->getKind() == AsmToken::String)
+            OS << it->getStringContents();
+          else
+            OS << it->getString();
+
+        Pos += 1 + Argument.size();
+      }
+    }
+    // Update the scan point.
+    Body = Body.substr(Pos);
+  }
+
+  return false;
+}
+
+MacroInstantiation::MacroInstantiation(const MCAsmMacro *M, SMLoc IL,
+                                       int EB, SMLoc EL,
+                                       MemoryBuffer *I)
+  : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitBuffer(EB),
+    ExitLoc(EL)
+{
+}
+
+static bool IsOperator(AsmToken::TokenKind kind)
+{
+  switch (kind)
+  {
+    default:
+      return false;
+    case AsmToken::Plus:
+    case AsmToken::Minus:
+    case AsmToken::Tilde:
+    case AsmToken::Slash:
+    case AsmToken::Star:
+    case AsmToken::Dot:
+    case AsmToken::Equal:
+    case AsmToken::EqualEqual:
+    case AsmToken::Pipe:
+    case AsmToken::PipePipe:
+    case AsmToken::Caret:
+    case AsmToken::Amp:
+    case AsmToken::AmpAmp:
+    case AsmToken::Exclaim:
+    case AsmToken::ExclaimEqual:
+    case AsmToken::Percent:
+    case AsmToken::Less:
+    case AsmToken::LessEqual:
+    case AsmToken::LessLess:
+    case AsmToken::LessGreater:
+    case AsmToken::Greater:
+    case AsmToken::GreaterEqual:
+    case AsmToken::GreaterGreater:
+      return true;
+  }
+}
+
+bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
+                                   AsmToken::TokenKind &ArgumentDelimiter) {
+  unsigned ParenLevel = 0;
+  unsigned AddTokens = 0;
+
+  // gas accepts arguments separated by whitespace, except on Darwin
+  if (!IsDarwin)
+    Lexer.setSkipSpace(false);
+
+  for (;;) {
+    if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal)) {
+      Lexer.setSkipSpace(true);
+      return TokError("unexpected token in macro instantiation");
+    }
+
+    if (ParenLevel == 0 && Lexer.is(AsmToken::Comma)) {
+      // Spaces and commas cannot be mixed to delimit parameters
+      if (ArgumentDelimiter == AsmToken::Eof)
+        ArgumentDelimiter = AsmToken::Comma;
+      else if (ArgumentDelimiter != AsmToken::Comma) {
+        Lexer.setSkipSpace(true);
+        return TokError("expected ' ' for macro argument separator");
+      }
+      break;
+    }
+
+    if (Lexer.is(AsmToken::Space)) {
+      Lex(); // Eat spaces
+
+      // Spaces can delimit parameters, but could also be part an expression.
+      // If the token after a space is an operator, add the token and the next
+      // one into this argument
+      if (ArgumentDelimiter == AsmToken::Space ||
+          ArgumentDelimiter == AsmToken::Eof) {
+        if (IsOperator(Lexer.getKind())) {
+          // Check to see whether the token is used as an operator,
+          // or part of an identifier
+          const char *NextChar = getTok().getEndLoc().getPointer();
+          if (*NextChar == ' ')
+            AddTokens = 2;
+        }
+
+        if (!AddTokens && ParenLevel == 0) {
+          if (ArgumentDelimiter == AsmToken::Eof &&
+              !IsOperator(Lexer.getKind()))
+            ArgumentDelimiter = AsmToken::Space;
+          break;
+        }
+      }
+    }
+
+    // HandleMacroEntry relies on not advancing the lexer here
+    // to be able to fill in the remaining default parameter values
+    if (Lexer.is(AsmToken::EndOfStatement))
+      break;
+
+    // Adjust the current parentheses level.
+    if (Lexer.is(AsmToken::LParen))
+      ++ParenLevel;
+    else if (Lexer.is(AsmToken::RParen) && ParenLevel)
+      --ParenLevel;
+
+    // Append the token to the current argument list.
+    MA.push_back(getTok());
+    if (AddTokens)
+      AddTokens--;
+    Lex();
+  }
+
+  Lexer.setSkipSpace(true);
+  if (ParenLevel != 0)
+    return TokError("unbalanced parentheses in macro argument");
+  return false;
+}
+
+// Parse the macro instantiation arguments.
+bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A) {
+  const unsigned NParameters = M ? M->Parameters.size() : 0;
+  // Argument delimiter is initially unknown. It will be set by
+  // ParseMacroArgument()
+  AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
+
+  // Parse two kinds of macro invocations:
+  // - macros defined without any parameters accept an arbitrary number of them
+  // - macros defined with parameters accept at most that many of them
+  for (unsigned Parameter = 0; !NParameters || Parameter < NParameters;
+       ++Parameter) {
+    MCAsmMacroArgument MA;
+
+    if (ParseMacroArgument(MA, ArgumentDelimiter))
+      return true;
+
+    if (!MA.empty() || !NParameters)
+      A.push_back(MA);
+    else if (NParameters) {
+      if (!M->Parameters[Parameter].second.empty())
+        A.push_back(M->Parameters[Parameter].second);
+    }
+
+    // At the end of the statement, fill in remaining arguments that have
+    // default values. If there aren't any, then the next argument is
+    // required but missing
+    if (Lexer.is(AsmToken::EndOfStatement)) {
+      if (NParameters && Parameter < NParameters - 1) {
+        if (M->Parameters[Parameter + 1].second.empty())
+          return TokError("macro argument '" +
+                          Twine(M->Parameters[Parameter + 1].first) +
+                          "' is missing");
+        else
+          continue;
+      }
+      return false;
+    }
+
+    if (Lexer.is(AsmToken::Comma))
+      Lex();
+  }
+  return TokError("Too many arguments");
+}
+
+const MCAsmMacro* AsmParser::LookupMacro(StringRef Name) {
+  StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name);
+  return (I == MacroMap.end()) ? NULL : I->getValue();
+}
+
+void AsmParser::DefineMacro(StringRef Name, const MCAsmMacro& Macro) {
+  MacroMap[Name] = new MCAsmMacro(Macro);
+}
+
+void AsmParser::UndefineMacro(StringRef Name) {
+  StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name);
+  if (I != MacroMap.end()) {
+    delete I->getValue();
+    MacroMap.erase(I);
+  }
+}
+
+bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
+  // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate
+  // this, although we should protect against infinite loops.
+  if (ActiveMacros.size() == 20)
+    return TokError("macros cannot be nested more than 20 levels deep");
+
+  MCAsmMacroArguments A;
+  if (ParseMacroArguments(M, A))
+    return true;
+
+  // Remove any trailing empty arguments. Do this after-the-fact as we have
+  // to keep empty arguments in the middle of the list or positionality
+  // gets off. e.g.,  "foo 1, , 2" vs. "foo 1, 2,"
+  while (!A.empty() && A.back().empty())
+    A.pop_back();
+
+  // Macro instantiation is lexical, unfortunately. We construct a new buffer
+  // to hold the macro body with substitutions.
+  SmallString<256> Buf;
+  StringRef Body = M->Body;
+  raw_svector_ostream OS(Buf);
+
+  if (expandMacro(OS, Body, M->Parameters, A, getTok().getLoc()))
+    return true;
+
+  // We include the .endmacro in the buffer as our cue to exit the macro
+  // instantiation.
+  OS << ".endmacro\n";
+
+  MemoryBuffer *Instantiation =
+    MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+
+  // Create the macro instantiation object and add to the current macro
+  // instantiation stack.
+  MacroInstantiation *MI = new MacroInstantiation(M, NameLoc,
+                                                  CurBuffer,
+                                                  getTok().getLoc(),
+                                                  Instantiation);
+  ActiveMacros.push_back(MI);
+
+  // Jump to the macro instantiation and prime the lexer.
+  CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc());
+  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
+  Lex();
+
+  return false;
+}
+
+void AsmParser::HandleMacroExit() {
+  // Jump to the EndOfStatement we should return to, and consume it.
+  JumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
+  Lex();
+
+  // Pop the instantiation entry.
+  delete ActiveMacros.back();
+  ActiveMacros.pop_back();
+}
+
+static bool IsUsedIn(const MCSymbol *Sym, const MCExpr *Value) {
+  switch (Value->getKind()) {
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr*>(Value);
+    return IsUsedIn(Sym, BE->getLHS()) || IsUsedIn(Sym, BE->getRHS());
+    break;
+  }
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return false;
+  case MCExpr::SymbolRef: {
+    const MCSymbol &S = static_cast<const MCSymbolRefExpr*>(Value)->getSymbol();
+    if (S.isVariable())
+      return IsUsedIn(Sym, S.getVariableValue());
+    return &S == Sym;
+  }
+  case MCExpr::Unary:
+    return IsUsedIn(Sym, static_cast<const MCUnaryExpr*>(Value)->getSubExpr());
+  }
+
+  llvm_unreachable("Unknown expr kind!");
+}
+
+bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
+                                bool NoDeadStrip) {
+  // FIXME: Use better location, we should use proper tokens.
+  SMLoc EqualLoc = Lexer.getLoc();
+
+  const MCExpr *Value;
+  if (parseExpression(Value))
+    return true;
+
+  // Note: we don't count b as used in "a = b". This is to allow
+  // a = b
+  // b = c
+
+  if (Lexer.isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in assignment");
+
+  // Error on assignment to '.'.
+  if (Name == ".") {
+    return Error(EqualLoc, ("assignment to pseudo-symbol '.' is unsupported "
+                            "(use '.space' or '.org').)"));
+  }
+
+  // Eat the end of statement marker.
+  Lex();
+
+  // Validate that the LHS is allowed to be a variable (either it has not been
+  // used as a symbol, or it is an absolute symbol).
+  MCSymbol *Sym = getContext().LookupSymbol(Name);
+  if (Sym) {
+    // Diagnose assignment to a label.
+    //
+    // FIXME: Diagnostics. Note the location of the definition as a label.
+    // FIXME: Diagnose assignment to protected identifier (e.g., register name).
+    if (IsUsedIn(Sym, Value))
+      return Error(EqualLoc, "Recursive use of '" + Name + "'");
+    else if (Sym->isUndefined() && !Sym->isUsed() && !Sym->isVariable())
+      ; // Allow redefinitions of undefined symbols only used in directives.
+    else if (Sym->isVariable() && !Sym->isUsed() && allow_redef)
+      ; // Allow redefinitions of variables that haven't yet been used.
+    else if (!Sym->isUndefined() && (!Sym->isVariable() || !allow_redef))
+      return Error(EqualLoc, "redefinition of '" + Name + "'");
+    else if (!Sym->isVariable())
+      return Error(EqualLoc, "invalid assignment to '" + Name + "'");
+    else if (!isa<MCConstantExpr>(Sym->getVariableValue()))
+      return Error(EqualLoc, "invalid reassignment of non-absolute variable '" +
+                   Name + "'");
+
+    // Don't count these checks as uses.
+    Sym->setUsed(false);
+  } else
+    Sym = getContext().GetOrCreateSymbol(Name);
+
+  // FIXME: Handle '.'.
+
+  // Do the assignment.
+  Out.EmitAssignment(Sym, Value);
+  if (NoDeadStrip)
+    Out.EmitSymbolAttribute(Sym, MCSA_NoDeadStrip);
+
+
+  return false;
+}
+
+/// parseIdentifier:
+///   ::= identifier
+///   ::= string
+bool AsmParser::parseIdentifier(StringRef &Res) {
+  // The assembler has relaxed rules for accepting identifiers, in particular we
+  // allow things like '.globl $foo', which would normally be separate
+  // tokens. At this level, we have already lexed so we cannot (currently)
+  // handle this as a context dependent token, instead we detect adjacent tokens
+  // and return the combined identifier.
+  if (Lexer.is(AsmToken::Dollar)) {
+    SMLoc DollarLoc = getLexer().getLoc();
+
+    // Consume the dollar sign, and check for a following identifier.
+    Lex();
+    if (Lexer.isNot(AsmToken::Identifier))
+      return true;
+
+    // We have a '$' followed by an identifier, make sure they are adjacent.
+    if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
+      return true;
+
+    // Construct the joined identifier and consume the token.
+    Res = StringRef(DollarLoc.getPointer(),
+                    getTok().getIdentifier().size() + 1);
+    Lex();
+    return false;
+  }
+
+  if (Lexer.isNot(AsmToken::Identifier) &&
+      Lexer.isNot(AsmToken::String))
+    return true;
+
+  Res = getTok().getIdentifier();
+
+  Lex(); // Consume the identifier token.
+
+  return false;
+}
+
+/// ParseDirectiveSet:
+///   ::= .equ identifier ',' expression
+///   ::= .equiv identifier ',' expression
+///   ::= .set identifier ',' expression
+bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) {
+  StringRef Name;
+
+  if (parseIdentifier(Name))
+    return TokError("expected identifier after '" + Twine(IDVal) + "'");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in '" + Twine(IDVal) + "'");
+  Lex();
+
+  return ParseAssignment(Name, allow_redef, true);
+}
+
+bool AsmParser::parseEscapedString(std::string &Data) {
+  assert(getLexer().is(AsmToken::String) && "Unexpected current token!");
+
+  Data = "";
+  StringRef Str = getTok().getStringContents();
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    if (Str[i] != '\\') {
+      Data += Str[i];
+      continue;
+    }
+
+    // Recognize escaped characters. Note that this escape semantics currently
+    // loosely follows Darwin 'as'. Notably, it doesn't support hex escapes.
+    ++i;
+    if (i == e)
+      return TokError("unexpected backslash at end of string");
+
+    // Recognize octal sequences.
+    if ((unsigned) (Str[i] - '0') <= 7) {
+      // Consume up to three octal characters.
+      unsigned Value = Str[i] - '0';
+
+      if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+        ++i;
+        Value = Value * 8 + (Str[i] - '0');
+
+        if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+          ++i;
+          Value = Value * 8 + (Str[i] - '0');
+        }
+      }
+
+      if (Value > 255)
+        return TokError("invalid octal escape sequence (out of range)");
+
+      Data += (unsigned char) Value;
+      continue;
+    }
+
+    // Otherwise recognize individual escapes.
+    switch (Str[i]) {
+    default:
+      // Just reject invalid escape sequences for now.
+      return TokError("invalid escape sequence (unrecognized character)");
+
+    case 'b': Data += '\b'; break;
+    case 'f': Data += '\f'; break;
+    case 'n': Data += '\n'; break;
+    case 'r': Data += '\r'; break;
+    case 't': Data += '\t'; break;
+    case '"': Data += '"'; break;
+    case '\\': Data += '\\'; break;
+    }
+  }
+
+  return false;
+}
+
+/// ParseDirectiveAscii:
+///   ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ]
+bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    checkForValidSection();
+
+    for (;;) {
+      if (getLexer().isNot(AsmToken::String))
+        return TokError("expected string in '" + Twine(IDVal) + "' directive");
+
+      std::string Data;
+      if (parseEscapedString(Data))
+        return true;
+
+      getStreamer().EmitBytes(Data, DEFAULT_ADDRSPACE);
+      if (ZeroTerminated)
+        getStreamer().EmitBytes(StringRef("\0", 1), DEFAULT_ADDRSPACE);
+
+      Lex();
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+      Lex();
+    }
+  }
+
+  Lex();
+  return false;
+}
+
+/// ParseDirectiveValue
+///  ::= (.byte | .short | ... ) [ expression (, expression)* ]
+bool AsmParser::ParseDirectiveValue(unsigned Size) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    checkForValidSection();
+
+    for (;;) {
+      const MCExpr *Value;
+      SMLoc ExprLoc = getLexer().getLoc();
+      if (parseExpression(Value))
+        return true;
+
+      // Special case constant expressions to match code generator.
+      if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+        assert(Size <= 8 && "Invalid size");
+        uint64_t IntValue = MCE->getValue();
+        if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+          return Error(ExprLoc, "literal value out of range for directive");
+        getStreamer().EmitIntValue(IntValue, Size, DEFAULT_ADDRSPACE);
+      } else
+        getStreamer().EmitValue(Value, Size, DEFAULT_ADDRSPACE);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in directive");
+      Lex();
+    }
+  }
+
+  Lex();
+  return false;
+}
+
+/// ParseDirectiveRealValue
+///  ::= (.single | .double) [ expression (, expression)* ]
+bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    checkForValidSection();
+
+    for (;;) {
+      // We don't truly support arithmetic on floating point expressions, so we
+      // have to manually parse unary prefixes.
+      bool IsNeg = false;
+      if (getLexer().is(AsmToken::Minus)) {
+        Lex();
+        IsNeg = true;
+      } else if (getLexer().is(AsmToken::Plus))
+        Lex();
+
+      if (getLexer().isNot(AsmToken::Integer) &&
+          getLexer().isNot(AsmToken::Real) &&
+          getLexer().isNot(AsmToken::Identifier))
+        return TokError("unexpected token in directive");
+
+      // Convert to an APFloat.
+      APFloat Value(Semantics);
+      StringRef IDVal = getTok().getString();
+      if (getLexer().is(AsmToken::Identifier)) {
+        if (!IDVal.compare_lower("infinity") || !IDVal.compare_lower("inf"))
+          Value = APFloat::getInf(Semantics);
+        else if (!IDVal.compare_lower("nan"))
+          Value = APFloat::getNaN(Semantics, false, ~0);
+        else
+          return TokError("invalid floating point literal");
+      } else if (Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven) ==
+          APFloat::opInvalidOp)
+        return TokError("invalid floating point literal");
+      if (IsNeg)
+        Value.changeSign();
+
+      // Consume the numeric token.
+      Lex();
+
+      // Emit the value as an integer.
+      APInt AsInt = Value.bitcastToAPInt();
+      getStreamer().EmitIntValue(AsInt.getLimitedValue(),
+                                 AsInt.getBitWidth() / 8, DEFAULT_ADDRSPACE);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in directive");
+      Lex();
+    }
+  }
+
+  Lex();
+  return false;
+}
+
+/// ParseDirectiveZero
+///  ::= .zero expression
+bool AsmParser::ParseDirectiveZero() {
+  checkForValidSection();
+
+  int64_t NumBytes;
+  if (parseAbsoluteExpression(NumBytes))
+    return true;
+
+  int64_t Val = 0;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+    if (parseAbsoluteExpression(Val))
+      return true;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.zero' directive");
+
+  Lex();
+
+  getStreamer().EmitFill(NumBytes, Val, DEFAULT_ADDRSPACE);
+
+  return false;
+}
+
+/// ParseDirectiveFill
+///  ::= .fill expression , expression , expression
+bool AsmParser::ParseDirectiveFill() {
+  checkForValidSection();
+
+  int64_t NumValues;
+  if (parseAbsoluteExpression(NumValues))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in '.fill' directive");
+  Lex();
+
+  int64_t FillSize;
+  if (parseAbsoluteExpression(FillSize))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in '.fill' directive");
+  Lex();
+
+  int64_t FillExpr;
+  if (parseAbsoluteExpression(FillExpr))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.fill' directive");
+
+  Lex();
+
+  if (FillSize != 1 && FillSize != 2 && FillSize != 4 && FillSize != 8)
+    return TokError("invalid '.fill' size, expected 1, 2, 4, or 8");
+
+  for (uint64_t i = 0, e = NumValues; i != e; ++i)
+    getStreamer().EmitIntValue(FillExpr, FillSize, DEFAULT_ADDRSPACE);
+
+  return false;
+}
+
+/// ParseDirectiveOrg
+///  ::= .org expression [ , expression ]
+bool AsmParser::ParseDirectiveOrg() {
+  checkForValidSection();
+
+  const MCExpr *Offset;
+  SMLoc Loc = getTok().getLoc();
+  if (parseExpression(Offset))
+    return true;
+
+  // Parse optional fill expression.
+  int64_t FillExpr = 0;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '.org' directive");
+    Lex();
+
+    if (parseAbsoluteExpression(FillExpr))
+      return true;
+
+    if (getLexer().isNot(AsmToken::EndOfStatement))
+      return TokError("unexpected token in '.org' directive");
+  }
+
+  Lex();
+
+  // Only limited forms of relocatable expressions are accepted here, it
+  // has to be relative to the current section. The streamer will return
+  // 'true' if the expression wasn't evaluatable.
+  if (getStreamer().EmitValueToOffset(Offset, FillExpr))
+    return Error(Loc, "expected assembly-time absolute expression");
+
+  return false;
+}
+
+/// ParseDirectiveAlign
+///  ::= {.align, ...} expression [ , expression [ , expression ]]
+bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
+  checkForValidSection();
+
+  SMLoc AlignmentLoc = getLexer().getLoc();
+  int64_t Alignment;
+  if (parseAbsoluteExpression(Alignment))
+    return true;
+
+  SMLoc MaxBytesLoc;
+  bool HasFillExpr = false;
+  int64_t FillExpr = 0;
+  int64_t MaxBytesToFill = 0;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in directive");
+    Lex();
+
+    // The fill expression can be omitted while specifying a maximum number of
+    // alignment bytes, e.g:
+    //  .align 3,,4
+    if (getLexer().isNot(AsmToken::Comma)) {
+      HasFillExpr = true;
+      if (parseAbsoluteExpression(FillExpr))
+        return true;
+    }
+
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in directive");
+      Lex();
+
+      MaxBytesLoc = getLexer().getLoc();
+      if (parseAbsoluteExpression(MaxBytesToFill))
+        return true;
+
+      if (getLexer().isNot(AsmToken::EndOfStatement))
+        return TokError("unexpected token in directive");
+    }
+  }
+
+  Lex();
+
+  if (!HasFillExpr)
+    FillExpr = 0;
+
+  // Compute alignment in bytes.
+  if (IsPow2) {
+    // FIXME: Diagnose overflow.
+    if (Alignment >= 32) {
+      Error(AlignmentLoc, "invalid alignment value");
+      Alignment = 31;
+    }
+
+    Alignment = 1ULL << Alignment;
+  } else {
+    // Reject alignments that aren't a power of two, for gas compatibility.
+    if (!isPowerOf2_64(Alignment))
+      Error(AlignmentLoc, "alignment must be a power of 2");
+  }
+
+  // Diagnose non-sensical max bytes to align.
+  if (MaxBytesLoc.isValid()) {
+    if (MaxBytesToFill < 1) {
+      Error(MaxBytesLoc, "alignment directive can never be satisfied in this "
+            "many bytes, ignoring maximum bytes expression");
+      MaxBytesToFill = 0;
+    }
+
+    if (MaxBytesToFill >= Alignment) {
+      Warning(MaxBytesLoc, "maximum bytes expression exceeds alignment and "
+              "has no effect");
+      MaxBytesToFill = 0;
+    }
+  }
+
+  // Check whether we should use optimal code alignment for this .align
+  // directive.
+  bool UseCodeAlign = getStreamer().getCurrentSection()->UseCodeAlign();
+  if ((!HasFillExpr || Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
+      ValueSize == 1 && UseCodeAlign) {
+    getStreamer().EmitCodeAlignment(Alignment, MaxBytesToFill);
+  } else {
+    // FIXME: Target specific behavior about how the "extra" bytes are filled.
+    getStreamer().EmitValueToAlignment(Alignment, FillExpr, ValueSize,
+                                       MaxBytesToFill);
+  }
+
+  return false;
+}
+
+/// ParseDirectiveFile
+/// ::= .file [number] filename
+/// ::= .file number directory filename
+bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
+  // FIXME: I'm not sure what this is.
+  int64_t FileNumber = -1;
+  SMLoc FileNumberLoc = getLexer().getLoc();
+  if (getLexer().is(AsmToken::Integer)) {
+    FileNumber = getTok().getIntVal();
+    Lex();
+
+    if (FileNumber < 1)
+      return TokError("file number less than one");
+  }
+
+  if (getLexer().isNot(AsmToken::String))
+    return TokError("unexpected token in '.file' directive");
+
+  // Usually the directory and filename together, otherwise just the directory.
+  StringRef Path = getTok().getString();
+  Path = Path.substr(1, Path.size()-2);
+  Lex();
+
+  StringRef Directory;
+  StringRef Filename;
+  if (getLexer().is(AsmToken::String)) {
+    if (FileNumber == -1)
+      return TokError("explicit path specified, but no file number");
+    Filename = getTok().getString();
+    Filename = Filename.substr(1, Filename.size()-2);
+    Directory = Path;
+    Lex();
+  } else {
+    Filename = Path;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.file' directive");
+
+  if (FileNumber == -1)
+    getStreamer().EmitFileDirective(Filename);
+  else {
+    if (getContext().getGenDwarfForAssembly() == true)
+      Error(DirectiveLoc, "input can't have .file dwarf directives when -g is "
+                        "used to generate dwarf debug info for assembly code");
+
+    if (getStreamer().EmitDwarfFileDirective(FileNumber, Directory, Filename))
+      Error(FileNumberLoc, "file number already allocated");
+  }
+
+  return false;
+}
+
+/// ParseDirectiveLine
+/// ::= .line [number]
+bool AsmParser::ParseDirectiveLine() {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    if (getLexer().isNot(AsmToken::Integer))
+      return TokError("unexpected token in '.line' directive");
+
+    int64_t LineNumber = getTok().getIntVal();
+    (void) LineNumber;
+    Lex();
+
+    // FIXME: Do something with the .line.
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.line' directive");
+
+  return false;
+}
+
+/// ParseDirectiveLoc
+/// ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end]
+///                                [epilogue_begin] [is_stmt VALUE] [isa VALUE]
+/// The first number is a file number, must have been previously assigned with
+/// a .file directive, the second number is the line number and optionally the
+/// third number is a column position (zero if not specified).  The remaining
+/// optional items are .loc sub-directives.
+bool AsmParser::ParseDirectiveLoc() {
+  if (getLexer().isNot(AsmToken::Integer))
+    return TokError("unexpected token in '.loc' directive");
+  int64_t FileNumber = getTok().getIntVal();
+  if (FileNumber < 1)
+    return TokError("file number less than one in '.loc' directive");
+  if (!getContext().isValidDwarfFileNumber(FileNumber))
+    return TokError("unassigned file number in '.loc' directive");
+  Lex();
+
+  int64_t LineNumber = 0;
+  if (getLexer().is(AsmToken::Integer)) {
+    LineNumber = getTok().getIntVal();
+    if (LineNumber < 1)
+      return TokError("line number less than one in '.loc' directive");
+    Lex();
+  }
+
+  int64_t ColumnPos = 0;
+  if (getLexer().is(AsmToken::Integer)) {
+    ColumnPos = getTok().getIntVal();
+    if (ColumnPos < 0)
+      return TokError("column position less than zero in '.loc' directive");
+    Lex();
+  }
+
+  unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
+  unsigned Isa = 0;
+  int64_t Discriminator = 0;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      StringRef Name;
+      SMLoc Loc = getTok().getLoc();
+      if (parseIdentifier(Name))
+        return TokError("unexpected token in '.loc' directive");
+
+      if (Name == "basic_block")
+        Flags |= DWARF2_FLAG_BASIC_BLOCK;
+      else if (Name == "prologue_end")
+        Flags |= DWARF2_FLAG_PROLOGUE_END;
+      else if (Name == "epilogue_begin")
+        Flags |= DWARF2_FLAG_EPILOGUE_BEGIN;
+      else if (Name == "is_stmt") {
+        Loc = getTok().getLoc();
+        const MCExpr *Value;
+        if (parseExpression(Value))
+          return true;
+        // The expression must be the constant 0 or 1.
+        if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+          int Value = MCE->getValue();
+          if (Value == 0)
+            Flags &= ~DWARF2_FLAG_IS_STMT;
+          else if (Value == 1)
+            Flags |= DWARF2_FLAG_IS_STMT;
+          else
+            return Error(Loc, "is_stmt value not 0 or 1");
+        }
+        else {
+          return Error(Loc, "is_stmt value not the constant value of 0 or 1");
+        }
+      }
+      else if (Name == "isa") {
+        Loc = getTok().getLoc();
+        const MCExpr *Value;
+        if (parseExpression(Value))
+          return true;
+        // The expression must be a constant greater or equal to 0.
+        if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+          int Value = MCE->getValue();
+          if (Value < 0)
+            return Error(Loc, "isa number less than zero");
+          Isa = Value;
+        }
+        else {
+          return Error(Loc, "isa number not a constant value");
+        }
+      }
+      else if (Name == "discriminator") {
+        if (parseAbsoluteExpression(Discriminator))
+          return true;
+      }
+      else {
+        return Error(Loc, "unknown sub-directive in '.loc' directive");
+      }
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+    }
+  }
+
+  getStreamer().EmitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags,
+                                      Isa, Discriminator, StringRef());
+
+  return false;
+}
+
+/// ParseDirectiveStabs
+/// ::= .stabs string, number, number, number
+bool AsmParser::ParseDirectiveStabs() {
+  return TokError("unsupported directive '.stabs'");
+}
+
+/// ParseDirectiveCFISections
+/// ::= .cfi_sections section [, section]
+bool AsmParser::ParseDirectiveCFISections() {
+  StringRef Name;
+  bool EH = false;
+  bool Debug = false;
+
+  if (parseIdentifier(Name))
+    return TokError("Expected an identifier");
+
+  if (Name == ".eh_frame")
+    EH = true;
+  else if (Name == ".debug_frame")
+    Debug = true;
+
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+
+    if (parseIdentifier(Name))
+      return TokError("Expected an identifier");
+
+    if (Name == ".eh_frame")
+      EH = true;
+    else if (Name == ".debug_frame")
+      Debug = true;
+  }
+
+  getStreamer().EmitCFISections(EH, Debug);
+  return false;
+}
+
+/// ParseDirectiveCFIStartProc
+/// ::= .cfi_startproc
+bool AsmParser::ParseDirectiveCFIStartProc() {
+  getStreamer().EmitCFIStartProc();
+  return false;
+}
+
+/// ParseDirectiveCFIEndProc
+/// ::= .cfi_endproc
+bool AsmParser::ParseDirectiveCFIEndProc() {
+  getStreamer().EmitCFIEndProc();
+  return false;
+}
+
+/// ParseRegisterOrRegisterNumber - parse register name or number.
+bool AsmParser::ParseRegisterOrRegisterNumber(int64_t &Register,
+                                              SMLoc DirectiveLoc) {
+  unsigned RegNo;
+
+  if (getLexer().isNot(AsmToken::Integer)) {
+    if (getTargetParser().ParseRegister(RegNo, DirectiveLoc, DirectiveLoc))
+      return true;
+    Register = getContext().getRegisterInfo().getDwarfRegNum(RegNo, true);
+  } else
+    return parseAbsoluteExpression(Register);
+
+  return false;
+}
+
+/// ParseDirectiveCFIDefCfa
+/// ::= .cfi_def_cfa register,  offset
+bool AsmParser::ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
+  int64_t Register = 0;
+  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  int64_t Offset = 0;
+  if (parseAbsoluteExpression(Offset))
+    return true;
+
+  getStreamer().EmitCFIDefCfa(Register, Offset);
+  return false;
+}
+
+/// ParseDirectiveCFIDefCfaOffset
+/// ::= .cfi_def_cfa_offset offset
+bool AsmParser::ParseDirectiveCFIDefCfaOffset() {
+  int64_t Offset = 0;
+  if (parseAbsoluteExpression(Offset))
+    return true;
+
+  getStreamer().EmitCFIDefCfaOffset(Offset);
+  return false;
+}
+
+/// ParseDirectiveCFIRegister
+/// ::= .cfi_register register, register
+bool AsmParser::ParseDirectiveCFIRegister(SMLoc DirectiveLoc) {
+  int64_t Register1 = 0;
+  if (ParseRegisterOrRegisterNumber(Register1, DirectiveLoc))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  int64_t Register2 = 0;
+  if (ParseRegisterOrRegisterNumber(Register2, DirectiveLoc))
+    return true;
+
+  getStreamer().EmitCFIRegister(Register1, Register2);
+  return false;
+}
+
+/// ParseDirectiveCFIAdjustCfaOffset
+/// ::= .cfi_adjust_cfa_offset adjustment
+bool AsmParser::ParseDirectiveCFIAdjustCfaOffset() {
+  int64_t Adjustment = 0;
+  if (parseAbsoluteExpression(Adjustment))
+    return true;
+
+  getStreamer().EmitCFIAdjustCfaOffset(Adjustment);
+  return false;
+}
+
+/// ParseDirectiveCFIDefCfaRegister
+/// ::= .cfi_def_cfa_register register
+bool AsmParser::ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
+  int64_t Register = 0;
+  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+    return true;
+
+  getStreamer().EmitCFIDefCfaRegister(Register);
+  return false;
+}
+
+/// ParseDirectiveCFIOffset
+/// ::= .cfi_offset register, offset
+bool AsmParser::ParseDirectiveCFIOffset(SMLoc DirectiveLoc) {
+  int64_t Register = 0;
+  int64_t Offset = 0;
+
+  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  if (parseAbsoluteExpression(Offset))
+    return true;
+
+  getStreamer().EmitCFIOffset(Register, Offset);
+  return false;
+}
+
+/// ParseDirectiveCFIRelOffset
+/// ::= .cfi_rel_offset register, offset
+bool AsmParser::ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
+  int64_t Register = 0;
+
+  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  int64_t Offset = 0;
+  if (parseAbsoluteExpression(Offset))
+    return true;
+
+  getStreamer().EmitCFIRelOffset(Register, Offset);
+  return false;
+}
+
+static bool isValidEncoding(int64_t Encoding) {
+  if (Encoding & ~0xff)
+    return false;
+
+  if (Encoding == dwarf::DW_EH_PE_omit)
+    return true;
+
+  const unsigned Format = Encoding & 0xf;
+  if (Format != dwarf::DW_EH_PE_absptr && Format != dwarf::DW_EH_PE_udata2 &&
+      Format != dwarf::DW_EH_PE_udata4 && Format != dwarf::DW_EH_PE_udata8 &&
+      Format != dwarf::DW_EH_PE_sdata2 && Format != dwarf::DW_EH_PE_sdata4 &&
+      Format != dwarf::DW_EH_PE_sdata8 && Format != dwarf::DW_EH_PE_signed)
+    return false;
+
+  const unsigned Application = Encoding & 0x70;
+  if (Application != dwarf::DW_EH_PE_absptr &&
+      Application != dwarf::DW_EH_PE_pcrel)
+    return false;
+
+  return true;
+}
+
+/// ParseDirectiveCFIPersonalityOrLsda
+/// IsPersonality true for cfi_personality, false for cfi_lsda
+/// ::= .cfi_personality encoding, [symbol_name]
+/// ::= .cfi_lsda encoding, [symbol_name]
+bool AsmParser::ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
+  int64_t Encoding = 0;
+  if (parseAbsoluteExpression(Encoding))
+    return true;
+  if (Encoding == dwarf::DW_EH_PE_omit)
+    return false;
+
+  if (!isValidEncoding(Encoding))
+    return TokError("unsupported encoding.");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  StringRef Name;
+  if (parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  if (IsPersonality)
+    getStreamer().EmitCFIPersonality(Sym, Encoding);
+  else
+    getStreamer().EmitCFILsda(Sym, Encoding);
+  return false;
+}
+
+/// ParseDirectiveCFIRememberState
+/// ::= .cfi_remember_state
+bool AsmParser::ParseDirectiveCFIRememberState() {
+  getStreamer().EmitCFIRememberState();
+  return false;
+}
+
+/// ParseDirectiveCFIRestoreState
+/// ::= .cfi_remember_state
+bool AsmParser::ParseDirectiveCFIRestoreState() {
+  getStreamer().EmitCFIRestoreState();
+  return false;
+}
+
+/// ParseDirectiveCFISameValue
+/// ::= .cfi_same_value register
+bool AsmParser::ParseDirectiveCFISameValue(SMLoc DirectiveLoc) {
+  int64_t Register = 0;
+
+  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+    return true;
+
+  getStreamer().EmitCFISameValue(Register);
+  return false;
+}
+
+/// ParseDirectiveCFIRestore
+/// ::= .cfi_restore register
+bool AsmParser::ParseDirectiveCFIRestore(SMLoc DirectiveLoc) {
+  int64_t Register = 0;
+  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+    return true;
+
+  getStreamer().EmitCFIRestore(Register);
+  return false;
+}
+
+/// ParseDirectiveCFIEscape
+/// ::= .cfi_escape expression[,...]
+bool AsmParser::ParseDirectiveCFIEscape() {
+  std::string Values;
+  int64_t CurrValue;
+  if (parseAbsoluteExpression(CurrValue))
+    return true;
+
+  Values.push_back((uint8_t)CurrValue);
+
+  while (getLexer().is(AsmToken::Comma)) {
+    Lex();
+
+    if (parseAbsoluteExpression(CurrValue))
+      return true;
+
+    Values.push_back((uint8_t)CurrValue);
+  }
+
+  getStreamer().EmitCFIEscape(Values);
+  return false;
+}
+
+/// ParseDirectiveCFISignalFrame
+/// ::= .cfi_signal_frame
+bool AsmParser::ParseDirectiveCFISignalFrame() {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(),
+                 "unexpected token in '.cfi_signal_frame'");
+
+  getStreamer().EmitCFISignalFrame();
+  return false;
+}
+
+/// ParseDirectiveCFIUndefined
+/// ::= .cfi_undefined register
+bool AsmParser::ParseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
+  int64_t Register = 0;
+
+  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+    return true;
+
+  getStreamer().EmitCFIUndefined(Register);
+  return false;
+}
+
+/// ParseDirectiveMacrosOnOff
+/// ::= .macros_on
+/// ::= .macros_off
+bool AsmParser::ParseDirectiveMacrosOnOff(StringRef Directive) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(),
+                 "unexpected token in '" + Directive + "' directive");
+
+  SetMacrosEnabled(Directive == ".macros_on");
+  return false;
+}
+
+/// ParseDirectiveMacro
+/// ::= .macro name [parameters]
+bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
+  StringRef Name;
+  if (parseIdentifier(Name))
+    return TokError("expected identifier in '.macro' directive");
+
+  MCAsmMacroParameters Parameters;
+  // Argument delimiter is initially unknown. It will be set by
+  // ParseMacroArgument()
+  AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      MCAsmMacroParameter Parameter;
+      if (parseIdentifier(Parameter.first))
+        return TokError("expected identifier in '.macro' directive");
+
+      if (getLexer().is(AsmToken::Equal)) {
+        Lex();
+        if (ParseMacroArgument(Parameter.second, ArgumentDelimiter))
+          return true;
+      }
+
+      Parameters.push_back(Parameter);
+
+      if (getLexer().is(AsmToken::Comma))
+        Lex();
+      else if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+    }
+  }
+
+  // Eat the end of statement.
+  Lex();
+
+  AsmToken EndToken, StartToken = getTok();
+
+  // Lex the macro definition.
+  for (;;) {
+    // Check whether we have reached the end of the file.
+    if (getLexer().is(AsmToken::Eof))
+      return Error(DirectiveLoc, "no matching '.endmacro' in definition");
+
+    // Otherwise, check whether we have reach the .endmacro.
+    if (getLexer().is(AsmToken::Identifier) &&
+        (getTok().getIdentifier() == ".endm" ||
+         getTok().getIdentifier() == ".endmacro")) {
+      EndToken = getTok();
+      Lex();
+      if (getLexer().isNot(AsmToken::EndOfStatement))
+        return TokError("unexpected token in '" + EndToken.getIdentifier() +
+                        "' directive");
+      break;
+    }
+
+    // Otherwise, scan til the end of the statement.
+    eatToEndOfStatement();
+  }
+
+  if (LookupMacro(Name)) {
+    return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
+  }
+
+  const char *BodyStart = StartToken.getLoc().getPointer();
+  const char *BodyEnd = EndToken.getLoc().getPointer();
+  StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
+  CheckForBadMacro(DirectiveLoc, Name, Body, Parameters);
+  DefineMacro(Name, MCAsmMacro(Name, Body, Parameters));
+  return false;
+}
+
+/// CheckForBadMacro
+///
+/// With the support added for named parameters there may be code out there that
+/// is transitioning from positional parameters.  In versions of gas that did
+/// not support named parameters they would be ignored on the macro defintion.
+/// But to support both styles of parameters this is not possible so if a macro
+/// defintion has named parameters but does not use them and has what appears
+/// to be positional parameters, strings like $1, $2, ... and $n, then issue a
+/// warning that the positional parameter found in body which have no effect.
+/// Hoping the developer will either remove the named parameters from the macro
+/// definiton so the positional parameters get used if that was what was
+/// intended or change the macro to use the named parameters.  It is possible
+/// this warning will trigger when the none of the named parameters are used
+/// and the strings like $1 are infact to simply to be passed trough unchanged.
+void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
+                                 StringRef Body,
+                                 MCAsmMacroParameters Parameters) {
+  // If this macro is not defined with named parameters the warning we are
+  // checking for here doesn't apply.
+  unsigned NParameters = Parameters.size();
+  if (NParameters == 0)
+    return;
+
+  bool NamedParametersFound = false;
+  bool PositionalParametersFound = false;
+
+  // Look at the body of the macro for use of both the named parameters and what
+  // are likely to be positional parameters.  This is what expandMacro() is
+  // doing when it finds the parameters in the body.
+  while (!Body.empty()) {
+    // Scan for the next possible parameter.
+    std::size_t End = Body.size(), Pos = 0;
+    for (; Pos != End; ++Pos) {
+      // Check for a substitution or escape.
+      // This macro is defined with parameters, look for \foo, \bar, etc.
+      if (Body[Pos] == '\\' && Pos + 1 != End)
+        break;
+
+      // This macro should have parameters, but look for $0, $1, ..., $n too.
+      if (Body[Pos] != '$' || Pos + 1 == End)
+        continue;
+      char Next = Body[Pos + 1];
+      if (Next == '$' || Next == 'n' ||
+          isdigit(static_cast<unsigned char>(Next)))
+        break;
+    }
+
+    // Check if we reached the end.
+    if (Pos == End)
+      break;
+
+    if (Body[Pos] == '$') {
+      switch (Body[Pos+1]) {
+        // $$ => $
+      case '$':
+        break;
+
+        // $n => number of arguments
+      case 'n':
+        PositionalParametersFound = true;
+        break;
+
+        // $[0-9] => argument
+      default: {
+        PositionalParametersFound = true;
+        break;
+        }
+      }
+      Pos += 2;
+    } else {
+      unsigned I = Pos + 1;
+      while (isIdentifierChar(Body[I]) && I + 1 != End)
+        ++I;
+
+      const char *Begin = Body.data() + Pos +1;
+      StringRef Argument(Begin, I - (Pos +1));
+      unsigned Index = 0;
+      for (; Index < NParameters; ++Index)
+        if (Parameters[Index].first == Argument)
+          break;
+
+      if (Index == NParameters) {
+          if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
+            Pos += 3;
+          else {
+            Pos = I;
+          }
+      } else {
+        NamedParametersFound = true;
+        Pos += 1 + Argument.size();
+      }
+    }
+    // Update the scan point.
+    Body = Body.substr(Pos);
+  }
+
+  if (!NamedParametersFound && PositionalParametersFound)
+    Warning(DirectiveLoc, "macro defined with named parameters which are not "
+                          "used in macro body, possible positional parameter "
+                          "found in body which will have no effect");
+}
+
+/// ParseDirectiveEndMacro
+/// ::= .endm
+/// ::= .endmacro
+bool AsmParser::ParseDirectiveEndMacro(StringRef Directive) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '" + Directive + "' directive");
+
+  // If we are inside a macro instantiation, terminate the current
+  // instantiation.
+  if (InsideMacroInstantiation()) {
+    HandleMacroExit();
+    return false;
+  }
+
+  // Otherwise, this .endmacro is a stray entry in the file; well formed
+  // .endmacro directives are handled during the macro definition parsing.
+  return TokError("unexpected '" + Directive + "' in file, "
+                  "no current macro definition");
+}
+
+/// ParseDirectivePurgeMacro
+/// ::= .purgem
+bool AsmParser::ParseDirectivePurgeMacro(SMLoc DirectiveLoc) {
+  StringRef Name;
+  if (parseIdentifier(Name))
+    return TokError("expected identifier in '.purgem' directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.purgem' directive");
+
+  if (!LookupMacro(Name))
+    return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
+
+  UndefineMacro(Name);
+  return false;
+}
+
+/// ParseDirectiveBundleAlignMode
+/// ::= {.bundle_align_mode} expression
+bool AsmParser::ParseDirectiveBundleAlignMode() {
+  checkForValidSection();
+
+  // Expect a single argument: an expression that evaluates to a constant
+  // in the inclusive range 0-30.
+  SMLoc ExprLoc = getLexer().getLoc();
+  int64_t AlignSizePow2;
+  if (parseAbsoluteExpression(AlignSizePow2))
+    return true;
+  else if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token after expression in"
+                    " '.bundle_align_mode' directive");
+  else if (AlignSizePow2 < 0 || AlignSizePow2 > 30)
+    return Error(ExprLoc,
+                 "invalid bundle alignment size (expected between 0 and 30)");
+
+  Lex();
+
+  // Because of AlignSizePow2's verified range we can safely truncate it to
+  // unsigned.
+  getStreamer().EmitBundleAlignMode(static_cast<unsigned>(AlignSizePow2));
+  return false;
+}
+
+/// ParseDirectiveBundleLock
+/// ::= {.bundle_lock} [align_to_end]
+bool AsmParser::ParseDirectiveBundleLock() {
+  checkForValidSection();
+  bool AlignToEnd = false;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    StringRef Option;
+    SMLoc Loc = getTok().getLoc();
+    const char *kInvalidOptionError =
+      "invalid option for '.bundle_lock' directive";
+
+    if (parseIdentifier(Option))
+      return Error(Loc, kInvalidOptionError);
+
+    if (Option != "align_to_end")
+      return Error(Loc, kInvalidOptionError);
+    else if (getLexer().isNot(AsmToken::EndOfStatement))
+      return Error(Loc,
+                   "unexpected token after '.bundle_lock' directive option");
+    AlignToEnd = true;
+  }
+
+  Lex();
+
+  getStreamer().EmitBundleLock(AlignToEnd);
+  return false;
+}
+
+/// ParseDirectiveBundleLock
+/// ::= {.bundle_lock}
+bool AsmParser::ParseDirectiveBundleUnlock() {
+  checkForValidSection();
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.bundle_unlock' directive");
+  Lex();
+
+  getStreamer().EmitBundleUnlock();
+  return false;
+}
+
+/// ParseDirectiveSpace
+/// ::= (.skip | .space) expression [ , expression ]
+bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
+  checkForValidSection();
+
+  int64_t NumBytes;
+  if (parseAbsoluteExpression(NumBytes))
+    return true;
+
+  int64_t FillExpr = 0;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+    Lex();
+
+    if (parseAbsoluteExpression(FillExpr))
+      return true;
+
+    if (getLexer().isNot(AsmToken::EndOfStatement))
+      return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+  }
+
+  Lex();
+
+  if (NumBytes <= 0)
+    return TokError("invalid number of bytes in '" +
+                    Twine(IDVal) + "' directive");
+
+  // FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0.
+  getStreamer().EmitFill(NumBytes, FillExpr, DEFAULT_ADDRSPACE);
+
+  return false;
+}
+
+/// ParseDirectiveLEB128
+/// ::= (.sleb128 | .uleb128) expression
+bool AsmParser::ParseDirectiveLEB128(bool Signed) {
+  checkForValidSection();
+  const MCExpr *Value;
+
+  if (parseExpression(Value))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  if (Signed)
+    getStreamer().EmitSLEB128Value(Value);
+  else
+    getStreamer().EmitULEB128Value(Value);
+
+  return false;
+}
+
+/// ParseDirectiveSymbolAttribute
+///  ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
+bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      StringRef Name;
+      SMLoc Loc = getTok().getLoc();
+
+      if (parseIdentifier(Name))
+        return Error(Loc, "expected identifier in directive");
+
+      MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+      // Assembler local symbols don't make any sense here. Complain loudly.
+      if (Sym->isTemporary())
+        return Error(Loc, "non-local symbol required in directive");
+
+      getStreamer().EmitSymbolAttribute(Sym, Attr);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in directive");
+      Lex();
+    }
+  }
+
+  Lex();
+  return false;
+}
+
+/// ParseDirectiveComm
+///  ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ]
+bool AsmParser::ParseDirectiveComm(bool IsLocal) {
+  checkForValidSection();
+
+  SMLoc IDLoc = getLexer().getLoc();
+  StringRef Name;
+  if (parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+
+  // Handle the identifier as the key symbol.
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  int64_t Size;
+  SMLoc SizeLoc = getLexer().getLoc();
+  if (parseAbsoluteExpression(Size))
+    return true;
+
+  int64_t Pow2Alignment = 0;
+  SMLoc Pow2AlignmentLoc;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+    Pow2AlignmentLoc = getLexer().getLoc();
+    if (parseAbsoluteExpression(Pow2Alignment))
+      return true;
+
+    LCOMM::LCOMMType LCOMM = Lexer.getMAI().getLCOMMDirectiveAlignmentType();
+    if (IsLocal && LCOMM == LCOMM::NoAlignment)
+      return Error(Pow2AlignmentLoc, "alignment not supported on this target");
+
+    // If this target takes alignments in bytes (not log) validate and convert.
+    if ((!IsLocal && Lexer.getMAI().getCOMMDirectiveAlignmentIsInBytes()) ||
+        (IsLocal && LCOMM == LCOMM::ByteAlignment)) {
+      if (!isPowerOf2_64(Pow2Alignment))
+        return Error(Pow2AlignmentLoc, "alignment must be a power of 2");
+      Pow2Alignment = Log2_64(Pow2Alignment);
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.comm' or '.lcomm' directive");
+
+  Lex();
+
+  // NOTE: a size of zero for a .comm should create a undefined symbol
+  // but a size of .lcomm creates a bss symbol of size zero.
+  if (Size < 0)
+    return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
+                 "be less than zero");
+
+  // NOTE: The alignment in the directive is a power of 2 value, the assembler
+  // may internally end up wanting an alignment in bytes.
+  // FIXME: Diagnose overflow.
+  if (Pow2Alignment < 0)
+    return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
+                 "alignment, can't be less than zero");
+
+  if (!Sym->isUndefined())
+    return Error(IDLoc, "invalid symbol redefinition");
+
+  // Create the Symbol as a common or local common with Size and Pow2Alignment
+  if (IsLocal) {
+    getStreamer().EmitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment);
+    return false;
+  }
+
+  getStreamer().EmitCommonSymbol(Sym, Size, 1 << Pow2Alignment);
+  return false;
+}
+
+/// ParseDirectiveAbort
+///  ::= .abort [... message ...]
+bool AsmParser::ParseDirectiveAbort() {
+  // FIXME: Use loc from directive.
+  SMLoc Loc = getLexer().getLoc();
+
+  StringRef Str = parseStringToEndOfStatement();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.abort' directive");
+
+  Lex();
+
+  if (Str.empty())
+    Error(Loc, ".abort detected. Assembly stopping.");
+  else
+    Error(Loc, ".abort '" + Str + "' detected. Assembly stopping.");
+  // FIXME: Actually abort assembly here.
+
+  return false;
+}
+
+/// ParseDirectiveInclude
+///  ::= .include "filename"
+bool AsmParser::ParseDirectiveInclude() {
+  if (getLexer().isNot(AsmToken::String))
+    return TokError("expected string in '.include' directive");
+
+  std::string Filename = getTok().getString();
+  SMLoc IncludeLoc = getLexer().getLoc();
+  Lex();
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.include' directive");
+
+  // Strip the quotes.
+  Filename = Filename.substr(1, Filename.size()-2);
+
+  // Attempt to switch the lexer to the included file before consuming the end
+  // of statement to avoid losing it when we switch.
+  if (EnterIncludeFile(Filename)) {
+    Error(IncludeLoc, "Could not find include file '" + Filename + "'");
+    return true;
+  }
+
+  return false;
+}
+
+/// ParseDirectiveIncbin
+///  ::= .incbin "filename"
+bool AsmParser::ParseDirectiveIncbin() {
+  if (getLexer().isNot(AsmToken::String))
+    return TokError("expected string in '.incbin' directive");
+
+  std::string Filename = getTok().getString();
+  SMLoc IncbinLoc = getLexer().getLoc();
+  Lex();
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.incbin' directive");
+
+  // Strip the quotes.
+  Filename = Filename.substr(1, Filename.size()-2);
+
+  // Attempt to process the included file.
+  if (ProcessIncbinFile(Filename)) {
+    Error(IncbinLoc, "Could not find incbin file '" + Filename + "'");
+    return true;
+  }
+
+  return false;
+}
+
+/// ParseDirectiveIf
+/// ::= .if expression
+bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) {
+  TheCondStack.push_back(TheCondState);
+  TheCondState.TheCond = AsmCond::IfCond;
+  if (TheCondState.Ignore) {
+    eatToEndOfStatement();
+  } else {
+    int64_t ExprValue;
+    if (parseAbsoluteExpression(ExprValue))
+      return true;
+
+    if (getLexer().isNot(AsmToken::EndOfStatement))
+      return TokError("unexpected token in '.if' directive");
+
+    Lex();
+
+    TheCondState.CondMet = ExprValue;
+    TheCondState.Ignore = !TheCondState.CondMet;
+  }
+
+  return false;
+}
+
+/// ParseDirectiveIfb
+/// ::= .ifb string
+bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
+  TheCondStack.push_back(TheCondState);
+  TheCondState.TheCond = AsmCond::IfCond;
+
+  if (TheCondState.Ignore) {
+    eatToEndOfStatement();
+  } else {
+    StringRef Str = parseStringToEndOfStatement();
+
+    if (getLexer().isNot(AsmToken::EndOfStatement))
+      return TokError("unexpected token in '.ifb' directive");
+
+    Lex();
+
+    TheCondState.CondMet = ExpectBlank == Str.empty();
+    TheCondState.Ignore = !TheCondState.CondMet;
+  }
+
+  return false;
+}
+
+/// ParseDirectiveIfc
+/// ::= .ifc string1, string2
+bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
+  TheCondStack.push_back(TheCondState);
+  TheCondState.TheCond = AsmCond::IfCond;
+
+  if (TheCondState.Ignore) {
+    eatToEndOfStatement();
+  } else {
+    StringRef Str1 = ParseStringToComma();
+
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '.ifc' directive");
+
+    Lex();
+
+    StringRef Str2 = parseStringToEndOfStatement();
+
+    if (getLexer().isNot(AsmToken::EndOfStatement))
+      return TokError("unexpected token in '.ifc' directive");
+
+    Lex();
+
+    TheCondState.CondMet = ExpectEqual == (Str1 == Str2);
+    TheCondState.Ignore = !TheCondState.CondMet;
+  }
+
+  return false;
+}
+
+/// ParseDirectiveIfdef
+/// ::= .ifdef symbol
+bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
+  StringRef Name;
+  TheCondStack.push_back(TheCondState);
+  TheCondState.TheCond = AsmCond::IfCond;
+
+  if (TheCondState.Ignore) {
+    eatToEndOfStatement();
+  } else {
+    if (parseIdentifier(Name))
+      return TokError("expected identifier after '.ifdef'");
+
+    Lex();
+
+    MCSymbol *Sym = getContext().LookupSymbol(Name);
+
+    if (expect_defined)
+      TheCondState.CondMet = (Sym != NULL && !Sym->isUndefined());
+    else
+      TheCondState.CondMet = (Sym == NULL || Sym->isUndefined());
+    TheCondState.Ignore = !TheCondState.CondMet;
+  }
+
+  return false;
+}
+
+/// ParseDirectiveElseIf
+/// ::= .elseif expression
+bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) {
+  if (TheCondState.TheCond != AsmCond::IfCond &&
+      TheCondState.TheCond != AsmCond::ElseIfCond)
+      Error(DirectiveLoc, "Encountered a .elseif that doesn't follow a .if or "
+                          " an .elseif");
+  TheCondState.TheCond = AsmCond::ElseIfCond;
+
+  bool LastIgnoreState = false;
+  if (!TheCondStack.empty())
+      LastIgnoreState = TheCondStack.back().Ignore;
+  if (LastIgnoreState || TheCondState.CondMet) {
+    TheCondState.Ignore = true;
+    eatToEndOfStatement();
+  }
+  else {
+    int64_t ExprValue;
+    if (parseAbsoluteExpression(ExprValue))
+      return true;
+
+    if (getLexer().isNot(AsmToken::EndOfStatement))
+      return TokError("unexpected token in '.elseif' directive");
+
+    Lex();
+    TheCondState.CondMet = ExprValue;
+    TheCondState.Ignore = !TheCondState.CondMet;
+  }
+
+  return false;
+}
+
+/// ParseDirectiveElse
+/// ::= .else
+bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.else' directive");
+
+  Lex();
+
+  if (TheCondState.TheCond != AsmCond::IfCond &&
+      TheCondState.TheCond != AsmCond::ElseIfCond)
+      Error(DirectiveLoc, "Encountered a .else that doesn't follow a .if or an "
+                          ".elseif");
+  TheCondState.TheCond = AsmCond::ElseCond;
+  bool LastIgnoreState = false;
+  if (!TheCondStack.empty())
+    LastIgnoreState = TheCondStack.back().Ignore;
+  if (LastIgnoreState || TheCondState.CondMet)
+    TheCondState.Ignore = true;
+  else
+    TheCondState.Ignore = false;
+
+  return false;
+}
+
+/// ParseDirectiveEndIf
+/// ::= .endif
+bool AsmParser::ParseDirectiveEndIf(SMLoc DirectiveLoc) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.endif' directive");
+
+  Lex();
+
+  if ((TheCondState.TheCond == AsmCond::NoCond) ||
+      TheCondStack.empty())
+    Error(DirectiveLoc, "Encountered a .endif that doesn't follow a .if or "
+                        ".else");
+  if (!TheCondStack.empty()) {
+    TheCondState = TheCondStack.back();
+    TheCondStack.pop_back();
+  }
+
+  return false;
+}
+
+void AsmParser::initializeDirectiveKindMap() {
+  DirectiveKindMap[".set"] = DK_SET;
+  DirectiveKindMap[".equ"] = DK_EQU;
+  DirectiveKindMap[".equiv"] = DK_EQUIV;
+  DirectiveKindMap[".ascii"] = DK_ASCII;
+  DirectiveKindMap[".asciz"] = DK_ASCIZ;
+  DirectiveKindMap[".string"] = DK_STRING;
+  DirectiveKindMap[".byte"] = DK_BYTE;
+  DirectiveKindMap[".short"] = DK_SHORT;
+  DirectiveKindMap[".value"] = DK_VALUE;
+  DirectiveKindMap[".2byte"] = DK_2BYTE;
+  DirectiveKindMap[".long"] = DK_LONG;
+  DirectiveKindMap[".int"] = DK_INT;
+  DirectiveKindMap[".4byte"] = DK_4BYTE;
+  DirectiveKindMap[".quad"] = DK_QUAD;
+  DirectiveKindMap[".8byte"] = DK_8BYTE;
+  DirectiveKindMap[".single"] = DK_SINGLE;
+  DirectiveKindMap[".float"] = DK_FLOAT;
+  DirectiveKindMap[".double"] = DK_DOUBLE;
+  DirectiveKindMap[".align"] = DK_ALIGN;
+  DirectiveKindMap[".align32"] = DK_ALIGN32;
+  DirectiveKindMap[".balign"] = DK_BALIGN;
+  DirectiveKindMap[".balignw"] = DK_BALIGNW;
+  DirectiveKindMap[".balignl"] = DK_BALIGNL;
+  DirectiveKindMap[".p2align"] = DK_P2ALIGN;
+  DirectiveKindMap[".p2alignw"] = DK_P2ALIGNW;
+  DirectiveKindMap[".p2alignl"] = DK_P2ALIGNL;
+  DirectiveKindMap[".org"] = DK_ORG;
+  DirectiveKindMap[".fill"] = DK_FILL;
+  DirectiveKindMap[".zero"] = DK_ZERO;
+  DirectiveKindMap[".extern"] = DK_EXTERN;
+  DirectiveKindMap[".globl"] = DK_GLOBL;
+  DirectiveKindMap[".global"] = DK_GLOBAL;
+  DirectiveKindMap[".indirect_symbol"] = DK_INDIRECT_SYMBOL;
+  DirectiveKindMap[".lazy_reference"] = DK_LAZY_REFERENCE;
+  DirectiveKindMap[".no_dead_strip"] = DK_NO_DEAD_STRIP;
+  DirectiveKindMap[".symbol_resolver"] = DK_SYMBOL_RESOLVER;
+  DirectiveKindMap[".private_extern"] = DK_PRIVATE_EXTERN;
+  DirectiveKindMap[".reference"] = DK_REFERENCE;
+  DirectiveKindMap[".weak_definition"] = DK_WEAK_DEFINITION;
+  DirectiveKindMap[".weak_reference"] = DK_WEAK_REFERENCE;
+  DirectiveKindMap[".weak_def_can_be_hidden"] = DK_WEAK_DEF_CAN_BE_HIDDEN;
+  DirectiveKindMap[".comm"] = DK_COMM;
+  DirectiveKindMap[".common"] = DK_COMMON;
+  DirectiveKindMap[".lcomm"] = DK_LCOMM;
+  DirectiveKindMap[".abort"] = DK_ABORT;
+  DirectiveKindMap[".include"] = DK_INCLUDE;
+  DirectiveKindMap[".incbin"] = DK_INCBIN;
+  DirectiveKindMap[".code16"] = DK_CODE16;
+  DirectiveKindMap[".code16gcc"] = DK_CODE16GCC;
+  DirectiveKindMap[".rept"] = DK_REPT;
+  DirectiveKindMap[".irp"] = DK_IRP;
+  DirectiveKindMap[".irpc"] = DK_IRPC;
+  DirectiveKindMap[".endr"] = DK_ENDR;
+  DirectiveKindMap[".bundle_align_mode"] = DK_BUNDLE_ALIGN_MODE;
+  DirectiveKindMap[".bundle_lock"] = DK_BUNDLE_LOCK;
+  DirectiveKindMap[".bundle_unlock"] = DK_BUNDLE_UNLOCK;
+  DirectiveKindMap[".if"] = DK_IF;
+  DirectiveKindMap[".ifb"] = DK_IFB;
+  DirectiveKindMap[".ifnb"] = DK_IFNB;
+  DirectiveKindMap[".ifc"] = DK_IFC;
+  DirectiveKindMap[".ifnc"] = DK_IFNC;
+  DirectiveKindMap[".ifdef"] = DK_IFDEF;
+  DirectiveKindMap[".ifndef"] = DK_IFNDEF;
+  DirectiveKindMap[".ifnotdef"] = DK_IFNOTDEF;
+  DirectiveKindMap[".elseif"] = DK_ELSEIF;
+  DirectiveKindMap[".else"] = DK_ELSE;
+  DirectiveKindMap[".endif"] = DK_ENDIF;
+  DirectiveKindMap[".skip"] = DK_SKIP;
+  DirectiveKindMap[".space"] = DK_SPACE;
+  DirectiveKindMap[".file"] = DK_FILE;
+  DirectiveKindMap[".line"] = DK_LINE;
+  DirectiveKindMap[".loc"] = DK_LOC;
+  DirectiveKindMap[".stabs"] = DK_STABS;
+  DirectiveKindMap[".sleb128"] = DK_SLEB128;
+  DirectiveKindMap[".uleb128"] = DK_ULEB128;
+  DirectiveKindMap[".cfi_sections"] = DK_CFI_SECTIONS;
+  DirectiveKindMap[".cfi_startproc"] = DK_CFI_STARTPROC;
+  DirectiveKindMap[".cfi_endproc"] = DK_CFI_ENDPROC;
+  DirectiveKindMap[".cfi_def_cfa"] = DK_CFI_DEF_CFA;
+  DirectiveKindMap[".cfi_def_cfa_offset"] = DK_CFI_DEF_CFA_OFFSET;
+  DirectiveKindMap[".cfi_adjust_cfa_offset"] = DK_CFI_ADJUST_CFA_OFFSET;
+  DirectiveKindMap[".cfi_def_cfa_register"] = DK_CFI_DEF_CFA_REGISTER;
+  DirectiveKindMap[".cfi_offset"] = DK_CFI_OFFSET;
+  DirectiveKindMap[".cfi_rel_offset"] = DK_CFI_REL_OFFSET;
+  DirectiveKindMap[".cfi_personality"] = DK_CFI_PERSONALITY;
+  DirectiveKindMap[".cfi_lsda"] = DK_CFI_LSDA;
+  DirectiveKindMap[".cfi_remember_state"] = DK_CFI_REMEMBER_STATE;
+  DirectiveKindMap[".cfi_restore_state"] = DK_CFI_RESTORE_STATE;
+  DirectiveKindMap[".cfi_same_value"] = DK_CFI_SAME_VALUE;
+  DirectiveKindMap[".cfi_restore"] = DK_CFI_RESTORE;
+  DirectiveKindMap[".cfi_escape"] = DK_CFI_ESCAPE;
+  DirectiveKindMap[".cfi_signal_frame"] = DK_CFI_SIGNAL_FRAME;
+  DirectiveKindMap[".cfi_undefined"] = DK_CFI_UNDEFINED;
+  DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER;
+  DirectiveKindMap[".macros_on"] = DK_MACROS_ON;
+  DirectiveKindMap[".macros_off"] = DK_MACROS_OFF;
+  DirectiveKindMap[".macro"] = DK_MACRO;
+  DirectiveKindMap[".endm"] = DK_ENDM;
+  DirectiveKindMap[".endmacro"] = DK_ENDMACRO;
+  DirectiveKindMap[".purgem"] = DK_PURGEM;
+}
+
+
+MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
+  AsmToken EndToken, StartToken = getTok();
+
+  unsigned NestLevel = 0;
+  for (;;) {
+    // Check whether we have reached the end of the file.
+    if (getLexer().is(AsmToken::Eof)) {
+      Error(DirectiveLoc, "no matching '.endr' in definition");
+      return 0;
+    }
+
+    if (Lexer.is(AsmToken::Identifier) &&
+        (getTok().getIdentifier() == ".rept")) {
+      ++NestLevel;
+    }
+
+    // Otherwise, check whether we have reached the .endr.
+    if (Lexer.is(AsmToken::Identifier) &&
+        getTok().getIdentifier() == ".endr") {
+      if (NestLevel == 0) {
+        EndToken = getTok();
+        Lex();
+        if (Lexer.isNot(AsmToken::EndOfStatement)) {
+          TokError("unexpected token in '.endr' directive");
+          return 0;
+        }
+        break;
+      }
+      --NestLevel;
+    }
+
+    // Otherwise, scan till the end of the statement.
+    eatToEndOfStatement();
+  }
+
+  const char *BodyStart = StartToken.getLoc().getPointer();
+  const char *BodyEnd = EndToken.getLoc().getPointer();
+  StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
+
+  // We Are Anonymous.
+  StringRef Name;
+  MCAsmMacroParameters Parameters;
+  return new MCAsmMacro(Name, Body, Parameters);
+}
+
+void AsmParser::InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+                                         raw_svector_ostream &OS) {
+  OS << ".endr\n";
+
+  MemoryBuffer *Instantiation =
+    MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+
+  // Create the macro instantiation object and add to the current macro
+  // instantiation stack.
+  MacroInstantiation *MI = new MacroInstantiation(M, DirectiveLoc,
+                                                  CurBuffer,
+                                                  getTok().getLoc(),
+                                                  Instantiation);
+  ActiveMacros.push_back(MI);
+
+  // Jump to the macro instantiation and prime the lexer.
+  CurBuffer = SrcMgr.AddNewSourceBuffer(MI->Instantiation, SMLoc());
+  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
+  Lex();
+}
+
+bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
+  int64_t Count;
+  if (parseAbsoluteExpression(Count))
+    return TokError("unexpected token in '.rept' directive");
+
+  if (Count < 0)
+    return TokError("Count is negative");
+
+  if (Lexer.isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.rept' directive");
+
+  // Eat the end of statement.
+  Lex();
+
+  // Lex the rept definition.
+  MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+  if (!M)
+    return true;
+
+  // Macro instantiation is lexical, unfortunately. We construct a new buffer
+  // to hold the macro body with substitutions.
+  SmallString<256> Buf;
+  MCAsmMacroParameters Parameters;
+  MCAsmMacroArguments A;
+  raw_svector_ostream OS(Buf);
+  while (Count--) {
+    if (expandMacro(OS, M->Body, Parameters, A, getTok().getLoc()))
+      return true;
+  }
+  InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+
+  return false;
+}
+
+/// ParseDirectiveIrp
+/// ::= .irp symbol,values
+bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
+  MCAsmMacroParameters Parameters;
+  MCAsmMacroParameter Parameter;
+
+  if (parseIdentifier(Parameter.first))
+    return TokError("expected identifier in '.irp' directive");
+
+  Parameters.push_back(Parameter);
+
+  if (Lexer.isNot(AsmToken::Comma))
+    return TokError("expected comma in '.irp' directive");
+
+  Lex();
+
+  MCAsmMacroArguments A;
+  if (ParseMacroArguments(0, A))
+    return true;
+
+  // Eat the end of statement.
+  Lex();
+
+  // Lex the irp definition.
+  MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+  if (!M)
+    return true;
+
+  // Macro instantiation is lexical, unfortunately. We construct a new buffer
+  // to hold the macro body with substitutions.
+  SmallString<256> Buf;
+  raw_svector_ostream OS(Buf);
+
+  for (MCAsmMacroArguments::iterator i = A.begin(), e = A.end(); i != e; ++i) {
+    MCAsmMacroArguments Args;
+    Args.push_back(*i);
+
+    if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc()))
+      return true;
+  }
+
+  InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+
+  return false;
+}
+
+/// ParseDirectiveIrpc
+/// ::= .irpc symbol,values
+bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
+  MCAsmMacroParameters Parameters;
+  MCAsmMacroParameter Parameter;
+
+  if (parseIdentifier(Parameter.first))
+    return TokError("expected identifier in '.irpc' directive");
+
+  Parameters.push_back(Parameter);
+
+  if (Lexer.isNot(AsmToken::Comma))
+    return TokError("expected comma in '.irpc' directive");
+
+  Lex();
+
+  MCAsmMacroArguments A;
+  if (ParseMacroArguments(0, A))
+    return true;
+
+  if (A.size() != 1 || A.front().size() != 1)
+    return TokError("unexpected token in '.irpc' directive");
+
+  // Eat the end of statement.
+  Lex();
+
+  // Lex the irpc definition.
+  MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+  if (!M)
+    return true;
+
+  // Macro instantiation is lexical, unfortunately. We construct a new buffer
+  // to hold the macro body with substitutions.
+  SmallString<256> Buf;
+  raw_svector_ostream OS(Buf);
+
+  StringRef Values = A.front().front().getString();
+  std::size_t I, End = Values.size();
+  for (I = 0; I < End; ++I) {
+    MCAsmMacroArgument Arg;
+    Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I+1)));
+
+    MCAsmMacroArguments Args;
+    Args.push_back(Arg);
+
+    if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc()))
+      return true;
+  }
+
+  InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+
+  return false;
+}
+
+bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) {
+  if (ActiveMacros.empty())
+    return TokError("unmatched '.endr' directive");
+
+  // The only .repl that should get here are the ones created by
+  // InstantiateMacroLikeBody.
+  assert(getLexer().is(AsmToken::EndOfStatement));
+
+  HandleMacroExit();
+  return false;
+}
+
+bool AsmParser::ParseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
+                                     size_t Len) {
+  const MCExpr *Value;
+  SMLoc ExprLoc = getLexer().getLoc();
+  if (parseExpression(Value))
+    return true;
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+  if (!MCE)
+    return Error(ExprLoc, "unexpected expression in _emit");
+  uint64_t IntValue = MCE->getValue();
+  if (!isUIntN(8, IntValue) && !isIntN(8, IntValue))
+    return Error(ExprLoc, "literal value out of range for directive");
+
+  Info.AsmRewrites->push_back(AsmRewrite(AOK_Emit, IDLoc, Len));
+  return false;
+}
+
+bool AsmParser::ParseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
+  const MCExpr *Value;
+  SMLoc ExprLoc = getLexer().getLoc();
+  if (parseExpression(Value))
+    return true;
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+  if (!MCE)
+    return Error(ExprLoc, "unexpected expression in align");
+  uint64_t IntValue = MCE->getValue();
+  if (!isPowerOf2_64(IntValue))
+    return Error(ExprLoc, "literal value not a power of two greater then zero");
+
+  Info.AsmRewrites->push_back(AsmRewrite(AOK_Align, IDLoc, 5,
+                                         Log2_64(IntValue)));
+  return false;
+}
+
+// We are comparing pointers, but the pointers are relative to a single string.
+// Thus, this should always be deterministic.
+static int RewritesSort(const void *A, const void *B) {
+  const AsmRewrite *AsmRewriteA = static_cast<const AsmRewrite *>(A);
+  const AsmRewrite *AsmRewriteB = static_cast<const AsmRewrite *>(B);
+  if (AsmRewriteA->Loc.getPointer() < AsmRewriteB->Loc.getPointer())
+    return -1;
+  if (AsmRewriteB->Loc.getPointer() < AsmRewriteA->Loc.getPointer())
+    return 1;
+
+  // It's possible to have a SizeDirective rewrite and an Input/Output rewrite
+  // to the same location.  Make sure the SizeDirective rewrite is performed
+  // first.  This also ensure the sort algorithm is stable.
+  if (AsmRewriteA->Kind == AOK_SizeDirective) {
+    assert ((AsmRewriteB->Kind == AOK_Input || AsmRewriteB->Kind == AOK_Output) &&
+            "Expected an Input/Output rewrite!");
+    return -1;
+  }
+  if (AsmRewriteB->Kind == AOK_SizeDirective) {
+    assert ((AsmRewriteA->Kind == AOK_Input || AsmRewriteA->Kind == AOK_Output) &&
+            "Expected an Input/Output rewrite!");
+    return 1;
+  }
+  llvm_unreachable ("Unstable rewrite sort.");
+}
+
+bool
+AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
+                            unsigned &NumOutputs, unsigned &NumInputs,
+                            SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
+                            SmallVectorImpl<std::string> &Constraints,
+                            SmallVectorImpl<std::string> &Clobbers,
+                            const MCInstrInfo *MII,
+                            const MCInstPrinter *IP,
+                            MCAsmParserSemaCallback &SI) {
+  SmallVector<void *, 4> InputDecls;
+  SmallVector<void *, 4> OutputDecls;
+  SmallVector<bool, 4> InputDeclsAddressOf;
+  SmallVector<bool, 4> OutputDeclsAddressOf;
+  SmallVector<std::string, 4> InputConstraints;
+  SmallVector<std::string, 4> OutputConstraints;
+  SmallVector<unsigned, 4> ClobberRegs;
+
+  SmallVector<AsmRewrite, 4> AsmStrRewrites;
+
+  // Prime the lexer.
+  Lex();
+
+  // While we have input, parse each statement.
+  unsigned InputIdx = 0;
+  unsigned OutputIdx = 0;
+  while (getLexer().isNot(AsmToken::Eof)) {
+    ParseStatementInfo Info(&AsmStrRewrites);
+    if (ParseStatement(Info))
+      return true;
+
+    if (Info.ParseError)
+      return true;
+
+    if (Info.Opcode == ~0U)
+      continue;
+
+    const MCInstrDesc &Desc = MII->get(Info.Opcode);
+
+    // Build the list of clobbers, outputs and inputs.
+    for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) {
+      MCParsedAsmOperand *Operand = Info.ParsedOperands[i];
+
+      // Immediate.
+      if (Operand->isImm())
+        continue;
+
+      // Register operand.
+      if (Operand->isReg() && !Operand->needAddressOf()) {
+        unsigned NumDefs = Desc.getNumDefs();
+        // Clobber.
+        if (NumDefs && Operand->getMCOperandNum() < NumDefs)
+          ClobberRegs.push_back(Operand->getReg());
+        continue;
+      }
+
+      // Expr/Input or Output.
+      bool IsVarDecl;
+      unsigned Length, Size, Type;
+      void *OpDecl = SI.LookupInlineAsmIdentifier(Operand->getName(), AsmLoc,
+                                                  Length, Size, Type,
+                                                  IsVarDecl);
+      if (!OpDecl)
+        continue;
+
+      bool isOutput = (i == 1) && Desc.mayStore();
+      if (isOutput) {
+        ++InputIdx;
+        OutputDecls.push_back(OpDecl);
+        OutputDeclsAddressOf.push_back(Operand->needAddressOf());
+        OutputConstraints.push_back('=' + Operand->getConstraint().str());
+        AsmStrRewrites.push_back(AsmRewrite(AOK_Output, Operand->getStartLoc(),
+                                            Operand->getNameLen()));
+      } else {
+        InputDecls.push_back(OpDecl);
+        InputDeclsAddressOf.push_back(Operand->needAddressOf());
+        InputConstraints.push_back(Operand->getConstraint().str());
+        AsmStrRewrites.push_back(AsmRewrite(AOK_Input, Operand->getStartLoc(),
+                                            Operand->getNameLen()));
+      }
+    }
+  }
+
+  // Set the number of Outputs and Inputs.
+  NumOutputs = OutputDecls.size();
+  NumInputs = InputDecls.size();
+
+  // Set the unique clobbers.
+  array_pod_sort(ClobberRegs.begin(), ClobberRegs.end());
+  ClobberRegs.erase(std::unique(ClobberRegs.begin(), ClobberRegs.end()),
+                    ClobberRegs.end());
+  Clobbers.assign(ClobberRegs.size(), std::string());
+  for (unsigned I = 0, E = ClobberRegs.size(); I != E; ++I) {
+    raw_string_ostream OS(Clobbers[I]);
+    IP->printRegName(OS, ClobberRegs[I]);
+  }
+
+  // Merge the various outputs and inputs.  Output are expected first.
+  if (NumOutputs || NumInputs) {
+    unsigned NumExprs = NumOutputs + NumInputs;
+    OpDecls.resize(NumExprs);
+    Constraints.resize(NumExprs);
+    for (unsigned i = 0; i < NumOutputs; ++i) {
+      OpDecls[i] = std::make_pair(OutputDecls[i], OutputDeclsAddressOf[i]);
+      Constraints[i] = OutputConstraints[i];
+    }
+    for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) {
+      OpDecls[j] = std::make_pair(InputDecls[i], InputDeclsAddressOf[i]);
+      Constraints[j] = InputConstraints[i];
+    }
+  }
+
+  // Build the IR assembly string.
+  std::string AsmStringIR;
+  raw_string_ostream OS(AsmStringIR);
+  const char *AsmStart = SrcMgr.getMemoryBuffer(0)->getBufferStart();
+  const char *AsmEnd = SrcMgr.getMemoryBuffer(0)->getBufferEnd();
+  array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), RewritesSort);
+  for (SmallVectorImpl<AsmRewrite>::iterator I = AsmStrRewrites.begin(),
+                                             E = AsmStrRewrites.end();
+       I != E; ++I) {
+    const char *Loc = (*I).Loc.getPointer();
+    assert(Loc >= AsmStart && "Expected Loc to be at or after Start!");
+
+    unsigned AdditionalSkip = 0;
+    AsmRewriteKind Kind = (*I).Kind;
+
+    // Emit everything up to the immediate/expression.
+    unsigned Len = Loc - AsmStart;
+    if (Len) {
+      // For Input/Output operands we need to remove the brackets, if present.
+      if ((Kind == AOK_Input || Kind == AOK_Output) && Loc[-1] == '[')
+        --Len;
+      OS << StringRef(AsmStart, Len);
+    }
+
+    // Skip the original expression.
+    if (Kind == AOK_Skip) {
+      AsmStart = Loc + (*I).Len;
+      continue;
+    }
+
+    // Rewrite expressions in $N notation.
+    switch (Kind) {
+    default: break;
+    case AOK_Imm:
+      OS << "$$" << (*I).Val;
+      break;
+    case AOK_ImmPrefix:
+      OS << "$$";
+      break;
+    case AOK_Input:
+      OS << '$' << InputIdx++;
+      break;
+    case AOK_Output:
+      OS << '$' << OutputIdx++;
+      break;
+    case AOK_SizeDirective:
+      switch ((*I).Val) {
+      default: break;
+      case 8:  OS << "byte ptr "; break;
+      case 16: OS << "word ptr "; break;
+      case 32: OS << "dword ptr "; break;
+      case 64: OS << "qword ptr "; break;
+      case 80: OS << "xword ptr "; break;
+      case 128: OS << "xmmword ptr "; break;
+      case 256: OS << "ymmword ptr "; break;
+      }
+      break;
+    case AOK_Emit:
+      OS << ".byte";
+      break;
+    case AOK_Align: {
+      unsigned Val = (*I).Val;
+      OS << ".align " << Val;
+
+      // Skip the original immediate.
+      assert(Val < 10 && "Expected alignment less then 2^10.");
+      AdditionalSkip = (Val < 4) ? 2 : Val < 7 ? 3 : 4;
+      break;
+    }
+    case AOK_DotOperator:
+      OS << (*I).Val;
+      break;
+    }
+
+    // Skip the original expression.
+    AsmStart = Loc + (*I).Len + AdditionalSkip;
+
+    // For Input/Output operands we need to remove the brackets, if present.
+    if ((Kind == AOK_Input || Kind == AOK_Output) && AsmStart != AsmEnd &&
+        *AsmStart == ']')
+      ++AsmStart;
+  }
+
+  // Emit the remainder of the asm string.
+  if (AsmStart != AsmEnd)
+    OS << StringRef(AsmStart, AsmEnd - AsmStart);
+
+  AsmString = OS.str();
+  return false;
+}
+
+/// \brief Create an MCAsmParser instance.
+MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM,
+                                     MCContext &C, MCStreamer &Out,
+                                     const MCAsmInfo &MAI) {
+  return new AsmParser(SM, C, Out, MAI);
+}
diff --git a/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp
new file mode 100644
index 000000000000..a50eab217d21
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp
@@ -0,0 +1,499 @@
+//===- COFFAsmParser.cpp - COFF Assembly Parser ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/COFF.h"
+using namespace llvm;
+
+namespace {
+
+class COFFAsmParser : public MCAsmParserExtension {
+  template<bool (COFFAsmParser::*HandlerMethod)(StringRef, SMLoc)>
+  void addDirectiveHandler(StringRef Directive) {
+    MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair(
+        this, HandleDirective<COFFAsmParser, HandlerMethod>);
+    getParser().addDirectiveHandler(Directive, Handler);
+  }
+
+  bool ParseSectionSwitch(StringRef Section,
+                          unsigned Characteristics,
+                          SectionKind Kind);
+
+  virtual void Initialize(MCAsmParser &Parser) {
+    // Call the base implementation.
+    MCAsmParserExtension::Initialize(Parser);
+
+    addDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveText>(".text");
+    addDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveData>(".data");
+    addDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveBSS>(".bss");
+    addDirectiveHandler<&COFFAsmParser::ParseDirectiveDef>(".def");
+    addDirectiveHandler<&COFFAsmParser::ParseDirectiveScl>(".scl");
+    addDirectiveHandler<&COFFAsmParser::ParseDirectiveType>(".type");
+    addDirectiveHandler<&COFFAsmParser::ParseDirectiveEndef>(".endef");
+    addDirectiveHandler<&COFFAsmParser::ParseDirectiveSecRel32>(".secrel32");
+
+    // Win64 EH directives.
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>(
+                                                                   ".seh_proc");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProc>(
+                                                                ".seh_endproc");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartChained>(
+                                                           ".seh_startchained");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndChained>(
+                                                             ".seh_endchained");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandler>(
+                                                                ".seh_handler");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveHandlerData>(
+                                                            ".seh_handlerdata");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushReg>(
+                                                                ".seh_pushreg");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSetFrame>(
+                                                               ".seh_setframe");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveAllocStack>(
+                                                             ".seh_stackalloc");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveReg>(
+                                                                ".seh_savereg");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveSaveXMM>(
+                                                                ".seh_savexmm");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectivePushFrame>(
+                                                              ".seh_pushframe");
+    addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProlog>(
+                                                            ".seh_endprologue");
+    addDirectiveHandler<&COFFAsmParser::ParseDirectiveSymbolAttribute>(".weak");
+  }
+
+  bool ParseSectionDirectiveText(StringRef, SMLoc) {
+    return ParseSectionSwitch(".text",
+                              COFF::IMAGE_SCN_CNT_CODE
+                            | COFF::IMAGE_SCN_MEM_EXECUTE
+                            | COFF::IMAGE_SCN_MEM_READ,
+                              SectionKind::getText());
+  }
+  bool ParseSectionDirectiveData(StringRef, SMLoc) {
+    return ParseSectionSwitch(".data",
+                              COFF::IMAGE_SCN_CNT_INITIALIZED_DATA
+                            | COFF::IMAGE_SCN_MEM_READ
+                            | COFF::IMAGE_SCN_MEM_WRITE,
+                              SectionKind::getDataRel());
+  }
+  bool ParseSectionDirectiveBSS(StringRef, SMLoc) {
+    return ParseSectionSwitch(".bss",
+                              COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA
+                            | COFF::IMAGE_SCN_MEM_READ
+                            | COFF::IMAGE_SCN_MEM_WRITE,
+                              SectionKind::getBSS());
+  }
+
+  bool ParseDirectiveDef(StringRef, SMLoc);
+  bool ParseDirectiveScl(StringRef, SMLoc);
+  bool ParseDirectiveType(StringRef, SMLoc);
+  bool ParseDirectiveEndef(StringRef, SMLoc);
+  bool ParseDirectiveSecRel32(StringRef, SMLoc);
+
+  // Win64 EH directives.
+  bool ParseSEHDirectiveStartProc(StringRef, SMLoc);
+  bool ParseSEHDirectiveEndProc(StringRef, SMLoc);
+  bool ParseSEHDirectiveStartChained(StringRef, SMLoc);
+  bool ParseSEHDirectiveEndChained(StringRef, SMLoc);
+  bool ParseSEHDirectiveHandler(StringRef, SMLoc);
+  bool ParseSEHDirectiveHandlerData(StringRef, SMLoc);
+  bool ParseSEHDirectivePushReg(StringRef, SMLoc);
+  bool ParseSEHDirectiveSetFrame(StringRef, SMLoc);
+  bool ParseSEHDirectiveAllocStack(StringRef, SMLoc);
+  bool ParseSEHDirectiveSaveReg(StringRef, SMLoc);
+  bool ParseSEHDirectiveSaveXMM(StringRef, SMLoc);
+  bool ParseSEHDirectivePushFrame(StringRef, SMLoc);
+  bool ParseSEHDirectiveEndProlog(StringRef, SMLoc);
+
+  bool ParseAtUnwindOrAtExcept(bool &unwind, bool &except);
+  bool ParseSEHRegisterNumber(unsigned &RegNo);
+  bool ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc);
+public:
+  COFFAsmParser() {}
+};
+
+} // end annonomous namespace.
+
+/// ParseDirectiveSymbolAttribute
+///  ::= { ".weak", ... } [ identifier ( , identifier )* ]
+bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
+  MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Directive)
+    .Case(".weak", MCSA_Weak)
+    .Default(MCSA_Invalid);
+  assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      StringRef Name;
+
+      if (getParser().parseIdentifier(Name))
+        return TokError("expected identifier in directive");
+
+      MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+      getStreamer().EmitSymbolAttribute(Sym, Attr);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in directive");
+      Lex();
+    }
+  }
+
+  Lex();
+  return false;
+}
+
+bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
+                                       unsigned Characteristics,
+                                       SectionKind Kind) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in section switching directive");
+  Lex();
+
+  getStreamer().SwitchSection(getContext().getCOFFSection(
+                                Section, Characteristics, Kind));
+
+  return false;
+}
+
+bool COFFAsmParser::ParseDirectiveDef(StringRef, SMLoc) {
+  StringRef SymbolName;
+
+  if (getParser().parseIdentifier(SymbolName))
+    return TokError("expected identifier in directive");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(SymbolName);
+
+  getStreamer().BeginCOFFSymbolDef(Sym);
+
+  Lex();
+  return false;
+}
+
+bool COFFAsmParser::ParseDirectiveScl(StringRef, SMLoc) {
+  int64_t SymbolStorageClass;
+  if (getParser().parseAbsoluteExpression(SymbolStorageClass))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitCOFFSymbolStorageClass(SymbolStorageClass);
+  return false;
+}
+
+bool COFFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
+  int64_t Type;
+  if (getParser().parseAbsoluteExpression(Type))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitCOFFSymbolType(Type);
+  return false;
+}
+
+bool COFFAsmParser::ParseDirectiveEndef(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EndCOFFSymbolDef();
+  return false;
+}
+
+bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) {
+  StringRef SymbolID;
+  if (getParser().parseIdentifier(SymbolID))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  MCSymbol *Symbol = getContext().GetOrCreateSymbol(SymbolID);
+
+  Lex();
+  getStreamer().EmitCOFFSecRel32(Symbol);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveStartProc(StringRef, SMLoc) {
+  StringRef SymbolID;
+  if (getParser().parseIdentifier(SymbolID))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  MCSymbol *Symbol = getContext().GetOrCreateSymbol(SymbolID);
+
+  Lex();
+  getStreamer().EmitWin64EHStartProc(Symbol);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveEndProc(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHEndProc();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveStartChained(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHStartChained();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveEndChained(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHEndChained();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveHandler(StringRef, SMLoc) {
+  StringRef SymbolID;
+  if (getParser().parseIdentifier(SymbolID))
+    return true;
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify one or both of @unwind or @except");
+  Lex();
+  bool unwind = false, except = false;
+  if (ParseAtUnwindOrAtExcept(unwind, except))
+    return true;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+    if (ParseAtUnwindOrAtExcept(unwind, except))
+      return true;
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  MCSymbol *handler = getContext().GetOrCreateSymbol(SymbolID);
+
+  Lex();
+  getStreamer().EmitWin64EHHandler(handler, unwind, except);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveHandlerData(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHHandlerData();
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectivePushReg(StringRef, SMLoc L) {
+  unsigned Reg;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHPushReg(Reg);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveSetFrame(StringRef, SMLoc L) {
+  unsigned Reg;
+  int64_t Off;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify a stack pointer offset");
+
+  Lex();
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().parseAbsoluteExpression(Off))
+    return true;
+
+  if (Off & 0x0F)
+    return Error(startLoc, "offset is not a multiple of 16");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHSetFrame(Reg, Off);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveAllocStack(StringRef, SMLoc) {
+  int64_t Size;
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().parseAbsoluteExpression(Size))
+    return true;
+
+  if (Size & 7)
+    return Error(startLoc, "size is not a multiple of 8");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHAllocStack(Size);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveSaveReg(StringRef, SMLoc L) {
+  unsigned Reg;
+  int64_t Off;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify an offset on the stack");
+
+  Lex();
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().parseAbsoluteExpression(Off))
+    return true;
+
+  if (Off & 7)
+    return Error(startLoc, "size is not a multiple of 8");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  // FIXME: Err on %xmm* registers
+  getStreamer().EmitWin64EHSaveReg(Reg, Off);
+  return false;
+}
+
+// FIXME: This method is inherently x86-specific. It should really be in the
+// x86 backend.
+bool COFFAsmParser::ParseSEHDirectiveSaveXMM(StringRef, SMLoc L) {
+  unsigned Reg;
+  int64_t Off;
+  if (ParseSEHRegisterNumber(Reg))
+    return true;
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("you must specify an offset on the stack");
+
+  Lex();
+  SMLoc startLoc = getLexer().getLoc();
+  if (getParser().parseAbsoluteExpression(Off))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  if (Off & 0x0F)
+    return Error(startLoc, "offset is not a multiple of 16");
+
+  Lex();
+  // FIXME: Err on non-%xmm* registers
+  getStreamer().EmitWin64EHSaveXMM(Reg, Off);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectivePushFrame(StringRef, SMLoc) {
+  bool Code = false;
+  StringRef CodeID;
+  if (getLexer().is(AsmToken::At)) {
+    SMLoc startLoc = getLexer().getLoc();
+    Lex();
+    if (!getParser().parseIdentifier(CodeID)) {
+      if (CodeID != "code")
+        return Error(startLoc, "expected @code");
+      Code = true;
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  Lex();
+  getStreamer().EmitWin64EHPushFrame(Code);
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHDirectiveEndProlog(StringRef, SMLoc) {
+  Lex();
+  getStreamer().EmitWin64EHEndProlog();
+  return false;
+}
+
+bool COFFAsmParser::ParseAtUnwindOrAtExcept(bool &unwind, bool &except) {
+  StringRef identifier;
+  if (getLexer().isNot(AsmToken::At))
+    return TokError("a handler attribute must begin with '@'");
+  SMLoc startLoc = getLexer().getLoc();
+  Lex();
+  if (getParser().parseIdentifier(identifier))
+    return Error(startLoc, "expected @unwind or @except");
+  if (identifier == "unwind")
+    unwind = true;
+  else if (identifier == "except")
+    except = true;
+  else
+    return Error(startLoc, "expected @unwind or @except");
+  return false;
+}
+
+bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) {
+  SMLoc startLoc = getLexer().getLoc();
+  if (getLexer().is(AsmToken::Percent)) {
+    const MCRegisterInfo &MRI = getContext().getRegisterInfo();
+    SMLoc endLoc;
+    unsigned LLVMRegNo;
+    if (getParser().getTargetParser().ParseRegister(LLVMRegNo,startLoc,endLoc))
+      return true;
+
+#if 0
+    // FIXME: TargetAsmInfo::getCalleeSavedRegs() commits a serious layering
+    // violation so this validation code is disabled.
+
+    // Check that this is a non-volatile register.
+    const unsigned *NVRegs = TAI.getCalleeSavedRegs();
+    unsigned i;
+    for (i = 0; NVRegs[i] != 0; ++i)
+      if (NVRegs[i] == LLVMRegNo)
+        break;
+    if (NVRegs[i] == 0)
+      return Error(startLoc, "expected non-volatile register");
+#endif
+
+    int SEHRegNo = MRI.getSEHRegNum(LLVMRegNo);
+    if (SEHRegNo < 0)
+      return Error(startLoc,"register can't be represented in SEH unwind info");
+    RegNo = SEHRegNo;
+  }
+  else {
+    int64_t n;
+    if (getParser().parseAbsoluteExpression(n))
+      return true;
+    if (n > 15)
+      return Error(startLoc, "register number is too high");
+    RegNo = n;
+  }
+
+  return false;
+}
+
+namespace llvm {
+
+MCAsmParserExtension *createCOFFAsmParser() {
+  return new COFFAsmParser;
+}
+
+}
diff --git a/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
new file mode 100644
index 000000000000..6d6409fb69e2
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -0,0 +1,826 @@
+//===- DarwinAsmParser.cpp - Darwin (Mach-O) Assembly Parser --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+using namespace llvm;
+
+namespace {
+
+/// \brief Implementation of directive handling which is shared across all
+/// Darwin targets.
+class DarwinAsmParser : public MCAsmParserExtension {
+  template<bool (DarwinAsmParser::*HandlerMethod)(StringRef, SMLoc)>
+  void addDirectiveHandler(StringRef Directive) {
+    MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair(
+        this, HandleDirective<DarwinAsmParser, HandlerMethod>);
+    getParser().addDirectiveHandler(Directive, Handler);
+  }
+
+  bool ParseSectionSwitch(const char *Segment, const char *Section,
+                          unsigned TAA = 0, unsigned ImplicitAlign = 0,
+                          unsigned StubSize = 0);
+
+public:
+  DarwinAsmParser() {}
+
+  virtual void Initialize(MCAsmParser &Parser) {
+    // Call the base implementation.
+    this->MCAsmParserExtension::Initialize(Parser);
+
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDesc>(".desc");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveLsym>(".lsym");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols>(
+      ".subsections_via_symbols");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".dump");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDumpOrLoad>(".load");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSection>(".section");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectivePushSection>(
+      ".pushsection");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectivePopSection>(
+      ".popsection");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectivePrevious>(".previous");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogUnique>(
+      ".secure_log_unique");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSecureLogReset>(
+      ".secure_log_reset");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveTBSS>(".tbss");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveZerofill>(".zerofill");
+
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegion>(
+      ".data_region");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDataRegionEnd>(
+      ".end_data_region");
+
+    // Special section directives.
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>(
+      ".const_data");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstructor>(
+      ".constructor");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveCString>(
+      ".cstring");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveData>(".data");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveDestructor>(
+      ".destructor");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveDyld>(".dyld");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveFVMLibInit0>(
+      ".fvmlib_init0");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveFVMLibInit1>(
+      ".fvmlib_init1");
+    addDirectiveHandler<
+      &DarwinAsmParser::ParseSectionDirectiveLazySymbolPointers>(
+        ".lazy_symbol_pointer");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveLinkerOption>(
+      ".linker_option");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral16>(
+      ".literal16");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral4>(
+      ".literal4");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveLiteral8>(
+      ".literal8");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveModInitFunc>(
+      ".mod_init_func");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveModTermFunc>(
+      ".mod_term_func");
+    addDirectiveHandler<
+      &DarwinAsmParser::ParseSectionDirectiveNonLazySymbolPointers>(
+        ".non_lazy_symbol_pointer");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCatClsMeth>(
+      ".objc_cat_cls_meth");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCatInstMeth>(
+      ".objc_cat_inst_meth");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCCategory>(
+      ".objc_category");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClass>(
+      ".objc_class");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClassNames>(
+      ".objc_class_names");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClassVars>(
+      ".objc_class_vars");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClsMeth>(
+      ".objc_cls_meth");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCClsRefs>(
+      ".objc_cls_refs");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCInstMeth>(
+      ".objc_inst_meth");
+    addDirectiveHandler<
+      &DarwinAsmParser::ParseSectionDirectiveObjCInstanceVars>(
+        ".objc_instance_vars");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMessageRefs>(
+      ".objc_message_refs");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCMetaClass>(
+      ".objc_meta_class");
+    addDirectiveHandler<
+      &DarwinAsmParser::ParseSectionDirectiveObjCMethVarNames>(
+        ".objc_meth_var_names");
+    addDirectiveHandler<
+      &DarwinAsmParser::ParseSectionDirectiveObjCMethVarTypes>(
+        ".objc_meth_var_types");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCModuleInfo>(
+      ".objc_module_info");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCProtocol>(
+      ".objc_protocol");
+    addDirectiveHandler<
+      &DarwinAsmParser::ParseSectionDirectiveObjCSelectorStrs>(
+        ".objc_selector_strs");
+    addDirectiveHandler<
+      &DarwinAsmParser::ParseSectionDirectiveObjCStringObject>(
+        ".objc_string_object");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveObjCSymbols>(
+      ".objc_symbols");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectivePICSymbolStub>(
+      ".picsymbol_stub");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveStaticConst>(
+      ".static_const");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveStaticData>(
+      ".static_data");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveSymbolStub>(
+      ".symbol_stub");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveTData>(".tdata");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveText>(".text");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveThreadInitFunc>(
+      ".thread_init_func");
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveTLV>(".tlv");
+
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveIdent>(".ident");
+  }
+
+  bool ParseDirectiveDesc(StringRef, SMLoc);
+  bool ParseDirectiveDumpOrLoad(StringRef, SMLoc);
+  bool ParseDirectiveLsym(StringRef, SMLoc);
+  bool ParseDirectiveLinkerOption(StringRef, SMLoc);
+  bool ParseDirectiveSection(StringRef, SMLoc);
+  bool ParseDirectivePushSection(StringRef, SMLoc);
+  bool ParseDirectivePopSection(StringRef, SMLoc);
+  bool ParseDirectivePrevious(StringRef, SMLoc);
+  bool ParseDirectiveSecureLogReset(StringRef, SMLoc);
+  bool ParseDirectiveSecureLogUnique(StringRef, SMLoc);
+  bool ParseDirectiveSubsectionsViaSymbols(StringRef, SMLoc);
+  bool ParseDirectiveTBSS(StringRef, SMLoc);
+  bool ParseDirectiveZerofill(StringRef, SMLoc);
+  bool ParseDirectiveDataRegion(StringRef, SMLoc);
+  bool ParseDirectiveDataRegionEnd(StringRef, SMLoc);
+
+  // Named Section Directive
+  bool ParseSectionDirectiveConst(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT", "__const");
+  }
+  bool ParseSectionDirectiveStaticConst(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT", "__static_const");
+  }
+  bool ParseSectionDirectiveCString(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT","__cstring",
+                              MCSectionMachO::S_CSTRING_LITERALS);
+  }
+  bool ParseSectionDirectiveLiteral4(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT", "__literal4",
+                              MCSectionMachO::S_4BYTE_LITERALS, 4);
+  }
+  bool ParseSectionDirectiveLiteral8(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT", "__literal8",
+                              MCSectionMachO::S_8BYTE_LITERALS, 8);
+  }
+  bool ParseSectionDirectiveLiteral16(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT","__literal16",
+                              MCSectionMachO::S_16BYTE_LITERALS, 16);
+  }
+  bool ParseSectionDirectiveConstructor(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT","__constructor");
+  }
+  bool ParseSectionDirectiveDestructor(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT","__destructor");
+  }
+  bool ParseSectionDirectiveFVMLibInit0(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT","__fvmlib_init0");
+  }
+  bool ParseSectionDirectiveFVMLibInit1(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT","__fvmlib_init1");
+  }
+  bool ParseSectionDirectiveSymbolStub(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT","__symbol_stub",
+                              MCSectionMachO::S_SYMBOL_STUBS |
+                              MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                              // FIXME: Different on PPC and ARM.
+                              0, 16);
+  }
+  bool ParseSectionDirectivePICSymbolStub(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT","__picsymbol_stub",
+                              MCSectionMachO::S_SYMBOL_STUBS |
+                              MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS, 0, 26);
+  }
+  bool ParseSectionDirectiveData(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__data");
+  }
+  bool ParseSectionDirectiveStaticData(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__static_data");
+  }
+  bool ParseSectionDirectiveNonLazySymbolPointers(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__nl_symbol_ptr",
+                              MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS, 4);
+  }
+  bool ParseSectionDirectiveLazySymbolPointers(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__la_symbol_ptr",
+                              MCSectionMachO::S_LAZY_SYMBOL_POINTERS, 4);
+  }
+  bool ParseSectionDirectiveDyld(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__dyld");
+  }
+  bool ParseSectionDirectiveModInitFunc(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__mod_init_func",
+                              MCSectionMachO::S_MOD_INIT_FUNC_POINTERS, 4);
+  }
+  bool ParseSectionDirectiveModTermFunc(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__mod_term_func",
+                              MCSectionMachO::S_MOD_TERM_FUNC_POINTERS, 4);
+  }
+  bool ParseSectionDirectiveConstData(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__const");
+  }
+  bool ParseSectionDirectiveObjCClass(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__class",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCMetaClass(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__meta_class",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCCatClsMeth(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__cat_cls_meth",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCCatInstMeth(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__cat_inst_meth",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCProtocol(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__protocol",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCStringObject(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__string_object",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCClsMeth(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__cls_meth",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCInstMeth(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__inst_meth",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCClsRefs(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__cls_refs",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP |
+                              MCSectionMachO::S_LITERAL_POINTERS, 4);
+  }
+  bool ParseSectionDirectiveObjCMessageRefs(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__message_refs",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP |
+                              MCSectionMachO::S_LITERAL_POINTERS, 4);
+  }
+  bool ParseSectionDirectiveObjCSymbols(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__symbols",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCCategory(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__category",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCClassVars(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__class_vars",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCInstanceVars(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__instance_vars",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCModuleInfo(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__module_info",
+                              MCSectionMachO::S_ATTR_NO_DEAD_STRIP);
+  }
+  bool ParseSectionDirectiveObjCClassNames(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT", "__cstring",
+                              MCSectionMachO::S_CSTRING_LITERALS);
+  }
+  bool ParseSectionDirectiveObjCMethVarTypes(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT", "__cstring",
+                              MCSectionMachO::S_CSTRING_LITERALS);
+  }
+  bool ParseSectionDirectiveObjCMethVarNames(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT", "__cstring",
+                              MCSectionMachO::S_CSTRING_LITERALS);
+  }
+  bool ParseSectionDirectiveObjCSelectorStrs(StringRef, SMLoc) {
+    return ParseSectionSwitch("__OBJC", "__selector_strs",
+                              MCSectionMachO::S_CSTRING_LITERALS);
+  }
+  bool ParseSectionDirectiveTData(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__thread_data",
+                              MCSectionMachO::S_THREAD_LOCAL_REGULAR);
+  }
+  bool ParseSectionDirectiveText(StringRef, SMLoc) {
+    return ParseSectionSwitch("__TEXT", "__text",
+                              MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS);
+  }
+  bool ParseSectionDirectiveTLV(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__thread_vars",
+                              MCSectionMachO::S_THREAD_LOCAL_VARIABLES);
+  }
+  bool ParseSectionDirectiveIdent(StringRef, SMLoc) {
+    // Darwin silently ignores the .ident directive.
+    getParser().eatToEndOfStatement();
+    return false;
+  }
+  bool ParseSectionDirectiveThreadInitFunc(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__thread_init",
+                         MCSectionMachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS);
+  }
+
+};
+
+} // end anonymous namespace
+
+bool DarwinAsmParser::ParseSectionSwitch(const char *Segment,
+                                         const char *Section,
+                                         unsigned TAA, unsigned Align,
+                                         unsigned StubSize) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in section switching directive");
+  Lex();
+
+  // FIXME: Arch specific.
+  bool isText = TAA & MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS;
+  getStreamer().SwitchSection(getContext().getMachOSection(
+                                Segment, Section, TAA, StubSize,
+                                isText ? SectionKind::getText()
+                                       : SectionKind::getDataRel()));
+
+  // Set the implicit alignment, if any.
+  //
+  // FIXME: This isn't really what 'as' does; I think it just uses the implicit
+  // alignment on the section (e.g., if one manually inserts bytes into the
+  // section, then just issuing the section switch directive will not realign
+  // the section. However, this is arguably more reasonable behavior, and there
+  // is no good reason for someone to intentionally emit incorrectly sized
+  // values into the implicitly aligned sections.
+  if (Align)
+    getStreamer().EmitValueToAlignment(Align, 0, 1, 0);
+
+  return false;
+}
+
+/// ParseDirectiveDesc
+///  ::= .desc identifier , expression
+bool DarwinAsmParser::ParseDirectiveDesc(StringRef, SMLoc) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+
+  // Handle the identifier as the key symbol.
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in '.desc' directive");
+  Lex();
+
+  int64_t DescValue;
+  if (getParser().parseAbsoluteExpression(DescValue))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.desc' directive");
+
+  Lex();
+
+  // Set the n_desc field of this Symbol to this DescValue
+  getStreamer().EmitSymbolDesc(Sym, DescValue);
+
+  return false;
+}
+
+/// ParseDirectiveDumpOrLoad
+///  ::= ( .dump | .load ) "filename"
+bool DarwinAsmParser::ParseDirectiveDumpOrLoad(StringRef Directive,
+                                               SMLoc IDLoc) {
+  bool IsDump = Directive == ".dump";
+  if (getLexer().isNot(AsmToken::String))
+    return TokError("expected string in '.dump' or '.load' directive");
+
+  Lex();
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.dump' or '.load' directive");
+
+  Lex();
+
+  // FIXME: If/when .dump and .load are implemented they will be done in the
+  // the assembly parser and not have any need for an MCStreamer API.
+  if (IsDump)
+    return Warning(IDLoc, "ignoring directive .dump for now");
+  else
+    return Warning(IDLoc, "ignoring directive .load for now");
+}
+
+/// ParseDirectiveLinkerOption
+///  ::= .linker_option "string" ( , "string" )*
+bool DarwinAsmParser::ParseDirectiveLinkerOption(StringRef IDVal, SMLoc) {
+  SmallVector<std::string, 4> Args;
+  for (;;) {
+    if (getLexer().isNot(AsmToken::String))
+      return TokError("expected string in '" + Twine(IDVal) + "' directive");
+
+    std::string Data;
+    if (getParser().parseEscapedString(Data))
+      return true;
+
+    Args.push_back(Data);
+
+    Lex();
+    if (getLexer().is(AsmToken::EndOfStatement))
+      break;
+
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+    Lex();
+  }
+
+  getStreamer().EmitLinkerOptions(Args);
+  return false;
+}
+
+/// ParseDirectiveLsym
+///  ::= .lsym identifier , expression
+bool DarwinAsmParser::ParseDirectiveLsym(StringRef, SMLoc) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+
+  // Handle the identifier as the key symbol.
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in '.lsym' directive");
+  Lex();
+
+  const MCExpr *Value;
+  if (getParser().parseExpression(Value))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.lsym' directive");
+
+  Lex();
+
+  // We don't currently support this directive.
+  //
+  // FIXME: Diagnostic location!
+  (void) Sym;
+  return TokError("directive '.lsym' is unsupported");
+}
+
+/// ParseDirectiveSection:
+///   ::= .section identifier (',' identifier)*
+bool DarwinAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
+  SMLoc Loc = getLexer().getLoc();
+
+  StringRef SectionName;
+  if (getParser().parseIdentifier(SectionName))
+    return Error(Loc, "expected identifier after '.section' directive");
+
+  // Verify there is a following comma.
+  if (!getLexer().is(AsmToken::Comma))
+    return TokError("unexpected token in '.section' directive");
+
+  std::string SectionSpec = SectionName;
+  SectionSpec += ",";
+
+  // Add all the tokens until the end of the line, ParseSectionSpecifier will
+  // handle this.
+  StringRef EOL = getLexer().LexUntilEndOfStatement();
+  SectionSpec.append(EOL.begin(), EOL.end());
+
+  Lex();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.section' directive");
+  Lex();
+
+
+  StringRef Segment, Section;
+  unsigned StubSize;
+  unsigned TAA;
+  bool TAAParsed;
+  std::string ErrorStr =
+    MCSectionMachO::ParseSectionSpecifier(SectionSpec, Segment, Section,
+                                          TAA, TAAParsed, StubSize);
+
+  if (!ErrorStr.empty())
+    return Error(Loc, ErrorStr.c_str());
+
+  // FIXME: Arch specific.
+  bool isText = Segment == "__TEXT";  // FIXME: Hack.
+  getStreamer().SwitchSection(getContext().getMachOSection(
+                                Segment, Section, TAA, StubSize,
+                                isText ? SectionKind::getText()
+                                : SectionKind::getDataRel()));
+  return false;
+}
+
+/// ParseDirectivePushSection:
+///   ::= .pushsection identifier (',' identifier)*
+bool DarwinAsmParser::ParseDirectivePushSection(StringRef S, SMLoc Loc) {
+  getStreamer().PushSection();
+
+  if (ParseDirectiveSection(S, Loc)) {
+    getStreamer().PopSection();
+    return true;
+  }
+
+  return false;
+}
+
+/// ParseDirectivePopSection:
+///   ::= .popsection
+bool DarwinAsmParser::ParseDirectivePopSection(StringRef, SMLoc) {
+  if (!getStreamer().PopSection())
+    return TokError(".popsection without corresponding .pushsection");
+  return false;
+}
+
+/// ParseDirectivePrevious:
+///   ::= .previous
+bool DarwinAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) {
+  const MCSection *PreviousSection = getStreamer().getPreviousSection();
+  if (PreviousSection == NULL)
+      return TokError(".previous without corresponding .section");
+  getStreamer().SwitchSection(PreviousSection);
+  return false;
+}
+
+/// ParseDirectiveSecureLogUnique
+///  ::= .secure_log_unique ... message ...
+bool DarwinAsmParser::ParseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) {
+  StringRef LogMessage = getParser().parseStringToEndOfStatement();
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.secure_log_unique' directive");
+
+  if (getContext().getSecureLogUsed() != false)
+    return Error(IDLoc, ".secure_log_unique specified multiple times");
+
+  // Get the secure log path.
+  const char *SecureLogFile = getContext().getSecureLogFile();
+  if (SecureLogFile == NULL)
+    return Error(IDLoc, ".secure_log_unique used but AS_SECURE_LOG_FILE "
+                 "environment variable unset.");
+
+  // Open the secure log file if we haven't already.
+  raw_ostream *OS = getContext().getSecureLog();
+  if (OS == NULL) {
+    std::string Err;
+    OS = new raw_fd_ostream(SecureLogFile, Err, raw_fd_ostream::F_Append);
+    if (!Err.empty()) {
+       delete OS;
+       return Error(IDLoc, Twine("can't open secure log file: ") +
+                    SecureLogFile + " (" + Err + ")");
+    }
+    getContext().setSecureLog(OS);
+  }
+
+  // Write the message.
+  int CurBuf = getSourceManager().FindBufferContainingLoc(IDLoc);
+  *OS << getSourceManager().getBufferInfo(CurBuf).Buffer->getBufferIdentifier()
+      << ":" << getSourceManager().FindLineNumber(IDLoc, CurBuf) << ":"
+      << LogMessage + "\n";
+
+  getContext().setSecureLogUsed(true);
+
+  return false;
+}
+
+/// ParseDirectiveSecureLogReset
+///  ::= .secure_log_reset
+bool DarwinAsmParser::ParseDirectiveSecureLogReset(StringRef, SMLoc IDLoc) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.secure_log_reset' directive");
+
+  Lex();
+
+  getContext().setSecureLogUsed(false);
+
+  return false;
+}
+
+/// ParseDirectiveSubsectionsViaSymbols
+///  ::= .subsections_via_symbols
+bool DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols(StringRef, SMLoc) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.subsections_via_symbols' directive");
+
+  Lex();
+
+  getStreamer().EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+
+  return false;
+}
+
+/// ParseDirectiveTBSS
+///  ::= .tbss identifier, size, align
+bool DarwinAsmParser::ParseDirectiveTBSS(StringRef, SMLoc) {
+  SMLoc IDLoc = getLexer().getLoc();
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+
+  // Handle the identifier as the key symbol.
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  int64_t Size;
+  SMLoc SizeLoc = getLexer().getLoc();
+  if (getParser().parseAbsoluteExpression(Size))
+    return true;
+
+  int64_t Pow2Alignment = 0;
+  SMLoc Pow2AlignmentLoc;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+    Pow2AlignmentLoc = getLexer().getLoc();
+    if (getParser().parseAbsoluteExpression(Pow2Alignment))
+      return true;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.tbss' directive");
+
+  Lex();
+
+  if (Size < 0)
+    return Error(SizeLoc, "invalid '.tbss' directive size, can't be less than"
+                 "zero");
+
+  // FIXME: Diagnose overflow.
+  if (Pow2Alignment < 0)
+    return Error(Pow2AlignmentLoc, "invalid '.tbss' alignment, can't be less"
+                 "than zero");
+
+  if (!Sym->isUndefined())
+    return Error(IDLoc, "invalid symbol redefinition");
+
+  getStreamer().EmitTBSSSymbol(getContext().getMachOSection(
+                                 "__DATA", "__thread_bss",
+                                 MCSectionMachO::S_THREAD_LOCAL_ZEROFILL,
+                                 0, SectionKind::getThreadBSS()),
+                               Sym, Size, 1 << Pow2Alignment);
+
+  return false;
+}
+
+/// ParseDirectiveZerofill
+///  ::= .zerofill segname , sectname [, identifier , size_expression [
+///      , align_expression ]]
+bool DarwinAsmParser::ParseDirectiveZerofill(StringRef, SMLoc) {
+  StringRef Segment;
+  if (getParser().parseIdentifier(Segment))
+    return TokError("expected segment name after '.zerofill' directive");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  StringRef Section;
+  if (getParser().parseIdentifier(Section))
+    return TokError("expected section name after comma in '.zerofill' "
+                    "directive");
+
+  // If this is the end of the line all that was wanted was to create the
+  // the section but with no symbol.
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    // Create the zerofill section but no symbol
+    getStreamer().EmitZerofill(getContext().getMachOSection(
+                                 Segment, Section, MCSectionMachO::S_ZEROFILL,
+                                 0, SectionKind::getBSS()));
+    return false;
+  }
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  SMLoc IDLoc = getLexer().getLoc();
+  StringRef IDStr;
+  if (getParser().parseIdentifier(IDStr))
+    return TokError("expected identifier in directive");
+
+  // handle the identifier as the key symbol.
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(IDStr);
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  int64_t Size;
+  SMLoc SizeLoc = getLexer().getLoc();
+  if (getParser().parseAbsoluteExpression(Size))
+    return true;
+
+  int64_t Pow2Alignment = 0;
+  SMLoc Pow2AlignmentLoc;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+    Pow2AlignmentLoc = getLexer().getLoc();
+    if (getParser().parseAbsoluteExpression(Pow2Alignment))
+      return true;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.zerofill' directive");
+
+  Lex();
+
+  if (Size < 0)
+    return Error(SizeLoc, "invalid '.zerofill' directive size, can't be less "
+                 "than zero");
+
+  // NOTE: The alignment in the directive is a power of 2 value, the assembler
+  // may internally end up wanting an alignment in bytes.
+  // FIXME: Diagnose overflow.
+  if (Pow2Alignment < 0)
+    return Error(Pow2AlignmentLoc, "invalid '.zerofill' directive alignment, "
+                 "can't be less than zero");
+
+  if (!Sym->isUndefined())
+    return Error(IDLoc, "invalid symbol redefinition");
+
+  // Create the zerofill Symbol with Size and Pow2Alignment
+  //
+  // FIXME: Arch specific.
+  getStreamer().EmitZerofill(getContext().getMachOSection(
+                               Segment, Section, MCSectionMachO::S_ZEROFILL,
+                               0, SectionKind::getBSS()),
+                             Sym, Size, 1 << Pow2Alignment);
+
+  return false;
+}
+
+/// ParseDirectiveDataRegion
+///  ::= .data_region [ ( jt8 | jt16 | jt32 ) ]
+bool DarwinAsmParser::ParseDirectiveDataRegion(StringRef, SMLoc) {
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Lex();
+    getStreamer().EmitDataRegion(MCDR_DataRegion);
+    return false;
+  }
+  StringRef RegionType;
+  SMLoc Loc = getParser().getTok().getLoc();
+  if (getParser().parseIdentifier(RegionType))
+    return TokError("expected region type after '.data_region' directive");
+  int Kind = StringSwitch<int>(RegionType)
+    .Case("jt8", MCDR_DataRegionJT8)
+    .Case("jt16", MCDR_DataRegionJT16)
+    .Case("jt32", MCDR_DataRegionJT32)
+    .Default(-1);
+  if (Kind == -1)
+    return Error(Loc, "unknown region type in '.data_region' directive");
+  Lex();
+
+  getStreamer().EmitDataRegion((MCDataRegionType)Kind);
+  return false;
+}
+
+/// ParseDirectiveDataRegionEnd
+///  ::= .end_data_region
+bool DarwinAsmParser::ParseDirectiveDataRegionEnd(StringRef, SMLoc) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.end_data_region' directive");
+
+  Lex();
+  getStreamer().EmitDataRegion(MCDR_DataRegionEnd);
+  return false;
+}
+
+namespace llvm {
+
+MCAsmParserExtension *createDarwinAsmParser() {
+  return new DarwinAsmParser;
+}
+
+} // end llvm namespace
diff --git a/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp
new file mode 100644
index 000000000000..4c45e087445d
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -0,0 +1,622 @@
+//===- ELFAsmParser.cpp - ELF Assembly Parser -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ELF.h"
+using namespace llvm;
+
+namespace {
+
+class ELFAsmParser : public MCAsmParserExtension {
+  template<bool (ELFAsmParser::*HandlerMethod)(StringRef, SMLoc)>
+  void addDirectiveHandler(StringRef Directive) {
+    MCAsmParser::ExtensionDirectiveHandler Handler = std::make_pair(
+        this, HandleDirective<ELFAsmParser, HandlerMethod>);
+
+    getParser().addDirectiveHandler(Directive, Handler);
+  }
+
+  bool ParseSectionSwitch(StringRef Section, unsigned Type,
+                          unsigned Flags, SectionKind Kind);
+  bool SeenIdent;
+
+public:
+  ELFAsmParser() : SeenIdent(false) {
+    BracketExpressionsSupported = true;
+  }
+
+  virtual void Initialize(MCAsmParser &Parser) {
+    // Call the base implementation.
+    this->MCAsmParserExtension::Initialize(Parser);
+
+    addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveData>(".data");
+    addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveText>(".text");
+    addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveBSS>(".bss");
+    addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveRoData>(".rodata");
+    addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTData>(".tdata");
+    addDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTBSS>(".tbss");
+    addDirectiveHandler<
+      &ELFAsmParser::ParseSectionDirectiveDataRel>(".data.rel");
+    addDirectiveHandler<
+      &ELFAsmParser::ParseSectionDirectiveDataRelRo>(".data.rel.ro");
+    addDirectiveHandler<
+      &ELFAsmParser::ParseSectionDirectiveDataRelRoLocal>(".data.rel.ro.local");
+    addDirectiveHandler<
+      &ELFAsmParser::ParseSectionDirectiveEhFrame>(".eh_frame");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectiveSection>(".section");
+    addDirectiveHandler<
+      &ELFAsmParser::ParseDirectivePushSection>(".pushsection");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectivePopSection>(".popsection");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectiveSize>(".size");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectivePrevious>(".previous");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectiveType>(".type");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectiveIdent>(".ident");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymver>(".symver");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectiveVersion>(".version");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectiveWeakref>(".weakref");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".weak");
+    addDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".local");
+    addDirectiveHandler<
+      &ELFAsmParser::ParseDirectiveSymbolAttribute>(".protected");
+    addDirectiveHandler<
+      &ELFAsmParser::ParseDirectiveSymbolAttribute>(".internal");
+    addDirectiveHandler<
+      &ELFAsmParser::ParseDirectiveSymbolAttribute>(".hidden");
+  }
+
+  // FIXME: Part of this logic is duplicated in the MCELFStreamer. What is
+  // the best way for us to get access to it?
+  bool ParseSectionDirectiveData(StringRef, SMLoc) {
+    return ParseSectionSwitch(".data", ELF::SHT_PROGBITS,
+                              ELF::SHF_WRITE |ELF::SHF_ALLOC,
+                              SectionKind::getDataRel());
+  }
+  bool ParseSectionDirectiveText(StringRef, SMLoc) {
+    return ParseSectionSwitch(".text", ELF::SHT_PROGBITS,
+                              ELF::SHF_EXECINSTR |
+                              ELF::SHF_ALLOC, SectionKind::getText());
+  }
+  bool ParseSectionDirectiveBSS(StringRef, SMLoc) {
+    return ParseSectionSwitch(".bss", ELF::SHT_NOBITS,
+                              ELF::SHF_WRITE |
+                              ELF::SHF_ALLOC, SectionKind::getBSS());
+  }
+  bool ParseSectionDirectiveRoData(StringRef, SMLoc) {
+    return ParseSectionSwitch(".rodata", ELF::SHT_PROGBITS,
+                              ELF::SHF_ALLOC,
+                              SectionKind::getReadOnly());
+  }
+  bool ParseSectionDirectiveTData(StringRef, SMLoc) {
+    return ParseSectionSwitch(".tdata", ELF::SHT_PROGBITS,
+                              ELF::SHF_ALLOC |
+                              ELF::SHF_TLS | ELF::SHF_WRITE,
+                              SectionKind::getThreadData());
+  }
+  bool ParseSectionDirectiveTBSS(StringRef, SMLoc) {
+    return ParseSectionSwitch(".tbss", ELF::SHT_NOBITS,
+                              ELF::SHF_ALLOC |
+                              ELF::SHF_TLS | ELF::SHF_WRITE,
+                              SectionKind::getThreadBSS());
+  }
+  bool ParseSectionDirectiveDataRel(StringRef, SMLoc) {
+    return ParseSectionSwitch(".data.rel", ELF::SHT_PROGBITS,
+                              ELF::SHF_ALLOC |
+                              ELF::SHF_WRITE,
+                              SectionKind::getDataRel());
+  }
+  bool ParseSectionDirectiveDataRelRo(StringRef, SMLoc) {
+    return ParseSectionSwitch(".data.rel.ro", ELF::SHT_PROGBITS,
+                              ELF::SHF_ALLOC |
+                              ELF::SHF_WRITE,
+                              SectionKind::getReadOnlyWithRel());
+  }
+  bool ParseSectionDirectiveDataRelRoLocal(StringRef, SMLoc) {
+    return ParseSectionSwitch(".data.rel.ro.local", ELF::SHT_PROGBITS,
+                              ELF::SHF_ALLOC |
+                              ELF::SHF_WRITE,
+                              SectionKind::getReadOnlyWithRelLocal());
+  }
+  bool ParseSectionDirectiveEhFrame(StringRef, SMLoc) {
+    return ParseSectionSwitch(".eh_frame", ELF::SHT_PROGBITS,
+                              ELF::SHF_ALLOC |
+                              ELF::SHF_WRITE,
+                              SectionKind::getDataRel());
+  }
+  bool ParseDirectivePushSection(StringRef, SMLoc);
+  bool ParseDirectivePopSection(StringRef, SMLoc);
+  bool ParseDirectiveSection(StringRef, SMLoc);
+  bool ParseDirectiveSize(StringRef, SMLoc);
+  bool ParseDirectivePrevious(StringRef, SMLoc);
+  bool ParseDirectiveType(StringRef, SMLoc);
+  bool ParseDirectiveIdent(StringRef, SMLoc);
+  bool ParseDirectiveSymver(StringRef, SMLoc);
+  bool ParseDirectiveVersion(StringRef, SMLoc);
+  bool ParseDirectiveWeakref(StringRef, SMLoc);
+  bool ParseDirectiveSymbolAttribute(StringRef, SMLoc);
+
+private:
+  bool ParseSectionName(StringRef &SectionName);
+};
+
+}
+
+/// ParseDirectiveSymbolAttribute
+///  ::= { ".local", ".weak", ... } [ identifier ( , identifier )* ]
+bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
+  MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Directive)
+    .Case(".weak", MCSA_Weak)
+    .Case(".local", MCSA_Local)
+    .Case(".hidden", MCSA_Hidden)
+    .Case(".internal", MCSA_Internal)
+    .Case(".protected", MCSA_Protected)
+    .Default(MCSA_Invalid);
+  assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      StringRef Name;
+
+      if (getParser().parseIdentifier(Name))
+        return TokError("expected identifier in directive");
+
+      MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+      getStreamer().EmitSymbolAttribute(Sym, Attr);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in directive");
+      Lex();
+    }
+  }
+
+  Lex();
+  return false;
+}
+
+bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type,
+                                      unsigned Flags, SectionKind Kind) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in section switching directive");
+  Lex();
+
+  getStreamer().SwitchSection(getContext().getELFSection(
+                                Section, Type, Flags, Kind));
+
+  return false;
+}
+
+bool ELFAsmParser::ParseDirectiveSize(StringRef, SMLoc) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in directive");
+  Lex();
+
+  const MCExpr *Expr;
+  if (getParser().parseExpression(Expr))
+    return true;
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  getStreamer().EmitELFSize(Sym, Expr);
+  return false;
+}
+
+bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
+  // A section name can contain -, so we cannot just use
+  // parseIdentifier.
+  SMLoc FirstLoc = getLexer().getLoc();
+  unsigned Size = 0;
+
+  if (getLexer().is(AsmToken::String)) {
+    SectionName = getTok().getIdentifier();
+    Lex();
+    return false;
+  }
+
+  for (;;) {
+    StringRef Tmp;
+    unsigned CurSize;
+
+    SMLoc PrevLoc = getLexer().getLoc();
+    if (getLexer().is(AsmToken::Minus)) {
+      CurSize = 1;
+      Lex(); // Consume the "-".
+    } else if (getLexer().is(AsmToken::String)) {
+      CurSize = getTok().getIdentifier().size() + 2;
+      Lex();
+    } else if (getLexer().is(AsmToken::Identifier)) {
+      CurSize = getTok().getIdentifier().size();
+      Lex();
+    } else {
+      break;
+    }
+
+    Size += CurSize;
+    SectionName = StringRef(FirstLoc.getPointer(), Size);
+
+    // Make sure the following token is adjacent.
+    if (PrevLoc.getPointer() + CurSize != getTok().getLoc().getPointer())
+      break;
+  }
+  if (Size == 0)
+    return true;
+
+  return false;
+}
+
+static SectionKind computeSectionKind(unsigned Flags) {
+  if (Flags & ELF::SHF_EXECINSTR)
+    return SectionKind::getText();
+  if (Flags & ELF::SHF_TLS)
+    return SectionKind::getThreadData();
+  return SectionKind::getDataRel();
+}
+
+static int parseSectionFlags(StringRef flagsStr) {
+  int flags = 0;
+
+  for (unsigned i = 0; i < flagsStr.size(); i++) {
+    switch (flagsStr[i]) {
+    case 'a':
+      flags |= ELF::SHF_ALLOC;
+      break;
+    case 'x':
+      flags |= ELF::SHF_EXECINSTR;
+      break;
+    case 'w':
+      flags |= ELF::SHF_WRITE;
+      break;
+    case 'M':
+      flags |= ELF::SHF_MERGE;
+      break;
+    case 'S':
+      flags |= ELF::SHF_STRINGS;
+      break;
+    case 'T':
+      flags |= ELF::SHF_TLS;
+      break;
+    case 'c':
+      flags |= ELF::XCORE_SHF_CP_SECTION;
+      break;
+    case 'd':
+      flags |= ELF::XCORE_SHF_DP_SECTION;
+      break;
+    case 'G':
+      flags |= ELF::SHF_GROUP;
+      break;
+    default:
+      return -1;
+    }
+  }
+
+  return flags;
+}
+
+bool ELFAsmParser::ParseDirectivePushSection(StringRef s, SMLoc loc) {
+  getStreamer().PushSection();
+
+  if (ParseDirectiveSection(s, loc)) {
+    getStreamer().PopSection();
+    return true;
+  }
+
+  return false;
+}
+
+bool ELFAsmParser::ParseDirectivePopSection(StringRef, SMLoc) {
+  if (!getStreamer().PopSection())
+    return TokError(".popsection without corresponding .pushsection");
+  return false;
+}
+
+// FIXME: This is a work in progress.
+bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
+  StringRef SectionName;
+
+  if (ParseSectionName(SectionName))
+    return TokError("expected identifier in directive");
+
+  StringRef TypeName;
+  int64_t Size = 0;
+  StringRef GroupName;
+  unsigned Flags = 0;
+
+  // Set the defaults first.
+  if (SectionName == ".fini" || SectionName == ".init" ||
+      SectionName == ".rodata")
+    Flags |= ELF::SHF_ALLOC;
+  if (SectionName == ".fini" || SectionName == ".init")
+    Flags |= ELF::SHF_EXECINSTR;
+
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+
+    if (getLexer().isNot(AsmToken::String))
+      return TokError("expected string in directive");
+
+    StringRef FlagsStr = getTok().getStringContents();
+    Lex();
+
+    int extraFlags = parseSectionFlags(FlagsStr);
+    if (extraFlags < 0)
+      return TokError("unknown flag");
+    Flags |= extraFlags;
+
+    bool Mergeable = Flags & ELF::SHF_MERGE;
+    bool Group = Flags & ELF::SHF_GROUP;
+
+    if (getLexer().isNot(AsmToken::Comma)) {
+      if (Mergeable)
+        return TokError("Mergeable section must specify the type");
+      if (Group)
+        return TokError("Group section must specify the type");
+    } else {
+      Lex();
+      if (getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::At))
+        return TokError("expected '@' or '%' before type");
+
+      Lex();
+      if (getParser().parseIdentifier(TypeName))
+        return TokError("expected identifier in directive");
+
+      if (Mergeable) {
+        if (getLexer().isNot(AsmToken::Comma))
+          return TokError("expected the entry size");
+        Lex();
+        if (getParser().parseAbsoluteExpression(Size))
+          return true;
+        if (Size <= 0)
+          return TokError("entry size must be positive");
+      }
+
+      if (Group) {
+        if (getLexer().isNot(AsmToken::Comma))
+          return TokError("expected group name");
+        Lex();
+        if (getParser().parseIdentifier(GroupName))
+          return true;
+        if (getLexer().is(AsmToken::Comma)) {
+          Lex();
+          StringRef Linkage;
+          if (getParser().parseIdentifier(Linkage))
+            return true;
+          if (Linkage != "comdat")
+            return TokError("Linkage must be 'comdat'");
+        }
+      }
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  unsigned Type = ELF::SHT_PROGBITS;
+
+  if (TypeName.empty()) {
+    if (SectionName.startswith(".note"))
+      Type = ELF::SHT_NOTE;
+    else if (SectionName == ".init_array")
+      Type = ELF::SHT_INIT_ARRAY;
+    else if (SectionName == ".fini_array")
+      Type = ELF::SHT_FINI_ARRAY;
+    else if (SectionName == ".preinit_array")
+      Type = ELF::SHT_PREINIT_ARRAY;
+  } else {
+    if (TypeName == "init_array")
+      Type = ELF::SHT_INIT_ARRAY;
+    else if (TypeName == "fini_array")
+      Type = ELF::SHT_FINI_ARRAY;
+    else if (TypeName == "preinit_array")
+      Type = ELF::SHT_PREINIT_ARRAY;
+    else if (TypeName == "nobits")
+      Type = ELF::SHT_NOBITS;
+    else if (TypeName == "progbits")
+      Type = ELF::SHT_PROGBITS;
+    else if (TypeName == "note")
+      Type = ELF::SHT_NOTE;
+    else if (TypeName == "unwind")
+      Type = ELF::SHT_X86_64_UNWIND;
+    else
+      return TokError("unknown section type");
+  }
+
+  SectionKind Kind = computeSectionKind(Flags);
+  getStreamer().SwitchSection(getContext().getELFSection(SectionName, Type,
+                                                         Flags, Kind, Size,
+                                                         GroupName));
+  return false;
+}
+
+bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) {
+  const MCSection *PreviousSection = getStreamer().getPreviousSection();
+  if (PreviousSection == NULL)
+      return TokError(".previous without corresponding .section");
+  getStreamer().SwitchSection(PreviousSection);
+
+  return false;
+}
+
+/// ParseDirectiveELFType
+///  ::= .type identifier , @attribute
+bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+
+  // Handle the identifier as the key symbol.
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token in '.type' directive");
+  Lex();
+
+  if (getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::At))
+    return TokError("expected '@' or '%' before type");
+  Lex();
+
+  StringRef Type;
+  SMLoc TypeLoc;
+
+  TypeLoc = getLexer().getLoc();
+  if (getParser().parseIdentifier(Type))
+    return TokError("expected symbol type in directive");
+
+  MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Type)
+    .Case("function", MCSA_ELF_TypeFunction)
+    .Case("object", MCSA_ELF_TypeObject)
+    .Case("tls_object", MCSA_ELF_TypeTLS)
+    .Case("common", MCSA_ELF_TypeCommon)
+    .Case("notype", MCSA_ELF_TypeNoType)
+    .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
+    .Case("gnu_indirect_function", MCSA_ELF_TypeIndFunction)
+    .Default(MCSA_Invalid);
+
+  if (Attr == MCSA_Invalid)
+    return Error(TypeLoc, "unsupported attribute in '.type' directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.type' directive");
+
+  Lex();
+
+  getStreamer().EmitSymbolAttribute(Sym, Attr);
+
+  return false;
+}
+
+/// ParseDirectiveIdent
+///  ::= .ident string
+bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) {
+  if (getLexer().isNot(AsmToken::String))
+    return TokError("unexpected token in '.ident' directive");
+
+  StringRef Data = getTok().getIdentifier();
+
+  Lex();
+
+  const MCSection *Comment =
+    getContext().getELFSection(".comment", ELF::SHT_PROGBITS,
+                               ELF::SHF_MERGE |
+                               ELF::SHF_STRINGS,
+                               SectionKind::getReadOnly(),
+                               1, "");
+
+  getStreamer().PushSection();
+  getStreamer().SwitchSection(Comment);
+  if (!SeenIdent) {
+    getStreamer().EmitIntValue(0, 1);
+    SeenIdent = true;
+  }
+  getStreamer().EmitBytes(Data);
+  getStreamer().EmitIntValue(0, 1);
+  getStreamer().PopSection();
+  return false;
+}
+
+/// ParseDirectiveSymver
+///  ::= .symver foo, bar2@zed
+bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("expected a comma");
+
+  Lex();
+
+  StringRef AliasName;
+  if (getParser().parseIdentifier(AliasName))
+    return TokError("expected identifier in directive");
+
+  if (AliasName.find('@') == StringRef::npos)
+    return TokError("expected a '@' in the name");
+
+  MCSymbol *Alias = getContext().GetOrCreateSymbol(AliasName);
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+  const MCExpr *Value = MCSymbolRefExpr::Create(Sym, getContext());
+
+  getStreamer().EmitAssignment(Alias, Value);
+  return false;
+}
+
+/// ParseDirectiveVersion
+///  ::= .version string
+bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) {
+  if (getLexer().isNot(AsmToken::String))
+    return TokError("unexpected token in '.version' directive");
+
+  StringRef Data = getTok().getIdentifier();
+
+  Lex();
+
+  const MCSection *Note =
+    getContext().getELFSection(".note", ELF::SHT_NOTE, 0,
+                               SectionKind::getReadOnly());
+
+  getStreamer().PushSection();
+  getStreamer().SwitchSection(Note);
+  getStreamer().EmitIntValue(Data.size()+1, 4); // namesz.
+  getStreamer().EmitIntValue(0, 4);             // descsz = 0 (no description).
+  getStreamer().EmitIntValue(1, 4);             // type = NT_VERSION.
+  getStreamer().EmitBytes(Data);                // name.
+  getStreamer().EmitIntValue(0, 1);             // terminate the string.
+  getStreamer().EmitValueToAlignment(4);        // ensure 4 byte alignment.
+  getStreamer().PopSection();
+  return false;
+}
+
+/// ParseDirectiveWeakref
+///  ::= .weakref foo, bar
+bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) {
+  // FIXME: Share code with the other alias building directives.
+
+  StringRef AliasName;
+  if (getParser().parseIdentifier(AliasName))
+    return TokError("expected identifier in directive");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("expected a comma");
+
+  Lex();
+
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in directive");
+
+  MCSymbol *Alias = getContext().GetOrCreateSymbol(AliasName);
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  getStreamer().EmitWeakReference(Alias, Sym);
+  return false;
+}
+
+namespace llvm {
+
+MCAsmParserExtension *createELFAsmParser() {
+  return new ELFAsmParser;
+}
+
+}
diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp
new file mode 100644
index 000000000000..3867691107fb
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp
@@ -0,0 +1,32 @@
+//===-- MCAsmLexer.cpp - Abstract Asm Lexer Interface ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/Support/SourceMgr.h"
+
+using namespace llvm;
+
+MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()),
+                           TokStart(0), SkipSpace(true) {
+}
+
+MCAsmLexer::~MCAsmLexer() {
+}
+
+SMLoc MCAsmLexer::getLoc() const {
+  return SMLoc::getFromPointer(TokStart);
+}
+
+SMLoc AsmToken::getLoc() const {
+  return SMLoc::getFromPointer(Str.data());
+}
+
+SMLoc AsmToken::getEndLoc() const {
+  return SMLoc::getFromPointer(Str.data() + Str.size());
+}
diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp
new file mode 100644
index 000000000000..6e1ebad36c0d
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp
@@ -0,0 +1,50 @@
+//===-- MCAsmParser.cpp - Abstract Asm Parser Interface -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+MCAsmParser::MCAsmParser() : TargetParser(0), ShowParsedOperands(0) {
+}
+
+MCAsmParser::~MCAsmParser() {
+}
+
+void MCAsmParser::setTargetParser(MCTargetAsmParser &P) {
+  assert(!TargetParser && "Target parser is already initialized!");
+  TargetParser = &P;
+  TargetParser->Initialize(*this);
+}
+
+const AsmToken &MCAsmParser::getTok() {
+  return getLexer().getTok();
+}
+
+bool MCAsmParser::TokError(const Twine &Msg, ArrayRef<SMRange> Ranges) {
+  Error(getLexer().getLoc(), Msg, Ranges);
+  return true;
+}
+
+bool MCAsmParser::parseExpression(const MCExpr *&Res) {
+  SMLoc L;
+  return parseExpression(Res, L);
+}
+
+void MCParsedAsmOperand::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  dbgs() << "  " << *this;
+#endif
+}
diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
new file mode 100644
index 000000000000..3f25a14926b6
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
@@ -0,0 +1,22 @@
+//===-- MCAsmParserExtension.cpp - Asm Parser Hooks -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+using namespace llvm;
+
+MCAsmParserExtension::MCAsmParserExtension() :
+  BracketExpressionsSupported(false) {
+}
+
+MCAsmParserExtension::~MCAsmParserExtension() {
+}
+
+void MCAsmParserExtension::Initialize(MCAsmParser &Parser) {
+  this->Parser = &Parser;
+}
diff --git a/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
new file mode 100644
index 000000000000..60a3a3b59a3d
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -0,0 +1,19 @@
+//===-- MCTargetAsmParser.cpp - Target Assembly Parser ---------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCTargetAsmParser.h"
+using namespace llvm;
+
+MCTargetAsmParser::MCTargetAsmParser()
+  : AvailableFeatures(0), ParsingInlineAsm(false)
+{
+}
+
+MCTargetAsmParser::~MCTargetAsmParser() {
+}
diff --git a/contrib/llvm/lib/MC/MCPureStreamer.cpp b/contrib/llvm/lib/MC/MCPureStreamer.cpp
new file mode 100644
index 000000000000..0e04c5537acb
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCPureStreamer.cpp
@@ -0,0 +1,242 @@
+//===- lib/MC/MCPureStreamer.cpp - MC "Pure" Object Output ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectStreamer.h"
+// FIXME: Remove this.
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+
+class MCPureStreamer : public MCObjectStreamer {
+private:
+  virtual void EmitInstToFragment(const MCInst &Inst);
+  virtual void EmitInstToData(const MCInst &Inst);
+
+public:
+  MCPureStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+                 MCCodeEmitter *Emitter)
+      : MCObjectStreamer(SK_PureStreamer, Context, TAB, OS, Emitter) {}
+
+  /// @name MCStreamer Interface
+  /// @{
+
+  virtual void InitSections();
+  virtual void InitToTextSection();
+  virtual void EmitLabel(MCSymbol *Symbol);
+  virtual void EmitDebugLabel(MCSymbol *Symbol);
+  virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
+                            uint64_t Size = 0, unsigned ByteAlignment = 0);
+  virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
+  virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
+                                    unsigned ValueSize = 1,
+                                    unsigned MaxBytesToEmit = 0);
+  virtual void EmitCodeAlignment(unsigned ByteAlignment,
+                                 unsigned MaxBytesToEmit = 0);
+  virtual bool EmitValueToOffset(const MCExpr *Offset,
+                                 unsigned char Value = 0);
+  virtual void FinishImpl();
+
+
+  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                              uint64_t Size, unsigned ByteAlignment = 0) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                unsigned ByteAlignment) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitThumbFunc(MCSymbol *Func) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitCOFFSymbolStorageClass(int StorageClass) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitCOFFSymbolType(int Type) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EndCOFFSymbolDef() {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                     unsigned ByteAlignment) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual void EmitFileDirective(StringRef Filename) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+  virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
+                                      StringRef Filename, unsigned CUID = 0) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
+
+  /// @}
+
+  static bool classof(const MCStreamer *S) {
+    return S->getKind() == SK_PureStreamer;
+  }
+};
+
+} // end anonymous namespace.
+
+void MCPureStreamer::InitSections() {
+  InitToTextSection();
+}
+
+void MCPureStreamer::InitToTextSection() {
+  // FIMXE: To what!?
+  SwitchSection(getContext().getMachOSection("__TEXT", "__text",
+                                    MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                                    0, SectionKind::getText()));
+}
+
+void MCPureStreamer::EmitLabel(MCSymbol *Symbol) {
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+  assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
+  assert(getCurrentSection() && "Cannot emit before setting section!");
+
+  Symbol->setSection(*getCurrentSection());
+
+  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+
+  // We have to create a new fragment if this is an atom defining symbol,
+  // fragments cannot span atoms.
+  if (getAssembler().isSymbolLinkerVisible(SD.getSymbol()))
+    new MCDataFragment(getCurrentSectionData());
+
+  // FIXME: This is wasteful, we don't necessarily need to create a data
+  // fragment. Instead, we should mark the symbol as pointing into the data
+  // fragment if it exists, otherwise we should just queue the label and set its
+  // fragment pointer when we emit the next fragment.
+  MCDataFragment *F = getOrCreateDataFragment();
+  assert(!SD.getFragment() && "Unexpected fragment on symbol data!");
+  SD.setFragment(F);
+  SD.setOffset(F->getContents().size());
+}
+
+
+void MCPureStreamer::EmitDebugLabel(MCSymbol *Symbol) {
+  EmitLabel(Symbol);
+}
+
+void MCPureStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+                                  uint64_t Size, unsigned ByteAlignment) {
+  report_fatal_error("not yet implemented in pure streamer");
+}
+
+void MCPureStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
+  // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
+  // MCObjectStreamer.
+  getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
+}
+
+void MCPureStreamer::EmitValueToAlignment(unsigned ByteAlignment,
+                                          int64_t Value, unsigned ValueSize,
+                                          unsigned MaxBytesToEmit) {
+  // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
+  // MCObjectStreamer.
+  if (MaxBytesToEmit == 0)
+    MaxBytesToEmit = ByteAlignment;
+  new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit,
+                      getCurrentSectionData());
+
+  // Update the maximum alignment on the current section if necessary.
+  if (ByteAlignment > getCurrentSectionData()->getAlignment())
+    getCurrentSectionData()->setAlignment(ByteAlignment);
+}
+
+void MCPureStreamer::EmitCodeAlignment(unsigned ByteAlignment,
+                                       unsigned MaxBytesToEmit) {
+  // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
+  // MCObjectStreamer.
+  if (MaxBytesToEmit == 0)
+    MaxBytesToEmit = ByteAlignment;
+  MCAlignFragment *F = new MCAlignFragment(ByteAlignment, 0, 1, MaxBytesToEmit,
+                                           getCurrentSectionData());
+  F->setEmitNops(true);
+
+  // Update the maximum alignment on the current section if necessary.
+  if (ByteAlignment > getCurrentSectionData()->getAlignment())
+    getCurrentSectionData()->setAlignment(ByteAlignment);
+}
+
+bool MCPureStreamer::EmitValueToOffset(const MCExpr *Offset,
+                                       unsigned char Value) {
+  new MCOrgFragment(*Offset, Value, getCurrentSectionData());
+  return false;
+}
+
+void MCPureStreamer::EmitInstToFragment(const MCInst &Inst) {
+  MCRelaxableFragment *IF =
+    new MCRelaxableFragment(Inst, getCurrentSectionData());
+
+  // Add the fixups and data.
+  //
+  // FIXME: Revisit this design decision when relaxation is done, we may be
+  // able to get away with not storing any extra data in the MCInst.
+  SmallVector<MCFixup, 4> Fixups;
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code);
+  getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups);
+  VecOS.flush();
+
+  IF->getContents() = Code;
+  IF->getFixups() = Fixups;
+}
+
+void MCPureStreamer::EmitInstToData(const MCInst &Inst) {
+  MCDataFragment *DF = getOrCreateDataFragment();
+
+  SmallVector<MCFixup, 4> Fixups;
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code);
+  getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups);
+  VecOS.flush();
+
+  // Add the fixups and data.
+  for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
+    Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
+    DF->getFixups().push_back(Fixups[i]);
+  }
+  DF->getContents().append(Code.begin(), Code.end());
+}
+
+void MCPureStreamer::FinishImpl() {
+  // FIXME: Handle DWARF tables?
+
+  this->MCObjectStreamer::FinishImpl();
+}
+
+MCStreamer *llvm::createPureStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_ostream &OS, MCCodeEmitter *CE) {
+  return new MCPureStreamer(Context, MAB, OS, CE);
+}
diff --git a/contrib/llvm/lib/MC/MCRegisterInfo.cpp b/contrib/llvm/lib/MC/MCRegisterInfo.cpp
new file mode 100644
index 000000000000..5c71106c9017
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCRegisterInfo.cpp
@@ -0,0 +1,74 @@
+//=== MC/MCRegisterInfo.cpp - Target Register Description -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements MCRegisterInfo functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCRegisterInfo.h"
+
+using namespace llvm;
+
+unsigned MCRegisterInfo::getMatchingSuperReg(unsigned Reg, unsigned SubIdx,
+                                             const MCRegisterClass *RC) const {
+  for (MCSuperRegIterator Supers(Reg, this); Supers.isValid(); ++Supers)
+    if (RC->contains(*Supers) && Reg == getSubReg(*Supers, SubIdx))
+      return *Supers;
+  return 0;
+}
+
+unsigned MCRegisterInfo::getSubReg(unsigned Reg, unsigned Idx) const {
+  assert(Idx && Idx < getNumSubRegIndices() &&
+         "This is not a subregister index");
+  // Get a pointer to the corresponding SubRegIndices list. This list has the
+  // name of each sub-register in the same order as MCSubRegIterator.
+  const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;
+  for (MCSubRegIterator Subs(Reg, this); Subs.isValid(); ++Subs, ++SRI)
+    if (*SRI == Idx)
+      return *Subs;
+  return 0;
+}
+
+unsigned MCRegisterInfo::getSubRegIndex(unsigned Reg, unsigned SubReg) const {
+  assert(SubReg && SubReg < getNumRegs() && "This is not a register");
+  // Get a pointer to the corresponding SubRegIndices list. This list has the
+  // name of each sub-register in the same order as MCSubRegIterator.
+  const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;
+  for (MCSubRegIterator Subs(Reg, this); Subs.isValid(); ++Subs, ++SRI)
+    if (*Subs == SubReg)
+      return *SRI;
+  return 0;
+}
+
+int MCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  const DwarfLLVMRegPair *M = isEH ? EHL2DwarfRegs : L2DwarfRegs;
+  unsigned Size = isEH ? EHL2DwarfRegsSize : L2DwarfRegsSize;
+
+  DwarfLLVMRegPair Key = { RegNum, 0 };
+  const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key);
+  if (I == M+Size || I->FromReg != RegNum)
+    return -1;
+  return I->ToReg;
+}
+
+int MCRegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
+  const DwarfLLVMRegPair *M = isEH ? EHDwarf2LRegs : Dwarf2LRegs;
+  unsigned Size = isEH ? EHDwarf2LRegsSize : Dwarf2LRegsSize;
+
+  DwarfLLVMRegPair Key = { RegNum, 0 };
+  const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key);
+  assert(I != M+Size && I->FromReg == RegNum && "Invalid RegNum");
+  return I->ToReg;
+}
+
+int MCRegisterInfo::getSEHRegNum(unsigned RegNum) const {
+  const DenseMap<unsigned, int>::const_iterator I = L2SEHRegs.find(RegNum);
+  if (I == L2SEHRegs.end()) return (int)RegNum;
+  return I->second;
+}
diff --git a/contrib/llvm/lib/MC/MCSection.cpp b/contrib/llvm/lib/MC/MCSection.cpp
new file mode 100644
index 000000000000..ccf4a7dddf73
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCSection.cpp
@@ -0,0 +1,22 @@
+//===- lib/MC/MCSection.cpp - Machine Code Section Representation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MCSection
+//===----------------------------------------------------------------------===//
+
+MCSection::~MCSection() {
+}
+
diff --git a/contrib/llvm/lib/MC/MCSectionCOFF.cpp b/contrib/llvm/lib/MC/MCSectionCOFF.cpp
new file mode 100644
index 000000000000..aac93775aebe
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCSectionCOFF.cpp
@@ -0,0 +1,84 @@
+//===- lib/MC/MCSectionCOFF.cpp - COFF Code Section Representation --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+MCSectionCOFF::~MCSectionCOFF() {} // anchor.
+
+// ShouldOmitSectionDirective - Decides whether a '.section' directive
+// should be printed before the section name
+bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name,
+                                               const MCAsmInfo &MAI) const {
+
+  // FIXME: Does .section .bss/.data/.text work everywhere??
+  if (Name == ".text" || Name == ".data" || Name == ".bss")
+    return true;
+
+  return false;
+}
+
+void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
+                                         raw_ostream &OS) const {
+
+  // standard sections don't require the '.section'
+  if (ShouldOmitSectionDirective(SectionName, MAI)) {
+    OS << '\t' << getSectionName() << '\n';
+    return;
+  }
+
+  OS << "\t.section\t" << getSectionName() << ",\"";
+  if (getKind().isText())
+    OS << 'x';
+  if (getKind().isWriteable())
+    OS << 'w';
+  else
+    OS << 'r';
+  if (getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE)
+    OS << 'n';
+  OS << "\"\n";
+
+  if (getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT) {
+    switch (Selection) {
+      case COFF::IMAGE_COMDAT_SELECT_NODUPLICATES:
+        OS << "\t.linkonce one_only\n";
+        break;
+      case COFF::IMAGE_COMDAT_SELECT_ANY:
+        OS << "\t.linkonce discard\n";
+        break;
+      case COFF::IMAGE_COMDAT_SELECT_SAME_SIZE:
+        OS << "\t.linkonce same_size\n";
+        break;
+      case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH:
+        OS << "\t.linkonce same_contents\n";
+        break;
+    //NOTE: as of binutils 2.20, there is no way to specifiy select largest
+    //      with the .linkonce directive. For now, we treat it as an invalid
+    //      comdat selection value.
+      case COFF::IMAGE_COMDAT_SELECT_LARGEST:
+    //  OS << "\t.linkonce largest\n";
+    //  break;
+      default:
+        assert (0 && "unsupported COFF selection type");
+        break;
+    }
+  }
+}
+
+bool MCSectionCOFF::UseCodeAlign() const {
+  return getKind().isText();
+}
+
+bool MCSectionCOFF::isVirtualSection() const {
+  return getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
+}
diff --git a/contrib/llvm/lib/MC/MCSectionELF.cpp b/contrib/llvm/lib/MC/MCSectionELF.cpp
new file mode 100644
index 000000000000..0775cfa776d7
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCSectionELF.cpp
@@ -0,0 +1,150 @@
+//===- lib/MC/MCSectionELF.cpp - ELF Code Section Representation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCSectionELF::~MCSectionELF() {} // anchor.
+
+// ShouldOmitSectionDirective - Decides whether a '.section' directive
+// should be printed before the section name
+bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
+                                              const MCAsmInfo &MAI) const {
+
+  // FIXME: Does .section .bss/.data/.text work everywhere??
+  if (Name == ".text" || Name == ".data" ||
+      (Name == ".bss" && !MAI.usesELFSectionDirectiveForBSS()))
+    return true;
+
+  return false;
+}
+
+void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
+                                        raw_ostream &OS) const {
+
+  if (ShouldOmitSectionDirective(SectionName, MAI)) {
+    OS << '\t' << getSectionName() << '\n';
+    return;
+  }
+
+  StringRef name = getSectionName();
+  if (name.find_first_not_of("0123456789_."
+                             "abcdefghijklmnopqrstuvwxyz"
+                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == name.npos) {
+    OS << "\t.section\t" << name;
+  } else {
+    OS << "\t.section\t\"";
+    for (const char *b = name.begin(), *e = name.end(); b < e; ++b) {
+      if (*b == '"') // Unquoted "
+        OS << "\\\"";
+      else if (*b != '\\') // Neither " or backslash
+        OS << *b;
+      else if (b + 1 == e) // Trailing backslash
+        OS << "\\\\";
+      else {
+        OS << b[0] << b[1]; // Quoted character
+        ++b;
+      }
+    }
+    OS << '"';
+  }
+
+  // Handle the weird solaris syntax if desired.
+  if (MAI.usesSunStyleELFSectionSwitchSyntax() &&
+      !(Flags & ELF::SHF_MERGE)) {
+    if (Flags & ELF::SHF_ALLOC)
+      OS << ",#alloc";
+    if (Flags & ELF::SHF_EXECINSTR)
+      OS << ",#execinstr";
+    if (Flags & ELF::SHF_WRITE)
+      OS << ",#write";
+    if (Flags & ELF::SHF_TLS)
+      OS << ",#tls";
+    OS << '\n';
+    return;
+  }
+
+  OS << ",\"";
+  if (Flags & ELF::SHF_ALLOC)
+    OS << 'a';
+  if (Flags & ELF::SHF_EXECINSTR)
+    OS << 'x';
+  if (Flags & ELF::SHF_GROUP)
+    OS << 'G';
+  if (Flags & ELF::SHF_WRITE)
+    OS << 'w';
+  if (Flags & ELF::SHF_MERGE)
+    OS << 'M';
+  if (Flags & ELF::SHF_STRINGS)
+    OS << 'S';
+  if (Flags & ELF::SHF_TLS)
+    OS << 'T';
+
+  // If there are target-specific flags, print them.
+  if (Flags & ELF::XCORE_SHF_CP_SECTION)
+    OS << 'c';
+  if (Flags & ELF::XCORE_SHF_DP_SECTION)
+    OS << 'd';
+
+  OS << '"';
+
+  OS << ',';
+
+  // If comment string is '@', e.g. as on ARM - use '%' instead
+  if (MAI.getCommentString()[0] == '@')
+    OS << '%';
+  else
+    OS << '@';
+
+  if (Type == ELF::SHT_INIT_ARRAY)
+    OS << "init_array";
+  else if (Type == ELF::SHT_FINI_ARRAY)
+    OS << "fini_array";
+  else if (Type == ELF::SHT_PREINIT_ARRAY)
+    OS << "preinit_array";
+  else if (Type == ELF::SHT_NOBITS)
+    OS << "nobits";
+  else if (Type == ELF::SHT_NOTE)
+    OS << "note";
+  else if (Type == ELF::SHT_PROGBITS)
+    OS << "progbits";
+
+  if (EntrySize) {
+    assert(Flags & ELF::SHF_MERGE);
+    OS << "," << EntrySize;
+  }
+
+  if (Flags & ELF::SHF_GROUP)
+    OS << "," << Group->getName() << ",comdat";
+  OS << '\n';
+}
+
+bool MCSectionELF::UseCodeAlign() const {
+  return getFlags() & ELF::SHF_EXECINSTR;
+}
+
+bool MCSectionELF::isVirtualSection() const {
+  return getType() == ELF::SHT_NOBITS;
+}
+
+unsigned MCSectionELF::DetermineEntrySize(SectionKind Kind) {
+  if (Kind.isMergeable1ByteCString()) return 1;
+  if (Kind.isMergeable2ByteCString()) return 2;
+  if (Kind.isMergeable4ByteCString()) return 4;
+  if (Kind.isMergeableConst4())       return 4;
+  if (Kind.isMergeableConst8())       return 8;
+  if (Kind.isMergeableConst16())      return 16;
+  return 0;
+}
diff --git a/contrib/llvm/lib/MC/MCSectionMachO.cpp b/contrib/llvm/lib/MC/MCSectionMachO.cpp
new file mode 100644
index 000000000000..fc323155befa
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCSectionMachO.cpp
@@ -0,0 +1,303 @@
+//===- lib/MC/MCSectionMachO.cpp - MachO Code Section Representation ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+using namespace llvm;
+
+/// SectionTypeDescriptors - These are strings that describe the various section
+/// types.  This *must* be kept in order with and stay synchronized with the
+/// section type list.
+static const struct {
+  const char *AssemblerName, *EnumName;
+} SectionTypeDescriptors[MCSectionMachO::LAST_KNOWN_SECTION_TYPE+1] = {
+  { "regular",                  "S_REGULAR" },                    // 0x00
+  { 0,                          "S_ZEROFILL" },                   // 0x01
+  { "cstring_literals",         "S_CSTRING_LITERALS" },           // 0x02
+  { "4byte_literals",           "S_4BYTE_LITERALS" },             // 0x03
+  { "8byte_literals",           "S_8BYTE_LITERALS" },             // 0x04
+  { "literal_pointers",         "S_LITERAL_POINTERS" },           // 0x05
+  { "non_lazy_symbol_pointers", "S_NON_LAZY_SYMBOL_POINTERS" },   // 0x06
+  { "lazy_symbol_pointers",     "S_LAZY_SYMBOL_POINTERS" },       // 0x07
+  { "symbol_stubs",             "S_SYMBOL_STUBS" },               // 0x08
+  { "mod_init_funcs",           "S_MOD_INIT_FUNC_POINTERS" },     // 0x09
+  { "mod_term_funcs",           "S_MOD_TERM_FUNC_POINTERS" },     // 0x0A
+  { "coalesced",                "S_COALESCED" },                  // 0x0B
+  { 0, /*FIXME??*/              "S_GB_ZEROFILL" },                // 0x0C
+  { "interposing",              "S_INTERPOSING" },                // 0x0D
+  { "16byte_literals",          "S_16BYTE_LITERALS" },            // 0x0E
+  { 0, /*FIXME??*/              "S_DTRACE_DOF" },                 // 0x0F
+  { 0, /*FIXME??*/              "S_LAZY_DYLIB_SYMBOL_POINTERS" }, // 0x10
+  { "thread_local_regular",     "S_THREAD_LOCAL_REGULAR" },       // 0x11
+  { "thread_local_zerofill",    "S_THREAD_LOCAL_ZEROFILL" },      // 0x12
+  { "thread_local_variables",   "S_THREAD_LOCAL_VARIABLES" },     // 0x13
+  { "thread_local_variable_pointers",
+    "S_THREAD_LOCAL_VARIABLE_POINTERS" },                         // 0x14
+  { "thread_local_init_function_pointers",
+    "S_THREAD_LOCAL_INIT_FUNCTION_POINTERS"},                     // 0x15
+};
+
+
+/// SectionAttrDescriptors - This is an array of descriptors for section
+/// attributes.  Unlike the SectionTypeDescriptors, this is not directly indexed
+/// by attribute, instead it is searched.  The last entry has an AttrFlagEnd
+/// AttrFlag value.
+static const struct {
+  unsigned AttrFlag;
+  const char *AssemblerName, *EnumName;
+} SectionAttrDescriptors[] = {
+#define ENTRY(ASMNAME, ENUM) \
+  { MCSectionMachO::ENUM, ASMNAME, #ENUM },
+ENTRY("pure_instructions",   S_ATTR_PURE_INSTRUCTIONS)
+ENTRY("no_toc",              S_ATTR_NO_TOC)
+ENTRY("strip_static_syms",   S_ATTR_STRIP_STATIC_SYMS)
+ENTRY("no_dead_strip",       S_ATTR_NO_DEAD_STRIP)
+ENTRY("live_support",        S_ATTR_LIVE_SUPPORT)
+ENTRY("self_modifying_code", S_ATTR_SELF_MODIFYING_CODE)
+ENTRY("debug",               S_ATTR_DEBUG)
+ENTRY(0 /*FIXME*/,           S_ATTR_SOME_INSTRUCTIONS)
+ENTRY(0 /*FIXME*/,           S_ATTR_EXT_RELOC)
+ENTRY(0 /*FIXME*/,           S_ATTR_LOC_RELOC)
+#undef ENTRY
+  { 0, "none", 0 }, // used if section has no attributes but has a stub size
+#define AttrFlagEnd 0xffffffff // non legal value, multiple attribute bits set
+  { AttrFlagEnd, 0, 0 }
+};
+
+MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
+                               unsigned TAA, unsigned reserved2, SectionKind K)
+  : MCSection(SV_MachO, K), TypeAndAttributes(TAA), Reserved2(reserved2) {
+  assert(Segment.size() <= 16 && Section.size() <= 16 &&
+         "Segment or section string too long");
+  for (unsigned i = 0; i != 16; ++i) {
+    if (i < Segment.size())
+      SegmentName[i] = Segment[i];
+    else
+      SegmentName[i] = 0;
+
+    if (i < Section.size())
+      SectionName[i] = Section[i];
+    else
+      SectionName[i] = 0;
+  }
+}
+
+void MCSectionMachO::PrintSwitchToSection(const MCAsmInfo &MAI,
+                                          raw_ostream &OS) const {
+  OS << "\t.section\t" << getSegmentName() << ',' << getSectionName();
+
+  // Get the section type and attributes.
+  unsigned TAA = getTypeAndAttributes();
+  if (TAA == 0) {
+    OS << '\n';
+    return;
+  }
+
+  unsigned SectionType = TAA & MCSectionMachO::SECTION_TYPE;
+  assert(SectionType <= MCSectionMachO::LAST_KNOWN_SECTION_TYPE &&
+         "Invalid SectionType specified!");
+
+  if (SectionTypeDescriptors[SectionType].AssemblerName) {
+    OS << ',';
+    OS << SectionTypeDescriptors[SectionType].AssemblerName;
+  } else {
+    // If we have no name for the attribute, stop here.
+    OS << '\n';
+    return;
+  }
+
+  // If we don't have any attributes, we're done.
+  unsigned SectionAttrs = TAA & MCSectionMachO::SECTION_ATTRIBUTES;
+  if (SectionAttrs == 0) {
+    // If we have a S_SYMBOL_STUBS size specified, print it along with 'none' as
+    // the attribute specifier.
+    if (Reserved2 != 0)
+      OS << ",none," << Reserved2;
+    OS << '\n';
+    return;
+  }
+
+  // Check each attribute to see if we have it.
+  char Separator = ',';
+  for (unsigned i = 0;
+       SectionAttrs != 0 && SectionAttrDescriptors[i].AttrFlag;
+       ++i) {
+    // Check to see if we have this attribute.
+    if ((SectionAttrDescriptors[i].AttrFlag & SectionAttrs) == 0)
+      continue;
+
+    // Yep, clear it and print it.
+    SectionAttrs &= ~SectionAttrDescriptors[i].AttrFlag;
+
+    OS << Separator;
+    if (SectionAttrDescriptors[i].AssemblerName)
+      OS << SectionAttrDescriptors[i].AssemblerName;
+    else
+      OS << "<<" << SectionAttrDescriptors[i].EnumName << ">>";
+    Separator = '+';
+  }
+
+  assert(SectionAttrs == 0 && "Unknown section attributes!");
+
+  // If we have a S_SYMBOL_STUBS size specified, print it.
+  if (Reserved2 != 0)
+    OS << ',' << Reserved2;
+  OS << '\n';
+}
+
+bool MCSectionMachO::UseCodeAlign() const {
+  return hasAttribute(MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS);
+}
+
+bool MCSectionMachO::isVirtualSection() const {
+  return (getType() == MCSectionMachO::S_ZEROFILL ||
+          getType() == MCSectionMachO::S_GB_ZEROFILL ||
+          getType() == MCSectionMachO::S_THREAD_LOCAL_ZEROFILL);
+}
+
+/// StripSpaces - This removes leading and trailing spaces from the StringRef.
+static void StripSpaces(StringRef &Str) {
+  while (!Str.empty() && isspace(static_cast<unsigned char>(Str[0])))
+    Str = Str.substr(1);
+  while (!Str.empty() && isspace(static_cast<unsigned char>(Str.back())))
+    Str = Str.substr(0, Str.size()-1);
+}
+
+/// ParseSectionSpecifier - Parse the section specifier indicated by "Spec".
+/// This is a string that can appear after a .section directive in a mach-o
+/// flavored .s file.  If successful, this fills in the specified Out
+/// parameters and returns an empty string.  When an invalid section
+/// specifier is present, this returns a string indicating the problem.
+std::string MCSectionMachO::ParseSectionSpecifier(StringRef Spec,        // In.
+                                                  StringRef &Segment,    // Out.
+                                                  StringRef &Section,    // Out.
+                                                  unsigned  &TAA,        // Out.
+                                                  bool      &TAAParsed,  // Out.
+                                                  unsigned  &StubSize) { // Out.
+  TAAParsed = false;
+  // Find the first comma.
+  std::pair<StringRef, StringRef> Comma = Spec.split(',');
+
+  // If there is no comma, we fail.
+  if (Comma.second.empty())
+    return "mach-o section specifier requires a segment and section "
+           "separated by a comma";
+
+  // Capture segment, remove leading and trailing whitespace.
+  Segment = Comma.first;
+  StripSpaces(Segment);
+
+  // Verify that the segment is present and not too long.
+  if (Segment.empty() || Segment.size() > 16)
+    return "mach-o section specifier requires a segment whose length is "
+           "between 1 and 16 characters";
+
+  // Split the section name off from any attributes if present.
+  Comma = Comma.second.split(',');
+
+  // Capture section, remove leading and trailing whitespace.
+  Section = Comma.first;
+  StripSpaces(Section);
+
+  // Verify that the section is present and not too long.
+  if (Section.empty() || Section.size() > 16)
+    return "mach-o section specifier requires a section whose length is "
+           "between 1 and 16 characters";
+
+  // If there is no comma after the section, we're done.
+  TAA = 0;
+  StubSize = 0;
+  if (Comma.second.empty())
+    return "";
+
+  // Otherwise, we need to parse the section type and attributes.
+  Comma = Comma.second.split(',');
+
+  // Get the section type.
+  StringRef SectionType = Comma.first;
+  StripSpaces(SectionType);
+
+  // Figure out which section type it is.
+  unsigned TypeID;
+  for (TypeID = 0; TypeID !=MCSectionMachO::LAST_KNOWN_SECTION_TYPE+1; ++TypeID)
+    if (SectionTypeDescriptors[TypeID].AssemblerName &&
+        SectionType == SectionTypeDescriptors[TypeID].AssemblerName)
+      break;
+
+  // If we didn't find the section type, reject it.
+  if (TypeID > MCSectionMachO::LAST_KNOWN_SECTION_TYPE)
+    return "mach-o section specifier uses an unknown section type";
+
+  // Remember the TypeID.
+  TAA = TypeID;
+  TAAParsed = true;
+
+  // If we have no comma after the section type, there are no attributes.
+  if (Comma.second.empty()) {
+    // S_SYMBOL_STUBS always require a symbol stub size specifier.
+    if (TAA == MCSectionMachO::S_SYMBOL_STUBS)
+      return "mach-o section specifier of type 'symbol_stubs' requires a size "
+             "specifier";
+    return "";
+  }
+
+  // Otherwise, we do have some attributes.  Split off the size specifier if
+  // present.
+  Comma = Comma.second.split(',');
+  StringRef Attrs = Comma.first;
+
+  // The attribute list is a '+' separated list of attributes.
+  std::pair<StringRef, StringRef> Plus = Attrs.split('+');
+
+  while (1) {
+    StringRef Attr = Plus.first;
+    StripSpaces(Attr);
+
+    // Look up the attribute.
+    for (unsigned i = 0; ; ++i) {
+      if (SectionAttrDescriptors[i].AttrFlag == AttrFlagEnd)
+        return "mach-o section specifier has invalid attribute";
+
+      if (SectionAttrDescriptors[i].AssemblerName &&
+          Attr == SectionAttrDescriptors[i].AssemblerName) {
+        TAA |= SectionAttrDescriptors[i].AttrFlag;
+        break;
+      }
+    }
+
+    if (Plus.second.empty()) break;
+    Plus = Plus.second.split('+');
+  };
+
+  // Okay, we've parsed the section attributes, see if we have a stub size spec.
+  if (Comma.second.empty()) {
+    // S_SYMBOL_STUBS always require a symbol stub size specifier.
+    if (TAA == MCSectionMachO::S_SYMBOL_STUBS)
+      return "mach-o section specifier of type 'symbol_stubs' requires a size "
+      "specifier";
+    return "";
+  }
+
+  // If we have a stub size spec, we must have a sectiontype of S_SYMBOL_STUBS.
+  if ((TAA & MCSectionMachO::SECTION_TYPE) != MCSectionMachO::S_SYMBOL_STUBS)
+    return "mach-o section specifier cannot have a stub size specified because "
+           "it does not have type 'symbol_stubs'";
+
+  // Okay, if we do, it must be a number.
+  StringRef StubSizeStr = Comma.second;
+  StripSpaces(StubSizeStr);
+
+  // Convert the stub size from a string to an integer.
+  if (StubSizeStr.getAsInteger(0, StubSize))
+    return "mach-o section specifier has a malformed stub size";
+
+  return "";
+}
diff --git a/contrib/llvm/lib/MC/MCStreamer.cpp b/contrib/llvm/lib/MC/MCStreamer.cpp
new file mode 100644
index 000000000000..d02e5535bde5
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCStreamer.cpp
@@ -0,0 +1,627 @@
+//===- lib/MC/MCStreamer.cpp - Streaming Machine Code Output --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+MCStreamer::MCStreamer(StreamerKind Kind, MCContext &Ctx)
+    : Kind(Kind), Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false),
+      CurrentW64UnwindInfo(0), LastSymbol(0), AutoInitSections(false) {
+  const MCSection *section = 0;
+  SectionStack.push_back(std::make_pair(section, section));
+}
+
+MCStreamer::~MCStreamer() {
+  for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i)
+    delete W64UnwindInfos[i];
+}
+
+void MCStreamer::reset() {
+  for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i)
+    delete W64UnwindInfos[i];
+  EmitEHFrame = true;
+  EmitDebugFrame = false;
+  CurrentW64UnwindInfo = 0;
+  LastSymbol = 0;
+  const MCSection *section = 0;
+  SectionStack.clear();
+  SectionStack.push_back(std::make_pair(section, section));
+}
+
+const MCExpr *MCStreamer::BuildSymbolDiff(MCContext &Context,
+                                          const MCSymbol *A,
+                                          const MCSymbol *B) {
+  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+  const MCExpr *ARef =
+    MCSymbolRefExpr::Create(A, Variant, Context);
+  const MCExpr *BRef =
+    MCSymbolRefExpr::Create(B, Variant, Context);
+  const MCExpr *AddrDelta =
+    MCBinaryExpr::Create(MCBinaryExpr::Sub, ARef, BRef, Context);
+  return AddrDelta;
+}
+
+const MCExpr *MCStreamer::ForceExpAbs(const MCExpr* Expr) {
+  if (Context.getAsmInfo().hasAggressiveSymbolFolding() ||
+      isa<MCSymbolRefExpr>(Expr))
+    return Expr;
+
+  MCSymbol *ABS = Context.CreateTempSymbol();
+  EmitAssignment(ABS, Expr);
+  return MCSymbolRefExpr::Create(ABS, Context);
+}
+
+raw_ostream &MCStreamer::GetCommentOS() {
+  // By default, discard comments.
+  return nulls();
+}
+
+void MCStreamer::EmitDwarfSetLineAddr(int64_t LineDelta,
+                                      const MCSymbol *Label, int PointerSize) {
+  // emit the sequence to set the address
+  EmitIntValue(dwarf::DW_LNS_extended_op, 1);
+  EmitULEB128IntValue(PointerSize + 1);
+  EmitIntValue(dwarf::DW_LNE_set_address, 1);
+  EmitSymbolValue(Label, PointerSize);
+
+  // emit the sequence for the LineDelta (from 1) and a zero address delta.
+  MCDwarfLineAddr::Emit(this, LineDelta, 0);
+}
+
+/// EmitIntValue - Special case of EmitValue that avoids the client having to
+/// pass in a MCExpr for constant integers.
+void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size,
+                              unsigned AddrSpace) {
+  assert(Size <= 8 && "Invalid size");
+  assert((isUIntN(8 * Size, Value) || isIntN(8 * Size, Value)) &&
+         "Invalid size");
+  char buf[8];
+  const bool isLittleEndian = Context.getAsmInfo().isLittleEndian();
+  for (unsigned i = 0; i != Size; ++i) {
+    unsigned index = isLittleEndian ? i : (Size - i - 1);
+    buf[i] = uint8_t(Value >> (index * 8));
+  }
+  EmitBytes(StringRef(buf, Size), AddrSpace);
+}
+
+/// EmitULEB128Value - Special case of EmitULEB128Value that avoids the
+/// client having to pass in a MCExpr for constant integers.
+void MCStreamer::EmitULEB128IntValue(uint64_t Value, unsigned Padding,
+                                     unsigned AddrSpace) {
+  SmallString<128> Tmp;
+  raw_svector_ostream OSE(Tmp);
+  encodeULEB128(Value, OSE, Padding);
+  EmitBytes(OSE.str(), AddrSpace);
+}
+
+/// EmitSLEB128Value - Special case of EmitSLEB128Value that avoids the
+/// client having to pass in a MCExpr for constant integers.
+void MCStreamer::EmitSLEB128IntValue(int64_t Value, unsigned AddrSpace) {
+  SmallString<128> Tmp;
+  raw_svector_ostream OSE(Tmp);
+  encodeSLEB128(Value, OSE);
+  EmitBytes(OSE.str(), AddrSpace);
+}
+
+void MCStreamer::EmitAbsValue(const MCExpr *Value, unsigned Size,
+                              unsigned AddrSpace) {
+  const MCExpr *ABS = ForceExpAbs(Value);
+  EmitValue(ABS, Size, AddrSpace);
+}
+
+
+void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size,
+                           unsigned AddrSpace) {
+  EmitValueImpl(Value, Size, AddrSpace);
+}
+
+void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
+                                  unsigned AddrSpace) {
+  EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size,
+                AddrSpace);
+}
+
+void MCStreamer::EmitGPRel64Value(const MCExpr *Value) {
+  report_fatal_error("unsupported directive in streamer");
+}
+
+void MCStreamer::EmitGPRel32Value(const MCExpr *Value) {
+  report_fatal_error("unsupported directive in streamer");
+}
+
+/// EmitFill - Emit NumBytes bytes worth of the value specified by
+/// FillValue.  This implements directives such as '.space'.
+void MCStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue,
+                          unsigned AddrSpace) {
+  const MCExpr *E = MCConstantExpr::Create(FillValue, getContext());
+  for (uint64_t i = 0, e = NumBytes; i != e; ++i)
+    EmitValue(E, 1, AddrSpace);
+}
+
+bool MCStreamer::EmitDwarfFileDirective(unsigned FileNo,
+                                        StringRef Directory,
+                                        StringRef Filename, unsigned CUID) {
+  return getContext().GetDwarfFile(Directory, Filename, FileNo, CUID) == 0;
+}
+
+void MCStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
+                                       unsigned Column, unsigned Flags,
+                                       unsigned Isa,
+                                       unsigned Discriminator,
+                                       StringRef FileName) {
+  getContext().setCurrentDwarfLoc(FileNo, Line, Column, Flags, Isa,
+                                  Discriminator);
+}
+
+MCDwarfFrameInfo *MCStreamer::getCurrentFrameInfo() {
+  if (FrameInfos.empty())
+    return 0;
+  return &FrameInfos.back();
+}
+
+void MCStreamer::EnsureValidFrame() {
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  if (!CurFrame || CurFrame->End)
+    report_fatal_error("No open frame");
+}
+
+void MCStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
+                                     MCSymbol *EHSymbol) {
+}
+
+void MCStreamer::EmitLabel(MCSymbol *Symbol) {
+  assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
+  assert(getCurrentSection() && "Cannot emit before setting section!");
+  Symbol->setSection(*getCurrentSection());
+  LastSymbol = Symbol;
+}
+
+void MCStreamer::EmitDebugLabel(MCSymbol *Symbol) {
+  assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
+  assert(getCurrentSection() && "Cannot emit before setting section!");
+  Symbol->setSection(*getCurrentSection());
+  LastSymbol = Symbol;
+}
+
+void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) {
+  EnsureValidFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->CompactUnwindEncoding = CompactUnwindEncoding;
+}
+
+void MCStreamer::EmitCFISections(bool EH, bool Debug) {
+  assert(EH || Debug);
+  EmitEHFrame = EH;
+  EmitDebugFrame = Debug;
+}
+
+void MCStreamer::EmitCFIStartProc() {
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  if (CurFrame && !CurFrame->End)
+    report_fatal_error("Starting a frame before finishing the previous one!");
+
+  MCDwarfFrameInfo Frame;
+  EmitCFIStartProcImpl(Frame);
+
+  FrameInfos.push_back(Frame);
+}
+
+void MCStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+}
+
+void MCStreamer::RecordProcStart(MCDwarfFrameInfo &Frame) {
+  Frame.Function = LastSymbol;
+  // If the function is externally visible, we need to create a local
+  // symbol to avoid relocations.
+  StringRef Prefix = getContext().getAsmInfo().getPrivateGlobalPrefix();
+  if (LastSymbol && LastSymbol->getName().startswith(Prefix)) {
+    Frame.Begin = LastSymbol;
+  } else {
+    Frame.Begin = getContext().CreateTempSymbol();
+    EmitLabel(Frame.Begin);
+  }
+}
+
+void MCStreamer::EmitCFIEndProc() {
+  EnsureValidFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  EmitCFIEndProcImpl(*CurFrame);
+}
+
+void MCStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+}
+
+void MCStreamer::RecordProcEnd(MCDwarfFrameInfo &Frame) {
+  Frame.End = getContext().CreateTempSymbol();
+  EmitLabel(Frame.End);
+}
+
+MCSymbol *MCStreamer::EmitCFICommon() {
+  EnsureValidFrame();
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  EmitLabel(Label);
+  return Label;
+}
+
+void MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createDefCfa(Label, Register, Offset);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createDefCfaOffset(Label, Offset);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createAdjustCfaOffset(Label, Adjustment);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIDefCfaRegister(int64_t Register) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createDefCfaRegister(Label, Register);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createOffset(Label, Register, Offset);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createRelOffset(Label, Register, Offset);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIPersonality(const MCSymbol *Sym,
+                                    unsigned Encoding) {
+  EnsureValidFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Personality = Sym;
+  CurFrame->PersonalityEncoding = Encoding;
+}
+
+void MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
+  EnsureValidFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Lsda = Sym;
+  CurFrame->LsdaEncoding = Encoding;
+}
+
+void MCStreamer::EmitCFIRememberState() {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction = MCCFIInstruction::createRememberState(Label);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIRestoreState() {
+  // FIXME: Error if there is no matching cfi_remember_state.
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction = MCCFIInstruction::createRestoreState(Label);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFISameValue(int64_t Register) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createSameValue(Label, Register);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIRestore(int64_t Register) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createRestore(Label, Register);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIEscape(StringRef Values) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction = MCCFIInstruction::createEscape(Label, Values);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFISignalFrame() {
+  EnsureValidFrame();
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->IsSignalFrame = true;
+}
+
+void MCStreamer::EmitCFIUndefined(int64_t Register) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createUndefined(Label, Register);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createRegister(Label, Register1, Register2);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
+void MCStreamer::setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame) {
+  W64UnwindInfos.push_back(Frame);
+  CurrentW64UnwindInfo = W64UnwindInfos.back();
+}
+
+void MCStreamer::EnsureValidW64UnwindInfo() {
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (!CurFrame || CurFrame->End)
+    report_fatal_error("No open Win64 EH frame function!");
+}
+
+void MCStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame && !CurFrame->End)
+    report_fatal_error("Starting a function before ending the previous one!");
+  MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo;
+  Frame->Begin = getContext().CreateTempSymbol();
+  Frame->Function = Symbol;
+  EmitLabel(Frame->Begin);
+  setCurrentW64UnwindInfo(Frame);
+}
+
+void MCStreamer::EmitWin64EHEndProc() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->ChainedParent)
+    report_fatal_error("Not all chained regions terminated!");
+  CurFrame->End = getContext().CreateTempSymbol();
+  EmitLabel(CurFrame->End);
+}
+
+void MCStreamer::EmitWin64EHStartChained() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *Frame = new MCWin64EHUnwindInfo;
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  Frame->Begin = getContext().CreateTempSymbol();
+  Frame->Function = CurFrame->Function;
+  Frame->ChainedParent = CurFrame;
+  EmitLabel(Frame->Begin);
+  setCurrentW64UnwindInfo(Frame);
+}
+
+void MCStreamer::EmitWin64EHEndChained() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (!CurFrame->ChainedParent)
+    report_fatal_error("End of a chained region outside a chained region!");
+  CurFrame->End = getContext().CreateTempSymbol();
+  EmitLabel(CurFrame->End);
+  CurrentW64UnwindInfo = CurFrame->ChainedParent;
+}
+
+void MCStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
+                                    bool Except) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->ChainedParent)
+    report_fatal_error("Chained unwind areas can't have handlers!");
+  CurFrame->ExceptionHandler = Sym;
+  if (!Except && !Unwind)
+    report_fatal_error("Don't know what kind of handler this is!");
+  if (Unwind)
+    CurFrame->HandlesUnwind = true;
+  if (Except)
+    CurFrame->HandlesExceptions = true;
+}
+
+void MCStreamer::EmitWin64EHHandlerData() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->ChainedParent)
+    report_fatal_error("Chained unwind areas can't have handlers!");
+}
+
+void MCStreamer::EmitWin64EHPushReg(unsigned Register) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Win64EH::UOP_PushNonVol, Label, Register);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->LastFrameInst >= 0)
+    report_fatal_error("Frame register and offset already specified!");
+  if (Offset & 0x0F)
+    report_fatal_error("Misaligned frame pointer offset!");
+  MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, 0, Register, Offset);
+  CurFrame->LastFrameInst = CurFrame->Instructions.size();
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHAllocStack(unsigned Size) {
+  EnsureValidW64UnwindInfo();
+  if (Size & 7)
+    report_fatal_error("Misaligned stack allocation!");
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Label, Size);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHSaveReg(unsigned Register, unsigned Offset) {
+  EnsureValidW64UnwindInfo();
+  if (Offset & 7)
+    report_fatal_error("Misaligned saved register offset!");
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(
+     Offset > 512*1024-8 ? Win64EH::UOP_SaveNonVolBig : Win64EH::UOP_SaveNonVol,
+                            Label, Register, Offset);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHSaveXMM(unsigned Register, unsigned Offset) {
+  EnsureValidW64UnwindInfo();
+  if (Offset & 0x0F)
+    report_fatal_error("Misaligned saved vector register offset!");
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(
+    Offset > 512*1024-16 ? Win64EH::UOP_SaveXMM128Big : Win64EH::UOP_SaveXMM128,
+                            Label, Register, Offset);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHPushFrame(bool Code) {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  if (CurFrame->Instructions.size() > 0)
+    report_fatal_error("If present, PushMachFrame must be the first UOP");
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Win64EH::UOP_PushMachFrame, Label, Code);
+  EmitLabel(Label);
+  CurFrame->Instructions.push_back(Inst);
+}
+
+void MCStreamer::EmitWin64EHEndProlog() {
+  EnsureValidW64UnwindInfo();
+  MCWin64EHUnwindInfo *CurFrame = CurrentW64UnwindInfo;
+  CurFrame->PrologEnd = getContext().CreateTempSymbol();
+  EmitLabel(CurFrame->PrologEnd);
+}
+
+void MCStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) {
+  llvm_unreachable("This file format doesn't support this directive");
+}
+
+void MCStreamer::EmitFnStart() {
+  errs() << "Not implemented yet\n";
+  abort();
+}
+
+void MCStreamer::EmitFnEnd() {
+  errs() << "Not implemented yet\n";
+  abort();
+}
+
+void MCStreamer::EmitCantUnwind() {
+  errs() << "Not implemented yet\n";
+  abort();
+}
+
+void MCStreamer::EmitHandlerData() {
+  errs() << "Not implemented yet\n";
+  abort();
+}
+
+void MCStreamer::EmitPersonality(const MCSymbol *Personality) {
+  errs() << "Not implemented yet\n";
+  abort();
+}
+
+void MCStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
+  errs() << "Not implemented yet\n";
+  abort();
+}
+
+void MCStreamer::EmitPad(int64_t Offset) {
+  errs() << "Not implemented yet\n";
+  abort();
+}
+
+void MCStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool) {
+  errs() << "Not implemented yet\n";
+  abort();
+}
+
+void MCStreamer::EmitTCEntry(const MCSymbol &S) {
+  llvm_unreachable("Unsupported method");
+}
+
+/// EmitRawText - If this file is backed by an assembly streamer, this dumps
+/// the specified string in the output .s file.  This capability is
+/// indicated by the hasRawTextSupport() predicate.
+void MCStreamer::EmitRawText(StringRef String) {
+  errs() << "EmitRawText called on an MCStreamer that doesn't support it, "
+  " something must not be fully mc'ized\n";
+  abort();
+}
+
+void MCStreamer::EmitRawText(const Twine &T) {
+  SmallString<128> Str;
+  T.toVector(Str);
+  EmitRawText(Str.str());
+}
+
+void MCStreamer::EmitFrames(bool usingCFI) {
+  if (!getNumFrameInfos())
+    return;
+
+  if (EmitEHFrame)
+    MCDwarfFrameEmitter::Emit(*this, usingCFI, true);
+
+  if (EmitDebugFrame)
+    MCDwarfFrameEmitter::Emit(*this, usingCFI, false);
+}
+
+void MCStreamer::EmitW64Tables() {
+  if (!getNumW64UnwindInfos())
+    return;
+
+  MCWin64EHUnwindEmitter::Emit(*this);
+}
+
+void MCStreamer::Finish() {
+  if (!FrameInfos.empty() && !FrameInfos.back().End)
+    report_fatal_error("Unfinished frame!");
+
+  FinishImpl();
+}
+
+MCSymbolData &MCStreamer::getOrCreateSymbolData(MCSymbol *Symbol) {
+  report_fatal_error("Not supported!");
+  return *(static_cast<MCSymbolData*>(0));
+}
diff --git a/contrib/llvm/lib/MC/MCSubtargetInfo.cpp b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp
new file mode 100644
index 000000000000..f18828dd41ef
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp
@@ -0,0 +1,118 @@
+//===-- MCSubtargetInfo.cpp - Subtarget Information -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+using namespace llvm;
+
+MCSchedModel MCSchedModel::DefaultSchedModel; // For unknown processors.
+
+/// InitMCProcessorInfo - Set or change the CPU (optionally supplemented
+/// with feature string). Recompute feature bits and scheduling model.
+void
+MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) {
+  SubtargetFeatures Features(FS);
+  FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
+                                        ProcFeatures, NumFeatures);
+
+  if (!CPU.empty())
+    CPUSchedModel = getSchedModelForCPU(CPU);
+  else
+    CPUSchedModel = &MCSchedModel::DefaultSchedModel;
+}
+
+void
+MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
+                                     const SubtargetFeatureKV *PF,
+                                     const SubtargetFeatureKV *PD,
+                                     const SubtargetInfoKV *ProcSched,
+                                     const MCWriteProcResEntry *WPR,
+                                     const MCWriteLatencyEntry *WL,
+                                     const MCReadAdvanceEntry *RA,
+                                     const InstrStage *IS,
+                                     const unsigned *OC,
+                                     const unsigned *FP,
+                                     unsigned NF, unsigned NP) {
+  TargetTriple = TT;
+  ProcFeatures = PF;
+  ProcDesc = PD;
+  ProcSchedModels = ProcSched;
+  WriteProcResTable = WPR;
+  WriteLatencyTable = WL;
+  ReadAdvanceTable = RA;
+
+  Stages = IS;
+  OperandCycles = OC;
+  ForwardingPaths = FP;
+  NumFeatures = NF;
+  NumProcs = NP;
+
+  InitMCProcessorInfo(CPU, FS);
+}
+
+/// ToggleFeature - Toggle a feature and returns the re-computed feature
+/// bits. This version does not change the implied bits.
+uint64_t MCSubtargetInfo::ToggleFeature(uint64_t FB) {
+  FeatureBits ^= FB;
+  return FeatureBits;
+}
+
+/// ToggleFeature - Toggle a feature and returns the re-computed feature
+/// bits. This version will also change all implied bits.
+uint64_t MCSubtargetInfo::ToggleFeature(StringRef FS) {
+  SubtargetFeatures Features;
+  FeatureBits = Features.ToggleFeature(FeatureBits, FS,
+                                       ProcFeatures, NumFeatures);
+  return FeatureBits;
+}
+
+
+const MCSchedModel *
+MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
+  assert(ProcSchedModels && "Processor machine model not available!");
+
+#ifndef NDEBUG
+  for (size_t i = 1; i < NumProcs; i++) {
+    assert(strcmp(ProcSchedModels[i - 1].Key, ProcSchedModels[i].Key) < 0 &&
+           "Processor machine model table is not sorted");
+  }
+#endif
+
+  // Find entry
+  SubtargetInfoKV KV;
+  KV.Key = CPU.data();
+  const SubtargetInfoKV *Found =
+    std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, KV);
+  if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) {
+    errs() << "'" << CPU
+           << "' is not a recognized processor for this target"
+           << " (ignoring processor)\n";
+    return &MCSchedModel::DefaultSchedModel;
+  }
+  assert(Found->Value && "Missing processor SchedModel value");
+  return (const MCSchedModel *)Found->Value;
+}
+
+InstrItineraryData
+MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
+  const MCSchedModel *SchedModel = getSchedModelForCPU(CPU);
+  return InstrItineraryData(SchedModel, Stages, OperandCycles, ForwardingPaths);
+}
+
+/// Initialize an InstrItineraryData instance.
+void MCSubtargetInfo::initInstrItins(InstrItineraryData &InstrItins) const {
+  InstrItins =
+    InstrItineraryData(CPUSchedModel, Stages, OperandCycles, ForwardingPaths);
+}
diff --git a/contrib/llvm/lib/MC/MCSymbol.cpp b/contrib/llvm/lib/MC/MCSymbol.cpp
new file mode 100644
index 000000000000..b973c57f7b81
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCSymbol.cpp
@@ -0,0 +1,83 @@
+//===- lib/MC/MCSymbol.cpp - MCSymbol implementation ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Sentinel value for the absolute pseudo section.
+const MCSection *MCSymbol::AbsolutePseudoSection =
+  reinterpret_cast<const MCSection *>(1);
+
+static bool isAcceptableChar(char C) {
+  if ((C < 'a' || C > 'z') &&
+      (C < 'A' || C > 'Z') &&
+      (C < '0' || C > '9') &&
+      C != '_' && C != '$' && C != '.' && C != '@')
+    return false;
+  return true;
+}
+
+/// NameNeedsQuoting - Return true if the identifier \p Str needs quotes to be
+/// syntactically correct.
+static bool NameNeedsQuoting(StringRef Str) {
+  assert(!Str.empty() && "Cannot create an empty MCSymbol");
+
+  // If any of the characters in the string is an unacceptable character, force
+  // quotes.
+  for (unsigned i = 0, e = Str.size(); i != e; ++i)
+    if (!isAcceptableChar(Str[i]))
+      return true;
+  return false;
+}
+
+const MCSymbol &MCSymbol::AliasedSymbol() const {
+  const MCSymbol *S = this;
+  while (S->isVariable()) {
+    const MCExpr *Value = S->getVariableValue();
+    if (Value->getKind() != MCExpr::SymbolRef)
+      return *S;
+    const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Value);
+    S = &Ref->getSymbol();
+  }
+  return *S;
+}
+
+void MCSymbol::setVariableValue(const MCExpr *Value) {
+  assert(!IsUsed && "Cannot set a variable that has already been used.");
+  assert(Value && "Invalid variable value!");
+  this->Value = Value;
+
+  // Variables should always be marked as in the same "section" as the value.
+  const MCSection *Section = Value->FindAssociatedSection();
+  if (Section)
+    setSection(*Section);
+  else
+    setUndefined();
+}
+
+void MCSymbol::print(raw_ostream &OS) const {
+  // The name for this MCSymbol is required to be a valid target name.  However,
+  // some targets support quoting names with funny characters.  If the name
+  // contains a funny character, then print it quoted.
+  if (!NameNeedsQuoting(getName())) {
+    OS << getName();
+    return;
+  }
+
+  OS << '"' << getName() << '"';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCSymbol::dump() const {
+  print(dbgs());
+}
+#endif
diff --git a/contrib/llvm/lib/MC/MCValue.cpp b/contrib/llvm/lib/MC/MCValue.cpp
new file mode 100644
index 000000000000..4393777211e8
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCValue.cpp
@@ -0,0 +1,38 @@
+//===- lib/MC/MCValue.cpp - MCValue implementation ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void MCValue::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  if (isAbsolute()) {
+    OS << getConstant();
+    return;
+  }
+
+  getSymA()->print(OS);
+
+  if (getSymB()) {
+    OS << " - ";
+    getSymB()->print(OS);
+  }
+
+  if (getConstant())
+    OS << " + " << getConstant();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MCValue::dump() const {
+  print(dbgs(), 0);
+}
+#endif
diff --git a/contrib/llvm/lib/MC/MCWin64EH.cpp b/contrib/llvm/lib/MC/MCWin64EH.cpp
new file mode 100644
index 000000000000..c5b637c92443
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCWin64EH.cpp
@@ -0,0 +1,278 @@
+//===- lib/MC/MCWin64EH.cpp - MCWin64EH implementation --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+
+namespace llvm {
+
+// NOTE: All relocations generated here are 4-byte image-relative.
+
+static uint8_t CountOfUnwindCodes(std::vector<MCWin64EHInstruction> &instArray){
+  uint8_t count = 0;
+  for (std::vector<MCWin64EHInstruction>::const_iterator I = instArray.begin(),
+       E = instArray.end(); I != E; ++I) {
+    switch (I->getOperation()) {
+    case Win64EH::UOP_PushNonVol:
+    case Win64EH::UOP_AllocSmall:
+    case Win64EH::UOP_SetFPReg:
+    case Win64EH::UOP_PushMachFrame:
+      count += 1;
+      break;
+    case Win64EH::UOP_SaveNonVol:
+    case Win64EH::UOP_SaveXMM128:
+      count += 2;
+      break;
+    case Win64EH::UOP_SaveNonVolBig:
+    case Win64EH::UOP_SaveXMM128Big:
+      count += 3;
+      break;
+    case Win64EH::UOP_AllocLarge:
+      if (I->getSize() > 512*1024-8)
+        count += 3;
+      else
+        count += 2;
+      break;
+    }
+  }
+  return count;
+}
+
+static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs,
+                              MCSymbol *rhs) {
+  MCContext &context = streamer.getContext();
+  const MCExpr *diff = MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(
+                                                                  lhs, context),
+                                               MCSymbolRefExpr::Create(
+                                                                  rhs, context),
+                                               context);
+  streamer.EmitAbsValue(diff, 1);
+
+}
+
+static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
+                           MCWin64EHInstruction &inst) {
+  uint8_t b1, b2;
+  uint16_t w;
+  b2 = (inst.getOperation() & 0x0F);
+  switch (inst.getOperation()) {
+  case Win64EH::UOP_PushNonVol:
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    b2 |= (inst.getRegister() & 0x0F) << 4;
+    streamer.EmitIntValue(b2, 1);
+    break;
+  case Win64EH::UOP_AllocLarge:
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    if (inst.getSize() > 512*1024-8) {
+      b2 |= 0x10;
+      streamer.EmitIntValue(b2, 1);
+      w = inst.getSize() & 0xFFF8;
+      streamer.EmitIntValue(w, 2);
+      w = inst.getSize() >> 16;
+    } else {
+      streamer.EmitIntValue(b2, 1);
+      w = inst.getSize() >> 3;
+    }
+    streamer.EmitIntValue(w, 2);
+    break;
+  case Win64EH::UOP_AllocSmall:
+    b2 |= (((inst.getSize()-8) >> 3) & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    break;
+  case Win64EH::UOP_SetFPReg:
+    b1 = inst.getOffset() & 0xF0;
+    streamer.EmitIntValue(b1, 1);
+    streamer.EmitIntValue(b2, 1);
+    break;
+  case Win64EH::UOP_SaveNonVol:
+  case Win64EH::UOP_SaveXMM128:
+    b2 |= (inst.getRegister() & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    w = inst.getOffset() >> 3;
+    if (inst.getOperation() == Win64EH::UOP_SaveXMM128)
+      w >>= 1;
+    streamer.EmitIntValue(w, 2);
+    break;
+  case Win64EH::UOP_SaveNonVolBig:
+  case Win64EH::UOP_SaveXMM128Big:
+    b2 |= (inst.getRegister() & 0x0F) << 4;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    if (inst.getOperation() == Win64EH::UOP_SaveXMM128Big)
+      w = inst.getOffset() & 0xFFF0;
+    else
+      w = inst.getOffset() & 0xFFF8;
+    streamer.EmitIntValue(w, 2);
+    w = inst.getOffset() >> 16;
+    streamer.EmitIntValue(w, 2);
+    break;
+  case Win64EH::UOP_PushMachFrame:
+    if (inst.isPushCodeFrame())
+      b2 |= 0x10;
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
+    streamer.EmitIntValue(b2, 1);
+    break;
+  }
+}
+
+static void EmitRuntimeFunction(MCStreamer &streamer,
+                                const MCWin64EHUnwindInfo *info) {
+  MCContext &context = streamer.getContext();
+
+  streamer.EmitValueToAlignment(4);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->Begin, context), 4);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->End, context), 4);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol, context), 4);
+}
+
+static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
+  // If this UNWIND_INFO already has a symbol, it's already been emitted.
+  if (info->Symbol) return;
+
+  MCContext &context = streamer.getContext();
+  streamer.EmitValueToAlignment(4);
+  // Upper 3 bits are the version number (currently 1).
+  uint8_t flags = 0x01;
+  info->Symbol = context.CreateTempSymbol();
+  streamer.EmitLabel(info->Symbol);
+
+  if (info->ChainedParent)
+    flags |= Win64EH::UNW_ChainInfo << 3;
+  else {
+    if (info->HandlesUnwind)
+      flags |= Win64EH::UNW_TerminateHandler << 3;
+    if (info->HandlesExceptions)
+      flags |= Win64EH::UNW_ExceptionHandler << 3;
+  }
+  streamer.EmitIntValue(flags, 1);
+
+  if (info->PrologEnd)
+    EmitAbsDifference(streamer, info->PrologEnd, info->Begin);
+  else
+    streamer.EmitIntValue(0, 1);
+
+  uint8_t numCodes = CountOfUnwindCodes(info->Instructions);
+  streamer.EmitIntValue(numCodes, 1);
+
+  uint8_t frame = 0;
+  if (info->LastFrameInst >= 0) {
+    MCWin64EHInstruction &frameInst = info->Instructions[info->LastFrameInst];
+    assert(frameInst.getOperation() == Win64EH::UOP_SetFPReg);
+    frame = (frameInst.getRegister() & 0x0F) |
+            (frameInst.getOffset() & 0xF0);
+  }
+  streamer.EmitIntValue(frame, 1);
+
+  // Emit unwind instructions (in reverse order).
+  uint8_t numInst = info->Instructions.size();
+  for (uint8_t c = 0; c < numInst; ++c) {
+    MCWin64EHInstruction inst = info->Instructions.back();
+    info->Instructions.pop_back();
+    EmitUnwindCode(streamer, info->Begin, inst);
+  }
+
+  if (flags & (Win64EH::UNW_ChainInfo << 3))
+    EmitRuntimeFunction(streamer, info->ChainedParent);
+  else if (flags &
+           ((Win64EH::UNW_TerminateHandler|Win64EH::UNW_ExceptionHandler) << 3))
+    streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler, context),
+                       4);
+  else if (numCodes < 2) {
+    // The minimum size of an UNWIND_INFO struct is 8 bytes. If we're not
+    // a chained unwind info, if there is no handler, and if there are fewer
+    // than 2 slots used in the unwind code array, we have to pad to 8 bytes.
+    if (numCodes == 1)
+      streamer.EmitIntValue(0, 2);
+    else
+      streamer.EmitIntValue(0, 4);
+  }
+}
+
+StringRef MCWin64EHUnwindEmitter::GetSectionSuffix(const MCSymbol *func) {
+  if (!func || !func->isInSection()) return "";
+  const MCSection *section = &func->getSection();
+  const MCSectionCOFF *COFFSection;
+  if ((COFFSection = dyn_cast<MCSectionCOFF>(section))) {
+    StringRef name = COFFSection->getSectionName();
+    size_t dollar = name.find('$');
+    size_t dot = name.find('.', 1);
+    if (dollar == StringRef::npos && dot == StringRef::npos)
+      return "";
+    if (dot == StringRef::npos)
+      return name.substr(dollar);
+    if (dollar == StringRef::npos || dot < dollar)
+      return name.substr(dot);
+    return name.substr(dollar);
+  }
+  return "";
+}
+
+static const MCSection *getWin64EHTableSection(StringRef suffix,
+                                               MCContext &context) {
+  if (suffix == "")
+    return context.getObjectFileInfo()->getXDataSection();
+
+  return context.getCOFFSection((".xdata"+suffix).str(),
+                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_MEM_READ,
+                                SectionKind::getDataRel());
+}
+
+static const MCSection *getWin64EHFuncTableSection(StringRef suffix,
+                                                   MCContext &context) {
+  if (suffix == "")
+    return context.getObjectFileInfo()->getPDataSection();
+  return context.getCOFFSection((".pdata"+suffix).str(),
+                                COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                                COFF::IMAGE_SCN_MEM_READ,
+                                SectionKind::getDataRel());
+}
+
+void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer,
+                                            MCWin64EHUnwindInfo *info) {
+  // Switch sections (the static function above is meant to be called from
+  // here and from Emit().
+  MCContext &context = streamer.getContext();
+  const MCSection *xdataSect =
+    getWin64EHTableSection(GetSectionSuffix(info->Function), context);
+  streamer.SwitchSection(xdataSect);
+
+  llvm::EmitUnwindInfo(streamer, info);
+}
+
+void MCWin64EHUnwindEmitter::Emit(MCStreamer &streamer) {
+  MCContext &context = streamer.getContext();
+  // Emit the unwind info structs first.
+  for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
+    MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
+    const MCSection *xdataSect =
+      getWin64EHTableSection(GetSectionSuffix(info.Function), context);
+    streamer.SwitchSection(xdataSect);
+    llvm::EmitUnwindInfo(streamer, &info);
+  }
+  // Now emit RUNTIME_FUNCTION entries.
+  for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
+    MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
+    const MCSection *pdataSect =
+      getWin64EHFuncTableSection(GetSectionSuffix(info.Function), context);
+    streamer.SwitchSection(pdataSect);
+    EmitRuntimeFunction(streamer, &info);
+  }
+}
+
+} // End of namespace llvm
+
diff --git a/contrib/llvm/lib/MC/MachObjectWriter.cpp b/contrib/llvm/lib/MC/MachObjectWriter.cpp
new file mode 100644
index 000000000000..a5ba3c36532a
--- /dev/null
+++ b/contrib/llvm/lib/MC/MachObjectWriter.cpp
@@ -0,0 +1,939 @@
+//===- lib/MC/MachObjectWriter.cpp - Mach-O File Writer -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachOSymbolFlags.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <vector>
+using namespace llvm;
+using namespace llvm::object;
+
+void MachObjectWriter::reset() {
+  Relocations.clear();
+  IndirectSymBase.clear();
+  StringTable.clear();
+  LocalSymbolData.clear();
+  ExternalSymbolData.clear();
+  UndefinedSymbolData.clear();
+  MCObjectWriter::reset();
+}
+
+bool MachObjectWriter::
+doesSymbolRequireExternRelocation(const MCSymbolData *SD) {
+  // Undefined symbols are always extern.
+  if (SD->Symbol->isUndefined())
+    return true;
+
+  // References to weak definitions require external relocation entries; the
+  // definition may not always be the one in the same object file.
+  if (SD->getFlags() & SF_WeakDefinition)
+    return true;
+
+  // Otherwise, we can use an internal relocation.
+  return false;
+}
+
+bool MachObjectWriter::
+MachSymbolData::operator<(const MachSymbolData &RHS) const {
+  return SymbolData->getSymbol().getName() <
+    RHS.SymbolData->getSymbol().getName();
+}
+
+bool MachObjectWriter::isFixupKindPCRel(const MCAssembler &Asm, unsigned Kind) {
+  const MCFixupKindInfo &FKI = Asm.getBackend().getFixupKindInfo(
+    (MCFixupKind) Kind);
+
+  return FKI.Flags & MCFixupKindInfo::FKF_IsPCRel;
+}
+
+uint64_t MachObjectWriter::getFragmentAddress(const MCFragment *Fragment,
+                                              const MCAsmLayout &Layout) const {
+  return getSectionAddress(Fragment->getParent()) +
+    Layout.getFragmentOffset(Fragment);
+}
+
+uint64_t MachObjectWriter::getSymbolAddress(const MCSymbolData* SD,
+                                            const MCAsmLayout &Layout) const {
+  const MCSymbol &S = SD->getSymbol();
+
+  // If this is a variable, then recursively evaluate now.
+  if (S.isVariable()) {
+    if (const MCConstantExpr *C =
+          dyn_cast<const MCConstantExpr>(S.getVariableValue()))
+      return C->getValue();
+
+
+    MCValue Target;
+    if (!S.getVariableValue()->EvaluateAsRelocatable(Target, Layout))
+      report_fatal_error("unable to evaluate offset for variable '" +
+                         S.getName() + "'");
+
+    // Verify that any used symbols are defined.
+    if (Target.getSymA() && Target.getSymA()->getSymbol().isUndefined())
+      report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                         Target.getSymA()->getSymbol().getName() + "'");
+    if (Target.getSymB() && Target.getSymB()->getSymbol().isUndefined())
+      report_fatal_error("unable to evaluate offset to undefined symbol '" +
+                         Target.getSymB()->getSymbol().getName() + "'");
+
+    uint64_t Address = Target.getConstant();
+    if (Target.getSymA())
+      Address += getSymbolAddress(&Layout.getAssembler().getSymbolData(
+                                    Target.getSymA()->getSymbol()), Layout);
+    if (Target.getSymB())
+      Address += getSymbolAddress(&Layout.getAssembler().getSymbolData(
+                                    Target.getSymB()->getSymbol()), Layout);
+    return Address;
+  }
+
+  return getSectionAddress(SD->getFragment()->getParent()) +
+    Layout.getSymbolOffset(SD);
+}
+
+uint64_t MachObjectWriter::getPaddingSize(const MCSectionData *SD,
+                                          const MCAsmLayout &Layout) const {
+  uint64_t EndAddr = getSectionAddress(SD) + Layout.getSectionAddressSize(SD);
+  unsigned Next = SD->getLayoutOrder() + 1;
+  if (Next >= Layout.getSectionOrder().size())
+    return 0;
+
+  const MCSectionData &NextSD = *Layout.getSectionOrder()[Next];
+  if (NextSD.getSection().isVirtualSection())
+    return 0;
+  return OffsetToAlignment(EndAddr, NextSD.getAlignment());
+}
+
+void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
+                                   unsigned LoadCommandsSize,
+                                   bool SubsectionsViaSymbols) {
+  uint32_t Flags = 0;
+
+  if (SubsectionsViaSymbols)
+    Flags |= macho::HF_SubsectionsViaSymbols;
+
+  // struct mach_header (28 bytes) or
+  // struct mach_header_64 (32 bytes)
+
+  uint64_t Start = OS.tell();
+  (void) Start;
+
+  Write32(is64Bit() ? macho::HM_Object64 : macho::HM_Object32);
+
+  Write32(TargetObjectWriter->getCPUType());
+  Write32(TargetObjectWriter->getCPUSubtype());
+
+  Write32(macho::HFT_Object);
+  Write32(NumLoadCommands);
+  Write32(LoadCommandsSize);
+  Write32(Flags);
+  if (is64Bit())
+    Write32(0); // reserved
+
+  assert(OS.tell() - Start ==
+         (is64Bit() ? macho::Header64Size : macho::Header32Size));
+}
+
+/// WriteSegmentLoadCommand - Write a segment load command.
+///
+/// \param NumSections The number of sections in this segment.
+/// \param SectionDataSize The total size of the sections.
+void MachObjectWriter::WriteSegmentLoadCommand(unsigned NumSections,
+                                               uint64_t VMSize,
+                                               uint64_t SectionDataStartOffset,
+                                               uint64_t SectionDataSize) {
+  // struct segment_command (56 bytes) or
+  // struct segment_command_64 (72 bytes)
+
+  uint64_t Start = OS.tell();
+  (void) Start;
+
+  unsigned SegmentLoadCommandSize =
+    is64Bit() ? macho::SegmentLoadCommand64Size:
+    macho::SegmentLoadCommand32Size;
+  Write32(is64Bit() ? macho::LCT_Segment64 : macho::LCT_Segment);
+  Write32(SegmentLoadCommandSize +
+          NumSections * (is64Bit() ? macho::Section64Size :
+                         macho::Section32Size));
+
+  WriteBytes("", 16);
+  if (is64Bit()) {
+    Write64(0); // vmaddr
+    Write64(VMSize); // vmsize
+    Write64(SectionDataStartOffset); // file offset
+    Write64(SectionDataSize); // file size
+  } else {
+    Write32(0); // vmaddr
+    Write32(VMSize); // vmsize
+    Write32(SectionDataStartOffset); // file offset
+    Write32(SectionDataSize); // file size
+  }
+  Write32(0x7); // maxprot
+  Write32(0x7); // initprot
+  Write32(NumSections);
+  Write32(0); // flags
+
+  assert(OS.tell() - Start == SegmentLoadCommandSize);
+}
+
+void MachObjectWriter::WriteSection(const MCAssembler &Asm,
+                                    const MCAsmLayout &Layout,
+                                    const MCSectionData &SD,
+                                    uint64_t FileOffset,
+                                    uint64_t RelocationsStart,
+                                    unsigned NumRelocations) {
+  uint64_t SectionSize = Layout.getSectionAddressSize(&SD);
+
+  // The offset is unused for virtual sections.
+  if (SD.getSection().isVirtualSection()) {
+    assert(Layout.getSectionFileSize(&SD) == 0 && "Invalid file size!");
+    FileOffset = 0;
+  }
+
+  // struct section (68 bytes) or
+  // struct section_64 (80 bytes)
+
+  uint64_t Start = OS.tell();
+  (void) Start;
+
+  const MCSectionMachO &Section = cast<MCSectionMachO>(SD.getSection());
+  WriteBytes(Section.getSectionName(), 16);
+  WriteBytes(Section.getSegmentName(), 16);
+  if (is64Bit()) {
+    Write64(getSectionAddress(&SD)); // address
+    Write64(SectionSize); // size
+  } else {
+    Write32(getSectionAddress(&SD)); // address
+    Write32(SectionSize); // size
+  }
+  Write32(FileOffset);
+
+  unsigned Flags = Section.getTypeAndAttributes();
+  if (SD.hasInstructions())
+    Flags |= MCSectionMachO::S_ATTR_SOME_INSTRUCTIONS;
+
+  assert(isPowerOf2_32(SD.getAlignment()) && "Invalid alignment!");
+  Write32(Log2_32(SD.getAlignment()));
+  Write32(NumRelocations ? RelocationsStart : 0);
+  Write32(NumRelocations);
+  Write32(Flags);
+  Write32(IndirectSymBase.lookup(&SD)); // reserved1
+  Write32(Section.getStubSize()); // reserved2
+  if (is64Bit())
+    Write32(0); // reserved3
+
+  assert(OS.tell() - Start == (is64Bit() ? macho::Section64Size :
+                               macho::Section32Size));
+}
+
+void MachObjectWriter::WriteSymtabLoadCommand(uint32_t SymbolOffset,
+                                              uint32_t NumSymbols,
+                                              uint32_t StringTableOffset,
+                                              uint32_t StringTableSize) {
+  // struct symtab_command (24 bytes)
+
+  uint64_t Start = OS.tell();
+  (void) Start;
+
+  Write32(macho::LCT_Symtab);
+  Write32(macho::SymtabLoadCommandSize);
+  Write32(SymbolOffset);
+  Write32(NumSymbols);
+  Write32(StringTableOffset);
+  Write32(StringTableSize);
+
+  assert(OS.tell() - Start == macho::SymtabLoadCommandSize);
+}
+
+void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
+                                                uint32_t NumLocalSymbols,
+                                                uint32_t FirstExternalSymbol,
+                                                uint32_t NumExternalSymbols,
+                                                uint32_t FirstUndefinedSymbol,
+                                                uint32_t NumUndefinedSymbols,
+                                                uint32_t IndirectSymbolOffset,
+                                                uint32_t NumIndirectSymbols) {
+  // struct dysymtab_command (80 bytes)
+
+  uint64_t Start = OS.tell();
+  (void) Start;
+
+  Write32(macho::LCT_Dysymtab);
+  Write32(macho::DysymtabLoadCommandSize);
+  Write32(FirstLocalSymbol);
+  Write32(NumLocalSymbols);
+  Write32(FirstExternalSymbol);
+  Write32(NumExternalSymbols);
+  Write32(FirstUndefinedSymbol);
+  Write32(NumUndefinedSymbols);
+  Write32(0); // tocoff
+  Write32(0); // ntoc
+  Write32(0); // modtaboff
+  Write32(0); // nmodtab
+  Write32(0); // extrefsymoff
+  Write32(0); // nextrefsyms
+  Write32(IndirectSymbolOffset);
+  Write32(NumIndirectSymbols);
+  Write32(0); // extreloff
+  Write32(0); // nextrel
+  Write32(0); // locreloff
+  Write32(0); // nlocrel
+
+  assert(OS.tell() - Start == macho::DysymtabLoadCommandSize);
+}
+
+void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
+                                  const MCAsmLayout &Layout) {
+  MCSymbolData &Data = *MSD.SymbolData;
+  const MCSymbol &Symbol = Data.getSymbol();
+  uint8_t Type = 0;
+  uint16_t Flags = Data.getFlags();
+  uint64_t Address = 0;
+
+  // Set the N_TYPE bits. See <mach-o/nlist.h>.
+  //
+  // FIXME: Are the prebound or indirect fields possible here?
+  if (Symbol.isUndefined())
+    Type = macho::STT_Undefined;
+  else if (Symbol.isAbsolute())
+    Type = macho::STT_Absolute;
+  else
+    Type = macho::STT_Section;
+
+  // FIXME: Set STAB bits.
+
+  if (Data.isPrivateExtern())
+    Type |= macho::STF_PrivateExtern;
+
+  // Set external bit.
+  if (Data.isExternal() || Symbol.isUndefined())
+    Type |= macho::STF_External;
+
+  // Compute the symbol address.
+  if (Symbol.isDefined()) {
+    Address = getSymbolAddress(&Data, Layout);
+  } else if (Data.isCommon()) {
+    // Common symbols are encoded with the size in the address
+    // field, and their alignment in the flags.
+    Address = Data.getCommonSize();
+
+    // Common alignment is packed into the 'desc' bits.
+    if (unsigned Align = Data.getCommonAlignment()) {
+      unsigned Log2Size = Log2_32(Align);
+      assert((1U << Log2Size) == Align && "Invalid 'common' alignment!");
+      if (Log2Size > 15)
+        report_fatal_error("invalid 'common' alignment '" +
+                           Twine(Align) + "'");
+      // FIXME: Keep this mask with the SymbolFlags enumeration.
+      Flags = (Flags & 0xF0FF) | (Log2Size << 8);
+    }
+  }
+
+  // struct nlist (12 bytes)
+
+  Write32(MSD.StringIndex);
+  Write8(Type);
+  Write8(MSD.SectionIndex);
+
+  // The Mach-O streamer uses the lowest 16-bits of the flags for the 'desc'
+  // value.
+  Write16(Flags);
+  if (is64Bit())
+    Write64(Address);
+  else
+    Write32(Address);
+}
+
+void MachObjectWriter::WriteLinkeditLoadCommand(uint32_t Type,
+                                                uint32_t DataOffset,
+                                                uint32_t DataSize) {
+  uint64_t Start = OS.tell();
+  (void) Start;
+
+  Write32(Type);
+  Write32(macho::LinkeditLoadCommandSize);
+  Write32(DataOffset);
+  Write32(DataSize);
+
+  assert(OS.tell() - Start == macho::LinkeditLoadCommandSize);
+}
+
+static unsigned ComputeLinkerOptionsLoadCommandSize(
+  const std::vector<std::string> &Options, bool is64Bit)
+{
+  unsigned Size = sizeof(macho::LinkerOptionsLoadCommand);
+  for (unsigned i = 0, e = Options.size(); i != e; ++i)
+    Size += Options[i].size() + 1;
+  return RoundUpToAlignment(Size, is64Bit ? 8 : 4);
+}
+
+void MachObjectWriter::WriteLinkerOptionsLoadCommand(
+  const std::vector<std::string> &Options)
+{
+  unsigned Size = ComputeLinkerOptionsLoadCommandSize(Options, is64Bit());
+  uint64_t Start = OS.tell();
+  (void) Start;
+
+  Write32(macho::LCT_LinkerOptions);
+  Write32(Size);
+  Write32(Options.size());
+  uint64_t BytesWritten = sizeof(macho::LinkerOptionsLoadCommand);
+  for (unsigned i = 0, e = Options.size(); i != e; ++i) {
+    // Write each string, including the null byte.
+    const std::string &Option = Options[i];
+    WriteBytes(Option.c_str(), Option.size() + 1);
+    BytesWritten += Option.size() + 1;
+  }
+
+  // Pad to a multiple of the pointer size.
+  WriteBytes("", OffsetToAlignment(BytesWritten, is64Bit() ? 8 : 4));
+
+  assert(OS.tell() - Start == Size);
+}
+
+
+void MachObjectWriter::RecordRelocation(const MCAssembler &Asm,
+                                        const MCAsmLayout &Layout,
+                                        const MCFragment *Fragment,
+                                        const MCFixup &Fixup,
+                                        MCValue Target,
+                                        uint64_t &FixedValue) {
+  TargetObjectWriter->RecordRelocation(this, Asm, Layout, Fragment, Fixup,
+                                       Target, FixedValue);
+}
+
+void MachObjectWriter::BindIndirectSymbols(MCAssembler &Asm) {
+  // This is the point where 'as' creates actual symbols for indirect symbols
+  // (in the following two passes). It would be easier for us to do this sooner
+  // when we see the attribute, but that makes getting the order in the symbol
+  // table much more complicated than it is worth.
+  //
+  // FIXME: Revisit this when the dust settles.
+
+  // Bind non lazy symbol pointers first.
+  unsigned IndirectIndex = 0;
+  for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(),
+         ie = Asm.indirect_symbol_end(); it != ie; ++it, ++IndirectIndex) {
+    const MCSectionMachO &Section =
+      cast<MCSectionMachO>(it->SectionData->getSection());
+
+    if (Section.getType() != MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS)
+      continue;
+
+    // Initialize the section indirect symbol base, if necessary.
+    IndirectSymBase.insert(std::make_pair(it->SectionData, IndirectIndex));
+
+    Asm.getOrCreateSymbolData(*it->Symbol);
+  }
+
+  // Then lazy symbol pointers and symbol stubs.
+  IndirectIndex = 0;
+  for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(),
+         ie = Asm.indirect_symbol_end(); it != ie; ++it, ++IndirectIndex) {
+    const MCSectionMachO &Section =
+      cast<MCSectionMachO>(it->SectionData->getSection());
+
+    if (Section.getType() != MCSectionMachO::S_LAZY_SYMBOL_POINTERS &&
+        Section.getType() != MCSectionMachO::S_SYMBOL_STUBS)
+      continue;
+
+    // Initialize the section indirect symbol base, if necessary.
+    IndirectSymBase.insert(std::make_pair(it->SectionData, IndirectIndex));
+
+    // Set the symbol type to undefined lazy, but only on construction.
+    //
+    // FIXME: Do not hardcode.
+    bool Created;
+    MCSymbolData &Entry = Asm.getOrCreateSymbolData(*it->Symbol, &Created);
+    if (Created)
+      Entry.setFlags(Entry.getFlags() | 0x0001);
+  }
+}
+
+/// ComputeSymbolTable - Compute the symbol table data
+///
+/// \param StringTable [out] - The string table data.
+/// \param StringIndexMap [out] - Map from symbol names to offsets in the
+/// string table.
+void MachObjectWriter::
+ComputeSymbolTable(MCAssembler &Asm, SmallString<256> &StringTable,
+                   std::vector<MachSymbolData> &LocalSymbolData,
+                   std::vector<MachSymbolData> &ExternalSymbolData,
+                   std::vector<MachSymbolData> &UndefinedSymbolData) {
+  // Build section lookup table.
+  DenseMap<const MCSection*, uint8_t> SectionIndexMap;
+  unsigned Index = 1;
+  for (MCAssembler::iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it, ++Index)
+    SectionIndexMap[&it->getSection()] = Index;
+  assert(Index <= 256 && "Too many sections!");
+
+  // Index 0 is always the empty string.
+  StringMap<uint64_t> StringIndexMap;
+  StringTable += '\x00';
+
+  // Build the symbol arrays and the string table, but only for non-local
+  // symbols.
+  //
+  // The particular order that we collect the symbols and create the string
+  // table, then sort the symbols is chosen to match 'as'. Even though it
+  // doesn't matter for correctness, this is important for letting us diff .o
+  // files.
+  for (MCAssembler::symbol_iterator it = Asm.symbol_begin(),
+         ie = Asm.symbol_end(); it != ie; ++it) {
+    const MCSymbol &Symbol = it->getSymbol();
+
+    // Ignore non-linker visible symbols.
+    if (!Asm.isSymbolLinkerVisible(it->getSymbol()))
+      continue;
+
+    if (!it->isExternal() && !Symbol.isUndefined())
+      continue;
+
+    uint64_t &Entry = StringIndexMap[Symbol.getName()];
+    if (!Entry) {
+      Entry = StringTable.size();
+      StringTable += Symbol.getName();
+      StringTable += '\x00';
+    }
+
+    MachSymbolData MSD;
+    MSD.SymbolData = it;
+    MSD.StringIndex = Entry;
+
+    if (Symbol.isUndefined()) {
+      MSD.SectionIndex = 0;
+      UndefinedSymbolData.push_back(MSD);
+    } else if (Symbol.isAbsolute()) {
+      MSD.SectionIndex = 0;
+      ExternalSymbolData.push_back(MSD);
+    } else {
+      MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection());
+      assert(MSD.SectionIndex && "Invalid section index!");
+      ExternalSymbolData.push_back(MSD);
+    }
+  }
+
+  // Now add the data for local symbols.
+  for (MCAssembler::symbol_iterator it = Asm.symbol_begin(),
+         ie = Asm.symbol_end(); it != ie; ++it) {
+    const MCSymbol &Symbol = it->getSymbol();
+
+    // Ignore non-linker visible symbols.
+    if (!Asm.isSymbolLinkerVisible(it->getSymbol()))
+      continue;
+
+    if (it->isExternal() || Symbol.isUndefined())
+      continue;
+
+    uint64_t &Entry = StringIndexMap[Symbol.getName()];
+    if (!Entry) {
+      Entry = StringTable.size();
+      StringTable += Symbol.getName();
+      StringTable += '\x00';
+    }
+
+    MachSymbolData MSD;
+    MSD.SymbolData = it;
+    MSD.StringIndex = Entry;
+
+    if (Symbol.isAbsolute()) {
+      MSD.SectionIndex = 0;
+      LocalSymbolData.push_back(MSD);
+    } else {
+      MSD.SectionIndex = SectionIndexMap.lookup(&Symbol.getSection());
+      assert(MSD.SectionIndex && "Invalid section index!");
+      LocalSymbolData.push_back(MSD);
+    }
+  }
+
+  // External and undefined symbols are required to be in lexicographic order.
+  std::sort(ExternalSymbolData.begin(), ExternalSymbolData.end());
+  std::sort(UndefinedSymbolData.begin(), UndefinedSymbolData.end());
+
+  // Set the symbol indices.
+  Index = 0;
+  for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i)
+    LocalSymbolData[i].SymbolData->setIndex(Index++);
+  for (unsigned i = 0, e = ExternalSymbolData.size(); i != e; ++i)
+    ExternalSymbolData[i].SymbolData->setIndex(Index++);
+  for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i)
+    UndefinedSymbolData[i].SymbolData->setIndex(Index++);
+
+  // The string table is padded to a multiple of 4.
+  while (StringTable.size() % 4)
+    StringTable += '\x00';
+}
+
+void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm,
+                                               const MCAsmLayout &Layout) {
+  uint64_t StartAddress = 0;
+  const SmallVectorImpl<MCSectionData*> &Order = Layout.getSectionOrder();
+  for (int i = 0, n = Order.size(); i != n ; ++i) {
+    const MCSectionData *SD = Order[i];
+    StartAddress = RoundUpToAlignment(StartAddress, SD->getAlignment());
+    SectionAddress[SD] = StartAddress;
+    StartAddress += Layout.getSectionAddressSize(SD);
+
+    // Explicitly pad the section to match the alignment requirements of the
+    // following one. This is for 'gas' compatibility, it shouldn't
+    /// strictly be necessary.
+    StartAddress += getPaddingSize(SD, Layout);
+  }
+}
+
+void MachObjectWriter::markAbsoluteVariableSymbols(MCAssembler &Asm,
+                                                   const MCAsmLayout &Layout) {
+  for (MCAssembler::symbol_iterator i = Asm.symbol_begin(),
+                                    e = Asm.symbol_end();
+      i != e; ++i) {
+    MCSymbolData &SD = *i;
+    if (!SD.getSymbol().isVariable())
+      continue;
+
+    // Is the variable is a symbol difference (SA - SB + C) expression,
+    // and neither symbol is external, mark the variable as absolute.
+    const MCExpr *Expr = SD.getSymbol().getVariableValue();
+    MCValue Value;
+    if (Expr->EvaluateAsRelocatable(Value, Layout)) {
+      if (Value.getSymA() && Value.getSymB())
+        const_cast<MCSymbol*>(&SD.getSymbol())->setAbsolute();
+    }
+  }
+}
+
+void MachObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
+                                                const MCAsmLayout &Layout) {
+  computeSectionAddresses(Asm, Layout);
+
+  // Create symbol data for any indirect symbols.
+  BindIndirectSymbols(Asm);
+
+  // Mark symbol difference expressions in variables (from .set or = directives)
+  // as absolute.
+  markAbsoluteVariableSymbols(Asm, Layout);
+
+  // Compute symbol table information and bind symbol indices.
+  ComputeSymbolTable(Asm, StringTable, LocalSymbolData, ExternalSymbolData,
+                     UndefinedSymbolData);
+}
+
+bool MachObjectWriter::
+IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
+                                       const MCSymbolData &DataA,
+                                       const MCFragment &FB,
+                                       bool InSet,
+                                       bool IsPCRel) const {
+  if (InSet)
+    return true;
+
+  // The effective address is
+  //     addr(atom(A)) + offset(A)
+  //   - addr(atom(B)) - offset(B)
+  // and the offsets are not relocatable, so the fixup is fully resolved when
+  //  addr(atom(A)) - addr(atom(B)) == 0.
+  const MCSymbolData *A_Base = 0, *B_Base = 0;
+
+  const MCSymbol &SA = DataA.getSymbol().AliasedSymbol();
+  const MCSection &SecA = SA.getSection();
+  const MCSection &SecB = FB.getParent()->getSection();
+
+  if (IsPCRel) {
+    // The simple (Darwin, except on x86_64) way of dealing with this was to
+    // assume that any reference to a temporary symbol *must* be a temporary
+    // symbol in the same atom, unless the sections differ. Therefore, any PCrel
+    // relocation to a temporary symbol (in the same section) is fully
+    // resolved. This also works in conjunction with absolutized .set, which
+    // requires the compiler to use .set to absolutize the differences between
+    // symbols which the compiler knows to be assembly time constants, so we
+    // don't need to worry about considering symbol differences fully resolved.
+    //
+    // If the file isn't using sub-sections-via-symbols, we can make the
+    // same assumptions about any symbol that we normally make about
+    // assembler locals.
+
+    if (!Asm.getBackend().hasReliableSymbolDifference()) {
+      if (!SA.isInSection() || &SecA != &SecB ||
+          (!SA.isTemporary() &&
+           FB.getAtom() != Asm.getSymbolData(SA).getFragment()->getAtom() &&
+           Asm.getSubsectionsViaSymbols()))
+        return false;
+      return true;
+    }
+    // For Darwin x86_64, there is one special case when the reference IsPCRel.
+    // If the fragment with the reference does not have a base symbol but meets
+    // the simple way of dealing with this, in that it is a temporary symbol in
+    // the same atom then it is assumed to be fully resolved.  This is needed so
+    // a relocation entry is not created and so the static linker does not
+    // mess up the reference later.
+    else if(!FB.getAtom() &&
+            SA.isTemporary() && SA.isInSection() && &SecA == &SecB){
+      return true;
+    }
+  } else {
+    if (!TargetObjectWriter->useAggressiveSymbolFolding())
+      return false;
+  }
+
+  const MCFragment *FA = Asm.getSymbolData(SA).getFragment();
+
+  // Bail if the symbol has no fragment.
+  if (!FA)
+    return false;
+
+  A_Base = FA->getAtom();
+  if (!A_Base)
+    return false;
+
+  B_Base = FB.getAtom();
+  if (!B_Base)
+    return false;
+
+  // If the atoms are the same, they are guaranteed to have the same address.
+  if (A_Base == B_Base)
+    return true;
+
+  // Otherwise, we can't prove this is fully resolved.
+  return false;
+}
+
+void MachObjectWriter::WriteObject(MCAssembler &Asm,
+                                   const MCAsmLayout &Layout) {
+  unsigned NumSections = Asm.size();
+
+  // The section data starts after the header, the segment load command (and
+  // section headers) and the symbol table.
+  unsigned NumLoadCommands = 1;
+  uint64_t LoadCommandsSize = is64Bit() ?
+    macho::SegmentLoadCommand64Size + NumSections * macho::Section64Size :
+    macho::SegmentLoadCommand32Size + NumSections * macho::Section32Size;
+
+  // Add the data-in-code load command size, if used.
+  unsigned NumDataRegions = Asm.getDataRegions().size();
+  if (NumDataRegions) {
+    ++NumLoadCommands;
+    LoadCommandsSize += macho::LinkeditLoadCommandSize;
+  }
+
+  // Add the symbol table load command sizes, if used.
+  unsigned NumSymbols = LocalSymbolData.size() + ExternalSymbolData.size() +
+    UndefinedSymbolData.size();
+  if (NumSymbols) {
+    NumLoadCommands += 2;
+    LoadCommandsSize += (macho::SymtabLoadCommandSize +
+                         macho::DysymtabLoadCommandSize);
+  }
+
+  // Add the linker option load commands sizes.
+  const std::vector<std::vector<std::string> > &LinkerOptions =
+    Asm.getLinkerOptions();
+  for (unsigned i = 0, e = LinkerOptions.size(); i != e; ++i) {
+    ++NumLoadCommands;
+    LoadCommandsSize += ComputeLinkerOptionsLoadCommandSize(LinkerOptions[i],
+                                                            is64Bit());
+  }
+  
+  // Compute the total size of the section data, as well as its file size and vm
+  // size.
+  uint64_t SectionDataStart = (is64Bit() ? macho::Header64Size :
+                               macho::Header32Size) + LoadCommandsSize;
+  uint64_t SectionDataSize = 0;
+  uint64_t SectionDataFileSize = 0;
+  uint64_t VMSize = 0;
+  for (MCAssembler::const_iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    const MCSectionData &SD = *it;
+    uint64_t Address = getSectionAddress(&SD);
+    uint64_t Size = Layout.getSectionAddressSize(&SD);
+    uint64_t FileSize = Layout.getSectionFileSize(&SD);
+    FileSize += getPaddingSize(&SD, Layout);
+
+    VMSize = std::max(VMSize, Address + Size);
+
+    if (SD.getSection().isVirtualSection())
+      continue;
+
+    SectionDataSize = std::max(SectionDataSize, Address + Size);
+    SectionDataFileSize = std::max(SectionDataFileSize, Address + FileSize);
+  }
+
+  // The section data is padded to 4 bytes.
+  //
+  // FIXME: Is this machine dependent?
+  unsigned SectionDataPadding = OffsetToAlignment(SectionDataFileSize, 4);
+  SectionDataFileSize += SectionDataPadding;
+
+  // Write the prolog, starting with the header and load command...
+  WriteHeader(NumLoadCommands, LoadCommandsSize,
+              Asm.getSubsectionsViaSymbols());
+  WriteSegmentLoadCommand(NumSections, VMSize,
+                          SectionDataStart, SectionDataSize);
+
+  // ... and then the section headers.
+  uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize;
+  for (MCAssembler::const_iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    std::vector<macho::RelocationEntry> &Relocs = Relocations[it];
+    unsigned NumRelocs = Relocs.size();
+    uint64_t SectionStart = SectionDataStart + getSectionAddress(it);
+    WriteSection(Asm, Layout, *it, SectionStart, RelocTableEnd, NumRelocs);
+    RelocTableEnd += NumRelocs * macho::RelocationInfoSize;
+  }
+
+  // Write the data-in-code load command, if used.
+  uint64_t DataInCodeTableEnd = RelocTableEnd + NumDataRegions * 8;
+  if (NumDataRegions) {
+    uint64_t DataRegionsOffset = RelocTableEnd;
+    uint64_t DataRegionsSize = NumDataRegions * 8;
+    WriteLinkeditLoadCommand(macho::LCT_DataInCode, DataRegionsOffset,
+                             DataRegionsSize);
+  }
+
+  // Write the symbol table load command, if used.
+  if (NumSymbols) {
+    unsigned FirstLocalSymbol = 0;
+    unsigned NumLocalSymbols = LocalSymbolData.size();
+    unsigned FirstExternalSymbol = FirstLocalSymbol + NumLocalSymbols;
+    unsigned NumExternalSymbols = ExternalSymbolData.size();
+    unsigned FirstUndefinedSymbol = FirstExternalSymbol + NumExternalSymbols;
+    unsigned NumUndefinedSymbols = UndefinedSymbolData.size();
+    unsigned NumIndirectSymbols = Asm.indirect_symbol_size();
+    unsigned NumSymTabSymbols =
+      NumLocalSymbols + NumExternalSymbols + NumUndefinedSymbols;
+    uint64_t IndirectSymbolSize = NumIndirectSymbols * 4;
+    uint64_t IndirectSymbolOffset = 0;
+
+    // If used, the indirect symbols are written after the section data.
+    if (NumIndirectSymbols)
+      IndirectSymbolOffset = DataInCodeTableEnd;
+
+    // The symbol table is written after the indirect symbol data.
+    uint64_t SymbolTableOffset = DataInCodeTableEnd + IndirectSymbolSize;
+
+    // The string table is written after symbol table.
+    uint64_t StringTableOffset =
+      SymbolTableOffset + NumSymTabSymbols * (is64Bit() ? macho::Nlist64Size :
+                                              macho::Nlist32Size);
+    WriteSymtabLoadCommand(SymbolTableOffset, NumSymTabSymbols,
+                           StringTableOffset, StringTable.size());
+
+    WriteDysymtabLoadCommand(FirstLocalSymbol, NumLocalSymbols,
+                             FirstExternalSymbol, NumExternalSymbols,
+                             FirstUndefinedSymbol, NumUndefinedSymbols,
+                             IndirectSymbolOffset, NumIndirectSymbols);
+  }
+
+  // Write the linker options load commands.
+  for (unsigned i = 0, e = LinkerOptions.size(); i != e; ++i) {
+    WriteLinkerOptionsLoadCommand(LinkerOptions[i]);
+  }
+
+  // Write the actual section data.
+  for (MCAssembler::const_iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    Asm.writeSectionData(it, Layout);
+
+    uint64_t Pad = getPaddingSize(it, Layout);
+    for (unsigned int i = 0; i < Pad; ++i)
+      Write8(0);
+  }
+
+  // Write the extra padding.
+  WriteZeros(SectionDataPadding);
+
+  // Write the relocation entries.
+  for (MCAssembler::const_iterator it = Asm.begin(),
+         ie = Asm.end(); it != ie; ++it) {
+    // Write the section relocation entries, in reverse order to match 'as'
+    // (approximately, the exact algorithm is more complicated than this).
+    std::vector<macho::RelocationEntry> &Relocs = Relocations[it];
+    for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
+      Write32(Relocs[e - i - 1].Word0);
+      Write32(Relocs[e - i - 1].Word1);
+    }
+  }
+
+  // Write out the data-in-code region payload, if there is one.
+  for (MCAssembler::const_data_region_iterator
+         it = Asm.data_region_begin(), ie = Asm.data_region_end();
+         it != ie; ++it) {
+    const DataRegionData *Data = &(*it);
+    uint64_t Start =
+      getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->Start),
+                       Layout);
+    uint64_t End =
+      getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->End),
+                       Layout);
+    DEBUG(dbgs() << "data in code region-- kind: " << Data->Kind
+                 << "  start: " << Start << "(" << Data->Start->getName() << ")"
+                 << "  end: " << End << "(" << Data->End->getName() << ")"
+                 << "  size: " << End - Start
+                 << "\n");
+    Write32(Start);
+    Write16(End - Start);
+    Write16(Data->Kind);
+  }
+
+  // Write the symbol table data, if used.
+  if (NumSymbols) {
+    // Write the indirect symbol entries.
+    for (MCAssembler::const_indirect_symbol_iterator
+           it = Asm.indirect_symbol_begin(),
+           ie = Asm.indirect_symbol_end(); it != ie; ++it) {
+      // Indirect symbols in the non lazy symbol pointer section have some
+      // special handling.
+      const MCSectionMachO &Section =
+        static_cast<const MCSectionMachO&>(it->SectionData->getSection());
+      if (Section.getType() == MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS) {
+        // If this symbol is defined and internal, mark it as such.
+        if (it->Symbol->isDefined() &&
+            !Asm.getSymbolData(*it->Symbol).isExternal()) {
+          uint32_t Flags = macho::ISF_Local;
+          if (it->Symbol->isAbsolute())
+            Flags |= macho::ISF_Absolute;
+          Write32(Flags);
+          continue;
+        }
+      }
+
+      Write32(Asm.getSymbolData(*it->Symbol).getIndex());
+    }
+
+    // FIXME: Check that offsets match computed ones.
+
+    // Write the symbol table entries.
+    for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i)
+      WriteNlist(LocalSymbolData[i], Layout);
+    for (unsigned i = 0, e = ExternalSymbolData.size(); i != e; ++i)
+      WriteNlist(ExternalSymbolData[i], Layout);
+    for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i)
+      WriteNlist(UndefinedSymbolData[i], Layout);
+
+    // Write the string table.
+    OS << StringTable.str();
+  }
+}
+
+MCObjectWriter *llvm::createMachObjectWriter(MCMachObjectTargetWriter *MOTW,
+                                             raw_ostream &OS,
+                                             bool IsLittleEndian) {
+  return new MachObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/MC/SubtargetFeature.cpp b/contrib/llvm/lib/MC/SubtargetFeature.cpp
new file mode 100644
index 000000000000..7625abd465fa
--- /dev/null
+++ b/contrib/llvm/lib/MC/SubtargetFeature.cpp
@@ -0,0 +1,374 @@
+//===- SubtargetFeature.cpp - CPU characteristics Implementation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SubtargetFeature interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                          Static Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// hasFlag - Determine if a feature has a flag; '+' or '-'
+///
+static inline bool hasFlag(const StringRef Feature) {
+  assert(!Feature.empty() && "Empty string");
+  // Get first character
+  char Ch = Feature[0];
+  // Check if first character is '+' or '-' flag
+  return Ch == '+' || Ch =='-';
+}
+
+/// StripFlag - Return string stripped of flag.
+///
+static inline std::string StripFlag(const StringRef Feature) {
+  return hasFlag(Feature) ? Feature.substr(1) : Feature;
+}
+
+/// isEnabled - Return true if enable flag; '+'.
+///
+static inline bool isEnabled(const StringRef Feature) {
+  assert(!Feature.empty() && "Empty string");
+  // Get first character
+  char Ch = Feature[0];
+  // Check if first character is '+' for enabled
+  return Ch == '+';
+}
+
+/// PrependFlag - Return a string with a prepended flag; '+' or '-'.
+///
+static inline std::string PrependFlag(const StringRef Feature,
+                                    bool IsEnabled) {
+  assert(!Feature.empty() && "Empty string");
+  if (hasFlag(Feature))
+    return Feature;
+  std::string Prefix = IsEnabled ? "+" : "-";
+  Prefix += Feature;
+  return Prefix;
+}
+
+/// Split - Splits a string of comma separated items in to a vector of strings.
+///
+static void Split(std::vector<std::string> &V, const StringRef S) {
+  if (S.empty())
+    return;
+
+  // Start at beginning of string.
+  size_t Pos = 0;
+  while (true) {
+    // Find the next comma
+    size_t Comma = S.find(',', Pos);
+    // If no comma found then the rest of the string is used
+    if (Comma == std::string::npos) {
+      // Add string to vector
+      V.push_back(S.substr(Pos));
+      break;
+    }
+    // Otherwise add substring to vector
+    V.push_back(S.substr(Pos, Comma - Pos));
+    // Advance to next item
+    Pos = Comma + 1;
+  }
+}
+
+/// Join a vector of strings to a string with a comma separating each element.
+///
+static std::string Join(const std::vector<std::string> &V) {
+  // Start with empty string.
+  std::string Result;
+  // If the vector is not empty
+  if (!V.empty()) {
+    // Start with the first feature
+    Result = V[0];
+    // For each successive feature
+    for (size_t i = 1; i < V.size(); i++) {
+      // Add a comma
+      Result += ",";
+      // Add the feature
+      Result += V[i];
+    }
+  }
+  // Return the features string
+  return Result;
+}
+
+/// Adding features.
+void SubtargetFeatures::AddFeature(const StringRef String,
+                                   bool IsEnabled) {
+  // Don't add empty features
+  if (!String.empty()) {
+    // Convert to lowercase, prepend flag and add to vector
+    Features.push_back(PrependFlag(String.lower(), IsEnabled));
+  }
+}
+
+/// Find KV in array using binary search.
+static const SubtargetFeatureKV *Find(StringRef S, const SubtargetFeatureKV *A,
+                                      size_t L) {
+  // Make the lower bound element we're looking for
+  SubtargetFeatureKV KV;
+  KV.Key = S.data();
+  // Determine the end of the array
+  const SubtargetFeatureKV *Hi = A + L;
+  // Binary search the array
+  const SubtargetFeatureKV *F = std::lower_bound(A, Hi, KV);
+  // If not found then return NULL
+  if (F == Hi || StringRef(F->Key) != S) return NULL;
+  // Return the found array item
+  return F;
+}
+
+/// getLongestEntryLength - Return the length of the longest entry in the table.
+///
+static size_t getLongestEntryLength(const SubtargetFeatureKV *Table,
+                                    size_t Size) {
+  size_t MaxLen = 0;
+  for (size_t i = 0; i < Size; i++)
+    MaxLen = std::max(MaxLen, std::strlen(Table[i].Key));
+  return MaxLen;
+}
+
+/// Display help for feature choices.
+///
+static void Help(const SubtargetFeatureKV *CPUTable, size_t CPUTableSize,
+                 const SubtargetFeatureKV *FeatTable, size_t FeatTableSize) {
+  // Determine the length of the longest CPU and Feature entries.
+  unsigned MaxCPULen  = getLongestEntryLength(CPUTable, CPUTableSize);
+  unsigned MaxFeatLen = getLongestEntryLength(FeatTable, FeatTableSize);
+
+  // Print the CPU table.
+  errs() << "Available CPUs for this target:\n\n";
+  for (size_t i = 0; i != CPUTableSize; i++)
+    errs() << format("  %-*s - %s.\n",
+                     MaxCPULen, CPUTable[i].Key, CPUTable[i].Desc);
+  errs() << '\n';
+
+  // Print the Feature table.
+  errs() << "Available features for this target:\n\n";
+  for (size_t i = 0; i != FeatTableSize; i++)
+    errs() << format("  %-*s - %s.\n",
+                     MaxFeatLen, FeatTable[i].Key, FeatTable[i].Desc);
+  errs() << '\n';
+
+  errs() << "Use +feature to enable a feature, or -feature to disable it.\n"
+            "For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n";
+  std::exit(1);
+}
+
+//===----------------------------------------------------------------------===//
+//                    SubtargetFeatures Implementation
+//===----------------------------------------------------------------------===//
+
+SubtargetFeatures::SubtargetFeatures(const StringRef Initial) {
+  // Break up string into separate features
+  Split(Features, Initial);
+}
+
+
+std::string SubtargetFeatures::getString() const {
+  return Join(Features);
+}
+
+/// SetImpliedBits - For each feature that is (transitively) implied by this
+/// feature, set it.
+///
+static
+void SetImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+                    const SubtargetFeatureKV *FeatureTable,
+                    size_t FeatureTableSize) {
+  for (size_t i = 0; i < FeatureTableSize; ++i) {
+    const SubtargetFeatureKV &FE = FeatureTable[i];
+
+    if (FeatureEntry->Value == FE.Value) continue;
+
+    if (FeatureEntry->Implies & FE.Value) {
+      Bits |= FE.Value;
+      SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+    }
+  }
+}
+
+/// ClearImpliedBits - For each feature that (transitively) implies this
+/// feature, clear it.
+///
+static
+void ClearImpliedBits(uint64_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+                      const SubtargetFeatureKV *FeatureTable,
+                      size_t FeatureTableSize) {
+  for (size_t i = 0; i < FeatureTableSize; ++i) {
+    const SubtargetFeatureKV &FE = FeatureTable[i];
+
+    if (FeatureEntry->Value == FE.Value) continue;
+
+    if (FE.Implies & FeatureEntry->Value) {
+      Bits &= ~FE.Value;
+      ClearImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+    }
+  }
+}
+
+/// ToggleFeature - Toggle a feature and returns the newly updated feature
+/// bits.
+uint64_t
+SubtargetFeatures::ToggleFeature(uint64_t Bits, const StringRef Feature,
+                                 const SubtargetFeatureKV *FeatureTable,
+                                 size_t FeatureTableSize) {
+  // Find feature in table.
+  const SubtargetFeatureKV *FeatureEntry =
+    Find(StripFlag(Feature), FeatureTable, FeatureTableSize);
+  // If there is a match
+  if (FeatureEntry) {
+    if ((Bits & FeatureEntry->Value) == FeatureEntry->Value) {
+      Bits &= ~FeatureEntry->Value;
+
+      // For each feature that implies this, clear it.
+      ClearImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+    } else {
+      Bits |=  FeatureEntry->Value;
+
+      // For each feature that this implies, set it.
+      SetImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+    }
+  } else {
+    errs() << "'" << Feature
+           << "' is not a recognized feature for this target"
+           << " (ignoring feature)\n";
+  }
+
+  return Bits;
+}
+
+
+/// getFeatureBits - Get feature bits a CPU.
+///
+uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU,
+                                         const SubtargetFeatureKV *CPUTable,
+                                         size_t CPUTableSize,
+                                         const SubtargetFeatureKV *FeatureTable,
+                                         size_t FeatureTableSize) {
+  if (!FeatureTableSize || !CPUTableSize)
+    return 0;
+
+#ifndef NDEBUG
+  for (size_t i = 1; i < CPUTableSize; i++) {
+    assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 &&
+           "CPU table is not sorted");
+  }
+  for (size_t i = 1; i < FeatureTableSize; i++) {
+    assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 &&
+          "CPU features table is not sorted");
+  }
+#endif
+  uint64_t Bits = 0;                    // Resulting bits
+
+  // Check if help is needed
+  if (CPU == "help")
+    Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
+
+  // Find CPU entry if CPU name is specified.
+  if (!CPU.empty()) {
+    const SubtargetFeatureKV *CPUEntry = Find(CPU, CPUTable, CPUTableSize);
+    // If there is a match
+    if (CPUEntry) {
+      // Set base feature bits
+      Bits = CPUEntry->Value;
+
+      // Set the feature implied by this CPU feature, if any.
+      for (size_t i = 0; i < FeatureTableSize; ++i) {
+        const SubtargetFeatureKV &FE = FeatureTable[i];
+        if (CPUEntry->Value & FE.Value)
+          SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+      }
+    } else {
+      errs() << "'" << CPU
+             << "' is not a recognized processor for this target"
+             << " (ignoring processor)\n";
+    }
+  }
+
+  // Iterate through each feature
+  for (size_t i = 0, E = Features.size(); i < E; i++) {
+    const StringRef Feature = Features[i];
+
+    // Check for help
+    if (Feature == "+help")
+      Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
+
+    // Find feature in table.
+    const SubtargetFeatureKV *FeatureEntry =
+                       Find(StripFlag(Feature), FeatureTable, FeatureTableSize);
+    // If there is a match
+    if (FeatureEntry) {
+      // Enable/disable feature in bits
+      if (isEnabled(Feature)) {
+        Bits |=  FeatureEntry->Value;
+
+        // For each feature that this implies, set it.
+        SetImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+      } else {
+        Bits &= ~FeatureEntry->Value;
+
+        // For each feature that implies this, clear it.
+        ClearImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+      }
+    } else {
+      errs() << "'" << Feature
+             << "' is not a recognized feature for this target"
+             << " (ignoring feature)\n";
+    }
+  }
+
+  return Bits;
+}
+
+/// print - Print feature string.
+///
+void SubtargetFeatures::print(raw_ostream &OS) const {
+  for (size_t i = 0, e = Features.size(); i != e; ++i)
+    OS << Features[i] << "  ";
+  OS << "\n";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// dump - Dump feature info.
+///
+void SubtargetFeatures::dump() const {
+  print(dbgs());
+}
+#endif
+
+/// getDefaultSubtargetFeatures - Return a string listing the features
+/// associated with the target triple.
+///
+/// FIXME: This is an inelegant way of specifying the features of a
+/// subtarget. It would be better if we could encode this information
+/// into the IR. See <rdar://5972456>.
+///
+void SubtargetFeatures::getDefaultSubtargetFeatures(const Triple& Triple) {
+  if (Triple.getVendor() == Triple::Apple) {
+    if (Triple.getArch() == Triple::ppc) {
+      // powerpc-apple-*
+      AddFeature("altivec");
+    } else if (Triple.getArch() == Triple::ppc64) {
+      // powerpc64-apple-*
+      AddFeature("64bit");
+      AddFeature("altivec");
+    }
+  }
+}
diff --git a/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp
new file mode 100644
index 000000000000..6dffed73dfb3
--- /dev/null
+++ b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -0,0 +1,903 @@
+//===-- llvm/MC/WinCOFFObjectWriter.cpp -------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of a Win32 COFF object file writer.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "WinCOFFObjectWriter"
+
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TimeValue.h"
+#include <cstdio>
+
+using namespace llvm;
+
+namespace {
+typedef SmallString<COFF::NameSize> name;
+
+enum AuxiliaryType {
+  ATFunctionDefinition,
+  ATbfAndefSymbol,
+  ATWeakExternal,
+  ATFile,
+  ATSectionDefinition
+};
+
+struct AuxSymbol {
+  AuxiliaryType   AuxType;
+  COFF::Auxiliary Aux;
+};
+
+class COFFSymbol;
+class COFFSection;
+
+class COFFSymbol {
+public:
+  COFF::symbol Data;
+
+  typedef SmallVector<AuxSymbol, 1> AuxiliarySymbols;
+
+  name             Name;
+  int              Index;
+  AuxiliarySymbols Aux;
+  COFFSymbol      *Other;
+  COFFSection     *Section;
+  int              Relocations;
+
+  MCSymbolData const *MCData;
+
+  COFFSymbol(StringRef name);
+  size_t size() const;
+  void set_name_offset(uint32_t Offset);
+
+  bool should_keep() const;
+};
+
+// This class contains staging data for a COFF relocation entry.
+struct COFFRelocation {
+  COFF::relocation Data;
+  COFFSymbol          *Symb;
+
+  COFFRelocation() : Symb(NULL) {}
+  static size_t size() { return COFF::RelocationSize; }
+};
+
+typedef std::vector<COFFRelocation> relocations;
+
+class COFFSection {
+public:
+  COFF::section Header;
+
+  std::string          Name;
+  int                  Number;
+  MCSectionData const *MCData;
+  COFFSymbol          *Symbol;
+  relocations          Relocations;
+
+  COFFSection(StringRef name);
+  static size_t size();
+};
+
+// This class holds the COFF string table.
+class StringTable {
+  typedef StringMap<size_t> map;
+  map Map;
+
+  void update_length();
+public:
+  std::vector<char> Data;
+
+  StringTable();
+  size_t size() const;
+  size_t insert(StringRef String);
+};
+
+class WinCOFFObjectWriter : public MCObjectWriter {
+public:
+
+  typedef std::vector<COFFSymbol*>  symbols;
+  typedef std::vector<COFFSection*> sections;
+
+  typedef DenseMap<MCSymbol  const *, COFFSymbol *>   symbol_map;
+  typedef DenseMap<MCSection const *, COFFSection *> section_map;
+
+  llvm::OwningPtr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
+
+  // Root level file contents.
+  COFF::header Header;
+  sections     Sections;
+  symbols      Symbols;
+  StringTable  Strings;
+
+  // Maps used during object file creation.
+  section_map SectionMap;
+  symbol_map  SymbolMap;
+
+  WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW, raw_ostream &OS);
+  ~WinCOFFObjectWriter();
+
+  COFFSymbol *createSymbol(StringRef Name);
+  COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol * Symbol);
+  COFFSection *createSection(StringRef Name);
+
+  template <typename object_t, typename list_t>
+  object_t *createCOFFEntity(StringRef Name, list_t &List);
+
+  void DefineSection(MCSectionData const &SectionData);
+  void DefineSymbol(MCSymbol const &Symbol,
+                    MCSymbolData const &SymbolData,
+                    MCAssembler &Assembler);
+
+  void MakeSymbolReal(COFFSymbol &S, size_t Index);
+  void MakeSectionReal(COFFSection &S, size_t Number);
+
+  bool ExportSection(COFFSection const *S);
+  bool ExportSymbol(MCSymbolData const &SymbolData, MCAssembler &Asm);
+
+  bool IsPhysicalSection(COFFSection *S);
+
+  // Entity writing methods.
+
+  void WriteFileHeader(const COFF::header &Header);
+  void WriteSymbol(const COFFSymbol *S);
+  void WriteAuxiliarySymbols(const COFFSymbol::AuxiliarySymbols &S);
+  void WriteSectionHeader(const COFF::section &S);
+  void WriteRelocation(const COFF::relocation &R);
+
+  // MCObjectWriter interface implementation.
+
+  void ExecutePostLayoutBinding(MCAssembler &Asm, const MCAsmLayout &Layout);
+
+  void RecordRelocation(const MCAssembler &Asm,
+                        const MCAsmLayout &Layout,
+                        const MCFragment *Fragment,
+                        const MCFixup &Fixup,
+                        MCValue Target,
+                        uint64_t &FixedValue);
+
+  void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout);
+};
+}
+
+static inline void write_uint32_le(void *Data, uint32_t const &Value) {
+  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
+  Ptr[0] = (Value & 0x000000FF) >>  0;
+  Ptr[1] = (Value & 0x0000FF00) >>  8;
+  Ptr[2] = (Value & 0x00FF0000) >> 16;
+  Ptr[3] = (Value & 0xFF000000) >> 24;
+}
+
+static inline void write_uint16_le(void *Data, uint16_t const &Value) {
+  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
+  Ptr[0] = (Value & 0x00FF) >> 0;
+  Ptr[1] = (Value & 0xFF00) >> 8;
+}
+
+static inline void write_uint8_le(void *Data, uint8_t const &Value) {
+  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
+  Ptr[0] = (Value & 0xFF) >> 0;
+}
+
+//------------------------------------------------------------------------------
+// Symbol class implementation
+
+COFFSymbol::COFFSymbol(StringRef name)
+  : Name(name.begin(), name.end())
+  , Other(NULL)
+  , Section(NULL)
+  , Relocations(0)
+  , MCData(NULL) {
+  memset(&Data, 0, sizeof(Data));
+}
+
+size_t COFFSymbol::size() const {
+  return COFF::SymbolSize + (Data.NumberOfAuxSymbols * COFF::SymbolSize);
+}
+
+// In the case that the name does not fit within 8 bytes, the offset
+// into the string table is stored in the last 4 bytes instead, leaving
+// the first 4 bytes as 0.
+void COFFSymbol::set_name_offset(uint32_t Offset) {
+  write_uint32_le(Data.Name + 0, 0);
+  write_uint32_le(Data.Name + 4, Offset);
+}
+
+/// logic to decide if the symbol should be reported in the symbol table
+bool COFFSymbol::should_keep() const {
+  // no section means its external, keep it
+  if (Section == NULL)
+    return true;
+
+  // if it has relocations pointing at it, keep it
+  if (Relocations > 0)   {
+    assert(Section->Number != -1 && "Sections with relocations must be real!");
+    return true;
+  }
+
+  // if the section its in is being droped, drop it
+  if (Section->Number == -1)
+      return false;
+
+  // if it is the section symbol, keep it
+  if (Section->Symbol == this)
+    return true;
+
+  // if its temporary, drop it
+  if (MCData && MCData->getSymbol().isTemporary())
+      return false;
+
+  // otherwise, keep it
+  return true;
+}
+
+//------------------------------------------------------------------------------
+// Section class implementation
+
+COFFSection::COFFSection(StringRef name)
+  : Name(name)
+  , MCData(NULL)
+  , Symbol(NULL) {
+  memset(&Header, 0, sizeof(Header));
+}
+
+size_t COFFSection::size() {
+  return COFF::SectionSize;
+}
+
+//------------------------------------------------------------------------------
+// StringTable class implementation
+
+/// Write the length of the string table into Data.
+/// The length of the string table includes uint32 length header.
+void StringTable::update_length() {
+  write_uint32_le(&Data.front(), Data.size());
+}
+
+StringTable::StringTable() {
+  // The string table data begins with the length of the entire string table
+  // including the length header. Allocate space for this header.
+  Data.resize(4);
+  update_length();
+}
+
+size_t StringTable::size() const {
+  return Data.size();
+}
+
+/// Add String to the table iff it is not already there.
+/// @returns the index into the string table where the string is now located.
+size_t StringTable::insert(StringRef String) {
+  map::iterator i = Map.find(String);
+
+  if (i != Map.end())
+    return i->second;
+
+  size_t Offset = Data.size();
+
+  // Insert string data into string table.
+  Data.insert(Data.end(), String.begin(), String.end());
+  Data.push_back('\0');
+
+  // Put a reference to it in the map.
+  Map[String] = Offset;
+
+  // Update the internal length field.
+  update_length();
+
+  return Offset;
+}
+
+//------------------------------------------------------------------------------
+// WinCOFFObjectWriter class implementation
+
+WinCOFFObjectWriter::WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW,
+                                         raw_ostream &OS)
+  : MCObjectWriter(OS, true)
+  , TargetObjectWriter(MOTW) {
+  memset(&Header, 0, sizeof(Header));
+
+  Header.Machine = TargetObjectWriter->getMachine();
+}
+
+WinCOFFObjectWriter::~WinCOFFObjectWriter() {
+  for (symbols::iterator I = Symbols.begin(), E = Symbols.end(); I != E; ++I)
+    delete *I;
+  for (sections::iterator I = Sections.begin(), E = Sections.end(); I != E; ++I)
+    delete *I;
+}
+
+COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) {
+  return createCOFFEntity<COFFSymbol>(Name, Symbols);
+}
+
+COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol * Symbol){
+  symbol_map::iterator i = SymbolMap.find(Symbol);
+  if (i != SymbolMap.end())
+    return i->second;
+  COFFSymbol *RetSymbol
+    = createCOFFEntity<COFFSymbol>(Symbol->getName(), Symbols);
+  SymbolMap[Symbol] = RetSymbol;
+  return RetSymbol;
+}
+
+COFFSection *WinCOFFObjectWriter::createSection(StringRef Name) {
+  return createCOFFEntity<COFFSection>(Name, Sections);
+}
+
+/// A template used to lookup or create a symbol/section, and initialize it if
+/// needed.
+template <typename object_t, typename list_t>
+object_t *WinCOFFObjectWriter::createCOFFEntity(StringRef Name,
+                                                list_t &List) {
+  object_t *Object = new object_t(Name);
+
+  List.push_back(Object);
+
+  return Object;
+}
+
+/// This function takes a section data object from the assembler
+/// and creates the associated COFF section staging object.
+void WinCOFFObjectWriter::DefineSection(MCSectionData const &SectionData) {
+  assert(SectionData.getSection().getVariant() == MCSection::SV_COFF
+    && "Got non COFF section in the COFF backend!");
+  // FIXME: Not sure how to verify this (at least in a debug build).
+  MCSectionCOFF const &Sec =
+    static_cast<MCSectionCOFF const &>(SectionData.getSection());
+
+  COFFSection *coff_section = createSection(Sec.getSectionName());
+  COFFSymbol  *coff_symbol = createSymbol(Sec.getSectionName());
+
+  coff_section->Symbol = coff_symbol;
+  coff_symbol->Section = coff_section;
+  coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
+
+  // In this case the auxiliary symbol is a Section Definition.
+  coff_symbol->Aux.resize(1);
+  memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
+  coff_symbol->Aux[0].AuxType = ATSectionDefinition;
+  coff_symbol->Aux[0].Aux.SectionDefinition.Selection = Sec.getSelection();
+
+  coff_section->Header.Characteristics = Sec.getCharacteristics();
+
+  uint32_t &Characteristics = coff_section->Header.Characteristics;
+  switch (SectionData.getAlignment()) {
+  case 1:    Characteristics |= COFF::IMAGE_SCN_ALIGN_1BYTES;    break;
+  case 2:    Characteristics |= COFF::IMAGE_SCN_ALIGN_2BYTES;    break;
+  case 4:    Characteristics |= COFF::IMAGE_SCN_ALIGN_4BYTES;    break;
+  case 8:    Characteristics |= COFF::IMAGE_SCN_ALIGN_8BYTES;    break;
+  case 16:   Characteristics |= COFF::IMAGE_SCN_ALIGN_16BYTES;   break;
+  case 32:   Characteristics |= COFF::IMAGE_SCN_ALIGN_32BYTES;   break;
+  case 64:   Characteristics |= COFF::IMAGE_SCN_ALIGN_64BYTES;   break;
+  case 128:  Characteristics |= COFF::IMAGE_SCN_ALIGN_128BYTES;  break;
+  case 256:  Characteristics |= COFF::IMAGE_SCN_ALIGN_256BYTES;  break;
+  case 512:  Characteristics |= COFF::IMAGE_SCN_ALIGN_512BYTES;  break;
+  case 1024: Characteristics |= COFF::IMAGE_SCN_ALIGN_1024BYTES; break;
+  case 2048: Characteristics |= COFF::IMAGE_SCN_ALIGN_2048BYTES; break;
+  case 4096: Characteristics |= COFF::IMAGE_SCN_ALIGN_4096BYTES; break;
+  case 8192: Characteristics |= COFF::IMAGE_SCN_ALIGN_8192BYTES; break;
+  default:
+    llvm_unreachable("unsupported section alignment");
+  }
+
+  // Bind internal COFF section to MC section.
+  coff_section->MCData = &SectionData;
+  SectionMap[&SectionData.getSection()] = coff_section;
+}
+
+/// This function takes a section data object from the assembler
+/// and creates the associated COFF symbol staging object.
+void WinCOFFObjectWriter::DefineSymbol(MCSymbol const &Symbol,
+                                       MCSymbolData const &SymbolData,
+                                       MCAssembler &Assembler) {
+  COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol);
+
+  coff_symbol->Data.Type         = (SymbolData.getFlags() & 0x0000FFFF) >>  0;
+  coff_symbol->Data.StorageClass = (SymbolData.getFlags() & 0x00FF0000) >> 16;
+
+  if (SymbolData.getFlags() & COFF::SF_WeakExternal) {
+    coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
+
+    if (Symbol.isVariable()) {
+      coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
+
+      // FIXME: This assert message isn't very good.
+      assert(Symbol.getVariableValue()->getKind() == MCExpr::SymbolRef &&
+              "Value must be a SymbolRef!");
+
+      coff_symbol->Other = GetOrCreateCOFFSymbol(&Symbol);
+    } else {
+      std::string WeakName = std::string(".weak.")
+                           +  Symbol.getName().str()
+                           + ".default";
+      COFFSymbol *WeakDefault = createSymbol(WeakName);
+      WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
+      WeakDefault->Data.StorageClass  = COFF::IMAGE_SYM_CLASS_EXTERNAL;
+      WeakDefault->Data.Type          = 0;
+      WeakDefault->Data.Value         = 0;
+      coff_symbol->Other = WeakDefault;
+    }
+
+    // Setup the Weak External auxiliary symbol.
+    coff_symbol->Aux.resize(1);
+    memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
+    coff_symbol->Aux[0].AuxType = ATWeakExternal;
+    coff_symbol->Aux[0].Aux.WeakExternal.TagIndex = 0;
+    coff_symbol->Aux[0].Aux.WeakExternal.Characteristics =
+      COFF::IMAGE_WEAK_EXTERN_SEARCH_LIBRARY;
+  }
+
+  // If no storage class was specified in the streamer, define it here.
+  if (coff_symbol->Data.StorageClass == 0) {
+    bool external = SymbolData.isExternal() || (SymbolData.Fragment == NULL);
+
+    coff_symbol->Data.StorageClass =
+      external ? COFF::IMAGE_SYM_CLASS_EXTERNAL : COFF::IMAGE_SYM_CLASS_STATIC;
+  }
+
+  if (SymbolData.Fragment != NULL)
+    coff_symbol->Section =
+      SectionMap[&SymbolData.Fragment->getParent()->getSection()];
+
+  // Bind internal COFF symbol to MC symbol.
+  coff_symbol->MCData = &SymbolData;
+  SymbolMap[&Symbol] = coff_symbol;
+}
+
+/// making a section real involves assigned it a number and putting
+/// name into the string table if needed
+void WinCOFFObjectWriter::MakeSectionReal(COFFSection &S, size_t Number) {
+  if (S.Name.size() > COFF::NameSize) {
+    size_t StringTableEntry = Strings.insert(S.Name.c_str());
+
+    // FIXME: Why is this number 999999? This number is never mentioned in the
+    // spec. I'm assuming this is due to the printed value needing to fit into
+    // the S.Header.Name field. In which case why not 9999999 (7 9's instead of
+    // 6)? The spec does not state if this entry should be null terminated in
+    // this case, and thus this seems to be the best way to do it. I think I
+    // just solved my own FIXME...
+    if (StringTableEntry > 999999)
+      report_fatal_error("COFF string table is greater than 999999 bytes.");
+
+    std::sprintf(S.Header.Name, "/%d", unsigned(StringTableEntry));
+  } else
+    std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size());
+
+  S.Number = Number;
+  S.Symbol->Data.SectionNumber = S.Number;
+  S.Symbol->Aux[0].Aux.SectionDefinition.Number = S.Number;
+}
+
+void WinCOFFObjectWriter::MakeSymbolReal(COFFSymbol &S, size_t Index) {
+  if (S.Name.size() > COFF::NameSize) {
+    size_t StringTableEntry = Strings.insert(S.Name.c_str());
+
+    S.set_name_offset(StringTableEntry);
+  } else
+    std::memcpy(S.Data.Name, S.Name.c_str(), S.Name.size());
+  S.Index = Index;
+}
+
+bool WinCOFFObjectWriter::ExportSection(COFFSection const *S) {
+  return !S->MCData->getFragmentList().empty();
+}
+
+bool WinCOFFObjectWriter::ExportSymbol(MCSymbolData const &SymbolData,
+                                       MCAssembler &Asm) {
+  // This doesn't seem to be right. Strings referred to from the .data section
+  // need symbols so they can be linked to code in the .text section right?
+
+  // return Asm.isSymbolLinkerVisible (&SymbolData);
+
+  // For now, all non-variable symbols are exported,
+  // the linker will sort the rest out for us.
+  return SymbolData.isExternal() || !SymbolData.getSymbol().isVariable();
+}
+
+bool WinCOFFObjectWriter::IsPhysicalSection(COFFSection *S) {
+  return (S->Header.Characteristics
+         & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) == 0;
+}
+
+//------------------------------------------------------------------------------
+// entity writing methods
+
+void WinCOFFObjectWriter::WriteFileHeader(const COFF::header &Header) {
+  WriteLE16(Header.Machine);
+  WriteLE16(Header.NumberOfSections);
+  WriteLE32(Header.TimeDateStamp);
+  WriteLE32(Header.PointerToSymbolTable);
+  WriteLE32(Header.NumberOfSymbols);
+  WriteLE16(Header.SizeOfOptionalHeader);
+  WriteLE16(Header.Characteristics);
+}
+
+void WinCOFFObjectWriter::WriteSymbol(const COFFSymbol *S) {
+  WriteBytes(StringRef(S->Data.Name, COFF::NameSize));
+  WriteLE32(S->Data.Value);
+  WriteLE16(S->Data.SectionNumber);
+  WriteLE16(S->Data.Type);
+  Write8(S->Data.StorageClass);
+  Write8(S->Data.NumberOfAuxSymbols);
+  WriteAuxiliarySymbols(S->Aux);
+}
+
+void WinCOFFObjectWriter::WriteAuxiliarySymbols(
+                                        const COFFSymbol::AuxiliarySymbols &S) {
+  for(COFFSymbol::AuxiliarySymbols::const_iterator i = S.begin(), e = S.end();
+      i != e; ++i) {
+    switch(i->AuxType) {
+    case ATFunctionDefinition:
+      WriteLE32(i->Aux.FunctionDefinition.TagIndex);
+      WriteLE32(i->Aux.FunctionDefinition.TotalSize);
+      WriteLE32(i->Aux.FunctionDefinition.PointerToLinenumber);
+      WriteLE32(i->Aux.FunctionDefinition.PointerToNextFunction);
+      WriteZeros(sizeof(i->Aux.FunctionDefinition.unused));
+      break;
+    case ATbfAndefSymbol:
+      WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused1));
+      WriteLE16(i->Aux.bfAndefSymbol.Linenumber);
+      WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused2));
+      WriteLE32(i->Aux.bfAndefSymbol.PointerToNextFunction);
+      WriteZeros(sizeof(i->Aux.bfAndefSymbol.unused3));
+      break;
+    case ATWeakExternal:
+      WriteLE32(i->Aux.WeakExternal.TagIndex);
+      WriteLE32(i->Aux.WeakExternal.Characteristics);
+      WriteZeros(sizeof(i->Aux.WeakExternal.unused));
+      break;
+    case ATFile:
+      WriteBytes(StringRef(reinterpret_cast<const char *>(i->Aux.File.FileName),
+                 sizeof(i->Aux.File.FileName)));
+      break;
+    case ATSectionDefinition:
+      WriteLE32(i->Aux.SectionDefinition.Length);
+      WriteLE16(i->Aux.SectionDefinition.NumberOfRelocations);
+      WriteLE16(i->Aux.SectionDefinition.NumberOfLinenumbers);
+      WriteLE32(i->Aux.SectionDefinition.CheckSum);
+      WriteLE16(i->Aux.SectionDefinition.Number);
+      Write8(i->Aux.SectionDefinition.Selection);
+      WriteZeros(sizeof(i->Aux.SectionDefinition.unused));
+      break;
+    }
+  }
+}
+
+void WinCOFFObjectWriter::WriteSectionHeader(const COFF::section &S) {
+  WriteBytes(StringRef(S.Name, COFF::NameSize));
+
+  WriteLE32(S.VirtualSize);
+  WriteLE32(S.VirtualAddress);
+  WriteLE32(S.SizeOfRawData);
+  WriteLE32(S.PointerToRawData);
+  WriteLE32(S.PointerToRelocations);
+  WriteLE32(S.PointerToLineNumbers);
+  WriteLE16(S.NumberOfRelocations);
+  WriteLE16(S.NumberOfLineNumbers);
+  WriteLE32(S.Characteristics);
+}
+
+void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
+  WriteLE32(R.VirtualAddress);
+  WriteLE32(R.SymbolTableIndex);
+  WriteLE16(R.Type);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// MCObjectWriter interface implementations
+
+void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
+                                                   const MCAsmLayout &Layout) {
+  // "Define" each section & symbol. This creates section & symbol
+  // entries in the staging area.
+
+  for (MCAssembler::const_iterator i = Asm.begin(), e = Asm.end(); i != e; i++)
+    DefineSection(*i);
+
+  for (MCAssembler::const_symbol_iterator i = Asm.symbol_begin(),
+                                          e = Asm.symbol_end(); i != e; i++) {
+    if (ExportSymbol(*i, Asm)) {
+      const MCSymbol &Alias = i->getSymbol();
+      const MCSymbol &Symbol = Alias.AliasedSymbol();
+      DefineSymbol(Alias, Asm.getSymbolData(Symbol), Asm);
+    }
+  }
+}
+
+void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
+                                           const MCAsmLayout &Layout,
+                                           const MCFragment *Fragment,
+                                           const MCFixup &Fixup,
+                                           MCValue Target,
+                                           uint64_t &FixedValue) {
+  assert(Target.getSymA() != NULL && "Relocation must reference a symbol!");
+
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  MCSymbolData &A_SD = Asm.getSymbolData(*A);
+
+  MCSectionData const *SectionData = Fragment->getParent();
+
+  // Mark this symbol as requiring an entry in the symbol table.
+  assert(SectionMap.find(&SectionData->getSection()) != SectionMap.end() &&
+         "Section must already have been defined in ExecutePostLayoutBinding!");
+  assert(SymbolMap.find(&A_SD.getSymbol()) != SymbolMap.end() &&
+         "Symbol must already have been defined in ExecutePostLayoutBinding!");
+
+  COFFSection *coff_section = SectionMap[&SectionData->getSection()];
+  COFFSymbol *coff_symbol = SymbolMap[&A_SD.getSymbol()];
+  const MCSymbolRefExpr *SymA = Target.getSymA();
+  const MCSymbolRefExpr *SymB = Target.getSymB();
+  const bool CrossSection = SymB &&
+    &SymA->getSymbol().getSection() != &SymB->getSymbol().getSection();
+
+  if (Target.getSymB()) {
+    const MCSymbol *B = &Target.getSymB()->getSymbol();
+    MCSymbolData &B_SD = Asm.getSymbolData(*B);
+
+    // Offset of the symbol in the section
+    int64_t a = Layout.getSymbolOffset(&B_SD);
+
+    // Ofeset of the relocation in the section
+    int64_t b = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+
+    FixedValue = b - a;
+    // In the case where we have SymbA and SymB, we just need to store the delta
+    // between the two symbols.  Update FixedValue to account for the delta, and
+    // skip recording the relocation.
+    if (!CrossSection)
+      return;
+  } else {
+    FixedValue = Target.getConstant();
+  }
+
+  COFFRelocation Reloc;
+
+  Reloc.Data.SymbolTableIndex = 0;
+  Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment);
+
+  // Turn relocations for temporary symbols into section relocations.
+  if (coff_symbol->MCData->getSymbol().isTemporary() || CrossSection) {
+    Reloc.Symb = coff_symbol->Section->Symbol;
+    FixedValue += Layout.getFragmentOffset(coff_symbol->MCData->Fragment)
+                + coff_symbol->MCData->getOffset();
+  } else
+    Reloc.Symb = coff_symbol;
+
+  ++Reloc.Symb->Relocations;
+
+  Reloc.Data.VirtualAddress += Fixup.getOffset();
+
+  unsigned FixupKind = Fixup.getKind();
+
+  if (CrossSection)
+    FixupKind = FK_PCRel_4;
+
+  Reloc.Data.Type = TargetObjectWriter->getRelocType(FixupKind);
+
+  // FIXME: Can anyone explain what this does other than adjust for the size
+  // of the offset?
+  if (Reloc.Data.Type == COFF::IMAGE_REL_AMD64_REL32 ||
+      Reloc.Data.Type == COFF::IMAGE_REL_I386_REL32)
+    FixedValue += 4;
+
+  coff_section->Relocations.push_back(Reloc);
+}
+
+void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
+                                      const MCAsmLayout &Layout) {
+  // Assign symbol and section indexes and offsets.
+  Header.NumberOfSections = 0;
+
+  for (sections::iterator i = Sections.begin(),
+                          e = Sections.end(); i != e; i++) {
+    if (Layout.getSectionAddressSize((*i)->MCData) > 0) {
+      MakeSectionReal(**i, ++Header.NumberOfSections);
+    } else {
+      (*i)->Number = -1;
+    }
+  }
+
+  Header.NumberOfSymbols = 0;
+
+  for (symbols::iterator i = Symbols.begin(), e = Symbols.end(); i != e; i++) {
+    COFFSymbol *coff_symbol = *i;
+    MCSymbolData const *SymbolData = coff_symbol->MCData;
+
+    // Update section number & offset for symbols that have them.
+    if ((SymbolData != NULL) && (SymbolData->Fragment != NULL)) {
+      assert(coff_symbol->Section != NULL);
+
+      coff_symbol->Data.SectionNumber = coff_symbol->Section->Number;
+      coff_symbol->Data.Value = Layout.getFragmentOffset(SymbolData->Fragment)
+                              + SymbolData->Offset;
+    }
+
+    if (coff_symbol->should_keep()) {
+      MakeSymbolReal(*coff_symbol, Header.NumberOfSymbols++);
+
+      // Update auxiliary symbol info.
+      coff_symbol->Data.NumberOfAuxSymbols = coff_symbol->Aux.size();
+      Header.NumberOfSymbols += coff_symbol->Data.NumberOfAuxSymbols;
+    } else
+      coff_symbol->Index = -1;
+  }
+
+  // Fixup weak external references.
+  for (symbols::iterator i = Symbols.begin(), e = Symbols.end(); i != e; i++) {
+    COFFSymbol *coff_symbol = *i;
+    if (coff_symbol->Other != NULL) {
+      assert(coff_symbol->Index != -1);
+      assert(coff_symbol->Aux.size() == 1 &&
+             "Symbol must contain one aux symbol!");
+      assert(coff_symbol->Aux[0].AuxType == ATWeakExternal &&
+             "Symbol's aux symbol must be a Weak External!");
+      coff_symbol->Aux[0].Aux.WeakExternal.TagIndex = coff_symbol->Other->Index;
+    }
+  }
+
+  // Assign file offsets to COFF object file structures.
+
+  unsigned offset = 0;
+
+  offset += COFF::HeaderSize;
+  offset += COFF::SectionSize * Header.NumberOfSections;
+
+  for (MCAssembler::const_iterator i = Asm.begin(),
+                                   e = Asm.end();
+                                   i != e; i++) {
+    COFFSection *Sec = SectionMap[&i->getSection()];
+
+    if (Sec->Number == -1)
+      continue;
+
+    Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(i);
+
+    if (IsPhysicalSection(Sec)) {
+      Sec->Header.PointerToRawData = offset;
+
+      offset += Sec->Header.SizeOfRawData;
+    }
+
+    if (Sec->Relocations.size() > 0) {
+      bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
+
+      if (RelocationsOverflow) {
+        // Signal overflow by setting NumberOfSections to max value. Actual
+        // size is found in reloc #0. Microsoft tools understand this.
+        Sec->Header.NumberOfRelocations = 0xffff;
+      } else {
+        Sec->Header.NumberOfRelocations = Sec->Relocations.size();
+      }
+      Sec->Header.PointerToRelocations = offset;
+
+      if (RelocationsOverflow) {
+        // Reloc #0 will contain actual count, so make room for it.
+        offset += COFF::RelocationSize;
+      }
+
+      offset += COFF::RelocationSize * Sec->Relocations.size();
+
+      for (relocations::iterator cr = Sec->Relocations.begin(),
+                                 er = Sec->Relocations.end();
+                                 cr != er; ++cr) {
+        assert((*cr).Symb->Index != -1);
+        (*cr).Data.SymbolTableIndex = (*cr).Symb->Index;
+      }
+    }
+
+    assert(Sec->Symbol->Aux.size() == 1
+      && "Section's symbol must have one aux!");
+    AuxSymbol &Aux = Sec->Symbol->Aux[0];
+    assert(Aux.AuxType == ATSectionDefinition &&
+           "Section's symbol's aux symbol must be a Section Definition!");
+    Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData;
+    Aux.Aux.SectionDefinition.NumberOfRelocations =
+                                                Sec->Header.NumberOfRelocations;
+    Aux.Aux.SectionDefinition.NumberOfLinenumbers =
+                                                Sec->Header.NumberOfLineNumbers;
+  }
+
+  Header.PointerToSymbolTable = offset;
+
+  Header.TimeDateStamp = sys::TimeValue::now().toEpochTime();
+
+  // Write it all to disk...
+  WriteFileHeader(Header);
+
+  {
+    sections::iterator i, ie;
+    MCAssembler::const_iterator j, je;
+
+    for (i = Sections.begin(), ie = Sections.end(); i != ie; i++)
+      if ((*i)->Number != -1) {
+        if ((*i)->Relocations.size() >= 0xffff) {
+          (*i)->Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
+        }
+        WriteSectionHeader((*i)->Header);
+      }
+
+    for (i = Sections.begin(), ie = Sections.end(),
+         j = Asm.begin(), je = Asm.end();
+         (i != ie) && (j != je); ++i, ++j) {
+
+      if ((*i)->Number == -1)
+        continue;
+
+      if ((*i)->Header.PointerToRawData != 0) {
+        assert(OS.tell() == (*i)->Header.PointerToRawData &&
+               "Section::PointerToRawData is insane!");
+
+        Asm.writeSectionData(j, Layout);
+      }
+
+      if ((*i)->Relocations.size() > 0) {
+        assert(OS.tell() == (*i)->Header.PointerToRelocations &&
+               "Section::PointerToRelocations is insane!");
+
+        if ((*i)->Relocations.size() >= 0xffff) {
+          // In case of overflow, write actual relocation count as first
+          // relocation. Including the synthetic reloc itself (+ 1).
+          COFF::relocation r;
+          r.VirtualAddress = (*i)->Relocations.size() + 1;
+          r.SymbolTableIndex = 0;
+          r.Type = 0;
+          WriteRelocation(r);
+        }
+
+        for (relocations::const_iterator k = (*i)->Relocations.begin(),
+                                               ke = (*i)->Relocations.end();
+                                               k != ke; k++) {
+          WriteRelocation(k->Data);
+        }
+      } else
+        assert((*i)->Header.PointerToRelocations == 0 &&
+               "Section::PointerToRelocations is insane!");
+    }
+  }
+
+  assert(OS.tell() == Header.PointerToSymbolTable &&
+         "Header::PointerToSymbolTable is insane!");
+
+  for (symbols::iterator i = Symbols.begin(), e = Symbols.end(); i != e; i++)
+    if ((*i)->Index != -1)
+      WriteSymbol(*i);
+
+  OS.write((char const *)&Strings.Data.front(), Strings.Data.size());
+}
+
+MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) :
+  Machine(Machine_) {
+}
+
+//------------------------------------------------------------------------------
+// WinCOFFObjectWriter factory function
+
+namespace llvm {
+  MCObjectWriter *createWinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW,
+                                            raw_ostream &OS) {
+    return new WinCOFFObjectWriter(MOTW, OS);
+  }
+}
diff --git a/contrib/llvm/lib/MC/WinCOFFStreamer.cpp b/contrib/llvm/lib/MC/WinCOFFStreamer.cpp
new file mode 100644
index 000000000000..75f343c421bb
--- /dev/null
+++ b/contrib/llvm/lib/MC/WinCOFFStreamer.cpp
@@ -0,0 +1,336 @@
+//===-- llvm/MC/WinCOFFStreamer.cpp -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of a Win32 COFF object file streamer.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "WinCOFFStreamer"
+
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class WinCOFFStreamer : public MCObjectStreamer {
+public:
+  MCSymbol const *CurSymbol;
+
+  WinCOFFStreamer(MCContext &Context,
+                  MCAsmBackend &MAB,
+                  MCCodeEmitter &CE,
+                  raw_ostream &OS);
+
+  void AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                       unsigned ByteAlignment, bool External);
+
+  // MCStreamer interface
+
+  virtual void InitSections();
+  virtual void InitToTextSection();
+  virtual void EmitLabel(MCSymbol *Symbol);
+  virtual void EmitDebugLabel(MCSymbol *Symbol);
+  virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
+  virtual void EmitThumbFunc(MCSymbol *Func);
+  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+  virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
+  virtual void BeginCOFFSymbolDef(MCSymbol const *Symbol);
+  virtual void EmitCOFFSymbolStorageClass(int StorageClass);
+  virtual void EmitCOFFSymbolType(int Type);
+  virtual void EndCOFFSymbolDef();
+  virtual void EmitCOFFSecRel32(MCSymbol const *Symbol);
+  virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value);
+  virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                unsigned ByteAlignment);
+  virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                     unsigned ByteAlignment);
+  virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+                            uint64_t Size,unsigned ByteAlignment);
+  virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                              uint64_t Size, unsigned ByteAlignment);
+  virtual void EmitFileDirective(StringRef Filename);
+  virtual void EmitWin64EHHandlerData();
+  virtual void FinishImpl();
+
+  static bool classof(const MCStreamer *S) {
+    return S->getKind() == SK_WinCOFFStreamer;
+  }
+
+private:
+  virtual void EmitInstToData(const MCInst &Inst) {
+    MCDataFragment *DF = getOrCreateDataFragment();
+
+    SmallVector<MCFixup, 4> Fixups;
+    SmallString<256> Code;
+    raw_svector_ostream VecOS(Code);
+    getAssembler().getEmitter().EncodeInstruction(Inst, VecOS, Fixups);
+    VecOS.flush();
+
+    // Add the fixups and data.
+    for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
+      Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
+      DF->getFixups().push_back(Fixups[i]);
+    }
+    DF->getContents().append(Code.begin(), Code.end());
+  }
+
+  void SetSection(StringRef Section,
+                  unsigned Characteristics,
+                  SectionKind Kind) {
+    SwitchSection(getContext().getCOFFSection(Section, Characteristics, Kind));
+  }
+
+  void SetSectionText() {
+    SetSection(".text",
+               COFF::IMAGE_SCN_CNT_CODE
+             | COFF::IMAGE_SCN_MEM_EXECUTE
+             | COFF::IMAGE_SCN_MEM_READ,
+               SectionKind::getText());
+    EmitCodeAlignment(4, 0);
+  }
+
+  void SetSectionData() {
+    SetSection(".data",
+               COFF::IMAGE_SCN_CNT_INITIALIZED_DATA
+             | COFF::IMAGE_SCN_MEM_READ
+             | COFF::IMAGE_SCN_MEM_WRITE,
+               SectionKind::getDataRel());
+    EmitCodeAlignment(4, 0);
+  }
+
+  void SetSectionBSS() {
+    SetSection(".bss",
+               COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA
+             | COFF::IMAGE_SCN_MEM_READ
+             | COFF::IMAGE_SCN_MEM_WRITE,
+               SectionKind::getBSS());
+    EmitCodeAlignment(4, 0);
+  }
+};
+} // end anonymous namespace.
+
+WinCOFFStreamer::WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                 MCCodeEmitter &CE, raw_ostream &OS)
+    : MCObjectStreamer(SK_WinCOFFStreamer, Context, MAB, OS, &CE),
+      CurSymbol(NULL) {}
+
+void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                      unsigned ByteAlignment, bool External) {
+  assert(!Symbol->isInSection() && "Symbol must not already have a section!");
+
+  std::string SectionName(".bss$linkonce");
+  SectionName.append(Symbol->getName().begin(), Symbol->getName().end());
+
+  MCSymbolData &SymbolData = getAssembler().getOrCreateSymbolData(*Symbol);
+
+  unsigned Characteristics =
+    COFF::IMAGE_SCN_LNK_COMDAT |
+    COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
+    COFF::IMAGE_SCN_MEM_READ |
+    COFF::IMAGE_SCN_MEM_WRITE;
+
+  int Selection = COFF::IMAGE_COMDAT_SELECT_LARGEST;
+
+  const MCSection *Section = MCStreamer::getContext().getCOFFSection(
+    SectionName, Characteristics, Selection, SectionKind::getBSS());
+
+  MCSectionData &SectionData = getAssembler().getOrCreateSectionData(*Section);
+
+  if (SectionData.getAlignment() < ByteAlignment)
+    SectionData.setAlignment(ByteAlignment);
+
+  SymbolData.setExternal(External);
+
+  Symbol->setSection(*Section);
+
+  if (ByteAlignment != 1)
+      new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, &SectionData);
+
+  SymbolData.setFragment(new MCFillFragment(0, 0, Size, &SectionData));
+}
+
+// MCStreamer interface
+
+void WinCOFFStreamer::InitToTextSection() {
+  SetSectionText();
+}
+
+void WinCOFFStreamer::InitSections() {
+  SetSectionText();
+  SetSectionData();
+  SetSectionBSS();
+  SetSectionText();
+}
+
+void WinCOFFStreamer::EmitLabel(MCSymbol *Symbol) {
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+  MCObjectStreamer::EmitLabel(Symbol);
+}
+
+void WinCOFFStreamer::EmitDebugLabel(MCSymbol *Symbol) {
+  EmitLabel(Symbol);
+}
+void WinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  llvm_unreachable("not implemented");
+}
+
+void WinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) {
+  llvm_unreachable("not implemented");
+}
+
+void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+                                          MCSymbolAttr Attribute) {
+  assert(Symbol && "Symbol must be non-null!");
+  assert((Symbol->isInSection()
+         ? Symbol->getSection().getVariant() == MCSection::SV_COFF
+         : true) && "Got non COFF section in the COFF backend!");
+  switch (Attribute) {
+  case MCSA_WeakReference:
+  case MCSA_Weak: {
+      MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+      SD.modifyFlags(COFF::SF_WeakExternal, COFF::SF_WeakExternal);
+      SD.setExternal(true);
+    }
+    break;
+
+  case MCSA_Global:
+    getAssembler().getOrCreateSymbolData(*Symbol).setExternal(true);
+    break;
+
+  default:
+    llvm_unreachable("unsupported attribute");
+  }
+}
+
+void WinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+  llvm_unreachable("not implemented");
+}
+
+void WinCOFFStreamer::BeginCOFFSymbolDef(MCSymbol const *Symbol) {
+  assert((Symbol->isInSection()
+         ? Symbol->getSection().getVariant() == MCSection::SV_COFF
+         : true) && "Got non COFF section in the COFF backend!");
+  assert(CurSymbol == NULL && "EndCOFFSymbolDef must be called between calls "
+                              "to BeginCOFFSymbolDef!");
+  CurSymbol = Symbol;
+}
+
+void WinCOFFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {
+  assert(CurSymbol != NULL && "BeginCOFFSymbolDef must be called first!");
+  assert((StorageClass & ~0xFF) == 0 && "StorageClass must only have data in "
+                                        "the first byte!");
+
+  getAssembler().getOrCreateSymbolData(*CurSymbol).modifyFlags(
+    StorageClass << COFF::SF_ClassShift,
+    COFF::SF_ClassMask);
+}
+
+void WinCOFFStreamer::EmitCOFFSymbolType(int Type) {
+  assert(CurSymbol != NULL && "BeginCOFFSymbolDef must be called first!");
+  assert((Type & ~0xFFFF) == 0 && "Type must only have data in the first 2 "
+                                  "bytes");
+
+  getAssembler().getOrCreateSymbolData(*CurSymbol).modifyFlags(
+    Type << COFF::SF_TypeShift,
+    COFF::SF_TypeMask);
+}
+
+void WinCOFFStreamer::EndCOFFSymbolDef() {
+  assert(CurSymbol != NULL && "BeginCOFFSymbolDef must be called first!");
+  CurSymbol = NULL;
+}
+
+void WinCOFFStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol)
+{
+  MCDataFragment *DF = getOrCreateDataFragment();
+
+  DF->getFixups().push_back(
+      MCFixup::Create(DF->getContents().size(),
+                      MCSymbolRefExpr::Create (Symbol, getContext ()),
+                      FK_SecRel_4));
+  DF->getContents().resize(DF->getContents().size() + 4, 0);
+}
+
+void WinCOFFStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
+  llvm_unreachable("not implemented");
+}
+
+void WinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                       unsigned ByteAlignment) {
+  assert((Symbol->isInSection()
+         ? Symbol->getSection().getVariant() == MCSection::SV_COFF
+         : true) && "Got non COFF section in the COFF backend!");
+  AddCommonSymbol(Symbol, Size, ByteAlignment, true);
+}
+
+void WinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                            unsigned ByteAlignment) {
+  assert((Symbol->isInSection()
+         ? Symbol->getSection().getVariant() == MCSection::SV_COFF
+         : true) && "Got non COFF section in the COFF backend!");
+  AddCommonSymbol(Symbol, Size, ByteAlignment, false);
+}
+
+void WinCOFFStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+                                   uint64_t Size,unsigned ByteAlignment) {
+  llvm_unreachable("not implemented");
+}
+
+void WinCOFFStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                                     uint64_t Size, unsigned ByteAlignment) {
+  llvm_unreachable("not implemented");
+}
+
+void WinCOFFStreamer::EmitFileDirective(StringRef Filename) {
+  // Ignore for now, linkers don't care, and proper debug
+  // info will be a much large effort.
+}
+
+void WinCOFFStreamer::EmitWin64EHHandlerData() {
+  MCStreamer::EmitWin64EHHandlerData();
+
+  // We have to emit the unwind info now, because this directive
+  // actually switches to the .xdata section!
+  MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo());
+}
+
+void WinCOFFStreamer::FinishImpl() {
+  EmitW64Tables();
+  MCObjectStreamer::FinishImpl();
+}
+
+namespace llvm
+{
+  MCStreamer *createWinCOFFStreamer(MCContext &Context,
+                                    MCAsmBackend &MAB,
+                                    MCCodeEmitter &CE,
+                                    raw_ostream &OS,
+                                    bool RelaxAll) {
+    WinCOFFStreamer *S = new WinCOFFStreamer(Context, MAB, CE, OS);
+    S->getAssembler().setRelaxAll(RelaxAll);
+    return S;
+  }
+}
diff --git a/contrib/llvm/lib/Object/Archive.cpp b/contrib/llvm/lib/Object/Archive.cpp
new file mode 100644
index 000000000000..0e13d0540fa6
--- /dev/null
+++ b/contrib/llvm/lib/Object/Archive.cpp
@@ -0,0 +1,301 @@
+//===- Archive.cpp - ar File Format implementation --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ArchiveObjectFile class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/Archive.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+using namespace object;
+
+static const char *Magic = "!<arch>\n";
+
+static bool isInternalMember(const ArchiveMemberHeader &amh) {
+  static const char *const internals[] = {
+    "/",
+    "//",
+    "#_LLVM_SYM_TAB_#"
+  };
+
+  StringRef name = amh.getName();
+  for (std::size_t i = 0; i < sizeof(internals) / sizeof(*internals); ++i) {
+    if (name == internals[i])
+      return true;
+  }
+  return false;
+}
+
+void Archive::anchor() { }
+
+error_code Archive::Child::getName(StringRef &Result) const {
+  StringRef name = ToHeader(Data.data())->getName();
+  // Check if it's a special name.
+  if (name[0] == '/') {
+    if (name.size() == 1) { // Linker member.
+      Result = name;
+      return object_error::success;
+    }
+    if (name.size() == 2 && name[1] == '/') { // String table.
+      Result = name;
+      return object_error::success;
+    }
+    // It's a long name.
+    // Get the offset.
+    std::size_t offset;
+    if (name.substr(1).rtrim(" ").getAsInteger(10, offset))
+      llvm_unreachable("Long name offset is not an integer");
+    const char *addr = Parent->StringTable->Data.begin()
+                       + sizeof(ArchiveMemberHeader)
+                       + offset;
+    // Verify it.
+    if (Parent->StringTable == Parent->end_children()
+        || addr < (Parent->StringTable->Data.begin()
+                   + sizeof(ArchiveMemberHeader))
+        || addr > (Parent->StringTable->Data.begin()
+                   + sizeof(ArchiveMemberHeader)
+                   + Parent->StringTable->getSize()))
+      return object_error::parse_failed;
+
+    // GNU long file names end with a /.
+    if (Parent->kind() == K_GNU) {
+      StringRef::size_type End = StringRef(addr).find('/');
+      Result = StringRef(addr, End);
+    } else {
+      Result = addr;
+    }
+    return object_error::success;
+  } else if (name.startswith("#1/")) {
+    uint64_t name_size;
+    if (name.substr(3).rtrim(" ").getAsInteger(10, name_size))
+      llvm_unreachable("Long name length is not an ingeter");
+    Result = Data.substr(sizeof(ArchiveMemberHeader), name_size);
+    return object_error::success;
+  }
+  // It's a simple name.
+  if (name[name.size() - 1] == '/')
+    Result = name.substr(0, name.size() - 1);
+  else
+    Result = name;
+  return object_error::success;
+}
+
+error_code Archive::Child::getAsBinary(OwningPtr<Binary> &Result) const {
+  OwningPtr<Binary> ret;
+  OwningPtr<MemoryBuffer> Buff;
+  if (error_code ec = getMemoryBuffer(Buff))
+    return ec;
+  if (error_code ec = createBinary(Buff.take(), ret))
+    return ec;
+  Result.swap(ret);
+  return object_error::success;
+}
+
+Archive::Archive(MemoryBuffer *source, error_code &ec)
+  : Binary(Binary::ID_Archive, source) {
+  // Check for sufficient magic.
+  if (!source || source->getBufferSize()
+                 < (8 + sizeof(ArchiveMemberHeader) + 2) // Smallest archive.
+              || StringRef(source->getBufferStart(), 8) != Magic) {
+    ec = object_error::invalid_file_type;
+    return;
+  }
+
+  // Get the special members.
+  child_iterator i = begin_children(false);
+  child_iterator e = end_children();
+
+  StringRef name;
+  if ((ec = i->getName(name)))
+    return;
+
+  // Below is the pattern that is used to figure out the archive format
+  // GNU archive format
+  //  First member : / (points to the symbol table )
+  //  Second member : // (may exist, if it exists, points to the string table)
+  //  Note : The string table is used if the filename exceeds 15 characters
+  // BSD archive format
+  //  First member : __.SYMDEF (points to the symbol table)
+  //  There is no string table, if the filename exceeds 15 characters or has a 
+  //  embedded space, the filename has #1/<size>, The size represents the size 
+  //  of the filename that needs to be read after the archive header
+  // COFF archive format
+  //  First member : /
+  //  Second member : / (provides a directory of symbols)
+  //  Third member : // contains the string table, this is present even if the
+  //                    string table is empty
+  if (name == "/") {
+    SymbolTable = i;
+    StringTable = e;
+    if (i != e) ++i;
+    if (i == e) {
+      ec = object_error::parse_failed;
+      return;
+    }
+    if ((ec = i->getName(name)))
+      return;
+    if (name[0] != '/') {
+      Format = K_GNU;
+    } else if ((name.size() > 1) && (name == "//")) { 
+      Format = K_GNU;
+      StringTable = i;
+      ++i;
+    } else  { 
+      Format = K_COFF;
+      if (i != e) {
+        SymbolTable = i;
+        ++i;
+      }
+      if (i != e) {
+        StringTable = i;
+      }
+    }
+  } else if (name == "__.SYMDEF") {
+    Format = K_BSD;
+    SymbolTable = i;
+    StringTable = e;
+  } 
+  ec = object_error::success;
+}
+
+Archive::child_iterator Archive::begin_children(bool skip_internal) const {
+  const char *Loc = Data->getBufferStart() + strlen(Magic);
+  size_t Size = sizeof(ArchiveMemberHeader) +
+    ToHeader(Loc)->getSize();
+  Child c(this, StringRef(Loc, Size));
+  // Skip internals at the beginning of an archive.
+  if (skip_internal && isInternalMember(*ToHeader(Loc)))
+    return c.getNext();
+  return c;
+}
+
+Archive::child_iterator Archive::end_children() const {
+  return Child(this, StringRef(0, 0));
+}
+
+error_code Archive::Symbol::getName(StringRef &Result) const {
+  Result = StringRef(Parent->SymbolTable->getBuffer().begin() + StringIndex);
+  return object_error::success;
+}
+
+error_code Archive::Symbol::getMember(child_iterator &Result) const {
+  const char *Buf = Parent->SymbolTable->getBuffer().begin();
+  const char *Offsets = Buf + 4;
+  uint32_t Offset = 0;
+  if (Parent->kind() == K_GNU) {
+    Offset = *(reinterpret_cast<const support::ubig32_t*>(Offsets)
+               + SymbolIndex);
+  } else if (Parent->kind() == K_BSD) {
+    llvm_unreachable("BSD format is not supported");
+  } else {
+    uint32_t MemberCount = *reinterpret_cast<const support::ulittle32_t*>(Buf);
+    
+    // Skip offsets.
+    Buf += sizeof(support::ulittle32_t)
+           + (MemberCount * sizeof(support::ulittle32_t));
+
+    uint32_t SymbolCount = *reinterpret_cast<const support::ulittle32_t*>(Buf);
+
+    if (SymbolIndex >= SymbolCount)
+      return object_error::parse_failed;
+
+    // Skip SymbolCount to get to the indices table.
+    const char *Indices = Buf + sizeof(support::ulittle32_t);
+
+    // Get the index of the offset in the file member offset table for this
+    // symbol.
+    uint16_t OffsetIndex =
+      *(reinterpret_cast<const support::ulittle16_t*>(Indices)
+        + SymbolIndex);
+    // Subtract 1 since OffsetIndex is 1 based.
+    --OffsetIndex;
+
+    if (OffsetIndex >= MemberCount)
+      return object_error::parse_failed;
+
+    Offset = *(reinterpret_cast<const support::ulittle32_t*>(Offsets)
+               + OffsetIndex);
+  }
+
+  const char *Loc = Parent->getData().begin() + Offset;
+  size_t Size = sizeof(ArchiveMemberHeader) +
+    ToHeader(Loc)->getSize();
+  Result = Child(Parent, StringRef(Loc, Size));
+
+  return object_error::success;
+}
+
+Archive::Symbol Archive::Symbol::getNext() const {
+  Symbol t(*this);
+  // Go to one past next null.
+  t.StringIndex =
+      Parent->SymbolTable->getBuffer().find('\0', t.StringIndex) + 1;
+  ++t.SymbolIndex;
+  return t;
+}
+
+Archive::symbol_iterator Archive::begin_symbols() const {
+  const char *buf = SymbolTable->getBuffer().begin();
+  if (kind() == K_GNU) {
+    uint32_t symbol_count = 0;
+    symbol_count = *reinterpret_cast<const support::ubig32_t*>(buf);
+    buf += sizeof(uint32_t) + (symbol_count * (sizeof(uint32_t)));
+  } else if (kind() == K_BSD) {
+    llvm_unreachable("BSD archive format is not supported");
+  } else {
+    uint32_t member_count = 0;
+    uint32_t symbol_count = 0;
+    member_count = *reinterpret_cast<const support::ulittle32_t*>(buf);
+    buf += 4 + (member_count * 4); // Skip offsets.
+    symbol_count = *reinterpret_cast<const support::ulittle32_t*>(buf);
+    buf += 4 + (symbol_count * 2); // Skip indices.
+  }
+  uint32_t string_start_offset = buf - SymbolTable->getBuffer().begin();
+  return symbol_iterator(Symbol(this, 0, string_start_offset));
+}
+
+Archive::symbol_iterator Archive::end_symbols() const {
+  const char *buf = SymbolTable->getBuffer().begin();
+  uint32_t symbol_count = 0;
+  if (kind() == K_GNU) {
+    symbol_count = *reinterpret_cast<const support::ubig32_t*>(buf);
+    buf += sizeof(uint32_t) + (symbol_count * (sizeof(uint32_t)));
+  } else if (kind() == K_BSD) {
+    llvm_unreachable("BSD archive format is not supported");
+  } else {
+    uint32_t member_count = 0;
+    member_count = *reinterpret_cast<const support::ulittle32_t*>(buf);
+    buf += 4 + (member_count * 4); // Skip offsets.
+    symbol_count = *reinterpret_cast<const support::ulittle32_t*>(buf);
+  }
+  return symbol_iterator(
+    Symbol(this, symbol_count, 0));
+}
+
+Archive::child_iterator Archive::findSym(StringRef name) const {
+  Archive::symbol_iterator bs = begin_symbols();
+  Archive::symbol_iterator es = end_symbols();
+  Archive::child_iterator result;
+  
+  StringRef symname;
+  for (; bs != es; ++bs) {
+    if (bs->getName(symname))
+        return end_children();
+    if (symname == name) {
+      if (bs->getMember(result))
+        return end_children();
+      return result;
+    }
+  }
+  return end_children();
+}
diff --git a/contrib/llvm/lib/Object/Binary.cpp b/contrib/llvm/lib/Object/Binary.cpp
new file mode 100644
index 000000000000..4e528d8ea565
--- /dev/null
+++ b/contrib/llvm/lib/Object/Binary.cpp
@@ -0,0 +1,103 @@
+//===- Binary.cpp - A generic binary file -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Binary class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/Binary.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+
+// Include headers for createBinary.
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ObjectFile.h"
+
+using namespace llvm;
+using namespace object;
+
+Binary::~Binary() {
+  delete Data;
+}
+
+Binary::Binary(unsigned int Type, MemoryBuffer *Source)
+  : TypeID(Type)
+  , Data(Source) {}
+
+StringRef Binary::getData() const {
+  return Data->getBuffer();
+}
+
+StringRef Binary::getFileName() const {
+  return Data->getBufferIdentifier();
+}
+
+error_code object::createBinary(MemoryBuffer *Source,
+                                OwningPtr<Binary> &Result) {
+  OwningPtr<MemoryBuffer> scopedSource(Source);
+  if (!Source)
+    return make_error_code(errc::invalid_argument);
+  if (Source->getBufferSize() < 64)
+    return object_error::invalid_file_type;
+  sys::LLVMFileType type = sys::IdentifyFileType(Source->getBufferStart(),
+                                static_cast<unsigned>(Source->getBufferSize()));
+  error_code ec;
+  switch (type) {
+    case sys::Archive_FileType: {
+      OwningPtr<Binary> ret(new Archive(scopedSource.take(), ec));
+      if (ec) return ec;
+      Result.swap(ret);
+      return object_error::success;
+    }
+    case sys::ELF_Relocatable_FileType:
+    case sys::ELF_Executable_FileType:
+    case sys::ELF_SharedObject_FileType:
+    case sys::ELF_Core_FileType: {
+      OwningPtr<Binary> ret(
+        ObjectFile::createELFObjectFile(scopedSource.take()));
+      if (!ret)
+        return object_error::invalid_file_type;
+      Result.swap(ret);
+      return object_error::success;
+    }
+    case sys::Mach_O_Object_FileType:
+    case sys::Mach_O_Executable_FileType:
+    case sys::Mach_O_FixedVirtualMemorySharedLib_FileType:
+    case sys::Mach_O_Core_FileType:
+    case sys::Mach_O_PreloadExecutable_FileType:
+    case sys::Mach_O_DynamicallyLinkedSharedLib_FileType:
+    case sys::Mach_O_DynamicLinker_FileType:
+    case sys::Mach_O_Bundle_FileType:
+    case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType: {
+      OwningPtr<Binary> ret(
+        ObjectFile::createMachOObjectFile(scopedSource.take()));
+      if (!ret)
+        return object_error::invalid_file_type;
+      Result.swap(ret);
+      return object_error::success;
+    }
+    case sys::COFF_FileType: {
+      OwningPtr<Binary> ret(new COFFObjectFile(scopedSource.take(), ec));
+      if (ec) return ec;
+      Result.swap(ret);
+      return object_error::success;
+    }
+    default: // Unrecognized object file format.
+      return object_error::invalid_file_type;
+  }
+}
+
+error_code object::createBinary(StringRef Path, OwningPtr<Binary> &Result) {
+  OwningPtr<MemoryBuffer> File;
+  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Path, File))
+    return ec;
+  return createBinary(File.take(), Result);
+}
diff --git a/contrib/llvm/lib/Object/COFFObjectFile.cpp b/contrib/llvm/lib/Object/COFFObjectFile.cpp
new file mode 100644
index 000000000000..ca90e0e3c3fc
--- /dev/null
+++ b/contrib/llvm/lib/Object/COFFObjectFile.cpp
@@ -0,0 +1,837 @@
+//===- COFFObjectFile.cpp - COFF object file implementation -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the COFFObjectFile class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/COFF.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+using namespace object;
+
+namespace {
+using support::ulittle8_t;
+using support::ulittle16_t;
+using support::ulittle32_t;
+using support::little16_t;
+}
+
+namespace {
+// Returns false if size is greater than the buffer size. And sets ec.
+bool checkSize(const MemoryBuffer *m, error_code &ec, uint64_t size) {
+  if (m->getBufferSize() < size) {
+    ec = object_error::unexpected_eof;
+    return false;
+  }
+  return true;
+}
+
+// Returns false if any bytes in [addr, addr + size) fall outsize of m.
+bool checkAddr(const MemoryBuffer *m,
+               error_code &ec,
+               uintptr_t addr,
+               uint64_t size) {
+  if (addr + size < addr ||
+      addr + size < size ||
+      addr + size > uintptr_t(m->getBufferEnd())) {
+    ec = object_error::unexpected_eof;
+    return false;
+  }
+  return true;
+}
+}
+
+const coff_symbol *COFFObjectFile::toSymb(DataRefImpl Symb) const {
+  const coff_symbol *addr = reinterpret_cast<const coff_symbol*>(Symb.p);
+
+# ifndef NDEBUG
+  // Verify that the symbol points to a valid entry in the symbol table.
+  uintptr_t offset = uintptr_t(addr) - uintptr_t(base());
+  if (offset < Header->PointerToSymbolTable
+      || offset >= Header->PointerToSymbolTable
+         + (Header->NumberOfSymbols * sizeof(coff_symbol)))
+    report_fatal_error("Symbol was outside of symbol table.");
+
+  assert((offset - Header->PointerToSymbolTable) % sizeof(coff_symbol)
+         == 0 && "Symbol did not point to the beginning of a symbol");
+# endif
+
+  return addr;
+}
+
+const coff_section *COFFObjectFile::toSec(DataRefImpl Sec) const {
+  const coff_section *addr = reinterpret_cast<const coff_section*>(Sec.p);
+
+# ifndef NDEBUG
+  // Verify that the section points to a valid entry in the section table.
+  if (addr < SectionTable
+      || addr >= (SectionTable + Header->NumberOfSections))
+    report_fatal_error("Section was outside of section table.");
+
+  uintptr_t offset = uintptr_t(addr) - uintptr_t(SectionTable);
+  assert(offset % sizeof(coff_section) == 0 &&
+         "Section did not point to the beginning of a section");
+# endif
+
+  return addr;
+}
+
+error_code COFFObjectFile::getSymbolNext(DataRefImpl Symb,
+                                         SymbolRef &Result) const {
+  const coff_symbol *symb = toSymb(Symb);
+  symb += 1 + symb->NumberOfAuxSymbols;
+  Symb.p = reinterpret_cast<uintptr_t>(symb);
+  Result = SymbolRef(Symb, this);
+  return object_error::success;
+}
+
+ error_code COFFObjectFile::getSymbolName(DataRefImpl Symb,
+                                          StringRef &Result) const {
+  const coff_symbol *symb = toSymb(Symb);
+  return getSymbolName(symb, Result);
+}
+
+error_code COFFObjectFile::getSymbolFileOffset(DataRefImpl Symb,
+                                            uint64_t &Result) const {
+  const coff_symbol *symb = toSymb(Symb);
+  const coff_section *Section = NULL;
+  if (error_code ec = getSection(symb->SectionNumber, Section))
+    return ec;
+  char Type;
+  if (error_code ec = getSymbolNMTypeChar(Symb, Type))
+    return ec;
+  if (Type == 'U' || Type == 'w')
+    Result = UnknownAddressOrSize;
+  else if (Section)
+    Result = Section->PointerToRawData + symb->Value;
+  else
+    Result = symb->Value;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbolAddress(DataRefImpl Symb,
+                                            uint64_t &Result) const {
+  const coff_symbol *symb = toSymb(Symb);
+  const coff_section *Section = NULL;
+  if (error_code ec = getSection(symb->SectionNumber, Section))
+    return ec;
+  char Type;
+  if (error_code ec = getSymbolNMTypeChar(Symb, Type))
+    return ec;
+  if (Type == 'U' || Type == 'w')
+    Result = UnknownAddressOrSize;
+  else if (Section)
+    Result = Section->VirtualAddress + symb->Value;
+  else
+    Result = symb->Value;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbolType(DataRefImpl Symb,
+                                         SymbolRef::Type &Result) const {
+  const coff_symbol *symb = toSymb(Symb);
+  Result = SymbolRef::ST_Other;
+  if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL &&
+      symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED) {
+    Result = SymbolRef::ST_Unknown;
+  } else {
+    if (symb->getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) {
+      Result = SymbolRef::ST_Function;
+    } else {
+      char Type;
+      if (error_code ec = getSymbolNMTypeChar(Symb, Type))
+        return ec;
+      if (Type == 'r' || Type == 'R') {
+        Result = SymbolRef::ST_Data;
+      }
+    }
+  }
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbolFlags(DataRefImpl Symb,
+                                          uint32_t &Result) const {
+  const coff_symbol *symb = toSymb(Symb);
+  Result = SymbolRef::SF_None;
+
+  // TODO: Correctly set SF_FormatSpecific, SF_ThreadLocal, SF_Common
+
+  if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL &&
+      symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
+    Result |= SymbolRef::SF_Undefined;
+
+  // TODO: This are certainly too restrictive.
+  if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL)
+    Result |= SymbolRef::SF_Global;
+
+  if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL)
+    Result |= SymbolRef::SF_Weak;
+
+  if (symb->SectionNumber == COFF::IMAGE_SYM_ABSOLUTE)
+    Result |= SymbolRef::SF_Absolute;
+
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbolSize(DataRefImpl Symb,
+                                         uint64_t &Result) const {
+  // FIXME: Return the correct size. This requires looking at all the symbols
+  //        in the same section as this symbol, and looking for either the next
+  //        symbol, or the end of the section.
+  const coff_symbol *symb = toSymb(Symb);
+  const coff_section *Section = NULL;
+  if (error_code ec = getSection(symb->SectionNumber, Section))
+    return ec;
+  char Type;
+  if (error_code ec = getSymbolNMTypeChar(Symb, Type))
+    return ec;
+  if (Type == 'U' || Type == 'w')
+    Result = UnknownAddressOrSize;
+  else if (Section)
+    Result = Section->SizeOfRawData - symb->Value;
+  else
+    Result = 0;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbolNMTypeChar(DataRefImpl Symb,
+                                               char &Result) const {
+  const coff_symbol *symb = toSymb(Symb);
+  StringRef name;
+  if (error_code ec = getSymbolName(Symb, name))
+    return ec;
+  char ret = StringSwitch<char>(name)
+    .StartsWith(".debug", 'N')
+    .StartsWith(".sxdata", 'N')
+    .Default('?');
+
+  if (ret != '?') {
+    Result = ret;
+    return object_error::success;
+  }
+
+  uint32_t Characteristics = 0;
+  if (symb->SectionNumber > 0) {
+    const coff_section *Section = NULL;
+    if (error_code ec = getSection(symb->SectionNumber, Section))
+      return ec;
+    Characteristics = Section->Characteristics;
+  }
+
+  switch (symb->SectionNumber) {
+  case COFF::IMAGE_SYM_UNDEFINED:
+    // Check storage classes.
+    if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL) {
+      Result = 'w';
+      return object_error::success; // Don't do ::toupper.
+    } else if (symb->Value != 0) // Check for common symbols.
+      ret = 'c';
+    else
+      ret = 'u';
+    break;
+  case COFF::IMAGE_SYM_ABSOLUTE:
+    ret = 'a';
+    break;
+  case COFF::IMAGE_SYM_DEBUG:
+    ret = 'n';
+    break;
+  default:
+    // Check section type.
+    if (Characteristics & COFF::IMAGE_SCN_CNT_CODE)
+      ret = 't';
+    else if (  Characteristics & COFF::IMAGE_SCN_MEM_READ
+            && ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
+      ret = 'r';
+    else if (Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
+      ret = 'd';
+    else if (Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
+      ret = 'b';
+    else if (Characteristics & COFF::IMAGE_SCN_LNK_INFO)
+      ret = 'i';
+
+    // Check for section symbol.
+    else if (  symb->StorageClass == COFF::IMAGE_SYM_CLASS_STATIC
+            && symb->Value == 0)
+       ret = 's';
+  }
+
+  if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL)
+    ret = ::toupper(static_cast<unsigned char>(ret));
+
+  Result = ret;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbolSection(DataRefImpl Symb,
+                                            section_iterator &Result) const {
+  const coff_symbol *symb = toSymb(Symb);
+  if (symb->SectionNumber <= COFF::IMAGE_SYM_UNDEFINED)
+    Result = end_sections();
+  else {
+    const coff_section *sec = 0;
+    if (error_code ec = getSection(symb->SectionNumber, sec)) return ec;
+    DataRefImpl Sec;
+    Sec.p = reinterpret_cast<uintptr_t>(sec);
+    Result = section_iterator(SectionRef(Sec, this));
+  }
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbolValue(DataRefImpl Symb,
+                                          uint64_t &Val) const {
+  report_fatal_error("getSymbolValue unimplemented in COFFObjectFile");
+}
+
+error_code COFFObjectFile::getSectionNext(DataRefImpl Sec,
+                                          SectionRef &Result) const {
+  const coff_section *sec = toSec(Sec);
+  sec += 1;
+  Sec.p = reinterpret_cast<uintptr_t>(sec);
+  Result = SectionRef(Sec, this);
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSectionName(DataRefImpl Sec,
+                                          StringRef &Result) const {
+  const coff_section *sec = toSec(Sec);
+  return getSectionName(sec, Result);
+}
+
+error_code COFFObjectFile::getSectionAddress(DataRefImpl Sec,
+                                             uint64_t &Result) const {
+  const coff_section *sec = toSec(Sec);
+  Result = sec->VirtualAddress;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSectionSize(DataRefImpl Sec,
+                                          uint64_t &Result) const {
+  const coff_section *sec = toSec(Sec);
+  Result = sec->SizeOfRawData;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSectionContents(DataRefImpl Sec,
+                                              StringRef &Result) const {
+  const coff_section *sec = toSec(Sec);
+  ArrayRef<uint8_t> Res;
+  error_code EC = getSectionContents(sec, Res);
+  Result = StringRef(reinterpret_cast<const char*>(Res.data()), Res.size());
+  return EC;
+}
+
+error_code COFFObjectFile::getSectionAlignment(DataRefImpl Sec,
+                                               uint64_t &Res) const {
+  const coff_section *sec = toSec(Sec);
+  if (!sec)
+    return object_error::parse_failed;
+  Res = uint64_t(1) << (((sec->Characteristics & 0x00F00000) >> 20) - 1);
+  return object_error::success;
+}
+
+error_code COFFObjectFile::isSectionText(DataRefImpl Sec,
+                                         bool &Result) const {
+  const coff_section *sec = toSec(Sec);
+  Result = sec->Characteristics & COFF::IMAGE_SCN_CNT_CODE;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::isSectionData(DataRefImpl Sec,
+                                         bool &Result) const {
+  const coff_section *sec = toSec(Sec);
+  Result = sec->Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::isSectionBSS(DataRefImpl Sec,
+                                        bool &Result) const {
+  const coff_section *sec = toSec(Sec);
+  Result = sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::isSectionRequiredForExecution(DataRefImpl Sec,
+                                                         bool &Result) const {
+  // FIXME: Unimplemented
+  Result = true;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::isSectionVirtual(DataRefImpl Sec,
+                                           bool &Result) const {
+  const coff_section *sec = toSec(Sec);
+  Result = sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::isSectionZeroInit(DataRefImpl Sec,
+                                             bool &Result) const {
+  // FIXME: Unimplemented.
+  Result = false;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::isSectionReadOnlyData(DataRefImpl Sec,
+                                                bool &Result) const {
+  // FIXME: Unimplemented.
+  Result = false;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl Sec,
+                                                 DataRefImpl Symb,
+                                                 bool &Result) const {
+  const coff_section *sec = toSec(Sec);
+  const coff_symbol *symb = toSymb(Symb);
+  const coff_section *symb_sec = 0;
+  if (error_code ec = getSection(symb->SectionNumber, symb_sec)) return ec;
+  if (symb_sec == sec)
+    Result = true;
+  else
+    Result = false;
+  return object_error::success;
+}
+
+relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+  const coff_section *sec = toSec(Sec);
+  DataRefImpl ret;
+  if (sec->NumberOfRelocations == 0)
+    ret.p = 0;
+  else
+    ret.p = reinterpret_cast<uintptr_t>(base() + sec->PointerToRelocations);
+
+  return relocation_iterator(RelocationRef(ret, this));
+}
+
+relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+  const coff_section *sec = toSec(Sec);
+  DataRefImpl ret;
+  if (sec->NumberOfRelocations == 0)
+    ret.p = 0;
+  else
+    ret.p = reinterpret_cast<uintptr_t>(
+              reinterpret_cast<const coff_relocation*>(
+                base() + sec->PointerToRelocations)
+              + sec->NumberOfRelocations);
+
+  return relocation_iterator(RelocationRef(ret, this));
+}
+
+COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
+  : ObjectFile(Binary::ID_COFF, Object, ec)
+  , Header(0)
+  , SectionTable(0)
+  , SymbolTable(0)
+  , StringTable(0)
+  , StringTableSize(0) {
+  // Check that we at least have enough room for a header.
+  if (!checkSize(Data, ec, sizeof(coff_file_header))) return;
+
+  // The actual starting location of the COFF header in the file. This can be
+  // non-zero in PE/COFF files.
+  uint64_t HeaderStart = 0;
+
+  // Check if this is a PE/COFF file.
+  if (base()[0] == 0x4d && base()[1] == 0x5a) {
+    // PE/COFF, seek through MS-DOS compatibility stub and 4-byte
+    // PE signature to find 'normal' COFF header.
+    if (!checkSize(Data, ec, 0x3c + 8)) return;
+    HeaderStart = *reinterpret_cast<const ulittle16_t *>(base() + 0x3c);
+    // Check the PE header. ("PE\0\0")
+    if (std::memcmp(base() + HeaderStart, "PE\0\0", 4) != 0) {
+      ec = object_error::parse_failed;
+      return;
+    }
+    HeaderStart += 4; // Skip the PE Header.
+  }
+
+  Header = reinterpret_cast<const coff_file_header *>(base() + HeaderStart);
+  if (!checkAddr(Data, ec, uintptr_t(Header), sizeof(coff_file_header)))
+    return;
+
+  SectionTable =
+    reinterpret_cast<const coff_section *>( base()
+                                          + HeaderStart
+                                          + sizeof(coff_file_header)
+                                          + Header->SizeOfOptionalHeader);
+  if (!checkAddr(Data, ec, uintptr_t(SectionTable),
+                 Header->NumberOfSections * sizeof(coff_section)))
+    return;
+
+  if (Header->PointerToSymbolTable != 0) {
+    SymbolTable =
+      reinterpret_cast<const coff_symbol *>(base()
+                                            + Header->PointerToSymbolTable);
+    if (!checkAddr(Data, ec, uintptr_t(SymbolTable),
+                   Header->NumberOfSymbols * sizeof(coff_symbol)))
+      return;
+
+    // Find string table.
+    StringTable = reinterpret_cast<const char *>(base())
+                  + Header->PointerToSymbolTable
+                  + Header->NumberOfSymbols * sizeof(coff_symbol);
+    if (!checkAddr(Data, ec, uintptr_t(StringTable), sizeof(ulittle32_t)))
+      return;
+
+    StringTableSize = *reinterpret_cast<const ulittle32_t *>(StringTable);
+    if (!checkAddr(Data, ec, uintptr_t(StringTable), StringTableSize))
+      return;
+    // Check that the string table is null terminated if has any in it.
+    if (StringTableSize < 4
+        || (StringTableSize > 4 && StringTable[StringTableSize - 1] != 0)) {
+      ec = object_error::parse_failed;
+      return;
+    }
+  }
+
+  ec = object_error::success;
+}
+
+symbol_iterator COFFObjectFile::begin_symbols() const {
+  DataRefImpl ret;
+  ret.p = reinterpret_cast<intptr_t>(SymbolTable);
+  return symbol_iterator(SymbolRef(ret, this));
+}
+
+symbol_iterator COFFObjectFile::end_symbols() const {
+  // The symbol table ends where the string table begins.
+  DataRefImpl ret;
+  ret.p = reinterpret_cast<intptr_t>(StringTable);
+  return symbol_iterator(SymbolRef(ret, this));
+}
+
+symbol_iterator COFFObjectFile::begin_dynamic_symbols() const {
+  // TODO: implement
+  report_fatal_error("Dynamic symbols unimplemented in COFFObjectFile");
+}
+
+symbol_iterator COFFObjectFile::end_dynamic_symbols() const {
+  // TODO: implement
+  report_fatal_error("Dynamic symbols unimplemented in COFFObjectFile");
+}
+
+library_iterator COFFObjectFile::begin_libraries_needed() const {
+  // TODO: implement
+  report_fatal_error("Libraries needed unimplemented in COFFObjectFile");
+}
+
+library_iterator COFFObjectFile::end_libraries_needed() const {
+  // TODO: implement
+  report_fatal_error("Libraries needed unimplemented in COFFObjectFile");
+}
+
+StringRef COFFObjectFile::getLoadName() const {
+  // COFF does not have this field.
+  return "";
+}
+
+
+section_iterator COFFObjectFile::begin_sections() const {
+  DataRefImpl ret;
+  ret.p = reinterpret_cast<intptr_t>(SectionTable);
+  return section_iterator(SectionRef(ret, this));
+}
+
+section_iterator COFFObjectFile::end_sections() const {
+  DataRefImpl ret;
+  ret.p = reinterpret_cast<intptr_t>(SectionTable + Header->NumberOfSections);
+  return section_iterator(SectionRef(ret, this));
+}
+
+uint8_t COFFObjectFile::getBytesInAddress() const {
+  return getArch() == Triple::x86_64 ? 8 : 4;
+}
+
+StringRef COFFObjectFile::getFileFormatName() const {
+  switch(Header->Machine) {
+  case COFF::IMAGE_FILE_MACHINE_I386:
+    return "COFF-i386";
+  case COFF::IMAGE_FILE_MACHINE_AMD64:
+    return "COFF-x86-64";
+  default:
+    return "COFF-<unknown arch>";
+  }
+}
+
+unsigned COFFObjectFile::getArch() const {
+  switch(Header->Machine) {
+  case COFF::IMAGE_FILE_MACHINE_I386:
+    return Triple::x86;
+  case COFF::IMAGE_FILE_MACHINE_AMD64:
+    return Triple::x86_64;
+  default:
+    return Triple::UnknownArch;
+  }
+}
+
+error_code COFFObjectFile::getHeader(const coff_file_header *&Res) const {
+  Res = Header;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSection(int32_t index,
+                                      const coff_section *&Result) const {
+  // Check for special index values.
+  if (index == COFF::IMAGE_SYM_UNDEFINED ||
+      index == COFF::IMAGE_SYM_ABSOLUTE ||
+      index == COFF::IMAGE_SYM_DEBUG)
+    Result = NULL;
+  else if (index > 0 && index <= Header->NumberOfSections)
+    // We already verified the section table data, so no need to check again.
+    Result = SectionTable + (index - 1);
+  else
+    return object_error::parse_failed;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getString(uint32_t offset,
+                                     StringRef &Result) const {
+  if (StringTableSize <= 4)
+    // Tried to get a string from an empty string table.
+    return object_error::parse_failed;
+  if (offset >= StringTableSize)
+    return object_error::unexpected_eof;
+  Result = StringRef(StringTable + offset);
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbol(uint32_t index,
+                                     const coff_symbol *&Result) const {
+  if (index < Header->NumberOfSymbols)
+    Result = SymbolTable + index;
+  else
+    return object_error::parse_failed;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbolName(const coff_symbol *symbol,
+                                         StringRef &Res) const {
+  // Check for string table entry. First 4 bytes are 0.
+  if (symbol->Name.Offset.Zeroes == 0) {
+    uint32_t Offset = symbol->Name.Offset.Offset;
+    if (error_code ec = getString(Offset, Res))
+      return ec;
+    return object_error::success;
+  }
+
+  if (symbol->Name.ShortName[7] == 0)
+    // Null terminated, let ::strlen figure out the length.
+    Res = StringRef(symbol->Name.ShortName);
+  else
+    // Not null terminated, use all 8 bytes.
+    Res = StringRef(symbol->Name.ShortName, 8);
+  return object_error::success;
+}
+
+ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData(
+                                  const coff_symbol *symbol) const {
+  const uint8_t *aux = NULL;
+  
+  if ( symbol->NumberOfAuxSymbols > 0 ) {
+  // AUX data comes immediately after the symbol in COFF
+    aux = reinterpret_cast<const uint8_t *>(symbol + 1);
+# ifndef NDEBUG
+    // Verify that the aux symbol points to a valid entry in the symbol table.
+    uintptr_t offset = uintptr_t(aux) - uintptr_t(base());
+    if (offset < Header->PointerToSymbolTable
+        || offset >= Header->PointerToSymbolTable
+           + (Header->NumberOfSymbols * sizeof(coff_symbol)))
+      report_fatal_error("Aux Symbol data was outside of symbol table.");
+
+    assert((offset - Header->PointerToSymbolTable) % sizeof(coff_symbol)
+         == 0 && "Aux Symbol data did not point to the beginning of a symbol");
+# endif
+  }
+  return ArrayRef<uint8_t>(aux, symbol->NumberOfAuxSymbols * sizeof(coff_symbol));
+}
+
+error_code COFFObjectFile::getSectionName(const coff_section *Sec,
+                                          StringRef &Res) const {
+  StringRef Name;
+  if (Sec->Name[7] == 0)
+    // Null terminated, let ::strlen figure out the length.
+    Name = Sec->Name;
+  else
+    // Not null terminated, use all 8 bytes.
+    Name = StringRef(Sec->Name, 8);
+
+  // Check for string table entry. First byte is '/'.
+  if (Name[0] == '/') {
+    uint32_t Offset;
+    if (Name.substr(1).getAsInteger(10, Offset))
+      return object_error::parse_failed;
+    if (error_code ec = getString(Offset, Name))
+      return ec;
+  }
+
+  Res = Name;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getSectionContents(const coff_section *Sec,
+                                              ArrayRef<uint8_t> &Res) const {
+  // The only thing that we need to verify is that the contents is contained
+  // within the file bounds. We don't need to make sure it doesn't cover other
+  // data, as there's nothing that says that is not allowed.
+  uintptr_t ConStart = uintptr_t(base()) + Sec->PointerToRawData;
+  uintptr_t ConEnd = ConStart + Sec->SizeOfRawData;
+  if (ConEnd > uintptr_t(Data->getBufferEnd()))
+    return object_error::parse_failed;
+  Res = ArrayRef<uint8_t>(reinterpret_cast<const unsigned char*>(ConStart),
+                          Sec->SizeOfRawData);
+  return object_error::success;
+}
+
+const coff_relocation *COFFObjectFile::toRel(DataRefImpl Rel) const {
+  return reinterpret_cast<const coff_relocation*>(Rel.p);
+}
+error_code COFFObjectFile::getRelocationNext(DataRefImpl Rel,
+                                             RelocationRef &Res) const {
+  Rel.p = reinterpret_cast<uintptr_t>(
+            reinterpret_cast<const coff_relocation*>(Rel.p) + 1);
+  Res = RelocationRef(Rel, this);
+  return object_error::success;
+}
+error_code COFFObjectFile::getRelocationAddress(DataRefImpl Rel,
+                                                uint64_t &Res) const {
+  Res = toRel(Rel)->VirtualAddress;
+  return object_error::success;
+}
+error_code COFFObjectFile::getRelocationOffset(DataRefImpl Rel,
+                                               uint64_t &Res) const {
+  Res = toRel(Rel)->VirtualAddress;
+  return object_error::success;
+}
+error_code COFFObjectFile::getRelocationSymbol(DataRefImpl Rel,
+                                               SymbolRef &Res) const {
+  const coff_relocation* R = toRel(Rel);
+  DataRefImpl Symb;
+  Symb.p = reinterpret_cast<uintptr_t>(SymbolTable + R->SymbolTableIndex);
+  Res = SymbolRef(Symb, this);
+  return object_error::success;
+}
+error_code COFFObjectFile::getRelocationType(DataRefImpl Rel,
+                                             uint64_t &Res) const {
+  const coff_relocation* R = toRel(Rel);
+  Res = R->Type;
+  return object_error::success;
+}
+
+const coff_section *COFFObjectFile::getCOFFSection(section_iterator &It) const {
+  return toSec(It->getRawDataRefImpl());
+}
+
+const coff_symbol *COFFObjectFile::getCOFFSymbol(symbol_iterator &It) const {
+  return toSymb(It->getRawDataRefImpl());
+}
+
+const coff_relocation *COFFObjectFile::getCOFFRelocation(
+                                             relocation_iterator &It) const {
+  return toRel(It->getRawDataRefImpl());
+}
+
+
+#define LLVM_COFF_SWITCH_RELOC_TYPE_NAME(enum) \
+  case COFF::enum: res = #enum; break;
+
+error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel,
+                                          SmallVectorImpl<char> &Result) const {
+  const coff_relocation *reloc = toRel(Rel);
+  StringRef res;
+  switch (Header->Machine) {
+  case COFF::IMAGE_FILE_MACHINE_AMD64:
+    switch (reloc->Type) {
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ABSOLUTE);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ADDR64);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ADDR32);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ADDR32NB);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_1);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_2);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_3);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_4);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_5);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SECTION);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SECREL);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SECREL7);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_TOKEN);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SREL32);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_PAIR);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SSPAN32);
+    default:
+      res = "Unknown";
+    }
+    break;
+  case COFF::IMAGE_FILE_MACHINE_I386:
+    switch (reloc->Type) {
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_ABSOLUTE);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_DIR16);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_REL16);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_DIR32);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_DIR32NB);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_SEG12);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_SECTION);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_SECREL);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_TOKEN);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_SECREL7);
+    LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_REL32);
+    default:
+      res = "Unknown";
+    }
+    break;
+  default:
+    res = "Unknown";
+  }
+  Result.append(res.begin(), res.end());
+  return object_error::success;
+}
+
+#undef LLVM_COFF_SWITCH_RELOC_TYPE_NAME
+
+error_code COFFObjectFile::getRelocationAdditionalInfo(DataRefImpl Rel,
+                                                       int64_t &Res) const {
+  Res = 0;
+  return object_error::success;
+}
+error_code COFFObjectFile::getRelocationValueString(DataRefImpl Rel,
+                                          SmallVectorImpl<char> &Result) const {
+  const coff_relocation *reloc = toRel(Rel);
+  const coff_symbol *symb = 0;
+  if (error_code ec = getSymbol(reloc->SymbolTableIndex, symb)) return ec;
+  DataRefImpl sym;
+  sym.p = reinterpret_cast<uintptr_t>(symb);
+  StringRef symname;
+  if (error_code ec = getSymbolName(sym, symname)) return ec;
+  Result.append(symname.begin(), symname.end());
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getLibraryNext(DataRefImpl LibData,
+                                          LibraryRef &Result) const {
+  report_fatal_error("getLibraryNext not implemented in COFFObjectFile");
+}
+
+error_code COFFObjectFile::getLibraryPath(DataRefImpl LibData,
+                                          StringRef &Result) const {
+  report_fatal_error("getLibraryPath not implemented in COFFObjectFile");
+}
+
+namespace llvm {
+
+  ObjectFile *ObjectFile::createCOFFObjectFile(MemoryBuffer *Object) {
+    error_code ec;
+    return new COFFObjectFile(Object, ec);
+  }
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Object/ELFObjectFile.cpp b/contrib/llvm/lib/Object/ELFObjectFile.cpp
new file mode 100644
index 000000000000..cfe0eb467e53
--- /dev/null
+++ b/contrib/llvm/lib/Object/ELFObjectFile.cpp
@@ -0,0 +1,74 @@
+//===- ELFObjectFile.cpp - ELF object file implementation -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Part of the ELFObjectFile class implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ELF.h"
+#include "llvm/Support/MathExtras.h"
+
+namespace llvm {
+
+using namespace object;
+
+// Creates an in-memory object-file by default: createELFObjectFile(Buffer)
+ObjectFile *ObjectFile::createELFObjectFile(MemoryBuffer *Object) {
+  std::pair<unsigned char, unsigned char> Ident = getElfArchType(Object);
+  error_code ec;
+
+  std::size_t MaxAlignment =
+    1ULL << CountTrailingZeros_64(uintptr_t(Object->getBufferStart()));
+
+  if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB)
+#if !LLVM_IS_UNALIGNED_ACCESS_FAST
+    if (MaxAlignment >= 4)
+      return new ELFObjectFile<ELFType<support::little, 4, false> >(Object, ec);
+    else
+#endif
+    if (MaxAlignment >= 2)
+      return new ELFObjectFile<ELFType<support::little, 2, false> >(Object, ec);
+    else
+      llvm_unreachable("Invalid alignment for ELF file!");
+  else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB)
+#if !LLVM_IS_UNALIGNED_ACCESS_FAST
+    if (MaxAlignment >= 4)
+      return new ELFObjectFile<ELFType<support::big, 4, false> >(Object, ec);
+    else
+#endif
+    if (MaxAlignment >= 2)
+      return new ELFObjectFile<ELFType<support::big, 2, false> >(Object, ec);
+    else
+      llvm_unreachable("Invalid alignment for ELF file!");
+  else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB)
+#if !LLVM_IS_UNALIGNED_ACCESS_FAST
+    if (MaxAlignment >= 8)
+      return new ELFObjectFile<ELFType<support::big, 8, true> >(Object, ec);
+    else
+#endif
+    if (MaxAlignment >= 2)
+      return new ELFObjectFile<ELFType<support::big, 2, true> >(Object, ec);
+    else
+      llvm_unreachable("Invalid alignment for ELF file!");
+  else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) {
+#if !LLVM_IS_UNALIGNED_ACCESS_FAST
+    if (MaxAlignment >= 8)
+      return new ELFObjectFile<ELFType<support::little, 8, true> >(Object, ec);
+    else
+#endif
+    if (MaxAlignment >= 2)
+      return new ELFObjectFile<ELFType<support::little, 2, true> >(Object, ec);
+    else
+      llvm_unreachable("Invalid alignment for ELF file!");
+  }
+
+  report_fatal_error("Buffer is not an ELF object file!");
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Object/Error.cpp b/contrib/llvm/lib/Object/Error.cpp
new file mode 100644
index 000000000000..25946257ab5a
--- /dev/null
+++ b/contrib/llvm/lib/Object/Error.cpp
@@ -0,0 +1,57 @@
+//===- Error.cpp - system_error extensions for Object -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines a new error_category for the Object library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+using namespace object;
+
+namespace {
+class _object_error_category : public _do_message {
+public:
+  virtual const char* name() const;
+  virtual std::string message(int ev) const;
+  virtual error_condition default_error_condition(int ev) const;
+};
+}
+
+const char *_object_error_category::name() const {
+  return "llvm.object";
+}
+
+std::string _object_error_category::message(int ev) const {
+  switch (ev) {
+  case object_error::success: return "Success";
+  case object_error::invalid_file_type:
+    return "The file was not recognized as a valid object file";
+  case object_error::parse_failed:
+    return "Invalid data was encountered while parsing the file";
+  case object_error::unexpected_eof:
+    return "The end of the file was unexpectedly encountered";
+  default:
+    llvm_unreachable("An enumerator of object_error does not have a message "
+                     "defined.");
+  }
+}
+
+error_condition _object_error_category::default_error_condition(int ev) const {
+  if (ev == object_error::success)
+    return errc::success;
+  return errc::invalid_argument;
+}
+
+const error_category &object::object_category() {
+  static _object_error_category o;
+  return o;
+}
diff --git a/contrib/llvm/lib/Object/MachOObject.cpp b/contrib/llvm/lib/Object/MachOObject.cpp
new file mode 100644
index 000000000000..c9c341a207c7
--- /dev/null
+++ b/contrib/llvm/lib/Object/MachOObject.cpp
@@ -0,0 +1,422 @@
+//===- MachOObject.cpp - Mach-O Object File Wrapper -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/MachOObject.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+/* Translation Utilities */
+
+template<typename T>
+static void SwapValue(T &Value) {
+  Value = sys::SwapByteOrder(Value);
+}
+
+template<typename T>
+static void SwapStruct(T &Value);
+
+template<typename T>
+static void ReadInMemoryStruct(const MachOObject &MOO,
+                               StringRef Buffer, uint64_t Base,
+                               InMemoryStruct<T> &Res) {
+  typedef T struct_type;
+  uint64_t Size = sizeof(struct_type);
+
+  // Check that the buffer contains the expected data.
+  if (Base + Size >  Buffer.size()) {
+    Res = 0;
+    return;
+  }
+
+  // Check whether we can return a direct pointer.
+  struct_type *Ptr = reinterpret_cast<struct_type *>(
+                       const_cast<char *>(Buffer.data() + Base));
+  if (!MOO.isSwappedEndian()) {
+    Res = Ptr;
+    return;
+  }
+
+  // Otherwise, copy the struct and translate the values.
+  Res = *Ptr;
+  SwapStruct(*Res);
+}
+
+/* *** */
+
+MachOObject::MachOObject(MemoryBuffer *Buffer_, bool IsLittleEndian_,
+                         bool Is64Bit_)
+  : Buffer(Buffer_), IsLittleEndian(IsLittleEndian_), Is64Bit(Is64Bit_),
+    IsSwappedEndian(IsLittleEndian != sys::isLittleEndianHost()),
+    HasStringTable(false), LoadCommands(0), NumLoadedCommands(0) {
+  // Load the common header.
+  memcpy(&Header, Buffer->getBuffer().data(), sizeof(Header));
+  if (IsSwappedEndian) {
+    SwapValue(Header.Magic);
+    SwapValue(Header.CPUType);
+    SwapValue(Header.CPUSubtype);
+    SwapValue(Header.FileType);
+    SwapValue(Header.NumLoadCommands);
+    SwapValue(Header.SizeOfLoadCommands);
+    SwapValue(Header.Flags);
+  }
+
+  if (is64Bit()) {
+    memcpy(&Header64Ext, Buffer->getBuffer().data() + sizeof(Header),
+           sizeof(Header64Ext));
+    if (IsSwappedEndian) {
+      SwapValue(Header64Ext.Reserved);
+    }
+  }
+
+  // Create the load command array if sane.
+  if (getHeader().NumLoadCommands < (1 << 20))
+    LoadCommands = new LoadCommandInfo[getHeader().NumLoadCommands];
+}
+
+MachOObject::~MachOObject() {
+  delete [] LoadCommands;
+}
+
+MachOObject *MachOObject::LoadFromBuffer(MemoryBuffer *Buffer,
+                                         std::string *ErrorStr) {
+  // First, check the magic value and initialize the basic object info.
+  bool IsLittleEndian = false, Is64Bit = false;
+  StringRef Magic = Buffer->getBuffer().slice(0, 4);
+  if (Magic == "\xFE\xED\xFA\xCE") {
+  }  else if (Magic == "\xCE\xFA\xED\xFE") {
+    IsLittleEndian = true;
+  } else if (Magic == "\xFE\xED\xFA\xCF") {
+    Is64Bit = true;
+  } else if (Magic == "\xCF\xFA\xED\xFE") {
+    IsLittleEndian = true;
+    Is64Bit = true;
+  } else {
+    if (ErrorStr) *ErrorStr = "not a Mach object file (invalid magic)";
+    return 0;
+  }
+
+  // Ensure that the at least the full header is present.
+  unsigned HeaderSize = Is64Bit ? macho::Header64Size : macho::Header32Size;
+  if (Buffer->getBufferSize() < HeaderSize) {
+    if (ErrorStr) *ErrorStr = "not a Mach object file (invalid header)";
+    return 0;
+  }
+
+  OwningPtr<MachOObject> Object(new MachOObject(Buffer, IsLittleEndian,
+                                                Is64Bit));
+
+  // Check for bogus number of load commands.
+  if (Object->getHeader().NumLoadCommands >= (1 << 20)) {
+    if (ErrorStr) *ErrorStr = "not a Mach object file (unreasonable header)";
+    return 0;
+  }
+
+  if (ErrorStr) *ErrorStr = "";
+  return Object.take();
+}
+
+StringRef MachOObject::getData(size_t Offset, size_t Size) const {
+  return Buffer->getBuffer().substr(Offset,Size);
+}
+
+void MachOObject::RegisterStringTable(macho::SymtabLoadCommand &SLC) {
+  HasStringTable = true;
+  StringTable = Buffer->getBuffer().substr(SLC.StringTableOffset,
+                                           SLC.StringTableSize);
+}
+
+const MachOObject::LoadCommandInfo &
+MachOObject::getLoadCommandInfo(unsigned Index) const {
+  assert(Index < getHeader().NumLoadCommands && "Invalid index!");
+
+  // Load the command, if necessary.
+  if (Index >= NumLoadedCommands) {
+    uint64_t Offset;
+    if (Index == 0) {
+      Offset = getHeaderSize();
+    } else {
+      const LoadCommandInfo &Prev = getLoadCommandInfo(Index - 1);
+      Offset = Prev.Offset + Prev.Command.Size;
+    }
+
+    LoadCommandInfo &Info = LoadCommands[Index];
+    memcpy(&Info.Command, Buffer->getBuffer().data() + Offset,
+           sizeof(macho::LoadCommand));
+    if (IsSwappedEndian) {
+      SwapValue(Info.Command.Type);
+      SwapValue(Info.Command.Size);
+    }
+    Info.Offset = Offset;
+    NumLoadedCommands = Index + 1;
+  }
+
+  return LoadCommands[Index];
+}
+
+template<>
+void SwapStruct(macho::SegmentLoadCommand &Value) {
+  SwapValue(Value.Type);
+  SwapValue(Value.Size);
+  SwapValue(Value.VMAddress);
+  SwapValue(Value.VMSize);
+  SwapValue(Value.FileOffset);
+  SwapValue(Value.FileSize);
+  SwapValue(Value.MaxVMProtection);
+  SwapValue(Value.InitialVMProtection);
+  SwapValue(Value.NumSections);
+  SwapValue(Value.Flags);
+}
+void MachOObject::ReadSegmentLoadCommand(const LoadCommandInfo &LCI,
+                         InMemoryStruct<macho::SegmentLoadCommand> &Res) const {
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), LCI.Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::Segment64LoadCommand &Value) {
+  SwapValue(Value.Type);
+  SwapValue(Value.Size);
+  SwapValue(Value.VMAddress);
+  SwapValue(Value.VMSize);
+  SwapValue(Value.FileOffset);
+  SwapValue(Value.FileSize);
+  SwapValue(Value.MaxVMProtection);
+  SwapValue(Value.InitialVMProtection);
+  SwapValue(Value.NumSections);
+  SwapValue(Value.Flags);
+}
+void MachOObject::ReadSegment64LoadCommand(const LoadCommandInfo &LCI,
+                       InMemoryStruct<macho::Segment64LoadCommand> &Res) const {
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), LCI.Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::SymtabLoadCommand &Value) {
+  SwapValue(Value.Type);
+  SwapValue(Value.Size);
+  SwapValue(Value.SymbolTableOffset);
+  SwapValue(Value.NumSymbolTableEntries);
+  SwapValue(Value.StringTableOffset);
+  SwapValue(Value.StringTableSize);
+}
+void MachOObject::ReadSymtabLoadCommand(const LoadCommandInfo &LCI,
+                          InMemoryStruct<macho::SymtabLoadCommand> &Res) const {
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), LCI.Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::DysymtabLoadCommand &Value) {
+  SwapValue(Value.Type);
+  SwapValue(Value.Size);
+  SwapValue(Value.LocalSymbolsIndex);
+  SwapValue(Value.NumLocalSymbols);
+  SwapValue(Value.ExternalSymbolsIndex);
+  SwapValue(Value.NumExternalSymbols);
+  SwapValue(Value.UndefinedSymbolsIndex);
+  SwapValue(Value.NumUndefinedSymbols);
+  SwapValue(Value.TOCOffset);
+  SwapValue(Value.NumTOCEntries);
+  SwapValue(Value.ModuleTableOffset);
+  SwapValue(Value.NumModuleTableEntries);
+  SwapValue(Value.ReferenceSymbolTableOffset);
+  SwapValue(Value.NumReferencedSymbolTableEntries);
+  SwapValue(Value.IndirectSymbolTableOffset);
+  SwapValue(Value.NumIndirectSymbolTableEntries);
+  SwapValue(Value.ExternalRelocationTableOffset);
+  SwapValue(Value.NumExternalRelocationTableEntries);
+  SwapValue(Value.LocalRelocationTableOffset);
+  SwapValue(Value.NumLocalRelocationTableEntries);
+}
+void MachOObject::ReadDysymtabLoadCommand(const LoadCommandInfo &LCI,
+                        InMemoryStruct<macho::DysymtabLoadCommand> &Res) const {
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), LCI.Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::LinkeditDataLoadCommand &Value) {
+  SwapValue(Value.Type);
+  SwapValue(Value.Size);
+  SwapValue(Value.DataOffset);
+  SwapValue(Value.DataSize);
+}
+void MachOObject::ReadLinkeditDataLoadCommand(const LoadCommandInfo &LCI,
+                    InMemoryStruct<macho::LinkeditDataLoadCommand> &Res) const {
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), LCI.Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::LinkerOptionsLoadCommand &Value) {
+  SwapValue(Value.Type);
+  SwapValue(Value.Size);
+  SwapValue(Value.Count);
+}
+void MachOObject::ReadLinkerOptionsLoadCommand(const LoadCommandInfo &LCI,
+                   InMemoryStruct<macho::LinkerOptionsLoadCommand> &Res) const {
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), LCI.Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::IndirectSymbolTableEntry &Value) {
+  SwapValue(Value.Index);
+}
+void
+MachOObject::ReadIndirectSymbolTableEntry(const macho::DysymtabLoadCommand &DLC,
+                                          unsigned Index,
+                   InMemoryStruct<macho::IndirectSymbolTableEntry> &Res) const {
+  uint64_t Offset = (DLC.IndirectSymbolTableOffset +
+                     Index * sizeof(macho::IndirectSymbolTableEntry));
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
+}
+
+
+template<>
+void SwapStruct(macho::Section &Value) {
+  SwapValue(Value.Address);
+  SwapValue(Value.Size);
+  SwapValue(Value.Offset);
+  SwapValue(Value.Align);
+  SwapValue(Value.RelocationTableOffset);
+  SwapValue(Value.NumRelocationTableEntries);
+  SwapValue(Value.Flags);
+  SwapValue(Value.Reserved1);
+  SwapValue(Value.Reserved2);
+}
+void MachOObject::ReadSection(const LoadCommandInfo &LCI,
+                              unsigned Index,
+                              InMemoryStruct<macho::Section> &Res) const {
+  assert(LCI.Command.Type == macho::LCT_Segment &&
+         "Unexpected load command info!");
+  uint64_t Offset = (LCI.Offset + sizeof(macho::SegmentLoadCommand) +
+                     Index * sizeof(macho::Section));
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::Section64 &Value) {
+  SwapValue(Value.Address);
+  SwapValue(Value.Size);
+  SwapValue(Value.Offset);
+  SwapValue(Value.Align);
+  SwapValue(Value.RelocationTableOffset);
+  SwapValue(Value.NumRelocationTableEntries);
+  SwapValue(Value.Flags);
+  SwapValue(Value.Reserved1);
+  SwapValue(Value.Reserved2);
+  SwapValue(Value.Reserved3);
+}
+void MachOObject::ReadSection64(const LoadCommandInfo &LCI,
+                                unsigned Index,
+                                InMemoryStruct<macho::Section64> &Res) const {
+  assert(LCI.Command.Type == macho::LCT_Segment64 &&
+         "Unexpected load command info!");
+  uint64_t Offset = (LCI.Offset + sizeof(macho::Segment64LoadCommand) +
+                     Index * sizeof(macho::Section64));
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::RelocationEntry &Value) {
+  SwapValue(Value.Word0);
+  SwapValue(Value.Word1);
+}
+void MachOObject::ReadRelocationEntry(uint64_t RelocationTableOffset,
+                                      unsigned Index,
+                            InMemoryStruct<macho::RelocationEntry> &Res) const {
+  uint64_t Offset = (RelocationTableOffset +
+                     Index * sizeof(macho::RelocationEntry));
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::SymbolTableEntry &Value) {
+  SwapValue(Value.StringIndex);
+  SwapValue(Value.Flags);
+  SwapValue(Value.Value);
+}
+void MachOObject::ReadSymbolTableEntry(uint64_t SymbolTableOffset,
+                                       unsigned Index,
+                           InMemoryStruct<macho::SymbolTableEntry> &Res) const {
+  uint64_t Offset = (SymbolTableOffset +
+                     Index * sizeof(macho::SymbolTableEntry));
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::Symbol64TableEntry &Value) {
+  SwapValue(Value.StringIndex);
+  SwapValue(Value.Flags);
+  SwapValue(Value.Value);
+}
+void MachOObject::ReadSymbol64TableEntry(uint64_t SymbolTableOffset,
+                                       unsigned Index,
+                         InMemoryStruct<macho::Symbol64TableEntry> &Res) const {
+  uint64_t Offset = (SymbolTableOffset +
+                     Index * sizeof(macho::Symbol64TableEntry));
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
+}
+
+template<>
+void SwapStruct(macho::DataInCodeTableEntry &Value) {
+  SwapValue(Value.Offset);
+  SwapValue(Value.Length);
+  SwapValue(Value.Kind);
+}
+void MachOObject::ReadDataInCodeTableEntry(uint64_t TableOffset,
+                                           unsigned Index,
+                       InMemoryStruct<macho::DataInCodeTableEntry> &Res) const {
+  uint64_t Offset = (TableOffset +
+                     Index * sizeof(macho::DataInCodeTableEntry));
+  ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
+}
+
+void MachOObject::ReadULEB128s(uint64_t Index,
+                               SmallVectorImpl<uint64_t> &Out) const {
+  DataExtractor extractor(Buffer->getBuffer(), true, 0);
+
+  uint32_t offset = Index;
+  uint64_t data = 0;
+  while (uint64_t delta = extractor.getULEB128(&offset)) {
+    data += delta;
+    Out.push_back(data);
+  }
+}
+
+/* ** */
+// Object Dumping Facilities
+void MachOObject::dump() const { print(dbgs()); dbgs() << '\n'; }
+void MachOObject::dumpHeader() const { printHeader(dbgs()); dbgs() << '\n'; }
+
+void MachOObject::printHeader(raw_ostream &O) const {
+  O << "('cputype', " << Header.CPUType << ")\n";
+  O << "('cpusubtype', " << Header.CPUSubtype << ")\n";
+  O << "('filetype', " << Header.FileType << ")\n";
+  O << "('num_load_commands', " << Header.NumLoadCommands << ")\n";
+  O << "('load_commands_size', " << Header.SizeOfLoadCommands << ")\n";
+  O << "('flag', " << Header.Flags << ")\n";
+
+  // Print extended header if 64-bit.
+  if (is64Bit())
+    O << "('reserved', " << Header64Ext.Reserved << ")\n";
+}
+
+void MachOObject::print(raw_ostream &O) const {
+  O << "Header:\n";
+  printHeader(O);
+  O << "Load Commands:\n";
+
+  O << "Buffer:\n";
+}
diff --git a/contrib/llvm/lib/Object/MachOObjectFile.cpp b/contrib/llvm/lib/Object/MachOObjectFile.cpp
new file mode 100644
index 000000000000..6501df9fb986
--- /dev/null
+++ b/contrib/llvm/lib/Object/MachOObjectFile.cpp
@@ -0,0 +1,1340 @@
+//===- MachOObjectFile.cpp - Mach-O object file binding ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MachOObjectFile class, which binds the MachOObject
+// class to the generic ObjectFile wrapper.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/MachO.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cctype>
+#include <cstring>
+#include <limits>
+
+using namespace llvm;
+using namespace object;
+
+namespace llvm {
+namespace object {
+
+MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, MachOObject *MOO,
+                                 error_code &ec)
+    : ObjectFile(Binary::ID_MachO, Object, ec),
+      MachOObj(MOO),
+      RegisteredStringTable(std::numeric_limits<uint32_t>::max()) {
+  DataRefImpl DRI;
+  moveToNextSection(DRI);
+  uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands;
+  while (DRI.d.a < LoadCommandCount) {
+    Sections.push_back(DRI);
+    DRI.d.b++;
+    moveToNextSection(DRI);
+  }
+}
+
+
+ObjectFile *ObjectFile::createMachOObjectFile(MemoryBuffer *Buffer) {
+  error_code ec;
+  std::string Err;
+  MachOObject *MachOObj = MachOObject::LoadFromBuffer(Buffer, &Err);
+  if (!MachOObj)
+    return NULL;
+  // MachOObject takes ownership of the Buffer we passed to it, and
+  // MachOObjectFile does, too, so we need to make sure they don't get the
+  // same object. A MemoryBuffer is cheap (it's just a reference to memory,
+  // not a copy of the memory itself), so just make a new copy here for
+  // the MachOObjectFile.
+  MemoryBuffer *NewBuffer =
+    MemoryBuffer::getMemBuffer(Buffer->getBuffer(),
+                               Buffer->getBufferIdentifier(), false);
+  return new MachOObjectFile(NewBuffer, MachOObj, ec);
+}
+
+/*===-- Symbols -----------------------------------------------------------===*/
+
+void MachOObjectFile::moveToNextSymbol(DataRefImpl &DRI) const {
+  uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands;
+  while (DRI.d.a < LoadCommandCount) {
+    LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+    if (LCI.Command.Type == macho::LCT_Symtab) {
+      InMemoryStruct<macho::SymtabLoadCommand> SymtabLoadCmd;
+      MachOObj->ReadSymtabLoadCommand(LCI, SymtabLoadCmd);
+      if (DRI.d.b < SymtabLoadCmd->NumSymbolTableEntries)
+        return;
+    }
+
+    DRI.d.a++;
+    DRI.d.b = 0;
+  }
+}
+
+void MachOObjectFile::getSymbolTableEntry(DataRefImpl DRI,
+    InMemoryStruct<macho::SymbolTableEntry> &Res) const {
+  InMemoryStruct<macho::SymtabLoadCommand> SymtabLoadCmd;
+  LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+  MachOObj->ReadSymtabLoadCommand(LCI, SymtabLoadCmd);
+
+  if (RegisteredStringTable != DRI.d.a) {
+    MachOObj->RegisterStringTable(*SymtabLoadCmd);
+    RegisteredStringTable = DRI.d.a;
+  }
+
+  MachOObj->ReadSymbolTableEntry(SymtabLoadCmd->SymbolTableOffset, DRI.d.b,
+                                 Res);
+}
+
+void MachOObjectFile::getSymbol64TableEntry(DataRefImpl DRI,
+    InMemoryStruct<macho::Symbol64TableEntry> &Res) const {
+  InMemoryStruct<macho::SymtabLoadCommand> SymtabLoadCmd;
+  LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+  MachOObj->ReadSymtabLoadCommand(LCI, SymtabLoadCmd);
+
+  if (RegisteredStringTable != DRI.d.a) {
+    MachOObj->RegisterStringTable(*SymtabLoadCmd);
+    RegisteredStringTable = DRI.d.a;
+  }
+
+  MachOObj->ReadSymbol64TableEntry(SymtabLoadCmd->SymbolTableOffset, DRI.d.b,
+                                   Res);
+}
+
+
+error_code MachOObjectFile::getSymbolNext(DataRefImpl DRI,
+                                          SymbolRef &Result) const {
+  DRI.d.b++;
+  moveToNextSymbol(DRI);
+  Result = SymbolRef(DRI, this);
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolName(DataRefImpl DRI,
+                                          StringRef &Result) const {
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Symbol64TableEntry> Entry;
+    getSymbol64TableEntry(DRI, Entry);
+    Result = MachOObj->getStringAtIndex(Entry->StringIndex);
+  } else {
+    InMemoryStruct<macho::SymbolTableEntry> Entry;
+    getSymbolTableEntry(DRI, Entry);
+    Result = MachOObj->getStringAtIndex(Entry->StringIndex);
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolFileOffset(DataRefImpl DRI,
+                                                uint64_t &Result) const {
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Symbol64TableEntry> Entry;
+    getSymbol64TableEntry(DRI, Entry);
+    Result = Entry->Value;
+    if (Entry->SectionIndex) {
+      InMemoryStruct<macho::Section64> Section;
+      getSection64(Sections[Entry->SectionIndex-1], Section);
+      Result += Section->Offset - Section->Address;
+    }
+  } else {
+    InMemoryStruct<macho::SymbolTableEntry> Entry;
+    getSymbolTableEntry(DRI, Entry);
+    Result = Entry->Value;
+    if (Entry->SectionIndex) {
+      InMemoryStruct<macho::Section> Section;
+      getSection(Sections[Entry->SectionIndex-1], Section);
+      Result += Section->Offset - Section->Address;
+    }
+  }
+
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolAddress(DataRefImpl DRI,
+                                             uint64_t &Result) const {
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Symbol64TableEntry> Entry;
+    getSymbol64TableEntry(DRI, Entry);
+    Result = Entry->Value;
+  } else {
+    InMemoryStruct<macho::SymbolTableEntry> Entry;
+    getSymbolTableEntry(DRI, Entry);
+    Result = Entry->Value;
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
+                                          uint64_t &Result) const {
+  uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands;
+  uint64_t BeginOffset;
+  uint64_t EndOffset = 0;
+  uint8_t SectionIndex;
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Symbol64TableEntry> Entry;
+    getSymbol64TableEntry(DRI, Entry);
+    BeginOffset = Entry->Value;
+    SectionIndex = Entry->SectionIndex;
+    if (!SectionIndex) {
+      uint32_t flags = SymbolRef::SF_None;
+      getSymbolFlags(DRI, flags);
+      if (flags & SymbolRef::SF_Common)
+        Result = Entry->Value;
+      else
+        Result = UnknownAddressOrSize;
+      return object_error::success;
+    }
+    // Unfortunately symbols are unsorted so we need to touch all
+    // symbols from load command
+    DRI.d.b = 0;
+    uint32_t Command = DRI.d.a;
+    while (Command == DRI.d.a) {
+      moveToNextSymbol(DRI);
+      if (DRI.d.a < LoadCommandCount) {
+        getSymbol64TableEntry(DRI, Entry);
+        if (Entry->SectionIndex == SectionIndex && Entry->Value > BeginOffset)
+          if (!EndOffset || Entry->Value < EndOffset)
+            EndOffset = Entry->Value;
+      }
+      DRI.d.b++;
+    }
+  } else {
+    InMemoryStruct<macho::SymbolTableEntry> Entry;
+    getSymbolTableEntry(DRI, Entry);
+    BeginOffset = Entry->Value;
+    SectionIndex = Entry->SectionIndex;
+    if (!SectionIndex) {
+      uint32_t flags = SymbolRef::SF_None;
+      getSymbolFlags(DRI, flags);
+      if (flags & SymbolRef::SF_Common)
+        Result = Entry->Value;
+      else
+        Result = UnknownAddressOrSize;
+      return object_error::success;
+    }
+    // Unfortunately symbols are unsorted so we need to touch all
+    // symbols from load command
+    DRI.d.b = 0;
+    uint32_t Command = DRI.d.a;
+    while (Command == DRI.d.a) {
+      moveToNextSymbol(DRI);
+      if (DRI.d.a < LoadCommandCount) {
+        getSymbolTableEntry(DRI, Entry);
+        if (Entry->SectionIndex == SectionIndex && Entry->Value > BeginOffset)
+          if (!EndOffset || Entry->Value < EndOffset)
+            EndOffset = Entry->Value;
+      }
+      DRI.d.b++;
+    }
+  }
+  if (!EndOffset) {
+    uint64_t Size;
+    getSectionSize(Sections[SectionIndex-1], Size);
+    getSectionAddress(Sections[SectionIndex-1], EndOffset);
+    EndOffset += Size;
+  }
+  Result = EndOffset - BeginOffset;
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolNMTypeChar(DataRefImpl DRI,
+                                                char &Result) const {
+  uint8_t Type, Flags;
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Symbol64TableEntry> Entry;
+    getSymbol64TableEntry(DRI, Entry);
+    Type = Entry->Type;
+    Flags = Entry->Flags;
+  } else {
+    InMemoryStruct<macho::SymbolTableEntry> Entry;
+    getSymbolTableEntry(DRI, Entry);
+    Type = Entry->Type;
+    Flags = Entry->Flags;
+  }
+
+  char Char;
+  switch (Type & macho::STF_TypeMask) {
+    case macho::STT_Undefined:
+      Char = 'u';
+      break;
+    case macho::STT_Absolute:
+    case macho::STT_Section:
+      Char = 's';
+      break;
+    default:
+      Char = '?';
+      break;
+  }
+
+  if (Flags & (macho::STF_External | macho::STF_PrivateExtern))
+    Char = toupper(static_cast<unsigned char>(Char));
+  Result = Char;
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
+                                           uint32_t &Result) const {
+  uint16_t MachOFlags;
+  uint8_t MachOType;
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Symbol64TableEntry> Entry;
+    getSymbol64TableEntry(DRI, Entry);
+    MachOFlags = Entry->Flags;
+    MachOType = Entry->Type;
+  } else {
+    InMemoryStruct<macho::SymbolTableEntry> Entry;
+    getSymbolTableEntry(DRI, Entry);
+    MachOFlags = Entry->Flags;
+    MachOType = Entry->Type;
+  }
+
+  // TODO: Correctly set SF_ThreadLocal
+  Result = SymbolRef::SF_None;
+
+  if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined)
+    Result |= SymbolRef::SF_Undefined;
+
+  if (MachOFlags & macho::STF_StabsEntryMask)
+    Result |= SymbolRef::SF_FormatSpecific;
+
+  if (MachOType & MachO::NlistMaskExternal) {
+    Result |= SymbolRef::SF_Global;
+    if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined)
+      Result |= SymbolRef::SF_Common;
+  }
+
+  if (MachOFlags & (MachO::NListDescWeakRef | MachO::NListDescWeakDef))
+    Result |= SymbolRef::SF_Weak;
+
+  if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeAbsolute)
+    Result |= SymbolRef::SF_Absolute;
+
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolSection(DataRefImpl Symb,
+                                             section_iterator &Res) const {
+  uint8_t index;
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Symbol64TableEntry> Entry;
+    getSymbol64TableEntry(Symb, Entry);
+    index = Entry->SectionIndex;
+  } else {
+    InMemoryStruct<macho::SymbolTableEntry> Entry;
+    getSymbolTableEntry(Symb, Entry);
+    index = Entry->SectionIndex;
+  }
+
+  if (index == 0)
+    Res = end_sections();
+  else
+    Res = section_iterator(SectionRef(Sections[index-1], this));
+
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolType(DataRefImpl Symb,
+                                          SymbolRef::Type &Res) const {
+  uint8_t n_type;
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Symbol64TableEntry> Entry;
+    getSymbol64TableEntry(Symb, Entry);
+    n_type = Entry->Type;
+  } else {
+    InMemoryStruct<macho::SymbolTableEntry> Entry;
+    getSymbolTableEntry(Symb, Entry);
+    n_type = Entry->Type;
+  }
+  Res = SymbolRef::ST_Other;
+
+  // If this is a STAB debugging symbol, we can do nothing more.
+  if (n_type & MachO::NlistMaskStab) {
+    Res = SymbolRef::ST_Debug;
+    return object_error::success;
+  }
+
+  switch (n_type & MachO::NlistMaskType) {
+    case MachO::NListTypeUndefined :
+      Res = SymbolRef::ST_Unknown;
+      break;
+    case MachO::NListTypeSection :
+      Res = SymbolRef::ST_Function;
+      break;
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolValue(DataRefImpl Symb,
+                                           uint64_t &Val) const {
+  report_fatal_error("getSymbolValue unimplemented in MachOObjectFile");
+}
+
+symbol_iterator MachOObjectFile::begin_symbols() const {
+  // DRI.d.a = segment number; DRI.d.b = symbol index.
+  DataRefImpl DRI;
+  moveToNextSymbol(DRI);
+  return symbol_iterator(SymbolRef(DRI, this));
+}
+
+symbol_iterator MachOObjectFile::end_symbols() const {
+  DataRefImpl DRI;
+  DRI.d.a = MachOObj->getHeader().NumLoadCommands;
+  return symbol_iterator(SymbolRef(DRI, this));
+}
+
+symbol_iterator MachOObjectFile::begin_dynamic_symbols() const {
+  // TODO: implement
+  report_fatal_error("Dynamic symbols unimplemented in MachOObjectFile");
+}
+
+symbol_iterator MachOObjectFile::end_dynamic_symbols() const {
+  // TODO: implement
+  report_fatal_error("Dynamic symbols unimplemented in MachOObjectFile");
+}
+
+library_iterator MachOObjectFile::begin_libraries_needed() const {
+  // TODO: implement
+  report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
+}
+
+library_iterator MachOObjectFile::end_libraries_needed() const {
+  // TODO: implement
+  report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
+}
+
+StringRef MachOObjectFile::getLoadName() const {
+  // TODO: Implement
+  report_fatal_error("get_load_name() unimplemented in MachOObjectFile");
+}
+
+/*===-- Sections ----------------------------------------------------------===*/
+
+void MachOObjectFile::moveToNextSection(DataRefImpl &DRI) const {
+  uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands;
+  while (DRI.d.a < LoadCommandCount) {
+    LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+    if (LCI.Command.Type == macho::LCT_Segment) {
+      InMemoryStruct<macho::SegmentLoadCommand> SegmentLoadCmd;
+      MachOObj->ReadSegmentLoadCommand(LCI, SegmentLoadCmd);
+      if (DRI.d.b < SegmentLoadCmd->NumSections)
+        return;
+    } else if (LCI.Command.Type == macho::LCT_Segment64) {
+      InMemoryStruct<macho::Segment64LoadCommand> Segment64LoadCmd;
+      MachOObj->ReadSegment64LoadCommand(LCI, Segment64LoadCmd);
+      if (DRI.d.b < Segment64LoadCmd->NumSections)
+        return;
+    }
+
+    DRI.d.a++;
+    DRI.d.b = 0;
+  }
+}
+
+error_code MachOObjectFile::getSectionNext(DataRefImpl DRI,
+                                           SectionRef &Result) const {
+  DRI.d.b++;
+  moveToNextSection(DRI);
+  Result = SectionRef(DRI, this);
+  return object_error::success;
+}
+
+void
+MachOObjectFile::getSection(DataRefImpl DRI,
+                            InMemoryStruct<macho::Section> &Res) const {
+  LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+  MachOObj->ReadSection(LCI, DRI.d.b, Res);
+}
+
+std::size_t MachOObjectFile::getSectionIndex(DataRefImpl Sec) const {
+  SectionList::const_iterator loc =
+    std::find(Sections.begin(), Sections.end(), Sec);
+  assert(loc != Sections.end() && "Sec is not a valid section!");
+  return std::distance(Sections.begin(), loc);
+}
+
+void
+MachOObjectFile::getSection64(DataRefImpl DRI,
+                            InMemoryStruct<macho::Section64> &Res) const {
+  LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+  MachOObj->ReadSection64(LCI, DRI.d.b, Res);
+}
+
+static bool is64BitLoadCommand(const MachOObject *MachOObj, DataRefImpl DRI) {
+  LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+  if (LCI.Command.Type == macho::LCT_Segment64)
+    return true;
+  assert(LCI.Command.Type == macho::LCT_Segment && "Unexpected Type.");
+  return false;
+}
+
+static StringRef parseSegmentOrSectionName(const char *P) {
+  if (P[15] == 0)
+    // Null terminated.
+    return P;
+  // Not null terminated, so this is a 16 char string.
+  return StringRef(P, 16);
+}
+
+error_code MachOObjectFile::getSectionName(DataRefImpl DRI,
+                                           StringRef &Result) const {
+  if (is64BitLoadCommand(MachOObj.get(), DRI)) {
+    LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+    unsigned SectionOffset = LCI.Offset + sizeof(macho::Segment64LoadCommand) +
+      DRI.d.b * sizeof(macho::Section64);
+    StringRef Data = MachOObj->getData(SectionOffset, sizeof(macho::Section64));
+    const macho::Section64 *sec =
+      reinterpret_cast<const macho::Section64*>(Data.data());
+    Result = parseSegmentOrSectionName(sec->Name);
+  } else {
+    LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a);
+    unsigned SectionOffset = LCI.Offset + sizeof(macho::SegmentLoadCommand) +
+      DRI.d.b * sizeof(macho::Section);
+    StringRef Data = MachOObj->getData(SectionOffset, sizeof(macho::Section));
+    const macho::Section *sec =
+      reinterpret_cast<const macho::Section*>(Data.data());
+    Result = parseSegmentOrSectionName(sec->Name);
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSectionFinalSegmentName(DataRefImpl Sec,
+                                                       StringRef &Res) const {
+  if (is64BitLoadCommand(MachOObj.get(), Sec)) {
+    LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(Sec.d.a);
+    unsigned SectionOffset = LCI.Offset + sizeof(macho::Segment64LoadCommand) +
+      Sec.d.b * sizeof(macho::Section64);
+    StringRef Data = MachOObj->getData(SectionOffset, sizeof(macho::Section64));
+    const macho::Section64 *sec =
+      reinterpret_cast<const macho::Section64*>(Data.data());
+    Res = parseSegmentOrSectionName(sec->SegmentName);
+  } else {
+    LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(Sec.d.a);
+    unsigned SectionOffset = LCI.Offset + sizeof(macho::SegmentLoadCommand) +
+      Sec.d.b * sizeof(macho::Section);
+    StringRef Data = MachOObj->getData(SectionOffset, sizeof(macho::Section));
+    const macho::Section *sec =
+      reinterpret_cast<const macho::Section*>(Data.data());
+    Res = parseSegmentOrSectionName(sec->SegmentName);
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSectionAddress(DataRefImpl DRI,
+                                              uint64_t &Result) const {
+  if (is64BitLoadCommand(MachOObj.get(), DRI)) {
+    InMemoryStruct<macho::Section64> Sect;
+    getSection64(DRI, Sect);
+    Result = Sect->Address;
+  } else {
+    InMemoryStruct<macho::Section> Sect;
+    getSection(DRI, Sect);
+    Result = Sect->Address;
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSectionSize(DataRefImpl DRI,
+                                           uint64_t &Result) const {
+  if (is64BitLoadCommand(MachOObj.get(), DRI)) {
+    InMemoryStruct<macho::Section64> Sect;
+    getSection64(DRI, Sect);
+    Result = Sect->Size;
+  } else {
+    InMemoryStruct<macho::Section> Sect;
+    getSection(DRI, Sect);
+    Result = Sect->Size;
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSectionContents(DataRefImpl DRI,
+                                               StringRef &Result) const {
+  if (is64BitLoadCommand(MachOObj.get(), DRI)) {
+    InMemoryStruct<macho::Section64> Sect;
+    getSection64(DRI, Sect);
+    Result = MachOObj->getData(Sect->Offset, Sect->Size);
+  } else {
+    InMemoryStruct<macho::Section> Sect;
+    getSection(DRI, Sect);
+    Result = MachOObj->getData(Sect->Offset, Sect->Size);
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getSectionAlignment(DataRefImpl DRI,
+                                                uint64_t &Result) const {
+  if (is64BitLoadCommand(MachOObj.get(), DRI)) {
+    InMemoryStruct<macho::Section64> Sect;
+    getSection64(DRI, Sect);
+    Result = uint64_t(1) << Sect->Align;
+  } else {
+    InMemoryStruct<macho::Section> Sect;
+    getSection(DRI, Sect);
+    Result = uint64_t(1) << Sect->Align;
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::isSectionText(DataRefImpl DRI,
+                                          bool &Result) const {
+  if (is64BitLoadCommand(MachOObj.get(), DRI)) {
+    InMemoryStruct<macho::Section64> Sect;
+    getSection64(DRI, Sect);
+    Result = Sect->Flags & macho::SF_PureInstructions;
+  } else {
+    InMemoryStruct<macho::Section> Sect;
+    getSection(DRI, Sect);
+    Result = Sect->Flags & macho::SF_PureInstructions;
+  }
+  return object_error::success;
+}
+
+error_code MachOObjectFile::isSectionData(DataRefImpl DRI,
+                                          bool &Result) const {
+  // FIXME: Unimplemented.
+  Result = false;
+  return object_error::success;
+}
+
+error_code MachOObjectFile::isSectionBSS(DataRefImpl DRI,
+                                         bool &Result) const {
+  // FIXME: Unimplemented.
+  Result = false;
+  return object_error::success;
+}
+
+error_code MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sec,
+                                                          bool &Result) const {
+  // FIXME: Unimplemented.
+  Result = true;
+  return object_error::success;
+}
+
+error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec,
+                                             bool &Result) const {
+  // FIXME: Unimplemented.
+  Result = false;
+  return object_error::success;
+}
+
+error_code MachOObjectFile::isSectionZeroInit(DataRefImpl DRI,
+                                              bool &Result) const {
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Section64> Sect;
+    getSection64(DRI, Sect);
+    unsigned SectionType = Sect->Flags & MachO::SectionFlagMaskSectionType;
+    Result = (SectionType == MachO::SectionTypeZeroFill ||
+              SectionType == MachO::SectionTypeZeroFillLarge);
+  } else {
+    InMemoryStruct<macho::Section> Sect;
+    getSection(DRI, Sect);
+    unsigned SectionType = Sect->Flags & MachO::SectionFlagMaskSectionType;
+    Result = (SectionType == MachO::SectionTypeZeroFill ||
+              SectionType == MachO::SectionTypeZeroFillLarge);
+  }
+
+  return object_error::success;
+}
+
+error_code MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec,
+                                                  bool &Result) const {
+  // Consider using the code from isSectionText to look for __const sections.
+  // Alternately, emit S_ATTR_PURE_INSTRUCTIONS and/or S_ATTR_SOME_INSTRUCTIONS
+  // to use section attributes to distinguish code from data.
+
+  // FIXME: Unimplemented.
+  Result = false;
+  return object_error::success;
+}
+
+error_code MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec,
+                                                  DataRefImpl Symb,
+                                                  bool &Result) const {
+  SymbolRef::Type ST;
+  getSymbolType(Symb, ST);
+  if (ST == SymbolRef::ST_Unknown) {
+    Result = false;
+    return object_error::success;
+  }
+
+  uint64_t SectBegin, SectEnd;
+  getSectionAddress(Sec, SectBegin);
+  getSectionSize(Sec, SectEnd);
+  SectEnd += SectBegin;
+
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Symbol64TableEntry> Entry;
+    getSymbol64TableEntry(Symb, Entry);
+    uint64_t SymAddr= Entry->Value;
+    Result = (SymAddr >= SectBegin) && (SymAddr < SectEnd);
+  } else {
+    InMemoryStruct<macho::SymbolTableEntry> Entry;
+    getSymbolTableEntry(Symb, Entry);
+    uint64_t SymAddr= Entry->Value;
+    Result = (SymAddr >= SectBegin) && (SymAddr < SectEnd);
+  }
+
+  return object_error::success;
+}
+
+relocation_iterator MachOObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+  DataRefImpl ret;
+  ret.d.b = getSectionIndex(Sec);
+  return relocation_iterator(RelocationRef(ret, this));
+}
+relocation_iterator MachOObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+  uint32_t last_reloc;
+  if (is64BitLoadCommand(MachOObj.get(), Sec)) {
+    InMemoryStruct<macho::Section64> Sect;
+    getSection64(Sec, Sect);
+    last_reloc = Sect->NumRelocationTableEntries;
+  } else {
+    InMemoryStruct<macho::Section> Sect;
+    getSection(Sec, Sect);
+    last_reloc = Sect->NumRelocationTableEntries;
+  }
+  DataRefImpl ret;
+  ret.d.a = last_reloc;
+  ret.d.b = getSectionIndex(Sec);
+  return relocation_iterator(RelocationRef(ret, this));
+}
+
+section_iterator MachOObjectFile::begin_sections() const {
+  DataRefImpl DRI;
+  moveToNextSection(DRI);
+  return section_iterator(SectionRef(DRI, this));
+}
+
+section_iterator MachOObjectFile::end_sections() const {
+  DataRefImpl DRI;
+  DRI.d.a = MachOObj->getHeader().NumLoadCommands;
+  return section_iterator(SectionRef(DRI, this));
+}
+
+/*===-- Relocations -------------------------------------------------------===*/
+
+void MachOObjectFile::
+getRelocation(DataRefImpl Rel,
+              InMemoryStruct<macho::RelocationEntry> &Res) const {
+  uint32_t relOffset;
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Section64> Sect;
+    getSection64(Sections[Rel.d.b], Sect);
+    relOffset = Sect->RelocationTableOffset;
+  } else {
+    InMemoryStruct<macho::Section> Sect;
+    getSection(Sections[Rel.d.b], Sect);
+    relOffset = Sect->RelocationTableOffset;
+  }
+  MachOObj->ReadRelocationEntry(relOffset, Rel.d.a, Res);
+}
+error_code MachOObjectFile::getRelocationNext(DataRefImpl Rel,
+                                              RelocationRef &Res) const {
+  ++Rel.d.a;
+  Res = RelocationRef(Rel, this);
+  return object_error::success;
+}
+error_code MachOObjectFile::getRelocationAddress(DataRefImpl Rel,
+                                                 uint64_t &Res) const {
+  const uint8_t* sectAddress = 0;
+  if (MachOObj->is64Bit()) {
+    InMemoryStruct<macho::Section64> Sect;
+    getSection64(Sections[Rel.d.b], Sect);
+    sectAddress += Sect->Address;
+  } else {
+    InMemoryStruct<macho::Section> Sect;
+    getSection(Sections[Rel.d.b], Sect);
+    sectAddress += Sect->Address;
+  }
+  InMemoryStruct<macho::RelocationEntry> RE;
+  getRelocation(Rel, RE);
+
+  unsigned Arch = getArch();
+  bool isScattered = (Arch != Triple::x86_64) &&
+                     (RE->Word0 & macho::RF_Scattered);
+  uint64_t RelAddr = 0;
+  if (isScattered)
+    RelAddr = RE->Word0 & 0xFFFFFF;
+  else
+    RelAddr = RE->Word0;
+
+  Res = reinterpret_cast<uintptr_t>(sectAddress + RelAddr);
+  return object_error::success;
+}
+error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel,
+                                                uint64_t &Res) const {
+  InMemoryStruct<macho::RelocationEntry> RE;
+  getRelocation(Rel, RE);
+
+  unsigned Arch = getArch();
+  bool isScattered = (Arch != Triple::x86_64) &&
+                     (RE->Word0 & macho::RF_Scattered);
+  if (isScattered)
+    Res = RE->Word0 & 0xFFFFFF;
+  else
+    Res = RE->Word0;
+  return object_error::success;
+}
+error_code MachOObjectFile::getRelocationSymbol(DataRefImpl Rel,
+                                                SymbolRef &Res) const {
+  InMemoryStruct<macho::RelocationEntry> RE;
+  getRelocation(Rel, RE);
+  uint32_t SymbolIdx = RE->Word1 & 0xffffff;
+  bool isExtern = (RE->Word1 >> 27) & 1;
+
+  DataRefImpl Sym;
+  moveToNextSymbol(Sym);
+  if (isExtern) {
+    for (unsigned i = 0; i < SymbolIdx; i++) {
+      Sym.d.b++;
+      moveToNextSymbol(Sym);
+      assert(Sym.d.a < MachOObj->getHeader().NumLoadCommands &&
+             "Relocation symbol index out of range!");
+    }
+  }
+  Res = SymbolRef(Sym, this);
+  return object_error::success;
+}
+error_code MachOObjectFile::getRelocationType(DataRefImpl Rel,
+                                              uint64_t &Res) const {
+  InMemoryStruct<macho::RelocationEntry> RE;
+  getRelocation(Rel, RE);
+  Res = RE->Word0;
+  Res <<= 32;
+  Res |= RE->Word1;
+  return object_error::success;
+}
+error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
+                                          SmallVectorImpl<char> &Result) const {
+  // TODO: Support scattered relocations.
+  StringRef res;
+  InMemoryStruct<macho::RelocationEntry> RE;
+  getRelocation(Rel, RE);
+
+  unsigned Arch = getArch();
+  bool isScattered = (Arch != Triple::x86_64) &&
+                     (RE->Word0 & macho::RF_Scattered);
+
+  unsigned r_type;
+  if (isScattered)
+    r_type = (RE->Word0 >> 24) & 0xF;
+  else
+    r_type = (RE->Word1 >> 28) & 0xF;
+
+  switch (Arch) {
+    case Triple::x86: {
+      static const char *const Table[] =  {
+        "GENERIC_RELOC_VANILLA",
+        "GENERIC_RELOC_PAIR",
+        "GENERIC_RELOC_SECTDIFF",
+        "GENERIC_RELOC_PB_LA_PTR",
+        "GENERIC_RELOC_LOCAL_SECTDIFF",
+        "GENERIC_RELOC_TLV" };
+
+      if (r_type > 6)
+        res = "Unknown";
+      else
+        res = Table[r_type];
+      break;
+    }
+    case Triple::x86_64: {
+      static const char *const Table[] =  {
+        "X86_64_RELOC_UNSIGNED",
+        "X86_64_RELOC_SIGNED",
+        "X86_64_RELOC_BRANCH",
+        "X86_64_RELOC_GOT_LOAD",
+        "X86_64_RELOC_GOT",
+        "X86_64_RELOC_SUBTRACTOR",
+        "X86_64_RELOC_SIGNED_1",
+        "X86_64_RELOC_SIGNED_2",
+        "X86_64_RELOC_SIGNED_4",
+        "X86_64_RELOC_TLV" };
+
+      if (r_type > 9)
+        res = "Unknown";
+      else
+        res = Table[r_type];
+      break;
+    }
+    case Triple::arm: {
+      static const char *const Table[] =  {
+        "ARM_RELOC_VANILLA",
+        "ARM_RELOC_PAIR",
+        "ARM_RELOC_SECTDIFF",
+        "ARM_RELOC_LOCAL_SECTDIFF",
+        "ARM_RELOC_PB_LA_PTR",
+        "ARM_RELOC_BR24",
+        "ARM_THUMB_RELOC_BR22",
+        "ARM_THUMB_32BIT_BRANCH",
+        "ARM_RELOC_HALF",
+        "ARM_RELOC_HALF_SECTDIFF" };
+
+      if (r_type > 9)
+        res = "Unknown";
+      else
+        res = Table[r_type];
+      break;
+    }
+    case Triple::ppc: {
+      static const char *const Table[] =  {
+        "PPC_RELOC_VANILLA",
+        "PPC_RELOC_PAIR",
+        "PPC_RELOC_BR14",
+        "PPC_RELOC_BR24",
+        "PPC_RELOC_HI16",
+        "PPC_RELOC_LO16",
+        "PPC_RELOC_HA16",
+        "PPC_RELOC_LO14",
+        "PPC_RELOC_SECTDIFF",
+        "PPC_RELOC_PB_LA_PTR",
+        "PPC_RELOC_HI16_SECTDIFF",
+        "PPC_RELOC_LO16_SECTDIFF",
+        "PPC_RELOC_HA16_SECTDIFF",
+        "PPC_RELOC_JBSR",
+        "PPC_RELOC_LO14_SECTDIFF",
+        "PPC_RELOC_LOCAL_SECTDIFF" };
+
+      res = Table[r_type];
+      break;
+    }
+    case Triple::UnknownArch:
+      res = "Unknown";
+      break;
+  }
+  Result.append(res.begin(), res.end());
+  return object_error::success;
+}
+error_code MachOObjectFile::getRelocationAdditionalInfo(DataRefImpl Rel,
+                                                        int64_t &Res) const {
+  InMemoryStruct<macho::RelocationEntry> RE;
+  getRelocation(Rel, RE);
+  bool isExtern = (RE->Word1 >> 27) & 1;
+  Res = 0;
+  if (!isExtern) {
+    const uint8_t* sectAddress = base();
+    if (MachOObj->is64Bit()) {
+      InMemoryStruct<macho::Section64> Sect;
+      getSection64(Sections[Rel.d.b], Sect);
+      sectAddress += Sect->Offset;
+    } else {
+      InMemoryStruct<macho::Section> Sect;
+      getSection(Sections[Rel.d.b], Sect);
+      sectAddress += Sect->Offset;
+    }
+    Res = reinterpret_cast<uintptr_t>(sectAddress);
+  }
+  return object_error::success;
+}
+
+// Helper to advance a section or symbol iterator multiple increments at a time.
+template<class T>
+error_code advance(T &it, size_t Val) {
+  error_code ec;
+  while (Val--) {
+    it.increment(ec);
+  }
+  return ec;
+}
+
+template<class T>
+void advanceTo(T &it, size_t Val) {
+  if (error_code ec = advance(it, Val))
+    report_fatal_error(ec.message());
+}
+
+void MachOObjectFile::printRelocationTargetName(
+                                     InMemoryStruct<macho::RelocationEntry>& RE,
+                                     raw_string_ostream &fmt) const {
+  unsigned Arch = getArch();
+  bool isScattered = (Arch != Triple::x86_64) &&
+                     (RE->Word0 & macho::RF_Scattered);
+
+  // Target of a scattered relocation is an address.  In the interest of
+  // generating pretty output, scan through the symbol table looking for a
+  // symbol that aligns with that address.  If we find one, print it.
+  // Otherwise, we just print the hex address of the target.
+  if (isScattered) {
+    uint32_t Val = RE->Word1;
+
+    error_code ec;
+    for (symbol_iterator SI = begin_symbols(), SE = end_symbols(); SI != SE;
+        SI.increment(ec)) {
+      if (ec) report_fatal_error(ec.message());
+
+      uint64_t Addr;
+      StringRef Name;
+
+      if ((ec = SI->getAddress(Addr)))
+        report_fatal_error(ec.message());
+      if (Addr != Val) continue;
+      if ((ec = SI->getName(Name)))
+        report_fatal_error(ec.message());
+      fmt << Name;
+      return;
+    }
+
+    // If we couldn't find a symbol that this relocation refers to, try
+    // to find a section beginning instead.
+    for (section_iterator SI = begin_sections(), SE = end_sections(); SI != SE;
+         SI.increment(ec)) {
+      if (ec) report_fatal_error(ec.message());
+
+      uint64_t Addr;
+      StringRef Name;
+
+      if ((ec = SI->getAddress(Addr)))
+        report_fatal_error(ec.message());
+      if (Addr != Val) continue;
+      if ((ec = SI->getName(Name)))
+        report_fatal_error(ec.message());
+      fmt << Name;
+      return;
+    }
+
+    fmt << format("0x%x", Val);
+    return;
+  }
+
+  StringRef S;
+  bool isExtern = (RE->Word1 >> 27) & 1;
+  uint32_t Val = RE->Word1 & 0xFFFFFF;
+
+  if (isExtern) {
+    symbol_iterator SI = begin_symbols();
+    advanceTo(SI, Val);
+    SI->getName(S);
+  } else {
+    section_iterator SI = begin_sections();
+    advanceTo(SI, Val);
+    SI->getName(S);
+  }
+
+  fmt << S;
+}
+
+error_code MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
+                                          SmallVectorImpl<char> &Result) const {
+  InMemoryStruct<macho::RelocationEntry> RE;
+  getRelocation(Rel, RE);
+
+  unsigned Arch = getArch();
+  bool isScattered = (Arch != Triple::x86_64) &&
+                     (RE->Word0 & macho::RF_Scattered);
+
+  std::string fmtbuf;
+  raw_string_ostream fmt(fmtbuf);
+
+  unsigned Type;
+  if (isScattered)
+    Type = (RE->Word0 >> 24) & 0xF;
+  else
+    Type = (RE->Word1 >> 28) & 0xF;
+
+  bool isPCRel;
+  if (isScattered)
+    isPCRel = ((RE->Word0 >> 30) & 1);
+  else
+    isPCRel = ((RE->Word1 >> 24) & 1);
+
+  // Determine any addends that should be displayed with the relocation.
+  // These require decoding the relocation type, which is triple-specific.
+
+  // X86_64 has entirely custom relocation types.
+  if (Arch == Triple::x86_64) {
+    bool isPCRel = ((RE->Word1 >> 24) & 1);
+
+    switch (Type) {
+      case macho::RIT_X86_64_GOTLoad:   // X86_64_RELOC_GOT_LOAD
+      case macho::RIT_X86_64_GOT: {     // X86_64_RELOC_GOT
+        printRelocationTargetName(RE, fmt);
+        fmt << "@GOT";
+        if (isPCRel) fmt << "PCREL";
+        break;
+      }
+      case macho::RIT_X86_64_Subtractor: { // X86_64_RELOC_SUBTRACTOR
+        InMemoryStruct<macho::RelocationEntry> RENext;
+        DataRefImpl RelNext = Rel;
+        RelNext.d.a++;
+        getRelocation(RelNext, RENext);
+
+        // X86_64_SUBTRACTOR must be followed by a relocation of type
+        // X86_64_RELOC_UNSIGNED.
+        // NOTE: Scattered relocations don't exist on x86_64.
+        unsigned RType = (RENext->Word1 >> 28) & 0xF;
+        if (RType != 0)
+          report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
+                             "X86_64_RELOC_SUBTRACTOR.");
+
+        // The X86_64_RELOC_UNSIGNED contains the minuend symbol,
+        // X86_64_SUBTRACTOR contains to the subtrahend.
+        printRelocationTargetName(RENext, fmt);
+        fmt << "-";
+        printRelocationTargetName(RE, fmt);
+        break;
+      }
+      case macho::RIT_X86_64_TLV:
+        printRelocationTargetName(RE, fmt);
+        fmt << "@TLV";
+        if (isPCRel) fmt << "P";
+        break;
+      case macho::RIT_X86_64_Signed1: // X86_64_RELOC_SIGNED1
+        printRelocationTargetName(RE, fmt);
+        fmt << "-1";
+        break;
+      case macho::RIT_X86_64_Signed2: // X86_64_RELOC_SIGNED2
+        printRelocationTargetName(RE, fmt);
+        fmt << "-2";
+        break;
+      case macho::RIT_X86_64_Signed4: // X86_64_RELOC_SIGNED4
+        printRelocationTargetName(RE, fmt);
+        fmt << "-4";
+        break;
+      default:
+        printRelocationTargetName(RE, fmt);
+        break;
+    }
+  // X86 and ARM share some relocation types in common.
+  } else if (Arch == Triple::x86 || Arch == Triple::arm) {
+    // Generic relocation types...
+    switch (Type) {
+      case macho::RIT_Pair: // GENERIC_RELOC_PAIR - prints no info
+        return object_error::success;
+      case macho::RIT_Difference: { // GENERIC_RELOC_SECTDIFF
+        InMemoryStruct<macho::RelocationEntry> RENext;
+        DataRefImpl RelNext = Rel;
+        RelNext.d.a++;
+        getRelocation(RelNext, RENext);
+
+        // X86 sect diff's must be followed by a relocation of type
+        // GENERIC_RELOC_PAIR.
+        bool isNextScattered = (Arch != Triple::x86_64) &&
+                               (RENext->Word0 & macho::RF_Scattered);
+        unsigned RType;
+        if (isNextScattered)
+          RType = (RENext->Word0 >> 24) & 0xF;
+        else
+          RType = (RENext->Word1 >> 28) & 0xF;
+        if (RType != 1)
+          report_fatal_error("Expected GENERIC_RELOC_PAIR after "
+                             "GENERIC_RELOC_SECTDIFF.");
+
+        printRelocationTargetName(RE, fmt);
+        fmt << "-";
+        printRelocationTargetName(RENext, fmt);
+        break;
+      }
+    }
+
+    if (Arch == Triple::x86) {
+      // All X86 relocations that need special printing were already
+      // handled in the generic code.
+      switch (Type) {
+        case macho::RIT_Generic_LocalDifference:{// GENERIC_RELOC_LOCAL_SECTDIFF
+          InMemoryStruct<macho::RelocationEntry> RENext;
+          DataRefImpl RelNext = Rel;
+          RelNext.d.a++;
+          getRelocation(RelNext, RENext);
+
+          // X86 sect diff's must be followed by a relocation of type
+          // GENERIC_RELOC_PAIR.
+          bool isNextScattered = (Arch != Triple::x86_64) &&
+                               (RENext->Word0 & macho::RF_Scattered);
+          unsigned RType;
+          if (isNextScattered)
+            RType = (RENext->Word0 >> 24) & 0xF;
+          else
+            RType = (RENext->Word1 >> 28) & 0xF;
+          if (RType != 1)
+            report_fatal_error("Expected GENERIC_RELOC_PAIR after "
+                               "GENERIC_RELOC_LOCAL_SECTDIFF.");
+
+          printRelocationTargetName(RE, fmt);
+          fmt << "-";
+          printRelocationTargetName(RENext, fmt);
+          break;
+        }
+        case macho::RIT_Generic_TLV: {
+          printRelocationTargetName(RE, fmt);
+          fmt << "@TLV";
+          if (isPCRel) fmt << "P";
+          break;
+        }
+        default:
+          printRelocationTargetName(RE, fmt);
+      }
+    } else { // ARM-specific relocations
+      switch (Type) {
+        case macho::RIT_ARM_Half:             // ARM_RELOC_HALF
+        case macho::RIT_ARM_HalfDifference: { // ARM_RELOC_HALF_SECTDIFF
+          // Half relocations steal a bit from the length field to encode
+          // whether this is an upper16 or a lower16 relocation.
+          bool isUpper;
+          if (isScattered)
+            isUpper = (RE->Word0 >> 28) & 1;
+          else
+            isUpper = (RE->Word1 >> 25) & 1;
+
+          if (isUpper)
+            fmt << ":upper16:(";
+          else
+            fmt << ":lower16:(";
+          printRelocationTargetName(RE, fmt);
+
+          InMemoryStruct<macho::RelocationEntry> RENext;
+          DataRefImpl RelNext = Rel;
+          RelNext.d.a++;
+          getRelocation(RelNext, RENext);
+
+          // ARM half relocs must be followed by a relocation of type
+          // ARM_RELOC_PAIR.
+          bool isNextScattered = (Arch != Triple::x86_64) &&
+                                 (RENext->Word0 & macho::RF_Scattered);
+          unsigned RType;
+          if (isNextScattered)
+            RType = (RENext->Word0 >> 24) & 0xF;
+          else
+            RType = (RENext->Word1 >> 28) & 0xF;
+
+          if (RType != 1)
+            report_fatal_error("Expected ARM_RELOC_PAIR after "
+                               "GENERIC_RELOC_HALF");
+
+          // NOTE: The half of the target virtual address is stashed in the
+          // address field of the secondary relocation, but we can't reverse
+          // engineer the constant offset from it without decoding the movw/movt
+          // instruction to find the other half in its immediate field.
+
+          // ARM_RELOC_HALF_SECTDIFF encodes the second section in the
+          // symbol/section pointer of the follow-on relocation.
+          if (Type == macho::RIT_ARM_HalfDifference) {
+            fmt << "-";
+            printRelocationTargetName(RENext, fmt);
+          }
+
+          fmt << ")";
+          break;
+        }
+        default: {
+          printRelocationTargetName(RE, fmt);
+        }
+      }
+    }
+  } else
+    printRelocationTargetName(RE, fmt);
+
+  fmt.flush();
+  Result.append(fmtbuf.begin(), fmtbuf.end());
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getRelocationHidden(DataRefImpl Rel,
+                                                bool &Result) const {
+  InMemoryStruct<macho::RelocationEntry> RE;
+  getRelocation(Rel, RE);
+
+  unsigned Arch = getArch();
+  bool isScattered = (Arch != Triple::x86_64) &&
+                     (RE->Word0 & macho::RF_Scattered);
+  unsigned Type;
+  if (isScattered)
+    Type = (RE->Word0 >> 24) & 0xF;
+  else
+    Type = (RE->Word1 >> 28) & 0xF;
+
+  Result = false;
+
+  // On arches that use the generic relocations, GENERIC_RELOC_PAIR
+  // is always hidden.
+  if (Arch == Triple::x86 || Arch == Triple::arm) {
+    if (Type == macho::RIT_Pair) Result = true;
+  } else if (Arch == Triple::x86_64) {
+    // On x86_64, X86_64_RELOC_UNSIGNED is hidden only when it follows
+    // an X864_64_RELOC_SUBTRACTOR.
+    if (Type == macho::RIT_X86_64_Unsigned && Rel.d.a > 0) {
+      DataRefImpl RelPrev = Rel;
+      RelPrev.d.a--;
+      InMemoryStruct<macho::RelocationEntry> REPrev;
+      getRelocation(RelPrev, REPrev);
+
+      unsigned PrevType = (REPrev->Word1 >> 28) & 0xF;
+
+      if (PrevType == macho::RIT_X86_64_Subtractor) Result = true;
+    }
+  }
+
+  return object_error::success;
+}
+
+error_code MachOObjectFile::getLibraryNext(DataRefImpl LibData,
+                                           LibraryRef &Res) const {
+  report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
+}
+
+error_code MachOObjectFile::getLibraryPath(DataRefImpl LibData,
+                                           StringRef &Res) const {
+  report_fatal_error("Needed libraries unimplemented in MachOObjectFile");
+}
+
+
+/*===-- Miscellaneous -----------------------------------------------------===*/
+
+uint8_t MachOObjectFile::getBytesInAddress() const {
+  return MachOObj->is64Bit() ? 8 : 4;
+}
+
+StringRef MachOObjectFile::getFileFormatName() const {
+  if (!MachOObj->is64Bit()) {
+    switch (MachOObj->getHeader().CPUType) {
+    case llvm::MachO::CPUTypeI386:
+      return "Mach-O 32-bit i386";
+    case llvm::MachO::CPUTypeARM:
+      return "Mach-O arm";
+    case llvm::MachO::CPUTypePowerPC:
+      return "Mach-O 32-bit ppc";
+    default:
+      assert((MachOObj->getHeader().CPUType & llvm::MachO::CPUArchABI64) == 0 &&
+             "64-bit object file when we're not 64-bit?");
+      return "Mach-O 32-bit unknown";
+    }
+  }
+
+  // Make sure the cpu type has the correct mask.
+  assert((MachOObj->getHeader().CPUType & llvm::MachO::CPUArchABI64)
+	 == llvm::MachO::CPUArchABI64 &&
+	 "32-bit object file when we're 64-bit?");
+
+  switch (MachOObj->getHeader().CPUType) {
+  case llvm::MachO::CPUTypeX86_64:
+    return "Mach-O 64-bit x86-64";
+  case llvm::MachO::CPUTypePowerPC64:
+    return "Mach-O 64-bit ppc64";
+  default:
+    return "Mach-O 64-bit unknown";
+  }
+}
+
+unsigned MachOObjectFile::getArch() const {
+  switch (MachOObj->getHeader().CPUType) {
+  case llvm::MachO::CPUTypeI386:
+    return Triple::x86;
+  case llvm::MachO::CPUTypeX86_64:
+    return Triple::x86_64;
+  case llvm::MachO::CPUTypeARM:
+    return Triple::arm;
+  case llvm::MachO::CPUTypePowerPC:
+    return Triple::ppc;
+  case llvm::MachO::CPUTypePowerPC64:
+    return Triple::ppc64;
+  default:
+    return Triple::UnknownArch;
+  }
+}
+
+} // end namespace object
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Object/Object.cpp b/contrib/llvm/lib/Object/Object.cpp
new file mode 100644
index 000000000000..f061ea7cebed
--- /dev/null
+++ b/contrib/llvm/lib/Object/Object.cpp
@@ -0,0 +1,218 @@
+//===- Object.cpp - C bindings to the object file library--------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the C bindings to the file-format-independent object
+// library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ObjectFile.h"
+#include "llvm-c/Object.h"
+
+using namespace llvm;
+using namespace object;
+
+// ObjectFile creation
+LLVMObjectFileRef LLVMCreateObjectFile(LLVMMemoryBufferRef MemBuf) {
+  return wrap(ObjectFile::createObjectFile(unwrap(MemBuf)));
+}
+
+void LLVMDisposeObjectFile(LLVMObjectFileRef ObjectFile) {
+  delete unwrap(ObjectFile);
+}
+
+// ObjectFile Section iterators
+LLVMSectionIteratorRef LLVMGetSections(LLVMObjectFileRef ObjectFile) {
+  section_iterator SI = unwrap(ObjectFile)->begin_sections();
+  return wrap(new section_iterator(SI));
+}
+
+void LLVMDisposeSectionIterator(LLVMSectionIteratorRef SI) {
+  delete unwrap(SI);
+}
+
+LLVMBool LLVMIsSectionIteratorAtEnd(LLVMObjectFileRef ObjectFile,
+                                LLVMSectionIteratorRef SI) {
+  return (*unwrap(SI) == unwrap(ObjectFile)->end_sections()) ? 1 : 0;
+}
+
+void LLVMMoveToNextSection(LLVMSectionIteratorRef SI) {
+  error_code ec;
+  unwrap(SI)->increment(ec);
+  if (ec) report_fatal_error("LLVMMoveToNextSection failed: " + ec.message());
+}
+
+void LLVMMoveToContainingSection(LLVMSectionIteratorRef Sect,
+                                 LLVMSymbolIteratorRef Sym) {
+  if (error_code ec = (*unwrap(Sym))->getSection(*unwrap(Sect)))
+    report_fatal_error(ec.message());
+}
+
+// ObjectFile Symbol iterators
+LLVMSymbolIteratorRef LLVMGetSymbols(LLVMObjectFileRef ObjectFile) {
+  symbol_iterator SI = unwrap(ObjectFile)->begin_symbols();
+  return wrap(new symbol_iterator(SI));
+}
+
+void LLVMDisposeSymbolIterator(LLVMSymbolIteratorRef SI) {
+  delete unwrap(SI);
+}
+
+LLVMBool LLVMIsSymbolIteratorAtEnd(LLVMObjectFileRef ObjectFile,
+                                LLVMSymbolIteratorRef SI) {
+  return (*unwrap(SI) == unwrap(ObjectFile)->end_symbols()) ? 1 : 0;
+}
+
+void LLVMMoveToNextSymbol(LLVMSymbolIteratorRef SI) {
+  error_code ec;
+  unwrap(SI)->increment(ec);
+  if (ec) report_fatal_error("LLVMMoveToNextSymbol failed: " + ec.message());
+}
+
+// SectionRef accessors
+const char *LLVMGetSectionName(LLVMSectionIteratorRef SI) {
+  StringRef ret;
+  if (error_code ec = (*unwrap(SI))->getName(ret))
+   report_fatal_error(ec.message());
+  return ret.data();
+}
+
+uint64_t LLVMGetSectionSize(LLVMSectionIteratorRef SI) {
+  uint64_t ret;
+  if (error_code ec = (*unwrap(SI))->getSize(ret))
+    report_fatal_error(ec.message());
+  return ret;
+}
+
+const char *LLVMGetSectionContents(LLVMSectionIteratorRef SI) {
+  StringRef ret;
+  if (error_code ec = (*unwrap(SI))->getContents(ret))
+    report_fatal_error(ec.message());
+  return ret.data();
+}
+
+uint64_t LLVMGetSectionAddress(LLVMSectionIteratorRef SI) {
+  uint64_t ret;
+  if (error_code ec = (*unwrap(SI))->getAddress(ret))
+    report_fatal_error(ec.message());
+  return ret;
+}
+
+LLVMBool LLVMGetSectionContainsSymbol(LLVMSectionIteratorRef SI,
+                                 LLVMSymbolIteratorRef Sym) {
+  bool ret;
+  if (error_code ec = (*unwrap(SI))->containsSymbol(**unwrap(Sym), ret))
+    report_fatal_error(ec.message());
+  return ret;
+}
+
+// Section Relocation iterators
+LLVMRelocationIteratorRef LLVMGetRelocations(LLVMSectionIteratorRef Section) {
+  relocation_iterator SI = (*unwrap(Section))->begin_relocations();
+  return wrap(new relocation_iterator(SI));
+}
+
+void LLVMDisposeRelocationIterator(LLVMRelocationIteratorRef SI) {
+  delete unwrap(SI);
+}
+
+LLVMBool LLVMIsRelocationIteratorAtEnd(LLVMSectionIteratorRef Section,
+                                       LLVMRelocationIteratorRef SI) {
+  return (*unwrap(SI) == (*unwrap(Section))->end_relocations()) ? 1 : 0;
+}
+
+void LLVMMoveToNextRelocation(LLVMRelocationIteratorRef SI) {
+  error_code ec;
+  unwrap(SI)->increment(ec);
+  if (ec) report_fatal_error("LLVMMoveToNextRelocation failed: " +
+                             ec.message());
+}
+
+
+// SymbolRef accessors
+const char *LLVMGetSymbolName(LLVMSymbolIteratorRef SI) {
+  StringRef ret;
+  if (error_code ec = (*unwrap(SI))->getName(ret))
+    report_fatal_error(ec.message());
+  return ret.data();
+}
+
+uint64_t LLVMGetSymbolAddress(LLVMSymbolIteratorRef SI) {
+  uint64_t ret;
+  if (error_code ec = (*unwrap(SI))->getAddress(ret))
+    report_fatal_error(ec.message());
+  return ret;
+}
+
+uint64_t LLVMGetSymbolFileOffset(LLVMSymbolIteratorRef SI) {
+  uint64_t ret;
+  if (error_code ec = (*unwrap(SI))->getFileOffset(ret))
+    report_fatal_error(ec.message());
+  return ret;
+}
+
+uint64_t LLVMGetSymbolSize(LLVMSymbolIteratorRef SI) {
+  uint64_t ret;
+  if (error_code ec = (*unwrap(SI))->getSize(ret))
+    report_fatal_error(ec.message());
+  return ret;
+}
+
+// RelocationRef accessors
+uint64_t LLVMGetRelocationAddress(LLVMRelocationIteratorRef RI) {
+  uint64_t ret;
+  if (error_code ec = (*unwrap(RI))->getAddress(ret))
+    report_fatal_error(ec.message());
+  return ret;
+}
+
+uint64_t LLVMGetRelocationOffset(LLVMRelocationIteratorRef RI) {
+  uint64_t ret;
+  if (error_code ec = (*unwrap(RI))->getOffset(ret))
+    report_fatal_error(ec.message());
+  return ret;
+}
+
+LLVMSymbolIteratorRef LLVMGetRelocationSymbol(LLVMRelocationIteratorRef RI) {
+  SymbolRef ret;
+  if (error_code ec = (*unwrap(RI))->getSymbol(ret))
+    report_fatal_error(ec.message());
+
+  return wrap(new symbol_iterator(ret));
+}
+
+uint64_t LLVMGetRelocationType(LLVMRelocationIteratorRef RI) {
+  uint64_t ret;
+  if (error_code ec = (*unwrap(RI))->getType(ret))
+    report_fatal_error(ec.message());
+  return ret;
+}
+
+// NOTE: Caller takes ownership of returned string.
+const char *LLVMGetRelocationTypeName(LLVMRelocationIteratorRef RI) {
+  SmallVector<char, 0> ret;
+  if (error_code ec = (*unwrap(RI))->getTypeName(ret))
+    report_fatal_error(ec.message());
+
+  char *str = static_cast<char*>(malloc(ret.size()));
+  std::copy(ret.begin(), ret.end(), str);
+  return str;
+}
+
+// NOTE: Caller takes ownership of returned string.
+const char *LLVMGetRelocationValueString(LLVMRelocationIteratorRef RI) {
+  SmallVector<char, 0> ret;
+  if (error_code ec = (*unwrap(RI))->getValueString(ret))
+    report_fatal_error(ec.message());
+
+  char *str = static_cast<char*>(malloc(ret.size()));
+  std::copy(ret.begin(), ret.end(), str);
+  return str;
+}
+
diff --git a/contrib/llvm/lib/Object/ObjectFile.cpp b/contrib/llvm/lib/Object/ObjectFile.cpp
new file mode 100644
index 000000000000..860c87be9846
--- /dev/null
+++ b/contrib/llvm/lib/Object/ObjectFile.cpp
@@ -0,0 +1,66 @@
+//===- ObjectFile.cpp - File format independent object file -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a file format independent ObjectFile class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/system_error.h"
+
+using namespace llvm;
+using namespace object;
+
+void ObjectFile::anchor() { }
+
+ObjectFile::ObjectFile(unsigned int Type, MemoryBuffer *source, error_code &ec)
+  : Binary(Type, source) {
+}
+
+ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) {
+  if (!Object || Object->getBufferSize() < 64)
+    return 0;
+  sys::LLVMFileType type = sys::IdentifyFileType(Object->getBufferStart(),
+                                static_cast<unsigned>(Object->getBufferSize()));
+  switch (type) {
+    case sys::Unknown_FileType:
+      return 0;
+    case sys::ELF_Relocatable_FileType:
+    case sys::ELF_Executable_FileType:
+    case sys::ELF_SharedObject_FileType:
+    case sys::ELF_Core_FileType:
+      return createELFObjectFile(Object);
+    case sys::Mach_O_Object_FileType:
+    case sys::Mach_O_Executable_FileType:
+    case sys::Mach_O_FixedVirtualMemorySharedLib_FileType:
+    case sys::Mach_O_Core_FileType:
+    case sys::Mach_O_PreloadExecutable_FileType:
+    case sys::Mach_O_DynamicallyLinkedSharedLib_FileType:
+    case sys::Mach_O_DynamicLinker_FileType:
+    case sys::Mach_O_Bundle_FileType:
+    case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType:
+    case sys::Mach_O_DSYMCompanion_FileType:
+      return createMachOObjectFile(Object);
+    case sys::COFF_FileType:
+      return createCOFFObjectFile(Object);
+    default:
+      llvm_unreachable("Unexpected Object File Type");
+  }
+}
+
+ObjectFile *ObjectFile::createObjectFile(StringRef ObjectPath) {
+  OwningPtr<MemoryBuffer> File;
+  if (MemoryBuffer::getFile(ObjectPath, File))
+    return NULL;
+  return createObjectFile(File.take());
+}
diff --git a/contrib/llvm/lib/Option/Arg.cpp b/contrib/llvm/lib/Option/Arg.cpp
new file mode 100644
index 000000000000..4c8da58f5368
--- /dev/null
+++ b/contrib/llvm/lib/Option/Arg.cpp
@@ -0,0 +1,122 @@
+//===--- Arg.cpp - Argument Implementations -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Option/Arg.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::opt;
+
+Arg::Arg(const Option _Opt, StringRef S, unsigned _Index, const Arg *_BaseArg)
+  : Opt(_Opt), BaseArg(_BaseArg), Spelling(S), Index(_Index),
+    Claimed(false), OwnsValues(false) {
+}
+
+Arg::Arg(const Option _Opt, StringRef S, unsigned _Index,
+         const char *Value0, const Arg *_BaseArg)
+  : Opt(_Opt), BaseArg(_BaseArg), Spelling(S), Index(_Index),
+    Claimed(false), OwnsValues(false) {
+  Values.push_back(Value0);
+}
+
+Arg::Arg(const Option _Opt, StringRef S, unsigned _Index,
+         const char *Value0, const char *Value1, const Arg *_BaseArg)
+  : Opt(_Opt), BaseArg(_BaseArg), Spelling(S), Index(_Index),
+    Claimed(false), OwnsValues(false) {
+  Values.push_back(Value0);
+  Values.push_back(Value1);
+}
+
+Arg::~Arg() {
+  if (OwnsValues) {
+    for (unsigned i = 0, e = Values.size(); i != e; ++i)
+      delete[] Values[i];
+  }
+}
+
+void Arg::dump() const {
+  llvm::errs() << "<";
+
+  llvm::errs() << " Opt:";
+  Opt.dump();
+
+  llvm::errs() << " Index:" << Index;
+
+  llvm::errs() << " Values: [";
+  for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+    if (i) llvm::errs() << ", ";
+    llvm::errs() << "'" << Values[i] << "'";
+  }
+
+  llvm::errs() << "]>\n";
+}
+
+std::string Arg::getAsString(const ArgList &Args) const {
+  SmallString<256> Res;
+  llvm::raw_svector_ostream OS(Res);
+
+  ArgStringList ASL;
+  render(Args, ASL);
+  for (ArgStringList::iterator
+         it = ASL.begin(), ie = ASL.end(); it != ie; ++it) {
+    if (it != ASL.begin())
+      OS << ' ';
+    OS << *it;
+  }
+
+  return OS.str();
+}
+
+void Arg::renderAsInput(const ArgList &Args, ArgStringList &Output) const {
+  if (!getOption().hasNoOptAsInput()) {
+    render(Args, Output);
+    return;
+  }
+
+  for (unsigned i = 0, e = getNumValues(); i != e; ++i)
+    Output.push_back(getValue(i));
+}
+
+void Arg::render(const ArgList &Args, ArgStringList &Output) const {
+  switch (getOption().getRenderStyle()) {
+  case Option::RenderValuesStyle:
+    for (unsigned i = 0, e = getNumValues(); i != e; ++i)
+      Output.push_back(getValue(i));
+    break;
+
+  case Option::RenderCommaJoinedStyle: {
+    SmallString<256> Res;
+    llvm::raw_svector_ostream OS(Res);
+    OS << getSpelling();
+    for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
+      if (i) OS << ',';
+      OS << getValue(i);
+    }
+    Output.push_back(Args.MakeArgString(OS.str()));
+    break;
+  }
+
+ case Option::RenderJoinedStyle:
+    Output.push_back(Args.GetOrMakeJoinedArgString(
+                       getIndex(), getSpelling(), getValue(0)));
+    for (unsigned i = 1, e = getNumValues(); i != e; ++i)
+      Output.push_back(getValue(i));
+    break;
+
+  case Option::RenderSeparateStyle:
+    Output.push_back(Args.MakeArgString(getSpelling()));
+    for (unsigned i = 0, e = getNumValues(); i != e; ++i)
+      Output.push_back(getValue(i));
+    break;
+  }
+}
diff --git a/contrib/llvm/lib/Option/ArgList.cpp b/contrib/llvm/lib/Option/ArgList.cpp
new file mode 100644
index 000000000000..39b22d776ed9
--- /dev/null
+++ b/contrib/llvm/lib/Option/ArgList.cpp
@@ -0,0 +1,385 @@
+//===--- ArgList.cpp - Argument List Management ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Option/ArgList.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::opt;
+
+void arg_iterator::SkipToNextArg() {
+  for (; Current != Args.end(); ++Current) {
+    // Done if there are no filters.
+    if (!Id0.isValid())
+      break;
+
+    // Otherwise require a match.
+    const Option &O = (*Current)->getOption();
+    if (O.matches(Id0) ||
+        (Id1.isValid() && O.matches(Id1)) ||
+        (Id2.isValid() && O.matches(Id2)))
+      break;
+  }
+}
+
+//
+
+ArgList::ArgList() {
+}
+
+ArgList::~ArgList() {
+}
+
+void ArgList::append(Arg *A) {
+  Args.push_back(A);
+}
+
+void ArgList::eraseArg(OptSpecifier Id) {
+  for (iterator it = begin(), ie = end(); it != ie; ) {
+    if ((*it)->getOption().matches(Id)) {
+      it = Args.erase(it);
+      ie = end();
+    } else {
+      ++it;
+    }
+  }
+}
+
+Arg *ArgList::getLastArgNoClaim(OptSpecifier Id) const {
+  // FIXME: Make search efficient?
+  for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
+    if ((*it)->getOption().matches(Id))
+      return *it;
+  return 0;
+}
+
+Arg *ArgList::getLastArg(OptSpecifier Id) const {
+  Arg *Res = 0;
+  for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
+    if ((*it)->getOption().matches(Id)) {
+      Res = *it;
+      Res->claim();
+    }
+  }
+
+  return Res;
+}
+
+Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1) const {
+  Arg *Res = 0;
+  for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
+    if ((*it)->getOption().matches(Id0) ||
+        (*it)->getOption().matches(Id1)) {
+      Res = *it;
+      Res->claim();
+
+    }
+  }
+
+  return Res;
+}
+
+Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
+                         OptSpecifier Id2) const {
+  Arg *Res = 0;
+  for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
+    if ((*it)->getOption().matches(Id0) ||
+        (*it)->getOption().matches(Id1) ||
+        (*it)->getOption().matches(Id2)) {
+      Res = *it;
+      Res->claim();
+    }
+  }
+
+  return Res;
+}
+
+Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
+                         OptSpecifier Id2, OptSpecifier Id3) const {
+  Arg *Res = 0;
+  for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
+    if ((*it)->getOption().matches(Id0) ||
+        (*it)->getOption().matches(Id1) ||
+        (*it)->getOption().matches(Id2) ||
+        (*it)->getOption().matches(Id3)) {
+      Res = *it;
+      Res->claim();
+    }
+  }
+
+  return Res;
+}
+
+Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
+                         OptSpecifier Id2, OptSpecifier Id3,
+                         OptSpecifier Id4) const {
+  Arg *Res = 0;
+  for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
+    if ((*it)->getOption().matches(Id0) ||
+        (*it)->getOption().matches(Id1) ||
+        (*it)->getOption().matches(Id2) ||
+        (*it)->getOption().matches(Id3) ||
+        (*it)->getOption().matches(Id4)) {
+      Res = *it;
+      Res->claim();
+    }
+  }
+
+  return Res;
+}
+
+Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
+                         OptSpecifier Id2, OptSpecifier Id3,
+                         OptSpecifier Id4, OptSpecifier Id5) const {
+  Arg *Res = 0;
+  for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
+    if ((*it)->getOption().matches(Id0) ||
+        (*it)->getOption().matches(Id1) ||
+        (*it)->getOption().matches(Id2) ||
+        (*it)->getOption().matches(Id3) ||
+        (*it)->getOption().matches(Id4) ||
+        (*it)->getOption().matches(Id5)) {
+      Res = *it;
+      Res->claim();
+    }
+  }
+
+  return Res;
+}
+
+Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
+                         OptSpecifier Id2, OptSpecifier Id3,
+                         OptSpecifier Id4, OptSpecifier Id5,
+                         OptSpecifier Id6) const {
+  Arg *Res = 0;
+  for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
+    if ((*it)->getOption().matches(Id0) ||
+        (*it)->getOption().matches(Id1) ||
+        (*it)->getOption().matches(Id2) ||
+        (*it)->getOption().matches(Id3) ||
+        (*it)->getOption().matches(Id4) ||
+        (*it)->getOption().matches(Id5) ||
+        (*it)->getOption().matches(Id6)) {
+      Res = *it;
+      Res->claim();
+    }
+  }
+
+  return Res;
+}
+
+Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
+                         OptSpecifier Id2, OptSpecifier Id3,
+                         OptSpecifier Id4, OptSpecifier Id5,
+                         OptSpecifier Id6, OptSpecifier Id7) const {
+  Arg *Res = 0;
+  for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
+    if ((*it)->getOption().matches(Id0) ||
+        (*it)->getOption().matches(Id1) ||
+        (*it)->getOption().matches(Id2) ||
+        (*it)->getOption().matches(Id3) ||
+        (*it)->getOption().matches(Id4) ||
+        (*it)->getOption().matches(Id5) ||
+        (*it)->getOption().matches(Id6) ||
+        (*it)->getOption().matches(Id7)) {
+      Res = *it;
+      Res->claim();
+    }
+  }
+
+  return Res;
+}
+
+bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const {
+  if (Arg *A = getLastArg(Pos, Neg))
+    return A->getOption().matches(Pos);
+  return Default;
+}
+
+StringRef ArgList::getLastArgValue(OptSpecifier Id,
+                                         StringRef Default) const {
+  if (Arg *A = getLastArg(Id))
+    return A->getValue();
+  return Default;
+}
+
+std::vector<std::string> ArgList::getAllArgValues(OptSpecifier Id) const {
+  SmallVector<const char *, 16> Values;
+  AddAllArgValues(Values, Id);
+  return std::vector<std::string>(Values.begin(), Values.end());
+}
+
+void ArgList::AddLastArg(ArgStringList &Output, OptSpecifier Id) const {
+  if (Arg *A = getLastArg(Id)) {
+    A->claim();
+    A->render(*this, Output);
+  }
+}
+
+void ArgList::AddAllArgs(ArgStringList &Output, OptSpecifier Id0,
+                         OptSpecifier Id1, OptSpecifier Id2) const {
+  for (arg_iterator it = filtered_begin(Id0, Id1, Id2),
+         ie = filtered_end(); it != ie; ++it) {
+    (*it)->claim();
+    (*it)->render(*this, Output);
+  }
+}
+
+void ArgList::AddAllArgValues(ArgStringList &Output, OptSpecifier Id0,
+                              OptSpecifier Id1, OptSpecifier Id2) const {
+  for (arg_iterator it = filtered_begin(Id0, Id1, Id2),
+         ie = filtered_end(); it != ie; ++it) {
+    (*it)->claim();
+    for (unsigned i = 0, e = (*it)->getNumValues(); i != e; ++i)
+      Output.push_back((*it)->getValue(i));
+  }
+}
+
+void ArgList::AddAllArgsTranslated(ArgStringList &Output, OptSpecifier Id0,
+                                   const char *Translation,
+                                   bool Joined) const {
+  for (arg_iterator it = filtered_begin(Id0),
+         ie = filtered_end(); it != ie; ++it) {
+    (*it)->claim();
+
+    if (Joined) {
+      Output.push_back(MakeArgString(StringRef(Translation) +
+                                     (*it)->getValue(0)));
+    } else {
+      Output.push_back(Translation);
+      Output.push_back((*it)->getValue(0));
+    }
+  }
+}
+
+void ArgList::ClaimAllArgs(OptSpecifier Id0) const {
+  for (arg_iterator it = filtered_begin(Id0),
+         ie = filtered_end(); it != ie; ++it)
+    (*it)->claim();
+}
+
+void ArgList::ClaimAllArgs() const {
+  for (const_iterator it = begin(), ie = end(); it != ie; ++it)
+    if (!(*it)->isClaimed())
+      (*it)->claim();
+}
+
+const char *ArgList::MakeArgString(const Twine &T) const {
+  SmallString<256> Str;
+  T.toVector(Str);
+  return MakeArgString(Str.str());
+}
+
+const char *ArgList::GetOrMakeJoinedArgString(unsigned Index,
+                                              StringRef LHS,
+                                              StringRef RHS) const {
+  StringRef Cur = getArgString(Index);
+  if (Cur.size() == LHS.size() + RHS.size() &&
+      Cur.startswith(LHS) && Cur.endswith(RHS))
+    return Cur.data();
+
+  return MakeArgString(LHS + RHS);
+}
+
+//
+
+InputArgList::InputArgList(const char* const *ArgBegin,
+                           const char* const *ArgEnd)
+  : NumInputArgStrings(ArgEnd - ArgBegin) {
+  ArgStrings.append(ArgBegin, ArgEnd);
+}
+
+InputArgList::~InputArgList() {
+  // An InputArgList always owns its arguments.
+  for (iterator it = begin(), ie = end(); it != ie; ++it)
+    delete *it;
+}
+
+unsigned InputArgList::MakeIndex(StringRef String0) const {
+  unsigned Index = ArgStrings.size();
+
+  // Tuck away so we have a reliable const char *.
+  SynthesizedStrings.push_back(String0);
+  ArgStrings.push_back(SynthesizedStrings.back().c_str());
+
+  return Index;
+}
+
+unsigned InputArgList::MakeIndex(StringRef String0,
+                                 StringRef String1) const {
+  unsigned Index0 = MakeIndex(String0);
+  unsigned Index1 = MakeIndex(String1);
+  assert(Index0 + 1 == Index1 && "Unexpected non-consecutive indices!");
+  (void) Index1;
+  return Index0;
+}
+
+const char *InputArgList::MakeArgString(StringRef Str) const {
+  return getArgString(MakeIndex(Str));
+}
+
+//
+
+DerivedArgList::DerivedArgList(const InputArgList &_BaseArgs)
+  : BaseArgs(_BaseArgs) {
+}
+
+DerivedArgList::~DerivedArgList() {
+  // We only own the arguments we explicitly synthesized.
+  for (iterator it = SynthesizedArgs.begin(), ie = SynthesizedArgs.end();
+       it != ie; ++it)
+    delete *it;
+}
+
+const char *DerivedArgList::MakeArgString(StringRef Str) const {
+  return BaseArgs.MakeArgString(Str);
+}
+
+Arg *DerivedArgList::MakeFlagArg(const Arg *BaseArg, const Option Opt) const {
+  Arg *A = new Arg(Opt, ArgList::MakeArgString(Twine(Opt.getPrefix()) +
+                                               Twine(Opt.getName())),
+                   BaseArgs.MakeIndex(Opt.getName()), BaseArg);
+  SynthesizedArgs.push_back(A);
+  return A;
+}
+
+Arg *DerivedArgList::MakePositionalArg(const Arg *BaseArg, const Option Opt,
+                                       StringRef Value) const {
+  unsigned Index = BaseArgs.MakeIndex(Value);
+  Arg *A = new Arg(Opt, ArgList::MakeArgString(Twine(Opt.getPrefix()) +
+                                               Twine(Opt.getName())),
+                   Index, BaseArgs.getArgString(Index), BaseArg);
+  SynthesizedArgs.push_back(A);
+  return A;
+}
+
+Arg *DerivedArgList::MakeSeparateArg(const Arg *BaseArg, const Option Opt,
+                                     StringRef Value) const {
+  unsigned Index = BaseArgs.MakeIndex(Opt.getName(), Value);
+  Arg *A = new Arg(Opt, ArgList::MakeArgString(Twine(Opt.getPrefix()) +
+                                               Twine(Opt.getName())),
+                   Index, BaseArgs.getArgString(Index + 1), BaseArg);
+  SynthesizedArgs.push_back(A);
+  return A;
+}
+
+Arg *DerivedArgList::MakeJoinedArg(const Arg *BaseArg, const Option Opt,
+                                   StringRef Value) const {
+  unsigned Index = BaseArgs.MakeIndex(Opt.getName().str() + Value.str());
+  Arg *A = new Arg(Opt, ArgList::MakeArgString(Twine(Opt.getPrefix()) +
+                                               Twine(Opt.getName())), Index,
+                   BaseArgs.getArgString(Index) + Opt.getName().size(),
+                   BaseArg);
+  SynthesizedArgs.push_back(A);
+  return A;
+}
diff --git a/contrib/llvm/lib/Option/OptTable.cpp b/contrib/llvm/lib/Option/OptTable.cpp
new file mode 100644
index 000000000000..5c8a0eacd1f4
--- /dev/null
+++ b/contrib/llvm/lib/Option/OptTable.cpp
@@ -0,0 +1,387 @@
+//===--- OptTable.cpp - Option Table Implementation -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Option/OptTable.h"
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <map>
+
+using namespace llvm;
+using namespace llvm::opt;
+
+// Ordering on Info. The ordering is *almost* lexicographic, with two
+// exceptions. First, '\0' comes at the end of the alphabet instead of
+// the beginning (thus options precede any other options which prefix
+// them). Second, for options with the same name, the less permissive
+// version should come first; a Flag option should precede a Joined
+// option, for example.
+
+static int StrCmpOptionName(const char *A, const char *B) {
+  char a = *A, b = *B;
+  while (a == b) {
+    if (a == '\0')
+      return 0;
+
+    a = *++A;
+    b = *++B;
+  }
+
+  if (a == '\0') // A is a prefix of B.
+    return 1;
+  if (b == '\0') // B is a prefix of A.
+    return -1;
+
+  // Otherwise lexicographic.
+  return (a < b) ? -1 : 1;
+}
+
+namespace llvm {
+namespace opt {
+
+static inline bool operator<(const OptTable::Info &A, const OptTable::Info &B) {
+  if (&A == &B)
+    return false;
+
+  if (int N = StrCmpOptionName(A.Name, B.Name))
+    return N == -1;
+
+  for (const char * const *APre = A.Prefixes,
+                  * const *BPre = B.Prefixes;
+                          *APre != 0 && *BPre != 0; ++APre, ++BPre) {
+    if (int N = StrCmpOptionName(*APre, *BPre))
+      return N == -1;
+  }
+
+  // Names are the same, check that classes are in order; exactly one
+  // should be joined, and it should succeed the other.
+  assert(((A.Kind == Option::JoinedClass) ^ (B.Kind == Option::JoinedClass)) &&
+         "Unexpected classes for options with same name.");
+  return B.Kind == Option::JoinedClass;
+}
+
+// Support lower_bound between info and an option name.
+static inline bool operator<(const OptTable::Info &I, const char *Name) {
+  return StrCmpOptionName(I.Name, Name) == -1;
+}
+static inline bool operator<(const char *Name, const OptTable::Info &I) {
+  return StrCmpOptionName(Name, I.Name) == -1;
+}
+}
+}
+
+OptSpecifier::OptSpecifier(const Option *Opt) : ID(Opt->getID()) {}
+
+OptTable::OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos)
+  : OptionInfos(_OptionInfos),
+    NumOptionInfos(_NumOptionInfos),
+    TheInputOptionID(0),
+    TheUnknownOptionID(0),
+    FirstSearchableIndex(0)
+{
+  // Explicitly zero initialize the error to work around a bug in array
+  // value-initialization on MinGW with gcc 4.3.5.
+
+  // Find start of normal options.
+  for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
+    unsigned Kind = getInfo(i + 1).Kind;
+    if (Kind == Option::InputClass) {
+      assert(!TheInputOptionID && "Cannot have multiple input options!");
+      TheInputOptionID = getInfo(i + 1).ID;
+    } else if (Kind == Option::UnknownClass) {
+      assert(!TheUnknownOptionID && "Cannot have multiple unknown options!");
+      TheUnknownOptionID = getInfo(i + 1).ID;
+    } else if (Kind != Option::GroupClass) {
+      FirstSearchableIndex = i;
+      break;
+    }
+  }
+  assert(FirstSearchableIndex != 0 && "No searchable options?");
+
+#ifndef NDEBUG
+  // Check that everything after the first searchable option is a
+  // regular option class.
+  for (unsigned i = FirstSearchableIndex, e = getNumOptions(); i != e; ++i) {
+    Option::OptionClass Kind = (Option::OptionClass) getInfo(i + 1).Kind;
+    assert((Kind != Option::InputClass && Kind != Option::UnknownClass &&
+            Kind != Option::GroupClass) &&
+           "Special options should be defined first!");
+  }
+
+  // Check that options are in order.
+  for (unsigned i = FirstSearchableIndex + 1, e = getNumOptions(); i != e; ++i){
+    if (!(getInfo(i) < getInfo(i + 1))) {
+      getOption(i).dump();
+      getOption(i + 1).dump();
+      llvm_unreachable("Options are not in order!");
+    }
+  }
+#endif
+
+  // Build prefixes.
+  for (unsigned i = FirstSearchableIndex + 1, e = getNumOptions() + 1;
+                i != e; ++i) {
+    if (const char *const *P = getInfo(i).Prefixes) {
+      for (; *P != 0; ++P) {
+        PrefixesUnion.insert(*P);
+      }
+    }
+  }
+
+  // Build prefix chars.
+  for (llvm::StringSet<>::const_iterator I = PrefixesUnion.begin(),
+                                         E = PrefixesUnion.end(); I != E; ++I) {
+    StringRef Prefix = I->getKey();
+    for (StringRef::const_iterator C = Prefix.begin(), CE = Prefix.end();
+                                   C != CE; ++C)
+      if (std::find(PrefixChars.begin(), PrefixChars.end(), *C)
+            == PrefixChars.end())
+        PrefixChars.push_back(*C);
+  }
+}
+
+OptTable::~OptTable() {
+}
+
+const Option OptTable::getOption(OptSpecifier Opt) const {
+  unsigned id = Opt.getID();
+  if (id == 0)
+    return Option(0, 0);
+  assert((unsigned) (id - 1) < getNumOptions() && "Invalid ID.");
+  return Option(&getInfo(id), this);
+}
+
+bool OptTable::isOptionHelpHidden(OptSpecifier id) const {
+  return getInfo(id).Flags & HelpHidden;
+}
+
+static bool isInput(const llvm::StringSet<> &Prefixes, StringRef Arg) {
+  if (Arg == "-")
+    return true;
+  for (llvm::StringSet<>::const_iterator I = Prefixes.begin(),
+                                         E = Prefixes.end(); I != E; ++I)
+    if (Arg.startswith(I->getKey()))
+      return false;
+  return true;
+}
+
+/// \returns Matched size. 0 means no match.
+static unsigned matchOption(const OptTable::Info *I, StringRef Str) {
+  for (const char * const *Pre = I->Prefixes; *Pre != 0; ++Pre) {
+    StringRef Prefix(*Pre);
+    if (Str.startswith(Prefix) && Str.substr(Prefix.size()).startswith(I->Name))
+      return Prefix.size() + StringRef(I->Name).size();
+  }
+  return 0;
+}
+
+Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index) const {
+  unsigned Prev = Index;
+  const char *Str = Args.getArgString(Index);
+
+  // Anything that doesn't start with PrefixesUnion is an input, as is '-'
+  // itself.
+  if (isInput(PrefixesUnion, Str))
+    return new Arg(getOption(TheInputOptionID), Str, Index++, Str);
+
+  const Info *Start = OptionInfos + FirstSearchableIndex;
+  const Info *End = OptionInfos + getNumOptions();
+  StringRef Name = StringRef(Str).ltrim(PrefixChars);
+
+  // Search for the first next option which could be a prefix.
+  Start = std::lower_bound(Start, End, Name.data());
+
+  // Options are stored in sorted order, with '\0' at the end of the
+  // alphabet. Since the only options which can accept a string must
+  // prefix it, we iteratively search for the next option which could
+  // be a prefix.
+  //
+  // FIXME: This is searching much more than necessary, but I am
+  // blanking on the simplest way to make it fast. We can solve this
+  // problem when we move to TableGen.
+  for (; Start != End; ++Start) {
+    unsigned ArgSize = 0;
+    // Scan for first option which is a proper prefix.
+    for (; Start != End; ++Start)
+      if ((ArgSize = matchOption(Start, Str)))
+        break;
+    if (Start == End)
+      break;
+
+    // See if this option matches.
+    if (Arg *A = Option(Start, this).accept(Args, Index, ArgSize))
+      return A;
+
+    // Otherwise, see if this argument was missing values.
+    if (Prev != Index)
+      return 0;
+  }
+
+  return new Arg(getOption(TheUnknownOptionID), Str, Index++, Str);
+}
+
+InputArgList *OptTable::ParseArgs(const char* const *ArgBegin,
+                                  const char* const *ArgEnd,
+                                  unsigned &MissingArgIndex,
+                                  unsigned &MissingArgCount) const {
+  InputArgList *Args = new InputArgList(ArgBegin, ArgEnd);
+
+  // FIXME: Handle '@' args (or at least error on them).
+
+  MissingArgIndex = MissingArgCount = 0;
+  unsigned Index = 0, End = ArgEnd - ArgBegin;
+  while (Index < End) {
+    // Ignore empty arguments (other things may still take them as arguments).
+    if (Args->getArgString(Index)[0] == '\0') {
+      ++Index;
+      continue;
+    }
+
+    unsigned Prev = Index;
+    Arg *A = ParseOneArg(*Args, Index);
+    assert(Index > Prev && "Parser failed to consume argument.");
+
+    // Check for missing argument error.
+    if (!A) {
+      assert(Index >= End && "Unexpected parser error.");
+      assert(Index - Prev - 1 && "No missing arguments!");
+      MissingArgIndex = Prev;
+      MissingArgCount = Index - Prev - 1;
+      break;
+    }
+
+    Args->append(A);
+  }
+
+  return Args;
+}
+
+static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) {
+  const Option O = Opts.getOption(Id);
+  std::string Name = O.getPrefixedName();
+
+  // Add metavar, if used.
+  switch (O.getKind()) {
+  case Option::GroupClass: case Option::InputClass: case Option::UnknownClass:
+    llvm_unreachable("Invalid option with help text.");
+
+  case Option::MultiArgClass:
+    llvm_unreachable("Cannot print metavar for this kind of option.");
+
+  case Option::FlagClass:
+    break;
+
+  case Option::SeparateClass: case Option::JoinedOrSeparateClass:
+    Name += ' ';
+    // FALLTHROUGH
+  case Option::JoinedClass: case Option::CommaJoinedClass:
+  case Option::JoinedAndSeparateClass:
+    if (const char *MetaVarName = Opts.getOptionMetaVar(Id))
+      Name += MetaVarName;
+    else
+      Name += "<value>";
+    break;
+  }
+
+  return Name;
+}
+
+static void PrintHelpOptionList(raw_ostream &OS, StringRef Title,
+                                std::vector<std::pair<std::string,
+                                const char*> > &OptionHelp) {
+  OS << Title << ":\n";
+
+  // Find the maximum option length.
+  unsigned OptionFieldWidth = 0;
+  for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) {
+    // Skip titles.
+    if (!OptionHelp[i].second)
+      continue;
+
+    // Limit the amount of padding we are willing to give up for alignment.
+    unsigned Length = OptionHelp[i].first.size();
+    if (Length <= 23)
+      OptionFieldWidth = std::max(OptionFieldWidth, Length);
+  }
+
+  const unsigned InitialPad = 2;
+  for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) {
+    const std::string &Option = OptionHelp[i].first;
+    int Pad = OptionFieldWidth - int(Option.size());
+    OS.indent(InitialPad) << Option;
+
+    // Break on long option names.
+    if (Pad < 0) {
+      OS << "\n";
+      Pad = OptionFieldWidth + InitialPad;
+    }
+    OS.indent(Pad + 1) << OptionHelp[i].second << '\n';
+  }
+}
+
+static const char *getOptionHelpGroup(const OptTable &Opts, OptSpecifier Id) {
+  unsigned GroupID = Opts.getOptionGroupID(Id);
+
+  // If not in a group, return the default help group.
+  if (!GroupID)
+    return "OPTIONS";
+
+  // Abuse the help text of the option groups to store the "help group"
+  // name.
+  //
+  // FIXME: Split out option groups.
+  if (const char *GroupHelp = Opts.getOptionHelpText(GroupID))
+    return GroupHelp;
+
+  // Otherwise keep looking.
+  return getOptionHelpGroup(Opts, GroupID);
+}
+
+void OptTable::PrintHelp(raw_ostream &OS, const char *Name,
+                         const char *Title, bool ShowHidden) const {
+  OS << "OVERVIEW: " << Title << "\n";
+  OS << '\n';
+  OS << "USAGE: " << Name << " [options] <inputs>\n";
+  OS << '\n';
+
+  // Render help text into a map of group-name to a list of (option, help)
+  // pairs.
+  typedef std::map<std::string,
+                 std::vector<std::pair<std::string, const char*> > > helpmap_ty;
+  helpmap_ty GroupedOptionHelp;
+
+  for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
+    unsigned Id = i + 1;
+
+    // FIXME: Split out option groups.
+    if (getOptionKind(Id) == Option::GroupClass)
+      continue;
+
+    if (!ShowHidden && isOptionHelpHidden(Id))
+      continue;
+
+    if (const char *Text = getOptionHelpText(Id)) {
+      const char *HelpGroup = getOptionHelpGroup(*this, Id);
+      const std::string &OptName = getOptionHelpName(*this, Id);
+      GroupedOptionHelp[HelpGroup].push_back(std::make_pair(OptName, Text));
+    }
+  }
+
+  for (helpmap_ty::iterator it = GroupedOptionHelp .begin(),
+         ie = GroupedOptionHelp.end(); it != ie; ++it) {
+    if (it != GroupedOptionHelp .begin())
+      OS << "\n";
+    PrintHelpOptionList(OS, it->first, it->second);
+  }
+
+  OS.flush();
+}
diff --git a/contrib/llvm/lib/Option/Option.cpp b/contrib/llvm/lib/Option/Option.cpp
new file mode 100644
index 000000000000..0e2263475e0c
--- /dev/null
+++ b/contrib/llvm/lib/Option/Option.cpp
@@ -0,0 +1,202 @@
+//===--- Option.cpp - Abstract Driver Options -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Option/Option.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+using namespace llvm::opt;
+
+Option::Option(const OptTable::Info *info, const OptTable *owner)
+  : Info(info), Owner(owner) {
+
+  // Multi-level aliases are not supported, and alias options cannot
+  // have groups. This just simplifies option tracking, it is not an
+  // inherent limitation.
+  assert((!Info || !getAlias().isValid() || (!getAlias().getAlias().isValid() &&
+         !getGroup().isValid())) &&
+         "Multi-level aliases and aliases with groups are unsupported.");
+}
+
+Option::~Option() {
+}
+
+void Option::dump() const {
+  llvm::errs() << "<";
+  switch (getKind()) {
+#define P(N) case N: llvm::errs() << #N; break
+    P(GroupClass);
+    P(InputClass);
+    P(UnknownClass);
+    P(FlagClass);
+    P(JoinedClass);
+    P(SeparateClass);
+    P(CommaJoinedClass);
+    P(MultiArgClass);
+    P(JoinedOrSeparateClass);
+    P(JoinedAndSeparateClass);
+#undef P
+  }
+
+  llvm::errs() << " Prefixes:[";
+  for (const char * const *Pre = Info->Prefixes; *Pre != 0; ++Pre) {
+    llvm::errs() << '"' << *Pre << (*(Pre + 1) == 0 ? "\"" : "\", ");
+  }
+  llvm::errs() << ']';
+
+  llvm::errs() << " Name:\"" << getName() << '"';
+
+  const Option Group = getGroup();
+  if (Group.isValid()) {
+    llvm::errs() << " Group:";
+    Group.dump();
+  }
+
+  const Option Alias = getAlias();
+  if (Alias.isValid()) {
+    llvm::errs() << " Alias:";
+    Alias.dump();
+  }
+
+  if (getKind() == MultiArgClass)
+    llvm::errs() << " NumArgs:" << getNumArgs();
+
+  llvm::errs() << ">\n";
+}
+
+bool Option::matches(OptSpecifier Opt) const {
+  // Aliases are never considered in matching, look through them.
+  const Option Alias = getAlias();
+  if (Alias.isValid())
+    return Alias.matches(Opt);
+
+  // Check exact match.
+  if (getID() == Opt.getID())
+    return true;
+
+  const Option Group = getGroup();
+  if (Group.isValid())
+    return Group.matches(Opt);
+  return false;
+}
+
+Arg *Option::accept(const ArgList &Args,
+                    unsigned &Index,
+                    unsigned ArgSize) const {
+  const Option &UnaliasedOption = getUnaliasedOption();
+  StringRef Spelling;
+  // If the option was an alias, get the spelling from the unaliased one.
+  if (getID() == UnaliasedOption.getID()) {
+    Spelling = StringRef(Args.getArgString(Index), ArgSize);
+  } else {
+    Spelling = Args.MakeArgString(Twine(UnaliasedOption.getPrefix()) +
+                                  Twine(UnaliasedOption.getName()));
+  }
+
+  switch (getKind()) {
+  case FlagClass:
+    if (ArgSize != strlen(Args.getArgString(Index)))
+      return 0;
+
+    return new Arg(UnaliasedOption, Spelling, Index++);
+  case JoinedClass: {
+    const char *Value = Args.getArgString(Index) + ArgSize;
+    return new Arg(UnaliasedOption, Spelling, Index++, Value);
+  }
+  case CommaJoinedClass: {
+    // Always matches.
+    const char *Str = Args.getArgString(Index) + ArgSize;
+    Arg *A = new Arg(UnaliasedOption, Spelling, Index++);
+
+    // Parse out the comma separated values.
+    const char *Prev = Str;
+    for (;; ++Str) {
+      char c = *Str;
+
+      if (!c || c == ',') {
+        if (Prev != Str) {
+          char *Value = new char[Str - Prev + 1];
+          memcpy(Value, Prev, Str - Prev);
+          Value[Str - Prev] = '\0';
+          A->getValues().push_back(Value);
+        }
+
+        if (!c)
+          break;
+
+        Prev = Str + 1;
+      }
+    }
+    A->setOwnsValues(true);
+
+    return A;
+  }
+  case SeparateClass:
+    // Matches iff this is an exact match.
+    // FIXME: Avoid strlen.
+    if (ArgSize != strlen(Args.getArgString(Index)))
+      return 0;
+
+    Index += 2;
+    if (Index > Args.getNumInputArgStrings())
+      return 0;
+
+    return new Arg(UnaliasedOption, Spelling,
+                   Index - 2, Args.getArgString(Index - 1));
+  case MultiArgClass: {
+    // Matches iff this is an exact match.
+    // FIXME: Avoid strlen.
+    if (ArgSize != strlen(Args.getArgString(Index)))
+      return 0;
+
+    Index += 1 + getNumArgs();
+    if (Index > Args.getNumInputArgStrings())
+      return 0;
+
+    Arg *A = new Arg(UnaliasedOption, Spelling, Index - 1 - getNumArgs(),
+                      Args.getArgString(Index - getNumArgs()));
+    for (unsigned i = 1; i != getNumArgs(); ++i)
+      A->getValues().push_back(Args.getArgString(Index - getNumArgs() + i));
+    return A;
+  }
+  case JoinedOrSeparateClass: {
+    // If this is not an exact match, it is a joined arg.
+    // FIXME: Avoid strlen.
+    if (ArgSize != strlen(Args.getArgString(Index))) {
+      const char *Value = Args.getArgString(Index) + ArgSize;
+      return new Arg(*this, Spelling, Index++, Value);
+    }
+
+    // Otherwise it must be separate.
+    Index += 2;
+    if (Index > Args.getNumInputArgStrings())
+      return 0;
+
+    return new Arg(UnaliasedOption, Spelling,
+                   Index - 2, Args.getArgString(Index - 1));
+  }
+  case JoinedAndSeparateClass:
+    // Always matches.
+    Index += 2;
+    if (Index > Args.getNumInputArgStrings())
+      return 0;
+
+    return new Arg(UnaliasedOption, Spelling, Index - 2,
+                   Args.getArgString(Index - 2) + ArgSize,
+                   Args.getArgString(Index - 1));
+  default:
+    llvm_unreachable("Invalid option kind!");
+  }
+}
diff --git a/contrib/llvm/lib/Support/APFloat.cpp b/contrib/llvm/lib/Support/APFloat.cpp
new file mode 100644
index 000000000000..6182e3415005
--- /dev/null
+++ b/contrib/llvm/lib/Support/APFloat.cpp
@@ -0,0 +1,3592 @@
+//===-- APFloat.cpp - Implement APFloat class -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a class to represent arbitrary precision floating
+// point values and provide a variety of arithmetic operations on them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cstring>
+#include <limits.h>
+
+using namespace llvm;
+
+#define convolve(lhs, rhs) ((lhs) * 4 + (rhs))
+
+/* Assumed in hexadecimal significand parsing, and conversion to
+   hexadecimal strings.  */
+#define COMPILE_TIME_ASSERT(cond) extern int CTAssert[(cond) ? 1 : -1]
+COMPILE_TIME_ASSERT(integerPartWidth % 4 == 0);
+
+namespace llvm {
+
+  /* Represents floating point arithmetic semantics.  */
+  struct fltSemantics {
+    /* The largest E such that 2^E is representable; this matches the
+       definition of IEEE 754.  */
+    exponent_t maxExponent;
+
+    /* The smallest E such that 2^E is a normalized number; this
+       matches the definition of IEEE 754.  */
+    exponent_t minExponent;
+
+    /* Number of bits in the significand.  This includes the integer
+       bit.  */
+    unsigned int precision;
+  };
+
+  const fltSemantics APFloat::IEEEhalf = { 15, -14, 11 };
+  const fltSemantics APFloat::IEEEsingle = { 127, -126, 24 };
+  const fltSemantics APFloat::IEEEdouble = { 1023, -1022, 53 };
+  const fltSemantics APFloat::IEEEquad = { 16383, -16382, 113 };
+  const fltSemantics APFloat::x87DoubleExtended = { 16383, -16382, 64 };
+  const fltSemantics APFloat::Bogus = { 0, 0, 0 };
+
+  /* The PowerPC format consists of two doubles.  It does not map cleanly
+     onto the usual format above.  It is approximated using twice the
+     mantissa bits.  Note that for exponents near the double minimum,
+     we no longer can represent the full 106 mantissa bits, so those
+     will be treated as denormal numbers.
+
+     FIXME: While this approximation is equivalent to what GCC uses for
+     compile-time arithmetic on PPC double-double numbers, it is not able
+     to represent all possible values held by a PPC double-double number,
+     for example: (long double) 1.0 + (long double) 0x1p-106
+     Should this be replaced by a full emulation of PPC double-double?  */
+  const fltSemantics APFloat::PPCDoubleDouble = { 1023, -1022 + 53, 53 + 53 };
+
+  /* A tight upper bound on number of parts required to hold the value
+     pow(5, power) is
+
+       power * 815 / (351 * integerPartWidth) + 1
+
+     However, whilst the result may require only this many parts,
+     because we are multiplying two values to get it, the
+     multiplication may require an extra part with the excess part
+     being zero (consider the trivial case of 1 * 1, tcFullMultiply
+     requires two parts to hold the single-part result).  So we add an
+     extra one to guarantee enough space whilst multiplying.  */
+  const unsigned int maxExponent = 16383;
+  const unsigned int maxPrecision = 113;
+  const unsigned int maxPowerOfFiveExponent = maxExponent + maxPrecision - 1;
+  const unsigned int maxPowerOfFiveParts = 2 + ((maxPowerOfFiveExponent * 815)
+                                                / (351 * integerPartWidth));
+}
+
+/* A bunch of private, handy routines.  */
+
+static inline unsigned int
+partCountForBits(unsigned int bits)
+{
+  return ((bits) + integerPartWidth - 1) / integerPartWidth;
+}
+
+/* Returns 0U-9U.  Return values >= 10U are not digits.  */
+static inline unsigned int
+decDigitValue(unsigned int c)
+{
+  return c - '0';
+}
+
+/* Return the value of a decimal exponent of the form
+   [+-]ddddddd.
+
+   If the exponent overflows, returns a large exponent with the
+   appropriate sign.  */
+static int
+readExponent(StringRef::iterator begin, StringRef::iterator end)
+{
+  bool isNegative;
+  unsigned int absExponent;
+  const unsigned int overlargeExponent = 24000;  /* FIXME.  */
+  StringRef::iterator p = begin;
+
+  assert(p != end && "Exponent has no digits");
+
+  isNegative = (*p == '-');
+  if (*p == '-' || *p == '+') {
+    p++;
+    assert(p != end && "Exponent has no digits");
+  }
+
+  absExponent = decDigitValue(*p++);
+  assert(absExponent < 10U && "Invalid character in exponent");
+
+  for (; p != end; ++p) {
+    unsigned int value;
+
+    value = decDigitValue(*p);
+    assert(value < 10U && "Invalid character in exponent");
+
+    value += absExponent * 10;
+    if (absExponent >= overlargeExponent) {
+      absExponent = overlargeExponent;
+      p = end;  /* outwit assert below */
+      break;
+    }
+    absExponent = value;
+  }
+
+  assert(p == end && "Invalid exponent in exponent");
+
+  if (isNegative)
+    return -(int) absExponent;
+  else
+    return (int) absExponent;
+}
+
+/* This is ugly and needs cleaning up, but I don't immediately see
+   how whilst remaining safe.  */
+static int
+totalExponent(StringRef::iterator p, StringRef::iterator end,
+              int exponentAdjustment)
+{
+  int unsignedExponent;
+  bool negative, overflow;
+  int exponent = 0;
+
+  assert(p != end && "Exponent has no digits");
+
+  negative = *p == '-';
+  if (*p == '-' || *p == '+') {
+    p++;
+    assert(p != end && "Exponent has no digits");
+  }
+
+  unsignedExponent = 0;
+  overflow = false;
+  for (; p != end; ++p) {
+    unsigned int value;
+
+    value = decDigitValue(*p);
+    assert(value < 10U && "Invalid character in exponent");
+
+    unsignedExponent = unsignedExponent * 10 + value;
+    if (unsignedExponent > 32767) {
+      overflow = true;
+      break;
+    }
+  }
+
+  if (exponentAdjustment > 32767 || exponentAdjustment < -32768)
+    overflow = true;
+
+  if (!overflow) {
+    exponent = unsignedExponent;
+    if (negative)
+      exponent = -exponent;
+    exponent += exponentAdjustment;
+    if (exponent > 32767 || exponent < -32768)
+      overflow = true;
+  }
+
+  if (overflow)
+    exponent = negative ? -32768: 32767;
+
+  return exponent;
+}
+
+static StringRef::iterator
+skipLeadingZeroesAndAnyDot(StringRef::iterator begin, StringRef::iterator end,
+                           StringRef::iterator *dot)
+{
+  StringRef::iterator p = begin;
+  *dot = end;
+  while (*p == '0' && p != end)
+    p++;
+
+  if (*p == '.') {
+    *dot = p++;
+
+    assert(end - begin != 1 && "Significand has no digits");
+
+    while (*p == '0' && p != end)
+      p++;
+  }
+
+  return p;
+}
+
+/* Given a normal decimal floating point number of the form
+
+     dddd.dddd[eE][+-]ddd
+
+   where the decimal point and exponent are optional, fill out the
+   structure D.  Exponent is appropriate if the significand is
+   treated as an integer, and normalizedExponent if the significand
+   is taken to have the decimal point after a single leading
+   non-zero digit.
+
+   If the value is zero, V->firstSigDigit points to a non-digit, and
+   the return exponent is zero.
+*/
+struct decimalInfo {
+  const char *firstSigDigit;
+  const char *lastSigDigit;
+  int exponent;
+  int normalizedExponent;
+};
+
+static void
+interpretDecimal(StringRef::iterator begin, StringRef::iterator end,
+                 decimalInfo *D)
+{
+  StringRef::iterator dot = end;
+  StringRef::iterator p = skipLeadingZeroesAndAnyDot (begin, end, &dot);
+
+  D->firstSigDigit = p;
+  D->exponent = 0;
+  D->normalizedExponent = 0;
+
+  for (; p != end; ++p) {
+    if (*p == '.') {
+      assert(dot == end && "String contains multiple dots");
+      dot = p++;
+      if (p == end)
+        break;
+    }
+    if (decDigitValue(*p) >= 10U)
+      break;
+  }
+
+  if (p != end) {
+    assert((*p == 'e' || *p == 'E') && "Invalid character in significand");
+    assert(p != begin && "Significand has no digits");
+    assert((dot == end || p - begin != 1) && "Significand has no digits");
+
+    /* p points to the first non-digit in the string */
+    D->exponent = readExponent(p + 1, end);
+
+    /* Implied decimal point?  */
+    if (dot == end)
+      dot = p;
+  }
+
+  /* If number is all zeroes accept any exponent.  */
+  if (p != D->firstSigDigit) {
+    /* Drop insignificant trailing zeroes.  */
+    if (p != begin) {
+      do
+        do
+          p--;
+        while (p != begin && *p == '0');
+      while (p != begin && *p == '.');
+    }
+
+    /* Adjust the exponents for any decimal point.  */
+    D->exponent += static_cast<exponent_t>((dot - p) - (dot > p));
+    D->normalizedExponent = (D->exponent +
+              static_cast<exponent_t>((p - D->firstSigDigit)
+                                      - (dot > D->firstSigDigit && dot < p)));
+  }
+
+  D->lastSigDigit = p;
+}
+
+/* Return the trailing fraction of a hexadecimal number.
+   DIGITVALUE is the first hex digit of the fraction, P points to
+   the next digit.  */
+static lostFraction
+trailingHexadecimalFraction(StringRef::iterator p, StringRef::iterator end,
+                            unsigned int digitValue)
+{
+  unsigned int hexDigit;
+
+  /* If the first trailing digit isn't 0 or 8 we can work out the
+     fraction immediately.  */
+  if (digitValue > 8)
+    return lfMoreThanHalf;
+  else if (digitValue < 8 && digitValue > 0)
+    return lfLessThanHalf;
+
+  /* Otherwise we need to find the first non-zero digit.  */
+  while (*p == '0')
+    p++;
+
+  assert(p != end && "Invalid trailing hexadecimal fraction!");
+
+  hexDigit = hexDigitValue(*p);
+
+  /* If we ran off the end it is exactly zero or one-half, otherwise
+     a little more.  */
+  if (hexDigit == -1U)
+    return digitValue == 0 ? lfExactlyZero: lfExactlyHalf;
+  else
+    return digitValue == 0 ? lfLessThanHalf: lfMoreThanHalf;
+}
+
+/* Return the fraction lost were a bignum truncated losing the least
+   significant BITS bits.  */
+static lostFraction
+lostFractionThroughTruncation(const integerPart *parts,
+                              unsigned int partCount,
+                              unsigned int bits)
+{
+  unsigned int lsb;
+
+  lsb = APInt::tcLSB(parts, partCount);
+
+  /* Note this is guaranteed true if bits == 0, or LSB == -1U.  */
+  if (bits <= lsb)
+    return lfExactlyZero;
+  if (bits == lsb + 1)
+    return lfExactlyHalf;
+  if (bits <= partCount * integerPartWidth &&
+      APInt::tcExtractBit(parts, bits - 1))
+    return lfMoreThanHalf;
+
+  return lfLessThanHalf;
+}
+
+/* Shift DST right BITS bits noting lost fraction.  */
+static lostFraction
+shiftRight(integerPart *dst, unsigned int parts, unsigned int bits)
+{
+  lostFraction lost_fraction;
+
+  lost_fraction = lostFractionThroughTruncation(dst, parts, bits);
+
+  APInt::tcShiftRight(dst, parts, bits);
+
+  return lost_fraction;
+}
+
+/* Combine the effect of two lost fractions.  */
+static lostFraction
+combineLostFractions(lostFraction moreSignificant,
+                     lostFraction lessSignificant)
+{
+  if (lessSignificant != lfExactlyZero) {
+    if (moreSignificant == lfExactlyZero)
+      moreSignificant = lfLessThanHalf;
+    else if (moreSignificant == lfExactlyHalf)
+      moreSignificant = lfMoreThanHalf;
+  }
+
+  return moreSignificant;
+}
+
+/* The error from the true value, in half-ulps, on multiplying two
+   floating point numbers, which differ from the value they
+   approximate by at most HUE1 and HUE2 half-ulps, is strictly less
+   than the returned value.
+
+   See "How to Read Floating Point Numbers Accurately" by William D
+   Clinger.  */
+static unsigned int
+HUerrBound(bool inexactMultiply, unsigned int HUerr1, unsigned int HUerr2)
+{
+  assert(HUerr1 < 2 || HUerr2 < 2 || (HUerr1 + HUerr2 < 8));
+
+  if (HUerr1 + HUerr2 == 0)
+    return inexactMultiply * 2;  /* <= inexactMultiply half-ulps.  */
+  else
+    return inexactMultiply + 2 * (HUerr1 + HUerr2);
+}
+
+/* The number of ulps from the boundary (zero, or half if ISNEAREST)
+   when the least significant BITS are truncated.  BITS cannot be
+   zero.  */
+static integerPart
+ulpsFromBoundary(const integerPart *parts, unsigned int bits, bool isNearest)
+{
+  unsigned int count, partBits;
+  integerPart part, boundary;
+
+  assert(bits != 0);
+
+  bits--;
+  count = bits / integerPartWidth;
+  partBits = bits % integerPartWidth + 1;
+
+  part = parts[count] & (~(integerPart) 0 >> (integerPartWidth - partBits));
+
+  if (isNearest)
+    boundary = (integerPart) 1 << (partBits - 1);
+  else
+    boundary = 0;
+
+  if (count == 0) {
+    if (part - boundary <= boundary - part)
+      return part - boundary;
+    else
+      return boundary - part;
+  }
+
+  if (part == boundary) {
+    while (--count)
+      if (parts[count])
+        return ~(integerPart) 0; /* A lot.  */
+
+    return parts[0];
+  } else if (part == boundary - 1) {
+    while (--count)
+      if (~parts[count])
+        return ~(integerPart) 0; /* A lot.  */
+
+    return -parts[0];
+  }
+
+  return ~(integerPart) 0; /* A lot.  */
+}
+
+/* Place pow(5, power) in DST, and return the number of parts used.
+   DST must be at least one part larger than size of the answer.  */
+static unsigned int
+powerOf5(integerPart *dst, unsigned int power)
+{
+  static const integerPart firstEightPowers[] = { 1, 5, 25, 125, 625, 3125,
+                                                  15625, 78125 };
+  integerPart pow5s[maxPowerOfFiveParts * 2 + 5];
+  pow5s[0] = 78125 * 5;
+
+  unsigned int partsCount[16] = { 1 };
+  integerPart scratch[maxPowerOfFiveParts], *p1, *p2, *pow5;
+  unsigned int result;
+  assert(power <= maxExponent);
+
+  p1 = dst;
+  p2 = scratch;
+
+  *p1 = firstEightPowers[power & 7];
+  power >>= 3;
+
+  result = 1;
+  pow5 = pow5s;
+
+  for (unsigned int n = 0; power; power >>= 1, n++) {
+    unsigned int pc;
+
+    pc = partsCount[n];
+
+    /* Calculate pow(5,pow(2,n+3)) if we haven't yet.  */
+    if (pc == 0) {
+      pc = partsCount[n - 1];
+      APInt::tcFullMultiply(pow5, pow5 - pc, pow5 - pc, pc, pc);
+      pc *= 2;
+      if (pow5[pc - 1] == 0)
+        pc--;
+      partsCount[n] = pc;
+    }
+
+    if (power & 1) {
+      integerPart *tmp;
+
+      APInt::tcFullMultiply(p2, p1, pow5, result, pc);
+      result += pc;
+      if (p2[result - 1] == 0)
+        result--;
+
+      /* Now result is in p1 with partsCount parts and p2 is scratch
+         space.  */
+      tmp = p1, p1 = p2, p2 = tmp;
+    }
+
+    pow5 += pc;
+  }
+
+  if (p1 != dst)
+    APInt::tcAssign(dst, p1, result);
+
+  return result;
+}
+
+/* Zero at the end to avoid modular arithmetic when adding one; used
+   when rounding up during hexadecimal output.  */
+static const char hexDigitsLower[] = "0123456789abcdef0";
+static const char hexDigitsUpper[] = "0123456789ABCDEF0";
+static const char infinityL[] = "infinity";
+static const char infinityU[] = "INFINITY";
+static const char NaNL[] = "nan";
+static const char NaNU[] = "NAN";
+
+/* Write out an integerPart in hexadecimal, starting with the most
+   significant nibble.  Write out exactly COUNT hexdigits, return
+   COUNT.  */
+static unsigned int
+partAsHex (char *dst, integerPart part, unsigned int count,
+           const char *hexDigitChars)
+{
+  unsigned int result = count;
+
+  assert(count != 0 && count <= integerPartWidth / 4);
+
+  part >>= (integerPartWidth - 4 * count);
+  while (count--) {
+    dst[count] = hexDigitChars[part & 0xf];
+    part >>= 4;
+  }
+
+  return result;
+}
+
+/* Write out an unsigned decimal integer.  */
+static char *
+writeUnsignedDecimal (char *dst, unsigned int n)
+{
+  char buff[40], *p;
+
+  p = buff;
+  do
+    *p++ = '0' + n % 10;
+  while (n /= 10);
+
+  do
+    *dst++ = *--p;
+  while (p != buff);
+
+  return dst;
+}
+
+/* Write out a signed decimal integer.  */
+static char *
+writeSignedDecimal (char *dst, int value)
+{
+  if (value < 0) {
+    *dst++ = '-';
+    dst = writeUnsignedDecimal(dst, -(unsigned) value);
+  } else
+    dst = writeUnsignedDecimal(dst, value);
+
+  return dst;
+}
+
+/* Constructors.  */
+void
+APFloat::initialize(const fltSemantics *ourSemantics)
+{
+  unsigned int count;
+
+  semantics = ourSemantics;
+  count = partCount();
+  if (count > 1)
+    significand.parts = new integerPart[count];
+}
+
+void
+APFloat::freeSignificand()
+{
+  if (partCount() > 1)
+    delete [] significand.parts;
+}
+
+void
+APFloat::assign(const APFloat &rhs)
+{
+  assert(semantics == rhs.semantics);
+
+  sign = rhs.sign;
+  category = rhs.category;
+  exponent = rhs.exponent;
+  if (category == fcNormal || category == fcNaN)
+    copySignificand(rhs);
+}
+
+void
+APFloat::copySignificand(const APFloat &rhs)
+{
+  assert(category == fcNormal || category == fcNaN);
+  assert(rhs.partCount() >= partCount());
+
+  APInt::tcAssign(significandParts(), rhs.significandParts(),
+                  partCount());
+}
+
+/* Make this number a NaN, with an arbitrary but deterministic value
+   for the significand.  If double or longer, this is a signalling NaN,
+   which may not be ideal.  If float, this is QNaN(0).  */
+void APFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill)
+{
+  category = fcNaN;
+  sign = Negative;
+
+  integerPart *significand = significandParts();
+  unsigned numParts = partCount();
+
+  // Set the significand bits to the fill.
+  if (!fill || fill->getNumWords() < numParts)
+    APInt::tcSet(significand, 0, numParts);
+  if (fill) {
+    APInt::tcAssign(significand, fill->getRawData(),
+                    std::min(fill->getNumWords(), numParts));
+
+    // Zero out the excess bits of the significand.
+    unsigned bitsToPreserve = semantics->precision - 1;
+    unsigned part = bitsToPreserve / 64;
+    bitsToPreserve %= 64;
+    significand[part] &= ((1ULL << bitsToPreserve) - 1);
+    for (part++; part != numParts; ++part)
+      significand[part] = 0;
+  }
+
+  unsigned QNaNBit = semantics->precision - 2;
+
+  if (SNaN) {
+    // We always have to clear the QNaN bit to make it an SNaN.
+    APInt::tcClearBit(significand, QNaNBit);
+
+    // If there are no bits set in the payload, we have to set
+    // *something* to make it a NaN instead of an infinity;
+    // conventionally, this is the next bit down from the QNaN bit.
+    if (APInt::tcIsZero(significand, numParts))
+      APInt::tcSetBit(significand, QNaNBit - 1);
+  } else {
+    // We always have to set the QNaN bit to make it a QNaN.
+    APInt::tcSetBit(significand, QNaNBit);
+  }
+
+  // For x87 extended precision, we want to make a NaN, not a
+  // pseudo-NaN.  Maybe we should expose the ability to make
+  // pseudo-NaNs?
+  if (semantics == &APFloat::x87DoubleExtended)
+    APInt::tcSetBit(significand, QNaNBit + 1);
+}
+
+APFloat APFloat::makeNaN(const fltSemantics &Sem, bool SNaN, bool Negative,
+                         const APInt *fill) {
+  APFloat value(Sem, uninitialized);
+  value.makeNaN(SNaN, Negative, fill);
+  return value;
+}
+
+APFloat &
+APFloat::operator=(const APFloat &rhs)
+{
+  if (this != &rhs) {
+    if (semantics != rhs.semantics) {
+      freeSignificand();
+      initialize(rhs.semantics);
+    }
+    assign(rhs);
+  }
+
+  return *this;
+}
+
+bool
+APFloat::isDenormal() const {
+  return isNormal() && (exponent == semantics->minExponent) &&
+         (APInt::tcExtractBit(significandParts(), 
+                              semantics->precision - 1) == 0);
+}
+
+bool
+APFloat::bitwiseIsEqual(const APFloat &rhs) const {
+  if (this == &rhs)
+    return true;
+  if (semantics != rhs.semantics ||
+      category != rhs.category ||
+      sign != rhs.sign)
+    return false;
+  if (category==fcZero || category==fcInfinity)
+    return true;
+  else if (category==fcNormal && exponent!=rhs.exponent)
+    return false;
+  else {
+    int i= partCount();
+    const integerPart* p=significandParts();
+    const integerPart* q=rhs.significandParts();
+    for (; i>0; i--, p++, q++) {
+      if (*p != *q)
+        return false;
+    }
+    return true;
+  }
+}
+
+APFloat::APFloat(const fltSemantics &ourSemantics, integerPart value) {
+  initialize(&ourSemantics);
+  sign = 0;
+  zeroSignificand();
+  exponent = ourSemantics.precision - 1;
+  significandParts()[0] = value;
+  normalize(rmNearestTiesToEven, lfExactlyZero);
+}
+
+APFloat::APFloat(const fltSemantics &ourSemantics) {
+  initialize(&ourSemantics);
+  category = fcZero;
+  sign = false;
+}
+
+APFloat::APFloat(const fltSemantics &ourSemantics, uninitializedTag tag) {
+  // Allocates storage if necessary but does not initialize it.
+  initialize(&ourSemantics);
+}
+
+APFloat::APFloat(const fltSemantics &ourSemantics,
+                 fltCategory ourCategory, bool negative) {
+  initialize(&ourSemantics);
+  category = ourCategory;
+  sign = negative;
+  if (category == fcNormal)
+    category = fcZero;
+  else if (ourCategory == fcNaN)
+    makeNaN();
+}
+
+APFloat::APFloat(const fltSemantics &ourSemantics, StringRef text) {
+  initialize(&ourSemantics);
+  convertFromString(text, rmNearestTiesToEven);
+}
+
+APFloat::APFloat(const APFloat &rhs) {
+  initialize(rhs.semantics);
+  assign(rhs);
+}
+
+APFloat::~APFloat()
+{
+  freeSignificand();
+}
+
+// Profile - This method 'profiles' an APFloat for use with FoldingSet.
+void APFloat::Profile(FoldingSetNodeID& ID) const {
+  ID.Add(bitcastToAPInt());
+}
+
+unsigned int
+APFloat::partCount() const
+{
+  return partCountForBits(semantics->precision + 1);
+}
+
+unsigned int
+APFloat::semanticsPrecision(const fltSemantics &semantics)
+{
+  return semantics.precision;
+}
+
+const integerPart *
+APFloat::significandParts() const
+{
+  return const_cast<APFloat *>(this)->significandParts();
+}
+
+integerPart *
+APFloat::significandParts()
+{
+  assert(category == fcNormal || category == fcNaN);
+
+  if (partCount() > 1)
+    return significand.parts;
+  else
+    return &significand.part;
+}
+
+void
+APFloat::zeroSignificand()
+{
+  category = fcNormal;
+  APInt::tcSet(significandParts(), 0, partCount());
+}
+
+/* Increment an fcNormal floating point number's significand.  */
+void
+APFloat::incrementSignificand()
+{
+  integerPart carry;
+
+  carry = APInt::tcIncrement(significandParts(), partCount());
+
+  /* Our callers should never cause us to overflow.  */
+  assert(carry == 0);
+  (void)carry;
+}
+
+/* Add the significand of the RHS.  Returns the carry flag.  */
+integerPart
+APFloat::addSignificand(const APFloat &rhs)
+{
+  integerPart *parts;
+
+  parts = significandParts();
+
+  assert(semantics == rhs.semantics);
+  assert(exponent == rhs.exponent);
+
+  return APInt::tcAdd(parts, rhs.significandParts(), 0, partCount());
+}
+
+/* Subtract the significand of the RHS with a borrow flag.  Returns
+   the borrow flag.  */
+integerPart
+APFloat::subtractSignificand(const APFloat &rhs, integerPart borrow)
+{
+  integerPart *parts;
+
+  parts = significandParts();
+
+  assert(semantics == rhs.semantics);
+  assert(exponent == rhs.exponent);
+
+  return APInt::tcSubtract(parts, rhs.significandParts(), borrow,
+                           partCount());
+}
+
+/* Multiply the significand of the RHS.  If ADDEND is non-NULL, add it
+   on to the full-precision result of the multiplication.  Returns the
+   lost fraction.  */
+lostFraction
+APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend)
+{
+  unsigned int omsb;        // One, not zero, based MSB.
+  unsigned int partsCount, newPartsCount, precision;
+  integerPart *lhsSignificand;
+  integerPart scratch[4];
+  integerPart *fullSignificand;
+  lostFraction lost_fraction;
+  bool ignored;
+
+  assert(semantics == rhs.semantics);
+
+  precision = semantics->precision;
+  newPartsCount = partCountForBits(precision * 2);
+
+  if (newPartsCount > 4)
+    fullSignificand = new integerPart[newPartsCount];
+  else
+    fullSignificand = scratch;
+
+  lhsSignificand = significandParts();
+  partsCount = partCount();
+
+  APInt::tcFullMultiply(fullSignificand, lhsSignificand,
+                        rhs.significandParts(), partsCount, partsCount);
+
+  lost_fraction = lfExactlyZero;
+  omsb = APInt::tcMSB(fullSignificand, newPartsCount) + 1;
+  exponent += rhs.exponent;
+
+  if (addend) {
+    Significand savedSignificand = significand;
+    const fltSemantics *savedSemantics = semantics;
+    fltSemantics extendedSemantics;
+    opStatus status;
+    unsigned int extendedPrecision;
+
+    /* Normalize our MSB.  */
+    extendedPrecision = precision + precision - 1;
+    if (omsb != extendedPrecision) {
+      APInt::tcShiftLeft(fullSignificand, newPartsCount,
+                         extendedPrecision - omsb);
+      exponent -= extendedPrecision - omsb;
+    }
+
+    /* Create new semantics.  */
+    extendedSemantics = *semantics;
+    extendedSemantics.precision = extendedPrecision;
+
+    if (newPartsCount == 1)
+      significand.part = fullSignificand[0];
+    else
+      significand.parts = fullSignificand;
+    semantics = &extendedSemantics;
+
+    APFloat extendedAddend(*addend);
+    status = extendedAddend.convert(extendedSemantics, rmTowardZero, &ignored);
+    assert(status == opOK);
+    (void)status;
+    lost_fraction = addOrSubtractSignificand(extendedAddend, false);
+
+    /* Restore our state.  */
+    if (newPartsCount == 1)
+      fullSignificand[0] = significand.part;
+    significand = savedSignificand;
+    semantics = savedSemantics;
+
+    omsb = APInt::tcMSB(fullSignificand, newPartsCount) + 1;
+  }
+
+  exponent -= (precision - 1);
+
+  if (omsb > precision) {
+    unsigned int bits, significantParts;
+    lostFraction lf;
+
+    bits = omsb - precision;
+    significantParts = partCountForBits(omsb);
+    lf = shiftRight(fullSignificand, significantParts, bits);
+    lost_fraction = combineLostFractions(lf, lost_fraction);
+    exponent += bits;
+  }
+
+  APInt::tcAssign(lhsSignificand, fullSignificand, partsCount);
+
+  if (newPartsCount > 4)
+    delete [] fullSignificand;
+
+  return lost_fraction;
+}
+
+/* Multiply the significands of LHS and RHS to DST.  */
+lostFraction
+APFloat::divideSignificand(const APFloat &rhs)
+{
+  unsigned int bit, i, partsCount;
+  const integerPart *rhsSignificand;
+  integerPart *lhsSignificand, *dividend, *divisor;
+  integerPart scratch[4];
+  lostFraction lost_fraction;
+
+  assert(semantics == rhs.semantics);
+
+  lhsSignificand = significandParts();
+  rhsSignificand = rhs.significandParts();
+  partsCount = partCount();
+
+  if (partsCount > 2)
+    dividend = new integerPart[partsCount * 2];
+  else
+    dividend = scratch;
+
+  divisor = dividend + partsCount;
+
+  /* Copy the dividend and divisor as they will be modified in-place.  */
+  for (i = 0; i < partsCount; i++) {
+    dividend[i] = lhsSignificand[i];
+    divisor[i] = rhsSignificand[i];
+    lhsSignificand[i] = 0;
+  }
+
+  exponent -= rhs.exponent;
+
+  unsigned int precision = semantics->precision;
+
+  /* Normalize the divisor.  */
+  bit = precision - APInt::tcMSB(divisor, partsCount) - 1;
+  if (bit) {
+    exponent += bit;
+    APInt::tcShiftLeft(divisor, partsCount, bit);
+  }
+
+  /* Normalize the dividend.  */
+  bit = precision - APInt::tcMSB(dividend, partsCount) - 1;
+  if (bit) {
+    exponent -= bit;
+    APInt::tcShiftLeft(dividend, partsCount, bit);
+  }
+
+  /* Ensure the dividend >= divisor initially for the loop below.
+     Incidentally, this means that the division loop below is
+     guaranteed to set the integer bit to one.  */
+  if (APInt::tcCompare(dividend, divisor, partsCount) < 0) {
+    exponent--;
+    APInt::tcShiftLeft(dividend, partsCount, 1);
+    assert(APInt::tcCompare(dividend, divisor, partsCount) >= 0);
+  }
+
+  /* Long division.  */
+  for (bit = precision; bit; bit -= 1) {
+    if (APInt::tcCompare(dividend, divisor, partsCount) >= 0) {
+      APInt::tcSubtract(dividend, divisor, 0, partsCount);
+      APInt::tcSetBit(lhsSignificand, bit - 1);
+    }
+
+    APInt::tcShiftLeft(dividend, partsCount, 1);
+  }
+
+  /* Figure out the lost fraction.  */
+  int cmp = APInt::tcCompare(dividend, divisor, partsCount);
+
+  if (cmp > 0)
+    lost_fraction = lfMoreThanHalf;
+  else if (cmp == 0)
+    lost_fraction = lfExactlyHalf;
+  else if (APInt::tcIsZero(dividend, partsCount))
+    lost_fraction = lfExactlyZero;
+  else
+    lost_fraction = lfLessThanHalf;
+
+  if (partsCount > 2)
+    delete [] dividend;
+
+  return lost_fraction;
+}
+
+unsigned int
+APFloat::significandMSB() const
+{
+  return APInt::tcMSB(significandParts(), partCount());
+}
+
+unsigned int
+APFloat::significandLSB() const
+{
+  return APInt::tcLSB(significandParts(), partCount());
+}
+
+/* Note that a zero result is NOT normalized to fcZero.  */
+lostFraction
+APFloat::shiftSignificandRight(unsigned int bits)
+{
+  /* Our exponent should not overflow.  */
+  assert((exponent_t) (exponent + bits) >= exponent);
+
+  exponent += bits;
+
+  return shiftRight(significandParts(), partCount(), bits);
+}
+
+/* Shift the significand left BITS bits, subtract BITS from its exponent.  */
+void
+APFloat::shiftSignificandLeft(unsigned int bits)
+{
+  assert(bits < semantics->precision);
+
+  if (bits) {
+    unsigned int partsCount = partCount();
+
+    APInt::tcShiftLeft(significandParts(), partsCount, bits);
+    exponent -= bits;
+
+    assert(!APInt::tcIsZero(significandParts(), partsCount));
+  }
+}
+
+APFloat::cmpResult
+APFloat::compareAbsoluteValue(const APFloat &rhs) const
+{
+  int compare;
+
+  assert(semantics == rhs.semantics);
+  assert(category == fcNormal);
+  assert(rhs.category == fcNormal);
+
+  compare = exponent - rhs.exponent;
+
+  /* If exponents are equal, do an unsigned bignum comparison of the
+     significands.  */
+  if (compare == 0)
+    compare = APInt::tcCompare(significandParts(), rhs.significandParts(),
+                               partCount());
+
+  if (compare > 0)
+    return cmpGreaterThan;
+  else if (compare < 0)
+    return cmpLessThan;
+  else
+    return cmpEqual;
+}
+
+/* Handle overflow.  Sign is preserved.  We either become infinity or
+   the largest finite number.  */
+APFloat::opStatus
+APFloat::handleOverflow(roundingMode rounding_mode)
+{
+  /* Infinity?  */
+  if (rounding_mode == rmNearestTiesToEven ||
+      rounding_mode == rmNearestTiesToAway ||
+      (rounding_mode == rmTowardPositive && !sign) ||
+      (rounding_mode == rmTowardNegative && sign)) {
+    category = fcInfinity;
+    return (opStatus) (opOverflow | opInexact);
+  }
+
+  /* Otherwise we become the largest finite number.  */
+  category = fcNormal;
+  exponent = semantics->maxExponent;
+  APInt::tcSetLeastSignificantBits(significandParts(), partCount(),
+                                   semantics->precision);
+
+  return opInexact;
+}
+
+/* Returns TRUE if, when truncating the current number, with BIT the
+   new LSB, with the given lost fraction and rounding mode, the result
+   would need to be rounded away from zero (i.e., by increasing the
+   signficand).  This routine must work for fcZero of both signs, and
+   fcNormal numbers.  */
+bool
+APFloat::roundAwayFromZero(roundingMode rounding_mode,
+                           lostFraction lost_fraction,
+                           unsigned int bit) const
+{
+  /* NaNs and infinities should not have lost fractions.  */
+  assert(category == fcNormal || category == fcZero);
+
+  /* Current callers never pass this so we don't handle it.  */
+  assert(lost_fraction != lfExactlyZero);
+
+  switch (rounding_mode) {
+  case rmNearestTiesToAway:
+    return lost_fraction == lfExactlyHalf || lost_fraction == lfMoreThanHalf;
+
+  case rmNearestTiesToEven:
+    if (lost_fraction == lfMoreThanHalf)
+      return true;
+
+    /* Our zeroes don't have a significand to test.  */
+    if (lost_fraction == lfExactlyHalf && category != fcZero)
+      return APInt::tcExtractBit(significandParts(), bit);
+
+    return false;
+
+  case rmTowardZero:
+    return false;
+
+  case rmTowardPositive:
+    return sign == false;
+
+  case rmTowardNegative:
+    return sign == true;
+  }
+  llvm_unreachable("Invalid rounding mode found");
+}
+
+APFloat::opStatus
+APFloat::normalize(roundingMode rounding_mode,
+                   lostFraction lost_fraction)
+{
+  unsigned int omsb;                /* One, not zero, based MSB.  */
+  int exponentChange;
+
+  if (category != fcNormal)
+    return opOK;
+
+  /* Before rounding normalize the exponent of fcNormal numbers.  */
+  omsb = significandMSB() + 1;
+
+  if (omsb) {
+    /* OMSB is numbered from 1.  We want to place it in the integer
+       bit numbered PRECISION if possible, with a compensating change in
+       the exponent.  */
+    exponentChange = omsb - semantics->precision;
+
+    /* If the resulting exponent is too high, overflow according to
+       the rounding mode.  */
+    if (exponent + exponentChange > semantics->maxExponent)
+      return handleOverflow(rounding_mode);
+
+    /* Subnormal numbers have exponent minExponent, and their MSB
+       is forced based on that.  */
+    if (exponent + exponentChange < semantics->minExponent)
+      exponentChange = semantics->minExponent - exponent;
+
+    /* Shifting left is easy as we don't lose precision.  */
+    if (exponentChange < 0) {
+      assert(lost_fraction == lfExactlyZero);
+
+      shiftSignificandLeft(-exponentChange);
+
+      return opOK;
+    }
+
+    if (exponentChange > 0) {
+      lostFraction lf;
+
+      /* Shift right and capture any new lost fraction.  */
+      lf = shiftSignificandRight(exponentChange);
+
+      lost_fraction = combineLostFractions(lf, lost_fraction);
+
+      /* Keep OMSB up-to-date.  */
+      if (omsb > (unsigned) exponentChange)
+        omsb -= exponentChange;
+      else
+        omsb = 0;
+    }
+  }
+
+  /* Now round the number according to rounding_mode given the lost
+     fraction.  */
+
+  /* As specified in IEEE 754, since we do not trap we do not report
+     underflow for exact results.  */
+  if (lost_fraction == lfExactlyZero) {
+    /* Canonicalize zeroes.  */
+    if (omsb == 0)
+      category = fcZero;
+
+    return opOK;
+  }
+
+  /* Increment the significand if we're rounding away from zero.  */
+  if (roundAwayFromZero(rounding_mode, lost_fraction, 0)) {
+    if (omsb == 0)
+      exponent = semantics->minExponent;
+
+    incrementSignificand();
+    omsb = significandMSB() + 1;
+
+    /* Did the significand increment overflow?  */
+    if (omsb == (unsigned) semantics->precision + 1) {
+      /* Renormalize by incrementing the exponent and shifting our
+         significand right one.  However if we already have the
+         maximum exponent we overflow to infinity.  */
+      if (exponent == semantics->maxExponent) {
+        category = fcInfinity;
+
+        return (opStatus) (opOverflow | opInexact);
+      }
+
+      shiftSignificandRight(1);
+
+      return opInexact;
+    }
+  }
+
+  /* The normal case - we were and are not denormal, and any
+     significand increment above didn't overflow.  */
+  if (omsb == semantics->precision)
+    return opInexact;
+
+  /* We have a non-zero denormal.  */
+  assert(omsb < semantics->precision);
+
+  /* Canonicalize zeroes.  */
+  if (omsb == 0)
+    category = fcZero;
+
+  /* The fcZero case is a denormal that underflowed to zero.  */
+  return (opStatus) (opUnderflow | opInexact);
+}
+
+APFloat::opStatus
+APFloat::addOrSubtractSpecials(const APFloat &rhs, bool subtract)
+{
+  switch (convolve(category, rhs.category)) {
+  default:
+    llvm_unreachable(0);
+
+  case convolve(fcNaN, fcZero):
+  case convolve(fcNaN, fcNormal):
+  case convolve(fcNaN, fcInfinity):
+  case convolve(fcNaN, fcNaN):
+  case convolve(fcNormal, fcZero):
+  case convolve(fcInfinity, fcNormal):
+  case convolve(fcInfinity, fcZero):
+    return opOK;
+
+  case convolve(fcZero, fcNaN):
+  case convolve(fcNormal, fcNaN):
+  case convolve(fcInfinity, fcNaN):
+    category = fcNaN;
+    copySignificand(rhs);
+    return opOK;
+
+  case convolve(fcNormal, fcInfinity):
+  case convolve(fcZero, fcInfinity):
+    category = fcInfinity;
+    sign = rhs.sign ^ subtract;
+    return opOK;
+
+  case convolve(fcZero, fcNormal):
+    assign(rhs);
+    sign = rhs.sign ^ subtract;
+    return opOK;
+
+  case convolve(fcZero, fcZero):
+    /* Sign depends on rounding mode; handled by caller.  */
+    return opOK;
+
+  case convolve(fcInfinity, fcInfinity):
+    /* Differently signed infinities can only be validly
+       subtracted.  */
+    if (((sign ^ rhs.sign)!=0) != subtract) {
+      makeNaN();
+      return opInvalidOp;
+    }
+
+    return opOK;
+
+  case convolve(fcNormal, fcNormal):
+    return opDivByZero;
+  }
+}
+
+/* Add or subtract two normal numbers.  */
+lostFraction
+APFloat::addOrSubtractSignificand(const APFloat &rhs, bool subtract)
+{
+  integerPart carry;
+  lostFraction lost_fraction;
+  int bits;
+
+  /* Determine if the operation on the absolute values is effectively
+     an addition or subtraction.  */
+  subtract ^= (sign ^ rhs.sign) ? true : false;
+
+  /* Are we bigger exponent-wise than the RHS?  */
+  bits = exponent - rhs.exponent;
+
+  /* Subtraction is more subtle than one might naively expect.  */
+  if (subtract) {
+    APFloat temp_rhs(rhs);
+    bool reverse;
+
+    if (bits == 0) {
+      reverse = compareAbsoluteValue(temp_rhs) == cmpLessThan;
+      lost_fraction = lfExactlyZero;
+    } else if (bits > 0) {
+      lost_fraction = temp_rhs.shiftSignificandRight(bits - 1);
+      shiftSignificandLeft(1);
+      reverse = false;
+    } else {
+      lost_fraction = shiftSignificandRight(-bits - 1);
+      temp_rhs.shiftSignificandLeft(1);
+      reverse = true;
+    }
+
+    if (reverse) {
+      carry = temp_rhs.subtractSignificand
+        (*this, lost_fraction != lfExactlyZero);
+      copySignificand(temp_rhs);
+      sign = !sign;
+    } else {
+      carry = subtractSignificand
+        (temp_rhs, lost_fraction != lfExactlyZero);
+    }
+
+    /* Invert the lost fraction - it was on the RHS and
+       subtracted.  */
+    if (lost_fraction == lfLessThanHalf)
+      lost_fraction = lfMoreThanHalf;
+    else if (lost_fraction == lfMoreThanHalf)
+      lost_fraction = lfLessThanHalf;
+
+    /* The code above is intended to ensure that no borrow is
+       necessary.  */
+    assert(!carry);
+    (void)carry;
+  } else {
+    if (bits > 0) {
+      APFloat temp_rhs(rhs);
+
+      lost_fraction = temp_rhs.shiftSignificandRight(bits);
+      carry = addSignificand(temp_rhs);
+    } else {
+      lost_fraction = shiftSignificandRight(-bits);
+      carry = addSignificand(rhs);
+    }
+
+    /* We have a guard bit; generating a carry cannot happen.  */
+    assert(!carry);
+    (void)carry;
+  }
+
+  return lost_fraction;
+}
+
+APFloat::opStatus
+APFloat::multiplySpecials(const APFloat &rhs)
+{
+  switch (convolve(category, rhs.category)) {
+  default:
+    llvm_unreachable(0);
+
+  case convolve(fcNaN, fcZero):
+  case convolve(fcNaN, fcNormal):
+  case convolve(fcNaN, fcInfinity):
+  case convolve(fcNaN, fcNaN):
+    return opOK;
+
+  case convolve(fcZero, fcNaN):
+  case convolve(fcNormal, fcNaN):
+  case convolve(fcInfinity, fcNaN):
+    category = fcNaN;
+    copySignificand(rhs);
+    return opOK;
+
+  case convolve(fcNormal, fcInfinity):
+  case convolve(fcInfinity, fcNormal):
+  case convolve(fcInfinity, fcInfinity):
+    category = fcInfinity;
+    return opOK;
+
+  case convolve(fcZero, fcNormal):
+  case convolve(fcNormal, fcZero):
+  case convolve(fcZero, fcZero):
+    category = fcZero;
+    return opOK;
+
+  case convolve(fcZero, fcInfinity):
+  case convolve(fcInfinity, fcZero):
+    makeNaN();
+    return opInvalidOp;
+
+  case convolve(fcNormal, fcNormal):
+    return opOK;
+  }
+}
+
+APFloat::opStatus
+APFloat::divideSpecials(const APFloat &rhs)
+{
+  switch (convolve(category, rhs.category)) {
+  default:
+    llvm_unreachable(0);
+
+  case convolve(fcNaN, fcZero):
+  case convolve(fcNaN, fcNormal):
+  case convolve(fcNaN, fcInfinity):
+  case convolve(fcNaN, fcNaN):
+  case convolve(fcInfinity, fcZero):
+  case convolve(fcInfinity, fcNormal):
+  case convolve(fcZero, fcInfinity):
+  case convolve(fcZero, fcNormal):
+    return opOK;
+
+  case convolve(fcZero, fcNaN):
+  case convolve(fcNormal, fcNaN):
+  case convolve(fcInfinity, fcNaN):
+    category = fcNaN;
+    copySignificand(rhs);
+    return opOK;
+
+  case convolve(fcNormal, fcInfinity):
+    category = fcZero;
+    return opOK;
+
+  case convolve(fcNormal, fcZero):
+    category = fcInfinity;
+    return opDivByZero;
+
+  case convolve(fcInfinity, fcInfinity):
+  case convolve(fcZero, fcZero):
+    makeNaN();
+    return opInvalidOp;
+
+  case convolve(fcNormal, fcNormal):
+    return opOK;
+  }
+}
+
+APFloat::opStatus
+APFloat::modSpecials(const APFloat &rhs)
+{
+  switch (convolve(category, rhs.category)) {
+  default:
+    llvm_unreachable(0);
+
+  case convolve(fcNaN, fcZero):
+  case convolve(fcNaN, fcNormal):
+  case convolve(fcNaN, fcInfinity):
+  case convolve(fcNaN, fcNaN):
+  case convolve(fcZero, fcInfinity):
+  case convolve(fcZero, fcNormal):
+  case convolve(fcNormal, fcInfinity):
+    return opOK;
+
+  case convolve(fcZero, fcNaN):
+  case convolve(fcNormal, fcNaN):
+  case convolve(fcInfinity, fcNaN):
+    category = fcNaN;
+    copySignificand(rhs);
+    return opOK;
+
+  case convolve(fcNormal, fcZero):
+  case convolve(fcInfinity, fcZero):
+  case convolve(fcInfinity, fcNormal):
+  case convolve(fcInfinity, fcInfinity):
+  case convolve(fcZero, fcZero):
+    makeNaN();
+    return opInvalidOp;
+
+  case convolve(fcNormal, fcNormal):
+    return opOK;
+  }
+}
+
+/* Change sign.  */
+void
+APFloat::changeSign()
+{
+  /* Look mummy, this one's easy.  */
+  sign = !sign;
+}
+
+void
+APFloat::clearSign()
+{
+  /* So is this one. */
+  sign = 0;
+}
+
+void
+APFloat::copySign(const APFloat &rhs)
+{
+  /* And this one. */
+  sign = rhs.sign;
+}
+
+/* Normalized addition or subtraction.  */
+APFloat::opStatus
+APFloat::addOrSubtract(const APFloat &rhs, roundingMode rounding_mode,
+                       bool subtract)
+{
+  opStatus fs;
+
+  fs = addOrSubtractSpecials(rhs, subtract);
+
+  /* This return code means it was not a simple case.  */
+  if (fs == opDivByZero) {
+    lostFraction lost_fraction;
+
+    lost_fraction = addOrSubtractSignificand(rhs, subtract);
+    fs = normalize(rounding_mode, lost_fraction);
+
+    /* Can only be zero if we lost no fraction.  */
+    assert(category != fcZero || lost_fraction == lfExactlyZero);
+  }
+
+  /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a
+     positive zero unless rounding to minus infinity, except that
+     adding two like-signed zeroes gives that zero.  */
+  if (category == fcZero) {
+    if (rhs.category != fcZero || (sign == rhs.sign) == subtract)
+      sign = (rounding_mode == rmTowardNegative);
+  }
+
+  return fs;
+}
+
+/* Normalized addition.  */
+APFloat::opStatus
+APFloat::add(const APFloat &rhs, roundingMode rounding_mode)
+{
+  return addOrSubtract(rhs, rounding_mode, false);
+}
+
+/* Normalized subtraction.  */
+APFloat::opStatus
+APFloat::subtract(const APFloat &rhs, roundingMode rounding_mode)
+{
+  return addOrSubtract(rhs, rounding_mode, true);
+}
+
+/* Normalized multiply.  */
+APFloat::opStatus
+APFloat::multiply(const APFloat &rhs, roundingMode rounding_mode)
+{
+  opStatus fs;
+
+  sign ^= rhs.sign;
+  fs = multiplySpecials(rhs);
+
+  if (category == fcNormal) {
+    lostFraction lost_fraction = multiplySignificand(rhs, 0);
+    fs = normalize(rounding_mode, lost_fraction);
+    if (lost_fraction != lfExactlyZero)
+      fs = (opStatus) (fs | opInexact);
+  }
+
+  return fs;
+}
+
+/* Normalized divide.  */
+APFloat::opStatus
+APFloat::divide(const APFloat &rhs, roundingMode rounding_mode)
+{
+  opStatus fs;
+
+  sign ^= rhs.sign;
+  fs = divideSpecials(rhs);
+
+  if (category == fcNormal) {
+    lostFraction lost_fraction = divideSignificand(rhs);
+    fs = normalize(rounding_mode, lost_fraction);
+    if (lost_fraction != lfExactlyZero)
+      fs = (opStatus) (fs | opInexact);
+  }
+
+  return fs;
+}
+
+/* Normalized remainder.  This is not currently correct in all cases.  */
+APFloat::opStatus
+APFloat::remainder(const APFloat &rhs)
+{
+  opStatus fs;
+  APFloat V = *this;
+  unsigned int origSign = sign;
+
+  fs = V.divide(rhs, rmNearestTiesToEven);
+  if (fs == opDivByZero)
+    return fs;
+
+  int parts = partCount();
+  integerPart *x = new integerPart[parts];
+  bool ignored;
+  fs = V.convertToInteger(x, parts * integerPartWidth, true,
+                          rmNearestTiesToEven, &ignored);
+  if (fs==opInvalidOp)
+    return fs;
+
+  fs = V.convertFromZeroExtendedInteger(x, parts * integerPartWidth, true,
+                                        rmNearestTiesToEven);
+  assert(fs==opOK);   // should always work
+
+  fs = V.multiply(rhs, rmNearestTiesToEven);
+  assert(fs==opOK || fs==opInexact);   // should not overflow or underflow
+
+  fs = subtract(V, rmNearestTiesToEven);
+  assert(fs==opOK || fs==opInexact);   // likewise
+
+  if (isZero())
+    sign = origSign;    // IEEE754 requires this
+  delete[] x;
+  return fs;
+}
+
+/* Normalized llvm frem (C fmod).
+   This is not currently correct in all cases.  */
+APFloat::opStatus
+APFloat::mod(const APFloat &rhs, roundingMode rounding_mode)
+{
+  opStatus fs;
+  fs = modSpecials(rhs);
+
+  if (category == fcNormal && rhs.category == fcNormal) {
+    APFloat V = *this;
+    unsigned int origSign = sign;
+
+    fs = V.divide(rhs, rmNearestTiesToEven);
+    if (fs == opDivByZero)
+      return fs;
+
+    int parts = partCount();
+    integerPart *x = new integerPart[parts];
+    bool ignored;
+    fs = V.convertToInteger(x, parts * integerPartWidth, true,
+                            rmTowardZero, &ignored);
+    if (fs==opInvalidOp)
+      return fs;
+
+    fs = V.convertFromZeroExtendedInteger(x, parts * integerPartWidth, true,
+                                          rmNearestTiesToEven);
+    assert(fs==opOK);   // should always work
+
+    fs = V.multiply(rhs, rounding_mode);
+    assert(fs==opOK || fs==opInexact);   // should not overflow or underflow
+
+    fs = subtract(V, rounding_mode);
+    assert(fs==opOK || fs==opInexact);   // likewise
+
+    if (isZero())
+      sign = origSign;    // IEEE754 requires this
+    delete[] x;
+  }
+  return fs;
+}
+
+/* Normalized fused-multiply-add.  */
+APFloat::opStatus
+APFloat::fusedMultiplyAdd(const APFloat &multiplicand,
+                          const APFloat &addend,
+                          roundingMode rounding_mode)
+{
+  opStatus fs;
+
+  /* Post-multiplication sign, before addition.  */
+  sign ^= multiplicand.sign;
+
+  /* If and only if all arguments are normal do we need to do an
+     extended-precision calculation.  */
+  if (category == fcNormal &&
+      multiplicand.category == fcNormal &&
+      addend.category == fcNormal) {
+    lostFraction lost_fraction;
+
+    lost_fraction = multiplySignificand(multiplicand, &addend);
+    fs = normalize(rounding_mode, lost_fraction);
+    if (lost_fraction != lfExactlyZero)
+      fs = (opStatus) (fs | opInexact);
+
+    /* If two numbers add (exactly) to zero, IEEE 754 decrees it is a
+       positive zero unless rounding to minus infinity, except that
+       adding two like-signed zeroes gives that zero.  */
+    if (category == fcZero && sign != addend.sign)
+      sign = (rounding_mode == rmTowardNegative);
+  } else {
+    fs = multiplySpecials(multiplicand);
+
+    /* FS can only be opOK or opInvalidOp.  There is no more work
+       to do in the latter case.  The IEEE-754R standard says it is
+       implementation-defined in this case whether, if ADDEND is a
+       quiet NaN, we raise invalid op; this implementation does so.
+
+       If we need to do the addition we can do so with normal
+       precision.  */
+    if (fs == opOK)
+      fs = addOrSubtract(addend, rounding_mode, false);
+  }
+
+  return fs;
+}
+
+/* Rounding-mode corrrect round to integral value.  */
+APFloat::opStatus APFloat::roundToIntegral(roundingMode rounding_mode) {
+  opStatus fs;
+
+  // If the exponent is large enough, we know that this value is already
+  // integral, and the arithmetic below would potentially cause it to saturate
+  // to +/-Inf.  Bail out early instead.
+  if (category == fcNormal && exponent+1 >= (int)semanticsPrecision(*semantics))
+    return opOK;
+
+  // The algorithm here is quite simple: we add 2^(p-1), where p is the
+  // precision of our format, and then subtract it back off again.  The choice
+  // of rounding modes for the addition/subtraction determines the rounding mode
+  // for our integral rounding as well.
+  // NOTE: When the input value is negative, we do subtraction followed by
+  // addition instead.
+  APInt IntegerConstant(NextPowerOf2(semanticsPrecision(*semantics)), 1);
+  IntegerConstant <<= semanticsPrecision(*semantics)-1;
+  APFloat MagicConstant(*semantics);
+  fs = MagicConstant.convertFromAPInt(IntegerConstant, false,
+                                      rmNearestTiesToEven);
+  MagicConstant.copySign(*this);
+
+  if (fs != opOK)
+    return fs;
+
+  // Preserve the input sign so that we can handle 0.0/-0.0 cases correctly.
+  bool inputSign = isNegative();
+
+  fs = add(MagicConstant, rounding_mode);
+  if (fs != opOK && fs != opInexact)
+    return fs;
+
+  fs = subtract(MagicConstant, rounding_mode);
+
+  // Restore the input sign.
+  if (inputSign != isNegative())
+    changeSign();
+
+  return fs;
+}
+
+
+/* Comparison requires normalized numbers.  */
+APFloat::cmpResult
+APFloat::compare(const APFloat &rhs) const
+{
+  cmpResult result;
+
+  assert(semantics == rhs.semantics);
+
+  switch (convolve(category, rhs.category)) {
+  default:
+    llvm_unreachable(0);
+
+  case convolve(fcNaN, fcZero):
+  case convolve(fcNaN, fcNormal):
+  case convolve(fcNaN, fcInfinity):
+  case convolve(fcNaN, fcNaN):
+  case convolve(fcZero, fcNaN):
+  case convolve(fcNormal, fcNaN):
+  case convolve(fcInfinity, fcNaN):
+    return cmpUnordered;
+
+  case convolve(fcInfinity, fcNormal):
+  case convolve(fcInfinity, fcZero):
+  case convolve(fcNormal, fcZero):
+    if (sign)
+      return cmpLessThan;
+    else
+      return cmpGreaterThan;
+
+  case convolve(fcNormal, fcInfinity):
+  case convolve(fcZero, fcInfinity):
+  case convolve(fcZero, fcNormal):
+    if (rhs.sign)
+      return cmpGreaterThan;
+    else
+      return cmpLessThan;
+
+  case convolve(fcInfinity, fcInfinity):
+    if (sign == rhs.sign)
+      return cmpEqual;
+    else if (sign)
+      return cmpLessThan;
+    else
+      return cmpGreaterThan;
+
+  case convolve(fcZero, fcZero):
+    return cmpEqual;
+
+  case convolve(fcNormal, fcNormal):
+    break;
+  }
+
+  /* Two normal numbers.  Do they have the same sign?  */
+  if (sign != rhs.sign) {
+    if (sign)
+      result = cmpLessThan;
+    else
+      result = cmpGreaterThan;
+  } else {
+    /* Compare absolute values; invert result if negative.  */
+    result = compareAbsoluteValue(rhs);
+
+    if (sign) {
+      if (result == cmpLessThan)
+        result = cmpGreaterThan;
+      else if (result == cmpGreaterThan)
+        result = cmpLessThan;
+    }
+  }
+
+  return result;
+}
+
+/// APFloat::convert - convert a value of one floating point type to another.
+/// The return value corresponds to the IEEE754 exceptions.  *losesInfo
+/// records whether the transformation lost information, i.e. whether
+/// converting the result back to the original type will produce the
+/// original value (this is almost the same as return value==fsOK, but there
+/// are edge cases where this is not so).
+
+APFloat::opStatus
+APFloat::convert(const fltSemantics &toSemantics,
+                 roundingMode rounding_mode, bool *losesInfo)
+{
+  lostFraction lostFraction;
+  unsigned int newPartCount, oldPartCount;
+  opStatus fs;
+  int shift;
+  const fltSemantics &fromSemantics = *semantics;
+
+  lostFraction = lfExactlyZero;
+  newPartCount = partCountForBits(toSemantics.precision + 1);
+  oldPartCount = partCount();
+  shift = toSemantics.precision - fromSemantics.precision;
+
+  bool X86SpecialNan = false;
+  if (&fromSemantics == &APFloat::x87DoubleExtended &&
+      &toSemantics != &APFloat::x87DoubleExtended && category == fcNaN &&
+      (!(*significandParts() & 0x8000000000000000ULL) ||
+       !(*significandParts() & 0x4000000000000000ULL))) {
+    // x86 has some unusual NaNs which cannot be represented in any other
+    // format; note them here.
+    X86SpecialNan = true;
+  }
+
+  // If this is a truncation, perform the shift before we narrow the storage.
+  if (shift < 0 && (category==fcNormal || category==fcNaN))
+    lostFraction = shiftRight(significandParts(), oldPartCount, -shift);
+
+  // Fix the storage so it can hold to new value.
+  if (newPartCount > oldPartCount) {
+    // The new type requires more storage; make it available.
+    integerPart *newParts;
+    newParts = new integerPart[newPartCount];
+    APInt::tcSet(newParts, 0, newPartCount);
+    if (category==fcNormal || category==fcNaN)
+      APInt::tcAssign(newParts, significandParts(), oldPartCount);
+    freeSignificand();
+    significand.parts = newParts;
+  } else if (newPartCount == 1 && oldPartCount != 1) {
+    // Switch to built-in storage for a single part.
+    integerPart newPart = 0;
+    if (category==fcNormal || category==fcNaN)
+      newPart = significandParts()[0];
+    freeSignificand();
+    significand.part = newPart;
+  }
+
+  // Now that we have the right storage, switch the semantics.
+  semantics = &toSemantics;
+
+  // If this is an extension, perform the shift now that the storage is
+  // available.
+  if (shift > 0 && (category==fcNormal || category==fcNaN))
+    APInt::tcShiftLeft(significandParts(), newPartCount, shift);
+
+  if (category == fcNormal) {
+    fs = normalize(rounding_mode, lostFraction);
+    *losesInfo = (fs != opOK);
+  } else if (category == fcNaN) {
+    *losesInfo = lostFraction != lfExactlyZero || X86SpecialNan;
+
+    // For x87 extended precision, we want to make a NaN, not a special NaN if
+    // the input wasn't special either.
+    if (!X86SpecialNan && semantics == &APFloat::x87DoubleExtended)
+      APInt::tcSetBit(significandParts(), semantics->precision - 1);
+
+    // gcc forces the Quiet bit on, which means (float)(double)(float_sNan)
+    // does not give you back the same bits.  This is dubious, and we
+    // don't currently do it.  You're really supposed to get
+    // an invalid operation signal at runtime, but nobody does that.
+    fs = opOK;
+  } else {
+    *losesInfo = false;
+    fs = opOK;
+  }
+
+  return fs;
+}
+
+/* Convert a floating point number to an integer according to the
+   rounding mode.  If the rounded integer value is out of range this
+   returns an invalid operation exception and the contents of the
+   destination parts are unspecified.  If the rounded value is in
+   range but the floating point number is not the exact integer, the C
+   standard doesn't require an inexact exception to be raised.  IEEE
+   854 does require it so we do that.
+
+   Note that for conversions to integer type the C standard requires
+   round-to-zero to always be used.  */
+APFloat::opStatus
+APFloat::convertToSignExtendedInteger(integerPart *parts, unsigned int width,
+                                      bool isSigned,
+                                      roundingMode rounding_mode,
+                                      bool *isExact) const
+{
+  lostFraction lost_fraction;
+  const integerPart *src;
+  unsigned int dstPartsCount, truncatedBits;
+
+  *isExact = false;
+
+  /* Handle the three special cases first.  */
+  if (category == fcInfinity || category == fcNaN)
+    return opInvalidOp;
+
+  dstPartsCount = partCountForBits(width);
+
+  if (category == fcZero) {
+    APInt::tcSet(parts, 0, dstPartsCount);
+    // Negative zero can't be represented as an int.
+    *isExact = !sign;
+    return opOK;
+  }
+
+  src = significandParts();
+
+  /* Step 1: place our absolute value, with any fraction truncated, in
+     the destination.  */
+  if (exponent < 0) {
+    /* Our absolute value is less than one; truncate everything.  */
+    APInt::tcSet(parts, 0, dstPartsCount);
+    /* For exponent -1 the integer bit represents .5, look at that.
+       For smaller exponents leftmost truncated bit is 0. */
+    truncatedBits = semantics->precision -1U - exponent;
+  } else {
+    /* We want the most significant (exponent + 1) bits; the rest are
+       truncated.  */
+    unsigned int bits = exponent + 1U;
+
+    /* Hopelessly large in magnitude?  */
+    if (bits > width)
+      return opInvalidOp;
+
+    if (bits < semantics->precision) {
+      /* We truncate (semantics->precision - bits) bits.  */
+      truncatedBits = semantics->precision - bits;
+      APInt::tcExtract(parts, dstPartsCount, src, bits, truncatedBits);
+    } else {
+      /* We want at least as many bits as are available.  */
+      APInt::tcExtract(parts, dstPartsCount, src, semantics->precision, 0);
+      APInt::tcShiftLeft(parts, dstPartsCount, bits - semantics->precision);
+      truncatedBits = 0;
+    }
+  }
+
+  /* Step 2: work out any lost fraction, and increment the absolute
+     value if we would round away from zero.  */
+  if (truncatedBits) {
+    lost_fraction = lostFractionThroughTruncation(src, partCount(),
+                                                  truncatedBits);
+    if (lost_fraction != lfExactlyZero &&
+        roundAwayFromZero(rounding_mode, lost_fraction, truncatedBits)) {
+      if (APInt::tcIncrement(parts, dstPartsCount))
+        return opInvalidOp;     /* Overflow.  */
+    }
+  } else {
+    lost_fraction = lfExactlyZero;
+  }
+
+  /* Step 3: check if we fit in the destination.  */
+  unsigned int omsb = APInt::tcMSB(parts, dstPartsCount) + 1;
+
+  if (sign) {
+    if (!isSigned) {
+      /* Negative numbers cannot be represented as unsigned.  */
+      if (omsb != 0)
+        return opInvalidOp;
+    } else {
+      /* It takes omsb bits to represent the unsigned integer value.
+         We lose a bit for the sign, but care is needed as the
+         maximally negative integer is a special case.  */
+      if (omsb == width && APInt::tcLSB(parts, dstPartsCount) + 1 != omsb)
+        return opInvalidOp;
+
+      /* This case can happen because of rounding.  */
+      if (omsb > width)
+        return opInvalidOp;
+    }
+
+    APInt::tcNegate (parts, dstPartsCount);
+  } else {
+    if (omsb >= width + !isSigned)
+      return opInvalidOp;
+  }
+
+  if (lost_fraction == lfExactlyZero) {
+    *isExact = true;
+    return opOK;
+  } else
+    return opInexact;
+}
+
+/* Same as convertToSignExtendedInteger, except we provide
+   deterministic values in case of an invalid operation exception,
+   namely zero for NaNs and the minimal or maximal value respectively
+   for underflow or overflow.
+   The *isExact output tells whether the result is exact, in the sense
+   that converting it back to the original floating point type produces
+   the original value.  This is almost equivalent to result==opOK,
+   except for negative zeroes.
+*/
+APFloat::opStatus
+APFloat::convertToInteger(integerPart *parts, unsigned int width,
+                          bool isSigned,
+                          roundingMode rounding_mode, bool *isExact) const
+{
+  opStatus fs;
+
+  fs = convertToSignExtendedInteger(parts, width, isSigned, rounding_mode,
+                                    isExact);
+
+  if (fs == opInvalidOp) {
+    unsigned int bits, dstPartsCount;
+
+    dstPartsCount = partCountForBits(width);
+
+    if (category == fcNaN)
+      bits = 0;
+    else if (sign)
+      bits = isSigned;
+    else
+      bits = width - isSigned;
+
+    APInt::tcSetLeastSignificantBits(parts, dstPartsCount, bits);
+    if (sign && isSigned)
+      APInt::tcShiftLeft(parts, dstPartsCount, width - 1);
+  }
+
+  return fs;
+}
+
+/* Same as convertToInteger(integerPart*, ...), except the result is returned in
+   an APSInt, whose initial bit-width and signed-ness are used to determine the
+   precision of the conversion.
+ */
+APFloat::opStatus
+APFloat::convertToInteger(APSInt &result,
+                          roundingMode rounding_mode, bool *isExact) const
+{
+  unsigned bitWidth = result.getBitWidth();
+  SmallVector<uint64_t, 4> parts(result.getNumWords());
+  opStatus status = convertToInteger(
+    parts.data(), bitWidth, result.isSigned(), rounding_mode, isExact);
+  // Keeps the original signed-ness.
+  result = APInt(bitWidth, parts);
+  return status;
+}
+
+/* Convert an unsigned integer SRC to a floating point number,
+   rounding according to ROUNDING_MODE.  The sign of the floating
+   point number is not modified.  */
+APFloat::opStatus
+APFloat::convertFromUnsignedParts(const integerPart *src,
+                                  unsigned int srcCount,
+                                  roundingMode rounding_mode)
+{
+  unsigned int omsb, precision, dstCount;
+  integerPart *dst;
+  lostFraction lost_fraction;
+
+  category = fcNormal;
+  omsb = APInt::tcMSB(src, srcCount) + 1;
+  dst = significandParts();
+  dstCount = partCount();
+  precision = semantics->precision;
+
+  /* We want the most significant PRECISION bits of SRC.  There may not
+     be that many; extract what we can.  */
+  if (precision <= omsb) {
+    exponent = omsb - 1;
+    lost_fraction = lostFractionThroughTruncation(src, srcCount,
+                                                  omsb - precision);
+    APInt::tcExtract(dst, dstCount, src, precision, omsb - precision);
+  } else {
+    exponent = precision - 1;
+    lost_fraction = lfExactlyZero;
+    APInt::tcExtract(dst, dstCount, src, omsb, 0);
+  }
+
+  return normalize(rounding_mode, lost_fraction);
+}
+
+APFloat::opStatus
+APFloat::convertFromAPInt(const APInt &Val,
+                          bool isSigned,
+                          roundingMode rounding_mode)
+{
+  unsigned int partCount = Val.getNumWords();
+  APInt api = Val;
+
+  sign = false;
+  if (isSigned && api.isNegative()) {
+    sign = true;
+    api = -api;
+  }
+
+  return convertFromUnsignedParts(api.getRawData(), partCount, rounding_mode);
+}
+
+/* Convert a two's complement integer SRC to a floating point number,
+   rounding according to ROUNDING_MODE.  ISSIGNED is true if the
+   integer is signed, in which case it must be sign-extended.  */
+APFloat::opStatus
+APFloat::convertFromSignExtendedInteger(const integerPart *src,
+                                        unsigned int srcCount,
+                                        bool isSigned,
+                                        roundingMode rounding_mode)
+{
+  opStatus status;
+
+  if (isSigned &&
+      APInt::tcExtractBit(src, srcCount * integerPartWidth - 1)) {
+    integerPart *copy;
+
+    /* If we're signed and negative negate a copy.  */
+    sign = true;
+    copy = new integerPart[srcCount];
+    APInt::tcAssign(copy, src, srcCount);
+    APInt::tcNegate(copy, srcCount);
+    status = convertFromUnsignedParts(copy, srcCount, rounding_mode);
+    delete [] copy;
+  } else {
+    sign = false;
+    status = convertFromUnsignedParts(src, srcCount, rounding_mode);
+  }
+
+  return status;
+}
+
+/* FIXME: should this just take a const APInt reference?  */
+APFloat::opStatus
+APFloat::convertFromZeroExtendedInteger(const integerPart *parts,
+                                        unsigned int width, bool isSigned,
+                                        roundingMode rounding_mode)
+{
+  unsigned int partCount = partCountForBits(width);
+  APInt api = APInt(width, makeArrayRef(parts, partCount));
+
+  sign = false;
+  if (isSigned && APInt::tcExtractBit(parts, width - 1)) {
+    sign = true;
+    api = -api;
+  }
+
+  return convertFromUnsignedParts(api.getRawData(), partCount, rounding_mode);
+}
+
+APFloat::opStatus
+APFloat::convertFromHexadecimalString(StringRef s, roundingMode rounding_mode)
+{
+  lostFraction lost_fraction = lfExactlyZero;
+  integerPart *significand;
+  unsigned int bitPos, partsCount;
+  StringRef::iterator dot, firstSignificantDigit;
+
+  zeroSignificand();
+  exponent = 0;
+  category = fcNormal;
+
+  significand = significandParts();
+  partsCount = partCount();
+  bitPos = partsCount * integerPartWidth;
+
+  /* Skip leading zeroes and any (hexa)decimal point.  */
+  StringRef::iterator begin = s.begin();
+  StringRef::iterator end = s.end();
+  StringRef::iterator p = skipLeadingZeroesAndAnyDot(begin, end, &dot);
+  firstSignificantDigit = p;
+
+  for (; p != end;) {
+    integerPart hex_value;
+
+    if (*p == '.') {
+      assert(dot == end && "String contains multiple dots");
+      dot = p++;
+      if (p == end) {
+        break;
+      }
+    }
+
+    hex_value = hexDigitValue(*p);
+    if (hex_value == -1U) {
+      break;
+    }
+
+    p++;
+
+    if (p == end) {
+      break;
+    } else {
+      /* Store the number whilst 4-bit nibbles remain.  */
+      if (bitPos) {
+        bitPos -= 4;
+        hex_value <<= bitPos % integerPartWidth;
+        significand[bitPos / integerPartWidth] |= hex_value;
+      } else {
+        lost_fraction = trailingHexadecimalFraction(p, end, hex_value);
+        while (p != end && hexDigitValue(*p) != -1U)
+          p++;
+        break;
+      }
+    }
+  }
+
+  /* Hex floats require an exponent but not a hexadecimal point.  */
+  assert(p != end && "Hex strings require an exponent");
+  assert((*p == 'p' || *p == 'P') && "Invalid character in significand");
+  assert(p != begin && "Significand has no digits");
+  assert((dot == end || p - begin != 1) && "Significand has no digits");
+
+  /* Ignore the exponent if we are zero.  */
+  if (p != firstSignificantDigit) {
+    int expAdjustment;
+
+    /* Implicit hexadecimal point?  */
+    if (dot == end)
+      dot = p;
+
+    /* Calculate the exponent adjustment implicit in the number of
+       significant digits.  */
+    expAdjustment = static_cast<int>(dot - firstSignificantDigit);
+    if (expAdjustment < 0)
+      expAdjustment++;
+    expAdjustment = expAdjustment * 4 - 1;
+
+    /* Adjust for writing the significand starting at the most
+       significant nibble.  */
+    expAdjustment += semantics->precision;
+    expAdjustment -= partsCount * integerPartWidth;
+
+    /* Adjust for the given exponent.  */
+    exponent = totalExponent(p + 1, end, expAdjustment);
+  }
+
+  return normalize(rounding_mode, lost_fraction);
+}
+
+APFloat::opStatus
+APFloat::roundSignificandWithExponent(const integerPart *decSigParts,
+                                      unsigned sigPartCount, int exp,
+                                      roundingMode rounding_mode)
+{
+  unsigned int parts, pow5PartCount;
+  fltSemantics calcSemantics = { 32767, -32767, 0 };
+  integerPart pow5Parts[maxPowerOfFiveParts];
+  bool isNearest;
+
+  isNearest = (rounding_mode == rmNearestTiesToEven ||
+               rounding_mode == rmNearestTiesToAway);
+
+  parts = partCountForBits(semantics->precision + 11);
+
+  /* Calculate pow(5, abs(exp)).  */
+  pow5PartCount = powerOf5(pow5Parts, exp >= 0 ? exp: -exp);
+
+  for (;; parts *= 2) {
+    opStatus sigStatus, powStatus;
+    unsigned int excessPrecision, truncatedBits;
+
+    calcSemantics.precision = parts * integerPartWidth - 1;
+    excessPrecision = calcSemantics.precision - semantics->precision;
+    truncatedBits = excessPrecision;
+
+    APFloat decSig(calcSemantics, fcZero, sign);
+    APFloat pow5(calcSemantics, fcZero, false);
+
+    sigStatus = decSig.convertFromUnsignedParts(decSigParts, sigPartCount,
+                                                rmNearestTiesToEven);
+    powStatus = pow5.convertFromUnsignedParts(pow5Parts, pow5PartCount,
+                                              rmNearestTiesToEven);
+    /* Add exp, as 10^n = 5^n * 2^n.  */
+    decSig.exponent += exp;
+
+    lostFraction calcLostFraction;
+    integerPart HUerr, HUdistance;
+    unsigned int powHUerr;
+
+    if (exp >= 0) {
+      /* multiplySignificand leaves the precision-th bit set to 1.  */
+      calcLostFraction = decSig.multiplySignificand(pow5, NULL);
+      powHUerr = powStatus != opOK;
+    } else {
+      calcLostFraction = decSig.divideSignificand(pow5);
+      /* Denormal numbers have less precision.  */
+      if (decSig.exponent < semantics->minExponent) {
+        excessPrecision += (semantics->minExponent - decSig.exponent);
+        truncatedBits = excessPrecision;
+        if (excessPrecision > calcSemantics.precision)
+          excessPrecision = calcSemantics.precision;
+      }
+      /* Extra half-ulp lost in reciprocal of exponent.  */
+      powHUerr = (powStatus == opOK && calcLostFraction == lfExactlyZero) ? 0:2;
+    }
+
+    /* Both multiplySignificand and divideSignificand return the
+       result with the integer bit set.  */
+    assert(APInt::tcExtractBit
+           (decSig.significandParts(), calcSemantics.precision - 1) == 1);
+
+    HUerr = HUerrBound(calcLostFraction != lfExactlyZero, sigStatus != opOK,
+                       powHUerr);
+    HUdistance = 2 * ulpsFromBoundary(decSig.significandParts(),
+                                      excessPrecision, isNearest);
+
+    /* Are we guaranteed to round correctly if we truncate?  */
+    if (HUdistance >= HUerr) {
+      APInt::tcExtract(significandParts(), partCount(), decSig.significandParts(),
+                       calcSemantics.precision - excessPrecision,
+                       excessPrecision);
+      /* Take the exponent of decSig.  If we tcExtract-ed less bits
+         above we must adjust our exponent to compensate for the
+         implicit right shift.  */
+      exponent = (decSig.exponent + semantics->precision
+                  - (calcSemantics.precision - excessPrecision));
+      calcLostFraction = lostFractionThroughTruncation(decSig.significandParts(),
+                                                       decSig.partCount(),
+                                                       truncatedBits);
+      return normalize(rounding_mode, calcLostFraction);
+    }
+  }
+}
+
+APFloat::opStatus
+APFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode)
+{
+  decimalInfo D;
+  opStatus fs;
+
+  /* Scan the text.  */
+  StringRef::iterator p = str.begin();
+  interpretDecimal(p, str.end(), &D);
+
+  /* Handle the quick cases.  First the case of no significant digits,
+     i.e. zero, and then exponents that are obviously too large or too
+     small.  Writing L for log 10 / log 2, a number d.ddddd*10^exp
+     definitely overflows if
+
+           (exp - 1) * L >= maxExponent
+
+     and definitely underflows to zero where
+
+           (exp + 1) * L <= minExponent - precision
+
+     With integer arithmetic the tightest bounds for L are
+
+           93/28 < L < 196/59            [ numerator <= 256 ]
+           42039/12655 < L < 28738/8651  [ numerator <= 65536 ]
+  */
+
+  if (decDigitValue(*D.firstSigDigit) >= 10U) {
+    category = fcZero;
+    fs = opOK;
+
+  /* Check whether the normalized exponent is high enough to overflow
+     max during the log-rebasing in the max-exponent check below. */
+  } else if (D.normalizedExponent - 1 > INT_MAX / 42039) {
+    fs = handleOverflow(rounding_mode);
+
+  /* If it wasn't, then it also wasn't high enough to overflow max
+     during the log-rebasing in the min-exponent check.  Check that it
+     won't overflow min in either check, then perform the min-exponent
+     check. */
+  } else if (D.normalizedExponent - 1 < INT_MIN / 42039 ||
+             (D.normalizedExponent + 1) * 28738 <=
+               8651 * (semantics->minExponent - (int) semantics->precision)) {
+    /* Underflow to zero and round.  */
+    zeroSignificand();
+    fs = normalize(rounding_mode, lfLessThanHalf);
+
+  /* We can finally safely perform the max-exponent check. */
+  } else if ((D.normalizedExponent - 1) * 42039
+             >= 12655 * semantics->maxExponent) {
+    /* Overflow and round.  */
+    fs = handleOverflow(rounding_mode);
+  } else {
+    integerPart *decSignificand;
+    unsigned int partCount;
+
+    /* A tight upper bound on number of bits required to hold an
+       N-digit decimal integer is N * 196 / 59.  Allocate enough space
+       to hold the full significand, and an extra part required by
+       tcMultiplyPart.  */
+    partCount = static_cast<unsigned int>(D.lastSigDigit - D.firstSigDigit) + 1;
+    partCount = partCountForBits(1 + 196 * partCount / 59);
+    decSignificand = new integerPart[partCount + 1];
+    partCount = 0;
+
+    /* Convert to binary efficiently - we do almost all multiplication
+       in an integerPart.  When this would overflow do we do a single
+       bignum multiplication, and then revert again to multiplication
+       in an integerPart.  */
+    do {
+      integerPart decValue, val, multiplier;
+
+      val = 0;
+      multiplier = 1;
+
+      do {
+        if (*p == '.') {
+          p++;
+          if (p == str.end()) {
+            break;
+          }
+        }
+        decValue = decDigitValue(*p++);
+        assert(decValue < 10U && "Invalid character in significand");
+        multiplier *= 10;
+        val = val * 10 + decValue;
+        /* The maximum number that can be multiplied by ten with any
+           digit added without overflowing an integerPart.  */
+      } while (p <= D.lastSigDigit && multiplier <= (~ (integerPart) 0 - 9) / 10);
+
+      /* Multiply out the current part.  */
+      APInt::tcMultiplyPart(decSignificand, decSignificand, multiplier, val,
+                            partCount, partCount + 1, false);
+
+      /* If we used another part (likely but not guaranteed), increase
+         the count.  */
+      if (decSignificand[partCount])
+        partCount++;
+    } while (p <= D.lastSigDigit);
+
+    category = fcNormal;
+    fs = roundSignificandWithExponent(decSignificand, partCount,
+                                      D.exponent, rounding_mode);
+
+    delete [] decSignificand;
+  }
+
+  return fs;
+}
+
+APFloat::opStatus
+APFloat::convertFromString(StringRef str, roundingMode rounding_mode)
+{
+  assert(!str.empty() && "Invalid string length");
+
+  /* Handle a leading minus sign.  */
+  StringRef::iterator p = str.begin();
+  size_t slen = str.size();
+  sign = *p == '-' ? 1 : 0;
+  if (*p == '-' || *p == '+') {
+    p++;
+    slen--;
+    assert(slen && "String has no digits");
+  }
+
+  if (slen >= 2 && p[0] == '0' && (p[1] == 'x' || p[1] == 'X')) {
+    assert(slen - 2 && "Invalid string");
+    return convertFromHexadecimalString(StringRef(p + 2, slen - 2),
+                                        rounding_mode);
+  }
+
+  return convertFromDecimalString(StringRef(p, slen), rounding_mode);
+}
+
+/* Write out a hexadecimal representation of the floating point value
+   to DST, which must be of sufficient size, in the C99 form
+   [-]0xh.hhhhp[+-]d.  Return the number of characters written,
+   excluding the terminating NUL.
+
+   If UPPERCASE, the output is in upper case, otherwise in lower case.
+
+   HEXDIGITS digits appear altogether, rounding the value if
+   necessary.  If HEXDIGITS is 0, the minimal precision to display the
+   number precisely is used instead.  If nothing would appear after
+   the decimal point it is suppressed.
+
+   The decimal exponent is always printed and has at least one digit.
+   Zero values display an exponent of zero.  Infinities and NaNs
+   appear as "infinity" or "nan" respectively.
+
+   The above rules are as specified by C99.  There is ambiguity about
+   what the leading hexadecimal digit should be.  This implementation
+   uses whatever is necessary so that the exponent is displayed as
+   stored.  This implies the exponent will fall within the IEEE format
+   range, and the leading hexadecimal digit will be 0 (for denormals),
+   1 (normal numbers) or 2 (normal numbers rounded-away-from-zero with
+   any other digits zero).
+*/
+unsigned int
+APFloat::convertToHexString(char *dst, unsigned int hexDigits,
+                            bool upperCase, roundingMode rounding_mode) const
+{
+  char *p;
+
+  p = dst;
+  if (sign)
+    *dst++ = '-';
+
+  switch (category) {
+  case fcInfinity:
+    memcpy (dst, upperCase ? infinityU: infinityL, sizeof infinityU - 1);
+    dst += sizeof infinityL - 1;
+    break;
+
+  case fcNaN:
+    memcpy (dst, upperCase ? NaNU: NaNL, sizeof NaNU - 1);
+    dst += sizeof NaNU - 1;
+    break;
+
+  case fcZero:
+    *dst++ = '0';
+    *dst++ = upperCase ? 'X': 'x';
+    *dst++ = '0';
+    if (hexDigits > 1) {
+      *dst++ = '.';
+      memset (dst, '0', hexDigits - 1);
+      dst += hexDigits - 1;
+    }
+    *dst++ = upperCase ? 'P': 'p';
+    *dst++ = '0';
+    break;
+
+  case fcNormal:
+    dst = convertNormalToHexString (dst, hexDigits, upperCase, rounding_mode);
+    break;
+  }
+
+  *dst = 0;
+
+  return static_cast<unsigned int>(dst - p);
+}
+
+/* Does the hard work of outputting the correctly rounded hexadecimal
+   form of a normal floating point number with the specified number of
+   hexadecimal digits.  If HEXDIGITS is zero the minimum number of
+   digits necessary to print the value precisely is output.  */
+char *
+APFloat::convertNormalToHexString(char *dst, unsigned int hexDigits,
+                                  bool upperCase,
+                                  roundingMode rounding_mode) const
+{
+  unsigned int count, valueBits, shift, partsCount, outputDigits;
+  const char *hexDigitChars;
+  const integerPart *significand;
+  char *p;
+  bool roundUp;
+
+  *dst++ = '0';
+  *dst++ = upperCase ? 'X': 'x';
+
+  roundUp = false;
+  hexDigitChars = upperCase ? hexDigitsUpper: hexDigitsLower;
+
+  significand = significandParts();
+  partsCount = partCount();
+
+  /* +3 because the first digit only uses the single integer bit, so
+     we have 3 virtual zero most-significant-bits.  */
+  valueBits = semantics->precision + 3;
+  shift = integerPartWidth - valueBits % integerPartWidth;
+
+  /* The natural number of digits required ignoring trailing
+     insignificant zeroes.  */
+  outputDigits = (valueBits - significandLSB () + 3) / 4;
+
+  /* hexDigits of zero means use the required number for the
+     precision.  Otherwise, see if we are truncating.  If we are,
+     find out if we need to round away from zero.  */
+  if (hexDigits) {
+    if (hexDigits < outputDigits) {
+      /* We are dropping non-zero bits, so need to check how to round.
+         "bits" is the number of dropped bits.  */
+      unsigned int bits;
+      lostFraction fraction;
+
+      bits = valueBits - hexDigits * 4;
+      fraction = lostFractionThroughTruncation (significand, partsCount, bits);
+      roundUp = roundAwayFromZero(rounding_mode, fraction, bits);
+    }
+    outputDigits = hexDigits;
+  }
+
+  /* Write the digits consecutively, and start writing in the location
+     of the hexadecimal point.  We move the most significant digit
+     left and add the hexadecimal point later.  */
+  p = ++dst;
+
+  count = (valueBits + integerPartWidth - 1) / integerPartWidth;
+
+  while (outputDigits && count) {
+    integerPart part;
+
+    /* Put the most significant integerPartWidth bits in "part".  */
+    if (--count == partsCount)
+      part = 0;  /* An imaginary higher zero part.  */
+    else
+      part = significand[count] << shift;
+
+    if (count && shift)
+      part |= significand[count - 1] >> (integerPartWidth - shift);
+
+    /* Convert as much of "part" to hexdigits as we can.  */
+    unsigned int curDigits = integerPartWidth / 4;
+
+    if (curDigits > outputDigits)
+      curDigits = outputDigits;
+    dst += partAsHex (dst, part, curDigits, hexDigitChars);
+    outputDigits -= curDigits;
+  }
+
+  if (roundUp) {
+    char *q = dst;
+
+    /* Note that hexDigitChars has a trailing '0'.  */
+    do {
+      q--;
+      *q = hexDigitChars[hexDigitValue (*q) + 1];
+    } while (*q == '0');
+    assert(q >= p);
+  } else {
+    /* Add trailing zeroes.  */
+    memset (dst, '0', outputDigits);
+    dst += outputDigits;
+  }
+
+  /* Move the most significant digit to before the point, and if there
+     is something after the decimal point add it.  This must come
+     after rounding above.  */
+  p[-1] = p[0];
+  if (dst -1 == p)
+    dst--;
+  else
+    p[0] = '.';
+
+  /* Finally output the exponent.  */
+  *dst++ = upperCase ? 'P': 'p';
+
+  return writeSignedDecimal (dst, exponent);
+}
+
+hash_code llvm::hash_value(const APFloat &Arg) {
+  if (Arg.category != APFloat::fcNormal)
+    return hash_combine((uint8_t)Arg.category,
+                        // NaN has no sign, fix it at zero.
+                        Arg.isNaN() ? (uint8_t)0 : (uint8_t)Arg.sign,
+                        Arg.semantics->precision);
+
+  // Normal floats need their exponent and significand hashed.
+  return hash_combine((uint8_t)Arg.category, (uint8_t)Arg.sign,
+                      Arg.semantics->precision, Arg.exponent,
+                      hash_combine_range(
+                        Arg.significandParts(),
+                        Arg.significandParts() + Arg.partCount()));
+}
+
+// Conversion from APFloat to/from host float/double.  It may eventually be
+// possible to eliminate these and have everybody deal with APFloats, but that
+// will take a while.  This approach will not easily extend to long double.
+// Current implementation requires integerPartWidth==64, which is correct at
+// the moment but could be made more general.
+
+// Denormals have exponent minExponent in APFloat, but minExponent-1 in
+// the actual IEEE respresentations.  We compensate for that here.
+
+APInt
+APFloat::convertF80LongDoubleAPFloatToAPInt() const
+{
+  assert(semantics == (const llvm::fltSemantics*)&x87DoubleExtended);
+  assert(partCount()==2);
+
+  uint64_t myexponent, mysignificand;
+
+  if (category==fcNormal) {
+    myexponent = exponent+16383; //bias
+    mysignificand = significandParts()[0];
+    if (myexponent==1 && !(mysignificand & 0x8000000000000000ULL))
+      myexponent = 0;   // denormal
+  } else if (category==fcZero) {
+    myexponent = 0;
+    mysignificand = 0;
+  } else if (category==fcInfinity) {
+    myexponent = 0x7fff;
+    mysignificand = 0x8000000000000000ULL;
+  } else {
+    assert(category == fcNaN && "Unknown category");
+    myexponent = 0x7fff;
+    mysignificand = significandParts()[0];
+  }
+
+  uint64_t words[2];
+  words[0] = mysignificand;
+  words[1] =  ((uint64_t)(sign & 1) << 15) |
+              (myexponent & 0x7fffLL);
+  return APInt(80, words);
+}
+
+APInt
+APFloat::convertPPCDoubleDoubleAPFloatToAPInt() const
+{
+  assert(semantics == (const llvm::fltSemantics*)&PPCDoubleDouble);
+  assert(partCount()==2);
+
+  uint64_t words[2];
+  opStatus fs;
+  bool losesInfo;
+
+  // Convert number to double.  To avoid spurious underflows, we re-
+  // normalize against the "double" minExponent first, and only *then*
+  // truncate the mantissa.  The result of that second conversion
+  // may be inexact, but should never underflow.
+  // Declare fltSemantics before APFloat that uses it (and
+  // saves pointer to it) to ensure correct destruction order.
+  fltSemantics extendedSemantics = *semantics;
+  extendedSemantics.minExponent = IEEEdouble.minExponent;
+  APFloat extended(*this);
+  fs = extended.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo);
+  assert(fs == opOK && !losesInfo);
+  (void)fs;
+
+  APFloat u(extended);
+  fs = u.convert(IEEEdouble, rmNearestTiesToEven, &losesInfo);
+  assert(fs == opOK || fs == opInexact);
+  (void)fs;
+  words[0] = *u.convertDoubleAPFloatToAPInt().getRawData();
+
+  // If conversion was exact or resulted in a special case, we're done;
+  // just set the second double to zero.  Otherwise, re-convert back to
+  // the extended format and compute the difference.  This now should
+  // convert exactly to double.
+  if (u.category == fcNormal && losesInfo) {
+    fs = u.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo);
+    assert(fs == opOK && !losesInfo);
+    (void)fs;
+
+    APFloat v(extended);
+    v.subtract(u, rmNearestTiesToEven);
+    fs = v.convert(IEEEdouble, rmNearestTiesToEven, &losesInfo);
+    assert(fs == opOK && !losesInfo);
+    (void)fs;
+    words[1] = *v.convertDoubleAPFloatToAPInt().getRawData();
+  } else {
+    words[1] = 0;
+  }
+
+  return APInt(128, words);
+}
+
+APInt
+APFloat::convertQuadrupleAPFloatToAPInt() const
+{
+  assert(semantics == (const llvm::fltSemantics*)&IEEEquad);
+  assert(partCount()==2);
+
+  uint64_t myexponent, mysignificand, mysignificand2;
+
+  if (category==fcNormal) {
+    myexponent = exponent+16383; //bias
+    mysignificand = significandParts()[0];
+    mysignificand2 = significandParts()[1];
+    if (myexponent==1 && !(mysignificand2 & 0x1000000000000LL))
+      myexponent = 0;   // denormal
+  } else if (category==fcZero) {
+    myexponent = 0;
+    mysignificand = mysignificand2 = 0;
+  } else if (category==fcInfinity) {
+    myexponent = 0x7fff;
+    mysignificand = mysignificand2 = 0;
+  } else {
+    assert(category == fcNaN && "Unknown category!");
+    myexponent = 0x7fff;
+    mysignificand = significandParts()[0];
+    mysignificand2 = significandParts()[1];
+  }
+
+  uint64_t words[2];
+  words[0] = mysignificand;
+  words[1] = ((uint64_t)(sign & 1) << 63) |
+             ((myexponent & 0x7fff) << 48) |
+             (mysignificand2 & 0xffffffffffffLL);
+
+  return APInt(128, words);
+}
+
+APInt
+APFloat::convertDoubleAPFloatToAPInt() const
+{
+  assert(semantics == (const llvm::fltSemantics*)&IEEEdouble);
+  assert(partCount()==1);
+
+  uint64_t myexponent, mysignificand;
+
+  if (category==fcNormal) {
+    myexponent = exponent+1023; //bias
+    mysignificand = *significandParts();
+    if (myexponent==1 && !(mysignificand & 0x10000000000000LL))
+      myexponent = 0;   // denormal
+  } else if (category==fcZero) {
+    myexponent = 0;
+    mysignificand = 0;
+  } else if (category==fcInfinity) {
+    myexponent = 0x7ff;
+    mysignificand = 0;
+  } else {
+    assert(category == fcNaN && "Unknown category!");
+    myexponent = 0x7ff;
+    mysignificand = *significandParts();
+  }
+
+  return APInt(64, ((((uint64_t)(sign & 1) << 63) |
+                     ((myexponent & 0x7ff) <<  52) |
+                     (mysignificand & 0xfffffffffffffLL))));
+}
+
+APInt
+APFloat::convertFloatAPFloatToAPInt() const
+{
+  assert(semantics == (const llvm::fltSemantics*)&IEEEsingle);
+  assert(partCount()==1);
+
+  uint32_t myexponent, mysignificand;
+
+  if (category==fcNormal) {
+    myexponent = exponent+127; //bias
+    mysignificand = (uint32_t)*significandParts();
+    if (myexponent == 1 && !(mysignificand & 0x800000))
+      myexponent = 0;   // denormal
+  } else if (category==fcZero) {
+    myexponent = 0;
+    mysignificand = 0;
+  } else if (category==fcInfinity) {
+    myexponent = 0xff;
+    mysignificand = 0;
+  } else {
+    assert(category == fcNaN && "Unknown category!");
+    myexponent = 0xff;
+    mysignificand = (uint32_t)*significandParts();
+  }
+
+  return APInt(32, (((sign&1) << 31) | ((myexponent&0xff) << 23) |
+                    (mysignificand & 0x7fffff)));
+}
+
+APInt
+APFloat::convertHalfAPFloatToAPInt() const
+{
+  assert(semantics == (const llvm::fltSemantics*)&IEEEhalf);
+  assert(partCount()==1);
+
+  uint32_t myexponent, mysignificand;
+
+  if (category==fcNormal) {
+    myexponent = exponent+15; //bias
+    mysignificand = (uint32_t)*significandParts();
+    if (myexponent == 1 && !(mysignificand & 0x400))
+      myexponent = 0;   // denormal
+  } else if (category==fcZero) {
+    myexponent = 0;
+    mysignificand = 0;
+  } else if (category==fcInfinity) {
+    myexponent = 0x1f;
+    mysignificand = 0;
+  } else {
+    assert(category == fcNaN && "Unknown category!");
+    myexponent = 0x1f;
+    mysignificand = (uint32_t)*significandParts();
+  }
+
+  return APInt(16, (((sign&1) << 15) | ((myexponent&0x1f) << 10) |
+                    (mysignificand & 0x3ff)));
+}
+
+// This function creates an APInt that is just a bit map of the floating
+// point constant as it would appear in memory.  It is not a conversion,
+// and treating the result as a normal integer is unlikely to be useful.
+
+APInt
+APFloat::bitcastToAPInt() const
+{
+  if (semantics == (const llvm::fltSemantics*)&IEEEhalf)
+    return convertHalfAPFloatToAPInt();
+
+  if (semantics == (const llvm::fltSemantics*)&IEEEsingle)
+    return convertFloatAPFloatToAPInt();
+
+  if (semantics == (const llvm::fltSemantics*)&IEEEdouble)
+    return convertDoubleAPFloatToAPInt();
+
+  if (semantics == (const llvm::fltSemantics*)&IEEEquad)
+    return convertQuadrupleAPFloatToAPInt();
+
+  if (semantics == (const llvm::fltSemantics*)&PPCDoubleDouble)
+    return convertPPCDoubleDoubleAPFloatToAPInt();
+
+  assert(semantics == (const llvm::fltSemantics*)&x87DoubleExtended &&
+         "unknown format!");
+  return convertF80LongDoubleAPFloatToAPInt();
+}
+
+float
+APFloat::convertToFloat() const
+{
+  assert(semantics == (const llvm::fltSemantics*)&IEEEsingle &&
+         "Float semantics are not IEEEsingle");
+  APInt api = bitcastToAPInt();
+  return api.bitsToFloat();
+}
+
+double
+APFloat::convertToDouble() const
+{
+  assert(semantics == (const llvm::fltSemantics*)&IEEEdouble &&
+         "Float semantics are not IEEEdouble");
+  APInt api = bitcastToAPInt();
+  return api.bitsToDouble();
+}
+
+/// Integer bit is explicit in this format.  Intel hardware (387 and later)
+/// does not support these bit patterns:
+///  exponent = all 1's, integer bit 0, significand 0 ("pseudoinfinity")
+///  exponent = all 1's, integer bit 0, significand nonzero ("pseudoNaN")
+///  exponent = 0, integer bit 1 ("pseudodenormal")
+///  exponent!=0 nor all 1's, integer bit 0 ("unnormal")
+/// At the moment, the first two are treated as NaNs, the second two as Normal.
+void
+APFloat::initFromF80LongDoubleAPInt(const APInt &api)
+{
+  assert(api.getBitWidth()==80);
+  uint64_t i1 = api.getRawData()[0];
+  uint64_t i2 = api.getRawData()[1];
+  uint64_t myexponent = (i2 & 0x7fff);
+  uint64_t mysignificand = i1;
+
+  initialize(&APFloat::x87DoubleExtended);
+  assert(partCount()==2);
+
+  sign = static_cast<unsigned int>(i2>>15);
+  if (myexponent==0 && mysignificand==0) {
+    // exponent, significand meaningless
+    category = fcZero;
+  } else if (myexponent==0x7fff && mysignificand==0x8000000000000000ULL) {
+    // exponent, significand meaningless
+    category = fcInfinity;
+  } else if (myexponent==0x7fff && mysignificand!=0x8000000000000000ULL) {
+    // exponent meaningless
+    category = fcNaN;
+    significandParts()[0] = mysignificand;
+    significandParts()[1] = 0;
+  } else {
+    category = fcNormal;
+    exponent = myexponent - 16383;
+    significandParts()[0] = mysignificand;
+    significandParts()[1] = 0;
+    if (myexponent==0)          // denormal
+      exponent = -16382;
+  }
+}
+
+void
+APFloat::initFromPPCDoubleDoubleAPInt(const APInt &api)
+{
+  assert(api.getBitWidth()==128);
+  uint64_t i1 = api.getRawData()[0];
+  uint64_t i2 = api.getRawData()[1];
+  opStatus fs;
+  bool losesInfo;
+
+  // Get the first double and convert to our format.
+  initFromDoubleAPInt(APInt(64, i1));
+  fs = convert(PPCDoubleDouble, rmNearestTiesToEven, &losesInfo);
+  assert(fs == opOK && !losesInfo);
+  (void)fs;
+
+  // Unless we have a special case, add in second double.
+  if (category == fcNormal) {
+    APFloat v(IEEEdouble, APInt(64, i2));
+    fs = v.convert(PPCDoubleDouble, rmNearestTiesToEven, &losesInfo);
+    assert(fs == opOK && !losesInfo);
+    (void)fs;
+
+    add(v, rmNearestTiesToEven);
+  }
+}
+
+void
+APFloat::initFromQuadrupleAPInt(const APInt &api)
+{
+  assert(api.getBitWidth()==128);
+  uint64_t i1 = api.getRawData()[0];
+  uint64_t i2 = api.getRawData()[1];
+  uint64_t myexponent = (i2 >> 48) & 0x7fff;
+  uint64_t mysignificand  = i1;
+  uint64_t mysignificand2 = i2 & 0xffffffffffffLL;
+
+  initialize(&APFloat::IEEEquad);
+  assert(partCount()==2);
+
+  sign = static_cast<unsigned int>(i2>>63);
+  if (myexponent==0 &&
+      (mysignificand==0 && mysignificand2==0)) {
+    // exponent, significand meaningless
+    category = fcZero;
+  } else if (myexponent==0x7fff &&
+             (mysignificand==0 && mysignificand2==0)) {
+    // exponent, significand meaningless
+    category = fcInfinity;
+  } else if (myexponent==0x7fff &&
+             (mysignificand!=0 || mysignificand2 !=0)) {
+    // exponent meaningless
+    category = fcNaN;
+    significandParts()[0] = mysignificand;
+    significandParts()[1] = mysignificand2;
+  } else {
+    category = fcNormal;
+    exponent = myexponent - 16383;
+    significandParts()[0] = mysignificand;
+    significandParts()[1] = mysignificand2;
+    if (myexponent==0)          // denormal
+      exponent = -16382;
+    else
+      significandParts()[1] |= 0x1000000000000LL;  // integer bit
+  }
+}
+
+void
+APFloat::initFromDoubleAPInt(const APInt &api)
+{
+  assert(api.getBitWidth()==64);
+  uint64_t i = *api.getRawData();
+  uint64_t myexponent = (i >> 52) & 0x7ff;
+  uint64_t mysignificand = i & 0xfffffffffffffLL;
+
+  initialize(&APFloat::IEEEdouble);
+  assert(partCount()==1);
+
+  sign = static_cast<unsigned int>(i>>63);
+  if (myexponent==0 && mysignificand==0) {
+    // exponent, significand meaningless
+    category = fcZero;
+  } else if (myexponent==0x7ff && mysignificand==0) {
+    // exponent, significand meaningless
+    category = fcInfinity;
+  } else if (myexponent==0x7ff && mysignificand!=0) {
+    // exponent meaningless
+    category = fcNaN;
+    *significandParts() = mysignificand;
+  } else {
+    category = fcNormal;
+    exponent = myexponent - 1023;
+    *significandParts() = mysignificand;
+    if (myexponent==0)          // denormal
+      exponent = -1022;
+    else
+      *significandParts() |= 0x10000000000000LL;  // integer bit
+  }
+}
+
+void
+APFloat::initFromFloatAPInt(const APInt & api)
+{
+  assert(api.getBitWidth()==32);
+  uint32_t i = (uint32_t)*api.getRawData();
+  uint32_t myexponent = (i >> 23) & 0xff;
+  uint32_t mysignificand = i & 0x7fffff;
+
+  initialize(&APFloat::IEEEsingle);
+  assert(partCount()==1);
+
+  sign = i >> 31;
+  if (myexponent==0 && mysignificand==0) {
+    // exponent, significand meaningless
+    category = fcZero;
+  } else if (myexponent==0xff && mysignificand==0) {
+    // exponent, significand meaningless
+    category = fcInfinity;
+  } else if (myexponent==0xff && mysignificand!=0) {
+    // sign, exponent, significand meaningless
+    category = fcNaN;
+    *significandParts() = mysignificand;
+  } else {
+    category = fcNormal;
+    exponent = myexponent - 127;  //bias
+    *significandParts() = mysignificand;
+    if (myexponent==0)    // denormal
+      exponent = -126;
+    else
+      *significandParts() |= 0x800000; // integer bit
+  }
+}
+
+void
+APFloat::initFromHalfAPInt(const APInt & api)
+{
+  assert(api.getBitWidth()==16);
+  uint32_t i = (uint32_t)*api.getRawData();
+  uint32_t myexponent = (i >> 10) & 0x1f;
+  uint32_t mysignificand = i & 0x3ff;
+
+  initialize(&APFloat::IEEEhalf);
+  assert(partCount()==1);
+
+  sign = i >> 15;
+  if (myexponent==0 && mysignificand==0) {
+    // exponent, significand meaningless
+    category = fcZero;
+  } else if (myexponent==0x1f && mysignificand==0) {
+    // exponent, significand meaningless
+    category = fcInfinity;
+  } else if (myexponent==0x1f && mysignificand!=0) {
+    // sign, exponent, significand meaningless
+    category = fcNaN;
+    *significandParts() = mysignificand;
+  } else {
+    category = fcNormal;
+    exponent = myexponent - 15;  //bias
+    *significandParts() = mysignificand;
+    if (myexponent==0)    // denormal
+      exponent = -14;
+    else
+      *significandParts() |= 0x400; // integer bit
+  }
+}
+
+/// Treat api as containing the bits of a floating point number.  Currently
+/// we infer the floating point type from the size of the APInt.  The
+/// isIEEE argument distinguishes between PPC128 and IEEE128 (not meaningful
+/// when the size is anything else).
+void
+APFloat::initFromAPInt(const fltSemantics* Sem, const APInt& api)
+{
+  if (Sem == &IEEEhalf)
+    return initFromHalfAPInt(api);
+  if (Sem == &IEEEsingle)
+    return initFromFloatAPInt(api);
+  if (Sem == &IEEEdouble)
+    return initFromDoubleAPInt(api);
+  if (Sem == &x87DoubleExtended)
+    return initFromF80LongDoubleAPInt(api);
+  if (Sem == &IEEEquad)
+    return initFromQuadrupleAPInt(api);
+  if (Sem == &PPCDoubleDouble)
+    return initFromPPCDoubleDoubleAPInt(api);
+
+  llvm_unreachable(0);
+}
+
+APFloat
+APFloat::getAllOnesValue(unsigned BitWidth, bool isIEEE)
+{
+  switch (BitWidth) {
+  case 16:
+    return APFloat(IEEEhalf, APInt::getAllOnesValue(BitWidth));
+  case 32:
+    return APFloat(IEEEsingle, APInt::getAllOnesValue(BitWidth));
+  case 64:
+    return APFloat(IEEEdouble, APInt::getAllOnesValue(BitWidth));
+  case 80:
+    return APFloat(x87DoubleExtended, APInt::getAllOnesValue(BitWidth));
+  case 128:
+    if (isIEEE)
+      return APFloat(IEEEquad, APInt::getAllOnesValue(BitWidth));
+    return APFloat(PPCDoubleDouble, APInt::getAllOnesValue(BitWidth));
+  default:
+    llvm_unreachable("Unknown floating bit width");
+  }
+}
+
+APFloat APFloat::getLargest(const fltSemantics &Sem, bool Negative) {
+  APFloat Val(Sem, fcNormal, Negative);
+
+  // We want (in interchange format):
+  //   sign = {Negative}
+  //   exponent = 1..10
+  //   significand = 1..1
+
+  Val.exponent = Sem.maxExponent; // unbiased
+
+  // 1-initialize all bits....
+  Val.zeroSignificand();
+  integerPart *significand = Val.significandParts();
+  unsigned N = partCountForBits(Sem.precision);
+  for (unsigned i = 0; i != N; ++i)
+    significand[i] = ~((integerPart) 0);
+
+  // ...and then clear the top bits for internal consistency.
+  if (Sem.precision % integerPartWidth != 0)
+    significand[N-1] &=
+      (((integerPart) 1) << (Sem.precision % integerPartWidth)) - 1;
+
+  return Val;
+}
+
+APFloat APFloat::getSmallest(const fltSemantics &Sem, bool Negative) {
+  APFloat Val(Sem, fcNormal, Negative);
+
+  // We want (in interchange format):
+  //   sign = {Negative}
+  //   exponent = 0..0
+  //   significand = 0..01
+
+  Val.exponent = Sem.minExponent; // unbiased
+  Val.zeroSignificand();
+  Val.significandParts()[0] = 1;
+  return Val;
+}
+
+APFloat APFloat::getSmallestNormalized(const fltSemantics &Sem, bool Negative) {
+  APFloat Val(Sem, fcNormal, Negative);
+
+  // We want (in interchange format):
+  //   sign = {Negative}
+  //   exponent = 0..0
+  //   significand = 10..0
+
+  Val.exponent = Sem.minExponent;
+  Val.zeroSignificand();
+  Val.significandParts()[partCountForBits(Sem.precision)-1] |=
+    (((integerPart) 1) << ((Sem.precision - 1) % integerPartWidth));
+
+  return Val;
+}
+
+APFloat::APFloat(const fltSemantics &Sem, const APInt &API) {
+  initFromAPInt(&Sem, API);
+}
+
+APFloat::APFloat(float f) {
+  initFromAPInt(&IEEEsingle, APInt::floatToBits(f));
+}
+
+APFloat::APFloat(double d) {
+  initFromAPInt(&IEEEdouble, APInt::doubleToBits(d));
+}
+
+namespace {
+  void append(SmallVectorImpl<char> &Buffer, StringRef Str) {
+    Buffer.append(Str.begin(), Str.end());
+  }
+
+  /// Removes data from the given significand until it is no more
+  /// precise than is required for the desired precision.
+  void AdjustToPrecision(APInt &significand,
+                         int &exp, unsigned FormatPrecision) {
+    unsigned bits = significand.getActiveBits();
+
+    // 196/59 is a very slight overestimate of lg_2(10).
+    unsigned bitsRequired = (FormatPrecision * 196 + 58) / 59;
+
+    if (bits <= bitsRequired) return;
+
+    unsigned tensRemovable = (bits - bitsRequired) * 59 / 196;
+    if (!tensRemovable) return;
+
+    exp += tensRemovable;
+
+    APInt divisor(significand.getBitWidth(), 1);
+    APInt powten(significand.getBitWidth(), 10);
+    while (true) {
+      if (tensRemovable & 1)
+        divisor *= powten;
+      tensRemovable >>= 1;
+      if (!tensRemovable) break;
+      powten *= powten;
+    }
+
+    significand = significand.udiv(divisor);
+
+    // Truncate the significand down to its active bit count.
+    significand = significand.trunc(significand.getActiveBits());
+  }
+
+
+  void AdjustToPrecision(SmallVectorImpl<char> &buffer,
+                         int &exp, unsigned FormatPrecision) {
+    unsigned N = buffer.size();
+    if (N <= FormatPrecision) return;
+
+    // The most significant figures are the last ones in the buffer.
+    unsigned FirstSignificant = N - FormatPrecision;
+
+    // Round.
+    // FIXME: this probably shouldn't use 'round half up'.
+
+    // Rounding down is just a truncation, except we also want to drop
+    // trailing zeros from the new result.
+    if (buffer[FirstSignificant - 1] < '5') {
+      while (FirstSignificant < N && buffer[FirstSignificant] == '0')
+        FirstSignificant++;
+
+      exp += FirstSignificant;
+      buffer.erase(&buffer[0], &buffer[FirstSignificant]);
+      return;
+    }
+
+    // Rounding up requires a decimal add-with-carry.  If we continue
+    // the carry, the newly-introduced zeros will just be truncated.
+    for (unsigned I = FirstSignificant; I != N; ++I) {
+      if (buffer[I] == '9') {
+        FirstSignificant++;
+      } else {
+        buffer[I]++;
+        break;
+      }
+    }
+
+    // If we carried through, we have exactly one digit of precision.
+    if (FirstSignificant == N) {
+      exp += FirstSignificant;
+      buffer.clear();
+      buffer.push_back('1');
+      return;
+    }
+
+    exp += FirstSignificant;
+    buffer.erase(&buffer[0], &buffer[FirstSignificant]);
+  }
+}
+
+void APFloat::toString(SmallVectorImpl<char> &Str,
+                       unsigned FormatPrecision,
+                       unsigned FormatMaxPadding) const {
+  switch (category) {
+  case fcInfinity:
+    if (isNegative())
+      return append(Str, "-Inf");
+    else
+      return append(Str, "+Inf");
+
+  case fcNaN: return append(Str, "NaN");
+
+  case fcZero:
+    if (isNegative())
+      Str.push_back('-');
+
+    if (!FormatMaxPadding)
+      append(Str, "0.0E+0");
+    else
+      Str.push_back('0');
+    return;
+
+  case fcNormal:
+    break;
+  }
+
+  if (isNegative())
+    Str.push_back('-');
+
+  // Decompose the number into an APInt and an exponent.
+  int exp = exponent - ((int) semantics->precision - 1);
+  APInt significand(semantics->precision,
+                    makeArrayRef(significandParts(),
+                                 partCountForBits(semantics->precision)));
+
+  // Set FormatPrecision if zero.  We want to do this before we
+  // truncate trailing zeros, as those are part of the precision.
+  if (!FormatPrecision) {
+    // It's an interesting question whether to use the nominal
+    // precision or the active precision here for denormals.
+
+    // FormatPrecision = ceil(significandBits / lg_2(10))
+    FormatPrecision = (semantics->precision * 59 + 195) / 196;
+  }
+
+  // Ignore trailing binary zeros.
+  int trailingZeros = significand.countTrailingZeros();
+  exp += trailingZeros;
+  significand = significand.lshr(trailingZeros);
+
+  // Change the exponent from 2^e to 10^e.
+  if (exp == 0) {
+    // Nothing to do.
+  } else if (exp > 0) {
+    // Just shift left.
+    significand = significand.zext(semantics->precision + exp);
+    significand <<= exp;
+    exp = 0;
+  } else { /* exp < 0 */
+    int texp = -exp;
+
+    // We transform this using the identity:
+    //   (N)(2^-e) == (N)(5^e)(10^-e)
+    // This means we have to multiply N (the significand) by 5^e.
+    // To avoid overflow, we have to operate on numbers large
+    // enough to store N * 5^e:
+    //   log2(N * 5^e) == log2(N) + e * log2(5)
+    //                 <= semantics->precision + e * 137 / 59
+    //   (log_2(5) ~ 2.321928 < 2.322034 ~ 137/59)
+
+    unsigned precision = semantics->precision + (137 * texp + 136) / 59;
+
+    // Multiply significand by 5^e.
+    //   N * 5^0101 == N * 5^(1*1) * 5^(0*2) * 5^(1*4) * 5^(0*8)
+    significand = significand.zext(precision);
+    APInt five_to_the_i(precision, 5);
+    while (true) {
+      if (texp & 1) significand *= five_to_the_i;
+
+      texp >>= 1;
+      if (!texp) break;
+      five_to_the_i *= five_to_the_i;
+    }
+  }
+
+  AdjustToPrecision(significand, exp, FormatPrecision);
+
+  SmallVector<char, 256> buffer;
+
+  // Fill the buffer.
+  unsigned precision = significand.getBitWidth();
+  APInt ten(precision, 10);
+  APInt digit(precision, 0);
+
+  bool inTrail = true;
+  while (significand != 0) {
+    // digit <- significand % 10
+    // significand <- significand / 10
+    APInt::udivrem(significand, ten, significand, digit);
+
+    unsigned d = digit.getZExtValue();
+
+    // Drop trailing zeros.
+    if (inTrail && !d) exp++;
+    else {
+      buffer.push_back((char) ('0' + d));
+      inTrail = false;
+    }
+  }
+
+  assert(!buffer.empty() && "no characters in buffer!");
+
+  // Drop down to FormatPrecision.
+  // TODO: don't do more precise calculations above than are required.
+  AdjustToPrecision(buffer, exp, FormatPrecision);
+
+  unsigned NDigits = buffer.size();
+
+  // Check whether we should use scientific notation.
+  bool FormatScientific;
+  if (!FormatMaxPadding)
+    FormatScientific = true;
+  else {
+    if (exp >= 0) {
+      // 765e3 --> 765000
+      //              ^^^
+      // But we shouldn't make the number look more precise than it is.
+      FormatScientific = ((unsigned) exp > FormatMaxPadding ||
+                          NDigits + (unsigned) exp > FormatPrecision);
+    } else {
+      // Power of the most significant digit.
+      int MSD = exp + (int) (NDigits - 1);
+      if (MSD >= 0) {
+        // 765e-2 == 7.65
+        FormatScientific = false;
+      } else {
+        // 765e-5 == 0.00765
+        //           ^ ^^
+        FormatScientific = ((unsigned) -MSD) > FormatMaxPadding;
+      }
+    }
+  }
+
+  // Scientific formatting is pretty straightforward.
+  if (FormatScientific) {
+    exp += (NDigits - 1);
+
+    Str.push_back(buffer[NDigits-1]);
+    Str.push_back('.');
+    if (NDigits == 1)
+      Str.push_back('0');
+    else
+      for (unsigned I = 1; I != NDigits; ++I)
+        Str.push_back(buffer[NDigits-1-I]);
+    Str.push_back('E');
+
+    Str.push_back(exp >= 0 ? '+' : '-');
+    if (exp < 0) exp = -exp;
+    SmallVector<char, 6> expbuf;
+    do {
+      expbuf.push_back((char) ('0' + (exp % 10)));
+      exp /= 10;
+    } while (exp);
+    for (unsigned I = 0, E = expbuf.size(); I != E; ++I)
+      Str.push_back(expbuf[E-1-I]);
+    return;
+  }
+
+  // Non-scientific, positive exponents.
+  if (exp >= 0) {
+    for (unsigned I = 0; I != NDigits; ++I)
+      Str.push_back(buffer[NDigits-1-I]);
+    for (unsigned I = 0; I != (unsigned) exp; ++I)
+      Str.push_back('0');
+    return;
+  }
+
+  // Non-scientific, negative exponents.
+
+  // The number of digits to the left of the decimal point.
+  int NWholeDigits = exp + (int) NDigits;
+
+  unsigned I = 0;
+  if (NWholeDigits > 0) {
+    for (; I != (unsigned) NWholeDigits; ++I)
+      Str.push_back(buffer[NDigits-I-1]);
+    Str.push_back('.');
+  } else {
+    unsigned NZeros = 1 + (unsigned) -NWholeDigits;
+
+    Str.push_back('0');
+    Str.push_back('.');
+    for (unsigned Z = 1; Z != NZeros; ++Z)
+      Str.push_back('0');
+  }
+
+  for (; I != NDigits; ++I)
+    Str.push_back(buffer[NDigits-I-1]);
+}
+
+bool APFloat::getExactInverse(APFloat *inv) const {
+  // Special floats and denormals have no exact inverse.
+  if (category != fcNormal)
+    return false;
+
+  // Check that the number is a power of two by making sure that only the
+  // integer bit is set in the significand.
+  if (significandLSB() != semantics->precision - 1)
+    return false;
+
+  // Get the inverse.
+  APFloat reciprocal(*semantics, 1ULL);
+  if (reciprocal.divide(*this, rmNearestTiesToEven) != opOK)
+    return false;
+
+  // Avoid multiplication with a denormal, it is not safe on all platforms and
+  // may be slower than a normal division.
+  if (reciprocal.significandMSB() + 1 < reciprocal.semantics->precision)
+    return false;
+
+  assert(reciprocal.category == fcNormal &&
+         reciprocal.significandLSB() == reciprocal.semantics->precision - 1);
+
+  if (inv)
+    *inv = reciprocal;
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Support/APInt.cpp b/contrib/llvm/lib/Support/APInt.cpp
new file mode 100644
index 000000000000..e8534753b46e
--- /dev/null
+++ b/contrib/llvm/lib/Support/APInt.cpp
@@ -0,0 +1,2910 @@
+//===-- APInt.cpp - Implement APInt class ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a class to represent arbitrary precision integer
+// constant values and provide a variety of arithmetic operations on them.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "apint"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+using namespace llvm;
+
+/// A utility function for allocating memory, checking for allocation failures,
+/// and ensuring the contents are zeroed.
+inline static uint64_t* getClearedMemory(unsigned numWords) {
+  uint64_t * result = new uint64_t[numWords];
+  assert(result && "APInt memory allocation fails!");
+  memset(result, 0, numWords * sizeof(uint64_t));
+  return result;
+}
+
+/// A utility function for allocating memory and checking for allocation
+/// failure.  The content is not zeroed.
+inline static uint64_t* getMemory(unsigned numWords) {
+  uint64_t * result = new uint64_t[numWords];
+  assert(result && "APInt memory allocation fails!");
+  return result;
+}
+
+/// A utility function that converts a character to a digit.
+inline static unsigned getDigit(char cdigit, uint8_t radix) {
+  unsigned r;
+
+  if (radix == 16 || radix == 36) {
+    r = cdigit - '0';
+    if (r <= 9)
+      return r;
+
+    r = cdigit - 'A';
+    if (r <= radix - 11U)
+      return r + 10;
+
+    r = cdigit - 'a';
+    if (r <= radix - 11U)
+      return r + 10;
+    
+    radix = 10;
+  }
+
+  r = cdigit - '0';
+  if (r < radix)
+    return r;
+
+  return -1U;
+}
+
+
+void APInt::initSlowCase(unsigned numBits, uint64_t val, bool isSigned) {
+  pVal = getClearedMemory(getNumWords());
+  pVal[0] = val;
+  if (isSigned && int64_t(val) < 0)
+    for (unsigned i = 1; i < getNumWords(); ++i)
+      pVal[i] = -1ULL;
+}
+
+void APInt::initSlowCase(const APInt& that) {
+  pVal = getMemory(getNumWords());
+  memcpy(pVal, that.pVal, getNumWords() * APINT_WORD_SIZE);
+}
+
+void APInt::initFromArray(ArrayRef<uint64_t> bigVal) {
+  assert(BitWidth && "Bitwidth too small");
+  assert(bigVal.data() && "Null pointer detected!");
+  if (isSingleWord())
+    VAL = bigVal[0];
+  else {
+    // Get memory, cleared to 0
+    pVal = getClearedMemory(getNumWords());
+    // Calculate the number of words to copy
+    unsigned words = std::min<unsigned>(bigVal.size(), getNumWords());
+    // Copy the words from bigVal to pVal
+    memcpy(pVal, bigVal.data(), words * APINT_WORD_SIZE);
+  }
+  // Make sure unused high bits are cleared
+  clearUnusedBits();
+}
+
+APInt::APInt(unsigned numBits, ArrayRef<uint64_t> bigVal)
+  : BitWidth(numBits), VAL(0) {
+  initFromArray(bigVal);
+}
+
+APInt::APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[])
+  : BitWidth(numBits), VAL(0) {
+  initFromArray(makeArrayRef(bigVal, numWords));
+}
+
+APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix)
+  : BitWidth(numbits), VAL(0) {
+  assert(BitWidth && "Bitwidth too small");
+  fromString(numbits, Str, radix);
+}
+
+APInt& APInt::AssignSlowCase(const APInt& RHS) {
+  // Don't do anything for X = X
+  if (this == &RHS)
+    return *this;
+
+  if (BitWidth == RHS.getBitWidth()) {
+    // assume same bit-width single-word case is already handled
+    assert(!isSingleWord());
+    memcpy(pVal, RHS.pVal, getNumWords() * APINT_WORD_SIZE);
+    return *this;
+  }
+
+  if (isSingleWord()) {
+    // assume case where both are single words is already handled
+    assert(!RHS.isSingleWord());
+    VAL = 0;
+    pVal = getMemory(RHS.getNumWords());
+    memcpy(pVal, RHS.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
+  } else if (getNumWords() == RHS.getNumWords())
+    memcpy(pVal, RHS.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
+  else if (RHS.isSingleWord()) {
+    delete [] pVal;
+    VAL = RHS.VAL;
+  } else {
+    delete [] pVal;
+    pVal = getMemory(RHS.getNumWords());
+    memcpy(pVal, RHS.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
+  }
+  BitWidth = RHS.BitWidth;
+  return clearUnusedBits();
+}
+
+APInt& APInt::operator=(uint64_t RHS) {
+  if (isSingleWord())
+    VAL = RHS;
+  else {
+    pVal[0] = RHS;
+    memset(pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE);
+  }
+  return clearUnusedBits();
+}
+
+/// Profile - This method 'profiles' an APInt for use with FoldingSet.
+void APInt::Profile(FoldingSetNodeID& ID) const {
+  ID.AddInteger(BitWidth);
+
+  if (isSingleWord()) {
+    ID.AddInteger(VAL);
+    return;
+  }
+
+  unsigned NumWords = getNumWords();
+  for (unsigned i = 0; i < NumWords; ++i)
+    ID.AddInteger(pVal[i]);
+}
+
+/// add_1 - This function adds a single "digit" integer, y, to the multiple
+/// "digit" integer array,  x[]. x[] is modified to reflect the addition and
+/// 1 is returned if there is a carry out, otherwise 0 is returned.
+/// @returns the carry of the addition.
+static bool add_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
+  for (unsigned i = 0; i < len; ++i) {
+    dest[i] = y + x[i];
+    if (dest[i] < y)
+      y = 1; // Carry one to next digit.
+    else {
+      y = 0; // No need to carry so exit early
+      break;
+    }
+  }
+  return y;
+}
+
+/// @brief Prefix increment operator. Increments the APInt by one.
+APInt& APInt::operator++() {
+  if (isSingleWord())
+    ++VAL;
+  else
+    add_1(pVal, pVal, getNumWords(), 1);
+  return clearUnusedBits();
+}
+
+/// sub_1 - This function subtracts a single "digit" (64-bit word), y, from
+/// the multi-digit integer array, x[], propagating the borrowed 1 value until
+/// no further borrowing is neeeded or it runs out of "digits" in x.  The result
+/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted.
+/// In other words, if y > x then this function returns 1, otherwise 0.
+/// @returns the borrow out of the subtraction
+static bool sub_1(uint64_t x[], unsigned len, uint64_t y) {
+  for (unsigned i = 0; i < len; ++i) {
+    uint64_t X = x[i];
+    x[i] -= y;
+    if (y > X)
+      y = 1;  // We have to "borrow 1" from next "digit"
+    else {
+      y = 0;  // No need to borrow
+      break;  // Remaining digits are unchanged so exit early
+    }
+  }
+  return bool(y);
+}
+
+/// @brief Prefix decrement operator. Decrements the APInt by one.
+APInt& APInt::operator--() {
+  if (isSingleWord())
+    --VAL;
+  else
+    sub_1(pVal, getNumWords(), 1);
+  return clearUnusedBits();
+}
+
+/// add - This function adds the integer array x to the integer array Y and
+/// places the result in dest.
+/// @returns the carry out from the addition
+/// @brief General addition of 64-bit integer arrays
+static bool add(uint64_t *dest, const uint64_t *x, const uint64_t *y,
+                unsigned len) {
+  bool carry = false;
+  for (unsigned i = 0; i< len; ++i) {
+    uint64_t limit = std::min(x[i],y[i]); // must come first in case dest == x
+    dest[i] = x[i] + y[i] + carry;
+    carry = dest[i] < limit || (carry && dest[i] == limit);
+  }
+  return carry;
+}
+
+/// Adds the RHS APint to this APInt.
+/// @returns this, after addition of RHS.
+/// @brief Addition assignment operator.
+APInt& APInt::operator+=(const APInt& RHS) {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord())
+    VAL += RHS.VAL;
+  else {
+    add(pVal, pVal, RHS.pVal, getNumWords());
+  }
+  return clearUnusedBits();
+}
+
+/// Subtracts the integer array y from the integer array x
+/// @returns returns the borrow out.
+/// @brief Generalized subtraction of 64-bit integer arrays.
+static bool sub(uint64_t *dest, const uint64_t *x, const uint64_t *y,
+                unsigned len) {
+  bool borrow = false;
+  for (unsigned i = 0; i < len; ++i) {
+    uint64_t x_tmp = borrow ? x[i] - 1 : x[i];
+    borrow = y[i] > x_tmp || (borrow && x[i] == 0);
+    dest[i] = x_tmp - y[i];
+  }
+  return borrow;
+}
+
+/// Subtracts the RHS APInt from this APInt
+/// @returns this, after subtraction
+/// @brief Subtraction assignment operator.
+APInt& APInt::operator-=(const APInt& RHS) {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord())
+    VAL -= RHS.VAL;
+  else
+    sub(pVal, pVal, RHS.pVal, getNumWords());
+  return clearUnusedBits();
+}
+
+/// Multiplies an integer array, x, by a uint64_t integer and places the result
+/// into dest.
+/// @returns the carry out of the multiplication.
+/// @brief Multiply a multi-digit APInt by a single digit (64-bit) integer.
+static uint64_t mul_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
+  // Split y into high 32-bit part (hy)  and low 32-bit part (ly)
+  uint64_t ly = y & 0xffffffffULL, hy = y >> 32;
+  uint64_t carry = 0;
+
+  // For each digit of x.
+  for (unsigned i = 0; i < len; ++i) {
+    // Split x into high and low words
+    uint64_t lx = x[i] & 0xffffffffULL;
+    uint64_t hx = x[i] >> 32;
+    // hasCarry - A flag to indicate if there is a carry to the next digit.
+    // hasCarry == 0, no carry
+    // hasCarry == 1, has carry
+    // hasCarry == 2, no carry and the calculation result == 0.
+    uint8_t hasCarry = 0;
+    dest[i] = carry + lx * ly;
+    // Determine if the add above introduces carry.
+    hasCarry = (dest[i] < carry) ? 1 : 0;
+    carry = hx * ly + (dest[i] >> 32) + (hasCarry ? (1ULL << 32) : 0);
+    // The upper limit of carry can be (2^32 - 1)(2^32 - 1) +
+    // (2^32 - 1) + 2^32 = 2^64.
+    hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
+
+    carry += (lx * hy) & 0xffffffffULL;
+    dest[i] = (carry << 32) | (dest[i] & 0xffffffffULL);
+    carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) +
+            (carry >> 32) + ((lx * hy) >> 32) + hx * hy;
+  }
+  return carry;
+}
+
+/// Multiplies integer array x by integer array y and stores the result into
+/// the integer array dest. Note that dest's size must be >= xlen + ylen.
+/// @brief Generalized multiplicate of integer arrays.
+static void mul(uint64_t dest[], uint64_t x[], unsigned xlen, uint64_t y[],
+                unsigned ylen) {
+  dest[xlen] = mul_1(dest, x, xlen, y[0]);
+  for (unsigned i = 1; i < ylen; ++i) {
+    uint64_t ly = y[i] & 0xffffffffULL, hy = y[i] >> 32;
+    uint64_t carry = 0, lx = 0, hx = 0;
+    for (unsigned j = 0; j < xlen; ++j) {
+      lx = x[j] & 0xffffffffULL;
+      hx = x[j] >> 32;
+      // hasCarry - A flag to indicate if has carry.
+      // hasCarry == 0, no carry
+      // hasCarry == 1, has carry
+      // hasCarry == 2, no carry and the calculation result == 0.
+      uint8_t hasCarry = 0;
+      uint64_t resul = carry + lx * ly;
+      hasCarry = (resul < carry) ? 1 : 0;
+      carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + (resul >> 32);
+      hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
+
+      carry += (lx * hy) & 0xffffffffULL;
+      resul = (carry << 32) | (resul & 0xffffffffULL);
+      dest[i+j] += resul;
+      carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0)+
+              (carry >> 32) + (dest[i+j] < resul ? 1 : 0) +
+              ((lx * hy) >> 32) + hx * hy;
+    }
+    dest[i+xlen] = carry;
+  }
+}
+
+APInt& APInt::operator*=(const APInt& RHS) {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord()) {
+    VAL *= RHS.VAL;
+    clearUnusedBits();
+    return *this;
+  }
+
+  // Get some bit facts about LHS and check for zero
+  unsigned lhsBits = getActiveBits();
+  unsigned lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1;
+  if (!lhsWords)
+    // 0 * X ===> 0
+    return *this;
+
+  // Get some bit facts about RHS and check for zero
+  unsigned rhsBits = RHS.getActiveBits();
+  unsigned rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1;
+  if (!rhsWords) {
+    // X * 0 ===> 0
+    clearAllBits();
+    return *this;
+  }
+
+  // Allocate space for the result
+  unsigned destWords = rhsWords + lhsWords;
+  uint64_t *dest = getMemory(destWords);
+
+  // Perform the long multiply
+  mul(dest, pVal, lhsWords, RHS.pVal, rhsWords);
+
+  // Copy result back into *this
+  clearAllBits();
+  unsigned wordsToCopy = destWords >= getNumWords() ? getNumWords() : destWords;
+  memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE);
+  clearUnusedBits();
+
+  // delete dest array and return
+  delete[] dest;
+  return *this;
+}
+
+APInt& APInt::operator&=(const APInt& RHS) {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord()) {
+    VAL &= RHS.VAL;
+    return *this;
+  }
+  unsigned numWords = getNumWords();
+  for (unsigned i = 0; i < numWords; ++i)
+    pVal[i] &= RHS.pVal[i];
+  return *this;
+}
+
+APInt& APInt::operator|=(const APInt& RHS) {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord()) {
+    VAL |= RHS.VAL;
+    return *this;
+  }
+  unsigned numWords = getNumWords();
+  for (unsigned i = 0; i < numWords; ++i)
+    pVal[i] |= RHS.pVal[i];
+  return *this;
+}
+
+APInt& APInt::operator^=(const APInt& RHS) {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord()) {
+    VAL ^= RHS.VAL;
+    this->clearUnusedBits();
+    return *this;
+  }
+  unsigned numWords = getNumWords();
+  for (unsigned i = 0; i < numWords; ++i)
+    pVal[i] ^= RHS.pVal[i];
+  return clearUnusedBits();
+}
+
+APInt APInt::AndSlowCase(const APInt& RHS) const {
+  unsigned numWords = getNumWords();
+  uint64_t* val = getMemory(numWords);
+  for (unsigned i = 0; i < numWords; ++i)
+    val[i] = pVal[i] & RHS.pVal[i];
+  return APInt(val, getBitWidth());
+}
+
+APInt APInt::OrSlowCase(const APInt& RHS) const {
+  unsigned numWords = getNumWords();
+  uint64_t *val = getMemory(numWords);
+  for (unsigned i = 0; i < numWords; ++i)
+    val[i] = pVal[i] | RHS.pVal[i];
+  return APInt(val, getBitWidth());
+}
+
+APInt APInt::XorSlowCase(const APInt& RHS) const {
+  unsigned numWords = getNumWords();
+  uint64_t *val = getMemory(numWords);
+  for (unsigned i = 0; i < numWords; ++i)
+    val[i] = pVal[i] ^ RHS.pVal[i];
+
+  // 0^0==1 so clear the high bits in case they got set.
+  return APInt(val, getBitWidth()).clearUnusedBits();
+}
+
+APInt APInt::operator*(const APInt& RHS) const {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord())
+    return APInt(BitWidth, VAL * RHS.VAL);
+  APInt Result(*this);
+  Result *= RHS;
+  return Result;
+}
+
+APInt APInt::operator+(const APInt& RHS) const {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord())
+    return APInt(BitWidth, VAL + RHS.VAL);
+  APInt Result(BitWidth, 0);
+  add(Result.pVal, this->pVal, RHS.pVal, getNumWords());
+  return Result.clearUnusedBits();
+}
+
+APInt APInt::operator-(const APInt& RHS) const {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord())
+    return APInt(BitWidth, VAL - RHS.VAL);
+  APInt Result(BitWidth, 0);
+  sub(Result.pVal, this->pVal, RHS.pVal, getNumWords());
+  return Result.clearUnusedBits();
+}
+
+bool APInt::EqualSlowCase(const APInt& RHS) const {
+  // Get some facts about the number of bits used in the two operands.
+  unsigned n1 = getActiveBits();
+  unsigned n2 = RHS.getActiveBits();
+
+  // If the number of bits isn't the same, they aren't equal
+  if (n1 != n2)
+    return false;
+
+  // If the number of bits fits in a word, we only need to compare the low word.
+  if (n1 <= APINT_BITS_PER_WORD)
+    return pVal[0] == RHS.pVal[0];
+
+  // Otherwise, compare everything
+  for (int i = whichWord(n1 - 1); i >= 0; --i)
+    if (pVal[i] != RHS.pVal[i])
+      return false;
+  return true;
+}
+
+bool APInt::EqualSlowCase(uint64_t Val) const {
+  unsigned n = getActiveBits();
+  if (n <= APINT_BITS_PER_WORD)
+    return pVal[0] == Val;
+  else
+    return false;
+}
+
+bool APInt::ult(const APInt& RHS) const {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be same for comparison");
+  if (isSingleWord())
+    return VAL < RHS.VAL;
+
+  // Get active bit length of both operands
+  unsigned n1 = getActiveBits();
+  unsigned n2 = RHS.getActiveBits();
+
+  // If magnitude of LHS is less than RHS, return true.
+  if (n1 < n2)
+    return true;
+
+  // If magnitude of RHS is greather than LHS, return false.
+  if (n2 < n1)
+    return false;
+
+  // If they bot fit in a word, just compare the low order word
+  if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
+    return pVal[0] < RHS.pVal[0];
+
+  // Otherwise, compare all words
+  unsigned topWord = whichWord(std::max(n1,n2)-1);
+  for (int i = topWord; i >= 0; --i) {
+    if (pVal[i] > RHS.pVal[i])
+      return false;
+    if (pVal[i] < RHS.pVal[i])
+      return true;
+  }
+  return false;
+}
+
+bool APInt::slt(const APInt& RHS) const {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be same for comparison");
+  if (isSingleWord()) {
+    int64_t lhsSext = (int64_t(VAL) << (64-BitWidth)) >> (64-BitWidth);
+    int64_t rhsSext = (int64_t(RHS.VAL) << (64-BitWidth)) >> (64-BitWidth);
+    return lhsSext < rhsSext;
+  }
+
+  APInt lhs(*this);
+  APInt rhs(RHS);
+  bool lhsNeg = isNegative();
+  bool rhsNeg = rhs.isNegative();
+  if (lhsNeg) {
+    // Sign bit is set so perform two's complement to make it positive
+    lhs.flipAllBits();
+    ++lhs;
+  }
+  if (rhsNeg) {
+    // Sign bit is set so perform two's complement to make it positive
+    rhs.flipAllBits();
+    ++rhs;
+  }
+
+  // Now we have unsigned values to compare so do the comparison if necessary
+  // based on the negativeness of the values.
+  if (lhsNeg)
+    if (rhsNeg)
+      return lhs.ugt(rhs);
+    else
+      return true;
+  else if (rhsNeg)
+    return false;
+  else
+    return lhs.ult(rhs);
+}
+
+void APInt::setBit(unsigned bitPosition) {
+  if (isSingleWord())
+    VAL |= maskBit(bitPosition);
+  else
+    pVal[whichWord(bitPosition)] |= maskBit(bitPosition);
+}
+
+/// Set the given bit to 0 whose position is given as "bitPosition".
+/// @brief Set a given bit to 0.
+void APInt::clearBit(unsigned bitPosition) {
+  if (isSingleWord())
+    VAL &= ~maskBit(bitPosition);
+  else
+    pVal[whichWord(bitPosition)] &= ~maskBit(bitPosition);
+}
+
+/// @brief Toggle every bit to its opposite value.
+
+/// Toggle a given bit to its opposite value whose position is given
+/// as "bitPosition".
+/// @brief Toggles a given bit to its opposite value.
+void APInt::flipBit(unsigned bitPosition) {
+  assert(bitPosition < BitWidth && "Out of the bit-width range!");
+  if ((*this)[bitPosition]) clearBit(bitPosition);
+  else setBit(bitPosition);
+}
+
+unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
+  assert(!str.empty() && "Invalid string length");
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 || 
+          radix == 36) &&
+         "Radix should be 2, 8, 10, 16, or 36!");
+
+  size_t slen = str.size();
+
+  // Each computation below needs to know if it's negative.
+  StringRef::iterator p = str.begin();
+  unsigned isNegative = *p == '-';
+  if (*p == '-' || *p == '+') {
+    p++;
+    slen--;
+    assert(slen && "String is only a sign, needs a value.");
+  }
+
+  // For radixes of power-of-two values, the bits required is accurately and
+  // easily computed
+  if (radix == 2)
+    return slen + isNegative;
+  if (radix == 8)
+    return slen * 3 + isNegative;
+  if (radix == 16)
+    return slen * 4 + isNegative;
+
+  // FIXME: base 36
+  
+  // This is grossly inefficient but accurate. We could probably do something
+  // with a computation of roughly slen*64/20 and then adjust by the value of
+  // the first few digits. But, I'm not sure how accurate that could be.
+
+  // Compute a sufficient number of bits that is always large enough but might
+  // be too large. This avoids the assertion in the constructor. This
+  // calculation doesn't work appropriately for the numbers 0-9, so just use 4
+  // bits in that case.
+  unsigned sufficient 
+    = radix == 10? (slen == 1 ? 4 : slen * 64/18)
+                 : (slen == 1 ? 7 : slen * 16/3);
+
+  // Convert to the actual binary value.
+  APInt tmp(sufficient, StringRef(p, slen), radix);
+
+  // Compute how many bits are required. If the log is infinite, assume we need
+  // just bit.
+  unsigned log = tmp.logBase2();
+  if (log == (unsigned)-1) {
+    return isNegative + 1;
+  } else {
+    return isNegative + log + 1;
+  }
+}
+
+hash_code llvm::hash_value(const APInt &Arg) {
+  if (Arg.isSingleWord())
+    return hash_combine(Arg.VAL);
+
+  return hash_combine_range(Arg.pVal, Arg.pVal + Arg.getNumWords());
+}
+
+/// HiBits - This function returns the high "numBits" bits of this APInt.
+APInt APInt::getHiBits(unsigned numBits) const {
+  return APIntOps::lshr(*this, BitWidth - numBits);
+}
+
+/// LoBits - This function returns the low "numBits" bits of this APInt.
+APInt APInt::getLoBits(unsigned numBits) const {
+  return APIntOps::lshr(APIntOps::shl(*this, BitWidth - numBits),
+                        BitWidth - numBits);
+}
+
+unsigned APInt::countLeadingZerosSlowCase() const {
+  // Treat the most significand word differently because it might have
+  // meaningless bits set beyond the precision.
+  unsigned BitsInMSW = BitWidth % APINT_BITS_PER_WORD;
+  integerPart MSWMask;
+  if (BitsInMSW) MSWMask = (integerPart(1) << BitsInMSW) - 1;
+  else {
+    MSWMask = ~integerPart(0);
+    BitsInMSW = APINT_BITS_PER_WORD;
+  }
+
+  unsigned i = getNumWords();
+  integerPart MSW = pVal[i-1] & MSWMask;
+  if (MSW)
+    return CountLeadingZeros_64(MSW) - (APINT_BITS_PER_WORD - BitsInMSW);
+
+  unsigned Count = BitsInMSW;
+  for (--i; i > 0u; --i) {
+    if (pVal[i-1] == 0)
+      Count += APINT_BITS_PER_WORD;
+    else {
+      Count += CountLeadingZeros_64(pVal[i-1]);
+      break;
+    }
+  }
+  return Count;
+}
+
+unsigned APInt::countLeadingOnes() const {
+  if (isSingleWord())
+    return CountLeadingOnes_64(VAL << (APINT_BITS_PER_WORD - BitWidth));
+
+  unsigned highWordBits = BitWidth % APINT_BITS_PER_WORD;
+  unsigned shift;
+  if (!highWordBits) {
+    highWordBits = APINT_BITS_PER_WORD;
+    shift = 0;
+  } else {
+    shift = APINT_BITS_PER_WORD - highWordBits;
+  }
+  int i = getNumWords() - 1;
+  unsigned Count = CountLeadingOnes_64(pVal[i] << shift);
+  if (Count == highWordBits) {
+    for (i--; i >= 0; --i) {
+      if (pVal[i] == -1ULL)
+        Count += APINT_BITS_PER_WORD;
+      else {
+        Count += CountLeadingOnes_64(pVal[i]);
+        break;
+      }
+    }
+  }
+  return Count;
+}
+
+unsigned APInt::countTrailingZeros() const {
+  if (isSingleWord())
+    return std::min(unsigned(CountTrailingZeros_64(VAL)), BitWidth);
+  unsigned Count = 0;
+  unsigned i = 0;
+  for (; i < getNumWords() && pVal[i] == 0; ++i)
+    Count += APINT_BITS_PER_WORD;
+  if (i < getNumWords())
+    Count += CountTrailingZeros_64(pVal[i]);
+  return std::min(Count, BitWidth);
+}
+
+unsigned APInt::countTrailingOnesSlowCase() const {
+  unsigned Count = 0;
+  unsigned i = 0;
+  for (; i < getNumWords() && pVal[i] == -1ULL; ++i)
+    Count += APINT_BITS_PER_WORD;
+  if (i < getNumWords())
+    Count += CountTrailingOnes_64(pVal[i]);
+  return std::min(Count, BitWidth);
+}
+
+unsigned APInt::countPopulationSlowCase() const {
+  unsigned Count = 0;
+  for (unsigned i = 0; i < getNumWords(); ++i)
+    Count += CountPopulation_64(pVal[i]);
+  return Count;
+}
+
+/// Perform a logical right-shift from Src to Dst, which must be equal or
+/// non-overlapping, of Words words, by Shift, which must be less than 64.
+static void lshrNear(uint64_t *Dst, uint64_t *Src, unsigned Words,
+                     unsigned Shift) {
+  uint64_t Carry = 0;
+  for (int I = Words - 1; I >= 0; --I) {
+    uint64_t Tmp = Src[I];
+    Dst[I] = (Tmp >> Shift) | Carry;
+    Carry = Tmp << (64 - Shift);
+  }
+}
+
+APInt APInt::byteSwap() const {
+  assert(BitWidth >= 16 && BitWidth % 16 == 0 && "Cannot byteswap!");
+  if (BitWidth == 16)
+    return APInt(BitWidth, ByteSwap_16(uint16_t(VAL)));
+  if (BitWidth == 32)
+    return APInt(BitWidth, ByteSwap_32(unsigned(VAL)));
+  if (BitWidth == 48) {
+    unsigned Tmp1 = unsigned(VAL >> 16);
+    Tmp1 = ByteSwap_32(Tmp1);
+    uint16_t Tmp2 = uint16_t(VAL);
+    Tmp2 = ByteSwap_16(Tmp2);
+    return APInt(BitWidth, (uint64_t(Tmp2) << 32) | Tmp1);
+  }
+  if (BitWidth == 64)
+    return APInt(BitWidth, ByteSwap_64(VAL));
+
+  APInt Result(getNumWords() * APINT_BITS_PER_WORD, 0);
+  for (unsigned I = 0, N = getNumWords(); I != N; ++I)
+    Result.pVal[I] = ByteSwap_64(pVal[N - I - 1]);
+  if (Result.BitWidth != BitWidth) {
+    lshrNear(Result.pVal, Result.pVal, getNumWords(),
+             Result.BitWidth - BitWidth);
+    Result.BitWidth = BitWidth;
+  }
+  return Result;
+}
+
+APInt llvm::APIntOps::GreatestCommonDivisor(const APInt& API1,
+                                            const APInt& API2) {
+  APInt A = API1, B = API2;
+  while (!!B) {
+    APInt T = B;
+    B = APIntOps::urem(A, B);
+    A = T;
+  }
+  return A;
+}
+
+APInt llvm::APIntOps::RoundDoubleToAPInt(double Double, unsigned width) {
+  union {
+    double D;
+    uint64_t I;
+  } T;
+  T.D = Double;
+
+  // Get the sign bit from the highest order bit
+  bool isNeg = T.I >> 63;
+
+  // Get the 11-bit exponent and adjust for the 1023 bit bias
+  int64_t exp = ((T.I >> 52) & 0x7ff) - 1023;
+
+  // If the exponent is negative, the value is < 0 so just return 0.
+  if (exp < 0)
+    return APInt(width, 0u);
+
+  // Extract the mantissa by clearing the top 12 bits (sign + exponent).
+  uint64_t mantissa = (T.I & (~0ULL >> 12)) | 1ULL << 52;
+
+  // If the exponent doesn't shift all bits out of the mantissa
+  if (exp < 52)
+    return isNeg ? -APInt(width, mantissa >> (52 - exp)) :
+                    APInt(width, mantissa >> (52 - exp));
+
+  // If the client didn't provide enough bits for us to shift the mantissa into
+  // then the result is undefined, just return 0
+  if (width <= exp - 52)
+    return APInt(width, 0);
+
+  // Otherwise, we have to shift the mantissa bits up to the right location
+  APInt Tmp(width, mantissa);
+  Tmp = Tmp.shl((unsigned)exp - 52);
+  return isNeg ? -Tmp : Tmp;
+}
+
+/// RoundToDouble - This function converts this APInt to a double.
+/// The layout for double is as following (IEEE Standard 754):
+///  --------------------------------------
+/// |  Sign    Exponent    Fraction    Bias |
+/// |-------------------------------------- |
+/// |  1[63]   11[62-52]   52[51-00]   1023 |
+///  --------------------------------------
+double APInt::roundToDouble(bool isSigned) const {
+
+  // Handle the simple case where the value is contained in one uint64_t.
+  // It is wrong to optimize getWord(0) to VAL; there might be more than one word.
+  if (isSingleWord() || getActiveBits() <= APINT_BITS_PER_WORD) {
+    if (isSigned) {
+      int64_t sext = (int64_t(getWord(0)) << (64-BitWidth)) >> (64-BitWidth);
+      return double(sext);
+    } else
+      return double(getWord(0));
+  }
+
+  // Determine if the value is negative.
+  bool isNeg = isSigned ? (*this)[BitWidth-1] : false;
+
+  // Construct the absolute value if we're negative.
+  APInt Tmp(isNeg ? -(*this) : (*this));
+
+  // Figure out how many bits we're using.
+  unsigned n = Tmp.getActiveBits();
+
+  // The exponent (without bias normalization) is just the number of bits
+  // we are using. Note that the sign bit is gone since we constructed the
+  // absolute value.
+  uint64_t exp = n;
+
+  // Return infinity for exponent overflow
+  if (exp > 1023) {
+    if (!isSigned || !isNeg)
+      return std::numeric_limits<double>::infinity();
+    else
+      return -std::numeric_limits<double>::infinity();
+  }
+  exp += 1023; // Increment for 1023 bias
+
+  // Number of bits in mantissa is 52. To obtain the mantissa value, we must
+  // extract the high 52 bits from the correct words in pVal.
+  uint64_t mantissa;
+  unsigned hiWord = whichWord(n-1);
+  if (hiWord == 0) {
+    mantissa = Tmp.pVal[0];
+    if (n > 52)
+      mantissa >>= n - 52; // shift down, we want the top 52 bits.
+  } else {
+    assert(hiWord > 0 && "huh?");
+    uint64_t hibits = Tmp.pVal[hiWord] << (52 - n % APINT_BITS_PER_WORD);
+    uint64_t lobits = Tmp.pVal[hiWord-1] >> (11 + n % APINT_BITS_PER_WORD);
+    mantissa = hibits | lobits;
+  }
+
+  // The leading bit of mantissa is implicit, so get rid of it.
+  uint64_t sign = isNeg ? (1ULL << (APINT_BITS_PER_WORD - 1)) : 0;
+  union {
+    double D;
+    uint64_t I;
+  } T;
+  T.I = sign | (exp << 52) | mantissa;
+  return T.D;
+}
+
+// Truncate to new width.
+APInt APInt::trunc(unsigned width) const {
+  assert(width < BitWidth && "Invalid APInt Truncate request");
+  assert(width && "Can't truncate to 0 bits");
+
+  if (width <= APINT_BITS_PER_WORD)
+    return APInt(width, getRawData()[0]);
+
+  APInt Result(getMemory(getNumWords(width)), width);
+
+  // Copy full words.
+  unsigned i;
+  for (i = 0; i != width / APINT_BITS_PER_WORD; i++)
+    Result.pVal[i] = pVal[i];
+
+  // Truncate and copy any partial word.
+  unsigned bits = (0 - width) % APINT_BITS_PER_WORD;
+  if (bits != 0)
+    Result.pVal[i] = pVal[i] << bits >> bits;
+
+  return Result;
+}
+
+// Sign extend to a new width.
+APInt APInt::sext(unsigned width) const {
+  assert(width > BitWidth && "Invalid APInt SignExtend request");
+
+  if (width <= APINT_BITS_PER_WORD) {
+    uint64_t val = VAL << (APINT_BITS_PER_WORD - BitWidth);
+    val = (int64_t)val >> (width - BitWidth);
+    return APInt(width, val >> (APINT_BITS_PER_WORD - width));
+  }
+
+  APInt Result(getMemory(getNumWords(width)), width);
+
+  // Copy full words.
+  unsigned i;
+  uint64_t word = 0;
+  for (i = 0; i != BitWidth / APINT_BITS_PER_WORD; i++) {
+    word = getRawData()[i];
+    Result.pVal[i] = word;
+  }
+
+  // Read and sign-extend any partial word.
+  unsigned bits = (0 - BitWidth) % APINT_BITS_PER_WORD;
+  if (bits != 0)
+    word = (int64_t)getRawData()[i] << bits >> bits;
+  else
+    word = (int64_t)word >> (APINT_BITS_PER_WORD - 1);
+
+  // Write remaining full words.
+  for (; i != width / APINT_BITS_PER_WORD; i++) {
+    Result.pVal[i] = word;
+    word = (int64_t)word >> (APINT_BITS_PER_WORD - 1);
+  }
+
+  // Write any partial word.
+  bits = (0 - width) % APINT_BITS_PER_WORD;
+  if (bits != 0)
+    Result.pVal[i] = word << bits >> bits;
+
+  return Result;
+}
+
+//  Zero extend to a new width.
+APInt APInt::zext(unsigned width) const {
+  assert(width > BitWidth && "Invalid APInt ZeroExtend request");
+
+  if (width <= APINT_BITS_PER_WORD)
+    return APInt(width, VAL);
+
+  APInt Result(getMemory(getNumWords(width)), width);
+
+  // Copy words.
+  unsigned i;
+  for (i = 0; i != getNumWords(); i++)
+    Result.pVal[i] = getRawData()[i];
+
+  // Zero remaining words.
+  memset(&Result.pVal[i], 0, (Result.getNumWords() - i) * APINT_WORD_SIZE);
+
+  return Result;
+}
+
+APInt APInt::zextOrTrunc(unsigned width) const {
+  if (BitWidth < width)
+    return zext(width);
+  if (BitWidth > width)
+    return trunc(width);
+  return *this;
+}
+
+APInt APInt::sextOrTrunc(unsigned width) const {
+  if (BitWidth < width)
+    return sext(width);
+  if (BitWidth > width)
+    return trunc(width);
+  return *this;
+}
+
+APInt APInt::zextOrSelf(unsigned width) const {
+  if (BitWidth < width)
+    return zext(width);
+  return *this;
+}
+
+APInt APInt::sextOrSelf(unsigned width) const {
+  if (BitWidth < width)
+    return sext(width);
+  return *this;
+}
+
+/// Arithmetic right-shift this APInt by shiftAmt.
+/// @brief Arithmetic right-shift function.
+APInt APInt::ashr(const APInt &shiftAmt) const {
+  return ashr((unsigned)shiftAmt.getLimitedValue(BitWidth));
+}
+
+/// Arithmetic right-shift this APInt by shiftAmt.
+/// @brief Arithmetic right-shift function.
+APInt APInt::ashr(unsigned shiftAmt) const {
+  assert(shiftAmt <= BitWidth && "Invalid shift amount");
+  // Handle a degenerate case
+  if (shiftAmt == 0)
+    return *this;
+
+  // Handle single word shifts with built-in ashr
+  if (isSingleWord()) {
+    if (shiftAmt == BitWidth)
+      return APInt(BitWidth, 0); // undefined
+    else {
+      unsigned SignBit = APINT_BITS_PER_WORD - BitWidth;
+      return APInt(BitWidth,
+        (((int64_t(VAL) << SignBit) >> SignBit) >> shiftAmt));
+    }
+  }
+
+  // If all the bits were shifted out, the result is, technically, undefined.
+  // We return -1 if it was negative, 0 otherwise. We check this early to avoid
+  // issues in the algorithm below.
+  if (shiftAmt == BitWidth) {
+    if (isNegative())
+      return APInt(BitWidth, -1ULL, true);
+    else
+      return APInt(BitWidth, 0);
+  }
+
+  // Create some space for the result.
+  uint64_t * val = new uint64_t[getNumWords()];
+
+  // Compute some values needed by the following shift algorithms
+  unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word
+  unsigned offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift
+  unsigned breakWord = getNumWords() - 1 - offset; // last word affected
+  unsigned bitsInWord = whichBit(BitWidth); // how many bits in last word?
+  if (bitsInWord == 0)
+    bitsInWord = APINT_BITS_PER_WORD;
+
+  // If we are shifting whole words, just move whole words
+  if (wordShift == 0) {
+    // Move the words containing significant bits
+    for (unsigned i = 0; i <= breakWord; ++i)
+      val[i] = pVal[i+offset]; // move whole word
+
+    // Adjust the top significant word for sign bit fill, if negative
+    if (isNegative())
+      if (bitsInWord < APINT_BITS_PER_WORD)
+        val[breakWord] |= ~0ULL << bitsInWord; // set high bits
+  } else {
+    // Shift the low order words
+    for (unsigned i = 0; i < breakWord; ++i) {
+      // This combines the shifted corresponding word with the low bits from
+      // the next word (shifted into this word's high bits).
+      val[i] = (pVal[i+offset] >> wordShift) |
+               (pVal[i+offset+1] << (APINT_BITS_PER_WORD - wordShift));
+    }
+
+    // Shift the break word. In this case there are no bits from the next word
+    // to include in this word.
+    val[breakWord] = pVal[breakWord+offset] >> wordShift;
+
+    // Deal with sign extenstion in the break word, and possibly the word before
+    // it.
+    if (isNegative()) {
+      if (wordShift > bitsInWord) {
+        if (breakWord > 0)
+          val[breakWord-1] |=
+            ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord));
+        val[breakWord] |= ~0ULL;
+      } else
+        val[breakWord] |= (~0ULL << (bitsInWord - wordShift));
+    }
+  }
+
+  // Remaining words are 0 or -1, just assign them.
+  uint64_t fillValue = (isNegative() ? -1ULL : 0);
+  for (unsigned i = breakWord+1; i < getNumWords(); ++i)
+    val[i] = fillValue;
+  return APInt(val, BitWidth).clearUnusedBits();
+}
+
+/// Logical right-shift this APInt by shiftAmt.
+/// @brief Logical right-shift function.
+APInt APInt::lshr(const APInt &shiftAmt) const {
+  return lshr((unsigned)shiftAmt.getLimitedValue(BitWidth));
+}
+
+/// Logical right-shift this APInt by shiftAmt.
+/// @brief Logical right-shift function.
+APInt APInt::lshr(unsigned shiftAmt) const {
+  if (isSingleWord()) {
+    if (shiftAmt >= BitWidth)
+      return APInt(BitWidth, 0);
+    else
+      return APInt(BitWidth, this->VAL >> shiftAmt);
+  }
+
+  // If all the bits were shifted out, the result is 0. This avoids issues
+  // with shifting by the size of the integer type, which produces undefined
+  // results. We define these "undefined results" to always be 0.
+  if (shiftAmt >= BitWidth)
+    return APInt(BitWidth, 0);
+
+  // If none of the bits are shifted out, the result is *this. This avoids
+  // issues with shifting by the size of the integer type, which produces
+  // undefined results in the code below. This is also an optimization.
+  if (shiftAmt == 0)
+    return *this;
+
+  // Create some space for the result.
+  uint64_t * val = new uint64_t[getNumWords()];
+
+  // If we are shifting less than a word, compute the shift with a simple carry
+  if (shiftAmt < APINT_BITS_PER_WORD) {
+    lshrNear(val, pVal, getNumWords(), shiftAmt);
+    return APInt(val, BitWidth).clearUnusedBits();
+  }
+
+  // Compute some values needed by the remaining shift algorithms
+  unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD;
+  unsigned offset = shiftAmt / APINT_BITS_PER_WORD;
+
+  // If we are shifting whole words, just move whole words
+  if (wordShift == 0) {
+    for (unsigned i = 0; i < getNumWords() - offset; ++i)
+      val[i] = pVal[i+offset];
+    for (unsigned i = getNumWords()-offset; i < getNumWords(); i++)
+      val[i] = 0;
+    return APInt(val,BitWidth).clearUnusedBits();
+  }
+
+  // Shift the low order words
+  unsigned breakWord = getNumWords() - offset -1;
+  for (unsigned i = 0; i < breakWord; ++i)
+    val[i] = (pVal[i+offset] >> wordShift) |
+             (pVal[i+offset+1] << (APINT_BITS_PER_WORD - wordShift));
+  // Shift the break word.
+  val[breakWord] = pVal[breakWord+offset] >> wordShift;
+
+  // Remaining words are 0
+  for (unsigned i = breakWord+1; i < getNumWords(); ++i)
+    val[i] = 0;
+  return APInt(val, BitWidth).clearUnusedBits();
+}
+
+/// Left-shift this APInt by shiftAmt.
+/// @brief Left-shift function.
+APInt APInt::shl(const APInt &shiftAmt) const {
+  // It's undefined behavior in C to shift by BitWidth or greater.
+  return shl((unsigned)shiftAmt.getLimitedValue(BitWidth));
+}
+
+APInt APInt::shlSlowCase(unsigned shiftAmt) const {
+  // If all the bits were shifted out, the result is 0. This avoids issues
+  // with shifting by the size of the integer type, which produces undefined
+  // results. We define these "undefined results" to always be 0.
+  if (shiftAmt == BitWidth)
+    return APInt(BitWidth, 0);
+
+  // If none of the bits are shifted out, the result is *this. This avoids a
+  // lshr by the words size in the loop below which can produce incorrect
+  // results. It also avoids the expensive computation below for a common case.
+  if (shiftAmt == 0)
+    return *this;
+
+  // Create some space for the result.
+  uint64_t * val = new uint64_t[getNumWords()];
+
+  // If we are shifting less than a word, do it the easy way
+  if (shiftAmt < APINT_BITS_PER_WORD) {
+    uint64_t carry = 0;
+    for (unsigned i = 0; i < getNumWords(); i++) {
+      val[i] = pVal[i] << shiftAmt | carry;
+      carry = pVal[i] >> (APINT_BITS_PER_WORD - shiftAmt);
+    }
+    return APInt(val, BitWidth).clearUnusedBits();
+  }
+
+  // Compute some values needed by the remaining shift algorithms
+  unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD;
+  unsigned offset = shiftAmt / APINT_BITS_PER_WORD;
+
+  // If we are shifting whole words, just move whole words
+  if (wordShift == 0) {
+    for (unsigned i = 0; i < offset; i++)
+      val[i] = 0;
+    for (unsigned i = offset; i < getNumWords(); i++)
+      val[i] = pVal[i-offset];
+    return APInt(val,BitWidth).clearUnusedBits();
+  }
+
+  // Copy whole words from this to Result.
+  unsigned i = getNumWords() - 1;
+  for (; i > offset; --i)
+    val[i] = pVal[i-offset] << wordShift |
+             pVal[i-offset-1] >> (APINT_BITS_PER_WORD - wordShift);
+  val[offset] = pVal[0] << wordShift;
+  for (i = 0; i < offset; ++i)
+    val[i] = 0;
+  return APInt(val, BitWidth).clearUnusedBits();
+}
+
+APInt APInt::rotl(const APInt &rotateAmt) const {
+  return rotl((unsigned)rotateAmt.getLimitedValue(BitWidth));
+}
+
+APInt APInt::rotl(unsigned rotateAmt) const {
+  rotateAmt %= BitWidth;
+  if (rotateAmt == 0)
+    return *this;
+  return shl(rotateAmt) | lshr(BitWidth - rotateAmt);
+}
+
+APInt APInt::rotr(const APInt &rotateAmt) const {
+  return rotr((unsigned)rotateAmt.getLimitedValue(BitWidth));
+}
+
+APInt APInt::rotr(unsigned rotateAmt) const {
+  rotateAmt %= BitWidth;
+  if (rotateAmt == 0)
+    return *this;
+  return lshr(rotateAmt) | shl(BitWidth - rotateAmt);
+}
+
+// Square Root - this method computes and returns the square root of "this".
+// Three mechanisms are used for computation. For small values (<= 5 bits),
+// a table lookup is done. This gets some performance for common cases. For
+// values using less than 52 bits, the value is converted to double and then
+// the libc sqrt function is called. The result is rounded and then converted
+// back to a uint64_t which is then used to construct the result. Finally,
+// the Babylonian method for computing square roots is used.
+APInt APInt::sqrt() const {
+
+  // Determine the magnitude of the value.
+  unsigned magnitude = getActiveBits();
+
+  // Use a fast table for some small values. This also gets rid of some
+  // rounding errors in libc sqrt for small values.
+  if (magnitude <= 5) {
+    static const uint8_t results[32] = {
+      /*     0 */ 0,
+      /*  1- 2 */ 1, 1,
+      /*  3- 6 */ 2, 2, 2, 2,
+      /*  7-12 */ 3, 3, 3, 3, 3, 3,
+      /* 13-20 */ 4, 4, 4, 4, 4, 4, 4, 4,
+      /* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+      /*    31 */ 6
+    };
+    return APInt(BitWidth, results[ (isSingleWord() ? VAL : pVal[0]) ]);
+  }
+
+  // If the magnitude of the value fits in less than 52 bits (the precision of
+  // an IEEE double precision floating point value), then we can use the
+  // libc sqrt function which will probably use a hardware sqrt computation.
+  // This should be faster than the algorithm below.
+  if (magnitude < 52) {
+#if HAVE_ROUND
+    return APInt(BitWidth,
+                 uint64_t(::round(::sqrt(double(isSingleWord()?VAL:pVal[0])))));
+#else
+    return APInt(BitWidth,
+                 uint64_t(::sqrt(double(isSingleWord()?VAL:pVal[0])) + 0.5));
+#endif
+  }
+
+  // Okay, all the short cuts are exhausted. We must compute it. The following
+  // is a classical Babylonian method for computing the square root. This code
+  // was adapted to APINt from a wikipedia article on such computations.
+  // See http://www.wikipedia.org/ and go to the page named
+  // Calculate_an_integer_square_root.
+  unsigned nbits = BitWidth, i = 4;
+  APInt testy(BitWidth, 16);
+  APInt x_old(BitWidth, 1);
+  APInt x_new(BitWidth, 0);
+  APInt two(BitWidth, 2);
+
+  // Select a good starting value using binary logarithms.
+  for (;; i += 2, testy = testy.shl(2))
+    if (i >= nbits || this->ule(testy)) {
+      x_old = x_old.shl(i / 2);
+      break;
+    }
+
+  // Use the Babylonian method to arrive at the integer square root:
+  for (;;) {
+    x_new = (this->udiv(x_old) + x_old).udiv(two);
+    if (x_old.ule(x_new))
+      break;
+    x_old = x_new;
+  }
+
+  // Make sure we return the closest approximation
+  // NOTE: The rounding calculation below is correct. It will produce an
+  // off-by-one discrepancy with results from pari/gp. That discrepancy has been
+  // determined to be a rounding issue with pari/gp as it begins to use a
+  // floating point representation after 192 bits. There are no discrepancies
+  // between this algorithm and pari/gp for bit widths < 192 bits.
+  APInt square(x_old * x_old);
+  APInt nextSquare((x_old + 1) * (x_old +1));
+  if (this->ult(square))
+    return x_old;
+  assert(this->ule(nextSquare) && "Error in APInt::sqrt computation");
+  APInt midpoint((nextSquare - square).udiv(two));
+  APInt offset(*this - square);
+  if (offset.ult(midpoint))
+    return x_old;
+  return x_old + 1;
+}
+
+/// Computes the multiplicative inverse of this APInt for a given modulo. The
+/// iterative extended Euclidean algorithm is used to solve for this value,
+/// however we simplify it to speed up calculating only the inverse, and take
+/// advantage of div+rem calculations. We also use some tricks to avoid copying
+/// (potentially large) APInts around.
+APInt APInt::multiplicativeInverse(const APInt& modulo) const {
+  assert(ult(modulo) && "This APInt must be smaller than the modulo");
+
+  // Using the properties listed at the following web page (accessed 06/21/08):
+  //   http://www.numbertheory.org/php/euclid.html
+  // (especially the properties numbered 3, 4 and 9) it can be proved that
+  // BitWidth bits suffice for all the computations in the algorithm implemented
+  // below. More precisely, this number of bits suffice if the multiplicative
+  // inverse exists, but may not suffice for the general extended Euclidean
+  // algorithm.
+
+  APInt r[2] = { modulo, *this };
+  APInt t[2] = { APInt(BitWidth, 0), APInt(BitWidth, 1) };
+  APInt q(BitWidth, 0);
+
+  unsigned i;
+  for (i = 0; r[i^1] != 0; i ^= 1) {
+    // An overview of the math without the confusing bit-flipping:
+    // q = r[i-2] / r[i-1]
+    // r[i] = r[i-2] % r[i-1]
+    // t[i] = t[i-2] - t[i-1] * q
+    udivrem(r[i], r[i^1], q, r[i]);
+    t[i] -= t[i^1] * q;
+  }
+
+  // If this APInt and the modulo are not coprime, there is no multiplicative
+  // inverse, so return 0. We check this by looking at the next-to-last
+  // remainder, which is the gcd(*this,modulo) as calculated by the Euclidean
+  // algorithm.
+  if (r[i] != 1)
+    return APInt(BitWidth, 0);
+
+  // The next-to-last t is the multiplicative inverse.  However, we are
+  // interested in a positive inverse. Calcuate a positive one from a negative
+  // one if necessary. A simple addition of the modulo suffices because
+  // abs(t[i]) is known to be less than *this/2 (see the link above).
+  return t[i].isNegative() ? t[i] + modulo : t[i];
+}
+
+/// Calculate the magic numbers required to implement a signed integer division
+/// by a constant as a sequence of multiplies, adds and shifts.  Requires that
+/// the divisor not be 0, 1, or -1.  Taken from "Hacker's Delight", Henry S.
+/// Warren, Jr., chapter 10.
+APInt::ms APInt::magic() const {
+  const APInt& d = *this;
+  unsigned p;
+  APInt ad, anc, delta, q1, r1, q2, r2, t;
+  APInt signedMin = APInt::getSignedMinValue(d.getBitWidth());
+  struct ms mag;
+
+  ad = d.abs();
+  t = signedMin + (d.lshr(d.getBitWidth() - 1));
+  anc = t - 1 - t.urem(ad);   // absolute value of nc
+  p = d.getBitWidth() - 1;    // initialize p
+  q1 = signedMin.udiv(anc);   // initialize q1 = 2p/abs(nc)
+  r1 = signedMin - q1*anc;    // initialize r1 = rem(2p,abs(nc))
+  q2 = signedMin.udiv(ad);    // initialize q2 = 2p/abs(d)
+  r2 = signedMin - q2*ad;     // initialize r2 = rem(2p,abs(d))
+  do {
+    p = p + 1;
+    q1 = q1<<1;          // update q1 = 2p/abs(nc)
+    r1 = r1<<1;          // update r1 = rem(2p/abs(nc))
+    if (r1.uge(anc)) {  // must be unsigned comparison
+      q1 = q1 + 1;
+      r1 = r1 - anc;
+    }
+    q2 = q2<<1;          // update q2 = 2p/abs(d)
+    r2 = r2<<1;          // update r2 = rem(2p/abs(d))
+    if (r2.uge(ad)) {   // must be unsigned comparison
+      q2 = q2 + 1;
+      r2 = r2 - ad;
+    }
+    delta = ad - r2;
+  } while (q1.ult(delta) || (q1 == delta && r1 == 0));
+
+  mag.m = q2 + 1;
+  if (d.isNegative()) mag.m = -mag.m;   // resulting magic number
+  mag.s = p - d.getBitWidth();          // resulting shift
+  return mag;
+}
+
+/// Calculate the magic numbers required to implement an unsigned integer
+/// division by a constant as a sequence of multiplies, adds and shifts.
+/// Requires that the divisor not be 0.  Taken from "Hacker's Delight", Henry
+/// S. Warren, Jr., chapter 10.
+/// LeadingZeros can be used to simplify the calculation if the upper bits
+/// of the divided value are known zero.
+APInt::mu APInt::magicu(unsigned LeadingZeros) const {
+  const APInt& d = *this;
+  unsigned p;
+  APInt nc, delta, q1, r1, q2, r2;
+  struct mu magu;
+  magu.a = 0;               // initialize "add" indicator
+  APInt allOnes = APInt::getAllOnesValue(d.getBitWidth()).lshr(LeadingZeros);
+  APInt signedMin = APInt::getSignedMinValue(d.getBitWidth());
+  APInt signedMax = APInt::getSignedMaxValue(d.getBitWidth());
+
+  nc = allOnes - (allOnes - d).urem(d);
+  p = d.getBitWidth() - 1;  // initialize p
+  q1 = signedMin.udiv(nc);  // initialize q1 = 2p/nc
+  r1 = signedMin - q1*nc;   // initialize r1 = rem(2p,nc)
+  q2 = signedMax.udiv(d);   // initialize q2 = (2p-1)/d
+  r2 = signedMax - q2*d;    // initialize r2 = rem((2p-1),d)
+  do {
+    p = p + 1;
+    if (r1.uge(nc - r1)) {
+      q1 = q1 + q1 + 1;  // update q1
+      r1 = r1 + r1 - nc; // update r1
+    }
+    else {
+      q1 = q1+q1; // update q1
+      r1 = r1+r1; // update r1
+    }
+    if ((r2 + 1).uge(d - r2)) {
+      if (q2.uge(signedMax)) magu.a = 1;
+      q2 = q2+q2 + 1;     // update q2
+      r2 = r2+r2 + 1 - d; // update r2
+    }
+    else {
+      if (q2.uge(signedMin)) magu.a = 1;
+      q2 = q2+q2;     // update q2
+      r2 = r2+r2 + 1; // update r2
+    }
+    delta = d - 1 - r2;
+  } while (p < d.getBitWidth()*2 &&
+           (q1.ult(delta) || (q1 == delta && r1 == 0)));
+  magu.m = q2 + 1; // resulting magic number
+  magu.s = p - d.getBitWidth();  // resulting shift
+  return magu;
+}
+
+/// Implementation of Knuth's Algorithm D (Division of nonnegative integers)
+/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The
+/// variables here have the same names as in the algorithm. Comments explain
+/// the algorithm and any deviation from it.
+static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
+                     unsigned m, unsigned n) {
+  assert(u && "Must provide dividend");
+  assert(v && "Must provide divisor");
+  assert(q && "Must provide quotient");
+  assert(u != v && u != q && v != q && "Must us different memory");
+  assert(n>1 && "n must be > 1");
+
+  // Knuth uses the value b as the base of the number system. In our case b
+  // is 2^31 so we just set it to -1u.
+  uint64_t b = uint64_t(1) << 32;
+
+#if 0
+  DEBUG(dbgs() << "KnuthDiv: m=" << m << " n=" << n << '\n');
+  DEBUG(dbgs() << "KnuthDiv: original:");
+  DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
+  DEBUG(dbgs() << " by");
+  DEBUG(for (int i = n; i >0; i--) dbgs() << " " << v[i-1]);
+  DEBUG(dbgs() << '\n');
+#endif
+  // D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of
+  // u and v by d. Note that we have taken Knuth's advice here to use a power
+  // of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of
+  // 2 allows us to shift instead of multiply and it is easy to determine the
+  // shift amount from the leading zeros.  We are basically normalizing the u
+  // and v so that its high bits are shifted to the top of v's range without
+  // overflow. Note that this can require an extra word in u so that u must
+  // be of length m+n+1.
+  unsigned shift = CountLeadingZeros_32(v[n-1]);
+  unsigned v_carry = 0;
+  unsigned u_carry = 0;
+  if (shift) {
+    for (unsigned i = 0; i < m+n; ++i) {
+      unsigned u_tmp = u[i] >> (32 - shift);
+      u[i] = (u[i] << shift) | u_carry;
+      u_carry = u_tmp;
+    }
+    for (unsigned i = 0; i < n; ++i) {
+      unsigned v_tmp = v[i] >> (32 - shift);
+      v[i] = (v[i] << shift) | v_carry;
+      v_carry = v_tmp;
+    }
+  }
+  u[m+n] = u_carry;
+#if 0
+  DEBUG(dbgs() << "KnuthDiv:   normal:");
+  DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
+  DEBUG(dbgs() << " by");
+  DEBUG(for (int i = n; i >0; i--) dbgs() << " " << v[i-1]);
+  DEBUG(dbgs() << '\n');
+#endif
+
+  // D2. [Initialize j.]  Set j to m. This is the loop counter over the places.
+  int j = m;
+  do {
+    DEBUG(dbgs() << "KnuthDiv: quotient digit #" << j << '\n');
+    // D3. [Calculate q'.].
+    //     Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q')
+    //     Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r')
+    // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease
+    // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test
+    // on v[n-2] determines at high speed most of the cases in which the trial
+    // value qp is one too large, and it eliminates all cases where qp is two
+    // too large.
+    uint64_t dividend = ((uint64_t(u[j+n]) << 32) + u[j+n-1]);
+    DEBUG(dbgs() << "KnuthDiv: dividend == " << dividend << '\n');
+    uint64_t qp = dividend / v[n-1];
+    uint64_t rp = dividend % v[n-1];
+    if (qp == b || qp*v[n-2] > b*rp + u[j+n-2]) {
+      qp--;
+      rp += v[n-1];
+      if (rp < b && (qp == b || qp*v[n-2] > b*rp + u[j+n-2]))
+        qp--;
+    }
+    DEBUG(dbgs() << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n');
+
+    // D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with
+    // (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation
+    // consists of a simple multiplication by a one-place number, combined with
+    // a subtraction.
+    bool isNeg = false;
+    for (unsigned i = 0; i < n; ++i) {
+      uint64_t u_tmp = uint64_t(u[j+i]) | (uint64_t(u[j+i+1]) << 32);
+      uint64_t subtrahend = uint64_t(qp) * uint64_t(v[i]);
+      bool borrow = subtrahend > u_tmp;
+      DEBUG(dbgs() << "KnuthDiv: u_tmp == " << u_tmp
+                   << ", subtrahend == " << subtrahend
+                   << ", borrow = " << borrow << '\n');
+
+      uint64_t result = u_tmp - subtrahend;
+      unsigned k = j + i;
+      u[k++] = (unsigned)(result & (b-1)); // subtract low word
+      u[k++] = (unsigned)(result >> 32);   // subtract high word
+      while (borrow && k <= m+n) { // deal with borrow to the left
+        borrow = u[k] == 0;
+        u[k]--;
+        k++;
+      }
+      isNeg |= borrow;
+      DEBUG(dbgs() << "KnuthDiv: u[j+i] == " << u[j+i] << ",  u[j+i+1] == " <<
+                    u[j+i+1] << '\n');
+    }
+    DEBUG(dbgs() << "KnuthDiv: after subtraction:");
+    DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
+    DEBUG(dbgs() << '\n');
+    // The digits (u[j+n]...u[j]) should be kept positive; if the result of
+    // this step is actually negative, (u[j+n]...u[j]) should be left as the
+    // true value plus b**(n+1), namely as the b's complement of
+    // the true value, and a "borrow" to the left should be remembered.
+    //
+    if (isNeg) {
+      bool carry = true;  // true because b's complement is "complement + 1"
+      for (unsigned i = 0; i <= m+n; ++i) {
+        u[i] = ~u[i] + carry; // b's complement
+        carry = carry && u[i] == 0;
+      }
+    }
+    DEBUG(dbgs() << "KnuthDiv: after complement:");
+    DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
+    DEBUG(dbgs() << '\n');
+
+    // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was
+    // negative, go to step D6; otherwise go on to step D7.
+    q[j] = (unsigned)qp;
+    if (isNeg) {
+      // D6. [Add back]. The probability that this step is necessary is very
+      // small, on the order of only 2/b. Make sure that test data accounts for
+      // this possibility. Decrease q[j] by 1
+      q[j]--;
+      // and add (0v[n-1]...v[1]v[0]) to (u[j+n]u[j+n-1]...u[j+1]u[j]).
+      // A carry will occur to the left of u[j+n], and it should be ignored
+      // since it cancels with the borrow that occurred in D4.
+      bool carry = false;
+      for (unsigned i = 0; i < n; i++) {
+        unsigned limit = std::min(u[j+i],v[i]);
+        u[j+i] += v[i] + carry;
+        carry = u[j+i] < limit || (carry && u[j+i] == limit);
+      }
+      u[j+n] += carry;
+    }
+    DEBUG(dbgs() << "KnuthDiv: after correction:");
+    DEBUG(for (int i = m+n; i >=0; i--) dbgs() <<" " << u[i]);
+    DEBUG(dbgs() << "\nKnuthDiv: digit result = " << q[j] << '\n');
+
+  // D7. [Loop on j.]  Decrease j by one. Now if j >= 0, go back to D3.
+  } while (--j >= 0);
+
+  DEBUG(dbgs() << "KnuthDiv: quotient:");
+  DEBUG(for (int i = m; i >=0; i--) dbgs() <<" " << q[i]);
+  DEBUG(dbgs() << '\n');
+
+  // D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired
+  // remainder may be obtained by dividing u[...] by d. If r is non-null we
+  // compute the remainder (urem uses this).
+  if (r) {
+    // The value d is expressed by the "shift" value above since we avoided
+    // multiplication by d by using a shift left. So, all we have to do is
+    // shift right here. In order to mak
+    if (shift) {
+      unsigned carry = 0;
+      DEBUG(dbgs() << "KnuthDiv: remainder:");
+      for (int i = n-1; i >= 0; i--) {
+        r[i] = (u[i] >> shift) | carry;
+        carry = u[i] << (32 - shift);
+        DEBUG(dbgs() << " " << r[i]);
+      }
+    } else {
+      for (int i = n-1; i >= 0; i--) {
+        r[i] = u[i];
+        DEBUG(dbgs() << " " << r[i]);
+      }
+    }
+    DEBUG(dbgs() << '\n');
+  }
+#if 0
+  DEBUG(dbgs() << '\n');
+#endif
+}
+
+void APInt::divide(const APInt LHS, unsigned lhsWords,
+                   const APInt &RHS, unsigned rhsWords,
+                   APInt *Quotient, APInt *Remainder)
+{
+  assert(lhsWords >= rhsWords && "Fractional result");
+
+  // First, compose the values into an array of 32-bit words instead of
+  // 64-bit words. This is a necessity of both the "short division" algorithm
+  // and the Knuth "classical algorithm" which requires there to be native
+  // operations for +, -, and * on an m bit value with an m*2 bit result. We
+  // can't use 64-bit operands here because we don't have native results of
+  // 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't
+  // work on large-endian machines.
+  uint64_t mask = ~0ull >> (sizeof(unsigned)*CHAR_BIT);
+  unsigned n = rhsWords * 2;
+  unsigned m = (lhsWords * 2) - n;
+
+  // Allocate space for the temporary values we need either on the stack, if
+  // it will fit, or on the heap if it won't.
+  unsigned SPACE[128];
+  unsigned *U = 0;
+  unsigned *V = 0;
+  unsigned *Q = 0;
+  unsigned *R = 0;
+  if ((Remainder?4:3)*n+2*m+1 <= 128) {
+    U = &SPACE[0];
+    V = &SPACE[m+n+1];
+    Q = &SPACE[(m+n+1) + n];
+    if (Remainder)
+      R = &SPACE[(m+n+1) + n + (m+n)];
+  } else {
+    U = new unsigned[m + n + 1];
+    V = new unsigned[n];
+    Q = new unsigned[m+n];
+    if (Remainder)
+      R = new unsigned[n];
+  }
+
+  // Initialize the dividend
+  memset(U, 0, (m+n+1)*sizeof(unsigned));
+  for (unsigned i = 0; i < lhsWords; ++i) {
+    uint64_t tmp = (LHS.getNumWords() == 1 ? LHS.VAL : LHS.pVal[i]);
+    U[i * 2] = (unsigned)(tmp & mask);
+    U[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+  }
+  U[m+n] = 0; // this extra word is for "spill" in the Knuth algorithm.
+
+  // Initialize the divisor
+  memset(V, 0, (n)*sizeof(unsigned));
+  for (unsigned i = 0; i < rhsWords; ++i) {
+    uint64_t tmp = (RHS.getNumWords() == 1 ? RHS.VAL : RHS.pVal[i]);
+    V[i * 2] = (unsigned)(tmp & mask);
+    V[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+  }
+
+  // initialize the quotient and remainder
+  memset(Q, 0, (m+n) * sizeof(unsigned));
+  if (Remainder)
+    memset(R, 0, n * sizeof(unsigned));
+
+  // Now, adjust m and n for the Knuth division. n is the number of words in
+  // the divisor. m is the number of words by which the dividend exceeds the
+  // divisor (i.e. m+n is the length of the dividend). These sizes must not
+  // contain any zero words or the Knuth algorithm fails.
+  for (unsigned i = n; i > 0 && V[i-1] == 0; i--) {
+    n--;
+    m++;
+  }
+  for (unsigned i = m+n; i > 0 && U[i-1] == 0; i--)
+    m--;
+
+  // If we're left with only a single word for the divisor, Knuth doesn't work
+  // so we implement the short division algorithm here. This is much simpler
+  // and faster because we are certain that we can divide a 64-bit quantity
+  // by a 32-bit quantity at hardware speed and short division is simply a
+  // series of such operations. This is just like doing short division but we
+  // are using base 2^32 instead of base 10.
+  assert(n != 0 && "Divide by zero?");
+  if (n == 1) {
+    unsigned divisor = V[0];
+    unsigned remainder = 0;
+    for (int i = m+n-1; i >= 0; i--) {
+      uint64_t partial_dividend = uint64_t(remainder) << 32 | U[i];
+      if (partial_dividend == 0) {
+        Q[i] = 0;
+        remainder = 0;
+      } else if (partial_dividend < divisor) {
+        Q[i] = 0;
+        remainder = (unsigned)partial_dividend;
+      } else if (partial_dividend == divisor) {
+        Q[i] = 1;
+        remainder = 0;
+      } else {
+        Q[i] = (unsigned)(partial_dividend / divisor);
+        remainder = (unsigned)(partial_dividend - (Q[i] * divisor));
+      }
+    }
+    if (R)
+      R[0] = remainder;
+  } else {
+    // Now we're ready to invoke the Knuth classical divide algorithm. In this
+    // case n > 1.
+    KnuthDiv(U, V, Q, R, m, n);
+  }
+
+  // If the caller wants the quotient
+  if (Quotient) {
+    // Set up the Quotient value's memory.
+    if (Quotient->BitWidth != LHS.BitWidth) {
+      if (Quotient->isSingleWord())
+        Quotient->VAL = 0;
+      else
+        delete [] Quotient->pVal;
+      Quotient->BitWidth = LHS.BitWidth;
+      if (!Quotient->isSingleWord())
+        Quotient->pVal = getClearedMemory(Quotient->getNumWords());
+    } else
+      Quotient->clearAllBits();
+
+    // The quotient is in Q. Reconstitute the quotient into Quotient's low
+    // order words.
+    if (lhsWords == 1) {
+      uint64_t tmp =
+        uint64_t(Q[0]) | (uint64_t(Q[1]) << (APINT_BITS_PER_WORD / 2));
+      if (Quotient->isSingleWord())
+        Quotient->VAL = tmp;
+      else
+        Quotient->pVal[0] = tmp;
+    } else {
+      assert(!Quotient->isSingleWord() && "Quotient APInt not large enough");
+      for (unsigned i = 0; i < lhsWords; ++i)
+        Quotient->pVal[i] =
+          uint64_t(Q[i*2]) | (uint64_t(Q[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+    }
+  }
+
+  // If the caller wants the remainder
+  if (Remainder) {
+    // Set up the Remainder value's memory.
+    if (Remainder->BitWidth != RHS.BitWidth) {
+      if (Remainder->isSingleWord())
+        Remainder->VAL = 0;
+      else
+        delete [] Remainder->pVal;
+      Remainder->BitWidth = RHS.BitWidth;
+      if (!Remainder->isSingleWord())
+        Remainder->pVal = getClearedMemory(Remainder->getNumWords());
+    } else
+      Remainder->clearAllBits();
+
+    // The remainder is in R. Reconstitute the remainder into Remainder's low
+    // order words.
+    if (rhsWords == 1) {
+      uint64_t tmp =
+        uint64_t(R[0]) | (uint64_t(R[1]) << (APINT_BITS_PER_WORD / 2));
+      if (Remainder->isSingleWord())
+        Remainder->VAL = tmp;
+      else
+        Remainder->pVal[0] = tmp;
+    } else {
+      assert(!Remainder->isSingleWord() && "Remainder APInt not large enough");
+      for (unsigned i = 0; i < rhsWords; ++i)
+        Remainder->pVal[i] =
+          uint64_t(R[i*2]) | (uint64_t(R[i*2+1]) << (APINT_BITS_PER_WORD / 2));
+    }
+  }
+
+  // Clean up the memory we allocated.
+  if (U != &SPACE[0]) {
+    delete [] U;
+    delete [] V;
+    delete [] Q;
+    delete [] R;
+  }
+}
+
+APInt APInt::udiv(const APInt& RHS) const {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+
+  // First, deal with the easy case
+  if (isSingleWord()) {
+    assert(RHS.VAL != 0 && "Divide by zero?");
+    return APInt(BitWidth, VAL / RHS.VAL);
+  }
+
+  // Get some facts about the LHS and RHS number of bits and words
+  unsigned rhsBits = RHS.getActiveBits();
+  unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+  assert(rhsWords && "Divided by zero???");
+  unsigned lhsBits = this->getActiveBits();
+  unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+
+  // Deal with some degenerate cases
+  if (!lhsWords)
+    // 0 / X ===> 0
+    return APInt(BitWidth, 0);
+  else if (lhsWords < rhsWords || this->ult(RHS)) {
+    // X / Y ===> 0, iff X < Y
+    return APInt(BitWidth, 0);
+  } else if (*this == RHS) {
+    // X / X ===> 1
+    return APInt(BitWidth, 1);
+  } else if (lhsWords == 1 && rhsWords == 1) {
+    // All high words are zero, just use native divide
+    return APInt(BitWidth, this->pVal[0] / RHS.pVal[0]);
+  }
+
+  // We have to compute it the hard way. Invoke the Knuth divide algorithm.
+  APInt Quotient(1,0); // to hold result.
+  divide(*this, lhsWords, RHS, rhsWords, &Quotient, 0);
+  return Quotient;
+}
+
+APInt APInt::sdiv(const APInt &RHS) const {
+  if (isNegative()) {
+    if (RHS.isNegative())
+      return (-(*this)).udiv(-RHS);
+    return -((-(*this)).udiv(RHS));
+  }
+  if (RHS.isNegative())
+    return -(this->udiv(-RHS));
+  return this->udiv(RHS);
+}
+
+APInt APInt::urem(const APInt& RHS) const {
+  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  if (isSingleWord()) {
+    assert(RHS.VAL != 0 && "Remainder by zero?");
+    return APInt(BitWidth, VAL % RHS.VAL);
+  }
+
+  // Get some facts about the LHS
+  unsigned lhsBits = getActiveBits();
+  unsigned lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
+
+  // Get some facts about the RHS
+  unsigned rhsBits = RHS.getActiveBits();
+  unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+  assert(rhsWords && "Performing remainder operation by zero ???");
+
+  // Check the degenerate cases
+  if (lhsWords == 0) {
+    // 0 % Y ===> 0
+    return APInt(BitWidth, 0);
+  } else if (lhsWords < rhsWords || this->ult(RHS)) {
+    // X % Y ===> X, iff X < Y
+    return *this;
+  } else if (*this == RHS) {
+    // X % X == 0;
+    return APInt(BitWidth, 0);
+  } else if (lhsWords == 1) {
+    // All high words are zero, just use native remainder
+    return APInt(BitWidth, pVal[0] % RHS.pVal[0]);
+  }
+
+  // We have to compute it the hard way. Invoke the Knuth divide algorithm.
+  APInt Remainder(1,0);
+  divide(*this, lhsWords, RHS, rhsWords, 0, &Remainder);
+  return Remainder;
+}
+
+APInt APInt::srem(const APInt &RHS) const {
+  if (isNegative()) {
+    if (RHS.isNegative())
+      return -((-(*this)).urem(-RHS));
+    return -((-(*this)).urem(RHS));
+  }
+  if (RHS.isNegative())
+    return this->urem(-RHS);
+  return this->urem(RHS);
+}
+
+void APInt::udivrem(const APInt &LHS, const APInt &RHS,
+                    APInt &Quotient, APInt &Remainder) {
+  // Get some size facts about the dividend and divisor
+  unsigned lhsBits  = LHS.getActiveBits();
+  unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+  unsigned rhsBits  = RHS.getActiveBits();
+  unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+
+  // Check the degenerate cases
+  if (lhsWords == 0) {
+    Quotient = 0;                // 0 / Y ===> 0
+    Remainder = 0;               // 0 % Y ===> 0
+    return;
+  }
+
+  if (lhsWords < rhsWords || LHS.ult(RHS)) {
+    Remainder = LHS;            // X % Y ===> X, iff X < Y
+    Quotient = 0;               // X / Y ===> 0, iff X < Y
+    return;
+  }
+
+  if (LHS == RHS) {
+    Quotient  = 1;              // X / X ===> 1
+    Remainder = 0;              // X % X ===> 0;
+    return;
+  }
+
+  if (lhsWords == 1 && rhsWords == 1) {
+    // There is only one word to consider so use the native versions.
+    uint64_t lhsValue = LHS.isSingleWord() ? LHS.VAL : LHS.pVal[0];
+    uint64_t rhsValue = RHS.isSingleWord() ? RHS.VAL : RHS.pVal[0];
+    Quotient = APInt(LHS.getBitWidth(), lhsValue / rhsValue);
+    Remainder = APInt(LHS.getBitWidth(), lhsValue % rhsValue);
+    return;
+  }
+
+  // Okay, lets do it the long way
+  divide(LHS, lhsWords, RHS, rhsWords, &Quotient, &Remainder);
+}
+
+void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
+                    APInt &Quotient, APInt &Remainder) {
+  if (LHS.isNegative()) {
+    if (RHS.isNegative())
+      APInt::udivrem(-LHS, -RHS, Quotient, Remainder);
+    else {
+      APInt::udivrem(-LHS, RHS, Quotient, Remainder);
+      Quotient = -Quotient;
+    }
+    Remainder = -Remainder;
+  } else if (RHS.isNegative()) {
+    APInt::udivrem(LHS, -RHS, Quotient, Remainder);
+    Quotient = -Quotient;
+  } else {
+    APInt::udivrem(LHS, RHS, Quotient, Remainder);
+  }
+}
+
+APInt APInt::sadd_ov(const APInt &RHS, bool &Overflow) const {
+  APInt Res = *this+RHS;
+  Overflow = isNonNegative() == RHS.isNonNegative() &&
+             Res.isNonNegative() != isNonNegative();
+  return Res;
+}
+
+APInt APInt::uadd_ov(const APInt &RHS, bool &Overflow) const {
+  APInt Res = *this+RHS;
+  Overflow = Res.ult(RHS);
+  return Res;
+}
+
+APInt APInt::ssub_ov(const APInt &RHS, bool &Overflow) const {
+  APInt Res = *this - RHS;
+  Overflow = isNonNegative() != RHS.isNonNegative() &&
+             Res.isNonNegative() != isNonNegative();
+  return Res;
+}
+
+APInt APInt::usub_ov(const APInt &RHS, bool &Overflow) const {
+  APInt Res = *this-RHS;
+  Overflow = Res.ugt(*this);
+  return Res;
+}
+
+APInt APInt::sdiv_ov(const APInt &RHS, bool &Overflow) const {
+  // MININT/-1  -->  overflow.
+  Overflow = isMinSignedValue() && RHS.isAllOnesValue();
+  return sdiv(RHS);
+}
+
+APInt APInt::smul_ov(const APInt &RHS, bool &Overflow) const {
+  APInt Res = *this * RHS;
+  
+  if (*this != 0 && RHS != 0)
+    Overflow = Res.sdiv(RHS) != *this || Res.sdiv(*this) != RHS;
+  else
+    Overflow = false;
+  return Res;
+}
+
+APInt APInt::umul_ov(const APInt &RHS, bool &Overflow) const {
+  APInt Res = *this * RHS;
+
+  if (*this != 0 && RHS != 0)
+    Overflow = Res.udiv(RHS) != *this || Res.udiv(*this) != RHS;
+  else
+    Overflow = false;
+  return Res;
+}
+
+APInt APInt::sshl_ov(unsigned ShAmt, bool &Overflow) const {
+  Overflow = ShAmt >= getBitWidth();
+  if (Overflow)
+    ShAmt = getBitWidth()-1;
+
+  if (isNonNegative()) // Don't allow sign change.
+    Overflow = ShAmt >= countLeadingZeros();
+  else
+    Overflow = ShAmt >= countLeadingOnes();
+  
+  return *this << ShAmt;
+}
+
+
+
+
+void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
+  // Check our assumptions here
+  assert(!str.empty() && "Invalid string length");
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 || 
+          radix == 36) &&
+         "Radix should be 2, 8, 10, 16, or 36!");
+
+  StringRef::iterator p = str.begin();
+  size_t slen = str.size();
+  bool isNeg = *p == '-';
+  if (*p == '-' || *p == '+') {
+    p++;
+    slen--;
+    assert(slen && "String is only a sign, needs a value.");
+  }
+  assert((slen <= numbits || radix != 2) && "Insufficient bit width");
+  assert(((slen-1)*3 <= numbits || radix != 8) && "Insufficient bit width");
+  assert(((slen-1)*4 <= numbits || radix != 16) && "Insufficient bit width");
+  assert((((slen-1)*64)/22 <= numbits || radix != 10) &&
+         "Insufficient bit width");
+
+  // Allocate memory
+  if (!isSingleWord())
+    pVal = getClearedMemory(getNumWords());
+
+  // Figure out if we can shift instead of multiply
+  unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0);
+
+  // Set up an APInt for the digit to add outside the loop so we don't
+  // constantly construct/destruct it.
+  APInt apdigit(getBitWidth(), 0);
+  APInt apradix(getBitWidth(), radix);
+
+  // Enter digit traversal loop
+  for (StringRef::iterator e = str.end(); p != e; ++p) {
+    unsigned digit = getDigit(*p, radix);
+    assert(digit < radix && "Invalid character in digit string");
+
+    // Shift or multiply the value by the radix
+    if (slen > 1) {
+      if (shift)
+        *this <<= shift;
+      else
+        *this *= apradix;
+    }
+
+    // Add in the digit we just interpreted
+    if (apdigit.isSingleWord())
+      apdigit.VAL = digit;
+    else
+      apdigit.pVal[0] = digit;
+    *this += apdigit;
+  }
+  // If its negative, put it in two's complement form
+  if (isNeg) {
+    --(*this);
+    this->flipAllBits();
+  }
+}
+
+void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
+                     bool Signed, bool formatAsCLiteral) const {
+  assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2 || 
+          Radix == 36) &&
+         "Radix should be 2, 8, 10, 16, or 36!");
+
+  const char *Prefix = "";
+  if (formatAsCLiteral) {
+    switch (Radix) {
+      case 2:
+        // Binary literals are a non-standard extension added in gcc 4.3:
+        // http://gcc.gnu.org/onlinedocs/gcc-4.3.0/gcc/Binary-constants.html
+        Prefix = "0b";
+        break;
+      case 8:
+        Prefix = "0";
+        break;
+      case 10:
+        break; // No prefix
+      case 16:
+        Prefix = "0x";
+        break;
+      default:
+        llvm_unreachable("Invalid radix!");
+    }
+  }
+
+  // First, check for a zero value and just short circuit the logic below.
+  if (*this == 0) {
+    while (*Prefix) {
+      Str.push_back(*Prefix);
+      ++Prefix;
+    };
+    Str.push_back('0');
+    return;
+  }
+
+  static const char Digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+  if (isSingleWord()) {
+    char Buffer[65];
+    char *BufPtr = Buffer+65;
+
+    uint64_t N;
+    if (!Signed) {
+      N = getZExtValue();
+    } else {
+      int64_t I = getSExtValue();
+      if (I >= 0) {
+        N = I;
+      } else {
+        Str.push_back('-');
+        N = -(uint64_t)I;
+      }
+    }
+
+    while (*Prefix) {
+      Str.push_back(*Prefix);
+      ++Prefix;
+    };
+
+    while (N) {
+      *--BufPtr = Digits[N % Radix];
+      N /= Radix;
+    }
+    Str.append(BufPtr, Buffer+65);
+    return;
+  }
+
+  APInt Tmp(*this);
+
+  if (Signed && isNegative()) {
+    // They want to print the signed version and it is a negative value
+    // Flip the bits and add one to turn it into the equivalent positive
+    // value and put a '-' in the result.
+    Tmp.flipAllBits();
+    ++Tmp;
+    Str.push_back('-');
+  }
+
+  while (*Prefix) {
+    Str.push_back(*Prefix);
+    ++Prefix;
+  };
+
+  // We insert the digits backward, then reverse them to get the right order.
+  unsigned StartDig = Str.size();
+
+  // For the 2, 8 and 16 bit cases, we can just shift instead of divide
+  // because the number of bits per digit (1, 3 and 4 respectively) divides
+  // equaly.  We just shift until the value is zero.
+  if (Radix == 2 || Radix == 8 || Radix == 16) {
+    // Just shift tmp right for each digit width until it becomes zero
+    unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1));
+    unsigned MaskAmt = Radix - 1;
+
+    while (Tmp != 0) {
+      unsigned Digit = unsigned(Tmp.getRawData()[0]) & MaskAmt;
+      Str.push_back(Digits[Digit]);
+      Tmp = Tmp.lshr(ShiftAmt);
+    }
+  } else {
+    APInt divisor(Radix == 10? 4 : 8, Radix);
+    while (Tmp != 0) {
+      APInt APdigit(1, 0);
+      APInt tmp2(Tmp.getBitWidth(), 0);
+      divide(Tmp, Tmp.getNumWords(), divisor, divisor.getNumWords(), &tmp2,
+             &APdigit);
+      unsigned Digit = (unsigned)APdigit.getZExtValue();
+      assert(Digit < Radix && "divide failed");
+      Str.push_back(Digits[Digit]);
+      Tmp = tmp2;
+    }
+  }
+
+  // Reverse the digits before returning.
+  std::reverse(Str.begin()+StartDig, Str.end());
+}
+
+/// toString - This returns the APInt as a std::string.  Note that this is an
+/// inefficient method.  It is better to pass in a SmallVector/SmallString
+/// to the methods above.
+std::string APInt::toString(unsigned Radix = 10, bool Signed = true) const {
+  SmallString<40> S;
+  toString(S, Radix, Signed, /* formatAsCLiteral = */false);
+  return S.str();
+}
+
+
+void APInt::dump() const {
+  SmallString<40> S, U;
+  this->toStringUnsigned(U);
+  this->toStringSigned(S);
+  dbgs() << "APInt(" << BitWidth << "b, "
+         << U.str() << "u " << S.str() << "s)";
+}
+
+void APInt::print(raw_ostream &OS, bool isSigned) const {
+  SmallString<40> S;
+  this->toString(S, 10, isSigned, /* formatAsCLiteral = */false);
+  OS << S.str();
+}
+
+// This implements a variety of operations on a representation of
+// arbitrary precision, two's-complement, bignum integer values.
+
+// Assumed by lowHalf, highHalf, partMSB and partLSB.  A fairly safe
+// and unrestricting assumption.
+#define COMPILE_TIME_ASSERT(cond) extern int CTAssert[(cond) ? 1 : -1]
+COMPILE_TIME_ASSERT(integerPartWidth % 2 == 0);
+
+/* Some handy functions local to this file.  */
+namespace {
+
+  /* Returns the integer part with the least significant BITS set.
+     BITS cannot be zero.  */
+  static inline integerPart
+  lowBitMask(unsigned int bits)
+  {
+    assert(bits != 0 && bits <= integerPartWidth);
+
+    return ~(integerPart) 0 >> (integerPartWidth - bits);
+  }
+
+  /* Returns the value of the lower half of PART.  */
+  static inline integerPart
+  lowHalf(integerPart part)
+  {
+    return part & lowBitMask(integerPartWidth / 2);
+  }
+
+  /* Returns the value of the upper half of PART.  */
+  static inline integerPart
+  highHalf(integerPart part)
+  {
+    return part >> (integerPartWidth / 2);
+  }
+
+  /* Returns the bit number of the most significant set bit of a part.
+     If the input number has no bits set -1U is returned.  */
+  static unsigned int
+  partMSB(integerPart value)
+  {
+    unsigned int n, msb;
+
+    if (value == 0)
+      return -1U;
+
+    n = integerPartWidth / 2;
+
+    msb = 0;
+    do {
+      if (value >> n) {
+        value >>= n;
+        msb += n;
+      }
+
+      n >>= 1;
+    } while (n);
+
+    return msb;
+  }
+
+  /* Returns the bit number of the least significant set bit of a
+     part.  If the input number has no bits set -1U is returned.  */
+  static unsigned int
+  partLSB(integerPart value)
+  {
+    unsigned int n, lsb;
+
+    if (value == 0)
+      return -1U;
+
+    lsb = integerPartWidth - 1;
+    n = integerPartWidth / 2;
+
+    do {
+      if (value << n) {
+        value <<= n;
+        lsb -= n;
+      }
+
+      n >>= 1;
+    } while (n);
+
+    return lsb;
+  }
+}
+
+/* Sets the least significant part of a bignum to the input value, and
+   zeroes out higher parts.  */
+void
+APInt::tcSet(integerPart *dst, integerPart part, unsigned int parts)
+{
+  unsigned int i;
+
+  assert(parts > 0);
+
+  dst[0] = part;
+  for (i = 1; i < parts; i++)
+    dst[i] = 0;
+}
+
+/* Assign one bignum to another.  */
+void
+APInt::tcAssign(integerPart *dst, const integerPart *src, unsigned int parts)
+{
+  unsigned int i;
+
+  for (i = 0; i < parts; i++)
+    dst[i] = src[i];
+}
+
+/* Returns true if a bignum is zero, false otherwise.  */
+bool
+APInt::tcIsZero(const integerPart *src, unsigned int parts)
+{
+  unsigned int i;
+
+  for (i = 0; i < parts; i++)
+    if (src[i])
+      return false;
+
+  return true;
+}
+
+/* Extract the given bit of a bignum; returns 0 or 1.  */
+int
+APInt::tcExtractBit(const integerPart *parts, unsigned int bit)
+{
+  return (parts[bit / integerPartWidth] &
+          ((integerPart) 1 << bit % integerPartWidth)) != 0;
+}
+
+/* Set the given bit of a bignum. */
+void
+APInt::tcSetBit(integerPart *parts, unsigned int bit)
+{
+  parts[bit / integerPartWidth] |= (integerPart) 1 << (bit % integerPartWidth);
+}
+
+/* Clears the given bit of a bignum. */
+void
+APInt::tcClearBit(integerPart *parts, unsigned int bit)
+{
+  parts[bit / integerPartWidth] &=
+    ~((integerPart) 1 << (bit % integerPartWidth));
+}
+
+/* Returns the bit number of the least significant set bit of a
+   number.  If the input number has no bits set -1U is returned.  */
+unsigned int
+APInt::tcLSB(const integerPart *parts, unsigned int n)
+{
+  unsigned int i, lsb;
+
+  for (i = 0; i < n; i++) {
+      if (parts[i] != 0) {
+          lsb = partLSB(parts[i]);
+
+          return lsb + i * integerPartWidth;
+      }
+  }
+
+  return -1U;
+}
+
+/* Returns the bit number of the most significant set bit of a number.
+   If the input number has no bits set -1U is returned.  */
+unsigned int
+APInt::tcMSB(const integerPart *parts, unsigned int n)
+{
+  unsigned int msb;
+
+  do {
+    --n;
+
+    if (parts[n] != 0) {
+      msb = partMSB(parts[n]);
+
+      return msb + n * integerPartWidth;
+    }
+  } while (n);
+
+  return -1U;
+}
+
+/* Copy the bit vector of width srcBITS from SRC, starting at bit
+   srcLSB, to DST, of dstCOUNT parts, such that the bit srcLSB becomes
+   the least significant bit of DST.  All high bits above srcBITS in
+   DST are zero-filled.  */
+void
+APInt::tcExtract(integerPart *dst, unsigned int dstCount,const integerPart *src,
+                 unsigned int srcBits, unsigned int srcLSB)
+{
+  unsigned int firstSrcPart, dstParts, shift, n;
+
+  dstParts = (srcBits + integerPartWidth - 1) / integerPartWidth;
+  assert(dstParts <= dstCount);
+
+  firstSrcPart = srcLSB / integerPartWidth;
+  tcAssign (dst, src + firstSrcPart, dstParts);
+
+  shift = srcLSB % integerPartWidth;
+  tcShiftRight (dst, dstParts, shift);
+
+  /* We now have (dstParts * integerPartWidth - shift) bits from SRC
+     in DST.  If this is less that srcBits, append the rest, else
+     clear the high bits.  */
+  n = dstParts * integerPartWidth - shift;
+  if (n < srcBits) {
+    integerPart mask = lowBitMask (srcBits - n);
+    dst[dstParts - 1] |= ((src[firstSrcPart + dstParts] & mask)
+                          << n % integerPartWidth);
+  } else if (n > srcBits) {
+    if (srcBits % integerPartWidth)
+      dst[dstParts - 1] &= lowBitMask (srcBits % integerPartWidth);
+  }
+
+  /* Clear high parts.  */
+  while (dstParts < dstCount)
+    dst[dstParts++] = 0;
+}
+
+/* DST += RHS + C where C is zero or one.  Returns the carry flag.  */
+integerPart
+APInt::tcAdd(integerPart *dst, const integerPart *rhs,
+             integerPart c, unsigned int parts)
+{
+  unsigned int i;
+
+  assert(c <= 1);
+
+  for (i = 0; i < parts; i++) {
+    integerPart l;
+
+    l = dst[i];
+    if (c) {
+      dst[i] += rhs[i] + 1;
+      c = (dst[i] <= l);
+    } else {
+      dst[i] += rhs[i];
+      c = (dst[i] < l);
+    }
+  }
+
+  return c;
+}
+
+/* DST -= RHS + C where C is zero or one.  Returns the carry flag.  */
+integerPart
+APInt::tcSubtract(integerPart *dst, const integerPart *rhs,
+                  integerPart c, unsigned int parts)
+{
+  unsigned int i;
+
+  assert(c <= 1);
+
+  for (i = 0; i < parts; i++) {
+    integerPart l;
+
+    l = dst[i];
+    if (c) {
+      dst[i] -= rhs[i] + 1;
+      c = (dst[i] >= l);
+    } else {
+      dst[i] -= rhs[i];
+      c = (dst[i] > l);
+    }
+  }
+
+  return c;
+}
+
+/* Negate a bignum in-place.  */
+void
+APInt::tcNegate(integerPart *dst, unsigned int parts)
+{
+  tcComplement(dst, parts);
+  tcIncrement(dst, parts);
+}
+
+/*  DST += SRC * MULTIPLIER + CARRY   if add is true
+    DST  = SRC * MULTIPLIER + CARRY   if add is false
+
+    Requires 0 <= DSTPARTS <= SRCPARTS + 1.  If DST overlaps SRC
+    they must start at the same point, i.e. DST == SRC.
+
+    If DSTPARTS == SRCPARTS + 1 no overflow occurs and zero is
+    returned.  Otherwise DST is filled with the least significant
+    DSTPARTS parts of the result, and if all of the omitted higher
+    parts were zero return zero, otherwise overflow occurred and
+    return one.  */
+int
+APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
+                      integerPart multiplier, integerPart carry,
+                      unsigned int srcParts, unsigned int dstParts,
+                      bool add)
+{
+  unsigned int i, n;
+
+  /* Otherwise our writes of DST kill our later reads of SRC.  */
+  assert(dst <= src || dst >= src + srcParts);
+  assert(dstParts <= srcParts + 1);
+
+  /* N loops; minimum of dstParts and srcParts.  */
+  n = dstParts < srcParts ? dstParts: srcParts;
+
+  for (i = 0; i < n; i++) {
+    integerPart low, mid, high, srcPart;
+
+      /* [ LOW, HIGH ] = MULTIPLIER * SRC[i] + DST[i] + CARRY.
+
+         This cannot overflow, because
+
+         (n - 1) * (n - 1) + 2 (n - 1) = (n - 1) * (n + 1)
+
+         which is less than n^2.  */
+
+    srcPart = src[i];
+
+    if (multiplier == 0 || srcPart == 0)        {
+      low = carry;
+      high = 0;
+    } else {
+      low = lowHalf(srcPart) * lowHalf(multiplier);
+      high = highHalf(srcPart) * highHalf(multiplier);
+
+      mid = lowHalf(srcPart) * highHalf(multiplier);
+      high += highHalf(mid);
+      mid <<= integerPartWidth / 2;
+      if (low + mid < low)
+        high++;
+      low += mid;
+
+      mid = highHalf(srcPart) * lowHalf(multiplier);
+      high += highHalf(mid);
+      mid <<= integerPartWidth / 2;
+      if (low + mid < low)
+        high++;
+      low += mid;
+
+      /* Now add carry.  */
+      if (low + carry < low)
+        high++;
+      low += carry;
+    }
+
+    if (add) {
+      /* And now DST[i], and store the new low part there.  */
+      if (low + dst[i] < low)
+        high++;
+      dst[i] += low;
+    } else
+      dst[i] = low;
+
+    carry = high;
+  }
+
+  if (i < dstParts) {
+    /* Full multiplication, there is no overflow.  */
+    assert(i + 1 == dstParts);
+    dst[i] = carry;
+    return 0;
+  } else {
+    /* We overflowed if there is carry.  */
+    if (carry)
+      return 1;
+
+    /* We would overflow if any significant unwritten parts would be
+       non-zero.  This is true if any remaining src parts are non-zero
+       and the multiplier is non-zero.  */
+    if (multiplier)
+      for (; i < srcParts; i++)
+        if (src[i])
+          return 1;
+
+    /* We fitted in the narrow destination.  */
+    return 0;
+  }
+}
+
+/* DST = LHS * RHS, where DST has the same width as the operands and
+   is filled with the least significant parts of the result.  Returns
+   one if overflow occurred, otherwise zero.  DST must be disjoint
+   from both operands.  */
+int
+APInt::tcMultiply(integerPart *dst, const integerPart *lhs,
+                  const integerPart *rhs, unsigned int parts)
+{
+  unsigned int i;
+  int overflow;
+
+  assert(dst != lhs && dst != rhs);
+
+  overflow = 0;
+  tcSet(dst, 0, parts);
+
+  for (i = 0; i < parts; i++)
+    overflow |= tcMultiplyPart(&dst[i], lhs, rhs[i], 0, parts,
+                               parts - i, true);
+
+  return overflow;
+}
+
+/* DST = LHS * RHS, where DST has width the sum of the widths of the
+   operands.  No overflow occurs.  DST must be disjoint from both
+   operands.  Returns the number of parts required to hold the
+   result.  */
+unsigned int
+APInt::tcFullMultiply(integerPart *dst, const integerPart *lhs,
+                      const integerPart *rhs, unsigned int lhsParts,
+                      unsigned int rhsParts)
+{
+  /* Put the narrower number on the LHS for less loops below.  */
+  if (lhsParts > rhsParts) {
+    return tcFullMultiply (dst, rhs, lhs, rhsParts, lhsParts);
+  } else {
+    unsigned int n;
+
+    assert(dst != lhs && dst != rhs);
+
+    tcSet(dst, 0, rhsParts);
+
+    for (n = 0; n < lhsParts; n++)
+      tcMultiplyPart(&dst[n], rhs, lhs[n], 0, rhsParts, rhsParts + 1, true);
+
+    n = lhsParts + rhsParts;
+
+    return n - (dst[n - 1] == 0);
+  }
+}
+
+/* If RHS is zero LHS and REMAINDER are left unchanged, return one.
+   Otherwise set LHS to LHS / RHS with the fractional part discarded,
+   set REMAINDER to the remainder, return zero.  i.e.
+
+   OLD_LHS = RHS * LHS + REMAINDER
+
+   SCRATCH is a bignum of the same size as the operands and result for
+   use by the routine; its contents need not be initialized and are
+   destroyed.  LHS, REMAINDER and SCRATCH must be distinct.
+*/
+int
+APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
+                integerPart *remainder, integerPart *srhs,
+                unsigned int parts)
+{
+  unsigned int n, shiftCount;
+  integerPart mask;
+
+  assert(lhs != remainder && lhs != srhs && remainder != srhs);
+
+  shiftCount = tcMSB(rhs, parts) + 1;
+  if (shiftCount == 0)
+    return true;
+
+  shiftCount = parts * integerPartWidth - shiftCount;
+  n = shiftCount / integerPartWidth;
+  mask = (integerPart) 1 << (shiftCount % integerPartWidth);
+
+  tcAssign(srhs, rhs, parts);
+  tcShiftLeft(srhs, parts, shiftCount);
+  tcAssign(remainder, lhs, parts);
+  tcSet(lhs, 0, parts);
+
+  /* Loop, subtracting SRHS if REMAINDER is greater and adding that to
+     the total.  */
+  for (;;) {
+      int compare;
+
+      compare = tcCompare(remainder, srhs, parts);
+      if (compare >= 0) {
+        tcSubtract(remainder, srhs, 0, parts);
+        lhs[n] |= mask;
+      }
+
+      if (shiftCount == 0)
+        break;
+      shiftCount--;
+      tcShiftRight(srhs, parts, 1);
+      if ((mask >>= 1) == 0)
+        mask = (integerPart) 1 << (integerPartWidth - 1), n--;
+  }
+
+  return false;
+}
+
+/* Shift a bignum left COUNT bits in-place.  Shifted in bits are zero.
+   There are no restrictions on COUNT.  */
+void
+APInt::tcShiftLeft(integerPart *dst, unsigned int parts, unsigned int count)
+{
+  if (count) {
+    unsigned int jump, shift;
+
+    /* Jump is the inter-part jump; shift is is intra-part shift.  */
+    jump = count / integerPartWidth;
+    shift = count % integerPartWidth;
+
+    while (parts > jump) {
+      integerPart part;
+
+      parts--;
+
+      /* dst[i] comes from the two parts src[i - jump] and, if we have
+         an intra-part shift, src[i - jump - 1].  */
+      part = dst[parts - jump];
+      if (shift) {
+        part <<= shift;
+        if (parts >= jump + 1)
+          part |= dst[parts - jump - 1] >> (integerPartWidth - shift);
+      }
+
+      dst[parts] = part;
+    }
+
+    while (parts > 0)
+      dst[--parts] = 0;
+  }
+}
+
+/* Shift a bignum right COUNT bits in-place.  Shifted in bits are
+   zero.  There are no restrictions on COUNT.  */
+void
+APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count)
+{
+  if (count) {
+    unsigned int i, jump, shift;
+
+    /* Jump is the inter-part jump; shift is is intra-part shift.  */
+    jump = count / integerPartWidth;
+    shift = count % integerPartWidth;
+
+    /* Perform the shift.  This leaves the most significant COUNT bits
+       of the result at zero.  */
+    for (i = 0; i < parts; i++) {
+      integerPart part;
+
+      if (i + jump >= parts) {
+        part = 0;
+      } else {
+        part = dst[i + jump];
+        if (shift) {
+          part >>= shift;
+          if (i + jump + 1 < parts)
+            part |= dst[i + jump + 1] << (integerPartWidth - shift);
+        }
+      }
+
+      dst[i] = part;
+    }
+  }
+}
+
+/* Bitwise and of two bignums.  */
+void
+APInt::tcAnd(integerPart *dst, const integerPart *rhs, unsigned int parts)
+{
+  unsigned int i;
+
+  for (i = 0; i < parts; i++)
+    dst[i] &= rhs[i];
+}
+
+/* Bitwise inclusive or of two bignums.  */
+void
+APInt::tcOr(integerPart *dst, const integerPart *rhs, unsigned int parts)
+{
+  unsigned int i;
+
+  for (i = 0; i < parts; i++)
+    dst[i] |= rhs[i];
+}
+
+/* Bitwise exclusive or of two bignums.  */
+void
+APInt::tcXor(integerPart *dst, const integerPart *rhs, unsigned int parts)
+{
+  unsigned int i;
+
+  for (i = 0; i < parts; i++)
+    dst[i] ^= rhs[i];
+}
+
+/* Complement a bignum in-place.  */
+void
+APInt::tcComplement(integerPart *dst, unsigned int parts)
+{
+  unsigned int i;
+
+  for (i = 0; i < parts; i++)
+    dst[i] = ~dst[i];
+}
+
+/* Comparison (unsigned) of two bignums.  */
+int
+APInt::tcCompare(const integerPart *lhs, const integerPart *rhs,
+                 unsigned int parts)
+{
+  while (parts) {
+      parts--;
+      if (lhs[parts] == rhs[parts])
+        continue;
+
+      if (lhs[parts] > rhs[parts])
+        return 1;
+      else
+        return -1;
+    }
+
+  return 0;
+}
+
+/* Increment a bignum in-place, return the carry flag.  */
+integerPart
+APInt::tcIncrement(integerPart *dst, unsigned int parts)
+{
+  unsigned int i;
+
+  for (i = 0; i < parts; i++)
+    if (++dst[i] != 0)
+      break;
+
+  return i == parts;
+}
+
+/* Set the least significant BITS bits of a bignum, clear the
+   rest.  */
+void
+APInt::tcSetLeastSignificantBits(integerPart *dst, unsigned int parts,
+                                 unsigned int bits)
+{
+  unsigned int i;
+
+  i = 0;
+  while (bits > integerPartWidth) {
+    dst[i++] = ~(integerPart) 0;
+    bits -= integerPartWidth;
+  }
+
+  if (bits)
+    dst[i++] = ~(integerPart) 0 >> (integerPartWidth - bits);
+
+  while (i < parts)
+    dst[i++] = 0;
+}
diff --git a/contrib/llvm/lib/Support/APSInt.cpp b/contrib/llvm/lib/Support/APSInt.cpp
new file mode 100644
index 000000000000..73acafa690c7
--- /dev/null
+++ b/contrib/llvm/lib/Support/APSInt.cpp
@@ -0,0 +1,23 @@
+//===-- llvm/ADT/APSInt.cpp - Arbitrary Precision Signed Int ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the APSInt class, which is a simple class that
+// represents an arbitrary sized integer that knows its signedness.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/FoldingSet.h"
+
+using namespace llvm;
+
+void APSInt::Profile(FoldingSetNodeID& ID) const {
+  ID.AddInteger((unsigned) (IsUnsigned ? 1 : 0));
+  APInt::Profile(ID);
+}
diff --git a/contrib/llvm/lib/Support/Allocator.cpp b/contrib/llvm/lib/Support/Allocator.cpp
new file mode 100644
index 000000000000..3c4191b805a3
--- /dev/null
+++ b/contrib/llvm/lib/Support/Allocator.cpp
@@ -0,0 +1,196 @@
+//===--- Allocator.cpp - Simple memory allocation abstraction -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BumpPtrAllocator interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/Recycler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+
+namespace llvm {
+
+BumpPtrAllocator::BumpPtrAllocator(size_t size, size_t threshold,
+                                   SlabAllocator &allocator)
+    : SlabSize(size), SizeThreshold(std::min(size, threshold)),
+      Allocator(allocator), CurSlab(0), BytesAllocated(0) { }
+
+BumpPtrAllocator::~BumpPtrAllocator() {
+  DeallocateSlabs(CurSlab);
+}
+
+/// AlignPtr - Align Ptr to Alignment bytes, rounding up.  Alignment should
+/// be a power of two.  This method rounds up, so AlignPtr(7, 4) == 8 and
+/// AlignPtr(8, 4) == 8.
+char *BumpPtrAllocator::AlignPtr(char *Ptr, size_t Alignment) {
+  assert(Alignment && (Alignment & (Alignment - 1)) == 0 &&
+         "Alignment is not a power of two!");
+
+  // Do the alignment.
+  return (char*)(((uintptr_t)Ptr + Alignment - 1) &
+                 ~(uintptr_t)(Alignment - 1));
+}
+
+/// StartNewSlab - Allocate a new slab and move the bump pointers over into
+/// the new slab.  Modifies CurPtr and End.
+void BumpPtrAllocator::StartNewSlab() {
+  // If we allocated a big number of slabs already it's likely that we're going
+  // to allocate more. Increase slab size to reduce mallocs and possibly memory
+  // overhead. The factors are chosen conservatively to avoid overallocation.
+  if (BytesAllocated >= SlabSize * 128)
+    SlabSize *= 2;
+
+  MemSlab *NewSlab = Allocator.Allocate(SlabSize);
+  NewSlab->NextPtr = CurSlab;
+  CurSlab = NewSlab;
+  CurPtr = (char*)(CurSlab + 1);
+  End = ((char*)CurSlab) + CurSlab->Size;
+}
+
+/// DeallocateSlabs - Deallocate all memory slabs after and including this
+/// one.
+void BumpPtrAllocator::DeallocateSlabs(MemSlab *Slab) {
+  while (Slab) {
+    MemSlab *NextSlab = Slab->NextPtr;
+#ifndef NDEBUG
+    // Poison the memory so stale pointers crash sooner.  Note we must
+    // preserve the Size and NextPtr fields at the beginning.
+    sys::Memory::setRangeWritable(Slab + 1, Slab->Size - sizeof(MemSlab));
+    memset(Slab + 1, 0xCD, Slab->Size - sizeof(MemSlab));
+#endif
+    Allocator.Deallocate(Slab);
+    Slab = NextSlab;
+  }
+}
+
+/// Reset - Deallocate all but the current slab and reset the current pointer
+/// to the beginning of it, freeing all memory allocated so far.
+void BumpPtrAllocator::Reset() {
+  if (!CurSlab)
+    return;
+  DeallocateSlabs(CurSlab->NextPtr);
+  CurSlab->NextPtr = 0;
+  CurPtr = (char*)(CurSlab + 1);
+  End = ((char*)CurSlab) + CurSlab->Size;
+  BytesAllocated = 0;
+}
+
+/// Allocate - Allocate space at the specified alignment.
+///
+void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) {
+  if (!CurSlab) // Start a new slab if we haven't allocated one already.
+    StartNewSlab();
+
+  // Keep track of how many bytes we've allocated.
+  BytesAllocated += Size;
+
+  // 0-byte alignment means 1-byte alignment.
+  if (Alignment == 0) Alignment = 1;
+
+  // Allocate the aligned space, going forwards from CurPtr.
+  char *Ptr = AlignPtr(CurPtr, Alignment);
+
+  // Check if we can hold it.
+  if (Ptr + Size <= End) {
+    CurPtr = Ptr + Size;
+    // Update the allocation point of this memory block in MemorySanitizer.
+    // Without this, MemorySanitizer messages for values originated from here
+    // will point to the allocation of the entire slab.
+    __msan_allocated_memory(Ptr, Size);
+    return Ptr;
+  }
+
+  // If Size is really big, allocate a separate slab for it.
+  size_t PaddedSize = Size + sizeof(MemSlab) + Alignment - 1;
+  if (PaddedSize > SizeThreshold) {
+    MemSlab *NewSlab = Allocator.Allocate(PaddedSize);
+
+    // Put the new slab after the current slab, since we are not allocating
+    // into it.
+    NewSlab->NextPtr = CurSlab->NextPtr;
+    CurSlab->NextPtr = NewSlab;
+
+    Ptr = AlignPtr((char*)(NewSlab + 1), Alignment);
+    assert((uintptr_t)Ptr + Size <= (uintptr_t)NewSlab + NewSlab->Size);
+    __msan_allocated_memory(Ptr, Size);
+    return Ptr;
+  }
+
+  // Otherwise, start a new slab and try again.
+  StartNewSlab();
+  Ptr = AlignPtr(CurPtr, Alignment);
+  CurPtr = Ptr + Size;
+  assert(CurPtr <= End && "Unable to allocate memory!");
+  __msan_allocated_memory(Ptr, Size);
+  return Ptr;
+}
+
+unsigned BumpPtrAllocator::GetNumSlabs() const {
+  unsigned NumSlabs = 0;
+  for (MemSlab *Slab = CurSlab; Slab != 0; Slab = Slab->NextPtr) {
+    ++NumSlabs;
+  }
+  return NumSlabs;
+}
+
+size_t BumpPtrAllocator::getTotalMemory() const {
+  size_t TotalMemory = 0;
+  for (MemSlab *Slab = CurSlab; Slab != 0; Slab = Slab->NextPtr) {
+    TotalMemory += Slab->Size;
+  }
+  return TotalMemory;
+}
+  
+void BumpPtrAllocator::PrintStats() const {
+  unsigned NumSlabs = 0;
+  size_t TotalMemory = 0;
+  for (MemSlab *Slab = CurSlab; Slab != 0; Slab = Slab->NextPtr) {
+    TotalMemory += Slab->Size;
+    ++NumSlabs;
+  }
+
+  errs() << "\nNumber of memory regions: " << NumSlabs << '\n'
+         << "Bytes used: " << BytesAllocated << '\n'
+         << "Bytes allocated: " << TotalMemory << '\n'
+         << "Bytes wasted: " << (TotalMemory - BytesAllocated)
+         << " (includes alignment, etc)\n";
+}
+
+MallocSlabAllocator BumpPtrAllocator::DefaultSlabAllocator =
+  MallocSlabAllocator();
+
+SlabAllocator::~SlabAllocator() { }
+
+MallocSlabAllocator::~MallocSlabAllocator() { }
+
+MemSlab *MallocSlabAllocator::Allocate(size_t Size) {
+  MemSlab *Slab = (MemSlab*)Allocator.Allocate(Size, 0);
+  Slab->Size = Size;
+  Slab->NextPtr = 0;
+  return Slab;
+}
+
+void MallocSlabAllocator::Deallocate(MemSlab *Slab) {
+  Allocator.Deallocate(Slab);
+}
+
+void PrintRecyclerStats(size_t Size,
+                        size_t Align,
+                        size_t FreeListSize) {
+  errs() << "Recycler element size: " << Size << '\n'
+         << "Recycler element alignment: " << Align << '\n'
+         << "Number of elements free for recycling: " << FreeListSize << '\n';
+}
+
+}
diff --git a/contrib/llvm/lib/Support/Atomic.cpp b/contrib/llvm/lib/Support/Atomic.cpp
new file mode 100644
index 000000000000..9559ad729570
--- /dev/null
+++ b/contrib/llvm/lib/Support/Atomic.cpp
@@ -0,0 +1,116 @@
+//===-- Atomic.cpp - Atomic Operations --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This header file implements atomic operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Atomic.h"
+#include "llvm/Config/llvm-config.h"
+
+using namespace llvm;
+
+#if defined(_MSC_VER)
+#include <windows.h>
+#undef MemoryFence
+#endif
+
+#if defined(__GNUC__) || (defined(__IBMCPP__) && __IBMCPP__ >= 1210)
+#define GNU_ATOMICS
+#endif
+
+void sys::MemoryFence() {
+#if LLVM_HAS_ATOMICS == 0
+  return;
+#else
+#  if defined(GNU_ATOMICS)
+  __sync_synchronize();
+#  elif defined(_MSC_VER)
+  MemoryBarrier();
+#  else
+# error No memory fence implementation for your platform!
+#  endif
+#endif
+}
+
+sys::cas_flag sys::CompareAndSwap(volatile sys::cas_flag* ptr,
+                                  sys::cas_flag new_value,
+                                  sys::cas_flag old_value) {
+#if LLVM_HAS_ATOMICS == 0
+  sys::cas_flag result = *ptr;
+  if (result == old_value)
+    *ptr = new_value;
+  return result;
+#elif defined(GNU_ATOMICS)
+  return __sync_val_compare_and_swap(ptr, old_value, new_value);
+#elif defined(_MSC_VER)
+  return InterlockedCompareExchange(ptr, new_value, old_value);
+#else
+#  error No compare-and-swap implementation for your platform!
+#endif
+}
+
+sys::cas_flag sys::AtomicIncrement(volatile sys::cas_flag* ptr) {
+#if LLVM_HAS_ATOMICS == 0
+  ++(*ptr);
+  return *ptr;
+#elif defined(GNU_ATOMICS)
+  return __sync_add_and_fetch(ptr, 1);
+#elif defined(_MSC_VER)
+  return InterlockedIncrement(ptr);
+#else
+#  error No atomic increment implementation for your platform!
+#endif
+}
+
+sys::cas_flag sys::AtomicDecrement(volatile sys::cas_flag* ptr) {
+#if LLVM_HAS_ATOMICS == 0
+  --(*ptr);
+  return *ptr;
+#elif defined(GNU_ATOMICS)
+  return __sync_sub_and_fetch(ptr, 1);
+#elif defined(_MSC_VER)
+  return InterlockedDecrement(ptr);
+#else
+#  error No atomic decrement implementation for your platform!
+#endif
+}
+
+sys::cas_flag sys::AtomicAdd(volatile sys::cas_flag* ptr, sys::cas_flag val) {
+#if LLVM_HAS_ATOMICS == 0
+  *ptr += val;
+  return *ptr;
+#elif defined(GNU_ATOMICS)
+  return __sync_add_and_fetch(ptr, val);
+#elif defined(_MSC_VER)
+  return InterlockedExchangeAdd(ptr, val) + val;
+#else
+#  error No atomic add implementation for your platform!
+#endif
+}
+
+sys::cas_flag sys::AtomicMul(volatile sys::cas_flag* ptr, sys::cas_flag val) {
+  sys::cas_flag original, result;
+  do {
+    original = *ptr;
+    result = original * val;
+  } while (sys::CompareAndSwap(ptr, result, original) != original);
+
+  return result;
+}
+
+sys::cas_flag sys::AtomicDiv(volatile sys::cas_flag* ptr, sys::cas_flag val) {
+  sys::cas_flag original, result;
+  do {
+    original = *ptr;
+    result = original / val;
+  } while (sys::CompareAndSwap(ptr, result, original) != original);
+
+  return result;
+}
diff --git a/contrib/llvm/lib/Support/BlockFrequency.cpp b/contrib/llvm/lib/Support/BlockFrequency.cpp
new file mode 100644
index 000000000000..84a993e3e5b6
--- /dev/null
+++ b/contrib/llvm/lib/Support/BlockFrequency.cpp
@@ -0,0 +1,130 @@
+//====--------------- lib/Support/BlockFrequency.cpp -----------*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Block Frequency class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+/// mult96bit - Multiply FREQ by N and store result in W array.
+void mult96bit(uint64_t freq, uint32_t N, uint64_t W[2]) {
+  uint64_t u0 = freq & UINT32_MAX;
+  uint64_t u1 = freq >> 32;
+
+  // Represent 96-bit value as w[2]:w[1]:w[0];
+  uint32_t w[3] = { 0, 0, 0 };
+
+  uint64_t t = u0 * N;
+  uint64_t k = t >> 32;
+  w[0] = t;
+  t = u1 * N + k;
+  w[1] = t;
+  w[2] = t >> 32;
+
+  // W[1] - higher bits.
+  // W[0] - lower bits.
+  W[0] = w[0] + ((uint64_t) w[1] << 32);
+  W[1] = w[2];
+}
+
+
+/// div96bit - Divide 96-bit value stored in W array by D. Return 64-bit frequency.
+uint64_t div96bit(uint64_t W[2], uint32_t D) {
+  uint64_t y = W[0];
+  uint64_t x = W[1];
+  int i;
+
+  for (i = 1; i <= 64 && x; ++i) {
+    uint32_t t = (int)x >> 31;
+    x = (x << 1) | (y >> 63);
+    y = y << 1;
+    if ((x | t) >= D) {
+      x -= D;
+      ++y;
+    }
+  }
+
+  return y << (64 - i + 1);
+}
+
+}
+
+
+BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) {
+  uint32_t n = Prob.getNumerator();
+  uint32_t d = Prob.getDenominator();
+
+  assert(n <= d && "Probability must be less or equal to 1.");
+
+  // Calculate Frequency * n.
+  uint64_t mulLo = (Frequency & UINT32_MAX) * n;
+  uint64_t mulHi = (Frequency >> 32) * n;
+  uint64_t mulRes = (mulHi << 32) + mulLo;
+
+  // If there was overflow use 96-bit operations.
+  if (mulHi > UINT32_MAX || mulRes < mulLo) {
+    // 96-bit value represented as W[1]:W[0].
+    uint64_t W[2];
+
+    // Probability is less or equal to 1 which means that results must fit
+    // 64-bit.
+    mult96bit(Frequency, n, W);
+    Frequency = div96bit(W, d);
+    return *this;
+  }
+
+  Frequency = mulRes / d;
+  return *this;
+}
+
+const BlockFrequency
+BlockFrequency::operator*(const BranchProbability &Prob) const {
+  BlockFrequency Freq(Frequency);
+  Freq *= Prob;
+  return Freq;
+}
+
+BlockFrequency &BlockFrequency::operator+=(const BlockFrequency &Freq) {
+  uint64_t Before = Freq.Frequency;
+  Frequency += Freq.Frequency;
+
+  // If overflow, set frequency to the maximum value.
+  if (Frequency < Before)
+    Frequency = UINT64_MAX;
+
+  return *this;
+}
+
+const BlockFrequency
+BlockFrequency::operator+(const BlockFrequency &Prob) const {
+  BlockFrequency Freq(Frequency);
+  Freq += Prob;
+  return Freq;
+}
+
+void BlockFrequency::print(raw_ostream &OS) const {
+  OS << Frequency;
+}
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const BlockFrequency &Freq) {
+  Freq.print(OS);
+  return OS;
+}
+
+}
diff --git a/contrib/llvm/lib/Support/BranchProbability.cpp b/contrib/llvm/lib/Support/BranchProbability.cpp
new file mode 100644
index 000000000000..e8b83e59802d
--- /dev/null
+++ b/contrib/llvm/lib/Support/BranchProbability.cpp
@@ -0,0 +1,36 @@
+//===-------------- lib/Support/BranchProbability.cpp -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Branch Probability class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void BranchProbability::print(raw_ostream &OS) const {
+  OS << N << " / " << D << " = " << format("%g%%", ((double)N / D) * 100.0);
+}
+
+void BranchProbability::dump() const {
+  dbgs() << *this << '\n';
+}
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const BranchProbability &Prob) {
+  Prob.print(OS);
+  return OS;
+}
+
+}
diff --git a/contrib/llvm/lib/Support/COPYRIGHT.regex b/contrib/llvm/lib/Support/COPYRIGHT.regex
new file mode 100644
index 000000000000..a6392fd37c3d
--- /dev/null
+++ b/contrib/llvm/lib/Support/COPYRIGHT.regex
@@ -0,0 +1,54 @@
+$OpenBSD: COPYRIGHT,v 1.3 2003/06/02 20:18:36 millert Exp $
+
+Copyright 1992, 1993, 1994 Henry Spencer.  All rights reserved.
+This software is not subject to any license of the American Telephone
+and Telegraph Company or of the Regents of the University of California.
+
+Permission is granted to anyone to use this software for any purpose on
+any computer system, and to alter it and redistribute it, subject
+to the following restrictions:
+
+1. The author is not responsible for the consequences of use of this
+   software, no matter how awful, even if they arise from flaws in it.
+
+2. The origin of this software must not be misrepresented, either by
+   explicit claim or by omission.  Since few users ever read sources,
+   credits must appear in the documentation.
+
+3. Altered versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.  Since few users
+   ever read sources, credits must appear in the documentation.
+
+4. This notice may not be removed or altered.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+/*-
+ * Copyright (c) 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)COPYRIGHT	8.1 (Berkeley) 3/16/94
+ */
diff --git a/contrib/llvm/lib/Support/CommandLine.cpp b/contrib/llvm/lib/Support/CommandLine.cpp
new file mode 100644
index 000000000000..560d7eb289c6
--- /dev/null
+++ b/contrib/llvm/lib/Support/CommandLine.cpp
@@ -0,0 +1,1420 @@
+//===-- CommandLine.cpp - Command line parser implementation --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements a command line argument processor that is useful when
+// creating a tool.  It provides a simple, minimalistic interface that is easily
+// extensible and supports nonlocal (library) command line options.
+//
+// Note that rather than trying to figure out what this code does, you could try
+// reading the library documentation located in docs/CommandLine.html
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <cerrno>
+#include <cstdlib>
+using namespace llvm;
+using namespace cl;
+
+//===----------------------------------------------------------------------===//
+// Template instantiations and anchors.
+//
+namespace llvm { namespace cl {
+TEMPLATE_INSTANTIATION(class basic_parser<bool>);
+TEMPLATE_INSTANTIATION(class basic_parser<boolOrDefault>);
+TEMPLATE_INSTANTIATION(class basic_parser<int>);
+TEMPLATE_INSTANTIATION(class basic_parser<unsigned>);
+TEMPLATE_INSTANTIATION(class basic_parser<unsigned long long>);
+TEMPLATE_INSTANTIATION(class basic_parser<double>);
+TEMPLATE_INSTANTIATION(class basic_parser<float>);
+TEMPLATE_INSTANTIATION(class basic_parser<std::string>);
+TEMPLATE_INSTANTIATION(class basic_parser<char>);
+
+TEMPLATE_INSTANTIATION(class opt<unsigned>);
+TEMPLATE_INSTANTIATION(class opt<int>);
+TEMPLATE_INSTANTIATION(class opt<std::string>);
+TEMPLATE_INSTANTIATION(class opt<char>);
+TEMPLATE_INSTANTIATION(class opt<bool>);
+} } // end namespace llvm::cl
+
+void GenericOptionValue::anchor() {}
+void OptionValue<boolOrDefault>::anchor() {}
+void OptionValue<std::string>::anchor() {}
+void Option::anchor() {}
+void basic_parser_impl::anchor() {}
+void parser<bool>::anchor() {}
+void parser<boolOrDefault>::anchor() {}
+void parser<int>::anchor() {}
+void parser<unsigned>::anchor() {}
+void parser<unsigned long long>::anchor() {}
+void parser<double>::anchor() {}
+void parser<float>::anchor() {}
+void parser<std::string>::anchor() {}
+void parser<char>::anchor() {}
+
+//===----------------------------------------------------------------------===//
+
+// Globals for name and overview of program.  Program name is not a string to
+// avoid static ctor/dtor issues.
+static char ProgramName[80] = "<premain>";
+static const char *ProgramOverview = 0;
+
+// This collects additional help to be printed.
+static ManagedStatic<std::vector<const char*> > MoreHelp;
+
+extrahelp::extrahelp(const char *Help)
+  : morehelp(Help) {
+  MoreHelp->push_back(Help);
+}
+
+static bool OptionListChanged = false;
+
+// MarkOptionsChanged - Internal helper function.
+void cl::MarkOptionsChanged() {
+  OptionListChanged = true;
+}
+
+/// RegisteredOptionList - This is the list of the command line options that
+/// have statically constructed themselves.
+static Option *RegisteredOptionList = 0;
+
+void Option::addArgument() {
+  assert(NextRegistered == 0 && "argument multiply registered!");
+
+  NextRegistered = RegisteredOptionList;
+  RegisteredOptionList = this;
+  MarkOptionsChanged();
+}
+
+
+//===----------------------------------------------------------------------===//
+// Basic, shared command line option processing machinery.
+//
+
+/// GetOptionInfo - Scan the list of registered options, turning them into data
+/// structures that are easier to handle.
+static void GetOptionInfo(SmallVectorImpl<Option*> &PositionalOpts,
+                          SmallVectorImpl<Option*> &SinkOpts,
+                          StringMap<Option*> &OptionsMap) {
+  SmallVector<const char*, 16> OptionNames;
+  Option *CAOpt = 0;  // The ConsumeAfter option if it exists.
+  for (Option *O = RegisteredOptionList; O; O = O->getNextRegisteredOption()) {
+    // If this option wants to handle multiple option names, get the full set.
+    // This handles enum options like "-O1 -O2" etc.
+    O->getExtraOptionNames(OptionNames);
+    if (O->ArgStr[0])
+      OptionNames.push_back(O->ArgStr);
+
+    // Handle named options.
+    for (size_t i = 0, e = OptionNames.size(); i != e; ++i) {
+      // Add argument to the argument map!
+      if (OptionsMap.GetOrCreateValue(OptionNames[i], O).second != O) {
+        errs() << ProgramName << ": CommandLine Error: Argument '"
+             << OptionNames[i] << "' defined more than once!\n";
+      }
+    }
+
+    OptionNames.clear();
+
+    // Remember information about positional options.
+    if (O->getFormattingFlag() == cl::Positional)
+      PositionalOpts.push_back(O);
+    else if (O->getMiscFlags() & cl::Sink) // Remember sink options
+      SinkOpts.push_back(O);
+    else if (O->getNumOccurrencesFlag() == cl::ConsumeAfter) {
+      if (CAOpt)
+        O->error("Cannot specify more than one option with cl::ConsumeAfter!");
+      CAOpt = O;
+    }
+  }
+
+  if (CAOpt)
+    PositionalOpts.push_back(CAOpt);
+
+  // Make sure that they are in order of registration not backwards.
+  std::reverse(PositionalOpts.begin(), PositionalOpts.end());
+}
+
+
+/// LookupOption - Lookup the option specified by the specified option on the
+/// command line.  If there is a value specified (after an equal sign) return
+/// that as well.  This assumes that leading dashes have already been stripped.
+static Option *LookupOption(StringRef &Arg, StringRef &Value,
+                            const StringMap<Option*> &OptionsMap) {
+  // Reject all dashes.
+  if (Arg.empty()) return 0;
+
+  size_t EqualPos = Arg.find('=');
+
+  // If we have an equals sign, remember the value.
+  if (EqualPos == StringRef::npos) {
+    // Look up the option.
+    StringMap<Option*>::const_iterator I = OptionsMap.find(Arg);
+    return I != OptionsMap.end() ? I->second : 0;
+  }
+
+  // If the argument before the = is a valid option name, we match.  If not,
+  // return Arg unmolested.
+  StringMap<Option*>::const_iterator I =
+    OptionsMap.find(Arg.substr(0, EqualPos));
+  if (I == OptionsMap.end()) return 0;
+
+  Value = Arg.substr(EqualPos+1);
+  Arg = Arg.substr(0, EqualPos);
+  return I->second;
+}
+
+/// LookupNearestOption - Lookup the closest match to the option specified by
+/// the specified option on the command line.  If there is a value specified
+/// (after an equal sign) return that as well.  This assumes that leading dashes
+/// have already been stripped.
+static Option *LookupNearestOption(StringRef Arg,
+                                   const StringMap<Option*> &OptionsMap,
+                                   std::string &NearestString) {
+  // Reject all dashes.
+  if (Arg.empty()) return 0;
+
+  // Split on any equal sign.
+  std::pair<StringRef, StringRef> SplitArg = Arg.split('=');
+  StringRef &LHS = SplitArg.first;  // LHS == Arg when no '=' is present.
+  StringRef &RHS = SplitArg.second;
+
+  // Find the closest match.
+  Option *Best = 0;
+  unsigned BestDistance = 0;
+  for (StringMap<Option*>::const_iterator it = OptionsMap.begin(),
+         ie = OptionsMap.end(); it != ie; ++it) {
+    Option *O = it->second;
+    SmallVector<const char*, 16> OptionNames;
+    O->getExtraOptionNames(OptionNames);
+    if (O->ArgStr[0])
+      OptionNames.push_back(O->ArgStr);
+
+    bool PermitValue = O->getValueExpectedFlag() != cl::ValueDisallowed;
+    StringRef Flag = PermitValue ? LHS : Arg;
+    for (size_t i = 0, e = OptionNames.size(); i != e; ++i) {
+      StringRef Name = OptionNames[i];
+      unsigned Distance = StringRef(Name).edit_distance(
+        Flag, /*AllowReplacements=*/true, /*MaxEditDistance=*/BestDistance);
+      if (!Best || Distance < BestDistance) {
+        Best = O;
+        BestDistance = Distance;
+        if (RHS.empty() || !PermitValue)
+          NearestString = OptionNames[i];
+        else
+          NearestString = std::string(OptionNames[i]) + "=" + RHS.str();
+      }
+    }
+  }
+
+  return Best;
+}
+
+/// CommaSeparateAndAddOccurence - A wrapper around Handler->addOccurence() that
+/// does special handling of cl::CommaSeparated options.
+static bool CommaSeparateAndAddOccurence(Option *Handler, unsigned pos,
+                                         StringRef ArgName,
+                                         StringRef Value, bool MultiArg = false)
+{
+  // Check to see if this option accepts a comma separated list of values.  If
+  // it does, we have to split up the value into multiple values.
+  if (Handler->getMiscFlags() & CommaSeparated) {
+    StringRef Val(Value);
+    StringRef::size_type Pos = Val.find(',');
+
+    while (Pos != StringRef::npos) {
+      // Process the portion before the comma.
+      if (Handler->addOccurrence(pos, ArgName, Val.substr(0, Pos), MultiArg))
+        return true;
+      // Erase the portion before the comma, AND the comma.
+      Val = Val.substr(Pos+1);
+      Value.substr(Pos+1);  // Increment the original value pointer as well.
+      // Check for another comma.
+      Pos = Val.find(',');
+    }
+
+    Value = Val;
+  }
+
+  if (Handler->addOccurrence(pos, ArgName, Value, MultiArg))
+    return true;
+
+  return false;
+}
+
+/// ProvideOption - For Value, this differentiates between an empty value ("")
+/// and a null value (StringRef()).  The later is accepted for arguments that
+/// don't allow a value (-foo) the former is rejected (-foo=).
+static inline bool ProvideOption(Option *Handler, StringRef ArgName,
+                                 StringRef Value, int argc,
+                                 const char *const *argv, int &i) {
+  // Is this a multi-argument option?
+  unsigned NumAdditionalVals = Handler->getNumAdditionalVals();
+
+  // Enforce value requirements
+  switch (Handler->getValueExpectedFlag()) {
+  case ValueRequired:
+    if (Value.data() == 0) {       // No value specified?
+      if (i+1 >= argc)
+        return Handler->error("requires a value!");
+      // Steal the next argument, like for '-o filename'
+      Value = argv[++i];
+    }
+    break;
+  case ValueDisallowed:
+    if (NumAdditionalVals > 0)
+      return Handler->error("multi-valued option specified"
+                            " with ValueDisallowed modifier!");
+
+    if (Value.data())
+      return Handler->error("does not allow a value! '" +
+                            Twine(Value) + "' specified.");
+    break;
+  case ValueOptional:
+    break;
+  }
+
+  // If this isn't a multi-arg option, just run the handler.
+  if (NumAdditionalVals == 0)
+    return CommaSeparateAndAddOccurence(Handler, i, ArgName, Value);
+
+  // If it is, run the handle several times.
+  bool MultiArg = false;
+
+  if (Value.data()) {
+    if (CommaSeparateAndAddOccurence(Handler, i, ArgName, Value, MultiArg))
+      return true;
+    --NumAdditionalVals;
+    MultiArg = true;
+  }
+
+  while (NumAdditionalVals > 0) {
+    if (i+1 >= argc)
+      return Handler->error("not enough values!");
+    Value = argv[++i];
+
+    if (CommaSeparateAndAddOccurence(Handler, i, ArgName, Value, MultiArg))
+      return true;
+    MultiArg = true;
+    --NumAdditionalVals;
+  }
+  return false;
+}
+
+static bool ProvidePositionalOption(Option *Handler, StringRef Arg, int i) {
+  int Dummy = i;
+  return ProvideOption(Handler, Handler->ArgStr, Arg, 0, 0, Dummy);
+}
+
+
+// Option predicates...
+static inline bool isGrouping(const Option *O) {
+  return O->getFormattingFlag() == cl::Grouping;
+}
+static inline bool isPrefixedOrGrouping(const Option *O) {
+  return isGrouping(O) || O->getFormattingFlag() == cl::Prefix;
+}
+
+// getOptionPred - Check to see if there are any options that satisfy the
+// specified predicate with names that are the prefixes in Name.  This is
+// checked by progressively stripping characters off of the name, checking to
+// see if there options that satisfy the predicate.  If we find one, return it,
+// otherwise return null.
+//
+static Option *getOptionPred(StringRef Name, size_t &Length,
+                             bool (*Pred)(const Option*),
+                             const StringMap<Option*> &OptionsMap) {
+
+  StringMap<Option*>::const_iterator OMI = OptionsMap.find(Name);
+
+  // Loop while we haven't found an option and Name still has at least two
+  // characters in it (so that the next iteration will not be the empty
+  // string.
+  while (OMI == OptionsMap.end() && Name.size() > 1) {
+    Name = Name.substr(0, Name.size()-1);   // Chop off the last character.
+    OMI = OptionsMap.find(Name);
+  }
+
+  if (OMI != OptionsMap.end() && Pred(OMI->second)) {
+    Length = Name.size();
+    return OMI->second;    // Found one!
+  }
+  return 0;                // No option found!
+}
+
+/// HandlePrefixedOrGroupedOption - The specified argument string (which started
+/// with at least one '-') does not fully match an available option.  Check to
+/// see if this is a prefix or grouped option.  If so, split arg into output an
+/// Arg/Value pair and return the Option to parse it with.
+static Option *HandlePrefixedOrGroupedOption(StringRef &Arg, StringRef &Value,
+                                             bool &ErrorParsing,
+                                         const StringMap<Option*> &OptionsMap) {
+  if (Arg.size() == 1) return 0;
+
+  // Do the lookup!
+  size_t Length = 0;
+  Option *PGOpt = getOptionPred(Arg, Length, isPrefixedOrGrouping, OptionsMap);
+  if (PGOpt == 0) return 0;
+
+  // If the option is a prefixed option, then the value is simply the
+  // rest of the name...  so fall through to later processing, by
+  // setting up the argument name flags and value fields.
+  if (PGOpt->getFormattingFlag() == cl::Prefix) {
+    Value = Arg.substr(Length);
+    Arg = Arg.substr(0, Length);
+    assert(OptionsMap.count(Arg) && OptionsMap.find(Arg)->second == PGOpt);
+    return PGOpt;
+  }
+
+  // This must be a grouped option... handle them now.  Grouping options can't
+  // have values.
+  assert(isGrouping(PGOpt) && "Broken getOptionPred!");
+
+  do {
+    // Move current arg name out of Arg into OneArgName.
+    StringRef OneArgName = Arg.substr(0, Length);
+    Arg = Arg.substr(Length);
+
+    // Because ValueRequired is an invalid flag for grouped arguments,
+    // we don't need to pass argc/argv in.
+    assert(PGOpt->getValueExpectedFlag() != cl::ValueRequired &&
+           "Option can not be cl::Grouping AND cl::ValueRequired!");
+    int Dummy = 0;
+    ErrorParsing |= ProvideOption(PGOpt, OneArgName,
+                                  StringRef(), 0, 0, Dummy);
+
+    // Get the next grouping option.
+    PGOpt = getOptionPred(Arg, Length, isGrouping, OptionsMap);
+  } while (PGOpt && Length != Arg.size());
+
+  // Return the last option with Arg cut down to just the last one.
+  return PGOpt;
+}
+
+
+
+static bool RequiresValue(const Option *O) {
+  return O->getNumOccurrencesFlag() == cl::Required ||
+         O->getNumOccurrencesFlag() == cl::OneOrMore;
+}
+
+static bool EatsUnboundedNumberOfValues(const Option *O) {
+  return O->getNumOccurrencesFlag() == cl::ZeroOrMore ||
+         O->getNumOccurrencesFlag() == cl::OneOrMore;
+}
+
+/// ParseCStringVector - Break INPUT up wherever one or more
+/// whitespace characters are found, and store the resulting tokens in
+/// OUTPUT. The tokens stored in OUTPUT are dynamically allocated
+/// using strdup(), so it is the caller's responsibility to free()
+/// them later.
+///
+static void ParseCStringVector(std::vector<char *> &OutputVector,
+                               const char *Input) {
+  // Characters which will be treated as token separators:
+  StringRef Delims = " \v\f\t\r\n";
+
+  StringRef WorkStr(Input);
+  while (!WorkStr.empty()) {
+    // If the first character is a delimiter, strip them off.
+    if (Delims.find(WorkStr[0]) != StringRef::npos) {
+      size_t Pos = WorkStr.find_first_not_of(Delims);
+      if (Pos == StringRef::npos) Pos = WorkStr.size();
+      WorkStr = WorkStr.substr(Pos);
+      continue;
+    }
+
+    // Find position of first delimiter.
+    size_t Pos = WorkStr.find_first_of(Delims);
+    if (Pos == StringRef::npos) Pos = WorkStr.size();
+
+    // Everything from 0 to Pos is the next word to copy.
+    char *NewStr = (char*)malloc(Pos+1);
+    memcpy(NewStr, WorkStr.data(), Pos);
+    NewStr[Pos] = 0;
+    OutputVector.push_back(NewStr);
+
+    WorkStr = WorkStr.substr(Pos);
+  }
+}
+
+/// ParseEnvironmentOptions - An alternative entry point to the
+/// CommandLine library, which allows you to read the program's name
+/// from the caller (as PROGNAME) and its command-line arguments from
+/// an environment variable (whose name is given in ENVVAR).
+///
+void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
+                                 const char *Overview) {
+  // Check args.
+  assert(progName && "Program name not specified");
+  assert(envVar && "Environment variable name missing");
+
+  // Get the environment variable they want us to parse options out of.
+  const char *envValue = getenv(envVar);
+  if (!envValue)
+    return;
+
+  // Get program's "name", which we wouldn't know without the caller
+  // telling us.
+  std::vector<char*> newArgv;
+  newArgv.push_back(strdup(progName));
+
+  // Parse the value of the environment variable into a "command line"
+  // and hand it off to ParseCommandLineOptions().
+  ParseCStringVector(newArgv, envValue);
+  int newArgc = static_cast<int>(newArgv.size());
+  ParseCommandLineOptions(newArgc, &newArgv[0], Overview);
+
+  // Free all the strdup()ed strings.
+  for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end();
+       i != e; ++i)
+    free(*i);
+}
+
+
+/// ExpandResponseFiles - Copy the contents of argv into newArgv,
+/// substituting the contents of the response files for the arguments
+/// of type @file.
+static void ExpandResponseFiles(unsigned argc, const char*const* argv,
+                                std::vector<char*>& newArgv) {
+  for (unsigned i = 1; i != argc; ++i) {
+    const char *arg = argv[i];
+
+    if (arg[0] == '@') {
+      sys::PathWithStatus respFile(++arg);
+
+      // Check that the response file is not empty (mmap'ing empty
+      // files can be problematic).
+      const sys::FileStatus *FileStat = respFile.getFileStatus();
+      if (FileStat && FileStat->getSize() != 0) {
+
+        // If we could open the file, parse its contents, otherwise
+        // pass the @file option verbatim.
+
+        // TODO: we should also support recursive loading of response files,
+        // since this is how gcc behaves. (From their man page: "The file may
+        // itself contain additional @file options; any such options will be
+        // processed recursively.")
+
+        // Mmap the response file into memory.
+        OwningPtr<MemoryBuffer> respFilePtr;
+        if (!MemoryBuffer::getFile(respFile.c_str(), respFilePtr)) {
+          ParseCStringVector(newArgv, respFilePtr->getBufferStart());
+          continue;
+        }
+      }
+    }
+    newArgv.push_back(strdup(arg));
+  }
+}
+
+void cl::ParseCommandLineOptions(int argc, const char * const *argv,
+                                 const char *Overview) {
+  // Process all registered options.
+  SmallVector<Option*, 4> PositionalOpts;
+  SmallVector<Option*, 4> SinkOpts;
+  StringMap<Option*> Opts;
+  GetOptionInfo(PositionalOpts, SinkOpts, Opts);
+
+  assert((!Opts.empty() || !PositionalOpts.empty()) &&
+         "No options specified!");
+
+  // Expand response files.
+  std::vector<char*> newArgv;
+  newArgv.push_back(strdup(argv[0]));
+  ExpandResponseFiles(argc, argv, newArgv);
+  argv = &newArgv[0];
+  argc = static_cast<int>(newArgv.size());
+
+  // Copy the program name into ProgName, making sure not to overflow it.
+  std::string ProgName = sys::path::filename(argv[0]);
+  size_t Len = std::min(ProgName.size(), size_t(79));
+  memcpy(ProgramName, ProgName.data(), Len);
+  ProgramName[Len] = '\0';
+
+  ProgramOverview = Overview;
+  bool ErrorParsing = false;
+
+  // Check out the positional arguments to collect information about them.
+  unsigned NumPositionalRequired = 0;
+
+  // Determine whether or not there are an unlimited number of positionals
+  bool HasUnlimitedPositionals = false;
+
+  Option *ConsumeAfterOpt = 0;
+  if (!PositionalOpts.empty()) {
+    if (PositionalOpts[0]->getNumOccurrencesFlag() == cl::ConsumeAfter) {
+      assert(PositionalOpts.size() > 1 &&
+             "Cannot specify cl::ConsumeAfter without a positional argument!");
+      ConsumeAfterOpt = PositionalOpts[0];
+    }
+
+    // Calculate how many positional values are _required_.
+    bool UnboundedFound = false;
+    for (size_t i = ConsumeAfterOpt != 0, e = PositionalOpts.size();
+         i != e; ++i) {
+      Option *Opt = PositionalOpts[i];
+      if (RequiresValue(Opt))
+        ++NumPositionalRequired;
+      else if (ConsumeAfterOpt) {
+        // ConsumeAfter cannot be combined with "optional" positional options
+        // unless there is only one positional argument...
+        if (PositionalOpts.size() > 2)
+          ErrorParsing |=
+            Opt->error("error - this positional option will never be matched, "
+                       "because it does not Require a value, and a "
+                       "cl::ConsumeAfter option is active!");
+      } else if (UnboundedFound && !Opt->ArgStr[0]) {
+        // This option does not "require" a value...  Make sure this option is
+        // not specified after an option that eats all extra arguments, or this
+        // one will never get any!
+        //
+        ErrorParsing |= Opt->error("error - option can never match, because "
+                                   "another positional argument will match an "
+                                   "unbounded number of values, and this option"
+                                   " does not require a value!");
+      }
+      UnboundedFound |= EatsUnboundedNumberOfValues(Opt);
+    }
+    HasUnlimitedPositionals = UnboundedFound || ConsumeAfterOpt;
+  }
+
+  // PositionalVals - A vector of "positional" arguments we accumulate into
+  // the process at the end.
+  //
+  SmallVector<std::pair<StringRef,unsigned>, 4> PositionalVals;
+
+  // If the program has named positional arguments, and the name has been run
+  // across, keep track of which positional argument was named.  Otherwise put
+  // the positional args into the PositionalVals list...
+  Option *ActivePositionalArg = 0;
+
+  // Loop over all of the arguments... processing them.
+  bool DashDashFound = false;  // Have we read '--'?
+  for (int i = 1; i < argc; ++i) {
+    Option *Handler = 0;
+    Option *NearestHandler = 0;
+    std::string NearestHandlerString;
+    StringRef Value;
+    StringRef ArgName = "";
+
+    // If the option list changed, this means that some command line
+    // option has just been registered or deregistered.  This can occur in
+    // response to things like -load, etc.  If this happens, rescan the options.
+    if (OptionListChanged) {
+      PositionalOpts.clear();
+      SinkOpts.clear();
+      Opts.clear();
+      GetOptionInfo(PositionalOpts, SinkOpts, Opts);
+      OptionListChanged = false;
+    }
+
+    // Check to see if this is a positional argument.  This argument is
+    // considered to be positional if it doesn't start with '-', if it is "-"
+    // itself, or if we have seen "--" already.
+    //
+    if (argv[i][0] != '-' || argv[i][1] == 0 || DashDashFound) {
+      // Positional argument!
+      if (ActivePositionalArg) {
+        ProvidePositionalOption(ActivePositionalArg, argv[i], i);
+        continue;  // We are done!
+      }
+
+      if (!PositionalOpts.empty()) {
+        PositionalVals.push_back(std::make_pair(argv[i],i));
+
+        // All of the positional arguments have been fulfulled, give the rest to
+        // the consume after option... if it's specified...
+        //
+        if (PositionalVals.size() >= NumPositionalRequired &&
+            ConsumeAfterOpt != 0) {
+          for (++i; i < argc; ++i)
+            PositionalVals.push_back(std::make_pair(argv[i],i));
+          break;   // Handle outside of the argument processing loop...
+        }
+
+        // Delay processing positional arguments until the end...
+        continue;
+      }
+    } else if (argv[i][0] == '-' && argv[i][1] == '-' && argv[i][2] == 0 &&
+               !DashDashFound) {
+      DashDashFound = true;  // This is the mythical "--"?
+      continue;              // Don't try to process it as an argument itself.
+    } else if (ActivePositionalArg &&
+               (ActivePositionalArg->getMiscFlags() & PositionalEatsArgs)) {
+      // If there is a positional argument eating options, check to see if this
+      // option is another positional argument.  If so, treat it as an argument,
+      // otherwise feed it to the eating positional.
+      ArgName = argv[i]+1;
+      // Eat leading dashes.
+      while (!ArgName.empty() && ArgName[0] == '-')
+        ArgName = ArgName.substr(1);
+
+      Handler = LookupOption(ArgName, Value, Opts);
+      if (!Handler || Handler->getFormattingFlag() != cl::Positional) {
+        ProvidePositionalOption(ActivePositionalArg, argv[i], i);
+        continue;  // We are done!
+      }
+
+    } else {     // We start with a '-', must be an argument.
+      ArgName = argv[i]+1;
+      // Eat leading dashes.
+      while (!ArgName.empty() && ArgName[0] == '-')
+        ArgName = ArgName.substr(1);
+
+      Handler = LookupOption(ArgName, Value, Opts);
+
+      // Check to see if this "option" is really a prefixed or grouped argument.
+      if (Handler == 0)
+        Handler = HandlePrefixedOrGroupedOption(ArgName, Value,
+                                                ErrorParsing, Opts);
+
+      // Otherwise, look for the closest available option to report to the user
+      // in the upcoming error.
+      if (Handler == 0 && SinkOpts.empty())
+        NearestHandler = LookupNearestOption(ArgName, Opts,
+                                             NearestHandlerString);
+    }
+
+    if (Handler == 0) {
+      if (SinkOpts.empty()) {
+        errs() << ProgramName << ": Unknown command line argument '"
+             << argv[i] << "'.  Try: '" << argv[0] << " -help'\n";
+
+        if (NearestHandler) {
+          // If we know a near match, report it as well.
+          errs() << ProgramName << ": Did you mean '-"
+                 << NearestHandlerString << "'?\n";
+        }
+
+        ErrorParsing = true;
+      } else {
+        for (SmallVectorImpl<Option*>::iterator I = SinkOpts.begin(),
+               E = SinkOpts.end(); I != E ; ++I)
+          (*I)->addOccurrence(i, "", argv[i]);
+      }
+      continue;
+    }
+
+    // If this is a named positional argument, just remember that it is the
+    // active one...
+    if (Handler->getFormattingFlag() == cl::Positional)
+      ActivePositionalArg = Handler;
+    else
+      ErrorParsing |= ProvideOption(Handler, ArgName, Value, argc, argv, i);
+  }
+
+  // Check and handle positional arguments now...
+  if (NumPositionalRequired > PositionalVals.size()) {
+    errs() << ProgramName
+         << ": Not enough positional command line arguments specified!\n"
+         << "Must specify at least " << NumPositionalRequired
+         << " positional arguments: See: " << argv[0] << " -help\n";
+
+    ErrorParsing = true;
+  } else if (!HasUnlimitedPositionals &&
+             PositionalVals.size() > PositionalOpts.size()) {
+    errs() << ProgramName
+         << ": Too many positional arguments specified!\n"
+         << "Can specify at most " << PositionalOpts.size()
+         << " positional arguments: See: " << argv[0] << " -help\n";
+    ErrorParsing = true;
+
+  } else if (ConsumeAfterOpt == 0) {
+    // Positional args have already been handled if ConsumeAfter is specified.
+    unsigned ValNo = 0, NumVals = static_cast<unsigned>(PositionalVals.size());
+    for (size_t i = 0, e = PositionalOpts.size(); i != e; ++i) {
+      if (RequiresValue(PositionalOpts[i])) {
+        ProvidePositionalOption(PositionalOpts[i], PositionalVals[ValNo].first,
+                                PositionalVals[ValNo].second);
+        ValNo++;
+        --NumPositionalRequired;  // We fulfilled our duty...
+      }
+
+      // If we _can_ give this option more arguments, do so now, as long as we
+      // do not give it values that others need.  'Done' controls whether the
+      // option even _WANTS_ any more.
+      //
+      bool Done = PositionalOpts[i]->getNumOccurrencesFlag() == cl::Required;
+      while (NumVals-ValNo > NumPositionalRequired && !Done) {
+        switch (PositionalOpts[i]->getNumOccurrencesFlag()) {
+        case cl::Optional:
+          Done = true;          // Optional arguments want _at most_ one value
+          // FALL THROUGH
+        case cl::ZeroOrMore:    // Zero or more will take all they can get...
+        case cl::OneOrMore:     // One or more will take all they can get...
+          ProvidePositionalOption(PositionalOpts[i],
+                                  PositionalVals[ValNo].first,
+                                  PositionalVals[ValNo].second);
+          ValNo++;
+          break;
+        default:
+          llvm_unreachable("Internal error, unexpected NumOccurrences flag in "
+                 "positional argument processing!");
+        }
+      }
+    }
+  } else {
+    assert(ConsumeAfterOpt && NumPositionalRequired <= PositionalVals.size());
+    unsigned ValNo = 0;
+    for (size_t j = 1, e = PositionalOpts.size(); j != e; ++j)
+      if (RequiresValue(PositionalOpts[j])) {
+        ErrorParsing |= ProvidePositionalOption(PositionalOpts[j],
+                                                PositionalVals[ValNo].first,
+                                                PositionalVals[ValNo].second);
+        ValNo++;
+      }
+
+    // Handle the case where there is just one positional option, and it's
+    // optional.  In this case, we want to give JUST THE FIRST option to the
+    // positional option and keep the rest for the consume after.  The above
+    // loop would have assigned no values to positional options in this case.
+    //
+    if (PositionalOpts.size() == 2 && ValNo == 0 && !PositionalVals.empty()) {
+      ErrorParsing |= ProvidePositionalOption(PositionalOpts[1],
+                                              PositionalVals[ValNo].first,
+                                              PositionalVals[ValNo].second);
+      ValNo++;
+    }
+
+    // Handle over all of the rest of the arguments to the
+    // cl::ConsumeAfter command line option...
+    for (; ValNo != PositionalVals.size(); ++ValNo)
+      ErrorParsing |= ProvidePositionalOption(ConsumeAfterOpt,
+                                              PositionalVals[ValNo].first,
+                                              PositionalVals[ValNo].second);
+  }
+
+  // Loop over args and make sure all required args are specified!
+  for (StringMap<Option*>::iterator I = Opts.begin(),
+         E = Opts.end(); I != E; ++I) {
+    switch (I->second->getNumOccurrencesFlag()) {
+    case Required:
+    case OneOrMore:
+      if (I->second->getNumOccurrences() == 0) {
+        I->second->error("must be specified at least once!");
+        ErrorParsing = true;
+      }
+      // Fall through
+    default:
+      break;
+    }
+  }
+
+  // Now that we know if -debug is specified, we can use it.
+  // Note that if ReadResponseFiles == true, this must be done before the
+  // memory allocated for the expanded command line is free()d below.
+  DEBUG(dbgs() << "Args: ";
+        for (int i = 0; i < argc; ++i)
+          dbgs() << argv[i] << ' ';
+        dbgs() << '\n';
+       );
+
+  // Free all of the memory allocated to the map.  Command line options may only
+  // be processed once!
+  Opts.clear();
+  PositionalOpts.clear();
+  MoreHelp->clear();
+
+  // Free the memory allocated by ExpandResponseFiles.
+  // Free all the strdup()ed strings.
+  for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end();
+       i != e; ++i)
+    free(*i);
+
+  // If we had an error processing our arguments, don't let the program execute
+  if (ErrorParsing) exit(1);
+}
+
+//===----------------------------------------------------------------------===//
+// Option Base class implementation
+//
+
+bool Option::error(const Twine &Message, StringRef ArgName) {
+  if (ArgName.data() == 0) ArgName = ArgStr;
+  if (ArgName.empty())
+    errs() << HelpStr;  // Be nice for positional arguments
+  else
+    errs() << ProgramName << ": for the -" << ArgName;
+
+  errs() << " option: " << Message << "\n";
+  return true;
+}
+
+bool Option::addOccurrence(unsigned pos, StringRef ArgName,
+                           StringRef Value, bool MultiArg) {
+  if (!MultiArg)
+    NumOccurrences++;   // Increment the number of times we have been seen
+
+  switch (getNumOccurrencesFlag()) {
+  case Optional:
+    if (NumOccurrences > 1)
+      return error("may only occur zero or one times!", ArgName);
+    break;
+  case Required:
+    if (NumOccurrences > 1)
+      return error("must occur exactly one time!", ArgName);
+    // Fall through
+  case OneOrMore:
+  case ZeroOrMore:
+  case ConsumeAfter: break;
+  }
+
+  return handleOccurrence(pos, ArgName, Value);
+}
+
+
+// getValueStr - Get the value description string, using "DefaultMsg" if nothing
+// has been specified yet.
+//
+static const char *getValueStr(const Option &O, const char *DefaultMsg) {
+  if (O.ValueStr[0] == 0) return DefaultMsg;
+  return O.ValueStr;
+}
+
+//===----------------------------------------------------------------------===//
+// cl::alias class implementation
+//
+
+// Return the width of the option tag for printing...
+size_t alias::getOptionWidth() const {
+  return std::strlen(ArgStr)+6;
+}
+
+// Print out the option for the alias.
+void alias::printOptionInfo(size_t GlobalWidth) const {
+  size_t L = std::strlen(ArgStr);
+  outs() << "  -" << ArgStr;
+  outs().indent(GlobalWidth-L-6) << " - " << HelpStr << "\n";
+}
+
+//===----------------------------------------------------------------------===//
+// Parser Implementation code...
+//
+
+// basic_parser implementation
+//
+
+// Return the width of the option tag for printing...
+size_t basic_parser_impl::getOptionWidth(const Option &O) const {
+  size_t Len = std::strlen(O.ArgStr);
+  if (const char *ValName = getValueName())
+    Len += std::strlen(getValueStr(O, ValName))+3;
+
+  return Len + 6;
+}
+
+// printOptionInfo - Print out information about this option.  The
+// to-be-maintained width is specified.
+//
+void basic_parser_impl::printOptionInfo(const Option &O,
+                                        size_t GlobalWidth) const {
+  outs() << "  -" << O.ArgStr;
+
+  if (const char *ValName = getValueName())
+    outs() << "=<" << getValueStr(O, ValName) << '>';
+
+  outs().indent(GlobalWidth-getOptionWidth(O)) << " - " << O.HelpStr << '\n';
+}
+
+void basic_parser_impl::printOptionName(const Option &O,
+                                        size_t GlobalWidth) const {
+  outs() << "  -" << O.ArgStr;
+  outs().indent(GlobalWidth-std::strlen(O.ArgStr));
+}
+
+
+// parser<bool> implementation
+//
+bool parser<bool>::parse(Option &O, StringRef ArgName,
+                         StringRef Arg, bool &Value) {
+  if (Arg == "" || Arg == "true" || Arg == "TRUE" || Arg == "True" ||
+      Arg == "1") {
+    Value = true;
+    return false;
+  }
+
+  if (Arg == "false" || Arg == "FALSE" || Arg == "False" || Arg == "0") {
+    Value = false;
+    return false;
+  }
+  return O.error("'" + Arg +
+                 "' is invalid value for boolean argument! Try 0 or 1");
+}
+
+// parser<boolOrDefault> implementation
+//
+bool parser<boolOrDefault>::parse(Option &O, StringRef ArgName,
+                                  StringRef Arg, boolOrDefault &Value) {
+  if (Arg == "" || Arg == "true" || Arg == "TRUE" || Arg == "True" ||
+      Arg == "1") {
+    Value = BOU_TRUE;
+    return false;
+  }
+  if (Arg == "false" || Arg == "FALSE" || Arg == "False" || Arg == "0") {
+    Value = BOU_FALSE;
+    return false;
+  }
+
+  return O.error("'" + Arg +
+                 "' is invalid value for boolean argument! Try 0 or 1");
+}
+
+// parser<int> implementation
+//
+bool parser<int>::parse(Option &O, StringRef ArgName,
+                        StringRef Arg, int &Value) {
+  if (Arg.getAsInteger(0, Value))
+    return O.error("'" + Arg + "' value invalid for integer argument!");
+  return false;
+}
+
+// parser<unsigned> implementation
+//
+bool parser<unsigned>::parse(Option &O, StringRef ArgName,
+                             StringRef Arg, unsigned &Value) {
+
+  if (Arg.getAsInteger(0, Value))
+    return O.error("'" + Arg + "' value invalid for uint argument!");
+  return false;
+}
+
+// parser<unsigned long long> implementation
+//
+bool parser<unsigned long long>::parse(Option &O, StringRef ArgName,
+                                      StringRef Arg, unsigned long long &Value){
+
+  if (Arg.getAsInteger(0, Value))
+    return O.error("'" + Arg + "' value invalid for uint argument!");
+  return false;
+}
+
+// parser<double>/parser<float> implementation
+//
+static bool parseDouble(Option &O, StringRef Arg, double &Value) {
+  SmallString<32> TmpStr(Arg.begin(), Arg.end());
+  const char *ArgStart = TmpStr.c_str();
+  char *End;
+  Value = strtod(ArgStart, &End);
+  if (*End != 0)
+    return O.error("'" + Arg + "' value invalid for floating point argument!");
+  return false;
+}
+
+bool parser<double>::parse(Option &O, StringRef ArgName,
+                           StringRef Arg, double &Val) {
+  return parseDouble(O, Arg, Val);
+}
+
+bool parser<float>::parse(Option &O, StringRef ArgName,
+                          StringRef Arg, float &Val) {
+  double dVal;
+  if (parseDouble(O, Arg, dVal))
+    return true;
+  Val = (float)dVal;
+  return false;
+}
+
+
+
+// generic_parser_base implementation
+//
+
+// findOption - Return the option number corresponding to the specified
+// argument string.  If the option is not found, getNumOptions() is returned.
+//
+unsigned generic_parser_base::findOption(const char *Name) {
+  unsigned e = getNumOptions();
+
+  for (unsigned i = 0; i != e; ++i) {
+    if (strcmp(getOption(i), Name) == 0)
+      return i;
+  }
+  return e;
+}
+
+
+// Return the width of the option tag for printing...
+size_t generic_parser_base::getOptionWidth(const Option &O) const {
+  if (O.hasArgStr()) {
+    size_t Size = std::strlen(O.ArgStr)+6;
+    for (unsigned i = 0, e = getNumOptions(); i != e; ++i)
+      Size = std::max(Size, std::strlen(getOption(i))+8);
+    return Size;
+  } else {
+    size_t BaseSize = 0;
+    for (unsigned i = 0, e = getNumOptions(); i != e; ++i)
+      BaseSize = std::max(BaseSize, std::strlen(getOption(i))+8);
+    return BaseSize;
+  }
+}
+
+// printOptionInfo - Print out information about this option.  The
+// to-be-maintained width is specified.
+//
+void generic_parser_base::printOptionInfo(const Option &O,
+                                          size_t GlobalWidth) const {
+  if (O.hasArgStr()) {
+    size_t L = std::strlen(O.ArgStr);
+    outs() << "  -" << O.ArgStr;
+    outs().indent(GlobalWidth-L-6) << " - " << O.HelpStr << '\n';
+
+    for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
+      size_t NumSpaces = GlobalWidth-strlen(getOption(i))-8;
+      outs() << "    =" << getOption(i);
+      outs().indent(NumSpaces) << " -   " << getDescription(i) << '\n';
+    }
+  } else {
+    if (O.HelpStr[0])
+      outs() << "  " << O.HelpStr << '\n';
+    for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
+      size_t L = std::strlen(getOption(i));
+      outs() << "    -" << getOption(i);
+      outs().indent(GlobalWidth-L-8) << " - " << getDescription(i) << '\n';
+    }
+  }
+}
+
+static const size_t MaxOptWidth = 8; // arbitrary spacing for printOptionDiff
+
+// printGenericOptionDiff - Print the value of this option and it's default.
+//
+// "Generic" options have each value mapped to a name.
+void generic_parser_base::
+printGenericOptionDiff(const Option &O, const GenericOptionValue &Value,
+                       const GenericOptionValue &Default,
+                       size_t GlobalWidth) const {
+  outs() << "  -" << O.ArgStr;
+  outs().indent(GlobalWidth-std::strlen(O.ArgStr));
+
+  unsigned NumOpts = getNumOptions();
+  for (unsigned i = 0; i != NumOpts; ++i) {
+    if (Value.compare(getOptionValue(i)))
+      continue;
+
+    outs() << "= " << getOption(i);
+    size_t L = std::strlen(getOption(i));
+    size_t NumSpaces = MaxOptWidth > L ? MaxOptWidth - L : 0;
+    outs().indent(NumSpaces) << " (default: ";
+    for (unsigned j = 0; j != NumOpts; ++j) {
+      if (Default.compare(getOptionValue(j)))
+        continue;
+      outs() << getOption(j);
+      break;
+    }
+    outs() << ")\n";
+    return;
+  }
+  outs() << "= *unknown option value*\n";
+}
+
+// printOptionDiff - Specializations for printing basic value types.
+//
+#define PRINT_OPT_DIFF(T)                                               \
+  void parser<T>::                                                      \
+  printOptionDiff(const Option &O, T V, OptionValue<T> D,               \
+                  size_t GlobalWidth) const {                           \
+    printOptionName(O, GlobalWidth);                                    \
+    std::string Str;                                                    \
+    {                                                                   \
+      raw_string_ostream SS(Str);                                       \
+      SS << V;                                                          \
+    }                                                                   \
+    outs() << "= " << Str;                                              \
+    size_t NumSpaces = MaxOptWidth > Str.size() ? MaxOptWidth - Str.size() : 0;\
+    outs().indent(NumSpaces) << " (default: ";                          \
+    if (D.hasValue())                                                   \
+      outs() << D.getValue();                                           \
+    else                                                                \
+      outs() << "*no default*";                                         \
+    outs() << ")\n";                                                    \
+  }                                                                     \
+
+PRINT_OPT_DIFF(bool)
+PRINT_OPT_DIFF(boolOrDefault)
+PRINT_OPT_DIFF(int)
+PRINT_OPT_DIFF(unsigned)
+PRINT_OPT_DIFF(unsigned long long)
+PRINT_OPT_DIFF(double)
+PRINT_OPT_DIFF(float)
+PRINT_OPT_DIFF(char)
+
+void parser<std::string>::
+printOptionDiff(const Option &O, StringRef V, OptionValue<std::string> D,
+                size_t GlobalWidth) const {
+  printOptionName(O, GlobalWidth);
+  outs() << "= " << V;
+  size_t NumSpaces = MaxOptWidth > V.size() ? MaxOptWidth - V.size() : 0;
+  outs().indent(NumSpaces) << " (default: ";
+  if (D.hasValue())
+    outs() << D.getValue();
+  else
+    outs() << "*no default*";
+  outs() << ")\n";
+}
+
+// Print a placeholder for options that don't yet support printOptionDiff().
+void basic_parser_impl::
+printOptionNoValue(const Option &O, size_t GlobalWidth) const {
+  printOptionName(O, GlobalWidth);
+  outs() << "= *cannot print option value*\n";
+}
+
+//===----------------------------------------------------------------------===//
+// -help and -help-hidden option implementation
+//
+
+static int OptNameCompare(const void *LHS, const void *RHS) {
+  typedef std::pair<const char *, Option*> pair_ty;
+
+  return strcmp(((const pair_ty*)LHS)->first, ((const pair_ty*)RHS)->first);
+}
+
+// Copy Options into a vector so we can sort them as we like.
+static void
+sortOpts(StringMap<Option*> &OptMap,
+         SmallVectorImpl< std::pair<const char *, Option*> > &Opts,
+         bool ShowHidden) {
+  SmallPtrSet<Option*, 128> OptionSet;  // Duplicate option detection.
+
+  for (StringMap<Option*>::iterator I = OptMap.begin(), E = OptMap.end();
+       I != E; ++I) {
+    // Ignore really-hidden options.
+    if (I->second->getOptionHiddenFlag() == ReallyHidden)
+      continue;
+
+    // Unless showhidden is set, ignore hidden flags.
+    if (I->second->getOptionHiddenFlag() == Hidden && !ShowHidden)
+      continue;
+
+    // If we've already seen this option, don't add it to the list again.
+    if (!OptionSet.insert(I->second))
+      continue;
+
+    Opts.push_back(std::pair<const char *, Option*>(I->getKey().data(),
+                                                    I->second));
+  }
+
+  // Sort the options list alphabetically.
+  qsort(Opts.data(), Opts.size(), sizeof(Opts[0]), OptNameCompare);
+}
+
+namespace {
+
+class HelpPrinter {
+  const bool ShowHidden;
+
+public:
+  explicit HelpPrinter(bool showHidden) : ShowHidden(showHidden) {}
+
+  void operator=(bool Value) {
+    if (Value == false) return;
+
+    // Get all the options.
+    SmallVector<Option*, 4> PositionalOpts;
+    SmallVector<Option*, 4> SinkOpts;
+    StringMap<Option*> OptMap;
+    GetOptionInfo(PositionalOpts, SinkOpts, OptMap);
+
+    SmallVector<std::pair<const char *, Option*>, 128> Opts;
+    sortOpts(OptMap, Opts, ShowHidden);
+
+    if (ProgramOverview)
+      outs() << "OVERVIEW: " << ProgramOverview << "\n";
+
+    outs() << "USAGE: " << ProgramName << " [options]";
+
+    // Print out the positional options.
+    Option *CAOpt = 0;   // The cl::ConsumeAfter option, if it exists...
+    if (!PositionalOpts.empty() &&
+        PositionalOpts[0]->getNumOccurrencesFlag() == ConsumeAfter)
+      CAOpt = PositionalOpts[0];
+
+    for (size_t i = CAOpt != 0, e = PositionalOpts.size(); i != e; ++i) {
+      if (PositionalOpts[i]->ArgStr[0])
+        outs() << " --" << PositionalOpts[i]->ArgStr;
+      outs() << " " << PositionalOpts[i]->HelpStr;
+    }
+
+    // Print the consume after option info if it exists...
+    if (CAOpt) outs() << " " << CAOpt->HelpStr;
+
+    outs() << "\n\n";
+
+    // Compute the maximum argument length...
+    size_t MaxArgLen = 0;
+    for (size_t i = 0, e = Opts.size(); i != e; ++i)
+      MaxArgLen = std::max(MaxArgLen, Opts[i].second->getOptionWidth());
+
+    outs() << "OPTIONS:\n";
+    for (size_t i = 0, e = Opts.size(); i != e; ++i)
+      Opts[i].second->printOptionInfo(MaxArgLen);
+
+    // Print any extra help the user has declared.
+    for (std::vector<const char *>::iterator I = MoreHelp->begin(),
+          E = MoreHelp->end(); I != E; ++I)
+      outs() << *I;
+    MoreHelp->clear();
+
+    // Halt the program since help information was printed
+    exit(1);
+  }
+};
+} // End anonymous namespace
+
+// Define the two HelpPrinter instances that are used to print out help, or
+// help-hidden...
+//
+static HelpPrinter NormalPrinter(false);
+static HelpPrinter HiddenPrinter(true);
+
+static cl::opt<HelpPrinter, true, parser<bool> >
+HOp("help", cl::desc("Display available options (-help-hidden for more)"),
+    cl::location(NormalPrinter), cl::ValueDisallowed);
+
+static cl::opt<HelpPrinter, true, parser<bool> >
+HHOp("help-hidden", cl::desc("Display all available options"),
+     cl::location(HiddenPrinter), cl::Hidden, cl::ValueDisallowed);
+
+static cl::opt<bool>
+PrintOptions("print-options",
+             cl::desc("Print non-default options after command line parsing"),
+             cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+PrintAllOptions("print-all-options",
+                cl::desc("Print all option values after command line parsing"),
+                cl::Hidden, cl::init(false));
+
+// Print the value of each option.
+void cl::PrintOptionValues() {
+  if (!PrintOptions && !PrintAllOptions) return;
+
+  // Get all the options.
+  SmallVector<Option*, 4> PositionalOpts;
+  SmallVector<Option*, 4> SinkOpts;
+  StringMap<Option*> OptMap;
+  GetOptionInfo(PositionalOpts, SinkOpts, OptMap);
+
+  SmallVector<std::pair<const char *, Option*>, 128> Opts;
+  sortOpts(OptMap, Opts, /*ShowHidden*/true);
+
+  // Compute the maximum argument length...
+  size_t MaxArgLen = 0;
+  for (size_t i = 0, e = Opts.size(); i != e; ++i)
+    MaxArgLen = std::max(MaxArgLen, Opts[i].second->getOptionWidth());
+
+  for (size_t i = 0, e = Opts.size(); i != e; ++i)
+    Opts[i].second->printOptionValue(MaxArgLen, PrintAllOptions);
+}
+
+static void (*OverrideVersionPrinter)() = 0;
+
+static std::vector<void (*)()>* ExtraVersionPrinters = 0;
+
+namespace {
+class VersionPrinter {
+public:
+  void print() {
+    raw_ostream &OS = outs();
+    OS << "LLVM (http://llvm.org/):\n"
+       << "  " << PACKAGE_NAME << " version " << PACKAGE_VERSION;
+#ifdef LLVM_VERSION_INFO
+    OS << LLVM_VERSION_INFO;
+#endif
+    OS << "\n  ";
+#ifndef __OPTIMIZE__
+    OS << "DEBUG build";
+#else
+    OS << "Optimized build";
+#endif
+#ifndef NDEBUG
+    OS << " with assertions";
+#endif
+    std::string CPU = sys::getHostCPUName();
+    if (CPU == "generic") CPU = "(unknown)";
+    OS << ".\n"
+#if (ENABLE_TIMESTAMPS == 1)
+       << "  Built " << __DATE__ << " (" << __TIME__ << ").\n"
+#endif
+       << "  Default target: " << sys::getDefaultTargetTriple() << '\n'
+       << "  Host CPU: " << CPU << '\n';
+  }
+  void operator=(bool OptionWasSpecified) {
+    if (!OptionWasSpecified) return;
+
+    if (OverrideVersionPrinter != 0) {
+      (*OverrideVersionPrinter)();
+      exit(1);
+    }
+    print();
+
+    // Iterate over any registered extra printers and call them to add further
+    // information.
+    if (ExtraVersionPrinters != 0) {
+      outs() << '\n';
+      for (std::vector<void (*)()>::iterator I = ExtraVersionPrinters->begin(),
+                                             E = ExtraVersionPrinters->end();
+           I != E; ++I)
+        (*I)();
+    }
+
+    exit(1);
+  }
+};
+} // End anonymous namespace
+
+
+// Define the --version option that prints out the LLVM version for the tool
+static VersionPrinter VersionPrinterInstance;
+
+static cl::opt<VersionPrinter, true, parser<bool> >
+VersOp("version", cl::desc("Display the version of this program"),
+    cl::location(VersionPrinterInstance), cl::ValueDisallowed);
+
+// Utility function for printing the help message.
+void cl::PrintHelpMessage() {
+  // This looks weird, but it actually prints the help message. The
+  // NormalPrinter variable is a HelpPrinter and the help gets printed when
+  // its operator= is invoked. That's because the "normal" usages of the
+  // help printer is to be assigned true/false depending on whether the
+  // -help option was given or not. Since we're circumventing that we have
+  // to make it look like -help was given, so we assign true.
+  NormalPrinter = true;
+}
+
+/// Utility function for printing version number.
+void cl::PrintVersionMessage() {
+  VersionPrinterInstance.print();
+}
+
+void cl::SetVersionPrinter(void (*func)()) {
+  OverrideVersionPrinter = func;
+}
+
+void cl::AddExtraVersionPrinter(void (*func)()) {
+  if (ExtraVersionPrinters == 0)
+    ExtraVersionPrinters = new std::vector<void (*)()>;
+
+  ExtraVersionPrinters->push_back(func);
+}
diff --git a/contrib/llvm/lib/Support/ConstantRange.cpp b/contrib/llvm/lib/Support/ConstantRange.cpp
new file mode 100644
index 000000000000..5c5895026b67
--- /dev/null
+++ b/contrib/llvm/lib/Support/ConstantRange.cpp
@@ -0,0 +1,731 @@
+//===-- ConstantRange.cpp - ConstantRange implementation ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Represent a range of possible values that may occur when the program is run
+// for an integral value.  This keeps track of a lower and upper bound for the
+// constant, which MAY wrap around the end of the numeric range.  To do this, it
+// keeps track of a [lower, upper) bound, which specifies an interval just like
+// STL iterators.  When used with boolean values, the following are important
+// ranges (other integral ranges use min/max values for special range values):
+//
+//  [F, F) = {}     = Empty set
+//  [T, F) = {T}
+//  [F, T) = {F}
+//  [T, T) = {F, T} = Full set
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// Initialize a full (the default) or empty set for the specified type.
+///
+ConstantRange::ConstantRange(uint32_t BitWidth, bool Full) {
+  if (Full)
+    Lower = Upper = APInt::getMaxValue(BitWidth);
+  else
+    Lower = Upper = APInt::getMinValue(BitWidth);
+}
+
+/// Initialize a range to hold the single specified value.
+///
+ConstantRange::ConstantRange(const APInt &V) : Lower(V), Upper(V + 1) {}
+
+ConstantRange::ConstantRange(const APInt &L, const APInt &U) :
+  Lower(L), Upper(U) {
+  assert(L.getBitWidth() == U.getBitWidth() &&
+         "ConstantRange with unequal bit widths");
+  assert((L != U || (L.isMaxValue() || L.isMinValue())) &&
+         "Lower == Upper, but they aren't min or max value!");
+}
+
+ConstantRange ConstantRange::makeICmpRegion(unsigned Pred,
+                                            const ConstantRange &CR) {
+  if (CR.isEmptySet())
+    return CR;
+
+  uint32_t W = CR.getBitWidth();
+  switch (Pred) {
+    default: llvm_unreachable("Invalid ICmp predicate to makeICmpRegion()");
+    case CmpInst::ICMP_EQ:
+      return CR;
+    case CmpInst::ICMP_NE:
+      if (CR.isSingleElement())
+        return ConstantRange(CR.getUpper(), CR.getLower());
+      return ConstantRange(W);
+    case CmpInst::ICMP_ULT: {
+      APInt UMax(CR.getUnsignedMax());
+      if (UMax.isMinValue())
+        return ConstantRange(W, /* empty */ false);
+      return ConstantRange(APInt::getMinValue(W), UMax);
+    }
+    case CmpInst::ICMP_SLT: {
+      APInt SMax(CR.getSignedMax());
+      if (SMax.isMinSignedValue())
+        return ConstantRange(W, /* empty */ false);
+      return ConstantRange(APInt::getSignedMinValue(W), SMax);
+    }
+    case CmpInst::ICMP_ULE: {
+      APInt UMax(CR.getUnsignedMax());
+      if (UMax.isMaxValue())
+        return ConstantRange(W);
+      return ConstantRange(APInt::getMinValue(W), UMax + 1);
+    }
+    case CmpInst::ICMP_SLE: {
+      APInt SMax(CR.getSignedMax());
+      if (SMax.isMaxSignedValue())
+        return ConstantRange(W);
+      return ConstantRange(APInt::getSignedMinValue(W), SMax + 1);
+    }
+    case CmpInst::ICMP_UGT: {
+      APInt UMin(CR.getUnsignedMin());
+      if (UMin.isMaxValue())
+        return ConstantRange(W, /* empty */ false);
+      return ConstantRange(UMin + 1, APInt::getNullValue(W));
+    }
+    case CmpInst::ICMP_SGT: {
+      APInt SMin(CR.getSignedMin());
+      if (SMin.isMaxSignedValue())
+        return ConstantRange(W, /* empty */ false);
+      return ConstantRange(SMin + 1, APInt::getSignedMinValue(W));
+    }
+    case CmpInst::ICMP_UGE: {
+      APInt UMin(CR.getUnsignedMin());
+      if (UMin.isMinValue())
+        return ConstantRange(W);
+      return ConstantRange(UMin, APInt::getNullValue(W));
+    }
+    case CmpInst::ICMP_SGE: {
+      APInt SMin(CR.getSignedMin());
+      if (SMin.isMinSignedValue())
+        return ConstantRange(W);
+      return ConstantRange(SMin, APInt::getSignedMinValue(W));
+    }
+  }
+}
+
+/// isFullSet - Return true if this set contains all of the elements possible
+/// for this data-type
+bool ConstantRange::isFullSet() const {
+  return Lower == Upper && Lower.isMaxValue();
+}
+
+/// isEmptySet - Return true if this set contains no members.
+///
+bool ConstantRange::isEmptySet() const {
+  return Lower == Upper && Lower.isMinValue();
+}
+
+/// isWrappedSet - Return true if this set wraps around the top of the range,
+/// for example: [100, 8)
+///
+bool ConstantRange::isWrappedSet() const {
+  return Lower.ugt(Upper);
+}
+
+/// isSignWrappedSet - Return true if this set wraps around the INT_MIN of
+/// its bitwidth, for example: i8 [120, 140).
+///
+bool ConstantRange::isSignWrappedSet() const {
+  return contains(APInt::getSignedMaxValue(getBitWidth())) &&
+         contains(APInt::getSignedMinValue(getBitWidth()));
+}
+
+/// getSetSize - Return the number of elements in this set.
+///
+APInt ConstantRange::getSetSize() const {
+  if (isEmptySet())
+    return APInt(getBitWidth()+1, 0);
+
+  if (isFullSet()) {
+    APInt Size(getBitWidth()+1, 0);
+    Size.setBit(getBitWidth());
+    return Size;
+  }
+
+  // This is also correct for wrapped sets.
+  return (Upper - Lower).zext(getBitWidth()+1);
+}
+
+/// getUnsignedMax - Return the largest unsigned value contained in the
+/// ConstantRange.
+///
+APInt ConstantRange::getUnsignedMax() const {
+  if (isFullSet() || isWrappedSet())
+    return APInt::getMaxValue(getBitWidth());
+  return getUpper() - 1;
+}
+
+/// getUnsignedMin - Return the smallest unsigned value contained in the
+/// ConstantRange.
+///
+APInt ConstantRange::getUnsignedMin() const {
+  if (isFullSet() || (isWrappedSet() && getUpper() != 0))
+    return APInt::getMinValue(getBitWidth());
+  return getLower();
+}
+
+/// getSignedMax - Return the largest signed value contained in the
+/// ConstantRange.
+///
+APInt ConstantRange::getSignedMax() const {
+  APInt SignedMax(APInt::getSignedMaxValue(getBitWidth()));
+  if (!isWrappedSet()) {
+    if (getLower().sle(getUpper() - 1))
+      return getUpper() - 1;
+    return SignedMax;
+  }
+  if (getLower().isNegative() == getUpper().isNegative())
+    return SignedMax;
+  return getUpper() - 1;
+}
+
+/// getSignedMin - Return the smallest signed value contained in the
+/// ConstantRange.
+///
+APInt ConstantRange::getSignedMin() const {
+  APInt SignedMin(APInt::getSignedMinValue(getBitWidth()));
+  if (!isWrappedSet()) {
+    if (getLower().sle(getUpper() - 1))
+      return getLower();
+    return SignedMin;
+  }
+  if ((getUpper() - 1).slt(getLower())) {
+    if (getUpper() != SignedMin)
+      return SignedMin;
+  }
+  return getLower();
+}
+
+/// contains - Return true if the specified value is in the set.
+///
+bool ConstantRange::contains(const APInt &V) const {
+  if (Lower == Upper)
+    return isFullSet();
+
+  if (!isWrappedSet())
+    return Lower.ule(V) && V.ult(Upper);
+  return Lower.ule(V) || V.ult(Upper);
+}
+
+/// contains - Return true if the argument is a subset of this range.
+/// Two equal sets contain each other. The empty set contained by all other
+/// sets.
+///
+bool ConstantRange::contains(const ConstantRange &Other) const {
+  if (isFullSet() || Other.isEmptySet()) return true;
+  if (isEmptySet() || Other.isFullSet()) return false;
+
+  if (!isWrappedSet()) {
+    if (Other.isWrappedSet())
+      return false;
+
+    return Lower.ule(Other.getLower()) && Other.getUpper().ule(Upper);
+  }
+
+  if (!Other.isWrappedSet())
+    return Other.getUpper().ule(Upper) ||
+           Lower.ule(Other.getLower());
+
+  return Other.getUpper().ule(Upper) && Lower.ule(Other.getLower());
+}
+
+/// subtract - Subtract the specified constant from the endpoints of this
+/// constant range.
+ConstantRange ConstantRange::subtract(const APInt &Val) const {
+  assert(Val.getBitWidth() == getBitWidth() && "Wrong bit width");
+  // If the set is empty or full, don't modify the endpoints.
+  if (Lower == Upper) 
+    return *this;
+  return ConstantRange(Lower - Val, Upper - Val);
+}
+
+/// \brief Subtract the specified range from this range (aka relative complement
+/// of the sets).
+ConstantRange ConstantRange::difference(const ConstantRange &CR) const {
+  return intersectWith(CR.inverse());
+}
+
+/// intersectWith - Return the range that results from the intersection of this
+/// range with another range.  The resultant range is guaranteed to include all
+/// elements contained in both input ranges, and to have the smallest possible
+/// set size that does so.  Because there may be two intersections with the
+/// same set size, A.intersectWith(B) might not be equal to B.intersectWith(A).
+ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
+  assert(getBitWidth() == CR.getBitWidth() && 
+         "ConstantRange types don't agree!");
+
+  // Handle common cases.
+  if (   isEmptySet() || CR.isFullSet()) return *this;
+  if (CR.isEmptySet() ||    isFullSet()) return CR;
+
+  if (!isWrappedSet() && CR.isWrappedSet())
+    return CR.intersectWith(*this);
+
+  if (!isWrappedSet() && !CR.isWrappedSet()) {
+    if (Lower.ult(CR.Lower)) {
+      if (Upper.ule(CR.Lower))
+        return ConstantRange(getBitWidth(), false);
+
+      if (Upper.ult(CR.Upper))
+        return ConstantRange(CR.Lower, Upper);
+
+      return CR;
+    }
+    if (Upper.ult(CR.Upper))
+      return *this;
+
+    if (Lower.ult(CR.Upper))
+      return ConstantRange(Lower, CR.Upper);
+
+    return ConstantRange(getBitWidth(), false);
+  }
+
+  if (isWrappedSet() && !CR.isWrappedSet()) {
+    if (CR.Lower.ult(Upper)) {
+      if (CR.Upper.ult(Upper))
+        return CR;
+
+      if (CR.Upper.ule(Lower))
+        return ConstantRange(CR.Lower, Upper);
+
+      if (getSetSize().ult(CR.getSetSize()))
+        return *this;
+      return CR;
+    }
+    if (CR.Lower.ult(Lower)) {
+      if (CR.Upper.ule(Lower))
+        return ConstantRange(getBitWidth(), false);
+
+      return ConstantRange(Lower, CR.Upper);
+    }
+    return CR;
+  }
+
+  if (CR.Upper.ult(Upper)) {
+    if (CR.Lower.ult(Upper)) {
+      if (getSetSize().ult(CR.getSetSize()))
+        return *this;
+      return CR;
+    }
+
+    if (CR.Lower.ult(Lower))
+      return ConstantRange(Lower, CR.Upper);
+
+    return CR;
+  }
+  if (CR.Upper.ule(Lower)) {
+    if (CR.Lower.ult(Lower))
+      return *this;
+
+    return ConstantRange(CR.Lower, Upper);
+  }
+  if (getSetSize().ult(CR.getSetSize()))
+    return *this;
+  return CR;
+}
+
+
+/// unionWith - Return the range that results from the union of this range with
+/// another range.  The resultant range is guaranteed to include the elements of
+/// both sets, but may contain more.  For example, [3, 9) union [12,15) is
+/// [3, 15), which includes 9, 10, and 11, which were not included in either
+/// set before.
+///
+ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const {
+  assert(getBitWidth() == CR.getBitWidth() && 
+         "ConstantRange types don't agree!");
+
+  if (   isFullSet() || CR.isEmptySet()) return *this;
+  if (CR.isFullSet() ||    isEmptySet()) return CR;
+
+  if (!isWrappedSet() && CR.isWrappedSet()) return CR.unionWith(*this);
+
+  if (!isWrappedSet() && !CR.isWrappedSet()) {
+    if (CR.Upper.ult(Lower) || Upper.ult(CR.Lower)) {
+      // If the two ranges are disjoint, find the smaller gap and bridge it.
+      APInt d1 = CR.Lower - Upper, d2 = Lower - CR.Upper;
+      if (d1.ult(d2))
+        return ConstantRange(Lower, CR.Upper);
+      return ConstantRange(CR.Lower, Upper);
+    }
+
+    APInt L = Lower, U = Upper;
+    if (CR.Lower.ult(L))
+      L = CR.Lower;
+    if ((CR.Upper - 1).ugt(U - 1))
+      U = CR.Upper;
+
+    if (L == 0 && U == 0)
+      return ConstantRange(getBitWidth());
+
+    return ConstantRange(L, U);
+  }
+
+  if (!CR.isWrappedSet()) {
+    // ------U   L-----  and  ------U   L----- : this
+    //   L--U                            L--U  : CR
+    if (CR.Upper.ule(Upper) || CR.Lower.uge(Lower))
+      return *this;
+
+    // ------U   L----- : this
+    //    L---------U   : CR
+    if (CR.Lower.ule(Upper) && Lower.ule(CR.Upper))
+      return ConstantRange(getBitWidth());
+
+    // ----U       L---- : this
+    //       L---U       : CR
+    //    <d1>  <d2>
+    if (Upper.ule(CR.Lower) && CR.Upper.ule(Lower)) {
+      APInt d1 = CR.Lower - Upper, d2 = Lower - CR.Upper;
+      if (d1.ult(d2))
+        return ConstantRange(Lower, CR.Upper);
+      return ConstantRange(CR.Lower, Upper);
+    }
+
+    // ----U     L----- : this
+    //        L----U    : CR
+    if (Upper.ult(CR.Lower) && Lower.ult(CR.Upper))
+      return ConstantRange(CR.Lower, Upper);
+
+    // ------U    L---- : this
+    //    L-----U       : CR
+    assert(CR.Lower.ult(Upper) && CR.Upper.ult(Lower) &&
+           "ConstantRange::unionWith missed a case with one range wrapped");
+    return ConstantRange(Lower, CR.Upper);
+  }
+
+  // ------U    L----  and  ------U    L---- : this
+  // -U  L-----------  and  ------------U  L : CR
+  if (CR.Lower.ule(Upper) || Lower.ule(CR.Upper))
+    return ConstantRange(getBitWidth());
+
+  APInt L = Lower, U = Upper;
+  if (CR.Upper.ugt(U))
+    U = CR.Upper;
+  if (CR.Lower.ult(L))
+    L = CR.Lower;
+
+  return ConstantRange(L, U);
+}
+
+/// zeroExtend - Return a new range in the specified integer type, which must
+/// be strictly larger than the current type.  The returned range will
+/// correspond to the possible range of values as if the source range had been
+/// zero extended.
+ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const {
+  if (isEmptySet()) return ConstantRange(DstTySize, /*isFullSet=*/false);
+
+  unsigned SrcTySize = getBitWidth();
+  assert(SrcTySize < DstTySize && "Not a value extension");
+  if (isFullSet() || isWrappedSet()) {
+    // Change into [0, 1 << src bit width)
+    APInt LowerExt(DstTySize, 0);
+    if (!Upper) // special case: [X, 0) -- not really wrapping around
+      LowerExt = Lower.zext(DstTySize);
+    return ConstantRange(LowerExt, APInt(DstTySize, 1).shl(SrcTySize));
+  }
+
+  return ConstantRange(Lower.zext(DstTySize), Upper.zext(DstTySize));
+}
+
+/// signExtend - Return a new range in the specified integer type, which must
+/// be strictly larger than the current type.  The returned range will
+/// correspond to the possible range of values as if the source range had been
+/// sign extended.
+ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const {
+  if (isEmptySet()) return ConstantRange(DstTySize, /*isFullSet=*/false);
+
+  unsigned SrcTySize = getBitWidth();
+  assert(SrcTySize < DstTySize && "Not a value extension");
+  if (isFullSet() || isSignWrappedSet()) {
+    return ConstantRange(APInt::getHighBitsSet(DstTySize,DstTySize-SrcTySize+1),
+                         APInt::getLowBitsSet(DstTySize, SrcTySize-1) + 1);
+  }
+
+  return ConstantRange(Lower.sext(DstTySize), Upper.sext(DstTySize));
+}
+
+/// truncate - Return a new range in the specified integer type, which must be
+/// strictly smaller than the current type.  The returned range will
+/// correspond to the possible range of values as if the source range had been
+/// truncated to the specified type.
+ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
+  assert(getBitWidth() > DstTySize && "Not a value truncation");
+  if (isEmptySet())
+    return ConstantRange(DstTySize, /*isFullSet=*/false);
+  if (isFullSet())
+    return ConstantRange(DstTySize, /*isFullSet=*/true);
+
+  APInt MaxValue = APInt::getMaxValue(DstTySize).zext(getBitWidth());
+  APInt MaxBitValue(getBitWidth(), 0);
+  MaxBitValue.setBit(DstTySize);
+
+  APInt LowerDiv(Lower), UpperDiv(Upper);
+  ConstantRange Union(DstTySize, /*isFullSet=*/false);
+
+  // Analyze wrapped sets in their two parts: [0, Upper) \/ [Lower, MaxValue]
+  // We use the non-wrapped set code to analyze the [Lower, MaxValue) part, and
+  // then we do the union with [MaxValue, Upper)
+  if (isWrappedSet()) {
+    // if Upper is greater than Max Value, it covers the whole truncated range.
+    if (Upper.uge(MaxValue))
+      return ConstantRange(DstTySize, /*isFullSet=*/true);
+
+    Union = ConstantRange(APInt::getMaxValue(DstTySize),Upper.trunc(DstTySize));
+    UpperDiv = APInt::getMaxValue(getBitWidth());
+
+    // Union covers the MaxValue case, so return if the remaining range is just
+    // MaxValue.
+    if (LowerDiv == UpperDiv)
+      return Union;
+  }
+
+  // Chop off the most significant bits that are past the destination bitwidth.
+  if (LowerDiv.uge(MaxValue)) {
+    APInt Div(getBitWidth(), 0);
+    APInt::udivrem(LowerDiv, MaxBitValue, Div, LowerDiv);
+    UpperDiv = UpperDiv - MaxBitValue * Div;
+  }
+
+  if (UpperDiv.ule(MaxValue))
+    return ConstantRange(LowerDiv.trunc(DstTySize),
+                         UpperDiv.trunc(DstTySize)).unionWith(Union);
+
+  // The truncated value wrapps around. Check if we can do better than fullset.
+  APInt UpperModulo = UpperDiv - MaxBitValue;
+  if (UpperModulo.ult(LowerDiv))
+    return ConstantRange(LowerDiv.trunc(DstTySize),
+                         UpperModulo.trunc(DstTySize)).unionWith(Union);
+
+  return ConstantRange(DstTySize, /*isFullSet=*/true);
+}
+
+/// zextOrTrunc - make this range have the bit width given by \p DstTySize. The
+/// value is zero extended, truncated, or left alone to make it that width.
+ConstantRange ConstantRange::zextOrTrunc(uint32_t DstTySize) const {
+  unsigned SrcTySize = getBitWidth();
+  if (SrcTySize > DstTySize)
+    return truncate(DstTySize);
+  if (SrcTySize < DstTySize)
+    return zeroExtend(DstTySize);
+  return *this;
+}
+
+/// sextOrTrunc - make this range have the bit width given by \p DstTySize. The
+/// value is sign extended, truncated, or left alone to make it that width.
+ConstantRange ConstantRange::sextOrTrunc(uint32_t DstTySize) const {
+  unsigned SrcTySize = getBitWidth();
+  if (SrcTySize > DstTySize)
+    return truncate(DstTySize);
+  if (SrcTySize < DstTySize)
+    return signExtend(DstTySize);
+  return *this;
+}
+
+ConstantRange
+ConstantRange::add(const ConstantRange &Other) const {
+  if (isEmptySet() || Other.isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+  if (isFullSet() || Other.isFullSet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+  APInt Spread_X = getSetSize(), Spread_Y = Other.getSetSize();
+  APInt NewLower = getLower() + Other.getLower();
+  APInt NewUpper = getUpper() + Other.getUpper() - 1;
+  if (NewLower == NewUpper)
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+  ConstantRange X = ConstantRange(NewLower, NewUpper);
+  if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y))
+    // We've wrapped, therefore, full set.
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+  return X;
+}
+
+ConstantRange
+ConstantRange::sub(const ConstantRange &Other) const {
+  if (isEmptySet() || Other.isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+  if (isFullSet() || Other.isFullSet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+  APInt Spread_X = getSetSize(), Spread_Y = Other.getSetSize();
+  APInt NewLower = getLower() - Other.getUpper() + 1;
+  APInt NewUpper = getUpper() - Other.getLower();
+  if (NewLower == NewUpper)
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+  ConstantRange X = ConstantRange(NewLower, NewUpper);
+  if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y))
+    // We've wrapped, therefore, full set.
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+  return X;
+}
+
+ConstantRange
+ConstantRange::multiply(const ConstantRange &Other) const {
+  // TODO: If either operand is a single element and the multiply is known to
+  // be non-wrapping, round the result min and max value to the appropriate
+  // multiple of that element. If wrapping is possible, at least adjust the
+  // range according to the greatest power-of-two factor of the single element.
+
+  if (isEmptySet() || Other.isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+
+  APInt this_min = getUnsignedMin().zext(getBitWidth() * 2);
+  APInt this_max = getUnsignedMax().zext(getBitWidth() * 2);
+  APInt Other_min = Other.getUnsignedMin().zext(getBitWidth() * 2);
+  APInt Other_max = Other.getUnsignedMax().zext(getBitWidth() * 2);
+
+  ConstantRange Result_zext = ConstantRange(this_min * Other_min,
+                                            this_max * Other_max + 1);
+  return Result_zext.truncate(getBitWidth());
+}
+
+ConstantRange
+ConstantRange::smax(const ConstantRange &Other) const {
+  // X smax Y is: range(smax(X_smin, Y_smin),
+  //                    smax(X_smax, Y_smax))
+  if (isEmptySet() || Other.isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+  APInt NewL = APIntOps::smax(getSignedMin(), Other.getSignedMin());
+  APInt NewU = APIntOps::smax(getSignedMax(), Other.getSignedMax()) + 1;
+  if (NewU == NewL)
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+  return ConstantRange(NewL, NewU);
+}
+
+ConstantRange
+ConstantRange::umax(const ConstantRange &Other) const {
+  // X umax Y is: range(umax(X_umin, Y_umin),
+  //                    umax(X_umax, Y_umax))
+  if (isEmptySet() || Other.isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+  APInt NewL = APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin());
+  APInt NewU = APIntOps::umax(getUnsignedMax(), Other.getUnsignedMax()) + 1;
+  if (NewU == NewL)
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+  return ConstantRange(NewL, NewU);
+}
+
+ConstantRange
+ConstantRange::udiv(const ConstantRange &RHS) const {
+  if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax() == 0)
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+  if (RHS.isFullSet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+  APInt Lower = getUnsignedMin().udiv(RHS.getUnsignedMax());
+
+  APInt RHS_umin = RHS.getUnsignedMin();
+  if (RHS_umin == 0) {
+    // We want the lowest value in RHS excluding zero. Usually that would be 1
+    // except for a range in the form of [X, 1) in which case it would be X.
+    if (RHS.getUpper() == 1)
+      RHS_umin = RHS.getLower();
+    else
+      RHS_umin = APInt(getBitWidth(), 1);
+  }
+
+  APInt Upper = getUnsignedMax().udiv(RHS_umin) + 1;
+
+  // If the LHS is Full and the RHS is a wrapped interval containing 1 then
+  // this could occur.
+  if (Lower == Upper)
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+  return ConstantRange(Lower, Upper);
+}
+
+ConstantRange
+ConstantRange::binaryAnd(const ConstantRange &Other) const {
+  if (isEmptySet() || Other.isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+
+  // TODO: replace this with something less conservative
+
+  APInt umin = APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax());
+  if (umin.isAllOnesValue())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+  return ConstantRange(APInt::getNullValue(getBitWidth()), umin + 1);
+}
+
+ConstantRange
+ConstantRange::binaryOr(const ConstantRange &Other) const {
+  if (isEmptySet() || Other.isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+
+  // TODO: replace this with something less conservative
+
+  APInt umax = APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin());
+  if (umax.isMinValue())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+  return ConstantRange(umax, APInt::getNullValue(getBitWidth()));
+}
+
+ConstantRange
+ConstantRange::shl(const ConstantRange &Other) const {
+  if (isEmptySet() || Other.isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+
+  APInt min = getUnsignedMin().shl(Other.getUnsignedMin());
+  APInt max = getUnsignedMax().shl(Other.getUnsignedMax());
+
+  // there's no overflow!
+  APInt Zeros(getBitWidth(), getUnsignedMax().countLeadingZeros());
+  if (Zeros.ugt(Other.getUnsignedMax()))
+    return ConstantRange(min, max + 1);
+
+  // FIXME: implement the other tricky cases
+  return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+}
+
+ConstantRange
+ConstantRange::lshr(const ConstantRange &Other) const {
+  if (isEmptySet() || Other.isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+  
+  APInt max = getUnsignedMax().lshr(Other.getUnsignedMin());
+  APInt min = getUnsignedMin().lshr(Other.getUnsignedMax());
+  if (min == max + 1)
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+  return ConstantRange(min, max + 1);
+}
+
+ConstantRange ConstantRange::inverse() const {
+  if (isFullSet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/false);
+  if (isEmptySet())
+    return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+  return ConstantRange(Upper, Lower);
+}
+
+/// print - Print out the bounds to a stream...
+///
+void ConstantRange::print(raw_ostream &OS) const {
+  if (isFullSet())
+    OS << "full-set";
+  else if (isEmptySet())
+    OS << "empty-set";
+  else
+    OS << "[" << Lower << "," << Upper << ")";
+}
+
+/// dump - Allow printing from a debugger easily...
+///
+void ConstantRange::dump() const {
+  print(dbgs());
+}
diff --git a/contrib/llvm/lib/Support/ConvertUTF.c b/contrib/llvm/lib/Support/ConvertUTF.c
new file mode 100644
index 000000000000..23f17ca25aea
--- /dev/null
+++ b/contrib/llvm/lib/Support/ConvertUTF.c
@@ -0,0 +1,571 @@
+/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===------------------------------------------------------------------------=*/
+/*
+ * Copyright 2001-2004 Unicode, Inc.
+ * 
+ * Disclaimer
+ * 
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ * 
+ * Limitations on Rights to Redistribute This Code
+ * 
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+/* ---------------------------------------------------------------------
+
+    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
+    Author: Mark E. Davis, 1994.
+    Rev History: Rick McGowan, fixes & updates May 2001.
+    Sept 2001: fixed const & error conditions per
+        mods suggested by S. Parent & A. Lillich.
+    June 2002: Tim Dodd added detection and handling of incomplete
+        source sequences, enhanced error detection, added casts
+        to eliminate compiler warnings.
+    July 2003: slight mods to back out aggressive FFFE detection.
+    Jan 2004: updated switches in from-UTF8 conversions.
+    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
+
+    See the header file "ConvertUTF.h" for complete documentation.
+
+------------------------------------------------------------------------ */
+
+
+#include "llvm/Support/ConvertUTF.h"
+#ifdef CVTUTF_DEBUG
+#include <stdio.h>
+#endif
+
+static const int halfShift  = 10; /* used for shifting by 10 bits */
+
+static const UTF32 halfBase = 0x0010000UL;
+static const UTF32 halfMask = 0x3FFUL;
+
+#define UNI_SUR_HIGH_START  (UTF32)0xD800
+#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
+#define UNI_SUR_LOW_START   (UTF32)0xDC00
+#define UNI_SUR_LOW_END     (UTF32)0xDFFF
+#define false      0
+#define true        1
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Index into the table below with the first byte of a UTF-8 sequence to
+ * get the number of trailing bytes that are supposed to follow it.
+ * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
+ * left as-is for anyone who may want to do such conversion, which was
+ * allowed in earlier algorithms.
+ */
+static const char trailingBytesForUTF8[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+};
+
+/*
+ * Magic values subtracted from a buffer value during UTF8 conversion.
+ * This table contains as many values as there might be trailing bytes
+ * in a UTF-8 sequence.
+ */
+static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
+                     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
+
+/*
+ * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
+ * into the first byte, depending on how many bytes follow.  There are
+ * as many entries in this table as there are UTF-8 sequence types.
+ * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
+ * for *legal* UTF-8 will be 4 or fewer bytes total.
+ */
+static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+/* --------------------------------------------------------------------- */
+
+/* The interface converts a whole buffer to avoid function-call overhead.
+ * Constants have been gathered. Loops & conditionals have been removed as
+ * much as possible for efficiency, in favor of drop-through switches.
+ * (See "Note A" at the bottom of the file for equivalent code.)
+ * If your compiler supports it, the "isLegalUTF8" call can be turned
+ * into an inline function.
+ */
+
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF32toUTF16 (
+        const UTF32** sourceStart, const UTF32* sourceEnd, 
+        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
+    ConversionResult result = conversionOK;
+    const UTF32* source = *sourceStart;
+    UTF16* target = *targetStart;
+    while (source < sourceEnd) {
+        UTF32 ch;
+        if (target >= targetEnd) {
+            result = targetExhausted; break;
+        }
+        ch = *source++;
+        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
+            /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
+            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+                if (flags == strictConversion) {
+                    --source; /* return to the illegal value itself */
+                    result = sourceIllegal;
+                    break;
+                } else {
+                    *target++ = UNI_REPLACEMENT_CHAR;
+                }
+            } else {
+                *target++ = (UTF16)ch; /* normal case */
+            }
+        } else if (ch > UNI_MAX_LEGAL_UTF32) {
+            if (flags == strictConversion) {
+                result = sourceIllegal;
+            } else {
+                *target++ = UNI_REPLACEMENT_CHAR;
+            }
+        } else {
+            /* target is a character in range 0xFFFF - 0x10FFFF. */
+            if (target + 1 >= targetEnd) {
+                --source; /* Back up source pointer! */
+                result = targetExhausted; break;
+            }
+            ch -= halfBase;
+            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
+            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
+        }
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF16toUTF32 (
+        const UTF16** sourceStart, const UTF16* sourceEnd, 
+        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+    ConversionResult result = conversionOK;
+    const UTF16* source = *sourceStart;
+    UTF32* target = *targetStart;
+    UTF32 ch, ch2;
+    while (source < sourceEnd) {
+        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
+        ch = *source++;
+        /* If we have a surrogate pair, convert to UTF32 first. */
+        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+            /* If the 16 bits following the high surrogate are in the source buffer... */
+            if (source < sourceEnd) {
+                ch2 = *source;
+                /* If it's a low surrogate, convert to UTF32. */
+                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
+                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
+                    ++source;
+                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
+                    --source; /* return to the illegal value itself */
+                    result = sourceIllegal;
+                    break;
+                }
+            } else { /* We don't have the 16 bits following the high surrogate. */
+                --source; /* return to the high surrogate */
+                result = sourceExhausted;
+                break;
+            }
+        } else if (flags == strictConversion) {
+            /* UTF-16 surrogate values are illegal in UTF-32 */
+            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+                --source; /* return to the illegal value itself */
+                result = sourceIllegal;
+                break;
+            }
+        }
+        if (target >= targetEnd) {
+            source = oldSource; /* Back up source pointer! */
+            result = targetExhausted; break;
+        }
+        *target++ = ch;
+    }
+    *sourceStart = source;
+    *targetStart = target;
+#ifdef CVTUTF_DEBUG
+if (result == sourceIllegal) {
+    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
+    fflush(stderr);
+}
+#endif
+    return result;
+}
+ConversionResult ConvertUTF16toUTF8 (
+        const UTF16** sourceStart, const UTF16* sourceEnd, 
+        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
+    ConversionResult result = conversionOK;
+    const UTF16* source = *sourceStart;
+    UTF8* target = *targetStart;
+    while (source < sourceEnd) {
+        UTF32 ch;
+        unsigned short bytesToWrite = 0;
+        const UTF32 byteMask = 0xBF;
+        const UTF32 byteMark = 0x80; 
+        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
+        ch = *source++;
+        /* If we have a surrogate pair, convert to UTF32 first. */
+        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+            /* If the 16 bits following the high surrogate are in the source buffer... */
+            if (source < sourceEnd) {
+                UTF32 ch2 = *source;
+                /* If it's a low surrogate, convert to UTF32. */
+                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
+                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
+                    ++source;
+                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
+                    --source; /* return to the illegal value itself */
+                    result = sourceIllegal;
+                    break;
+                }
+            } else { /* We don't have the 16 bits following the high surrogate. */
+                --source; /* return to the high surrogate */
+                result = sourceExhausted;
+                break;
+            }
+        } else if (flags == strictConversion) {
+            /* UTF-16 surrogate values are illegal in UTF-32 */
+            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+                --source; /* return to the illegal value itself */
+                result = sourceIllegal;
+                break;
+            }
+        }
+        /* Figure out how many bytes the result will require */
+        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
+        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
+        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
+        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
+        } else {                            bytesToWrite = 3;
+                                            ch = UNI_REPLACEMENT_CHAR;
+        }
+
+        target += bytesToWrite;
+        if (target > targetEnd) {
+            source = oldSource; /* Back up source pointer! */
+            target -= bytesToWrite; result = targetExhausted; break;
+        }
+        switch (bytesToWrite) { /* note: everything falls through. */
+            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
+        }
+        target += bytesToWrite;
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF32toUTF8 (
+        const UTF32** sourceStart, const UTF32* sourceEnd, 
+        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
+    ConversionResult result = conversionOK;
+    const UTF32* source = *sourceStart;
+    UTF8* target = *targetStart;
+    while (source < sourceEnd) {
+        UTF32 ch;
+        unsigned short bytesToWrite = 0;
+        const UTF32 byteMask = 0xBF;
+        const UTF32 byteMark = 0x80; 
+        ch = *source++;
+        if (flags == strictConversion ) {
+            /* UTF-16 surrogate values are illegal in UTF-32 */
+            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+                --source; /* return to the illegal value itself */
+                result = sourceIllegal;
+                break;
+            }
+        }
+        /*
+         * Figure out how many bytes the result will require. Turn any
+         * illegally large UTF32 things (> Plane 17) into replacement chars.
+         */
+        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
+        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
+        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
+        } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
+        } else {                            bytesToWrite = 3;
+                                            ch = UNI_REPLACEMENT_CHAR;
+                                            result = sourceIllegal;
+        }
+        
+        target += bytesToWrite;
+        if (target > targetEnd) {
+            --source; /* Back up source pointer! */
+            target -= bytesToWrite; result = targetExhausted; break;
+        }
+        switch (bytesToWrite) { /* note: everything falls through. */
+            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
+        }
+        target += bytesToWrite;
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Utility routine to tell whether a sequence of bytes is legal UTF-8.
+ * This must be called with the length pre-determined by the first byte.
+ * If not calling this from ConvertUTF8to*, then the length can be set by:
+ *  length = trailingBytesForUTF8[*source]+1;
+ * and the sequence is illegal right away if there aren't that many bytes
+ * available.
+ * If presented with a length > 4, this returns false.  The Unicode
+ * definition of UTF-8 goes up to 4-byte sequences.
+ */
+
+static Boolean isLegalUTF8(const UTF8 *source, int length) {
+    UTF8 a;
+    const UTF8 *srcptr = source+length;
+    switch (length) {
+    default: return false;
+        /* Everything else falls through when "true"... */
+    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+    case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+
+        switch (*source) {
+            /* no fall-through in this inner switch */
+            case 0xE0: if (a < 0xA0) return false; break;
+            case 0xED: if (a > 0x9F) return false; break;
+            case 0xF0: if (a < 0x90) return false; break;
+            case 0xF4: if (a > 0x8F) return false; break;
+            default:   if (a < 0x80) return false;
+        }
+
+    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+    }
+    if (*source > 0xF4) return false;
+    return true;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Exported function to return whether a UTF-8 sequence is legal or not.
+ * This is not used here; it's just exported.
+ */
+Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
+    int length = trailingBytesForUTF8[*source]+1;
+    if (length > sourceEnd - source) {
+        return false;
+    }
+    return isLegalUTF8(source, length);
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Exported function to return the total number of bytes in a codepoint
+ * represented in UTF-8, given the value of the first byte.
+ */
+unsigned getNumBytesForUTF8(UTF8 first) {
+  return trailingBytesForUTF8[first] + 1;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Exported function to return whether a UTF-8 string is legal or not.
+ * This is not used here; it's just exported.
+ */
+Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
+    while (*source != sourceEnd) {
+        int length = trailingBytesForUTF8[**source] + 1;
+        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
+            return false;
+        *source += length;
+    }
+    return true;
+}
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF8toUTF16 (
+        const UTF8** sourceStart, const UTF8* sourceEnd, 
+        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
+    ConversionResult result = conversionOK;
+    const UTF8* source = *sourceStart;
+    UTF16* target = *targetStart;
+    while (source < sourceEnd) {
+        UTF32 ch = 0;
+        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+        if (extraBytesToRead >= sourceEnd - source) {
+            result = sourceExhausted; break;
+        }
+        /* Do this check whether lenient or strict */
+        if (!isLegalUTF8(source, extraBytesToRead+1)) {
+            result = sourceIllegal;
+            break;
+        }
+        /*
+         * The cases all fall through. See "Note A" below.
+         */
+        switch (extraBytesToRead) {
+            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+            case 3: ch += *source++; ch <<= 6;
+            case 2: ch += *source++; ch <<= 6;
+            case 1: ch += *source++; ch <<= 6;
+            case 0: ch += *source++;
+        }
+        ch -= offsetsFromUTF8[extraBytesToRead];
+
+        if (target >= targetEnd) {
+            source -= (extraBytesToRead+1); /* Back up source pointer! */
+            result = targetExhausted; break;
+        }
+        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
+            /* UTF-16 surrogate values are illegal in UTF-32 */
+            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+                if (flags == strictConversion) {
+                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
+                    result = sourceIllegal;
+                    break;
+                } else {
+                    *target++ = UNI_REPLACEMENT_CHAR;
+                }
+            } else {
+                *target++ = (UTF16)ch; /* normal case */
+            }
+        } else if (ch > UNI_MAX_UTF16) {
+            if (flags == strictConversion) {
+                result = sourceIllegal;
+                source -= (extraBytesToRead+1); /* return to the start */
+                break; /* Bail out; shouldn't continue */
+            } else {
+                *target++ = UNI_REPLACEMENT_CHAR;
+            }
+        } else {
+            /* target is a character in range 0xFFFF - 0x10FFFF. */
+            if (target + 1 >= targetEnd) {
+                source -= (extraBytesToRead+1); /* Back up source pointer! */
+                result = targetExhausted; break;
+            }
+            ch -= halfBase;
+            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
+            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
+        }
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF8toUTF32 (
+        const UTF8** sourceStart, const UTF8* sourceEnd, 
+        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+    ConversionResult result = conversionOK;
+    const UTF8* source = *sourceStart;
+    UTF32* target = *targetStart;
+    while (source < sourceEnd) {
+        UTF32 ch = 0;
+        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+        if (extraBytesToRead >= sourceEnd - source) {
+            result = sourceExhausted; break;
+        }
+        /* Do this check whether lenient or strict */
+        if (!isLegalUTF8(source, extraBytesToRead+1)) {
+            result = sourceIllegal;
+            break;
+        }
+        /*
+         * The cases all fall through. See "Note A" below.
+         */
+        switch (extraBytesToRead) {
+            case 5: ch += *source++; ch <<= 6;
+            case 4: ch += *source++; ch <<= 6;
+            case 3: ch += *source++; ch <<= 6;
+            case 2: ch += *source++; ch <<= 6;
+            case 1: ch += *source++; ch <<= 6;
+            case 0: ch += *source++;
+        }
+        ch -= offsetsFromUTF8[extraBytesToRead];
+
+        if (target >= targetEnd) {
+            source -= (extraBytesToRead+1); /* Back up the source pointer! */
+            result = targetExhausted; break;
+        }
+        if (ch <= UNI_MAX_LEGAL_UTF32) {
+            /*
+             * UTF-16 surrogate values are illegal in UTF-32, and anything
+             * over Plane 17 (> 0x10FFFF) is illegal.
+             */
+            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+                if (flags == strictConversion) {
+                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
+                    result = sourceIllegal;
+                    break;
+                } else {
+                    *target++ = UNI_REPLACEMENT_CHAR;
+                }
+            } else {
+                *target++ = ch;
+            }
+        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
+            result = sourceIllegal;
+            *target++ = UNI_REPLACEMENT_CHAR;
+        }
+    }
+    *sourceStart = source;
+    *targetStart = target;
+    return result;
+}
+
+/* ---------------------------------------------------------------------
+
+    Note A.
+    The fall-through switches in UTF-8 reading code save a
+    temp variable, some decrements & conditionals.  The switches
+    are equivalent to the following loop:
+        {
+            int tmpBytesToRead = extraBytesToRead+1;
+            do {
+                ch += *source++;
+                --tmpBytesToRead;
+                if (tmpBytesToRead) ch <<= 6;
+            } while (tmpBytesToRead > 0);
+        }
+    In UTF-8 writing code, the switches on "bytesToWrite" are
+    similarly unrolled loops.
+
+   --------------------------------------------------------------------- */
diff --git a/contrib/llvm/lib/Support/ConvertUTFWrapper.cpp b/contrib/llvm/lib/Support/ConvertUTFWrapper.cpp
new file mode 100644
index 000000000000..458fbb0b496a
--- /dev/null
+++ b/contrib/llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -0,0 +1,76 @@
+//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ConvertUTF.h"
+
+namespace llvm {
+
+bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
+                       char *&ResultPtr, const UTF8 *&ErrorPtr) {
+  assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4);
+  ConversionResult result = conversionOK;
+  // Copy the character span over.
+  if (WideCharWidth == 1) {
+    const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin());
+    if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {
+      result = sourceIllegal;
+      ErrorPtr = Pos;
+    } else {
+      memcpy(ResultPtr, Source.data(), Source.size());
+      ResultPtr += Source.size();
+    }
+  } else if (WideCharWidth == 2) {
+    const UTF8 *sourceStart = (const UTF8*)Source.data();
+    // FIXME: Make the type of the result buffer correct instead of
+    // using reinterpret_cast.
+    UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
+    ConversionFlags flags = strictConversion;
+    result = ConvertUTF8toUTF16(
+        &sourceStart, sourceStart + Source.size(),
+        &targetStart, targetStart + 2*Source.size(), flags);
+    if (result == conversionOK)
+      ResultPtr = reinterpret_cast<char*>(targetStart);
+    else
+      ErrorPtr = sourceStart;
+  } else if (WideCharWidth == 4) {
+    const UTF8 *sourceStart = (const UTF8*)Source.data();
+    // FIXME: Make the type of the result buffer correct instead of
+    // using reinterpret_cast.
+    UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
+    ConversionFlags flags = strictConversion;
+    result = ConvertUTF8toUTF32(
+        &sourceStart, sourceStart + Source.size(),
+        &targetStart, targetStart + 4*Source.size(), flags);
+    if (result == conversionOK)
+      ResultPtr = reinterpret_cast<char*>(targetStart);
+    else
+      ErrorPtr = sourceStart;
+  }
+  assert((result != targetExhausted)
+         && "ConvertUTF8toUTFXX exhausted target buffer");
+  return result == conversionOK;
+}
+
+bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
+  const UTF32 *SourceStart = &Source;
+  const UTF32 *SourceEnd = SourceStart + 1;
+  UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr);
+  UTF8 *TargetEnd = TargetStart + 4;
+  ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd,
+                                           &TargetStart, TargetEnd,
+                                           strictConversion);
+  if (CR != conversionOK)
+    return false;
+
+  ResultPtr = reinterpret_cast<char*>(TargetStart);
+  return true;
+}
+
+} // end namespace llvm
+
diff --git a/contrib/llvm/lib/Support/CrashRecoveryContext.cpp b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp
new file mode 100644
index 000000000000..182c362cc755
--- /dev/null
+++ b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp
@@ -0,0 +1,346 @@
+//===--- CrashRecoveryContext.cpp - Crash Recovery ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CrashRecoveryContext.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/ThreadLocal.h"
+#include <cstdio>
+#include <setjmp.h>
+using namespace llvm;
+
+namespace {
+
+struct CrashRecoveryContextImpl;
+
+static sys::ThreadLocal<const CrashRecoveryContextImpl> CurrentContext;
+
+struct CrashRecoveryContextImpl {
+  CrashRecoveryContext *CRC;
+  std::string Backtrace;
+  ::jmp_buf JumpBuffer;
+  volatile unsigned Failed : 1;
+
+public:
+  CrashRecoveryContextImpl(CrashRecoveryContext *CRC) : CRC(CRC),
+                                                        Failed(false) {
+    CurrentContext.set(this);
+  }
+  ~CrashRecoveryContextImpl() {
+    CurrentContext.erase();
+  }
+
+  void HandleCrash() {
+    // Eliminate the current context entry, to avoid re-entering in case the
+    // cleanup code crashes.
+    CurrentContext.erase();
+
+    assert(!Failed && "Crash recovery context already failed!");
+    Failed = true;
+
+    // FIXME: Stash the backtrace.
+
+    // Jump back to the RunSafely we were called under.
+    longjmp(JumpBuffer, 1);
+  }
+};
+
+}
+
+static sys::Mutex gCrashRecoveryContexMutex;
+static bool gCrashRecoveryEnabled = false;
+
+static sys::ThreadLocal<const CrashRecoveryContextCleanup> 
+       tlIsRecoveringFromCrash;
+
+CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {}
+
+CrashRecoveryContext::~CrashRecoveryContext() {
+  // Reclaim registered resources.
+  CrashRecoveryContextCleanup *i = head;
+  tlIsRecoveringFromCrash.set(head);
+  while (i) {
+    CrashRecoveryContextCleanup *tmp = i;
+    i = tmp->next;
+    tmp->cleanupFired = true;
+    tmp->recoverResources();
+    delete tmp;
+  }
+  tlIsRecoveringFromCrash.erase();
+  
+  CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl;
+  delete CRCI;
+}
+
+bool CrashRecoveryContext::isRecoveringFromCrash() {
+  return tlIsRecoveringFromCrash.get() != 0;
+}
+
+CrashRecoveryContext *CrashRecoveryContext::GetCurrent() {
+  if (!gCrashRecoveryEnabled)
+    return 0;
+
+  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+  if (!CRCI)
+    return 0;
+
+  return CRCI->CRC;
+}
+
+void CrashRecoveryContext::registerCleanup(CrashRecoveryContextCleanup *cleanup)
+{
+  if (!cleanup)
+    return;
+  if (head)
+    head->prev = cleanup;
+  cleanup->next = head;
+  head = cleanup;
+}
+
+void
+CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
+  if (!cleanup)
+    return;
+  if (cleanup == head) {
+    head = cleanup->next;
+    if (head)
+      head->prev = 0;
+  }
+  else {
+    cleanup->prev->next = cleanup->next;
+    if (cleanup->next)
+      cleanup->next->prev = cleanup->prev;
+  }
+  delete cleanup;
+}
+
+#ifdef LLVM_ON_WIN32
+
+#include "Windows/Windows.h"
+
+// On Windows, we can make use of vectored exception handling to
+// catch most crashing situations.  Note that this does mean
+// we will be alerted of exceptions *before* structured exception
+// handling has the opportunity to catch it.  But that isn't likely
+// to cause problems because nowhere in the project is SEH being
+// used.
+//
+// Vectored exception handling is built on top of SEH, and so it
+// works on a per-thread basis.
+//
+// The vectored exception handler functionality was added in Windows
+// XP, so if support for older versions of Windows is required,
+// it will have to be added.
+//
+// If we want to support as far back as Win2k, we could use the
+// SetUnhandledExceptionFilter API, but there's a risk of that
+// being entirely overwritten (it's not a chain).
+
+static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
+{
+  // Lookup the current thread local recovery object.
+  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+
+  if (!CRCI) {
+    // Something has gone horribly wrong, so let's just tell everyone
+    // to keep searching
+    CrashRecoveryContext::Disable();
+    return EXCEPTION_CONTINUE_SEARCH;
+  }
+
+  // TODO: We can capture the stack backtrace here and store it on the
+  // implementation if we so choose.
+
+  // Handle the crash
+  const_cast<CrashRecoveryContextImpl*>(CRCI)->HandleCrash();
+
+  // Note that we don't actually get here because HandleCrash calls
+  // longjmp, which means the HandleCrash function never returns.
+  llvm_unreachable("Handled the crash, should have longjmp'ed out of here");
+}
+
+// Because the Enable and Disable calls are static, it means that
+// there may not actually be an Impl available, or even a current
+// CrashRecoveryContext at all.  So we make use of a thread-local
+// exception table.  The handles contained in here will either be
+// non-NULL, valid VEH handles, or NULL.
+static sys::ThreadLocal<const void> sCurrentExceptionHandle;
+
+void CrashRecoveryContext::Enable() {
+  sys::ScopedLock L(gCrashRecoveryContexMutex);
+
+  if (gCrashRecoveryEnabled)
+    return;
+
+  gCrashRecoveryEnabled = true;
+
+  // We can set up vectored exception handling now.  We will install our
+  // handler as the front of the list, though there's no assurances that
+  // it will remain at the front (another call could install itself before
+  // our handler).  This 1) isn't likely, and 2) shouldn't cause problems.
+  PVOID handle = ::AddVectoredExceptionHandler(1, ExceptionHandler);
+  sCurrentExceptionHandle.set(handle);
+}
+
+void CrashRecoveryContext::Disable() {
+  sys::ScopedLock L(gCrashRecoveryContexMutex);
+
+  if (!gCrashRecoveryEnabled)
+    return;
+
+  gCrashRecoveryEnabled = false;
+
+  PVOID currentHandle = const_cast<PVOID>(sCurrentExceptionHandle.get());
+  if (currentHandle) {
+    // Now we can remove the vectored exception handler from the chain
+    ::RemoveVectoredExceptionHandler(currentHandle);
+
+    // Reset the handle in our thread-local set.
+    sCurrentExceptionHandle.set(NULL);
+  }
+}
+
+#else
+
+// Generic POSIX implementation.
+//
+// This implementation relies on synchronous signals being delivered to the
+// current thread. We use a thread local object to keep track of the active
+// crash recovery context, and install signal handlers to invoke HandleCrash on
+// the active object.
+//
+// This implementation does not to attempt to chain signal handlers in any
+// reliable fashion -- if we get a signal outside of a crash recovery context we
+// simply disable crash recovery and raise the signal again.
+
+#include <signal.h>
+
+static const int Signals[] = { SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV, SIGTRAP };
+static const unsigned NumSignals = sizeof(Signals) / sizeof(Signals[0]);
+static struct sigaction PrevActions[NumSignals];
+
+static void CrashRecoverySignalHandler(int Signal) {
+  // Lookup the current thread local recovery object.
+  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+
+  if (!CRCI) {
+    // We didn't find a crash recovery context -- this means either we got a
+    // signal on a thread we didn't expect it on, the application got a signal
+    // outside of a crash recovery context, or something else went horribly
+    // wrong.
+    //
+    // Disable crash recovery and raise the signal again. The assumption here is
+    // that the enclosing application will terminate soon, and we won't want to
+    // attempt crash recovery again.
+    //
+    // This call of Disable isn't thread safe, but it doesn't actually matter.
+    CrashRecoveryContext::Disable();
+    raise(Signal);
+
+    // The signal will be thrown once the signal mask is restored.
+    return;
+  }
+
+  // Unblock the signal we received.
+  sigset_t SigMask;
+  sigemptyset(&SigMask);
+  sigaddset(&SigMask, Signal);
+  sigprocmask(SIG_UNBLOCK, &SigMask, 0);
+
+  if (CRCI)
+    const_cast<CrashRecoveryContextImpl*>(CRCI)->HandleCrash();
+}
+
+void CrashRecoveryContext::Enable() {
+  sys::ScopedLock L(gCrashRecoveryContexMutex);
+
+  if (gCrashRecoveryEnabled)
+    return;
+
+  gCrashRecoveryEnabled = true;
+
+  // Setup the signal handler.
+  struct sigaction Handler;
+  Handler.sa_handler = CrashRecoverySignalHandler;
+  Handler.sa_flags = 0;
+  sigemptyset(&Handler.sa_mask);
+
+  for (unsigned i = 0; i != NumSignals; ++i) {
+    sigaction(Signals[i], &Handler, &PrevActions[i]);
+  }
+}
+
+void CrashRecoveryContext::Disable() {
+  sys::ScopedLock L(gCrashRecoveryContexMutex);
+
+  if (!gCrashRecoveryEnabled)
+    return;
+
+  gCrashRecoveryEnabled = false;
+
+  // Restore the previous signal handlers.
+  for (unsigned i = 0; i != NumSignals; ++i)
+    sigaction(Signals[i], &PrevActions[i], 0);
+}
+
+#endif
+
+bool CrashRecoveryContext::RunSafely(void (*Fn)(void*), void *UserData) {
+  // If crash recovery is disabled, do nothing.
+  if (gCrashRecoveryEnabled) {
+    assert(!Impl && "Crash recovery context already initialized!");
+    CrashRecoveryContextImpl *CRCI = new CrashRecoveryContextImpl(this);
+    Impl = CRCI;
+
+    if (setjmp(CRCI->JumpBuffer) != 0) {
+      return false;
+    }
+  }
+
+  Fn(UserData);
+  return true;
+}
+
+void CrashRecoveryContext::HandleCrash() {
+  CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl;
+  assert(CRCI && "Crash recovery context never initialized!");
+  CRCI->HandleCrash();
+}
+
+const std::string &CrashRecoveryContext::getBacktrace() const {
+  CrashRecoveryContextImpl *CRC = (CrashRecoveryContextImpl *) Impl;
+  assert(CRC && "Crash recovery context never initialized!");
+  assert(CRC->Failed && "No crash was detected!");
+  return CRC->Backtrace;
+}
+
+//
+
+namespace {
+struct RunSafelyOnThreadInfo {
+  void (*UserFn)(void*);
+  void *UserData;
+  CrashRecoveryContext *CRC;
+  bool Result;
+};
+}
+
+static void RunSafelyOnThread_Dispatch(void *UserData) {
+  RunSafelyOnThreadInfo *Info =
+    reinterpret_cast<RunSafelyOnThreadInfo*>(UserData);
+  Info->Result = Info->CRC->RunSafely(Info->UserFn, Info->UserData);
+}
+bool CrashRecoveryContext::RunSafelyOnThread(void (*Fn)(void*), void *UserData,
+                                             unsigned RequestedStackSize) {
+  RunSafelyOnThreadInfo Info = { Fn, UserData, this, false };
+  llvm_execute_on_thread(RunSafelyOnThread_Dispatch, &Info, RequestedStackSize);
+  return Info.Result;
+}
diff --git a/contrib/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/contrib/llvm/lib/Support/DAGDeltaAlgorithm.cpp
new file mode 100644
index 000000000000..34e82cf44169
--- /dev/null
+++ b/contrib/llvm/lib/Support/DAGDeltaAlgorithm.cpp
@@ -0,0 +1,360 @@
+//===--- DAGDeltaAlgorithm.cpp - A DAG Minimization Algorithm --*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//===----------------------------------------------------------------------===//
+//
+// The algorithm we use attempts to exploit the dependency information by
+// minimizing top-down. We start by constructing an initial root set R, and
+// then iteratively:
+//
+//   1. Minimize the set R using the test predicate:
+//       P'(S) = P(S union pred*(S))
+//
+//   2. Extend R to R' = R union pred(R).
+//
+// until a fixed point is reached.
+//
+// The idea is that we want to quickly prune entire portions of the graph, so we
+// try to find high-level nodes that can be eliminated with all of their
+// dependents.
+//
+// FIXME: The current algorithm doesn't actually provide a strong guarantee
+// about the minimality of the result. The problem is that after adding nodes to
+// the required set, we no longer consider them for elimination. For strictly
+// well formed predicates, this doesn't happen, but it commonly occurs in
+// practice when there are unmodelled dependencies. I believe we can resolve
+// this by allowing the required set to be minimized as well, but need more test
+// cases first.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DAGDeltaAlgorithm.h"
+#include "llvm/ADT/DeltaAlgorithm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <map>
+using namespace llvm;
+
+namespace {
+
+class DAGDeltaAlgorithmImpl {
+  friend class DeltaActiveSetHelper;
+
+public:
+  typedef DAGDeltaAlgorithm::change_ty change_ty;
+  typedef DAGDeltaAlgorithm::changeset_ty changeset_ty;
+  typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty;
+  typedef DAGDeltaAlgorithm::edge_ty edge_ty;
+
+private:
+  typedef std::vector<change_ty>::iterator pred_iterator_ty;
+  typedef std::vector<change_ty>::iterator succ_iterator_ty;
+  typedef std::set<change_ty>::iterator pred_closure_iterator_ty;
+  typedef std::set<change_ty>::iterator succ_closure_iterator_ty;
+
+  DAGDeltaAlgorithm &DDA;
+
+  const changeset_ty &Changes;
+  const std::vector<edge_ty> &Dependencies;
+
+  std::vector<change_ty> Roots;
+
+  /// Cache of failed test results. Successful test results are never cached
+  /// since we always reduce following a success. We maintain an independent
+  /// cache from that used by the individual delta passes because we may get
+  /// hits across multiple individual delta invocations.
+  mutable std::set<changeset_ty> FailedTestsCache;
+
+  // FIXME: Gross.
+  std::map<change_ty, std::vector<change_ty> > Predecessors;
+  std::map<change_ty, std::vector<change_ty> > Successors;
+
+  std::map<change_ty, std::set<change_ty> > PredClosure;
+  std::map<change_ty, std::set<change_ty> > SuccClosure;
+
+private:
+  pred_iterator_ty pred_begin(change_ty Node) {
+    assert(Predecessors.count(Node) && "Invalid node!");
+    return Predecessors[Node].begin();
+  }
+  pred_iterator_ty pred_end(change_ty Node) {
+    assert(Predecessors.count(Node) && "Invalid node!");
+    return Predecessors[Node].end();
+  }
+
+  pred_closure_iterator_ty pred_closure_begin(change_ty Node) {
+    assert(PredClosure.count(Node) && "Invalid node!");
+    return PredClosure[Node].begin();
+  }
+  pred_closure_iterator_ty pred_closure_end(change_ty Node) {
+    assert(PredClosure.count(Node) && "Invalid node!");
+    return PredClosure[Node].end();
+  }
+  
+  succ_iterator_ty succ_begin(change_ty Node) {
+    assert(Successors.count(Node) && "Invalid node!");
+    return Successors[Node].begin();
+  }
+  succ_iterator_ty succ_end(change_ty Node) {
+    assert(Successors.count(Node) && "Invalid node!");
+    return Successors[Node].end();
+  }
+
+  succ_closure_iterator_ty succ_closure_begin(change_ty Node) {
+    assert(SuccClosure.count(Node) && "Invalid node!");
+    return SuccClosure[Node].begin();
+  }
+  succ_closure_iterator_ty succ_closure_end(change_ty Node) {
+    assert(SuccClosure.count(Node) && "Invalid node!");
+    return SuccClosure[Node].end();
+  }
+
+  void UpdatedSearchState(const changeset_ty &Changes,
+                          const changesetlist_ty &Sets,
+                          const changeset_ty &Required) {
+    DDA.UpdatedSearchState(Changes, Sets, Required);
+  }
+
+  /// ExecuteOneTest - Execute a single test predicate on the change set \p S.
+  bool ExecuteOneTest(const changeset_ty &S) {
+    // Check dependencies invariant.
+    DEBUG({
+        for (changeset_ty::const_iterator it = S.begin(),
+               ie = S.end(); it != ie; ++it)
+          for (succ_iterator_ty it2 = succ_begin(*it),
+                 ie2 = succ_end(*it); it2 != ie2; ++it2)
+            assert(S.count(*it2) && "Attempt to run invalid changeset!");
+      });
+
+    return DDA.ExecuteOneTest(S);
+  }
+
+public:
+  DAGDeltaAlgorithmImpl(DAGDeltaAlgorithm &_DDA,
+                        const changeset_ty &_Changes,
+                        const std::vector<edge_ty> &_Dependencies);
+
+  changeset_ty Run();
+
+  /// GetTestResult - Get the test result for the active set \p Changes with
+  /// \p Required changes from the cache, executing the test if necessary.
+  ///
+  /// \param Changes - The set of active changes being minimized, which should
+  /// have their pred closure included in the test.
+  /// \param Required - The set of changes which have previously been
+  /// established to be required.
+  /// \return - The test result.
+  bool GetTestResult(const changeset_ty &Changes, const changeset_ty &Required);
+};
+
+/// Helper object for minimizing an active set of changes.
+class DeltaActiveSetHelper : public DeltaAlgorithm {
+  DAGDeltaAlgorithmImpl &DDAI;
+
+  const changeset_ty &Required;
+
+protected:
+  /// UpdatedSearchState - Callback used when the search state changes.
+  virtual void UpdatedSearchState(const changeset_ty &Changes,
+                                  const changesetlist_ty &Sets) LLVM_OVERRIDE {
+    DDAI.UpdatedSearchState(Changes, Sets, Required);
+  }
+
+  virtual bool ExecuteOneTest(const changeset_ty &S) LLVM_OVERRIDE {
+    return DDAI.GetTestResult(S, Required);
+  }
+
+public:
+  DeltaActiveSetHelper(DAGDeltaAlgorithmImpl &_DDAI,
+                       const changeset_ty &_Required)
+    : DDAI(_DDAI), Required(_Required) {}
+};
+
+}
+
+DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl(DAGDeltaAlgorithm &_DDA,
+                                             const changeset_ty &_Changes,
+                                             const std::vector<edge_ty>
+                                               &_Dependencies)
+  : DDA(_DDA),
+    Changes(_Changes),
+    Dependencies(_Dependencies)
+{
+  for (changeset_ty::const_iterator it = Changes.begin(),
+         ie = Changes.end(); it != ie; ++it) {
+    Predecessors.insert(std::make_pair(*it, std::vector<change_ty>()));
+    Successors.insert(std::make_pair(*it, std::vector<change_ty>()));
+  }
+  for (std::vector<edge_ty>::const_iterator it = Dependencies.begin(),
+         ie = Dependencies.end(); it != ie; ++it) {
+    Predecessors[it->second].push_back(it->first);
+    Successors[it->first].push_back(it->second);
+  }
+
+  // Compute the roots.
+  for (changeset_ty::const_iterator it = Changes.begin(),
+         ie = Changes.end(); it != ie; ++it)
+    if (succ_begin(*it) == succ_end(*it))
+      Roots.push_back(*it);
+
+  // Pre-compute the closure of the successor relation.
+  std::vector<change_ty> Worklist(Roots.begin(), Roots.end());
+  while (!Worklist.empty()) {
+    change_ty Change = Worklist.back();
+    Worklist.pop_back();
+
+    std::set<change_ty> &ChangeSuccs = SuccClosure[Change];
+    for (pred_iterator_ty it = pred_begin(Change), 
+           ie = pred_end(Change); it != ie; ++it) {
+      SuccClosure[*it].insert(Change);
+      SuccClosure[*it].insert(ChangeSuccs.begin(), ChangeSuccs.end());
+      Worklist.push_back(*it);
+    }
+  }
+
+  // Invert to form the predecessor closure map.
+  for (changeset_ty::const_iterator it = Changes.begin(),
+         ie = Changes.end(); it != ie; ++it)
+    PredClosure.insert(std::make_pair(*it, std::set<change_ty>()));
+  for (changeset_ty::const_iterator it = Changes.begin(),
+         ie = Changes.end(); it != ie; ++it)
+    for (succ_closure_iterator_ty it2 = succ_closure_begin(*it),
+           ie2 = succ_closure_end(*it); it2 != ie2; ++it2)
+      PredClosure[*it2].insert(*it);
+  
+  // Dump useful debug info.
+  DEBUG({
+      llvm::errs() << "-- DAGDeltaAlgorithmImpl --\n";
+      llvm::errs() << "Changes: [";
+      for (changeset_ty::const_iterator it = Changes.begin(),
+             ie = Changes.end(); it != ie; ++it) {
+        if (it != Changes.begin()) llvm::errs() << ", ";
+        llvm::errs() << *it;
+
+        if (succ_begin(*it) != succ_end(*it)) {
+          llvm::errs() << "(";
+          for (succ_iterator_ty it2 = succ_begin(*it),
+                 ie2 = succ_end(*it); it2 != ie2; ++it2) {
+            if (it2 != succ_begin(*it)) llvm::errs() << ", ";
+            llvm::errs() << "->" << *it2;
+          }
+          llvm::errs() << ")";
+        }
+      }
+      llvm::errs() << "]\n";
+
+      llvm::errs() << "Roots: [";
+      for (std::vector<change_ty>::const_iterator it = Roots.begin(),
+             ie = Roots.end(); it != ie; ++it) {
+        if (it != Roots.begin()) llvm::errs() << ", ";
+        llvm::errs() << *it;
+      }
+      llvm::errs() << "]\n";
+
+      llvm::errs() << "Predecessor Closure:\n";
+      for (changeset_ty::const_iterator it = Changes.begin(),
+             ie = Changes.end(); it != ie; ++it) {
+        llvm::errs() << format("  %-4d: [", *it);
+        for (pred_closure_iterator_ty it2 = pred_closure_begin(*it),
+               ie2 = pred_closure_end(*it); it2 != ie2; ++it2) {
+          if (it2 != pred_closure_begin(*it)) llvm::errs() << ", ";
+          llvm::errs() << *it2;
+        }
+        llvm::errs() << "]\n";
+      }
+      
+      llvm::errs() << "Successor Closure:\n";
+      for (changeset_ty::const_iterator it = Changes.begin(),
+             ie = Changes.end(); it != ie; ++it) {
+        llvm::errs() << format("  %-4d: [", *it);
+        for (succ_closure_iterator_ty it2 = succ_closure_begin(*it),
+               ie2 = succ_closure_end(*it); it2 != ie2; ++it2) {
+          if (it2 != succ_closure_begin(*it)) llvm::errs() << ", ";
+          llvm::errs() << *it2;
+        }
+        llvm::errs() << "]\n";
+      }
+
+      llvm::errs() << "\n\n";
+    });
+}
+
+bool DAGDeltaAlgorithmImpl::GetTestResult(const changeset_ty &Changes,
+                                          const changeset_ty &Required) {
+  changeset_ty Extended(Required);
+  Extended.insert(Changes.begin(), Changes.end());
+  for (changeset_ty::const_iterator it = Changes.begin(),
+         ie = Changes.end(); it != ie; ++it)
+    Extended.insert(pred_closure_begin(*it), pred_closure_end(*it));
+
+  if (FailedTestsCache.count(Extended))
+    return false;
+
+  bool Result = ExecuteOneTest(Extended);
+  if (!Result)
+    FailedTestsCache.insert(Extended);
+
+  return Result;
+}
+
+DAGDeltaAlgorithm::changeset_ty
+DAGDeltaAlgorithmImpl::Run() {
+  // The current set of changes we are minimizing, starting at the roots.
+  changeset_ty CurrentSet(Roots.begin(), Roots.end());
+
+  // The set of required changes.
+  changeset_ty Required;
+
+  // Iterate until the active set of changes is empty. Convergence is guaranteed
+  // assuming input was a DAG.
+  //
+  // Invariant:  CurrentSet intersect Required == {}
+  // Invariant:  Required == (Required union succ*(Required))
+  while (!CurrentSet.empty()) {
+    DEBUG({
+        llvm::errs() << "DAG_DD - " << CurrentSet.size() << " active changes, "
+                     << Required.size() << " required changes\n";
+      });
+
+    // Minimize the current set of changes.
+    DeltaActiveSetHelper Helper(*this, Required);
+    changeset_ty CurrentMinSet = Helper.Run(CurrentSet);
+
+    // Update the set of required changes. Since
+    //   CurrentMinSet subset CurrentSet
+    // and after the last iteration,
+    //   succ(CurrentSet) subset Required
+    // then
+    //   succ(CurrentMinSet) subset Required
+    // and our invariant on Required is maintained.
+    Required.insert(CurrentMinSet.begin(), CurrentMinSet.end());
+
+    // Replace the current set with the predecssors of the minimized set of
+    // active changes.
+    CurrentSet.clear();
+    for (changeset_ty::const_iterator it = CurrentMinSet.begin(),
+           ie = CurrentMinSet.end(); it != ie; ++it)
+      CurrentSet.insert(pred_begin(*it), pred_end(*it));
+
+    // FIXME: We could enforce CurrentSet intersect Required == {} here if we
+    // wanted to protect against cyclic graphs.
+  }
+
+  return Required;
+}
+
+void DAGDeltaAlgorithm::anchor() {
+}
+
+DAGDeltaAlgorithm::changeset_ty
+DAGDeltaAlgorithm::Run(const changeset_ty &Changes,
+                       const std::vector<edge_ty> &Dependencies) {
+  return DAGDeltaAlgorithmImpl(*this, Changes, Dependencies).Run();
+}
diff --git a/contrib/llvm/lib/Support/DataExtractor.cpp b/contrib/llvm/lib/Support/DataExtractor.cpp
new file mode 100644
index 000000000000..3d5cce05358c
--- /dev/null
+++ b/contrib/llvm/lib/Support/DataExtractor.cpp
@@ -0,0 +1,175 @@
+//===-- DataExtractor.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/SwapByteOrder.h"
+using namespace llvm;
+
+template <typename T>
+static T getU(uint32_t *offset_ptr, const DataExtractor *de,
+              bool isLittleEndian, const char *Data) {
+  T val = 0;
+  uint32_t offset = *offset_ptr;
+  if (de->isValidOffsetForDataOfSize(offset, sizeof(val))) {
+    std::memcpy(&val, &Data[offset], sizeof(val));
+    if (sys::isLittleEndianHost() != isLittleEndian)
+      val = sys::SwapByteOrder(val);
+
+    // Advance the offset
+    *offset_ptr += sizeof(val);
+  }
+  return val;
+}
+
+template <typename T>
+static T *getUs(uint32_t *offset_ptr, T *dst, uint32_t count,
+                const DataExtractor *de, bool isLittleEndian, const char *Data){
+  uint32_t offset = *offset_ptr;
+
+  if (count > 0 && de->isValidOffsetForDataOfSize(offset, sizeof(*dst)*count)) {
+    for (T *value_ptr = dst, *end = dst + count; value_ptr != end;
+        ++value_ptr, offset += sizeof(*dst))
+      *value_ptr = getU<T>(offset_ptr, de, isLittleEndian, Data);
+    // Advance the offset
+    *offset_ptr = offset;
+    // Return a non-NULL pointer to the converted data as an indicator of
+    // success
+    return dst;
+  }
+  return NULL;
+}
+
+uint8_t DataExtractor::getU8(uint32_t *offset_ptr) const {
+  return getU<uint8_t>(offset_ptr, this, IsLittleEndian, Data.data());
+}
+
+uint8_t *
+DataExtractor::getU8(uint32_t *offset_ptr, uint8_t *dst, uint32_t count) const {
+  return getUs<uint8_t>(offset_ptr, dst, count, this, IsLittleEndian,
+                       Data.data());
+}
+
+
+uint16_t DataExtractor::getU16(uint32_t *offset_ptr) const {
+  return getU<uint16_t>(offset_ptr, this, IsLittleEndian, Data.data());
+}
+
+uint16_t *DataExtractor::getU16(uint32_t *offset_ptr, uint16_t *dst,
+                                uint32_t count) const {
+  return getUs<uint16_t>(offset_ptr, dst, count, this, IsLittleEndian,
+                        Data.data());
+}
+
+uint32_t DataExtractor::getU32(uint32_t *offset_ptr) const {
+  return getU<uint32_t>(offset_ptr, this, IsLittleEndian, Data.data());
+}
+
+uint32_t *DataExtractor::getU32(uint32_t *offset_ptr, uint32_t *dst,
+                                uint32_t count) const {
+  return getUs<uint32_t>(offset_ptr, dst, count, this, IsLittleEndian,
+                        Data.data());
+}
+
+uint64_t DataExtractor::getU64(uint32_t *offset_ptr) const {
+  return getU<uint64_t>(offset_ptr, this, IsLittleEndian, Data.data());
+}
+
+uint64_t *DataExtractor::getU64(uint32_t *offset_ptr, uint64_t *dst,
+                                uint32_t count) const {
+  return getUs<uint64_t>(offset_ptr, dst, count, this, IsLittleEndian,
+                        Data.data());
+}
+
+uint64_t
+DataExtractor::getUnsigned(uint32_t *offset_ptr, uint32_t byte_size) const {
+  switch (byte_size) {
+  case 1:
+    return getU8(offset_ptr);
+  case 2:
+    return getU16(offset_ptr);
+  case 4:
+    return getU32(offset_ptr);
+  case 8:
+    return getU64(offset_ptr);
+  }
+  llvm_unreachable("getUnsigned unhandled case!");
+}
+
+int64_t
+DataExtractor::getSigned(uint32_t *offset_ptr, uint32_t byte_size) const {
+  switch (byte_size) {
+  case 1:
+    return (int8_t)getU8(offset_ptr);
+  case 2:
+    return (int16_t)getU16(offset_ptr);
+  case 4:
+    return (int32_t)getU32(offset_ptr);
+  case 8:
+    return (int64_t)getU64(offset_ptr);
+  }
+  llvm_unreachable("getSigned unhandled case!");
+}
+
+const char *DataExtractor::getCStr(uint32_t *offset_ptr) const {
+  uint32_t offset = *offset_ptr;
+  StringRef::size_type pos = Data.find('\0', offset);
+  if (pos != StringRef::npos) {
+    *offset_ptr = pos + 1;
+    return Data.data() + offset;
+  }
+  return NULL;
+}
+
+uint64_t DataExtractor::getULEB128(uint32_t *offset_ptr) const {
+  uint64_t result = 0;
+  if (Data.empty())
+    return 0;
+
+  unsigned shift = 0;
+  uint32_t offset = *offset_ptr;
+  uint8_t byte = 0;
+
+  while (isValidOffset(offset)) {
+    byte = Data[offset++];
+    result |= uint64_t(byte & 0x7f) << shift;
+    shift += 7;
+    if ((byte & 0x80) == 0)
+      break;
+  }
+
+  *offset_ptr = offset;
+  return result;
+}
+
+int64_t DataExtractor::getSLEB128(uint32_t *offset_ptr) const {
+  int64_t result = 0;
+  if (Data.empty())
+    return 0;
+
+  unsigned shift = 0;
+  uint32_t offset = *offset_ptr;
+  uint8_t byte = 0;
+
+  while (isValidOffset(offset)) {
+    byte = Data[offset++];
+    result |= uint64_t(byte & 0x7f) << shift;
+    shift += 7;
+    if ((byte & 0x80) == 0)
+      break;
+  }
+
+  // Sign bit of byte is 2nd high order bit (0x40)
+  if (shift < 64 && (byte & 0x40))
+    result |= -(1ULL << shift);
+
+  *offset_ptr = offset;
+  return result;
+}
diff --git a/contrib/llvm/lib/Support/DataStream.cpp b/contrib/llvm/lib/Support/DataStream.cpp
new file mode 100644
index 000000000000..0a02281c2549
--- /dev/null
+++ b/contrib/llvm/lib/Support/DataStream.cpp
@@ -0,0 +1,98 @@
+//===--- llvm/Support/DataStream.cpp - Lazy streamed data -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements DataStreamer, which fetches bytes of Data from
+// a stream source. It provides support for streaming (lazy reading) of
+// bitcode. An example implementation of streaming from a file or stdin
+// is included.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "Data-stream"
+#include "llvm/Support/DataStream.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/system_error.h"
+#include <cerrno>
+#include <cstdio>
+#include <string>
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+#include <fcntl.h>
+using namespace llvm;
+
+// Interface goals:
+// * StreamableMemoryObject doesn't care about complexities like using
+//   threads/async callbacks to actually overlap download+compile
+// * Don't want to duplicate Data in memory
+// * Don't need to know total Data len in advance
+// Non-goals:
+// StreamableMemoryObject already has random access so this interface only does
+// in-order streaming (no arbitrary seeking, else we'd have to buffer all the
+// Data here in addition to MemoryObject).  This also means that if we want
+// to be able to to free Data, BitstreamBytes/BitcodeReader will implement it
+
+STATISTIC(NumStreamFetches, "Number of calls to Data stream fetch");
+
+namespace llvm {
+DataStreamer::~DataStreamer() {}
+}
+
+namespace {
+
+// Very simple stream backed by a file. Mostly useful for stdin and debugging;
+// actual file access is probably still best done with mmap.
+class DataFileStreamer : public DataStreamer {
+ int Fd;
+public:
+  DataFileStreamer() : Fd(0) {}
+  virtual ~DataFileStreamer() {
+    close(Fd);
+  }
+  virtual size_t GetBytes(unsigned char *buf, size_t len) LLVM_OVERRIDE {
+    NumStreamFetches++;
+    return read(Fd, buf, len);
+  }
+
+  error_code OpenFile(const std::string &Filename) {
+    if (Filename == "-") {
+      Fd = 0;
+      sys::Program::ChangeStdinToBinary();
+      return error_code::success();
+    }
+  
+    int OpenFlags = O_RDONLY;
+#ifdef O_BINARY
+    OpenFlags |= O_BINARY;  // Open input file in binary mode on win32.
+#endif
+    Fd = ::open(Filename.c_str(), OpenFlags);
+    if (Fd == -1)
+      return error_code(errno, posix_category());
+    return error_code::success();
+  }
+};
+
+}
+
+namespace llvm {
+DataStreamer *getDataFileStreamer(const std::string &Filename,
+                                  std::string *StrError) {
+  DataFileStreamer *s = new DataFileStreamer();
+  if (error_code e = s->OpenFile(Filename)) {
+    *StrError = std::string("Could not open ") + Filename + ": " +
+        e.message() + "\n";
+    return NULL;
+  }
+  return s;
+}
+
+}
diff --git a/contrib/llvm/lib/Support/Debug.cpp b/contrib/llvm/lib/Support/Debug.cpp
new file mode 100644
index 000000000000..d9cb8a9da815
--- /dev/null
+++ b/contrib/llvm/lib/Support/Debug.cpp
@@ -0,0 +1,134 @@
+//===-- Debug.cpp - An easy way to add debug output to your code ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a handy way of adding debugging information to your
+// code, without it being enabled all of the time, and without having to add
+// command line options to enable it.
+//
+// In particular, just wrap your code with the DEBUG() macro, and it will be
+// enabled automatically if you specify '-debug' on the command-line.
+// Alternatively, you can also use the SET_DEBUG_TYPE("foo") macro to specify
+// that your debug code belongs to class "foo".  Then, on the command line, you
+// can specify '-debug-only=foo' to enable JUST the debug information for the
+// foo class.
+//
+// When compiling without assertions, the -debug-* options and all code in
+// DEBUG() statements disappears, so it does not affect the runtime of the code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/circular_raw_ostream.h"
+
+using namespace llvm;
+
+// All Debug.h functionality is a no-op in NDEBUG mode.
+#ifndef NDEBUG
+bool llvm::DebugFlag;  // DebugFlag - Exported boolean set by the -debug option
+
+// -debug - Command line option to enable the DEBUG statements in the passes.
+// This flag may only be enabled in debug builds.
+static cl::opt<bool, true>
+Debug("debug", cl::desc("Enable debug output"), cl::Hidden,
+      cl::location(DebugFlag));
+
+// -debug-buffer-size - Buffer the last N characters of debug output
+//until program termination.
+static cl::opt<unsigned>
+DebugBufferSize("debug-buffer-size",
+                cl::desc("Buffer the last N characters of debug output "
+                         "until program termination. "
+                         "[default 0 -- immediate print-out]"),
+                cl::Hidden,
+                cl::init(0));
+
+static std::string CurrentDebugType;
+
+namespace {
+
+struct DebugOnlyOpt {
+  void operator=(const std::string &Val) const {
+    DebugFlag |= !Val.empty();
+    CurrentDebugType = Val;
+  }
+};
+
+}
+
+static DebugOnlyOpt DebugOnlyOptLoc;
+
+static cl::opt<DebugOnlyOpt, true, cl::parser<std::string> >
+DebugOnly("debug-only", cl::desc("Enable a specific type of debug output"),
+          cl::Hidden, cl::value_desc("debug string"),
+          cl::location(DebugOnlyOptLoc), cl::ValueRequired);
+
+// Signal handlers - dump debug output on termination.
+static void debug_user_sig_handler(void *Cookie) {
+  // This is a bit sneaky.  Since this is under #ifndef NDEBUG, we
+  // know that debug mode is enabled and dbgs() really is a
+  // circular_raw_ostream.  If NDEBUG is defined, then dbgs() ==
+  // errs() but this will never be invoked.
+  llvm::circular_raw_ostream *dbgout =
+    static_cast<llvm::circular_raw_ostream *>(&llvm::dbgs());
+  dbgout->flushBufferWithBanner();
+}
+
+// isCurrentDebugType - Return true if the specified string is the debug type
+// specified on the command line, or if none was specified on the command line
+// with the -debug-only=X option.
+//
+bool llvm::isCurrentDebugType(const char *DebugType) {
+  return CurrentDebugType.empty() || DebugType == CurrentDebugType;
+}
+
+/// setCurrentDebugType - Set the current debug type, as if the -debug-only=X
+/// option were specified.  Note that DebugFlag also needs to be set to true for
+/// debug output to be produced.
+///
+void llvm::setCurrentDebugType(const char *Type) {
+  CurrentDebugType = Type;
+}
+
+/// dbgs - Return a circular-buffered debug stream.
+raw_ostream &llvm::dbgs() {
+  // Do one-time initialization in a thread-safe way.
+  static struct dbgstream {
+    circular_raw_ostream strm;
+
+    dbgstream() :
+        strm(errs(), "*** Debug Log Output ***\n",
+             (!EnableDebugBuffering || !DebugFlag) ? 0 : DebugBufferSize) {
+      if (EnableDebugBuffering && DebugFlag && DebugBufferSize != 0)
+        // TODO: Add a handler for SIGUSER1-type signals so the user can
+        // force a debug dump.
+        sys::AddSignalHandler(&debug_user_sig_handler, 0);
+      // Otherwise we've already set the debug stream buffer size to
+      // zero, disabling buffering so it will output directly to errs().
+    }
+  } thestrm;
+
+  return thestrm.strm;
+}
+
+#else
+// Avoid "has no symbols" warning.
+namespace llvm {
+  /// dbgs - Return errs().
+  raw_ostream &dbgs() {
+    return errs();
+  }
+}
+
+#endif
+
+/// EnableDebugBuffering - Turn on signal handler installation.
+///
+bool llvm::EnableDebugBuffering = false;
diff --git a/contrib/llvm/lib/Support/DeltaAlgorithm.cpp b/contrib/llvm/lib/Support/DeltaAlgorithm.cpp
new file mode 100644
index 000000000000..9e52874de832
--- /dev/null
+++ b/contrib/llvm/lib/Support/DeltaAlgorithm.cpp
@@ -0,0 +1,114 @@
+//===--- DeltaAlgorithm.cpp - A Set Minimization Algorithm -----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DeltaAlgorithm.h"
+#include <algorithm>
+#include <iterator>
+using namespace llvm;
+
+DeltaAlgorithm::~DeltaAlgorithm() {
+}
+
+bool DeltaAlgorithm::GetTestResult(const changeset_ty &Changes) {
+  if (FailedTestsCache.count(Changes))
+    return false;
+
+  bool Result = ExecuteOneTest(Changes);
+  if (!Result)
+    FailedTestsCache.insert(Changes);
+
+  return Result;
+}
+
+void DeltaAlgorithm::Split(const changeset_ty &S, changesetlist_ty &Res) {
+  // FIXME: Allow clients to provide heuristics for improved splitting.
+
+  // FIXME: This is really slow.
+  changeset_ty LHS, RHS;
+  unsigned idx = 0, N = S.size() / 2;
+  for (changeset_ty::const_iterator it = S.begin(),
+         ie = S.end(); it != ie; ++it, ++idx)
+    ((idx < N) ? LHS : RHS).insert(*it);
+  if (!LHS.empty())
+    Res.push_back(LHS);
+  if (!RHS.empty())
+    Res.push_back(RHS);
+}
+
+DeltaAlgorithm::changeset_ty
+DeltaAlgorithm::Delta(const changeset_ty &Changes,
+                      const changesetlist_ty &Sets) {
+  // Invariant: union(Res) == Changes
+  UpdatedSearchState(Changes, Sets);
+
+  // If there is nothing left we can remove, we are done.
+  if (Sets.size() <= 1)
+    return Changes;
+
+  // Look for a passing subset.
+  changeset_ty Res;
+  if (Search(Changes, Sets, Res))
+    return Res;
+
+  // Otherwise, partition the sets if possible; if not we are done.
+  changesetlist_ty SplitSets;
+  for (changesetlist_ty::const_iterator it = Sets.begin(),
+         ie = Sets.end(); it != ie; ++it)
+    Split(*it, SplitSets);
+  if (SplitSets.size() == Sets.size())
+    return Changes;
+
+  return Delta(Changes, SplitSets);
+}
+
+bool DeltaAlgorithm::Search(const changeset_ty &Changes,
+                            const changesetlist_ty &Sets,
+                            changeset_ty &Res) {
+  // FIXME: Parallelize.
+  for (changesetlist_ty::const_iterator it = Sets.begin(),
+         ie = Sets.end(); it != ie; ++it) {
+    // If the test passes on this subset alone, recurse.
+    if (GetTestResult(*it)) {
+      changesetlist_ty Sets;
+      Split(*it, Sets);
+      Res = Delta(*it, Sets);
+      return true;
+    }
+
+    // Otherwise, if we have more than two sets, see if test passes on the
+    // complement.
+    if (Sets.size() > 2) {
+      // FIXME: This is really slow.
+      changeset_ty Complement;
+      std::set_difference(
+        Changes.begin(), Changes.end(), it->begin(), it->end(),
+        std::insert_iterator<changeset_ty>(Complement, Complement.begin()));
+      if (GetTestResult(Complement)) {
+        changesetlist_ty ComplementSets;
+        ComplementSets.insert(ComplementSets.end(), Sets.begin(), it);
+        ComplementSets.insert(ComplementSets.end(), it + 1, Sets.end());
+        Res = Delta(Complement, ComplementSets);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+DeltaAlgorithm::changeset_ty DeltaAlgorithm::Run(const changeset_ty &Changes) {
+  // Check empty set first to quickly find poor test functions.
+  if (GetTestResult(changeset_ty()))
+    return changeset_ty();
+
+  // Otherwise run the real delta algorithm.
+  changesetlist_ty Sets;
+  Split(Changes, Sets);
+
+  return Delta(Changes, Sets);
+}
diff --git a/contrib/llvm/lib/Support/Disassembler.cpp b/contrib/llvm/lib/Support/Disassembler.cpp
new file mode 100644
index 000000000000..b3244fab7df7
--- /dev/null
+++ b/contrib/llvm/lib/Support/Disassembler.cpp
@@ -0,0 +1,74 @@
+//===- lib/Support/Disassembler.cpp -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the necessary glue to call external disassembler
+// libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Disassembler.h"
+#include "llvm/Config/config.h"
+#include <cassert>
+#include <iomanip>
+#include <sstream>
+#include <string>
+
+#if USE_UDIS86
+#include <udis86.h>
+#endif
+
+using namespace llvm;
+
+bool llvm::sys::hasDisassembler()
+{
+#if defined (__i386__) || defined (__amd64__) || defined (__x86_64__)
+  // We have option to enable udis86 library.
+# if USE_UDIS86
+  return true;
+#else
+  return false;
+#endif
+#else
+  return false;
+#endif
+}
+
+std::string llvm::sys::disassembleBuffer(uint8_t* start, size_t length,
+                                         uint64_t pc) {
+  std::stringstream res;
+
+#if (defined (__i386__) || defined (__amd64__) || defined (__x86_64__)) \
+  && USE_UDIS86
+  unsigned bits;
+# if defined(__i386__)
+  bits = 32;
+# else
+  bits = 64;
+# endif
+
+  ud_t ud_obj;
+
+  ud_init(&ud_obj);
+  ud_set_input_buffer(&ud_obj, start, length);
+  ud_set_mode(&ud_obj, bits);
+  ud_set_pc(&ud_obj, pc);
+  ud_set_syntax(&ud_obj, UD_SYN_ATT);
+
+  res << std::setbase(16)
+      << std::setw(bits/4);
+
+  while (ud_disassemble(&ud_obj)) {
+    res << ud_insn_off(&ud_obj) << ":\t" << ud_insn_asm(&ud_obj) << "\n";
+  }
+#else
+  res << "No disassembler available. See configure help for options.\n";
+#endif
+
+  return res.str();
+}
diff --git a/contrib/llvm/lib/Support/Dwarf.cpp b/contrib/llvm/lib/Support/Dwarf.cpp
new file mode 100644
index 000000000000..0f91c11ac260
--- /dev/null
+++ b/contrib/llvm/lib/Support/Dwarf.cpp
@@ -0,0 +1,724 @@
+//===-- llvm/Support/Dwarf.cpp - Dwarf Framework ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for generic dwarf information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Dwarf.h"
+using namespace llvm;
+using namespace dwarf;
+
+/// TagString - Return the string for the specified tag.
+///
+const char *llvm::dwarf::TagString(unsigned Tag) {
+  switch (Tag) {
+  case DW_TAG_array_type:                return "DW_TAG_array_type";
+  case DW_TAG_class_type:                return "DW_TAG_class_type";
+  case DW_TAG_entry_point:               return "DW_TAG_entry_point";
+  case DW_TAG_enumeration_type:          return "DW_TAG_enumeration_type";
+  case DW_TAG_formal_parameter:          return "DW_TAG_formal_parameter";
+  case DW_TAG_imported_declaration:      return "DW_TAG_imported_declaration";
+  case DW_TAG_label:                     return "DW_TAG_label";
+  case DW_TAG_lexical_block:             return "DW_TAG_lexical_block";
+  case DW_TAG_member:                    return "DW_TAG_member";
+  case DW_TAG_pointer_type:              return "DW_TAG_pointer_type";
+  case DW_TAG_reference_type:            return "DW_TAG_reference_type";
+  case DW_TAG_compile_unit:              return "DW_TAG_compile_unit";
+  case DW_TAG_string_type:               return "DW_TAG_string_type";
+  case DW_TAG_structure_type:            return "DW_TAG_structure_type";
+  case DW_TAG_subroutine_type:           return "DW_TAG_subroutine_type";
+  case DW_TAG_typedef:                   return "DW_TAG_typedef";
+  case DW_TAG_union_type:                return "DW_TAG_union_type";
+  case DW_TAG_unspecified_parameters:    return "DW_TAG_unspecified_parameters";
+  case DW_TAG_variant:                   return "DW_TAG_variant";
+  case DW_TAG_common_block:              return "DW_TAG_common_block";
+  case DW_TAG_common_inclusion:          return "DW_TAG_common_inclusion";
+  case DW_TAG_inheritance:               return "DW_TAG_inheritance";
+  case DW_TAG_inlined_subroutine:        return "DW_TAG_inlined_subroutine";
+  case DW_TAG_module:                    return "DW_TAG_module";
+  case DW_TAG_ptr_to_member_type:        return "DW_TAG_ptr_to_member_type";
+  case DW_TAG_set_type:                  return "DW_TAG_set_type";
+  case DW_TAG_subrange_type:             return "DW_TAG_subrange_type";
+  case DW_TAG_with_stmt:                 return "DW_TAG_with_stmt";
+  case DW_TAG_access_declaration:        return "DW_TAG_access_declaration";
+  case DW_TAG_base_type:                 return "DW_TAG_base_type";
+  case DW_TAG_catch_block:               return "DW_TAG_catch_block";
+  case DW_TAG_const_type:                return "DW_TAG_const_type";
+  case DW_TAG_constant:                  return "DW_TAG_constant";
+  case DW_TAG_enumerator:                return "DW_TAG_enumerator";
+  case DW_TAG_file_type:                 return "DW_TAG_file_type";
+  case DW_TAG_friend:                    return "DW_TAG_friend";
+  case DW_TAG_namelist:                  return "DW_TAG_namelist";
+  case DW_TAG_namelist_item:             return "DW_TAG_namelist_item";
+  case DW_TAG_packed_type:               return "DW_TAG_packed_type";
+  case DW_TAG_subprogram:                return "DW_TAG_subprogram";
+  case DW_TAG_template_type_parameter:  return "DW_TAG_template_type_parameter";
+  case DW_TAG_template_value_parameter:return "DW_TAG_template_value_parameter";
+  case DW_TAG_thrown_type:               return "DW_TAG_thrown_type";
+  case DW_TAG_try_block:                 return "DW_TAG_try_block";
+  case DW_TAG_variant_part:              return "DW_TAG_variant_part";
+  case DW_TAG_variable:                  return "DW_TAG_variable";
+  case DW_TAG_volatile_type:             return "DW_TAG_volatile_type";
+  case DW_TAG_dwarf_procedure:           return "DW_TAG_dwarf_procedure";
+  case DW_TAG_restrict_type:             return "DW_TAG_restrict_type";
+  case DW_TAG_interface_type:            return "DW_TAG_interface_type";
+  case DW_TAG_namespace:                 return "DW_TAG_namespace";
+  case DW_TAG_imported_module:           return "DW_TAG_imported_module";
+  case DW_TAG_unspecified_type:          return "DW_TAG_unspecified_type";
+  case DW_TAG_partial_unit:              return "DW_TAG_partial_unit";
+  case DW_TAG_imported_unit:             return "DW_TAG_imported_unit";
+  case DW_TAG_condition:                 return "DW_TAG_condition";
+  case DW_TAG_shared_type:               return "DW_TAG_shared_type";
+  case DW_TAG_lo_user:                   return "DW_TAG_lo_user";
+  case DW_TAG_hi_user:                   return "DW_TAG_hi_user";
+  case DW_TAG_auto_variable:             return "DW_TAG_auto_variable";
+  case DW_TAG_arg_variable:              return "DW_TAG_arg_variable";
+  case DW_TAG_rvalue_reference_type:     return "DW_TAG_rvalue_reference_type";
+  case DW_TAG_template_alias:            return "DW_TAG_template_alias";
+  case DW_TAG_MIPS_loop:                 return "DW_TAG_MIPS_loop";
+  case DW_TAG_type_unit:                 return "DW_TAG_type_unit";
+  case DW_TAG_format_label:              return "DW_TAG_format_label";
+  case DW_TAG_function_template:         return "DW_TAG_function_template";
+  case DW_TAG_class_template:            return "DW_TAG_class_template";
+  case DW_TAG_GNU_template_template_param:
+    return "DW_TAG_GNU_template_template_param";
+  case DW_TAG_GNU_template_parameter_pack:
+    return "DW_TAG_GNU_template_parameter_pack";
+  case DW_TAG_GNU_formal_parameter_pack:
+    return "DW_TAG_GNU_formal_parameter_pack";
+  case DW_TAG_APPLE_property:            return "DW_TAG_APPLE_property";
+  }
+  return 0;
+}
+
+/// ChildrenString - Return the string for the specified children flag.
+///
+const char *llvm::dwarf::ChildrenString(unsigned Children) {
+  switch (Children) {
+  case DW_CHILDREN_no:                   return "DW_CHILDREN_no";
+  case DW_CHILDREN_yes:                  return "DW_CHILDREN_yes";
+  }
+  return 0;
+}
+
+/// AttributeString - Return the string for the specified attribute.
+///
+const char *llvm::dwarf::AttributeString(unsigned Attribute) {
+  switch (Attribute) {
+  case DW_AT_sibling:                    return "DW_AT_sibling";
+  case DW_AT_location:                   return "DW_AT_location";
+  case DW_AT_name:                       return "DW_AT_name";
+  case DW_AT_ordering:                   return "DW_AT_ordering";
+  case DW_AT_byte_size:                  return "DW_AT_byte_size";
+  case DW_AT_bit_offset:                 return "DW_AT_bit_offset";
+  case DW_AT_bit_size:                   return "DW_AT_bit_size";
+  case DW_AT_stmt_list:                  return "DW_AT_stmt_list";
+  case DW_AT_low_pc:                     return "DW_AT_low_pc";
+  case DW_AT_high_pc:                    return "DW_AT_high_pc";
+  case DW_AT_language:                   return "DW_AT_language";
+  case DW_AT_discr:                      return "DW_AT_discr";
+  case DW_AT_discr_value:                return "DW_AT_discr_value";
+  case DW_AT_visibility:                 return "DW_AT_visibility";
+  case DW_AT_import:                     return "DW_AT_import";
+  case DW_AT_string_length:              return "DW_AT_string_length";
+  case DW_AT_common_reference:           return "DW_AT_common_reference";
+  case DW_AT_comp_dir:                   return "DW_AT_comp_dir";
+  case DW_AT_const_value:                return "DW_AT_const_value";
+  case DW_AT_containing_type:            return "DW_AT_containing_type";
+  case DW_AT_default_value:              return "DW_AT_default_value";
+  case DW_AT_inline:                     return "DW_AT_inline";
+  case DW_AT_is_optional:                return "DW_AT_is_optional";
+  case DW_AT_lower_bound:                return "DW_AT_lower_bound";
+  case DW_AT_producer:                   return "DW_AT_producer";
+  case DW_AT_prototyped:                 return "DW_AT_prototyped";
+  case DW_AT_return_addr:                return "DW_AT_return_addr";
+  case DW_AT_start_scope:                return "DW_AT_start_scope";
+  case DW_AT_bit_stride:                 return "DW_AT_bit_stride";
+  case DW_AT_upper_bound:                return "DW_AT_upper_bound";
+  case DW_AT_abstract_origin:            return "DW_AT_abstract_origin";
+  case DW_AT_accessibility:              return "DW_AT_accessibility";
+  case DW_AT_address_class:              return "DW_AT_address_class";
+  case DW_AT_artificial:                 return "DW_AT_artificial";
+  case DW_AT_base_types:                 return "DW_AT_base_types";
+  case DW_AT_calling_convention:         return "DW_AT_calling_convention";
+  case DW_AT_count:                      return "DW_AT_count";
+  case DW_AT_data_member_location:       return "DW_AT_data_member_location";
+  case DW_AT_decl_column:                return "DW_AT_decl_column";
+  case DW_AT_decl_file:                  return "DW_AT_decl_file";
+  case DW_AT_decl_line:                  return "DW_AT_decl_line";
+  case DW_AT_declaration:                return "DW_AT_declaration";
+  case DW_AT_discr_list:                 return "DW_AT_discr_list";
+  case DW_AT_encoding:                   return "DW_AT_encoding";
+  case DW_AT_external:                   return "DW_AT_external";
+  case DW_AT_frame_base:                 return "DW_AT_frame_base";
+  case DW_AT_friend:                     return "DW_AT_friend";
+  case DW_AT_identifier_case:            return "DW_AT_identifier_case";
+  case DW_AT_macro_info:                 return "DW_AT_macro_info";
+  case DW_AT_namelist_item:              return "DW_AT_namelist_item";
+  case DW_AT_priority:                   return "DW_AT_priority";
+  case DW_AT_segment:                    return "DW_AT_segment";
+  case DW_AT_specification:              return "DW_AT_specification";
+  case DW_AT_static_link:                return "DW_AT_static_link";
+  case DW_AT_type:                       return "DW_AT_type";
+  case DW_AT_use_location:               return "DW_AT_use_location";
+  case DW_AT_variable_parameter:         return "DW_AT_variable_parameter";
+  case DW_AT_virtuality:                 return "DW_AT_virtuality";
+  case DW_AT_vtable_elem_location:       return "DW_AT_vtable_elem_location";
+  case DW_AT_allocated:                  return "DW_AT_allocated";
+  case DW_AT_associated:                 return "DW_AT_associated";
+  case DW_AT_data_location:              return "DW_AT_data_location";
+  case DW_AT_byte_stride:                return "DW_AT_byte_stride";
+  case DW_AT_entry_pc:                   return "DW_AT_entry_pc";
+  case DW_AT_use_UTF8:                   return "DW_AT_use_UTF8";
+  case DW_AT_extension:                  return "DW_AT_extension";
+  case DW_AT_ranges:                     return "DW_AT_ranges";
+  case DW_AT_trampoline:                 return "DW_AT_trampoline";
+  case DW_AT_call_column:                return "DW_AT_call_column";
+  case DW_AT_call_file:                  return "DW_AT_call_file";
+  case DW_AT_call_line:                  return "DW_AT_call_line";
+  case DW_AT_description:                return "DW_AT_description";
+  case DW_AT_binary_scale:               return "DW_AT_binary_scale";
+  case DW_AT_decimal_scale:              return "DW_AT_decimal_scale";
+  case DW_AT_small:                      return "DW_AT_small";
+  case DW_AT_decimal_sign:               return "DW_AT_decimal_sign";
+  case DW_AT_digit_count:                return "DW_AT_digit_count";
+  case DW_AT_picture_string:             return "DW_AT_picture_string";
+  case DW_AT_mutable:                    return "DW_AT_mutable";
+  case DW_AT_threads_scaled:             return "DW_AT_threads_scaled";
+  case DW_AT_explicit:                   return "DW_AT_explicit";
+  case DW_AT_object_pointer:             return "DW_AT_object_pointer";
+  case DW_AT_endianity:                  return "DW_AT_endianity";
+  case DW_AT_elemental:                  return "DW_AT_elemental";
+  case DW_AT_pure:                       return "DW_AT_pure";
+  case DW_AT_recursive:                  return "DW_AT_recursive";
+  case DW_AT_signature:                  return "DW_AT_signature";
+  case DW_AT_main_subprogram:            return "DW_AT_main_subprogram";
+  case DW_AT_data_bit_offset:            return "DW_AT_data_bit_offset";
+  case DW_AT_const_expr:                 return "DW_AT_const_expr";
+  case DW_AT_enum_class:                 return "DW_AT_enum_class";
+  case DW_AT_linkage_name:               return "DW_AT_linkage_name";
+  case DW_AT_MIPS_loop_begin:            return "DW_AT_MIPS_loop_begin";
+  case DW_AT_MIPS_tail_loop_begin:       return "DW_AT_MIPS_tail_loop_begin";
+  case DW_AT_MIPS_epilog_begin:          return "DW_AT_MIPS_epilog_begin";
+  case DW_AT_MIPS_loop_unroll_factor:    return "DW_AT_MIPS_loop_unroll_factor";
+  case DW_AT_MIPS_software_pipeline_depth:
+    return "DW_AT_MIPS_software_pipeline_depth";
+  case DW_AT_MIPS_linkage_name:          return "DW_AT_MIPS_linkage_name";
+  case DW_AT_MIPS_stride:                return "DW_AT_MIPS_stride";
+  case DW_AT_MIPS_abstract_name:         return "DW_AT_MIPS_abstract_name";
+  case DW_AT_MIPS_clone_origin:          return "DW_AT_MIPS_clone_origin";
+  case DW_AT_MIPS_has_inlines:           return "DW_AT_MIPS_has_inlines";
+  case DW_AT_MIPS_stride_byte:           return "DW_AT_MIPS_stride_byte";
+  case DW_AT_MIPS_stride_elem:           return "DW_AT_MIPS_stride_elem";
+  case DW_AT_MIPS_ptr_dopetype:          return "DW_AT_MIPS_ptr_dopetype";
+  case DW_AT_MIPS_allocatable_dopetype:
+    return "DW_AT_MIPS_allocatable_dopetype";
+  case DW_AT_MIPS_assumed_shape_dopetype:
+    return "DW_AT_MIPS_assumed_shape_dopetype";
+  case DW_AT_sf_names:                   return "DW_AT_sf_names";
+  case DW_AT_src_info:                   return "DW_AT_src_info";
+  case DW_AT_mac_info:                   return "DW_AT_mac_info";
+  case DW_AT_src_coords:                 return "DW_AT_src_coords";
+  case DW_AT_body_begin:                 return "DW_AT_body_begin";
+  case DW_AT_body_end:                   return "DW_AT_body_end";
+  case DW_AT_GNU_vector:                 return "DW_AT_GNU_vector";
+  case DW_AT_GNU_template_name:          return "DW_AT_GNU_template_name";
+  case DW_AT_MIPS_assumed_size:          return "DW_AT_MIPS_assumed_size";
+  case DW_AT_lo_user:                    return "DW_AT_lo_user";
+  case DW_AT_hi_user:                    return "DW_AT_hi_user";
+  case DW_AT_APPLE_optimized:            return "DW_AT_APPLE_optimized";
+  case DW_AT_APPLE_flags:                return "DW_AT_APPLE_flags";
+  case DW_AT_APPLE_isa:                  return "DW_AT_APPLE_isa";
+  case DW_AT_APPLE_block:                return "DW_AT_APPLE_block";
+  case DW_AT_APPLE_major_runtime_vers:   return "DW_AT_APPLE_major_runtime_vers";
+  case DW_AT_APPLE_runtime_class:        return "DW_AT_APPLE_runtime_class";
+  case DW_AT_APPLE_omit_frame_ptr:       return "DW_AT_APPLE_omit_frame_ptr";
+  case DW_AT_APPLE_property_name:        return "DW_AT_APPLE_property_name";
+  case DW_AT_APPLE_property_getter:      return "DW_AT_APPLE_property_getter";
+  case DW_AT_APPLE_property_setter:      return "DW_AT_APPLE_property_setter";
+  case DW_AT_APPLE_property_attribute:   return "DW_AT_APPLE_property_attribute";
+  case DW_AT_APPLE_property:             return "DW_AT_APPLE_property";
+  case DW_AT_APPLE_objc_complete_type:   return "DW_AT_APPLE_objc_complete_type";
+
+    // DWARF5 Fission Extension Attribute
+  case DW_AT_GNU_dwo_name:               return "DW_AT_GNU_dwo_name";
+  case DW_AT_GNU_dwo_id:                 return "DW_AT_GNU_dwo_id";
+  case DW_AT_GNU_ranges_base:            return "DW_AT_GNU_ranges_base";
+  case DW_AT_GNU_addr_base:              return "DW_AT_GNU_addr_base";
+  case DW_AT_GNU_pubnames:               return "DW_AT_GNU_pubnames";
+  case DW_AT_GNU_pubtypes:               return "DW_AT_GNU_pubtypes";
+  }
+  return 0;
+}
+
+/// FormEncodingString - Return the string for the specified form encoding.
+///
+const char *llvm::dwarf::FormEncodingString(unsigned Encoding) {
+  switch (Encoding) {
+  case DW_FORM_addr:                     return "DW_FORM_addr";
+  case DW_FORM_block2:                   return "DW_FORM_block2";
+  case DW_FORM_block4:                   return "DW_FORM_block4";
+  case DW_FORM_data2:                    return "DW_FORM_data2";
+  case DW_FORM_data4:                    return "DW_FORM_data4";
+  case DW_FORM_data8:                    return "DW_FORM_data8";
+  case DW_FORM_string:                   return "DW_FORM_string";
+  case DW_FORM_block:                    return "DW_FORM_block";
+  case DW_FORM_block1:                   return "DW_FORM_block1";
+  case DW_FORM_data1:                    return "DW_FORM_data1";
+  case DW_FORM_flag:                     return "DW_FORM_flag";
+  case DW_FORM_sdata:                    return "DW_FORM_sdata";
+  case DW_FORM_strp:                     return "DW_FORM_strp";
+  case DW_FORM_udata:                    return "DW_FORM_udata";
+  case DW_FORM_ref_addr:                 return "DW_FORM_ref_addr";
+  case DW_FORM_ref1:                     return "DW_FORM_ref1";
+  case DW_FORM_ref2:                     return "DW_FORM_ref2";
+  case DW_FORM_ref4:                     return "DW_FORM_ref4";
+  case DW_FORM_ref8:                     return "DW_FORM_ref8";
+  case DW_FORM_ref_udata:                return "DW_FORM_ref_udata";
+  case DW_FORM_indirect:                 return "DW_FORM_indirect";
+  case DW_FORM_sec_offset:               return "DW_FORM_sec_offset";
+  case DW_FORM_exprloc:                  return "DW_FORM_exprloc";
+  case DW_FORM_flag_present:             return "DW_FORM_flag_present";
+  case DW_FORM_ref_sig8:                 return "DW_FORM_ref_sig8";
+
+    // DWARF5 Fission Extension Forms
+  case DW_FORM_GNU_addr_index:           return "DW_FORM_GNU_addr_index";
+  case DW_FORM_GNU_str_index:            return "DW_FORM_GNU_str_index";
+  }
+  return 0;
+}
+
+/// OperationEncodingString - Return the string for the specified operation
+/// encoding.
+const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) {
+  switch (Encoding) {
+  case DW_OP_addr:                       return "DW_OP_addr";
+  case DW_OP_deref:                      return "DW_OP_deref";
+  case DW_OP_const1u:                    return "DW_OP_const1u";
+  case DW_OP_const1s:                    return "DW_OP_const1s";
+  case DW_OP_const2u:                    return "DW_OP_const2u";
+  case DW_OP_const2s:                    return "DW_OP_const2s";
+  case DW_OP_const4u:                    return "DW_OP_const4u";
+  case DW_OP_const4s:                    return "DW_OP_const4s";
+  case DW_OP_const8u:                    return "DW_OP_const8u";
+  case DW_OP_const8s:                    return "DW_OP_const8s";
+  case DW_OP_constu:                     return "DW_OP_constu";
+  case DW_OP_consts:                     return "DW_OP_consts";
+  case DW_OP_dup:                        return "DW_OP_dup";
+  case DW_OP_drop:                       return "DW_OP_drop";
+  case DW_OP_over:                       return "DW_OP_over";
+  case DW_OP_pick:                       return "DW_OP_pick";
+  case DW_OP_swap:                       return "DW_OP_swap";
+  case DW_OP_rot:                        return "DW_OP_rot";
+  case DW_OP_xderef:                     return "DW_OP_xderef";
+  case DW_OP_abs:                        return "DW_OP_abs";
+  case DW_OP_and:                        return "DW_OP_and";
+  case DW_OP_div:                        return "DW_OP_div";
+  case DW_OP_minus:                      return "DW_OP_minus";
+  case DW_OP_mod:                        return "DW_OP_mod";
+  case DW_OP_mul:                        return "DW_OP_mul";
+  case DW_OP_neg:                        return "DW_OP_neg";
+  case DW_OP_not:                        return "DW_OP_not";
+  case DW_OP_or:                         return "DW_OP_or";
+  case DW_OP_plus:                       return "DW_OP_plus";
+  case DW_OP_plus_uconst:                return "DW_OP_plus_uconst";
+  case DW_OP_shl:                        return "DW_OP_shl";
+  case DW_OP_shr:                        return "DW_OP_shr";
+  case DW_OP_shra:                       return "DW_OP_shra";
+  case DW_OP_xor:                        return "DW_OP_xor";
+  case DW_OP_skip:                       return "DW_OP_skip";
+  case DW_OP_bra:                        return "DW_OP_bra";
+  case DW_OP_eq:                         return "DW_OP_eq";
+  case DW_OP_ge:                         return "DW_OP_ge";
+  case DW_OP_gt:                         return "DW_OP_gt";
+  case DW_OP_le:                         return "DW_OP_le";
+  case DW_OP_lt:                         return "DW_OP_lt";
+  case DW_OP_ne:                         return "DW_OP_ne";
+  case DW_OP_lit0:                       return "DW_OP_lit0";
+  case DW_OP_lit1:                       return "DW_OP_lit1";
+  case DW_OP_lit2:                       return "DW_OP_lit2";
+  case DW_OP_lit3:                       return "DW_OP_lit3";
+  case DW_OP_lit4:                       return "DW_OP_lit4";
+  case DW_OP_lit5:                       return "DW_OP_lit5";
+  case DW_OP_lit6:                       return "DW_OP_lit6";
+  case DW_OP_lit7:                       return "DW_OP_lit7";
+  case DW_OP_lit8:                       return "DW_OP_lit8";
+  case DW_OP_lit9:                       return "DW_OP_lit9";
+  case DW_OP_lit10:                      return "DW_OP_lit10";
+  case DW_OP_lit11:                      return "DW_OP_lit11";
+  case DW_OP_lit12:                      return "DW_OP_lit12";
+  case DW_OP_lit13:                      return "DW_OP_lit13";
+  case DW_OP_lit14:                      return "DW_OP_lit14";
+  case DW_OP_lit15:                      return "DW_OP_lit15";
+  case DW_OP_lit16:                      return "DW_OP_lit16";
+  case DW_OP_lit17:                      return "DW_OP_lit17";
+  case DW_OP_lit18:                      return "DW_OP_lit18";
+  case DW_OP_lit19:                      return "DW_OP_lit19";
+  case DW_OP_lit20:                      return "DW_OP_lit20";
+  case DW_OP_lit21:                      return "DW_OP_lit21";
+  case DW_OP_lit22:                      return "DW_OP_lit22";
+  case DW_OP_lit23:                      return "DW_OP_lit23";
+  case DW_OP_lit24:                      return "DW_OP_lit24";
+  case DW_OP_lit25:                      return "DW_OP_lit25";
+  case DW_OP_lit26:                      return "DW_OP_lit26";
+  case DW_OP_lit27:                      return "DW_OP_lit27";
+  case DW_OP_lit28:                      return "DW_OP_lit28";
+  case DW_OP_lit29:                      return "DW_OP_lit29";
+  case DW_OP_lit30:                      return "DW_OP_lit30";
+  case DW_OP_lit31:                      return "DW_OP_lit31";
+  case DW_OP_reg0:                       return "DW_OP_reg0";
+  case DW_OP_reg1:                       return "DW_OP_reg1";
+  case DW_OP_reg2:                       return "DW_OP_reg2";
+  case DW_OP_reg3:                       return "DW_OP_reg3";
+  case DW_OP_reg4:                       return "DW_OP_reg4";
+  case DW_OP_reg5:                       return "DW_OP_reg5";
+  case DW_OP_reg6:                       return "DW_OP_reg6";
+  case DW_OP_reg7:                       return "DW_OP_reg7";
+  case DW_OP_reg8:                       return "DW_OP_reg8";
+  case DW_OP_reg9:                       return "DW_OP_reg9";
+  case DW_OP_reg10:                      return "DW_OP_reg10";
+  case DW_OP_reg11:                      return "DW_OP_reg11";
+  case DW_OP_reg12:                      return "DW_OP_reg12";
+  case DW_OP_reg13:                      return "DW_OP_reg13";
+  case DW_OP_reg14:                      return "DW_OP_reg14";
+  case DW_OP_reg15:                      return "DW_OP_reg15";
+  case DW_OP_reg16:                      return "DW_OP_reg16";
+  case DW_OP_reg17:                      return "DW_OP_reg17";
+  case DW_OP_reg18:                      return "DW_OP_reg18";
+  case DW_OP_reg19:                      return "DW_OP_reg19";
+  case DW_OP_reg20:                      return "DW_OP_reg20";
+  case DW_OP_reg21:                      return "DW_OP_reg21";
+  case DW_OP_reg22:                      return "DW_OP_reg22";
+  case DW_OP_reg23:                      return "DW_OP_reg23";
+  case DW_OP_reg24:                      return "DW_OP_reg24";
+  case DW_OP_reg25:                      return "DW_OP_reg25";
+  case DW_OP_reg26:                      return "DW_OP_reg26";
+  case DW_OP_reg27:                      return "DW_OP_reg27";
+  case DW_OP_reg28:                      return "DW_OP_reg28";
+  case DW_OP_reg29:                      return "DW_OP_reg29";
+  case DW_OP_reg30:                      return "DW_OP_reg30";
+  case DW_OP_reg31:                      return "DW_OP_reg31";
+  case DW_OP_breg0:                      return "DW_OP_breg0";
+  case DW_OP_breg1:                      return "DW_OP_breg1";
+  case DW_OP_breg2:                      return "DW_OP_breg2";
+  case DW_OP_breg3:                      return "DW_OP_breg3";
+  case DW_OP_breg4:                      return "DW_OP_breg4";
+  case DW_OP_breg5:                      return "DW_OP_breg5";
+  case DW_OP_breg6:                      return "DW_OP_breg6";
+  case DW_OP_breg7:                      return "DW_OP_breg7";
+  case DW_OP_breg8:                      return "DW_OP_breg8";
+  case DW_OP_breg9:                      return "DW_OP_breg9";
+  case DW_OP_breg10:                     return "DW_OP_breg10";
+  case DW_OP_breg11:                     return "DW_OP_breg11";
+  case DW_OP_breg12:                     return "DW_OP_breg12";
+  case DW_OP_breg13:                     return "DW_OP_breg13";
+  case DW_OP_breg14:                     return "DW_OP_breg14";
+  case DW_OP_breg15:                     return "DW_OP_breg15";
+  case DW_OP_breg16:                     return "DW_OP_breg16";
+  case DW_OP_breg17:                     return "DW_OP_breg17";
+  case DW_OP_breg18:                     return "DW_OP_breg18";
+  case DW_OP_breg19:                     return "DW_OP_breg19";
+  case DW_OP_breg20:                     return "DW_OP_breg20";
+  case DW_OP_breg21:                     return "DW_OP_breg21";
+  case DW_OP_breg22:                     return "DW_OP_breg22";
+  case DW_OP_breg23:                     return "DW_OP_breg23";
+  case DW_OP_breg24:                     return "DW_OP_breg24";
+  case DW_OP_breg25:                     return "DW_OP_breg25";
+  case DW_OP_breg26:                     return "DW_OP_breg26";
+  case DW_OP_breg27:                     return "DW_OP_breg27";
+  case DW_OP_breg28:                     return "DW_OP_breg28";
+  case DW_OP_breg29:                     return "DW_OP_breg29";
+  case DW_OP_breg30:                     return "DW_OP_breg30";
+  case DW_OP_breg31:                     return "DW_OP_breg31";
+  case DW_OP_regx:                       return "DW_OP_regx";
+  case DW_OP_fbreg:                      return "DW_OP_fbreg";
+  case DW_OP_bregx:                      return "DW_OP_bregx";
+  case DW_OP_piece:                      return "DW_OP_piece";
+  case DW_OP_deref_size:                 return "DW_OP_deref_size";
+  case DW_OP_xderef_size:                return "DW_OP_xderef_size";
+  case DW_OP_nop:                        return "DW_OP_nop";
+  case DW_OP_push_object_address:        return "DW_OP_push_object_address";
+  case DW_OP_call2:                      return "DW_OP_call2";
+  case DW_OP_call4:                      return "DW_OP_call4";
+  case DW_OP_call_ref:                   return "DW_OP_call_ref";
+  case DW_OP_form_tls_address:           return "DW_OP_form_tls_address";
+  case DW_OP_call_frame_cfa:             return "DW_OP_call_frame_cfa";
+  case DW_OP_bit_piece:                  return "DW_OP_bit_piece";
+  case DW_OP_implicit_value:             return "DW_OP_implicit_value";
+  case DW_OP_stack_value:                return "DW_OP_stack_value";
+  case DW_OP_lo_user:                    return "DW_OP_lo_user";
+  case DW_OP_hi_user:                    return "DW_OP_hi_user";
+
+    // DWARF5 Fission Proposal Op Extensions
+  case DW_OP_GNU_addr_index:             return "DW_OP_GNU_addr_index";
+  case DW_OP_GNU_const_index:            return "DW_OP_GNU_const_index";
+  }
+  return 0;
+}
+
+/// AttributeEncodingString - Return the string for the specified attribute
+/// encoding.
+const char *llvm::dwarf::AttributeEncodingString(unsigned Encoding) {
+  switch (Encoding) {
+  case DW_ATE_address:                   return "DW_ATE_address";
+  case DW_ATE_boolean:                   return "DW_ATE_boolean";
+  case DW_ATE_complex_float:             return "DW_ATE_complex_float";
+  case DW_ATE_float:                     return "DW_ATE_float";
+  case DW_ATE_signed:                    return "DW_ATE_signed";
+  case DW_ATE_signed_char:               return "DW_ATE_signed_char";
+  case DW_ATE_unsigned:                  return "DW_ATE_unsigned";
+  case DW_ATE_unsigned_char:             return "DW_ATE_unsigned_char";
+  case DW_ATE_imaginary_float:           return "DW_ATE_imaginary_float";
+  case DW_ATE_UTF:                       return "DW_ATE_UTF";
+  case DW_ATE_packed_decimal:            return "DW_ATE_packed_decimal";
+  case DW_ATE_numeric_string:            return "DW_ATE_numeric_string";
+  case DW_ATE_edited:                    return "DW_ATE_edited";
+  case DW_ATE_signed_fixed:              return "DW_ATE_signed_fixed";
+  case DW_ATE_unsigned_fixed:            return "DW_ATE_unsigned_fixed";
+  case DW_ATE_decimal_float:             return "DW_ATE_decimal_float";
+  case DW_ATE_lo_user:                   return "DW_ATE_lo_user";
+  case DW_ATE_hi_user:                   return "DW_ATE_hi_user";
+  }
+  return 0;
+}
+
+/// DecimalSignString - Return the string for the specified decimal sign
+/// attribute.
+const char *llvm::dwarf::DecimalSignString(unsigned Sign) {
+  switch (Sign) {
+  case DW_DS_unsigned:                   return "DW_DS_unsigned";
+  case DW_DS_leading_overpunch:          return "DW_DS_leading_overpunch";
+  case DW_DS_trailing_overpunch:         return "DW_DS_trailing_overpunch";
+  case DW_DS_leading_separate:           return "DW_DS_leading_separate";
+  case DW_DS_trailing_separate:          return "DW_DS_trailing_separate";
+  }
+  return 0;
+}
+
+/// EndianityString - Return the string for the specified endianity.
+///
+const char *llvm::dwarf::EndianityString(unsigned Endian) {
+  switch (Endian) {
+  case DW_END_default:                   return "DW_END_default";
+  case DW_END_big:                       return "DW_END_big";
+  case DW_END_little:                    return "DW_END_little";
+  case DW_END_lo_user:                   return "DW_END_lo_user";
+  case DW_END_hi_user:                   return "DW_END_hi_user";
+  }
+  return 0;
+}
+
+/// AccessibilityString - Return the string for the specified accessibility.
+///
+const char *llvm::dwarf::AccessibilityString(unsigned Access) {
+  switch (Access) {
+  // Accessibility codes
+  case DW_ACCESS_public:                 return "DW_ACCESS_public";
+  case DW_ACCESS_protected:              return "DW_ACCESS_protected";
+  case DW_ACCESS_private:                return "DW_ACCESS_private";
+  }
+  return 0;
+}
+
+/// VisibilityString - Return the string for the specified visibility.
+///
+const char *llvm::dwarf::VisibilityString(unsigned Visibility) {
+  switch (Visibility) {
+  case DW_VIS_local:                     return "DW_VIS_local";
+  case DW_VIS_exported:                  return "DW_VIS_exported";
+  case DW_VIS_qualified:                 return "DW_VIS_qualified";
+  }
+  return 0;
+}
+
+/// VirtualityString - Return the string for the specified virtuality.
+///
+const char *llvm::dwarf::VirtualityString(unsigned Virtuality) {
+  switch (Virtuality) {
+  case DW_VIRTUALITY_none:               return "DW_VIRTUALITY_none";
+  case DW_VIRTUALITY_virtual:            return "DW_VIRTUALITY_virtual";
+  case DW_VIRTUALITY_pure_virtual:       return "DW_VIRTUALITY_pure_virtual";
+  }
+  return 0;
+}
+
+/// LanguageString - Return the string for the specified language.
+///
+const char *llvm::dwarf::LanguageString(unsigned Language) {
+  switch (Language) {
+  case DW_LANG_C89:                      return "DW_LANG_C89";
+  case DW_LANG_C:                        return "DW_LANG_C";
+  case DW_LANG_Ada83:                    return "DW_LANG_Ada83";
+  case DW_LANG_C_plus_plus:              return "DW_LANG_C_plus_plus";
+  case DW_LANG_Cobol74:                  return "DW_LANG_Cobol74";
+  case DW_LANG_Cobol85:                  return "DW_LANG_Cobol85";
+  case DW_LANG_Fortran77:                return "DW_LANG_Fortran77";
+  case DW_LANG_Fortran90:                return "DW_LANG_Fortran90";
+  case DW_LANG_Pascal83:                 return "DW_LANG_Pascal83";
+  case DW_LANG_Modula2:                  return "DW_LANG_Modula2";
+  case DW_LANG_Java:                     return "DW_LANG_Java";
+  case DW_LANG_C99:                      return "DW_LANG_C99";
+  case DW_LANG_Ada95:                    return "DW_LANG_Ada95";
+  case DW_LANG_Fortran95:                return "DW_LANG_Fortran95";
+  case DW_LANG_PLI:                      return "DW_LANG_PLI";
+  case DW_LANG_ObjC:                     return "DW_LANG_ObjC";
+  case DW_LANG_ObjC_plus_plus:           return "DW_LANG_ObjC_plus_plus";
+  case DW_LANG_UPC:                      return "DW_LANG_UPC";
+  case DW_LANG_D:                        return "DW_LANG_D";
+  case DW_LANG_lo_user:                  return "DW_LANG_lo_user";
+  case DW_LANG_hi_user:                  return "DW_LANG_hi_user";
+  }
+  return 0;
+}
+
+/// CaseString - Return the string for the specified identifier case.
+///
+const char *llvm::dwarf::CaseString(unsigned Case) {
+  switch (Case) {
+  case DW_ID_case_sensitive:             return "DW_ID_case_sensitive";
+  case DW_ID_up_case:                    return "DW_ID_up_case";
+  case DW_ID_down_case:                  return "DW_ID_down_case";
+  case DW_ID_case_insensitive:           return "DW_ID_case_insensitive";
+  }
+  return 0;
+}
+
+/// ConventionString - Return the string for the specified calling convention.
+///
+const char *llvm::dwarf::ConventionString(unsigned Convention) {
+   switch (Convention) {
+   case DW_CC_normal:                     return "DW_CC_normal";
+   case DW_CC_program:                    return "DW_CC_program";
+   case DW_CC_nocall:                     return "DW_CC_nocall";
+   case DW_CC_lo_user:                    return "DW_CC_lo_user";
+   case DW_CC_hi_user:                    return "DW_CC_hi_user";
+  }
+  return 0;
+}
+
+/// InlineCodeString - Return the string for the specified inline code.
+///
+const char *llvm::dwarf::InlineCodeString(unsigned Code) {
+  switch (Code) {
+  case DW_INL_not_inlined:               return "DW_INL_not_inlined";
+  case DW_INL_inlined:                   return "DW_INL_inlined";
+  case DW_INL_declared_not_inlined:      return "DW_INL_declared_not_inlined";
+  case DW_INL_declared_inlined:          return "DW_INL_declared_inlined";
+  }
+  return 0;
+}
+
+/// ArrayOrderString - Return the string for the specified array order.
+///
+const char *llvm::dwarf::ArrayOrderString(unsigned Order) {
+  switch (Order) {
+  case DW_ORD_row_major:                 return "DW_ORD_row_major";
+  case DW_ORD_col_major:                 return "DW_ORD_col_major";
+  }
+  return 0;
+}
+
+/// DiscriminantString - Return the string for the specified discriminant
+/// descriptor.
+const char *llvm::dwarf::DiscriminantString(unsigned Discriminant) {
+  switch (Discriminant) {
+  case DW_DSC_label:                     return "DW_DSC_label";
+  case DW_DSC_range:                     return "DW_DSC_range";
+  }
+  return 0;
+}
+
+/// LNStandardString - Return the string for the specified line number standard.
+///
+const char *llvm::dwarf::LNStandardString(unsigned Standard) {
+  switch (Standard) {
+  case DW_LNS_copy:                      return "DW_LNS_copy";
+  case DW_LNS_advance_pc:                return "DW_LNS_advance_pc";
+  case DW_LNS_advance_line:              return "DW_LNS_advance_line";
+  case DW_LNS_set_file:                  return "DW_LNS_set_file";
+  case DW_LNS_set_column:                return "DW_LNS_set_column";
+  case DW_LNS_negate_stmt:               return "DW_LNS_negate_stmt";
+  case DW_LNS_set_basic_block:           return "DW_LNS_set_basic_block";
+  case DW_LNS_const_add_pc:              return "DW_LNS_const_add_pc";
+  case DW_LNS_fixed_advance_pc:          return "DW_LNS_fixed_advance_pc";
+  case DW_LNS_set_prologue_end:          return "DW_LNS_set_prologue_end";
+  case DW_LNS_set_epilogue_begin:        return "DW_LNS_set_epilogue_begin";
+  case DW_LNS_set_isa:                   return "DW_LNS_set_isa";
+  }
+  return 0;
+}
+
+/// LNExtendedString - Return the string for the specified line number extended
+/// opcode encodings.
+const char *llvm::dwarf::LNExtendedString(unsigned Encoding) {
+  switch (Encoding) {
+  // Line Number Extended Opcode Encodings
+  case DW_LNE_end_sequence:              return "DW_LNE_end_sequence";
+  case DW_LNE_set_address:               return "DW_LNE_set_address";
+  case DW_LNE_define_file:               return "DW_LNE_define_file";
+  case DW_LNE_set_discriminator:         return "DW_LNE_set_discriminator";
+  case DW_LNE_lo_user:                   return "DW_LNE_lo_user";
+  case DW_LNE_hi_user:                   return "DW_LNE_hi_user";
+  }
+  return 0;
+}
+
+/// MacinfoString - Return the string for the specified macinfo type encodings.
+///
+const char *llvm::dwarf::MacinfoString(unsigned Encoding) {
+  switch (Encoding) {
+  // Macinfo Type Encodings
+  case DW_MACINFO_define:                return "DW_MACINFO_define";
+  case DW_MACINFO_undef:                 return "DW_MACINFO_undef";
+  case DW_MACINFO_start_file:            return "DW_MACINFO_start_file";
+  case DW_MACINFO_end_file:              return "DW_MACINFO_end_file";
+  case DW_MACINFO_vendor_ext:            return "DW_MACINFO_vendor_ext";
+  }
+  return 0;
+}
+
+/// CallFrameString - Return the string for the specified call frame instruction
+/// encodings.
+const char *llvm::dwarf::CallFrameString(unsigned Encoding) {
+  switch (Encoding) {
+  case DW_CFA_nop:                       return "DW_CFA_nop";
+  case DW_CFA_advance_loc:               return "DW_CFA_advance_loc";
+  case DW_CFA_offset:                    return "DW_CFA_offset";
+  case DW_CFA_restore:                   return "DW_CFA_restore";
+  case DW_CFA_set_loc:                   return "DW_CFA_set_loc";
+  case DW_CFA_advance_loc1:              return "DW_CFA_advance_loc1";
+  case DW_CFA_advance_loc2:              return "DW_CFA_advance_loc2";
+  case DW_CFA_advance_loc4:              return "DW_CFA_advance_loc4";
+  case DW_CFA_offset_extended:           return "DW_CFA_offset_extended";
+  case DW_CFA_restore_extended:          return "DW_CFA_restore_extended";
+  case DW_CFA_undefined:                 return "DW_CFA_undefined";
+  case DW_CFA_same_value:                return "DW_CFA_same_value";
+  case DW_CFA_register:                  return "DW_CFA_register";
+  case DW_CFA_remember_state:            return "DW_CFA_remember_state";
+  case DW_CFA_restore_state:             return "DW_CFA_restore_state";
+  case DW_CFA_def_cfa:                   return "DW_CFA_def_cfa";
+  case DW_CFA_def_cfa_register:          return "DW_CFA_def_cfa_register";
+  case DW_CFA_def_cfa_offset:            return "DW_CFA_def_cfa_offset";
+  case DW_CFA_def_cfa_expression:        return "DW_CFA_def_cfa_expression";
+  case DW_CFA_expression:                return "DW_CFA_expression";
+  case DW_CFA_offset_extended_sf:        return "DW_CFA_offset_extended_sf";
+  case DW_CFA_def_cfa_sf:                return "DW_CFA_def_cfa_sf";
+  case DW_CFA_def_cfa_offset_sf:         return "DW_CFA_def_cfa_offset_sf";
+  case DW_CFA_val_offset:                return "DW_CFA_val_offset";
+  case DW_CFA_val_offset_sf:             return "DW_CFA_val_offset_sf";
+  case DW_CFA_val_expression:            return "DW_CFA_val_expression";
+  case DW_CFA_MIPS_advance_loc8:         return "DW_CFA_MIPS_advance_loc8";
+  case DW_CFA_GNU_window_save:           return "DW_CFA_GNU_window_save";
+  case DW_CFA_GNU_args_size:             return "DW_CFA_GNU_args_size";
+  case DW_CFA_lo_user:                   return "DW_CFA_lo_user";
+  case DW_CFA_hi_user:                   return "DW_CFA_hi_user";
+  }
+  return 0;
+}
diff --git a/contrib/llvm/lib/Support/DynamicLibrary.cpp b/contrib/llvm/lib/Support/DynamicLibrary.cpp
new file mode 100644
index 000000000000..f14cb45d9dc0
--- /dev/null
+++ b/contrib/llvm/lib/Support/DynamicLibrary.cpp
@@ -0,0 +1,189 @@
+//===-- DynamicLibrary.cpp - Runtime link/load libraries --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This header file implements the operating system DynamicLibrary concept.
+//
+// FIXME: This file leaks ExplicitSymbols and OpenedHandles!
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Mutex.h"
+#include <cstdio>
+#include <cstring>
+
+// Collection of symbol name/value pairs to be searched prior to any libraries.
+static llvm::StringMap<void *> *ExplicitSymbols = 0;
+
+namespace {
+
+struct ExplicitSymbolsDeleter {
+  ~ExplicitSymbolsDeleter() {
+    delete ExplicitSymbols;
+  }
+};
+
+}
+
+static ExplicitSymbolsDeleter Dummy;
+
+
+static llvm::sys::SmartMutex<true>& getMutex() {
+  static llvm::sys::SmartMutex<true> HandlesMutex;
+  return HandlesMutex;
+}
+
+void llvm::sys::DynamicLibrary::AddSymbol(StringRef symbolName,
+                                          void *symbolValue) {
+  SmartScopedLock<true> lock(getMutex());
+  if (ExplicitSymbols == 0)
+    ExplicitSymbols = new StringMap<void*>();
+  (*ExplicitSymbols)[symbolName] = symbolValue;
+}
+
+char llvm::sys::DynamicLibrary::Invalid = 0;
+
+#ifdef LLVM_ON_WIN32
+
+#include "Windows/DynamicLibrary.inc"
+
+#else
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+using namespace llvm;
+using namespace llvm::sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
+static DenseSet<void *> *OpenedHandles = 0;
+
+DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
+                                                   std::string *errMsg) {
+  SmartScopedLock<true> lock(getMutex());
+
+  void *handle = dlopen(filename, RTLD_LAZY|RTLD_GLOBAL);
+  if (handle == 0) {
+    if (errMsg) *errMsg = dlerror();
+    return DynamicLibrary();
+  }
+
+#ifdef __CYGWIN__
+  // Cygwin searches symbols only in the main
+  // with the handle of dlopen(NULL, RTLD_GLOBAL).
+  if (filename == NULL)
+    handle = RTLD_DEFAULT;
+#endif
+
+  if (OpenedHandles == 0)
+    OpenedHandles = new DenseSet<void *>();
+
+  // If we've already loaded this library, dlclose() the handle in order to
+  // keep the internal refcount at +1.
+  if (!OpenedHandles->insert(handle).second)
+    dlclose(handle);
+
+  return DynamicLibrary(handle);
+}
+
+void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
+  if (!isValid())
+    return NULL;
+  return dlsym(Data, symbolName);
+}
+
+#else
+
+using namespace llvm;
+using namespace llvm::sys;
+
+DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
+                                                   std::string *errMsg) {
+  if (errMsg) *errMsg = "dlopen() not supported on this platform";
+  return DynamicLibrary();
+}
+
+void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
+  return NULL;
+}
+
+#endif
+
+namespace llvm {
+void *SearchForAddressOfSpecialSymbol(const char* symbolName);
+}
+
+void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
+  SmartScopedLock<true> Lock(getMutex());
+
+  // First check symbols added via AddSymbol().
+  if (ExplicitSymbols) {
+    StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
+
+    if (i != ExplicitSymbols->end())
+      return i->second;
+  }
+
+#if HAVE_DLFCN_H
+  // Now search the libraries.
+  if (OpenedHandles) {
+    for (DenseSet<void *>::iterator I = OpenedHandles->begin(),
+         E = OpenedHandles->end(); I != E; ++I) {
+      //lt_ptr ptr = lt_dlsym(*I, symbolName);
+      void *ptr = dlsym(*I, symbolName);
+      if (ptr) {
+        return ptr;
+      }
+    }
+  }
+#endif
+
+  if (void *Result = llvm::SearchForAddressOfSpecialSymbol(symbolName))
+    return Result;
+
+// This macro returns the address of a well-known, explicit symbol
+#define EXPLICIT_SYMBOL(SYM) \
+   if (!strcmp(symbolName, #SYM)) return &SYM
+
+// On linux we have a weird situation. The stderr/out/in symbols are both
+// macros and global variables because of standards requirements. So, we
+// boldly use the EXPLICIT_SYMBOL macro without checking for a #define first.
+#if defined(__linux__) and !defined(__ANDROID__)
+  {
+    EXPLICIT_SYMBOL(stderr);
+    EXPLICIT_SYMBOL(stdout);
+    EXPLICIT_SYMBOL(stdin);
+  }
+#else
+  // For everything else, we want to check to make sure the symbol isn't defined
+  // as a macro before using EXPLICIT_SYMBOL.
+  {
+#ifndef stdin
+    EXPLICIT_SYMBOL(stdin);
+#endif
+#ifndef stdout
+    EXPLICIT_SYMBOL(stdout);
+#endif
+#ifndef stderr
+    EXPLICIT_SYMBOL(stderr);
+#endif
+  }
+#endif
+#undef EXPLICIT_SYMBOL
+
+  return 0;
+}
+
+#endif // LLVM_ON_WIN32
diff --git a/contrib/llvm/lib/Support/Errno.cpp b/contrib/llvm/lib/Support/Errno.cpp
new file mode 100644
index 000000000000..730220f47d92
--- /dev/null
+++ b/contrib/llvm/lib/Support/Errno.cpp
@@ -0,0 +1,79 @@
+//===- Errno.cpp - errno support --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the errno wrappers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Errno.h"
+#include "llvm/Config/config.h"     // Get autoconf configuration settings
+#include "llvm/Support/raw_ostream.h"
+
+#if HAVE_STRING_H
+#include <string.h>
+
+#if HAVE_ERRNO_H
+#include <errno.h>
+#endif
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+namespace sys {
+
+#if HAVE_ERRNO_H
+std::string StrError() {
+  return StrError(errno);
+}
+#endif  // HAVE_ERRNO_H
+
+std::string StrError(int errnum) {
+  const int MaxErrStrLen = 2000;
+  char buffer[MaxErrStrLen];
+  buffer[0] = '\0';
+  std::string str;
+#ifdef HAVE_STRERROR_R
+  // strerror_r is thread-safe.
+  if (errnum)
+# if defined(__GLIBC__) && defined(_GNU_SOURCE)
+    // glibc defines its own incompatible version of strerror_r
+    // which may not use the buffer supplied.
+    str = strerror_r(errnum,buffer,MaxErrStrLen-1);
+# else
+    strerror_r(errnum,buffer,MaxErrStrLen-1);
+    str = buffer;
+# endif
+#elif HAVE_DECL_STRERROR_S // "Windows Secure API"
+    if (errnum) {
+      strerror_s(buffer, MaxErrStrLen - 1, errnum);
+      str = buffer;
+    }
+#elif defined(HAVE_STRERROR)
+  // Copy the thread un-safe result of strerror into
+  // the buffer as fast as possible to minimize impact
+  // of collision of strerror in multiple threads.
+  if (errnum)
+    str = strerror(errnum);
+#else
+  // Strange that this system doesn't even have strerror
+  // but, oh well, just use a generic message
+  raw_string_ostream stream(str);
+  stream << "Error #" << errnum;
+  stream.flush();
+#endif
+  return str;
+}
+
+}  // namespace sys
+}  // namespace llvm
+
+#endif  // HAVE_STRING_H
diff --git a/contrib/llvm/lib/Support/ErrorHandling.cpp b/contrib/llvm/lib/Support/ErrorHandling.cpp
new file mode 100644
index 000000000000..f4b591e777eb
--- /dev/null
+++ b/contrib/llvm/lib/Support/ErrorHandling.cpp
@@ -0,0 +1,99 @@
+//===- lib/Support/ErrorHandling.cpp - Callbacks for errors ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an API used to indicate fatal error conditions.  Non-fatal
+// errors (most of them) should be handled through LLVMContext.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/Threading.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdlib>
+
+#if defined(HAVE_UNISTD_H)
+# include <unistd.h>
+#endif
+#if defined(_MSC_VER)
+# include <io.h>
+# include <fcntl.h>
+#endif
+
+using namespace llvm;
+
+static fatal_error_handler_t ErrorHandler = 0;
+static void *ErrorHandlerUserData = 0;
+
+void llvm::install_fatal_error_handler(fatal_error_handler_t handler,
+                                       void *user_data) {
+  assert(!llvm_is_multithreaded() &&
+         "Cannot register error handlers after starting multithreaded mode!\n");
+  assert(!ErrorHandler && "Error handler already registered!\n");
+  ErrorHandler = handler;
+  ErrorHandlerUserData = user_data;
+}
+
+void llvm::remove_fatal_error_handler() {
+  ErrorHandler = 0;
+}
+
+void llvm::report_fatal_error(const char *Reason, bool GenCrashDiag) {
+  report_fatal_error(Twine(Reason), GenCrashDiag);
+}
+
+void llvm::report_fatal_error(const std::string &Reason, bool GenCrashDiag) {
+  report_fatal_error(Twine(Reason), GenCrashDiag);
+}
+
+void llvm::report_fatal_error(StringRef Reason, bool GenCrashDiag) {
+  report_fatal_error(Twine(Reason), GenCrashDiag);
+}
+
+void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) {
+  if (ErrorHandler) {
+    ErrorHandler(ErrorHandlerUserData, Reason.str(), GenCrashDiag);
+  } else {
+    // Blast the result out to stderr.  We don't try hard to make sure this
+    // succeeds (e.g. handling EINTR) and we can't use errs() here because
+    // raw ostreams can call report_fatal_error.
+    SmallVector<char, 64> Buffer;
+    raw_svector_ostream OS(Buffer);
+    OS << "LLVM ERROR: " << Reason << "\n";
+    StringRef MessageStr = OS.str();
+    ssize_t written = ::write(2, MessageStr.data(), MessageStr.size());
+    (void)written; // If something went wrong, we deliberately just give up.
+  }
+
+  // If we reached here, we are failing ungracefully. Run the interrupt handlers
+  // to make sure any special cleanups get done, in particular that we remove
+  // files registered with RemoveFileOnSignal.
+  sys::RunInterruptHandlers();
+
+  exit(1);
+}
+
+void llvm::llvm_unreachable_internal(const char *msg, const char *file,
+                                     unsigned line) {
+  // This code intentionally doesn't call the ErrorHandler callback, because
+  // llvm_unreachable is intended to be used to indicate "impossible"
+  // situations, and not legitimate runtime errors.
+  if (msg)
+    dbgs() << msg << "\n";
+  dbgs() << "UNREACHABLE executed";
+  if (file)
+    dbgs() << " at " << file << ":" << line;
+  dbgs() << "!\n";
+  abort();
+}
diff --git a/contrib/llvm/lib/Support/FileOutputBuffer.cpp b/contrib/llvm/lib/Support/FileOutputBuffer.cpp
new file mode 100644
index 000000000000..1ee69b60234f
--- /dev/null
+++ b/contrib/llvm/lib/Support/FileOutputBuffer.cpp
@@ -0,0 +1,119 @@
+//===- FileOutputBuffer.cpp - File Output Buffer ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility for creating a in-memory buffer that will be written to a file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+
+using llvm::sys::fs::mapped_file_region;
+
+namespace llvm {
+FileOutputBuffer::FileOutputBuffer(mapped_file_region * R,
+                                   StringRef Path, StringRef TmpPath)
+  : Region(R)
+  , FinalPath(Path)
+  , TempPath(TmpPath) {
+}
+
+FileOutputBuffer::~FileOutputBuffer() {
+  bool Existed;
+  sys::fs::remove(Twine(TempPath), Existed);
+}
+
+error_code FileOutputBuffer::create(StringRef FilePath,
+                                    size_t Size,
+                                    OwningPtr<FileOutputBuffer> &Result,
+                                    unsigned Flags) {
+  // If file already exists, it must be a regular file (to be mappable).
+  sys::fs::file_status Stat;
+  error_code EC = sys::fs::status(FilePath, Stat);
+  switch (Stat.type()) {
+    case sys::fs::file_type::file_not_found:
+      // If file does not exist, we'll create one.
+      break;
+    case sys::fs::file_type::regular_file: {
+        // If file is not currently writable, error out.
+        // FIXME: There is no sys::fs:: api for checking this.
+        // FIXME: In posix, you use the access() call to check this.
+      }
+      break;
+    default:
+      if (EC)
+        return EC;
+      else
+        return make_error_code(errc::operation_not_permitted);
+  }
+
+  // Delete target file.
+  bool Existed;
+  EC = sys::fs::remove(FilePath, Existed);
+  if (EC)
+    return EC;
+
+  // Create new file in same directory but with random name.
+  SmallString<128> TempFilePath;
+  int FD;
+  EC = sys::fs::unique_file(Twine(FilePath) + ".tmp%%%%%%%",
+                            FD, TempFilePath, false, 0644);
+  if (EC)
+    return EC;
+
+  OwningPtr<mapped_file_region> MappedFile(new mapped_file_region(
+      FD, true, mapped_file_region::readwrite, Size, 0, EC));
+  if (EC)
+    return EC;
+
+  // If requested, make the output file executable.
+  if ( Flags & F_executable ) {
+    sys::fs::file_status Stat2;
+    EC = sys::fs::status(Twine(TempFilePath), Stat2);
+    if (EC)
+      return EC;
+
+    sys::fs::perms new_perms = Stat2.permissions();
+    if ( new_perms & sys::fs::owner_read )
+      new_perms |= sys::fs::owner_exe;
+    if ( new_perms & sys::fs::group_read )
+      new_perms |= sys::fs::group_exe;
+    if ( new_perms & sys::fs::others_read )
+      new_perms |= sys::fs::others_exe;
+    new_perms |= sys::fs::add_perms;
+    EC = sys::fs::permissions(Twine(TempFilePath), new_perms);
+    if (EC)
+      return EC;
+  }
+
+  Result.reset(new FileOutputBuffer(MappedFile.get(), FilePath, TempFilePath));
+  if (Result)
+    MappedFile.take();
+
+  return error_code::success();
+}
+
+error_code FileOutputBuffer::commit(int64_t NewSmallerSize) {
+  // Unmap buffer, letting OS flush dirty pages to file on disk.
+  Region.reset(0);
+
+  // If requested, resize file as part of commit.
+  if ( NewSmallerSize != -1 ) {
+    error_code EC = sys::fs::resize_file(Twine(TempPath), NewSmallerSize);
+    if (EC)
+      return EC;
+  }
+
+  // Rename file to final name.
+  return sys::fs::rename(Twine(TempPath), Twine(FinalPath));
+}
+} // namespace
diff --git a/contrib/llvm/lib/Support/FileUtilities.cpp b/contrib/llvm/lib/Support/FileUtilities.cpp
new file mode 100644
index 000000000000..4d7b2391f01e
--- /dev/null
+++ b/contrib/llvm/lib/Support/FileUtilities.cpp
@@ -0,0 +1,280 @@
+//===- Support/FileUtilities.cpp - File System Utilities ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a family of utility functions which are useful for doing
+// various things with files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+static bool isSignedChar(char C) {
+  return (C == '+' || C == '-');
+}
+
+static bool isExponentChar(char C) {
+  switch (C) {
+  case 'D':  // Strange exponential notation.
+  case 'd':  // Strange exponential notation.
+  case 'e':
+  case 'E': return true;
+  default: return false;
+  }
+}
+
+static bool isNumberChar(char C) {
+  switch (C) {
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+  case '.': return true;
+  default: return isSignedChar(C) || isExponentChar(C);
+  }
+}
+
+static const char *BackupNumber(const char *Pos, const char *FirstChar) {
+  // If we didn't stop in the middle of a number, don't backup.
+  if (!isNumberChar(*Pos)) return Pos;
+
+  // Otherwise, return to the start of the number.
+  bool HasPeriod = false;
+  while (Pos > FirstChar && isNumberChar(Pos[-1])) {
+    // Backup over at most one period.
+    if (Pos[-1] == '.') {
+      if (HasPeriod)
+        break;
+      HasPeriod = true;
+    }
+
+    --Pos;
+    if (Pos > FirstChar && isSignedChar(Pos[0]) && !isExponentChar(Pos[-1]))
+      break;
+  }
+  return Pos;
+}
+
+/// EndOfNumber - Return the first character that is not part of the specified
+/// number.  This assumes that the buffer is null terminated, so it won't fall
+/// off the end.
+static const char *EndOfNumber(const char *Pos) {
+  while (isNumberChar(*Pos))
+    ++Pos;
+  return Pos;
+}
+
+/// CompareNumbers - compare two numbers, returning true if they are different.
+static bool CompareNumbers(const char *&F1P, const char *&F2P,
+                           const char *F1End, const char *F2End,
+                           double AbsTolerance, double RelTolerance,
+                           std::string *ErrorMsg) {
+  const char *F1NumEnd, *F2NumEnd;
+  double V1 = 0.0, V2 = 0.0;
+
+  // If one of the positions is at a space and the other isn't, chomp up 'til
+  // the end of the space.
+  while (isspace(static_cast<unsigned char>(*F1P)) && F1P != F1End)
+    ++F1P;
+  while (isspace(static_cast<unsigned char>(*F2P)) && F2P != F2End)
+    ++F2P;
+
+  // If we stop on numbers, compare their difference.
+  if (!isNumberChar(*F1P) || !isNumberChar(*F2P)) {
+    // The diff failed.
+    F1NumEnd = F1P;
+    F2NumEnd = F2P;
+  } else {
+    // Note that some ugliness is built into this to permit support for numbers
+    // that use "D" or "d" as their exponential marker, e.g. "1.234D45".  This
+    // occurs in 200.sixtrack in spec2k.
+    V1 = strtod(F1P, const_cast<char**>(&F1NumEnd));
+    V2 = strtod(F2P, const_cast<char**>(&F2NumEnd));
+
+    if (*F1NumEnd == 'D' || *F1NumEnd == 'd') {
+      // Copy string into tmp buffer to replace the 'D' with an 'e'.
+      SmallString<200> StrTmp(F1P, EndOfNumber(F1NumEnd)+1);
+      // Strange exponential notation!
+      StrTmp[static_cast<unsigned>(F1NumEnd-F1P)] = 'e';
+
+      V1 = strtod(&StrTmp[0], const_cast<char**>(&F1NumEnd));
+      F1NumEnd = F1P + (F1NumEnd-&StrTmp[0]);
+    }
+
+    if (*F2NumEnd == 'D' || *F2NumEnd == 'd') {
+      // Copy string into tmp buffer to replace the 'D' with an 'e'.
+      SmallString<200> StrTmp(F2P, EndOfNumber(F2NumEnd)+1);
+      // Strange exponential notation!
+      StrTmp[static_cast<unsigned>(F2NumEnd-F2P)] = 'e';
+
+      V2 = strtod(&StrTmp[0], const_cast<char**>(&F2NumEnd));
+      F2NumEnd = F2P + (F2NumEnd-&StrTmp[0]);
+    }
+  }
+
+  if (F1NumEnd == F1P || F2NumEnd == F2P) {
+    if (ErrorMsg) {
+      *ErrorMsg = "FP Comparison failed, not a numeric difference between '";
+      *ErrorMsg += F1P[0];
+      *ErrorMsg += "' and '";
+      *ErrorMsg += F2P[0];
+      *ErrorMsg += "'";
+    }
+    return true;
+  }
+
+  // Check to see if these are inside the absolute tolerance
+  if (AbsTolerance < std::abs(V1-V2)) {
+    // Nope, check the relative tolerance...
+    double Diff;
+    if (V2)
+      Diff = std::abs(V1/V2 - 1.0);
+    else if (V1)
+      Diff = std::abs(V2/V1 - 1.0);
+    else
+      Diff = 0;  // Both zero.
+    if (Diff > RelTolerance) {
+      if (ErrorMsg) {
+        raw_string_ostream(*ErrorMsg)
+          << "Compared: " << V1 << " and " << V2 << '\n'
+          << "abs. diff = " << std::abs(V1-V2) << " rel.diff = " << Diff << '\n'
+          << "Out of tolerance: rel/abs: " << RelTolerance << '/'
+          << AbsTolerance;
+      }
+      return true;
+    }
+  }
+
+  // Otherwise, advance our read pointers to the end of the numbers.
+  F1P = F1NumEnd;  F2P = F2NumEnd;
+  return false;
+}
+
+/// DiffFilesWithTolerance - Compare the two files specified, returning 0 if the
+/// files match, 1 if they are different, and 2 if there is a file error.  This
+/// function differs from DiffFiles in that you can specify an absolete and
+/// relative FP error that is allowed to exist.  If you specify a string to fill
+/// in for the error option, it will set the string to an error message if an
+/// error occurs, allowing the caller to distinguish between a failed diff and a
+/// file system error.
+///
+int llvm::DiffFilesWithTolerance(const sys::PathWithStatus &FileA,
+                                 const sys::PathWithStatus &FileB,
+                                 double AbsTol, double RelTol,
+                                 std::string *Error) {
+  const sys::FileStatus *FileAStat = FileA.getFileStatus(false, Error);
+  if (!FileAStat)
+    return 2;
+  const sys::FileStatus *FileBStat = FileB.getFileStatus(false, Error);
+  if (!FileBStat)
+    return 2;
+
+  // Check for zero length files because some systems croak when you try to
+  // mmap an empty file.
+  size_t A_size = FileAStat->getSize();
+  size_t B_size = FileBStat->getSize();
+
+  // If they are both zero sized then they're the same
+  if (A_size == 0 && B_size == 0)
+    return 0;
+
+  // If only one of them is zero sized then they can't be the same
+  if ((A_size == 0 || B_size == 0)) {
+    if (Error)
+      *Error = "Files differ: one is zero-sized, the other isn't";
+    return 1;
+  }
+
+  // Now its safe to mmap the files into memory because both files
+  // have a non-zero size.
+  OwningPtr<MemoryBuffer> F1;
+  if (error_code ec = MemoryBuffer::getFile(FileA.c_str(), F1)) {
+    if (Error)
+      *Error = ec.message();
+    return 2;
+  }
+  OwningPtr<MemoryBuffer> F2;
+  if (error_code ec = MemoryBuffer::getFile(FileB.c_str(), F2)) {
+    if (Error)
+      *Error = ec.message();
+    return 2;
+  }
+
+  // Okay, now that we opened the files, scan them for the first difference.
+  const char *File1Start = F1->getBufferStart();
+  const char *File2Start = F2->getBufferStart();
+  const char *File1End = F1->getBufferEnd();
+  const char *File2End = F2->getBufferEnd();
+  const char *F1P = File1Start;
+  const char *F2P = File2Start;
+
+  // Are the buffers identical?  Common case: Handle this efficiently.
+  if (A_size == B_size &&
+      std::memcmp(File1Start, File2Start, A_size) == 0)
+    return 0;
+
+  // Otherwise, we are done a tolerances are set.
+  if (AbsTol == 0 && RelTol == 0) {
+    if (Error)
+      *Error = "Files differ without tolerance allowance";
+    return 1;   // Files different!
+  }
+
+  bool CompareFailed = false;
+  while (1) {
+    // Scan for the end of file or next difference.
+    while (F1P < File1End && F2P < File2End && *F1P == *F2P)
+      ++F1P, ++F2P;
+
+    if (F1P >= File1End || F2P >= File2End) break;
+
+    // Okay, we must have found a difference.  Backup to the start of the
+    // current number each stream is at so that we can compare from the
+    // beginning.
+    F1P = BackupNumber(F1P, File1Start);
+    F2P = BackupNumber(F2P, File2Start);
+
+    // Now that we are at the start of the numbers, compare them, exiting if
+    // they don't match.
+    if (CompareNumbers(F1P, F2P, File1End, File2End, AbsTol, RelTol, Error)) {
+      CompareFailed = true;
+      break;
+    }
+  }
+
+  // Okay, we reached the end of file.  If both files are at the end, we
+  // succeeded.
+  bool F1AtEnd = F1P >= File1End;
+  bool F2AtEnd = F2P >= File2End;
+  if (!CompareFailed && (!F1AtEnd || !F2AtEnd)) {
+    // Else, we might have run off the end due to a number: backup and retry.
+    if (F1AtEnd && isNumberChar(F1P[-1])) --F1P;
+    if (F2AtEnd && isNumberChar(F2P[-1])) --F2P;
+    F1P = BackupNumber(F1P, File1Start);
+    F2P = BackupNumber(F2P, File2Start);
+
+    // Now that we are at the start of the numbers, compare them, exiting if
+    // they don't match.
+    if (CompareNumbers(F1P, F2P, File1End, File2End, AbsTol, RelTol, Error))
+      CompareFailed = true;
+
+    // If we found the end, we succeeded.
+    if (F1P < File1End || F2P < File2End)
+      CompareFailed = true;
+  }
+
+  return CompareFailed;
+}
diff --git a/contrib/llvm/lib/Support/FoldingSet.cpp b/contrib/llvm/lib/Support/FoldingSet.cpp
new file mode 100644
index 000000000000..36e33b5aafa3
--- /dev/null
+++ b/contrib/llvm/lib/Support/FoldingSet.cpp
@@ -0,0 +1,424 @@
+//===-- Support/FoldingSet.cpp - Uniquing Hash Set --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a hash set that can be used to remove duplication of
+// nodes in a graph.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstring>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// FoldingSetNodeIDRef Implementation
+
+/// ComputeHash - Compute a strong hash value for this FoldingSetNodeIDRef,
+/// used to lookup the node in the FoldingSetImpl.
+unsigned FoldingSetNodeIDRef::ComputeHash() const {
+  return static_cast<unsigned>(hash_combine_range(Data, Data+Size));
+}
+
+bool FoldingSetNodeIDRef::operator==(FoldingSetNodeIDRef RHS) const {
+  if (Size != RHS.Size) return false;
+  return memcmp(Data, RHS.Data, Size*sizeof(*Data)) == 0;
+}
+
+/// Used to compare the "ordering" of two nodes as defined by the
+/// profiled bits and their ordering defined by memcmp().
+bool FoldingSetNodeIDRef::operator<(FoldingSetNodeIDRef RHS) const {
+  if (Size != RHS.Size)
+    return Size < RHS.Size;
+  return memcmp(Data, RHS.Data, Size*sizeof(*Data)) < 0;
+}
+
+//===----------------------------------------------------------------------===//
+// FoldingSetNodeID Implementation
+
+/// Add* - Add various data types to Bit data.
+///
+void FoldingSetNodeID::AddPointer(const void *Ptr) {
+  // Note: this adds pointers to the hash using sizes and endianness that
+  // depend on the host.  It doesn't matter however, because hashing on
+  // pointer values in inherently unstable.  Nothing  should depend on the 
+  // ordering of nodes in the folding set.
+  Bits.append(reinterpret_cast<unsigned *>(&Ptr),
+              reinterpret_cast<unsigned *>(&Ptr+1));
+}
+void FoldingSetNodeID::AddInteger(signed I) {
+  Bits.push_back(I);
+}
+void FoldingSetNodeID::AddInteger(unsigned I) {
+  Bits.push_back(I);
+}
+void FoldingSetNodeID::AddInteger(long I) {
+  AddInteger((unsigned long)I);
+}
+void FoldingSetNodeID::AddInteger(unsigned long I) {
+  if (sizeof(long) == sizeof(int))
+    AddInteger(unsigned(I));
+  else if (sizeof(long) == sizeof(long long)) {
+    AddInteger((unsigned long long)I);
+  } else {
+    llvm_unreachable("unexpected sizeof(long)");
+  }
+}
+void FoldingSetNodeID::AddInteger(long long I) {
+  AddInteger((unsigned long long)I);
+}
+void FoldingSetNodeID::AddInteger(unsigned long long I) {
+  AddInteger(unsigned(I));
+  if ((uint64_t)(unsigned)I != I)
+    Bits.push_back(unsigned(I >> 32));
+}
+
+void FoldingSetNodeID::AddString(StringRef String) {
+  unsigned Size =  String.size();
+  Bits.push_back(Size);
+  if (!Size) return;
+
+  unsigned Units = Size / 4;
+  unsigned Pos = 0;
+  const unsigned *Base = (const unsigned*) String.data();
+  
+  // If the string is aligned do a bulk transfer.
+  if (!((intptr_t)Base & 3)) {
+    Bits.append(Base, Base + Units);
+    Pos = (Units + 1) * 4;
+  } else {
+    // Otherwise do it the hard way.
+    // To be compatible with above bulk transfer, we need to take endianness
+    // into account.
+    if (sys::isBigEndianHost()) {
+      for (Pos += 4; Pos <= Size; Pos += 4) {
+        unsigned V = ((unsigned char)String[Pos - 4] << 24) |
+                     ((unsigned char)String[Pos - 3] << 16) |
+                     ((unsigned char)String[Pos - 2] << 8) |
+                      (unsigned char)String[Pos - 1];
+        Bits.push_back(V);
+      }
+    } else {
+      assert(sys::isLittleEndianHost() && "Unexpected host endianness");
+      for (Pos += 4; Pos <= Size; Pos += 4) {
+        unsigned V = ((unsigned char)String[Pos - 1] << 24) |
+                     ((unsigned char)String[Pos - 2] << 16) |
+                     ((unsigned char)String[Pos - 3] << 8) |
+                      (unsigned char)String[Pos - 4];
+        Bits.push_back(V);
+      }
+    }
+  }
+  
+  // With the leftover bits.
+  unsigned V = 0;
+  // Pos will have overshot size by 4 - #bytes left over.
+  // No need to take endianness into account here - this is always executed.
+  switch (Pos - Size) {
+  case 1: V = (V << 8) | (unsigned char)String[Size - 3]; // Fall thru.
+  case 2: V = (V << 8) | (unsigned char)String[Size - 2]; // Fall thru.
+  case 3: V = (V << 8) | (unsigned char)String[Size - 1]; break;
+  default: return; // Nothing left.
+  }
+
+  Bits.push_back(V);
+}
+
+// AddNodeID - Adds the Bit data of another ID to *this.
+void FoldingSetNodeID::AddNodeID(const FoldingSetNodeID &ID) {
+  Bits.append(ID.Bits.begin(), ID.Bits.end());
+}
+
+/// ComputeHash - Compute a strong hash value for this FoldingSetNodeID, used to 
+/// lookup the node in the FoldingSetImpl.
+unsigned FoldingSetNodeID::ComputeHash() const {
+  return FoldingSetNodeIDRef(Bits.data(), Bits.size()).ComputeHash();
+}
+
+/// operator== - Used to compare two nodes to each other.
+///
+bool FoldingSetNodeID::operator==(const FoldingSetNodeID &RHS) const {
+  return *this == FoldingSetNodeIDRef(RHS.Bits.data(), RHS.Bits.size());
+}
+
+/// operator== - Used to compare two nodes to each other.
+///
+bool FoldingSetNodeID::operator==(FoldingSetNodeIDRef RHS) const {
+  return FoldingSetNodeIDRef(Bits.data(), Bits.size()) == RHS;
+}
+
+/// Used to compare the "ordering" of two nodes as defined by the
+/// profiled bits and their ordering defined by memcmp().
+bool FoldingSetNodeID::operator<(const FoldingSetNodeID &RHS) const {
+  return *this < FoldingSetNodeIDRef(RHS.Bits.data(), RHS.Bits.size());
+}
+
+bool FoldingSetNodeID::operator<(FoldingSetNodeIDRef RHS) const {
+  return FoldingSetNodeIDRef(Bits.data(), Bits.size()) < RHS;
+}
+
+/// Intern - Copy this node's data to a memory region allocated from the
+/// given allocator and return a FoldingSetNodeIDRef describing the
+/// interned data.
+FoldingSetNodeIDRef
+FoldingSetNodeID::Intern(BumpPtrAllocator &Allocator) const {
+  unsigned *New = Allocator.Allocate<unsigned>(Bits.size());
+  std::uninitialized_copy(Bits.begin(), Bits.end(), New);
+  return FoldingSetNodeIDRef(New, Bits.size());
+}
+
+//===----------------------------------------------------------------------===//
+/// Helper functions for FoldingSetImpl.
+
+/// GetNextPtr - In order to save space, each bucket is a
+/// singly-linked-list. In order to make deletion more efficient, we make
+/// the list circular, so we can delete a node without computing its hash.
+/// The problem with this is that the start of the hash buckets are not
+/// Nodes.  If NextInBucketPtr is a bucket pointer, this method returns null:
+/// use GetBucketPtr when this happens.
+static FoldingSetImpl::Node *GetNextPtr(void *NextInBucketPtr) {
+  // The low bit is set if this is the pointer back to the bucket.
+  if (reinterpret_cast<intptr_t>(NextInBucketPtr) & 1)
+    return 0;
+  
+  return static_cast<FoldingSetImpl::Node*>(NextInBucketPtr);
+}
+
+
+/// testing.
+static void **GetBucketPtr(void *NextInBucketPtr) {
+  intptr_t Ptr = reinterpret_cast<intptr_t>(NextInBucketPtr);
+  assert((Ptr & 1) && "Not a bucket pointer");
+  return reinterpret_cast<void**>(Ptr & ~intptr_t(1));
+}
+
+/// GetBucketFor - Hash the specified node ID and return the hash bucket for
+/// the specified ID.
+static void **GetBucketFor(unsigned Hash, void **Buckets, unsigned NumBuckets) {
+  // NumBuckets is always a power of 2.
+  unsigned BucketNum = Hash & (NumBuckets-1);
+  return Buckets + BucketNum;
+}
+
+/// AllocateBuckets - Allocated initialized bucket memory.
+static void **AllocateBuckets(unsigned NumBuckets) {
+  void **Buckets = static_cast<void**>(calloc(NumBuckets+1, sizeof(void*)));
+  // Set the very last bucket to be a non-null "pointer".
+  Buckets[NumBuckets] = reinterpret_cast<void*>(-1);
+  return Buckets;
+}
+
+//===----------------------------------------------------------------------===//
+// FoldingSetImpl Implementation
+
+FoldingSetImpl::FoldingSetImpl(unsigned Log2InitSize) {
+  assert(5 < Log2InitSize && Log2InitSize < 32 &&
+         "Initial hash table size out of range");
+  NumBuckets = 1 << Log2InitSize;
+  Buckets = AllocateBuckets(NumBuckets);
+  NumNodes = 0;
+}
+FoldingSetImpl::~FoldingSetImpl() {
+  free(Buckets);
+}
+void FoldingSetImpl::clear() {
+  // Set all but the last bucket to null pointers.
+  memset(Buckets, 0, NumBuckets*sizeof(void*));
+
+  // Set the very last bucket to be a non-null "pointer".
+  Buckets[NumBuckets] = reinterpret_cast<void*>(-1);
+
+  // Reset the node count to zero.
+  NumNodes = 0;
+}
+
+/// GrowHashTable - Double the size of the hash table and rehash everything.
+///
+void FoldingSetImpl::GrowHashTable() {
+  void **OldBuckets = Buckets;
+  unsigned OldNumBuckets = NumBuckets;
+  NumBuckets <<= 1;
+  
+  // Clear out new buckets.
+  Buckets = AllocateBuckets(NumBuckets);
+  NumNodes = 0;
+
+  // Walk the old buckets, rehashing nodes into their new place.
+  FoldingSetNodeID TempID;
+  for (unsigned i = 0; i != OldNumBuckets; ++i) {
+    void *Probe = OldBuckets[i];
+    if (!Probe) continue;
+    while (Node *NodeInBucket = GetNextPtr(Probe)) {
+      // Figure out the next link, remove NodeInBucket from the old link.
+      Probe = NodeInBucket->getNextInBucket();
+      NodeInBucket->SetNextInBucket(0);
+
+      // Insert the node into the new bucket, after recomputing the hash.
+      InsertNode(NodeInBucket,
+                 GetBucketFor(ComputeNodeHash(NodeInBucket, TempID),
+                              Buckets, NumBuckets));
+      TempID.clear();
+    }
+  }
+  
+  free(OldBuckets);
+}
+
+/// FindNodeOrInsertPos - Look up the node specified by ID.  If it exists,
+/// return it.  If not, return the insertion token that will make insertion
+/// faster.
+FoldingSetImpl::Node
+*FoldingSetImpl::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
+                                     void *&InsertPos) {
+  unsigned IDHash = ID.ComputeHash();
+  void **Bucket = GetBucketFor(IDHash, Buckets, NumBuckets);
+  void *Probe = *Bucket;
+  
+  InsertPos = 0;
+  
+  FoldingSetNodeID TempID;
+  while (Node *NodeInBucket = GetNextPtr(Probe)) {
+    if (NodeEquals(NodeInBucket, ID, IDHash, TempID))
+      return NodeInBucket;
+    TempID.clear();
+
+    Probe = NodeInBucket->getNextInBucket();
+  }
+  
+  // Didn't find the node, return null with the bucket as the InsertPos.
+  InsertPos = Bucket;
+  return 0;
+}
+
+/// InsertNode - Insert the specified node into the folding set, knowing that it
+/// is not already in the map.  InsertPos must be obtained from 
+/// FindNodeOrInsertPos.
+void FoldingSetImpl::InsertNode(Node *N, void *InsertPos) {
+  assert(N->getNextInBucket() == 0);
+  // Do we need to grow the hashtable?
+  if (NumNodes+1 > NumBuckets*2) {
+    GrowHashTable();
+    FoldingSetNodeID TempID;
+    InsertPos = GetBucketFor(ComputeNodeHash(N, TempID), Buckets, NumBuckets);
+  }
+
+  ++NumNodes;
+  
+  /// The insert position is actually a bucket pointer.
+  void **Bucket = static_cast<void**>(InsertPos);
+  
+  void *Next = *Bucket;
+  
+  // If this is the first insertion into this bucket, its next pointer will be
+  // null.  Pretend as if it pointed to itself, setting the low bit to indicate
+  // that it is a pointer to the bucket.
+  if (Next == 0)
+    Next = reinterpret_cast<void*>(reinterpret_cast<intptr_t>(Bucket)|1);
+
+  // Set the node's next pointer, and make the bucket point to the node.
+  N->SetNextInBucket(Next);
+  *Bucket = N;
+}
+
+/// RemoveNode - Remove a node from the folding set, returning true if one was
+/// removed or false if the node was not in the folding set.
+bool FoldingSetImpl::RemoveNode(Node *N) {
+  // Because each bucket is a circular list, we don't need to compute N's hash
+  // to remove it.
+  void *Ptr = N->getNextInBucket();
+  if (Ptr == 0) return false;  // Not in folding set.
+
+  --NumNodes;
+  N->SetNextInBucket(0);
+
+  // Remember what N originally pointed to, either a bucket or another node.
+  void *NodeNextPtr = Ptr;
+  
+  // Chase around the list until we find the node (or bucket) which points to N.
+  while (true) {
+    if (Node *NodeInBucket = GetNextPtr(Ptr)) {
+      // Advance pointer.
+      Ptr = NodeInBucket->getNextInBucket();
+      
+      // We found a node that points to N, change it to point to N's next node,
+      // removing N from the list.
+      if (Ptr == N) {
+        NodeInBucket->SetNextInBucket(NodeNextPtr);
+        return true;
+      }
+    } else {
+      void **Bucket = GetBucketPtr(Ptr);
+      Ptr = *Bucket;
+      
+      // If we found that the bucket points to N, update the bucket to point to
+      // whatever is next.
+      if (Ptr == N) {
+        *Bucket = NodeNextPtr;
+        return true;
+      }
+    }
+  }
+}
+
+/// GetOrInsertNode - If there is an existing simple Node exactly
+/// equal to the specified node, return it.  Otherwise, insert 'N' and it
+/// instead.
+FoldingSetImpl::Node *FoldingSetImpl::GetOrInsertNode(FoldingSetImpl::Node *N) {
+  FoldingSetNodeID ID;
+  GetNodeProfile(N, ID);
+  void *IP;
+  if (Node *E = FindNodeOrInsertPos(ID, IP))
+    return E;
+  InsertNode(N, IP);
+  return N;
+}
+
+//===----------------------------------------------------------------------===//
+// FoldingSetIteratorImpl Implementation
+
+FoldingSetIteratorImpl::FoldingSetIteratorImpl(void **Bucket) {
+  // Skip to the first non-null non-self-cycle bucket.
+  while (*Bucket != reinterpret_cast<void*>(-1) &&
+         (*Bucket == 0 || GetNextPtr(*Bucket) == 0))
+    ++Bucket;
+  
+  NodePtr = static_cast<FoldingSetNode*>(*Bucket);
+}
+
+void FoldingSetIteratorImpl::advance() {
+  // If there is another link within this bucket, go to it.
+  void *Probe = NodePtr->getNextInBucket();
+
+  if (FoldingSetNode *NextNodeInBucket = GetNextPtr(Probe))
+    NodePtr = NextNodeInBucket;
+  else {
+    // Otherwise, this is the last link in this bucket.  
+    void **Bucket = GetBucketPtr(Probe);
+
+    // Skip to the next non-null non-self-cycle bucket.
+    do {
+      ++Bucket;
+    } while (*Bucket != reinterpret_cast<void*>(-1) &&
+             (*Bucket == 0 || GetNextPtr(*Bucket) == 0));
+    
+    NodePtr = static_cast<FoldingSetNode*>(*Bucket);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// FoldingSetBucketIteratorImpl Implementation
+
+FoldingSetBucketIteratorImpl::FoldingSetBucketIteratorImpl(void **Bucket) {
+  Ptr = (*Bucket == 0 || GetNextPtr(*Bucket) == 0) ? (void*) Bucket : *Bucket;
+}
diff --git a/contrib/llvm/lib/Support/FormattedStream.cpp b/contrib/llvm/lib/Support/FormattedStream.cpp
new file mode 100644
index 000000000000..231ae48759e2
--- /dev/null
+++ b/contrib/llvm/lib/Support/FormattedStream.cpp
@@ -0,0 +1,101 @@
+//===-- llvm/Support/FormattedStream.cpp - Formatted streams ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of formatted_raw_ostream.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include <algorithm>
+
+using namespace llvm;
+
+/// CountColumns - Examine the given char sequence and figure out which
+/// column we end up in after output.
+///
+static unsigned CountColumns(unsigned Column, const char *Ptr, size_t Size) {
+  // Keep track of the current column by scanning the string for
+  // special characters
+
+  for (const char *End = Ptr + Size; Ptr != End; ++Ptr) {
+    ++Column;
+    if (*Ptr == '\n' || *Ptr == '\r')
+      Column = 0;
+    else if (*Ptr == '\t')
+      // Assumes tab stop = 8 characters.
+      Column += (8 - (Column & 0x7)) & 0x7;
+  }
+
+  return Column;
+}
+
+/// ComputeColumn - Examine the current output and figure out which
+/// column we end up in after output.
+void formatted_raw_ostream::ComputeColumn(const char *Ptr, size_t Size) {
+  // If our previous scan pointer is inside the buffer, assume we already
+  // scanned those bytes. This depends on raw_ostream to not change our buffer
+  // in unexpected ways.
+  if (Ptr <= Scanned && Scanned <= Ptr + Size) {
+    // Scan all characters added since our last scan to determine the new
+    // column.
+    ColumnScanned = CountColumns(ColumnScanned, Scanned, 
+                                 Size - (Scanned - Ptr));
+  } else
+    ColumnScanned = CountColumns(ColumnScanned, Ptr, Size);
+
+  // Update the scanning pointer.
+  Scanned = Ptr + Size;
+}
+
+/// PadToColumn - Align the output to some column number.
+///
+/// \param NewCol - The column to move to.
+///
+formatted_raw_ostream &formatted_raw_ostream::PadToColumn(unsigned NewCol) { 
+  // Figure out what's in the buffer and add it to the column count.
+  ComputeColumn(getBufferStart(), GetNumBytesInBuffer());
+
+  // Output spaces until we reach the desired column.
+  indent(std::max(int(NewCol - ColumnScanned), 1));
+  return *this;
+}
+
+void formatted_raw_ostream::write_impl(const char *Ptr, size_t Size) {
+  // Figure out what's in the buffer and add it to the column count.
+  ComputeColumn(Ptr, Size);
+
+  // Write the data to the underlying stream (which is unbuffered, so
+  // the data will be immediately written out).
+  TheStream->write(Ptr, Size);
+
+  // Reset the scanning pointer.
+  Scanned = 0;
+}
+
+/// fouts() - This returns a reference to a formatted_raw_ostream for
+/// standard output.  Use it like: fouts() << "foo" << "bar";
+formatted_raw_ostream &llvm::fouts() {
+  static formatted_raw_ostream S(outs());
+  return S;
+}
+
+/// ferrs() - This returns a reference to a formatted_raw_ostream for
+/// standard error.  Use it like: ferrs() << "foo" << "bar";
+formatted_raw_ostream &llvm::ferrs() {
+  static formatted_raw_ostream S(errs());
+  return S;
+}
+
+/// fdbgs() - This returns a reference to a formatted_raw_ostream for
+/// the debug stream.  Use it like: fdbgs() << "foo" << "bar";
+formatted_raw_ostream &llvm::fdbgs() {
+  static formatted_raw_ostream S(dbgs());
+  return S;
+}
diff --git a/contrib/llvm/lib/Support/GraphWriter.cpp b/contrib/llvm/lib/Support/GraphWriter.cpp
new file mode 100644
index 000000000000..bff182f30e35
--- /dev/null
+++ b/contrib/llvm/lib/Support/GraphWriter.cpp
@@ -0,0 +1,209 @@
+//===-- GraphWriter.cpp - Implements GraphWriter support routines ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements misc. GraphWriter support routines.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+using namespace llvm;
+
+static cl::opt<bool> ViewBackground("view-background", cl::Hidden,
+  cl::desc("Execute graph viewer in the background. Creates tmp file litter."));
+
+std::string llvm::DOT::EscapeString(const std::string &Label) {
+  std::string Str(Label);
+  for (unsigned i = 0; i != Str.length(); ++i)
+  switch (Str[i]) {
+    case '\n':
+      Str.insert(Str.begin()+i, '\\');  // Escape character...
+      ++i;
+      Str[i] = 'n';
+      break;
+    case '\t':
+      Str.insert(Str.begin()+i, ' ');  // Convert to two spaces
+      ++i;
+      Str[i] = ' ';
+      break;
+    case '\\':
+      if (i+1 != Str.length())
+        switch (Str[i+1]) {
+          case 'l': continue; // don't disturb \l
+          case '|': case '{': case '}':
+            Str.erase(Str.begin()+i); continue;
+          default: break;
+        }
+    case '{': case '}':
+    case '<': case '>':
+    case '|': case '"':
+      Str.insert(Str.begin()+i, '\\');  // Escape character...
+      ++i;  // don't infinite loop
+      break;
+  }
+  return Str;
+}
+
+/// \brief Get a color string for this node number. Simply round-robin selects
+/// from a reasonable number of colors.
+StringRef llvm::DOT::getColorString(unsigned ColorNumber) {
+  static const int NumColors = 20;
+  static const char* Colors[NumColors] = {
+    "aaaaaa", "aa0000", "00aa00", "aa5500", "0055ff", "aa00aa", "00aaaa",
+    "555555", "ff5555", "55ff55", "ffff55", "5555ff", "ff55ff", "55ffff",
+    "ffaaaa", "aaffaa", "ffffaa", "aaaaff", "ffaaff", "aaffff"};
+  return Colors[ColorNumber % NumColors];
+}
+
+// Execute the graph viewer. Return true if successful.
+static bool LLVM_ATTRIBUTE_UNUSED
+ExecGraphViewer(const sys::Path &ExecPath, std::vector<const char*> &args,
+                const sys::Path &Filename, bool wait, std::string &ErrMsg) {
+  if (wait) {
+    if (sys::Program::ExecuteAndWait(ExecPath, &args[0],0,0,0,0,&ErrMsg)) {
+      errs() << "Error: " << ErrMsg << "\n";
+      return false;
+    }
+    Filename.eraseFromDisk();
+    errs() << " done. \n";
+  }
+  else {
+    sys::Program::ExecuteNoWait(ExecPath, &args[0],0,0,0,&ErrMsg);
+    errs() << "Remember to erase graph file: " << Filename.str() << "\n";
+  }
+  return true;
+}
+
+void llvm::DisplayGraph(const sys::Path &Filename, bool wait,
+                        GraphProgram::Name program) {
+  wait &= !ViewBackground;
+  std::string ErrMsg;
+#if HAVE_GRAPHVIZ
+  sys::Path Graphviz(LLVM_PATH_GRAPHVIZ);
+
+  std::vector<const char*> args;
+  args.push_back(Graphviz.c_str());
+  args.push_back(Filename.c_str());
+  args.push_back(0);
+
+  errs() << "Running 'Graphviz' program... ";
+  if (!ExecGraphViewer(Graphviz, args, Filename, wait, ErrMsg))
+    return;
+
+#elif HAVE_XDOT_PY
+  std::vector<const char*> args;
+  args.push_back(LLVM_PATH_XDOT_PY);
+  args.push_back(Filename.c_str());
+
+  switch (program) {
+  case GraphProgram::DOT:   args.push_back("-f"); args.push_back("dot"); break;
+  case GraphProgram::FDP:   args.push_back("-f"); args.push_back("fdp"); break;
+  case GraphProgram::NEATO: args.push_back("-f"); args.push_back("neato");break;
+  case GraphProgram::TWOPI: args.push_back("-f"); args.push_back("twopi");break;
+  case GraphProgram::CIRCO: args.push_back("-f"); args.push_back("circo");break;
+  }
+
+  args.push_back(0);
+
+  errs() << "Running 'xdot.py' program... ";
+  if (!ExecGraphViewer(sys::Path(LLVM_PATH_XDOT_PY), args, Filename, wait, ErrMsg))
+    return;
+
+#elif (HAVE_GV && (HAVE_DOT || HAVE_FDP || HAVE_NEATO || \
+                   HAVE_TWOPI || HAVE_CIRCO))
+  sys::Path PSFilename = Filename;
+  PSFilename.appendSuffix("ps");
+
+  sys::Path prog;
+
+  // Set default grapher
+#if HAVE_CIRCO
+  prog = sys::Path(LLVM_PATH_CIRCO);
+#endif
+#if HAVE_TWOPI
+  prog = sys::Path(LLVM_PATH_TWOPI);
+#endif
+#if HAVE_NEATO
+  prog = sys::Path(LLVM_PATH_NEATO);
+#endif
+#if HAVE_FDP
+  prog = sys::Path(LLVM_PATH_FDP);
+#endif
+#if HAVE_DOT
+  prog = sys::Path(LLVM_PATH_DOT);
+#endif
+
+  // Find which program the user wants
+#if HAVE_DOT
+  if (program == GraphProgram::DOT)
+    prog = sys::Path(LLVM_PATH_DOT);
+#endif
+#if (HAVE_FDP)
+  if (program == GraphProgram::FDP)
+    prog = sys::Path(LLVM_PATH_FDP);
+#endif
+#if (HAVE_NEATO)
+  if (program == GraphProgram::NEATO)
+    prog = sys::Path(LLVM_PATH_NEATO);
+#endif
+#if (HAVE_TWOPI)
+  if (program == GraphProgram::TWOPI)
+    prog = sys::Path(LLVM_PATH_TWOPI);
+#endif
+#if (HAVE_CIRCO)
+  if (program == GraphProgram::CIRCO)
+    prog = sys::Path(LLVM_PATH_CIRCO);
+#endif
+
+  std::vector<const char*> args;
+  args.push_back(prog.c_str());
+  args.push_back("-Tps");
+  args.push_back("-Nfontname=Courier");
+  args.push_back("-Gsize=7.5,10");
+  args.push_back(Filename.c_str());
+  args.push_back("-o");
+  args.push_back(PSFilename.c_str());
+  args.push_back(0);
+
+  errs() << "Running '" << prog.str() << "' program... ";
+
+  if (!ExecGraphViewer(prog, args, Filename, wait, ErrMsg))
+    return;
+
+  sys::Path gv(LLVM_PATH_GV);
+  args.clear();
+  args.push_back(gv.c_str());
+  args.push_back(PSFilename.c_str());
+  args.push_back("--spartan");
+  args.push_back(0);
+
+  ErrMsg.clear();
+  if (!ExecGraphViewer(gv, args, PSFilename, wait, ErrMsg))
+    return;
+
+#elif HAVE_DOTTY
+  sys::Path dotty(LLVM_PATH_DOTTY);
+
+  std::vector<const char*> args;
+  args.push_back(dotty.c_str());
+  args.push_back(Filename.c_str());
+  args.push_back(0);
+
+// Dotty spawns another app and doesn't wait until it returns
+#if defined (__MINGW32__) || defined (_WINDOWS)
+  wait = false;
+#endif
+  errs() << "Running 'dotty' program... ";
+  if (!ExecGraphViewer(dotty, args, Filename, wait, ErrMsg))
+    return;
+#endif
+}
diff --git a/contrib/llvm/lib/Support/Hashing.cpp b/contrib/llvm/lib/Support/Hashing.cpp
new file mode 100644
index 000000000000..c69efb7c3cc9
--- /dev/null
+++ b/contrib/llvm/lib/Support/Hashing.cpp
@@ -0,0 +1,29 @@
+//===-------------- lib/Support/Hashing.cpp -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides implementation bits for the LLVM common hashing
+// infrastructure. Documentation and most of the other information is in the
+// header file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Hashing.h"
+
+using namespace llvm;
+
+// Provide a definition and static initializer for the fixed seed. This
+// initializer should always be zero to ensure its value can never appear to be
+// non-zero, even during dynamic initialization.
+size_t llvm::hashing::detail::fixed_seed_override = 0;
+
+// Implement the function for forced setting of the fixed seed.
+// FIXME: Use atomic operations here so that there is no data race.
+void llvm::set_fixed_execution_hash_seed(size_t fixed_value) {
+  hashing::detail::fixed_seed_override = fixed_value;
+}
diff --git a/contrib/llvm/lib/Support/Host.cpp b/contrib/llvm/lib/Support/Host.cpp
new file mode 100644
index 000000000000..73d98d148746
--- /dev/null
+++ b/contrib/llvm/lib/Support/Host.cpp
@@ -0,0 +1,619 @@
+//===-- Host.cpp - Implement OS Host Concept --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This header file implements the operating system Host concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Host.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/DataStream.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string.h>
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Host.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/Host.inc"
+#endif
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#if defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__))
+#include <mach/mach.h>
+#include <mach/mach_host.h>
+#include <mach/host_info.h>
+#include <mach/machine.h>
+#endif
+
+//===----------------------------------------------------------------------===//
+//
+//  Implementations of the CPU detection routines
+//
+//===----------------------------------------------------------------------===//
+
+using namespace llvm;
+
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+ || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+
+/// GetX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments.  If we can't run cpuid on the host, return true.
+static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
+                            unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+  #if defined(__GNUC__)
+    // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
+    asm ("movq\t%%rbx, %%rsi\n\t"
+         "cpuid\n\t"
+         "xchgq\t%%rbx, %%rsi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(_MSC_VER)
+    int registers[4];
+    __cpuid(registers, value);
+    *rEAX = registers[0];
+    *rEBX = registers[1];
+    *rECX = registers[2];
+    *rEDX = registers[3];
+    return false;
+  #else
+    return true;
+  #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+  #if defined(__GNUC__)
+    asm ("movl\t%%ebx, %%esi\n\t"
+         "cpuid\n\t"
+         "xchgl\t%%ebx, %%esi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(_MSC_VER)
+    __asm {
+      mov   eax,value
+      cpuid
+      mov   esi,rEAX
+      mov   dword ptr [esi],eax
+      mov   esi,rEBX
+      mov   dword ptr [esi],ebx
+      mov   esi,rECX
+      mov   dword ptr [esi],ecx
+      mov   esi,rEDX
+      mov   dword ptr [esi],edx
+    }
+    return false;
+// pedantic #else returns to appease -Wunreachable-code (so we don't generate
+// postprocessed code that looks like "return true; return false;")
+  #else
+    return true;
+  #endif
+#else
+  return true;
+#endif
+}
+
+static bool OSHasAVXSupport() {
+#if defined(__GNUC__)
+  // Check xgetbv; this uses a .byte sequence instead of the instruction 
+  // directly because older assemblers do not include support for xgetbv and 
+  // there is no easy way to conditionally compile based on the assembler used.
+  int rEAX, rEDX;
+  __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+#elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219
+  unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+#else
+  int rEAX = 0; // Ensures we return false
+#endif
+  return (rEAX & 6) == 6;
+}
+
+static void DetectX86FamilyModel(unsigned EAX, unsigned &Family,
+                                 unsigned &Model) {
+  Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+  Model  = (EAX >> 4) & 0xf; // Bits 4 - 7
+  if (Family == 6 || Family == 0xf) {
+    if (Family == 0xf)
+      // Examine extended family ID if family ID is F.
+      Family += (EAX >> 20) & 0xff;    // Bits 20 - 27
+    // Examine extended model ID if family ID is 6 or F.
+    Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
+  }
+}
+
+std::string sys::getHostCPUName() {
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  if (GetX86CpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+    return "generic";
+  unsigned Family = 0;
+  unsigned Model  = 0;
+  DetectX86FamilyModel(EAX, Family, Model);
+
+  bool HasSSE3 = (ECX & 0x1);
+  // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV 
+  // indicates that the AVX registers will be saved and restored on context
+  // switch, then we have full AVX support.
+  const unsigned AVXBits = (1 << 27) | (1 << 28);
+  bool HasAVX = ((ECX & AVXBits) == AVXBits) && OSHasAVXSupport();
+  GetX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+  bool Em64T = (EDX >> 29) & 0x1;
+
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+
+  GetX86CpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+  if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+    switch (Family) {
+    case 3:
+      return "i386";
+    case 4:
+      switch (Model) {
+      case 0: // Intel486 DX processors
+      case 1: // Intel486 DX processors
+      case 2: // Intel486 SX processors
+      case 3: // Intel487 processors, IntelDX2 OverDrive processors,
+              // IntelDX2 processors
+      case 4: // Intel486 SL processor
+      case 5: // IntelSX2 processors
+      case 7: // Write-Back Enhanced IntelDX2 processors
+      case 8: // IntelDX4 OverDrive processors, IntelDX4 processors
+      default: return "i486";
+      }
+    case 5:
+      switch (Model) {
+      case  1: // Pentium OverDrive processor for Pentium processor (60, 66),
+               // Pentium processors (60, 66)
+      case  2: // Pentium OverDrive processor for Pentium processor (75, 90,
+               // 100, 120, 133), Pentium processors (75, 90, 100, 120, 133,
+               // 150, 166, 200)
+      case  3: // Pentium OverDrive processors for Intel486 processor-based
+               // systems
+        return "pentium";
+
+      case  4: // Pentium OverDrive processor with MMX technology for Pentium
+               // processor (75, 90, 100, 120, 133), Pentium processor with
+               // MMX technology (166, 200)
+        return "pentium-mmx";
+
+      default: return "pentium";
+      }
+    case 6:
+      switch (Model) {
+      case  1: // Pentium Pro processor
+        return "pentiumpro";
+
+      case  3: // Intel Pentium II OverDrive processor, Pentium II processor,
+               // model 03
+      case  5: // Pentium II processor, model 05, Pentium II Xeon processor,
+               // model 05, and Intel Celeron processor, model 05
+      case  6: // Celeron processor, model 06
+        return "pentium2";
+
+      case  7: // Pentium III processor, model 07, and Pentium III Xeon
+               // processor, model 07
+      case  8: // Pentium III processor, model 08, Pentium III Xeon processor,
+               // model 08, and Celeron processor, model 08
+      case 10: // Pentium III Xeon processor, model 0Ah
+      case 11: // Pentium III processor, model 0Bh
+        return "pentium3";
+
+      case  9: // Intel Pentium M processor, Intel Celeron M processor model 09.
+      case 13: // Intel Pentium M processor, Intel Celeron M processor, model
+               // 0Dh. All processors are manufactured using the 90 nm process.
+        return "pentium-m";
+
+      case 14: // Intel Core Duo processor, Intel Core Solo processor, model
+               // 0Eh. All processors are manufactured using the 65 nm process.
+        return "yonah";
+
+      case 15: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile
+               // processor, Intel Core 2 Quad processor, Intel Core 2 Quad
+               // mobile processor, Intel Core 2 Extreme processor, Intel
+               // Pentium Dual-Core processor, Intel Xeon processor, model
+               // 0Fh. All processors are manufactured using the 65 nm process.
+      case 22: // Intel Celeron processor model 16h. All processors are
+               // manufactured using the 65 nm process
+        return "core2";
+
+      case 21: // Intel EP80579 Integrated Processor and Intel EP80579
+               // Integrated Processor with Intel QuickAssist Technology
+        return "i686"; // FIXME: ???
+
+      case 23: // Intel Core 2 Extreme processor, Intel Xeon processor, model
+               // 17h. All processors are manufactured using the 45 nm process.
+               //
+               // 45nm: Penryn , Wolfdale, Yorkfield (XE)
+        return "penryn";
+
+      case 26: // Intel Core i7 processor and Intel Xeon processor. All
+               // processors are manufactured using the 45 nm process.
+      case 29: // Intel Xeon processor MP. All processors are manufactured using
+               // the 45 nm process.
+      case 30: // Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz.
+               // As found in a Summer 2010 model iMac.
+      case 37: // Intel Core i7, laptop version.
+      case 44: // Intel Core i7 processor and Intel Xeon processor. All
+               // processors are manufactured using the 32 nm process.
+      case 46: // Nehalem EX
+      case 47: // Westmere EX
+        return "corei7";
+
+      // SandyBridge:
+      case 42: // Intel Core i7 processor. All processors are manufactured
+               // using the 32 nm process.
+      case 45:
+        // Not all Sandy Bridge processors support AVX (such as the Pentium
+        // versions instead of the i7 versions).
+        return HasAVX ? "corei7-avx" : "corei7";
+
+      // Ivy Bridge:
+      case 58:
+        // Not all Ivy Bridge processors support AVX (such as the Pentium
+        // versions instead of the i7 versions).
+        return HasAVX ? "core-avx-i" : "corei7";
+
+      case 28: // Most 45 nm Intel Atom processors
+      case 38: // 45 nm Atom Lincroft
+      case 39: // 32 nm Atom Medfield
+      case 53: // 32 nm Atom Midview
+      case 54: // 32 nm Atom Midview
+        return "atom";
+
+      default: return (Em64T) ? "x86-64" : "i686";
+      }
+    case 15: {
+      switch (Model) {
+      case  0: // Pentium 4 processor, Intel Xeon processor. All processors are
+               // model 00h and manufactured using the 0.18 micron process.
+      case  1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon
+               // processor MP, and Intel Celeron processor. All processors are
+               // model 01h and manufactured using the 0.18 micron process.
+      case  2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M,
+               // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron
+               // processor, and Mobile Intel Celeron processor. All processors
+               // are model 02h and manufactured using the 0.13 micron process.
+        return (Em64T) ? "x86-64" : "pentium4";
+
+      case  3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D
+               // processor. All processors are model 03h and manufactured using
+               // the 90 nm process.
+      case  4: // Pentium 4 processor, Pentium 4 processor Extreme Edition,
+               // Pentium D processor, Intel Xeon processor, Intel Xeon
+               // processor MP, Intel Celeron D processor. All processors are
+               // model 04h and manufactured using the 90 nm process.
+      case  6: // Pentium 4 processor, Pentium D processor, Pentium processor
+               // Extreme Edition, Intel Xeon processor, Intel Xeon processor
+               // MP, Intel Celeron D processor. All processors are model 06h
+               // and manufactured using the 65 nm process.
+        return (Em64T) ? "nocona" : "prescott";
+
+      default:
+        return (Em64T) ? "x86-64" : "pentium4";
+      }
+    }
+
+    default:
+      return "generic";
+    }
+  } else if (memcmp(text.c, "AuthenticAMD", 12) == 0) {
+    // FIXME: this poorly matches the generated SubtargetFeatureKV table.  There
+    // appears to be no way to generate the wide variety of AMD-specific targets
+    // from the information returned from CPUID.
+    switch (Family) {
+      case 4:
+        return "i486";
+      case 5:
+        switch (Model) {
+        case 6:
+        case 7:  return "k6";
+        case 8:  return "k6-2";
+        case 9:
+        case 13: return "k6-3";
+        case 10: return "geode";
+        default: return "pentium";
+        }
+      case 6:
+        switch (Model) {
+        case 4:  return "athlon-tbird";
+        case 6:
+        case 7:
+        case 8:  return "athlon-mp";
+        case 10: return "athlon-xp";
+        default: return "athlon";
+        }
+      case 15:
+        if (HasSSE3)
+          return "k8-sse3";
+        switch (Model) {
+        case 1:  return "opteron";
+        case 5:  return "athlon-fx"; // also opteron
+        default: return "athlon64";
+        }
+      case 16:
+        return "amdfam10";
+      case 20:
+        return "btver1";
+      case 21:
+        if (Model <= 15)
+          return "bdver1";
+        else if (Model <= 31)
+          return "bdver2";
+    default:
+      return "generic";
+    }
+  }
+  return "generic";
+}
+#elif defined(__APPLE__) && (defined(__ppc__) || defined(__powerpc__))
+std::string sys::getHostCPUName() {
+  host_basic_info_data_t hostInfo;
+  mach_msg_type_number_t infoCount;
+
+  infoCount = HOST_BASIC_INFO_COUNT;
+  host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, 
+            &infoCount);
+            
+  if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic";
+
+  switch(hostInfo.cpu_subtype) {
+  case CPU_SUBTYPE_POWERPC_601:   return "601";
+  case CPU_SUBTYPE_POWERPC_602:   return "602";
+  case CPU_SUBTYPE_POWERPC_603:   return "603";
+  case CPU_SUBTYPE_POWERPC_603e:  return "603e";
+  case CPU_SUBTYPE_POWERPC_603ev: return "603ev";
+  case CPU_SUBTYPE_POWERPC_604:   return "604";
+  case CPU_SUBTYPE_POWERPC_604e:  return "604e";
+  case CPU_SUBTYPE_POWERPC_620:   return "620";
+  case CPU_SUBTYPE_POWERPC_750:   return "750";
+  case CPU_SUBTYPE_POWERPC_7400:  return "7400";
+  case CPU_SUBTYPE_POWERPC_7450:  return "7450";
+  case CPU_SUBTYPE_POWERPC_970:   return "970";
+  default: ;
+  }
+  
+  return "generic";
+}
+#elif defined(__linux__) && (defined(__ppc__) || defined(__powerpc__))
+std::string sys::getHostCPUName() {
+  // Access to the Processor Version Register (PVR) on PowerPC is privileged,
+  // and so we must use an operating-system interface to determine the current
+  // processor type. On Linux, this is exposed through the /proc/cpuinfo file.
+  const char *generic = "generic";
+
+  // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
+  // memory buffer because the 'file' has 0 size (it can be read from only
+  // as a stream).
+
+  std::string Err;
+  DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
+  if (!DS) {
+    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
+    return generic;
+  }
+
+  // The cpu line is second (after the 'processor: 0' line), so if this
+  // buffer is too small then something has changed (or is wrong).
+  char buffer[1024];
+  size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
+  delete DS;
+
+  const char *CPUInfoStart = buffer;
+  const char *CPUInfoEnd = buffer + CPUInfoSize;
+
+  const char *CIP = CPUInfoStart;
+
+  const char *CPUStart = 0;
+  size_t CPULen = 0;
+
+  // We need to find the first line which starts with cpu, spaces, and a colon.
+  // After the colon, there may be some additional spaces and then the cpu type.
+  while (CIP < CPUInfoEnd && CPUStart == 0) {
+    if (CIP < CPUInfoEnd && *CIP == '\n')
+      ++CIP;
+
+    if (CIP < CPUInfoEnd && *CIP == 'c') {
+      ++CIP;
+      if (CIP < CPUInfoEnd && *CIP == 'p') {
+        ++CIP;
+        if (CIP < CPUInfoEnd && *CIP == 'u') {
+          ++CIP;
+          while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+            ++CIP;
+  
+          if (CIP < CPUInfoEnd && *CIP == ':') {
+            ++CIP;
+            while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+              ++CIP;
+  
+            if (CIP < CPUInfoEnd) {
+              CPUStart = CIP;
+              while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' &&
+                                          *CIP != ',' && *CIP != '\n'))
+                ++CIP;
+              CPULen = CIP - CPUStart;
+            }
+          }
+        }
+      }
+    }
+
+    if (CPUStart == 0)
+      while (CIP < CPUInfoEnd && *CIP != '\n')
+        ++CIP;
+  }
+
+  if (CPUStart == 0)
+    return generic;
+
+  return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
+    .Case("604e", "604e")
+    .Case("604", "604")
+    .Case("7400", "7400")
+    .Case("7410", "7400")
+    .Case("7447", "7400")
+    .Case("7455", "7450")
+    .Case("G4", "g4")
+    .Case("POWER4", "970")
+    .Case("PPC970FX", "970")
+    .Case("PPC970MP", "970")
+    .Case("G5", "g5")
+    .Case("POWER5", "g5")
+    .Case("A2", "a2")
+    .Case("POWER6", "pwr6")
+    .Case("POWER7", "pwr7")
+    .Default(generic);
+}
+#elif defined(__linux__) && defined(__arm__)
+std::string sys::getHostCPUName() {
+  // The cpuid register on arm is not accessible from user space. On Linux,
+  // it is exposed through the /proc/cpuinfo file.
+  // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
+  // memory buffer because the 'file' has 0 size (it can be read from only
+  // as a stream).
+
+  std::string Err;
+  DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
+  if (!DS) {
+    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
+    return "generic";
+  }
+
+  // Read 1024 bytes from /proc/cpuinfo, which should contain the CPU part line
+  // in all cases.
+  char buffer[1024];
+  size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
+  delete DS;
+
+  StringRef Str(buffer, CPUInfoSize);
+
+  SmallVector<StringRef, 32> Lines;
+  Str.split(Lines, "\n");
+
+  // Look for the CPU implementer line.
+  StringRef Implementer;
+  for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+    if (Lines[I].startswith("CPU implementer"))
+      Implementer = Lines[I].substr(15).ltrim("\t :");
+
+  if (Implementer == "0x41") // ARM Ltd.
+    // Look for the CPU part line.
+    for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+      if (Lines[I].startswith("CPU part"))
+        // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
+        // values correspond to the "Part number" in the CP15/c0 register. The
+        // contents are specified in the various processor manuals.
+        return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+          .Case("0x926", "arm926ej-s")
+          .Case("0xb02", "mpcore")
+          .Case("0xb36", "arm1136j-s")
+          .Case("0xb56", "arm1156t2-s")
+          .Case("0xb76", "arm1176jz-s")
+          .Case("0xc08", "cortex-a8")
+          .Case("0xc09", "cortex-a9")
+          .Case("0xc0f", "cortex-a15")
+          .Case("0xc20", "cortex-m0")
+          .Case("0xc23", "cortex-m3")
+          .Case("0xc24", "cortex-m4")
+          .Default("generic");
+
+  return "generic";
+}
+#else
+std::string sys::getHostCPUName() {
+  return "generic";
+}
+#endif
+
+#if defined(__linux__) && defined(__arm__)
+bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
+  std::string Err;
+  DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
+  if (!DS) {
+    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
+    return false;
+  }
+
+  // Read 1024 bytes from /proc/cpuinfo, which should contain the Features line
+  // in all cases.
+  char buffer[1024];
+  size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
+  delete DS;
+
+  StringRef Str(buffer, CPUInfoSize);
+
+  SmallVector<StringRef, 32> Lines;
+  Str.split(Lines, "\n");
+
+  // Look for the CPU implementer line.
+  StringRef Implementer;
+  for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+    if (Lines[I].startswith("CPU implementer"))
+      Implementer = Lines[I].substr(15).ltrim("\t :");
+
+  if (Implementer == "0x41") { // ARM Ltd.
+    SmallVector<StringRef, 32> CPUFeatures;
+
+    // Look for the CPU features.
+    for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+      if (Lines[I].startswith("Features")) {
+        Lines[I].split(CPUFeatures, " ");
+        break;
+      }
+
+    for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
+      StringRef LLVMFeatureStr = StringSwitch<StringRef>(CPUFeatures[I])
+        .Case("half", "fp16")
+        .Case("neon", "neon")
+        .Case("vfpv3", "vfp3")
+        .Case("vfpv3d16", "d16")
+        .Case("vfpv4", "vfp4")
+        .Case("idiva", "hwdiv-arm")
+        .Case("idivt", "hwdiv")
+        .Default("");
+
+      if (LLVMFeatureStr != "")
+        Features.GetOrCreateValue(LLVMFeatureStr).setValue(true);
+    }
+
+    return true;
+  }
+
+  return false;
+}
+#else
+bool sys::getHostCPUFeatures(StringMap<bool> &Features){
+  return false;
+}
+#endif
+
+std::string sys::getProcessTriple() {
+  Triple PT(LLVM_HOSTTRIPLE);
+
+  if (sizeof(void *) == 8 && PT.isArch32Bit())
+    PT = PT.get64BitArchVariant();
+  if (sizeof(void *) == 4 && PT.isArch64Bit())
+    PT = PT.get32BitArchVariant();
+
+  return PT.str();
+}
diff --git a/contrib/llvm/lib/Support/IncludeFile.cpp b/contrib/llvm/lib/Support/IncludeFile.cpp
new file mode 100644
index 000000000000..e67acb3d1d67
--- /dev/null
+++ b/contrib/llvm/lib/Support/IncludeFile.cpp
@@ -0,0 +1,20 @@
+//===- lib/Support/IncludeFile.cpp - Ensure Linking Of Implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IncludeFile constructor.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/IncludeFile.h"
+
+using namespace llvm;
+
+// This constructor is used to ensure linking of other modules. See the
+// llvm/Support/IncludeFile.h header for details.
+IncludeFile::IncludeFile(const void*) {}
diff --git a/contrib/llvm/lib/Support/IntEqClasses.cpp b/contrib/llvm/lib/Support/IntEqClasses.cpp
new file mode 100644
index 000000000000..11344956e4c9
--- /dev/null
+++ b/contrib/llvm/lib/Support/IntEqClasses.cpp
@@ -0,0 +1,70 @@
+//===-- llvm/ADT/IntEqClasses.cpp - Equivalence Classes of Integers -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Equivalence classes for small integers. This is a mapping of the integers
+// 0 .. N-1 into M equivalence classes numbered 0 .. M-1.
+//
+// Initially each integer has its own equivalence class. Classes are joined by
+// passing a representative member of each class to join().
+//
+// Once the classes are built, compress() will number them 0 .. M-1 and prevent
+// further changes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/IntEqClasses.h"
+
+using namespace llvm;
+
+void IntEqClasses::grow(unsigned N) {
+  assert(NumClasses == 0 && "grow() called after compress().");
+  EC.reserve(N);
+  while (EC.size() < N)
+    EC.push_back(EC.size());
+}
+
+void IntEqClasses::join(unsigned a, unsigned b) {
+  assert(NumClasses == 0 && "join() called after compress().");
+  unsigned eca = EC[a];
+  unsigned ecb = EC[b];
+  // Update pointers while searching for the leaders, compressing the paths
+  // incrementally. The larger leader will eventually be updated, joining the
+  // classes.
+  while (eca != ecb)
+    if (eca < ecb)
+      EC[b] = eca, b = ecb, ecb = EC[b];
+    else
+      EC[a] = ecb, a = eca, eca = EC[a];
+}
+
+unsigned IntEqClasses::findLeader(unsigned a) const {
+  assert(NumClasses == 0 && "findLeader() called after compress().");
+  while (a != EC[a])
+    a = EC[a];
+  return a;
+}
+
+void IntEqClasses::compress() {
+  if (NumClasses)
+    return;
+  for (unsigned i = 0, e = EC.size(); i != e; ++i)
+    EC[i] = (EC[i] == i) ? NumClasses++ : EC[EC[i]];
+}
+
+void IntEqClasses::uncompress() {
+  if (!NumClasses)
+    return;
+  SmallVector<unsigned, 8> Leader;
+  for (unsigned i = 0, e = EC.size(); i != e; ++i)
+    if (EC[i] < Leader.size())
+      EC[i] = Leader[EC[i]];
+    else
+      Leader.push_back(EC[i] = i);
+  NumClasses = 0;
+}
diff --git a/contrib/llvm/lib/Support/IntervalMap.cpp b/contrib/llvm/lib/Support/IntervalMap.cpp
new file mode 100644
index 000000000000..4dfcc404ca42
--- /dev/null
+++ b/contrib/llvm/lib/Support/IntervalMap.cpp
@@ -0,0 +1,161 @@
+//===- lib/Support/IntervalMap.cpp - A sorted interval map ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the few non-templated functions in IntervalMap.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/IntervalMap.h"
+
+namespace llvm {
+namespace IntervalMapImpl {
+
+void Path::replaceRoot(void *Root, unsigned Size, IdxPair Offsets) {
+  assert(!path.empty() && "Can't replace missing root");
+  path.front() = Entry(Root, Size, Offsets.first);
+  path.insert(path.begin() + 1, Entry(subtree(0), Offsets.second));
+}
+
+NodeRef Path::getLeftSibling(unsigned Level) const {
+  // The root has no siblings.
+  if (Level == 0)
+    return NodeRef();
+
+  // Go up the tree until we can go left.
+  unsigned l = Level - 1;
+  while (l && path[l].offset == 0)
+    --l;
+
+  // We can't go left.
+  if (path[l].offset == 0)
+    return NodeRef();
+
+  // NR is the subtree containing our left sibling.
+  NodeRef NR = path[l].subtree(path[l].offset - 1);
+
+  // Keep right all the way down.
+  for (++l; l != Level; ++l)
+    NR = NR.subtree(NR.size() - 1);
+  return NR;
+}
+
+void Path::moveLeft(unsigned Level) {
+  assert(Level != 0 && "Cannot move the root node");
+
+  // Go up the tree until we can go left.
+  unsigned l = 0;
+  if (valid()) {
+    l = Level - 1;
+    while (path[l].offset == 0) {
+      assert(l != 0 && "Cannot move beyond begin()");
+      --l;
+    }
+  } else if (height() < Level)
+    // end() may have created a height=0 path.
+    path.resize(Level + 1, Entry(0, 0, 0));
+
+  // NR is the subtree containing our left sibling.
+  --path[l].offset;
+  NodeRef NR = subtree(l);
+
+  // Get the rightmost node in the subtree.
+  for (++l; l != Level; ++l) {
+    path[l] = Entry(NR, NR.size() - 1);
+    NR = NR.subtree(NR.size() - 1);
+  }
+  path[l] = Entry(NR, NR.size() - 1);
+}
+
+NodeRef Path::getRightSibling(unsigned Level) const {
+  // The root has no siblings.
+  if (Level == 0)
+    return NodeRef();
+
+  // Go up the tree until we can go right.
+  unsigned l = Level - 1;
+  while (l && atLastEntry(l))
+    --l;
+
+  // We can't go right.
+  if (atLastEntry(l))
+    return NodeRef();
+
+  // NR is the subtree containing our right sibling.
+  NodeRef NR = path[l].subtree(path[l].offset + 1);
+
+  // Keep left all the way down.
+  for (++l; l != Level; ++l)
+    NR = NR.subtree(0);
+  return NR;
+}
+
+void Path::moveRight(unsigned Level) {
+  assert(Level != 0 && "Cannot move the root node");
+
+  // Go up the tree until we can go right.
+  unsigned l = Level - 1;
+  while (l && atLastEntry(l))
+    --l;
+
+  // NR is the subtree containing our right sibling. If we hit end(), we have
+  // offset(0) == node(0).size().
+  if (++path[l].offset == path[l].size)
+    return;
+  NodeRef NR = subtree(l);
+
+  for (++l; l != Level; ++l) {
+    path[l] = Entry(NR, 0);
+    NR = NR.subtree(0);
+  }
+  path[l] = Entry(NR, 0);
+}
+
+
+IdxPair distribute(unsigned Nodes, unsigned Elements, unsigned Capacity,
+                   const unsigned *CurSize, unsigned NewSize[],
+                   unsigned Position, bool Grow) {
+  assert(Elements + Grow <= Nodes * Capacity && "Not enough room for elements");
+  assert(Position <= Elements && "Invalid position");
+  if (!Nodes)
+    return IdxPair();
+
+  // Trivial algorithm: left-leaning even distribution.
+  const unsigned PerNode = (Elements + Grow) / Nodes;
+  const unsigned Extra = (Elements + Grow) % Nodes;
+  IdxPair PosPair = IdxPair(Nodes, 0);
+  unsigned Sum = 0;
+  for (unsigned n = 0; n != Nodes; ++n) {
+    Sum += NewSize[n] = PerNode + (n < Extra);
+    if (PosPair.first == Nodes && Sum > Position)
+      PosPair = IdxPair(n, Position - (Sum - NewSize[n]));
+  }
+  assert(Sum == Elements + Grow && "Bad distribution sum");
+
+  // Subtract the Grow element that was added.
+  if (Grow) {
+    assert(PosPair.first < Nodes && "Bad algebra");
+    assert(NewSize[PosPair.first] && "Too few elements to need Grow");
+    --NewSize[PosPair.first];
+  }
+
+#ifndef NDEBUG
+  Sum = 0;
+  for (unsigned n = 0; n != Nodes; ++n) {
+    assert(NewSize[n] <= Capacity && "Overallocated node");
+    Sum += NewSize[n];
+  }
+  assert(Sum == Elements && "Bad distribution sum");
+#endif
+
+  return PosPair;
+}
+
+} // namespace IntervalMapImpl
+} // namespace llvm
+
diff --git a/contrib/llvm/lib/Support/IntrusiveRefCntPtr.cpp b/contrib/llvm/lib/Support/IntrusiveRefCntPtr.cpp
new file mode 100644
index 000000000000..a8b45593ae70
--- /dev/null
+++ b/contrib/llvm/lib/Support/IntrusiveRefCntPtr.cpp
@@ -0,0 +1,14 @@
+//== IntrusiveRefCntPtr.cpp - Smart Refcounting Pointer ----------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
+
+using namespace llvm;
+
+void RefCountedBaseVPTR::anchor() { }
diff --git a/contrib/llvm/lib/Support/IsInf.cpp b/contrib/llvm/lib/Support/IsInf.cpp
new file mode 100644
index 000000000000..d6da0c99e8d8
--- /dev/null
+++ b/contrib/llvm/lib/Support/IsInf.cpp
@@ -0,0 +1,49 @@
+//===-- IsInf.cpp - Platform-independent wrapper around C99 isinf() -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Platform-independent wrapper around C99 isinf()
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+
+#if HAVE_ISINF_IN_MATH_H
+# include <math.h>
+#elif HAVE_ISINF_IN_CMATH
+# include <cmath>
+#elif HAVE_STD_ISINF_IN_CMATH
+# include <cmath>
+using std::isinf;
+#elif HAVE_FINITE_IN_IEEEFP_H
+// A handy workaround I found at http://www.unixguide.net/sun/faq ...
+// apparently this has been a problem with Solaris for years.
+# include <ieeefp.h>
+static int isinf(double x) { return !finite(x) && x==x; }
+#elif defined(_MSC_VER)
+#include <float.h>
+#define isinf(X) (!_finite(X))
+#elif defined(_AIX) && defined(__GNUC__)
+// GCC's fixincludes seems to be removing the isinf() declaration from the
+// system header /usr/include/math.h
+# include <math.h>
+static int isinf(double x) { return !finite(x) && x==x; }
+#elif defined(__hpux)
+// HP-UX is "special"
+#include <math.h>
+static int isinf(double x) { return ((x) == INFINITY) || ((x) == -INFINITY); }
+#else
+# error "Don't know how to get isinf()"
+#endif
+
+namespace llvm {
+
+int IsInf(float f)  { return isinf(f); }
+int IsInf(double d) { return isinf(d); }
+
+} // end namespace llvm;
diff --git a/contrib/llvm/lib/Support/IsNAN.cpp b/contrib/llvm/lib/Support/IsNAN.cpp
new file mode 100644
index 000000000000..bdfdfbf3155d
--- /dev/null
+++ b/contrib/llvm/lib/Support/IsNAN.cpp
@@ -0,0 +1,33 @@
+//===-- IsNAN.cpp ---------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Platform-independent wrapper around C99 isnan().
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+
+#if HAVE_ISNAN_IN_MATH_H
+# include <math.h>
+#elif HAVE_ISNAN_IN_CMATH
+# include <cmath>
+#elif HAVE_STD_ISNAN_IN_CMATH
+# include <cmath>
+using std::isnan;
+#elif defined(_MSC_VER)
+#include <float.h>
+#define isnan _isnan
+#else
+# error "Don't know how to get isnan()"
+#endif
+
+namespace llvm {
+  int IsNAN(float f)  { return isnan(f); }
+  int IsNAN(double d) { return isnan(d); }
+} // end namespace llvm;
diff --git a/contrib/llvm/lib/Support/Locale.cpp b/contrib/llvm/lib/Support/Locale.cpp
new file mode 100644
index 000000000000..17b9b6c47d60
--- /dev/null
+++ b/contrib/llvm/lib/Support/Locale.cpp
@@ -0,0 +1,10 @@
+#include "llvm/Support/Locale.h"
+#include "llvm/Config/config.h"
+
+#ifdef __APPLE__
+#include "LocaleXlocale.inc"
+#elif LLVM_ON_WIN32
+#include "LocaleWindows.inc"
+#else
+#include "LocaleGeneric.inc"
+#endif
diff --git a/contrib/llvm/lib/Support/LocaleGeneric.inc b/contrib/llvm/lib/Support/LocaleGeneric.inc
new file mode 100644
index 000000000000..278deee3e4dd
--- /dev/null
+++ b/contrib/llvm/lib/Support/LocaleGeneric.inc
@@ -0,0 +1,17 @@
+#include <cwctype>
+
+namespace llvm {
+namespace sys {
+namespace locale {
+
+int columnWidth(StringRef s) {
+    return s.size();
+}
+
+bool isPrint(int c) {
+    return iswprint(c);
+}
+
+}
+}
+}
diff --git a/contrib/llvm/lib/Support/LocaleWindows.inc b/contrib/llvm/lib/Support/LocaleWindows.inc
new file mode 100644
index 000000000000..28e429c0cb7d
--- /dev/null
+++ b/contrib/llvm/lib/Support/LocaleWindows.inc
@@ -0,0 +1,15 @@
+namespace llvm {
+namespace sys {
+namespace locale {
+
+int columnWidth(StringRef s) {
+    return s.size();
+}
+
+bool isPrint(int c) {
+    return ' ' <= c && c <= '~';
+}
+
+}
+}
+}
diff --git a/contrib/llvm/lib/Support/LocaleXlocale.inc b/contrib/llvm/lib/Support/LocaleXlocale.inc
new file mode 100644
index 000000000000..389fe3d1d4fd
--- /dev/null
+++ b/contrib/llvm/lib/Support/LocaleXlocale.inc
@@ -0,0 +1,61 @@
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/ManagedStatic.h"
+#include <cassert>
+#include <xlocale.h>
+
+
+namespace {
+  struct locale_holder {
+    locale_holder()
+    : l(newlocale(LC_CTYPE_MASK,"en_US.UTF-8",LC_GLOBAL_LOCALE))
+    {
+      assert(NULL!=l);
+    }
+    ~locale_holder() {
+      freelocale(l);
+    }
+
+    int mbswidth(llvm::SmallString<16> s) const {
+       // this implementation assumes no '\0' in s
+      assert(s.size()==strlen(s.c_str()));
+
+      size_t size = mbstowcs_l(NULL,s.c_str(),0,l);
+      assert(size!=(size_t)-1);
+      if (size==0)
+        return 0;
+      llvm::SmallVector<wchar_t,200> ws(size);
+      size = mbstowcs_l(&ws[0],s.c_str(),ws.size(),l);
+      assert(ws.size()==size);
+      return wcswidth_l(&ws[0],ws.size(),l);
+    }
+
+    int isprint(int c) const {
+      return iswprint_l(c,l);
+    }
+
+  private:
+
+    locale_t l;
+  };
+
+  llvm::ManagedStatic<locale_holder> l;
+}
+
+namespace llvm {
+namespace sys {
+namespace locale {
+
+int columnWidth(StringRef s) {
+  int width = l->mbswidth(s);
+  assert(width>=0);
+  return width;
+}
+
+bool isPrint(int c) {
+  return l->isprint(c);
+}
+
+}
+}
+}
diff --git a/contrib/llvm/lib/Support/LockFileManager.cpp b/contrib/llvm/lib/Support/LockFileManager.cpp
new file mode 100644
index 000000000000..92d8b83cf94e
--- /dev/null
+++ b/contrib/llvm/lib/Support/LockFileManager.cpp
@@ -0,0 +1,226 @@
+//===--- LockFileManager.cpp - File-level Locking Utility------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Support/LockFileManager.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include <fstream>
+#include <sys/stat.h>
+#include <sys/types.h>
+#if LLVM_ON_WIN32
+#include <windows.h>
+#endif
+#if LLVM_ON_UNIX
+#include <unistd.h>
+#endif
+using namespace llvm;
+
+/// \brief Attempt to read the lock file with the given name, if it exists.
+///
+/// \param LockFileName The name of the lock file to read.
+///
+/// \returns The process ID of the process that owns this lock file
+Optional<std::pair<std::string, int> >
+LockFileManager::readLockFile(StringRef LockFileName) {
+  // Check whether the lock file exists. If not, clearly there's nothing
+  // to read, so we just return.
+  bool Exists = false;
+  if (sys::fs::exists(LockFileName, Exists) || !Exists)
+    return None;
+
+  // Read the owning host and PID out of the lock file. If it appears that the
+  // owning process is dead, the lock file is invalid.
+  int PID = 0;
+  std::string Hostname;
+  std::ifstream Input(LockFileName.str().c_str());
+  if (Input >> Hostname >> PID && PID > 0 &&
+      processStillExecuting(Hostname, PID))
+    return std::make_pair(Hostname, PID);
+
+  // Delete the lock file. It's invalid anyway.
+  bool Existed;
+  sys::fs::remove(LockFileName, Existed);
+  return None;
+}
+
+bool LockFileManager::processStillExecuting(StringRef Hostname, int PID) {
+#if LLVM_ON_UNIX && !defined(__ANDROID__)
+  char MyHostname[256];
+  MyHostname[255] = 0;
+  MyHostname[0] = 0;
+  gethostname(MyHostname, 255);
+  // Check whether the process is dead. If so, we're done.
+  if (MyHostname == Hostname && getsid(PID) == -1 && errno == ESRCH)
+    return false;
+#endif
+
+  return true;
+}
+
+LockFileManager::LockFileManager(StringRef FileName)
+{
+  this->FileName = FileName;
+  LockFileName = FileName;
+  LockFileName += ".lock";
+
+  // If the lock file already exists, don't bother to try to create our own
+  // lock file; it won't work anyway. Just figure out who owns this lock file.
+  if ((Owner = readLockFile(LockFileName)))
+    return;
+
+  // Create a lock file that is unique to this instance.
+  UniqueLockFileName = LockFileName;
+  UniqueLockFileName += "-%%%%%%%%";
+  int UniqueLockFileID;
+  if (error_code EC
+        = sys::fs::unique_file(UniqueLockFileName.str(),
+                                     UniqueLockFileID,
+                                     UniqueLockFileName,
+                                     /*makeAbsolute=*/false)) {
+    Error = EC;
+    return;
+  }
+
+  // Write our process ID to our unique lock file.
+  {
+    raw_fd_ostream Out(UniqueLockFileID, /*shouldClose=*/true);
+
+#if LLVM_ON_UNIX
+    // FIXME: move getpid() call into LLVM
+    char hostname[256];
+    hostname[255] = 0;
+    hostname[0] = 0;
+    gethostname(hostname, 255);
+    Out << hostname << ' ' << getpid();
+#else
+    Out << "localhost 1";
+#endif
+    Out.close();
+
+    if (Out.has_error()) {
+      // We failed to write out PID, so make up an excuse, remove the
+      // unique lock file, and fail.
+      Error = make_error_code(errc::no_space_on_device);
+      bool Existed;
+      sys::fs::remove(UniqueLockFileName.c_str(), Existed);
+      return;
+    }
+  }
+
+  // Create a hard link from the lock file name. If this succeeds, we're done.
+  error_code EC
+    = sys::fs::create_hard_link(UniqueLockFileName.str(),
+                                      LockFileName.str());
+  if (EC == errc::success)
+    return;
+
+  // Creating the hard link failed.
+
+#ifdef LLVM_ON_UNIX
+  // The creation of the hard link may appear to fail, but if stat'ing the
+  // unique file returns a link count of 2, then we can still declare success.
+  struct stat StatBuf;
+  if (stat(UniqueLockFileName.c_str(), &StatBuf) == 0 &&
+      StatBuf.st_nlink == 2)
+    return;
+#endif
+
+  // Someone else managed to create the lock file first. Wipe out our unique
+  // lock file (it's useless now) and read the process ID from the lock file.
+  bool Existed;
+  sys::fs::remove(UniqueLockFileName.str(), Existed);
+  if ((Owner = readLockFile(LockFileName)))
+    return;
+
+  // There is a lock file that nobody owns; try to clean it up and report
+  // an error.
+  sys::fs::remove(LockFileName.str(), Existed);
+  Error = EC;
+}
+
+LockFileManager::LockFileState LockFileManager::getState() const {
+  if (Owner)
+    return LFS_Shared;
+
+  if (Error)
+    return LFS_Error;
+
+  return LFS_Owned;
+}
+
+LockFileManager::~LockFileManager() {
+  if (getState() != LFS_Owned)
+    return;
+
+  // Since we own the lock, remove the lock file and our own unique lock file.
+  bool Existed;
+  sys::fs::remove(LockFileName.str(), Existed);
+  sys::fs::remove(UniqueLockFileName.str(), Existed);
+}
+
+void LockFileManager::waitForUnlock() {
+  if (getState() != LFS_Shared)
+    return;
+
+#if LLVM_ON_WIN32
+  unsigned long Interval = 1;
+#else
+  struct timespec Interval;
+  Interval.tv_sec = 0;
+  Interval.tv_nsec = 1000000;
+#endif
+  // Don't wait more than an hour for the file to appear.
+  const unsigned MaxSeconds = 3600;
+  bool LockFileGone = false;
+  do {
+    // Sleep for the designated interval, to allow the owning process time to
+    // finish up and remove the lock file.
+    // FIXME: Should we hook in to system APIs to get a notification when the
+    // lock file is deleted?
+#if LLVM_ON_WIN32
+    Sleep(Interval);
+#else
+    nanosleep(&Interval, NULL);
+#endif
+    // If the lock file no longer exists, wait for the actual file.
+    bool Exists = false;
+    if (!LockFileGone) {
+      if (!sys::fs::exists(LockFileName.str(), Exists) && !Exists) {
+        LockFileGone = true;
+        Exists = false;
+      }
+    }
+    if (LockFileGone) {
+      if (!sys::fs::exists(FileName.str(), Exists) && Exists)
+        return;
+    }
+
+    if (!processStillExecuting((*Owner).first, (*Owner).second))
+      return;
+
+    // Exponentially increase the time we wait for the lock to be removed.
+#if LLVM_ON_WIN32
+    Interval *= 2;
+#else
+    Interval.tv_sec *= 2;
+    Interval.tv_nsec *= 2;
+    if (Interval.tv_nsec >= 1000000000) {
+      ++Interval.tv_sec;
+      Interval.tv_nsec -= 1000000000;
+    }
+#endif
+  } while (
+#if LLVM_ON_WIN32
+           Interval < MaxSeconds * 1000
+#else
+           Interval.tv_sec < (time_t)MaxSeconds
+#endif
+           );
+
+  // Give up.
+}
diff --git a/contrib/llvm/lib/Support/ManagedStatic.cpp b/contrib/llvm/lib/Support/ManagedStatic.cpp
new file mode 100644
index 000000000000..098cccb68df5
--- /dev/null
+++ b/contrib/llvm/lib/Support/ManagedStatic.cpp
@@ -0,0 +1,81 @@
+//===-- ManagedStatic.cpp - Static Global wrapper -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ManagedStatic class and llvm_shutdown().
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Atomic.h"
+#include <cassert>
+using namespace llvm;
+
+static const ManagedStaticBase *StaticList = 0;
+
+void ManagedStaticBase::RegisterManagedStatic(void *(*Creator)(),
+                                              void (*Deleter)(void*)) const {
+  if (llvm_is_multithreaded()) {
+    llvm_acquire_global_lock();
+
+    if (Ptr == 0) {
+      void* tmp = Creator ? Creator() : 0;
+
+      TsanHappensBefore(this);
+      sys::MemoryFence();
+
+      // This write is racy against the first read in the ManagedStatic
+      // accessors. The race is benign because it does a second read after a
+      // memory fence, at which point it isn't possible to get a partial value.
+      TsanIgnoreWritesBegin();
+      Ptr = tmp;
+      TsanIgnoreWritesEnd();
+      DeleterFn = Deleter;
+      
+      // Add to list of managed statics.
+      Next = StaticList;
+      StaticList = this;
+    }
+
+    llvm_release_global_lock();
+  } else {
+    assert(Ptr == 0 && DeleterFn == 0 && Next == 0 &&
+           "Partially initialized ManagedStatic!?");
+    Ptr = Creator ? Creator() : 0;
+    DeleterFn = Deleter;
+  
+    // Add to list of managed statics.
+    Next = StaticList;
+    StaticList = this;
+  }
+}
+
+void ManagedStaticBase::destroy() const {
+  assert(DeleterFn && "ManagedStatic not initialized correctly!");
+  assert(StaticList == this &&
+         "Not destroyed in reverse order of construction?");
+  // Unlink from list.
+  StaticList = Next;
+  Next = 0;
+
+  // Destroy memory.
+  DeleterFn(Ptr);
+  
+  // Cleanup.
+  Ptr = 0;
+  DeleterFn = 0;
+}
+
+/// llvm_shutdown - Deallocate and destroy all ManagedStatic variables.
+void llvm::llvm_shutdown() {
+  while (StaticList)
+    StaticList->destroy();
+
+  if (llvm_is_multithreaded()) llvm_stop_multithreaded();
+}
diff --git a/contrib/llvm/lib/Support/Memory.cpp b/contrib/llvm/lib/Support/Memory.cpp
new file mode 100644
index 000000000000..f9a4903ad015
--- /dev/null
+++ b/contrib/llvm/lib/Support/Memory.cpp
@@ -0,0 +1,25 @@
+//===- Memory.cpp - Memory Handling Support ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for allocating memory and dealing
+// with memory mapped files
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Memory.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Valgrind.h"
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Memory.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/Memory.inc"
+#endif
diff --git a/contrib/llvm/lib/Support/MemoryBuffer.cpp b/contrib/llvm/lib/Support/MemoryBuffer.cpp
new file mode 100644
index 000000000000..7c5ab96a764a
--- /dev/null
+++ b/contrib/llvm/lib/Support/MemoryBuffer.cpp
@@ -0,0 +1,426 @@
+//===--- MemoryBuffer.cpp - Memory Buffer implementation ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements the MemoryBuffer interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Errno.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/system_error.h"
+#include <cassert>
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <sys/stat.h>
+#include <sys/types.h>
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+// Simplistic definitinos of these macros to allow files to be read with
+// MapInFilePages.
+#ifndef S_ISREG
+#define S_ISREG(x) (1)
+#endif
+#ifndef S_ISBLK
+#define S_ISBLK(x) (0)
+#endif
+#endif
+#include <fcntl.h>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MemoryBuffer implementation itself.
+//===----------------------------------------------------------------------===//
+
+MemoryBuffer::~MemoryBuffer() { }
+
+/// init - Initialize this MemoryBuffer as a reference to externally allocated
+/// memory, memory that we know is already null terminated.
+void MemoryBuffer::init(const char *BufStart, const char *BufEnd,
+                        bool RequiresNullTerminator) {
+  assert((!RequiresNullTerminator || BufEnd[0] == 0) &&
+         "Buffer is not null terminated!");
+  BufferStart = BufStart;
+  BufferEnd = BufEnd;
+}
+
+//===----------------------------------------------------------------------===//
+// MemoryBufferMem implementation.
+//===----------------------------------------------------------------------===//
+
+/// CopyStringRef - Copies contents of a StringRef into a block of memory and
+/// null-terminates it.
+static void CopyStringRef(char *Memory, StringRef Data) {
+  memcpy(Memory, Data.data(), Data.size());
+  Memory[Data.size()] = 0; // Null terminate string.
+}
+
+namespace {
+struct NamedBufferAlloc {
+  StringRef Name;
+  NamedBufferAlloc(StringRef Name) : Name(Name) {}
+};
+}
+
+void *operator new(size_t N, const NamedBufferAlloc &Alloc) {
+  char *Mem = static_cast<char *>(operator new(N + Alloc.Name.size() + 1));
+  CopyStringRef(Mem + N, Alloc.Name);
+  return Mem;
+}
+
+namespace {
+/// MemoryBufferMem - Named MemoryBuffer pointing to a block of memory.
+class MemoryBufferMem : public MemoryBuffer {
+public:
+  MemoryBufferMem(StringRef InputData, bool RequiresNullTerminator) {
+    init(InputData.begin(), InputData.end(), RequiresNullTerminator);
+  }
+
+  virtual const char *getBufferIdentifier() const LLVM_OVERRIDE {
+     // The name is stored after the class itself.
+    return reinterpret_cast<const char*>(this + 1);
+  }
+
+  virtual BufferKind getBufferKind() const LLVM_OVERRIDE {
+    return MemoryBuffer_Malloc;
+  }
+};
+}
+
+/// getMemBuffer - Open the specified memory range as a MemoryBuffer.  Note
+/// that InputData must be a null terminated if RequiresNullTerminator is true!
+MemoryBuffer *MemoryBuffer::getMemBuffer(StringRef InputData,
+                                         StringRef BufferName,
+                                         bool RequiresNullTerminator) {
+  return new (NamedBufferAlloc(BufferName))
+      MemoryBufferMem(InputData, RequiresNullTerminator);
+}
+
+/// getMemBufferCopy - Open the specified memory range as a MemoryBuffer,
+/// copying the contents and taking ownership of it.  This has no requirements
+/// on EndPtr[0].
+MemoryBuffer *MemoryBuffer::getMemBufferCopy(StringRef InputData,
+                                             StringRef BufferName) {
+  MemoryBuffer *Buf = getNewUninitMemBuffer(InputData.size(), BufferName);
+  if (!Buf) return 0;
+  memcpy(const_cast<char*>(Buf->getBufferStart()), InputData.data(),
+         InputData.size());
+  return Buf;
+}
+
+/// getNewUninitMemBuffer - Allocate a new MemoryBuffer of the specified size
+/// that is not initialized.  Note that the caller should initialize the
+/// memory allocated by this method.  The memory is owned by the MemoryBuffer
+/// object.
+MemoryBuffer *MemoryBuffer::getNewUninitMemBuffer(size_t Size,
+                                                  StringRef BufferName) {
+  // Allocate space for the MemoryBuffer, the data and the name. It is important
+  // that MemoryBuffer and data are aligned so PointerIntPair works with them.
+  size_t AlignedStringLen =
+    RoundUpToAlignment(sizeof(MemoryBufferMem) + BufferName.size() + 1,
+                       sizeof(void*)); // TODO: Is sizeof(void*) enough?
+  size_t RealLen = AlignedStringLen + Size + 1;
+  char *Mem = static_cast<char*>(operator new(RealLen, std::nothrow));
+  if (!Mem) return 0;
+
+  // The name is stored after the class itself.
+  CopyStringRef(Mem + sizeof(MemoryBufferMem), BufferName);
+
+  // The buffer begins after the name and must be aligned.
+  char *Buf = Mem + AlignedStringLen;
+  Buf[Size] = 0; // Null terminate buffer.
+
+  return new (Mem) MemoryBufferMem(StringRef(Buf, Size), true);
+}
+
+/// getNewMemBuffer - Allocate a new MemoryBuffer of the specified size that
+/// is completely initialized to zeros.  Note that the caller should
+/// initialize the memory allocated by this method.  The memory is owned by
+/// the MemoryBuffer object.
+MemoryBuffer *MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) {
+  MemoryBuffer *SB = getNewUninitMemBuffer(Size, BufferName);
+  if (!SB) return 0;
+  memset(const_cast<char*>(SB->getBufferStart()), 0, Size);
+  return SB;
+}
+
+
+/// getFileOrSTDIN - Open the specified file as a MemoryBuffer, or open stdin
+/// if the Filename is "-".  If an error occurs, this returns null and fills
+/// in *ErrStr with a reason.  If stdin is empty, this API (unlike getSTDIN)
+/// returns an empty buffer.
+error_code MemoryBuffer::getFileOrSTDIN(StringRef Filename,
+                                        OwningPtr<MemoryBuffer> &result,
+                                        int64_t FileSize) {
+  if (Filename == "-")
+    return getSTDIN(result);
+  return getFile(Filename, result, FileSize);
+}
+
+error_code MemoryBuffer::getFileOrSTDIN(const char *Filename,
+                                        OwningPtr<MemoryBuffer> &result,
+                                        int64_t FileSize) {
+  if (strcmp(Filename, "-") == 0)
+    return getSTDIN(result);
+  return getFile(Filename, result, FileSize);
+}
+
+//===----------------------------------------------------------------------===//
+// MemoryBuffer::getFile implementation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Memorry maps a file descriptor using sys::fs::mapped_file_region.
+///
+/// This handles converting the offset into a legal offset on the platform.
+class MemoryBufferMMapFile : public MemoryBuffer {
+  sys::fs::mapped_file_region MFR;
+
+  static uint64_t getLegalMapOffset(uint64_t Offset) {
+    return Offset & ~(sys::fs::mapped_file_region::alignment() - 1);
+  }
+
+  static uint64_t getLegalMapSize(uint64_t Len, uint64_t Offset) {
+    return Len + (Offset - getLegalMapOffset(Offset));
+  }
+
+  const char *getStart(uint64_t Len, uint64_t Offset) {
+    return MFR.const_data() + (Offset - getLegalMapOffset(Offset));
+  }
+
+public:
+  MemoryBufferMMapFile(bool RequiresNullTerminator, int FD, uint64_t Len,
+                       uint64_t Offset, error_code EC)
+      : MFR(FD, false, sys::fs::mapped_file_region::readonly,
+            getLegalMapSize(Len, Offset), getLegalMapOffset(Offset), EC) {
+    if (!EC) {
+      const char *Start = getStart(Len, Offset);
+      init(Start, Start + Len, RequiresNullTerminator);
+    }
+  }
+
+  virtual const char *getBufferIdentifier() const LLVM_OVERRIDE {
+    // The name is stored after the class itself.
+    return reinterpret_cast<const char *>(this + 1);
+  }
+
+  virtual BufferKind getBufferKind() const LLVM_OVERRIDE {
+    return MemoryBuffer_MMap;
+  }
+};
+}
+
+static error_code getMemoryBufferForStream(int FD, 
+                                           StringRef BufferName,
+                                           OwningPtr<MemoryBuffer> &result) {
+  const ssize_t ChunkSize = 4096*4;
+  SmallString<ChunkSize> Buffer;
+  ssize_t ReadBytes;
+  // Read into Buffer until we hit EOF.
+  do {
+    Buffer.reserve(Buffer.size() + ChunkSize);
+    ReadBytes = read(FD, Buffer.end(), ChunkSize);
+    if (ReadBytes == -1) {
+      if (errno == EINTR) continue;
+      return error_code(errno, posix_category());
+    }
+    Buffer.set_size(Buffer.size() + ReadBytes);
+  } while (ReadBytes != 0);
+
+  result.reset(MemoryBuffer::getMemBufferCopy(Buffer, BufferName));
+  return error_code::success();
+}
+
+error_code MemoryBuffer::getFile(StringRef Filename,
+                                 OwningPtr<MemoryBuffer> &result,
+                                 int64_t FileSize,
+                                 bool RequiresNullTerminator) {
+  // Ensure the path is null terminated.
+  SmallString<256> PathBuf(Filename.begin(), Filename.end());
+  return MemoryBuffer::getFile(PathBuf.c_str(), result, FileSize,
+                               RequiresNullTerminator);
+}
+
+error_code MemoryBuffer::getFile(const char *Filename,
+                                 OwningPtr<MemoryBuffer> &result,
+                                 int64_t FileSize,
+                                 bool RequiresNullTerminator) {
+  // FIXME: Review if this check is unnecessary on windows as well.
+#ifdef LLVM_ON_WIN32
+  // First check that the "file" is not a directory
+  bool is_dir = false;
+  error_code err = sys::fs::is_directory(Filename, is_dir);
+  if (err)
+    return err;
+  if (is_dir)
+    return make_error_code(errc::is_a_directory);
+#endif
+
+  int OpenFlags = O_RDONLY;
+#ifdef O_BINARY
+  OpenFlags |= O_BINARY;  // Open input file in binary mode on win32.
+#endif
+  int FD = ::open(Filename, OpenFlags);
+  if (FD == -1)
+    return error_code(errno, posix_category());
+
+  error_code ret = getOpenFile(FD, Filename, result, FileSize, FileSize,
+                               0, RequiresNullTerminator);
+  close(FD);
+  return ret;
+}
+
+static bool shouldUseMmap(int FD,
+                          size_t FileSize,
+                          size_t MapSize,
+                          off_t Offset,
+                          bool RequiresNullTerminator,
+                          int PageSize) {
+  // We don't use mmap for small files because this can severely fragment our
+  // address space.
+  if (MapSize < 4096*4)
+    return false;
+
+  if (!RequiresNullTerminator)
+    return true;
+
+
+  // If we don't know the file size, use fstat to find out.  fstat on an open
+  // file descriptor is cheaper than stat on a random path.
+  // FIXME: this chunk of code is duplicated, but it avoids a fstat when
+  // RequiresNullTerminator = false and MapSize != -1.
+  if (FileSize == size_t(-1)) {
+    struct stat FileInfo;
+    // TODO: This should use fstat64 when available.
+    if (fstat(FD, &FileInfo) == -1) {
+      return error_code(errno, posix_category());
+    }
+    FileSize = FileInfo.st_size;
+  }
+
+  // If we need a null terminator and the end of the map is inside the file,
+  // we cannot use mmap.
+  size_t End = Offset + MapSize;
+  assert(End <= FileSize);
+  if (End != FileSize)
+    return false;
+
+  // Don't try to map files that are exactly a multiple of the system page size
+  // if we need a null terminator.
+  if ((FileSize & (PageSize -1)) == 0)
+    return false;
+
+  return true;
+}
+
+error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
+                                     OwningPtr<MemoryBuffer> &result,
+                                     uint64_t FileSize, uint64_t MapSize,
+                                     int64_t Offset,
+                                     bool RequiresNullTerminator) {
+  static int PageSize = sys::process::get_self()->page_size();
+
+  // Default is to map the full file.
+  if (MapSize == uint64_t(-1)) {
+    // If we don't know the file size, use fstat to find out.  fstat on an open
+    // file descriptor is cheaper than stat on a random path.
+    if (FileSize == uint64_t(-1)) {
+      struct stat FileInfo;
+      // TODO: This should use fstat64 when available.
+      if (fstat(FD, &FileInfo) == -1) {
+        return error_code(errno, posix_category());
+      }
+
+      // If this not a file or a block device (e.g. it's a named pipe
+      // or character device), we can't trust the size. Create the memory
+      // buffer by copying off the stream.
+      if (!S_ISREG(FileInfo.st_mode) && !S_ISBLK(FileInfo.st_mode)) {
+        return getMemoryBufferForStream(FD, Filename, result);
+      }
+
+      FileSize = FileInfo.st_size;
+    }
+    MapSize = FileSize;
+  }
+
+  if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator,
+                    PageSize)) {
+    error_code EC;
+    result.reset(new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile(
+        RequiresNullTerminator, FD, MapSize, Offset, EC));
+    if (!EC)
+      return error_code::success();
+  }
+
+  MemoryBuffer *Buf = MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename);
+  if (!Buf) {
+    // Failed to create a buffer. The only way it can fail is if
+    // new(std::nothrow) returns 0.
+    return make_error_code(errc::not_enough_memory);
+  }
+
+  OwningPtr<MemoryBuffer> SB(Buf);
+  char *BufPtr = const_cast<char*>(SB->getBufferStart());
+
+  size_t BytesLeft = MapSize;
+#ifndef HAVE_PREAD
+  if (lseek(FD, Offset, SEEK_SET) == -1)
+    return error_code(errno, posix_category());
+#endif
+
+  while (BytesLeft) {
+#ifdef HAVE_PREAD
+    ssize_t NumRead = ::pread(FD, BufPtr, BytesLeft, MapSize-BytesLeft+Offset);
+#else
+    ssize_t NumRead = ::read(FD, BufPtr, BytesLeft);
+#endif
+    if (NumRead == -1) {
+      if (errno == EINTR)
+        continue;
+      // Error while reading.
+      return error_code(errno, posix_category());
+    }
+    if (NumRead == 0) {
+      assert(0 && "We got inaccurate FileSize value or fstat reported an "
+                   "invalid file size.");
+      *BufPtr = '\0'; // null-terminate at the actual size.
+      break;
+    }
+    BytesLeft -= NumRead;
+    BufPtr += NumRead;
+  }
+
+  result.swap(SB);
+  return error_code::success();
+}
+
+//===----------------------------------------------------------------------===//
+// MemoryBuffer::getSTDIN implementation.
+//===----------------------------------------------------------------------===//
+
+error_code MemoryBuffer::getSTDIN(OwningPtr<MemoryBuffer> &result) {
+  // Read in all of the data from stdin, we cannot mmap stdin.
+  //
+  // FIXME: That isn't necessarily true, we should try to mmap stdin and
+  // fallback if it fails.
+  sys::Program::ChangeStdinToBinary();
+
+  return getMemoryBufferForStream(0, "<stdin>", result);
+}
diff --git a/contrib/llvm/lib/Support/MemoryObject.cpp b/contrib/llvm/lib/Support/MemoryObject.cpp
new file mode 100644
index 000000000000..b20ab8923813
--- /dev/null
+++ b/contrib/llvm/lib/Support/MemoryObject.cpp
@@ -0,0 +1,37 @@
+//===- MemoryObject.cpp - Abstract memory interface -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MemoryObject.h"
+using namespace llvm;
+  
+MemoryObject::~MemoryObject() {
+}
+
+int MemoryObject::readBytes(uint64_t address,
+                            uint64_t size,
+                            uint8_t* buf,
+                            uint64_t* copied) const {
+  uint64_t current = address;
+  uint64_t limit = getBase() + getExtent();
+
+  if (current + size > limit)
+    return -1;
+
+  while (current - address < size) {
+    if (readByte(current, &buf[(current - address)]))
+      return -1;
+    
+    current++;
+  }
+  
+  if (copied)
+    *copied = current - address;
+  
+  return 0;
+}
diff --git a/contrib/llvm/lib/Support/Mutex.cpp b/contrib/llvm/lib/Support/Mutex.cpp
new file mode 100644
index 000000000000..4e4a026b2f0d
--- /dev/null
+++ b/contrib/llvm/lib/Support/Mutex.cpp
@@ -0,0 +1,129 @@
+//===- Mutex.cpp - Mutual Exclusion Lock ------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the llvm::sys::Mutex class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/Support/Mutex.h"
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
+#if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0
+// Define all methods as no-ops if threading is explicitly disabled
+namespace llvm {
+using namespace sys;
+MutexImpl::MutexImpl( bool recursive) { }
+MutexImpl::~MutexImpl() { }
+bool MutexImpl::acquire() { return true; }
+bool MutexImpl::release() { return true; }
+bool MutexImpl::tryacquire() { return true; }
+}
+#else
+
+#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_MUTEX_LOCK)
+
+#include <cassert>
+#include <pthread.h>
+#include <stdlib.h>
+
+namespace llvm {
+using namespace sys;
+
+// Construct a Mutex using pthread calls
+MutexImpl::MutexImpl( bool recursive)
+  : data_(0)
+{
+  // Declare the pthread_mutex data structures
+  pthread_mutex_t* mutex =
+    static_cast<pthread_mutex_t*>(malloc(sizeof(pthread_mutex_t)));
+  pthread_mutexattr_t attr;
+
+  // Initialize the mutex attributes
+  int errorcode = pthread_mutexattr_init(&attr);
+  assert(errorcode == 0); (void)errorcode;
+
+  // Initialize the mutex as a recursive mutex, if requested, or normal
+  // otherwise.
+  int kind = ( recursive  ? PTHREAD_MUTEX_RECURSIVE : PTHREAD_MUTEX_NORMAL );
+  errorcode = pthread_mutexattr_settype(&attr, kind);
+  assert(errorcode == 0);
+
+#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) && \
+    !defined(__DragonFly__) && !defined(__Bitrig__)
+  // Make it a process local mutex
+  errorcode = pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_PRIVATE);
+  assert(errorcode == 0);
+#endif
+
+  // Initialize the mutex
+  errorcode = pthread_mutex_init(mutex, &attr);
+  assert(errorcode == 0);
+
+  // Destroy the attributes
+  errorcode = pthread_mutexattr_destroy(&attr);
+  assert(errorcode == 0);
+
+  // Assign the data member
+  data_ = mutex;
+}
+
+// Destruct a Mutex
+MutexImpl::~MutexImpl()
+{
+  pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+  assert(mutex != 0);
+  pthread_mutex_destroy(mutex);
+  free(mutex);
+}
+
+bool
+MutexImpl::acquire()
+{
+  pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+  assert(mutex != 0);
+
+  int errorcode = pthread_mutex_lock(mutex);
+  return errorcode == 0;
+}
+
+bool
+MutexImpl::release()
+{
+  pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+  assert(mutex != 0);
+
+  int errorcode = pthread_mutex_unlock(mutex);
+  return errorcode == 0;
+}
+
+bool
+MutexImpl::tryacquire()
+{
+  pthread_mutex_t* mutex = static_cast<pthread_mutex_t*>(data_);
+  assert(mutex != 0);
+
+  int errorcode = pthread_mutex_trylock(mutex);
+  return errorcode == 0;
+}
+
+}
+
+#elif defined(LLVM_ON_UNIX)
+#include "Unix/Mutex.inc"
+#elif defined( LLVM_ON_WIN32)
+#include "Windows/Mutex.inc"
+#else
+#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in Support/Mutex.cpp
+#endif
+#endif
diff --git a/contrib/llvm/lib/Support/Path.cpp b/contrib/llvm/lib/Support/Path.cpp
new file mode 100644
index 000000000000..d0703754e04f
--- /dev/null
+++ b/contrib/llvm/lib/Support/Path.cpp
@@ -0,0 +1,295 @@
+//===-- Path.cpp - Implement OS Path Concept --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This header file implements the operating system Path concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Path.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/FileSystem.h"
+#include <cassert>
+#include <cstring>
+#include <ostream>
+using namespace llvm;
+using namespace sys;
+namespace {
+using support::ulittle32_t;
+}
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
+bool Path::operator==(const Path &that) const {
+  return path == that.path;
+}
+
+bool Path::operator<(const Path& that) const {
+  return path < that.path;
+}
+
+LLVMFileType
+sys::IdentifyFileType(const char *magic, unsigned length) {
+  assert(magic && "Invalid magic number string");
+  assert(length >=4 && "Invalid magic number length");
+  switch ((unsigned char)magic[0]) {
+    case 0xDE:  // 0x0B17C0DE = BC wraper
+      if (magic[1] == (char)0xC0 && magic[2] == (char)0x17 &&
+          magic[3] == (char)0x0B)
+        return Bitcode_FileType;
+      break;
+    case 'B':
+      if (magic[1] == 'C' && magic[2] == (char)0xC0 && magic[3] == (char)0xDE)
+        return Bitcode_FileType;
+      break;
+    case '!':
+      if (length >= 8)
+        if (memcmp(magic,"!<arch>\n",8) == 0)
+          return Archive_FileType;
+      break;
+
+    case '\177':
+      if (magic[1] == 'E' && magic[2] == 'L' && magic[3] == 'F') {
+        bool Data2MSB = magic[5] == 2;
+        unsigned high = Data2MSB ? 16 : 17;
+        unsigned low  = Data2MSB ? 17 : 16;
+        if (length >= 18 && magic[high] == 0)
+          switch (magic[low]) {
+            default: break;
+            case 1: return ELF_Relocatable_FileType;
+            case 2: return ELF_Executable_FileType;
+            case 3: return ELF_SharedObject_FileType;
+            case 4: return ELF_Core_FileType;
+          }
+      }
+      break;
+
+    case 0xCA:
+      if (magic[1] == char(0xFE) && magic[2] == char(0xBA) &&
+          magic[3] == char(0xBE)) {
+        // This is complicated by an overlap with Java class files.
+        // See the Mach-O section in /usr/share/file/magic for details.
+        if (length >= 8 && magic[7] < 43)
+          // FIXME: Universal Binary of any type.
+          return Mach_O_DynamicallyLinkedSharedLib_FileType;
+      }
+      break;
+
+      // The two magic numbers for mach-o are:
+      // 0xfeedface - 32-bit mach-o
+      // 0xfeedfacf - 64-bit mach-o
+    case 0xFE:
+    case 0xCE:
+    case 0xCF: {
+      uint16_t type = 0;
+      if (magic[0] == char(0xFE) && magic[1] == char(0xED) &&
+          magic[2] == char(0xFA) &&
+          (magic[3] == char(0xCE) || magic[3] == char(0xCF))) {
+        /* Native endian */
+        if (length >= 16) type = magic[14] << 8 | magic[15];
+      } else if ((magic[0] == char(0xCE) || magic[0] == char(0xCF)) &&
+                 magic[1] == char(0xFA) && magic[2] == char(0xED) &&
+                 magic[3] == char(0xFE)) {
+        /* Reverse endian */
+        if (length >= 14) type = magic[13] << 8 | magic[12];
+      }
+      switch (type) {
+        default: break;
+        case 1: return Mach_O_Object_FileType;
+        case 2: return Mach_O_Executable_FileType;
+        case 3: return Mach_O_FixedVirtualMemorySharedLib_FileType;
+        case 4: return Mach_O_Core_FileType;
+        case 5: return Mach_O_PreloadExecutable_FileType;
+        case 6: return Mach_O_DynamicallyLinkedSharedLib_FileType;
+        case 7: return Mach_O_DynamicLinker_FileType;
+        case 8: return Mach_O_Bundle_FileType;
+        case 9: return Mach_O_DynamicallyLinkedSharedLibStub_FileType;
+        case 10: return Mach_O_DSYMCompanion_FileType;
+      }
+      break;
+    }
+    case 0xF0: // PowerPC Windows
+    case 0x83: // Alpha 32-bit
+    case 0x84: // Alpha 64-bit
+    case 0x66: // MPS R4000 Windows
+    case 0x50: // mc68K
+    case 0x4c: // 80386 Windows
+      if (magic[1] == 0x01)
+        return COFF_FileType;
+
+    case 0x90: // PA-RISC Windows
+    case 0x68: // mc68K Windows
+      if (magic[1] == 0x02)
+        return COFF_FileType;
+      break;
+
+    case 0x4d: // Possible MS-DOS stub on Windows PE file
+      if (magic[1] == 0x5a) {
+        uint32_t off = *reinterpret_cast<const ulittle32_t *>(magic + 0x3c);
+        // PE/COFF file, either EXE or DLL.
+        if (off < length && memcmp(magic + off, "PE\0\0",4) == 0)
+          return COFF_FileType;
+      }
+      break;
+
+    case 0x64: // x86-64 Windows.
+      if (magic[1] == char(0x86))
+        return COFF_FileType;
+      break;
+
+    default:
+      break;
+  }
+  return Unknown_FileType;
+}
+
+bool
+Path::isArchive() const {
+  fs::file_magic type;
+  if (fs::identify_magic(str(), type))
+    return false;
+  return type == fs::file_magic::archive;
+}
+
+bool
+Path::isDynamicLibrary() const {
+  fs::file_magic type;
+  if (fs::identify_magic(str(), type))
+    return false;
+  switch (type) {
+    default: return false;
+    case fs::file_magic::macho_fixed_virtual_memory_shared_lib:
+    case fs::file_magic::macho_dynamically_linked_shared_lib:
+    case fs::file_magic::macho_dynamically_linked_shared_lib_stub:
+    case fs::file_magic::elf_shared_object:
+    case fs::file_magic::pecoff_executable:  return true;
+  }
+}
+
+bool
+Path::isObjectFile() const {
+  fs::file_magic type;
+  if (fs::identify_magic(str(), type) || type == fs::file_magic::unknown)
+    return false;
+  return true;
+}
+
+Path
+Path::FindLibrary(std::string& name) {
+  std::vector<sys::Path> LibPaths;
+  GetSystemLibraryPaths(LibPaths);
+  for (unsigned i = 0; i < LibPaths.size(); ++i) {
+    sys::Path FullPath(LibPaths[i]);
+    FullPath.appendComponent("lib" + name + LTDL_SHLIB_EXT);
+    if (FullPath.isDynamicLibrary())
+      return FullPath;
+    FullPath.eraseSuffix();
+    FullPath.appendSuffix("a");
+    if (FullPath.isArchive())
+      return FullPath;
+  }
+  return sys::Path();
+}
+
+StringRef Path::GetDLLSuffix() {
+  return &(LTDL_SHLIB_EXT[1]);
+}
+
+void
+Path::appendSuffix(StringRef suffix) {
+  if (!suffix.empty()) {
+    path.append(".");
+    path.append(suffix);
+  }
+}
+
+bool
+Path::isBitcodeFile() const {
+  fs::file_magic type;
+  if (fs::identify_magic(str(), type))
+    return false;
+  return type == fs::file_magic::bitcode;
+}
+
+bool Path::hasMagicNumber(StringRef Magic) const {
+  std::string actualMagic;
+  if (getMagicNumber(actualMagic, static_cast<unsigned>(Magic.size())))
+    return Magic == actualMagic;
+  return false;
+}
+
+static void getPathList(const char*path, std::vector<Path>& Paths) {
+  const char* at = path;
+  const char* delim = strchr(at, PathSeparator);
+  Path tmpPath;
+  while (delim != 0) {
+    std::string tmp(at, size_t(delim-at));
+    if (tmpPath.set(tmp))
+      if (tmpPath.canRead())
+        Paths.push_back(tmpPath);
+    at = delim + 1;
+    delim = strchr(at, PathSeparator);
+  }
+
+  if (*at != 0)
+    if (tmpPath.set(std::string(at)))
+      if (tmpPath.canRead())
+        Paths.push_back(tmpPath);
+}
+
+static StringRef getDirnameCharSep(StringRef path, const char *Sep) {
+  assert(Sep[0] != '\0' && Sep[1] == '\0' &&
+         "Sep must be a 1-character string literal.");
+  if (path.empty())
+    return ".";
+
+  // If the path is all slashes, return a single slash.
+  // Otherwise, remove all trailing slashes.
+
+  signed pos = static_cast<signed>(path.size()) - 1;
+
+  while (pos >= 0 && path[pos] == Sep[0])
+    --pos;
+
+  if (pos < 0)
+    return path[0] == Sep[0] ? Sep : ".";
+
+  // Any slashes left?
+  signed i = 0;
+
+  while (i < pos && path[i] != Sep[0])
+    ++i;
+
+  if (i == pos) // No slashes?  Return "."
+    return ".";
+
+  // There is at least one slash left.  Remove all trailing non-slashes.
+  while (pos >= 0 && path[pos] != Sep[0])
+    --pos;
+
+  // Remove any trailing slashes.
+  while (pos >= 0 && path[pos] == Sep[0])
+    --pos;
+
+  if (pos < 0)
+    return path[0] == Sep[0] ? Sep : ".";
+
+  return path.substr(0, pos+1);
+}
+
+// Include the truly platform-specific parts of this class.
+#if defined(LLVM_ON_UNIX)
+#include "Unix/Path.inc"
+#endif
+#if defined(LLVM_ON_WIN32)
+#include "Windows/Path.inc"
+#endif
diff --git a/contrib/llvm/lib/Support/PathV2.cpp b/contrib/llvm/lib/Support/PathV2.cpp
new file mode 100644
index 000000000000..58a6ea720e73
--- /dev/null
+++ b/contrib/llvm/lib/Support/PathV2.cpp
@@ -0,0 +1,946 @@
+//===-- PathV2.cpp - Implement OS Path Concept ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements the operating system PathV2 API.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/PathV2.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include <cctype>
+#include <cstdio>
+#include <cstring>
+#ifdef __APPLE__
+#include <unistd.h>
+#endif
+
+namespace {
+  using llvm::StringRef;
+  using llvm::sys::path::is_separator;
+
+#ifdef LLVM_ON_WIN32
+  const char *separators = "\\/";
+  const char  prefered_separator = '\\';
+#else
+  const char  separators = '/';
+  const char  prefered_separator = '/';
+#endif
+
+  StringRef find_first_component(StringRef path) {
+    // Look for this first component in the following order.
+    // * empty (in this case we return an empty string)
+    // * either C: or {//,\\}net.
+    // * {/,\}
+    // * {.,..}
+    // * {file,directory}name
+
+    if (path.empty())
+      return path;
+
+#ifdef LLVM_ON_WIN32
+    // C:
+    if (path.size() >= 2 && std::isalpha(static_cast<unsigned char>(path[0])) &&
+        path[1] == ':')
+      return path.substr(0, 2);
+#endif
+
+    // //net
+    if ((path.size() > 2) &&
+        is_separator(path[0]) &&
+        path[0] == path[1] &&
+        !is_separator(path[2])) {
+      // Find the next directory separator.
+      size_t end = path.find_first_of(separators, 2);
+      return path.substr(0, end);
+    }
+
+    // {/,\}
+    if (is_separator(path[0]))
+      return path.substr(0, 1);
+
+    if (path.startswith(".."))
+      return path.substr(0, 2);
+
+    if (path[0] == '.')
+      return path.substr(0, 1);
+
+    // * {file,directory}name
+    size_t end = path.find_first_of(separators, 2);
+    return path.substr(0, end);
+  }
+
+  size_t filename_pos(StringRef str) {
+    if (str.size() == 2 &&
+        is_separator(str[0]) &&
+        str[0] == str[1])
+      return 0;
+
+    if (str.size() > 0 && is_separator(str[str.size() - 1]))
+      return str.size() - 1;
+
+    size_t pos = str.find_last_of(separators, str.size() - 1);
+
+#ifdef LLVM_ON_WIN32
+    if (pos == StringRef::npos)
+      pos = str.find_last_of(':', str.size() - 2);
+#endif
+
+    if (pos == StringRef::npos ||
+        (pos == 1 && is_separator(str[0])))
+      return 0;
+
+    return pos + 1;
+  }
+
+  size_t root_dir_start(StringRef str) {
+    // case "c:/"
+#ifdef LLVM_ON_WIN32
+    if (str.size() > 2 &&
+        str[1] == ':' &&
+        is_separator(str[2]))
+      return 2;
+#endif
+
+    // case "//"
+    if (str.size() == 2 &&
+        is_separator(str[0]) &&
+        str[0] == str[1])
+      return StringRef::npos;
+
+    // case "//net"
+    if (str.size() > 3 &&
+        is_separator(str[0]) &&
+        str[0] == str[1] &&
+        !is_separator(str[2])) {
+      return str.find_first_of(separators, 2);
+    }
+
+    // case "/"
+    if (str.size() > 0 && is_separator(str[0]))
+      return 0;
+
+    return StringRef::npos;
+  }
+
+  size_t parent_path_end(StringRef path) {
+    size_t end_pos = filename_pos(path);
+
+    bool filename_was_sep = path.size() > 0 && is_separator(path[end_pos]);
+
+    // Skip separators except for root dir.
+    size_t root_dir_pos = root_dir_start(path.substr(0, end_pos));
+
+    while(end_pos > 0 &&
+          (end_pos - 1) != root_dir_pos &&
+          is_separator(path[end_pos - 1]))
+      --end_pos;
+
+    if (end_pos == 1 && root_dir_pos == 0 && filename_was_sep)
+      return StringRef::npos;
+
+    return end_pos;
+  }
+} // end unnamed namespace
+
+namespace llvm {
+namespace sys  {
+namespace path {
+
+const_iterator begin(StringRef path) {
+  const_iterator i;
+  i.Path      = path;
+  i.Component = find_first_component(path);
+  i.Position  = 0;
+  return i;
+}
+
+const_iterator end(StringRef path) {
+  const_iterator i;
+  i.Path      = path;
+  i.Position  = path.size();
+  return i;
+}
+
+const_iterator &const_iterator::operator++() {
+  assert(Position < Path.size() && "Tried to increment past end!");
+
+  // Increment Position to past the current component
+  Position += Component.size();
+
+  // Check for end.
+  if (Position == Path.size()) {
+    Component = StringRef();
+    return *this;
+  }
+
+  // Both POSIX and Windows treat paths that begin with exactly two separators
+  // specially.
+  bool was_net = Component.size() > 2 &&
+    is_separator(Component[0]) &&
+    Component[1] == Component[0] &&
+    !is_separator(Component[2]);
+
+  // Handle separators.
+  if (is_separator(Path[Position])) {
+    // Root dir.
+    if (was_net
+#ifdef LLVM_ON_WIN32
+        // c:/
+        || Component.endswith(":")
+#endif
+        ) {
+      Component = Path.substr(Position, 1);
+      return *this;
+    }
+
+    // Skip extra separators.
+    while (Position != Path.size() &&
+           is_separator(Path[Position])) {
+      ++Position;
+    }
+
+    // Treat trailing '/' as a '.'.
+    if (Position == Path.size()) {
+      --Position;
+      Component = ".";
+      return *this;
+    }
+  }
+
+  // Find next component.
+  size_t end_pos = Path.find_first_of(separators, Position);
+  Component = Path.slice(Position, end_pos);
+
+  return *this;
+}
+
+const_iterator &const_iterator::operator--() {
+  // If we're at the end and the previous char was a '/', return '.'.
+  if (Position == Path.size() &&
+      Path.size() > 1 &&
+      is_separator(Path[Position - 1])
+#ifdef LLVM_ON_WIN32
+      && Path[Position - 2] != ':'
+#endif
+      ) {
+    --Position;
+    Component = ".";
+    return *this;
+  }
+
+  // Skip separators unless it's the root directory.
+  size_t root_dir_pos = root_dir_start(Path);
+  size_t end_pos = Position;
+
+  while(end_pos > 0 &&
+        (end_pos - 1) != root_dir_pos &&
+        is_separator(Path[end_pos - 1]))
+    --end_pos;
+
+  // Find next separator.
+  size_t start_pos = filename_pos(Path.substr(0, end_pos));
+  Component = Path.slice(start_pos, end_pos);
+  Position = start_pos;
+  return *this;
+}
+
+bool const_iterator::operator==(const const_iterator &RHS) const {
+  return Path.begin() == RHS.Path.begin() &&
+         Position == RHS.Position;
+}
+
+bool const_iterator::operator!=(const const_iterator &RHS) const {
+  return !(*this == RHS);
+}
+
+ptrdiff_t const_iterator::operator-(const const_iterator &RHS) const {
+  return Position - RHS.Position;
+}
+
+const StringRef root_path(StringRef path) {
+  const_iterator b = begin(path),
+                 pos = b,
+                 e = end(path);
+  if (b != e) {
+    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
+    bool has_drive =
+#ifdef LLVM_ON_WIN32
+      b->endswith(":");
+#else
+      false;
+#endif
+
+    if (has_net || has_drive) {
+      if ((++pos != e) && is_separator((*pos)[0])) {
+        // {C:/,//net/}, so get the first two components.
+        return path.substr(0, b->size() + pos->size());
+      } else {
+        // just {C:,//net}, return the first component.
+        return *b;
+      }
+    }
+
+    // POSIX style root directory.
+    if (is_separator((*b)[0])) {
+      return *b;
+    }
+  }
+
+  return StringRef();
+}
+
+const StringRef root_name(StringRef path) {
+  const_iterator b = begin(path),
+                 e = end(path);
+  if (b != e) {
+    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
+    bool has_drive =
+#ifdef LLVM_ON_WIN32
+      b->endswith(":");
+#else
+      false;
+#endif
+
+    if (has_net || has_drive) {
+      // just {C:,//net}, return the first component.
+      return *b;
+    }
+  }
+
+  // No path or no name.
+  return StringRef();
+}
+
+const StringRef root_directory(StringRef path) {
+  const_iterator b = begin(path),
+                 pos = b,
+                 e = end(path);
+  if (b != e) {
+    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
+    bool has_drive =
+#ifdef LLVM_ON_WIN32
+      b->endswith(":");
+#else
+      false;
+#endif
+
+    if ((has_net || has_drive) &&
+        // {C:,//net}, skip to the next component.
+        (++pos != e) && is_separator((*pos)[0])) {
+      return *pos;
+    }
+
+    // POSIX style root directory.
+    if (!has_net && is_separator((*b)[0])) {
+      return *b;
+    }
+  }
+
+  // No path or no root.
+  return StringRef();
+}
+
+const StringRef relative_path(StringRef path) {
+  StringRef root = root_path(path);
+  return path.substr(root.size());
+}
+
+void append(SmallVectorImpl<char> &path, const Twine &a,
+                                         const Twine &b,
+                                         const Twine &c,
+                                         const Twine &d) {
+  SmallString<32> a_storage;
+  SmallString<32> b_storage;
+  SmallString<32> c_storage;
+  SmallString<32> d_storage;
+
+  SmallVector<StringRef, 4> components;
+  if (!a.isTriviallyEmpty()) components.push_back(a.toStringRef(a_storage));
+  if (!b.isTriviallyEmpty()) components.push_back(b.toStringRef(b_storage));
+  if (!c.isTriviallyEmpty()) components.push_back(c.toStringRef(c_storage));
+  if (!d.isTriviallyEmpty()) components.push_back(d.toStringRef(d_storage));
+
+  for (SmallVectorImpl<StringRef>::const_iterator i = components.begin(),
+                                                  e = components.end();
+                                                  i != e; ++i) {
+    bool path_has_sep = !path.empty() && is_separator(path[path.size() - 1]);
+    bool component_has_sep = !i->empty() && is_separator((*i)[0]);
+    bool is_root_name = has_root_name(*i);
+
+    if (path_has_sep) {
+      // Strip separators from beginning of component.
+      size_t loc = i->find_first_not_of(separators);
+      StringRef c = i->substr(loc);
+
+      // Append it.
+      path.append(c.begin(), c.end());
+      continue;
+    }
+
+    if (!component_has_sep && !(path.empty() || is_root_name)) {
+      // Add a separator.
+      path.push_back(prefered_separator);
+    }
+
+    path.append(i->begin(), i->end());
+  }
+}
+
+void append(SmallVectorImpl<char> &path,
+            const_iterator begin, const_iterator end) {
+  for (; begin != end; ++begin)
+    path::append(path, *begin);
+}
+
+const StringRef parent_path(StringRef path) {
+  size_t end_pos = parent_path_end(path);
+  if (end_pos == StringRef::npos)
+    return StringRef();
+  else
+    return path.substr(0, end_pos);
+}
+
+void remove_filename(SmallVectorImpl<char> &path) {
+  size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()));
+  if (end_pos != StringRef::npos)
+    path.set_size(end_pos);
+}
+
+void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
+  StringRef p(path.begin(), path.size());
+  SmallString<32> ext_storage;
+  StringRef ext = extension.toStringRef(ext_storage);
+
+  // Erase existing extension.
+  size_t pos = p.find_last_of('.');
+  if (pos != StringRef::npos && pos >= filename_pos(p))
+    path.set_size(pos);
+
+  // Append '.' if needed.
+  if (ext.size() > 0 && ext[0] != '.')
+    path.push_back('.');
+
+  // Append extension.
+  path.append(ext.begin(), ext.end());
+}
+
+void native(const Twine &path, SmallVectorImpl<char> &result) {
+  // Clear result.
+  result.clear();
+#ifdef LLVM_ON_WIN32
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+  result.reserve(p.size());
+  for (StringRef::const_iterator i = p.begin(),
+                                 e = p.end();
+                                 i != e;
+                                 ++i) {
+    if (*i == '/')
+      result.push_back('\\');
+    else
+      result.push_back(*i);
+  }
+#else
+  path.toVector(result);
+#endif
+}
+
+const StringRef filename(StringRef path) {
+  return *(--end(path));
+}
+
+const StringRef stem(StringRef path) {
+  StringRef fname = filename(path);
+  size_t pos = fname.find_last_of('.');
+  if (pos == StringRef::npos)
+    return fname;
+  else
+    if ((fname.size() == 1 && fname == ".") ||
+        (fname.size() == 2 && fname == ".."))
+      return fname;
+    else
+      return fname.substr(0, pos);
+}
+
+const StringRef extension(StringRef path) {
+  StringRef fname = filename(path);
+  size_t pos = fname.find_last_of('.');
+  if (pos == StringRef::npos)
+    return StringRef();
+  else
+    if ((fname.size() == 1 && fname == ".") ||
+        (fname.size() == 2 && fname == ".."))
+      return StringRef();
+    else
+      return fname.substr(pos);
+}
+
+bool is_separator(char value) {
+  switch(value) {
+#ifdef LLVM_ON_WIN32
+    case '\\': // fall through
+#endif
+    case '/': return true;
+    default: return false;
+  }
+}
+
+void system_temp_directory(bool erasedOnReboot, SmallVectorImpl<char> &result) {
+  result.clear();
+
+#ifdef __APPLE__
+  // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR.
+  int ConfName = erasedOnReboot? _CS_DARWIN_USER_TEMP_DIR
+                               : _CS_DARWIN_USER_CACHE_DIR;
+  size_t ConfLen = confstr(ConfName, 0, 0);
+  if (ConfLen > 0) {
+    do {
+      result.resize(ConfLen);
+      ConfLen = confstr(ConfName, result.data(), result.size());
+    } while (ConfLen > 0 && ConfLen != result.size());
+
+    if (ConfLen > 0) {
+      assert(result.back() == 0);
+      result.pop_back();
+      return;
+    }
+
+    result.clear();
+  }
+#endif
+
+  // Check whether the temporary directory is specified by an environment
+  // variable.
+  const char *EnvironmentVariable;
+#ifdef LLVM_ON_WIN32
+  EnvironmentVariable = "TEMP";
+#else
+  EnvironmentVariable = "TMPDIR";
+#endif
+  if (char *RequestedDir = getenv(EnvironmentVariable)) {
+    result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+    return;
+  }
+
+  // Fall back to a system default.
+  const char *DefaultResult;
+#ifdef LLVM_ON_WIN32
+  (void)erasedOnReboot;
+  DefaultResult = "C:\\TEMP";
+#else
+  if (erasedOnReboot)
+    DefaultResult = "/tmp";
+  else
+    DefaultResult = "/var/tmp";
+#endif
+  result.append(DefaultResult, DefaultResult + strlen(DefaultResult));
+}
+
+bool has_root_name(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !root_name(p).empty();
+}
+
+bool has_root_directory(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !root_directory(p).empty();
+}
+
+bool has_root_path(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !root_path(p).empty();
+}
+
+bool has_relative_path(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !relative_path(p).empty();
+}
+
+bool has_filename(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !filename(p).empty();
+}
+
+bool has_parent_path(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !parent_path(p).empty();
+}
+
+bool has_stem(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !stem(p).empty();
+}
+
+bool has_extension(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !extension(p).empty();
+}
+
+bool is_absolute(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  bool rootDir = has_root_directory(p),
+#ifdef LLVM_ON_WIN32
+       rootName = has_root_name(p);
+#else
+       rootName = true;
+#endif
+
+  return rootDir && rootName;
+}
+
+bool is_relative(const Twine &path) {
+  return !is_absolute(path);
+}
+
+} // end namespace path
+
+namespace fs {
+
+error_code make_absolute(SmallVectorImpl<char> &path) {
+  StringRef p(path.data(), path.size());
+
+  bool rootDirectory = path::has_root_directory(p),
+#ifdef LLVM_ON_WIN32
+       rootName = path::has_root_name(p);
+#else
+       rootName = true;
+#endif
+
+  // Already absolute.
+  if (rootName && rootDirectory)
+    return error_code::success();
+
+  // All of the following conditions will need the current directory.
+  SmallString<128> current_dir;
+  if (error_code ec = current_path(current_dir)) return ec;
+
+  // Relative path. Prepend the current directory.
+  if (!rootName && !rootDirectory) {
+    // Append path to the current directory.
+    path::append(current_dir, p);
+    // Set path to the result.
+    path.swap(current_dir);
+    return error_code::success();
+  }
+
+  if (!rootName && rootDirectory) {
+    StringRef cdrn = path::root_name(current_dir);
+    SmallString<128> curDirRootName(cdrn.begin(), cdrn.end());
+    path::append(curDirRootName, p);
+    // Set path to the result.
+    path.swap(curDirRootName);
+    return error_code::success();
+  }
+
+  if (rootName && !rootDirectory) {
+    StringRef pRootName      = path::root_name(p);
+    StringRef bRootDirectory = path::root_directory(current_dir);
+    StringRef bRelativePath  = path::relative_path(current_dir);
+    StringRef pRelativePath  = path::relative_path(p);
+
+    SmallString<128> res;
+    path::append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath);
+    path.swap(res);
+    return error_code::success();
+  }
+
+  llvm_unreachable("All rootName and rootDirectory combinations should have "
+                   "occurred above!");
+}
+
+error_code create_directories(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  StringRef parent = path::parent_path(p);
+  if (!parent.empty()) {
+    bool parent_exists;
+    if (error_code ec = fs::exists(parent, parent_exists)) return ec;
+
+    if (!parent_exists)
+      if (error_code ec = create_directories(parent, existed)) return ec;
+  }
+
+  return create_directory(p, existed);
+}
+
+bool exists(file_status status) {
+  return status_known(status) && status.type() != file_type::file_not_found;
+}
+
+bool status_known(file_status s) {
+  return s.type() != file_type::status_error;
+}
+
+bool is_directory(file_status status) {
+  return status.type() == file_type::directory_file;
+}
+
+error_code is_directory(const Twine &path, bool &result) {
+  file_status st;
+  if (error_code ec = status(path, st))
+    return ec;
+  result = is_directory(st);
+  return error_code::success();
+}
+
+bool is_regular_file(file_status status) {
+  return status.type() == file_type::regular_file;
+}
+
+error_code is_regular_file(const Twine &path, bool &result) {
+  file_status st;
+  if (error_code ec = status(path, st))
+    return ec;
+  result = is_regular_file(st);
+  return error_code::success();
+}
+
+bool is_symlink(file_status status) {
+  return status.type() == file_type::symlink_file;
+}
+
+error_code is_symlink(const Twine &path, bool &result) {
+  file_status st;
+  if (error_code ec = status(path, st))
+    return ec;
+  result = is_symlink(st);
+  return error_code::success();
+}
+
+bool is_other(file_status status) {
+  return exists(status) &&
+         !is_regular_file(status) &&
+         !is_directory(status) &&
+         !is_symlink(status);
+}
+
+void directory_entry::replace_filename(const Twine &filename, file_status st) {
+  SmallString<128> path(Path.begin(), Path.end());
+  path::remove_filename(path);
+  path::append(path, filename);
+  Path = path.str();
+  Status = st;
+}
+
+error_code has_magic(const Twine &path, const Twine &magic, bool &result) {
+  SmallString<32>  MagicStorage;
+  StringRef Magic = magic.toStringRef(MagicStorage);
+  SmallString<32> Buffer;
+
+  if (error_code ec = get_magic(path, Magic.size(), Buffer)) {
+    if (ec == errc::value_too_large) {
+      // Magic.size() > file_size(Path).
+      result = false;
+      return error_code::success();
+    }
+    return ec;
+  }
+
+  result = Magic == Buffer;
+  return error_code::success();
+}
+
+/// @brief Identify the magic in magic.
+file_magic identify_magic(StringRef magic) {
+  if (magic.size() < 4)
+    return file_magic::unknown;
+  switch ((unsigned char)magic[0]) {
+    case 0xDE:  // 0x0B17C0DE = BC wraper
+      if (magic[1] == (char)0xC0 && magic[2] == (char)0x17 &&
+          magic[3] == (char)0x0B)
+        return file_magic::bitcode;
+      break;
+    case 'B':
+      if (magic[1] == 'C' && magic[2] == (char)0xC0 && magic[3] == (char)0xDE)
+        return file_magic::bitcode;
+      break;
+    case '!':
+      if (magic.size() >= 8)
+        if (memcmp(magic.data(),"!<arch>\n",8) == 0)
+          return file_magic::archive;
+      break;
+
+    case '\177':
+      if (magic[1] == 'E' && magic[2] == 'L' && magic[3] == 'F') {
+        if (magic.size() >= 18 && magic[17] == 0)
+          switch (magic[16]) {
+            default: break;
+            case 1: return file_magic::elf_relocatable;
+            case 2: return file_magic::elf_executable;
+            case 3: return file_magic::elf_shared_object;
+            case 4: return file_magic::elf_core;
+          }
+      }
+      break;
+
+    case 0xCA:
+      if (magic[1] == char(0xFE) && magic[2] == char(0xBA) &&
+          magic[3] == char(0xBE)) {
+        // This is complicated by an overlap with Java class files.
+        // See the Mach-O section in /usr/share/file/magic for details.
+        if (magic.size() >= 8 && magic[7] < 43)
+          // FIXME: Universal Binary of any type.
+          return file_magic::macho_dynamically_linked_shared_lib;
+      }
+      break;
+
+      // The two magic numbers for mach-o are:
+      // 0xfeedface - 32-bit mach-o
+      // 0xfeedfacf - 64-bit mach-o
+    case 0xFE:
+    case 0xCE:
+    case 0xCF: {
+      uint16_t type = 0;
+      if (magic[0] == char(0xFE) && magic[1] == char(0xED) &&
+          magic[2] == char(0xFA) &&
+          (magic[3] == char(0xCE) || magic[3] == char(0xCF))) {
+        /* Native endian */
+        if (magic.size() >= 16) type = magic[14] << 8 | magic[15];
+      } else if ((magic[0] == char(0xCE) || magic[0] == char(0xCF)) &&
+                 magic[1] == char(0xFA) && magic[2] == char(0xED) &&
+                 magic[3] == char(0xFE)) {
+        /* Reverse endian */
+        if (magic.size() >= 14) type = magic[13] << 8 | magic[12];
+      }
+      switch (type) {
+        default: break;
+        case 1: return file_magic::macho_object;
+        case 2: return file_magic::macho_executable;
+        case 3: return file_magic::macho_fixed_virtual_memory_shared_lib;
+        case 4: return file_magic::macho_core;
+        case 5: return file_magic::macho_preload_executabl;
+        case 6: return file_magic::macho_dynamically_linked_shared_lib;
+        case 7: return file_magic::macho_dynamic_linker;
+        case 8: return file_magic::macho_bundle;
+        case 9: return file_magic::macho_dynamic_linker;
+        case 10: return file_magic::macho_dsym_companion;
+      }
+      break;
+    }
+    case 0xF0: // PowerPC Windows
+    case 0x83: // Alpha 32-bit
+    case 0x84: // Alpha 64-bit
+    case 0x66: // MPS R4000 Windows
+    case 0x50: // mc68K
+    case 0x4c: // 80386 Windows
+      if (magic[1] == 0x01)
+        return file_magic::coff_object;
+
+    case 0x90: // PA-RISC Windows
+    case 0x68: // mc68K Windows
+      if (magic[1] == 0x02)
+        return file_magic::coff_object;
+      break;
+
+    case 0x4d: // Possible MS-DOS stub on Windows PE file
+      if (magic[1] == 0x5a) {
+        uint32_t off =
+          *reinterpret_cast<const support::ulittle32_t*>(magic.data() + 0x3c);
+        // PE/COFF file, either EXE or DLL.
+        if (off < magic.size() && memcmp(magic.data() + off, "PE\0\0",4) == 0)
+          return file_magic::pecoff_executable;
+      }
+      break;
+
+    case 0x64: // x86-64 Windows.
+      if (magic[1] == char(0x86))
+        return file_magic::coff_object;
+      break;
+
+    default:
+      break;
+  }
+  return file_magic::unknown;
+}
+
+error_code identify_magic(const Twine &path, file_magic &result) {
+  SmallString<32> Magic;
+  error_code ec = get_magic(path, Magic.capacity(), Magic);
+  if (ec && ec != errc::value_too_large)
+    return ec;
+
+  result = identify_magic(Magic);
+  return error_code::success();
+}
+
+namespace {
+error_code remove_all_r(StringRef path, file_type ft, uint32_t &count) {
+  if (ft == file_type::directory_file) {
+    // This code would be a lot better with exceptions ;/.
+    error_code ec;
+    directory_iterator i(path, ec);
+    if (ec) return ec;
+    for (directory_iterator e; i != e; i.increment(ec)) {
+      if (ec) return ec;
+      file_status st;
+      if (error_code ec = i->status(st)) return ec;
+      if (error_code ec = remove_all_r(i->path(), st.type(), count)) return ec;
+    }
+    bool obviously_this_exists;
+    if (error_code ec = remove(path, obviously_this_exists)) return ec;
+    assert(obviously_this_exists);
+    ++count; // Include the directory itself in the items removed.
+  } else {
+    bool obviously_this_exists;
+    if (error_code ec = remove(path, obviously_this_exists)) return ec;
+    assert(obviously_this_exists);
+    ++count;
+  }
+
+  return error_code::success();
+}
+} // end unnamed namespace
+
+error_code remove_all(const Twine &path, uint32_t &num_removed) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  file_status fs;
+  if (error_code ec = status(path, fs))
+    return ec;
+  num_removed = 0;
+  return remove_all_r(p, fs.type(), num_removed);
+}
+
+error_code directory_entry::status(file_status &result) const {
+  return fs::status(Path, result);
+}
+
+} // end namespace fs
+} // end namespace sys
+} // end namespace llvm
+
+// Include the truly platform-specific parts.
+#if defined(LLVM_ON_UNIX)
+#include "Unix/PathV2.inc"
+#endif
+#if defined(LLVM_ON_WIN32)
+#include "Windows/PathV2.inc"
+#endif
diff --git a/contrib/llvm/lib/Support/PluginLoader.cpp b/contrib/llvm/lib/Support/PluginLoader.cpp
new file mode 100644
index 000000000000..358137f08f5f
--- /dev/null
+++ b/contrib/llvm/lib/Support/PluginLoader.cpp
@@ -0,0 +1,47 @@
+//===-- PluginLoader.cpp - Implement -load command line option ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the -load <plugin> command line option handler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DONT_GET_PLUGIN_LOADER_OPTION
+#include "llvm/Support/PluginLoader.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+using namespace llvm;
+
+static ManagedStatic<std::vector<std::string> > Plugins;
+static ManagedStatic<sys::SmartMutex<true> > PluginsLock;
+
+void PluginLoader::operator=(const std::string &Filename) {
+  sys::SmartScopedLock<true> Lock(*PluginsLock);
+  std::string Error;
+  if (sys::DynamicLibrary::LoadLibraryPermanently(Filename.c_str(), &Error)) {
+    errs() << "Error opening '" << Filename << "': " << Error
+           << "\n  -load request ignored.\n";
+  } else {
+    Plugins->push_back(Filename);
+  }
+}
+
+unsigned PluginLoader::getNumPlugins() {
+  sys::SmartScopedLock<true> Lock(*PluginsLock);
+  return Plugins.isConstructed() ? Plugins->size() : 0;
+}
+
+std::string &PluginLoader::getPlugin(unsigned num) {
+  sys::SmartScopedLock<true> Lock(*PluginsLock);
+  assert(Plugins.isConstructed() && num < Plugins->size() &&
+         "Asking for an out of bounds plugin");
+  return (*Plugins)[num];
+}
diff --git a/contrib/llvm/lib/Support/PrettyStackTrace.cpp b/contrib/llvm/lib/Support/PrettyStackTrace.cpp
new file mode 100644
index 000000000000..23ee5ab105ae
--- /dev/null
+++ b/contrib/llvm/lib/Support/PrettyStackTrace.cpp
@@ -0,0 +1,137 @@
+//===- PrettyStackTrace.cpp - Pretty Crash Handling -----------------------===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for dealing with the possibility of
+// Unix signals occurring while your program is running.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Config/config.h"     // Get autoconf configuration settings
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/ThreadLocal.h"
+#include "llvm/Support/Watchdog.h"
+#include "llvm/Support/raw_ostream.h"
+
+#ifdef HAVE_CRASHREPORTERCLIENT_H
+#include <CrashReporterClient.h>
+#endif
+
+using namespace llvm;
+
+namespace llvm {
+  bool DisablePrettyStackTrace = false;
+}
+
+// FIXME: This should be thread local when llvm supports threads.
+static sys::ThreadLocal<const PrettyStackTraceEntry> PrettyStackTraceHead;
+
+static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
+  unsigned NextID = 0;
+  if (Entry->getNextEntry())
+    NextID = PrintStack(Entry->getNextEntry(), OS);
+  OS << NextID << ".\t";
+  {
+    sys::Watchdog W(5);
+    Entry->print(OS);
+  }
+  
+  return NextID+1;
+}
+
+/// PrintCurStackTrace - Print the current stack trace to the specified stream.
+static void PrintCurStackTrace(raw_ostream &OS) {
+  // Don't print an empty trace.
+  if (PrettyStackTraceHead.get() == 0) return;
+  
+  // If there are pretty stack frames registered, walk and emit them.
+  OS << "Stack dump:\n";
+  
+  PrintStack(PrettyStackTraceHead.get(), OS);
+  OS.flush();
+}
+
+// Integrate with crash reporter libraries.
+#if defined (__APPLE__) && HAVE_CRASHREPORTERCLIENT_H
+//  If any clients of llvm try to link to libCrashReporterClient.a themselves,
+//  only one crash info struct will be used.
+extern "C" {
+CRASH_REPORTER_CLIENT_HIDDEN 
+struct crashreporter_annotations_t gCRAnnotations 
+        __attribute__((section("__DATA," CRASHREPORTER_ANNOTATIONS_SECTION))) 
+        = { CRASHREPORTER_ANNOTATIONS_VERSION, 0, 0, 0, 0, 0, 0 };
+}
+#elif defined (__APPLE__) && HAVE_CRASHREPORTER_INFO
+static const char *__crashreporter_info__ = 0;
+asm(".desc ___crashreporter_info__, 0x10");
+#endif
+
+
+/// CrashHandler - This callback is run if a fatal signal is delivered to the
+/// process, it prints the pretty stack trace.
+static void CrashHandler(void *) {
+#ifndef __APPLE__
+  // On non-apple systems, just emit the crash stack trace to stderr.
+  PrintCurStackTrace(errs());
+#else
+  // Otherwise, emit to a smallvector of chars, send *that* to stderr, but also
+  // put it into __crashreporter_info__.
+  SmallString<2048> TmpStr;
+  {
+    raw_svector_ostream Stream(TmpStr);
+    PrintCurStackTrace(Stream);
+  }
+  
+  if (!TmpStr.empty()) {
+#ifdef HAVE_CRASHREPORTERCLIENT_H
+    // Cast to void to avoid warning.
+    (void)CRSetCrashLogMessage(std::string(TmpStr.str()).c_str());
+#elif HAVE_CRASHREPORTER_INFO 
+    __crashreporter_info__ = strdup(std::string(TmpStr.str()).c_str());
+#endif
+    errs() << TmpStr.str();
+  }
+  
+#endif
+}
+
+static bool RegisterCrashPrinter() {
+  if (!DisablePrettyStackTrace)
+    sys::AddSignalHandler(CrashHandler, 0);
+  return false;
+}
+
+PrettyStackTraceEntry::PrettyStackTraceEntry() {
+  // The first time this is called, we register the crash printer.
+  static bool HandlerRegistered = RegisterCrashPrinter();
+  (void)HandlerRegistered;
+    
+  // Link ourselves.
+  NextEntry = PrettyStackTraceHead.get();
+  PrettyStackTraceHead.set(this);
+}
+
+PrettyStackTraceEntry::~PrettyStackTraceEntry() {
+  assert(PrettyStackTraceHead.get() == this &&
+         "Pretty stack trace entry destruction is out of order");
+  PrettyStackTraceHead.set(getNextEntry());
+}
+
+void PrettyStackTraceString::print(raw_ostream &OS) const {
+  OS << Str << "\n";
+}
+
+void PrettyStackTraceProgram::print(raw_ostream &OS) const {
+  OS << "Program arguments: ";
+  // Print the argument list.
+  for (unsigned i = 0, e = ArgC; i != e; ++i)
+    OS << ArgV[i] << ' ';
+  OS << '\n';
+}
diff --git a/contrib/llvm/lib/Support/Process.cpp b/contrib/llvm/lib/Support/Process.cpp
new file mode 100644
index 000000000000..2c0d37bb3299
--- /dev/null
+++ b/contrib/llvm/lib/Support/Process.cpp
@@ -0,0 +1,89 @@
+//===-- Process.cpp - Implement OS Process Concept --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This header file implements the operating system Process concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Process.h"
+
+using namespace llvm;
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
+// Empty virtual destructor to anchor the vtable for the process class.
+process::~process() {}
+
+self_process *process::get_self() {
+  // Use a function local static for thread safe initialization and allocate it
+  // as a raw pointer to ensure it is never destroyed.
+  static self_process *SP = new self_process();
+
+  return SP;
+}
+
+#if defined(_MSC_VER)
+// Visual Studio complains that the self_process destructor never exits. This
+// doesn't make much sense, as that's the whole point of calling abort... Just
+// silence this warning.
+#pragma warning(push)
+#pragma warning(disable:4722)
+#endif
+
+// The destructor for the self_process subclass must never actually be
+// executed. There should be at most one instance of this class, and that
+// instance should live until the process terminates to avoid the potential for
+// racy accesses during shutdown.
+self_process::~self_process() {
+  llvm_unreachable("This destructor must never be executed!");
+}
+
+/// \brief A helper function to compute the elapsed wall-time since the program
+/// started.
+///
+/// Note that this routine actually computes the elapsed wall time since the
+/// first time it was called. However, we arrange to have it called during the
+/// startup of the process to get approximately correct results.
+static TimeValue getElapsedWallTime() {
+  static TimeValue &StartTime = *new TimeValue(TimeValue::now());
+  return TimeValue::now() - StartTime;
+}
+
+/// \brief A special global variable to ensure we call \c getElapsedWallTime
+/// during global initialization of the program.
+///
+/// Note that this variable is never referenced elsewhere. Doing so could
+/// create race conditions during program startup or shutdown.
+static volatile TimeValue DummyTimeValue = getElapsedWallTime();
+
+// Implement this routine by using the static helpers above. They're already
+// portable.
+TimeValue self_process::get_wall_time() const {
+  return getElapsedWallTime();
+}
+
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Process.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/Process.inc"
+#endif
diff --git a/contrib/llvm/lib/Support/Program.cpp b/contrib/llvm/lib/Support/Program.cpp
new file mode 100644
index 000000000000..201d5c0d3056
--- /dev/null
+++ b/contrib/llvm/lib/Support/Program.cpp
@@ -0,0 +1,60 @@
+//===-- Program.cpp - Implement OS Program Concept --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This header file implements the operating system Program concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Program.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/system_error.h"
+using namespace llvm;
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
+int
+Program::ExecuteAndWait(const Path& path,
+                        const char** args,
+                        const char** envp,
+                        const Path** redirects,
+                        unsigned secondsToWait,
+                        unsigned memoryLimit,
+                        std::string* ErrMsg,
+                        bool *ExecutionFailed) {
+  Program prg;
+  if (prg.Execute(path, args, envp, redirects, memoryLimit, ErrMsg)) {
+    if (ExecutionFailed) *ExecutionFailed = false;
+    return prg.Wait(path, secondsToWait, ErrMsg);
+  }
+  if (ExecutionFailed) *ExecutionFailed = true;
+  return -1;
+}
+
+void
+Program::ExecuteNoWait(const Path& path,
+                       const char** args,
+                       const char** envp,
+                       const Path** redirects,
+                       unsigned memoryLimit,
+                       std::string* ErrMsg) {
+  Program prg;
+  prg.Execute(path, args, envp, redirects, memoryLimit, ErrMsg);
+}
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Program.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/Program.inc"
+#endif
diff --git a/contrib/llvm/lib/Support/RWMutex.cpp b/contrib/llvm/lib/Support/RWMutex.cpp
new file mode 100644
index 000000000000..6a34f2d08524
--- /dev/null
+++ b/contrib/llvm/lib/Support/RWMutex.cpp
@@ -0,0 +1,125 @@
+//===- RWMutex.cpp - Reader/Writer Mutual Exclusion Lock --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the llvm::sys::RWMutex class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/Support/RWMutex.h"
+#include <cstring>
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
+#if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0
+// Define all methods as no-ops if threading is explicitly disabled
+namespace llvm {
+using namespace sys;
+RWMutexImpl::RWMutexImpl() { }
+RWMutexImpl::~RWMutexImpl() { }
+bool RWMutexImpl::reader_acquire() { return true; }
+bool RWMutexImpl::reader_release() { return true; }
+bool RWMutexImpl::writer_acquire() { return true; }
+bool RWMutexImpl::writer_release() { return true; }
+}
+#else
+
+#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_RWLOCK_INIT)
+
+#include <cassert>
+#include <pthread.h>
+#include <stdlib.h>
+
+namespace llvm {
+using namespace sys;
+
+// Construct a RWMutex using pthread calls
+RWMutexImpl::RWMutexImpl()
+  : data_(0)
+{
+  // Declare the pthread_rwlock data structures
+  pthread_rwlock_t* rwlock =
+    static_cast<pthread_rwlock_t*>(malloc(sizeof(pthread_rwlock_t)));
+
+#ifdef __APPLE__
+  // Workaround a bug/mis-feature in Darwin's pthread_rwlock_init.
+  bzero(rwlock, sizeof(pthread_rwlock_t));
+#endif
+
+  // Initialize the rwlock
+  int errorcode = pthread_rwlock_init(rwlock, NULL);
+  (void)errorcode;
+  assert(errorcode == 0);
+
+  // Assign the data member
+  data_ = rwlock;
+}
+
+// Destruct a RWMutex
+RWMutexImpl::~RWMutexImpl()
+{
+  pthread_rwlock_t* rwlock = static_cast<pthread_rwlock_t*>(data_);
+  assert(rwlock != 0);
+  pthread_rwlock_destroy(rwlock);
+  free(rwlock);
+}
+
+bool
+RWMutexImpl::reader_acquire()
+{
+  pthread_rwlock_t* rwlock = static_cast<pthread_rwlock_t*>(data_);
+  assert(rwlock != 0);
+
+  int errorcode = pthread_rwlock_rdlock(rwlock);
+  return errorcode == 0;
+}
+
+bool
+RWMutexImpl::reader_release()
+{
+  pthread_rwlock_t* rwlock = static_cast<pthread_rwlock_t*>(data_);
+  assert(rwlock != 0);
+
+  int errorcode = pthread_rwlock_unlock(rwlock);
+  return errorcode == 0;
+}
+
+bool
+RWMutexImpl::writer_acquire()
+{
+  pthread_rwlock_t* rwlock = static_cast<pthread_rwlock_t*>(data_);
+  assert(rwlock != 0);
+
+  int errorcode = pthread_rwlock_wrlock(rwlock);
+  return errorcode == 0;
+}
+
+bool
+RWMutexImpl::writer_release()
+{
+  pthread_rwlock_t* rwlock = static_cast<pthread_rwlock_t*>(data_);
+  assert(rwlock != 0);
+
+  int errorcode = pthread_rwlock_unlock(rwlock);
+  return errorcode == 0;
+}
+
+}
+
+#elif defined(LLVM_ON_UNIX)
+#include "Unix/RWMutex.inc"
+#elif defined( LLVM_ON_WIN32)
+#include "Windows/RWMutex.inc"
+#else
+#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in Support/Mutex.cpp
+#endif
+#endif
diff --git a/contrib/llvm/lib/Support/Regex.cpp b/contrib/llvm/lib/Support/Regex.cpp
new file mode 100644
index 000000000000..efc8b90a0090
--- /dev/null
+++ b/contrib/llvm/lib/Support/Regex.cpp
@@ -0,0 +1,170 @@
+//===-- Regex.cpp - Regular Expression matcher implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a POSIX regular expression matcher.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Regex.h"
+#include "regex_impl.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+using namespace llvm;
+
+Regex::Regex(StringRef regex, unsigned Flags) {
+  unsigned flags = 0;
+  preg = new llvm_regex();
+  preg->re_endp = regex.end();
+  if (Flags & IgnoreCase) 
+    flags |= REG_ICASE;
+  if (Flags & Newline)
+    flags |= REG_NEWLINE;
+  if (!(Flags & BasicRegex))
+    flags |= REG_EXTENDED;
+  error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
+}
+
+Regex::~Regex() {
+  llvm_regfree(preg);
+  delete preg;
+}
+
+bool Regex::isValid(std::string &Error) {
+  if (!error)
+    return true;
+  
+  size_t len = llvm_regerror(error, preg, NULL, 0);
+  
+  Error.resize(len);
+  llvm_regerror(error, preg, &Error[0], len);
+  return false;
+}
+
+/// getNumMatches - In a valid regex, return the number of parenthesized
+/// matches it contains.
+unsigned Regex::getNumMatches() const {
+  return preg->re_nsub;
+}
+
+bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
+  unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
+
+  // pmatch needs to have at least one element.
+  SmallVector<llvm_regmatch_t, 8> pm;
+  pm.resize(nmatch > 0 ? nmatch : 1);
+  pm[0].rm_so = 0;
+  pm[0].rm_eo = String.size();
+
+  int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
+
+  if (rc == REG_NOMATCH)
+    return false;
+  if (rc != 0) {
+    // regexec can fail due to invalid pattern or running out of memory.
+    error = rc;
+    return false;
+  }
+
+  // There was a match.
+
+  if (Matches) { // match position requested
+    Matches->clear();
+    
+    for (unsigned i = 0; i != nmatch; ++i) {
+      if (pm[i].rm_so == -1) {
+        // this group didn't match
+        Matches->push_back(StringRef());
+        continue;
+      }
+      assert(pm[i].rm_eo >= pm[i].rm_so);
+      Matches->push_back(StringRef(String.data()+pm[i].rm_so,
+                                   pm[i].rm_eo-pm[i].rm_so));
+    }
+  }
+
+  return true;
+}
+
+std::string Regex::sub(StringRef Repl, StringRef String,
+                       std::string *Error) {
+  SmallVector<StringRef, 8> Matches;
+
+  // Reset error, if given.
+  if (Error && !Error->empty()) *Error = "";
+
+  // Return the input if there was no match.
+  if (!match(String, &Matches))
+    return String;
+
+  // Otherwise splice in the replacement string, starting with the prefix before
+  // the match.
+  std::string Res(String.begin(), Matches[0].begin());
+
+  // Then the replacement string, honoring possible substitutions.
+  while (!Repl.empty()) {
+    // Skip to the next escape.
+    std::pair<StringRef, StringRef> Split = Repl.split('\\');
+
+    // Add the skipped substring.
+    Res += Split.first;
+
+    // Check for terminimation and trailing backslash.
+    if (Split.second.empty()) {
+      if (Repl.size() != Split.first.size() &&
+          Error && Error->empty())
+        *Error = "replacement string contained trailing backslash";
+      break;
+    }
+
+    // Otherwise update the replacement string and interpret escapes.
+    Repl = Split.second;
+
+    // FIXME: We should have a StringExtras function for mapping C99 escapes.
+    switch (Repl[0]) {
+      // Treat all unrecognized characters as self-quoting.
+    default:
+      Res += Repl[0];
+      Repl = Repl.substr(1);
+      break;
+
+      // Single character escapes.
+    case 't':
+      Res += '\t';
+      Repl = Repl.substr(1);
+      break;
+    case 'n':
+      Res += '\n';
+      Repl = Repl.substr(1);
+      break;
+
+      // Decimal escapes are backreferences.
+    case '0': case '1': case '2': case '3': case '4':
+    case '5': case '6': case '7': case '8': case '9': {
+      // Extract the backreference number.
+      StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
+      Repl = Repl.substr(Ref.size());
+
+      unsigned RefValue;
+      if (!Ref.getAsInteger(10, RefValue) &&
+          RefValue < Matches.size())
+        Res += Matches[RefValue];
+      else if (Error && Error->empty())
+        *Error = "invalid backreference string '" + Ref.str() + "'";
+      break;
+    }
+    }
+  }
+
+  // And finally the suffix.
+  Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
+
+  return Res;
+}
diff --git a/contrib/llvm/lib/Support/SearchForAddressOfSpecialSymbol.cpp b/contrib/llvm/lib/Support/SearchForAddressOfSpecialSymbol.cpp
new file mode 100644
index 000000000000..2d23902bb85c
--- /dev/null
+++ b/contrib/llvm/lib/Support/SearchForAddressOfSpecialSymbol.cpp
@@ -0,0 +1,58 @@
+//===- SearchForAddressOfSpecialSymbol.cpp - Function addresses -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file pulls the addresses of certain symbols out of the linker.  It must
+//  include as few header files as possible because it declares the symbols as
+//  void*, which would conflict with the actual symbol type if any header
+//  declared it.
+//
+//===----------------------------------------------------------------------===//
+
+#include <string.h>
+
+// Must declare the symbols in the global namespace.
+static void *DoSearch(const char* symbolName) {
+#define EXPLICIT_SYMBOL(SYM) \
+   extern void *SYM; if (!strcmp(symbolName, #SYM)) return &SYM
+
+  // If this is darwin, it has some funky issues, try to solve them here.  Some
+  // important symbols are marked 'private external' which doesn't allow
+  // SearchForAddressOfSymbol to find them.  As such, we special case them here,
+  // there is only a small handful of them.
+
+#ifdef __APPLE__
+  {
+    // __eprintf is sometimes used for assert() handling on x86.
+    //
+    // FIXME: Currently disabled when using Clang, as we don't always have our
+    // runtime support libraries available.
+#ifndef __clang__
+#ifdef __i386__
+    EXPLICIT_SYMBOL(__eprintf);
+#endif
+#endif
+  }
+#endif
+
+#ifdef __CYGWIN__
+  {
+    EXPLICIT_SYMBOL(_alloca);
+    EXPLICIT_SYMBOL(__main);
+  }
+#endif
+
+#undef EXPLICIT_SYMBOL
+  return 0;
+}
+
+namespace llvm {
+void *SearchForAddressOfSpecialSymbol(const char* symbolName) {
+  return DoSearch(symbolName);
+}
+}  // namespace llvm
diff --git a/contrib/llvm/lib/Support/Signals.cpp b/contrib/llvm/lib/Support/Signals.cpp
new file mode 100644
index 000000000000..a11789372d93
--- /dev/null
+++ b/contrib/llvm/lib/Support/Signals.cpp
@@ -0,0 +1,34 @@
+//===- Signals.cpp - Signal Handling support --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for dealing with the possibility of
+// Unix signals occurring while your program is running.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Signals.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
+}
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Signals.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/Signals.inc"
+#endif
diff --git a/contrib/llvm/lib/Support/SmallPtrSet.cpp b/contrib/llvm/lib/Support/SmallPtrSet.cpp
new file mode 100644
index 000000000000..f0fed7792ce6
--- /dev/null
+++ b/contrib/llvm/lib/Support/SmallPtrSet.cpp
@@ -0,0 +1,273 @@
+//===- llvm/ADT/SmallPtrSet.cpp - 'Normally small' pointer set ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SmallPtrSet class.  See SmallPtrSet.h for an
+// overview of the algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cstdlib>
+
+using namespace llvm;
+
+void SmallPtrSetImpl::shrink_and_clear() {
+  assert(!isSmall() && "Can't shrink a small set!");
+  free(CurArray);
+
+  // Reduce the number of buckets.
+  CurArraySize = NumElements > 16 ? 1 << (Log2_32_Ceil(NumElements) + 1) : 32;
+  NumElements = NumTombstones = 0;
+
+  // Install the new array.  Clear all the buckets to empty.
+  CurArray = (const void**)malloc(sizeof(void*) * CurArraySize);
+  assert(CurArray && "Failed to allocate memory?");
+  memset(CurArray, -1, CurArraySize*sizeof(void*));
+}
+
+bool SmallPtrSetImpl::insert_imp(const void * Ptr) {
+  if (isSmall()) {
+    // Check to see if it is already in the set.
+    for (const void **APtr = SmallArray, **E = SmallArray+NumElements;
+         APtr != E; ++APtr)
+      if (*APtr == Ptr)
+        return false;
+    
+    // Nope, there isn't.  If we stay small, just 'pushback' now.
+    if (NumElements < CurArraySize-1) {
+      SmallArray[NumElements++] = Ptr;
+      return true;
+    }
+    // Otherwise, hit the big set case, which will call grow.
+  }
+  
+  if (NumElements*4 >= CurArraySize*3) {
+    // If more than 3/4 of the array is full, grow.
+    Grow(CurArraySize < 64 ? 128 : CurArraySize*2);
+  } else if (CurArraySize-(NumElements+NumTombstones) < CurArraySize/8) {
+    // If fewer of 1/8 of the array is empty (meaning that many are filled with
+    // tombstones), rehash.
+    Grow(CurArraySize);
+  }
+  
+  // Okay, we know we have space.  Find a hash bucket.
+  const void **Bucket = const_cast<const void**>(FindBucketFor(Ptr));
+  if (*Bucket == Ptr) return false; // Already inserted, good.
+  
+  // Otherwise, insert it!
+  if (*Bucket == getTombstoneMarker())
+    --NumTombstones;
+  *Bucket = Ptr;
+  ++NumElements;  // Track density.
+  return true;
+}
+
+bool SmallPtrSetImpl::erase_imp(const void * Ptr) {
+  if (isSmall()) {
+    // Check to see if it is in the set.
+    for (const void **APtr = SmallArray, **E = SmallArray+NumElements;
+         APtr != E; ++APtr)
+      if (*APtr == Ptr) {
+        // If it is in the set, replace this element.
+        *APtr = E[-1];
+        E[-1] = getEmptyMarker();
+        --NumElements;
+        return true;
+      }
+    
+    return false;
+  }
+  
+  // Okay, we know we have space.  Find a hash bucket.
+  void **Bucket = const_cast<void**>(FindBucketFor(Ptr));
+  if (*Bucket != Ptr) return false;  // Not in the set?
+
+  // Set this as a tombstone.
+  *Bucket = getTombstoneMarker();
+  --NumElements;
+  ++NumTombstones;
+  return true;
+}
+
+const void * const *SmallPtrSetImpl::FindBucketFor(const void *Ptr) const {
+  unsigned Bucket = DenseMapInfo<void *>::getHashValue(Ptr) & (CurArraySize-1);
+  unsigned ArraySize = CurArraySize;
+  unsigned ProbeAmt = 1;
+  const void *const *Array = CurArray;
+  const void *const *Tombstone = 0;
+  while (1) {
+    // Found Ptr's bucket?
+    if (Array[Bucket] == Ptr)
+      return Array+Bucket;
+    
+    // If we found an empty bucket, the pointer doesn't exist in the set.
+    // Return a tombstone if we've seen one so far, or the empty bucket if
+    // not.
+    if (Array[Bucket] == getEmptyMarker())
+      return Tombstone ? Tombstone : Array+Bucket;
+    
+    // If this is a tombstone, remember it.  If Ptr ends up not in the set, we
+    // prefer to return it than something that would require more probing.
+    if (Array[Bucket] == getTombstoneMarker() && !Tombstone)
+      Tombstone = Array+Bucket;  // Remember the first tombstone found.
+    
+    // It's a hash collision or a tombstone. Reprobe.
+    Bucket = (Bucket + ProbeAmt++) & (ArraySize-1);
+  }
+}
+
+/// Grow - Allocate a larger backing store for the buckets and move it over.
+///
+void SmallPtrSetImpl::Grow(unsigned NewSize) {
+  // Allocate at twice as many buckets, but at least 128.
+  unsigned OldSize = CurArraySize;
+  
+  const void **OldBuckets = CurArray;
+  bool WasSmall = isSmall();
+  
+  // Install the new array.  Clear all the buckets to empty.
+  CurArray = (const void**)malloc(sizeof(void*) * NewSize);
+  assert(CurArray && "Failed to allocate memory?");
+  CurArraySize = NewSize;
+  memset(CurArray, -1, NewSize*sizeof(void*));
+  
+  // Copy over all the elements.
+  if (WasSmall) {
+    // Small sets store their elements in order.
+    for (const void **BucketPtr = OldBuckets, **E = OldBuckets+NumElements;
+         BucketPtr != E; ++BucketPtr) {
+      const void *Elt = *BucketPtr;
+      *const_cast<void**>(FindBucketFor(Elt)) = const_cast<void*>(Elt);
+    }
+  } else {
+    // Copy over all valid entries.
+    for (const void **BucketPtr = OldBuckets, **E = OldBuckets+OldSize;
+         BucketPtr != E; ++BucketPtr) {
+      // Copy over the element if it is valid.
+      const void *Elt = *BucketPtr;
+      if (Elt != getTombstoneMarker() && Elt != getEmptyMarker())
+        *const_cast<void**>(FindBucketFor(Elt)) = const_cast<void*>(Elt);
+    }
+    
+    free(OldBuckets);
+    NumTombstones = 0;
+  }
+}
+
+SmallPtrSetImpl::SmallPtrSetImpl(const void **SmallStorage,
+                                 const SmallPtrSetImpl& that) {
+  SmallArray = SmallStorage;
+
+  // If we're becoming small, prepare to insert into our stack space
+  if (that.isSmall()) {
+    CurArray = SmallArray;
+  // Otherwise, allocate new heap space (unless we were the same size)
+  } else {
+    CurArray = (const void**)malloc(sizeof(void*) * that.CurArraySize);
+    assert(CurArray && "Failed to allocate memory?");
+  }
+  
+  // Copy over the new array size
+  CurArraySize = that.CurArraySize;
+
+  // Copy over the contents from the other set
+  memcpy(CurArray, that.CurArray, sizeof(void*)*CurArraySize);
+  
+  NumElements = that.NumElements;
+  NumTombstones = that.NumTombstones;
+}
+
+/// CopyFrom - implement operator= from a smallptrset that has the same pointer
+/// type, but may have a different small size.
+void SmallPtrSetImpl::CopyFrom(const SmallPtrSetImpl &RHS) {
+  if (isSmall() && RHS.isSmall())
+    assert(CurArraySize == RHS.CurArraySize &&
+           "Cannot assign sets with different small sizes");
+
+  // If we're becoming small, prepare to insert into our stack space
+  if (RHS.isSmall()) {
+    if (!isSmall())
+      free(CurArray);
+    CurArray = SmallArray;
+  // Otherwise, allocate new heap space (unless we were the same size)
+  } else if (CurArraySize != RHS.CurArraySize) {
+    if (isSmall())
+      CurArray = (const void**)malloc(sizeof(void*) * RHS.CurArraySize);
+    else
+      CurArray = (const void**)realloc(CurArray, sizeof(void*)*RHS.CurArraySize);
+    assert(CurArray && "Failed to allocate memory?");
+  }
+  
+  // Copy over the new array size
+  CurArraySize = RHS.CurArraySize;
+
+  // Copy over the contents from the other set
+  memcpy(CurArray, RHS.CurArray, sizeof(void*)*CurArraySize);
+  
+  NumElements = RHS.NumElements;
+  NumTombstones = RHS.NumTombstones;
+}
+
+void SmallPtrSetImpl::swap(SmallPtrSetImpl &RHS) {
+  if (this == &RHS) return;
+
+  // We can only avoid copying elements if neither set is small.
+  if (!this->isSmall() && !RHS.isSmall()) {
+    std::swap(this->CurArray, RHS.CurArray);
+    std::swap(this->CurArraySize, RHS.CurArraySize);
+    std::swap(this->NumElements, RHS.NumElements);
+    std::swap(this->NumTombstones, RHS.NumTombstones);
+    return;
+  }
+
+  // FIXME: From here on we assume that both sets have the same small size.
+
+  // If only RHS is small, copy the small elements into LHS and move the pointer
+  // from LHS to RHS.
+  if (!this->isSmall() && RHS.isSmall()) {
+    std::copy(RHS.SmallArray, RHS.SmallArray+RHS.CurArraySize,
+              this->SmallArray);
+    std::swap(this->NumElements, RHS.NumElements);
+    std::swap(this->CurArraySize, RHS.CurArraySize);
+    RHS.CurArray = this->CurArray;
+    RHS.NumTombstones = this->NumTombstones;
+    this->CurArray = this->SmallArray;
+    this->NumTombstones = 0;
+    return;
+  }
+
+  // If only LHS is small, copy the small elements into RHS and move the pointer
+  // from RHS to LHS.
+  if (this->isSmall() && !RHS.isSmall()) {
+    std::copy(this->SmallArray, this->SmallArray+this->CurArraySize,
+              RHS.SmallArray);
+    std::swap(RHS.NumElements, this->NumElements);
+    std::swap(RHS.CurArraySize, this->CurArraySize);
+    this->CurArray = RHS.CurArray;
+    this->NumTombstones = RHS.NumTombstones;
+    RHS.CurArray = RHS.SmallArray;
+    RHS.NumTombstones = 0;
+    return;
+  }
+
+  // Both a small, just swap the small elements.
+  assert(this->isSmall() && RHS.isSmall());
+  assert(this->CurArraySize == RHS.CurArraySize);
+  std::swap_ranges(this->SmallArray, this->SmallArray+this->CurArraySize,
+                   RHS.SmallArray);
+  std::swap(this->NumElements, RHS.NumElements);
+}
+
+SmallPtrSetImpl::~SmallPtrSetImpl() {
+  if (!isSmall())
+    free(CurArray);
+}
diff --git a/contrib/llvm/lib/Support/SmallVector.cpp b/contrib/llvm/lib/Support/SmallVector.cpp
new file mode 100644
index 000000000000..f9c0e78270c9
--- /dev/null
+++ b/contrib/llvm/lib/Support/SmallVector.cpp
@@ -0,0 +1,40 @@
+//===- llvm/ADT/SmallVector.cpp - 'Normally small' vectors ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SmallVector class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+/// grow_pod - This is an implementation of the grow() method which only works
+/// on POD-like datatypes and is out of line to reduce code duplication.
+void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes,
+                               size_t TSize) {
+  size_t CurSizeBytes = size_in_bytes();
+  size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow.
+  if (NewCapacityInBytes < MinSizeInBytes)
+    NewCapacityInBytes = MinSizeInBytes;
+
+  void *NewElts;
+  if (BeginX == FirstEl) {
+    NewElts = malloc(NewCapacityInBytes);
+
+    // Copy the elements over.  No need to run dtors on PODs.
+    memcpy(NewElts, this->BeginX, CurSizeBytes);
+  } else {
+    // If this wasn't grown from the inline copy, grow the allocated space.
+    NewElts = realloc(this->BeginX, NewCapacityInBytes);
+  }
+
+  this->EndX = (char*)NewElts+CurSizeBytes;
+  this->BeginX = NewElts;
+  this->CapacityX = (char*)this->BeginX + NewCapacityInBytes;
+}
diff --git a/contrib/llvm/lib/Support/SourceMgr.cpp b/contrib/llvm/lib/Support/SourceMgr.cpp
new file mode 100644
index 000000000000..fac3cad5cc25
--- /dev/null
+++ b/contrib/llvm/lib/Support/SourceMgr.cpp
@@ -0,0 +1,489 @@
+//===- SourceMgr.cpp - Manager for Simple Source Buffers & Diagnostics ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SourceMgr class.  This class is used as a simple
+// substrate for diagnostics, #include handling, and other low level things for
+// simple parsers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Locale.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+using namespace llvm;
+
+static const size_t TabStop = 8;
+
+namespace {
+  struct LineNoCacheTy {
+    int LastQueryBufferID;
+    const char *LastQuery;
+    unsigned LineNoOfQuery;
+  };
+}
+
+static LineNoCacheTy *getCache(void *Ptr) {
+  return (LineNoCacheTy*)Ptr;
+}
+
+
+SourceMgr::~SourceMgr() {
+  // Delete the line # cache if allocated.
+  if (LineNoCacheTy *Cache = getCache(LineNoCache))
+    delete Cache;
+
+  while (!Buffers.empty()) {
+    delete Buffers.back().Buffer;
+    Buffers.pop_back();
+  }
+}
+
+/// AddIncludeFile - Search for a file with the specified name in the current
+/// directory or in one of the IncludeDirs.  If no file is found, this returns
+/// ~0, otherwise it returns the buffer ID of the stacked file.
+unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
+                                   SMLoc IncludeLoc,
+                                   std::string &IncludedFile) {
+  OwningPtr<MemoryBuffer> NewBuf;
+  IncludedFile = Filename;
+  MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
+
+  // If the file didn't exist directly, see if it's in an include path.
+  for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBuf; ++i) {
+    IncludedFile = IncludeDirectories[i] + "/" + Filename;
+    MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
+  }
+
+  if (NewBuf == 0) return ~0U;
+
+  return AddNewSourceBuffer(NewBuf.take(), IncludeLoc);
+}
+
+
+/// FindBufferContainingLoc - Return the ID of the buffer containing the
+/// specified location, returning -1 if not found.
+int SourceMgr::FindBufferContainingLoc(SMLoc Loc) const {
+  for (unsigned i = 0, e = Buffers.size(); i != e; ++i)
+    if (Loc.getPointer() >= Buffers[i].Buffer->getBufferStart() &&
+        // Use <= here so that a pointer to the null at the end of the buffer
+        // is included as part of the buffer.
+        Loc.getPointer() <= Buffers[i].Buffer->getBufferEnd())
+      return i;
+  return -1;
+}
+
+/// getLineAndColumn - Find the line and column number for the specified
+/// location in the specified file.  This is not a fast method.
+std::pair<unsigned, unsigned>
+SourceMgr::getLineAndColumn(SMLoc Loc, int BufferID) const {
+  if (BufferID == -1) BufferID = FindBufferContainingLoc(Loc);
+  assert(BufferID != -1 && "Invalid Location!");
+
+  MemoryBuffer *Buff = getBufferInfo(BufferID).Buffer;
+
+  // Count the number of \n's between the start of the file and the specified
+  // location.
+  unsigned LineNo = 1;
+
+  const char *BufStart = Buff->getBufferStart();
+  const char *Ptr = BufStart;
+
+  // If we have a line number cache, and if the query is to a later point in the
+  // same file, start searching from the last query location.  This optimizes
+  // for the case when multiple diagnostics come out of one file in order.
+  if (LineNoCacheTy *Cache = getCache(LineNoCache))
+    if (Cache->LastQueryBufferID == BufferID &&
+        Cache->LastQuery <= Loc.getPointer()) {
+      Ptr = Cache->LastQuery;
+      LineNo = Cache->LineNoOfQuery;
+    }
+
+  // Scan for the location being queried, keeping track of the number of lines
+  // we see.
+  for (; SMLoc::getFromPointer(Ptr) != Loc; ++Ptr)
+    if (*Ptr == '\n') ++LineNo;
+
+  // Allocate the line number cache if it doesn't exist.
+  if (LineNoCache == 0)
+    LineNoCache = new LineNoCacheTy();
+
+  // Update the line # cache.
+  LineNoCacheTy &Cache = *getCache(LineNoCache);
+  Cache.LastQueryBufferID = BufferID;
+  Cache.LastQuery = Ptr;
+  Cache.LineNoOfQuery = LineNo;
+  
+  size_t NewlineOffs = StringRef(BufStart, Ptr-BufStart).find_last_of("\n\r");
+  if (NewlineOffs == StringRef::npos) NewlineOffs = ~(size_t)0;
+  return std::make_pair(LineNo, Ptr-BufStart-NewlineOffs);
+}
+
+void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const {
+  if (IncludeLoc == SMLoc()) return;  // Top of stack.
+
+  int CurBuf = FindBufferContainingLoc(IncludeLoc);
+  assert(CurBuf != -1 && "Invalid or unspecified location!");
+
+  PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS);
+
+  OS << "Included from "
+     << getBufferInfo(CurBuf).Buffer->getBufferIdentifier()
+     << ":" << FindLineNumber(IncludeLoc, CurBuf) << ":\n";
+}
+
+
+/// GetMessage - Return an SMDiagnostic at the specified location with the
+/// specified string.
+///
+/// @param Type - If non-null, the kind of message (e.g., "error") which is
+/// prefixed to the message.
+SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+                                   const Twine &Msg,
+                                   ArrayRef<SMRange> Ranges,
+                                   ArrayRef<SMFixIt> FixIts) const {
+
+  // First thing to do: find the current buffer containing the specified
+  // location to pull out the source line.
+  SmallVector<std::pair<unsigned, unsigned>, 4> ColRanges;
+  std::pair<unsigned, unsigned> LineAndCol;
+  const char *BufferID = "<unknown>";
+  std::string LineStr;
+  
+  if (Loc.isValid()) {
+    int CurBuf = FindBufferContainingLoc(Loc);
+    assert(CurBuf != -1 && "Invalid or unspecified location!");
+
+    MemoryBuffer *CurMB = getBufferInfo(CurBuf).Buffer;
+    BufferID = CurMB->getBufferIdentifier();
+    
+    // Scan backward to find the start of the line.
+    const char *LineStart = Loc.getPointer();
+    const char *BufStart = CurMB->getBufferStart();
+    while (LineStart != BufStart && LineStart[-1] != '\n' &&
+           LineStart[-1] != '\r')
+      --LineStart;
+
+    // Get the end of the line.
+    const char *LineEnd = Loc.getPointer();
+    const char *BufEnd = CurMB->getBufferEnd();
+    while (LineEnd != BufEnd && LineEnd[0] != '\n' && LineEnd[0] != '\r')
+      ++LineEnd;
+    LineStr = std::string(LineStart, LineEnd);
+
+    // Convert any ranges to column ranges that only intersect the line of the
+    // location.
+    for (unsigned i = 0, e = Ranges.size(); i != e; ++i) {
+      SMRange R = Ranges[i];
+      if (!R.isValid()) continue;
+      
+      // If the line doesn't contain any part of the range, then ignore it.
+      if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart)
+        continue;
+     
+      // Ignore pieces of the range that go onto other lines.
+      if (R.Start.getPointer() < LineStart)
+        R.Start = SMLoc::getFromPointer(LineStart);
+      if (R.End.getPointer() > LineEnd)
+        R.End = SMLoc::getFromPointer(LineEnd);
+      
+      // Translate from SMLoc ranges to column ranges.
+      // FIXME: Handle multibyte characters.
+      ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart,
+                                         R.End.getPointer()-LineStart));
+    }
+
+    LineAndCol = getLineAndColumn(Loc, CurBuf);
+  }
+    
+  return SMDiagnostic(*this, Loc, BufferID, LineAndCol.first,
+                      LineAndCol.second-1, Kind, Msg.str(),
+                      LineStr, ColRanges, FixIts);
+}
+
+void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+                             const Twine &Msg, ArrayRef<SMRange> Ranges,
+                             ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
+  SMDiagnostic Diagnostic = GetMessage(Loc, Kind, Msg, Ranges, FixIts);
+  
+  // Report the message with the diagnostic handler if present.
+  if (DiagHandler) {
+    DiagHandler(Diagnostic, DiagContext);
+    return;
+  }
+
+  raw_ostream &OS = errs();
+
+  if (Loc != SMLoc()) {
+    int CurBuf = FindBufferContainingLoc(Loc);
+    assert(CurBuf != -1 && "Invalid or unspecified location!");
+    PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS);
+  }
+
+  Diagnostic.print(0, OS, ShowColors);
+}
+
+//===----------------------------------------------------------------------===//
+// SMDiagnostic Implementation
+//===----------------------------------------------------------------------===//
+
+SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN,
+                           int Line, int Col, SourceMgr::DiagKind Kind,
+                           StringRef Msg, StringRef LineStr,
+                           ArrayRef<std::pair<unsigned,unsigned> > Ranges,
+                           ArrayRef<SMFixIt> Hints)
+  : SM(&sm), Loc(L), Filename(FN), LineNo(Line), ColumnNo(Col), Kind(Kind),
+    Message(Msg), LineContents(LineStr), Ranges(Ranges.vec()),
+    FixIts(Hints.begin(), Hints.end()) {
+  std::sort(FixIts.begin(), FixIts.end());
+}
+
+static void buildFixItLine(std::string &CaretLine, std::string &FixItLine,
+                           ArrayRef<SMFixIt> FixIts, ArrayRef<char> SourceLine){
+  if (FixIts.empty())
+    return;
+
+  const char *LineStart = SourceLine.begin();
+  const char *LineEnd = SourceLine.end();
+
+  size_t PrevHintEndCol = 0;
+
+  for (ArrayRef<SMFixIt>::iterator I = FixIts.begin(), E = FixIts.end();
+       I != E; ++I) {
+    // If the fixit contains a newline or tab, ignore it.
+    if (I->getText().find_first_of("\n\r\t") != StringRef::npos)
+      continue;
+
+    SMRange R = I->getRange();
+
+    // If the line doesn't contain any part of the range, then ignore it.
+    if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart)
+      continue;
+
+    // Translate from SMLoc to column.
+    // Ignore pieces of the range that go onto other lines.
+    // FIXME: Handle multibyte characters in the source line.
+    unsigned FirstCol;
+    if (R.Start.getPointer() < LineStart)
+      FirstCol = 0;
+    else
+      FirstCol = R.Start.getPointer() - LineStart;
+
+    // If we inserted a long previous hint, push this one forwards, and add
+    // an extra space to show that this is not part of the previous
+    // completion. This is sort of the best we can do when two hints appear
+    // to overlap.
+    //
+    // Note that if this hint is located immediately after the previous
+    // hint, no space will be added, since the location is more important.
+    unsigned HintCol = FirstCol;
+    if (HintCol < PrevHintEndCol)
+      HintCol = PrevHintEndCol + 1;
+
+    // FIXME: This assertion is intended to catch unintended use of multibyte
+    // characters in fixits. If we decide to do this, we'll have to track
+    // separate byte widths for the source and fixit lines.
+    assert((size_t)llvm::sys::locale::columnWidth(I->getText()) ==
+           I->getText().size());
+
+    // This relies on one byte per column in our fixit hints.
+    unsigned LastColumnModified = HintCol + I->getText().size();
+    if (LastColumnModified > FixItLine.size())
+      FixItLine.resize(LastColumnModified, ' ');
+
+    std::copy(I->getText().begin(), I->getText().end(),
+              FixItLine.begin() + HintCol);
+
+    PrevHintEndCol = LastColumnModified;
+
+    // For replacements, mark the removal range with '~'.
+    // FIXME: Handle multibyte characters in the source line.
+    unsigned LastCol;
+    if (R.End.getPointer() >= LineEnd)
+      LastCol = LineEnd - LineStart;
+    else
+      LastCol = R.End.getPointer() - LineStart;
+
+    std::fill(&CaretLine[FirstCol], &CaretLine[LastCol], '~');
+  }
+}
+
+static void printSourceLine(raw_ostream &S, StringRef LineContents) {
+  // Print out the source line one character at a time, so we can expand tabs.
+  for (unsigned i = 0, e = LineContents.size(), OutCol = 0; i != e; ++i) {
+    if (LineContents[i] != '\t') {
+      S << LineContents[i];
+      ++OutCol;
+      continue;
+    }
+
+    // If we have a tab, emit at least one space, then round up to 8 columns.
+    do {
+      S << ' ';
+      ++OutCol;
+    } while ((OutCol % TabStop) != 0);
+  }
+  S << '\n';
+}
+
+static bool isNonASCII(char c) {
+  return c & 0x80;
+}
+
+void SMDiagnostic::print(const char *ProgName, raw_ostream &S,
+                         bool ShowColors) const {
+  // Display colors only if OS supports colors.
+  ShowColors &= S.has_colors();
+
+  if (ShowColors)
+    S.changeColor(raw_ostream::SAVEDCOLOR, true);
+
+  if (ProgName && ProgName[0])
+    S << ProgName << ": ";
+
+  if (!Filename.empty()) {
+    if (Filename == "-")
+      S << "<stdin>";
+    else
+      S << Filename;
+
+    if (LineNo != -1) {
+      S << ':' << LineNo;
+      if (ColumnNo != -1)
+        S << ':' << (ColumnNo+1);
+    }
+    S << ": ";
+  }
+
+  switch (Kind) {
+  case SourceMgr::DK_Error:
+    if (ShowColors)
+      S.changeColor(raw_ostream::RED, true);
+    S << "error: ";
+    break;
+  case SourceMgr::DK_Warning:
+    if (ShowColors)
+      S.changeColor(raw_ostream::MAGENTA, true);
+    S << "warning: ";
+    break;
+  case SourceMgr::DK_Note:
+    if (ShowColors)
+      S.changeColor(raw_ostream::BLACK, true);
+    S << "note: ";
+    break;
+  }
+
+  if (ShowColors) {
+    S.resetColor();
+    S.changeColor(raw_ostream::SAVEDCOLOR, true);
+  }
+
+  S << Message << '\n';
+
+  if (ShowColors)
+    S.resetColor();
+
+  if (LineNo == -1 || ColumnNo == -1)
+    return;
+
+  // FIXME: If there are multibyte or multi-column characters in the source, all
+  // our ranges will be wrong. To do this properly, we'll need a byte-to-column
+  // map like Clang's TextDiagnostic. For now, we'll just handle tabs by
+  // expanding them later, and bail out rather than show incorrect ranges and
+  // misaligned fixits for any other odd characters.
+  if (std::find_if(LineContents.begin(), LineContents.end(), isNonASCII) !=
+      LineContents.end()) {
+    printSourceLine(S, LineContents);
+    return;
+  }
+  size_t NumColumns = LineContents.size();
+
+  // Build the line with the caret and ranges.
+  std::string CaretLine(NumColumns+1, ' ');
+  
+  // Expand any ranges.
+  for (unsigned r = 0, e = Ranges.size(); r != e; ++r) {
+    std::pair<unsigned, unsigned> R = Ranges[r];
+    std::fill(&CaretLine[R.first],
+              &CaretLine[std::min((size_t)R.second, CaretLine.size())],
+              '~');
+  }
+
+  // Add any fix-its.
+  // FIXME: Find the beginning of the line properly for multibyte characters.
+  std::string FixItInsertionLine;
+  buildFixItLine(CaretLine, FixItInsertionLine, FixIts,
+                 makeArrayRef(Loc.getPointer() - ColumnNo,
+                              LineContents.size()));
+
+  // Finally, plop on the caret.
+  if (unsigned(ColumnNo) <= NumColumns)
+    CaretLine[ColumnNo] = '^';
+  else 
+    CaretLine[NumColumns] = '^';
+  
+  // ... and remove trailing whitespace so the output doesn't wrap for it.  We
+  // know that the line isn't completely empty because it has the caret in it at
+  // least.
+  CaretLine.erase(CaretLine.find_last_not_of(' ')+1);
+  
+  printSourceLine(S, LineContents);
+
+  if (ShowColors)
+    S.changeColor(raw_ostream::GREEN, true);
+
+  // Print out the caret line, matching tabs in the source line.
+  for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
+    if (i >= LineContents.size() || LineContents[i] != '\t') {
+      S << CaretLine[i];
+      ++OutCol;
+      continue;
+    }
+    
+    // Okay, we have a tab.  Insert the appropriate number of characters.
+    do {
+      S << CaretLine[i];
+      ++OutCol;
+    } while ((OutCol % TabStop) != 0);
+  }
+  S << '\n';
+
+  if (ShowColors)
+    S.resetColor();
+
+  // Print out the replacement line, matching tabs in the source line.
+  if (FixItInsertionLine.empty())
+    return;
+  
+  for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i != e; ++i) {
+    if (i >= LineContents.size() || LineContents[i] != '\t') {
+      S << FixItInsertionLine[i];
+      ++OutCol;
+      continue;
+    }
+
+    // Okay, we have a tab.  Insert the appropriate number of characters.
+    do {
+      S << FixItInsertionLine[i];
+      // FIXME: This is trying not to break up replacements, but then to re-sync
+      // with the tabs between replacements. This will fail, though, if two
+      // fix-it replacements are exactly adjacent, or if a fix-it contains a
+      // space. Really we should be precomputing column widths, which we'll
+      // need anyway for multibyte chars.
+      if (FixItInsertionLine[i] != ' ')
+        ++i;
+      ++OutCol;
+    } while (((OutCol % TabStop) != 0) && i != e);
+  }
+  S << '\n';
+}
diff --git a/contrib/llvm/lib/Support/Statistic.cpp b/contrib/llvm/lib/Support/Statistic.cpp
new file mode 100644
index 000000000000..9c28176b730e
--- /dev/null
+++ b/contrib/llvm/lib/Support/Statistic.cpp
@@ -0,0 +1,170 @@
+//===-- Statistic.cpp - Easy way to expose stats information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the 'Statistic' class, which is designed to be an easy
+// way to expose various success metrics from passes.  These statistics are
+// printed at the end of a run, when the -stats command line option is enabled
+// on the command line.
+//
+// This is useful for reporting information like the number of instructions
+// simplified, optimized or removed by various transformations, like this:
+//
+// static Statistic NumInstEliminated("GCSE", "Number of instructions killed");
+//
+// Later, in the code: ++NumInstEliminated;
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstring>
+using namespace llvm;
+
+// CreateInfoOutputFile - Return a file stream to print our output on.
+namespace llvm { extern raw_ostream *CreateInfoOutputFile(); }
+
+/// -stats - Command line option to cause transformations to emit stats about
+/// what they did.
+///
+static cl::opt<bool>
+Enabled(
+    "stats",
+    cl::desc("Enable statistics output from program (available with Asserts)"));
+
+
+namespace {
+/// StatisticInfo - This class is used in a ManagedStatic so that it is created
+/// on demand (when the first statistic is bumped) and destroyed only when
+/// llvm_shutdown is called.  We print statistics from the destructor.
+class StatisticInfo {
+  std::vector<const Statistic*> Stats;
+  friend void llvm::PrintStatistics();
+  friend void llvm::PrintStatistics(raw_ostream &OS);
+public:
+  ~StatisticInfo();
+
+  void addStatistic(const Statistic *S) {
+    Stats.push_back(S);
+  }
+};
+}
+
+static ManagedStatic<StatisticInfo> StatInfo;
+static ManagedStatic<sys::SmartMutex<true> > StatLock;
+
+/// RegisterStatistic - The first time a statistic is bumped, this method is
+/// called.
+void Statistic::RegisterStatistic() {
+  // If stats are enabled, inform StatInfo that this statistic should be
+  // printed.
+  sys::SmartScopedLock<true> Writer(*StatLock);
+  if (!Initialized) {
+    if (Enabled)
+      StatInfo->addStatistic(this);
+
+    TsanHappensBefore(this);
+    sys::MemoryFence();
+    // Remember we have been registered.
+    TsanIgnoreWritesBegin();
+    Initialized = true;
+    TsanIgnoreWritesEnd();
+  }
+}
+
+namespace {
+
+struct NameCompare {
+  bool operator()(const Statistic *LHS, const Statistic *RHS) const {
+    int Cmp = std::strcmp(LHS->getName(), RHS->getName());
+    if (Cmp != 0) return Cmp < 0;
+
+    // Secondary key is the description.
+    return std::strcmp(LHS->getDesc(), RHS->getDesc()) < 0;
+  }
+};
+
+}
+
+// Print information when destroyed, iff command line option is specified.
+StatisticInfo::~StatisticInfo() {
+  llvm::PrintStatistics();
+}
+
+void llvm::EnableStatistics() {
+  Enabled.setValue(true);
+}
+
+bool llvm::AreStatisticsEnabled() {
+  return Enabled;
+}
+
+void llvm::PrintStatistics(raw_ostream &OS) {
+  StatisticInfo &Stats = *StatInfo;
+
+  // Figure out how long the biggest Value and Name fields are.
+  unsigned MaxNameLen = 0, MaxValLen = 0;
+  for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i) {
+    MaxValLen = std::max(MaxValLen,
+                         (unsigned)utostr(Stats.Stats[i]->getValue()).size());
+    MaxNameLen = std::max(MaxNameLen,
+                          (unsigned)std::strlen(Stats.Stats[i]->getName()));
+  }
+
+  // Sort the fields by name.
+  std::stable_sort(Stats.Stats.begin(), Stats.Stats.end(), NameCompare());
+
+  // Print out the statistics header...
+  OS << "===" << std::string(73, '-') << "===\n"
+     << "                          ... Statistics Collected ...\n"
+     << "===" << std::string(73, '-') << "===\n\n";
+
+  // Print all of the statistics.
+  for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i)
+    OS << format("%*u %-*s - %s\n",
+                 MaxValLen, Stats.Stats[i]->getValue(),
+                 MaxNameLen, Stats.Stats[i]->getName(),
+                 Stats.Stats[i]->getDesc());
+
+  OS << '\n';  // Flush the output stream.
+  OS.flush();
+
+}
+
+void llvm::PrintStatistics() {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
+  StatisticInfo &Stats = *StatInfo;
+
+  // Statistics not enabled?
+  if (Stats.Stats.empty()) return;
+
+  // Get the stream to write to.
+  raw_ostream &OutStream = *CreateInfoOutputFile();
+  PrintStatistics(OutStream);
+  delete &OutStream;   // Close the file.
+#else
+  // Check if the -stats option is set instead of checking
+  // !Stats.Stats.empty().  In release builds, Statistics operators
+  // do nothing, so stats are never Registered.
+  if (Enabled) {
+    // Get the stream to write to.
+    raw_ostream &OutStream = *CreateInfoOutputFile();
+    OutStream << "Statistics are disabled.  "
+            << "Build with asserts or with -DLLVM_ENABLE_STATS\n";
+    OutStream.flush();
+    delete &OutStream;   // Close the file.
+  }
+#endif
+}
diff --git a/contrib/llvm/lib/Support/StreamableMemoryObject.cpp b/contrib/llvm/lib/Support/StreamableMemoryObject.cpp
new file mode 100644
index 000000000000..59e27a263e06
--- /dev/null
+++ b/contrib/llvm/lib/Support/StreamableMemoryObject.cpp
@@ -0,0 +1,146 @@
+//===- StreamableMemoryObject.cpp - Streamable data interface -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/StreamableMemoryObject.h"
+#include "llvm/Support/Compiler.h"
+#include <cassert>
+#include <cstring>
+
+
+using namespace llvm;
+
+namespace {
+
+class RawMemoryObject : public StreamableMemoryObject {
+public:
+  RawMemoryObject(const unsigned char *Start, const unsigned char *End) :
+    FirstChar(Start), LastChar(End) {
+    assert(LastChar >= FirstChar && "Invalid start/end range");
+  }
+
+  virtual uint64_t getBase() const LLVM_OVERRIDE { return 0; }
+  virtual uint64_t getExtent() const LLVM_OVERRIDE {
+    return LastChar - FirstChar;
+  }
+  virtual int readByte(uint64_t address, uint8_t* ptr) const LLVM_OVERRIDE;
+  virtual int readBytes(uint64_t address,
+                        uint64_t size,
+                        uint8_t* buf,
+                        uint64_t* copied) const LLVM_OVERRIDE;
+  virtual const uint8_t *getPointer(uint64_t address,
+                                    uint64_t size) const LLVM_OVERRIDE;
+  virtual bool isValidAddress(uint64_t address) const LLVM_OVERRIDE {
+    return validAddress(address);
+  }
+  virtual bool isObjectEnd(uint64_t address) const LLVM_OVERRIDE {
+    return objectEnd(address);
+  }
+
+private:
+  const uint8_t* const FirstChar;
+  const uint8_t* const LastChar;
+
+  // These are implemented as inline functions here to avoid multiple virtual
+  // calls per public function
+  bool validAddress(uint64_t address) const {
+    return static_cast<ptrdiff_t>(address) < LastChar - FirstChar;
+  }
+  bool objectEnd(uint64_t address) const {
+    return static_cast<ptrdiff_t>(address) == LastChar - FirstChar;
+  }
+
+  RawMemoryObject(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
+  void operator=(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
+};
+
+int RawMemoryObject::readByte(uint64_t address, uint8_t* ptr) const {
+  if (!validAddress(address)) return -1;
+  *ptr = *((uint8_t *)(uintptr_t)(address + FirstChar));
+  return 0;
+}
+
+int RawMemoryObject::readBytes(uint64_t address,
+                               uint64_t size,
+                               uint8_t* buf,
+                               uint64_t* copied) const {
+  if (!validAddress(address) || !validAddress(address + size - 1)) return -1;
+  memcpy(buf, (uint8_t *)(uintptr_t)(address + FirstChar), size);
+  if (copied) *copied = size;
+  return size;
+}
+
+const uint8_t *RawMemoryObject::getPointer(uint64_t address,
+                                           uint64_t size) const {
+  return FirstChar + address;
+}
+} // anonymous namespace
+
+namespace llvm {
+// If the bitcode has a header, then its size is known, and we don't have to
+// block until we actually want to read it.
+bool StreamingMemoryObject::isValidAddress(uint64_t address) const {
+  if (ObjectSize && address < ObjectSize) return true;
+    return fetchToPos(address);
+}
+
+bool StreamingMemoryObject::isObjectEnd(uint64_t address) const {
+  if (ObjectSize) return address == ObjectSize;
+  fetchToPos(address);
+  return address == ObjectSize && address != 0;
+}
+
+uint64_t StreamingMemoryObject::getExtent() const {
+  if (ObjectSize) return ObjectSize;
+  size_t pos = BytesRead + kChunkSize;
+  // keep fetching until we run out of bytes
+  while (fetchToPos(pos)) pos += kChunkSize;
+  return ObjectSize;
+}
+
+int StreamingMemoryObject::readByte(uint64_t address, uint8_t* ptr) const {
+  if (!fetchToPos(address)) return -1;
+  *ptr = Bytes[address + BytesSkipped];
+  return 0;
+}
+
+int StreamingMemoryObject::readBytes(uint64_t address,
+                                     uint64_t size,
+                                     uint8_t* buf,
+                                     uint64_t* copied) const {
+  if (!fetchToPos(address + size - 1)) return -1;
+  memcpy(buf, &Bytes[address + BytesSkipped], size);
+  if (copied) *copied = size;
+  return 0;
+}
+
+bool StreamingMemoryObject::dropLeadingBytes(size_t s) {
+  if (BytesRead < s) return true;
+  BytesSkipped = s;
+  BytesRead -= s;
+  return false;
+}
+
+void StreamingMemoryObject::setKnownObjectSize(size_t size) {
+  ObjectSize = size;
+  Bytes.reserve(size);
+}
+
+StreamableMemoryObject *getNonStreamedMemoryObject(
+    const unsigned char *Start, const unsigned char *End) {
+  return new RawMemoryObject(Start, End);
+}
+
+StreamableMemoryObject::~StreamableMemoryObject() { }
+
+StreamingMemoryObject::StreamingMemoryObject(DataStreamer *streamer) :
+  Bytes(kChunkSize), Streamer(streamer), BytesRead(0), BytesSkipped(0),
+  ObjectSize(0), EOFReached(false) {
+  BytesRead = streamer->GetBytes(&Bytes[0], kChunkSize);
+}
+}
diff --git a/contrib/llvm/lib/Support/StringExtras.cpp b/contrib/llvm/lib/Support/StringExtras.cpp
new file mode 100644
index 000000000000..d77ad7f55a18
--- /dev/null
+++ b/contrib/llvm/lib/Support/StringExtras.cpp
@@ -0,0 +1,59 @@
+//===-- StringExtras.cpp - Implement the StringExtras header --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StringExtras.h header
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+/// StrInStrNoCase - Portable version of strcasestr.  Locates the first
+/// occurrence of string 's1' in string 's2', ignoring case.  Returns
+/// the offset of s2 in s1 or npos if s2 cannot be found.
+StringRef::size_type llvm::StrInStrNoCase(StringRef s1, StringRef s2) {
+  size_t N = s2.size(), M = s1.size();
+  if (N > M)
+    return StringRef::npos;
+  for (size_t i = 0, e = M - N + 1; i != e; ++i)
+    if (s1.substr(i, N).equals_lower(s2))
+      return i;
+  return StringRef::npos;
+}
+
+/// getToken - This function extracts one token from source, ignoring any
+/// leading characters that appear in the Delimiters string, and ending the
+/// token at any of the characters that appear in the Delimiters string.  If
+/// there are no tokens in the source string, an empty string is returned.
+/// The function returns a pair containing the extracted token and the
+/// remaining tail string.
+std::pair<StringRef, StringRef> llvm::getToken(StringRef Source,
+                                               StringRef Delimiters) {
+  // Figure out where the token starts.
+  StringRef::size_type Start = Source.find_first_not_of(Delimiters);
+
+  // Find the next occurrence of the delimiter.
+  StringRef::size_type End = Source.find_first_of(Delimiters, Start);
+
+  return std::make_pair(Source.slice(Start, End), Source.substr(End));
+}
+
+/// SplitString - Split up the specified string according to the specified
+/// delimiters, appending the result fragments to the output list.
+void llvm::SplitString(StringRef Source,
+                       SmallVectorImpl<StringRef> &OutFragments,
+                       StringRef Delimiters) {
+  std::pair<StringRef, StringRef> S = getToken(Source, Delimiters);
+  while (!S.first.empty()) {
+    OutFragments.push_back(S.first);
+    S = getToken(S.second, Delimiters);
+  }
+}
diff --git a/contrib/llvm/lib/Support/StringMap.cpp b/contrib/llvm/lib/Support/StringMap.cpp
new file mode 100644
index 000000000000..9ac1f867fdd4
--- /dev/null
+++ b/contrib/llvm/lib/Support/StringMap.cpp
@@ -0,0 +1,238 @@
+//===--- StringMap.cpp - String Hash table map implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StringMap class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
+#include <cassert>
+using namespace llvm;
+
+StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) {
+  ItemSize = itemSize;
+  
+  // If a size is specified, initialize the table with that many buckets.
+  if (InitSize) {
+    init(InitSize);
+    return;
+  }
+  
+  // Otherwise, initialize it with zero buckets to avoid the allocation.
+  TheTable = 0;
+  NumBuckets = 0;
+  NumItems = 0;
+  NumTombstones = 0;
+}
+
+void StringMapImpl::init(unsigned InitSize) {
+  assert((InitSize & (InitSize-1)) == 0 &&
+         "Init Size must be a power of 2 or zero!");
+  NumBuckets = InitSize ? InitSize : 16;
+  NumItems = 0;
+  NumTombstones = 0;
+  
+  TheTable = (StringMapEntryBase **)calloc(NumBuckets+1,
+                                           sizeof(StringMapEntryBase **) +
+                                           sizeof(unsigned));
+
+  // Allocate one extra bucket, set it to look filled so the iterators stop at
+  // end.
+  TheTable[NumBuckets] = (StringMapEntryBase*)2;
+}
+
+
+/// LookupBucketFor - Look up the bucket that the specified string should end
+/// up in.  If it already exists as a key in the map, the Item pointer for the
+/// specified bucket will be non-null.  Otherwise, it will be null.  In either
+/// case, the FullHashValue field of the bucket will be set to the hash value
+/// of the string.
+unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
+  unsigned HTSize = NumBuckets;
+  if (HTSize == 0) {  // Hash table unallocated so far?
+    init(16);
+    HTSize = NumBuckets;
+  }
+  unsigned FullHashValue = HashString(Name);
+  unsigned BucketNo = FullHashValue & (HTSize-1);
+  unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
+
+  unsigned ProbeAmt = 1;
+  int FirstTombstone = -1;
+  while (1) {
+    StringMapEntryBase *BucketItem = TheTable[BucketNo];
+    // If we found an empty bucket, this key isn't in the table yet, return it.
+    if (LLVM_LIKELY(BucketItem == 0)) {
+      // If we found a tombstone, we want to reuse the tombstone instead of an
+      // empty bucket.  This reduces probing.
+      if (FirstTombstone != -1) {
+        HashTable[FirstTombstone] = FullHashValue;
+        return FirstTombstone;
+      }
+      
+      HashTable[BucketNo] = FullHashValue;
+      return BucketNo;
+    }
+    
+    if (BucketItem == getTombstoneVal()) {
+      // Skip over tombstones.  However, remember the first one we see.
+      if (FirstTombstone == -1) FirstTombstone = BucketNo;
+    } else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
+      // If the full hash value matches, check deeply for a match.  The common
+      // case here is that we are only looking at the buckets (for item info
+      // being non-null and for the full hash value) not at the items.  This
+      // is important for cache locality.
+      
+      // Do the comparison like this because Name isn't necessarily
+      // null-terminated!
+      char *ItemStr = (char*)BucketItem+ItemSize;
+      if (Name == StringRef(ItemStr, BucketItem->getKeyLength())) {
+        // We found a match!
+        return BucketNo;
+      }
+    }
+    
+    // Okay, we didn't find the item.  Probe to the next bucket.
+    BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
+    
+    // Use quadratic probing, it has fewer clumping artifacts than linear
+    // probing and has good cache behavior in the common case.
+    ++ProbeAmt;
+  }
+}
+
+
+/// FindKey - Look up the bucket that contains the specified key. If it exists
+/// in the map, return the bucket number of the key.  Otherwise return -1.
+/// This does not modify the map.
+int StringMapImpl::FindKey(StringRef Key) const {
+  unsigned HTSize = NumBuckets;
+  if (HTSize == 0) return -1;  // Really empty table?
+  unsigned FullHashValue = HashString(Key);
+  unsigned BucketNo = FullHashValue & (HTSize-1);
+  unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
+
+  unsigned ProbeAmt = 1;
+  while (1) {
+    StringMapEntryBase *BucketItem = TheTable[BucketNo];
+    // If we found an empty bucket, this key isn't in the table yet, return.
+    if (LLVM_LIKELY(BucketItem == 0))
+      return -1;
+    
+    if (BucketItem == getTombstoneVal()) {
+      // Ignore tombstones.
+    } else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
+      // If the full hash value matches, check deeply for a match.  The common
+      // case here is that we are only looking at the buckets (for item info
+      // being non-null and for the full hash value) not at the items.  This
+      // is important for cache locality.
+      
+      // Do the comparison like this because NameStart isn't necessarily
+      // null-terminated!
+      char *ItemStr = (char*)BucketItem+ItemSize;
+      if (Key == StringRef(ItemStr, BucketItem->getKeyLength())) {
+        // We found a match!
+        return BucketNo;
+      }
+    }
+    
+    // Okay, we didn't find the item.  Probe to the next bucket.
+    BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
+    
+    // Use quadratic probing, it has fewer clumping artifacts than linear
+    // probing and has good cache behavior in the common case.
+    ++ProbeAmt;
+  }
+}
+
+/// RemoveKey - Remove the specified StringMapEntry from the table, but do not
+/// delete it.  This aborts if the value isn't in the table.
+void StringMapImpl::RemoveKey(StringMapEntryBase *V) {
+  const char *VStr = (char*)V + ItemSize;
+  StringMapEntryBase *V2 = RemoveKey(StringRef(VStr, V->getKeyLength()));
+  (void)V2;
+  assert(V == V2 && "Didn't find key?");
+}
+
+/// RemoveKey - Remove the StringMapEntry for the specified key from the
+/// table, returning it.  If the key is not in the table, this returns null.
+StringMapEntryBase *StringMapImpl::RemoveKey(StringRef Key) {
+  int Bucket = FindKey(Key);
+  if (Bucket == -1) return 0;
+  
+  StringMapEntryBase *Result = TheTable[Bucket];
+  TheTable[Bucket] = getTombstoneVal();
+  --NumItems;
+  ++NumTombstones;
+  assert(NumItems + NumTombstones <= NumBuckets);
+
+  return Result;
+}
+
+
+
+/// RehashTable - Grow the table, redistributing values into the buckets with
+/// the appropriate mod-of-hashtable-size.
+void StringMapImpl::RehashTable() {
+  unsigned NewSize;
+  unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
+
+  // If the hash table is now more than 3/4 full, or if fewer than 1/8 of
+  // the buckets are empty (meaning that many are filled with tombstones),
+  // grow/rehash the table.
+  if (NumItems*4 > NumBuckets*3) {
+    NewSize = NumBuckets*2;
+  } else if (NumBuckets-(NumItems+NumTombstones) <= NumBuckets/8) {
+    NewSize = NumBuckets;
+  } else {
+    return;
+  }
+
+  // Allocate one extra bucket which will always be non-empty.  This allows the
+  // iterators to stop at end.
+  StringMapEntryBase **NewTableArray =
+    (StringMapEntryBase **)calloc(NewSize+1, sizeof(StringMapEntryBase *) +
+                                             sizeof(unsigned));
+  unsigned *NewHashArray = (unsigned *)(NewTableArray + NewSize + 1);
+  NewTableArray[NewSize] = (StringMapEntryBase*)2;
+
+  // Rehash all the items into their new buckets.  Luckily :) we already have
+  // the hash values available, so we don't have to rehash any strings.
+  for (unsigned I = 0, E = NumBuckets; I != E; ++I) {
+    StringMapEntryBase *Bucket = TheTable[I];
+    if (Bucket && Bucket != getTombstoneVal()) {
+      // Fast case, bucket available.
+      unsigned FullHash = HashTable[I];
+      unsigned NewBucket = FullHash & (NewSize-1);
+      if (NewTableArray[NewBucket] == 0) {
+        NewTableArray[FullHash & (NewSize-1)] = Bucket;
+        NewHashArray[FullHash & (NewSize-1)] = FullHash;
+        continue;
+      }
+      
+      // Otherwise probe for a spot.
+      unsigned ProbeSize = 1;
+      do {
+        NewBucket = (NewBucket + ProbeSize++) & (NewSize-1);
+      } while (NewTableArray[NewBucket]);
+      
+      // Finally found a slot.  Fill it in.
+      NewTableArray[NewBucket] = Bucket;
+      NewHashArray[NewBucket] = FullHash;
+    }
+  }
+  
+  free(TheTable);
+  
+  TheTable = NewTableArray;
+  NumBuckets = NewSize;
+  NumTombstones = 0;
+}
diff --git a/contrib/llvm/lib/Support/StringPool.cpp b/contrib/llvm/lib/Support/StringPool.cpp
new file mode 100644
index 000000000000..ff607cf8c4ad
--- /dev/null
+++ b/contrib/llvm/lib/Support/StringPool.cpp
@@ -0,0 +1,35 @@
+//===-- StringPool.cpp - Interned string pool -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StringPool class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/StringPool.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace llvm;
+
+StringPool::StringPool() {}
+
+StringPool::~StringPool() {
+  assert(InternTable.empty() && "PooledStringPtr leaked!");
+}
+
+PooledStringPtr StringPool::intern(StringRef Key) {
+  table_t::iterator I = InternTable.find(Key);
+  if (I != InternTable.end())
+    return PooledStringPtr(&*I);
+  
+  entry_t *S = entry_t::Create(Key.begin(), Key.end());
+  S->getValue().Pool = this;
+  InternTable.insert(S);
+  
+  return PooledStringPtr(S);
+}
diff --git a/contrib/llvm/lib/Support/StringRef.cpp b/contrib/llvm/lib/Support/StringRef.cpp
new file mode 100644
index 000000000000..d7a0bfa41005
--- /dev/null
+++ b/contrib/llvm/lib/Support/StringRef.cpp
@@ -0,0 +1,467 @@
+//===-- StringRef.cpp - Lightweight String References ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/edit_distance.h"
+#include <bitset>
+
+using namespace llvm;
+
+// MSVC emits references to this into the translation units which reference it.
+#ifndef _MSC_VER
+const size_t StringRef::npos;
+#endif
+
+static char ascii_tolower(char x) {
+  if (x >= 'A' && x <= 'Z')
+    return x - 'A' + 'a';
+  return x;
+}
+
+static char ascii_toupper(char x) {
+  if (x >= 'a' && x <= 'z')
+    return x - 'a' + 'A';
+  return x;
+}
+
+static bool ascii_isdigit(char x) {
+  return x >= '0' && x <= '9';
+}
+
+/// compare_lower - Compare strings, ignoring case.
+int StringRef::compare_lower(StringRef RHS) const {
+  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
+    unsigned char LHC = ascii_tolower(Data[I]);
+    unsigned char RHC = ascii_tolower(RHS.Data[I]);
+    if (LHC != RHC)
+      return LHC < RHC ? -1 : 1;
+  }
+
+  if (Length == RHS.Length)
+    return 0;
+  return Length < RHS.Length ? -1 : 1;
+}
+
+/// compare_numeric - Compare strings, handle embedded numbers.
+int StringRef::compare_numeric(StringRef RHS) const {
+  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
+    // Check for sequences of digits.
+    if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
+      // The longer sequence of numbers is considered larger.
+      // This doesn't really handle prefixed zeros well.
+      size_t J;
+      for (J = I + 1; J != E + 1; ++J) {
+        bool ld = J < Length && ascii_isdigit(Data[J]);
+        bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
+        if (ld != rd)
+          return rd ? -1 : 1;
+        if (!rd)
+          break;
+      }
+      // The two number sequences have the same length (J-I), just memcmp them.
+      if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
+        return Res < 0 ? -1 : 1;
+      // Identical number sequences, continue search after the numbers.
+      I = J - 1;
+      continue;
+    }
+    if (Data[I] != RHS.Data[I])
+      return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
+  }
+  if (Length == RHS.Length)
+    return 0;
+  return Length < RHS.Length ? -1 : 1;
+}
+
+// Compute the edit distance between the two given strings.
+unsigned StringRef::edit_distance(llvm::StringRef Other,
+                                  bool AllowReplacements,
+                                  unsigned MaxEditDistance) {
+  return llvm::ComputeEditDistance(
+      llvm::ArrayRef<char>(data(), size()),
+      llvm::ArrayRef<char>(Other.data(), Other.size()),
+      AllowReplacements, MaxEditDistance);
+}
+
+//===----------------------------------------------------------------------===//
+// String Operations
+//===----------------------------------------------------------------------===//
+
+std::string StringRef::lower() const {
+  std::string Result(size(), char());
+  for (size_type i = 0, e = size(); i != e; ++i) {
+    Result[i] = ascii_tolower(Data[i]);
+  }
+  return Result;
+}
+
+std::string StringRef::upper() const {
+  std::string Result(size(), char());
+  for (size_type i = 0, e = size(); i != e; ++i) {
+    Result[i] = ascii_toupper(Data[i]);
+  }
+  return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// String Searching
+//===----------------------------------------------------------------------===//
+
+
+/// find - Search for the first string \arg Str in the string.
+///
+/// \return - The index of the first occurrence of \arg Str, or npos if not
+/// found.
+size_t StringRef::find(StringRef Str, size_t From) const {
+  size_t N = Str.size();
+  if (N > Length)
+    return npos;
+
+  // For short haystacks or unsupported needles fall back to the naive algorithm
+  if (Length < 16 || N > 255 || N == 0) {
+    for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
+      if (substr(i, N).equals(Str))
+        return i;
+    return npos;
+  }
+
+  if (From >= Length)
+    return npos;
+
+  // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
+  uint8_t BadCharSkip[256];
+  std::memset(BadCharSkip, N, 256);
+  for (unsigned i = 0; i != N-1; ++i)
+    BadCharSkip[(uint8_t)Str[i]] = N-1-i;
+
+  unsigned Len = Length-From, Pos = From;
+  while (Len >= N) {
+    if (substr(Pos, N).equals(Str)) // See if this is the correct substring.
+      return Pos;
+
+    // Otherwise skip the appropriate number of bytes.
+    uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]];
+    Len -= Skip;
+    Pos += Skip;
+  }
+
+  return npos;
+}
+
+/// rfind - Search for the last string \arg Str in the string.
+///
+/// \return - The index of the last occurrence of \arg Str, or npos if not
+/// found.
+size_t StringRef::rfind(StringRef Str) const {
+  size_t N = Str.size();
+  if (N > Length)
+    return npos;
+  for (size_t i = Length - N + 1, e = 0; i != e;) {
+    --i;
+    if (substr(i, N).equals(Str))
+      return i;
+  }
+  return npos;
+}
+
+/// find_first_of - Find the first character in the string that is in \arg
+/// Chars, or npos if not found.
+///
+/// Note: O(size() + Chars.size())
+StringRef::size_type StringRef::find_first_of(StringRef Chars,
+                                              size_t From) const {
+  std::bitset<1 << CHAR_BIT> CharBits;
+  for (size_type i = 0; i != Chars.size(); ++i)
+    CharBits.set((unsigned char)Chars[i]);
+
+  for (size_type i = min(From, Length), e = Length; i != e; ++i)
+    if (CharBits.test((unsigned char)Data[i]))
+      return i;
+  return npos;
+}
+
+/// find_first_not_of - Find the first character in the string that is not
+/// \arg C or npos if not found.
+StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
+  for (size_type i = min(From, Length), e = Length; i != e; ++i)
+    if (Data[i] != C)
+      return i;
+  return npos;
+}
+
+/// find_first_not_of - Find the first character in the string that is not
+/// in the string \arg Chars, or npos if not found.
+///
+/// Note: O(size() + Chars.size())
+StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
+                                                  size_t From) const {
+  std::bitset<1 << CHAR_BIT> CharBits;
+  for (size_type i = 0; i != Chars.size(); ++i)
+    CharBits.set((unsigned char)Chars[i]);
+
+  for (size_type i = min(From, Length), e = Length; i != e; ++i)
+    if (!CharBits.test((unsigned char)Data[i]))
+      return i;
+  return npos;
+}
+
+/// find_last_of - Find the last character in the string that is in \arg C,
+/// or npos if not found.
+///
+/// Note: O(size() + Chars.size())
+StringRef::size_type StringRef::find_last_of(StringRef Chars,
+                                             size_t From) const {
+  std::bitset<1 << CHAR_BIT> CharBits;
+  for (size_type i = 0; i != Chars.size(); ++i)
+    CharBits.set((unsigned char)Chars[i]);
+
+  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
+    if (CharBits.test((unsigned char)Data[i]))
+      return i;
+  return npos;
+}
+
+/// find_last_not_of - Find the last character in the string that is not
+/// \arg C, or npos if not found.
+StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
+  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
+    if (Data[i] != C)
+      return i;
+  return npos;
+}
+
+/// find_last_not_of - Find the last character in the string that is not in
+/// \arg Chars, or npos if not found.
+///
+/// Note: O(size() + Chars.size())
+StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
+                                                 size_t From) const {
+  std::bitset<1 << CHAR_BIT> CharBits;
+  for (size_type i = 0, e = Chars.size(); i != e; ++i)
+    CharBits.set((unsigned char)Chars[i]);
+
+  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
+    if (!CharBits.test((unsigned char)Data[i]))
+      return i;
+  return npos;
+}
+
+void StringRef::split(SmallVectorImpl<StringRef> &A,
+                      StringRef Separators, int MaxSplit,
+                      bool KeepEmpty) const {
+  StringRef rest = *this;
+
+  // rest.data() is used to distinguish cases like "a," that splits into
+  // "a" + "" and "a" that splits into "a" + 0.
+  for (int splits = 0;
+       rest.data() != NULL && (MaxSplit < 0 || splits < MaxSplit);
+       ++splits) {
+    std::pair<StringRef, StringRef> p = rest.split(Separators);
+
+    if (KeepEmpty || p.first.size() != 0)
+      A.push_back(p.first);
+    rest = p.second;
+  }
+  // If we have a tail left, add it.
+  if (rest.data() != NULL && (rest.size() != 0 || KeepEmpty))
+    A.push_back(rest);
+}
+
+//===----------------------------------------------------------------------===//
+// Helpful Algorithms
+//===----------------------------------------------------------------------===//
+
+/// count - Return the number of non-overlapped occurrences of \arg Str in
+/// the string.
+size_t StringRef::count(StringRef Str) const {
+  size_t Count = 0;
+  size_t N = Str.size();
+  if (N > Length)
+    return 0;
+  for (size_t i = 0, e = Length - N + 1; i != e; ++i)
+    if (substr(i, N).equals(Str))
+      ++Count;
+  return Count;
+}
+
+static unsigned GetAutoSenseRadix(StringRef &Str) {
+  if (Str.startswith("0x")) {
+    Str = Str.substr(2);
+    return 16;
+  }
+  
+  if (Str.startswith("0b")) {
+    Str = Str.substr(2);
+    return 2;
+  }
+
+  if (Str.startswith("0o")) {
+    Str = Str.substr(2);
+    return 8;
+  }
+
+  if (Str.startswith("0"))
+    return 8;
+  
+  return 10;
+}
+
+
+/// GetAsUnsignedInteger - Workhorse method that converts a integer character
+/// sequence of radix up to 36 to an unsigned long long value.
+bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
+                                unsigned long long &Result) {
+  // Autosense radix if not specified.
+  if (Radix == 0)
+    Radix = GetAutoSenseRadix(Str);
+
+  // Empty strings (after the radix autosense) are invalid.
+  if (Str.empty()) return true;
+
+  // Parse all the bytes of the string given this radix.  Watch for overflow.
+  Result = 0;
+  while (!Str.empty()) {
+    unsigned CharVal;
+    if (Str[0] >= '0' && Str[0] <= '9')
+      CharVal = Str[0]-'0';
+    else if (Str[0] >= 'a' && Str[0] <= 'z')
+      CharVal = Str[0]-'a'+10;
+    else if (Str[0] >= 'A' && Str[0] <= 'Z')
+      CharVal = Str[0]-'A'+10;
+    else
+      return true;
+
+    // If the parsed value is larger than the integer radix, the string is
+    // invalid.
+    if (CharVal >= Radix)
+      return true;
+
+    // Add in this character.
+    unsigned long long PrevResult = Result;
+    Result = Result*Radix+CharVal;
+
+    // Check for overflow by shifting back and seeing if bits were lost.
+    if (Result/Radix < PrevResult)
+      return true;
+
+    Str = Str.substr(1);
+  }
+
+  return false;
+}
+
+bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
+                              long long &Result) {
+  unsigned long long ULLVal;
+
+  // Handle positive strings first.
+  if (Str.empty() || Str.front() != '-') {
+    if (getAsUnsignedInteger(Str, Radix, ULLVal) ||
+        // Check for value so large it overflows a signed value.
+        (long long)ULLVal < 0)
+      return true;
+    Result = ULLVal;
+    return false;
+  }
+
+  // Get the positive part of the value.
+  if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) ||
+      // Reject values so large they'd overflow as negative signed, but allow
+      // "-0".  This negates the unsigned so that the negative isn't undefined
+      // on signed overflow.
+      (long long)-ULLVal > 0)
+    return true;
+
+  Result = -ULLVal;
+  return false;
+}
+
+bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
+  StringRef Str = *this;
+
+  // Autosense radix if not specified.
+  if (Radix == 0)
+    Radix = GetAutoSenseRadix(Str);
+
+  assert(Radix > 1 && Radix <= 36);
+
+  // Empty strings (after the radix autosense) are invalid.
+  if (Str.empty()) return true;
+
+  // Skip leading zeroes.  This can be a significant improvement if
+  // it means we don't need > 64 bits.
+  while (!Str.empty() && Str.front() == '0')
+    Str = Str.substr(1);
+
+  // If it was nothing but zeroes....
+  if (Str.empty()) {
+    Result = APInt(64, 0);
+    return false;
+  }
+
+  // (Over-)estimate the required number of bits.
+  unsigned Log2Radix = 0;
+  while ((1U << Log2Radix) < Radix) Log2Radix++;
+  bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
+
+  unsigned BitWidth = Log2Radix * Str.size();
+  if (BitWidth < Result.getBitWidth())
+    BitWidth = Result.getBitWidth(); // don't shrink the result
+  else if (BitWidth > Result.getBitWidth())
+    Result = Result.zext(BitWidth);
+
+  APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
+  if (!IsPowerOf2Radix) {
+    // These must have the same bit-width as Result.
+    RadixAP = APInt(BitWidth, Radix);
+    CharAP = APInt(BitWidth, 0);
+  }
+
+  // Parse all the bytes of the string given this radix.
+  Result = 0;
+  while (!Str.empty()) {
+    unsigned CharVal;
+    if (Str[0] >= '0' && Str[0] <= '9')
+      CharVal = Str[0]-'0';
+    else if (Str[0] >= 'a' && Str[0] <= 'z')
+      CharVal = Str[0]-'a'+10;
+    else if (Str[0] >= 'A' && Str[0] <= 'Z')
+      CharVal = Str[0]-'A'+10;
+    else
+      return true;
+
+    // If the parsed value is larger than the integer radix, the string is
+    // invalid.
+    if (CharVal >= Radix)
+      return true;
+
+    // Add in this character.
+    if (IsPowerOf2Radix) {
+      Result <<= Log2Radix;
+      Result |= CharVal;
+    } else {
+      Result *= RadixAP;
+      CharAP = CharVal;
+      Result += CharAP;
+    }
+
+    Str = Str.substr(1);
+  }
+
+  return false;
+}
+
+
+// Implementation of StringRef hashing.
+hash_code llvm::hash_value(StringRef S) {
+  return hash_combine_range(S.begin(), S.end());
+}
diff --git a/contrib/llvm/lib/Support/SystemUtils.cpp b/contrib/llvm/lib/Support/SystemUtils.cpp
new file mode 100644
index 000000000000..54b5e97bfe18
--- /dev/null
+++ b/contrib/llvm/lib/Support/SystemUtils.cpp
@@ -0,0 +1,55 @@
+//===- SystemUtils.cpp - Utilities for low-level system tasks -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains functions used to do a variety of low-level, often
+// system-specific, tasks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SystemUtils.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+bool llvm::CheckBitcodeOutputToConsole(raw_ostream &stream_to_check,
+                                       bool print_warning) {
+  if (stream_to_check.is_displayed()) {
+    if (print_warning) {
+      errs() << "WARNING: You're attempting to print out a bitcode file.\n"
+                "This is inadvisable as it may cause display problems. If\n"
+                "you REALLY want to taste LLVM bitcode first-hand, you\n"
+                "can force output with the `-f' option.\n\n";
+    }
+    return true;
+  }
+  return false;
+}
+
+/// PrependMainExecutablePath - Prepend the path to the program being executed
+/// to \p ExeName, given the value of argv[0] and the address of main()
+/// itself. This allows us to find another LLVM tool if it is built in the same
+/// directory. An empty string is returned on error; note that this function
+/// just mainpulates the path and doesn't check for executability.
+/// @brief Find a named executable.
+sys::Path llvm::PrependMainExecutablePath(const std::string &ExeName,
+                                          const char *Argv0, void *MainAddr) {
+  // Check the directory that the calling program is in.  We can do
+  // this if ProgramPath contains at least one / character, indicating that it
+  // is a relative path to the executable itself.
+  sys::Path Result = sys::Path::GetMainExecutable(Argv0, MainAddr);
+  Result.eraseComponent();
+
+  if (!Result.isEmpty()) {
+    Result.appendComponent(ExeName);
+    Result.appendSuffix(sys::Path::GetEXESuffix());
+  }
+
+  return Result;
+}
diff --git a/contrib/llvm/lib/Support/TargetRegistry.cpp b/contrib/llvm/lib/Support/TargetRegistry.cpp
new file mode 100644
index 000000000000..9c81327615c6
--- /dev/null
+++ b/contrib/llvm/lib/Support/TargetRegistry.cpp
@@ -0,0 +1,163 @@
+//===--- TargetRegistry.cpp - Target registration -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <vector>
+using namespace llvm;
+
+// Clients are responsible for avoid race conditions in registration.
+static Target *FirstTarget = 0;
+
+TargetRegistry::iterator TargetRegistry::begin() {
+  return iterator(FirstTarget);
+}
+
+const Target *TargetRegistry::lookupTarget(const std::string &ArchName,
+                                           Triple &TheTriple,
+                                           std::string &Error) {
+  // Allocate target machine.  First, check whether the user has explicitly
+  // specified an architecture to compile for. If so we have to look it up by
+  // name, because it might be a backend that has no mapping to a target triple.
+  const Target *TheTarget = 0;
+  if (!ArchName.empty()) {
+    for (TargetRegistry::iterator it = TargetRegistry::begin(),
+           ie = TargetRegistry::end(); it != ie; ++it) {
+      if (ArchName == it->getName()) {
+        TheTarget = &*it;
+        break;
+      }
+    }
+
+    if (!TheTarget) {
+      Error = "error: invalid target '" + ArchName + "'.\n";
+      return 0;
+    }
+
+    // Adjust the triple to match (if known), otherwise stick with the
+    // given triple.
+    Triple::ArchType Type = Triple::getArchTypeForLLVMName(ArchName);
+    if (Type != Triple::UnknownArch)
+      TheTriple.setArch(Type);
+  } else {
+    // Get the target specific parser.
+    std::string TempError;
+    TheTarget = TargetRegistry::lookupTarget(TheTriple.getTriple(), TempError);
+    if (TheTarget == 0) {
+      Error = ": error: unable to get target for '"
+            + TheTriple.getTriple()
+            + "', see --version and --triple.\n";
+      return 0;
+    }
+  }
+
+  return TheTarget;
+}
+
+const Target *TargetRegistry::lookupTarget(const std::string &TT,
+                                           std::string &Error) {
+  // Provide special warning when no targets are initialized.
+  if (begin() == end()) {
+    Error = "Unable to find target for this triple (no targets are registered)";
+    return 0;
+  }
+  const Target *Best = 0, *EquallyBest = 0;
+  unsigned BestQuality = 0;
+  for (iterator it = begin(), ie = end(); it != ie; ++it) {
+    if (unsigned Qual = it->TripleMatchQualityFn(TT)) {
+      if (!Best || Qual > BestQuality) {
+        Best = &*it;
+        EquallyBest = 0;
+        BestQuality = Qual;
+      } else if (Qual == BestQuality)
+        EquallyBest = &*it;
+    }
+  }
+
+  if (!Best) {
+    Error = "No available targets are compatible with this triple, "
+      "see -version for the available targets.";
+    return 0;
+  }
+
+  // Otherwise, take the best target, but make sure we don't have two equally
+  // good best targets.
+  if (EquallyBest) {
+    Error = std::string("Cannot choose between targets \"") +
+      Best->Name  + "\" and \"" + EquallyBest->Name + "\"";
+    return 0;
+  }
+
+  return Best;
+}
+
+void TargetRegistry::RegisterTarget(Target &T,
+                                    const char *Name,
+                                    const char *ShortDesc,
+                                    Target::TripleMatchQualityFnTy TQualityFn,
+                                    bool HasJIT) {
+  assert(Name && ShortDesc && TQualityFn &&
+         "Missing required target information!");
+
+  // Check if this target has already been initialized, we allow this as a
+  // convenience to some clients.
+  if (T.Name)
+    return;
+         
+  // Add to the list of targets.
+  T.Next = FirstTarget;
+  FirstTarget = &T;
+
+  T.Name = Name;
+  T.ShortDesc = ShortDesc;
+  T.TripleMatchQualityFn = TQualityFn;
+  T.HasJIT = HasJIT;
+}
+
+const Target *TargetRegistry::getClosestTargetForJIT(std::string &Error) {
+  const Target *TheTarget = lookupTarget(sys::getDefaultTargetTriple(), Error);
+
+  if (TheTarget && !TheTarget->hasJIT()) {
+    Error = "No JIT compatible target available for this host";
+    return 0;
+  }
+
+  return TheTarget;
+}
+
+static int TargetArraySortFn(const void *LHS, const void *RHS) {
+  typedef std::pair<StringRef, const Target*> pair_ty;
+  return ((const pair_ty*)LHS)->first.compare(((const pair_ty*)RHS)->first);
+}
+
+void TargetRegistry::printRegisteredTargetsForVersion() {
+  std::vector<std::pair<StringRef, const Target*> > Targets;
+  size_t Width = 0;
+  for (TargetRegistry::iterator I = TargetRegistry::begin(),
+       E = TargetRegistry::end();
+       I != E; ++I) {
+    Targets.push_back(std::make_pair(I->getName(), &*I));
+    Width = std::max(Width, Targets.back().first.size());
+  }
+  array_pod_sort(Targets.begin(), Targets.end(), TargetArraySortFn);
+
+  raw_ostream &OS = outs();
+  OS << "  Registered Targets:\n";
+  for (unsigned i = 0, e = Targets.size(); i != e; ++i) {
+    OS << "    " << Targets[i].first;
+    OS.indent(Width - Targets[i].first.size()) << " - "
+      << Targets[i].second->getShortDescription() << '\n';
+  }
+  if (Targets.empty())
+    OS << "    (none)\n";
+}
diff --git a/contrib/llvm/lib/Support/ThreadLocal.cpp b/contrib/llvm/lib/Support/ThreadLocal.cpp
new file mode 100644
index 000000000000..0587aaec7e68
--- /dev/null
+++ b/contrib/llvm/lib/Support/ThreadLocal.cpp
@@ -0,0 +1,92 @@
+//===- ThreadLocal.cpp - Thread Local Data ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the llvm::sys::ThreadLocal class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/Support/ThreadLocal.h"
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
+#if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0
+// Define all methods as no-ops if threading is explicitly disabled
+namespace llvm {
+using namespace sys;
+ThreadLocalImpl::ThreadLocalImpl() { }
+ThreadLocalImpl::~ThreadLocalImpl() { }
+void ThreadLocalImpl::setInstance(const void* d) {
+  typedef int SIZE_TOO_BIG[sizeof(d) <= sizeof(data) ? 1 : -1];
+  void **pd = reinterpret_cast<void**>(&data);
+  *pd = const_cast<void*>(d);
+}
+const void* ThreadLocalImpl::getInstance() {
+  void **pd = reinterpret_cast<void**>(&data);
+  return *pd;
+}
+void ThreadLocalImpl::removeInstance() {
+  setInstance(0);
+}
+}
+#else
+
+#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_GETSPECIFIC)
+
+#include <cassert>
+#include <pthread.h>
+#include <stdlib.h>
+
+namespace llvm {
+using namespace sys;
+
+ThreadLocalImpl::ThreadLocalImpl() : data() {
+  typedef int SIZE_TOO_BIG[sizeof(pthread_key_t) <= sizeof(data) ? 1 : -1];
+  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
+  int errorcode = pthread_key_create(key, NULL);
+  assert(errorcode == 0);
+  (void) errorcode;
+}
+
+ThreadLocalImpl::~ThreadLocalImpl() {
+  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
+  int errorcode = pthread_key_delete(*key);
+  assert(errorcode == 0);
+  (void) errorcode;
+}
+
+void ThreadLocalImpl::setInstance(const void* d) {
+  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
+  int errorcode = pthread_setspecific(*key, d);
+  assert(errorcode == 0);
+  (void) errorcode;
+}
+
+const void* ThreadLocalImpl::getInstance() {
+  pthread_key_t* key = reinterpret_cast<pthread_key_t*>(&data);
+  return pthread_getspecific(*key);
+}
+
+void ThreadLocalImpl::removeInstance() {
+  setInstance(0);
+}
+
+}
+
+#elif defined(LLVM_ON_UNIX)
+#include "Unix/ThreadLocal.inc"
+#elif defined( LLVM_ON_WIN32)
+#include "Windows/ThreadLocal.inc"
+#else
+#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 set in Support/ThreadLocal.cpp
+#endif
+#endif
diff --git a/contrib/llvm/lib/Support/Threading.cpp b/contrib/llvm/lib/Support/Threading.cpp
new file mode 100644
index 000000000000..13fba2ea2584
--- /dev/null
+++ b/contrib/llvm/lib/Support/Threading.cpp
@@ -0,0 +1,147 @@
+//===-- llvm/Support/Threading.cpp- Control multithreading mode --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements llvm_start_multithreaded() and friends.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Threading.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Atomic.h"
+#include "llvm/Support/Mutex.h"
+#include <cassert>
+
+using namespace llvm;
+
+static bool multithreaded_mode = false;
+
+static sys::Mutex* global_lock = 0;
+
+bool llvm::llvm_start_multithreaded() {
+#if LLVM_ENABLE_THREADS != 0
+  assert(!multithreaded_mode && "Already multithreaded!");
+  multithreaded_mode = true;
+  global_lock = new sys::Mutex(true);
+
+  // We fence here to ensure that all initialization is complete BEFORE we
+  // return from llvm_start_multithreaded().
+  sys::MemoryFence();
+  return true;
+#else
+  return false;
+#endif
+}
+
+void llvm::llvm_stop_multithreaded() {
+#if LLVM_ENABLE_THREADS != 0
+  assert(multithreaded_mode && "Not currently multithreaded!");
+
+  // We fence here to insure that all threaded operations are complete BEFORE we
+  // return from llvm_stop_multithreaded().
+  sys::MemoryFence();
+
+  multithreaded_mode = false;
+  delete global_lock;
+#endif
+}
+
+bool llvm::llvm_is_multithreaded() {
+  return multithreaded_mode;
+}
+
+void llvm::llvm_acquire_global_lock() {
+  if (multithreaded_mode) global_lock->acquire();
+}
+
+void llvm::llvm_release_global_lock() {
+  if (multithreaded_mode) global_lock->release();
+}
+
+#if LLVM_ENABLE_THREADS != 0 && defined(HAVE_PTHREAD_H)
+#include <pthread.h>
+
+struct ThreadInfo {
+  void (*UserFn)(void *);
+  void *UserData;
+};
+static void *ExecuteOnThread_Dispatch(void *Arg) {
+  ThreadInfo *TI = reinterpret_cast<ThreadInfo*>(Arg);
+  TI->UserFn(TI->UserData);
+  return 0;
+}
+
+void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
+                                  unsigned RequestedStackSize) {
+  ThreadInfo Info = { Fn, UserData };
+  pthread_attr_t Attr;
+  pthread_t Thread;
+
+  // Construct the attributes object.
+  if (::pthread_attr_init(&Attr) != 0)
+    return;
+
+  // Set the requested stack size, if given.
+  if (RequestedStackSize != 0) {
+    if (::pthread_attr_setstacksize(&Attr, RequestedStackSize) != 0)
+      goto error;
+  }
+
+  // Construct and execute the thread.
+  if (::pthread_create(&Thread, &Attr, ExecuteOnThread_Dispatch, &Info) != 0)
+    goto error;
+
+  // Wait for the thread and clean up.
+  ::pthread_join(Thread, 0);
+
+ error:
+  ::pthread_attr_destroy(&Attr);
+}
+#elif LLVM_ENABLE_THREADS!=0 && defined(LLVM_ON_WIN32)
+#include "Windows/Windows.h"
+#include <process.h>
+
+struct ThreadInfo {
+  void (*func)(void*);
+  void *param;
+};
+
+static unsigned __stdcall ThreadCallback(void *param) {
+  struct ThreadInfo *info = reinterpret_cast<struct ThreadInfo *>(param);
+  info->func(info->param);
+
+  return 0;
+}
+
+void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
+                                  unsigned RequestedStackSize) {
+  struct ThreadInfo param = { Fn, UserData };
+
+  HANDLE hThread = (HANDLE)::_beginthreadex(NULL,
+                                            RequestedStackSize, ThreadCallback,
+                                            &param, 0, NULL);
+
+  if (hThread) {
+    // We actually don't care whether the wait succeeds or fails, in
+    // the same way we don't care whether the pthread_join call succeeds
+    // or fails.  There's not much we could do if this were to fail. But
+    // on success, this call will wait until the thread finishes executing
+    // before returning.
+    (void)::WaitForSingleObject(hThread, INFINITE);
+    ::CloseHandle(hThread);
+  }
+}
+#else
+// Support for non-Win32, non-pthread implementation.
+void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
+                                  unsigned RequestedStackSize) {
+  (void) RequestedStackSize;
+  Fn(UserData);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Support/TimeValue.cpp b/contrib/llvm/lib/Support/TimeValue.cpp
new file mode 100644
index 000000000000..bd8af174bcd0
--- /dev/null
+++ b/contrib/llvm/lib/Support/TimeValue.cpp
@@ -0,0 +1,62 @@
+//===-- TimeValue.cpp - Implement OS TimeValue Concept ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements the operating system TimeValue concept.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/TimeValue.h"
+#include "llvm/Config/config.h"
+
+namespace llvm {
+using namespace sys;
+
+const TimeValue::SecondsType
+  TimeValue::PosixZeroTimeSeconds = -946684800;
+const TimeValue::SecondsType
+  TimeValue::Win32ZeroTimeSeconds = -12591158400ULL;
+
+const TimeValue TimeValue::MinTime       = TimeValue ( INT64_MIN,0 );
+const TimeValue TimeValue::MaxTime       = TimeValue ( INT64_MAX,0 );
+const TimeValue TimeValue::ZeroTime      = TimeValue ( 0,0 );
+const TimeValue TimeValue::PosixZeroTime = TimeValue ( PosixZeroTimeSeconds,0 );
+const TimeValue TimeValue::Win32ZeroTime = TimeValue ( Win32ZeroTimeSeconds,0 );
+
+void
+TimeValue::normalize( void ) {
+  if ( nanos_ >= NANOSECONDS_PER_SECOND ) {
+    do {
+      seconds_++;
+      nanos_ -= NANOSECONDS_PER_SECOND;
+    } while ( nanos_ >= NANOSECONDS_PER_SECOND );
+  } else if (nanos_ <= -NANOSECONDS_PER_SECOND ) {
+    do {
+      seconds_--;
+      nanos_ += NANOSECONDS_PER_SECOND;
+    } while (nanos_ <= -NANOSECONDS_PER_SECOND);
+  }
+
+  if (seconds_ >= 1 && nanos_ < 0) {
+    seconds_--;
+    nanos_ += NANOSECONDS_PER_SECOND;
+  } else if (seconds_ < 0 && nanos_ > 0) {
+    seconds_++;
+    nanos_ -= NANOSECONDS_PER_SECOND;
+  }
+}
+
+}
+
+/// Include the platform specific portion of TimeValue class
+#ifdef LLVM_ON_UNIX
+#include "Unix/TimeValue.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/TimeValue.inc"
+#endif
diff --git a/contrib/llvm/lib/Support/Timer.cpp b/contrib/llvm/lib/Support/Timer.cpp
new file mode 100644
index 000000000000..896d869aa1e7
--- /dev/null
+++ b/contrib/llvm/lib/Support/Timer.cpp
@@ -0,0 +1,389 @@
+//===-- Timer.cpp - Interval Timing Support -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interval Timing implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Timer.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// CreateInfoOutputFile - Return a file stream to print our output on.
+namespace llvm { extern raw_ostream *CreateInfoOutputFile(); }
+
+// getLibSupportInfoOutputFilename - This ugly hack is brought to you courtesy
+// of constructor/destructor ordering being unspecified by C++.  Basically the
+// problem is that a Statistic object gets destroyed, which ends up calling
+// 'GetLibSupportInfoOutputFile()' (below), which calls this function.
+// LibSupportInfoOutputFilename used to be a global variable, but sometimes it
+// would get destroyed before the Statistic, causing havoc to ensue.  We "fix"
+// this by creating the string the first time it is needed and never destroying
+// it.
+static ManagedStatic<std::string> LibSupportInfoOutputFilename;
+static std::string &getLibSupportInfoOutputFilename() {
+  return *LibSupportInfoOutputFilename;
+}
+
+static ManagedStatic<sys::SmartMutex<true> > TimerLock;
+
+namespace {
+  static cl::opt<bool>
+  TrackSpace("track-memory", cl::desc("Enable -time-passes memory "
+                                      "tracking (this may be slow)"),
+             cl::Hidden);
+
+  static cl::opt<std::string, true>
+  InfoOutputFilename("info-output-file", cl::value_desc("filename"),
+                     cl::desc("File to append -stats and -timer output to"),
+                   cl::Hidden, cl::location(getLibSupportInfoOutputFilename()));
+}
+
+// CreateInfoOutputFile - Return a file stream to print our output on.
+raw_ostream *llvm::CreateInfoOutputFile() {
+  const std::string &OutputFilename = getLibSupportInfoOutputFilename();
+  if (OutputFilename.empty())
+    return new raw_fd_ostream(2, false); // stderr.
+  if (OutputFilename == "-")
+    return new raw_fd_ostream(1, false); // stdout.
+  
+  // Append mode is used because the info output file is opened and closed
+  // each time -stats or -time-passes wants to print output to it. To
+  // compensate for this, the test-suite Makefiles have code to delete the
+  // info output file before running commands which write to it.
+  std::string Error;
+  raw_ostream *Result = new raw_fd_ostream(OutputFilename.c_str(),
+                                           Error, raw_fd_ostream::F_Append);
+  if (Error.empty())
+    return Result;
+  
+  errs() << "Error opening info-output-file '"
+    << OutputFilename << " for appending!\n";
+  delete Result;
+  return new raw_fd_ostream(2, false); // stderr.
+}
+
+
+static TimerGroup *DefaultTimerGroup = 0;
+static TimerGroup *getDefaultTimerGroup() {
+  TimerGroup *tmp = DefaultTimerGroup;
+  sys::MemoryFence();
+  if (tmp) return tmp;
+  
+  llvm_acquire_global_lock();
+  tmp = DefaultTimerGroup;
+  if (!tmp) {
+    tmp = new TimerGroup("Miscellaneous Ungrouped Timers");
+    sys::MemoryFence();
+    DefaultTimerGroup = tmp;
+  }
+  llvm_release_global_lock();
+
+  return tmp;
+}
+
+//===----------------------------------------------------------------------===//
+// Timer Implementation
+//===----------------------------------------------------------------------===//
+
+void Timer::init(StringRef N) {
+  assert(TG == 0 && "Timer already initialized");
+  Name.assign(N.begin(), N.end());
+  Started = false;
+  TG = getDefaultTimerGroup();
+  TG->addTimer(*this);
+}
+
+void Timer::init(StringRef N, TimerGroup &tg) {
+  assert(TG == 0 && "Timer already initialized");
+  Name.assign(N.begin(), N.end());
+  Started = false;
+  TG = &tg;
+  TG->addTimer(*this);
+}
+
+Timer::~Timer() {
+  if (!TG) return;  // Never initialized, or already cleared.
+  TG->removeTimer(*this);
+}
+
+static inline size_t getMemUsage() {
+  if (!TrackSpace) return 0;
+  return sys::Process::GetMallocUsage();
+}
+
+TimeRecord TimeRecord::getCurrentTime(bool Start) {
+  TimeRecord Result;
+  sys::TimeValue now(0,0), user(0,0), sys(0,0);
+  
+  if (Start) {
+    Result.MemUsed = getMemUsage();
+    sys::Process::GetTimeUsage(now, user, sys);
+  } else {
+    sys::Process::GetTimeUsage(now, user, sys);
+    Result.MemUsed = getMemUsage();
+  }
+
+  Result.WallTime   =  now.seconds() +  now.microseconds() / 1000000.0;
+  Result.UserTime   = user.seconds() + user.microseconds() / 1000000.0;
+  Result.SystemTime =  sys.seconds() +  sys.microseconds() / 1000000.0;
+  return Result;
+}
+
+static ManagedStatic<std::vector<Timer*> > ActiveTimers;
+
+void Timer::startTimer() {
+  Started = true;
+  ActiveTimers->push_back(this);
+  Time -= TimeRecord::getCurrentTime(true);
+}
+
+void Timer::stopTimer() {
+  Time += TimeRecord::getCurrentTime(false);
+
+  if (ActiveTimers->back() == this) {
+    ActiveTimers->pop_back();
+  } else {
+    std::vector<Timer*>::iterator I =
+      std::find(ActiveTimers->begin(), ActiveTimers->end(), this);
+    assert(I != ActiveTimers->end() && "stop but no startTimer?");
+    ActiveTimers->erase(I);
+  }
+}
+
+static void printVal(double Val, double Total, raw_ostream &OS) {
+  if (Total < 1e-7)   // Avoid dividing by zero.
+    OS << "        -----     ";
+  else
+    OS << format("  %7.4f (%5.1f%%)", Val, Val*100/Total);
+}
+
+void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const {
+  if (Total.getUserTime())
+    printVal(getUserTime(), Total.getUserTime(), OS);
+  if (Total.getSystemTime())
+    printVal(getSystemTime(), Total.getSystemTime(), OS);
+  if (Total.getProcessTime())
+    printVal(getProcessTime(), Total.getProcessTime(), OS);
+  printVal(getWallTime(), Total.getWallTime(), OS);
+  
+  OS << "  ";
+  
+  if (Total.getMemUsed())
+    OS << format("%9" PRId64 "  ", (int64_t)getMemUsed());
+}
+
+
+//===----------------------------------------------------------------------===//
+//   NamedRegionTimer Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+typedef StringMap<Timer> Name2TimerMap;
+
+class Name2PairMap {
+  StringMap<std::pair<TimerGroup*, Name2TimerMap> > Map;
+public:
+  ~Name2PairMap() {
+    for (StringMap<std::pair<TimerGroup*, Name2TimerMap> >::iterator
+         I = Map.begin(), E = Map.end(); I != E; ++I)
+      delete I->second.first;
+  }
+  
+  Timer &get(StringRef Name, StringRef GroupName) {
+    sys::SmartScopedLock<true> L(*TimerLock);
+    
+    std::pair<TimerGroup*, Name2TimerMap> &GroupEntry = Map[GroupName];
+    
+    if (!GroupEntry.first)
+      GroupEntry.first = new TimerGroup(GroupName);
+    
+    Timer &T = GroupEntry.second[Name];
+    if (!T.isInitialized())
+      T.init(Name, *GroupEntry.first);
+    return T;
+  }
+};
+
+}
+
+static ManagedStatic<Name2TimerMap> NamedTimers;
+static ManagedStatic<Name2PairMap> NamedGroupedTimers;
+
+static Timer &getNamedRegionTimer(StringRef Name) {
+  sys::SmartScopedLock<true> L(*TimerLock);
+  
+  Timer &T = (*NamedTimers)[Name];
+  if (!T.isInitialized())
+    T.init(Name);
+  return T;
+}
+
+NamedRegionTimer::NamedRegionTimer(StringRef Name,
+                                   bool Enabled)
+  : TimeRegion(!Enabled ? 0 : &getNamedRegionTimer(Name)) {}
+
+NamedRegionTimer::NamedRegionTimer(StringRef Name, StringRef GroupName,
+                                   bool Enabled)
+  : TimeRegion(!Enabled ? 0 : &NamedGroupedTimers->get(Name, GroupName)) {}
+
+//===----------------------------------------------------------------------===//
+//   TimerGroup Implementation
+//===----------------------------------------------------------------------===//
+
+/// TimerGroupList - This is the global list of TimerGroups, maintained by the
+/// TimerGroup ctor/dtor and is protected by the TimerLock lock.
+static TimerGroup *TimerGroupList = 0;
+
+TimerGroup::TimerGroup(StringRef name)
+  : Name(name.begin(), name.end()), FirstTimer(0) {
+    
+  // Add the group to TimerGroupList.
+  sys::SmartScopedLock<true> L(*TimerLock);
+  if (TimerGroupList)
+    TimerGroupList->Prev = &Next;
+  Next = TimerGroupList;
+  Prev = &TimerGroupList;
+  TimerGroupList = this;
+}
+
+TimerGroup::~TimerGroup() {
+  // If the timer group is destroyed before the timers it owns, accumulate and
+  // print the timing data.
+  while (FirstTimer != 0)
+    removeTimer(*FirstTimer);
+  
+  // Remove the group from the TimerGroupList.
+  sys::SmartScopedLock<true> L(*TimerLock);
+  *Prev = Next;
+  if (Next)
+    Next->Prev = Prev;
+}
+
+
+void TimerGroup::removeTimer(Timer &T) {
+  sys::SmartScopedLock<true> L(*TimerLock);
+  
+  // If the timer was started, move its data to TimersToPrint.
+  if (T.Started)
+    TimersToPrint.push_back(std::make_pair(T.Time, T.Name));
+
+  T.TG = 0;
+  
+  // Unlink the timer from our list.
+  *T.Prev = T.Next;
+  if (T.Next)
+    T.Next->Prev = T.Prev;
+  
+  // Print the report when all timers in this group are destroyed if some of
+  // them were started.
+  if (FirstTimer != 0 || TimersToPrint.empty())
+    return;
+  
+  raw_ostream *OutStream = CreateInfoOutputFile();
+  PrintQueuedTimers(*OutStream);
+  delete OutStream;   // Close the file.
+}
+
+void TimerGroup::addTimer(Timer &T) {
+  sys::SmartScopedLock<true> L(*TimerLock);
+  
+  // Add the timer to our list.
+  if (FirstTimer)
+    FirstTimer->Prev = &T.Next;
+  T.Next = FirstTimer;
+  T.Prev = &FirstTimer;
+  FirstTimer = &T;
+}
+
+void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
+  // Sort the timers in descending order by amount of time taken.
+  std::sort(TimersToPrint.begin(), TimersToPrint.end());
+  
+  TimeRecord Total;
+  for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i)
+    Total += TimersToPrint[i].first;
+  
+  // Print out timing header.
+  OS << "===" << std::string(73, '-') << "===\n";
+  // Figure out how many spaces to indent TimerGroup name.
+  unsigned Padding = (80-Name.length())/2;
+  if (Padding > 80) Padding = 0;         // Don't allow "negative" numbers
+  OS.indent(Padding) << Name << '\n';
+  OS << "===" << std::string(73, '-') << "===\n";
+  
+  // If this is not an collection of ungrouped times, print the total time.
+  // Ungrouped timers don't really make sense to add up.  We still print the
+  // TOTAL line to make the percentages make sense.
+  if (this != DefaultTimerGroup)
+    OS << format("  Total Execution Time: %5.4f seconds (%5.4f wall clock)\n",
+                 Total.getProcessTime(), Total.getWallTime());
+  OS << '\n';
+  
+  if (Total.getUserTime())
+    OS << "   ---User Time---";
+  if (Total.getSystemTime())
+    OS << "   --System Time--";
+  if (Total.getProcessTime())
+    OS << "   --User+System--";
+  OS << "   ---Wall Time---";
+  if (Total.getMemUsed())
+    OS << "  ---Mem---";
+  OS << "  --- Name ---\n";
+  
+  // Loop through all of the timing data, printing it out.
+  for (unsigned i = 0, e = TimersToPrint.size(); i != e; ++i) {
+    const std::pair<TimeRecord, std::string> &Entry = TimersToPrint[e-i-1];
+    Entry.first.print(Total, OS);
+    OS << Entry.second << '\n';
+  }
+  
+  Total.print(Total, OS);
+  OS << "Total\n\n";
+  OS.flush();
+  
+  TimersToPrint.clear();
+}
+
+/// print - Print any started timers in this group and zero them.
+void TimerGroup::print(raw_ostream &OS) {
+  sys::SmartScopedLock<true> L(*TimerLock);
+
+  // See if any of our timers were started, if so add them to TimersToPrint and
+  // reset them.
+  for (Timer *T = FirstTimer; T; T = T->Next) {
+    if (!T->Started) continue;
+    TimersToPrint.push_back(std::make_pair(T->Time, T->Name));
+    
+    // Clear out the time.
+    T->Started = 0;
+    T->Time = TimeRecord();
+  }
+
+  // If any timers were started, print the group.
+  if (!TimersToPrint.empty())
+    PrintQueuedTimers(OS);
+}
+
+/// printAll - This static method prints all timers and clears them all out.
+void TimerGroup::printAll(raw_ostream &OS) {
+  sys::SmartScopedLock<true> L(*TimerLock);
+
+  for (TimerGroup *TG = TimerGroupList; TG; TG = TG->Next)
+    TG->print(OS);
+}
diff --git a/contrib/llvm/lib/Support/ToolOutputFile.cpp b/contrib/llvm/lib/Support/ToolOutputFile.cpp
new file mode 100644
index 000000000000..e7ca927ea537
--- /dev/null
+++ b/contrib/llvm/lib/Support/ToolOutputFile.cpp
@@ -0,0 +1,43 @@
+//===--- ToolOutputFile.cpp - Implement the tool_output_file class --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the tool_output_file class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/Signals.h"
+using namespace llvm;
+
+tool_output_file::CleanupInstaller::CleanupInstaller(const char *filename)
+  : Filename(filename), Keep(false) {
+  // Arrange for the file to be deleted if the process is killed.
+  if (Filename != "-")
+    sys::RemoveFileOnSignal(sys::Path(Filename));
+}
+
+tool_output_file::CleanupInstaller::~CleanupInstaller() {
+  // Delete the file if the client hasn't told us not to.
+  if (!Keep && Filename != "-")
+    sys::Path(Filename).eraseFromDisk();
+
+  // Ok, the file is successfully written and closed, or deleted. There's no
+  // further need to clean it up on signals.
+  if (Filename != "-")
+    sys::DontRemoveFileOnSignal(sys::Path(Filename));
+}
+
+tool_output_file::tool_output_file(const char *filename, std::string &ErrorInfo,
+                                   unsigned Flags)
+  : Installer(filename),
+    OS(filename, ErrorInfo, Flags) {
+  // If open fails, no cleanup is needed.
+  if (!ErrorInfo.empty())
+    Installer.Keep = true;
+}
diff --git a/contrib/llvm/lib/Support/Triple.cpp b/contrib/llvm/lib/Support/Triple.cpp
new file mode 100644
index 000000000000..d2508ac1ef3a
--- /dev/null
+++ b/contrib/llvm/lib/Support/Triple.cpp
@@ -0,0 +1,785 @@
+//===--- Triple.cpp - Target triple helper class --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstring>
+using namespace llvm;
+
+const char *Triple::getArchTypeName(ArchType Kind) {
+  switch (Kind) {
+  case UnknownArch: return "unknown";
+
+  case aarch64: return "aarch64";
+  case arm:     return "arm";
+  case hexagon: return "hexagon";
+  case mips:    return "mips";
+  case mipsel:  return "mipsel";
+  case mips64:  return "mips64";
+  case mips64el:return "mips64el";
+  case msp430:  return "msp430";
+  case ppc64:   return "powerpc64";
+  case ppc:     return "powerpc";
+  case r600:    return "r600";
+  case sparc:   return "sparc";
+  case sparcv9: return "sparcv9";
+  case tce:     return "tce";
+  case thumb:   return "thumb";
+  case x86:     return "i386";
+  case x86_64:  return "x86_64";
+  case xcore:   return "xcore";
+  case mblaze:  return "mblaze";
+  case nvptx:   return "nvptx";
+  case nvptx64: return "nvptx64";
+  case le32:    return "le32";
+  case amdil:   return "amdil";
+  case spir:    return "spir";
+  case spir64:  return "spir64";
+  }
+
+  llvm_unreachable("Invalid ArchType!");
+}
+
+const char *Triple::getArchTypePrefix(ArchType Kind) {
+  switch (Kind) {
+  default:
+    return 0;
+
+  case aarch64: return "aarch64";
+
+  case arm:
+  case thumb:   return "arm";
+
+  case ppc64:
+  case ppc:     return "ppc";
+
+  case mblaze:  return "mblaze";
+
+  case mips:
+  case mipsel:
+  case mips64:
+  case mips64el:return "mips";
+
+  case hexagon: return "hexagon";
+
+  case r600:    return "r600";
+
+  case sparcv9:
+  case sparc:   return "sparc";
+
+  case x86:
+  case x86_64:  return "x86";
+
+  case xcore:   return "xcore";
+
+  case nvptx:   return "nvptx";
+  case nvptx64: return "nvptx";
+  case le32:    return "le32";
+  case amdil:   return "amdil";
+  case spir:    return "spir";
+  case spir64:  return "spir";
+  }
+}
+
+const char *Triple::getVendorTypeName(VendorType Kind) {
+  switch (Kind) {
+  case UnknownVendor: return "unknown";
+
+  case Apple: return "apple";
+  case PC: return "pc";
+  case SCEI: return "scei";
+  case BGP: return "bgp";
+  case BGQ: return "bgq";
+  case Freescale: return "fsl";
+  case IBM: return "ibm";
+  }
+
+  llvm_unreachable("Invalid VendorType!");
+}
+
+const char *Triple::getOSTypeName(OSType Kind) {
+  switch (Kind) {
+  case UnknownOS: return "unknown";
+
+  case AuroraUX: return "auroraux";
+  case Cygwin: return "cygwin";
+  case Darwin: return "darwin";
+  case DragonFly: return "dragonfly";
+  case FreeBSD: return "freebsd";
+  case IOS: return "ios";
+  case KFreeBSD: return "kfreebsd";
+  case Linux: return "linux";
+  case Lv2: return "lv2";
+  case MacOSX: return "macosx";
+  case MinGW32: return "mingw32";
+  case NetBSD: return "netbsd";
+  case OpenBSD: return "openbsd";
+  case Solaris: return "solaris";
+  case Win32: return "win32";
+  case Haiku: return "haiku";
+  case Minix: return "minix";
+  case RTEMS: return "rtems";
+  case NaCl: return "nacl";
+  case CNK: return "cnk";
+  case Bitrig: return "bitrig";
+  case AIX: return "aix";
+  }
+
+  llvm_unreachable("Invalid OSType");
+}
+
+const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) {
+  switch (Kind) {
+  case UnknownEnvironment: return "unknown";
+  case GNU: return "gnu";
+  case GNUEABIHF: return "gnueabihf";
+  case GNUEABI: return "gnueabi";
+  case GNUX32: return "gnux32";
+  case EABI: return "eabi";
+  case MachO: return "macho";
+  case Android: return "android";
+  case ELF: return "elf";
+  }
+
+  llvm_unreachable("Invalid EnvironmentType!");
+}
+
+Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
+  return StringSwitch<Triple::ArchType>(Name)
+    .Case("aarch64", aarch64)
+    .Case("arm", arm)
+    .Case("mips", mips)
+    .Case("mipsel", mipsel)
+    .Case("mips64", mips64)
+    .Case("mips64el", mips64el)
+    .Case("msp430", msp430)
+    .Case("ppc64", ppc64)
+    .Case("ppc32", ppc)
+    .Case("ppc", ppc)
+    .Case("mblaze", mblaze)
+    .Case("r600", r600)
+    .Case("hexagon", hexagon)
+    .Case("sparc", sparc)
+    .Case("sparcv9", sparcv9)
+    .Case("tce", tce)
+    .Case("thumb", thumb)
+    .Case("x86", x86)
+    .Case("x86-64", x86_64)
+    .Case("xcore", xcore)
+    .Case("nvptx", nvptx)
+    .Case("nvptx64", nvptx64)
+    .Case("le32", le32)
+    .Case("amdil", amdil)
+    .Case("spir", spir)
+    .Case("spir64", spir64)
+    .Default(UnknownArch);
+}
+
+// Returns architecture name that is understood by the target assembler.
+const char *Triple::getArchNameForAssembler() {
+  if (!isOSDarwin() && getVendor() != Triple::Apple)
+    return NULL;
+
+  return StringSwitch<const char*>(getArchName())
+    .Case("i386", "i386")
+    .Case("x86_64", "x86_64")
+    .Case("powerpc", "ppc")
+    .Case("powerpc64", "ppc64")
+    .Cases("mblaze", "microblaze", "mblaze")
+    .Case("arm", "arm")
+    .Cases("armv4t", "thumbv4t", "armv4t")
+    .Cases("armv5", "armv5e", "thumbv5", "thumbv5e", "armv5")
+    .Cases("armv6", "thumbv6", "armv6")
+    .Cases("armv7", "thumbv7", "armv7")
+    .Case("r600", "r600")
+    .Case("nvptx", "nvptx")
+    .Case("nvptx64", "nvptx64")
+    .Case("le32", "le32")
+    .Case("amdil", "amdil")
+    .Case("spir", "spir")
+    .Case("spir64", "spir64")
+    .Default(NULL);
+}
+
+static Triple::ArchType parseArch(StringRef ArchName) {
+  return StringSwitch<Triple::ArchType>(ArchName)
+    .Cases("i386", "i486", "i586", "i686", Triple::x86)
+    // FIXME: Do we need to support these?
+    .Cases("i786", "i886", "i986", Triple::x86)
+    .Cases("amd64", "x86_64", Triple::x86_64)
+    .Case("powerpc", Triple::ppc)
+    .Cases("powerpc64", "ppu", Triple::ppc64)
+    .Case("mblaze", Triple::mblaze)
+    .Case("aarch64", Triple::aarch64)
+    .Cases("arm", "xscale", Triple::arm)
+    // FIXME: It would be good to replace these with explicit names for all the
+    // various suffixes supported.
+    .StartsWith("armv", Triple::arm)
+    .Case("thumb", Triple::thumb)
+    .StartsWith("thumbv", Triple::thumb)
+    .Case("msp430", Triple::msp430)
+    .Cases("mips", "mipseb", "mipsallegrex", Triple::mips)
+    .Cases("mipsel", "mipsallegrexel", Triple::mipsel)
+    .Cases("mips64", "mips64eb", Triple::mips64)
+    .Case("mips64el", Triple::mips64el)
+    .Case("r600", Triple::r600)
+    .Case("hexagon", Triple::hexagon)
+    .Case("sparc", Triple::sparc)
+    .Case("sparcv9", Triple::sparcv9)
+    .Case("tce", Triple::tce)
+    .Case("xcore", Triple::xcore)
+    .Case("nvptx", Triple::nvptx)
+    .Case("nvptx64", Triple::nvptx64)
+    .Case("le32", Triple::le32)
+    .Case("amdil", Triple::amdil)
+    .Case("spir", Triple::spir)
+    .Case("spir64", Triple::spir64)
+    .Default(Triple::UnknownArch);
+}
+
+static Triple::VendorType parseVendor(StringRef VendorName) {
+  return StringSwitch<Triple::VendorType>(VendorName)
+    .Case("apple", Triple::Apple)
+    .Case("pc", Triple::PC)
+    .Case("scei", Triple::SCEI)
+    .Case("bgp", Triple::BGP)
+    .Case("bgq", Triple::BGQ)
+    .Case("fsl", Triple::Freescale)
+    .Case("ibm", Triple::IBM)
+    .Default(Triple::UnknownVendor);
+}
+
+static Triple::OSType parseOS(StringRef OSName) {
+  return StringSwitch<Triple::OSType>(OSName)
+    .StartsWith("auroraux", Triple::AuroraUX)
+    .StartsWith("cygwin", Triple::Cygwin)
+    .StartsWith("darwin", Triple::Darwin)
+    .StartsWith("dragonfly", Triple::DragonFly)
+    .StartsWith("freebsd", Triple::FreeBSD)
+    .StartsWith("ios", Triple::IOS)
+    .StartsWith("kfreebsd", Triple::KFreeBSD)
+    .StartsWith("linux", Triple::Linux)
+    .StartsWith("lv2", Triple::Lv2)
+    .StartsWith("macosx", Triple::MacOSX)
+    .StartsWith("mingw32", Triple::MinGW32)
+    .StartsWith("netbsd", Triple::NetBSD)
+    .StartsWith("openbsd", Triple::OpenBSD)
+    .StartsWith("solaris", Triple::Solaris)
+    .StartsWith("win32", Triple::Win32)
+    .StartsWith("haiku", Triple::Haiku)
+    .StartsWith("minix", Triple::Minix)
+    .StartsWith("rtems", Triple::RTEMS)
+    .StartsWith("nacl", Triple::NaCl)
+    .StartsWith("cnk", Triple::CNK)
+    .StartsWith("bitrig", Triple::Bitrig)
+    .StartsWith("aix", Triple::AIX)
+    .Default(Triple::UnknownOS);
+}
+
+static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
+  return StringSwitch<Triple::EnvironmentType>(EnvironmentName)
+    .StartsWith("eabi", Triple::EABI)
+    .StartsWith("gnueabihf", Triple::GNUEABIHF)
+    .StartsWith("gnueabi", Triple::GNUEABI)
+    .StartsWith("gnux32", Triple::GNUX32)
+    .StartsWith("gnu", Triple::GNU)
+    .StartsWith("macho", Triple::MachO)
+    .StartsWith("android", Triple::Android)
+    .StartsWith("elf", Triple::ELF)
+    .Default(Triple::UnknownEnvironment);
+}
+
+/// \brief Construct a triple from the string representation provided.
+///
+/// This stores the string representation and parses the various pieces into
+/// enum members.
+Triple::Triple(const Twine &Str)
+    : Data(Str.str()),
+      Arch(parseArch(getArchName())),
+      Vendor(parseVendor(getVendorName())),
+      OS(parseOS(getOSName())),
+      Environment(parseEnvironment(getEnvironmentName())) {
+}
+
+/// \brief Construct a triple from string representations of the architecture,
+/// vendor, and OS.
+///
+/// This joins each argument into a canonical string representation and parses
+/// them into enum members. It leaves the environment unknown and omits it from
+/// the string representation.
+Triple::Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr)
+    : Data((ArchStr + Twine('-') + VendorStr + Twine('-') + OSStr).str()),
+      Arch(parseArch(ArchStr.str())),
+      Vendor(parseVendor(VendorStr.str())),
+      OS(parseOS(OSStr.str())),
+      Environment() {
+}
+
+/// \brief Construct a triple from string representations of the architecture,
+/// vendor, OS, and environment.
+///
+/// This joins each argument into a canonical string representation and parses
+/// them into enum members.
+Triple::Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr,
+               const Twine &EnvironmentStr)
+    : Data((ArchStr + Twine('-') + VendorStr + Twine('-') + OSStr + Twine('-') +
+            EnvironmentStr).str()),
+      Arch(parseArch(ArchStr.str())),
+      Vendor(parseVendor(VendorStr.str())),
+      OS(parseOS(OSStr.str())),
+      Environment(parseEnvironment(EnvironmentStr.str())) {
+}
+
+std::string Triple::normalize(StringRef Str) {
+  // Parse into components.
+  SmallVector<StringRef, 4> Components;
+  Str.split(Components, "-");
+
+  // If the first component corresponds to a known architecture, preferentially
+  // use it for the architecture.  If the second component corresponds to a
+  // known vendor, preferentially use it for the vendor, etc.  This avoids silly
+  // component movement when a component parses as (eg) both a valid arch and a
+  // valid os.
+  ArchType Arch = UnknownArch;
+  if (Components.size() > 0)
+    Arch = parseArch(Components[0]);
+  VendorType Vendor = UnknownVendor;
+  if (Components.size() > 1)
+    Vendor = parseVendor(Components[1]);
+  OSType OS = UnknownOS;
+  if (Components.size() > 2)
+    OS = parseOS(Components[2]);
+  EnvironmentType Environment = UnknownEnvironment;
+  if (Components.size() > 3)
+    Environment = parseEnvironment(Components[3]);
+
+  // Note which components are already in their final position.  These will not
+  // be moved.
+  bool Found[4];
+  Found[0] = Arch != UnknownArch;
+  Found[1] = Vendor != UnknownVendor;
+  Found[2] = OS != UnknownOS;
+  Found[3] = Environment != UnknownEnvironment;
+
+  // If they are not there already, permute the components into their canonical
+  // positions by seeing if they parse as a valid architecture, and if so moving
+  // the component to the architecture position etc.
+  for (unsigned Pos = 0; Pos != array_lengthof(Found); ++Pos) {
+    if (Found[Pos])
+      continue; // Already in the canonical position.
+
+    for (unsigned Idx = 0; Idx != Components.size(); ++Idx) {
+      // Do not reparse any components that already matched.
+      if (Idx < array_lengthof(Found) && Found[Idx])
+        continue;
+
+      // Does this component parse as valid for the target position?
+      bool Valid = false;
+      StringRef Comp = Components[Idx];
+      switch (Pos) {
+      default: llvm_unreachable("unexpected component type!");
+      case 0:
+        Arch = parseArch(Comp);
+        Valid = Arch != UnknownArch;
+        break;
+      case 1:
+        Vendor = parseVendor(Comp);
+        Valid = Vendor != UnknownVendor;
+        break;
+      case 2:
+        OS = parseOS(Comp);
+        Valid = OS != UnknownOS;
+        break;
+      case 3:
+        Environment = parseEnvironment(Comp);
+        Valid = Environment != UnknownEnvironment;
+        break;
+      }
+      if (!Valid)
+        continue; // Nope, try the next component.
+
+      // Move the component to the target position, pushing any non-fixed
+      // components that are in the way to the right.  This tends to give
+      // good results in the common cases of a forgotten vendor component
+      // or a wrongly positioned environment.
+      if (Pos < Idx) {
+        // Insert left, pushing the existing components to the right.  For
+        // example, a-b-i386 -> i386-a-b when moving i386 to the front.
+        StringRef CurrentComponent(""); // The empty component.
+        // Replace the component we are moving with an empty component.
+        std::swap(CurrentComponent, Components[Idx]);
+        // Insert the component being moved at Pos, displacing any existing
+        // components to the right.
+        for (unsigned i = Pos; !CurrentComponent.empty(); ++i) {
+          // Skip over any fixed components.
+          while (i < array_lengthof(Found) && Found[i])
+            ++i;
+          // Place the component at the new position, getting the component
+          // that was at this position - it will be moved right.
+          std::swap(CurrentComponent, Components[i]);
+        }
+      } else if (Pos > Idx) {
+        // Push right by inserting empty components until the component at Idx
+        // reaches the target position Pos.  For example, pc-a -> -pc-a when
+        // moving pc to the second position.
+        do {
+          // Insert one empty component at Idx.
+          StringRef CurrentComponent(""); // The empty component.
+          for (unsigned i = Idx; i < Components.size();) {
+            // Place the component at the new position, getting the component
+            // that was at this position - it will be moved right.
+            std::swap(CurrentComponent, Components[i]);
+            // If it was placed on top of an empty component then we are done.
+            if (CurrentComponent.empty())
+              break;
+            // Advance to the next component, skipping any fixed components.
+            while (++i < array_lengthof(Found) && Found[i])
+              ;
+          }
+          // The last component was pushed off the end - append it.
+          if (!CurrentComponent.empty())
+            Components.push_back(CurrentComponent);
+
+          // Advance Idx to the component's new position.
+          while (++Idx < array_lengthof(Found) && Found[Idx])
+            ;
+        } while (Idx < Pos); // Add more until the final position is reached.
+      }
+      assert(Pos < Components.size() && Components[Pos] == Comp &&
+             "Component moved wrong!");
+      Found[Pos] = true;
+      break;
+    }
+  }
+
+  // Special case logic goes here.  At this point Arch, Vendor and OS have the
+  // correct values for the computed components.
+
+  // Stick the corrected components back together to form the normalized string.
+  std::string Normalized;
+  for (unsigned i = 0, e = Components.size(); i != e; ++i) {
+    if (i) Normalized += '-';
+    Normalized += Components[i];
+  }
+  return Normalized;
+}
+
+StringRef Triple::getArchName() const {
+  return StringRef(Data).split('-').first;           // Isolate first component
+}
+
+StringRef Triple::getVendorName() const {
+  StringRef Tmp = StringRef(Data).split('-').second; // Strip first component
+  return Tmp.split('-').first;                       // Isolate second component
+}
+
+StringRef Triple::getOSName() const {
+  StringRef Tmp = StringRef(Data).split('-').second; // Strip first component
+  Tmp = Tmp.split('-').second;                       // Strip second component
+  return Tmp.split('-').first;                       // Isolate third component
+}
+
+StringRef Triple::getEnvironmentName() const {
+  StringRef Tmp = StringRef(Data).split('-').second; // Strip first component
+  Tmp = Tmp.split('-').second;                       // Strip second component
+  return Tmp.split('-').second;                      // Strip third component
+}
+
+StringRef Triple::getOSAndEnvironmentName() const {
+  StringRef Tmp = StringRef(Data).split('-').second; // Strip first component
+  return Tmp.split('-').second;                      // Strip second component
+}
+
+static unsigned EatNumber(StringRef &Str) {
+  assert(!Str.empty() && Str[0] >= '0' && Str[0] <= '9' && "Not a number");
+  unsigned Result = 0;
+
+  do {
+    // Consume the leading digit.
+    Result = Result*10 + (Str[0] - '0');
+
+    // Eat the digit.
+    Str = Str.substr(1);
+  } while (!Str.empty() && Str[0] >= '0' && Str[0] <= '9');
+
+  return Result;
+}
+
+void Triple::getOSVersion(unsigned &Major, unsigned &Minor,
+                          unsigned &Micro) const {
+  StringRef OSName = getOSName();
+
+  // Assume that the OS portion of the triple starts with the canonical name.
+  StringRef OSTypeName = getOSTypeName(getOS());
+  if (OSName.startswith(OSTypeName))
+    OSName = OSName.substr(OSTypeName.size());
+
+  // Any unset version defaults to 0.
+  Major = Minor = Micro = 0;
+
+  // Parse up to three components.
+  unsigned *Components[3] = { &Major, &Minor, &Micro };
+  for (unsigned i = 0; i != 3; ++i) {
+    if (OSName.empty() || OSName[0] < '0' || OSName[0] > '9')
+      break;
+
+    // Consume the leading number.
+    *Components[i] = EatNumber(OSName);
+
+    // Consume the separator, if present.
+    if (OSName.startswith("."))
+      OSName = OSName.substr(1);
+  }
+}
+
+bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor,
+                              unsigned &Micro) const {
+  getOSVersion(Major, Minor, Micro);
+
+  switch (getOS()) {
+  default: llvm_unreachable("unexpected OS for Darwin triple");
+  case Darwin:
+    // Default to darwin8, i.e., MacOSX 10.4.
+    if (Major == 0)
+      Major = 8;
+    // Darwin version numbers are skewed from OS X versions.
+    if (Major < 4)
+      return false;
+    Micro = 0;
+    Minor = Major - 4;
+    Major = 10;
+    break;
+  case MacOSX:
+    // Default to 10.4.
+    if (Major == 0) {
+      Major = 10;
+      Minor = 4;
+    }
+    if (Major != 10)
+      return false;
+    break;
+  case IOS:
+    // Ignore the version from the triple.  This is only handled because the
+    // the clang driver combines OS X and IOS support into a common Darwin
+    // toolchain that wants to know the OS X version number even when targeting
+    // IOS.
+    Major = 10;
+    Minor = 4;
+    Micro = 0;
+    break;
+  }
+  return true;
+}
+
+void Triple::getiOSVersion(unsigned &Major, unsigned &Minor,
+                           unsigned &Micro) const {
+  switch (getOS()) {
+  default: llvm_unreachable("unexpected OS for Darwin triple");
+  case Darwin:
+  case MacOSX:
+    // Ignore the version from the triple.  This is only handled because the
+    // the clang driver combines OS X and IOS support into a common Darwin
+    // toolchain that wants to know the iOS version number even when targeting
+    // OS X.
+    Major = 3;
+    Minor = 0;
+    Micro = 0;
+    break;
+  case IOS:
+    getOSVersion(Major, Minor, Micro);
+    // Default to 3.0.
+    if (Major == 0)
+      Major = 3;
+    break;
+  }
+}
+
+void Triple::setTriple(const Twine &Str) {
+  *this = Triple(Str);
+}
+
+void Triple::setArch(ArchType Kind) {
+  setArchName(getArchTypeName(Kind));
+}
+
+void Triple::setVendor(VendorType Kind) {
+  setVendorName(getVendorTypeName(Kind));
+}
+
+void Triple::setOS(OSType Kind) {
+  setOSName(getOSTypeName(Kind));
+}
+
+void Triple::setEnvironment(EnvironmentType Kind) {
+  setEnvironmentName(getEnvironmentTypeName(Kind));
+}
+
+void Triple::setArchName(StringRef Str) {
+  // Work around a miscompilation bug for Twines in gcc 4.0.3.
+  SmallString<64> Triple;
+  Triple += Str;
+  Triple += "-";
+  Triple += getVendorName();
+  Triple += "-";
+  Triple += getOSAndEnvironmentName();
+  setTriple(Triple.str());
+}
+
+void Triple::setVendorName(StringRef Str) {
+  setTriple(getArchName() + "-" + Str + "-" + getOSAndEnvironmentName());
+}
+
+void Triple::setOSName(StringRef Str) {
+  if (hasEnvironment())
+    setTriple(getArchName() + "-" + getVendorName() + "-" + Str +
+              "-" + getEnvironmentName());
+  else
+    setTriple(getArchName() + "-" + getVendorName() + "-" + Str);
+}
+
+void Triple::setEnvironmentName(StringRef Str) {
+  setTriple(getArchName() + "-" + getVendorName() + "-" + getOSName() +
+            "-" + Str);
+}
+
+void Triple::setOSAndEnvironmentName(StringRef Str) {
+  setTriple(getArchName() + "-" + getVendorName() + "-" + Str);
+}
+
+static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
+  switch (Arch) {
+  case llvm::Triple::UnknownArch:
+    return 0;
+
+  case llvm::Triple::msp430:
+    return 16;
+
+  case llvm::Triple::amdil:
+  case llvm::Triple::arm:
+  case llvm::Triple::hexagon:
+  case llvm::Triple::le32:
+  case llvm::Triple::mblaze:
+  case llvm::Triple::mips:
+  case llvm::Triple::mipsel:
+  case llvm::Triple::nvptx:
+  case llvm::Triple::ppc:
+  case llvm::Triple::r600:
+  case llvm::Triple::sparc:
+  case llvm::Triple::tce:
+  case llvm::Triple::thumb:
+  case llvm::Triple::x86:
+  case llvm::Triple::xcore:
+  case llvm::Triple::spir:
+    return 32;
+
+  case llvm::Triple::aarch64:
+  case llvm::Triple::mips64:
+  case llvm::Triple::mips64el:
+  case llvm::Triple::nvptx64:
+  case llvm::Triple::ppc64:
+  case llvm::Triple::sparcv9:
+  case llvm::Triple::x86_64:
+  case llvm::Triple::spir64:
+    return 64;
+  }
+  llvm_unreachable("Invalid architecture value");
+}
+
+bool Triple::isArch64Bit() const {
+  return getArchPointerBitWidth(getArch()) == 64;
+}
+
+bool Triple::isArch32Bit() const {
+  return getArchPointerBitWidth(getArch()) == 32;
+}
+
+bool Triple::isArch16Bit() const {
+  return getArchPointerBitWidth(getArch()) == 16;
+}
+
+Triple Triple::get32BitArchVariant() const {
+  Triple T(*this);
+  switch (getArch()) {
+  case Triple::UnknownArch:
+  case Triple::aarch64:
+  case Triple::msp430:
+    T.setArch(UnknownArch);
+    break;
+
+  case Triple::amdil:
+  case Triple::spir:
+  case Triple::arm:
+  case Triple::hexagon:
+  case Triple::le32:
+  case Triple::mblaze:
+  case Triple::mips:
+  case Triple::mipsel:
+  case Triple::nvptx:
+  case Triple::ppc:
+  case Triple::r600:
+  case Triple::sparc:
+  case Triple::tce:
+  case Triple::thumb:
+  case Triple::x86:
+  case Triple::xcore:
+    // Already 32-bit.
+    break;
+
+  case Triple::mips64:    T.setArch(Triple::mips);    break;
+  case Triple::mips64el:  T.setArch(Triple::mipsel);  break;
+  case Triple::nvptx64:   T.setArch(Triple::nvptx);   break;
+  case Triple::ppc64:     T.setArch(Triple::ppc);   break;
+  case Triple::sparcv9:   T.setArch(Triple::sparc);   break;
+  case Triple::x86_64:    T.setArch(Triple::x86);     break;
+  case Triple::spir64:    T.setArch(Triple::spir);    break;
+  }
+  return T;
+}
+
+Triple Triple::get64BitArchVariant() const {
+  Triple T(*this);
+  switch (getArch()) {
+  case Triple::UnknownArch:
+  case Triple::amdil:
+  case Triple::arm:
+  case Triple::hexagon:
+  case Triple::le32:
+  case Triple::mblaze:
+  case Triple::msp430:
+  case Triple::r600:
+  case Triple::tce:
+  case Triple::thumb:
+  case Triple::xcore:
+    T.setArch(UnknownArch);
+    break;
+
+  case Triple::aarch64:
+  case Triple::spir64:
+  case Triple::mips64:
+  case Triple::mips64el:
+  case Triple::nvptx64:
+  case Triple::ppc64:
+  case Triple::sparcv9:
+  case Triple::x86_64:
+    // Already 64-bit.
+    break;
+
+  case Triple::mips:    T.setArch(Triple::mips64);    break;
+  case Triple::mipsel:  T.setArch(Triple::mips64el);  break;
+  case Triple::nvptx:   T.setArch(Triple::nvptx64);   break;
+  case Triple::ppc:     T.setArch(Triple::ppc64);     break;
+  case Triple::sparc:   T.setArch(Triple::sparcv9);   break;
+  case Triple::x86:     T.setArch(Triple::x86_64);    break;
+  case Triple::spir:    T.setArch(Triple::spir64);    break;
+  }
+  return T;
+}
diff --git a/contrib/llvm/lib/Support/Twine.cpp b/contrib/llvm/lib/Support/Twine.cpp
new file mode 100644
index 000000000000..3d04bc34f0eb
--- /dev/null
+++ b/contrib/llvm/lib/Support/Twine.cpp
@@ -0,0 +1,171 @@
+//===-- Twine.cpp - Fast Temporary String Concatenation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+std::string Twine::str() const {
+  // If we're storing only a std::string, just return it.
+  if (LHSKind == StdStringKind && RHSKind == EmptyKind)
+    return *LHS.stdString;
+
+  // Otherwise, flatten and copy the contents first.
+  SmallString<256> Vec;
+  return toStringRef(Vec).str();
+}
+
+void Twine::toVector(SmallVectorImpl<char> &Out) const {
+  raw_svector_ostream OS(Out);
+  print(OS);
+}
+
+StringRef Twine::toStringRef(SmallVectorImpl<char> &Out) const {
+  if (isSingleStringRef())
+    return getSingleStringRef();
+  toVector(Out);
+  return StringRef(Out.data(), Out.size());
+}
+
+StringRef Twine::toNullTerminatedStringRef(SmallVectorImpl<char> &Out) const {
+  if (isUnary()) {
+    switch (getLHSKind()) {
+    case CStringKind:
+      // Already null terminated, yay!
+      return StringRef(LHS.cString);
+    case StdStringKind: {
+      const std::string *str = LHS.stdString;
+      return StringRef(str->c_str(), str->size());
+    }
+    default:
+      break;
+    }
+  }
+  toVector(Out);
+  Out.push_back(0);
+  Out.pop_back();
+  return StringRef(Out.data(), Out.size());
+}
+
+void Twine::printOneChild(raw_ostream &OS, Child Ptr,
+                          NodeKind Kind) const {
+  switch (Kind) {
+  case Twine::NullKind: break;
+  case Twine::EmptyKind: break;
+  case Twine::TwineKind:
+    Ptr.twine->print(OS);
+    break;
+  case Twine::CStringKind:
+    OS << Ptr.cString;
+    break;
+  case Twine::StdStringKind:
+    OS << *Ptr.stdString;
+    break;
+  case Twine::StringRefKind:
+    OS << *Ptr.stringRef;
+    break;
+  case Twine::CharKind:
+    OS << Ptr.character;
+    break;
+  case Twine::DecUIKind:
+    OS << Ptr.decUI;
+    break;
+  case Twine::DecIKind:
+    OS << Ptr.decI;
+    break;
+  case Twine::DecULKind:
+    OS << *Ptr.decUL;
+    break;
+  case Twine::DecLKind:
+    OS << *Ptr.decL;
+    break;
+  case Twine::DecULLKind:
+    OS << *Ptr.decULL;
+    break;
+  case Twine::DecLLKind:
+    OS << *Ptr.decLL;
+    break;
+  case Twine::UHexKind:
+    OS.write_hex(*Ptr.uHex);
+    break;
+  }
+}
+
+void Twine::printOneChildRepr(raw_ostream &OS, Child Ptr,
+                              NodeKind Kind) const {
+  switch (Kind) {
+  case Twine::NullKind:
+    OS << "null"; break;
+  case Twine::EmptyKind:
+    OS << "empty"; break;
+  case Twine::TwineKind:
+    OS << "rope:";
+    Ptr.twine->printRepr(OS);
+    break;
+  case Twine::CStringKind:
+    OS << "cstring:\""
+       << Ptr.cString << "\"";
+    break;
+  case Twine::StdStringKind:
+    OS << "std::string:\""
+       << Ptr.stdString << "\"";
+    break;
+  case Twine::StringRefKind:
+    OS << "stringref:\""
+       << Ptr.stringRef << "\"";
+    break;
+  case Twine::CharKind:
+    OS << "char:\"" << Ptr.character << "\"";
+    break;
+  case Twine::DecUIKind:
+    OS << "decUI:\"" << Ptr.decUI << "\"";
+    break;
+  case Twine::DecIKind:
+    OS << "decI:\"" << Ptr.decI << "\"";
+    break;
+  case Twine::DecULKind:
+    OS << "decUL:\"" << *Ptr.decUL << "\"";
+    break;
+  case Twine::DecLKind:
+    OS << "decL:\"" << *Ptr.decL << "\"";
+    break;
+  case Twine::DecULLKind:
+    OS << "decULL:\"" << *Ptr.decULL << "\"";
+    break;
+  case Twine::DecLLKind:
+    OS << "decLL:\"" << *Ptr.decLL << "\"";
+    break;
+  case Twine::UHexKind:
+    OS << "uhex:\"" << Ptr.uHex << "\"";
+    break;
+  }
+}
+
+void Twine::print(raw_ostream &OS) const {
+  printOneChild(OS, LHS, getLHSKind());
+  printOneChild(OS, RHS, getRHSKind());
+}
+
+void Twine::printRepr(raw_ostream &OS) const {
+  OS << "(Twine ";
+  printOneChildRepr(OS, LHS, getLHSKind());
+  OS << " ";
+  printOneChildRepr(OS, RHS, getRHSKind());
+  OS << ")";
+}
+
+void Twine::dump() const {
+  print(llvm::dbgs());
+}
+
+void Twine::dumpRepr() const {
+  printRepr(llvm::dbgs());
+}
diff --git a/contrib/llvm/lib/Support/Unix/Host.inc b/contrib/llvm/lib/Support/Unix/Host.inc
new file mode 100644
index 000000000000..7f79db074135
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Host.inc
@@ -0,0 +1,69 @@
+ //===- llvm/Support/Unix/Host.inc -------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the UNIX Host support.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/ADT/StringRef.h"
+#include "Unix.h"
+#include <sys/utsname.h>
+#include <cctype>
+#include <string>
+#include <cstdlib> // ::getenv
+
+using namespace llvm;
+
+#ifdef __FreeBSD__
+std::string sys::getDefaultTargetTriple() {
+  return LLVM_DEFAULT_TARGET_TRIPLE;
+}
+#else // __FreeBSD__
+static std::string getOSVersion() {
+  struct utsname info;
+
+  if (uname(&info))
+    return "";
+
+  return info.release;
+}
+
+std::string sys::getDefaultTargetTriple() {
+  StringRef TargetTripleString(LLVM_DEFAULT_TARGET_TRIPLE);
+  std::pair<StringRef, StringRef> ArchSplit = TargetTripleString.split('-');
+
+  // Normalize the arch, since the target triple may not actually match the target.
+  std::string Arch = ArchSplit.first;
+
+  std::string Triple(Arch);
+  Triple += '-';
+  Triple += ArchSplit.second;
+
+  // Force i<N>86 to i386.
+  if (Triple[0] == 'i' && isdigit(Triple[1]) &&
+      Triple[2] == '8' && Triple[3] == '6')
+    Triple[1] = '3';
+
+  // On darwin, we want to update the version to match that of the
+  // target.
+  std::string::size_type DarwinDashIdx = Triple.find("-darwin");
+  if (DarwinDashIdx != std::string::npos) {
+    Triple.resize(DarwinDashIdx + strlen("-darwin"));
+    Triple += getOSVersion();
+  }
+
+  return Triple;
+}
+#endif // __FreeBSD__
diff --git a/contrib/llvm/lib/Support/Unix/Memory.inc b/contrib/llvm/lib/Support/Unix/Memory.inc
new file mode 100644
index 000000000000..f397408c7743
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Memory.inc
@@ -0,0 +1,353 @@
+//===- Unix/Memory.cpp - Generic UNIX System Configuration ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some functions for various memory management utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Process.h"
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef __APPLE__
+#include <mach/mach.h>
+#endif
+
+#if defined(__mips__)
+#  if defined(__OpenBSD__)
+#    include <mips64/sysarch.h>
+#  else
+#    include <sys/cachectl.h>
+#  endif
+#endif
+
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+
+namespace {
+
+int getPosixProtectionFlags(unsigned Flags) {
+  switch (Flags) {
+  case llvm::sys::Memory::MF_READ:
+    return PROT_READ;
+  case llvm::sys::Memory::MF_WRITE:
+    return PROT_WRITE;
+  case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_WRITE:
+    return PROT_READ | PROT_WRITE;
+  case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_EXEC:
+    return PROT_READ | PROT_EXEC;
+  case llvm::sys::Memory::MF_READ |
+	 llvm::sys::Memory::MF_WRITE |
+	 llvm::sys::Memory::MF_EXEC:
+    return PROT_READ | PROT_WRITE | PROT_EXEC;
+  case llvm::sys::Memory::MF_EXEC:
+#if defined(__FreeBSD__)
+    // On PowerPC, having an executable page that has no read permission
+    // can have unintended consequences.  The function InvalidateInstruction-
+    // Cache uses instructions dcbf and icbi, both of which are treated by
+    // the processor as loads.  If the page has no read permissions,
+    // executing these instructions will result in a segmentation fault.
+    // Somehow, this problem is not present on Linux, but it does happen
+    // on FreeBSD.
+    return PROT_READ | PROT_EXEC;
+#else
+    return PROT_EXEC;
+#endif
+  default:
+    llvm_unreachable("Illegal memory protection flag specified!");
+  }
+  // Provide a default return value as required by some compilers.
+  return PROT_NONE;
+}
+
+} // namespace
+
+namespace llvm {
+namespace sys {
+
+MemoryBlock
+Memory::allocateMappedMemory(size_t NumBytes,
+                             const MemoryBlock *const NearBlock,
+                             unsigned PFlags,
+                             error_code &EC) {
+  EC = error_code::success();
+  if (NumBytes == 0)
+    return MemoryBlock();
+
+  static const size_t PageSize = process::get_self()->page_size();
+  const size_t NumPages = (NumBytes+PageSize-1)/PageSize;
+
+  int fd = -1;
+#ifdef NEED_DEV_ZERO_FOR_MMAP
+  static int zero_fd = open("/dev/zero", O_RDWR);
+  if (zero_fd == -1) {
+    EC = error_code(errno, system_category());
+    return MemoryBlock();
+  }
+  fd = zero_fd;
+#endif
+
+  int MMFlags = MAP_PRIVATE |
+#ifdef HAVE_MMAP_ANONYMOUS
+  MAP_ANONYMOUS
+#else
+  MAP_ANON
+#endif
+  ; // Ends statement above
+
+  int Protect = getPosixProtectionFlags(PFlags);
+
+  // Use any near hint and the page size to set a page-aligned starting address
+  uintptr_t Start = NearBlock ? reinterpret_cast<uintptr_t>(NearBlock->base()) +
+                                      NearBlock->size() : 0;
+  if (Start && Start % PageSize)
+    Start += PageSize - Start % PageSize;
+
+  void *Addr = ::mmap(reinterpret_cast<void*>(Start), PageSize*NumPages,
+                      Protect, MMFlags, fd, 0);
+  if (Addr == MAP_FAILED) {
+    if (NearBlock) //Try again without a near hint
+      return allocateMappedMemory(NumBytes, 0, PFlags, EC);
+
+    EC = error_code(errno, system_category());
+    return MemoryBlock();
+  }
+
+  MemoryBlock Result;
+  Result.Address = Addr;
+  Result.Size = NumPages*PageSize;
+
+  if (PFlags & MF_EXEC)
+    Memory::InvalidateInstructionCache(Result.Address, Result.Size);
+
+  return Result;
+}
+
+error_code
+Memory::releaseMappedMemory(MemoryBlock &M) {
+  if (M.Address == 0 || M.Size == 0)
+    return error_code::success();
+
+  if (0 != ::munmap(M.Address, M.Size))
+    return error_code(errno, system_category());
+
+  M.Address = 0;
+  M.Size = 0;
+
+  return error_code::success();
+}
+
+error_code
+Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) {
+  if (M.Address == 0 || M.Size == 0)
+    return error_code::success();
+
+  if (!Flags)
+    return error_code(EINVAL, generic_category());
+
+  int Protect = getPosixProtectionFlags(Flags);
+
+  int Result = ::mprotect(M.Address, M.Size, Protect);
+  if (Result != 0)
+    return error_code(errno, system_category());
+
+  if (Flags & MF_EXEC)
+    Memory::InvalidateInstructionCache(M.Address, M.Size);
+
+  return error_code::success();
+}
+
+/// AllocateRWX - Allocate a slab of memory with read/write/execute
+/// permissions.  This is typically used for JIT applications where we want
+/// to emit code to the memory then jump to it.  Getting this type of memory
+/// is very OS specific.
+///
+MemoryBlock
+Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
+                    std::string *ErrMsg) {
+  if (NumBytes == 0) return MemoryBlock();
+
+  size_t PageSize = process::get_self()->page_size();
+  size_t NumPages = (NumBytes+PageSize-1)/PageSize;
+
+  int fd = -1;
+#ifdef NEED_DEV_ZERO_FOR_MMAP
+  static int zero_fd = open("/dev/zero", O_RDWR);
+  if (zero_fd == -1) {
+    MakeErrMsg(ErrMsg, "Can't open /dev/zero device");
+    return MemoryBlock();
+  }
+  fd = zero_fd;
+#endif
+
+  int flags = MAP_PRIVATE |
+#ifdef HAVE_MMAP_ANONYMOUS
+  MAP_ANONYMOUS
+#else
+  MAP_ANON
+#endif
+  ;
+
+  void* start = NearBlock ? (unsigned char*)NearBlock->base() +
+                            NearBlock->size() : 0;
+
+#if defined(__APPLE__) && defined(__arm__)
+  void *pa = ::mmap(start, PageSize*NumPages, PROT_READ|PROT_EXEC,
+                    flags, fd, 0);
+#else
+  void *pa = ::mmap(start, PageSize*NumPages, PROT_READ|PROT_WRITE|PROT_EXEC,
+                    flags, fd, 0);
+#endif
+  if (pa == MAP_FAILED) {
+    if (NearBlock) //Try again without a near hint
+      return AllocateRWX(NumBytes, 0);
+
+    MakeErrMsg(ErrMsg, "Can't allocate RWX Memory");
+    return MemoryBlock();
+  }
+
+#if defined(__APPLE__) && defined(__arm__)
+  kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)pa,
+                                (vm_size_t)(PageSize*NumPages), 0,
+                                VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
+  if (KERN_SUCCESS != kr) {
+    MakeErrMsg(ErrMsg, "vm_protect max RX failed");
+    return MemoryBlock();
+  }
+
+  kr = vm_protect(mach_task_self(), (vm_address_t)pa,
+                  (vm_size_t)(PageSize*NumPages), 0,
+                  VM_PROT_READ | VM_PROT_WRITE);
+  if (KERN_SUCCESS != kr) {
+    MakeErrMsg(ErrMsg, "vm_protect RW failed");
+    return MemoryBlock();
+  }
+#endif
+
+  MemoryBlock result;
+  result.Address = pa;
+  result.Size = NumPages*PageSize;
+
+  return result;
+}
+
+bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
+  if (M.Address == 0 || M.Size == 0) return false;
+  if (0 != ::munmap(M.Address, M.Size))
+    return MakeErrMsg(ErrMsg, "Can't release RWX Memory");
+  return false;
+}
+
+bool Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
+#if defined(__APPLE__) && defined(__arm__)
+  if (M.Address == 0 || M.Size == 0) return false;
+  Memory::InvalidateInstructionCache(M.Address, M.Size);
+  kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
+    (vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_WRITE);
+  return KERN_SUCCESS == kr;
+#else
+  return true;
+#endif
+}
+
+bool Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
+#if defined(__APPLE__) && defined(__arm__)
+  if (M.Address == 0 || M.Size == 0) return false;
+  Memory::InvalidateInstructionCache(M.Address, M.Size);
+  kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
+    (vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
+  return KERN_SUCCESS == kr;
+#else
+  return true;
+#endif
+}
+
+bool Memory::setRangeWritable(const void *Addr, size_t Size) {
+#if defined(__APPLE__) && defined(__arm__)
+  kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
+                                (vm_size_t)Size, 0,
+                                VM_PROT_READ | VM_PROT_WRITE);
+  return KERN_SUCCESS == kr;
+#else
+  return true;
+#endif
+}
+
+bool Memory::setRangeExecutable(const void *Addr, size_t Size) {
+#if defined(__APPLE__) && defined(__arm__)
+  kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
+                                (vm_size_t)Size, 0,
+                                VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
+  return KERN_SUCCESS == kr;
+#else
+  return true;
+#endif
+}
+
+/// InvalidateInstructionCache - Before the JIT can run a block of code
+/// that has been emitted it must invalidate the instruction cache on some
+/// platforms.
+void Memory::InvalidateInstructionCache(const void *Addr,
+                                        size_t Len) {
+
+// icache invalidation for PPC and ARM.
+#if defined(__APPLE__)
+
+#  if (defined(__POWERPC__) || defined (__ppc__) || \
+     defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__)
+  sys_icache_invalidate(const_cast<void *>(Addr), Len);
+#  endif
+
+#else
+
+#  if (defined(__POWERPC__) || defined (__ppc__) || \
+       defined(_POWER) || defined(_ARCH_PPC)) && defined(__GNUC__)
+  const size_t LineSize = 32;
+
+  const intptr_t Mask = ~(LineSize - 1);
+  const intptr_t StartLine = ((intptr_t) Addr) & Mask;
+  const intptr_t EndLine = ((intptr_t) Addr + Len + LineSize - 1) & Mask;
+
+  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+    asm volatile("dcbf 0, %0" : : "r"(Line));
+  asm volatile("sync");
+
+  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+    asm volatile("icbi 0, %0" : : "r"(Line));
+  asm volatile("isync");
+#  elif defined(__arm__) && defined(__GNUC__) && !defined(__FreeBSD__)
+  // FIXME: Can we safely always call this for __GNUC__ everywhere?
+  const char *Start = static_cast<const char *>(Addr);
+  const char *End = Start + Len;
+  __clear_cache(const_cast<char *>(Start), const_cast<char *>(End));
+#  elif defined(__mips__)
+  const char *Start = static_cast<const char *>(Addr);
+#    if defined(ANDROID)
+  // The declaration of "cacheflush" in Android bionic:
+  // extern int cacheflush(long start, long end, long flags);
+  const char *End = Start + Len;
+  long LStart = reinterpret_cast<long>(const_cast<char *>(Start));
+  long LEnd = reinterpret_cast<long>(const_cast<char *>(End));
+  cacheflush(LStart, LEnd, BCACHE);
+#    else
+  cacheflush(const_cast<char *>(Start), Len, BCACHE);
+#    endif
+#  endif
+
+#endif  // end apple
+
+  ValgrindDiscardTranslations(Addr, Len);
+}
+
+} // namespace sys
+} // namespace llvm
diff --git a/contrib/llvm/lib/Support/Unix/Mutex.inc b/contrib/llvm/lib/Support/Unix/Mutex.inc
new file mode 100644
index 000000000000..fe6b17041457
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Mutex.inc
@@ -0,0 +1,43 @@
+//===- llvm/Support/Unix/Mutex.inc - Unix Mutex Implementation ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific (non-pthread) Mutex class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+namespace llvm
+{
+using namespace sys;
+
+MutexImpl::MutexImpl( bool recursive)
+{
+}
+
+MutexImpl::~MutexImpl()
+{
+}
+
+bool
+MutexImpl::release()
+{
+  return true;
+}
+
+bool
+MutexImpl::tryacquire( void )
+{
+  return true;
+}
+
+}
diff --git a/contrib/llvm/lib/Support/Unix/Path.inc b/contrib/llvm/lib/Support/Unix/Path.inc
new file mode 100644
index 000000000000..6a5ebb8cd9c7
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Path.inc
@@ -0,0 +1,900 @@
+//===- llvm/Support/Unix/Path.cpp - Unix Path Implementation -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific portion of the Path class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_UTIME_H
+#include <utime.h>
+#endif
+#if HAVE_TIME_H
+#include <time.h>
+#endif
+#if HAVE_DIRENT_H
+# include <dirent.h>
+# define NAMLEN(dirent) strlen((dirent)->d_name)
+#else
+# define dirent direct
+# define NAMLEN(dirent) (dirent)->d_namlen
+# if HAVE_SYS_NDIR_H
+#  include <sys/ndir.h>
+# endif
+# if HAVE_SYS_DIR_H
+#  include <sys/dir.h>
+# endif
+# if HAVE_NDIR_H
+#  include <ndir.h>
+# endif
+#endif
+
+#if HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#ifdef __APPLE__
+#include <mach-o/dyld.h>
+#endif
+
+// For GNU Hurd
+#if defined(__GNU__) && !defined(MAXPATHLEN)
+# define MAXPATHLEN 4096
+#endif
+
+// Put in a hack for Cygwin which falsely reports that the mkdtemp function
+// is available when it is not.
+#ifdef __CYGWIN__
+# undef HAVE_MKDTEMP
+#endif
+
+namespace {
+inline bool lastIsSlash(const std::string& path) {
+  return !path.empty() && path[path.length() - 1] == '/';
+}
+
+}
+
+namespace llvm {
+using namespace sys;
+
+const char sys::PathSeparator = ':';
+
+StringRef Path::GetEXESuffix() {
+  return StringRef();
+}
+
+Path::Path(StringRef p)
+  : path(p) {}
+
+Path::Path(const char *StrStart, unsigned StrLen)
+  : path(StrStart, StrLen) {}
+
+Path&
+Path::operator=(StringRef that) {
+  path.assign(that.data(), that.size());
+  return *this;
+}
+
+bool
+Path::isValid() const {
+  // Empty paths are considered invalid here.
+  // This code doesn't check MAXPATHLEN because there's no need. Nothing in
+  // LLVM manipulates Paths with fixed-sizes arrays, and if the OS can't
+  // handle names longer than some limit, it'll report this on demand using
+  // ENAMETOLONG.
+  return !path.empty();
+}
+
+bool
+Path::isAbsolute(const char *NameStart, unsigned NameLen) {
+  assert(NameStart);
+  if (NameLen == 0)
+    return false;
+  return NameStart[0] == '/';
+}
+
+bool
+Path::isAbsolute() const {
+  if (path.empty())
+    return false;
+  return path[0] == '/';
+}
+
+Path
+Path::GetRootDirectory() {
+  Path result;
+  result.set("/");
+  return result;
+}
+
+Path
+Path::GetTemporaryDirectory(std::string *ErrMsg) {
+#if defined(HAVE_MKDTEMP)
+  // The best way is with mkdtemp but that's not available on many systems,
+  // Linux and FreeBSD have it. Others probably won't.
+  char pathname[] = "/tmp/llvm_XXXXXX";
+  if (0 == mkdtemp(pathname)) {
+    MakeErrMsg(ErrMsg,
+               std::string(pathname) + ": can't create temporary directory");
+    return Path();
+  }
+  return Path(pathname);
+#elif defined(HAVE_MKSTEMP)
+  // If no mkdtemp is available, mkstemp can be used to create a temporary file
+  // which is then removed and created as a directory. We prefer this over
+  // mktemp because of mktemp's inherent security and threading risks. We still
+  // have a slight race condition from the time the temporary file is created to
+  // the time it is re-created as a directoy.
+  char pathname[] = "/tmp/llvm_XXXXXX";
+  int fd = 0;
+  if (-1 == (fd = mkstemp(pathname))) {
+    MakeErrMsg(ErrMsg,
+      std::string(pathname) + ": can't create temporary directory");
+    return Path();
+  }
+  ::close(fd);
+  ::unlink(pathname); // start race condition, ignore errors
+  if (-1 == ::mkdir(pathname, S_IRWXU)) { // end race condition
+    MakeErrMsg(ErrMsg,
+      std::string(pathname) + ": can't create temporary directory");
+    return Path();
+  }
+  return Path(pathname);
+#elif defined(HAVE_MKTEMP)
+  // If a system doesn't have mkdtemp(3) or mkstemp(3) but it does have
+  // mktemp(3) then we'll assume that system (e.g. AIX) has a reasonable
+  // implementation of mktemp(3) and doesn't follow BSD 4.3's lead of replacing
+  // the XXXXXX with the pid of the process and a letter. That leads to only
+  // twenty six temporary files that can be generated.
+  char pathname[] = "/tmp/llvm_XXXXXX";
+  char *TmpName = ::mktemp(pathname);
+  if (TmpName == 0) {
+    MakeErrMsg(ErrMsg,
+      std::string(TmpName) + ": can't create unique directory name");
+    return Path();
+  }
+  if (-1 == ::mkdir(TmpName, S_IRWXU)) {
+    MakeErrMsg(ErrMsg,
+        std::string(TmpName) + ": can't create temporary directory");
+    return Path();
+  }
+  return Path(TmpName);
+#else
+  // This is the worst case implementation. tempnam(3) leaks memory unless its
+  // on an SVID2 (or later) system. On BSD 4.3 it leaks. tmpnam(3) has thread
+  // issues. The mktemp(3) function doesn't have enough variability in the
+  // temporary name generated. So, we provide our own implementation that
+  // increments an integer from a random number seeded by the current time. This
+  // should be sufficiently unique that we don't have many collisions between
+  // processes. Generally LLVM processes don't run very long and don't use very
+  // many temporary files so this shouldn't be a big issue for LLVM.
+  static time_t num = ::time(0);
+  char pathname[MAXPATHLEN];
+  do {
+    num++;
+    sprintf(pathname, "/tmp/llvm_%010u", unsigned(num));
+  } while ( 0 == access(pathname, F_OK ) );
+  if (-1 == ::mkdir(pathname, S_IRWXU)) {
+    MakeErrMsg(ErrMsg,
+      std::string(pathname) + ": can't create temporary directory");
+    return Path();
+  }
+  return Path(pathname);
+#endif
+}
+
+void
+Path::GetSystemLibraryPaths(std::vector<sys::Path>& Paths) {
+#ifdef LTDL_SHLIBPATH_VAR
+  char* env_var = getenv(LTDL_SHLIBPATH_VAR);
+  if (env_var != 0) {
+    getPathList(env_var,Paths);
+  }
+#endif
+  // FIXME: Should this look at LD_LIBRARY_PATH too?
+  Paths.push_back(sys::Path("/usr/local/lib/"));
+  Paths.push_back(sys::Path("/usr/X11R6/lib/"));
+  Paths.push_back(sys::Path("/usr/lib/"));
+  Paths.push_back(sys::Path("/lib/"));
+}
+
+void
+Path::GetBitcodeLibraryPaths(std::vector<sys::Path>& Paths) {
+  char * env_var = getenv("LLVM_LIB_SEARCH_PATH");
+  if (env_var != 0) {
+    getPathList(env_var,Paths);
+  }
+#ifdef LLVM_LIBDIR
+  {
+    Path tmpPath;
+    if (tmpPath.set(LLVM_LIBDIR))
+      if (tmpPath.canRead())
+        Paths.push_back(tmpPath);
+  }
+#endif
+  GetSystemLibraryPaths(Paths);
+}
+
+Path
+Path::GetUserHomeDirectory() {
+  const char* home = getenv("HOME");
+  Path result;
+  if (home && result.set(home))
+    return result;
+  result.set("/");
+  return result;
+}
+
+Path
+Path::GetCurrentDirectory() {
+  char pathname[MAXPATHLEN];
+  if (!getcwd(pathname, MAXPATHLEN)) {
+    assert(false && "Could not query current working directory.");
+    return Path();
+  }
+
+  return Path(pathname);
+}
+
+#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
+    defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
+    defined(__linux__) || defined(__CYGWIN__)
+static int
+test_dir(char buf[PATH_MAX], char ret[PATH_MAX],
+    const char *dir, const char *bin)
+{
+  struct stat sb;
+
+  snprintf(buf, PATH_MAX, "%s/%s", dir, bin);
+  if (realpath(buf, ret) == NULL)
+    return (1);
+  if (stat(buf, &sb) != 0)
+    return (1);
+
+  return (0);
+}
+
+static char *
+getprogpath(char ret[PATH_MAX], const char *bin)
+{
+  char *pv, *s, *t, buf[PATH_MAX];
+
+  /* First approach: absolute path. */
+  if (bin[0] == '/') {
+    if (test_dir(buf, ret, "/", bin) == 0)
+      return (ret);
+    return (NULL);
+  }
+
+  /* Second approach: relative path. */
+  if (strchr(bin, '/') != NULL) {
+    if (getcwd(buf, PATH_MAX) == NULL)
+      return (NULL);
+    if (test_dir(buf, ret, buf, bin) == 0)
+      return (ret);
+    return (NULL);
+  }
+
+  /* Third approach: $PATH */
+  if ((pv = getenv("PATH")) == NULL)
+    return (NULL);
+  s = pv = strdup(pv);
+  if (pv == NULL)
+    return (NULL);
+  while ((t = strsep(&s, ":")) != NULL) {
+    if (test_dir(buf, ret, t, bin) == 0) {
+      free(pv);
+      return (ret);
+    }
+  }
+  free(pv);
+  return (NULL);
+}
+#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__
+
+/// GetMainExecutable - Return the path to the main executable, given the
+/// value of argv[0] from program startup.
+Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
+#if defined(__APPLE__)
+  // On OS X the executable path is saved to the stack by dyld. Reading it
+  // from there is much faster than calling dladdr, especially for large
+  // binaries with symbols.
+  char exe_path[MAXPATHLEN];
+  uint32_t size = sizeof(exe_path);
+  if (_NSGetExecutablePath(exe_path, &size) == 0) {
+    char link_path[MAXPATHLEN];
+    if (realpath(exe_path, link_path))
+      return Path(link_path);
+  }
+#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
+      defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__)
+  char exe_path[PATH_MAX];
+
+  if (getprogpath(exe_path, argv0) != NULL)
+    return Path(exe_path);
+#elif defined(__linux__) || defined(__CYGWIN__)
+  char exe_path[MAXPATHLEN];
+  StringRef aPath("/proc/self/exe");
+  if (sys::fs::exists(aPath)) {
+      // /proc is not always mounted under Linux (chroot for example).
+      ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
+      if (len >= 0)
+          return Path(StringRef(exe_path, len));
+  } else {
+      // Fall back to the classical detection.
+      if (getprogpath(exe_path, argv0) != NULL)
+          return Path(exe_path);
+  }
+#elif defined(HAVE_DLFCN_H)
+  // Use dladdr to get executable path if available.
+  Dl_info DLInfo;
+  int err = dladdr(MainAddr, &DLInfo);
+  if (err == 0)
+    return Path();
+
+  // If the filename is a symlink, we need to resolve and return the location of
+  // the actual executable.
+  char link_path[MAXPATHLEN];
+  if (realpath(DLInfo.dli_fname, link_path))
+    return Path(link_path);
+#else
+#error GetMainExecutable is not implemented on this host yet.
+#endif
+  return Path();
+}
+
+
+StringRef Path::getDirname() const {
+  return getDirnameCharSep(path, "/");
+}
+
+StringRef
+Path::getBasename() const {
+  // Find the last slash
+  std::string::size_type slash = path.rfind('/');
+  if (slash == std::string::npos)
+    slash = 0;
+  else
+    slash++;
+
+  std::string::size_type dot = path.rfind('.');
+  if (dot == std::string::npos || dot < slash)
+    return StringRef(path).substr(slash);
+  else
+    return StringRef(path).substr(slash, dot - slash);
+}
+
+StringRef
+Path::getSuffix() const {
+  // Find the last slash
+  std::string::size_type slash = path.rfind('/');
+  if (slash == std::string::npos)
+    slash = 0;
+  else
+    slash++;
+
+  std::string::size_type dot = path.rfind('.');
+  if (dot == std::string::npos || dot < slash)
+    return StringRef();
+  else
+    return StringRef(path).substr(dot + 1);
+}
+
+bool Path::getMagicNumber(std::string &Magic, unsigned len) const {
+  assert(len < 1024 && "Request for magic string too long");
+  char Buf[1025];
+  int fd = ::open(path.c_str(), O_RDONLY);
+  if (fd < 0)
+    return false;
+  ssize_t bytes_read = ::read(fd, Buf, len);
+  ::close(fd);
+  if (ssize_t(len) != bytes_read)
+    return false;
+  Magic.assign(Buf, len);
+  return true;
+}
+
+bool
+Path::exists() const {
+  return 0 == access(path.c_str(), F_OK );
+}
+
+bool
+Path::isDirectory() const {
+  struct stat buf;
+  if (0 != stat(path.c_str(), &buf))
+    return false;
+  return ((buf.st_mode & S_IFMT) == S_IFDIR) ? true : false;
+}
+
+bool
+Path::isSymLink() const {
+  struct stat buf;
+  if (0 != lstat(path.c_str(), &buf))
+    return false;
+  return S_ISLNK(buf.st_mode);
+}
+
+
+bool
+Path::canRead() const {
+  return 0 == access(path.c_str(), R_OK);
+}
+
+bool
+Path::canWrite() const {
+  return 0 == access(path.c_str(), W_OK);
+}
+
+bool
+Path::isRegularFile() const {
+  // Get the status so we can determine if it's a file or directory
+  struct stat buf;
+
+  if (0 != stat(path.c_str(), &buf))
+    return false;
+
+  if (S_ISREG(buf.st_mode))
+    return true;
+
+  return false;
+}
+
+bool
+Path::canExecute() const {
+  if (0 != access(path.c_str(), R_OK | X_OK ))
+    return false;
+  struct stat buf;
+  if (0 != stat(path.c_str(), &buf))
+    return false;
+  if (!S_ISREG(buf.st_mode))
+    return false;
+  return true;
+}
+
+StringRef
+Path::getLast() const {
+  // Find the last slash
+  size_t pos = path.rfind('/');
+
+  // Handle the corner cases
+  if (pos == std::string::npos)
+    return path;
+
+  // If the last character is a slash
+  if (pos == path.length()-1) {
+    // Find the second to last slash
+    size_t pos2 = path.rfind('/', pos-1);
+    if (pos2 == std::string::npos)
+      return StringRef(path).substr(0,pos);
+    else
+      return StringRef(path).substr(pos2+1,pos-pos2-1);
+  }
+  // Return everything after the last slash
+  return StringRef(path).substr(pos+1);
+}
+
+const FileStatus *
+PathWithStatus::getFileStatus(bool update, std::string *ErrStr) const {
+  if (!fsIsValid || update) {
+    struct stat buf;
+    if (0 != stat(path.c_str(), &buf)) {
+      MakeErrMsg(ErrStr, path + ": can't get status of file");
+      return 0;
+    }
+    status.fileSize = buf.st_size;
+    status.modTime.fromEpochTime(buf.st_mtime);
+    status.mode = buf.st_mode;
+    status.user = buf.st_uid;
+    status.group = buf.st_gid;
+    status.uniqueID = uint64_t(buf.st_ino);
+    status.isDir  = S_ISDIR(buf.st_mode);
+    status.isFile = S_ISREG(buf.st_mode);
+    fsIsValid = true;
+  }
+  return &status;
+}
+
+static bool AddPermissionBits(const Path &File, int bits) {
+  // Get the umask value from the operating system.  We want to use it
+  // when changing the file's permissions. Since calling umask() sets
+  // the umask and returns its old value, we must call it a second
+  // time to reset it to the user's preference.
+  int mask = umask(0777); // The arg. to umask is arbitrary.
+  umask(mask);            // Restore the umask.
+
+  // Get the file's current mode.
+  struct stat buf;
+  if (0 != stat(File.c_str(), &buf))
+    return false;
+  // Change the file to have whichever permissions bits from 'bits'
+  // that the umask would not disable.
+  if ((chmod(File.c_str(), (buf.st_mode | (bits & ~mask)))) == -1)
+      return false;
+  return true;
+}
+
+bool Path::makeReadableOnDisk(std::string* ErrMsg) {
+  if (!AddPermissionBits(*this, 0444))
+    return MakeErrMsg(ErrMsg, path + ": can't make file readable");
+  return false;
+}
+
+bool Path::makeWriteableOnDisk(std::string* ErrMsg) {
+  if (!AddPermissionBits(*this, 0222))
+    return MakeErrMsg(ErrMsg, path + ": can't make file writable");
+  return false;
+}
+
+bool Path::makeExecutableOnDisk(std::string* ErrMsg) {
+  if (!AddPermissionBits(*this, 0111))
+    return MakeErrMsg(ErrMsg, path + ": can't make file executable");
+  return false;
+}
+
+bool
+Path::getDirectoryContents(std::set<Path>& result, std::string* ErrMsg) const {
+  DIR* direntries = ::opendir(path.c_str());
+  if (direntries == 0)
+    return MakeErrMsg(ErrMsg, path + ": can't open directory");
+
+  std::string dirPath = path;
+  if (!lastIsSlash(dirPath))
+    dirPath += '/';
+
+  result.clear();
+  struct dirent* de = ::readdir(direntries);
+  for ( ; de != 0; de = ::readdir(direntries)) {
+    if (de->d_name[0] != '.') {
+      Path aPath(dirPath + (const char*)de->d_name);
+      struct stat st;
+      if (0 != lstat(aPath.path.c_str(), &st)) {
+        if (S_ISLNK(st.st_mode))
+          continue; // dangling symlink -- ignore
+        return MakeErrMsg(ErrMsg,
+                          aPath.path +  ": can't determine file object type");
+      }
+      result.insert(aPath);
+    }
+  }
+
+  closedir(direntries);
+  return false;
+}
+
+bool
+Path::set(StringRef a_path) {
+  if (a_path.empty())
+    return false;
+  path = a_path;
+  return true;
+}
+
+bool
+Path::appendComponent(StringRef name) {
+  if (name.empty())
+    return false;
+  if (!lastIsSlash(path))
+    path += '/';
+  path += name;
+  return true;
+}
+
+bool
+Path::eraseComponent() {
+  size_t slashpos = path.rfind('/',path.size());
+  if (slashpos == 0 || slashpos == std::string::npos) {
+    path.erase();
+    return true;
+  }
+  if (slashpos == path.size() - 1)
+    slashpos = path.rfind('/',slashpos-1);
+  if (slashpos == std::string::npos) {
+    path.erase();
+    return true;
+  }
+  path.erase(slashpos);
+  return true;
+}
+
+bool
+Path::eraseSuffix() {
+  size_t dotpos = path.rfind('.',path.size());
+  size_t slashpos = path.rfind('/',path.size());
+  if (dotpos != std::string::npos) {
+    if (slashpos == std::string::npos || dotpos > slashpos+1) {
+      path.erase(dotpos, path.size()-dotpos);
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool createDirectoryHelper(char* beg, char* end, bool create_parents) {
+
+  if (access(beg, R_OK | W_OK) == 0)
+    return false;
+
+  if (create_parents) {
+
+    char* c = end;
+
+    for (; c != beg; --c)
+      if (*c == '/') {
+
+        // Recurse to handling the parent directory.
+        *c = '\0';
+        bool x = createDirectoryHelper(beg, c, create_parents);
+        *c = '/';
+
+        // Return if we encountered an error.
+        if (x)
+          return true;
+
+        break;
+      }
+  }
+
+  return mkdir(beg, S_IRWXU | S_IRWXG) != 0;
+}
+
+bool
+Path::createDirectoryOnDisk( bool create_parents, std::string* ErrMsg ) {
+  // Get a writeable copy of the path name
+  std::string pathname(path);
+
+  // Null-terminate the last component
+  size_t lastchar = path.length() - 1 ;
+
+  if (pathname[lastchar] != '/')
+    ++lastchar;
+
+  pathname[lastchar] = '\0';
+
+  if (createDirectoryHelper(&pathname[0], &pathname[lastchar], create_parents))
+    return MakeErrMsg(ErrMsg, pathname + ": can't create directory");
+
+  return false;
+}
+
+bool
+Path::createFileOnDisk(std::string* ErrMsg) {
+  // Create the file
+  int fd = ::creat(path.c_str(), S_IRUSR | S_IWUSR);
+  if (fd < 0)
+    return MakeErrMsg(ErrMsg, path + ": can't create file");
+  ::close(fd);
+  return false;
+}
+
+bool
+Path::createTemporaryFileOnDisk(bool reuse_current, std::string* ErrMsg) {
+  // Make this into a unique file name
+  if (makeUnique( reuse_current, ErrMsg ))
+    return true;
+
+  // create the file
+  int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
+  if (fd < 0)
+    return MakeErrMsg(ErrMsg, path + ": can't create temporary file");
+  ::close(fd);
+  return false;
+}
+
+bool
+Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const {
+  // Get the status so we can determine if it's a file or directory.
+  struct stat buf;
+  if (0 != stat(path.c_str(), &buf)) {
+    MakeErrMsg(ErrStr, path + ": can't get status of file");
+    return true;
+  }
+
+  // Note: this check catches strange situations. In all cases, LLVM should
+  // only be involved in the creation and deletion of regular files.  This
+  // check ensures that what we're trying to erase is a regular file. It
+  // effectively prevents LLVM from erasing things like /dev/null, any block
+  // special file, or other things that aren't "regular" files.
+  if (S_ISREG(buf.st_mode)) {
+    if (unlink(path.c_str()) != 0)
+      return MakeErrMsg(ErrStr, path + ": can't destroy file");
+    return false;
+  }
+
+  if (!S_ISDIR(buf.st_mode)) {
+    if (ErrStr) *ErrStr = "not a file or directory";
+    return true;
+  }
+
+  if (remove_contents) {
+    // Recursively descend the directory to remove its contents.
+    std::string cmd = "/bin/rm -rf " + path;
+    if (system(cmd.c_str()) != 0) {
+      MakeErrMsg(ErrStr, path + ": failed to recursively remove directory.");
+      return true;
+    }
+    return false;
+  }
+
+  // Otherwise, try to just remove the one directory.
+  std::string pathname(path);
+  size_t lastchar = path.length() - 1;
+  if (pathname[lastchar] == '/')
+    pathname[lastchar] = '\0';
+  else
+    pathname[lastchar+1] = '\0';
+
+  if (rmdir(pathname.c_str()) != 0)
+    return MakeErrMsg(ErrStr, pathname + ": can't erase directory");
+  return false;
+}
+
+bool
+Path::renamePathOnDisk(const Path& newName, std::string* ErrMsg) {
+  if (0 != ::rename(path.c_str(), newName.c_str()))
+    return MakeErrMsg(ErrMsg, std::string("can't rename '") + path + "' as '" +
+               newName.str() + "'");
+  return false;
+}
+
+bool
+Path::setStatusInfoOnDisk(const FileStatus &si, std::string *ErrStr) const {
+  struct utimbuf utb;
+  utb.actime = si.modTime.toPosixTime();
+  utb.modtime = utb.actime;
+  if (0 != ::utime(path.c_str(),&utb))
+    return MakeErrMsg(ErrStr, path + ": can't set file modification time");
+  if (0 != ::chmod(path.c_str(),si.mode))
+    return MakeErrMsg(ErrStr, path + ": can't set mode");
+  return false;
+}
+
+bool
+sys::CopyFile(const sys::Path &Dest, const sys::Path &Src, std::string* ErrMsg){
+  int inFile = -1;
+  int outFile = -1;
+  inFile = ::open(Src.c_str(), O_RDONLY);
+  if (inFile == -1)
+    return MakeErrMsg(ErrMsg, Src.str() +
+      ": can't open source file to copy");
+
+  outFile = ::open(Dest.c_str(), O_WRONLY|O_CREAT, 0666);
+  if (outFile == -1) {
+    ::close(inFile);
+    return MakeErrMsg(ErrMsg, Dest.str() +
+      ": can't create destination file for copy");
+  }
+
+  char Buffer[16*1024];
+  while (ssize_t Amt = ::read(inFile, Buffer, 16*1024)) {
+    if (Amt == -1) {
+      if (errno != EINTR && errno != EAGAIN) {
+        ::close(inFile);
+        ::close(outFile);
+        return MakeErrMsg(ErrMsg, Src.str()+": can't read source file");
+      }
+    } else {
+      char *BufPtr = Buffer;
+      while (Amt) {
+        ssize_t AmtWritten = ::write(outFile, BufPtr, Amt);
+        if (AmtWritten == -1) {
+          if (errno != EINTR && errno != EAGAIN) {
+            ::close(inFile);
+            ::close(outFile);
+            return MakeErrMsg(ErrMsg, Dest.str() +
+              ": can't write destination file");
+          }
+        } else {
+          Amt -= AmtWritten;
+          BufPtr += AmtWritten;
+        }
+      }
+    }
+  }
+  ::close(inFile);
+  ::close(outFile);
+  return false;
+}
+
+bool
+Path::makeUnique(bool reuse_current, std::string* ErrMsg) {
+  bool Exists;
+  if (reuse_current && (fs::exists(path, Exists) || !Exists))
+    return false; // File doesn't exist already, just use it!
+
+  // Append an XXXXXX pattern to the end of the file for use with mkstemp,
+  // mktemp or our own implementation.
+  // This uses std::vector instead of SmallVector to avoid a dependence on
+  // libSupport. And performance isn't critical here.
+  std::vector<char> Buf;
+  Buf.resize(path.size()+8);
+  char *FNBuffer = &Buf[0];
+    path.copy(FNBuffer,path.size());
+  bool isdir;
+  if (!fs::is_directory(path, isdir) && isdir)
+    strcpy(FNBuffer+path.size(), "/XXXXXX");
+  else
+    strcpy(FNBuffer+path.size(), "-XXXXXX");
+
+#if defined(HAVE_MKSTEMP)
+  int TempFD;
+  if ((TempFD = mkstemp(FNBuffer)) == -1)
+    return MakeErrMsg(ErrMsg, path + ": can't make unique filename");
+
+  // We don't need to hold the temp file descriptor... we will trust that no one
+  // will overwrite/delete the file before we can open it again.
+  close(TempFD);
+
+  // Save the name
+  path = FNBuffer;
+
+  // By default mkstemp sets the mode to 0600, so update mode bits now.
+  AddPermissionBits (*this, 0666);
+#elif defined(HAVE_MKTEMP)
+  // If we don't have mkstemp, use the old and obsolete mktemp function.
+  if (mktemp(FNBuffer) == 0)
+    return MakeErrMsg(ErrMsg, path + ": can't make unique filename");
+
+  // Save the name
+  path = FNBuffer;
+#else
+  // Okay, looks like we have to do it all by our lonesome.
+  static unsigned FCounter = 0;
+  // Try to initialize with unique value.
+  if (FCounter == 0) FCounter = ((unsigned)getpid() & 0xFFFF) << 8;
+  char* pos = strstr(FNBuffer, "XXXXXX");
+  do {
+    if (++FCounter > 0xFFFFFF) {
+      return MakeErrMsg(ErrMsg,
+        path + ": can't make unique filename: too many files");
+    }
+    sprintf(pos, "%06X", FCounter);
+    path = FNBuffer;
+  } while (exists());
+  // POSSIBLE SECURITY BUG: An attacker can easily guess the name and exploit
+  // LLVM.
+#endif
+  return false;
+}
+
+const char *Path::MapInFilePages(int FD, size_t FileSize, off_t Offset) {
+  int Flags = MAP_PRIVATE;
+#ifdef MAP_FILE
+  Flags |= MAP_FILE;
+#endif
+  void *BasePtr = ::mmap(0, FileSize, PROT_READ, Flags, FD, Offset);
+  if (BasePtr == MAP_FAILED)
+    return 0;
+  return (const char*)BasePtr;
+}
+
+void Path::UnMapFilePages(const char *BasePtr, size_t FileSize) {
+  const void *Addr = static_cast<const void *>(BasePtr);
+  ::munmap(const_cast<void *>(Addr), FileSize);
+}
+
+} // end llvm namespace
diff --git a/contrib/llvm/lib/Support/Unix/PathV2.inc b/contrib/llvm/lib/Support/Unix/PathV2.inc
new file mode 100644
index 000000000000..a3dfd4b0a32d
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/PathV2.inc
@@ -0,0 +1,695 @@
+//===- llvm/Support/Unix/PathV2.cpp - Unix Path Implementation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific implementation of the PathV2 API.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#include "llvm/Support/Process.h"
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+#if HAVE_DIRENT_H
+# include <dirent.h>
+# define NAMLEN(dirent) strlen((dirent)->d_name)
+#else
+# define dirent direct
+# define NAMLEN(dirent) (dirent)->d_namlen
+# if HAVE_SYS_NDIR_H
+#  include <sys/ndir.h>
+# endif
+# if HAVE_SYS_DIR_H
+#  include <sys/dir.h>
+# endif
+# if HAVE_NDIR_H
+#  include <ndir.h>
+# endif
+#endif
+#if HAVE_STDIO_H
+#include <stdio.h>
+#endif
+#if HAVE_LIMITS_H
+#include <limits.h>
+#endif
+
+// Both stdio.h and cstdio are included via different pathes and
+// stdcxx's cstdio doesn't include stdio.h, so it doesn't #undef the macros
+// either.
+#undef ferror
+#undef feof
+
+// For GNU Hurd
+#if defined(__GNU__) && !defined(PATH_MAX)
+# define PATH_MAX 4096
+#endif
+
+using namespace llvm;
+
+namespace {
+  /// This class automatically closes the given file descriptor when it goes out
+  /// of scope. You can take back explicit ownership of the file descriptor by
+  /// calling take(). The destructor does not verify that close was successful.
+  /// Therefore, never allow this class to call close on a file descriptor that
+  /// has been read from or written to.
+  struct AutoFD {
+    int FileDescriptor;
+
+    AutoFD(int fd) : FileDescriptor(fd) {}
+    ~AutoFD() {
+      if (FileDescriptor >= 0)
+        ::close(FileDescriptor);
+    }
+
+    int take() {
+      int ret = FileDescriptor;
+      FileDescriptor = -1;
+      return ret;
+    }
+
+    operator int() const {return FileDescriptor;}
+  };
+
+  error_code TempDir(SmallVectorImpl<char> &result) {
+    // FIXME: Don't use TMPDIR if program is SUID or SGID enabled.
+    const char *dir = 0;
+    (dir = std::getenv("TMPDIR" )) ||
+    (dir = std::getenv("TMP"    )) ||
+    (dir = std::getenv("TEMP"   )) ||
+    (dir = std::getenv("TEMPDIR")) ||
+#ifdef P_tmpdir
+    (dir = P_tmpdir) ||
+#endif
+    (dir = "/tmp");
+
+    result.clear();
+    StringRef d(dir);
+    result.append(d.begin(), d.end());
+    return error_code::success();
+  }
+}
+
+namespace llvm {
+namespace sys  {
+namespace fs {
+
+error_code current_path(SmallVectorImpl<char> &result) {
+#ifdef MAXPATHLEN
+  result.reserve(MAXPATHLEN);
+#else
+// For GNU Hurd
+  result.reserve(1024);
+#endif
+
+  while (true) {
+    if (::getcwd(result.data(), result.capacity()) == 0) {
+      // See if there was a real error.
+      if (errno != errc::not_enough_memory)
+        return error_code(errno, system_category());
+      // Otherwise there just wasn't enough space.
+      result.reserve(result.capacity() * 2);
+    } else
+      break;
+  }
+
+  result.set_size(strlen(result.data()));
+  return error_code::success();
+}
+
+error_code copy_file(const Twine &from, const Twine &to, copy_option copt) {
+ // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toNullTerminatedStringRef(from_storage);
+  StringRef t = to.toNullTerminatedStringRef(to_storage);
+
+  const size_t buf_sz = 32768;
+  char buffer[buf_sz];
+  int from_file = -1, to_file = -1;
+
+  // Open from.
+  if ((from_file = ::open(f.begin(), O_RDONLY)) < 0)
+    return error_code(errno, system_category());
+  AutoFD from_fd(from_file);
+
+  // Stat from.
+  struct stat from_stat;
+  if (::stat(f.begin(), &from_stat) != 0)
+    return error_code(errno, system_category());
+
+  // Setup to flags.
+  int to_flags = O_CREAT | O_WRONLY;
+  if (copt == copy_option::fail_if_exists)
+    to_flags |= O_EXCL;
+
+  // Open to.
+  if ((to_file = ::open(t.begin(), to_flags, from_stat.st_mode)) < 0)
+    return error_code(errno, system_category());
+  AutoFD to_fd(to_file);
+
+  // Copy!
+  ssize_t sz, sz_read = 1, sz_write;
+  while (sz_read > 0 &&
+         (sz_read = ::read(from_fd, buffer, buf_sz)) > 0) {
+    // Allow for partial writes - see Advanced Unix Programming (2nd Ed.),
+    // Marc Rochkind, Addison-Wesley, 2004, page 94
+    sz_write = 0;
+    do {
+      if ((sz = ::write(to_fd, buffer + sz_write, sz_read - sz_write)) < 0) {
+        sz_read = sz;  // cause read loop termination.
+        break;         // error.
+      }
+      sz_write += sz;
+    } while (sz_write < sz_read);
+  }
+
+  // After all the file operations above the return value of close actually
+  // matters.
+  if (::close(from_fd.take()) < 0) sz_read = -1;
+  if (::close(to_fd.take()) < 0) sz_read = -1;
+
+  // Check for errors.
+  if (sz_read < 0)
+    return error_code(errno, system_category());
+
+  return error_code::success();
+}
+
+error_code create_directory(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+  if (::mkdir(p.begin(), S_IRWXU | S_IRWXG) == -1) {
+    if (errno != errc::file_exists)
+      return error_code(errno, system_category());
+    existed = true;
+  } else
+    existed = false;
+
+  return error_code::success();
+}
+
+error_code create_hard_link(const Twine &to, const Twine &from) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toNullTerminatedStringRef(from_storage);
+  StringRef t = to.toNullTerminatedStringRef(to_storage);
+
+  if (::link(t.begin(), f.begin()) == -1)
+    return error_code(errno, system_category());
+
+  return error_code::success();
+}
+
+error_code create_symlink(const Twine &to, const Twine &from) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toNullTerminatedStringRef(from_storage);
+  StringRef t = to.toNullTerminatedStringRef(to_storage);
+
+  if (::symlink(t.begin(), f.begin()) == -1)
+    return error_code(errno, system_category());
+
+  return error_code::success();
+}
+
+error_code remove(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+  if (::remove(p.begin()) == -1) {
+    if (errno != errc::no_such_file_or_directory)
+      return error_code(errno, system_category());
+    existed = false;
+  } else
+    existed = true;
+
+  return error_code::success();
+}
+
+error_code rename(const Twine &from, const Twine &to) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toNullTerminatedStringRef(from_storage);
+  StringRef t = to.toNullTerminatedStringRef(to_storage);
+
+  if (::rename(f.begin(), t.begin()) == -1) {
+    // If it's a cross device link, copy then delete, otherwise return the error
+    if (errno == EXDEV) {
+      if (error_code ec = copy_file(from, to, copy_option::overwrite_if_exists))
+        return ec;
+      bool Existed;
+      if (error_code ec = remove(from, Existed))
+        return ec;
+    } else
+      return error_code(errno, system_category());
+  }
+
+  return error_code::success();
+}
+
+error_code resize_file(const Twine &path, uint64_t size) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+  if (::truncate(p.begin(), size) == -1)
+    return error_code(errno, system_category());
+
+  return error_code::success();
+}
+
+error_code exists(const Twine &path, bool &result) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+  if (::access(p.begin(), F_OK) == -1) {
+    if (errno != errc::no_such_file_or_directory)
+      return error_code(errno, system_category());
+    result = false;
+  } else
+    result = true;
+
+  return error_code::success();
+}
+
+bool equivalent(file_status A, file_status B) {
+  assert(status_known(A) && status_known(B));
+  return A.fs_st_dev == B.fs_st_dev &&
+         A.fs_st_ino == B.fs_st_ino;
+}
+
+error_code equivalent(const Twine &A, const Twine &B, bool &result) {
+  file_status fsA, fsB;
+  if (error_code ec = status(A, fsA)) return ec;
+  if (error_code ec = status(B, fsB)) return ec;
+  result = equivalent(fsA, fsB);
+  return error_code::success();
+}
+
+error_code file_size(const Twine &path, uint64_t &result) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+  struct stat status;
+  if (::stat(p.begin(), &status) == -1)
+    return error_code(errno, system_category());
+  if (!S_ISREG(status.st_mode))
+    return make_error_code(errc::operation_not_permitted);
+
+  result = status.st_size;
+  return error_code::success();
+}
+
+error_code status(const Twine &path, file_status &result) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+  struct stat status;
+  if (::stat(p.begin(), &status) != 0) {
+    error_code ec(errno, system_category());
+    if (ec == errc::no_such_file_or_directory)
+      result = file_status(file_type::file_not_found);
+    else
+      result = file_status(file_type::status_error);
+    return ec;
+  }
+
+  perms prms = static_cast<perms>(status.st_mode & perms_mask);
+  
+  if (S_ISDIR(status.st_mode))
+    result = file_status(file_type::directory_file, prms);
+  else if (S_ISREG(status.st_mode))
+    result = file_status(file_type::regular_file, prms);
+  else if (S_ISBLK(status.st_mode))
+    result = file_status(file_type::block_file, prms);
+  else if (S_ISCHR(status.st_mode))
+    result = file_status(file_type::character_file, prms);
+  else if (S_ISFIFO(status.st_mode))
+    result = file_status(file_type::fifo_file, prms);
+  else if (S_ISSOCK(status.st_mode))
+    result = file_status(file_type::socket_file, prms);
+  else
+    result = file_status(file_type::type_unknown, prms);
+
+  result.fs_st_dev = status.st_dev;
+  result.fs_st_ino = status.st_ino;
+
+  return error_code::success();
+}
+
+// Modifies permissions on a file.
+error_code permissions(const Twine &path, perms prms) {
+  if ((prms & add_perms) && (prms & remove_perms))
+    llvm_unreachable("add_perms and remove_perms are mutually exclusive");
+
+  // Get current permissions
+  file_status info;
+  if (error_code ec = status(path, info)) {
+    return ec;
+  }
+  
+  // Set updated permissions.
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
+  perms permsToSet;
+  if (prms & add_perms) {
+    permsToSet = (info.permissions() | prms) & perms_mask;
+  } else if (prms & remove_perms) {
+    permsToSet = (info.permissions() & ~prms) & perms_mask;
+  } else {
+    permsToSet = prms & perms_mask;
+  }
+  if (::chmod(p.begin(), static_cast<mode_t>(permsToSet))) {
+    return error_code(errno, system_category()); 
+  }
+
+  return error_code::success();
+}
+
+// Since this is most often used for temporary files, mode defaults to 0600.
+error_code unique_file(const Twine &model, int &result_fd,
+                       SmallVectorImpl<char> &result_path,
+                       bool makeAbsolute, unsigned mode) {
+  SmallString<128> Model;
+  model.toVector(Model);
+  // Null terminate.
+  Model.c_str();
+
+  if (makeAbsolute) {
+    // Make model absolute by prepending a temp directory if it's not already.
+    bool absolute = path::is_absolute(Twine(Model));
+    if (!absolute) {
+      SmallString<128> TDir;
+      if (error_code ec = TempDir(TDir)) return ec;
+      path::append(TDir, Twine(Model));
+      Model.swap(TDir);
+    }
+  }
+
+  // From here on, DO NOT modify model. It may be needed if the randomly chosen
+  // path already exists.
+  SmallString<128> RandomPath = Model;
+
+retry_random_path:
+  // Replace '%' with random chars.
+  for (unsigned i = 0, e = Model.size(); i != e; ++i) {
+    if (Model[i] == '%')
+      RandomPath[i] = "0123456789abcdef"[sys::Process::GetRandomNumber() & 15];
+  }
+
+  // Make sure we don't fall into an infinite loop by constantly trying
+  // to create the parent path.
+  bool TriedToCreateParent = false;
+
+  // Try to open + create the file.
+rety_open_create:
+  int RandomFD = ::open(RandomPath.c_str(), O_RDWR | O_CREAT | O_EXCL, mode);
+  if (RandomFD == -1) {
+    int SavedErrno = errno;
+    // If the file existed, try again, otherwise, error.
+    if (SavedErrno == errc::file_exists)
+      goto retry_random_path;
+    // If path prefix doesn't exist, try to create it.
+    if (SavedErrno == errc::no_such_file_or_directory &&
+        !exists(path::parent_path(RandomPath)) &&
+        !TriedToCreateParent) {
+      TriedToCreateParent = true;
+      StringRef p(RandomPath);
+      SmallString<64> dir_to_create;
+      for (path::const_iterator i = path::begin(p),
+                                e = --path::end(p); i != e; ++i) {
+        path::append(dir_to_create, *i);
+        bool Exists;
+        if (error_code ec = exists(Twine(dir_to_create), Exists)) return ec;
+        if (!Exists) {
+          // Don't try to create network paths.
+          if (i->size() > 2 && (*i)[0] == '/' &&
+                               (*i)[1] == '/' &&
+                               (*i)[2] != '/')
+            return make_error_code(errc::no_such_file_or_directory);
+          if (::mkdir(dir_to_create.c_str(), 0700) == -1 &&
+              errno != errc::file_exists)
+            return error_code(errno, system_category());
+        }
+      }
+      goto rety_open_create;
+    }
+
+    return error_code(SavedErrno, system_category());
+  }
+
+   // Make the path absolute.
+  char real_path_buff[PATH_MAX + 1];
+  if (realpath(RandomPath.c_str(), real_path_buff) == NULL) {
+    int error = errno;
+    ::close(RandomFD);
+    ::unlink(RandomPath.c_str());
+    return error_code(error, system_category());
+  }
+
+  result_path.clear();
+  StringRef d(real_path_buff);
+  result_path.append(d.begin(), d.end());
+
+  result_fd = RandomFD;
+  return error_code::success();
+}
+
+error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
+  AutoFD ScopedFD(FD);
+  if (!CloseFD)
+    ScopedFD.take();
+
+  // Figure out how large the file is.
+  struct stat FileInfo;
+  if (fstat(FD, &FileInfo) == -1)
+    return error_code(errno, system_category());
+  uint64_t FileSize = FileInfo.st_size;
+
+  if (Size == 0)
+    Size = FileSize;
+  else if (FileSize < Size) {
+    // We need to grow the file.
+    if (ftruncate(FD, Size) == -1)
+      return error_code(errno, system_category());
+  }
+
+  int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
+  int prot = (Mode == readonly) ? PROT_READ : (PROT_READ | PROT_WRITE);
+#ifdef MAP_FILE
+  flags |= MAP_FILE;
+#endif
+  Mapping = ::mmap(0, Size, prot, flags, FD, Offset);
+  if (Mapping == MAP_FAILED)
+    return error_code(errno, system_category());
+  return error_code::success();
+}
+
+mapped_file_region::mapped_file_region(const Twine &path,
+                                       mapmode mode,
+                                       uint64_t length,
+                                       uint64_t offset,
+                                       error_code &ec)
+  : Mode(mode)
+  , Size(length)
+  , Mapping() {
+  // Make sure that the requested size fits within SIZE_T.
+  if (length > std::numeric_limits<size_t>::max()) {
+    ec = make_error_code(errc::invalid_argument);
+    return;
+  }
+
+  SmallString<128> path_storage;
+  StringRef name = path.toNullTerminatedStringRef(path_storage);
+  int oflags = (mode == readonly) ? O_RDONLY : O_RDWR;
+  int ofd = ::open(name.begin(), oflags);
+  if (ofd == -1) {
+    ec = error_code(errno, system_category());
+    return;
+  }
+
+  ec = init(ofd, true, offset);
+  if (ec)
+    Mapping = 0;
+}
+
+mapped_file_region::mapped_file_region(int fd,
+                                       bool closefd,
+                                       mapmode mode,
+                                       uint64_t length,
+                                       uint64_t offset,
+                                       error_code &ec)
+  : Mode(mode)
+  , Size(length)
+  , Mapping() {
+  // Make sure that the requested size fits within SIZE_T.
+  if (length > std::numeric_limits<size_t>::max()) {
+    ec = make_error_code(errc::invalid_argument);
+    return;
+  }
+
+  ec = init(fd, closefd, offset);
+  if (ec)
+    Mapping = 0;
+}
+
+mapped_file_region::~mapped_file_region() {
+  if (Mapping)
+    ::munmap(Mapping, Size);
+}
+
+#if LLVM_HAS_RVALUE_REFERENCES
+mapped_file_region::mapped_file_region(mapped_file_region &&other)
+  : Mode(other.Mode), Size(other.Size), Mapping(other.Mapping) {
+  other.Mapping = 0;
+}
+#endif
+
+mapped_file_region::mapmode mapped_file_region::flags() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return Mode;
+}
+
+uint64_t mapped_file_region::size() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return Size;
+}
+
+char *mapped_file_region::data() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  assert(Mode != readonly && "Cannot get non const data for readonly mapping!");
+  return reinterpret_cast<char*>(Mapping);
+}
+
+const char *mapped_file_region::const_data() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return reinterpret_cast<const char*>(Mapping);
+}
+
+int mapped_file_region::alignment() {
+  return process::get_self()->page_size();
+}
+
+error_code detail::directory_iterator_construct(detail::DirIterState &it,
+                                                StringRef path){
+  SmallString<128> path_null(path);
+  DIR *directory = ::opendir(path_null.c_str());
+  if (directory == 0)
+    return error_code(errno, system_category());
+
+  it.IterationHandle = reinterpret_cast<intptr_t>(directory);
+  // Add something for replace_filename to replace.
+  path::append(path_null, ".");
+  it.CurrentEntry = directory_entry(path_null.str());
+  return directory_iterator_increment(it);
+}
+
+error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
+  if (it.IterationHandle)
+    ::closedir(reinterpret_cast<DIR *>(it.IterationHandle));
+  it.IterationHandle = 0;
+  it.CurrentEntry = directory_entry();
+  return error_code::success();
+}
+
+error_code detail::directory_iterator_increment(detail::DirIterState &it) {
+  errno = 0;
+  dirent *cur_dir = ::readdir(reinterpret_cast<DIR *>(it.IterationHandle));
+  if (cur_dir == 0 && errno != 0) {
+    return error_code(errno, system_category());
+  } else if (cur_dir != 0) {
+    StringRef name(cur_dir->d_name, NAMLEN(cur_dir));
+    if ((name.size() == 1 && name[0] == '.') ||
+        (name.size() == 2 && name[0] == '.' && name[1] == '.'))
+      return directory_iterator_increment(it);
+    it.CurrentEntry.replace_filename(name);
+  } else
+    return directory_iterator_destruct(it);
+
+  return error_code::success();
+}
+
+error_code get_magic(const Twine &path, uint32_t len,
+                     SmallVectorImpl<char> &result) {
+  SmallString<128> PathStorage;
+  StringRef Path = path.toNullTerminatedStringRef(PathStorage);
+  result.set_size(0);
+
+  // Open path.
+  std::FILE *file = std::fopen(Path.data(), "rb");
+  if (file == 0)
+    return error_code(errno, system_category());
+
+  // Reserve storage.
+  result.reserve(len);
+
+  // Read magic!
+  size_t size = std::fread(result.data(), 1, len, file);
+  if (std::ferror(file) != 0) {
+    std::fclose(file);
+    return error_code(errno, system_category());
+  } else if (size != result.size()) {
+    if (std::feof(file) != 0) {
+      std::fclose(file);
+      result.set_size(size);
+      return make_error_code(errc::value_too_large);
+    }
+  }
+  std::fclose(file);
+  result.set_size(len);
+  return error_code::success();
+}
+
+error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,  
+                                            bool map_writable, void *&result) {
+  SmallString<128> path_storage;
+  StringRef name = path.toNullTerminatedStringRef(path_storage);
+  int oflags = map_writable ? O_RDWR : O_RDONLY;
+  int ofd = ::open(name.begin(), oflags);
+  if ( ofd == -1 )
+    return error_code(errno, system_category());
+  AutoFD fd(ofd);
+  int flags = map_writable ? MAP_SHARED : MAP_PRIVATE;
+  int prot = map_writable ? (PROT_READ|PROT_WRITE) : PROT_READ;
+#ifdef MAP_FILE
+  flags |= MAP_FILE;
+#endif
+  result = ::mmap(0, size, prot, flags, fd, file_offset);
+  if (result == MAP_FAILED) {
+    return error_code(errno, system_category());
+  }
+  
+  return error_code::success();
+}
+
+error_code unmap_file_pages(void *base, size_t size) {
+  if ( ::munmap(base, size) == -1 )
+    return error_code(errno, system_category());
+   
+  return error_code::success();
+}
+
+
+} // end namespace fs
+} // end namespace sys
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Support/Unix/Process.inc b/contrib/llvm/lib/Support/Unix/Process.inc
new file mode 100644
index 000000000000..9a4454f1c650
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Process.inc
@@ -0,0 +1,343 @@
+//===- Unix/Process.cpp - Unix Process Implementation --------- -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the generic Unix implementation of the Process class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/Support/TimeValue.h"
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+// DragonFlyBSD, OpenBSD, and Bitrig have deprecated <malloc.h> for
+// <stdlib.h> instead. Unix.h includes this for us already.
+#if defined(HAVE_MALLOC_H) && !defined(__DragonFly__) && \
+    !defined(__OpenBSD__) && !defined(__Bitrig__)
+#include <malloc.h>
+#endif
+#ifdef HAVE_MALLOC_MALLOC_H
+#include <malloc/malloc.h>
+#endif
+#ifdef HAVE_SYS_IOCTL_H
+#  include <sys/ioctl.h>
+#endif
+#ifdef HAVE_TERMIOS_H
+#  include <termios.h>
+#endif
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+using namespace llvm;
+using namespace sys;
+
+
+process::id_type self_process::get_id() {
+  return getpid();
+}
+
+static std::pair<TimeValue, TimeValue> getRUsageTimes() {
+#if defined(HAVE_GETRUSAGE)
+  struct rusage RU;
+  ::getrusage(RUSAGE_SELF, &RU);
+  return std::make_pair(
+      TimeValue(
+          static_cast<TimeValue::SecondsType>(RU.ru_utime.tv_sec),
+          static_cast<TimeValue::NanoSecondsType>(
+              RU.ru_utime.tv_usec * TimeValue::NANOSECONDS_PER_MICROSECOND)),
+      TimeValue(
+          static_cast<TimeValue::SecondsType>(RU.ru_stime.tv_sec),
+          static_cast<TimeValue::NanoSecondsType>(
+              RU.ru_stime.tv_usec * TimeValue::NANOSECONDS_PER_MICROSECOND)));
+#else
+#warning Cannot get usage times on this platform
+  return std::make_pair(TimeValue(), TimeValue());
+#endif
+}
+
+TimeValue self_process::get_user_time() const {
+#if _POSIX_TIMERS > 0 && _POSIX_CPUTIME > 0
+  // Try to get a high resolution CPU timer.
+  struct timespec TS;
+  if (::clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &TS) == 0)
+    return TimeValue(static_cast<TimeValue::SecondsType>(TS.tv_sec),
+                     static_cast<TimeValue::NanoSecondsType>(TS.tv_nsec));
+#endif
+
+  // Otherwise fall back to rusage based timing.
+  return getRUsageTimes().first;
+}
+
+TimeValue self_process::get_system_time() const {
+  // We can only collect system time by inspecting the results of getrusage.
+  return getRUsageTimes().second;
+}
+
+static unsigned getPageSize() {
+#if defined(__CYGWIN__)
+  // On Cygwin, getpagesize() returns 64k but the page size for the purposes of
+  // memory protection and mmap() is 4k.
+  // See http://www.cygwin.com/ml/cygwin/2009-01/threads.html#00492
+  const int page_size = 0x1000;
+#elif defined(HAVE_GETPAGESIZE)
+  const int page_size = ::getpagesize();
+#elif defined(HAVE_SYSCONF)
+  long page_size = ::sysconf(_SC_PAGE_SIZE);
+#else
+#warning Cannot get the page size on this machine
+#endif
+  return static_cast<unsigned>(page_size);
+}
+
+// This constructor guaranteed to be run exactly once on a single thread, and
+// sets up various process invariants that can be queried cheaply from then on.
+self_process::self_process() : PageSize(getPageSize()) {
+}
+
+
+size_t Process::GetMallocUsage() {
+#if defined(HAVE_MALLINFO)
+  struct mallinfo mi;
+  mi = ::mallinfo();
+  return mi.uordblks;
+#elif defined(HAVE_MALLOC_ZONE_STATISTICS) && defined(HAVE_MALLOC_MALLOC_H)
+  malloc_statistics_t Stats;
+  malloc_zone_statistics(malloc_default_zone(), &Stats);
+  return Stats.size_in_use;   // darwin
+#elif defined(HAVE_SBRK)
+  // Note this is only an approximation and more closely resembles
+  // the value returned by mallinfo in the arena field.
+  static char *StartOfMemory = reinterpret_cast<char*>(::sbrk(0));
+  char *EndOfMemory = (char*)sbrk(0);
+  if (EndOfMemory != ((char*)-1) && StartOfMemory != ((char*)-1))
+    return EndOfMemory - StartOfMemory;
+  else
+    return 0;
+#else
+#warning Cannot get malloc info on this platform
+  return 0;
+#endif
+}
+
+void Process::GetTimeUsage(TimeValue &elapsed, TimeValue &user_time,
+                           TimeValue &sys_time) {
+  elapsed = TimeValue::now();
+  llvm::tie(user_time, sys_time) = getRUsageTimes();
+}
+
+int Process::GetCurrentUserId() {
+  return getuid();
+}
+
+int Process::GetCurrentGroupId() {
+  return getgid();
+}
+
+#if defined(HAVE_MACH_MACH_H) && !defined(__GNU__)
+#include <mach/mach.h>
+#endif
+
+// Some LLVM programs such as bugpoint produce core files as a normal part of
+// their operation. To prevent the disk from filling up, this function
+// does what's necessary to prevent their generation.
+void Process::PreventCoreFiles() {
+#if HAVE_SETRLIMIT
+  struct rlimit rlim;
+  rlim.rlim_cur = rlim.rlim_max = 0;
+  setrlimit(RLIMIT_CORE, &rlim);
+#endif
+
+#if defined(HAVE_MACH_MACH_H) && !defined(__GNU__)
+  // Disable crash reporting on Mac OS X 10.0-10.4
+
+  // get information about the original set of exception ports for the task
+  mach_msg_type_number_t Count = 0;
+  exception_mask_t OriginalMasks[EXC_TYPES_COUNT];
+  exception_port_t OriginalPorts[EXC_TYPES_COUNT];
+  exception_behavior_t OriginalBehaviors[EXC_TYPES_COUNT];
+  thread_state_flavor_t OriginalFlavors[EXC_TYPES_COUNT];
+  kern_return_t err =
+    task_get_exception_ports(mach_task_self(), EXC_MASK_ALL, OriginalMasks,
+                             &Count, OriginalPorts, OriginalBehaviors,
+                             OriginalFlavors);
+  if (err == KERN_SUCCESS) {
+    // replace each with MACH_PORT_NULL.
+    for (unsigned i = 0; i != Count; ++i)
+      task_set_exception_ports(mach_task_self(), OriginalMasks[i],
+                               MACH_PORT_NULL, OriginalBehaviors[i],
+                               OriginalFlavors[i]);
+  }
+
+  // Disable crash reporting on Mac OS X 10.5
+  signal(SIGABRT, _exit);
+  signal(SIGILL,  _exit);
+  signal(SIGFPE,  _exit);
+  signal(SIGSEGV, _exit);
+  signal(SIGBUS,  _exit);
+#endif
+}
+
+bool Process::StandardInIsUserInput() {
+  return FileDescriptorIsDisplayed(STDIN_FILENO);
+}
+
+bool Process::StandardOutIsDisplayed() {
+  return FileDescriptorIsDisplayed(STDOUT_FILENO);
+}
+
+bool Process::StandardErrIsDisplayed() {
+  return FileDescriptorIsDisplayed(STDERR_FILENO);
+}
+
+bool Process::FileDescriptorIsDisplayed(int fd) {
+#if HAVE_ISATTY
+  return isatty(fd);
+#else
+  // If we don't have isatty, just return false.
+  return false;
+#endif
+}
+
+static unsigned getColumns(int FileID) {
+  // If COLUMNS is defined in the environment, wrap to that many columns.
+  if (const char *ColumnsStr = std::getenv("COLUMNS")) {
+    int Columns = std::atoi(ColumnsStr);
+    if (Columns > 0)
+      return Columns;
+  }
+
+  unsigned Columns = 0;
+
+#if defined(HAVE_SYS_IOCTL_H) && defined(HAVE_TERMIOS_H)
+  // Try to determine the width of the terminal.
+  struct winsize ws;
+  // Zero-fill ws to avoid a false positive from MemorySanitizer.
+  memset(&ws, 0, sizeof(ws));
+  if (ioctl(FileID, TIOCGWINSZ, &ws) == 0)
+    Columns = ws.ws_col;
+#endif
+
+  return Columns;
+}
+
+unsigned Process::StandardOutColumns() {
+  if (!StandardOutIsDisplayed())
+    return 0;
+
+  return getColumns(1);
+}
+
+unsigned Process::StandardErrColumns() {
+  if (!StandardErrIsDisplayed())
+    return 0;
+
+  return getColumns(2);
+}
+
+static bool terminalHasColors() {
+  if (const char *term = std::getenv("TERM")) {
+    // Most modern terminals support ANSI escape sequences for colors.
+    // We could check terminfo, or have a list of known terms that support
+    // colors, but that would be overkill.
+    // The user can always ask for no colors by setting TERM to dumb, or
+    // using a commandline flag.
+    return strcmp(term, "dumb") != 0;
+  }
+  return false;
+}
+
+bool Process::FileDescriptorHasColors(int fd) {
+  // A file descriptor has colors if it is displayed and the terminal has
+  // colors.
+  return FileDescriptorIsDisplayed(fd) && terminalHasColors();
+}
+
+bool Process::StandardOutHasColors() {
+  return FileDescriptorHasColors(STDOUT_FILENO);
+}
+
+bool Process::StandardErrHasColors() {
+  return FileDescriptorHasColors(STDERR_FILENO);
+}
+
+bool Process::ColorNeedsFlush() {
+  // No, we use ANSI escape sequences.
+  return false;
+}
+
+#define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m"
+
+#define ALLCOLORS(FGBG,BOLD) {\
+    COLOR(FGBG, "0", BOLD),\
+    COLOR(FGBG, "1", BOLD),\
+    COLOR(FGBG, "2", BOLD),\
+    COLOR(FGBG, "3", BOLD),\
+    COLOR(FGBG, "4", BOLD),\
+    COLOR(FGBG, "5", BOLD),\
+    COLOR(FGBG, "6", BOLD),\
+    COLOR(FGBG, "7", BOLD)\
+  }
+
+static const char colorcodes[2][2][8][10] = {
+ { ALLCOLORS("3",""), ALLCOLORS("3","1;") },
+ { ALLCOLORS("4",""), ALLCOLORS("4","1;") }
+};
+
+const char *Process::OutputColor(char code, bool bold, bool bg) {
+  return colorcodes[bg?1:0][bold?1:0][code&7];
+}
+
+const char *Process::OutputBold(bool bg) {
+  return "\033[1m";
+}
+
+const char *Process::OutputReverse() {
+  return "\033[7m";
+}
+
+const char *Process::ResetColor() {
+  return "\033[0m";
+}
+
+#if !defined(HAVE_ARC4RANDOM)
+static unsigned GetRandomNumberSeed() {
+  // Attempt to get the initial seed from /dev/urandom, if possible.
+  if (FILE *RandomSource = ::fopen("/dev/urandom", "r")) {
+    unsigned seed;
+    int count = ::fread((void *)&seed, sizeof(seed), 1, RandomSource);
+    ::fclose(RandomSource);
+
+    // Return the seed if the read was successful.
+    if (count == 1)
+      return seed;
+  }
+
+  // Otherwise, swizzle the current time and the process ID to form a reasonable
+  // seed.
+  TimeValue Now = TimeValue::now();
+  return hash_combine(Now.seconds(), Now.nanoseconds(), ::getpid());
+}
+#endif
+
+unsigned llvm::sys::Process::GetRandomNumber() {
+#if defined(HAVE_ARC4RANDOM)
+  return arc4random();
+#else
+  static int x = (::srand(GetRandomNumberSeed()), 0);
+  (void)x;
+  return ::rand();
+#endif
+}
diff --git a/contrib/llvm/lib/Support/Unix/Program.inc b/contrib/llvm/lib/Support/Unix/Program.inc
new file mode 100644
index 000000000000..117151c91d8b
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Program.inc
@@ -0,0 +1,412 @@
+//===- llvm/Support/Unix/Program.cpp -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific portion of the Program class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/FileSystem.h"
+#include <llvm/Config/config.h>
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+#if HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+#if HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#ifdef HAVE_POSIX_SPAWN
+#include <spawn.h>
+#if !defined(__APPLE__)
+  extern char **environ;
+#else
+#include <crt_externs.h> // _NSGetEnviron
+#endif
+#endif
+
+namespace llvm {
+using namespace sys;
+
+Program::Program() : Data_(0) {}
+
+Program::~Program() {}
+
+// This function just uses the PATH environment variable to find the program.
+Path
+Program::FindProgramByName(const std::string& progName) {
+
+  // Check some degenerate cases
+  if (progName.length() == 0) // no program
+    return Path();
+  Path temp;
+  if (!temp.set(progName)) // invalid name
+    return Path();
+  // Use the given path verbatim if it contains any slashes; this matches
+  // the behavior of sh(1) and friends.
+  if (progName.find('/') != std::string::npos)
+    return temp;
+
+  // At this point, the file name is valid and does not contain slashes. Search
+  // for it through the directories specified in the PATH environment variable.
+
+  // Get the path. If its empty, we can't do anything to find it.
+  const char *PathStr = getenv("PATH");
+  if (PathStr == 0)
+    return Path();
+
+  // Now we have a colon separated list of directories to search; try them.
+  size_t PathLen = strlen(PathStr);
+  while (PathLen) {
+    // Find the first colon...
+    const char *Colon = std::find(PathStr, PathStr+PathLen, ':');
+
+    // Check to see if this first directory contains the executable...
+    Path FilePath;
+    if (FilePath.set(std::string(PathStr,Colon))) {
+      FilePath.appendComponent(progName);
+      if (FilePath.canExecute())
+        return FilePath;                    // Found the executable!
+    }
+
+    // Nope it wasn't in this directory, check the next path in the list!
+    PathLen -= Colon-PathStr;
+    PathStr = Colon;
+
+    // Advance past duplicate colons
+    while (*PathStr == ':') {
+      PathStr++;
+      PathLen--;
+    }
+  }
+  return Path();
+}
+
+static bool RedirectIO(const Path *Path, int FD, std::string* ErrMsg) {
+  if (Path == 0) // Noop
+    return false;
+  const char *File;
+  if (Path->isEmpty())
+    // Redirect empty paths to /dev/null
+    File = "/dev/null";
+  else
+    File = Path->c_str();
+
+  // Open the file
+  int InFD = open(File, FD == 0 ? O_RDONLY : O_WRONLY|O_CREAT, 0666);
+  if (InFD == -1) {
+    MakeErrMsg(ErrMsg, "Cannot open file '" + std::string(File) + "' for "
+              + (FD == 0 ? "input" : "output"));
+    return true;
+  }
+
+  // Install it as the requested FD
+  if (dup2(InFD, FD) == -1) {
+    MakeErrMsg(ErrMsg, "Cannot dup2");
+    close(InFD);
+    return true;
+  }
+  close(InFD);      // Close the original FD
+  return false;
+}
+
+#ifdef HAVE_POSIX_SPAWN
+static bool RedirectIO_PS(const Path *Path, int FD, std::string *ErrMsg,
+                          posix_spawn_file_actions_t *FileActions) {
+  if (Path == 0) // Noop
+    return false;
+  const char *File;
+  if (Path->isEmpty())
+    // Redirect empty paths to /dev/null
+    File = "/dev/null";
+  else
+    File = Path->c_str();
+
+  if (int Err = posix_spawn_file_actions_addopen(FileActions, FD,
+                            File, FD == 0 ? O_RDONLY : O_WRONLY|O_CREAT, 0666))
+    return MakeErrMsg(ErrMsg, "Cannot dup2", Err);
+  return false;
+}
+#endif
+
+static void TimeOutHandler(int Sig) {
+}
+
+static void SetMemoryLimits (unsigned size)
+{
+#if HAVE_SYS_RESOURCE_H && HAVE_GETRLIMIT && HAVE_SETRLIMIT
+  struct rlimit r;
+  __typeof__ (r.rlim_cur) limit = (__typeof__ (r.rlim_cur)) (size) * 1048576;
+
+  // Heap size
+  getrlimit (RLIMIT_DATA, &r);
+  r.rlim_cur = limit;
+  setrlimit (RLIMIT_DATA, &r);
+#ifdef RLIMIT_RSS
+  // Resident set size.
+  getrlimit (RLIMIT_RSS, &r);
+  r.rlim_cur = limit;
+  setrlimit (RLIMIT_RSS, &r);
+#endif
+#ifdef RLIMIT_AS  // e.g. NetBSD doesn't have it.
+  // Don't set virtual memory limit if built with any Sanitizer. They need 80Tb
+  // of virtual memory for shadow memory mapping.
+#if !LLVM_MEMORY_SANITIZER_BUILD && !LLVM_ADDRESS_SANITIZER_BUILD
+  // Virtual memory.
+  getrlimit (RLIMIT_AS, &r);
+  r.rlim_cur = limit;
+  setrlimit (RLIMIT_AS, &r);
+#endif
+#endif
+#endif
+}
+
+bool
+Program::Execute(const Path &path, const char **args, const char **envp,
+                 const Path **redirects, unsigned memoryLimit,
+                  std::string *ErrMsg) {
+  // If this OS has posix_spawn and there is no memory limit being implied, use
+  // posix_spawn.  It is more efficient than fork/exec.
+#ifdef HAVE_POSIX_SPAWN
+  if (memoryLimit == 0) {
+    posix_spawn_file_actions_t FileActionsStore;
+    posix_spawn_file_actions_t *FileActions = 0;
+
+    if (redirects) {
+      FileActions = &FileActionsStore;
+      posix_spawn_file_actions_init(FileActions);
+
+      // Redirect stdin/stdout.
+      if (RedirectIO_PS(redirects[0], 0, ErrMsg, FileActions) ||
+          RedirectIO_PS(redirects[1], 1, ErrMsg, FileActions))
+        return false;
+      if (redirects[1] == 0 || redirects[2] == 0 ||
+          *redirects[1] != *redirects[2]) {
+        // Just redirect stderr
+        if (RedirectIO_PS(redirects[2], 2, ErrMsg, FileActions)) return false;
+      } else {
+        // If stdout and stderr should go to the same place, redirect stderr
+        // to the FD already open for stdout.
+        if (int Err = posix_spawn_file_actions_adddup2(FileActions, 1, 2))
+          return !MakeErrMsg(ErrMsg, "Can't redirect stderr to stdout", Err);
+      }
+    }
+
+    if (!envp)
+#if !defined(__APPLE__)
+      envp = const_cast<const char **>(environ);
+#else
+      // environ is missing in dylibs.
+      envp = const_cast<const char **>(*_NSGetEnviron());
+#endif
+
+    // Explicitly initialized to prevent what appears to be a valgrind false
+    // positive.
+    pid_t PID = 0;
+    int Err = posix_spawn(&PID, path.c_str(), FileActions, /*attrp*/0,
+                          const_cast<char **>(args), const_cast<char **>(envp));
+
+    if (FileActions)
+      posix_spawn_file_actions_destroy(FileActions);
+
+    if (Err)
+     return !MakeErrMsg(ErrMsg, "posix_spawn failed", Err);
+
+    Data_ = reinterpret_cast<void*>(PID);
+    return true;
+  }
+#endif
+
+  // Create a child process.
+  int child = fork();
+  switch (child) {
+    // An error occurred:  Return to the caller.
+    case -1:
+      MakeErrMsg(ErrMsg, "Couldn't fork");
+      return false;
+
+    // Child process: Execute the program.
+    case 0: {
+      // Redirect file descriptors...
+      if (redirects) {
+        // Redirect stdin
+        if (RedirectIO(redirects[0], 0, ErrMsg)) { return false; }
+        // Redirect stdout
+        if (RedirectIO(redirects[1], 1, ErrMsg)) { return false; }
+        if (redirects[1] && redirects[2] &&
+            *(redirects[1]) == *(redirects[2])) {
+          // If stdout and stderr should go to the same place, redirect stderr
+          // to the FD already open for stdout.
+          if (-1 == dup2(1,2)) {
+            MakeErrMsg(ErrMsg, "Can't redirect stderr to stdout");
+            return false;
+          }
+        } else {
+          // Just redirect stderr
+          if (RedirectIO(redirects[2], 2, ErrMsg)) { return false; }
+        }
+      }
+
+      // Set memory limits
+      if (memoryLimit!=0) {
+        SetMemoryLimits(memoryLimit);
+      }
+
+      // Execute!
+      if (envp != 0)
+        execve(path.c_str(),
+               const_cast<char **>(args),
+               const_cast<char **>(envp));
+      else
+        execv(path.c_str(),
+              const_cast<char **>(args));
+      // If the execve() failed, we should exit. Follow Unix protocol and
+      // return 127 if the executable was not found, and 126 otherwise.
+      // Use _exit rather than exit so that atexit functions and static
+      // object destructors cloned from the parent process aren't
+      // redundantly run, and so that any data buffered in stdio buffers
+      // cloned from the parent aren't redundantly written out.
+      _exit(errno == ENOENT ? 127 : 126);
+    }
+
+    // Parent process: Break out of the switch to do our processing.
+    default:
+      break;
+  }
+
+  Data_ = reinterpret_cast<void*>(child);
+
+  return true;
+}
+
+int
+Program::Wait(const sys::Path &path,
+              unsigned secondsToWait,
+              std::string* ErrMsg)
+{
+#ifdef HAVE_SYS_WAIT_H
+  struct sigaction Act, Old;
+
+  if (Data_ == 0) {
+    MakeErrMsg(ErrMsg, "Process not started!");
+    return -1;
+  }
+
+  // Install a timeout handler.  The handler itself does nothing, but the simple
+  // fact of having a handler at all causes the wait below to return with EINTR,
+  // unlike if we used SIG_IGN.
+  if (secondsToWait) {
+    memset(&Act, 0, sizeof(Act));
+    Act.sa_handler = TimeOutHandler;
+    sigemptyset(&Act.sa_mask);
+    sigaction(SIGALRM, &Act, &Old);
+    alarm(secondsToWait);
+  }
+
+  // Parent process: Wait for the child process to terminate.
+  int status;
+  uint64_t pid = reinterpret_cast<uint64_t>(Data_);
+  pid_t child = static_cast<pid_t>(pid);
+  while (waitpid(pid, &status, 0) != child)
+    if (secondsToWait && errno == EINTR) {
+      // Kill the child.
+      kill(child, SIGKILL);
+
+      // Turn off the alarm and restore the signal handler
+      alarm(0);
+      sigaction(SIGALRM, &Old, 0);
+
+      // Wait for child to die
+      if (wait(&status) != child)
+        MakeErrMsg(ErrMsg, "Child timed out but wouldn't die");
+      else
+        MakeErrMsg(ErrMsg, "Child timed out", 0);
+
+      return -2;   // Timeout detected
+    } else if (errno != EINTR) {
+      MakeErrMsg(ErrMsg, "Error waiting for child process");
+      return -1;
+    }
+
+  // We exited normally without timeout, so turn off the timer.
+  if (secondsToWait) {
+    alarm(0);
+    sigaction(SIGALRM, &Old, 0);
+  }
+
+  // Return the proper exit status. Detect error conditions
+  // so we can return -1 for them and set ErrMsg informatively.
+  int result = 0;
+  if (WIFEXITED(status)) {
+    result = WEXITSTATUS(status);
+#ifdef HAVE_POSIX_SPAWN
+    // The posix_spawn child process returns 127 on any kind of error.
+    // Following the POSIX convention for command-line tools (which posix_spawn
+    // itself apparently does not), check to see if the failure was due to some
+    // reason other than the file not existing, and return 126 in this case.
+    bool Exists;
+    if (result == 127 && !llvm::sys::fs::exists(path.str(), Exists) && Exists)
+      result = 126;
+#endif
+    if (result == 127) {
+      if (ErrMsg)
+        *ErrMsg = llvm::sys::StrError(ENOENT);
+      return -1;
+    }
+    if (result == 126) {
+      if (ErrMsg)
+        *ErrMsg = "Program could not be executed";
+      return -1;
+    }
+  } else if (WIFSIGNALED(status)) {
+    if (ErrMsg) {
+      *ErrMsg = strsignal(WTERMSIG(status));
+#ifdef WCOREDUMP
+      if (WCOREDUMP(status))
+        *ErrMsg += " (core dumped)";
+#endif
+    }
+    // Return a special value to indicate that the process received an unhandled
+    // signal during execution as opposed to failing to execute.
+    return -2;
+  }
+  return result;
+#else
+  if (ErrMsg)
+    *ErrMsg = "Program::Wait is not implemented on this platform yet!";
+  return -1;
+#endif
+}
+
+error_code Program::ChangeStdinToBinary(){
+  // Do nothing, as Unix doesn't differentiate between text and binary.
+  return make_error_code(errc::success);
+}
+
+error_code Program::ChangeStdoutToBinary(){
+  // Do nothing, as Unix doesn't differentiate between text and binary.
+  return make_error_code(errc::success);
+}
+
+error_code Program::ChangeStderrToBinary(){
+  // Do nothing, as Unix doesn't differentiate between text and binary.
+  return make_error_code(errc::success);
+}
+
+}
diff --git a/contrib/llvm/lib/Support/Unix/README.txt b/contrib/llvm/lib/Support/Unix/README.txt
new file mode 100644
index 000000000000..3d547c2990d5
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/README.txt
@@ -0,0 +1,16 @@
+llvm/lib/Support/Unix README
+===========================
+
+This directory provides implementations of the lib/System classes that
+are common to two or more variants of UNIX. For example, the directory
+structure underneath this directory could look like this:
+
+Unix           - only code that is truly generic to all UNIX platforms
+  Posix        - code that is specific to Posix variants of UNIX
+  SUS          - code that is specific to the Single Unix Specification
+  SysV         - code that is specific to System V variants of UNIX
+
+As a rule, only those directories actually needing to be created should be
+created. Also, further subdirectories could be created to reflect versions of
+the various standards. For example, under SUS there could be v1, v2, and v3
+subdirectories to reflect the three major versions of SUS.
diff --git a/contrib/llvm/lib/Support/Unix/RWMutex.inc b/contrib/llvm/lib/Support/Unix/RWMutex.inc
new file mode 100644
index 000000000000..40e87ff13111
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/RWMutex.inc
@@ -0,0 +1,43 @@
+//= llvm/Support/Unix/RWMutex.inc - Unix Reader/Writer Mutual Exclusion Lock  =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific (non-pthread) RWMutex class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+using namespace sys;
+
+RWMutexImpl::RWMutexImpl() { }
+
+RWMutexImpl::~RWMutexImpl() { }
+
+bool RWMutexImpl::reader_acquire() {
+  return true;
+}
+
+bool RWMutexImpl::reader_release() {
+  return true;
+}
+
+bool RWMutexImpl::writer_acquire() {
+  return true;
+}
+
+bool RWMutexImpl::writer_release() {
+  return true;
+}
+
+}
diff --git a/contrib/llvm/lib/Support/Unix/Signals.inc b/contrib/llvm/lib/Support/Unix/Signals.inc
new file mode 100644
index 000000000000..66338f17d88f
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Signals.inc
@@ -0,0 +1,377 @@
+//===- Signals.cpp - Generic Unix Signals Implementation -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines some helpful functions for dealing with the possibility of
+// Unix signals occurring while your program is running.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Mutex.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+#if HAVE_EXECINFO_H
+# include <execinfo.h>         // For backtrace().
+#endif
+#if HAVE_SIGNAL_H
+#include <signal.h>
+#endif
+#if HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#if HAVE_DLFCN_H && __GNUG__
+#include <dlfcn.h>
+#include <cxxabi.h>
+#endif
+#if HAVE_MACH_MACH_H
+#include <mach/mach.h>
+#endif
+
+using namespace llvm;
+
+static RETSIGTYPE SignalHandler(int Sig);  // defined below.
+
+static SmartMutex<true> SignalsMutex;
+
+/// InterruptFunction - The function to call if ctrl-c is pressed.
+static void (*InterruptFunction)() = 0;
+
+static std::vector<std::string> FilesToRemove;
+static std::vector<std::pair<void(*)(void*), void*> > CallBacksToRun;
+
+// IntSigs - Signals that represent requested termination. There's no bug
+// or failure, or if there is, it's not our direct responsibility. For whatever
+// reason, our continued execution is no longer desirable.
+static const int IntSigs[] = {
+  SIGHUP, SIGINT, SIGPIPE, SIGTERM, SIGUSR1, SIGUSR2
+};
+static const int *const IntSigsEnd =
+  IntSigs + sizeof(IntSigs) / sizeof(IntSigs[0]);
+
+// KillSigs - Signals that represent that we have a bug, and our prompt
+// termination has been ordered.
+static const int KillSigs[] = {
+  SIGILL, SIGTRAP, SIGABRT, SIGFPE, SIGBUS, SIGSEGV, SIGQUIT
+#ifdef SIGSYS
+  , SIGSYS
+#endif
+#ifdef SIGXCPU
+  , SIGXCPU
+#endif
+#ifdef SIGXFSZ
+  , SIGXFSZ
+#endif
+#ifdef SIGEMT
+  , SIGEMT
+#endif
+};
+static const int *const KillSigsEnd =
+  KillSigs + sizeof(KillSigs) / sizeof(KillSigs[0]);
+
+static unsigned NumRegisteredSignals = 0;
+static struct {
+  struct sigaction SA;
+  int SigNo;
+} RegisteredSignalInfo[(sizeof(IntSigs)+sizeof(KillSigs))/sizeof(KillSigs[0])];
+
+
+static void RegisterHandler(int Signal) {
+  assert(NumRegisteredSignals <
+         sizeof(RegisteredSignalInfo)/sizeof(RegisteredSignalInfo[0]) &&
+         "Out of space for signal handlers!");
+
+  struct sigaction NewHandler;
+
+  NewHandler.sa_handler = SignalHandler;
+  NewHandler.sa_flags = SA_NODEFER|SA_RESETHAND;
+  sigemptyset(&NewHandler.sa_mask);
+
+  // Install the new handler, save the old one in RegisteredSignalInfo.
+  sigaction(Signal, &NewHandler,
+            &RegisteredSignalInfo[NumRegisteredSignals].SA);
+  RegisteredSignalInfo[NumRegisteredSignals].SigNo = Signal;
+  ++NumRegisteredSignals;
+}
+
+static void RegisterHandlers() {
+  // If the handlers are already registered, we're done.
+  if (NumRegisteredSignals != 0) return;
+
+  std::for_each(IntSigs, IntSigsEnd, RegisterHandler);
+  std::for_each(KillSigs, KillSigsEnd, RegisterHandler);
+}
+
+static void UnregisterHandlers() {
+  // Restore all of the signal handlers to how they were before we showed up.
+  for (unsigned i = 0, e = NumRegisteredSignals; i != e; ++i)
+    sigaction(RegisteredSignalInfo[i].SigNo,
+              &RegisteredSignalInfo[i].SA, 0);
+  NumRegisteredSignals = 0;
+}
+
+
+/// RemoveFilesToRemove - Process the FilesToRemove list. This function
+/// should be called with the SignalsMutex lock held.
+/// NB: This must be an async signal safe function. It cannot allocate or free
+/// memory, even in debug builds.
+static void RemoveFilesToRemove() {
+  // We avoid iterators in case of debug iterators that allocate or release
+  // memory.
+  for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) {
+    // We rely on a std::string implementation for which repeated calls to
+    // 'c_str()' don't allocate memory. We pre-call 'c_str()' on all of these
+    // strings to try to ensure this is safe.
+    const char *path = FilesToRemove[i].c_str();
+
+    // Get the status so we can determine if it's a file or directory. If we
+    // can't stat the file, ignore it.
+    struct stat buf;
+    if (stat(path, &buf) != 0)
+      continue;
+
+    // If this is not a regular file, ignore it. We want to prevent removal of
+    // special files like /dev/null, even if the compiler is being run with the
+    // super-user permissions.
+    if (!S_ISREG(buf.st_mode))
+      continue;
+  
+    // Otherwise, remove the file. We ignore any errors here as there is nothing
+    // else we can do.
+    unlink(path);
+  }
+}
+
+// SignalHandler - The signal handler that runs.
+static RETSIGTYPE SignalHandler(int Sig) {
+  // Restore the signal behavior to default, so that the program actually
+  // crashes when we return and the signal reissues.  This also ensures that if
+  // we crash in our signal handler that the program will terminate immediately
+  // instead of recursing in the signal handler.
+  UnregisterHandlers();
+
+  // Unmask all potentially blocked kill signals.
+  sigset_t SigMask;
+  sigfillset(&SigMask);
+  sigprocmask(SIG_UNBLOCK, &SigMask, 0);
+
+  SignalsMutex.acquire();
+  RemoveFilesToRemove();
+
+  if (std::find(IntSigs, IntSigsEnd, Sig) != IntSigsEnd) {
+    if (InterruptFunction) {
+      void (*IF)() = InterruptFunction;
+      SignalsMutex.release();
+      InterruptFunction = 0;
+      IF();        // run the interrupt function.
+      return;
+    }
+
+    SignalsMutex.release();
+    raise(Sig);   // Execute the default handler.
+    return;
+  }
+
+  SignalsMutex.release();
+
+  // Otherwise if it is a fault (like SEGV) run any handler.
+  for (unsigned i = 0, e = CallBacksToRun.size(); i != e; ++i)
+    CallBacksToRun[i].first(CallBacksToRun[i].second);
+}
+
+void llvm::sys::RunInterruptHandlers() {
+  SignalsMutex.acquire();
+  RemoveFilesToRemove();
+  SignalsMutex.release();
+}
+
+void llvm::sys::SetInterruptFunction(void (*IF)()) {
+  SignalsMutex.acquire();
+  InterruptFunction = IF;
+  SignalsMutex.release();
+  RegisterHandlers();
+}
+
+// RemoveFileOnSignal - The public API
+bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename,
+                                   std::string* ErrMsg) {
+  SignalsMutex.acquire();
+  std::string *OldPtr = FilesToRemove.empty() ? 0 : &FilesToRemove[0];
+  FilesToRemove.push_back(Filename.str());
+
+  // We want to call 'c_str()' on every std::string in this vector so that if
+  // the underlying implementation requires a re-allocation, it happens here
+  // rather than inside of the signal handler. If we see the vector grow, we
+  // have to call it on every entry. If it remains in place, we only need to
+  // call it on the latest one.
+  if (OldPtr == &FilesToRemove[0])
+    FilesToRemove.back().c_str();
+  else
+    for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i)
+      FilesToRemove[i].c_str();
+
+  SignalsMutex.release();
+
+  RegisterHandlers();
+  return false;
+}
+
+// DontRemoveFileOnSignal - The public API
+void llvm::sys::DontRemoveFileOnSignal(const sys::Path &Filename) {
+  SignalsMutex.acquire();
+  std::vector<std::string>::reverse_iterator RI =
+    std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename.str());
+  std::vector<std::string>::iterator I = FilesToRemove.end();
+  if (RI != FilesToRemove.rend())
+    I = FilesToRemove.erase(RI.base()-1);
+
+  // We need to call c_str() on every element which would have been moved by
+  // the erase. These elements, in a C++98 implementation where c_str()
+  // requires a reallocation on the first call may have had the call to c_str()
+  // made on insertion become invalid by being copied down an element.
+  for (std::vector<std::string>::iterator E = FilesToRemove.end(); I != E; ++I)
+    I->c_str();
+
+  SignalsMutex.release();
+}
+
+/// AddSignalHandler - Add a function to be called when a signal is delivered
+/// to the process.  The handler can have a cookie passed to it to identify
+/// what instance of the handler it is.
+void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
+  CallBacksToRun.push_back(std::make_pair(FnPtr, Cookie));
+  RegisterHandlers();
+}
+
+
+// PrintStackTrace - In the case of a program crash or fault, print out a stack
+// trace so that the user has an indication of why and where we died.
+//
+// On glibc systems we have the 'backtrace' function, which works nicely, but
+// doesn't demangle symbols.
+void llvm::sys::PrintStackTrace(FILE *FD) {
+#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
+  static void* StackTrace[256];
+  // Use backtrace() to output a backtrace on Linux systems with glibc.
+  int depth = backtrace(StackTrace,
+                        static_cast<int>(array_lengthof(StackTrace)));
+#if HAVE_DLFCN_H && __GNUG__
+  int width = 0;
+  for (int i = 0; i < depth; ++i) {
+    Dl_info dlinfo;
+    dladdr(StackTrace[i], &dlinfo);
+    const char* name = strrchr(dlinfo.dli_fname, '/');
+
+    int nwidth;
+    if (name == NULL) nwidth = strlen(dlinfo.dli_fname);
+    else              nwidth = strlen(name) - 1;
+
+    if (nwidth > width) width = nwidth;
+  }
+
+  for (int i = 0; i < depth; ++i) {
+    Dl_info dlinfo;
+    dladdr(StackTrace[i], &dlinfo);
+
+    fprintf(FD, "%-2d", i);
+
+    const char* name = strrchr(dlinfo.dli_fname, '/');
+    if (name == NULL) fprintf(FD, " %-*s", width, dlinfo.dli_fname);
+    else              fprintf(FD, " %-*s", width, name+1);
+
+    fprintf(FD, " %#0*lx",
+            (int)(sizeof(void*) * 2) + 2, (unsigned long)StackTrace[i]);
+
+    if (dlinfo.dli_sname != NULL) {
+      int res;
+      fputc(' ', FD);
+      char* d = abi::__cxa_demangle(dlinfo.dli_sname, NULL, NULL, &res);
+      if (d == NULL) fputs(dlinfo.dli_sname, FD);
+      else           fputs(d, FD);
+      free(d);
+
+      // FIXME: When we move to C++11, use %t length modifier. It's not in
+      // C++03 and causes gcc to issue warnings. Losing the upper 32 bits of
+      // the stack offset for a stack dump isn't likely to cause any problems.
+      fprintf(FD, " + %u",(unsigned)((char*)StackTrace[i]-
+                                     (char*)dlinfo.dli_saddr));
+    }
+    fputc('\n', FD);
+  }
+#else
+  backtrace_symbols_fd(StackTrace, depth, STDERR_FILENO);
+#endif
+#endif
+}
+
+static void PrintStackTraceSignalHandler(void *) {
+  PrintStackTrace(stderr);
+}
+
+/// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
+/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
+void llvm::sys::PrintStackTraceOnErrorSignal() {
+  AddSignalHandler(PrintStackTraceSignalHandler, 0);
+
+#if defined(__APPLE__)
+  // Environment variable to disable any kind of crash dialog.
+  if (getenv("LLVM_DISABLE_CRASH_REPORT")) {
+    mach_port_t self = mach_task_self();
+
+    exception_mask_t mask = EXC_MASK_CRASH;
+
+    kern_return_t ret = task_set_exception_ports(self,
+                             mask,
+                             MACH_PORT_NULL,
+                             EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES,
+                             THREAD_STATE_NONE);
+    (void)ret;
+  }
+#endif
+}
+
+
+/***/
+
+// On Darwin, raise sends a signal to the main thread instead of the current
+// thread. This has the unfortunate effect that assert() and abort() will end up
+// bypassing our crash recovery attempts. We work around this for anything in
+// the same linkage unit by just defining our own versions of the assert handler
+// and abort.
+
+#ifdef __APPLE__
+
+#include <signal.h>
+#include <pthread.h>
+
+int raise(int sig) {
+  return pthread_kill(pthread_self(), sig);
+}
+
+void __assert_rtn(const char *func,
+                  const char *file,
+                  int line,
+                  const char *expr) {
+  if (func)
+    fprintf(stderr, "Assertion failed: (%s), function %s, file %s, line %d.\n",
+            expr, func, file, line);
+  else
+    fprintf(stderr, "Assertion failed: (%s), file %s, line %d.\n",
+            expr, file, line);
+  abort();
+}
+
+void abort() {
+  raise(SIGABRT);
+  usleep(1000);
+  __builtin_trap();
+}
+
+#endif
diff --git a/contrib/llvm/lib/Support/Unix/ThreadLocal.inc b/contrib/llvm/lib/Support/Unix/ThreadLocal.inc
new file mode 100644
index 000000000000..2b4c9017cd91
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/ThreadLocal.inc
@@ -0,0 +1,26 @@
+//=== llvm/Support/Unix/ThreadLocal.inc - Unix Thread Local Data -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific (non-pthread) ThreadLocal class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+using namespace sys;
+ThreadLocalImpl::ThreadLocalImpl() { }
+ThreadLocalImpl::~ThreadLocalImpl() { }
+void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);}
+const void* ThreadLocalImpl::getInstance() { return data; }
+void ThreadLocalImpl::removeInstance() { setInstance(0); }
+}
diff --git a/contrib/llvm/lib/Support/Unix/TimeValue.inc b/contrib/llvm/lib/Support/Unix/TimeValue.inc
new file mode 100644
index 000000000000..df8558bf8bed
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/TimeValue.inc
@@ -0,0 +1,57 @@
+//===- Unix/TimeValue.cpp - Unix TimeValue Implementation -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Unix specific portion of the TimeValue class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "Unix.h"
+
+namespace llvm {
+  using namespace sys;
+
+std::string TimeValue::str() const {
+  char buffer[32];
+
+  time_t ourTime = time_t(this->toEpochTime());
+#ifdef __hpux
+// note that the following line needs -D_REENTRANT on HP-UX to be picked up
+  asctime_r(localtime(&ourTime), buffer);
+#else
+  ::asctime_r(::localtime(&ourTime), buffer);
+#endif
+
+  std::string result(buffer);
+  return result.substr(0,24);
+}
+
+TimeValue TimeValue::now() {
+  struct timeval the_time;
+  timerclear(&the_time);
+  if (0 != ::gettimeofday(&the_time,0)) {
+    // This is *really* unlikely to occur because the only gettimeofday
+    // errors concern the timezone parameter which we're passing in as 0.
+    // In the unlikely case it does happen, just return MinTime, no error
+    // message needed.
+    return MinTime;
+  }
+
+  return TimeValue(
+    static_cast<TimeValue::SecondsType>( the_time.tv_sec +
+      PosixZeroTimeSeconds ),
+    static_cast<TimeValue::NanoSecondsType>( the_time.tv_usec *
+      NANOSECONDS_PER_MICROSECOND ) );
+}
+
+}
diff --git a/contrib/llvm/lib/Support/Unix/Unix.h b/contrib/llvm/lib/Support/Unix/Unix.h
new file mode 100644
index 000000000000..051f56f96922
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Unix.h
@@ -0,0 +1,81 @@
+//===- llvm/Support/Unix/Unix.h - Common Unix Include File -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines things specific to Unix implementations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SYSTEM_UNIX_UNIX_H
+#define LLVM_SYSTEM_UNIX_UNIX_H
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on all UNIX variants.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"     // Get autoconf configuration settings
+#include "llvm/Support/Errno.h"
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#ifdef HAVE_ASSERT_H
+#include <assert.h>
+#endif
+
+#ifdef HAVE_SYS_TIME_H
+# include <sys/time.h>
+#endif
+#include <time.h>
+
+#ifdef HAVE_SYS_WAIT_H
+# include <sys/wait.h>
+#endif
+
+#ifndef WEXITSTATUS
+# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8)
+#endif
+
+#ifndef WIFEXITED
+# define WIFEXITED(stat_val) (((stat_val) & 255) == 0)
+#endif
+
+/// This function builds an error message into \p ErrMsg using the \p prefix
+/// string and the Unix error number given by \p errnum. If errnum is -1, the
+/// default then the value of errno is used.
+/// @brief Make an error message
+///
+/// If the error number can be converted to a string, it will be
+/// separated from prefix by ": ".
+static inline bool MakeErrMsg(
+  std::string* ErrMsg, const std::string& prefix, int errnum = -1) {
+  if (!ErrMsg)
+    return true;
+  if (errnum == -1)
+    errnum = errno;
+  *ErrMsg = prefix + ": " + llvm::sys::StrError(errnum);
+  return true;
+}
+
+#endif
diff --git a/contrib/llvm/lib/Support/Unix/Watchdog.inc b/contrib/llvm/lib/Support/Unix/Watchdog.inc
new file mode 100644
index 000000000000..5d89c0e51b11
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Watchdog.inc
@@ -0,0 +1,32 @@
+//===--- Unix/Watchdog.inc - Unix Watchdog Implementation -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the generic Unix implementation of the Watchdog class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+namespace llvm {
+  namespace sys {
+    Watchdog::Watchdog(unsigned int seconds) {
+#ifdef HAVE_UNISTD_H
+      alarm(seconds);
+#endif
+    }
+
+    Watchdog::~Watchdog() {
+#ifdef HAVE_UNISTD_H
+      alarm(0);
+#endif
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Support/Unix/system_error.inc b/contrib/llvm/lib/Support/Unix/system_error.inc
new file mode 100644
index 000000000000..681e919edb4e
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/system_error.inc
@@ -0,0 +1,34 @@
+//===- llvm/Support/Unix/system_error.inc - Unix error_code ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Unix specific implementation of the error_code
+// and error_condition classes.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic UNIX code that
+//===          is guaranteed to work on *all* UNIX variants.
+//===----------------------------------------------------------------------===//
+
+using namespace llvm;
+
+std::string
+_system_error_category::message(int ev) const {
+  return _do_message::message(ev);
+}
+
+error_condition
+_system_error_category::default_error_condition(int ev) const {
+#ifdef ELAST
+  if (ev > ELAST)
+    return error_condition(ev, system_category());
+#endif  // ELAST
+  return error_condition(ev, generic_category());
+}
diff --git a/contrib/llvm/lib/Support/Valgrind.cpp b/contrib/llvm/lib/Support/Valgrind.cpp
new file mode 100644
index 000000000000..2b250a357758
--- /dev/null
+++ b/contrib/llvm/lib/Support/Valgrind.cpp
@@ -0,0 +1,67 @@
+//===-- Valgrind.cpp - Implement Valgrind communication ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  Defines Valgrind communication methods, if HAVE_VALGRIND_VALGRIND_H is
+//  defined.  If we have valgrind.h but valgrind isn't running, its macros are
+//  no-ops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Valgrind.h"
+#include "llvm/Config/config.h"
+
+#if HAVE_VALGRIND_VALGRIND_H
+#include <valgrind/valgrind.h>
+
+static bool InitNotUnderValgrind() {
+  return !RUNNING_ON_VALGRIND;
+}
+
+// This bool is negated from what we'd expect because code may run before it
+// gets initialized.  If that happens, it will appear to be 0 (false), and we
+// want that to cause the rest of the code in this file to run the
+// Valgrind-provided macros.
+static const bool NotUnderValgrind = InitNotUnderValgrind();
+
+bool llvm::sys::RunningOnValgrind() {
+  if (NotUnderValgrind)
+    return false;
+  return RUNNING_ON_VALGRIND;
+}
+
+void llvm::sys::ValgrindDiscardTranslations(const void *Addr, size_t Len) {
+  if (NotUnderValgrind)
+    return;
+
+  VALGRIND_DISCARD_TRANSLATIONS(Addr, Len);
+}
+
+#else  // !HAVE_VALGRIND_VALGRIND_H
+
+bool llvm::sys::RunningOnValgrind() {
+  return false;
+}
+
+void llvm::sys::ValgrindDiscardTranslations(const void *Addr, size_t Len) {
+}
+
+#endif  // !HAVE_VALGRIND_VALGRIND_H
+
+#if LLVM_ENABLE_THREADS != 0 && !defined(NDEBUG)
+// These functions require no implementation, tsan just looks at the arguments
+// they're called with.
+extern "C" {
+void AnnotateHappensBefore(const char *file, int line,
+                           const volatile void *cv) {}
+void AnnotateHappensAfter(const char *file, int line,
+                          const volatile void *cv) {}
+void AnnotateIgnoreWritesBegin(const char *file, int line) {}
+void AnnotateIgnoreWritesEnd(const char *file, int line) {}
+}
+#endif
diff --git a/contrib/llvm/lib/Support/Watchdog.cpp b/contrib/llvm/lib/Support/Watchdog.cpp
new file mode 100644
index 000000000000..724aa001f16e
--- /dev/null
+++ b/contrib/llvm/lib/Support/Watchdog.cpp
@@ -0,0 +1,23 @@
+//===---- Watchdog.cpp - Implement Watchdog ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements the Watchdog class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Watchdog.h"
+#include "llvm/Config/config.h"
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Watchdog.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/Watchdog.inc"
+#endif
diff --git a/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc
new file mode 100644
index 000000000000..83da82a949ca
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc
@@ -0,0 +1,163 @@
+//===- Win32/DynamicLibrary.cpp - Win32 DL Implementation -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of DynamicLibrary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+
+#ifdef __MINGW32__
+ #include <imagehlp.h>
+#else
+ #include <dbghelp.h>
+#endif
+
+#ifdef _MSC_VER
+ #include <ntverp.h>
+#endif
+
+#ifdef __MINGW32__
+ #if (HAVE_LIBIMAGEHLP != 1)
+  #error "libimagehlp.a should be present"
+ #endif
+#else
+ #pragma comment(lib, "dbghelp.lib")
+#endif
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//===          and must not be UNIX code.
+//===----------------------------------------------------------------------===//
+
+static DenseSet<HMODULE> *OpenedHandles;
+
+extern "C" {
+
+  static BOOL CALLBACK ELM_Callback(WIN32_ELMCB_PCSTR ModuleName,
+                                    ULONG_PTR ModuleBase,
+                                    ULONG ModuleSize,
+                                    PVOID UserContext)
+  {
+    // Ignore VC++ runtimes prior to 7.1.  Somehow some of them get loaded
+    // into the process.
+    if (stricmp(ModuleName, "msvci70") != 0 &&
+        stricmp(ModuleName, "msvcirt") != 0 &&
+        stricmp(ModuleName, "msvcp50") != 0 &&
+        stricmp(ModuleName, "msvcp60") != 0 &&
+        stricmp(ModuleName, "msvcp70") != 0 &&
+        stricmp(ModuleName, "msvcr70") != 0 &&
+#ifndef __MINGW32__
+        // Mingw32 uses msvcrt.dll by default. Don't ignore it.
+        // Otherwise, user should be aware, what he's doing :)
+        stricmp(ModuleName, "msvcrt") != 0 &&
+#endif
+        stricmp(ModuleName, "msvcrt20") != 0 &&
+        stricmp(ModuleName, "msvcrt40") != 0) {
+      OpenedHandles->insert((HMODULE)ModuleBase);
+    }
+    return TRUE;
+  }
+}
+
+DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
+                                                   std::string *errMsg) {
+  SmartScopedLock<true> lock(getMutex());
+
+  if (!filename) {
+    // When no file is specified, enumerate all DLLs and EXEs in the process.
+    if (OpenedHandles == 0)
+      OpenedHandles = new DenseSet<HMODULE>();
+
+    EnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0);
+    // Dummy library that represents "search all handles".
+    // This is mostly to ensure that the return value still shows up as "valid".
+    return DynamicLibrary(&OpenedHandles);
+  }
+  
+  HMODULE a_handle = LoadLibrary(filename);
+
+  if (a_handle == 0) {
+    MakeErrMsg(errMsg, std::string(filename) + ": Can't open : ");
+    return DynamicLibrary();
+  }
+
+  if (OpenedHandles == 0)
+    OpenedHandles = new DenseSet<HMODULE>();
+
+  // If we've already loaded this library, FreeLibrary() the handle in order to
+  // keep the internal refcount at +1.
+  if (!OpenedHandles->insert(a_handle).second)
+    FreeLibrary(a_handle);
+
+  return DynamicLibrary(a_handle);
+}
+
+// Stack probing routines are in the support library (e.g. libgcc), but we don't
+// have dynamic linking on windows. Provide a hook.
+#define EXPLICIT_SYMBOL(SYM)                    \
+  extern "C" { extern void *SYM; }
+#define EXPLICIT_SYMBOL2(SYMFROM, SYMTO) EXPLICIT_SYMBOL(SYMTO)
+
+#include "explicit_symbols.inc"
+
+#undef EXPLICIT_SYMBOL
+#undef EXPLICIT_SYMBOL2
+
+void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
+  SmartScopedLock<true> Lock(getMutex());
+
+  // First check symbols added via AddSymbol().
+  if (ExplicitSymbols) {
+    StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
+
+    if (i != ExplicitSymbols->end())
+      return i->second;
+  }
+
+  // Now search the libraries.
+  if (OpenedHandles) {
+    for (DenseSet<HMODULE>::iterator I = OpenedHandles->begin(),
+         E = OpenedHandles->end(); I != E; ++I) {
+      FARPROC ptr = GetProcAddress((HMODULE)*I, symbolName);
+      if (ptr) {
+        return (void *)(intptr_t)ptr;
+      }
+    }
+  }
+
+  #define EXPLICIT_SYMBOL(SYM)                    \
+    if (!strcmp(symbolName, #SYM)) return (void*)&SYM;
+  #define EXPLICIT_SYMBOL2(SYMFROM, SYMTO)        \
+    if (!strcmp(symbolName, #SYMFROM)) return (void*)&SYMTO;
+
+  {
+    #include "explicit_symbols.inc"
+  }
+
+  #undef EXPLICIT_SYMBOL
+  #undef EXPLICIT_SYMBOL2
+
+  return 0;
+}
+
+
+void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
+  if (!isValid())
+    return NULL;
+  if (Data == &OpenedHandles)
+    return SearchForAddressOfSymbol(symbolName);
+  return (void *)(intptr_t)GetProcAddress((HMODULE)Data, symbolName);
+}
+
+
+}
diff --git a/contrib/llvm/lib/Support/Windows/Host.inc b/contrib/llvm/lib/Support/Windows/Host.inc
new file mode 100644
index 000000000000..2e6d6f190370
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Host.inc
@@ -0,0 +1,22 @@
+//===- llvm/Support/Win32/Host.inc -------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Win32 Host support.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+#include <cstdio>
+#include <string>
+
+using namespace llvm;
+
+std::string sys::getDefaultTargetTriple() {
+  return LLVM_DEFAULT_TARGET_TRIPLE;
+}
diff --git a/contrib/llvm/lib/Support/Windows/Memory.inc b/contrib/llvm/lib/Support/Windows/Memory.inc
new file mode 100644
index 000000000000..4c5aebd5e71a
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Memory.inc
@@ -0,0 +1,235 @@
+//===- Win32/Memory.cpp - Win32 Memory Implementation -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of various Memory
+// management utilities
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Process.h"
+
+// The Windows.h header must be the last one included.
+#include "Windows.h"
+
+namespace {
+
+DWORD getWindowsProtectionFlags(unsigned Flags) {
+  switch (Flags) {
+  // Contrary to what you might expect, the Windows page protection flags
+  // are not a bitwise combination of RWX values
+  case llvm::sys::Memory::MF_READ:
+    return PAGE_READONLY;
+  case llvm::sys::Memory::MF_WRITE:
+    // Note: PAGE_WRITE is not supported by VirtualProtect
+    return PAGE_READWRITE;
+  case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_WRITE:
+    return PAGE_READWRITE;
+  case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_EXEC:
+    return PAGE_EXECUTE_READ;
+  case llvm::sys::Memory::MF_READ |
+         llvm::sys::Memory::MF_WRITE |
+         llvm::sys::Memory::MF_EXEC:
+    return PAGE_EXECUTE_READWRITE;
+  case llvm::sys::Memory::MF_EXEC:
+    return PAGE_EXECUTE;
+  default:
+    llvm_unreachable("Illegal memory protection flag specified!");
+  }
+  // Provide a default return value as required by some compilers.
+  return PAGE_NOACCESS;
+}
+
+size_t getAllocationGranularity() {
+  SYSTEM_INFO  Info;
+  ::GetSystemInfo(&Info);
+  if (Info.dwPageSize > Info.dwAllocationGranularity)
+    return Info.dwPageSize;
+  else
+    return Info.dwAllocationGranularity;
+}
+
+} // namespace
+
+namespace llvm {
+namespace sys {
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//===          and must not be UNIX code
+//===----------------------------------------------------------------------===//
+
+MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
+                                         const MemoryBlock *const NearBlock,
+                                         unsigned Flags,
+                                         error_code &EC) {
+  EC = error_code::success();
+  if (NumBytes == 0)
+    return MemoryBlock();
+
+  // While we'd be happy to allocate single pages, the Windows allocation
+  // granularity may be larger than a single page (in practice, it is 64K)
+  // so mapping less than that will create an unreachable fragment of memory.
+  static const size_t Granularity = getAllocationGranularity();
+  const size_t NumBlocks = (NumBytes+Granularity-1)/Granularity;
+
+  uintptr_t Start = NearBlock ? reinterpret_cast<uintptr_t>(NearBlock->base()) +
+                                NearBlock->size()
+                           : NULL;
+
+  // If the requested address is not aligned to the allocation granularity,
+  // round up to get beyond NearBlock. VirtualAlloc would have rounded down.
+  if (Start && Start % Granularity != 0)
+    Start += Granularity - Start % Granularity;
+
+  DWORD Protect = getWindowsProtectionFlags(Flags);
+
+  void *PA = ::VirtualAlloc(reinterpret_cast<void*>(Start),
+                            NumBlocks*Granularity,
+                            MEM_RESERVE | MEM_COMMIT, Protect);
+  if (PA == NULL) {
+    if (NearBlock) {
+      // Try again without the NearBlock hint
+      return allocateMappedMemory(NumBytes, NULL, Flags, EC);
+    }
+    EC = error_code(::GetLastError(), system_category());
+    return MemoryBlock();
+  }
+
+  MemoryBlock Result;
+  Result.Address = PA;
+  Result.Size = NumBlocks*Granularity;
+                                 ;
+  if (Flags & MF_EXEC)
+    Memory::InvalidateInstructionCache(Result.Address, Result.Size);
+
+  return Result;
+}
+
+error_code Memory::releaseMappedMemory(MemoryBlock &M) {
+  if (M.Address == 0 || M.Size == 0)
+    return error_code::success();
+
+  if (!VirtualFree(M.Address, 0, MEM_RELEASE))
+    return error_code(::GetLastError(), system_category());
+
+  M.Address = 0;
+  M.Size = 0;
+
+  return error_code::success();
+}
+
+error_code Memory::protectMappedMemory(const MemoryBlock &M,
+                                       unsigned Flags) {
+  if (M.Address == 0 || M.Size == 0)
+    return error_code::success();
+
+  DWORD Protect = getWindowsProtectionFlags(Flags);
+
+  DWORD OldFlags;
+  if (!VirtualProtect(M.Address, M.Size, Protect, &OldFlags))
+    return error_code(::GetLastError(), system_category());
+
+  if (Flags & MF_EXEC)
+    Memory::InvalidateInstructionCache(M.Address, M.Size);
+
+  return error_code::success();
+}
+
+/// InvalidateInstructionCache - Before the JIT can run a block of code
+/// that has been emitted it must invalidate the instruction cache on some
+/// platforms.
+void Memory::InvalidateInstructionCache(
+    const void *Addr, size_t Len) {
+  FlushInstructionCache(GetCurrentProcess(), Addr, Len);
+}
+
+
+MemoryBlock Memory::AllocateRWX(size_t NumBytes,
+                                const MemoryBlock *NearBlock,
+                                std::string *ErrMsg) {
+  MemoryBlock MB;
+  error_code EC;
+  MB = allocateMappedMemory(NumBytes, NearBlock,
+                            MF_READ|MF_WRITE|MF_EXEC, EC);
+  if (EC != error_code::success() && ErrMsg) {
+    MakeErrMsg(ErrMsg, EC.message());
+  }
+  return MB;
+}
+
+bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
+  error_code EC = releaseMappedMemory(M);
+  if (EC == error_code::success())
+    return false;
+  MakeErrMsg(ErrMsg, EC.message());
+  return true;
+}
+
+static DWORD getProtection(const void *addr) {
+  MEMORY_BASIC_INFORMATION info;
+  if (sizeof(info) == ::VirtualQuery(addr, &info, sizeof(info))) {
+    return info.Protect;
+  }
+  return 0;
+}
+
+bool Memory::setWritable(MemoryBlock &M, std::string *ErrMsg) {
+  if (!setRangeWritable(M.Address, M.Size)) {
+    return MakeErrMsg(ErrMsg, "Cannot set memory to writeable: ");
+  }
+  return true;
+}
+
+bool Memory::setExecutable(MemoryBlock &M, std::string *ErrMsg) {
+  if (!setRangeExecutable(M.Address, M.Size)) {
+    return MakeErrMsg(ErrMsg, "Cannot set memory to executable: ");
+  }
+  return true;
+}
+
+bool Memory::setRangeWritable(const void *Addr, size_t Size) {
+  DWORD prot = getProtection(Addr);
+  if (!prot)
+    return false;
+
+  if (prot == PAGE_EXECUTE || prot == PAGE_EXECUTE_READ) {
+    prot = PAGE_EXECUTE_READWRITE;
+  } else if (prot == PAGE_NOACCESS || prot == PAGE_READONLY) {
+    prot = PAGE_READWRITE;
+  }
+
+  DWORD oldProt;
+  Memory::InvalidateInstructionCache(Addr, Size);
+  return ::VirtualProtect(const_cast<LPVOID>(Addr), Size, prot, &oldProt)
+            == TRUE;
+}
+
+bool Memory::setRangeExecutable(const void *Addr, size_t Size) {
+  DWORD prot = getProtection(Addr);
+  if (!prot)
+    return false;
+
+  if (prot == PAGE_NOACCESS) {
+    prot = PAGE_EXECUTE;
+  } else if (prot == PAGE_READONLY) {
+    prot = PAGE_EXECUTE_READ;
+  } else if (prot == PAGE_READWRITE) {
+    prot = PAGE_EXECUTE_READWRITE;
+  }
+
+  DWORD oldProt;
+  Memory::InvalidateInstructionCache(Addr, Size);
+  return ::VirtualProtect(const_cast<LPVOID>(Addr), Size, prot, &oldProt)
+            == TRUE;
+}
+
+} // namespace sys
+} // namespace llvm
diff --git a/contrib/llvm/lib/Support/Windows/Mutex.inc b/contrib/llvm/lib/Support/Windows/Mutex.inc
new file mode 100644
index 000000000000..583dc6359a16
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Mutex.inc
@@ -0,0 +1,58 @@
+//===- llvm/Support/Win32/Mutex.inc - Win32 Mutex Implementation -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Win32 specific (non-pthread) Mutex class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Win32 code that
+//===          is guaranteed to work on *all* Win32 variants.
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+#include "llvm/Support/Mutex.h"
+
+namespace llvm {
+using namespace sys;
+
+MutexImpl::MutexImpl(bool /*recursive*/)
+{
+  data_ = new CRITICAL_SECTION;
+  InitializeCriticalSection((LPCRITICAL_SECTION)data_);
+}
+
+MutexImpl::~MutexImpl()
+{
+  DeleteCriticalSection((LPCRITICAL_SECTION)data_);
+  delete (LPCRITICAL_SECTION)data_;
+  data_ = 0;
+}
+
+bool
+MutexImpl::acquire()
+{
+  EnterCriticalSection((LPCRITICAL_SECTION)data_);
+  return true;
+}
+
+bool
+MutexImpl::release()
+{
+  LeaveCriticalSection((LPCRITICAL_SECTION)data_);
+  return true;
+}
+
+bool
+MutexImpl::tryacquire()
+{
+  return TryEnterCriticalSection((LPCRITICAL_SECTION)data_);
+}
+
+}
diff --git a/contrib/llvm/lib/Support/Windows/Path.inc b/contrib/llvm/lib/Support/Windows/Path.inc
new file mode 100644
index 000000000000..f4898e619abf
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Path.inc
@@ -0,0 +1,926 @@
+//===- llvm/Support/Win32/Path.cpp - Win32 Path Implementation ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of the Path class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Win32 code that
+//===          is guaranteed to work on *all* Win32 variants.
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+#include <cstdio>
+#include <malloc.h>
+
+// We need to undo a macro defined in Windows.h, otherwise we won't compile:
+#undef CopyFile
+#undef GetCurrentDirectory
+
+// Windows happily accepts either forward or backward slashes, though any path
+// returned by a Win32 API will have backward slashes.  As LLVM code basically
+// assumes forward slashes are used, backward slashs are converted where they
+// can be introduced into a path.
+//
+// Another invariant is that a path ends with a slash if and only if the path
+// is a root directory.  Any other use of a trailing slash is stripped.  Unlike
+// in Unix, Windows has a rather complicated notion of a root path and this
+// invariant helps simply the code.
+
+static void FlipBackSlashes(std::string& s) {
+  for (size_t i = 0; i < s.size(); i++)
+    if (s[i] == '\\')
+      s[i] = '/';
+}
+
+namespace llvm {
+namespace sys {
+
+const char PathSeparator = ';';
+
+StringRef Path::GetEXESuffix() {
+  return "exe";
+}
+
+Path::Path(llvm::StringRef p)
+  : path(p) {
+  FlipBackSlashes(path);
+}
+
+Path::Path(const char *StrStart, unsigned StrLen)
+  : path(StrStart, StrLen) {
+  FlipBackSlashes(path);
+}
+
+Path&
+Path::operator=(StringRef that) {
+  path.assign(that.data(), that.size());
+  FlipBackSlashes(path);
+  return *this;
+}
+
+bool
+Path::isValid() const {
+  if (path.empty())
+    return false;
+
+  size_t len = path.size();
+  // If there is a null character, it and all its successors are ignored.
+  size_t pos = path.find_first_of('\0');
+  if (pos != std::string::npos)
+    len = pos;
+
+  // If there is a colon, it must be the second character, preceded by a letter
+  // and followed by something.
+  pos = path.rfind(':',len);
+  size_t rootslash = 0;
+  if (pos != std::string::npos) {
+    if (pos != 1 || !isalpha(static_cast<unsigned char>(path[0])) || len < 3)
+      return false;
+      rootslash = 2;
+  }
+
+  // Look for a UNC path, and if found adjust our notion of the root slash.
+  if (len > 3 && path[0] == '/' && path[1] == '/') {
+    rootslash = path.find('/', 2);
+    if (rootslash == std::string::npos)
+      rootslash = 0;
+  }
+
+  // Check for illegal characters.
+  if (path.find_first_of("\\<>\"|\001\002\003\004\005\006\007\010\011\012"
+                         "\013\014\015\016\017\020\021\022\023\024\025\026"
+                         "\027\030\031\032\033\034\035\036\037")
+      != std::string::npos)
+    return false;
+
+  // Remove trailing slash, unless it's a root slash.
+  if (len > rootslash+1 && path[len-1] == '/')
+    path.erase(--len);
+
+  // Check each component for legality.
+  for (pos = 0; pos < len; ++pos) {
+    // A component may not end in a space.
+    if (path[pos] == ' ') {
+      if (pos+1 == len || path[pos+1] == '/' || path[pos+1] == '\0')
+        return false;
+    }
+
+    // A component may not end in a period.
+    if (path[pos] == '.') {
+      if (pos+1 == len || path[pos+1] == '/') {
+        // Unless it is the pseudo-directory "."...
+        if (pos == 0 || path[pos-1] == '/' || path[pos-1] == ':')
+          return true;
+        // or "..".
+        if (pos > 0 && path[pos-1] == '.') {
+          if (pos == 1 || path[pos-2] == '/' || path[pos-2] == ':')
+            return true;
+        }
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+void Path::makeAbsolute() {
+  TCHAR  FullPath[MAX_PATH + 1] = {0};
+  LPTSTR FilePart = NULL;
+
+  DWORD RetLength = ::GetFullPathNameA(path.c_str(),
+                        sizeof(FullPath)/sizeof(FullPath[0]),
+                        FullPath, &FilePart);
+
+  if (0 == RetLength) {
+    // FIXME: Report the error GetLastError()
+    assert(0 && "Unable to make absolute path!");
+  } else if (RetLength > MAX_PATH) {
+    // FIXME: Report too small buffer (needed RetLength bytes).
+    assert(0 && "Unable to make absolute path!");
+  } else {
+    path = FullPath;
+  }
+}
+
+bool
+Path::isAbsolute(const char *NameStart, unsigned NameLen) {
+  assert(NameStart);
+  // FIXME: This does not handle correctly an absolute path starting from
+  // a drive letter or in UNC format.
+  switch (NameLen) {
+  case 0:
+    return false;
+  case 1:
+  case 2:
+    return NameStart[0] == '/';
+  default:
+    return
+      (NameStart[0] == '/' || (NameStart[1] == ':' && NameStart[2] == '/')) ||
+      (NameStart[0] == '\\' || (NameStart[1] == ':' && NameStart[2] == '\\'));
+  }
+}
+
+bool
+Path::isAbsolute() const {
+  // FIXME: This does not handle correctly an absolute path starting from
+  // a drive letter or in UNC format.
+  switch (path.length()) {
+    case 0:
+      return false;
+    case 1:
+    case 2:
+      return path[0] == '/';
+    default:
+      return path[0] == '/' || (path[1] == ':' && path[2] == '/');
+  }
+}
+
+static Path *TempDirectory;
+
+Path
+Path::GetTemporaryDirectory(std::string* ErrMsg) {
+  if (TempDirectory) {
+#if defined(_MSC_VER)
+    // Visual Studio gets confused and emits a diagnostic about calling exists,
+    // even though this is the implementation for PathV1.  Temporarily 
+    // disable the deprecated warning message
+    #pragma warning(push)
+    #pragma warning(disable:4996)
+#endif
+    assert(TempDirectory->exists() && "Who has removed TempDirectory?");
+#if defined(_MSC_VER)
+    #pragma warning(pop)
+#endif
+    return *TempDirectory;
+  }
+
+  char pathname[MAX_PATH];
+  if (!GetTempPath(MAX_PATH, pathname)) {
+    if (ErrMsg)
+      *ErrMsg = "Can't determine temporary directory";
+    return Path();
+  }
+
+  Path result;
+  result.set(pathname);
+
+  // Append a subdirectory based on our process id so multiple LLVMs don't
+  // step on each other's toes.
+#ifdef __MINGW32__
+  // Mingw's Win32 header files are broken.
+  sprintf(pathname, "LLVM_%u", unsigned(GetCurrentProcessId()));
+#else
+  sprintf(pathname, "LLVM_%u", GetCurrentProcessId());
+#endif
+  result.appendComponent(pathname);
+
+  // If there's a directory left over from a previous LLVM execution that
+  // happened to have the same process id, get rid of it.
+  result.eraseFromDisk(true);
+
+  // And finally (re-)create the empty directory.
+  result.createDirectoryOnDisk(false);
+  TempDirectory = new Path(result);
+  return *TempDirectory;
+}
+
+// FIXME: the following set of functions don't map to Windows very well.
+Path
+Path::GetRootDirectory() {
+  // This is the only notion that that Windows has of a root directory. Nothing
+  // is here except for drives.
+  return Path("file:///");
+}
+
+void
+Path::GetSystemLibraryPaths(std::vector<sys::Path>& Paths) {
+  char buff[MAX_PATH];
+  // Generic form of C:\Windows\System32
+  HRESULT res =  SHGetFolderPathA(NULL,
+                                  CSIDL_FLAG_CREATE | CSIDL_SYSTEM,
+                                  NULL,
+                                  SHGFP_TYPE_CURRENT,
+                                  buff);
+  if (res != S_OK) {
+    assert(0 && "Failed to get system directory");
+    return;
+  }
+  Paths.push_back(sys::Path(buff));
+
+  // Reset buff.
+  buff[0] = 0;
+  // Generic form of C:\Windows
+  res =  SHGetFolderPathA(NULL,
+                          CSIDL_FLAG_CREATE | CSIDL_WINDOWS,
+                          NULL,
+                          SHGFP_TYPE_CURRENT,
+                          buff);
+  if (res != S_OK) {
+    assert(0 && "Failed to get windows directory");
+    return;
+  }
+  Paths.push_back(sys::Path(buff));
+}
+
+void
+Path::GetBitcodeLibraryPaths(std::vector<sys::Path>& Paths) {
+  char * env_var = getenv("LLVM_LIB_SEARCH_PATH");
+  if (env_var != 0) {
+    getPathList(env_var,Paths);
+  }
+#ifdef LLVM_LIBDIR
+  {
+    Path tmpPath;
+    if (tmpPath.set(LLVM_LIBDIR))
+      if (tmpPath.canRead())
+        Paths.push_back(tmpPath);
+  }
+#endif
+  GetSystemLibraryPaths(Paths);
+}
+
+Path
+Path::GetUserHomeDirectory() {
+  char buff[MAX_PATH];
+  HRESULT res = SHGetFolderPathA(NULL,
+                                 CSIDL_FLAG_CREATE | CSIDL_APPDATA,
+                                 NULL,
+                                 SHGFP_TYPE_CURRENT,
+                                 buff);
+  if (res != S_OK)
+    assert(0 && "Failed to get user home directory");
+  return Path(buff);
+}
+
+Path
+Path::GetCurrentDirectory() {
+  char pathname[MAX_PATH];
+  ::GetCurrentDirectoryA(MAX_PATH,pathname);
+  return Path(pathname);
+}
+
+/// GetMainExecutable - Return the path to the main executable, given the
+/// value of argv[0] from program startup.
+Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
+  char pathname[MAX_PATH];
+  DWORD ret = ::GetModuleFileNameA(NULL, pathname, MAX_PATH);
+  return ret != MAX_PATH ? Path(pathname) : Path();
+}
+
+
+// FIXME: the above set of functions don't map to Windows very well.
+
+
+StringRef Path::getDirname() const {
+  return getDirnameCharSep(path, "/");
+}
+
+StringRef
+Path::getBasename() const {
+  // Find the last slash
+  size_t slash = path.rfind('/');
+  if (slash == std::string::npos)
+    slash = 0;
+  else
+    slash++;
+
+  size_t dot = path.rfind('.');
+  if (dot == std::string::npos || dot < slash)
+    return StringRef(path).substr(slash);
+  else
+    return StringRef(path).substr(slash, dot - slash);
+}
+
+StringRef
+Path::getSuffix() const {
+  // Find the last slash
+  size_t slash = path.rfind('/');
+  if (slash == std::string::npos)
+    slash = 0;
+  else
+    slash++;
+
+  size_t dot = path.rfind('.');
+  if (dot == std::string::npos || dot < slash)
+    return StringRef("");
+  else
+    return StringRef(path).substr(dot + 1);
+}
+
+bool
+Path::exists() const {
+  DWORD attr = GetFileAttributes(path.c_str());
+  return attr != INVALID_FILE_ATTRIBUTES;
+}
+
+bool
+Path::isDirectory() const {
+  DWORD attr = GetFileAttributes(path.c_str());
+  return (attr != INVALID_FILE_ATTRIBUTES) &&
+         (attr & FILE_ATTRIBUTE_DIRECTORY);
+}
+
+bool
+Path::isSymLink() const {
+  DWORD attributes = GetFileAttributes(path.c_str());
+
+  if (attributes == INVALID_FILE_ATTRIBUTES)
+    // There's no sane way to report this :(.
+    assert(0 && "GetFileAttributes returned INVALID_FILE_ATTRIBUTES");
+
+  // This isn't exactly what defines a NTFS symlink, but it is only true for
+  // paths that act like a symlink.
+  return attributes & FILE_ATTRIBUTE_REPARSE_POINT;
+}
+
+bool
+Path::canRead() const {
+  // FIXME: take security attributes into account.
+  DWORD attr = GetFileAttributes(path.c_str());
+  return attr != INVALID_FILE_ATTRIBUTES;
+}
+
+bool
+Path::canWrite() const {
+  // FIXME: take security attributes into account.
+  DWORD attr = GetFileAttributes(path.c_str());
+  return (attr != INVALID_FILE_ATTRIBUTES) && !(attr & FILE_ATTRIBUTE_READONLY);
+}
+
+bool
+Path::canExecute() const {
+  // FIXME: take security attributes into account.
+  DWORD attr = GetFileAttributes(path.c_str());
+  return attr != INVALID_FILE_ATTRIBUTES;
+}
+
+bool
+Path::isRegularFile() const {
+  bool res;
+  if (fs::is_regular_file(path, res))
+    return false;
+  return res;
+}
+
+StringRef
+Path::getLast() const {
+  // Find the last slash
+  size_t pos = path.rfind('/');
+
+  // Handle the corner cases
+  if (pos == std::string::npos)
+    return path;
+
+  // If the last character is a slash, we have a root directory
+  if (pos == path.length()-1)
+    return path;
+
+  // Return everything after the last slash
+  return StringRef(path).substr(pos+1);
+}
+
+const FileStatus *
+PathWithStatus::getFileStatus(bool update, std::string *ErrStr) const {
+  if (!fsIsValid || update) {
+    WIN32_FILE_ATTRIBUTE_DATA fi;
+    if (!GetFileAttributesEx(path.c_str(), GetFileExInfoStandard, &fi)) {
+      MakeErrMsg(ErrStr, "getStatusInfo():" + std::string(path) +
+                      ": Can't get status: ");
+      return 0;
+    }
+
+    status.fileSize = fi.nFileSizeHigh;
+    status.fileSize <<= sizeof(fi.nFileSizeHigh)*8;
+    status.fileSize += fi.nFileSizeLow;
+
+    status.mode = fi.dwFileAttributes & FILE_ATTRIBUTE_READONLY ? 0555 : 0777;
+    status.user = 9999;    // Not applicable to Windows, so...
+    status.group = 9999;   // Not applicable to Windows, so...
+
+    // FIXME: this is only unique if the file is accessed by the same file path.
+    // How do we do this for C:\dir\file and ..\dir\file ? Unix has inode
+    // numbers, but the concept doesn't exist in Windows.
+    status.uniqueID = 0;
+    for (unsigned i = 0; i < path.length(); ++i)
+      status.uniqueID += path[i];
+
+    ULARGE_INTEGER ui;
+    ui.LowPart = fi.ftLastWriteTime.dwLowDateTime;
+    ui.HighPart = fi.ftLastWriteTime.dwHighDateTime;
+    status.modTime.fromWin32Time(ui.QuadPart);
+
+    status.isDir = fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY;
+    fsIsValid = true;
+  }
+  return &status;
+}
+
+bool Path::makeReadableOnDisk(std::string* ErrMsg) {
+  // All files are readable on Windows (ignoring security attributes).
+  return false;
+}
+
+bool Path::makeWriteableOnDisk(std::string* ErrMsg) {
+  DWORD attr = GetFileAttributes(path.c_str());
+
+  // If it doesn't exist, we're done.
+  if (attr == INVALID_FILE_ATTRIBUTES)
+    return false;
+
+  if (attr & FILE_ATTRIBUTE_READONLY) {
+    if (!SetFileAttributes(path.c_str(), attr & ~FILE_ATTRIBUTE_READONLY)) {
+      MakeErrMsg(ErrMsg, std::string(path) + ": Can't make file writable: ");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool Path::makeExecutableOnDisk(std::string* ErrMsg) {
+  // All files are executable on Windows (ignoring security attributes).
+  return false;
+}
+
+bool
+Path::getDirectoryContents(std::set<Path>& result, std::string* ErrMsg) const {
+  WIN32_FILE_ATTRIBUTE_DATA fi;
+  if (!GetFileAttributesEx(path.c_str(), GetFileExInfoStandard, &fi)) {
+    MakeErrMsg(ErrMsg, path + ": can't get status of file");
+    return true;
+  }
+
+  if (!(fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
+    if (ErrMsg)
+      *ErrMsg = path + ": not a directory";
+    return true;
+  }
+
+  result.clear();
+  WIN32_FIND_DATA fd;
+  std::string searchpath = path;
+  if (path.size() == 0 || searchpath[path.size()-1] == '/')
+    searchpath += "*";
+  else
+    searchpath += "/*";
+
+  HANDLE h = FindFirstFile(searchpath.c_str(), &fd);
+  if (h == INVALID_HANDLE_VALUE) {
+    if (GetLastError() == ERROR_FILE_NOT_FOUND)
+      return true; // not really an error, now is it?
+    MakeErrMsg(ErrMsg, path + ": Can't read directory: ");
+    return true;
+  }
+
+  do {
+    if (fd.cFileName[0] == '.')
+      continue;
+    Path aPath(path);
+    aPath.appendComponent(&fd.cFileName[0]);
+    result.insert(aPath);
+  } while (FindNextFile(h, &fd));
+
+  DWORD err = GetLastError();
+  FindClose(h);
+  if (err != ERROR_NO_MORE_FILES) {
+    SetLastError(err);
+    MakeErrMsg(ErrMsg, path + ": Can't read directory: ");
+    return true;
+  }
+  return false;
+}
+
+bool
+Path::set(StringRef a_path) {
+  if (a_path.empty())
+    return false;
+  std::string save(path);
+  path = a_path;
+  FlipBackSlashes(path);
+  if (!isValid()) {
+    path = save;
+    return false;
+  }
+  return true;
+}
+
+bool
+Path::appendComponent(StringRef name) {
+  if (name.empty())
+    return false;
+  std::string save(path);
+  if (!path.empty()) {
+    size_t last = path.size() - 1;
+    if (path[last] != '/')
+      path += '/';
+  }
+  path += name;
+  if (!isValid()) {
+    path = save;
+    return false;
+  }
+  return true;
+}
+
+bool
+Path::eraseComponent() {
+  size_t slashpos = path.rfind('/',path.size());
+  if (slashpos == path.size() - 1 || slashpos == std::string::npos)
+    return false;
+  std::string save(path);
+  path.erase(slashpos);
+  if (!isValid()) {
+    path = save;
+    return false;
+  }
+  return true;
+}
+
+bool
+Path::eraseSuffix() {
+  size_t dotpos = path.rfind('.',path.size());
+  size_t slashpos = path.rfind('/',path.size());
+  if (dotpos != std::string::npos) {
+    if (slashpos == std::string::npos || dotpos > slashpos+1) {
+      std::string save(path);
+      path.erase(dotpos, path.size()-dotpos);
+      if (!isValid()) {
+        path = save;
+        return false;
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+inline bool PathMsg(std::string* ErrMsg, const char* pathname, const char*msg) {
+  if (ErrMsg)
+    *ErrMsg = std::string(pathname) + ": " + std::string(msg);
+  return true;
+}
+
+bool
+Path::createDirectoryOnDisk(bool create_parents, std::string* ErrMsg) {
+  // Get a writeable copy of the path name
+  size_t len = path.length();
+  char *pathname = reinterpret_cast<char *>(_alloca(len+2));
+  path.copy(pathname, len);
+  pathname[len] = 0;
+
+  // Make sure it ends with a slash.
+  if (len == 0 || pathname[len - 1] != '/') {
+    pathname[len] = '/';
+    pathname[++len] = 0;
+  }
+
+  // Determine starting point for initial / search.
+  char *next = pathname;
+  if (pathname[0] == '/' && pathname[1] == '/') {
+    // Skip host name.
+    next = strchr(pathname+2, '/');
+    if (next == NULL)
+      return PathMsg(ErrMsg, pathname, "badly formed remote directory");
+
+    // Skip share name.
+    next = strchr(next+1, '/');
+    if (next == NULL)
+      return PathMsg(ErrMsg, pathname,"badly formed remote directory");
+
+    next++;
+    if (*next == 0)
+      return PathMsg(ErrMsg, pathname, "badly formed remote directory");
+
+  } else {
+    if (pathname[1] == ':')
+      next += 2;    // skip drive letter
+    if (*next == '/')
+      next++;       // skip root directory
+  }
+
+  // If we're supposed to create intermediate directories
+  if (create_parents) {
+    // Loop through the directory components until we're done
+    while (*next) {
+      next = strchr(next, '/');
+      *next = 0;
+      if (!CreateDirectory(pathname, NULL) &&
+          GetLastError() != ERROR_ALREADY_EXISTS)
+          return MakeErrMsg(ErrMsg,
+            std::string(pathname) + ": Can't create directory: ");
+      *next++ = '/';
+    }
+  } else {
+    // Drop trailing slash.
+    pathname[len-1] = 0;
+    if (!CreateDirectory(pathname, NULL) &&
+        GetLastError() != ERROR_ALREADY_EXISTS) {
+      return MakeErrMsg(ErrMsg, std::string(pathname) +
+                        ": Can't create directory: ");
+    }
+  }
+  return false;
+}
+
+bool
+Path::createFileOnDisk(std::string* ErrMsg) {
+  // Create the file
+  HANDLE h = CreateFile(path.c_str(), GENERIC_WRITE, 0, NULL, CREATE_NEW,
+                        FILE_ATTRIBUTE_NORMAL, NULL);
+  if (h == INVALID_HANDLE_VALUE)
+    return MakeErrMsg(ErrMsg, path + ": Can't create file: ");
+
+  CloseHandle(h);
+  return false;
+}
+
+bool
+Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const {
+  WIN32_FILE_ATTRIBUTE_DATA fi;
+  if (!GetFileAttributesEx(path.c_str(), GetFileExInfoStandard, &fi))
+    return true;
+
+  if (fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
+    // If it doesn't exist, we're done.
+    bool Exists;
+    if (fs::exists(path, Exists) || !Exists)
+      return false;
+
+    char *pathname = reinterpret_cast<char *>(_alloca(path.length()+3));
+    int lastchar = path.length() - 1 ;
+    path.copy(pathname, lastchar+1);
+
+    // Make path end with '/*'.
+    if (pathname[lastchar] != '/')
+      pathname[++lastchar] = '/';
+    pathname[lastchar+1] = '*';
+    pathname[lastchar+2] = 0;
+
+    if (remove_contents) {
+      WIN32_FIND_DATA fd;
+      HANDLE h = FindFirstFile(pathname, &fd);
+
+      // It's a bad idea to alter the contents of a directory while enumerating
+      // its contents. So build a list of its contents first, then destroy them.
+
+      if (h != INVALID_HANDLE_VALUE) {
+        std::vector<Path> list;
+
+        do {
+          if (strcmp(fd.cFileName, ".") == 0)
+            continue;
+          if (strcmp(fd.cFileName, "..") == 0)
+            continue;
+
+          Path aPath(path);
+          aPath.appendComponent(&fd.cFileName[0]);
+          list.push_back(aPath);
+        } while (FindNextFile(h, &fd));
+
+        DWORD err = GetLastError();
+        FindClose(h);
+        if (err != ERROR_NO_MORE_FILES) {
+          SetLastError(err);
+          return MakeErrMsg(ErrStr, path + ": Can't read directory: ");
+        }
+
+        for (std::vector<Path>::iterator I = list.begin(); I != list.end();
+             ++I) {
+          Path &aPath = *I;
+          aPath.eraseFromDisk(true);
+        }
+      } else {
+        if (GetLastError() != ERROR_FILE_NOT_FOUND)
+          return MakeErrMsg(ErrStr, path + ": Can't read directory: ");
+      }
+    }
+
+    pathname[lastchar] = 0;
+    if (!RemoveDirectory(pathname))
+      return MakeErrMsg(ErrStr,
+        std::string(pathname) + ": Can't destroy directory: ");
+    return false;
+  } else {
+    // Read-only files cannot be deleted on Windows.  Must remove the read-only
+    // attribute first.
+    if (fi.dwFileAttributes & FILE_ATTRIBUTE_READONLY) {
+      if (!SetFileAttributes(path.c_str(),
+                             fi.dwFileAttributes & ~FILE_ATTRIBUTE_READONLY))
+        return MakeErrMsg(ErrStr, path + ": Can't destroy file: ");
+    }
+
+    if (!DeleteFile(path.c_str()))
+      return MakeErrMsg(ErrStr, path + ": Can't destroy file: ");
+    return false;
+  }
+}
+
+bool Path::getMagicNumber(std::string& Magic, unsigned len) const {
+  assert(len < 1024 && "Request for magic string too long");
+  char* buf = reinterpret_cast<char*>(alloca(len));
+
+  HANDLE h = CreateFile(path.c_str(),
+                        GENERIC_READ,
+                        FILE_SHARE_READ,
+                        NULL,
+                        OPEN_EXISTING,
+                        FILE_ATTRIBUTE_NORMAL,
+                        NULL);
+  if (h == INVALID_HANDLE_VALUE)
+    return false;
+
+  DWORD nRead = 0;
+  BOOL ret = ReadFile(h, buf, len, &nRead, NULL);
+  CloseHandle(h);
+
+  if (!ret || nRead != len)
+    return false;
+
+  Magic = std::string(buf, len);
+  return true;
+}
+
+bool
+Path::renamePathOnDisk(const Path& newName, std::string* ErrMsg) {
+  if (!MoveFileEx(path.c_str(), newName.c_str(), MOVEFILE_REPLACE_EXISTING))
+    return MakeErrMsg(ErrMsg, "Can't move '" + path + "' to '" + newName.path
+        + "': ");
+  return false;
+}
+
+bool
+Path::setStatusInfoOnDisk(const FileStatus &si, std::string *ErrMsg) const {
+  // FIXME: should work on directories also.
+  if (!si.isFile) {
+    return true;
+  }
+
+  HANDLE h = CreateFile(path.c_str(),
+                        FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES,
+                        FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                        NULL,
+                        OPEN_EXISTING,
+                        FILE_ATTRIBUTE_NORMAL,
+                        NULL);
+  if (h == INVALID_HANDLE_VALUE)
+    return true;
+
+  BY_HANDLE_FILE_INFORMATION bhfi;
+  if (!GetFileInformationByHandle(h, &bhfi)) {
+    DWORD err = GetLastError();
+    CloseHandle(h);
+    SetLastError(err);
+    return MakeErrMsg(ErrMsg, path + ": GetFileInformationByHandle: ");
+  }
+
+  ULARGE_INTEGER ui;
+  ui.QuadPart = si.modTime.toWin32Time();
+  FILETIME ft;
+  ft.dwLowDateTime = ui.LowPart;
+  ft.dwHighDateTime = ui.HighPart;
+  BOOL ret = SetFileTime(h, NULL, &ft, &ft);
+  DWORD err = GetLastError();
+  CloseHandle(h);
+  if (!ret) {
+    SetLastError(err);
+    return MakeErrMsg(ErrMsg, path + ": SetFileTime: ");
+  }
+
+  // Best we can do with Unix permission bits is to interpret the owner
+  // writable bit.
+  if (si.mode & 0200) {
+    if (bhfi.dwFileAttributes & FILE_ATTRIBUTE_READONLY) {
+      if (!SetFileAttributes(path.c_str(),
+              bhfi.dwFileAttributes & ~FILE_ATTRIBUTE_READONLY))
+        return MakeErrMsg(ErrMsg, path + ": SetFileAttributes: ");
+    }
+  } else {
+    if (!(bhfi.dwFileAttributes & FILE_ATTRIBUTE_READONLY)) {
+      if (!SetFileAttributes(path.c_str(),
+              bhfi.dwFileAttributes | FILE_ATTRIBUTE_READONLY))
+        return MakeErrMsg(ErrMsg, path + ": SetFileAttributes: ");
+    }
+  }
+
+  return false;
+}
+
+bool
+CopyFile(const sys::Path &Dest, const sys::Path &Src, std::string* ErrMsg) {
+  // Can't use CopyFile macro defined in Windows.h because it would mess up the
+  // above line.  We use the expansion it would have in a non-UNICODE build.
+  if (!::CopyFileA(Src.c_str(), Dest.c_str(), false))
+    return MakeErrMsg(ErrMsg, "Can't copy '" + Src.str() +
+               "' to '" + Dest.str() + "': ");
+  return false;
+}
+
+bool
+Path::makeUnique(bool reuse_current, std::string* ErrMsg) {
+  bool Exists;
+  if (reuse_current && (fs::exists(path, Exists) || !Exists))
+    return false; // File doesn't exist already, just use it!
+
+  // Reserve space for -XXXXXX at the end.
+  char *FNBuffer = (char*) alloca(path.size()+8);
+  unsigned offset = path.size();
+  path.copy(FNBuffer, offset);
+
+  // Find a numeric suffix that isn't used by an existing file.  Assume there
+  // won't be more than 1 million files with the same prefix.  Probably a safe
+  // bet.
+  static int FCounter = -1;
+  if (FCounter < 0) {
+    // Give arbitrary initial seed.
+    // FIXME: We should use sys::fs::unique_file() in future.
+    LARGE_INTEGER cnt64;
+    DWORD x = GetCurrentProcessId();
+    x = (x << 16) | (x >> 16);
+    if (QueryPerformanceCounter(&cnt64))    // RDTSC
+      x ^= cnt64.HighPart ^ cnt64.LowPart;
+    FCounter = x % 1000000;
+  }
+  do {
+    sprintf(FNBuffer+offset, "-%06u", FCounter);
+    if (++FCounter > 999999)
+      FCounter = 0;
+    path = FNBuffer;
+  } while (!fs::exists(path, Exists) && Exists);
+  return false;
+}
+
+bool
+Path::createTemporaryFileOnDisk(bool reuse_current, std::string* ErrMsg) {
+  // Make this into a unique file name
+  makeUnique(reuse_current, ErrMsg);
+
+  // Now go and create it
+  HANDLE h = CreateFile(path.c_str(), GENERIC_WRITE, 0, NULL, CREATE_NEW,
+                        FILE_ATTRIBUTE_NORMAL, NULL);
+  if (h == INVALID_HANDLE_VALUE)
+    return MakeErrMsg(ErrMsg, path + ": can't create file");
+
+  CloseHandle(h);
+  return false;
+}
+
+/// MapInFilePages - Not yet implemented on win32.
+const char *Path::MapInFilePages(int FD, size_t FileSize, off_t Offset) {
+  return 0;
+}
+
+/// MapInFilePages - Not yet implemented on win32.
+void Path::UnMapFilePages(const char *Base, size_t FileSize) {
+  assert(0 && "NOT IMPLEMENTED");
+}
+
+}
+}
diff --git a/contrib/llvm/lib/Support/Windows/PathV2.inc b/contrib/llvm/lib/Support/Windows/PathV2.inc
new file mode 100644
index 000000000000..23f3d14f91f0
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/PathV2.inc
@@ -0,0 +1,1022 @@
+//===- llvm/Support/Windows/PathV2.inc - Windows Path Impl ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Windows specific implementation of the PathV2 API.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Windows code that
+//===          is guaranteed to work on *all* Windows variants.
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+#include <fcntl.h>
+#include <io.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#undef max
+
+// MinGW doesn't define this.
+#ifndef _ERRNO_T_DEFINED
+#define _ERRNO_T_DEFINED
+typedef int errno_t;
+#endif
+
+using namespace llvm;
+
+namespace {
+  typedef BOOLEAN (WINAPI *PtrCreateSymbolicLinkW)(
+    /*__in*/ LPCWSTR lpSymlinkFileName,
+    /*__in*/ LPCWSTR lpTargetFileName,
+    /*__in*/ DWORD dwFlags);
+
+  PtrCreateSymbolicLinkW create_symbolic_link_api = PtrCreateSymbolicLinkW(
+    ::GetProcAddress(::GetModuleHandleA("kernel32.dll"),
+                     "CreateSymbolicLinkW"));
+
+  error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16) {
+    int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
+                                    utf8.begin(), utf8.size(),
+                                    utf16.begin(), 0);
+
+    if (len == 0)
+      return windows_error(::GetLastError());
+
+    utf16.reserve(len + 1);
+    utf16.set_size(len);
+
+    len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
+                                    utf8.begin(), utf8.size(),
+                                    utf16.begin(), utf16.size());
+
+    if (len == 0)
+      return windows_error(::GetLastError());
+
+    // Make utf16 null terminated.
+    utf16.push_back(0);
+    utf16.pop_back();
+
+    return error_code::success();
+  }
+
+  error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+                               SmallVectorImpl<char> &utf8) {
+    // Get length.
+    int len = ::WideCharToMultiByte(CP_UTF8, 0,
+                                    utf16, utf16_len,
+                                    utf8.begin(), 0,
+                                    NULL, NULL);
+
+    if (len == 0)
+      return windows_error(::GetLastError());
+
+    utf8.reserve(len);
+    utf8.set_size(len);
+
+    // Now do the actual conversion.
+    len = ::WideCharToMultiByte(CP_UTF8, 0,
+                                utf16, utf16_len,
+                                utf8.data(), utf8.size(),
+                                NULL, NULL);
+
+    if (len == 0)
+      return windows_error(::GetLastError());
+
+    // Make utf8 null terminated.
+    utf8.push_back(0);
+    utf8.pop_back();
+
+    return error_code::success();
+  }
+
+  error_code TempDir(SmallVectorImpl<wchar_t> &result) {
+  retry_temp_dir:
+    DWORD len = ::GetTempPathW(result.capacity(), result.begin());
+
+    if (len == 0)
+      return windows_error(::GetLastError());
+
+    if (len > result.capacity()) {
+      result.reserve(len);
+      goto retry_temp_dir;
+    }
+
+    result.set_size(len);
+    return error_code::success();
+  }
+
+  bool is_separator(const wchar_t value) {
+    switch (value) {
+    case L'\\':
+    case L'/':
+      return true;
+    default:
+      return false;
+    }
+  }
+}
+
+namespace llvm {
+namespace sys  {
+namespace fs {
+
+error_code current_path(SmallVectorImpl<char> &result) {
+  SmallVector<wchar_t, 128> cur_path;
+  cur_path.reserve(128);
+retry_cur_dir:
+  DWORD len = ::GetCurrentDirectoryW(cur_path.capacity(), cur_path.data());
+
+  // A zero return value indicates a failure other than insufficient space.
+  if (len == 0)
+    return windows_error(::GetLastError());
+
+  // If there's insufficient space, the len returned is larger than the len
+  // given.
+  if (len > cur_path.capacity()) {
+    cur_path.reserve(len);
+    goto retry_cur_dir;
+  }
+
+  cur_path.set_size(len);
+  // cur_path now holds the current directory in utf-16. Convert to utf-8.
+
+  // Find out how much space we need. Sadly, this function doesn't return the
+  // size needed unless you tell it the result size is 0, which means you
+  // _always_ have to call it twice.
+  len = ::WideCharToMultiByte(CP_UTF8, 0,
+                              cur_path.data(), cur_path.size(),
+                              result.data(), 0,
+                              NULL, NULL);
+
+  if (len == 0)
+    return make_error_code(windows_error(::GetLastError()));
+
+  result.reserve(len);
+  result.set_size(len);
+  // Now do the actual conversion.
+  len = ::WideCharToMultiByte(CP_UTF8, 0,
+                              cur_path.data(), cur_path.size(),
+                              result.data(), result.size(),
+                              NULL, NULL);
+  if (len == 0)
+    return windows_error(::GetLastError());
+
+  return error_code::success();
+}
+
+error_code copy_file(const Twine &from, const Twine &to, copy_option copt) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toStringRef(from_storage);
+  StringRef t = to.toStringRef(to_storage);
+
+  // Convert to utf-16.
+  SmallVector<wchar_t, 128> wide_from;
+  SmallVector<wchar_t, 128> wide_to;
+  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
+  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
+
+  // Copy the file.
+  BOOL res = ::CopyFileW(wide_from.begin(), wide_to.begin(),
+                         copt != copy_option::overwrite_if_exists);
+
+  if (res == 0)
+    return windows_error(::GetLastError());
+
+  return error_code::success();
+}
+
+error_code create_directory(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  if (!::CreateDirectoryW(path_utf16.begin(), NULL)) {
+    error_code ec = windows_error(::GetLastError());
+    if (ec == windows_error::already_exists)
+      existed = true;
+    else
+      return ec;
+  } else
+    existed = false;
+
+  return error_code::success();
+}
+
+error_code create_hard_link(const Twine &to, const Twine &from) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toStringRef(from_storage);
+  StringRef t = to.toStringRef(to_storage);
+
+  // Convert to utf-16.
+  SmallVector<wchar_t, 128> wide_from;
+  SmallVector<wchar_t, 128> wide_to;
+  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
+  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
+
+  if (!::CreateHardLinkW(wide_from.begin(), wide_to.begin(), NULL))
+    return windows_error(::GetLastError());
+
+  return error_code::success();
+}
+
+error_code create_symlink(const Twine &to, const Twine &from) {
+  // Only do it if the function is available at runtime.
+  if (!create_symbolic_link_api)
+    return make_error_code(errc::function_not_supported);
+
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toStringRef(from_storage);
+  StringRef t = to.toStringRef(to_storage);
+
+  // Convert to utf-16.
+  SmallVector<wchar_t, 128> wide_from;
+  SmallVector<wchar_t, 128> wide_to;
+  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
+  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
+
+  if (!create_symbolic_link_api(wide_from.begin(), wide_to.begin(), 0))
+    return windows_error(::GetLastError());
+
+  return error_code::success();
+}
+
+error_code remove(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  file_status st;
+  if (error_code ec = status(path, st))
+    return ec;
+
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  if (st.type() == file_type::directory_file) {
+    if (!::RemoveDirectoryW(c_str(path_utf16))) {
+      error_code ec = windows_error(::GetLastError());
+      if (ec != windows_error::file_not_found)
+        return ec;
+      existed = false;
+    } else
+      existed = true;
+  } else {
+    if (!::DeleteFileW(c_str(path_utf16))) {
+      error_code ec = windows_error(::GetLastError());
+      if (ec != windows_error::file_not_found)
+        return ec;
+      existed = false;
+    } else
+      existed = true;
+  }
+
+  return error_code::success();
+}
+
+error_code rename(const Twine &from, const Twine &to) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toStringRef(from_storage);
+  StringRef t = to.toStringRef(to_storage);
+
+  // Convert to utf-16.
+  SmallVector<wchar_t, 128> wide_from;
+  SmallVector<wchar_t, 128> wide_to;
+  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
+  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
+
+  error_code ec = error_code::success();
+  for (int i = 0; i < 2000; i++) {
+    if (::MoveFileExW(wide_from.begin(), wide_to.begin(),
+                      MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING))
+      return error_code::success();
+    ec = windows_error(::GetLastError());
+    if (ec != windows_error::access_denied)
+      break;
+    // Retry MoveFile() at ACCESS_DENIED.
+    // System scanners (eg. indexer) might open the source file when
+    // It is written and closed.
+    ::Sleep(1);
+  }
+
+  return ec;
+}
+
+error_code resize_file(const Twine &path, uint64_t size) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  int fd = ::_wopen(path_utf16.begin(), O_BINARY | _O_RDWR, S_IWRITE);
+  if (fd == -1)
+    return error_code(errno, generic_category());
+#ifdef HAVE__CHSIZE_S
+  errno_t error = ::_chsize_s(fd, size);
+#else
+  errno_t error = ::_chsize(fd, size);
+#endif
+  ::close(fd);
+  return error_code(error, generic_category());
+}
+
+error_code exists(const Twine &path, bool &result) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  DWORD attributes = ::GetFileAttributesW(path_utf16.begin());
+
+  if (attributes == INVALID_FILE_ATTRIBUTES) {
+    // See if the file didn't actually exist.
+    error_code ec = make_error_code(windows_error(::GetLastError()));
+    if (ec != windows_error::file_not_found &&
+        ec != windows_error::path_not_found)
+      return ec;
+    result = false;
+  } else
+    result = true;
+  return error_code::success();
+}
+
+bool equivalent(file_status A, file_status B) {
+  assert(status_known(A) && status_known(B));
+  return A.FileIndexHigh      == B.FileIndexHigh &&
+         A.FileIndexLow       == B.FileIndexLow &&
+         A.FileSizeHigh       == B.FileSizeHigh &&
+         A.FileSizeLow        == B.FileSizeLow &&
+         A.LastWriteTimeHigh  == B.LastWriteTimeHigh &&
+         A.LastWriteTimeLow   == B.LastWriteTimeLow &&
+         A.VolumeSerialNumber == B.VolumeSerialNumber;
+}
+
+error_code equivalent(const Twine &A, const Twine &B, bool &result) {
+  file_status fsA, fsB;
+  if (error_code ec = status(A, fsA)) return ec;
+  if (error_code ec = status(B, fsB)) return ec;
+  result = equivalent(fsA, fsB);
+  return error_code::success();
+}
+
+error_code file_size(const Twine &path, uint64_t &result) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  WIN32_FILE_ATTRIBUTE_DATA FileData;
+  if (!::GetFileAttributesExW(path_utf16.begin(),
+                              ::GetFileExInfoStandard,
+                              &FileData))
+    return windows_error(::GetLastError());
+
+  result =
+    (uint64_t(FileData.nFileSizeHigh) << (sizeof(FileData.nFileSizeLow) * 8))
+    + FileData.nFileSizeLow;
+
+  return error_code::success();
+}
+
+static bool isReservedName(StringRef path) {
+  // This list of reserved names comes from MSDN, at:
+  // http://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx
+  static const char *sReservedNames[] = { "nul", "con", "prn", "aux",
+                              "com1", "com2", "com3", "com4", "com5", "com6",
+                              "com7", "com8", "com9", "lpt1", "lpt2", "lpt3",
+                              "lpt4", "lpt5", "lpt6", "lpt7", "lpt8", "lpt9" };
+
+  // First, check to see if this is a device namespace, which always
+  // starts with \\.\, since device namespaces are not legal file paths.
+  if (path.startswith("\\\\.\\"))
+    return true;
+
+  // Then compare against the list of ancient reserved names
+  for (size_t i = 0; i < sizeof(sReservedNames) / sizeof(const char *); ++i) {
+    if (path.equals_lower(sReservedNames[i]))
+      return true;
+  }
+
+  // The path isn't what we consider reserved.
+  return false;
+}
+
+error_code status(const Twine &path, file_status &result) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  StringRef path8 = path.toStringRef(path_storage);
+  if (isReservedName(path8)) {
+    result = file_status(file_type::character_file);
+    return error_code::success();
+  }
+
+  if (error_code ec = UTF8ToUTF16(path8, path_utf16))
+    return ec;
+
+  DWORD attr = ::GetFileAttributesW(path_utf16.begin());
+  if (attr == INVALID_FILE_ATTRIBUTES)
+    goto handle_status_error;
+
+  // Handle reparse points.
+  if (attr & FILE_ATTRIBUTE_REPARSE_POINT) {
+    ScopedFileHandle h(
+      ::CreateFileW(path_utf16.begin(),
+                    0, // Attributes only.
+                    FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
+                    NULL,
+                    OPEN_EXISTING,
+                    FILE_FLAG_BACKUP_SEMANTICS,
+                    0));
+    if (!h)
+      goto handle_status_error;
+  }
+
+  if (attr & FILE_ATTRIBUTE_DIRECTORY)
+    result = file_status(file_type::directory_file);
+  else {
+    result = file_status(file_type::regular_file);
+    ScopedFileHandle h(
+      ::CreateFileW(path_utf16.begin(),
+                    0, // Attributes only.
+                    FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
+                    NULL,
+                    OPEN_EXISTING,
+                    FILE_FLAG_BACKUP_SEMANTICS,
+                    0));
+    if (!h)
+      goto handle_status_error;
+    BY_HANDLE_FILE_INFORMATION Info;
+    if (!::GetFileInformationByHandle(h, &Info))
+      goto handle_status_error;
+    result.FileIndexHigh      = Info.nFileIndexHigh;
+    result.FileIndexLow       = Info.nFileIndexLow;
+    result.FileSizeHigh       = Info.nFileSizeHigh;
+    result.FileSizeLow        = Info.nFileSizeLow;
+    result.LastWriteTimeHigh  = Info.ftLastWriteTime.dwHighDateTime;
+    result.LastWriteTimeLow   = Info.ftLastWriteTime.dwLowDateTime;
+    result.VolumeSerialNumber = Info.dwVolumeSerialNumber;
+  }
+
+  return error_code::success();
+
+handle_status_error:
+  error_code ec = windows_error(::GetLastError());
+  if (ec == windows_error::file_not_found ||
+      ec == windows_error::path_not_found)
+    result = file_status(file_type::file_not_found);
+  else if (ec == windows_error::sharing_violation)
+    result = file_status(file_type::type_unknown);
+  else {
+    result = file_status(file_type::status_error);
+    return ec;
+  }
+
+  return error_code::success();
+}
+
+
+// Modifies permissions on a file.
+error_code permissions(const Twine &path, perms prms) {
+#if 0 // verify code below before enabling:
+  // If the permissions bits are not trying to modify
+  // "write" permissions, there is nothing to do.
+  if (!(prms & (owner_write|group_write|others_write)))
+    return error_code::success();
+  
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  DWORD attributes = ::GetFileAttributesW(path_utf16.begin());
+
+  if (prms & add_perms) {
+    attributes &= ~FILE_ATTRIBUTE_READONLY;
+  }
+  else if (prms & remove_perms) {
+    attributes |= FILE_ATTRIBUTE_READONLY;
+  }
+  else {
+    assert(0 && "neither add_perms or remove_perms is set");
+  }
+
+  if ( ! ::SetFileAttributesW(path_utf16.begin(), attributes))
+    return windows_error(::GetLastError());
+#endif    
+  return error_code::success();
+}
+
+
+// FIXME: mode should be used here and default to user r/w only,
+// it currently comes in as a UNIX mode.
+error_code unique_file(const Twine &model, int &result_fd,
+                       SmallVectorImpl<char> &result_path,
+                       bool makeAbsolute, unsigned mode) {
+  // Use result_path as temp storage.
+  result_path.set_size(0);
+  StringRef m = model.toStringRef(result_path);
+
+  SmallVector<wchar_t, 128> model_utf16;
+  if (error_code ec = UTF8ToUTF16(m, model_utf16)) return ec;
+
+  if (makeAbsolute) {
+    // Make model absolute by prepending a temp directory if it's not already.
+    bool absolute = path::is_absolute(m);
+
+    if (!absolute) {
+      SmallVector<wchar_t, 64> temp_dir;
+      if (error_code ec = TempDir(temp_dir)) return ec;
+      // Handle c: by removing it.
+      if (model_utf16.size() > 2 && model_utf16[1] == L':') {
+        model_utf16.erase(model_utf16.begin(), model_utf16.begin() + 2);
+      }
+      model_utf16.insert(model_utf16.begin(), temp_dir.begin(), temp_dir.end());
+    }
+  }
+
+  // Replace '%' with random chars. From here on, DO NOT modify model. It may be
+  // needed if the randomly chosen path already exists.
+  SmallVector<wchar_t, 128> random_path_utf16;
+
+  // Get a Crypto Provider for CryptGenRandom.
+  HCRYPTPROV HCPC;
+  if (!::CryptAcquireContextW(&HCPC,
+                              NULL,
+                              NULL,
+                              PROV_RSA_FULL,
+                              CRYPT_VERIFYCONTEXT))
+    return windows_error(::GetLastError());
+  ScopedCryptContext CryptoProvider(HCPC);
+
+retry_random_path:
+  random_path_utf16.set_size(0);
+  for (SmallVectorImpl<wchar_t>::const_iterator i = model_utf16.begin(),
+                                                e = model_utf16.end();
+                                                i != e; ++i) {
+    if (*i == L'%') {
+      BYTE val = 0;
+      if (!::CryptGenRandom(CryptoProvider, 1, &val))
+          return windows_error(::GetLastError());
+      random_path_utf16.push_back("0123456789abcdef"[val & 15]);
+    }
+    else
+      random_path_utf16.push_back(*i);
+  }
+  // Make random_path_utf16 null terminated.
+  random_path_utf16.push_back(0);
+  random_path_utf16.pop_back();
+
+  // Make sure we don't fall into an infinite loop by constantly trying
+  // to create the parent path.
+  bool TriedToCreateParent = false;
+
+  // Try to create + open the path.
+retry_create_file:
+  HANDLE TempFileHandle = ::CreateFileW(random_path_utf16.begin(),
+                                        GENERIC_READ | GENERIC_WRITE,
+                                        FILE_SHARE_READ,
+                                        NULL,
+                                        // Return ERROR_FILE_EXISTS if the file
+                                        // already exists.
+                                        CREATE_NEW,
+                                        FILE_ATTRIBUTE_TEMPORARY,
+                                        NULL);
+  if (TempFileHandle == INVALID_HANDLE_VALUE) {
+    // If the file existed, try again, otherwise, error.
+    error_code ec = windows_error(::GetLastError());
+    if (ec == windows_error::file_exists)
+      goto retry_random_path;
+    // Check for non-existing parent directories.
+    if (ec == windows_error::path_not_found && !TriedToCreateParent) {
+      TriedToCreateParent = true;
+
+      // Create the directories using result_path as temp storage.
+      if (error_code ec = UTF16ToUTF8(random_path_utf16.begin(),
+                                      random_path_utf16.size(), result_path))
+        return ec;
+      StringRef p(result_path.begin(), result_path.size());
+      SmallString<64> dir_to_create;
+      for (path::const_iterator i = path::begin(p),
+                                e = --path::end(p); i != e; ++i) {
+        path::append(dir_to_create, *i);
+        bool Exists;
+        if (error_code ec = exists(Twine(dir_to_create), Exists)) return ec;
+        if (!Exists) {
+          // If c: doesn't exist, bail.
+          if (i->endswith(":"))
+            return ec;
+
+          SmallVector<wchar_t, 64> dir_to_create_utf16;
+          if (error_code ec = UTF8ToUTF16(dir_to_create, dir_to_create_utf16))
+            return ec;
+
+          // Create the directory.
+          if (!::CreateDirectoryW(dir_to_create_utf16.begin(), NULL))
+            return windows_error(::GetLastError());
+        }
+      }
+      goto retry_create_file;
+    }
+    return ec;
+  }
+
+  // Set result_path to the utf-8 representation of the path.
+  if (error_code ec = UTF16ToUTF8(random_path_utf16.begin(),
+                                  random_path_utf16.size(), result_path)) {
+    ::CloseHandle(TempFileHandle);
+    ::DeleteFileW(random_path_utf16.begin());
+    return ec;
+  }
+
+  // Convert the Windows API file handle into a C-runtime handle.
+  int fd = ::_open_osfhandle(intptr_t(TempFileHandle), 0);
+  if (fd == -1) {
+    ::CloseHandle(TempFileHandle);
+    ::DeleteFileW(random_path_utf16.begin());
+    // MSDN doesn't say anything about _open_osfhandle setting errno or
+    // GetLastError(), so just return invalid_handle.
+    return windows_error::invalid_handle;
+  }
+
+  result_fd = fd;
+  return error_code::success();
+}
+
+error_code get_magic(const Twine &path, uint32_t len,
+                     SmallVectorImpl<char> &result) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+  result.set_size(0);
+
+  // Convert path to UTF-16.
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  // Open file.
+  HANDLE file = ::CreateFileW(c_str(path_utf16),
+                              GENERIC_READ,
+                              FILE_SHARE_READ,
+                              NULL,
+                              OPEN_EXISTING,
+                              FILE_ATTRIBUTE_READONLY,
+                              NULL);
+  if (file == INVALID_HANDLE_VALUE)
+    return windows_error(::GetLastError());
+
+  // Allocate buffer.
+  result.reserve(len);
+
+  // Get magic!
+  DWORD bytes_read = 0;
+  BOOL read_success = ::ReadFile(file, result.data(), len, &bytes_read, NULL);
+  error_code ec = windows_error(::GetLastError());
+  ::CloseHandle(file);
+  if (!read_success || (bytes_read != len)) {
+    // Set result size to the number of bytes read if it's valid.
+    if (bytes_read <= len)
+      result.set_size(bytes_read);
+    // ERROR_HANDLE_EOF is mapped to errc::value_too_large.
+    return ec;
+  }
+
+  result.set_size(len);
+  return error_code::success();
+}
+
+error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
+  FileDescriptor = FD;
+  // Make sure that the requested size fits within SIZE_T.
+  if (Size > std::numeric_limits<SIZE_T>::max()) {
+    if (FileDescriptor) {
+      if (CloseFD)
+        _close(FileDescriptor);
+    } else
+      ::CloseHandle(FileHandle);
+    return make_error_code(errc::invalid_argument);
+  }
+
+  DWORD flprotect;
+  switch (Mode) {
+  case readonly:  flprotect = PAGE_READONLY; break;
+  case readwrite: flprotect = PAGE_READWRITE; break;
+  case priv:      flprotect = PAGE_WRITECOPY; break;
+  default: llvm_unreachable("invalid mapping mode");
+  }
+
+  FileMappingHandle = ::CreateFileMapping(FileHandle,
+                                          0,
+                                          flprotect,
+                                          Size >> 32,
+                                          Size & 0xffffffff,
+                                          0);
+  if (FileMappingHandle == NULL) {
+    error_code ec = windows_error(GetLastError());
+    if (FileDescriptor) {
+      if (CloseFD)
+        _close(FileDescriptor);
+    } else
+      ::CloseHandle(FileHandle);
+    return ec;
+  }
+
+  DWORD dwDesiredAccess;
+  switch (Mode) {
+  case readonly:  dwDesiredAccess = FILE_MAP_READ; break;
+  case readwrite: dwDesiredAccess = FILE_MAP_WRITE; break;
+  case priv:      dwDesiredAccess = FILE_MAP_COPY; break;
+  default: llvm_unreachable("invalid mapping mode");
+  }
+  Mapping = ::MapViewOfFile(FileMappingHandle,
+                            dwDesiredAccess,
+                            Offset >> 32,
+                            Offset & 0xffffffff,
+                            Size);
+  if (Mapping == NULL) {
+    error_code ec = windows_error(GetLastError());
+    ::CloseHandle(FileMappingHandle);
+    if (FileDescriptor) {
+      if (CloseFD)
+        _close(FileDescriptor);
+    } else
+      ::CloseHandle(FileHandle);
+    return ec;
+  }
+
+  if (Size == 0) {
+    MEMORY_BASIC_INFORMATION mbi;
+    SIZE_T Result = VirtualQuery(Mapping, &mbi, sizeof(mbi));
+    if (Result == 0) {
+      error_code ec = windows_error(GetLastError());
+      ::UnmapViewOfFile(Mapping);
+      ::CloseHandle(FileMappingHandle);
+      if (FileDescriptor) {
+        if (CloseFD)
+          _close(FileDescriptor);
+      } else
+        ::CloseHandle(FileHandle);
+      return ec;
+    }
+    Size = mbi.RegionSize;
+  }
+
+  // Close all the handles except for the view. It will keep the other handles
+  // alive.
+  ::CloseHandle(FileMappingHandle);
+  if (FileDescriptor) {
+    if (CloseFD)
+      _close(FileDescriptor); // Also closes FileHandle.
+  } else
+    ::CloseHandle(FileHandle);
+  return error_code::success();
+}
+
+mapped_file_region::mapped_file_region(const Twine &path,
+                                       mapmode mode,
+                                       uint64_t length,
+                                       uint64_t offset,
+                                       error_code &ec) 
+  : Mode(mode)
+  , Size(length)
+  , Mapping()
+  , FileDescriptor()
+  , FileHandle(INVALID_HANDLE_VALUE)
+  , FileMappingHandle() {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  // Convert path to UTF-16.
+  if ((ec = UTF8ToUTF16(path.toStringRef(path_storage), path_utf16)))
+    return;
+
+  // Get file handle for creating a file mapping.
+  FileHandle = ::CreateFileW(c_str(path_utf16),
+                             Mode == readonly ? GENERIC_READ
+                                              : GENERIC_READ | GENERIC_WRITE,
+                             Mode == readonly ? FILE_SHARE_READ
+                                              : 0,
+                             0,
+                             Mode == readonly ? OPEN_EXISTING
+                                              : OPEN_ALWAYS,
+                             Mode == readonly ? FILE_ATTRIBUTE_READONLY
+                                              : FILE_ATTRIBUTE_NORMAL,
+                             0);
+  if (FileHandle == INVALID_HANDLE_VALUE) {
+    ec = windows_error(::GetLastError());
+    return;
+  }
+
+  FileDescriptor = 0;
+  ec = init(FileDescriptor, true, offset);
+  if (ec) {
+    Mapping = FileMappingHandle = 0;
+    FileHandle = INVALID_HANDLE_VALUE;
+    FileDescriptor = 0;
+  }
+}
+
+mapped_file_region::mapped_file_region(int fd,
+                                       bool closefd,
+                                       mapmode mode,
+                                       uint64_t length,
+                                       uint64_t offset,
+                                       error_code &ec)
+  : Mode(mode)
+  , Size(length)
+  , Mapping()
+  , FileDescriptor(fd)
+  , FileHandle(INVALID_HANDLE_VALUE)
+  , FileMappingHandle() {
+  FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
+  if (FileHandle == INVALID_HANDLE_VALUE) {
+    if (closefd)
+      _close(FileDescriptor);
+    FileDescriptor = 0;
+    ec = make_error_code(errc::bad_file_descriptor);
+    return;
+  }
+
+  ec = init(FileDescriptor, closefd, offset);
+  if (ec) {
+    Mapping = FileMappingHandle = 0;
+    FileHandle = INVALID_HANDLE_VALUE;
+    FileDescriptor = 0;
+  }
+}
+
+mapped_file_region::~mapped_file_region() {
+  if (Mapping)
+    ::UnmapViewOfFile(Mapping);
+}
+
+#if LLVM_HAS_RVALUE_REFERENCES
+mapped_file_region::mapped_file_region(mapped_file_region &&other)
+  : Mode(other.Mode)
+  , Size(other.Size)
+  , Mapping(other.Mapping)
+  , FileDescriptor(other.FileDescriptor)
+  , FileHandle(other.FileHandle)
+  , FileMappingHandle(other.FileMappingHandle) {
+  other.Mapping = other.FileMappingHandle = 0;
+  other.FileHandle = INVALID_HANDLE_VALUE;
+  other.FileDescriptor = 0;
+}
+#endif
+
+mapped_file_region::mapmode mapped_file_region::flags() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return Mode;
+}
+
+uint64_t mapped_file_region::size() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return Size;
+}
+
+char *mapped_file_region::data() const {
+  assert(Mode != readonly && "Cannot get non const data for readonly mapping!");
+  assert(Mapping && "Mapping failed but used anyway!");
+  return reinterpret_cast<char*>(Mapping);
+}
+
+const char *mapped_file_region::const_data() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return reinterpret_cast<const char*>(Mapping);
+}
+
+int mapped_file_region::alignment() {
+  SYSTEM_INFO SysInfo;
+  ::GetSystemInfo(&SysInfo);
+  return SysInfo.dwAllocationGranularity;
+}
+
+error_code detail::directory_iterator_construct(detail::DirIterState &it,
+                                                StringRef path){
+  SmallVector<wchar_t, 128> path_utf16;
+
+  if (error_code ec = UTF8ToUTF16(path,
+                                  path_utf16))
+    return ec;
+
+  // Convert path to the format that Windows is happy with.
+  if (path_utf16.size() > 0 &&
+      !is_separator(path_utf16[path.size() - 1]) &&
+      path_utf16[path.size() - 1] != L':') {
+    path_utf16.push_back(L'\\');
+    path_utf16.push_back(L'*');
+  } else {
+    path_utf16.push_back(L'*');
+  }
+
+  //  Get the first directory entry.
+  WIN32_FIND_DATAW FirstFind;
+  ScopedFindHandle FindHandle(::FindFirstFileW(c_str(path_utf16), &FirstFind));
+  if (!FindHandle)
+    return windows_error(::GetLastError());
+
+  size_t FilenameLen = ::wcslen(FirstFind.cFileName);
+  while ((FilenameLen == 1 && FirstFind.cFileName[0] == L'.') ||
+         (FilenameLen == 2 && FirstFind.cFileName[0] == L'.' &&
+                              FirstFind.cFileName[1] == L'.'))
+    if (!::FindNextFileW(FindHandle, &FirstFind)) {
+      error_code ec = windows_error(::GetLastError());
+      // Check for end.
+      if (ec == windows_error::no_more_files)
+        return detail::directory_iterator_destruct(it);
+      return ec;
+    } else
+      FilenameLen = ::wcslen(FirstFind.cFileName);
+
+  // Construct the current directory entry.
+  SmallString<128> directory_entry_name_utf8;
+  if (error_code ec = UTF16ToUTF8(FirstFind.cFileName,
+                                  ::wcslen(FirstFind.cFileName),
+                                  directory_entry_name_utf8))
+    return ec;
+
+  it.IterationHandle = intptr_t(FindHandle.take());
+  SmallString<128> directory_entry_path(path);
+  path::append(directory_entry_path, directory_entry_name_utf8.str());
+  it.CurrentEntry = directory_entry(directory_entry_path.str());
+
+  return error_code::success();
+}
+
+error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
+  if (it.IterationHandle != 0)
+    // Closes the handle if it's valid.
+    ScopedFindHandle close(HANDLE(it.IterationHandle));
+  it.IterationHandle = 0;
+  it.CurrentEntry = directory_entry();
+  return error_code::success();
+}
+
+error_code detail::directory_iterator_increment(detail::DirIterState &it) {
+  WIN32_FIND_DATAW FindData;
+  if (!::FindNextFileW(HANDLE(it.IterationHandle), &FindData)) {
+    error_code ec = windows_error(::GetLastError());
+    // Check for end.
+    if (ec == windows_error::no_more_files)
+      return detail::directory_iterator_destruct(it);
+    return ec;
+  }
+
+  size_t FilenameLen = ::wcslen(FindData.cFileName);
+  if ((FilenameLen == 1 && FindData.cFileName[0] == L'.') ||
+      (FilenameLen == 2 && FindData.cFileName[0] == L'.' &&
+                           FindData.cFileName[1] == L'.'))
+    return directory_iterator_increment(it);
+
+  SmallString<128> directory_entry_path_utf8;
+  if (error_code ec = UTF16ToUTF8(FindData.cFileName,
+                                  ::wcslen(FindData.cFileName),
+                                  directory_entry_path_utf8))
+    return ec;
+
+  it.CurrentEntry.replace_filename(Twine(directory_entry_path_utf8));
+  return error_code::success();
+}
+
+error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,  
+                                            bool map_writable, void *&result) {
+  assert(0 && "NOT IMPLEMENTED");
+  return windows_error::invalid_function;
+}
+
+error_code unmap_file_pages(void *base, size_t size) {
+  assert(0 && "NOT IMPLEMENTED");
+  return windows_error::invalid_function;
+}
+
+
+
+} // end namespace fs
+} // end namespace sys
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Support/Windows/Process.inc b/contrib/llvm/lib/Support/Windows/Process.inc
new file mode 100644
index 000000000000..ad9412852f10
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Process.inc
@@ -0,0 +1,278 @@
+//===- Win32/Process.cpp - Win32 Process Implementation ------- -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of the Process class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+#include <direct.h>
+#include <io.h>
+#include <malloc.h>
+#include <psapi.h>
+
+#ifdef __MINGW32__
+ #if (HAVE_LIBPSAPI != 1)
+  #error "libpsapi.a should be present"
+ #endif
+#else
+ #pragma comment(lib, "psapi.lib")
+#endif
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//===          and must not be UNIX code
+//===----------------------------------------------------------------------===//
+
+#ifdef __MINGW32__
+// This ban should be lifted when MinGW 1.0+ has defined this value.
+#  define _HEAPOK (-2)
+#endif
+
+using namespace llvm;
+using namespace sys;
+
+
+process::id_type self_process::get_id() {
+  return GetCurrentProcess();
+}
+
+static TimeValue getTimeValueFromFILETIME(FILETIME Time) {
+  ULARGE_INTEGER TimeInteger;
+  TimeInteger.LowPart = Time.dwLowDateTime;
+  TimeInteger.HighPart = Time.dwHighDateTime;
+
+  // FILETIME's are # of 100 nanosecond ticks (1/10th of a microsecond)
+  return TimeValue(
+      static_cast<TimeValue::SecondsType>(TimeInteger.QuadPart / 10000000),
+      static_cast<TimeValue::NanoSecondsType>(
+          (TimeInteger.QuadPart % 10000000) * 100));
+}
+
+TimeValue self_process::get_user_time() const {
+  FILETIME ProcCreate, ProcExit, KernelTime, UserTime;
+  if (GetProcessTimes(GetCurrentProcess(), &ProcCreate, &ProcExit, &KernelTime,
+                      &UserTime) == 0)
+    return TimeValue();
+
+  return getTimeValueFromFILETIME(UserTime);
+}
+
+TimeValue self_process::get_system_time() const {
+  FILETIME ProcCreate, ProcExit, KernelTime, UserTime;
+  if (GetProcessTimes(GetCurrentProcess(), &ProcCreate, &ProcExit, &KernelTime,
+                      &UserTime) == 0)
+    return TimeValue();
+
+  return getTimeValueFromFILETIME(KernelTime);
+}
+
+// This function retrieves the page size using GetSystemInfo and is present
+// solely so it can be called once to initialize the self_process member below.
+static unsigned getPageSize() {
+  // NOTE: A 32-bit application running under WOW64 is supposed to use
+  // GetNativeSystemInfo.  However, this interface is not present prior
+  // to Windows XP so to use it requires dynamic linking.  It is not clear
+  // how this affects the reported page size, if at all.  One could argue
+  // that LLVM ought to run as 64-bits on a 64-bit system, anyway.
+  SYSTEM_INFO info;
+  GetSystemInfo(&info);
+  return static_cast<unsigned>(info.dwPageSize);
+}
+
+// This constructor guaranteed to be run exactly once on a single thread, and
+// sets up various process invariants that can be queried cheaply from then on.
+self_process::self_process() : PageSize(getPageSize()) {
+}
+
+
+size_t
+Process::GetMallocUsage()
+{
+  _HEAPINFO hinfo;
+  hinfo._pentry = NULL;
+
+  size_t size = 0;
+
+  while (_heapwalk(&hinfo) == _HEAPOK)
+    size += hinfo._size;
+
+  return size;
+}
+
+void Process::GetTimeUsage(TimeValue &elapsed, TimeValue &user_time,
+                           TimeValue &sys_time) {
+  elapsed = TimeValue::now();
+
+  FILETIME ProcCreate, ProcExit, KernelTime, UserTime;
+  if (GetProcessTimes(GetCurrentProcess(), &ProcCreate, &ProcExit, &KernelTime,
+                      &UserTime) == 0)
+    return;
+
+  user_time = getTimeValueFromFILETIME(UserTime);
+  sys_time = getTimeValueFromFILETIME(KernelTime);
+}
+
+int Process::GetCurrentUserId()
+{
+  return 65536;
+}
+
+int Process::GetCurrentGroupId()
+{
+  return 65536;
+}
+
+// Some LLVM programs such as bugpoint produce core files as a normal part of
+// their operation. To prevent the disk from filling up, this configuration item
+// does what's necessary to prevent their generation.
+void Process::PreventCoreFiles() {
+  // Windows doesn't do core files, but it does do modal pop-up message
+  // boxes.  As this method is used by bugpoint, preventing these pop-ups
+  // is the moral equivalent of suppressing core files.
+  SetErrorMode(SEM_FAILCRITICALERRORS |
+               SEM_NOGPFAULTERRORBOX |
+               SEM_NOOPENFILEERRORBOX);
+}
+
+bool Process::StandardInIsUserInput() {
+  return FileDescriptorIsDisplayed(0);
+}
+
+bool Process::StandardOutIsDisplayed() {
+  return FileDescriptorIsDisplayed(1);
+}
+
+bool Process::StandardErrIsDisplayed() {
+  return FileDescriptorIsDisplayed(2);
+}
+
+bool Process::FileDescriptorIsDisplayed(int fd) {
+  DWORD Mode;  // Unused
+  return (GetConsoleMode((HANDLE)_get_osfhandle(fd), &Mode) != 0);
+}
+
+unsigned Process::StandardOutColumns() {
+  unsigned Columns = 0;
+  CONSOLE_SCREEN_BUFFER_INFO csbi;
+  if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi))
+    Columns = csbi.dwSize.X;
+  return Columns;
+}
+
+unsigned Process::StandardErrColumns() {
+  unsigned Columns = 0;
+  CONSOLE_SCREEN_BUFFER_INFO csbi;
+  if (GetConsoleScreenBufferInfo(GetStdHandle(STD_ERROR_HANDLE), &csbi))
+    Columns = csbi.dwSize.X;
+  return Columns;
+}
+
+// The terminal always has colors.
+bool Process::FileDescriptorHasColors(int fd) {
+  return FileDescriptorIsDisplayed(fd);
+}
+
+bool Process::StandardOutHasColors() {
+  return FileDescriptorHasColors(1);
+}
+
+bool Process::StandardErrHasColors() {
+  return FileDescriptorHasColors(2);
+}
+
+namespace {
+class DefaultColors
+{
+  private:
+    WORD defaultColor;
+  public:
+    DefaultColors()
+     :defaultColor(GetCurrentColor()) {}
+    static unsigned GetCurrentColor() {
+      CONSOLE_SCREEN_BUFFER_INFO csbi;
+      if (GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi))
+        return csbi.wAttributes;
+      return 0;
+    }
+    WORD operator()() const { return defaultColor; }
+};
+
+DefaultColors defaultColors;
+}
+
+bool Process::ColorNeedsFlush() {
+  return true;
+}
+
+const char *Process::OutputBold(bool bg) {
+  WORD colors = DefaultColors::GetCurrentColor();
+  if (bg)
+    colors |= BACKGROUND_INTENSITY;
+  else
+    colors |= FOREGROUND_INTENSITY;
+  SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), colors);
+  return 0;
+}
+
+const char *Process::OutputColor(char code, bool bold, bool bg) {
+  WORD colors;
+  if (bg) {
+    colors = ((code&1) ? BACKGROUND_RED : 0) |
+      ((code&2) ? BACKGROUND_GREEN : 0 ) |
+      ((code&4) ? BACKGROUND_BLUE : 0);
+    if (bold)
+      colors |= BACKGROUND_INTENSITY;
+  } else {
+    colors = ((code&1) ? FOREGROUND_RED : 0) |
+      ((code&2) ? FOREGROUND_GREEN : 0 ) |
+      ((code&4) ? FOREGROUND_BLUE : 0);
+    if (bold)
+      colors |= FOREGROUND_INTENSITY;
+  }
+  SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), colors);
+  return 0;
+}
+
+static WORD GetConsoleTextAttribute(HANDLE hConsoleOutput) {
+  CONSOLE_SCREEN_BUFFER_INFO info;
+  GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &info);
+  return info.wAttributes;
+}
+
+const char *Process::OutputReverse() {
+  const WORD attributes
+   = GetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE));
+
+  const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN |
+    FOREGROUND_RED | FOREGROUND_INTENSITY;
+  const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN |
+    BACKGROUND_RED | BACKGROUND_INTENSITY;
+  const WORD color_mask = foreground_mask | background_mask;
+
+  WORD new_attributes =
+    ((attributes & FOREGROUND_BLUE     )?BACKGROUND_BLUE     :0) |
+    ((attributes & FOREGROUND_GREEN    )?BACKGROUND_GREEN    :0) |
+    ((attributes & FOREGROUND_RED      )?BACKGROUND_RED      :0) |
+    ((attributes & FOREGROUND_INTENSITY)?BACKGROUND_INTENSITY:0) |
+    ((attributes & BACKGROUND_BLUE     )?FOREGROUND_BLUE     :0) |
+    ((attributes & BACKGROUND_GREEN    )?FOREGROUND_GREEN    :0) |
+    ((attributes & BACKGROUND_RED      )?FOREGROUND_RED      :0) |
+    ((attributes & BACKGROUND_INTENSITY)?FOREGROUND_INTENSITY:0) |
+    0;
+  new_attributes = (attributes & ~color_mask) | (new_attributes & color_mask);
+
+  SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), new_attributes);
+  return 0;
+}
+
+const char *Process::ResetColor() {
+  SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), defaultColors());
+  return 0;
+}
diff --git a/contrib/llvm/lib/Support/Windows/Program.inc b/contrib/llvm/lib/Support/Windows/Program.inc
new file mode 100644
index 000000000000..691d6d455501
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Program.inc
@@ -0,0 +1,399 @@
+//===- Win32/Program.cpp - Win32 Program Implementation ------- -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of the Program class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+#include <cstdio>
+#include <fcntl.h>
+#include <io.h>
+#include <malloc.h>
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//===          and must not be UNIX code
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct Win32ProcessInfo {
+    HANDLE hProcess;
+    DWORD  dwProcessId;
+  };
+}
+
+namespace llvm {
+using namespace sys;
+
+Program::Program() : Data_(0) {}
+
+Program::~Program() {
+  if (Data_) {
+    Win32ProcessInfo* wpi = reinterpret_cast<Win32ProcessInfo*>(Data_);
+    CloseHandle(wpi->hProcess);
+    delete wpi;
+    Data_ = 0;
+  }
+}
+
+// This function just uses the PATH environment variable to find the program.
+Path
+Program::FindProgramByName(const std::string& progName) {
+
+  // Check some degenerate cases
+  if (progName.length() == 0) // no program
+    return Path();
+  Path temp;
+  if (!temp.set(progName)) // invalid name
+    return Path();
+  // Return paths with slashes verbatim.
+  if (progName.find('\\') != std::string::npos ||
+      progName.find('/') != std::string::npos)
+    return temp;
+
+  // At this point, the file name is valid and does not contain slashes.
+  // Let Windows search for it.
+  char buffer[MAX_PATH];
+  char *dummy = NULL;
+  DWORD len = SearchPath(NULL, progName.c_str(), ".exe", MAX_PATH,
+                         buffer, &dummy);
+
+  // See if it wasn't found.
+  if (len == 0)
+    return Path();
+
+  // See if we got the entire path.
+  if (len < MAX_PATH)
+    return Path(buffer);
+
+  // Buffer was too small; grow and retry.
+  while (true) {
+    char *b = reinterpret_cast<char *>(_alloca(len+1));
+    DWORD len2 = SearchPath(NULL, progName.c_str(), ".exe", len+1, b, &dummy);
+
+    // It is unlikely the search failed, but it's always possible some file
+    // was added or removed since the last search, so be paranoid...
+    if (len2 == 0)
+      return Path();
+    else if (len2 <= len)
+      return Path(b);
+
+    len = len2;
+  }
+}
+
+static HANDLE RedirectIO(const Path *path, int fd, std::string* ErrMsg) {
+  HANDLE h;
+  if (path == 0) {
+    DuplicateHandle(GetCurrentProcess(), (HANDLE)_get_osfhandle(fd),
+                    GetCurrentProcess(), &h,
+                    0, TRUE, DUPLICATE_SAME_ACCESS);
+    return h;
+  }
+
+  const char *fname;
+  if (path->isEmpty())
+    fname = "NUL";
+  else
+    fname = path->c_str();
+
+  SECURITY_ATTRIBUTES sa;
+  sa.nLength = sizeof(sa);
+  sa.lpSecurityDescriptor = 0;
+  sa.bInheritHandle = TRUE;
+
+  h = CreateFile(fname, fd ? GENERIC_WRITE : GENERIC_READ, FILE_SHARE_READ,
+                 &sa, fd == 0 ? OPEN_EXISTING : CREATE_ALWAYS,
+                 FILE_ATTRIBUTE_NORMAL, NULL);
+  if (h == INVALID_HANDLE_VALUE) {
+    MakeErrMsg(ErrMsg, std::string(fname) + ": Can't open file for " +
+        (fd ? "input: " : "output: "));
+  }
+
+  return h;
+}
+
+/// ArgNeedsQuotes - Check whether argument needs to be quoted when calling
+/// CreateProcess.
+static bool ArgNeedsQuotes(const char *Str) {
+  return Str[0] == '\0' || strpbrk(Str, "\t \"&\'()*<>\\`^|") != 0;
+}
+
+
+/// ArgLenWithQuotes - Check whether argument needs to be quoted when calling
+/// CreateProcess and returns length of quoted arg with escaped quotes
+static unsigned int ArgLenWithQuotes(const char *Str) {
+  unsigned int len = ArgNeedsQuotes(Str) ? 2 : 0;
+
+  while (*Str != '\0') {
+    if (*Str == '\"')
+      ++len;
+
+    ++len;
+    ++Str;
+  }
+
+  return len;
+}
+
+
+bool
+Program::Execute(const Path& path,
+                 const char** args,
+                 const char** envp,
+                 const Path** redirects,
+                 unsigned memoryLimit,
+                 std::string* ErrMsg) {
+  if (Data_) {
+    Win32ProcessInfo* wpi = reinterpret_cast<Win32ProcessInfo*>(Data_);
+    CloseHandle(wpi->hProcess);
+    delete wpi;
+    Data_ = 0;
+  }
+
+  if (!path.canExecute()) {
+    if (ErrMsg)
+      *ErrMsg = "program not executable";
+    return false;
+  }
+
+  // Windows wants a command line, not an array of args, to pass to the new
+  // process.  We have to concatenate them all, while quoting the args that
+  // have embedded spaces (or are empty).
+
+  // First, determine the length of the command line.
+  unsigned len = 0;
+  for (unsigned i = 0; args[i]; i++) {
+    len += ArgLenWithQuotes(args[i]) + 1;
+  }
+
+  // Now build the command line.
+  char *command = reinterpret_cast<char *>(_alloca(len+1));
+  char *p = command;
+
+  for (unsigned i = 0; args[i]; i++) {
+    const char *arg = args[i];
+
+    bool needsQuoting = ArgNeedsQuotes(arg);
+    if (needsQuoting)
+      *p++ = '"';
+
+    while (*arg != '\0') {
+      if (*arg == '\"')
+        *p++ = '\\';
+
+      *p++ = *arg++;
+    }
+
+    if (needsQuoting)
+      *p++ = '"';
+    *p++ = ' ';
+  }
+
+  *p = 0;
+
+  // The pointer to the environment block for the new process.
+  char *envblock = 0;
+
+  if (envp) {
+    // An environment block consists of a null-terminated block of
+    // null-terminated strings. Convert the array of environment variables to
+    // an environment block by concatenating them.
+
+    // First, determine the length of the environment block.
+    len = 0;
+    for (unsigned i = 0; envp[i]; i++)
+      len += strlen(envp[i]) + 1;
+
+    // Now build the environment block.
+    envblock = reinterpret_cast<char *>(_alloca(len+1));
+    p = envblock;
+
+    for (unsigned i = 0; envp[i]; i++) {
+      const char *ev = envp[i];
+      size_t len = strlen(ev) + 1;
+      memcpy(p, ev, len);
+      p += len;
+    }
+
+    *p = 0;
+  }
+
+  // Create a child process.
+  STARTUPINFO si;
+  memset(&si, 0, sizeof(si));
+  si.cb = sizeof(si);
+  si.hStdInput = INVALID_HANDLE_VALUE;
+  si.hStdOutput = INVALID_HANDLE_VALUE;
+  si.hStdError = INVALID_HANDLE_VALUE;
+
+  if (redirects) {
+    si.dwFlags = STARTF_USESTDHANDLES;
+
+    si.hStdInput = RedirectIO(redirects[0], 0, ErrMsg);
+    if (si.hStdInput == INVALID_HANDLE_VALUE) {
+      MakeErrMsg(ErrMsg, "can't redirect stdin");
+      return false;
+    }
+    si.hStdOutput = RedirectIO(redirects[1], 1, ErrMsg);
+    if (si.hStdOutput == INVALID_HANDLE_VALUE) {
+      CloseHandle(si.hStdInput);
+      MakeErrMsg(ErrMsg, "can't redirect stdout");
+      return false;
+    }
+    if (redirects[1] && redirects[2] && *(redirects[1]) == *(redirects[2])) {
+      // If stdout and stderr should go to the same place, redirect stderr
+      // to the handle already open for stdout.
+      DuplicateHandle(GetCurrentProcess(), si.hStdOutput,
+                      GetCurrentProcess(), &si.hStdError,
+                      0, TRUE, DUPLICATE_SAME_ACCESS);
+    } else {
+      // Just redirect stderr
+      si.hStdError = RedirectIO(redirects[2], 2, ErrMsg);
+      if (si.hStdError == INVALID_HANDLE_VALUE) {
+        CloseHandle(si.hStdInput);
+        CloseHandle(si.hStdOutput);
+        MakeErrMsg(ErrMsg, "can't redirect stderr");
+        return false;
+      }
+    }
+  }
+
+  PROCESS_INFORMATION pi;
+  memset(&pi, 0, sizeof(pi));
+
+  fflush(stdout);
+  fflush(stderr);
+  BOOL rc = CreateProcess(path.c_str(), command, NULL, NULL, TRUE, 0,
+                          envblock, NULL, &si, &pi);
+  DWORD err = GetLastError();
+
+  // Regardless of whether the process got created or not, we are done with
+  // the handles we created for it to inherit.
+  CloseHandle(si.hStdInput);
+  CloseHandle(si.hStdOutput);
+  CloseHandle(si.hStdError);
+
+  // Now return an error if the process didn't get created.
+  if (!rc) {
+    SetLastError(err);
+    MakeErrMsg(ErrMsg, std::string("Couldn't execute program '") +
+               path.str() + "'");
+    return false;
+  }
+  Win32ProcessInfo* wpi = new Win32ProcessInfo;
+  wpi->hProcess = pi.hProcess;
+  wpi->dwProcessId = pi.dwProcessId;
+  Data_ = wpi;
+
+  // Make sure these get closed no matter what.
+  ScopedCommonHandle hThread(pi.hThread);
+
+  // Assign the process to a job if a memory limit is defined.
+  ScopedJobHandle hJob;
+  if (memoryLimit != 0) {
+    hJob = CreateJobObject(0, 0);
+    bool success = false;
+    if (hJob) {
+      JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli;
+      memset(&jeli, 0, sizeof(jeli));
+      jeli.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_PROCESS_MEMORY;
+      jeli.ProcessMemoryLimit = uintptr_t(memoryLimit) * 1048576;
+      if (SetInformationJobObject(hJob, JobObjectExtendedLimitInformation,
+                                  &jeli, sizeof(jeli))) {
+        if (AssignProcessToJobObject(hJob, pi.hProcess))
+          success = true;
+      }
+    }
+    if (!success) {
+      SetLastError(GetLastError());
+      MakeErrMsg(ErrMsg, std::string("Unable to set memory limit"));
+      TerminateProcess(pi.hProcess, 1);
+      WaitForSingleObject(pi.hProcess, INFINITE);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+int
+Program::Wait(const Path &path,
+              unsigned secondsToWait,
+              std::string* ErrMsg) {
+  if (Data_ == 0) {
+    MakeErrMsg(ErrMsg, "Process not started!");
+    return -1;
+  }
+
+  Win32ProcessInfo* wpi = reinterpret_cast<Win32ProcessInfo*>(Data_);
+  HANDLE hProcess = wpi->hProcess;
+
+  // Wait for the process to terminate.
+  DWORD millisecondsToWait = INFINITE;
+  if (secondsToWait > 0)
+    millisecondsToWait = secondsToWait * 1000;
+
+  if (WaitForSingleObject(hProcess, millisecondsToWait) == WAIT_TIMEOUT) {
+    if (!TerminateProcess(hProcess, 1)) {
+      MakeErrMsg(ErrMsg, "Failed to terminate timed-out program.");
+      // -2 indicates a crash or timeout as opposed to failure to execute.
+      return -2;
+    }
+    WaitForSingleObject(hProcess, INFINITE);
+  }
+
+  // Get its exit status.
+  DWORD status;
+  BOOL rc = GetExitCodeProcess(hProcess, &status);
+  DWORD err = GetLastError();
+
+  if (!rc) {
+    SetLastError(err);
+    MakeErrMsg(ErrMsg, "Failed getting status for program.");
+    // -2 indicates a crash or timeout as opposed to failure to execute.
+    return -2;
+  }
+
+  if (!status)
+    return 0;
+
+  // Pass 10(Warning) and 11(Error) to the callee as negative value.
+  if ((status & 0xBFFF0000U) == 0x80000000U)
+    return (int)status;
+
+  if (status & 0xFF)
+    return status & 0x7FFFFFFF;
+
+  return 1;
+}
+
+error_code Program::ChangeStdinToBinary(){
+  int result = _setmode( _fileno(stdin), _O_BINARY );
+  if (result == -1)
+    return error_code(errno, generic_category());
+  return make_error_code(errc::success);
+}
+
+error_code Program::ChangeStdoutToBinary(){
+  int result = _setmode( _fileno(stdout), _O_BINARY );
+  if (result == -1)
+    return error_code(errno, generic_category());
+  return make_error_code(errc::success);
+}
+
+error_code Program::ChangeStderrToBinary(){
+  int result = _setmode( _fileno(stderr), _O_BINARY );
+  if (result == -1)
+    return error_code(errno, generic_category());
+  return make_error_code(errc::success);
+}
+
+}
diff --git a/contrib/llvm/lib/Support/Windows/RWMutex.inc b/contrib/llvm/lib/Support/Windows/RWMutex.inc
new file mode 100644
index 000000000000..95939237ba61
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/RWMutex.inc
@@ -0,0 +1,134 @@
+//= llvm/Support/Win32/Mutex.inc - Win32 Reader/Writer Mutual Exclusion Lock  =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Win32 specific (non-pthread) RWMutex class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Win32 code that
+//===          is guaranteed to work on *all* Win32 variants.
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+
+namespace llvm {
+using namespace sys;
+
+// Windows has slim read-writer lock support on Vista and higher, so we
+// will attempt to load the APIs.  If they exist, we will use them, and
+// if not, we will fall back on critical sections.  When we drop support
+// for XP, we can stop lazy-loading these APIs and just use them directly.
+#if defined(__MINGW32__)
+  // Taken from WinNT.h
+  typedef struct _RTL_SRWLOCK {
+    PVOID Ptr;
+  } RTL_SRWLOCK, *PRTL_SRWLOCK;
+
+  // Taken from WinBase.h
+  typedef RTL_SRWLOCK SRWLOCK, *PSRWLOCK;
+#endif
+
+static VOID (WINAPI *fpInitializeSRWLock)(PSRWLOCK lock) = NULL;
+static VOID (WINAPI *fpAcquireSRWLockExclusive)(PSRWLOCK lock) = NULL;
+static VOID (WINAPI *fpAcquireSRWLockShared)(PSRWLOCK lock) = NULL;
+static VOID (WINAPI *fpReleaseSRWLockExclusive)(PSRWLOCK lock) = NULL;
+static VOID (WINAPI *fpReleaseSRWLockShared)(PSRWLOCK lock) = NULL;
+
+static bool sHasSRW = false;
+
+static bool loadSRW() {
+  static bool sChecked = false;
+  if (!sChecked) {
+    sChecked = true;
+
+    HMODULE hLib = ::LoadLibrary(TEXT("Kernel32"));
+    if (hLib) {
+      fpInitializeSRWLock =
+        (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+                                               "InitializeSRWLock");
+      fpAcquireSRWLockExclusive =
+        (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+                                               "AcquireSRWLockExclusive");
+      fpAcquireSRWLockShared =
+        (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+                                               "AcquireSRWLockShared");
+      fpReleaseSRWLockExclusive =
+        (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+                                               "ReleaseSRWLockExclusive");
+      fpReleaseSRWLockShared =
+        (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+                                               "ReleaseSRWLockShared");
+      ::FreeLibrary(hLib);
+
+      if (fpInitializeSRWLock != NULL) {
+        sHasSRW = true;
+      }
+    }
+  }
+  return sHasSRW;
+}
+
+RWMutexImpl::RWMutexImpl() {
+  if (loadSRW()) {
+    data_ = calloc(1, sizeof(SRWLOCK));
+    fpInitializeSRWLock(static_cast<PSRWLOCK>(data_));
+  } else {
+    data_ = calloc(1, sizeof(CRITICAL_SECTION));
+    InitializeCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+  }
+}
+
+RWMutexImpl::~RWMutexImpl() {
+  if (sHasSRW) {
+    // Nothing to do in the case of slim reader/writers
+  } else {
+    DeleteCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+    free(data_);
+  }
+}
+
+bool RWMutexImpl::reader_acquire() {
+  if (sHasSRW) {
+    fpAcquireSRWLockShared(static_cast<PSRWLOCK>(data_));
+  } else {
+    EnterCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+  }
+  return true;
+}
+
+bool RWMutexImpl::reader_release() {
+  if (sHasSRW) {
+    fpReleaseSRWLockShared(static_cast<PSRWLOCK>(data_));
+  } else {
+    LeaveCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+  }
+  return true;
+}
+
+bool RWMutexImpl::writer_acquire() {
+  if (sHasSRW) {
+    fpAcquireSRWLockExclusive(static_cast<PSRWLOCK>(data_));
+  } else {
+    EnterCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+  }
+  return true;
+}
+
+bool RWMutexImpl::writer_release() {
+  if (sHasSRW) {
+    fpReleaseSRWLockExclusive(static_cast<PSRWLOCK>(data_));
+  } else {
+    LeaveCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+  }
+  return true;
+}
+
+
+}
diff --git a/contrib/llvm/lib/Support/Windows/Signals.inc b/contrib/llvm/lib/Support/Windows/Signals.inc
new file mode 100644
index 000000000000..3dd6660b031d
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Signals.inc
@@ -0,0 +1,492 @@
+//===- Win32/Signals.cpp - Win32 Signals Implementation ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of the Signals class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+#include <algorithm>
+#include <stdio.h>
+#include <vector>
+
+#ifdef __MINGW32__
+ #include <imagehlp.h>
+#else
+ #include <dbghelp.h>
+#endif
+#include <psapi.h>
+
+#ifdef _MSC_VER
+ #pragma comment(lib, "psapi.lib")
+ #pragma comment(lib, "dbghelp.lib")
+#elif __MINGW32__
+ #if ((HAVE_LIBIMAGEHLP != 1) || (HAVE_LIBPSAPI != 1))
+  #error "libimagehlp.a & libpsapi.a should be present"
+ #endif
+ // The version of g++ that comes with MinGW does *not* properly understand
+ // the ll format specifier for printf. However, MinGW passes the format
+ // specifiers on to the MSVCRT entirely, and the CRT understands the ll
+ // specifier. So these warnings are spurious in this case. Since we compile
+ // with -Wall, this will generate these warnings which should be ignored. So
+ // we will turn off the warnings for this just file. However, MinGW also does
+ // not support push and pop for diagnostics, so we have to manually turn it
+ // back on at the end of the file.
+ #pragma GCC diagnostic ignored "-Wformat"
+ #pragma GCC diagnostic ignored "-Wformat-extra-args"
+
+ #if !defined(__MINGW64_VERSION_MAJOR)
+ // MinGW.org does not have updated support for the 64-bit versions of the
+ // DebugHlp APIs. So we will have to load them manually. The structures and
+ // method signatures were pulled from DbgHelp.h in the Windows Platform SDK,
+ // and adjusted for brevity.
+ typedef struct _IMAGEHLP_LINE64 {
+   DWORD    SizeOfStruct;
+   PVOID    Key;
+   DWORD    LineNumber;
+   PCHAR    FileName;
+   DWORD64  Address;
+ } IMAGEHLP_LINE64, *PIMAGEHLP_LINE64;
+
+ typedef struct _IMAGEHLP_SYMBOL64 {
+   DWORD   SizeOfStruct;
+   DWORD64 Address;
+   DWORD   Size;
+   DWORD   Flags;
+   DWORD   MaxNameLength;
+   CHAR    Name[1];
+ } IMAGEHLP_SYMBOL64, *PIMAGEHLP_SYMBOL64;
+
+ typedef struct _tagADDRESS64 {
+   DWORD64       Offset;
+   WORD          Segment;
+   ADDRESS_MODE  Mode;
+ } ADDRESS64, *LPADDRESS64;
+
+ typedef struct _KDHELP64 {
+   DWORD64   Thread;
+   DWORD   ThCallbackStack;
+   DWORD   ThCallbackBStore;
+   DWORD   NextCallback;
+   DWORD   FramePointer;
+   DWORD64   KiCallUserMode;
+   DWORD64   KeUserCallbackDispatcher;
+   DWORD64   SystemRangeStart;
+   DWORD64   KiUserExceptionDispatcher;
+   DWORD64   StackBase;
+   DWORD64   StackLimit;
+   DWORD64   Reserved[5];
+ } KDHELP64, *PKDHELP64;
+
+ typedef struct _tagSTACKFRAME64 {
+   ADDRESS64   AddrPC;
+   ADDRESS64   AddrReturn;
+   ADDRESS64   AddrFrame;
+   ADDRESS64   AddrStack;
+   ADDRESS64   AddrBStore;
+   PVOID       FuncTableEntry;
+   DWORD64     Params[4];
+   BOOL        Far;
+   BOOL        Virtual;
+   DWORD64     Reserved[3];
+   KDHELP64    KdHelp;
+ } STACKFRAME64, *LPSTACKFRAME64;
+
+typedef BOOL (__stdcall *PREAD_PROCESS_MEMORY_ROUTINE64)(HANDLE hProcess,
+                      DWORD64 qwBaseAddress, PVOID lpBuffer, DWORD nSize,
+                      LPDWORD lpNumberOfBytesRead);
+
+typedef PVOID (__stdcall *PFUNCTION_TABLE_ACCESS_ROUTINE64)( HANDLE ahProcess,
+                      DWORD64 AddrBase);
+
+typedef DWORD64 (__stdcall *PGET_MODULE_BASE_ROUTINE64)(HANDLE hProcess,
+                      DWORD64 Address);
+
+typedef DWORD64 (__stdcall *PTRANSLATE_ADDRESS_ROUTINE64)(HANDLE hProcess,
+                      HANDLE hThread, LPADDRESS64 lpaddr);
+
+typedef BOOL (WINAPI *fpStackWalk64)(DWORD, HANDLE, HANDLE, LPSTACKFRAME64,
+                      PVOID, PREAD_PROCESS_MEMORY_ROUTINE64,
+                      PFUNCTION_TABLE_ACCESS_ROUTINE64,
+                      PGET_MODULE_BASE_ROUTINE64,
+                      PTRANSLATE_ADDRESS_ROUTINE64);
+static fpStackWalk64 StackWalk64;
+
+typedef DWORD64 (WINAPI *fpSymGetModuleBase64)(HANDLE, DWORD64);
+static fpSymGetModuleBase64 SymGetModuleBase64;
+
+typedef BOOL (WINAPI *fpSymGetSymFromAddr64)(HANDLE, DWORD64,
+                      PDWORD64, PIMAGEHLP_SYMBOL64);
+static fpSymGetSymFromAddr64 SymGetSymFromAddr64;
+
+typedef BOOL (WINAPI *fpSymGetLineFromAddr64)(HANDLE, DWORD64,
+                      PDWORD, PIMAGEHLP_LINE64);
+static fpSymGetLineFromAddr64 SymGetLineFromAddr64;
+
+typedef PVOID (WINAPI *fpSymFunctionTableAccess64)(HANDLE, DWORD64);
+static fpSymFunctionTableAccess64 SymFunctionTableAccess64;
+
+static bool load64BitDebugHelp(void) {
+  HMODULE hLib = ::LoadLibrary("Dbghelp.dll");
+  if (hLib) {
+    StackWalk64 = (fpStackWalk64)
+                      ::GetProcAddress(hLib, "StackWalk64");
+    SymGetModuleBase64 = (fpSymGetModuleBase64)
+                      ::GetProcAddress(hLib, "SymGetModuleBase64");
+    SymGetSymFromAddr64 = (fpSymGetSymFromAddr64)
+                      ::GetProcAddress(hLib, "SymGetSymFromAddr64");
+    SymGetLineFromAddr64 = (fpSymGetLineFromAddr64)
+                      ::GetProcAddress(hLib, "SymGetLineFromAddr64");
+    SymFunctionTableAccess64 = (fpSymFunctionTableAccess64)
+                     ::GetProcAddress(hLib, "SymFunctionTableAccess64");
+  }
+  return StackWalk64 != NULL;
+}
+ #endif // !defined(__MINGW64_VERSION_MAJOR)
+#endif // __MINGW32__
+
+// Forward declare.
+static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep);
+static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType);
+
+// InterruptFunction - The function to call if ctrl-c is pressed.
+static void (*InterruptFunction)() = 0;
+
+static std::vector<llvm::sys::Path> *FilesToRemove = NULL;
+static std::vector<std::pair<void(*)(void*), void*> > *CallBacksToRun = 0;
+static bool RegisteredUnhandledExceptionFilter = false;
+static bool CleanupExecuted = false;
+static bool ExitOnUnhandledExceptions = false;
+static PTOP_LEVEL_EXCEPTION_FILTER OldFilter = NULL;
+
+// Windows creates a new thread to execute the console handler when an event
+// (such as CTRL/C) occurs.  This causes concurrency issues with the above
+// globals which this critical section addresses.
+static CRITICAL_SECTION CriticalSection;
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code
+//===          and must not be UNIX code
+//===----------------------------------------------------------------------===//
+
+#ifdef _MSC_VER
+/// CRTReportHook - Function called on a CRT debugging event.
+static int CRTReportHook(int ReportType, char *Message, int *Return) {
+  // Don't cause a DebugBreak() on return.
+  if (Return)
+    *Return = 0;
+
+  switch (ReportType) {
+  default:
+  case _CRT_ASSERT:
+    fprintf(stderr, "CRT assert: %s\n", Message);
+    // FIXME: Is there a way to just crash? Perhaps throw to the unhandled
+    // exception code? Perhaps SetErrorMode() handles this.
+    _exit(3);
+    break;
+  case _CRT_ERROR:
+    fprintf(stderr, "CRT error: %s\n", Message);
+    // FIXME: Is there a way to just crash? Perhaps throw to the unhandled
+    // exception code? Perhaps SetErrorMode() handles this.
+    _exit(3);
+    break;
+  case _CRT_WARN:
+    fprintf(stderr, "CRT warn: %s\n", Message);
+    break;
+  }
+
+  // Don't call _CrtDbgReport.
+  return TRUE;
+}
+#endif
+
+static void RegisterHandler() {
+#if __MINGW32__ && !defined(__MINGW64_VERSION_MAJOR)
+  // On MinGW.org, we need to load up the symbols explicitly, because the
+  // Win32 framework they include does not have support for the 64-bit
+  // versions of the APIs we need.  If we cannot load up the APIs (which
+  // would be unexpected as they should exist on every version of Windows
+  // we support), we will bail out since there would be nothing to report.
+  if (!load64BitDebugHelp()) {
+    assert(false && "These APIs should always be available");
+    return;
+  }
+#endif
+
+  if (RegisteredUnhandledExceptionFilter) {
+    EnterCriticalSection(&CriticalSection);
+    return;
+  }
+
+  // Now's the time to create the critical section.  This is the first time
+  // through here, and there's only one thread.
+  InitializeCriticalSection(&CriticalSection);
+
+  // Enter it immediately.  Now if someone hits CTRL/C, the console handler
+  // can't proceed until the globals are updated.
+  EnterCriticalSection(&CriticalSection);
+
+  RegisteredUnhandledExceptionFilter = true;
+  OldFilter = SetUnhandledExceptionFilter(LLVMUnhandledExceptionFilter);
+  SetConsoleCtrlHandler(LLVMConsoleCtrlHandler, TRUE);
+
+  // Environment variable to disable any kind of crash dialog.
+  if (getenv("LLVM_DISABLE_CRASH_REPORT")) {
+#ifdef _MSC_VER
+    _CrtSetReportHook(CRTReportHook);
+#endif
+    SetErrorMode(SEM_FAILCRITICALERRORS |
+                 SEM_NOGPFAULTERRORBOX |
+                 SEM_NOOPENFILEERRORBOX);
+    ExitOnUnhandledExceptions = true;
+  }
+
+  // IMPORTANT NOTE: Caller must call LeaveCriticalSection(&CriticalSection) or
+  // else multi-threading problems will ensue.
+}
+
+// RemoveFileOnSignal - The public API
+bool sys::RemoveFileOnSignal(const sys::Path &Filename, std::string* ErrMsg) {
+  RegisterHandler();
+
+  if (CleanupExecuted) {
+    if (ErrMsg)
+      *ErrMsg = "Process terminating -- cannot register for removal";
+    return true;
+  }
+
+  if (FilesToRemove == NULL)
+    FilesToRemove = new std::vector<sys::Path>;
+
+  FilesToRemove->push_back(Filename);
+
+  LeaveCriticalSection(&CriticalSection);
+  return false;
+}
+
+// DontRemoveFileOnSignal - The public API
+void sys::DontRemoveFileOnSignal(const sys::Path &Filename) {
+  if (FilesToRemove == NULL)
+    return;
+
+  RegisterHandler();
+
+  FilesToRemove->push_back(Filename);
+  std::vector<sys::Path>::reverse_iterator I =
+  std::find(FilesToRemove->rbegin(), FilesToRemove->rend(), Filename);
+  if (I != FilesToRemove->rend())
+    FilesToRemove->erase(I.base()-1);
+
+  LeaveCriticalSection(&CriticalSection);
+}
+
+/// PrintStackTraceOnErrorSignal - When an error signal (such as SIBABRT or
+/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
+void sys::PrintStackTraceOnErrorSignal() {
+  RegisterHandler();
+  LeaveCriticalSection(&CriticalSection);
+}
+
+void llvm::sys::PrintStackTrace(FILE *) {
+  // FIXME: Implement.
+}
+
+
+void sys::SetInterruptFunction(void (*IF)()) {
+  RegisterHandler();
+  InterruptFunction = IF;
+  LeaveCriticalSection(&CriticalSection);
+}
+
+
+/// AddSignalHandler - Add a function to be called when a signal is delivered
+/// to the process.  The handler can have a cookie passed to it to identify
+/// what instance of the handler it is.
+void sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
+  if (CallBacksToRun == 0)
+    CallBacksToRun = new std::vector<std::pair<void(*)(void*), void*> >();
+  CallBacksToRun->push_back(std::make_pair(FnPtr, Cookie));
+  RegisterHandler();
+  LeaveCriticalSection(&CriticalSection);
+}
+}
+
+static void Cleanup() {
+  EnterCriticalSection(&CriticalSection);
+
+  // Prevent other thread from registering new files and directories for
+  // removal, should we be executing because of the console handler callback.
+  CleanupExecuted = true;
+
+  // FIXME: open files cannot be deleted.
+
+  if (FilesToRemove != NULL)
+    while (!FilesToRemove->empty()) {
+      FilesToRemove->back().eraseFromDisk();
+      FilesToRemove->pop_back();
+    }
+
+  if (CallBacksToRun)
+    for (unsigned i = 0, e = CallBacksToRun->size(); i != e; ++i)
+      (*CallBacksToRun)[i].first((*CallBacksToRun)[i].second);
+
+  LeaveCriticalSection(&CriticalSection);
+}
+
+void llvm::sys::RunInterruptHandlers() {
+  Cleanup();
+}
+
+static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
+  Cleanup();
+
+  // Initialize the STACKFRAME structure.
+  STACKFRAME64 StackFrame;
+  memset(&StackFrame, 0, sizeof(StackFrame));
+
+  DWORD machineType;
+#if defined(_M_X64)
+  machineType = IMAGE_FILE_MACHINE_AMD64;
+  StackFrame.AddrPC.Offset = ep->ContextRecord->Rip;
+  StackFrame.AddrPC.Mode = AddrModeFlat;
+  StackFrame.AddrStack.Offset = ep->ContextRecord->Rsp;
+  StackFrame.AddrStack.Mode = AddrModeFlat;
+  StackFrame.AddrFrame.Offset = ep->ContextRecord->Rbp;
+  StackFrame.AddrFrame.Mode = AddrModeFlat;
+#elif defined(_M_IX86)
+  machineType = IMAGE_FILE_MACHINE_I386;
+  StackFrame.AddrPC.Offset = ep->ContextRecord->Eip;
+  StackFrame.AddrPC.Mode = AddrModeFlat;
+  StackFrame.AddrStack.Offset = ep->ContextRecord->Esp;
+  StackFrame.AddrStack.Mode = AddrModeFlat;
+  StackFrame.AddrFrame.Offset = ep->ContextRecord->Ebp;
+  StackFrame.AddrFrame.Mode = AddrModeFlat;
+#endif
+
+  HANDLE hProcess = GetCurrentProcess();
+  HANDLE hThread = GetCurrentThread();
+
+  // Initialize the symbol handler.
+  SymSetOptions(SYMOPT_DEFERRED_LOADS|SYMOPT_LOAD_LINES);
+  SymInitialize(hProcess, NULL, TRUE);
+
+  while (true) {
+    if (!StackWalk64(machineType, hProcess, hThread, &StackFrame,
+                   ep->ContextRecord, NULL, SymFunctionTableAccess64,
+                   SymGetModuleBase64, NULL)) {
+      break;
+    }
+
+    if (StackFrame.AddrFrame.Offset == 0)
+      break;
+
+    // Print the PC in hexadecimal.
+    DWORD64 PC = StackFrame.AddrPC.Offset;
+#if defined(_M_X64)
+    fprintf(stderr, "0x%016llX", PC);
+#elif defined(_M_IX86)
+    fprintf(stderr, "0x%08lX", static_cast<DWORD>(PC));
+#endif
+
+    // Print the parameters.  Assume there are four.
+#if defined(_M_X64)
+    fprintf(stderr, " (0x%016llX 0x%016llX 0x%016llX 0x%016llX)",
+                StackFrame.Params[0],
+                StackFrame.Params[1],
+                StackFrame.Params[2],
+                StackFrame.Params[3]);
+#elif defined(_M_IX86)
+    fprintf(stderr, " (0x%08lX 0x%08lX 0x%08lX 0x%08lX)",
+                static_cast<DWORD>(StackFrame.Params[0]),
+                static_cast<DWORD>(StackFrame.Params[1]),
+                static_cast<DWORD>(StackFrame.Params[2]),
+                static_cast<DWORD>(StackFrame.Params[3]));
+#endif
+    // Verify the PC belongs to a module in this process.
+    if (!SymGetModuleBase64(hProcess, PC)) {
+      fputs(" <unknown module>\n", stderr);
+      continue;
+    }
+
+    // Print the symbol name.
+    char buffer[512];
+    IMAGEHLP_SYMBOL64 *symbol = reinterpret_cast<IMAGEHLP_SYMBOL64 *>(buffer);
+    memset(symbol, 0, sizeof(IMAGEHLP_SYMBOL64));
+    symbol->SizeOfStruct = sizeof(IMAGEHLP_SYMBOL64);
+    symbol->MaxNameLength = 512 - sizeof(IMAGEHLP_SYMBOL64);
+
+    DWORD64 dwDisp;
+    if (!SymGetSymFromAddr64(hProcess, PC, &dwDisp, symbol)) {
+      fputc('\n', stderr);
+      continue;
+    }
+
+    buffer[511] = 0;
+    if (dwDisp > 0)
+      fprintf(stderr, ", %s() + 0x%llX bytes(s)", symbol->Name, dwDisp);
+    else
+      fprintf(stderr, ", %s", symbol->Name);
+
+    // Print the source file and line number information.
+    IMAGEHLP_LINE64 line;
+    DWORD dwLineDisp;
+    memset(&line, 0, sizeof(line));
+    line.SizeOfStruct = sizeof(line);
+    if (SymGetLineFromAddr64(hProcess, PC, &dwLineDisp, &line)) {
+      fprintf(stderr, ", %s, line %lu", line.FileName, line.LineNumber);
+      if (dwLineDisp > 0)
+        fprintf(stderr, " + 0x%lX byte(s)", dwLineDisp);
+    }
+
+    fputc('\n', stderr);
+  }
+
+  if (ExitOnUnhandledExceptions)
+    _exit(ep->ExceptionRecord->ExceptionCode);
+
+  // Allow dialog box to pop up allowing choice to start debugger.
+  if (OldFilter)
+    return (*OldFilter)(ep);
+  else
+    return EXCEPTION_CONTINUE_SEARCH;
+}
+
+static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType) {
+  // We are running in our very own thread, courtesy of Windows.
+  EnterCriticalSection(&CriticalSection);
+  Cleanup();
+
+  // If an interrupt function has been set, go and run one it; otherwise,
+  // the process dies.
+  void (*IF)() = InterruptFunction;
+  InterruptFunction = 0;      // Don't run it on another CTRL-C.
+
+  if (IF) {
+    // Note: if the interrupt function throws an exception, there is nothing
+    // to catch it in this thread so it will kill the process.
+    IF();                     // Run it now.
+    LeaveCriticalSection(&CriticalSection);
+    return TRUE;              // Don't kill the process.
+  }
+
+  // Allow normal processing to take place; i.e., the process dies.
+  LeaveCriticalSection(&CriticalSection);
+  return FALSE;
+}
+
+#if __MINGW32__
+ // We turned these warnings off for this file so that MinGW-g++ doesn't
+ // complain about the ll format specifiers used.  Now we are turning the
+ // warnings back on.  If MinGW starts to support diagnostic stacks, we can
+ // replace this with a pop.
+ #pragma GCC diagnostic warning "-Wformat"
+ #pragma GCC diagnostic warning "-Wformat-extra-args"
+#endif
diff --git a/contrib/llvm/lib/Support/Windows/ThreadLocal.inc b/contrib/llvm/lib/Support/Windows/ThreadLocal.inc
new file mode 100644
index 000000000000..057deb325d6e
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/ThreadLocal.inc
@@ -0,0 +1,53 @@
+//= llvm/Support/Win32/ThreadLocal.inc - Win32 Thread Local Data -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Win32 specific (non-pthread) ThreadLocal class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Win32 code that
+//===          is guaranteed to work on *all* Win32 variants.
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+#include "llvm/Support/ThreadLocal.h"
+
+namespace llvm {
+using namespace sys;
+
+ThreadLocalImpl::ThreadLocalImpl() : data() {
+  typedef int SIZE_TOO_BIG[sizeof(DWORD) <= sizeof(data) ? 1 : -1];
+  DWORD* tls = reinterpret_cast<DWORD*>(&data);
+  *tls = TlsAlloc();
+  assert(*tls != TLS_OUT_OF_INDEXES);
+}
+
+ThreadLocalImpl::~ThreadLocalImpl() {
+  DWORD* tls = reinterpret_cast<DWORD*>(&data);
+  TlsFree(*tls);
+}
+
+const void* ThreadLocalImpl::getInstance() {
+  DWORD* tls = reinterpret_cast<DWORD*>(&data);
+  return TlsGetValue(*tls);
+}
+
+void ThreadLocalImpl::setInstance(const void* d){
+  DWORD* tls = reinterpret_cast<DWORD*>(&data);
+  int errorcode = TlsSetValue(*tls, const_cast<void*>(d));
+  assert(errorcode != 0);
+  (void)errorcode;
+}
+
+void ThreadLocalImpl::removeInstance() {
+  setInstance(0);
+}
+
+}
diff --git a/contrib/llvm/lib/Support/Windows/TimeValue.inc b/contrib/llvm/lib/Support/Windows/TimeValue.inc
new file mode 100644
index 000000000000..12275526f1c8
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/TimeValue.inc
@@ -0,0 +1,51 @@
+//===- Win32/TimeValue.cpp - Win32 TimeValue Implementation -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 implementation of the TimeValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Windows.h"
+#include <time.h>
+
+namespace llvm {
+using namespace sys;
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only Win32 specific code.
+//===----------------------------------------------------------------------===//
+
+TimeValue TimeValue::now() {
+  uint64_t ft;
+  GetSystemTimeAsFileTime(reinterpret_cast<FILETIME *>(&ft));
+
+  TimeValue t(0, 0);
+  t.fromWin32Time(ft);
+  return t;
+}
+
+std::string TimeValue::str() const {
+#ifdef __MINGW32__
+  // This ban may be lifted by either:
+  // (i) a future MinGW version other than 1.0 inherents the __time64_t type, or
+  // (ii) configure tests for either the time_t or __time64_t type.
+  time_t ourTime = time_t(this->toEpochTime());
+  struct tm *lt = ::localtime(&ourTime);
+#else
+  __time64_t ourTime = this->toEpochTime();
+  struct tm *lt = ::_localtime64(&ourTime);
+#endif
+
+  char buffer[25];
+  strftime(buffer, 25, "%a %b %d %H:%M:%S %Y", lt);
+  return std::string(buffer);
+}
+
+
+}
diff --git a/contrib/llvm/lib/Support/Windows/Watchdog.inc b/contrib/llvm/lib/Support/Windows/Watchdog.inc
new file mode 100644
index 000000000000..fab2bdf2a941
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Watchdog.inc
@@ -0,0 +1,24 @@
+//===--- Windows/Watchdog.inc - Windows Watchdog Implementation -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the generic Windows implementation of the Watchdog class.
+//
+//===----------------------------------------------------------------------===//
+
+// TODO: implement.
+// Currently this is only used by PrettyStackTrace which is also unimplemented
+// on Windows. Roughly, a Windows implementation would use CreateWaitableTimer
+// and a second thread to run the TimerAPCProc.
+
+namespace llvm {
+  namespace sys {
+    Watchdog::Watchdog(unsigned int seconds) {}
+    Watchdog::~Watchdog() {}
+  }
+}
diff --git a/contrib/llvm/lib/Support/Windows/Windows.h b/contrib/llvm/lib/Support/Windows/Windows.h
new file mode 100644
index 000000000000..5c1da0d617aa
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Windows.h
@@ -0,0 +1,150 @@
+//===- Win32/Win32.h - Common Win32 Include File ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines things specific to Win32 implementations.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Win32 code that
+//===          is guaranteed to work on *all* Win32 variants.
+//===----------------------------------------------------------------------===//
+
+// mingw-w64 tends to define it as 0x0502 in its headers.
+#undef _WIN32_WINNT
+
+// Require at least Windows XP(5.1) API.
+#define _WIN32_WINNT 0x0501
+#define _WIN32_IE    0x0600 // MinGW at it again.
+#define WIN32_LEAN_AND_MEAN
+
+#include "llvm/Config/config.h" // Get build system configuration settings
+#include <windows.h>
+#include <wincrypt.h>
+#include <shlobj.h>
+#include <cassert>
+#include <string>
+
+inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) {
+  if (!ErrMsg)
+    return true;
+  char *buffer = NULL;
+  FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER|FORMAT_MESSAGE_FROM_SYSTEM,
+      NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL);
+  *ErrMsg = prefix + buffer;
+  LocalFree(buffer);
+  return true;
+}
+
+template <typename HandleTraits>
+class ScopedHandle {
+  typedef typename HandleTraits::handle_type handle_type;
+  handle_type Handle;
+
+  ScopedHandle(const ScopedHandle &other); // = delete;
+  void operator=(const ScopedHandle &other); // = delete;
+public:
+  ScopedHandle()
+    : Handle(HandleTraits::GetInvalid()) {}
+
+  explicit ScopedHandle(handle_type h)
+    : Handle(h) {}
+
+  ~ScopedHandle() {
+    if (HandleTraits::IsValid(Handle))
+      HandleTraits::Close(Handle);
+  }
+
+  handle_type take() {
+    handle_type t = Handle;
+    Handle = HandleTraits::GetInvalid();
+    return t;
+  }
+
+  ScopedHandle &operator=(handle_type h) {
+    if (HandleTraits::IsValid(Handle))
+      HandleTraits::Close(Handle);
+    Handle = h;
+    return *this;
+  }
+
+  // True if Handle is valid.
+  operator bool() const {
+    return HandleTraits::IsValid(Handle) ? true : false;
+  }
+
+  operator handle_type() const {
+    return Handle;
+  }
+};
+
+struct CommonHandleTraits {
+  typedef HANDLE handle_type;
+
+  static handle_type GetInvalid() {
+    return INVALID_HANDLE_VALUE;
+  }
+
+  static void Close(handle_type h) {
+    ::CloseHandle(h);
+  }
+
+  static bool IsValid(handle_type h) {
+    return h != GetInvalid();
+  }
+};
+
+struct JobHandleTraits : CommonHandleTraits {
+  static handle_type GetInvalid() {
+    return NULL;
+  }
+};
+
+struct CryptContextTraits : CommonHandleTraits {
+  typedef HCRYPTPROV handle_type;
+
+  static handle_type GetInvalid() {
+    return 0;
+  }
+
+  static void Close(handle_type h) {
+    ::CryptReleaseContext(h, 0);
+  }
+
+  static bool IsValid(handle_type h) {
+    return h != GetInvalid();
+  }
+};
+
+struct FindHandleTraits : CommonHandleTraits {
+  static void Close(handle_type h) {
+    ::FindClose(h);
+  }
+};
+
+struct FileHandleTraits : CommonHandleTraits {};
+
+typedef ScopedHandle<CommonHandleTraits> ScopedCommonHandle;
+typedef ScopedHandle<FileHandleTraits>   ScopedFileHandle;
+typedef ScopedHandle<CryptContextTraits> ScopedCryptContext;
+typedef ScopedHandle<FindHandleTraits>   ScopedFindHandle;
+typedef ScopedHandle<JobHandleTraits>    ScopedJobHandle;
+
+namespace llvm {
+template <class T>
+class SmallVectorImpl;
+
+template <class T>
+typename SmallVectorImpl<T>::const_pointer
+c_str(SmallVectorImpl<T> &str) {
+  str.push_back(0);
+  str.pop_back();
+  return str.data();
+}
+} // end namespace llvm.
diff --git a/contrib/llvm/lib/Support/Windows/explicit_symbols.inc b/contrib/llvm/lib/Support/Windows/explicit_symbols.inc
new file mode 100644
index 000000000000..379645d2ff60
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/explicit_symbols.inc
@@ -0,0 +1,66 @@
+/* in libgcc.a */
+
+#ifdef HAVE__ALLOCA
+  EXPLICIT_SYMBOL(_alloca)
+  EXPLICIT_SYMBOL2(alloca, _alloca)
+#endif
+#ifdef HAVE___ALLOCA
+  EXPLICIT_SYMBOL(__alloca)
+#endif
+#ifdef HAVE___CHKSTK
+  EXPLICIT_SYMBOL(__chkstk)
+#endif
+#ifdef HAVE____CHKSTK
+  EXPLICIT_SYMBOL(___chkstk)
+#endif
+#ifdef HAVE___MAIN
+  EXPLICIT_SYMBOL(__main) // FIXME: Don't call it.
+#endif
+
+#ifdef HAVE___ASHLDI3
+  EXPLICIT_SYMBOL(__ashldi3)
+#endif
+#ifdef HAVE___ASHRDI3
+  EXPLICIT_SYMBOL(__ashrdi3)
+#endif
+#ifdef HAVE___CMPDI2 // FIXME: unused
+  EXPLICIT_SYMBOL(__cmpdi2)
+#endif
+#ifdef HAVE___DIVDI3
+  EXPLICIT_SYMBOL(__divdi3)
+#endif
+#ifdef HAVE___FIXDFDI
+  EXPLICIT_SYMBOL(__fixdfdi)
+#endif
+#ifdef HAVE___FIXSFDI
+  EXPLICIT_SYMBOL(__fixsfdi)
+#endif
+#ifdef HAVE___FIXUNSDFDI
+  EXPLICIT_SYMBOL(__fixunsdfdi)
+#endif
+#ifdef HAVE___FIXUNSSFDI
+  EXPLICIT_SYMBOL(__fixunssfdi)
+#endif
+#ifdef HAVE___FLOATDIDF
+  EXPLICIT_SYMBOL(__floatdidf)
+#endif
+#ifdef HAVE___FLOATDISF
+  EXPLICIT_SYMBOL(__floatdisf)
+#endif
+#ifdef HAVE___LSHRDI3
+  EXPLICIT_SYMBOL(__lshrdi3)
+#endif
+#ifdef HAVE___MODDI3
+  EXPLICIT_SYMBOL(__moddi3)
+#endif
+#ifdef HAVE___UDIVDI3
+  EXPLICIT_SYMBOL(__udivdi3)
+#endif
+#ifdef HAVE___UMODDI3
+  EXPLICIT_SYMBOL(__umoddi3)
+#endif
+
+/* msvcrt */
+#if defined(_MSC_VER)
+  EXPLICIT_SYMBOL2(alloca, _alloca_probe)
+#endif
diff --git a/contrib/llvm/lib/Support/Windows/system_error.inc b/contrib/llvm/lib/Support/Windows/system_error.inc
new file mode 100644
index 000000000000..37ec81dd363c
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/system_error.inc
@@ -0,0 +1,142 @@
+//===- llvm/Support/Win32/system_error.inc - Windows error_code --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Windows specific implementation of the error_code
+// and error_condition classes.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only generic Windows code that
+//===          is guaranteed to work on *all* Windows variants.
+//===----------------------------------------------------------------------===//
+
+#include <windows.h>
+#include <winerror.h>
+
+using namespace llvm;
+
+std::string
+_system_error_category::message(int ev) const {
+  LPVOID lpMsgBuf = 0;
+  DWORD retval = ::FormatMessageA(
+    FORMAT_MESSAGE_ALLOCATE_BUFFER |
+    FORMAT_MESSAGE_FROM_SYSTEM |
+    FORMAT_MESSAGE_IGNORE_INSERTS,
+    NULL,
+    ev,
+    MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), // Default language
+    (LPSTR) &lpMsgBuf,
+    0,
+    NULL);
+  if (retval == 0) {
+    ::LocalFree(lpMsgBuf);
+    return std::string("Unknown error");
+  }
+
+  std::string str( static_cast<LPCSTR>(lpMsgBuf) );
+  ::LocalFree(lpMsgBuf);
+
+  while (str.size()
+     && (str[str.size()-1] == '\n' || str[str.size()-1] == '\r'))
+    str.erase( str.size()-1 );
+  if (str.size() && str[str.size()-1] == '.')
+    str.erase( str.size()-1 );
+  return str;
+}
+
+// I'd rather not double the line count of the following.
+#define MAP_ERR_TO_COND(x, y) case x: return make_error_condition(errc::y)
+
+error_condition
+_system_error_category::default_error_condition(int ev) const {
+  switch (ev) {
+  MAP_ERR_TO_COND(0, success);
+  // Windows system -> posix_errno decode table  ---------------------------//
+  // see WinError.h comments for descriptions of errors
+  MAP_ERR_TO_COND(ERROR_ACCESS_DENIED,       permission_denied);
+  MAP_ERR_TO_COND(ERROR_ALREADY_EXISTS,      file_exists);
+  MAP_ERR_TO_COND(ERROR_BAD_UNIT,            no_such_device);
+  MAP_ERR_TO_COND(ERROR_BUFFER_OVERFLOW,     filename_too_long);
+  MAP_ERR_TO_COND(ERROR_BUSY,                device_or_resource_busy);
+  MAP_ERR_TO_COND(ERROR_BUSY_DRIVE,          device_or_resource_busy);
+  MAP_ERR_TO_COND(ERROR_CANNOT_MAKE,         permission_denied);
+  MAP_ERR_TO_COND(ERROR_CANTOPEN,            io_error);
+  MAP_ERR_TO_COND(ERROR_CANTREAD,            io_error);
+  MAP_ERR_TO_COND(ERROR_CANTWRITE,           io_error);
+  MAP_ERR_TO_COND(ERROR_CURRENT_DIRECTORY,   permission_denied);
+  MAP_ERR_TO_COND(ERROR_DEV_NOT_EXIST,       no_such_device);
+  MAP_ERR_TO_COND(ERROR_DEVICE_IN_USE,       device_or_resource_busy);
+  MAP_ERR_TO_COND(ERROR_DIR_NOT_EMPTY,       directory_not_empty);
+  MAP_ERR_TO_COND(ERROR_DIRECTORY,           invalid_argument);
+  MAP_ERR_TO_COND(ERROR_DISK_FULL,           no_space_on_device);
+  MAP_ERR_TO_COND(ERROR_FILE_EXISTS,         file_exists);
+  MAP_ERR_TO_COND(ERROR_FILE_NOT_FOUND,      no_such_file_or_directory);
+  MAP_ERR_TO_COND(ERROR_HANDLE_DISK_FULL,    no_space_on_device);
+  MAP_ERR_TO_COND(ERROR_HANDLE_EOF,          value_too_large);
+  MAP_ERR_TO_COND(ERROR_INVALID_ACCESS,      permission_denied);
+  MAP_ERR_TO_COND(ERROR_INVALID_DRIVE,       no_such_device);
+  MAP_ERR_TO_COND(ERROR_INVALID_FUNCTION,    function_not_supported);
+  MAP_ERR_TO_COND(ERROR_INVALID_HANDLE,      invalid_argument);
+  MAP_ERR_TO_COND(ERROR_INVALID_NAME,        invalid_argument);
+  MAP_ERR_TO_COND(ERROR_LOCK_VIOLATION,      no_lock_available);
+  MAP_ERR_TO_COND(ERROR_LOCKED,              no_lock_available);
+  MAP_ERR_TO_COND(ERROR_NEGATIVE_SEEK,       invalid_argument);
+  MAP_ERR_TO_COND(ERROR_NOACCESS,            permission_denied);
+  MAP_ERR_TO_COND(ERROR_NOT_ENOUGH_MEMORY,   not_enough_memory);
+  MAP_ERR_TO_COND(ERROR_NOT_READY,           resource_unavailable_try_again);
+  MAP_ERR_TO_COND(ERROR_NOT_SAME_DEVICE,     cross_device_link);
+  MAP_ERR_TO_COND(ERROR_OPEN_FAILED,         io_error);
+  MAP_ERR_TO_COND(ERROR_OPEN_FILES,          device_or_resource_busy);
+  MAP_ERR_TO_COND(ERROR_OPERATION_ABORTED,   operation_canceled);
+  MAP_ERR_TO_COND(ERROR_OUTOFMEMORY,         not_enough_memory);
+  MAP_ERR_TO_COND(ERROR_PATH_NOT_FOUND,      no_such_file_or_directory);
+  MAP_ERR_TO_COND(ERROR_BAD_NETPATH,         no_such_file_or_directory);
+  MAP_ERR_TO_COND(ERROR_READ_FAULT,          io_error);
+  MAP_ERR_TO_COND(ERROR_RETRY,               resource_unavailable_try_again);
+  MAP_ERR_TO_COND(ERROR_SEEK,                io_error);
+  MAP_ERR_TO_COND(ERROR_SHARING_VIOLATION,   permission_denied);
+  MAP_ERR_TO_COND(ERROR_TOO_MANY_OPEN_FILES, too_many_files_open);
+  MAP_ERR_TO_COND(ERROR_WRITE_FAULT,         io_error);
+  MAP_ERR_TO_COND(ERROR_WRITE_PROTECT,       permission_denied);
+  MAP_ERR_TO_COND(ERROR_SEM_TIMEOUT,         timed_out);
+  MAP_ERR_TO_COND(WSAEACCES,                 permission_denied);
+  MAP_ERR_TO_COND(WSAEADDRINUSE,             address_in_use);
+  MAP_ERR_TO_COND(WSAEADDRNOTAVAIL,          address_not_available);
+  MAP_ERR_TO_COND(WSAEAFNOSUPPORT,           address_family_not_supported);
+  MAP_ERR_TO_COND(WSAEALREADY,               connection_already_in_progress);
+  MAP_ERR_TO_COND(WSAEBADF,                  bad_file_descriptor);
+  MAP_ERR_TO_COND(WSAECONNABORTED,           connection_aborted);
+  MAP_ERR_TO_COND(WSAECONNREFUSED,           connection_refused);
+  MAP_ERR_TO_COND(WSAECONNRESET,             connection_reset);
+  MAP_ERR_TO_COND(WSAEDESTADDRREQ,           destination_address_required);
+  MAP_ERR_TO_COND(WSAEFAULT,                 bad_address);
+  MAP_ERR_TO_COND(WSAEHOSTUNREACH,           host_unreachable);
+  MAP_ERR_TO_COND(WSAEINPROGRESS,            operation_in_progress);
+  MAP_ERR_TO_COND(WSAEINTR,                  interrupted);
+  MAP_ERR_TO_COND(WSAEINVAL,                 invalid_argument);
+  MAP_ERR_TO_COND(WSAEISCONN,                already_connected);
+  MAP_ERR_TO_COND(WSAEMFILE,                 too_many_files_open);
+  MAP_ERR_TO_COND(WSAEMSGSIZE,               message_size);
+  MAP_ERR_TO_COND(WSAENAMETOOLONG,           filename_too_long);
+  MAP_ERR_TO_COND(WSAENETDOWN,               network_down);
+  MAP_ERR_TO_COND(WSAENETRESET,              network_reset);
+  MAP_ERR_TO_COND(WSAENETUNREACH,            network_unreachable);
+  MAP_ERR_TO_COND(WSAENOBUFS,                no_buffer_space);
+  MAP_ERR_TO_COND(WSAENOPROTOOPT,            no_protocol_option);
+  MAP_ERR_TO_COND(WSAENOTCONN,               not_connected);
+  MAP_ERR_TO_COND(WSAENOTSOCK,               not_a_socket);
+  MAP_ERR_TO_COND(WSAEOPNOTSUPP,             operation_not_supported);
+  MAP_ERR_TO_COND(WSAEPROTONOSUPPORT,        protocol_not_supported);
+  MAP_ERR_TO_COND(WSAEPROTOTYPE,             wrong_protocol_type);
+  MAP_ERR_TO_COND(WSAETIMEDOUT,              timed_out);
+  MAP_ERR_TO_COND(WSAEWOULDBLOCK,            operation_would_block);
+  default: return error_condition(ev, system_category());
+  }
+}
diff --git a/contrib/llvm/lib/Support/YAMLParser.cpp b/contrib/llvm/lib/Support/YAMLParser.cpp
new file mode 100644
index 000000000000..2cead20c0b21
--- /dev/null
+++ b/contrib/llvm/lib/Support/YAMLParser.cpp
@@ -0,0 +1,2147 @@
+//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file implements a YAML parser.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace yaml;
+
+enum UnicodeEncodingForm {
+  UEF_UTF32_LE, ///< UTF-32 Little Endian
+  UEF_UTF32_BE, ///< UTF-32 Big Endian
+  UEF_UTF16_LE, ///< UTF-16 Little Endian
+  UEF_UTF16_BE, ///< UTF-16 Big Endian
+  UEF_UTF8,     ///< UTF-8 or ascii.
+  UEF_Unknown   ///< Not a valid Unicode encoding.
+};
+
+/// EncodingInfo - Holds the encoding type and length of the byte order mark if
+///                it exists. Length is in {0, 2, 3, 4}.
+typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
+
+/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
+///                      encoding form of \a Input.
+///
+/// @param Input A string of length 0 or more.
+/// @returns An EncodingInfo indicating the Unicode encoding form of the input
+///          and how long the byte order mark is if one exists.
+static EncodingInfo getUnicodeEncoding(StringRef Input) {
+  if (Input.size() == 0)
+    return std::make_pair(UEF_Unknown, 0);
+
+  switch (uint8_t(Input[0])) {
+  case 0x00:
+    if (Input.size() >= 4) {
+      if (  Input[1] == 0
+         && uint8_t(Input[2]) == 0xFE
+         && uint8_t(Input[3]) == 0xFF)
+        return std::make_pair(UEF_UTF32_BE, 4);
+      if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
+        return std::make_pair(UEF_UTF32_BE, 0);
+    }
+
+    if (Input.size() >= 2 && Input[1] != 0)
+      return std::make_pair(UEF_UTF16_BE, 0);
+    return std::make_pair(UEF_Unknown, 0);
+  case 0xFF:
+    if (  Input.size() >= 4
+       && uint8_t(Input[1]) == 0xFE
+       && Input[2] == 0
+       && Input[3] == 0)
+      return std::make_pair(UEF_UTF32_LE, 4);
+
+    if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
+      return std::make_pair(UEF_UTF16_LE, 2);
+    return std::make_pair(UEF_Unknown, 0);
+  case 0xFE:
+    if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
+      return std::make_pair(UEF_UTF16_BE, 2);
+    return std::make_pair(UEF_Unknown, 0);
+  case 0xEF:
+    if (  Input.size() >= 3
+       && uint8_t(Input[1]) == 0xBB
+       && uint8_t(Input[2]) == 0xBF)
+      return std::make_pair(UEF_UTF8, 3);
+    return std::make_pair(UEF_Unknown, 0);
+  }
+
+  // It could still be utf-32 or utf-16.
+  if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
+    return std::make_pair(UEF_UTF32_LE, 0);
+
+  if (Input.size() >= 2 && Input[1] == 0)
+    return std::make_pair(UEF_UTF16_LE, 0);
+
+  return std::make_pair(UEF_UTF8, 0);
+}
+
+namespace llvm {
+namespace yaml {
+/// Token - A single YAML token.
+struct Token : ilist_node<Token> {
+  enum TokenKind {
+    TK_Error, // Uninitialized token.
+    TK_StreamStart,
+    TK_StreamEnd,
+    TK_VersionDirective,
+    TK_TagDirective,
+    TK_DocumentStart,
+    TK_DocumentEnd,
+    TK_BlockEntry,
+    TK_BlockEnd,
+    TK_BlockSequenceStart,
+    TK_BlockMappingStart,
+    TK_FlowEntry,
+    TK_FlowSequenceStart,
+    TK_FlowSequenceEnd,
+    TK_FlowMappingStart,
+    TK_FlowMappingEnd,
+    TK_Key,
+    TK_Value,
+    TK_Scalar,
+    TK_Alias,
+    TK_Anchor,
+    TK_Tag
+  } Kind;
+
+  /// A string of length 0 or more whose begin() points to the logical location
+  /// of the token in the input.
+  StringRef Range;
+
+  Token() : Kind(TK_Error) {}
+};
+}
+}
+
+namespace llvm {
+template<>
+struct ilist_sentinel_traits<Token> {
+  Token *createSentinel() const {
+    return &Sentinel;
+  }
+  static void destroySentinel(Token*) {}
+
+  Token *provideInitialHead() const { return createSentinel(); }
+  Token *ensureHead(Token*) const { return createSentinel(); }
+  static void noteHead(Token*, Token*) {}
+
+private:
+  mutable Token Sentinel;
+};
+
+template<>
+struct ilist_node_traits<Token> {
+  Token *createNode(const Token &V) {
+    return new (Alloc.Allocate<Token>()) Token(V);
+  }
+  static void deleteNode(Token *V) {}
+
+  void addNodeToList(Token *) {}
+  void removeNodeFromList(Token *) {}
+  void transferNodesFromList(ilist_node_traits &    /*SrcTraits*/,
+                             ilist_iterator<Token> /*first*/,
+                             ilist_iterator<Token> /*last*/) {}
+
+  BumpPtrAllocator Alloc;
+};
+}
+
+typedef ilist<Token> TokenQueueT;
+
+namespace {
+/// @brief This struct is used to track simple keys.
+///
+/// Simple keys are handled by creating an entry in SimpleKeys for each Token
+/// which could legally be the start of a simple key. When peekNext is called,
+/// if the Token To be returned is referenced by a SimpleKey, we continue
+/// tokenizing until that potential simple key has either been found to not be
+/// a simple key (we moved on to the next line or went further than 1024 chars).
+/// Or when we run into a Value, and then insert a Key token (and possibly
+/// others) before the SimpleKey's Tok.
+struct SimpleKey {
+  TokenQueueT::iterator Tok;
+  unsigned Column;
+  unsigned Line;
+  unsigned FlowLevel;
+  bool IsRequired;
+
+  bool operator ==(const SimpleKey &Other) {
+    return Tok == Other.Tok;
+  }
+};
+}
+
+/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
+///        subsequence and the subsequence's length in code units (uint8_t).
+///        A length of 0 represents an error.
+typedef std::pair<uint32_t, unsigned> UTF8Decoded;
+
+static UTF8Decoded decodeUTF8(StringRef Range) {
+  StringRef::iterator Position= Range.begin();
+  StringRef::iterator End = Range.end();
+  // 1 byte: [0x00, 0x7f]
+  // Bit pattern: 0xxxxxxx
+  if ((*Position & 0x80) == 0) {
+     return std::make_pair(*Position, 1);
+  }
+  // 2 bytes: [0x80, 0x7ff]
+  // Bit pattern: 110xxxxx 10xxxxxx
+  if (Position + 1 != End &&
+      ((*Position & 0xE0) == 0xC0) &&
+      ((*(Position + 1) & 0xC0) == 0x80)) {
+    uint32_t codepoint = ((*Position & 0x1F) << 6) |
+                          (*(Position + 1) & 0x3F);
+    if (codepoint >= 0x80)
+      return std::make_pair(codepoint, 2);
+  }
+  // 3 bytes: [0x8000, 0xffff]
+  // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
+  if (Position + 2 != End &&
+      ((*Position & 0xF0) == 0xE0) &&
+      ((*(Position + 1) & 0xC0) == 0x80) &&
+      ((*(Position + 2) & 0xC0) == 0x80)) {
+    uint32_t codepoint = ((*Position & 0x0F) << 12) |
+                         ((*(Position + 1) & 0x3F) << 6) |
+                          (*(Position + 2) & 0x3F);
+    // Codepoints between 0xD800 and 0xDFFF are invalid, as
+    // they are high / low surrogate halves used by UTF-16.
+    if (codepoint >= 0x800 &&
+        (codepoint < 0xD800 || codepoint > 0xDFFF))
+      return std::make_pair(codepoint, 3);
+  }
+  // 4 bytes: [0x10000, 0x10FFFF]
+  // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+  if (Position + 3 != End &&
+      ((*Position & 0xF8) == 0xF0) &&
+      ((*(Position + 1) & 0xC0) == 0x80) &&
+      ((*(Position + 2) & 0xC0) == 0x80) &&
+      ((*(Position + 3) & 0xC0) == 0x80)) {
+    uint32_t codepoint = ((*Position & 0x07) << 18) |
+                         ((*(Position + 1) & 0x3F) << 12) |
+                         ((*(Position + 2) & 0x3F) << 6) |
+                          (*(Position + 3) & 0x3F);
+    if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
+      return std::make_pair(codepoint, 4);
+  }
+  return std::make_pair(0, 0);
+}
+
+namespace llvm {
+namespace yaml {
+/// @brief Scans YAML tokens from a MemoryBuffer.
+class Scanner {
+public:
+  Scanner(const StringRef Input, SourceMgr &SM);
+  Scanner(MemoryBuffer *Buffer, SourceMgr &SM_);
+
+  /// @brief Parse the next token and return it without popping it.
+  Token &peekNext();
+
+  /// @brief Parse the next token and pop it from the queue.
+  Token getNext();
+
+  void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
+                  ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+    SM.PrintMessage(Loc, Kind, Message, Ranges);
+  }
+
+  void setError(const Twine &Message, StringRef::iterator Position) {
+    if (Current >= End)
+      Current = End - 1;
+
+    // Don't print out more errors after the first one we encounter. The rest
+    // are just the result of the first, and have no meaning.
+    if (!Failed)
+      printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
+    Failed = true;
+  }
+
+  void setError(const Twine &Message) {
+    setError(Message, Current);
+  }
+
+  /// @brief Returns true if an error occurred while parsing.
+  bool failed() {
+    return Failed;
+  }
+
+private:
+  StringRef currentInput() {
+    return StringRef(Current, End - Current);
+  }
+
+  /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
+  ///        at \a Position.
+  ///
+  /// If the UTF-8 code units starting at Position do not form a well-formed
+  /// code unit subsequence, then the Unicode scalar value is 0, and the length
+  /// is 0.
+  UTF8Decoded decodeUTF8(StringRef::iterator Position) {
+    return ::decodeUTF8(StringRef(Position, End - Position));
+  }
+
+  // The following functions are based on the gramar rules in the YAML spec. The
+  // style of the function names it meant to closely match how they are written
+  // in the spec. The number within the [] is the number of the grammar rule in
+  // the spec.
+  //
+  // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
+  //
+  // c-
+  //   A production starting and ending with a special character.
+  // b-
+  //   A production matching a single line break.
+  // nb-
+  //   A production starting and ending with a non-break character.
+  // s-
+  //   A production starting and ending with a white space character.
+  // ns-
+  //   A production starting and ending with a non-space character.
+  // l-
+  //   A production matching complete line(s).
+
+  /// @brief Skip a single nb-char[27] starting at Position.
+  ///
+  /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
+  ///                  | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
+  ///
+  /// @returns The code unit after the nb-char, or Position if it's not an
+  ///          nb-char.
+  StringRef::iterator skip_nb_char(StringRef::iterator Position);
+
+  /// @brief Skip a single b-break[28] starting at Position.
+  ///
+  /// A b-break is 0xD 0xA | 0xD | 0xA
+  ///
+  /// @returns The code unit after the b-break, or Position if it's not a
+  ///          b-break.
+  StringRef::iterator skip_b_break(StringRef::iterator Position);
+
+  /// @brief Skip a single s-white[33] starting at Position.
+  ///
+  /// A s-white is 0x20 | 0x9
+  ///
+  /// @returns The code unit after the s-white, or Position if it's not a
+  ///          s-white.
+  StringRef::iterator skip_s_white(StringRef::iterator Position);
+
+  /// @brief Skip a single ns-char[34] starting at Position.
+  ///
+  /// A ns-char is nb-char - s-white
+  ///
+  /// @returns The code unit after the ns-char, or Position if it's not a
+  ///          ns-char.
+  StringRef::iterator skip_ns_char(StringRef::iterator Position);
+
+  typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
+  /// @brief Skip minimal well-formed code unit subsequences until Func
+  ///        returns its input.
+  ///
+  /// @returns The code unit after the last minimal well-formed code unit
+  ///          subsequence that Func accepted.
+  StringRef::iterator skip_while( SkipWhileFunc Func
+                                , StringRef::iterator Position);
+
+  /// @brief Scan ns-uri-char[39]s starting at Cur.
+  ///
+  /// This updates Cur and Column while scanning.
+  ///
+  /// @returns A StringRef starting at Cur which covers the longest contiguous
+  ///          sequence of ns-uri-char.
+  StringRef scan_ns_uri_char();
+
+  /// @brief Scan ns-plain-one-line[133] starting at \a Cur.
+  StringRef scan_ns_plain_one_line();
+
+  /// @brief Consume a minimal well-formed code unit subsequence starting at
+  ///        \a Cur. Return false if it is not the same Unicode scalar value as
+  ///        \a Expected. This updates \a Column.
+  bool consume(uint32_t Expected);
+
+  /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
+  void skip(uint32_t Distance);
+
+  /// @brief Return true if the minimal well-formed code unit subsequence at
+  ///        Pos is whitespace or a new line
+  bool isBlankOrBreak(StringRef::iterator Position);
+
+  /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
+  void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
+                             , unsigned AtColumn
+                             , bool IsRequired);
+
+  /// @brief Remove simple keys that can no longer be valid simple keys.
+  ///
+  /// Invalid simple keys are not on the current line or are further than 1024
+  /// columns back.
+  void removeStaleSimpleKeyCandidates();
+
+  /// @brief Remove all simple keys on FlowLevel \a Level.
+  void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
+
+  /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
+  ///        tokens if needed.
+  bool unrollIndent(int ToColumn);
+
+  /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
+  ///        if needed.
+  bool rollIndent( int ToColumn
+                 , Token::TokenKind Kind
+                 , TokenQueueT::iterator InsertPoint);
+
+  /// @brief Skip whitespace and comments until the start of the next token.
+  void scanToNextToken();
+
+  /// @brief Must be the first token generated.
+  bool scanStreamStart();
+
+  /// @brief Generate tokens needed to close out the stream.
+  bool scanStreamEnd();
+
+  /// @brief Scan a %BLAH directive.
+  bool scanDirective();
+
+  /// @brief Scan a ... or ---.
+  bool scanDocumentIndicator(bool IsStart);
+
+  /// @brief Scan a [ or { and generate the proper flow collection start token.
+  bool scanFlowCollectionStart(bool IsSequence);
+
+  /// @brief Scan a ] or } and generate the proper flow collection end token.
+  bool scanFlowCollectionEnd(bool IsSequence);
+
+  /// @brief Scan the , that separates entries in a flow collection.
+  bool scanFlowEntry();
+
+  /// @brief Scan the - that starts block sequence entries.
+  bool scanBlockEntry();
+
+  /// @brief Scan an explicit ? indicating a key.
+  bool scanKey();
+
+  /// @brief Scan an explicit : indicating a value.
+  bool scanValue();
+
+  /// @brief Scan a quoted scalar.
+  bool scanFlowScalar(bool IsDoubleQuoted);
+
+  /// @brief Scan an unquoted scalar.
+  bool scanPlainScalar();
+
+  /// @brief Scan an Alias or Anchor starting with * or &.
+  bool scanAliasOrAnchor(bool IsAlias);
+
+  /// @brief Scan a block scalar starting with | or >.
+  bool scanBlockScalar(bool IsLiteral);
+
+  /// @brief Scan a tag of the form !stuff.
+  bool scanTag();
+
+  /// @brief Dispatch to the next scanning function based on \a *Cur.
+  bool fetchMoreTokens();
+
+  /// @brief The SourceMgr used for diagnostics and buffer management.
+  SourceMgr &SM;
+
+  /// @brief The original input.
+  MemoryBuffer *InputBuffer;
+
+  /// @brief The current position of the scanner.
+  StringRef::iterator Current;
+
+  /// @brief The end of the input (one past the last character).
+  StringRef::iterator End;
+
+  /// @brief Current YAML indentation level in spaces.
+  int Indent;
+
+  /// @brief Current column number in Unicode code points.
+  unsigned Column;
+
+  /// @brief Current line number.
+  unsigned Line;
+
+  /// @brief How deep we are in flow style containers. 0 Means at block level.
+  unsigned FlowLevel;
+
+  /// @brief Are we at the start of the stream?
+  bool IsStartOfStream;
+
+  /// @brief Can the next token be the start of a simple key?
+  bool IsSimpleKeyAllowed;
+
+  /// @brief True if an error has occurred.
+  bool Failed;
+
+  /// @brief Queue of tokens. This is required to queue up tokens while looking
+  ///        for the end of a simple key. And for cases where a single character
+  ///        can produce multiple tokens (e.g. BlockEnd).
+  TokenQueueT TokenQueue;
+
+  /// @brief Indentation levels.
+  SmallVector<int, 4> Indents;
+
+  /// @brief Potential simple keys.
+  SmallVector<SimpleKey, 4> SimpleKeys;
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+/// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
+static void encodeUTF8( uint32_t UnicodeScalarValue
+                      , SmallVectorImpl<char> &Result) {
+  if (UnicodeScalarValue <= 0x7F) {
+    Result.push_back(UnicodeScalarValue & 0x7F);
+  } else if (UnicodeScalarValue <= 0x7FF) {
+    uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
+    uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
+    Result.push_back(FirstByte);
+    Result.push_back(SecondByte);
+  } else if (UnicodeScalarValue <= 0xFFFF) {
+    uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
+    uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
+    uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
+    Result.push_back(FirstByte);
+    Result.push_back(SecondByte);
+    Result.push_back(ThirdByte);
+  } else if (UnicodeScalarValue <= 0x10FFFF) {
+    uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
+    uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
+    uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
+    uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
+    Result.push_back(FirstByte);
+    Result.push_back(SecondByte);
+    Result.push_back(ThirdByte);
+    Result.push_back(FourthByte);
+  }
+}
+
+bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
+  SourceMgr SM;
+  Scanner scanner(Input, SM);
+  while (true) {
+    Token T = scanner.getNext();
+    switch (T.Kind) {
+    case Token::TK_StreamStart:
+      OS << "Stream-Start: ";
+      break;
+    case Token::TK_StreamEnd:
+      OS << "Stream-End: ";
+      break;
+    case Token::TK_VersionDirective:
+      OS << "Version-Directive: ";
+      break;
+    case Token::TK_TagDirective:
+      OS << "Tag-Directive: ";
+      break;
+    case Token::TK_DocumentStart:
+      OS << "Document-Start: ";
+      break;
+    case Token::TK_DocumentEnd:
+      OS << "Document-End: ";
+      break;
+    case Token::TK_BlockEntry:
+      OS << "Block-Entry: ";
+      break;
+    case Token::TK_BlockEnd:
+      OS << "Block-End: ";
+      break;
+    case Token::TK_BlockSequenceStart:
+      OS << "Block-Sequence-Start: ";
+      break;
+    case Token::TK_BlockMappingStart:
+      OS << "Block-Mapping-Start: ";
+      break;
+    case Token::TK_FlowEntry:
+      OS << "Flow-Entry: ";
+      break;
+    case Token::TK_FlowSequenceStart:
+      OS << "Flow-Sequence-Start: ";
+      break;
+    case Token::TK_FlowSequenceEnd:
+      OS << "Flow-Sequence-End: ";
+      break;
+    case Token::TK_FlowMappingStart:
+      OS << "Flow-Mapping-Start: ";
+      break;
+    case Token::TK_FlowMappingEnd:
+      OS << "Flow-Mapping-End: ";
+      break;
+    case Token::TK_Key:
+      OS << "Key: ";
+      break;
+    case Token::TK_Value:
+      OS << "Value: ";
+      break;
+    case Token::TK_Scalar:
+      OS << "Scalar: ";
+      break;
+    case Token::TK_Alias:
+      OS << "Alias: ";
+      break;
+    case Token::TK_Anchor:
+      OS << "Anchor: ";
+      break;
+    case Token::TK_Tag:
+      OS << "Tag: ";
+      break;
+    case Token::TK_Error:
+      break;
+    }
+    OS << T.Range << "\n";
+    if (T.Kind == Token::TK_StreamEnd)
+      break;
+    else if (T.Kind == Token::TK_Error)
+      return false;
+  }
+  return true;
+}
+
+bool yaml::scanTokens(StringRef Input) {
+  llvm::SourceMgr SM;
+  llvm::yaml::Scanner scanner(Input, SM);
+  for (;;) {
+    llvm::yaml::Token T = scanner.getNext();
+    if (T.Kind == Token::TK_StreamEnd)
+      break;
+    else if (T.Kind == Token::TK_Error)
+      return false;
+  }
+  return true;
+}
+
+std::string yaml::escape(StringRef Input) {
+  std::string EscapedInput;
+  for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
+    if (*i == '\\')
+      EscapedInput += "\\\\";
+    else if (*i == '"')
+      EscapedInput += "\\\"";
+    else if (*i == 0)
+      EscapedInput += "\\0";
+    else if (*i == 0x07)
+      EscapedInput += "\\a";
+    else if (*i == 0x08)
+      EscapedInput += "\\b";
+    else if (*i == 0x09)
+      EscapedInput += "\\t";
+    else if (*i == 0x0A)
+      EscapedInput += "\\n";
+    else if (*i == 0x0B)
+      EscapedInput += "\\v";
+    else if (*i == 0x0C)
+      EscapedInput += "\\f";
+    else if (*i == 0x0D)
+      EscapedInput += "\\r";
+    else if (*i == 0x1B)
+      EscapedInput += "\\e";
+    else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
+      std::string HexStr = utohexstr(*i);
+      EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
+    } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
+      UTF8Decoded UnicodeScalarValue
+        = decodeUTF8(StringRef(i, Input.end() - i));
+      if (UnicodeScalarValue.second == 0) {
+        // Found invalid char.
+        SmallString<4> Val;
+        encodeUTF8(0xFFFD, Val);
+        EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
+        // FIXME: Error reporting.
+        return EscapedInput;
+      }
+      if (UnicodeScalarValue.first == 0x85)
+        EscapedInput += "\\N";
+      else if (UnicodeScalarValue.first == 0xA0)
+        EscapedInput += "\\_";
+      else if (UnicodeScalarValue.first == 0x2028)
+        EscapedInput += "\\L";
+      else if (UnicodeScalarValue.first == 0x2029)
+        EscapedInput += "\\P";
+      else {
+        std::string HexStr = utohexstr(UnicodeScalarValue.first);
+        if (HexStr.size() <= 2)
+          EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
+        else if (HexStr.size() <= 4)
+          EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
+        else if (HexStr.size() <= 8)
+          EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
+      }
+      i += UnicodeScalarValue.second - 1;
+    } else
+      EscapedInput.push_back(*i);
+  }
+  return EscapedInput;
+}
+
+Scanner::Scanner(StringRef Input, SourceMgr &sm)
+  : SM(sm)
+  , Indent(-1)
+  , Column(0)
+  , Line(0)
+  , FlowLevel(0)
+  , IsStartOfStream(true)
+  , IsSimpleKeyAllowed(true)
+  , Failed(false) {
+  InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML");
+  SM.AddNewSourceBuffer(InputBuffer, SMLoc());
+  Current = InputBuffer->getBufferStart();
+  End = InputBuffer->getBufferEnd();
+}
+
+Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_)
+  : SM(SM_)
+  , InputBuffer(Buffer)
+  , Current(InputBuffer->getBufferStart())
+  , End(InputBuffer->getBufferEnd())
+  , Indent(-1)
+  , Column(0)
+  , Line(0)
+  , FlowLevel(0)
+  , IsStartOfStream(true)
+  , IsSimpleKeyAllowed(true)
+  , Failed(false) {
+    SM.AddNewSourceBuffer(InputBuffer, SMLoc());
+}
+
+Token &Scanner::peekNext() {
+  // If the current token is a possible simple key, keep parsing until we
+  // can confirm.
+  bool NeedMore = false;
+  while (true) {
+    if (TokenQueue.empty() || NeedMore) {
+      if (!fetchMoreTokens()) {
+        TokenQueue.clear();
+        TokenQueue.push_back(Token());
+        return TokenQueue.front();
+      }
+    }
+    assert(!TokenQueue.empty() &&
+            "fetchMoreTokens lied about getting tokens!");
+
+    removeStaleSimpleKeyCandidates();
+    SimpleKey SK;
+    SK.Tok = TokenQueue.front();
+    if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK)
+        == SimpleKeys.end())
+      break;
+    else
+      NeedMore = true;
+  }
+  return TokenQueue.front();
+}
+
+Token Scanner::getNext() {
+  Token Ret = peekNext();
+  // TokenQueue can be empty if there was an error getting the next token.
+  if (!TokenQueue.empty())
+    TokenQueue.pop_front();
+
+  // There cannot be any referenced Token's if the TokenQueue is empty. So do a
+  // quick deallocation of them all.
+  if (TokenQueue.empty()) {
+    TokenQueue.Alloc.Reset();
+  }
+
+  return Ret;
+}
+
+StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
+  if (Position == End)
+    return Position;
+  // Check 7 bit c-printable - b-char.
+  if (   *Position == 0x09
+      || (*Position >= 0x20 && *Position <= 0x7E))
+    return Position + 1;
+
+  // Check for valid UTF-8.
+  if (uint8_t(*Position) & 0x80) {
+    UTF8Decoded u8d = decodeUTF8(Position);
+    if (   u8d.second != 0
+        && u8d.first != 0xFEFF
+        && ( u8d.first == 0x85
+          || ( u8d.first >= 0xA0
+            && u8d.first <= 0xD7FF)
+          || ( u8d.first >= 0xE000
+            && u8d.first <= 0xFFFD)
+          || ( u8d.first >= 0x10000
+            && u8d.first <= 0x10FFFF)))
+      return Position + u8d.second;
+  }
+  return Position;
+}
+
+StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
+  if (Position == End)
+    return Position;
+  if (*Position == 0x0D) {
+    if (Position + 1 != End && *(Position + 1) == 0x0A)
+      return Position + 2;
+    return Position + 1;
+  }
+
+  if (*Position == 0x0A)
+    return Position + 1;
+  return Position;
+}
+
+
+StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
+  if (Position == End)
+    return Position;
+  if (*Position == ' ' || *Position == '\t')
+    return Position + 1;
+  return Position;
+}
+
+StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
+  if (Position == End)
+    return Position;
+  if (*Position == ' ' || *Position == '\t')
+    return Position;
+  return skip_nb_char(Position);
+}
+
+StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
+                                       , StringRef::iterator Position) {
+  while (true) {
+    StringRef::iterator i = (this->*Func)(Position);
+    if (i == Position)
+      break;
+    Position = i;
+  }
+  return Position;
+}
+
+static bool is_ns_hex_digit(const char C) {
+  return    (C >= '0' && C <= '9')
+         || (C >= 'a' && C <= 'z')
+         || (C >= 'A' && C <= 'Z');
+}
+
+static bool is_ns_word_char(const char C) {
+  return    C == '-'
+         || (C >= 'a' && C <= 'z')
+         || (C >= 'A' && C <= 'Z');
+}
+
+StringRef Scanner::scan_ns_uri_char() {
+  StringRef::iterator Start = Current;
+  while (true) {
+    if (Current == End)
+      break;
+    if ((   *Current == '%'
+          && Current + 2 < End
+          && is_ns_hex_digit(*(Current + 1))
+          && is_ns_hex_digit(*(Current + 2)))
+        || is_ns_word_char(*Current)
+        || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
+          != StringRef::npos) {
+      ++Current;
+      ++Column;
+    } else
+      break;
+  }
+  return StringRef(Start, Current - Start);
+}
+
+StringRef Scanner::scan_ns_plain_one_line() {
+  StringRef::iterator start = Current;
+  // The first character must already be verified.
+  ++Current;
+  while (true) {
+    if (Current == End) {
+      break;
+    } else if (*Current == ':') {
+      // Check if the next character is a ns-char.
+      if (Current + 1 == End)
+        break;
+      StringRef::iterator i = skip_ns_char(Current + 1);
+      if (Current + 1 != i) {
+        Current = i;
+        Column += 2; // Consume both the ':' and ns-char.
+      } else
+        break;
+    } else if (*Current == '#') {
+      // Check if the previous character was a ns-char.
+      // The & 0x80 check is to check for the trailing byte of a utf-8
+      if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) {
+        ++Current;
+        ++Column;
+      } else
+        break;
+    } else {
+      StringRef::iterator i = skip_nb_char(Current);
+      if (i == Current)
+        break;
+      Current = i;
+      ++Column;
+    }
+  }
+  return StringRef(start, Current - start);
+}
+
+bool Scanner::consume(uint32_t Expected) {
+  if (Expected >= 0x80)
+    report_fatal_error("Not dealing with this yet");
+  if (Current == End)
+    return false;
+  if (uint8_t(*Current) >= 0x80)
+    report_fatal_error("Not dealing with this yet");
+  if (uint8_t(*Current) == Expected) {
+    ++Current;
+    ++Column;
+    return true;
+  }
+  return false;
+}
+
+void Scanner::skip(uint32_t Distance) {
+  Current += Distance;
+  Column += Distance;
+  assert(Current <= End && "Skipped past the end");
+}
+
+bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
+  if (Position == End)
+    return false;
+  if (   *Position == ' ' || *Position == '\t'
+      || *Position == '\r' || *Position == '\n')
+    return true;
+  return false;
+}
+
+void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
+                                    , unsigned AtColumn
+                                    , bool IsRequired) {
+  if (IsSimpleKeyAllowed) {
+    SimpleKey SK;
+    SK.Tok = Tok;
+    SK.Line = Line;
+    SK.Column = AtColumn;
+    SK.IsRequired = IsRequired;
+    SK.FlowLevel = FlowLevel;
+    SimpleKeys.push_back(SK);
+  }
+}
+
+void Scanner::removeStaleSimpleKeyCandidates() {
+  for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
+                                            i != SimpleKeys.end();) {
+    if (i->Line != Line || i->Column + 1024 < Column) {
+      if (i->IsRequired)
+        setError( "Could not find expected : for simple key"
+                , i->Tok->Range.begin());
+      i = SimpleKeys.erase(i);
+    } else
+      ++i;
+  }
+}
+
+void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
+  if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
+    SimpleKeys.pop_back();
+}
+
+bool Scanner::unrollIndent(int ToColumn) {
+  Token T;
+  // Indentation is ignored in flow.
+  if (FlowLevel != 0)
+    return true;
+
+  while (Indent > ToColumn) {
+    T.Kind = Token::TK_BlockEnd;
+    T.Range = StringRef(Current, 1);
+    TokenQueue.push_back(T);
+    Indent = Indents.pop_back_val();
+  }
+
+  return true;
+}
+
+bool Scanner::rollIndent( int ToColumn
+                        , Token::TokenKind Kind
+                        , TokenQueueT::iterator InsertPoint) {
+  if (FlowLevel)
+    return true;
+  if (Indent < ToColumn) {
+    Indents.push_back(Indent);
+    Indent = ToColumn;
+
+    Token T;
+    T.Kind = Kind;
+    T.Range = StringRef(Current, 0);
+    TokenQueue.insert(InsertPoint, T);
+  }
+  return true;
+}
+
+void Scanner::scanToNextToken() {
+  while (true) {
+    while (*Current == ' ' || *Current == '\t') {
+      skip(1);
+    }
+
+    // Skip comment.
+    if (*Current == '#') {
+      while (true) {
+        // This may skip more than one byte, thus Column is only incremented
+        // for code points.
+        StringRef::iterator i = skip_nb_char(Current);
+        if (i == Current)
+          break;
+        Current = i;
+        ++Column;
+      }
+    }
+
+    // Skip EOL.
+    StringRef::iterator i = skip_b_break(Current);
+    if (i == Current)
+      break;
+    Current = i;
+    ++Line;
+    Column = 0;
+    // New lines may start a simple key.
+    if (!FlowLevel)
+      IsSimpleKeyAllowed = true;
+  }
+}
+
+bool Scanner::scanStreamStart() {
+  IsStartOfStream = false;
+
+  EncodingInfo EI = getUnicodeEncoding(currentInput());
+
+  Token T;
+  T.Kind = Token::TK_StreamStart;
+  T.Range = StringRef(Current, EI.second);
+  TokenQueue.push_back(T);
+  Current += EI.second;
+  return true;
+}
+
+bool Scanner::scanStreamEnd() {
+  // Force an ending new line if one isn't present.
+  if (Column != 0) {
+    Column = 0;
+    ++Line;
+  }
+
+  unrollIndent(-1);
+  SimpleKeys.clear();
+  IsSimpleKeyAllowed = false;
+
+  Token T;
+  T.Kind = Token::TK_StreamEnd;
+  T.Range = StringRef(Current, 0);
+  TokenQueue.push_back(T);
+  return true;
+}
+
+bool Scanner::scanDirective() {
+  // Reset the indentation level.
+  unrollIndent(-1);
+  SimpleKeys.clear();
+  IsSimpleKeyAllowed = false;
+
+  StringRef::iterator Start = Current;
+  consume('%');
+  StringRef::iterator NameStart = Current;
+  Current = skip_while(&Scanner::skip_ns_char, Current);
+  StringRef Name(NameStart, Current - NameStart);
+  Current = skip_while(&Scanner::skip_s_white, Current);
+
+  if (Name == "YAML") {
+    Current = skip_while(&Scanner::skip_ns_char, Current);
+    Token T;
+    T.Kind = Token::TK_VersionDirective;
+    T.Range = StringRef(Start, Current - Start);
+    TokenQueue.push_back(T);
+    return true;
+  }
+  return false;
+}
+
+bool Scanner::scanDocumentIndicator(bool IsStart) {
+  unrollIndent(-1);
+  SimpleKeys.clear();
+  IsSimpleKeyAllowed = false;
+
+  Token T;
+  T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
+  T.Range = StringRef(Current, 3);
+  skip(3);
+  TokenQueue.push_back(T);
+  return true;
+}
+
+bool Scanner::scanFlowCollectionStart(bool IsSequence) {
+  Token T;
+  T.Kind = IsSequence ? Token::TK_FlowSequenceStart
+                      : Token::TK_FlowMappingStart;
+  T.Range = StringRef(Current, 1);
+  skip(1);
+  TokenQueue.push_back(T);
+
+  // [ and { may begin a simple key.
+  saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false);
+
+  // And may also be followed by a simple key.
+  IsSimpleKeyAllowed = true;
+  ++FlowLevel;
+  return true;
+}
+
+bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
+  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
+  IsSimpleKeyAllowed = false;
+  Token T;
+  T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
+                      : Token::TK_FlowMappingEnd;
+  T.Range = StringRef(Current, 1);
+  skip(1);
+  TokenQueue.push_back(T);
+  if (FlowLevel)
+    --FlowLevel;
+  return true;
+}
+
+bool Scanner::scanFlowEntry() {
+  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
+  IsSimpleKeyAllowed = true;
+  Token T;
+  T.Kind = Token::TK_FlowEntry;
+  T.Range = StringRef(Current, 1);
+  skip(1);
+  TokenQueue.push_back(T);
+  return true;
+}
+
+bool Scanner::scanBlockEntry() {
+  rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
+  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
+  IsSimpleKeyAllowed = true;
+  Token T;
+  T.Kind = Token::TK_BlockEntry;
+  T.Range = StringRef(Current, 1);
+  skip(1);
+  TokenQueue.push_back(T);
+  return true;
+}
+
+bool Scanner::scanKey() {
+  if (!FlowLevel)
+    rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
+
+  removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
+  IsSimpleKeyAllowed = !FlowLevel;
+
+  Token T;
+  T.Kind = Token::TK_Key;
+  T.Range = StringRef(Current, 1);
+  skip(1);
+  TokenQueue.push_back(T);
+  return true;
+}
+
+bool Scanner::scanValue() {
+  // If the previous token could have been a simple key, insert the key token
+  // into the token queue.
+  if (!SimpleKeys.empty()) {
+    SimpleKey SK = SimpleKeys.pop_back_val();
+    Token T;
+    T.Kind = Token::TK_Key;
+    T.Range = SK.Tok->Range;
+    TokenQueueT::iterator i, e;
+    for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
+      if (i == SK.Tok)
+        break;
+    }
+    assert(i != e && "SimpleKey not in token queue!");
+    i = TokenQueue.insert(i, T);
+
+    // We may also need to add a Block-Mapping-Start token.
+    rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
+
+    IsSimpleKeyAllowed = false;
+  } else {
+    if (!FlowLevel)
+      rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
+    IsSimpleKeyAllowed = !FlowLevel;
+  }
+
+  Token T;
+  T.Kind = Token::TK_Value;
+  T.Range = StringRef(Current, 1);
+  skip(1);
+  TokenQueue.push_back(T);
+  return true;
+}
+
+// Forbidding inlining improves performance by roughly 20%.
+// FIXME: Remove once llvm optimizes this to the faster version without hints.
+LLVM_ATTRIBUTE_NOINLINE static bool
+wasEscaped(StringRef::iterator First, StringRef::iterator Position);
+
+// Returns whether a character at 'Position' was escaped with a leading '\'.
+// 'First' specifies the position of the first character in the string.
+static bool wasEscaped(StringRef::iterator First,
+                       StringRef::iterator Position) {
+  assert(Position - 1 >= First);
+  StringRef::iterator I = Position - 1;
+  // We calculate the number of consecutive '\'s before the current position
+  // by iterating backwards through our string.
+  while (I >= First && *I == '\\') --I;
+  // (Position - 1 - I) now contains the number of '\'s before the current
+  // position. If it is odd, the character at 'Position' was escaped.
+  return (Position - 1 - I) % 2 == 1;
+}
+
+bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
+  StringRef::iterator Start = Current;
+  unsigned ColStart = Column;
+  if (IsDoubleQuoted) {
+    do {
+      ++Current;
+      while (Current != End && *Current != '"')
+        ++Current;
+      // Repeat until the previous character was not a '\' or was an escaped
+      // backslash.
+    } while (   Current != End
+             && *(Current - 1) == '\\'
+             && wasEscaped(Start + 1, Current));
+  } else {
+    skip(1);
+    while (true) {
+      // Skip a ' followed by another '.
+      if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
+        skip(2);
+        continue;
+      } else if (*Current == '\'')
+        break;
+      StringRef::iterator i = skip_nb_char(Current);
+      if (i == Current) {
+        i = skip_b_break(Current);
+        if (i == Current)
+          break;
+        Current = i;
+        Column = 0;
+        ++Line;
+      } else {
+        if (i == End)
+          break;
+        Current = i;
+        ++Column;
+      }
+    }
+  }
+
+  if (Current == End) {
+    setError("Expected quote at end of scalar", Current);
+    return false;
+  }
+
+  skip(1); // Skip ending quote.
+  Token T;
+  T.Kind = Token::TK_Scalar;
+  T.Range = StringRef(Start, Current - Start);
+  TokenQueue.push_back(T);
+
+  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
+
+  IsSimpleKeyAllowed = false;
+
+  return true;
+}
+
+bool Scanner::scanPlainScalar() {
+  StringRef::iterator Start = Current;
+  unsigned ColStart = Column;
+  unsigned LeadingBlanks = 0;
+  assert(Indent >= -1 && "Indent must be >= -1 !");
+  unsigned indent = static_cast<unsigned>(Indent + 1);
+  while (true) {
+    if (*Current == '#')
+      break;
+
+    while (!isBlankOrBreak(Current)) {
+      if (  FlowLevel && *Current == ':'
+          && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
+        setError("Found unexpected ':' while scanning a plain scalar", Current);
+        return false;
+      }
+
+      // Check for the end of the plain scalar.
+      if (  (*Current == ':' && isBlankOrBreak(Current + 1))
+          || (  FlowLevel
+          && (StringRef(Current, 1).find_first_of(",:?[]{}")
+              != StringRef::npos)))
+        break;
+
+      StringRef::iterator i = skip_nb_char(Current);
+      if (i == Current)
+        break;
+      Current = i;
+      ++Column;
+    }
+
+    // Are we at the end?
+    if (!isBlankOrBreak(Current))
+      break;
+
+    // Eat blanks.
+    StringRef::iterator Tmp = Current;
+    while (isBlankOrBreak(Tmp)) {
+      StringRef::iterator i = skip_s_white(Tmp);
+      if (i != Tmp) {
+        if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
+          setError("Found invalid tab character in indentation", Tmp);
+          return false;
+        }
+        Tmp = i;
+        ++Column;
+      } else {
+        i = skip_b_break(Tmp);
+        if (!LeadingBlanks)
+          LeadingBlanks = 1;
+        Tmp = i;
+        Column = 0;
+        ++Line;
+      }
+    }
+
+    if (!FlowLevel && Column < indent)
+      break;
+
+    Current = Tmp;
+  }
+  if (Start == Current) {
+    setError("Got empty plain scalar", Start);
+    return false;
+  }
+  Token T;
+  T.Kind = Token::TK_Scalar;
+  T.Range = StringRef(Start, Current - Start);
+  TokenQueue.push_back(T);
+
+  // Plain scalars can be simple keys.
+  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
+
+  IsSimpleKeyAllowed = false;
+
+  return true;
+}
+
+bool Scanner::scanAliasOrAnchor(bool IsAlias) {
+  StringRef::iterator Start = Current;
+  unsigned ColStart = Column;
+  skip(1);
+  while(true) {
+    if (   *Current == '[' || *Current == ']'
+        || *Current == '{' || *Current == '}'
+        || *Current == ','
+        || *Current == ':')
+      break;
+    StringRef::iterator i = skip_ns_char(Current);
+    if (i == Current)
+      break;
+    Current = i;
+    ++Column;
+  }
+
+  if (Start == Current) {
+    setError("Got empty alias or anchor", Start);
+    return false;
+  }
+
+  Token T;
+  T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
+  T.Range = StringRef(Start, Current - Start);
+  TokenQueue.push_back(T);
+
+  // Alias and anchors can be simple keys.
+  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
+
+  IsSimpleKeyAllowed = false;
+
+  return true;
+}
+
+bool Scanner::scanBlockScalar(bool IsLiteral) {
+  StringRef::iterator Start = Current;
+  skip(1); // Eat | or >
+  while(true) {
+    StringRef::iterator i = skip_nb_char(Current);
+    if (i == Current) {
+      if (Column == 0)
+        break;
+      i = skip_b_break(Current);
+      if (i != Current) {
+        // We got a line break.
+        Column = 0;
+        ++Line;
+        Current = i;
+        continue;
+      } else {
+        // There was an error, which should already have been printed out.
+        return false;
+      }
+    }
+    Current = i;
+    ++Column;
+  }
+
+  if (Start == Current) {
+    setError("Got empty block scalar", Start);
+    return false;
+  }
+
+  Token T;
+  T.Kind = Token::TK_Scalar;
+  T.Range = StringRef(Start, Current - Start);
+  TokenQueue.push_back(T);
+  return true;
+}
+
+bool Scanner::scanTag() {
+  StringRef::iterator Start = Current;
+  unsigned ColStart = Column;
+  skip(1); // Eat !.
+  if (Current == End || isBlankOrBreak(Current)); // An empty tag.
+  else if (*Current == '<') {
+    skip(1);
+    scan_ns_uri_char();
+    if (!consume('>'))
+      return false;
+  } else {
+    // FIXME: Actually parse the c-ns-shorthand-tag rule.
+    Current = skip_while(&Scanner::skip_ns_char, Current);
+  }
+
+  Token T;
+  T.Kind = Token::TK_Tag;
+  T.Range = StringRef(Start, Current - Start);
+  TokenQueue.push_back(T);
+
+  // Tags can be simple keys.
+  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);
+
+  IsSimpleKeyAllowed = false;
+
+  return true;
+}
+
+bool Scanner::fetchMoreTokens() {
+  if (IsStartOfStream)
+    return scanStreamStart();
+
+  scanToNextToken();
+
+  if (Current == End)
+    return scanStreamEnd();
+
+  removeStaleSimpleKeyCandidates();
+
+  unrollIndent(Column);
+
+  if (Column == 0 && *Current == '%')
+    return scanDirective();
+
+  if (Column == 0 && Current + 4 <= End
+      && *Current == '-'
+      && *(Current + 1) == '-'
+      && *(Current + 2) == '-'
+      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
+    return scanDocumentIndicator(true);
+
+  if (Column == 0 && Current + 4 <= End
+      && *Current == '.'
+      && *(Current + 1) == '.'
+      && *(Current + 2) == '.'
+      && (Current + 3 == End || isBlankOrBreak(Current + 3)))
+    return scanDocumentIndicator(false);
+
+  if (*Current == '[')
+    return scanFlowCollectionStart(true);
+
+  if (*Current == '{')
+    return scanFlowCollectionStart(false);
+
+  if (*Current == ']')
+    return scanFlowCollectionEnd(true);
+
+  if (*Current == '}')
+    return scanFlowCollectionEnd(false);
+
+  if (*Current == ',')
+    return scanFlowEntry();
+
+  if (*Current == '-' && isBlankOrBreak(Current + 1))
+    return scanBlockEntry();
+
+  if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
+    return scanKey();
+
+  if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
+    return scanValue();
+
+  if (*Current == '*')
+    return scanAliasOrAnchor(true);
+
+  if (*Current == '&')
+    return scanAliasOrAnchor(false);
+
+  if (*Current == '!')
+    return scanTag();
+
+  if (*Current == '|' && !FlowLevel)
+    return scanBlockScalar(true);
+
+  if (*Current == '>' && !FlowLevel)
+    return scanBlockScalar(false);
+
+  if (*Current == '\'')
+    return scanFlowScalar(false);
+
+  if (*Current == '"')
+    return scanFlowScalar(true);
+
+  // Get a plain scalar.
+  StringRef FirstChar(Current, 1);
+  if (!(isBlankOrBreak(Current)
+        || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
+      || (*Current == '-' && !isBlankOrBreak(Current + 1))
+      || (!FlowLevel && (*Current == '?' || *Current == ':')
+          && isBlankOrBreak(Current + 1))
+      || (!FlowLevel && *Current == ':'
+                      && Current + 2 < End
+                      && *(Current + 1) == ':'
+                      && !isBlankOrBreak(Current + 2)))
+    return scanPlainScalar();
+
+  setError("Unrecognized character while tokenizing.");
+  return false;
+}
+
+Stream::Stream(StringRef Input, SourceMgr &SM)
+  : scanner(new Scanner(Input, SM))
+  , CurrentDoc(0) {}
+
+Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM)
+  : scanner(new Scanner(InputBuffer, SM))
+  , CurrentDoc(0) {}
+
+Stream::~Stream() {}
+
+bool Stream::failed() { return scanner->failed(); }
+
+void Stream::printError(Node *N, const Twine &Msg) {
+  SmallVector<SMRange, 1> Ranges;
+  Ranges.push_back(N->getSourceRange());
+  scanner->printError( N->getSourceRange().Start
+                     , SourceMgr::DK_Error
+                     , Msg
+                     , Ranges);
+}
+
+void Stream::handleYAMLDirective(const Token &t) {
+  // TODO: Ensure version is 1.x.
+}
+
+document_iterator Stream::begin() {
+  if (CurrentDoc)
+    report_fatal_error("Can only iterate over the stream once");
+
+  // Skip Stream-Start.
+  scanner->getNext();
+
+  CurrentDoc.reset(new Document(*this));
+  return document_iterator(CurrentDoc);
+}
+
+document_iterator Stream::end() {
+  return document_iterator();
+}
+
+void Stream::skip() {
+  for (document_iterator i = begin(), e = end(); i != e; ++i)
+    i->skip();
+}
+
+Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A)
+  : Doc(D)
+  , TypeID(Type)
+  , Anchor(A) {
+  SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
+  SourceRange = SMRange(Start, Start);
+}
+
+Token &Node::peekNext() {
+  return Doc->peekNext();
+}
+
+Token Node::getNext() {
+  return Doc->getNext();
+}
+
+Node *Node::parseBlockNode() {
+  return Doc->parseBlockNode();
+}
+
+BumpPtrAllocator &Node::getAllocator() {
+  return Doc->NodeAllocator;
+}
+
+void Node::setError(const Twine &Msg, Token &Tok) const {
+  Doc->setError(Msg, Tok);
+}
+
+bool Node::failed() const {
+  return Doc->failed();
+}
+
+
+
+StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
+  // TODO: Handle newlines properly. We need to remove leading whitespace.
+  if (Value[0] == '"') { // Double quoted.
+    // Pull off the leading and trailing "s.
+    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
+    // Search for characters that would require unescaping the value.
+    StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
+    if (i != StringRef::npos)
+      return unescapeDoubleQuoted(UnquotedValue, i, Storage);
+    return UnquotedValue;
+  } else if (Value[0] == '\'') { // Single quoted.
+    // Pull off the leading and trailing 's.
+    StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
+    StringRef::size_type i = UnquotedValue.find('\'');
+    if (i != StringRef::npos) {
+      // We're going to need Storage.
+      Storage.clear();
+      Storage.reserve(UnquotedValue.size());
+      for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
+        StringRef Valid(UnquotedValue.begin(), i);
+        Storage.insert(Storage.end(), Valid.begin(), Valid.end());
+        Storage.push_back('\'');
+        UnquotedValue = UnquotedValue.substr(i + 2);
+      }
+      Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
+      return StringRef(Storage.begin(), Storage.size());
+    }
+    return UnquotedValue;
+  }
+  // Plain or block.
+  return Value.rtrim(" ");
+}
+
+StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
+                                          , StringRef::size_type i
+                                          , SmallVectorImpl<char> &Storage)
+                                          const {
+  // Use Storage to build proper value.
+  Storage.clear();
+  Storage.reserve(UnquotedValue.size());
+  for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
+    // Insert all previous chars into Storage.
+    StringRef Valid(UnquotedValue.begin(), i);
+    Storage.insert(Storage.end(), Valid.begin(), Valid.end());
+    // Chop off inserted chars.
+    UnquotedValue = UnquotedValue.substr(i);
+
+    assert(!UnquotedValue.empty() && "Can't be empty!");
+
+    // Parse escape or line break.
+    switch (UnquotedValue[0]) {
+    case '\r':
+    case '\n':
+      Storage.push_back('\n');
+      if (   UnquotedValue.size() > 1
+          && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
+        UnquotedValue = UnquotedValue.substr(1);
+      UnquotedValue = UnquotedValue.substr(1);
+      break;
+    default:
+      if (UnquotedValue.size() == 1)
+        // TODO: Report error.
+        break;
+      UnquotedValue = UnquotedValue.substr(1);
+      switch (UnquotedValue[0]) {
+      default: {
+          Token T;
+          T.Range = StringRef(UnquotedValue.begin(), 1);
+          setError("Unrecognized escape code!", T);
+          return "";
+        }
+      case '\r':
+      case '\n':
+        // Remove the new line.
+        if (   UnquotedValue.size() > 1
+            && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
+          UnquotedValue = UnquotedValue.substr(1);
+        // If this was just a single byte newline, it will get skipped
+        // below.
+        break;
+      case '0':
+        Storage.push_back(0x00);
+        break;
+      case 'a':
+        Storage.push_back(0x07);
+        break;
+      case 'b':
+        Storage.push_back(0x08);
+        break;
+      case 't':
+      case 0x09:
+        Storage.push_back(0x09);
+        break;
+      case 'n':
+        Storage.push_back(0x0A);
+        break;
+      case 'v':
+        Storage.push_back(0x0B);
+        break;
+      case 'f':
+        Storage.push_back(0x0C);
+        break;
+      case 'r':
+        Storage.push_back(0x0D);
+        break;
+      case 'e':
+        Storage.push_back(0x1B);
+        break;
+      case ' ':
+        Storage.push_back(0x20);
+        break;
+      case '"':
+        Storage.push_back(0x22);
+        break;
+      case '/':
+        Storage.push_back(0x2F);
+        break;
+      case '\\':
+        Storage.push_back(0x5C);
+        break;
+      case 'N':
+        encodeUTF8(0x85, Storage);
+        break;
+      case '_':
+        encodeUTF8(0xA0, Storage);
+        break;
+      case 'L':
+        encodeUTF8(0x2028, Storage);
+        break;
+      case 'P':
+        encodeUTF8(0x2029, Storage);
+        break;
+      case 'x': {
+          if (UnquotedValue.size() < 3)
+            // TODO: Report error.
+            break;
+          unsigned int UnicodeScalarValue;
+          if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
+            // TODO: Report error.
+            UnicodeScalarValue = 0xFFFD;
+          encodeUTF8(UnicodeScalarValue, Storage);
+          UnquotedValue = UnquotedValue.substr(2);
+          break;
+        }
+      case 'u': {
+          if (UnquotedValue.size() < 5)
+            // TODO: Report error.
+            break;
+          unsigned int UnicodeScalarValue;
+          if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
+            // TODO: Report error.
+            UnicodeScalarValue = 0xFFFD;
+          encodeUTF8(UnicodeScalarValue, Storage);
+          UnquotedValue = UnquotedValue.substr(4);
+          break;
+        }
+      case 'U': {
+          if (UnquotedValue.size() < 9)
+            // TODO: Report error.
+            break;
+          unsigned int UnicodeScalarValue;
+          if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
+            // TODO: Report error.
+            UnicodeScalarValue = 0xFFFD;
+          encodeUTF8(UnicodeScalarValue, Storage);
+          UnquotedValue = UnquotedValue.substr(8);
+          break;
+        }
+      }
+      UnquotedValue = UnquotedValue.substr(1);
+    }
+  }
+  Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
+  return StringRef(Storage.begin(), Storage.size());
+}
+
+Node *KeyValueNode::getKey() {
+  if (Key)
+    return Key;
+  // Handle implicit null keys.
+  {
+    Token &t = peekNext();
+    if (   t.Kind == Token::TK_BlockEnd
+        || t.Kind == Token::TK_Value
+        || t.Kind == Token::TK_Error) {
+      return Key = new (getAllocator()) NullNode(Doc);
+    }
+    if (t.Kind == Token::TK_Key)
+      getNext(); // skip TK_Key.
+  }
+
+  // Handle explicit null keys.
+  Token &t = peekNext();
+  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
+    return Key = new (getAllocator()) NullNode(Doc);
+  }
+
+  // We've got a normal key.
+  return Key = parseBlockNode();
+}
+
+Node *KeyValueNode::getValue() {
+  if (Value)
+    return Value;
+  getKey()->skip();
+  if (failed())
+    return Value = new (getAllocator()) NullNode(Doc);
+
+  // Handle implicit null values.
+  {
+    Token &t = peekNext();
+    if (   t.Kind == Token::TK_BlockEnd
+        || t.Kind == Token::TK_FlowMappingEnd
+        || t.Kind == Token::TK_Key
+        || t.Kind == Token::TK_FlowEntry
+        || t.Kind == Token::TK_Error) {
+      return Value = new (getAllocator()) NullNode(Doc);
+    }
+
+    if (t.Kind != Token::TK_Value) {
+      setError("Unexpected token in Key Value.", t);
+      return Value = new (getAllocator()) NullNode(Doc);
+    }
+    getNext(); // skip TK_Value.
+  }
+
+  // Handle explicit null values.
+  Token &t = peekNext();
+  if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
+    return Value = new (getAllocator()) NullNode(Doc);
+  }
+
+  // We got a normal value.
+  return Value = parseBlockNode();
+}
+
+void MappingNode::increment() {
+  if (failed()) {
+    IsAtEnd = true;
+    CurrentEntry = 0;
+    return;
+  }
+  if (CurrentEntry) {
+    CurrentEntry->skip();
+    if (Type == MT_Inline) {
+      IsAtEnd = true;
+      CurrentEntry = 0;
+      return;
+    }
+  }
+  Token T = peekNext();
+  if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
+    // KeyValueNode eats the TK_Key. That way it can detect null keys.
+    CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
+  } else if (Type == MT_Block) {
+    switch (T.Kind) {
+    case Token::TK_BlockEnd:
+      getNext();
+      IsAtEnd = true;
+      CurrentEntry = 0;
+      break;
+    default:
+      setError("Unexpected token. Expected Key or Block End", T);
+    case Token::TK_Error:
+      IsAtEnd = true;
+      CurrentEntry = 0;
+    }
+  } else {
+    switch (T.Kind) {
+    case Token::TK_FlowEntry:
+      // Eat the flow entry and recurse.
+      getNext();
+      return increment();
+    case Token::TK_FlowMappingEnd:
+      getNext();
+    case Token::TK_Error:
+      // Set this to end iterator.
+      IsAtEnd = true;
+      CurrentEntry = 0;
+      break;
+    default:
+      setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
+                "Mapping End."
+              , T);
+      IsAtEnd = true;
+      CurrentEntry = 0;
+    }
+  }
+}
+
+void SequenceNode::increment() {
+  if (failed()) {
+    IsAtEnd = true;
+    CurrentEntry = 0;
+    return;
+  }
+  if (CurrentEntry)
+    CurrentEntry->skip();
+  Token T = peekNext();
+  if (SeqType == ST_Block) {
+    switch (T.Kind) {
+    case Token::TK_BlockEntry:
+      getNext();
+      CurrentEntry = parseBlockNode();
+      if (CurrentEntry == 0) { // An error occurred.
+        IsAtEnd = true;
+        CurrentEntry = 0;
+      }
+      break;
+    case Token::TK_BlockEnd:
+      getNext();
+      IsAtEnd = true;
+      CurrentEntry = 0;
+      break;
+    default:
+      setError( "Unexpected token. Expected Block Entry or Block End."
+              , T);
+    case Token::TK_Error:
+      IsAtEnd = true;
+      CurrentEntry = 0;
+    }
+  } else if (SeqType == ST_Indentless) {
+    switch (T.Kind) {
+    case Token::TK_BlockEntry:
+      getNext();
+      CurrentEntry = parseBlockNode();
+      if (CurrentEntry == 0) { // An error occurred.
+        IsAtEnd = true;
+        CurrentEntry = 0;
+      }
+      break;
+    default:
+    case Token::TK_Error:
+      IsAtEnd = true;
+      CurrentEntry = 0;
+    }
+  } else if (SeqType == ST_Flow) {
+    switch (T.Kind) {
+    case Token::TK_FlowEntry:
+      // Eat the flow entry and recurse.
+      getNext();
+      WasPreviousTokenFlowEntry = true;
+      return increment();
+    case Token::TK_FlowSequenceEnd:
+      getNext();
+    case Token::TK_Error:
+      // Set this to end iterator.
+      IsAtEnd = true;
+      CurrentEntry = 0;
+      break;
+    case Token::TK_StreamEnd:
+    case Token::TK_DocumentEnd:
+    case Token::TK_DocumentStart:
+      setError("Could not find closing ]!", T);
+      // Set this to end iterator.
+      IsAtEnd = true;
+      CurrentEntry = 0;
+      break;
+    default:
+      if (!WasPreviousTokenFlowEntry) {
+        setError("Expected , between entries!", T);
+        IsAtEnd = true;
+        CurrentEntry = 0;
+        break;
+      }
+      // Otherwise it must be a flow entry.
+      CurrentEntry = parseBlockNode();
+      if (!CurrentEntry) {
+        IsAtEnd = true;
+      }
+      WasPreviousTokenFlowEntry = false;
+      break;
+    }
+  }
+}
+
+Document::Document(Stream &S) : stream(S), Root(0) {
+  if (parseDirectives())
+    expectToken(Token::TK_DocumentStart);
+  Token &T = peekNext();
+  if (T.Kind == Token::TK_DocumentStart)
+    getNext();
+}
+
+bool Document::skip()  {
+  if (stream.scanner->failed())
+    return false;
+  if (!Root)
+    getRoot();
+  Root->skip();
+  Token &T = peekNext();
+  if (T.Kind == Token::TK_StreamEnd)
+    return false;
+  if (T.Kind == Token::TK_DocumentEnd) {
+    getNext();
+    return skip();
+  }
+  return true;
+}
+
+Token &Document::peekNext() {
+  return stream.scanner->peekNext();
+}
+
+Token Document::getNext() {
+  return stream.scanner->getNext();
+}
+
+void Document::setError(const Twine &Message, Token &Location) const {
+  stream.scanner->setError(Message, Location.Range.begin());
+}
+
+bool Document::failed() const {
+  return stream.scanner->failed();
+}
+
+Node *Document::parseBlockNode() {
+  Token T = peekNext();
+  // Handle properties.
+  Token AnchorInfo;
+parse_property:
+  switch (T.Kind) {
+  case Token::TK_Alias:
+    getNext();
+    return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
+  case Token::TK_Anchor:
+    if (AnchorInfo.Kind == Token::TK_Anchor) {
+      setError("Already encountered an anchor for this node!", T);
+      return 0;
+    }
+    AnchorInfo = getNext(); // Consume TK_Anchor.
+    T = peekNext();
+    goto parse_property;
+  case Token::TK_Tag:
+    getNext(); // Skip TK_Tag.
+    T = peekNext();
+    goto parse_property;
+  default:
+    break;
+  }
+
+  switch (T.Kind) {
+  case Token::TK_BlockEntry:
+    // We got an unindented BlockEntry sequence. This is not terminated with
+    // a BlockEnd.
+    // Don't eat the TK_BlockEntry, SequenceNode needs it.
+    return new (NodeAllocator) SequenceNode( stream.CurrentDoc
+                                           , AnchorInfo.Range.substr(1)
+                                           , SequenceNode::ST_Indentless);
+  case Token::TK_BlockSequenceStart:
+    getNext();
+    return new (NodeAllocator)
+      SequenceNode( stream.CurrentDoc
+                  , AnchorInfo.Range.substr(1)
+                  , SequenceNode::ST_Block);
+  case Token::TK_BlockMappingStart:
+    getNext();
+    return new (NodeAllocator)
+      MappingNode( stream.CurrentDoc
+                 , AnchorInfo.Range.substr(1)
+                 , MappingNode::MT_Block);
+  case Token::TK_FlowSequenceStart:
+    getNext();
+    return new (NodeAllocator)
+      SequenceNode( stream.CurrentDoc
+                  , AnchorInfo.Range.substr(1)
+                  , SequenceNode::ST_Flow);
+  case Token::TK_FlowMappingStart:
+    getNext();
+    return new (NodeAllocator)
+      MappingNode( stream.CurrentDoc
+                 , AnchorInfo.Range.substr(1)
+                 , MappingNode::MT_Flow);
+  case Token::TK_Scalar:
+    getNext();
+    return new (NodeAllocator)
+      ScalarNode( stream.CurrentDoc
+                , AnchorInfo.Range.substr(1)
+                , T.Range);
+  case Token::TK_Key:
+    // Don't eat the TK_Key, KeyValueNode expects it.
+    return new (NodeAllocator)
+      MappingNode( stream.CurrentDoc
+                 , AnchorInfo.Range.substr(1)
+                 , MappingNode::MT_Inline);
+  case Token::TK_DocumentStart:
+  case Token::TK_DocumentEnd:
+  case Token::TK_StreamEnd:
+  default:
+    // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
+    //       !!null null.
+    return new (NodeAllocator) NullNode(stream.CurrentDoc);
+  case Token::TK_Error:
+    return 0;
+  }
+  llvm_unreachable("Control flow shouldn't reach here.");
+  return 0;
+}
+
+bool Document::parseDirectives() {
+  bool isDirective = false;
+  while (true) {
+    Token T = peekNext();
+    if (T.Kind == Token::TK_TagDirective) {
+      handleTagDirective(getNext());
+      isDirective = true;
+    } else if (T.Kind == Token::TK_VersionDirective) {
+      stream.handleYAMLDirective(getNext());
+      isDirective = true;
+    } else
+      break;
+  }
+  return isDirective;
+}
+
+bool Document::expectToken(int TK) {
+  Token T = getNext();
+  if (T.Kind != TK) {
+    setError("Unexpected token", T);
+    return false;
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Support/YAMLTraits.cpp b/contrib/llvm/lib/Support/YAMLTraits.cpp
new file mode 100644
index 000000000000..9da2aa7c841d
--- /dev/null
+++ b/contrib/llvm/lib/Support/YAMLTraits.cpp
@@ -0,0 +1,827 @@
+//===- lib/Support/YAMLTraits.cpp -----------------------------------------===//
+//
+//                             The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+using namespace llvm;
+using namespace yaml;
+
+//===----------------------------------------------------------------------===//
+//  IO
+//===----------------------------------------------------------------------===//
+
+IO::IO(void *Context) : Ctxt(Context) {
+}
+
+IO::~IO() {
+}
+
+void *IO::getContext() {
+  return Ctxt;
+}
+
+void IO::setContext(void *Context) {
+  Ctxt = Context;
+}
+
+//===----------------------------------------------------------------------===//
+//  Input
+//===----------------------------------------------------------------------===//
+
+Input::Input(StringRef InputContent, void *Ctxt) 
+  : IO(Ctxt), 
+    Strm(new Stream(InputContent, SrcMgr)),
+    CurrentNode(NULL) {
+  DocIterator = Strm->begin();
+}
+
+Input::~Input() {
+  
+}
+
+error_code Input::error() {
+  return EC;
+}
+
+void Input::setDiagHandler(SourceMgr::DiagHandlerTy Handler, void *Ctxt) {
+  SrcMgr.setDiagHandler(Handler, Ctxt);
+}
+
+bool Input::outputting() {
+  return false;
+}
+
+bool Input::setCurrentDocument() {
+  if (DocIterator != Strm->end()) {
+    Node *N = DocIterator->getRoot();
+    if (isa<NullNode>(N)) {
+      // Empty files are allowed and ignored
+      ++DocIterator;
+      return setCurrentDocument();
+    }
+    TopNode.reset(this->createHNodes(N));
+    CurrentNode = TopNode.get();
+    return true;
+  }
+  return false;
+}
+
+void Input::nextDocument() {
+  ++DocIterator;
+}
+
+void Input::beginMapping() {
+  if (EC)
+    return;
+  MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+  if (MN) {
+    MN->ValidKeys.clear();
+  }
+}
+
+bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault,
+                         void *&SaveInfo) {
+  UseDefault = false;
+  if (EC)
+    return false;
+  MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+  if (!MN) {
+    setError(CurrentNode, "not a mapping");
+    return false;
+  }
+  MN->ValidKeys.push_back(Key);
+  HNode *Value = MN->Mapping[Key];
+  if (!Value) {
+    if (Required)
+      setError(CurrentNode, Twine("missing required key '") + Key + "'");
+    else
+      UseDefault = true;
+    return false;
+  }
+  SaveInfo = CurrentNode;
+  CurrentNode = Value;
+  return true;
+}
+
+void Input::postflightKey(void *saveInfo) {
+  CurrentNode = reinterpret_cast<HNode *>(saveInfo);
+}
+
+void Input::endMapping() {
+  if (EC)
+    return;
+  MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+  if (!MN)
+    return;
+  for (MapHNode::NameToNode::iterator i = MN->Mapping.begin(),
+       End = MN->Mapping.end(); i != End; ++i) {
+    if (!MN->isValidKey(i->first)) {
+      setError(i->second, Twine("unknown key '") + i->first + "'");
+      break;
+    }
+  }
+}
+
+unsigned Input::beginSequence() {
+  if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+    return SQ->Entries.size();
+  }
+  return 0;
+}
+
+void Input::endSequence() {
+}
+
+bool Input::preflightElement(unsigned Index, void *&SaveInfo) {
+  if (EC)
+    return false;
+  if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+    SaveInfo = CurrentNode;
+    CurrentNode = SQ->Entries[Index];
+    return true;
+  }
+  return false;
+}
+
+void Input::postflightElement(void *SaveInfo) {
+  CurrentNode = reinterpret_cast<HNode *>(SaveInfo);
+}
+
+unsigned Input::beginFlowSequence() {
+  if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+    return SQ->Entries.size();
+  }
+  return 0;
+}
+
+bool Input::preflightFlowElement(unsigned index, void *&SaveInfo) {
+  if (EC)
+    return false;
+  if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+    SaveInfo = CurrentNode;
+    CurrentNode = SQ->Entries[index];
+    return true;
+  }
+  return false;
+}
+
+void Input::postflightFlowElement(void *SaveInfo) {
+  CurrentNode = reinterpret_cast<HNode *>(SaveInfo);
+}
+
+void Input::endFlowSequence() {
+}
+
+void Input::beginEnumScalar() {
+  ScalarMatchFound = false;
+}
+
+bool Input::matchEnumScalar(const char *Str, bool) {
+  if (ScalarMatchFound)
+    return false;
+  if (ScalarHNode *SN = dyn_cast<ScalarHNode>(CurrentNode)) {
+    if (SN->value().equals(Str)) {
+      ScalarMatchFound = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+void Input::endEnumScalar() {
+  if (!ScalarMatchFound) {
+    setError(CurrentNode, "unknown enumerated scalar");
+  }
+}
+
+bool Input::beginBitSetScalar(bool &DoClear) {
+  BitValuesUsed.clear();
+  if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+    BitValuesUsed.insert(BitValuesUsed.begin(), SQ->Entries.size(), false);
+  } else {
+    setError(CurrentNode, "expected sequence of bit values");
+  }
+  DoClear = true;
+  return true;
+}
+
+bool Input::bitSetMatch(const char *Str, bool) {
+  if (EC)
+    return false;
+  if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+    unsigned Index = 0;
+    for (std::vector<HNode *>::iterator i = SQ->Entries.begin(),
+         End = SQ->Entries.end(); i != End; ++i) {
+      if (ScalarHNode *SN = dyn_cast<ScalarHNode>(*i)) {
+        if (SN->value().equals(Str)) {
+          BitValuesUsed[Index] = true;
+          return true;
+        }
+      } else {
+        setError(CurrentNode, "unexpected scalar in sequence of bit values");
+      }
+      ++Index;
+    }
+  } else {
+    setError(CurrentNode, "expected sequence of bit values");
+  }
+  return false;
+}
+
+void Input::endBitSetScalar() {
+  if (EC)
+    return;
+  if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
+    assert(BitValuesUsed.size() == SQ->Entries.size());
+    for (unsigned i = 0; i < SQ->Entries.size(); ++i) {
+      if (!BitValuesUsed[i]) {
+        setError(SQ->Entries[i], "unknown bit value");
+        return;
+      }
+    }
+  }
+}
+
+void Input::scalarString(StringRef &S) {
+  if (ScalarHNode *SN = dyn_cast<ScalarHNode>(CurrentNode)) {
+    S = SN->value();
+  } else {
+    setError(CurrentNode, "unexpected scalar");
+  }
+}
+
+void Input::setError(HNode *hnode, const Twine &message) {
+  this->setError(hnode->_node, message);
+}
+
+void Input::setError(Node *node, const Twine &message) {
+  Strm->printError(node, message);
+  EC = make_error_code(errc::invalid_argument);
+}
+
+Input::HNode *Input::createHNodes(Node *N) {
+  SmallString<128> StringStorage;
+  if (ScalarNode *SN = dyn_cast<ScalarNode>(N)) {
+    StringRef KeyStr = SN->getValue(StringStorage);
+    if (!StringStorage.empty()) {
+      // Copy string to permanent storage
+      unsigned Len = StringStorage.size();
+      char *Buf = StringAllocator.Allocate<char>(Len);
+      memcpy(Buf, &StringStorage[0], Len);
+      KeyStr = StringRef(Buf, Len);
+    }
+    return new ScalarHNode(N, KeyStr);
+  } else if (SequenceNode *SQ = dyn_cast<SequenceNode>(N)) {
+    SequenceHNode *SQHNode = new SequenceHNode(N);
+    for (SequenceNode::iterator i = SQ->begin(), End = SQ->end(); i != End;
+         ++i) {
+      HNode *Entry = this->createHNodes(i);
+      if (EC)
+        break;
+      SQHNode->Entries.push_back(Entry);
+    }
+    return SQHNode;
+  } else if (MappingNode *Map = dyn_cast<MappingNode>(N)) {
+    MapHNode *mapHNode = new MapHNode(N);
+    for (MappingNode::iterator i = Map->begin(), End = Map->end(); i != End;
+         ++i) {
+      ScalarNode *KeyScalar = dyn_cast<ScalarNode>(i->getKey());
+      StringStorage.clear();
+      StringRef KeyStr = KeyScalar->getValue(StringStorage);
+      if (!StringStorage.empty()) {
+        // Copy string to permanent storage
+        unsigned Len = StringStorage.size();
+        char *Buf = StringAllocator.Allocate<char>(Len);
+        memcpy(Buf, &StringStorage[0], Len);
+        KeyStr = StringRef(Buf, Len);
+      }
+      HNode *ValueHNode = this->createHNodes(i->getValue());
+      if (EC)
+        break;
+      mapHNode->Mapping[KeyStr] = ValueHNode;
+    }
+    return mapHNode;
+  } else if (isa<NullNode>(N)) {
+    return new EmptyHNode(N);
+  } else {
+    setError(N, "unknown node kind");
+    return NULL;
+  }
+}
+
+bool Input::MapHNode::isValidKey(StringRef Key) {
+  for (SmallVector<const char *, 6>::iterator i = ValidKeys.begin(),
+       End = ValidKeys.end(); i != End; ++i) {
+    if (Key.equals(*i))
+      return true;
+  }
+  return false;
+}
+
+void Input::setError(const Twine &Message) {
+  this->setError(CurrentNode, Message);
+}
+
+Input::MapHNode::~MapHNode() {
+  for (MapHNode::NameToNode::iterator i = Mapping.begin(), End = Mapping.end();
+                                                                i != End; ++i) {
+    delete i->second;
+  }
+}
+
+Input::SequenceHNode::~SequenceHNode() {
+  for (std::vector<HNode*>::iterator i = Entries.begin(), End = Entries.end();
+                                                                i != End; ++i) {
+    delete *i;
+  }
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//  Output
+//===----------------------------------------------------------------------===//
+
+Output::Output(raw_ostream &yout, void *context)
+    : IO(context),
+      Out(yout),
+      Column(0),
+      ColumnAtFlowStart(0),
+      NeedBitValueComma(false),
+      NeedFlowSequenceComma(false),
+      EnumerationMatchFound(false),
+      NeedsNewLine(false) {
+}
+
+Output::~Output() {
+}
+
+bool Output::outputting() {
+  return true;
+}
+
+void Output::beginMapping() {
+  StateStack.push_back(inMapFirstKey);
+  NeedsNewLine = true;
+}
+
+void Output::endMapping() {
+  StateStack.pop_back();
+}
+
+bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
+                          bool &UseDefault, void *&) {
+  UseDefault = false;
+  if (Required || !SameAsDefault) {
+    this->newLineCheck();
+    this->paddedKey(Key);
+    return true;
+  }
+  return false;
+}
+
+void Output::postflightKey(void *) {
+  if (StateStack.back() == inMapFirstKey) {
+    StateStack.pop_back();
+    StateStack.push_back(inMapOtherKey);
+  }
+}
+
+void Output::beginDocuments() {
+  this->outputUpToEndOfLine("---");
+}
+
+bool Output::preflightDocument(unsigned index) {
+  if (index > 0)
+    this->outputUpToEndOfLine("\n---");
+  return true;
+}
+
+void Output::postflightDocument() {
+}
+
+void Output::endDocuments() {
+  output("\n...\n");
+}
+
+unsigned Output::beginSequence() {
+  StateStack.push_back(inSeq);
+  NeedsNewLine = true;
+  return 0;
+}
+
+void Output::endSequence() {
+  StateStack.pop_back();
+}
+
+bool Output::preflightElement(unsigned, void *&) {
+  return true;
+}
+
+void Output::postflightElement(void *) {
+}
+
+unsigned Output::beginFlowSequence() {
+  StateStack.push_back(inFlowSeq);
+  this->newLineCheck();
+  ColumnAtFlowStart = Column;
+  output("[ ");
+  NeedFlowSequenceComma = false;
+  return 0;
+}
+
+void Output::endFlowSequence() {
+  StateStack.pop_back();
+  this->outputUpToEndOfLine(" ]");
+}
+
+bool Output::preflightFlowElement(unsigned, void *&) {
+  if (NeedFlowSequenceComma)
+    output(", ");
+  if (Column > 70) {
+    output("\n");
+    for (int i = 0; i < ColumnAtFlowStart; ++i)
+      output(" ");
+    Column = ColumnAtFlowStart;
+    output("  ");
+  }
+  return true;
+}
+
+void Output::postflightFlowElement(void *) {
+  NeedFlowSequenceComma = true;
+}
+
+void Output::beginEnumScalar() {
+  EnumerationMatchFound = false;
+}
+
+bool Output::matchEnumScalar(const char *Str, bool Match) {
+  if (Match && !EnumerationMatchFound) {
+    this->newLineCheck();
+    this->outputUpToEndOfLine(Str);
+    EnumerationMatchFound = true;
+  }
+  return false;
+}
+
+void Output::endEnumScalar() {
+  if (!EnumerationMatchFound)
+    llvm_unreachable("bad runtime enum value");
+}
+
+bool Output::beginBitSetScalar(bool &DoClear) {
+  this->newLineCheck();
+  output("[ ");
+  NeedBitValueComma = false;
+  DoClear = false;
+  return true;
+}
+
+bool Output::bitSetMatch(const char *Str, bool Matches) {
+  if (Matches) {
+    if (NeedBitValueComma)
+      output(", ");
+    this->output(Str);
+    NeedBitValueComma = true;
+  }
+  return false;
+}
+
+void Output::endBitSetScalar() {
+  this->outputUpToEndOfLine(" ]");
+}
+
+void Output::scalarString(StringRef &S) {
+  this->newLineCheck();
+  if (S.find('\n') == StringRef::npos) {
+    // No embedded new-line chars, just print string.
+    this->outputUpToEndOfLine(S);
+    return;
+  }
+  unsigned i = 0;
+  unsigned j = 0;
+  unsigned End = S.size();
+  output("'"); // Starting single quote.
+  const char *Base = S.data();
+  while (j < End) {
+    // Escape a single quote by doubling it.
+    if (S[j] == '\'') {
+      output(StringRef(&Base[i], j - i + 1));
+      output("'");
+      i = j + 1;
+    }
+    ++j;
+  }
+  output(StringRef(&Base[i], j - i));
+  this->outputUpToEndOfLine("'"); // Ending single quote.
+}
+
+void Output::setError(const Twine &message) {
+}
+
+void Output::output(StringRef s) {
+  Column += s.size();
+  Out << s;
+}
+
+void Output::outputUpToEndOfLine(StringRef s) {
+  this->output(s);
+  if (StateStack.empty() || StateStack.back() != inFlowSeq)
+    NeedsNewLine = true;
+}
+
+void Output::outputNewLine() {
+  Out << "\n";
+  Column = 0;
+}
+
+// if seq at top, indent as if map, then add "- "
+// if seq in middle, use "- " if firstKey, else use "  "
+//
+
+void Output::newLineCheck() {
+  if (!NeedsNewLine)
+    return;
+  NeedsNewLine = false;
+
+  this->outputNewLine();
+
+  assert(StateStack.size() > 0);
+  unsigned Indent = StateStack.size() - 1;
+  bool OutputDash = false;
+
+  if (StateStack.back() == inSeq) {
+    OutputDash = true;
+  } else if ((StateStack.size() > 1) && (StateStack.back() == inMapFirstKey) &&
+             (StateStack[StateStack.size() - 2] == inSeq)) {
+    --Indent;
+    OutputDash = true;
+  }
+
+  for (unsigned i = 0; i < Indent; ++i) {
+    output("  ");
+  }
+  if (OutputDash) {
+    output("- ");
+  }
+
+}
+
+void Output::paddedKey(StringRef key) {
+  output(key);
+  output(":");
+  const char *spaces = "                ";
+  if (key.size() < strlen(spaces))
+    output(&spaces[key.size()]);
+  else
+    output(" ");
+}
+
+//===----------------------------------------------------------------------===//
+//  traits for built-in types
+//===----------------------------------------------------------------------===//
+
+void ScalarTraits<bool>::output(const bool &Val, void *, raw_ostream &Out) {
+  Out << (Val ? "true" : "false");
+}
+
+StringRef ScalarTraits<bool>::input(StringRef Scalar, void *, bool &Val) {
+  if (Scalar.equals("true")) {
+    Val = true;
+    return StringRef();
+  } else if (Scalar.equals("false")) {
+    Val = false;
+    return StringRef();
+  }
+  return "invalid boolean";
+}
+
+void ScalarTraits<StringRef>::output(const StringRef &Val, void *,
+                                     raw_ostream &Out) {
+  Out << Val;
+}
+
+StringRef ScalarTraits<StringRef>::input(StringRef Scalar, void *,
+                                         StringRef &Val) {
+  Val = Scalar;
+  return StringRef();
+}
+
+void ScalarTraits<uint8_t>::output(const uint8_t &Val, void *,
+                                   raw_ostream &Out) {
+  // use temp uin32_t because ostream thinks uint8_t is a character
+  uint32_t Num = Val;
+  Out << Num;
+}
+
+StringRef ScalarTraits<uint8_t>::input(StringRef Scalar, void *, uint8_t &Val) {
+  unsigned long long n;
+  if (getAsUnsignedInteger(Scalar, 0, n))
+    return "invalid number";
+  if (n > 0xFF)
+    return "out of range number";
+  Val = n;
+  return StringRef();
+}
+
+void ScalarTraits<uint16_t>::output(const uint16_t &Val, void *,
+                                    raw_ostream &Out) {
+  Out << Val;
+}
+
+StringRef ScalarTraits<uint16_t>::input(StringRef Scalar, void *,
+                                        uint16_t &Val) {
+  unsigned long long n;
+  if (getAsUnsignedInteger(Scalar, 0, n))
+    return "invalid number";
+  if (n > 0xFFFF)
+    return "out of range number";
+  Val = n;
+  return StringRef();
+}
+
+void ScalarTraits<uint32_t>::output(const uint32_t &Val, void *,
+                                    raw_ostream &Out) {
+  Out << Val;
+}
+
+StringRef ScalarTraits<uint32_t>::input(StringRef Scalar, void *,
+                                        uint32_t &Val) {
+  unsigned long long n;
+  if (getAsUnsignedInteger(Scalar, 0, n))
+    return "invalid number";
+  if (n > 0xFFFFFFFFUL)
+    return "out of range number";
+  Val = n;
+  return StringRef();
+}
+
+void ScalarTraits<uint64_t>::output(const uint64_t &Val, void *,
+                                    raw_ostream &Out) {
+  Out << Val;
+}
+
+StringRef ScalarTraits<uint64_t>::input(StringRef Scalar, void *,
+                                        uint64_t &Val) {
+  unsigned long long N;
+  if (getAsUnsignedInteger(Scalar, 0, N))
+    return "invalid number";
+  Val = N;
+  return StringRef();
+}
+
+void ScalarTraits<int8_t>::output(const int8_t &Val, void *, raw_ostream &Out) {
+  // use temp in32_t because ostream thinks int8_t is a character
+  int32_t Num = Val;
+  Out << Num;
+}
+
+StringRef ScalarTraits<int8_t>::input(StringRef Scalar, void *, int8_t &Val) {
+  long long N;
+  if (getAsSignedInteger(Scalar, 0, N))
+    return "invalid number";
+  if ((N > 127) || (N < -128))
+    return "out of range number";
+  Val = N;
+  return StringRef();
+}
+
+void ScalarTraits<int16_t>::output(const int16_t &Val, void *,
+                                   raw_ostream &Out) {
+  Out << Val;
+}
+
+StringRef ScalarTraits<int16_t>::input(StringRef Scalar, void *, int16_t &Val) {
+  long long N;
+  if (getAsSignedInteger(Scalar, 0, N))
+    return "invalid number";
+  if ((N > INT16_MAX) || (N < INT16_MIN))
+    return "out of range number";
+  Val = N;
+  return StringRef();
+}
+
+void ScalarTraits<int32_t>::output(const int32_t &Val, void *,
+                                   raw_ostream &Out) {
+  Out << Val;
+}
+
+StringRef ScalarTraits<int32_t>::input(StringRef Scalar, void *, int32_t &Val) {
+  long long N;
+  if (getAsSignedInteger(Scalar, 0, N))
+    return "invalid number";
+  if ((N > INT32_MAX) || (N < INT32_MIN))
+    return "out of range number";
+  Val = N;
+  return StringRef();
+}
+
+void ScalarTraits<int64_t>::output(const int64_t &Val, void *,
+                                   raw_ostream &Out) {
+  Out << Val;
+}
+
+StringRef ScalarTraits<int64_t>::input(StringRef Scalar, void *, int64_t &Val) {
+  long long N;
+  if (getAsSignedInteger(Scalar, 0, N))
+    return "invalid number";
+  Val = N;
+  return StringRef();
+}
+
+void ScalarTraits<double>::output(const double &Val, void *, raw_ostream &Out) {
+  Out << format("%g", Val);
+}
+
+StringRef ScalarTraits<double>::input(StringRef Scalar, void *, double &Val) {
+  SmallString<32> buff(Scalar.begin(), Scalar.end());
+  char *end;
+  Val = strtod(buff.c_str(), &end);
+  if (*end != '\0')
+    return "invalid floating point number";
+  return StringRef();
+}
+
+void ScalarTraits<float>::output(const float &Val, void *, raw_ostream &Out) {
+  Out << format("%g", Val);
+}
+
+StringRef ScalarTraits<float>::input(StringRef Scalar, void *, float &Val) {
+  SmallString<32> buff(Scalar.begin(), Scalar.end());
+  char *end;
+  Val = strtod(buff.c_str(), &end);
+  if (*end != '\0')
+    return "invalid floating point number";
+  return StringRef();
+}
+
+void ScalarTraits<Hex8>::output(const Hex8 &Val, void *, raw_ostream &Out) {
+  uint8_t Num = Val;
+  Out << format("0x%02X", Num);
+}
+
+StringRef ScalarTraits<Hex8>::input(StringRef Scalar, void *, Hex8 &Val) {
+  unsigned long long n;
+  if (getAsUnsignedInteger(Scalar, 0, n))
+    return "invalid hex8 number";
+  if (n > 0xFF)
+    return "out of range hex8 number";
+  Val = n;
+  return StringRef();
+}
+
+void ScalarTraits<Hex16>::output(const Hex16 &Val, void *, raw_ostream &Out) {
+  uint16_t Num = Val;
+  Out << format("0x%04X", Num);
+}
+
+StringRef ScalarTraits<Hex16>::input(StringRef Scalar, void *, Hex16 &Val) {
+  unsigned long long n;
+  if (getAsUnsignedInteger(Scalar, 0, n))
+    return "invalid hex16 number";
+  if (n > 0xFFFF)
+    return "out of range hex16 number";
+  Val = n;
+  return StringRef();
+}
+
+void ScalarTraits<Hex32>::output(const Hex32 &Val, void *, raw_ostream &Out) {
+  uint32_t Num = Val;
+  Out << format("0x%08X", Num);
+}
+
+StringRef ScalarTraits<Hex32>::input(StringRef Scalar, void *, Hex32 &Val) {
+  unsigned long long n;
+  if (getAsUnsignedInteger(Scalar, 0, n))
+    return "invalid hex32 number";
+  if (n > 0xFFFFFFFFUL)
+    return "out of range hex32 number";
+  Val = n;
+  return StringRef();
+}
+
+void ScalarTraits<Hex64>::output(const Hex64 &Val, void *, raw_ostream &Out) {
+  uint64_t Num = Val;
+  Out << format("0x%016llX", Num);
+}
+
+StringRef ScalarTraits<Hex64>::input(StringRef Scalar, void *, Hex64 &Val) {
+  unsigned long long Num;
+  if (getAsUnsignedInteger(Scalar, 0, Num))
+    return "invalid hex64 number";
+  Val = Num;
+  return StringRef();
+}
diff --git a/contrib/llvm/lib/Support/circular_raw_ostream.cpp b/contrib/llvm/lib/Support/circular_raw_ostream.cpp
new file mode 100644
index 000000000000..ca0d30db388c
--- /dev/null
+++ b/contrib/llvm/lib/Support/circular_raw_ostream.cpp
@@ -0,0 +1,45 @@
+//===- circular_raw_ostream.cpp - Implement circular_raw_ostream ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements support for circular buffered streams.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/circular_raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+void circular_raw_ostream::write_impl(const char *Ptr, size_t Size) {
+  if (BufferSize == 0) {
+    TheStream->write(Ptr, Size);
+    return;
+  }
+
+  // Write into the buffer, wrapping if necessary.
+  while (Size != 0) {
+    unsigned Bytes =
+      std::min(unsigned(Size), unsigned(BufferSize - (Cur - BufferArray)));
+    memcpy(Cur, Ptr, Bytes);
+    Size -= Bytes;
+    Cur += Bytes;
+    if (Cur == BufferArray + BufferSize) {
+      // Reset the output pointer to the start of the buffer.
+      Cur = BufferArray;
+      Filled = true;
+    }
+  }    
+}
+
+void circular_raw_ostream::flushBufferWithBanner() {
+  if (BufferSize != 0) {
+    // Write out the buffer
+    TheStream->write(Banner, std::strlen(Banner));
+    flushBuffer();
+  }
+}
diff --git a/contrib/llvm/lib/Support/raw_os_ostream.cpp b/contrib/llvm/lib/Support/raw_os_ostream.cpp
new file mode 100644
index 000000000000..44f2325d7f8a
--- /dev/null
+++ b/contrib/llvm/lib/Support/raw_os_ostream.cpp
@@ -0,0 +1,30 @@
+//===--- raw_os_ostream.cpp - Implement the raw_os_ostream class ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements support adapting raw_ostream to std::ostream.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/raw_os_ostream.h"
+#include <ostream>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  raw_os_ostream
+//===----------------------------------------------------------------------===//
+
+raw_os_ostream::~raw_os_ostream() {
+  flush();
+}
+
+void raw_os_ostream::write_impl(const char *Ptr, size_t Size) {
+  OS.write(Ptr, Size);
+}
+
+uint64_t raw_os_ostream::current_pos() const { return OS.tellp(); }
diff --git a/contrib/llvm/lib/Support/raw_ostream.cpp b/contrib/llvm/lib/Support/raw_ostream.cpp
new file mode 100644
index 000000000000..a433088b1930
--- /dev/null
+++ b/contrib/llvm/lib/Support/raw_ostream.cpp
@@ -0,0 +1,789 @@
+//===--- raw_ostream.cpp - Implement the raw_ostream classes --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements support for bulk buffered stream output.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/config.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/system_error.h"
+#include <cctype>
+#include <cerrno>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#if defined(HAVE_UNISTD_H)
+# include <unistd.h>
+#endif
+#if defined(HAVE_FCNTL_H)
+# include <fcntl.h>
+#endif
+#if defined(HAVE_SYS_UIO_H) && defined(HAVE_WRITEV)
+#  include <sys/uio.h>
+#endif
+
+#if defined(__CYGWIN__)
+#include <io.h>
+#endif
+
+#if defined(_MSC_VER)
+#include <io.h>
+#include <fcntl.h>
+#ifndef STDIN_FILENO
+# define STDIN_FILENO 0
+#endif
+#ifndef STDOUT_FILENO
+# define STDOUT_FILENO 1
+#endif
+#ifndef STDERR_FILENO
+# define STDERR_FILENO 2
+#endif
+#endif
+
+using namespace llvm;
+
+raw_ostream::~raw_ostream() {
+  // raw_ostream's subclasses should take care to flush the buffer
+  // in their destructors.
+  assert(OutBufCur == OutBufStart &&
+         "raw_ostream destructor called with non-empty buffer!");
+
+  if (BufferMode == InternalBuffer)
+    delete [] OutBufStart;
+}
+
+// An out of line virtual method to provide a home for the class vtable.
+void raw_ostream::handle() {}
+
+size_t raw_ostream::preferred_buffer_size() const {
+  // BUFSIZ is intended to be a reasonable default.
+  return BUFSIZ;
+}
+
+void raw_ostream::SetBuffered() {
+  // Ask the subclass to determine an appropriate buffer size.
+  if (size_t Size = preferred_buffer_size())
+    SetBufferSize(Size);
+  else
+    // It may return 0, meaning this stream should be unbuffered.
+    SetUnbuffered();
+}
+
+void raw_ostream::SetBufferAndMode(char *BufferStart, size_t Size,
+                                   BufferKind Mode) {
+  assert(((Mode == Unbuffered && BufferStart == 0 && Size == 0) ||
+          (Mode != Unbuffered && BufferStart && Size)) &&
+         "stream must be unbuffered or have at least one byte");
+  // Make sure the current buffer is free of content (we can't flush here; the
+  // child buffer management logic will be in write_impl).
+  assert(GetNumBytesInBuffer() == 0 && "Current buffer is non-empty!");
+
+  if (BufferMode == InternalBuffer)
+    delete [] OutBufStart;
+  OutBufStart = BufferStart;
+  OutBufEnd = OutBufStart+Size;
+  OutBufCur = OutBufStart;
+  BufferMode = Mode;
+
+  assert(OutBufStart <= OutBufEnd && "Invalid size!");
+}
+
+raw_ostream &raw_ostream::operator<<(unsigned long N) {
+  // Zero is a special case.
+  if (N == 0)
+    return *this << '0';
+
+  char NumberBuffer[20];
+  char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+  char *CurPtr = EndPtr;
+
+  while (N) {
+    *--CurPtr = '0' + char(N % 10);
+    N /= 10;
+  }
+  return write(CurPtr, EndPtr-CurPtr);
+}
+
+raw_ostream &raw_ostream::operator<<(long N) {
+  if (N <  0) {
+    *this << '-';
+    // Avoid undefined behavior on LONG_MIN with a cast.
+    N = -(unsigned long)N;
+  }
+
+  return this->operator<<(static_cast<unsigned long>(N));
+}
+
+raw_ostream &raw_ostream::operator<<(unsigned long long N) {
+  // Output using 32-bit div/mod when possible.
+  if (N == static_cast<unsigned long>(N))
+    return this->operator<<(static_cast<unsigned long>(N));
+
+  char NumberBuffer[20];
+  char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+  char *CurPtr = EndPtr;
+
+  while (N) {
+    *--CurPtr = '0' + char(N % 10);
+    N /= 10;
+  }
+  return write(CurPtr, EndPtr-CurPtr);
+}
+
+raw_ostream &raw_ostream::operator<<(long long N) {
+  if (N < 0) {
+    *this << '-';
+    // Avoid undefined behavior on INT64_MIN with a cast.
+    N = -(unsigned long long)N;
+  }
+
+  return this->operator<<(static_cast<unsigned long long>(N));
+}
+
+raw_ostream &raw_ostream::write_hex(unsigned long long N) {
+  // Zero is a special case.
+  if (N == 0)
+    return *this << '0';
+
+  char NumberBuffer[20];
+  char *EndPtr = NumberBuffer+sizeof(NumberBuffer);
+  char *CurPtr = EndPtr;
+
+  while (N) {
+    uintptr_t x = N % 16;
+    *--CurPtr = (x < 10 ? '0' + x : 'a' + x - 10);
+    N /= 16;
+  }
+
+  return write(CurPtr, EndPtr-CurPtr);
+}
+
+raw_ostream &raw_ostream::write_escaped(StringRef Str,
+                                        bool UseHexEscapes) {
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    unsigned char c = Str[i];
+
+    switch (c) {
+    case '\\':
+      *this << '\\' << '\\';
+      break;
+    case '\t':
+      *this << '\\' << 't';
+      break;
+    case '\n':
+      *this << '\\' << 'n';
+      break;
+    case '"':
+      *this << '\\' << '"';
+      break;
+    default:
+      if (std::isprint(c)) {
+        *this << c;
+        break;
+      }
+
+      // Write out the escaped representation.
+      if (UseHexEscapes) {
+        *this << '\\' << 'x';
+        *this << hexdigit((c >> 4 & 0xF));
+        *this << hexdigit((c >> 0) & 0xF);
+      } else {
+        // Always use a full 3-character octal escape.
+        *this << '\\';
+        *this << char('0' + ((c >> 6) & 7));
+        *this << char('0' + ((c >> 3) & 7));
+        *this << char('0' + ((c >> 0) & 7));
+      }
+    }
+  }
+
+  return *this;
+}
+
+raw_ostream &raw_ostream::operator<<(const void *P) {
+  *this << '0' << 'x';
+
+  return write_hex((uintptr_t) P);
+}
+
+raw_ostream &raw_ostream::operator<<(double N) {
+#ifdef _WIN32
+  // On MSVCRT and compatible, output of %e is incompatible to Posix
+  // by default. Number of exponent digits should be at least 2. "%+03d"
+  // FIXME: Implement our formatter to here or Support/Format.h!
+  int fpcl = _fpclass(N);
+
+  // negative zero
+  if (fpcl == _FPCLASS_NZ)
+    return *this << "-0.000000e+00";
+
+  char buf[16];
+  unsigned len;
+  len = snprintf(buf, sizeof(buf), "%e", N);
+  if (len <= sizeof(buf) - 2) {
+    if (len >= 5 && buf[len - 5] == 'e' && buf[len - 3] == '0') {
+      int cs = buf[len - 4];
+      if (cs == '+' || cs == '-') {
+        int c1 = buf[len - 2];
+        int c0 = buf[len - 1];
+        if (isdigit(static_cast<unsigned char>(c1)) &&
+            isdigit(static_cast<unsigned char>(c0))) {
+          // Trim leading '0': "...e+012" -> "...e+12\0"
+          buf[len - 3] = c1;
+          buf[len - 2] = c0;
+          buf[--len] = 0;
+        }
+      }
+    }
+    return this->operator<<(buf);
+  }
+#endif
+  return this->operator<<(format("%e", N));
+}
+
+
+
+void raw_ostream::flush_nonempty() {
+  assert(OutBufCur > OutBufStart && "Invalid call to flush_nonempty.");
+  size_t Length = OutBufCur - OutBufStart;
+  OutBufCur = OutBufStart;
+  write_impl(OutBufStart, Length);
+}
+
+raw_ostream &raw_ostream::write(unsigned char C) {
+  // Group exceptional cases into a single branch.
+  if (LLVM_UNLIKELY(OutBufCur >= OutBufEnd)) {
+    if (LLVM_UNLIKELY(!OutBufStart)) {
+      if (BufferMode == Unbuffered) {
+        write_impl(reinterpret_cast<char*>(&C), 1);
+        return *this;
+      }
+      // Set up a buffer and start over.
+      SetBuffered();
+      return write(C);
+    }
+
+    flush_nonempty();
+  }
+
+  *OutBufCur++ = C;
+  return *this;
+}
+
+raw_ostream &raw_ostream::write(const char *Ptr, size_t Size) {
+  // Group exceptional cases into a single branch.
+  if (LLVM_UNLIKELY(size_t(OutBufEnd - OutBufCur) < Size)) {
+    if (LLVM_UNLIKELY(!OutBufStart)) {
+      if (BufferMode == Unbuffered) {
+        write_impl(Ptr, Size);
+        return *this;
+      }
+      // Set up a buffer and start over.
+      SetBuffered();
+      return write(Ptr, Size);
+    }
+
+    size_t NumBytes = OutBufEnd - OutBufCur;
+
+    // If the buffer is empty at this point we have a string that is larger
+    // than the buffer. Directly write the chunk that is a multiple of the
+    // preferred buffer size and put the remainder in the buffer.
+    if (LLVM_UNLIKELY(OutBufCur == OutBufStart)) {
+      size_t BytesToWrite = Size - (Size % NumBytes);
+      write_impl(Ptr, BytesToWrite);
+      size_t BytesRemaining = Size - BytesToWrite;
+      if (BytesRemaining > size_t(OutBufEnd - OutBufCur)) {
+        // Too much left over to copy into our buffer.
+        return write(Ptr + BytesToWrite, BytesRemaining);
+      }
+      copy_to_buffer(Ptr + BytesToWrite, BytesRemaining);
+      return *this;
+    }
+
+    // We don't have enough space in the buffer to fit the string in. Insert as
+    // much as possible, flush and start over with the remainder.
+    copy_to_buffer(Ptr, NumBytes);
+    flush_nonempty();
+    return write(Ptr + NumBytes, Size - NumBytes);
+  }
+
+  copy_to_buffer(Ptr, Size);
+
+  return *this;
+}
+
+void raw_ostream::copy_to_buffer(const char *Ptr, size_t Size) {
+  assert(Size <= size_t(OutBufEnd - OutBufCur) && "Buffer overrun!");
+
+  // Handle short strings specially, memcpy isn't very good at very short
+  // strings.
+  switch (Size) {
+  case 4: OutBufCur[3] = Ptr[3]; // FALL THROUGH
+  case 3: OutBufCur[2] = Ptr[2]; // FALL THROUGH
+  case 2: OutBufCur[1] = Ptr[1]; // FALL THROUGH
+  case 1: OutBufCur[0] = Ptr[0]; // FALL THROUGH
+  case 0: break;
+  default:
+    memcpy(OutBufCur, Ptr, Size);
+    break;
+  }
+
+  OutBufCur += Size;
+}
+
+// Formatted output.
+raw_ostream &raw_ostream::operator<<(const format_object_base &Fmt) {
+  // If we have more than a few bytes left in our output buffer, try
+  // formatting directly onto its end.
+  size_t NextBufferSize = 127;
+  size_t BufferBytesLeft = OutBufEnd - OutBufCur;
+  if (BufferBytesLeft > 3) {
+    size_t BytesUsed = Fmt.print(OutBufCur, BufferBytesLeft);
+
+    // Common case is that we have plenty of space.
+    if (BytesUsed <= BufferBytesLeft) {
+      OutBufCur += BytesUsed;
+      return *this;
+    }
+
+    // Otherwise, we overflowed and the return value tells us the size to try
+    // again with.
+    NextBufferSize = BytesUsed;
+  }
+
+  // If we got here, we didn't have enough space in the output buffer for the
+  // string.  Try printing into a SmallVector that is resized to have enough
+  // space.  Iterate until we win.
+  SmallVector<char, 128> V;
+
+  while (1) {
+    V.resize(NextBufferSize);
+
+    // Try formatting into the SmallVector.
+    size_t BytesUsed = Fmt.print(V.data(), NextBufferSize);
+
+    // If BytesUsed fit into the vector, we win.
+    if (BytesUsed <= NextBufferSize)
+      return write(V.data(), BytesUsed);
+
+    // Otherwise, try again with a new size.
+    assert(BytesUsed > NextBufferSize && "Didn't grow buffer!?");
+    NextBufferSize = BytesUsed;
+  }
+}
+
+/// indent - Insert 'NumSpaces' spaces.
+raw_ostream &raw_ostream::indent(unsigned NumSpaces) {
+  static const char Spaces[] = "                                "
+                               "                                "
+                               "                ";
+
+  // Usually the indentation is small, handle it with a fastpath.
+  if (NumSpaces < array_lengthof(Spaces))
+    return write(Spaces, NumSpaces);
+
+  while (NumSpaces) {
+    unsigned NumToWrite = std::min(NumSpaces,
+                                   (unsigned)array_lengthof(Spaces)-1);
+    write(Spaces, NumToWrite);
+    NumSpaces -= NumToWrite;
+  }
+  return *this;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Formatted Output
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method.
+void format_object_base::home() {
+}
+
+//===----------------------------------------------------------------------===//
+//  raw_fd_ostream
+//===----------------------------------------------------------------------===//
+
+/// raw_fd_ostream - Open the specified file for writing. If an error
+/// occurs, information about the error is put into ErrorInfo, and the
+/// stream should be immediately destroyed; the string will be empty
+/// if no error occurred.
+raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo,
+                               unsigned Flags)
+  : Error(false), UseAtomicWrites(false), pos(0)
+{
+  assert(Filename != 0 && "Filename is null");
+  // Verify that we don't have both "append" and "excl".
+  assert((!(Flags & F_Excl) || !(Flags & F_Append)) &&
+         "Cannot specify both 'excl' and 'append' file creation flags!");
+
+  ErrorInfo.clear();
+
+  // Handle "-" as stdout. Note that when we do this, we consider ourself
+  // the owner of stdout. This means that we can do things like close the
+  // file descriptor when we're done and set the "binary" flag globally.
+  if (Filename[0] == '-' && Filename[1] == 0) {
+    FD = STDOUT_FILENO;
+    // If user requested binary then put stdout into binary mode if
+    // possible.
+    if (Flags & F_Binary)
+      sys::Program::ChangeStdoutToBinary();
+    // Close stdout when we're done, to detect any output errors.
+    ShouldClose = true;
+    return;
+  }
+
+  int OpenFlags = O_WRONLY|O_CREAT;
+#ifdef O_BINARY
+  if (Flags & F_Binary)
+    OpenFlags |= O_BINARY;
+#endif
+
+  if (Flags & F_Append)
+    OpenFlags |= O_APPEND;
+  else
+    OpenFlags |= O_TRUNC;
+  if (Flags & F_Excl)
+    OpenFlags |= O_EXCL;
+
+  while ((FD = open(Filename, OpenFlags, 0664)) < 0) {
+    if (errno != EINTR) {
+      ErrorInfo = "Error opening output file '" + std::string(Filename) + "'";
+      ShouldClose = false;
+      return;
+    }
+  }
+
+  // Ok, we successfully opened the file, so it'll need to be closed.
+  ShouldClose = true;
+}
+
+/// raw_fd_ostream ctor - FD is the file descriptor that this writes to.  If
+/// ShouldClose is true, this closes the file when the stream is destroyed.
+raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered)
+  : raw_ostream(unbuffered), FD(fd),
+    ShouldClose(shouldClose), Error(false), UseAtomicWrites(false) {
+#ifdef O_BINARY
+  // Setting STDOUT and STDERR to binary mode is necessary in Win32
+  // to avoid undesirable linefeed conversion.
+  if (fd == STDOUT_FILENO || fd == STDERR_FILENO)
+    setmode(fd, O_BINARY);
+#endif
+
+  // Get the starting position.
+  off_t loc = ::lseek(FD, 0, SEEK_CUR);
+  if (loc == (off_t)-1)
+    pos = 0;
+  else
+    pos = static_cast<uint64_t>(loc);
+}
+
+raw_fd_ostream::~raw_fd_ostream() {
+  if (FD >= 0) {
+    flush();
+    if (ShouldClose)
+      while (::close(FD) != 0)
+        if (errno != EINTR) {
+          error_detected();
+          break;
+        }
+  }
+
+#ifdef __MINGW32__
+  // On mingw, global dtors should not call exit().
+  // report_fatal_error() invokes exit(). We know report_fatal_error()
+  // might not write messages to stderr when any errors were detected
+  // on FD == 2.
+  if (FD == 2) return;
+#endif
+
+  // If there are any pending errors, report them now. Clients wishing
+  // to avoid report_fatal_error calls should check for errors with
+  // has_error() and clear the error flag with clear_error() before
+  // destructing raw_ostream objects which may have errors.
+  if (has_error())
+    report_fatal_error("IO failure on output stream.", /*GenCrashDiag=*/false);
+}
+
+
+void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
+  assert(FD >= 0 && "File already closed.");
+  pos += Size;
+
+  do {
+    ssize_t ret;
+
+    // Check whether we should attempt to use atomic writes.
+    if (LLVM_LIKELY(!UseAtomicWrites)) {
+      ret = ::write(FD, Ptr, Size);
+    } else {
+      // Use ::writev() where available.
+#if defined(HAVE_WRITEV)
+      const void *Addr = static_cast<const void *>(Ptr);
+      struct iovec IOV = {const_cast<void *>(Addr), Size };
+      ret = ::writev(FD, &IOV, 1);
+#else
+      ret = ::write(FD, Ptr, Size);
+#endif
+    }
+
+    if (ret < 0) {
+      // If it's a recoverable error, swallow it and retry the write.
+      //
+      // Ideally we wouldn't ever see EAGAIN or EWOULDBLOCK here, since
+      // raw_ostream isn't designed to do non-blocking I/O. However, some
+      // programs, such as old versions of bjam, have mistakenly used
+      // O_NONBLOCK. For compatibility, emulate blocking semantics by
+      // spinning until the write succeeds. If you don't want spinning,
+      // don't use O_NONBLOCK file descriptors with raw_ostream.
+      if (errno == EINTR || errno == EAGAIN
+#ifdef EWOULDBLOCK
+          || errno == EWOULDBLOCK
+#endif
+          )
+        continue;
+
+      // Otherwise it's a non-recoverable error. Note it and quit.
+      error_detected();
+      break;
+    }
+
+    // The write may have written some or all of the data. Update the
+    // size and buffer pointer to reflect the remainder that needs
+    // to be written. If there are no bytes left, we're done.
+    Ptr += ret;
+    Size -= ret;
+  } while (Size > 0);
+}
+
+void raw_fd_ostream::close() {
+  assert(ShouldClose);
+  ShouldClose = false;
+  flush();
+  while (::close(FD) != 0)
+    if (errno != EINTR) {
+      error_detected();
+      break;
+    }
+  FD = -1;
+}
+
+uint64_t raw_fd_ostream::seek(uint64_t off) {
+  flush();
+  pos = ::lseek(FD, off, SEEK_SET);
+  if (pos != off)
+    error_detected();
+  return pos;
+}
+
+size_t raw_fd_ostream::preferred_buffer_size() const {
+#if !defined(_MSC_VER) && !defined(__MINGW32__) && !defined(__minix)
+  // Windows and Minix have no st_blksize.
+  assert(FD >= 0 && "File not yet open!");
+  struct stat statbuf;
+  if (fstat(FD, &statbuf) != 0)
+    return 0;
+
+  // If this is a terminal, don't use buffering. Line buffering
+  // would be a more traditional thing to do, but it's not worth
+  // the complexity.
+  if (S_ISCHR(statbuf.st_mode) && isatty(FD))
+    return 0;
+  // Return the preferred block size.
+  return statbuf.st_blksize;
+#else
+  return raw_ostream::preferred_buffer_size();
+#endif
+}
+
+raw_ostream &raw_fd_ostream::changeColor(enum Colors colors, bool bold,
+                                         bool bg) {
+  if (sys::Process::ColorNeedsFlush())
+    flush();
+  const char *colorcode =
+    (colors == SAVEDCOLOR) ? sys::Process::OutputBold(bg)
+    : sys::Process::OutputColor(colors, bold, bg);
+  if (colorcode) {
+    size_t len = strlen(colorcode);
+    write(colorcode, len);
+    // don't account colors towards output characters
+    pos -= len;
+  }
+  return *this;
+}
+
+raw_ostream &raw_fd_ostream::resetColor() {
+  if (sys::Process::ColorNeedsFlush())
+    flush();
+  const char *colorcode = sys::Process::ResetColor();
+  if (colorcode) {
+    size_t len = strlen(colorcode);
+    write(colorcode, len);
+    // don't account colors towards output characters
+    pos -= len;
+  }
+  return *this;
+}
+
+raw_ostream &raw_fd_ostream::reverseColor() {
+  if (sys::Process::ColorNeedsFlush())
+    flush();
+  const char *colorcode = sys::Process::OutputReverse();
+  if (colorcode) {
+    size_t len = strlen(colorcode);
+    write(colorcode, len);
+    // don't account colors towards output characters
+    pos -= len;
+  }
+  return *this;
+}
+
+bool raw_fd_ostream::is_displayed() const {
+  return sys::Process::FileDescriptorIsDisplayed(FD);
+}
+
+bool raw_fd_ostream::has_colors() const {
+  return sys::Process::FileDescriptorHasColors(FD);
+}
+
+//===----------------------------------------------------------------------===//
+//  outs(), errs(), nulls()
+//===----------------------------------------------------------------------===//
+
+/// outs() - This returns a reference to a raw_ostream for standard output.
+/// Use it like: outs() << "foo" << "bar";
+raw_ostream &llvm::outs() {
+  // Set buffer settings to model stdout behavior.
+  // Delete the file descriptor when the program exists, forcing error
+  // detection. If you don't want this behavior, don't use outs().
+  static raw_fd_ostream S(STDOUT_FILENO, true);
+  return S;
+}
+
+/// errs() - This returns a reference to a raw_ostream for standard error.
+/// Use it like: errs() << "foo" << "bar";
+raw_ostream &llvm::errs() {
+  // Set standard error to be unbuffered by default.
+  static raw_fd_ostream S(STDERR_FILENO, false, true);
+  return S;
+}
+
+/// nulls() - This returns a reference to a raw_ostream which discards output.
+raw_ostream &llvm::nulls() {
+  static raw_null_ostream S;
+  return S;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  raw_string_ostream
+//===----------------------------------------------------------------------===//
+
+raw_string_ostream::~raw_string_ostream() {
+  flush();
+}
+
+void raw_string_ostream::write_impl(const char *Ptr, size_t Size) {
+  OS.append(Ptr, Size);
+}
+
+//===----------------------------------------------------------------------===//
+//  raw_svector_ostream
+//===----------------------------------------------------------------------===//
+
+// The raw_svector_ostream implementation uses the SmallVector itself as the
+// buffer for the raw_ostream. We guarantee that the raw_ostream buffer is
+// always pointing past the end of the vector, but within the vector
+// capacity. This allows raw_ostream to write directly into the correct place,
+// and we only need to set the vector size when the data is flushed.
+
+raw_svector_ostream::raw_svector_ostream(SmallVectorImpl<char> &O) : OS(O) {
+  // Set up the initial external buffer. We make sure that the buffer has at
+  // least 128 bytes free; raw_ostream itself only requires 64, but we want to
+  // make sure that we don't grow the buffer unnecessarily on destruction (when
+  // the data is flushed). See the FIXME below.
+  OS.reserve(OS.size() + 128);
+  SetBuffer(OS.end(), OS.capacity() - OS.size());
+}
+
+raw_svector_ostream::~raw_svector_ostream() {
+  // FIXME: Prevent resizing during this flush().
+  flush();
+}
+
+/// resync - This is called when the SmallVector we're appending to is changed
+/// outside of the raw_svector_ostream's control.  It is only safe to do this
+/// if the raw_svector_ostream has previously been flushed.
+void raw_svector_ostream::resync() {
+  assert(GetNumBytesInBuffer() == 0 && "Didn't flush before mutating vector");
+
+  if (OS.capacity() - OS.size() < 64)
+    OS.reserve(OS.capacity() * 2);
+  SetBuffer(OS.end(), OS.capacity() - OS.size());
+}
+
+void raw_svector_ostream::write_impl(const char *Ptr, size_t Size) {
+  // If we're writing bytes from the end of the buffer into the smallvector, we
+  // don't need to copy the bytes, just commit the bytes because they are
+  // already in the right place.
+  if (Ptr == OS.end()) {
+    assert(OS.size() + Size <= OS.capacity() && "Invalid write_impl() call!");
+    OS.set_size(OS.size() + Size);
+  } else {
+    assert(GetNumBytesInBuffer() == 0 &&
+           "Should be writing from buffer if some bytes in it");
+    // Otherwise, do copy the bytes.
+    OS.append(Ptr, Ptr+Size);
+  }
+
+  // Grow the vector if necessary.
+  if (OS.capacity() - OS.size() < 64)
+    OS.reserve(OS.capacity() * 2);
+
+  // Update the buffer position.
+  SetBuffer(OS.end(), OS.capacity() - OS.size());
+}
+
+uint64_t raw_svector_ostream::current_pos() const {
+   return OS.size();
+}
+
+StringRef raw_svector_ostream::str() {
+  flush();
+  return StringRef(OS.begin(), OS.size());
+}
+
+//===----------------------------------------------------------------------===//
+//  raw_null_ostream
+//===----------------------------------------------------------------------===//
+
+raw_null_ostream::~raw_null_ostream() {
+#ifndef NDEBUG
+  // ~raw_ostream asserts that the buffer is empty. This isn't necessary
+  // with raw_null_ostream, but it's better to have raw_null_ostream follow
+  // the rules than to change the rules just for raw_null_ostream.
+  flush();
+#endif
+}
+
+void raw_null_ostream::write_impl(const char *Ptr, size_t Size) {
+}
+
+uint64_t raw_null_ostream::current_pos() const {
+  return 0;
+}
diff --git a/contrib/llvm/lib/Support/regcclass.h b/contrib/llvm/lib/Support/regcclass.h
new file mode 100644
index 000000000000..2cea3e4e5406
--- /dev/null
+++ b/contrib/llvm/lib/Support/regcclass.h
@@ -0,0 +1,70 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)cclass.h	8.3 (Berkeley) 3/20/94
+ */
+
+/* character-class table */
+static struct cclass {
+	const char *name;
+	const char *chars;
+	const char *multis;
+} cclasses[] = {
+	{ "alnum",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789",				""} ,
+	{ "alpha",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
+					""} ,
+	{ "blank",	" \t",		""} ,
+	{ "cntrl",	"\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
+\25\26\27\30\31\32\33\34\35\36\37\177",	""} ,
+	{ "digit",	"0123456789",	""} ,
+	{ "graph",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+					""} ,
+	{ "lower",	"abcdefghijklmnopqrstuvwxyz",
+					""} ,
+	{ "print",	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
+0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
+					""} ,
+	{ "punct",	"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
+					""} ,
+	{ "space",	"\t\n\v\f\r ",	""} ,
+	{ "upper",	"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
+					""} ,
+	{ "xdigit",	"0123456789ABCDEFabcdef",
+					""} ,
+	{ NULL,		0,		"" }
+};
diff --git a/contrib/llvm/lib/Support/regcname.h b/contrib/llvm/lib/Support/regcname.h
new file mode 100644
index 000000000000..3c0bb248ffa7
--- /dev/null
+++ b/contrib/llvm/lib/Support/regcname.h
@@ -0,0 +1,139 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)cname.h	8.3 (Berkeley) 3/20/94
+ */
+
+/* character-name table */
+static struct cname {
+	const char *name;
+	char code;
+} cnames[] = {
+	{ "NUL",			'\0' },
+	{ "SOH",			'\001' },
+	{ "STX",			'\002' },
+	{ "ETX",			'\003' },
+	{ "EOT",			'\004' },
+	{ "ENQ",			'\005' },
+	{ "ACK",			'\006' },
+	{ "BEL",			'\007' },
+	{ "alert",			'\007' },
+	{ "BS",				'\010' },
+	{ "backspace",			'\b' },
+	{ "HT",				'\011' },
+	{ "tab",			'\t' },
+	{ "LF",				'\012' },
+	{ "newline",			'\n' },
+	{ "VT",				'\013' },
+	{ "vertical-tab",		'\v' },
+	{ "FF",				'\014' },
+	{ "form-feed",			'\f' },
+	{ "CR",				'\015' },
+	{ "carriage-return",		'\r' },
+	{ "SO",				'\016' },
+	{ "SI",				'\017' },
+	{ "DLE",			'\020' },
+	{ "DC1",			'\021' },
+	{ "DC2",			'\022' },
+	{ "DC3",			'\023' },
+	{ "DC4",			'\024' },
+	{ "NAK",			'\025' },
+	{ "SYN",			'\026' },
+	{ "ETB",			'\027' },
+	{ "CAN",			'\030' },
+	{ "EM",				'\031' },
+	{ "SUB",			'\032' },
+	{ "ESC",			'\033' },
+	{ "IS4",			'\034' },
+	{ "FS",				'\034' },
+	{ "IS3",			'\035' },
+	{ "GS",				'\035' },
+	{ "IS2",			'\036' },
+	{ "RS",				'\036' },
+	{ "IS1",			'\037' },
+	{ "US",				'\037' },
+	{ "space",			' ' },
+	{ "exclamation-mark",		'!' },
+	{ "quotation-mark",		'"' },
+	{ "number-sign",		'#' },
+	{ "dollar-sign",		'$' },
+	{ "percent-sign",		'%' },
+	{ "ampersand",			'&' },
+	{ "apostrophe",			'\'' },
+	{ "left-parenthesis",		'(' },
+	{ "right-parenthesis",		')' },
+	{ "asterisk",			'*' },
+	{ "plus-sign",			'+' },
+	{ "comma",			',' },
+	{ "hyphen",			'-' },
+	{ "hyphen-minus",		'-' },
+	{ "period",			'.' },
+	{ "full-stop",			'.' },
+	{ "slash",			'/' },
+	{ "solidus",			'/' },
+	{ "zero",			'0' },
+	{ "one",			'1' },
+	{ "two",			'2' },
+	{ "three",			'3' },
+	{ "four",			'4' },
+	{ "five",			'5' },
+	{ "six",			'6' },
+	{ "seven",			'7' },
+	{ "eight",			'8' },
+	{ "nine",			'9' },
+	{ "colon",			':' },
+	{ "semicolon",			';' },
+	{ "less-than-sign",		'<' },
+	{ "equals-sign",		'=' },
+	{ "greater-than-sign",		'>' },
+	{ "question-mark",		'?' },
+	{ "commercial-at",		'@' },
+	{ "left-square-bracket",	'[' },
+	{ "backslash",			'\\' },
+	{ "reverse-solidus",		'\\' },
+	{ "right-square-bracket",	']' },
+	{ "circumflex",			'^' },
+	{ "circumflex-accent",		'^' },
+	{ "underscore",			'_' },
+	{ "low-line",			'_' },
+	{ "grave-accent",		'`' },
+	{ "left-brace",			'{' },
+	{ "left-curly-bracket",		'{' },
+	{ "vertical-line",		'|' },
+	{ "right-brace",		'}' },
+	{ "right-curly-bracket",	'}' },
+	{ "tilde",			'~' },
+	{ "DEL",			'\177' },
+	{ NULL,				0 }
+};
diff --git a/contrib/llvm/lib/Support/regcomp.c b/contrib/llvm/lib/Support/regcomp.c
new file mode 100644
index 000000000000..74d9186aaaa2
--- /dev/null
+++ b/contrib/llvm/lib/Support/regcomp.c
@@ -0,0 +1,1553 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdlib.h>
+#include "regex_impl.h"
+
+#include "regutils.h"
+#include "regex2.h"
+
+#include "regcclass.h"
+#include "regcname.h"
+
+/*
+ * parse structure, passed up and down to avoid global variables and
+ * other clumsinesses
+ */
+struct parse {
+	char *next;		/* next character in RE */
+	char *end;		/* end of string (-> NUL normally) */
+	int error;		/* has an error been seen? */
+	sop *strip;		/* malloced strip */
+	sopno ssize;		/* malloced strip size (allocated) */
+	sopno slen;		/* malloced strip length (used) */
+	int ncsalloc;		/* number of csets allocated */
+	struct re_guts *g;
+#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
+	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
+	sopno pend[NPAREN];	/* -> ) ([0] unused) */
+};
+
+static void p_ere(struct parse *, int);
+static void p_ere_exp(struct parse *);
+static void p_str(struct parse *);
+static void p_bre(struct parse *, int, int);
+static int p_simp_re(struct parse *, int);
+static int p_count(struct parse *);
+static void p_bracket(struct parse *);
+static void p_b_term(struct parse *, cset *);
+static void p_b_cclass(struct parse *, cset *);
+static void p_b_eclass(struct parse *, cset *);
+static char p_b_symbol(struct parse *);
+static char p_b_coll_elem(struct parse *, int);
+static char othercase(int);
+static void bothcases(struct parse *, int);
+static void ordinary(struct parse *, int);
+static void nonnewline(struct parse *);
+static void repeat(struct parse *, sopno, int, int);
+static int seterr(struct parse *, int);
+static cset *allocset(struct parse *);
+static void freeset(struct parse *, cset *);
+static int freezeset(struct parse *, cset *);
+static int firstch(struct parse *, cset *);
+static int nch(struct parse *, cset *);
+static void mcadd(struct parse *, cset *, const char *);
+static void mcinvert(struct parse *, cset *);
+static void mccase(struct parse *, cset *);
+static int isinsets(struct re_guts *, int);
+static int samesets(struct re_guts *, int, int);
+static void categorize(struct parse *, struct re_guts *);
+static sopno dupl(struct parse *, sopno, sopno);
+static void doemit(struct parse *, sop, size_t);
+static void doinsert(struct parse *, sop, size_t, sopno);
+static void dofwd(struct parse *, sopno, sop);
+static void enlarge(struct parse *, sopno);
+static void stripsnug(struct parse *, struct re_guts *);
+static void findmust(struct parse *, struct re_guts *);
+static sopno pluscount(struct parse *, struct re_guts *);
+
+static char nuls[10];		/* place to point scanner in event of error */
+
+/*
+ * macros for use with parse structure
+ * BEWARE:  these know that the parse structure is named `p' !!!
+ */
+#define	PEEK()	(*p->next)
+#define	PEEK2()	(*(p->next+1))
+#define	MORE()	(p->next < p->end)
+#define	MORE2()	(p->next+1 < p->end)
+#define	SEE(c)	(MORE() && PEEK() == (c))
+#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
+#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
+#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
+#define	NEXT()	(p->next++)
+#define	NEXT2()	(p->next += 2)
+#define	NEXTn(n)	(p->next += (n))
+#define	GETNEXT()	(*p->next++)
+#define	SETERROR(e)	seterr(p, (e))
+#define	REQUIRE(co, e)	(void)((co) || SETERROR(e))
+#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
+#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
+#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
+#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
+#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
+#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
+#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
+#define	HERE()		(p->slen)
+#define	THERE()		(p->slen - 1)
+#define	THERETHERE()	(p->slen - 2)
+#define	DROP(n)	(p->slen -= (n))
+
+#ifdef	_POSIX2_RE_DUP_MAX
+#define	DUPMAX	_POSIX2_RE_DUP_MAX
+#else
+#define	DUPMAX	255
+#endif
+#define	INFINITY	(DUPMAX + 1)
+
+#ifndef NDEBUG
+static int never = 0;		/* for use in asserts; shuts lint up */
+#else
+#define	never	0		/* some <assert.h>s have bugs too */
+#endif
+
+/*
+ - llvm_regcomp - interface for parser and compilation
+ */
+int				/* 0 success, otherwise REG_something */
+llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
+{
+	struct parse pa;
+	struct re_guts *g;
+	struct parse *p = &pa;
+	int i;
+	size_t len;
+#ifdef REDEBUG
+#	define	GOODFLAGS(f)	(f)
+#else
+#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
+#endif
+
+	cflags = GOODFLAGS(cflags);
+	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
+		return(REG_INVARG);
+
+	if (cflags&REG_PEND) {
+		if (preg->re_endp < pattern)
+			return(REG_INVARG);
+		len = preg->re_endp - pattern;
+	} else
+		len = strlen((const char *)pattern);
+
+	/* do the mallocs early so failure handling is easy */
+	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
+							(NC-1)*sizeof(cat_t));
+	if (g == NULL)
+		return(REG_ESPACE);
+	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
+	p->strip = (sop *)calloc(p->ssize, sizeof(sop));
+	p->slen = 0;
+	if (p->strip == NULL) {
+		free((char *)g);
+		return(REG_ESPACE);
+	}
+
+	/* set things up */
+	p->g = g;
+	p->next = (char *)pattern;	/* convenience; we do not modify it */
+	p->end = p->next + len;
+	p->error = 0;
+	p->ncsalloc = 0;
+	for (i = 0; i < NPAREN; i++) {
+		p->pbegin[i] = 0;
+		p->pend[i] = 0;
+	}
+	g->csetsize = NC;
+	g->sets = NULL;
+	g->setbits = NULL;
+	g->ncsets = 0;
+	g->cflags = cflags;
+	g->iflags = 0;
+	g->nbol = 0;
+	g->neol = 0;
+	g->must = NULL;
+	g->mlen = 0;
+	g->nsub = 0;
+	g->ncategories = 1;	/* category 0 is "everything else" */
+	g->categories = &g->catspace[-(CHAR_MIN)];
+	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
+	g->backrefs = 0;
+
+	/* do it */
+	EMIT(OEND, 0);
+	g->firststate = THERE();
+	if (cflags&REG_EXTENDED)
+		p_ere(p, OUT);
+	else if (cflags&REG_NOSPEC)
+		p_str(p);
+	else
+		p_bre(p, OUT, OUT);
+	EMIT(OEND, 0);
+	g->laststate = THERE();
+
+	/* tidy up loose ends and fill things in */
+	categorize(p, g);
+	stripsnug(p, g);
+	findmust(p, g);
+	g->nplus = pluscount(p, g);
+	g->magic = MAGIC2;
+	preg->re_nsub = g->nsub;
+	preg->re_g = g;
+	preg->re_magic = MAGIC1;
+#ifndef REDEBUG
+	/* not debugging, so can't rely on the assert() in llvm_regexec() */
+	if (g->iflags&REGEX_BAD)
+		SETERROR(REG_ASSERT);
+#endif
+
+	/* win or lose, we're done */
+	if (p->error != 0)	/* lose */
+		llvm_regfree(preg);
+	return(p->error);
+}
+
+/*
+ - p_ere - ERE parser top level, concatenation and alternation
+ */
+static void
+p_ere(struct parse *p, int stop)	/* character this ERE should end at */
+{
+	char c;
+	sopno prevback = 0;
+	sopno prevfwd = 0;
+	sopno conc;
+	int first = 1;		/* is this the first alternative? */
+
+	for (;;) {
+		/* do a bunch of concatenated expressions */
+		conc = HERE();
+		while (MORE() && (c = PEEK()) != '|' && c != stop)
+			p_ere_exp(p);
+		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
+
+		if (!EAT('|'))
+			break;		/* NOTE BREAK OUT */
+
+		if (first) {
+			INSERT(OCH_, conc);	/* offset is wrong */
+			prevfwd = conc;
+			prevback = conc;
+			first = 0;
+		}
+		ASTERN(OOR1, prevback);
+		prevback = THERE();
+		AHEAD(prevfwd);			/* fix previous offset */
+		prevfwd = HERE();
+		EMIT(OOR2, 0);			/* offset is very wrong */
+	}
+
+	if (!first) {		/* tail-end fixups */
+		AHEAD(prevfwd);
+		ASTERN(O_CH, prevback);
+	}
+
+	assert(!MORE() || SEE(stop));
+}
+
+/*
+ - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
+ */
+static void
+p_ere_exp(struct parse *p)
+{
+	char c;
+	sopno pos;
+	int count;
+	int count2;
+	int backrefnum;
+	sopno subno;
+	int wascaret = 0;
+
+	assert(MORE());		/* caller should have ensured this */
+	c = GETNEXT();
+
+	pos = HERE();
+	switch (c) {
+	case '(':
+		REQUIRE(MORE(), REG_EPAREN);
+		p->g->nsub++;
+		subno = p->g->nsub;
+		if (subno < NPAREN)
+			p->pbegin[subno] = HERE();
+		EMIT(OLPAREN, subno);
+		if (!SEE(')'))
+			p_ere(p, ')');
+		if (subno < NPAREN) {
+			p->pend[subno] = HERE();
+			assert(p->pend[subno] != 0);
+		}
+		EMIT(ORPAREN, subno);
+		MUSTEAT(')', REG_EPAREN);
+		break;
+#ifndef POSIX_MISTAKE
+	case ')':		/* happens only if no current unmatched ( */
+		/*
+		 * You may ask, why the ifndef?  Because I didn't notice
+		 * this until slightly too late for 1003.2, and none of the
+		 * other 1003.2 regular-expression reviewers noticed it at
+		 * all.  So an unmatched ) is legal POSIX, at least until
+		 * we can get it fixed.
+		 */
+		SETERROR(REG_EPAREN);
+		break;
+#endif
+	case '^':
+		EMIT(OBOL, 0);
+		p->g->iflags |= USEBOL;
+		p->g->nbol++;
+		wascaret = 1;
+		break;
+	case '$':
+		EMIT(OEOL, 0);
+		p->g->iflags |= USEEOL;
+		p->g->neol++;
+		break;
+	case '|':
+		SETERROR(REG_EMPTY);
+		break;
+	case '*':
+	case '+':
+	case '?':
+		SETERROR(REG_BADRPT);
+		break;
+	case '.':
+		if (p->g->cflags&REG_NEWLINE)
+			nonnewline(p);
+		else
+			EMIT(OANY, 0);
+		break;
+	case '[':
+		p_bracket(p);
+		break;
+	case '\\':
+		REQUIRE(MORE(), REG_EESCAPE);
+		c = GETNEXT();
+		if (c >= '1' && c <= '9') {
+			/* \[0-9] is taken to be a back-reference to a previously specified
+			 * matching group. backrefnum will hold the number. The matching
+			 * group must exist (i.e. if \4 is found there must have been at
+			 * least 4 matching groups specified in the pattern previously).
+			 */
+			backrefnum = c - '0';
+			if (p->pend[backrefnum] == 0) {
+				SETERROR(REG_ESUBREG);
+				break;
+			}
+
+			/* Make sure everything checks out and emit the sequence
+			 * that marks a back-reference to the parse structure.
+			 */
+			assert(backrefnum <= p->g->nsub);
+			EMIT(OBACK_, backrefnum);
+			assert(p->pbegin[backrefnum] != 0);
+			assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
+			assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
+			(void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
+			EMIT(O_BACK, backrefnum);
+			p->g->backrefs = 1;
+		} else {
+			/* Other chars are simply themselves when escaped with a backslash.
+			 */
+			ordinary(p, c);
+		}
+		break;
+	case '{':		/* okay as ordinary except if digit follows */
+		REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
+		/* FALLTHROUGH */
+	default:
+		ordinary(p, c);
+		break;
+	}
+
+	if (!MORE())
+		return;
+	c = PEEK();
+	/* we call { a repetition if followed by a digit */
+	if (!( c == '*' || c == '+' || c == '?' ||
+				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
+		return;		/* no repetition, we're done */
+	NEXT();
+
+	REQUIRE(!wascaret, REG_BADRPT);
+	switch (c) {
+	case '*':	/* implemented as +? */
+		/* this case does not require the (y|) trick, noKLUDGE */
+		INSERT(OPLUS_, pos);
+		ASTERN(O_PLUS, pos);
+		INSERT(OQUEST_, pos);
+		ASTERN(O_QUEST, pos);
+		break;
+	case '+':
+		INSERT(OPLUS_, pos);
+		ASTERN(O_PLUS, pos);
+		break;
+	case '?':
+		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
+		INSERT(OCH_, pos);		/* offset slightly wrong */
+		ASTERN(OOR1, pos);		/* this one's right */
+		AHEAD(pos);			/* fix the OCH_ */
+		EMIT(OOR2, 0);			/* offset very wrong... */
+		AHEAD(THERE());			/* ...so fix it */
+		ASTERN(O_CH, THERETHERE());
+		break;
+	case '{':
+		count = p_count(p);
+		if (EAT(',')) {
+			if (isdigit((uch)PEEK())) {
+				count2 = p_count(p);
+				REQUIRE(count <= count2, REG_BADBR);
+			} else		/* single number with comma */
+				count2 = INFINITY;
+		} else		/* just a single number */
+			count2 = count;
+		repeat(p, pos, count, count2);
+		if (!EAT('}')) {	/* error heuristics */
+			while (MORE() && PEEK() != '}')
+				NEXT();
+			REQUIRE(MORE(), REG_EBRACE);
+			SETERROR(REG_BADBR);
+		}
+		break;
+	}
+
+	if (!MORE())
+		return;
+	c = PEEK();
+	if (!( c == '*' || c == '+' || c == '?' ||
+				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
+		return;
+	SETERROR(REG_BADRPT);
+}
+
+/*
+ - p_str - string (no metacharacters) "parser"
+ */
+static void
+p_str(struct parse *p)
+{
+	REQUIRE(MORE(), REG_EMPTY);
+	while (MORE())
+		ordinary(p, GETNEXT());
+}
+
+/*
+ - p_bre - BRE parser top level, anchoring and concatenation
+ * Giving end1 as OUT essentially eliminates the end1/end2 check.
+ *
+ * This implementation is a bit of a kludge, in that a trailing $ is first
+ * taken as an ordinary character and then revised to be an anchor.  The
+ * only undesirable side effect is that '$' gets included as a character
+ * category in such cases.  This is fairly harmless; not worth fixing.
+ * The amount of lookahead needed to avoid this kludge is excessive.
+ */
+static void
+p_bre(struct parse *p,
+    int end1,		/* first terminating character */
+    int end2)		/* second terminating character */
+{
+	sopno start = HERE();
+	int first = 1;			/* first subexpression? */
+	int wasdollar = 0;
+
+	if (EAT('^')) {
+		EMIT(OBOL, 0);
+		p->g->iflags |= USEBOL;
+		p->g->nbol++;
+	}
+	while (MORE() && !SEETWO(end1, end2)) {
+		wasdollar = p_simp_re(p, first);
+		first = 0;
+	}
+	if (wasdollar) {	/* oops, that was a trailing anchor */
+		DROP(1);
+		EMIT(OEOL, 0);
+		p->g->iflags |= USEEOL;
+		p->g->neol++;
+	}
+
+	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
+}
+
+/*
+ - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
+ */
+static int			/* was the simple RE an unbackslashed $? */
+p_simp_re(struct parse *p,
+    int starordinary)		/* is a leading * an ordinary character? */
+{
+	int c;
+	int count;
+	int count2;
+	sopno pos;
+	int i;
+	sopno subno;
+#	define	BACKSL	(1<<CHAR_BIT)
+
+	pos = HERE();		/* repetion op, if any, covers from here */
+
+	assert(MORE());		/* caller should have ensured this */
+	c = GETNEXT();
+	if (c == '\\') {
+		REQUIRE(MORE(), REG_EESCAPE);
+		c = BACKSL | GETNEXT();
+	}
+	switch (c) {
+	case '.':
+		if (p->g->cflags&REG_NEWLINE)
+			nonnewline(p);
+		else
+			EMIT(OANY, 0);
+		break;
+	case '[':
+		p_bracket(p);
+		break;
+	case BACKSL|'{':
+		SETERROR(REG_BADRPT);
+		break;
+	case BACKSL|'(':
+		p->g->nsub++;
+		subno = p->g->nsub;
+		if (subno < NPAREN)
+			p->pbegin[subno] = HERE();
+		EMIT(OLPAREN, subno);
+		/* the MORE here is an error heuristic */
+		if (MORE() && !SEETWO('\\', ')'))
+			p_bre(p, '\\', ')');
+		if (subno < NPAREN) {
+			p->pend[subno] = HERE();
+			assert(p->pend[subno] != 0);
+		}
+		EMIT(ORPAREN, subno);
+		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
+		break;
+	case BACKSL|')':	/* should not get here -- must be user */
+	case BACKSL|'}':
+		SETERROR(REG_EPAREN);
+		break;
+	case BACKSL|'1':
+	case BACKSL|'2':
+	case BACKSL|'3':
+	case BACKSL|'4':
+	case BACKSL|'5':
+	case BACKSL|'6':
+	case BACKSL|'7':
+	case BACKSL|'8':
+	case BACKSL|'9':
+		i = (c&~BACKSL) - '0';
+		assert(i < NPAREN);
+		if (p->pend[i] != 0) {
+			assert(i <= p->g->nsub);
+			EMIT(OBACK_, i);
+			assert(p->pbegin[i] != 0);
+			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
+			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
+			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
+			EMIT(O_BACK, i);
+		} else
+			SETERROR(REG_ESUBREG);
+		p->g->backrefs = 1;
+		break;
+	case '*':
+		REQUIRE(starordinary, REG_BADRPT);
+		/* FALLTHROUGH */
+	default:
+		ordinary(p, (char)c);
+		break;
+	}
+
+	if (EAT('*')) {		/* implemented as +? */
+		/* this case does not require the (y|) trick, noKLUDGE */
+		INSERT(OPLUS_, pos);
+		ASTERN(O_PLUS, pos);
+		INSERT(OQUEST_, pos);
+		ASTERN(O_QUEST, pos);
+	} else if (EATTWO('\\', '{')) {
+		count = p_count(p);
+		if (EAT(',')) {
+			if (MORE() && isdigit((uch)PEEK())) {
+				count2 = p_count(p);
+				REQUIRE(count <= count2, REG_BADBR);
+			} else		/* single number with comma */
+				count2 = INFINITY;
+		} else		/* just a single number */
+			count2 = count;
+		repeat(p, pos, count, count2);
+		if (!EATTWO('\\', '}')) {	/* error heuristics */
+			while (MORE() && !SEETWO('\\', '}'))
+				NEXT();
+			REQUIRE(MORE(), REG_EBRACE);
+			SETERROR(REG_BADBR);
+		}
+	} else if (c == '$')	/* $ (but not \$) ends it */
+		return(1);
+
+	return(0);
+}
+
+/*
+ - p_count - parse a repetition count
+ */
+static int			/* the value */
+p_count(struct parse *p)
+{
+	int count = 0;
+	int ndigits = 0;
+
+	while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
+		count = count*10 + (GETNEXT() - '0');
+		ndigits++;
+	}
+
+	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
+	return(count);
+}
+
+/*
+ - p_bracket - parse a bracketed character list
+ *
+ * Note a significant property of this code:  if the allocset() did SETERROR,
+ * no set operations are done.
+ */
+static void
+p_bracket(struct parse *p)
+{
+	cset *cs;
+	int invert = 0;
+
+	/* Dept of Truly Sickening Special-Case Kludges */
+	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
+		EMIT(OBOW, 0);
+		NEXTn(6);
+		return;
+	}
+	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
+		EMIT(OEOW, 0);
+		NEXTn(6);
+		return;
+	}
+
+	if ((cs = allocset(p)) == NULL) {
+		/* allocset did set error status in p */
+		return;
+	}
+
+	if (EAT('^'))
+		invert++;	/* make note to invert set at end */
+	if (EAT(']'))
+		CHadd(cs, ']');
+	else if (EAT('-'))
+		CHadd(cs, '-');
+	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
+		p_b_term(p, cs);
+	if (EAT('-'))
+		CHadd(cs, '-');
+	MUSTEAT(']', REG_EBRACK);
+
+	if (p->error != 0) {	/* don't mess things up further */
+		freeset(p, cs);
+		return;
+	}
+
+	if (p->g->cflags&REG_ICASE) {
+		int i;
+		int ci;
+
+		for (i = p->g->csetsize - 1; i >= 0; i--)
+			if (CHIN(cs, i) && isalpha(i)) {
+				ci = othercase(i);
+				if (ci != i)
+					CHadd(cs, ci);
+			}
+		if (cs->multis != NULL)
+			mccase(p, cs);
+	}
+	if (invert) {
+		int i;
+
+		for (i = p->g->csetsize - 1; i >= 0; i--)
+			if (CHIN(cs, i))
+				CHsub(cs, i);
+			else
+				CHadd(cs, i);
+		if (p->g->cflags&REG_NEWLINE)
+			CHsub(cs, '\n');
+		if (cs->multis != NULL)
+			mcinvert(p, cs);
+	}
+
+	assert(cs->multis == NULL);		/* xxx */
+
+	if (nch(p, cs) == 1) {		/* optimize singleton sets */
+		ordinary(p, firstch(p, cs));
+		freeset(p, cs);
+	} else
+		EMIT(OANYOF, freezeset(p, cs));
+}
+
+/*
+ - p_b_term - parse one term of a bracketed character list
+ */
+static void
+p_b_term(struct parse *p, cset *cs)
+{
+	char c;
+	char start, finish;
+	int i;
+
+	/* classify what we've got */
+	switch ((MORE()) ? PEEK() : '\0') {
+	case '[':
+		c = (MORE2()) ? PEEK2() : '\0';
+		break;
+	case '-':
+		SETERROR(REG_ERANGE);
+		return;			/* NOTE RETURN */
+		break;
+	default:
+		c = '\0';
+		break;
+	}
+
+	switch (c) {
+	case ':':		/* character class */
+		NEXT2();
+		REQUIRE(MORE(), REG_EBRACK);
+		c = PEEK();
+		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
+		p_b_cclass(p, cs);
+		REQUIRE(MORE(), REG_EBRACK);
+		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
+		break;
+	case '=':		/* equivalence class */
+		NEXT2();
+		REQUIRE(MORE(), REG_EBRACK);
+		c = PEEK();
+		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
+		p_b_eclass(p, cs);
+		REQUIRE(MORE(), REG_EBRACK);
+		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
+		break;
+	default:		/* symbol, ordinary character, or range */
+/* xxx revision needed for multichar stuff */
+		start = p_b_symbol(p);
+		if (SEE('-') && MORE2() && PEEK2() != ']') {
+			/* range */
+			NEXT();
+			if (EAT('-'))
+				finish = '-';
+			else
+				finish = p_b_symbol(p);
+		} else
+			finish = start;
+/* xxx what about signed chars here... */
+		REQUIRE(start <= finish, REG_ERANGE);
+		for (i = start; i <= finish; i++)
+			CHadd(cs, i);
+		break;
+	}
+}
+
+/*
+ - p_b_cclass - parse a character-class name and deal with it
+ */
+static void
+p_b_cclass(struct parse *p, cset *cs)
+{
+	char *sp = p->next;
+	struct cclass *cp;
+	size_t len;
+	const char *u;
+	char c;
+
+	while (MORE() && isalpha((uch)PEEK()))
+		NEXT();
+	len = p->next - sp;
+	for (cp = cclasses; cp->name != NULL; cp++)
+		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+			break;
+	if (cp->name == NULL) {
+		/* oops, didn't find it */
+		SETERROR(REG_ECTYPE);
+		return;
+	}
+
+	u = cp->chars;
+	while ((c = *u++) != '\0')
+		CHadd(cs, c);
+	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
+		MCadd(p, cs, u);
+}
+
+/*
+ - p_b_eclass - parse an equivalence-class name and deal with it
+ *
+ * This implementation is incomplete. xxx
+ */
+static void
+p_b_eclass(struct parse *p, cset *cs)
+{
+	char c;
+
+	c = p_b_coll_elem(p, '=');
+	CHadd(cs, c);
+}
+
+/*
+ - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
+ */
+static char			/* value of symbol */
+p_b_symbol(struct parse *p)
+{
+	char value;
+
+	REQUIRE(MORE(), REG_EBRACK);
+	if (!EATTWO('[', '.'))
+		return(GETNEXT());
+
+	/* collating symbol */
+	value = p_b_coll_elem(p, '.');
+	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
+	return(value);
+}
+
+/*
+ - p_b_coll_elem - parse a collating-element name and look it up
+ */
+static char			/* value of collating element */
+p_b_coll_elem(struct parse *p,
+    int endc)			/* name ended by endc,']' */
+{
+	char *sp = p->next;
+	struct cname *cp;
+	int len;
+
+	while (MORE() && !SEETWO(endc, ']'))
+		NEXT();
+	if (!MORE()) {
+		SETERROR(REG_EBRACK);
+		return(0);
+	}
+	len = p->next - sp;
+	for (cp = cnames; cp->name != NULL; cp++)
+		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+			return(cp->code);	/* known name */
+	if (len == 1)
+		return(*sp);	/* single character */
+	SETERROR(REG_ECOLLATE);			/* neither */
+	return(0);
+}
+
+/*
+ - othercase - return the case counterpart of an alphabetic
+ */
+static char			/* if no counterpart, return ch */
+othercase(int ch)
+{
+	ch = (uch)ch;
+	assert(isalpha(ch));
+	if (isupper(ch))
+		return ((uch)tolower(ch));
+	else if (islower(ch))
+		return ((uch)toupper(ch));
+	else			/* peculiar, but could happen */
+		return(ch);
+}
+
+/*
+ - bothcases - emit a dualcase version of a two-case character
+ *
+ * Boy, is this implementation ever a kludge...
+ */
+static void
+bothcases(struct parse *p, int ch)
+{
+	char *oldnext = p->next;
+	char *oldend = p->end;
+	char bracket[3];
+
+	ch = (uch)ch;
+	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
+	p->next = bracket;
+	p->end = bracket+2;
+	bracket[0] = ch;
+	bracket[1] = ']';
+	bracket[2] = '\0';
+	p_bracket(p);
+	assert(p->next == bracket+2);
+	p->next = oldnext;
+	p->end = oldend;
+}
+
+/*
+ - ordinary - emit an ordinary character
+ */
+static void
+ordinary(struct parse *p, int ch)
+{
+	cat_t *cap = p->g->categories;
+
+	if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
+		bothcases(p, ch);
+	else {
+		EMIT(OCHAR, (uch)ch);
+		if (cap[ch] == 0)
+			cap[ch] = p->g->ncategories++;
+	}
+}
+
+/*
+ - nonnewline - emit REG_NEWLINE version of OANY
+ *
+ * Boy, is this implementation ever a kludge...
+ */
+static void
+nonnewline(struct parse *p)
+{
+	char *oldnext = p->next;
+	char *oldend = p->end;
+	char bracket[4];
+
+	p->next = bracket;
+	p->end = bracket+3;
+	bracket[0] = '^';
+	bracket[1] = '\n';
+	bracket[2] = ']';
+	bracket[3] = '\0';
+	p_bracket(p);
+	assert(p->next == bracket+3);
+	p->next = oldnext;
+	p->end = oldend;
+}
+
+/*
+ - repeat - generate code for a bounded repetition, recursively if needed
+ */
+static void
+repeat(struct parse *p,
+    sopno start,		/* operand from here to end of strip */
+    int from,			/* repeated from this number */
+    int to)			/* to this number of times (maybe INFINITY) */
+{
+	sopno finish = HERE();
+#	define	N	2
+#	define	INF	3
+#	define	REP(f, t)	((f)*8 + (t))
+#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
+	sopno copy;
+
+	if (p->error != 0)	/* head off possible runaway recursion */
+		return;
+
+	assert(from <= to);
+
+	switch (REP(MAP(from), MAP(to))) {
+	case REP(0, 0):			/* must be user doing this */
+		DROP(finish-start);	/* drop the operand */
+		break;
+	case REP(0, 1):			/* as x{1,1}? */
+	case REP(0, N):			/* as x{1,n}? */
+	case REP(0, INF):		/* as x{1,}? */
+		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
+		INSERT(OCH_, start);		/* offset is wrong... */
+		repeat(p, start+1, 1, to);
+		ASTERN(OOR1, start);
+		AHEAD(start);			/* ... fix it */
+		EMIT(OOR2, 0);
+		AHEAD(THERE());
+		ASTERN(O_CH, THERETHERE());
+		break;
+	case REP(1, 1):			/* trivial case */
+		/* done */
+		break;
+	case REP(1, N):			/* as x?x{1,n-1} */
+		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
+		INSERT(OCH_, start);
+		ASTERN(OOR1, start);
+		AHEAD(start);
+		EMIT(OOR2, 0);			/* offset very wrong... */
+		AHEAD(THERE());			/* ...so fix it */
+		ASTERN(O_CH, THERETHERE());
+		copy = dupl(p, start+1, finish+1);
+		assert(copy == finish+4);
+		repeat(p, copy, 1, to-1);
+		break;
+	case REP(1, INF):		/* as x+ */
+		INSERT(OPLUS_, start);
+		ASTERN(O_PLUS, start);
+		break;
+	case REP(N, N):			/* as xx{m-1,n-1} */
+		copy = dupl(p, start, finish);
+		repeat(p, copy, from-1, to-1);
+		break;
+	case REP(N, INF):		/* as xx{n-1,INF} */
+		copy = dupl(p, start, finish);
+		repeat(p, copy, from-1, to);
+		break;
+	default:			/* "can't happen" */
+		SETERROR(REG_ASSERT);	/* just in case */
+		break;
+	}
+}
+
+/*
+ - seterr - set an error condition
+ */
+static int			/* useless but makes type checking happy */
+seterr(struct parse *p, int e)
+{
+	if (p->error == 0)	/* keep earliest error condition */
+		p->error = e;
+	p->next = nuls;		/* try to bring things to a halt */
+	p->end = nuls;
+	return(0);		/* make the return value well-defined */
+}
+
+/*
+ - allocset - allocate a set of characters for []
+ */
+static cset *
+allocset(struct parse *p)
+{
+	int no = p->g->ncsets++;
+	size_t nc;
+	size_t nbytes;
+	cset *cs;
+	size_t css = (size_t)p->g->csetsize;
+	int i;
+
+	if (no >= p->ncsalloc) {	/* need another column of space */
+		void *ptr;
+
+		p->ncsalloc += CHAR_BIT;
+		nc = p->ncsalloc;
+		assert(nc % CHAR_BIT == 0);
+		nbytes = nc / CHAR_BIT * css;
+
+		ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
+		if (ptr == NULL)
+			goto nomem;
+		p->g->sets = ptr;
+
+		ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
+		if (ptr == NULL)
+			goto nomem;
+		p->g->setbits = ptr;
+
+		for (i = 0; i < no; i++)
+			p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
+
+		(void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
+	}
+	/* XXX should not happen */
+	if (p->g->sets == NULL || p->g->setbits == NULL)
+		goto nomem;
+
+	cs = &p->g->sets[no];
+	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
+	cs->mask = 1 << ((no) % CHAR_BIT);
+	cs->hash = 0;
+	cs->smultis = 0;
+	cs->multis = NULL;
+
+	return(cs);
+nomem:
+	free(p->g->sets);
+	p->g->sets = NULL;
+	free(p->g->setbits);
+	p->g->setbits = NULL;
+
+	SETERROR(REG_ESPACE);
+	/* caller's responsibility not to do set ops */
+	return(NULL);
+}
+
+/*
+ - freeset - free a now-unused set
+ */
+static void
+freeset(struct parse *p, cset *cs)
+{
+	size_t i;
+	cset *top = &p->g->sets[p->g->ncsets];
+	size_t css = (size_t)p->g->csetsize;
+
+	for (i = 0; i < css; i++)
+		CHsub(cs, i);
+	if (cs == top-1)	/* recover only the easy case */
+		p->g->ncsets--;
+}
+
+/*
+ - freezeset - final processing on a set of characters
+ *
+ * The main task here is merging identical sets.  This is usually a waste
+ * of time (although the hash code minimizes the overhead), but can win
+ * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
+ * is done using addition rather than xor -- all ASCII [aA] sets xor to
+ * the same value!
+ */
+static int			/* set number */
+freezeset(struct parse *p, cset *cs)
+{
+	uch h = cs->hash;
+	size_t i;
+	cset *top = &p->g->sets[p->g->ncsets];
+	cset *cs2;
+	size_t css = (size_t)p->g->csetsize;
+
+	/* look for an earlier one which is the same */
+	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
+		if (cs2->hash == h && cs2 != cs) {
+			/* maybe */
+			for (i = 0; i < css; i++)
+				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
+					break;		/* no */
+			if (i == css)
+				break;			/* yes */
+		}
+
+	if (cs2 < top) {	/* found one */
+		freeset(p, cs);
+		cs = cs2;
+	}
+
+	return((int)(cs - p->g->sets));
+}
+
+/*
+ - firstch - return first character in a set (which must have at least one)
+ */
+static int			/* character; there is no "none" value */
+firstch(struct parse *p, cset *cs)
+{
+	size_t i;
+	size_t css = (size_t)p->g->csetsize;
+
+	for (i = 0; i < css; i++)
+		if (CHIN(cs, i))
+			return((char)i);
+	assert(never);
+	return(0);		/* arbitrary */
+}
+
+/*
+ - nch - number of characters in a set
+ */
+static int
+nch(struct parse *p, cset *cs)
+{
+	size_t i;
+	size_t css = (size_t)p->g->csetsize;
+	int n = 0;
+
+	for (i = 0; i < css; i++)
+		if (CHIN(cs, i))
+			n++;
+	return(n);
+}
+
+/*
+ - mcadd - add a collating element to a cset
+ */
+static void
+mcadd( struct parse *p, cset *cs, const char *cp)
+{
+	size_t oldend = cs->smultis;
+	void *np;
+
+	cs->smultis += strlen(cp) + 1;
+	np = realloc(cs->multis, cs->smultis);
+	if (np == NULL) {
+		if (cs->multis)
+			free(cs->multis);
+		cs->multis = NULL;
+		SETERROR(REG_ESPACE);
+		return;
+	}
+	cs->multis = np;
+
+	llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
+}
+
+/*
+ - mcinvert - invert the list of collating elements in a cset
+ *
+ * This would have to know the set of possibilities.  Implementation
+ * is deferred.
+ */
+/* ARGSUSED */
+static void
+mcinvert(struct parse *p, cset *cs)
+{
+	assert(cs->multis == NULL);	/* xxx */
+}
+
+/*
+ - mccase - add case counterparts of the list of collating elements in a cset
+ *
+ * This would have to know the set of possibilities.  Implementation
+ * is deferred.
+ */
+/* ARGSUSED */
+static void
+mccase(struct parse *p, cset *cs)
+{
+	assert(cs->multis == NULL);	/* xxx */
+}
+
+/*
+ - isinsets - is this character in any sets?
+ */
+static int			/* predicate */
+isinsets(struct re_guts *g, int c)
+{
+	uch *col;
+	int i;
+	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
+	unsigned uc = (uch)c;
+
+	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
+		if (col[uc] != 0)
+			return(1);
+	return(0);
+}
+
+/*
+ - samesets - are these two characters in exactly the same sets?
+ */
+static int			/* predicate */
+samesets(struct re_guts *g, int c1, int c2)
+{
+	uch *col;
+	int i;
+	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
+	unsigned uc1 = (uch)c1;
+	unsigned uc2 = (uch)c2;
+
+	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
+		if (col[uc1] != col[uc2])
+			return(0);
+	return(1);
+}
+
+/*
+ - categorize - sort out character categories
+ */
+static void
+categorize(struct parse *p, struct re_guts *g)
+{
+	cat_t *cats = g->categories;
+	int c;
+	int c2;
+	cat_t cat;
+
+	/* avoid making error situations worse */
+	if (p->error != 0)
+		return;
+
+	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
+		if (cats[c] == 0 && isinsets(g, c)) {
+			cat = g->ncategories++;
+			cats[c] = cat;
+			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
+				if (cats[c2] == 0 && samesets(g, c, c2))
+					cats[c2] = cat;
+		}
+}
+
+/*
+ - dupl - emit a duplicate of a bunch of sops
+ */
+static sopno			/* start of duplicate */
+dupl(struct parse *p,
+    sopno start,		/* from here */
+    sopno finish)		/* to this less one */
+{
+	sopno ret = HERE();
+	sopno len = finish - start;
+
+	assert(finish >= start);
+	if (len == 0)
+		return(ret);
+	enlarge(p, p->ssize + len);	/* this many unexpected additions */
+	assert(p->ssize >= p->slen + len);
+	(void) memmove((char *)(p->strip + p->slen),
+		(char *)(p->strip + start), (size_t)len*sizeof(sop));
+	p->slen += len;
+	return(ret);
+}
+
+/*
+ - doemit - emit a strip operator
+ *
+ * It might seem better to implement this as a macro with a function as
+ * hard-case backup, but it's just too big and messy unless there are
+ * some changes to the data structures.  Maybe later.
+ */
+static void
+doemit(struct parse *p, sop op, size_t opnd)
+{
+	/* avoid making error situations worse */
+	if (p->error != 0)
+		return;
+
+	/* deal with oversize operands ("can't happen", more or less) */
+	assert(opnd < 1<<OPSHIFT);
+
+	/* deal with undersized strip */
+	if (p->slen >= p->ssize)
+		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
+	assert(p->slen < p->ssize);
+
+	/* finally, it's all reduced to the easy case */
+	p->strip[p->slen++] = SOP(op, opnd);
+}
+
+/*
+ - doinsert - insert a sop into the strip
+ */
+static void
+doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
+{
+	sopno sn;
+	sop s;
+	int i;
+
+	/* avoid making error situations worse */
+	if (p->error != 0)
+		return;
+
+	sn = HERE();
+	EMIT(op, opnd);		/* do checks, ensure space */
+	assert(HERE() == sn+1);
+	s = p->strip[sn];
+
+	/* adjust paren pointers */
+	assert(pos > 0);
+	for (i = 1; i < NPAREN; i++) {
+		if (p->pbegin[i] >= pos) {
+			p->pbegin[i]++;
+		}
+		if (p->pend[i] >= pos) {
+			p->pend[i]++;
+		}
+	}
+
+	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
+						(HERE()-pos-1)*sizeof(sop));
+	p->strip[pos] = s;
+}
+
+/*
+ - dofwd - complete a forward reference
+ */
+static void
+dofwd(struct parse *p, sopno pos, sop value)
+{
+	/* avoid making error situations worse */
+	if (p->error != 0)
+		return;
+
+	assert(value < 1<<OPSHIFT);
+	p->strip[pos] = OP(p->strip[pos]) | value;
+}
+
+/*
+ - enlarge - enlarge the strip
+ */
+static void
+enlarge(struct parse *p, sopno size)
+{
+	sop *sp;
+
+	if (p->ssize >= size)
+		return;
+
+	sp = (sop *)realloc(p->strip, size*sizeof(sop));
+	if (sp == NULL) {
+		SETERROR(REG_ESPACE);
+		return;
+	}
+	p->strip = sp;
+	p->ssize = size;
+}
+
+/*
+ - stripsnug - compact the strip
+ */
+static void
+stripsnug(struct parse *p, struct re_guts *g)
+{
+	g->nstates = p->slen;
+	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
+	if (g->strip == NULL) {
+		SETERROR(REG_ESPACE);
+		g->strip = p->strip;
+	}
+}
+
+/*
+ - findmust - fill in must and mlen with longest mandatory literal string
+ *
+ * This algorithm could do fancy things like analyzing the operands of |
+ * for common subsequences.  Someday.  This code is simple and finds most
+ * of the interesting cases.
+ *
+ * Note that must and mlen got initialized during setup.
+ */
+static void
+findmust(struct parse *p, struct re_guts *g)
+{
+	sop *scan;
+	sop *start = 0; /* start initialized in the default case, after that */
+	sop *newstart = 0; /* newstart was initialized in the OCHAR case */
+	sopno newlen;
+	sop s;
+	char *cp;
+	sopno i;
+
+	/* avoid making error situations worse */
+	if (p->error != 0)
+		return;
+
+	/* find the longest OCHAR sequence in strip */
+	newlen = 0;
+	scan = g->strip + 1;
+	do {
+		s = *scan++;
+		switch (OP(s)) {
+		case OCHAR:		/* sequence member */
+			if (newlen == 0)		/* new sequence */
+				newstart = scan - 1;
+			newlen++;
+			break;
+		case OPLUS_:		/* things that don't break one */
+		case OLPAREN:
+		case ORPAREN:
+			break;
+		case OQUEST_:		/* things that must be skipped */
+		case OCH_:
+			scan--;
+			do {
+				scan += OPND(s);
+				s = *scan;
+				/* assert() interferes w debug printouts */
+				if (OP(s) != O_QUEST && OP(s) != O_CH &&
+							OP(s) != OOR2) {
+					g->iflags |= REGEX_BAD;
+					return;
+				}
+			} while (OP(s) != O_QUEST && OP(s) != O_CH);
+			/* fallthrough */
+		default:		/* things that break a sequence */
+			if (newlen > g->mlen) {		/* ends one */
+				start = newstart;
+				g->mlen = newlen;
+			}
+			newlen = 0;
+			break;
+		}
+	} while (OP(s) != OEND);
+
+	if (g->mlen == 0)		/* there isn't one */
+		return;
+
+	/* turn it into a character string */
+	g->must = malloc((size_t)g->mlen + 1);
+	if (g->must == NULL) {		/* argh; just forget it */
+		g->mlen = 0;
+		return;
+	}
+	cp = g->must;
+	scan = start;
+	for (i = g->mlen; i > 0; i--) {
+		while (OP(s = *scan++) != OCHAR)
+			continue;
+		assert(cp < g->must + g->mlen);
+		*cp++ = (char)OPND(s);
+	}
+	assert(cp == g->must + g->mlen);
+	*cp++ = '\0';		/* just on general principles */
+}
+
+/*
+ - pluscount - count + nesting
+ */
+static sopno			/* nesting depth */
+pluscount(struct parse *p, struct re_guts *g)
+{
+	sop *scan;
+	sop s;
+	sopno plusnest = 0;
+	sopno maxnest = 0;
+
+	if (p->error != 0)
+		return(0);	/* there may not be an OEND */
+
+	scan = g->strip + 1;
+	do {
+		s = *scan++;
+		switch (OP(s)) {
+		case OPLUS_:
+			plusnest++;
+			break;
+		case O_PLUS:
+			if (plusnest > maxnest)
+				maxnest = plusnest;
+			plusnest--;
+			break;
+		}
+	} while (OP(s) != OEND);
+	if (plusnest != 0)
+		g->iflags |= REGEX_BAD;
+	return(maxnest);
+}
diff --git a/contrib/llvm/lib/Support/regengine.inc b/contrib/llvm/lib/Support/regengine.inc
new file mode 100644
index 000000000000..7e41f96f359d
--- /dev/null
+++ b/contrib/llvm/lib/Support/regengine.inc
@@ -0,0 +1,1034 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)engine.c	8.5 (Berkeley) 3/20/94
+ */
+
+/*
+ * The matching engine and friends.  This file is #included by regexec.c
+ * after suitable #defines of a variety of macros used herein, so that
+ * different state representations can be used without duplicating masses
+ * of code.
+ */
+
+#ifdef SNAMES
+#define	matcher	smatcher
+#define	fast	sfast
+#define	slow	sslow
+#define	dissect	sdissect
+#define	backref	sbackref
+#define	step	sstep
+#define	print	sprint
+#define	at	sat
+#define	match	smat
+#define	nope	snope
+#endif
+#ifdef LNAMES
+#define	matcher	lmatcher
+#define	fast	lfast
+#define	slow	lslow
+#define	dissect	ldissect
+#define	backref	lbackref
+#define	step	lstep
+#define	print	lprint
+#define	at	lat
+#define	match	lmat
+#define	nope	lnope
+#endif
+
+/* another structure passed up and down to avoid zillions of parameters */
+struct match {
+	struct re_guts *g;
+	int eflags;
+	llvm_regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
+	const char *offp;		/* offsets work from here */
+	const char *beginp;		/* start of string -- virtual NUL precedes */
+	const char *endp;		/* end of string -- virtual NUL here */
+	const char *coldp;		/* can be no match starting before here */
+	const char **lastpos;		/* [nplus+1] */
+	STATEVARS;
+	states st;		/* current states */
+	states fresh;		/* states for a fresh start */
+	states tmp;		/* temporary */
+	states empty;		/* empty set of states */
+};
+
+static int matcher(struct re_guts *, const char *, size_t,
+                   llvm_regmatch_t[], int);
+static const char *dissect(struct match *, const char *, const char *, sopno,
+                           sopno);
+static const char *backref(struct match *, const char *, const char *, sopno,
+                           sopno, sopno, int);
+static const char *fast(struct match *, const char *, const char *, sopno, sopno);
+static const char *slow(struct match *, const char *, const char *, sopno, sopno);
+static states step(struct re_guts *, sopno, sopno, states, int, states);
+#define MAX_RECURSION	100
+#define	BOL	(OUT+1)
+#define	EOL	(BOL+1)
+#define	BOLEOL	(BOL+2)
+#define	NOTHING	(BOL+3)
+#define	BOW	(BOL+4)
+#define	EOW	(BOL+5)
+#define	CODEMAX	(BOL+5)		/* highest code used */
+#define	NONCHAR(c)	((c) > CHAR_MAX)
+#define	NNONCHAR	(CODEMAX-CHAR_MAX)
+#ifdef REDEBUG
+static void print(struct match *, char *, states, int, FILE *);
+#endif
+#ifdef REDEBUG
+static void at(struct match *, char *, char *, char *, sopno, sopno);
+#endif
+#ifdef REDEBUG
+static char *pchar(int);
+#endif
+
+#ifdef REDEBUG
+#define	SP(t, s, c)	print(m, t, s, c, stdout)
+#define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
+#define	NOTE(str)	{ if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
+static int nope = 0;
+#else
+#define	SP(t, s, c)	/* nothing */
+#define	AT(t, p1, p2, s1, s2)	/* nothing */
+#define	NOTE(s)	/* nothing */
+#endif
+
+/*
+ - matcher - the actual matching engine
+ */
+static int			/* 0 success, REG_NOMATCH failure */
+matcher(struct re_guts *g, const char *string, size_t nmatch,
+        llvm_regmatch_t pmatch[],
+    int eflags)
+{
+	const char *endp;
+	size_t i;
+	struct match mv;
+	struct match *m = &mv;
+	const char *dp;
+	const sopno gf = g->firststate+1;	/* +1 for OEND */
+	const sopno gl = g->laststate;
+	const char *start;
+	const char *stop;
+
+	/* simplify the situation where possible */
+	if (g->cflags&REG_NOSUB)
+		nmatch = 0;
+	if (eflags&REG_STARTEND) {
+		start = string + pmatch[0].rm_so;
+		stop = string + pmatch[0].rm_eo;
+	} else {
+		start = string;
+		stop = start + strlen(start);
+	}
+	if (stop < start)
+		return(REG_INVARG);
+
+	/* prescreening; this does wonders for this rather slow code */
+	if (g->must != NULL) {
+		for (dp = start; dp < stop; dp++)
+			if (*dp == g->must[0] && stop - dp >= g->mlen &&
+				memcmp(dp, g->must, (size_t)g->mlen) == 0)
+				break;
+		if (dp == stop)		/* we didn't find g->must */
+			return(REG_NOMATCH);
+	}
+
+	/* match struct setup */
+	m->g = g;
+	m->eflags = eflags;
+	m->pmatch = NULL;
+	m->lastpos = NULL;
+	m->offp = string;
+	m->beginp = start;
+	m->endp = stop;
+	STATESETUP(m, 4);
+	SETUP(m->st);
+	SETUP(m->fresh);
+	SETUP(m->tmp);
+	SETUP(m->empty);
+	CLEAR(m->empty);
+
+	/* this loop does only one repetition except for backrefs */
+	for (;;) {
+		endp = fast(m, start, stop, gf, gl);
+		if (endp == NULL) {		/* a miss */
+			free(m->pmatch);
+			free((void*)m->lastpos);
+			STATETEARDOWN(m);
+			return(REG_NOMATCH);
+		}
+		if (nmatch == 0 && !g->backrefs)
+			break;		/* no further info needed */
+
+		/* where? */
+		assert(m->coldp != NULL);
+		for (;;) {
+			NOTE("finding start");
+			endp = slow(m, m->coldp, stop, gf, gl);
+			if (endp != NULL)
+				break;
+			assert(m->coldp < m->endp);
+			m->coldp++;
+		}
+		if (nmatch == 1 && !g->backrefs)
+			break;		/* no further info needed */
+
+		/* oh my, he wants the subexpressions... */
+		if (m->pmatch == NULL)
+			m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
+							sizeof(llvm_regmatch_t));
+		if (m->pmatch == NULL) {
+			STATETEARDOWN(m);
+			return(REG_ESPACE);
+		}
+		for (i = 1; i <= m->g->nsub; i++)
+			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
+		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
+			NOTE("dissecting");
+			dp = dissect(m, m->coldp, endp, gf, gl);
+		} else {
+			if (g->nplus > 0 && m->lastpos == NULL)
+				m->lastpos = (const char **)malloc((g->nplus+1) *
+							sizeof(char *));
+			if (g->nplus > 0 && m->lastpos == NULL) {
+				free(m->pmatch);
+				STATETEARDOWN(m);
+				return(REG_ESPACE);
+			}
+			NOTE("backref dissect");
+			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
+		}
+		if (dp != NULL)
+			break;
+
+		/* uh-oh... we couldn't find a subexpression-level match */
+		assert(g->backrefs);	/* must be back references doing it */
+		assert(g->nplus == 0 || m->lastpos != NULL);
+		for (;;) {
+			if (dp != NULL || endp <= m->coldp)
+				break;		/* defeat */
+			NOTE("backoff");
+			endp = slow(m, m->coldp, endp-1, gf, gl);
+			if (endp == NULL)
+				break;		/* defeat */
+			/* try it on a shorter possibility */
+#ifndef NDEBUG
+			for (i = 1; i <= m->g->nsub; i++) {
+				assert(m->pmatch[i].rm_so == -1);
+				assert(m->pmatch[i].rm_eo == -1);
+			}
+#endif
+			NOTE("backoff dissect");
+			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
+		}
+		assert(dp == NULL || dp == endp);
+		if (dp != NULL)		/* found a shorter one */
+			break;
+
+		/* despite initial appearances, there is no match here */
+		NOTE("false alarm");
+		if (m->coldp == stop)
+			break;
+		start = m->coldp + 1;	/* recycle starting later */
+	}
+
+	/* fill in the details if requested */
+	if (nmatch > 0) {
+		pmatch[0].rm_so = m->coldp - m->offp;
+		pmatch[0].rm_eo = endp - m->offp;
+	}
+	if (nmatch > 1) {
+		assert(m->pmatch != NULL);
+		for (i = 1; i < nmatch; i++)
+			if (i <= m->g->nsub)
+				pmatch[i] = m->pmatch[i];
+			else {
+				pmatch[i].rm_so = -1;
+				pmatch[i].rm_eo = -1;
+			}
+	}
+
+	if (m->pmatch != NULL)
+		free((char *)m->pmatch);
+	if (m->lastpos != NULL)
+		free((char *)m->lastpos);
+	STATETEARDOWN(m);
+	return(0);
+}
+
+/*
+ - dissect - figure out what matched what, no back references
+ */
+static const char *			/* == stop (success) always */
+dissect(struct match *m, const char *start, const char *stop, sopno startst,
+        sopno stopst)
+{
+	int i;
+	sopno ss;	/* start sop of current subRE */
+	sopno es;	/* end sop of current subRE */
+	const char *sp;	/* start of string matched by it */
+	const char *stp;	/* string matched by it cannot pass here */
+	const char *rest;	/* start of rest of string */
+	const char *tail;	/* string unmatched by rest of RE */
+	sopno ssub;	/* start sop of subsubRE */
+	sopno esub;	/* end sop of subsubRE */
+	const char *ssp;	/* start of string matched by subsubRE */
+	const char *sep;	/* end of string matched by subsubRE */
+	const char *oldssp;	/* previous ssp */
+
+	AT("diss", start, stop, startst, stopst);
+	sp = start;
+	for (ss = startst; ss < stopst; ss = es) {
+		/* identify end of subRE */
+		es = ss;
+		switch (OP(m->g->strip[es])) {
+		case OPLUS_:
+		case OQUEST_:
+			es += OPND(m->g->strip[es]);
+			break;
+		case OCH_:
+			while (OP(m->g->strip[es]) != O_CH)
+				es += OPND(m->g->strip[es]);
+			break;
+		}
+		es++;
+
+		/* figure out what it matched */
+		switch (OP(m->g->strip[ss])) {
+		case OEND:
+			assert(nope);
+			break;
+		case OCHAR:
+			sp++;
+			break;
+		case OBOL:
+		case OEOL:
+		case OBOW:
+		case OEOW:
+			break;
+		case OANY:
+		case OANYOF:
+			sp++;
+			break;
+		case OBACK_:
+		case O_BACK:
+			assert(nope);
+			break;
+		/* cases where length of match is hard to find */
+		case OQUEST_:
+			stp = stop;
+			for (;;) {
+				/* how long could this one be? */
+				rest = slow(m, sp, stp, ss, es);
+				assert(rest != NULL);	/* it did match */
+				/* could the rest match the rest? */
+				tail = slow(m, rest, stop, es, stopst);
+				if (tail == stop)
+					break;		/* yes! */
+				/* no -- try a shorter match for this one */
+				stp = rest - 1;
+				assert(stp >= sp);	/* it did work */
+			}
+			ssub = ss + 1;
+			esub = es - 1;
+			/* did innards match? */
+			if (slow(m, sp, rest, ssub, esub) != NULL) {
+				const char *dp = dissect(m, sp, rest, ssub, esub);
+				(void)dp; /* avoid warning if assertions off */
+				assert(dp == rest);
+			} else		/* no */
+				assert(sp == rest);
+			sp = rest;
+			break;
+		case OPLUS_:
+			stp = stop;
+			for (;;) {
+				/* how long could this one be? */
+				rest = slow(m, sp, stp, ss, es);
+				assert(rest != NULL);	/* it did match */
+				/* could the rest match the rest? */
+				tail = slow(m, rest, stop, es, stopst);
+				if (tail == stop)
+					break;		/* yes! */
+				/* no -- try a shorter match for this one */
+				stp = rest - 1;
+				assert(stp >= sp);	/* it did work */
+			}
+			ssub = ss + 1;
+			esub = es - 1;
+			ssp = sp;
+			oldssp = ssp;
+			for (;;) {	/* find last match of innards */
+				sep = slow(m, ssp, rest, ssub, esub);
+				if (sep == NULL || sep == ssp)
+					break;	/* failed or matched null */
+				oldssp = ssp;	/* on to next try */
+				ssp = sep;
+			}
+			if (sep == NULL) {
+				/* last successful match */
+				sep = ssp;
+				ssp = oldssp;
+			}
+			assert(sep == rest);	/* must exhaust substring */
+			assert(slow(m, ssp, sep, ssub, esub) == rest);
+			{
+				const char *dp = dissect(m, ssp, sep, ssub, esub);
+				(void)dp; /* avoid warning if assertions off */
+				assert(dp == sep);
+			}
+			sp = rest;
+			break;
+		case OCH_:
+			stp = stop;
+			for (;;) {
+				/* how long could this one be? */
+				rest = slow(m, sp, stp, ss, es);
+				assert(rest != NULL);	/* it did match */
+				/* could the rest match the rest? */
+				tail = slow(m, rest, stop, es, stopst);
+				if (tail == stop)
+					break;		/* yes! */
+				/* no -- try a shorter match for this one */
+				stp = rest - 1;
+				assert(stp >= sp);	/* it did work */
+			}
+			ssub = ss + 1;
+			esub = ss + OPND(m->g->strip[ss]) - 1;
+			assert(OP(m->g->strip[esub]) == OOR1);
+			for (;;) {	/* find first matching branch */
+				if (slow(m, sp, rest, ssub, esub) == rest)
+					break;	/* it matched all of it */
+				/* that one missed, try next one */
+				assert(OP(m->g->strip[esub]) == OOR1);
+				esub++;
+				assert(OP(m->g->strip[esub]) == OOR2);
+				ssub = esub + 1;
+				esub += OPND(m->g->strip[esub]);
+				if (OP(m->g->strip[esub]) == OOR2)
+					esub--;
+				else
+					assert(OP(m->g->strip[esub]) == O_CH);
+			}
+			{
+				const char *dp = dissect(m, sp, rest, ssub, esub);
+				(void)dp; /* avoid warning if assertions off */
+				assert(dp == rest);
+			}
+			sp = rest;
+			break;
+		case O_PLUS:
+		case O_QUEST:
+		case OOR1:
+		case OOR2:
+		case O_CH:
+			assert(nope);
+			break;
+		case OLPAREN:
+			i = OPND(m->g->strip[ss]);
+			assert(0 < i && i <= m->g->nsub);
+			m->pmatch[i].rm_so = sp - m->offp;
+			break;
+		case ORPAREN:
+			i = OPND(m->g->strip[ss]);
+			assert(0 < i && i <= m->g->nsub);
+			m->pmatch[i].rm_eo = sp - m->offp;
+			break;
+		default:		/* uh oh */
+			assert(nope);
+			break;
+		}
+	}
+
+	assert(sp == stop);
+	return(sp);
+}
+
+/*
+ - backref - figure out what matched what, figuring in back references
+ */
+static const char *			/* == stop (success) or NULL (failure) */
+backref(struct match *m, const char *start, const char *stop, sopno startst,
+        sopno stopst, sopno lev, int rec)			/* PLUS nesting level */
+{
+	int i;
+	sopno ss;	/* start sop of current subRE */
+	const char *sp;	/* start of string matched by it */
+	sopno ssub;	/* start sop of subsubRE */
+	sopno esub;	/* end sop of subsubRE */
+	const char *ssp;	/* start of string matched by subsubRE */
+	const char *dp;
+	size_t len;
+	int hard;
+	sop s;
+	llvm_regoff_t offsave;
+	cset *cs;
+
+	AT("back", start, stop, startst, stopst);
+	sp = start;
+
+	/* get as far as we can with easy stuff */
+	hard = 0;
+	for (ss = startst; !hard && ss < stopst; ss++)
+		switch (OP(s = m->g->strip[ss])) {
+		case OCHAR:
+			if (sp == stop || *sp++ != (char)OPND(s))
+				return(NULL);
+			break;
+		case OANY:
+			if (sp == stop)
+				return(NULL);
+			sp++;
+			break;
+		case OANYOF:
+			cs = &m->g->sets[OPND(s)];
+			if (sp == stop || !CHIN(cs, *sp++))
+				return(NULL);
+			break;
+		case OBOL:
+			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
+					(sp < m->endp && *(sp-1) == '\n' &&
+						(m->g->cflags&REG_NEWLINE)) )
+				{ /* yes */ }
+			else
+				return(NULL);
+			break;
+		case OEOL:
+			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
+					(sp < m->endp && *sp == '\n' &&
+						(m->g->cflags&REG_NEWLINE)) )
+				{ /* yes */ }
+			else
+				return(NULL);
+			break;
+		case OBOW:
+			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
+					(sp < m->endp && *(sp-1) == '\n' &&
+						(m->g->cflags&REG_NEWLINE)) ||
+					(sp > m->beginp &&
+							!ISWORD(*(sp-1))) ) &&
+					(sp < m->endp && ISWORD(*sp)) )
+				{ /* yes */ }
+			else
+				return(NULL);
+			break;
+		case OEOW:
+			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
+					(sp < m->endp && *sp == '\n' &&
+						(m->g->cflags&REG_NEWLINE)) ||
+					(sp < m->endp && !ISWORD(*sp)) ) &&
+					(sp > m->beginp && ISWORD(*(sp-1))) )
+				{ /* yes */ }
+			else
+				return(NULL);
+			break;
+		case O_QUEST:
+			break;
+		case OOR1:	/* matches null but needs to skip */
+			ss++;
+			s = m->g->strip[ss];
+			do {
+				assert(OP(s) == OOR2);
+				ss += OPND(s);
+			} while (OP(s = m->g->strip[ss]) != O_CH);
+			/* note that the ss++ gets us past the O_CH */
+			break;
+		default:	/* have to make a choice */
+			hard = 1;
+			break;
+		}
+	if (!hard) {		/* that was it! */
+		if (sp != stop)
+			return(NULL);
+		return(sp);
+	}
+	ss--;			/* adjust for the for's final increment */
+
+	/* the hard stuff */
+	AT("hard", sp, stop, ss, stopst);
+	s = m->g->strip[ss];
+	switch (OP(s)) {
+	case OBACK_:		/* the vilest depths */
+		i = OPND(s);
+		assert(0 < i && i <= m->g->nsub);
+		if (m->pmatch[i].rm_eo == -1)
+			return(NULL);
+		assert(m->pmatch[i].rm_so != -1);
+		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
+		if (len == 0 && rec++ > MAX_RECURSION)
+			return(NULL);
+		assert(stop - m->beginp >= len);
+		if (sp > stop - len)
+			return(NULL);	/* not enough left to match */
+		ssp = m->offp + m->pmatch[i].rm_so;
+		if (memcmp(sp, ssp, len) != 0)
+			return(NULL);
+		while (m->g->strip[ss] != SOP(O_BACK, i))
+			ss++;
+		return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
+		break;
+	case OQUEST_:		/* to null or not */
+		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
+		if (dp != NULL)
+			return(dp);	/* not */
+		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
+		break;
+	case OPLUS_:
+		assert(m->lastpos != NULL);
+		assert(lev+1 <= m->g->nplus);
+		m->lastpos[lev+1] = sp;
+		return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
+		break;
+	case O_PLUS:
+		if (sp == m->lastpos[lev])	/* last pass matched null */
+			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
+		/* try another pass */
+		m->lastpos[lev] = sp;
+		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
+		if (dp == NULL)
+			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
+		else
+			return(dp);
+		break;
+	case OCH_:		/* find the right one, if any */
+		ssub = ss + 1;
+		esub = ss + OPND(s) - 1;
+		assert(OP(m->g->strip[esub]) == OOR1);
+		for (;;) {	/* find first matching branch */
+			dp = backref(m, sp, stop, ssub, esub, lev, rec);
+			if (dp != NULL)
+				return(dp);
+			/* that one missed, try next one */
+			if (OP(m->g->strip[esub]) == O_CH)
+				return(NULL);	/* there is none */
+			esub++;
+			assert(OP(m->g->strip[esub]) == OOR2);
+			ssub = esub + 1;
+			esub += OPND(m->g->strip[esub]);
+			if (OP(m->g->strip[esub]) == OOR2)
+				esub--;
+			else
+				assert(OP(m->g->strip[esub]) == O_CH);
+		}
+		break;
+	case OLPAREN:		/* must undo assignment if rest fails */
+		i = OPND(s);
+		assert(0 < i && i <= m->g->nsub);
+		offsave = m->pmatch[i].rm_so;
+		m->pmatch[i].rm_so = sp - m->offp;
+		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
+		if (dp != NULL)
+			return(dp);
+		m->pmatch[i].rm_so = offsave;
+		return(NULL);
+		break;
+	case ORPAREN:		/* must undo assignment if rest fails */
+		i = OPND(s);
+		assert(0 < i && i <= m->g->nsub);
+		offsave = m->pmatch[i].rm_eo;
+		m->pmatch[i].rm_eo = sp - m->offp;
+		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
+		if (dp != NULL)
+			return(dp);
+		m->pmatch[i].rm_eo = offsave;
+		return(NULL);
+		break;
+	default:		/* uh oh */
+		assert(nope);
+		break;
+	}
+
+	/* "can't happen" */
+	assert(nope);
+	/* NOTREACHED */
+        return NULL;
+}
+
+/*
+ - fast - step through the string at top speed
+ */
+static const char *			/* where tentative match ended, or NULL */
+fast(struct match *m, const char *start, const char *stop, sopno startst,
+     sopno stopst)
+{
+	states st = m->st;
+	states fresh = m->fresh;
+	states tmp = m->tmp;
+	const char *p = start;
+	int c = (start == m->beginp) ? OUT : *(start-1);
+	int lastc;	/* previous c */
+	int flagch;
+	int i;
+	const char *coldp;	/* last p after which no match was underway */
+
+	CLEAR(st);
+	SET1(st, startst);
+	st = step(m->g, startst, stopst, st, NOTHING, st);
+	ASSIGN(fresh, st);
+	SP("start", st, *p);
+	coldp = NULL;
+	for (;;) {
+		/* next character */
+		lastc = c;
+		c = (p == m->endp) ? OUT : *p;
+		if (EQ(st, fresh))
+			coldp = p;
+
+		/* is there an EOL and/or BOL between lastc and c? */
+		flagch = '\0';
+		i = 0;
+		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
+				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
+			flagch = BOL;
+			i = m->g->nbol;
+		}
+		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
+				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
+			flagch = (flagch == BOL) ? BOLEOL : EOL;
+			i += m->g->neol;
+		}
+		if (i != 0) {
+			for (; i > 0; i--)
+				st = step(m->g, startst, stopst, st, flagch, st);
+			SP("boleol", st, c);
+		}
+
+		/* how about a word boundary? */
+		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
+					(c != OUT && ISWORD(c)) ) {
+			flagch = BOW;
+		}
+		if ( (lastc != OUT && ISWORD(lastc)) &&
+				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
+			flagch = EOW;
+		}
+		if (flagch == BOW || flagch == EOW) {
+			st = step(m->g, startst, stopst, st, flagch, st);
+			SP("boweow", st, c);
+		}
+
+		/* are we done? */
+		if (ISSET(st, stopst) || p == stop)
+			break;		/* NOTE BREAK OUT */
+
+		/* no, we must deal with this character */
+		ASSIGN(tmp, st);
+		ASSIGN(st, fresh);
+		assert(c != OUT);
+		st = step(m->g, startst, stopst, tmp, c, st);
+		SP("aft", st, c);
+		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
+		p++;
+	}
+
+	assert(coldp != NULL);
+	m->coldp = coldp;
+	if (ISSET(st, stopst))
+		return(p+1);
+	else
+		return(NULL);
+}
+
+/*
+ - slow - step through the string more deliberately
+ */
+static const char *			/* where it ended */
+slow(struct match *m, const char *start, const char *stop, sopno startst,
+     sopno stopst)
+{
+	states st = m->st;
+	states empty = m->empty;
+	states tmp = m->tmp;
+	const char *p = start;
+	int c = (start == m->beginp) ? OUT : *(start-1);
+	int lastc;	/* previous c */
+	int flagch;
+	int i;
+	const char *matchp;	/* last p at which a match ended */
+
+	AT("slow", start, stop, startst, stopst);
+	CLEAR(st);
+	SET1(st, startst);
+	SP("sstart", st, *p);
+	st = step(m->g, startst, stopst, st, NOTHING, st);
+	matchp = NULL;
+	for (;;) {
+		/* next character */
+		lastc = c;
+		c = (p == m->endp) ? OUT : *p;
+
+		/* is there an EOL and/or BOL between lastc and c? */
+		flagch = '\0';
+		i = 0;
+		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
+				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
+			flagch = BOL;
+			i = m->g->nbol;
+		}
+		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
+				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
+			flagch = (flagch == BOL) ? BOLEOL : EOL;
+			i += m->g->neol;
+		}
+		if (i != 0) {
+			for (; i > 0; i--)
+				st = step(m->g, startst, stopst, st, flagch, st);
+			SP("sboleol", st, c);
+		}
+
+		/* how about a word boundary? */
+		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
+					(c != OUT && ISWORD(c)) ) {
+			flagch = BOW;
+		}
+		if ( (lastc != OUT && ISWORD(lastc)) &&
+				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
+			flagch = EOW;
+		}
+		if (flagch == BOW || flagch == EOW) {
+			st = step(m->g, startst, stopst, st, flagch, st);
+			SP("sboweow", st, c);
+		}
+
+		/* are we done? */
+		if (ISSET(st, stopst))
+			matchp = p;
+		if (EQ(st, empty) || p == stop)
+			break;		/* NOTE BREAK OUT */
+
+		/* no, we must deal with this character */
+		ASSIGN(tmp, st);
+		ASSIGN(st, empty);
+		assert(c != OUT);
+		st = step(m->g, startst, stopst, tmp, c, st);
+		SP("saft", st, c);
+		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
+		p++;
+	}
+
+	return(matchp);
+}
+
+
+/*
+ - step - map set of states reachable before char to set reachable after
+ */
+static states
+step(struct re_guts *g,
+    sopno start,		/* start state within strip */
+    sopno stop,			/* state after stop state within strip */
+    states bef,			/* states reachable before */
+    int ch,			/* character or NONCHAR code */
+    states aft)			/* states already known reachable after */
+{
+	cset *cs;
+	sop s;
+	sopno pc;
+	onestate here;		/* note, macros know this name */
+	sopno look;
+	int i;
+
+	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
+		s = g->strip[pc];
+		switch (OP(s)) {
+		case OEND:
+			assert(pc == stop-1);
+			break;
+		case OCHAR:
+			/* only characters can match */
+			assert(!NONCHAR(ch) || ch != (char)OPND(s));
+			if (ch == (char)OPND(s))
+				FWD(aft, bef, 1);
+			break;
+		case OBOL:
+			if (ch == BOL || ch == BOLEOL)
+				FWD(aft, bef, 1);
+			break;
+		case OEOL:
+			if (ch == EOL || ch == BOLEOL)
+				FWD(aft, bef, 1);
+			break;
+		case OBOW:
+			if (ch == BOW)
+				FWD(aft, bef, 1);
+			break;
+		case OEOW:
+			if (ch == EOW)
+				FWD(aft, bef, 1);
+			break;
+		case OANY:
+			if (!NONCHAR(ch))
+				FWD(aft, bef, 1);
+			break;
+		case OANYOF:
+			cs = &g->sets[OPND(s)];
+			if (!NONCHAR(ch) && CHIN(cs, ch))
+				FWD(aft, bef, 1);
+			break;
+		case OBACK_:		/* ignored here */
+		case O_BACK:
+			FWD(aft, aft, 1);
+			break;
+		case OPLUS_:		/* forward, this is just an empty */
+			FWD(aft, aft, 1);
+			break;
+		case O_PLUS:		/* both forward and back */
+			FWD(aft, aft, 1);
+			i = ISSETBACK(aft, OPND(s));
+			BACK(aft, aft, OPND(s));
+			if (!i && ISSETBACK(aft, OPND(s))) {
+				/* oho, must reconsider loop body */
+				pc -= OPND(s) + 1;
+				INIT(here, pc);
+			}
+			break;
+		case OQUEST_:		/* two branches, both forward */
+			FWD(aft, aft, 1);
+			FWD(aft, aft, OPND(s));
+			break;
+		case O_QUEST:		/* just an empty */
+			FWD(aft, aft, 1);
+			break;
+		case OLPAREN:		/* not significant here */
+		case ORPAREN:
+			FWD(aft, aft, 1);
+			break;
+		case OCH_:		/* mark the first two branches */
+			FWD(aft, aft, 1);
+			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
+			FWD(aft, aft, OPND(s));
+			break;
+		case OOR1:		/* done a branch, find the O_CH */
+			if (ISSTATEIN(aft, here)) {
+				for (look = 1;
+						OP(s = g->strip[pc+look]) != O_CH;
+						look += OPND(s))
+					assert(OP(s) == OOR2);
+				FWD(aft, aft, look);
+			}
+			break;
+		case OOR2:		/* propagate OCH_'s marking */
+			FWD(aft, aft, 1);
+			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
+				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
+				FWD(aft, aft, OPND(s));
+			}
+			break;
+		case O_CH:		/* just empty */
+			FWD(aft, aft, 1);
+			break;
+		default:		/* ooooops... */
+			assert(nope);
+			break;
+		}
+	}
+
+	return(aft);
+}
+
+#ifdef REDEBUG
+/*
+ - print - print a set of states
+ */
+static void
+print(struct match *m, char *caption, states st, int ch, FILE *d)
+{
+	struct re_guts *g = m->g;
+	int i;
+	int first = 1;
+
+	if (!(m->eflags&REG_TRACE))
+		return;
+
+	(void)fprintf(d, "%s", caption);
+	if (ch != '\0')
+		(void)fprintf(d, " %s", pchar(ch));
+	for (i = 0; i < g->nstates; i++)
+		if (ISSET(st, i)) {
+			(void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
+			first = 0;
+		}
+	(void)fprintf(d, "\n");
+}
+
+/* 
+ - at - print current situation
+ */
+static void
+at(struct match *m, char *title, char *start, char *stop, sopno startst,
+    sopno stopst)
+{
+	if (!(m->eflags&REG_TRACE))
+		return;
+
+	(void)printf("%s %s-", title, pchar(*start));
+	(void)printf("%s ", pchar(*stop));
+	(void)printf("%ld-%ld\n", (long)startst, (long)stopst);
+}
+
+#ifndef PCHARDONE
+#define	PCHARDONE	/* never again */
+/*
+ - pchar - make a character printable
+ *
+ * Is this identical to regchar() over in debug.c?  Well, yes.  But a
+ * duplicate here avoids having a debugging-capable regexec.o tied to
+ * a matching debug.o, and this is convenient.  It all disappears in
+ * the non-debug compilation anyway, so it doesn't matter much.
+ */
+static char *			/* -> representation */
+pchar(int ch)
+{
+	static char pbuf[10];
+
+	if (isprint(ch) || ch == ' ')
+		(void)snprintf(pbuf, sizeof pbuf, "%c", ch);
+	else
+		(void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
+	return(pbuf);
+}
+#endif
+#endif
+
+#undef	matcher
+#undef	fast
+#undef	slow
+#undef	dissect
+#undef	backref
+#undef	step
+#undef	print
+#undef	at
+#undef	match
+#undef	nope
diff --git a/contrib/llvm/lib/Support/regerror.c b/contrib/llvm/lib/Support/regerror.c
new file mode 100644
index 000000000000..1d67c9a2b03b
--- /dev/null
+++ b/contrib/llvm/lib/Support/regerror.c
@@ -0,0 +1,135 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)regerror.c	8.4 (Berkeley) 3/20/94
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdlib.h>
+#include "regex_impl.h"
+
+#include "regutils.h"
+
+#ifdef _MSC_VER
+#define snprintf _snprintf
+#endif
+
+static const char *regatoi(const llvm_regex_t *, char *, int);
+
+static struct rerr {
+	int code;
+	const char *name;
+	const char *explain;
+} rerrs[] = {
+	{ REG_NOMATCH,	"REG_NOMATCH",	"llvm_regexec() failed to match" },
+	{ REG_BADPAT,	"REG_BADPAT",	"invalid regular expression" },
+	{ REG_ECOLLATE,	"REG_ECOLLATE",	"invalid collating element" },
+	{ REG_ECTYPE,	"REG_ECTYPE",	"invalid character class" },
+	{ REG_EESCAPE,	"REG_EESCAPE",	"trailing backslash (\\)" },
+	{ REG_ESUBREG,	"REG_ESUBREG",	"invalid backreference number" },
+	{ REG_EBRACK,	"REG_EBRACK",	"brackets ([ ]) not balanced" },
+	{ REG_EPAREN,	"REG_EPAREN",	"parentheses not balanced" },
+	{ REG_EBRACE,	"REG_EBRACE",	"braces not balanced" },
+	{ REG_BADBR,	"REG_BADBR",	"invalid repetition count(s)" },
+	{ REG_ERANGE,	"REG_ERANGE",	"invalid character range" },
+	{ REG_ESPACE,	"REG_ESPACE",	"out of memory" },
+	{ REG_BADRPT,	"REG_BADRPT",	"repetition-operator operand invalid" },
+	{ REG_EMPTY,	"REG_EMPTY",	"empty (sub)expression" },
+	{ REG_ASSERT,	"REG_ASSERT",	"\"can't happen\" -- you found a bug" },
+	{ REG_INVARG,	"REG_INVARG",	"invalid argument to regex routine" },
+	{ 0,		"",		"*** unknown regexp error code ***" }
+};
+
+/*
+ - llvm_regerror - the interface to error numbers
+ = extern size_t llvm_regerror(int, const llvm_regex_t *, char *, size_t);
+ */
+/* ARGSUSED */
+size_t
+llvm_regerror(int errcode, const llvm_regex_t *preg, char *errbuf, size_t errbuf_size)
+{
+	struct rerr *r;
+	size_t len;
+	int target = errcode &~ REG_ITOA;
+	const char *s;
+	char convbuf[50];
+
+	if (errcode == REG_ATOI)
+		s = regatoi(preg, convbuf, sizeof convbuf);
+	else {
+		for (r = rerrs; r->code != 0; r++)
+			if (r->code == target)
+				break;
+	
+		if (errcode&REG_ITOA) {
+			if (r->code != 0) {
+				assert(strlen(r->name) < sizeof(convbuf));
+				(void) llvm_strlcpy(convbuf, r->name, sizeof convbuf);
+			} else
+				(void)snprintf(convbuf, sizeof convbuf,
+				    "REG_0x%x", target);
+			s = convbuf;
+		} else
+			s = r->explain;
+	}
+
+	len = strlen(s) + 1;
+	if (errbuf_size > 0) {
+		llvm_strlcpy(errbuf, s, errbuf_size);
+	}
+
+	return(len);
+}
+
+/*
+ - regatoi - internal routine to implement REG_ATOI
+ */
+static const char *
+regatoi(const llvm_regex_t *preg, char *localbuf, int localbufsize)
+{
+	struct rerr *r;
+
+	for (r = rerrs; r->code != 0; r++)
+		if (strcmp(r->name, preg->re_endp) == 0)
+			break;
+	if (r->code == 0)
+		return("0");
+
+	(void)snprintf(localbuf, localbufsize, "%d", r->code);
+	return(localbuf);
+}
diff --git a/contrib/llvm/lib/Support/regex2.h b/contrib/llvm/lib/Support/regex2.h
new file mode 100644
index 000000000000..21659c34449a
--- /dev/null
+++ b/contrib/llvm/lib/Support/regex2.h
@@ -0,0 +1,157 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)regex2.h	8.4 (Berkeley) 3/20/94
+ */
+
+/*
+ * internals of regex_t
+ */
+#define	MAGIC1	((('r'^0200)<<8) | 'e')
+
+/*
+ * The internal representation is a *strip*, a sequence of
+ * operators ending with an endmarker.  (Some terminology etc. is a
+ * historical relic of earlier versions which used multiple strips.)
+ * Certain oddities in the representation are there to permit running
+ * the machinery backwards; in particular, any deviation from sequential
+ * flow must be marked at both its source and its destination.  Some
+ * fine points:
+ *
+ * - OPLUS_ and O_PLUS are *inside* the loop they create.
+ * - OQUEST_ and O_QUEST are *outside* the bypass they create.
+ * - OCH_ and O_CH are *outside* the multi-way branch they create, while
+ *   OOR1 and OOR2 are respectively the end and the beginning of one of
+ *   the branches.  Note that there is an implicit OOR2 following OCH_
+ *   and an implicit OOR1 preceding O_CH.
+ *
+ * In state representations, an operator's bit is on to signify a state
+ * immediately *preceding* "execution" of that operator.
+ */
+typedef unsigned long sop;	/* strip operator */
+typedef long sopno;
+#define	OPRMASK	0xf8000000LU
+#define	OPDMASK	0x07ffffffLU
+#define	OPSHIFT	((unsigned)27)
+#define	OP(n)	((n)&OPRMASK)
+#define	OPND(n)	((n)&OPDMASK)
+#define	SOP(op, opnd)	((op)|(opnd))
+/* operators			   meaning	operand			*/
+/*						(back, fwd are offsets)	*/
+#define	OEND	(1LU<<OPSHIFT)	/* endmarker	-			*/
+#define	OCHAR	(2LU<<OPSHIFT)	/* character	unsigned char		*/
+#define	OBOL	(3LU<<OPSHIFT)	/* left anchor	-			*/
+#define	OEOL	(4LU<<OPSHIFT)	/* right anchor	-			*/
+#define	OANY	(5LU<<OPSHIFT)	/* .		-			*/
+#define	OANYOF	(6LU<<OPSHIFT)	/* [...]	set number		*/
+#define	OBACK_	(7LU<<OPSHIFT)	/* begin \d	paren number		*/
+#define	O_BACK	(8LU<<OPSHIFT)	/* end \d	paren number		*/
+#define	OPLUS_	(9LU<<OPSHIFT)	/* + prefix	fwd to suffix		*/
+#define	O_PLUS	(10LU<<OPSHIFT)	/* + suffix	back to prefix		*/
+#define	OQUEST_	(11LU<<OPSHIFT)	/* ? prefix	fwd to suffix		*/
+#define	O_QUEST	(12LU<<OPSHIFT)	/* ? suffix	back to prefix		*/
+#define	OLPAREN	(13LU<<OPSHIFT)	/* (		fwd to )		*/
+#define	ORPAREN	(14LU<<OPSHIFT)	/* )		back to (		*/
+#define	OCH_	(15LU<<OPSHIFT)	/* begin choice	fwd to OOR2		*/
+#define	OOR1	(16LU<<OPSHIFT)	/* | pt. 1	back to OOR1 or OCH_	*/
+#define	OOR2	(17LU<<OPSHIFT)	/* | pt. 2	fwd to OOR2 or O_CH	*/
+#define	O_CH	(18LU<<OPSHIFT)	/* end choice	back to OOR1		*/
+#define	OBOW	(19LU<<OPSHIFT)	/* begin word	-			*/
+#define	OEOW	(20LU<<OPSHIFT)	/* end word	-			*/
+
+/*
+ * Structure for [] character-set representation.  Character sets are
+ * done as bit vectors, grouped 8 to a byte vector for compactness.
+ * The individual set therefore has both a pointer to the byte vector
+ * and a mask to pick out the relevant bit of each byte.  A hash code
+ * simplifies testing whether two sets could be identical.
+ *
+ * This will get trickier for multicharacter collating elements.  As
+ * preliminary hooks for dealing with such things, we also carry along
+ * a string of multi-character elements, and decide the size of the
+ * vectors at run time.
+ */
+typedef struct {
+	uch *ptr;		/* -> uch [csetsize] */
+	uch mask;		/* bit within array */
+	uch hash;		/* hash code */
+	size_t smultis;
+	char *multis;		/* -> char[smulti]  ab\0cd\0ef\0\0 */
+} cset;
+/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
+#define	CHadd(cs, c)	((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
+#define	CHsub(cs, c)	((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
+#define	CHIN(cs, c)	((cs)->ptr[(uch)(c)] & (cs)->mask)
+#define	MCadd(p, cs, cp)	mcadd(p, cs, cp)	/* llvm_regcomp() internal fns */
+#define	MCsub(p, cs, cp)	mcsub(p, cs, cp)
+#define	MCin(p, cs, cp)	mcin(p, cs, cp)
+
+/* stuff for character categories */
+typedef unsigned char cat_t;
+
+/*
+ * main compiled-expression structure
+ */
+struct re_guts {
+	int magic;
+#		define	MAGIC2	((('R'^0200)<<8)|'E')
+	sop *strip;		/* malloced area for strip */
+	int csetsize;		/* number of bits in a cset vector */
+	int ncsets;		/* number of csets in use */
+	cset *sets;		/* -> cset [ncsets] */
+	uch *setbits;		/* -> uch[csetsize][ncsets/CHAR_BIT] */
+	int cflags;		/* copy of llvm_regcomp() cflags argument */
+	sopno nstates;		/* = number of sops */
+	sopno firststate;	/* the initial OEND (normally 0) */
+	sopno laststate;	/* the final OEND */
+	int iflags;		/* internal flags */
+#		define	USEBOL	01	/* used ^ */
+#		define	USEEOL	02	/* used $ */
+#		define	REGEX_BAD	04	/* something wrong */
+	int nbol;		/* number of ^ used */
+	int neol;		/* number of $ used */
+	int ncategories;	/* how many character categories */
+	cat_t *categories;	/* ->catspace[-CHAR_MIN] */
+	char *must;		/* match must contain this string */
+	int mlen;		/* length of must */
+	size_t nsub;		/* copy of re_nsub */
+	int backrefs;		/* does it use back references? */
+	sopno nplus;		/* how deep does it nest +s? */
+	/* catspace must be last */
+	cat_t catspace[1];	/* actually [NC] */
+};
+
+/* misc utilities */
+#define	OUT	(CHAR_MAX+1)	/* a non-character value */
+#define	ISWORD(c)	(isalnum(c&0xff) || (c) == '_')
diff --git a/contrib/llvm/lib/Support/regex_impl.h b/contrib/llvm/lib/Support/regex_impl.h
new file mode 100644
index 000000000000..f8296c9ff75e
--- /dev/null
+++ b/contrib/llvm/lib/Support/regex_impl.h
@@ -0,0 +1,108 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992 Henry Spencer.
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer of the University of Toronto.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)regex.h	8.1 (Berkeley) 6/2/93
+ */
+
+#ifndef _REGEX_H_
+#define	_REGEX_H_
+
+#include <sys/types.h>
+typedef off_t llvm_regoff_t;
+typedef struct {
+  llvm_regoff_t rm_so;		/* start of match */
+  llvm_regoff_t rm_eo;		/* end of match */
+} llvm_regmatch_t;
+
+typedef struct llvm_regex {
+  int re_magic;
+  size_t re_nsub;		/* number of parenthesized subexpressions */
+  const char *re_endp;	/* end pointer for REG_PEND */
+  struct re_guts *re_g;	/* none of your business :-) */
+} llvm_regex_t;
+
+/* llvm_regcomp() flags */
+#define	REG_BASIC	0000
+#define	REG_EXTENDED	0001
+#define	REG_ICASE	0002
+#define	REG_NOSUB	0004
+#define	REG_NEWLINE	0010
+#define	REG_NOSPEC	0020
+#define	REG_PEND	0040
+#define	REG_DUMP	0200
+
+/* llvm_regerror() flags */
+#define	REG_NOMATCH	 1
+#define	REG_BADPAT	 2
+#define	REG_ECOLLATE	 3
+#define	REG_ECTYPE	 4
+#define	REG_EESCAPE	 5
+#define	REG_ESUBREG	 6
+#define	REG_EBRACK	 7
+#define	REG_EPAREN	 8
+#define	REG_EBRACE	 9
+#define	REG_BADBR	10
+#define	REG_ERANGE	11
+#define	REG_ESPACE	12
+#define	REG_BADRPT	13
+#define	REG_EMPTY	14
+#define	REG_ASSERT	15
+#define	REG_INVARG	16
+#define	REG_ATOI	255	/* convert name to number (!) */
+#define	REG_ITOA	0400	/* convert number to name (!) */
+
+/* llvm_regexec() flags */
+#define	REG_NOTBOL	00001
+#define	REG_NOTEOL	00002
+#define	REG_STARTEND	00004
+#define	REG_TRACE	00400	/* tracing of execution */
+#define	REG_LARGE	01000	/* force large representation */
+#define	REG_BACKR	02000	/* force use of backref code */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int	llvm_regcomp(llvm_regex_t *, const char *, int);
+size_t	llvm_regerror(int, const llvm_regex_t *, char *, size_t);
+int	llvm_regexec(const llvm_regex_t *, const char *, size_t, 
+                     llvm_regmatch_t [], int);
+void	llvm_regfree(llvm_regex_t *);
+size_t  llvm_strlcpy(char *dst, const char *src, size_t siz);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !_REGEX_H_ */
diff --git a/contrib/llvm/lib/Support/regexec.c b/contrib/llvm/lib/Support/regexec.c
new file mode 100644
index 000000000000..bd5e72d4c522
--- /dev/null
+++ b/contrib/llvm/lib/Support/regexec.c
@@ -0,0 +1,162 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)regexec.c	8.3 (Berkeley) 3/20/94
+ */
+
+/*
+ * the outer shell of llvm_regexec()
+ *
+ * This file includes engine.inc *twice*, after muchos fiddling with the
+ * macros that code uses.  This lets the same code operate on two different
+ * representations for state sets.
+ */
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <ctype.h>
+#include "regex_impl.h"
+
+#include "regutils.h"
+#include "regex2.h"
+
+/* macros for manipulating states, small version */
+/* FIXME: 'states' is assumed as 'long' on small version. */
+#define	states1	long		/* for later use in llvm_regexec() decision */
+#define	states	states1
+#define	CLEAR(v)	((v) = 0)
+#define	SET0(v, n)	((v) &= ~((unsigned long)1 << (n)))
+#define	SET1(v, n)	((v) |= (unsigned long)1 << (n))
+#define	ISSET(v, n)	(((v) & ((unsigned long)1 << (n))) != 0)
+#define	ASSIGN(d, s)	((d) = (s))
+#define	EQ(a, b)	((a) == (b))
+#define	STATEVARS	long dummy	/* dummy version */
+#define	STATESETUP(m, n)	/* nothing */
+#define	STATETEARDOWN(m)	/* nothing */
+#define	SETUP(v)	((v) = 0)
+#define	onestate	long
+#define	INIT(o, n)	((o) = (unsigned long)1 << (n))
+#define	INC(o)		((o) = (unsigned long)(o) << 1)
+#define	ISSTATEIN(v, o)	(((v) & (o)) != 0)
+/* some abbreviations; note that some of these know variable names! */
+/* do "if I'm here, I can also be there" etc without branches */
+#define	FWD(dst, src, n)	((dst) |= ((unsigned long)(src)&(here)) << (n))
+#define	BACK(dst, src, n)	((dst) |= ((unsigned long)(src)&(here)) >> (n))
+#define	ISSETBACK(v, n)		(((v) & ((unsigned long)here >> (n))) != 0)
+/* function names */
+#define SNAMES			/* engine.inc looks after details */
+
+#include "regengine.inc"
+
+/* now undo things */
+#undef	states
+#undef	CLEAR
+#undef	SET0
+#undef	SET1
+#undef	ISSET
+#undef	ASSIGN
+#undef	EQ
+#undef	STATEVARS
+#undef	STATESETUP
+#undef	STATETEARDOWN
+#undef	SETUP
+#undef	onestate
+#undef	INIT
+#undef	INC
+#undef	ISSTATEIN
+#undef	FWD
+#undef	BACK
+#undef	ISSETBACK
+#undef	SNAMES
+
+/* macros for manipulating states, large version */
+#define	states	char *
+#define	CLEAR(v)	memset(v, 0, m->g->nstates)
+#define	SET0(v, n)	((v)[n] = 0)
+#define	SET1(v, n)	((v)[n] = 1)
+#define	ISSET(v, n)	((v)[n])
+#define	ASSIGN(d, s)	memmove(d, s, m->g->nstates)
+#define	EQ(a, b)	(memcmp(a, b, m->g->nstates) == 0)
+#define	STATEVARS	long vn; char *space
+#define	STATESETUP(m, nv)	{ (m)->space = malloc((nv)*(m)->g->nstates); \
+				if ((m)->space == NULL) return(REG_ESPACE); \
+				(m)->vn = 0; }
+#define	STATETEARDOWN(m)	{ free((m)->space); }
+#define	SETUP(v)	((v) = &m->space[m->vn++ * m->g->nstates])
+#define	onestate	long
+#define	INIT(o, n)	((o) = (n))
+#define	INC(o)	((o)++)
+#define	ISSTATEIN(v, o)	((v)[o])
+/* some abbreviations; note that some of these know variable names! */
+/* do "if I'm here, I can also be there" etc without branches */
+#define	FWD(dst, src, n)	((dst)[here+(n)] |= (src)[here])
+#define	BACK(dst, src, n)	((dst)[here-(n)] |= (src)[here])
+#define	ISSETBACK(v, n)	((v)[here - (n)])
+/* function names */
+#define	LNAMES			/* flag */
+
+#include "regengine.inc"
+
+/*
+ - llvm_regexec - interface for matching
+ *
+ * We put this here so we can exploit knowledge of the state representation
+ * when choosing which matcher to call.  Also, by this point the matchers
+ * have been prototyped.
+ */
+int				/* 0 success, REG_NOMATCH failure */
+llvm_regexec(const llvm_regex_t *preg, const char *string, size_t nmatch,
+             llvm_regmatch_t pmatch[], int eflags)
+{
+	struct re_guts *g = preg->re_g;
+#ifdef REDEBUG
+#	define	GOODFLAGS(f)	(f)
+#else
+#	define	GOODFLAGS(f)	((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND))
+#endif
+
+	if (preg->re_magic != MAGIC1 || g->magic != MAGIC2)
+		return(REG_BADPAT);
+	assert(!(g->iflags&REGEX_BAD));
+	if (g->iflags&REGEX_BAD)		/* backstop for no-debug case */
+		return(REG_BADPAT);
+	eflags = GOODFLAGS(eflags);
+
+	if (g->nstates <= (long)(CHAR_BIT*sizeof(states1)) && !(eflags&REG_LARGE))
+		return(smatcher(g, string, nmatch, pmatch, eflags));
+	else
+		return(lmatcher(g, string, nmatch, pmatch, eflags));
+}
diff --git a/contrib/llvm/lib/Support/regfree.c b/contrib/llvm/lib/Support/regfree.c
new file mode 100644
index 000000000000..dc2b4af90fa7
--- /dev/null
+++ b/contrib/llvm/lib/Support/regfree.c
@@ -0,0 +1,72 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)regfree.c	8.3 (Berkeley) 3/20/94
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "regex_impl.h"
+
+#include "regutils.h"
+#include "regex2.h"
+
+/*
+ - llvm_regfree - free everything
+ */
+void
+llvm_regfree(llvm_regex_t *preg)
+{
+	struct re_guts *g;
+
+	if (preg->re_magic != MAGIC1)	/* oops */
+		return;			/* nice to complain, but hard */
+
+	g = preg->re_g;
+	if (g == NULL || g->magic != MAGIC2)	/* oops again */
+		return;
+	preg->re_magic = 0;		/* mark it invalid */
+	g->magic = 0;			/* mark it invalid */
+
+	if (g->strip != NULL)
+		free((char *)g->strip);
+	if (g->sets != NULL)
+		free((char *)g->sets);
+	if (g->setbits != NULL)
+		free((char *)g->setbits);
+	if (g->must != NULL)
+		free(g->must);
+	free((char *)g);
+}
diff --git a/contrib/llvm/lib/Support/regstrlcpy.c b/contrib/llvm/lib/Support/regstrlcpy.c
new file mode 100644
index 000000000000..8b68afdf75f1
--- /dev/null
+++ b/contrib/llvm/lib/Support/regstrlcpy.c
@@ -0,0 +1,52 @@
+/*
+ * This code is derived from OpenBSD's libc, original license follows:
+ *
+ * Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <string.h>
+
+#include "regex_impl.h"
+/*
+ * Copy src to string dst of size siz.  At most siz-1 characters
+ * will be copied.  Always NUL terminates (unless siz == 0).
+ * Returns strlen(src); if retval >= siz, truncation occurred.
+ */
+size_t
+llvm_strlcpy(char *dst, const char *src, size_t siz)
+{
+	char *d = dst;
+	const char *s = src;
+	size_t n = siz;
+
+	/* Copy as many bytes as will fit */
+	if (n != 0) {
+		while (--n != 0) {
+			if ((*d++ = *s++) == '\0')
+				break;
+		}
+	}
+
+	/* Not enough room in dst, add NUL and traverse rest of src */
+	if (n == 0) {
+		if (siz != 0)
+			*d = '\0';		/* NUL-terminate dst */
+		while (*s++)
+			;
+	}
+
+	return(s - src - 1);	/* count does not include NUL */
+}
diff --git a/contrib/llvm/lib/Support/regutils.h b/contrib/llvm/lib/Support/regutils.h
new file mode 100644
index 000000000000..d0ee100a382b
--- /dev/null
+++ b/contrib/llvm/lib/Support/regutils.h
@@ -0,0 +1,53 @@
+/*-
+ * This code is derived from OpenBSD's libc/regex, original license follows:
+ *
+ * Copyright (c) 1992, 1993, 1994 Henry Spencer.
+ * Copyright (c) 1992, 1993, 1994
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Henry Spencer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)utils.h	8.3 (Berkeley) 3/20/94
+ */
+
+/* utility definitions */
+#define	NC		(CHAR_MAX - CHAR_MIN + 1)
+typedef unsigned char uch;
+
+/* switch off assertions (if not already off) if no REDEBUG */
+#ifndef REDEBUG
+#ifndef NDEBUG
+#define	NDEBUG	/* no assertions please */
+#endif
+#endif
+#include <assert.h>
+
+/* for old systems with bcopy() but no memmove() */
+#ifdef USEBCOPY
+#define	memmove(d, s, c)	bcopy(s, d, c)
+#endif
diff --git a/contrib/llvm/lib/Support/system_error.cpp b/contrib/llvm/lib/Support/system_error.cpp
new file mode 100644
index 000000000000..b22745afc330
--- /dev/null
+++ b/contrib/llvm/lib/Support/system_error.cpp
@@ -0,0 +1,130 @@
+//===---------------------- system_error.cpp ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This was lifted from libc++ and modified for C++03.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/system_error.h"
+#include "llvm/Support/Errno.h"
+#include <cstring>
+#include <string>
+
+namespace llvm {
+
+// class error_category
+
+error_category::error_category() {
+}
+
+error_category::~error_category() {
+}
+
+error_condition
+error_category::default_error_condition(int ev) const {
+  return error_condition(ev, *this);
+}
+
+bool
+error_category::equivalent(int code, const error_condition& condition) const {
+  return default_error_condition(code) == condition;
+}
+
+bool
+error_category::equivalent(const error_code& code, int condition) const {
+  return *this == code.category() && code.value() == condition;
+}
+
+std::string
+_do_message::message(int ev) const {
+  return std::string(sys::StrError(ev));
+}
+
+class _generic_error_category : public _do_message {
+public:
+  virtual const char* name() const LLVM_OVERRIDE;
+  virtual std::string message(int ev) const LLVM_OVERRIDE;
+};
+
+const char*
+_generic_error_category::name() const {
+  return "generic";
+}
+
+std::string
+_generic_error_category::message(int ev) const {
+#ifdef ELAST
+  if (ev > ELAST)
+    return std::string("unspecified generic_category error");
+#endif  // ELAST
+  return _do_message::message(ev);
+}
+
+const error_category&
+generic_category() {
+  static _generic_error_category s;
+  return s;
+}
+
+class _system_error_category : public _do_message {
+public:
+  virtual const char* name() const LLVM_OVERRIDE;
+  virtual std::string message(int ev) const LLVM_OVERRIDE;
+  virtual error_condition default_error_condition(int ev) const LLVM_OVERRIDE;
+};
+
+const char*
+_system_error_category::name() const {
+  return "system";
+}
+
+// std::string _system_error_category::message(int ev) const {
+// Is in Platform/system_error.inc
+
+// error_condition _system_error_category::default_error_condition(int ev) const
+// Is in Platform/system_error.inc
+
+const error_category&
+system_category() {
+  static _system_error_category s;
+  return s;
+}
+
+const error_category&
+posix_category() {
+#ifdef LLVM_ON_WIN32
+  return generic_category();
+#else
+  return system_category();
+#endif
+}
+
+// error_condition
+
+std::string
+error_condition::message() const {
+  return _cat_->message(_val_);
+}
+
+// error_code
+
+std::string
+error_code::message() const {
+  return _cat_->message(_val_);
+}
+
+} // end namespace llvm
+
+// Include the truly platform-specific parts of this class.
+#if defined(LLVM_ON_UNIX)
+#include "Unix/system_error.inc"
+#endif
+#if defined(LLVM_ON_WIN32)
+#include "Windows/system_error.inc"
+#endif
diff --git a/contrib/llvm/lib/TableGen/Error.cpp b/contrib/llvm/lib/TableGen/Error.cpp
new file mode 100644
index 000000000000..928b1203cd8f
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/Error.cpp
@@ -0,0 +1,75 @@
+//===- Error.cpp - tblgen error handling helper routines --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains error handling helper routines to pretty-print diagnostic
+// messages from tblgen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TableGen/Error.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+
+namespace llvm {
+
+SourceMgr SrcMgr;
+unsigned ErrorsPrinted = 0;
+
+static void PrintMessage(ArrayRef<SMLoc> Loc, SourceMgr::DiagKind Kind,
+                         const Twine &Msg) {
+  // Count the total number of errors printed.
+  // This is used to exit with an error code if there were any errors.
+  if (Kind == SourceMgr::DK_Error)
+    ++ErrorsPrinted;
+
+  SMLoc NullLoc;
+  if (Loc.empty())
+    Loc = NullLoc;
+  SrcMgr.PrintMessage(Loc.front(), Kind, Msg);
+  for (unsigned i = 1; i < Loc.size(); ++i)
+    SrcMgr.PrintMessage(Loc[i], SourceMgr::DK_Note,
+                        "instantiated from multiclass");
+}
+
+void PrintWarning(ArrayRef<SMLoc> WarningLoc, const Twine &Msg) {
+  PrintMessage(WarningLoc, SourceMgr::DK_Warning, Msg);
+}
+
+void PrintWarning(const char *Loc, const Twine &Msg) {
+  SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), SourceMgr::DK_Warning, Msg);
+}
+
+void PrintWarning(const Twine &Msg) {
+  errs() << "warning:" << Msg << "\n";
+}
+
+void PrintError(ArrayRef<SMLoc> ErrorLoc, const Twine &Msg) {
+  PrintMessage(ErrorLoc, SourceMgr::DK_Error, Msg);
+}
+
+void PrintError(const char *Loc, const Twine &Msg) {
+  SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), SourceMgr::DK_Error, Msg);
+}
+
+void PrintError(const Twine &Msg) {
+  errs() << "error:" << Msg << "\n";
+}
+
+void PrintFatalError(const std::string &Msg) {
+  PrintError(Twine(Msg));
+  std::exit(1);
+}
+
+void PrintFatalError(ArrayRef<SMLoc> ErrorLoc, const std::string &Msg) {
+  PrintError(ErrorLoc, Msg);
+  std::exit(1);
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/TableGen/Main.cpp b/contrib/llvm/lib/TableGen/Main.cpp
new file mode 100644
index 000000000000..dc4167b305ca
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/Main.cpp
@@ -0,0 +1,130 @@
+//===- Main.cpp - Top-Level TableGen implementation -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen is a tool which can be used to build up a description of something,
+// then invoke one or more "tablegen backends" to emit information about the
+// description in some predefined format.  In practice, this is used by the LLVM
+// code generators to automate generation of a code generator through a
+// high-level description of the target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TGParser.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Main.h"
+#include "llvm/TableGen/Record.h"
+#include <algorithm>
+#include <cstdio>
+using namespace llvm;
+
+namespace {
+  cl::opt<std::string>
+  OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename"),
+                 cl::init("-"));
+
+  cl::opt<std::string>
+  DependFilename("d",
+                 cl::desc("Dependency filename"),
+                 cl::value_desc("filename"),
+                 cl::init(""));
+
+  cl::opt<std::string>
+  InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
+
+  cl::list<std::string>
+  IncludeDirs("I", cl::desc("Directory of include files"),
+              cl::value_desc("directory"), cl::Prefix);
+}
+
+/// \brief Create a dependency file for `-d` option.
+///
+/// This functionality is really only for the benefit of the build system.
+/// It is similar to GCC's `-M*` family of options.
+static int createDependencyFile(const TGParser &Parser, const char *argv0) {
+  if (OutputFilename == "-") {
+    errs() << argv0 << ": the option -d must be used together with -o\n";
+    return 1;
+  }
+  std::string Error;
+  tool_output_file DepOut(DependFilename.c_str(), Error);
+  if (!Error.empty()) {
+    errs() << argv0 << ": error opening " << DependFilename
+      << ":" << Error << "\n";
+    return 1;
+  }
+  DepOut.os() << OutputFilename << ":";
+  const TGLexer::DependenciesMapTy &Dependencies = Parser.getDependencies();
+  for (TGLexer::DependenciesMapTy::const_iterator I = Dependencies.begin(),
+                                                  E = Dependencies.end();
+       I != E; ++I) {
+    DepOut.os() << " " << I->first;
+  }
+  DepOut.os() << "\n";
+  DepOut.keep();
+  return 0;
+}
+
+namespace llvm {
+
+int TableGenMain(char *argv0, TableGenMainFn *MainFn) {
+  RecordKeeper Records;
+
+  // Parse the input file.
+  OwningPtr<MemoryBuffer> File;
+  if (error_code ec =
+        MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) {
+    errs() << "Could not open input file '" << InputFilename << "': "
+           << ec.message() <<"\n";
+    return 1;
+  }
+  MemoryBuffer *F = File.take();
+
+  // Tell SrcMgr about this buffer, which is what TGParser will pick up.
+  SrcMgr.AddNewSourceBuffer(F, SMLoc());
+
+  // Record the location of the include directory so that the lexer can find
+  // it later.
+  SrcMgr.setIncludeDirs(IncludeDirs);
+
+  TGParser Parser(SrcMgr, Records);
+
+  if (Parser.ParseFile())
+    return 1;
+
+  std::string Error;
+  tool_output_file Out(OutputFilename.c_str(), Error);
+  if (!Error.empty()) {
+    errs() << argv0 << ": error opening " << OutputFilename
+      << ":" << Error << "\n";
+    return 1;
+  }
+  if (!DependFilename.empty()) {
+    if (int Ret = createDependencyFile(Parser, argv0))
+      return Ret;
+  }
+
+  if (MainFn(Out.os(), Records))
+    return 1;
+
+  if (ErrorsPrinted > 0) {
+    errs() << argv0 << ": " << ErrorsPrinted << " errors.\n";
+    return 1;
+  }
+
+  // Declare success.
+  Out.keep();
+  return 0;
+}
+
+}
diff --git a/contrib/llvm/lib/TableGen/Record.cpp b/contrib/llvm/lib/TableGen/Record.cpp
new file mode 100644
index 000000000000..9ad20532d7eb
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/Record.cpp
@@ -0,0 +1,2057 @@
+//===- Record.cpp - Record implementation ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement the tablegen record classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TableGen/Record.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/TableGen/Error.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//    std::string wrapper for DenseMap purposes
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// TableGenStringKey - This is a wrapper for std::string suitable for
+/// using as a key to a DenseMap.  Because there isn't a particularly
+/// good way to indicate tombstone or empty keys for strings, we want
+/// to wrap std::string to indicate that this is a "special" string
+/// not expected to take on certain values (those of the tombstone and
+/// empty keys).  This makes things a little safer as it clarifies
+/// that DenseMap is really not appropriate for general strings.
+
+class TableGenStringKey {
+public:
+  TableGenStringKey(const std::string &str) : data(str) {}
+  TableGenStringKey(const char *str) : data(str) {}
+
+  const std::string &str() const { return data; }
+
+  friend hash_code hash_value(const TableGenStringKey &Value) {
+    using llvm::hash_value;
+    return hash_value(Value.str());
+  }
+private:
+  std::string data;
+};
+
+/// Specialize DenseMapInfo for TableGenStringKey.
+template<> struct DenseMapInfo<TableGenStringKey> {
+  static inline TableGenStringKey getEmptyKey() {
+    TableGenStringKey Empty("<<<EMPTY KEY>>>");
+    return Empty;
+  }
+  static inline TableGenStringKey getTombstoneKey() {
+    TableGenStringKey Tombstone("<<<TOMBSTONE KEY>>>");
+    return Tombstone;
+  }
+  static unsigned getHashValue(const TableGenStringKey& Val) {
+    using llvm::hash_value;
+    return hash_value(Val);
+  }
+  static bool isEqual(const TableGenStringKey& LHS,
+                      const TableGenStringKey& RHS) {
+    return LHS.str() == RHS.str();
+  }
+};
+
+} // namespace llvm
+
+//===----------------------------------------------------------------------===//
+//    Type implementations
+//===----------------------------------------------------------------------===//
+
+BitRecTy BitRecTy::Shared;
+IntRecTy IntRecTy::Shared;
+StringRecTy StringRecTy::Shared;
+DagRecTy DagRecTy::Shared;
+
+void RecTy::anchor() { }
+void RecTy::dump() const { print(errs()); }
+
+ListRecTy *RecTy::getListTy() {
+  if (!ListTy)
+    ListTy = new ListRecTy(this);
+  return ListTy;
+}
+
+bool RecTy::baseClassOf(const RecTy *RHS) const{
+  assert (RHS && "NULL pointer");
+  return Kind == RHS->getRecTyKind();
+}
+
+Init *BitRecTy::convertValue(BitsInit *BI) {
+  if (BI->getNumBits() != 1) return 0; // Only accept if just one bit!
+  return BI->getBit(0);
+}
+
+Init *BitRecTy::convertValue(IntInit *II) {
+  int64_t Val = II->getValue();
+  if (Val != 0 && Val != 1) return 0;  // Only accept 0 or 1 for a bit!
+
+  return BitInit::get(Val != 0);
+}
+
+Init *BitRecTy::convertValue(TypedInit *VI) {
+  RecTy *Ty = VI->getType();
+  if (isa<BitRecTy>(Ty) || isa<BitsRecTy>(Ty) || isa<IntRecTy>(Ty))
+    return VI;  // Accept variable if it is already of bit type!
+  return 0;
+}
+
+bool BitRecTy::baseClassOf(const RecTy *RHS) const{
+  if(RecTy::baseClassOf(RHS) || getRecTyKind() == IntRecTyKind)
+    return true;
+  if(const BitsRecTy *BitsTy = dyn_cast<BitsRecTy>(RHS))
+    return BitsTy->getNumBits() == 1;
+  return false;
+}
+
+BitsRecTy *BitsRecTy::get(unsigned Sz) {
+  static std::vector<BitsRecTy*> Shared;
+  if (Sz >= Shared.size())
+    Shared.resize(Sz + 1);
+  BitsRecTy *&Ty = Shared[Sz];
+  if (!Ty)
+    Ty = new BitsRecTy(Sz);
+  return Ty;
+}
+
+std::string BitsRecTy::getAsString() const {
+  return "bits<" + utostr(Size) + ">";
+}
+
+Init *BitsRecTy::convertValue(UnsetInit *UI) {
+  SmallVector<Init *, 16> NewBits(Size);
+
+  for (unsigned i = 0; i != Size; ++i)
+    NewBits[i] = UnsetInit::get();
+
+  return BitsInit::get(NewBits);
+}
+
+Init *BitsRecTy::convertValue(BitInit *UI) {
+  if (Size != 1) return 0;  // Can only convert single bit.
+          return BitsInit::get(UI);
+}
+
+/// canFitInBitfield - Return true if the number of bits is large enough to hold
+/// the integer value.
+static bool canFitInBitfield(int64_t Value, unsigned NumBits) {
+  // For example, with NumBits == 4, we permit Values from [-7 .. 15].
+  return (NumBits >= sizeof(Value) * 8) ||
+         (Value >> NumBits == 0) || (Value >> (NumBits-1) == -1);
+}
+
+/// convertValue from Int initializer to bits type: Split the integer up into the
+/// appropriate bits.
+///
+Init *BitsRecTy::convertValue(IntInit *II) {
+  int64_t Value = II->getValue();
+  // Make sure this bitfield is large enough to hold the integer value.
+  if (!canFitInBitfield(Value, Size))
+    return 0;
+
+  SmallVector<Init *, 16> NewBits(Size);
+
+  for (unsigned i = 0; i != Size; ++i)
+    NewBits[i] = BitInit::get(Value & (1LL << i));
+
+  return BitsInit::get(NewBits);
+}
+
+Init *BitsRecTy::convertValue(BitsInit *BI) {
+  // If the number of bits is right, return it.  Otherwise we need to expand or
+  // truncate.
+  if (BI->getNumBits() == Size) return BI;
+  return 0;
+}
+
+Init *BitsRecTy::convertValue(TypedInit *VI) {
+  if (Size == 1 && isa<BitRecTy>(VI->getType()))
+    return BitsInit::get(VI);
+
+  if (VI->getType()->typeIsConvertibleTo(this)) {
+    SmallVector<Init *, 16> NewBits(Size);
+
+    for (unsigned i = 0; i != Size; ++i)
+      NewBits[i] = VarBitInit::get(VI, i);
+    return BitsInit::get(NewBits);
+  }
+
+  return 0;
+}
+
+bool BitsRecTy::baseClassOf(const RecTy *RHS) const{
+  if (RecTy::baseClassOf(RHS)) //argument and the receiver are the same type
+    return cast<BitsRecTy>(RHS)->Size == Size;
+  RecTyKind kind = RHS->getRecTyKind();
+  return (kind == BitRecTyKind && Size == 1) || (kind == IntRecTyKind);
+}
+
+Init *IntRecTy::convertValue(BitInit *BI) {
+  return IntInit::get(BI->getValue());
+}
+
+Init *IntRecTy::convertValue(BitsInit *BI) {
+  int64_t Result = 0;
+  for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i)
+    if (BitInit *Bit = dyn_cast<BitInit>(BI->getBit(i))) {
+      Result |= Bit->getValue() << i;
+    } else {
+      return 0;
+    }
+  return IntInit::get(Result);
+}
+
+Init *IntRecTy::convertValue(TypedInit *TI) {
+  if (TI->getType()->typeIsConvertibleTo(this))
+    return TI;  // Accept variable if already of the right type!
+  return 0;
+}
+
+bool IntRecTy::baseClassOf(const RecTy *RHS) const{
+  RecTyKind kind = RHS->getRecTyKind();
+  return kind==BitRecTyKind || kind==BitsRecTyKind || kind==IntRecTyKind;
+}
+
+Init *StringRecTy::convertValue(UnOpInit *BO) {
+  if (BO->getOpcode() == UnOpInit::CAST) {
+    Init *L = BO->getOperand()->convertInitializerTo(this);
+    if (L == 0) return 0;
+    if (L != BO->getOperand())
+      return UnOpInit::get(UnOpInit::CAST, L, new StringRecTy);
+    return BO;
+  }
+
+  return convertValue((TypedInit*)BO);
+}
+
+Init *StringRecTy::convertValue(BinOpInit *BO) {
+  if (BO->getOpcode() == BinOpInit::STRCONCAT) {
+    Init *L = BO->getLHS()->convertInitializerTo(this);
+    Init *R = BO->getRHS()->convertInitializerTo(this);
+    if (L == 0 || R == 0) return 0;
+    if (L != BO->getLHS() || R != BO->getRHS())
+      return BinOpInit::get(BinOpInit::STRCONCAT, L, R, new StringRecTy);
+    return BO;
+  }
+
+  return convertValue((TypedInit*)BO);
+}
+
+
+Init *StringRecTy::convertValue(TypedInit *TI) {
+  if (isa<StringRecTy>(TI->getType()))
+    return TI;  // Accept variable if already of the right type!
+  return 0;
+}
+
+std::string ListRecTy::getAsString() const {
+  return "list<" + Ty->getAsString() + ">";
+}
+
+Init *ListRecTy::convertValue(ListInit *LI) {
+  std::vector<Init*> Elements;
+
+  // Verify that all of the elements of the list are subclasses of the
+  // appropriate class!
+  for (unsigned i = 0, e = LI->getSize(); i != e; ++i)
+    if (Init *CI = LI->getElement(i)->convertInitializerTo(Ty))
+      Elements.push_back(CI);
+    else
+      return 0;
+
+  if (!isa<ListRecTy>(LI->getType()))
+    return 0;
+
+  return ListInit::get(Elements, this);
+}
+
+Init *ListRecTy::convertValue(TypedInit *TI) {
+  // Ensure that TI is compatible with our class.
+  if (ListRecTy *LRT = dyn_cast<ListRecTy>(TI->getType()))
+    if (LRT->getElementType()->typeIsConvertibleTo(getElementType()))
+      return TI;
+  return 0;
+}
+
+bool ListRecTy::baseClassOf(const RecTy *RHS) const{
+  if(const ListRecTy* ListTy = dyn_cast<ListRecTy>(RHS))
+    return ListTy->getElementType()->typeIsConvertibleTo(Ty);
+  return false;
+}
+
+Init *DagRecTy::convertValue(TypedInit *TI) {
+  if (TI->getType()->typeIsConvertibleTo(this))
+    return TI;
+  return 0;
+}
+
+Init *DagRecTy::convertValue(UnOpInit *BO) {
+  if (BO->getOpcode() == UnOpInit::CAST) {
+    Init *L = BO->getOperand()->convertInitializerTo(this);
+    if (L == 0) return 0;
+    if (L != BO->getOperand())
+      return UnOpInit::get(UnOpInit::CAST, L, new DagRecTy);
+    return BO;
+  }
+  return 0;
+}
+
+Init *DagRecTy::convertValue(BinOpInit *BO) {
+  if (BO->getOpcode() == BinOpInit::CONCAT) {
+    Init *L = BO->getLHS()->convertInitializerTo(this);
+    Init *R = BO->getRHS()->convertInitializerTo(this);
+    if (L == 0 || R == 0) return 0;
+    if (L != BO->getLHS() || R != BO->getRHS())
+      return BinOpInit::get(BinOpInit::CONCAT, L, R, new DagRecTy);
+    return BO;
+  }
+  return 0;
+}
+
+RecordRecTy *RecordRecTy::get(Record *R) {
+  return dyn_cast<RecordRecTy>(R->getDefInit()->getType());
+}
+
+std::string RecordRecTy::getAsString() const {
+  return Rec->getName();
+}
+
+Init *RecordRecTy::convertValue(DefInit *DI) {
+  // Ensure that DI is a subclass of Rec.
+  if (!DI->getDef()->isSubClassOf(Rec))
+    return 0;
+  return DI;
+}
+
+Init *RecordRecTy::convertValue(TypedInit *TI) {
+  // Ensure that TI is compatible with Rec.
+  if (RecordRecTy *RRT = dyn_cast<RecordRecTy>(TI->getType()))
+    if (RRT->getRecord()->isSubClassOf(getRecord()) ||
+        RRT->getRecord() == getRecord())
+      return TI;
+  return 0;
+}
+
+bool RecordRecTy::baseClassOf(const RecTy *RHS) const{
+  const RecordRecTy *RTy = dyn_cast<RecordRecTy>(RHS);
+  if (!RTy)
+    return false;
+
+  if (Rec == RTy->getRecord() || RTy->getRecord()->isSubClassOf(Rec))
+    return true;
+
+  const std::vector<Record*> &SC = Rec->getSuperClasses();
+  for (unsigned i = 0, e = SC.size(); i != e; ++i)
+    if (RTy->getRecord()->isSubClassOf(SC[i]))
+      return true;
+
+  return false;
+}
+
+/// resolveTypes - Find a common type that T1 and T2 convert to.
+/// Return 0 if no such type exists.
+///
+RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
+  if (T1->typeIsConvertibleTo(T2))
+    return T2;
+  if (T2->typeIsConvertibleTo(T1))
+    return T1;
+
+  // If one is a Record type, check superclasses
+  if (RecordRecTy *RecTy1 = dyn_cast<RecordRecTy>(T1)) {
+    // See if T2 inherits from a type T1 also inherits from
+    const std::vector<Record *> &T1SuperClasses =
+      RecTy1->getRecord()->getSuperClasses();
+    for(std::vector<Record *>::const_iterator i = T1SuperClasses.begin(),
+          iend = T1SuperClasses.end();
+        i != iend;
+        ++i) {
+      RecordRecTy *SuperRecTy1 = RecordRecTy::get(*i);
+      RecTy *NewType1 = resolveTypes(SuperRecTy1, T2);
+      if (NewType1 != 0) {
+        if (NewType1 != SuperRecTy1) {
+          delete SuperRecTy1;
+        }
+        return NewType1;
+      }
+    }
+  }
+  if (RecordRecTy *RecTy2 = dyn_cast<RecordRecTy>(T2)) {
+    // See if T1 inherits from a type T2 also inherits from
+    const std::vector<Record *> &T2SuperClasses =
+      RecTy2->getRecord()->getSuperClasses();
+    for (std::vector<Record *>::const_iterator i = T2SuperClasses.begin(),
+          iend = T2SuperClasses.end();
+        i != iend;
+        ++i) {
+      RecordRecTy *SuperRecTy2 = RecordRecTy::get(*i);
+      RecTy *NewType2 = resolveTypes(T1, SuperRecTy2);
+      if (NewType2 != 0) {
+        if (NewType2 != SuperRecTy2) {
+          delete SuperRecTy2;
+        }
+        return NewType2;
+      }
+    }
+  }
+  return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+//    Initializer implementations
+//===----------------------------------------------------------------------===//
+
+void Init::anchor() { }
+void Init::dump() const { return print(errs()); }
+
+void UnsetInit::anchor() { }
+
+UnsetInit *UnsetInit::get() {
+  static UnsetInit TheInit;
+  return &TheInit;
+}
+
+void BitInit::anchor() { }
+
+BitInit *BitInit::get(bool V) {
+  static BitInit True(true);
+  static BitInit False(false);
+
+  return V ? &True : &False;
+}
+
+static void
+ProfileBitsInit(FoldingSetNodeID &ID, ArrayRef<Init *> Range) {
+  ID.AddInteger(Range.size());
+
+  for (ArrayRef<Init *>::iterator i = Range.begin(),
+         iend = Range.end();
+       i != iend;
+       ++i)
+    ID.AddPointer(*i);
+}
+
+BitsInit *BitsInit::get(ArrayRef<Init *> Range) {
+  typedef FoldingSet<BitsInit> Pool;
+  static Pool ThePool;  
+
+  FoldingSetNodeID ID;
+  ProfileBitsInit(ID, Range);
+
+  void *IP = 0;
+  if (BitsInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+    return I;
+
+  BitsInit *I = new BitsInit(Range);
+  ThePool.InsertNode(I, IP);
+
+  return I;
+}
+
+void BitsInit::Profile(FoldingSetNodeID &ID) const {
+  ProfileBitsInit(ID, Bits);
+}
+
+Init *
+BitsInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
+  SmallVector<Init *, 16> NewBits(Bits.size());
+
+  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+    if (Bits[i] >= getNumBits())
+      return 0;
+    NewBits[i] = getBit(Bits[i]);
+  }
+  return BitsInit::get(NewBits);
+}
+
+std::string BitsInit::getAsString() const {
+  std::string Result = "{ ";
+  for (unsigned i = 0, e = getNumBits(); i != e; ++i) {
+    if (i) Result += ", ";
+    if (Init *Bit = getBit(e-i-1))
+      Result += Bit->getAsString();
+    else
+      Result += "*";
+  }
+  return Result + " }";
+}
+
+// Fix bit initializer to preserve the behavior that bit reference from a unset
+// bits initializer will resolve into VarBitInit to keep the field name and bit
+// number used in targets with fixed insn length.
+static Init *fixBitInit(const RecordVal *RV, Init *Before, Init *After) {
+  if (RV || After != UnsetInit::get())
+    return After;
+  return Before;
+}
+
+// resolveReferences - If there are any field references that refer to fields
+// that have been filled in, we can propagate the values now.
+//
+Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const {
+  bool Changed = false;
+  SmallVector<Init *, 16> NewBits(getNumBits());
+
+  Init *CachedInit = 0;
+  Init *CachedBitVar = 0;
+  bool CachedBitVarChanged = false;
+
+  for (unsigned i = 0, e = getNumBits(); i != e; ++i) {
+    Init *CurBit = Bits[i];
+    Init *CurBitVar = CurBit->getBitVar();
+
+    NewBits[i] = CurBit;
+
+    if (CurBitVar == CachedBitVar) {
+      if (CachedBitVarChanged) {
+        Init *Bit = CachedInit->getBit(CurBit->getBitNum());
+        NewBits[i] = fixBitInit(RV, CurBit, Bit);
+      }
+      continue;
+    }
+    CachedBitVar = CurBitVar;
+    CachedBitVarChanged = false;
+
+    Init *B;
+    do {
+      B = CurBitVar;
+      CurBitVar = CurBitVar->resolveReferences(R, RV);
+      CachedBitVarChanged |= B != CurBitVar;
+      Changed |= B != CurBitVar;
+    } while (B != CurBitVar);
+    CachedInit = CurBitVar;
+
+    if (CachedBitVarChanged) {
+      Init *Bit = CurBitVar->getBit(CurBit->getBitNum());
+      NewBits[i] = fixBitInit(RV, CurBit, Bit);
+    }
+  }
+
+  if (Changed)
+    return BitsInit::get(NewBits);
+
+  return const_cast<BitsInit *>(this);
+}
+
+IntInit *IntInit::get(int64_t V) {
+  typedef DenseMap<int64_t, IntInit *> Pool;
+  static Pool ThePool;
+
+  IntInit *&I = ThePool[V];
+  if (!I) I = new IntInit(V);
+  return I;
+}
+
+std::string IntInit::getAsString() const {
+  return itostr(Value);
+}
+
+Init *
+IntInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
+  SmallVector<Init *, 16> NewBits(Bits.size());
+
+  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+    if (Bits[i] >= 64)
+      return 0;
+
+    NewBits[i] = BitInit::get(Value & (INT64_C(1) << Bits[i]));
+  }
+  return BitsInit::get(NewBits);
+}
+
+void StringInit::anchor() { }
+
+StringInit *StringInit::get(StringRef V) {
+  typedef StringMap<StringInit *> Pool;
+  static Pool ThePool;
+
+  StringInit *&I = ThePool[V];
+  if (!I) I = new StringInit(V);
+  return I;
+}
+
+static void ProfileListInit(FoldingSetNodeID &ID,
+                            ArrayRef<Init *> Range,
+                            RecTy *EltTy) {
+  ID.AddInteger(Range.size());
+  ID.AddPointer(EltTy);
+
+  for (ArrayRef<Init *>::iterator i = Range.begin(),
+         iend = Range.end();
+       i != iend;
+       ++i)
+    ID.AddPointer(*i);
+}
+
+ListInit *ListInit::get(ArrayRef<Init *> Range, RecTy *EltTy) {
+  typedef FoldingSet<ListInit> Pool;
+  static Pool ThePool;
+
+  // Just use the FoldingSetNodeID to compute a hash.  Use a DenseMap
+  // for actual storage.
+  FoldingSetNodeID ID;
+  ProfileListInit(ID, Range, EltTy);
+
+  void *IP = 0;
+  if (ListInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+    return I;
+
+  ListInit *I = new ListInit(Range, EltTy);
+  ThePool.InsertNode(I, IP);
+  return I;
+}
+
+void ListInit::Profile(FoldingSetNodeID &ID) const {
+  ListRecTy *ListType = dyn_cast<ListRecTy>(getType());
+  assert(ListType && "Bad type for ListInit!");
+  RecTy *EltTy = ListType->getElementType();
+
+  ProfileListInit(ID, Values, EltTy);
+}
+
+Init *
+ListInit::convertInitListSlice(const std::vector<unsigned> &Elements) const {
+  std::vector<Init*> Vals;
+  for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
+    if (Elements[i] >= getSize())
+      return 0;
+    Vals.push_back(getElement(Elements[i]));
+  }
+  return ListInit::get(Vals, getType());
+}
+
+Record *ListInit::getElementAsRecord(unsigned i) const {
+  assert(i < Values.size() && "List element index out of range!");
+  DefInit *DI = dyn_cast<DefInit>(Values[i]);
+  if (DI == 0)
+    PrintFatalError("Expected record in list!");
+  return DI->getDef();
+}
+
+Init *ListInit::resolveReferences(Record &R, const RecordVal *RV) const {
+  std::vector<Init*> Resolved;
+  Resolved.reserve(getSize());
+  bool Changed = false;
+
+  for (unsigned i = 0, e = getSize(); i != e; ++i) {
+    Init *E;
+    Init *CurElt = getElement(i);
+
+    do {
+      E = CurElt;
+      CurElt = CurElt->resolveReferences(R, RV);
+      Changed |= E != CurElt;
+    } while (E != CurElt);
+    Resolved.push_back(E);
+  }
+
+  if (Changed)
+    return ListInit::get(Resolved, getType());
+  return const_cast<ListInit *>(this);
+}
+
+Init *ListInit::resolveListElementReference(Record &R, const RecordVal *IRV,
+                                            unsigned Elt) const {
+  if (Elt >= getSize())
+    return 0;  // Out of range reference.
+  Init *E = getElement(Elt);
+  // If the element is set to some value, or if we are resolving a reference
+  // to a specific variable and that variable is explicitly unset, then
+  // replace the VarListElementInit with it.
+  if (IRV || !isa<UnsetInit>(E))
+    return E;
+  return 0;
+}
+
+std::string ListInit::getAsString() const {
+  std::string Result = "[";
+  for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+    if (i) Result += ", ";
+    Result += Values[i]->getAsString();
+  }
+  return Result + "]";
+}
+
+Init *OpInit::resolveListElementReference(Record &R, const RecordVal *IRV,
+                                          unsigned Elt) const {
+  Init *Resolved = resolveReferences(R, IRV);
+  OpInit *OResolved = dyn_cast<OpInit>(Resolved);
+  if (OResolved) {
+    Resolved = OResolved->Fold(&R, 0);
+  }
+
+  if (Resolved != this) {
+    TypedInit *Typed = dyn_cast<TypedInit>(Resolved);
+    assert(Typed && "Expected typed init for list reference");
+    if (Typed) {
+      Init *New = Typed->resolveListElementReference(R, IRV, Elt);
+      if (New)
+        return New;
+      return VarListElementInit::get(Typed, Elt);
+    }
+  }
+
+  return 0;
+}
+
+Init *OpInit::getBit(unsigned Bit) const {
+  if (getType() == BitRecTy::get())
+    return const_cast<OpInit*>(this);
+  return VarBitInit::get(const_cast<OpInit*>(this), Bit);
+}
+
+UnOpInit *UnOpInit::get(UnaryOp opc, Init *lhs, RecTy *Type) {
+  typedef std::pair<std::pair<unsigned, Init *>, RecTy *> Key;
+
+  typedef DenseMap<Key, UnOpInit *> Pool;
+  static Pool ThePool;  
+
+  Key TheKey(std::make_pair(std::make_pair(opc, lhs), Type));
+
+  UnOpInit *&I = ThePool[TheKey];
+  if (!I) I = new UnOpInit(opc, lhs, Type);
+  return I;
+}
+
+Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
+  switch (getOpcode()) {
+  case CAST: {
+    if (getType()->getAsString() == "string") {
+      if (StringInit *LHSs = dyn_cast<StringInit>(LHS))
+        return LHSs;
+
+      if (DefInit *LHSd = dyn_cast<DefInit>(LHS))
+        return StringInit::get(LHSd->getDef()->getName());
+
+      if (IntInit *LHSi = dyn_cast<IntInit>(LHS))
+        return StringInit::get(LHSi->getAsString());
+    } else {
+      if (StringInit *LHSs = dyn_cast<StringInit>(LHS)) {
+        std::string Name = LHSs->getValue();
+
+        // From TGParser::ParseIDValue
+        if (CurRec) {
+          if (const RecordVal *RV = CurRec->getValue(Name)) {
+            if (RV->getType() != getType())
+              PrintFatalError("type mismatch in cast");
+            return VarInit::get(Name, RV->getType());
+          }
+
+          Init *TemplateArgName = QualifyName(*CurRec, CurMultiClass, Name,
+                                              ":");
+      
+          if (CurRec->isTemplateArg(TemplateArgName)) {
+            const RecordVal *RV = CurRec->getValue(TemplateArgName);
+            assert(RV && "Template arg doesn't exist??");
+
+            if (RV->getType() != getType())
+              PrintFatalError("type mismatch in cast");
+
+            return VarInit::get(TemplateArgName, RV->getType());
+          }
+        }
+
+        if (CurMultiClass) {
+          Init *MCName = QualifyName(CurMultiClass->Rec, CurMultiClass, Name, "::");
+
+          if (CurMultiClass->Rec.isTemplateArg(MCName)) {
+            const RecordVal *RV = CurMultiClass->Rec.getValue(MCName);
+            assert(RV && "Template arg doesn't exist??");
+
+            if (RV->getType() != getType())
+              PrintFatalError("type mismatch in cast");
+
+            return VarInit::get(MCName, RV->getType());
+          }
+        }
+
+        if (Record *D = (CurRec->getRecords()).getDef(Name))
+          return DefInit::get(D);
+
+        PrintFatalError(CurRec->getLoc(),
+                        "Undefined reference:'" + Name + "'\n");
+      }
+    }
+    break;
+  }
+  case HEAD: {
+    if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
+      if (LHSl->getSize() == 0) {
+        assert(0 && "Empty list in car");
+        return 0;
+      }
+      return LHSl->getElement(0);
+    }
+    break;
+  }
+  case TAIL: {
+    if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
+      if (LHSl->getSize() == 0) {
+        assert(0 && "Empty list in cdr");
+        return 0;
+      }
+      // Note the +1.  We can't just pass the result of getValues()
+      // directly.
+      ArrayRef<Init *>::iterator begin = LHSl->getValues().begin()+1;
+      ArrayRef<Init *>::iterator end   = LHSl->getValues().end();
+      ListInit *Result =
+        ListInit::get(ArrayRef<Init *>(begin, end - begin),
+                      LHSl->getType());
+      return Result;
+    }
+    break;
+  }
+  case EMPTY: {
+    if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
+      if (LHSl->getSize() == 0) {
+        return IntInit::get(1);
+      } else {
+        return IntInit::get(0);
+      }
+    }
+    if (StringInit *LHSs = dyn_cast<StringInit>(LHS)) {
+      if (LHSs->getValue().empty()) {
+        return IntInit::get(1);
+      } else {
+        return IntInit::get(0);
+      }
+    }
+
+    break;
+  }
+  }
+  return const_cast<UnOpInit *>(this);
+}
+
+Init *UnOpInit::resolveReferences(Record &R, const RecordVal *RV) const {
+  Init *lhs = LHS->resolveReferences(R, RV);
+
+  if (LHS != lhs)
+    return (UnOpInit::get(getOpcode(), lhs, getType()))->Fold(&R, 0);
+  return Fold(&R, 0);
+}
+
+std::string UnOpInit::getAsString() const {
+  std::string Result;
+  switch (Opc) {
+  case CAST: Result = "!cast<" + getType()->getAsString() + ">"; break;
+  case HEAD: Result = "!head"; break;
+  case TAIL: Result = "!tail"; break;
+  case EMPTY: Result = "!empty"; break;
+  }
+  return Result + "(" + LHS->getAsString() + ")";
+}
+
+BinOpInit *BinOpInit::get(BinaryOp opc, Init *lhs,
+                          Init *rhs, RecTy *Type) {
+  typedef std::pair<
+    std::pair<std::pair<unsigned, Init *>, Init *>,
+    RecTy *
+    > Key;
+
+  typedef DenseMap<Key, BinOpInit *> Pool;
+  static Pool ThePool;  
+
+  Key TheKey(std::make_pair(std::make_pair(std::make_pair(opc, lhs), rhs),
+                            Type));
+
+  BinOpInit *&I = ThePool[TheKey];
+  if (!I) I = new BinOpInit(opc, lhs, rhs, Type);
+  return I;
+}
+
+Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
+  switch (getOpcode()) {
+  case CONCAT: {
+    DagInit *LHSs = dyn_cast<DagInit>(LHS);
+    DagInit *RHSs = dyn_cast<DagInit>(RHS);
+    if (LHSs && RHSs) {
+      DefInit *LOp = dyn_cast<DefInit>(LHSs->getOperator());
+      DefInit *ROp = dyn_cast<DefInit>(RHSs->getOperator());
+      if (LOp == 0 || ROp == 0 || LOp->getDef() != ROp->getDef())
+        PrintFatalError("Concated Dag operators do not match!");
+      std::vector<Init*> Args;
+      std::vector<std::string> ArgNames;
+      for (unsigned i = 0, e = LHSs->getNumArgs(); i != e; ++i) {
+        Args.push_back(LHSs->getArg(i));
+        ArgNames.push_back(LHSs->getArgName(i));
+      }
+      for (unsigned i = 0, e = RHSs->getNumArgs(); i != e; ++i) {
+        Args.push_back(RHSs->getArg(i));
+        ArgNames.push_back(RHSs->getArgName(i));
+      }
+      return DagInit::get(LHSs->getOperator(), "", Args, ArgNames);
+    }
+    break;
+  }
+  case STRCONCAT: {
+    StringInit *LHSs = dyn_cast<StringInit>(LHS);
+    StringInit *RHSs = dyn_cast<StringInit>(RHS);
+    if (LHSs && RHSs)
+      return StringInit::get(LHSs->getValue() + RHSs->getValue());
+    break;
+  }
+  case EQ: {
+    // try to fold eq comparison for 'bit' and 'int', otherwise fallback
+    // to string objects.
+    IntInit *L =
+      dyn_cast_or_null<IntInit>(LHS->convertInitializerTo(IntRecTy::get()));
+    IntInit *R =
+      dyn_cast_or_null<IntInit>(RHS->convertInitializerTo(IntRecTy::get()));
+
+    if (L && R)
+      return IntInit::get(L->getValue() == R->getValue());
+
+    StringInit *LHSs = dyn_cast<StringInit>(LHS);
+    StringInit *RHSs = dyn_cast<StringInit>(RHS);
+
+    // Make sure we've resolved
+    if (LHSs && RHSs)
+      return IntInit::get(LHSs->getValue() == RHSs->getValue());
+
+    break;
+  }
+  case ADD:
+  case SHL:
+  case SRA:
+  case SRL: {
+    IntInit *LHSi = dyn_cast<IntInit>(LHS);
+    IntInit *RHSi = dyn_cast<IntInit>(RHS);
+    if (LHSi && RHSi) {
+      int64_t LHSv = LHSi->getValue(), RHSv = RHSi->getValue();
+      int64_t Result;
+      switch (getOpcode()) {
+      default: llvm_unreachable("Bad opcode!");
+      case ADD: Result = LHSv +  RHSv; break;
+      case SHL: Result = LHSv << RHSv; break;
+      case SRA: Result = LHSv >> RHSv; break;
+      case SRL: Result = (uint64_t)LHSv >> (uint64_t)RHSv; break;
+      }
+      return IntInit::get(Result);
+    }
+    break;
+  }
+  }
+  return const_cast<BinOpInit *>(this);
+}
+
+Init *BinOpInit::resolveReferences(Record &R, const RecordVal *RV) const {
+  Init *lhs = LHS->resolveReferences(R, RV);
+  Init *rhs = RHS->resolveReferences(R, RV);
+
+  if (LHS != lhs || RHS != rhs)
+    return (BinOpInit::get(getOpcode(), lhs, rhs, getType()))->Fold(&R, 0);
+  return Fold(&R, 0);
+}
+
+std::string BinOpInit::getAsString() const {
+  std::string Result;
+  switch (Opc) {
+  case CONCAT: Result = "!con"; break;
+  case ADD: Result = "!add"; break;
+  case SHL: Result = "!shl"; break;
+  case SRA: Result = "!sra"; break;
+  case SRL: Result = "!srl"; break;
+  case EQ: Result = "!eq"; break;
+  case STRCONCAT: Result = "!strconcat"; break;
+  }
+  return Result + "(" + LHS->getAsString() + ", " + RHS->getAsString() + ")";
+}
+
+TernOpInit *TernOpInit::get(TernaryOp opc, Init *lhs,
+                                  Init *mhs, Init *rhs,
+                                  RecTy *Type) {
+  typedef std::pair<
+    std::pair<
+      std::pair<std::pair<unsigned, RecTy *>, Init *>,
+      Init *
+      >,
+    Init *
+    > Key;
+
+  typedef DenseMap<Key, TernOpInit *> Pool;
+  static Pool ThePool;
+
+  Key TheKey(std::make_pair(std::make_pair(std::make_pair(std::make_pair(opc,
+                                                                         Type),
+                                                          lhs),
+                                           mhs),
+                            rhs));
+
+  TernOpInit *&I = ThePool[TheKey];
+  if (!I) I = new TernOpInit(opc, lhs, mhs, rhs, Type);
+  return I;
+}
+
+static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
+                           Record *CurRec, MultiClass *CurMultiClass);
+
+static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg,
+                               RecTy *Type, Record *CurRec,
+                               MultiClass *CurMultiClass) {
+  std::vector<Init *> NewOperands;
+
+  TypedInit *TArg = dyn_cast<TypedInit>(Arg);
+
+  // If this is a dag, recurse
+  if (TArg && TArg->getType()->getAsString() == "dag") {
+    Init *Result = ForeachHelper(LHS, Arg, RHSo, Type,
+                                 CurRec, CurMultiClass);
+    if (Result != 0) {
+      return Result;
+    } else {
+      return 0;
+    }
+  }
+
+  for (int i = 0; i < RHSo->getNumOperands(); ++i) {
+    OpInit *RHSoo = dyn_cast<OpInit>(RHSo->getOperand(i));
+
+    if (RHSoo) {
+      Init *Result = EvaluateOperation(RHSoo, LHS, Arg,
+                                       Type, CurRec, CurMultiClass);
+      if (Result != 0) {
+        NewOperands.push_back(Result);
+      } else {
+        NewOperands.push_back(Arg);
+      }
+    } else if (LHS->getAsString() == RHSo->getOperand(i)->getAsString()) {
+      NewOperands.push_back(Arg);
+    } else {
+      NewOperands.push_back(RHSo->getOperand(i));
+    }
+  }
+
+  // Now run the operator and use its result as the new leaf
+  const OpInit *NewOp = RHSo->clone(NewOperands);
+  Init *NewVal = NewOp->Fold(CurRec, CurMultiClass);
+  if (NewVal != NewOp)
+    return NewVal;
+
+  return 0;
+}
+
+static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
+                           Record *CurRec, MultiClass *CurMultiClass) {
+  DagInit *MHSd = dyn_cast<DagInit>(MHS);
+  ListInit *MHSl = dyn_cast<ListInit>(MHS);
+
+  OpInit *RHSo = dyn_cast<OpInit>(RHS);
+
+  if (!RHSo) {
+    PrintFatalError(CurRec->getLoc(), "!foreach requires an operator\n");
+  }
+
+  TypedInit *LHSt = dyn_cast<TypedInit>(LHS);
+
+  if (!LHSt)
+    PrintFatalError(CurRec->getLoc(), "!foreach requires typed variable\n");
+
+  if ((MHSd && isa<DagRecTy>(Type)) || (MHSl && isa<ListRecTy>(Type))) {
+    if (MHSd) {
+      Init *Val = MHSd->getOperator();
+      Init *Result = EvaluateOperation(RHSo, LHS, Val,
+                                       Type, CurRec, CurMultiClass);
+      if (Result != 0) {
+        Val = Result;
+      }
+
+      std::vector<std::pair<Init *, std::string> > args;
+      for (unsigned int i = 0; i < MHSd->getNumArgs(); ++i) {
+        Init *Arg;
+        std::string ArgName;
+        Arg = MHSd->getArg(i);
+        ArgName = MHSd->getArgName(i);
+
+        // Process args
+        Init *Result = EvaluateOperation(RHSo, LHS, Arg, Type,
+                                         CurRec, CurMultiClass);
+        if (Result != 0) {
+          Arg = Result;
+        }
+
+        // TODO: Process arg names
+        args.push_back(std::make_pair(Arg, ArgName));
+      }
+
+      return DagInit::get(Val, "", args);
+    }
+    if (MHSl) {
+      std::vector<Init *> NewOperands;
+      std::vector<Init *> NewList(MHSl->begin(), MHSl->end());
+
+      for (std::vector<Init *>::iterator li = NewList.begin(),
+             liend = NewList.end();
+           li != liend;
+           ++li) {
+        Init *Item = *li;
+        NewOperands.clear();
+        for(int i = 0; i < RHSo->getNumOperands(); ++i) {
+          // First, replace the foreach variable with the list item
+          if (LHS->getAsString() == RHSo->getOperand(i)->getAsString()) {
+            NewOperands.push_back(Item);
+          } else {
+            NewOperands.push_back(RHSo->getOperand(i));
+          }
+        }
+
+        // Now run the operator and use its result as the new list item
+        const OpInit *NewOp = RHSo->clone(NewOperands);
+        Init *NewItem = NewOp->Fold(CurRec, CurMultiClass);
+        if (NewItem != NewOp)
+          *li = NewItem;
+      }
+      return ListInit::get(NewList, MHSl->getType());
+    }
+  }
+  return 0;
+}
+
+Init *TernOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
+  switch (getOpcode()) {
+  case SUBST: {
+    DefInit *LHSd = dyn_cast<DefInit>(LHS);
+    VarInit *LHSv = dyn_cast<VarInit>(LHS);
+    StringInit *LHSs = dyn_cast<StringInit>(LHS);
+
+    DefInit *MHSd = dyn_cast<DefInit>(MHS);
+    VarInit *MHSv = dyn_cast<VarInit>(MHS);
+    StringInit *MHSs = dyn_cast<StringInit>(MHS);
+
+    DefInit *RHSd = dyn_cast<DefInit>(RHS);
+    VarInit *RHSv = dyn_cast<VarInit>(RHS);
+    StringInit *RHSs = dyn_cast<StringInit>(RHS);
+
+    if ((LHSd && MHSd && RHSd)
+        || (LHSv && MHSv && RHSv)
+        || (LHSs && MHSs && RHSs)) {
+      if (RHSd) {
+        Record *Val = RHSd->getDef();
+        if (LHSd->getAsString() == RHSd->getAsString()) {
+          Val = MHSd->getDef();
+        }
+        return DefInit::get(Val);
+      }
+      if (RHSv) {
+        std::string Val = RHSv->getName();
+        if (LHSv->getAsString() == RHSv->getAsString()) {
+          Val = MHSv->getName();
+        }
+        return VarInit::get(Val, getType());
+      }
+      if (RHSs) {
+        std::string Val = RHSs->getValue();
+
+        std::string::size_type found;
+        std::string::size_type idx = 0;
+        do {
+          found = Val.find(LHSs->getValue(), idx);
+          if (found != std::string::npos) {
+            Val.replace(found, LHSs->getValue().size(), MHSs->getValue());
+          }
+          idx = found +  MHSs->getValue().size();
+        } while (found != std::string::npos);
+
+        return StringInit::get(Val);
+      }
+    }
+    break;
+  }
+
+  case FOREACH: {
+    Init *Result = ForeachHelper(LHS, MHS, RHS, getType(),
+                                 CurRec, CurMultiClass);
+    if (Result != 0) {
+      return Result;
+    }
+    break;
+  }
+
+  case IF: {
+    IntInit *LHSi = dyn_cast<IntInit>(LHS);
+    if (Init *I = LHS->convertInitializerTo(IntRecTy::get()))
+      LHSi = dyn_cast<IntInit>(I);
+    if (LHSi) {
+      if (LHSi->getValue()) {
+        return MHS;
+      } else {
+        return RHS;
+      }
+    }
+    break;
+  }
+  }
+
+  return const_cast<TernOpInit *>(this);
+}
+
+Init *TernOpInit::resolveReferences(Record &R,
+                                    const RecordVal *RV) const {
+  Init *lhs = LHS->resolveReferences(R, RV);
+
+  if (Opc == IF && lhs != LHS) {
+    IntInit *Value = dyn_cast<IntInit>(lhs);
+    if (Init *I = lhs->convertInitializerTo(IntRecTy::get()))
+      Value = dyn_cast<IntInit>(I);
+    if (Value != 0) {
+      // Short-circuit
+      if (Value->getValue()) {
+        Init *mhs = MHS->resolveReferences(R, RV);
+        return (TernOpInit::get(getOpcode(), lhs, mhs,
+                                RHS, getType()))->Fold(&R, 0);
+      } else {
+        Init *rhs = RHS->resolveReferences(R, RV);
+        return (TernOpInit::get(getOpcode(), lhs, MHS,
+                                rhs, getType()))->Fold(&R, 0);
+      }
+    }
+  }
+
+  Init *mhs = MHS->resolveReferences(R, RV);
+  Init *rhs = RHS->resolveReferences(R, RV);
+
+  if (LHS != lhs || MHS != mhs || RHS != rhs)
+    return (TernOpInit::get(getOpcode(), lhs, mhs, rhs,
+                            getType()))->Fold(&R, 0);
+  return Fold(&R, 0);
+}
+
+std::string TernOpInit::getAsString() const {
+  std::string Result;
+  switch (Opc) {
+  case SUBST: Result = "!subst"; break;
+  case FOREACH: Result = "!foreach"; break;
+  case IF: Result = "!if"; break;
+ }
+  return Result + "(" + LHS->getAsString() + ", " + MHS->getAsString() + ", "
+    + RHS->getAsString() + ")";
+}
+
+RecTy *TypedInit::getFieldType(const std::string &FieldName) const {
+  if (RecordRecTy *RecordType = dyn_cast<RecordRecTy>(getType()))
+    if (RecordVal *Field = RecordType->getRecord()->getValue(FieldName))
+      return Field->getType();
+  return 0;
+}
+
+Init *
+TypedInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
+  BitsRecTy *T = dyn_cast<BitsRecTy>(getType());
+  if (T == 0) return 0;  // Cannot subscript a non-bits variable.
+  unsigned NumBits = T->getNumBits();
+
+  SmallVector<Init *, 16> NewBits(Bits.size());
+  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+    if (Bits[i] >= NumBits)
+      return 0;
+
+    NewBits[i] = VarBitInit::get(const_cast<TypedInit *>(this), Bits[i]);
+  }
+  return BitsInit::get(NewBits);
+}
+
+Init *
+TypedInit::convertInitListSlice(const std::vector<unsigned> &Elements) const {
+  ListRecTy *T = dyn_cast<ListRecTy>(getType());
+  if (T == 0) return 0;  // Cannot subscript a non-list variable.
+
+  if (Elements.size() == 1)
+    return VarListElementInit::get(const_cast<TypedInit *>(this), Elements[0]);
+
+  std::vector<Init*> ListInits;
+  ListInits.reserve(Elements.size());
+  for (unsigned i = 0, e = Elements.size(); i != e; ++i)
+    ListInits.push_back(VarListElementInit::get(const_cast<TypedInit *>(this),
+                                                Elements[i]));
+  return ListInit::get(ListInits, T);
+}
+
+
+VarInit *VarInit::get(const std::string &VN, RecTy *T) {
+  Init *Value = StringInit::get(VN);
+  return VarInit::get(Value, T);
+}
+
+VarInit *VarInit::get(Init *VN, RecTy *T) {
+  typedef std::pair<RecTy *, Init *> Key;
+  typedef DenseMap<Key, VarInit *> Pool;
+  static Pool ThePool;
+
+  Key TheKey(std::make_pair(T, VN));
+
+  VarInit *&I = ThePool[TheKey];
+  if (!I) I = new VarInit(VN, T);
+  return I;
+}
+
+const std::string &VarInit::getName() const {
+  StringInit *NameString = dyn_cast<StringInit>(getNameInit());
+  assert(NameString && "VarInit name is not a string!");
+  return NameString->getValue();
+}
+
+Init *VarInit::getBit(unsigned Bit) const {
+  if (getType() == BitRecTy::get())
+    return const_cast<VarInit*>(this);
+  return VarBitInit::get(const_cast<VarInit*>(this), Bit);
+}
+
+Init *VarInit::resolveListElementReference(Record &R,
+                                           const RecordVal *IRV,
+                                           unsigned Elt) const {
+  if (R.isTemplateArg(getNameInit())) return 0;
+  if (IRV && IRV->getNameInit() != getNameInit()) return 0;
+
+  RecordVal *RV = R.getValue(getNameInit());
+  assert(RV && "Reference to a non-existent variable?");
+  ListInit *LI = dyn_cast<ListInit>(RV->getValue());
+  if (!LI) {
+    TypedInit *VI = dyn_cast<TypedInit>(RV->getValue());
+    assert(VI && "Invalid list element!");
+    return VarListElementInit::get(VI, Elt);
+  }
+
+  if (Elt >= LI->getSize())
+    return 0;  // Out of range reference.
+  Init *E = LI->getElement(Elt);
+  // If the element is set to some value, or if we are resolving a reference
+  // to a specific variable and that variable is explicitly unset, then
+  // replace the VarListElementInit with it.
+  if (IRV || !isa<UnsetInit>(E))
+    return E;
+  return 0;
+}
+
+
+RecTy *VarInit::getFieldType(const std::string &FieldName) const {
+  if (RecordRecTy *RTy = dyn_cast<RecordRecTy>(getType()))
+    if (const RecordVal *RV = RTy->getRecord()->getValue(FieldName))
+      return RV->getType();
+  return 0;
+}
+
+Init *VarInit::getFieldInit(Record &R, const RecordVal *RV,
+                            const std::string &FieldName) const {
+  if (isa<RecordRecTy>(getType()))
+    if (const RecordVal *Val = R.getValue(VarName)) {
+      if (RV != Val && (RV || isa<UnsetInit>(Val->getValue())))
+        return 0;
+      Init *TheInit = Val->getValue();
+      assert(TheInit != this && "Infinite loop detected!");
+      if (Init *I = TheInit->getFieldInit(R, RV, FieldName))
+        return I;
+      else
+        return 0;
+    }
+  return 0;
+}
+
+/// resolveReferences - This method is used by classes that refer to other
+/// variables which may not be defined at the time the expression is formed.
+/// If a value is set for the variable later, this method will be called on
+/// users of the value to allow the value to propagate out.
+///
+Init *VarInit::resolveReferences(Record &R, const RecordVal *RV) const {
+  if (RecordVal *Val = R.getValue(VarName))
+    if (RV == Val || (RV == 0 && !isa<UnsetInit>(Val->getValue())))
+      return Val->getValue();
+  return const_cast<VarInit *>(this);
+}
+
+VarBitInit *VarBitInit::get(TypedInit *T, unsigned B) {
+  typedef std::pair<TypedInit *, unsigned> Key;
+  typedef DenseMap<Key, VarBitInit *> Pool;
+
+  static Pool ThePool;
+
+  Key TheKey(std::make_pair(T, B));
+
+  VarBitInit *&I = ThePool[TheKey];
+  if (!I) I = new VarBitInit(T, B);
+  return I;
+}
+
+std::string VarBitInit::getAsString() const {
+   return TI->getAsString() + "{" + utostr(Bit) + "}";
+}
+
+Init *VarBitInit::resolveReferences(Record &R, const RecordVal *RV) const {
+  Init *I = TI->resolveReferences(R, RV);
+  if (TI != I)
+    return I->getBit(getBitNum());
+
+  return const_cast<VarBitInit*>(this);
+}
+
+VarListElementInit *VarListElementInit::get(TypedInit *T,
+                                            unsigned E) {
+  typedef std::pair<TypedInit *, unsigned> Key;
+  typedef DenseMap<Key, VarListElementInit *> Pool;
+
+  static Pool ThePool;
+
+  Key TheKey(std::make_pair(T, E));
+
+  VarListElementInit *&I = ThePool[TheKey];
+  if (!I) I = new VarListElementInit(T, E);
+  return I;
+}
+
+std::string VarListElementInit::getAsString() const {
+  return TI->getAsString() + "[" + utostr(Element) + "]";
+}
+
+Init *
+VarListElementInit::resolveReferences(Record &R, const RecordVal *RV) const {
+  if (Init *I = getVariable()->resolveListElementReference(R, RV,
+                                                           getElementNum()))
+    return I;
+  return const_cast<VarListElementInit *>(this);
+}
+
+Init *VarListElementInit::getBit(unsigned Bit) const {
+  if (getType() == BitRecTy::get())
+    return const_cast<VarListElementInit*>(this);
+  return VarBitInit::get(const_cast<VarListElementInit*>(this), Bit);
+}
+
+Init *VarListElementInit:: resolveListElementReference(Record &R,
+                                                       const RecordVal *RV,
+                                                       unsigned Elt) const {
+  Init *Result = TI->resolveListElementReference(R, RV, Element);
+  
+  if (Result) {
+    if (TypedInit *TInit = dyn_cast<TypedInit>(Result)) {
+      Init *Result2 = TInit->resolveListElementReference(R, RV, Elt);
+      if (Result2) return Result2;
+      return new VarListElementInit(TInit, Elt);
+    }
+    return Result;
+  }
+ 
+  return 0;
+}
+
+DefInit *DefInit::get(Record *R) {
+  return R->getDefInit();
+}
+
+RecTy *DefInit::getFieldType(const std::string &FieldName) const {
+  if (const RecordVal *RV = Def->getValue(FieldName))
+    return RV->getType();
+  return 0;
+}
+
+Init *DefInit::getFieldInit(Record &R, const RecordVal *RV,
+                            const std::string &FieldName) const {
+  return Def->getValue(FieldName)->getValue();
+}
+
+
+std::string DefInit::getAsString() const {
+  return Def->getName();
+}
+
+FieldInit *FieldInit::get(Init *R, const std::string &FN) {
+  typedef std::pair<Init *, TableGenStringKey> Key;
+  typedef DenseMap<Key, FieldInit *> Pool;
+  static Pool ThePool;  
+
+  Key TheKey(std::make_pair(R, FN));
+
+  FieldInit *&I = ThePool[TheKey];
+  if (!I) I = new FieldInit(R, FN);
+  return I;
+}
+
+Init *FieldInit::getBit(unsigned Bit) const {
+  if (getType() == BitRecTy::get())
+    return const_cast<FieldInit*>(this);
+  return VarBitInit::get(const_cast<FieldInit*>(this), Bit);
+}
+
+Init *FieldInit::resolveListElementReference(Record &R, const RecordVal *RV,
+                                             unsigned Elt) const {
+  if (Init *ListVal = Rec->getFieldInit(R, RV, FieldName))
+    if (ListInit *LI = dyn_cast<ListInit>(ListVal)) {
+      if (Elt >= LI->getSize()) return 0;
+      Init *E = LI->getElement(Elt);
+
+      // If the element is set to some value, or if we are resolving a
+      // reference to a specific variable and that variable is explicitly
+      // unset, then replace the VarListElementInit with it.
+      if (RV || !isa<UnsetInit>(E))
+        return E;
+    }
+  return 0;
+}
+
+Init *FieldInit::resolveReferences(Record &R, const RecordVal *RV) const {
+  Init *NewRec = RV ? Rec->resolveReferences(R, RV) : Rec;
+
+  Init *BitsVal = NewRec->getFieldInit(R, RV, FieldName);
+  if (BitsVal) {
+    Init *BVR = BitsVal->resolveReferences(R, RV);
+    return BVR->isComplete() ? BVR : const_cast<FieldInit *>(this);
+  }
+
+  if (NewRec != Rec) {
+    return FieldInit::get(NewRec, FieldName);
+  }
+  return const_cast<FieldInit *>(this);
+}
+
+static void ProfileDagInit(FoldingSetNodeID &ID, Init *V, const std::string &VN,
+                           ArrayRef<Init *> ArgRange,
+                           ArrayRef<std::string> NameRange) {
+  ID.AddPointer(V);
+  ID.AddString(VN);
+
+  ArrayRef<Init *>::iterator Arg  = ArgRange.begin();
+  ArrayRef<std::string>::iterator  Name = NameRange.begin();
+  while (Arg != ArgRange.end()) {
+    assert(Name != NameRange.end() && "Arg name underflow!");
+    ID.AddPointer(*Arg++);
+    ID.AddString(*Name++);
+  }
+  assert(Name == NameRange.end() && "Arg name overflow!");
+}
+
+DagInit *
+DagInit::get(Init *V, const std::string &VN,
+             ArrayRef<Init *> ArgRange,
+             ArrayRef<std::string> NameRange) {
+  typedef FoldingSet<DagInit> Pool;
+  static Pool ThePool;  
+
+  FoldingSetNodeID ID;
+  ProfileDagInit(ID, V, VN, ArgRange, NameRange);
+
+  void *IP = 0;
+  if (DagInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+    return I;
+
+  DagInit *I = new DagInit(V, VN, ArgRange, NameRange);
+  ThePool.InsertNode(I, IP);
+
+  return I;
+}
+
+DagInit *
+DagInit::get(Init *V, const std::string &VN,
+             const std::vector<std::pair<Init*, std::string> > &args) {
+  typedef std::pair<Init*, std::string> PairType;
+
+  std::vector<Init *> Args;
+  std::vector<std::string> Names;
+
+  for (std::vector<PairType>::const_iterator i = args.begin(),
+         iend = args.end();
+       i != iend;
+       ++i) {
+    Args.push_back(i->first);
+    Names.push_back(i->second);
+  }
+
+  return DagInit::get(V, VN, Args, Names);
+}
+
+void DagInit::Profile(FoldingSetNodeID &ID) const {
+  ProfileDagInit(ID, Val, ValName, Args, ArgNames);
+}
+
+Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const {
+  std::vector<Init*> NewArgs;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i)
+    NewArgs.push_back(Args[i]->resolveReferences(R, RV));
+
+  Init *Op = Val->resolveReferences(R, RV);
+
+  if (Args != NewArgs || Op != Val)
+    return DagInit::get(Op, ValName, NewArgs, ArgNames);
+
+  return const_cast<DagInit *>(this);
+}
+
+
+std::string DagInit::getAsString() const {
+  std::string Result = "(" + Val->getAsString();
+  if (!ValName.empty())
+    Result += ":" + ValName;
+  if (Args.size()) {
+    Result += " " + Args[0]->getAsString();
+    if (!ArgNames[0].empty()) Result += ":$" + ArgNames[0];
+    for (unsigned i = 1, e = Args.size(); i != e; ++i) {
+      Result += ", " + Args[i]->getAsString();
+      if (!ArgNames[i].empty()) Result += ":$" + ArgNames[i];
+    }
+  }
+  return Result + ")";
+}
+
+
+//===----------------------------------------------------------------------===//
+//    Other implementations
+//===----------------------------------------------------------------------===//
+
+RecordVal::RecordVal(Init *N, RecTy *T, unsigned P)
+  : Name(N), Ty(T), Prefix(P) {
+  Value = Ty->convertValue(UnsetInit::get());
+  assert(Value && "Cannot create unset value for current type!");
+}
+
+RecordVal::RecordVal(const std::string &N, RecTy *T, unsigned P)
+  : Name(StringInit::get(N)), Ty(T), Prefix(P) {
+  Value = Ty->convertValue(UnsetInit::get());
+  assert(Value && "Cannot create unset value for current type!");
+}
+
+const std::string &RecordVal::getName() const {
+  StringInit *NameString = dyn_cast<StringInit>(Name);
+  assert(NameString && "RecordVal name is not a string!");
+  return NameString->getValue();
+}
+
+void RecordVal::dump() const { errs() << *this; }
+
+void RecordVal::print(raw_ostream &OS, bool PrintSem) const {
+  if (getPrefix()) OS << "field ";
+  OS << *getType() << " " << getNameInitAsString();
+
+  if (getValue())
+    OS << " = " << *getValue();
+
+  if (PrintSem) OS << ";\n";
+}
+
+unsigned Record::LastID = 0;
+
+void Record::init() {
+  checkName();
+
+  // Every record potentially has a def at the top.  This value is
+  // replaced with the top-level def name at instantiation time.
+  RecordVal DN("NAME", StringRecTy::get(), 0);
+  addValue(DN);
+}
+
+void Record::checkName() {
+  // Ensure the record name has string type.
+  const TypedInit *TypedName = dyn_cast<const TypedInit>(Name);
+  assert(TypedName && "Record name is not typed!");
+  RecTy *Type = TypedName->getType();
+  if (!isa<StringRecTy>(Type))
+    PrintFatalError(getLoc(), "Record name is not a string!");
+}
+
+DefInit *Record::getDefInit() {
+  if (!TheInit)
+    TheInit = new DefInit(this, new RecordRecTy(this));
+  return TheInit;
+}
+
+const std::string &Record::getName() const {
+  const StringInit *NameString = dyn_cast<StringInit>(Name);
+  assert(NameString && "Record name is not a string!");
+  return NameString->getValue();
+}
+
+void Record::setName(Init *NewName) {
+  if (TrackedRecords.getDef(Name->getAsUnquotedString()) == this) {
+    TrackedRecords.removeDef(Name->getAsUnquotedString());
+    TrackedRecords.addDef(this);
+  } else if (TrackedRecords.getClass(Name->getAsUnquotedString()) == this) {
+    TrackedRecords.removeClass(Name->getAsUnquotedString());
+    TrackedRecords.addClass(this);
+  }  // Otherwise this isn't yet registered.
+  Name = NewName;
+  checkName();
+  // DO NOT resolve record values to the name at this point because
+  // there might be default values for arguments of this def.  Those
+  // arguments might not have been resolved yet so we don't want to
+  // prematurely assume values for those arguments were not passed to
+  // this def.
+  //
+  // Nonetheless, it may be that some of this Record's values
+  // reference the record name.  Indeed, the reason for having the
+  // record name be an Init is to provide this flexibility.  The extra
+  // resolve steps after completely instantiating defs takes care of
+  // this.  See TGParser::ParseDef and TGParser::ParseDefm.
+}
+
+void Record::setName(const std::string &Name) {
+  setName(StringInit::get(Name));
+}
+
+/// resolveReferencesTo - If anything in this record refers to RV, replace the
+/// reference to RV with the RHS of RV.  If RV is null, we resolve all possible
+/// references.
+void Record::resolveReferencesTo(const RecordVal *RV) {
+  for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+    if (RV == &Values[i]) // Skip resolve the same field as the given one
+      continue;
+    if (Init *V = Values[i].getValue())
+      if (Values[i].setValue(V->resolveReferences(*this, RV)))
+        PrintFatalError(getLoc(), "Invalid value is found when setting '"
+                      + Values[i].getNameInitAsString()
+                      + "' after resolving references"
+                      + (RV ? " against '" + RV->getNameInitAsString()
+                              + "' of ("
+                              + RV->getValue()->getAsUnquotedString() + ")"
+                            : "")
+                      + "\n");
+  }
+  Init *OldName = getNameInit();
+  Init *NewName = Name->resolveReferences(*this, RV);
+  if (NewName != OldName) {
+    // Re-register with RecordKeeper.
+    setName(NewName);
+  }
+}
+
+void Record::dump() const { errs() << *this; }
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
+  OS << R.getNameInitAsString();
+
+  const std::vector<Init *> &TArgs = R.getTemplateArgs();
+  if (!TArgs.empty()) {
+    OS << "<";
+    for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+      if (i) OS << ", ";
+      const RecordVal *RV = R.getValue(TArgs[i]);
+      assert(RV && "Template argument record not found??");
+      RV->print(OS, false);
+    }
+    OS << ">";
+  }
+
+  OS << " {";
+  const std::vector<Record*> &SC = R.getSuperClasses();
+  if (!SC.empty()) {
+    OS << "\t//";
+    for (unsigned i = 0, e = SC.size(); i != e; ++i)
+      OS << " " << SC[i]->getNameInitAsString();
+  }
+  OS << "\n";
+
+  const std::vector<RecordVal> &Vals = R.getValues();
+  for (unsigned i = 0, e = Vals.size(); i != e; ++i)
+    if (Vals[i].getPrefix() && !R.isTemplateArg(Vals[i].getName()))
+      OS << Vals[i];
+  for (unsigned i = 0, e = Vals.size(); i != e; ++i)
+    if (!Vals[i].getPrefix() && !R.isTemplateArg(Vals[i].getName()))
+      OS << Vals[i];
+
+  return OS << "}\n";
+}
+
+/// getValueInit - Return the initializer for a value with the specified name,
+/// or abort if the field does not exist.
+///
+Init *Record::getValueInit(StringRef FieldName) const {
+  const RecordVal *R = getValue(FieldName);
+  if (R == 0 || R->getValue() == 0)
+    PrintFatalError(getLoc(), "Record `" + getName() +
+      "' does not have a field named `" + FieldName.str() + "'!\n");
+  return R->getValue();
+}
+
+
+/// getValueAsString - This method looks up the specified field and returns its
+/// value as a string, aborts if the field does not exist or if
+/// the value is not a string.
+///
+std::string Record::getValueAsString(StringRef FieldName) const {
+  const RecordVal *R = getValue(FieldName);
+  if (R == 0 || R->getValue() == 0)
+    PrintFatalError(getLoc(), "Record `" + getName() +
+      "' does not have a field named `" + FieldName.str() + "'!\n");
+
+  if (StringInit *SI = dyn_cast<StringInit>(R->getValue()))
+    return SI->getValue();
+  PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+    FieldName.str() + "' does not have a string initializer!");
+}
+
+/// getValueAsBitsInit - This method looks up the specified field and returns
+/// its value as a BitsInit, aborts if the field does not exist or if
+/// the value is not the right type.
+///
+BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const {
+  const RecordVal *R = getValue(FieldName);
+  if (R == 0 || R->getValue() == 0)
+    PrintFatalError(getLoc(), "Record `" + getName() +
+      "' does not have a field named `" + FieldName.str() + "'!\n");
+
+  if (BitsInit *BI = dyn_cast<BitsInit>(R->getValue()))
+    return BI;
+  PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+    FieldName.str() + "' does not have a BitsInit initializer!");
+}
+
+/// getValueAsListInit - This method looks up the specified field and returns
+/// its value as a ListInit, aborting if the field does not exist or if
+/// the value is not the right type.
+///
+ListInit *Record::getValueAsListInit(StringRef FieldName) const {
+  const RecordVal *R = getValue(FieldName);
+  if (R == 0 || R->getValue() == 0)
+    PrintFatalError(getLoc(), "Record `" + getName() +
+      "' does not have a field named `" + FieldName.str() + "'!\n");
+
+  if (ListInit *LI = dyn_cast<ListInit>(R->getValue()))
+    return LI;
+  PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+    FieldName.str() + "' does not have a list initializer!");
+}
+
+/// getValueAsListOfDefs - This method looks up the specified field and returns
+/// its value as a vector of records, aborting if the field does not exist
+/// or if the value is not the right type.
+///
+std::vector<Record*>
+Record::getValueAsListOfDefs(StringRef FieldName) const {
+  ListInit *List = getValueAsListInit(FieldName);
+  std::vector<Record*> Defs;
+  for (unsigned i = 0; i < List->getSize(); i++) {
+    if (DefInit *DI = dyn_cast<DefInit>(List->getElement(i))) {
+      Defs.push_back(DI->getDef());
+    } else {
+      PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+        FieldName.str() + "' list is not entirely DefInit!");
+    }
+  }
+  return Defs;
+}
+
+/// getValueAsInt - This method looks up the specified field and returns its
+/// value as an int64_t, aborting if the field does not exist or if the value
+/// is not the right type.
+///
+int64_t Record::getValueAsInt(StringRef FieldName) const {
+  const RecordVal *R = getValue(FieldName);
+  if (R == 0 || R->getValue() == 0)
+    PrintFatalError(getLoc(), "Record `" + getName() +
+      "' does not have a field named `" + FieldName.str() + "'!\n");
+
+  if (IntInit *II = dyn_cast<IntInit>(R->getValue()))
+    return II->getValue();
+  PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+    FieldName.str() + "' does not have an int initializer!");
+}
+
+/// getValueAsListOfInts - This method looks up the specified field and returns
+/// its value as a vector of integers, aborting if the field does not exist or
+/// if the value is not the right type.
+///
+std::vector<int64_t>
+Record::getValueAsListOfInts(StringRef FieldName) const {
+  ListInit *List = getValueAsListInit(FieldName);
+  std::vector<int64_t> Ints;
+  for (unsigned i = 0; i < List->getSize(); i++) {
+    if (IntInit *II = dyn_cast<IntInit>(List->getElement(i))) {
+      Ints.push_back(II->getValue());
+    } else {
+      PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+        FieldName.str() + "' does not have a list of ints initializer!");
+    }
+  }
+  return Ints;
+}
+
+/// getValueAsListOfStrings - This method looks up the specified field and
+/// returns its value as a vector of strings, aborting if the field does not
+/// exist or if the value is not the right type.
+///
+std::vector<std::string>
+Record::getValueAsListOfStrings(StringRef FieldName) const {
+  ListInit *List = getValueAsListInit(FieldName);
+  std::vector<std::string> Strings;
+  for (unsigned i = 0; i < List->getSize(); i++) {
+    if (StringInit *II = dyn_cast<StringInit>(List->getElement(i))) {
+      Strings.push_back(II->getValue());
+    } else {
+      PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+        FieldName.str() + "' does not have a list of strings initializer!");
+    }
+  }
+  return Strings;
+}
+
+/// getValueAsDef - This method looks up the specified field and returns its
+/// value as a Record, aborting if the field does not exist or if the value
+/// is not the right type.
+///
+Record *Record::getValueAsDef(StringRef FieldName) const {
+  const RecordVal *R = getValue(FieldName);
+  if (R == 0 || R->getValue() == 0)
+    PrintFatalError(getLoc(), "Record `" + getName() +
+      "' does not have a field named `" + FieldName.str() + "'!\n");
+
+  if (DefInit *DI = dyn_cast<DefInit>(R->getValue()))
+    return DI->getDef();
+  PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+    FieldName.str() + "' does not have a def initializer!");
+}
+
+/// getValueAsBit - This method looks up the specified field and returns its
+/// value as a bit, aborting if the field does not exist or if the value is
+/// not the right type.
+///
+bool Record::getValueAsBit(StringRef FieldName) const {
+  const RecordVal *R = getValue(FieldName);
+  if (R == 0 || R->getValue() == 0)
+    PrintFatalError(getLoc(), "Record `" + getName() +
+      "' does not have a field named `" + FieldName.str() + "'!\n");
+
+  if (BitInit *BI = dyn_cast<BitInit>(R->getValue()))
+    return BI->getValue();
+  PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+    FieldName.str() + "' does not have a bit initializer!");
+}
+
+bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const {
+  const RecordVal *R = getValue(FieldName);
+  if (R == 0 || R->getValue() == 0)
+    PrintFatalError(getLoc(), "Record `" + getName() +
+      "' does not have a field named `" + FieldName.str() + "'!\n");
+
+  if (R->getValue() == UnsetInit::get()) {
+    Unset = true;
+    return false;
+  }
+  Unset = false;
+  if (BitInit *BI = dyn_cast<BitInit>(R->getValue()))
+    return BI->getValue();
+  PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+    FieldName.str() + "' does not have a bit initializer!");
+}
+
+/// getValueAsDag - This method looks up the specified field and returns its
+/// value as an Dag, aborting if the field does not exist or if the value is
+/// not the right type.
+///
+DagInit *Record::getValueAsDag(StringRef FieldName) const {
+  const RecordVal *R = getValue(FieldName);
+  if (R == 0 || R->getValue() == 0)
+    PrintFatalError(getLoc(), "Record `" + getName() +
+      "' does not have a field named `" + FieldName.str() + "'!\n");
+
+  if (DagInit *DI = dyn_cast<DagInit>(R->getValue()))
+    return DI;
+  PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+    FieldName.str() + "' does not have a dag initializer!");
+}
+
+
+void MultiClass::dump() const {
+  errs() << "Record:\n";
+  Rec.dump();
+
+  errs() << "Defs:\n";
+  for (RecordVector::const_iterator r = DefPrototypes.begin(),
+         rend = DefPrototypes.end();
+       r != rend;
+       ++r) {
+    (*r)->dump();
+  }
+}
+
+
+void RecordKeeper::dump() const { errs() << *this; }
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) {
+  OS << "------------- Classes -----------------\n";
+  const std::map<std::string, Record*> &Classes = RK.getClasses();
+  for (std::map<std::string, Record*>::const_iterator I = Classes.begin(),
+         E = Classes.end(); I != E; ++I)
+    OS << "class " << *I->second;
+
+  OS << "------------- Defs -----------------\n";
+  const std::map<std::string, Record*> &Defs = RK.getDefs();
+  for (std::map<std::string, Record*>::const_iterator I = Defs.begin(),
+         E = Defs.end(); I != E; ++I)
+    OS << "def " << *I->second;
+  return OS;
+}
+
+
+/// getAllDerivedDefinitions - This method returns all concrete definitions
+/// that derive from the specified class name.  If a class with the specified
+/// name does not exist, an error is printed and true is returned.
+std::vector<Record*>
+RecordKeeper::getAllDerivedDefinitions(const std::string &ClassName) const {
+  Record *Class = getClass(ClassName);
+  if (!Class)
+    PrintFatalError("ERROR: Couldn't find the `" + ClassName + "' class!\n");
+
+  std::vector<Record*> Defs;
+  for (std::map<std::string, Record*>::const_iterator I = getDefs().begin(),
+         E = getDefs().end(); I != E; ++I)
+    if (I->second->isSubClassOf(Class))
+      Defs.push_back(I->second);
+
+  return Defs;
+}
+
+/// QualifyName - Return an Init with a qualifier prefix referring
+/// to CurRec's name.
+Init *llvm::QualifyName(Record &CurRec, MultiClass *CurMultiClass,
+                        Init *Name, const std::string &Scoper) {
+  RecTy *Type = dyn_cast<TypedInit>(Name)->getType();
+
+  BinOpInit *NewName =
+    BinOpInit::get(BinOpInit::STRCONCAT, 
+                      BinOpInit::get(BinOpInit::STRCONCAT,
+                                        CurRec.getNameInit(),
+                                        StringInit::get(Scoper),
+                                        Type)->Fold(&CurRec, CurMultiClass),
+                      Name,
+                      Type);
+
+  if (CurMultiClass && Scoper != "::") {
+    NewName =
+      BinOpInit::get(BinOpInit::STRCONCAT, 
+                        BinOpInit::get(BinOpInit::STRCONCAT,
+                                          CurMultiClass->Rec.getNameInit(),
+                                          StringInit::get("::"),
+                                          Type)->Fold(&CurRec, CurMultiClass),
+                        NewName->Fold(&CurRec, CurMultiClass),
+                        Type);
+  }
+
+  return NewName->Fold(&CurRec, CurMultiClass);
+}
+
+/// QualifyName - Return an Init with a qualifier prefix referring
+/// to CurRec's name.
+Init *llvm::QualifyName(Record &CurRec, MultiClass *CurMultiClass,
+                        const std::string &Name,
+                        const std::string &Scoper) {
+  return QualifyName(CurRec, CurMultiClass, StringInit::get(Name), Scoper);
+}
diff --git a/contrib/llvm/lib/TableGen/StringMatcher.cpp b/contrib/llvm/lib/TableGen/StringMatcher.cpp
new file mode 100644
index 000000000000..16681702d1d6
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/StringMatcher.cpp
@@ -0,0 +1,149 @@
+//===- StringMatcher.cpp - Generate a matcher for input strings -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StringMatcher class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TableGen/StringMatcher.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+using namespace llvm;
+
+/// FindFirstNonCommonLetter - Find the first character in the keys of the
+/// string pairs that is not shared across the whole set of strings.  All
+/// strings are assumed to have the same length.
+static unsigned 
+FindFirstNonCommonLetter(const std::vector<const
+                              StringMatcher::StringPair*> &Matches) {
+  assert(!Matches.empty());
+  for (unsigned i = 0, e = Matches[0]->first.size(); i != e; ++i) {
+    // Check to see if letter i is the same across the set.
+    char Letter = Matches[0]->first[i];
+    
+    for (unsigned str = 0, e = Matches.size(); str != e; ++str)
+      if (Matches[str]->first[i] != Letter)
+        return i;
+  }
+  
+  return Matches[0]->first.size();
+}
+
+/// EmitStringMatcherForChar - Given a set of strings that are known to be the
+/// same length and whose characters leading up to CharNo are the same, emit
+/// code to verify that CharNo and later are the same.
+///
+/// \return - True if control can leave the emitted code fragment.
+bool StringMatcher::
+EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
+                         unsigned CharNo, unsigned IndentCount) const {
+  assert(!Matches.empty() && "Must have at least one string to match!");
+  std::string Indent(IndentCount*2+4, ' ');
+  
+  // If we have verified that the entire string matches, we're done: output the
+  // matching code.
+  if (CharNo == Matches[0]->first.size()) {
+    assert(Matches.size() == 1 && "Had duplicate keys to match on");
+    
+    // If the to-execute code has \n's in it, indent each subsequent line.
+    StringRef Code = Matches[0]->second;
+    
+    std::pair<StringRef, StringRef> Split = Code.split('\n');
+    OS << Indent << Split.first << "\t // \"" << Matches[0]->first << "\"\n";
+
+    Code = Split.second;
+    while (!Code.empty()) {
+      Split = Code.split('\n');
+      OS << Indent << Split.first << "\n";
+      Code = Split.second;
+    }
+    return false;
+  }
+  
+  // Bucket the matches by the character we are comparing.
+  std::map<char, std::vector<const StringPair*> > MatchesByLetter;
+  
+  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
+    MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]);
+  
+  
+  // If we have exactly one bucket to match, see how many characters are common
+  // across the whole set and match all of them at once.
+  if (MatchesByLetter.size() == 1) {
+    unsigned FirstNonCommonLetter = FindFirstNonCommonLetter(Matches);
+    unsigned NumChars = FirstNonCommonLetter-CharNo;
+    
+    // Emit code to break out if the prefix doesn't match.
+    if (NumChars == 1) {
+      // Do the comparison with if (Str[1] != 'f')
+      // FIXME: Need to escape general characters.
+      OS << Indent << "if (" << StrVariableName << "[" << CharNo << "] != '"
+      << Matches[0]->first[CharNo] << "')\n";
+      OS << Indent << "  break;\n";
+    } else {
+      // Do the comparison with if memcmp(Str.data()+1, "foo", 3).
+      // FIXME: Need to escape general strings.
+      OS << Indent << "if (memcmp(" << StrVariableName << ".data()+" << CharNo
+         << ", \"" << Matches[0]->first.substr(CharNo, NumChars) << "\", "
+         << NumChars << "))\n";
+      OS << Indent << "  break;\n";
+    }
+    
+    return EmitStringMatcherForChar(Matches, FirstNonCommonLetter, IndentCount);
+  }
+  
+  // Otherwise, we have multiple possible things, emit a switch on the
+  // character.
+  OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n";
+  OS << Indent << "default: break;\n";
+  
+  for (std::map<char, std::vector<const StringPair*> >::iterator LI = 
+       MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) {
+    // TODO: escape hard stuff (like \n) if we ever care about it.
+    OS << Indent << "case '" << LI->first << "':\t // "
+       << LI->second.size() << " string";
+    if (LI->second.size() != 1) OS << 's';
+    OS << " to match.\n";
+    if (EmitStringMatcherForChar(LI->second, CharNo+1, IndentCount+1))
+      OS << Indent << "  break;\n";
+  }
+  
+  OS << Indent << "}\n";
+  return true;
+}
+
+
+/// Emit - Top level entry point.
+///
+void StringMatcher::Emit(unsigned Indent) const {
+  // If nothing to match, just fall through.
+  if (Matches.empty()) return;
+  
+  // First level categorization: group strings by length.
+  std::map<unsigned, std::vector<const StringPair*> > MatchesByLength;
+  
+  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
+    MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]);
+  
+  // Output a switch statement on length and categorize the elements within each
+  // bin.
+  OS.indent(Indent*2+2) << "switch (" << StrVariableName << ".size()) {\n";
+  OS.indent(Indent*2+2) << "default: break;\n";
+  
+  for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI =
+       MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) {
+    OS.indent(Indent*2+2) << "case " << LI->first << ":\t // "
+       << LI->second.size()
+       << " string" << (LI->second.size() == 1 ? "" : "s") << " to match.\n";
+    if (EmitStringMatcherForChar(LI->second, 0, Indent))
+      OS.indent(Indent*2+4) << "break;\n";
+  }
+  
+  OS.indent(Indent*2+2) << "}\n";
+}
diff --git a/contrib/llvm/lib/TableGen/TGLexer.cpp b/contrib/llvm/lib/TableGen/TGLexer.cpp
new file mode 100644
index 000000000000..c6be4f8a1189
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/TGLexer.cpp
@@ -0,0 +1,486 @@
+//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement the Lexer for TableGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TGLexer.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Config/config.h" // for strtoull()/strtoll() define
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/TableGen/Error.h"
+#include <cctype>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+using namespace llvm;
+
+TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
+  CurBuffer = 0;
+  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
+  CurPtr = CurBuf->getBufferStart();
+  TokStart = 0;
+}
+
+SMLoc TGLexer::getLoc() const {
+  return SMLoc::getFromPointer(TokStart);
+}
+
+/// ReturnError - Set the error to the specified string at the specified
+/// location.  This is defined to always return tgtok::Error.
+tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
+  PrintError(Loc, Msg);
+  return tgtok::Error;
+}
+
+int TGLexer::getNextChar() {
+  char CurChar = *CurPtr++;
+  switch (CurChar) {
+  default:
+    return (unsigned char)CurChar;
+  case 0: {
+    // A nul character in the stream is either the end of the current buffer or
+    // a random nul in the file.  Disambiguate that here.
+    if (CurPtr-1 != CurBuf->getBufferEnd())
+      return 0;  // Just whitespace.
+    
+    // If this is the end of an included file, pop the parent file off the
+    // include stack.
+    SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
+    if (ParentIncludeLoc != SMLoc()) {
+      CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
+      CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
+      CurPtr = ParentIncludeLoc.getPointer();
+      return getNextChar();
+    }
+    
+    // Otherwise, return end of file.
+    --CurPtr;  // Another call to lex will return EOF again.  
+    return EOF;
+  }
+  case '\n':
+  case '\r':
+    // Handle the newline character by ignoring it and incrementing the line
+    // count.  However, be careful about 'dos style' files with \n\r in them.
+    // Only treat a \n\r or \r\n as a single line.
+    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
+        *CurPtr != CurChar)
+      ++CurPtr;  // Eat the two char newline sequence.
+    return '\n';
+  }  
+}
+
+int TGLexer::peekNextChar(int Index) {
+  return *(CurPtr + Index);
+}
+
+tgtok::TokKind TGLexer::LexToken() {
+  TokStart = CurPtr;
+  // This always consumes at least one character.
+  int CurChar = getNextChar();
+
+  switch (CurChar) {
+  default:
+    // Handle letters: [a-zA-Z_]
+    if (isalpha(CurChar) || CurChar == '_')
+      return LexIdentifier();
+
+    // Unknown character, emit an error.
+    return ReturnError(TokStart, "Unexpected character");
+  case EOF: return tgtok::Eof;
+  case ':': return tgtok::colon;
+  case ';': return tgtok::semi;
+  case '.': return tgtok::period;
+  case ',': return tgtok::comma;
+  case '<': return tgtok::less;
+  case '>': return tgtok::greater;
+  case ']': return tgtok::r_square;
+  case '{': return tgtok::l_brace;
+  case '}': return tgtok::r_brace;
+  case '(': return tgtok::l_paren;
+  case ')': return tgtok::r_paren;
+  case '=': return tgtok::equal;
+  case '?': return tgtok::question;
+  case '#': return tgtok::paste;
+      
+  case 0:
+  case ' ':
+  case '\t':
+  case '\n':
+  case '\r':
+    // Ignore whitespace.
+    return LexToken();
+  case '/':
+    // If this is the start of a // comment, skip until the end of the line or
+    // the end of the buffer.
+    if (*CurPtr == '/')
+      SkipBCPLComment();
+    else if (*CurPtr == '*') {
+      if (SkipCComment())
+        return tgtok::Error;
+    } else // Otherwise, this is an error.
+      return ReturnError(TokStart, "Unexpected character");
+    return LexToken();
+  case '-': case '+':
+  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+  case '7': case '8': case '9': {
+    int NextChar = 0;
+    if (isdigit(CurChar)) {
+      // Allow identifiers to start with a number if it is followed by
+      // an identifier.  This can happen with paste operations like
+      // foo#8i.
+      int i = 0;
+      do {
+        NextChar = peekNextChar(i++);
+      } while (isdigit(NextChar));
+
+      if (NextChar == 'x' || NextChar == 'b') {
+        // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
+        // likely a number.
+        int NextNextChar = peekNextChar(i);
+        switch (NextNextChar) {
+        default:
+          break;
+        case '0': case '1': 
+          if (NextChar == 'b')
+            return LexNumber();
+          // Fallthrough
+        case '2': case '3': case '4': case '5':
+        case '6': case '7': case '8': case '9':
+        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
+        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+          if (NextChar == 'x')
+            return LexNumber();
+          break;
+        }
+      }
+    }
+
+    if (isalpha(NextChar) || NextChar == '_')
+      return LexIdentifier();
+
+    return LexNumber();
+  }
+  case '"': return LexString();
+  case '$': return LexVarName();
+  case '[': return LexBracket();
+  case '!': return LexExclaim();
+  }
+}
+
+/// LexString - Lex "[^"]*"
+tgtok::TokKind TGLexer::LexString() {
+  const char *StrStart = CurPtr;
+  
+  CurStrVal = "";
+  
+  while (*CurPtr != '"') {
+    // If we hit the end of the buffer, report an error.
+    if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
+      return ReturnError(StrStart, "End of file in string literal");
+    
+    if (*CurPtr == '\n' || *CurPtr == '\r')
+      return ReturnError(StrStart, "End of line in string literal");
+    
+    if (*CurPtr != '\\') {
+      CurStrVal += *CurPtr++;
+      continue;
+    }
+
+    ++CurPtr;
+    
+    switch (*CurPtr) {
+    case '\\': case '\'': case '"':
+      // These turn into their literal character.
+      CurStrVal += *CurPtr++;
+      break;
+    case 't':
+      CurStrVal += '\t';
+      ++CurPtr;
+      break;
+    case 'n':
+      CurStrVal += '\n';
+      ++CurPtr;
+      break;
+        
+    case '\n':
+    case '\r':
+      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
+
+    // If we hit the end of the buffer, report an error.
+    case '\0':
+      if (CurPtr == CurBuf->getBufferEnd())
+        return ReturnError(StrStart, "End of file in string literal");
+      // FALL THROUGH
+    default:
+      return ReturnError(CurPtr, "invalid escape in string literal");
+    }
+  }
+  
+  ++CurPtr;
+  return tgtok::StrVal;
+}
+
+tgtok::TokKind TGLexer::LexVarName() {
+  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
+    return ReturnError(TokStart, "Invalid variable name");
+  
+  // Otherwise, we're ok, consume the rest of the characters.
+  const char *VarNameStart = CurPtr++;
+  
+  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
+    ++CurPtr;
+
+  CurStrVal.assign(VarNameStart, CurPtr);
+  return tgtok::VarName;
+}
+
+
+tgtok::TokKind TGLexer::LexIdentifier() {
+  // The first letter is [a-zA-Z_#].
+  const char *IdentStart = TokStart;
+
+  // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
+  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
+    ++CurPtr;
+
+  // Check to see if this identifier is a keyword.
+  StringRef Str(IdentStart, CurPtr-IdentStart);
+
+  if (Str == "include") {
+    if (LexInclude()) return tgtok::Error;
+    return Lex();
+  }
+
+  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
+    .Case("int", tgtok::Int)
+    .Case("bit", tgtok::Bit)
+    .Case("bits", tgtok::Bits)
+    .Case("string", tgtok::String)
+    .Case("list", tgtok::List)
+    .Case("code", tgtok::Code)
+    .Case("dag", tgtok::Dag)
+    .Case("class", tgtok::Class)
+    .Case("def", tgtok::Def)
+    .Case("foreach", tgtok::Foreach)
+    .Case("defm", tgtok::Defm)
+    .Case("multiclass", tgtok::MultiClass)
+    .Case("field", tgtok::Field)
+    .Case("let", tgtok::Let)
+    .Case("in", tgtok::In)
+    .Default(tgtok::Id);
+
+  if (Kind == tgtok::Id)
+    CurStrVal.assign(Str.begin(), Str.end());
+  return Kind;
+}
+
+/// LexInclude - We just read the "include" token.  Get the string token that
+/// comes next and enter the include.
+bool TGLexer::LexInclude() {
+  // The token after the include must be a string.
+  tgtok::TokKind Tok = LexToken();
+  if (Tok == tgtok::Error) return true;
+  if (Tok != tgtok::StrVal) {
+    PrintError(getLoc(), "Expected filename after include");
+    return true;
+  }
+
+  // Get the string.
+  std::string Filename = CurStrVal;
+  std::string IncludedFile;
+
+  
+  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
+                                    IncludedFile);
+  if (CurBuffer == -1) {
+    PrintError(getLoc(), "Could not find include file '" + Filename + "'");
+    return true;
+  }
+  
+  DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile);
+  if (Found != Dependencies.end()) {
+    PrintError(getLoc(),
+               "File '" + IncludedFile + "' has already been included.");
+    SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note,
+                        "previously included here");
+    return true;
+  }
+  Dependencies.insert(std::make_pair(IncludedFile, getLoc()));
+  // Save the line number and lex buffer of the includer.
+  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
+  CurPtr = CurBuf->getBufferStart();
+  return false;
+}
+
+void TGLexer::SkipBCPLComment() {
+  ++CurPtr;  // skip the second slash.
+  while (1) {
+    switch (*CurPtr) {
+    case '\n':
+    case '\r':
+      return;  // Newline is end of comment.
+    case 0:
+      // If this is the end of the buffer, end the comment.
+      if (CurPtr == CurBuf->getBufferEnd())
+        return;
+      break;
+    }
+    // Otherwise, skip the character.
+    ++CurPtr;
+  }
+}
+
+/// SkipCComment - This skips C-style /**/ comments.  The only difference from C
+/// is that we allow nesting.
+bool TGLexer::SkipCComment() {
+  ++CurPtr;  // skip the star.
+  unsigned CommentDepth = 1;
+  
+  while (1) {
+    int CurChar = getNextChar();
+    switch (CurChar) {
+    case EOF:
+      PrintError(TokStart, "Unterminated comment!");
+      return true;
+    case '*':
+      // End of the comment?
+      if (CurPtr[0] != '/') break;
+      
+      ++CurPtr;   // End the */.
+      if (--CommentDepth == 0)
+        return false;
+      break;
+    case '/':
+      // Start of a nested comment?
+      if (CurPtr[0] != '*') break;
+      ++CurPtr;
+      ++CommentDepth;
+      break;
+    }
+  }
+}
+
+/// LexNumber - Lex:
+///    [-+]?[0-9]+
+///    0x[0-9a-fA-F]+
+///    0b[01]+
+tgtok::TokKind TGLexer::LexNumber() {
+  if (CurPtr[-1] == '0') {
+    if (CurPtr[0] == 'x') {
+      ++CurPtr;
+      const char *NumStart = CurPtr;
+      while (isxdigit(CurPtr[0]))
+        ++CurPtr;
+      
+      // Requires at least one hex digit.
+      if (CurPtr == NumStart)
+        return ReturnError(TokStart, "Invalid hexadecimal number");
+
+      errno = 0;
+      CurIntVal = strtoll(NumStart, 0, 16);
+      if (errno == EINVAL)
+        return ReturnError(TokStart, "Invalid hexadecimal number");
+      if (errno == ERANGE) {
+        errno = 0;
+        CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
+        if (errno == EINVAL)
+          return ReturnError(TokStart, "Invalid hexadecimal number");
+        if (errno == ERANGE)
+          return ReturnError(TokStart, "Hexadecimal number out of range");
+      }
+      return tgtok::IntVal;
+    } else if (CurPtr[0] == 'b') {
+      ++CurPtr;
+      const char *NumStart = CurPtr;
+      while (CurPtr[0] == '0' || CurPtr[0] == '1')
+        ++CurPtr;
+
+      // Requires at least one binary digit.
+      if (CurPtr == NumStart)
+        return ReturnError(CurPtr-2, "Invalid binary number");
+      CurIntVal = strtoll(NumStart, 0, 2);
+      return tgtok::IntVal;
+    }
+  }
+
+  // Check for a sign without a digit.
+  if (!isdigit(CurPtr[0])) {
+    if (CurPtr[-1] == '-')
+      return tgtok::minus;
+    else if (CurPtr[-1] == '+')
+      return tgtok::plus;
+  }
+  
+  while (isdigit(CurPtr[0]))
+    ++CurPtr;
+  CurIntVal = strtoll(TokStart, 0, 10);
+  return tgtok::IntVal;
+}
+
+/// LexBracket - We just read '['.  If this is a code block, return it,
+/// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
+tgtok::TokKind TGLexer::LexBracket() {
+  if (CurPtr[0] != '{')
+    return tgtok::l_square;
+  ++CurPtr;
+  const char *CodeStart = CurPtr;
+  while (1) {
+    int Char = getNextChar();
+    if (Char == EOF) break;
+    
+    if (Char != '}') continue;
+    
+    Char = getNextChar();
+    if (Char == EOF) break;
+    if (Char == ']') {
+      CurStrVal.assign(CodeStart, CurPtr-2);
+      return tgtok::CodeFragment;
+    }
+  }
+  
+  return ReturnError(CodeStart-2, "Unterminated Code Block");
+}
+
+/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
+tgtok::TokKind TGLexer::LexExclaim() {
+  if (!isalpha(*CurPtr))
+    return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
+  
+  const char *Start = CurPtr++;
+  while (isalpha(*CurPtr))
+    ++CurPtr;
+  
+  // Check to see which operator this is.
+  tgtok::TokKind Kind =
+    StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
+    .Case("eq", tgtok::XEq)
+    .Case("if", tgtok::XIf)
+    .Case("head", tgtok::XHead)
+    .Case("tail", tgtok::XTail)
+    .Case("con", tgtok::XConcat)
+    .Case("add", tgtok::XADD)
+    .Case("shl", tgtok::XSHL)
+    .Case("sra", tgtok::XSRA)
+    .Case("srl", tgtok::XSRL)
+    .Case("cast", tgtok::XCast)
+    .Case("empty", tgtok::XEmpty)
+    .Case("subst", tgtok::XSubst)
+    .Case("foreach", tgtok::XForEach)
+    .Case("strconcat", tgtok::XStrConcat)
+    .Default(tgtok::Error);
+
+  return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
+}
+
diff --git a/contrib/llvm/lib/TableGen/TGLexer.h b/contrib/llvm/lib/TableGen/TGLexer.h
new file mode 100644
index 000000000000..d1bd70d2eca4
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/TGLexer.h
@@ -0,0 +1,132 @@
+//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class represents the Lexer for tablegen files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TGLEXER_H
+#define TGLEXER_H
+
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <map>
+#include <string>
+
+namespace llvm {
+class MemoryBuffer;
+class SourceMgr;
+class SMLoc;
+class Twine;
+
+namespace tgtok {
+  enum TokKind {
+    // Markers
+    Eof, Error,
+    
+    // Tokens with no info.
+    minus, plus,        // - +
+    l_square, r_square, // [ ]
+    l_brace, r_brace,   // { }
+    l_paren, r_paren,   // ( )
+    less, greater,      // < >
+    colon, semi,        // : ;
+    comma, period,      // , .
+    equal, question,    // = ?
+    paste,              // #
+
+    // Keywords.
+    Bit, Bits, Class, Code, Dag, Def, Foreach, Defm, Field, In, Int, Let, List,
+    MultiClass, String,
+    
+    // !keywords.
+    XConcat, XADD, XSRA, XSRL, XSHL, XStrConcat, XCast, XSubst,
+    XForEach, XHead, XTail, XEmpty, XIf, XEq,
+
+    // Integer value.
+    IntVal,
+    
+    // String valued tokens.
+    Id, StrVal, VarName, CodeFragment
+  };
+}
+
+/// TGLexer - TableGen Lexer class.
+class TGLexer {
+  SourceMgr &SrcMgr;
+  
+  const char *CurPtr;
+  const MemoryBuffer *CurBuf;
+
+  // Information about the current token.
+  const char *TokStart;
+  tgtok::TokKind CurCode;
+  std::string CurStrVal;  // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT
+  int64_t CurIntVal;      // This is valid for INTVAL.
+
+  /// CurBuffer - This is the current buffer index we're lexing from as managed
+  /// by the SourceMgr object.
+  int CurBuffer;
+
+public:
+  typedef std::map<std::string, SMLoc> DependenciesMapTy;
+private:
+  /// Dependencies - This is the list of all included files.
+  DependenciesMapTy Dependencies;
+
+public:
+  TGLexer(SourceMgr &SrcMgr);
+  ~TGLexer() {}
+  
+  tgtok::TokKind Lex() {
+    return CurCode = LexToken();
+  }
+
+  const DependenciesMapTy &getDependencies() const {
+    return Dependencies;
+  }
+  
+  tgtok::TokKind getCode() const { return CurCode; }
+
+  const std::string &getCurStrVal() const {
+    assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal || 
+            CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
+           "This token doesn't have a string value");
+    return CurStrVal;
+  }
+  int64_t getCurIntVal() const {
+    assert(CurCode == tgtok::IntVal && "This token isn't an integer");
+    return CurIntVal;
+  }
+
+  SMLoc getLoc() const;
+  
+private:
+  /// LexToken - Read the next token and return its code.
+  tgtok::TokKind LexToken();
+  
+  tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
+  
+  int getNextChar();
+  int peekNextChar(int Index);
+  void SkipBCPLComment();
+  bool SkipCComment();
+  tgtok::TokKind LexIdentifier();
+  bool LexInclude();
+  tgtok::TokKind LexString();
+  tgtok::TokKind LexVarName();
+  tgtok::TokKind LexNumber();
+  tgtok::TokKind LexBracket();
+  tgtok::TokKind LexExclaim();
+};
+  
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/TableGen/TGParser.cpp b/contrib/llvm/lib/TableGen/TGParser.cpp
new file mode 100644
index 000000000000..86ad2a6e3c09
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/TGParser.cpp
@@ -0,0 +1,2591 @@
+//===- TGParser.cpp - Parser for TableGen Files ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement the Parser for TableGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TGParser.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TableGen/Record.h"
+#include <algorithm>
+#include <sstream>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Support Code for the Semantic Actions.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+struct SubClassReference {
+  SMRange RefRange;
+  Record *Rec;
+  std::vector<Init*> TemplateArgs;
+  SubClassReference() : Rec(0) {}
+
+  bool isInvalid() const { return Rec == 0; }
+};
+
+struct SubMultiClassReference {
+  SMRange RefRange;
+  MultiClass *MC;
+  std::vector<Init*> TemplateArgs;
+  SubMultiClassReference() : MC(0) {}
+
+  bool isInvalid() const { return MC == 0; }
+  void dump() const;
+};
+
+void SubMultiClassReference::dump() const {
+  errs() << "Multiclass:\n";
+
+  MC->dump();
+
+  errs() << "Template args:\n";
+  for (std::vector<Init *>::const_iterator i = TemplateArgs.begin(),
+         iend = TemplateArgs.end();
+       i != iend;
+       ++i) {
+    (*i)->dump();
+  }
+}
+
+} // end namespace llvm
+
+bool TGParser::AddValue(Record *CurRec, SMLoc Loc, const RecordVal &RV) {
+  if (CurRec == 0)
+    CurRec = &CurMultiClass->Rec;
+
+  if (RecordVal *ERV = CurRec->getValue(RV.getNameInit())) {
+    // The value already exists in the class, treat this as a set.
+    if (ERV->setValue(RV.getValue()))
+      return Error(Loc, "New definition of '" + RV.getName() + "' of type '" +
+                   RV.getType()->getAsString() + "' is incompatible with " +
+                   "previous definition of type '" +
+                   ERV->getType()->getAsString() + "'");
+  } else {
+    CurRec->addValue(RV);
+  }
+  return false;
+}
+
+/// SetValue -
+/// Return true on error, false on success.
+bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
+                        const std::vector<unsigned> &BitList, Init *V) {
+  if (!V) return false;
+
+  if (CurRec == 0) CurRec = &CurMultiClass->Rec;
+
+  RecordVal *RV = CurRec->getValue(ValName);
+  if (RV == 0)
+    return Error(Loc, "Value '" + ValName->getAsUnquotedString()
+                 + "' unknown!");
+
+  // Do not allow assignments like 'X = X'.  This will just cause infinite loops
+  // in the resolution machinery.
+  if (BitList.empty())
+    if (VarInit *VI = dyn_cast<VarInit>(V))
+      if (VI->getNameInit() == ValName)
+        return false;
+
+  // If we are assigning to a subset of the bits in the value... then we must be
+  // assigning to a field of BitsRecTy, which must have a BitsInit
+  // initializer.
+  //
+  if (!BitList.empty()) {
+    BitsInit *CurVal = dyn_cast<BitsInit>(RV->getValue());
+    if (CurVal == 0)
+      return Error(Loc, "Value '" + ValName->getAsUnquotedString()
+                   + "' is not a bits type");
+
+    // Convert the incoming value to a bits type of the appropriate size...
+    Init *BI = V->convertInitializerTo(BitsRecTy::get(BitList.size()));
+    if (BI == 0) {
+      return Error(Loc, "Initializer is not compatible with bit range");
+    }
+
+    // We should have a BitsInit type now.
+    BitsInit *BInit = dyn_cast<BitsInit>(BI);
+    assert(BInit != 0);
+
+    SmallVector<Init *, 16> NewBits(CurVal->getNumBits());
+
+    // Loop over bits, assigning values as appropriate.
+    for (unsigned i = 0, e = BitList.size(); i != e; ++i) {
+      unsigned Bit = BitList[i];
+      if (NewBits[Bit])
+        return Error(Loc, "Cannot set bit #" + utostr(Bit) + " of value '" +
+                     ValName->getAsUnquotedString() + "' more than once");
+      NewBits[Bit] = BInit->getBit(i);
+    }
+
+    for (unsigned i = 0, e = CurVal->getNumBits(); i != e; ++i)
+      if (NewBits[i] == 0)
+        NewBits[i] = CurVal->getBit(i);
+
+    V = BitsInit::get(NewBits);
+  }
+
+  if (RV->setValue(V))
+    return Error(Loc, "Value '" + ValName->getAsUnquotedString() + "' of type '"
+                 + RV->getType()->getAsString() +
+                 "' is incompatible with initializer '" + V->getAsString()
+                 + "'");
+  return false;
+}
+
+/// AddSubClass - Add SubClass as a subclass to CurRec, resolving its template
+/// args as SubClass's template arguments.
+bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) {
+  Record *SC = SubClass.Rec;
+  // Add all of the values in the subclass into the current class.
+  const std::vector<RecordVal> &Vals = SC->getValues();
+  for (unsigned i = 0, e = Vals.size(); i != e; ++i)
+    if (AddValue(CurRec, SubClass.RefRange.Start, Vals[i]))
+      return true;
+
+  const std::vector<Init *> &TArgs = SC->getTemplateArgs();
+
+  // Ensure that an appropriate number of template arguments are specified.
+  if (TArgs.size() < SubClass.TemplateArgs.size())
+    return Error(SubClass.RefRange.Start,
+                 "More template args specified than expected");
+
+  // Loop over all of the template arguments, setting them to the specified
+  // value or leaving them as the default if necessary.
+  for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+    if (i < SubClass.TemplateArgs.size()) {
+      // If a value is specified for this template arg, set it now.
+      if (SetValue(CurRec, SubClass.RefRange.Start, TArgs[i],
+                   std::vector<unsigned>(), SubClass.TemplateArgs[i]))
+        return true;
+
+      // Resolve it next.
+      CurRec->resolveReferencesTo(CurRec->getValue(TArgs[i]));
+
+      // Now remove it.
+      CurRec->removeValue(TArgs[i]);
+
+    } else if (!CurRec->getValue(TArgs[i])->getValue()->isComplete()) {
+      return Error(SubClass.RefRange.Start,
+                   "Value not specified for template argument #"
+                   + utostr(i) + " (" + TArgs[i]->getAsUnquotedString()
+                   + ") of subclass '" + SC->getNameInitAsString() + "'!");
+    }
+  }
+
+  // Since everything went well, we can now set the "superclass" list for the
+  // current record.
+  const std::vector<Record*> &SCs = SC->getSuperClasses();
+  ArrayRef<SMRange> SCRanges = SC->getSuperClassRanges();
+  for (unsigned i = 0, e = SCs.size(); i != e; ++i) {
+    if (CurRec->isSubClassOf(SCs[i]))
+      return Error(SubClass.RefRange.Start,
+                   "Already subclass of '" + SCs[i]->getName() + "'!\n");
+    CurRec->addSuperClass(SCs[i], SCRanges[i]);
+  }
+
+  if (CurRec->isSubClassOf(SC))
+    return Error(SubClass.RefRange.Start,
+                 "Already subclass of '" + SC->getName() + "'!\n");
+  CurRec->addSuperClass(SC, SubClass.RefRange);
+  return false;
+}
+
+/// AddSubMultiClass - Add SubMultiClass as a subclass to
+/// CurMC, resolving its template args as SubMultiClass's
+/// template arguments.
+bool TGParser::AddSubMultiClass(MultiClass *CurMC,
+                                SubMultiClassReference &SubMultiClass) {
+  MultiClass *SMC = SubMultiClass.MC;
+  Record *CurRec = &CurMC->Rec;
+
+  const std::vector<RecordVal> &MCVals = CurRec->getValues();
+
+  // Add all of the values in the subclass into the current class.
+  const std::vector<RecordVal> &SMCVals = SMC->Rec.getValues();
+  for (unsigned i = 0, e = SMCVals.size(); i != e; ++i)
+    if (AddValue(CurRec, SubMultiClass.RefRange.Start, SMCVals[i]))
+      return true;
+
+  int newDefStart = CurMC->DefPrototypes.size();
+
+  // Add all of the defs in the subclass into the current multiclass.
+  for (MultiClass::RecordVector::const_iterator i = SMC->DefPrototypes.begin(),
+         iend = SMC->DefPrototypes.end();
+       i != iend;
+       ++i) {
+    // Clone the def and add it to the current multiclass
+    Record *NewDef = new Record(**i);
+
+    // Add all of the values in the superclass into the current def.
+    for (unsigned i = 0, e = MCVals.size(); i != e; ++i)
+      if (AddValue(NewDef, SubMultiClass.RefRange.Start, MCVals[i]))
+        return true;
+
+    CurMC->DefPrototypes.push_back(NewDef);
+  }
+
+  const std::vector<Init *> &SMCTArgs = SMC->Rec.getTemplateArgs();
+
+  // Ensure that an appropriate number of template arguments are
+  // specified.
+  if (SMCTArgs.size() < SubMultiClass.TemplateArgs.size())
+    return Error(SubMultiClass.RefRange.Start,
+                 "More template args specified than expected");
+
+  // Loop over all of the template arguments, setting them to the specified
+  // value or leaving them as the default if necessary.
+  for (unsigned i = 0, e = SMCTArgs.size(); i != e; ++i) {
+    if (i < SubMultiClass.TemplateArgs.size()) {
+      // If a value is specified for this template arg, set it in the
+      // superclass now.
+      if (SetValue(CurRec, SubMultiClass.RefRange.Start, SMCTArgs[i],
+                   std::vector<unsigned>(),
+                   SubMultiClass.TemplateArgs[i]))
+        return true;
+
+      // Resolve it next.
+      CurRec->resolveReferencesTo(CurRec->getValue(SMCTArgs[i]));
+
+      // Now remove it.
+      CurRec->removeValue(SMCTArgs[i]);
+
+      // If a value is specified for this template arg, set it in the
+      // new defs now.
+      for (MultiClass::RecordVector::iterator j =
+             CurMC->DefPrototypes.begin() + newDefStart,
+             jend = CurMC->DefPrototypes.end();
+           j != jend;
+           ++j) {
+        Record *Def = *j;
+
+        if (SetValue(Def, SubMultiClass.RefRange.Start, SMCTArgs[i],
+                     std::vector<unsigned>(),
+                     SubMultiClass.TemplateArgs[i]))
+          return true;
+
+        // Resolve it next.
+        Def->resolveReferencesTo(Def->getValue(SMCTArgs[i]));
+
+        // Now remove it
+        Def->removeValue(SMCTArgs[i]);
+      }
+    } else if (!CurRec->getValue(SMCTArgs[i])->getValue()->isComplete()) {
+      return Error(SubMultiClass.RefRange.Start,
+                   "Value not specified for template argument #"
+                   + utostr(i) + " (" + SMCTArgs[i]->getAsUnquotedString()
+                   + ") of subclass '" + SMC->Rec.getNameInitAsString() + "'!");
+    }
+  }
+
+  return false;
+}
+
+/// ProcessForeachDefs - Given a record, apply all of the variable
+/// values in all surrounding foreach loops, creating new records for
+/// each combination of values.
+bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc) {
+  if (Loops.empty())
+    return false;
+
+  // We want to instantiate a new copy of CurRec for each combination
+  // of nested loop iterator values.  We don't want top instantiate
+  // any copies until we have values for each loop iterator.
+  IterSet IterVals;
+  return ProcessForeachDefs(CurRec, Loc, IterVals);
+}
+
+/// ProcessForeachDefs - Given a record, a loop and a loop iterator,
+/// apply each of the variable values in this loop and then process
+/// subloops.
+bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
+  // Recursively build a tuple of iterator values.
+  if (IterVals.size() != Loops.size()) {
+    assert(IterVals.size() < Loops.size());
+    ForeachLoop &CurLoop = Loops[IterVals.size()];
+    ListInit *List = dyn_cast<ListInit>(CurLoop.ListValue);
+    if (List == 0) {
+      Error(Loc, "Loop list is not a list");
+      return true;
+    }
+
+    // Process each value.
+    for (int64_t i = 0; i < List->getSize(); ++i) {
+      Init *ItemVal = List->resolveListElementReference(*CurRec, 0, i);
+      IterVals.push_back(IterRecord(CurLoop.IterVar, ItemVal));
+      if (ProcessForeachDefs(CurRec, Loc, IterVals))
+        return true;
+      IterVals.pop_back();
+    }
+    return false;
+  }
+
+  // This is the bottom of the recursion. We have all of the iterator values
+  // for this point in the iteration space.  Instantiate a new record to
+  // reflect this combination of values.
+  Record *IterRec = new Record(*CurRec);
+
+  // Set the iterator values now.
+  for (unsigned i = 0, e = IterVals.size(); i != e; ++i) {
+    VarInit *IterVar = IterVals[i].IterVar;
+    TypedInit *IVal = dyn_cast<TypedInit>(IterVals[i].IterValue);
+    if (IVal == 0) {
+      Error(Loc, "foreach iterator value is untyped");
+      return true;
+    }
+
+    IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false));
+
+    if (SetValue(IterRec, Loc, IterVar->getName(),
+                 std::vector<unsigned>(), IVal)) {
+      Error(Loc, "when instantiating this def");
+      return true;
+    }
+
+    // Resolve it next.
+    IterRec->resolveReferencesTo(IterRec->getValue(IterVar->getName()));
+
+    // Remove it.
+    IterRec->removeValue(IterVar->getName());
+  }
+
+  if (Records.getDef(IterRec->getNameInitAsString())) {
+    Error(Loc, "def already exists: " + IterRec->getNameInitAsString());
+    return true;
+  }
+
+  Records.addDef(IterRec);
+  IterRec->resolveReferences();
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Parser Code
+//===----------------------------------------------------------------------===//
+
+/// isObjectStart - Return true if this is a valid first token for an Object.
+static bool isObjectStart(tgtok::TokKind K) {
+  return K == tgtok::Class || K == tgtok::Def ||
+         K == tgtok::Defm || K == tgtok::Let ||
+         K == tgtok::MultiClass || K == tgtok::Foreach;
+}
+
+static std::string GetNewAnonymousName() {
+  static unsigned AnonCounter = 0;
+  unsigned Tmp = AnonCounter++; // MSVC2012 ICEs without this.
+  return "anonymous." + utostr(Tmp);
+}
+
+/// ParseObjectName - If an object name is specified, return it.  Otherwise,
+/// return 0.
+///   ObjectName ::= Value [ '#' Value ]*
+///   ObjectName ::= /*empty*/
+///
+Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) {
+  switch (Lex.getCode()) {
+  case tgtok::colon:
+  case tgtok::semi:
+  case tgtok::l_brace:
+    // These are all of the tokens that can begin an object body.
+    // Some of these can also begin values but we disallow those cases
+    // because they are unlikely to be useful.
+    return 0;
+  default:
+    break;
+  }
+
+  Record *CurRec = 0;
+  if (CurMultiClass)
+    CurRec = &CurMultiClass->Rec;
+
+  RecTy *Type = 0;
+  if (CurRec) {
+    const TypedInit *CurRecName = dyn_cast<TypedInit>(CurRec->getNameInit());
+    if (!CurRecName) {
+      TokError("Record name is not typed!");
+      return 0;
+    }
+    Type = CurRecName->getType();
+  }
+
+  return ParseValue(CurRec, Type, ParseNameMode);
+}
+
+/// ParseClassID - Parse and resolve a reference to a class name.  This returns
+/// null on error.
+///
+///    ClassID ::= ID
+///
+Record *TGParser::ParseClassID() {
+  if (Lex.getCode() != tgtok::Id) {
+    TokError("expected name for ClassID");
+    return 0;
+  }
+
+  Record *Result = Records.getClass(Lex.getCurStrVal());
+  if (Result == 0)
+    TokError("Couldn't find class '" + Lex.getCurStrVal() + "'");
+
+  Lex.Lex();
+  return Result;
+}
+
+/// ParseMultiClassID - Parse and resolve a reference to a multiclass name.
+/// This returns null on error.
+///
+///    MultiClassID ::= ID
+///
+MultiClass *TGParser::ParseMultiClassID() {
+  if (Lex.getCode() != tgtok::Id) {
+    TokError("expected name for MultiClassID");
+    return 0;
+  }
+
+  MultiClass *Result = MultiClasses[Lex.getCurStrVal()];
+  if (Result == 0)
+    TokError("Couldn't find multiclass '" + Lex.getCurStrVal() + "'");
+
+  Lex.Lex();
+  return Result;
+}
+
+/// ParseSubClassReference - Parse a reference to a subclass or to a templated
+/// subclass.  This returns a SubClassRefTy with a null Record* on error.
+///
+///  SubClassRef ::= ClassID
+///  SubClassRef ::= ClassID '<' ValueList '>'
+///
+SubClassReference TGParser::
+ParseSubClassReference(Record *CurRec, bool isDefm) {
+  SubClassReference Result;
+  Result.RefRange.Start = Lex.getLoc();
+
+  if (isDefm) {
+    if (MultiClass *MC = ParseMultiClassID())
+      Result.Rec = &MC->Rec;
+  } else {
+    Result.Rec = ParseClassID();
+  }
+  if (Result.Rec == 0) return Result;
+
+  // If there is no template arg list, we're done.
+  if (Lex.getCode() != tgtok::less) {
+    Result.RefRange.End = Lex.getLoc();
+    return Result;
+  }
+  Lex.Lex();  // Eat the '<'
+
+  if (Lex.getCode() == tgtok::greater) {
+    TokError("subclass reference requires a non-empty list of template values");
+    Result.Rec = 0;
+    return Result;
+  }
+
+  Result.TemplateArgs = ParseValueList(CurRec, Result.Rec);
+  if (Result.TemplateArgs.empty()) {
+    Result.Rec = 0;   // Error parsing value list.
+    return Result;
+  }
+
+  if (Lex.getCode() != tgtok::greater) {
+    TokError("expected '>' in template value list");
+    Result.Rec = 0;
+    return Result;
+  }
+  Lex.Lex();
+  Result.RefRange.End = Lex.getLoc();
+
+  return Result;
+}
+
+/// ParseSubMultiClassReference - Parse a reference to a subclass or to a
+/// templated submulticlass.  This returns a SubMultiClassRefTy with a null
+/// Record* on error.
+///
+///  SubMultiClassRef ::= MultiClassID
+///  SubMultiClassRef ::= MultiClassID '<' ValueList '>'
+///
+SubMultiClassReference TGParser::
+ParseSubMultiClassReference(MultiClass *CurMC) {
+  SubMultiClassReference Result;
+  Result.RefRange.Start = Lex.getLoc();
+
+  Result.MC = ParseMultiClassID();
+  if (Result.MC == 0) return Result;
+
+  // If there is no template arg list, we're done.
+  if (Lex.getCode() != tgtok::less) {
+    Result.RefRange.End = Lex.getLoc();
+    return Result;
+  }
+  Lex.Lex();  // Eat the '<'
+
+  if (Lex.getCode() == tgtok::greater) {
+    TokError("subclass reference requires a non-empty list of template values");
+    Result.MC = 0;
+    return Result;
+  }
+
+  Result.TemplateArgs = ParseValueList(&CurMC->Rec, &Result.MC->Rec);
+  if (Result.TemplateArgs.empty()) {
+    Result.MC = 0;   // Error parsing value list.
+    return Result;
+  }
+
+  if (Lex.getCode() != tgtok::greater) {
+    TokError("expected '>' in template value list");
+    Result.MC = 0;
+    return Result;
+  }
+  Lex.Lex();
+  Result.RefRange.End = Lex.getLoc();
+
+  return Result;
+}
+
+/// ParseRangePiece - Parse a bit/value range.
+///   RangePiece ::= INTVAL
+///   RangePiece ::= INTVAL '-' INTVAL
+///   RangePiece ::= INTVAL INTVAL
+bool TGParser::ParseRangePiece(std::vector<unsigned> &Ranges) {
+  if (Lex.getCode() != tgtok::IntVal) {
+    TokError("expected integer or bitrange");
+    return true;
+  }
+  int64_t Start = Lex.getCurIntVal();
+  int64_t End;
+
+  if (Start < 0)
+    return TokError("invalid range, cannot be negative");
+
+  switch (Lex.Lex()) {  // eat first character.
+  default:
+    Ranges.push_back(Start);
+    return false;
+  case tgtok::minus:
+    if (Lex.Lex() != tgtok::IntVal) {
+      TokError("expected integer value as end of range");
+      return true;
+    }
+    End = Lex.getCurIntVal();
+    break;
+  case tgtok::IntVal:
+    End = -Lex.getCurIntVal();
+    break;
+  }
+  if (End < 0)
+    return TokError("invalid range, cannot be negative");
+  Lex.Lex();
+
+  // Add to the range.
+  if (Start < End) {
+    for (; Start <= End; ++Start)
+      Ranges.push_back(Start);
+  } else {
+    for (; Start >= End; --Start)
+      Ranges.push_back(Start);
+  }
+  return false;
+}
+
+/// ParseRangeList - Parse a list of scalars and ranges into scalar values.
+///
+///   RangeList ::= RangePiece (',' RangePiece)*
+///
+std::vector<unsigned> TGParser::ParseRangeList() {
+  std::vector<unsigned> Result;
+
+  // Parse the first piece.
+  if (ParseRangePiece(Result))
+    return std::vector<unsigned>();
+  while (Lex.getCode() == tgtok::comma) {
+    Lex.Lex();  // Eat the comma.
+
+    // Parse the next range piece.
+    if (ParseRangePiece(Result))
+      return std::vector<unsigned>();
+  }
+  return Result;
+}
+
+/// ParseOptionalRangeList - Parse either a range list in <>'s or nothing.
+///   OptionalRangeList ::= '<' RangeList '>'
+///   OptionalRangeList ::= /*empty*/
+bool TGParser::ParseOptionalRangeList(std::vector<unsigned> &Ranges) {
+  if (Lex.getCode() != tgtok::less)
+    return false;
+
+  SMLoc StartLoc = Lex.getLoc();
+  Lex.Lex(); // eat the '<'
+
+  // Parse the range list.
+  Ranges = ParseRangeList();
+  if (Ranges.empty()) return true;
+
+  if (Lex.getCode() != tgtok::greater) {
+    TokError("expected '>' at end of range list");
+    return Error(StartLoc, "to match this '<'");
+  }
+  Lex.Lex();   // eat the '>'.
+  return false;
+}
+
+/// ParseOptionalBitList - Parse either a bit list in {}'s or nothing.
+///   OptionalBitList ::= '{' RangeList '}'
+///   OptionalBitList ::= /*empty*/
+bool TGParser::ParseOptionalBitList(std::vector<unsigned> &Ranges) {
+  if (Lex.getCode() != tgtok::l_brace)
+    return false;
+
+  SMLoc StartLoc = Lex.getLoc();
+  Lex.Lex(); // eat the '{'
+
+  // Parse the range list.
+  Ranges = ParseRangeList();
+  if (Ranges.empty()) return true;
+
+  if (Lex.getCode() != tgtok::r_brace) {
+    TokError("expected '}' at end of bit list");
+    return Error(StartLoc, "to match this '{'");
+  }
+  Lex.Lex();   // eat the '}'.
+  return false;
+}
+
+
+/// ParseType - Parse and return a tblgen type.  This returns null on error.
+///
+///   Type ::= STRING                       // string type
+///   Type ::= CODE                         // code type
+///   Type ::= BIT                          // bit type
+///   Type ::= BITS '<' INTVAL '>'          // bits<x> type
+///   Type ::= INT                          // int type
+///   Type ::= LIST '<' Type '>'            // list<x> type
+///   Type ::= DAG                          // dag type
+///   Type ::= ClassID                      // Record Type
+///
+RecTy *TGParser::ParseType() {
+  switch (Lex.getCode()) {
+  default: TokError("Unknown token when expecting a type"); return 0;
+  case tgtok::String: Lex.Lex(); return StringRecTy::get();
+  case tgtok::Code:   Lex.Lex(); return StringRecTy::get();
+  case tgtok::Bit:    Lex.Lex(); return BitRecTy::get();
+  case tgtok::Int:    Lex.Lex(); return IntRecTy::get();
+  case tgtok::Dag:    Lex.Lex(); return DagRecTy::get();
+  case tgtok::Id:
+    if (Record *R = ParseClassID()) return RecordRecTy::get(R);
+    return 0;
+  case tgtok::Bits: {
+    if (Lex.Lex() != tgtok::less) { // Eat 'bits'
+      TokError("expected '<' after bits type");
+      return 0;
+    }
+    if (Lex.Lex() != tgtok::IntVal) {  // Eat '<'
+      TokError("expected integer in bits<n> type");
+      return 0;
+    }
+    uint64_t Val = Lex.getCurIntVal();
+    if (Lex.Lex() != tgtok::greater) {  // Eat count.
+      TokError("expected '>' at end of bits<n> type");
+      return 0;
+    }
+    Lex.Lex();  // Eat '>'
+    return BitsRecTy::get(Val);
+  }
+  case tgtok::List: {
+    if (Lex.Lex() != tgtok::less) { // Eat 'bits'
+      TokError("expected '<' after list type");
+      return 0;
+    }
+    Lex.Lex();  // Eat '<'
+    RecTy *SubType = ParseType();
+    if (SubType == 0) return 0;
+
+    if (Lex.getCode() != tgtok::greater) {
+      TokError("expected '>' at end of list<ty> type");
+      return 0;
+    }
+    Lex.Lex();  // Eat '>'
+    return ListRecTy::get(SubType);
+  }
+  }
+}
+
+/// ParseIDValue - Parse an ID as a value and decode what it means.
+///
+///  IDValue ::= ID [def local value]
+///  IDValue ::= ID [def template arg]
+///  IDValue ::= ID [multiclass local value]
+///  IDValue ::= ID [multiclass template argument]
+///  IDValue ::= ID [def name]
+///
+Init *TGParser::ParseIDValue(Record *CurRec, IDParseMode Mode) {
+  assert(Lex.getCode() == tgtok::Id && "Expected ID in ParseIDValue");
+  std::string Name = Lex.getCurStrVal();
+  SMLoc Loc = Lex.getLoc();
+  Lex.Lex();
+  return ParseIDValue(CurRec, Name, Loc);
+}
+
+/// ParseIDValue - This is just like ParseIDValue above, but it assumes the ID
+/// has already been read.
+Init *TGParser::ParseIDValue(Record *CurRec,
+                             const std::string &Name, SMLoc NameLoc,
+                             IDParseMode Mode) {
+  if (CurRec) {
+    if (const RecordVal *RV = CurRec->getValue(Name))
+      return VarInit::get(Name, RV->getType());
+
+    Init *TemplateArgName = QualifyName(*CurRec, CurMultiClass, Name, ":");
+
+    if (CurMultiClass)
+      TemplateArgName = QualifyName(CurMultiClass->Rec, CurMultiClass, Name,
+                                    "::");
+
+    if (CurRec->isTemplateArg(TemplateArgName)) {
+      const RecordVal *RV = CurRec->getValue(TemplateArgName);
+      assert(RV && "Template arg doesn't exist??");
+      return VarInit::get(TemplateArgName, RV->getType());
+    }
+  }
+
+  if (CurMultiClass) {
+    Init *MCName = QualifyName(CurMultiClass->Rec, CurMultiClass, Name,
+                               "::");
+
+    if (CurMultiClass->Rec.isTemplateArg(MCName)) {
+      const RecordVal *RV = CurMultiClass->Rec.getValue(MCName);
+      assert(RV && "Template arg doesn't exist??");
+      return VarInit::get(MCName, RV->getType());
+    }
+  }
+
+  // If this is in a foreach loop, make sure it's not a loop iterator
+  for (LoopVector::iterator i = Loops.begin(), iend = Loops.end();
+       i != iend;
+       ++i) {
+    VarInit *IterVar = dyn_cast<VarInit>(i->IterVar);
+    if (IterVar && IterVar->getName() == Name)
+      return IterVar;
+  }
+
+  if (Mode == ParseNameMode)
+    return StringInit::get(Name);
+
+  if (Record *D = Records.getDef(Name))
+    return DefInit::get(D);
+
+  if (Mode == ParseValueMode) {
+    Error(NameLoc, "Variable not defined: '" + Name + "'");
+    return 0;
+  }
+  
+  return StringInit::get(Name);
+}
+
+/// ParseOperation - Parse an operator.  This returns null on error.
+///
+/// Operation ::= XOperator ['<' Type '>'] '(' Args ')'
+///
+Init *TGParser::ParseOperation(Record *CurRec) {
+  switch (Lex.getCode()) {
+  default:
+    TokError("unknown operation");
+    return 0;
+  case tgtok::XHead:
+  case tgtok::XTail:
+  case tgtok::XEmpty:
+  case tgtok::XCast: {  // Value ::= !unop '(' Value ')'
+    UnOpInit::UnaryOp Code;
+    RecTy *Type = 0;
+
+    switch (Lex.getCode()) {
+    default: llvm_unreachable("Unhandled code!");
+    case tgtok::XCast:
+      Lex.Lex();  // eat the operation
+      Code = UnOpInit::CAST;
+
+      Type = ParseOperatorType();
+
+      if (Type == 0) {
+        TokError("did not get type for unary operator");
+        return 0;
+      }
+
+      break;
+    case tgtok::XHead:
+      Lex.Lex();  // eat the operation
+      Code = UnOpInit::HEAD;
+      break;
+    case tgtok::XTail:
+      Lex.Lex();  // eat the operation
+      Code = UnOpInit::TAIL;
+      break;
+    case tgtok::XEmpty:
+      Lex.Lex();  // eat the operation
+      Code = UnOpInit::EMPTY;
+      Type = IntRecTy::get();
+      break;
+    }
+    if (Lex.getCode() != tgtok::l_paren) {
+      TokError("expected '(' after unary operator");
+      return 0;
+    }
+    Lex.Lex();  // eat the '('
+
+    Init *LHS = ParseValue(CurRec);
+    if (LHS == 0) return 0;
+
+    if (Code == UnOpInit::HEAD
+        || Code == UnOpInit::TAIL
+        || Code == UnOpInit::EMPTY) {
+      ListInit *LHSl = dyn_cast<ListInit>(LHS);
+      StringInit *LHSs = dyn_cast<StringInit>(LHS);
+      TypedInit *LHSt = dyn_cast<TypedInit>(LHS);
+      if (LHSl == 0 && LHSs == 0 && LHSt == 0) {
+        TokError("expected list or string type argument in unary operator");
+        return 0;
+      }
+      if (LHSt) {
+        ListRecTy *LType = dyn_cast<ListRecTy>(LHSt->getType());
+        StringRecTy *SType = dyn_cast<StringRecTy>(LHSt->getType());
+        if (LType == 0 && SType == 0) {
+          TokError("expected list or string type argumnet in unary operator");
+          return 0;
+        }
+      }
+
+      if (Code == UnOpInit::HEAD
+          || Code == UnOpInit::TAIL) {
+        if (LHSl == 0 && LHSt == 0) {
+          TokError("expected list type argumnet in unary operator");
+          return 0;
+        }
+
+        if (LHSl && LHSl->getSize() == 0) {
+          TokError("empty list argument in unary operator");
+          return 0;
+        }
+        if (LHSl) {
+          Init *Item = LHSl->getElement(0);
+          TypedInit *Itemt = dyn_cast<TypedInit>(Item);
+          if (Itemt == 0) {
+            TokError("untyped list element in unary operator");
+            return 0;
+          }
+          if (Code == UnOpInit::HEAD) {
+            Type = Itemt->getType();
+          } else {
+            Type = ListRecTy::get(Itemt->getType());
+          }
+        } else {
+          assert(LHSt && "expected list type argument in unary operator");
+          ListRecTy *LType = dyn_cast<ListRecTy>(LHSt->getType());
+          if (LType == 0) {
+            TokError("expected list type argumnet in unary operator");
+            return 0;
+          }
+          if (Code == UnOpInit::HEAD) {
+            Type = LType->getElementType();
+          } else {
+            Type = LType;
+          }
+        }
+      }
+    }
+
+    if (Lex.getCode() != tgtok::r_paren) {
+      TokError("expected ')' in unary operator");
+      return 0;
+    }
+    Lex.Lex();  // eat the ')'
+    return (UnOpInit::get(Code, LHS, Type))->Fold(CurRec, CurMultiClass);
+  }
+
+  case tgtok::XConcat:
+  case tgtok::XADD:
+  case tgtok::XSRA:
+  case tgtok::XSRL:
+  case tgtok::XSHL:
+  case tgtok::XEq:
+  case tgtok::XStrConcat: {  // Value ::= !binop '(' Value ',' Value ')'
+    tgtok::TokKind OpTok = Lex.getCode();
+    SMLoc OpLoc = Lex.getLoc();
+    Lex.Lex();  // eat the operation
+
+    BinOpInit::BinaryOp Code;
+    RecTy *Type = 0;
+
+    switch (OpTok) {
+    default: llvm_unreachable("Unhandled code!");
+    case tgtok::XConcat: Code = BinOpInit::CONCAT;Type = DagRecTy::get(); break;
+    case tgtok::XADD:    Code = BinOpInit::ADD;   Type = IntRecTy::get(); break;
+    case tgtok::XSRA:    Code = BinOpInit::SRA;   Type = IntRecTy::get(); break;
+    case tgtok::XSRL:    Code = BinOpInit::SRL;   Type = IntRecTy::get(); break;
+    case tgtok::XSHL:    Code = BinOpInit::SHL;   Type = IntRecTy::get(); break;
+    case tgtok::XEq:     Code = BinOpInit::EQ;    Type = BitRecTy::get(); break;
+    case tgtok::XStrConcat:
+      Code = BinOpInit::STRCONCAT;
+      Type = StringRecTy::get();
+      break;
+    }
+
+    if (Lex.getCode() != tgtok::l_paren) {
+      TokError("expected '(' after binary operator");
+      return 0;
+    }
+    Lex.Lex();  // eat the '('
+
+    SmallVector<Init*, 2> InitList;
+
+    InitList.push_back(ParseValue(CurRec));
+    if (InitList.back() == 0) return 0;
+
+    while (Lex.getCode() == tgtok::comma) {
+      Lex.Lex();  // eat the ','
+
+      InitList.push_back(ParseValue(CurRec));
+      if (InitList.back() == 0) return 0;
+    }
+
+    if (Lex.getCode() != tgtok::r_paren) {
+      TokError("expected ')' in operator");
+      return 0;
+    }
+    Lex.Lex();  // eat the ')'
+
+    // We allow multiple operands to associative operators like !strconcat as
+    // shorthand for nesting them.
+    if (Code == BinOpInit::STRCONCAT) {
+      while (InitList.size() > 2) {
+        Init *RHS = InitList.pop_back_val();
+        RHS = (BinOpInit::get(Code, InitList.back(), RHS, Type))
+                           ->Fold(CurRec, CurMultiClass);
+        InitList.back() = RHS;
+      }
+    }
+
+    if (InitList.size() == 2)
+      return (BinOpInit::get(Code, InitList[0], InitList[1], Type))
+        ->Fold(CurRec, CurMultiClass);
+
+    Error(OpLoc, "expected two operands to operator");
+    return 0;
+  }
+
+  case tgtok::XIf:
+  case tgtok::XForEach:
+  case tgtok::XSubst: {  // Value ::= !ternop '(' Value ',' Value ',' Value ')'
+    TernOpInit::TernaryOp Code;
+    RecTy *Type = 0;
+
+    tgtok::TokKind LexCode = Lex.getCode();
+    Lex.Lex();  // eat the operation
+    switch (LexCode) {
+    default: llvm_unreachable("Unhandled code!");
+    case tgtok::XIf:
+      Code = TernOpInit::IF;
+      break;
+    case tgtok::XForEach:
+      Code = TernOpInit::FOREACH;
+      break;
+    case tgtok::XSubst:
+      Code = TernOpInit::SUBST;
+      break;
+    }
+    if (Lex.getCode() != tgtok::l_paren) {
+      TokError("expected '(' after ternary operator");
+      return 0;
+    }
+    Lex.Lex();  // eat the '('
+
+    Init *LHS = ParseValue(CurRec);
+    if (LHS == 0) return 0;
+
+    if (Lex.getCode() != tgtok::comma) {
+      TokError("expected ',' in ternary operator");
+      return 0;
+    }
+    Lex.Lex();  // eat the ','
+
+    Init *MHS = ParseValue(CurRec);
+    if (MHS == 0) return 0;
+
+    if (Lex.getCode() != tgtok::comma) {
+      TokError("expected ',' in ternary operator");
+      return 0;
+    }
+    Lex.Lex();  // eat the ','
+
+    Init *RHS = ParseValue(CurRec);
+    if (RHS == 0) return 0;
+
+    if (Lex.getCode() != tgtok::r_paren) {
+      TokError("expected ')' in binary operator");
+      return 0;
+    }
+    Lex.Lex();  // eat the ')'
+
+    switch (LexCode) {
+    default: llvm_unreachable("Unhandled code!");
+    case tgtok::XIf: {
+      RecTy *MHSTy = 0;
+      RecTy *RHSTy = 0;
+
+      if (TypedInit *MHSt = dyn_cast<TypedInit>(MHS))
+        MHSTy = MHSt->getType();
+      if (BitsInit *MHSbits = dyn_cast<BitsInit>(MHS))
+        MHSTy = BitsRecTy::get(MHSbits->getNumBits());
+      if (isa<BitInit>(MHS))
+        MHSTy = BitRecTy::get();
+
+      if (TypedInit *RHSt = dyn_cast<TypedInit>(RHS))
+        RHSTy = RHSt->getType();
+      if (BitsInit *RHSbits = dyn_cast<BitsInit>(RHS))
+        RHSTy = BitsRecTy::get(RHSbits->getNumBits());
+      if (isa<BitInit>(RHS))
+        RHSTy = BitRecTy::get();
+
+      // For UnsetInit, it's typed from the other hand.
+      if (isa<UnsetInit>(MHS))
+        MHSTy = RHSTy;
+      if (isa<UnsetInit>(RHS))
+        RHSTy = MHSTy;
+
+      if (!MHSTy || !RHSTy) {
+        TokError("could not get type for !if");
+        return 0;
+      }
+
+      if (MHSTy->typeIsConvertibleTo(RHSTy)) {
+        Type = RHSTy;
+      } else if (RHSTy->typeIsConvertibleTo(MHSTy)) {
+        Type = MHSTy;
+      } else {
+        TokError("inconsistent types for !if");
+        return 0;
+      }
+      break;
+    }
+    case tgtok::XForEach: {
+      TypedInit *MHSt = dyn_cast<TypedInit>(MHS);
+      if (MHSt == 0) {
+        TokError("could not get type for !foreach");
+        return 0;
+      }
+      Type = MHSt->getType();
+      break;
+    }
+    case tgtok::XSubst: {
+      TypedInit *RHSt = dyn_cast<TypedInit>(RHS);
+      if (RHSt == 0) {
+        TokError("could not get type for !subst");
+        return 0;
+      }
+      Type = RHSt->getType();
+      break;
+    }
+    }
+    return (TernOpInit::get(Code, LHS, MHS, RHS, Type))->Fold(CurRec,
+                                                             CurMultiClass);
+  }
+  }
+}
+
+/// ParseOperatorType - Parse a type for an operator.  This returns
+/// null on error.
+///
+/// OperatorType ::= '<' Type '>'
+///
+RecTy *TGParser::ParseOperatorType() {
+  RecTy *Type = 0;
+
+  if (Lex.getCode() != tgtok::less) {
+    TokError("expected type name for operator");
+    return 0;
+  }
+  Lex.Lex();  // eat the <
+
+  Type = ParseType();
+
+  if (Type == 0) {
+    TokError("expected type name for operator");
+    return 0;
+  }
+
+  if (Lex.getCode() != tgtok::greater) {
+    TokError("expected type name for operator");
+    return 0;
+  }
+  Lex.Lex();  // eat the >
+
+  return Type;
+}
+
+
+/// ParseSimpleValue - Parse a tblgen value.  This returns null on error.
+///
+///   SimpleValue ::= IDValue
+///   SimpleValue ::= INTVAL
+///   SimpleValue ::= STRVAL+
+///   SimpleValue ::= CODEFRAGMENT
+///   SimpleValue ::= '?'
+///   SimpleValue ::= '{' ValueList '}'
+///   SimpleValue ::= ID '<' ValueListNE '>'
+///   SimpleValue ::= '[' ValueList ']'
+///   SimpleValue ::= '(' IDValue DagArgList ')'
+///   SimpleValue ::= CONCATTOK '(' Value ',' Value ')'
+///   SimpleValue ::= ADDTOK '(' Value ',' Value ')'
+///   SimpleValue ::= SHLTOK '(' Value ',' Value ')'
+///   SimpleValue ::= SRATOK '(' Value ',' Value ')'
+///   SimpleValue ::= SRLTOK '(' Value ',' Value ')'
+///   SimpleValue ::= STRCONCATTOK '(' Value ',' Value ')'
+///
+Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
+                                 IDParseMode Mode) {
+  Init *R = 0;
+  switch (Lex.getCode()) {
+  default: TokError("Unknown token when parsing a value"); break;
+  case tgtok::paste:
+    // This is a leading paste operation.  This is deprecated but
+    // still exists in some .td files.  Ignore it.
+    Lex.Lex();  // Skip '#'.
+    return ParseSimpleValue(CurRec, ItemType, Mode);
+  case tgtok::IntVal: R = IntInit::get(Lex.getCurIntVal()); Lex.Lex(); break;
+  case tgtok::StrVal: {
+    std::string Val = Lex.getCurStrVal();
+    Lex.Lex();
+
+    // Handle multiple consecutive concatenated strings.
+    while (Lex.getCode() == tgtok::StrVal) {
+      Val += Lex.getCurStrVal();
+      Lex.Lex();
+    }
+
+    R = StringInit::get(Val);
+    break;
+  }
+  case tgtok::CodeFragment:
+    R = StringInit::get(Lex.getCurStrVal());
+    Lex.Lex();
+    break;
+  case tgtok::question:
+    R = UnsetInit::get();
+    Lex.Lex();
+    break;
+  case tgtok::Id: {
+    SMLoc NameLoc = Lex.getLoc();
+    std::string Name = Lex.getCurStrVal();
+    if (Lex.Lex() != tgtok::less)  // consume the Id.
+      return ParseIDValue(CurRec, Name, NameLoc, Mode);    // Value ::= IDValue
+
+    // Value ::= ID '<' ValueListNE '>'
+    if (Lex.Lex() == tgtok::greater) {
+      TokError("expected non-empty value list");
+      return 0;
+    }
+
+    // This is a CLASS<initvalslist> expression.  This is supposed to synthesize
+    // a new anonymous definition, deriving from CLASS<initvalslist> with no
+    // body.
+    Record *Class = Records.getClass(Name);
+    if (!Class) {
+      Error(NameLoc, "Expected a class name, got '" + Name + "'");
+      return 0;
+    }
+
+    std::vector<Init*> ValueList = ParseValueList(CurRec, Class);
+    if (ValueList.empty()) return 0;
+
+    if (Lex.getCode() != tgtok::greater) {
+      TokError("expected '>' at end of value list");
+      return 0;
+    }
+    Lex.Lex();  // eat the '>'
+    SMLoc EndLoc = Lex.getLoc();
+
+    // Create the new record, set it as CurRec temporarily.
+    static unsigned AnonCounter = 0;
+    Record *NewRec = new Record("anonymous.val."+utostr(AnonCounter++),
+                                NameLoc,
+                                Records,
+                                /*IsAnonymous=*/true);
+    SubClassReference SCRef;
+    SCRef.RefRange = SMRange(NameLoc, EndLoc);
+    SCRef.Rec = Class;
+    SCRef.TemplateArgs = ValueList;
+    // Add info about the subclass to NewRec.
+    if (AddSubClass(NewRec, SCRef))
+      return 0;
+    NewRec->resolveReferences();
+    Records.addDef(NewRec);
+
+    // The result of the expression is a reference to the new record.
+    return DefInit::get(NewRec);
+  }
+  case tgtok::l_brace: {           // Value ::= '{' ValueList '}'
+    SMLoc BraceLoc = Lex.getLoc();
+    Lex.Lex(); // eat the '{'
+    std::vector<Init*> Vals;
+
+    if (Lex.getCode() != tgtok::r_brace) {
+      Vals = ParseValueList(CurRec);
+      if (Vals.empty()) return 0;
+    }
+    if (Lex.getCode() != tgtok::r_brace) {
+      TokError("expected '}' at end of bit list value");
+      return 0;
+    }
+    Lex.Lex();  // eat the '}'
+
+    SmallVector<Init *, 16> NewBits(Vals.size());
+
+    for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+      Init *Bit = Vals[i]->convertInitializerTo(BitRecTy::get());
+      if (Bit == 0) {
+        Error(BraceLoc, "Element #" + utostr(i) + " (" + Vals[i]->getAsString()+
+              ") is not convertable to a bit");
+        return 0;
+      }
+      NewBits[Vals.size()-i-1] = Bit;
+    }
+    return BitsInit::get(NewBits);
+  }
+  case tgtok::l_square: {          // Value ::= '[' ValueList ']'
+    Lex.Lex(); // eat the '['
+    std::vector<Init*> Vals;
+
+    RecTy *DeducedEltTy = 0;
+    ListRecTy *GivenListTy = 0;
+
+    if (ItemType != 0) {
+      ListRecTy *ListType = dyn_cast<ListRecTy>(ItemType);
+      if (ListType == 0) {
+        std::stringstream s;
+        s << "Type mismatch for list, expected list type, got "
+          << ItemType->getAsString();
+        TokError(s.str());
+        return 0;
+      }
+      GivenListTy = ListType;
+    }
+
+    if (Lex.getCode() != tgtok::r_square) {
+      Vals = ParseValueList(CurRec, 0,
+                            GivenListTy ? GivenListTy->getElementType() : 0);
+      if (Vals.empty()) return 0;
+    }
+    if (Lex.getCode() != tgtok::r_square) {
+      TokError("expected ']' at end of list value");
+      return 0;
+    }
+    Lex.Lex();  // eat the ']'
+
+    RecTy *GivenEltTy = 0;
+    if (Lex.getCode() == tgtok::less) {
+      // Optional list element type
+      Lex.Lex();  // eat the '<'
+
+      GivenEltTy = ParseType();
+      if (GivenEltTy == 0) {
+        // Couldn't parse element type
+        return 0;
+      }
+
+      if (Lex.getCode() != tgtok::greater) {
+        TokError("expected '>' at end of list element type");
+        return 0;
+      }
+      Lex.Lex();  // eat the '>'
+    }
+
+    // Check elements
+    RecTy *EltTy = 0;
+    for (std::vector<Init *>::iterator i = Vals.begin(), ie = Vals.end();
+         i != ie;
+         ++i) {
+      TypedInit *TArg = dyn_cast<TypedInit>(*i);
+      if (TArg == 0) {
+        TokError("Untyped list element");
+        return 0;
+      }
+      if (EltTy != 0) {
+        EltTy = resolveTypes(EltTy, TArg->getType());
+        if (EltTy == 0) {
+          TokError("Incompatible types in list elements");
+          return 0;
+        }
+      } else {
+        EltTy = TArg->getType();
+      }
+    }
+
+    if (GivenEltTy != 0) {
+      if (EltTy != 0) {
+        // Verify consistency
+        if (!EltTy->typeIsConvertibleTo(GivenEltTy)) {
+          TokError("Incompatible types in list elements");
+          return 0;
+        }
+      }
+      EltTy = GivenEltTy;
+    }
+
+    if (EltTy == 0) {
+      if (ItemType == 0) {
+        TokError("No type for list");
+        return 0;
+      }
+      DeducedEltTy = GivenListTy->getElementType();
+    } else {
+      // Make sure the deduced type is compatible with the given type
+      if (GivenListTy) {
+        if (!EltTy->typeIsConvertibleTo(GivenListTy->getElementType())) {
+          TokError("Element type mismatch for list");
+          return 0;
+        }
+      }
+      DeducedEltTy = EltTy;
+    }
+
+    return ListInit::get(Vals, DeducedEltTy);
+  }
+  case tgtok::l_paren: {         // Value ::= '(' IDValue DagArgList ')'
+    Lex.Lex();   // eat the '('
+    if (Lex.getCode() != tgtok::Id && Lex.getCode() != tgtok::XCast) {
+      TokError("expected identifier in dag init");
+      return 0;
+    }
+
+    Init *Operator = ParseValue(CurRec);
+    if (Operator == 0) return 0;
+
+    // If the operator name is present, parse it.
+    std::string OperatorName;
+    if (Lex.getCode() == tgtok::colon) {
+      if (Lex.Lex() != tgtok::VarName) { // eat the ':'
+        TokError("expected variable name in dag operator");
+        return 0;
+      }
+      OperatorName = Lex.getCurStrVal();
+      Lex.Lex();  // eat the VarName.
+    }
+
+    std::vector<std::pair<llvm::Init*, std::string> > DagArgs;
+    if (Lex.getCode() != tgtok::r_paren) {
+      DagArgs = ParseDagArgList(CurRec);
+      if (DagArgs.empty()) return 0;
+    }
+
+    if (Lex.getCode() != tgtok::r_paren) {
+      TokError("expected ')' in dag init");
+      return 0;
+    }
+    Lex.Lex();  // eat the ')'
+
+    return DagInit::get(Operator, OperatorName, DagArgs);
+  }
+
+  case tgtok::XHead:
+  case tgtok::XTail:
+  case tgtok::XEmpty:
+  case tgtok::XCast:  // Value ::= !unop '(' Value ')'
+  case tgtok::XConcat:
+  case tgtok::XADD:
+  case tgtok::XSRA:
+  case tgtok::XSRL:
+  case tgtok::XSHL:
+  case tgtok::XEq:
+  case tgtok::XStrConcat:   // Value ::= !binop '(' Value ',' Value ')'
+  case tgtok::XIf:
+  case tgtok::XForEach:
+  case tgtok::XSubst: {  // Value ::= !ternop '(' Value ',' Value ',' Value ')'
+    return ParseOperation(CurRec);
+  }
+  }
+
+  return R;
+}
+
+/// ParseValue - Parse a tblgen value.  This returns null on error.
+///
+///   Value       ::= SimpleValue ValueSuffix*
+///   ValueSuffix ::= '{' BitList '}'
+///   ValueSuffix ::= '[' BitList ']'
+///   ValueSuffix ::= '.' ID
+///
+Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
+  Init *Result = ParseSimpleValue(CurRec, ItemType, Mode);
+  if (Result == 0) return 0;
+
+  // Parse the suffixes now if present.
+  while (1) {
+    switch (Lex.getCode()) {
+    default: return Result;
+    case tgtok::l_brace: {
+      if (Mode == ParseNameMode || Mode == ParseForeachMode)
+        // This is the beginning of the object body.
+        return Result;
+
+      SMLoc CurlyLoc = Lex.getLoc();
+      Lex.Lex(); // eat the '{'
+      std::vector<unsigned> Ranges = ParseRangeList();
+      if (Ranges.empty()) return 0;
+
+      // Reverse the bitlist.
+      std::reverse(Ranges.begin(), Ranges.end());
+      Result = Result->convertInitializerBitRange(Ranges);
+      if (Result == 0) {
+        Error(CurlyLoc, "Invalid bit range for value");
+        return 0;
+      }
+
+      // Eat the '}'.
+      if (Lex.getCode() != tgtok::r_brace) {
+        TokError("expected '}' at end of bit range list");
+        return 0;
+      }
+      Lex.Lex();
+      break;
+    }
+    case tgtok::l_square: {
+      SMLoc SquareLoc = Lex.getLoc();
+      Lex.Lex(); // eat the '['
+      std::vector<unsigned> Ranges = ParseRangeList();
+      if (Ranges.empty()) return 0;
+
+      Result = Result->convertInitListSlice(Ranges);
+      if (Result == 0) {
+        Error(SquareLoc, "Invalid range for list slice");
+        return 0;
+      }
+
+      // Eat the ']'.
+      if (Lex.getCode() != tgtok::r_square) {
+        TokError("expected ']' at end of list slice");
+        return 0;
+      }
+      Lex.Lex();
+      break;
+    }
+    case tgtok::period:
+      if (Lex.Lex() != tgtok::Id) {  // eat the .
+        TokError("expected field identifier after '.'");
+        return 0;
+      }
+      if (!Result->getFieldType(Lex.getCurStrVal())) {
+        TokError("Cannot access field '" + Lex.getCurStrVal() + "' of value '" +
+                 Result->getAsString() + "'");
+        return 0;
+      }
+      Result = FieldInit::get(Result, Lex.getCurStrVal());
+      Lex.Lex();  // eat field name
+      break;
+
+    case tgtok::paste:
+      SMLoc PasteLoc = Lex.getLoc();
+
+      // Create a !strconcat() operation, first casting each operand to
+      // a string if necessary.
+
+      TypedInit *LHS = dyn_cast<TypedInit>(Result);
+      if (!LHS) {
+        Error(PasteLoc, "LHS of paste is not typed!");
+        return 0;
+      }
+  
+      if (LHS->getType() != StringRecTy::get()) {
+        LHS = UnOpInit::get(UnOpInit::CAST, LHS, StringRecTy::get());
+      }
+
+      TypedInit *RHS = 0;
+
+      Lex.Lex();  // Eat the '#'.
+      switch (Lex.getCode()) { 
+      case tgtok::colon:
+      case tgtok::semi:
+      case tgtok::l_brace:
+        // These are all of the tokens that can begin an object body.
+        // Some of these can also begin values but we disallow those cases
+        // because they are unlikely to be useful.
+       
+        // Trailing paste, concat with an empty string.
+        RHS = StringInit::get("");
+        break;
+
+      default:
+        Init *RHSResult = ParseValue(CurRec, ItemType, ParseNameMode);
+        RHS = dyn_cast<TypedInit>(RHSResult);
+        if (!RHS) {
+          Error(PasteLoc, "RHS of paste is not typed!");
+          return 0;
+        }
+
+        if (RHS->getType() != StringRecTy::get()) {
+          RHS = UnOpInit::get(UnOpInit::CAST, RHS, StringRecTy::get());
+        }
+  
+        break;
+      }
+
+      Result = BinOpInit::get(BinOpInit::STRCONCAT, LHS, RHS,
+                              StringRecTy::get())->Fold(CurRec, CurMultiClass);
+      break;
+    }
+  }
+}
+
+/// ParseDagArgList - Parse the argument list for a dag literal expression.
+///
+///    DagArg     ::= Value (':' VARNAME)?
+///    DagArg     ::= VARNAME
+///    DagArgList ::= DagArg
+///    DagArgList ::= DagArgList ',' DagArg
+std::vector<std::pair<llvm::Init*, std::string> >
+TGParser::ParseDagArgList(Record *CurRec) {
+  std::vector<std::pair<llvm::Init*, std::string> > Result;
+
+  while (1) {
+    // DagArg ::= VARNAME
+    if (Lex.getCode() == tgtok::VarName) {
+      // A missing value is treated like '?'.
+      Result.push_back(std::make_pair(UnsetInit::get(), Lex.getCurStrVal()));
+      Lex.Lex();
+    } else {
+      // DagArg ::= Value (':' VARNAME)?
+      Init *Val = ParseValue(CurRec);
+      if (Val == 0)
+        return std::vector<std::pair<llvm::Init*, std::string> >();
+
+      // If the variable name is present, add it.
+      std::string VarName;
+      if (Lex.getCode() == tgtok::colon) {
+        if (Lex.Lex() != tgtok::VarName) { // eat the ':'
+          TokError("expected variable name in dag literal");
+          return std::vector<std::pair<llvm::Init*, std::string> >();
+        }
+        VarName = Lex.getCurStrVal();
+        Lex.Lex();  // eat the VarName.
+      }
+
+      Result.push_back(std::make_pair(Val, VarName));
+    }
+    if (Lex.getCode() != tgtok::comma) break;
+    Lex.Lex(); // eat the ','
+  }
+
+  return Result;
+}
+
+
+/// ParseValueList - Parse a comma separated list of values, returning them as a
+/// vector.  Note that this always expects to be able to parse at least one
+/// value.  It returns an empty list if this is not possible.
+///
+///   ValueList ::= Value (',' Value)
+///
+std::vector<Init*> TGParser::ParseValueList(Record *CurRec, Record *ArgsRec,
+                                            RecTy *EltTy) {
+  std::vector<Init*> Result;
+  RecTy *ItemType = EltTy;
+  unsigned int ArgN = 0;
+  if (ArgsRec != 0 && EltTy == 0) {
+    const std::vector<Init *> &TArgs = ArgsRec->getTemplateArgs();
+    if (!TArgs.size()) {
+      TokError("template argument provided to non-template class");
+      return std::vector<Init*>();
+    }
+    const RecordVal *RV = ArgsRec->getValue(TArgs[ArgN]);
+    if (!RV) {
+      errs() << "Cannot find template arg " << ArgN << " (" << TArgs[ArgN]
+        << ")\n";
+    }
+    assert(RV && "Template argument record not found??");
+    ItemType = RV->getType();
+    ++ArgN;
+  }
+  Result.push_back(ParseValue(CurRec, ItemType));
+  if (Result.back() == 0) return std::vector<Init*>();
+
+  while (Lex.getCode() == tgtok::comma) {
+    Lex.Lex();  // Eat the comma
+
+    if (ArgsRec != 0 && EltTy == 0) {
+      const std::vector<Init *> &TArgs = ArgsRec->getTemplateArgs();
+      if (ArgN >= TArgs.size()) {
+        TokError("too many template arguments");
+        return std::vector<Init*>();
+      }
+      const RecordVal *RV = ArgsRec->getValue(TArgs[ArgN]);
+      assert(RV && "Template argument record not found??");
+      ItemType = RV->getType();
+      ++ArgN;
+    }
+    Result.push_back(ParseValue(CurRec, ItemType));
+    if (Result.back() == 0) return std::vector<Init*>();
+  }
+
+  return Result;
+}
+
+
+/// ParseDeclaration - Read a declaration, returning the name of field ID, or an
+/// empty string on error.  This can happen in a number of different context's,
+/// including within a def or in the template args for a def (which which case
+/// CurRec will be non-null) and within the template args for a multiclass (in
+/// which case CurRec will be null, but CurMultiClass will be set).  This can
+/// also happen within a def that is within a multiclass, which will set both
+/// CurRec and CurMultiClass.
+///
+///  Declaration ::= FIELD? Type ID ('=' Value)?
+///
+Init *TGParser::ParseDeclaration(Record *CurRec,
+                                       bool ParsingTemplateArgs) {
+  // Read the field prefix if present.
+  bool HasField = Lex.getCode() == tgtok::Field;
+  if (HasField) Lex.Lex();
+
+  RecTy *Type = ParseType();
+  if (Type == 0) return 0;
+
+  if (Lex.getCode() != tgtok::Id) {
+    TokError("Expected identifier in declaration");
+    return 0;
+  }
+
+  SMLoc IdLoc = Lex.getLoc();
+  Init *DeclName = StringInit::get(Lex.getCurStrVal());
+  Lex.Lex();
+
+  if (ParsingTemplateArgs) {
+    if (CurRec) {
+      DeclName = QualifyName(*CurRec, CurMultiClass, DeclName, ":");
+    } else {
+      assert(CurMultiClass);
+    }
+    if (CurMultiClass)
+      DeclName = QualifyName(CurMultiClass->Rec, CurMultiClass, DeclName,
+                             "::");
+  }
+
+  // Add the value.
+  if (AddValue(CurRec, IdLoc, RecordVal(DeclName, Type, HasField)))
+    return 0;
+
+  // If a value is present, parse it.
+  if (Lex.getCode() == tgtok::equal) {
+    Lex.Lex();
+    SMLoc ValLoc = Lex.getLoc();
+    Init *Val = ParseValue(CurRec, Type);
+    if (Val == 0 ||
+        SetValue(CurRec, ValLoc, DeclName, std::vector<unsigned>(), Val))
+      return 0;
+  }
+
+  return DeclName;
+}
+
+/// ParseForeachDeclaration - Read a foreach declaration, returning
+/// the name of the declared object or a NULL Init on error.  Return
+/// the name of the parsed initializer list through ForeachListName.
+///
+///  ForeachDeclaration ::= ID '=' '[' ValueList ']'
+///  ForeachDeclaration ::= ID '=' '{' RangeList '}'
+///  ForeachDeclaration ::= ID '=' RangePiece
+///
+VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) {
+  if (Lex.getCode() != tgtok::Id) {
+    TokError("Expected identifier in foreach declaration");
+    return 0;
+  }
+
+  Init *DeclName = StringInit::get(Lex.getCurStrVal());
+  Lex.Lex();
+
+  // If a value is present, parse it.
+  if (Lex.getCode() != tgtok::equal) {
+    TokError("Expected '=' in foreach declaration");
+    return 0;
+  }
+  Lex.Lex();  // Eat the '='
+
+  RecTy *IterType = 0;
+  std::vector<unsigned> Ranges;
+
+  switch (Lex.getCode()) {
+  default: TokError("Unknown token when expecting a range list"); return 0;
+  case tgtok::l_square: { // '[' ValueList ']'
+    Init *List = ParseSimpleValue(0, 0, ParseForeachMode);
+    ForeachListValue = dyn_cast<ListInit>(List);
+    if (ForeachListValue == 0) {
+      TokError("Expected a Value list");
+      return 0;
+    }
+    RecTy *ValueType = ForeachListValue->getType();
+    ListRecTy *ListType = dyn_cast<ListRecTy>(ValueType);
+    if (ListType == 0) {
+      TokError("Value list is not of list type");
+      return 0;
+    }
+    IterType = ListType->getElementType();
+    break;
+  }
+
+  case tgtok::IntVal: { // RangePiece.
+    if (ParseRangePiece(Ranges))
+      return 0;
+    break;
+  }
+
+  case tgtok::l_brace: { // '{' RangeList '}'
+    Lex.Lex(); // eat the '{'
+    Ranges = ParseRangeList();
+    if (Lex.getCode() != tgtok::r_brace) {
+      TokError("expected '}' at end of bit range list");
+      return 0;
+    }
+    Lex.Lex();
+    break;
+  }
+  }
+
+  if (!Ranges.empty()) {
+    assert(!IterType && "Type already initialized?");
+    IterType = IntRecTy::get();
+    std::vector<Init*> Values;
+    for (unsigned i = 0, e = Ranges.size(); i != e; ++i)
+      Values.push_back(IntInit::get(Ranges[i]));
+    ForeachListValue = ListInit::get(Values, IterType);
+  }
+
+  if (!IterType)
+    return 0;
+
+  return VarInit::get(DeclName, IterType);
+}
+
+/// ParseTemplateArgList - Read a template argument list, which is a non-empty
+/// sequence of template-declarations in <>'s.  If CurRec is non-null, these are
+/// template args for a def, which may or may not be in a multiclass.  If null,
+/// these are the template args for a multiclass.
+///
+///    TemplateArgList ::= '<' Declaration (',' Declaration)* '>'
+///
+bool TGParser::ParseTemplateArgList(Record *CurRec) {
+  assert(Lex.getCode() == tgtok::less && "Not a template arg list!");
+  Lex.Lex(); // eat the '<'
+
+  Record *TheRecToAddTo = CurRec ? CurRec : &CurMultiClass->Rec;
+
+  // Read the first declaration.
+  Init *TemplArg = ParseDeclaration(CurRec, true/*templateargs*/);
+  if (TemplArg == 0)
+    return true;
+
+  TheRecToAddTo->addTemplateArg(TemplArg);
+
+  while (Lex.getCode() == tgtok::comma) {
+    Lex.Lex(); // eat the ','
+
+    // Read the following declarations.
+    TemplArg = ParseDeclaration(CurRec, true/*templateargs*/);
+    if (TemplArg == 0)
+      return true;
+    TheRecToAddTo->addTemplateArg(TemplArg);
+  }
+
+  if (Lex.getCode() != tgtok::greater)
+    return TokError("expected '>' at end of template argument list");
+  Lex.Lex(); // eat the '>'.
+  return false;
+}
+
+
+/// ParseBodyItem - Parse a single item at within the body of a def or class.
+///
+///   BodyItem ::= Declaration ';'
+///   BodyItem ::= LET ID OptionalBitList '=' Value ';'
+bool TGParser::ParseBodyItem(Record *CurRec) {
+  if (Lex.getCode() != tgtok::Let) {
+    if (ParseDeclaration(CurRec, false) == 0)
+      return true;
+
+    if (Lex.getCode() != tgtok::semi)
+      return TokError("expected ';' after declaration");
+    Lex.Lex();
+    return false;
+  }
+
+  // LET ID OptionalRangeList '=' Value ';'
+  if (Lex.Lex() != tgtok::Id)
+    return TokError("expected field identifier after let");
+
+  SMLoc IdLoc = Lex.getLoc();
+  std::string FieldName = Lex.getCurStrVal();
+  Lex.Lex();  // eat the field name.
+
+  std::vector<unsigned> BitList;
+  if (ParseOptionalBitList(BitList))
+    return true;
+  std::reverse(BitList.begin(), BitList.end());
+
+  if (Lex.getCode() != tgtok::equal)
+    return TokError("expected '=' in let expression");
+  Lex.Lex();  // eat the '='.
+
+  RecordVal *Field = CurRec->getValue(FieldName);
+  if (Field == 0)
+    return TokError("Value '" + FieldName + "' unknown!");
+
+  RecTy *Type = Field->getType();
+
+  Init *Val = ParseValue(CurRec, Type);
+  if (Val == 0) return true;
+
+  if (Lex.getCode() != tgtok::semi)
+    return TokError("expected ';' after let expression");
+  Lex.Lex();
+
+  return SetValue(CurRec, IdLoc, FieldName, BitList, Val);
+}
+
+/// ParseBody - Read the body of a class or def.  Return true on error, false on
+/// success.
+///
+///   Body     ::= ';'
+///   Body     ::= '{' BodyList '}'
+///   BodyList BodyItem*
+///
+bool TGParser::ParseBody(Record *CurRec) {
+  // If this is a null definition, just eat the semi and return.
+  if (Lex.getCode() == tgtok::semi) {
+    Lex.Lex();
+    return false;
+  }
+
+  if (Lex.getCode() != tgtok::l_brace)
+    return TokError("Expected ';' or '{' to start body");
+  // Eat the '{'.
+  Lex.Lex();
+
+  while (Lex.getCode() != tgtok::r_brace)
+    if (ParseBodyItem(CurRec))
+      return true;
+
+  // Eat the '}'.
+  Lex.Lex();
+  return false;
+}
+
+/// \brief Apply the current let bindings to \a CurRec.
+/// \returns true on error, false otherwise.
+bool TGParser::ApplyLetStack(Record *CurRec) {
+  for (unsigned i = 0, e = LetStack.size(); i != e; ++i)
+    for (unsigned j = 0, e = LetStack[i].size(); j != e; ++j)
+      if (SetValue(CurRec, LetStack[i][j].Loc, LetStack[i][j].Name,
+                   LetStack[i][j].Bits, LetStack[i][j].Value))
+        return true;
+  return false;
+}
+
+/// ParseObjectBody - Parse the body of a def or class.  This consists of an
+/// optional ClassList followed by a Body.  CurRec is the current def or class
+/// that is being parsed.
+///
+///   ObjectBody      ::= BaseClassList Body
+///   BaseClassList   ::= /*empty*/
+///   BaseClassList   ::= ':' BaseClassListNE
+///   BaseClassListNE ::= SubClassRef (',' SubClassRef)*
+///
+bool TGParser::ParseObjectBody(Record *CurRec) {
+  // If there is a baseclass list, read it.
+  if (Lex.getCode() == tgtok::colon) {
+    Lex.Lex();
+
+    // Read all of the subclasses.
+    SubClassReference SubClass = ParseSubClassReference(CurRec, false);
+    while (1) {
+      // Check for error.
+      if (SubClass.Rec == 0) return true;
+
+      // Add it.
+      if (AddSubClass(CurRec, SubClass))
+        return true;
+
+      if (Lex.getCode() != tgtok::comma) break;
+      Lex.Lex(); // eat ','.
+      SubClass = ParseSubClassReference(CurRec, false);
+    }
+  }
+
+  if (ApplyLetStack(CurRec))
+    return true;
+
+  return ParseBody(CurRec);
+}
+
+/// ParseDef - Parse and return a top level or multiclass def, return the record
+/// corresponding to it.  This returns null on error.
+///
+///   DefInst ::= DEF ObjectName ObjectBody
+///
+bool TGParser::ParseDef(MultiClass *CurMultiClass) {
+  SMLoc DefLoc = Lex.getLoc();
+  assert(Lex.getCode() == tgtok::Def && "Unknown tok");
+  Lex.Lex();  // Eat the 'def' token.
+
+  // Parse ObjectName and make a record for it.
+  Record *CurRec;
+  Init *Name = ParseObjectName(CurMultiClass);
+  if (Name)
+    CurRec = new Record(Name, DefLoc, Records);
+  else
+    CurRec = new Record(GetNewAnonymousName(), DefLoc, Records,
+                        /*IsAnonymous=*/true);
+
+  if (!CurMultiClass && Loops.empty()) {
+    // Top-level def definition.
+
+    // Ensure redefinition doesn't happen.
+    if (Records.getDef(CurRec->getNameInitAsString())) {
+      Error(DefLoc, "def '" + CurRec->getNameInitAsString()
+            + "' already defined");
+      return true;
+    }
+    Records.addDef(CurRec);
+  } else if (CurMultiClass) {
+    // Otherwise, a def inside a multiclass, add it to the multiclass.
+    for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size(); i != e; ++i)
+      if (CurMultiClass->DefPrototypes[i]->getNameInit()
+          == CurRec->getNameInit()) {
+        Error(DefLoc, "def '" + CurRec->getNameInitAsString() +
+              "' already defined in this multiclass!");
+        return true;
+      }
+    CurMultiClass->DefPrototypes.push_back(CurRec);
+  }
+
+  if (ParseObjectBody(CurRec))
+    return true;
+
+  if (CurMultiClass == 0)  // Def's in multiclasses aren't really defs.
+    // See Record::setName().  This resolve step will see any new name
+    // for the def that might have been created when resolving
+    // inheritance, values and arguments above.
+    CurRec->resolveReferences();
+
+  // If ObjectBody has template arguments, it's an error.
+  assert(CurRec->getTemplateArgs().empty() && "How'd this get template args?");
+
+  if (CurMultiClass) {
+    // Copy the template arguments for the multiclass into the def.
+    const std::vector<Init *> &TArgs =
+                                CurMultiClass->Rec.getTemplateArgs();
+
+    for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+      const RecordVal *RV = CurMultiClass->Rec.getValue(TArgs[i]);
+      assert(RV && "Template arg doesn't exist?");
+      CurRec->addValue(*RV);
+    }
+  }
+
+  if (ProcessForeachDefs(CurRec, DefLoc)) {
+    Error(DefLoc,
+          "Could not process loops for def" + CurRec->getNameInitAsString());
+    return true;
+  }
+
+  return false;
+}
+
+/// ParseForeach - Parse a for statement.  Return the record corresponding
+/// to it.  This returns true on error.
+///
+///   Foreach ::= FOREACH Declaration IN '{ ObjectList '}'
+///   Foreach ::= FOREACH Declaration IN Object
+///
+bool TGParser::ParseForeach(MultiClass *CurMultiClass) {
+  assert(Lex.getCode() == tgtok::Foreach && "Unknown tok");
+  Lex.Lex();  // Eat the 'for' token.
+
+  // Make a temporary object to record items associated with the for
+  // loop.
+  ListInit *ListValue = 0;
+  VarInit *IterName = ParseForeachDeclaration(ListValue);
+  if (IterName == 0)
+    return TokError("expected declaration in for");
+
+  if (Lex.getCode() != tgtok::In)
+    return TokError("Unknown tok");
+  Lex.Lex();  // Eat the in
+
+  // Create a loop object and remember it.
+  Loops.push_back(ForeachLoop(IterName, ListValue));
+
+  if (Lex.getCode() != tgtok::l_brace) {
+    // FOREACH Declaration IN Object
+    if (ParseObject(CurMultiClass))
+      return true;
+  }
+  else {
+    SMLoc BraceLoc = Lex.getLoc();
+    // Otherwise, this is a group foreach.
+    Lex.Lex();  // eat the '{'.
+
+    // Parse the object list.
+    if (ParseObjectList(CurMultiClass))
+      return true;
+
+    if (Lex.getCode() != tgtok::r_brace) {
+      TokError("expected '}' at end of foreach command");
+      return Error(BraceLoc, "to match this '{'");
+    }
+    Lex.Lex();  // Eat the }
+  }
+
+  // We've processed everything in this loop.
+  Loops.pop_back();
+
+  return false;
+}
+
+/// ParseClass - Parse a tblgen class definition.
+///
+///   ClassInst ::= CLASS ID TemplateArgList? ObjectBody
+///
+bool TGParser::ParseClass() {
+  assert(Lex.getCode() == tgtok::Class && "Unexpected token!");
+  Lex.Lex();
+
+  if (Lex.getCode() != tgtok::Id)
+    return TokError("expected class name after 'class' keyword");
+
+  Record *CurRec = Records.getClass(Lex.getCurStrVal());
+  if (CurRec) {
+    // If the body was previously defined, this is an error.
+    if (CurRec->getValues().size() > 1 ||  // Account for NAME.
+        !CurRec->getSuperClasses().empty() ||
+        !CurRec->getTemplateArgs().empty())
+      return TokError("Class '" + CurRec->getNameInitAsString()
+                      + "' already defined");
+  } else {
+    // If this is the first reference to this class, create and add it.
+    CurRec = new Record(Lex.getCurStrVal(), Lex.getLoc(), Records);
+    Records.addClass(CurRec);
+  }
+  Lex.Lex(); // eat the name.
+
+  // If there are template args, parse them.
+  if (Lex.getCode() == tgtok::less)
+    if (ParseTemplateArgList(CurRec))
+      return true;
+
+  // Finally, parse the object body.
+  return ParseObjectBody(CurRec);
+}
+
+/// ParseLetList - Parse a non-empty list of assignment expressions into a list
+/// of LetRecords.
+///
+///   LetList ::= LetItem (',' LetItem)*
+///   LetItem ::= ID OptionalRangeList '=' Value
+///
+std::vector<LetRecord> TGParser::ParseLetList() {
+  std::vector<LetRecord> Result;
+
+  while (1) {
+    if (Lex.getCode() != tgtok::Id) {
+      TokError("expected identifier in let definition");
+      return std::vector<LetRecord>();
+    }
+    std::string Name = Lex.getCurStrVal();
+    SMLoc NameLoc = Lex.getLoc();
+    Lex.Lex();  // Eat the identifier.
+
+    // Check for an optional RangeList.
+    std::vector<unsigned> Bits;
+    if (ParseOptionalRangeList(Bits))
+      return std::vector<LetRecord>();
+    std::reverse(Bits.begin(), Bits.end());
+
+    if (Lex.getCode() != tgtok::equal) {
+      TokError("expected '=' in let expression");
+      return std::vector<LetRecord>();
+    }
+    Lex.Lex();  // eat the '='.
+
+    Init *Val = ParseValue(0);
+    if (Val == 0) return std::vector<LetRecord>();
+
+    // Now that we have everything, add the record.
+    Result.push_back(LetRecord(Name, Bits, Val, NameLoc));
+
+    if (Lex.getCode() != tgtok::comma)
+      return Result;
+    Lex.Lex();  // eat the comma.
+  }
+}
+
+/// ParseTopLevelLet - Parse a 'let' at top level.  This can be a couple of
+/// different related productions. This works inside multiclasses too.
+///
+///   Object ::= LET LetList IN '{' ObjectList '}'
+///   Object ::= LET LetList IN Object
+///
+bool TGParser::ParseTopLevelLet(MultiClass *CurMultiClass) {
+  assert(Lex.getCode() == tgtok::Let && "Unexpected token");
+  Lex.Lex();
+
+  // Add this entry to the let stack.
+  std::vector<LetRecord> LetInfo = ParseLetList();
+  if (LetInfo.empty()) return true;
+  LetStack.push_back(LetInfo);
+
+  if (Lex.getCode() != tgtok::In)
+    return TokError("expected 'in' at end of top-level 'let'");
+  Lex.Lex();
+
+  // If this is a scalar let, just handle it now
+  if (Lex.getCode() != tgtok::l_brace) {
+    // LET LetList IN Object
+    if (ParseObject(CurMultiClass))
+      return true;
+  } else {   // Object ::= LETCommand '{' ObjectList '}'
+    SMLoc BraceLoc = Lex.getLoc();
+    // Otherwise, this is a group let.
+    Lex.Lex();  // eat the '{'.
+
+    // Parse the object list.
+    if (ParseObjectList(CurMultiClass))
+      return true;
+
+    if (Lex.getCode() != tgtok::r_brace) {
+      TokError("expected '}' at end of top level let command");
+      return Error(BraceLoc, "to match this '{'");
+    }
+    Lex.Lex();
+  }
+
+  // Outside this let scope, this let block is not active.
+  LetStack.pop_back();
+  return false;
+}
+
+/// ParseMultiClass - Parse a multiclass definition.
+///
+///  MultiClassInst ::= MULTICLASS ID TemplateArgList?
+///                     ':' BaseMultiClassList '{' MultiClassObject+ '}'
+///  MultiClassObject ::= DefInst
+///  MultiClassObject ::= MultiClassInst
+///  MultiClassObject ::= DefMInst
+///  MultiClassObject ::= LETCommand '{' ObjectList '}'
+///  MultiClassObject ::= LETCommand Object
+///
+bool TGParser::ParseMultiClass() {
+  assert(Lex.getCode() == tgtok::MultiClass && "Unexpected token");
+  Lex.Lex();  // Eat the multiclass token.
+
+  if (Lex.getCode() != tgtok::Id)
+    return TokError("expected identifier after multiclass for name");
+  std::string Name = Lex.getCurStrVal();
+
+  if (MultiClasses.count(Name))
+    return TokError("multiclass '" + Name + "' already defined");
+
+  CurMultiClass = MultiClasses[Name] = new MultiClass(Name, 
+                                                      Lex.getLoc(), Records);
+  Lex.Lex();  // Eat the identifier.
+
+  // If there are template args, parse them.
+  if (Lex.getCode() == tgtok::less)
+    if (ParseTemplateArgList(0))
+      return true;
+
+  bool inherits = false;
+
+  // If there are submulticlasses, parse them.
+  if (Lex.getCode() == tgtok::colon) {
+    inherits = true;
+
+    Lex.Lex();
+
+    // Read all of the submulticlasses.
+    SubMultiClassReference SubMultiClass =
+      ParseSubMultiClassReference(CurMultiClass);
+    while (1) {
+      // Check for error.
+      if (SubMultiClass.MC == 0) return true;
+
+      // Add it.
+      if (AddSubMultiClass(CurMultiClass, SubMultiClass))
+        return true;
+
+      if (Lex.getCode() != tgtok::comma) break;
+      Lex.Lex(); // eat ','.
+      SubMultiClass = ParseSubMultiClassReference(CurMultiClass);
+    }
+  }
+
+  if (Lex.getCode() != tgtok::l_brace) {
+    if (!inherits)
+      return TokError("expected '{' in multiclass definition");
+    else if (Lex.getCode() != tgtok::semi)
+      return TokError("expected ';' in multiclass definition");
+    else
+      Lex.Lex();  // eat the ';'.
+  } else {
+    if (Lex.Lex() == tgtok::r_brace)  // eat the '{'.
+      return TokError("multiclass must contain at least one def");
+
+    while (Lex.getCode() != tgtok::r_brace) {
+      switch (Lex.getCode()) {
+        default:
+          return TokError("expected 'let', 'def' or 'defm' in multiclass body");
+        case tgtok::Let:
+        case tgtok::Def:
+        case tgtok::Defm:
+        case tgtok::Foreach:
+          if (ParseObject(CurMultiClass))
+            return true;
+         break;
+      }
+    }
+    Lex.Lex();  // eat the '}'.
+  }
+
+  CurMultiClass = 0;
+  return false;
+}
+
+Record *TGParser::
+InstantiateMulticlassDef(MultiClass &MC,
+                         Record *DefProto,
+                         Init *DefmPrefix,
+                         SMRange DefmPrefixRange) {
+  // We need to preserve DefProto so it can be reused for later
+  // instantiations, so create a new Record to inherit from it.
+
+  // Add in the defm name.  If the defm prefix is empty, give each
+  // instantiated def a unique name.  Otherwise, if "#NAME#" exists in the
+  // name, substitute the prefix for #NAME#.  Otherwise, use the defm name
+  // as a prefix.
+
+  bool IsAnonymous = false;
+  if (DefmPrefix == 0) {
+    DefmPrefix = StringInit::get(GetNewAnonymousName());
+    IsAnonymous = true;
+  }
+
+  Init *DefName = DefProto->getNameInit();
+
+  StringInit *DefNameString = dyn_cast<StringInit>(DefName);
+
+  if (DefNameString != 0) {
+    // We have a fully expanded string so there are no operators to
+    // resolve.  We should concatenate the given prefix and name.
+    DefName =
+      BinOpInit::get(BinOpInit::STRCONCAT,
+                     UnOpInit::get(UnOpInit::CAST, DefmPrefix,
+                                   StringRecTy::get())->Fold(DefProto, &MC),
+                     DefName, StringRecTy::get())->Fold(DefProto, &MC);
+  }
+
+  // Make a trail of SMLocs from the multiclass instantiations.
+  SmallVector<SMLoc, 4> Locs(1, DefmPrefixRange.Start);
+  Locs.append(DefProto->getLoc().begin(), DefProto->getLoc().end());
+  Record *CurRec = new Record(DefName, Locs, Records, IsAnonymous);
+
+  SubClassReference Ref;
+  Ref.RefRange = DefmPrefixRange;
+  Ref.Rec = DefProto;
+  AddSubClass(CurRec, Ref);
+
+  // Set the value for NAME. We don't resolve references to it 'til later,
+  // though, so that uses in nested multiclass names don't get
+  // confused.
+  if (SetValue(CurRec, Ref.RefRange.Start, "NAME", std::vector<unsigned>(),
+               DefmPrefix)) {
+    Error(DefmPrefixRange.Start, "Could not resolve "
+          + CurRec->getNameInitAsString() + ":NAME to '"
+          + DefmPrefix->getAsUnquotedString() + "'");
+    return 0;
+  }
+
+  // If the DefNameString didn't resolve, we probably have a reference to
+  // NAME and need to replace it. We need to do at least this much greedily,
+  // otherwise nested multiclasses will end up with incorrect NAME expansions.
+  if (DefNameString == 0) {
+    RecordVal *DefNameRV = CurRec->getValue("NAME");
+    CurRec->resolveReferencesTo(DefNameRV);
+  }
+
+  if (!CurMultiClass) {
+    // Now that we're at the top level, resolve all NAME references
+    // in the resultant defs that weren't in the def names themselves.
+    RecordVal *DefNameRV = CurRec->getValue("NAME");
+    CurRec->resolveReferencesTo(DefNameRV);
+
+    // Now that NAME references are resolved and we're at the top level of
+    // any multiclass expansions, add the record to the RecordKeeper. If we are
+    // currently in a multiclass, it means this defm appears inside a
+    // multiclass and its name won't be fully resolvable until we see
+    // the top-level defm.  Therefore, we don't add this to the
+    // RecordKeeper at this point.  If we did we could get duplicate
+    // defs as more than one probably refers to NAME or some other
+    // common internal placeholder.
+
+    // Ensure redefinition doesn't happen.
+    if (Records.getDef(CurRec->getNameInitAsString())) {
+      Error(DefmPrefixRange.Start, "def '" + CurRec->getNameInitAsString() +
+            "' already defined, instantiating defm with subdef '" + 
+            DefProto->getNameInitAsString() + "'");
+      return 0;
+    }
+
+    Records.addDef(CurRec);
+  }
+
+  return CurRec;
+}
+
+bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC,
+                                        Record *CurRec,
+                                        SMLoc DefmPrefixLoc,
+                                        SMLoc SubClassLoc,
+                                        const std::vector<Init *> &TArgs,
+                                        std::vector<Init *> &TemplateVals,
+                                        bool DeleteArgs) {
+  // Loop over all of the template arguments, setting them to the specified
+  // value or leaving them as the default if necessary.
+  for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+    // Check if a value is specified for this temp-arg.
+    if (i < TemplateVals.size()) {
+      // Set it now.
+      if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], std::vector<unsigned>(),
+                   TemplateVals[i]))
+        return true;
+        
+      // Resolve it next.
+      CurRec->resolveReferencesTo(CurRec->getValue(TArgs[i]));
+
+      if (DeleteArgs)
+        // Now remove it.
+        CurRec->removeValue(TArgs[i]);
+        
+    } else if (!CurRec->getValue(TArgs[i])->getValue()->isComplete()) {
+      return Error(SubClassLoc, "value not specified for template argument #"+
+                   utostr(i) + " (" + TArgs[i]->getAsUnquotedString()
+                   + ") of multiclassclass '" + MC.Rec.getNameInitAsString()
+                   + "'");
+    }
+  }
+  return false;
+}
+
+bool TGParser::ResolveMulticlassDef(MultiClass &MC,
+                                    Record *CurRec,
+                                    Record *DefProto,
+                                    SMLoc DefmPrefixLoc) {
+  // If the mdef is inside a 'let' expression, add to each def.
+  if (ApplyLetStack(CurRec))
+    return Error(DefmPrefixLoc, "when instantiating this defm");
+
+  // Don't create a top level definition for defm inside multiclasses,
+  // instead, only update the prototypes and bind the template args
+  // with the new created definition.
+  if (!CurMultiClass)
+    return false;
+  for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size();
+       i != e; ++i)
+    if (CurMultiClass->DefPrototypes[i]->getNameInit()
+        == CurRec->getNameInit())
+      return Error(DefmPrefixLoc, "defm '" + CurRec->getNameInitAsString() +
+                   "' already defined in this multiclass!");
+  CurMultiClass->DefPrototypes.push_back(CurRec);
+
+  // Copy the template arguments for the multiclass into the new def.
+  const std::vector<Init *> &TA =
+    CurMultiClass->Rec.getTemplateArgs();
+
+  for (unsigned i = 0, e = TA.size(); i != e; ++i) {
+    const RecordVal *RV = CurMultiClass->Rec.getValue(TA[i]);
+    assert(RV && "Template arg doesn't exist?");
+    CurRec->addValue(*RV);
+  }
+
+  return false;
+}
+
+/// ParseDefm - Parse the instantiation of a multiclass.
+///
+///   DefMInst ::= DEFM ID ':' DefmSubClassRef ';'
+///
+bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
+  assert(Lex.getCode() == tgtok::Defm && "Unexpected token!");
+  SMLoc DefmLoc = Lex.getLoc();
+  Init *DefmPrefix = 0;
+
+  if (Lex.Lex() == tgtok::Id) {  // eat the defm.
+    DefmPrefix = ParseObjectName(CurMultiClass);
+  }
+
+  SMLoc DefmPrefixEndLoc = Lex.getLoc();
+  if (Lex.getCode() != tgtok::colon)
+    return TokError("expected ':' after defm identifier");
+
+  // Keep track of the new generated record definitions.
+  std::vector<Record*> NewRecDefs;
+
+  // This record also inherits from a regular class (non-multiclass)?
+  bool InheritFromClass = false;
+
+  // eat the colon.
+  Lex.Lex();
+
+  SMLoc SubClassLoc = Lex.getLoc();
+  SubClassReference Ref = ParseSubClassReference(0, true);
+
+  while (1) {
+    if (Ref.Rec == 0) return true;
+
+    // To instantiate a multiclass, we need to first get the multiclass, then
+    // instantiate each def contained in the multiclass with the SubClassRef
+    // template parameters.
+    MultiClass *MC = MultiClasses[Ref.Rec->getName()];
+    assert(MC && "Didn't lookup multiclass correctly?");
+    std::vector<Init*> &TemplateVals = Ref.TemplateArgs;
+
+    // Verify that the correct number of template arguments were specified.
+    const std::vector<Init *> &TArgs = MC->Rec.getTemplateArgs();
+    if (TArgs.size() < TemplateVals.size())
+      return Error(SubClassLoc,
+                   "more template args specified than multiclass expects");
+
+    // Loop over all the def's in the multiclass, instantiating each one.
+    for (unsigned i = 0, e = MC->DefPrototypes.size(); i != e; ++i) {
+      Record *DefProto = MC->DefPrototypes[i];
+
+      Record *CurRec = InstantiateMulticlassDef(*MC, DefProto, DefmPrefix,
+                                                SMRange(DefmLoc,
+                                                        DefmPrefixEndLoc));
+      if (!CurRec)
+        return true;
+
+      if (ResolveMulticlassDefArgs(*MC, CurRec, DefmLoc, SubClassLoc,
+                                   TArgs, TemplateVals, true/*Delete args*/))
+        return Error(SubClassLoc, "could not instantiate def");
+
+      if (ResolveMulticlassDef(*MC, CurRec, DefProto, DefmLoc))
+        return Error(SubClassLoc, "could not instantiate def");
+
+      NewRecDefs.push_back(CurRec);
+    }
+
+
+    if (Lex.getCode() != tgtok::comma) break;
+    Lex.Lex(); // eat ','.
+
+    SubClassLoc = Lex.getLoc();
+
+    // A defm can inherit from regular classes (non-multiclass) as
+    // long as they come in the end of the inheritance list.
+    InheritFromClass = (Records.getClass(Lex.getCurStrVal()) != 0);
+
+    if (InheritFromClass)
+      break;
+
+    Ref = ParseSubClassReference(0, true);
+  }
+
+  if (InheritFromClass) {
+    // Process all the classes to inherit as if they were part of a
+    // regular 'def' and inherit all record values.
+    SubClassReference SubClass = ParseSubClassReference(0, false);
+    while (1) {
+      // Check for error.
+      if (SubClass.Rec == 0) return true;
+
+      // Get the expanded definition prototypes and teach them about
+      // the record values the current class to inherit has
+      for (unsigned i = 0, e = NewRecDefs.size(); i != e; ++i) {
+        Record *CurRec = NewRecDefs[i];
+
+        // Add it.
+        if (AddSubClass(CurRec, SubClass))
+          return true;
+
+        if (ApplyLetStack(CurRec))
+          return true;
+      }
+
+      if (Lex.getCode() != tgtok::comma) break;
+      Lex.Lex(); // eat ','.
+      SubClass = ParseSubClassReference(0, false);
+    }
+  }
+
+  if (!CurMultiClass)
+    for (unsigned i = 0, e = NewRecDefs.size(); i != e; ++i)
+      // See Record::setName().  This resolve step will see any new
+      // name for the def that might have been created when resolving
+      // inheritance, values and arguments above.
+      NewRecDefs[i]->resolveReferences();
+
+  if (Lex.getCode() != tgtok::semi)
+    return TokError("expected ';' at end of defm");
+  Lex.Lex();
+
+  return false;
+}
+
+/// ParseObject
+///   Object ::= ClassInst
+///   Object ::= DefInst
+///   Object ::= MultiClassInst
+///   Object ::= DefMInst
+///   Object ::= LETCommand '{' ObjectList '}'
+///   Object ::= LETCommand Object
+bool TGParser::ParseObject(MultiClass *MC) {
+  switch (Lex.getCode()) {
+  default:
+    return TokError("Expected class, def, defm, multiclass or let definition");
+  case tgtok::Let:   return ParseTopLevelLet(MC);
+  case tgtok::Def:   return ParseDef(MC);
+  case tgtok::Foreach:   return ParseForeach(MC);
+  case tgtok::Defm:  return ParseDefm(MC);
+  case tgtok::Class: return ParseClass();
+  case tgtok::MultiClass: return ParseMultiClass();
+  }
+}
+
+/// ParseObjectList
+///   ObjectList :== Object*
+bool TGParser::ParseObjectList(MultiClass *MC) {
+  while (isObjectStart(Lex.getCode())) {
+    if (ParseObject(MC))
+      return true;
+  }
+  return false;
+}
+
+bool TGParser::ParseFile() {
+  Lex.Lex(); // Prime the lexer.
+  if (ParseObjectList()) return true;
+
+  // If we have unread input at the end of the file, report it.
+  if (Lex.getCode() == tgtok::Eof)
+    return false;
+
+  return TokError("Unexpected input at top level");
+}
+
diff --git a/contrib/llvm/lib/TableGen/TGParser.h b/contrib/llvm/lib/TableGen/TGParser.h
new file mode 100644
index 000000000000..044e3a02ba4b
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/TGParser.h
@@ -0,0 +1,191 @@
+//===- TGParser.h - Parser for TableGen Files -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class represents the Parser for tablegen files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TGPARSER_H
+#define TGPARSER_H
+
+#include "TGLexer.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include <map>
+
+namespace llvm {
+  class Record;
+  class RecordVal;
+  class RecordKeeper;
+  class RecTy;
+  class Init;
+  struct MultiClass;
+  struct SubClassReference;
+  struct SubMultiClassReference;
+
+  struct LetRecord {
+    std::string Name;
+    std::vector<unsigned> Bits;
+    Init *Value;
+    SMLoc Loc;
+    LetRecord(const std::string &N, const std::vector<unsigned> &B, Init *V,
+              SMLoc L)
+      : Name(N), Bits(B), Value(V), Loc(L) {
+    }
+  };
+
+  /// ForeachLoop - Record the iteration state associated with a for loop.
+  /// This is used to instantiate items in the loop body.
+  struct ForeachLoop {
+    VarInit *IterVar;
+    ListInit *ListValue;
+
+    ForeachLoop(VarInit *IVar, ListInit *LValue)
+      : IterVar(IVar), ListValue(LValue) {}
+  };
+
+class TGParser {
+  TGLexer Lex;
+  std::vector<std::vector<LetRecord> > LetStack;
+  std::map<std::string, MultiClass*> MultiClasses;
+
+  /// Loops - Keep track of any foreach loops we are within.
+  ///
+  typedef std::vector<ForeachLoop> LoopVector;
+  LoopVector Loops;
+
+  /// CurMultiClass - If we are parsing a 'multiclass' definition, this is the
+  /// current value.
+  MultiClass *CurMultiClass;
+
+  // Record tracker
+  RecordKeeper &Records;
+
+  // A "named boolean" indicating how to parse identifiers.  Usually
+  // identifiers map to some existing object but in special cases
+  // (e.g. parsing def names) no such object exists yet because we are
+  // in the middle of creating in.  For those situations, allow the
+  // parser to ignore missing object errors.
+  enum IDParseMode {
+    ParseValueMode,   // We are parsing a value we expect to look up.
+    ParseNameMode,    // We are parsing a name of an object that does not yet
+                      // exist.
+    ParseForeachMode  // We are parsing a foreach init.
+  };
+
+public:
+  TGParser(SourceMgr &SrcMgr, RecordKeeper &records) :
+    Lex(SrcMgr), CurMultiClass(0), Records(records) {}
+
+  /// ParseFile - Main entrypoint for parsing a tblgen file.  These parser
+  /// routines return true on error, or false on success.
+  bool ParseFile();
+
+  bool Error(SMLoc L, const Twine &Msg) const {
+    PrintError(L, Msg);
+    return true;
+  }
+  bool TokError(const Twine &Msg) const {
+    return Error(Lex.getLoc(), Msg);
+  }
+  const TGLexer::DependenciesMapTy &getDependencies() const {
+    return Lex.getDependencies();
+  }
+
+private:  // Semantic analysis methods.
+  bool AddValue(Record *TheRec, SMLoc Loc, const RecordVal &RV);
+  bool SetValue(Record *TheRec, SMLoc Loc, Init *ValName,
+                const std::vector<unsigned> &BitList, Init *V);
+  bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName,
+                const std::vector<unsigned> &BitList, Init *V) {
+    return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V);
+  }
+  bool AddSubClass(Record *Rec, SubClassReference &SubClass);
+  bool AddSubMultiClass(MultiClass *CurMC,
+                        SubMultiClassReference &SubMultiClass);
+
+  // IterRecord: Map an iterator name to a value.
+  struct IterRecord {
+    VarInit *IterVar;
+    Init *IterValue;
+    IterRecord(VarInit *Var, Init *Val) : IterVar(Var), IterValue(Val) {}
+  };
+
+  // IterSet: The set of all iterator values at some point in the
+  // iteration space.
+  typedef std::vector<IterRecord> IterSet;
+
+  bool ProcessForeachDefs(Record *CurRec, SMLoc Loc);
+  bool ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals);
+
+private:  // Parser methods.
+  bool ParseObjectList(MultiClass *MC = 0);
+  bool ParseObject(MultiClass *MC);
+  bool ParseClass();
+  bool ParseMultiClass();
+  Record *InstantiateMulticlassDef(MultiClass &MC,
+                                   Record *DefProto,
+                                   Init *DefmPrefix,
+                                   SMRange DefmPrefixRange);
+  bool ResolveMulticlassDefArgs(MultiClass &MC,
+                                Record *DefProto,
+                                SMLoc DefmPrefixLoc,
+                                SMLoc SubClassLoc,
+                                const std::vector<Init *> &TArgs,
+                                std::vector<Init *> &TemplateVals,
+                                bool DeleteArgs);
+  bool ResolveMulticlassDef(MultiClass &MC,
+                            Record *CurRec,
+                            Record *DefProto,
+                            SMLoc DefmPrefixLoc);
+  bool ParseDefm(MultiClass *CurMultiClass);
+  bool ParseDef(MultiClass *CurMultiClass);
+  bool ParseForeach(MultiClass *CurMultiClass);
+  bool ParseTopLevelLet(MultiClass *CurMultiClass);
+  std::vector<LetRecord> ParseLetList();
+
+  bool ParseObjectBody(Record *CurRec);
+  bool ParseBody(Record *CurRec);
+  bool ParseBodyItem(Record *CurRec);
+
+  bool ParseTemplateArgList(Record *CurRec);
+  Init *ParseDeclaration(Record *CurRec, bool ParsingTemplateArgs);
+  VarInit *ParseForeachDeclaration(ListInit *&ForeachListValue);
+
+  SubClassReference ParseSubClassReference(Record *CurRec, bool isDefm);
+  SubMultiClassReference ParseSubMultiClassReference(MultiClass *CurMC);
+
+  Init *ParseIDValue(Record *CurRec, IDParseMode Mode = ParseValueMode);
+  Init *ParseIDValue(Record *CurRec, const std::string &Name, SMLoc NameLoc,
+                     IDParseMode Mode = ParseValueMode);
+  Init *ParseSimpleValue(Record *CurRec, RecTy *ItemType = 0,
+                         IDParseMode Mode = ParseValueMode);
+  Init *ParseValue(Record *CurRec, RecTy *ItemType = 0,
+                   IDParseMode Mode = ParseValueMode);
+  std::vector<Init*> ParseValueList(Record *CurRec, Record *ArgsRec = 0,
+                                    RecTy *EltTy = 0);
+  std::vector<std::pair<llvm::Init*, std::string> > ParseDagArgList(Record *);
+  bool ParseOptionalRangeList(std::vector<unsigned> &Ranges);
+  bool ParseOptionalBitList(std::vector<unsigned> &Ranges);
+  std::vector<unsigned> ParseRangeList();
+  bool ParseRangePiece(std::vector<unsigned> &Ranges);
+  RecTy *ParseType();
+  Init *ParseOperation(Record *CurRec);
+  RecTy *ParseOperatorType();
+  Init *ParseObjectName(MultiClass *CurMultiClass);
+  Record *ParseClassID();
+  MultiClass *ParseMultiClassID();
+  bool ApplyLetStack(Record *CurRec);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/TableGen/TableGenBackend.cpp b/contrib/llvm/lib/TableGen/TableGenBackend.cpp
new file mode 100644
index 000000000000..79d567753a6c
--- /dev/null
+++ b/contrib/llvm/lib/TableGen/TableGenBackend.cpp
@@ -0,0 +1,56 @@
+//===- TableGenBackend.cpp - Utilities for TableGen Backends ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides useful services for TableGen backends...
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <algorithm>
+
+using namespace llvm;
+
+const size_t MAX_LINE_LEN = 80U;
+
+static void printLine(raw_ostream &OS, const Twine &Prefix, char Fill,
+                      StringRef Suffix) {
+  size_t Pos = (size_t)OS.tell();
+  assert((MAX_LINE_LEN - Prefix.str().size() - Suffix.size() > 0) &&
+    "header line exceeds max limit");
+  OS << Prefix;
+  const size_t e = MAX_LINE_LEN - Suffix.size();
+  for (size_t i = (size_t)OS.tell() - Pos; i < e; ++i)
+    OS << Fill;
+  OS << Suffix << '\n';
+}
+
+void llvm::emitSourceFileHeader(StringRef Desc, raw_ostream &OS) {
+  printLine(OS, "/*===- TableGen'erated file ", '-', "*- C++ -*-===*\\");
+  printLine(OS, "|*", ' ', "*|");
+  size_t Pos = 0U;
+  size_t PosE;
+  StringRef Prefix("|*");
+  StringRef Suffix(" *|");
+  do{
+    size_t PSLen = Suffix.size() + Prefix.size();
+    PosE = Pos + ((MAX_LINE_LEN > (Desc.size() - PSLen)) ?
+      Desc.size() :
+      MAX_LINE_LEN - PSLen);
+    printLine(OS, Prefix + Desc.slice(Pos, PosE), ' ', Suffix);
+    Pos = PosE;
+  } while(Pos < Desc.size());
+  printLine(OS, Prefix, ' ', Suffix);
+  printLine(OS, Prefix + " Automatically generated file, do not edit!", ' ',
+    Suffix);
+  printLine(OS, Prefix, ' ', Suffix);
+  printLine(OS, "\\*===", '-', "===*/");
+  OS << '\n';
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h
new file mode 100644
index 000000000000..4de4faa58182
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.h
@@ -0,0 +1,42 @@
+//==-- AArch64.h - Top-level interface for AArch64 representation -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// AArch64 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AARCH64_H
+#define LLVM_TARGET_AARCH64_H
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AArch64AsmPrinter;
+class FunctionPass;
+class AArch64TargetMachine;
+class MachineInstr;
+class MCInst;
+
+FunctionPass *createAArch64ISelDAG(AArch64TargetMachine &TM,
+                                   CodeGenOpt::Level OptLevel);
+
+FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
+
+FunctionPass *createAArch64BranchFixupPass();
+
+void LowerAArch64MachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                      AArch64AsmPrinter &AP);
+
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
new file mode 100644
index 000000000000..e17052b4a565
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -0,0 +1,70 @@
+//===- AArch64.td - Describe the AArch64 Target Machine -------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the AArch64 target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// AArch64 Subtarget features.
+//
+
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+  "Enable Advanced SIMD instructions">;
+
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+  "Enable cryptographic instructions">;
+
+//===----------------------------------------------------------------------===//
+// AArch64 Processors
+//
+
+include "AArch64Schedule.td"
+
+def : Processor<"generic", GenericItineraries, [FeatureNEON, FeatureCrypto]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "AArch64RegisterInfo.td"
+
+include "AArch64CallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "AArch64InstrInfo.td"
+
+def AArch64InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+
+def A64InstPrinter : AsmWriter {
+  string AsmWriterClassName = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def AArch64 : Target {
+  let InstructionSet = AArch64InstrInfo;
+  let AssemblyWriters = [A64InstPrinter];
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
new file mode 100644
index 000000000000..47ebb826e0d0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -0,0 +1,347 @@
+//===-- AArch64AsmPrinter.cpp - Print machine code to an AArch64 .s file --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format AArch64 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "AArch64AsmPrinter.h"
+#include "InstPrinter/AArch64InstPrinter.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+
+MachineLocation
+AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+  // See emitFrameIndexDebugValue in InstrInfo for where this instruction is
+  // expected to be created.
+  assert(MI->getNumOperands() == 4 && MI->getOperand(0).isReg()
+         && MI->getOperand(1).isImm() && "unexpected custom DBG_VALUE");
+  return MachineLocation(MI->getOperand(0).getReg(),
+                         MI->getOperand(1).getImm());
+}
+
+/// Try to print a floating-point register as if it belonged to a specified
+/// register-class. For example the inline asm operand modifier "b" requires its
+/// argument to be printed as "bN".
+static bool printModifiedFPRAsmOperand(const MachineOperand &MO,
+                                       const TargetRegisterInfo *TRI,
+                                       const TargetRegisterClass &RegClass,
+                                       raw_ostream &O) {
+  if (!MO.isReg())
+    return true;
+
+  for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
+    if (RegClass.contains(*AR)) {
+      O << AArch64InstPrinter::getRegisterName(*AR);
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Implements the 'w' and 'x' inline asm operand modifiers, which print a GPR
+/// with the obvious type and an immediate 0 as either wzr or xzr.
+static bool printModifiedGPRAsmOperand(const MachineOperand &MO,
+                                       const TargetRegisterInfo *TRI,
+                                       const TargetRegisterClass &RegClass,
+                                       raw_ostream &O) {
+  char Prefix = &RegClass == &AArch64::GPR32RegClass ? 'w' : 'x';
+
+  if (MO.isImm() && MO.getImm() == 0) {
+    O << Prefix << "zr";
+    return false;
+  } else if (MO.isReg()) {
+    if (MO.getReg() == AArch64::XSP || MO.getReg() == AArch64::WSP) {
+      O << (Prefix == 'x' ? "sp" : "wsp");
+      return false;
+    }
+
+    for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
+      if (RegClass.contains(*AR)) {
+        O << AArch64InstPrinter::getRegisterName(*AR);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO,
+                                             bool PrintImmediatePrefix,
+                                             StringRef Suffix, raw_ostream &O) {
+  StringRef Name;
+  StringRef Modifier;
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("Unexpected operand for symbolic address constraint");
+  case MachineOperand::MO_GlobalAddress:
+    Name = Mang->getSymbol(MO.getGlobal())->getName();
+
+    // Global variables may be accessed either via a GOT or in various fun and
+    // interesting TLS-model specific ways. Set the prefix modifier as
+    // appropriate here.
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal())) {
+      Reloc::Model RelocM = TM.getRelocationModel();
+      if (GV->isThreadLocal()) {
+        switch (TM.getTLSModel(GV)) {
+        case TLSModel::GeneralDynamic:
+          Modifier = "tlsdesc";
+          break;
+        case TLSModel::LocalDynamic:
+          Modifier = "dtprel";
+          break;
+        case TLSModel::InitialExec:
+          Modifier = "gottprel";
+          break;
+        case TLSModel::LocalExec:
+          Modifier = "tprel";
+          break;
+        }
+      } else if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
+        Modifier = "got";
+      }
+    }
+    break;
+  case MachineOperand::MO_BlockAddress:
+    Name = GetBlockAddressSymbol(MO.getBlockAddress())->getName();
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    Name = MO.getSymbolName();
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    Name = GetCPISymbol(MO.getIndex())->getName();
+    break;
+  }
+
+  // Some instructions (notably ADRP) don't take the # prefix for
+  // immediates. Only print it if asked to.
+  if (PrintImmediatePrefix)
+    O << '#';
+
+  // Only need the joining "_" if both the prefix and the suffix are
+  // non-null. This little block simply takes care of the four possibly
+  // combinations involved there.
+  if (Modifier == "" && Suffix == "")
+    O << Name;
+  else if (Modifier == "" && Suffix != "")
+    O << ":" << Suffix << ':' << Name;
+  else if (Modifier != "" && Suffix == "")
+    O << ":" << Modifier << ':' << Name;
+  else
+    O << ":" << Modifier << '_' << Suffix << ':' << Name;
+
+  return false;
+}
+
+bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                        unsigned AsmVariant,
+                                        const char *ExtraCode, raw_ostream &O) {
+  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+  if (!ExtraCode || !ExtraCode[0]) {
+    // There's actually no operand modifier, which leads to a slightly eclectic
+    // set of behaviour which we have to handle here.
+    const MachineOperand &MO = MI->getOperand(OpNum);
+    switch (MO.getType()) {
+    default:
+      llvm_unreachable("Unexpected operand for inline assembly");
+    case MachineOperand::MO_Register:
+      // GCC prints the unmodified operand of a 'w' constraint as the vector
+      // register. Technically, we could allocate the argument as a VPR128, but
+      // that leads to extremely dodgy copies being generated to get the data
+      // there.
+      if (printModifiedFPRAsmOperand(MO, TRI, AArch64::VPR128RegClass, O))
+        O << AArch64InstPrinter::getRegisterName(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      O << '#' << MO.getImm();
+      break;
+    case MachineOperand::MO_FPImmediate:
+      assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
+      O << "#0.0";
+      break;
+    case MachineOperand::MO_BlockAddress:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      return printSymbolicAddress(MO, false, "", O);
+    }
+    return false;
+  }
+
+  // We have a real modifier to handle.
+  switch(ExtraCode[0]) {
+  default:
+    // See if this is a generic operand
+    return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+  case 'c': // Don't print "#" before an immediate operand.
+    if (!MI->getOperand(OpNum).isImm())
+      return true;
+    O << MI->getOperand(OpNum).getImm();
+    return false;
+  case 'w':
+    // Output 32-bit general register operand, constant zero as wzr, or stack
+    // pointer as wsp. Ignored when used with other operand types.
+    return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                      AArch64::GPR32RegClass, O);
+  case 'x':
+    // Output 64-bit general register operand, constant zero as xzr, or stack
+    // pointer as sp. Ignored when used with other operand types.
+    return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                      AArch64::GPR64RegClass, O);
+  case 'H':
+    // Output higher numbered of a 64-bit general register pair
+  case 'Q':
+    // Output least significant register of a 64-bit general register pair
+  case 'R':
+    // Output most significant register of a 64-bit general register pair
+
+    // FIXME note: these three operand modifiers will require, to some extent,
+    // adding a paired GPR64 register class. Initial investigation suggests that
+    // assertions are hit unless it has a type and is made legal for that type
+    // in ISelLowering. After that step is made, the number of modifications
+    // needed explodes (operation legality, calling conventions, stores, reg
+    // copies ...).
+    llvm_unreachable("FIXME: Unimplemented register pairs");
+  case 'b':
+    // Output 8-bit FP/SIMD scalar register operand, prefixed with b.
+    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                      AArch64::FPR8RegClass, O);
+  case 'h':
+    // Output 16-bit FP/SIMD scalar register operand, prefixed with h.
+    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                      AArch64::FPR16RegClass, O);
+  case 's':
+    // Output 32-bit FP/SIMD scalar register operand, prefixed with s.
+    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                      AArch64::FPR32RegClass, O);
+  case 'd':
+    // Output 64-bit FP/SIMD scalar register operand, prefixed with d.
+    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                      AArch64::FPR64RegClass, O);
+  case 'q':
+    // Output 128-bit FP/SIMD scalar register operand, prefixed with q.
+    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                      AArch64::FPR128RegClass, O);
+  case 'A':
+    // Output symbolic address with appropriate relocation modifier (also
+    // suitable for ADRP).
+    return printSymbolicAddress(MI->getOperand(OpNum), false, "", O);
+  case 'L':
+    // Output bits 11:0 of symbolic address with appropriate :lo12: relocation
+    // modifier.
+    return printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O);
+  case 'G':
+    // Output bits 23:12 of symbolic address with appropriate :hi12: relocation
+    // modifier (currently only for TLS local exec).
+    return printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O);
+  }
+
+
+}
+
+bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                              unsigned OpNum,
+                                              unsigned AsmVariant,
+                                              const char *ExtraCode,
+                                              raw_ostream &O) {
+  // Currently both the memory constraints (m and Q) behave the same and amount
+  // to the address as a single register. In future, we may allow "m" to provide
+  // both a base and an offset.
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline assembly memory operand");
+  O << '[' << AArch64InstPrinter::getRegisterName(MO.getReg()) << ']';
+  return false;
+}
+
+void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                               raw_ostream &OS) {
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps==4);
+  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+  OS << V.getName();
+  OS << " <- ";
+  // Frame address.  Currently handles register +- offset only.
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+  OS << '[' << AArch64InstPrinter::getRegisterName(MI->getOperand(0).getReg());
+  OS << '+' << MI->getOperand(1).getImm();
+  OS << ']';
+  OS << "+" << MI->getOperand(NOps - 2).getImm();
+}
+
+
+#include "AArch64GenMCPseudoLowering.inc"
+
+void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  // Do any auto-generated pseudo lowerings.
+  if (emitPseudoExpansionLowering(OutStreamer, MI))
+    return;
+
+  switch (MI->getOpcode()) {
+  case AArch64::DBG_VALUE: {
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      SmallString<128> TmpStr;
+      raw_svector_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer.EmitRawText(StringRef(OS.str()));
+    }
+    return;
+  }
+  }
+
+  MCInst TmpInst;
+  LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this);
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
+void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetELF()) {
+    const TargetLoweringObjectFileELF &TLOFELF =
+      static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    // Output stubs for external and common global variables.
+    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+      const DataLayout *TD = TM.getDataLayout();
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        OutStreamer.EmitLabel(Stubs[i].first);
+        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+                                    TD->getPointerSize(0), 0);
+      }
+      Stubs.clear();
+    }
+  }
+}
+
+bool AArch64AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  return AsmPrinter::runOnMachineFunction(MF);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeAArch64AsmPrinter() {
+    RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64Target);
+}
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.h b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.h
new file mode 100644
index 000000000000..af0c9fed066f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.h
@@ -0,0 +1,80 @@
+// AArch64AsmPrinter.h - Print machine code to an AArch64 .s file -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64 assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64ASMPRINTER_H
+#define LLVM_AARCH64ASMPRINTER_H
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter {
+
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when printing asm code for different targets.
+  const AArch64Subtarget *Subtarget;
+
+  // emitPseudoExpansionLowering - tblgen'erated.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  public:
+  explicit AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer) {
+    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+  }
+
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+
+  MCOperand lowerSymbolOperand(const MachineOperand &MO,
+                               const MCSymbol *Sym) const;
+
+  void EmitInstruction(const MachineInstr *MI);
+  void EmitEndOfAsmFile(Module &M);
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O);
+
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+  /// printSymbolicAddress - Given some kind of reasonably bare symbolic
+  /// reference, print out the appropriate asm string to represent it. If
+  /// appropriate, a relocation-specifier will be produced, composed of a
+  /// general class derived from the MO parameter and an instruction-specific
+  /// suffix, provided in Suffix. E.g. ":got_lo12:" if a Suffix of "lo12" is
+  /// given.
+  bool printSymbolicAddress(const MachineOperand &MO,
+                            bool PrintImmediatePrefix,
+                            StringRef Suffix, raw_ostream &O);
+
+  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+
+  virtual const char *getPassName() const {
+    return "AArch64 Assembly Printer";
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64BranchFixupPass.cpp
new file mode 100644
index 000000000000..71233ba5c3dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64BranchFixupPass.cpp
@@ -0,0 +1,600 @@
+//===-- AArch64BranchFixupPass.cpp - AArch64 branch fixup -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that fixes AArch64 branches which have ended up out
+// of range for their immediate operands.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64-branch-fixup"
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+
+/// Return the worst case padding that could result from unknown offset bits.
+/// This does not include alignment padding caused by known offset bits.
+///
+/// @param LogAlign log2(alignment)
+/// @param KnownBits Number of known low offset bits.
+static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
+  if (KnownBits < LogAlign)
+    return (1u << LogAlign) - (1u << KnownBits);
+  return 0;
+}
+
+namespace {
+  /// Due to limited PC-relative displacements, conditional branches to distant
+  /// blocks may need converting into an unconditional equivalent. For example:
+  ///     tbz w1, #0, far_away
+  /// becomes
+  ///     tbnz w1, #0, skip
+  ///     b far_away
+  ///   skip:
+  class AArch64BranchFixup : public MachineFunctionPass {
+    /// Information about the offset and size of a single basic block.
+    struct BasicBlockInfo {
+      /// Distance from the beginning of the function to the beginning of this
+      /// basic block.
+      ///
+      /// Offsets are computed assuming worst case padding before an aligned
+      /// block. This means that subtracting basic block offsets always gives a
+      /// conservative estimate of the real distance which may be smaller.
+      ///
+      /// Because worst case padding is used, the computed offset of an aligned
+      /// block may not actually be aligned.
+      unsigned Offset;
+
+      /// Size of the basic block in bytes.  If the block contains inline
+      /// assembly, this is a worst case estimate.
+      ///
+      /// The size does not include any alignment padding whether from the
+      /// beginning of the block, or from an aligned jump table at the end.
+      unsigned Size;
+
+      /// The number of low bits in Offset that are known to be exact.  The
+      /// remaining bits of Offset are an upper bound.
+      uint8_t KnownBits;
+
+      /// When non-zero, the block contains instructions (inline asm) of unknown
+      /// size.  The real size may be smaller than Size bytes by a multiple of 1
+      /// << Unalign.
+      uint8_t Unalign;
+
+      BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0) {}
+
+      /// Compute the number of known offset bits internally to this block.
+      /// This number should be used to predict worst case padding when
+      /// splitting the block.
+      unsigned internalKnownBits() const {
+        unsigned Bits = Unalign ? Unalign : KnownBits;
+        // If the block size isn't a multiple of the known bits, assume the
+        // worst case padding.
+        if (Size & ((1u << Bits) - 1))
+          Bits = CountTrailingZeros_32(Size);
+        return Bits;
+      }
+
+      /// Compute the offset immediately following this block.  If LogAlign is
+      /// specified, return the offset the successor block will get if it has
+      /// this alignment.
+      unsigned postOffset(unsigned LogAlign = 0) const {
+        unsigned PO = Offset + Size;
+        if (!LogAlign)
+          return PO;
+        // Add alignment padding from the terminator.
+        return PO + UnknownPadding(LogAlign, internalKnownBits());
+      }
+
+      /// Compute the number of known low bits of postOffset.  If this block
+      /// contains inline asm, the number of known bits drops to the
+      /// instruction alignment.  An aligned terminator may increase the number
+      /// of know bits.
+      /// If LogAlign is given, also consider the alignment of the next block.
+      unsigned postKnownBits(unsigned LogAlign = 0) const {
+        return std::max(LogAlign, internalKnownBits());
+      }
+    };
+
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// One per immediate branch, keeping the machine instruction pointer,
+    /// conditional or unconditional, the max displacement, and (if IsCond is
+    /// true) the corresponding inverted branch opcode.
+    struct ImmBranch {
+      MachineInstr *MI;
+      unsigned OffsetBits : 31;
+      bool IsCond : 1;
+      ImmBranch(MachineInstr *mi, unsigned offsetbits, bool cond)
+        : MI(mi), OffsetBits(offsetbits), IsCond(cond) {}
+    };
+
+    /// Keep track of all the immediate branch instructions.
+    ///
+    std::vector<ImmBranch> ImmBranches;
+
+    MachineFunction *MF;
+    const AArch64InstrInfo *TII;
+  public:
+    static char ID;
+    AArch64BranchFixup() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "AArch64 branch fixup pass";
+    }
+
+  private:
+    void initializeFunctionInfo();
+    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB,
+                     unsigned OffsetBits);
+    bool fixupImmediateBr(ImmBranch &Br);
+    bool fixupConditionalBr(ImmBranch &Br);
+
+    void computeBlockSize(MachineBasicBlock *MBB);
+    unsigned getOffsetOf(MachineInstr *MI) const;
+    void dumpBBs();
+    void verify();
+  };
+  char AArch64BranchFixup::ID = 0;
+}
+
+/// check BBOffsets
+void AArch64BranchFixup::verify() {
+#ifndef NDEBUG
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock *MBB = MBBI;
+    unsigned MBBId = MBB->getNumber();
+    assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset);
+  }
+#endif
+}
+
+/// print block size and offset information - debugging
+void AArch64BranchFixup::dumpBBs() {
+  DEBUG({
+    for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
+      const BasicBlockInfo &BBI = BBInfo[J];
+      dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+             << " kb=" << unsigned(BBI.KnownBits)
+             << " ua=" << unsigned(BBI.Unalign)
+             << format(" size=%#x\n", BBInfo[J].Size);
+    }
+  });
+}
+
+/// Returns an instance of the branch fixup pass.
+FunctionPass *llvm::createAArch64BranchFixupPass() {
+  return new AArch64BranchFixup();
+}
+
+bool AArch64BranchFixup::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  DEBUG(dbgs() << "***** AArch64BranchFixup ******");
+  TII = (const AArch64InstrInfo*)MF->getTarget().getInstrInfo();
+
+  // This pass invalidates liveness information when it splits basic blocks.
+  MF->getRegInfo().invalidateLiveness();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block and location of each immediate branch.
+  initializeFunctionInfo();
+
+  // Iteratively fix up branches until there is no change.
+  unsigned NoBRIters = 0;
+  bool MadeChange = false;
+  while (true) {
+    DEBUG(dbgs() << "Beginning iteration #" << NoBRIters << '\n');
+    bool BRChange = false;
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= fixupImmediateBr(ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      report_fatal_error("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    if (!BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  // After a while, this might be made debug-only, but it is not expensive.
+  verify();
+
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BBInfo.clear();
+  ImmBranches.clear();
+
+  return MadeChange;
+}
+
+/// Return true if the specified basic block can fallthrough into the block
+/// immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  // Can't fall off end of function.
+  if (llvm::next(MBBI) == MBB->getParent()->end())
+    return false;
+
+  MachineBasicBlock *NextBB = llvm::next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+
+  return false;
+}
+
+/// Do the initial scan of the function, building up information about the sizes
+/// of each block, and each immediate branch.
+void AArch64BranchFixup::initializeFunctionInfo() {
+  BBInfo.clear();
+  BBInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+    computeBlockSize(I);
+
+  // The known bits of the entry block offset are determined by the function
+  // alignment.
+  BBInfo.front().KnownBits = MF->getAlignment();
+
+  // Compute block offsets and known bits.
+  adjustBBOffsetsAfter(MF->begin());
+
+  // Now go back through the instructions and build up our data structures.
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      if (I->isDebugValue())
+        continue;
+
+      int Opc = I->getOpcode();
+      if (I->isBranch()) {
+        bool IsCond = false;
+
+        // The offsets encoded in instructions here scale by the instruction
+        // size (4 bytes), effectively increasing their range by 2 bits.
+        unsigned Bits = 0;
+        switch (Opc) {
+        default:
+          continue;  // Ignore other JT branches
+        case AArch64::TBZxii:
+        case AArch64::TBZwii:
+        case AArch64::TBNZxii:
+        case AArch64::TBNZwii:
+          IsCond = true;
+          Bits = 14 + 2;
+          break;
+        case AArch64::Bcc:
+        case AArch64::CBZx:
+        case AArch64::CBZw:
+        case AArch64::CBNZx:
+        case AArch64::CBNZw:
+          IsCond = true;
+          Bits = 19 + 2;
+          break;
+        case AArch64::Bimm:
+          Bits = 26 + 2;
+          break;
+        }
+
+        // Record this immediate branch.
+        ImmBranches.push_back(ImmBranch(I, Bits, IsCond));
+      }
+    }
+  }
+}
+
+/// Compute the size and some alignment information for MBB.  This function
+/// updates BBInfo directly.
+void AArch64BranchFixup::computeBlockSize(MachineBasicBlock *MBB) {
+  BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+  BBI.Size = 0;
+  BBI.Unalign = 0;
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+       ++I) {
+    BBI.Size += TII->getInstSizeInBytes(*I);
+    // For inline asm, GetInstSizeInBytes returns a conservative estimate.
+    // The actual size may be smaller, but still a multiple of the instr size.
+    if (I->isInlineAsm())
+      BBI.Unalign = 2;
+  }
+}
+
+/// Return the current offset of the specified machine instruction from the
+/// start of the function.  This offset changes as stuff is moved around inside
+/// the function.
+unsigned AArch64BranchFixup::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->getInstSizeInBytes(*I);
+  }
+  return Offset;
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *
+AArch64BranchFixup::splitBlockBeforeInstr(MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::Bimm)).addMBB(NewBB);
+  ++NumSplit;
+
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(OrigBB);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+void AArch64BranchFixup::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  unsigned BBNum = BB->getNumber();
+  for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment();
+    unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
+    unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
+
+    // This is where block i begins.  Stop if the offset is already correct,
+    // and we have updated 2 blocks.  This is the maximum number of blocks
+    // changed before calling this function.
+    if (i > BBNum + 2 &&
+        BBInfo[i].Offset == Offset &&
+        BBInfo[i].KnownBits == KnownBits)
+      break;
+
+    BBInfo[i].Offset = Offset;
+    BBInfo[i].KnownBits = KnownBits;
+  }
+}
+
+/// Returns true if the distance between specific MI and specific BB can fit in
+/// MI's displacement field.
+bool AArch64BranchFixup::isBBInRange(MachineInstr *MI,
+                                     MachineBasicBlock *DestBB,
+                                     unsigned OffsetBits) {
+  int64_t BrOffset   = getOffsetOf(MI);
+  int64_t DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " bits available=" << OffsetBits
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  return isIntN(OffsetBits, DestOffset - BrOffset);
+}
+
+/// Fix up an immediate branch whose destination is too far away to fit in its
+/// displacement field.
+bool AArch64BranchFixup::fixupImmediateBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = 0;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    if (MI->getOperand(i).isMBB()) {
+      DestBB = MI->getOperand(i).getMBB();
+      break;
+    }
+  }
+  assert(DestBB && "Branch with no destination BB?");
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, Br.OffsetBits))
+    return false;
+
+  assert(Br.IsCond && "Only conditional branches should need fixup");
+  return fixupConditionalBr(Br);
+}
+
+/// Fix up a conditional branch whose destination is too far away to fit in its
+/// displacement field. It is converted to an inverse conditional branch + an
+/// unconditional branch to the destination.
+bool
+AArch64BranchFixup::fixupConditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  unsigned CondBrMBBOperand = 0;
+
+  // The general idea is to add an unconditional branch to the destination and
+  // invert the conditional branch to jump over it. Complications occur around
+  // fallthrough and unreachable ends to the block.
+  //   b.lt L1
+  //   =>
+  //   b.ge L2
+  //   b   L1
+  // L2:
+
+  // First we invert the conditional branch, by creating a replacement if
+  // necessary. This if statement contains all the special handling of different
+  // branch types.
+  if (MI->getOpcode() == AArch64::Bcc) {
+    // The basic block is operand number 1 for Bcc
+    CondBrMBBOperand = 1;
+
+    A64CC::CondCodes CC = (A64CC::CondCodes)MI->getOperand(0).getImm();
+    CC = A64InvertCondCode(CC);
+    MI->getOperand(0).setImm(CC);
+  } else {
+    MachineInstrBuilder InvertedMI;
+    int InvertedOpcode;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Unknown branch type");
+    case AArch64::TBZxii: InvertedOpcode = AArch64::TBNZxii; break;
+    case AArch64::TBZwii: InvertedOpcode = AArch64::TBNZwii; break;
+    case AArch64::TBNZxii: InvertedOpcode = AArch64::TBZxii; break;
+    case AArch64::TBNZwii: InvertedOpcode = AArch64::TBZwii; break;
+    case AArch64::CBZx: InvertedOpcode = AArch64::CBNZx; break;
+    case AArch64::CBZw: InvertedOpcode = AArch64::CBNZw; break;
+    case AArch64::CBNZx: InvertedOpcode = AArch64::CBZx; break;
+    case AArch64::CBNZw: InvertedOpcode = AArch64::CBZw; break;
+    }
+
+    InvertedMI = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(InvertedOpcode));
+    for (unsigned i = 0, e= MI->getNumOperands(); i != e; ++i) {
+      InvertedMI.addOperand(MI->getOperand(i));
+      if (MI->getOperand(i).isMBB())
+        CondBrMBBOperand = i;
+    }
+
+    MI->eraseFromParent();
+    MI = Br.MI = InvertedMI;
+  }
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through
+  // block. Otherwise, split the MBB before the next instruction.
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  ++NumCBrFixed;
+  if (BMI != MI) {
+    if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+        BMI->getOpcode() == AArch64::Bimm) {
+      // Last MI in the BB is an unconditional branch. We can swap destinations:
+      // b.eq L1 (temporarily b.ne L1 after first change)
+      // b   L2
+      // =>
+      // b.ne L2
+      // b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+      if (isBBInRange(MI, NewDest, Br.OffsetBits)) {
+        DEBUG(dbgs() << "  Invert Bcc condition and swap its destination with "
+                     << *BMI);
+        MachineBasicBlock *DestBB = MI->getOperand(CondBrMBBOperand).getMBB();
+        BMI->getOperand(0).setMBB(DestBB);
+        MI->getOperand(CondBrMBBOperand).setMBB(NewDest);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    MachineBasicBlock::iterator MBBI = MI; ++MBBI;
+    splitBlockBeforeInstr(MBBI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->getInstSizeInBytes(MBB->back());
+    BBInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+  }
+
+  // After splitting and removing the unconditional branch from the original BB,
+  // the structure is now:
+  // oldbb:
+  //   [things]
+  //   b.invertedCC L1
+  // splitbb/fallthroughbb:
+  //   [old b L2/real continuation]
+  //
+  // We now have to change the conditional branch to point to splitbb and add an
+  // unconditional branch after it to L1, giving the final structure:
+  // oldbb:
+  //   [things]
+  //   b.invertedCC splitbb
+  //   b L1
+  // splitbb/fallthroughbb:
+  //   [old b L2/real continuation]
+  MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+
+  DEBUG(dbgs() << "  Insert B to BB#"
+               << MI->getOperand(CondBrMBBOperand).getMBB()->getNumber()
+               << " also invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new unconditional branch and fixup the destination of the
+  // conditional one.  Also update the ImmBranch as well as adding a new entry
+  // for the new branch.
+  BuildMI(MBB, DebugLoc(), TII->get(AArch64::Bimm))
+    .addMBB(MI->getOperand(CondBrMBBOperand).getMBB());
+  MI->getOperand(CondBrMBBOperand).setMBB(NextBB);
+
+  BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+
+  // 26 bits written down in Bimm, specifying a multiple of 4.
+  unsigned OffsetBits = 26 + 2;
+  ImmBranches.push_back(ImmBranch(&MBB->back(), OffsetBits, false));
+
+  adjustBBOffsetsAfter(MBB);
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConv.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConv.td
new file mode 100644
index 000000000000..b880d8373deb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConv.td
@@ -0,0 +1,196 @@
+//==-- AArch64CallingConv.td - Calling Conventions for ARM ----*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for AArch64 architecture.
+//===----------------------------------------------------------------------===//
+
+
+// The AArch64 Procedure Call Standard is unfortunately specified at a slightly
+// higher level of abstraction than LLVM's target interface presents. In
+// particular, it refers (like other ABIs, in fact) directly to
+// structs. However, generic LLVM code takes the liberty of lowering structure
+// arguments to the component fields before we see them.
+//
+// As a result, the obvious direct map from LLVM IR to PCS concepts can't be
+// implemented, so the goals of this calling convention are, in decreasing
+// priority order:
+//     1. Expose *some* way to express the concepts required to implement the
+//        generic PCS from a front-end.
+//     2. Provide a sane ABI for pure LLVM.
+//     3. Follow the generic PCS as closely as is naturally possible.
+//
+// The suggested front-end implementation of PCS features is:
+//     * Integer, float and vector arguments of all sizes which end up in
+//       registers are passed and returned via the natural LLVM type.
+//     * Structure arguments with size <= 16 bytes are passed and returned in
+//       registers as similar integer or composite types. For example:
+//       [1 x i64], [2 x i64] or [1 x i128] (if alignment 16 needed).
+//     * HFAs in registers follow rules similar to small structs: appropriate
+//       composite types.
+//     * Structure arguments with size > 16 bytes are passed via a pointer,
+//       handled completely by the front-end.
+//     * Structure return values > 16 bytes via an sret pointer argument.
+//     * Other stack-based arguments (not large structs) are passed using byval
+//       pointers. Padding arguments are added beforehand to guarantee a large
+//       struct doesn't later use integer registers.
+//
+// N.b. this means that it is the front-end's responsibility (if it cares about
+// PCS compliance) to check whether enough registers are available for an
+// argument when deciding how to pass it.
+
+class CCIfAlign<int Align, CCAction A>:
+  CCIf<"ArgFlags.getOrigAlign() == " # Align, A>;
+
+def CC_A64_APCS : CallingConv<[
+  // SRet is an LLVM-specific concept, so it takes precedence over general ABI
+  // concerns. However, this rule will be used by C/C++ frontends to implement
+  // structure return.
+  CCIfSRet<CCAssignToReg<[X8]>>,
+
+  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+  // slot is 64-bit.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Canonicalise the various types that live in different floating-point
+  // registers. This makes sense because the PCS does not distinguish Short
+  // Vectors and Floating-point types.
+  CCIfType<[v2i8], CCBitConvertToType<f16>>,
+  CCIfType<[v4i8, v2i16], CCBitConvertToType<f32>>,
+  CCIfType<[v8i8, v4i16, v2i32, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCBitConvertToType<f128>>,
+
+  // PCS: "C.1: If the argument is a Half-, Single-, Double- or Quad- precision
+  // Floating-point or Short Vector Type and the NSRN is less than 8, then the
+  // argument is allocated to the least significant bits of register
+  // v[NSRN]. The NSRN is incremented by one. The argument has now been
+  // allocated."
+  CCIfType<[f16],  CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>,
+  CCIfType<[f32],  CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
+  CCIfType<[f64],  CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+
+  // PCS: "C.2: If the argument is an HFA and there are sufficient unallocated
+  // SIMD and Floating-point registers (NSRN - number of elements < 8), then the
+  // argument is allocated to SIMD and Floating-point registers (with one
+  // register per element of the HFA). The NSRN is incremented by the number of
+  // registers used. The argument has now been allocated."
+  //
+  // N.b. As above, this rule is the responsibility of the front-end.
+
+  // "C.3: If the argument is an HFA then the NSRN is set to 8 and the size of
+  // the argument is rounded up to the nearest multiple of 8 bytes."
+  //
+  // "C.4: If the argument is an HFA, a Quad-precision Floating-point or Short
+  // Vector Type then the NSAA is rounded up to the larger of 8 or the Natural
+  // Alignment of the Argument's type."
+  //
+  // It is expected that these will be satisfied by adding dummy arguments to
+  // the prototype.
+
+  // PCS: "C.5: If the argument is a Half- or Single- precision Floating-point
+  // type then the size of the argument is set to 8 bytes. The effect is as if
+  // the argument had been copied to the least significant bits of a 64-bit
+  // register and the remaining bits filled with unspecified values."
+  CCIfType<[f16, f32], CCPromoteToType<f64>>,
+
+  // PCS: "C.6: If the argument is an HFA, a Half-, Single-, Double- or Quad-
+  // precision Floating-point or Short Vector Type, then the argument is copied
+  // to memory at the adjusted NSAA. The NSAA is incremented by the size of the
+  // argument. The argument has now been allocated."
+  CCIfType<[f64], CCAssignToStack<8, 8>>,
+  CCIfType<[f128], CCAssignToStack<16, 16>>,
+
+  // PCS: "C.7: If the argument is an Integral Type, the size of the argument is
+  // less than or equal to 8 bytes and the NGRN is less than 8, the argument is
+  // copied to the least significant bits of x[NGRN]. The NGRN is incremented by
+  // one. The argument has now been allocated."
+
+  // First we implement C.8 and C.9 (128-bit types get even registers). i128 is
+  // represented as two i64s, the first one being split. If we delayed this
+  // operation C.8 would never be reached.
+  CCIfType<[i64],
+        CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6], [X0, X1, X3, X5]>>>,
+
+  // Note: the promotion also implements C.14.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // And now the real implementation of C.7
+  CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>,
+
+  // PCS: "C.8: If the argument has an alignment of 16 then the NGRN is rounded
+  // up to the next even number."
+  //
+  // "C.9: If the argument is an Integral Type, the size of the argument is
+  // equal to 16 and the NGRN is less than 7, the argument is copied to x[NGRN]
+  // and x[NGRN+1], x[NGRN] shall contain the lower addressed double-word of the
+  // memory representation of the argument. The NGRN is incremented by two. The
+  // argument has now been allocated."
+  //
+  // Subtlety here: what if alignment is 16 but it is not an integral type? All
+  // floating-point types have been allocated already, which leaves composite
+  // types: this is why a front-end may need to produce i128 for a struct <= 16
+  // bytes.
+
+  // PCS: "C.10 If the argument is a Composite Type and the size in double-words
+  // of the argument is not more than 8 minus NGRN, then the argument is copied
+  // into consecutive general-purpose registers, starting at x[NGRN]. The
+  // argument is passed as though it had been loaded into the registers from a
+  // double-word aligned address with an appropriate sequence of LDR
+  // instructions loading consecutive registers from memory (the contents of any
+  // unused parts of the registers are unspecified by this standard). The NGRN
+  // is incremented by the number of registers used. The argument has now been
+  // allocated."
+  //
+  // Another one that's the responsibility of the front-end (sigh).
+
+  // PCS: "C.11: The NGRN is set to 8."
+  CCCustom<"CC_AArch64NoMoreRegs">,
+
+  // PCS: "C.12: The NSAA is rounded up to the larger of 8 or the Natural
+  // Alignment of the argument's type."
+  //
+  // PCS: "C.13: If the argument is a composite type then the argument is copied
+  // to memory at the adjusted NSAA. The NSAA is by the size of the
+  // argument. The argument has now been allocated."
+  //
+  // Note that the effect of this corresponds to a memcpy rather than register
+  // stores so that the struct ends up correctly addressable at the adjusted
+  // NSAA.
+
+  // PCS: "C.14: If the size of the argument is less than 8 bytes then the size
+  // of the argument is set to 8 bytes. The effect is as if the argument was
+  // copied to the least significant bits of a 64-bit register and the remaining
+  // bits filled with unspecified values."
+  //
+  // Integer types were widened above. Floating-point and composite types have
+  // already been allocated completely. Nothing to do.
+
+  // PCS: "C.15: The argument is copied to memory at the adjusted NSAA. The NSAA
+  // is incremented by the size of the argument. The argument has now been
+  // allocated."
+  CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+  CCIfType<[i64], CCAssignToStack<8, 8>>
+
+]>;
+
+// According to the PCS, X19-X30 are callee-saved, however only the low 64-bits
+// of vector registers (8-15) are callee-saved. The order here is is picked up
+// by PrologEpilogInserter.cpp to allocate stack slots, starting from top of
+// stack upon entry. This gives the customary layout of x30 at [sp-8], x29 at
+// [sp-16], ...
+def CSR_PCS : CalleeSavedRegs<(add (sequence "X%u", 30, 19),
+                                   (sequence "D%u", 15, 8))>;
+
+
+// TLS descriptor calls are extremely restricted in their changes, to allow
+// optimisations in the (hopefully) more common fast path where no real action
+// is needed. They actually have to preserve all registers, except for the
+// unavoidable X30 and the return register X0.
+def TLSDesc : CalleeSavedRegs<(add (sequence "X%u", 29, 1),
+                                   (sequence "Q%u", 31, 0))>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
new file mode 100644
index 000000000000..dc41f2f60525
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -0,0 +1,633 @@
+//===- AArch64FrameLowering.cpp - AArch64 Frame Information ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+void AArch64FrameLowering::splitSPAdjustments(uint64_t Total,
+                                              uint64_t &Initial,
+                                              uint64_t &Residual) const {
+  // 0x1f0 here is a pessimistic (i.e. realistic) boundary: x-register LDP
+  // instructions have a 7-bit signed immediate scaled by 8, giving a reach of
+  // 0x1f8, but stack adjustment should always be a multiple of 16.
+  if (Total <= 0x1f0) {
+    Initial = Total;
+    Residual = 0;
+  } else {
+    Initial = 0x1f0;
+    Residual = Total - Initial;
+  }
+}
+
+void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
+  AArch64MachineFunctionInfo *FuncInfo =
+    MF.getInfo<AArch64MachineFunctionInfo>();
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  bool NeedsFrameMoves = MMI.hasDebugInfo()
+    || MF.getFunction()->needsUnwindTableEntry();
+
+  uint64_t NumInitialBytes, NumResidualBytes;
+
+  // Currently we expect the stack to be laid out by
+  //     sub sp, sp, #initial
+  //     stp x29, x30, [sp, #offset]
+  //     ...
+  //     str xxx, [sp, #offset]
+  //     sub sp, sp, #rest (possibly via extra instructions).
+  if (MFI->getCalleeSavedInfo().size()) {
+    // If there are callee-saved registers, we want to store them efficiently as
+    // a block, and virtual base assignment happens too early to do it for us so
+    // we adjust the stack in two phases: first just for callee-saved fiddling,
+    // then to allocate the rest of the frame.
+    splitSPAdjustments(MFI->getStackSize(), NumInitialBytes, NumResidualBytes);
+  } else {
+    // If there aren't any callee-saved registers, two-phase adjustment is
+    // inefficient. It's more efficient to adjust with NumInitialBytes too
+    // because when we're in a "callee pops argument space" situation, that pop
+    // must be tacked onto Initial for correctness.
+    NumInitialBytes = MFI->getStackSize();
+    NumResidualBytes = 0;
+  }
+
+  // Tell everyone else how much adjustment we're expecting them to use. In
+  // particular if an adjustment is required for a tail call the epilogue could
+  // have a different view of things.
+  FuncInfo->setInitialStackAdjust(NumInitialBytes);
+
+  emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumInitialBytes,
+               MachineInstr::FrameSetup);
+
+  if (NeedsFrameMoves && NumInitialBytes) {
+    // We emit this update even if the CFA is set from a frame pointer later so
+    // that the CFA is valid in the interim.
+    MCSymbol *SPLabel = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
+      .addSym(SPLabel);
+
+    MachineLocation Dst(MachineLocation::VirtualFP);
+    MachineLocation Src(AArch64::XSP, NumInitialBytes);
+    Moves.push_back(MachineMove(SPLabel, Dst, Src));
+  }
+
+  // Otherwise we need to set the frame pointer and/or add a second stack
+  // adjustment.
+
+  bool FPNeedsSetting = hasFP(MF);
+  for (; MBBI != MBB.end(); ++MBBI) {
+    // Note that this search makes strong assumptions about the operation used
+    // to store the frame-pointer: it must be "STP x29, x30, ...". This could
+    // change in future, but until then there's no point in implementing
+    // untestable more generic cases.
+    if (FPNeedsSetting && MBBI->getOpcode() == AArch64::LSPair64_STR
+                       && MBBI->getOperand(0).getReg() == AArch64::X29) {
+      int64_t X29FrameIdx = MBBI->getOperand(2).getIndex();
+      FuncInfo->setFramePointerOffset(MFI->getObjectOffset(X29FrameIdx));
+
+      ++MBBI;
+      emitRegUpdate(MBB, MBBI, DL, TII, AArch64::X29, AArch64::XSP,
+                    AArch64::X29,
+                    NumInitialBytes + MFI->getObjectOffset(X29FrameIdx),
+                    MachineInstr::FrameSetup);
+
+      // The offset adjustment used when emitting debugging locations relative
+      // to whatever frame base is set. AArch64 uses the default frame base (FP
+      // or SP) and this adjusts the calculations to be correct.
+      MFI->setOffsetAdjustment(- MFI->getObjectOffset(X29FrameIdx)
+                               - MFI->getStackSize());
+
+      if (NeedsFrameMoves) {
+        MCSymbol *FPLabel = MMI.getContext().CreateTempSymbol();
+        BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
+          .addSym(FPLabel);
+        MachineLocation Dst(MachineLocation::VirtualFP);
+        MachineLocation Src(AArch64::X29, -MFI->getObjectOffset(X29FrameIdx));
+        Moves.push_back(MachineMove(FPLabel, Dst, Src));
+      }
+
+      FPNeedsSetting = false;
+    }
+
+    if (!MBBI->getFlag(MachineInstr::FrameSetup))
+      break;
+  }
+
+  assert(!FPNeedsSetting && "Frame pointer couldn't be set");
+
+  emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumResidualBytes,
+               MachineInstr::FrameSetup);
+
+  // Now we emit the rest of the frame setup information, if necessary: we've
+  // already noted the FP and initial SP moves so we're left with the prologue's
+  // final SP update and callee-saved register locations.
+  if (!NeedsFrameMoves)
+    return;
+
+  // Reuse the label if appropriate, so create it in this outer scope.
+  MCSymbol *CSLabel = 0;
+
+  // The rest of the stack adjustment
+  if (!hasFP(MF) && NumResidualBytes) {
+    CSLabel = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
+      .addSym(CSLabel);
+
+    MachineLocation Dst(MachineLocation::VirtualFP);
+    MachineLocation Src(AArch64::XSP, NumResidualBytes + NumInitialBytes);
+    Moves.push_back(MachineMove(CSLabel, Dst, Src));
+  }
+
+  // And any callee-saved registers (it's fine to leave them to the end here,
+  // because the old values are still valid at this point.
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  if (CSI.size()) {
+    if (!CSLabel) {
+      CSLabel = MMI.getContext().CreateTempSymbol();
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
+        .addSym(CSLabel);
+    }
+
+    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+           E = CSI.end(); I != E; ++I) {
+      MachineLocation Dst(MachineLocation::VirtualFP,
+                          MFI->getObjectOffset(I->getFrameIdx()));
+      MachineLocation Src(I->getReg());
+      Moves.push_back(MachineMove(CSLabel, Dst, Src));
+    }
+  }
+}
+
+void
+AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  AArch64MachineFunctionInfo *FuncInfo =
+    MF.getInfo<AArch64MachineFunctionInfo>();
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  DebugLoc DL = MBBI->getDebugLoc();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned RetOpcode = MBBI->getOpcode();
+
+  // Initial and residual are named for consitency with the prologue. Note that
+  // in the epilogue, the residual adjustment is executed first.
+  uint64_t NumInitialBytes = FuncInfo->getInitialStackAdjust();
+  uint64_t NumResidualBytes = MFI.getStackSize() - NumInitialBytes;
+  uint64_t ArgumentPopSize = 0;
+  if (RetOpcode == AArch64::TC_RETURNdi ||
+      RetOpcode == AArch64::TC_RETURNxi) {
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+    MachineInstrBuilder MIB;
+    if (RetOpcode == AArch64::TC_RETURNdi) {
+      MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_Bimm));
+      if (JumpTarget.isGlobal()) {
+        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+                             JumpTarget.getTargetFlags());
+      } else {
+        assert(JumpTarget.isSymbol() && "unexpected tail call destination");
+        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+                              JumpTarget.getTargetFlags());
+      }
+    } else {
+      assert(RetOpcode == AArch64::TC_RETURNxi && JumpTarget.isReg()
+             && "Unexpected tail call");
+
+      MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_BRx));
+      MIB.addReg(JumpTarget.getReg(), RegState::Kill);
+    }
+
+    // Add the extra operands onto the new tail call instruction even though
+    // they're not used directly (so that liveness is tracked properly etc).
+    for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
+        MIB->addOperand(MBBI->getOperand(i));
+
+
+    // Delete the pseudo instruction TC_RETURN.
+    MachineInstr *NewMI = prior(MBBI);
+    MBB.erase(MBBI);
+    MBBI = NewMI;
+
+    // For a tail-call in a callee-pops-arguments environment, some or all of
+    // the stack may actually be in use for the call's arguments, this is
+    // calculated during LowerCall and consumed here...
+    ArgumentPopSize = StackAdjust.getImm();
+  } else {
+    // ... otherwise the amount to pop is *all* of the argument space,
+    // conveniently stored in the MachineFunctionInfo by
+    // LowerFormalArguments. This will, of course, be zero for the C calling
+    // convention.
+    ArgumentPopSize = FuncInfo->getArgumentStackToRestore();
+  }
+
+  assert(NumInitialBytes % 16 == 0 && NumResidualBytes % 16 == 0
+         && "refusing to adjust stack by misaligned amt");
+
+  // We may need to address callee-saved registers differently, so find out the
+  // bound on the frame indices.
+  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
+
+  // The "residual" stack update comes first from this direction and guarantees
+  // that SP is NumInitialBytes below its value on function entry, either by a
+  // direct update or restoring it from the frame pointer.
+  if (NumInitialBytes + ArgumentPopSize != 0) {
+    emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16,
+                 NumInitialBytes + ArgumentPopSize);
+    --MBBI;
+  }
+
+
+  // MBBI now points to the instruction just past the last callee-saved
+  // restoration (either RET/B if NumInitialBytes == 0, or the "ADD sp, sp"
+  // otherwise).
+
+  // Now we need to find out where to put the bulk of the stack adjustment
+  MachineBasicBlock::iterator FirstEpilogue = MBBI;
+  while (MBBI != MBB.begin()) {
+    --MBBI;
+
+    unsigned FrameOp;
+    for (FrameOp = 0; FrameOp < MBBI->getNumOperands(); ++FrameOp) {
+      if (MBBI->getOperand(FrameOp).isFI())
+        break;
+    }
+
+    // If this instruction doesn't have a frame index we've reached the end of
+    // the callee-save restoration.
+    if (FrameOp == MBBI->getNumOperands())
+      break;
+
+    // Likewise if it *is* a local reference, but not to a callee-saved object.
+    int FrameIdx = MBBI->getOperand(FrameOp).getIndex();
+    if (FrameIdx < MinCSFI || FrameIdx > MaxCSFI)
+      break;
+
+    FirstEpilogue = MBBI;
+  }
+
+  if (MF.getFrameInfo()->hasVarSizedObjects()) {
+    int64_t StaticFrameBase;
+    StaticFrameBase = -(NumInitialBytes + FuncInfo->getFramePointerOffset());
+    emitRegUpdate(MBB, FirstEpilogue, DL, TII,
+                  AArch64::XSP, AArch64::X29, AArch64::NoRegister,
+                  StaticFrameBase);
+  } else {
+    emitSPUpdate(MBB, FirstEpilogue, DL,TII, AArch64::X16, NumResidualBytes);
+  }
+}
+
+int64_t
+AArch64FrameLowering::resolveFrameIndexReference(MachineFunction &MF,
+                                                 int FrameIndex,
+                                                 unsigned &FrameReg,
+                                                 int SPAdj,
+                                                 bool IsCalleeSaveOp) const {
+  AArch64MachineFunctionInfo *FuncInfo =
+    MF.getInfo<AArch64MachineFunctionInfo>();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  int64_t TopOfFrameOffset = MFI->getObjectOffset(FrameIndex);
+
+  assert(!(IsCalleeSaveOp && FuncInfo->getInitialStackAdjust() == 0)
+         && "callee-saved register in unexpected place");
+
+  // If the frame for this function is particularly large, we adjust the stack
+  // in two phases which means the callee-save related operations see a
+  // different (intermediate) stack size.
+  int64_t FrameRegPos;
+  if (IsCalleeSaveOp) {
+    FrameReg = AArch64::XSP;
+    FrameRegPos = -static_cast<int64_t>(FuncInfo->getInitialStackAdjust());
+  } else if (useFPForAddressing(MF)) {
+    // Have to use the frame pointer since we have no idea where SP is.
+    FrameReg = AArch64::X29;
+    FrameRegPos = FuncInfo->getFramePointerOffset();
+  } else {
+    FrameReg = AArch64::XSP;
+    FrameRegPos = -static_cast<int64_t>(MFI->getStackSize()) + SPAdj;
+  }
+
+  return TopOfFrameOffset - FrameRegPos;
+}
+
+void
+AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  const AArch64RegisterInfo *RegInfo =
+    static_cast<const AArch64RegisterInfo *>(MF.getTarget().getRegisterInfo());
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64InstrInfo &TII =
+    *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+
+  if (hasFP(MF)) {
+    MF.getRegInfo().setPhysRegUsed(AArch64::X29);
+    MF.getRegInfo().setPhysRegUsed(AArch64::X30);
+  }
+
+  // If addressing of local variables is going to be more complicated than
+  // shoving a base register and an offset into the instruction then we may well
+  // need to scavenge registers. We should either specifically add an
+  // callee-save register for this purpose or allocate an extra spill slot.
+
+  bool BigStack =
+    (RS && MFI->estimateStackSize(MF) >= TII.estimateRSStackLimit(MF))
+    || MFI->hasVarSizedObjects() // Access will be from X29: messes things up
+    || (MFI->adjustsStack() && !hasReservedCallFrame(MF));
+
+  if (!BigStack)
+    return;
+
+  // We certainly need some slack space for the scavenger, preferably an extra
+  // register.
+  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+  uint16_t ExtraReg = AArch64::NoRegister;
+
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    if (AArch64::GPR64RegClass.contains(CSRegs[i]) &&
+        !MF.getRegInfo().isPhysRegUsed(CSRegs[i])) {
+      ExtraReg = CSRegs[i];
+      break;
+    }
+  }
+
+  if (ExtraReg != 0) {
+    MF.getRegInfo().setPhysRegUsed(ExtraReg);
+  } else {
+    // Create a stack slot for scavenging purposes. PrologEpilogInserter
+    // helpfully places it near either SP or FP for us to avoid
+    // infinitely-regression during scavenging.
+    const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
+    RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                       RC->getAlignment(),
+                                                       false));
+  }
+}
+
+bool AArch64FrameLowering::determinePrologueDeath(MachineBasicBlock &MBB,
+                                                  unsigned Reg) const {
+  // If @llvm.returnaddress is called then it will refer to X30 by some means;
+  // the prologue store does not kill the register.
+  if (Reg == AArch64::X30) {
+    if (MBB.getParent()->getFrameInfo()->isReturnAddressTaken()
+        && MBB.getParent()->getRegInfo().isLiveIn(Reg))
+    return false;
+  }
+
+  // In all other cases, physical registers are dead after they've been saved
+  // but live at the beginning of the prologue block.
+  MBB.addLiveIn(Reg);
+  return true;
+}
+
+void
+AArch64FrameLowering::emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI,
+                                      const std::vector<CalleeSavedInfo> &CSI,
+                                      const TargetRegisterInfo *TRI,
+                                      LoadStoreMethod PossClasses[],
+                                      unsigned NumClasses) const {
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  // A certain amount of implicit contract is present here. The actual stack
+  // offsets haven't been allocated officially yet, so for strictly correct code
+  // we rely on the fact that the elements of CSI are allocated in order
+  // starting at SP, purely as dictated by size and alignment. In practice since
+  // this function handles the only accesses to those slots it's not quite so
+  // important.
+  //
+  // We have also ordered the Callee-saved register list in AArch64CallingConv
+  // so that the above scheme puts registers in order: in particular we want
+  // &X30 to be &X29+8 for an ABI-correct frame record (PCS 5.2.2)
+  for (unsigned i = 0, e = CSI.size(); i < e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+
+    // First we need to find out which register class the register belongs to so
+    // that we can use the correct load/store instrucitons.
+    unsigned ClassIdx;
+    for (ClassIdx = 0; ClassIdx < NumClasses; ++ClassIdx) {
+      if (PossClasses[ClassIdx].RegClass->contains(Reg))
+        break;
+    }
+    assert(ClassIdx != NumClasses
+           && "Asked to store register in unexpected class");
+    const TargetRegisterClass &TheClass = *PossClasses[ClassIdx].RegClass;
+
+    // Now we need to decide whether it's possible to emit a paired instruction:
+    // for this we want the next register to be in the same class.
+    MachineInstrBuilder NewMI;
+    bool Pair = false;
+    if (i + 1 < CSI.size() && TheClass.contains(CSI[i+1].getReg())) {
+      Pair = true;
+      unsigned StLow = 0, StHigh = 0;
+      if (isPrologue) {
+        // Most of these registers will be live-in to the MBB and killed by our
+        // store, though there are exceptions (see determinePrologueDeath).
+        StLow = getKillRegState(determinePrologueDeath(MBB, CSI[i+1].getReg()));
+        StHigh = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
+      } else {
+        StLow = RegState::Define;
+        StHigh = RegState::Define;
+      }
+
+      NewMI = BuildMI(MBB, MBBI, DL, TII.get(PossClasses[ClassIdx].PairOpcode))
+                .addReg(CSI[i+1].getReg(), StLow)
+                .addReg(CSI[i].getReg(), StHigh);
+
+      // If it's a paired op, we've consumed two registers
+      ++i;
+    } else {
+      unsigned State;
+      if (isPrologue) {
+        State = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
+      } else {
+        State = RegState::Define;
+      }
+
+      NewMI = BuildMI(MBB, MBBI, DL,
+                      TII.get(PossClasses[ClassIdx].SingleOpcode))
+                .addReg(CSI[i].getReg(), State);
+    }
+
+    // Note that the FrameIdx refers to the second register in a pair: it will
+    // be allocated the smaller numeric address and so is the one an LDP/STP
+    // address must use.
+    int FrameIdx = CSI[i].getFrameIdx();
+    MachineMemOperand::MemOperandFlags Flags;
+    Flags = isPrologue ? MachineMemOperand::MOStore : MachineMemOperand::MOLoad;
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                             Flags,
+                             Pair ? TheClass.getSize() * 2 : TheClass.getSize(),
+                             MFI.getObjectAlignment(FrameIdx));
+
+    NewMI.addFrameIndex(FrameIdx)
+      .addImm(0)                  // address-register offset
+      .addMemOperand(MMO);
+
+    if (isPrologue)
+      NewMI.setMIFlags(MachineInstr::FrameSetup);
+
+    // For aesthetic reasons, during an epilogue we want to emit complementary
+    // operations to the prologue, but in the opposite order. So we still
+    // iterate through the CalleeSavedInfo list in order, but we put the
+    // instructions successively earlier in the MBB.
+    if (!isPrologue)
+      --MBBI;
+  }
+}
+
+bool
+AArch64FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  static LoadStoreMethod PossibleClasses[] = {
+    {&AArch64::GPR64RegClass, AArch64::LSPair64_STR, AArch64::LS64_STR},
+    {&AArch64::FPR64RegClass, AArch64::LSFPPair64_STR, AArch64::LSFP64_STR},
+  };
+  unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
+
+  emitFrameMemOps(/* isPrologue = */ true, MBB, MBBI, CSI, TRI,
+                  PossibleClasses, NumClasses);
+
+  return true;
+}
+
+bool
+AArch64FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+
+  if (CSI.empty())
+    return false;
+
+  static LoadStoreMethod PossibleClasses[] = {
+    {&AArch64::GPR64RegClass, AArch64::LSPair64_LDR, AArch64::LS64_LDR},
+    {&AArch64::FPR64RegClass, AArch64::LSFPPair64_LDR, AArch64::LSFP64_LDR},
+  };
+  unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
+
+  emitFrameMemOps(/* isPrologue = */ false, MBB, MBBI, CSI, TRI,
+                  PossibleClasses, NumClasses);
+
+  return true;
+}
+
+bool
+AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo();
+
+  // This is a decision of ABI compliance. The AArch64 PCS gives various options
+  // for conformance, and even at the most stringent level more or less permits
+  // elimination for leaf functions because there's no loss of functionality
+  // (for debugging etc)..
+  if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->hasCalls())
+    return true;
+
+  // The following are hard-limits: incorrect code will be generated if we try
+  // to omit the frame.
+  return (RI->needsStackRealignment(MF) ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+bool
+AArch64FrameLowering::useFPForAddressing(const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+bool
+AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Of the various reasons for having a frame pointer, it's actually only
+  // variable-sized objects that prevent reservation of a call frame.
+  return !(hasFP(MF) && MFI->hasVarSizedObjects());
+}
+
+void
+AArch64FrameLowering::eliminateCallFramePseudoInstr(
+                                MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const {
+  const AArch64InstrInfo &TII =
+    *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+  DebugLoc dl = MI->getDebugLoc();
+  int Opcode = MI->getOpcode();
+  bool IsDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+  uint64_t CalleePopAmount = IsDestroy ? MI->getOperand(1).getImm() : 0;
+
+  if (!hasReservedCallFrame(MF)) {
+    unsigned Align = getStackAlignment();
+
+    int64_t Amount = MI->getOperand(0).getImm();
+    Amount = RoundUpToAlignment(Amount, Align);
+    if (!IsDestroy) Amount = -Amount;
+
+    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+    // doesn't have to pop anything), then the first operand will be zero too so
+    // this adjustment is a no-op.
+    if (CalleePopAmount == 0) {
+      // FIXME: in-function stack adjustment for calls is limited to 12-bits
+      // because there's no guaranteed temporary register available. Mostly call
+      // frames will be allocated at the start of a function so this is OK, but
+      // it is a limitation that needs dealing with.
+      assert(Amount > -0xfff && Amount < 0xfff && "call frame too large");
+      emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, Amount);
+    }
+  } else if (CalleePopAmount != 0) {
+    // If the calling convention demands that the callee pops arguments from the
+    // stack, we want to add it back if we have a reserved call frame.
+    assert(CalleePopAmount < 0xfff && "call frame too large");
+    emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, -CalleePopAmount);
+  }
+
+  MBB.erase(MI);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
new file mode 100644
index 000000000000..45ea0ec8e071
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -0,0 +1,108 @@
+//==- AArch64FrameLowering.h - Define frame lowering for AArch64 -*- C++ -*--=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements the AArch64-specific parts of the TargetFrameLowering
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64_FRAMEINFO_H
+#define LLVM_AARCH64_FRAMEINFO_H
+
+#include "AArch64Subtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class AArch64Subtarget;
+
+class AArch64FrameLowering : public TargetFrameLowering {
+private:
+  // In order to unify the spilling and restoring of callee-saved registers into
+  // emitFrameMemOps, we need to be able to specify which instructions to use
+  // for the relevant memory operations on each register class. An array of the
+  // following struct is populated and passed in to achieve this.
+  struct LoadStoreMethod {
+    const TargetRegisterClass *RegClass; // E.g. GPR64RegClass
+
+    // The preferred instruction.
+    unsigned PairOpcode; // E.g. LSPair64_STR
+
+    // Sometimes only a single register can be handled at once.
+    unsigned SingleOpcode; // E.g. LS64_STR
+  };
+protected:
+  const AArch64Subtarget &STI;
+
+public:
+  explicit AArch64FrameLowering(const AArch64Subtarget &sti)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 16, 0, 16),
+      STI(sti) {
+  }
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  virtual void emitPrologue(MachineFunction &MF) const;
+  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  /// Decides how much stack adjustment to perform in each phase of the prologue
+  /// and epilogue.
+  void splitSPAdjustments(uint64_t Total, uint64_t &Initial,
+                          uint64_t &Residual) const;
+
+  int64_t resolveFrameIndexReference(MachineFunction &MF, int FrameIndex,
+                                     unsigned &FrameReg, int SPAdj,
+                                     bool IsCalleeSaveOp) const;
+
+  virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                    RegScavenger *RS) const;
+
+  virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const;
+  virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI) const;
+
+  /// If the register is X30 (i.e. LR) and the return address is used in the
+  /// function then the callee-save store doesn't actually kill the register,
+  /// otherwise it does.
+  bool determinePrologueDeath(MachineBasicBlock &MBB, unsigned Reg) const;
+
+  /// This function emits the loads or stores required during prologue and
+  /// epilogue as efficiently as possible.
+  ///
+  /// The operations involved in setting up and tearing down the frame are
+  /// similar enough to warrant a shared function, particularly as discrepancies
+  /// between the two would be disastrous.
+  void emitFrameMemOps(bool isStore, MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator MI,
+                       const std::vector<CalleeSavedInfo> &CSI,
+                       const TargetRegisterInfo *TRI,
+                       LoadStoreMethod PossibleClasses[],
+                       unsigned NumClasses) const;
+
+
+  virtual bool hasFP(const MachineFunction &MF) const;
+
+  virtual bool useFPForAddressing(const MachineFunction &MF) const;
+
+  /// On AA
+  virtual bool hasReservedCallFrame(const MachineFunction &MF) const;
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..46b822152a00
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -0,0 +1,415 @@
+//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the AArch64 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64-isel"
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===--------------------------------------------------------------------===//
+/// AArch64 specific code to select AArch64 machine instructions for
+/// SelectionDAG operations.
+///
+namespace {
+
+class AArch64DAGToDAGISel : public SelectionDAGISel {
+  AArch64TargetMachine &TM;
+  const AArch64InstrInfo *TII;
+
+  /// Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
+
+public:
+  explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
+                               CodeGenOpt::Level OptLevel)
+    : SelectionDAGISel(tm, OptLevel), TM(tm),
+      TII(static_cast<const AArch64InstrInfo*>(TM.getInstrInfo())),
+      Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
+  }
+
+  virtual const char *getPassName() const {
+    return "AArch64 Instruction Selection";
+  }
+
+  // Include the pieces autogenerated from the target description.
+#include "AArch64GenDAGISel.inc"
+
+  template<unsigned MemSize>
+  bool SelectOffsetUImm12(SDValue N, SDValue &UImm12) {
+    const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+    if (!CN || CN->getZExtValue() % MemSize != 0
+        || CN->getZExtValue() / MemSize > 0xfff)
+      return false;
+
+    UImm12 =  CurDAG->getTargetConstant(CN->getZExtValue() / MemSize, MVT::i64);
+    return true;
+  }
+
+  template<unsigned RegWidth>
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
+    return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
+  }
+
+  bool SelectFPZeroOperand(SDValue N, SDValue &Dummy);
+
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
+                                unsigned RegWidth);
+
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps);
+
+  bool SelectLogicalImm(SDValue N, SDValue &Imm);
+
+  template<unsigned RegWidth>
+  bool SelectTSTBOperand(SDValue N, SDValue &FixedPos) {
+    return SelectTSTBOperand(N, FixedPos, RegWidth);
+  }
+
+  bool SelectTSTBOperand(SDValue N, SDValue &FixedPos, unsigned RegWidth);
+
+  SDNode *TrySelectToMoveImm(SDNode *N);
+  SDNode *LowerToFPLitPool(SDNode *Node);
+  SDNode *SelectToLitPool(SDNode *N);
+
+  SDNode* Select(SDNode*);
+private:
+};
+}
+
+bool
+AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
+                                              unsigned RegWidth) {
+  const ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+  if (!CN) return false;
+
+  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
+  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
+  // x-register.
+  //
+  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
+  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
+  // integers.
+  bool IsExact;
+
+  // fbits is between 1 and 64 in the worst-case, which means the fmul
+  // could have 2^64 as an actual operand. Need 65 bits of precision.
+  APSInt IntVal(65, true);
+  CN->getValueAPF().convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+
+  // N.b. isPowerOf2 also checks for > 0.
+  if (!IsExact || !IntVal.isPowerOf2()) return false;
+  unsigned FBits = IntVal.logBase2();
+
+  // Checks above should have guaranteed that we haven't lost information in
+  // finding FBits, but it must still be in range.
+  if (FBits == 0 || FBits > RegWidth) return false;
+
+  FixedPos = CurDAG->getTargetConstant(64 - FBits, MVT::i32);
+  return true;
+}
+
+bool
+AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                 char ConstraintCode,
+                                                 std::vector<SDValue> &OutOps) {
+  switch (ConstraintCode) {
+  default: llvm_unreachable("Unrecognised AArch64 memory constraint");
+  case 'm':
+    // FIXME: more freedom is actually permitted for 'm'. We can go
+    // hunting for a base and an offset if we want. Of course, since
+    // we don't really know how the operand is going to be used we're
+    // probably restricted to the load/store pair's simm7 as an offset
+    // range anyway.
+  case 'Q':
+    OutOps.push_back(Op);
+  }
+
+  return false;
+}
+
+bool
+AArch64DAGToDAGISel::SelectFPZeroOperand(SDValue N, SDValue &Dummy) {
+  ConstantFPSDNode *Imm = dyn_cast<ConstantFPSDNode>(N);
+  if (!Imm || !Imm->getValueAPF().isPosZero())
+    return false;
+
+  // Doesn't actually carry any information, but keeps TableGen quiet.
+  Dummy = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) {
+  uint32_t Bits;
+  uint32_t RegWidth = N.getValueType().getSizeInBits();
+
+  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+  if (!CN) return false;
+
+  if (!A64Imms::isLogicalImm(RegWidth, CN->getZExtValue(), Bits))
+    return false;
+
+  Imm = CurDAG->getTargetConstant(Bits, MVT::i32);
+  return true;
+}
+
+SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) {
+  SDNode *ResNode;
+  DebugLoc dl = Node->getDebugLoc();
+  EVT DestType = Node->getValueType(0);
+  unsigned DestWidth = DestType.getSizeInBits();
+
+  unsigned MOVOpcode;
+  EVT MOVType;
+  int UImm16, Shift;
+  uint32_t LogicalBits;
+
+  uint64_t BitPat = cast<ConstantSDNode>(Node)->getZExtValue();
+  if (A64Imms::isMOVZImm(DestWidth, BitPat, UImm16, Shift)) {
+    MOVType = DestType;
+    MOVOpcode = DestWidth == 64 ? AArch64::MOVZxii : AArch64::MOVZwii;
+  } else if (A64Imms::isMOVNImm(DestWidth, BitPat, UImm16, Shift)) {
+    MOVType = DestType;
+    MOVOpcode = DestWidth == 64 ? AArch64::MOVNxii : AArch64::MOVNwii;
+  } else if (DestWidth == 64 && A64Imms::isMOVNImm(32, BitPat, UImm16, Shift)) {
+    // To get something like 0x0000_0000_ffff_1234 into a 64-bit register we can
+    // use a 32-bit instruction: "movn w0, 0xedbc".
+    MOVType = MVT::i32;
+    MOVOpcode = AArch64::MOVNwii;
+  } else if (A64Imms::isLogicalImm(DestWidth, BitPat, LogicalBits))  {
+    MOVOpcode = DestWidth == 64 ? AArch64::ORRxxi : AArch64::ORRwwi;
+    uint16_t ZR = DestWidth == 64 ? AArch64::XZR : AArch64::WZR;
+
+    return CurDAG->getMachineNode(MOVOpcode, dl, DestType,
+                              CurDAG->getRegister(ZR, DestType),
+                              CurDAG->getTargetConstant(LogicalBits, MVT::i32));
+  } else {
+    // Can't handle it in one instruction. There's scope for permitting two (or
+    // more) instructions, but that'll need more thought.
+    return NULL;
+  }
+
+  ResNode = CurDAG->getMachineNode(MOVOpcode, dl, MOVType,
+                                   CurDAG->getTargetConstant(UImm16, MVT::i32),
+                                   CurDAG->getTargetConstant(Shift, MVT::i32));
+
+  if (MOVType != DestType) {
+    ResNode = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
+                          MVT::i64, MVT::i32, MVT::Other,
+                          CurDAG->getTargetConstant(0, MVT::i64),
+                          SDValue(ResNode, 0),
+                          CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32));
+  }
+
+  return ResNode;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
+  DebugLoc DL = Node->getDebugLoc();
+  uint64_t UnsignedVal = cast<ConstantSDNode>(Node)->getZExtValue();
+  int64_t SignedVal = cast<ConstantSDNode>(Node)->getSExtValue();
+  EVT DestType = Node->getValueType(0);
+  EVT PtrVT = TLI.getPointerTy();
+
+  // Since we may end up loading a 64-bit constant from a 32-bit entry the
+  // constant in the pool may have a different type to the eventual node.
+  ISD::LoadExtType Extension;
+  EVT MemType;
+
+  assert((DestType == MVT::i64 || DestType == MVT::i32)
+         && "Only expect integer constants at the moment");
+
+  if (DestType == MVT::i32) {
+    Extension = ISD::NON_EXTLOAD;
+    MemType = MVT::i32;
+  } else if (UnsignedVal <= UINT32_MAX) {
+    Extension = ISD::ZEXTLOAD;
+    MemType = MVT::i32;
+  } else if (SignedVal >= INT32_MIN && SignedVal <= INT32_MAX) {
+    Extension = ISD::SEXTLOAD;
+    MemType = MVT::i32;
+  } else {
+    Extension = ISD::NON_EXTLOAD;
+    MemType = MVT::i64;
+  }
+
+  Constant *CV = ConstantInt::get(Type::getIntNTy(*CurDAG->getContext(),
+                                                  MemType.getSizeInBits()),
+                                  UnsignedVal);
+  SDValue PoolAddr;
+  unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(CV->getType());
+  PoolAddr = CurDAG->getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
+                             CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0,
+                                                         AArch64II::MO_NO_FLAG),
+                             CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0,
+                                                           AArch64II::MO_LO12),
+                             CurDAG->getConstant(Alignment, MVT::i32));
+
+  return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(),
+                            PoolAddr,
+                            MachinePointerInfo::getConstantPool(), MemType,
+                            /* isVolatile = */ false,
+                            /* isNonTemporal = */ false,
+                            Alignment).getNode();
+}
+
+SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) {
+  DebugLoc DL = Node->getDebugLoc();
+  const ConstantFP *FV = cast<ConstantFPSDNode>(Node)->getConstantFPValue();
+  EVT PtrVT = TLI.getPointerTy();
+  EVT DestType = Node->getValueType(0);
+
+  unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(FV->getType());
+  SDValue PoolAddr;
+
+  assert(TM.getCodeModel() == CodeModel::Small &&
+         "Only small code model supported");
+  PoolAddr = CurDAG->getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
+                             CurDAG->getTargetConstantPool(FV, PtrVT, 0, 0,
+                                                         AArch64II::MO_NO_FLAG),
+                             CurDAG->getTargetConstantPool(FV, PtrVT, 0, 0,
+                                                           AArch64II::MO_LO12),
+                             CurDAG->getConstant(Alignment, MVT::i32));
+
+  return CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr,
+                         MachinePointerInfo::getConstantPool(),
+                         /* isVolatile = */ false,
+                         /* isNonTemporal = */ false,
+                         /* isInvariant = */ true,
+                         Alignment).getNode();
+}
+
+bool
+AArch64DAGToDAGISel::SelectTSTBOperand(SDValue N, SDValue &FixedPos,
+                                       unsigned RegWidth) {
+  const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+  if (!CN) return false;
+
+  uint64_t Val = CN->getZExtValue();
+
+  if (!isPowerOf2_64(Val)) return false;
+
+  unsigned TestedBit = Log2_64(Val);
+  // Checks above should have guaranteed that we haven't lost information in
+  // finding TestedBit, but it must still be in range.
+  if (TestedBit >= RegWidth) return false;
+
+  FixedPos = CurDAG->getTargetConstant(TestedBit, MVT::i64);
+  return true;
+}
+
+SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
+  // Dump information about the Node being selected
+  DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
+
+  if (Node->isMachineOpcode()) {
+    DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
+    return NULL;
+  }
+
+  switch (Node->getOpcode()) {
+  case ISD::FrameIndex: {
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    EVT PtrTy = TLI.getPointerTy();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy);
+    return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy,
+                                TFI, CurDAG->getTargetConstant(0, PtrTy));
+  }
+  case ISD::ConstantPool: {
+    // Constant pools are fine, just create a Target entry.
+    ConstantPoolSDNode *CN = cast<ConstantPoolSDNode>(Node);
+    const Constant *C = CN->getConstVal();
+    SDValue CP = CurDAG->getTargetConstantPool(C, CN->getValueType(0));
+
+    ReplaceUses(SDValue(Node, 0), CP);
+    return NULL;
+  }
+  case ISD::Constant: {
+    SDNode *ResNode = 0;
+    if (cast<ConstantSDNode>(Node)->getZExtValue() == 0) {
+      // XZR and WZR are probably even better than an actual move: most of the
+      // time they can be folded into another instruction with *no* cost.
+
+      EVT Ty = Node->getValueType(0);
+      assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type");
+      uint16_t Register = Ty == MVT::i32 ? AArch64::WZR : AArch64::XZR;
+      ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                       Node->getDebugLoc(),
+                                       Register, Ty).getNode();
+    }
+
+    // Next best option is a move-immediate, see if we can do that.
+    if (!ResNode) {
+      ResNode = TrySelectToMoveImm(Node);
+    }
+
+    if (ResNode)
+      return ResNode;
+
+    // If even that fails we fall back to a lit-pool entry at the moment. Future
+    // tuning may change this to a sequence of MOVZ/MOVN/MOVK instructions.
+    ResNode = SelectToLitPool(Node);
+    assert(ResNode && "We need *some* way to materialise a constant");
+
+    // We want to continue selection at this point since the litpool access
+    // generated used generic nodes for simplicity.
+    ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+    Node = ResNode;
+    break;
+  }
+  case ISD::ConstantFP: {
+    if (A64Imms::isFPImm(cast<ConstantFPSDNode>(Node)->getValueAPF())) {
+      // FMOV will take care of it from TableGen
+      break;
+    }
+
+    SDNode *ResNode = LowerToFPLitPool(Node);
+    ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+
+    // We want to continue selection at this point since the litpool access
+    // generated used generic nodes for simplicity.
+    Node = ResNode;
+    break;
+  }
+  default:
+    break; // Let generic code handle it
+  }
+
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(dbgs() << "=> ";
+        if (ResNode == NULL || ResNode == Node)
+          Node->dump(CurDAG);
+        else
+          ResNode->dump(CurDAG);
+        dbgs() << "\n");
+
+  return ResNode;
+}
+
+/// This pass converts a legalized DAG into a AArch64-specific DAG, ready for
+/// instruction scheduling.
+FunctionPass *llvm::createAArch64ISelDAG(AArch64TargetMachine &TM,
+                                         CodeGenOpt::Level OptLevel) {
+  return new AArch64DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
new file mode 100644
index 000000000000..e9f449709c40
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -0,0 +1,2975 @@
+//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AArch64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64-isel"
+#include "AArch64.h"
+#include "AArch64ISelLowering.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
+#include "AArch64TargetObjectFile.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallingConv.h"
+
+using namespace llvm;
+
+static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
+  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+
+  if (Subtarget->isTargetLinux())
+    return new AArch64LinuxTargetObjectFile();
+  if (Subtarget->isTargetELF())
+    return new TargetLoweringObjectFileELF();
+  llvm_unreachable("unknown subtarget type");
+}
+
+
+AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
+  : TargetLowering(TM, createTLOF(TM)),
+    Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
+    RegInfo(TM.getRegisterInfo()),
+    Itins(TM.getInstrItineraryData()) {
+
+  // SIMD compares set the entire lane's bits to 1
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // Scalar register <-> type mapping
+  addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
+  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
+  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
+  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
+  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+
+  computeRegisterProperties();
+
+  // Some atomic operations can be folded into load-acquire or store-release
+  // instructions on AArch64. It's marginally simpler to let LLVM expand
+  // everything out to a barrier and then recombine the (few) barriers we can.
+  setInsertFencesForAtomic(true);
+  setTargetDAGCombine(ISD::ATOMIC_FENCE);
+  setTargetDAGCombine(ISD::ATOMIC_STORE);
+
+  // We combine OR nodes for bitfield and NEON BSL operations.
+  setTargetDAGCombine(ISD::OR);
+
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::SRA);
+
+  // AArch64 does not have i1 loads, or much of anything for i1 really.
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+
+  setStackPointerRegisterToSaveRestore(AArch64::XSP);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+
+  // We'll lower globals to wrappers for selection.
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+
+  // A64 instructions have the comparison predicate attached to the user of the
+  // result, but having a separate comparison is valuable for matching.
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+
+  setOperationAction(ISD::SELECT, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT, MVT::f64, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+  setOperationAction(ISD::SETCC, MVT::i64, Custom);
+  setOperationAction(ISD::SETCC, MVT::f32, Custom);
+  setOperationAction(ISD::SETCC, MVT::f64, Custom);
+
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::VAARG, MVT::Other, Expand);
+
+  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
+
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
+
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+  // Legal floating-point operations.
+  setOperationAction(ISD::FABS, MVT::f32, Legal);
+  setOperationAction(ISD::FABS, MVT::f64, Legal);
+
+  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+  setOperationAction(ISD::FCEIL, MVT::f64, Legal);
+
+  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+
+  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+
+  setOperationAction(ISD::FNEG, MVT::f32, Legal);
+  setOperationAction(ISD::FNEG, MVT::f64, Legal);
+
+  setOperationAction(ISD::FRINT, MVT::f32, Legal);
+  setOperationAction(ISD::FRINT, MVT::f64, Legal);
+
+  setOperationAction(ISD::FSQRT, MVT::f32, Legal);
+  setOperationAction(ISD::FSQRT, MVT::f64, Legal);
+
+  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f128, Legal);
+
+  // Illegal floating-point operations.
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+
+  setOperationAction(ISD::FCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FCOS, MVT::f64, Expand);
+
+  setOperationAction(ISD::FEXP, MVT::f32, Expand);
+  setOperationAction(ISD::FEXP, MVT::f64, Expand);
+
+  setOperationAction(ISD::FEXP2, MVT::f32, Expand);
+  setOperationAction(ISD::FEXP2, MVT::f64, Expand);
+
+  setOperationAction(ISD::FLOG, MVT::f32, Expand);
+  setOperationAction(ISD::FLOG, MVT::f64, Expand);
+
+  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
+  setOperationAction(ISD::FLOG2, MVT::f64, Expand);
+
+  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f64, Expand);
+
+  setOperationAction(ISD::FPOW, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW, MVT::f64, Expand);
+
+  setOperationAction(ISD::FPOWI, MVT::f32, Expand);
+  setOperationAction(ISD::FPOWI, MVT::f64, Expand);
+
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+
+  setOperationAction(ISD::FSIN, MVT::f32, Expand);
+  setOperationAction(ISD::FSIN, MVT::f64, Expand);
+
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+
+  // Virtually no operation on f128 is legal, but LLVM can't expand them when
+  // there's a valid register class, so we need custom operations in most cases.
+  setOperationAction(ISD::FABS,       MVT::f128, Expand);
+  setOperationAction(ISD::FADD,       MVT::f128, Custom);
+  setOperationAction(ISD::FCOPYSIGN,  MVT::f128, Expand);
+  setOperationAction(ISD::FCOS,       MVT::f128, Expand);
+  setOperationAction(ISD::FDIV,       MVT::f128, Custom);
+  setOperationAction(ISD::FMA,        MVT::f128, Expand);
+  setOperationAction(ISD::FMUL,       MVT::f128, Custom);
+  setOperationAction(ISD::FNEG,       MVT::f128, Expand);
+  setOperationAction(ISD::FP_EXTEND,  MVT::f128, Expand);
+  setOperationAction(ISD::FP_ROUND,   MVT::f128, Expand);
+  setOperationAction(ISD::FPOW,       MVT::f128, Expand);
+  setOperationAction(ISD::FREM,       MVT::f128, Expand);
+  setOperationAction(ISD::FRINT,      MVT::f128, Expand);
+  setOperationAction(ISD::FSIN,       MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS,    MVT::f128, Expand);
+  setOperationAction(ISD::FSQRT,      MVT::f128, Expand);
+  setOperationAction(ISD::FSUB,       MVT::f128, Custom);
+  setOperationAction(ISD::FTRUNC,     MVT::f128, Expand);
+  setOperationAction(ISD::SETCC,      MVT::f128, Custom);
+  setOperationAction(ISD::BR_CC,      MVT::f128, Custom);
+  setOperationAction(ISD::SELECT,     MVT::f128, Expand);
+  setOperationAction(ISD::SELECT_CC,  MVT::f128, Custom);
+  setOperationAction(ISD::FP_EXTEND,  MVT::f128, Custom);
+
+  // Lowering for many of the conversions is actually specified by the non-f128
+  // type. The LowerXXX function will be trivial when f128 isn't involved.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+  setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
+  setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
+
+  // This prevents LLVM trying to compress double constants into a floating
+  // constant-pool entry and trying to load from there. It's of doubtful benefit
+  // for A64: we'd need LDR followed by FCVT, I believe.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
+
+  setExceptionPointerRegister(AArch64::X0);
+  setExceptionSelectorRegister(AArch64::X1);
+}
+
+EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const {
+  // It's reasonably important that this value matches the "natural" legal
+  // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
+  // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
+  if (!VT.isVector()) return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+static void getExclusiveOperation(unsigned Size, unsigned &ldrOpc,
+                                  unsigned &strOpc) {
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for atomic binary op!");
+  case 1:
+    ldrOpc = AArch64::LDXR_byte;
+    strOpc = AArch64::STXR_byte;
+    break;
+  case 2:
+    ldrOpc = AArch64::LDXR_hword;
+    strOpc = AArch64::STXR_hword;
+    break;
+  case 4:
+    ldrOpc = AArch64::LDXR_word;
+    strOpc = AArch64::STXR_word;
+    break;
+  case 8:
+    ldrOpc = AArch64::LDXR_dword;
+    strOpc = AArch64::STXR_dword;
+    break;
+  }
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                        unsigned Size,
+                                        unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptr = MI->getOperand(1).getReg();
+  unsigned incr = MI->getOperand(2).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  unsigned ldrOpc, strOpc;
+  getExclusiveOperation(Size, ldrOpc, strOpc);
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  const TargetRegisterClass *TRC
+    = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+  unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldxr dest, ptr
+  //   <binop> scratch, dest, incr
+  //   stxr stxr_status, scratch, ptr
+  //   cbnz stxr_status, loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+  if (BinOpcode) {
+    // All arithmetic operations we'll be creating are designed to take an extra
+    // shift or extend operand, which we can conveniently set to zero.
+
+    // Operand order needs to go the other way for NAND.
+    if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl)
+      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
+        .addReg(incr).addReg(dest).addImm(0);
+    else
+      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
+        .addReg(dest).addReg(incr).addImm(0);
+  }
+
+  // From the stxr, the register is GPR32; from the cmp it's GPR32wsp
+  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
+
+  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
+  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
+    .addReg(stxr_status).addMBB(loopMBB);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
+                                              MachineBasicBlock *BB,
+                                              unsigned Size,
+                                              unsigned CmpOp,
+                                              A64CC::CondCodes Cond) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptr = MI->getOperand(1).getReg();
+  unsigned incr = MI->getOperand(2).getReg();
+  unsigned oldval = dest;
+  DebugLoc dl = MI->getDebugLoc();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *TRC, *TRCsp;
+  if (Size == 8) {
+    TRC = &AArch64::GPR64RegClass;
+    TRCsp = &AArch64::GPR64xspRegClass;
+  } else {
+    TRC = &AArch64::GPR32RegClass;
+    TRCsp = &AArch64::GPR32wspRegClass;
+  }
+
+  unsigned ldrOpc, strOpc;
+  getExclusiveOperation(Size, ldrOpc, strOpc);
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  unsigned scratch = MRI.createVirtualRegister(TRC);
+  MRI.constrainRegClass(scratch, TRCsp);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldxr dest, ptr
+  //   cmp incr, dest (, sign extend if necessary)
+  //   csel scratch, dest, incr, cond
+  //   stxr stxr_status, scratch, ptr
+  //   cbnz stxr_status, loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+
+  // Build compare and cmov instructions.
+  MRI.constrainRegClass(incr, TRCsp);
+  BuildMI(BB, dl, TII->get(CmpOp))
+    .addReg(incr).addReg(oldval).addImm(0);
+
+  BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc),
+          scratch)
+    .addReg(oldval).addReg(incr).addImm(Cond);
+
+  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
+
+  BuildMI(BB, dl, TII->get(strOpc), stxr_status)
+    .addReg(scratch).addReg(ptr);
+  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
+    .addReg(stxr_status).addMBB(loopMBB);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
+                                         MachineBasicBlock *BB,
+                                         unsigned Size) const {
+  unsigned dest    = MI->getOperand(0).getReg();
+  unsigned ptr     = MI->getOperand(1).getReg();
+  unsigned oldval  = MI->getOperand(2).getReg();
+  unsigned newval  = MI->getOperand(3).getReg();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *TRCsp;
+  TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;
+
+  unsigned ldrOpc, strOpc;
+  getExclusiveOperation(Size, ldrOpc, strOpc);
+
+  MachineFunction *MF = BB->getParent();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It; // insert the new blocks after the current block
+
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loop1MBB
+  BB->addSuccessor(loop1MBB);
+
+  // loop1MBB:
+  //   ldxr dest, [ptr]
+  //   cmp dest, oldval
+  //   b.ne exitMBB
+  BB = loop1MBB;
+  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+
+  unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
+  MRI.constrainRegClass(dest, TRCsp);
+  BuildMI(BB, dl, TII->get(CmpOp))
+    .addReg(dest).addReg(oldval).addImm(0);
+  BuildMI(BB, dl, TII->get(AArch64::Bcc))
+    .addImm(A64CC::NE).addMBB(exitMBB);
+  BB->addSuccessor(loop2MBB);
+  BB->addSuccessor(exitMBB);
+
+  // loop2MBB:
+  //   strex stxr_status, newval, [ptr]
+  //   cbnz stxr_status, loop1MBB
+  BB = loop2MBB;
+  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
+
+  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
+  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
+    .addReg(stxr_status).addMBB(loop1MBB);
+  BB->addSuccessor(loop1MBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
+                                    MachineBasicBlock *MBB) const {
+  // We materialise the F128CSEL pseudo-instruction using conditional branches
+  // and loads, giving an instruciton sequence like:
+  //     str q0, [sp]
+  //     b.ne IfTrue
+  //     b Finish
+  // IfTrue:
+  //     str q1, [sp]
+  // Finish:
+  //     ldr q0, [sp]
+  //
+  // Using virtual registers would probably not be beneficial since COPY
+  // instructions are expensive for f128 (there's no actual instruction to
+  // implement them).
+  //
+  // An alternative would be to do an integer-CSEL on some address. E.g.:
+  //     mov x0, sp
+  //     add x1, sp, #16
+  //     str q0, [x0]
+  //     str q1, [x1]
+  //     csel x0, x0, x1, ne
+  //     ldr q0, [x0]
+  //
+  // It's unclear which approach is actually optimal.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineFunction *MF = MBB->getParent();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction::iterator It = MBB;
+  ++It;
+
+  unsigned DestReg = MI->getOperand(0).getReg();
+  unsigned IfTrueReg = MI->getOperand(1).getReg();
+  unsigned IfFalseReg = MI->getOperand(2).getReg();
+  unsigned CondCode = MI->getOperand(3).getImm();
+  bool NZCVKilled = MI->getOperand(4).isKill();
+
+  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, TrueBB);
+  MF->insert(It, EndBB);
+
+  // Transfer rest of current basic-block to EndBB
+  EndBB->splice(EndBB->begin(), MBB,
+                llvm::next(MachineBasicBlock::iterator(MI)),
+                MBB->end());
+  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // We need somewhere to store the f128 value needed.
+  int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16);
+
+  //     [... start of incoming MBB ...]
+  //     str qIFFALSE, [sp]
+  //     b.cc IfTrue
+  //     b Done
+  BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR))
+    .addReg(IfFalseReg)
+    .addFrameIndex(ScratchFI)
+    .addImm(0);
+  BuildMI(MBB, DL, TII->get(AArch64::Bcc))
+    .addImm(CondCode)
+    .addMBB(TrueBB);
+  BuildMI(MBB, DL, TII->get(AArch64::Bimm))
+    .addMBB(EndBB);
+  MBB->addSuccessor(TrueBB);
+  MBB->addSuccessor(EndBB);
+
+  // IfTrue:
+  //     str qIFTRUE, [sp]
+  BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
+    .addReg(IfTrueReg)
+    .addFrameIndex(ScratchFI)
+    .addImm(0);
+
+  // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the
+  // blocks.
+  TrueBB->addSuccessor(EndBB);
+
+  // Done:
+  //     ldr qDEST, [sp]
+  //     [... rest of incoming MBB ...]
+  if (!NZCVKilled)
+    EndBB->addLiveIn(AArch64::NZCV);
+  MachineInstr *StartOfEnd = EndBB->begin();
+  BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
+    .addFrameIndex(ScratchFI)
+    .addImm(0);
+
+  MI->eraseFromParent();
+  return EndBB;
+}
+
+MachineBasicBlock *
+AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *MBB) const {
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unhandled instruction with custom inserter");
+  case AArch64::F128CSEL:
+    return EmitF128CSEL(MI, MBB);
+  case AArch64::ATOMIC_LOAD_ADD_I8:
+    return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl);
+  case AArch64::ATOMIC_LOAD_ADD_I16:
+    return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl);
+  case AArch64::ATOMIC_LOAD_ADD_I32:
+    return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl);
+  case AArch64::ATOMIC_LOAD_ADD_I64:
+    return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl);
+
+  case AArch64::ATOMIC_LOAD_SUB_I8:
+    return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl);
+  case AArch64::ATOMIC_LOAD_SUB_I16:
+    return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl);
+  case AArch64::ATOMIC_LOAD_SUB_I32:
+    return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl);
+  case AArch64::ATOMIC_LOAD_SUB_I64:
+    return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl);
+
+  case AArch64::ATOMIC_LOAD_AND_I8:
+    return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl);
+  case AArch64::ATOMIC_LOAD_AND_I16:
+    return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl);
+  case AArch64::ATOMIC_LOAD_AND_I32:
+    return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl);
+  case AArch64::ATOMIC_LOAD_AND_I64:
+    return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl);
+
+  case AArch64::ATOMIC_LOAD_OR_I8:
+    return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl);
+  case AArch64::ATOMIC_LOAD_OR_I16:
+    return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl);
+  case AArch64::ATOMIC_LOAD_OR_I32:
+    return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl);
+  case AArch64::ATOMIC_LOAD_OR_I64:
+    return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl);
+
+  case AArch64::ATOMIC_LOAD_XOR_I8:
+    return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl);
+  case AArch64::ATOMIC_LOAD_XOR_I16:
+    return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl);
+  case AArch64::ATOMIC_LOAD_XOR_I32:
+    return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl);
+  case AArch64::ATOMIC_LOAD_XOR_I64:
+    return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl);
+
+  case AArch64::ATOMIC_LOAD_NAND_I8:
+    return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl);
+  case AArch64::ATOMIC_LOAD_NAND_I16:
+    return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl);
+  case AArch64::ATOMIC_LOAD_NAND_I32:
+    return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl);
+  case AArch64::ATOMIC_LOAD_NAND_I64:
+    return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl);
+
+  case AArch64::ATOMIC_LOAD_MIN_I8:
+    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT);
+  case AArch64::ATOMIC_LOAD_MIN_I16:
+    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT);
+  case AArch64::ATOMIC_LOAD_MIN_I32:
+    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT);
+  case AArch64::ATOMIC_LOAD_MIN_I64:
+    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT);
+
+  case AArch64::ATOMIC_LOAD_MAX_I8:
+    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT);
+  case AArch64::ATOMIC_LOAD_MAX_I16:
+    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT);
+  case AArch64::ATOMIC_LOAD_MAX_I32:
+    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT);
+  case AArch64::ATOMIC_LOAD_MAX_I64:
+    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT);
+
+  case AArch64::ATOMIC_LOAD_UMIN_I8:
+    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI);
+  case AArch64::ATOMIC_LOAD_UMIN_I16:
+    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI);
+  case AArch64::ATOMIC_LOAD_UMIN_I32:
+    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI);
+  case AArch64::ATOMIC_LOAD_UMIN_I64:
+    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI);
+
+  case AArch64::ATOMIC_LOAD_UMAX_I8:
+    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO);
+  case AArch64::ATOMIC_LOAD_UMAX_I16:
+    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO);
+  case AArch64::ATOMIC_LOAD_UMAX_I32:
+    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO);
+  case AArch64::ATOMIC_LOAD_UMAX_I64:
+    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO);
+
+  case AArch64::ATOMIC_SWAP_I8:
+    return emitAtomicBinary(MI, MBB, 1, 0);
+  case AArch64::ATOMIC_SWAP_I16:
+    return emitAtomicBinary(MI, MBB, 2, 0);
+  case AArch64::ATOMIC_SWAP_I32:
+    return emitAtomicBinary(MI, MBB, 4, 0);
+  case AArch64::ATOMIC_SWAP_I64:
+    return emitAtomicBinary(MI, MBB, 8, 0);
+
+  case AArch64::ATOMIC_CMP_SWAP_I8:
+    return emitAtomicCmpSwap(MI, MBB, 1);
+  case AArch64::ATOMIC_CMP_SWAP_I16:
+    return emitAtomicCmpSwap(MI, MBB, 2);
+  case AArch64::ATOMIC_CMP_SWAP_I32:
+    return emitAtomicCmpSwap(MI, MBB, 4);
+  case AArch64::ATOMIC_CMP_SWAP_I64:
+    return emitAtomicCmpSwap(MI, MBB, 8);
+  }
+}
+
+
+const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  case AArch64ISD::BR_CC:          return "AArch64ISD::BR_CC";
+  case AArch64ISD::Call:           return "AArch64ISD::Call";
+  case AArch64ISD::FPMOV:          return "AArch64ISD::FPMOV";
+  case AArch64ISD::GOTLoad:        return "AArch64ISD::GOTLoad";
+  case AArch64ISD::BFI:            return "AArch64ISD::BFI";
+  case AArch64ISD::EXTR:           return "AArch64ISD::EXTR";
+  case AArch64ISD::Ret:            return "AArch64ISD::Ret";
+  case AArch64ISD::SBFX:           return "AArch64ISD::SBFX";
+  case AArch64ISD::SELECT_CC:      return "AArch64ISD::SELECT_CC";
+  case AArch64ISD::SETCC:          return "AArch64ISD::SETCC";
+  case AArch64ISD::TC_RETURN:      return "AArch64ISD::TC_RETURN";
+  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
+  case AArch64ISD::TLSDESCCALL:    return "AArch64ISD::TLSDESCCALL";
+  case AArch64ISD::WrapperSmall:   return "AArch64ISD::WrapperSmall";
+
+  default:                       return NULL;
+  }
+}
+
+static const uint16_t AArch64FPRArgRegs[] = {
+  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
+  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7
+};
+static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs);
+
+static const uint16_t AArch64ArgRegs[] = {
+  AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
+  AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7
+};
+static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs);
+
+static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                 CCValAssign::LocInfo LocInfo,
+                                 ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  // Mark all remaining general purpose registers as allocated. We don't
+  // backtrack: if (for example) an i128 gets put on the stack, no subsequent
+  // i64 will go in registers (C.11).
+  for (unsigned i = 0; i < NumArgRegs; ++i)
+    State.AllocateReg(AArch64ArgRegs[i]);
+
+  return false;
+}
+
+#include "AArch64GenCallingConv.inc"
+
+CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
+
+  switch(CC) {
+  default: llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+  case CallingConv::C:
+    return CC_A64_APCS;
+  }
+}
+
+void
+AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                                           DebugLoc DL, SDValue &Chain) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  AArch64MachineFunctionInfo *FuncInfo
+    = MF.getInfo<AArch64MachineFunctionInfo>();
+
+  SmallVector<SDValue, 8> MemOps;
+
+  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs,
+                                                         NumArgRegs);
+  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs,
+                                                         NumFPRArgRegs);
+
+  unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR);
+  int GPRIdx = 0;
+  if (GPRSaveSize != 0) {
+    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
+
+    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
+
+    for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) {
+      unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
+                                   MachinePointerInfo::getStack(i * 8),
+                                   false, false, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+                        DAG.getConstant(8, getPointerTy()));
+    }
+  }
+
+  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
+  int FPRIdx = 0;
+  if (FPRSaveSize != 0) {
+    FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
+
+    SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+
+    for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+      unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
+                                   &AArch64::FPR128RegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
+      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
+                                   MachinePointerInfo::getStack(i * 16),
+                                   false, false, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+                        DAG.getConstant(16, getPointerTy()));
+    }
+  }
+
+  int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);
+
+  FuncInfo->setVariadicStackIdx(StackIdx);
+  FuncInfo->setVariadicGPRIdx(GPRIdx);
+  FuncInfo->setVariadicGPRSize(GPRSaveSize);
+  FuncInfo->setVariadicFPRIdx(FPRIdx);
+  FuncInfo->setVariadicFPRSize(FPRSaveSize);
+
+  if (!MemOps.empty()) {
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
+                        MemOps.size());
+  }
+}
+
+
+SDValue
+AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64MachineFunctionInfo *FuncInfo
+    = MF.getInfo<AArch64MachineFunctionInfo>();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
+
+  SmallVector<SDValue, 16> ArgValues;
+
+  SDValue ArgValue;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+    if (Flags.isByVal()) {
+      // Byval is used for small structs and HFAs in the PCS, but the system
+      // should work in a non-compliant manner for larger structs.
+      EVT PtrTy = getPointerTy();
+      int Size = Flags.getByValSize();
+      unsigned NumRegs = (Size + 7) / 8;
+
+      unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
+                                                 VA.getLocMemOffset(),
+                                                 false);
+      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
+      InVals.push_back(FrameIdxN);
+
+      continue;
+    } else if (VA.isRegLoc()) {
+      MVT RegVT = VA.getLocVT();
+      const TargetRegisterClass *RC = getRegClassFor(RegVT);
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+
+      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+    } else { // VA.isRegLoc()
+      assert(VA.isMemLoc());
+
+      int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
+                                      VA.getLocMemOffset(), true);
+
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                             MachinePointerInfo::getFixedStack(FI),
+                             false, false, false, 0);
+
+
+    }
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue);
+      break;
+    case CCValAssign::SExt:
+    case CCValAssign::ZExt:
+    case CCValAssign::AExt: {
+      unsigned DestSize = VA.getValVT().getSizeInBits();
+      unsigned DestSubReg;
+
+      switch (DestSize) {
+      case 8: DestSubReg = AArch64::sub_8; break;
+      case 16: DestSubReg = AArch64::sub_16; break;
+      case 32: DestSubReg = AArch64::sub_32; break;
+      case 64: DestSubReg = AArch64::sub_64; break;
+      default: llvm_unreachable("Unexpected argument promotion");
+      }
+
+      ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
+                                   VA.getValVT(), ArgValue,
+                                   DAG.getTargetConstant(DestSubReg, MVT::i32)),
+                         0);
+      break;
+    }
+    }
+
+    InVals.push_back(ArgValue);
+  }
+
+  if (isVarArg)
+    SaveVarArgRegisters(CCInfo, DAG, dl, Chain);
+
+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
+    // This is a non-standard ABI so by fiat I say we're allowed to make full
+    // use of the stack area to be popped, which must be aligned to 16 bytes in
+    // any case:
+    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+
+    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
+    // a multiple of 16.
+    FuncInfo->setArgumentStackToRestore(StackArgSize);
+
+    // This realignment carries over to the available bytes below. Our own
+    // callers will guarantee the space is free by giving an aligned value to
+    // CALLSEQ_START.
+  }
+  // Even if we're not expected to free up the space, it's useful to know how
+  // much is there while considering tail calls (because we can reuse it).
+  FuncInfo->setBytesInStackArgArea(StackArgSize);
+
+  return Chain;
+}
+
+SDValue
+AArch64TargetLowering::LowerReturn(SDValue Chain,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   DebugLoc dl, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of the return value to a location.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slots.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analyze outgoing return values.
+  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    // PCS: "If the type, T, of the result of a function is such that
+    // void func(T arg) would require that arg be passed as a value in a
+    // register (or set of registers) according to the rules in 5.4, then the
+    // result is returned in the same registers as would be used for such an
+    // argument.
+    //
+    // Otherwise, the caller shall reserve a block of memory of sufficient
+    // size and alignment to hold the result. The address of the memory block
+    // shall be passed as an additional argument to the function in x8."
+    //
+    // This is implemented in two places. The register-return values are dealt
+    // with here, more complex returns are passed as an sret parameter, which
+    // means we don't have to worry about it during actual return.
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Only register-returns should be created by PCS");
+
+
+    SDValue Arg = OutVals[i];
+
+    // There's no convenient note in the ABI about this as there is for normal
+    // arguments, but it says return values are passed in the same registers as
+    // an argument would be. I believe that includes the comments about
+    // unspecified higher bits, putting the burden of widening on the *caller*
+    // for return values.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+    case CCValAssign::ZExt:
+    case CCValAssign::AExt:
+      // Floating-point values should only be extended when they're going into
+      // memory, which can't happen here so an integer extend is acceptable.
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
+                     &RetOps[0], RetOps.size());
+}
+
+SDValue
+AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                 SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &IsTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool IsVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64MachineFunctionInfo *FuncInfo
+    = MF.getInfo<AArch64MachineFunctionInfo>();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
+  bool IsSibCall = false;
+
+  if (IsTailCall) {
+    IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                    IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
+                                                   Outs, OutVals, Ins, DAG);
+
+    // A sibling call is one where we're under the usual C ABI and not planning
+    // to change that but can still do a tail call:
+    if (!TailCallOpt && IsTailCall)
+      IsSibCall = true;
+  }
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
+
+  // On AArch64 (and all other architectures I'm aware of) the most this has to
+  // do is adjust the stack pointer.
+  unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16);
+  if (IsSibCall) {
+    // Since we're not changing the ABI to make this a tail call, the memory
+    // operands are already available in the caller's incoming argument space.
+    NumBytes = 0;
+  }
+
+  // FPDiff is the byte offset of the call's argument area from the callee's.
+  // Stores to callee stack arguments will be placed in FixedStackSlots offset
+  // by this amount for a tail call. In a sibling call it must be 0 because the
+  // caller will deallocate the entire stack and the callee still expects its
+  // arguments to begin at SP+0. Completely unused for non-tail calls.
+  int FPDiff = 0;
+
+  if (IsTailCall && !IsSibCall) {
+    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+
+    // FPDiff will be negative if this tail call requires more space than we
+    // would automatically have in our incoming argument space. Positive if we
+    // can actually shrink the stack.
+    FPDiff = NumReusableBytes - NumBytes;
+
+    // The stack pointer must be 16-byte aligned at all times it's used for a
+    // memory operation, which in practice means at *all* times and in
+    // particular across call boundaries. Therefore our own arguments started at
+    // a 16-byte aligned SP and the delta applied for the tail call should
+    // satisfy the same constraint.
+    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+  }
+
+  if (!IsSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
+                                        getPointerTy());
+
+  SmallVector<SDValue, 8> MemOpChains;
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    SDValue Arg = OutVals[i];
+
+    // Callee does the actual widening, so all extensions just use an implicit
+    // definition of the rest of the Loc. Aesthetically, this would be nicer as
+    // an ANY_EXTEND, but that isn't valid for floating-point types and this
+    // alternative works on integer types too.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+    case CCValAssign::ZExt:
+    case CCValAssign::AExt: {
+      unsigned SrcSize = VA.getValVT().getSizeInBits();
+      unsigned SrcSubReg;
+
+      switch (SrcSize) {
+      case 8: SrcSubReg = AArch64::sub_8; break;
+      case 16: SrcSubReg = AArch64::sub_16; break;
+      case 32: SrcSubReg = AArch64::sub_32; break;
+      case 64: SrcSubReg = AArch64::sub_64; break;
+      default: llvm_unreachable("Unexpected argument promotion");
+      }
+
+      Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                                    VA.getLocVT(),
+                                    DAG.getUNDEF(VA.getLocVT()),
+                                    Arg,
+                                    DAG.getTargetConstant(SrcSubReg, MVT::i32)),
+                    0);
+
+      break;
+    }
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc()) {
+      // A normal register (sub-) argument. For now we just note it down because
+      // we want to copy things into registers as late as possible to avoid
+      // register-pressure (and possibly worse).
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+
+    assert(VA.isMemLoc() && "unexpected argument location");
+
+    SDValue DstAddr;
+    MachinePointerInfo DstInfo;
+    if (IsTailCall) {
+      uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() :
+                                          VA.getLocVT().getSizeInBits();
+      OpSize = (OpSize + 7) / 8;
+      int32_t Offset = VA.getLocMemOffset() + FPDiff;
+      int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+
+      DstAddr = DAG.getFrameIndex(FI, getPointerTy());
+      DstInfo = MachinePointerInfo::getFixedStack(FI);
+
+      // Make sure any stack arguments overlapping with where we're storing are
+      // loaded before this eventual operation. Otherwise they'll be clobbered.
+      Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+    } else {
+      SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset());
+
+      DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+      DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
+    }
+
+    if (Flags.isByVal()) {
+      SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64);
+      SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode,
+                                  Flags.getByValAlign(),
+                                  /*isVolatile = */ false,
+                                  /*alwaysInline = */ false,
+                                  DstInfo, MachinePointerInfo(0));
+      MemOpChains.push_back(Cpy);
+    } else {
+      // Normal stack argument, put it where it's needed.
+      SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo,
+                                   false, false, 0);
+      MemOpChains.push_back(Store);
+    }
+  }
+
+  // The loads and stores generated above shouldn't clash with each
+  // other. Combining them with this TokenFactor notes that fact for the rest of
+  // the backend.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Most of the rest of the instructions need to be glued together; we don't
+  // want assignments to actual registers used by a call to be rearranged by a
+  // well-meaning scheduler.
+  SDValue InFlag;
+
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // The linker is responsible for inserting veneers when necessary to put a
+  // function call destination in range, so we don't need to bother with a
+  // wrapper here.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+  }
+
+  // We don't usually want to end the call-sequence here because we would tidy
+  // the frame up *after* the call, however in the ABI-changing tail-call case
+  // we've carefully laid out the parameters so that when sp is reset they'll be
+  // in the correct location.
+  if (IsTailCall && !IsSibCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(0, true), InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // We produce the following DAG scheme for the actual call instruction:
+  //     (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag?
+  //
+  // Most arguments aren't going to be used and just keep the values live as
+  // far as LLVM is concerned. It's expected to be selected as simply "bl
+  // callee" (for a direct, non-tail call).
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (IsTailCall) {
+    // Each tail call may have to adjust the stack by a different amount, so
+    // this information must travel along with the operation for eventual
+    // consumption by emitEpilogue.
+    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
+  }
+
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+
+  // Add a register mask operand representing the call-preserved registers. This
+  // is used later in codegen to constrain register-allocation.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  // If we needed glue, put it in as the last argument.
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  if (IsTailCall) {
+    return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+  }
+
+  Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Now we can reclaim the stack, just as well do it before working out where
+  // our return value is.
+  if (!IsSibCall) {
+    uint64_t CalleePopBytes
+      = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0;
+
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(CalleePopBytes, true),
+                               InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  return LowerCallResult(Chain, InFlag, CallConv,
+                         IsVarArg, Ins, dl, DAG, InVals);
+}
+
+SDValue
+AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                      CallingConv::ID CallConv, bool IsVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv));
+
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign VA = RVLocs[i];
+
+    // Return values that are too big to fit into registers should use an sret
+    // pointer, so this can be a lot simpler than the main argument code.
+    assert(VA.isRegLoc() && "Memory locations not expected for call return");
+
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
+                                     InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::ZExt:
+    case CCValAssign::SExt:
+    case CCValAssign::AExt:
+      // Floating-point arguments only get extended/truncated if they're going
+      // in memory, so using the integer operation is acceptable here.
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+bool
+AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                    CallingConv::ID CalleeCC,
+                                    bool IsVarArg,
+                                    bool IsCalleeStructRet,
+                                    bool IsCallerStructRet,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SelectionDAG& DAG) const {
+
+  // For CallingConv::C this function knows whether the ABI needs
+  // changing. That's not true for other conventions so they will have to opt in
+  // manually.
+  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
+    return false;
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call. Working around this *is* possible (see
+  // X86) but less efficient and uglier in LowerCall.
+  for (Function::const_arg_iterator i = CallerF->arg_begin(),
+         e = CallerF->arg_end(); i != e; ++i)
+    if (i->hasByValAttr())
+      return false;
+
+  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+    if (IsTailCallConvention(CalleeCC) && CCMatch)
+      return true;
+    return false;
+  }
+
+  // Now we search for cases where we can use a tail call without changing the
+  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
+  // concept.
+
+  // I want anyone implementing a new calling convention to think long and hard
+  // about this assert.
+  assert((!IsVarArg || CalleeCC == CallingConv::C)
+         && "Unexpected variadic calling convention");
+
+  if (IsVarArg && !Outs.empty()) {
+    // At least two cases here: if caller is fastcc then we can't have any
+    // memory arguments (we'd be expected to clean up the stack afterwards). If
+    // caller is C then we could potentially use its argument area.
+
+    // FIXME: for now we take the most conservative of these in both cases:
+    // disallow all variadic memory operands.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
+                   getTargetMachine(), ArgLocs, *DAG.getContext());
+
+    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+      if (!ArgLocs[i].isRegLoc())
+        return false;
+  }
+
+  // If the calling conventions do not match, then we'd better make sure the
+  // results are returned in the same way as what the caller expects.
+  if (!CCMatch) {
+    SmallVector<CCValAssign, 16> RVLocs1;
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs1, *DAG.getContext());
+    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC));
+
+    SmallVector<CCValAssign, 16> RVLocs2;
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs2, *DAG.getContext());
+    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC));
+
+    if (RVLocs1.size() != RVLocs2.size())
+      return false;
+    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+        return false;
+      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+        return false;
+      if (RVLocs1[i].isRegLoc()) {
+        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+          return false;
+      } else {
+        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+          return false;
+      }
+    }
+  }
+
+  // Nothing more to check if the callee is taking no arguments
+  if (Outs.empty())
+    return true;
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
+
+  const AArch64MachineFunctionInfo *FuncInfo
+    = MF.getInfo<AArch64MachineFunctionInfo>();
+
+  // If the stack arguments for this call would fit into our own save area then
+  // the call can be made tail.
+  return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
+}
+
+bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
+                                                   bool TailCallOpt) const {
+  return CallCC == CallingConv::Fast && TailCallOpt;
+}
+
+bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
+  return CallCC == CallingConv::Fast;
+}
+
+SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
+                                                   SelectionDAG &DAG,
+                                                   MachineFrameInfo *MFI,
+                                                   int ClobberedFI) const {
+  SmallVector<SDValue, 8> ArgChains;
+  int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
+  int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each stack argument corresponding
+  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+         UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U)
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+        if (FI->getIndex() < 0) {
+          int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
+          int64_t InLastByte = InFirstByte;
+          InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;
+
+          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+            ArgChains.push_back(SDValue(L, 1));
+        }
+
+   // Build a tokenfactor for all the chains.
+   return DAG.getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other,
+                      &ArgChains[0], ArgChains.size());
+}
+
+static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETEQ:  return A64CC::EQ;
+  case ISD::SETGT:  return A64CC::GT;
+  case ISD::SETGE:  return A64CC::GE;
+  case ISD::SETLT:  return A64CC::LT;
+  case ISD::SETLE:  return A64CC::LE;
+  case ISD::SETNE:  return A64CC::NE;
+  case ISD::SETUGT: return A64CC::HI;
+  case ISD::SETUGE: return A64CC::HS;
+  case ISD::SETULT: return A64CC::LO;
+  case ISD::SETULE: return A64CC::LS;
+  default: llvm_unreachable("Unexpected condition code");
+  }
+}
+
+bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const {
+  // icmp is implemented using adds/subs immediate, which take an unsigned
+  // 12-bit immediate, optionally shifted left by 12 bits.
+
+  // Symmetric by using adds/subs
+  if (Val < 0)
+    Val = -Val;
+
+  return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0;
+}
+
+SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS,
+                                        ISD::CondCode CC, SDValue &A64cc,
+                                        SelectionDAG &DAG, DebugLoc &dl) const {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    int64_t C = 0;
+    EVT VT = RHSC->getValueType(0);
+    bool knownInvalid = false;
+
+    // I'm not convinced the rest of LLVM handles these edge cases properly, but
+    // we can at least get it right.
+    if (isSignedIntSetCC(CC)) {
+      C = RHSC->getSExtValue();
+    } else if (RHSC->getZExtValue() > INT64_MAX) {
+      // A 64-bit constant not representable by a signed 64-bit integer is far
+      // too big to fit into a SUBS immediate anyway.
+      knownInvalid = true;
+    } else {
+      C = RHSC->getZExtValue();
+    }
+
+    if (!knownInvalid && !isLegalICmpImmediate(C)) {
+      // Constant does not fit, try adjusting it by one?
+      switch (CC) {
+      default: break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        if (isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          RHS = DAG.getConstant(C-1, VT);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if (isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          RHS = DAG.getConstant(C-1, VT);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        if (isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          RHS = DAG.getConstant(C+1, VT);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if (isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          RHS = DAG.getConstant(C+1, VT);
+        }
+        break;
+      }
+    }
+  }
+
+  A64CC::CondCodes CondCode = IntCCToA64CC(CC);
+  A64cc = DAG.getConstant(CondCode, MVT::i32);
+  return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+                     DAG.getCondCode(CC));
+}
+
+static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC,
+                                    A64CC::CondCodes &Alternative) {
+  A64CC::CondCodes CondCode = A64CC::Invalid;
+  Alternative = A64CC::Invalid;
+
+  switch (CC) {
+  default: llvm_unreachable("Unknown FP condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: CondCode = A64CC::EQ; break;
+  case ISD::SETGT:
+  case ISD::SETOGT: CondCode = A64CC::GT; break;
+  case ISD::SETGE:
+  case ISD::SETOGE: CondCode = A64CC::GE; break;
+  case ISD::SETOLT: CondCode = A64CC::MI; break;
+  case ISD::SETOLE: CondCode = A64CC::LS; break;
+  case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break;
+  case ISD::SETO:   CondCode = A64CC::VC; break;
+  case ISD::SETUO:  CondCode = A64CC::VS; break;
+  case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break;
+  case ISD::SETUGT: CondCode = A64CC::HI; break;
+  case ISD::SETUGE: CondCode = A64CC::PL; break;
+  case ISD::SETLT:
+  case ISD::SETULT: CondCode = A64CC::LT; break;
+  case ISD::SETLE:
+  case ISD::SETULE: CondCode = A64CC::LE; break;
+  case ISD::SETNE:
+  case ISD::SETUNE: CondCode = A64CC::NE; break;
+  }
+  return CondCode;
+}
+
+SDValue
+AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT PtrVT = getPointerTy();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+
+  assert(getTargetMachine().getCodeModel() == CodeModel::Small
+         && "Only small code model supported at the moment");
+
+  // The most efficient code is PC-relative anyway for the small memory model,
+  // so we don't need to worry about relocation model.
+  return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
+                     DAG.getTargetBlockAddress(BA, PtrVT, 0,
+                                               AArch64II::MO_NO_FLAG),
+                     DAG.getTargetBlockAddress(BA, PtrVT, 0,
+                                               AArch64II::MO_LO12),
+                     DAG.getConstant(/*Alignment=*/ 4, MVT::i32));
+}
+
+
+// (BRCOND chain, val, dest)
+SDValue
+AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Chain = Op.getOperand(0);
+  SDValue TheBit = Op.getOperand(1);
+  SDValue DestBB = Op.getOperand(2);
+
+  // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
+  // that as the consumer we are responsible for ignoring rubbish in higher
+  // bits.
+  TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
+                       DAG.getConstant(1, MVT::i32));
+
+  SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
+                               DAG.getConstant(0, TheBit.getValueType()),
+                               DAG.getCondCode(ISD::SETNE));
+
+  return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain,
+                     A64CMP, DAG.getConstant(A64CC::NE, MVT::i32),
+                     DestBB);
+}
+
+// (BR_CC chain, condcode, lhs, rhs, dest)
+SDValue
+AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue DestBB = Op.getOperand(4);
+
+  if (LHS.getValueType() == MVT::f128) {
+    // f128 comparisons are lowered to runtime calls by a routine which sets
+    // LHS, RHS and CC appropriately for the rest of this function to continue.
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, we need to compare the result
+    // against zero to select between true and false values.
+    if (RHS.getNode() == 0) {
+      RHS = DAG.getConstant(0, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
+  if (LHS.getValueType().isInteger()) {
+    SDValue A64cc;
+
+    // Integers are handled in a separate function because the combinations of
+    // immediates and tests can get hairy and we may want to fiddle things.
+    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
+
+    return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
+                       Chain, CmpOp, A64cc, DestBB);
+  }
+
+  // Note that some LLVM floating-point CondCodes can't be lowered to a single
+  // conditional branch, hence FPCCToA64CC can set a second test, where either
+  // passing is sufficient.
+  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
+  CondCode = FPCCToA64CC(CC, Alternative);
+  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+                              DAG.getCondCode(CC));
+  SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
+                                 Chain, SetCC, A64cc, DestBB);
+
+  if (Alternative != A64CC::Invalid) {
+    A64cc = DAG.getConstant(Alternative, MVT::i32);
+    A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
+                           A64BR_CC, SetCC, A64cc, DestBB);
+
+  }
+
+  return A64BR_CC;
+}
+
+SDValue
+AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
+                                       RTLIB::Libcall Call) const {
+  ArgListTy Args;
+  ArgListEntry Entry;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Op.getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy;
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    Args.push_back(Entry);
+  }
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy());
+
+  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
+
+  // By default, the input chain to this libcall is the entry node of the
+  // function. If the libcall is going to be emitted as a tail call then
+  // isUsedByReturnOnly will change it to the right chain if the return
+  // node which is being folded has a non-entry input chain.
+  SDValue InChain = DAG.getEntryNode();
+
+  // isTailCall may be true since the callee does not reference caller stack
+  // frame. Check if it's in the right position.
+  SDValue TCChain = InChain;
+  bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain);
+  if (isTailCall)
+    InChain = TCChain;
+
+  TargetLowering::
+  CallLoweringInfo CLI(InChain, RetTy, false, false, false, false,
+                    0, getLibcallCallingConv(Call), isTailCall,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, Op->getDebugLoc());
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  if (!CallInfo.second.getNode())
+    // It's a tailcall, return the chain (which is the DAG root).
+    return DAG.getRoot();
+
+  return CallInfo.first;
+}
+
+SDValue
+AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType() != MVT::f128) {
+    // It's legal except when f128 is involved
+    return Op;
+  }
+
+  RTLIB::Libcall LC;
+  LC  = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  SDValue SrcVal = Op.getOperand(0);
+  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
+                     /*isSigned*/ false, Op.getDebugLoc());
+}
+
+SDValue
+AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
+
+  RTLIB::Libcall LC;
+  LC  = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  return LowerF128ToCall(Op, DAG, LC);
+}
+
+SDValue
+AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                                      bool IsSigned) const {
+  if (Op.getOperand(0).getValueType() != MVT::f128) {
+    // It's legal except when f128 is involved
+    return Op;
+  }
+
+  RTLIB::Libcall LC;
+  if (IsSigned)
+    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+  else
+    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  return LowerF128ToCall(Op, DAG, LC);
+}
+
+SDValue
+AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  // TableGen doesn't have easy access to the CodeModel or RelocationModel, so
+  // we make that distinction here.
+
+  // We support the small memory model for now.
+  assert(getTargetMachine().getCodeModel() == CodeModel::Small);
+
+  EVT PtrVT = getPointerTy();
+  DebugLoc dl = Op.getDebugLoc();
+  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GN->getGlobal();
+  unsigned Alignment = GV->getAlignment();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+  if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) {
+    // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate
+    // to zero when they remain undefined. In PIC mode the GOT can take care of
+    // this, but in absolute mode we use a constant pool load.
+    SDValue PoolAddr;
+    PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
+                           DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
+                                                     AArch64II::MO_NO_FLAG),
+                           DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
+                                                     AArch64II::MO_LO12),
+                           DAG.getConstant(8, MVT::i32));
+    SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
+                                     MachinePointerInfo::getConstantPool(),
+                                     /*isVolatile=*/ false,
+                                     /*isNonTemporal=*/ true,
+                                     /*isInvariant=*/ true, 8);
+    if (GN->getOffset() != 0)
+      return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
+                         DAG.getConstant(GN->getOffset(), PtrVT));
+
+    return GlobalAddr;
+  }
+
+  if (Alignment == 0) {
+    const PointerType *GVPtrTy = cast<PointerType>(GV->getType());
+    if (GVPtrTy->getElementType()->isSized()) {
+      Alignment
+        = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType());
+    } else {
+      // Be conservative if we can't guess, not that it really matters:
+      // functions and labels aren't valid for loads, and the methods used to
+      // actually calculate an address work with any alignment.
+      Alignment = 1;
+    }
+  }
+
+  unsigned char HiFixup, LoFixup;
+  bool UseGOT = Subtarget->GVIsIndirectSymbol(GV, RelocM);
+
+  if (UseGOT) {
+    HiFixup = AArch64II::MO_GOT;
+    LoFixup = AArch64II::MO_GOT_LO12;
+    Alignment = 8;
+  } else {
+    HiFixup = AArch64II::MO_NO_FLAG;
+    LoFixup = AArch64II::MO_LO12;
+  }
+
+  // AArch64's small model demands the following sequence:
+  // ADRP x0, somewhere
+  // ADD x0, x0, #:lo12:somewhere ; (or LDR directly).
+  SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
+                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                                             HiFixup),
+                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                                             LoFixup),
+                                  DAG.getConstant(Alignment, MVT::i32));
+
+  if (UseGOT) {
+    GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(),
+                            GlobalRef);
+  }
+
+  if (GN->getOffset() != 0)
+    return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef,
+                       DAG.getConstant(GN->getOffset(), PtrVT));
+
+  return GlobalRef;
+}
+
+SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
+                                                SDValue DescAddr,
+                                                DebugLoc DL,
+                                                SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy();
+
+  // The function we need to call is simply the first entry in the GOT for this
+  // descriptor, load it in preparation.
+  SDValue Func, Chain;
+  Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
+                     DescAddr);
+
+  // The function takes only one argument: the address of the descriptor itself
+  // in X0.
+  SDValue Glue;
+  Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
+  Glue = Chain.getValue(1);
+
+  // Finally, there's a special calling-convention which means that the lookup
+  // must preserve all registers (except X0, obviously).
+  const TargetRegisterInfo *TRI  = getTargetMachine().getRegisterInfo();
+  const AArch64RegisterInfo *A64RI
+    = static_cast<const AArch64RegisterInfo *>(TRI);
+  const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask();
+
+  // We're now ready to populate the argument list, as with a normal call:
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Func);
+  Ops.push_back(SymAddr);
+  Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT));
+  Ops.push_back(DAG.getRegisterMask(Mask));
+  Ops.push_back(Glue);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0],
+                      Ops.size());
+  Glue = Chain.getValue(1);
+
+  // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it
+  // back to the generic handling code.
+  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
+}
+
+SDValue
+AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetELF() &&
+         "TLS not implemented for non-ELF targets");
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+  SDValue TPOff;
+  EVT PtrVT = getPointerTy();
+  DebugLoc DL = Op.getDebugLoc();
+  const GlobalValue *GV = GA->getGlobal();
+
+  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
+
+  if (Model == TLSModel::InitialExec) {
+    TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
+                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                                   AArch64II::MO_GOTTPREL),
+                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                                   AArch64II::MO_GOTTPREL_LO12),
+                        DAG.getConstant(8, MVT::i32));
+    TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
+                        TPOff);
+  } else if (Model == TLSModel::LocalExec) {
+    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
+                                               AArch64II::MO_TPREL_G1);
+    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
+                                               AArch64II::MO_TPREL_G0_NC);
+
+    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
+                                       DAG.getTargetConstant(0, MVT::i32)), 0);
+    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
+                                       TPOff, LoVar,
+                                       DAG.getTargetConstant(0, MVT::i32)), 0);
+  } else if (Model == TLSModel::GeneralDynamic) {
+    // Accesses used in this sequence go via the TLS descriptor which lives in
+    // the GOT. Prepare an address we can use to handle this.
+    SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                                AArch64II::MO_TLSDESC);
+    SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                                AArch64II::MO_TLSDESC_LO12);
+    SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
+                                   HiDesc, LoDesc,
+                                   DAG.getConstant(8, MVT::i32));
+    SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0);
+
+    TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
+  } else if (Model == TLSModel::LocalDynamic) {
+    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
+    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
+    // the beginning of the module's TLS region, followed by a DTPREL offset
+    // calculation.
+
+    // These accesses will need deduplicating if there's more than one.
+    AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction()
+      .getInfo<AArch64MachineFunctionInfo>();
+    MFI->incNumLocalDynamicTLSAccesses();
+
+
+    // Get the location of _TLS_MODULE_BASE_:
+    SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
+                                                AArch64II::MO_TLSDESC);
+    SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
+                                                AArch64II::MO_TLSDESC_LO12);
+    SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
+                                   HiDesc, LoDesc,
+                                   DAG.getConstant(8, MVT::i32));
+    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT);
+
+    ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
+
+    // Get the variable's offset from _TLS_MODULE_BASE_
+    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
+                                               AArch64II::MO_DTPREL_G1);
+    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
+                                               AArch64II::MO_DTPREL_G0_NC);
+
+    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
+                                       DAG.getTargetConstant(0, MVT::i32)), 0);
+    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
+                                       TPOff, LoVar,
+                                       DAG.getTargetConstant(0, MVT::i32)), 0);
+  } else
+      llvm_unreachable("Unsupported TLS access model");
+
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
+}
+
+SDValue
+AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                                      bool IsSigned) const {
+  if (Op.getValueType() != MVT::f128) {
+    // Legal for everything except f128.
+    return Op;
+  }
+
+  RTLIB::Libcall LC;
+  if (IsSigned)
+    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+  else
+    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  return LowerF128ToCall(Op, DAG, LC);
+}
+
+
+SDValue
+AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  DebugLoc dl = JT->getDebugLoc();
+
+  // When compiling PIC, jump tables get put in the code section so a static
+  // relocation-style is acceptable for both cases.
+  return DAG.getNode(AArch64ISD::WrapperSmall, dl, getPointerTy(),
+                     DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()),
+                     DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
+                                            AArch64II::MO_LO12),
+                     DAG.getConstant(1, MVT::i32));
+}
+
+// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
+SDValue
+AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue IfTrue = Op.getOperand(2);
+  SDValue IfFalse = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  if (LHS.getValueType() == MVT::f128) {
+    // f128 comparisons are lowered to libcalls, but slot in nicely here
+    // afterwards.
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, we need to compare the result
+    // against zero to select between true and false values.
+    if (RHS.getNode() == 0) {
+      RHS = DAG.getConstant(0, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
+  }
+
+  if (LHS.getValueType().isInteger()) {
+    SDValue A64cc;
+
+    // Integers are handled in a separate function because the combinations of
+    // immediates and tests can get hairy and we may want to fiddle things.
+    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
+
+    return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
+                       CmpOp, IfTrue, IfFalse, A64cc);
+  }
+
+  // Note that some LLVM floating-point CondCodes can't be lowered to a single
+  // conditional branch, hence FPCCToA64CC can set a second test, where either
+  // passing is sufficient.
+  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
+  CondCode = FPCCToA64CC(CC, Alternative);
+  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+                              DAG.getCondCode(CC));
+  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
+                                     Op.getValueType(),
+                                     SetCC, IfTrue, IfFalse, A64cc);
+
+  if (Alternative != A64CC::Invalid) {
+    A64cc = DAG.getConstant(Alternative, MVT::i32);
+    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
+                               SetCC, IfTrue, A64SELECT_CC, A64cc);
+
+  }
+
+  return A64SELECT_CC;
+}
+
+// (SELECT testbit, iftrue, iffalse)
+SDValue
+AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue TheBit = Op.getOperand(0);
+  SDValue IfTrue = Op.getOperand(1);
+  SDValue IfFalse = Op.getOperand(2);
+
+  // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
+  // that as the consumer we are responsible for ignoring rubbish in higher
+  // bits.
+  TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
+                       DAG.getConstant(1, MVT::i32));
+  SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
+                               DAG.getConstant(0, TheBit.getValueType()),
+                               DAG.getCondCode(ISD::SETNE));
+
+  return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
+                     A64CMP, IfTrue, IfFalse,
+                     DAG.getConstant(A64CC::NE, MVT::i32));
+}
+
+// (SETCC lhs, rhs, condcode)
+SDValue
+AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  EVT VT = Op.getValueType();
+
+  if (LHS.getValueType() == MVT::f128) {
+    // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
+    // for the rest of the function (some i32 or i64 values).
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+    // If softenSetCCOperands returned a scalar, use it.
+    if (RHS.getNode() == 0) {
+      assert(LHS.getValueType() == Op.getValueType() &&
+             "Unexpected setcc expansion!");
+      return LHS;
+    }
+  }
+
+  if (LHS.getValueType().isInteger()) {
+    SDValue A64cc;
+
+    // Integers are handled in a separate function because the combinations of
+    // immediates and tests can get hairy and we may want to fiddle things.
+    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
+
+    return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
+                       CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT),
+                       A64cc);
+  }
+
+  // Note that some LLVM floating-point CondCodes can't be lowered to a single
+  // conditional branch, hence FPCCToA64CC can set a second test, where either
+  // passing is sufficient.
+  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
+  CondCode = FPCCToA64CC(CC, Alternative);
+  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+                              DAG.getCondCode(CC));
+  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
+                                     CmpOp, DAG.getConstant(1, VT),
+                                     DAG.getConstant(0, VT), A64cc);
+
+  if (Alternative != A64CC::Invalid) {
+    A64cc = DAG.getConstant(Alternative, MVT::i32);
+    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
+                               DAG.getConstant(1, VT), A64SELECT_CC, A64cc);
+  }
+
+  return A64SELECT_CC;
+}
+
+SDValue
+AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+
+  // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
+  // rather than just 8.
+  return DAG.getMemcpy(Op.getOperand(0), Op.getDebugLoc(),
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(32, MVT::i32), 8, false, false,
+                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
+}
+
+SDValue
+AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  // The layout of the va_list struct is specified in the AArch64 Procedure Call
+  // Standard, section B.3.
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64MachineFunctionInfo *FuncInfo
+    = MF.getInfo<AArch64MachineFunctionInfo>();
+  DebugLoc DL = Op.getDebugLoc();
+
+  SDValue Chain = Op.getOperand(0);
+  SDValue VAList = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  SmallVector<SDValue, 4> MemOps;
+
+  // void *__stack at offset 0
+  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(),
+                                    getPointerTy());
+  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
+                                MachinePointerInfo(SV), false, false, 0));
+
+  // void *__gr_top at offset 8
+  int GPRSize = FuncInfo->getVariadicGPRSize();
+  if (GPRSize > 0) {
+    SDValue GRTop, GRTopAddr;
+
+    GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                            DAG.getConstant(8, getPointerTy()));
+
+    GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy());
+    GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
+                        DAG.getConstant(GPRSize, getPointerTy()));
+
+    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
+                                  MachinePointerInfo(SV, 8),
+                                  false, false, 0));
+  }
+
+  // void *__vr_top at offset 16
+  int FPRSize = FuncInfo->getVariadicFPRSize();
+  if (FPRSize > 0) {
+    SDValue VRTop, VRTopAddr;
+    VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                            DAG.getConstant(16, getPointerTy()));
+
+    VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy());
+    VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
+                        DAG.getConstant(FPRSize, getPointerTy()));
+
+    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
+                                  MachinePointerInfo(SV, 16),
+                                  false, false, 0));
+  }
+
+  // int __gr_offs at offset 24
+  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                                   DAG.getConstant(24, getPointerTy()));
+  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
+                                GROffsAddr, MachinePointerInfo(SV, 24),
+                                false, false, 0));
+
+  // int __vr_offs at offset 28
+  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                                   DAG.getConstant(28, getPointerTy()));
+  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
+                                VROffsAddr, MachinePointerInfo(SV, 28),
+                                false, false, 0));
+
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
+                     MemOps.size());
+}
+
+SDValue
+AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128);
+  case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128);
+  case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128);
+  case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128);
+  case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true);
+  case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false);
+  case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true);
+  case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
+  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
+  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+
+  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
+  case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG);
+  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+  case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::SETCC: return LowerSETCC(Op, DAG);
+  case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+  case ISD::VASTART: return LowerVASTART(Op, DAG);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc DL = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  // We're looking for an SRA/SHL pair which form an SBFX.
+
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  if (!isa<ConstantSDNode>(N->getOperand(1)))
+    return SDValue();
+
+  uint64_t TruncMask = N->getConstantOperandVal(1);
+  if (!isMask_64(TruncMask))
+    return SDValue();
+
+  uint64_t Width = CountPopulation_64(TruncMask);
+  SDValue Shift = N->getOperand(0);
+
+  if (Shift.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
+    return SDValue();
+  uint64_t LSB = Shift->getConstantOperandVal(1);
+
+  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
+    return SDValue();
+
+  return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0),
+                     DAG.getConstant(LSB, MVT::i64),
+                     DAG.getConstant(LSB + Width - 1, MVT::i64));
+}
+
+static SDValue PerformATOMIC_FENCECombine(SDNode *FenceNode,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  // An atomic operation followed by an acquiring atomic fence can be reduced to
+  // an acquiring load. The atomic operation provides a convenient pointer to
+  // load from. If the original operation was a load anyway we can actually
+  // combine the two operations into an acquiring load.
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue AtomicOp = FenceNode->getOperand(0);
+  AtomicSDNode *AtomicNode = dyn_cast<AtomicSDNode>(AtomicOp);
+
+  // A fence on its own can't be optimised
+  if (!AtomicNode)
+    return SDValue();
+
+  AtomicOrdering FenceOrder
+    = static_cast<AtomicOrdering>(FenceNode->getConstantOperandVal(1));
+  SynchronizationScope FenceScope
+    = static_cast<SynchronizationScope>(FenceNode->getConstantOperandVal(2));
+
+  if (FenceOrder != Acquire || FenceScope != AtomicNode->getSynchScope())
+    return SDValue();
+
+  // If the original operation was an ATOMIC_LOAD then we'll be replacing it, so
+  // the chain we use should be its input, otherwise we'll put our store after
+  // it so we use its output chain.
+  SDValue Chain = AtomicNode->getOpcode() == ISD::ATOMIC_LOAD ?
+    AtomicNode->getChain() : AtomicOp;
+
+  // We have an acquire fence with a handy atomic operation nearby, we can
+  // convert the fence into a load-acquire, discarding the result.
+  DebugLoc DL = FenceNode->getDebugLoc();
+  SDValue Op = DAG.getAtomic(ISD::ATOMIC_LOAD, DL, AtomicNode->getMemoryVT(),
+                             AtomicNode->getValueType(0),
+                             Chain,                  // Chain
+                             AtomicOp.getOperand(1), // Pointer
+                             AtomicNode->getMemOperand(), Acquire,
+                             FenceScope);
+
+  if (AtomicNode->getOpcode() == ISD::ATOMIC_LOAD)
+    DAG.ReplaceAllUsesWith(AtomicNode, Op.getNode());
+
+  return Op.getValue(1);
+}
+
+static SDValue PerformATOMIC_STORECombine(SDNode *N,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  // A releasing atomic fence followed by an atomic store can be combined into a
+  // single store operation.
+  SelectionDAG &DAG = DCI.DAG;
+  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(N);
+  SDValue FenceOp = AtomicNode->getOperand(0);
+
+  if (FenceOp.getOpcode() != ISD::ATOMIC_FENCE)
+    return SDValue();
+
+  AtomicOrdering FenceOrder
+    = static_cast<AtomicOrdering>(FenceOp->getConstantOperandVal(1));
+  SynchronizationScope FenceScope
+    = static_cast<SynchronizationScope>(FenceOp->getConstantOperandVal(2));
+
+  if (FenceOrder != Release || FenceScope != AtomicNode->getSynchScope())
+    return SDValue();
+
+  DebugLoc DL = AtomicNode->getDebugLoc();
+  return DAG.getAtomic(ISD::ATOMIC_STORE, DL, AtomicNode->getMemoryVT(),
+                       FenceOp.getOperand(0),  // Chain
+                       AtomicNode->getOperand(1),       // Pointer
+                       AtomicNode->getOperand(2),       // Value
+                       AtomicNode->getMemOperand(), Release,
+                       FenceScope);
+}
+
+/// For a true bitfield insert, the bits getting into that contiguous mask
+/// should come from the low part of an existing value: they must be formed from
+/// a compatible SHL operation (unless they're already low). This function
+/// checks that condition and returns the least-significant bit that's
+/// intended. If the operation not a field preparation, -1 is returned.
+static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT,
+                            SDValue &MaskedVal, uint64_t Mask) {
+  if (!isShiftedMask_64(Mask))
+    return -1;
+
+  // Now we need to alter MaskedVal so that it is an appropriate input for a BFI
+  // instruction. BFI will do a left-shift by LSB before applying the mask we've
+  // spotted, so in general we should pre-emptively "undo" that by making sure
+  // the incoming bits have had a right-shift applied to them.
+  //
+  // This right shift, however, will combine with existing left/right shifts. In
+  // the simplest case of a completely straight bitfield operation, it will be
+  // expected to completely cancel out with an existing SHL. More complicated
+  // cases (e.g. bitfield to bitfield copy) may still need a real shift before
+  // the BFI.
+
+  uint64_t LSB = CountTrailingZeros_64(Mask);
+  int64_t ShiftRightRequired = LSB;
+  if (MaskedVal.getOpcode() == ISD::SHL &&
+      isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
+    ShiftRightRequired -= MaskedVal.getConstantOperandVal(1);
+    MaskedVal = MaskedVal.getOperand(0);
+  } else if (MaskedVal.getOpcode() == ISD::SRL &&
+             isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
+    ShiftRightRequired += MaskedVal.getConstantOperandVal(1);
+    MaskedVal = MaskedVal.getOperand(0);
+  }
+
+  if (ShiftRightRequired > 0)
+    MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal,
+                            DAG.getConstant(ShiftRightRequired, MVT::i64));
+  else if (ShiftRightRequired < 0) {
+    // We could actually end up with a residual left shift, for example with
+    // "struc.bitfield = val << 1".
+    MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal,
+                            DAG.getConstant(-ShiftRightRequired, MVT::i64));
+  }
+
+  return LSB;
+}
+
+/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by
+/// a mask and an extension. Returns true if a BFI was found and provides
+/// information on its surroundings.
+static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask,
+                          bool &Extended) {
+  Extended = false;
+  if (N.getOpcode() == ISD::ZERO_EXTEND) {
+    Extended = true;
+    N = N.getOperand(0);
+  }
+
+  if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
+    Mask = N->getConstantOperandVal(1);
+    N = N.getOperand(0);
+  } else {
+    // Mask is the whole width.
+    Mask = -1ULL >> (64 - N.getValueType().getSizeInBits());
+  }
+
+  if (N.getOpcode() == AArch64ISD::BFI) {
+    BFI = N;
+    return true;
+  }
+
+  return false;
+}
+
+/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which
+/// is roughly equivalent to (and (BFI ...), mask). This form is used because it
+/// can often be further combined with a larger mask. Ultimately, we want mask
+/// to be 2^32-1 or 2^64-1 so the AND can be skipped.
+static SDValue tryCombineToBFI(SDNode *N,
+                               TargetLowering::DAGCombinerInfo &DCI,
+                               const AArch64Subtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc DL = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  assert(N->getOpcode() == ISD::OR && "Unexpected root");
+
+  // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or
+  // abandon the effort.
+  SDValue LHS = N->getOperand(0);
+  if (LHS.getOpcode() != ISD::AND)
+    return SDValue();
+
+  uint64_t LHSMask;
+  if (isa<ConstantSDNode>(LHS.getOperand(1)))
+    LHSMask = LHS->getConstantOperandVal(1);
+  else
+    return SDValue();
+
+  // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask
+  // is or abandon the effort.
+  SDValue RHS = N->getOperand(1);
+  if (RHS.getOpcode() != ISD::AND)
+    return SDValue();
+
+  uint64_t RHSMask;
+  if (isa<ConstantSDNode>(RHS.getOperand(1)))
+    RHSMask = RHS->getConstantOperandVal(1);
+  else
+    return SDValue();
+
+  // Can't do anything if the masks are incompatible.
+  if (LHSMask & RHSMask)
+    return SDValue();
+
+  // Now we need one of the masks to be a contiguous field. Without loss of
+  // generality that should be the RHS one.
+  SDValue Bitfield = LHS.getOperand(0);
+  if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) {
+    // We know that LHS is a candidate new value, and RHS isn't already a better
+    // one.
+    std::swap(LHS, RHS);
+    std::swap(LHSMask, RHSMask);
+  }
+
+  // We've done our best to put the right operands in the right places, all we
+  // can do now is check whether a BFI exists.
+  Bitfield = RHS.getOperand(0);
+  int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask);
+  if (LSB == -1)
+    return SDValue();
+
+  uint32_t Width = CountPopulation_64(RHSMask);
+  assert(Width && "Expected non-zero bitfield width");
+
+  SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
+                            LHS.getOperand(0), Bitfield,
+                            DAG.getConstant(LSB, MVT::i64),
+                            DAG.getConstant(Width, MVT::i64));
+
+  // Mask is trivial
+  if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits())))
+    return BFI;
+
+  return DAG.getNode(ISD::AND, DL, VT, BFI,
+                     DAG.getConstant(LHSMask | RHSMask, VT));
+}
+
+/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its
+/// original input. This is surprisingly common because SROA splits things up
+/// into i8 chunks, so the originally detected MaskedBFI may actually only act
+/// on the low (say) byte of a word. This is then orred into the rest of the
+/// word afterwards.
+///
+/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)).
+///
+/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the
+/// MaskedBFI. We can also deal with a certain amount of extend/truncate being
+/// involved.
+static SDValue tryCombineToLargerBFI(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const AArch64Subtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc DL = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  // First job is to hunt for a MaskedBFI on either the left or right. Swap
+  // operands if it's actually on the right.
+  SDValue BFI;
+  SDValue PossExtraMask;
+  uint64_t ExistingMask = 0;
+  bool Extended = false;
+  if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended))
+    PossExtraMask = N->getOperand(1);
+  else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended))
+    PossExtraMask = N->getOperand(0);
+  else
+    return SDValue();
+
+  // We can only combine a BFI with another compatible mask.
+  if (PossExtraMask.getOpcode() != ISD::AND ||
+      !isa<ConstantSDNode>(PossExtraMask.getOperand(1)))
+    return SDValue();
+
+  uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1);
+
+  // Masks must be compatible.
+  if (ExtraMask & ExistingMask)
+    return SDValue();
+
+  SDValue OldBFIVal = BFI.getOperand(0);
+  SDValue NewBFIVal = BFI.getOperand(1);
+  if (Extended) {
+    // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be
+    // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments
+    // need to be made compatible.
+    assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32
+           && "Invalid types for BFI");
+    OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal);
+    NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal);
+  }
+
+  // We need the MaskedBFI to be combined with a mask of the *same* value.
+  if (PossExtraMask.getOperand(0) != OldBFIVal)
+    return SDValue();
+
+  BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
+                    OldBFIVal, NewBFIVal,
+                    BFI.getOperand(2), BFI.getOperand(3));
+
+  // If the masking is trivial, we don't need to create it.
+  if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits())))
+    return BFI;
+
+  return DAG.getNode(ISD::AND, DL, VT, BFI,
+                     DAG.getConstant(ExtraMask | ExistingMask, VT));
+}
+
+/// An EXTR instruction is made up of two shifts, ORed together. This helper
+/// searches for and classifies those shifts.
+static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
+                         bool &FromHi) {
+  if (N.getOpcode() == ISD::SHL)
+    FromHi = false;
+  else if (N.getOpcode() == ISD::SRL)
+    FromHi = true;
+  else
+    return false;
+
+  if (!isa<ConstantSDNode>(N.getOperand(1)))
+    return false;
+
+  ShiftAmount = N->getConstantOperandVal(1);
+  Src = N->getOperand(0);
+  return true;
+}
+
+/// EXTR instruction extracts a contiguous chunk of bits from two existing
+/// registers viewed as a high/low pair. This function looks for the pattern:
+/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
+/// EXTR. Can't quite be done in TableGen because the two immediates aren't
+/// independent.
+static SDValue tryCombineToEXTR(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc DL = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  assert(N->getOpcode() == ISD::OR && "Unexpected root");
+
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  SDValue LHS;
+  uint32_t ShiftLHS = 0;
+  bool LHSFromHi = 0;
+  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
+    return SDValue();
+
+  SDValue RHS;
+  uint32_t ShiftRHS = 0;
+  bool RHSFromHi = 0;
+  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
+    return SDValue();
+
+  // If they're both trying to come from the high part of the register, they're
+  // not really an EXTR.
+  if (LHSFromHi == RHSFromHi)
+    return SDValue();
+
+  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
+    return SDValue();
+
+  if (LHSFromHi) {
+    std::swap(LHS, RHS);
+    std::swap(ShiftLHS, ShiftRHS);
+  }
+
+  return DAG.getNode(AArch64ISD::EXTR, DL, VT,
+                     LHS, RHS,
+                     DAG.getConstant(ShiftRHS, MVT::i64));
+}
+
+/// Target-specific dag combine xforms for ISD::OR
+static SDValue PerformORCombine(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const AArch64Subtarget *Subtarget) {
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  // Attempt to recognise bitfield-insert operations.
+  SDValue Res = tryCombineToBFI(N, DCI, Subtarget);
+  if (Res.getNode())
+    return Res;
+
+  // Attempt to combine an existing MaskedBFI operation into one with a larger
+  // mask.
+  Res = tryCombineToLargerBFI(N, DCI, Subtarget);
+  if (Res.getNode())
+    return Res;
+
+  Res = tryCombineToEXTR(N, DCI);
+  if (Res.getNode())
+    return Res;
+
+  return SDValue();
+}
+
+/// Target-specific dag combine xforms for ISD::SRA
+static SDValue PerformSRACombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc DL = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  // We're looking for an SRA/SHL pair which form an SBFX.
+
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  if (!isa<ConstantSDNode>(N->getOperand(1)))
+    return SDValue();
+
+  uint64_t ExtraSignBits = N->getConstantOperandVal(1);
+  SDValue Shift = N->getOperand(0);
+
+  if (Shift.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
+    return SDValue();
+
+  uint64_t BitsOnLeft = Shift->getConstantOperandVal(1);
+  uint64_t Width = VT.getSizeInBits() - ExtraSignBits;
+  uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft;
+
+  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
+    return SDValue();
+
+  return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0),
+                     DAG.getConstant(LSB, MVT::i64),
+                     DAG.getConstant(LSB + Width - 1, MVT::i64));
+}
+
+
+SDValue
+AArch64TargetLowering::PerformDAGCombine(SDNode *N,
+                                         DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::AND: return PerformANDCombine(N, DCI);
+  case ISD::ATOMIC_FENCE: return PerformATOMIC_FENCECombine(N, DCI);
+  case ISD::ATOMIC_STORE: return PerformATOMIC_STORECombine(N, DCI);
+  case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
+  case ISD::SRA: return PerformSRACombine(N, DCI);
+  }
+  return SDValue();
+}
+
+AArch64TargetLowering::ConstraintType
+AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'w': // An FP/SIMD vector register
+      return C_RegisterClass;
+    case 'I': // Constant that can be used with an ADD instruction
+    case 'J': // Constant that can be used with a SUB instruction
+    case 'K': // Constant that can be used with a 32-bit logical instruction
+    case 'L': // Constant that can be used with a 64-bit logical instruction
+    case 'M': // Constant that can be used as a 32-bit MOV immediate
+    case 'N': // Constant that can be used as a 64-bit MOV immediate
+    case 'Y': // Floating point constant zero
+    case 'Z': // Integer constant zero
+      return C_Other;
+    case 'Q': // A memory reference with base register and no offset
+      return C_Memory;
+    case 'S': // A symbolic address
+      return C_Other;
+    }
+  }
+
+  // FIXME: Ump, Utf, Usa, Ush
+  // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes,
+  //      whatever they may be
+  // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be
+  // Usa: An absolute symbolic address
+  // Ush: The high part (bits 32:12) of a pc-relative symbolic address
+  assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa"
+         && Constraint != "Ush" && "Unimplemented constraints");
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+TargetLowering::ConstraintWeight
+AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info,
+                                                const char *Constraint) const {
+
+  llvm_unreachable("Constraint weight unimplemented");
+}
+
+void
+AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                    std::string &Constraint,
+                                                    std::vector<SDValue> &Ops,
+                                                    SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+
+  // Only length 1 constraints are C_Other.
+  if (Constraint.size() != 1) return;
+
+  // Only C_Other constraints get lowered like this. That means constants for us
+  // so return early if there's no hope the constraint can be lowered.
+
+  switch(Constraint[0]) {
+  default: break;
+  case 'I': case 'J': case 'K': case 'L':
+  case 'M': case 'N': case 'Z': {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C)
+      return;
+
+    uint64_t CVal = C->getZExtValue();
+    uint32_t Bits;
+
+    switch (Constraint[0]) {
+    default:
+      // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J'
+      // is a peculiarly useless SUB constraint.
+      llvm_unreachable("Unimplemented C_Other constraint");
+    case 'I':
+      if (CVal <= 0xfff)
+        break;
+      return;
+    case 'K':
+      if (A64Imms::isLogicalImm(32, CVal, Bits))
+        break;
+      return;
+    case 'L':
+      if (A64Imms::isLogicalImm(64, CVal, Bits))
+        break;
+      return;
+    case 'Z':
+      if (CVal == 0)
+        break;
+      return;
+    }
+
+    Result = DAG.getTargetConstant(CVal, Op.getValueType());
+    break;
+  }
+  case 'S': {
+    // An absolute symbolic address or label reference.
+    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
+      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), Op.getDebugLoc(),
+                                          GA->getValueType(0));
+    } else if (const BlockAddressSDNode *BA
+                 = dyn_cast<BlockAddressSDNode>(Op)) {
+      Result = DAG.getTargetBlockAddress(BA->getBlockAddress(),
+                                         BA->getValueType(0));
+    } else if (const ExternalSymbolSDNode *ES
+                 = dyn_cast<ExternalSymbolSDNode>(Op)) {
+      Result = DAG.getTargetExternalSymbol(ES->getSymbol(),
+                                           ES->getValueType(0));
+    } else
+      return;
+    break;
+  }
+  case 'Y':
+    if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+      if (CFP->isExactlyValue(0.0)) {
+        Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0));
+        break;
+      }
+    }
+    return;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  // It's an unknown constraint for us. Let generic code have a go.
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+AArch64TargetLowering::getRegForInlineAsmConstraint(
+                                                  const std::string &Constraint,
+                                                  EVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      if (VT.getSizeInBits() <= 32)
+        return std::make_pair(0U, &AArch64::GPR32RegClass);
+      else if (VT == MVT::i64)
+        return std::make_pair(0U, &AArch64::GPR64RegClass);
+      break;
+    case 'w':
+      if (VT == MVT::f16)
+        return std::make_pair(0U, &AArch64::FPR16RegClass);
+      else if (VT == MVT::f32)
+        return std::make_pair(0U, &AArch64::FPR32RegClass);
+      else if (VT == MVT::f64)
+        return std::make_pair(0U, &AArch64::FPR64RegClass);
+      else if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, &AArch64::VPR64RegClass);
+      else if (VT == MVT::f128)
+        return std::make_pair(0U, &AArch64::FPR128RegClass);
+      else if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, &AArch64::VPR128RegClass);
+      break;
+    }
+  }
+
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
new file mode 100644
index 000000000000..4960d286e9de
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -0,0 +1,247 @@
+//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AArch64 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AARCH64_ISELLOWERING_H
+#define LLVM_TARGET_AARCH64_ISELLOWERING_H
+
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+
+namespace llvm {
+namespace AArch64ISD {
+  enum NodeType {
+    // Start the numbering from where ISD NodeType finishes.
+    FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+    // This is a conditional branch which also notes the flag needed
+    // (eq/sgt/...). A64 puts this information on the branches rather than
+    // compares as LLVM does.
+    BR_CC,
+
+    // A node to be selected to an actual call operation: either BL or BLR in
+    // the absence of tail calls.
+    Call,
+
+    // Indicates a floating-point immediate which fits into the format required
+    // by the FMOV instructions. First (and only) operand is the 8-bit encoded
+    // value of that immediate.
+    FPMOV,
+
+    // Corresponds directly to an EXTR instruction. Operands are an LHS an RHS
+    // and an LSB.
+    EXTR,
+
+    // Wraps a load from the GOT, which should always be performed with a 64-bit
+    // load instruction. This prevents the DAG combiner folding a truncate to
+    // form a smaller memory access.
+    GOTLoad,
+
+    // Performs a bitfield insert. Arguments are: the value being inserted into;
+    // the value being inserted; least significant bit changed; width of the
+    // field.
+    BFI,
+
+    // Simply a convenient node inserted during ISelLowering to represent
+    // procedure return. Will almost certainly be selected to "RET".
+    Ret,
+
+    /// Extracts a field of contiguous bits from the source and sign extends
+    /// them into a single register. Arguments are: source; immr; imms. Note
+    /// these are pre-encoded since DAG matching can't cope with combining LSB
+    /// and Width into these values itself.
+    SBFX,
+
+    /// This is an A64-ification of the standard LLVM SELECT_CC operation. The
+    /// main difference is that it only has the values and an A64 condition,
+    /// which will be produced by a setcc instruction.
+    SELECT_CC,
+
+    /// This serves most of the functions of the LLVM SETCC instruction, for two
+    /// purposes. First, it prevents optimisations from fiddling with the
+    /// compare after we've moved the CondCode information onto the SELECT_CC or
+    /// BR_CC instructions. Second, it gives a legal instruction for the actual
+    /// comparison.
+    ///
+    /// It keeps a record of the condition flags asked for because certain
+    /// instructions are only valid for a subset of condition codes.
+    SETCC,
+
+    // Designates a node which is a tail call: both a call and a return
+    // instruction as far as selction is concerned. It should be selected to an
+    // unconditional branch. Has the usual plethora of call operands, but: 1st
+    // is callee, 2nd is stack adjustment required immediately before branch.
+    TC_RETURN,
+
+    // Designates a call used to support the TLS descriptor ABI. The call itself
+    // will be indirect ("BLR xN") but a relocation-specifier (".tlsdesccall
+    // var") must be attached somehow during code generation. It takes two
+    // operands: the callee and the symbol to be relocated against.
+    TLSDESCCALL,
+
+    // Leaf node which will be lowered to an appropriate MRS to obtain the
+    // thread pointer: TPIDR_EL0.
+    THREAD_POINTER,
+
+    /// Extracts a field of contiguous bits from the source and zero extends
+    /// them into a single register. Arguments are: source; immr; imms. Note
+    /// these are pre-encoded since DAG matching can't cope with combining LSB
+    /// and Width into these values itself.
+    UBFX,
+
+    // Wraps an address which the ISelLowering phase has decided should be
+    // created using the small absolute memory model: i.e. adrp/add or
+    // adrp/mem-op. This exists to prevent bare TargetAddresses which may never
+    // get selected.
+    WrapperSmall
+  };
+}
+
+
+class AArch64Subtarget;
+class AArch64TargetMachine;
+
+class AArch64TargetLowering : public TargetLowering {
+public:
+  explicit AArch64TargetLowering(AArch64TargetMachine &TM);
+
+  const char *getTargetNodeName(unsigned Opcode) const;
+
+  CCAssignFn *CCAssignFnForNode(CallingConv::ID CC) const;
+
+  SDValue LowerFormalArguments(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               DebugLoc dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const;
+
+  SDValue LowerReturn(SDValue Chain,
+                      CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals,
+                      DebugLoc dl, SelectionDAG &DAG) const;
+
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const;
+
+  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                          CallingConv::ID CallConv, bool IsVarArg,
+                          const SmallVectorImpl<ISD::InputArg> &Ins,
+                          DebugLoc dl, SelectionDAG &DAG,
+                          SmallVectorImpl<SDValue> &InVals) const;
+
+  void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                           DebugLoc DL, SDValue &Chain) const;
+
+
+  /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+  /// for tail call optimization. Targets which want to do tail call
+  /// optimization should implement this function.
+  bool IsEligibleForTailCallOptimization(SDValue Callee,
+                                    CallingConv::ID CalleeCC,
+                                    bool IsVarArg,
+                                    bool IsCalleeStructRet,
+                                    bool IsCallerStructRet,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SelectionDAG& DAG) const;
+
+  /// Finds the incoming stack arguments which overlap the given fixed stack
+  /// object and incorporates their load into the current chain. This prevents
+  /// an upcoming store from clobbering the stack argument before it's used.
+  SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
+                              MachineFrameInfo *MFI, int ClobberedFI) const;
+
+  EVT getSetCCResultType(EVT VT) const;
+
+  bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
+
+  bool IsTailCallConvention(CallingConv::ID CallCC) const;
+
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+  bool isLegalICmpImmediate(int64_t Val) const;
+  SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                         SDValue &A64cc, SelectionDAG &DAG, DebugLoc &dl) const;
+
+  virtual MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+  MachineBasicBlock *
+  emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *MBB,
+                   unsigned Size, unsigned Opcode) const;
+
+  MachineBasicBlock *
+  emitAtomicBinaryMinMax(MachineInstr *MI, MachineBasicBlock *BB,
+                         unsigned Size, unsigned CmpOp,
+                         A64CC::CondCodes Cond) const;
+  MachineBasicBlock *
+  emitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB,
+                    unsigned Size) const;
+
+  MachineBasicBlock *
+  EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+  SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
+                          RTLIB::Libcall Call) const;
+  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
+  SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, DebugLoc DL,
+                           SelectionDAG &DAG) const;
+  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
+  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+
+  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+  /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+  /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+  /// is expanded to mul + add.
+  virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+
+  ConstraintType getConstraintType(const std::string &Constraint) const;
+
+  ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info,
+                                                  const char *Constraint) const;
+  void LowerAsmOperandForConstraint(SDValue Op,
+                                    std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const;
+
+  std::pair<unsigned, const TargetRegisterClass*>
+  getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+private:
+  const AArch64Subtarget *Subtarget;
+  const TargetRegisterInfo *RegInfo;
+  const InstrItineraryData *Itins;
+};
+} // namespace llvm
+
+#endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
new file mode 100644
index 000000000000..cb93471058df
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -0,0 +1,961 @@
+//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file describes AArch64 instruction formats, down to the level of the
+// instruction's overall class.
+// ===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// A64 Instruction Format Definitions.
+//===----------------------------------------------------------------------===//
+
+// A64 is currently the only instruction set supported by the AArch64
+// architecture.
+class A64Inst<dag outs, dag ins, string asmstr, list<dag> patterns,
+              InstrItinClass itin>
+    : Instruction {
+  // All A64 instructions are 32-bit. This field will be filled in
+  // gradually going down the hierarchy.
+  field bits<32> Inst;
+
+  field bits<32> Unpredictable = 0;
+  // SoftFail is the generic name for this field, but we alias it so
+  // as to make it more obvious what it means in ARM-land.
+  field bits<32> SoftFail = Unpredictable;
+
+  // LLVM-level model of the AArch64/A64 distinction.
+  let Namespace = "AArch64";
+  let DecoderNamespace = "A64";
+  let Size = 4;
+
+  // Set the templated fields
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = patterns;
+  let Itinerary = itin;
+}
+
+class PseudoInst<dag outs, dag ins, list<dag> patterns> : Instruction {
+  let Namespace = "AArch64";
+
+  let OutOperandList = outs;
+  let InOperandList= ins;
+  let Pattern = patterns;
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+// Represents a pseudo-instruction that represents a single A64 instruction for
+// whatever reason, the eventual result will be a 32-bit real instruction.
+class A64PseudoInst<dag outs, dag ins, list<dag> patterns>
+  : PseudoInst<outs, ins, patterns> {
+  let Size = 4;
+}
+
+// As above, this will be a single A64 instruction, but we can actually give the
+// expansion in TableGen.
+class A64PseudoExpand<dag outs, dag ins, list<dag> patterns, dag Result>
+  : A64PseudoInst<outs, ins, patterns>,
+    PseudoInstExpansion<Result>;
+
+
+// First, some common cross-hierarchy register formats.
+
+class A64InstRd<dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rd;
+
+  let Inst{4-0} = Rd;
+}
+
+class A64InstRt<dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rt;
+
+  let Inst{4-0} = Rt;
+}
+
+
+class A64InstRdn<dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+    : A64InstRd<outs, ins, asmstr, patterns, itin> {
+  // Inherit rdt
+  bits<5> Rn;
+
+  let Inst{9-5} = Rn;
+}
+
+class A64InstRtn<dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+    : A64InstRt<outs, ins, asmstr, patterns, itin> {
+  // Inherit rdt
+  bits<5> Rn;
+
+  let Inst{9-5} = Rn;
+}
+
+// Instructions taking Rt,Rt2,Rn
+class A64InstRtt2n<dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rt2;
+
+  let Inst{14-10} = Rt2;
+}
+
+class A64InstRdnm<dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rm;
+
+  let Inst{20-16} = Rm;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Actual A64 Instruction Formats
+//
+
+// Format for Add-subtract (extended register) instructions.
+class A64I_addsubext<bit sf, bit op, bit S, bits<2> opt, bits<3> option,
+                     dag outs, dag ins, string asmstr, list<dag> patterns,
+                     InstrItinClass itin>
+    : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+    bits<3> Imm3;
+
+    let Inst{31} = sf;
+    let Inst{30} = op;
+    let Inst{29} = S;
+    let Inst{28-24} = 0b01011;
+    let Inst{23-22} = opt;
+    let Inst{21} = 0b1;
+    // Rm inherited in 20-16
+    let Inst{15-13} = option;
+    let Inst{12-10} = Imm3;
+    // Rn inherited in 9-5
+    // Rd inherited in 4-0
+}
+
+// Format for Add-subtract (immediate) instructions.
+class A64I_addsubimm<bit sf, bit op, bit S, bits<2> shift,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<12> Imm12;
+
+  let Inst{31} = sf;
+  let Inst{30} = op;
+  let Inst{29} = S;
+  let Inst{28-24} = 0b10001;
+  let Inst{23-22} = shift;
+  let Inst{21-10} = Imm12;
+}
+
+// Format for Add-subtract (shifted register) instructions.
+class A64I_addsubshift<bit sf, bit op, bit S, bits<2> shift,
+                       dag outs, dag ins, string asmstr, list<dag> patterns,
+                       InstrItinClass itin>
+    : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+    bits<6> Imm6;
+
+    let Inst{31} = sf;
+    let Inst{30} = op;
+    let Inst{29} = S;
+    let Inst{28-24} = 0b01011;
+    let Inst{23-22} = shift;
+    let Inst{21} = 0b0;
+    // Rm inherited in 20-16
+    let Inst{15-10} = Imm6;
+    // Rn inherited in 9-5
+    // Rd inherited in 4-0
+}
+
+// Format for Add-subtract (with carry) instructions.
+class A64I_addsubcarry<bit sf, bit op, bit S, bits<6> opcode2,
+                       dag outs, dag ins, string asmstr, list<dag> patterns,
+                       InstrItinClass itin>
+    : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+    let Inst{31} = sf;
+    let Inst{30} = op;
+    let Inst{29} = S;
+    let Inst{28-21} = 0b11010000;
+    // Rm inherited in 20-16
+    let Inst{15-10} = opcode2;
+    // Rn inherited in 9-5
+    // Rd inherited in 4-0
+}
+
+
+// Format for Bitfield instructions
+class A64I_bitfield<bit sf, bits<2> opc, bit n,
+                    dag outs, dag ins, string asmstr,
+                    list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<6> ImmR;
+  bits<6> ImmS;
+
+  let Inst{31} = sf;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100110;
+  let Inst{22} = n;
+  let Inst{21-16} = ImmR;
+  let Inst{15-10} = ImmS;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format for compare and branch (immediate) instructions.
+class A64I_cmpbr<bit sf, bit op,
+                  dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64InstRt<outs, ins, asmstr, patterns, itin> {
+  bits<19> Label;
+
+  let Inst{31} = sf;
+  let Inst{30-25} = 0b011010;
+  let Inst{24} = op;
+  let Inst{23-5} = Label;
+  // Inherit Rt in 4-0
+}
+
+// Format for conditional branch (immediate) instructions.
+class A64I_condbr<bit o1, bit o0,
+                  dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<19> Label;
+  bits<4> Cond;
+
+  let Inst{31-25} = 0b0101010;
+  let Inst{24} = o1;
+  let Inst{23-5} = Label;
+  let Inst{4} = o0;
+  let Inst{3-0} = Cond;
+}
+
+// Format for conditional compare (immediate) instructions.
+class A64I_condcmpimm<bit sf, bit op, bit o2, bit o3, bit s,
+                      dag outs, dag ins, string asmstr,
+                      list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rn;
+  bits<5> UImm5;
+  bits<4> NZCVImm;
+  bits<4> Cond;
+
+  let Inst{31} = sf;
+  let Inst{30} = op;
+  let Inst{29} = s;
+  let Inst{28-21} = 0b11010010;
+  let Inst{20-16} = UImm5;
+  let Inst{15-12} = Cond;
+  let Inst{11} = 0b1;
+  let Inst{10} = o2;
+  let Inst{9-5} = Rn;
+  let Inst{4} = o3;
+  let Inst{3-0} = NZCVImm;
+}
+
+// Format for conditional compare (register) instructions.
+class A64I_condcmpreg<bit sf, bit op, bit o2, bit o3, bit s,
+                      dag outs, dag ins, string asmstr,
+                      list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> NZCVImm;
+  bits<4> Cond;
+
+
+  let Inst{31} = sf;
+  let Inst{30} = op;
+  let Inst{29} = s;
+  let Inst{28-21} = 0b11010010;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = Cond;
+  let Inst{11} = 0b0;
+  let Inst{10} = o2;
+  let Inst{9-5} = Rn;
+  let Inst{4} = o3;
+  let Inst{3-0} = NZCVImm;
+}
+
+// Format for conditional select instructions.
+class A64I_condsel<bit sf, bit op, bit s, bits<2> op2,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  bits<4> Cond;
+
+  let Inst{31} = sf;
+  let Inst{30} = op;
+  let Inst{29} = s;
+  let Inst{28-21} = 0b11010100;
+  // Inherit Rm in 20-16
+  let Inst{15-12} = Cond;
+  let Inst{11-10} = op2;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format for data processing (1 source) instructions
+class A64I_dp_1src<bit sf, bit S, bits<5> opcode2, bits<6> opcode,
+                string asmstr, dag outs, dag ins,
+                list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = sf;
+  let Inst{30} = 0b1;
+  let Inst{29} = S;
+  let Inst{28-21} = 0b11010110;
+  let Inst{20-16} = opcode2;
+  let Inst{15-10} = opcode;
+}
+
+// Format for data processing (2 source) instructions
+class A64I_dp_2src<bit sf, bits<6> opcode, bit S,
+                string asmstr, dag outs, dag ins,
+                list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = sf;
+  let Inst{30} = 0b0;
+  let Inst{29} = S;
+  let Inst{28-21} = 0b11010110;
+  let Inst{15-10} = opcode;
+}
+
+// Format for data-processing (3 source) instructions
+
+class A64I_dp3<bit sf, bits<6> opcode,
+               dag outs, dag ins, string asmstr,
+               list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  bits<5> Ra;
+
+  let Inst{31} = sf;
+  let Inst{30-29} = opcode{5-4};
+  let Inst{28-24} = 0b11011;
+  let Inst{23-21} = opcode{3-1};
+  // Inherits Rm in 20-16
+  let Inst{15} = opcode{0};
+  let Inst{14-10} = Ra;
+  // Inherits Rn in 9-5
+  // Inherits Rd in 4-0
+}
+
+// Format for exception generation instructions
+class A64I_exception<bits<3> opc, bits<3> op2, bits<2> ll,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<16> UImm16;
+
+  let Inst{31-24} = 0b11010100;
+  let Inst{23-21} = opc;
+  let Inst{20-5} = UImm16;
+  let Inst{4-2} = op2;
+  let Inst{1-0} = ll;
+}
+
+// Format for extract (immediate) instructions
+class A64I_extract<bit sf, bits<3> op, bit n,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  bits<6> LSB;
+
+  let Inst{31} = sf;
+  let Inst{30-29} = op{2-1};
+  let Inst{28-23} = 0b100111;
+  let Inst{22} = n;
+  let Inst{21} = op{0};
+  // Inherits Rm in bits 20-16
+  let Inst{15-10} = LSB;
+  // Inherits Rn in 9-5
+  // Inherits Rd in 4-0
+}
+
+// Format for floating-point compare instructions.
+class A64I_fpcmp<bit m, bit s, bits<2> type, bits<2> op, bits<5> opcode2,
+                dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rn;
+  bits<5> Rm;
+
+  let Inst{31} = m;
+  let Inst{30} = 0b0;
+  let Inst{29} = s;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rm;
+  let Inst{15-14} = op;
+  let Inst{13-10} = 0b1000;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = opcode2;
+}
+
+// Format for floating-point conditional compare instructions.
+class A64I_fpccmp<bit m, bit s, bits<2> type, bit op,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> NZCVImm;
+  bits<4> Cond;
+
+  let Inst{31} = m;
+  let Inst{30} = 0b0;
+  let Inst{29} = s;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = Cond;
+  let Inst{11-10} = 0b01;
+  let Inst{9-5} = Rn;
+  let Inst{4} = op;
+  let Inst{3-0} = NZCVImm;
+}
+
+// Format for floating-point conditional select instructions.
+class A64I_fpcondsel<bit m, bit s, bits<2> type,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  bits<4> Cond;
+
+  let Inst{31} = m;
+  let Inst{30} = 0b0;
+  let Inst{29} = s;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-12} = Cond;
+  let Inst{11-10} = 0b11;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+
+// Format for floating-point data-processing (1 source) instructions.
+class A64I_fpdp1<bit m, bit s, bits<2> type, bits<6> opcode,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = m;
+  let Inst{30} = 0b0;
+  let Inst{29} = s;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21} = 0b1;
+  let Inst{20-15} = opcode;
+  let Inst{14-10} = 0b10000;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format for floating-point data-processing (2 sources) instructions.
+class A64I_fpdp2<bit m, bit s, bits<2> type, bits<4> opcode,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = m;
+  let Inst{30} = 0b0;
+  let Inst{29} = s;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b10;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format for floating-point data-processing (3 sources) instructions.
+class A64I_fpdp3<bit m, bit s, bits<2> type, bit o1, bit o0,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  bits<5> Ra;
+
+  let Inst{31} = m;
+  let Inst{30} = 0b0;
+  let Inst{29} = s;
+  let Inst{28-24} = 0b11111;
+  let Inst{23-22} = type;
+  let Inst{21} = o1;
+  // Inherit Rm in 20-16
+  let Inst{15} = o0;
+  let Inst{14-10} = Ra;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format for floating-point <-> fixed-point conversion instructions.
+class A64I_fpfixed<bit sf, bit s, bits<2> type, bits<2> mode, bits<3> opcode,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<6> Scale;
+
+  let Inst{31} = sf;
+  let Inst{30} = 0b0;
+  let Inst{29} = s;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21} = 0b0;
+  let Inst{20-19} = mode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = Scale;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format for floating-point <-> integer conversion instructions.
+class A64I_fpint<bit sf, bit s, bits<2> type, bits<2> rmode, bits<3> opcode,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = sf;
+  let Inst{30} = 0b0;
+  let Inst{29} = s;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21} = 0b1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+
+// Format for floating-point immediate instructions.
+class A64I_fpimm<bit m, bit s, bits<2> type, bits<5> imm5,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRd<outs, ins, asmstr, patterns, itin> {
+  bits<8> Imm8;
+
+  let Inst{31} = m;
+  let Inst{30} = 0b0;
+  let Inst{29} = s;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21} = 0b1;
+  let Inst{20-13} = Imm8;
+  let Inst{12-10} = 0b100;
+  let Inst{9-5} = imm5;
+  // Inherit Rd in 4-0
+}
+
+// Format for load-register (literal) instructions.
+class A64I_LDRlit<bits<2> opc, bit v,
+                  dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64InstRt<outs, ins, asmstr, patterns, itin> {
+  bits<19> Imm19;
+
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b011;
+  let Inst{26} = v;
+  let Inst{25-24} = 0b00;
+  let Inst{23-5} = Imm19;
+  // Inherit Rt in 4-0
+}
+
+// Format for load-store exclusive instructions.
+class A64I_LDSTex_tn<bits<2> size, bit o2, bit L, bit o1, bit o0,
+                 dag outs, dag ins, string asmstr,
+                 list <dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31-30} = size;
+  let Inst{29-24} = 0b001000;
+  let Inst{23} = o2;
+  let Inst{22} = L;
+  let Inst{21} = o1;
+  let Inst{15} = o0;
+}
+
+class A64I_LDSTex_tt2n<bits<2> size, bit o2, bit L, bit o1, bit o0,
+                     dag outs, dag ins, string asmstr,
+                     list <dag> patterns, InstrItinClass itin>:
+      A64I_LDSTex_tn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{
+   bits<5> Rt2;
+   let Inst{14-10} = Rt2;
+}
+
+class A64I_LDSTex_stn<bits<2> size, bit o2, bit L, bit o1, bit o0,
+                     dag outs, dag ins, string asmstr,
+                     list <dag> patterns, InstrItinClass itin>:
+      A64I_LDSTex_tn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{
+   bits<5> Rs;
+   let Inst{20-16} = Rs;
+}
+
+class A64I_LDSTex_stt2n<bits<2> size, bit o2, bit L, bit o1, bit o0,
+                     dag outs, dag ins, string asmstr,
+                     list <dag> patterns, InstrItinClass itin>:
+      A64I_LDSTex_stn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{
+   bits<5> Rt2;
+   let Inst{14-10} = Rt2;
+}
+
+// Format for load-store register (immediate post-indexed) instructions
+class A64I_LSpostind<bits<2> size, bit v, bits<2> opc,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<9> SImm9;
+
+  let Inst{31-30} = size;
+  let Inst{29-27} = 0b111;
+  let Inst{26} = v;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21} = 0b0;
+  let Inst{20-12} = SImm9;
+  let Inst{11-10} = 0b01;
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format for load-store register (immediate pre-indexed) instructions
+class A64I_LSpreind<bits<2> size, bit v, bits<2> opc,
+                    dag outs, dag ins, string asmstr,
+                    list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<9> SImm9;
+
+
+  let Inst{31-30} = size;
+  let Inst{29-27} = 0b111;
+  let Inst{26} = v;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21} = 0b0;
+  let Inst{20-12} = SImm9;
+  let Inst{11-10} = 0b11;
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format for load-store register (unprivileged) instructions
+class A64I_LSunpriv<bits<2> size, bit v, bits<2> opc,
+                    dag outs, dag ins, string asmstr,
+                    list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<9> SImm9;
+
+
+  let Inst{31-30} = size;
+  let Inst{29-27} = 0b111;
+  let Inst{26} = v;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21} = 0b0;
+  let Inst{20-12} = SImm9;
+  let Inst{11-10} = 0b10;
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format for load-store (unscaled immediate) instructions.
+class A64I_LSunalimm<bits<2> size, bit v, bits<2> opc,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<9> SImm9;
+
+  let Inst{31-30} = size;
+  let Inst{29-27} = 0b111;
+  let Inst{26} = v;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21} = 0b0;
+  let Inst{20-12} = SImm9;
+  let Inst{11-10} = 0b00;
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+
+// Format for load-store (unsigned immediate) instructions.
+class A64I_LSunsigimm<bits<2> size, bit v, bits<2> opc,
+                      dag outs, dag ins, string asmstr,
+                      list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<12> UImm12;
+
+  let Inst{31-30} = size;
+  let Inst{29-27} = 0b111;
+  let Inst{26} = v;
+  let Inst{25-24} = 0b01;
+  let Inst{23-22} = opc;
+  let Inst{21-10} = UImm12;
+}
+
+// Format for load-store register (register offset) instructions.
+class A64I_LSregoff<bits<2> size, bit v, bits<2> opc, bit optionlo,
+                    dag outs, dag ins, string asmstr,
+                    list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rm;
+
+  // Complex operand selection needed for these instructions, so they
+  // need an "addr" field for encoding/decoding to be generated.
+  bits<3> Ext;
+  // OptionHi = Ext{2-1}
+  // S = Ext{0}
+
+  let Inst{31-30} = size;
+  let Inst{29-27} = 0b111;
+  let Inst{26} = v;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21} = 0b1;
+  let Inst{20-16} = Rm;
+  let Inst{15-14} = Ext{2-1};
+  let Inst{13} = optionlo;
+  let Inst{12} = Ext{0};
+  let Inst{11-10} = 0b10;
+  // Inherits Rn in 9-5
+  // Inherits Rt in 4-0
+
+  let AddedComplexity = 50;
+}
+
+// Format for Load-store register pair (offset) instructions
+class A64I_LSPoffset<bits<2> opc, bit v, bit l,
+                      dag outs, dag ins, string asmstr,
+                      list<dag> patterns, InstrItinClass itin>
+  : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
+  bits<7> SImm7;
+
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26} = v;
+  let Inst{25-23} = 0b010;
+  let Inst{22} = l;
+  let Inst{21-15} = SImm7;
+  // Inherit Rt2 in 14-10
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format for Load-store register pair (post-indexed) instructions
+class A64I_LSPpostind<bits<2> opc, bit v, bit l,
+                      dag outs, dag ins, string asmstr,
+                      list<dag> patterns, InstrItinClass itin>
+  : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
+  bits<7> SImm7;
+
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26} = v;
+  let Inst{25-23} = 0b001;
+  let Inst{22} = l;
+  let Inst{21-15} = SImm7;
+  // Inherit Rt2 in 14-10
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format for Load-store register pair (pre-indexed) instructions
+class A64I_LSPpreind<bits<2> opc, bit v, bit l,
+                      dag outs, dag ins, string asmstr,
+                      list<dag> patterns, InstrItinClass itin>
+  : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
+  bits<7> SImm7;
+
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26} = v;
+  let Inst{25-23} = 0b011;
+  let Inst{22} = l;
+  let Inst{21-15} = SImm7;
+  // Inherit Rt2 in 14-10
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format for Load-store non-temporal register pair (offset) instructions
+class A64I_LSPnontemp<bits<2> opc, bit v, bit l,
+                      dag outs, dag ins, string asmstr,
+                      list<dag> patterns, InstrItinClass itin>
+  : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
+  bits<7> SImm7;
+
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26} = v;
+  let Inst{25-23} = 0b000;
+  let Inst{22} = l;
+  let Inst{21-15} = SImm7;
+  // Inherit Rt2 in 14-10
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format for Logical (immediate) instructions
+class A64I_logicalimm<bit sf, bits<2> opc,
+                      dag outs, dag ins, string asmstr,
+                      list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bit N;
+  bits<6> ImmR;
+  bits<6> ImmS;
+
+  // N, ImmR and ImmS have no separate existence in any assembly syntax (or for
+  // selection), so we'll combine them into a single field here.
+  bits<13> Imm;
+  // N = Imm{12};
+  // ImmR = Imm{11-6};
+  // ImmS = Imm{5-0};
+
+  let Inst{31} = sf;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100100;
+  let Inst{22} = Imm{12};
+  let Inst{21-16} = Imm{11-6};
+  let Inst{15-10} = Imm{5-0};
+  // Rn inherited in 9-5
+  // Rd inherited in 4-0
+}
+
+// Format for Logical (shifted register) instructions
+class A64I_logicalshift<bit sf, bits<2> opc, bits<2> shift, bit N,
+                        dag outs, dag ins, string asmstr,
+                        list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  bits<6> Imm6;
+
+  let Inst{31} = sf;
+  let Inst{30-29} = opc;
+  let Inst{28-24} = 0b01010;
+  let Inst{23-22} = shift;
+  let Inst{21} = N;
+  // Rm inherited
+  let Inst{15-10} = Imm6;
+  // Rn inherited
+  // Rd inherited
+}
+
+// Format for Move wide (immediate)
+class A64I_movw<bit sf, bits<2> opc,
+                dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64InstRd<outs, ins, asmstr, patterns, itin> {
+  bits<16> UImm16;
+  bits<2> Shift; // Called "hw" officially
+
+  let Inst{31} = sf;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100101;
+  let Inst{22-21} = Shift;
+  let Inst{20-5} = UImm16;
+  // Inherits Rd in 4-0
+}
+
+// Format for PC-relative addressing instructions, ADR and ADRP.
+class A64I_PCADR<bit op,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRd<outs, ins, asmstr, patterns, itin> {
+  bits<21> Label;
+
+  let Inst{31} = op;
+  let Inst{30-29} = Label{1-0};
+  let Inst{28-24} = 0b10000;
+  let Inst{23-5} = Label{20-2};
+}
+
+// Format for system instructions
+class A64I_system<bit l,
+                  dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  bits<2> Op0;
+  bits<3> Op1;
+  bits<4> CRn;
+  bits<4> CRm;
+  bits<3> Op2;
+  bits<5> Rt;
+
+  let Inst{31-22} = 0b1101010100;
+  let Inst{21} = l;
+  let Inst{20-19} = Op0;
+  let Inst{18-16} = Op1;
+  let Inst{15-12} = CRn;
+  let Inst{11-8} = CRm;
+  let Inst{7-5} = Op2;
+  let Inst{4-0} = Rt;
+
+  // These instructions can do horrible things.
+  let hasSideEffects = 1;
+}
+
+// Format for unconditional branch (immediate) instructions
+class A64I_Bimm<bit op,
+                dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  // Doubly special in not even sharing register fields with other
+  // instructions, so we create our own Rn here.
+  bits<26> Label;
+
+  let Inst{31} = op;
+  let Inst{30-26} = 0b00101;
+  let Inst{25-0} = Label;
+}
+
+// Format for Test & branch (immediate) instructions
+class A64I_TBimm<bit op,
+                dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64InstRt<outs, ins, asmstr, patterns, itin> {
+  // Doubly special in not even sharing register fields with other
+  // instructions, so we create our own Rn here.
+  bits<6> Imm;
+  bits<14> Label;
+
+  let Inst{31} = Imm{5};
+  let Inst{30-25} = 0b011011;
+  let Inst{24} = op;
+  let Inst{23-19} = Imm{4-0};
+  let Inst{18-5} = Label;
+  // Inherit Rt in 4-0
+}
+
+// Format for Unconditional branch (register) instructions, including
+// RET.  Shares no fields with instructions further up the hierarchy
+// so top-level.
+class A64I_Breg<bits<4> opc, bits<5> op2, bits<6> op3, bits<5> op4,
+                dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64Inst<outs, ins, asmstr, patterns, itin> {
+  // Doubly special in not even sharing register fields with other
+  // instructions, so we create our own Rn here.
+  bits<5> Rn;
+
+  let Inst{31-25} = 0b1101011;
+  let Inst{24-21} = opc;
+  let Inst{20-16} = op2;
+  let Inst{15-10} = op3;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = op4;
+}
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
new file mode 100644
index 000000000000..cf3a2c3707d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -0,0 +1,822 @@
+//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#include <algorithm>
+
+#define GET_INSTRINFO_CTOR
+#include "AArch64GenInstrInfo.inc"
+
+using namespace llvm;
+
+AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
+  : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
+    RI(*this, STI), Subtarget(STI) {}
+
+void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I, DebugLoc DL,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   bool KillSrc) const {
+  unsigned Opc = 0;
+  unsigned ZeroReg = 0;
+  if (DestReg == AArch64::XSP || SrcReg == AArch64::XSP) {
+    // E.g. ADD xDst, xsp, #0 (, lsl #0)
+    BuildMI(MBB, I, DL, get(AArch64::ADDxxi_lsl0_s), DestReg)
+      .addReg(SrcReg)
+      .addImm(0);
+    return;
+  } else if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
+    // E.g. ADD wDST, wsp, #0 (, lsl #0)
+    BuildMI(MBB, I, DL, get(AArch64::ADDwwi_lsl0_s), DestReg)
+      .addReg(SrcReg)
+      .addImm(0);
+    return;
+  } else if (DestReg == AArch64::NZCV) {
+    assert(AArch64::GPR64RegClass.contains(SrcReg));
+    // E.g. MSR NZCV, xDST
+    BuildMI(MBB, I, DL, get(AArch64::MSRix))
+      .addImm(A64SysReg::NZCV)
+      .addReg(SrcReg);
+  } else if (SrcReg == AArch64::NZCV) {
+    assert(AArch64::GPR64RegClass.contains(DestReg));
+    // E.g. MRS xDST, NZCV
+    BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg)
+      .addImm(A64SysReg::NZCV);
+  } else if (AArch64::GPR64RegClass.contains(DestReg)) {
+    assert(AArch64::GPR64RegClass.contains(SrcReg));
+    Opc = AArch64::ORRxxx_lsl;
+    ZeroReg = AArch64::XZR;
+  } else if (AArch64::GPR32RegClass.contains(DestReg)) {
+    assert(AArch64::GPR32RegClass.contains(SrcReg));
+    Opc = AArch64::ORRwww_lsl;
+    ZeroReg = AArch64::WZR;
+  } else if (AArch64::FPR32RegClass.contains(DestReg)) {
+    assert(AArch64::FPR32RegClass.contains(SrcReg));
+    BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg)
+      .addReg(SrcReg);
+    return;
+  } else if (AArch64::FPR64RegClass.contains(DestReg)) {
+    assert(AArch64::FPR64RegClass.contains(SrcReg));
+    BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg)
+      .addReg(SrcReg);
+    return;
+  } else if (AArch64::FPR128RegClass.contains(DestReg)) {
+    assert(AArch64::FPR128RegClass.contains(SrcReg));
+
+    // FIXME: there's no good way to do this, at least without NEON:
+    //   + There's no single move instruction for q-registers
+    //   + We can't create a spill slot and use normal STR/LDR because stack
+    //     allocation has already happened
+    //   + We can't go via X-registers with FMOV because register allocation has
+    //     already happened.
+    // This may not be efficient, but at least it works.
+    BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP)
+      .addReg(SrcReg)
+      .addReg(AArch64::XSP)
+      .addImm(0x1ff & -16);
+
+    BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg)
+      .addReg(AArch64::XSP, RegState::Define)
+      .addReg(AArch64::XSP)
+      .addImm(16);
+    return;
+  } else {
+    llvm_unreachable("Unknown register class in copyPhysReg");
+  }
+
+  // E.g. ORR xDst, xzr, xSrc, lsl #0
+  BuildMI(MBB, I, DL, get(Opc), DestReg)
+    .addReg(ZeroReg)
+    .addReg(SrcReg)
+    .addImm(0);
+}
+
+MachineInstr *
+AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+                                           uint64_t Offset, const MDNode *MDPtr,
+                                           DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
+    .addFrameIndex(FrameIx).addImm(0)
+    .addImm(Offset)
+    .addMetadata(MDPtr);
+  return &*MIB;
+}
+
+/// Does the Opcode represent a conditional branch that we can remove and re-add
+/// at the end of a basic block?
+static bool isCondBranch(unsigned Opc) {
+  return Opc == AArch64::Bcc || Opc == AArch64::CBZw || Opc == AArch64::CBZx ||
+         Opc == AArch64::CBNZw || Opc == AArch64::CBNZx ||
+         Opc == AArch64::TBZwii || Opc == AArch64::TBZxii ||
+         Opc == AArch64::TBNZwii || Opc == AArch64::TBNZxii;
+}
+
+/// Takes apart a given conditional branch MachineInstr (see isCondBranch),
+/// setting TBB to the destination basic block and populating the Cond vector
+/// with data necessary to recreate the conditional branch at a later
+/// date. First element will be the opcode, and subsequent ones define the
+/// conditions being branched on in an instruction-specific manner.
+static void classifyCondBranch(MachineInstr *I, MachineBasicBlock *&TBB,
+                               SmallVectorImpl<MachineOperand> &Cond) {
+  switch(I->getOpcode()) {
+  case AArch64::Bcc:
+  case AArch64::CBZw:
+  case AArch64::CBZx:
+  case AArch64::CBNZw:
+  case AArch64::CBNZx:
+    // These instructions just have one predicate operand in position 0 (either
+    // a condition code or a register being compared).
+    Cond.push_back(MachineOperand::CreateImm(I->getOpcode()));
+    Cond.push_back(I->getOperand(0));
+    TBB = I->getOperand(1).getMBB();
+    return;
+  case AArch64::TBZwii:
+  case AArch64::TBZxii:
+  case AArch64::TBNZwii:
+  case AArch64::TBNZxii:
+    // These have two predicate operands: a register and a bit position.
+    Cond.push_back(MachineOperand::CreateImm(I->getOpcode()));
+    Cond.push_back(I->getOperand(0));
+    Cond.push_back(I->getOperand(1));
+    TBB = I->getOperand(2).getMBB();
+    return;
+  default:
+    llvm_unreachable("Unknown conditional branch to classify");
+  }
+}
+
+
+bool
+AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastOpc == AArch64::Bimm) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (isCondBranch(LastOpc)) {
+      classifyCondBranch(LastInst, TBB, Cond);
+      return false;
+    }
+    return true;  // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If AllowModify is true and the block ends with two or more unconditional
+  // branches, delete all but the first unconditional branch.
+  if (AllowModify && LastOpc == AArch64::Bimm) {
+    while (SecondLastOpc == AArch64::Bimm) {
+      LastInst->eraseFromParent();
+      LastInst = SecondLastInst;
+      LastOpc = LastInst->getOpcode();
+      if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+        // Return now the only terminator is an unconditional branch.
+        TBB = LastInst->getOperand(0).getMBB();
+        return false;
+      } else {
+        SecondLastInst = I;
+        SecondLastOpc = SecondLastInst->getOpcode();
+      }
+    }
+  }
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (LastOpc == AArch64::Bimm) {
+    if (SecondLastOpc == AArch64::Bcc) {
+      TBB =  SecondLastInst->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(AArch64::Bcc));
+      Cond.push_back(SecondLastInst->getOperand(0));
+      FBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (isCondBranch(SecondLastOpc)) {
+      classifyCondBranch(SecondLastInst, TBB, Cond);
+      FBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+  }
+
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (SecondLastOpc == AArch64::Bimm && LastOpc == AArch64::Bimm) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+bool AArch64InstrInfo::ReverseBranchCondition(
+                                  SmallVectorImpl<MachineOperand> &Cond) const {
+  switch (Cond[0].getImm()) {
+  case AArch64::Bcc: {
+    A64CC::CondCodes CC = static_cast<A64CC::CondCodes>(Cond[1].getImm());
+    CC = A64InvertCondCode(CC);
+    Cond[1].setImm(CC);
+    return false;
+  }
+  case AArch64::CBZw:
+    Cond[0].setImm(AArch64::CBNZw);
+    return false;
+  case AArch64::CBZx:
+    Cond[0].setImm(AArch64::CBNZx);
+    return false;
+  case AArch64::CBNZw:
+    Cond[0].setImm(AArch64::CBZw);
+    return false;
+  case AArch64::CBNZx:
+    Cond[0].setImm(AArch64::CBZx);
+    return false;
+  case AArch64::TBZwii:
+    Cond[0].setImm(AArch64::TBNZwii);
+    return false;
+  case AArch64::TBZxii:
+    Cond[0].setImm(AArch64::TBNZxii);
+    return false;
+  case AArch64::TBNZwii:
+    Cond[0].setImm(AArch64::TBZwii);
+    return false;
+  case AArch64::TBNZxii:
+    Cond[0].setImm(AArch64::TBZxii);
+    return false;
+  default:
+    llvm_unreachable("Unknown branch type");
+  }
+}
+
+
+unsigned
+AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                               MachineBasicBlock *FBB,
+                               const SmallVectorImpl<MachineOperand> &Cond,
+                               DebugLoc DL) const {
+  if (FBB == 0 && Cond.empty()) {
+    BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(TBB);
+    return 1;
+  } else if (FBB == 0) {
+    MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
+    for (int i = 1, e = Cond.size(); i != e; ++i)
+      MIB.addOperand(Cond[i]);
+    MIB.addMBB(TBB);
+    return 1;
+  }
+
+  MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
+  for (int i = 1, e = Cond.size(); i != e; ++i)
+    MIB.addOperand(Cond[i]);
+  MIB.addMBB(TBB);
+
+  BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(FBB);
+  return 2;
+}
+
+unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+  if (I->getOpcode() != AArch64::Bimm && !isCondBranch(I->getOpcode()))
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (!isCondBranch(I->getOpcode()))
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+bool
+AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+  case AArch64::TLSDESC_BLRx: {
+    MachineInstr *NewMI =
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(AArch64::TLSDESCCALL))
+        .addOperand(MI.getOperand(1));
+    MI.setDesc(get(AArch64::BLRx));
+
+    llvm::finalizeBundle(MBB, NewMI, *++MBBI);
+    return true;
+    }
+  default:
+    return false;
+  }
+
+  return false;
+}
+
+void
+AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI,
+                                      unsigned SrcReg, bool isKill,
+                                      int FrameIdx,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FrameIdx);
+
+  MachineMemOperand *MMO
+    = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                              MachineMemOperand::MOStore,
+                              MFI.getObjectSize(FrameIdx),
+                              Align);
+
+  unsigned StoreOp = 0;
+  if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) {
+    switch(RC->getSize()) {
+    case 4: StoreOp = AArch64::LS32_STR; break;
+    case 8: StoreOp = AArch64::LS64_STR; break;
+    default:
+      llvm_unreachable("Unknown size for regclass");
+    }
+  } else {
+    assert((RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
+            RC->hasType(MVT::f128))
+           && "Expected integer or floating type for store");
+    switch (RC->getSize()) {
+    case 4: StoreOp = AArch64::LSFP32_STR; break;
+    case 8: StoreOp = AArch64::LSFP64_STR; break;
+    case 16: StoreOp = AArch64::LSFP128_STR; break;
+    default:
+      llvm_unreachable("Unknown size for regclass");
+    }
+  }
+
+  MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
+  NewMI.addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FrameIdx)
+    .addImm(0)
+    .addMemOperand(MMO);
+
+}
+
+void
+AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       unsigned DestReg, int FrameIdx,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FrameIdx);
+
+  MachineMemOperand *MMO
+    = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                              MachineMemOperand::MOLoad,
+                              MFI.getObjectSize(FrameIdx),
+                              Align);
+
+  unsigned LoadOp = 0;
+  if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) {
+    switch(RC->getSize()) {
+    case 4: LoadOp = AArch64::LS32_LDR; break;
+    case 8: LoadOp = AArch64::LS64_LDR; break;
+    default:
+      llvm_unreachable("Unknown size for regclass");
+    }
+  } else {
+    assert((RC->hasType(MVT::f32) || RC->hasType(MVT::f64)
+            || RC->hasType(MVT::f128))
+           && "Expected integer or floating type for store");
+    switch (RC->getSize()) {
+    case 4: LoadOp = AArch64::LSFP32_LDR; break;
+    case 8: LoadOp = AArch64::LSFP64_LDR; break;
+    case 16: LoadOp = AArch64::LSFP128_LDR; break;
+    default:
+      llvm_unreachable("Unknown size for regclass");
+    }
+  }
+
+  MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
+  NewMI.addFrameIndex(FrameIdx)
+       .addImm(0)
+       .addMemOperand(MMO);
+}
+
+unsigned AArch64InstrInfo::estimateRSStackLimit(MachineFunction &MF) const {
+  unsigned Limit = (1 << 16) - 1;
+  for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+        if (!I->getOperand(i).isFI()) continue;
+
+        // When using ADDxxi_lsl0_s to get the address of a stack object, 0xfff
+        // is the largest offset guaranteed to fit in the immediate offset.
+        if (I->getOpcode() == AArch64::ADDxxi_lsl0_s) {
+          Limit = std::min(Limit, 0xfffu);
+          break;
+        }
+
+        int AccessScale, MinOffset, MaxOffset;
+        getAddressConstraints(*I, AccessScale, MinOffset, MaxOffset);
+        Limit = std::min(Limit, static_cast<unsigned>(MaxOffset));
+
+        break; // At most one FI per instruction
+      }
+    }
+  }
+
+  return Limit;
+}
+void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI,
+                                             int &AccessScale, int &MinOffset,
+                                             int &MaxOffset) const {
+  switch (MI.getOpcode()) {
+  default: llvm_unreachable("Unkown load/store kind");
+  case TargetOpcode::DBG_VALUE:
+    AccessScale = 1;
+    MinOffset = INT_MIN;
+    MaxOffset = INT_MAX;
+    return;
+  case AArch64::LS8_LDR: case AArch64::LS8_STR:
+  case AArch64::LSFP8_LDR: case AArch64::LSFP8_STR:
+  case AArch64::LDRSBw:
+  case AArch64::LDRSBx:
+    AccessScale = 1;
+    MinOffset = 0;
+    MaxOffset = 0xfff;
+    return;
+  case AArch64::LS16_LDR: case AArch64::LS16_STR:
+  case AArch64::LSFP16_LDR: case AArch64::LSFP16_STR:
+  case AArch64::LDRSHw:
+  case AArch64::LDRSHx:
+    AccessScale = 2;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LS32_LDR:  case AArch64::LS32_STR:
+  case AArch64::LSFP32_LDR: case AArch64::LSFP32_STR:
+  case AArch64::LDRSWx:
+  case AArch64::LDPSWx:
+    AccessScale = 4;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LS64_LDR: case AArch64::LS64_STR:
+  case AArch64::LSFP64_LDR: case AArch64::LSFP64_STR:
+  case AArch64::PRFM:
+    AccessScale = 8;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LSFP128_LDR: case AArch64::LSFP128_STR:
+    AccessScale = 16;
+    MinOffset = 0;
+    MaxOffset = 0xfff * AccessScale;
+    return;
+  case AArch64::LSPair32_LDR: case AArch64::LSPair32_STR:
+  case AArch64::LSFPPair32_LDR: case AArch64::LSFPPair32_STR:
+    AccessScale = 4;
+    MinOffset = -0x40 * AccessScale;
+    MaxOffset = 0x3f * AccessScale;
+    return;
+  case AArch64::LSPair64_LDR: case AArch64::LSPair64_STR:
+  case AArch64::LSFPPair64_LDR: case AArch64::LSFPPair64_STR:
+    AccessScale = 8;
+    MinOffset = -0x40 * AccessScale;
+    MaxOffset = 0x3f * AccessScale;
+    return;
+  case AArch64::LSFPPair128_LDR: case AArch64::LSFPPair128_STR:
+    AccessScale = 16;
+    MinOffset = -0x40 * AccessScale;
+    MaxOffset = 0x3f * AccessScale;
+    return;
+  }
+}
+
+unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+  const MCInstrDesc &MCID = MI.getDesc();
+  const MachineBasicBlock &MBB = *MI.getParent();
+  const MachineFunction &MF = *MBB.getParent();
+  const MCAsmInfo &MAI = *MF.getTarget().getMCAsmInfo();
+
+  if (MCID.getSize())
+    return MCID.getSize();
+
+  if (MI.getOpcode() == AArch64::INLINEASM)
+    return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI);
+
+  if (MI.isLabel())
+    return 0;
+
+  switch (MI.getOpcode()) {
+  case TargetOpcode::BUNDLE:
+    return getInstBundleLength(MI);
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::PROLOG_LABEL:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::DBG_VALUE:
+    return 0;
+  case AArch64::TLSDESCCALL:
+    return 0;
+  default:
+    llvm_unreachable("Unknown instruction class");
+  }
+}
+
+unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
+  unsigned Size = 0;
+  MachineBasicBlock::const_instr_iterator I = MI;
+  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+  while (++I != E && I->isInsideBundle()) {
+    assert(!I->isBundle() && "No nested bundle!");
+    Size += getInstSizeInBytes(*I);
+  }
+  return Size;
+}
+
+bool llvm::rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                unsigned FrameReg, int &Offset,
+                                const AArch64InstrInfo &TII) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  MFI.getObjectOffset(FrameRegIdx);
+  llvm_unreachable("Unimplemented rewriteFrameIndex");
+}
+
+void llvm::emitRegUpdate(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator MBBI,
+                         DebugLoc dl, const TargetInstrInfo &TII,
+                         unsigned DstReg, unsigned SrcReg, unsigned ScratchReg,
+                         int64_t NumBytes, MachineInstr::MIFlag MIFlags) {
+  if (NumBytes == 0 && DstReg == SrcReg)
+    return;
+  else if (abs64(NumBytes) & ~0xffffff) {
+    // Generically, we have to materialize the offset into a temporary register
+    // and subtract it. There are a couple of ways this could be done, for now
+    // we'll use a movz/movk or movn/movk sequence.
+    uint64_t Bits = static_cast<uint64_t>(abs64(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVZxii), ScratchReg)
+      .addImm(0xffff & Bits).addImm(0)
+      .setMIFlags(MIFlags);
+
+    Bits >>= 16;
+    if (Bits & 0xffff) {
+      BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
+        .addReg(ScratchReg)
+        .addImm(0xffff & Bits).addImm(1)
+        .setMIFlags(MIFlags);
+    }
+
+    Bits >>= 16;
+    if (Bits & 0xffff) {
+      BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
+        .addReg(ScratchReg)
+        .addImm(0xffff & Bits).addImm(2)
+        .setMIFlags(MIFlags);
+    }
+
+    Bits >>= 16;
+    if (Bits & 0xffff) {
+      BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
+        .addReg(ScratchReg)
+        .addImm(0xffff & Bits).addImm(3)
+        .setMIFlags(MIFlags);
+    }
+
+    // ADD DST, SRC, xTMP (, lsl #0)
+    unsigned AddOp = NumBytes > 0 ? AArch64::ADDxxx_uxtx : AArch64::SUBxxx_uxtx;
+    BuildMI(MBB, MBBI, dl, TII.get(AddOp), DstReg)
+      .addReg(SrcReg, RegState::Kill)
+      .addReg(ScratchReg, RegState::Kill)
+      .addImm(0)
+      .setMIFlag(MIFlags);
+    return;
+  }
+
+  // Now we know that the adjustment can be done in at most two add/sub
+  // (immediate) instructions, which is always more efficient than a
+  // literal-pool load, or even a hypothetical movz/movk/add sequence
+
+  // Decide whether we're doing addition or subtraction
+  unsigned LowOp, HighOp;
+  if (NumBytes >= 0) {
+    LowOp = AArch64::ADDxxi_lsl0_s;
+    HighOp = AArch64::ADDxxi_lsl12_s;
+  } else {
+    LowOp = AArch64::SUBxxi_lsl0_s;
+    HighOp = AArch64::SUBxxi_lsl12_s;
+    NumBytes = abs64(NumBytes);
+  }
+
+  // If we're here, at the very least a move needs to be produced, which just
+  // happens to be materializable by an ADD.
+  if ((NumBytes & 0xfff) || NumBytes == 0) {
+    BuildMI(MBB, MBBI, dl, TII.get(LowOp), DstReg)
+      .addReg(SrcReg, RegState::Kill)
+      .addImm(NumBytes & 0xfff)
+      .setMIFlag(MIFlags);
+
+    // Next update should use the register we've just defined.
+    SrcReg = DstReg;
+  }
+
+  if (NumBytes & 0xfff000) {
+    BuildMI(MBB, MBBI, dl, TII.get(HighOp), DstReg)
+      .addReg(SrcReg, RegState::Kill)
+      .addImm(NumBytes >> 12)
+      .setMIFlag(MIFlags);
+  }
+}
+
+void llvm::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                        DebugLoc dl, const TargetInstrInfo &TII,
+                        unsigned ScratchReg, int64_t NumBytes,
+                        MachineInstr::MIFlag MIFlags) {
+  emitRegUpdate(MBB, MI, dl, TII, AArch64::XSP, AArch64::XSP, AArch64::X16,
+                NumBytes, MIFlags);
+}
+
+
+namespace {
+  struct LDTLSCleanup : public MachineFunctionPass {
+    static char ID;
+    LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      AArch64MachineFunctionInfo* MFI
+        = MF.getInfo<AArch64MachineFunctionInfo>();
+      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there isn't at least two.
+        return false;
+      }
+
+      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+      return VisitNode(DT->getRootNode(), 0);
+    }
+
+    // Visit the dominator subtree rooted at Node in pre-order.
+    // If TLSBaseAddrReg is non-null, then use that to replace any
+    // TLS_base_addr instructions. Otherwise, create the register
+    // when the first such instruction is seen, and then use it
+    // as we encounter more instructions.
+    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+      MachineBasicBlock *BB = Node->getBlock();
+      bool Changed = false;
+
+      // Traverse the current block.
+      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+           ++I) {
+        switch (I->getOpcode()) {
+        case AArch64::TLSDESC_BLRx:
+          // Make sure it's a local dynamic access.
+          if (!I->getOperand(1).isSymbol() ||
+              strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
+            break;
+
+          if (TLSBaseAddrReg)
+            I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+          else
+            I = SetRegister(I, &TLSBaseAddrReg);
+          Changed = true;
+          break;
+        default:
+          break;
+        }
+      }
+
+      // Visit the children of this block in the dominator tree.
+      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+           I != E; ++I) {
+        Changed |= VisitNode(*I, TLSBaseAddrReg);
+      }
+
+      return Changed;
+    }
+
+    // Replace the TLS_base_addr instruction I with a copy from
+    // TLSBaseAddrReg, returning the new instruction.
+    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+                                         unsigned TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const AArch64TargetMachine *TM =
+          static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+      const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+      // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
+      // code sequence assumes the address will be.
+      MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   AArch64::X0)
+        .addReg(TLSBaseAddrReg);
+
+      // Erase the TLS_base_addr instruction.
+      I->eraseFromParent();
+
+      return Copy;
+    }
+
+    // Create a virtal register in *TLSBaseAddrReg, and populate it by
+    // inserting a copy instruction after I. Returns the new instruction.
+    MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const AArch64TargetMachine *TM =
+          static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+      const AArch64InstrInfo *TII = TM->getInstrInfo();
+
+      // Create a virtual register for the TLS base address.
+      MachineRegisterInfo &RegInfo = MF->getRegInfo();
+      *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
+
+      // Insert a copy from X0 to TLSBaseAddrReg for later.
+      MachineInstr *Next = I->getNextNode();
+      MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   *TLSBaseAddrReg)
+        .addReg(AArch64::X0);
+
+      return Copy;
+    }
+
+    virtual const char *getPassName() const {
+      return "Local Dynamic TLS Access Clean-up";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createAArch64CleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
new file mode 100644
index 000000000000..22a2ab4cf60a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -0,0 +1,112 @@
+//===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AARCH64INSTRINFO_H
+#define LLVM_TARGET_AARCH64INSTRINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "AArch64RegisterInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "AArch64GenInstrInfo.inc"
+
+namespace llvm {
+
+class AArch64Subtarget;
+
+class AArch64InstrInfo : public AArch64GenInstrInfo {
+  const AArch64RegisterInfo RI;
+  const AArch64Subtarget &Subtarget;
+public:
+  explicit AArch64InstrInfo(const AArch64Subtarget &TM);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  const AArch64Subtarget &getSubTarget() const { return Subtarget; }
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const;
+
+  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+                                         uint64_t Offset, const MDNode *MDPtr,
+                                         DebugLoc DL) const;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+  /// Look through the instructions in this function and work out the largest
+  /// the stack frame can be while maintaining the ability to address local
+  /// slots with no complexities.
+  unsigned estimateRSStackLimit(MachineFunction &MF) const;
+
+  /// getAddressConstraints - For loads and stores (and PRFMs) taking an
+  /// immediate offset, this function determines the constraints required for
+  /// the immediate. It must satisfy:
+  ///    + MinOffset <= imm <= MaxOffset
+  ///    + imm % OffsetScale == 0
+  void getAddressConstraints(const MachineInstr &MI, int &AccessScale,
+                             int &MinOffset, int &MaxOffset) const;
+
+
+  unsigned getInstSizeInBytes(const MachineInstr &MI) const;
+
+  unsigned getInstBundleLength(const MachineInstr &MI) const;
+
+};
+
+bool rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                          unsigned FrameReg, int &Offset,
+                          const AArch64InstrInfo &TII);
+
+
+void emitRegUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   DebugLoc dl, const TargetInstrInfo &TII,
+                   unsigned DstReg, unsigned SrcReg, unsigned ScratchReg,
+                   int64_t NumBytes,
+                   MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags);
+
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                  DebugLoc dl, const TargetInstrInfo &TII,
+                  unsigned ScratchReg, int64_t NumBytes,
+                  MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
new file mode 100644
index 000000000000..37be5e4892e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -0,0 +1,5099 @@
+//===----- AArch64InstrInfo.td - AArch64 Instruction Info ----*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the AArch64 scalar instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "AArch64InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Target-specific ISD nodes and profiles
+//===----------------------------------------------------------------------===//
+
+def SDT_A64ret : SDTypeProfile<0, 0, []>;
+def A64ret : SDNode<"AArch64ISD::Ret", SDT_A64ret, [SDNPHasChain,
+                                                    SDNPOptInGlue,
+                                                    SDNPVariadic]>;
+
+// (ins NZCV, Condition, Dest)
+def SDT_A64br_cc : SDTypeProfile<0, 3, [SDTCisVT<0, i32>]>;
+def A64br_cc : SDNode<"AArch64ISD::BR_CC", SDT_A64br_cc, [SDNPHasChain]>;
+
+// (outs Result), (ins NZCV, IfTrue, IfFalse, Condition)
+def SDT_A64select_cc : SDTypeProfile<1, 4, [SDTCisVT<1, i32>,
+                                            SDTCisSameAs<0, 2>,
+                                            SDTCisSameAs<2, 3>]>;
+def A64select_cc : SDNode<"AArch64ISD::SELECT_CC", SDT_A64select_cc>;
+
+// (outs NZCV), (ins LHS, RHS, Condition)
+def SDT_A64setcc : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
+                                        SDTCisSameAs<1, 2>]>;
+def A64setcc : SDNode<"AArch64ISD::SETCC", SDT_A64setcc>;
+
+
+// (outs GPR64), (ins)
+def A64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
+
+// A64 compares don't care about the cond really (they set all flags) so a
+// simple binary operator is useful.
+def A64cmp : PatFrag<(ops node:$lhs, node:$rhs),
+                     (A64setcc node:$lhs, node:$rhs, cond)>;
+
+
+// When matching a notional (CMP op1, (sub 0, op2)), we'd like to use a CMN
+// instruction on the grounds that "op1 - (-op2) == op1 + op2". However, the C
+// and V flags can be set differently by this operation. It comes down to
+// whether "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are
+// then everything is fine. If not then the optimization is wrong. Thus general
+// comparisons are only valid if op2 != 0.
+
+// So, finally, the only LLVM-native comparisons that don't mention C and V are
+// SETEQ and SETNE. They're the only ones we can safely use CMN for in the
+// absence of information about op2.
+def equality_cond : PatLeaf<(cond), [{
+  return N->get() == ISD::SETEQ || N->get() == ISD::SETNE;
+}]>;
+
+def A64cmn : PatFrag<(ops node:$lhs, node:$rhs),
+                     (A64setcc node:$lhs, (sub 0, node:$rhs), equality_cond)>;
+
+// There are two layers of indirection here, driven by the following
+// considerations.
+//     + TableGen does not know CodeModel or Reloc so that decision should be
+//       made for a variable/address at ISelLowering.
+//     + The output of ISelLowering should be selectable (hence the Wrapper,
+//       rather than a bare target opcode)
+def SDTAArch64Wrapper : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<1, 2>,
+                                             SDTCisVT<3, i32>,
+                                             SDTCisPtrTy<0>]>;
+
+def A64WrapperSmall : SDNode<"AArch64ISD::WrapperSmall", SDTAArch64Wrapper>;
+
+
+def SDTAArch64GOTLoad : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+def A64GOTLoad : SDNode<"AArch64ISD::GOTLoad", SDTAArch64GOTLoad,
+                        [SDNPHasChain]>;
+
+
+// (A64BFI LHS, RHS, LSB, Width)
+def SDTA64BFI : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+                                     SDTCisSameAs<1, 2>,
+                                     SDTCisVT<3, i64>,
+                                     SDTCisVT<4, i64>]>;
+
+def A64Bfi : SDNode<"AArch64ISD::BFI", SDTA64BFI>;
+
+// (A64EXTR HiReg, LoReg, LSB)
+def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                      SDTCisVT<3, i64>]>;
+def A64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
+
+// (A64[SU]BFX Field, ImmR, ImmS).
+//
+// Note that ImmR and ImmS are already encoded for the actual instructions. The
+// more natural LSB and Width mix together to form ImmR and ImmS, something
+// which TableGen can't handle.
+def SDTA64BFX : SDTypeProfile<1, 3, [SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
+def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>;
+
+def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>;
+
+//===----------------------------------------------------------------------===//
+// Call sequence pseudo-instructions
+//===----------------------------------------------------------------------===//
+
+
+def SDT_AArch64Call : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def AArch64Call : SDNode<"AArch64ISD::Call", SDT_AArch64Call,
+                     [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+def AArch64tcret : SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64Call,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// The TLSDESCCALL node is a variant call which goes to an indirectly calculated
+// destination but needs a relocation against a fixed symbol. As such it has two
+// certain operands: the callee and the relocated variable.
+//
+// The TLS ABI only allows it to be selected to a BLR instructin (with
+// appropriate relocation).
+def SDTTLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
+
+def A64tlsdesc_blr : SDNode<"AArch64ISD::TLSDESCCALL", SDTTLSDescCall,
+                            [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
+                             SDNPVariadic]>;
+
+
+def SDT_AArch64CallSeqStart : SDCallSeqStart<[ SDTCisPtrTy<0> ]>;
+def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AArch64CallSeqStart,
+                                  [SDNPHasChain, SDNPOutGlue]>;
+
+def SDT_AArch64CallSeqEnd   : SDCallSeqEnd<[ SDTCisPtrTy<0>, SDTCisPtrTy<1> ]>;
+def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",   SDT_AArch64CallSeqEnd,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+
+
+// These pseudo-instructions have special semantics by virtue of being passed to
+// the InstrInfo constructor. CALLSEQ_START/CALLSEQ_END are produced by
+// LowerCall to (in our case) tell the back-end about stack adjustments for
+// arguments passed on the stack. Here we select those markers to
+// pseudo-instructions which explicitly set the stack, and finally in the
+// RegisterInfo we convert them to a true stack adjustment.
+let Defs = [XSP], Uses = [XSP] in {
+  def ADJCALLSTACKDOWN : PseudoInst<(outs), (ins i64imm:$amt),
+                                    [(AArch64callseq_start timm:$amt)]>;
+
+  def ADJCALLSTACKUP : PseudoInst<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+                                 [(AArch64callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic operation pseudo-instructions
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1 in {
+multiclass AtomicSizes<string opname> {
+  def _I8 : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$incr),
+          [(set i32:$dst, (!cast<SDNode>(opname # "_8") i64:$ptr, i32:$incr))]>;
+  def _I16 : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$incr),
+         [(set i32:$dst, (!cast<SDNode>(opname # "_16") i64:$ptr, i32:$incr))]>;
+  def _I32 : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$incr),
+         [(set i32:$dst, (!cast<SDNode>(opname # "_32") i64:$ptr, i32:$incr))]>;
+  def _I64 : PseudoInst<(outs GPR64:$dst), (ins GPR64:$ptr, GPR64:$incr),
+         [(set i64:$dst, (!cast<SDNode>(opname # "_64") i64:$ptr, i64:$incr))]>;
+}
+}
+
+defm ATOMIC_LOAD_ADD  : AtomicSizes<"atomic_load_add">;
+defm ATOMIC_LOAD_SUB  : AtomicSizes<"atomic_load_sub">;
+defm ATOMIC_LOAD_AND  : AtomicSizes<"atomic_load_and">;
+defm ATOMIC_LOAD_OR   : AtomicSizes<"atomic_load_or">;
+defm ATOMIC_LOAD_XOR  : AtomicSizes<"atomic_load_xor">;
+defm ATOMIC_LOAD_NAND : AtomicSizes<"atomic_load_nand">;
+defm ATOMIC_SWAP      : AtomicSizes<"atomic_swap">;
+let Defs = [NZCV] in {
+  // These operations need a CMP to calculate the correct value
+  defm ATOMIC_LOAD_MIN  : AtomicSizes<"atomic_load_min">;
+  defm ATOMIC_LOAD_MAX  : AtomicSizes<"atomic_load_max">;
+  defm ATOMIC_LOAD_UMIN : AtomicSizes<"atomic_load_umin">;
+  defm ATOMIC_LOAD_UMAX : AtomicSizes<"atomic_load_umax">;
+}
+
+let usesCustomInserter = 1, Defs = [NZCV] in {
+def ATOMIC_CMP_SWAP_I8
+  : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$old, GPR32:$new),
+            [(set i32:$dst, (atomic_cmp_swap_8 i64:$ptr, i32:$old, i32:$new))]>;
+def ATOMIC_CMP_SWAP_I16
+  : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$old, GPR32:$new),
+           [(set i32:$dst, (atomic_cmp_swap_16 i64:$ptr, i32:$old, i32:$new))]>;
+def ATOMIC_CMP_SWAP_I32
+  : PseudoInst<(outs GPR32:$dst), (ins GPR64:$ptr, GPR32:$old, GPR32:$new),
+           [(set i32:$dst, (atomic_cmp_swap_32 i64:$ptr, i32:$old, i32:$new))]>;
+def ATOMIC_CMP_SWAP_I64
+  : PseudoInst<(outs GPR64:$dst), (ins GPR64:$ptr, GPR64:$old, GPR64:$new),
+           [(set i64:$dst, (atomic_cmp_swap_64 i64:$ptr, i64:$old, i64:$new))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Add-subtract (extended register) instructions
+//===----------------------------------------------------------------------===//
+// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP
+
+// The RHS of these operations is conceptually a sign/zero-extended
+// register, optionally shifted left by 1-4. The extension can be a
+// NOP (e.g. "sxtx" sign-extending a 64-bit register to 64-bits) but
+// must be specified with one exception:
+
+// If one of the registers is sp/wsp then LSL is an alias for UXTW in
+// 32-bit instructions and UXTX in 64-bit versions, the shift amount
+// is not optional in that case (but can explicitly be 0), and the
+// entire suffix can be skipped (e.g. "add sp, x3, x2").
+
+multiclass extend_operands<string PREFIX, string Diag> {
+     def _asmoperand : AsmOperandClass {
+         let Name = PREFIX;
+         let RenderMethod = "addRegExtendOperands";
+         let PredicateMethod = "isRegExtend<A64SE::" # PREFIX # ">";
+         let DiagnosticType = "AddSubRegExtend" # Diag;
+     }
+
+     def _operand : Operand<i64>,
+                    ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 4; }]> {
+         let PrintMethod = "printRegExtendOperand<A64SE::" # PREFIX # ">";
+         let DecoderMethod = "DecodeRegExtendOperand";
+         let ParserMatchClass = !cast<AsmOperandClass>(PREFIX # "_asmoperand");
+     }
+}
+
+defm UXTB : extend_operands<"UXTB", "Small">;
+defm UXTH : extend_operands<"UXTH", "Small">;
+defm UXTW : extend_operands<"UXTW", "Small">;
+defm UXTX : extend_operands<"UXTX", "Large">;
+defm SXTB : extend_operands<"SXTB", "Small">;
+defm SXTH : extend_operands<"SXTH", "Small">;
+defm SXTW : extend_operands<"SXTW", "Small">;
+defm SXTX : extend_operands<"SXTX", "Large">;
+
+def LSL_extasmoperand : AsmOperandClass {
+    let Name = "RegExtendLSL";
+    let RenderMethod = "addRegExtendOperands";
+    let DiagnosticType = "AddSubRegExtendLarge";
+}
+
+def LSL_extoperand : Operand<i64> {
+    let ParserMatchClass = LSL_extasmoperand;
+}
+
+
+// The patterns for various sign-extensions are a little ugly and
+// non-uniform because everything has already been promoted to the
+// legal i64 and i32 types. We'll wrap the various variants up in a
+// class for use later.
+class extend_types {
+    dag uxtb; dag uxth; dag uxtw; dag uxtx;
+    dag sxtb; dag sxth; dag sxtw; dag sxtx;
+    ValueType ty;
+    RegisterClass GPR;
+}
+
+def extends_to_i64 : extend_types {
+    let uxtb = (and (anyext i32:$Rm), 255);
+    let uxth = (and (anyext i32:$Rm), 65535);
+    let uxtw = (zext i32:$Rm);
+    let uxtx = (i64 $Rm);
+
+    let sxtb = (sext_inreg (anyext i32:$Rm), i8);
+    let sxth = (sext_inreg (anyext i32:$Rm), i16);
+    let sxtw = (sext i32:$Rm);
+    let sxtx = (i64 $Rm);
+
+    let ty = i64;
+    let GPR = GPR64xsp;
+}
+
+
+def extends_to_i32 : extend_types {
+    let uxtb = (and i32:$Rm, 255);
+    let uxth = (and i32:$Rm, 65535);
+    let uxtw = (i32 i32:$Rm);
+    let uxtx = (i32 i32:$Rm);
+
+    let sxtb = (sext_inreg i32:$Rm, i8);
+    let sxth = (sext_inreg i32:$Rm, i16);
+    let sxtw = (i32 i32:$Rm);
+    let sxtx = (i32 i32:$Rm);
+
+    let ty = i32;
+    let GPR = GPR32wsp;
+}
+
+// Now, six of the extensions supported are easy and uniform: if the source size
+// is 32-bits or less, then Rm is always a 32-bit register. We'll instantiate
+// those instructions in one block.
+
+// The uxtx/sxtx could potentially be merged in, but three facts dissuaded me:
+//     + It would break the naming scheme: either ADDxx_uxtx or ADDww_uxtx would
+//       be impossible.
+//     + Patterns are very different as well.
+//     + Passing different registers would be ugly (more fields in extend_types
+//       would probably be the best option).
+multiclass addsub_exts<bit sf, bit op, bit S, string asmop,
+                       SDPatternOperator opfrag,
+                       dag outs, extend_types exts> {
+    def w_uxtb : A64I_addsubext<sf, op, S, 0b00, 0b000,
+                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTB_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag exts.ty:$Rn, (shl exts.uxtb, UXTB_operand:$Imm3))],
+                    NoItinerary>;
+    def w_uxth : A64I_addsubext<sf, op, S, 0b00, 0b001,
+                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTH_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag exts.ty:$Rn, (shl exts.uxth, UXTH_operand:$Imm3))],
+                    NoItinerary>;
+    def w_uxtw : A64I_addsubext<sf, op, S, 0b00, 0b010,
+                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTW_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag exts.ty:$Rn, (shl exts.uxtw, UXTW_operand:$Imm3))],
+                    NoItinerary>;
+
+    def w_sxtb : A64I_addsubext<sf, op, S, 0b00, 0b100,
+                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTB_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag exts.ty:$Rn, (shl exts.sxtb, SXTB_operand:$Imm3))],
+                    NoItinerary>;
+    def w_sxth : A64I_addsubext<sf, op, S, 0b00, 0b101,
+                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTH_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag exts.ty:$Rn, (shl exts.sxth, SXTH_operand:$Imm3))],
+                    NoItinerary>;
+    def w_sxtw : A64I_addsubext<sf, op, S, 0b00, 0b110,
+                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTW_operand:$Imm3),
+                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                    [(opfrag exts.ty:$Rn, (shl exts.sxtw, SXTW_operand:$Imm3))],
+                    NoItinerary>;
+}
+
+// These two could be merge in with the above, but their patterns aren't really
+// necessary and the naming-scheme would necessarily break:
+multiclass addsub_xxtx<bit op, bit S, string asmop, SDPatternOperator opfrag,
+                       dag outs> {
+    def x_uxtx : A64I_addsubext<0b1, op, S, 0b00, 0b011,
+                   outs,
+                   (ins GPR64xsp:$Rn, GPR64:$Rm, UXTX_operand:$Imm3),
+                   !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                   [(opfrag i64:$Rn, (shl i64:$Rm, UXTX_operand:$Imm3))],
+                   NoItinerary>;
+
+    def x_sxtx : A64I_addsubext<0b1, op, S, 0b00, 0b111,
+                   outs,
+                   (ins GPR64xsp:$Rn, GPR64:$Rm, SXTX_operand:$Imm3),
+                   !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                   [/* No Pattern: same as uxtx */],
+                   NoItinerary>;
+}
+
+multiclass addsub_wxtx<bit op, bit S, string asmop, dag outs> {
+    def w_uxtx : A64I_addsubext<0b0, op, S, 0b00, 0b011,
+                              outs,
+                              (ins GPR32wsp:$Rn, GPR32:$Rm, UXTX_operand:$Imm3),
+                              !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                              [/* No pattern: probably same as uxtw */],
+                              NoItinerary>;
+
+    def w_sxtx : A64I_addsubext<0b0, op, S, 0b00, 0b111,
+                              outs,
+                              (ins GPR32wsp:$Rn, GPR32:$Rm, SXTX_operand:$Imm3),
+                              !strconcat(asmop, "$Rn, $Rm, $Imm3"),
+                              [/* No Pattern: probably same as uxtw */],
+                              NoItinerary>;
+}
+
+class SetRD<RegisterClass RC, SDPatternOperator op>
+ : PatFrag<(ops node:$lhs, node:$rhs), (set RC:$Rd, (op node:$lhs, node:$rhs))>;
+class SetNZCV<SDPatternOperator op>
+  : PatFrag<(ops node:$lhs, node:$rhs), (set NZCV, (op node:$lhs, node:$rhs))>;
+
+defm ADDxx :addsub_exts<0b1, 0b0, 0b0, "add\t$Rd, ", SetRD<GPR64xsp, add>,
+                        (outs GPR64xsp:$Rd), extends_to_i64>,
+            addsub_xxtx<     0b0, 0b0, "add\t$Rd, ", SetRD<GPR64xsp, add>,
+                        (outs GPR64xsp:$Rd)>;
+defm ADDww :addsub_exts<0b0, 0b0, 0b0, "add\t$Rd, ", SetRD<GPR32wsp, add>,
+                        (outs GPR32wsp:$Rd), extends_to_i32>,
+            addsub_wxtx<     0b0, 0b0, "add\t$Rd, ",
+                        (outs GPR32wsp:$Rd)>;
+defm SUBxx :addsub_exts<0b1, 0b1, 0b0, "sub\t$Rd, ", SetRD<GPR64xsp, sub>,
+                        (outs GPR64xsp:$Rd), extends_to_i64>,
+            addsub_xxtx<     0b1, 0b0, "sub\t$Rd, ", SetRD<GPR64xsp, sub>,
+                        (outs GPR64xsp:$Rd)>;
+defm SUBww :addsub_exts<0b0, 0b1, 0b0, "sub\t$Rd, ", SetRD<GPR32wsp, sub>,
+                        (outs GPR32wsp:$Rd), extends_to_i32>,
+            addsub_wxtx<     0b1, 0b0, "sub\t$Rd, ",
+                        (outs GPR32wsp:$Rd)>;
+
+let Defs = [NZCV] in {
+defm ADDSxx :addsub_exts<0b1, 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR64, addc>,
+                         (outs GPR64:$Rd), extends_to_i64>,
+             addsub_xxtx<     0b0, 0b1, "adds\t$Rd, ", SetRD<GPR64, addc>,
+                         (outs GPR64:$Rd)>;
+defm ADDSww :addsub_exts<0b0, 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR32, addc>,
+                         (outs GPR32:$Rd), extends_to_i32>,
+             addsub_wxtx<     0b0, 0b1, "adds\t$Rd, ",
+                         (outs GPR32:$Rd)>;
+defm SUBSxx :addsub_exts<0b1, 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR64, subc>,
+                         (outs GPR64:$Rd), extends_to_i64>,
+             addsub_xxtx<     0b1, 0b1, "subs\t$Rd, ", SetRD<GPR64, subc>,
+                         (outs GPR64:$Rd)>;
+defm SUBSww :addsub_exts<0b0, 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR32, subc>,
+                         (outs GPR32:$Rd), extends_to_i32>,
+             addsub_wxtx<     0b1, 0b1, "subs\t$Rd, ",
+                         (outs GPR32:$Rd)>;
+
+
+let Rd = 0b11111, isCompare = 1 in {
+defm CMNx : addsub_exts<0b1, 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>,
+                        (outs), extends_to_i64>,
+            addsub_xxtx<     0b0, 0b1, "cmn\t", SetNZCV<A64cmn>, (outs)>;
+defm CMNw : addsub_exts<0b0, 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>,
+                        (outs), extends_to_i32>,
+            addsub_wxtx<     0b0, 0b1, "cmn\t", (outs)>;
+defm CMPx : addsub_exts<0b1, 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>,
+                        (outs), extends_to_i64>,
+            addsub_xxtx<     0b1, 0b1, "cmp\t", SetNZCV<A64cmp>, (outs)>;
+defm CMPw : addsub_exts<0b0, 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>,
+                        (outs), extends_to_i32>,
+            addsub_wxtx<     0b1, 0b1, "cmp\t", (outs)>;
+}
+}
+
+// Now patterns for the operation without a shift being needed. No patterns are
+// created for uxtx/sxtx since they're non-uniform and it's expected that
+// add/sub (shifted register) will handle those cases anyway.
+multiclass addsubext_noshift_patterns<string prefix, SDPatternOperator nodeop,
+                                      extend_types exts> {
+    def : Pat<(nodeop exts.ty:$Rn, exts.uxtb),
+              (!cast<Instruction>(prefix # "w_uxtb") $Rn, $Rm, 0)>;
+    def : Pat<(nodeop exts.ty:$Rn, exts.uxth),
+              (!cast<Instruction>(prefix # "w_uxth") $Rn, $Rm, 0)>;
+    def : Pat<(nodeop exts.ty:$Rn, exts.uxtw),
+              (!cast<Instruction>(prefix # "w_uxtw") $Rn, $Rm, 0)>;
+
+    def : Pat<(nodeop exts.ty:$Rn, exts.sxtb),
+              (!cast<Instruction>(prefix # "w_sxtb") $Rn, $Rm, 0)>;
+    def : Pat<(nodeop exts.ty:$Rn, exts.sxth),
+              (!cast<Instruction>(prefix # "w_sxth") $Rn, $Rm, 0)>;
+    def : Pat<(nodeop exts.ty:$Rn, exts.sxtw),
+              (!cast<Instruction>(prefix # "w_sxtw") $Rn, $Rm, 0)>;
+}
+
+defm : addsubext_noshift_patterns<"ADDxx", add, extends_to_i64>;
+defm : addsubext_noshift_patterns<"ADDww", add, extends_to_i32>;
+defm : addsubext_noshift_patterns<"SUBxx", sub, extends_to_i64>;
+defm : addsubext_noshift_patterns<"SUBww", sub, extends_to_i32>;
+
+defm : addsubext_noshift_patterns<"CMNx", A64cmn, extends_to_i64>;
+defm : addsubext_noshift_patterns<"CMNw", A64cmn, extends_to_i32>;
+defm : addsubext_noshift_patterns<"CMPx", A64cmp, extends_to_i64>;
+defm : addsubext_noshift_patterns<"CMPw", A64cmp, extends_to_i32>;
+
+// An extend of "lsl #imm" is valid if and only if one of Rn and Rd is
+// sp/wsp. It is synonymous with uxtx/uxtw depending on the size of the
+// operation. Also permitted in this case is complete omission of the argument,
+// which implies "lsl #0".
+multiclass lsl_aliases<string asmop, Instruction inst, RegisterClass GPR_Rd,
+                       RegisterClass GPR_Rn, RegisterClass GPR_Rm> {
+    def : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
+                    (inst GPR_Rd:$Rd, GPR_Rn:$Rn, GPR_Rm:$Rm, 0)>;
+
+    def : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm, $LSL"),
+                (inst GPR_Rd:$Rd, GPR_Rn:$Rn, GPR_Rm:$Rm, LSL_extoperand:$LSL)>;
+
+}
+
+defm : lsl_aliases<"add",  ADDxxx_uxtx,  Rxsp, GPR64xsp, GPR64>;
+defm : lsl_aliases<"add",  ADDxxx_uxtx,  GPR64xsp, Rxsp, GPR64>;
+defm : lsl_aliases<"add",  ADDwww_uxtw,  Rwsp, GPR32wsp, GPR32>;
+defm : lsl_aliases<"add",  ADDwww_uxtw,  GPR32wsp, Rwsp, GPR32>;
+defm : lsl_aliases<"sub",  SUBxxx_uxtx,  Rxsp, GPR64xsp, GPR64>;
+defm : lsl_aliases<"sub",  SUBxxx_uxtx,  GPR64xsp, Rxsp, GPR64>;
+defm : lsl_aliases<"sub",  SUBwww_uxtw,  Rwsp, GPR32wsp, GPR32>;
+defm : lsl_aliases<"sub",  SUBwww_uxtw,  GPR32wsp, Rwsp, GPR32>;
+
+// Rd cannot be sp for flag-setting variants so only half of the aliases are
+// needed.
+defm : lsl_aliases<"adds", ADDSxxx_uxtx, GPR64, Rxsp, GPR64>;
+defm : lsl_aliases<"adds", ADDSwww_uxtw, GPR32, Rwsp, GPR32>;
+defm : lsl_aliases<"subs", SUBSxxx_uxtx, GPR64, Rxsp, GPR64>;
+defm : lsl_aliases<"subs", SUBSwww_uxtw, GPR32, Rwsp, GPR32>;
+
+// CMP unfortunately has to be different because the instruction doesn't have a
+// dest register.
+multiclass cmp_lsl_aliases<string asmop, Instruction inst,
+                       RegisterClass GPR_Rn, RegisterClass GPR_Rm> {
+    def : InstAlias<!strconcat(asmop, " $Rn, $Rm"),
+                    (inst GPR_Rn:$Rn, GPR_Rm:$Rm, 0)>;
+
+    def : InstAlias<!strconcat(asmop, " $Rn, $Rm, $LSL"),
+                    (inst GPR_Rn:$Rn, GPR_Rm:$Rm, LSL_extoperand:$LSL)>;
+}
+
+defm : cmp_lsl_aliases<"cmp", CMPxx_uxtx, Rxsp, GPR64>;
+defm : cmp_lsl_aliases<"cmp", CMPww_uxtw, Rwsp, GPR32>;
+defm : cmp_lsl_aliases<"cmn", CMNxx_uxtx, Rxsp, GPR64>;
+defm : cmp_lsl_aliases<"cmn", CMNww_uxtw, Rwsp, GPR32>;
+
+//===----------------------------------------------------------------------===//
+// Add-subtract (immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, MOV
+
+// These instructions accept a 12-bit unsigned immediate, optionally shifted
+// left by 12 bits. Official assembly format specifies a 12 bit immediate with
+// one of "", "LSL #0", "LSL #12" supplementary operands.
+
+// There are surprisingly few ways to make this work with TableGen, so this
+// implementation has separate instructions for the "LSL #0" and "LSL #12"
+// variants.
+
+// If the MCInst retained a single combined immediate (which could be 0x123000,
+// for example) then both components (imm & shift) would have to be delegated to
+// a single assembly operand. This would entail a separate operand parser
+// (because the LSL would have to live in the same AArch64Operand as the
+// immediate to be accessible); assembly parsing is rather complex and
+// error-prone C++ code.
+//
+// By splitting the immediate, we can delegate handling this optional operand to
+// an InstAlias. Supporting functions to generate the correct MCInst are still
+// required, but these are essentially trivial and parsing can remain generic.
+//
+// Rejected plans with rationale:
+// ------------------------------
+//
+// In an ideal world you'de have two first class immediate operands (in
+// InOperandList, specifying imm12 and shift). Unfortunately this is not
+// selectable by any means I could discover.
+//
+// An Instruction with two MCOperands hidden behind a single entry in
+// InOperandList (expanded by ComplexPatterns and MIOperandInfo) was functional,
+// but required more C++ code to handle encoding/decoding. Parsing (the intended
+// main beneficiary) ended up equally complex because of the optional nature of
+// "LSL #0".
+//
+// Attempting to circumvent the need for a custom OperandParser above by giving
+// InstAliases without the "lsl #0" failed. add/sub could be accommodated but
+// the cmp/cmn aliases didn't use the MIOperandInfo to determine how operands
+// should be parsed: there was no way to accommodate an "lsl #12".
+
+let ParserMethod = "ParseImmWithLSLOperand",
+    RenderMethod = "addImmWithLSLOperands" in {
+  // Derived PredicateMethod fields are different for each
+  def addsubimm_lsl0_asmoperand : AsmOperandClass {
+    let Name = "AddSubImmLSL0";
+    // If an error is reported against this operand, instruction could also be a
+    // register variant.
+    let DiagnosticType = "AddSubSecondSource";
+  }
+
+  def addsubimm_lsl12_asmoperand : AsmOperandClass {
+    let Name = "AddSubImmLSL12";
+    let DiagnosticType = "AddSubSecondSource";
+  }
+}
+
+def shr_12_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue() >> 12, MVT::i32);
+}]>;
+
+def shr_12_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((-N->getSExtValue()) >> 12, MVT::i32);
+}]>;
+
+def neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-N->getSExtValue(), MVT::i32);
+}]>;
+
+
+multiclass addsub_imm_operands<ValueType ty> {
+ let PrintMethod = "printAddSubImmLSL0Operand",
+      EncoderMethod = "getAddSubImmOpValue",
+      ParserMatchClass = addsubimm_lsl0_asmoperand in {
+    def _posimm_lsl0 : Operand<ty>,
+        ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff) == 0; }]>;
+    def _negimm_lsl0 : Operand<ty>,
+        ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff) == 0; }],
+                neg_XFORM>;
+  }
+
+  let PrintMethod = "printAddSubImmLSL12Operand",
+      EncoderMethod = "getAddSubImmOpValue",
+      ParserMatchClass = addsubimm_lsl12_asmoperand in {
+    def _posimm_lsl12 : Operand<ty>,
+        ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff000) == 0; }],
+                shr_12_XFORM>;
+
+    def _negimm_lsl12 : Operand<ty>,
+        ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff000) == 0; }],
+                shr_12_neg_XFORM>;
+  }
+}
+
+// The add operands don't need any transformation
+defm addsubimm_operand_i32 : addsub_imm_operands<i32>;
+defm addsubimm_operand_i64 : addsub_imm_operands<i64>;
+
+multiclass addsubimm_varieties<string prefix, bit sf, bit op, bits<2> shift,
+                               string asmop, string cmpasmop,
+                               Operand imm_operand, Operand cmp_imm_operand,
+                               RegisterClass GPR, RegisterClass GPRsp,
+                               AArch64Reg ZR, ValueType Ty> {
+    // All registers for non-S variants allow SP
+  def _s : A64I_addsubimm<sf, op, 0b0, shift,
+                         (outs GPRsp:$Rd),
+                         (ins GPRsp:$Rn, imm_operand:$Imm12),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm12"),
+                         [(set Ty:$Rd, (add Ty:$Rn, imm_operand:$Imm12))],
+                         NoItinerary>;
+
+
+  // S variants can read SP but would write to ZR
+  def _S : A64I_addsubimm<sf, op, 0b1, shift,
+                         (outs GPR:$Rd),
+                         (ins GPRsp:$Rn, imm_operand:$Imm12),
+                         !strconcat(asmop, "s\t$Rd, $Rn, $Imm12"),
+                         [(set Ty:$Rd, (addc Ty:$Rn, imm_operand:$Imm12))],
+                         NoItinerary> {
+    let Defs = [NZCV];
+  }
+
+  // Note that the pattern here for ADDS is subtle. Canonically CMP
+  // a, b becomes SUBS a, b. If b < 0 then this is equivalent to
+  // ADDS a, (-b). This is not true in general.
+  def _cmp : A64I_addsubimm<sf, op, 0b1, shift,
+                            (outs), (ins GPRsp:$Rn, imm_operand:$Imm12),
+                            !strconcat(cmpasmop, " $Rn, $Imm12"),
+                            [(set NZCV,
+                                  (A64cmp Ty:$Rn, cmp_imm_operand:$Imm12))],
+                            NoItinerary> {
+    let Rd = 0b11111;
+    let Defs = [NZCV];
+    let isCompare = 1;
+  }
+}
+
+
+multiclass addsubimm_shifts<string prefix, bit sf, bit op,
+           string asmop, string cmpasmop, string operand, string cmpoperand,
+           RegisterClass GPR, RegisterClass GPRsp, AArch64Reg ZR,
+           ValueType Ty> {
+  defm _lsl0 : addsubimm_varieties<prefix # "_lsl0", sf, op, 0b00,
+                                   asmop, cmpasmop,
+                                   !cast<Operand>(operand # "_lsl0"),
+                                   !cast<Operand>(cmpoperand # "_lsl0"),
+                                   GPR, GPRsp, ZR, Ty>;
+
+  defm _lsl12 : addsubimm_varieties<prefix # "_lsl12", sf, op, 0b01,
+                                    asmop, cmpasmop,
+                                    !cast<Operand>(operand # "_lsl12"),
+                                    !cast<Operand>(cmpoperand # "_lsl12"),
+                                    GPR, GPRsp, ZR, Ty>;
+}
+
+defm ADDwwi : addsubimm_shifts<"ADDwi", 0b0, 0b0, "add", "cmn",
+                              "addsubimm_operand_i32_posimm",
+                              "addsubimm_operand_i32_negimm",
+                              GPR32, GPR32wsp, WZR, i32>;
+defm ADDxxi : addsubimm_shifts<"ADDxi", 0b1, 0b0, "add", "cmn",
+                              "addsubimm_operand_i64_posimm",
+                              "addsubimm_operand_i64_negimm",
+                              GPR64, GPR64xsp, XZR, i64>;
+defm SUBwwi : addsubimm_shifts<"SUBwi", 0b0, 0b1, "sub", "cmp",
+                              "addsubimm_operand_i32_negimm",
+                              "addsubimm_operand_i32_posimm",
+                              GPR32, GPR32wsp, WZR, i32>;
+defm SUBxxi : addsubimm_shifts<"SUBxi", 0b1, 0b1, "sub", "cmp",
+                              "addsubimm_operand_i64_negimm",
+                              "addsubimm_operand_i64_posimm",
+                              GPR64, GPR64xsp, XZR, i64>;
+
+multiclass MOVsp<RegisterClass GPRsp, RegisterClass SP, Instruction addop> {
+  def _fromsp : InstAlias<"mov $Rd, $Rn",
+                          (addop GPRsp:$Rd, SP:$Rn, 0),
+                          0b1>;
+
+  def _tosp : InstAlias<"mov $Rd, $Rn",
+                        (addop SP:$Rd, GPRsp:$Rn, 0),
+                        0b1>;
+}
+
+// Recall Rxsp is a RegisterClass containing *just* xsp.
+defm MOVxx : MOVsp<GPR64xsp, Rxsp, ADDxxi_lsl0_s>;
+defm MOVww : MOVsp<GPR32wsp, Rwsp, ADDwwi_lsl0_s>;
+
+//===----------------------------------------------------------------------===//
+// Add-subtract (shifted register) instructions
+//===----------------------------------------------------------------------===//
+// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, NEG, NEGS
+
+//===-------------------------------
+// 1. The "shifed register" operands. Shared with logical insts.
+//===-------------------------------
+
+multiclass shift_operands<string prefix, string form> {
+  def _asmoperand_i32 : AsmOperandClass {
+    let Name = "Shift" # form # "i32";
+    let RenderMethod = "addShiftOperands";
+    let PredicateMethod = "isShift<A64SE::" # form # ", false>";
+    let DiagnosticType = "AddSubRegShift32";
+  }
+
+  // Note that the operand type is intentionally i64 because the DAGCombiner
+  // puts these into a canonical form.
+  def _i32 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
+    let ParserMatchClass
+          = !cast<AsmOperandClass>(prefix # "_asmoperand_i32");
+    let PrintMethod = "printShiftOperand<A64SE::" # form # ">";
+    let DecoderMethod = "Decode32BitShiftOperand";
+  }
+
+  def _asmoperand_i64 : AsmOperandClass {
+      let Name = "Shift" # form # "i64";
+      let RenderMethod = "addShiftOperands";
+      let PredicateMethod = "isShift<A64SE::" # form # ", true>";
+      let DiagnosticType = "AddSubRegShift64";
+  }
+
+  def _i64 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
+    let ParserMatchClass
+          = !cast<AsmOperandClass>(prefix # "_asmoperand_i64");
+    let PrintMethod = "printShiftOperand<A64SE::" # form # ">";
+  }
+}
+
+defm lsl_operand : shift_operands<"lsl_operand", "LSL">;
+defm lsr_operand : shift_operands<"lsr_operand", "LSR">;
+defm asr_operand : shift_operands<"asr_operand", "ASR">;
+
+// Not used for add/sub, but defined here for completeness. The "logical
+// (shifted register)" instructions *do* have an ROR variant.
+defm ror_operand : shift_operands<"ror_operand", "ROR">;
+
+//===-------------------------------
+// 2. The basic 3.5-operand ADD/SUB/ADDS/SUBS instructions.
+//===-------------------------------
+
+// N.b. the commutable parameter is just !N. It will be first against the wall
+// when the revolution comes.
+multiclass addsub_shifts<string prefix, bit sf, bit op, bit s, bit commutable,
+                         string asmop, SDPatternOperator opfrag, ValueType ty,
+                         RegisterClass GPR, list<Register> defs> {
+  let isCommutable = commutable, Defs = defs in {
+  def _lsl : A64I_addsubshift<sf, op, s, 0b00,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set GPR:$Rd, (opfrag ty:$Rn, (shl ty:$Rm,
+                            !cast<Operand>("lsl_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+
+  def _lsr : A64I_addsubshift<sf, op, s, 0b01,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm,
+                            !cast<Operand>("lsr_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+
+  def _asr : A64I_addsubshift<sf, op, s, 0b10,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("asr_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm,
+                            !cast<Operand>("asr_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+  }
+
+  def _noshift
+      : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
+                 (!cast<Instruction>(prefix # "_lsl") GPR:$Rd, GPR:$Rn,
+                                                      GPR:$Rm, 0)>;
+
+  def : Pat<(opfrag ty:$Rn, ty:$Rm),
+            (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
+}
+
+multiclass addsub_sizes<string prefix, bit op, bit s, bit commutable,
+                         string asmop, SDPatternOperator opfrag,
+                         list<Register> defs> {
+  defm xxx : addsub_shifts<prefix # "xxx", 0b1, op, s,
+                           commutable, asmop, opfrag, i64, GPR64, defs>;
+  defm www : addsub_shifts<prefix # "www", 0b0, op, s,
+                           commutable, asmop, opfrag, i32, GPR32, defs>;
+}
+
+
+defm ADD : addsub_sizes<"ADD", 0b0, 0b0, 0b1, "add", add, []>;
+defm SUB : addsub_sizes<"SUB", 0b1, 0b0, 0b0, "sub", sub, []>;
+
+defm ADDS : addsub_sizes<"ADDS", 0b0, 0b1, 0b1, "adds", addc, [NZCV]>;
+defm SUBS : addsub_sizes<"SUBS", 0b1, 0b1, 0b0, "subs", subc, [NZCV]>;
+
+//===-------------------------------
+// 1. The NEG/NEGS aliases
+//===-------------------------------
+
+multiclass neg_alias<Instruction INST, RegisterClass GPR, Register ZR,
+                     ValueType ty, Operand shift_operand, SDNode shiftop> {
+   def : InstAlias<"neg $Rd, $Rm, $Imm6",
+                   (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
+
+   def : Pat<(sub 0, (shiftop ty:$Rm, shift_operand:$Imm6)),
+             (INST ZR, $Rm, shift_operand:$Imm6)>;
+}
+
+defm : neg_alias<SUBwww_lsl, GPR32, WZR, i32, lsl_operand_i32, shl>;
+defm : neg_alias<SUBwww_lsr, GPR32, WZR, i32, lsr_operand_i32, srl>;
+defm : neg_alias<SUBwww_asr, GPR32, WZR, i32, asr_operand_i32, sra>;
+def : InstAlias<"neg $Rd, $Rm", (SUBwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
+def : Pat<(sub 0, i32:$Rm), (SUBwww_lsl WZR, $Rm, 0)>;
+
+defm : neg_alias<SUBxxx_lsl, GPR64, XZR, i64, lsl_operand_i64, shl>;
+defm : neg_alias<SUBxxx_lsr, GPR64, XZR, i64, lsr_operand_i64, srl>;
+defm : neg_alias<SUBxxx_asr, GPR64, XZR, i64, asr_operand_i64, sra>;
+def : InstAlias<"neg $Rd, $Rm", (SUBxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
+def : Pat<(sub 0, i64:$Rm), (SUBxxx_lsl XZR, $Rm, 0)>;
+
+// NEGS doesn't get any patterns yet: defining multiple outputs means C++ has to
+// be involved.
+class negs_alias<Instruction INST, RegisterClass GPR,
+                 Register ZR, Operand shift_operand, SDNode shiftop>
+  : InstAlias<"negs $Rd, $Rm, $Imm6",
+              (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
+
+def : negs_alias<SUBSwww_lsl, GPR32, WZR, lsl_operand_i32, shl>;
+def : negs_alias<SUBSwww_lsr, GPR32, WZR, lsr_operand_i32, srl>;
+def : negs_alias<SUBSwww_asr, GPR32, WZR, asr_operand_i32, sra>;
+def : InstAlias<"negs $Rd, $Rm", (SUBSwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
+
+def : negs_alias<SUBSxxx_lsl, GPR64, XZR, lsl_operand_i64, shl>;
+def : negs_alias<SUBSxxx_lsr, GPR64, XZR, lsr_operand_i64, srl>;
+def : negs_alias<SUBSxxx_asr, GPR64, XZR, asr_operand_i64, sra>;
+def : InstAlias<"negs $Rd, $Rm", (SUBSxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
+
+//===-------------------------------
+// 1. The CMP/CMN aliases
+//===-------------------------------
+
+multiclass cmp_shifts<string prefix, bit sf, bit op, bit commutable,
+                      string asmop, SDPatternOperator opfrag, ValueType ty,
+                      RegisterClass GPR> {
+  let isCommutable = commutable, Rd = 0b11111, Defs = [NZCV] in {
+  def _lsl : A64I_addsubshift<sf, op, 0b1, 0b00,
+                       (outs),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
+                       [(set NZCV, (opfrag ty:$Rn, (shl ty:$Rm,
+                            !cast<Operand>("lsl_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+
+  def _lsr : A64I_addsubshift<sf, op, 0b1, 0b01,
+                       (outs),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
+                       [(set NZCV, (opfrag ty:$Rn, (srl ty:$Rm,
+                            !cast<Operand>("lsr_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+
+  def _asr : A64I_addsubshift<sf, op, 0b1, 0b10,
+                       (outs),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("asr_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
+                       [(set NZCV, (opfrag ty:$Rn, (sra ty:$Rm,
+                            !cast<Operand>("asr_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+  }
+
+  def _noshift
+      : InstAlias<!strconcat(asmop, " $Rn, $Rm"),
+                 (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
+
+  def : Pat<(opfrag ty:$Rn, ty:$Rm),
+            (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
+}
+
+defm CMPww : cmp_shifts<"CMPww", 0b0, 0b1, 0b0, "cmp", A64cmp, i32, GPR32>;
+defm CMPxx : cmp_shifts<"CMPxx", 0b1, 0b1, 0b0, "cmp", A64cmp, i64, GPR64>;
+
+defm CMNww : cmp_shifts<"CMNww", 0b0, 0b0, 0b1, "cmn", A64cmn, i32, GPR32>;
+defm CMNxx : cmp_shifts<"CMNxx", 0b1, 0b0, 0b1, "cmn", A64cmn, i64, GPR64>;
+
+//===----------------------------------------------------------------------===//
+// Add-subtract (with carry) instructions
+//===----------------------------------------------------------------------===//
+// Contains: ADC, ADCS, SBC, SBCS + aliases NGC, NGCS
+
+multiclass A64I_addsubcarrySizes<bit op, bit s, string asmop> {
+  let Uses = [NZCV] in {
+    def www : A64I_addsubcarry<0b0, op, s, 0b000000,
+                               (outs GPR32:$Rd), (ins GPR32:$Rn, GPR32:$Rm),
+                               !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                               [], NoItinerary>;
+
+    def xxx : A64I_addsubcarry<0b1, op, s, 0b000000,
+                               (outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
+                               !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                               [], NoItinerary>;
+  }
+}
+
+let isCommutable = 1 in {
+  defm ADC : A64I_addsubcarrySizes<0b0, 0b0, "adc">;
+}
+
+defm SBC : A64I_addsubcarrySizes<0b1, 0b0, "sbc">;
+
+let Defs = [NZCV] in {
+  let isCommutable = 1 in {
+    defm ADCS : A64I_addsubcarrySizes<0b0, 0b1, "adcs">;
+  }
+
+  defm SBCS : A64I_addsubcarrySizes<0b1, 0b1, "sbcs">;
+}
+
+def : InstAlias<"ngc $Rd, $Rm", (SBCwww GPR32:$Rd, WZR, GPR32:$Rm)>;
+def : InstAlias<"ngc $Rd, $Rm", (SBCxxx GPR64:$Rd, XZR, GPR64:$Rm)>;
+def : InstAlias<"ngcs $Rd, $Rm", (SBCSwww GPR32:$Rd, WZR, GPR32:$Rm)>;
+def : InstAlias<"ngcs $Rd, $Rm", (SBCSxxx GPR64:$Rd, XZR, GPR64:$Rm)>;
+
+// Note that adde and sube can form a chain longer than two (e.g. for 256-bit
+// addition). So the flag-setting instructions are appropriate.
+def : Pat<(adde i32:$Rn, i32:$Rm), (ADCSwww $Rn, $Rm)>;
+def : Pat<(adde i64:$Rn, i64:$Rm), (ADCSxxx $Rn, $Rm)>;
+def : Pat<(sube i32:$Rn, i32:$Rm), (SBCSwww $Rn, $Rm)>;
+def : Pat<(sube i64:$Rn, i64:$Rm), (SBCSxxx $Rn, $Rm)>;
+
+//===----------------------------------------------------------------------===//
+// Bitfield
+//===----------------------------------------------------------------------===//
+// Contains: SBFM, BFM, UBFM, [SU]XT[BHW], ASR, LSR, LSL, SBFI[ZX], BFI, BFXIL,
+//     UBFIZ, UBFX
+
+// Because of the rather complicated nearly-overlapping aliases, the decoding of
+// this range of instructions is handled manually. The architectural
+// instructions are BFM, SBFM and UBFM but a disassembler should never produce
+// these.
+//
+// In the end, the best option was to use BFM instructions for decoding under
+// almost all circumstances, but to create aliasing *Instructions* for each of
+// the canonical forms and specify a completely custom decoder which would
+// substitute the correct MCInst as needed.
+//
+// This also simplifies instruction selection, parsing etc because the MCInsts
+// have a shape that's closer to their use in code.
+
+//===-------------------------------
+// 1. The architectural BFM instructions
+//===-------------------------------
+
+def uimm5_asmoperand : AsmOperandClass {
+  let Name = "UImm5";
+  let PredicateMethod = "isUImm<5>";
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "UImm5";
+}
+
+def uimm6_asmoperand : AsmOperandClass {
+  let Name = "UImm6";
+  let PredicateMethod = "isUImm<6>";
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "UImm6";
+}
+
+def bitfield32_imm : Operand<i64>,
+                     ImmLeaf<i64, [{ return Imm >= 0 && Imm < 32; }]> {
+  let ParserMatchClass = uimm5_asmoperand;
+
+  let DecoderMethod = "DecodeBitfield32ImmOperand";
+}
+
+
+def bitfield64_imm : Operand<i64>,
+                     ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
+  let ParserMatchClass = uimm6_asmoperand;
+
+  // Default decoder works in 64-bit case: the 6-bit field can take any value.
+}
+
+multiclass A64I_bitfieldSizes<bits<2> opc, string asmop> {
+  def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
+                    (ins GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
+                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
+                    [], NoItinerary> {
+    let DecoderMethod = "DecodeBitfieldInstruction";
+  }
+
+  def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
+                    (ins GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
+                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
+                    [], NoItinerary> {
+    let DecoderMethod = "DecodeBitfieldInstruction";
+  }
+}
+
+defm SBFM : A64I_bitfieldSizes<0b00, "sbfm">;
+defm UBFM : A64I_bitfieldSizes<0b10, "ubfm">;
+
+// BFM instructions modify the destination register rather than defining it
+// completely.
+def BFMwwii :
+  A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
+        (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
+        "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+  let DecoderMethod = "DecodeBitfieldInstruction";
+  let Constraints = "$src = $Rd";
+}
+
+def BFMxxii :
+  A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
+        (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
+        "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+  let DecoderMethod = "DecodeBitfieldInstruction";
+  let Constraints = "$src = $Rd";
+}
+
+
+//===-------------------------------
+// 2. Extend aliases to 64-bit dest
+//===-------------------------------
+
+// Unfortunately the extensions that end up as 64-bits cannot be handled by an
+// instruction alias: their syntax is (for example) "SXTB x0, w0", which needs
+// to be mapped to "SBFM x0, x0, #0, 7" (changing the class of Rn). InstAlias is
+// not capable of such a map as far as I'm aware
+
+// Note that these instructions are strictly more specific than the
+// BFM ones (in ImmR) so they can handle their own decoding.
+class A64I_bf_ext<bit sf, bits<2> opc, RegisterClass GPRDest, ValueType dty,
+                    string asmop, bits<6> imms, dag pattern>
+  : A64I_bitfield<sf, opc, sf,
+                  (outs GPRDest:$Rd), (ins GPR32:$Rn),
+                  !strconcat(asmop, "\t$Rd, $Rn"),
+                  [(set dty:$Rd, pattern)], NoItinerary> {
+  let ImmR = 0b000000;
+  let ImmS = imms;
+}
+
+// Signed extensions
+def SXTBxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtb", 7,
+                         (sext_inreg (anyext i32:$Rn), i8)>;
+def SXTBww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxtb", 7,
+                         (sext_inreg i32:$Rn, i8)>;
+def SXTHxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxth", 15,
+                         (sext_inreg (anyext i32:$Rn), i16)>;
+def SXTHww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxth", 15,
+                         (sext_inreg i32:$Rn, i16)>;
+def SXTWxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtw", 31, (sext i32:$Rn)>;
+
+// Unsigned extensions
+def UXTBww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxtb", 7,
+                         (and i32:$Rn, 255)>;
+def UXTHww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxth", 15,
+                         (and i32:$Rn, 65535)>;
+
+// The 64-bit unsigned variants are not strictly architectural but recommended
+// for consistency.
+let isAsmParserOnly = 1 in {
+  def UXTBxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxtb", 7,
+                           (and (anyext i32:$Rn), 255)>;
+  def UXTHxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxth", 15,
+                           (and (anyext i32:$Rn), 65535)>;
+}
+
+// Extra patterns for when the source register is actually 64-bits
+// too. There's no architectural difference here, it's just LLVM
+// shinanigans. There's no need for equivalent zero-extension patterns
+// because they'll already be caught by logical (immediate) matching.
+def : Pat<(sext_inreg i64:$Rn, i8),
+          (SXTBxw (EXTRACT_SUBREG $Rn, sub_32))>;
+def : Pat<(sext_inreg i64:$Rn, i16),
+          (SXTHxw (EXTRACT_SUBREG $Rn, sub_32))>;
+def : Pat<(sext_inreg i64:$Rn, i32),
+          (SXTWxw (EXTRACT_SUBREG $Rn, sub_32))>;
+
+
+//===-------------------------------
+// 3. Aliases for ASR and LSR (the simple shifts)
+//===-------------------------------
+
+// These also handle their own decoding because ImmS being set makes
+// them take precedence over BFM.
+multiclass A64I_shift<bits<2> opc, string asmop, SDNode opnode> {
+  def wwi : A64I_bitfield<0b0, opc, 0b0,
+                    (outs GPR32:$Rd), (ins GPR32:$Rn, bitfield32_imm:$ImmR),
+                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
+                    [(set i32:$Rd, (opnode i32:$Rn, bitfield32_imm:$ImmR))],
+                    NoItinerary> {
+    let ImmS = 31;
+  }
+
+  def xxi : A64I_bitfield<0b1, opc, 0b1,
+                    (outs GPR64:$Rd), (ins GPR64:$Rn, bitfield64_imm:$ImmR),
+                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
+                    [(set i64:$Rd, (opnode i64:$Rn, bitfield64_imm:$ImmR))],
+                    NoItinerary> {
+    let ImmS = 63;
+  }
+
+}
+
+defm ASR : A64I_shift<0b00, "asr", sra>;
+defm LSR : A64I_shift<0b10, "lsr", srl>;
+
+//===-------------------------------
+// 4. Aliases for LSL
+//===-------------------------------
+
+// Unfortunately LSL and subsequent aliases are much more complicated. We need
+// to be able to say certain output instruction fields depend in a complex
+// manner on combinations of input assembly fields).
+//
+// MIOperandInfo *might* have been able to do it, but at the cost of
+// significantly more C++ code.
+
+// N.b. contrary to usual practice these operands store the shift rather than
+// the machine bits in an MCInst. The complexity overhead of consistency
+// outweighed the benefits in this case (custom asmparser, printer and selection
+// vs custom encoder).
+def bitfield32_lsl_imm : Operand<i64>,
+                         ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
+  let ParserMatchClass = uimm5_asmoperand;
+  let EncoderMethod = "getBitfield32LSLOpValue";
+}
+
+def bitfield64_lsl_imm : Operand<i64>,
+                         ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
+  let ParserMatchClass = uimm6_asmoperand;
+  let EncoderMethod = "getBitfield64LSLOpValue";
+}
+
+class A64I_bitfield_lsl<bit sf, RegisterClass GPR, ValueType ty,
+                        Operand operand>
+  : A64I_bitfield<sf, 0b10, sf, (outs GPR:$Rd), (ins GPR:$Rn, operand:$FullImm),
+                  "lsl\t$Rd, $Rn, $FullImm",
+                  [(set ty:$Rd, (shl ty:$Rn, operand:$FullImm))],
+                  NoItinerary> {
+  bits<12> FullImm;
+  let ImmR = FullImm{5-0};
+  let ImmS = FullImm{11-6};
+
+  // No disassembler allowed because it would overlap with BFM which does the
+  // actual work.
+  let isAsmParserOnly = 1;
+}
+
+def LSLwwi : A64I_bitfield_lsl<0b0, GPR32, i32, bitfield32_lsl_imm>;
+def LSLxxi : A64I_bitfield_lsl<0b1, GPR64, i64, bitfield64_lsl_imm>;
+
+//===-------------------------------
+// 5. Aliases for bitfield extract instructions
+//===-------------------------------
+
+def bfx32_width_asmoperand : AsmOperandClass {
+  let Name = "BFX32Width";
+  let PredicateMethod = "isBitfieldWidth<32>";
+  let RenderMethod = "addBFXWidthOperands";
+  let DiagnosticType = "Width32";
+}
+
+def bfx32_width : Operand<i64>, ImmLeaf<i64, [{ return true; }]> {
+  let PrintMethod = "printBFXWidthOperand";
+  let ParserMatchClass = bfx32_width_asmoperand;
+}
+
+def bfx64_width_asmoperand : AsmOperandClass {
+  let Name = "BFX64Width";
+  let PredicateMethod = "isBitfieldWidth<64>";
+  let RenderMethod = "addBFXWidthOperands";
+  let DiagnosticType = "Width64";
+}
+
+def bfx64_width : Operand<i64> {
+  let PrintMethod = "printBFXWidthOperand";
+  let ParserMatchClass = bfx64_width_asmoperand;
+}
+
+
+multiclass A64I_bitfield_extract<bits<2> opc, string asmop, SDNode op> {
+  def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
+                       (ins GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
+                       !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
+                       [(set i32:$Rd, (op i32:$Rn, imm:$ImmR, imm:$ImmS))],
+                       NoItinerary> {
+    // As above, no disassembler allowed.
+    let isAsmParserOnly = 1;
+  }
+
+  def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
+                       (ins GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
+                       !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
+                       [(set i64:$Rd, (op i64:$Rn, imm:$ImmR, imm:$ImmS))],
+                       NoItinerary> {
+    // As above, no disassembler allowed.
+    let isAsmParserOnly = 1;
+  }
+}
+
+defm SBFX :  A64I_bitfield_extract<0b00, "sbfx", A64Sbfx>;
+defm UBFX :  A64I_bitfield_extract<0b10, "ubfx", A64Ubfx>;
+
+// Again, variants based on BFM modify Rd so need it as an input too.
+def BFXILwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
+           (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
+           "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+  // As above, no disassembler allowed.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src = $Rd";
+}
+
+def BFXILxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
+           (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
+           "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+  // As above, no disassembler allowed.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src = $Rd";
+}
+
+// SBFX instructions can do a 1-instruction sign-extension of boolean values.
+def : Pat<(sext_inreg i64:$Rn, i1), (SBFXxxii $Rn, 0, 0)>;
+def : Pat<(sext_inreg i32:$Rn, i1), (SBFXwwii $Rn, 0, 0)>;
+def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)),
+          (SBFXxxii (SUBREG_TO_REG (i64 0), $Rn, sub_32), 0, 0)>;
+
+// UBFX makes sense as an implementation of a 64-bit zero-extension too. Could
+// use either 64-bit or 32-bit variant, but 32-bit might be more efficient.
+def : Pat<(zext i32:$Rn), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
+                                         sub_32)>;
+
+//===-------------------------------
+// 6. Aliases for bitfield insert instructions
+//===-------------------------------
+
+def bfi32_lsb_asmoperand : AsmOperandClass {
+  let Name = "BFI32LSB";
+  let PredicateMethod = "isUImm<5>";
+  let RenderMethod = "addBFILSBOperands<32>";
+  let DiagnosticType = "UImm5";
+}
+
+def bfi32_lsb : Operand<i64>,
+                ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
+  let PrintMethod = "printBFILSBOperand<32>";
+  let ParserMatchClass = bfi32_lsb_asmoperand;
+}
+
+def bfi64_lsb_asmoperand : AsmOperandClass {
+  let Name = "BFI64LSB";
+  let PredicateMethod = "isUImm<6>";
+  let RenderMethod = "addBFILSBOperands<64>";
+  let DiagnosticType = "UImm6";
+}
+
+def bfi64_lsb : Operand<i64>,
+                ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
+  let PrintMethod = "printBFILSBOperand<64>";
+  let ParserMatchClass = bfi64_lsb_asmoperand;
+}
+
+// Width verification is performed during conversion so width operand can be
+// shared between 32/64-bit cases. Still needed for the print method though
+// because ImmR encodes "width - 1".
+def bfi32_width_asmoperand : AsmOperandClass {
+  let Name = "BFI32Width";
+  let PredicateMethod = "isBitfieldWidth<32>";
+  let RenderMethod = "addBFIWidthOperands";
+  let DiagnosticType = "Width32";
+}
+
+def bfi32_width : Operand<i64>,
+                  ImmLeaf<i64, [{ return Imm >= 1 && Imm <= 32; }]> {
+  let PrintMethod = "printBFIWidthOperand";
+  let ParserMatchClass = bfi32_width_asmoperand;
+}
+
+def bfi64_width_asmoperand : AsmOperandClass {
+  let Name = "BFI64Width";
+  let PredicateMethod = "isBitfieldWidth<64>";
+  let RenderMethod = "addBFIWidthOperands";
+  let DiagnosticType = "Width64";
+}
+
+def bfi64_width : Operand<i64>,
+                  ImmLeaf<i64, [{ return Imm >= 1 && Imm <= 64; }]> {
+  let PrintMethod = "printBFIWidthOperand";
+  let ParserMatchClass = bfi64_width_asmoperand;
+}
+
+multiclass A64I_bitfield_insert<bits<2> opc, string asmop> {
+  def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
+                           (ins GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS),
+                           !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
+                           [], NoItinerary> {
+    // As above, no disassembler allowed.
+    let isAsmParserOnly = 1;
+  }
+
+  def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
+                           (ins GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS),
+                           !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
+                           [], NoItinerary> {
+    // As above, no disassembler allowed.
+    let isAsmParserOnly = 1;
+  }
+}
+
+defm SBFIZ :  A64I_bitfield_insert<0b00, "sbfiz">;
+defm UBFIZ :  A64I_bitfield_insert<0b10, "ubfiz">;
+
+
+def BFIwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
+                (ins GPR32:$src, GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS),
+                "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+  // As above, no disassembler allowed.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src = $Rd";
+}
+
+def BFIxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
+                (ins GPR64:$src, GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS),
+                "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary> {
+  // As above, no disassembler allowed.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src = $Rd";
+}
+
+//===----------------------------------------------------------------------===//
+// Compare and branch (immediate)
+//===----------------------------------------------------------------------===//
+// Contains: CBZ, CBNZ
+
+class label_asmoperand<int width, int scale> : AsmOperandClass {
+  let Name = "Label" # width # "_" # scale;
+  let PredicateMethod = "isLabel<" # width # "," # scale # ">";
+  let RenderMethod = "addLabelOperands<" # width # ", " # scale # ">";
+  let DiagnosticType = "Label";
+}
+
+def label_wid19_scal4_asmoperand : label_asmoperand<19, 4>;
+
+// All conditional immediate branches are the same really: 19 signed bits scaled
+// by the instruction-size (4).
+def bcc_target : Operand<OtherVT> {
+  // This label is a 19-bit offset from PC, scaled by the instruction-width: 4.
+  let ParserMatchClass = label_wid19_scal4_asmoperand;
+  let PrintMethod = "printLabelOperand<19, 4>";
+  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_condbr>";
+  let OperandType = "OPERAND_PCREL";
+}
+
+multiclass cmpbr_sizes<bit op, string asmop, ImmLeaf SETOP> {
+  let isBranch = 1, isTerminator = 1 in {
+  def x : A64I_cmpbr<0b1, op,
+                     (outs),
+                     (ins GPR64:$Rt, bcc_target:$Label),
+                     !strconcat(asmop,"\t$Rt, $Label"),
+                     [(A64br_cc (A64cmp i64:$Rt, 0), SETOP, bb:$Label)],
+                     NoItinerary>;
+
+  def w : A64I_cmpbr<0b0, op,
+                     (outs),
+                     (ins GPR32:$Rt, bcc_target:$Label),
+                     !strconcat(asmop,"\t$Rt, $Label"),
+                     [(A64br_cc (A64cmp i32:$Rt, 0), SETOP, bb:$Label)],
+                     NoItinerary>;
+  }
+}
+
+defm CBZ  : cmpbr_sizes<0b0, "cbz",  ImmLeaf<i32, [{
+  return Imm == A64CC::EQ;
+}]> >;
+defm CBNZ : cmpbr_sizes<0b1, "cbnz", ImmLeaf<i32, [{
+  return Imm == A64CC::NE;
+}]> >;
+
+//===----------------------------------------------------------------------===//
+// Conditional branch (immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: B.cc
+
+def cond_code_asmoperand : AsmOperandClass {
+  let Name = "CondCode";
+  let DiagnosticType = "CondCode";
+}
+
+def cond_code : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm <= 15;
+}]> {
+  let PrintMethod = "printCondCodeOperand";
+  let ParserMatchClass = cond_code_asmoperand;
+}
+
+def Bcc : A64I_condbr<0b0, 0b0, (outs),
+                (ins cond_code:$Cond, bcc_target:$Label),
+                "b.$Cond $Label", [(A64br_cc NZCV, (i32 imm:$Cond), bb:$Label)],
+                NoItinerary> {
+  let Uses = [NZCV];
+  let isBranch = 1;
+  let isTerminator = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Conditional compare (immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: CCMN, CCMP
+
+def uimm4_asmoperand : AsmOperandClass {
+  let Name = "UImm4";
+  let PredicateMethod = "isUImm<4>";
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "UImm4";
+}
+
+def uimm4 : Operand<i32> {
+  let ParserMatchClass = uimm4_asmoperand;
+}
+
+def uimm5 : Operand<i32> {
+  let ParserMatchClass = uimm5_asmoperand;
+}
+
+// The only difference between this operand and the one for instructions like
+// B.cc is that it's parsed manually. The other get parsed implicitly as part of
+// the mnemonic handling.
+def cond_code_op_asmoperand : AsmOperandClass {
+  let Name = "CondCodeOp";
+  let RenderMethod = "addCondCodeOperands";
+  let PredicateMethod = "isCondCode";
+  let ParserMethod = "ParseCondCodeOperand";
+  let DiagnosticType = "CondCode";
+}
+
+def cond_code_op : Operand<i32> {
+  let PrintMethod = "printCondCodeOperand";
+  let ParserMatchClass = cond_code_op_asmoperand;
+}
+
+class A64I_condcmpimmImpl<bit sf, bit op, RegisterClass GPR, string asmop>
+  : A64I_condcmpimm<sf, op, 0b0, 0b0, 0b1, (outs),
+                (ins GPR:$Rn, uimm5:$UImm5, uimm4:$NZCVImm, cond_code_op:$Cond),
+                !strconcat(asmop, "\t$Rn, $UImm5, $NZCVImm, $Cond"),
+                [], NoItinerary> {
+  let Defs = [NZCV];
+}
+
+def CCMNwi : A64I_condcmpimmImpl<0b0, 0b0, GPR32, "ccmn">;
+def CCMNxi : A64I_condcmpimmImpl<0b1, 0b0, GPR64, "ccmn">;
+def CCMPwi : A64I_condcmpimmImpl<0b0, 0b1, GPR32, "ccmp">;
+def CCMPxi : A64I_condcmpimmImpl<0b1, 0b1, GPR64, "ccmp">;
+
+//===----------------------------------------------------------------------===//
+// Conditional compare (register) instructions
+//===----------------------------------------------------------------------===//
+// Contains: CCMN, CCMP
+
+class A64I_condcmpregImpl<bit sf, bit op, RegisterClass GPR, string asmop>
+  : A64I_condcmpreg<sf, op, 0b0, 0b0, 0b1,
+                    (outs),
+                    (ins GPR:$Rn, GPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond),
+                    !strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
+                    [], NoItinerary> {
+  let Defs = [NZCV];
+}
+
+def CCMNww : A64I_condcmpregImpl<0b0, 0b0, GPR32, "ccmn">;
+def CCMNxx : A64I_condcmpregImpl<0b1, 0b0, GPR64, "ccmn">;
+def CCMPww : A64I_condcmpregImpl<0b0, 0b1, GPR32, "ccmp">;
+def CCMPxx : A64I_condcmpregImpl<0b1, 0b1, GPR64, "ccmp">;
+
+//===----------------------------------------------------------------------===//
+// Conditional select instructions
+//===----------------------------------------------------------------------===//
+// Contains: CSEL, CSINC, CSINV, CSNEG + aliases CSET, CSETM, CINC, CINV, CNEG
+
+// Condition code which is encoded as the inversion (semantically rather than
+// bitwise) in the instruction.
+def inv_cond_code_op_asmoperand : AsmOperandClass {
+  let Name = "InvCondCodeOp";
+  let RenderMethod = "addInvCondCodeOperands";
+  let PredicateMethod = "isCondCode";
+  let ParserMethod = "ParseCondCodeOperand";
+  let DiagnosticType = "CondCode";
+}
+
+def inv_cond_code_op : Operand<i32> {
+  let ParserMatchClass = inv_cond_code_op_asmoperand;
+}
+
+// Having a separate operand for the selectable use-case is debatable, but gives
+// consistency with cond_code.
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+  A64CC::CondCodes CC = static_cast<A64CC::CondCodes>(N->getZExtValue());
+  return CurDAG->getTargetConstant(A64InvertCondCode(CC), MVT::i32);
+}]>;
+
+def inv_cond_code
+  : ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 15; }], inv_cond_XFORM>;
+
+
+multiclass A64I_condselSizes<bit op, bits<2> op2, string asmop,
+                             SDPatternOperator select> {
+  let Uses = [NZCV] in {
+    def wwwc : A64I_condsel<0b0, op, 0b0, op2,
+                            (outs GPR32:$Rd),
+                            (ins GPR32:$Rn, GPR32:$Rm, cond_code_op:$Cond),
+                            !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"),
+                            [(set i32:$Rd, (select i32:$Rn, i32:$Rm))],
+                            NoItinerary>;
+
+
+    def xxxc : A64I_condsel<0b1, op, 0b0, op2,
+                            (outs GPR64:$Rd),
+                            (ins GPR64:$Rn, GPR64:$Rm, cond_code_op:$Cond),
+                            !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"),
+                            [(set i64:$Rd, (select i64:$Rn, i64:$Rm))],
+                            NoItinerary>;
+  }
+}
+
+def simple_select
+  : PatFrag<(ops node:$lhs, node:$rhs),
+            (A64select_cc NZCV, node:$lhs, node:$rhs, (i32 imm:$Cond))>;
+
+class complex_select<SDPatternOperator opnode>
+  : PatFrag<(ops node:$lhs, node:$rhs),
+        (A64select_cc NZCV, node:$lhs, (opnode node:$rhs), (i32 imm:$Cond))>;
+
+
+defm CSEL : A64I_condselSizes<0b0, 0b00, "csel", simple_select>;
+defm CSINC : A64I_condselSizes<0b0, 0b01, "csinc",
+                               complex_select<PatFrag<(ops node:$val),
+                                                      (add node:$val, 1)>>>;
+defm CSINV : A64I_condselSizes<0b1, 0b00, "csinv", complex_select<not>>;
+defm CSNEG : A64I_condselSizes<0b1, 0b01, "csneg", complex_select<ineg>>;
+
+// Now the instruction aliases, which fit nicely into LLVM's model:
+
+def : InstAlias<"cset $Rd, $Cond",
+                (CSINCwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>;
+def : InstAlias<"cset $Rd, $Cond",
+                (CSINCxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>;
+def : InstAlias<"csetm $Rd, $Cond",
+                (CSINVwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>;
+def : InstAlias<"csetm $Rd, $Cond",
+                (CSINVxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>;
+def : InstAlias<"cinc $Rd, $Rn, $Cond",
+           (CSINCwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>;
+def : InstAlias<"cinc $Rd, $Rn, $Cond",
+           (CSINCxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>;
+def : InstAlias<"cinv $Rd, $Rn, $Cond",
+           (CSINVwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>;
+def : InstAlias<"cinv $Rd, $Rn, $Cond",
+           (CSINVxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>;
+def : InstAlias<"cneg $Rd, $Rn, $Cond",
+           (CSNEGwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>;
+def : InstAlias<"cneg $Rd, $Rn, $Cond",
+           (CSNEGxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>;
+
+// Finally some helper patterns.
+
+// For CSET (a.k.a. zero-extension of icmp)
+def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond),
+          (CSINCwwwc WZR, WZR, cond_code:$Cond)>;
+def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond),
+          (CSINCwwwc WZR, WZR, inv_cond_code:$Cond)>;
+
+def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond),
+          (CSINCxxxc XZR, XZR, cond_code:$Cond)>;
+def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond),
+          (CSINCxxxc XZR, XZR, inv_cond_code:$Cond)>;
+
+// For CSETM (a.k.a. sign-extension of icmp)
+def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond),
+          (CSINVwwwc WZR, WZR, cond_code:$Cond)>;
+def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond),
+          (CSINVwwwc WZR, WZR, inv_cond_code:$Cond)>;
+
+def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond),
+          (CSINVxxxc XZR, XZR, cond_code:$Cond)>;
+def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond),
+          (CSINVxxxc XZR, XZR, inv_cond_code:$Cond)>;
+
+// CINC, CINV and CNEG get dealt with automatically, which leaves the issue of
+// commutativity. The instructions are to complex for isCommutable to be used,
+// so we have to create the patterns manually:
+
+// No commutable pattern for CSEL since the commuted version is isomorphic.
+
+// CSINC
+def :Pat<(A64select_cc NZCV, (add i32:$Rm, 1), i32:$Rn, inv_cond_code:$Cond),
+         (CSINCwwwc $Rn, $Rm, inv_cond_code:$Cond)>;
+def :Pat<(A64select_cc NZCV, (add i64:$Rm, 1), i64:$Rn, inv_cond_code:$Cond),
+         (CSINCxxxc $Rn, $Rm, inv_cond_code:$Cond)>;
+
+// CSINV
+def :Pat<(A64select_cc NZCV, (not i32:$Rm), i32:$Rn, inv_cond_code:$Cond),
+         (CSINVwwwc $Rn, $Rm, inv_cond_code:$Cond)>;
+def :Pat<(A64select_cc NZCV, (not i64:$Rm), i64:$Rn, inv_cond_code:$Cond),
+         (CSINVxxxc $Rn, $Rm, inv_cond_code:$Cond)>;
+
+// CSNEG
+def :Pat<(A64select_cc NZCV, (ineg i32:$Rm), i32:$Rn, inv_cond_code:$Cond),
+         (CSNEGwwwc $Rn, $Rm, inv_cond_code:$Cond)>;
+def :Pat<(A64select_cc NZCV, (ineg i64:$Rm), i64:$Rn, inv_cond_code:$Cond),
+         (CSNEGxxxc $Rn, $Rm, inv_cond_code:$Cond)>;
+
+//===----------------------------------------------------------------------===//
+// Data Processing (1 source) instructions
+//===----------------------------------------------------------------------===//
+// Contains: RBIT, REV16, REV, REV32, CLZ, CLS.
+
+// We define an unary operator which always fails. We will use this to
+// define unary operators that cannot be matched.
+
+class A64I_dp_1src_impl<bit sf, bits<6> opcode, string asmop,
+                   list<dag> patterns, RegisterClass GPRrc,
+                   InstrItinClass itin>:
+      A64I_dp_1src<sf,
+                   0,
+                   0b00000,
+                   opcode,
+                   !strconcat(asmop, "\t$Rd, $Rn"),
+                   (outs GPRrc:$Rd),
+                   (ins GPRrc:$Rn),
+                   patterns,
+                   itin>;
+
+multiclass A64I_dp_1src <bits<6> opcode, string asmop> {
+  let hasSideEffects = 0 in {
+    def ww : A64I_dp_1src_impl<0b0, opcode, asmop, [], GPR32, NoItinerary>;
+    def xx : A64I_dp_1src_impl<0b1, opcode, asmop, [], GPR64, NoItinerary>;
+  }
+}
+
+defm RBIT  : A64I_dp_1src<0b000000, "rbit">;
+defm CLS   : A64I_dp_1src<0b000101, "cls">;
+defm CLZ   : A64I_dp_1src<0b000100, "clz">;
+
+def : Pat<(ctlz i32:$Rn), (CLZww $Rn)>;
+def : Pat<(ctlz i64:$Rn), (CLZxx $Rn)>;
+def : Pat<(ctlz_zero_undef i32:$Rn), (CLZww $Rn)>;
+def : Pat<(ctlz_zero_undef i64:$Rn), (CLZxx $Rn)>;
+
+def : Pat<(cttz i32:$Rn), (CLZww (RBITww $Rn))>;
+def : Pat<(cttz i64:$Rn), (CLZxx (RBITxx $Rn))>;
+def : Pat<(cttz_zero_undef i32:$Rn), (CLZww (RBITww $Rn))>;
+def : Pat<(cttz_zero_undef i64:$Rn), (CLZxx (RBITxx $Rn))>;
+
+
+def REVww : A64I_dp_1src_impl<0b0, 0b000010, "rev",
+                              [(set i32:$Rd, (bswap i32:$Rn))],
+                              GPR32, NoItinerary>;
+def REVxx : A64I_dp_1src_impl<0b1, 0b000011, "rev",
+                              [(set i64:$Rd, (bswap i64:$Rn))],
+                              GPR64, NoItinerary>;
+def REV32xx : A64I_dp_1src_impl<0b1, 0b000010, "rev32",
+                          [(set i64:$Rd, (bswap (rotr i64:$Rn, (i64 32))))],
+                          GPR64, NoItinerary>;
+def REV16ww : A64I_dp_1src_impl<0b0, 0b000001, "rev16",
+                          [(set i32:$Rd, (bswap (rotr i32:$Rn, (i64 16))))],
+                          GPR32,
+                          NoItinerary>;
+def REV16xx : A64I_dp_1src_impl<0b1, 0b000001, "rev16", [], GPR64, NoItinerary>;
+
+//===----------------------------------------------------------------------===//
+// Data Processing (2 sources) instructions
+//===----------------------------------------------------------------------===//
+// Contains: CRC32C?[BHWX], UDIV, SDIV, LSLV, LSRV, ASRV, RORV + aliases LSL,
+//           LSR, ASR, ROR
+
+
+class dp_2src_impl<bit sf, bits<6> opcode, string asmop, list<dag> patterns,
+                   RegisterClass GPRsp,
+                   InstrItinClass itin>:
+      A64I_dp_2src<sf,
+                   opcode,
+                   0,
+                   !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                   (outs GPRsp:$Rd),
+                   (ins GPRsp:$Rn, GPRsp:$Rm),
+                   patterns,
+                   itin>;
+
+multiclass dp_2src_crc<bit c, string asmop> {
+  def B_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 0},
+                           !strconcat(asmop, "b"), [], GPR32, NoItinerary>;
+  def H_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 1},
+                           !strconcat(asmop, "h"), [], GPR32, NoItinerary>;
+  def W_www : dp_2src_impl<0b0, {0, 1, 0, c, 1, 0},
+                           !strconcat(asmop, "w"), [], GPR32, NoItinerary>;
+  def X_wwx : A64I_dp_2src<0b1, {0, 1, 0, c, 1, 1}, 0b0,
+                           !strconcat(asmop, "x\t$Rd, $Rn, $Rm"),
+                           (outs GPR32:$Rd), (ins GPR32:$Rn, GPR64:$Rm), [],
+                           NoItinerary>;
+}
+
+multiclass dp_2src_zext <bits<6> opcode, string asmop, SDPatternOperator op> {
+   def www : dp_2src_impl<0b0,
+                         opcode,
+                         asmop,
+                         [(set i32:$Rd,
+                               (op i32:$Rn, (i64 (zext i32:$Rm))))],
+                         GPR32,
+                         NoItinerary>;
+   def xxx : dp_2src_impl<0b1,
+                         opcode,
+                         asmop,
+                         [(set i64:$Rd, (op i64:$Rn, i64:$Rm))],
+                         GPR64,
+                         NoItinerary>;
+}
+
+
+multiclass dp_2src <bits<6> opcode, string asmop, SDPatternOperator op> {
+    def www : dp_2src_impl<0b0,
+                         opcode,
+                         asmop,
+                         [(set i32:$Rd, (op i32:$Rn, i32:$Rm))],
+                         GPR32,
+                         NoItinerary>;
+   def xxx : dp_2src_impl<0b1,
+                         opcode,
+                         asmop,
+                         [(set i64:$Rd, (op i64:$Rn, i64:$Rm))],
+                         GPR64,
+                         NoItinerary>;
+}
+
+// Here we define the data processing 2 source instructions.
+defm CRC32  : dp_2src_crc<0b0, "crc32">;
+defm CRC32C : dp_2src_crc<0b1, "crc32c">;
+
+defm UDIV : dp_2src<0b000010, "udiv", udiv>;
+defm SDIV : dp_2src<0b000011, "sdiv", sdiv>;
+
+defm LSLV : dp_2src_zext<0b001000, "lsl", shl>;
+defm LSRV : dp_2src_zext<0b001001, "lsr", srl>;
+defm ASRV : dp_2src_zext<0b001010, "asr", sra>;
+defm RORV : dp_2src_zext<0b001011, "ror", rotr>;
+
+// Extra patterns for an incoming 64-bit value for a 32-bit
+// operation. Since the LLVM operations are undefined (as in C) if the
+// RHS is out of range, it's perfectly permissible to discard the high
+// bits of the GPR64.
+def : Pat<(shl i32:$Rn, i64:$Rm),
+          (LSLVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
+def : Pat<(srl i32:$Rn, i64:$Rm),
+          (LSRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
+def : Pat<(sra i32:$Rn, i64:$Rm),
+          (ASRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
+def : Pat<(rotr i32:$Rn, i64:$Rm),
+          (RORVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
+
+// Here we define the aliases for the data processing 2 source instructions.
+def LSL_mnemonic : MnemonicAlias<"lslv", "lsl">;
+def LSR_mnemonic : MnemonicAlias<"lsrv", "lsr">;
+def ASR_menmonic : MnemonicAlias<"asrv", "asr">;
+def ROR_menmonic : MnemonicAlias<"rorv", "ror">;
+
+//===----------------------------------------------------------------------===//
+// Data Processing (3 sources) instructions
+//===----------------------------------------------------------------------===//
+// Contains: MADD, MSUB, SMADDL, SMSUBL, SMULH, UMADDL, UMSUBL, UMULH
+//    + aliases MUL, MNEG, SMULL, SMNEGL, UMULL, UMNEGL
+
+class A64I_dp3_4operand<bit sf, bits<6> opcode, RegisterClass AccReg,
+                        ValueType AccTy, RegisterClass SrcReg,
+                        string asmop, dag pattern>
+  : A64I_dp3<sf, opcode,
+             (outs AccReg:$Rd), (ins SrcReg:$Rn, SrcReg:$Rm, AccReg:$Ra),
+             !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Ra"),
+             [(set AccTy:$Rd, pattern)], NoItinerary> {
+  RegisterClass AccGPR = AccReg;
+  RegisterClass SrcGPR = SrcReg;
+}
+
+def MADDwwww : A64I_dp3_4operand<0b0, 0b000000, GPR32, i32, GPR32, "madd",
+                                 (add i32:$Ra, (mul i32:$Rn, i32:$Rm))>;
+def MADDxxxx : A64I_dp3_4operand<0b1, 0b000000, GPR64, i64, GPR64, "madd",
+                                 (add i64:$Ra, (mul i64:$Rn, i64:$Rm))>;
+
+def MSUBwwww : A64I_dp3_4operand<0b0, 0b000001, GPR32, i32, GPR32, "msub",
+                                 (sub i32:$Ra, (mul i32:$Rn, i32:$Rm))>;
+def MSUBxxxx : A64I_dp3_4operand<0b1, 0b000001, GPR64, i64, GPR64, "msub",
+                                 (sub i64:$Ra, (mul i64:$Rn, i64:$Rm))>;
+
+def SMADDLxwwx : A64I_dp3_4operand<0b1, 0b000010, GPR64, i64, GPR32, "smaddl",
+                     (add i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>;
+def SMSUBLxwwx : A64I_dp3_4operand<0b1, 0b000011, GPR64, i64, GPR32, "smsubl",
+                     (sub i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>;
+
+def UMADDLxwwx : A64I_dp3_4operand<0b1, 0b001010, GPR64, i64, GPR32, "umaddl",
+                     (add i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>;
+def UMSUBLxwwx : A64I_dp3_4operand<0b1, 0b001011, GPR64, i64, GPR32, "umsubl",
+                     (sub i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>;
+
+let isCommutable = 1, PostEncoderMethod = "fixMulHigh" in {
+  def UMULHxxx : A64I_dp3<0b1, 0b001100, (outs GPR64:$Rd),
+                          (ins GPR64:$Rn, GPR64:$Rm),
+                          "umulh\t$Rd, $Rn, $Rm",
+                          [(set i64:$Rd, (mulhu i64:$Rn, i64:$Rm))],
+                          NoItinerary>;
+
+  def SMULHxxx : A64I_dp3<0b1, 0b000100, (outs GPR64:$Rd),
+                          (ins GPR64:$Rn, GPR64:$Rm),
+                          "smulh\t$Rd, $Rn, $Rm",
+                          [(set i64:$Rd, (mulhs i64:$Rn, i64:$Rm))],
+                          NoItinerary>;
+}
+
+multiclass A64I_dp3_3operand<string asmop, A64I_dp3_4operand INST,
+                             Register ZR, dag pattern> {
+  def : InstAlias<asmop # " $Rd, $Rn, $Rm",
+                  (INST INST.AccGPR:$Rd, INST.SrcGPR:$Rn, INST.SrcGPR:$Rm, ZR)>;
+
+  def : Pat<pattern, (INST $Rn, $Rm, ZR)>;
+}
+
+defm : A64I_dp3_3operand<"mul", MADDwwww, WZR, (mul i32:$Rn, i32:$Rm)>;
+defm : A64I_dp3_3operand<"mul", MADDxxxx, XZR, (mul i64:$Rn, i64:$Rm)>;
+
+defm : A64I_dp3_3operand<"mneg", MSUBwwww, WZR,
+                         (sub 0, (mul i32:$Rn, i32:$Rm))>;
+defm : A64I_dp3_3operand<"mneg", MSUBxxxx, XZR,
+                         (sub 0, (mul i64:$Rn, i64:$Rm))>;
+
+defm : A64I_dp3_3operand<"smull", SMADDLxwwx, XZR,
+                         (mul (i64 (sext i32:$Rn)), (sext i32:$Rm))>;
+defm : A64I_dp3_3operand<"smnegl", SMSUBLxwwx, XZR,
+                       (sub 0, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>;
+
+defm : A64I_dp3_3operand<"umull", UMADDLxwwx, XZR,
+                         (mul (i64 (zext i32:$Rn)), (zext i32:$Rm))>;
+defm : A64I_dp3_3operand<"umnegl", UMSUBLxwwx, XZR,
+                       (sub 0, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>;
+
+
+//===----------------------------------------------------------------------===//
+// Exception generation
+//===----------------------------------------------------------------------===//
+// Contains: SVC, HVC, SMC, BRK, HLT, DCPS1, DCPS2, DCPS3
+
+def uimm16_asmoperand : AsmOperandClass {
+  let Name = "UImm16";
+  let PredicateMethod = "isUImm<16>";
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "UImm16";
+}
+
+def uimm16 : Operand<i32> {
+  let ParserMatchClass = uimm16_asmoperand;
+}
+
+class A64I_exceptImpl<bits<3> opc, bits<2> ll, string asmop>
+  : A64I_exception<opc, 0b000, ll, (outs), (ins uimm16:$UImm16),
+                   !strconcat(asmop, "\t$UImm16"), [], NoItinerary> {
+  let isBranch = 1;
+  let isTerminator = 1;
+}
+
+def SVCi : A64I_exceptImpl<0b000, 0b01, "svc">;
+def HVCi : A64I_exceptImpl<0b000, 0b10, "hvc">;
+def SMCi : A64I_exceptImpl<0b000, 0b11, "smc">;
+def BRKi : A64I_exceptImpl<0b001, 0b00, "brk">;
+def HLTi : A64I_exceptImpl<0b010, 0b00, "hlt">;
+
+def DCPS1i : A64I_exceptImpl<0b101, 0b01, "dcps1">;
+def DCPS2i : A64I_exceptImpl<0b101, 0b10, "dcps2">;
+def DCPS3i : A64I_exceptImpl<0b101, 0b11, "dcps3">;
+
+// The immediate is optional for the DCPS instructions, defaulting to 0.
+def : InstAlias<"dcps1", (DCPS1i 0)>;
+def : InstAlias<"dcps2", (DCPS2i 0)>;
+def : InstAlias<"dcps3", (DCPS3i 0)>;
+
+//===----------------------------------------------------------------------===//
+// Extract (immediate)
+//===----------------------------------------------------------------------===//
+// Contains: EXTR + alias ROR
+
+def EXTRwwwi : A64I_extract<0b0, 0b000, 0b0,
+                            (outs GPR32:$Rd),
+                            (ins GPR32:$Rn, GPR32:$Rm, bitfield32_imm:$LSB),
+                            "extr\t$Rd, $Rn, $Rm, $LSB",
+                            [(set i32:$Rd,
+                                  (A64Extr i32:$Rn, i32:$Rm, imm:$LSB))],
+                            NoItinerary>;
+def EXTRxxxi : A64I_extract<0b1, 0b000, 0b1,
+                            (outs GPR64:$Rd),
+                            (ins GPR64:$Rn, GPR64:$Rm, bitfield64_imm:$LSB),
+                            "extr\t$Rd, $Rn, $Rm, $LSB",
+                            [(set i64:$Rd,
+                                  (A64Extr i64:$Rn, i64:$Rm, imm:$LSB))],
+                            NoItinerary>;
+
+def : InstAlias<"ror $Rd, $Rs, $LSB",
+               (EXTRwwwi GPR32:$Rd, GPR32:$Rs, GPR32:$Rs, bitfield32_imm:$LSB)>;
+def : InstAlias<"ror $Rd, $Rs, $LSB",
+               (EXTRxxxi GPR64:$Rd, GPR64:$Rs, GPR64:$Rs, bitfield64_imm:$LSB)>;
+
+def : Pat<(rotr i32:$Rn, bitfield32_imm:$LSB),
+          (EXTRwwwi $Rn, $Rn, bitfield32_imm:$LSB)>;
+def : Pat<(rotr i64:$Rn, bitfield64_imm:$LSB),
+          (EXTRxxxi $Rn, $Rn, bitfield64_imm:$LSB)>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point compare instructions
+//===----------------------------------------------------------------------===//
+// Contains: FCMP, FCMPE
+
+def fpzero_asmoperand : AsmOperandClass {
+  let Name = "FPZero";
+  let ParserMethod = "ParseFPImmOperand";
+  let DiagnosticType = "FPZero";
+}
+
+def fpz32 : Operand<f32>,
+            ComplexPattern<f32, 1, "SelectFPZeroOperand", [fpimm]> {
+  let ParserMatchClass = fpzero_asmoperand;
+  let PrintMethod = "printFPZeroOperand";
+  let DecoderMethod = "DecodeFPZeroOperand";
+}
+
+def fpz64 : Operand<f64>,
+            ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
+  let ParserMatchClass = fpzero_asmoperand;
+  let PrintMethod = "printFPZeroOperand";
+  let DecoderMethod = "DecodeFPZeroOperand";
+}
+
+multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> {
+  def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0},
+                          (outs), ins, "fcmp\t$Rn, $Rm", [pattern],
+                          NoItinerary> {
+    let Defs = [NZCV];
+  }
+
+  def _sig : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b1, imm, 0b0, 0b0, 0b0},
+                        (outs), ins, "fcmpe\t$Rn, $Rm", [], NoItinerary> {
+    let Defs = [NZCV];
+  }
+}
+
+defm FCMPss : A64I_fpcmpSignal<0b00, 0b0, (ins FPR32:$Rn, FPR32:$Rm),
+                               (set NZCV, (A64cmp f32:$Rn, f32:$Rm))>;
+defm FCMPdd : A64I_fpcmpSignal<0b01, 0b0, (ins FPR64:$Rn, FPR64:$Rm),
+                               (set NZCV, (A64cmp f64:$Rn, f64:$Rm))>;
+
+// What would be Rm should be written as 0; note that even though it's called
+// "$Rm" here to fit in with the InstrFormats, it's actually an immediate.
+defm FCMPsi : A64I_fpcmpSignal<0b00, 0b1, (ins FPR32:$Rn, fpz32:$Rm),
+                               (set NZCV, (A64cmp f32:$Rn, fpz32:$Rm))>;
+
+defm FCMPdi : A64I_fpcmpSignal<0b01, 0b1, (ins FPR64:$Rn, fpz64:$Rm),
+                               (set NZCV, (A64cmp f64:$Rn, fpz64:$Rm))>;
+
+
+//===----------------------------------------------------------------------===//
+// Floating-point conditional compare instructions
+//===----------------------------------------------------------------------===//
+// Contains: FCCMP, FCCMPE
+
+class A64I_fpccmpImpl<bits<2> type, bit op, RegisterClass FPR, string asmop>
+  : A64I_fpccmp<0b0, 0b0, type, op,
+                (outs),
+                (ins FPR:$Rn, FPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond),
+                !strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
+                [], NoItinerary> {
+  let Defs = [NZCV];
+}
+
+def FCCMPss : A64I_fpccmpImpl<0b00, 0b0, FPR32, "fccmp">;
+def FCCMPEss : A64I_fpccmpImpl<0b00, 0b1, FPR32, "fccmpe">;
+def FCCMPdd : A64I_fpccmpImpl<0b01, 0b0, FPR64, "fccmp">;
+def FCCMPEdd : A64I_fpccmpImpl<0b01, 0b1, FPR64, "fccmpe">;
+
+//===----------------------------------------------------------------------===//
+// Floating-point conditional select instructions
+//===----------------------------------------------------------------------===//
+// Contains: FCSEL
+
+let Uses = [NZCV] in {
+  def FCSELsssc : A64I_fpcondsel<0b0, 0b0, 0b00, (outs FPR32:$Rd),
+                                 (ins FPR32:$Rn, FPR32:$Rm, cond_code_op:$Cond),
+                                 "fcsel\t$Rd, $Rn, $Rm, $Cond",
+                                 [(set f32:$Rd, 
+                                       (simple_select f32:$Rn, f32:$Rm))],
+                                 NoItinerary>;
+
+
+  def FCSELdddc : A64I_fpcondsel<0b0, 0b0, 0b01, (outs FPR64:$Rd),
+                                 (ins FPR64:$Rn, FPR64:$Rm, cond_code_op:$Cond),
+                                 "fcsel\t$Rd, $Rn, $Rm, $Cond",
+                                 [(set f64:$Rd,
+                                       (simple_select f64:$Rn, f64:$Rm))],
+                                 NoItinerary>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating-point data-processing (1 source)
+//===----------------------------------------------------------------------===//
+// Contains: FMOV, FABS, FNEG, FSQRT, FCVT, FRINT[NPMZAXI].
+
+def FPNoUnop : PatFrag<(ops node:$val), (fneg node:$val),
+                       [{ (void)N; return false; }]>;
+
+// First we do the fairly trivial bunch with uniform "OP s, s" and "OP d, d"
+// syntax. Default to no pattern because most are odd enough not to have one.
+multiclass A64I_fpdp1sizes<bits<6> opcode, string asmstr,
+                           SDPatternOperator opnode = FPNoUnop> {
+  def ss : A64I_fpdp1<0b0, 0b0, 0b00, opcode, (outs FPR32:$Rd), (ins FPR32:$Rn),
+                     !strconcat(asmstr, "\t$Rd, $Rn"),
+                     [(set f32:$Rd, (opnode f32:$Rn))],
+                     NoItinerary>;
+
+  def dd : A64I_fpdp1<0b0, 0b0, 0b01, opcode, (outs FPR64:$Rd), (ins FPR64:$Rn),
+                     !strconcat(asmstr, "\t$Rd, $Rn"),
+                     [(set f64:$Rd, (opnode f64:$Rn))],
+                     NoItinerary>;
+}
+
+defm FMOV   : A64I_fpdp1sizes<0b000000, "fmov">;
+defm FABS   : A64I_fpdp1sizes<0b000001, "fabs", fabs>;
+defm FNEG   : A64I_fpdp1sizes<0b000010, "fneg", fneg>;
+defm FSQRT  : A64I_fpdp1sizes<0b000011, "fsqrt", fsqrt>;
+
+defm FRINTN : A64I_fpdp1sizes<0b001000, "frintn">;
+defm FRINTP : A64I_fpdp1sizes<0b001001, "frintp", fceil>;
+defm FRINTM : A64I_fpdp1sizes<0b001010, "frintm", ffloor>;
+defm FRINTZ : A64I_fpdp1sizes<0b001011, "frintz", ftrunc>;
+defm FRINTA : A64I_fpdp1sizes<0b001100, "frinta">;
+defm FRINTX : A64I_fpdp1sizes<0b001110, "frintx", frint>;
+defm FRINTI : A64I_fpdp1sizes<0b001111, "frinti", fnearbyint>;
+
+// The FCVT instrucitons have different source and destination register-types,
+// but the fields are uniform everywhere a D-register (say) crops up. Package
+// this information in a Record.
+class FCVTRegType<RegisterClass rc, bits<2> fld, ValueType vt> {
+    RegisterClass Class = rc;
+    ValueType VT = vt;
+    bit t1 = fld{1};
+    bit t0 = fld{0};
+}
+
+def FCVT16 : FCVTRegType<FPR16, 0b11, f16>;
+def FCVT32 : FCVTRegType<FPR32, 0b00, f32>;
+def FCVT64 : FCVTRegType<FPR64, 0b01, f64>;
+
+class A64I_fpdp1_fcvt<FCVTRegType DestReg, FCVTRegType SrcReg, SDNode opnode>
+  : A64I_fpdp1<0b0, 0b0, {SrcReg.t1, SrcReg.t0},
+               {0,0,0,1, DestReg.t1, DestReg.t0},
+               (outs DestReg.Class:$Rd), (ins SrcReg.Class:$Rn),
+               "fcvt\t$Rd, $Rn",
+               [(set DestReg.VT:$Rd, (opnode SrcReg.VT:$Rn))], NoItinerary>;
+
+def FCVTds : A64I_fpdp1_fcvt<FCVT64, FCVT32, fextend>;
+def FCVThs : A64I_fpdp1_fcvt<FCVT16, FCVT32, fround>;
+def FCVTsd : A64I_fpdp1_fcvt<FCVT32, FCVT64, fround>;
+def FCVThd : A64I_fpdp1_fcvt<FCVT16, FCVT64, fround>;
+def FCVTsh : A64I_fpdp1_fcvt<FCVT32, FCVT16, fextend>;
+def FCVTdh : A64I_fpdp1_fcvt<FCVT64, FCVT16, fextend>;
+
+
+//===----------------------------------------------------------------------===//
+// Floating-point data-processing (2 sources) instructions
+//===----------------------------------------------------------------------===//
+// Contains: FMUL, FDIV, FADD, FSUB, FMAX, FMIN, FMAXNM, FMINNM, FNMUL
+
+def FPNoBinop : PatFrag<(ops node:$lhs, node:$rhs), (fadd node:$lhs, node:$rhs),
+                      [{ (void)N; return false; }]>;
+
+multiclass A64I_fpdp2sizes<bits<4> opcode, string asmstr,
+                           SDPatternOperator opnode> {
+  def sss : A64I_fpdp2<0b0, 0b0, 0b00, opcode,
+                      (outs FPR32:$Rd),
+                      (ins FPR32:$Rn, FPR32:$Rm),
+                      !strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
+                      [(set f32:$Rd, (opnode f32:$Rn, f32:$Rm))],
+                      NoItinerary>;
+
+  def ddd : A64I_fpdp2<0b0, 0b0, 0b01, opcode,
+                      (outs FPR64:$Rd),
+                      (ins FPR64:$Rn, FPR64:$Rm),
+                      !strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
+                      [(set f64:$Rd, (opnode f64:$Rn, f64:$Rm))],
+                      NoItinerary>;
+}
+
+let isCommutable = 1 in {
+  defm FMUL   : A64I_fpdp2sizes<0b0000, "fmul", fmul>;
+  defm FADD   : A64I_fpdp2sizes<0b0010, "fadd", fadd>;
+
+  // No patterns for these.
+  defm FMAX   : A64I_fpdp2sizes<0b0100, "fmax", FPNoBinop>;
+  defm FMIN   : A64I_fpdp2sizes<0b0101, "fmin", FPNoBinop>;
+  defm FMAXNM : A64I_fpdp2sizes<0b0110, "fmaxnm", FPNoBinop>;
+  defm FMINNM : A64I_fpdp2sizes<0b0111, "fminnm", FPNoBinop>;
+
+  defm FNMUL  : A64I_fpdp2sizes<0b1000, "fnmul",
+                                PatFrag<(ops node:$lhs, node:$rhs),
+                                        (fneg (fmul node:$lhs, node:$rhs))> >;
+}
+
+defm FDIV : A64I_fpdp2sizes<0b0001, "fdiv", fdiv>;
+defm FSUB : A64I_fpdp2sizes<0b0011, "fsub", fsub>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point data-processing (3 sources) instructions
+//===----------------------------------------------------------------------===//
+// Contains: FMADD, FMSUB, FNMADD, FNMSUB
+
+def fmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
+                    (fma (fneg node:$Rn),  node:$Rm, node:$Ra)>;
+def fnmadd : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
+                     (fma node:$Rn,  node:$Rm, (fneg node:$Ra))>;
+def fnmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
+                     (fma (fneg node:$Rn),  node:$Rm, (fneg node:$Ra))>;
+
+class A64I_fpdp3Impl<string asmop, RegisterClass FPR, ValueType VT,
+                     bits<2> type, bit o1, bit o0, SDPatternOperator fmakind>
+  : A64I_fpdp3<0b0, 0b0, type, o1, o0, (outs FPR:$Rd),
+               (ins FPR:$Rn, FPR:$Rm, FPR:$Ra),
+               !strconcat(asmop,"\t$Rd, $Rn, $Rm, $Ra"),
+               [(set VT:$Rd, (fmakind VT:$Rn, VT:$Rm, VT:$Ra))],
+               NoItinerary>;
+
+def FMADDssss  : A64I_fpdp3Impl<"fmadd",  FPR32, f32, 0b00, 0b0, 0b0, fma>;
+def FMSUBssss  : A64I_fpdp3Impl<"fmsub",  FPR32, f32, 0b00, 0b0, 0b1, fmsub>;
+def FNMADDssss : A64I_fpdp3Impl<"fnmadd", FPR32, f32, 0b00, 0b1, 0b0, fnmadd>;
+def FNMSUBssss : A64I_fpdp3Impl<"fnmsub", FPR32, f32, 0b00, 0b1, 0b1, fnmsub>;
+
+def FMADDdddd  : A64I_fpdp3Impl<"fmadd",  FPR64, f64, 0b01, 0b0, 0b0, fma>;
+def FMSUBdddd  : A64I_fpdp3Impl<"fmsub",  FPR64, f64, 0b01, 0b0, 0b1, fmsub>;
+def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>;
+def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point <-> fixed-point conversion instructions
+//===----------------------------------------------------------------------===//
+// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF
+
+// #1-#32 allowed, encoded as "64 - <specified imm>
+def fixedpos_asmoperand_i32 : AsmOperandClass {
+  let Name = "CVTFixedPos32";
+  let RenderMethod = "addCVTFixedPosOperands";
+  let PredicateMethod = "isCVTFixedPos<32>";
+  let DiagnosticType = "CVTFixedPos32";
+}
+
+// Also encoded as "64 - <specified imm>" but #1-#64 allowed.
+def fixedpos_asmoperand_i64 : AsmOperandClass {
+  let Name = "CVTFixedPos64";
+  let RenderMethod = "addCVTFixedPosOperands";
+  let PredicateMethod = "isCVTFixedPos<64>";
+  let DiagnosticType = "CVTFixedPos64";
+}
+
+// We need the cartesian product of f32/f64 i32/i64 operands for
+// conversions:
+//   + Selection needs to use operands of correct floating type
+//   + Assembly parsing and decoding depend on integer width
+class cvtfix_i32_op<ValueType FloatVT>
+  : Operand<FloatVT>,
+    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm]> {
+  let ParserMatchClass = fixedpos_asmoperand_i32;
+  let DecoderMethod = "DecodeCVT32FixedPosOperand";
+  let PrintMethod = "printCVTFixedPosOperand";
+}
+
+class cvtfix_i64_op<ValueType FloatVT>
+  : Operand<FloatVT>,
+    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm]> {
+  let ParserMatchClass = fixedpos_asmoperand_i64;
+  let PrintMethod = "printCVTFixedPosOperand";
+}
+
+// Because of the proliferation of weird operands, it's not really
+// worth going for a multiclass here. Oh well.
+
+class A64I_fptofix<bit sf, bits<2> type, bits<3> opcode,
+                   RegisterClass GPR, RegisterClass FPR, 
+                   ValueType DstTy, ValueType SrcTy, 
+                   Operand scale_op, string asmop, SDNode cvtop>
+  : A64I_fpfixed<sf, 0b0, type, 0b11, opcode,
+                 (outs GPR:$Rd), (ins FPR:$Rn, scale_op:$Scale),
+                 !strconcat(asmop, "\t$Rd, $Rn, $Scale"),
+                 [(set DstTy:$Rd, (cvtop (fmul SrcTy:$Rn, scale_op:$Scale)))],
+                 NoItinerary>;
+
+def FCVTZSwsi : A64I_fptofix<0b0, 0b00, 0b000, GPR32, FPR32, i32, f32,
+                             cvtfix_i32_op<f32>, "fcvtzs", fp_to_sint>;
+def FCVTZSxsi : A64I_fptofix<0b1, 0b00, 0b000, GPR64, FPR32, i64, f32,
+                             cvtfix_i64_op<f32>, "fcvtzs", fp_to_sint>;
+def FCVTZUwsi : A64I_fptofix<0b0, 0b00, 0b001, GPR32, FPR32, i32, f32,
+                             cvtfix_i32_op<f32>, "fcvtzu", fp_to_uint>;
+def FCVTZUxsi : A64I_fptofix<0b1, 0b00, 0b001, GPR64, FPR32, i64, f32,
+                             cvtfix_i64_op<f32>, "fcvtzu", fp_to_uint>;
+
+def FCVTZSwdi : A64I_fptofix<0b0, 0b01, 0b000, GPR32, FPR64, i32, f64,
+                             cvtfix_i32_op<f64>, "fcvtzs", fp_to_sint>;
+def FCVTZSxdi : A64I_fptofix<0b1, 0b01, 0b000, GPR64, FPR64, i64, f64,
+                             cvtfix_i64_op<f64>, "fcvtzs", fp_to_sint>;
+def FCVTZUwdi : A64I_fptofix<0b0, 0b01, 0b001, GPR32, FPR64, i32, f64,
+                             cvtfix_i32_op<f64>, "fcvtzu", fp_to_uint>;
+def FCVTZUxdi : A64I_fptofix<0b1, 0b01, 0b001, GPR64, FPR64, i64, f64,
+                             cvtfix_i64_op<f64>, "fcvtzu", fp_to_uint>;
+
+
+class A64I_fixtofp<bit sf, bits<2> type, bits<3> opcode,
+                   RegisterClass FPR, RegisterClass GPR,
+                   ValueType DstTy, ValueType SrcTy,
+                   Operand scale_op, string asmop, SDNode cvtop>
+  : A64I_fpfixed<sf, 0b0, type, 0b00, opcode,
+                 (outs FPR:$Rd), (ins GPR:$Rn, scale_op:$Scale),
+                 !strconcat(asmop, "\t$Rd, $Rn, $Scale"),
+                 [(set DstTy:$Rd, (fdiv (cvtop SrcTy:$Rn), scale_op:$Scale))],
+                 NoItinerary>;
+
+def SCVTFswi : A64I_fixtofp<0b0, 0b00, 0b010, FPR32, GPR32, f32, i32,
+                            cvtfix_i32_op<f32>, "scvtf", sint_to_fp>;
+def SCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b010, FPR32, GPR64, f32, i64,
+                            cvtfix_i64_op<f32>, "scvtf", sint_to_fp>;
+def UCVTFswi : A64I_fixtofp<0b0, 0b00, 0b011, FPR32, GPR32, f32, i32,
+                            cvtfix_i32_op<f32>, "ucvtf", uint_to_fp>;
+def UCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b011, FPR32, GPR64, f32, i64,
+                            cvtfix_i64_op<f32>, "ucvtf", uint_to_fp>;
+def SCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b010, FPR64, GPR32, f64, i32,
+                            cvtfix_i32_op<f64>, "scvtf", sint_to_fp>;
+def SCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b010, FPR64, GPR64, f64, i64,
+                            cvtfix_i64_op<f64>, "scvtf", sint_to_fp>;
+def UCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b011, FPR64, GPR32, f64, i32,
+                            cvtfix_i32_op<f64>, "ucvtf", uint_to_fp>;
+def UCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b011, FPR64, GPR64, f64, i64,
+                            cvtfix_i64_op<f64>, "ucvtf", uint_to_fp>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point <-> integer conversion instructions
+//===----------------------------------------------------------------------===//
+// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF
+
+class A64I_fpintI<bit sf, bits<2> type, bits<2> rmode, bits<3> opcode,
+                   RegisterClass DestPR, RegisterClass SrcPR, string asmop>
+  : A64I_fpint<sf, 0b0, type, rmode, opcode, (outs DestPR:$Rd), (ins SrcPR:$Rn),
+               !strconcat(asmop, "\t$Rd, $Rn"), [], NoItinerary>;
+
+multiclass A64I_fptointRM<bits<2> rmode, bit o2, string asmop> {
+  def Sws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 0},
+                        GPR32, FPR32, asmop # "s">;
+  def Sxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 0},
+                        GPR64, FPR32, asmop # "s">;
+  def Uws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 1},
+                        GPR32, FPR32, asmop # "u">;
+  def Uxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 1},
+                        GPR64, FPR32, asmop # "u">;
+
+  def Swd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 0},
+                        GPR32, FPR64, asmop # "s">;
+  def Sxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 0},
+                        GPR64, FPR64, asmop # "s">;
+  def Uwd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 1},
+                        GPR32, FPR64, asmop # "u">;
+  def Uxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 1},
+                        GPR64, FPR64, asmop # "u">;
+}
+
+defm FCVTN : A64I_fptointRM<0b00, 0b0, "fcvtn">;
+defm FCVTP : A64I_fptointRM<0b01, 0b0, "fcvtp">;
+defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">;
+defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">;
+defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">;
+
+def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>;
+def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>;
+def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>;
+def : Pat<(i64 (fp_to_uint f32:$Rn)), (FCVTZUxs $Rn)>;
+def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>;
+def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>;
+def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>;
+def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>;
+
+multiclass A64I_inttofp<bit o0, string asmop> {
+  def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>;
+  def CVTFsx : A64I_fpintI<0b1, 0b00, 0b00, {0, 1, o0}, FPR32, GPR64, asmop>;
+  def CVTFdw : A64I_fpintI<0b0, 0b01, 0b00, {0, 1, o0}, FPR64, GPR32, asmop>;
+  def CVTFdx : A64I_fpintI<0b1, 0b01, 0b00, {0, 1, o0}, FPR64, GPR64, asmop>;
+}
+
+defm S : A64I_inttofp<0b0, "scvtf">;
+defm U : A64I_inttofp<0b1, "ucvtf">;
+
+def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>;
+def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>;
+def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>;
+def : Pat<(f64 (sint_to_fp i64:$Rn)), (SCVTFdx $Rn)>;
+def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>;
+def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>;
+def : Pat<(f64 (uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>;
+def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>;
+
+def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">;
+def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">;
+def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">;
+def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">;
+
+def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>;
+def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>;
+def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>;
+def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>;
+
+def lane1_asmoperand : AsmOperandClass {
+  let Name = "Lane1";
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "Lane1";
+}
+
+def lane1 : Operand<i32> {
+  let ParserMatchClass = lane1_asmoperand;
+  let PrintMethod = "printBareImmOperand";
+}
+
+let DecoderMethod =  "DecodeFMOVLaneInstruction" in {
+  def FMOVxv : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b110,
+                          (outs GPR64:$Rd), (ins VPR128:$Rn, lane1:$Lane),
+                          "fmov\t$Rd, $Rn.d[$Lane]", [], NoItinerary>;
+
+  def FMOVvx : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b111,
+                          (outs VPR128:$Rd), (ins GPR64:$Rn, lane1:$Lane),
+                          "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>;
+}
+
+def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]",
+                (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>;
+
+def : InstAlias<"fmov $Rd.2d[$Lane], $Rn",
+                (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point immediate instructions
+//===----------------------------------------------------------------------===//
+// Contains: FMOV
+
+def fpimm_asmoperand : AsmOperandClass {
+  let Name = "FMOVImm";
+  let ParserMethod = "ParseFPImmOperand";
+  let DiagnosticType = "FPImm";
+}
+
+// The MCOperand for these instructions are the encoded 8-bit values.
+def SDXF_fpimm : SDNodeXForm<fpimm, [{
+  uint32_t Imm8;
+  A64Imms::isFPImm(N->getValueAPF(), Imm8);
+  return CurDAG->getTargetConstant(Imm8, MVT::i32);
+}]>;
+
+class fmov_operand<ValueType FT>
+  : Operand<i32>,
+    PatLeaf<(FT fpimm), [{ return A64Imms::isFPImm(N->getValueAPF()); }],
+            SDXF_fpimm> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = fpimm_asmoperand;
+}
+
+def fmov32_operand : fmov_operand<f32>;
+def fmov64_operand : fmov_operand<f64>;
+
+class A64I_fpimm_impl<bits<2> type, RegisterClass Reg, ValueType VT,
+                      Operand fmov_operand>
+  : A64I_fpimm<0b0, 0b0, type, 0b00000,
+               (outs Reg:$Rd),
+               (ins fmov_operand:$Imm8),
+               "fmov\t$Rd, $Imm8",
+               [(set VT:$Rd, fmov_operand:$Imm8)],
+               NoItinerary>;
+
+def FMOVsi : A64I_fpimm_impl<0b00, FPR32, f32, fmov32_operand>;
+def FMOVdi : A64I_fpimm_impl<0b01, FPR64, f64, fmov64_operand>;
+
+//===----------------------------------------------------------------------===//
+// Load-register (literal) instructions
+//===----------------------------------------------------------------------===//
+// Contains: LDR, LDRSW, PRFM
+
+def ldrlit_label_asmoperand : AsmOperandClass {
+  let Name = "LoadLitLabel";
+  let RenderMethod = "addLabelOperands<19, 4>";
+  let DiagnosticType = "Label";
+}
+
+def ldrlit_label : Operand<i64> {
+  let EncoderMethod = "getLoadLitLabelOpValue";
+
+  // This label is a 19-bit offset from PC, scaled by the instruction-width: 4.
+  let PrintMethod = "printLabelOperand<19, 4>";
+  let ParserMatchClass = ldrlit_label_asmoperand;
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Various instructions take an immediate value (which can always be used),
+// where some numbers have a symbolic name to make things easier. These operands
+// and the associated functions abstract away the differences.
+multiclass namedimm<string prefix, string mapper> {
+  def _asmoperand : AsmOperandClass {
+    let Name = "NamedImm" # prefix;
+    let PredicateMethod = "isUImm";
+    let RenderMethod = "addImmOperands";
+    let ParserMethod = "ParseNamedImmOperand<" # mapper # ">";
+    let DiagnosticType = "NamedImm_" # prefix;
+  }
+
+  def _op : Operand<i32> {
+    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
+    let PrintMethod = "printNamedImmOperand<" # mapper # ">";
+    let DecoderMethod = "DecodeNamedImmOperand<" # mapper # ">";
+  }
+}
+
+defm prefetch : namedimm<"prefetch", "A64PRFM::PRFMMapper">;
+
+class A64I_LDRlitSimple<bits<2> opc, bit v, RegisterClass OutReg,
+                      list<dag> patterns = []>
+   : A64I_LDRlit<opc, v, (outs OutReg:$Rt), (ins ldrlit_label:$Imm19),
+                 "ldr\t$Rt, $Imm19", patterns, NoItinerary>;
+
+let mayLoad = 1 in {
+  def LDRw_lit : A64I_LDRlitSimple<0b00, 0b0, GPR32>;
+  def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>;
+}
+
+def LDRs_lit  : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
+def LDRd_lit  : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
+
+let mayLoad = 1 in {
+  def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
+
+
+  def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
+                               (outs GPR64:$Rt),
+                               (ins ldrlit_label:$Imm19),
+                               "ldrsw\t$Rt, $Imm19",
+                               [], NoItinerary>;
+
+  def PRFM_lit : A64I_LDRlit<0b11, 0b0,
+                             (outs), (ins prefetch_op:$Rt, ldrlit_label:$Imm19),
+                             "prfm\t$Rt, $Imm19",
+                             [], NoItinerary>;
+}
+
+//===----------------------------------------------------------------------===//
+// Load-store exclusive instructions
+//===----------------------------------------------------------------------===//
+// Contains: STXRB, STXRH, STXR, LDXRB, LDXRH, LDXR. STXP, LDXP, STLXRB,
+//           STLXRH, STLXR, LDAXRB, LDAXRH, LDAXR, STLXP, LDAXP, STLRB,
+//           STLRH, STLR, LDARB, LDARH, LDAR
+
+// Since these instructions have the undefined register bits set to 1 in
+// their canonical form, we need a post encoder method to set those bits
+// to 1 when encoding these instructions. We do this using the
+// fixLoadStoreExclusive function. This function has template parameters:
+//
+// fixLoadStoreExclusive<int hasRs, int hasRt2>
+//
+// hasRs indicates that the instruction uses the Rs field, so we won't set
+// it to 1 (and the same for Rt2). We don't need template parameters for
+// the other register fiels since Rt and Rn are always used.
+
+// This operand parses a GPR64xsp register, followed by an optional immediate
+// #0.
+def GPR64xsp0_asmoperand : AsmOperandClass {
+  let Name = "GPR64xsp0";
+  let PredicateMethod = "isWrappedReg";
+  let RenderMethod = "addRegOperands";
+  let ParserMethod = "ParseLSXAddressOperand";
+  // Diagnostics are provided by ParserMethod
+}
+
+def GPR64xsp0 : RegisterOperand<GPR64xsp> {
+  let ParserMatchClass = GPR64xsp0_asmoperand;
+}
+
+//===----------------------------------
+// Store-exclusive (releasing & normal)
+//===----------------------------------
+
+class A64I_SRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
+                        dag ins, list<dag> pat,
+                        InstrItinClass itin> :
+       A64I_LDSTex_stn <size,
+                        opcode{2}, 0, opcode{1}, opcode{0},
+                        outs, ins,
+                        !strconcat(asm, "\t$Rs, $Rt, [$Rn]"),
+                        pat, itin> {
+  let mayStore = 1;
+  let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
+}
+
+multiclass A64I_SRex<string asmstr, bits<3> opcode, string prefix> {
+  def _byte:  A64I_SRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
+                              (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
+                              [], NoItinerary>;
+
+  def _hword:  A64I_SRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
+                               (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
+                               [],NoItinerary>;
+
+  def _word:  A64I_SRexs_impl<0b10, opcode, asmstr,
+                              (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
+                              [], NoItinerary>;
+
+  def _dword: A64I_SRexs_impl<0b11, opcode, asmstr,
+                              (outs GPR32:$Rs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
+                              [], NoItinerary>;
+}
+
+defm STXR  : A64I_SRex<"stxr",  0b000, "STXR">;
+defm STLXR : A64I_SRex<"stlxr", 0b001, "STLXR">;
+
+//===----------------------------------
+// Loads
+//===----------------------------------
+
+class A64I_LRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
+                        dag ins, list<dag> pat,
+                        InstrItinClass itin> :
+        A64I_LDSTex_tn <size,
+                        opcode{2}, 1, opcode{1}, opcode{0},
+                        outs, ins,
+                        !strconcat(asm, "\t$Rt, [$Rn]"),
+                        pat, itin> {
+  let mayLoad = 1;
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
+}
+
+multiclass A64I_LRex<string asmstr, bits<3> opcode> {
+  def _byte:  A64I_LRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
+                            (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
+                            [], NoItinerary>;
+
+  def _hword:  A64I_LRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
+                            (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
+                            [], NoItinerary>;
+
+  def _word:  A64I_LRexs_impl<0b10, opcode, asmstr,
+                            (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
+                            [], NoItinerary>;
+
+  def _dword: A64I_LRexs_impl<0b11, opcode, asmstr,
+                            (outs GPR64:$Rt), (ins GPR64xsp0:$Rn),
+                            [], NoItinerary>;
+}
+
+defm LDXR  : A64I_LRex<"ldxr",  0b000>;
+defm LDAXR : A64I_LRex<"ldaxr", 0b001>;
+defm LDAR  : A64I_LRex<"ldar",  0b101>;
+
+class acquiring_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  return cast<AtomicSDNode>(N)->getOrdering() == Acquire;
+}]>;
+
+def atomic_load_acquire_8  : acquiring_load<atomic_load_8>;
+def atomic_load_acquire_16 : acquiring_load<atomic_load_16>;
+def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
+def atomic_load_acquire_64 : acquiring_load<atomic_load_64>;
+
+def : Pat<(atomic_load_acquire_8  i64:$Rn), (LDAR_byte  $Rn)>;
+def : Pat<(atomic_load_acquire_16 i64:$Rn), (LDAR_hword $Rn)>;
+def : Pat<(atomic_load_acquire_32 i64:$Rn), (LDAR_word  $Rn)>;
+def : Pat<(atomic_load_acquire_64 i64:$Rn), (LDAR_dword $Rn)>;
+
+//===----------------------------------
+// Store-release (no exclusivity)
+//===----------------------------------
+
+class A64I_SLexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
+                        dag ins, list<dag> pat,
+                        InstrItinClass itin> :
+        A64I_LDSTex_tn <size,
+                        opcode{2}, 0, opcode{1}, opcode{0},
+                        outs, ins,
+                        !strconcat(asm, "\t$Rt, [$Rn]"),
+                        pat, itin> {
+  let mayStore = 1;
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
+}
+
+class releasing_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  return cast<AtomicSDNode>(N)->getOrdering() == Release;
+}]>;
+
+def atomic_store_release_8  : releasing_store<atomic_store_8>;
+def atomic_store_release_16 : releasing_store<atomic_store_16>;
+def atomic_store_release_32 : releasing_store<atomic_store_32>;
+def atomic_store_release_64 : releasing_store<atomic_store_64>;
+
+multiclass A64I_SLex<string asmstr, bits<3> opcode, string prefix> {
+  def _byte:  A64I_SLexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
+                            (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
+                            [(atomic_store_release_8 i64:$Rn, i32:$Rt)],
+                            NoItinerary>;
+
+  def _hword:  A64I_SLexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
+                           (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
+                           [(atomic_store_release_16 i64:$Rn, i32:$Rt)],
+                           NoItinerary>;
+
+  def _word:  A64I_SLexs_impl<0b10, opcode, asmstr,
+                           (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
+                           [(atomic_store_release_32 i64:$Rn, i32:$Rt)],
+                           NoItinerary>;
+
+  def _dword: A64I_SLexs_impl<0b11, opcode, asmstr,
+                           (outs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
+                           [(atomic_store_release_64 i64:$Rn, i64:$Rt)],
+                           NoItinerary>;
+}
+
+defm STLR  : A64I_SLex<"stlr", 0b101, "STLR">;
+
+//===----------------------------------
+// Store-exclusive pair (releasing & normal)
+//===----------------------------------
+
+class A64I_SPexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
+                        dag ins, list<dag> pat,
+                        InstrItinClass itin> :
+     A64I_LDSTex_stt2n <size,
+                        opcode{2}, 0, opcode{1}, opcode{0},
+                        outs, ins,
+                        !strconcat(asm, "\t$Rs, $Rt, $Rt2, [$Rn]"),
+                        pat, itin> {
+  let mayStore = 1;
+}
+
+
+multiclass A64I_SPex<string asmstr, bits<3> opcode> {
+  def _word:  A64I_SPexs_impl<0b10, opcode, asmstr, (outs),
+                            (ins GPR32:$Rs, GPR32:$Rt, GPR32:$Rt2,
+                                 GPR64xsp0:$Rn),
+                            [], NoItinerary>;
+
+  def _dword: A64I_SPexs_impl<0b11, opcode, asmstr, (outs),
+                            (ins GPR32:$Rs, GPR64:$Rt, GPR64:$Rt2,
+                                            GPR64xsp0:$Rn),
+                            [], NoItinerary>;
+}
+
+defm STXP  : A64I_SPex<"stxp", 0b010>;
+defm STLXP : A64I_SPex<"stlxp", 0b011>;
+
+//===----------------------------------
+// Load-exclusive pair (acquiring & normal)
+//===----------------------------------
+
+class A64I_LPexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
+                        dag ins, list<dag> pat,
+                        InstrItinClass itin> :
+      A64I_LDSTex_tt2n <size,
+                        opcode{2}, 1, opcode{1}, opcode{0},
+                        outs, ins,
+                        !strconcat(asm, "\t$Rt, $Rt2, [$Rn]"),
+                        pat, itin>{
+  let mayLoad = 1;
+  let DecoderMethod = "DecodeLoadPairExclusiveInstruction";
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
+}
+
+multiclass A64I_LPex<string asmstr, bits<3> opcode> {
+  def _word:  A64I_LPexs_impl<0b10, opcode, asmstr,
+                            (outs GPR32:$Rt, GPR32:$Rt2),
+                            (ins GPR64xsp0:$Rn),
+                            [], NoItinerary>;
+
+  def _dword: A64I_LPexs_impl<0b11, opcode, asmstr,
+                            (outs GPR64:$Rt, GPR64:$Rt2),
+                            (ins GPR64xsp0:$Rn),
+                            [], NoItinerary>;
+}
+
+defm LDXP  : A64I_LPex<"ldxp", 0b010>;
+defm LDAXP : A64I_LPex<"ldaxp", 0b011>;
+
+//===----------------------------------------------------------------------===//
+// Load-store register (unscaled immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: LDURB, LDURH, LDRUSB, LDRUSH, LDRUSW, STUR, STURB, STURH and PRFUM
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register (register offset) instructions
+//===----------------------------------------------------------------------===//
+// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register (unsigned immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register (immediate post-indexed) instructions
+//===----------------------------------------------------------------------===//
+// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register (immediate pre-indexed) instructions
+//===----------------------------------------------------------------------===//
+// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW
+
+// Note that patterns are much later on in a completely separate section (they
+// need ADRPxi to be defined).
+
+//===-------------------------------
+// 1. Various operands needed
+//===-------------------------------
+
+//===-------------------------------
+// 1.1 Unsigned 12-bit immediate operands
+//===-------------------------------
+// The addressing mode for these instructions consists of an unsigned 12-bit
+// immediate which is scaled by the size of the memory access.
+//
+// We represent this in the MC layer by two operands:
+//     1. A base register.
+//     2. A 12-bit immediate: not multiplied by access size, so "LDR x0,[x0,#8]"
+//        would have '1' in this field.
+// This means that separate functions are needed for converting representations
+// which *are* aware of the intended access size.
+
+// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to
+// know the access size via some means. An isolated operand does not have this
+// information unless told from here, which means we need separate tablegen
+// Operands for each access size. This multiclass takes care of instantiating
+// the correct template functions in the rest of the backend.
+
+//===-------------------------------
+// 1.1 Unsigned 12-bit immediate operands
+//===-------------------------------
+
+multiclass offsets_uimm12<int MemSize, string prefix> {
+  def uimm12_asmoperand : AsmOperandClass {
+    let Name = "OffsetUImm12_" # MemSize;
+    let PredicateMethod = "isOffsetUImm12<" # MemSize # ">";
+    let RenderMethod = "addOffsetUImm12Operands<" # MemSize # ">";
+    let DiagnosticType = "LoadStoreUImm12_" # MemSize;
+  }
+
+  // Pattern is really no more than an ImmLeaf, but predicated on MemSize which
+  // complicates things beyond TableGen's ken.
+  def uimm12 : Operand<i64>,
+               ComplexPattern<i64, 1, "SelectOffsetUImm12<" # MemSize # ">"> {
+    let ParserMatchClass
+      = !cast<AsmOperandClass>(prefix # uimm12_asmoperand);
+
+    let PrintMethod = "printOffsetUImm12Operand<" # MemSize # ">";
+    let EncoderMethod = "getOffsetUImm12OpValue<" # MemSize # ">";
+  }
+}
+
+defm byte_  : offsets_uimm12<1, "byte_">;
+defm hword_ : offsets_uimm12<2, "hword_">;
+defm word_  : offsets_uimm12<4, "word_">;
+defm dword_ : offsets_uimm12<8, "dword_">;
+defm qword_ : offsets_uimm12<16, "qword_">;
+
+//===-------------------------------
+// 1.1 Signed 9-bit immediate operands
+//===-------------------------------
+
+// The MCInst is expected to store the bit-wise encoding of the value,
+// which amounts to lopping off the extended sign bits.
+def SDXF_simm9 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 0x1ff, MVT::i32);
+}]>;
+
+def simm9_asmoperand : AsmOperandClass {
+  let Name = "SImm9";
+  let PredicateMethod = "isSImm<9>";
+  let RenderMethod = "addSImmOperands<9>";
+  let DiagnosticType = "LoadStoreSImm9";
+}
+
+def simm9 : Operand<i64>,
+            ImmLeaf<i64, [{ return Imm >= -0x100 && Imm <= 0xff; }],
+            SDXF_simm9> {
+  let PrintMethod = "printOffsetSImm9Operand";
+  let ParserMatchClass = simm9_asmoperand;
+}
+
+
+//===-------------------------------
+// 1.3 Register offset extensions
+//===-------------------------------
+
+// The assembly-syntax for these addressing-modes is:
+//    [<Xn|SP>, <R><m> {, <extend> {<amount>}}]
+//
+// The essential semantics are:
+//     + <amount> is a shift: #<log(transfer size)> or #0
+//     + <R> can be W or X.
+//     + If <R> is W, <extend> can be UXTW or SXTW
+//     + If <R> is X, <extend> can be LSL or SXTX
+//
+// The trickiest of those constraints is that Rm can be either GPR32 or GPR64,
+// which will need separate instructions for LLVM type-consistency. We'll also
+// need separate operands, of course.
+multiclass regexts<int MemSize, int RmSize, RegisterClass GPR,
+                   string Rm, string prefix> {
+  def regext_asmoperand : AsmOperandClass {
+    let Name = "AddrRegExtend_" # MemSize # "_" #  Rm;
+    let PredicateMethod = "isAddrRegExtend<" # MemSize # "," # RmSize # ">";
+    let RenderMethod = "addAddrRegExtendOperands<" # MemSize # ">";
+    let DiagnosticType = "LoadStoreExtend" # RmSize # "_" # MemSize;
+  }
+
+  def regext : Operand<i64> {
+    let PrintMethod
+      = "printAddrRegExtendOperand<" # MemSize # ", " # RmSize # ">";
+
+    let DecoderMethod = "DecodeAddrRegExtendOperand";
+    let ParserMatchClass
+      = !cast<AsmOperandClass>(prefix # regext_asmoperand);
+  }
+}
+
+multiclass regexts_wx<int MemSize, string prefix> {
+  // Rm is an X-register if LSL or SXTX are specified as the shift.
+  defm Xm_ : regexts<MemSize, 64, GPR64, "Xm", prefix # "Xm_">;
+
+  // Rm is a W-register if UXTW or SXTW are specified as the shift.
+  defm Wm_ : regexts<MemSize, 32, GPR32, "Wm", prefix # "Wm_">;
+}
+
+defm byte_  : regexts_wx<1, "byte_">;
+defm hword_ : regexts_wx<2, "hword_">;
+defm word_  : regexts_wx<4, "word_">;
+defm dword_ : regexts_wx<8, "dword_">;
+defm qword_ : regexts_wx<16, "qword_">;
+
+
+//===------------------------------
+// 2. The instructions themselves.
+//===------------------------------
+
+// We have the following instructions to implement:
+// |                 | B     | H     | W     | X      |
+// |-----------------+-------+-------+-------+--------|
+// | unsigned str    | STRB  | STRH  | STR   | STR    |
+// | unsigned ldr    | LDRB  | LDRH  | LDR   | LDR    |
+// | signed ldr to W | LDRSB | LDRSH | -     | -      |
+// | signed ldr to X | LDRSB | LDRSH | LDRSW | (PRFM) |
+
+// This will instantiate the LDR/STR instructions you'd expect to use for an
+// unsigned datatype (first two rows above) or floating-point register, which is
+// reasonably uniform across all access sizes.
+
+
+//===------------------------------
+// 2.1 Regular instructions
+//===------------------------------
+
+// This class covers the basic unsigned or irrelevantly-signed loads and stores,
+// to general-purpose and floating-point registers.
+
+class AddrParams<string prefix> {
+  Operand uimm12 = !cast<Operand>(prefix # "_uimm12");
+
+  Operand regextWm = !cast<Operand>(prefix # "_Wm_regext");
+  Operand regextXm = !cast<Operand>(prefix # "_Xm_regext");
+}
+
+def byte_addrparams : AddrParams<"byte">;
+def hword_addrparams : AddrParams<"hword">;
+def word_addrparams : AddrParams<"word">;
+def dword_addrparams : AddrParams<"dword">;
+def qword_addrparams : AddrParams<"qword">;
+
+multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
+                                bit high_opc, string asmsuffix,
+                                RegisterClass GPR, AddrParams params> {
+  // Unsigned immediate
+  def _STR : A64I_LSunsigimm<size, v, {high_opc, 0b0},
+                     (outs), (ins GPR:$Rt, GPR64xsp:$Rn, params.uimm12:$UImm12),
+                     "str" # asmsuffix # "\t$Rt, [$Rn, $UImm12]",
+                     [], NoItinerary> {
+    let mayStore = 1;
+  }
+  def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn]",
+                (!cast<Instruction>(prefix # "_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
+
+  def _LDR : A64I_LSunsigimm<size, v, {high_opc, 0b1},
+                      (outs GPR:$Rt), (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
+                      "ldr" #  asmsuffix # "\t$Rt, [$Rn, $UImm12]",
+                      [], NoItinerary> {
+    let mayLoad = 1;
+  }
+  def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn]",
+                (!cast<Instruction>(prefix # "_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
+
+  // Register offset (four of these: load/store and Wm/Xm).
+  let mayLoad = 1 in {
+    def _Wm_RegOffset_LDR : A64I_LSregoff<size, v, {high_opc, 0b1}, 0b0,
+                            (outs GPR:$Rt),
+                            (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
+                            "ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
+                            [], NoItinerary>;
+
+    def _Xm_RegOffset_LDR : A64I_LSregoff<size, v, {high_opc, 0b1}, 0b1,
+                            (outs GPR:$Rt),
+                            (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
+                            "ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
+                            [], NoItinerary>;
+  }
+  def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn, $Rm]",
+        (!cast<Instruction>(prefix # "_Xm_RegOffset_LDR") GPR:$Rt, GPR64xsp:$Rn,
+                                                          GPR64:$Rm, 2)>;
+
+  let mayStore = 1 in {
+    def _Wm_RegOffset_STR : A64I_LSregoff<size, v, {high_opc, 0b0}, 0b0,
+                                  (outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR32:$Rm,
+                                               params.regextWm:$Ext),
+                                  "str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
+                                  [], NoItinerary>;
+
+    def _Xm_RegOffset_STR : A64I_LSregoff<size, v, {high_opc, 0b0}, 0b1,
+                                  (outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR64:$Rm,
+                                               params.regextXm:$Ext),
+                                  "str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
+                                  [], NoItinerary>;
+  }
+  def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn, $Rm]",
+      (!cast<Instruction>(prefix # "_Xm_RegOffset_STR") GPR:$Rt, GPR64xsp:$Rn,
+                                                        GPR64:$Rm, 2)>;
+
+  // Unaligned immediate
+  def _STUR : A64I_LSunalimm<size, v, {high_opc, 0b0},
+                             (outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
+                             "stur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
+                             [], NoItinerary> {
+    let mayStore = 1;
+  }
+  def : InstAlias<"stur" # asmsuffix # " $Rt, [$Rn]",
+               (!cast<Instruction>(prefix # "_STUR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
+
+  def _LDUR : A64I_LSunalimm<size, v, {high_opc, 0b1},
+                             (outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9),
+                             "ldur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
+                             [], NoItinerary> {
+    let mayLoad = 1;
+  }
+  def : InstAlias<"ldur" # asmsuffix # " $Rt, [$Rn]",
+               (!cast<Instruction>(prefix # "_LDUR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
+
+  // Post-indexed
+  def _PostInd_STR : A64I_LSpostind<size, v, {high_opc, 0b0},
+                               (outs GPR64xsp:$Rn_wb),
+                               (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
+                               "str" # asmsuffix # "\t$Rt, [$Rn], $SImm9",
+                               [], NoItinerary> {
+    let Constraints = "$Rn = $Rn_wb";
+    let mayStore = 1;
+
+    // Decoder only needed for unpredictability checking (FIXME).
+    let DecoderMethod = "DecodeSingleIndexedInstruction";
+  }
+
+  def _PostInd_LDR : A64I_LSpostind<size, v, {high_opc, 0b1},
+                                    (outs GPR:$Rt, GPR64xsp:$Rn_wb),
+                                    (ins GPR64xsp:$Rn, simm9:$SImm9),
+                                    "ldr" # asmsuffix # "\t$Rt, [$Rn], $SImm9",
+                                    [], NoItinerary> {
+    let mayLoad = 1;
+    let Constraints = "$Rn = $Rn_wb";
+    let DecoderMethod = "DecodeSingleIndexedInstruction";
+  }
+
+  // Pre-indexed
+  def _PreInd_STR : A64I_LSpreind<size, v, {high_opc, 0b0},
+                               (outs GPR64xsp:$Rn_wb),
+                               (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
+                               "str" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!",
+                               [], NoItinerary> {
+    let Constraints = "$Rn = $Rn_wb";
+    let mayStore = 1;
+
+    // Decoder only needed for unpredictability checking (FIXME).
+    let DecoderMethod = "DecodeSingleIndexedInstruction";
+  }
+
+  def _PreInd_LDR : A64I_LSpreind<size, v, {high_opc, 0b1},
+                                    (outs GPR:$Rt, GPR64xsp:$Rn_wb),
+                                    (ins GPR64xsp:$Rn, simm9:$SImm9),
+                                    "ldr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!",
+                                    [], NoItinerary> {
+    let mayLoad = 1;
+    let Constraints = "$Rn = $Rn_wb";
+    let DecoderMethod = "DecodeSingleIndexedInstruction";
+  }
+
+}
+
+// STRB/LDRB: First define the instructions
+defm LS8
+  : A64I_LDRSTR_unsigned<"LS8", 0b00, 0b0, 0b0, "b", GPR32, byte_addrparams>;
+
+// STRH/LDRH
+defm LS16
+  : A64I_LDRSTR_unsigned<"LS16", 0b01, 0b0, 0b0, "h", GPR32, hword_addrparams>;
+
+
+// STR/LDR to/from a W register
+defm LS32
+  : A64I_LDRSTR_unsigned<"LS32", 0b10, 0b0, 0b0, "", GPR32, word_addrparams>;
+
+// STR/LDR to/from an X register
+defm LS64
+  : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>;
+
+// STR/LDR to/from a B register
+defm LSFP8
+  : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>;
+
+// STR/LDR to/from an H register
+defm LSFP16
+  : A64I_LDRSTR_unsigned<"LSFP16", 0b01, 0b1, 0b0, "", FPR16, hword_addrparams>;
+
+// STR/LDR to/from an S register
+defm LSFP32
+  : A64I_LDRSTR_unsigned<"LSFP32", 0b10, 0b1, 0b0, "", FPR32, word_addrparams>;
+// STR/LDR to/from a D register
+defm LSFP64
+  : A64I_LDRSTR_unsigned<"LSFP64", 0b11, 0b1, 0b0, "", FPR64, dword_addrparams>;
+// STR/LDR to/from a Q register
+defm LSFP128
+  : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128,
+                         qword_addrparams>;
+
+//===------------------------------
+// 2.3 Signed loads
+//===------------------------------
+
+// Byte and half-word signed loads can both go into either an X or a W register,
+// so it's worth factoring out. Signed word loads don't fit because there is no
+// W version.
+multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
+                           string prefix> {
+  // Unsigned offset
+  def w : A64I_LSunsigimm<size, 0b0, 0b11,
+                          (outs GPR32:$Rt),
+                          (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
+                          "ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]",
+                          [], NoItinerary> {
+    let mayLoad = 1;
+  }
+  def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]",
+                  (!cast<Instruction>(prefix # w) GPR32:$Rt, GPR64xsp:$Rn, 0)>;
+
+  def x : A64I_LSunsigimm<size, 0b0, 0b10,
+                          (outs GPR64:$Rt),
+                          (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
+                          "ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]",
+                          [], NoItinerary> {
+    let mayLoad = 1;
+  }
+  def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]",
+                  (!cast<Instruction>(prefix # x) GPR64:$Rt, GPR64xsp:$Rn, 0)>;
+
+  // Register offset
+  let mayLoad = 1 in {
+    def w_Wm_RegOffset : A64I_LSregoff<size, 0b0, 0b11, 0b0,
+                            (outs GPR32:$Rt),
+                            (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
+                            "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
+                            [], NoItinerary>;
+
+    def w_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b11, 0b1,
+                            (outs GPR32:$Rt),
+                            (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
+                            "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
+                            [], NoItinerary>;
+
+    def x_Wm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b0,
+                            (outs GPR64:$Rt),
+                            (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
+                            "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
+                            [], NoItinerary>;
+
+    def x_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b1,
+                            (outs GPR64:$Rt),
+                            (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
+                            "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
+                            [], NoItinerary>;
+  }
+  def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]",
+        (!cast<Instruction>(prefix # "w_Xm_RegOffset") GPR32:$Rt, GPR64xsp:$Rn,
+                                                       GPR64:$Rm, 2)>;
+
+  def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]",
+        (!cast<Instruction>(prefix # "x_Xm_RegOffset") GPR64:$Rt, GPR64xsp:$Rn,
+                                                       GPR64:$Rm, 2)>;
+
+
+  let mayLoad = 1 in {
+    // Unaligned offset
+    def w_U : A64I_LSunalimm<size, 0b0, 0b11,
+                             (outs GPR32:$Rt),
+                             (ins GPR64xsp:$Rn, simm9:$SImm9),
+                             "ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
+                             [], NoItinerary>;
+
+    def x_U : A64I_LSunalimm<size, 0b0, 0b10,
+                             (outs GPR64:$Rt),
+                             (ins GPR64xsp:$Rn, simm9:$SImm9),
+                             "ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
+                             [], NoItinerary>;
+
+
+    // Post-indexed
+    def w_PostInd : A64I_LSpostind<size, 0b0, 0b11,
+                                 (outs GPR32:$Rt, GPR64xsp:$Rn_wb),
+                                 (ins GPR64xsp:$Rn, simm9:$SImm9),
+                                 "ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9",
+                                 [], NoItinerary> {
+      let Constraints = "$Rn = $Rn_wb";
+      let DecoderMethod = "DecodeSingleIndexedInstruction";
+    }
+
+    def x_PostInd : A64I_LSpostind<size, 0b0, 0b10,
+                                   (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
+                                   (ins GPR64xsp:$Rn, simm9:$SImm9),
+                                   "ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9",
+                                   [], NoItinerary> {
+      let Constraints = "$Rn = $Rn_wb";
+      let DecoderMethod = "DecodeSingleIndexedInstruction";
+    }
+
+    // Pre-indexed
+    def w_PreInd : A64I_LSpreind<size, 0b0, 0b11,
+                                 (outs GPR32:$Rt, GPR64xsp:$Rn_wb),
+                                 (ins GPR64xsp:$Rn, simm9:$SImm9),
+                                 "ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!",
+                                 [], NoItinerary> {
+      let Constraints = "$Rn = $Rn_wb";
+      let DecoderMethod = "DecodeSingleIndexedInstruction";
+    }
+
+    def x_PreInd : A64I_LSpreind<size, 0b0, 0b10,
+                                 (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
+                                 (ins GPR64xsp:$Rn, simm9:$SImm9),
+                                 "ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!",
+                                 [], NoItinerary> {
+      let Constraints = "$Rn = $Rn_wb";
+      let DecoderMethod = "DecodeSingleIndexedInstruction";
+    }
+  } // let mayLoad = 1
+}
+
+// LDRSB
+defm LDRSB : A64I_LDR_signed<0b00, "b", byte_addrparams, "LDRSB">;
+// LDRSH
+defm LDRSH : A64I_LDR_signed<0b01, "h", hword_addrparams, "LDRSH">;
+
+// LDRSW: load a 32-bit register, sign-extending to 64-bits.
+def LDRSWx
+    : A64I_LSunsigimm<0b10, 0b0, 0b10,
+                    (outs GPR64:$Rt),
+                    (ins GPR64xsp:$Rn, word_uimm12:$UImm12),
+                    "ldrsw\t$Rt, [$Rn, $UImm12]",
+                    [], NoItinerary> {
+  let mayLoad = 1;
+}
+def : InstAlias<"ldrsw $Rt, [$Rn]", (LDRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
+
+let mayLoad = 1 in {
+  def LDRSWx_Wm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b0,
+                             (outs GPR64:$Rt),
+                             (ins GPR64xsp:$Rn, GPR32:$Rm, word_Wm_regext:$Ext),
+                             "ldrsw\t$Rt, [$Rn, $Rm, $Ext]",
+                             [], NoItinerary>;
+
+  def LDRSWx_Xm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b1,
+                             (outs GPR64:$Rt),
+                             (ins GPR64xsp:$Rn, GPR64:$Rm, word_Xm_regext:$Ext),
+                             "ldrsw\t$Rt, [$Rn, $Rm, $Ext]",
+                             [], NoItinerary>;
+}
+def : InstAlias<"ldrsw $Rt, [$Rn, $Rm]",
+                (LDRSWx_Xm_RegOffset GPR64:$Rt, GPR64xsp:$Rn, GPR64:$Rm, 2)>;
+
+
+def LDURSWx
+    : A64I_LSunalimm<0b10, 0b0, 0b10,
+                    (outs GPR64:$Rt),
+                    (ins GPR64xsp:$Rn, simm9:$SImm9),
+                    "ldursw\t$Rt, [$Rn, $SImm9]",
+                    [], NoItinerary> {
+  let mayLoad = 1;
+}
+def : InstAlias<"ldursw $Rt, [$Rn]", (LDURSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
+
+def LDRSWx_PostInd
+    : A64I_LSpostind<0b10, 0b0, 0b10,
+                    (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
+                    (ins GPR64xsp:$Rn, simm9:$SImm9),
+                    "ldrsw\t$Rt, [$Rn], $SImm9",
+                    [], NoItinerary> {
+  let mayLoad = 1;
+  let Constraints = "$Rn = $Rn_wb";
+  let DecoderMethod = "DecodeSingleIndexedInstruction";
+}
+
+def LDRSWx_PreInd : A64I_LSpreind<0b10, 0b0, 0b10,
+                                 (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
+                                 (ins GPR64xsp:$Rn, simm9:$SImm9),
+                                 "ldrsw\t$Rt, [$Rn, $SImm9]!",
+                                 [], NoItinerary> {
+  let mayLoad = 1;
+  let Constraints = "$Rn = $Rn_wb";
+  let DecoderMethod = "DecodeSingleIndexedInstruction";
+}
+
+//===------------------------------
+// 2.4 Prefetch operations
+//===------------------------------
+
+def PRFM : A64I_LSunsigimm<0b11, 0b0, 0b10, (outs),
+                 (ins prefetch_op:$Rt, GPR64xsp:$Rn, dword_uimm12:$UImm12),
+                 "prfm\t$Rt, [$Rn, $UImm12]",
+                 [], NoItinerary> {
+  let mayLoad = 1;
+}
+def : InstAlias<"prfm $Rt, [$Rn]",
+                (PRFM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>;
+
+let mayLoad = 1 in {
+  def PRFM_Wm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b0, (outs),
+                                        (ins prefetch_op:$Rt, GPR64xsp:$Rn,
+                                             GPR32:$Rm, dword_Wm_regext:$Ext),
+                                        "prfm\t$Rt, [$Rn, $Rm, $Ext]",
+                                        [], NoItinerary>;
+  def PRFM_Xm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b1, (outs),
+                                        (ins prefetch_op:$Rt, GPR64xsp:$Rn,
+                                             GPR64:$Rm, dword_Xm_regext:$Ext),
+                                        "prfm\t$Rt, [$Rn, $Rm, $Ext]",
+                                        [], NoItinerary>;
+}
+
+def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
+                (PRFM_Xm_RegOffset prefetch_op:$Rt, GPR64xsp:$Rn,
+                                   GPR64:$Rm, 2)>;
+
+
+def PRFUM : A64I_LSunalimm<0b11, 0b0, 0b10, (outs),
+                         (ins prefetch_op:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
+                         "prfum\t$Rt, [$Rn, $SImm9]",
+                         [], NoItinerary> {
+  let mayLoad = 1;
+}
+def : InstAlias<"prfum $Rt, [$Rn]",
+                (PRFUM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>;
+
+//===----------------------------------------------------------------------===//
+// Load-store register (unprivileged) instructions
+//===----------------------------------------------------------------------===//
+// Contains: LDTRB, LDTRH, LDTRSB, LDTRSH, LDTRSW, STTR, STTRB and STTRH
+
+// These instructions very much mirror the "unscaled immediate" loads, but since
+// there are no floating-point variants we need to split them out into their own
+// section to avoid instantiation of "ldtr d0, [sp]" etc.
+
+multiclass A64I_LDTRSTTR<bits<2> size, string asmsuffix, RegisterClass GPR,
+                         string prefix> {
+  def _UnPriv_STR : A64I_LSunpriv<size, 0b0, 0b00,
+                              (outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
+                              "sttr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
+                              [], NoItinerary> {
+    let mayStore = 1;
+  }
+
+  def : InstAlias<"sttr" # asmsuffix # " $Rt, [$Rn]",
+         (!cast<Instruction>(prefix # "_UnPriv_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
+
+  def _UnPriv_LDR : A64I_LSunpriv<size, 0b0, 0b01,
+                               (outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9),
+                               "ldtr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
+                               [], NoItinerary> {
+    let mayLoad = 1;
+  }
+
+  def : InstAlias<"ldtr" # asmsuffix # " $Rt, [$Rn]",
+         (!cast<Instruction>(prefix # "_UnPriv_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
+
+}
+
+// STTRB/LDTRB: First define the instructions
+defm LS8 : A64I_LDTRSTTR<0b00, "b", GPR32, "LS8">;
+
+// STTRH/LDTRH
+defm LS16 : A64I_LDTRSTTR<0b01, "h", GPR32, "LS16">;
+
+// STTR/LDTR to/from a W register
+defm LS32 : A64I_LDTRSTTR<0b10, "", GPR32, "LS32">;
+
+// STTR/LDTR to/from an X register
+defm LS64 : A64I_LDTRSTTR<0b11, "", GPR64, "LS64">;
+
+// Now a class for the signed instructions that can go to either 32 or 64
+// bits...
+multiclass A64I_LDTR_signed<bits<2> size, string asmopcode, string prefix> {
+  let mayLoad = 1 in {
+    def w : A64I_LSunpriv<size, 0b0, 0b11,
+                          (outs GPR32:$Rt),
+                          (ins GPR64xsp:$Rn, simm9:$SImm9),
+                          "ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
+                          [], NoItinerary>;
+
+    def x : A64I_LSunpriv<size, 0b0, 0b10,
+                          (outs GPR64:$Rt),
+                          (ins GPR64xsp:$Rn, simm9:$SImm9),
+                          "ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
+                          [], NoItinerary>;
+  }
+
+  def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]",
+                 (!cast<Instruction>(prefix # "w") GPR32:$Rt, GPR64xsp:$Rn, 0)>;
+
+  def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]",
+                 (!cast<Instruction>(prefix # "x") GPR64:$Rt, GPR64xsp:$Rn, 0)>;
+
+}
+
+// LDTRSB
+defm LDTRSB : A64I_LDTR_signed<0b00, "b", "LDTRSB">;
+// LDTRSH
+defm LDTRSH : A64I_LDTR_signed<0b01, "h", "LDTRSH">;
+
+// And finally LDTRSW which only goes to 64 bits.
+def LDTRSWx : A64I_LSunpriv<0b10, 0b0, 0b10,
+                            (outs GPR64:$Rt),
+                            (ins GPR64xsp:$Rn, simm9:$SImm9),
+                            "ldtrsw\t$Rt, [$Rn, $SImm9]",
+                            [], NoItinerary> {
+  let mayLoad = 1;
+}
+def : InstAlias<"ldtrsw $Rt, [$Rn]", (LDTRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
+
+//===----------------------------------------------------------------------===//
+// Load-store register pair (offset) instructions
+//===----------------------------------------------------------------------===//
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register pair (post-indexed) instructions
+//===----------------------------------------------------------------------===//
+// Contains: STP, LDP, LDPSW
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store register pair (pre-indexed) instructions
+//===----------------------------------------------------------------------===//
+// Contains: STP, LDP, LDPSW
+//
+// and
+//
+//===----------------------------------------------------------------------===//
+// Load-store non-temporal register pair (offset) instructions
+//===----------------------------------------------------------------------===//
+// Contains: STNP, LDNP
+
+
+// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to
+// know the access size via some means. An isolated operand does not have this
+// information unless told from here, which means we need separate tablegen
+// Operands for each access size. This multiclass takes care of instantiating
+// the correct template functions in the rest of the backend.
+
+multiclass offsets_simm7<string MemSize, string prefix> {
+  // The bare signed 7-bit immediate is used in post-indexed instructions, but
+  // because of the scaling performed a generic "simm7" operand isn't
+  // appropriate here either.
+  def simm7_asmoperand : AsmOperandClass {
+    let Name = "SImm7_Scaled" # MemSize;
+    let PredicateMethod = "isSImm7Scaled<" # MemSize # ">";
+    let RenderMethod = "addSImm7ScaledOperands<" # MemSize # ">";
+    let DiagnosticType = "LoadStoreSImm7_" # MemSize;
+  }
+
+  def simm7 : Operand<i64> {
+    let PrintMethod = "printSImm7ScaledOperand<" # MemSize # ">";
+    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "simm7_asmoperand");
+  }
+}
+
+defm word_  : offsets_simm7<"4", "word_">;
+defm dword_ : offsets_simm7<"8", "dword_">;
+defm qword_ : offsets_simm7<"16", "qword_">;
+
+multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
+                          Operand simm7, string prefix> {
+  def _STR : A64I_LSPoffset<opc, v, 0b0, (outs),
+                    (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
+                    "stp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+    let mayStore = 1;
+    let DecoderMethod = "DecodeLDSTPairInstruction";
+  }
+  def : InstAlias<"stp $Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(prefix # "_STR") SomeReg:$Rt,
+                                                SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
+
+  def _LDR : A64I_LSPoffset<opc, v, 0b1,
+                            (outs SomeReg:$Rt, SomeReg:$Rt2),
+                            (ins GPR64xsp:$Rn, simm7:$SImm7),
+                            "ldp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+    let mayLoad = 1;
+    let DecoderMethod = "DecodeLDSTPairInstruction";
+  }
+  def : InstAlias<"ldp $Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(prefix # "_LDR") SomeReg:$Rt,
+                                                SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
+
+  def _PostInd_STR : A64I_LSPpostind<opc, v, 0b0,
+                               (outs GPR64xsp:$Rn_wb),
+                               (ins SomeReg:$Rt, SomeReg:$Rt2,
+                                    GPR64xsp:$Rn,
+                                    simm7:$SImm7),
+                               "stp\t$Rt, $Rt2, [$Rn], $SImm7",
+                               [], NoItinerary> {
+    let mayStore = 1;
+    let Constraints = "$Rn = $Rn_wb";
+
+    // Decoder only needed for unpredictability checking (FIXME).
+    let DecoderMethod = "DecodeLDSTPairInstruction";
+  }
+
+  def _PostInd_LDR : A64I_LSPpostind<opc, v, 0b1,
+                        (outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb),
+                        (ins GPR64xsp:$Rn, simm7:$SImm7),
+                        "ldp\t$Rt, $Rt2, [$Rn], $SImm7",
+                        [], NoItinerary> {
+    let mayLoad = 1;
+    let Constraints = "$Rn = $Rn_wb";
+    let DecoderMethod = "DecodeLDSTPairInstruction";
+  }
+
+  def _PreInd_STR : A64I_LSPpreind<opc, v, 0b0, (outs GPR64xsp:$Rn_wb),
+                    (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
+                    "stp\t$Rt, $Rt2, [$Rn, $SImm7]!",
+                    [], NoItinerary> {
+    let mayStore = 1;
+    let Constraints = "$Rn = $Rn_wb";
+    let DecoderMethod = "DecodeLDSTPairInstruction";
+  }
+
+  def _PreInd_LDR : A64I_LSPpreind<opc, v, 0b1,
+                              (outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb),
+                              (ins GPR64xsp:$Rn, simm7:$SImm7),
+                              "ldp\t$Rt, $Rt2, [$Rn, $SImm7]!",
+                              [], NoItinerary> {
+    let mayLoad = 1;
+    let Constraints = "$Rn = $Rn_wb";
+    let DecoderMethod = "DecodeLDSTPairInstruction";
+  }
+
+  def _NonTemp_STR : A64I_LSPnontemp<opc, v, 0b0, (outs),
+                    (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
+                    "stnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+    let mayStore = 1;
+    let DecoderMethod = "DecodeLDSTPairInstruction";
+  }
+  def : InstAlias<"stnp $Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(prefix # "_NonTemp_STR") SomeReg:$Rt,
+                                                SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
+
+  def _NonTemp_LDR : A64I_LSPnontemp<opc, v, 0b1,
+                            (outs SomeReg:$Rt, SomeReg:$Rt2),
+                            (ins GPR64xsp:$Rn, simm7:$SImm7),
+                            "ldnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+    let mayLoad = 1;
+    let DecoderMethod = "DecodeLDSTPairInstruction";
+  }
+  def : InstAlias<"ldnp $Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(prefix # "_NonTemp_LDR") SomeReg:$Rt,
+                                                SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
+
+}
+
+
+defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">;
+defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">;
+defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">;
+defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64,  dword_simm7, "LSFPPair64">;
+defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7,
+                                  "LSFPPair128">;
+
+
+def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1,
+                           (outs GPR64:$Rt, GPR64:$Rt2),
+                           (ins GPR64xsp:$Rn, word_simm7:$SImm7),
+                           "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary> {
+  let mayLoad = 1;
+  let DecoderMethod = "DecodeLDSTPairInstruction";
+}
+def : InstAlias<"ldpsw $Rt, $Rt2, [$Rn]",
+                (LDPSWx GPR64:$Rt, GPR64:$Rt2, GPR64xsp:$Rn, 0)>;
+
+def LDPSWx_PostInd : A64I_LSPpostind<0b01, 0b0, 0b1,
+                                  (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb),
+                                  (ins GPR64xsp:$Rn, word_simm7:$SImm7),
+                                  "ldpsw\t$Rt, $Rt2, [$Rn], $SImm7",
+                                  [], NoItinerary> {
+  let mayLoad = 1;
+  let Constraints = "$Rn = $Rn_wb";
+  let DecoderMethod = "DecodeLDSTPairInstruction";
+}
+
+def LDPSWx_PreInd : A64I_LSPpreind<0b01, 0b0, 0b1,
+                                   (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb),
+                                   (ins GPR64xsp:$Rn, word_simm7:$SImm7),
+                                   "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]!",
+                                   [], NoItinerary> {
+  let mayLoad = 1;
+  let Constraints = "$Rn = $Rn_wb";
+  let DecoderMethod = "DecodeLDSTPairInstruction";
+}
+
+//===----------------------------------------------------------------------===//
+// Logical (immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: AND, ORR, EOR, ANDS, + aliases TST, MOV
+
+multiclass logical_imm_operands<string prefix, string note,
+                                int size, ValueType VT> {
+  def _asmoperand : AsmOperandClass {
+    let Name = "LogicalImm" # note # size;
+    let PredicateMethod = "isLogicalImm" # note # "<" # size # ">";
+    let RenderMethod = "addLogicalImmOperands<" # size # ">";
+    let DiagnosticType = "LogicalSecondSource";
+  }
+
+  def _operand
+        : Operand<VT>, ComplexPattern<VT, 1, "SelectLogicalImm", [imm]> {
+    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
+    let PrintMethod = "printLogicalImmOperand<" # size # ">";
+    let DecoderMethod = "DecodeLogicalImmOperand<" # size # ">";
+  }
+}
+
+defm logical_imm32 : logical_imm_operands<"logical_imm32", "", 32, i32>;
+defm logical_imm64 : logical_imm_operands<"logical_imm64", "", 64, i64>;
+
+// The mov versions only differ in assembly parsing, where they
+// exclude values representable with either MOVZ or MOVN.
+defm logical_imm32_mov
+  : logical_imm_operands<"logical_imm32_mov", "MOV", 32, i32>;
+defm logical_imm64_mov
+  : logical_imm_operands<"logical_imm64_mov", "MOV", 64, i64>;
+
+
+multiclass A64I_logimmSizes<bits<2> opc, string asmop, SDNode opnode> {
+  def wwi : A64I_logicalimm<0b0, opc, (outs GPR32wsp:$Rd),
+                         (ins GPR32:$Rn, logical_imm32_operand:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [(set i32:$Rd,
+                               (opnode i32:$Rn, logical_imm32_operand:$Imm))],
+                         NoItinerary>;
+
+  def xxi : A64I_logicalimm<0b1, opc, (outs GPR64xsp:$Rd),
+                         (ins GPR64:$Rn, logical_imm64_operand:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [(set i64:$Rd,
+                               (opnode i64:$Rn, logical_imm64_operand:$Imm))],
+                         NoItinerary>;
+}
+
+defm AND : A64I_logimmSizes<0b00, "and", and>;
+defm ORR : A64I_logimmSizes<0b01, "orr", or>;
+defm EOR : A64I_logimmSizes<0b10, "eor", xor>;
+
+let Defs = [NZCV] in {
+  def ANDSwwi : A64I_logicalimm<0b0, 0b11, (outs GPR32:$Rd),
+                                (ins GPR32:$Rn, logical_imm32_operand:$Imm),
+                                "ands\t$Rd, $Rn, $Imm",
+                                [], NoItinerary>;
+
+  def ANDSxxi : A64I_logicalimm<0b1, 0b11, (outs GPR64:$Rd),
+                                (ins GPR64:$Rn, logical_imm64_operand:$Imm),
+                                "ands\t$Rd, $Rn, $Imm",
+                                [], NoItinerary>;
+}
+
+
+def : InstAlias<"tst $Rn, $Imm",
+                (ANDSwwi WZR, GPR32:$Rn, logical_imm32_operand:$Imm)>;
+def : InstAlias<"tst $Rn, $Imm",
+                (ANDSxxi XZR, GPR64:$Rn, logical_imm64_operand:$Imm)>;
+def : InstAlias<"mov $Rd, $Imm",
+                (ORRwwi GPR32wsp:$Rd, WZR, logical_imm32_mov_operand:$Imm)>;
+def : InstAlias<"mov $Rd, $Imm",
+                (ORRxxi GPR64xsp:$Rd, XZR, logical_imm64_mov_operand:$Imm)>;
+
+//===----------------------------------------------------------------------===//
+// Logical (shifted register) instructions
+//===----------------------------------------------------------------------===//
+// Contains: AND, BIC, ORR, ORN, EOR, EON, ANDS, BICS + aliases TST, MVN, MOV
+
+// Operand for optimizing (icmp (and LHS, RHS), 0, SomeCode). In theory "ANDS"
+// behaves differently for unsigned comparisons, so we defensively only allow
+// signed or n/a as the operand. In practice "unsigned greater than 0" is "not
+// equal to 0" and LLVM gives us this.
+def signed_cond : PatLeaf<(cond), [{
+  return !isUnsignedIntSetCC(N->get());
+}]>;
+
+
+// These instructions share their "shift" operands with add/sub (shifted
+// register instructions). They are defined there.
+
+// N.b. the commutable parameter is just !N. It will be first against the wall
+// when the revolution comes.
+multiclass logical_shifts<string prefix, bit sf, bits<2> opc,
+                          bit N, bit commutable,
+                          string asmop, SDPatternOperator opfrag, ValueType ty,
+                          RegisterClass GPR, list<Register> defs> {
+  let isCommutable = commutable, Defs = defs in {
+  def _lsl : A64I_logicalshift<sf, opc, 0b00, N,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set ty:$Rd, (opfrag ty:$Rn, (shl ty:$Rm,
+                            !cast<Operand>("lsl_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+
+  def _lsr : A64I_logicalshift<sf, opc, 0b01, N,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm,
+                            !cast<Operand>("lsr_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+
+  def _asr : A64I_logicalshift<sf, opc, 0b10, N,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("asr_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm,
+                            !cast<Operand>("asr_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+
+  def _ror : A64I_logicalshift<sf, opc, 0b11, N,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("ror_operand_" # ty):$Imm6),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
+                       [(set ty:$Rd, (opfrag ty:$Rn, (rotr ty:$Rm,
+                            !cast<Operand>("ror_operand_" # ty):$Imm6))
+                       )],
+                       NoItinerary>;
+  }
+
+  def _noshift
+      : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
+                 (!cast<Instruction>(prefix # "_lsl") GPR:$Rd, GPR:$Rn,
+                                                      GPR:$Rm, 0)>;
+
+  def : Pat<(opfrag ty:$Rn, ty:$Rm),
+            (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
+}
+
+multiclass logical_sizes<string prefix, bits<2> opc, bit N, bit commutable,
+                         string asmop, SDPatternOperator opfrag,
+                         list<Register> defs> {
+  defm xxx : logical_shifts<prefix # "xxx", 0b1, opc, N,
+                            commutable, asmop, opfrag, i64, GPR64, defs>;
+  defm www : logical_shifts<prefix # "www", 0b0, opc, N,
+                            commutable, asmop, opfrag, i32, GPR32, defs>;
+}
+
+
+defm AND : logical_sizes<"AND", 0b00, 0b0, 0b1, "and", and, []>;
+defm ORR : logical_sizes<"ORR", 0b01, 0b0, 0b1, "orr", or, []>;
+defm EOR : logical_sizes<"EOR", 0b10, 0b0, 0b1, "eor", xor, []>;
+defm ANDS : logical_sizes<"ANDS", 0b11, 0b0, 0b1, "ands",
+             PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs),
+                     [{ (void)N; return false; }]>,
+             [NZCV]>;
+
+defm BIC : logical_sizes<"BIC", 0b00, 0b1, 0b0, "bic",
+                         PatFrag<(ops node:$lhs, node:$rhs),
+                                 (and node:$lhs, (not node:$rhs))>, []>;
+defm ORN : logical_sizes<"ORN", 0b01, 0b1, 0b0, "orn",
+                         PatFrag<(ops node:$lhs, node:$rhs),
+                                 (or node:$lhs, (not node:$rhs))>, []>;
+defm EON : logical_sizes<"EON", 0b10, 0b1, 0b0, "eon",
+                         PatFrag<(ops node:$lhs, node:$rhs),
+                                 (xor node:$lhs, (not node:$rhs))>, []>;
+defm BICS : logical_sizes<"BICS", 0b11, 0b1, 0b0, "bics",
+                          PatFrag<(ops node:$lhs, node:$rhs),
+                                  (and node:$lhs, (not node:$rhs)),
+                                  [{ (void)N; return false; }]>,
+                          [NZCV]>;
+
+multiclass tst_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
+  let isCommutable = 1, Rd = 0b11111, Defs = [NZCV] in {
+  def _lsl : A64I_logicalshift<sf, 0b11, 0b00, 0b0,
+                       (outs),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
+                       "tst\t$Rn, $Rm, $Imm6",
+                       [(set NZCV, (A64setcc (and ty:$Rn, (shl ty:$Rm,
+                           !cast<Operand>("lsl_operand_" # ty):$Imm6)),
+                                          0, signed_cond))],
+                       NoItinerary>;
+
+
+  def _lsr : A64I_logicalshift<sf, 0b11, 0b01, 0b0,
+                       (outs),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
+                       "tst\t$Rn, $Rm, $Imm6",
+                       [(set NZCV, (A64setcc (and ty:$Rn, (srl ty:$Rm,
+                           !cast<Operand>("lsr_operand_" # ty):$Imm6)),
+                                          0, signed_cond))],
+                       NoItinerary>;
+
+  def _asr : A64I_logicalshift<sf, 0b11, 0b10, 0b0,
+                       (outs),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("asr_operand_" # ty):$Imm6),
+                       "tst\t$Rn, $Rm, $Imm6",
+                       [(set NZCV, (A64setcc (and ty:$Rn, (sra ty:$Rm,
+                           !cast<Operand>("asr_operand_" # ty):$Imm6)),
+                                          0, signed_cond))],
+                       NoItinerary>;
+
+  def _ror : A64I_logicalshift<sf, 0b11, 0b11, 0b0,
+                       (outs),
+                       (ins GPR:$Rn, GPR:$Rm,
+                            !cast<Operand>("ror_operand_" # ty):$Imm6),
+                       "tst\t$Rn, $Rm, $Imm6",
+                       [(set NZCV, (A64setcc (and ty:$Rn, (rotr ty:$Rm,
+                           !cast<Operand>("ror_operand_" # ty):$Imm6)),
+                                          0, signed_cond))],
+                       NoItinerary>;
+  }
+
+  def _noshift : InstAlias<"tst $Rn, $Rm",
+                     (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
+
+  def : Pat<(A64setcc (and ty:$Rn, ty:$Rm), 0, signed_cond),
+            (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
+}
+
+defm TSTxx : tst_shifts<"TSTxx", 0b1, i64, GPR64>;
+defm TSTww : tst_shifts<"TSTww", 0b0, i32, GPR32>;
+
+
+multiclass mvn_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
+  let isCommutable = 0, Rn = 0b11111 in {
+  def _lsl : A64I_logicalshift<sf, 0b01, 0b00, 0b1,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rm,
+                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
+                       "mvn\t$Rd, $Rm, $Imm6",
+                       [(set ty:$Rd, (not (shl ty:$Rm,
+                         !cast<Operand>("lsl_operand_" # ty):$Imm6)))],
+                       NoItinerary>;
+
+
+  def _lsr : A64I_logicalshift<sf, 0b01, 0b01, 0b1,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rm,
+                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
+                       "mvn\t$Rd, $Rm, $Imm6",
+                       [(set ty:$Rd, (not (srl ty:$Rm,
+                         !cast<Operand>("lsr_operand_" # ty):$Imm6)))],
+                       NoItinerary>;
+
+  def _asr : A64I_logicalshift<sf, 0b01, 0b10, 0b1,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rm,
+                            !cast<Operand>("asr_operand_" # ty):$Imm6),
+                       "mvn\t$Rd, $Rm, $Imm6",
+                       [(set ty:$Rd, (not (sra ty:$Rm,
+                         !cast<Operand>("asr_operand_" # ty):$Imm6)))],
+                       NoItinerary>;
+
+  def _ror : A64I_logicalshift<sf, 0b01, 0b11, 0b1,
+                       (outs GPR:$Rd),
+                       (ins GPR:$Rm,
+                            !cast<Operand>("ror_operand_" # ty):$Imm6),
+                       "mvn\t$Rd, $Rm, $Imm6",
+                       [(set ty:$Rd, (not (rotr ty:$Rm,
+                         !cast<Operand>("lsl_operand_" # ty):$Imm6)))],
+                       NoItinerary>;
+  }
+
+  def _noshift : InstAlias<"mvn $Rn, $Rm",
+                     (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
+
+  def : Pat<(not ty:$Rm),
+            (!cast<Instruction>(prefix # "_lsl") $Rm, 0)>;
+}
+
+defm MVNxx : mvn_shifts<"MVNxx", 0b1, i64, GPR64>;
+defm MVNww : mvn_shifts<"MVNww", 0b0, i32, GPR32>;
+
+def MOVxx :InstAlias<"mov $Rd, $Rm", (ORRxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
+def MOVww :InstAlias<"mov $Rd, $Rm", (ORRwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
+
+//===----------------------------------------------------------------------===//
+// Move wide (immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: MOVN, MOVZ, MOVK + MOV aliases
+
+// A wide variety of different relocations are needed for variants of these
+// instructions, so it turns out that we need a different operand for all of
+// them.
+multiclass movw_operands<string prefix, string instname, int width> {
+  def _imm_asmoperand : AsmOperandClass {
+    let Name = instname # width # "Shifted" # shift;
+    let PredicateMethod = "is" # instname # width # "Imm";
+    let RenderMethod = "addMoveWideImmOperands";
+    let ParserMethod = "ParseImmWithLSLOperand";
+    let DiagnosticType = "MOVWUImm16";
+  }
+
+  def _imm : Operand<i32> {
+    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_imm_asmoperand");
+    let PrintMethod = "printMoveWideImmOperand";
+    let EncoderMethod = "getMoveWideImmOpValue";
+    let DecoderMethod = "DecodeMoveWideImmOperand<" # width # ">";
+
+    let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift);
+  }
+}
+
+defm movn32 : movw_operands<"movn32", "MOVN", 32>;
+defm movn64 : movw_operands<"movn64", "MOVN", 64>;
+defm movz32 : movw_operands<"movz32", "MOVZ", 32>;
+defm movz64 : movw_operands<"movz64", "MOVZ", 64>;
+defm movk32 : movw_operands<"movk32", "MOVK", 32>;
+defm movk64 : movw_operands<"movk64", "MOVK", 64>;
+
+multiclass A64I_movwSizes<bits<2> opc, string asmop, dag ins32bit,
+                          dag ins64bit> {
+
+  def wii : A64I_movw<0b0, opc, (outs GPR32:$Rd), ins32bit,
+                      !strconcat(asmop, "\t$Rd, $FullImm"),
+                      [], NoItinerary> {
+    bits<18> FullImm;
+    let UImm16 = FullImm{15-0};
+    let Shift = FullImm{17-16};
+  }
+
+  def xii : A64I_movw<0b1, opc, (outs GPR64:$Rd), ins64bit,
+                      !strconcat(asmop, "\t$Rd, $FullImm"),
+                      [], NoItinerary> {
+    bits<18> FullImm;
+    let UImm16 = FullImm{15-0};
+    let Shift = FullImm{17-16};
+  }
+}
+
+let isMoveImm = 1, isReMaterializable = 1,
+    isAsCheapAsAMove = 1, hasSideEffects = 0 in {
+  defm MOVN : A64I_movwSizes<0b00, "movn",
+                             (ins movn32_imm:$FullImm),
+                             (ins movn64_imm:$FullImm)>;
+
+  // Some relocations are able to convert between a MOVZ and a MOVN. If these
+  // are applied the instruction must be emitted with the corresponding bits as
+  // 0, which means a MOVZ needs to override that bit from the default.
+  let PostEncoderMethod = "fixMOVZ" in
+  defm MOVZ : A64I_movwSizes<0b10, "movz",
+                             (ins movz32_imm:$FullImm),
+                             (ins movz64_imm:$FullImm)>;
+}
+
+let Constraints = "$src = $Rd" in
+defm MOVK : A64I_movwSizes<0b11, "movk",
+                           (ins GPR32:$src, movk32_imm:$FullImm),
+                           (ins GPR64:$src, movk64_imm:$FullImm)>;
+
+
+// And now the "MOV" aliases. These also need their own operands because what
+// they accept is completely different to what the base instructions accept.
+multiclass movalias_operand<string prefix, string basename,
+                            string immpredicate, int width> {
+  def _asmoperand : AsmOperandClass {
+    let Name = basename # width # "MovAlias";
+    let PredicateMethod
+          = "isMoveWideMovAlias<" # width # ", A64Imms::" # immpredicate # ">";
+    let RenderMethod
+      = "addMoveWideMovAliasOperands<" # width # ", "
+                                       # "A64Imms::" # immpredicate # ">";
+  }
+
+  def _movimm : Operand<i32> {
+    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
+
+    let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift);
+  }
+}
+
+defm movz32 : movalias_operand<"movz32", "MOVZ", "isMOVZImm", 32>;
+defm movz64 : movalias_operand<"movz64", "MOVZ", "isMOVZImm", 64>;
+defm movn32 : movalias_operand<"movn32", "MOVN", "isOnlyMOVNImm", 32>;
+defm movn64 : movalias_operand<"movn64", "MOVN", "isOnlyMOVNImm", 64>;
+
+// FIXME: these are officially canonical aliases, but TableGen is too limited to
+// print them at the moment. I believe in this case an "AliasPredicate" method
+// will need to be implemented. to allow it, as well as the more generally
+// useful handling of non-register, non-constant operands.
+class movalias<Instruction INST, RegisterClass GPR, Operand operand>
+  : InstAlias<"mov $Rd, $FullImm", (INST GPR:$Rd, operand:$FullImm)>;
+
+def : movalias<MOVZwii, GPR32, movz32_movimm>;
+def : movalias<MOVZxii, GPR64, movz64_movimm>;
+def : movalias<MOVNwii, GPR32, movn32_movimm>;
+def : movalias<MOVNxii, GPR64, movn64_movimm>;
+
+//===----------------------------------------------------------------------===//
+// PC-relative addressing instructions
+//===----------------------------------------------------------------------===//
+// Contains: ADR, ADRP
+
+def adr_label : Operand<i64> {
+  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_adr_prel>";
+
+  // This label is a 21-bit offset from PC, unscaled
+  let PrintMethod = "printLabelOperand<21, 1>";
+  let ParserMatchClass = label_asmoperand<21, 1>;
+  let OperandType = "OPERAND_PCREL";
+}
+
+def adrp_label_asmoperand : AsmOperandClass {
+  let Name = "AdrpLabel";
+  let RenderMethod = "addLabelOperands<21, 4096>";
+  let DiagnosticType = "Label";
+}
+
+def adrp_label : Operand<i64> {
+  let EncoderMethod = "getAdrpLabelOpValue";
+
+  // This label is a 21-bit offset from PC, scaled by the page-size: 4096.
+  let PrintMethod = "printLabelOperand<21, 4096>";
+  let ParserMatchClass = adrp_label_asmoperand;
+  let OperandType = "OPERAND_PCREL";
+}
+
+let hasSideEffects = 0 in {
+  def ADRxi : A64I_PCADR<0b0, (outs GPR64:$Rd), (ins adr_label:$Label),
+                         "adr\t$Rd, $Label", [], NoItinerary>;
+
+  def ADRPxi : A64I_PCADR<0b1, (outs GPR64:$Rd), (ins adrp_label:$Label),
+                          "adrp\t$Rd, $Label", [], NoItinerary>;
+}
+
+//===----------------------------------------------------------------------===//
+// System instructions
+//===----------------------------------------------------------------------===//
+// Contains: HINT, CLREX, DSB, DMB, ISB, MSR, SYS, SYSL, MRS
+//    + aliases IC, DC, AT, TLBI, NOP, YIELD, WFE, WFI, SEV, SEVL
+
+// Op1 and Op2 fields are sometimes simple 3-bit unsigned immediate values.
+def uimm3_asmoperand : AsmOperandClass {
+  let Name = "UImm3";
+  let PredicateMethod = "isUImm<3>";
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "UImm3";
+}
+
+def uimm3 : Operand<i32> {
+  let ParserMatchClass = uimm3_asmoperand;
+}
+
+// The HINT alias can accept a simple unsigned 7-bit immediate.
+def uimm7_asmoperand : AsmOperandClass {
+  let Name = "UImm7";
+  let PredicateMethod = "isUImm<7>";
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "UImm7";
+}
+
+def uimm7 : Operand<i32> {
+  let ParserMatchClass = uimm7_asmoperand;
+}
+
+// Multiclass namedimm is defined with the prefetch operands. Most of these fit
+// into the NamedImmMapper scheme well: they either accept a named operand or
+// any immediate under a particular value (which may be 0, implying no immediate
+// is allowed).
+defm dbarrier : namedimm<"dbarrier", "A64DB::DBarrierMapper">;
+defm isb : namedimm<"isb", "A64ISB::ISBMapper">;
+defm ic : namedimm<"ic", "A64IC::ICMapper">;
+defm dc : namedimm<"dc", "A64DC::DCMapper">;
+defm at : namedimm<"at", "A64AT::ATMapper">;
+defm tlbi : namedimm<"tlbi", "A64TLBI::TLBIMapper">;
+
+// However, MRS and MSR are more complicated for a few reasons:
+//   * There are ~1000 generic names S3_<op1>_<CRn>_<CRm>_<Op2> which have an
+//     implementation-defined effect
+//   * Most registers are shared, but some are read-only or write-only.
+//   * There is a variant of MSR which accepts the same register name (SPSel),
+//     but which would have a different encoding.
+
+// In principle these could be resolved in with more complicated subclasses of
+// NamedImmMapper, however that imposes an overhead on other "named
+// immediates". Both in concrete terms with virtual tables and in unnecessary
+// abstraction.
+
+// The solution adopted here is to take the MRS/MSR Mappers out of the usual
+// hierarchy (they're not derived from NamedImmMapper) and to add logic for
+// their special situation.
+def mrs_asmoperand : AsmOperandClass {
+  let Name = "MRS";
+  let ParserMethod = "ParseSysRegOperand";
+  let DiagnosticType = "MRS";
+}
+
+def mrs_op : Operand<i32> {
+  let ParserMatchClass = mrs_asmoperand;
+  let PrintMethod = "printMRSOperand";
+  let DecoderMethod = "DecodeMRSOperand";
+}
+
+def msr_asmoperand : AsmOperandClass {
+  let Name = "MSRWithReg";
+
+  // Note that SPSel is valid for both this and the pstate operands, but with
+  // different immediate encodings. This is why these operands provide a string
+  // AArch64Operand rather than an immediate. The overlap is small enough that
+  // it could be resolved with hackery now, but who can say in future?
+  let ParserMethod = "ParseSysRegOperand";
+  let DiagnosticType = "MSR";
+}
+
+def msr_op : Operand<i32> {
+  let ParserMatchClass = msr_asmoperand;
+  let PrintMethod = "printMSROperand";
+  let DecoderMethod = "DecodeMSROperand";
+}
+
+def pstate_asmoperand : AsmOperandClass {
+  let Name = "MSRPState";
+  // See comment above about parser.
+  let ParserMethod = "ParseSysRegOperand";
+  let DiagnosticType = "MSR";
+}
+
+def pstate_op : Operand<i32> {
+  let ParserMatchClass = pstate_asmoperand;
+  let PrintMethod = "printNamedImmOperand<A64PState::PStateMapper>";
+  let DecoderMethod = "DecodeNamedImmOperand<A64PState::PStateMapper>";
+}
+
+// When <CRn> is specified, an assembler should accept something like "C4", not
+// the usual "#4" immediate.
+def CRx_asmoperand : AsmOperandClass {
+  let Name = "CRx";
+  let PredicateMethod = "isUImm<4>";
+  let RenderMethod = "addImmOperands";
+  let ParserMethod = "ParseCRxOperand";
+  // Diagnostics are handled in all cases by ParseCRxOperand.
+}
+
+def CRx : Operand<i32> {
+  let ParserMatchClass = CRx_asmoperand;
+  let PrintMethod = "printCRxOperand";
+}
+
+
+// Finally, we can start defining the instructions.
+
+// HINT is straightforward, with a few aliases.
+def HINTi : A64I_system<0b0, (outs), (ins uimm7:$UImm7), "hint\t$UImm7",
+                        [], NoItinerary> {
+  bits<7> UImm7;
+  let CRm = UImm7{6-3};
+  let Op2 = UImm7{2-0};
+
+  let Op0 = 0b00;
+  let Op1 = 0b011;
+  let CRn = 0b0010;
+  let Rt = 0b11111;
+}
+
+def : InstAlias<"nop", (HINTi 0)>;
+def : InstAlias<"yield", (HINTi 1)>;
+def : InstAlias<"wfe", (HINTi 2)>;
+def : InstAlias<"wfi", (HINTi 3)>;
+def : InstAlias<"sev", (HINTi 4)>;
+def : InstAlias<"sevl", (HINTi 5)>;
+
+// Quite a few instructions then follow a similar pattern of fixing common
+// fields in the bitpattern, we'll define a helper-class for them.
+class simple_sys<bits<2> op0, bits<3> op1, bits<4> crn, bits<3> op2,
+                 Operand operand, string asmop>
+  : A64I_system<0b0, (outs), (ins operand:$CRm), !strconcat(asmop, "\t$CRm"),
+                [], NoItinerary> {
+  let Op0 = op0;
+  let Op1 = op1;
+  let CRn = crn;
+  let Op2 = op2;
+  let Rt = 0b11111;
+}
+
+
+def CLREXi : simple_sys<0b00, 0b011, 0b0011, 0b010, uimm4, "clrex">;
+def DSBi : simple_sys<0b00, 0b011, 0b0011, 0b100, dbarrier_op, "dsb">;
+def DMBi : simple_sys<0b00, 0b011, 0b0011, 0b101, dbarrier_op, "dmb">;
+def ISBi : simple_sys<0b00, 0b011, 0b0011, 0b110, isb_op, "isb">;
+
+def : InstAlias<"clrex", (CLREXi 0b1111)>;
+def : InstAlias<"isb", (ISBi 0b1111)>;
+
+// (DMBi 0xb) is a "DMB ISH" instruciton, appropriate for Linux SMP
+// configurations at least.
+def : Pat<(atomic_fence imm, imm), (DMBi 0xb)>;
+
+// Any SYS bitpattern can be represented with a complex and opaque "SYS"
+// instruction.
+def SYSiccix : A64I_system<0b0, (outs),
+                           (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm,
+                                uimm3:$Op2, GPR64:$Rt),
+                           "sys\t$Op1, $CRn, $CRm, $Op2, $Rt",
+                           [], NoItinerary> {
+  let Op0 = 0b01;
+}
+
+// You can skip the Xt argument whether it makes sense or not for the generic
+// SYS instruction.
+def : InstAlias<"sys $Op1, $CRn, $CRm, $Op2",
+                (SYSiccix uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2, XZR)>;
+
+
+// But many have aliases, which obviously don't fit into
+class SYSalias<dag ins, string asmstring>
+  : A64I_system<0b0, (outs), ins, asmstring, [], NoItinerary> {
+  let isAsmParserOnly = 1;
+
+  bits<14> SysOp;
+  let Op0 = 0b01;
+  let Op1 = SysOp{13-11};
+  let CRn = SysOp{10-7};
+  let CRm = SysOp{6-3};
+  let Op2 = SysOp{2-0};
+}
+
+def ICix : SYSalias<(ins ic_op:$SysOp, GPR64:$Rt), "ic\t$SysOp, $Rt">;
+
+def ICi : SYSalias<(ins ic_op:$SysOp), "ic\t$SysOp"> {
+  let Rt = 0b11111;
+}
+
+def DCix : SYSalias<(ins dc_op:$SysOp, GPR64:$Rt), "dc\t$SysOp, $Rt">;
+def ATix : SYSalias<(ins at_op:$SysOp, GPR64:$Rt), "at\t$SysOp, $Rt">;
+
+def TLBIix : SYSalias<(ins tlbi_op:$SysOp, GPR64:$Rt), "tlbi\t$SysOp, $Rt">;
+
+def TLBIi : SYSalias<(ins tlbi_op:$SysOp), "tlbi\t$SysOp"> {
+  let Rt = 0b11111;
+}
+
+
+def SYSLxicci : A64I_system<0b1, (outs GPR64:$Rt),
+                            (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2),
+                            "sysl\t$Rt, $Op1, $CRn, $CRm, $Op2",
+                            [], NoItinerary> {
+  let Op0 = 0b01;
+}
+
+// The instructions themselves are rather simple for MSR and MRS.
+def MSRix : A64I_system<0b0, (outs), (ins msr_op:$SysReg, GPR64:$Rt),
+                        "msr\t$SysReg, $Rt", [], NoItinerary> {
+  bits<16> SysReg;
+  let Op0 = SysReg{15-14};
+  let Op1 = SysReg{13-11};
+  let CRn = SysReg{10-7};
+  let CRm = SysReg{6-3};
+  let Op2 = SysReg{2-0};
+}
+
+def MRSxi : A64I_system<0b1, (outs GPR64:$Rt), (ins mrs_op:$SysReg),
+                        "mrs\t$Rt, $SysReg", [], NoItinerary> {
+  bits<16> SysReg;
+  let Op0 = SysReg{15-14};
+  let Op1 = SysReg{13-11};
+  let CRn = SysReg{10-7};
+  let CRm = SysReg{6-3};
+  let Op2 = SysReg{2-0};
+}
+
+def MSRii : A64I_system<0b0, (outs), (ins pstate_op:$PState, uimm4:$CRm),
+                        "msr\t$PState, $CRm", [], NoItinerary> {
+  bits<6> PState;
+
+  let Op0 = 0b00;
+  let Op1 = PState{5-3};
+  let CRn = 0b0100;
+  let Op2 = PState{2-0};
+  let Rt = 0b11111;
+}
+
+//===----------------------------------------------------------------------===//
+// Test & branch (immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: TBZ, TBNZ
+
+// The bit to test is a simple unsigned 6-bit immediate in the X-register
+// versions.
+def uimm6 : Operand<i64> {
+  let ParserMatchClass = uimm6_asmoperand;
+}
+
+def label_wid14_scal4_asmoperand : label_asmoperand<14, 4>;
+
+def tbimm_target : Operand<OtherVT> {
+  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_tstbr>";
+
+  // This label is a 14-bit offset from PC, scaled by the instruction-width: 4.
+  let PrintMethod = "printLabelOperand<14, 4>";
+  let ParserMatchClass = label_wid14_scal4_asmoperand;
+
+  let OperandType = "OPERAND_PCREL";
+}
+
+def A64eq : ImmLeaf<i32, [{ return Imm == A64CC::EQ; }]>;
+def A64ne : ImmLeaf<i32, [{ return Imm == A64CC::NE; }]>;
+
+// These instructions correspond to patterns involving "and" with a power of
+// two, which we need to be able to select.
+def tstb64_pat : ComplexPattern<i64, 1, "SelectTSTBOperand<64>">;
+def tstb32_pat : ComplexPattern<i32, 1, "SelectTSTBOperand<32>">;
+
+let isBranch = 1, isTerminator = 1 in {
+  def TBZxii : A64I_TBimm<0b0, (outs),
+                        (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label),
+                        "tbz\t$Rt, $Imm, $Label",
+                        [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0),
+                                   A64eq, bb:$Label)],
+                        NoItinerary>;
+
+  def TBNZxii : A64I_TBimm<0b1, (outs),
+                        (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label),
+                        "tbnz\t$Rt, $Imm, $Label",
+                        [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0),
+                                   A64ne, bb:$Label)],
+                        NoItinerary>;
+
+
+  // Note, these instructions overlap with the above 64-bit patterns. This is
+  // intentional, "tbz x3, #1, somewhere" and "tbz w3, #1, somewhere" would both
+  // do the same thing and are both permitted assembly. They also both have
+  // sensible DAG patterns.
+  def TBZwii : A64I_TBimm<0b0, (outs),
+                        (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label),
+                        "tbz\t$Rt, $Imm, $Label",
+                        [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0),
+                                   A64eq, bb:$Label)],
+                        NoItinerary> {
+    let Imm{5} = 0b0;
+  }
+
+  def TBNZwii : A64I_TBimm<0b1, (outs),
+                        (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label),
+                        "tbnz\t$Rt, $Imm, $Label",
+                        [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0),
+                                   A64ne, bb:$Label)],
+                        NoItinerary> {
+    let Imm{5} = 0b0;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (immediate) instructions
+//===----------------------------------------------------------------------===//
+// Contains: B, BL
+
+def label_wid26_scal4_asmoperand : label_asmoperand<26, 4>;
+
+def bimm_target : Operand<OtherVT> {
+  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_uncondbr>";
+
+  // This label is a 26-bit offset from PC, scaled by the instruction-width: 4.
+  let PrintMethod = "printLabelOperand<26, 4>";
+  let ParserMatchClass = label_wid26_scal4_asmoperand;
+
+  let OperandType = "OPERAND_PCREL";
+}
+
+def blimm_target : Operand<i64> {
+  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_call>";
+
+  // This label is a 26-bit offset from PC, scaled by the instruction-width: 4.
+  let PrintMethod = "printLabelOperand<26, 4>";
+  let ParserMatchClass = label_wid26_scal4_asmoperand;
+
+  let OperandType = "OPERAND_PCREL";
+}
+
+class A64I_BimmImpl<bit op, string asmop, list<dag> patterns, Operand lbl_type>
+  : A64I_Bimm<op, (outs), (ins lbl_type:$Label),
+              !strconcat(asmop, "\t$Label"), patterns,
+              NoItinerary>;
+
+let isBranch = 1 in {
+  def Bimm : A64I_BimmImpl<0b0, "b", [(br bb:$Label)], bimm_target> {
+    let isTerminator = 1;
+    let isBarrier = 1;
+  }
+
+  def BLimm : A64I_BimmImpl<0b1, "bl",
+                            [(AArch64Call tglobaladdr:$Label)], blimm_target> {
+    let isCall = 1;
+    let Defs = [X30];
+  }
+}
+
+def : Pat<(AArch64Call texternalsym:$Label), (BLimm texternalsym:$Label)>;
+
+//===----------------------------------------------------------------------===//
+// Unconditional branch (register) instructions
+//===----------------------------------------------------------------------===//
+// Contains: BR, BLR, RET, ERET, DRP.
+
+// Most of the notional opcode fields in the A64I_Breg format are fixed in A64
+// at the moment.
+class A64I_BregImpl<bits<4> opc,
+                    dag outs, dag ins, string asmstr, list<dag> patterns,
+                    InstrItinClass itin = NoItinerary>
+  : A64I_Breg<opc, 0b11111, 0b000000, 0b00000,
+              outs, ins, asmstr, patterns, itin> {
+  let isBranch         = 1;
+  let isIndirectBranch = 1;
+}
+
+// Note that these are not marked isCall or isReturn because as far as LLVM is
+// concerned they're not. "ret" is just another jump unless it has been selected
+// by LLVM as the function's return.
+
+let isBranch = 1 in {
+  def BRx : A64I_BregImpl<0b0000,(outs), (ins GPR64:$Rn),
+                          "br\t$Rn", [(brind i64:$Rn)]> {
+    let isBarrier = 1;
+    let isTerminator = 1;
+  }
+
+  def BLRx : A64I_BregImpl<0b0001, (outs), (ins GPR64:$Rn),
+                           "blr\t$Rn", [(AArch64Call i64:$Rn)]> {
+    let isBarrier = 0;
+    let isCall = 1;
+    let Defs = [X30];
+  }
+
+  def RETx : A64I_BregImpl<0b0010, (outs), (ins GPR64:$Rn),
+                           "ret\t$Rn", []> {
+    let isBarrier = 1;
+    let isTerminator = 1;
+    let isReturn = 1;
+  }
+
+  // Create a separate pseudo-instruction for codegen to use so that we don't
+  // flag x30 as used in every function. It'll be restored before the RET by the
+  // epilogue if it's legitimately used.
+  def RET : A64PseudoExpand<(outs), (ins), [(A64ret)], (RETx (ops X30))> {
+    let isTerminator = 1;
+    let isBarrier = 1;
+    let isReturn = 1;
+  }
+
+  def ERET : A64I_BregImpl<0b0100, (outs), (ins), "eret", []> {
+    let Rn = 0b11111;
+    let isBarrier = 1;
+    let isTerminator = 1;
+    let isReturn = 1;
+  }
+
+  def DRPS : A64I_BregImpl<0b0101, (outs), (ins), "drps", []> {
+    let Rn = 0b11111;
+    let isBarrier = 1;
+  }
+}
+
+def RETAlias : InstAlias<"ret", (RETx X30)>;
+
+
+//===----------------------------------------------------------------------===//
+// Address generation patterns
+//===----------------------------------------------------------------------===//
+
+// Primary method of address generation for the small/absolute memory model is
+// an ADRP/ADR pair:
+//     ADRP x0, some_variable
+//     ADD x0, x0, #:lo12:some_variable
+//
+// The load/store elision of the ADD is accomplished when selecting
+// addressing-modes. This just mops up the cases where that doesn't work and we
+// really need an address in some register.
+
+// This wrapper applies a LO12 modifier to the address. Otherwise we could just
+// use the same address.
+
+class ADRP_ADD<SDNode Wrapper, SDNode addrop>
+ : Pat<(Wrapper addrop:$Hi, addrop:$Lo12, (i32 imm)),
+       (ADDxxi_lsl0_s (ADRPxi addrop:$Hi), addrop:$Lo12)>;
+
+def : ADRP_ADD<A64WrapperSmall, tblockaddress>;
+def : ADRP_ADD<A64WrapperSmall, texternalsym>;
+def : ADRP_ADD<A64WrapperSmall, tglobaladdr>;
+def : ADRP_ADD<A64WrapperSmall, tglobaltlsaddr>;
+def : ADRP_ADD<A64WrapperSmall, tjumptable>;
+
+//===----------------------------------------------------------------------===//
+// GOT access patterns
+//===----------------------------------------------------------------------===//
+
+// FIXME: Wibble
+
+class GOTLoadSmall<SDNode addrfrag>
+  : Pat<(A64GOTLoad (A64WrapperSmall addrfrag:$Hi, addrfrag:$Lo12, 8)),
+        (LS64_LDR (ADRPxi addrfrag:$Hi), addrfrag:$Lo12)>;
+
+def : GOTLoadSmall<texternalsym>;
+def : GOTLoadSmall<tglobaladdr>;
+def : GOTLoadSmall<tglobaltlsaddr>;
+
+//===----------------------------------------------------------------------===//
+// Tail call handling
+//===----------------------------------------------------------------------===//
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [XSP] in {
+  def TC_RETURNdi
+    : PseudoInst<(outs), (ins i64imm:$dst, i32imm:$FPDiff),
+                 [(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff))]>;
+
+  def TC_RETURNxi
+    : PseudoInst<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff),
+                 [(AArch64tcret i64:$dst, (i32 timm:$FPDiff))]>;
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    Uses = [XSP] in {
+  def TAIL_Bimm : A64PseudoExpand<(outs), (ins bimm_target:$Label), [],
+                                  (Bimm bimm_target:$Label)>;
+
+  def TAIL_BRx : A64PseudoExpand<(outs), (ins tcGPR64:$Rd), [],
+                                 (BRx GPR64:$Rd)>;
+}
+
+
+def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+          (TC_RETURNdi texternalsym:$dst, imm:$FPDiff)>;
+
+//===----------------------------------------------------------------------===//
+// Thread local storage
+//===----------------------------------------------------------------------===//
+
+// This is a pseudo-instruction representing the ".tlsdesccall" directive in
+// assembly. Its effect is to insert an R_AARCH64_TLSDESC_CALL relocation at the
+// current location. It should always be immediately followed by a BLR
+// instruction, and is intended solely for relaxation by the linker.
+
+def : Pat<(A64threadpointer), (MRSxi 0xde82)>;
+
+def TLSDESCCALL : PseudoInst<(outs), (ins i64imm:$Lbl), []> {
+  let hasSideEffects = 1;
+}
+
+def TLSDESC_BLRx : PseudoInst<(outs), (ins GPR64:$Rn, i64imm:$Var),
+                            [(A64tlsdesc_blr i64:$Rn, tglobaltlsaddr:$Var)]> {
+  let isCall = 1;
+  let Defs = [X30];
+}
+
+def : Pat<(A64tlsdesc_blr i64:$Rn, texternalsym:$Var),
+          (TLSDESC_BLRx $Rn, texternalsym:$Var)>;
+
+//===----------------------------------------------------------------------===//
+// Bitfield patterns
+//===----------------------------------------------------------------------===//
+
+def bfi32_lsb_to_immr : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((32 - N->getZExtValue()) % 32, MVT::i64);
+}]>;
+
+def bfi64_lsb_to_immr : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((64 - N->getZExtValue()) % 64, MVT::i64);
+}]>;
+
+def bfi_width_to_imms : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() - 1, MVT::i64);
+}]>;
+
+
+// The simpler patterns deal with cases where no AND mask is actually needed
+// (either all bits are used or the low 32 bits are used).
+let AddedComplexity = 10 in {
+
+def : Pat<(A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS),
+           (BFIxxii $src, $Rn,
+                    (bfi64_lsb_to_immr (i64 imm:$ImmR)),
+                    (bfi_width_to_imms (i64 imm:$ImmS)))>;
+
+def : Pat<(A64Bfi i32:$src, i32:$Rn, imm:$ImmR, imm:$ImmS),
+          (BFIwwii $src, $Rn,
+                   (bfi32_lsb_to_immr (i64 imm:$ImmR)),
+                   (bfi_width_to_imms (i64 imm:$ImmS)))>;
+
+
+def : Pat<(and (A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS),
+               (i64 4294967295)),
+          (SUBREG_TO_REG (i64 0),
+                         (BFIwwii (EXTRACT_SUBREG $src, sub_32),
+                                  (EXTRACT_SUBREG $Rn, sub_32),
+                                  (bfi32_lsb_to_immr (i64 imm:$ImmR)),
+                                  (bfi_width_to_imms (i64 imm:$ImmS))),
+                         sub_32)>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous patterns
+//===----------------------------------------------------------------------===//
+
+// Truncation from 64 to 32-bits just involves renaming your register.
+def : Pat<(i32 (trunc i64:$val)), (EXTRACT_SUBREG $val, sub_32)>;
+
+// Similarly, extension where we don't care about the high bits is
+// just a rename.
+def : Pat<(i64 (anyext i32:$val)),
+          (INSERT_SUBREG (IMPLICIT_DEF), $val, sub_32)>;
+
+// SELECT instructions providing f128 types need to be handled by a
+// pseudo-instruction since the eventual code will need to introduce basic
+// blocks and control flow.
+def F128CSEL : PseudoInst<(outs FPR128:$Rd),
+                         (ins FPR128:$Rn, FPR128:$Rm, cond_code_op:$Cond),
+                         [(set f128:$Rd, (simple_select f128:$Rn, f128:$Rm))]> {
+  let Uses = [NZCV];
+  let usesCustomInserter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Load/store patterns
+//===----------------------------------------------------------------------===//
+
+// There are lots of patterns here, because we need to allow at least three
+// parameters to vary independently.
+//   1. Instruction: "ldrb w9, [sp]", "ldrh w9, [sp]", ...
+//   2. LLVM source: zextloadi8, anyextloadi8, ...
+//   3. Address-generation: A64Wrapper, (add BASE, OFFSET), ...
+//
+// The biggest problem turns out to be the address-generation variable. At the
+// point of instantiation we need to produce two DAGs, one for the pattern and
+// one for the instruction. Doing this at the lowest level of classes doesn't
+// work.
+//
+// Consider the simple uimm12 addressing mode, and the desire to match both (add
+// GPR64xsp:$Rn, uimm12:$Offset) and GPR64xsp:$Rn, particularly on the
+// instruction side. We'd need to insert either "GPR64xsp" and "uimm12" or
+// "GPR64xsp" and "0" into an unknown dag. !subst is not capable of this
+// operation, and PatFrags are for selection not output.
+//
+// As a result, the address-generation patterns are the final
+// instantiations. However, we do still need to vary the operand for the address
+// further down (At the point we're deciding A64WrapperSmall, we don't know
+// the memory width of the operation).
+
+//===------------------------------
+// 1. Basic infrastructural defs
+//===------------------------------
+
+// First, some simple classes for !foreach and !subst to use:
+class Decls {
+  dag pattern;
+}
+
+def decls : Decls;
+def ALIGN;
+def INST;
+def OFFSET;
+def SHIFT;
+
+// You can't use !subst on an actual immediate, but you *can* use it on an
+// operand record that happens to match a single immediate. So we do.
+def imm_eq0 : ImmLeaf<i64, [{ return Imm == 0; }]>;
+def imm_eq1 : ImmLeaf<i64, [{ return Imm == 1; }]>;
+def imm_eq2 : ImmLeaf<i64, [{ return Imm == 2; }]>;
+def imm_eq3 : ImmLeaf<i64, [{ return Imm == 3; }]>;
+def imm_eq4 : ImmLeaf<i64, [{ return Imm == 4; }]>;
+
+// If the low bits of a pointer are known to be 0 then an "or" is just as good
+// as addition for computing an offset. This fragment forwards that check for
+// TableGen's use.
+def add_like_or : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),
+[{
+  return CurDAG->isBaseWithConstantOffset(SDValue(N, 0));
+}]>;
+
+// Load/store (unsigned immediate) operations with relocations against global
+// symbols (for lo12) are only valid if those symbols have correct alignment
+// (since the immediate offset is divided by the access scale, it can't have a
+// remainder).
+//
+// The guaranteed alignment is provided as part of the WrapperSmall
+// operation, and checked against one of these.
+def any_align   : ImmLeaf<i32, [{ (void)Imm; return true; }]>;
+def min_align2  : ImmLeaf<i32, [{ return Imm >= 2; }]>;
+def min_align4  : ImmLeaf<i32, [{ return Imm >= 4; }]>;
+def min_align8  : ImmLeaf<i32, [{ return Imm >= 8; }]>;
+def min_align16 : ImmLeaf<i32, [{ return Imm >= 16; }]>;
+
+// "Normal" load/store instructions can be used on atomic operations, provided
+// the ordering parameter is at most "monotonic". Anything above that needs
+// special handling with acquire/release instructions.
+class simple_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  return cast<AtomicSDNode>(N)->getOrdering() <= Monotonic;
+}]>;
+
+def atomic_load_simple_i8  : simple_load<atomic_load_8>;
+def atomic_load_simple_i16 : simple_load<atomic_load_16>;
+def atomic_load_simple_i32 : simple_load<atomic_load_32>;
+def atomic_load_simple_i64 : simple_load<atomic_load_64>;
+
+class simple_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  return cast<AtomicSDNode>(N)->getOrdering() <= Monotonic;
+}]>;
+
+def atomic_store_simple_i8  : simple_store<atomic_store_8>;
+def atomic_store_simple_i16 : simple_store<atomic_store_16>;
+def atomic_store_simple_i32 : simple_store<atomic_store_32>;
+def atomic_store_simple_i64 : simple_store<atomic_store_64>;
+
+//===------------------------------
+// 2. UImm12 and SImm9
+//===------------------------------
+
+// These instructions have two operands providing the address so they can be
+// treated similarly for most purposes.
+
+//===------------------------------
+// 2.1 Base patterns covering extend/truncate semantics
+//===------------------------------
+
+// Atomic patterns can be shared between integer operations of all sizes, a
+// quick multiclass here allows reuse.
+multiclass ls_atomic_pats<Instruction LOAD, Instruction STORE, dag Base,
+                          dag Offset, dag address, ValueType transty,
+                          ValueType sty> {
+  def : Pat<(!cast<PatFrag>("atomic_load_simple_" # sty) address),
+            (LOAD Base, Offset)>;
+
+  def : Pat<(!cast<PatFrag>("atomic_store_simple_" # sty) address, transty:$Rt),
+            (STORE $Rt, Base, Offset)>;
+}
+
+// Instructions accessing a memory chunk smaller than a register (or, in a
+// pinch, the same size) have a characteristic set of patterns they want to
+// match: extending loads and truncating stores. This class deals with the
+// sign-neutral version of those patterns.
+//
+// It will be instantiated across multiple addressing-modes.
+multiclass ls_small_pats<Instruction LOAD, Instruction STORE,
+                         dag Base, dag Offset,
+                         dag address, ValueType sty>
+  : ls_atomic_pats<LOAD, STORE, Base, Offset, address, i32, sty> {
+  def : Pat<(!cast<SDNode>(zextload # sty) address), (LOAD Base, Offset)>;
+
+  def : Pat<(!cast<SDNode>(extload # sty) address), (LOAD Base, Offset)>;
+
+  // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit
+  // register was actually set.
+  def : Pat<(i64 (!cast<SDNode>(zextload # sty) address)),
+            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>;
+
+  def : Pat<(i64 (!cast<SDNode>(extload # sty) address)),
+            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>;
+
+  def : Pat<(!cast<SDNode>(truncstore # sty) i32:$Rt, address),
+            (STORE $Rt, Base, Offset)>;
+
+  // For truncating store from 64-bits, we have to manually tell LLVM to
+  // ignore the high bits of the x register.
+  def : Pat<(!cast<SDNode>(truncstore # sty) i64:$Rt, address),
+            (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>;
+}
+
+// Next come patterns for sign-extending loads.
+multiclass load_signed_pats<string T, string U, dag Base, dag Offset,
+                            dag address, ValueType sty> {
+  def : Pat<(i32 (!cast<SDNode>("sextload" # sty) address)),
+            (!cast<Instruction>("LDRS" # T # "w" # U) Base, Offset)>;
+
+  def : Pat<(i64 (!cast<SDNode>("sextload" # sty) address)),
+            (!cast<Instruction>("LDRS" # T # "x" # U) Base, Offset)>;
+
+}
+
+// and finally "natural-width" loads and stores come next.
+multiclass ls_neutral_pats<Instruction LOAD, Instruction STORE, dag Base,
+                           dag Offset, dag address, ValueType sty> {
+  def : Pat<(sty (load address)), (LOAD Base, Offset)>;
+  def : Pat<(store sty:$Rt, address), (STORE $Rt, Base, Offset)>;
+}
+
+// Integer operations also get atomic instructions to select for.
+multiclass ls_int_neutral_pats<Instruction LOAD, Instruction STORE, dag Base,
+                           dag Offset, dag address, ValueType sty>
+  : ls_neutral_pats<LOAD, STORE, Base, Offset, address, sty>,
+    ls_atomic_pats<LOAD, STORE, Base, Offset, address, sty, sty>;
+
+//===------------------------------
+// 2.2. Addressing-mode instantiations
+//===------------------------------
+
+multiclass uimm12_pats<dag address, dag Base, dag Offset> {
+  defm : ls_small_pats<LS8_LDR, LS8_STR, Base,
+                       !foreach(decls.pattern, Offset,
+                                !subst(OFFSET, byte_uimm12, decls.pattern)),
+                       !foreach(decls.pattern, address,
+                                !subst(OFFSET, byte_uimm12,
+                                !subst(ALIGN, any_align, decls.pattern))),
+                       i8>;
+  defm : ls_small_pats<LS16_LDR, LS16_STR, Base,
+                       !foreach(decls.pattern, Offset,
+                                !subst(OFFSET, hword_uimm12, decls.pattern)),
+                       !foreach(decls.pattern, address,
+                                !subst(OFFSET, hword_uimm12,
+                                !subst(ALIGN, min_align2, decls.pattern))),
+                       i16>;
+  defm : ls_small_pats<LS32_LDR, LS32_STR, Base,
+                       !foreach(decls.pattern, Offset,
+                                !subst(OFFSET, word_uimm12, decls.pattern)),
+                       !foreach(decls.pattern, address,
+                                !subst(OFFSET, word_uimm12,
+                                !subst(ALIGN, min_align4, decls.pattern))),
+                       i32>;
+
+  defm : ls_int_neutral_pats<LS32_LDR, LS32_STR, Base,
+                          !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, word_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, word_uimm12,
+                                   !subst(ALIGN, min_align4, decls.pattern))),
+                          i32>;
+
+  defm : ls_int_neutral_pats<LS64_LDR, LS64_STR, Base,
+                          !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, dword_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, dword_uimm12,
+                                   !subst(ALIGN, min_align8, decls.pattern))),
+                          i64>;
+
+  defm : ls_neutral_pats<LSFP16_LDR, LSFP16_STR, Base,
+                          !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, hword_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, hword_uimm12,
+                                   !subst(ALIGN, min_align2, decls.pattern))),
+                          f16>;
+
+  defm : ls_neutral_pats<LSFP32_LDR, LSFP32_STR, Base,
+                          !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, word_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, word_uimm12,
+                                   !subst(ALIGN, min_align4, decls.pattern))),
+                          f32>;
+
+  defm : ls_neutral_pats<LSFP64_LDR, LSFP64_STR, Base,
+                          !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, dword_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, dword_uimm12,
+                                   !subst(ALIGN, min_align8, decls.pattern))),
+                          f64>;
+
+  defm : ls_neutral_pats<LSFP128_LDR, LSFP128_STR, Base,
+                          !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, qword_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, qword_uimm12,
+                                   !subst(ALIGN, min_align16, decls.pattern))),
+                          f128>;
+
+  defm : load_signed_pats<"B", "", Base,
+                          !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, byte_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, byte_uimm12,
+                                   !subst(ALIGN, any_align, decls.pattern))),
+                          i8>;
+
+  defm : load_signed_pats<"H", "", Base,
+                          !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, hword_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, hword_uimm12,
+                                   !subst(ALIGN, min_align2, decls.pattern))),
+                          i16>;
+
+  def : Pat<(sextloadi32 !foreach(decls.pattern, address,
+                                  !subst(OFFSET, word_uimm12,
+                                  !subst(ALIGN, min_align4, decls.pattern)))),
+            (LDRSWx Base, !foreach(decls.pattern, Offset,
+                                  !subst(OFFSET, word_uimm12, decls.pattern)))>;
+}
+
+// Straightforward patterns of last resort: a pointer with or without an
+// appropriate offset.
+defm : uimm12_pats<(i64 i64:$Rn), (i64 i64:$Rn), (i64 0)>;
+defm : uimm12_pats<(add i64:$Rn, OFFSET:$UImm12),
+                   (i64 i64:$Rn), (i64 OFFSET:$UImm12)>;
+
+// The offset could be hidden behind an "or", of course:
+defm : uimm12_pats<(add_like_or i64:$Rn, OFFSET:$UImm12),
+                   (i64 i64:$Rn), (i64 OFFSET:$UImm12)>;
+
+// Global addresses under the small-absolute model should use these
+// instructions. There are ELF relocations specifically for it.
+defm : uimm12_pats<(A64WrapperSmall tglobaladdr:$Hi, tglobaladdr:$Lo12, ALIGN),
+                   (ADRPxi tglobaladdr:$Hi), (i64 tglobaladdr:$Lo12)>;
+
+defm : uimm12_pats<(A64WrapperSmall tglobaltlsaddr:$Hi, tglobaltlsaddr:$Lo12,
+                                    ALIGN),
+                   (ADRPxi tglobaltlsaddr:$Hi), (i64 tglobaltlsaddr:$Lo12)>;
+
+// External symbols that make it this far should also get standard relocations.
+defm : uimm12_pats<(A64WrapperSmall texternalsym:$Hi, texternalsym:$Lo12,
+                                    ALIGN),
+                   (ADRPxi texternalsym:$Hi), (i64 texternalsym:$Lo12)>;
+
+defm : uimm12_pats<(A64WrapperSmall tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
+                   (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
+
+// We also want to use uimm12 instructions for local variables at the moment.
+def tframeindex_XFORM : SDNodeXForm<frameindex, [{
+  int FI = cast<FrameIndexSDNode>(N)->getIndex();
+  return CurDAG->getTargetFrameIndex(FI, MVT::i64);
+}]>;
+
+defm : uimm12_pats<(i64 frameindex:$Rn),
+                   (tframeindex_XFORM tframeindex:$Rn), (i64 0)>;
+
+// These can be much simpler than uimm12 because we don't to change the operand
+// type (e.g. LDURB and LDURH take the same operands).
+multiclass simm9_pats<dag address, dag Base, dag Offset> {
+  defm : ls_small_pats<LS8_LDUR, LS8_STUR, Base, Offset, address, i8>;
+  defm : ls_small_pats<LS16_LDUR, LS16_STUR, Base, Offset, address, i16>;
+
+  defm : ls_int_neutral_pats<LS32_LDUR, LS32_STUR, Base, Offset, address, i32>;
+  defm : ls_int_neutral_pats<LS64_LDUR, LS64_STUR, Base, Offset, address, i64>;
+
+  defm : ls_neutral_pats<LSFP16_LDUR, LSFP16_STUR, Base, Offset, address, f16>;
+  defm : ls_neutral_pats<LSFP32_LDUR, LSFP32_STUR, Base, Offset, address, f32>;
+  defm : ls_neutral_pats<LSFP64_LDUR, LSFP64_STUR, Base, Offset, address, f64>;
+  defm : ls_neutral_pats<LSFP128_LDUR, LSFP128_STUR, Base, Offset, address,
+                         f128>;
+
+  def : Pat<(i64 (zextloadi32 address)),
+            (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>;
+
+  def : Pat<(truncstorei32 i64:$Rt, address),
+            (LS32_STUR (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>;
+
+  defm : load_signed_pats<"B", "_U", Base, Offset, address, i8>;
+  defm : load_signed_pats<"H", "_U", Base, Offset, address, i16>;
+  def : Pat<(sextloadi32 address), (LDURSWx Base, Offset)>;
+}
+
+defm : simm9_pats<(add i64:$Rn, simm9:$SImm9),
+                  (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>;
+
+defm : simm9_pats<(add_like_or i64:$Rn, simm9:$SImm9),
+                  (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>;
+
+
+//===------------------------------
+// 3. Register offset patterns
+//===------------------------------
+
+// Atomic patterns can be shared between integer operations of all sizes, a
+// quick multiclass here allows reuse.
+multiclass ro_atomic_pats<Instruction LOAD, Instruction STORE, dag Base,
+                          dag Offset, dag Extend, dag address,
+                          ValueType transty, ValueType sty> {
+  def : Pat<(!cast<PatFrag>("atomic_load_simple_" # sty) address),
+            (LOAD Base, Offset, Extend)>;
+
+  def : Pat<(!cast<PatFrag>("atomic_store_simple_" # sty) address, transty:$Rt),
+            (STORE $Rt, Base, Offset, Extend)>;
+}
+
+// The register offset instructions take three operands giving the instruction,
+// and have an annoying split between instructions where Rm is 32-bit and
+// 64-bit. So we need a special hierarchy to describe them. Other than that the
+// same operations should be supported as for simm9 and uimm12 addressing.
+
+multiclass ro_small_pats<Instruction LOAD, Instruction STORE,
+                         dag Base, dag Offset, dag Extend,
+                         dag address, ValueType sty>
+  : ro_atomic_pats<LOAD, STORE, Base, Offset, Extend, address, i32, sty> {
+  def : Pat<(!cast<SDNode>(zextload # sty) address),
+            (LOAD Base, Offset, Extend)>;
+
+  def : Pat<(!cast<SDNode>(extload # sty) address),
+            (LOAD Base, Offset, Extend)>;
+
+  // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit
+  // register was actually set.
+  def : Pat<(i64 (!cast<SDNode>(zextload # sty) address)),
+            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>;
+
+  def : Pat<(i64 (!cast<SDNode>(extload # sty) address)),
+            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>;
+
+  def : Pat<(!cast<SDNode>(truncstore # sty) i32:$Rt, address),
+            (STORE $Rt, Base, Offset, Extend)>;
+
+  // For truncating store from 64-bits, we have to manually tell LLVM to
+  // ignore the high bits of the x register.
+  def : Pat<(!cast<SDNode>(truncstore # sty) i64:$Rt, address),
+            (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset, Extend)>;
+
+}
+
+// Next come patterns for sign-extending loads.
+multiclass ro_signed_pats<string T, string Rm, dag Base, dag Offset, dag Extend,
+                          dag address, ValueType sty> {
+  def : Pat<(i32 (!cast<SDNode>("sextload" # sty) address)),
+            (!cast<Instruction>("LDRS" # T # "w_" # Rm # "_RegOffset")
+              Base, Offset, Extend)>;
+
+  def : Pat<(i64 (!cast<SDNode>("sextload" # sty) address)),
+            (!cast<Instruction>("LDRS" # T # "x_" # Rm # "_RegOffset")
+              Base, Offset, Extend)>;
+}
+
+// and finally "natural-width" loads and stores come next.
+multiclass ro_neutral_pats<Instruction LOAD, Instruction STORE,
+                           dag Base, dag Offset, dag Extend, dag address,
+                           ValueType sty> {
+  def : Pat<(sty (load address)), (LOAD Base, Offset, Extend)>;
+  def : Pat<(store sty:$Rt, address),
+            (STORE $Rt, Base, Offset, Extend)>;
+}
+
+multiclass ro_int_neutral_pats<Instruction LOAD, Instruction STORE,
+                               dag Base, dag Offset, dag Extend, dag address,
+                               ValueType sty>
+  : ro_neutral_pats<LOAD, STORE, Base, Offset, Extend, address, sty>,
+    ro_atomic_pats<LOAD, STORE, Base, Offset, Extend, address, sty, sty>;
+
+multiclass regoff_pats<string Rm, dag address, dag Base, dag Offset,
+                       dag Extend> {
+  defm : ro_small_pats<!cast<Instruction>("LS8_" # Rm # "_RegOffset_LDR"),
+                       !cast<Instruction>("LS8_" # Rm # "_RegOffset_STR"),
+                       Base, Offset, Extend,
+                       !foreach(decls.pattern, address,
+                                !subst(SHIFT, imm_eq0, decls.pattern)),
+                       i8>;
+  defm : ro_small_pats<!cast<Instruction>("LS16_" # Rm # "_RegOffset_LDR"),
+                       !cast<Instruction>("LS16_" # Rm # "_RegOffset_STR"),
+                       Base, Offset, Extend,
+                       !foreach(decls.pattern, address,
+                                !subst(SHIFT, imm_eq1, decls.pattern)),
+                       i16>;
+  defm : ro_small_pats<!cast<Instruction>("LS32_" # Rm # "_RegOffset_LDR"),
+                       !cast<Instruction>("LS32_" # Rm # "_RegOffset_STR"),
+                       Base, Offset, Extend,
+                       !foreach(decls.pattern, address,
+                                !subst(SHIFT, imm_eq2, decls.pattern)),
+                       i32>;
+
+  defm : ro_int_neutral_pats<
+                            !cast<Instruction>("LS32_" # Rm # "_RegOffset_LDR"),
+                            !cast<Instruction>("LS32_" # Rm # "_RegOffset_STR"),
+                            Base, Offset, Extend,
+                            !foreach(decls.pattern, address,
+                                     !subst(SHIFT, imm_eq2, decls.pattern)),
+                            i32>;
+
+  defm : ro_int_neutral_pats<
+                            !cast<Instruction>("LS64_" # Rm # "_RegOffset_LDR"),
+                            !cast<Instruction>("LS64_" # Rm # "_RegOffset_STR"),
+                            Base, Offset, Extend,
+                            !foreach(decls.pattern, address,
+                                     !subst(SHIFT, imm_eq3, decls.pattern)),
+                            i64>;
+
+  defm : ro_neutral_pats<!cast<Instruction>("LSFP16_" # Rm # "_RegOffset_LDR"),
+                         !cast<Instruction>("LSFP16_" # Rm # "_RegOffset_STR"),
+                         Base, Offset, Extend,
+                         !foreach(decls.pattern, address,
+                                  !subst(SHIFT, imm_eq1, decls.pattern)),
+                         f16>;
+
+  defm : ro_neutral_pats<!cast<Instruction>("LSFP32_" # Rm # "_RegOffset_LDR"),
+                         !cast<Instruction>("LSFP32_" # Rm # "_RegOffset_STR"),
+                         Base, Offset, Extend,
+                         !foreach(decls.pattern, address,
+                                  !subst(SHIFT, imm_eq2, decls.pattern)),
+                         f32>;
+
+  defm : ro_neutral_pats<!cast<Instruction>("LSFP64_" # Rm # "_RegOffset_LDR"),
+                         !cast<Instruction>("LSFP64_" # Rm # "_RegOffset_STR"),
+                         Base, Offset, Extend,
+                         !foreach(decls.pattern, address,
+                                  !subst(SHIFT, imm_eq3, decls.pattern)),
+                         f64>;
+
+  defm : ro_neutral_pats<!cast<Instruction>("LSFP128_" # Rm # "_RegOffset_LDR"),
+                         !cast<Instruction>("LSFP128_" # Rm # "_RegOffset_STR"),
+                         Base, Offset, Extend,
+                         !foreach(decls.pattern, address,
+                                  !subst(SHIFT, imm_eq4, decls.pattern)),
+                         f128>;
+
+  defm : ro_signed_pats<"B", Rm, Base, Offset, Extend,
+                        !foreach(decls.pattern, address,
+                                 !subst(SHIFT, imm_eq0, decls.pattern)),
+                        i8>;
+
+  defm : ro_signed_pats<"H", Rm, Base, Offset, Extend,
+                        !foreach(decls.pattern, address,
+                                 !subst(SHIFT, imm_eq1, decls.pattern)),
+                        i16>;
+
+  def : Pat<(sextloadi32 !foreach(decls.pattern, address,
+                                  !subst(SHIFT, imm_eq2, decls.pattern))),
+            (!cast<Instruction>("LDRSWx_" # Rm # "_RegOffset")
+              Base, Offset, Extend)>;
+}
+
+
+// Finally we're in a position to tell LLVM exactly what addresses are reachable
+// using register-offset instructions. Essentially a base plus a possibly
+// extended, possibly shifted (by access size) offset.
+
+defm : regoff_pats<"Wm", (add i64:$Rn, (sext i32:$Rm)),
+                   (i64 i64:$Rn), (i32 i32:$Rm), (i64 6)>;
+
+defm : regoff_pats<"Wm", (add i64:$Rn, (shl (sext i32:$Rm), SHIFT)),
+                   (i64 i64:$Rn), (i32 i32:$Rm), (i64 7)>;
+
+defm : regoff_pats<"Wm", (add i64:$Rn, (zext i32:$Rm)),
+                   (i64 i64:$Rn), (i32 i32:$Rm), (i64 2)>;
+
+defm : regoff_pats<"Wm", (add i64:$Rn, (shl (zext i32:$Rm), SHIFT)),
+                   (i64 i64:$Rn), (i32 i32:$Rm), (i64 3)>;
+
+defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm),
+                   (i64 i64:$Rn), (i64 i64:$Rm), (i64 2)>;
+
+defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)),
+                   (i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
new file mode 100644
index 000000000000..c96bf85a716c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -0,0 +1,140 @@
+//===-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower AArch64 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64AsmPrinter.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+
+MCOperand
+AArch64AsmPrinter::lowerSymbolOperand(const MachineOperand &MO,
+                                      const MCSymbol *Sym) const {
+  const MCExpr *Expr = 0;
+
+  Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, OutContext);
+
+  switch (MO.getTargetFlags()) {
+  case AArch64II::MO_GOT:
+    Expr = AArch64MCExpr::CreateGOT(Expr, OutContext);
+    break;
+  case AArch64II::MO_GOT_LO12:
+    Expr = AArch64MCExpr::CreateGOTLo12(Expr, OutContext);
+    break;
+  case AArch64II::MO_LO12:
+    Expr = AArch64MCExpr::CreateLo12(Expr, OutContext);
+    break;
+  case AArch64II::MO_DTPREL_G1:
+    Expr = AArch64MCExpr::CreateDTPREL_G1(Expr, OutContext);
+    break;
+  case AArch64II::MO_DTPREL_G0_NC:
+    Expr = AArch64MCExpr::CreateDTPREL_G0_NC(Expr, OutContext);
+    break;
+  case AArch64II::MO_GOTTPREL:
+    Expr = AArch64MCExpr::CreateGOTTPREL(Expr, OutContext);
+    break;
+  case AArch64II::MO_GOTTPREL_LO12:
+    Expr = AArch64MCExpr::CreateGOTTPRELLo12(Expr, OutContext);
+    break;
+  case AArch64II::MO_TLSDESC:
+    Expr = AArch64MCExpr::CreateTLSDesc(Expr, OutContext);
+    break;
+  case AArch64II::MO_TLSDESC_LO12:
+    Expr = AArch64MCExpr::CreateTLSDescLo12(Expr, OutContext);
+    break;
+  case AArch64II::MO_TPREL_G1:
+    Expr = AArch64MCExpr::CreateTPREL_G1(Expr, OutContext);
+    break;
+  case AArch64II::MO_TPREL_G0_NC:
+    Expr = AArch64MCExpr::CreateTPREL_G0_NC(Expr, OutContext);
+    break;
+  case AArch64II::MO_NO_FLAG:
+    // Expr is already correct
+    break;
+  default:
+    llvm_unreachable("Unexpected MachineOperand flag");
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(),
+                                                          OutContext),
+                                   OutContext);
+
+  return MCOperand::CreateExpr(Expr);
+}
+
+bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO,
+                                     MCOperand &MCOp) const {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    if (MO.isImplicit())
+      return false;
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    MCOp = MCOperand::CreateReg(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::CreateImm(MO.getImm());
+    break;
+  case MachineOperand::MO_BlockAddress:
+    MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal()));
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                                   MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    MCOp = lowerSymbolOperand(MO, GetJTISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    MCOp = lowerSymbolOperand(MO, GetCPISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_RegisterMask:
+    // Ignore call clobbers
+    return false;
+
+  }
+
+  return true;
+}
+
+void llvm::LowerAArch64MachineInstrToMCInst(const MachineInstr *MI,
+                                            MCInst &OutMI,
+                                            AArch64AsmPrinter &AP) {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    if (AP.lowerOperand(MO, MCOp))
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..f45d8f784f42
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -0,0 +1,18 @@
+//===-- AArch64MachineFuctionInfo.cpp - AArch64 machine function info -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file just contains the anchor for the AArch64MachineFunctionInfo to
+// force vtable emission.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void AArch64MachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
new file mode 100644
index 000000000000..33da54f97fda
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -0,0 +1,149 @@
+//=- AArch64MachineFuctionInfo.h - AArch64 machine function info -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares AArch64-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AARCH64MACHINEFUNCTIONINFO_H
+#define AARCH64MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// This class is derived from MachineFunctionInfo and contains private AArch64
+/// target-specific information for each MachineFunction.
+class AArch64MachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// Number of bytes of arguments this function has on the stack. If the callee
+  /// is expected to restore the argument stack this should be a multiple of 16,
+  /// all usable during a tail call.
+  ///
+  /// The alternative would forbid tail call optimisation in some cases: if we
+  /// want to transfer control from a function with 8-bytes of stack-argument
+  /// space to a function with 16-bytes then misalignment of this value would
+  /// make a stack adjustment necessary, which could not be undone by the
+  /// callee.
+  unsigned BytesInStackArgArea;
+
+  /// The number of bytes to restore to deallocate space for incoming
+  /// arguments. Canonically 0 in the C calling convention, but non-zero when
+  /// callee is expected to pop the args.
+  unsigned ArgumentStackToRestore;
+
+  /// If the stack needs to be adjusted on frame entry in two stages, this
+  /// records the size of the first adjustment just prior to storing
+  /// callee-saved registers. The callee-saved slots are addressed assuming
+  /// SP == <incoming-SP> - InitialStackAdjust.
+  unsigned InitialStackAdjust;
+
+  /// Number of local-dynamic TLS accesses.
+  unsigned NumLocalDynamics;
+
+  /// @see AArch64 Procedure Call Standard, B.3
+  ///
+  /// The Frame index of the area where LowerFormalArguments puts the
+  /// general-purpose registers that might contain variadic parameters.
+  int VariadicGPRIdx;
+
+  /// @see AArch64 Procedure Call Standard, B.3
+  ///
+  /// The size of the frame object used to store the general-purpose registers
+  /// which might contain variadic arguments. This is the offset from
+  /// VariadicGPRIdx to what's stored in __gr_top.
+  unsigned VariadicGPRSize;
+
+  /// @see AArch64 Procedure Call Standard, B.3
+  ///
+  /// The Frame index of the area where LowerFormalArguments puts the
+  /// floating-point registers that might contain variadic parameters.
+  int VariadicFPRIdx;
+
+  /// @see AArch64 Procedure Call Standard, B.3
+  ///
+  /// The size of the frame object used to store the floating-point registers
+  /// which might contain variadic arguments. This is the offset from
+  /// VariadicFPRIdx to what's stored in __vr_top.
+  unsigned VariadicFPRSize;
+
+  /// @see AArch64 Procedure Call Standard, B.3
+  ///
+  /// The Frame index of an object pointing just past the last known stacked
+  /// argument on entry to a variadic function. This goes into the __stack field
+  /// of the va_list type.
+  int VariadicStackIdx;
+
+  /// The offset of the frame pointer from the stack pointer on function
+  /// entry. This is expected to be negative.
+  int FramePointerOffset;
+
+public:
+  AArch64MachineFunctionInfo()
+    : BytesInStackArgArea(0),
+      ArgumentStackToRestore(0),
+      InitialStackAdjust(0),
+      NumLocalDynamics(0),
+      VariadicGPRIdx(0),
+      VariadicGPRSize(0),
+      VariadicFPRIdx(0),
+      VariadicFPRSize(0),
+      VariadicStackIdx(0),
+      FramePointerOffset(0) {}
+
+  explicit AArch64MachineFunctionInfo(MachineFunction &MF)
+    : BytesInStackArgArea(0),
+      ArgumentStackToRestore(0),
+      InitialStackAdjust(0),
+      NumLocalDynamics(0),
+      VariadicGPRIdx(0),
+      VariadicGPRSize(0),
+      VariadicFPRIdx(0),
+      VariadicFPRSize(0),
+      VariadicStackIdx(0),
+      FramePointerOffset(0) {}
+
+  unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
+  void setBytesInStackArgArea (unsigned bytes) { BytesInStackArgArea = bytes;}
+
+  unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
+  void setArgumentStackToRestore(unsigned bytes) {
+    ArgumentStackToRestore = bytes;
+  }
+
+  unsigned getInitialStackAdjust() const { return InitialStackAdjust; }
+  void setInitialStackAdjust(unsigned bytes) { InitialStackAdjust = bytes; }
+
+  unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
+  int getVariadicGPRIdx() const { return VariadicGPRIdx; }
+  void setVariadicGPRIdx(int Idx) { VariadicGPRIdx = Idx; }
+
+  unsigned getVariadicGPRSize() const { return VariadicGPRSize; }
+  void setVariadicGPRSize(unsigned Size) { VariadicGPRSize = Size; }
+
+  int getVariadicFPRIdx() const { return VariadicFPRIdx; }
+  void setVariadicFPRIdx(int Idx) { VariadicFPRIdx = Idx; }
+
+  unsigned getVariadicFPRSize() const { return VariadicFPRSize; }
+  void setVariadicFPRSize(unsigned Size) { VariadicFPRSize = Size; }
+
+  int getVariadicStackIdx() const { return VariadicStackIdx; }
+  void setVariadicStackIdx(int Idx) { VariadicStackIdx = Idx; }
+
+  int getFramePointerOffset() const { return FramePointerOffset; }
+  void setFramePointerOffset(int Idx) { FramePointerOffset = Idx; }
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
new file mode 100644
index 000000000000..20b0dcf86f46
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -0,0 +1,171 @@
+//===- AArch64RegisterInfo.cpp - AArch64 Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AArch64RegisterInfo.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/ADT/BitVector.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "AArch64GenRegisterInfo.inc"
+
+using namespace llvm;
+
+AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo &tii,
+                                         const AArch64Subtarget &sti)
+  : AArch64GenRegisterInfo(AArch64::X30), TII(tii) {
+}
+
+const uint16_t *
+AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  return CSR_PCS_SaveList;
+}
+
+const uint32_t*
+AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID) const {
+  return CSR_PCS_RegMask;
+}
+
+const uint32_t *AArch64RegisterInfo::getTLSDescCallPreservedMask() const {
+  return TLSDesc_RegMask;
+}
+
+const TargetRegisterClass *
+AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &AArch64::FlagClassRegClass)
+    return &AArch64::GPR64RegClass;
+
+  return RC;
+}
+
+
+
+BitVector
+AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  Reserved.set(AArch64::XSP);
+  Reserved.set(AArch64::WSP);
+
+  Reserved.set(AArch64::XZR);
+  Reserved.set(AArch64::WZR);
+
+  if (TFI->hasFP(MF)) {
+    Reserved.set(AArch64::X29);
+    Reserved.set(AArch64::W29);
+  }
+
+  return Reserved;
+}
+
+void
+AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
+                                         int SPAdj,
+                                         unsigned FIOperandNum,
+                                         RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Cannot deal with nonzero SPAdj yet");
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64FrameLowering *TFI =
+   static_cast<const AArch64FrameLowering *>(MF.getTarget().getFrameLowering());
+
+  // In order to work out the base and offset for addressing, the FrameLowering
+  // code needs to know (sometimes) whether the instruction is storing/loading a
+  // callee-saved register, or whether it's a more generic
+  // operation. Fortunately the frame indices are used *only* for that purpose
+  // and are contiguous, so we can check here.
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  bool IsCalleeSaveOp = FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI;
+
+  unsigned FrameReg;
+  int64_t Offset;
+  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj,
+                                           IsCalleeSaveOp);
+
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  // DBG_VALUE instructions have no real restrictions so they can be handled
+  // easily.
+  if (MI.isDebugValue()) {
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/ false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  int MinOffset, MaxOffset, OffsetScale;
+  if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s) {
+    MinOffset = 0;
+    MaxOffset = 0xfff;
+    OffsetScale = 1;
+  } else {
+    // Load/store of a stack object
+    TII.getAddressConstraints(MI, OffsetScale, MinOffset, MaxOffset);
+  }
+
+  // The frame lowering has told us a base and offset it thinks we should use to
+  // access this variable, but it's still up to us to make sure the values are
+  // legal for the instruction in question.
+  if (Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) {
+    unsigned BaseReg =
+      MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+    emitRegUpdate(MBB, MBBI, MBBI->getDebugLoc(), TII,
+                  BaseReg, FrameReg, BaseReg, Offset);
+    FrameReg = BaseReg;
+    Offset = 0;
+  }
+
+  // Negative offsets are expected if we address from FP, but for
+  // now this checks nothing has gone horribly wrong.
+  assert(Offset >= 0 && "Unexpected negative offset from SP");
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, true);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale);
+}
+
+unsigned
+AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (TFI->hasFP(MF))
+    return AArch64::X29;
+  else
+    return AArch64::XSP;
+}
+
+bool
+AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const AArch64FrameLowering *AFI
+    = static_cast<const AArch64FrameLowering*>(TFI);
+  return AFI->useFPForAddressing(MF);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
new file mode 100644
index 000000000000..bb64fd55b2c3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -0,0 +1,76 @@
+//==- AArch64RegisterInfo.h - AArch64 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the MCRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AARCH64REGISTERINFO_H
+#define LLVM_TARGET_AARCH64REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "AArch64GenRegisterInfo.inc"
+
+namespace llvm {
+
+class AArch64InstrInfo;
+class AArch64Subtarget;
+
+struct AArch64RegisterInfo : public AArch64GenRegisterInfo {
+private:
+  const AArch64InstrInfo &TII;
+
+public:
+  AArch64RegisterInfo(const AArch64InstrInfo &tii,
+                      const AArch64Subtarget &sti);
+
+  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+
+  const uint32_t *getTLSDescCallPreservedMask() const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *Rs = NULL) const;
+
+  /// getCrossCopyRegClass - Returns a legal register class to copy a register
+  /// in the specified class to or from. Returns original class if it is
+  /// possible to copy between a two registers of the specified class.
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+
+  /// getLargestLegalSuperClass - Returns the largest super class of RC that is
+  /// legal to use in the current sub-target and has the same spill size.
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const {
+    if (RC == &AArch64::tcGPR64RegClass)
+      return &AArch64::GPR64RegClass;
+
+    return RC;
+  }
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const {
+    return true;
+  }
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const {
+    return true;
+  }
+
+  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_AARCH64REGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
new file mode 100644
index 000000000000..bd79546371c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -0,0 +1,203 @@
+//===- AArch64RegisterInfo.td - ARM Register defs ----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file contains declarations that describe the AArch64 register file
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "AArch64" in {
+def sub_128 : SubRegIndex;
+def sub_64 : SubRegIndex;
+def sub_32 : SubRegIndex;
+def sub_16 : SubRegIndex;
+def sub_8  : SubRegIndex;
+
+// The VPR registers are handled as sub-registers of FPR equivalents, but
+// they're really the same thing. We give this concept a special index.
+def sub_alias : SubRegIndex;
+}
+
+// Registers are identified with 5-bit ID numbers.
+class AArch64Reg<bits<16> enc, string n> : Register<n> {
+  let HWEncoding = enc;
+  let Namespace = "AArch64";
+}
+
+class AArch64RegWithSubs<bits<16> enc, string n, list<Register> subregs = [],
+                         list<SubRegIndex> inds = []>
+      : AArch64Reg<enc, n> {
+  let SubRegs = subregs;
+  let SubRegIndices = inds;
+}
+
+//===----------------------------------------------------------------------===//
+//  Integer registers: w0-w30, wzr, wsp, x0-x30, xzr, sp
+//===----------------------------------------------------------------------===//
+
+foreach Index = 0-30 in {
+  def W#Index : AArch64Reg< Index, "w"#Index>, DwarfRegNum<[Index]>;
+}
+
+def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>;
+def WZR : AArch64Reg<31, "wzr">;
+
+// Could be combined with previous loop, but this way leaves w and x registers
+// consecutive as LLVM register numbers, which makes for easier debugging.
+foreach Index = 0-30 in {
+  def X#Index : AArch64RegWithSubs<Index, "x"#Index,
+                                   [!cast<Register>("W"#Index)], [sub_32]>,
+                DwarfRegNum<[Index]>;
+}
+
+def XSP : AArch64RegWithSubs<31, "sp", [WSP], [sub_32]>, DwarfRegNum<[31]>;
+def XZR : AArch64RegWithSubs<31, "xzr", [WZR], [sub_32]>;
+
+// Most instructions treat register 31 as zero for reads and a black-hole for
+// writes.
+
+// Note that the order of registers is important for the Disassembler here:
+// tablegen uses it to form MCRegisterClass::getRegister, which we assume can
+// take an encoding value.
+def GPR32 : RegisterClass<"AArch64", [i32], 32,
+                          (add (sequence "W%u", 0, 30), WZR)> {
+}
+
+def GPR64 : RegisterClass<"AArch64", [i64], 64,
+                          (add (sequence "X%u", 0, 30), XZR)> {
+}
+
+def GPR32nowzr : RegisterClass<"AArch64", [i32], 32,
+                               (sequence "W%u", 0, 30)> {
+}
+
+def GPR64noxzr : RegisterClass<"AArch64", [i64], 64,
+                               (sequence "X%u", 0, 30)> {
+}
+
+// For tail calls, we can't use callee-saved registers or the structure-return
+// register, as they are supposed to be live across function calls and may be
+// clobbered by the epilogue.
+def tcGPR64 : RegisterClass<"AArch64", [i64], 64,
+                            (add (sequence "X%u", 0, 7),
+                                 (sequence "X%u", 9, 18))> {
+}
+
+
+// Certain addressing-useful instructions accept sp directly. Again the order of
+// registers is important to the Disassembler.
+def GPR32wsp : RegisterClass<"AArch64", [i32], 32,
+                             (add (sequence "W%u", 0, 30), WSP)> {
+}
+
+def GPR64xsp : RegisterClass<"AArch64", [i64], 64,
+                             (add (sequence "X%u", 0, 30), XSP)> {
+}
+
+// Some aliases *only* apply to SP (e.g. MOV uses different encoding for SP and
+// non-SP variants). We can't use a bare register in those patterns because
+// TableGen doesn't like it, so we need a class containing just stack registers
+def Rxsp : RegisterClass<"AArch64", [i64], 64,
+                         (add XSP)> {
+}
+
+def Rwsp : RegisterClass<"AArch64", [i32], 32,
+                         (add WSP)> {
+}
+
+//===----------------------------------------------------------------------===//
+//  Scalar registers in the vector unit:
+//  b0-b31, h0-h31, s0-s31, d0-d31, q0-q31
+//===----------------------------------------------------------------------===//
+
+foreach Index = 0-31 in {
+  def B # Index : AArch64Reg< Index, "b" # Index>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+
+  def H # Index : AArch64RegWithSubs<Index, "h" # Index,
+                                     [!cast<Register>("B" # Index)], [sub_8]>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+
+  def S # Index : AArch64RegWithSubs<Index, "s" # Index,
+                                     [!cast<Register>("H" # Index)], [sub_16]>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+
+  def D # Index : AArch64RegWithSubs<Index, "d" # Index,
+                                     [!cast<Register>("S" # Index)], [sub_32]>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+
+  def Q # Index : AArch64RegWithSubs<Index, "q" # Index,
+                                     [!cast<Register>("D" # Index)], [sub_64]>,
+                  DwarfRegNum<[!add(Index, 64)]>;
+}
+
+
+def FPR8 : RegisterClass<"AArch64", [i8], 8,
+                          (sequence "B%u", 0, 31)> {
+}
+
+def FPR16 : RegisterClass<"AArch64", [f16], 16,
+                          (sequence "H%u", 0, 31)> {
+}
+
+def FPR32 : RegisterClass<"AArch64", [f32], 32,
+                          (sequence "S%u", 0, 31)> {
+}
+
+def FPR64 : RegisterClass<"AArch64", [f64], 64,
+                          (sequence "D%u", 0, 31)> {
+}
+
+def FPR128 : RegisterClass<"AArch64", [f128], 128,
+                          (sequence "Q%u", 0, 31)> {
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Vector registers:
+//===----------------------------------------------------------------------===//
+
+// NEON registers simply specify the overall vector, and it's expected that
+// Instructions will individually specify the acceptable data layout. In
+// principle this leaves two approaches open:
+//   + An operand, giving a single ADDvvv instruction (for example). This turns
+//     out to be unworkable in the assembly parser (without every Instruction
+//     having a "cvt" function, at least) because the constraints can't be
+//     properly enforced. It also complicates specifying patterns since each
+//     instruction will accept many types.
+//  + A bare token (e.g. ".2d"). This means the AsmParser has to know specific
+//    details about NEON registers, but simplifies most other details.
+//
+// The second approach was taken.
+
+foreach Index = 0-31 in {
+  def V # Index  : AArch64RegWithSubs<Index, "v" # Index,
+                                      [!cast<Register>("Q" # Index)],
+                                      [sub_alias]>,
+            DwarfRegNum<[!add(Index, 64)]>;
+}
+
+// These two classes contain the same registers, which should be reasonably
+// sensible for MC and allocation purposes, but allows them to be treated
+// separately for things like stack spilling.
+def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8], 64,
+                          (sequence "V%u", 0, 31)>;
+
+def VPR128 : RegisterClass<"AArch64",
+                           [v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], 128,
+                           (sequence "V%u", 0, 31)>;
+
+// Flags register
+def NZCV : Register<"nzcv"> {
+  let Namespace = "AArch64";
+}
+
+def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+  let CopyCost = -1;
+  let isAllocatable = 0;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td
new file mode 100644
index 000000000000..e17cdaa1f6d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Schedule.td
@@ -0,0 +1,10 @@
+//===- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def GenericItineraries : ProcessorItineraries<[], [], []>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
new file mode 100644
index 000000000000..6bbe075a1b61
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -0,0 +1,25 @@
+//===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-selectiondag-info"
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+using namespace llvm;
+
+AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const AArch64TargetMachine &TM)
+  : TargetSelectionDAGInfo(TM),
+    Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
+}
+
+AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
new file mode 100644
index 000000000000..d412ed2be180
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -0,0 +1,32 @@
+//===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AArch64 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64SELECTIONDAGINFO_H
+#define LLVM_AARCH64SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class AArch64TargetMachine;
+
+class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo {
+  const AArch64Subtarget *Subtarget;
+public:
+  explicit AArch64SelectionDAGInfo(const AArch64TargetMachine &TM);
+  ~AArch64SelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
new file mode 100644
index 000000000000..d17b73820994
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -0,0 +1,43 @@
+//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64Subtarget.h"
+#include "AArch64RegisterInfo.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/SmallVector.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "AArch64GenSubtargetInfo.inc"
+
+using namespace llvm;
+
+AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS)
+  : AArch64GenSubtargetInfo(TT, CPU, FS)
+  , HasNEON(true)
+  , HasCrypto(true)
+  , TargetTriple(TT) {
+
+  ParseSubtargetFeatures(CPU, FS);
+}
+
+bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV,
+                                          Reloc::Model RelocM) const {
+  if (RelocM == Reloc::Static)
+    return false;
+
+  return !GV->hasLocalLinkage() && !GV->hasHiddenVisibility();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
new file mode 100644
index 000000000000..2e9205fc9924
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -0,0 +1,54 @@
+//==-- AArch64Subtarget.h - Define Subtarget for the AArch64 ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AArch64 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AARCH64_SUBTARGET_H
+#define LLVM_TARGET_AARCH64_SUBTARGET_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AArch64GenSubtargetInfo.inc"
+
+#include <string>
+
+namespace llvm {
+class StringRef;
+class GlobalValue;
+
+class AArch64Subtarget : public AArch64GenSubtargetInfo {
+protected:
+  bool HasNEON;
+  bool HasCrypto;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+  bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+
+};
+} // End llvm namespace
+
+#endif  // LLVM_TARGET_AARCH64_SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
new file mode 100644
index 000000000000..df599d599dd6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -0,0 +1,81 @@
+//===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the AArch64TargetMachine
+// methods. Principally just setting up the passes needed to generate correct
+// code on this architecture.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeAArch64Target() {
+  RegisterTargetMachine<AArch64TargetMachine> X(TheAArch64Target);
+}
+
+AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM, CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS),
+    InstrInfo(Subtarget),
+    DL("e-p:64:64-i64:64:64-i128:128:128-s0:32:32-f128:128:128-n32:64-S128"),
+    TLInfo(*this),
+    TSInfo(*this),
+    FrameLowering(Subtarget) {
+}
+
+namespace {
+/// AArch64 Code Generator Pass Configuration Options.
+class AArch64PassConfig : public TargetPassConfig {
+public:
+  AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  AArch64TargetMachine &getAArch64TargetMachine() const {
+    return getTM<AArch64TargetMachine>();
+  }
+
+  const AArch64Subtarget &getAArch64Subtarget() const {
+    return *getAArch64TargetMachine().getSubtargetImpl();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new AArch64PassConfig(this, PM);
+}
+
+bool AArch64PassConfig::addPreEmitPass() {
+  addPass(&UnpackMachineBundlesID);
+  addPass(createAArch64BranchFixupPass());
+  return true;
+}
+
+bool AArch64PassConfig::addInstSelector() {
+  addPass(createAArch64ISelDAG(getAArch64TargetMachine(), getOptLevel()));
+
+  // For ELF, cleanup any local-dynamic TLS accesses.
+  if (getAArch64Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+    addPass(createAArch64CleanupLocalDynamicTLSPass());
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
new file mode 100644
index 000000000000..c1f47c2e5372
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -0,0 +1,69 @@
+//=== AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the AArch64 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64TARGETMACHINE_H
+#define LLVM_AARCH64TARGETMACHINE_H
+
+#include "AArch64FrameLowering.h"
+#include "AArch64ISelLowering.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64SelectionDAGInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class AArch64TargetMachine : public LLVMTargetMachine {
+  AArch64Subtarget          Subtarget;
+  AArch64InstrInfo          InstrInfo;
+  const DataLayout          DL;
+  AArch64TargetLowering     TLInfo;
+  AArch64SelectionDAGInfo   TSInfo;
+  AArch64FrameLowering      FrameLowering;
+
+public:
+  AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+
+  const AArch64InstrInfo *getInstrInfo() const {
+    return &InstrInfo;
+  }
+
+  const AArch64FrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+
+  const AArch64TargetLowering *getTargetLowering() const {
+    return &TLInfo;
+  }
+
+  const AArch64SelectionDAGInfo *getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+
+  const AArch64Subtarget *getSubtargetImpl() const { return &Subtarget; }
+
+  const DataLayout *getDataLayout() const { return &DL; }
+
+  const TargetRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  TargetPassConfig *createPassConfig(PassManagerBase &PM);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
new file mode 100644
index 000000000000..b4452f514590
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -0,0 +1,24 @@
+//===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file deals with any AArch64 specific requirements on object files.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AArch64TargetObjectFile.h"
+
+using namespace llvm;
+
+void
+AArch64LinuxTargetObjectFile::Initialize(MCContext &Ctx,
+                                         const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
new file mode 100644
index 000000000000..bf0565a79ec8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -0,0 +1,31 @@
+//===-- AArch64TargetObjectFile.h - AArch64 Object Info ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file deals with any AArch64 specific requirements on object files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H
+#define LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+  /// AArch64LinuxTargetObjectFile - This implementation is used for linux
+  /// AArch64.
+  class AArch64LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+    virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
new file mode 100644
index 000000000000..69bb80a48537
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -0,0 +1,2197 @@
+//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the (GNU-style) assembly parser for the AArch64
+// architecture.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace {
+
+class AArch64Operand;
+
+class AArch64AsmParser : public MCTargetAsmParser {
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+
+#define GET_ASSEMBLER_HEADER
+#include "AArch64GenAsmMatcher.inc"
+
+public:
+  enum AArch64MatchResultTy {
+    Match_FirstAArch64 = FIRST_TARGET_MATCH_RESULT_TY,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "AArch64GenAsmMatcher.inc"
+  };
+
+  AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
+    : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+    MCAsmParserExtension::Initialize(_Parser);
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+  // These are the public interface of the MCTargetAsmParser
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  bool ParseDirective(AsmToken DirectiveID);
+  bool ParseDirectiveTLSDescCall(SMLoc L);
+  bool ParseDirectiveWord(unsigned Size, SMLoc L);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               MCStreamer&Out, unsigned &ErrorInfo,
+                               bool MatchingInlineAsm);
+
+  // The rest of the sub-parsers have more freedom over interface: they return
+  // an OperandMatchResultTy because it's less ambiguous than true/false or
+  // -1/0/1 even if it is more verbose
+  OperandMatchResultTy
+  ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+               StringRef Mnemonic);
+
+  OperandMatchResultTy ParseImmediate(const MCExpr *&ExprVal);
+
+  OperandMatchResultTy ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind);
+
+  OperandMatchResultTy
+  ParseNEONLane(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                uint32_t NumLanes);
+
+  OperandMatchResultTy
+  ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                uint32_t &NumLanes);
+
+  OperandMatchResultTy
+  ParseImmWithLSLOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  OperandMatchResultTy
+  ParseCondCodeOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  OperandMatchResultTy
+  ParseCRxOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  OperandMatchResultTy
+  ParseFPImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  template<typename SomeNamedImmMapper> OperandMatchResultTy
+  ParseNamedImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+    return ParseNamedImmOperand(SomeNamedImmMapper(), Operands);
+  }
+
+  OperandMatchResultTy
+  ParseNamedImmOperand(const NamedImmMapper &Mapper,
+                       SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  OperandMatchResultTy
+  ParseLSXAddressOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  OperandMatchResultTy
+  ParseShiftExtend(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  OperandMatchResultTy
+  ParseSysRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  bool validateInstruction(MCInst &Inst,
+                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  /// Scan the next token (which had better be an identifier) and determine
+  /// whether it represents a general-purpose or vector register. It returns
+  /// true if an identifier was found and populates its reference arguments. It
+  /// does not consume the token.
+  bool
+  IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, StringRef &LayoutSpec,
+                   SMLoc &LayoutLoc) const;
+
+};
+
+}
+
+namespace {
+
+/// Instances of this class represent a parsed AArch64 machine instruction.
+class AArch64Operand : public MCParsedAsmOperand {
+private:
+  enum KindTy {
+    k_ImmWithLSL,     // #uimm {, LSL #amt }
+    k_CondCode,       // eq/ne/...
+    k_FPImmediate,    // Limited-precision floating-point imm
+    k_Immediate,      // Including expressions referencing symbols
+    k_Register,
+    k_ShiftExtend,
+    k_SysReg,         // The register operand of MRS and MSR instructions
+    k_Token,          // The mnemonic; other raw tokens the auto-generated
+    k_WrappedRegister // Load/store exclusive permit a wrapped register.
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+
+  struct ImmWithLSLOp {
+    const MCExpr *Val;
+    unsigned ShiftAmount;
+    bool ImplicitAmount;
+  };
+
+  struct CondCodeOp {
+    A64CC::CondCodes Code;
+  };
+
+  struct FPImmOp {
+    double Val;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  struct ShiftExtendOp {
+    A64SE::ShiftExtSpecifiers ShiftType;
+    unsigned Amount;
+    bool ImplicitAmount;
+  };
+
+  struct SysRegOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  union {
+    struct ImmWithLSLOp ImmWithLSL;
+    struct CondCodeOp CondCode;
+    struct FPImmOp FPImm;
+    struct ImmOp Imm;
+    struct RegOp Reg;
+    struct ShiftExtendOp ShiftExtend;
+    struct SysRegOp SysReg;
+    struct TokOp Tok;
+  };
+
+  AArch64Operand(KindTy K, SMLoc S, SMLoc E)
+    : MCParsedAsmOperand(), Kind(K), StartLoc(S), EndLoc(E) {}
+
+public:
+  AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand() {
+  }
+
+  SMLoc getStartLoc() const { return StartLoc; }
+  SMLoc getEndLoc() const { return EndLoc; }
+  void print(raw_ostream&) const;
+  void dump() const;
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const {
+    assert((Kind == k_Register || Kind == k_WrappedRegister)
+           && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == k_Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  A64CC::CondCodes getCondCode() const {
+    assert(Kind == k_CondCode && "Invalid access!");
+    return CondCode.Code;
+  }
+
+  static bool isNonConstantExpr(const MCExpr *E,
+                                AArch64MCExpr::VariantKind &Variant) {
+    if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(E)) {
+      Variant = A64E->getKind();
+      return true;
+    } else if (!isa<MCConstantExpr>(E)) {
+      Variant = AArch64MCExpr::VK_AARCH64_None;
+      return true;
+    }
+
+    return false;
+  }
+
+  bool isCondCode() const { return Kind == k_CondCode; }
+  bool isToken() const { return Kind == k_Token; }
+  bool isReg() const { return Kind == k_Register; }
+  bool isImm() const { return Kind == k_Immediate; }
+  bool isMem() const { return false; }
+  bool isFPImm() const { return Kind == k_FPImmediate; }
+  bool isShiftOrExtend() const { return Kind == k_ShiftExtend; }
+  bool isSysReg() const { return Kind == k_SysReg; }
+  bool isImmWithLSL() const { return Kind == k_ImmWithLSL; }
+  bool isWrappedReg() const { return Kind == k_WrappedRegister; }
+
+  bool isAddSubImmLSL0() const {
+    if (!isImmWithLSL()) return false;
+    if (ImmWithLSL.ShiftAmount != 0) return false;
+
+    AArch64MCExpr::VariantKind Variant;
+    if (isNonConstantExpr(ImmWithLSL.Val, Variant)) {
+      return Variant == AArch64MCExpr::VK_AARCH64_LO12
+          || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12
+          || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC
+          || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12
+          || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC
+          || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC_LO12;
+    }
+
+    // Otherwise it should be a real immediate in range:
+    const MCConstantExpr *CE = cast<MCConstantExpr>(ImmWithLSL.Val);
+    return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+  }
+
+  bool isAddSubImmLSL12() const {
+    if (!isImmWithLSL()) return false;
+    if (ImmWithLSL.ShiftAmount != 12) return false;
+
+    AArch64MCExpr::VariantKind Variant;
+    if (isNonConstantExpr(ImmWithLSL.Val, Variant)) {
+      return Variant == AArch64MCExpr::VK_AARCH64_DTPREL_HI12
+          || Variant == AArch64MCExpr::VK_AARCH64_TPREL_HI12;
+    }
+
+    // Otherwise it should be a real immediate in range:
+    const MCConstantExpr *CE = cast<MCConstantExpr>(ImmWithLSL.Val);
+    return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+  }
+
+  template<unsigned MemSize, unsigned RmSize> bool isAddrRegExtend() const {
+    if (!isShiftOrExtend()) return false;
+
+    A64SE::ShiftExtSpecifiers Ext = ShiftExtend.ShiftType;
+    if (RmSize == 32 && !(Ext == A64SE::UXTW || Ext == A64SE::SXTW))
+      return false;
+
+    if (RmSize == 64 && !(Ext == A64SE::LSL || Ext == A64SE::SXTX))
+      return false;
+
+    return ShiftExtend.Amount == Log2_32(MemSize) || ShiftExtend.Amount == 0;
+  }
+
+  bool isAdrpLabel() const {
+    if (!isImm()) return false;
+
+    AArch64MCExpr::VariantKind Variant;
+    if (isNonConstantExpr(getImm(), Variant)) {
+      return Variant == AArch64MCExpr::VK_AARCH64_None
+        || Variant == AArch64MCExpr::VK_AARCH64_GOT
+        || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL
+        || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC;
+    }
+
+    return isLabel<21, 4096>();
+  }
+
+  template<unsigned RegWidth>  bool isBitfieldWidth() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+
+    return CE->getValue() >= 1 && CE->getValue() <= RegWidth;
+  }
+
+  template<int RegWidth>
+  bool isCVTFixedPos() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+
+    return CE->getValue() >= 1 && CE->getValue() <= RegWidth;
+  }
+
+  bool isFMOVImm() const {
+    if (!isFPImm()) return false;
+
+    APFloat RealVal(FPImm.Val);
+    uint32_t ImmVal;
+    return A64Imms::isFPImm(RealVal, ImmVal);
+  }
+
+  bool isFPZero() const {
+    if (!isFPImm()) return false;
+
+    APFloat RealVal(FPImm.Val);
+    return RealVal.isPosZero();
+  }
+
+  template<unsigned field_width, unsigned scale>
+  bool isLabel() const {
+    if (!isImm()) return false;
+
+    if (dyn_cast<MCSymbolRefExpr>(Imm.Val)) {
+      return true;
+    } else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Min = - (scale * (1LL << (field_width - 1)));
+      int64_t Max = scale * ((1LL << (field_width - 1)) - 1);
+      return (Val % scale) == 0 && Val >= Min && Val <= Max;
+    }
+
+    // N.b. this disallows explicit relocation specifications via an
+    // AArch64MCExpr. Users needing that behaviour
+    return false;
+  }
+
+  bool isLane1() const {
+    if (!isImm()) return false;
+
+    // Because it's come through custom assembly parsing, it must always be a
+    // constant expression.
+    return cast<MCConstantExpr>(getImm())->getValue() == 1;
+  }
+
+  bool isLoadLitLabel() const {
+    if (!isImm()) return false;
+
+    AArch64MCExpr::VariantKind Variant;
+    if (isNonConstantExpr(getImm(), Variant)) {
+      return Variant == AArch64MCExpr::VK_AARCH64_None
+          || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL;
+    }
+
+    return isLabel<19, 4>();
+  }
+
+  template<unsigned RegWidth> bool isLogicalImm() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+    if (!CE) return false;
+
+    uint32_t Bits;
+    return A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits);
+  }
+
+  template<unsigned RegWidth> bool isLogicalImmMOV() const {
+    if (!isLogicalImm<RegWidth>()) return false;
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(Imm.Val);
+
+    // The move alias for ORR is only valid if the immediate cannot be
+    // represented with a move (immediate) instruction; they take priority.
+    int UImm16, Shift;
+    return !A64Imms::isMOVZImm(RegWidth, CE->getValue(), UImm16, Shift)
+      && !A64Imms::isMOVNImm(RegWidth, CE->getValue(), UImm16, Shift);
+  }
+
+  template<int MemSize>
+  bool isOffsetUImm12() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+
+    // Assume they know what they're doing for now if they've given us a
+    // non-constant expression. In principle we could check for ridiculous
+    // things that can't possibly work or relocations that would almost
+    // certainly break resulting code.
+    if (!CE)
+      return true;
+
+    int64_t Val = CE->getValue();
+
+    // Must be a multiple of the access size in bytes.
+    if ((Val & (MemSize - 1)) != 0) return false;
+
+    // Must be 12-bit unsigned
+    return Val >= 0 && Val <= 0xfff * MemSize;
+  }
+
+  template<A64SE::ShiftExtSpecifiers SHKind, bool is64Bit>
+  bool isShift() const {
+    if (!isShiftOrExtend()) return false;
+
+    if (ShiftExtend.ShiftType != SHKind)
+      return false;
+
+    return is64Bit ? ShiftExtend.Amount <= 63 : ShiftExtend.Amount <= 31;
+  }
+
+  bool isMOVN32Imm() const {
+    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+      AArch64MCExpr::VK_AARCH64_SABS_G0,
+      AArch64MCExpr::VK_AARCH64_SABS_G1,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G1,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G0,
+      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
+      AArch64MCExpr::VK_AARCH64_TPREL_G1,
+      AArch64MCExpr::VK_AARCH64_TPREL_G0,
+    };
+    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+
+    return isMoveWideImm(32, PermittedModifiers, NumModifiers);
+  }
+
+  bool isMOVN64Imm() const {
+    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+      AArch64MCExpr::VK_AARCH64_SABS_G0,
+      AArch64MCExpr::VK_AARCH64_SABS_G1,
+      AArch64MCExpr::VK_AARCH64_SABS_G2,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G2,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G1,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G0,
+      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
+      AArch64MCExpr::VK_AARCH64_TPREL_G2,
+      AArch64MCExpr::VK_AARCH64_TPREL_G1,
+      AArch64MCExpr::VK_AARCH64_TPREL_G0,
+    };
+    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+
+    return isMoveWideImm(64, PermittedModifiers, NumModifiers);
+  }
+
+
+  bool isMOVZ32Imm() const {
+    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+      AArch64MCExpr::VK_AARCH64_ABS_G0,
+      AArch64MCExpr::VK_AARCH64_ABS_G1,
+      AArch64MCExpr::VK_AARCH64_SABS_G0,
+      AArch64MCExpr::VK_AARCH64_SABS_G1,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G1,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G0,
+      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
+      AArch64MCExpr::VK_AARCH64_TPREL_G1,
+      AArch64MCExpr::VK_AARCH64_TPREL_G0,
+    };
+    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+
+    return isMoveWideImm(32, PermittedModifiers, NumModifiers);
+  }
+
+  bool isMOVZ64Imm() const {
+    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+      AArch64MCExpr::VK_AARCH64_ABS_G0,
+      AArch64MCExpr::VK_AARCH64_ABS_G1,
+      AArch64MCExpr::VK_AARCH64_ABS_G2,
+      AArch64MCExpr::VK_AARCH64_ABS_G3,
+      AArch64MCExpr::VK_AARCH64_SABS_G0,
+      AArch64MCExpr::VK_AARCH64_SABS_G1,
+      AArch64MCExpr::VK_AARCH64_SABS_G2,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G2,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G1,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G0,
+      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
+      AArch64MCExpr::VK_AARCH64_TPREL_G2,
+      AArch64MCExpr::VK_AARCH64_TPREL_G1,
+      AArch64MCExpr::VK_AARCH64_TPREL_G0,
+    };
+    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+
+    return isMoveWideImm(64, PermittedModifiers, NumModifiers);
+  }
+
+  bool isMOVK32Imm() const {
+    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+      AArch64MCExpr::VK_AARCH64_ABS_G0_NC,
+      AArch64MCExpr::VK_AARCH64_ABS_G1_NC,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC,
+      AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC,
+      AArch64MCExpr::VK_AARCH64_TPREL_G1_NC,
+      AArch64MCExpr::VK_AARCH64_TPREL_G0_NC,
+    };
+    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+
+    return isMoveWideImm(32, PermittedModifiers, NumModifiers);
+  }
+
+  bool isMOVK64Imm() const {
+    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+      AArch64MCExpr::VK_AARCH64_ABS_G0_NC,
+      AArch64MCExpr::VK_AARCH64_ABS_G1_NC,
+      AArch64MCExpr::VK_AARCH64_ABS_G2_NC,
+      AArch64MCExpr::VK_AARCH64_ABS_G3,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC,
+      AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC,
+      AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC,
+      AArch64MCExpr::VK_AARCH64_TPREL_G1_NC,
+      AArch64MCExpr::VK_AARCH64_TPREL_G0_NC,
+    };
+    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+
+    return isMoveWideImm(64, PermittedModifiers, NumModifiers);
+  }
+
+  bool isMoveWideImm(unsigned RegWidth,
+                     AArch64MCExpr::VariantKind *PermittedModifiers,
+                     unsigned NumModifiers) const {
+    if (!isImmWithLSL()) return false;
+
+    if (ImmWithLSL.ShiftAmount % 16 != 0) return false;
+    if (ImmWithLSL.ShiftAmount >= RegWidth) return false;
+
+    AArch64MCExpr::VariantKind Modifier;
+    if (isNonConstantExpr(ImmWithLSL.Val, Modifier)) {
+      // E.g. "#:abs_g0:sym, lsl #16" makes no sense.
+      if (!ImmWithLSL.ImplicitAmount) return false;
+
+      for (unsigned i = 0; i < NumModifiers; ++i)
+        if (PermittedModifiers[i] == Modifier) return true;
+
+      return false;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmWithLSL.Val);
+    return CE && CE->getValue() >= 0  && CE->getValue() <= 0xffff;
+  }
+
+  template<int RegWidth, bool (*isValidImm)(int, uint64_t, int&, int&)>
+  bool isMoveWideMovAlias() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+
+    int UImm16, Shift;
+    uint64_t Value = CE->getValue();
+
+    // If this is a 32-bit instruction then all bits above 32 should be the
+    // same: either of these is fine because signed/unsigned values should be
+    // permitted.
+    if (RegWidth == 32) {
+      if ((Value >> 32) != 0 && (Value >> 32) != 0xffffffff)
+        return false;
+
+      Value &= 0xffffffffULL;
+    }
+
+    return isValidImm(RegWidth, Value, UImm16, Shift);
+  }
+
+  bool isMSRWithReg() const {
+    if (!isSysReg()) return false;
+
+    bool IsKnownRegister;
+    StringRef Name(SysReg.Data, SysReg.Length);
+    A64SysReg::MSRMapper().fromString(Name, IsKnownRegister);
+
+    return IsKnownRegister;
+  }
+
+  bool isMSRPState() const {
+    if (!isSysReg()) return false;
+
+    bool IsKnownRegister;
+    StringRef Name(SysReg.Data, SysReg.Length);
+    A64PState::PStateMapper().fromString(Name, IsKnownRegister);
+
+    return IsKnownRegister;
+  }
+
+  bool isMRS() const {
+    if (!isSysReg()) return false;
+
+    // First check against specific MSR-only (write-only) registers
+    bool IsKnownRegister;
+    StringRef Name(SysReg.Data, SysReg.Length);
+    A64SysReg::MRSMapper().fromString(Name, IsKnownRegister);
+
+    return IsKnownRegister;
+  }
+
+  bool isPRFM() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+
+    if (!CE)
+      return false;
+
+    return CE->getValue() >= 0 && CE->getValue() <= 31;
+  }
+
+  template<A64SE::ShiftExtSpecifiers SHKind> bool isRegExtend() const {
+    if (!isShiftOrExtend()) return false;
+
+    if (ShiftExtend.ShiftType != SHKind)
+      return false;
+
+    return ShiftExtend.Amount <= 4;
+  }
+
+  bool isRegExtendLSL() const {
+    if (!isShiftOrExtend()) return false;
+
+    if (ShiftExtend.ShiftType != A64SE::LSL)
+      return false;
+
+    return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4;
+  }
+
+  template<int MemSize>  bool isSImm7Scaled() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+
+    int64_t Val = CE->getValue();
+    if (Val % MemSize != 0) return false;
+
+    Val /= MemSize;
+
+    return Val >= -64 && Val < 64;
+  }
+
+  template<int BitWidth>
+  bool isSImm() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+
+    return CE->getValue() >= -(1LL << (BitWidth - 1))
+      && CE->getValue() < (1LL << (BitWidth - 1));
+  }
+
+  template<int bitWidth>
+  bool isUImm() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+
+    return CE->getValue() >= 0 && CE->getValue() < (1LL << bitWidth);
+  }
+
+  bool isUImm() const {
+    if (!isImm()) return false;
+
+    return isa<MCConstantExpr>(getImm());
+  }
+
+  static AArch64Operand *CreateImmWithLSL(const MCExpr *Val,
+                                          unsigned ShiftAmount,
+                                          bool ImplicitAmount,
+                                          SMLoc S, SMLoc E) {
+    AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E);
+    Op->ImmWithLSL.Val = Val;
+    Op->ImmWithLSL.ShiftAmount = ShiftAmount;
+    Op->ImmWithLSL.ImplicitAmount = ImplicitAmount;
+    return Op;
+  }
+
+  static AArch64Operand *CreateCondCode(A64CC::CondCodes Code,
+                                        SMLoc S, SMLoc E) {
+    AArch64Operand *Op = new AArch64Operand(k_CondCode, S, E);
+    Op->CondCode.Code = Code;
+    return Op;
+  }
+
+  static AArch64Operand *CreateFPImm(double Val,
+                                     SMLoc S, SMLoc E) {
+    AArch64Operand *Op = new AArch64Operand(k_FPImmediate, S, E);
+    Op->FPImm.Val = Val;
+    return Op;
+  }
+
+  static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    AArch64Operand *Op = new AArch64Operand(k_Immediate, S, E);
+    Op->Imm.Val = Val;
+    return Op;
+  }
+
+  static AArch64Operand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
+    AArch64Operand *Op = new AArch64Operand(k_Register, S, E);
+    Op->Reg.RegNum = RegNum;
+    return Op;
+  }
+
+  static AArch64Operand *CreateWrappedReg(unsigned RegNum, SMLoc S, SMLoc E) {
+    AArch64Operand *Op = new AArch64Operand(k_WrappedRegister, S, E);
+    Op->Reg.RegNum = RegNum;
+    return Op;
+  }
+
+  static AArch64Operand *CreateShiftExtend(A64SE::ShiftExtSpecifiers ShiftTyp,
+                                           unsigned Amount,
+                                           bool ImplicitAmount,
+                                           SMLoc S, SMLoc E) {
+    AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, S, E);
+    Op->ShiftExtend.ShiftType = ShiftTyp;
+    Op->ShiftExtend.Amount = Amount;
+    Op->ShiftExtend.ImplicitAmount = ImplicitAmount;
+    return Op;
+  }
+
+  static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S) {
+    AArch64Operand *Op = new AArch64Operand(k_SysReg, S, S);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    return Op;
+  }
+
+  static AArch64Operand *CreateToken(StringRef Str, SMLoc S) {
+    AArch64Operand *Op = new AArch64Operand(k_Token, S, S);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    return Op;
+  }
+
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  template<unsigned RegWidth>
+  void addBFILSBOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    unsigned EncodedVal = (RegWidth - CE->getValue()) % RegWidth;
+    Inst.addOperand(MCOperand::CreateImm(EncodedVal));
+  }
+
+  void addBFIWidthOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
+  }
+
+  void addBFXWidthOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    uint64_t LSB = Inst.getOperand(Inst.getNumOperands()-1).getImm();
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+
+    Inst.addOperand(MCOperand::CreateImm(LSB + CE->getValue() - 1));
+  }
+
+  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getCondCode()));
+  }
+
+  void addCVTFixedPosOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(64 - CE->getValue()));
+  }
+
+  void addFMOVImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    APFloat RealVal(FPImm.Val);
+    uint32_t ImmVal;
+    A64Imms::isFPImm(RealVal, ImmVal);
+
+    Inst.addOperand(MCOperand::CreateImm(ImmVal));
+  }
+
+  void addFPZeroOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands");
+    Inst.addOperand(MCOperand::CreateImm(0));
+  }
+
+  void addInvCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    unsigned Encoded = A64InvertCondCode(getCondCode());
+    Inst.addOperand(MCOperand::CreateImm(Encoded));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  template<int MemSize>
+  void addSImm7ScaledOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    uint64_t Val = CE->getValue() / MemSize;
+    Inst.addOperand(MCOperand::CreateImm(Val  & 0x7f));
+  }
+
+  template<int BitWidth>
+  void addSImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    uint64_t Val = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm(Val  & ((1ULL << BitWidth) - 1)));
+  }
+
+  void addImmWithLSLOperands(MCInst &Inst, unsigned N) const {
+    assert (N == 1 && "Invalid number of operands!");
+
+    addExpr(Inst, ImmWithLSL.Val);
+  }
+
+  template<unsigned field_width, unsigned scale>
+  void addLabelOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+
+    if (!CE) {
+      addExpr(Inst, Imm.Val);
+      return;
+    }
+
+    int64_t Val = CE->getValue();
+    assert(Val % scale == 0 && "Unaligned immediate in instruction");
+    Val /= scale;
+
+    Inst.addOperand(MCOperand::CreateImm(Val & ((1LL << field_width) - 1)));
+  }
+
+  template<int MemSize>
+  void addOffsetUImm12Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue() / MemSize));
+    } else {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+    }
+  }
+
+  template<unsigned RegWidth>
+  void addLogicalImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands");
+    const MCConstantExpr *CE = cast<MCConstantExpr>(Imm.Val);
+
+    uint32_t Bits;
+    A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits);
+
+    Inst.addOperand(MCOperand::CreateImm(Bits));
+  }
+
+  void addMRSOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    bool Valid;
+    StringRef Name(SysReg.Data, SysReg.Length);
+    uint32_t Bits = A64SysReg::MRSMapper().fromString(Name, Valid);
+
+    Inst.addOperand(MCOperand::CreateImm(Bits));
+  }
+
+  void addMSRWithRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    bool Valid;
+    StringRef Name(SysReg.Data, SysReg.Length);
+    uint32_t Bits = A64SysReg::MSRMapper().fromString(Name, Valid);
+
+    Inst.addOperand(MCOperand::CreateImm(Bits));
+  }
+
+  void addMSRPStateOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    bool Valid;
+    StringRef Name(SysReg.Data, SysReg.Length);
+    uint32_t Bits = A64PState::PStateMapper().fromString(Name, Valid);
+
+    Inst.addOperand(MCOperand::CreateImm(Bits));
+  }
+
+  void addMoveWideImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    addExpr(Inst, ImmWithLSL.Val);
+
+    AArch64MCExpr::VariantKind Variant;
+    if (!isNonConstantExpr(ImmWithLSL.Val, Variant)) {
+      Inst.addOperand(MCOperand::CreateImm(ImmWithLSL.ShiftAmount / 16));
+      return;
+    }
+
+    // We know it's relocated
+    switch (Variant) {
+    case AArch64MCExpr::VK_AARCH64_ABS_G0:
+    case AArch64MCExpr::VK_AARCH64_ABS_G0_NC:
+    case AArch64MCExpr::VK_AARCH64_SABS_G0:
+    case AArch64MCExpr::VK_AARCH64_DTPREL_G0:
+    case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC:
+    case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC:
+    case AArch64MCExpr::VK_AARCH64_TPREL_G0:
+    case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC:
+      Inst.addOperand(MCOperand::CreateImm(0));
+      break;
+    case AArch64MCExpr::VK_AARCH64_ABS_G1:
+    case AArch64MCExpr::VK_AARCH64_ABS_G1_NC:
+    case AArch64MCExpr::VK_AARCH64_SABS_G1:
+    case AArch64MCExpr::VK_AARCH64_DTPREL_G1:
+    case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC:
+    case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1:
+    case AArch64MCExpr::VK_AARCH64_TPREL_G1:
+    case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC:
+      Inst.addOperand(MCOperand::CreateImm(1));
+      break;
+    case AArch64MCExpr::VK_AARCH64_ABS_G2:
+    case AArch64MCExpr::VK_AARCH64_ABS_G2_NC:
+    case AArch64MCExpr::VK_AARCH64_SABS_G2:
+    case AArch64MCExpr::VK_AARCH64_DTPREL_G2:
+    case AArch64MCExpr::VK_AARCH64_TPREL_G2:
+      Inst.addOperand(MCOperand::CreateImm(2));
+      break;
+    case AArch64MCExpr::VK_AARCH64_ABS_G3:
+      Inst.addOperand(MCOperand::CreateImm(3));
+      break;
+    default: llvm_unreachable("Inappropriate move wide relocation");
+    }
+  }
+
+  template<int RegWidth, bool isValidImm(int, uint64_t, int&, int&)>
+  void addMoveWideMovAliasOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int UImm16, Shift;
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+
+    if (RegWidth == 32) {
+      Value &= 0xffffffffULL;
+    }
+
+    bool Valid = isValidImm(RegWidth, Value, UImm16, Shift);
+    (void)Valid;
+    assert(Valid && "Invalid immediates should have been weeded out by now");
+
+    Inst.addOperand(MCOperand::CreateImm(UImm16));
+    Inst.addOperand(MCOperand::CreateImm(Shift));
+  }
+
+  void addPRFMOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    assert(CE->getValue() >= 0 && CE->getValue() <= 31
+           && "PRFM operand should be 5-bits");
+
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+  }
+
+  // For Add-sub (extended register) operands.
+  void addRegExtendOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
+  }
+
+  // For the extend in load-store (register offset) instructions.
+  template<unsigned MemSize>
+  void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const {
+    addAddrRegExtendOperands(Inst, N, MemSize);
+  }
+
+  void addAddrRegExtendOperands(MCInst &Inst, unsigned N,
+                                unsigned MemSize) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    // First bit of Option is set in instruction classes, the high two bits are
+    // as follows:
+    unsigned OptionHi = 0;
+    switch (ShiftExtend.ShiftType) {
+    case A64SE::UXTW:
+    case A64SE::LSL:
+      OptionHi = 1;
+      break;
+    case A64SE::SXTW:
+    case A64SE::SXTX:
+      OptionHi = 3;
+      break;
+    default:
+      llvm_unreachable("Invalid extend type for register offset");
+    }
+
+    unsigned S = 0;
+    if (MemSize == 1 && !ShiftExtend.ImplicitAmount)
+      S = 1;
+    else if (MemSize != 1 && ShiftExtend.Amount != 0)
+      S = 1;
+
+    Inst.addOperand(MCOperand::CreateImm((OptionHi << 1) | S));
+  }
+  void addShiftOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
+  }
+};
+
+} // end anonymous namespace.
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               StringRef Mnemonic) {
+
+  // See if the operand has a custom parser
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+  // It could either succeed, fail or just not care.
+  if (ResTy != MatchOperand_NoMatch)
+    return ResTy;
+
+  switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return MatchOperand_ParseFail;
+  case AsmToken::Identifier: {
+    // It might be in the LSL/UXTB family ...
+    OperandMatchResultTy GotShift = ParseShiftExtend(Operands);
+
+    // We can only continue if no tokens were eaten.
+    if (GotShift != MatchOperand_NoMatch)
+      return GotShift;
+
+    // ... or it might be a register ...
+    uint32_t NumLanes = 0;
+    OperandMatchResultTy GotReg = ParseRegister(Operands, NumLanes);
+    assert(GotReg != MatchOperand_ParseFail
+           && "register parsing shouldn't partially succeed");
+
+    if (GotReg == MatchOperand_Success) {
+      if (Parser.getTok().is(AsmToken::LBrac))
+        return ParseNEONLane(Operands, NumLanes);
+      else
+        return MatchOperand_Success;
+    }
+
+    // ... or it might be a symbolish thing
+  }
+    // Fall through
+  case AsmToken::LParen:  // E.g. (strcmp-4)
+  case AsmToken::Integer: // 1f, 2b labels
+  case AsmToken::String:  // quoted labels
+  case AsmToken::Dot:     // . is Current location
+  case AsmToken::Dollar:  // $ is PC
+  case AsmToken::Colon: {
+    SMLoc StartLoc  = Parser.getTok().getLoc();
+    SMLoc EndLoc;
+    const MCExpr *ImmVal = 0;
+
+    if (ParseImmediate(ImmVal) != MatchOperand_Success)
+      return MatchOperand_ParseFail;
+
+    EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc));
+    return MatchOperand_Success;
+  }
+  case AsmToken::Hash: {   // Immediates
+    SMLoc StartLoc = Parser.getTok().getLoc();
+    SMLoc EndLoc;
+    const MCExpr *ImmVal = 0;
+    Parser.Lex();
+
+    if (ParseImmediate(ImmVal) != MatchOperand_Success)
+      return MatchOperand_ParseFail;
+
+    EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc));
+    return MatchOperand_Success;
+  }
+  case AsmToken::LBrac: {
+    SMLoc Loc = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateToken("[", Loc));
+    Parser.Lex(); // Eat '['
+
+    // There's no comma after a '[', so we can parse the next operand
+    // immediately.
+    return ParseOperand(Operands, Mnemonic);
+  }
+  // The following will likely be useful later, but not in very early cases
+  case AsmToken::LCurly:  // Weird SIMD lists
+    llvm_unreachable("Don't know how to deal with '{' in operand");
+    return MatchOperand_ParseFail;
+  }
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseImmediate(const MCExpr *&ExprVal) {
+  if (getLexer().is(AsmToken::Colon)) {
+    AArch64MCExpr::VariantKind RefKind;
+
+    OperandMatchResultTy ResTy = ParseRelocPrefix(RefKind);
+    if (ResTy != MatchOperand_Success)
+      return ResTy;
+
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return MatchOperand_ParseFail;
+
+    ExprVal = AArch64MCExpr::Create(RefKind, SubExprVal, getContext());
+    return MatchOperand_Success;
+  }
+
+  // No weird AArch64MCExpr prefix
+  return getParser().parseExpression(ExprVal)
+    ? MatchOperand_ParseFail : MatchOperand_Success;
+}
+
+// A lane attached to a NEON register. "[N]", which should yield three tokens:
+// '[', N, ']'. A hash is not allowed to precede the immediate here.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseNEONLane(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                                uint32_t NumLanes) {
+  SMLoc Loc = Parser.getTok().getLoc();
+
+  assert(Parser.getTok().is(AsmToken::LBrac) && "inappropriate operand");
+  Operands.push_back(AArch64Operand::CreateToken("[", Loc));
+  Parser.Lex(); // Eat '['
+
+  if (Parser.getTok().isNot(AsmToken::Integer)) {
+    Error(Parser.getTok().getLoc(), "expected lane number");
+    return MatchOperand_ParseFail;
+  }
+
+  if (Parser.getTok().getIntVal() >= NumLanes) {
+    Error(Parser.getTok().getLoc(), "lane number incompatible with layout");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCExpr *Lane = MCConstantExpr::Create(Parser.getTok().getIntVal(),
+                                              getContext());
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat actual lane
+  SMLoc E = Parser.getTok().getLoc();
+  Operands.push_back(AArch64Operand::CreateImm(Lane, S, E));
+
+
+  if (Parser.getTok().isNot(AsmToken::RBrac)) {
+    Error(Parser.getTok().getLoc(), "expected ']' after lane");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(AArch64Operand::CreateToken("]", Loc));
+  Parser.Lex(); // Eat ']'
+
+  return MatchOperand_Success;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind) {
+  assert(getLexer().is(AsmToken::Colon) && "expected a ':'");
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(),
+          "expected relocation specifier in operand after ':'");
+    return MatchOperand_ParseFail;
+  }
+
+  std::string LowerCase = Parser.getTok().getIdentifier().lower();
+  RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
+    .Case("got",              AArch64MCExpr::VK_AARCH64_GOT)
+    .Case("got_lo12",         AArch64MCExpr::VK_AARCH64_GOT_LO12)
+    .Case("lo12",             AArch64MCExpr::VK_AARCH64_LO12)
+    .Case("abs_g0",           AArch64MCExpr::VK_AARCH64_ABS_G0)
+    .Case("abs_g0_nc",        AArch64MCExpr::VK_AARCH64_ABS_G0_NC)
+    .Case("abs_g1",           AArch64MCExpr::VK_AARCH64_ABS_G1)
+    .Case("abs_g1_nc",        AArch64MCExpr::VK_AARCH64_ABS_G1_NC)
+    .Case("abs_g2",           AArch64MCExpr::VK_AARCH64_ABS_G2)
+    .Case("abs_g2_nc",        AArch64MCExpr::VK_AARCH64_ABS_G2_NC)
+    .Case("abs_g3",           AArch64MCExpr::VK_AARCH64_ABS_G3)
+    .Case("abs_g0_s",         AArch64MCExpr::VK_AARCH64_SABS_G0)
+    .Case("abs_g1_s",         AArch64MCExpr::VK_AARCH64_SABS_G1)
+    .Case("abs_g2_s",         AArch64MCExpr::VK_AARCH64_SABS_G2)
+    .Case("dtprel_g2",        AArch64MCExpr::VK_AARCH64_DTPREL_G2)
+    .Case("dtprel_g1",        AArch64MCExpr::VK_AARCH64_DTPREL_G1)
+    .Case("dtprel_g1_nc",     AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC)
+    .Case("dtprel_g0",        AArch64MCExpr::VK_AARCH64_DTPREL_G0)
+    .Case("dtprel_g0_nc",     AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC)
+    .Case("dtprel_hi12",      AArch64MCExpr::VK_AARCH64_DTPREL_HI12)
+    .Case("dtprel_lo12",      AArch64MCExpr::VK_AARCH64_DTPREL_LO12)
+    .Case("dtprel_lo12_nc",   AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC)
+    .Case("gottprel_g1",      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1)
+    .Case("gottprel_g0_nc",   AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC)
+    .Case("gottprel",         AArch64MCExpr::VK_AARCH64_GOTTPREL)
+    .Case("gottprel_lo12",    AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12)
+    .Case("tprel_g2",         AArch64MCExpr::VK_AARCH64_TPREL_G2)
+    .Case("tprel_g1",         AArch64MCExpr::VK_AARCH64_TPREL_G1)
+    .Case("tprel_g1_nc",      AArch64MCExpr::VK_AARCH64_TPREL_G1_NC)
+    .Case("tprel_g0",         AArch64MCExpr::VK_AARCH64_TPREL_G0)
+    .Case("tprel_g0_nc",      AArch64MCExpr::VK_AARCH64_TPREL_G0_NC)
+    .Case("tprel_hi12",       AArch64MCExpr::VK_AARCH64_TPREL_HI12)
+    .Case("tprel_lo12",       AArch64MCExpr::VK_AARCH64_TPREL_LO12)
+    .Case("tprel_lo12_nc",    AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC)
+    .Case("tlsdesc",          AArch64MCExpr::VK_AARCH64_TLSDESC)
+    .Case("tlsdesc_lo12",     AArch64MCExpr::VK_AARCH64_TLSDESC_LO12)
+    .Default(AArch64MCExpr::VK_AARCH64_None);
+
+  if (RefKind == AArch64MCExpr::VK_AARCH64_None) {
+    Error(Parser.getTok().getLoc(),
+          "expected relocation specifier in operand after ':'");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat identifier
+
+  if (getLexer().isNot(AsmToken::Colon)) {
+    Error(Parser.getTok().getLoc(),
+          "expected ':' after relocation specifier");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex();
+  return MatchOperand_Success;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseImmWithLSLOperand(
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // FIXME?: I want to live in a world where immediates must start with
+  // #. Please don't dash my hopes (well, do if you have a good reason).
+  if (Parser.getTok().isNot(AsmToken::Hash)) return MatchOperand_NoMatch;
+
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '#'
+
+  const MCExpr *Imm;
+  if (ParseImmediate(Imm) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+  else if (Parser.getTok().isNot(AsmToken::Comma)) {
+    SMLoc E = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, 0, true, S, E));
+    return MatchOperand_Success;
+  }
+
+  // Eat ','
+  Parser.Lex();
+
+  // The optional operand must be "lsl #N" where N is non-negative.
+  if (Parser.getTok().is(AsmToken::Identifier)
+      && Parser.getTok().getIdentifier().lower() == "lsl") {
+    Parser.Lex();
+
+    if (Parser.getTok().is(AsmToken::Hash)) {
+      Parser.Lex();
+
+      if (Parser.getTok().isNot(AsmToken::Integer)) {
+        Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+        return MatchOperand_ParseFail;
+      }
+    }
+  }
+
+  int64_t ShiftAmount = Parser.getTok().getIntVal();
+
+  if (ShiftAmount < 0) {
+    Error(Parser.getTok().getLoc(), "positive shift amount required");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat the number
+
+  SMLoc E = Parser.getTok().getLoc();
+  Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, ShiftAmount,
+                                                      false, S, E));
+  return MatchOperand_Success;
+}
+
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseCondCodeOperand(
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  if (Parser.getTok().isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  StringRef Tok = Parser.getTok().getIdentifier();
+  A64CC::CondCodes CondCode = A64StringToCondCode(Tok);
+
+  if (CondCode == A64CC::Invalid)
+    return MatchOperand_NoMatch;
+
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat condition code
+  SMLoc E = Parser.getTok().getLoc();
+
+  Operands.push_back(AArch64Operand::CreateCondCode(CondCode, S, E));
+  return MatchOperand_Success;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseCRxOperand(
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
+
+  std::string LowerTok = Parser.getTok().getIdentifier().lower();
+  StringRef Tok(LowerTok);
+  if (Tok[0] != 'c') {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
+
+  uint32_t CRNum;
+  bool BadNum = Tok.drop_front().getAsInteger(10, CRNum);
+  if (BadNum || CRNum > 15) {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
+
+  const MCExpr *CRImm = MCConstantExpr::Create(CRNum, getContext());
+
+  Parser.Lex();
+  SMLoc E = Parser.getTok().getLoc();
+
+  Operands.push_back(AArch64Operand::CreateImm(CRImm, S, E));
+  return MatchOperand_Success;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseFPImmOperand(
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+
+  // FIXME?: I want to live in a world where immediates must start with
+  // #. Please don't dash my hopes (well, do if you have a good reason).
+  if (Parser.getTok().isNot(AsmToken::Hash)) return MatchOperand_NoMatch;
+
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '#'
+
+  bool Negative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    Negative = true;
+    Parser.Lex(); // Eat '-'
+  } else if (Parser.getTok().is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat '+'
+  }
+
+  if (Parser.getTok().isNot(AsmToken::Real)) {
+    Error(S, "Expected floating-point immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  APFloat RealVal(APFloat::IEEEdouble, Parser.getTok().getString());
+  if (Negative) RealVal.changeSign();
+  double DblVal = RealVal.convertToDouble();
+
+  Parser.Lex(); // Eat real number
+  SMLoc E = Parser.getTok().getLoc();
+
+  Operands.push_back(AArch64Operand::CreateFPImm(DblVal, S, E));
+  return MatchOperand_Success;
+}
+
+
+// Automatically generated
+static unsigned MatchRegisterName(StringRef Name);
+
+bool
+AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
+                                   StringRef &Layout,
+                                   SMLoc &LayoutLoc) const {
+  const AsmToken &Tok = Parser.getTok();
+
+  if (Tok.isNot(AsmToken::Identifier))
+    return false;
+
+  std::string LowerReg = Tok.getString().lower();
+  size_t DotPos = LowerReg.find('.');
+
+  RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
+  if (RegNum == AArch64::NoRegister) {
+    RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
+      .Case("ip0", AArch64::X16)
+      .Case("ip1", AArch64::X17)
+      .Case("fp", AArch64::X29)
+      .Case("lr", AArch64::X30)
+      .Default(AArch64::NoRegister);
+  }
+  if (RegNum == AArch64::NoRegister)
+    return false;
+
+  SMLoc S = Tok.getLoc();
+  RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos);
+
+  if (DotPos == StringRef::npos) {
+    Layout = StringRef();
+  } else {
+    // Everything afterwards needs to be a literal token, expected to be
+    // '.2d','.b' etc for vector registers.
+
+    // This StringSwitch validates the input and (perhaps more importantly)
+    // gives us a permanent string to use in the token (a pointer into LowerReg
+    // would go out of scope when we return).
+    LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1);
+    std::string LayoutText = LowerReg.substr(DotPos, StringRef::npos);
+    Layout = StringSwitch<const char *>(LayoutText)
+      .Case(".d", ".d").Case(".1d", ".1d").Case(".2d", ".2d")
+      .Case(".s", ".s").Case(".2s", ".2s").Case(".4s", ".4s")
+      .Case(".h", ".h").Case(".4h", ".4h").Case(".8h", ".8h")
+      .Case(".b", ".b").Case(".8b", ".8b").Case(".16b", ".16b")
+      .Default("");
+
+    if (Layout.size() == 0) {
+      // Malformed register
+      return false;
+    }
+  }
+
+  return true;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                                uint32_t &NumLanes) {
+  unsigned RegNum;
+  StringRef Layout;
+  SMLoc RegEndLoc, LayoutLoc;
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(AArch64Operand::CreateReg(RegNum, S, RegEndLoc));
+
+  if (Layout.size() != 0) {
+    unsigned long long TmpLanes = 0;
+    llvm::getAsUnsignedInteger(Layout.substr(1), 10, TmpLanes);
+    if (TmpLanes != 0) {
+      NumLanes = TmpLanes;
+    } else {
+      // If the number of lanes isn't specified explicitly, a valid instruction
+      // will have an element specifier and be capable of acting on the entire
+      // vector register.
+      switch (Layout.back()) {
+      default: llvm_unreachable("Invalid layout specifier");
+      case 'b': NumLanes = 16; break;
+      case 'h': NumLanes = 8; break;
+      case 's': NumLanes = 4; break;
+      case 'd': NumLanes = 2; break;
+      }
+    }
+
+    Operands.push_back(AArch64Operand::CreateToken(Layout, LayoutLoc));
+  }
+
+  Parser.Lex();
+  return MatchOperand_Success;
+}
+
+bool
+AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                SMLoc &EndLoc) {
+  // This callback is used for things like DWARF frame directives in
+  // assembly. They don't care about things like NEON layouts or lanes, they
+  // just want to be able to produce the DWARF register number.
+  StringRef LayoutSpec;
+  SMLoc RegEndLoc, LayoutLoc;
+  StartLoc = Parser.getTok().getLoc();
+
+  if (!IdentifyRegister(RegNo, RegEndLoc, LayoutSpec, LayoutLoc))
+    return true;
+
+  Parser.Lex();
+  EndLoc = Parser.getTok().getLoc();
+
+  return false;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseNamedImmOperand(const NamedImmMapper &Mapper,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Since these operands occur in very limited circumstances, without
+  // alternatives, we actually signal an error if there is no match. If relaxing
+  // this, beware of unintended consequences: an immediate will be accepted
+  // during matching, no matter how it gets into the AArch64Operand.
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+
+  if (Tok.is(AsmToken::Identifier)) {
+    bool ValidName;
+    uint32_t Code = Mapper.fromString(Tok.getString().lower(), ValidName);
+
+    if (!ValidName) {
+      Error(S, "operand specifier not recognised");
+      return MatchOperand_ParseFail;
+    }
+
+    Parser.Lex(); // We're done with the identifier. Eat it
+
+    SMLoc E = Parser.getTok().getLoc();
+    const MCExpr *Imm = MCConstantExpr::Create(Code, getContext());
+    Operands.push_back(AArch64Operand::CreateImm(Imm, S, E));
+    return MatchOperand_Success;
+  } else if (Tok.is(AsmToken::Hash)) {
+    Parser.Lex();
+
+    const MCExpr *ImmVal;
+    if (ParseImmediate(ImmVal) != MatchOperand_Success)
+      return MatchOperand_ParseFail;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!CE || CE->getValue() < 0 || !Mapper.validImm(CE->getValue())) {
+      Error(S, "Invalid immediate for instruction");
+      return MatchOperand_ParseFail;
+    }
+
+    SMLoc E = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E));
+    return MatchOperand_Success;
+  }
+
+  Error(S, "unexpected operand for instruction");
+  return MatchOperand_ParseFail;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseSysRegOperand(
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+
+  // Any MSR/MRS operand will be an identifier, and we want to store it as some
+  // kind of string: SPSel is valid for two different forms of MSR with two
+  // different encodings. There's no collision at the moment, but the potential
+  // is there.
+  if (!Tok.is(AsmToken::Identifier)) {
+    return MatchOperand_NoMatch;
+  }
+
+  SMLoc S = Tok.getLoc();
+  Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), S));
+  Parser.Lex(); // Eat identifier
+
+  return MatchOperand_Success;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseLSXAddressOperand(
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+
+  unsigned RegNum;
+  SMLoc RegEndLoc, LayoutLoc;
+  StringRef Layout;
+  if(!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)
+     || !AArch64MCRegisterClasses[AArch64::GPR64xspRegClassID].contains(RegNum)
+     || Layout.size() != 0) {
+    // Check Layout.size because we don't want to let "x3.4s" or similar
+    // through.
+    return MatchOperand_NoMatch;
+  }
+  Parser.Lex(); // Eat register
+
+  if (Parser.getTok().is(AsmToken::RBrac)) {
+    // We're done
+    SMLoc E = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E));
+    return MatchOperand_Success;
+  }
+
+  // Otherwise, only ", #0" is valid
+
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Parser.getTok().getLoc(), "expected ',' or ']' after register");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat ','
+
+  if (Parser.getTok().isNot(AsmToken::Hash)) {
+    Error(Parser.getTok().getLoc(), "expected '#0'");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat '#'
+
+  if (Parser.getTok().isNot(AsmToken::Integer)
+      || Parser.getTok().getIntVal() != 0 ) {
+    Error(Parser.getTok().getLoc(), "expected '#0'");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat '0'
+
+  SMLoc E = Parser.getTok().getLoc();
+  Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E));
+  return MatchOperand_Success;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::ParseShiftExtend(
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  StringRef IDVal = Parser.getTok().getIdentifier();
+  std::string LowerID = IDVal.lower();
+
+  A64SE::ShiftExtSpecifiers Spec =
+    StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID)
+      .Case("lsl", A64SE::LSL)
+      .Case("lsr", A64SE::LSR)
+      .Case("asr", A64SE::ASR)
+      .Case("ror", A64SE::ROR)
+      .Case("uxtb", A64SE::UXTB)
+      .Case("uxth", A64SE::UXTH)
+      .Case("uxtw", A64SE::UXTW)
+      .Case("uxtx", A64SE::UXTX)
+      .Case("sxtb", A64SE::SXTB)
+      .Case("sxth", A64SE::SXTH)
+      .Case("sxtw", A64SE::SXTW)
+      .Case("sxtx", A64SE::SXTX)
+      .Default(A64SE::Invalid);
+
+  if (Spec == A64SE::Invalid)
+    return MatchOperand_NoMatch;
+
+  // Eat the shift
+  SMLoc S, E;
+  S = Parser.getTok().getLoc();
+  Parser.Lex();
+
+  if (Spec != A64SE::LSL && Spec != A64SE::LSR &&
+      Spec != A64SE::ASR && Spec != A64SE::ROR) {
+    // The shift amount can be omitted for the extending versions, but not real
+    // shifts:
+    //     add x0, x0, x0, uxtb
+    // is valid, and equivalent to
+    //     add x0, x0, x0, uxtb #0
+
+    if (Parser.getTok().is(AsmToken::Comma) ||
+        Parser.getTok().is(AsmToken::EndOfStatement) ||
+        Parser.getTok().is(AsmToken::RBrac)) {
+      Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, 0, true,
+                                                           S, E));
+      return MatchOperand_Success;
+    }
+  }
+
+  // Eat # at beginning of immediate
+  if (!Parser.getTok().is(AsmToken::Hash)) {
+    Error(Parser.getTok().getLoc(),
+          "expected #imm after shift specifier");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex();
+
+  // Make sure we do actually have a number
+  if (!Parser.getTok().is(AsmToken::Integer)) {
+    Error(Parser.getTok().getLoc(),
+          "expected integer shift amount");
+    return MatchOperand_ParseFail;
+  }
+  unsigned Amount = Parser.getTok().getIntVal();
+  Parser.Lex();
+  E = Parser.getTok().getLoc();
+
+  Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, Amount, false,
+                                                       S, E));
+
+  return MatchOperand_Success;
+}
+
+// FIXME: We would really like to be able to tablegen'erate this.
+bool AArch64AsmParser::
+validateInstruction(MCInst &Inst,
+                    const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  switch (Inst.getOpcode()) {
+  case AArch64::BFIwwii:
+  case AArch64::BFIxxii:
+  case AArch64::SBFIZwwii:
+  case AArch64::SBFIZxxii:
+  case AArch64::UBFIZwwii:
+  case AArch64::UBFIZxxii:  {
+    unsigned ImmOps = Inst.getNumOperands() - 2;
+    int64_t ImmR = Inst.getOperand(ImmOps).getImm();
+    int64_t ImmS = Inst.getOperand(ImmOps+1).getImm();
+
+    if (ImmR != 0 && ImmS >= ImmR) {
+      return Error(Operands[4]->getStartLoc(),
+                   "requested insert overflows register");
+    }
+    return false;
+  }
+  case AArch64::BFXILwwii:
+  case AArch64::BFXILxxii:
+  case AArch64::SBFXwwii:
+  case AArch64::SBFXxxii:
+  case AArch64::UBFXwwii:
+  case AArch64::UBFXxxii: {
+    unsigned ImmOps = Inst.getNumOperands() - 2;
+    int64_t ImmR = Inst.getOperand(ImmOps).getImm();
+    int64_t ImmS = Inst.getOperand(ImmOps+1).getImm();
+    int64_t RegWidth = 0;
+    switch (Inst.getOpcode()) {
+    case AArch64::SBFXxxii: case AArch64::UBFXxxii: case AArch64::BFXILxxii:
+      RegWidth = 64;
+      break;
+    case AArch64::SBFXwwii: case AArch64::UBFXwwii: case AArch64::BFXILwwii:
+      RegWidth = 32;
+      break;
+    }
+
+    if (ImmS >= RegWidth || ImmS < ImmR) {
+      return Error(Operands[4]->getStartLoc(),
+                   "requested extract overflows register");
+    }
+    return false;
+  }
+  case AArch64::ICix: {
+    int64_t ImmVal = Inst.getOperand(0).getImm();
+    A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal);
+    if (!A64IC::NeedsRegister(ICOp)) {
+      return Error(Operands[1]->getStartLoc(),
+                   "specified IC op does not use a register");
+    }
+    return false;
+  }
+  case AArch64::ICi: {
+    int64_t ImmVal = Inst.getOperand(0).getImm();
+    A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal);
+    if (A64IC::NeedsRegister(ICOp)) {
+      return Error(Operands[1]->getStartLoc(),
+                   "specified IC op requires a register");
+    }
+    return false;
+  }
+  case AArch64::TLBIix: {
+    int64_t ImmVal = Inst.getOperand(0).getImm();
+    A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal);
+    if (!A64TLBI::NeedsRegister(TLBIOp)) {
+      return Error(Operands[1]->getStartLoc(),
+                   "specified TLBI op does not use a register");
+    }
+    return false;
+  }
+  case AArch64::TLBIi: {
+    int64_t ImmVal = Inst.getOperand(0).getImm();
+    A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal);
+    if (A64TLBI::NeedsRegister(TLBIOp)) {
+      return Error(Operands[1]->getStartLoc(),
+                   "specified TLBI op requires a register");
+    }
+    return false;
+  }
+  }
+
+  return false;
+}
+
+
+// Parses the instruction *together with* all operands, appending each parsed
+// operand to the "Operands" list
+bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                        StringRef Name, SMLoc NameLoc,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  size_t CondCodePos = Name.find('.');
+
+  StringRef Mnemonic = Name.substr(0, CondCodePos);
+  Operands.push_back(AArch64Operand::CreateToken(Mnemonic, NameLoc));
+
+  if (CondCodePos != StringRef::npos) {
+    // We have a condition code
+    SMLoc S = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 1);
+    StringRef CondStr = Name.substr(CondCodePos + 1, StringRef::npos);
+    A64CC::CondCodes Code;
+
+    Code = A64StringToCondCode(CondStr);
+
+    if (Code == A64CC::Invalid) {
+      Error(S, "invalid condition code");
+      Parser.eatToEndOfStatement();
+      return true;
+    }
+
+    SMLoc DotL = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos);
+
+    Operands.push_back(AArch64Operand::CreateToken(".",  DotL));
+    SMLoc E = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 3);
+    Operands.push_back(AArch64Operand::CreateCondCode(Code, S, E));
+  }
+
+  // Now we parse the operands of this instruction
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (ParseOperand(Operands, Mnemonic)) {
+      Parser.eatToEndOfStatement();
+      return true;
+    }
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();  // Eat the comma.
+
+      // Parse and remember the operand.
+      if (ParseOperand(Operands, Mnemonic)) {
+        Parser.eatToEndOfStatement();
+        return true;
+      }
+
+
+      // After successfully parsing some operands there are two special cases to
+      // consider (i.e. notional operands not separated by commas). Both are due
+      // to memory specifiers:
+      //  + An RBrac will end an address for load/store/prefetch
+      //  + An '!' will indicate a pre-indexed operation.
+      //
+      // It's someone else's responsibility to make sure these tokens are sane
+      // in the given context!
+      if (Parser.getTok().is(AsmToken::RBrac)) {
+        SMLoc Loc = Parser.getTok().getLoc();
+        Operands.push_back(AArch64Operand::CreateToken("]", Loc));
+        Parser.Lex();
+      }
+
+      if (Parser.getTok().is(AsmToken::Exclaim)) {
+        SMLoc Loc = Parser.getTok().getLoc();
+        Operands.push_back(AArch64Operand::CreateToken("!", Loc));
+        Parser.Lex();
+      }
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, "expected comma before next operand");
+  }
+
+  // Eat the EndOfStatement
+  Parser.Lex();
+
+  return false;
+}
+
+bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".hword")
+    return ParseDirectiveWord(2, DirectiveID.getLoc());
+  else if (IDVal == ".word")
+    return ParseDirectiveWord(4, DirectiveID.getLoc());
+  else if (IDVal == ".xword")
+    return ParseDirectiveWord(8, DirectiveID.getLoc());
+  else if (IDVal == ".tlsdesccall")
+    return ParseDirectiveTLSDescCall(DirectiveID.getLoc());
+
+  return true;
+}
+
+/// parseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size, 0/*addrspace*/);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+// parseDirectiveTLSDescCall:
+//   ::= .tlsdesccall symbol
+bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return Error(L, "expected symbol after directive");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+  const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
+
+  MCInst Inst;
+  Inst.setOpcode(AArch64::TLSDESCCALL);
+  Inst.addOperand(MCOperand::CreateExpr(Expr));
+
+  getParser().getStreamer().EmitInstruction(Inst);
+  return false;
+}
+
+
+bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                 SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                                 MCStreamer &Out, unsigned &ErrorInfo,
+                                 bool MatchingInlineAsm) {
+  MCInst Inst;
+  unsigned MatchResult;
+  MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                     MatchingInlineAsm);
+
+  if (ErrorInfo != ~0U && ErrorInfo >= Operands.size())
+    return Error(IDLoc, "too few operands for instruction");
+
+  switch (MatchResult) {
+  default: break;
+  case Match_Success:
+    if (validateInstruction(Inst, Operands))
+      return true;
+
+    Out.EmitInstruction(Inst);
+    return false;
+  case Match_MissingFeature:
+    Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+    return true;
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      ErrorLoc = ((AArch64Operand*)Operands[ErrorInfo])->getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction");
+
+  case Match_AddSubRegExtendSmall:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+      "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]");
+  case Match_AddSubRegExtendLarge:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+      "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]");
+  case Match_AddSubRegShift32:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]");
+  case Match_AddSubRegShift64:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]");
+  case Match_AddSubSecondSource:
+      return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+          "expected compatible register, symbol or integer in range [0, 4095]");
+  case Match_CVTFixedPos32:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 32]");
+  case Match_CVTFixedPos64:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 64]");
+  case Match_CondCode:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected AArch64 condition code");
+  case Match_FPImm:
+    // Any situation which allows a nontrivial floating-point constant also
+    // allows a register.
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected compatible register or floating-point constant");
+  case Match_FPZero:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected floating-point constant #0.0");
+  case Match_Label:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected label or encodable integer pc offset");
+  case Match_Lane1:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected lane specifier '[1]'");
+  case Match_LoadStoreExtend32_1:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0");
+  case Match_LoadStoreExtend32_2:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1");
+  case Match_LoadStoreExtend32_4:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2");
+  case Match_LoadStoreExtend32_8:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3");
+  case Match_LoadStoreExtend32_16:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'lsl' or 'sxtw' with optional shift of #0 or #4");
+  case Match_LoadStoreExtend64_1:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'lsl' or 'sxtx' with optional shift of #0");
+  case Match_LoadStoreExtend64_2:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #1");
+  case Match_LoadStoreExtend64_4:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #2");
+  case Match_LoadStoreExtend64_8:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #3");
+  case Match_LoadStoreExtend64_16:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #4");
+  case Match_LoadStoreSImm7_4:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer multiple of 4 in range [-256, 252]");
+  case Match_LoadStoreSImm7_8:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer multiple of 8 in range [-512, 508]");
+  case Match_LoadStoreSImm7_16:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer multiple of 16 in range [-1024, 1016]");
+  case Match_LoadStoreSImm9:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [-256, 255]");
+  case Match_LoadStoreUImm12_1:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected symbolic reference or integer in range [0, 4095]");
+  case Match_LoadStoreUImm12_2:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected symbolic reference or integer in range [0, 8190]");
+  case Match_LoadStoreUImm12_4:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected symbolic reference or integer in range [0, 16380]");
+  case Match_LoadStoreUImm12_8:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected symbolic reference or integer in range [0, 32760]");
+  case Match_LoadStoreUImm12_16:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected symbolic reference or integer in range [0, 65520]");
+  case Match_LogicalSecondSource:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected compatible register or logical immediate");
+  case Match_MOVWUImm16:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected relocated symbol or integer in range [0, 65535]");
+  case Match_MRS:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected readable system register");
+  case Match_MSR:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected writable system register or pstate");
+  case Match_NamedImm_at:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                "expected symbolic 'at' operand: s1e[0-3][rw] or s12e[01][rw]");
+  case Match_NamedImm_dbarrier:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+             "expected integer in range [0, 15] or symbolic barrier operand");
+  case Match_NamedImm_dc:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected symbolic 'dc' operand");
+  case Match_NamedImm_ic:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected 'ic' operand: 'ialluis', 'iallu' or 'ivau'");
+  case Match_NamedImm_isb:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 15] or 'sy'");
+  case Match_NamedImm_prefetch:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected prefetch hint: p(ld|st|i)l[123](strm|keep)");
+  case Match_NamedImm_tlbi:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected translation buffer invalidation operand");
+  case Match_UImm16:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 65535]");
+  case Match_UImm3:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 7]");
+  case Match_UImm4:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 15]");
+  case Match_UImm5:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 31]");
+  case Match_UImm6:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 63]");
+  case Match_UImm7:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 127]");
+  case Match_Width32:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [<lsb>, 31]");
+  case Match_Width64:
+    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [<lsb>, 63]");
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+  return true;
+}
+
+void AArch64Operand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_CondCode:
+    OS << "<CondCode: " << CondCode.Code << ">";
+    break;
+  case k_FPImmediate:
+    OS << "<fpimm: " << FPImm.Val << ">";
+    break;
+  case k_ImmWithLSL:
+    OS << "<immwithlsl: imm=" << ImmWithLSL.Val
+       << ", shift=" << ImmWithLSL.ShiftAmount << ">";
+    break;
+  case k_Immediate:
+    getImm()->print(OS);
+    break;
+  case k_Register:
+    OS << "<register " << getReg() << '>';
+    break;
+  case k_Token:
+    OS << '\'' << getToken() << '\'';
+    break;
+  case k_ShiftExtend:
+    OS << "<shift: type=" << ShiftExtend.ShiftType
+       << ", amount=" << ShiftExtend.Amount << ">";
+    break;
+  case k_SysReg: {
+    StringRef Name(SysReg.Data, SysReg.Length);
+    OS << "<sysreg: " << Name << '>';
+    break;
+  }
+  default:
+    llvm_unreachable("No idea how to print this kind of operand");
+    break;
+  }
+}
+
+void AArch64Operand::dump() const {
+  print(errs());
+}
+
+
+/// Force static initialization.
+extern "C" void LLVMInitializeAArch64AsmParser() {
+  RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64Target);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "AArch64GenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
new file mode 100644
index 000000000000..12c1b8f4c81a
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -0,0 +1,803 @@
+//===- AArch64Disassembler.cpp - Disassembler for AArch64 ISA -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions necessary to decode AArch64 instruction
+// bitpatterns into MCInsts (with the help of TableGenerated information from
+// the instruction definitions).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-disassembler"
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+/// AArch64 disassembler for all AArch64 platforms.
+class AArch64Disassembler : public MCDisassembler {
+  const MCRegisterInfo *RegInfo;
+public:
+  /// Initializes the disassembler.
+  ///
+  AArch64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info)
+    : MCDisassembler(STI), RegInfo(Info) {
+  }
+
+  ~AArch64Disassembler() {
+  }
+
+  /// See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const;
+
+  const MCRegisterInfo *getRegInfo() const { return RegInfo; }
+};
+
+}
+
+// Forward-declarations used in the auto-generated files.
+static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus
+DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus
+DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
+                                              unsigned RegNo, uint64_t Address,
+                                              const void *Decoder);
+static DecodeStatus DecodeVPR128RegisterClass(llvm::MCInst &Inst,
+                                              unsigned RegNo, uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
+                                               unsigned OptionHiS,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+
+static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst,
+                                               unsigned Imm6Bits,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst,
+                                               unsigned Imm6Bits,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
+                                        unsigned RmBits,
+                                        uint64_t Address,
+                                        const void *Decoder);
+
+template<int RegWidth>
+static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
+                                             unsigned FullImm,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+template<int RegWidth>
+static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst,
+                                            unsigned Bits,
+                                            uint64_t Address,
+                                            const void *Decoder);
+
+static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
+                                           unsigned ShiftAmount,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
+                                            unsigned ShiftAmount,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst,
+                                                       unsigned Val,
+                                                       uint64_t Address,
+                                                       const void *Decoder);
+
+template<typename SomeNamedImmMapper>
+static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst,
+                                          unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus
+DecodeSysRegOperand(const A64SysReg::SysRegMapper &InstMapper,
+                    llvm::MCInst &Inst, unsigned Val,
+                    uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst,
+                                     unsigned Val,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst,
+                                     unsigned Val,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+
+static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
+                                                   unsigned Val,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+
+
+static bool Check(DecodeStatus &Out, DecodeStatus In);
+
+#include "AArch64GenDisassemblerTables.inc"
+#include "AArch64GenInstrInfo.inc"
+
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+  switch (In) {
+    case MCDisassembler::Success:
+      // Out stays the same.
+      return true;
+    case MCDisassembler::SoftFail:
+      Out = In;
+      return true;
+    case MCDisassembler::Fail:
+      Out = In;
+      return false;
+  }
+  llvm_unreachable("Invalid DecodeStatus!");
+}
+
+DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 const MemoryObject &Region,
+                                                 uint64_t Address,
+                                                 raw_ostream &os,
+                                                 raw_ostream &cs) const {
+  CommentStream = &cs;
+
+  uint8_t bytes[4];
+
+  // We want to read exactly 4 bytes of data.
+  if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as a small-endian 32-bit word in the stream.
+  uint32_t insn = (bytes[3] << 24) |
+    (bytes[2] << 16) |
+    (bytes[1] <<  8) |
+    (bytes[0] <<  0);
+
+  // Calling the auto-generated decoder function.
+  DecodeStatus result = decodeInstruction(DecoderTableA6432, MI, insn, Address,
+                                          this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const AArch64Disassembler *Dis = static_cast<const AArch64Disassembler*>(D);
+  return Dis->getRegInfo()->getRegClass(RC).getRegister(RegNo);
+}
+
+static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                        uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::GPR64RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::GPR64xspRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::GPR32RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::GPR32wspRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::FPR8RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::FPR16RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus
+DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::FPR32RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::FPR64RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus
+DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::FPR128RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                         uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::VPR128RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
+                                               unsigned OptionHiS,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  // Option{1} must be 1. OptionHiS is made up of {Option{2}, Option{1},
+  // S}. Hence we want to check bit 1.
+  if (!(OptionHiS & 2))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(OptionHiS));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst,
+                                               unsigned Imm6Bits,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  // In the 32-bit variant, bit 6 must be zero. I.e. the immediate must be
+  // between 0 and 31.
+  if (Imm6Bits > 31)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Imm6Bits));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst,
+                                               unsigned Imm6Bits,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  // 1 <= Imm <= 32. Encoded as 64 - Imm so: 63 >= Encoded >= 32.
+  if (Imm6Bits < 32)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Imm6Bits));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
+                                        unsigned RmBits,
+                                        uint64_t Address,
+                                        const void *Decoder) {
+  // Any bits are valid in the instruction (they're architecturally ignored),
+  // but a code generator should insert 0.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  return MCDisassembler::Success;
+}
+
+
+
+template<int RegWidth>
+static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
+                                             unsigned FullImm,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  unsigned Imm16 = FullImm & 0xffff;
+  unsigned Shift = FullImm >> 16;
+
+  if (RegWidth == 32 && Shift > 1) return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Imm16));
+  Inst.addOperand(MCOperand::CreateImm(Shift));
+  return MCDisassembler::Success;
+}
+
+template<int RegWidth>
+static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst,
+                                            unsigned Bits,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  uint64_t Imm;
+  if (!A64Imms::isLogicalImmBits(RegWidth, Bits, Imm))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Bits));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
+                                           unsigned ShiftAmount,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // Only values 0-4 are valid for this 3-bit field
+  if (ShiftAmount > 4)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
+                                            unsigned ShiftAmount,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  // Only values below 32 are valid for a 32-bit register
+  if (ShiftAmount > 31)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned ImmS = fieldFromInstruction(Insn, 10, 6);
+  unsigned ImmR = fieldFromInstruction(Insn, 16, 6);
+  unsigned SF = fieldFromInstruction(Insn, 31, 1);
+
+  // Undef for 0b11 just in case it occurs. Don't want the compiler to optimise
+  // out assertions that it thinks should never be hit.
+  enum OpcTypes { SBFM = 0, BFM, UBFM, Undef } Opc;
+  Opc = (OpcTypes)fieldFromInstruction(Insn, 29, 2);
+
+  if (!SF) {
+    // ImmR and ImmS must be between 0 and 31 for 32-bit instructions.
+    if (ImmR > 31 || ImmS > 31)
+      return MCDisassembler::Fail;
+  }
+
+  if (SF) {
+    DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
+    // BFM MCInsts use Rd as a source too.
+    if (Opc == BFM) DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
+  } else {
+    DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder);
+    // BFM MCInsts use Rd as a source too.
+    if (Opc == BFM) DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rn, Address, Decoder);
+  }
+
+  // ASR and LSR have more specific patterns so they won't get here:
+  assert(!(ImmS == 31 && !SF && Opc != BFM)
+         && "shift should have used auto decode");
+  assert(!(ImmS == 63 && SF && Opc != BFM)
+         && "shift should have used auto decode");
+
+  // Extension instructions similarly:
+  if (Opc == SBFM && ImmR == 0) {
+    assert((ImmS != 7 && ImmS != 15) && "extension got here");
+    assert((ImmS != 31 || SF == 0) && "extension got here");
+  } else if (Opc == UBFM && ImmR == 0) {
+    assert((SF != 0 || (ImmS != 7 && ImmS != 15)) && "extension got here");
+  }
+
+  if (Opc == UBFM) {
+    // It might be a LSL instruction, which actually takes the shift amount
+    // itself as an MCInst operand.
+    if (SF && (ImmS + 1) % 64 == ImmR) {
+      Inst.setOpcode(AArch64::LSLxxi);
+      Inst.addOperand(MCOperand::CreateImm(63 - ImmS));
+      return MCDisassembler::Success;
+    } else if (!SF && (ImmS + 1) % 32 == ImmR) {
+      Inst.setOpcode(AArch64::LSLwwi);
+      Inst.addOperand(MCOperand::CreateImm(31 - ImmS));
+      return MCDisassembler::Success;
+    }
+  }
+
+  // Otherwise it's definitely either an extract or an insert depending on which
+  // of ImmR or ImmS is larger.
+  unsigned ExtractOp, InsertOp;
+  switch (Opc) {
+  default: llvm_unreachable("unexpected instruction trying to decode bitfield");
+  case SBFM:
+    ExtractOp = SF ? AArch64::SBFXxxii : AArch64::SBFXwwii;
+    InsertOp = SF ? AArch64::SBFIZxxii : AArch64::SBFIZwwii;
+    break;
+  case BFM:
+    ExtractOp = SF ? AArch64::BFXILxxii : AArch64::BFXILwwii;
+    InsertOp = SF ? AArch64::BFIxxii : AArch64::BFIwwii;
+    break;
+  case UBFM:
+    ExtractOp = SF ? AArch64::UBFXxxii : AArch64::UBFXwwii;
+    InsertOp = SF ? AArch64::UBFIZxxii : AArch64::UBFIZwwii;
+    break;
+  }
+
+  // Otherwise it's a boring insert or extract
+  Inst.addOperand(MCOperand::CreateImm(ImmR));
+  Inst.addOperand(MCOperand::CreateImm(ImmS));
+
+
+  if (ImmS < ImmR)
+    Inst.setOpcode(InsertOp);
+  else
+    Inst.setOpcode(ExtractOp);
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  // This decoder exists to add the dummy Lane operand to the MCInst, which must
+  // be 1 in assembly but has no other real manifestation.
+  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned IsToVec = fieldFromInstruction(Insn, 16, 1);
+
+  if (IsToVec) {
+    DecodeVPR128RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
+  } else {
+    DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeVPR128RegisterClass(Inst, Rn, Address, Decoder);
+  }
+
+  // Add the lane
+  Inst.addOperand(MCOperand::CreateImm(1));
+
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  DecodeStatus Result = MCDisassembler::Success;
+  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned Rt2 = fieldFromInstruction(Insn, 10, 5);
+  unsigned SImm7 = fieldFromInstruction(Insn, 15, 7);
+  unsigned L = fieldFromInstruction(Insn, 22, 1);
+  unsigned V = fieldFromInstruction(Insn, 26, 1);
+  unsigned Opc = fieldFromInstruction(Insn, 30, 2);
+
+  // Not an official name, but it turns out that bit 23 distinguishes indexed
+  // from non-indexed operations.
+  unsigned Indexed = fieldFromInstruction(Insn, 23, 1);
+
+  if (Indexed && L == 0) {
+    // The MCInst for an indexed store has an out operand and 4 ins:
+    //    Rn_wb, Rt, Rt2, Rn, Imm
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  }
+
+  // You shouldn't load to the same register twice in an instruction...
+  if (L && Rt == Rt2)
+    Result = MCDisassembler::SoftFail;
+
+  // ... or do any operation that writes-back to a transfer register. But note
+  // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different.
+  if (Indexed && V == 0 && Rn != 31 && (Rt == Rn || Rt2 == Rn))
+    Result = MCDisassembler::SoftFail;
+
+  // Exactly how we decode the MCInst's registers depends on the Opc and V
+  // fields of the instruction. These also obviously determine the size of the
+  // operation so we can fill in that information while we're at it.
+  if (V) {
+    // The instruction operates on the FP/SIMD registers
+    switch (Opc) {
+    default: return MCDisassembler::Fail;
+    case 0:
+      DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder);
+      DecodeFPR32RegisterClass(Inst, Rt2, Address, Decoder);
+      break;
+    case 1:
+      DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
+      DecodeFPR64RegisterClass(Inst, Rt2, Address, Decoder);
+      break;
+    case 2:
+      DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+      DecodeFPR128RegisterClass(Inst, Rt2, Address, Decoder);
+      break;
+    }
+  } else {
+    switch (Opc) {
+    default: return MCDisassembler::Fail;
+    case 0:
+      DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder);
+      DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder);
+      break;
+    case 1:
+      assert(L && "unexpected \"store signed\" attempt");
+      DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
+      DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder);
+      break;
+    case 2:
+      DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
+      DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder);
+      break;
+    }
+  }
+
+  if (Indexed && L == 1) {
+    // The MCInst for an indexed load has 3 out operands and an 3 ins:
+    //    Rt, Rt2, Rn_wb, Rt2, Rn, Imm
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  }
+
+
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(SImm7));
+
+  return Result;
+}
+
+static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst,
+                                                       uint32_t Val,
+                                                       uint64_t Address,
+                                                       const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(Val, 0, 5);
+  unsigned Rn = fieldFromInstruction(Val, 5, 5);
+  unsigned Rt2 = fieldFromInstruction(Val, 10, 5);
+  unsigned MemSize = fieldFromInstruction(Val, 30, 2);
+
+  DecodeStatus S = MCDisassembler::Success;
+  if (Rt == Rt2) S = MCDisassembler::SoftFail;
+
+  switch (MemSize) {
+    case 2:
+      if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder)))
+        return MCDisassembler::Fail;
+      if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case 3:
+      if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder)))
+        return MCDisassembler::Fail;
+      if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      llvm_unreachable("Invalid MemSize in DecodeLoadPairExclusiveInstruction");
+  }
+
+  if (!Check(S, DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+template<typename SomeNamedImmMapper>
+static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst,
+                                          unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  SomeNamedImmMapper Mapper;
+  bool ValidNamed;
+  Mapper.toString(Val, ValidNamed);
+  if (ValidNamed || Mapper.validImm(Val)) {
+    Inst.addOperand(MCOperand::CreateImm(Val));
+    return MCDisassembler::Success;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus DecodeSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
+                                        llvm::MCInst &Inst,
+                                        unsigned Val,
+                                        uint64_t Address,
+                                        const void *Decoder) {
+  bool ValidNamed;
+  Mapper.toString(Val, ValidNamed);
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+
+  return ValidNamed ? MCDisassembler::Success : MCDisassembler::Fail;
+}
+
+static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst,
+                                     unsigned Val,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  return DecodeSysRegOperand(A64SysReg::MRSMapper(), Inst, Val, Address,
+                             Decoder);
+}
+
+static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst,
+                                     unsigned Val,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  return DecodeSysRegOperand(A64SysReg::MSRMapper(), Inst, Val, Address,
+                             Decoder);
+}
+
+static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
+                                                   unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned Imm9 = fieldFromInstruction(Insn, 12, 9);
+
+  unsigned Opc = fieldFromInstruction(Insn, 22, 2);
+  unsigned V = fieldFromInstruction(Insn, 26, 1);
+  unsigned Size = fieldFromInstruction(Insn, 30, 2);
+
+  if (Opc == 0 || (V == 1 && Opc == 2)) {
+    // It's a store, the MCInst gets: Rn_wb, Rt, Rn, Imm
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  }
+
+  if (V == 0 && (Opc == 2 || Size == 3)) {
+    DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
+  } else if (V == 0) {
+    DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder);
+  } else if (V == 1 && (Opc & 2)) {
+    DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+  } else {
+    switch (Size) {
+    case 0:
+      DecodeFPR8RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 1:
+      DecodeFPR16RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    }
+  }
+
+  if (Opc != 0 && (V != 1 || Opc != 2)) {
+    // It's a load, the MCInst gets: Rt, Rn_wb, Rn, Imm
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  }
+
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+  Inst.addOperand(MCOperand::CreateImm(Imm9));
+
+  // N.b. The official documentation says undpredictable if Rt == Rn, but this
+  // takes place at the architectural rather than encoding level:
+  //
+  // "STR xzr, [sp], #4" is perfectly valid.
+  if (V == 0 && Rt == Rn && Rn != 31)
+    return MCDisassembler::SoftFail;
+  else
+    return MCDisassembler::Success;
+}
+
+static MCDisassembler *createAArch64Disassembler(const Target &T,
+                                                 const MCSubtargetInfo &STI) {
+  return new AArch64Disassembler(STI, T.createMCRegInfo(""));
+}
+
+extern "C" void LLVMInitializeAArch64Disassembler() {
+  TargetRegistry::RegisterMCDisassembler(TheAArch64Target,
+                                         createAArch64Disassembler);
+}
+
+
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
new file mode 100644
index 000000000000..82ce80c8b1a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -0,0 +1,408 @@
+//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AArch64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "AArch64GenAsmWriter.inc"
+
+static int64_t unpackSignedImm(int BitWidth, uint64_t Value) {
+  assert(!(Value & ~((1ULL << BitWidth)-1)) && "immediate not n-bit");
+  if (Value & (1ULL <<  (BitWidth - 1)))
+    return static_cast<int64_t>(Value) - (1LL << BitWidth);
+  else
+    return Value;
+}
+
+AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI,
+                                       const MCInstrInfo &MII,
+                                       const MCRegisterInfo &MRI,
+                                       const MCSubtargetInfo &STI) :
+  MCInstPrinter(MAI, MII, MRI) {
+  // Initialize the set of available features.
+  setAvailableFeatures(STI.getFeatureBits());
+}
+
+void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
+void
+AArch64InstPrinter::printOffsetSImm9Operand(const MCInst *MI,
+                                              unsigned OpNum, raw_ostream &O) {
+  const MCOperand &MOImm = MI->getOperand(OpNum);
+  int32_t Imm = unpackSignedImm(9, MOImm.getImm());
+
+  O << '#' << Imm;
+}
+
+void
+AArch64InstPrinter::printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O, unsigned MemSize,
+                                          unsigned RmSize) {
+  unsigned ExtImm = MI->getOperand(OpNum).getImm();
+  unsigned OptionHi = ExtImm >> 1;
+  unsigned S = ExtImm & 1;
+  bool IsLSL = OptionHi == 1 && RmSize == 64;
+
+  const char *Ext;
+  switch (OptionHi) {
+  case 1:
+    Ext = (RmSize == 32) ? "uxtw" : "lsl";
+    break;
+  case 3:
+    Ext = (RmSize == 32) ? "sxtw" : "sxtx";
+    break;
+  default:
+    llvm_unreachable("Incorrect Option on load/store (reg offset)");
+  }
+  O << Ext;
+
+  if (S) {
+    unsigned ShiftAmt = Log2_32(MemSize);
+    O << " #" << ShiftAmt;
+  } else if (IsLSL) {
+    O << " #0";
+  }
+}
+
+void
+AArch64InstPrinter::printAddSubImmLSL0Operand(const MCInst *MI,
+                                              unsigned OpNum, raw_ostream &O) {
+  const MCOperand &Imm12Op = MI->getOperand(OpNum);
+
+  if (Imm12Op.isImm()) {
+    int64_t Imm12 = Imm12Op.getImm();
+    assert(Imm12 >= 0 && "Invalid immediate for add/sub imm");
+    O << "#" << Imm12;
+  } else {
+    assert(Imm12Op.isExpr() && "Unexpected shift operand type");
+    O << "#" << *Imm12Op.getExpr();
+  }
+}
+
+void
+AArch64InstPrinter::printAddSubImmLSL12Operand(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &O) {
+
+  printAddSubImmLSL0Operand(MI, OpNum, O);
+
+  O << ", lsl #12";
+}
+
+void
+AArch64InstPrinter::printBareImmOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  O << MO.getImm();
+}
+
+template<unsigned RegWidth> void
+AArch64InstPrinter::printBFILSBOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &ImmROp = MI->getOperand(OpNum);
+  unsigned LSB = ImmROp.getImm() == 0 ? 0 : RegWidth - ImmROp.getImm();
+
+  O << '#' << LSB;
+}
+
+void AArch64InstPrinter::printBFIWidthOperand(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  const MCOperand &ImmSOp = MI->getOperand(OpNum);
+  unsigned Width = ImmSOp.getImm() + 1;
+
+  O << '#' << Width;
+}
+
+void
+AArch64InstPrinter::printBFXWidthOperand(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  const MCOperand &ImmSOp = MI->getOperand(OpNum);
+  const MCOperand &ImmROp = MI->getOperand(OpNum - 1);
+
+  unsigned ImmR = ImmROp.getImm();
+  unsigned ImmS = ImmSOp.getImm();
+
+  assert(ImmS >= ImmR && "Invalid ImmR, ImmS combination for bitfield extract");
+
+  O << '#' << (ImmS - ImmR + 1);
+}
+
+void
+AArch64InstPrinter::printCRxOperand(const MCInst *MI, unsigned OpNum,
+                                    raw_ostream &O) {
+    const MCOperand &CRx = MI->getOperand(OpNum);
+
+    O << 'c' << CRx.getImm();
+}
+
+
+void
+AArch64InstPrinter::printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+    const MCOperand &ScaleOp = MI->getOperand(OpNum);
+
+    O << '#' << (64 - ScaleOp.getImm());
+}
+
+
+void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &o) {
+  const MCOperand &MOImm8 = MI->getOperand(OpNum);
+
+  assert(MOImm8.isImm()
+         && "Immediate operand required for floating-point immediate inst");
+
+  uint32_t Imm8 = MOImm8.getImm();
+  uint32_t Fraction = Imm8 & 0xf;
+  uint32_t Exponent = (Imm8 >> 4) & 0x7;
+  uint32_t Negative = (Imm8 >> 7) & 0x1;
+
+  float Val = 1.0f + Fraction / 16.0f;
+
+  // That is:
+  // 000 -> 2^1,  001 -> 2^2,  010 -> 2^3,  011 -> 2^4,
+  // 100 -> 2^-3, 101 -> 2^-2, 110 -> 2^-1, 111 -> 2^0
+  if (Exponent & 0x4) {
+    Val /= 1 << (7 - Exponent);
+  } else {
+    Val *= 1 << (Exponent + 1);
+  }
+
+  Val = Negative ? -Val : Val;
+
+  o << '#' << format("%.8f", Val);
+}
+
+void AArch64InstPrinter::printFPZeroOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &o) {
+  o << "#0.0";
+}
+
+void
+AArch64InstPrinter::printCondCodeOperand(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  O << A64CondCodeToString(static_cast<A64CC::CondCodes>(MO.getImm()));
+}
+
+template <unsigned field_width, unsigned scale> void
+AArch64InstPrinter::printLabelOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  if (!MO.isImm()) {
+    printOperand(MI, OpNum, O);
+    return;
+  }
+
+  // The immediate of LDR (lit) instructions is a signed 19-bit immediate, which
+  // is multiplied by 4 (because all A64 instructions are 32-bits wide).
+  uint64_t UImm = MO.getImm();
+  uint64_t Sign = UImm & (1LL << (field_width - 1));
+  int64_t SImm = scale * ((UImm & ~Sign) - Sign);
+
+  O << "#" << SImm;
+}
+
+template<unsigned RegWidth> void
+AArch64InstPrinter::printLogicalImmOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  uint64_t Val;
+  A64Imms::isLogicalImmBits(RegWidth, MO.getImm(), Val);
+  O << "#0x";
+  O.write_hex(Val);
+}
+
+void
+AArch64InstPrinter::printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &O, int MemSize) {
+  const MCOperand &MOImm = MI->getOperand(OpNum);
+
+  if (MOImm.isImm()) {
+    uint32_t Imm = MOImm.getImm() * MemSize;
+
+    O << "#" << Imm;
+  } else {
+    O << "#" << *MOImm.getExpr();
+  }
+}
+
+void
+AArch64InstPrinter::printShiftOperand(const MCInst *MI,  unsigned OpNum,
+                                      raw_ostream &O,
+                                      A64SE::ShiftExtSpecifiers Shift) {
+    const MCOperand &MO = MI->getOperand(OpNum);
+
+    // LSL #0 is not printed
+    if (Shift == A64SE::LSL && MO.isImm() && MO.getImm() == 0)
+        return;
+
+    switch (Shift) {
+    case A64SE::LSL: O << "lsl"; break;
+    case A64SE::LSR: O << "lsr"; break;
+    case A64SE::ASR: O << "asr"; break;
+    case A64SE::ROR: O << "ror"; break;
+    default: llvm_unreachable("Invalid shift specifier in logical instruction");
+    }
+
+  O << " #" << MO.getImm();
+}
+
+void
+AArch64InstPrinter::printMoveWideImmOperand(const MCInst *MI,  unsigned OpNum,
+                                            raw_ostream &O) {
+  const MCOperand &UImm16MO = MI->getOperand(OpNum);
+  const MCOperand &ShiftMO = MI->getOperand(OpNum + 1);
+
+  if (UImm16MO.isImm()) {
+    O << '#' << UImm16MO.getImm();
+
+    if (ShiftMO.getImm() != 0)
+      O << ", lsl #" << (ShiftMO.getImm() * 16);
+
+    return;
+  }
+
+  O << "#" << *UImm16MO.getExpr();
+}
+
+void AArch64InstPrinter::printNamedImmOperand(const NamedImmMapper &Mapper,
+                                              const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  bool ValidName;
+  const MCOperand &MO = MI->getOperand(OpNum);
+  StringRef Name = Mapper.toString(MO.getImm(), ValidName);
+
+  if (ValidName)
+    O << Name;
+  else
+    O << '#' << MO.getImm();
+}
+
+void
+AArch64InstPrinter::printSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
+                                       const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  bool ValidName;
+  std::string Name = Mapper.toString(MO.getImm(), ValidName);
+  if (ValidName) {
+    O << Name;
+    return;
+  }
+}
+
+
+void AArch64InstPrinter::printRegExtendOperand(const MCInst *MI,
+                                               unsigned OpNum,
+                                               raw_ostream &O,
+                                               A64SE::ShiftExtSpecifiers Ext) {
+  // FIXME: In principle TableGen should be able to detect this itself far more
+  // easily. We will only accumulate more of these hacks.
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+
+  if (isStackReg(Reg0) || isStackReg(Reg1)) {
+    A64SE::ShiftExtSpecifiers LSLEquiv;
+
+    if (Reg0 == AArch64::XSP || Reg1 == AArch64::XSP)
+      LSLEquiv = A64SE::UXTX;
+    else
+      LSLEquiv = A64SE::UXTW;
+
+    if (Ext == LSLEquiv) {
+      O << "lsl #" << MI->getOperand(OpNum).getImm();
+      return;
+    }
+  }
+
+  switch (Ext) {
+  case A64SE::UXTB: O << "uxtb"; break;
+  case A64SE::UXTH: O << "uxth"; break;
+  case A64SE::UXTW: O << "uxtw"; break;
+  case A64SE::UXTX: O << "uxtx"; break;
+  case A64SE::SXTB: O << "sxtb"; break;
+  case A64SE::SXTH: O << "sxth"; break;
+  case A64SE::SXTW: O << "sxtw"; break;
+  case A64SE::SXTX: O << "sxtx"; break;
+  default: llvm_unreachable("Unexpected shift type for printing");
+  }
+
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.getImm() != 0)
+    O << " #" << MO.getImm();
+}
+
+template<int MemScale> void
+AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  const MCOperand &MOImm = MI->getOperand(OpNum);
+  int32_t Imm = unpackSignedImm(7, MOImm.getImm());
+
+  O << "#" << (Imm * MemScale);
+}
+
+void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    O << getRegisterName(Reg);
+  } else if (Op.isImm()) {
+    O << '#' << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    // If a symbolic branch target was added as a constant expression then print
+    // that address in hex.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+      O << "0x";
+      O.write_hex(Address);
+    }
+    else {
+      // Otherwise, just print the expression.
+      O << *Op.getExpr();
+    }
+  }
+}
+
+
+void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  if (MI->getOpcode() == AArch64::TLSDESCCALL) {
+    // This is a special assembler directive which applies an
+    // R_AARCH64_TLSDESC_CALL to the following (BLR) instruction. It has a fixed
+    // form outside the normal TableGenerated scheme.
+    O << "\t.tlsdesccall " << *MI->getOperand(0).getExpr();
+  } else if (!printAliasInstr(MI, O))
+    printInstruction(MI, O);
+
+  printAnnotation(O, Annot);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
new file mode 100644
index 000000000000..639fa869c016
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -0,0 +1,172 @@
+//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an AArch64 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64INSTPRINTER_H
+#define LLVM_AARCH64INSTPRINTER_H
+
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class AArch64InstPrinter : public MCInstPrinter {
+public:
+  AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+  // Autogenerated by tblgen
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  static const char *getInstructionName(unsigned Opcode);
+
+  void printRegName(raw_ostream &O, unsigned RegNum) const;
+
+  template<unsigned MemSize, unsigned RmSize>
+  void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum,
+                                 raw_ostream &O) {
+    printAddrRegExtendOperand(MI, OpNum, O, MemSize, RmSize);
+  }
+
+
+  void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum,
+                                 raw_ostream &O, unsigned MemSize,
+                                 unsigned RmSize);
+
+  void printAddSubImmLSL0Operand(const MCInst *MI,
+                                 unsigned OpNum, raw_ostream &O);
+  void printAddSubImmLSL12Operand(const MCInst *MI,
+                                  unsigned OpNum, raw_ostream &O);
+
+  void printBareImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  template<unsigned RegWidth>
+  void printBFILSBOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printBFIWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printBFXWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+
+  void printCondCodeOperand(const MCInst *MI, unsigned OpNum,
+                            raw_ostream &O);
+
+  void printCRxOperand(const MCInst *MI, unsigned OpNum,
+                       raw_ostream &O);
+
+  void printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o);
+
+  void printFPZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o);
+
+  template<int MemScale>
+  void printOffsetUImm12Operand(const MCInst *MI,
+                                  unsigned OpNum, raw_ostream &o) {
+    printOffsetUImm12Operand(MI, OpNum, o, MemScale);
+  }
+
+  void printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &o, int MemScale);
+
+  template<unsigned field_width, unsigned scale>
+  void printLabelOperand(const MCInst *MI, unsigned OpNum,
+                         raw_ostream &O);
+
+  template<unsigned RegWidth>
+  void printLogicalImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  template<typename SomeNamedImmMapper>
+  void printNamedImmOperand(const MCInst *MI, unsigned OpNum,
+                            raw_ostream &O) {
+    printNamedImmOperand(SomeNamedImmMapper(), MI, OpNum, O);
+  }
+
+  void printNamedImmOperand(const NamedImmMapper &Mapper,
+                            const MCInst *MI, unsigned OpNum,
+                            raw_ostream &O);
+
+  void printSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
+                          const MCInst *MI, unsigned OpNum,
+                          raw_ostream &O);
+
+  void printMRSOperand(const MCInst *MI, unsigned OpNum,
+                       raw_ostream &O) {
+    printSysRegOperand(A64SysReg::MRSMapper(), MI, OpNum, O);
+  }
+
+  void printMSROperand(const MCInst *MI, unsigned OpNum,
+                       raw_ostream &O) {
+    printSysRegOperand(A64SysReg::MSRMapper(), MI, OpNum, O);
+  }
+
+  void printShiftOperand(const char *name, const MCInst *MI,
+                         unsigned OpIdx, raw_ostream &O);
+
+  void printLSLOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printLSROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printShiftOperand("lsr", MI, OpNum, O);
+  }
+  void printASROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printShiftOperand("asr", MI, OpNum, O);
+  }
+  void printROROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printShiftOperand("ror", MI, OpNum, O);
+  }
+
+  template<A64SE::ShiftExtSpecifiers Shift>
+  void printShiftOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printShiftOperand(MI, OpNum, O, Shift);
+  }
+
+  void printShiftOperand(const MCInst *MI, unsigned OpNum,
+                         raw_ostream &O, A64SE::ShiftExtSpecifiers Sh);
+
+
+  void printMoveWideImmOperand(const  MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+
+  template<int MemSize> void
+  printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printOffsetSImm9Operand(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+
+  void printPRFMOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  template<A64SE::ShiftExtSpecifiers EXT>
+  void printRegExtendOperand(const MCInst *MI, unsigned OpNum,
+                             raw_ostream &O) {
+    printRegExtendOperand(MI, OpNum, O, EXT);
+  }
+
+  void printRegExtendOperand(const MCInst *MI, unsigned OpNum,
+                             raw_ostream &O, A64SE::ShiftExtSpecifiers Ext);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+
+  bool isStackReg(unsigned RegNo) {
+    return RegNo == AArch64::XSP || RegNo == AArch64::WSP;
+  }
+
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
new file mode 100644
index 000000000000..a3373b1087bb
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -0,0 +1,585 @@
+//===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the MCAsmBackend class,
+// which is principally concerned with relaxation of the various fixup kinds.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class AArch64AsmBackend : public MCAsmBackend {
+  const MCSubtargetInfo* STI;
+public:
+  AArch64AsmBackend(const Target &T, const StringRef TT)
+    : MCAsmBackend(),
+      STI(AArch64_MC::createAArch64MCSubtargetInfo(TT, "", ""))
+    {}
+
+
+  ~AArch64AsmBackend() {
+    delete STI;
+  }
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+
+  virtual void processFixupValue(const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFixup &Fixup, const MCFragment *DF,
+                                 MCValue &Target, uint64_t &Value,
+                                 bool &IsResolved);
+};
+} // end anonymous namespace
+
+void AArch64AsmBackend::processFixupValue(const MCAssembler &Asm,
+                                          const MCAsmLayout &Layout,
+                                          const MCFixup &Fixup,
+                                          const MCFragment *DF,
+                                          MCValue &Target, uint64_t &Value,
+                                          bool &IsResolved) {
+  // The ADRP instruction adds some multiple of 0x1000 to the current PC &
+  // ~0xfff. This means that the required offset to reach a symbol can vary by
+  // up to one step depending on where the ADRP is in memory. For example:
+  //
+  //     ADRP x0, there
+  //  there:
+  //
+  // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and
+  // we'll need that as an offset. At any other address "there" will be in the
+  // same page as the ADRP and the instruction should encode 0x0. Assuming the
+  // section isn't 0x1000-aligned, we therefore need to delegate this decision
+  // to the linker -- a relocation!
+  if ((uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_page ||
+      (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_got_page ||
+      (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_gottprel_page ||
+      (uint32_t)Fixup.getKind() == AArch64::fixup_a64_tlsdesc_adr_page)
+    IsResolved = false;
+}
+
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value);
+
+namespace {
+
+class ELFAArch64AsmBackend : public AArch64AsmBackend {
+public:
+  uint8_t OSABI;
+  ELFAArch64AsmBackend(const Target &T, const StringRef TT,
+                       uint8_t _OSABI)
+    : AArch64AsmBackend(T, TT), OSABI(_OSABI) { }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const;
+
+  unsigned int getNumFixupKinds() const {
+    return AArch64::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
+// This table *must* be in the order that the fixup_* kinds are defined in
+// AArch64FixupKinds.h.
+//
+// Name                   Offset (bits)    Size (bits)    Flags
+{ "fixup_a64_ld_prel",               0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_adr_prel",              0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_adr_prel_page",         0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_add_lo12",              0,    32,             0 },
+{ "fixup_a64_ldst8_lo12",            0,    32,             0 },
+{ "fixup_a64_ldst16_lo12",           0,    32,             0 },
+{ "fixup_a64_ldst32_lo12",           0,    32,             0 },
+{ "fixup_a64_ldst64_lo12",           0,    32,             0 },
+{ "fixup_a64_ldst128_lo12",          0,    32,             0 },
+{ "fixup_a64_tstbr",                 0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_condbr",                0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_uncondbr",              0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_call",                  0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_movw_uabs_g0",          0,    32,             0 },
+{ "fixup_a64_movw_uabs_g0_nc",       0,    32,             0 },
+{ "fixup_a64_movw_uabs_g1",          0,    32,             0 },
+{ "fixup_a64_movw_uabs_g1_nc",       0,    32,             0 },
+{ "fixup_a64_movw_uabs_g2",          0,    32,             0 },
+{ "fixup_a64_movw_uabs_g2_nc",       0,    32,             0 },
+{ "fixup_a64_movw_uabs_g3",          0,    32,             0 },
+{ "fixup_a64_movw_sabs_g0",          0,    32,             0 },
+{ "fixup_a64_movw_sabs_g1",          0,    32,             0 },
+{ "fixup_a64_movw_sabs_g2",          0,    32,             0 },
+{ "fixup_a64_adr_prel_got_page",     0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_ld64_got_lo12_nc",      0,    32,             0 },
+{ "fixup_a64_movw_dtprel_g2",        0,    32,             0 },
+{ "fixup_a64_movw_dtprel_g1",        0,    32,             0 },
+{ "fixup_a64_movw_dtprel_g1_nc",     0,    32,             0 },
+{ "fixup_a64_movw_dtprel_g0",        0,    32,             0 },
+{ "fixup_a64_movw_dtprel_g0_nc",     0,    32,             0 },
+{ "fixup_a64_add_dtprel_hi12",       0,    32,             0 },
+{ "fixup_a64_add_dtprel_lo12",       0,    32,             0 },
+{ "fixup_a64_add_dtprel_lo12_nc",    0,    32,             0 },
+{ "fixup_a64_ldst8_dtprel_lo12",     0,    32,             0 },
+{ "fixup_a64_ldst8_dtprel_lo12_nc",  0,    32,             0 },
+{ "fixup_a64_ldst16_dtprel_lo12",    0,    32,             0 },
+{ "fixup_a64_ldst16_dtprel_lo12_nc", 0,    32,             0 },
+{ "fixup_a64_ldst32_dtprel_lo12",    0,    32,             0 },
+{ "fixup_a64_ldst32_dtprel_lo12_nc", 0,    32,             0 },
+{ "fixup_a64_ldst64_dtprel_lo12",    0,    32,             0 },
+{ "fixup_a64_ldst64_dtprel_lo12_nc", 0,    32,             0 },
+{ "fixup_a64_movw_gottprel_g1",      0,    32,             0 },
+{ "fixup_a64_movw_gottprel_g0_nc",   0,    32,             0 },
+{ "fixup_a64_adr_gottprel_page",     0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_ld64_gottprel_lo12_nc", 0,    32,             0 },
+{ "fixup_a64_ld_gottprel_prel19",    0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_movw_tprel_g2",         0,    32,             0 },
+{ "fixup_a64_movw_tprel_g1",         0,    32,             0 },
+{ "fixup_a64_movw_tprel_g1_nc",      0,    32,             0 },
+{ "fixup_a64_movw_tprel_g0",         0,    32,             0 },
+{ "fixup_a64_movw_tprel_g0_nc",      0,    32,             0 },
+{ "fixup_a64_add_tprel_hi12",        0,    32,             0 },
+{ "fixup_a64_add_tprel_lo12",        0,    32,             0 },
+{ "fixup_a64_add_tprel_lo12_nc",     0,    32,             0 },
+{ "fixup_a64_ldst8_tprel_lo12",      0,    32,             0 },
+{ "fixup_a64_ldst8_tprel_lo12_nc",   0,    32,             0 },
+{ "fixup_a64_ldst16_tprel_lo12",     0,    32,             0 },
+{ "fixup_a64_ldst16_tprel_lo12_nc",  0,    32,             0 },
+{ "fixup_a64_ldst32_tprel_lo12",     0,    32,             0 },
+{ "fixup_a64_ldst32_tprel_lo12_nc",  0,    32,             0 },
+{ "fixup_a64_ldst64_tprel_lo12",     0,    32,             0 },
+{ "fixup_a64_ldst64_tprel_lo12_nc",  0,    32,             0 },
+{ "fixup_a64_tlsdesc_adr_page",      0,    32, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_a64_tlsdesc_ld64_lo12_nc",  0,    32,             0 },
+{ "fixup_a64_tlsdesc_add_lo12_nc",   0,    32,             0 },
+{ "fixup_a64_tlsdesc_call",          0,     0,             0 }
+    };
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const {
+    unsigned NumBytes = getFixupKindInfo(Fixup.getKind()).TargetSize / 8;
+    Value = adjustFixupValue(Fixup.getKind(), Value);
+    if (!Value) return;           // Doesn't change encoding.
+
+    unsigned Offset = Fixup.getOffset();
+    assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+    // For each byte of the fragment that the fixup touches, mask in the bits
+    // from the fixup value.
+    for (unsigned i = 0; i != NumBytes; ++i) {
+      Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+    }
+  }
+
+  bool mayNeedRelaxation(const MCInst&) const {
+    return false;
+  }
+
+  void relaxInstruction(const MCInst&, llvm::MCInst&) const {
+    llvm_unreachable("Cannot relax instructions");
+  }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createAArch64ELFObjectWriter(OS, OSABI);
+  }
+};
+
+} // end anonymous namespace
+
+bool
+ELFAArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                           uint64_t Value,
+                                           const MCRelaxableFragment *DF,
+                                           const MCAsmLayout &Layout) const {
+  // Correct for now. With all instructions 32-bit only very low-level
+  // considerations could make you select something which may fail.
+  return false;
+}
+
+
+bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  // Can't emit NOP with size not multiple of 32-bits
+  if (Count % 4 != 0)
+    return false;
+
+  uint64_t NumNops = Count / 4;
+  for (uint64_t i = 0; i != NumNops; ++i)
+    OW->Write32(0xd503201f);
+
+  return true;
+}
+
+static unsigned ADRImmBits(unsigned Value) {
+  unsigned lo2 = Value & 0x3;
+  unsigned hi19 = (Value & 0x1fffff) >> 2;
+
+  return (hi19 << 5) | (lo2 << 29);
+}
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_2:
+    assert((int64_t)Value >= -32768 &&
+           (int64_t)Value <= 65536 &&
+           "Out of range ABS16 fixup");
+    return Value;
+  case FK_Data_4:
+    assert((int64_t)Value >= -(1LL << 31) &&
+           (int64_t)Value <= (1LL << 32) - 1 &&
+           "Out of range ABS32 fixup");
+    return Value;
+  case FK_Data_8:
+    return Value;
+
+  case AArch64::fixup_a64_ld_gottprel_prel19:
+    // R_AARCH64_LD_GOTTPREL_PREL19: Set a load-literal immediate to bits 1F
+    // FFFC of G(TPREL(S+A)) - P; check -2^20 <= X < 2^20.
+  case AArch64::fixup_a64_ld_prel:
+    // R_AARCH64_LD_PREL_LO19: Sets a load-literal (immediate) value to bits
+    // 1F FFFC of S+A-P, checking that -2^20 <= S+A-P < 2^20.
+    assert((int64_t)Value >= -(1LL << 20) &&
+           (int64_t)Value < (1LL << 20) && "Out of range LDR (lit) fixup");
+    return (Value & 0x1ffffc) << 3;
+
+  case AArch64::fixup_a64_adr_prel:
+    // R_AARCH64_ADR_PREL_LO21: Sets an ADR immediate value to bits 1F FFFF of
+    // the result of S+A-P, checking that -2^20 <= S+A-P < 2^20.
+    assert((int64_t)Value >= -(1LL << 20) &&
+           (int64_t)Value < (1LL << 20) && "Out of range ADR fixup");
+    return ADRImmBits(Value & 0x1fffff);
+
+  case AArch64::fixup_a64_adr_prel_page:
+    // R_AARCH64_ADR_PREL_PG_HI21: Sets an ADRP immediate value to bits 1 FFFF
+    // F000 of the result of the operation, checking that -2^32 <= result <
+    // 2^32.
+    assert((int64_t)Value >= -(1LL << 32) &&
+           (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup");
+    return ADRImmBits((Value & 0x1fffff000ULL) >> 12);
+
+  case AArch64::fixup_a64_add_dtprel_hi12:
+    // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits
+    // FF F000 of DTPREL(S+A), check 0 <= X < 2^24.
+  case AArch64::fixup_a64_add_tprel_hi12:
+    // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits
+    // FF F000 of TPREL(S+A), check 0 <= X < 2^24.
+    assert((int64_t)Value >= 0 &&
+           (int64_t)Value < (1LL << 24) && "Out of range ADD fixup");
+    return (Value & 0xfff000) >> 2;
+
+  case AArch64::fixup_a64_add_dtprel_lo12:
+    // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits
+    // FFF of DTPREL(S+A), check 0 <= X < 2^12.
+  case AArch64::fixup_a64_add_tprel_lo12:
+    // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits
+    // FFF of TPREL(S+A), check 0 <= X < 2^12.
+    assert((int64_t)Value >= 0 &&
+           (int64_t)Value < (1LL << 12) && "Out of range ADD fixup");
+    // ... fallthrough to no-checking versions ...
+  case AArch64::fixup_a64_add_dtprel_lo12_nc:
+    // R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC: Set an ADD immediate field to bits
+    // FFF of DTPREL(S+A) with no overflow check.
+  case AArch64::fixup_a64_add_tprel_lo12_nc:
+    // R_AARCH64_TLSLD_ADD_TPREL_LO12_NC: Set an ADD immediate field to bits
+    // FFF of TPREL(S+A) with no overflow check.
+  case AArch64::fixup_a64_tlsdesc_add_lo12_nc:
+    // R_AARCH64_TLSDESC_ADD_LO12_NC: Set an ADD immediate field to bits
+    // FFF of G(TLSDESC(S+A)), with no overflow check.
+  case AArch64::fixup_a64_add_lo12:
+    // R_AARCH64_ADD_ABS_LO12_NC: Sets an ADD immediate value to bits FFF of
+    // S+A, with no overflow check.
+    return (Value & 0xfff) << 10;
+
+  case AArch64::fixup_a64_ldst8_dtprel_lo12:
+    // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF
+    // of DTPREL(S+A), check 0 <= X < 2^12.
+  case AArch64::fixup_a64_ldst8_tprel_lo12:
+    // R_AARCH64_TLSLE_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF
+    // of DTPREL(S+A), check 0 <= X < 2^12.
+    assert((int64_t) Value >= 0 &&
+           (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
+    // ... fallthrough to no-checking versions ...
+  case AArch64::fixup_a64_ldst8_dtprel_lo12_nc:
+    // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF
+    // of DTPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_ldst8_tprel_lo12_nc:
+    // R_AARCH64_TLSLD_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF
+    // of TPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_ldst8_lo12:
+    // R_AARCH64_LDST8_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFF
+    // of S+A, with no overflow check.
+    return (Value & 0xfff) << 10;
+
+  case AArch64::fixup_a64_ldst16_dtprel_lo12:
+    // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE
+    // of DTPREL(S+A), check 0 <= X < 2^12.
+  case AArch64::fixup_a64_ldst16_tprel_lo12:
+    // R_AARCH64_TLSLE_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE
+    // of DTPREL(S+A), check 0 <= X < 2^12.
+    assert((int64_t) Value >= 0 &&
+           (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
+    // ... fallthrough to no-checking versions ...
+  case AArch64::fixup_a64_ldst16_dtprel_lo12_nc:
+    // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE
+    // of DTPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_ldst16_tprel_lo12_nc:
+    // R_AARCH64_TLSLD_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE
+    // of TPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_ldst16_lo12:
+    // R_AARCH64_LDST16_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFE
+    // of S+A, with no overflow check.
+    return (Value & 0xffe) << 9;
+
+  case AArch64::fixup_a64_ldst32_dtprel_lo12:
+    // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC
+    // of DTPREL(S+A), check 0 <= X < 2^12.
+  case AArch64::fixup_a64_ldst32_tprel_lo12:
+    // R_AARCH64_TLSLE_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC
+    // of DTPREL(S+A), check 0 <= X < 2^12.
+    assert((int64_t) Value >= 0 &&
+           (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
+    // ... fallthrough to no-checking versions ...
+  case AArch64::fixup_a64_ldst32_dtprel_lo12_nc:
+    // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC
+    // of DTPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_ldst32_tprel_lo12_nc:
+    // R_AARCH64_TLSLD_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC
+    // of TPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_ldst32_lo12:
+    // R_AARCH64_LDST32_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFC
+    // of S+A, with no overflow check.
+    return (Value & 0xffc) << 8;
+
+  case AArch64::fixup_a64_ldst64_dtprel_lo12:
+    // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8
+    // of DTPREL(S+A), check 0 <= X < 2^12.
+  case AArch64::fixup_a64_ldst64_tprel_lo12:
+    // R_AARCH64_TLSLE_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8
+    // of DTPREL(S+A), check 0 <= X < 2^12.
+    assert((int64_t) Value >= 0 &&
+           (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
+    // ... fallthrough to no-checking versions ...
+  case AArch64::fixup_a64_ldst64_dtprel_lo12_nc:
+    // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8
+    // of DTPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_ldst64_tprel_lo12_nc:
+    // R_AARCH64_TLSLD_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8
+    // of TPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_ldst64_lo12:
+    // R_AARCH64_LDST64_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF8
+    // of S+A, with no overflow check.
+    return (Value & 0xff8) << 7;
+
+  case AArch64::fixup_a64_ldst128_lo12:
+    // R_AARCH64_LDST128_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF0
+    // of S+A, with no overflow check.
+    return (Value & 0xff0) << 6;
+
+  case AArch64::fixup_a64_movw_uabs_g0:
+    // R_AARCH64_MOVW_UABS_G0: Sets a MOVZ immediate field to bits FFFF of S+A
+    // with a check that S+A < 2^16
+    assert(Value <= 0xffff && "Out of range move wide fixup");
+    return (Value & 0xffff) << 5;
+
+  case AArch64::fixup_a64_movw_dtprel_g0_nc:
+    // R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC: Sets a MOVK immediate field to bits
+    // FFFF of DTPREL(S+A) with no overflow check.
+  case AArch64::fixup_a64_movw_gottprel_g0_nc:
+    // R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC: Sets a MOVK immediate field to bits
+    // FFFF of G(TPREL(S+A)) - GOT with no overflow check.
+  case AArch64::fixup_a64_movw_tprel_g0_nc:
+    // R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: Sets a MOVK immediate field to bits
+    // FFFF of TPREL(S+A) with no overflow check.
+  case AArch64::fixup_a64_movw_uabs_g0_nc:
+    // R_AARCH64_MOVW_UABS_G0_NC: Sets a MOVK immediate field to bits FFFF of
+    // S+A with no overflow check.
+    return (Value & 0xffff) << 5;
+
+  case AArch64::fixup_a64_movw_uabs_g1:
+    // R_AARCH64_MOVW_UABS_G1: Sets a MOVZ immediate field to bits FFFF0000 of
+    // S+A with a check that S+A < 2^32
+    assert(Value <= 0xffffffffull && "Out of range move wide fixup");
+    return ((Value >> 16) & 0xffff) << 5;
+
+  case AArch64::fixup_a64_movw_dtprel_g1_nc:
+    // R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC: Set a MOVK immediate field
+    // to bits FFFF0000 of DTPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_movw_tprel_g1_nc:
+    // R_AARCH64_TLSLD_MOVW_TPREL_G1_NC: Set a MOVK immediate field
+    // to bits FFFF0000 of TPREL(S+A), with no overflow check.
+  case AArch64::fixup_a64_movw_uabs_g1_nc:
+    // R_AARCH64_MOVW_UABS_G1_NC: Sets a MOVK immediate field to bits
+    // FFFF0000 of S+A with no overflow check.
+    return ((Value >> 16) & 0xffff) << 5;
+
+  case AArch64::fixup_a64_movw_uabs_g2:
+    // R_AARCH64_MOVW_UABS_G2: Sets a MOVZ immediate field to bits FFFF 0000
+    // 0000 of S+A with a check that S+A < 2^48
+    assert(Value <= 0xffffffffffffull && "Out of range move wide fixup");
+    return ((Value >> 32) & 0xffff) << 5;
+
+  case AArch64::fixup_a64_movw_uabs_g2_nc:
+    // R_AARCH64_MOVW_UABS_G2: Sets a MOVK immediate field to bits FFFF 0000
+    // 0000 of S+A with no overflow check.
+    return ((Value >> 32) & 0xffff) << 5;
+
+  case AArch64::fixup_a64_movw_uabs_g3:
+    // R_AARCH64_MOVW_UABS_G3: Sets a MOVZ immediate field to bits FFFF 0000
+    // 0000 0000 of S+A (no overflow check needed)
+    return ((Value >> 48) & 0xffff) << 5;
+
+  case AArch64::fixup_a64_movw_dtprel_g0:
+    // R_AARCH64_TLSLD_MOVW_DTPREL_G0: Set a MOV[NZ] immediate field
+    // to bits FFFF of DTPREL(S+A).
+  case AArch64::fixup_a64_movw_tprel_g0:
+    // R_AARCH64_TLSLE_MOVW_TPREL_G0: Set a MOV[NZ] immediate field to
+    // bits FFFF of TPREL(S+A).
+  case AArch64::fixup_a64_movw_sabs_g0: {
+    // R_AARCH64_MOVW_SABS_G0: Sets MOV[NZ] immediate field using bits FFFF of
+    // S+A (see notes below); check -2^16 <= S+A < 2^16. (notes say that we
+    // should convert between MOVN and MOVZ to achieve our goals).
+    int64_t Signed = Value;
+    assert(Signed >= -(1LL << 16) && Signed < (1LL << 16)
+           && "Out of range move wide fixup");
+    if (Signed >= 0) {
+      Value = (Value & 0xffff) << 5;
+      // Bit 30 converts the MOVN encoding into a MOVZ
+      Value |= 1 << 30;
+    } else {
+      // MCCodeEmitter should have encoded a MOVN, which is fine.
+      Value = (~Value & 0xffff) << 5;
+    }
+    return Value;
+  }
+
+  case AArch64::fixup_a64_movw_dtprel_g1:
+    // R_AARCH64_TLSLD_MOVW_DTPREL_G1: Set a MOV[NZ] immediate field
+    // to bits FFFF0000 of DTPREL(S+A).
+  case AArch64::fixup_a64_movw_gottprel_g1:
+    // R_AARCH64_TLSIE_MOVW_GOTTPREL_G1: Set a MOV[NZ] immediate field
+    // to bits FFFF0000 of G(TPREL(S+A)) - GOT.
+  case AArch64::fixup_a64_movw_tprel_g1:
+    // R_AARCH64_TLSLE_MOVW_TPREL_G1: Set a MOV[NZ] immediate field to
+    // bits FFFF0000 of TPREL(S+A).
+  case AArch64::fixup_a64_movw_sabs_g1: {
+    // R_AARCH64_MOVW_SABS_G1: Sets MOV[NZ] immediate field using bits FFFF 0000
+    // of S+A (see notes below); check -2^32 <= S+A < 2^32. (notes say that we
+    // should convert between MOVN and MOVZ to achieve our goals).
+    int64_t Signed = Value;
+    assert(Signed >= -(1LL << 32) && Signed < (1LL << 32)
+           && "Out of range move wide fixup");
+    if (Signed >= 0) {
+      Value = ((Value >> 16) & 0xffff) << 5;
+      // Bit 30 converts the MOVN encoding into a MOVZ
+      Value |= 1 << 30;
+    } else {
+      Value = ((~Value >> 16) & 0xffff) << 5;
+    }
+    return Value;
+  }
+
+  case AArch64::fixup_a64_movw_dtprel_g2:
+    // R_AARCH64_TLSLD_MOVW_DTPREL_G2: Set a MOV[NZ] immediate field
+    // to bits FFFF 0000 0000 of DTPREL(S+A).
+  case AArch64::fixup_a64_movw_tprel_g2:
+    // R_AARCH64_TLSLE_MOVW_TPREL_G2: Set a MOV[NZ] immediate field to
+    // bits FFFF 0000 0000 of TPREL(S+A).
+  case AArch64::fixup_a64_movw_sabs_g2: {
+    // R_AARCH64_MOVW_SABS_G2: Sets MOV[NZ] immediate field using bits FFFF 0000
+    // 0000 of S+A (see notes below); check -2^48 <= S+A < 2^48. (notes say that
+    // we should convert between MOVN and MOVZ to achieve our goals).
+    int64_t Signed = Value;
+    assert(Signed >= -(1LL << 48) && Signed < (1LL << 48)
+           && "Out of range move wide fixup");
+    if (Signed >= 0) {
+      Value = ((Value >> 32) & 0xffff) << 5;
+      // Bit 30 converts the MOVN encoding into a MOVZ
+      Value |= 1 << 30;
+    } else {
+      Value = ((~Value >> 32) & 0xffff) << 5;
+    }
+    return Value;
+  }
+
+  case AArch64::fixup_a64_tstbr:
+    // R_AARCH64_TSTBR14: Sets the immediate field of a TBZ/TBNZ instruction to
+    // bits FFFC of S+A-P, checking -2^15 <= S+A-P < 2^15.
+    assert((int64_t)Value >= -(1LL << 15) &&
+           (int64_t)Value < (1LL << 15) && "Out of range TBZ/TBNZ fixup");
+    return (Value & 0xfffc) << (5 - 2);
+
+  case AArch64::fixup_a64_condbr:
+    // R_AARCH64_CONDBR19: Sets the immediate field of a conditional branch
+    // instruction to bits 1FFFFC of S+A-P, checking -2^20 <= S+A-P < 2^20.
+    assert((int64_t)Value >= -(1LL << 20) &&
+           (int64_t)Value < (1LL << 20) && "Out of range B.cond fixup");
+    return (Value & 0x1ffffc) << (5 - 2);
+
+  case AArch64::fixup_a64_uncondbr:
+    // R_AARCH64_JUMP26 same as below (except to a linker, possibly).
+  case AArch64::fixup_a64_call:
+    // R_AARCH64_CALL26: Sets a CALL immediate field to bits FFFFFFC of S+A-P,
+    // checking that -2^27 <= S+A-P < 2^27.
+    assert((int64_t)Value >= -(1LL << 27) &&
+           (int64_t)Value < (1LL << 27) && "Out of range branch fixup");
+    return (Value & 0xffffffc) >> 2;
+
+  case AArch64::fixup_a64_adr_gottprel_page:
+    // R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: Set an ADRP immediate field to bits
+    // 1FFFFF000 of Page(G(TPREL(S+A))) - Page(P); check -2^32 <= X < 2^32.
+  case AArch64::fixup_a64_tlsdesc_adr_page:
+    // R_AARCH64_TLSDESC_ADR_PAGE: Set an ADRP immediate field to bits 1FFFFF000
+    // of Page(G(TLSDESC(S+A))) - Page(P); check -2^32 <= X < 2^32.
+  case AArch64::fixup_a64_adr_prel_got_page:
+    // R_AARCH64_ADR_GOT_PAGE: Sets the immediate value of an ADRP to bits
+    // 1FFFFF000 of the operation, checking that -2^32 < Page(G(S))-Page(GOT) <
+    // 2^32.
+    assert((int64_t)Value >= -(1LL << 32) &&
+           (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup");
+    return ADRImmBits((Value & 0x1fffff000ULL) >> 12);
+
+  case AArch64::fixup_a64_ld64_gottprel_lo12_nc:
+    // R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: Set an LD offset field to bits FF8
+    // of X, with no overflow check. Check that X & 7 == 0.
+  case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc:
+    // R_AARCH64_TLSDESC_LD64_LO12_NC: Set an LD offset field to bits FF8 of
+    // G(TLSDESC(S+A)), with no overflow check. Check that X & 7 == 0.
+  case AArch64::fixup_a64_ld64_got_lo12_nc:
+    // R_AARCH64_LD64_GOT_LO12_NC: Sets the LD/ST immediate field to bits FF8 of
+    // G(S) with no overflow check. Check X & 7 == 0
+    assert(((int64_t)Value & 7) == 0 && "Misaligned fixup");
+    return (Value & 0xff8) << 7;
+
+  case AArch64::fixup_a64_tlsdesc_call:
+    // R_AARCH64_TLSDESC_CALL: For relaxation only.
+    return 0;
+  }
+}
+
+MCAsmBackend *
+llvm::createAArch64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+  Triple TheTriple(TT);
+
+  return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS());
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
new file mode 100644
index 000000000000..4bcc65dfca27
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -0,0 +1,292 @@
+//===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file handles ELF-specific object emission, converting LLVM's internal
+// fixups into the appropriate relocations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class AArch64ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  AArch64ELFObjectWriter(uint8_t OSABI);
+
+  virtual ~AArch64ELFObjectWriter();
+
+protected:
+  virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                                bool IsPCRel, bool IsRelocWithSymbol,
+                                int64_t Addend) const;
+private:
+};
+}
+
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI)
+  : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
+                            /*HasRelocationAddend*/ true)
+{}
+
+AArch64ELFObjectWriter::~AArch64ELFObjectWriter()
+{}
+
+unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
+                                              const MCFixup &Fixup,
+                                              bool IsPCRel,
+                                              bool IsRelocWithSymbol,
+                                              int64_t Addend) const {
+  unsigned Type;
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    default:
+      llvm_unreachable("Unimplemented fixup -> relocation");
+    case FK_Data_8:
+      return ELF::R_AARCH64_PREL64;
+    case FK_Data_4:
+      return ELF::R_AARCH64_PREL32;
+    case FK_Data_2:
+      return ELF::R_AARCH64_PREL16;
+    case AArch64::fixup_a64_ld_prel:
+      Type = ELF::R_AARCH64_LD_PREL_LO19;
+      break;
+    case AArch64::fixup_a64_adr_prel:
+      Type = ELF::R_AARCH64_ADR_PREL_LO21;
+      break;
+    case AArch64::fixup_a64_adr_prel_page:
+      Type = ELF::R_AARCH64_ADR_PREL_PG_HI21;
+      break;
+    case AArch64::fixup_a64_adr_prel_got_page:
+      Type = ELF::R_AARCH64_ADR_GOT_PAGE;
+      break;
+    case AArch64::fixup_a64_tstbr:
+      Type = ELF::R_AARCH64_TSTBR14;
+      break;
+    case AArch64::fixup_a64_condbr:
+      Type = ELF::R_AARCH64_CONDBR19;
+      break;
+    case AArch64::fixup_a64_uncondbr:
+      Type = ELF::R_AARCH64_JUMP26;
+      break;
+    case AArch64::fixup_a64_call:
+      Type = ELF::R_AARCH64_CALL26;
+      break;
+    case AArch64::fixup_a64_adr_gottprel_page:
+      Type = ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
+      break;
+    case AArch64::fixup_a64_ld_gottprel_prel19:
+      Type =  ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
+      break;
+    case AArch64::fixup_a64_tlsdesc_adr_page:
+      Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE;
+      break;
+    }
+  } else {
+    switch ((unsigned)Fixup.getKind()) {
+    default:
+      llvm_unreachable("Unimplemented fixup -> relocation");
+    case FK_Data_8:
+      return ELF::R_AARCH64_ABS64;
+    case FK_Data_4:
+      return ELF::R_AARCH64_ABS32;
+    case FK_Data_2:
+      return ELF::R_AARCH64_ABS16;
+    case AArch64::fixup_a64_add_lo12:
+      Type = ELF::R_AARCH64_ADD_ABS_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ld64_got_lo12_nc:
+      Type = ELF::R_AARCH64_LD64_GOT_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst8_lo12:
+      Type = ELF::R_AARCH64_LDST8_ABS_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst16_lo12:
+      Type = ELF::R_AARCH64_LDST16_ABS_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst32_lo12:
+      Type = ELF::R_AARCH64_LDST32_ABS_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst64_lo12:
+      Type = ELF::R_AARCH64_LDST64_ABS_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst128_lo12:
+      Type = ELF::R_AARCH64_LDST128_ABS_LO12_NC;
+      break;
+    case AArch64::fixup_a64_movw_uabs_g0:
+      Type = ELF::R_AARCH64_MOVW_UABS_G0;
+      break;
+    case AArch64::fixup_a64_movw_uabs_g0_nc:
+      Type = ELF::R_AARCH64_MOVW_UABS_G0_NC;
+      break;
+    case AArch64::fixup_a64_movw_uabs_g1:
+      Type = ELF::R_AARCH64_MOVW_UABS_G1;
+      break;
+    case AArch64::fixup_a64_movw_uabs_g1_nc:
+      Type = ELF::R_AARCH64_MOVW_UABS_G1_NC;
+      break;
+    case AArch64::fixup_a64_movw_uabs_g2:
+      Type = ELF::R_AARCH64_MOVW_UABS_G2;
+      break;
+    case AArch64::fixup_a64_movw_uabs_g2_nc:
+      Type = ELF::R_AARCH64_MOVW_UABS_G2_NC;
+      break;
+    case AArch64::fixup_a64_movw_uabs_g3:
+      Type = ELF::R_AARCH64_MOVW_UABS_G3;
+      break;
+    case AArch64::fixup_a64_movw_sabs_g0:
+      Type = ELF::R_AARCH64_MOVW_SABS_G0;
+      break;
+    case AArch64::fixup_a64_movw_sabs_g1:
+      Type = ELF::R_AARCH64_MOVW_SABS_G1;
+      break;
+    case AArch64::fixup_a64_movw_sabs_g2:
+      Type = ELF::R_AARCH64_MOVW_SABS_G2;
+      break;
+
+    // TLS Local-dynamic block
+    case AArch64::fixup_a64_movw_dtprel_g2:
+      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
+      break;
+    case AArch64::fixup_a64_movw_dtprel_g1:
+      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1;
+      break;
+    case AArch64::fixup_a64_movw_dtprel_g1_nc:
+      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
+      break;
+    case AArch64::fixup_a64_movw_dtprel_g0:
+      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0;
+      break;
+    case AArch64::fixup_a64_movw_dtprel_g0_nc:
+      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC;
+      break;
+    case AArch64::fixup_a64_add_dtprel_hi12:
+      Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12;
+      break;
+    case AArch64::fixup_a64_add_dtprel_lo12:
+      Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
+      break;
+    case AArch64::fixup_a64_add_dtprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst8_dtprel_lo12:
+      Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12;
+      break;
+    case AArch64::fixup_a64_ldst8_dtprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst16_dtprel_lo12:
+      Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12;
+      break;
+    case AArch64::fixup_a64_ldst16_dtprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst32_dtprel_lo12:
+      Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12;
+      break;
+    case AArch64::fixup_a64_ldst32_dtprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst64_dtprel_lo12:
+      Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12;
+      break;
+    case AArch64::fixup_a64_ldst64_dtprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC;
+      break;
+
+    // TLS initial-exec block
+    case AArch64::fixup_a64_movw_gottprel_g1:
+      Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
+      break;
+    case AArch64::fixup_a64_movw_gottprel_g0_nc:
+      Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
+      break;
+    case AArch64::fixup_a64_ld64_gottprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
+      break;
+
+    // TLS local-exec block
+    case AArch64::fixup_a64_movw_tprel_g2:
+      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
+      break;
+    case AArch64::fixup_a64_movw_tprel_g1:
+      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1;
+      break;
+    case AArch64::fixup_a64_movw_tprel_g1_nc:
+      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
+      break;
+    case AArch64::fixup_a64_movw_tprel_g0:
+      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0;
+      break;
+    case AArch64::fixup_a64_movw_tprel_g0_nc:
+      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC;
+      break;
+    case AArch64::fixup_a64_add_tprel_hi12:
+      Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12;
+      break;
+    case AArch64::fixup_a64_add_tprel_lo12:
+      Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
+      break;
+    case AArch64::fixup_a64_add_tprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst8_tprel_lo12:
+      Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12;
+      break;
+    case AArch64::fixup_a64_ldst8_tprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst16_tprel_lo12:
+      Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12;
+      break;
+    case AArch64::fixup_a64_ldst16_tprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst32_tprel_lo12:
+      Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12;
+      break;
+    case AArch64::fixup_a64_ldst32_tprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
+      break;
+    case AArch64::fixup_a64_ldst64_tprel_lo12:
+      Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12;
+      break;
+    case AArch64::fixup_a64_ldst64_tprel_lo12_nc:
+      Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC;
+      break;
+
+    // TLS general-dynamic block
+    case AArch64::fixup_a64_tlsdesc_adr_page:
+      Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE;
+      break;
+    case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc:
+      Type = ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
+      break;
+    case AArch64::fixup_a64_tlsdesc_add_lo12_nc:
+      Type = ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
+      break;
+    case AArch64::fixup_a64_tlsdesc_call:
+      Type = ELF::R_AARCH64_TLSDESC_CALL;
+      break;
+    }
+  }
+
+  return Type;
+}
+
+MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS,
+                                                   uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new AArch64ELFObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
new file mode 100644
index 000000000000..b83577af45c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -0,0 +1,160 @@
+//===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits AArch64 ELF .o object files. Different
+// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit
+// regions of data and code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
+/// the appropriate points in the object files. These symbols are defined in the
+/// AArch64 ELF ABI:
+///    infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf
+///
+/// In brief: $x or $d should be emitted at the start of each contiguous region
+/// of A64 code or data in a section. In practice, this emission does not rely
+/// on explicit assembler directives but on inherent properties of the
+/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an
+/// instruction).
+///
+/// As a result this system is orthogonal to the DataRegion infrastructure used
+/// by MachO. Beware!
+class AArch64ELFStreamer : public MCELFStreamer {
+public:
+  AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                 raw_ostream &OS, MCCodeEmitter *Emitter)
+    : MCELFStreamer(Context, TAB, OS, Emitter),
+      MappingSymbolCounter(0), LastEMS(EMS_None) {
+  }
+
+  ~AArch64ELFStreamer() {}
+
+  virtual void ChangeSection(const MCSection *Section) {
+    // We have to keep track of the mapping symbol state of any sections we
+    // use. Each one should start off as EMS_None, which is provided as the
+    // default constructor by DenseMap::lookup.
+    LastMappingSymbols[getPreviousSection()] = LastEMS;
+    LastEMS = LastMappingSymbols.lookup(Section);
+
+    MCELFStreamer::ChangeSection(Section);
+  }
+
+  /// This function is the one used to emit instruction data into the ELF
+  /// streamer. We override it to add the appropriate mapping symbol if
+  /// necessary.
+  virtual void EmitInstruction(const MCInst& Inst) {
+    EmitA64MappingSymbol();
+    MCELFStreamer::EmitInstruction(Inst);
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
+  /// if necessary.
+  virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitBytes(Data, AddrSpace);
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
+  /// if necessary.
+  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                             unsigned AddrSpace) {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitValueImpl(Value, Size, AddrSpace);
+  }
+
+private:
+  enum ElfMappingSymbol {
+    EMS_None,
+    EMS_A64,
+    EMS_Data
+  };
+
+  void EmitDataMappingSymbol() {
+    if (LastEMS == EMS_Data) return;
+    EmitMappingSymbol("$d");
+    LastEMS = EMS_Data;
+  }
+
+  void EmitA64MappingSymbol() {
+    if (LastEMS == EMS_A64) return;
+    EmitMappingSymbol("$x");
+    LastEMS = EMS_A64;
+  }
+
+  void EmitMappingSymbol(StringRef Name) {
+    MCSymbol *Start = getContext().CreateTempSymbol();
+    EmitLabel(Start);
+
+    MCSymbol *Symbol =
+      getContext().GetOrCreateSymbol(Name + "." +
+                                     Twine(MappingSymbolCounter++));
+
+    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+    MCELF::SetType(SD, ELF::STT_NOTYPE);
+    MCELF::SetBinding(SD, ELF::STB_LOCAL);
+    SD.setExternal(false);
+    Symbol->setSection(*getCurrentSection());
+
+    const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
+    Symbol->setVariableValue(Value);
+  }
+
+  int64_t MappingSymbolCounter;
+
+  DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
+  ElfMappingSymbol LastEMS;
+
+  /// @}
+};
+}
+
+namespace llvm {
+  MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                      raw_ostream &OS, MCCodeEmitter *Emitter,
+                                      bool RelaxAll, bool NoExecStack) {
+    AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
+    if (RelaxAll)
+      S->getAssembler().setRelaxAll(true);
+    if (NoExecStack)
+      S->getAssembler().setNoExecStack(true);
+    return S;
+  }
+}
+
+
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
new file mode 100644
index 000000000000..5a89ca50cee8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -0,0 +1,27 @@
+//===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF streamer information for the AArch64 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64_ELF_STREAMER_H
+#define LLVM_AARCH64_ELF_STREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+  MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                          raw_ostream &OS,
+                                          MCCodeEmitter *Emitter,
+                                          bool RelaxAll, bool NoExecStack);
+}
+
+#endif // AArch64_ELF_STREAMER_H
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
new file mode 100644
index 000000000000..eeb122d38494
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -0,0 +1,113 @@
+//=- AArch64/AArch64FixupKinds.h - AArch64 Specific Fixup Entries -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the LLVM fixups applied to MCInsts in the AArch64
+// backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64_AARCH64FIXUPKINDS_H
+#define LLVM_AARCH64_AARCH64FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+  namespace AArch64 {
+    enum Fixups {
+      fixup_a64_ld_prel = FirstTargetFixupKind,
+      fixup_a64_adr_prel,
+      fixup_a64_adr_prel_page,
+
+      fixup_a64_add_lo12,
+
+      fixup_a64_ldst8_lo12,
+      fixup_a64_ldst16_lo12,
+      fixup_a64_ldst32_lo12,
+      fixup_a64_ldst64_lo12,
+      fixup_a64_ldst128_lo12,
+
+      fixup_a64_tstbr,
+      fixup_a64_condbr,
+      fixup_a64_uncondbr,
+      fixup_a64_call,
+
+      fixup_a64_movw_uabs_g0,
+      fixup_a64_movw_uabs_g0_nc,
+      fixup_a64_movw_uabs_g1,
+      fixup_a64_movw_uabs_g1_nc,
+      fixup_a64_movw_uabs_g2,
+      fixup_a64_movw_uabs_g2_nc,
+      fixup_a64_movw_uabs_g3,
+
+      fixup_a64_movw_sabs_g0,
+      fixup_a64_movw_sabs_g1,
+      fixup_a64_movw_sabs_g2,
+
+      fixup_a64_adr_prel_got_page,
+      fixup_a64_ld64_got_lo12_nc,
+
+      // Produce offsets relative to the module's dynamic TLS area.
+      fixup_a64_movw_dtprel_g2,
+      fixup_a64_movw_dtprel_g1,
+      fixup_a64_movw_dtprel_g1_nc,
+      fixup_a64_movw_dtprel_g0,
+      fixup_a64_movw_dtprel_g0_nc,
+      fixup_a64_add_dtprel_hi12,
+      fixup_a64_add_dtprel_lo12,
+      fixup_a64_add_dtprel_lo12_nc,
+      fixup_a64_ldst8_dtprel_lo12,
+      fixup_a64_ldst8_dtprel_lo12_nc,
+      fixup_a64_ldst16_dtprel_lo12,
+      fixup_a64_ldst16_dtprel_lo12_nc,
+      fixup_a64_ldst32_dtprel_lo12,
+      fixup_a64_ldst32_dtprel_lo12_nc,
+      fixup_a64_ldst64_dtprel_lo12,
+      fixup_a64_ldst64_dtprel_lo12_nc,
+
+      // Produce the GOT entry containing a variable's address in TLS's
+      // initial-exec mode.
+      fixup_a64_movw_gottprel_g1,
+      fixup_a64_movw_gottprel_g0_nc,
+      fixup_a64_adr_gottprel_page,
+      fixup_a64_ld64_gottprel_lo12_nc,
+      fixup_a64_ld_gottprel_prel19,
+
+      // Produce offsets relative to the thread pointer: TPIDR_EL0.
+      fixup_a64_movw_tprel_g2,
+      fixup_a64_movw_tprel_g1,
+      fixup_a64_movw_tprel_g1_nc,
+      fixup_a64_movw_tprel_g0,
+      fixup_a64_movw_tprel_g0_nc,
+      fixup_a64_add_tprel_hi12,
+      fixup_a64_add_tprel_lo12,
+      fixup_a64_add_tprel_lo12_nc,
+      fixup_a64_ldst8_tprel_lo12,
+      fixup_a64_ldst8_tprel_lo12_nc,
+      fixup_a64_ldst16_tprel_lo12,
+      fixup_a64_ldst16_tprel_lo12_nc,
+      fixup_a64_ldst32_tprel_lo12,
+      fixup_a64_ldst32_tprel_lo12_nc,
+      fixup_a64_ldst64_tprel_lo12,
+      fixup_a64_ldst64_tprel_lo12_nc,
+
+      // Produce the special fixups used by the general-dynamic TLS model.
+      fixup_a64_tlsdesc_adr_page,
+      fixup_a64_tlsdesc_ld64_lo12_nc,
+      fixup_a64_tlsdesc_add_lo12_nc,
+      fixup_a64_tlsdesc_call,
+
+
+      // Marker
+      LastTargetFixupKind,
+      NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
new file mode 100644
index 000000000000..8ec8cbf1c525
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -0,0 +1,41 @@
+//===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the AArch64MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCAsmInfo.h"
+
+using namespace llvm;
+
+AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo() {
+  PointerSize = 8;
+
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  CommentString = "//";
+  PrivateGlobalPrefix = ".L";
+  Code32Directive = ".code\t32";
+
+  Data16bitsDirective = "\t.hword\t";
+  Data32bitsDirective = "\t.word\t";
+  Data64bitsDirective = "\t.xword\t";
+
+  UseDataRegionDirectives = true;
+
+  WeakRefDirective = "\t.weak\t";
+
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
new file mode 100644
index 000000000000..a20bc471c20d
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -0,0 +1,27 @@
+//==-- AArch64MCAsmInfo.h - AArch64 asm properties -------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AArch64MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64TARGETASMINFO_H
+#define LLVM_AARCH64TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+
+  struct AArch64ELFMCAsmInfo : public MCAsmInfo {
+    explicit AArch64ELFMCAsmInfo();
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
new file mode 100644
index 000000000000..a5c591eee800
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -0,0 +1,502 @@
+//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code =//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AArch64MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class AArch64MCCodeEmitter : public MCCodeEmitter {
+  AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  MCContext &Ctx;
+
+public:
+  AArch64MCCodeEmitter(MCContext &ctx) : Ctx(ctx) {}
+
+  ~AArch64MCCodeEmitter() {}
+
+  unsigned getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  template<int MemSize>
+  unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups) const {
+    return getOffsetUImm12OpValue(MI, OpIdx, Fixups, MemSize);
+  }
+
+  unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    int MemSize) const;
+
+  unsigned getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  // Labels are handled mostly the same way: a symbol is needed, and
+  // just gets some fixup attached.
+  template<AArch64::Fixups fixupDesired>
+  unsigned getLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned  getLoadLitLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  unsigned getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  unsigned getAddressWithFixup(const MCOperand &MO,
+                               unsigned FixupKind,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const {
+    OS << (char)C;
+  }
+
+  void EmitInstruction(uint32_t Val, raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != 4; ++i) {
+      EmitByte(Val & 0xff, OS);
+      Val >>= 8;
+    }
+  }
+
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+
+  template<int hasRs, int hasRt2> unsigned
+  fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue) const;
+
+  unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue) const;
+
+  unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue) const;
+
+
+};
+
+} // end anonymous namespace
+
+unsigned AArch64MCCodeEmitter::getAddressWithFixup(const MCOperand &MO,
+                                       unsigned FixupKind,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  if (!MO.isExpr()) {
+    // This can occur for manually decoded or constructed MCInsts, but neither
+    // the assembly-parser nor instruction selection will currently produce an
+    // MCInst that's not a symbol reference.
+    assert(MO.isImm() && "Unexpected address requested");
+    return MO.getImm();
+  }
+
+  const MCExpr *Expr = MO.getExpr();
+  MCFixupKind Kind = MCFixupKind(FixupKind);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+  return 0;
+}
+
+unsigned AArch64MCCodeEmitter::
+getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       int MemSize) const {
+  const MCOperand &ImmOp = MI.getOperand(OpIdx);
+  if (ImmOp.isImm())
+    return ImmOp.getImm();
+
+  assert(ImmOp.isExpr() && "Unexpected operand type");
+  const AArch64MCExpr *Expr = cast<AArch64MCExpr>(ImmOp.getExpr());
+  unsigned FixupKind;
+
+
+  switch (Expr->getKind()) {
+  default: llvm_unreachable("Unexpected operand modifier");
+  case AArch64MCExpr::VK_AARCH64_LO12: {
+    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12,
+                                AArch64::fixup_a64_ldst16_lo12,
+                                AArch64::fixup_a64_ldst32_lo12,
+                                AArch64::fixup_a64_ldst64_lo12,
+                                AArch64::fixup_a64_ldst128_lo12 };
+    assert(MemSize <= 16 && "Invalid fixup for operation");
+    FixupKind = FixupsBySize[Log2_32(MemSize)];
+    break;
+  }
+  case AArch64MCExpr::VK_AARCH64_GOT_LO12:
+    assert(MemSize == 8 && "Invalid fixup for operation");
+    FixupKind = AArch64::fixup_a64_ld64_got_lo12_nc;
+    break;
+  case AArch64MCExpr::VK_AARCH64_DTPREL_LO12:  {
+    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12,
+                                AArch64::fixup_a64_ldst16_dtprel_lo12,
+                                AArch64::fixup_a64_ldst32_dtprel_lo12,
+                                AArch64::fixup_a64_ldst64_dtprel_lo12 };
+    assert(MemSize <= 8 && "Invalid fixup for operation");
+    FixupKind = FixupsBySize[Log2_32(MemSize)];
+    break;
+  }
+  case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: {
+    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12_nc,
+                                AArch64::fixup_a64_ldst16_dtprel_lo12_nc,
+                                AArch64::fixup_a64_ldst32_dtprel_lo12_nc,
+                                AArch64::fixup_a64_ldst64_dtprel_lo12_nc };
+    assert(MemSize <= 8 && "Invalid fixup for operation");
+    FixupKind = FixupsBySize[Log2_32(MemSize)];
+    break;
+  }
+  case AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12:
+    assert(MemSize == 8 && "Invalid fixup for operation");
+    FixupKind = AArch64::fixup_a64_ld64_gottprel_lo12_nc;
+    break;
+  case AArch64MCExpr::VK_AARCH64_TPREL_LO12:{
+    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12,
+                                AArch64::fixup_a64_ldst16_tprel_lo12,
+                                AArch64::fixup_a64_ldst32_tprel_lo12,
+                                AArch64::fixup_a64_ldst64_tprel_lo12 };
+    assert(MemSize <= 8 && "Invalid fixup for operation");
+    FixupKind = FixupsBySize[Log2_32(MemSize)];
+    break;
+  }
+  case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: {
+    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12_nc,
+                                AArch64::fixup_a64_ldst16_tprel_lo12_nc,
+                                AArch64::fixup_a64_ldst32_tprel_lo12_nc,
+                                AArch64::fixup_a64_ldst64_tprel_lo12_nc };
+    assert(MemSize <= 8 && "Invalid fixup for operation");
+    FixupKind = FixupsBySize[Log2_32(MemSize)];
+    break;
+  }
+  case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12:
+    assert(MemSize == 8 && "Invalid fixup for operation");
+    FixupKind = AArch64::fixup_a64_tlsdesc_ld64_lo12_nc;
+    break;
+  }
+
+  return getAddressWithFixup(ImmOp, FixupKind, Fixups);
+}
+
+unsigned
+AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+
+  assert(MO.isExpr());
+
+  unsigned FixupKind = 0;
+  switch(cast<AArch64MCExpr>(MO.getExpr())->getKind()) {
+  default: llvm_unreachable("Invalid expression modifier");
+  case AArch64MCExpr::VK_AARCH64_LO12:
+    FixupKind = AArch64::fixup_a64_add_lo12; break;
+  case AArch64MCExpr::VK_AARCH64_DTPREL_HI12:
+    FixupKind = AArch64::fixup_a64_add_dtprel_hi12; break;
+  case AArch64MCExpr::VK_AARCH64_DTPREL_LO12:
+    FixupKind = AArch64::fixup_a64_add_dtprel_lo12; break;
+  case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC:
+    FixupKind = AArch64::fixup_a64_add_dtprel_lo12_nc; break;
+  case AArch64MCExpr::VK_AARCH64_TPREL_HI12:
+    FixupKind = AArch64::fixup_a64_add_tprel_hi12; break;
+  case AArch64MCExpr::VK_AARCH64_TPREL_LO12:
+    FixupKind = AArch64::fixup_a64_add_tprel_lo12; break;
+  case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC:
+    FixupKind = AArch64::fixup_a64_add_tprel_lo12_nc; break;
+  case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12:
+    FixupKind = AArch64::fixup_a64_tlsdesc_add_lo12_nc; break;
+  }
+
+  return getAddressWithFixup(MO, FixupKind, Fixups);
+}
+
+unsigned
+AArch64MCCodeEmitter::getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+
+  assert(MO.isExpr());
+
+  unsigned Modifier = AArch64MCExpr::VK_AARCH64_None;
+  if (const AArch64MCExpr *Expr = dyn_cast<AArch64MCExpr>(MO.getExpr()))
+    Modifier = Expr->getKind();
+
+  unsigned FixupKind = 0;
+  switch(Modifier) {
+  case AArch64MCExpr::VK_AARCH64_None:
+    FixupKind = AArch64::fixup_a64_adr_prel_page;
+    break;
+  case AArch64MCExpr::VK_AARCH64_GOT:
+    FixupKind = AArch64::fixup_a64_adr_prel_got_page;
+    break;
+  case AArch64MCExpr::VK_AARCH64_GOTTPREL:
+    FixupKind = AArch64::fixup_a64_adr_gottprel_page;
+    break;
+  case AArch64MCExpr::VK_AARCH64_TLSDESC:
+    FixupKind = AArch64::fixup_a64_tlsdesc_adr_page;
+    break;
+  default:
+    llvm_unreachable("Unknown symbol reference kind for ADRP instruction");
+  }
+
+  return getAddressWithFixup(MO, FixupKind, Fixups);
+}
+
+unsigned
+AArch64MCCodeEmitter::getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Only immediate expected for shift");
+
+  return ((32 - MO.getImm()) & 0x1f) | (31 - MO.getImm()) << 6;
+}
+
+unsigned
+AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Only immediate expected for shift");
+
+  return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6;
+}
+
+
+template<AArch64::Fixups fixupDesired> unsigned
+AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI,
+                                      unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  if (MO.isExpr())
+    return getAddressWithFixup(MO, fixupDesired, Fixups);
+
+  assert(MO.isImm());
+  return MO.getImm();
+}
+
+unsigned
+AArch64MCCodeEmitter::getLoadLitLabelOpValue(const MCInst &MI,
+                                       unsigned OpIdx,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  if (MO.isImm())
+    return MO.getImm();
+
+  assert(MO.isExpr());
+
+  unsigned FixupKind;
+  if (isa<AArch64MCExpr>(MO.getExpr())) {
+    assert(dyn_cast<AArch64MCExpr>(MO.getExpr())->getKind()
+           == AArch64MCExpr::VK_AARCH64_GOTTPREL
+           && "Invalid symbol modifier for literal load");
+    FixupKind = AArch64::fixup_a64_ld_gottprel_prel19;
+  } else {
+    FixupKind = AArch64::fixup_a64_ld_prel;
+  }
+
+  return getAddressWithFixup(MO, FixupKind, Fixups);
+}
+
+
+unsigned
+AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                       const MCOperand &MO,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  if (MO.isReg()) {
+    return Ctx.getRegisterInfo().getEncodingValue(MO.getReg());
+  } else if (MO.isImm()) {
+    return static_cast<unsigned>(MO.getImm());
+  }
+
+  llvm_unreachable("Unable to encode MCOperand!");
+  return 0;
+}
+
+unsigned
+AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &UImm16MO = MI.getOperand(OpIdx);
+  const MCOperand &ShiftMO = MI.getOperand(OpIdx + 1);
+
+  unsigned Result = static_cast<unsigned>(ShiftMO.getImm()) << 16;
+
+  if (UImm16MO.isImm()) {
+    Result |= UImm16MO.getImm();
+    return Result;
+  }
+
+  const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
+  AArch64::Fixups requestedFixup;
+  switch (A64E->getKind()) {
+  default: llvm_unreachable("unexpected expression modifier");
+  case AArch64MCExpr::VK_AARCH64_ABS_G0:
+    requestedFixup = AArch64::fixup_a64_movw_uabs_g0; break;
+  case AArch64MCExpr::VK_AARCH64_ABS_G0_NC:
+    requestedFixup = AArch64::fixup_a64_movw_uabs_g0_nc; break;
+  case AArch64MCExpr::VK_AARCH64_ABS_G1:
+    requestedFixup = AArch64::fixup_a64_movw_uabs_g1; break;
+  case AArch64MCExpr::VK_AARCH64_ABS_G1_NC:
+    requestedFixup = AArch64::fixup_a64_movw_uabs_g1_nc; break;
+  case AArch64MCExpr::VK_AARCH64_ABS_G2:
+    requestedFixup = AArch64::fixup_a64_movw_uabs_g2; break;
+  case AArch64MCExpr::VK_AARCH64_ABS_G2_NC:
+    requestedFixup = AArch64::fixup_a64_movw_uabs_g2_nc; break;
+  case AArch64MCExpr::VK_AARCH64_ABS_G3:
+    requestedFixup = AArch64::fixup_a64_movw_uabs_g3; break;
+  case AArch64MCExpr::VK_AARCH64_SABS_G0:
+    requestedFixup = AArch64::fixup_a64_movw_sabs_g0; break;
+  case AArch64MCExpr::VK_AARCH64_SABS_G1:
+    requestedFixup = AArch64::fixup_a64_movw_sabs_g1; break;
+  case AArch64MCExpr::VK_AARCH64_SABS_G2:
+    requestedFixup = AArch64::fixup_a64_movw_sabs_g2; break;
+  case AArch64MCExpr::VK_AARCH64_DTPREL_G2:
+    requestedFixup = AArch64::fixup_a64_movw_dtprel_g2; break;
+  case AArch64MCExpr::VK_AARCH64_DTPREL_G1:
+    requestedFixup = AArch64::fixup_a64_movw_dtprel_g1; break;
+  case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC:
+    requestedFixup = AArch64::fixup_a64_movw_dtprel_g1_nc; break;
+  case AArch64MCExpr::VK_AARCH64_DTPREL_G0:
+    requestedFixup = AArch64::fixup_a64_movw_dtprel_g0; break;
+  case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC:
+    requestedFixup = AArch64::fixup_a64_movw_dtprel_g0_nc; break;
+  case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1:
+    requestedFixup = AArch64::fixup_a64_movw_gottprel_g1; break;
+  case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC:
+    requestedFixup = AArch64::fixup_a64_movw_gottprel_g0_nc; break;
+  case AArch64MCExpr::VK_AARCH64_TPREL_G2:
+    requestedFixup = AArch64::fixup_a64_movw_tprel_g2; break;
+  case AArch64MCExpr::VK_AARCH64_TPREL_G1:
+    requestedFixup = AArch64::fixup_a64_movw_tprel_g1; break;
+  case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC:
+    requestedFixup = AArch64::fixup_a64_movw_tprel_g1_nc; break;
+  case AArch64MCExpr::VK_AARCH64_TPREL_G0:
+    requestedFixup = AArch64::fixup_a64_movw_tprel_g0; break;
+  case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC:
+    requestedFixup = AArch64::fixup_a64_movw_tprel_g0_nc; break;
+  }
+
+  return Result | getAddressWithFixup(UImm16MO, requestedFixup, Fixups);
+}
+
+template<int hasRs, int hasRt2> unsigned
+AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
+                                            unsigned EncodedValue) const {
+  if (!hasRs) EncodedValue |= 0x001F0000;
+  if (!hasRt2) EncodedValue |= 0x00007C00;
+
+  return EncodedValue;
+}
+
+unsigned
+AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue) const {
+  // If one of the signed fixup kinds is applied to a MOVZ instruction, the
+  // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
+  // job to ensure that any bits possibly affected by this are 0. This means we
+  // must zero out bit 30 (essentially emitting a MOVN).
+  MCOperand UImm16MO = MI.getOperand(1);
+
+  // Nothing to do if there's no fixup.
+  if (UImm16MO.isImm())
+    return EncodedValue;
+
+  const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
+  switch (A64E->getKind()) {
+  case AArch64MCExpr::VK_AARCH64_SABS_G0:
+  case AArch64MCExpr::VK_AARCH64_SABS_G1:
+  case AArch64MCExpr::VK_AARCH64_SABS_G2:
+  case AArch64MCExpr::VK_AARCH64_DTPREL_G2:
+  case AArch64MCExpr::VK_AARCH64_DTPREL_G1:
+  case AArch64MCExpr::VK_AARCH64_DTPREL_G0:
+  case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1:
+  case AArch64MCExpr::VK_AARCH64_TPREL_G2:
+  case AArch64MCExpr::VK_AARCH64_TPREL_G1:
+  case AArch64MCExpr::VK_AARCH64_TPREL_G0:
+    return EncodedValue & ~(1u << 30);
+  default:
+    // Nothing to do for an unsigned fixup.
+    return EncodedValue;
+  }
+
+  llvm_unreachable("Should have returned by now");
+}
+
+unsigned
+AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI,
+                                 unsigned EncodedValue) const {
+  // The Ra field of SMULH and UMULH is unused: it should be assembled as 31
+  // (i.e. all bits 1) but is ignored by the processor.
+  EncodedValue |= 0x1f << 10;
+  return EncodedValue;
+}
+
+MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  return new AArch64MCCodeEmitter(Ctx);
+}
+
+void AArch64MCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  if (MI.getOpcode() == AArch64::TLSDESCCALL) {
+    // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
+    // following (BLR) instruction. It doesn't emit any code itself so it
+    // doesn't go through the normal TableGenerated channels.
+    MCFixupKind Fixup = MCFixupKind(AArch64::fixup_a64_tlsdesc_call);
+    const MCExpr *Expr;
+    Expr = AArch64MCExpr::CreateTLSDesc(MI.getOperand(0).getExpr(), Ctx);
+    Fixups.push_back(MCFixup::Create(0, Expr, Fixup));
+    return;
+  }
+
+  uint32_t Binary = getBinaryCodeForInstr(MI, Fixups);
+
+  EmitInstruction(Binary, OS);
+}
+
+
+#include "AArch64GenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
new file mode 100644
index 000000000000..c1abfe74dfdd
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -0,0 +1,178 @@
+//===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64mcexpr"
+#include "AArch64MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/Object/ELF.h"
+
+using namespace llvm;
+
+const AArch64MCExpr*
+AArch64MCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+                      MCContext &Ctx) {
+  return new (Ctx) AArch64MCExpr(Kind, Expr);
+}
+
+void AArch64MCExpr::PrintImpl(raw_ostream &OS) const {
+  switch (Kind) {
+  default: llvm_unreachable("Invalid kind!");
+  case VK_AARCH64_GOT:              OS << ":got:"; break;
+  case VK_AARCH64_GOT_LO12:         OS << ":got_lo12:"; break;
+  case VK_AARCH64_LO12:             OS << ":lo12:"; break;
+  case VK_AARCH64_ABS_G0:           OS << ":abs_g0:"; break;
+  case VK_AARCH64_ABS_G0_NC:        OS << ":abs_g0_nc:"; break;
+  case VK_AARCH64_ABS_G1:           OS << ":abs_g1:"; break;
+  case VK_AARCH64_ABS_G1_NC:        OS << ":abs_g1_nc:"; break;
+  case VK_AARCH64_ABS_G2:           OS << ":abs_g2:"; break;
+  case VK_AARCH64_ABS_G2_NC:        OS << ":abs_g2_nc:"; break;
+  case VK_AARCH64_ABS_G3:           OS << ":abs_g3:"; break;
+  case VK_AARCH64_SABS_G0:          OS << ":abs_g0_s:"; break;
+  case VK_AARCH64_SABS_G1:          OS << ":abs_g1_s:"; break;
+  case VK_AARCH64_SABS_G2:          OS << ":abs_g2_s:"; break;
+  case VK_AARCH64_DTPREL_G2:        OS << ":dtprel_g2:"; break;
+  case VK_AARCH64_DTPREL_G1:        OS << ":dtprel_g1:"; break;
+  case VK_AARCH64_DTPREL_G1_NC:     OS << ":dtprel_g1_nc:"; break;
+  case VK_AARCH64_DTPREL_G0:        OS << ":dtprel_g0:"; break;
+  case VK_AARCH64_DTPREL_G0_NC:     OS << ":dtprel_g0_nc:"; break;
+  case VK_AARCH64_DTPREL_HI12:      OS << ":dtprel_hi12:"; break;
+  case VK_AARCH64_DTPREL_LO12:      OS << ":dtprel_lo12:"; break;
+  case VK_AARCH64_DTPREL_LO12_NC:   OS << ":dtprel_lo12_nc:"; break;
+  case VK_AARCH64_GOTTPREL_G1:      OS << ":gottprel_g1:"; break;
+  case VK_AARCH64_GOTTPREL_G0_NC:   OS << ":gottprel_g0_nc:"; break;
+  case VK_AARCH64_GOTTPREL:         OS << ":gottprel:"; break;
+  case VK_AARCH64_GOTTPREL_LO12:    OS << ":gottprel_lo12:"; break;
+  case VK_AARCH64_TPREL_G2:         OS << ":tprel_g2:"; break;
+  case VK_AARCH64_TPREL_G1:         OS << ":tprel_g1:"; break;
+  case VK_AARCH64_TPREL_G1_NC:      OS << ":tprel_g1_nc:"; break;
+  case VK_AARCH64_TPREL_G0:         OS << ":tprel_g0:"; break;
+  case VK_AARCH64_TPREL_G0_NC:      OS << ":tprel_g0_nc:"; break;
+  case VK_AARCH64_TPREL_HI12:       OS << ":tprel_hi12:"; break;
+  case VK_AARCH64_TPREL_LO12:       OS << ":tprel_lo12:"; break;
+  case VK_AARCH64_TPREL_LO12_NC:    OS << ":tprel_lo12_nc:"; break;
+  case VK_AARCH64_TLSDESC:          OS << ":tlsdesc:"; break;
+  case VK_AARCH64_TLSDESC_LO12:     OS << ":tlsdesc_lo12:"; break;
+
+  }
+
+  const MCExpr *Expr = getSubExpr();
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << '(';
+  Expr->print(OS);
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << ')';
+}
+
+bool
+AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                         const MCAsmLayout *Layout) const {
+  return getSubExpr()->EvaluateAsRelocatable(Res, *Layout);
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expression");
+    break;
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef: {
+    // We're known to be under a TLS fixup, so any symbol should be
+    // modified. There should be only one.
+    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+    MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
+    MCELF::SetType(SD, ELF::STT_TLS);
+    break;
+  }
+
+  case MCExpr::Unary:
+    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  switch (getKind()) {
+  default:
+    return;
+  case VK_AARCH64_DTPREL_G2:
+  case VK_AARCH64_DTPREL_G1:
+  case VK_AARCH64_DTPREL_G1_NC:
+  case VK_AARCH64_DTPREL_G0:
+  case VK_AARCH64_DTPREL_G0_NC:
+  case VK_AARCH64_DTPREL_HI12:
+  case VK_AARCH64_DTPREL_LO12:
+  case VK_AARCH64_DTPREL_LO12_NC:
+  case VK_AARCH64_GOTTPREL_G1:
+  case VK_AARCH64_GOTTPREL_G0_NC:
+  case VK_AARCH64_GOTTPREL:
+  case VK_AARCH64_GOTTPREL_LO12:
+  case VK_AARCH64_TPREL_G2:
+  case VK_AARCH64_TPREL_G1:
+  case VK_AARCH64_TPREL_G1_NC:
+  case VK_AARCH64_TPREL_G0:
+  case VK_AARCH64_TPREL_G0_NC:
+  case VK_AARCH64_TPREL_HI12:
+  case VK_AARCH64_TPREL_LO12:
+  case VK_AARCH64_TPREL_LO12_NC:
+  case VK_AARCH64_TLSDESC:
+  case VK_AARCH64_TLSDESC_LO12:
+    break;
+  }
+
+  fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+// FIXME: really do above: now that two backends are using it.
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
+  switch (Value->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expr!");
+    break;
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+    AddValueSymbolsImpl(BE->getLHS(), Asm);
+    AddValueSymbolsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef:
+    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+    break;
+
+  case MCExpr::Unary:
+    AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
+  AddValueSymbolsImpl(getSubExpr(), Asm);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
new file mode 100644
index 000000000000..c0e3b29474d1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -0,0 +1,167 @@
+//==- AArch64MCExpr.h - AArch64 specific MC expression classes --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes AArch64-specific MCExprs, used for modifiers like
+// ":lo12:" or ":gottprel_g1:".
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64MCEXPR_H
+#define LLVM_AARCH64MCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class AArch64MCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_AARCH64_None,
+    VK_AARCH64_GOT,      // :got: modifier in assembly
+    VK_AARCH64_GOT_LO12, // :got_lo12:
+    VK_AARCH64_LO12,     // :lo12:
+
+    VK_AARCH64_ABS_G0, // :abs_g0:
+    VK_AARCH64_ABS_G0_NC, // :abs_g0_nc:
+    VK_AARCH64_ABS_G1,
+    VK_AARCH64_ABS_G1_NC,
+    VK_AARCH64_ABS_G2,
+    VK_AARCH64_ABS_G2_NC,
+    VK_AARCH64_ABS_G3,
+
+    VK_AARCH64_SABS_G0, // :abs_g0_s:
+    VK_AARCH64_SABS_G1,
+    VK_AARCH64_SABS_G2,
+
+    VK_AARCH64_DTPREL_G2, // :dtprel_g2:
+    VK_AARCH64_DTPREL_G1,
+    VK_AARCH64_DTPREL_G1_NC,
+    VK_AARCH64_DTPREL_G0,
+    VK_AARCH64_DTPREL_G0_NC,
+    VK_AARCH64_DTPREL_HI12,
+    VK_AARCH64_DTPREL_LO12,
+    VK_AARCH64_DTPREL_LO12_NC,
+
+    VK_AARCH64_GOTTPREL_G1, // :gottprel:
+    VK_AARCH64_GOTTPREL_G0_NC,
+    VK_AARCH64_GOTTPREL,
+    VK_AARCH64_GOTTPREL_LO12,
+
+    VK_AARCH64_TPREL_G2, // :tprel:
+    VK_AARCH64_TPREL_G1,
+    VK_AARCH64_TPREL_G1_NC,
+    VK_AARCH64_TPREL_G0,
+    VK_AARCH64_TPREL_G0_NC,
+    VK_AARCH64_TPREL_HI12,
+    VK_AARCH64_TPREL_LO12,
+    VK_AARCH64_TPREL_LO12_NC,
+
+    VK_AARCH64_TLSDESC, // :tlsdesc:
+    VK_AARCH64_TLSDESC_LO12
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit AArch64MCExpr(VariantKind _Kind, const MCExpr *_Expr)
+    : Kind(_Kind), Expr(_Expr) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const AArch64MCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+                                     MCContext &Ctx);
+
+  static const AArch64MCExpr *CreateLo12(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_AARCH64_LO12, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateGOT(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_AARCH64_GOT, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateGOTLo12(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+    return Create(VK_AARCH64_GOT_LO12, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateDTPREL_G1(const MCExpr *Expr,
+                                             MCContext &Ctx) {
+    return Create(VK_AARCH64_DTPREL_G1, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateDTPREL_G0_NC(const MCExpr *Expr,
+                                                MCContext &Ctx) {
+    return Create(VK_AARCH64_DTPREL_G0_NC, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateGOTTPREL(const MCExpr *Expr,
+                                             MCContext &Ctx) {
+    return Create(VK_AARCH64_GOTTPREL, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateGOTTPRELLo12(const MCExpr *Expr,
+                                                 MCContext &Ctx) {
+    return Create(VK_AARCH64_GOTTPREL_LO12, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateTLSDesc(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+    return Create(VK_AARCH64_TLSDESC, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateTLSDescLo12(const MCExpr *Expr,
+                                                MCContext &Ctx) {
+    return Create(VK_AARCH64_TLSDESC_LO12, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateTPREL_G1(const MCExpr *Expr,
+                                             MCContext &Ctx) {
+    return Create(VK_AARCH64_TPREL_G1, Expr, Ctx);
+  }
+
+  static const AArch64MCExpr *CreateTPREL_G0_NC(const MCExpr *Expr,
+                                                MCContext &Ctx) {
+    return Create(VK_AARCH64_TPREL_G0_NC, Expr, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// @}
+
+  void PrintImpl(raw_ostream &OS) const;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const;
+  void AddValueSymbols(MCAssembler *) const;
+  const MCSection *FindAssociatedSection() const {
+    return getSubExpr()->FindAssociatedSection();
+  }
+
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const;
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+
+  static bool classof(const AArch64MCExpr *) { return true; }
+
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
new file mode 100644
index 000000000000..7960db08c8d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -0,0 +1,194 @@
+//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AArch64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCTargetDesc.h"
+#include "AArch64ELFStreamer.h"
+#include "AArch64MCAsmInfo.h"
+#include "InstPrinter/AArch64InstPrinter.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define GET_REGINFO_MC_DESC
+#include "AArch64GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "AArch64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AArch64GenSubtargetInfo.inc"
+
+using namespace llvm;
+
+MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT,
+                                                          StringRef CPU,
+                                                          StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitAArch64MCSubtargetInfo(X, TT, CPU, "");
+  return X;
+}
+
+
+static MCInstrInfo *createAArch64MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitAArch64MCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitAArch64MCRegisterInfo(X, AArch64::X30);
+  return X;
+}
+
+static MCAsmInfo *createAArch64MCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+
+  MCAsmInfo *MAI = new AArch64ELFMCAsmInfo();
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(AArch64::XSP, 0);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                 CodeModel::Model CM,
+                                                 CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) {
+    // On ELF platforms the default static relocation model has a smart enough
+    // linker to cope with referencing external symbols defined in a shared
+    // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+    RM = Reloc::Static;
+  }
+
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &OS,
+                                    MCCodeEmitter *Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  return createAArch64ELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+
+static MCInstPrinter *createAArch64MCInstPrinter(const Target &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI,
+                                                 const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new AArch64InstPrinter(MAI, MII, MRI, STI);
+  return 0;
+}
+
+namespace {
+
+class AArch64MCInstrAnalysis : public MCInstrAnalysis {
+public:
+  AArch64MCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+  virtual bool isUnconditionalBranch(const MCInst &Inst) const {
+    if (Inst.getOpcode() == AArch64::Bcc
+        && Inst.getOperand(0).getImm() == A64CC::AL)
+      return true;
+    return MCInstrAnalysis::isUnconditionalBranch(Inst);
+  }
+
+  virtual bool isConditionalBranch(const MCInst &Inst) const {
+    if (Inst.getOpcode() == AArch64::Bcc
+        && Inst.getOperand(0).getImm() == A64CC::AL)
+      return false;
+    return MCInstrAnalysis::isConditionalBranch(Inst);
+  }
+
+  uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                          uint64_t Size) const {
+    unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0;
+    // FIXME: We only handle PCRel branches for now.
+    if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType
+        != MCOI::OPERAND_PCREL)
+      return -1ULL;
+
+    int64_t Imm = Inst.getOperand(LblOperand).getImm();
+
+    return Addr + Imm;
+  }
+};
+
+}
+
+static MCInstrAnalysis *createAArch64MCInstrAnalysis(const MCInstrInfo *Info) {
+  return new AArch64MCInstrAnalysis(Info);
+}
+
+
+
+extern "C" void LLVMInitializeAArch64TargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn A(TheAArch64Target, createAArch64MCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheAArch64Target,
+                                        createAArch64MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheAArch64Target,
+                                      createAArch64MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheAArch64Target,
+                                    createAArch64MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  using AArch64_MC::createAArch64MCSubtargetInfo;
+  TargetRegistry::RegisterMCSubtargetInfo(TheAArch64Target,
+                                          createAArch64MCSubtargetInfo);
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(TheAArch64Target,
+                                          createAArch64MCInstrAnalysis);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheAArch64Target,
+                                        createAArch64MCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheAArch64Target,
+                                       createAArch64AsmBackend);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheAArch64Target,
+                                           createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheAArch64Target,
+                                        createAArch64MCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
new file mode 100644
index 000000000000..3849fe379513
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -0,0 +1,65 @@
+//===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AArch64 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64MCTARGETDESC_H
+#define LLVM_AARCH64MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+extern Target TheAArch64Target;
+
+namespace AArch64_MC {
+  MCSubtargetInfo *createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                StringRef FS);
+}
+
+MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+                                          const MCRegisterInfo &MRI,
+                                          const MCSubtargetInfo &STI,
+                                          MCContext &Ctx);
+
+MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS,
+                                             uint8_t OSABI);
+
+MCAsmBackend *createAArch64AsmBackend(const Target &T, StringRef TT,
+                                      StringRef CPU);
+
+} // End llvm namespace
+
+// Defines symbolic names for AArch64 registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "AArch64GenRegisterInfo.inc"
+
+// Defines symbolic names for the AArch64 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "AArch64GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AArch64GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/README.txt b/contrib/llvm/lib/Target/AArch64/README.txt
new file mode 100644
index 000000000000..601990f17dee
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/README.txt
@@ -0,0 +1,2 @@
+This file will contain changes that need to be made before AArch64 can become an
+officially supported target. Currently a placeholder.
diff --git a/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
new file mode 100644
index 000000000000..b8099cb26b0f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -0,0 +1,24 @@
+//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the key registration step for the architecture.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheAArch64Target;
+
+extern "C" void LLVMInitializeAArch64TargetInfo() {
+  RegisterTarget<Triple::aarch64>
+    X(TheAArch64Target, "aarch64", "AArch64");
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
new file mode 100644
index 000000000000..1678559aa084
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -0,0 +1,1103 @@
+//===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides basic encoding and assembly information for AArch64.
+//
+//===----------------------------------------------------------------------===//
+#include "AArch64BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Regex.h"
+
+using namespace llvm;
+
+StringRef NamedImmMapper::toString(uint32_t Value, bool &Valid) const {
+  for (unsigned i = 0; i < NumPairs; ++i) {
+    if (Pairs[i].Value == Value) {
+      Valid = true;
+      return Pairs[i].Name;
+    }
+  }
+
+  Valid = false;
+  return StringRef();
+}
+
+uint32_t NamedImmMapper::fromString(StringRef Name, bool &Valid) const {
+  std::string LowerCaseName = Name.lower();
+  for (unsigned i = 0; i < NumPairs; ++i) {
+    if (Pairs[i].Name == LowerCaseName) {
+      Valid = true;
+      return Pairs[i].Value;
+    }
+  }
+
+  Valid = false;
+  return -1;
+}
+
+bool NamedImmMapper::validImm(uint32_t Value) const {
+  return Value < TooBigImm;
+}
+
+const NamedImmMapper::Mapping A64AT::ATMapper::ATPairs[] = {
+  {"s1e1r", S1E1R},
+  {"s1e2r", S1E2R},
+  {"s1e3r", S1E3R},
+  {"s1e1w", S1E1W},
+  {"s1e2w", S1E2W},
+  {"s1e3w", S1E3W},
+  {"s1e0r", S1E0R},
+  {"s1e0w", S1E0W},
+  {"s12e1r", S12E1R},
+  {"s12e1w", S12E1W},
+  {"s12e0r", S12E0R},
+  {"s12e0w", S12E0W},
+};
+
+A64AT::ATMapper::ATMapper()
+  : NamedImmMapper(ATPairs, 0) {}
+
+const NamedImmMapper::Mapping A64DB::DBarrierMapper::DBarrierPairs[] = {
+  {"oshld", OSHLD},
+  {"oshst", OSHST},
+  {"osh", OSH},
+  {"nshld", NSHLD},
+  {"nshst", NSHST},
+  {"nsh", NSH},
+  {"ishld", ISHLD},
+  {"ishst", ISHST},
+  {"ish", ISH},
+  {"ld", LD},
+  {"st", ST},
+  {"sy", SY}
+};
+
+A64DB::DBarrierMapper::DBarrierMapper()
+  : NamedImmMapper(DBarrierPairs, 16u) {}
+
+const NamedImmMapper::Mapping A64DC::DCMapper::DCPairs[] = {
+  {"zva", ZVA},
+  {"ivac", IVAC},
+  {"isw", ISW},
+  {"cvac", CVAC},
+  {"csw", CSW},
+  {"cvau", CVAU},
+  {"civac", CIVAC},
+  {"cisw", CISW}
+};
+
+A64DC::DCMapper::DCMapper()
+  : NamedImmMapper(DCPairs, 0) {}
+
+const NamedImmMapper::Mapping A64IC::ICMapper::ICPairs[] = {
+  {"ialluis",  IALLUIS},
+  {"iallu", IALLU},
+  {"ivau", IVAU}
+};
+
+A64IC::ICMapper::ICMapper()
+  : NamedImmMapper(ICPairs, 0) {}
+
+const NamedImmMapper::Mapping A64ISB::ISBMapper::ISBPairs[] = {
+  {"sy",  SY},
+};
+
+A64ISB::ISBMapper::ISBMapper()
+  : NamedImmMapper(ISBPairs, 16) {}
+
+const NamedImmMapper::Mapping A64PRFM::PRFMMapper::PRFMPairs[] = {
+  {"pldl1keep", PLDL1KEEP},
+  {"pldl1strm", PLDL1STRM},
+  {"pldl2keep", PLDL2KEEP},
+  {"pldl2strm", PLDL2STRM},
+  {"pldl3keep", PLDL3KEEP},
+  {"pldl3strm", PLDL3STRM},
+  {"plil1keep", PLIL1KEEP},
+  {"plil1strm", PLIL1STRM},
+  {"plil2keep", PLIL2KEEP},
+  {"plil2strm", PLIL2STRM},
+  {"plil3keep", PLIL3KEEP},
+  {"plil3strm", PLIL3STRM},
+  {"pstl1keep", PSTL1KEEP},
+  {"pstl1strm", PSTL1STRM},
+  {"pstl2keep", PSTL2KEEP},
+  {"pstl2strm", PSTL2STRM},
+  {"pstl3keep", PSTL3KEEP},
+  {"pstl3strm", PSTL3STRM}
+};
+
+A64PRFM::PRFMMapper::PRFMMapper()
+  : NamedImmMapper(PRFMPairs, 32) {}
+
+const NamedImmMapper::Mapping A64PState::PStateMapper::PStatePairs[] = {
+  {"spsel", SPSel},
+  {"daifset", DAIFSet},
+  {"daifclr", DAIFClr}
+};
+
+A64PState::PStateMapper::PStateMapper()
+  : NamedImmMapper(PStatePairs, 0) {}
+
+const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = {
+  {"mdccsr_el0", MDCCSR_EL0},
+  {"dbgdtrrx_el0", DBGDTRRX_EL0},
+  {"mdrar_el1", MDRAR_EL1},
+  {"oslsr_el1", OSLSR_EL1},
+  {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1},
+  {"pmceid0_el0", PMCEID0_EL0},
+  {"pmceid1_el0", PMCEID1_EL0},
+  {"midr_el1", MIDR_EL1},
+  {"ccsidr_el1", CCSIDR_EL1},
+  {"clidr_el1", CLIDR_EL1},
+  {"ctr_el0", CTR_EL0},
+  {"mpidr_el1", MPIDR_EL1},
+  {"revidr_el1", REVIDR_EL1},
+  {"aidr_el1", AIDR_EL1},
+  {"dczid_el0", DCZID_EL0},
+  {"id_pfr0_el1", ID_PFR0_EL1},
+  {"id_pfr1_el1", ID_PFR1_EL1},
+  {"id_dfr0_el1", ID_DFR0_EL1},
+  {"id_afr0_el1", ID_AFR0_EL1},
+  {"id_mmfr0_el1", ID_MMFR0_EL1},
+  {"id_mmfr1_el1", ID_MMFR1_EL1},
+  {"id_mmfr2_el1", ID_MMFR2_EL1},
+  {"id_mmfr3_el1", ID_MMFR3_EL1},
+  {"id_isar0_el1", ID_ISAR0_EL1},
+  {"id_isar1_el1", ID_ISAR1_EL1},
+  {"id_isar2_el1", ID_ISAR2_EL1},
+  {"id_isar3_el1", ID_ISAR3_EL1},
+  {"id_isar4_el1", ID_ISAR4_EL1},
+  {"id_isar5_el1", ID_ISAR5_EL1},
+  {"id_aa64pfr0_el1", ID_AA64PFR0_EL1},
+  {"id_aa64pfr1_el1", ID_AA64PFR1_EL1},
+  {"id_aa64dfr0_el1", ID_AA64DFR0_EL1},
+  {"id_aa64dfr1_el1", ID_AA64DFR1_EL1},
+  {"id_aa64afr0_el1", ID_AA64AFR0_EL1},
+  {"id_aa64afr1_el1", ID_AA64AFR1_EL1},
+  {"id_aa64isar0_el1", ID_AA64ISAR0_EL1},
+  {"id_aa64isar1_el1", ID_AA64ISAR1_EL1},
+  {"id_aa64mmfr0_el1", ID_AA64MMFR0_EL1},
+  {"id_aa64mmfr1_el1", ID_AA64MMFR1_EL1},
+  {"mvfr0_el1", MVFR0_EL1},
+  {"mvfr1_el1", MVFR1_EL1},
+  {"mvfr2_el1", MVFR2_EL1},
+  {"rvbar_el1", RVBAR_EL1},
+  {"rvbar_el2", RVBAR_EL2},
+  {"rvbar_el3", RVBAR_EL3},
+  {"isr_el1", ISR_EL1},
+  {"cntpct_el0", CNTPCT_EL0},
+  {"cntvct_el0", CNTVCT_EL0},
+
+  // Trace registers
+  {"trcstatr", TRCSTATR},
+  {"trcidr8", TRCIDR8},
+  {"trcidr9", TRCIDR9},
+  {"trcidr10", TRCIDR10},
+  {"trcidr11", TRCIDR11},
+  {"trcidr12", TRCIDR12},
+  {"trcidr13", TRCIDR13},
+  {"trcidr0", TRCIDR0},
+  {"trcidr1", TRCIDR1},
+  {"trcidr2", TRCIDR2},
+  {"trcidr3", TRCIDR3},
+  {"trcidr4", TRCIDR4},
+  {"trcidr5", TRCIDR5},
+  {"trcidr6", TRCIDR6},
+  {"trcidr7", TRCIDR7},
+  {"trcoslsr", TRCOSLSR},
+  {"trcpdsr", TRCPDSR},
+  {"trcdevaff0", TRCDEVAFF0},
+  {"trcdevaff1", TRCDEVAFF1},
+  {"trclsr", TRCLSR},
+  {"trcauthstatus", TRCAUTHSTATUS},
+  {"trcdevarch", TRCDEVARCH},
+  {"trcdevid", TRCDEVID},
+  {"trcdevtype", TRCDEVTYPE},
+  {"trcpidr4", TRCPIDR4},
+  {"trcpidr5", TRCPIDR5},
+  {"trcpidr6", TRCPIDR6},
+  {"trcpidr7", TRCPIDR7},
+  {"trcpidr0", TRCPIDR0},
+  {"trcpidr1", TRCPIDR1},
+  {"trcpidr2", TRCPIDR2},
+  {"trcpidr3", TRCPIDR3},
+  {"trccidr0", TRCCIDR0},
+  {"trccidr1", TRCCIDR1},
+  {"trccidr2", TRCCIDR2},
+  {"trccidr3", TRCCIDR3},
+
+  // GICv3 registers
+  {"icc_iar1_el1", ICC_IAR1_EL1},
+  {"icc_iar0_el1", ICC_IAR0_EL1},
+  {"icc_hppir1_el1", ICC_HPPIR1_EL1},
+  {"icc_hppir0_el1", ICC_HPPIR0_EL1},
+  {"icc_rpr_el1", ICC_RPR_EL1},
+  {"ich_vtr_el2", ICH_VTR_EL2},
+  {"ich_eisr_el2", ICH_EISR_EL2},
+  {"ich_elsr_el2", ICH_ELSR_EL2}
+};
+
+A64SysReg::MRSMapper::MRSMapper() {
+    InstPairs = &MRSPairs[0];
+    NumInstPairs = llvm::array_lengthof(MRSPairs);
+}
+
+const NamedImmMapper::Mapping A64SysReg::MSRMapper::MSRPairs[] = {
+  {"dbgdtrtx_el0", DBGDTRTX_EL0},
+  {"oslar_el1", OSLAR_EL1},
+  {"pmswinc_el0", PMSWINC_EL0},
+
+  // Trace registers
+  {"trcoslar", TRCOSLAR},
+  {"trclar", TRCLAR},
+
+  // GICv3 registers
+  {"icc_eoir1_el1", ICC_EOIR1_EL1},
+  {"icc_eoir0_el1", ICC_EOIR0_EL1},
+  {"icc_dir_el1", ICC_DIR_EL1},
+  {"icc_sgi1r_el1", ICC_SGI1R_EL1},
+  {"icc_asgi1r_el1", ICC_ASGI1R_EL1},
+  {"icc_sgi0r_el1", ICC_SGI0R_EL1}
+};
+
+A64SysReg::MSRMapper::MSRMapper() {
+    InstPairs = &MSRPairs[0];
+    NumInstPairs = llvm::array_lengthof(MSRPairs);
+}
+
+
+const NamedImmMapper::Mapping A64SysReg::SysRegMapper::SysRegPairs[] = {
+  {"osdtrrx_el1", OSDTRRX_EL1},
+  {"osdtrtx_el1",  OSDTRTX_EL1},
+  {"teecr32_el1", TEECR32_EL1},
+  {"mdccint_el1", MDCCINT_EL1},
+  {"mdscr_el1", MDSCR_EL1},
+  {"dbgdtr_el0", DBGDTR_EL0},
+  {"oseccr_el1", OSECCR_EL1},
+  {"dbgvcr32_el2", DBGVCR32_EL2},
+  {"dbgbvr0_el1", DBGBVR0_EL1},
+  {"dbgbvr1_el1", DBGBVR1_EL1},
+  {"dbgbvr2_el1", DBGBVR2_EL1},
+  {"dbgbvr3_el1", DBGBVR3_EL1},
+  {"dbgbvr4_el1", DBGBVR4_EL1},
+  {"dbgbvr5_el1", DBGBVR5_EL1},
+  {"dbgbvr6_el1", DBGBVR6_EL1},
+  {"dbgbvr7_el1", DBGBVR7_EL1},
+  {"dbgbvr8_el1", DBGBVR8_EL1},
+  {"dbgbvr9_el1", DBGBVR9_EL1},
+  {"dbgbvr10_el1", DBGBVR10_EL1},
+  {"dbgbvr11_el1", DBGBVR11_EL1},
+  {"dbgbvr12_el1", DBGBVR12_EL1},
+  {"dbgbvr13_el1", DBGBVR13_EL1},
+  {"dbgbvr14_el1", DBGBVR14_EL1},
+  {"dbgbvr15_el1", DBGBVR15_EL1},
+  {"dbgbcr0_el1", DBGBCR0_EL1},
+  {"dbgbcr1_el1", DBGBCR1_EL1},
+  {"dbgbcr2_el1", DBGBCR2_EL1},
+  {"dbgbcr3_el1", DBGBCR3_EL1},
+  {"dbgbcr4_el1", DBGBCR4_EL1},
+  {"dbgbcr5_el1", DBGBCR5_EL1},
+  {"dbgbcr6_el1", DBGBCR6_EL1},
+  {"dbgbcr7_el1", DBGBCR7_EL1},
+  {"dbgbcr8_el1", DBGBCR8_EL1},
+  {"dbgbcr9_el1", DBGBCR9_EL1},
+  {"dbgbcr10_el1", DBGBCR10_EL1},
+  {"dbgbcr11_el1", DBGBCR11_EL1},
+  {"dbgbcr12_el1", DBGBCR12_EL1},
+  {"dbgbcr13_el1", DBGBCR13_EL1},
+  {"dbgbcr14_el1", DBGBCR14_EL1},
+  {"dbgbcr15_el1", DBGBCR15_EL1},
+  {"dbgwvr0_el1", DBGWVR0_EL1},
+  {"dbgwvr1_el1", DBGWVR1_EL1},
+  {"dbgwvr2_el1", DBGWVR2_EL1},
+  {"dbgwvr3_el1", DBGWVR3_EL1},
+  {"dbgwvr4_el1", DBGWVR4_EL1},
+  {"dbgwvr5_el1", DBGWVR5_EL1},
+  {"dbgwvr6_el1", DBGWVR6_EL1},
+  {"dbgwvr7_el1", DBGWVR7_EL1},
+  {"dbgwvr8_el1", DBGWVR8_EL1},
+  {"dbgwvr9_el1", DBGWVR9_EL1},
+  {"dbgwvr10_el1", DBGWVR10_EL1},
+  {"dbgwvr11_el1", DBGWVR11_EL1},
+  {"dbgwvr12_el1", DBGWVR12_EL1},
+  {"dbgwvr13_el1", DBGWVR13_EL1},
+  {"dbgwvr14_el1", DBGWVR14_EL1},
+  {"dbgwvr15_el1", DBGWVR15_EL1},
+  {"dbgwcr0_el1", DBGWCR0_EL1},
+  {"dbgwcr1_el1", DBGWCR1_EL1},
+  {"dbgwcr2_el1", DBGWCR2_EL1},
+  {"dbgwcr3_el1", DBGWCR3_EL1},
+  {"dbgwcr4_el1", DBGWCR4_EL1},
+  {"dbgwcr5_el1", DBGWCR5_EL1},
+  {"dbgwcr6_el1", DBGWCR6_EL1},
+  {"dbgwcr7_el1", DBGWCR7_EL1},
+  {"dbgwcr8_el1", DBGWCR8_EL1},
+  {"dbgwcr9_el1", DBGWCR9_EL1},
+  {"dbgwcr10_el1", DBGWCR10_EL1},
+  {"dbgwcr11_el1", DBGWCR11_EL1},
+  {"dbgwcr12_el1", DBGWCR12_EL1},
+  {"dbgwcr13_el1", DBGWCR13_EL1},
+  {"dbgwcr14_el1", DBGWCR14_EL1},
+  {"dbgwcr15_el1", DBGWCR15_EL1},
+  {"teehbr32_el1", TEEHBR32_EL1},
+  {"osdlr_el1", OSDLR_EL1},
+  {"dbgprcr_el1", DBGPRCR_EL1},
+  {"dbgclaimset_el1", DBGCLAIMSET_EL1},
+  {"dbgclaimclr_el1", DBGCLAIMCLR_EL1},
+  {"csselr_el1", CSSELR_EL1},
+  {"vpidr_el2", VPIDR_EL2},
+  {"vmpidr_el2", VMPIDR_EL2},
+  {"sctlr_el1", SCTLR_EL1},
+  {"sctlr_el2", SCTLR_EL2},
+  {"sctlr_el3", SCTLR_EL3},
+  {"actlr_el1", ACTLR_EL1},
+  {"actlr_el2", ACTLR_EL2},
+  {"actlr_el3", ACTLR_EL3},
+  {"cpacr_el1", CPACR_EL1},
+  {"hcr_el2", HCR_EL2},
+  {"scr_el3", SCR_EL3},
+  {"mdcr_el2", MDCR_EL2},
+  {"sder32_el3", SDER32_EL3},
+  {"cptr_el2", CPTR_EL2},
+  {"cptr_el3", CPTR_EL3},
+  {"hstr_el2", HSTR_EL2},
+  {"hacr_el2", HACR_EL2},
+  {"mdcr_el3", MDCR_EL3},
+  {"ttbr0_el1", TTBR0_EL1},
+  {"ttbr0_el2", TTBR0_EL2},
+  {"ttbr0_el3", TTBR0_EL3},
+  {"ttbr1_el1", TTBR1_EL1},
+  {"tcr_el1", TCR_EL1},
+  {"tcr_el2", TCR_EL2},
+  {"tcr_el3", TCR_EL3},
+  {"vttbr_el2", VTTBR_EL2},
+  {"vtcr_el2", VTCR_EL2},
+  {"dacr32_el2", DACR32_EL2},
+  {"spsr_el1", SPSR_EL1},
+  {"spsr_el2", SPSR_EL2},
+  {"spsr_el3", SPSR_EL3},
+  {"elr_el1", ELR_EL1},
+  {"elr_el2", ELR_EL2},
+  {"elr_el3", ELR_EL3},
+  {"sp_el0", SP_EL0},
+  {"sp_el1", SP_EL1},
+  {"sp_el2", SP_EL2},
+  {"spsel", SPSel},
+  {"nzcv", NZCV},
+  {"daif", DAIF},
+  {"currentel", CurrentEL},
+  {"spsr_irq", SPSR_irq},
+  {"spsr_abt", SPSR_abt},
+  {"spsr_und", SPSR_und},
+  {"spsr_fiq", SPSR_fiq},
+  {"fpcr", FPCR},
+  {"fpsr", FPSR},
+  {"dspsr_el0", DSPSR_EL0},
+  {"dlr_el0", DLR_EL0},
+  {"ifsr32_el2", IFSR32_EL2},
+  {"afsr0_el1", AFSR0_EL1},
+  {"afsr0_el2", AFSR0_EL2},
+  {"afsr0_el3", AFSR0_EL3},
+  {"afsr1_el1", AFSR1_EL1},
+  {"afsr1_el2", AFSR1_EL2},
+  {"afsr1_el3", AFSR1_EL3},
+  {"esr_el1", ESR_EL1},
+  {"esr_el2", ESR_EL2},
+  {"esr_el3", ESR_EL3},
+  {"fpexc32_el2", FPEXC32_EL2},
+  {"far_el1", FAR_EL1},
+  {"far_el2", FAR_EL2},
+  {"far_el3", FAR_EL3},
+  {"hpfar_el2", HPFAR_EL2},
+  {"par_el1", PAR_EL1},
+  {"pmcr_el0", PMCR_EL0},
+  {"pmcntenset_el0", PMCNTENSET_EL0},
+  {"pmcntenclr_el0", PMCNTENCLR_EL0},
+  {"pmovsclr_el0", PMOVSCLR_EL0},
+  {"pmselr_el0", PMSELR_EL0},
+  {"pmccntr_el0", PMCCNTR_EL0},
+  {"pmxevtyper_el0", PMXEVTYPER_EL0},
+  {"pmxevcntr_el0", PMXEVCNTR_EL0},
+  {"pmuserenr_el0", PMUSERENR_EL0},
+  {"pmintenset_el1", PMINTENSET_EL1},
+  {"pmintenclr_el1", PMINTENCLR_EL1},
+  {"pmovsset_el0", PMOVSSET_EL0},
+  {"mair_el1", MAIR_EL1},
+  {"mair_el2", MAIR_EL2},
+  {"mair_el3", MAIR_EL3},
+  {"amair_el1", AMAIR_EL1},
+  {"amair_el2", AMAIR_EL2},
+  {"amair_el3", AMAIR_EL3},
+  {"vbar_el1", VBAR_EL1},
+  {"vbar_el2", VBAR_EL2},
+  {"vbar_el3", VBAR_EL3},
+  {"rmr_el1", RMR_EL1},
+  {"rmr_el2", RMR_EL2},
+  {"rmr_el3", RMR_EL3},
+  {"contextidr_el1", CONTEXTIDR_EL1},
+  {"tpidr_el0", TPIDR_EL0},
+  {"tpidr_el2", TPIDR_EL2},
+  {"tpidr_el3", TPIDR_EL3},
+  {"tpidrro_el0", TPIDRRO_EL0},
+  {"tpidr_el1", TPIDR_EL1},
+  {"cntfrq_el0", CNTFRQ_EL0},
+  {"cntvoff_el2", CNTVOFF_EL2},
+  {"cntkctl_el1", CNTKCTL_EL1},
+  {"cnthctl_el2", CNTHCTL_EL2},
+  {"cntp_tval_el0", CNTP_TVAL_EL0},
+  {"cnthp_tval_el2", CNTHP_TVAL_EL2},
+  {"cntps_tval_el1", CNTPS_TVAL_EL1},
+  {"cntp_ctl_el0", CNTP_CTL_EL0},
+  {"cnthp_ctl_el2", CNTHP_CTL_EL2},
+  {"cntps_ctl_el1", CNTPS_CTL_EL1},
+  {"cntp_cval_el0", CNTP_CVAL_EL0},
+  {"cnthp_cval_el2", CNTHP_CVAL_EL2},
+  {"cntps_cval_el1", CNTPS_CVAL_EL1},
+  {"cntv_tval_el0", CNTV_TVAL_EL0},
+  {"cntv_ctl_el0", CNTV_CTL_EL0},
+  {"cntv_cval_el0", CNTV_CVAL_EL0},
+  {"pmevcntr0_el0", PMEVCNTR0_EL0},
+  {"pmevcntr1_el0", PMEVCNTR1_EL0},
+  {"pmevcntr2_el0", PMEVCNTR2_EL0},
+  {"pmevcntr3_el0", PMEVCNTR3_EL0},
+  {"pmevcntr4_el0", PMEVCNTR4_EL0},
+  {"pmevcntr5_el0", PMEVCNTR5_EL0},
+  {"pmevcntr6_el0", PMEVCNTR6_EL0},
+  {"pmevcntr7_el0", PMEVCNTR7_EL0},
+  {"pmevcntr8_el0", PMEVCNTR8_EL0},
+  {"pmevcntr9_el0", PMEVCNTR9_EL0},
+  {"pmevcntr10_el0", PMEVCNTR10_EL0},
+  {"pmevcntr11_el0", PMEVCNTR11_EL0},
+  {"pmevcntr12_el0", PMEVCNTR12_EL0},
+  {"pmevcntr13_el0", PMEVCNTR13_EL0},
+  {"pmevcntr14_el0", PMEVCNTR14_EL0},
+  {"pmevcntr15_el0", PMEVCNTR15_EL0},
+  {"pmevcntr16_el0", PMEVCNTR16_EL0},
+  {"pmevcntr17_el0", PMEVCNTR17_EL0},
+  {"pmevcntr18_el0", PMEVCNTR18_EL0},
+  {"pmevcntr19_el0", PMEVCNTR19_EL0},
+  {"pmevcntr20_el0", PMEVCNTR20_EL0},
+  {"pmevcntr21_el0", PMEVCNTR21_EL0},
+  {"pmevcntr22_el0", PMEVCNTR22_EL0},
+  {"pmevcntr23_el0", PMEVCNTR23_EL0},
+  {"pmevcntr24_el0", PMEVCNTR24_EL0},
+  {"pmevcntr25_el0", PMEVCNTR25_EL0},
+  {"pmevcntr26_el0", PMEVCNTR26_EL0},
+  {"pmevcntr27_el0", PMEVCNTR27_EL0},
+  {"pmevcntr28_el0", PMEVCNTR28_EL0},
+  {"pmevcntr29_el0", PMEVCNTR29_EL0},
+  {"pmevcntr30_el0", PMEVCNTR30_EL0},
+  {"pmccfiltr_el0", PMCCFILTR_EL0},
+  {"pmevtyper0_el0", PMEVTYPER0_EL0},
+  {"pmevtyper1_el0", PMEVTYPER1_EL0},
+  {"pmevtyper2_el0", PMEVTYPER2_EL0},
+  {"pmevtyper3_el0", PMEVTYPER3_EL0},
+  {"pmevtyper4_el0", PMEVTYPER4_EL0},
+  {"pmevtyper5_el0", PMEVTYPER5_EL0},
+  {"pmevtyper6_el0", PMEVTYPER6_EL0},
+  {"pmevtyper7_el0", PMEVTYPER7_EL0},
+  {"pmevtyper8_el0", PMEVTYPER8_EL0},
+  {"pmevtyper9_el0", PMEVTYPER9_EL0},
+  {"pmevtyper10_el0", PMEVTYPER10_EL0},
+  {"pmevtyper11_el0", PMEVTYPER11_EL0},
+  {"pmevtyper12_el0", PMEVTYPER12_EL0},
+  {"pmevtyper13_el0", PMEVTYPER13_EL0},
+  {"pmevtyper14_el0", PMEVTYPER14_EL0},
+  {"pmevtyper15_el0", PMEVTYPER15_EL0},
+  {"pmevtyper16_el0", PMEVTYPER16_EL0},
+  {"pmevtyper17_el0", PMEVTYPER17_EL0},
+  {"pmevtyper18_el0", PMEVTYPER18_EL0},
+  {"pmevtyper19_el0", PMEVTYPER19_EL0},
+  {"pmevtyper20_el0", PMEVTYPER20_EL0},
+  {"pmevtyper21_el0", PMEVTYPER21_EL0},
+  {"pmevtyper22_el0", PMEVTYPER22_EL0},
+  {"pmevtyper23_el0", PMEVTYPER23_EL0},
+  {"pmevtyper24_el0", PMEVTYPER24_EL0},
+  {"pmevtyper25_el0", PMEVTYPER25_EL0},
+  {"pmevtyper26_el0", PMEVTYPER26_EL0},
+  {"pmevtyper27_el0", PMEVTYPER27_EL0},
+  {"pmevtyper28_el0", PMEVTYPER28_EL0},
+  {"pmevtyper29_el0", PMEVTYPER29_EL0},
+  {"pmevtyper30_el0", PMEVTYPER30_EL0},
+
+  // Trace registers
+  {"trcprgctlr", TRCPRGCTLR},
+  {"trcprocselr", TRCPROCSELR},
+  {"trcconfigr", TRCCONFIGR},
+  {"trcauxctlr", TRCAUXCTLR},
+  {"trceventctl0r", TRCEVENTCTL0R},
+  {"trceventctl1r", TRCEVENTCTL1R},
+  {"trcstallctlr", TRCSTALLCTLR},
+  {"trctsctlr", TRCTSCTLR},
+  {"trcsyncpr", TRCSYNCPR},
+  {"trcccctlr", TRCCCCTLR},
+  {"trcbbctlr", TRCBBCTLR},
+  {"trctraceidr", TRCTRACEIDR},
+  {"trcqctlr", TRCQCTLR},
+  {"trcvictlr", TRCVICTLR},
+  {"trcviiectlr", TRCVIIECTLR},
+  {"trcvissctlr", TRCVISSCTLR},
+  {"trcvipcssctlr", TRCVIPCSSCTLR},
+  {"trcvdctlr", TRCVDCTLR},
+  {"trcvdsacctlr", TRCVDSACCTLR},
+  {"trcvdarcctlr", TRCVDARCCTLR},
+  {"trcseqevr0", TRCSEQEVR0},
+  {"trcseqevr1", TRCSEQEVR1},
+  {"trcseqevr2", TRCSEQEVR2},
+  {"trcseqrstevr", TRCSEQRSTEVR},
+  {"trcseqstr", TRCSEQSTR},
+  {"trcextinselr", TRCEXTINSELR},
+  {"trccntrldvr0", TRCCNTRLDVR0},
+  {"trccntrldvr1", TRCCNTRLDVR1},
+  {"trccntrldvr2", TRCCNTRLDVR2},
+  {"trccntrldvr3", TRCCNTRLDVR3},
+  {"trccntctlr0", TRCCNTCTLR0},
+  {"trccntctlr1", TRCCNTCTLR1},
+  {"trccntctlr2", TRCCNTCTLR2},
+  {"trccntctlr3", TRCCNTCTLR3},
+  {"trccntvr0", TRCCNTVR0},
+  {"trccntvr1", TRCCNTVR1},
+  {"trccntvr2", TRCCNTVR2},
+  {"trccntvr3", TRCCNTVR3},
+  {"trcimspec0", TRCIMSPEC0},
+  {"trcimspec1", TRCIMSPEC1},
+  {"trcimspec2", TRCIMSPEC2},
+  {"trcimspec3", TRCIMSPEC3},
+  {"trcimspec4", TRCIMSPEC4},
+  {"trcimspec5", TRCIMSPEC5},
+  {"trcimspec6", TRCIMSPEC6},
+  {"trcimspec7", TRCIMSPEC7},
+  {"trcrsctlr2", TRCRSCTLR2},
+  {"trcrsctlr3", TRCRSCTLR3},
+  {"trcrsctlr4", TRCRSCTLR4},
+  {"trcrsctlr5", TRCRSCTLR5},
+  {"trcrsctlr6", TRCRSCTLR6},
+  {"trcrsctlr7", TRCRSCTLR7},
+  {"trcrsctlr8", TRCRSCTLR8},
+  {"trcrsctlr9", TRCRSCTLR9},
+  {"trcrsctlr10", TRCRSCTLR10},
+  {"trcrsctlr11", TRCRSCTLR11},
+  {"trcrsctlr12", TRCRSCTLR12},
+  {"trcrsctlr13", TRCRSCTLR13},
+  {"trcrsctlr14", TRCRSCTLR14},
+  {"trcrsctlr15", TRCRSCTLR15},
+  {"trcrsctlr16", TRCRSCTLR16},
+  {"trcrsctlr17", TRCRSCTLR17},
+  {"trcrsctlr18", TRCRSCTLR18},
+  {"trcrsctlr19", TRCRSCTLR19},
+  {"trcrsctlr20", TRCRSCTLR20},
+  {"trcrsctlr21", TRCRSCTLR21},
+  {"trcrsctlr22", TRCRSCTLR22},
+  {"trcrsctlr23", TRCRSCTLR23},
+  {"trcrsctlr24", TRCRSCTLR24},
+  {"trcrsctlr25", TRCRSCTLR25},
+  {"trcrsctlr26", TRCRSCTLR26},
+  {"trcrsctlr27", TRCRSCTLR27},
+  {"trcrsctlr28", TRCRSCTLR28},
+  {"trcrsctlr29", TRCRSCTLR29},
+  {"trcrsctlr30", TRCRSCTLR30},
+  {"trcrsctlr31", TRCRSCTLR31},
+  {"trcssccr0", TRCSSCCR0},
+  {"trcssccr1", TRCSSCCR1},
+  {"trcssccr2", TRCSSCCR2},
+  {"trcssccr3", TRCSSCCR3},
+  {"trcssccr4", TRCSSCCR4},
+  {"trcssccr5", TRCSSCCR5},
+  {"trcssccr6", TRCSSCCR6},
+  {"trcssccr7", TRCSSCCR7},
+  {"trcsscsr0", TRCSSCSR0},
+  {"trcsscsr1", TRCSSCSR1},
+  {"trcsscsr2", TRCSSCSR2},
+  {"trcsscsr3", TRCSSCSR3},
+  {"trcsscsr4", TRCSSCSR4},
+  {"trcsscsr5", TRCSSCSR5},
+  {"trcsscsr6", TRCSSCSR6},
+  {"trcsscsr7", TRCSSCSR7},
+  {"trcsspcicr0", TRCSSPCICR0},
+  {"trcsspcicr1", TRCSSPCICR1},
+  {"trcsspcicr2", TRCSSPCICR2},
+  {"trcsspcicr3", TRCSSPCICR3},
+  {"trcsspcicr4", TRCSSPCICR4},
+  {"trcsspcicr5", TRCSSPCICR5},
+  {"trcsspcicr6", TRCSSPCICR6},
+  {"trcsspcicr7", TRCSSPCICR7},
+  {"trcpdcr", TRCPDCR},
+  {"trcacvr0", TRCACVR0},
+  {"trcacvr1", TRCACVR1},
+  {"trcacvr2", TRCACVR2},
+  {"trcacvr3", TRCACVR3},
+  {"trcacvr4", TRCACVR4},
+  {"trcacvr5", TRCACVR5},
+  {"trcacvr6", TRCACVR6},
+  {"trcacvr7", TRCACVR7},
+  {"trcacvr8", TRCACVR8},
+  {"trcacvr9", TRCACVR9},
+  {"trcacvr10", TRCACVR10},
+  {"trcacvr11", TRCACVR11},
+  {"trcacvr12", TRCACVR12},
+  {"trcacvr13", TRCACVR13},
+  {"trcacvr14", TRCACVR14},
+  {"trcacvr15", TRCACVR15},
+  {"trcacatr0", TRCACATR0},
+  {"trcacatr1", TRCACATR1},
+  {"trcacatr2", TRCACATR2},
+  {"trcacatr3", TRCACATR3},
+  {"trcacatr4", TRCACATR4},
+  {"trcacatr5", TRCACATR5},
+  {"trcacatr6", TRCACATR6},
+  {"trcacatr7", TRCACATR7},
+  {"trcacatr8", TRCACATR8},
+  {"trcacatr9", TRCACATR9},
+  {"trcacatr10", TRCACATR10},
+  {"trcacatr11", TRCACATR11},
+  {"trcacatr12", TRCACATR12},
+  {"trcacatr13", TRCACATR13},
+  {"trcacatr14", TRCACATR14},
+  {"trcacatr15", TRCACATR15},
+  {"trcdvcvr0", TRCDVCVR0},
+  {"trcdvcvr1", TRCDVCVR1},
+  {"trcdvcvr2", TRCDVCVR2},
+  {"trcdvcvr3", TRCDVCVR3},
+  {"trcdvcvr4", TRCDVCVR4},
+  {"trcdvcvr5", TRCDVCVR5},
+  {"trcdvcvr6", TRCDVCVR6},
+  {"trcdvcvr7", TRCDVCVR7},
+  {"trcdvcmr0", TRCDVCMR0},
+  {"trcdvcmr1", TRCDVCMR1},
+  {"trcdvcmr2", TRCDVCMR2},
+  {"trcdvcmr3", TRCDVCMR3},
+  {"trcdvcmr4", TRCDVCMR4},
+  {"trcdvcmr5", TRCDVCMR5},
+  {"trcdvcmr6", TRCDVCMR6},
+  {"trcdvcmr7", TRCDVCMR7},
+  {"trccidcvr0", TRCCIDCVR0},
+  {"trccidcvr1", TRCCIDCVR1},
+  {"trccidcvr2", TRCCIDCVR2},
+  {"trccidcvr3", TRCCIDCVR3},
+  {"trccidcvr4", TRCCIDCVR4},
+  {"trccidcvr5", TRCCIDCVR5},
+  {"trccidcvr6", TRCCIDCVR6},
+  {"trccidcvr7", TRCCIDCVR7},
+  {"trcvmidcvr0", TRCVMIDCVR0},
+  {"trcvmidcvr1", TRCVMIDCVR1},
+  {"trcvmidcvr2", TRCVMIDCVR2},
+  {"trcvmidcvr3", TRCVMIDCVR3},
+  {"trcvmidcvr4", TRCVMIDCVR4},
+  {"trcvmidcvr5", TRCVMIDCVR5},
+  {"trcvmidcvr6", TRCVMIDCVR6},
+  {"trcvmidcvr7", TRCVMIDCVR7},
+  {"trccidcctlr0", TRCCIDCCTLR0},
+  {"trccidcctlr1", TRCCIDCCTLR1},
+  {"trcvmidcctlr0", TRCVMIDCCTLR0},
+  {"trcvmidcctlr1", TRCVMIDCCTLR1},
+  {"trcitctrl", TRCITCTRL},
+  {"trcclaimset", TRCCLAIMSET},
+  {"trcclaimclr", TRCCLAIMCLR},
+
+  // GICv3 registers
+  {"icc_bpr1_el1", ICC_BPR1_EL1},
+  {"icc_bpr0_el1", ICC_BPR0_EL1},
+  {"icc_pmr_el1", ICC_PMR_EL1},
+  {"icc_ctlr_el1", ICC_CTLR_EL1},
+  {"icc_ctlr_el3", ICC_CTLR_EL3},
+  {"icc_sre_el1", ICC_SRE_EL1},
+  {"icc_sre_el2", ICC_SRE_EL2},
+  {"icc_sre_el3", ICC_SRE_EL3},
+  {"icc_igrpen0_el1", ICC_IGRPEN0_EL1},
+  {"icc_igrpen1_el1", ICC_IGRPEN1_EL1},
+  {"icc_igrpen1_el3", ICC_IGRPEN1_EL3},
+  {"icc_seien_el1", ICC_SEIEN_EL1},
+  {"icc_ap0r0_el1", ICC_AP0R0_EL1},
+  {"icc_ap0r1_el1", ICC_AP0R1_EL1},
+  {"icc_ap0r2_el1", ICC_AP0R2_EL1},
+  {"icc_ap0r3_el1", ICC_AP0R3_EL1},
+  {"icc_ap1r0_el1", ICC_AP1R0_EL1},
+  {"icc_ap1r1_el1", ICC_AP1R1_EL1},
+  {"icc_ap1r2_el1", ICC_AP1R2_EL1},
+  {"icc_ap1r3_el1", ICC_AP1R3_EL1},
+  {"ich_ap0r0_el2", ICH_AP0R0_EL2},
+  {"ich_ap0r1_el2", ICH_AP0R1_EL2},
+  {"ich_ap0r2_el2", ICH_AP0R2_EL2},
+  {"ich_ap0r3_el2", ICH_AP0R3_EL2},
+  {"ich_ap1r0_el2", ICH_AP1R0_EL2},
+  {"ich_ap1r1_el2", ICH_AP1R1_EL2},
+  {"ich_ap1r2_el2", ICH_AP1R2_EL2},
+  {"ich_ap1r3_el2", ICH_AP1R3_EL2},
+  {"ich_hcr_el2", ICH_HCR_EL2},
+  {"ich_misr_el2", ICH_MISR_EL2},
+  {"ich_vmcr_el2", ICH_VMCR_EL2},
+  {"ich_vseir_el2", ICH_VSEIR_EL2},
+  {"ich_lr0_el2", ICH_LR0_EL2},
+  {"ich_lr1_el2", ICH_LR1_EL2},
+  {"ich_lr2_el2", ICH_LR2_EL2},
+  {"ich_lr3_el2", ICH_LR3_EL2},
+  {"ich_lr4_el2", ICH_LR4_EL2},
+  {"ich_lr5_el2", ICH_LR5_EL2},
+  {"ich_lr6_el2", ICH_LR6_EL2},
+  {"ich_lr7_el2", ICH_LR7_EL2},
+  {"ich_lr8_el2", ICH_LR8_EL2},
+  {"ich_lr9_el2", ICH_LR9_EL2},
+  {"ich_lr10_el2", ICH_LR10_EL2},
+  {"ich_lr11_el2", ICH_LR11_EL2},
+  {"ich_lr12_el2", ICH_LR12_EL2},
+  {"ich_lr13_el2", ICH_LR13_EL2},
+  {"ich_lr14_el2", ICH_LR14_EL2},
+  {"ich_lr15_el2", ICH_LR15_EL2}
+};
+
+uint32_t
+A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
+  // First search the registers shared by all
+  std::string NameLower = Name.lower();
+  for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
+    if (SysRegPairs[i].Name == NameLower) {
+      Valid = true;
+      return SysRegPairs[i].Value;
+    }
+  }
+
+  // Now try the instruction-specific registers (either read-only or
+  // write-only).
+  for (unsigned i = 0; i < NumInstPairs; ++i) {
+    if (InstPairs[i].Name == NameLower) {
+      Valid = true;
+      return InstPairs[i].Value;
+    }
+  }
+
+  // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name, where the bits
+  // are: 11 xxx 1x11 xxxx xxx
+  Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$");
+
+  SmallVector<StringRef, 4> Ops;
+  if (!GenericRegPattern.match(NameLower, &Ops)) {
+    Valid = false;
+    return -1;
+  }
+
+  uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
+  uint32_t Bits;
+  Ops[1].getAsInteger(10, Op1);
+  Ops[2].getAsInteger(10, CRn);
+  Ops[3].getAsInteger(10, CRm);
+  Ops[4].getAsInteger(10, Op2);
+  Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
+
+  Valid = true;
+  return Bits;
+}
+
+std::string
+A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+  for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
+    if (SysRegPairs[i].Value == Bits) {
+      Valid = true;
+      return SysRegPairs[i].Name;
+    }
+  }
+
+  for (unsigned i = 0; i < NumInstPairs; ++i) {
+    if (InstPairs[i].Value == Bits) {
+      Valid = true;
+      return InstPairs[i].Name;
+    }
+  }
+
+  uint32_t Op0 = (Bits >> 14) & 0x3;
+  uint32_t Op1 = (Bits >> 11) & 0x7;
+  uint32_t CRn = (Bits >> 7) & 0xf;
+  uint32_t CRm = (Bits >> 3) & 0xf;
+  uint32_t Op2 = Bits & 0x7;
+
+  // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic
+  // name.
+  if (Op0 != 3 || (CRn != 11 && CRn != 15)) {
+      Valid = false;
+      return "";
+  }
+
+  assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg");
+
+  Valid = true;
+  return "s3_" + utostr(Op1) + "_c" + utostr(CRn)
+               + "_c" + utostr(CRm) + "_" + utostr(Op2);
+}
+
+const NamedImmMapper::Mapping A64TLBI::TLBIMapper::TLBIPairs[] = {
+  {"ipas2e1is", IPAS2E1IS},
+  {"ipas2le1is", IPAS2LE1IS},
+  {"vmalle1is", VMALLE1IS},
+  {"alle2is", ALLE2IS},
+  {"alle3is", ALLE3IS},
+  {"vae1is", VAE1IS},
+  {"vae2is", VAE2IS},
+  {"vae3is", VAE3IS},
+  {"aside1is", ASIDE1IS},
+  {"vaae1is", VAAE1IS},
+  {"alle1is", ALLE1IS},
+  {"vale1is", VALE1IS},
+  {"vale2is", VALE2IS},
+  {"vale3is", VALE3IS},
+  {"vmalls12e1is", VMALLS12E1IS},
+  {"vaale1is", VAALE1IS},
+  {"ipas2e1", IPAS2E1},
+  {"ipas2le1", IPAS2LE1},
+  {"vmalle1", VMALLE1},
+  {"alle2", ALLE2},
+  {"alle3", ALLE3},
+  {"vae1", VAE1},
+  {"vae2", VAE2},
+  {"vae3", VAE3},
+  {"aside1", ASIDE1},
+  {"vaae1", VAAE1},
+  {"alle1", ALLE1},
+  {"vale1", VALE1},
+  {"vale2", VALE2},
+  {"vale3", VALE3},
+  {"vmalls12e1", VMALLS12E1},
+  {"vaale1", VAALE1}
+};
+
+A64TLBI::TLBIMapper::TLBIMapper()
+  : NamedImmMapper(TLBIPairs, 0) {}
+
+bool A64Imms::isFPImm(const APFloat &Val, uint32_t &Imm8Bits) {
+  const fltSemantics &Sem = Val.getSemantics();
+  unsigned FracBits = APFloat::semanticsPrecision(Sem) - 1;
+
+  uint32_t ExpMask;
+  switch (FracBits) {
+  case 10: // IEEE half-precision
+    ExpMask = 0x1f;
+    break;
+  case 23: // IEEE single-precision
+    ExpMask = 0xff;
+    break;
+  case 52: // IEEE double-precision
+    ExpMask = 0x7ff;
+    break;
+  case 112: // IEEE quad-precision
+    // No immediates are valid for double precision.
+    return false;
+  default:
+    llvm_unreachable("Only half, single and double precision supported");
+  }
+
+  uint32_t ExpStart = FracBits;
+  uint64_t FracMask = (1ULL << FracBits) - 1;
+
+  uint32_t Sign = Val.isNegative();
+
+  uint64_t Bits= Val.bitcastToAPInt().getLimitedValue();
+  uint64_t Fraction = Bits & FracMask;
+  int32_t Exponent = ((Bits >> ExpStart) & ExpMask);
+  Exponent -= ExpMask >> 1;
+
+  // S[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 5):imm8<5:0>:Zeros(19)
+  // D[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 8):imm8<5:0>:Zeros(48)
+  // This translates to: only 4 bits of fraction; -3 <= exp <= 4.
+  uint64_t A64FracStart = FracBits - 4;
+  uint64_t A64FracMask = 0xf;
+
+  // Are there too many fraction bits?
+  if (Fraction & ~(A64FracMask << A64FracStart))
+    return false;
+
+  if (Exponent < -3 || Exponent > 4)
+    return false;
+
+  uint32_t PackedFraction = (Fraction >> A64FracStart) & A64FracMask;
+  uint32_t PackedExp = (Exponent + 7) & 0x7;
+
+  Imm8Bits = (Sign << 7) | (PackedExp << 4) | PackedFraction;
+  return true;
+}
+
+// Encoding of the immediate for logical (immediate) instructions:
+//
+// | N | imms   | immr   | size | R            | S            |
+// |---+--------+--------+------+--------------+--------------|
+// | 1 | ssssss | rrrrrr |   64 | UInt(rrrrrr) | UInt(ssssss) |
+// | 0 | 0sssss | xrrrrr |   32 | UInt(rrrrr)  | UInt(sssss)  |
+// | 0 | 10ssss | xxrrrr |   16 | UInt(rrrr)   | UInt(ssss)   |
+// | 0 | 110sss | xxxrrr |    8 | UInt(rrr)    | UInt(sss)    |
+// | 0 | 1110ss | xxxxrr |    4 | UInt(rr)     | UInt(ss)     |
+// | 0 | 11110s | xxxxxr |    2 | UInt(r)      | UInt(s)      |
+// | 0 | 11111x | -      |      | UNALLOCATED  |              |
+//
+// Columns 'R', 'S' and 'size' specify a "bitmask immediate" of size bits in
+// which the lower S+1 bits are ones and the remaining bits are zero, then
+// rotated right by R bits, which is then replicated across the datapath.
+//
+// + Values of 'N', 'imms' and 'immr' which do not match the above table are
+//   RESERVED.
+// + If all 's' bits in the imms field are set then the instruction is
+//   RESERVED.
+// + The 'x' bits in the 'immr' field are IGNORED.
+
+bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) {
+  int RepeatWidth;
+  int Rotation = 0;
+  int Num1s = 0;
+
+  // Because there are S+1 ones in the replicated mask, an immediate of all
+  // zeros is not allowed. Filtering it here is probably more efficient.
+  if (Imm == 0) return false;
+
+  for (RepeatWidth = RegWidth; RepeatWidth > 1; RepeatWidth /= 2) {
+    uint64_t RepeatMask = RepeatWidth == 64 ? -1 : (1ULL << RepeatWidth) - 1;
+    uint64_t ReplicatedMask = Imm & RepeatMask;
+
+    if (ReplicatedMask == 0) continue;
+
+    // First we have to make sure the mask is actually repeated in each slot for
+    // this width-specifier.
+    bool IsReplicatedMask = true;
+    for (unsigned i = RepeatWidth; i < RegWidth; i += RepeatWidth) {
+      if (((Imm >> i) & RepeatMask) != ReplicatedMask) {
+        IsReplicatedMask = false;
+        break;
+      }
+    }
+    if (!IsReplicatedMask) continue;
+
+    // Now we have to work out the amount of rotation needed. The first part of
+    // this calculation is actually independent of RepeatWidth, but the complex
+    // case will depend on it.
+    Rotation = CountTrailingZeros_64(Imm);
+    if (Rotation == 0) {
+      // There were no leading zeros, which means it's either in place or there
+      // are 1s at each end (e.g. 0x8003 needs rotating).
+      Rotation = RegWidth == 64 ? CountLeadingOnes_64(Imm)
+                                : CountLeadingOnes_32(Imm);
+      Rotation = RepeatWidth - Rotation;
+    }
+
+    uint64_t ReplicatedOnes = (ReplicatedMask >> Rotation)
+      | ((ReplicatedMask << (RepeatWidth - Rotation)) & RepeatMask);
+    // Of course, they may not actually be ones, so we have to check that:
+    if (!isMask_64(ReplicatedOnes))
+      continue;
+
+    Num1s = CountTrailingOnes_64(ReplicatedOnes);
+
+    // We know we've got an almost valid encoding (certainly, if this is invalid
+    // no other parameters would work).
+    break;
+  }
+
+  // The encodings which would produce all 1s are RESERVED.
+  if (RepeatWidth == 1 || Num1s == RepeatWidth) return false;
+
+  uint32_t N = RepeatWidth == 64;
+  uint32_t ImmR = RepeatWidth - Rotation;
+  uint32_t ImmS = Num1s - 1;
+
+  switch (RepeatWidth) {
+  default: break; // No action required for other valid rotations.
+  case 16: ImmS |= 0x20; break; // 10ssss
+  case 8: ImmS |= 0x30; break;  // 110sss
+  case 4: ImmS |= 0x38; break;  // 1110ss
+  case 2: ImmS |= 0x3c; break;  // 11110s
+  }
+
+  Bits = ImmS | (ImmR << 6) | (N << 12);
+
+  return true;
+}
+
+
+bool A64Imms::isLogicalImmBits(unsigned RegWidth, uint32_t Bits,
+                               uint64_t &Imm) {
+  uint32_t N = Bits >> 12;
+  uint32_t ImmR = (Bits >> 6) & 0x3f;
+  uint32_t ImmS = Bits & 0x3f;
+
+  // N=1 encodes a 64-bit replication and is invalid for the 32-bit
+  // instructions.
+  if (RegWidth == 32 && N != 0) return false;
+
+  int Width = 0;
+  if (N == 1)
+    Width = 64;
+  else if ((ImmS & 0x20) == 0)
+    Width = 32;
+  else if ((ImmS & 0x10) == 0)
+    Width = 16;
+  else if ((ImmS & 0x08) == 0)
+    Width = 8;
+  else if ((ImmS & 0x04) == 0)
+    Width = 4;
+  else if ((ImmS & 0x02) == 0)
+    Width = 2;
+  else {
+    // ImmS  is 0b11111x: UNALLOCATED
+    return false;
+  }
+
+  int Num1s = (ImmS & (Width - 1)) + 1;
+
+  // All encodings which would map to -1 (signed) are RESERVED.
+  if (Num1s == Width) return false;
+
+  int Rotation = (ImmR & (Width - 1));
+  uint64_t Mask = (1ULL << Num1s) - 1;
+  uint64_t WidthMask = Width == 64 ? -1 : (1ULL << Width) - 1;
+  Mask = (Mask >> Rotation)
+    | ((Mask << (Width - Rotation)) & WidthMask);
+
+  Imm = 0;
+  for (unsigned i = 0; i < RegWidth / Width; ++i) {
+    Imm |= Mask;
+    Mask <<= Width;
+  }
+
+  return true;
+}
+
+bool A64Imms::isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) {
+  // If high bits are set then a 32-bit MOVZ can't possibly work.
+  if (RegWidth == 32 && (Value & ~0xffffffffULL))
+    return false;
+
+  for (int i = 0; i < RegWidth; i += 16) {
+    // If the value is 0 when we mask out all the bits that could be set with
+    // the current LSL value then it's representable.
+    if ((Value & ~(0xffffULL << i)) == 0) {
+      Shift = i / 16;
+      UImm16 = (Value >> i) & 0xffff;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool A64Imms::isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) {
+  // MOVN is defined to set its register to NOT(LSL(imm16, shift)).
+
+  // We have to be a little careful about a 32-bit register: 0xffff_1234 *is*
+  // representable, but ~0xffff_1234 == 0xffff_ffff_0000_edcb which is not
+  // a valid input for isMOVZImm.
+  if (RegWidth == 32 && (Value & ~0xffffffffULL))
+    return false;
+
+  uint64_t MOVZEquivalent = RegWidth == 32 ? ~Value & 0xffffffff : ~Value;
+
+  return isMOVZImm(RegWidth, MOVZEquivalent, UImm16, Shift);
+}
+
+bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value,
+                            int &UImm16, int &Shift) {
+  if (isMOVZImm(RegWidth, Value, UImm16, Shift))
+    return false;
+
+  return isMOVNImm(RegWidth, Value, UImm16, Shift);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
new file mode 100644
index 000000000000..1b773d632ebe
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -0,0 +1,1068 @@
+//===-- AArch64BaseInfo.h - Top level definitions for AArch64- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the AArch64 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_AARCH64_BASEINFO_H
+#define LLVM_AARCH64_BASEINFO_H
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+// // Enums corresponding to AArch64 condition codes
+namespace A64CC {
+  // The CondCodes constants map directly to the 4-bit encoding of the
+  // condition field for predicated instructions.
+  enum CondCodes {   // Meaning (integer)          Meaning (floating-point)
+    EQ = 0,        // Equal                      Equal
+    NE,            // Not equal                  Not equal, or unordered
+    HS,            // Unsigned higher or same    >, ==, or unordered
+    LO,            // Unsigned lower or same     Less than
+    MI,            // Minus, negative            Less than
+    PL,            // Plus, positive or zero     >, ==, or unordered
+    VS,            // Overflow                   Unordered
+    VC,            // No overflow                Ordered
+    HI,            // Unsigned higher            Greater than, or unordered
+    LS,            // Unsigned lower or same     Less than or equal
+    GE,            // Greater than or equal      Greater than or equal
+    LT,            // Less than                  Less than, or unordered
+    GT,            // Signed greater than        Greater than
+    LE,            // Signed less than or equal  <, ==, or unordered
+    AL,            // Always (unconditional)     Always (unconditional)
+    NV,             // Always (unconditional)     Always (unconditional)
+    // Note the NV exists purely to disassemble 0b1111. Execution
+    // is "always".
+    Invalid
+  };
+
+} // namespace A64CC
+
+inline static const char *A64CondCodeToString(A64CC::CondCodes CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code");
+  case A64CC::EQ:  return "eq";
+  case A64CC::NE:  return "ne";
+  case A64CC::HS:  return "hs";
+  case A64CC::LO:  return "lo";
+  case A64CC::MI:  return "mi";
+  case A64CC::PL:  return "pl";
+  case A64CC::VS:  return "vs";
+  case A64CC::VC:  return "vc";
+  case A64CC::HI:  return "hi";
+  case A64CC::LS:  return "ls";
+  case A64CC::GE:  return "ge";
+  case A64CC::LT:  return "lt";
+  case A64CC::GT:  return "gt";
+  case A64CC::LE:  return "le";
+  case A64CC::AL:  return "al";
+  case A64CC::NV:  return "nv";
+  }
+}
+
+inline static A64CC::CondCodes A64StringToCondCode(StringRef CondStr) {
+  return StringSwitch<A64CC::CondCodes>(CondStr.lower())
+             .Case("eq", A64CC::EQ)
+             .Case("ne", A64CC::NE)
+             .Case("ne", A64CC::NE)
+             .Case("hs", A64CC::HS)
+             .Case("cs", A64CC::HS)
+             .Case("lo", A64CC::LO)
+             .Case("cc", A64CC::LO)
+             .Case("mi", A64CC::MI)
+             .Case("pl", A64CC::PL)
+             .Case("vs", A64CC::VS)
+             .Case("vc", A64CC::VC)
+             .Case("hi", A64CC::HI)
+             .Case("ls", A64CC::LS)
+             .Case("ge", A64CC::GE)
+             .Case("lt", A64CC::LT)
+             .Case("gt", A64CC::GT)
+             .Case("le", A64CC::LE)
+             .Case("al", A64CC::AL)
+             .Case("nv", A64CC::NV)
+             .Default(A64CC::Invalid);
+}
+
+inline static A64CC::CondCodes A64InvertCondCode(A64CC::CondCodes CC) {
+  // It turns out that the condition codes have been designed so that in order
+  // to reverse the intent of the condition you only have to invert the low bit:
+
+  return static_cast<A64CC::CondCodes>(static_cast<unsigned>(CC) ^ 0x1);
+}
+
+/// Instances of this class can perform bidirectional mapping from random
+/// identifier strings to operand encodings. For example "MSR" takes a named
+/// system-register which must be encoded somehow and decoded for printing. This
+/// central location means that the information for those transformations is not
+/// duplicated and remains in sync.
+///
+/// FIXME: currently the algorithm is a completely unoptimised linear
+/// search. Obviously this could be improved, but we would probably want to work
+/// out just how often these instructions are emitted before working on it. It
+/// might even be optimal to just reorder the tables for the common instructions
+/// rather than changing the algorithm.
+struct NamedImmMapper {
+  struct Mapping {
+    const char *Name;
+    uint32_t Value;
+  };
+
+  template<int N>
+  NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm)
+    : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {}
+
+  StringRef toString(uint32_t Value, bool &Valid) const;
+  uint32_t fromString(StringRef Name, bool &Valid) const;
+
+  /// Many of the instructions allow an alternative assembly form consisting of
+  /// a simple immediate. Currently the only valid forms are ranges [0, N) where
+  /// N being 0 indicates no immediate syntax-form is allowed.
+  bool validImm(uint32_t Value) const;
+protected:
+  const Mapping *Pairs;
+  size_t NumPairs;
+  uint32_t TooBigImm;
+};
+
+namespace A64AT {
+  enum ATValues {
+    Invalid = -1,    // Op0 Op1  CRn   CRm   Op2
+    S1E1R = 0x43c0,  // 01  000  0111  1000  000
+    S1E2R = 0x63c0,  // 01  100  0111  1000  000
+    S1E3R = 0x73c0,  // 01  110  0111  1000  000
+    S1E1W = 0x43c1,  // 01  000  0111  1000  001
+    S1E2W = 0x63c1,  // 01  100  0111  1000  001
+    S1E3W = 0x73c1,  // 01  110  0111  1000  001
+    S1E0R = 0x43c2,  // 01  000  0111  1000  010
+    S1E0W = 0x43c3,  // 01  000  0111  1000  011
+    S12E1R = 0x63c4, // 01  100  0111  1000  100
+    S12E1W = 0x63c5, // 01  100  0111  1000  101
+    S12E0R = 0x63c6, // 01  100  0111  1000  110
+    S12E0W = 0x63c7  // 01  100  0111  1000  111
+  };
+
+  struct ATMapper : NamedImmMapper {
+    const static Mapping ATPairs[];
+
+    ATMapper();
+  };
+
+}
+namespace A64DB {
+  enum DBValues {
+    Invalid = -1,
+    OSHLD = 0x1,
+    OSHST = 0x2,
+    OSH =   0x3,
+    NSHLD = 0x5,
+    NSHST = 0x6,
+    NSH =   0x7,
+    ISHLD = 0x9,
+    ISHST = 0xa,
+    ISH =   0xb,
+    LD =    0xd,
+    ST =    0xe,
+    SY =    0xf
+  };
+
+  struct DBarrierMapper : NamedImmMapper {
+    const static Mapping DBarrierPairs[];
+
+    DBarrierMapper();
+  };
+}
+
+namespace  A64DC {
+  enum DCValues {
+    Invalid = -1,   // Op1  CRn   CRm   Op2
+    ZVA   = 0x5ba1, // 01  011  0111  0100  001
+    IVAC  = 0x43b1, // 01  000  0111  0110  001
+    ISW   = 0x43b2, // 01  000  0111  0110  010
+    CVAC  = 0x5bd1, // 01  011  0111  1010  001
+    CSW   = 0x43d2, // 01  000  0111  1010  010
+    CVAU  = 0x5bd9, // 01  011  0111  1011  001
+    CIVAC = 0x5bf1, // 01  011  0111  1110  001
+    CISW  = 0x43f2  // 01  000  0111  1110  010
+  };
+
+  struct DCMapper : NamedImmMapper {
+    const static Mapping DCPairs[];
+
+    DCMapper();
+  };
+
+}
+
+namespace  A64IC {
+  enum ICValues {
+    Invalid = -1,     // Op1  CRn   CRm   Op2
+    IALLUIS = 0x0388, // 000  0111  0001  000
+    IALLU = 0x03a8,   // 000  0111  0101  000
+    IVAU = 0x1ba9     // 011  0111  0101  001
+  };
+
+
+  struct ICMapper : NamedImmMapper {
+    const static Mapping ICPairs[];
+
+    ICMapper();
+  };
+
+  static inline bool NeedsRegister(ICValues Val) {
+    return Val == IVAU;
+  }
+}
+
+namespace  A64ISB {
+  enum ISBValues {
+    Invalid = -1,
+    SY = 0xf
+  };
+  struct ISBMapper : NamedImmMapper {
+    const static Mapping ISBPairs[];
+
+    ISBMapper();
+  };
+}
+
+namespace A64PRFM {
+  enum PRFMValues {
+    Invalid = -1,
+    PLDL1KEEP = 0x00,
+    PLDL1STRM = 0x01,
+    PLDL2KEEP = 0x02,
+    PLDL2STRM = 0x03,
+    PLDL3KEEP = 0x04,
+    PLDL3STRM = 0x05,
+    PLIL1KEEP = 0x08,
+    PLIL1STRM = 0x09,
+    PLIL2KEEP = 0x0a,
+    PLIL2STRM = 0x0b,
+    PLIL3KEEP = 0x0c,
+    PLIL3STRM = 0x0d,
+    PSTL1KEEP = 0x10,
+    PSTL1STRM = 0x11,
+    PSTL2KEEP = 0x12,
+    PSTL2STRM = 0x13,
+    PSTL3KEEP = 0x14,
+    PSTL3STRM = 0x15
+  };
+
+  struct PRFMMapper : NamedImmMapper {
+    const static Mapping PRFMPairs[];
+
+    PRFMMapper();
+  };
+}
+
+namespace A64PState {
+  enum PStateValues {
+    Invalid = -1,
+    SPSel = 0x05,
+    DAIFSet = 0x1e,
+    DAIFClr = 0x1f
+  };
+
+  struct PStateMapper : NamedImmMapper {
+    const static Mapping PStatePairs[];
+
+    PStateMapper();
+  };
+
+}
+
+namespace A64SE {
+    enum ShiftExtSpecifiers {
+        Invalid = -1,
+        LSL,
+        LSR,
+        ASR,
+        ROR,
+
+        UXTB,
+        UXTH,
+        UXTW,
+        UXTX,
+
+        SXTB,
+        SXTH,
+        SXTW,
+        SXTX
+    };
+}
+
+namespace A64SysReg {
+  enum SysRegROValues {
+    MDCCSR_EL0        = 0x9808, // 10  011  0000  0001  000
+    DBGDTRRX_EL0      = 0x9828, // 10  011  0000  0101  000
+    MDRAR_EL1         = 0x8080, // 10  000  0001  0000  000
+    OSLSR_EL1         = 0x808c, // 10  000  0001  0001  100
+    DBGAUTHSTATUS_EL1 = 0x83f6, // 10  000  0111  1110  110
+    PMCEID0_EL0       = 0xdce6, // 11  011  1001  1100  110
+    PMCEID1_EL0       = 0xdce7, // 11  011  1001  1100  111
+    MIDR_EL1          = 0xc000, // 11  000  0000  0000  000
+    CCSIDR_EL1        = 0xc800, // 11  001  0000  0000  000
+    CLIDR_EL1         = 0xc801, // 11  001  0000  0000  001
+    CTR_EL0           = 0xd801, // 11  011  0000  0000  001
+    MPIDR_EL1         = 0xc005, // 11  000  0000  0000  101
+    REVIDR_EL1        = 0xc006, // 11  000  0000  0000  110
+    AIDR_EL1          = 0xc807, // 11  001  0000  0000  111
+    DCZID_EL0         = 0xd807, // 11  011  0000  0000  111
+    ID_PFR0_EL1       = 0xc008, // 11  000  0000  0001  000
+    ID_PFR1_EL1       = 0xc009, // 11  000  0000  0001  001
+    ID_DFR0_EL1       = 0xc00a, // 11  000  0000  0001  010
+    ID_AFR0_EL1       = 0xc00b, // 11  000  0000  0001  011
+    ID_MMFR0_EL1      = 0xc00c, // 11  000  0000  0001  100
+    ID_MMFR1_EL1      = 0xc00d, // 11  000  0000  0001  101
+    ID_MMFR2_EL1      = 0xc00e, // 11  000  0000  0001  110
+    ID_MMFR3_EL1      = 0xc00f, // 11  000  0000  0001  111
+    ID_ISAR0_EL1      = 0xc010, // 11  000  0000  0010  000
+    ID_ISAR1_EL1      = 0xc011, // 11  000  0000  0010  001
+    ID_ISAR2_EL1      = 0xc012, // 11  000  0000  0010  010
+    ID_ISAR3_EL1      = 0xc013, // 11  000  0000  0010  011
+    ID_ISAR4_EL1      = 0xc014, // 11  000  0000  0010  100
+    ID_ISAR5_EL1      = 0xc015, // 11  000  0000  0010  101
+    ID_AA64PFR0_EL1   = 0xc020, // 11  000  0000  0100  000
+    ID_AA64PFR1_EL1   = 0xc021, // 11  000  0000  0100  001
+    ID_AA64DFR0_EL1   = 0xc028, // 11  000  0000  0101  000
+    ID_AA64DFR1_EL1   = 0xc029, // 11  000  0000  0101  001
+    ID_AA64AFR0_EL1   = 0xc02c, // 11  000  0000  0101  100
+    ID_AA64AFR1_EL1   = 0xc02d, // 11  000  0000  0101  101
+    ID_AA64ISAR0_EL1  = 0xc030, // 11  000  0000  0110  000
+    ID_AA64ISAR1_EL1  = 0xc031, // 11  000  0000  0110  001
+    ID_AA64MMFR0_EL1  = 0xc038, // 11  000  0000  0111  000
+    ID_AA64MMFR1_EL1  = 0xc039, // 11  000  0000  0111  001
+    MVFR0_EL1         = 0xc018, // 11  000  0000  0011  000
+    MVFR1_EL1         = 0xc019, // 11  000  0000  0011  001
+    MVFR2_EL1         = 0xc01a, // 11  000  0000  0011  010
+    RVBAR_EL1         = 0xc601, // 11  000  1100  0000  001
+    RVBAR_EL2         = 0xe601, // 11  100  1100  0000  001
+    RVBAR_EL3         = 0xf601, // 11  110  1100  0000  001
+    ISR_EL1           = 0xc608, // 11  000  1100  0001  000
+    CNTPCT_EL0        = 0xdf01, // 11  011  1110  0000  001
+    CNTVCT_EL0        = 0xdf02,  // 11  011  1110  0000  010
+
+    // Trace registers
+    TRCSTATR          = 0x8818, // 10  001  0000  0011  000
+    TRCIDR8           = 0x8806, // 10  001  0000  0000  110
+    TRCIDR9           = 0x880e, // 10  001  0000  0001  110
+    TRCIDR10          = 0x8816, // 10  001  0000  0010  110
+    TRCIDR11          = 0x881e, // 10  001  0000  0011  110
+    TRCIDR12          = 0x8826, // 10  001  0000  0100  110
+    TRCIDR13          = 0x882e, // 10  001  0000  0101  110
+    TRCIDR0           = 0x8847, // 10  001  0000  1000  111
+    TRCIDR1           = 0x884f, // 10  001  0000  1001  111
+    TRCIDR2           = 0x8857, // 10  001  0000  1010  111
+    TRCIDR3           = 0x885f, // 10  001  0000  1011  111
+    TRCIDR4           = 0x8867, // 10  001  0000  1100  111
+    TRCIDR5           = 0x886f, // 10  001  0000  1101  111
+    TRCIDR6           = 0x8877, // 10  001  0000  1110  111
+    TRCIDR7           = 0x887f, // 10  001  0000  1111  111
+    TRCOSLSR          = 0x888c, // 10  001  0001  0001  100
+    TRCPDSR           = 0x88ac, // 10  001  0001  0101  100
+    TRCDEVAFF0        = 0x8bd6, // 10  001  0111  1010  110
+    TRCDEVAFF1        = 0x8bde, // 10  001  0111  1011  110
+    TRCLSR            = 0x8bee, // 10  001  0111  1101  110
+    TRCAUTHSTATUS     = 0x8bf6, // 10  001  0111  1110  110
+    TRCDEVARCH        = 0x8bfe, // 10  001  0111  1111  110
+    TRCDEVID          = 0x8b97, // 10  001  0111  0010  111
+    TRCDEVTYPE        = 0x8b9f, // 10  001  0111  0011  111
+    TRCPIDR4          = 0x8ba7, // 10  001  0111  0100  111
+    TRCPIDR5          = 0x8baf, // 10  001  0111  0101  111
+    TRCPIDR6          = 0x8bb7, // 10  001  0111  0110  111
+    TRCPIDR7          = 0x8bbf, // 10  001  0111  0111  111
+    TRCPIDR0          = 0x8bc7, // 10  001  0111  1000  111
+    TRCPIDR1          = 0x8bcf, // 10  001  0111  1001  111
+    TRCPIDR2          = 0x8bd7, // 10  001  0111  1010  111
+    TRCPIDR3          = 0x8bdf, // 10  001  0111  1011  111
+    TRCCIDR0          = 0x8be7, // 10  001  0111  1100  111
+    TRCCIDR1          = 0x8bef, // 10  001  0111  1101  111
+    TRCCIDR2          = 0x8bf7, // 10  001  0111  1110  111
+    TRCCIDR3          = 0x8bff, // 10  001  0111  1111  111
+
+    // GICv3 registers
+    ICC_IAR1_EL1      = 0xc660, // 11  000  1100  1100  000
+    ICC_IAR0_EL1      = 0xc640, // 11  000  1100  1000  000
+    ICC_HPPIR1_EL1    = 0xc662, // 11  000  1100  1100  010
+    ICC_HPPIR0_EL1    = 0xc642, // 11  000  1100  1000  010
+    ICC_RPR_EL1       = 0xc65b, // 11  000  1100  1011  011
+    ICH_VTR_EL2       = 0xe659, // 11  100  1100  1011  001
+    ICH_EISR_EL2      = 0xe65b, // 11  100  1100  1011  011
+    ICH_ELSR_EL2      = 0xe65d  // 11  100  1100  1011  101
+  };
+
+  enum SysRegWOValues {
+    DBGDTRTX_EL0      = 0x9828, // 10  011  0000  0101  000
+    OSLAR_EL1         = 0x8084, // 10  000  0001  0000  100
+    PMSWINC_EL0       = 0xdce4,  // 11  011  1001  1100  100
+
+    // Trace Registers
+    TRCOSLAR          = 0x8884, // 10  001  0001  0000  100
+    TRCLAR            = 0x8be6, // 10  001  0111  1100  110
+
+    // GICv3 registers
+    ICC_EOIR1_EL1     = 0xc661, // 11  000  1100  1100  001
+    ICC_EOIR0_EL1     = 0xc641, // 11  000  1100  1000  001
+    ICC_DIR_EL1       = 0xc659, // 11  000  1100  1011  001
+    ICC_SGI1R_EL1     = 0xc65d, // 11  000  1100  1011  101
+    ICC_ASGI1R_EL1    = 0xc65e, // 11  000  1100  1011  110
+    ICC_SGI0R_EL1     = 0xc65f  // 11  000  1100  1011  111
+  };
+
+  enum SysRegValues {
+    Invalid = -1,               // Op0 Op1  CRn   CRm   Op2
+    OSDTRRX_EL1       = 0x8002, // 10  000  0000  0000  010
+    OSDTRTX_EL1       = 0x801a, // 10  000  0000  0011  010
+    TEECR32_EL1       = 0x9000, // 10  010  0000  0000  000
+    MDCCINT_EL1       = 0x8010, // 10  000  0000  0010  000
+    MDSCR_EL1         = 0x8012, // 10  000  0000  0010  010
+    DBGDTR_EL0        = 0x9820, // 10  011  0000  0100  000
+    OSECCR_EL1        = 0x8032, // 10  000  0000  0110  010
+    DBGVCR32_EL2      = 0xa038, // 10  100  0000  0111  000
+    DBGBVR0_EL1       = 0x8004, // 10  000  0000  0000  100
+    DBGBVR1_EL1       = 0x800c, // 10  000  0000  0001  100
+    DBGBVR2_EL1       = 0x8014, // 10  000  0000  0010  100
+    DBGBVR3_EL1       = 0x801c, // 10  000  0000  0011  100
+    DBGBVR4_EL1       = 0x8024, // 10  000  0000  0100  100
+    DBGBVR5_EL1       = 0x802c, // 10  000  0000  0101  100
+    DBGBVR6_EL1       = 0x8034, // 10  000  0000  0110  100
+    DBGBVR7_EL1       = 0x803c, // 10  000  0000  0111  100
+    DBGBVR8_EL1       = 0x8044, // 10  000  0000  1000  100
+    DBGBVR9_EL1       = 0x804c, // 10  000  0000  1001  100
+    DBGBVR10_EL1      = 0x8054, // 10  000  0000  1010  100
+    DBGBVR11_EL1      = 0x805c, // 10  000  0000  1011  100
+    DBGBVR12_EL1      = 0x8064, // 10  000  0000  1100  100
+    DBGBVR13_EL1      = 0x806c, // 10  000  0000  1101  100
+    DBGBVR14_EL1      = 0x8074, // 10  000  0000  1110  100
+    DBGBVR15_EL1      = 0x807c, // 10  000  0000  1111  100
+    DBGBCR0_EL1       = 0x8005, // 10  000  0000  0000  101
+    DBGBCR1_EL1       = 0x800d, // 10  000  0000  0001  101
+    DBGBCR2_EL1       = 0x8015, // 10  000  0000  0010  101
+    DBGBCR3_EL1       = 0x801d, // 10  000  0000  0011  101
+    DBGBCR4_EL1       = 0x8025, // 10  000  0000  0100  101
+    DBGBCR5_EL1       = 0x802d, // 10  000  0000  0101  101
+    DBGBCR6_EL1       = 0x8035, // 10  000  0000  0110  101
+    DBGBCR7_EL1       = 0x803d, // 10  000  0000  0111  101
+    DBGBCR8_EL1       = 0x8045, // 10  000  0000  1000  101
+    DBGBCR9_EL1       = 0x804d, // 10  000  0000  1001  101
+    DBGBCR10_EL1      = 0x8055, // 10  000  0000  1010  101
+    DBGBCR11_EL1      = 0x805d, // 10  000  0000  1011  101
+    DBGBCR12_EL1      = 0x8065, // 10  000  0000  1100  101
+    DBGBCR13_EL1      = 0x806d, // 10  000  0000  1101  101
+    DBGBCR14_EL1      = 0x8075, // 10  000  0000  1110  101
+    DBGBCR15_EL1      = 0x807d, // 10  000  0000  1111  101
+    DBGWVR0_EL1       = 0x8006, // 10  000  0000  0000  110
+    DBGWVR1_EL1       = 0x800e, // 10  000  0000  0001  110
+    DBGWVR2_EL1       = 0x8016, // 10  000  0000  0010  110
+    DBGWVR3_EL1       = 0x801e, // 10  000  0000  0011  110
+    DBGWVR4_EL1       = 0x8026, // 10  000  0000  0100  110
+    DBGWVR5_EL1       = 0x802e, // 10  000  0000  0101  110
+    DBGWVR6_EL1       = 0x8036, // 10  000  0000  0110  110
+    DBGWVR7_EL1       = 0x803e, // 10  000  0000  0111  110
+    DBGWVR8_EL1       = 0x8046, // 10  000  0000  1000  110
+    DBGWVR9_EL1       = 0x804e, // 10  000  0000  1001  110
+    DBGWVR10_EL1      = 0x8056, // 10  000  0000  1010  110
+    DBGWVR11_EL1      = 0x805e, // 10  000  0000  1011  110
+    DBGWVR12_EL1      = 0x8066, // 10  000  0000  1100  110
+    DBGWVR13_EL1      = 0x806e, // 10  000  0000  1101  110
+    DBGWVR14_EL1      = 0x8076, // 10  000  0000  1110  110
+    DBGWVR15_EL1      = 0x807e, // 10  000  0000  1111  110
+    DBGWCR0_EL1       = 0x8007, // 10  000  0000  0000  111
+    DBGWCR1_EL1       = 0x800f, // 10  000  0000  0001  111
+    DBGWCR2_EL1       = 0x8017, // 10  000  0000  0010  111
+    DBGWCR3_EL1       = 0x801f, // 10  000  0000  0011  111
+    DBGWCR4_EL1       = 0x8027, // 10  000  0000  0100  111
+    DBGWCR5_EL1       = 0x802f, // 10  000  0000  0101  111
+    DBGWCR6_EL1       = 0x8037, // 10  000  0000  0110  111
+    DBGWCR7_EL1       = 0x803f, // 10  000  0000  0111  111
+    DBGWCR8_EL1       = 0x8047, // 10  000  0000  1000  111
+    DBGWCR9_EL1       = 0x804f, // 10  000  0000  1001  111
+    DBGWCR10_EL1      = 0x8057, // 10  000  0000  1010  111
+    DBGWCR11_EL1      = 0x805f, // 10  000  0000  1011  111
+    DBGWCR12_EL1      = 0x8067, // 10  000  0000  1100  111
+    DBGWCR13_EL1      = 0x806f, // 10  000  0000  1101  111
+    DBGWCR14_EL1      = 0x8077, // 10  000  0000  1110  111
+    DBGWCR15_EL1      = 0x807f, // 10  000  0000  1111  111
+    TEEHBR32_EL1      = 0x9080, // 10  010  0001  0000  000
+    OSDLR_EL1         = 0x809c, // 10  000  0001  0011  100
+    DBGPRCR_EL1       = 0x80a4, // 10  000  0001  0100  100
+    DBGCLAIMSET_EL1   = 0x83c6, // 10  000  0111  1000  110
+    DBGCLAIMCLR_EL1   = 0x83ce, // 10  000  0111  1001  110
+    CSSELR_EL1        = 0xd000, // 11  010  0000  0000  000
+    VPIDR_EL2         = 0xe000, // 11  100  0000  0000  000
+    VMPIDR_EL2        = 0xe005, // 11  100  0000  0000  101
+    CPACR_EL1         = 0xc082, // 11  000  0001  0000  010
+    SCTLR_EL1         = 0xc080, // 11  000  0001  0000  000
+    SCTLR_EL2         = 0xe080, // 11  100  0001  0000  000
+    SCTLR_EL3         = 0xf080, // 11  110  0001  0000  000
+    ACTLR_EL1         = 0xc081, // 11  000  0001  0000  001
+    ACTLR_EL2         = 0xe081, // 11  100  0001  0000  001
+    ACTLR_EL3         = 0xf081, // 11  110  0001  0000  001
+    HCR_EL2           = 0xe088, // 11  100  0001  0001  000
+    SCR_EL3           = 0xf088, // 11  110  0001  0001  000
+    MDCR_EL2          = 0xe089, // 11  100  0001  0001  001
+    SDER32_EL3        = 0xf089, // 11  110  0001  0001  001
+    CPTR_EL2          = 0xe08a, // 11  100  0001  0001  010
+    CPTR_EL3          = 0xf08a, // 11  110  0001  0001  010
+    HSTR_EL2          = 0xe08b, // 11  100  0001  0001  011
+    HACR_EL2          = 0xe08f, // 11  100  0001  0001  111
+    MDCR_EL3          = 0xf099, // 11  110  0001  0011  001
+    TTBR0_EL1         = 0xc100, // 11  000  0010  0000  000
+    TTBR0_EL2         = 0xe100, // 11  100  0010  0000  000
+    TTBR0_EL3         = 0xf100, // 11  110  0010  0000  000
+    TTBR1_EL1         = 0xc101, // 11  000  0010  0000  001
+    TCR_EL1           = 0xc102, // 11  000  0010  0000  010
+    TCR_EL2           = 0xe102, // 11  100  0010  0000  010
+    TCR_EL3           = 0xf102, // 11  110  0010  0000  010
+    VTTBR_EL2         = 0xe108, // 11  100  0010  0001  000
+    VTCR_EL2          = 0xe10a, // 11  100  0010  0001  010
+    DACR32_EL2        = 0xe180, // 11  100  0011  0000  000
+    SPSR_EL1          = 0xc200, // 11  000  0100  0000  000
+    SPSR_EL2          = 0xe200, // 11  100  0100  0000  000
+    SPSR_EL3          = 0xf200, // 11  110  0100  0000  000
+    ELR_EL1           = 0xc201, // 11  000  0100  0000  001
+    ELR_EL2           = 0xe201, // 11  100  0100  0000  001
+    ELR_EL3           = 0xf201, // 11  110  0100  0000  001
+    SP_EL0            = 0xc208, // 11  000  0100  0001  000
+    SP_EL1            = 0xe208, // 11  100  0100  0001  000
+    SP_EL2            = 0xf208, // 11  110  0100  0001  000
+    SPSel             = 0xc210, // 11  000  0100  0010  000
+    NZCV              = 0xda10, // 11  011  0100  0010  000
+    DAIF              = 0xda11, // 11  011  0100  0010  001
+    CurrentEL         = 0xc212, // 11  000  0100  0010  010
+    SPSR_irq          = 0xe218, // 11  100  0100  0011  000
+    SPSR_abt          = 0xe219, // 11  100  0100  0011  001
+    SPSR_und          = 0xe21a, // 11  100  0100  0011  010
+    SPSR_fiq          = 0xe21b, // 11  100  0100  0011  011
+    FPCR              = 0xda20, // 11  011  0100  0100  000
+    FPSR              = 0xda21, // 11  011  0100  0100  001
+    DSPSR_EL0         = 0xda28, // 11  011  0100  0101  000
+    DLR_EL0           = 0xda29, // 11  011  0100  0101  001
+    IFSR32_EL2        = 0xe281, // 11  100  0101  0000  001
+    AFSR0_EL1         = 0xc288, // 11  000  0101  0001  000
+    AFSR0_EL2         = 0xe288, // 11  100  0101  0001  000
+    AFSR0_EL3         = 0xf288, // 11  110  0101  0001  000
+    AFSR1_EL1         = 0xc289, // 11  000  0101  0001  001
+    AFSR1_EL2         = 0xe289, // 11  100  0101  0001  001
+    AFSR1_EL3         = 0xf289, // 11  110  0101  0001  001
+    ESR_EL1           = 0xc290, // 11  000  0101  0010  000
+    ESR_EL2           = 0xe290, // 11  100  0101  0010  000
+    ESR_EL3           = 0xf290, // 11  110  0101  0010  000
+    FPEXC32_EL2       = 0xe298, // 11  100  0101  0011  000
+    FAR_EL1           = 0xc300, // 11  000  0110  0000  000
+    FAR_EL2           = 0xe300, // 11  100  0110  0000  000
+    FAR_EL3           = 0xf300, // 11  110  0110  0000  000
+    HPFAR_EL2         = 0xe304, // 11  100  0110  0000  100
+    PAR_EL1           = 0xc3a0, // 11  000  0111  0100  000
+    PMCR_EL0          = 0xdce0, // 11  011  1001  1100  000
+    PMCNTENSET_EL0    = 0xdce1, // 11  011  1001  1100  001
+    PMCNTENCLR_EL0    = 0xdce2, // 11  011  1001  1100  010
+    PMOVSCLR_EL0      = 0xdce3, // 11  011  1001  1100  011
+    PMSELR_EL0        = 0xdce5, // 11  011  1001  1100  101
+    PMCCNTR_EL0       = 0xdce8, // 11  011  1001  1101  000
+    PMXEVTYPER_EL0    = 0xdce9, // 11  011  1001  1101  001
+    PMXEVCNTR_EL0     = 0xdcea, // 11  011  1001  1101  010
+    PMUSERENR_EL0     = 0xdcf0, // 11  011  1001  1110  000
+    PMINTENSET_EL1    = 0xc4f1, // 11  000  1001  1110  001
+    PMINTENCLR_EL1    = 0xc4f2, // 11  000  1001  1110  010
+    PMOVSSET_EL0      = 0xdcf3, // 11  011  1001  1110  011
+    MAIR_EL1          = 0xc510, // 11  000  1010  0010  000
+    MAIR_EL2          = 0xe510, // 11  100  1010  0010  000
+    MAIR_EL3          = 0xf510, // 11  110  1010  0010  000
+    AMAIR_EL1         = 0xc518, // 11  000  1010  0011  000
+    AMAIR_EL2         = 0xe518, // 11  100  1010  0011  000
+    AMAIR_EL3         = 0xf518, // 11  110  1010  0011  000
+    VBAR_EL1          = 0xc600, // 11  000  1100  0000  000
+    VBAR_EL2          = 0xe600, // 11  100  1100  0000  000
+    VBAR_EL3          = 0xf600, // 11  110  1100  0000  000
+    RMR_EL1           = 0xc602, // 11  000  1100  0000  010
+    RMR_EL2           = 0xe602, // 11  100  1100  0000  010
+    RMR_EL3           = 0xf602, // 11  110  1100  0000  010
+    CONTEXTIDR_EL1    = 0xc681, // 11  000  1101  0000  001
+    TPIDR_EL0         = 0xde82, // 11  011  1101  0000  010
+    TPIDR_EL2         = 0xe682, // 11  100  1101  0000  010
+    TPIDR_EL3         = 0xf682, // 11  110  1101  0000  010
+    TPIDRRO_EL0       = 0xde83, // 11  011  1101  0000  011
+    TPIDR_EL1         = 0xc684, // 11  000  1101  0000  100
+    CNTFRQ_EL0        = 0xdf00, // 11  011  1110  0000  000
+    CNTVOFF_EL2       = 0xe703, // 11  100  1110  0000  011
+    CNTKCTL_EL1       = 0xc708, // 11  000  1110  0001  000
+    CNTHCTL_EL2       = 0xe708, // 11  100  1110  0001  000
+    CNTP_TVAL_EL0     = 0xdf10, // 11  011  1110  0010  000
+    CNTHP_TVAL_EL2    = 0xe710, // 11  100  1110  0010  000
+    CNTPS_TVAL_EL1    = 0xff10, // 11  111  1110  0010  000
+    CNTP_CTL_EL0      = 0xdf11, // 11  011  1110  0010  001
+    CNTHP_CTL_EL2     = 0xe711, // 11  100  1110  0010  001
+    CNTPS_CTL_EL1     = 0xff11, // 11  111  1110  0010  001
+    CNTP_CVAL_EL0     = 0xdf12, // 11  011  1110  0010  010
+    CNTHP_CVAL_EL2    = 0xe712, // 11  100  1110  0010  010
+    CNTPS_CVAL_EL1    = 0xff12, // 11  111  1110  0010  010
+    CNTV_TVAL_EL0     = 0xdf18, // 11  011  1110  0011  000
+    CNTV_CTL_EL0      = 0xdf19, // 11  011  1110  0011  001
+    CNTV_CVAL_EL0     = 0xdf1a, // 11  011  1110  0011  010
+    PMEVCNTR0_EL0     = 0xdf40, // 11  011  1110  1000  000
+    PMEVCNTR1_EL0     = 0xdf41, // 11  011  1110  1000  001
+    PMEVCNTR2_EL0     = 0xdf42, // 11  011  1110  1000  010
+    PMEVCNTR3_EL0     = 0xdf43, // 11  011  1110  1000  011
+    PMEVCNTR4_EL0     = 0xdf44, // 11  011  1110  1000  100
+    PMEVCNTR5_EL0     = 0xdf45, // 11  011  1110  1000  101
+    PMEVCNTR6_EL0     = 0xdf46, // 11  011  1110  1000  110
+    PMEVCNTR7_EL0     = 0xdf47, // 11  011  1110  1000  111
+    PMEVCNTR8_EL0     = 0xdf48, // 11  011  1110  1001  000
+    PMEVCNTR9_EL0     = 0xdf49, // 11  011  1110  1001  001
+    PMEVCNTR10_EL0    = 0xdf4a, // 11  011  1110  1001  010
+    PMEVCNTR11_EL0    = 0xdf4b, // 11  011  1110  1001  011
+    PMEVCNTR12_EL0    = 0xdf4c, // 11  011  1110  1001  100
+    PMEVCNTR13_EL0    = 0xdf4d, // 11  011  1110  1001  101
+    PMEVCNTR14_EL0    = 0xdf4e, // 11  011  1110  1001  110
+    PMEVCNTR15_EL0    = 0xdf4f, // 11  011  1110  1001  111
+    PMEVCNTR16_EL0    = 0xdf50, // 11  011  1110  1010  000
+    PMEVCNTR17_EL0    = 0xdf51, // 11  011  1110  1010  001
+    PMEVCNTR18_EL0    = 0xdf52, // 11  011  1110  1010  010
+    PMEVCNTR19_EL0    = 0xdf53, // 11  011  1110  1010  011
+    PMEVCNTR20_EL0    = 0xdf54, // 11  011  1110  1010  100
+    PMEVCNTR21_EL0    = 0xdf55, // 11  011  1110  1010  101
+    PMEVCNTR22_EL0    = 0xdf56, // 11  011  1110  1010  110
+    PMEVCNTR23_EL0    = 0xdf57, // 11  011  1110  1010  111
+    PMEVCNTR24_EL0    = 0xdf58, // 11  011  1110  1011  000
+    PMEVCNTR25_EL0    = 0xdf59, // 11  011  1110  1011  001
+    PMEVCNTR26_EL0    = 0xdf5a, // 11  011  1110  1011  010
+    PMEVCNTR27_EL0    = 0xdf5b, // 11  011  1110  1011  011
+    PMEVCNTR28_EL0    = 0xdf5c, // 11  011  1110  1011  100
+    PMEVCNTR29_EL0    = 0xdf5d, // 11  011  1110  1011  101
+    PMEVCNTR30_EL0    = 0xdf5e, // 11  011  1110  1011  110
+    PMCCFILTR_EL0     = 0xdf7f, // 11  011  1110  1111  111
+    PMEVTYPER0_EL0    = 0xdf60, // 11  011  1110  1100  000
+    PMEVTYPER1_EL0    = 0xdf61, // 11  011  1110  1100  001
+    PMEVTYPER2_EL0    = 0xdf62, // 11  011  1110  1100  010
+    PMEVTYPER3_EL0    = 0xdf63, // 11  011  1110  1100  011
+    PMEVTYPER4_EL0    = 0xdf64, // 11  011  1110  1100  100
+    PMEVTYPER5_EL0    = 0xdf65, // 11  011  1110  1100  101
+    PMEVTYPER6_EL0    = 0xdf66, // 11  011  1110  1100  110
+    PMEVTYPER7_EL0    = 0xdf67, // 11  011  1110  1100  111
+    PMEVTYPER8_EL0    = 0xdf68, // 11  011  1110  1101  000
+    PMEVTYPER9_EL0    = 0xdf69, // 11  011  1110  1101  001
+    PMEVTYPER10_EL0   = 0xdf6a, // 11  011  1110  1101  010
+    PMEVTYPER11_EL0   = 0xdf6b, // 11  011  1110  1101  011
+    PMEVTYPER12_EL0   = 0xdf6c, // 11  011  1110  1101  100
+    PMEVTYPER13_EL0   = 0xdf6d, // 11  011  1110  1101  101
+    PMEVTYPER14_EL0   = 0xdf6e, // 11  011  1110  1101  110
+    PMEVTYPER15_EL0   = 0xdf6f, // 11  011  1110  1101  111
+    PMEVTYPER16_EL0   = 0xdf70, // 11  011  1110  1110  000
+    PMEVTYPER17_EL0   = 0xdf71, // 11  011  1110  1110  001
+    PMEVTYPER18_EL0   = 0xdf72, // 11  011  1110  1110  010
+    PMEVTYPER19_EL0   = 0xdf73, // 11  011  1110  1110  011
+    PMEVTYPER20_EL0   = 0xdf74, // 11  011  1110  1110  100
+    PMEVTYPER21_EL0   = 0xdf75, // 11  011  1110  1110  101
+    PMEVTYPER22_EL0   = 0xdf76, // 11  011  1110  1110  110
+    PMEVTYPER23_EL0   = 0xdf77, // 11  011  1110  1110  111
+    PMEVTYPER24_EL0   = 0xdf78, // 11  011  1110  1111  000
+    PMEVTYPER25_EL0   = 0xdf79, // 11  011  1110  1111  001
+    PMEVTYPER26_EL0   = 0xdf7a, // 11  011  1110  1111  010
+    PMEVTYPER27_EL0   = 0xdf7b, // 11  011  1110  1111  011
+    PMEVTYPER28_EL0   = 0xdf7c, // 11  011  1110  1111  100
+    PMEVTYPER29_EL0   = 0xdf7d, // 11  011  1110  1111  101
+    PMEVTYPER30_EL0   = 0xdf7e, // 11  011  1110  1111  110
+
+    // Trace registers
+    TRCPRGCTLR        = 0x8808, // 10  001  0000  0001  000
+    TRCPROCSELR       = 0x8810, // 10  001  0000  0010  000
+    TRCCONFIGR        = 0x8820, // 10  001  0000  0100  000
+    TRCAUXCTLR        = 0x8830, // 10  001  0000  0110  000
+    TRCEVENTCTL0R     = 0x8840, // 10  001  0000  1000  000
+    TRCEVENTCTL1R     = 0x8848, // 10  001  0000  1001  000
+    TRCSTALLCTLR      = 0x8858, // 10  001  0000  1011  000
+    TRCTSCTLR         = 0x8860, // 10  001  0000  1100  000
+    TRCSYNCPR         = 0x8868, // 10  001  0000  1101  000
+    TRCCCCTLR         = 0x8870, // 10  001  0000  1110  000
+    TRCBBCTLR         = 0x8878, // 10  001  0000  1111  000
+    TRCTRACEIDR       = 0x8801, // 10  001  0000  0000  001
+    TRCQCTLR          = 0x8809, // 10  001  0000  0001  001
+    TRCVICTLR         = 0x8802, // 10  001  0000  0000  010
+    TRCVIIECTLR       = 0x880a, // 10  001  0000  0001  010
+    TRCVISSCTLR       = 0x8812, // 10  001  0000  0010  010
+    TRCVIPCSSCTLR     = 0x881a, // 10  001  0000  0011  010
+    TRCVDCTLR         = 0x8842, // 10  001  0000  1000  010
+    TRCVDSACCTLR      = 0x884a, // 10  001  0000  1001  010
+    TRCVDARCCTLR      = 0x8852, // 10  001  0000  1010  010
+    TRCSEQEVR0        = 0x8804, // 10  001  0000  0000  100
+    TRCSEQEVR1        = 0x880c, // 10  001  0000  0001  100
+    TRCSEQEVR2        = 0x8814, // 10  001  0000  0010  100
+    TRCSEQRSTEVR      = 0x8834, // 10  001  0000  0110  100
+    TRCSEQSTR         = 0x883c, // 10  001  0000  0111  100
+    TRCEXTINSELR      = 0x8844, // 10  001  0000  1000  100
+    TRCCNTRLDVR0      = 0x8805, // 10  001  0000  0000  101
+    TRCCNTRLDVR1      = 0x880d, // 10  001  0000  0001  101
+    TRCCNTRLDVR2      = 0x8815, // 10  001  0000  0010  101
+    TRCCNTRLDVR3      = 0x881d, // 10  001  0000  0011  101
+    TRCCNTCTLR0       = 0x8825, // 10  001  0000  0100  101
+    TRCCNTCTLR1       = 0x882d, // 10  001  0000  0101  101
+    TRCCNTCTLR2       = 0x8835, // 10  001  0000  0110  101
+    TRCCNTCTLR3       = 0x883d, // 10  001  0000  0111  101
+    TRCCNTVR0         = 0x8845, // 10  001  0000  1000  101
+    TRCCNTVR1         = 0x884d, // 10  001  0000  1001  101
+    TRCCNTVR2         = 0x8855, // 10  001  0000  1010  101
+    TRCCNTVR3         = 0x885d, // 10  001  0000  1011  101
+    TRCIMSPEC0        = 0x8807, // 10  001  0000  0000  111
+    TRCIMSPEC1        = 0x880f, // 10  001  0000  0001  111
+    TRCIMSPEC2        = 0x8817, // 10  001  0000  0010  111
+    TRCIMSPEC3        = 0x881f, // 10  001  0000  0011  111
+    TRCIMSPEC4        = 0x8827, // 10  001  0000  0100  111
+    TRCIMSPEC5        = 0x882f, // 10  001  0000  0101  111
+    TRCIMSPEC6        = 0x8837, // 10  001  0000  0110  111
+    TRCIMSPEC7        = 0x883f, // 10  001  0000  0111  111
+    TRCRSCTLR2        = 0x8890, // 10  001  0001  0010  000
+    TRCRSCTLR3        = 0x8898, // 10  001  0001  0011  000
+    TRCRSCTLR4        = 0x88a0, // 10  001  0001  0100  000
+    TRCRSCTLR5        = 0x88a8, // 10  001  0001  0101  000
+    TRCRSCTLR6        = 0x88b0, // 10  001  0001  0110  000
+    TRCRSCTLR7        = 0x88b8, // 10  001  0001  0111  000
+    TRCRSCTLR8        = 0x88c0, // 10  001  0001  1000  000
+    TRCRSCTLR9        = 0x88c8, // 10  001  0001  1001  000
+    TRCRSCTLR10       = 0x88d0, // 10  001  0001  1010  000
+    TRCRSCTLR11       = 0x88d8, // 10  001  0001  1011  000
+    TRCRSCTLR12       = 0x88e0, // 10  001  0001  1100  000
+    TRCRSCTLR13       = 0x88e8, // 10  001  0001  1101  000
+    TRCRSCTLR14       = 0x88f0, // 10  001  0001  1110  000
+    TRCRSCTLR15       = 0x88f8, // 10  001  0001  1111  000
+    TRCRSCTLR16       = 0x8881, // 10  001  0001  0000  001
+    TRCRSCTLR17       = 0x8889, // 10  001  0001  0001  001
+    TRCRSCTLR18       = 0x8891, // 10  001  0001  0010  001
+    TRCRSCTLR19       = 0x8899, // 10  001  0001  0011  001
+    TRCRSCTLR20       = 0x88a1, // 10  001  0001  0100  001
+    TRCRSCTLR21       = 0x88a9, // 10  001  0001  0101  001
+    TRCRSCTLR22       = 0x88b1, // 10  001  0001  0110  001
+    TRCRSCTLR23       = 0x88b9, // 10  001  0001  0111  001
+    TRCRSCTLR24       = 0x88c1, // 10  001  0001  1000  001
+    TRCRSCTLR25       = 0x88c9, // 10  001  0001  1001  001
+    TRCRSCTLR26       = 0x88d1, // 10  001  0001  1010  001
+    TRCRSCTLR27       = 0x88d9, // 10  001  0001  1011  001
+    TRCRSCTLR28       = 0x88e1, // 10  001  0001  1100  001
+    TRCRSCTLR29       = 0x88e9, // 10  001  0001  1101  001
+    TRCRSCTLR30       = 0x88f1, // 10  001  0001  1110  001
+    TRCRSCTLR31       = 0x88f9, // 10  001  0001  1111  001
+    TRCSSCCR0         = 0x8882, // 10  001  0001  0000  010
+    TRCSSCCR1         = 0x888a, // 10  001  0001  0001  010
+    TRCSSCCR2         = 0x8892, // 10  001  0001  0010  010
+    TRCSSCCR3         = 0x889a, // 10  001  0001  0011  010
+    TRCSSCCR4         = 0x88a2, // 10  001  0001  0100  010
+    TRCSSCCR5         = 0x88aa, // 10  001  0001  0101  010
+    TRCSSCCR6         = 0x88b2, // 10  001  0001  0110  010
+    TRCSSCCR7         = 0x88ba, // 10  001  0001  0111  010
+    TRCSSCSR0         = 0x88c2, // 10  001  0001  1000  010
+    TRCSSCSR1         = 0x88ca, // 10  001  0001  1001  010
+    TRCSSCSR2         = 0x88d2, // 10  001  0001  1010  010
+    TRCSSCSR3         = 0x88da, // 10  001  0001  1011  010
+    TRCSSCSR4         = 0x88e2, // 10  001  0001  1100  010
+    TRCSSCSR5         = 0x88ea, // 10  001  0001  1101  010
+    TRCSSCSR6         = 0x88f2, // 10  001  0001  1110  010
+    TRCSSCSR7         = 0x88fa, // 10  001  0001  1111  010
+    TRCSSPCICR0       = 0x8883, // 10  001  0001  0000  011
+    TRCSSPCICR1       = 0x888b, // 10  001  0001  0001  011
+    TRCSSPCICR2       = 0x8893, // 10  001  0001  0010  011
+    TRCSSPCICR3       = 0x889b, // 10  001  0001  0011  011
+    TRCSSPCICR4       = 0x88a3, // 10  001  0001  0100  011
+    TRCSSPCICR5       = 0x88ab, // 10  001  0001  0101  011
+    TRCSSPCICR6       = 0x88b3, // 10  001  0001  0110  011
+    TRCSSPCICR7       = 0x88bb, // 10  001  0001  0111  011
+    TRCPDCR           = 0x88a4, // 10  001  0001  0100  100
+    TRCACVR0          = 0x8900, // 10  001  0010  0000  000
+    TRCACVR1          = 0x8910, // 10  001  0010  0010  000
+    TRCACVR2          = 0x8920, // 10  001  0010  0100  000
+    TRCACVR3          = 0x8930, // 10  001  0010  0110  000
+    TRCACVR4          = 0x8940, // 10  001  0010  1000  000
+    TRCACVR5          = 0x8950, // 10  001  0010  1010  000
+    TRCACVR6          = 0x8960, // 10  001  0010  1100  000
+    TRCACVR7          = 0x8970, // 10  001  0010  1110  000
+    TRCACVR8          = 0x8901, // 10  001  0010  0000  001
+    TRCACVR9          = 0x8911, // 10  001  0010  0010  001
+    TRCACVR10         = 0x8921, // 10  001  0010  0100  001
+    TRCACVR11         = 0x8931, // 10  001  0010  0110  001
+    TRCACVR12         = 0x8941, // 10  001  0010  1000  001
+    TRCACVR13         = 0x8951, // 10  001  0010  1010  001
+    TRCACVR14         = 0x8961, // 10  001  0010  1100  001
+    TRCACVR15         = 0x8971, // 10  001  0010  1110  001
+    TRCACATR0         = 0x8902, // 10  001  0010  0000  010
+    TRCACATR1         = 0x8912, // 10  001  0010  0010  010
+    TRCACATR2         = 0x8922, // 10  001  0010  0100  010
+    TRCACATR3         = 0x8932, // 10  001  0010  0110  010
+    TRCACATR4         = 0x8942, // 10  001  0010  1000  010
+    TRCACATR5         = 0x8952, // 10  001  0010  1010  010
+    TRCACATR6         = 0x8962, // 10  001  0010  1100  010
+    TRCACATR7         = 0x8972, // 10  001  0010  1110  010
+    TRCACATR8         = 0x8903, // 10  001  0010  0000  011
+    TRCACATR9         = 0x8913, // 10  001  0010  0010  011
+    TRCACATR10        = 0x8923, // 10  001  0010  0100  011
+    TRCACATR11        = 0x8933, // 10  001  0010  0110  011
+    TRCACATR12        = 0x8943, // 10  001  0010  1000  011
+    TRCACATR13        = 0x8953, // 10  001  0010  1010  011
+    TRCACATR14        = 0x8963, // 10  001  0010  1100  011
+    TRCACATR15        = 0x8973, // 10  001  0010  1110  011
+    TRCDVCVR0         = 0x8904, // 10  001  0010  0000  100
+    TRCDVCVR1         = 0x8924, // 10  001  0010  0100  100
+    TRCDVCVR2         = 0x8944, // 10  001  0010  1000  100
+    TRCDVCVR3         = 0x8964, // 10  001  0010  1100  100
+    TRCDVCVR4         = 0x8905, // 10  001  0010  0000  101
+    TRCDVCVR5         = 0x8925, // 10  001  0010  0100  101
+    TRCDVCVR6         = 0x8945, // 10  001  0010  1000  101
+    TRCDVCVR7         = 0x8965, // 10  001  0010  1100  101
+    TRCDVCMR0         = 0x8906, // 10  001  0010  0000  110
+    TRCDVCMR1         = 0x8926, // 10  001  0010  0100  110
+    TRCDVCMR2         = 0x8946, // 10  001  0010  1000  110
+    TRCDVCMR3         = 0x8966, // 10  001  0010  1100  110
+    TRCDVCMR4         = 0x8907, // 10  001  0010  0000  111
+    TRCDVCMR5         = 0x8927, // 10  001  0010  0100  111
+    TRCDVCMR6         = 0x8947, // 10  001  0010  1000  111
+    TRCDVCMR7         = 0x8967, // 10  001  0010  1100  111
+    TRCCIDCVR0        = 0x8980, // 10  001  0011  0000  000
+    TRCCIDCVR1        = 0x8990, // 10  001  0011  0010  000
+    TRCCIDCVR2        = 0x89a0, // 10  001  0011  0100  000
+    TRCCIDCVR3        = 0x89b0, // 10  001  0011  0110  000
+    TRCCIDCVR4        = 0x89c0, // 10  001  0011  1000  000
+    TRCCIDCVR5        = 0x89d0, // 10  001  0011  1010  000
+    TRCCIDCVR6        = 0x89e0, // 10  001  0011  1100  000
+    TRCCIDCVR7        = 0x89f0, // 10  001  0011  1110  000
+    TRCVMIDCVR0       = 0x8981, // 10  001  0011  0000  001
+    TRCVMIDCVR1       = 0x8991, // 10  001  0011  0010  001
+    TRCVMIDCVR2       = 0x89a1, // 10  001  0011  0100  001
+    TRCVMIDCVR3       = 0x89b1, // 10  001  0011  0110  001
+    TRCVMIDCVR4       = 0x89c1, // 10  001  0011  1000  001
+    TRCVMIDCVR5       = 0x89d1, // 10  001  0011  1010  001
+    TRCVMIDCVR6       = 0x89e1, // 10  001  0011  1100  001
+    TRCVMIDCVR7       = 0x89f1, // 10  001  0011  1110  001
+    TRCCIDCCTLR0      = 0x8982, // 10  001  0011  0000  010
+    TRCCIDCCTLR1      = 0x898a, // 10  001  0011  0001  010
+    TRCVMIDCCTLR0     = 0x8992, // 10  001  0011  0010  010
+    TRCVMIDCCTLR1     = 0x899a, // 10  001  0011  0011  010
+    TRCITCTRL         = 0x8b84, // 10  001  0111  0000  100
+    TRCCLAIMSET       = 0x8bc6, // 10  001  0111  1000  110
+    TRCCLAIMCLR       = 0x8bce, // 10  001  0111  1001  110
+
+    // GICv3 registers
+    ICC_BPR1_EL1      = 0xc663, // 11  000  1100  1100  011
+    ICC_BPR0_EL1      = 0xc643, // 11  000  1100  1000  011
+    ICC_PMR_EL1       = 0xc230, // 11  000  0100  0110  000
+    ICC_CTLR_EL1      = 0xc664, // 11  000  1100  1100  100
+    ICC_CTLR_EL3      = 0xf664, // 11  110  1100  1100  100
+    ICC_SRE_EL1       = 0xc665, // 11  000  1100  1100  101
+    ICC_SRE_EL2       = 0xe64d, // 11  100  1100  1001  101
+    ICC_SRE_EL3       = 0xf665, // 11  110  1100  1100  101
+    ICC_IGRPEN0_EL1   = 0xc666, // 11  000  1100  1100  110
+    ICC_IGRPEN1_EL1   = 0xc667, // 11  000  1100  1100  111
+    ICC_IGRPEN1_EL3   = 0xf667, // 11  110  1100  1100  111
+    ICC_SEIEN_EL1     = 0xc668, // 11  000  1100  1101  000
+    ICC_AP0R0_EL1     = 0xc644, // 11  000  1100  1000  100
+    ICC_AP0R1_EL1     = 0xc645, // 11  000  1100  1000  101
+    ICC_AP0R2_EL1     = 0xc646, // 11  000  1100  1000  110
+    ICC_AP0R3_EL1     = 0xc647, // 11  000  1100  1000  111
+    ICC_AP1R0_EL1     = 0xc648, // 11  000  1100  1001  000
+    ICC_AP1R1_EL1     = 0xc649, // 11  000  1100  1001  001
+    ICC_AP1R2_EL1     = 0xc64a, // 11  000  1100  1001  010
+    ICC_AP1R3_EL1     = 0xc64b, // 11  000  1100  1001  011
+    ICH_AP0R0_EL2     = 0xe640, // 11  100  1100  1000  000
+    ICH_AP0R1_EL2     = 0xe641, // 11  100  1100  1000  001
+    ICH_AP0R2_EL2     = 0xe642, // 11  100  1100  1000  010
+    ICH_AP0R3_EL2     = 0xe643, // 11  100  1100  1000  011
+    ICH_AP1R0_EL2     = 0xe648, // 11  100  1100  1001  000
+    ICH_AP1R1_EL2     = 0xe649, // 11  100  1100  1001  001
+    ICH_AP1R2_EL2     = 0xe64a, // 11  100  1100  1001  010
+    ICH_AP1R3_EL2     = 0xe64b, // 11  100  1100  1001  011
+    ICH_HCR_EL2       = 0xe658, // 11  100  1100  1011  000
+    ICH_MISR_EL2      = 0xe65a, // 11  100  1100  1011  010
+    ICH_VMCR_EL2      = 0xe65f, // 11  100  1100  1011  111
+    ICH_VSEIR_EL2     = 0xe64c, // 11  100  1100  1001  100
+    ICH_LR0_EL2       = 0xe660, // 11  100  1100  1100  000
+    ICH_LR1_EL2       = 0xe661, // 11  100  1100  1100  001
+    ICH_LR2_EL2       = 0xe662, // 11  100  1100  1100  010
+    ICH_LR3_EL2       = 0xe663, // 11  100  1100  1100  011
+    ICH_LR4_EL2       = 0xe664, // 11  100  1100  1100  100
+    ICH_LR5_EL2       = 0xe665, // 11  100  1100  1100  101
+    ICH_LR6_EL2       = 0xe666, // 11  100  1100  1100  110
+    ICH_LR7_EL2       = 0xe667, // 11  100  1100  1100  111
+    ICH_LR8_EL2       = 0xe668, // 11  100  1100  1101  000
+    ICH_LR9_EL2       = 0xe669, // 11  100  1100  1101  001
+    ICH_LR10_EL2      = 0xe66a, // 11  100  1100  1101  010
+    ICH_LR11_EL2      = 0xe66b, // 11  100  1100  1101  011
+    ICH_LR12_EL2      = 0xe66c, // 11  100  1100  1101  100
+    ICH_LR13_EL2      = 0xe66d, // 11  100  1100  1101  101
+    ICH_LR14_EL2      = 0xe66e, // 11  100  1100  1101  110
+    ICH_LR15_EL2      = 0xe66f  // 11  100  1100  1101  111
+  };
+
+  // Note that these do not inherit from NamedImmMapper. This class is
+  // sufficiently different in its behaviour that I don't believe it's worth
+  // burdening the common NamedImmMapper with abstractions only needed in
+  // this one case.
+  struct SysRegMapper {
+    static const NamedImmMapper::Mapping SysRegPairs[];
+
+    const NamedImmMapper::Mapping *InstPairs;
+    size_t NumInstPairs;
+
+    SysRegMapper() {}
+    uint32_t fromString(StringRef Name, bool &Valid) const;
+    std::string toString(uint32_t Bits, bool &Valid) const;
+  };
+
+  struct MSRMapper : SysRegMapper {
+    static const NamedImmMapper::Mapping MSRPairs[];
+    MSRMapper();
+  };
+
+  struct MRSMapper : SysRegMapper {
+    static const NamedImmMapper::Mapping MRSPairs[];
+    MRSMapper();
+  };
+
+  uint32_t ParseGenericRegister(StringRef Name, bool &Valid);
+}
+
+namespace A64TLBI {
+  enum TLBIValues {
+    Invalid = -1,          // Op0 Op1  CRn   CRm   Op2
+    IPAS2E1IS    = 0x6401, // 01  100  1000  0000  001
+    IPAS2LE1IS   = 0x6405, // 01  100  1000  0000  101
+    VMALLE1IS    = 0x4418, // 01  000  1000  0011  000
+    ALLE2IS      = 0x6418, // 01  100  1000  0011  000
+    ALLE3IS      = 0x7418, // 01  110  1000  0011  000
+    VAE1IS       = 0x4419, // 01  000  1000  0011  001
+    VAE2IS       = 0x6419, // 01  100  1000  0011  001
+    VAE3IS       = 0x7419, // 01  110  1000  0011  001
+    ASIDE1IS     = 0x441a, // 01  000  1000  0011  010
+    VAAE1IS      = 0x441b, // 01  000  1000  0011  011
+    ALLE1IS      = 0x641c, // 01  100  1000  0011  100
+    VALE1IS      = 0x441d, // 01  000  1000  0011  101
+    VALE2IS      = 0x641d, // 01  100  1000  0011  101
+    VALE3IS      = 0x741d, // 01  110  1000  0011  101
+    VMALLS12E1IS = 0x641e, // 01  100  1000  0011  110
+    VAALE1IS     = 0x441f, // 01  000  1000  0011  111
+    IPAS2E1      = 0x6421, // 01  100  1000  0100  001
+    IPAS2LE1     = 0x6425, // 01  100  1000  0100  101
+    VMALLE1      = 0x4438, // 01  000  1000  0111  000
+    ALLE2        = 0x6438, // 01  100  1000  0111  000
+    ALLE3        = 0x7438, // 01  110  1000  0111  000
+    VAE1         = 0x4439, // 01  000  1000  0111  001
+    VAE2         = 0x6439, // 01  100  1000  0111  001
+    VAE3         = 0x7439, // 01  110  1000  0111  001
+    ASIDE1       = 0x443a, // 01  000  1000  0111  010
+    VAAE1        = 0x443b, // 01  000  1000  0111  011
+    ALLE1        = 0x643c, // 01  100  1000  0111  100
+    VALE1        = 0x443d, // 01  000  1000  0111  101
+    VALE2        = 0x643d, // 01  100  1000  0111  101
+    VALE3        = 0x743d, // 01  110  1000  0111  101
+    VMALLS12E1   = 0x643e, // 01  100  1000  0111  110
+    VAALE1       = 0x443f  // 01  000  1000  0111  111
+  };
+
+  struct TLBIMapper : NamedImmMapper {
+    const static Mapping TLBIPairs[];
+
+    TLBIMapper();
+  };
+
+  static inline bool NeedsRegister(TLBIValues Val) {
+    switch (Val) {
+    case VMALLE1IS:
+    case ALLE2IS:
+    case ALLE3IS:
+    case ALLE1IS:
+    case VMALLS12E1IS:
+    case VMALLE1:
+    case ALLE2:
+    case ALLE3:
+    case ALLE1:
+    case VMALLS12E1:
+      return false;
+    default:
+      return true;
+    }
+  }
+}
+
+namespace AArch64II {
+
+  enum TOF {
+    //===--------------------------------------------------------------===//
+    // AArch64 Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    // MO_GOT - Represents a relocation referring to the GOT entry of a given
+    // symbol. Used in adrp.
+    MO_GOT,
+
+    // MO_GOT_LO12 - Represents a relocation referring to the low 12 bits of the
+    // GOT entry of a given symbol. Used in ldr only.
+    MO_GOT_LO12,
+
+    // MO_DTPREL_* - Represents a relocation referring to the offset from a
+    // module's dynamic thread pointer. Used in the local-dynamic TLS access
+    // model.
+    MO_DTPREL_G1,
+    MO_DTPREL_G0_NC,
+
+    // MO_GOTTPREL_* - Represents a relocation referring to a GOT entry
+    // providing the offset of a variable from the thread-pointer. Used in
+    // initial-exec TLS model where this offset is assigned in the static thread
+    // block and thus known by the dynamic linker.
+    MO_GOTTPREL,
+    MO_GOTTPREL_LO12,
+
+    // MO_TLSDESC_* - Represents a relocation referring to a GOT entry providing
+    // a TLS descriptor chosen by the dynamic linker. Used for the
+    // general-dynamic and local-dynamic TLS access models where very littls is
+    // known at link-time.
+    MO_TLSDESC,
+    MO_TLSDESC_LO12,
+
+    // MO_TPREL_* - Represents a relocation referring to the offset of a
+    // variable from the thread pointer itself. Used in the local-exec TLS
+    // access model.
+    MO_TPREL_G1,
+    MO_TPREL_G0_NC,
+
+    // MO_LO12 - On a symbol operand, this represents a relocation containing
+    // lower 12 bits of the address. Used in add/sub/ldr/str.
+    MO_LO12
+  };
+}
+
+class APFloat;
+
+namespace A64Imms {
+  bool isFPImm(const APFloat &Val, uint32_t &Imm8Bits);
+
+  inline bool isFPImm(const APFloat &Val) {
+    uint32_t Imm8;
+    return isFPImm(Val, Imm8);
+  }
+
+  bool isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits);
+  bool isLogicalImmBits(unsigned RegWidth, uint32_t Bits, uint64_t &Imm);
+
+  bool isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
+  bool isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
+
+  // We sometimes want to know whether the immediate is representable with a
+  // MOVN but *not* with a MOVZ (because that would take priority).
+  bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
+
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
new file mode 100644
index 000000000000..f0d4dbe2bfb3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
@@ -0,0 +1,704 @@
+//=== A15SDOptimizerPass.cpp - Optimize DPR and SPR register accesses on A15==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The Cortex-A15 processor employs a tracking scheme in its register renaming
+// in order to process each instruction's micro-ops speculatively and
+// out-of-order with appropriate forwarding. The ARM architecture allows VFP
+// instructions to read and write 32-bit S-registers.  Each S-register
+// corresponds to one half (upper or lower) of an overlaid 64-bit D-register.
+//
+// There are several instruction patterns which can be used to provide this
+// capability which can provide higher performance than other, potentially more
+// direct patterns, specifically around when one micro-op reads a D-register
+// operand that has recently been written as one or more S-register results.
+//
+// This file defines a pre-regalloc pass which looks for SPR producers which
+// are going to be used by a DPR (or QPR) consumers and creates the more
+// optimized access pattern.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "a15-sd-optimizer"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMISelLowering.h"
+#include "ARMTargetMachine.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <set>
+
+using namespace llvm;
+
+namespace {
+  struct A15SDOptimizer : public MachineFunctionPass {
+    static char ID;
+    A15SDOptimizer() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM A15 S->D optimizer";
+    }
+
+  private:
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+
+    bool runOnInstruction(MachineInstr *MI);
+
+    //
+    // Instruction builder helpers
+    //
+    unsigned createDupLane(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator InsertBefore,
+                           DebugLoc DL,
+                           unsigned Reg, unsigned Lane,
+                           bool QPR=false);
+
+    unsigned createExtractSubreg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator InsertBefore,
+                                 DebugLoc DL,
+                                 unsigned DReg, unsigned Lane,
+                                 const TargetRegisterClass *TRC);
+
+    unsigned createVExt(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator InsertBefore,
+                        DebugLoc DL,
+                        unsigned Ssub0, unsigned Ssub1);
+
+    unsigned createRegSequence(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator InsertBefore,
+                               DebugLoc DL,
+                               unsigned Reg1, unsigned Reg2);
+
+    unsigned createInsertSubreg(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator InsertBefore,
+                                DebugLoc DL, unsigned DReg, unsigned Lane,
+                                unsigned ToInsert);
+
+    unsigned createImplicitDef(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator InsertBefore,
+                               DebugLoc DL);
+    
+    //
+    // Various property checkers
+    //
+    bool usesRegClass(MachineOperand &MO, const TargetRegisterClass *TRC);
+    bool hasPartialWrite(MachineInstr *MI);
+    SmallVector<unsigned, 8> getReadDPRs(MachineInstr *MI);
+    unsigned getDPRLaneFromSPR(unsigned SReg);
+
+    //
+    // Methods used for getting the definitions of partial registers
+    //
+
+    MachineInstr *elideCopies(MachineInstr *MI);
+    void elideCopiesAndPHIs(MachineInstr *MI,
+                            SmallVectorImpl<MachineInstr*> &Outs);
+
+    //
+    // Pattern optimization methods
+    //
+    unsigned optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg);
+    unsigned optimizeSDPattern(MachineInstr *MI);
+    unsigned getPrefSPRLane(unsigned SReg);
+
+    //
+    // Sanitizing method - used to make sure if don't leave dead code around.
+    //
+    void eraseInstrWithNoUses(MachineInstr *MI);
+
+    //
+    // A map used to track the changes done by this pass.
+    //
+    std::map<MachineInstr*, unsigned> Replacements;
+    std::set<MachineInstr *> DeadInstr;
+  };
+  char A15SDOptimizer::ID = 0;
+} // end anonymous namespace
+
+// Returns true if this is a use of a SPR register.
+bool A15SDOptimizer::usesRegClass(MachineOperand &MO,
+                                  const TargetRegisterClass *TRC) {
+  if (!MO.isReg())
+    return false;
+  unsigned Reg = MO.getReg();
+
+  if (TargetRegisterInfo::isVirtualRegister(Reg))
+    return MRI->getRegClass(Reg)->hasSuperClassEq(TRC);
+  else
+    return TRC->contains(Reg);
+}
+
+unsigned A15SDOptimizer::getDPRLaneFromSPR(unsigned SReg) {
+  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1,
+                                           &ARM::DPRRegClass);
+  if (DReg != ARM::NoRegister) return ARM::ssub_1;
+  return ARM::ssub_0;
+}
+
+// Get the subreg type that is most likely to be coalesced
+// for an SPR register that will be used in VDUP32d pseudo.
+unsigned A15SDOptimizer::getPrefSPRLane(unsigned SReg) {
+  if (!TRI->isVirtualRegister(SReg))
+    return getDPRLaneFromSPR(SReg);
+
+  MachineInstr *MI = MRI->getVRegDef(SReg);
+  if (!MI) return ARM::ssub_0;
+  MachineOperand *MO = MI->findRegisterDefOperand(SReg);
+
+  assert(MO->isReg() && "Non register operand found!");
+  if (!MO) return ARM::ssub_0;
+
+  if (MI->isCopy() && usesRegClass(MI->getOperand(1),
+                                    &ARM::SPRRegClass)) {
+    SReg = MI->getOperand(1).getReg();
+  }
+
+  if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+    if (MO->getSubReg() == ARM::ssub_1) return ARM::ssub_1;
+    return ARM::ssub_0;
+  }
+  return getDPRLaneFromSPR(SReg);
+}
+
+// MI is known to be dead. Figure out what instructions
+// are also made dead by this and mark them for removal.
+void A15SDOptimizer::eraseInstrWithNoUses(MachineInstr *MI) {
+  SmallVector<MachineInstr *, 8> Front;
+  DeadInstr.insert(MI);
+
+  DEBUG(dbgs() << "Deleting base instruction " << *MI << "\n");
+  Front.push_back(MI);
+
+  while (Front.size() != 0) {
+    MI = Front.back();
+    Front.pop_back();
+
+    // MI is already known to be dead. We need to see
+    // if other instructions can also be removed.
+    for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if ((!MO.isReg()) || (!MO.isUse()))
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!TRI->isVirtualRegister(Reg))
+        continue;
+      MachineOperand *Op = MI->findRegisterDefOperand(Reg);
+
+      if (!Op)
+        continue;
+
+      MachineInstr *Def = Op->getParent();
+
+      // We don't need to do anything if we have already marked
+      // this instruction as being dead.
+      if (DeadInstr.find(Def) != DeadInstr.end())
+        continue;
+
+      // Check if all the uses of this instruction are marked as
+      // dead. If so, we can also mark this instruction as being
+      // dead.
+      bool IsDead = true;
+      for (unsigned int j = 0; j < Def->getNumOperands(); ++j) {
+        MachineOperand &MODef = Def->getOperand(j);
+        if ((!MODef.isReg()) || (!MODef.isDef()))
+          continue;
+        unsigned DefReg = MODef.getReg();
+        if (!TRI->isVirtualRegister(DefReg)) {
+          IsDead = false;
+          break;
+        }
+        for (MachineRegisterInfo::use_iterator II = MRI->use_begin(Reg),
+                            EE = MRI->use_end();
+                            II != EE; ++II) {
+          // We don't care about self references.
+          if (&*II == Def)
+            continue;
+          if (DeadInstr.find(&*II) == DeadInstr.end()) {
+            IsDead = false;
+            break;
+          }
+        }
+      }
+
+      if (!IsDead) continue;
+
+      DEBUG(dbgs() << "Deleting instruction " << *Def << "\n");
+      DeadInstr.insert(Def);
+    }
+  }
+}
+
+// Creates the more optimized patterns and generally does all the code
+// transformations in this pass.
+unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
+  if (MI->isCopy()) {
+    return optimizeAllLanesPattern(MI, MI->getOperand(1).getReg());
+  }
+
+  if (MI->isInsertSubreg()) {
+    unsigned DPRReg = MI->getOperand(1).getReg();
+    unsigned SPRReg = MI->getOperand(2).getReg();
+
+    if (TRI->isVirtualRegister(DPRReg) && TRI->isVirtualRegister(SPRReg)) {
+      MachineInstr *DPRMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+      MachineInstr *SPRMI = MRI->getVRegDef(MI->getOperand(2).getReg());
+
+      if (DPRMI && SPRMI) {
+        // See if the first operand of this insert_subreg is IMPLICIT_DEF
+        MachineInstr *ECDef = elideCopies(DPRMI);
+        if (ECDef != 0 && ECDef->isImplicitDef()) {
+          // Another corner case - if we're inserting something that is purely
+          // a subreg copy of a DPR, just use that DPR.
+
+          MachineInstr *EC = elideCopies(SPRMI);
+          // Is it a subreg copy of ssub_0?
+          if (EC && EC->isCopy() &&
+              EC->getOperand(1).getSubReg() == ARM::ssub_0) {
+            DEBUG(dbgs() << "Found a subreg copy: " << *SPRMI);
+
+            // Find the thing we're subreg copying out of - is it of the same
+            // regclass as DPRMI? (i.e. a DPR or QPR).
+            unsigned FullReg = SPRMI->getOperand(1).getReg();
+            const TargetRegisterClass *TRC =
+              MRI->getRegClass(MI->getOperand(1).getReg());
+            if (TRC->hasSuperClassEq(MRI->getRegClass(FullReg))) {
+              DEBUG(dbgs() << "Subreg copy is compatible - returning ");
+              DEBUG(dbgs() << PrintReg(FullReg) << "\n");
+              eraseInstrWithNoUses(MI);
+              return FullReg;
+            }
+          }
+
+          return optimizeAllLanesPattern(MI, MI->getOperand(2).getReg());
+        }
+      }
+    }
+    return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+  }
+
+  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1),
+                                          &ARM::SPRRegClass)) {
+    // See if all bar one of the operands are IMPLICIT_DEF and insert the
+    // optimizer pattern accordingly.
+    unsigned NumImplicit = 0, NumTotal = 0;
+    unsigned NonImplicitReg = ~0U;
+
+    for (unsigned I = 1; I < MI->getNumExplicitOperands(); ++I) {
+      if (!MI->getOperand(I).isReg())
+        continue;
+      ++NumTotal;
+      unsigned OpReg = MI->getOperand(I).getReg();
+
+      if (!TRI->isVirtualRegister(OpReg))
+        break;
+
+      MachineInstr *Def = MRI->getVRegDef(OpReg);
+      if (!Def)
+        break;
+      if (Def->isImplicitDef())
+        ++NumImplicit;
+      else
+        NonImplicitReg = MI->getOperand(I).getReg();
+    }
+
+    if (NumImplicit == NumTotal - 1)
+      return optimizeAllLanesPattern(MI, NonImplicitReg);
+    else
+      return optimizeAllLanesPattern(MI, MI->getOperand(0).getReg());
+  }
+
+  assert(0 && "Unhandled update pattern!");
+  return 0;
+}
+
+// Return true if this MachineInstr inserts a scalar (SPR) value into
+// a D or Q register.
+bool A15SDOptimizer::hasPartialWrite(MachineInstr *MI) {
+  // The only way we can do a partial register update is through a COPY,
+  // INSERT_SUBREG or REG_SEQUENCE.
+  if (MI->isCopy() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
+    return true;
+
+  if (MI->isInsertSubreg() && usesRegClass(MI->getOperand(2),
+                                           &ARM::SPRRegClass))
+    return true;
+
+  if (MI->isRegSequence() && usesRegClass(MI->getOperand(1), &ARM::SPRRegClass))
+    return true;
+
+  return false;
+}
+
+// Looks through full copies to get the instruction that defines the input
+// operand for MI.
+MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
+  if (!MI->isFullCopy())
+    return MI;
+  if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+    return NULL;
+  MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
+  if (!Def)
+    return NULL;
+  return elideCopies(Def);
+}
+
+// Look through full copies and PHIs to get the set of non-copy MachineInstrs
+// that can produce MI.
+void A15SDOptimizer::elideCopiesAndPHIs(MachineInstr *MI,
+                                        SmallVectorImpl<MachineInstr*> &Outs) {
+   // Looking through PHIs may create loops so we need to track what
+   // instructions we have visited before.
+   std::set<MachineInstr *> Reached;
+   SmallVector<MachineInstr *, 8> Front;
+   Front.push_back(MI);
+   while (Front.size() != 0) {
+     MI = Front.back();
+     Front.pop_back();
+
+     // If we have already explored this MachineInstr, ignore it.
+     if (Reached.find(MI) != Reached.end())
+       continue;
+     Reached.insert(MI);
+     if (MI->isPHI()) {
+       for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
+         unsigned Reg = MI->getOperand(I).getReg();
+         if (!TRI->isVirtualRegister(Reg)) {
+           continue;
+         }
+         MachineInstr *NewMI = MRI->getVRegDef(Reg);
+         if (!NewMI)
+           continue;
+         Front.push_back(NewMI);
+       }
+     } else if (MI->isFullCopy()) {
+       if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
+         continue;
+       MachineInstr *NewMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+       if (!NewMI)
+         continue;
+       Front.push_back(NewMI);
+     } else {
+       DEBUG(dbgs() << "Found partial copy" << *MI <<"\n");
+       Outs.push_back(MI);
+     }
+   }
+}
+
+// Return the DPR virtual registers that are read by this machine instruction
+// (if any).
+SmallVector<unsigned, 8> A15SDOptimizer::getReadDPRs(MachineInstr *MI) {
+  if (MI->isCopyLike() || MI->isInsertSubreg() || MI->isRegSequence() ||
+      MI->isKill())
+    return SmallVector<unsigned, 8>();
+
+  SmallVector<unsigned, 8> Defs;
+  for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+
+    if (!MO.isReg() || !MO.isUse())
+      continue;
+    if (!usesRegClass(MO, &ARM::DPRRegClass) &&
+        !usesRegClass(MO, &ARM::QPRRegClass))
+      continue;
+
+    Defs.push_back(MO.getReg());
+  }
+  return Defs;
+}
+
+// Creates a DPR register from an SPR one by using a VDUP.
+unsigned
+A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator InsertBefore,
+                              DebugLoc DL,
+                              unsigned Reg, unsigned Lane, bool QPR) {
+  unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
+                                                  &ARM::DPRRegClass);
+  AddDefaultPred(BuildMI(MBB,
+                         InsertBefore,
+                         DL,
+                         TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d),
+                         Out)
+                   .addReg(Reg)
+                   .addImm(Lane));
+ 
+  return Out;
+}
+
+// Creates a SPR register from a DPR by copying the value in lane 0.
+unsigned
+A15SDOptimizer::createExtractSubreg(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator InsertBefore,
+                                    DebugLoc DL,
+                                    unsigned DReg, unsigned Lane,
+                                    const TargetRegisterClass *TRC) {
+  unsigned Out = MRI->createVirtualRegister(TRC);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::COPY), Out)
+    .addReg(DReg, 0, Lane);
+
+  return Out;
+}
+
+// Takes two SPR registers and creates a DPR by using a REG_SEQUENCE.
+unsigned
+A15SDOptimizer::createRegSequence(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  DebugLoc DL,
+                                  unsigned Reg1, unsigned Reg2) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::QPRRegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::REG_SEQUENCE), Out)
+    .addReg(Reg1)
+    .addImm(ARM::dsub_0)
+    .addReg(Reg2)
+    .addImm(ARM::dsub_1);
+  return Out;
+}
+
+// Takes two DPR registers that have previously been VDUPed (Ssub0 and Ssub1)
+// and merges them into one DPR register.
+unsigned
+A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator InsertBefore,
+                           DebugLoc DL,
+                           unsigned Ssub0, unsigned Ssub1) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  AddDefaultPred(BuildMI(MBB,
+                         InsertBefore,
+                         DL,
+                         TII->get(ARM::VEXTd32), Out)
+                   .addReg(Ssub0)
+                   .addReg(Ssub1)
+                   .addImm(1));
+  return Out;
+}
+
+unsigned
+A15SDOptimizer::createInsertSubreg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator InsertBefore,
+                                   DebugLoc DL, unsigned DReg, unsigned Lane,
+                                   unsigned ToInsert) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPR_VFP2RegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::INSERT_SUBREG), Out)
+    .addReg(DReg)
+    .addReg(ToInsert)
+    .addImm(Lane);
+
+  return Out;
+}
+
+unsigned
+A15SDOptimizer::createImplicitDef(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator InsertBefore,
+                                  DebugLoc DL) {
+  unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
+  BuildMI(MBB,
+          InsertBefore,
+          DL,
+          TII->get(TargetOpcode::IMPLICIT_DEF), Out);
+  return Out;
+}
+
+// This function inserts instructions in order to optimize interactions between
+// SPR registers and DPR/QPR registers. It does so by performing VDUPs on all
+// lanes, and the using VEXT instructions to recompose the result.
+unsigned
+A15SDOptimizer::optimizeAllLanesPattern(MachineInstr *MI, unsigned Reg) {
+  MachineBasicBlock::iterator InsertPt(MI);
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock &MBB = *MI->getParent();
+  InsertPt++;
+  unsigned Out;
+
+  if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::QPRRegClass)) {
+    unsigned DSub0 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+                                         ARM::dsub_0, &ARM::DPRRegClass);
+    unsigned DSub1 = createExtractSubreg(MBB, InsertPt, DL, Reg,
+                                         ARM::dsub_1, &ARM::DPRRegClass);
+
+    unsigned Out1 = createDupLane(MBB, InsertPt, DL, DSub0, 0);
+    unsigned Out2 = createDupLane(MBB, InsertPt, DL, DSub0, 1);
+    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+    unsigned Out3 = createDupLane(MBB, InsertPt, DL, DSub1, 0);
+    unsigned Out4 = createDupLane(MBB, InsertPt, DL, DSub1, 1);
+    Out2 = createVExt(MBB, InsertPt, DL, Out3, Out4);
+
+    Out = createRegSequence(MBB, InsertPt, DL, Out, Out2);
+
+  } else if (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::DPRRegClass)) {
+    unsigned Out1 = createDupLane(MBB, InsertPt, DL, Reg, 0);
+    unsigned Out2 = createDupLane(MBB, InsertPt, DL, Reg, 1);
+    Out = createVExt(MBB, InsertPt, DL, Out1, Out2);
+
+  } else {
+    assert(MRI->getRegClass(Reg)->hasSuperClassEq(&ARM::SPRRegClass) &&
+           "Found unexpected regclass!");
+
+    unsigned PrefLane = getPrefSPRLane(Reg);
+    unsigned Lane;
+    switch (PrefLane) {
+      case ARM::ssub_0: Lane = 0; break;
+      case ARM::ssub_1: Lane = 1; break;
+      default: llvm_unreachable("Unknown preferred lane!");
+    }
+
+    bool UsesQPR = usesRegClass(MI->getOperand(0), &ARM::QPRRegClass);
+
+    Out = createImplicitDef(MBB, InsertPt, DL);
+    Out = createInsertSubreg(MBB, InsertPt, DL, Out, PrefLane, Reg);
+    Out = createDupLane(MBB, InsertPt, DL, Out, Lane, UsesQPR);
+    eraseInstrWithNoUses(MI);
+  }
+  return Out;
+}
+
+bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
+  // We look for instructions that write S registers that are then read as
+  // D/Q registers. These can only be caused by COPY, INSERT_SUBREG and
+  // REG_SEQUENCE pseudos that insert an SPR value into a DPR register or
+  // merge two SPR values to form a DPR register.  In order avoid false
+  // positives we make sure that there is an SPR producer so we look past
+  // COPY and PHI nodes to find it.
+  //
+  // The best code pattern for when an SPR producer is going to be used by a
+  // DPR or QPR consumer depends on whether the other lanes of the
+  // corresponding DPR/QPR are currently defined.
+  //
+  // We can handle these efficiently, depending on the type of
+  // pseudo-instruction that is producing the pattern
+  //
+  //   * COPY:          * VDUP all lanes and merge the results together
+  //                      using VEXTs.
+  //
+  //   * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR
+  //                      lane, and the other lane(s) of the DPR/QPR register
+  //                      that we are inserting in are undefined, use the
+  //                      original DPR/QPR value. 
+  //                    * Otherwise, fall back on the same stategy as COPY.
+  //
+  //   * REG_SEQUENCE:  * If all except one of the input operands are
+  //                      IMPLICIT_DEFs, insert the VDUP pattern for just the
+  //                      defined input operand
+  //                    * Otherwise, fall back on the same stategy as COPY.
+  //
+
+  // First, get all the reads of D-registers done by this instruction.
+  SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
+  bool Modified = false;
+
+  for (SmallVector<unsigned, 8>::iterator I = Defs.begin(), E = Defs.end();
+     I != E; ++I) {
+    // Follow the def-use chain for this DPR through COPYs, and also through
+    // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
+    // we can end up with multiple defs of this DPR.
+
+    SmallVector<MachineInstr *, 8> DefSrcs;
+    if (!TRI->isVirtualRegister(*I))
+      continue;
+    MachineInstr *Def = MRI->getVRegDef(*I);
+    if (!Def)
+      continue;
+
+    elideCopiesAndPHIs(Def, DefSrcs);
+
+    for (SmallVector<MachineInstr*, 8>::iterator II = DefSrcs.begin(),
+      EE = DefSrcs.end(); II != EE; ++II) {
+      MachineInstr *MI = *II;
+
+      // If we've already analyzed and replaced this operand, don't do
+      // anything.
+      if (Replacements.find(MI) != Replacements.end())
+        continue;
+
+      // Now, work out if the instruction causes a SPR->DPR dependency.
+      if (!hasPartialWrite(MI))
+        continue;
+
+      // Collect all the uses of this MI's DPR def for updating later.
+      SmallVector<MachineOperand*, 8> Uses;
+      unsigned DPRDefReg = MI->getOperand(0).getReg();
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(DPRDefReg),
+             E = MRI->use_end(); I != E; ++I)
+        Uses.push_back(&I.getOperand());
+
+      // We can optimize this.
+      unsigned NewReg = optimizeSDPattern(MI);
+
+      if (NewReg != 0) {
+        Modified = true;
+        for (SmallVector<MachineOperand*, 8>::const_iterator I = Uses.begin(),
+               E = Uses.end(); I != E; ++I) {
+          DEBUG(dbgs() << "Replacing operand "
+                       << **I << " with "
+                       << PrintReg(NewReg) << "\n");
+          (*I)->substVirtReg(NewReg, 0, *TRI);
+        }
+      }
+      Replacements[MI] = NewReg;
+    }
+  }
+  return Modified;
+}
+
+bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
+  TRI = Fn.getTarget().getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+  bool Modified = false;
+
+  DEBUG(dbgs() << "Running on function " << Fn.getName()<< "\n");
+
+  DeadInstr.clear();
+  Replacements.clear();
+
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+
+    for (MachineBasicBlock::iterator MI = MFI->begin(), ME = MFI->end();
+      MI != ME;) {
+      Modified |= runOnInstruction(MI++);
+    }
+ 
+  }
+
+  for (std::set<MachineInstr *>::iterator I = DeadInstr.begin(),
+                                            E = DeadInstr.end();
+                                            I != E; ++I) {
+    (*I)->eraseFromParent();
+  }
+
+  return Modified;
+}
+
+FunctionPass *llvm::createA15SDOptimizerPass() {
+  return new A15SDOptimizer();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h
new file mode 100644
index 000000000000..80e5f37eb086
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARM.h
@@ -0,0 +1,56 @@
+//===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// ARM back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM_H
+#define TARGET_ARM_H
+
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class ARMAsmPrinter;
+class ARMBaseTargetMachine;
+class FunctionPass;
+class JITCodeEmitter;
+class MachineInstr;
+class MCInst;
+
+FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
+                               CodeGenOpt::Level OptLevel);
+
+FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
+                                          JITCodeEmitter &JCE);
+
+FunctionPass *createA15SDOptimizerPass();
+FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
+FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMGlobalBaseRegPass();
+FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
+FunctionPass *createARMConstantIslandPass();
+FunctionPass *createMLxExpansionPass();
+FunctionPass *createThumb2ITBlockPass();
+FunctionPass *createThumb2SizeReductionPass();
+
+/// \brief Creates an ARM-specific Target Transformation Info pass.
+ImmutablePass *createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM);
+
+void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                  ARMAsmPrinter &AP);
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td
new file mode 100644
index 000000000000..68380847a022
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARM.td
@@ -0,0 +1,320 @@
+//===-- ARM.td - Describe the ARM Target Machine -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget state.
+//
+
+def ModeThumb  : SubtargetFeature<"thumb-mode", "InThumbMode", "true",
+                                  "Thumb mode">;
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget features.
+//
+
+def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
+                                   "Enable VFP2 instructions">;
+def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
+                                   "Enable VFP3 instructions",
+                                   [FeatureVFP2]>;
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+                                   "Enable NEON instructions",
+                                   [FeatureVFP3]>;
+def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
+                                     "Enable Thumb2 instructions">;
+def FeatureNoARM  : SubtargetFeature<"noarm", "NoARM", "true",
+                                     "Does not support ARM mode execution">;
+def FeatureFP16   : SubtargetFeature<"fp16", "HasFP16", "true",
+                                     "Enable half-precision floating point">;
+def FeatureVFP4   : SubtargetFeature<"vfp4", "HasVFPv4", "true",
+                                     "Enable VFP4 instructions",
+                                     [FeatureVFP3, FeatureFP16]>;
+def FeatureD16    : SubtargetFeature<"d16", "HasD16", "true",
+                                     "Restrict VFP3 to 16 double registers">;
+def FeatureHWDiv  : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
+                                     "Enable divide instructions">;
+def FeatureHWDivARM  : SubtargetFeature<"hwdiv-arm",
+                                        "HasHardwareDivideInARM", "true",
+                                      "Enable divide instructions in ARM mode">;
+def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
+                                 "Enable Thumb2 extract and pack instructions">;
+def FeatureDB     : SubtargetFeature<"db", "HasDataBarrier", "true",
+                                   "Has data barrier (dmb / dsb) instructions">;
+def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
+                                         "FP compare + branch is slow">;
+def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
+                          "Floating point unit supports single precision only">;
+
+// Some processors have FP multiply-accumulate instructions that don't
+// play nicely with other VFP / NEON instructions, and it's generally better
+// to just not use them.
+def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
+                                         "Disable VFP / NEON MAC instructions">;
+
+// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
+def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
+                                       "HasVMLxForwarding", "true",
+                                       "Has multiplier accumulator forwarding">;
+
+// Some processors benefit from using NEON instructions for scalar
+// single-precision FP operations.
+def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
+                                        "true",
+                                        "Use NEON for single precision FP">;
+
+// Disable 32-bit to 16-bit narrowing for experimentation.
+def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
+                                             "Prefer 32-bit Thumb instrs">;
+
+/// Some instructions update CPSR partially, which can add false dependency for
+/// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
+/// mapped to a separate physical register. Avoid partial CPSR update for these
+/// processors.
+def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
+                                               "AvoidCPSRPartialUpdate", "true",
+                                 "Avoid CPSR partial update for OOO execution">;
+
+def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
+                                            "AvoidMOVsShifterOperand", "true",
+                                "Avoid movs instructions with shifter operand">;
+
+// Some processors perform return stack prediction. CodeGen should avoid issue
+// "normal" call instructions to callees which do not return.
+def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true",
+                                     "Has return address stack">;
+
+/// Some M architectures don't have the DSP extension (v7E-M vs. v7M)
+def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true",
+                                 "Supports v7 DSP instructions in Thumb2">;
+
+// Multiprocessing extension.
+def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
+                                 "Supports Multiprocessing extension">;
+
+// M-series ISA?
+def FeatureMClass : SubtargetFeature<"mclass", "IsMClass", "true",
+                                     "Is microcontroller profile ('M' series)">;
+
+// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
+// See ARMInstrInfo.td for details.
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
+                                       "NaCl trap">;
+
+// ARM ISAs.
+def HasV4TOps   : SubtargetFeature<"v4t", "HasV4TOps", "true",
+                                   "Support ARM v4T instructions">;
+def HasV5TOps   : SubtargetFeature<"v5t", "HasV5TOps", "true",
+                                   "Support ARM v5T instructions",
+                                   [HasV4TOps]>;
+def HasV5TEOps  : SubtargetFeature<"v5te", "HasV5TEOps", "true",
+                             "Support ARM v5TE, v5TEj, and v5TExp instructions",
+                                   [HasV5TOps]>;
+def HasV6Ops    : SubtargetFeature<"v6", "HasV6Ops", "true",
+                                   "Support ARM v6 instructions",
+                                   [HasV5TEOps]>;
+def HasV6T2Ops  : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
+                                   "Support ARM v6t2 instructions",
+                                   [HasV6Ops, FeatureThumb2]>;
+def HasV7Ops    : SubtargetFeature<"v7", "HasV7Ops", "true",
+                                   "Support ARM v7 instructions",
+                                   [HasV6T2Ops]>;
+
+//===----------------------------------------------------------------------===//
+// ARM Processors supported.
+//
+
+include "ARMSchedule.td"
+
+// ARM processor families.
+def ProcA5      : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5",
+                                   "Cortex-A5 ARM processors",
+                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
+                                    FeatureVMLxForwarding, FeatureT2XtPk]>;
+def ProcA8      : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
+                                   "Cortex-A8 ARM processors",
+                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
+                                    FeatureVMLxForwarding, FeatureT2XtPk]>;
+def ProcA9      : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
+                                   "Cortex-A9 ARM processors",
+                                   [FeatureVMLxForwarding,
+                                    FeatureT2XtPk, FeatureFP16,
+                                    FeatureAvoidPartialCPSR]>;
+def ProcSwift   : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
+                                   "Swift ARM processors",
+                                   [FeatureNEONForFP, FeatureT2XtPk,
+                                    FeatureVFP4, FeatureMP, FeatureHWDiv,
+                                    FeatureHWDivARM, FeatureAvoidPartialCPSR,
+                                    FeatureAvoidMOVsShOp,
+                                    FeatureHasSlowFPVMLx]>;
+
+// FIXME: It has not been determined if A15 has these features.
+def ProcA15      : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
+                                   "Cortex-A15 ARM processors",
+                                   [FeatureT2XtPk, FeatureFP16,
+                                    FeatureAvoidPartialCPSR]>;
+def ProcR5      : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
+                                   "Cortex-R5 ARM processors",
+                                   [FeatureSlowFPBrcc, FeatureHWDivARM,
+                                    FeatureHasSlowFPVMLx,
+                                    FeatureAvoidPartialCPSR,
+                                    FeatureT2XtPk]>;
+
+class ProcNoItin<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+// V4 Processors.
+def : ProcNoItin<"generic",         []>;
+def : ProcNoItin<"arm8",            []>;
+def : ProcNoItin<"arm810",          []>;
+def : ProcNoItin<"strongarm",       []>;
+def : ProcNoItin<"strongarm110",    []>;
+def : ProcNoItin<"strongarm1100",   []>;
+def : ProcNoItin<"strongarm1110",   []>;
+
+// V4T Processors.
+def : ProcNoItin<"arm7tdmi",        [HasV4TOps]>;
+def : ProcNoItin<"arm7tdmi-s",      [HasV4TOps]>;
+def : ProcNoItin<"arm710t",         [HasV4TOps]>;
+def : ProcNoItin<"arm720t",         [HasV4TOps]>;
+def : ProcNoItin<"arm9",            [HasV4TOps]>;
+def : ProcNoItin<"arm9tdmi",        [HasV4TOps]>;
+def : ProcNoItin<"arm920",          [HasV4TOps]>;
+def : ProcNoItin<"arm920t",         [HasV4TOps]>;
+def : ProcNoItin<"arm922t",         [HasV4TOps]>;
+def : ProcNoItin<"arm940t",         [HasV4TOps]>;
+def : ProcNoItin<"ep9312",          [HasV4TOps]>;
+
+// V5T Processors.
+def : ProcNoItin<"arm10tdmi",       [HasV5TOps]>;
+def : ProcNoItin<"arm1020t",        [HasV5TOps]>;
+
+// V5TE Processors.
+def : ProcNoItin<"arm9e",           [HasV5TEOps]>;
+def : ProcNoItin<"arm926ej-s",      [HasV5TEOps]>;
+def : ProcNoItin<"arm946e-s",       [HasV5TEOps]>;
+def : ProcNoItin<"arm966e-s",       [HasV5TEOps]>;
+def : ProcNoItin<"arm968e-s",       [HasV5TEOps]>;
+def : ProcNoItin<"arm10e",          [HasV5TEOps]>;
+def : ProcNoItin<"arm1020e",        [HasV5TEOps]>;
+def : ProcNoItin<"arm1022e",        [HasV5TEOps]>;
+def : ProcNoItin<"xscale",          [HasV5TEOps]>;
+def : ProcNoItin<"iwmmxt",          [HasV5TEOps]>;
+
+// V6 Processors.
+def : Processor<"arm1136j-s",       ARMV6Itineraries, [HasV6Ops]>;
+def : Processor<"arm1136jf-s",      ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx]>;
+def : Processor<"arm1176jz-s",      ARMV6Itineraries, [HasV6Ops]>;
+def : Processor<"arm1176jzf-s",     ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx]>;
+def : Processor<"mpcorenovfp",      ARMV6Itineraries, [HasV6Ops]>;
+def : Processor<"mpcore",           ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx]>;
+
+// V6M Processors.
+def : Processor<"cortex-m0",        ARMV6Itineraries, [HasV6Ops, FeatureNoARM,
+                                                       FeatureDB, FeatureMClass]>;
+
+// V6T2 Processors.
+def : Processor<"arm1156t2-s",      ARMV6Itineraries, [HasV6T2Ops,
+                                                       FeatureDSPThumb2]>;
+def : Processor<"arm1156t2f-s",     ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2,
+                                                       FeatureHasSlowFPVMLx,
+                                                       FeatureDSPThumb2]>;
+
+// V7a Processors.
+// FIXME: A5 has currently the same Schedule model as A8
+def : ProcessorModel<"cortex-a5",   CortexA8Model,
+                                    [ProcA5, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureVFP4, FeatureDSPThumb2,
+                                     FeatureHasRAS]>;
+def : ProcessorModel<"cortex-a8",   CortexA8Model,
+                                    [ProcA8, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureHasRAS]>;
+def : ProcessorModel<"cortex-a9",   CortexA9Model,
+                                    [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureHasRAS]>;
+def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
+                                    [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureMP,
+                                     FeatureHasRAS]>;
+// FIXME: A15 has currently the same ProcessorModel as A9.
+def : ProcessorModel<"cortex-a15",   CortexA9Model,
+                                    [ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
+                                     FeatureDSPThumb2, FeatureHasRAS]>;
+// FIXME: R5 has currently the same ProcessorModel as A8.
+def : ProcessorModel<"cortex-r5",   CortexA8Model,
+                                    [ProcR5, HasV7Ops, FeatureDB,
+                                     FeatureVFP3, FeatureDSPThumb2,
+                                     FeatureHasRAS]>;
+
+// V7M Processors.
+def : ProcNoItin<"cortex-m3",       [HasV7Ops,
+                                     FeatureThumb2, FeatureNoARM, FeatureDB,
+                                     FeatureHWDiv, FeatureMClass]>;
+
+// V7EM Processors.
+def : ProcNoItin<"cortex-m4",       [HasV7Ops,
+                                     FeatureThumb2, FeatureNoARM, FeatureDB,
+                                     FeatureHWDiv, FeatureDSPThumb2,
+                                     FeatureT2XtPk, FeatureVFP4,
+                                     FeatureVFPOnlySP, FeatureMClass]>;
+
+// Swift uArch Processors.
+def : ProcessorModel<"swift",       SwiftModel,
+                                    [ProcSwift, HasV7Ops, FeatureNEON,
+                                     FeatureDB, FeatureDSPThumb2,
+                                     FeatureHasRAS]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "ARMRegisterInfo.td"
+
+include "ARMCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrInfo.td"
+
+def ARMInstrInfo : InstrInfo;
+
+
+//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// ARM Uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def ARMAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def ARM : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = ARMInstrInfo;
+
+  let AssemblyWriters = [ARMAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
new file mode 100644
index 000000000000..13ec2087938a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -0,0 +1,1945 @@
+//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format ARM assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARMAsmPrinter.h"
+#include "ARM.h"
+#include "ARMBuildAttrs.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cctype>
+using namespace llvm;
+
+namespace {
+
+  // Per section and per symbol attributes are not supported.
+  // To implement them we would need the ability to delay this emission
+  // until the assembly file is fully parsed/generated as only then do we
+  // know the symbol and section numbers.
+  class AttributeEmitter {
+  public:
+    virtual void MaybeSwitchVendor(StringRef Vendor) = 0;
+    virtual void EmitAttribute(unsigned Attribute, unsigned Value) = 0;
+    virtual void EmitTextAttribute(unsigned Attribute, StringRef String) = 0;
+    virtual void Finish() = 0;
+    virtual ~AttributeEmitter() {}
+  };
+
+  class AsmAttributeEmitter : public AttributeEmitter {
+    MCStreamer &Streamer;
+
+  public:
+    AsmAttributeEmitter(MCStreamer &Streamer_) : Streamer(Streamer_) {}
+    void MaybeSwitchVendor(StringRef Vendor) { }
+
+    void EmitAttribute(unsigned Attribute, unsigned Value) {
+      Streamer.EmitRawText("\t.eabi_attribute " +
+                           Twine(Attribute) + ", " + Twine(Value));
+    }
+
+    void EmitTextAttribute(unsigned Attribute, StringRef String) {
+      switch (Attribute) {
+      default: llvm_unreachable("Unsupported Text attribute in ASM Mode");
+      case ARMBuildAttrs::CPU_name:
+        Streamer.EmitRawText(StringRef("\t.cpu ") + String.lower());
+        break;
+      /* GAS requires .fpu to be emitted regardless of EABI attribute */
+      case ARMBuildAttrs::Advanced_SIMD_arch:
+      case ARMBuildAttrs::VFP_arch:
+        Streamer.EmitRawText(StringRef("\t.fpu ") + String.lower());
+        break;
+      }
+    }
+    void Finish() { }
+  };
+
+  class ObjectAttributeEmitter : public AttributeEmitter {
+    // This structure holds all attributes, accounting for
+    // their string/numeric value, so we can later emmit them
+    // in declaration order, keeping all in the same vector
+    struct AttributeItemType {
+      enum {
+        HiddenAttribute = 0,
+        NumericAttribute,
+        TextAttribute
+      } Type;
+      unsigned Tag;
+      unsigned IntValue;
+      StringRef StringValue;
+    } AttributeItem;
+
+    MCObjectStreamer &Streamer;
+    StringRef CurrentVendor;
+    SmallVector<AttributeItemType, 64> Contents;
+
+    // Account for the ULEB/String size of each item,
+    // not just the number of items
+    size_t ContentsSize;
+    // FIXME: this should be in a more generic place, but
+    // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
+    size_t getULEBSize(int Value) {
+      size_t Size = 0;
+      do {
+        Value >>= 7;
+        Size += sizeof(int8_t); // Is this really necessary?
+      } while (Value);
+      return Size;
+    }
+
+  public:
+    ObjectAttributeEmitter(MCObjectStreamer &Streamer_) :
+      Streamer(Streamer_), CurrentVendor(""), ContentsSize(0) { }
+
+    void MaybeSwitchVendor(StringRef Vendor) {
+      assert(!Vendor.empty() && "Vendor cannot be empty.");
+
+      if (CurrentVendor.empty())
+        CurrentVendor = Vendor;
+      else if (CurrentVendor == Vendor)
+        return;
+      else
+        Finish();
+
+      CurrentVendor = Vendor;
+
+      assert(Contents.size() == 0);
+    }
+
+    void EmitAttribute(unsigned Attribute, unsigned Value) {
+      AttributeItemType attr = {
+        AttributeItemType::NumericAttribute,
+        Attribute,
+        Value,
+        StringRef("")
+      };
+      ContentsSize += getULEBSize(Attribute);
+      ContentsSize += getULEBSize(Value);
+      Contents.push_back(attr);
+    }
+
+    void EmitTextAttribute(unsigned Attribute, StringRef String) {
+      AttributeItemType attr = {
+        AttributeItemType::TextAttribute,
+        Attribute,
+        0,
+        String
+      };
+      ContentsSize += getULEBSize(Attribute);
+      // String + \0
+      ContentsSize += String.size()+1;
+
+      Contents.push_back(attr);
+    }
+
+    void Finish() {
+      // Vendor size + Vendor name + '\0'
+      const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
+
+      // Tag + Tag Size
+      const size_t TagHeaderSize = 1 + 4;
+
+      Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
+      Streamer.EmitBytes(CurrentVendor);
+      Streamer.EmitIntValue(0, 1); // '\0'
+
+      Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
+      Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+
+      // Size should have been accounted for already, now
+      // emit each field as its type (ULEB or String)
+      for (unsigned int i=0; i<Contents.size(); ++i) {
+        AttributeItemType item = Contents[i];
+        Streamer.EmitULEB128IntValue(item.Tag);
+        switch (item.Type) {
+        default: llvm_unreachable("Invalid attribute type");
+        case AttributeItemType::NumericAttribute:
+          Streamer.EmitULEB128IntValue(item.IntValue);
+          break;
+        case AttributeItemType::TextAttribute:
+          Streamer.EmitBytes(item.StringValue.upper());
+          Streamer.EmitIntValue(0, 1); // '\0'
+          break;
+        }
+      }
+
+      Contents.clear();
+    }
+  };
+
+} // end of anonymous namespace
+
+MachineLocation ARMAsmPrinter::
+getDebugValueLocation(const MachineInstr *MI) const {
+  MachineLocation Location;
+  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+  // Frame address.  Currently handles register +- offset only.
+  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
+    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
+  else {
+    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+  }
+  return Location;
+}
+
+/// EmitDwarfRegOp - Emit dwarf register operation.
+void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
+  const TargetRegisterInfo *RI = TM.getRegisterInfo();
+  if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1)
+    AsmPrinter::EmitDwarfRegOp(MLoc);
+  else {
+    unsigned Reg = MLoc.getReg();
+    if (Reg >= ARM::S0 && Reg <= ARM::S31) {
+      assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering");
+      // S registers are described as bit-pieces of a register
+      // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
+      // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
+
+      unsigned SReg = Reg - ARM::S0;
+      bool odd = SReg & 0x1;
+      unsigned Rx = 256 + (SReg >> 1);
+
+      OutStreamer.AddComment("DW_OP_regx for S register");
+      EmitInt8(dwarf::DW_OP_regx);
+
+      OutStreamer.AddComment(Twine(SReg));
+      EmitULEB128(Rx);
+
+      if (odd) {
+        OutStreamer.AddComment("DW_OP_bit_piece 32 32");
+        EmitInt8(dwarf::DW_OP_bit_piece);
+        EmitULEB128(32);
+        EmitULEB128(32);
+      } else {
+        OutStreamer.AddComment("DW_OP_bit_piece 32 0");
+        EmitInt8(dwarf::DW_OP_bit_piece);
+        EmitULEB128(32);
+        EmitULEB128(0);
+      }
+    } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
+      assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
+      // Q registers Q0-Q15 are described by composing two D registers together.
+      // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1)
+      // DW_OP_piece(8)
+
+      unsigned QReg = Reg - ARM::Q0;
+      unsigned D1 = 256 + 2 * QReg;
+      unsigned D2 = D1 + 1;
+
+      OutStreamer.AddComment("DW_OP_regx for Q register: D1");
+      EmitInt8(dwarf::DW_OP_regx);
+      EmitULEB128(D1);
+      OutStreamer.AddComment("DW_OP_piece 8");
+      EmitInt8(dwarf::DW_OP_piece);
+      EmitULEB128(8);
+
+      OutStreamer.AddComment("DW_OP_regx for Q register: D2");
+      EmitInt8(dwarf::DW_OP_regx);
+      EmitULEB128(D2);
+      OutStreamer.AddComment("DW_OP_piece 8");
+      EmitInt8(dwarf::DW_OP_piece);
+      EmitULEB128(8);
+    }
+  }
+}
+
+void ARMAsmPrinter::EmitFunctionBodyEnd() {
+  // Make sure to terminate any constant pools that were at the end
+  // of the function.
+  if (!InConstantPool)
+    return;
+  InConstantPool = false;
+  OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+void ARMAsmPrinter::EmitFunctionEntryLabel() {
+  if (AFI->isThumbFunction()) {
+    OutStreamer.EmitAssemblerFlag(MCAF_Code16);
+    OutStreamer.EmitThumbFunc(CurrentFnSym);
+  }
+
+  OutStreamer.EmitLabel(CurrentFnSym);
+}
+
+void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
+  uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
+  assert(Size && "C++ constructor pointer had zero size!");
+
+  const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
+  assert(GV && "C++ constructor pointer was not a GlobalValue!");
+
+  const MCExpr *E = MCSymbolRefExpr::Create(Mang->getSymbol(GV),
+                                            (Subtarget->isTargetDarwin()
+                                             ? MCSymbolRefExpr::VK_None
+                                             : MCSymbolRefExpr::VK_ARM_TARGET1),
+                                            OutContext);
+  
+  OutStreamer.EmitValue(E, Size);
+}
+
+/// runOnMachineFunction - This uses the EmitInstruction()
+/// method to print assembly for each instruction.
+///
+bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  AFI = MF.getInfo<ARMFunctionInfo>();
+  MCP = MF.getConstantPool();
+
+  return AsmPrinter::runOnMachineFunction(MF);
+}
+
+void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                 raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  unsigned TF = MO.getTargetFlags();
+
+  switch (MO.getType()) {
+  default: llvm_unreachable("<unknown operand type>");
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    if(ARM::GPRPairRegClass.contains(Reg)) {
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+      Reg = TRI->getSubReg(Reg, ARM::gsub_0);
+    }
+    O << ARMInstPrinter::getRegisterName(Reg);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    int64_t Imm = MO.getImm();
+    O << '#';
+    if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
+        (TF == ARMII::MO_LO16))
+      O << ":lower16:";
+    else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
+             (TF == ARMII::MO_HI16))
+      O << ":upper16:";
+    O << Imm;
+    break;
+  }
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+    if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
+        (TF & ARMII::MO_LO16))
+      O << ":lower16:";
+    else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
+             (TF & ARMII::MO_HI16))
+      O << ":upper16:";
+    O << *Mang->getSymbol(GV);
+
+    printOffset(MO.getOffset(), O);
+    if (TF == ARMII::MO_PLT)
+      O << "(PLT)";
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    if (TF == ARMII::MO_PLT)
+      O << "(PLT)";
+    break;
+  }
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << *GetCPISymbol(MO.getIndex());
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    O << *GetJTISymbol(MO.getIndex());
+    break;
+  }
+}
+
+//===--------------------------------------------------------------------===//
+
+MCSymbol *ARMAsmPrinter::
+GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const {
+  SmallString<60> Name;
+  raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI"
+    << getFunctionNumber() << '_' << uid << '_' << uid2;
+  return OutContext.GetOrCreateSymbol(Name.str());
+}
+
+
+MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel() const {
+  SmallString<60> Name;
+  raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "SJLJEH"
+    << getFunctionNumber();
+  return OutContext.GetOrCreateSymbol(Name.str());
+}
+
+bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                    unsigned AsmVariant, const char *ExtraCode,
+                                    raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+    case 'a': // Print as a memory address.
+      if (MI->getOperand(OpNum).isReg()) {
+        O << "["
+          << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg())
+          << "]";
+        return false;
+      }
+      // Fallthrough
+    case 'c': // Don't print "#" before an immediate operand.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << MI->getOperand(OpNum).getImm();
+      return false;
+    case 'P': // Print a VFP double precision register.
+    case 'q': // Print a NEON quad precision register.
+      printOperand(MI, OpNum, O);
+      return false;
+    case 'y': // Print a VFP single precision register as indexed double.
+      if (MI->getOperand(OpNum).isReg()) {
+        unsigned Reg = MI->getOperand(OpNum).getReg();
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        // Find the 'd' register that has this 's' register as a sub-register,
+        // and determine the lane number.
+        for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) {
+          if (!ARM::DPRRegClass.contains(*SR))
+            continue;
+          bool Lane0 = TRI->getSubReg(*SR, ARM::ssub_0) == Reg;
+          O << ARMInstPrinter::getRegisterName(*SR) << (Lane0 ? "[0]" : "[1]");
+          return false;
+        }
+      }
+      return true;
+    case 'B': // Bitwise inverse of integer or symbol without a preceding #.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << ~(MI->getOperand(OpNum).getImm());
+      return false;
+    case 'L': // The low 16 bits of an immediate constant.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      O << (MI->getOperand(OpNum).getImm() & 0xffff);
+      return false;
+    case 'M': { // A register range suitable for LDM/STM.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      unsigned RegBegin = MO.getReg();
+      // This takes advantage of the 2 operand-ness of ldm/stm and that we've
+      // already got the operands in registers that are operands to the
+      // inline asm statement.
+
+      O << "{" << ARMInstPrinter::getRegisterName(RegBegin);
+
+      // FIXME: The register allocator not only may not have given us the
+      // registers in sequence, but may not be in ascending registers. This
+      // will require changes in the register allocator that'll need to be
+      // propagated down here if the operands change.
+      unsigned RegOps = OpNum + 1;
+      while (MI->getOperand(RegOps).isReg()) {
+        O << ", "
+          << ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg());
+        RegOps++;
+      }
+
+      O << "}";
+
+      return false;
+    }
+    case 'R': // The most significant register of a pair.
+    case 'Q': { // The least significant register of a pair.
+      if (OpNum == 0)
+        return true;
+      const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+      if (!FlagsOP.isImm())
+        return true;
+      unsigned Flags = FlagsOP.getImm();
+      unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      if (NumVals != 2)
+        return true;
+      unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1;
+      if (RegOp >= MI->getNumOperands())
+        return true;
+      const MachineOperand &MO = MI->getOperand(RegOp);
+      if (!MO.isReg())
+        return true;
+      unsigned Reg = MO.getReg();
+      O << ARMInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+
+    case 'e': // The low doubleword register of a NEON quad register.
+    case 'f': { // The high doubleword register of a NEON quad register.
+      if (!MI->getOperand(OpNum).isReg())
+        return true;
+      unsigned Reg = MI->getOperand(OpNum).getReg();
+      if (!ARM::QPRRegClass.contains(Reg))
+        return true;
+      const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+      unsigned SubReg = TRI->getSubReg(Reg, ExtraCode[0] == 'e' ?
+                                       ARM::dsub_0 : ARM::dsub_1);
+      O << ARMInstPrinter::getRegisterName(SubReg);
+      return false;
+    }
+
+    // This modifier is not yet supported.
+    case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
+      return true;
+    case 'H': { // The highest-numbered register of a pair.
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      if (!MO.isReg())
+        return true;
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+      unsigned Reg = MO.getReg();
+      if(!ARM::GPRPairRegClass.contains(Reg))
+        return false;
+      Reg = TRI->getSubReg(Reg, ARM::gsub_1);
+      O << ARMInstPrinter::getRegisterName(Reg);
+      return false;
+    }
+    }
+  }
+
+  printOperand(MI, OpNum, O);
+  return false;
+}
+
+bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNum, unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+      case 'A': // A memory operand for a VLD1/VST1 instruction.
+      default: return true;  // Unknown modifier.
+      case 'm': // The base register of a memory operand.
+        if (!MI->getOperand(OpNum).isReg())
+          return true;
+        O << ARMInstPrinter::getRegisterName(MI->getOperand(OpNum).getReg());
+        return false;
+    }
+  }
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]";
+  return false;
+}
+
+void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (Subtarget->isTargetDarwin()) {
+    Reloc::Model RelocM = TM.getRelocationModel();
+    if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) {
+      // Declare all the text sections up front (before the DWARF sections
+      // emitted by AsmPrinter::doInitialization) so the assembler will keep
+      // them together at the beginning of the object file.  This helps
+      // avoid out-of-range branches that are due a fundamental limitation of
+      // the way symbol offsets are encoded with the current Darwin ARM
+      // relocations.
+      const TargetLoweringObjectFileMachO &TLOFMacho =
+        static_cast<const TargetLoweringObjectFileMachO &>(
+          getObjFileLowering());
+
+      // Collect the set of sections our functions will go into.
+      SetVector<const MCSection *, SmallVector<const MCSection *, 8>,
+        SmallPtrSet<const MCSection *, 8> > TextSections;
+      // Default text section comes first.
+      TextSections.insert(TLOFMacho.getTextSection());
+      // Now any user defined text sections from function attributes.
+      for (Module::iterator F = M.begin(), e = M.end(); F != e; ++F)
+        if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage())
+          TextSections.insert(TLOFMacho.SectionForGlobal(F, Mang, TM));
+      // Now the coalescable sections.
+      TextSections.insert(TLOFMacho.getTextCoalSection());
+      TextSections.insert(TLOFMacho.getConstTextCoalSection());
+
+      // Emit the sections in the .s file header to fix the order.
+      for (unsigned i = 0, e = TextSections.size(); i != e; ++i)
+        OutStreamer.SwitchSection(TextSections[i]);
+
+      if (RelocM == Reloc::DynamicNoPIC) {
+        const MCSection *sect =
+          OutContext.getMachOSection("__TEXT", "__symbol_stub4",
+                                     MCSectionMachO::S_SYMBOL_STUBS,
+                                     12, SectionKind::getText());
+        OutStreamer.SwitchSection(sect);
+      } else {
+        const MCSection *sect =
+          OutContext.getMachOSection("__TEXT", "__picsymbolstub4",
+                                     MCSectionMachO::S_SYMBOL_STUBS,
+                                     16, SectionKind::getText());
+        OutStreamer.SwitchSection(sect);
+      }
+      const MCSection *StaticInitSect =
+        OutContext.getMachOSection("__TEXT", "__StaticInit",
+                                   MCSectionMachO::S_REGULAR |
+                                   MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                                   SectionKind::getText());
+      OutStreamer.SwitchSection(StaticInitSect);
+    }
+  }
+
+  // Use unified assembler syntax.
+  OutStreamer.EmitAssemblerFlag(MCAF_SyntaxUnified);
+
+  // Emit ARM Build Attributes
+  if (Subtarget->isTargetELF())
+    emitAttributes();
+}
+
+
+void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetDarwin()) {
+    // All darwin targets use mach-o.
+    const TargetLoweringObjectFileMachO &TLOFMacho =
+      static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+    MachineModuleInfoMachO &MMIMacho =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    // Output non-lazy-pointers for external and common global variables.
+    MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList();
+
+    if (!Stubs.empty()) {
+      // Switch with ".non_lazy_symbol_pointer" directive.
+      OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+      EmitAlignment(2);
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        // L_foo$stub:
+        OutStreamer.EmitLabel(Stubs[i].first);
+        //   .indirect_symbol _foo
+        MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
+        OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),MCSA_IndirectSymbol);
+
+        if (MCSym.getInt())
+          // External to current translation unit.
+          OutStreamer.EmitIntValue(0, 4/*size*/);
+        else
+          // Internal to current translation unit.
+          //
+          // When we place the LSDA into the TEXT section, the type info
+          // pointers need to be indirect and pc-rel. We accomplish this by
+          // using NLPs; however, sometimes the types are local to the file.
+          // We need to fill in the value for the NLP in those cases.
+          OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
+                                                        OutContext),
+                                4/*size*/);
+      }
+
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    Stubs = MMIMacho.GetHiddenGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+      EmitAlignment(2);
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        // L_foo$stub:
+        OutStreamer.EmitLabel(Stubs[i].first);
+        //   .long _foo
+        OutStreamer.EmitValue(MCSymbolRefExpr::
+                              Create(Stubs[i].second.getPointer(),
+                                     OutContext),
+                              4/*size*/);
+      }
+
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+  }
+  // FIXME: This should eventually end up somewhere else where more
+  // intelligent flag decisions can be made. For now we are just maintaining
+  // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
+  if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&OutStreamer))
+    MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+// FIXME:
+// The following seem like one-off assembler flags, but they actually need
+// to appear in the .ARM.attributes section in ELF.
+// Instead of subclassing the MCELFStreamer, we do the work here.
+
+void ARMAsmPrinter::emitAttributes() {
+
+  emitARMAttributeSection();
+
+  /* GAS expect .fpu to be emitted, regardless of VFP build attribute */
+  bool emitFPU = false;
+  AttributeEmitter *AttrEmitter;
+  if (OutStreamer.hasRawTextSupport()) {
+    AttrEmitter = new AsmAttributeEmitter(OutStreamer);
+    emitFPU = true;
+  } else {
+    MCObjectStreamer &O = static_cast<MCObjectStreamer&>(OutStreamer);
+    AttrEmitter = new ObjectAttributeEmitter(O);
+  }
+
+  AttrEmitter->MaybeSwitchVendor("aeabi");
+
+  std::string CPUString = Subtarget->getCPUString();
+
+  if (CPUString == "cortex-a8" ||
+      Subtarget->isCortexA8()) {
+    AttrEmitter->EmitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a8");
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                               ARMBuildAttrs::ApplicationProfile);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
+                               ARMBuildAttrs::Allowed);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                               ARMBuildAttrs::AllowThumb32);
+    // Fixme: figure out when this is emitted.
+    //AttrEmitter->EmitAttribute(ARMBuildAttrs::WMMX_arch,
+    //                           ARMBuildAttrs::AllowWMMXv1);
+    //
+
+    /// ADD additional Else-cases here!
+  } else if (CPUString == "xscale") {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TEJ);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
+                               ARMBuildAttrs::Allowed);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                               ARMBuildAttrs::Allowed);
+  } else if (CPUString == "generic") {
+    // For a generic CPU, we assume a standard v7a architecture in Subtarget.
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                               ARMBuildAttrs::ApplicationProfile);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
+                               ARMBuildAttrs::Allowed);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                               ARMBuildAttrs::AllowThumb32);
+  } else if (Subtarget->hasV7Ops()) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                               ARMBuildAttrs::AllowThumb32);
+  } else if (Subtarget->hasV6T2Ops())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6T2);
+  else if (Subtarget->hasV6Ops())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6);
+  else if (Subtarget->hasV5TEOps())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TE);
+  else if (Subtarget->hasV5TOps())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T);
+  else if (Subtarget->hasV4TOps())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+
+  if (Subtarget->hasNEON() && emitFPU) {
+    /* NEON is not exactly a VFP architecture, but GAS emit one of
+     * neon/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
+    if (Subtarget->hasVFP4())
+      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+                                     "neon-vfpv4");
+    else
+      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon");
+    /* If emitted for NEON, omit from VFP below, since you can have both
+     * NEON and VFP in build attributes but only one .fpu */
+    emitFPU = false;
+  }
+
+  /* VFPv4 + .fpu */
+  if (Subtarget->hasVFP4()) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
+                               ARMBuildAttrs::AllowFPv4A);
+    if (emitFPU)
+      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv4");
+
+  /* VFPv3 + .fpu */
+  } else if (Subtarget->hasVFP3()) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
+                               ARMBuildAttrs::AllowFPv3A);
+    if (emitFPU)
+      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv3");
+
+  /* VFPv2 + .fpu */
+  } else if (Subtarget->hasVFP2()) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
+                               ARMBuildAttrs::AllowFPv2);
+    if (emitFPU)
+      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv2");
+  }
+
+  /* TODO: ARMBuildAttrs::Allowed is not completely accurate,
+   * since NEON can have 1 (allowed) or 2 (MAC operations) */
+  if (Subtarget->hasNEON()) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+                               ARMBuildAttrs::Allowed);
+  }
+
+  // Signal various FP modes.
+  if (!TM.Options.UnsafeFPMath) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_denormal,
+                               ARMBuildAttrs::Allowed);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
+                               ARMBuildAttrs::Allowed);
+  }
+
+  if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath)
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                               ARMBuildAttrs::Allowed);
+  else
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                               ARMBuildAttrs::AllowIEE754);
+
+  // FIXME: add more flags to ARMBuildAttrs.h
+  // 8-bytes alignment stuff.
+  AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_needed, 1);
+  AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
+
+  // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
+  if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_HardFP_use, 3);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_VFP_args, 1);
+  }
+  // FIXME: Should we signal R9 usage?
+
+  if (Subtarget->hasDivide())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::DIV_use, 1);
+
+  AttrEmitter->Finish();
+  delete AttrEmitter;
+}
+
+void ARMAsmPrinter::emitARMAttributeSection() {
+  // <format-version>
+  // [ <section-length> "vendor-name"
+  // [ <file-tag> <size> <attribute>*
+  //   | <section-tag> <size> <section-number>* 0 <attribute>*
+  //   | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
+  //   ]+
+  // ]*
+
+  if (OutStreamer.hasRawTextSupport())
+    return;
+
+  const ARMElfTargetObjectFile &TLOFELF =
+    static_cast<const ARMElfTargetObjectFile &>
+    (getObjFileLowering());
+
+  OutStreamer.SwitchSection(TLOFELF.getAttributesSection());
+
+  // Format version
+  OutStreamer.EmitIntValue(0x41, 1);
+}
+
+//===----------------------------------------------------------------------===//
+
+static MCSymbol *getPICLabel(const char *Prefix, unsigned FunctionNumber,
+                             unsigned LabelId, MCContext &Ctx) {
+
+  MCSymbol *Label = Ctx.GetOrCreateSymbol(Twine(Prefix)
+                       + "PC" + Twine(FunctionNumber) + "_" + Twine(LabelId));
+  return Label;
+}
+
+static MCSymbolRefExpr::VariantKind
+getModifierVariantKind(ARMCP::ARMCPModifier Modifier) {
+  switch (Modifier) {
+  case ARMCP::no_modifier: return MCSymbolRefExpr::VK_None;
+  case ARMCP::TLSGD:       return MCSymbolRefExpr::VK_ARM_TLSGD;
+  case ARMCP::TPOFF:       return MCSymbolRefExpr::VK_ARM_TPOFF;
+  case ARMCP::GOTTPOFF:    return MCSymbolRefExpr::VK_ARM_GOTTPOFF;
+  case ARMCP::GOT:         return MCSymbolRefExpr::VK_ARM_GOT;
+  case ARMCP::GOTOFF:      return MCSymbolRefExpr::VK_ARM_GOTOFF;
+  }
+  llvm_unreachable("Invalid ARMCPModifier!");
+}
+
+MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
+  bool isIndirect = Subtarget->isTargetDarwin() &&
+    Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+  if (!isIndirect)
+    return Mang->getSymbol(GV);
+
+  // FIXME: Remove this when Darwin transition to @GOT like syntax.
+  MCSymbol *MCSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+  MachineModuleInfoMachO &MMIMachO =
+    MMI->getObjFileInfo<MachineModuleInfoMachO>();
+  MachineModuleInfoImpl::StubValueTy &StubSym =
+    GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) :
+    MMIMachO.getGVStubEntry(MCSym);
+  if (StubSym.getPointer() == 0)
+    StubSym = MachineModuleInfoImpl::
+      StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+  return MCSym;
+}
+
+void ARMAsmPrinter::
+EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+  int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType());
+
+  ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
+
+  MCSymbol *MCSym;
+  if (ACPV->isLSDA()) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+    OS << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber();
+    MCSym = OutContext.GetOrCreateSymbol(OS.str());
+  } else if (ACPV->isBlockAddress()) {
+    const BlockAddress *BA =
+      cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress();
+    MCSym = GetBlockAddressSymbol(BA);
+  } else if (ACPV->isGlobalValue()) {
+    const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
+    MCSym = GetARMGVSymbol(GV);
+  } else if (ACPV->isMachineBasicBlock()) {
+    const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB();
+    MCSym = MBB->getSymbol();
+  } else {
+    assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
+    const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+    MCSym = GetExternalSymbolSymbol(Sym);
+  }
+
+  // Create an MCSymbol for the reference.
+  const MCExpr *Expr =
+    MCSymbolRefExpr::Create(MCSym, getModifierVariantKind(ACPV->getModifier()),
+                            OutContext);
+
+  if (ACPV->getPCAdjustment()) {
+    MCSymbol *PCLabel = getPICLabel(MAI->getPrivateGlobalPrefix(),
+                                    getFunctionNumber(),
+                                    ACPV->getLabelId(),
+                                    OutContext);
+    const MCExpr *PCRelExpr = MCSymbolRefExpr::Create(PCLabel, OutContext);
+    PCRelExpr =
+      MCBinaryExpr::CreateAdd(PCRelExpr,
+                              MCConstantExpr::Create(ACPV->getPCAdjustment(),
+                                                     OutContext),
+                              OutContext);
+    if (ACPV->mustAddCurrentAddress()) {
+      // We want "(<expr> - .)", but MC doesn't have a concept of the '.'
+      // label, so just emit a local label end reference that instead.
+      MCSymbol *DotSym = OutContext.CreateTempSymbol();
+      OutStreamer.EmitLabel(DotSym);
+      const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+      PCRelExpr = MCBinaryExpr::CreateSub(PCRelExpr, DotExpr, OutContext);
+    }
+    Expr = MCBinaryExpr::CreateSub(Expr, PCRelExpr, OutContext);
+  }
+  OutStreamer.EmitValue(Expr, Size);
+}
+
+void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
+  unsigned Opcode = MI->getOpcode();
+  int OpNum = 1;
+  if (Opcode == ARM::BR_JTadd)
+    OpNum = 2;
+  else if (Opcode == ARM::BR_JTm)
+    OpNum = 3;
+
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+  unsigned JTI = MO1.getIndex();
+
+  // Emit a label for the jump table.
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
+  OutStreamer.EmitLabel(JTISymbol);
+
+  // Mark the jump table as data-in-code.
+  OutStreamer.EmitDataRegion(MCDR_DataRegionJT32);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    // Construct an MCExpr for the entry. We want a value of the form:
+    // (BasicBlockAddr - TableBeginAddr)
+    //
+    // For example, a table with entries jumping to basic blocks BB0 and BB1
+    // would look like:
+    // LJTI_0_0:
+    //    .word (LBB0 - LJTI_0_0)
+    //    .word (LBB1 - LJTI_0_0)
+    const MCExpr *Expr = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
+
+    if (TM.getRelocationModel() == Reloc::PIC_)
+      Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(JTISymbol,
+                                                                   OutContext),
+                                     OutContext);
+    // If we're generating a table of Thumb addresses in static relocation
+    // model, we need to add one to keep interworking correctly.
+    else if (AFI->isThumbFunction())
+      Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(1,OutContext),
+                                     OutContext);
+    OutStreamer.EmitValue(Expr, 4);
+  }
+  // Mark the end of jump table data-in-code region.
+  OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
+  unsigned Opcode = MI->getOpcode();
+  int OpNum = (Opcode == ARM::t2BR_JT) ? 2 : 1;
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+  unsigned JTI = MO1.getIndex();
+
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
+  OutStreamer.EmitLabel(JTISymbol);
+
+  // Emit each entry of the table.
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+  unsigned OffsetWidth = 4;
+  if (MI->getOpcode() == ARM::t2TBB_JT) {
+    OffsetWidth = 1;
+    // Mark the jump table as data-in-code.
+    OutStreamer.EmitDataRegion(MCDR_DataRegionJT8);
+  } else if (MI->getOpcode() == ARM::t2TBH_JT) {
+    OffsetWidth = 2;
+    // Mark the jump table as data-in-code.
+    OutStreamer.EmitDataRegion(MCDR_DataRegionJT16);
+  }
+
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    const MCExpr *MBBSymbolExpr = MCSymbolRefExpr::Create(MBB->getSymbol(),
+                                                      OutContext);
+    // If this isn't a TBB or TBH, the entries are direct branch instructions.
+    if (OffsetWidth == 4) {
+      OutStreamer.EmitInstruction(MCInstBuilder(ARM::t2B)
+        .addExpr(MBBSymbolExpr)
+        .addImm(ARMCC::AL)
+        .addReg(0));
+      continue;
+    }
+    // Otherwise it's an offset from the dispatch instruction. Construct an
+    // MCExpr for the entry. We want a value of the form:
+    // (BasicBlockAddr - TableBeginAddr) / 2
+    //
+    // For example, a TBB table with entries jumping to basic blocks BB0 and BB1
+    // would look like:
+    // LJTI_0_0:
+    //    .byte (LBB0 - LJTI_0_0) / 2
+    //    .byte (LBB1 - LJTI_0_0) / 2
+    const MCExpr *Expr =
+      MCBinaryExpr::CreateSub(MBBSymbolExpr,
+                              MCSymbolRefExpr::Create(JTISymbol, OutContext),
+                              OutContext);
+    Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(2, OutContext),
+                                   OutContext);
+    OutStreamer.EmitValue(Expr, OffsetWidth);
+  }
+  // Mark the end of jump table data-in-code region. 32-bit offsets use
+  // actual branch instructions here, so we don't mark those as a data-region
+  // at all.
+  if (OffsetWidth != 4)
+    OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                           raw_ostream &OS) {
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps==4);
+  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+  OS << V.getName();
+  OS << " <- ";
+  // Frame address.  Currently handles register +- offset only.
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+  OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS);
+  OS << ']';
+  OS << "+";
+  printOperand(MI, NOps-2, OS);
+}
+
+void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
+  assert(MI->getFlag(MachineInstr::FrameSetup) &&
+      "Only instruction which are involved into frame setup code are allowed");
+
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
+
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned Opc = MI->getOpcode();
+  unsigned SrcReg, DstReg;
+
+  if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) {
+    // Two special cases:
+    // 1) tPUSH does not have src/dst regs.
+    // 2) for Thumb1 code we sometimes materialize the constant via constpool
+    // load. Yes, this is pretty fragile, but for now I don't see better
+    // way... :(
+    SrcReg = DstReg = ARM::SP;
+  } else {
+    SrcReg = MI->getOperand(1).getReg();
+    DstReg = MI->getOperand(0).getReg();
+  }
+
+  // Try to figure out the unwinding opcode out of src / dst regs.
+  if (MI->mayStore()) {
+    // Register saves.
+    assert(DstReg == ARM::SP &&
+           "Only stack pointer as a destination reg is supported");
+
+    SmallVector<unsigned, 4> RegList;
+    // Skip src & dst reg, and pred ops.
+    unsigned StartOp = 2 + 2;
+    // Use all the operands.
+    unsigned NumOffset = 0;
+
+    switch (Opc) {
+    default:
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    case ARM::tPUSH:
+      // Special case here: no src & dst reg, but two extra imp ops.
+      StartOp = 2; NumOffset = 2;
+    case ARM::STMDB_UPD:
+    case ARM::t2STMDB_UPD:
+    case ARM::VSTMDDB_UPD:
+      assert(SrcReg == ARM::SP &&
+             "Only stack pointer as a source reg is supported");
+      for (unsigned i = StartOp, NumOps = MI->getNumOperands() - NumOffset;
+           i != NumOps; ++i) {
+        const MachineOperand &MO = MI->getOperand(i);
+        // Actually, there should never be any impdef stuff here. Skip it
+        // temporary to workaround PR11902.
+        if (MO.isImplicit())
+          continue;
+        RegList.push_back(MO.getReg());
+      }
+      break;
+    case ARM::STR_PRE_IMM:
+    case ARM::STR_PRE_REG:
+    case ARM::t2STR_PRE:
+      assert(MI->getOperand(2).getReg() == ARM::SP &&
+             "Only stack pointer as a source reg is supported");
+      RegList.push_back(SrcReg);
+      break;
+    }
+    OutStreamer.EmitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+  } else {
+    // Changes of stack / frame pointer.
+    if (SrcReg == ARM::SP) {
+      int64_t Offset = 0;
+      switch (Opc) {
+      default:
+        MI->dump();
+        llvm_unreachable("Unsupported opcode for unwinding information");
+      case ARM::MOVr:
+      case ARM::tMOVr:
+        Offset = 0;
+        break;
+      case ARM::ADDri:
+        Offset = -MI->getOperand(2).getImm();
+        break;
+      case ARM::SUBri:
+      case ARM::t2SUBri:
+        Offset = MI->getOperand(2).getImm();
+        break;
+      case ARM::tSUBspi:
+        Offset = MI->getOperand(2).getImm()*4;
+        break;
+      case ARM::tADDspi:
+      case ARM::tADDrSPi:
+        Offset = -MI->getOperand(2).getImm()*4;
+        break;
+      case ARM::tLDRpci: {
+        // Grab the constpool index and check, whether it corresponds to
+        // original or cloned constpool entry.
+        unsigned CPI = MI->getOperand(1).getIndex();
+        const MachineConstantPool *MCP = MF.getConstantPool();
+        if (CPI >= MCP->getConstants().size())
+          CPI = AFI.getOriginalCPIdx(CPI);
+        assert(CPI != -1U && "Invalid constpool index");
+
+        // Derive the actual offset.
+        const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI];
+        assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry");
+        // FIXME: Check for user, it should be "add" instruction!
+        Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue();
+        break;
+      }
+      }
+
+      if (DstReg == FramePtr && FramePtr != ARM::SP)
+        // Set-up of the frame pointer. Positive values correspond to "add"
+        // instruction.
+        OutStreamer.EmitSetFP(FramePtr, ARM::SP, -Offset);
+      else if (DstReg == ARM::SP) {
+        // Change of SP by an offset. Positive values correspond to "sub"
+        // instruction.
+        OutStreamer.EmitPad(Offset);
+      } else {
+        MI->dump();
+        llvm_unreachable("Unsupported opcode for unwinding information");
+      }
+    } else if (DstReg == ARM::SP) {
+      // FIXME: .movsp goes here
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    }
+    else {
+      MI->dump();
+      llvm_unreachable("Unsupported opcode for unwinding information");
+    }
+  }
+}
+
+extern cl::opt<bool> EnableARMEHABI;
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
+#include "ARMGenMCPseudoLowering.inc"
+
+void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  // If we just ended a constant pool, mark it as such.
+  if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
+    OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+    InConstantPool = false;
+  }
+
+  // Emit unwinding stuff for frame-related instructions
+  if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup))
+    EmitUnwindingInstruction(MI);
+
+  // Do any auto-generated pseudo lowerings.
+  if (emitPseudoExpansionLowering(OutStreamer, MI))
+    return;
+
+  assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+         "Pseudo flag setting opcode should be expanded early");
+
+  // Check for manual lowerings.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass");
+  case ARM::DBG_VALUE: {
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      SmallString<128> TmpStr;
+      raw_svector_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer.EmitRawText(StringRef(OS.str()));
+    }
+    return;
+  }
+  case ARM::LEApcrel:
+  case ARM::tLEApcrel:
+  case ARM::t2LEApcrel: {
+    // FIXME: Need to also handle globals and externals
+    MCSymbol *CPISymbol = GetCPISymbol(MI->getOperand(1).getIndex());
+    OutStreamer.EmitInstruction(MCInstBuilder(MI->getOpcode() ==
+                                              ARM::t2LEApcrel ? ARM::t2ADR
+                  : (MI->getOpcode() == ARM::tLEApcrel ? ARM::tADR
+                     : ARM::ADR))
+      .addReg(MI->getOperand(0).getReg())
+      .addExpr(MCSymbolRefExpr::Create(CPISymbol, OutContext))
+      // Add predicate operands.
+      .addImm(MI->getOperand(2).getImm())
+      .addReg(MI->getOperand(3).getReg()));
+    return;
+  }
+  case ARM::LEApcrelJT:
+  case ARM::tLEApcrelJT:
+  case ARM::t2LEApcrelJT: {
+    MCSymbol *JTIPICSymbol =
+      GetARMJTIPICJumpTableLabel2(MI->getOperand(1).getIndex(),
+                                  MI->getOperand(2).getImm());
+    OutStreamer.EmitInstruction(MCInstBuilder(MI->getOpcode() ==
+                                              ARM::t2LEApcrelJT ? ARM::t2ADR
+                  : (MI->getOpcode() == ARM::tLEApcrelJT ? ARM::tADR
+                     : ARM::ADR))
+      .addReg(MI->getOperand(0).getReg())
+      .addExpr(MCSymbolRefExpr::Create(JTIPICSymbol, OutContext))
+      // Add predicate operands.
+      .addImm(MI->getOperand(3).getImm())
+      .addReg(MI->getOperand(4).getReg()));
+    return;
+  }
+  // Darwin call instructions are just normal call instructions with different
+  // clobber semantics (they clobber R9).
+  case ARM::BX_CALL: {
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::BX)
+      .addReg(MI->getOperand(0).getReg()));
+    return;
+  }
+  case ARM::tBX_CALL: {
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tBX)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::BMOVPCRX_CALL: {
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+    return;
+  }
+  case ARM::BMOVPCB_CALL: {
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVr)
+      .addReg(ARM::LR)
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    const GlobalValue *GV = MI->getOperand(0).getGlobal();
+    MCSymbol *GVSym = Mang->getSymbol(GV);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::Bcc)
+      .addExpr(GVSymExpr)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::MOVi16_ga_pcrel:
+  case ARM::t2MOVi16_ga_pcrel: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc == ARM::MOVi16_ga_pcrel? ARM::MOVi16 : ARM::t2MOVi16);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+
+    unsigned TF = MI->getOperand(1).getTargetFlags();
+    bool isPIC = TF == ARMII::MO_LO16_NONLAZY_PIC;
+    const GlobalValue *GV = MI->getOperand(1).getGlobal();
+    MCSymbol *GVSym = GetARMGVSymbol(GV);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+    if (isPIC) {
+      MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(),
+                                       getFunctionNumber(),
+                                       MI->getOperand(2).getImm(), OutContext);
+      const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+      unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
+      const MCExpr *PCRelExpr =
+        ARMMCExpr::CreateLower16(MCBinaryExpr::CreateSub(GVSymExpr,
+                                  MCBinaryExpr::CreateAdd(LabelSymExpr,
+                                      MCConstantExpr::Create(PCAdj, OutContext),
+                                          OutContext), OutContext), OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr));
+    } else {
+      const MCExpr *RefExpr= ARMMCExpr::CreateLower16(GVSymExpr, OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(RefExpr));
+    }
+
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case ARM::MOVTi16_ga_pcrel:
+  case ARM::t2MOVTi16_ga_pcrel: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc == ARM::MOVTi16_ga_pcrel
+                      ? ARM::MOVTi16 : ARM::t2MOVTi16);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+
+    unsigned TF = MI->getOperand(2).getTargetFlags();
+    bool isPIC = TF == ARMII::MO_HI16_NONLAZY_PIC;
+    const GlobalValue *GV = MI->getOperand(2).getGlobal();
+    MCSymbol *GVSym = GetARMGVSymbol(GV);
+    const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
+    if (isPIC) {
+      MCSymbol *LabelSym = getPICLabel(MAI->getPrivateGlobalPrefix(),
+                                       getFunctionNumber(),
+                                       MI->getOperand(3).getImm(), OutContext);
+      const MCExpr *LabelSymExpr= MCSymbolRefExpr::Create(LabelSym, OutContext);
+      unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
+      const MCExpr *PCRelExpr =
+        ARMMCExpr::CreateUpper16(MCBinaryExpr::CreateSub(GVSymExpr,
+                                   MCBinaryExpr::CreateAdd(LabelSymExpr,
+                                      MCConstantExpr::Create(PCAdj, OutContext),
+                                          OutContext), OutContext), OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(PCRelExpr));
+    } else {
+      const MCExpr *RefExpr= ARMMCExpr::CreateUpper16(GVSymExpr, OutContext);
+      TmpInst.addOperand(MCOperand::CreateExpr(RefExpr));
+    }
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case ARM::tPICADD: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc
+    // This adds the address of LPC0 to r0.
+
+    // Emit the label.
+    OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(),
+                          getFunctionNumber(), MI->getOperand(2).getImm(),
+                          OutContext));
+
+    // Form and emit the add.
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tADDhirr)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(ARM::PC)
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::PICADD: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc, r0
+    // This adds the address of LPC0 to r0.
+
+    // Emit the label.
+    OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(),
+                          getFunctionNumber(), MI->getOperand(2).getImm(),
+                          OutContext));
+
+    // Form and emit the add.
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::ADDrr)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(1).getReg())
+      // Add predicate operands.
+      .addImm(MI->getOperand(3).getImm())
+      .addReg(MI->getOperand(4).getReg())
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+    return;
+  }
+  case ARM::PICSTR:
+  case ARM::PICSTRB:
+  case ARM::PICSTRH:
+  case ARM::PICLDR:
+  case ARM::PICLDRB:
+  case ARM::PICLDRH:
+  case ARM::PICLDRSB:
+  case ARM::PICLDRSH: {
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     OP r0, [pc, r0]
+    // The LCP0 label is referenced by a constant pool entry in order to get
+    // a PC-relative address at the ldr instruction.
+
+    // Emit the label.
+    OutStreamer.EmitLabel(getPICLabel(MAI->getPrivateGlobalPrefix(),
+                          getFunctionNumber(), MI->getOperand(2).getImm(),
+                          OutContext));
+
+    // Form and emit the load
+    unsigned Opcode;
+    switch (MI->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected opcode!");
+    case ARM::PICSTR:   Opcode = ARM::STRrs; break;
+    case ARM::PICSTRB:  Opcode = ARM::STRBrs; break;
+    case ARM::PICSTRH:  Opcode = ARM::STRH; break;
+    case ARM::PICLDR:   Opcode = ARM::LDRrs; break;
+    case ARM::PICLDRB:  Opcode = ARM::LDRBrs; break;
+    case ARM::PICLDRH:  Opcode = ARM::LDRH; break;
+    case ARM::PICLDRSB: Opcode = ARM::LDRSB; break;
+    case ARM::PICLDRSH: Opcode = ARM::LDRSH; break;
+    }
+    OutStreamer.EmitInstruction(MCInstBuilder(Opcode)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(1).getReg())
+      .addImm(0)
+      // Add predicate operands.
+      .addImm(MI->getOperand(3).getImm())
+      .addReg(MI->getOperand(4).getReg()));
+
+    return;
+  }
+  case ARM::CONSTPOOL_ENTRY: {
+    /// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
+    /// in the function.  The first operand is the ID# for this instruction, the
+    /// second is the index into the MachineConstantPool that this is, the third
+    /// is the size in bytes of this constant pool entry.
+    /// The required alignment is specified on the basic block holding this MI.
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    // If this is the first entry of the pool, mark it.
+    if (!InConstantPool) {
+      OutStreamer.EmitDataRegion(MCDR_DataRegion);
+      InConstantPool = true;
+    }
+
+    OutStreamer.EmitLabel(GetCPISymbol(LabelId));
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+    return;
+  }
+  case ARM::t2BR_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVr)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    // Output the data for the jump table itself
+    EmitJump2Table(MI);
+    return;
+  }
+  case ARM::t2TBB_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::t2TBB)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    // Output the data for the jump table itself
+    EmitJump2Table(MI);
+    // Make sure the next instruction is 2-byte aligned.
+    EmitAlignment(1);
+    return;
+  }
+  case ARM::t2TBH_JT: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::t2TBH)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    // Output the data for the jump table itself
+    EmitJump2Table(MI);
+    return;
+  }
+  case ARM::tBR_JTr:
+  case ARM::BR_JTr: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // mov pc, target
+    MCInst TmpInst;
+    unsigned Opc = MI->getOpcode() == ARM::BR_JTr ?
+      ARM::MOVr : ARM::tMOVr;
+    TmpInst.setOpcode(Opc);
+    TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    // Add 's' bit operand (always reg0 for this)
+    if (Opc == ARM::MOVr)
+      TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+
+    // Make sure the Thumb jump table is 4-byte aligned.
+    if (Opc == ARM::tMOVr)
+      EmitAlignment(2);
+
+    // Output the data for the jump table itself
+    EmitJumpTable(MI);
+    return;
+  }
+  case ARM::BR_JTm: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // ldr pc, target
+    MCInst TmpInst;
+    if (MI->getOperand(1).getReg() == 0) {
+      // literal offset
+      TmpInst.setOpcode(ARM::LDRi12);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+    } else {
+      TmpInst.setOpcode(ARM::LDRrs);
+      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+      TmpInst.addOperand(MCOperand::CreateImm(0));
+    }
+    // Add predicate operands.
+    TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    TmpInst.addOperand(MCOperand::CreateReg(0));
+    OutStreamer.EmitInstruction(TmpInst);
+
+    // Output the data for the jump table itself
+    EmitJumpTable(MI);
+    return;
+  }
+  case ARM::BR_JTadd: {
+    // Lower and emit the instruction itself, then the jump table following it.
+    // add pc, target, idx
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::ADDrr)
+      .addReg(ARM::PC)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      // Add predicate operands.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // Add 's' bit operand (always reg0 for this)
+      .addReg(0));
+
+    // Output the data for the jump table itself
+    EmitJumpTable(MI);
+    return;
+  }
+  case ARM::TRAP: {
+    // Non-Darwin binutils don't yet support the "trap" mnemonic.
+    // FIXME: Remove this special case when they do.
+    if (!Subtarget->isTargetDarwin()) {
+      //.long 0xe7ffdefe @ trap
+      uint32_t Val = 0xe7ffdefeUL;
+      OutStreamer.AddComment("trap");
+      OutStreamer.EmitIntValue(Val, 4);
+      return;
+    }
+    break;
+  }
+  case ARM::TRAPNaCl: {
+    //.long 0xe7fedef0 @ trap
+    uint32_t Val = 0xe7fedef0UL;
+    OutStreamer.AddComment("trap");
+    OutStreamer.EmitIntValue(Val, 4);
+    return;
+  }
+  case ARM::tTRAP: {
+    // Non-Darwin binutils don't yet support the "trap" mnemonic.
+    // FIXME: Remove this special case when they do.
+    if (!Subtarget->isTargetDarwin()) {
+      //.short 57086 @ trap
+      uint16_t Val = 0xdefe;
+      OutStreamer.AddComment("trap");
+      OutStreamer.EmitIntValue(Val, 2);
+      return;
+    }
+    break;
+  }
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+  case ARM::tInt_eh_sjlj_setjmp: {
+    // Two incoming args: GPR:$src, GPR:$val
+    // mov $val, pc
+    // adds $val, #7
+    // str $val, [$src, #4]
+    // movs r0, #0
+    // b 1f
+    // movs r0, #1
+    // 1:
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ValReg = MI->getOperand(1).getReg();
+    MCSymbol *Label = GetARMSJLJEHLabel();
+    OutStreamer.AddComment("eh_setjmp begin");
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVr)
+      .addReg(ValReg)
+      .addReg(ARM::PC)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tADDi3)
+      .addReg(ValReg)
+      // 's' bit operand
+      .addReg(ARM::CPSR)
+      .addReg(ValReg)
+      .addImm(7)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tSTRi)
+      .addReg(ValReg)
+      .addReg(SrcReg)
+      // The offset immediate is #4. The operand value is scaled by 4 for the
+      // tSTR instruction.
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVi8)
+      .addReg(ARM::R0)
+      .addReg(ARM::CPSR)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    const MCExpr *SymbolExpr = MCSymbolRefExpr::Create(Label, OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tB)
+      .addExpr(SymbolExpr)
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.AddComment("eh_setjmp end");
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVi8)
+      .addReg(ARM::R0)
+      .addReg(ARM::CPSR)
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitLabel(Label);
+    return;
+  }
+
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+  case ARM::Int_eh_sjlj_setjmp: {
+    // Two incoming args: GPR:$src, GPR:$val
+    // add $val, pc, #8
+    // str $val, [$src, #+4]
+    // mov r0, #0
+    // add pc, pc, #0
+    // mov r0, #1
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ValReg = MI->getOperand(1).getReg();
+
+    OutStreamer.AddComment("eh_setjmp begin");
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::ADDri)
+      .addReg(ValReg)
+      .addReg(ARM::PC)
+      .addImm(8)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::STRi12)
+      .addReg(ValReg)
+      .addReg(SrcReg)
+      .addImm(4)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVi)
+      .addReg(ARM::R0)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::ADDri)
+      .addReg(ARM::PC)
+      .addReg(ARM::PC)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+
+    OutStreamer.AddComment("eh_setjmp end");
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::MOVi)
+      .addReg(ARM::R0)
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0)
+      // 's' bit operand (always reg0 for this).
+      .addReg(0));
+    return;
+  }
+  case ARM::Int_eh_sjlj_longjmp: {
+    // ldr sp, [$src, #8]
+    // ldr $scratch, [$src, #4]
+    // ldr r7, [$src]
+    // bx $scratch
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ScratchReg = MI->getOperand(1).getReg();
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::LDRi12)
+      .addReg(ARM::SP)
+      .addReg(SrcReg)
+      .addImm(8)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::LDRi12)
+      .addReg(ScratchReg)
+      .addReg(SrcReg)
+      .addImm(4)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::LDRi12)
+      .addReg(ARM::R7)
+      .addReg(SrcReg)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::BX)
+      .addReg(ScratchReg)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  case ARM::tInt_eh_sjlj_longjmp: {
+    // ldr $scratch, [$src, #8]
+    // mov sp, $scratch
+    // ldr $scratch, [$src, #4]
+    // ldr r7, [$src]
+    // bx $scratch
+    unsigned SrcReg = MI->getOperand(0).getReg();
+    unsigned ScratchReg = MI->getOperand(1).getReg();
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tLDRi)
+      .addReg(ScratchReg)
+      .addReg(SrcReg)
+      // The offset immediate is #8. The operand value is scaled by 4 for the
+      // tLDR instruction.
+      .addImm(2)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tMOVr)
+      .addReg(ARM::SP)
+      .addReg(ScratchReg)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tLDRi)
+      .addReg(ScratchReg)
+      .addReg(SrcReg)
+      .addImm(1)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tLDRi)
+      .addReg(ARM::R7)
+      .addReg(SrcReg)
+      .addImm(0)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+
+    OutStreamer.EmitInstruction(MCInstBuilder(ARM::tBX)
+      .addReg(ScratchReg)
+      // Predicate.
+      .addImm(ARMCC::AL)
+      .addReg(0));
+    return;
+  }
+  }
+
+  MCInst TmpInst;
+  LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
+
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMAsmPrinter() {
+  RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget);
+  RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
new file mode 100644
index 000000000000..c945e4f28699
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -0,0 +1,132 @@
+//===-- ARMAsmPrinter.h - ARM implementation of AsmPrinter ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMASMPRINTER_H
+#define ARMASMPRINTER_H
+
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MCOperand;
+
+namespace ARM {
+  enum DW_ISA {
+    DW_ISA_ARM_thumb = 1,
+    DW_ISA_ARM_arm = 2
+  };
+}
+
+class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when printing asm code for different targets.
+  const ARMSubtarget *Subtarget;
+
+  /// AFI - Keep a pointer to ARMFunctionInfo for the current
+  /// MachineFunction.
+  ARMFunctionInfo *AFI;
+
+  /// MCP - Keep a pointer to constantpool entries of the current
+  /// MachineFunction.
+  const MachineConstantPool *MCP;
+
+  /// InConstantPool - Maintain state when emitting a sequence of constant
+  /// pool entries so we can properly mark them as data regions.
+  bool InConstantPool;
+public:
+  explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL), InConstantPool(false) {
+      Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    }
+
+  virtual const char *getPassName() const LLVM_OVERRIDE {
+    return "ARM Assembly / Object Emitter";
+  }
+
+  void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = 0);
+
+  virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O) LLVM_OVERRIDE;
+  virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                                     unsigned AsmVariant, const char *ExtraCode,
+                                     raw_ostream &O) LLVM_OVERRIDE;
+
+  void EmitJumpTable(const MachineInstr *MI);
+  void EmitJump2Table(const MachineInstr *MI);
+  virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
+  virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
+
+  virtual void EmitConstantPool() LLVM_OVERRIDE {
+    // we emit constant pools customly!
+  }
+  virtual void EmitFunctionBodyEnd() LLVM_OVERRIDE;
+  virtual void EmitFunctionEntryLabel() LLVM_OVERRIDE;
+  virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE;
+  virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
+  virtual void EmitXXStructor(const Constant *CV) LLVM_OVERRIDE;
+
+  // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+
+private:
+  // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+  void emitAttributes();
+
+  // Helper for ELF .o only
+  void emitARMAttributeSection();
+
+  // Generic helper used to emit e.g. ARMv5 mul pseudos
+  void EmitPatchedInstruction(const MachineInstr *MI, unsigned TargetOpc);
+
+  void EmitUnwindingInstruction(const MachineInstr *MI);
+
+  // emitPseudoExpansionLowering - tblgen'erated.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+public:
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+  virtual MachineLocation
+    getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
+
+  /// EmitDwarfRegOp - Emit dwarf register operation.
+  virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const LLVM_OVERRIDE;
+
+  virtual unsigned getISAEncoding() LLVM_OVERRIDE {
+    // ARM/Darwin adds ISA to the DWARF info for each function.
+    if (!Subtarget->isTargetDarwin())
+      return 0;
+    return Subtarget->isThumb() ?
+      ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm;
+  }
+
+private:
+  MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
+  MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const;
+
+  MCSymbol *GetARMSJLJEHLabel() const;
+
+  MCSymbol *GetARMGVSymbol(const GlobalValue *GV);
+
+public:
+  /// EmitMachineConstantPoolValue - Print a machine constantpool value to
+  /// the .s file.
+  virtual void
+    EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) LLVM_OVERRIDE;
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
new file mode 100644
index 000000000000..9e68ff44890e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -0,0 +1,4137 @@
+//===-- ARMBaseInstrInfo.cpp - ARM Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseInstrInfo.h"
+#include "ARM.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMHazardRecognizer.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define GET_INSTRINFO_CTOR
+#include "ARMGenInstrInfo.inc"
+
+using namespace llvm;
+
+static cl::opt<bool>
+EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
+               cl::desc("Enable ARM 2-addr to 3-addr conv"));
+
+static cl::opt<bool>
+WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true),
+           cl::desc("Widen ARM vmovs to vmovd when possible"));
+
+static cl::opt<unsigned>
+SwiftPartialUpdateClearance("swift-partial-update-clearance",
+     cl::Hidden, cl::init(12),
+     cl::desc("Clearance before partial register updates"));
+
+/// ARM_MLxEntry - Record information about MLA / MLS instructions.
+struct ARM_MLxEntry {
+  uint16_t MLxOpc;     // MLA / MLS opcode
+  uint16_t MulOpc;     // Expanded multiplication opcode
+  uint16_t AddSubOpc;  // Expanded add / sub opcode
+  bool NegAcc;         // True if the acc is negated before the add / sub.
+  bool HasLane;        // True if instruction has an extra "lane" operand.
+};
+
+static const ARM_MLxEntry ARM_MLxTable[] = {
+  // MLxOpc,          MulOpc,           AddSubOpc,       NegAcc, HasLane
+  // fp scalar ops
+  { ARM::VMLAS,       ARM::VMULS,       ARM::VADDS,      false,  false },
+  { ARM::VMLSS,       ARM::VMULS,       ARM::VSUBS,      false,  false },
+  { ARM::VMLAD,       ARM::VMULD,       ARM::VADDD,      false,  false },
+  { ARM::VMLSD,       ARM::VMULD,       ARM::VSUBD,      false,  false },
+  { ARM::VNMLAS,      ARM::VNMULS,      ARM::VSUBS,      true,   false },
+  { ARM::VNMLSS,      ARM::VMULS,       ARM::VSUBS,      true,   false },
+  { ARM::VNMLAD,      ARM::VNMULD,      ARM::VSUBD,      true,   false },
+  { ARM::VNMLSD,      ARM::VMULD,       ARM::VSUBD,      true,   false },
+
+  // fp SIMD ops
+  { ARM::VMLAfd,      ARM::VMULfd,      ARM::VADDfd,     false,  false },
+  { ARM::VMLSfd,      ARM::VMULfd,      ARM::VSUBfd,     false,  false },
+  { ARM::VMLAfq,      ARM::VMULfq,      ARM::VADDfq,     false,  false },
+  { ARM::VMLSfq,      ARM::VMULfq,      ARM::VSUBfq,     false,  false },
+  { ARM::VMLAslfd,    ARM::VMULslfd,    ARM::VADDfd,     false,  true  },
+  { ARM::VMLSslfd,    ARM::VMULslfd,    ARM::VSUBfd,     false,  true  },
+  { ARM::VMLAslfq,    ARM::VMULslfq,    ARM::VADDfq,     false,  true  },
+  { ARM::VMLSslfq,    ARM::VMULslfq,    ARM::VSUBfq,     false,  true  },
+};
+
+ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
+  : ARMGenInstrInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+    Subtarget(STI) {
+  for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) {
+    if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
+      assert(false && "Duplicated entries?");
+    MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc);
+    MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc);
+  }
+}
+
+// Use a ScoreboardHazardRecognizer for prepass ARM scheduling. TargetInstrImpl
+// currently defaults to no prepass hazard recognizer.
+ScheduleHazardRecognizer *ARMBaseInstrInfo::
+CreateTargetHazardRecognizer(const TargetMachine *TM,
+                             const ScheduleDAG *DAG) const {
+  if (usePreRAHazardRecognizer()) {
+    const InstrItineraryData *II = TM->getInstrItineraryData();
+    return new ScoreboardHazardRecognizer(II, DAG, "pre-RA-sched");
+  }
+  return TargetInstrInfo::CreateTargetHazardRecognizer(TM, DAG);
+}
+
+ScheduleHazardRecognizer *ARMBaseInstrInfo::
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                   const ScheduleDAG *DAG) const {
+  if (Subtarget.isThumb2() || Subtarget.hasVFP2())
+    return (ScheduleHazardRecognizer *)
+      new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget, DAG);
+  return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
+}
+
+MachineInstr *
+ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                        MachineBasicBlock::iterator &MBBI,
+                                        LiveVariables *LV) const {
+  // FIXME: Thumb2 support.
+
+  if (!EnableARM3Addr)
+    return NULL;
+
+  MachineInstr *MI = MBBI;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  uint64_t TSFlags = MI->getDesc().TSFlags;
+  bool isPre = false;
+  switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
+  default: return NULL;
+  case ARMII::IndexModePre:
+    isPre = true;
+    break;
+  case ARMII::IndexModePost:
+    break;
+  }
+
+  // Try splitting an indexed load/store to an un-indexed one plus an add/sub
+  // operation.
+  unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
+  if (MemOpc == 0)
+    return NULL;
+
+  MachineInstr *UpdateMI = NULL;
+  MachineInstr *MemMI = NULL;
+  unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned NumOps = MCID.getNumOperands();
+  bool isLoad = !MI->mayStore();
+  const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
+  const MachineOperand &Base = MI->getOperand(2);
+  const MachineOperand &Offset = MI->getOperand(NumOps-3);
+  unsigned WBReg = WB.getReg();
+  unsigned BaseReg = Base.getReg();
+  unsigned OffReg = Offset.getReg();
+  unsigned OffImm = MI->getOperand(NumOps-2).getImm();
+  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
+  switch (AddrMode) {
+  default: llvm_unreachable("Unknown indexed op!");
+  case ARMII::AddrMode2: {
+    bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+    if (OffReg == 0) {
+      if (ARM_AM::getSOImmVal(Amt) == -1)
+        // Can't encode it in a so_imm operand. This transformation will
+        // add more than 1 instruction. Abandon!
+        return NULL;
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+        .addReg(BaseReg).addImm(Amt)
+        .addImm(Pred).addReg(0).addReg(0);
+    } else if (Amt != 0) {
+      ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+      unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg)
+        .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
+        .addImm(Pred).addReg(0).addReg(0);
+    } else
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+        .addReg(BaseReg).addReg(OffReg)
+        .addImm(Pred).addReg(0).addReg(0);
+    break;
+  }
+  case ARMII::AddrMode3 : {
+    bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM3Offset(OffImm);
+    if (OffReg == 0)
+      // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+        .addReg(BaseReg).addImm(Amt)
+        .addImm(Pred).addReg(0).addReg(0);
+    else
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+        .addReg(BaseReg).addReg(OffReg)
+        .addImm(Pred).addReg(0).addReg(0);
+    break;
+  }
+  }
+
+  std::vector<MachineInstr*> NewMIs;
+  if (isPre) {
+    if (isLoad)
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc), MI->getOperand(0).getReg())
+        .addReg(WBReg).addImm(0).addImm(Pred);
+    else
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc)).addReg(MI->getOperand(1).getReg())
+        .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+    NewMIs.push_back(MemMI);
+    NewMIs.push_back(UpdateMI);
+  } else {
+    if (isLoad)
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc), MI->getOperand(0).getReg())
+        .addReg(BaseReg).addImm(0).addImm(Pred);
+    else
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc)).addReg(MI->getOperand(1).getReg())
+        .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+    if (WB.isDead())
+      UpdateMI->getOperand(0).setIsDead();
+    NewMIs.push_back(UpdateMI);
+    NewMIs.push_back(MemMI);
+  }
+
+  // Transfer LiveVariables states, kill / dead info.
+  if (LV) {
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        unsigned Reg = MO.getReg();
+
+        LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+        if (MO.isDef()) {
+          MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+          if (MO.isDead())
+            LV->addVirtualRegisterDead(Reg, NewMI);
+        }
+        if (MO.isUse() && MO.isKill()) {
+          for (unsigned j = 0; j < 2; ++j) {
+            // Look at the two new MI's in reverse order.
+            MachineInstr *NewMI = NewMIs[j];
+            if (!NewMI->readsRegister(Reg))
+              continue;
+            LV->addVirtualRegisterKilled(Reg, NewMI);
+            if (VI.removeKill(MI))
+              VI.Kills.push_back(NewMI);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  MFI->insert(MBBI, NewMIs[1]);
+  MFI->insert(MBBI, NewMIs[0]);
+  return NewMIs[0];
+}
+
+// Branch analysis.
+bool
+ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (isUncondBranchOpcode(LastOpc)) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (isCondBranchOpcode(LastOpc)) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(LastInst->getOperand(1));
+      Cond.push_back(LastInst->getOperand(2));
+      return false;
+    }
+    return true;  // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If AllowModify is true and the block ends with two or more unconditional
+  // branches, delete all but the first unconditional branch.
+  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+    while (isUncondBranchOpcode(SecondLastOpc)) {
+      LastInst->eraseFromParent();
+      LastInst = SecondLastInst;
+      LastOpc = LastInst->getOpcode();
+      if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+        // Return now the only terminator is an unconditional branch.
+        TBB = LastInst->getOperand(0).getMBB();
+        return false;
+      } else {
+        SecondLastInst = I;
+        SecondLastOpc = SecondLastInst->getOpcode();
+      }
+    }
+  }
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    TBB =  SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(1));
+    Cond.push_back(SecondLastInst->getOperand(2));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // ...likewise if it ends with a branch table followed by an unconditional
+  // branch. The branch folder can create these, and we must get rid of them for
+  // correctness of Thumb constant islands.
+  if ((isJumpTableBranchOpcode(SecondLastOpc) ||
+       isIndirectBranchOpcode(SecondLastOpc)) &&
+      isUncondBranchOpcode(LastOpc)) {
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+
+unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+  if (!isUncondBranchOpcode(I->getOpcode()) &&
+      !isCondBranchOpcode(I->getOpcode()))
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (!isCondBranchOpcode(I->getOpcode()))
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned
+ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                               MachineBasicBlock *FBB,
+                               const SmallVectorImpl<MachineOperand> &Cond,
+                               DebugLoc DL) const {
+  ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
+  int BOpc   = !AFI->isThumbFunction()
+    ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
+  int BccOpc = !AFI->isThumbFunction()
+    ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc);
+  bool isThumb = AFI->isThumbFunction() || AFI->isThumb2Function();
+
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "ARM branch conditions have two components!");
+
+  if (FBB == 0) {
+    if (Cond.empty()) { // Unconditional branch?
+      if (isThumb)
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0);
+      else
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+    } else
+      BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
+        .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+    return 1;
+  }
+
+  // Two-way conditional branch.
+  BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
+    .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+  if (isThumb)
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).addImm(ARMCC::AL).addReg(0);
+  else
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+  return 2;
+}
+
+bool ARMBaseInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
+  Cond[0].setImm(ARMCC::getOppositeCondition(CC));
+  return false;
+}
+
+bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const {
+  if (MI->isBundle()) {
+    MachineBasicBlock::const_instr_iterator I = MI;
+    MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+    while (++I != E && I->isInsideBundle()) {
+      int PIdx = I->findFirstPredOperandIdx();
+      if (PIdx != -1 && I->getOperand(PIdx).getImm() != ARMCC::AL)
+        return true;
+    }
+    return false;
+  }
+
+  int PIdx = MI->findFirstPredOperandIdx();
+  return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
+}
+
+bool ARMBaseInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Pred) const {
+  unsigned Opc = MI->getOpcode();
+  if (isUncondBranchOpcode(Opc)) {
+    MI->setDesc(get(getMatchingCondBranchOpcode(Opc)));
+    MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+      .addImm(Pred[0].getImm())
+      .addReg(Pred[1].getReg());
+    return true;
+  }
+
+  int PIdx = MI->findFirstPredOperandIdx();
+  if (PIdx != -1) {
+    MachineOperand &PMO = MI->getOperand(PIdx);
+    PMO.setImm(Pred[0].getImm());
+    MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+    return true;
+  }
+  return false;
+}
+
+bool ARMBaseInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                  const SmallVectorImpl<MachineOperand> &Pred2) const {
+  if (Pred1.size() > 2 || Pred2.size() > 2)
+    return false;
+
+  ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm();
+  ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm();
+  if (CC1 == CC2)
+    return true;
+
+  switch (CC1) {
+  default:
+    return false;
+  case ARMCC::AL:
+    return true;
+  case ARMCC::HS:
+    return CC2 == ARMCC::HI;
+  case ARMCC::LS:
+    return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
+  case ARMCC::GE:
+    return CC2 == ARMCC::GT;
+  case ARMCC::LE:
+    return CC2 == ARMCC::LT;
+  }
+}
+
+bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                    std::vector<MachineOperand> &Pred) const {
+  bool Found = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if ((MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) ||
+        (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)) {
+      Pred.push_back(MO);
+      Found = true;
+    }
+  }
+
+  return Found;
+}
+
+/// isPredicable - Return true if the specified instruction can be predicated.
+/// By default, this returns true for every instruction with a
+/// PredicateOperand.
+bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
+  if (!MI->isPredicable())
+    return false;
+
+  if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) {
+    ARMFunctionInfo *AFI =
+      MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
+    return AFI->isThumb2Function();
+  }
+  return true;
+}
+
+/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing.
+LLVM_ATTRIBUTE_NOINLINE
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+                                unsigned JTI);
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+                                unsigned JTI) {
+  assert(JTI < JT.size());
+  return JT[JTI].MBBs.size();
+}
+
+/// GetInstSize - Return the size of the specified MachineInstr.
+///
+unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const MachineBasicBlock &MBB = *MI->getParent();
+  const MachineFunction *MF = MBB.getParent();
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MCID.getSize())
+    return MCID.getSize();
+
+  // If this machine instr is an inline asm, measure it.
+  if (MI->getOpcode() == ARM::INLINEASM)
+    return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
+  if (MI->isLabel())
+    return 0;
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::PROLOG_LABEL:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::DBG_VALUE:
+    return 0;
+  case TargetOpcode::BUNDLE:
+    return getInstBundleLength(MI);
+  case ARM::MOVi16_ga_pcrel:
+  case ARM::MOVTi16_ga_pcrel:
+  case ARM::t2MOVi16_ga_pcrel:
+  case ARM::t2MOVTi16_ga_pcrel:
+    return 4;
+  case ARM::MOVi32imm:
+  case ARM::t2MOVi32imm:
+    return 8;
+  case ARM::CONSTPOOL_ENTRY:
+    // If this machine instr is a constant pool entry, its size is recorded as
+    // operand #2.
+    return MI->getOperand(2).getImm();
+  case ARM::Int_eh_sjlj_longjmp:
+    return 16;
+  case ARM::tInt_eh_sjlj_longjmp:
+    return 10;
+  case ARM::Int_eh_sjlj_setjmp:
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+    return 20;
+  case ARM::tInt_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+    return 12;
+  case ARM::BR_JTr:
+  case ARM::BR_JTm:
+  case ARM::BR_JTadd:
+  case ARM::tBR_JTr:
+  case ARM::t2BR_JT:
+  case ARM::t2TBB_JT:
+  case ARM::t2TBH_JT: {
+    // These are jumptable branches, i.e. a branch followed by an inlined
+    // jumptable. The size is 4 + 4 * number of entries. For TBB, each
+    // entry is one byte; TBH two byte each.
+    unsigned EntrySize = (Opc == ARM::t2TBB_JT)
+      ? 1 : ((Opc == ARM::t2TBH_JT) ? 2 : 4);
+    unsigned NumOps = MCID.getNumOperands();
+    MachineOperand JTOP =
+      MI->getOperand(NumOps - (MI->isPredicable() ? 3 : 2));
+    unsigned JTI = JTOP.getIndex();
+    const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+    assert(MJTI != 0);
+    const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+    assert(JTI < JT.size());
+    // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
+    // 4 aligned. The assembler / linker may add 2 byte padding just before
+    // the JT entries.  The size does not include this padding; the
+    // constant islands pass does separate bookkeeping for it.
+    // FIXME: If we know the size of the function is less than (1 << 16) *2
+    // bytes, we can use 16-bit entries instead. Then there won't be an
+    // alignment issue.
+    unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4;
+    unsigned NumEntries = getNumJTEntries(JT, JTI);
+    if (Opc == ARM::t2TBB_JT && (NumEntries & 1))
+      // Make sure the instruction that follows TBB is 2-byte aligned.
+      // FIXME: Constant island pass should insert an "ALIGN" instruction
+      // instead.
+      ++NumEntries;
+    return NumEntries * EntrySize + InstSize;
+  }
+  default:
+    // Otherwise, pseudo-instruction sizes are zero.
+    return 0;
+  }
+}
+
+unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const {
+  unsigned Size = 0;
+  MachineBasicBlock::const_instr_iterator I = MI;
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+  while (++I != E && I->isInsideBundle()) {
+    assert(!I->isBundle() && "No nested bundle!");
+    Size += GetInstSizeInBytes(&*I);
+  }
+  return Size;
+}
+
+void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I, DebugLoc DL,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   bool KillSrc) const {
+  bool GPRDest = ARM::GPRRegClass.contains(DestReg);
+  bool GPRSrc  = ARM::GPRRegClass.contains(SrcReg);
+
+  if (GPRDest && GPRSrc) {
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
+                                  .addReg(SrcReg, getKillRegState(KillSrc))));
+    return;
+  }
+
+  bool SPRDest = ARM::SPRRegClass.contains(DestReg);
+  bool SPRSrc  = ARM::SPRRegClass.contains(SrcReg);
+
+  unsigned Opc = 0;
+  if (SPRDest && SPRSrc)
+    Opc = ARM::VMOVS;
+  else if (GPRDest && SPRSrc)
+    Opc = ARM::VMOVRS;
+  else if (SPRDest && GPRSrc)
+    Opc = ARM::VMOVSR;
+  else if (ARM::DPRRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VMOVD;
+  else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VORRq;
+
+  if (Opc) {
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+    if (Opc == ARM::VORRq)
+      MIB.addReg(SrcReg, getKillRegState(KillSrc));
+    AddDefaultPred(MIB);
+    return;
+  }
+
+  // Handle register classes that require multiple instructions.
+  unsigned BeginIdx = 0;
+  unsigned SubRegs = 0;
+  int Spacing = 1;
+
+  // Use VORRq when possible.
+  if (ARM::QQPRRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 2;
+  else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 4;
+  // Fall back to VMOVD.
+  else if (ARM::DPairRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2;
+  else if (ARM::DTripleRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3;
+  else if (ARM::DQuadRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4;
+  else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::MOVr, BeginIdx = ARM::gsub_0, SubRegs = 2;
+
+  else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2, Spacing = 2;
+  else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3, Spacing = 2;
+  else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg))
+    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2;
+
+  assert(Opc && "Impossible reg-to-reg copy");
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineInstrBuilder Mov;
+
+  // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
+  if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+    BeginIdx = BeginIdx + ((SubRegs-1)*Spacing);
+    Spacing = -Spacing;
+  }
+#ifndef NDEBUG
+  SmallSet<unsigned, 4> DstRegs;
+#endif
+  for (unsigned i = 0; i != SubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg,  BeginIdx + i*Spacing);
+    assert(Dst && Src && "Bad sub-register");
+#ifndef NDEBUG
+    assert(!DstRegs.count(Src) && "destructive vector copy");
+    DstRegs.insert(Dst);
+#endif
+    Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
+      .addReg(Src);
+    // VORR takes two source operands.
+    if (Opc == ARM::VORRq)
+      Mov.addReg(Src);
+    Mov = AddDefaultPred(Mov);
+  }
+  // Add implicit super-register defs and kills to the last instruction.
+  Mov->addRegisterDefined(DestReg, TRI);
+  if (KillSrc)
+    Mov->addRegisterKilled(SrcReg, TRI);
+}
+
+static const
+MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB,
+                             unsigned Reg, unsigned SubIdx, unsigned State,
+                             const TargetRegisterInfo *TRI) {
+  if (!SubIdx)
+    return MIB.addReg(Reg, State);
+
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+  return MIB.addReg(Reg, State, SubIdx);
+}
+
+void ARMBaseInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                            MachineMemOperand::MOStore,
+                            MFI.getObjectSize(FI),
+                            Align);
+
+  switch (RC->getSize()) {
+    case 4:
+      if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 8:
+      if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+        MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+          MIB = AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+                AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 16:
+      if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+        // Use aligned spills if the stack can be realigned.
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
+        } else {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addFrameIndex(FI)
+                     .addMemOperand(MMO));
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 24:
+      if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+        // Use aligned spills if the stack can be realigned.
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
+        } else {
+          MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+          AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 32:
+      if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+        if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+          // FIXME: It's possible to only store part of the QQ register if the
+          // spilled def has a sub-register index.
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+                     .addFrameIndex(FI).addImm(16)
+                     .addReg(SrcReg, getKillRegState(isKill))
+                     .addMemOperand(MMO));
+        } else {
+          MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+          MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+                AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+        }
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    case 64:
+      if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+        MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+                         .addFrameIndex(FI))
+                         .addMemOperand(MMO);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
+        MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
+              AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
+      } else
+        llvm_unreachable("Unknown reg class!");
+      break;
+    default:
+      llvm_unreachable("Unknown reg class!");
+  }
+}
+
+unsigned
+ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                     int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::STRrs:
+  case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImm() &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::STRi12:
+  case ARM::t2STRi12:
+  case ARM::tSTRspi:
+  case ARM::VSTRD:
+  case ARM::VSTRS:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VST1q64:
+  case ARM::VST1d64TPseudo:
+  case ARM::VST1d64QPseudo:
+    if (MI->getOperand(0).isFI() &&
+        MI->getOperand(2).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(0).getIndex();
+      return MI->getOperand(2).getReg();
+    }
+    break;
+  case ARM::VSTMQIA:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                                    int &FrameIndex) const {
+  const MachineMemOperand *Dummy;
+  return MI->mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
+}
+
+void ARMBaseInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(
+                    MachinePointerInfo::getFixedStack(FI),
+                            MachineMemOperand::MOLoad,
+                            MFI.getObjectSize(FI),
+                            Align);
+
+  switch (RC->getSize()) {
+  case 4:
+    if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+
+    } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 8:
+    if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+      unsigned LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA : ARM::LDMIA;
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, I, DL, get(LdmOpc))
+                    .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+      MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+      if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+        MIB.addReg(DestReg, RegState::ImplicitDefine);
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 16:
+    if (ARM::DPairRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
+      } else {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
+                       .addFrameIndex(FI)
+                       .addMemOperand(MMO));
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 24:
+    if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
+      } else {
+        MachineInstrBuilder MIB =
+          AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                         .addFrameIndex(FI)
+                         .addMemOperand(MMO));
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+        if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+          MIB.addReg(DestReg, RegState::ImplicitDefine);
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+   case 32:
+    if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
+      if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+                     .addFrameIndex(FI).addImm(16)
+                     .addMemOperand(MMO));
+      } else {
+        MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                       .addFrameIndex(FI))
+                       .addMemOperand(MMO);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+        MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+        if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+          MIB.addReg(DestReg, RegState::ImplicitDefine);
+      }
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  case 64:
+    if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+      MachineInstrBuilder MIB =
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+                     .addFrameIndex(FI))
+                     .addMemOperand(MMO);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::DefineNoRead, TRI);
+      MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::DefineNoRead, TRI);
+      if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+        MIB.addReg(DestReg, RegState::ImplicitDefine);
+    } else
+      llvm_unreachable("Unknown reg class!");
+    break;
+  default:
+    llvm_unreachable("Unknown regclass!");
+  }
+}
+
+unsigned
+ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::LDRrs:
+  case ARM::t2LDRs:  // FIXME: don't use t2LDRs to access frame.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImm() &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::LDRi12:
+  case ARM::t2LDRi12:
+  case ARM::tLDRspi:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLD1q64:
+  case ARM::VLD1d64TPseudo:
+  case ARM::VLD1d64QPseudo:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLDMQIA:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(0).getSubReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+  const MachineMemOperand *Dummy;
+  return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+}
+
+bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{
+  // This hook gets to expand COPY instructions before they become
+  // copyPhysReg() calls.  Look for VMOVS instructions that can legally be
+  // widened to VMOVD.  We prefer the VMOVD when possible because it may be
+  // changed into a VORR that can go down the NEON pipeline.
+  if (!WidenVMOVS || !MI->isCopy() || Subtarget.isCortexA15())
+    return false;
+
+  // Look for a copy between even S-registers.  That is where we keep floats
+  // when using NEON v2f32 instructions for f32 arithmetic.
+  unsigned DstRegS = MI->getOperand(0).getReg();
+  unsigned SrcRegS = MI->getOperand(1).getReg();
+  if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
+    return false;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned DstRegD = TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0,
+                                              &ARM::DPRRegClass);
+  unsigned SrcRegD = TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0,
+                                              &ARM::DPRRegClass);
+  if (!DstRegD || !SrcRegD)
+    return false;
+
+  // We want to widen this into a DstRegD = VMOVD SrcRegD copy.  This is only
+  // legal if the COPY already defines the full DstRegD, and it isn't a
+  // sub-register insertion.
+  if (!MI->definesRegister(DstRegD, TRI) || MI->readsRegister(DstRegD, TRI))
+    return false;
+
+  // A dead copy shouldn't show up here, but reject it just in case.
+  if (MI->getOperand(0).isDead())
+    return false;
+
+  // All clear, widen the COPY.
+  DEBUG(dbgs() << "widening:    " << *MI);
+  MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+
+  // Get rid of the old <imp-def> of DstRegD.  Leave it if it defines a Q-reg
+  // or some other super-register.
+  int ImpDefIdx = MI->findRegisterDefOperandIdx(DstRegD);
+  if (ImpDefIdx != -1)
+    MI->RemoveOperand(ImpDefIdx);
+
+  // Change the opcode and operands.
+  MI->setDesc(get(ARM::VMOVD));
+  MI->getOperand(0).setReg(DstRegD);
+  MI->getOperand(1).setReg(SrcRegD);
+  AddDefaultPred(MIB);
+
+  // We are now reading SrcRegD instead of SrcRegS.  This may upset the
+  // register scavenger and machine verifier, so we need to indicate that we
+  // are reading an undefined value from SrcRegD, but a proper value from
+  // SrcRegS.
+  MI->getOperand(1).setIsUndef();
+  MIB.addReg(SrcRegS, RegState::Implicit);
+
+  // SrcRegD may actually contain an unrelated value in the ssub_1
+  // sub-register.  Don't kill it.  Only kill the ssub_0 sub-register.
+  if (MI->getOperand(1).isKill()) {
+    MI->getOperand(1).setIsKill(false);
+    MI->addRegisterKilled(SrcRegS, TRI, true);
+  }
+
+  DEBUG(dbgs() << "replaced by: " << *MI);
+  return true;
+}
+
+MachineInstr*
+ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+                                           int FrameIx, uint64_t Offset,
+                                           const MDNode *MDPtr,
+                                           DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::DBG_VALUE))
+    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
+/// Create a copy of a const pool value. Update CPI to the new index and return
+/// the label UID.
+static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
+  MachineConstantPool *MCP = MF.getConstantPool();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI];
+  assert(MCPE.isMachineConstantPoolEntry() &&
+         "Expecting a machine constantpool entry!");
+  ARMConstantPoolValue *ACPV =
+    static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
+
+  unsigned PCLabelId = AFI->createPICLabelUId();
+  ARMConstantPoolValue *NewCPV = 0;
+  // FIXME: The below assumes PIC relocation model and that the function
+  // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and
+  // zero for non-PIC in ARM or Thumb. The callers are all of thumb LDR
+  // instructions, so that's probably OK, but is PIC always correct when
+  // we get here?
+  if (ACPV->isGlobalValue())
+    NewCPV = ARMConstantPoolConstant::
+      Create(cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId,
+             ARMCP::CPValue, 4);
+  else if (ACPV->isExtSymbol())
+    NewCPV = ARMConstantPoolSymbol::
+      Create(MF.getFunction()->getContext(),
+             cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(), PCLabelId, 4);
+  else if (ACPV->isBlockAddress())
+    NewCPV = ARMConstantPoolConstant::
+      Create(cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(), PCLabelId,
+             ARMCP::CPBlockAddress, 4);
+  else if (ACPV->isLSDA())
+    NewCPV = ARMConstantPoolConstant::Create(MF.getFunction(), PCLabelId,
+                                             ARMCP::CPLSDA, 4);
+  else if (ACPV->isMachineBasicBlock())
+    NewCPV = ARMConstantPoolMBB::
+      Create(MF.getFunction()->getContext(),
+             cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4);
+  else
+    llvm_unreachable("Unexpected ARM constantpool value type!!");
+  CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment());
+  return PCLabelId;
+}
+
+void ARMBaseInstrInfo::
+reMaterialize(MachineBasicBlock &MBB,
+              MachineBasicBlock::iterator I,
+              unsigned DestReg, unsigned SubIdx,
+              const MachineInstr *Orig,
+              const TargetRegisterInfo &TRI) const {
+  unsigned Opcode = Orig->getOpcode();
+  switch (Opcode) {
+  default: {
+    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+    MI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+    MBB.insert(I, MI);
+    break;
+  }
+  case ARM::tLDRpci_pic:
+  case ARM::t2LDRpci_pic: {
+    MachineFunction &MF = *MBB.getParent();
+    unsigned CPI = Orig->getOperand(1).getIndex();
+    unsigned PCLabelId = duplicateCPV(MF, CPI);
+    MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode),
+                                      DestReg)
+      .addConstantPoolIndex(CPI).addImm(PCLabelId);
+    MIB->setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end());
+    break;
+  }
+  }
+}
+
+MachineInstr *
+ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const {
+  MachineInstr *MI = TargetInstrInfo::duplicate(Orig, MF);
+  switch(Orig->getOpcode()) {
+  case ARM::tLDRpci_pic:
+  case ARM::t2LDRpci_pic: {
+    unsigned CPI = Orig->getOperand(1).getIndex();
+    unsigned PCLabelId = duplicateCPV(MF, CPI);
+    Orig->getOperand(1).setIndex(CPI);
+    Orig->getOperand(2).setImm(PCLabelId);
+    break;
+  }
+  }
+  return MI;
+}
+
+bool ARMBaseInstrInfo::produceSameValue(const MachineInstr *MI0,
+                                        const MachineInstr *MI1,
+                                        const MachineRegisterInfo *MRI) const {
+  int Opcode = MI0->getOpcode();
+  if (Opcode == ARM::t2LDRpci ||
+      Opcode == ARM::t2LDRpci_pic ||
+      Opcode == ARM::tLDRpci ||
+      Opcode == ARM::tLDRpci_pic ||
+      Opcode == ARM::MOV_ga_dyn ||
+      Opcode == ARM::MOV_ga_pcrel ||
+      Opcode == ARM::MOV_ga_pcrel_ldr ||
+      Opcode == ARM::t2MOV_ga_dyn ||
+      Opcode == ARM::t2MOV_ga_pcrel) {
+    if (MI1->getOpcode() != Opcode)
+      return false;
+    if (MI0->getNumOperands() != MI1->getNumOperands())
+      return false;
+
+    const MachineOperand &MO0 = MI0->getOperand(1);
+    const MachineOperand &MO1 = MI1->getOperand(1);
+    if (MO0.getOffset() != MO1.getOffset())
+      return false;
+
+    if (Opcode == ARM::MOV_ga_dyn ||
+        Opcode == ARM::MOV_ga_pcrel ||
+        Opcode == ARM::MOV_ga_pcrel_ldr ||
+        Opcode == ARM::t2MOV_ga_dyn ||
+        Opcode == ARM::t2MOV_ga_pcrel)
+      // Ignore the PC labels.
+      return MO0.getGlobal() == MO1.getGlobal();
+
+    const MachineFunction *MF = MI0->getParent()->getParent();
+    const MachineConstantPool *MCP = MF->getConstantPool();
+    int CPI0 = MO0.getIndex();
+    int CPI1 = MO1.getIndex();
+    const MachineConstantPoolEntry &MCPE0 = MCP->getConstants()[CPI0];
+    const MachineConstantPoolEntry &MCPE1 = MCP->getConstants()[CPI1];
+    bool isARMCP0 = MCPE0.isMachineConstantPoolEntry();
+    bool isARMCP1 = MCPE1.isMachineConstantPoolEntry();
+    if (isARMCP0 && isARMCP1) {
+      ARMConstantPoolValue *ACPV0 =
+        static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal);
+      ARMConstantPoolValue *ACPV1 =
+        static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal);
+      return ACPV0->hasSameValue(ACPV1);
+    } else if (!isARMCP0 && !isARMCP1) {
+      return MCPE0.Val.ConstVal == MCPE1.Val.ConstVal;
+    }
+    return false;
+  } else if (Opcode == ARM::PICLDR) {
+    if (MI1->getOpcode() != Opcode)
+      return false;
+    if (MI0->getNumOperands() != MI1->getNumOperands())
+      return false;
+
+    unsigned Addr0 = MI0->getOperand(1).getReg();
+    unsigned Addr1 = MI1->getOperand(1).getReg();
+    if (Addr0 != Addr1) {
+      if (!MRI ||
+          !TargetRegisterInfo::isVirtualRegister(Addr0) ||
+          !TargetRegisterInfo::isVirtualRegister(Addr1))
+        return false;
+
+      // This assumes SSA form.
+      MachineInstr *Def0 = MRI->getVRegDef(Addr0);
+      MachineInstr *Def1 = MRI->getVRegDef(Addr1);
+      // Check if the loaded value, e.g. a constantpool of a global address, are
+      // the same.
+      if (!produceSameValue(Def0, Def1, MRI))
+        return false;
+    }
+
+    for (unsigned i = 3, e = MI0->getNumOperands(); i != e; ++i) {
+      // %vreg12<def> = PICLDR %vreg11, 0, pred:14, pred:%noreg
+      const MachineOperand &MO0 = MI0->getOperand(i);
+      const MachineOperand &MO1 = MI1->getOperand(i);
+      if (!MO0.isIdenticalTo(MO1))
+        return false;
+    }
+    return true;
+  }
+
+  return MI0->isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+}
+
+/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+/// determine if two loads are loading from the same base address. It should
+/// only return true if the base pointers are the same and the only differences
+/// between the two addresses is the offset. It also returns the offsets by
+/// reference.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                               int64_t &Offset1,
+                                               int64_t &Offset2) const {
+  // Don't worry about Thumb: just ARM and Thumb2.
+  if (Subtarget.isThumb1Only()) return false;
+
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+
+  switch (Load1->getMachineOpcode()) {
+  default:
+    return false;
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
+  case ARM::LDRD:
+  case ARM::LDRH:
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRDi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRi12:
+  case ARM::t2LDRSHi12:
+    break;
+  }
+
+  switch (Load2->getMachineOpcode()) {
+  default:
+    return false;
+  case ARM::LDRi12:
+  case ARM::LDRBi12:
+  case ARM::LDRD:
+  case ARM::LDRH:
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRi12:
+  case ARM::t2LDRSHi12:
+    break;
+  }
+
+  // Check if base addresses and chain operands match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+
+  // Index should be Reg0.
+  if (Load1->getOperand(3) != Load2->getOperand(3))
+    return false;
+
+  // Determine the offsets.
+  if (isa<ConstantSDNode>(Load1->getOperand(1)) &&
+      isa<ConstantSDNode>(Load2->getOperand(1))) {
+    Offset1 = cast<ConstantSDNode>(Load1->getOperand(1))->getSExtValue();
+    Offset2 = cast<ConstantSDNode>(Load2->getOperand(1))->getSExtValue();
+    return true;
+  }
+
+  return false;
+}
+
+/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+/// be scheduled togther. On some targets if two loads are loading from
+/// addresses in the same cache line, it's better if they are scheduled
+/// together. This function takes two integers that represent the load offsets
+/// from the common base address. It returns true if it decides it's desirable
+/// to schedule the two loads together. "NumLoads" is the number of loads that
+/// have already been scheduled after Load1.
+///
+/// FIXME: remove this in favor of the MachineInstr interface once pre-RA-sched
+/// is permanently disabled.
+bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                               int64_t Offset1, int64_t Offset2,
+                                               unsigned NumLoads) const {
+  // Don't worry about Thumb: just ARM and Thumb2.
+  if (Subtarget.isThumb1Only()) return false;
+
+  assert(Offset2 > Offset1);
+
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  if (Load1->getMachineOpcode() != Load2->getMachineOpcode())
+    return false;  // FIXME: overly conservative?
+
+  // Four loads in a row should be sufficient.
+  if (NumLoads >= 3)
+    return false;
+
+  return true;
+}
+
+bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary. It's necessary to be explicit
+  // due to the special treatment of IT instructions below, otherwise a
+  // dbg_value followed by an IT will result in the IT instruction being
+  // considered a scheduling hazard, which is wrong. It should be the actual
+  // instruction preceding the dbg_value instruction(s), just like it is
+  // when debug info is not present.
+  if (MI->isDebugValue())
+    return false;
+
+  // Terminators and labels can't be scheduled around.
+  if (MI->isTerminator() || MI->isLabel())
+    return true;
+
+  // Treat the start of the IT block as a scheduling boundary, but schedule
+  // t2IT along with all instructions following it.
+  // FIXME: This is a big hammer. But the alternative is to add all potential
+  // true and anti dependencies to IT block instructions as implicit operands
+  // to the t2IT instruction. The added compile time and complexity does not
+  // seem worth it.
+  MachineBasicBlock::const_iterator I = MI;
+  // Make sure to skip any dbg_value instructions
+  while (++I != MBB->end() && I->isDebugValue())
+    ;
+  if (I != MBB->end() && I->getOpcode() == ARM::t2IT)
+    return true;
+
+  // Don't attempt to schedule around any instruction that defines
+  // a stack-oriented pointer, as it's unlikely to be profitable. This
+  // saves compile time, because it doesn't require every single
+  // stack slot reference to depend on the instruction that does the
+  // modification.
+  // Calls don't actually change the stack pointer, even if they have imp-defs.
+  // No ARM calling conventions change the stack pointer. (X86 calling
+  // conventions sometimes do).
+  if (!MI->isCall() && MI->definesRegister(ARM::SP))
+    return true;
+
+  return false;
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles, unsigned ExtraPredCycles,
+                    const BranchProbability &Probability) const {
+  if (!NumCycles)
+    return false;
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  unsigned UnpredCost = Probability.getNumerator() * NumCycles;
+  UnpredCost /= Probability.getDenominator();
+  UnpredCost += 1; // The branch itself
+  UnpredCost += Subtarget.getMispredictionPenalty() / 10;
+
+  return (NumCycles + ExtraPredCycles) <= UnpredCost;
+}
+
+bool ARMBaseInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned TCycles, unsigned TExtra,
+                    MachineBasicBlock &FMBB,
+                    unsigned FCycles, unsigned FExtra,
+                    const BranchProbability &Probability) const {
+  if (!TCycles || !FCycles)
+    return false;
+
+  // Attempt to estimate the relative costs of predication versus branching.
+  unsigned TUnpredCost = Probability.getNumerator() * TCycles;
+  TUnpredCost /= Probability.getDenominator();
+
+  uint32_t Comp = Probability.getDenominator() - Probability.getNumerator();
+  unsigned FUnpredCost = Comp * FCycles;
+  FUnpredCost /= Probability.getDenominator();
+
+  unsigned UnpredCost = TUnpredCost + FUnpredCost;
+  UnpredCost += 1; // The branch itself
+  UnpredCost += Subtarget.getMispredictionPenalty() / 10;
+
+  return (TCycles + FCycles + TExtra + FExtra) <= UnpredCost;
+}
+
+bool
+ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                            MachineBasicBlock &FMBB) const {
+  // Reduce false anti-dependencies to let Swift's out-of-order execution
+  // engine do its thing.
+  return Subtarget.isSwift();
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes
+llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+  int PIdx = MI->findFirstPredOperandIdx();
+  if (PIdx == -1) {
+    PredReg = 0;
+    return ARMCC::AL;
+  }
+
+  PredReg = MI->getOperand(PIdx+1).getReg();
+  return (ARMCC::CondCodes)MI->getOperand(PIdx).getImm();
+}
+
+
+int llvm::getMatchingCondBranchOpcode(int Opc) {
+  if (Opc == ARM::B)
+    return ARM::Bcc;
+  if (Opc == ARM::tB)
+    return ARM::tBcc;
+  if (Opc == ARM::t2B)
+    return ARM::t2Bcc;
+
+  llvm_unreachable("Unknown unconditional branch opcode!");
+}
+
+/// commuteInstruction - Handle commutable instructions.
+MachineInstr *
+ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+  switch (MI->getOpcode()) {
+  case ARM::MOVCCr:
+  case ARM::t2MOVCCr: {
+    // MOVCC can be commuted by inverting the condition.
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);
+    // MOVCC AL can't be inverted. Shouldn't happen.
+    if (CC == ARMCC::AL || PredReg != ARM::CPSR)
+      return NULL;
+    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+    if (!MI)
+      return NULL;
+    // After swapping the MOVCC operands, also invert the condition.
+    MI->getOperand(MI->findFirstPredOperandIdx())
+      .setImm(ARMCC::getOppositeCondition(CC));
+    return MI;
+  }
+  }
+  return TargetInstrInfo::commuteInstruction(MI, NewMI);
+}
+
+/// Identify instructions that can be folded into a MOVCC instruction, and
+/// return the defining instruction.
+static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
+                                      const MachineRegisterInfo &MRI,
+                                      const TargetInstrInfo *TII) {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg))
+    return 0;
+  if (!MRI.hasOneNonDBGUse(Reg))
+    return 0;
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  if (!MI)
+    return 0;
+  // MI is folded into the MOVCC by predicating it.
+  if (!MI->isPredicable())
+    return 0;
+  // Check if MI has any non-dead defs or physreg uses. This also detects
+  // predicated instructions which will be reading CPSR.
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    // Reject frame index operands, PEI can't handle the predicated pseudos.
+    if (MO.isFI() || MO.isCPI() || MO.isJTI())
+      return 0;
+    if (!MO.isReg())
+      continue;
+    // MI can't have any tied operands, that would conflict with predication.
+    if (MO.isTied())
+      return 0;
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+      return 0;
+    if (MO.isDef() && !MO.isDead())
+      return 0;
+  }
+  bool DontMoveAcrossStores = true;
+  if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ 0, DontMoveAcrossStores))
+    return 0;
+  return MI;
+}
+
+bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     unsigned &TrueOp, unsigned &FalseOp,
+                                     bool &Optimizable) const {
+  assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
+         "Unknown select instruction");
+  // MOVCC operands:
+  // 0: Def.
+  // 1: True use.
+  // 2: False use.
+  // 3: Condition code.
+  // 4: CPSR use.
+  TrueOp = 1;
+  FalseOp = 2;
+  Cond.push_back(MI->getOperand(3));
+  Cond.push_back(MI->getOperand(4));
+  // We can always fold a def.
+  Optimizable = true;
+  return false;
+}
+
+MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
+                                               bool PreferFalse) const {
+  assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
+         "Unknown select instruction");
+  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
+  bool Invert = !DefMI;
+  if (!DefMI)
+    DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this);
+  if (!DefMI)
+    return 0;
+
+  // Create a new predicated version of DefMI.
+  // Rfalse is the first use.
+  MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                                      DefMI->getDesc(),
+                                      MI->getOperand(0).getReg());
+
+  // Copy all the DefMI operands, excluding its (null) predicate.
+  const MCInstrDesc &DefDesc = DefMI->getDesc();
+  for (unsigned i = 1, e = DefDesc.getNumOperands();
+       i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
+    NewMI.addOperand(DefMI->getOperand(i));
+
+  unsigned CondCode = MI->getOperand(3).getImm();
+  if (Invert)
+    NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode)));
+  else
+    NewMI.addImm(CondCode);
+  NewMI.addOperand(MI->getOperand(4));
+
+  // DefMI is not the -S version that sets CPSR, so add an optional %noreg.
+  if (NewMI->hasOptionalDef())
+    AddDefaultCC(NewMI);
+
+  // The output register value when the predicate is false is an implicit
+  // register operand tied to the first def.
+  // The tie makes the register allocator ensure the FalseReg is allocated the
+  // same register as operand 0.
+  MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
+  FalseReg.setImplicit();
+  NewMI.addOperand(FalseReg);
+  NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
+
+  // The caller will erase MI, but not DefMI.
+  DefMI->eraseFromParent();
+  return NewMI;
+}
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether the
+/// instruction is encoded with an 'S' bit is determined by the optional CPSR
+/// def operand.
+///
+/// This will go away once we can teach tblgen how to set the optional CPSR def
+/// operand itself.
+struct AddSubFlagsOpcodePair {
+  uint16_t PseudoOpc;
+  uint16_t MachineOpc;
+};
+
+static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+  {ARM::ADDSri, ARM::ADDri},
+  {ARM::ADDSrr, ARM::ADDrr},
+  {ARM::ADDSrsi, ARM::ADDrsi},
+  {ARM::ADDSrsr, ARM::ADDrsr},
+
+  {ARM::SUBSri, ARM::SUBri},
+  {ARM::SUBSrr, ARM::SUBrr},
+  {ARM::SUBSrsi, ARM::SUBrsi},
+  {ARM::SUBSrsr, ARM::SUBrsr},
+
+  {ARM::RSBSri, ARM::RSBri},
+  {ARM::RSBSrsi, ARM::RSBrsi},
+  {ARM::RSBSrsr, ARM::RSBrsr},
+
+  {ARM::t2ADDSri, ARM::t2ADDri},
+  {ARM::t2ADDSrr, ARM::t2ADDrr},
+  {ARM::t2ADDSrs, ARM::t2ADDrs},
+
+  {ARM::t2SUBSri, ARM::t2SUBri},
+  {ARM::t2SUBSrr, ARM::t2SUBrr},
+  {ARM::t2SUBSrs, ARM::t2SUBrs},
+
+  {ARM::t2RSBSri, ARM::t2RSBri},
+  {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
+  for (unsigned i = 0, e = array_lengthof(AddSubFlagsOpcodeMap); i != e; ++i)
+    if (OldOpc == AddSubFlagsOpcodeMap[i].PseudoOpc)
+      return AddSubFlagsOpcodeMap[i].MachineOpc;
+  return 0;
+}
+
+void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                               unsigned DestReg, unsigned BaseReg, int NumBytes,
+                               ARMCC::CondCodes Pred, unsigned PredReg,
+                               const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  while (NumBytes) {
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+    unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
+    assert(ThisVal && "Didn't extract field correctly");
+
+    // We will handle these bits from offset, clear them.
+    NumBytes &= ~ThisVal;
+
+    assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?");
+
+    // Build the new ADD / SUB.
+    unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
+    BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+      .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+      .setMIFlags(MIFlags);
+    BaseReg = DestReg;
+  }
+}
+
+bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                unsigned FrameReg, int &Offset,
+                                const ARMBaseInstrInfo &TII) {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  // Memory operands in inline assembly always use AddrMode2.
+  if (Opcode == ARM::INLINEASM)
+    AddrMode = ARMII::AddrMode2;
+
+  if (Opcode == ARM::ADDri) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+    if (Offset == 0) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::MOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.RemoveOperand(FrameRegIdx+1);
+      Offset = 0;
+      return true;
+    } else if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+      MI.setDesc(TII.get(ARM::SUBri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    if (ARM_AM::getSOImmVal(Offset) != -1) {
+      // Replace the FrameIndex with sp / fp
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, pull as much of the immedidate into this ADDri/SUBri
+    // as possible.
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+
+    // Get the properly encoded SOImmVal field.
+    assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 &&
+           "Bit extraction didn't work?");
+    MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+ } else {
+    unsigned ImmIdx = 0;
+    int InstrOffs = 0;
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    switch (AddrMode) {
+    case ARMII::AddrMode_i12: {
+      ImmIdx = FrameRegIdx + 1;
+      InstrOffs = MI.getOperand(ImmIdx).getImm();
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode2: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode3: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      break;
+    }
+    case ARMII::AddrMode4:
+    case ARMII::AddrMode6:
+      // Can't fold any offset even if it's zero.
+      return false;
+    case ARMII::AddrMode5: {
+      ImmIdx = FrameRegIdx+1;
+      InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      break;
+    }
+    default:
+      llvm_unreachable("Unsupported addressing mode!");
+    }
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+    if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+    }
+
+    // Attempt to fold address comp. if opcode has offset bits
+    if (NumBits > 0) {
+      // Common case: small offset, fits into instruction.
+      MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+      int ImmedOffset = Offset / Scale;
+      unsigned Mask = (1 << NumBits) - 1;
+      if ((unsigned)Offset <= Mask * Scale) {
+        // Replace the FrameIndex with sp
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        // FIXME: When addrmode2 goes away, this will simplify (like the
+        // T2 version), as the LDR.i12 versions don't need the encoding
+        // tricks for the offset value.
+        if (isSub) {
+          if (AddrMode == ARMII::AddrMode_i12)
+            ImmedOffset = -ImmedOffset;
+          else
+            ImmedOffset |= 1 << NumBits;
+        }
+        ImmOp.ChangeToImmediate(ImmedOffset);
+        Offset = 0;
+        return true;
+      }
+
+      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+      ImmedOffset = ImmedOffset & Mask;
+      if (isSub) {
+        if (AddrMode == ARMII::AddrMode_i12)
+          ImmedOffset = -ImmedOffset;
+        else
+          ImmedOffset |= 1 << NumBits;
+      }
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset &= ~(Mask*Scale);
+    }
+  }
+
+  Offset = (isSub) ? -Offset : Offset;
+  return Offset == 0;
+}
+
+/// analyzeCompare - For a comparison instruction, return the source registers
+/// in SrcReg and SrcReg2 if having two register operands, and the value it
+/// compares against in CmpValue. Return true if the comparison instruction
+/// can be analyzed.
+bool ARMBaseInstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+               int &CmpMask, int &CmpValue) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::CMPri:
+  case ARM::t2CMPri:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(1).getImm();
+    return true;
+  case ARM::CMPrr:
+  case ARM::t2CMPrr:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = MI->getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case ARM::TSTri:
+  case ARM::t2TSTri:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = MI->getOperand(1).getImm();
+    CmpValue = 0;
+    return true;
+  }
+
+  return false;
+}
+
+/// isSuitableForMask - Identify a suitable 'and' instruction that
+/// operates on the given source register and applies the same mask
+/// as a 'tst' instruction. Provide a limited look-through for copies.
+/// When successful, MI will hold the found instruction.
+static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
+                              int CmpMask, bool CommonUse) {
+  switch (MI->getOpcode()) {
+    case ARM::ANDri:
+    case ARM::t2ANDri:
+      if (CmpMask != MI->getOperand(2).getImm())
+        return false;
+      if (SrcReg == MI->getOperand(CommonUse ? 1 : 0).getReg())
+        return true;
+      break;
+    case ARM::COPY: {
+      // Walk down one instruction which is potentially an 'and'.
+      const MachineInstr &Copy = *MI;
+      MachineBasicBlock::iterator AND(
+        llvm::next(MachineBasicBlock::iterator(MI)));
+      if (AND == MI->getParent()->end()) return false;
+      MI = AND;
+      return isSuitableForMask(MI, Copy.getOperand(0).getReg(),
+                               CmpMask, true);
+    }
+  }
+
+  return false;
+}
+
+/// getSwappedCondition - assume the flags are set by MI(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by MI(b,a).
+inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) {
+  switch (CC) {
+  default: return ARMCC::AL;
+  case ARMCC::EQ: return ARMCC::EQ;
+  case ARMCC::NE: return ARMCC::NE;
+  case ARMCC::HS: return ARMCC::LS;
+  case ARMCC::LO: return ARMCC::HI;
+  case ARMCC::HI: return ARMCC::LO;
+  case ARMCC::LS: return ARMCC::HS;
+  case ARMCC::GE: return ARMCC::LE;
+  case ARMCC::LT: return ARMCC::GT;
+  case ARMCC::GT: return ARMCC::LT;
+  case ARMCC::LE: return ARMCC::GE;
+  }
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// CMPri can be made redundant by SUBri if the operands are the same.
+/// This function can be extended later on.
+inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
+                                        unsigned SrcReg2, int ImmValue,
+                                        MachineInstr *OI) {
+  if ((CmpI->getOpcode() == ARM::CMPrr ||
+       CmpI->getOpcode() == ARM::t2CMPrr) &&
+      (OI->getOpcode() == ARM::SUBrr ||
+       OI->getOpcode() == ARM::t2SUBrr) &&
+      ((OI->getOperand(1).getReg() == SrcReg &&
+        OI->getOperand(2).getReg() == SrcReg2) ||
+       (OI->getOperand(1).getReg() == SrcReg2 &&
+        OI->getOperand(2).getReg() == SrcReg)))
+    return true;
+
+  if ((CmpI->getOpcode() == ARM::CMPri ||
+       CmpI->getOpcode() == ARM::t2CMPri) &&
+      (OI->getOpcode() == ARM::SUBri ||
+       OI->getOpcode() == ARM::t2SUBri) &&
+      OI->getOperand(1).getReg() == SrcReg &&
+      OI->getOperand(2).getImm() == ImmValue)
+    return true;
+  return false;
+}
+
+/// optimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register;
+/// Remove a redundant Compare instruction if an earlier instruction can set the
+/// flags in the same way as Compare.
+/// E.g. SUBrr(r1,r2) and CMPrr(r1,r2). We also handle the case where two
+/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
+/// condition code of instructions which use the flags.
+bool ARMBaseInstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+                     int CmpMask, int CmpValue,
+                     const MachineRegisterInfo *MRI) const {
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+
+  // Masked compares sometimes use the same register as the corresponding 'and'.
+  if (CmpMask != ~0) {
+    if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) {
+      MI = 0;
+      for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg),
+           UE = MRI->use_end(); UI != UE; ++UI) {
+        if (UI->getParent() != CmpInstr->getParent()) continue;
+        MachineInstr *PotentialAND = &*UI;
+        if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
+            isPredicated(PotentialAND))
+          continue;
+        MI = PotentialAND;
+        break;
+      }
+      if (!MI) return false;
+    }
+  }
+
+  // Get ready to iterate backward from CmpInstr.
+  MachineBasicBlock::iterator I = CmpInstr, E = MI,
+                              B = CmpInstr->getParent()->begin();
+
+  // Early exit if CmpInstr is at the beginning of the BB.
+  if (I == B) return false;
+
+  // There are two possible candidates which can be changed to set CPSR:
+  // One is MI, the other is a SUB instruction.
+  // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
+  // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
+  MachineInstr *Sub = NULL;
+  if (SrcReg2 != 0)
+    // MI is not a candidate for CMPrr.
+    MI = NULL;
+  else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) {
+    // Conservatively refuse to convert an instruction which isn't in the same
+    // BB as the comparison.
+    // For CMPri, we need to check Sub, thus we can't return here.
+    if (CmpInstr->getOpcode() == ARM::CMPri ||
+       CmpInstr->getOpcode() == ARM::t2CMPri)
+      MI = NULL;
+    else
+      return false;
+  }
+
+  // Check that CPSR isn't set between the comparison instruction and the one we
+  // want to change. At the same time, search for Sub.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  --I;
+  for (; I != E; --I) {
+    const MachineInstr &Instr = *I;
+
+    if (Instr.modifiesRegister(ARM::CPSR, TRI) ||
+        Instr.readsRegister(ARM::CPSR, TRI))
+      // This instruction modifies or uses CPSR after the one we want to
+      // change. We can't do this transformation.
+      return false;
+
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, &*I)) {
+      Sub = &*I;
+      break;
+    }
+
+    if (I == B)
+      // The 'and' is below the comparison instruction.
+      return false;
+  }
+
+  // Return false if no candidates exist.
+  if (!MI && !Sub)
+    return false;
+
+  // The single candidate is called MI.
+  if (!MI) MI = Sub;
+
+  // We can't use a predicated instruction - it doesn't always write the flags.
+  if (isPredicated(MI))
+    return false;
+
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::RSBrr:
+  case ARM::RSBri:
+  case ARM::RSCrr:
+  case ARM::RSCri:
+  case ARM::ADDrr:
+  case ARM::ADDri:
+  case ARM::ADCrr:
+  case ARM::ADCri:
+  case ARM::SUBrr:
+  case ARM::SUBri:
+  case ARM::SBCrr:
+  case ARM::SBCri:
+  case ARM::t2RSBri:
+  case ARM::t2ADDrr:
+  case ARM::t2ADDri:
+  case ARM::t2ADCrr:
+  case ARM::t2ADCri:
+  case ARM::t2SUBrr:
+  case ARM::t2SUBri:
+  case ARM::t2SBCrr:
+  case ARM::t2SBCri:
+  case ARM::ANDrr:
+  case ARM::ANDri:
+  case ARM::t2ANDrr:
+  case ARM::t2ANDri:
+  case ARM::ORRrr:
+  case ARM::ORRri:
+  case ARM::t2ORRrr:
+  case ARM::t2ORRri:
+  case ARM::EORrr:
+  case ARM::EORri:
+  case ARM::t2EORrr:
+  case ARM::t2EORri: {
+    // Scan forward for the use of CPSR
+    // When checking against MI: if it's a conditional code requires
+    // checking of V bit, then this is not safe to do.
+    // It is safe to remove CmpInstr if CPSR is redefined or killed.
+    // If we are done with the basic block, we need to check whether CPSR is
+    // live-out.
+    SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4>
+        OperandsToUpdate;
+    bool isSafe = false;
+    I = CmpInstr;
+    E = CmpInstr->getParent()->end();
+    while (!isSafe && ++I != E) {
+      const MachineInstr &Instr = *I;
+      for (unsigned IO = 0, EO = Instr.getNumOperands();
+           !isSafe && IO != EO; ++IO) {
+        const MachineOperand &MO = Instr.getOperand(IO);
+        if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) {
+          isSafe = true;
+          break;
+        }
+        if (!MO.isReg() || MO.getReg() != ARM::CPSR)
+          continue;
+        if (MO.isDef()) {
+          isSafe = true;
+          break;
+        }
+        // Condition code is after the operand before CPSR.
+        ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm();
+        if (Sub) {
+          ARMCC::CondCodes NewCC = getSwappedCondition(CC);
+          if (NewCC == ARMCC::AL)
+            return false;
+          // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
+          // on CMP needs to be updated to be based on SUB.
+          // Push the condition code operands to OperandsToUpdate.
+          // If it is safe to remove CmpInstr, the condition code of these
+          // operands will be modified.
+          if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+              Sub->getOperand(2).getReg() == SrcReg)
+            OperandsToUpdate.push_back(std::make_pair(&((*I).getOperand(IO-1)),
+                                                      NewCC));
+        }
+        else
+          switch (CC) {
+          default:
+            // CPSR can be used multiple times, we should continue.
+            break;
+          case ARMCC::VS:
+          case ARMCC::VC:
+          case ARMCC::GE:
+          case ARMCC::LT:
+          case ARMCC::GT:
+          case ARMCC::LE:
+            return false;
+          }
+      }
+    }
+
+    // If CPSR is not killed nor re-defined, we should check whether it is
+    // live-out. If it is live-out, do not optimize.
+    if (!isSafe) {
+      MachineBasicBlock *MBB = CmpInstr->getParent();
+      for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+               SE = MBB->succ_end(); SI != SE; ++SI)
+        if ((*SI)->isLiveIn(ARM::CPSR))
+          return false;
+    }
+
+    // Toggle the optional operand to CPSR.
+    MI->getOperand(5).setReg(ARM::CPSR);
+    MI->getOperand(5).setIsDef(true);
+    assert(!isPredicated(MI) && "Can't use flags from predicated instruction");
+    CmpInstr->eraseFromParent();
+
+    // Modify the condition code of operands in OperandsToUpdate.
+    // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+    // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+    for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
+      OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
+    return true;
+  }
+  }
+
+  return false;
+}
+
+bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
+                                     MachineInstr *DefMI, unsigned Reg,
+                                     MachineRegisterInfo *MRI) const {
+  // Fold large immediates into add, sub, or, xor.
+  unsigned DefOpc = DefMI->getOpcode();
+  if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm)
+    return false;
+  if (!DefMI->getOperand(1).isImm())
+    // Could be t2MOVi32imm <ga:xx>
+    return false;
+
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  const MCInstrDesc &DefMCID = DefMI->getDesc();
+  if (DefMCID.hasOptionalDef()) {
+    unsigned NumOps = DefMCID.getNumOperands();
+    const MachineOperand &MO = DefMI->getOperand(NumOps-1);
+    if (MO.getReg() == ARM::CPSR && !MO.isDead())
+      // If DefMI defines CPSR and it is not dead, it's obviously not safe
+      // to delete DefMI.
+      return false;
+  }
+
+  const MCInstrDesc &UseMCID = UseMI->getDesc();
+  if (UseMCID.hasOptionalDef()) {
+    unsigned NumOps = UseMCID.getNumOperands();
+    if (UseMI->getOperand(NumOps-1).getReg() == ARM::CPSR)
+      // If the instruction sets the flag, do not attempt this optimization
+      // since it may change the semantics of the code.
+      return false;
+  }
+
+  unsigned UseOpc = UseMI->getOpcode();
+  unsigned NewUseOpc = 0;
+  uint32_t ImmVal = (uint32_t)DefMI->getOperand(1).getImm();
+  uint32_t SOImmValV1 = 0, SOImmValV2 = 0;
+  bool Commute = false;
+  switch (UseOpc) {
+  default: return false;
+  case ARM::SUBrr:
+  case ARM::ADDrr:
+  case ARM::ORRrr:
+  case ARM::EORrr:
+  case ARM::t2SUBrr:
+  case ARM::t2ADDrr:
+  case ARM::t2ORRrr:
+  case ARM::t2EORrr: {
+    Commute = UseMI->getOperand(2).getReg() != Reg;
+    switch (UseOpc) {
+    default: break;
+    case ARM::SUBrr: {
+      if (Commute)
+        return false;
+      ImmVal = -ImmVal;
+      NewUseOpc = ARM::SUBri;
+      // Fallthrough
+    }
+    case ARM::ADDrr:
+    case ARM::ORRrr:
+    case ARM::EORrr: {
+      if (!ARM_AM::isSOImmTwoPartVal(ImmVal))
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
+      switch (UseOpc) {
+      default: break;
+      case ARM::ADDrr: NewUseOpc = ARM::ADDri; break;
+      case ARM::ORRrr: NewUseOpc = ARM::ORRri; break;
+      case ARM::EORrr: NewUseOpc = ARM::EORri; break;
+      }
+      break;
+    }
+    case ARM::t2SUBrr: {
+      if (Commute)
+        return false;
+      ImmVal = -ImmVal;
+      NewUseOpc = ARM::t2SUBri;
+      // Fallthrough
+    }
+    case ARM::t2ADDrr:
+    case ARM::t2ORRrr:
+    case ARM::t2EORrr: {
+      if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
+        return false;
+      SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
+      SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
+      switch (UseOpc) {
+      default: break;
+      case ARM::t2ADDrr: NewUseOpc = ARM::t2ADDri; break;
+      case ARM::t2ORRrr: NewUseOpc = ARM::t2ORRri; break;
+      case ARM::t2EORrr: NewUseOpc = ARM::t2EORri; break;
+      }
+      break;
+    }
+    }
+  }
+  }
+
+  unsigned OpIdx = Commute ? 2 : 1;
+  unsigned Reg1 = UseMI->getOperand(OpIdx).getReg();
+  bool isKill = UseMI->getOperand(OpIdx).isKill();
+  unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+  AddDefaultCC(AddDefaultPred(BuildMI(*UseMI->getParent(),
+                                      UseMI, UseMI->getDebugLoc(),
+                                      get(NewUseOpc), NewReg)
+                              .addReg(Reg1, getKillRegState(isKill))
+                              .addImm(SOImmValV1)));
+  UseMI->setDesc(get(NewUseOpc));
+  UseMI->getOperand(1).setReg(NewReg);
+  UseMI->getOperand(1).setIsKill();
+  UseMI->getOperand(2).ChangeToImmediate(SOImmValV2);
+  DefMI->eraseFromParent();
+  return true;
+}
+
+static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
+                                        const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: {
+    const MCInstrDesc &Desc = MI->getDesc();
+    int UOps = ItinData->getNumMicroOps(Desc.getSchedClass());
+    assert(UOps >= 0 && "bad # UOps");
+    return UOps;
+  }
+
+  case ARM::LDRrs:
+  case ARM::LDRBrs:
+  case ARM::STRrs:
+  case ARM::STRBrs: {
+    unsigned ShOpVal = MI->getOperand(3).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 1;
+    return 2;
+  }
+
+  case ARM::LDRH:
+  case ARM::STRH: {
+    if (!MI->getOperand(2).getReg())
+      return 1;
+
+    unsigned ShOpVal = MI->getOperand(3).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 1;
+    return 2;
+  }
+
+  case ARM::LDRSB:
+  case ARM::LDRSH:
+    return (ARM_AM::getAM3Op(MI->getOperand(3).getImm()) == ARM_AM::sub) ? 3:2;
+
+  case ARM::LDRSB_POST:
+  case ARM::LDRSH_POST: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    return (Rt == Rm) ? 4 : 3;
+  }
+
+  case ARM::LDR_PRE_REG:
+  case ARM::LDRB_PRE_REG: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (Rt == Rm)
+      return 3;
+    unsigned ShOpVal = MI->getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 2;
+    return 3;
+  }
+
+  case ARM::STR_PRE_REG:
+  case ARM::STRB_PRE_REG: {
+    unsigned ShOpVal = MI->getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 2;
+    return 3;
+  }
+
+  case ARM::LDRH_PRE:
+  case ARM::STRH_PRE: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (!Rm)
+      return 2;
+    if (Rt == Rm)
+      return 3;
+    return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub)
+      ? 3 : 2;
+  }
+
+  case ARM::LDR_POST_REG:
+  case ARM::LDRB_POST_REG:
+  case ARM::LDRH_POST: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    return (Rt == Rm) ? 3 : 2;
+  }
+
+  case ARM::LDR_PRE_IMM:
+  case ARM::LDRB_PRE_IMM:
+  case ARM::LDR_POST_IMM:
+  case ARM::LDRB_POST_IMM:
+  case ARM::STRB_POST_IMM:
+  case ARM::STRB_POST_REG:
+  case ARM::STRB_PRE_IMM:
+  case ARM::STRH_POST:
+  case ARM::STR_POST_IMM:
+  case ARM::STR_POST_REG:
+  case ARM::STR_PRE_IMM:
+    return 2;
+
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSH_PRE: {
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (Rm == 0)
+      return 3;
+    unsigned Rt = MI->getOperand(0).getReg();
+    if (Rt == Rm)
+      return 4;
+    unsigned ShOpVal = MI->getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+    unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+    if (!isSub &&
+        (ShImm == 0 ||
+         ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+          ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+      return 3;
+    return 4;
+  }
+
+  case ARM::LDRD: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rn = MI->getOperand(2).getReg();
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+    return (Rt == Rn) ? 3 : 2;
+  }
+
+  case ARM::STRD: {
+    unsigned Rm = MI->getOperand(3).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+    return 2;
+  }
+
+  case ARM::LDRD_POST:
+  case ARM::t2LDRD_POST:
+    return 3;
+
+  case ARM::STRD_POST:
+  case ARM::t2STRD_POST:
+    return 4;
+
+  case ARM::LDRD_PRE: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rn = MI->getOperand(3).getReg();
+    unsigned Rm = MI->getOperand(4).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+    return (Rt == Rn) ? 4 : 3;
+  }
+
+  case ARM::t2LDRD_PRE: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rn = MI->getOperand(3).getReg();
+    return (Rt == Rn) ? 4 : 3;
+  }
+
+  case ARM::STRD_PRE: {
+    unsigned Rm = MI->getOperand(4).getReg();
+    if (Rm)
+      return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+    return 3;
+  }
+
+  case ARM::t2STRD_PRE:
+    return 3;
+
+  case ARM::t2LDR_POST:
+  case ARM::t2LDRB_POST:
+  case ARM::t2LDRB_PRE:
+  case ARM::t2LDRSBi12:
+  case ARM::t2LDRSBi8:
+  case ARM::t2LDRSBpci:
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRH_POST:
+  case ARM::t2LDRH_PRE:
+  case ARM::t2LDRSBT:
+  case ARM::t2LDRSB_POST:
+  case ARM::t2LDRSB_PRE:
+  case ARM::t2LDRSH_POST:
+  case ARM::t2LDRSH_PRE:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSHpci:
+  case ARM::t2LDRSHs:
+    return 2;
+
+  case ARM::t2LDRDi8: {
+    unsigned Rt = MI->getOperand(0).getReg();
+    unsigned Rn = MI->getOperand(2).getReg();
+    return (Rt == Rn) ? 3 : 2;
+  }
+
+  case ARM::t2STRB_POST:
+  case ARM::t2STRB_PRE:
+  case ARM::t2STRBs:
+  case ARM::t2STRDi8:
+  case ARM::t2STRH_POST:
+  case ARM::t2STRH_PRE:
+  case ARM::t2STRHs:
+  case ARM::t2STR_POST:
+  case ARM::t2STR_PRE:
+  case ARM::t2STRs:
+    return 2;
+  }
+}
+
+// Return the number of 32-bit words loaded by LDM or stored by STM. If this
+// can't be easily determined return 0 (missing MachineMemOperand).
+//
+// FIXME: The current MachineInstr design does not support relying on machine
+// mem operands to determine the width of a memory access. Instead, we expect
+// the target to provide this information based on the instruction opcode and
+// operands. However, using MachineMemOperand is a the best solution now for
+// two reasons:
+//
+// 1) getNumMicroOps tries to infer LDM memory width from the total number of MI
+// operands. This is much more dangerous than using the MachineMemOperand
+// sizes because CodeGen passes can insert/remove optional machine operands. In
+// fact, it's totally incorrect for preRA passes and appears to be wrong for
+// postRA passes as well.
+//
+// 2) getNumLDMAddresses is only used by the scheduling machine model and any
+// machine model that calls this should handle the unknown (zero size) case.
+//
+// Long term, we should require a target hook that verifies MachineMemOperand
+// sizes during MC lowering. That target hook should be local to MC lowering
+// because we can't ensure that it is aware of other MI forms. Doing this will
+// ensure that MachineMemOperands are correctly propagated through all passes.
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const {
+  unsigned Size = 0;
+  for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+         E = MI->memoperands_end(); I != E; ++I) {
+    Size += (*I)->getSize();
+  }
+  return Size / 4;
+}
+
+unsigned
+ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+                                 const MachineInstr *MI) const {
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned Class = Desc.getSchedClass();
+  int ItinUOps = ItinData->getNumMicroOps(Class);
+  if (ItinUOps >= 0) {
+    if (Subtarget.isSwift() && (Desc.mayLoad() || Desc.mayStore()))
+      return getNumMicroOpsSwiftLdSt(ItinData, MI);
+
+    return ItinUOps;
+  }
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unexpected multi-uops instruction!");
+  case ARM::VLDMQIA:
+  case ARM::VSTMQIA:
+    return 2;
+
+  // The number of uOps for load / store multiple are determined by the number
+  // registers.
+  //
+  // On Cortex-A8, each pair of register loads / stores can be scheduled on the
+  // same cycle. The scheduling for the first load / store must be done
+  // separately by assuming the address is not 64-bit aligned.
+  //
+  // On Cortex-A9, the formula is simply (#reg / 2) + (#reg % 2). If the address
+  // is not 64-bit aligned, then AGU would take an extra cycle.  For VFP / NEON
+  // load / store multiple, the formula is (#reg / 2) + (#reg % 2) + 1.
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD: {
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands();
+    return (NumRegs / 2) + (NumRegs % 2) + 1;
+  }
+
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::tPUSH:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
+    if (Subtarget.isSwift()) {
+      int UOps = 1 + NumRegs;  // One for address computation, one for each ld / st.
+      switch (Opc) {
+      default: break;
+      case ARM::VLDMDIA_UPD:
+      case ARM::VLDMDDB_UPD:
+      case ARM::VLDMSIA_UPD:
+      case ARM::VLDMSDB_UPD:
+      case ARM::VSTMDIA_UPD:
+      case ARM::VSTMDDB_UPD:
+      case ARM::VSTMSIA_UPD:
+      case ARM::VSTMSDB_UPD:
+      case ARM::LDMIA_UPD:
+      case ARM::LDMDA_UPD:
+      case ARM::LDMDB_UPD:
+      case ARM::LDMIB_UPD:
+      case ARM::STMIA_UPD:
+      case ARM::STMDA_UPD:
+      case ARM::STMDB_UPD:
+      case ARM::STMIB_UPD:
+      case ARM::tLDMIA_UPD:
+      case ARM::tSTMIA_UPD:
+      case ARM::t2LDMIA_UPD:
+      case ARM::t2LDMDB_UPD:
+      case ARM::t2STMIA_UPD:
+      case ARM::t2STMDB_UPD:
+        ++UOps; // One for base register writeback.
+        break;
+      case ARM::LDMIA_RET:
+      case ARM::tPOP_RET:
+      case ARM::t2LDMIA_RET:
+        UOps += 2; // One for base reg wb, one for write to pc.
+        break;
+      }
+      return UOps;
+    } else if (Subtarget.isCortexA8()) {
+      if (NumRegs < 4)
+        return 2;
+      // 4 registers would be issued: 2, 2.
+      // 5 registers would be issued: 2, 2, 1.
+      int A8UOps = (NumRegs / 2);
+      if (NumRegs % 2)
+        ++A8UOps;
+      return A8UOps;
+    } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+      int A9UOps = (NumRegs / 2);
+      // If there are odd number of registers or if it's not 64-bit aligned,
+      // then it takes an extra AGU (Address Generation Unit) cycle.
+      if ((NumRegs % 2) ||
+          !MI->hasOneMemOperand() ||
+          (*MI->memoperands_begin())->getAlignment() < 8)
+        ++A9UOps;
+      return A9UOps;
+    } else {
+      // Assume the worst.
+      return NumRegs;
+    }
+  }
+  }
+}
+
+int
+ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &DefMCID,
+                                  unsigned DefClass,
+                                  unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8()) {
+    // (regno / 2) + (regno % 2) + 1
+    DefCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++DefCycle;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    DefCycle = RegNo;
+    bool isSLoad = false;
+
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::VLDMSIA:
+    case ARM::VLDMSIA_UPD:
+    case ARM::VLDMSDB_UPD:
+      isSLoad = true;
+      break;
+    }
+
+    // If there are odd number of 'S' registers or if it's not 64-bit aligned,
+    // then it takes an extra cycle.
+    if ((isSLoad && (RegNo % 2)) || DefAlign < 8)
+      ++DefCycle;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &DefMCID,
+                                 unsigned DefClass,
+                                 unsigned DefIdx, unsigned DefAlign) const {
+  int RegNo = (int)(DefIdx+1) - DefMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    // Def is the address writeback.
+    return ItinData->getOperandCycle(DefClass, DefIdx);
+
+  int DefCycle;
+  if (Subtarget.isCortexA8()) {
+    // 4 registers would be issued: 1, 2, 1.
+    // 5 registers would be issued: 1, 2, 2.
+    DefCycle = RegNo / 2;
+    if (DefCycle < 1)
+      DefCycle = 1;
+    // Result latency is issue cycle + 2: E2.
+    DefCycle += 2;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    DefCycle = (RegNo / 2);
+    // If there are odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+    if ((RegNo % 2) || DefAlign < 8)
+      ++DefCycle;
+    // Result latency is AGU cycles + 2.
+    DefCycle += 2;
+  } else {
+    // Assume the worst.
+    DefCycle = RegNo + 2;
+  }
+
+  return DefCycle;
+}
+
+int
+ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
+                                  const MCInstrDesc &UseMCID,
+                                  unsigned UseClass,
+                                  unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8()) {
+    // (regno / 2) + (regno % 2) + 1
+    UseCycle = RegNo / 2 + 1;
+    if (RegNo % 2)
+      ++UseCycle;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    UseCycle = RegNo;
+    bool isSStore = false;
+
+    switch (UseMCID.getOpcode()) {
+    default: break;
+    case ARM::VSTMSIA:
+    case ARM::VSTMSIA_UPD:
+    case ARM::VSTMSDB_UPD:
+      isSStore = true;
+      break;
+    }
+
+    // If there are odd number of 'S' registers or if it's not 64-bit aligned,
+    // then it takes an extra cycle.
+    if ((isSStore && (RegNo % 2)) || UseAlign < 8)
+      ++UseCycle;
+  } else {
+    // Assume the worst.
+    UseCycle = RegNo + 2;
+  }
+
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
+                                 const MCInstrDesc &UseMCID,
+                                 unsigned UseClass,
+                                 unsigned UseIdx, unsigned UseAlign) const {
+  int RegNo = (int)(UseIdx+1) - UseMCID.getNumOperands() + 1;
+  if (RegNo <= 0)
+    return ItinData->getOperandCycle(UseClass, UseIdx);
+
+  int UseCycle;
+  if (Subtarget.isCortexA8()) {
+    UseCycle = RegNo / 2;
+    if (UseCycle < 2)
+      UseCycle = 2;
+    // Read in E3.
+    UseCycle += 2;
+  } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
+    UseCycle = (RegNo / 2);
+    // If there are odd number of registers or if it's not 64-bit aligned,
+    // then it takes an extra AGU (Address Generation Unit) cycle.
+    if ((RegNo % 2) || UseAlign < 8)
+      ++UseCycle;
+  } else {
+    // Assume the worst.
+    UseCycle = 1;
+  }
+  return UseCycle;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const MCInstrDesc &DefMCID,
+                                    unsigned DefIdx, unsigned DefAlign,
+                                    const MCInstrDesc &UseMCID,
+                                    unsigned UseIdx, unsigned UseAlign) const {
+  unsigned DefClass = DefMCID.getSchedClass();
+  unsigned UseClass = UseMCID.getSchedClass();
+
+  if (DefIdx < DefMCID.getNumDefs() && UseIdx < UseMCID.getNumOperands())
+    return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+
+  // This may be a def / use of a variable_ops instruction, the operand
+  // latency might be determinable dynamically. Let the target try to
+  // figure it out.
+  int DefCycle = -1;
+  bool LdmBypass = false;
+  switch (DefMCID.getOpcode()) {
+  default:
+    DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+    break;
+
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+    DefCycle = getVLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
+    break;
+
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tPUSH:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+    LdmBypass = 1;
+    DefCycle = getLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
+    break;
+  }
+
+  if (DefCycle == -1)
+    // We can't seem to determine the result latency of the def, assume it's 2.
+    DefCycle = 2;
+
+  int UseCycle = -1;
+  switch (UseMCID.getOpcode()) {
+  default:
+    UseCycle = ItinData->getOperandCycle(UseClass, UseIdx);
+    break;
+
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD:
+    UseCycle = getVSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign);
+    break;
+
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::tPOP_RET:
+  case ARM::tPOP:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    UseCycle = getSTMUseCycle(ItinData, UseMCID, UseClass, UseIdx, UseAlign);
+    break;
+  }
+
+  if (UseCycle == -1)
+    // Assume it's read in the first stage.
+    UseCycle = 1;
+
+  UseCycle = DefCycle - UseCycle + 1;
+  if (UseCycle > 0) {
+    if (LdmBypass) {
+      // It's a variable_ops instruction so we can't use DefIdx here. Just use
+      // first def operand.
+      if (ItinData->hasPipelineForwarding(DefClass, DefMCID.getNumOperands()-1,
+                                          UseClass, UseIdx))
+        --UseCycle;
+    } else if (ItinData->hasPipelineForwarding(DefClass, DefIdx,
+                                               UseClass, UseIdx)) {
+      --UseCycle;
+    }
+  }
+
+  return UseCycle;
+}
+
+static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI,
+                                           const MachineInstr *MI, unsigned Reg,
+                                           unsigned &DefIdx, unsigned &Dist) {
+  Dist = 0;
+
+  MachineBasicBlock::const_iterator I = MI; ++I;
+  MachineBasicBlock::const_instr_iterator II =
+    llvm::prior(I.getInstrIterator());
+  assert(II->isInsideBundle() && "Empty bundle?");
+
+  int Idx = -1;
+  while (II->isInsideBundle()) {
+    Idx = II->findRegisterDefOperandIdx(Reg, false, true, TRI);
+    if (Idx != -1)
+      break;
+    --II;
+    ++Dist;
+  }
+
+  assert(Idx != -1 && "Cannot find bundled definition!");
+  DefIdx = Idx;
+  return II;
+}
+
+static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
+                                           const MachineInstr *MI, unsigned Reg,
+                                           unsigned &UseIdx, unsigned &Dist) {
+  Dist = 0;
+
+  MachineBasicBlock::const_instr_iterator II = MI; ++II;
+  assert(II->isInsideBundle() && "Empty bundle?");
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+
+  // FIXME: This doesn't properly handle multiple uses.
+  int Idx = -1;
+  while (II != E && II->isInsideBundle()) {
+    Idx = II->findRegisterUseOperandIdx(Reg, false, TRI);
+    if (Idx != -1)
+      break;
+    if (II->getOpcode() != ARM::t2IT)
+      ++Dist;
+    ++II;
+  }
+
+  if (Idx == -1) {
+    Dist = 0;
+    return 0;
+  }
+
+  UseIdx = Idx;
+  return II;
+}
+
+/// Return the number of cycles to add to (or subtract from) the static
+/// itinerary based on the def opcode and alignment. The caller will ensure that
+/// adjusted latency is at least one cycle.
+static int adjustDefLatency(const ARMSubtarget &Subtarget,
+                            const MachineInstr *DefMI,
+                            const MCInstrDesc *DefMCID, unsigned DefAlign) {
+  int Adjust = 0;
+  if (Subtarget.isCortexA8() || Subtarget.isLikeA9()) {
+    // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+    // variants are one cycle cheaper.
+    switch (DefMCID->getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal = DefMI->getOperand(3).getImm();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        --Adjust;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt = DefMI->getOperand(3).getImm();
+      if (ShAmt == 0 || ShAmt == 2)
+        --Adjust;
+      break;
+    }
+    }
+  } else if (Subtarget.isSwift()) {
+    // FIXME: Properly handle all of the latency adjustments for address
+    // writeback.
+    switch (DefMCID->getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal = DefMI->getOperand(3).getImm();
+      bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (!isSub &&
+          (ShImm == 0 ||
+           ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+            ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+        Adjust -= 2;
+      else if (!isSub &&
+               ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+        --Adjust;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt = DefMI->getOperand(3).getImm();
+      if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3)
+        Adjust -= 2;
+      break;
+    }
+    }
+  }
+
+  if (DefAlign < 8 && Subtarget.isLikeA9()) {
+    switch (DefMCID->getOpcode()) {
+    default: break;
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2q8:
+    case ARM::VLD2q16:
+    case ARM::VLD2q32:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8wb_fixed:
+    case ARM::VLD2q16wb_fixed:
+    case ARM::VLD2q32wb_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8wb_register:
+    case ARM::VLD2q16wb_register:
+    case ARM::VLD2q32wb_register:
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD1d64T:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD1d64Twb_fixed:
+    case ARM::VLD1d64Twb_register:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD1d64Q:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+    case ARM::VLD1d64Qwb_fixed:
+    case ARM::VLD1d64Qwb_register:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+    case ARM::VLD1DUPq8:
+    case ARM::VLD1DUPq16:
+    case ARM::VLD1DUPq32:
+    case ARM::VLD1DUPq8wb_fixed:
+    case ARM::VLD1DUPq16wb_fixed:
+    case ARM::VLD1DUPq32wb_fixed:
+    case ARM::VLD1DUPq8wb_register:
+    case ARM::VLD1DUPq16wb_register:
+    case ARM::VLD1DUPq32wb_register:
+    case ARM::VLD2DUPd8:
+    case ARM::VLD2DUPd16:
+    case ARM::VLD2DUPd32:
+    case ARM::VLD2DUPd8wb_fixed:
+    case ARM::VLD2DUPd16wb_fixed:
+    case ARM::VLD2DUPd32wb_fixed:
+    case ARM::VLD2DUPd8wb_register:
+    case ARM::VLD2DUPd16wb_register:
+    case ARM::VLD2DUPd32wb_register:
+    case ARM::VLD4DUPd8:
+    case ARM::VLD4DUPd16:
+    case ARM::VLD4DUPd32:
+    case ARM::VLD4DUPd8_UPD:
+    case ARM::VLD4DUPd16_UPD:
+    case ARM::VLD4DUPd32_UPD:
+    case ARM::VLD1LNd8:
+    case ARM::VLD1LNd16:
+    case ARM::VLD1LNd32:
+    case ARM::VLD1LNd8_UPD:
+    case ARM::VLD1LNd16_UPD:
+    case ARM::VLD1LNd32_UPD:
+    case ARM::VLD2LNd8:
+    case ARM::VLD2LNd16:
+    case ARM::VLD2LNd32:
+    case ARM::VLD2LNq16:
+    case ARM::VLD2LNq32:
+    case ARM::VLD2LNd8_UPD:
+    case ARM::VLD2LNd16_UPD:
+    case ARM::VLD2LNd32_UPD:
+    case ARM::VLD2LNq16_UPD:
+    case ARM::VLD2LNq32_UPD:
+    case ARM::VLD4LNd8:
+    case ARM::VLD4LNd16:
+    case ARM::VLD4LNd32:
+    case ARM::VLD4LNq16:
+    case ARM::VLD4LNq32:
+    case ARM::VLD4LNd8_UPD:
+    case ARM::VLD4LNd16_UPD:
+    case ARM::VLD4LNd32_UPD:
+    case ARM::VLD4LNq16_UPD:
+    case ARM::VLD4LNq32_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increases by one.
+      ++Adjust;
+      break;
+    }
+  }
+  return Adjust;
+}
+
+
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    const MachineInstr *DefMI, unsigned DefIdx,
+                                    const MachineInstr *UseMI,
+                                    unsigned UseIdx) const {
+  // No operand latency. The caller may fall back to getInstrLatency.
+  if (!ItinData || ItinData->isEmpty())
+    return -1;
+
+  const MachineOperand &DefMO = DefMI->getOperand(DefIdx);
+  unsigned Reg = DefMO.getReg();
+  const MCInstrDesc *DefMCID = &DefMI->getDesc();
+  const MCInstrDesc *UseMCID = &UseMI->getDesc();
+
+  unsigned DefAdj = 0;
+  if (DefMI->isBundle()) {
+    DefMI = getBundledDefMI(&getRegisterInfo(), DefMI, Reg, DefIdx, DefAdj);
+    DefMCID = &DefMI->getDesc();
+  }
+  if (DefMI->isCopyLike() || DefMI->isInsertSubreg() ||
+      DefMI->isRegSequence() || DefMI->isImplicitDef()) {
+    return 1;
+  }
+
+  unsigned UseAdj = 0;
+  if (UseMI->isBundle()) {
+    unsigned NewUseIdx;
+    const MachineInstr *NewUseMI = getBundledUseMI(&getRegisterInfo(), UseMI,
+                                                   Reg, NewUseIdx, UseAdj);
+    if (!NewUseMI)
+      return -1;
+
+    UseMI = NewUseMI;
+    UseIdx = NewUseIdx;
+    UseMCID = &UseMI->getDesc();
+  }
+
+  if (Reg == ARM::CPSR) {
+    if (DefMI->getOpcode() == ARM::FMSTAT) {
+      // fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
+      return Subtarget.isLikeA9() ? 1 : 20;
+    }
+
+    // CPSR set and branch can be paired in the same cycle.
+    if (UseMI->isBranch())
+      return 0;
+
+    // Otherwise it takes the instruction latency (generally one).
+    unsigned Latency = getInstrLatency(ItinData, DefMI);
+
+    // For Thumb2 and -Os, prefer scheduling CPSR setting instruction close to
+    // its uses. Instructions which are otherwise scheduled between them may
+    // incur a code size penalty (not able to use the CPSR setting 16-bit
+    // instructions).
+    if (Latency > 0 && Subtarget.isThumb2()) {
+      const MachineFunction *MF = DefMI->getParent()->getParent();
+      if (MF->getFunction()->getAttributes().
+            hasAttribute(AttributeSet::FunctionIndex,
+                         Attribute::OptimizeForSize))
+        --Latency;
+    }
+    return Latency;
+  }
+
+  if (DefMO.isImplicit() || UseMI->getOperand(UseIdx).isImplicit())
+    return -1;
+
+  unsigned DefAlign = DefMI->hasOneMemOperand()
+    ? (*DefMI->memoperands_begin())->getAlignment() : 0;
+  unsigned UseAlign = UseMI->hasOneMemOperand()
+    ? (*UseMI->memoperands_begin())->getAlignment() : 0;
+
+  // Get the itinerary's latency if possible, and handle variable_ops.
+  int Latency = getOperandLatency(ItinData, *DefMCID, DefIdx, DefAlign,
+                                  *UseMCID, UseIdx, UseAlign);
+  // Unable to find operand latency. The caller may resort to getInstrLatency.
+  if (Latency < 0)
+    return Latency;
+
+  // Adjust for IT block position.
+  int Adj = DefAdj + UseAdj;
+
+  // Adjust for dynamic def-side opcode variants not captured by the itinerary.
+  Adj += adjustDefLatency(Subtarget, DefMI, DefMCID, DefAlign);
+  if (Adj >= 0 || (int)Latency > -Adj) {
+    return Latency + Adj;
+  }
+  // Return the itinerary latency, which may be zero but not less than zero.
+  return Latency;
+}
+
+int
+ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+                                    SDNode *DefNode, unsigned DefIdx,
+                                    SDNode *UseNode, unsigned UseIdx) const {
+  if (!DefNode->isMachineOpcode())
+    return 1;
+
+  const MCInstrDesc &DefMCID = get(DefNode->getMachineOpcode());
+
+  if (isZeroCost(DefMCID.Opcode))
+    return 0;
+
+  if (!ItinData || ItinData->isEmpty())
+    return DefMCID.mayLoad() ? 3 : 1;
+
+  if (!UseNode->isMachineOpcode()) {
+    int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
+    if (Subtarget.isLikeA9() || Subtarget.isSwift())
+      return Latency <= 2 ? 1 : Latency - 1;
+    else
+      return Latency <= 3 ? 1 : Latency - 2;
+  }
+
+  const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
+  const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode);
+  unsigned DefAlign = !DefMN->memoperands_empty()
+    ? (*DefMN->memoperands_begin())->getAlignment() : 0;
+  const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode);
+  unsigned UseAlign = !UseMN->memoperands_empty()
+    ? (*UseMN->memoperands_begin())->getAlignment() : 0;
+  int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
+                                  UseMCID, UseIdx, UseAlign);
+
+  if (Latency > 1 &&
+      (Subtarget.isCortexA8() || Subtarget.isLikeA9())) {
+    // FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
+    // variants are one cycle cheaper.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          (ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl only.
+      unsigned ShAmt =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      if (ShAmt == 0 || ShAmt == 2)
+        --Latency;
+      break;
+    }
+    }
+  } else if (DefIdx == 0 && Latency > 2 && Subtarget.isSwift()) {
+    // FIXME: Properly handle all of the latency adjustments for address
+    // writeback.
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::LDRrs:
+    case ARM::LDRBrs: {
+      unsigned ShOpVal =
+        cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+      unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+      if (ShImm == 0 ||
+          ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+           ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+        Latency -= 2;
+      else if (ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+        --Latency;
+      break;
+    }
+    case ARM::t2LDRs:
+    case ARM::t2LDRBs:
+    case ARM::t2LDRHs:
+    case ARM::t2LDRSHs: {
+      // Thumb2 mode: lsl 0-3 only.
+      Latency -= 2;
+      break;
+    }
+    }
+  }
+
+  if (DefAlign < 8 && Subtarget.isLikeA9())
+    switch (DefMCID.getOpcode()) {
+    default: break;
+    case ARM::VLD1q8:
+    case ARM::VLD1q16:
+    case ARM::VLD1q32:
+    case ARM::VLD1q64:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD2d8:
+    case ARM::VLD2d16:
+    case ARM::VLD2d32:
+    case ARM::VLD2q8Pseudo:
+    case ARM::VLD2q16Pseudo:
+    case ARM::VLD2q32Pseudo:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8PseudoWB_fixed:
+    case ARM::VLD2q16PseudoWB_fixed:
+    case ARM::VLD2q32PseudoWB_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8PseudoWB_register:
+    case ARM::VLD2q16PseudoWB_register:
+    case ARM::VLD2q32PseudoWB_register:
+    case ARM::VLD3d8Pseudo:
+    case ARM::VLD3d16Pseudo:
+    case ARM::VLD3d32Pseudo:
+    case ARM::VLD1d64TPseudo:
+    case ARM::VLD3d8Pseudo_UPD:
+    case ARM::VLD3d16Pseudo_UPD:
+    case ARM::VLD3d32Pseudo_UPD:
+    case ARM::VLD3q8Pseudo_UPD:
+    case ARM::VLD3q16Pseudo_UPD:
+    case ARM::VLD3q32Pseudo_UPD:
+    case ARM::VLD3q8oddPseudo:
+    case ARM::VLD3q16oddPseudo:
+    case ARM::VLD3q32oddPseudo:
+    case ARM::VLD3q8oddPseudo_UPD:
+    case ARM::VLD3q16oddPseudo_UPD:
+    case ARM::VLD3q32oddPseudo_UPD:
+    case ARM::VLD4d8Pseudo:
+    case ARM::VLD4d16Pseudo:
+    case ARM::VLD4d32Pseudo:
+    case ARM::VLD1d64QPseudo:
+    case ARM::VLD4d8Pseudo_UPD:
+    case ARM::VLD4d16Pseudo_UPD:
+    case ARM::VLD4d32Pseudo_UPD:
+    case ARM::VLD4q8Pseudo_UPD:
+    case ARM::VLD4q16Pseudo_UPD:
+    case ARM::VLD4q32Pseudo_UPD:
+    case ARM::VLD4q8oddPseudo:
+    case ARM::VLD4q16oddPseudo:
+    case ARM::VLD4q32oddPseudo:
+    case ARM::VLD4q8oddPseudo_UPD:
+    case ARM::VLD4q16oddPseudo_UPD:
+    case ARM::VLD4q32oddPseudo_UPD:
+    case ARM::VLD1DUPq8:
+    case ARM::VLD1DUPq16:
+    case ARM::VLD1DUPq32:
+    case ARM::VLD1DUPq8wb_fixed:
+    case ARM::VLD1DUPq16wb_fixed:
+    case ARM::VLD1DUPq32wb_fixed:
+    case ARM::VLD1DUPq8wb_register:
+    case ARM::VLD1DUPq16wb_register:
+    case ARM::VLD1DUPq32wb_register:
+    case ARM::VLD2DUPd8:
+    case ARM::VLD2DUPd16:
+    case ARM::VLD2DUPd32:
+    case ARM::VLD2DUPd8wb_fixed:
+    case ARM::VLD2DUPd16wb_fixed:
+    case ARM::VLD2DUPd32wb_fixed:
+    case ARM::VLD2DUPd8wb_register:
+    case ARM::VLD2DUPd16wb_register:
+    case ARM::VLD2DUPd32wb_register:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
+    case ARM::VLD1LNq8Pseudo:
+    case ARM::VLD1LNq16Pseudo:
+    case ARM::VLD1LNq32Pseudo:
+    case ARM::VLD1LNq8Pseudo_UPD:
+    case ARM::VLD1LNq16Pseudo_UPD:
+    case ARM::VLD1LNq32Pseudo_UPD:
+    case ARM::VLD2LNd8Pseudo:
+    case ARM::VLD2LNd16Pseudo:
+    case ARM::VLD2LNd32Pseudo:
+    case ARM::VLD2LNq16Pseudo:
+    case ARM::VLD2LNq32Pseudo:
+    case ARM::VLD2LNd8Pseudo_UPD:
+    case ARM::VLD2LNd16Pseudo_UPD:
+    case ARM::VLD2LNd32Pseudo_UPD:
+    case ARM::VLD2LNq16Pseudo_UPD:
+    case ARM::VLD2LNq32Pseudo_UPD:
+    case ARM::VLD4LNd8Pseudo:
+    case ARM::VLD4LNd16Pseudo:
+    case ARM::VLD4LNd32Pseudo:
+    case ARM::VLD4LNq16Pseudo:
+    case ARM::VLD4LNq32Pseudo:
+    case ARM::VLD4LNd8Pseudo_UPD:
+    case ARM::VLD4LNd16Pseudo_UPD:
+    case ARM::VLD4LNd32Pseudo_UPD:
+    case ARM::VLD4LNq16Pseudo_UPD:
+    case ARM::VLD4LNq32Pseudo_UPD:
+      // If the address is not 64-bit aligned, the latencies of these
+      // instructions increases by one.
+      ++Latency;
+      break;
+    }
+
+  return Latency;
+}
+
+unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                           const MachineInstr *MI,
+                                           unsigned *PredCost) const {
+  if (MI->isCopyLike() || MI->isInsertSubreg() ||
+      MI->isRegSequence() || MI->isImplicitDef())
+    return 1;
+
+  // An instruction scheduler typically runs on unbundled instructions, however
+  // other passes may query the latency of a bundled instruction.
+  if (MI->isBundle()) {
+    unsigned Latency = 0;
+    MachineBasicBlock::const_instr_iterator I = MI;
+    MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+    while (++I != E && I->isInsideBundle()) {
+      if (I->getOpcode() != ARM::t2IT)
+        Latency += getInstrLatency(ItinData, I, PredCost);
+    }
+    return Latency;
+  }
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions, this apparently increases their latencies.
+    *PredCost = 1;
+  }
+  // Be sure to call getStageLatency for an empty itinerary in case it has a
+  // valid MinLatency property.
+  if (!ItinData)
+    return MI->mayLoad() ? 3 : 1;
+
+  unsigned Class = MCID.getSchedClass();
+
+  // For instructions with variable uops, use uops as latency.
+  if (!ItinData->isEmpty() && ItinData->getNumMicroOps(Class) < 0)
+    return getNumMicroOps(ItinData, MI);
+
+  // For the common case, fall back on the itinerary's latency.
+  unsigned Latency = ItinData->getStageLatency(Class);
+
+  // Adjust for dynamic def-side opcode variants not captured by the itinerary.
+  unsigned DefAlign = MI->hasOneMemOperand()
+    ? (*MI->memoperands_begin())->getAlignment() : 0;
+  int Adj = adjustDefLatency(Subtarget, MI, &MCID, DefAlign);
+  if (Adj >= 0 || (int)Latency > -Adj) {
+    return Latency + Adj;
+  }
+  return Latency;
+}
+
+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                      SDNode *Node) const {
+  if (!Node->isMachineOpcode())
+    return 1;
+
+  if (!ItinData || ItinData->isEmpty())
+    return 1;
+
+  unsigned Opcode = Node->getMachineOpcode();
+  switch (Opcode) {
+  default:
+    return ItinData->getStageLatency(get(Opcode).getSchedClass());
+  case ARM::VLDMQIA:
+  case ARM::VSTMQIA:
+    return 2;
+  }
+}
+
+bool ARMBaseInstrInfo::
+hasHighOperandLatency(const InstrItineraryData *ItinData,
+                      const MachineRegisterInfo *MRI,
+                      const MachineInstr *DefMI, unsigned DefIdx,
+                      const MachineInstr *UseMI, unsigned UseIdx) const {
+  unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
+  unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask;
+  if (Subtarget.isCortexA8() &&
+      (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP))
+    // CortexA8 VFP instructions are not pipelined.
+    return true;
+
+  // Hoist VFP / NEON instructions with 4 or higher latency.
+  int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx,
+                                      /*FindMin=*/false);
+  if (Latency < 0)
+    Latency = getInstrLatency(ItinData, DefMI);
+  if (Latency <= 3)
+    return false;
+  return DDomain == ARMII::DomainVFP || DDomain == ARMII::DomainNEON ||
+         UDomain == ARMII::DomainVFP || UDomain == ARMII::DomainNEON;
+}
+
+bool ARMBaseInstrInfo::
+hasLowDefLatency(const InstrItineraryData *ItinData,
+                 const MachineInstr *DefMI, unsigned DefIdx) const {
+  if (!ItinData || ItinData->isEmpty())
+    return false;
+
+  unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
+  if (DDomain == ARMII::DomainGeneral) {
+    unsigned DefClass = DefMI->getDesc().getSchedClass();
+    int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+    return (DefCycle != -1 && DefCycle <= 2);
+  }
+  return false;
+}
+
+bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI,
+                                         StringRef &ErrInfo) const {
+  if (convertAddSubFlagsOpcode(MI->getOpcode())) {
+    ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG";
+    return false;
+  }
+  return true;
+}
+
+bool
+ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                                     unsigned &AddSubOpc,
+                                     bool &NegAcc, bool &HasLane) const {
+  DenseMap<unsigned, unsigned>::const_iterator I = MLxEntryMap.find(Opcode);
+  if (I == MLxEntryMap.end())
+    return false;
+
+  const ARM_MLxEntry &Entry = ARM_MLxTable[I->second];
+  MulOpc = Entry.MulOpc;
+  AddSubOpc = Entry.AddSubOpc;
+  NegAcc = Entry.NegAcc;
+  HasLane = Entry.HasLane;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Execution domains.
+//===----------------------------------------------------------------------===//
+//
+// Some instructions go down the NEON pipeline, some go down the VFP pipeline,
+// and some can go down both.  The vmov instructions go down the VFP pipeline,
+// but they can be changed to vorr equivalents that are executed by the NEON
+// pipeline.
+//
+// We use the following execution domain numbering:
+//
+enum ARMExeDomain {
+  ExeGeneric = 0,
+  ExeVFP = 1,
+  ExeNEON = 2
+};
+//
+// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
+//
+std::pair<uint16_t, uint16_t>
+ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+  // VMOVD, VMOVRS and VMOVSR are VFP instructions, but can be changed to NEON
+  // if they are not predicated.
+  if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
+    return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
+
+  // CortexA9 is particularly picky about mixing the two and wants these
+  // converted.
+  if (Subtarget.isCortexA9() && !isPredicated(MI) &&
+      (MI->getOpcode() == ARM::VMOVRS ||
+       MI->getOpcode() == ARM::VMOVSR ||
+       MI->getOpcode() == ARM::VMOVS))
+    return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
+
+  // No other instructions can be swizzled, so just determine their domain.
+  unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
+
+  if (Domain & ARMII::DomainNEON)
+    return std::make_pair(ExeNEON, 0);
+
+  // Certain instructions can go either way on Cortex-A8.
+  // Treat them as NEON instructions.
+  if ((Domain & ARMII::DomainNEONA8) && Subtarget.isCortexA8())
+    return std::make_pair(ExeNEON, 0);
+
+  if (Domain & ARMII::DomainVFP)
+    return std::make_pair(ExeVFP, 0);
+
+  return std::make_pair(ExeGeneric, 0);
+}
+
+static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
+                                            unsigned SReg, unsigned &Lane) {
+  unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass);
+  Lane = 0;
+
+  if (DReg != ARM::NoRegister)
+   return DReg;
+
+  Lane = 1;
+  DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass);
+
+  assert(DReg && "S-register with no D super-register?");
+  return DReg;
+}
+
+/// getImplicitSPRUseForDPRUse - Given a use of a DPR register and lane,
+/// set ImplicitSReg to a register number that must be marked as implicit-use or
+/// zero if no register needs to be defined as implicit-use.
+///
+/// If the function cannot determine if an SPR should be marked implicit use or
+/// not, it returns false.
+///
+/// This function handles cases where an instruction is being modified from taking
+/// an SPR to a DPR[Lane]. A use of the DPR is being added, which may conflict
+/// with an earlier def of an SPR corresponding to DPR[Lane^1] (i.e. the other
+/// lane of the DPR).
+///
+/// If the other SPR is defined, an implicit-use of it should be added. Else,
+/// (including the case where the DPR itself is defined), it should not.
+///
+static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
+                                       MachineInstr *MI,
+                                       unsigned DReg, unsigned Lane,
+                                       unsigned &ImplicitSReg) {
+  // If the DPR is defined or used already, the other SPR lane will be chained
+  // correctly, so there is nothing to be done.
+  if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) {
+    ImplicitSReg = 0;
+    return true;
+  }
+
+  // Otherwise we need to go searching to see if the SPR is set explicitly.
+  ImplicitSReg = TRI->getSubReg(DReg,
+                                (Lane & 1) ? ARM::ssub_0 : ARM::ssub_1);
+  MachineBasicBlock::LivenessQueryResult LQR =
+    MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
+
+  if (LQR == MachineBasicBlock::LQR_Live)
+    return true;
+  else if (LQR == MachineBasicBlock::LQR_Unknown)
+    return false;
+
+  // If the register is known not to be live, there is no need to add an
+  // implicit-use.
+  ImplicitSReg = 0;
+  return true;
+}
+
+void
+ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+  unsigned DstReg, SrcReg, DReg;
+  unsigned Lane;
+  MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  switch (MI->getOpcode()) {
+    default:
+      llvm_unreachable("cannot handle opcode!");
+      break;
+    case ARM::VMOVD:
+      if (Domain != ExeNEON)
+        break;
+
+      // Zap the predicate operands.
+      assert(!isPredicated(MI) && "Cannot predicate a VORRd");
+
+      // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
+      MI->setDesc(get(ARM::VORRd));
+      AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+                        .addReg(SrcReg)
+                        .addReg(SrcReg));
+      break;
+    case ARM::VMOVRS:
+      if (Domain != ExeNEON)
+        break;
+      assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
+
+      // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
+
+      // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
+      // Note that DSrc has been widened and the other lane may be undef, which
+      // contaminates the entire register.
+      MI->setDesc(get(ARM::VGETLNi32));
+      AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+                        .addReg(DReg, RegState::Undef)
+                        .addImm(Lane));
+
+      // The old source should be an implicit use, otherwise we might think it
+      // was dead before here.
+      MIB.addReg(SrcReg, RegState::Implicit);
+      break;
+    case ARM::VMOVSR: {
+      if (Domain != ExeNEON)
+        break;
+      assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
+
+      // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
+
+      unsigned ImplicitSReg;
+      if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
+        break;
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
+      // Again DDst may be undefined at the beginning of this instruction.
+      MI->setDesc(get(ARM::VSETLNi32));
+      MIB.addReg(DReg, RegState::Define)
+         .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI)))
+         .addReg(SrcReg)
+         .addImm(Lane);
+      AddDefaultPred(MIB);
+
+      // The narrower destination must be marked as set to keep previous chains
+      // in place.
+      MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+      if (ImplicitSReg != 0)
+        MIB.addReg(ImplicitSReg, RegState::Implicit);
+      break;
+    }
+    case ARM::VMOVS: {
+      if (Domain != ExeNEON)
+        break;
+
+      // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
+      DstReg = MI->getOperand(0).getReg();
+      SrcReg = MI->getOperand(1).getReg();
+
+      unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
+      DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
+      DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
+
+      unsigned ImplicitSReg;
+      if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg))
+        break;
+
+      for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+        MI->RemoveOperand(i-1);
+
+      if (DSrc == DDst) {
+        // Destination can be:
+        //     %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
+        MI->setDesc(get(ARM::VDUPLN32d));
+        MIB.addReg(DDst, RegState::Define)
+           .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI)))
+           .addImm(SrcLane);
+        AddDefaultPred(MIB);
+
+        // Neither the source or the destination are naturally represented any
+        // more, so add them in manually.
+        MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
+        MIB.addReg(SrcReg, RegState::Implicit);
+        if (ImplicitSReg != 0)
+          MIB.addReg(ImplicitSReg, RegState::Implicit);
+        break;
+      }
+
+      // In general there's no single instruction that can perform an S <-> S
+      // move in NEON space, but a pair of VEXT instructions *can* do the
+      // job. It turns out that the VEXTs needed will only use DSrc once, with
+      // the position based purely on the combination of lane-0 and lane-1
+      // involved. For example
+      //     vmov s0, s2 -> vext.32 d0, d0, d1, #1  vext.32 d0, d0, d0, #1
+      //     vmov s1, s3 -> vext.32 d0, d1, d0, #1  vext.32 d0, d0, d0, #1
+      //     vmov s0, s3 -> vext.32 d0, d0, d0, #1  vext.32 d0, d1, d0, #1
+      //     vmov s1, s2 -> vext.32 d0, d0, d0, #1  vext.32 d0, d0, d1, #1
+      //
+      // Pattern of the MachineInstrs is:
+      //     %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits)
+      MachineInstrBuilder NewMIB;
+      NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                       get(ARM::VEXTd32), DDst);
+
+      // On the first instruction, both DSrc and DDst may be <undef> if present.
+      // Specifically when the original instruction didn't have them as an
+      // <imp-use>.
+      unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst;
+      bool CurUndef = !MI->readsRegister(CurReg, TRI);
+      NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
+      CurUndef = !MI->readsRegister(CurReg, TRI);
+      NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      NewMIB.addImm(1);
+      AddDefaultPred(NewMIB);
+
+      if (SrcLane == DstLane)
+        NewMIB.addReg(SrcReg, RegState::Implicit);
+
+      MI->setDesc(get(ARM::VEXTd32));
+      MIB.addReg(DDst, RegState::Define);
+
+      // On the second instruction, DDst has definitely been defined above, so
+      // it is not <undef>. DSrc, if present, can be <undef> as above.
+      CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst;
+      CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+      MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
+      CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+      MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+      MIB.addImm(1);
+      AddDefaultPred(MIB);
+
+      if (SrcLane != DstLane)
+        MIB.addReg(SrcReg, RegState::Implicit);
+
+      // As before, the original destination is no longer represented, add it
+      // implicitly.
+      MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+      if (ImplicitSReg != 0)
+        MIB.addReg(ImplicitSReg, RegState::Implicit);
+      break;
+    }
+  }
+
+}
+
+//===----------------------------------------------------------------------===//
+// Partial register updates
+//===----------------------------------------------------------------------===//
+//
+// Swift renames NEON registers with 64-bit granularity.  That means any
+// instruction writing an S-reg implicitly reads the containing D-reg.  The
+// problem is mostly avoided by translating f32 operations to v2f32 operations
+// on D-registers, but f32 loads are still a problem.
+//
+// These instructions can load an f32 into a NEON register:
+//
+// VLDRS - Only writes S, partial D update.
+// VLD1LNd32 - Writes all D-regs, explicit partial D update, 2 uops.
+// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
+//
+// FCONSTD can be used as a dependency-breaking instruction.
+unsigned ARMBaseInstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI,
+                             unsigned OpNum,
+                             const TargetRegisterInfo *TRI) const {
+  if (!SwiftPartialUpdateClearance ||
+      !(Subtarget.isSwift() || Subtarget.isCortexA15()))
+    return 0;
+
+  assert(TRI && "Need TRI instance");
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  if (MO.readsReg())
+    return 0;
+  unsigned Reg = MO.getReg();
+  int UseOp = -1;
+
+  switch(MI->getOpcode()) {
+    // Normal instructions writing only an S-register.
+  case ARM::VLDRS:
+  case ARM::FCONSTS:
+  case ARM::VMOVSR:
+  case ARM::VMOVv8i8:
+  case ARM::VMOVv4i16:
+  case ARM::VMOVv2i32:
+  case ARM::VMOVv2f32:
+  case ARM::VMOVv1i64:
+    UseOp = MI->findRegisterUseOperandIdx(Reg, false, TRI);
+    break;
+
+    // Explicitly reads the dependency.
+  case ARM::VLD1LNd32:
+    UseOp = 3;
+    break;
+  default:
+    return 0;
+  }
+
+  // If this instruction actually reads a value from Reg, there is no unwanted
+  // dependency.
+  if (UseOp != -1 && MI->getOperand(UseOp).readsReg())
+    return 0;
+
+  // We must be able to clobber the whole D-reg.
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // Virtual register must be a foo:ssub_0<def,undef> operand.
+    if (!MO.getSubReg() || MI->readsVirtualRegister(Reg))
+      return 0;
+  } else if (ARM::SPRRegClass.contains(Reg)) {
+    // Physical register: MI must define the full D-reg.
+    unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0,
+                                             &ARM::DPRRegClass);
+    if (!DReg || !MI->definesRegister(DReg, TRI))
+      return 0;
+  }
+
+  // MI has an unwanted D-register dependency.
+  // Avoid defs in the previous N instructrions.
+  return SwiftPartialUpdateClearance;
+}
+
+// Break a partial register dependency after getPartialRegUpdateClearance
+// returned non-zero.
+void ARMBaseInstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI,
+                          unsigned OpNum,
+                          const TargetRegisterInfo *TRI) const {
+  assert(MI && OpNum < MI->getDesc().getNumDefs() && "OpNum is not a def");
+  assert(TRI && "Need TRI instance");
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  unsigned Reg = MO.getReg();
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         "Can't break virtual register dependencies.");
+  unsigned DReg = Reg;
+
+  // If MI defines an S-reg, find the corresponding D super-register.
+  if (ARM::SPRRegClass.contains(Reg)) {
+    DReg = ARM::D0 + (Reg - ARM::S0) / 2;
+    assert(TRI->isSuperRegister(Reg, DReg) && "Register enums broken");
+  }
+
+  assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps");
+  assert(MI->definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
+
+  // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
+  // the full D-register by loading the same value to both lanes.  The
+  // instruction is micro-coded with 2 uops, so don't do this until we can
+  // properly schedule micro-coded instuctions.  The dispatcher stalls cause
+  // too big regressions.
+
+  // Insert the dependency-breaking FCONSTD before MI.
+  // 96 is the encoding of 0.5, but the actual value doesn't matter here.
+  AddDefaultPred(BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                         get(ARM::FCONSTD), DReg).addImm(96));
+  MI->addRegisterKilled(DReg, TRI, true);
+}
+
+bool ARMBaseInstrInfo::hasNOP() const {
+  return (Subtarget.getFeatureBits() & ARM::HasV6T2Ops) != 0;
+}
+
+bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const {
+  unsigned ShOpVal = MI->getOperand(3).getImm();
+  unsigned ShImm = ARM_AM::getSORegOffset(ShOpVal);
+  // Swift supports faster shifts for: lsl 2, lsl 1, and lsr 1.
+  if ((ShImm == 1 && ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsr) ||
+      ((ShImm == 1 || ShImm == 2) &&
+       ARM_AM::getSORegShOp(ShOpVal) == ARM_AM::lsl))
+    return true;
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
new file mode 100644
index 000000000000..7c107bb41951
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -0,0 +1,419 @@
+//===-- ARMBaseInstrInfo.h - ARM Base Instruction Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEINSTRUCTIONINFO_H
+#define ARMBASEINSTRUCTIONINFO_H
+
+#include "ARM.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "ARMGenInstrInfo.inc"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseRegisterInfo;
+
+class ARMBaseInstrInfo : public ARMGenInstrInfo {
+  const ARMSubtarget &Subtarget;
+
+protected:
+  // Can be only subclassed.
+  explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
+
+public:
+  // Return whether the target has an explicit NOP encoding.
+  bool hasNOP() const;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
+
+  virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                              MachineBasicBlock::iterator &MBBI,
+                                              LiveVariables *LV) const;
+
+  virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
+  const ARMSubtarget &getSubtarget() const { return Subtarget; }
+
+  ScheduleHazardRecognizer *
+  CreateTargetHazardRecognizer(const TargetMachine *TM,
+                               const ScheduleDAG *DAG) const;
+
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const;
+
+  // Branch analysis.
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify = false) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+
+  virtual
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  // Predication support.
+  bool isPredicated(const MachineInstr *MI) const;
+
+  ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
+    int PIdx = MI->findFirstPredOperandIdx();
+    return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
+                      : ARMCC::AL;
+  }
+
+  virtual
+  bool PredicateInstruction(MachineInstr *MI,
+                            const SmallVectorImpl<MachineOperand> &Pred) const;
+
+  virtual
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+
+  virtual bool DefinesPredicate(MachineInstr *MI,
+                                std::vector<MachineOperand> &Pred) const;
+
+  virtual bool isPredicable(MachineInstr *MI) const;
+
+  /// GetInstSize - Returns the size of the specified MachineInstr.
+  ///
+  virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
+
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+  virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                             int &FrameIndex) const;
+  virtual unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                            int &FrameIndex) const;
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+                                                 int FrameIx,
+                                                 uint64_t Offset,
+                                                 const MDNode *MDPtr,
+                                                 DebugLoc DL) const;
+
+  virtual void reMaterialize(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MI,
+                             unsigned DestReg, unsigned SubIdx,
+                             const MachineInstr *Orig,
+                             const TargetRegisterInfo &TRI) const;
+
+  MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const;
+
+  MachineInstr *commuteInstruction(MachineInstr*, bool=false) const;
+
+  virtual bool produceSameValue(const MachineInstr *MI0,
+                                const MachineInstr *MI1,
+                                const MachineRegisterInfo *MRI) const;
+
+  /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler to
+  /// determine if two loads are loading from the same base address. It should
+  /// only return true if the base pointers are the same and the only
+  /// differences between the two addresses is the offset. It also returns the
+  /// offsets by reference.
+  virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                       int64_t &Offset1, int64_t &Offset2)const;
+
+  /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+  /// should be scheduled togther. On some targets if two loads are loading from
+  /// addresses in the same cache line, it's better if they are scheduled
+  /// together. This function takes two integers that represent the load offsets
+  /// from the common base address. It returns true if it decides it's desirable
+  /// to schedule the two loads together. "NumLoads" is the number of loads that
+  /// have already been scheduled after Load1.
+  virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                       int64_t Offset1, int64_t Offset2,
+                                       unsigned NumLoads) const;
+
+  virtual bool isSchedulingBoundary(const MachineInstr *MI,
+                                    const MachineBasicBlock *MBB,
+                                    const MachineFunction &MF) const;
+
+  virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB,
+                                   unsigned NumCycles, unsigned ExtraPredCycles,
+                                   const BranchProbability &Probability) const;
+
+  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                   unsigned NumT, unsigned ExtraT,
+                                   MachineBasicBlock &FMBB,
+                                   unsigned NumF, unsigned ExtraF,
+                                   const BranchProbability &Probability) const;
+
+  virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+                                         unsigned NumCycles,
+                                         const BranchProbability
+                                         &Probability) const {
+    return NumCycles == 1;
+  }
+
+  virtual bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                         MachineBasicBlock &FMBB) const;
+
+  /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2 if having two register operands, and the value it
+  /// compares against in CmpValue. Return true if the comparison instruction
+  /// can be analyzed.
+  virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                              unsigned &SrcReg2, int &CmpMask,
+                              int &CmpValue) const;
+
+  /// optimizeCompareInstr - Convert the instruction to set the zero flag so
+  /// that we can remove a "comparison with zero"; Remove a redundant CMP
+  /// instruction if the flags can be updated in the same way by an earlier
+  /// instruction such as SUB.
+  virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                                    unsigned SrcReg2, int CmpMask, int CmpValue,
+                                    const MachineRegisterInfo *MRI) const;
+
+  virtual bool analyzeSelect(const MachineInstr *MI,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             unsigned &TrueOp, unsigned &FalseOp,
+                             bool &Optimizable) const;
+
+  virtual MachineInstr *optimizeSelect(MachineInstr *MI, bool) const;
+
+  /// FoldImmediate - 'Reg' is known to be defined by a move immediate
+  /// instruction, try to fold the immediate into the use instruction.
+  virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+                             unsigned Reg, MachineRegisterInfo *MRI) const;
+
+  virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+                                  const MachineInstr *MI) const;
+
+  virtual
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI, unsigned DefIdx,
+                        const MachineInstr *UseMI, unsigned UseIdx) const;
+  virtual
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        SDNode *DefNode, unsigned DefIdx,
+                        SDNode *UseNode, unsigned UseIdx) const;
+
+  /// VFP/NEON execution domains.
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomain(const MachineInstr *MI) const;
+  void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+
+  unsigned getPartialRegUpdateClearance(const MachineInstr*, unsigned,
+                                        const TargetRegisterInfo*) const;
+  void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
+                                 const TargetRegisterInfo *TRI) const;
+  /// Get the number of addresses by LDM or VLDM or zero for unknown.
+  unsigned getNumLDMAddresses(const MachineInstr *MI) const;
+
+private:
+  unsigned getInstBundleLength(const MachineInstr *MI) const;
+
+  int getVLDMDefCycle(const InstrItineraryData *ItinData,
+                      const MCInstrDesc &DefMCID,
+                      unsigned DefClass,
+                      unsigned DefIdx, unsigned DefAlign) const;
+  int getLDMDefCycle(const InstrItineraryData *ItinData,
+                     const MCInstrDesc &DefMCID,
+                     unsigned DefClass,
+                     unsigned DefIdx, unsigned DefAlign) const;
+  int getVSTMUseCycle(const InstrItineraryData *ItinData,
+                      const MCInstrDesc &UseMCID,
+                      unsigned UseClass,
+                      unsigned UseIdx, unsigned UseAlign) const;
+  int getSTMUseCycle(const InstrItineraryData *ItinData,
+                     const MCInstrDesc &UseMCID,
+                     unsigned UseClass,
+                     unsigned UseIdx, unsigned UseAlign) const;
+  int getOperandLatency(const InstrItineraryData *ItinData,
+                        const MCInstrDesc &DefMCID,
+                        unsigned DefIdx, unsigned DefAlign,
+                        const MCInstrDesc &UseMCID,
+                        unsigned UseIdx, unsigned UseAlign) const;
+
+  unsigned getInstrLatency(const InstrItineraryData *ItinData,
+                           const MachineInstr *MI,
+                           unsigned *PredCost = 0) const;
+
+  int getInstrLatency(const InstrItineraryData *ItinData,
+                      SDNode *Node) const;
+
+  bool hasHighOperandLatency(const InstrItineraryData *ItinData,
+                             const MachineRegisterInfo *MRI,
+                             const MachineInstr *DefMI, unsigned DefIdx,
+                             const MachineInstr *UseMI, unsigned UseIdx) const;
+  bool hasLowDefLatency(const InstrItineraryData *ItinData,
+                        const MachineInstr *DefMI, unsigned DefIdx) const;
+
+  /// verifyInstruction - Perform target specific instruction verification.
+  bool verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const;
+
+private:
+  /// Modeling special VFP / NEON fp MLA / MLS hazards.
+
+  /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal
+  /// MLx table.
+  DenseMap<unsigned, unsigned> MLxEntryMap;
+
+  /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause
+  /// stalls when scheduled together with fp MLA / MLS opcodes.
+  SmallSet<unsigned, 16> MLxHazardOpcodes;
+
+public:
+  /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS
+  /// instruction.
+  bool isFpMLxInstruction(unsigned Opcode) const {
+    return MLxEntryMap.count(Opcode);
+  }
+
+  /// isFpMLxInstruction - This version also returns the multiply opcode and the
+  /// addition / subtraction opcode to expand to. Return true for 'HasLane' for
+  /// the MLX instructions with an extra lane operand.
+  bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
+                          unsigned &AddSubOpc, bool &NegAcc,
+                          bool &HasLane) const;
+
+  /// canCauseFpMLxStall - Return true if an instruction of the specified opcode
+  /// will cause stalls when scheduled after (within 4-cycle window) a fp
+  /// MLA / MLS instruction.
+  bool canCauseFpMLxStall(unsigned Opcode) const {
+    return MLxHazardOpcodes.count(Opcode);
+  }
+
+  /// Returns true if the instruction has a shift by immediate that can be
+  /// executed in one cycle less.
+  bool isSwiftFastImmShift(const MachineInstr *MI) const;
+};
+
+static inline
+const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
+  return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+static inline
+const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
+                                          bool isDead = false) {
+  return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
+}
+
+static inline
+const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+static inline
+bool isUncondBranchOpcode(int Opc) {
+  return Opc == ARM::B || Opc == ARM::tB || Opc == ARM::t2B;
+}
+
+static inline
+bool isCondBranchOpcode(int Opc) {
+  return Opc == ARM::Bcc || Opc == ARM::tBcc || Opc == ARM::t2Bcc;
+}
+
+static inline
+bool isJumpTableBranchOpcode(int Opc) {
+  return Opc == ARM::BR_JTr || Opc == ARM::BR_JTm || Opc == ARM::BR_JTadd ||
+    Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT;
+}
+
+static inline
+bool isIndirectBranchOpcode(int Opc) {
+  return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+int getMatchingCondBranchOpcode(int Opc);
+
+/// Determine if MI can be folded into an ARM MOVCC instruction, and return the
+/// opcode of the SSA instruction representing the conditional MI.
+unsigned canFoldARMInstrIntoMOVCC(unsigned Reg,
+                                  MachineInstr *&MI,
+                                  const MachineRegisterInfo &MRI);
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether
+/// the instruction is encoded with an 'S' bit is determined by the optional
+/// CPSR def operand.
+unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
+
+/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
+/// instructions to materializea destreg = basereg + immediate in ARM / Thumb2
+/// code.
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                             unsigned DestReg, unsigned BaseReg, int NumBytes,
+                             ARMCC::CondCodes Pred, unsigned PredReg,
+                             const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+
+void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                            unsigned DestReg, unsigned BaseReg, int NumBytes,
+                            ARMCC::CondCodes Pred, unsigned PredReg,
+                            const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
+void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                               unsigned DestReg, unsigned BaseReg,
+                               int NumBytes, const TargetInstrInfo &TII,
+                               const ARMBaseRegisterInfo& MRI,
+                               unsigned MIFlags = 0);
+
+
+/// rewriteARMFrameIndex / rewriteT2FrameIndex -
+/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
+/// offset could not be handled directly in MI, and return the left-over
+/// portion by reference.
+bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                          unsigned FrameReg, int &Offset,
+                          const ARMBaseInstrInfo &TII);
+
+bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                         unsigned FrameReg, int &Offset,
+                         const ARMBaseInstrInfo &TII);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
new file mode 100644
index 000000000000..b6b27f849a23
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -0,0 +1,740 @@
+//===-- ARMBaseRegisterInfo.cpp - ARM Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseRegisterInfo.h"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMFrameLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "ARMGenRegisterInfo.inc"
+
+using namespace llvm;
+
+ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
+                                         const ARMSubtarget &sti)
+  : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), TII(tii), STI(sti),
+    FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11),
+    BasePtr(ARM::R6) {
+}
+
+const uint16_t*
+ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  bool ghcCall = false;
+ 
+  if (MF) {
+    const Function *F = MF->getFunction();
+    ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
+  }
+ 
+  if (ghcCall) {
+      return CSR_GHC_SaveList;
+  }
+  else {
+  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+    ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
+  }
+}
+
+const uint32_t*
+ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
+  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+    ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
+}
+
+const uint32_t*
+ARMBaseRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+BitVector ARMBaseRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+  Reserved.set(ARM::SP);
+  Reserved.set(ARM::PC);
+  Reserved.set(ARM::FPSCR);
+  if (TFI->hasFP(MF))
+    Reserved.set(FramePtr);
+  if (hasBasePointer(MF))
+    Reserved.set(BasePtr);
+  // Some targets reserve R9.
+  if (STI.isR9Reserved())
+    Reserved.set(ARM::R9);
+  // Reserve D16-D31 if the subtarget doesn't support them.
+  if (!STI.hasVFP3() || STI.hasD16()) {
+    assert(ARM::D31 == ARM::D16 + 15);
+    for (unsigned i = 0; i != 16; ++i)
+      Reserved.set(ARM::D16 + i);
+  }
+  const TargetRegisterClass *RC  = &ARM::GPRPairRegClass;
+  for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I)
+    for (MCSubRegIterator SI(*I, this); SI.isValid(); ++SI)
+      if (Reserved.test(*SI)) Reserved.set(*I);
+
+  return Reserved;
+}
+
+const TargetRegisterClass*
+ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
+                                                                         const {
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+  do {
+    switch (Super->getID()) {
+    case ARM::GPRRegClassID:
+    case ARM::SPRRegClassID:
+    case ARM::DPRRegClassID:
+    case ARM::QPRRegClassID:
+    case ARM::QQPRRegClassID:
+    case ARM::QQQQPRRegClassID:
+    case ARM::GPRPairRegClassID:
+      return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
+
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                         const {
+  return &ARM::GPRRegClass;
+}
+
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &ARM::CCRRegClass)
+    return 0;  // Can't copy CCR registers.
+  return RC;
+}
+
+unsigned
+ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                         MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case ARM::tGPRRegClassID:
+    return TFI->hasFP(MF) ? 4 : 5;
+  case ARM::GPRRegClassID: {
+    unsigned FP = TFI->hasFP(MF) ? 1 : 0;
+    return 10 - FP - (STI.isR9Reserved() ? 1 : 0);
+  }
+  case ARM::SPRRegClassID:  // Currently not used as 'rep' register class.
+  case ARM::DPRRegClassID:
+    return 32 - 10;
+  }
+}
+
+// Get the other register in a GPRPair.
+static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) {
+  for (MCSuperRegIterator Supers(Reg, RI); Supers.isValid(); ++Supers)
+    if (ARM::GPRPairRegClass.contains(*Supers))
+      return RI->getSubReg(*Supers, Odd ? ARM::gsub_1 : ARM::gsub_0);
+  return 0;
+}
+
+// Resolve the RegPairEven / RegPairOdd register allocator hints.
+void
+ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
+                                           ArrayRef<MCPhysReg> Order,
+                                           SmallVectorImpl<MCPhysReg> &Hints,
+                                           const MachineFunction &MF,
+                                           const VirtRegMap *VRM) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
+
+  unsigned Odd;
+  switch (Hint.first) {
+  case ARMRI::RegPairEven:
+    Odd = 0;
+    break;
+  case ARMRI::RegPairOdd:
+    Odd = 1;
+    break;
+  default:
+    TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF, VRM);
+    return;
+  }
+
+  // This register should preferably be even (Odd == 0) or odd (Odd == 1).
+  // Check if the other part of the pair has already been assigned, and provide
+  // the paired register as the first hint.
+  unsigned PairedPhys = 0;
+  if (VRM && VRM->hasPhys(Hint.second)) {
+    PairedPhys = getPairedGPR(VRM->getPhys(Hint.second), Odd, this);
+    if (PairedPhys && MRI.isReserved(PairedPhys))
+      PairedPhys = 0;
+  }
+
+  // First prefer the paired physreg.
+  if (PairedPhys &&
+      std::find(Order.begin(), Order.end(), PairedPhys) != Order.end())
+    Hints.push_back(PairedPhys);
+
+  // Then prefer even or odd registers.
+  for (unsigned I = 0, E = Order.size(); I != E; ++I) {
+    unsigned Reg = Order[I];
+    if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd)
+      continue;
+    // Don't provide hints that are paired to a reserved register.
+    unsigned Paired = getPairedGPR(Reg, !Odd, this);
+    if (!Paired || MRI.isReserved(Paired))
+      continue;
+    Hints.push_back(Reg);
+  }
+}
+
+void
+ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+                                        MachineFunction &MF) const {
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
+  if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
+       Hint.first == (unsigned)ARMRI::RegPairEven) &&
+      TargetRegisterInfo::isVirtualRegister(Hint.second)) {
+    // If 'Reg' is one of the even / odd register pair and it's now changed
+    // (e.g. coalesced) into a different register. The other register of the
+    // pair allocation hint must be updated to reflect the relationship
+    // change.
+    unsigned OtherReg = Hint.second;
+    Hint = MRI->getRegAllocationHint(OtherReg);
+    if (Hint.second == Reg)
+      // Make sure the pair has not already divorced.
+      MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
+  }
+}
+
+bool
+ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const {
+  // CortexA9 has a Write-after-write hazard for NEON registers.
+  if (!STI.isLikeA9())
+    return false;
+
+  switch (RC->getID()) {
+  case ARM::DPRRegClassID:
+  case ARM::DPR_8RegClassID:
+  case ARM::DPR_VFP2RegClassID:
+  case ARM::QPRRegClassID:
+  case ARM::QPR_8RegClassID:
+  case ARM::QPR_VFP2RegClassID:
+  case ARM::SPRRegClassID:
+  case ARM::SPR_8RegClassID:
+    // Avoid reusing S, D, and Q registers.
+    // Don't increase register pressure for QQ and QQQQ.
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // When outgoing call frames are so large that we adjust the stack pointer
+  // around the call, we can no longer use the stack pointer to reach the
+  // emergency spill slot.
+  if (needsStackRealignment(MF) && !TFI->hasReservedCallFrame(MF))
+    return true;
+
+  // Thumb has trouble with negative offsets from the FP. Thumb2 has a limited
+  // negative range for ldr/str (255), and thumb1 is positive offsets only.
+  // It's going to be better to use the SP or Base Pointer instead. When there
+  // are variable sized objects, we can't reference off of the SP, so we
+  // reserve a Base Pointer.
+  if (AFI->isThumbFunction() && MFI->hasVarSizedObjects()) {
+    // Conservatively estimate whether the negative offset from the frame
+    // pointer will be sufficient to reach. If a function has a smallish
+    // frame, it's less likely to have lots of spills and callee saved
+    // space, so it's all more likely to be within range of the frame pointer.
+    // If it's wrong, the scavenger will still enable access to work, it just
+    // won't be optimal.
+    if (AFI->isThumb2Function() && MFI->getLocalFrameSize() < 128)
+      return false;
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  // We can't realign the stack if:
+  // 1. Dynamic stack realignment is explicitly disabled,
+  // 2. This is a Thumb1 function (it's not useful, so we don't bother), or
+  // 3. There are VLAs in the function and the base pointer is disabled.
+  if (!MF.getTarget().Options.RealignStack)
+    return false;
+  if (AFI->isThumb1OnlyFunction())
+    return false;
+  // Stack realignment requires a frame pointer.  If we already started
+  // register allocation with frame pointer elimination, it is too late now.
+  if (!MRI->canReserveReg(FramePtr))
+    return false;
+  // We may also need a base pointer if there are dynamic allocas or stack
+  // pointer adjustments around calls.
+  if (MF.getTarget().getFrameLowering()->hasReservedCallFrame(MF))
+    return true;
+  // A base pointer is required and allowed.  Check that it isn't too late to
+  // reserve it.
+  return MRI->canReserveReg(BasePtr);
+}
+
+bool ARMBaseRegisterInfo::
+needsStackRealignment(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *F = MF.getFunction();
+  unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  bool requiresRealignment =
+    ((MFI->getMaxAlignment() > StackAlign) ||
+     F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackAlignment));
+
+  return requiresRealignment && canRealignStack(MF);
+}
+
+bool ARMBaseRegisterInfo::
+cannotEliminateFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack())
+    return true;
+  return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken()
+    || needsStackRealignment(MF);
+}
+
+unsigned
+ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (TFI->hasFP(MF))
+    return FramePtr;
+  return ARM::SP;
+}
+
+unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+}
+
+unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void ARMBaseRegisterInfo::
+emitLoadConstPool(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator &MBBI,
+                  DebugLoc dl,
+                  unsigned DestReg, unsigned SubIdx, int Val,
+                  ARMCC::CondCodes Pred,
+                  unsigned PredReg, unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  const Constant *C =
+        ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx)
+    .addImm(0).addImm(Pred).addReg(PredReg)
+    .setMIFlags(MIFlags);
+}
+
+bool ARMBaseRegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool ARMBaseRegisterInfo::
+trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
+bool ARMBaseRegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool ARMBaseRegisterInfo::
+requiresVirtualBaseRegisters(const MachineFunction &MF) const {
+  return true;
+}
+
+int64_t ARMBaseRegisterInfo::
+getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  int64_t InstrOffs = 0;
+  int Scale = 1;
+  unsigned ImmIdx = 0;
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i8:
+  case ARMII::AddrModeT2_i12:
+  case ARMII::AddrMode_i12:
+    InstrOffs = MI->getOperand(Idx+1).getImm();
+    Scale = 1;
+    break;
+  case ARMII::AddrMode5: {
+    // VFP address mode.
+    const MachineOperand &OffOp = MI->getOperand(Idx+1);
+    InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+    if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    Scale = 4;
+    break;
+  }
+  case ARMII::AddrMode2: {
+    ImmIdx = Idx+2;
+    InstrOffs = ARM_AM::getAM2Offset(MI->getOperand(ImmIdx).getImm());
+    if (ARM_AM::getAM2Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    break;
+  }
+  case ARMII::AddrMode3: {
+    ImmIdx = Idx+2;
+    InstrOffs = ARM_AM::getAM3Offset(MI->getOperand(ImmIdx).getImm());
+    if (ARM_AM::getAM3Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+      InstrOffs = -InstrOffs;
+    break;
+  }
+  case ARMII::AddrModeT1_s: {
+    ImmIdx = Idx+1;
+    InstrOffs = MI->getOperand(ImmIdx).getImm();
+    Scale = 4;
+    break;
+  }
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+
+  return InstrOffs * Scale;
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool ARMBaseRegisterInfo::
+needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
+  for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) {
+    assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!");
+  }
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  case ARM::LDRi12: case ARM::LDRH: case ARM::LDRBi12:
+  case ARM::STRi12: case ARM::STRH: case ARM::STRBi12:
+  case ARM::t2LDRi12: case ARM::t2LDRi8:
+  case ARM::t2STRi12: case ARM::t2STRi8:
+  case ARM::VLDRS: case ARM::VLDRD:
+  case ARM::VSTRS: case ARM::VSTRD:
+  case ARM::tSTRspi: case ARM::tLDRspi:
+    break;
+  default:
+    return false;
+  }
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer,
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all callee-saved registers get pushed. R4-R6
+  // will be earlier than the FP, so we ignore those.
+  // R7, LR
+  int64_t FPOffset = Offset - 8;
+  // ARM and Thumb2 functions also need to consider R8-R11 and D8-D15
+  if (!AFI->isThumbFunction() || !AFI->isThumb1OnlyFunction())
+    FPOffset -= 80;
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset = -Offset;
+  Offset += MFI->getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  //        and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there is a frame pointer, try using it.
+  // The FP is only available if there is no dynamic realignment. We
+  // don't know for sure yet whether we'll need that, so we guess based
+  // on whether there are any local variables that would trigger it.
+  unsigned StackAlign = TFI->getStackAlignment();
+  if (TFI->hasFP(MF) &&
+      !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
+    if (isFrameOffsetLegal(MI, FPOffset))
+      return false;
+  }
+  // If we can reference via the stack pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (!MFI->hasVarSizedObjects() && isFrameOffsetLegal(MI, Offset))
+    return false;
+
+  // The offset likely isn't legal, we want to allocate a virtual base register.
+  return true;
+}
+
+/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to
+/// be a pointer to FrameIdx at the beginning of the basic block.
+void ARMBaseRegisterInfo::
+materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                             unsigned BaseReg, int FrameIdx,
+                             int64_t Offset) const {
+  ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
+  unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri :
+    (AFI->isThumb1OnlyFunction() ? ARM::tADDrSPi : ARM::t2ADDri);
+
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL;                  // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MCInstrDesc &MCID = TII.get(ADDriOpc);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const MachineFunction &MF = *MBB->getParent();
+  MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
+
+  MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+    .addFrameIndex(FrameIdx).addImm(Offset));
+
+  if (!AFI->isThumb1OnlyFunction())
+    AddDefaultCC(MIB);
+}
+
+void
+ARMBaseRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
+                                       unsigned BaseReg, int64_t Offset) const {
+  MachineInstr &MI = *I;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This resolveFrameIndex does not support Thumb1!");
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, i, BaseReg, Off, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
+  }
+  assert (Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                             int64_t Offset) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  unsigned i = 0;
+
+  while (!MI->getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!");
+  }
+
+  // AddrMode4 and AddrMode6 cannot handle any offset.
+  if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+    return Offset == 0;
+
+  unsigned NumBits = 0;
+  unsigned Scale = 1;
+  bool isSigned = true;
+  switch (AddrMode) {
+  case ARMII::AddrModeT2_i8:
+  case ARMII::AddrModeT2_i12:
+    // i8 supports only negative, and i12 supports only positive, so
+    // based on Offset sign, consider the appropriate instruction
+    Scale = 1;
+    if (Offset < 0) {
+      NumBits = 8;
+      Offset = -Offset;
+    } else {
+      NumBits = 12;
+    }
+    break;
+  case ARMII::AddrMode5:
+    // VFP address mode.
+    NumBits = 8;
+    Scale = 4;
+    break;
+  case ARMII::AddrMode_i12:
+  case ARMII::AddrMode2:
+    NumBits = 12;
+    break;
+  case ARMII::AddrMode3:
+    NumBits = 8;
+    break;
+  case ARMII::AddrModeT1_s:
+    NumBits = 5;
+    Scale = 4;
+    isSigned = false;
+    break;
+  default:
+    llvm_unreachable("Unsupported addressing mode!");
+  }
+
+  Offset += getFrameIndexInstrOffset(MI, i);
+  // Make sure the offset is encodable for instructions that scale the
+  // immediate.
+  if ((Offset & (Scale-1)) != 0)
+    return false;
+
+  if (isSigned && Offset < 0)
+    Offset = -Offset;
+
+  unsigned Mask = (1 << NumBits) - 1;
+  if ((unsigned)Offset <= Mask * Scale)
+    return true;
+
+  return false;
+}
+
+void
+ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                         int SPAdj, unsigned FIOperandNum,
+                                         RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const ARMFrameLowering *TFI =
+    static_cast<const ARMFrameLowering*>(MF.getTarget().getFrameLowering());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This eliminateFrameIndex does not support Thumb1!");
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned FrameReg;
+
+  int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
+
+  // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+  // call frame setup/destroy instructions have already been eliminated.  That
+  // means the stack pointer cannot be used to access the emergency spill slot
+  // when !hasReservedCallFrame().
+#ifndef NDEBUG
+  if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
+    assert(TFI->hasReservedCallFrame(MF) &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions without a reserved call frame");
+    assert(!MF.getFrameInfo()->hasVarSizedObjects() &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions with variable sized frame objects");
+  }
+#endif // NDEBUG
+
+  // Special handling of dbg_value instructions.
+  if (MI.isDebugValue()) {
+    MI.getOperand(FIOperandNum).  ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII);
+  }
+  if (Done)
+    return;
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above, handle the rest, providing a register that is
+  // SP+LargeImm.
+  assert((Offset ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) &&
+         "This code isn't needed if offset already handled!");
+
+  unsigned ScratchReg = 0;
+  int PIdx = MI.findFirstPredOperandIdx();
+  ARMCC::CondCodes Pred = (PIdx == -1)
+    ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+  unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+  if (Offset == 0)
+    // Must be addrmode4/6.
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, false);
+  else {
+    ScratchReg = MF.getRegInfo().createVirtualRegister(&ARM::GPRRegClass);
+    if (!AFI->isThumbFunction())
+      emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                              Offset, Pred, PredReg, TII);
+    else {
+      assert(AFI->isThumb2Function());
+      emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                             Offset, Pred, PredReg, TII);
+    }
+    // Update the original instruction to use the scratch register.
+    MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false,true);
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
new file mode 100644
index 000000000000..725033b7e573
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -0,0 +1,178 @@
+//===-- ARMBaseRegisterInfo.h - ARM Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEREGISTERINFO_H
+#define ARMBASEREGISTERINFO_H
+
+#include "ARM.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "ARMGenRegisterInfo.inc"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+  class Type;
+
+/// Register allocation hints.
+namespace ARMRI {
+  enum {
+    RegPairOdd  = 1,
+    RegPairEven = 2
+  };
+}
+
+/// isARMArea1Register - Returns true if the register is a low register (r0-r7)
+/// or a stack/pc register that we should push/pop.
+static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+    case R0:  case R1:  case R2:  case R3:
+    case R4:  case R5:  case R6:  case R7:
+    case LR:  case SP:  case PC:
+      return true;
+    case R8:  case R9:  case R10: case R11:
+      // For iOS we want r7 and lr to be next to each other.
+      return !isIOS;
+    default:
+      return false;
+  }
+}
+
+static inline bool isARMArea2Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+    case R8: case R9: case R10: case R11:
+      // iOS has this second area.
+      return isIOS;
+    default:
+      return false;
+  }
+}
+
+static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
+  using namespace ARM;
+  switch (Reg) {
+    case D15: case D14: case D13: case D12:
+    case D11: case D10: case D9:  case D8:
+      return true;
+    default:
+      return false;
+  }
+}
+
+class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
+protected:
+  const ARMBaseInstrInfo &TII;
+  const ARMSubtarget &STI;
+
+  /// FramePtr - ARM physical register used as frame ptr.
+  unsigned FramePtr;
+
+  /// BasePtr - ARM physical register used as a base ptr in complex stack
+  /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+  /// variable size stack objects.
+  unsigned BasePtr;
+
+  // Can be only subclassed.
+  explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
+                               const ARMSubtarget &STI);
+
+  // Return the opcode that implements 'Op', or 0 if no opcode
+  unsigned getOpcode(int Op) const;
+
+public:
+  /// Code Generation virtual methods...
+  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+  const uint32_t *getNoPreservedMask() const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  const TargetRegisterClass*
+  getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
+  const TargetRegisterClass*
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const;
+
+  void getRegAllocationHints(unsigned VirtReg,
+                             ArrayRef<MCPhysReg> Order,
+                             SmallVectorImpl<MCPhysReg> &Hints,
+                             const MachineFunction &MF,
+                             const VirtRegMap *VRM) const;
+
+  void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+                          MachineFunction &MF) const;
+
+  virtual bool avoidWriteAfterWrite(const TargetRegisterClass *RC) const;
+
+  bool hasBasePointer(const MachineFunction &MF) const;
+
+  bool canRealignStack(const MachineFunction &MF) const;
+  bool needsStackRealignment(const MachineFunction &MF) const;
+  int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const;
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const;
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                    unsigned BaseReg, int FrameIdx,
+                                    int64_t Offset) const;
+  void resolveFrameIndex(MachineBasicBlock::iterator I,
+                         unsigned BaseReg, int64_t Offset) const;
+  bool isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const;
+
+  bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getBaseRegister() const { return BasePtr; }
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+
+  bool isLowRegister(unsigned Reg) const;
+
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+  virtual void emitLoadConstPool(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &MBBI,
+                                 DebugLoc dl,
+                                 unsigned DestReg, unsigned SubIdx,
+                                 int Val,
+                                 ARMCC::CondCodes Pred = ARMCC::AL,
+                                 unsigned PredReg = 0,
+                                 unsigned MIFlags = MachineInstr::NoFlags)const;
+
+  /// Code Generation virtual methods...
+  virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+  virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+
+  virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
+
+  virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const;
+
+  virtual void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                   int SPAdj, unsigned FIOperandNum,
+                                   RegScavenger *RS = NULL) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h
new file mode 100644
index 000000000000..11bd6a4a8dbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMBuildAttrs.h
@@ -0,0 +1,131 @@
+//===-- ARMBuildAttrs.h - ARM Build Attributes ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains enumerations and support routines for ARM build attributes
+// as defined in ARM ABI addenda document (ABI release 2.08).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __TARGET_ARMBUILDATTRS_H__
+#define __TARGET_ARMBUILDATTRS_H__
+
+namespace ARMBuildAttrs {
+  enum SpecialAttr {
+    // This is for the .cpu asm attr. It translates into one or more
+    // AttrType (below) entries in the .ARM.attributes section in the ELF.
+    SEL_CPU 
+  };
+
+  enum AttrType {
+    // Rest correspond to ELF/.ARM.attributes
+    File                      = 1,
+    Section                   = 2,
+    Symbol                    = 3,
+    CPU_raw_name              = 4,
+    CPU_name                  = 5,
+    CPU_arch                  = 6,
+    CPU_arch_profile          = 7,
+    ARM_ISA_use               = 8,
+    THUMB_ISA_use             = 9,
+    VFP_arch                  = 10,
+    WMMX_arch                 = 11,
+    Advanced_SIMD_arch        = 12,
+    PCS_config                = 13,
+    ABI_PCS_R9_use            = 14,
+    ABI_PCS_RW_data           = 15,
+    ABI_PCS_RO_data           = 16,
+    ABI_PCS_GOT_use           = 17,
+    ABI_PCS_wchar_t           = 18,
+    ABI_FP_rounding           = 19,
+    ABI_FP_denormal           = 20,
+    ABI_FP_exceptions         = 21,
+    ABI_FP_user_exceptions    = 22,
+    ABI_FP_number_model       = 23,
+    ABI_align8_needed         = 24,
+    ABI_align8_preserved      = 25,
+    ABI_enum_size             = 26,
+    ABI_HardFP_use            = 27,
+    ABI_VFP_args              = 28,
+    ABI_WMMX_args             = 29,
+    ABI_optimization_goals    = 30,
+    ABI_FP_optimization_goals = 31,
+    compatibility             = 32,
+    CPU_unaligned_access      = 34,
+    VFP_HP_extension          = 36,
+    ABI_FP_16bit_format       = 38,
+    MPextension_use           = 42, // was 70, 2.08 ABI
+    DIV_use                   = 44,
+    nodefaults                = 64,
+    also_compatible_with      = 65,
+    T2EE_use                  = 66,
+    conformance               = 67,
+    Virtualization_use        = 68,
+    MPextension_use_old       = 70
+  };
+
+  // Magic numbers for .ARM.attributes
+  enum AttrMagic {
+    Format_Version  = 0x41
+  };
+
+  // Legal Values for CPU_arch, (=6), uleb128
+  enum CPUArch {
+    Pre_v4       = 0,
+    v4       = 1,   // e.g. SA110
+    v4T      = 2,   // e.g. ARM7TDMI
+    v5T      = 3,   // e.g. ARM9TDMI
+    v5TE     = 4,   // e.g. ARM946E_S
+    v5TEJ    = 5,   // e.g. ARM926EJ_S
+    v6       = 6,   // e.g. ARM1136J_S
+    v6KZ     = 7,   // e.g. ARM1176JZ_S
+    v6T2     = 8,   // e.g. ARM1156T2F_S
+    v6K      = 9,   // e.g. ARM1136J_S
+    v7       = 10,  // e.g. Cortex A8, Cortex M3
+    v6_M     = 11,  // e.g. Cortex M1
+    v6S_M    = 12,  // v6_M with the System extensions
+    v7E_M    = 13   // v7_M with DSP extensions
+  };
+
+  enum CPUArchProfile { // (=7), uleb128 
+    Not_Applicable = 0, // pre v7, or cross-profile code
+    ApplicationProfile = (0x41), // 'A' (e.g. for Cortex A8)
+    RealTimeProfile = (0x52), // 'R' (e.g. for Cortex R4)
+    MicroControllerProfile = (0x4D), // 'M' (e.g. for Cortex M3)
+    SystemProfile = (0x53) // 'S' Application or real-time profile
+  };
+
+  // The following have a lot of common use cases
+  enum { 
+    //ARMISAUse (=8), uleb128  and THUMBISAUse (=9), uleb128
+    Not_Allowed = 0,
+    Allowed = 1,
+
+    // FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
+    AllowFPv2  = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA)
+    AllowFPv3A = 3, // v3 FP ISA permitted (implies use of the v2 FP ISA)
+    AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31 
+    AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA) 
+    AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31
+
+    // Tag_WMMX_arch, (=11), uleb128
+    AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
+    
+    // Tag_WMMX_arch, (=11), uleb128
+    AllowWMMXv1 = 2,  // The user permitted this entity to use WMMX v2
+
+    // Tag_ABI_FP_denormal, (=20), uleb128 
+    PreserveFPSign = 2, // sign when flushed-to-zero is preserved
+
+    // Tag_ABI_FP_number_model, (=23), uleb128
+    AllowRTABI = 2,  // numbers, infinities, and one quiet NaN (see [RTABI])
+    AllowIEE754 = 3 // this code to use all the IEEE 754-defined FP encodings
+  };
+}
+
+#endif // __TARGET_ARMBUILDATTRS_H__
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
new file mode 100644
index 000000000000..e6e8c3d5fac6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h
@@ -0,0 +1,159 @@
+//=== ARMCallingConv.h - ARM Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the ARM Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMCALLINGCONV_H
+#define ARMCALLINGCONV_H
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+// APCS f64 is in register pairs, possibly split to stack
+static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          CCState &State, bool CanFail) {
+  static const uint16_t RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+  // Try to get the first register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else {
+    // For the 2nd half of a v2f64, do not fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 4),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  // Try to get the second register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4, 4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags,
+                                   CCState &State) {
+  if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+// AAPCS f64 is in aligned register pairs
+static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                           CCValAssign::LocInfo &LocInfo,
+                           CCState &State, bool CanFail) {
+  static const uint16_t HiRegList[] = { ARM::R0, ARM::R2 };
+  static const uint16_t LoRegList[] = { ARM::R1, ARM::R3 };
+  static const uint16_t ShadowRegList[] = { ARM::R0, ARM::R1 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2);
+  if (Reg == 0) {
+    // For the 2nd half of a v2f64, do not just fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 8),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  unsigned T = State.AllocateReg(LoRegList[i]);
+  (void)T;
+  assert(T == LoRegList[i] && "Could not allocate register");
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags,
+                                    CCState &State) {
+  if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  if (LocVT == MVT::v2f64 &&
+      !f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false))
+    return false;
+  return true;  // we handled it
+}
+
+static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                         CCValAssign::LocInfo &LocInfo, CCState &State) {
+  static const uint16_t HiRegList[] = { ARM::R0, ARM::R2 };
+  static const uint16_t LoRegList[] = { ARM::R1, ARM::R3 };
+
+  unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
+  if (Reg == 0)
+    return false; // we didn't handle it
+
+  unsigned i;
+  for (i = 0; i < 2; ++i)
+    if (HiRegList[i] == Reg)
+      break;
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[i],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  if (LocVT == MVT::v2f64 && !f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  return true;  // we handled it
+}
+
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                       CCValAssign::LocInfo &LocInfo,
+                                       ISD::ArgFlagsTy &ArgFlags,
+                                       CCState &State) {
+  return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+                                   State);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
new file mode 100644
index 000000000000..b378b9662682
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -0,0 +1,206 @@
+//===-- ARMCallingConv.td - Calling Conventions for ARM ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for ARM architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfAlign - Match of the original alignment of the arg
+class CCIfAlign<string Align, CCAction A>:
+  CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_ARM_APCS : CallingConv<[
+
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+    
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
+
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+  CCIfType<[v2f64], CCAssignToStack<16, 4>>
+]>;
+
+def RetCC_ARM_APCS : CallingConv<[
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
+
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for FastCC (when VFP2 or later is available)
+//===----------------------------------------------------------------------===//
+def FastCC_ARM_APCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<CC_ARM_APCS>
+]>;
+
+def RetFastCC_ARM_APCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<RetCC_ARM_APCS>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention for GHC
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_APCS_GHC : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
+  CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>,
+  CCIfType<[f32], CCAssignToReg<[S16, S17, S18, S19, S20, S21, S22, S23]>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, SpLim
+  CCIfType<[i32], CCAssignToReg<[R4, R5, R6, R7, R8, R9, R10, R11]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention, common parts
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_Common : CallingConv<[
+
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // i64/f64 is passed in even pairs of GPRs
+  // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register
+  // (and the same is true for f64 if VFP is not enabled)
+  CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>,
+  CCIfType<[i32], CCIf<"State.getNextStackOffset() == 0 &&"
+                       "ArgFlags.getOrigAlign() != 8",
+                       CCAssignToReg<[R0, R1, R2, R3]>>>,
+
+  CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, R3>>>,
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[f64], CCAssignToStack<8, 8>>,
+  CCIfType<[v2f64], CCAssignToStack<16, 8>>
+]>;
+
+def RetCC_ARM_AAPCS_Common : CallingConv<[
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS-VFP (EABI) Calling Convention
+// Also used for FastCC (when VFP2 or later is available)
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_VFP : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS_VFP : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
+                                     (sequence "D%u", 15, 8))>;
+
+// iOS ABI deviates from ARM standard ABI. R9 is not a callee-saved register.
+// Also save R7-R4 first to match the stack frame fixed spill areas.
+def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
+
+// GHC set of callee saved regs is empty as all those regs are
+// used for passing STG regs around
+// add is a workaround for not being able to compile empty list:
+// def CSR_GHC : CalleeSavedRegs<()>;
+def CSR_GHC : CalleeSavedRegs<(add)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp
new file mode 100644
index 000000000000..5e8e1739a984
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -0,0 +1,1916 @@
+//===-- ARM/ARMCodeEmitter.cpp - Convert ARM code to machine code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the ARM machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMRelocations.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+  class ARMCodeEmitter : public MachineFunctionPass {
+    ARMJITInfo                *JTI;
+    const ARMBaseInstrInfo    *II;
+    const DataLayout          *TD;
+    const ARMSubtarget        *Subtarget;
+    TargetMachine             &TM;
+    JITCodeEmitter            &MCE;
+    MachineModuleInfo *MMI;
+    const std::vector<MachineConstantPoolEntry> *MCPEs;
+    const std::vector<MachineJumpTableEntry> *MJTEs;
+    bool IsPIC;
+    bool IsThumb;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineModuleInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    static char ID;
+  public:
+    ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+      : MachineFunctionPass(ID), JTI(0),
+        II((const ARMBaseInstrInfo *)tm.getInstrInfo()),
+        TD(tm.getDataLayout()), TM(tm),
+        MCE(mce), MCPEs(0), MJTEs(0),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {}
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+    uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "ARM Machine Code Emitter";
+    }
+
+    void emitInstruction(const MachineInstr &MI);
+
+  private:
+
+    void emitWordLE(unsigned Binary);
+    void emitDWordLE(uint64_t Binary);
+    void emitConstPoolInstruction(const MachineInstr &MI);
+    void emitMOVi32immInstruction(const MachineInstr &MI);
+    void emitMOVi2piecesInstruction(const MachineInstr &MI);
+    void emitLEApcrelJTInstruction(const MachineInstr &MI);
+    void emitPseudoMoveInstruction(const MachineInstr &MI);
+    void addPCLabel(unsigned LabelID);
+    void emitPseudoInstruction(const MachineInstr &MI);
+    unsigned getMachineSoRegOpValue(const MachineInstr &MI,
+                                    const MCInstrDesc &MCID,
+                                    const MachineOperand &MO,
+                                    unsigned OpIdx);
+
+    unsigned getMachineSoImmOpValue(unsigned SoImm);
+    unsigned getAddrModeSBit(const MachineInstr &MI,
+                             const MCInstrDesc &MCID) const;
+
+    void emitDataProcessingInstruction(const MachineInstr &MI,
+                                       unsigned ImplicitRd = 0,
+                                       unsigned ImplicitRn = 0);
+
+    void emitLoadStoreInstruction(const MachineInstr &MI,
+                                  unsigned ImplicitRd = 0,
+                                  unsigned ImplicitRn = 0);
+
+    void emitMiscLoadStoreInstruction(const MachineInstr &MI,
+                                      unsigned ImplicitRn = 0);
+
+    void emitLoadStoreMultipleInstruction(const MachineInstr &MI);
+
+    void emitMulFrmInstruction(const MachineInstr &MI);
+
+    void emitExtendInstruction(const MachineInstr &MI);
+
+    void emitMiscArithInstruction(const MachineInstr &MI);
+
+    void emitSaturateInstruction(const MachineInstr &MI);
+
+    void emitBranchInstruction(const MachineInstr &MI);
+
+    void emitInlineJumpTable(unsigned JTIndex);
+
+    void emitMiscBranchInstruction(const MachineInstr &MI);
+
+    void emitVFPArithInstruction(const MachineInstr &MI);
+
+    void emitVFPConversionInstruction(const MachineInstr &MI);
+
+    void emitVFPLoadStoreInstruction(const MachineInstr &MI);
+
+    void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI);
+
+    void emitNEONLaneInstruction(const MachineInstr &MI);
+    void emitNEONDupInstruction(const MachineInstr &MI);
+    void emitNEON1RegModImmInstruction(const MachineInstr &MI);
+    void emitNEON2RegInstruction(const MachineInstr &MI);
+    void emitNEON3RegInstruction(const MachineInstr &MI);
+
+    /// getMachineOpValue - Return binary encoding of operand. If the machine
+    /// operand requires relocation, record the relocation and return zero.
+    unsigned getMachineOpValue(const MachineInstr &MI,
+                               const MachineOperand &MO) const;
+    unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) const {
+      return getMachineOpValue(MI, MI.getOperand(OpIdx));
+    }
+
+    // FIXME: The legacy JIT ARMCodeEmitter doesn't rely on the the
+    //  TableGen'erated getBinaryCodeForInstr() function to encode any
+    //  operand values, instead querying getMachineOpValue() directly for
+    //  each operand it needs to encode. Thus, any of the new encoder
+    //  helper functions can simply return 0 as the values the return
+    //  are already handled elsewhere. They are placeholders to allow this
+    //  encoder to continue to function until the MC encoder is sufficiently
+    //  far along that this one can be eliminated entirely.
+    unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val)
+      const { return 0; }
+    unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val)
+      const { return 0; }
+    unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val)
+      const { return 0; }
+    unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val)
+      const { return 0; }
+    unsigned getAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBLTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBLXTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBRTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbBCCTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbCBTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getUnconditionalBranchTargetOpValue(const MachineInstr &MI,
+      unsigned Op) const { return 0; }
+    unsigned getARMBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getARMBLTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getARMBLXTargetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getCCOutOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getSOImmOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2SOImmOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getSORegRegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getSORegImmOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getThumbAddrModeRegRegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm12OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm8OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2Imm8s4OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm8s4OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm0_1020s4OpValue(const MachineInstr &MI,unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm8OffsetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeImm12OffsetOpValue(const MachineInstr &MI,unsigned Op)
+      const { return 0; }
+    unsigned getT2AddrModeSORegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2SORegOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getT2AdrLabelOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6OneLane32AddressOpValue(const MachineInstr &MI,
+                                                 unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6DupAddressOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getAddrMode6OffsetOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getBitfieldInvertedMaskOpValue(const MachineInstr &MI,
+                                            unsigned Op) const { return 0; }
+    unsigned getSsatBitPosValue(const MachineInstr &MI,
+                                unsigned Op) const { return 0; }
+    uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const {return 0; }
+    uint32_t getLdStSORegOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0; }
+
+    unsigned getAddrModeImm12OpValue(const MachineInstr &MI, unsigned Op)
+      const {
+      // {17-13} = reg
+      // {12}    = (U)nsigned (add == '1', sub == '0')
+      // {11-0}  = imm12
+      const MachineOperand &MO  = MI.getOperand(Op);
+      const MachineOperand &MO1 = MI.getOperand(Op + 1);
+      if (!MO.isReg()) {
+        emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
+        return 0;
+      }
+      unsigned Reg = II->getRegisterInfo().getEncodingValue(MO.getReg());
+      int32_t Imm12 = MO1.getImm();
+      uint32_t Binary;
+      Binary = Imm12 & 0xfff;
+      if (Imm12 >= 0)
+        Binary |= (1 << 12);
+      Binary |= (Reg << 13);
+      return Binary;
+    }
+
+    unsigned getHiLo16ImmOpValue(const MachineInstr &MI, unsigned Op) const {
+      return 0;
+    }
+
+    uint32_t getAddrMode2OpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getAddrMode2OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getPostIdxRegOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getAddrMode3OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
+      const { return 0;}
+    uint32_t getAddrMode3OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModeThumbSPOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModeSOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModeISOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrModePCOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    uint32_t getAddrMode5OpValue(const MachineInstr &MI, unsigned Op) const {
+      // {17-13} = reg
+      // {12}    = (U)nsigned (add == '1', sub == '0')
+      // {11-0}  = imm12
+      const MachineOperand &MO  = MI.getOperand(Op);
+      const MachineOperand &MO1 = MI.getOperand(Op + 1);
+      if (!MO.isReg()) {
+        emitConstPoolAddress(MO.getIndex(), ARM::reloc_arm_cp_entry);
+        return 0;
+      }
+      unsigned Reg = II->getRegisterInfo().getEncodingValue(MO.getReg());
+      int32_t Imm12 = MO1.getImm();
+
+      // Special value for #-0
+      if (Imm12 == INT32_MIN)
+        Imm12 = 0;
+
+      // Immediate is always encoded as positive. The 'U' bit controls add vs
+      // sub.
+      bool isAdd = true;
+      if (Imm12 < 0) {
+        Imm12 = -Imm12;
+        isAdd = false;
+      }
+
+      uint32_t Binary = Imm12 & 0xfff;
+      if (isAdd)
+        Binary |= (1 << 12);
+      Binary |= (Reg << 13);
+      return Binary;
+    }
+    unsigned getNEONVcvtImm32OpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+
+    unsigned getRegisterListOpValue(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+
+    unsigned getShiftRight8Imm(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getShiftRight16Imm(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getShiftRight32Imm(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+    unsigned getShiftRight64Imm(const MachineInstr &MI, unsigned Op)
+      const { return 0; }
+
+    /// getMovi32Value - Return binary encoding of operand for movw/movt. If the
+    /// machine operand requires relocation, record the relocation and return
+    /// zero.
+    unsigned getMovi32Value(const MachineInstr &MI,const MachineOperand &MO,
+                            unsigned Reloc);
+
+    /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+    ///
+    unsigned getShiftOp(unsigned Imm) const ;
+
+    /// Routines that handle operands which add machine relocations which are
+    /// fixed up by the relocation stage.
+    void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                           bool MayNeedFarStub,  bool Indirect,
+                           intptr_t ACPV = 0) const;
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+    void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
+    void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc,
+                               intptr_t JTBase = 0) const;
+    unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeNEONRd(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeNEONRn(const MachineInstr &MI, unsigned OpIdx) const;
+    unsigned encodeNEONRm(const MachineInstr &MI, unsigned OpIdx) const;
+  };
+}
+
+char ARMCodeEmitter::ID = 0;
+
+/// createARMJITCodeEmitterPass - Return a pass that emits the collected ARM
+/// code to the specified MCE object.
+FunctionPass *llvm::createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  return new ARMCodeEmitter(TM, JCE);
+}
+
+bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  TargetMachine &Target = const_cast<TargetMachine&>(MF.getTarget());
+
+  assert((Target.getRelocationModel() != Reloc::Default ||
+          Target.getRelocationModel() != Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+
+  JTI = static_cast<ARMJITInfo*>(Target.getJITInfo());
+  II = static_cast<const ARMBaseInstrInfo*>(Target.getInstrInfo());
+  TD = Target.getDataLayout();
+
+  Subtarget = &TM.getSubtarget<ARMSubtarget>();
+  MCPEs = &MF.getConstantPool()->getConstants();
+  MJTEs = 0;
+  if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
+  IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+  IsThumb = MF.getInfo<ARMFunctionInfo>()->isThumbFunction();
+  JTI->Initialize(MF, IsPIC);
+  MMI = &getAnalysis<MachineModuleInfo>();
+  MCE.setModuleInfo(MMI);
+
+  do {
+    DEBUG(errs() << "JITTing function '"
+          << MF.getName() << "'\n");
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+         MBB != E; ++MBB) {
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I)
+        emitInstruction(*I);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+/// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+///
+unsigned ARMCodeEmitter::getShiftOp(unsigned Imm) const {
+  switch (ARM_AM::getAM2ShiftOpc(Imm)) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::asr: return 2;
+  case ARM_AM::lsl: return 0;
+  case ARM_AM::lsr: return 1;
+  case ARM_AM::ror:
+  case ARM_AM::rrx: return 3;
+  }
+}
+
+/// getMovi32Value - Return binary encoding of operand for movw/movt. If the
+/// machine operand requires relocation, record the relocation and return zero.
+unsigned ARMCodeEmitter::getMovi32Value(const MachineInstr &MI,
+                                        const MachineOperand &MO,
+                                        unsigned Reloc) {
+  assert(((Reloc == ARM::reloc_arm_movt) || (Reloc == ARM::reloc_arm_movw))
+      && "Relocation to this function should be for movt or movw");
+
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), Reloc, true, false);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), Reloc);
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), Reloc);
+  else {
+#ifndef NDEBUG
+    errs() << MO;
+#endif
+    llvm_unreachable("Unsupported operand type for movw/movt");
+  }
+  return 0;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                           const MachineOperand &MO) const {
+  if (MO.isReg())
+    return II->getRegisterInfo().getEncodingValue(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch);
+  else if (MO.isCPI()) {
+    const MCInstrDesc &MCID = MI.getDesc();
+    // For VFP load, the immediate offset is multiplied by 4.
+    unsigned Reloc =  ((MCID.TSFlags & ARMII::FormMask) == ARMII::VFPLdStFrm)
+      ? ARM::reloc_arm_vfp_cp_entry : ARM::reloc_arm_cp_entry;
+    emitConstPoolAddress(MO.getIndex(), Reloc);
+  } else if (MO.isJTI())
+    emitJumpTableAddress(MO.getIndex(), ARM::reloc_arm_relative);
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch);
+  else
+    llvm_unreachable("Unable to encode MachineOperand!");
+  return 0;
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream.
+///
+void ARMCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                                       bool MayNeedFarStub, bool Indirect,
+                                       intptr_t ACPV) const {
+  MachineRelocation MR = Indirect
+    ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+                                           const_cast<GlobalValue *>(GV),
+                                           ACPV, MayNeedFarStub)
+    : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                               const_cast<GlobalValue *>(GV), ACPV,
+                               MayNeedFarStub);
+  MCE.addRelocation(MR);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void ARMCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES));
+}
+
+/// emitConstPoolAddress - Arrange for the address of an constant pool
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+  // Tell JIT emitter we'll resolve the address.
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, 0, true));
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void ARMCodeEmitter::
+emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTIndex, 0, true));
+}
+
+/// emitMachineBasicBlock - Emit the specified address basic block.
+void ARMCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+                                           unsigned Reloc,
+                                           intptr_t JTBase) const {
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             Reloc, BB, JTBase));
+}
+
+void ARMCodeEmitter::emitWordLE(unsigned Binary) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Binary) << "\n");
+  MCE.emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitDWordLE(uint64_t Binary) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Binary) << "\n");
+  MCE.emitDWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) {
+  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI);
+
+  MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+  ++NumEmitted;  // Keep track of the # of mi's emitted
+  switch (MI.getDesc().TSFlags & ARMII::FormMask) {
+  default: {
+    llvm_unreachable("Unhandled instruction encoding format!");
+  }
+  case ARMII::MiscFrm:
+    if (MI.getOpcode() == ARM::LEApcrelJT) {
+      // Materialize jumptable address.
+      emitLEApcrelJTInstruction(MI);
+      break;
+    }
+    llvm_unreachable("Unhandled instruction encoding!");
+  case ARMII::Pseudo:
+    emitPseudoInstruction(MI);
+    break;
+  case ARMII::DPFrm:
+  case ARMII::DPSoRegFrm:
+    emitDataProcessingInstruction(MI);
+    break;
+  case ARMII::LdFrm:
+  case ARMII::StFrm:
+    emitLoadStoreInstruction(MI);
+    break;
+  case ARMII::LdMiscFrm:
+  case ARMII::StMiscFrm:
+    emitMiscLoadStoreInstruction(MI);
+    break;
+  case ARMII::LdStMulFrm:
+    emitLoadStoreMultipleInstruction(MI);
+    break;
+  case ARMII::MulFrm:
+    emitMulFrmInstruction(MI);
+    break;
+  case ARMII::ExtFrm:
+    emitExtendInstruction(MI);
+    break;
+  case ARMII::ArithMiscFrm:
+    emitMiscArithInstruction(MI);
+    break;
+  case ARMII::SatFrm:
+    emitSaturateInstruction(MI);
+    break;
+  case ARMII::BrFrm:
+    emitBranchInstruction(MI);
+    break;
+  case ARMII::BrMiscFrm:
+    emitMiscBranchInstruction(MI);
+    break;
+  // VFP instructions.
+  case ARMII::VFPUnaryFrm:
+  case ARMII::VFPBinaryFrm:
+    emitVFPArithInstruction(MI);
+    break;
+  case ARMII::VFPConv1Frm:
+  case ARMII::VFPConv2Frm:
+  case ARMII::VFPConv3Frm:
+  case ARMII::VFPConv4Frm:
+  case ARMII::VFPConv5Frm:
+    emitVFPConversionInstruction(MI);
+    break;
+  case ARMII::VFPLdStFrm:
+    emitVFPLoadStoreInstruction(MI);
+    break;
+  case ARMII::VFPLdStMulFrm:
+    emitVFPLoadStoreMultipleInstruction(MI);
+    break;
+
+  // NEON instructions.
+  case ARMII::NGetLnFrm:
+  case ARMII::NSetLnFrm:
+    emitNEONLaneInstruction(MI);
+    break;
+  case ARMII::NDupFrm:
+    emitNEONDupInstruction(MI);
+    break;
+  case ARMII::N1RegModImmFrm:
+    emitNEON1RegModImmInstruction(MI);
+    break;
+  case ARMII::N2RegFrm:
+    emitNEON2RegInstruction(MI);
+    break;
+  case ARMII::N3RegFrm:
+    emitNEON3RegInstruction(MI);
+    break;
+  }
+  MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
+
+void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) {
+  unsigned CPI = MI.getOperand(0).getImm();       // CP instruction index.
+  unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index.
+  const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex];
+
+  // Remember the CONSTPOOL_ENTRY address for later relocation.
+  JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue());
+
+  // Emit constpool island entry. In most cases, the actual values will be
+  // resolved and relocated after code emission.
+  if (MCPE.isMachineConstantPoolEntry()) {
+    ARMConstantPoolValue *ACPV =
+      static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
+
+    DEBUG(errs() << "  ** ARM constant pool #" << CPI << " @ "
+          << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n');
+
+    assert(ACPV->isGlobalValue() && "unsupported constant pool value");
+    const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
+    if (GV) {
+      Reloc::Model RelocM = TM.getRelocationModel();
+      emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
+                        isa<Function>(GV),
+                        Subtarget->GVIsIndirectSymbol(GV, RelocM),
+                        (intptr_t)ACPV);
+    } else  {
+      const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+      emitExternalSymbolAddress(Sym, ARM::reloc_arm_absolute);
+    }
+    emitWordLE(0);
+  } else {
+    const Constant *CV = MCPE.Val.ConstVal;
+
+    DEBUG({
+        errs() << "  ** Constant pool #" << CPI << " @ "
+               << (void*)MCE.getCurrentPCValue() << " ";
+        if (const Function *F = dyn_cast<Function>(CV))
+          errs() << F->getName();
+        else
+          errs() << *CV;
+        errs() << '\n';
+      });
+
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+      emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false);
+      emitWordLE(0);
+    } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+      uint32_t Val = uint32_t(*CI->getValue().getRawData());
+      emitWordLE(Val);
+    } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+      if (CFP->getType()->isFloatTy())
+        emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+      else if (CFP->getType()->isDoubleTy())
+        emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+      else {
+        llvm_unreachable("Unable to handle this constantpool entry!");
+      }
+    } else {
+      llvm_unreachable("Unable to handle this constantpool entry!");
+    }
+  }
+}
+
+void ARMCodeEmitter::emitMOVi32immInstruction(const MachineInstr &MI) {
+  const MachineOperand &MO0 = MI.getOperand(0);
+  const MachineOperand &MO1 = MI.getOperand(1);
+
+  // Emit the 'movw' instruction.
+  unsigned Binary = 0x30 << 20;  // mov: Insts{27-20} = 0b00110000
+
+  unsigned Lo16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movw) & 0xFFFF;
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode imm16 as imm4:imm12
+  Binary |= Lo16 & 0xFFF; // Insts{11-0} = imm12
+  Binary |= ((Lo16 >> 12) & 0xF) << 16; // Insts{19-16} = imm4
+  emitWordLE(Binary);
+
+  unsigned Hi16 = getMovi32Value(MI, MO1, ARM::reloc_arm_movt) >> 16;
+  // Emit the 'movt' instruction.
+  Binary = 0x34 << 20; // movt: Insts{27-20} = 0b00110100
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode imm16 as imm4:imm1, same as movw above.
+  Binary |= Hi16 & 0xFFF;
+  Binary |= ((Hi16 >> 12) & 0xF) << 16;
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMOVi2piecesInstruction(const MachineInstr &MI) {
+  const MachineOperand &MO0 = MI.getOperand(0);
+  const MachineOperand &MO1 = MI.getOperand(1);
+  assert(MO1.isImm() && ARM_AM::isSOImmTwoPartVal(MO1.getImm()) &&
+                                                  "Not a valid so_imm value!");
+  unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm());
+  unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm());
+
+  // Emit the 'mov' instruction.
+  unsigned Binary = 0xd << 21;  // mov: Insts{24-21} = 0b1101
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode so_imm.
+  // Set bit I(25) to identify this is the immediate form of <shifter_op>
+  Binary |= 1 << ARMII::I_BitShift;
+  Binary |= getMachineSoImmOpValue(V1);
+  emitWordLE(Binary);
+
+  // Now the 'orr' instruction.
+  Binary = 0xc << 21;  // orr: Insts{24-21} = 0b1100
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode Rn.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRnShift;
+
+  // Encode so_imm.
+  // Set bit I(25) to identify this is the immediate form of <shifter_op>
+  Binary |= 1 << ARMII::I_BitShift;
+  Binary |= getMachineSoImmOpValue(V2);
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) {
+  // It's basically add r, pc, (LJTI - $+8)
+
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Emit the 'add' instruction.
+  unsigned Binary = 0x4 << 21;  // add: Insts{24-21} = 0b0100
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  Binary |= getAddrModeSBit(MI, MCID);
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+  // Encode Rn which is PC.
+  Binary |= II->getRegisterInfo().getEncodingValue(ARM::PC) << ARMII::RegRnShift;
+
+  // Encode the displacement.
+  Binary |= 1 << ARMII::I_BitShift;
+  emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitPseudoMoveInstruction(const MachineInstr &MI) {
+  unsigned Opcode = MI.getDesc().Opcode;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  if (Opcode == ARM::MOVsrl_flag || Opcode == ARM::MOVsra_flag)
+    Binary |= 1 << ARMII::S_BitShift;
+
+  // Encode register def if there is one.
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+  // Encode the shift operation.
+  switch (Opcode) {
+  default: break;
+  case ARM::RRX:
+    // rrx
+    Binary |= 0x6 << 4;
+    break;
+  case ARM::MOVsrl_flag:
+    // lsr #1
+    Binary |= (0x2 << 4) | (1 << 7);
+    break;
+  case ARM::MOVsra_flag:
+    // asr #1
+    Binary |= (0x4 << 4) | (1 << 7);
+    break;
+  }
+
+  // Encode register Rm.
+  Binary |= getMachineOpValue(MI, 1);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::addPCLabel(unsigned LabelID) {
+  DEBUG(errs() << "  ** LPC" << LabelID << " @ "
+        << (void*)MCE.getCurrentPCValue() << '\n');
+  JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue());
+}
+
+void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
+  unsigned Opcode = MI.getDesc().Opcode;
+  switch (Opcode) {
+  default:
+    llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction");
+  case ARM::BX_CALL:
+  case ARM::BMOVPCRX_CALL: {
+    // First emit mov lr, pc
+    unsigned Binary = 0x01a0e00f;
+    Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+    emitWordLE(Binary);
+
+    // and then emit the branch.
+    emitMiscBranchInstruction(MI);
+    break;
+  }
+  case TargetOpcode::INLINEASM: {
+    // We allow inline assembler nodes with empty bodies - they can
+    // implicitly define registers, which is ok for JIT.
+    if (MI.getOperand(0).getSymbolName()[0]) {
+      report_fatal_error("JIT does not support inline asm!");
+    }
+    break;
+  }
+  case TargetOpcode::PROLOG_LABEL:
+  case TargetOpcode::EH_LABEL:
+    MCE.emitLabel(MI.getOperand(0).getMCSymbol());
+    break;
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+    // Do nothing.
+    break;
+  case ARM::CONSTPOOL_ENTRY:
+    emitConstPoolInstruction(MI);
+    break;
+  case ARM::PICADD: {
+    // Remember of the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // PICADD is just an add instruction that implicitly read pc.
+    emitDataProcessingInstruction(MI, 0, ARM::PC);
+    break;
+  }
+  case ARM::PICLDR:
+  case ARM::PICLDRB:
+  case ARM::PICSTR:
+  case ARM::PICSTRB: {
+    // Remember of the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // These are just load / store instructions that implicitly read pc.
+    emitLoadStoreInstruction(MI, 0, ARM::PC);
+    break;
+  }
+  case ARM::PICLDRH:
+  case ARM::PICLDRSH:
+  case ARM::PICLDRSB:
+  case ARM::PICSTRH: {
+    // Remember of the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // These are just load / store instructions that implicitly read pc.
+    emitMiscLoadStoreInstruction(MI, ARM::PC);
+    break;
+  }
+
+  case ARM::MOVi32imm:
+    // Two instructions to materialize a constant.
+    if (Subtarget->hasV6T2Ops())
+      emitMOVi32immInstruction(MI);
+    else
+      emitMOVi2piecesInstruction(MI);
+    break;
+
+  case ARM::LEApcrelJT:
+    // Materialize jumptable address.
+    emitLEApcrelJTInstruction(MI);
+    break;
+  case ARM::RRX:
+  case ARM::MOVsrl_flag:
+  case ARM::MOVsra_flag:
+    emitPseudoMoveInstruction(MI);
+    break;
+  }
+}
+
+unsigned ARMCodeEmitter::getMachineSoRegOpValue(const MachineInstr &MI,
+                                                const MCInstrDesc &MCID,
+                                                const MachineOperand &MO,
+                                                unsigned OpIdx) {
+  unsigned Binary = getMachineOpValue(MI, MO);
+
+  const MachineOperand &MO1 = MI.getOperand(OpIdx + 1);
+  const MachineOperand &MO2 = MI.getOperand(OpIdx + 2);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  unsigned Rs = MO1.getReg();
+  if (Rs) {
+    // Set shift operand (bit[7:4]).
+    // LSL - 0001
+    // LSR - 0011
+    // ASR - 0101
+    // ROR - 0111
+    // RRX - 0110 and bit[11:8] clear.
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x1; break;
+    case ARM_AM::lsr: SBits = 0x3; break;
+    case ARM_AM::asr: SBits = 0x5; break;
+    case ARM_AM::ror: SBits = 0x7; break;
+    case ARM_AM::rrx: SBits = 0x6; break;
+    }
+  } else {
+    // Set shift operand (bit[6:4]).
+    // LSL - 000
+    // LSR - 010
+    // ASR - 100
+    // ROR - 110
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x0; break;
+    case ARM_AM::lsr: SBits = 0x2; break;
+    case ARM_AM::asr: SBits = 0x4; break;
+    case ARM_AM::ror: SBits = 0x6; break;
+    }
+  }
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode the shift operation Rs or shift_imm (except rrx).
+  if (Rs) {
+    // Encode Rs bit[11:8].
+    assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+    return Binary | (II->getRegisterInfo().getEncodingValue(Rs) << ARMII::RegRsShift);
+  }
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7;
+}
+
+unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) {
+  int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+  assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+  // Encode rotate_imm.
+  unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+    << ARMII::SoRotImmShift;
+
+  // Encode immed_8.
+  Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI,
+                                         const MCInstrDesc &MCID) const {
+  for (unsigned i = MI.getNumOperands(), e = MCID.getNumOperands(); i >= e;--i){
+    const MachineOperand &MO = MI.getOperand(i-1);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)
+      return 1 << ARMII::S_BitShift;
+  }
+  return 0;
+}
+
+void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
+                                                   unsigned ImplicitRd,
+                                                   unsigned ImplicitRn) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  Binary |= getAddrModeSBit(MI, MCID);
+
+  // Encode register def if there is one.
+  unsigned NumDefs = MCID.getNumDefs();
+  unsigned OpIdx = 0;
+  if (NumDefs)
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+  else if (ImplicitRd)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRd) << ARMII::RegRdShift);
+
+  if (MCID.Opcode == ARM::MOVi16) {
+      // Get immediate from MI.
+      unsigned Lo16 = getMovi32Value(MI, MI.getOperand(OpIdx),
+                      ARM::reloc_arm_movw);
+      // Encode imm which is the same as in emitMOVi32immInstruction().
+      Binary |= Lo16 & 0xFFF;
+      Binary |= ((Lo16 >> 12) & 0xF) << 16;
+      emitWordLE(Binary);
+      return;
+  } else if(MCID.Opcode == ARM::MOVTi16) {
+      unsigned Hi16 = (getMovi32Value(MI, MI.getOperand(OpIdx),
+                       ARM::reloc_arm_movt) >> 16);
+      Binary |= Hi16 & 0xFFF;
+      Binary |= ((Hi16 >> 12) & 0xF) << 16;
+      emitWordLE(Binary);
+      return;
+  } else if ((MCID.Opcode == ARM::BFC) || (MCID.Opcode == ARM::BFI)) {
+      uint32_t v = ~MI.getOperand(2).getImm();
+      int32_t lsb = CountTrailingZeros_32(v);
+      int32_t msb = (32 - CountLeadingZeros_32(v)) - 1;
+      // Instr{20-16} = msb, Instr{11-7} = lsb
+      Binary |= (msb & 0x1F) << 16;
+      Binary |= (lsb & 0x1F) << 7;
+      emitWordLE(Binary);
+      return;
+  } else if ((MCID.Opcode == ARM::UBFX) || (MCID.Opcode == ARM::SBFX)) {
+      // Encode Rn in Instr{0-3}
+      Binary |= getMachineOpValue(MI, OpIdx++);
+
+      uint32_t lsb = MI.getOperand(OpIdx++).getImm();
+      uint32_t widthm1 = MI.getOperand(OpIdx++).getImm() - 1;
+
+      // Instr{20-16} = widthm1, Instr{11-7} = lsb
+      Binary |= (widthm1 & 0x1F) << 16;
+      Binary |= (lsb & 0x1F) << 7;
+      emitWordLE(Binary);
+      return;
+  }
+
+  // If this is a two-address operand, skip it. e.g. MOVCCr operand 1.
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  // Encode first non-shifter register operand if there is one.
+  bool isUnary = MCID.TSFlags & ARMII::UnaryDP;
+  if (!isUnary) {
+    if (ImplicitRn)
+      // Special handling for implicit use (e.g. PC).
+      Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
+    else {
+      Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift;
+      ++OpIdx;
+    }
+  }
+
+  // Encode shifter operand.
+  const MachineOperand &MO = MI.getOperand(OpIdx);
+  if ((MCID.TSFlags & ARMII::FormMask) == ARMII::DPSoRegFrm) {
+    // Encode SoReg.
+    emitWordLE(Binary | getMachineSoRegOpValue(MI, MCID, MO, OpIdx));
+    return;
+  }
+
+  if (MO.isReg()) {
+    // Encode register Rm.
+    emitWordLE(Binary | II->getRegisterInfo().getEncodingValue(MO.getReg()));
+    return;
+  }
+
+  // Encode so_imm.
+  Binary |= getMachineSoImmOpValue((unsigned)MO.getImm());
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitLoadStoreInstruction(const MachineInstr &MI,
+                                              unsigned ImplicitRd,
+                                              unsigned ImplicitRn) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Form = MCID.TSFlags & ARMII::FormMask;
+  bool IsPrePost = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // If this is an LDRi12, STRi12 or LDRcp, nothing more needs be done.
+  if (MI.getOpcode() == ARM::LDRi12 || MI.getOpcode() == ARM::LDRcp ||
+      MI.getOpcode() == ARM::STRi12) {
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Operand 0 of a pre- and post-indexed store is the address base
+  // writeback. Skip it.
+  bool Skipped = false;
+  if (IsPrePost && Form == ARMII::StFrm) {
+    ++OpIdx;
+    Skipped = true;
+  }
+
+  // Set first operand
+  if (ImplicitRd)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRd) << ARMII::RegRdShift);
+  else
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  // Set second operand
+  if (ImplicitRn)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
+  else
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // If this is a two-address operand, skip it. e.g. LDR_PRE.
+  if (!Skipped && MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  const MachineOperand &MO2 = MI.getOperand(OpIdx);
+  unsigned AM2Opc = (ImplicitRn == ARM::PC)
+    ? 0 : MI.getOperand(OpIdx+1).getImm();
+
+  // Set bit U(23) according to sign of immed value (positive or negative).
+  Binary |= ((ARM_AM::getAM2Op(AM2Opc) == ARM_AM::add ? 1 : 0) <<
+             ARMII::U_BitShift);
+  if (!MO2.getReg()) { // is immediate
+    if (ARM_AM::getAM2Offset(AM2Opc))
+      // Set the value of offset_12 field
+      Binary |= ARM_AM::getAM2Offset(AM2Opc);
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Set bit I(25), because this is not in immediate encoding.
+  Binary |= 1 << ARMII::I_BitShift;
+  assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
+  // Set bit[3:0] to the corresponding Rm register
+  Binary |= II->getRegisterInfo().getEncodingValue(MO2.getReg());
+
+  // If this instr is in scaled register offset/index instruction, set
+  // shift_immed(bit[11:7]) and shift(bit[6:5]) fields.
+  if (unsigned ShImm = ARM_AM::getAM2Offset(AM2Opc)) {
+    Binary |= getShiftOp(AM2Opc) << ARMII::ShiftImmShift;  // shift
+    Binary |= ShImm              << ARMII::ShiftShift;     // shift_immed
+  }
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI,
+                                                  unsigned ImplicitRn) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Form = MCID.TSFlags & ARMII::FormMask;
+  bool IsPrePost = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Operand 0 of a pre- and post-indexed store is the address base
+  // writeback. Skip it.
+  bool Skipped = false;
+  if (IsPrePost && Form == ARMII::StMiscFrm) {
+    ++OpIdx;
+    Skipped = true;
+  }
+
+  // Set first operand
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  // Skip LDRD and STRD's second operand.
+  if (MCID.Opcode == ARM::LDRD || MCID.Opcode == ARM::STRD)
+    ++OpIdx;
+
+  // Set second operand
+  if (ImplicitRn)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (II->getRegisterInfo().getEncodingValue(ImplicitRn) << ARMII::RegRnShift);
+  else
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // If this is a two-address operand, skip it. e.g. LDRH_POST.
+  if (!Skipped && MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  const MachineOperand &MO2 = MI.getOperand(OpIdx);
+  unsigned AM3Opc = (ImplicitRn == ARM::PC)
+    ? 0 : MI.getOperand(OpIdx+1).getImm();
+
+  // Set bit U(23) according to sign of immed value (positive or negative)
+  Binary |= ((ARM_AM::getAM3Op(AM3Opc) == ARM_AM::add ? 1 : 0) <<
+             ARMII::U_BitShift);
+
+  // If this instr is in register offset/index encoding, set bit[3:0]
+  // to the corresponding Rm register.
+  if (MO2.getReg()) {
+    Binary |= II->getRegisterInfo().getEncodingValue(MO2.getReg());
+    emitWordLE(Binary);
+    return;
+  }
+
+  // This instr is in immediate offset/index encoding, set bit 22 to 1.
+  Binary |= 1 << ARMII::AM3_I_BitShift;
+  if (unsigned ImmOffs = ARM_AM::getAM3Offset(AM3Opc)) {
+    // Set operands
+    Binary |= (ImmOffs >> 4) << ARMII::ImmHiShift;  // immedH
+    Binary |= (ImmOffs & 0xF);                      // immedL
+  }
+
+  emitWordLE(Binary);
+}
+
+static unsigned getAddrModeUPBits(unsigned Mode) {
+  unsigned Binary = 0;
+
+  // Set addressing mode by modifying bits U(23) and P(24)
+  // IA - Increment after  - bit U = 1 and bit P = 0
+  // IB - Increment before - bit U = 1 and bit P = 1
+  // DA - Decrement after  - bit U = 0 and bit P = 0
+  // DB - Decrement before - bit U = 0 and bit P = 1
+  switch (Mode) {
+  default: llvm_unreachable("Unknown addressing sub-mode!");
+  case ARM_AM::da:                                     break;
+  case ARM_AM::db: Binary |= 0x1 << ARMII::P_BitShift; break;
+  case ARM_AM::ia: Binary |= 0x1 << ARMII::U_BitShift; break;
+  case ARM_AM::ib: Binary |= 0x3 << ARMII::U_BitShift; break;
+  }
+
+  return Binary;
+}
+
+void ARMCodeEmitter::emitLoadStoreMultipleInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  bool IsUpdating = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Skip operand 0 of an instruction with base register update.
+  unsigned OpIdx = 0;
+  if (IsUpdating)
+    ++OpIdx;
+
+  // Set base address operand
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // Set addressing mode by modifying bits U(23) and P(24)
+  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode());
+  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode));
+
+  // Set bit W(21)
+  if (IsUpdating)
+    Binary |= 0x1 << ARMII::W_BitShift;
+
+  // Set registers
+  for (unsigned i = OpIdx+2, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      break;
+    unsigned RegNum = II->getRegisterInfo().getEncodingValue(MO.getReg());
+    assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+           RegNum < 16);
+    Binary |= 0x1 << RegNum;
+  }
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMulFrmInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  Binary |= getAddrModeSBit(MI, MCID);
+
+  // 32x32->64bit operations have two destination registers. The number
+  // of register definitions will tell us if that's what we're dealing with.
+  unsigned OpIdx = 0;
+  if (MCID.getNumDefs() == 2)
+    Binary |= getMachineOpValue (MI, OpIdx++) << ARMII::RegRdLoShift;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdHiShift;
+
+  // Encode Rm
+  Binary |= getMachineOpValue(MI, OpIdx++);
+
+  // Encode Rs
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRsShift;
+
+  // Many multiple instructions (e.g. MLA) have three src operands. Encode
+  // it as Rn (for multiply, that's in the same offset as RdLo.
+  if (MCID.getNumOperands() > OpIdx &&
+      !MCID.OpInfo[OpIdx].isPredicate() &&
+      !MCID.OpInfo[OpIdx].isOptionalDef())
+    Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRdLoShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitExtendInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  const MachineOperand &MO1 = MI.getOperand(OpIdx++);
+  const MachineOperand &MO2 = MI.getOperand(OpIdx);
+  if (MO2.isReg()) {
+    // Two register operand form.
+    // Encode Rn.
+    Binary |= getMachineOpValue(MI, MO1) << ARMII::RegRnShift;
+
+    // Encode Rm.
+    Binary |= getMachineOpValue(MI, MO2);
+    ++OpIdx;
+  } else {
+    Binary |= getMachineOpValue(MI, MO1);
+  }
+
+  // Encode rot imm (0, 8, 16, or 24) if it has a rotate immediate operand.
+  if (MI.getOperand(OpIdx).isImm() &&
+      !MCID.OpInfo[OpIdx].isPredicate() &&
+      !MCID.OpInfo[OpIdx].isOptionalDef())
+    Binary |= (getMachineOpValue(MI, OpIdx) / 8) << ARMII::ExtRotImmShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // PKH instructions are finished at this point
+  if (MCID.Opcode == ARM::PKHBT || MCID.Opcode == ARM::PKHTB) {
+    emitWordLE(Binary);
+    return;
+  }
+
+  unsigned OpIdx = 0;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  const MachineOperand &MO = MI.getOperand(OpIdx++);
+  if (OpIdx == MCID.getNumOperands() ||
+      MCID.OpInfo[OpIdx].isPredicate() ||
+      MCID.OpInfo[OpIdx].isOptionalDef()) {
+    // Encode Rm and it's done.
+    Binary |= getMachineOpValue(MI, MO);
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Encode Rn.
+  Binary |= getMachineOpValue(MI, MO) << ARMII::RegRnShift;
+
+  // Encode Rm.
+  Binary |= getMachineOpValue(MI, OpIdx++);
+
+  // Encode shift_imm.
+  unsigned ShiftAmt = MI.getOperand(OpIdx).getImm();
+  if (MCID.Opcode == ARM::PKHTB) {
+    assert(ShiftAmt != 0 && "PKHTB shift_imm is 0!");
+    if (ShiftAmt == 32)
+      ShiftAmt = 0;
+  }
+  assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
+  Binary |= ShiftAmt << ARMII::ShiftShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitSaturateInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+  // Encode saturate bit position.
+  unsigned Pos = MI.getOperand(1).getImm();
+  if (MCID.Opcode == ARM::SSAT || MCID.Opcode == ARM::SSAT16)
+    Pos -= 1;
+  assert((Pos < 16 || (Pos < 32 &&
+                       MCID.Opcode != ARM::SSAT16 &&
+                       MCID.Opcode != ARM::USAT16)) &&
+         "saturate bit position out of range");
+  Binary |= Pos << 16;
+
+  // Encode Rm
+  Binary |= getMachineOpValue(MI, 2);
+
+  // Encode shift_imm.
+  if (MCID.getNumOperands() == 4) {
+    unsigned ShiftOp = MI.getOperand(3).getImm();
+    ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp);
+    if (Opc == ARM_AM::asr)
+      Binary |= (1 << 6);
+    unsigned ShiftAmt = MI.getOperand(3).getImm();
+    if (ShiftAmt == 32 && Opc == ARM_AM::asr)
+      ShiftAmt = 0;
+    assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
+    Binary |= ShiftAmt << ARMII::ShiftShift;
+  }
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitBranchInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  if (MCID.Opcode == ARM::TPsoft) {
+    llvm_unreachable("ARM::TPsoft FIXME"); // FIXME
+  }
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Set signed_immed_24 field
+  Binary |= getMachineOpValue(MI, 0);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitInlineJumpTable(unsigned JTIndex) {
+  // Remember the base address of the inline jump table.
+  uintptr_t JTBase = MCE.getCurrentPCValue();
+  JTI->addJumpTableBaseAddr(JTIndex, JTBase);
+  DEBUG(errs() << "  ** Jump Table #" << JTIndex << " @ " << (void*)JTBase
+               << '\n');
+
+  // Now emit the jump table entries.
+  const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs;
+  for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
+    if (IsPIC)
+      // DestBB address - JT base.
+      emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_pic_jt, JTBase);
+    else
+      // Absolute DestBB address.
+      emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_absolute);
+    emitWordLE(0);
+  }
+}
+
+void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Handle jump tables.
+  if (MCID.Opcode == ARM::BR_JTr || MCID.Opcode == ARM::BR_JTadd) {
+    // First emit a ldr pc, [] instruction.
+    emitDataProcessingInstruction(MI, ARM::PC);
+
+    // Then emit the inline jump table.
+    unsigned JTIndex =
+      (MCID.Opcode == ARM::BR_JTr)
+      ? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex();
+    emitInlineJumpTable(JTIndex);
+    return;
+  } else if (MCID.Opcode == ARM::BR_JTm) {
+    // First emit a ldr pc, [] instruction.
+    emitLoadStoreInstruction(MI, ARM::PC);
+
+    // Then emit the inline jump table.
+    emitInlineJumpTable(MI.getOperand(3).getIndex());
+    return;
+  }
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  if (MCID.Opcode == ARM::BX_RET || MCID.Opcode == ARM::MOVPCLR)
+    // The return register is LR.
+    Binary |= II->getRegisterInfo().getEncodingValue(ARM::LR);
+  else
+    // otherwise, set the return register
+    Binary |= getMachineOpValue(MI, 0);
+
+  emitWordLE(Binary);
+}
+
+unsigned ARMCodeEmitter::encodeVFPRd(const MachineInstr &MI,
+                                     unsigned OpIdx) const {
+  unsigned RegD = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  bool isSPVFP = ARM::SPRRegClass.contains(RegD);
+  RegD = II->getRegisterInfo().getEncodingValue(RegD);
+  if (!isSPVFP)
+    Binary |=   RegD               << ARMII::RegRdShift;
+  else {
+    Binary |= ((RegD & 0x1E) >> 1) << ARMII::RegRdShift;
+    Binary |=  (RegD & 0x01)       << ARMII::D_BitShift;
+  }
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::encodeVFPRn(const MachineInstr &MI,
+                                     unsigned OpIdx) const {
+  unsigned RegN = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  bool isSPVFP = ARM::SPRRegClass.contains(RegN);
+  RegN = II->getRegisterInfo().getEncodingValue(RegN);
+  if (!isSPVFP)
+    Binary |=   RegN               << ARMII::RegRnShift;
+  else {
+    Binary |= ((RegN & 0x1E) >> 1) << ARMII::RegRnShift;
+    Binary |=  (RegN & 0x01)       << ARMII::N_BitShift;
+  }
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::encodeVFPRm(const MachineInstr &MI,
+                                     unsigned OpIdx) const {
+  unsigned RegM = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  bool isSPVFP = ARM::SPRRegClass.contains(RegM);
+  RegM = II->getRegisterInfo().getEncodingValue(RegM);
+  if (!isSPVFP)
+    Binary |=   RegM;
+  else {
+    Binary |= ((RegM & 0x1E) >> 1);
+    Binary |=  (RegM & 0x01)       << ARMII::M_BitShift;
+  }
+  return Binary;
+}
+
+void ARMCodeEmitter::emitVFPArithInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+  assert((Binary & ARMII::D_BitShift) == 0 &&
+         (Binary & ARMII::N_BitShift) == 0 &&
+         (Binary & ARMII::M_BitShift) == 0 && "VFP encoding bug!");
+
+  // Encode Dd / Sd.
+  Binary |= encodeVFPRd(MI, OpIdx++);
+
+  // If this is a two-address operand, skip it, e.g. FMACD.
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  // Encode Dn / Sn.
+  if ((MCID.TSFlags & ARMII::FormMask) == ARMII::VFPBinaryFrm)
+    Binary |= encodeVFPRn(MI, OpIdx++);
+
+  if (OpIdx == MCID.getNumOperands() ||
+      MCID.OpInfo[OpIdx].isPredicate() ||
+      MCID.OpInfo[OpIdx].isOptionalDef()) {
+    // FCMPEZD etc. has only one operand.
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Encode Dm / Sm.
+  Binary |= encodeVFPRm(MI, OpIdx);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitVFPConversionInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Form = MCID.TSFlags & ARMII::FormMask;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  switch (Form) {
+  default: break;
+  case ARMII::VFPConv1Frm:
+  case ARMII::VFPConv2Frm:
+  case ARMII::VFPConv3Frm:
+    // Encode Dd / Sd.
+    Binary |= encodeVFPRd(MI, 0);
+    break;
+  case ARMII::VFPConv4Frm:
+    // Encode Dn / Sn.
+    Binary |= encodeVFPRn(MI, 0);
+    break;
+  case ARMII::VFPConv5Frm:
+    // Encode Dm / Sm.
+    Binary |= encodeVFPRm(MI, 0);
+    break;
+  }
+
+  switch (Form) {
+  default: break;
+  case ARMII::VFPConv1Frm:
+    // Encode Dm / Sm.
+    Binary |= encodeVFPRm(MI, 1);
+    break;
+  case ARMII::VFPConv2Frm:
+  case ARMII::VFPConv3Frm:
+    // Encode Dn / Sn.
+    Binary |= encodeVFPRn(MI, 1);
+    break;
+  case ARMII::VFPConv4Frm:
+  case ARMII::VFPConv5Frm:
+    // Encode Dd / Sd.
+    Binary |= encodeVFPRd(MI, 1);
+    break;
+  }
+
+  if (Form == ARMII::VFPConv5Frm)
+    // Encode Dn / Sn.
+    Binary |= encodeVFPRn(MI, 2);
+  else if (Form == ARMII::VFPConv3Frm)
+    // Encode Dm / Sm.
+    Binary |= encodeVFPRm(MI, 2);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitVFPLoadStoreInstruction(const MachineInstr &MI) {
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Encode Dd / Sd.
+  Binary |= encodeVFPRd(MI, OpIdx++);
+
+  // Encode address base.
+  const MachineOperand &Base = MI.getOperand(OpIdx++);
+  Binary |= getMachineOpValue(MI, Base) << ARMII::RegRnShift;
+
+  // If there is a non-zero immediate offset, encode it.
+  if (Base.isReg()) {
+    const MachineOperand &Offset = MI.getOperand(OpIdx);
+    if (unsigned ImmOffs = ARM_AM::getAM5Offset(Offset.getImm())) {
+      if (ARM_AM::getAM5Op(Offset.getImm()) == ARM_AM::add)
+        Binary |= 1 << ARMII::U_BitShift;
+      Binary |= ImmOffs;
+      emitWordLE(Binary);
+      return;
+    }
+  }
+
+  // If immediate offset is omitted, default to +0.
+  Binary |= 1 << ARMII::U_BitShift;
+
+  emitWordLE(Binary);
+}
+
+void
+ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  bool IsUpdating = (MCID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGn.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Skip operand 0 of an instruction with base register update.
+  unsigned OpIdx = 0;
+  if (IsUpdating)
+    ++OpIdx;
+
+  // Set base address operand
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // Set addressing mode by modifying bits U(23) and P(24)
+  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(MI.getOpcode());
+  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(Mode));
+
+  // Set bit W(21)
+  if (IsUpdating)
+    Binary |= 0x1 << ARMII::W_BitShift;
+
+  // First register is encoded in Dd.
+  Binary |= encodeVFPRd(MI, OpIdx+2);
+
+  // Count the number of registers.
+  unsigned NumRegs = 1;
+  for (unsigned i = OpIdx+3, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      break;
+    ++NumRegs;
+  }
+  // Bit 8 will be set if <list> is consecutive 64-bit registers (e.g., D0)
+  // Otherwise, it will be 0, in the case of 32-bit registers.
+  if(Binary & 0x100)
+    Binary |= NumRegs * 2;
+  else
+    Binary |= NumRegs;
+
+  emitWordLE(Binary);
+}
+
+unsigned ARMCodeEmitter::encodeNEONRd(const MachineInstr &MI,
+                                      unsigned OpIdx) const {
+  unsigned RegD = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  RegD = II->getRegisterInfo().getEncodingValue(RegD);
+  Binary |= (RegD & 0xf) << ARMII::RegRdShift;
+  Binary |= ((RegD >> 4) & 1) << ARMII::D_BitShift;
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::encodeNEONRn(const MachineInstr &MI,
+                                      unsigned OpIdx) const {
+  unsigned RegN = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  RegN = II->getRegisterInfo().getEncodingValue(RegN);
+  Binary |= (RegN & 0xf) << ARMII::RegRnShift;
+  Binary |= ((RegN >> 4) & 1) << ARMII::N_BitShift;
+  return Binary;
+}
+
+unsigned ARMCodeEmitter::encodeNEONRm(const MachineInstr &MI,
+                                      unsigned OpIdx) const {
+  unsigned RegM = MI.getOperand(OpIdx).getReg();
+  unsigned Binary = 0;
+  RegM = II->getRegisterInfo().getEncodingValue(RegM);
+  Binary |= (RegM & 0xf);
+  Binary |= ((RegM >> 4) & 1) << ARMII::M_BitShift;
+  return Binary;
+}
+
+/// convertNEONDataProcToThumb - Convert the ARM mode encoding for a NEON
+/// data-processing instruction to the corresponding Thumb encoding.
+static unsigned convertNEONDataProcToThumb(unsigned Binary) {
+  assert((Binary & 0xfe000000) == 0xf2000000 &&
+         "not an ARM NEON data-processing instruction");
+  unsigned UBit = (Binary >> 24) & 1;
+  return 0xef000000 | (UBit << 28) | (Binary & 0xffffff);
+}
+
+void ARMCodeEmitter::emitNEONLaneInstruction(const MachineInstr &MI) {
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  unsigned RegTOpIdx, RegNOpIdx, LnOpIdx;
+  const MCInstrDesc &MCID = MI.getDesc();
+  if ((MCID.TSFlags & ARMII::FormMask) == ARMII::NGetLnFrm) {
+    RegTOpIdx = 0;
+    RegNOpIdx = 1;
+    LnOpIdx = 2;
+  } else { // ARMII::NSetLnFrm
+    RegTOpIdx = 2;
+    RegNOpIdx = 0;
+    LnOpIdx = 3;
+  }
+
+  // Set the conditional execution predicate
+  Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
+
+  unsigned RegT = MI.getOperand(RegTOpIdx).getReg();
+  RegT = II->getRegisterInfo().getEncodingValue(RegT);
+  Binary |= (RegT << ARMII::RegRdShift);
+  Binary |= encodeNEONRn(MI, RegNOpIdx);
+
+  unsigned LaneShift;
+  if ((Binary & (1 << 22)) != 0)
+    LaneShift = 0; // 8-bit elements
+  else if ((Binary & (1 << 5)) != 0)
+    LaneShift = 1; // 16-bit elements
+  else
+    LaneShift = 2; // 32-bit elements
+
+  unsigned Lane = MI.getOperand(LnOpIdx).getImm() << LaneShift;
+  unsigned Opc1 = Lane >> 2;
+  unsigned Opc2 = Lane & 3;
+  assert((Opc1 & 3) == 0 && "out-of-range lane number operand");
+  Binary |= (Opc1 << 21);
+  Binary |= (Opc2 << 5);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitNEONDupInstruction(const MachineInstr &MI) {
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= (IsThumb ? ARMCC::AL : II->getPredicate(&MI)) << ARMII::CondShift;
+
+  unsigned RegT = MI.getOperand(1).getReg();
+  RegT = II->getRegisterInfo().getEncodingValue(RegT);
+  Binary |= (RegT << ARMII::RegRdShift);
+  Binary |= encodeNEONRn(MI, 0);
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitNEON1RegModImmInstruction(const MachineInstr &MI) {
+  unsigned Binary = getBinaryCodeForInstr(MI);
+  // Destination register is encoded in Dd.
+  Binary |= encodeNEONRd(MI, 0);
+  // Immediate fields: Op, Cmode, I, Imm3, Imm4
+  unsigned Imm = MI.getOperand(1).getImm();
+  unsigned Op = (Imm >> 12) & 1;
+  unsigned Cmode = (Imm >> 8) & 0xf;
+  unsigned I = (Imm >> 7) & 1;
+  unsigned Imm3 = (Imm >> 4) & 0x7;
+  unsigned Imm4 = Imm & 0xf;
+  Binary |= (I << 24) | (Imm3 << 16) | (Cmode << 8) | (Op << 5) | Imm4;
+  if (IsThumb)
+    Binary = convertNEONDataProcToThumb(Binary);
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitNEON2RegInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Binary = getBinaryCodeForInstr(MI);
+  // Destination register is encoded in Dd; source register in Dm.
+  unsigned OpIdx = 0;
+  Binary |= encodeNEONRd(MI, OpIdx++);
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+  Binary |= encodeNEONRm(MI, OpIdx);
+  if (IsThumb)
+    Binary = convertNEONDataProcToThumb(Binary);
+  // FIXME: This does not handle VDUPfdf or VDUPfqf.
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitNEON3RegInstruction(const MachineInstr &MI) {
+  const MCInstrDesc &MCID = MI.getDesc();
+  unsigned Binary = getBinaryCodeForInstr(MI);
+  // Destination register is encoded in Dd; source registers in Dn and Dm.
+  unsigned OpIdx = 0;
+  Binary |= encodeNEONRd(MI, OpIdx++);
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+  Binary |= encodeNEONRn(MI, OpIdx++);
+  if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
+    ++OpIdx;
+  Binary |= encodeNEONRm(MI, OpIdx);
+  if (IsThumb)
+    Binary = convertNEONDataProcToThumb(Binary);
+  // FIXME: This does not handle VMOVDneon or VMOVQ.
+  emitWordLE(Binary);
+}
+
+#include "ARMGenCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
new file mode 100644
index 000000000000..4891609b336f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -0,0 +1,2061 @@
+//===-- ARMConstantIslandPass.cpp - ARM constant islands ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that splits the constant pool up into 'islands'
+// which are scattered through-out the function.  This is required due to the
+// limited pc-relative displacements that ARM has.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-cp-islands"
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumCPEs,       "Number of constpool entries");
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+STATISTIC(NumUBrFixed,   "Number of uncond branches fixed");
+STATISTIC(NumTBs,        "Number of table branches generated");
+STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk");
+STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
+STATISTIC(NumCBZ,        "Number of CBZ / CBNZ formed");
+STATISTIC(NumJTMoved,    "Number of jump table destination blocks moved");
+STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
+
+
+static cl::opt<bool>
+AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
+          cl::desc("Adjust basic block layout to better use TB[BH]"));
+
+// FIXME: This option should be removed once it has received sufficient testing.
+static cl::opt<bool>
+AlignConstantIslands("arm-align-constant-islands", cl::Hidden, cl::init(true),
+          cl::desc("Align constant islands in code"));
+
+/// UnknownPadding - Return the worst case padding that could result from
+/// unknown offset bits.  This does not include alignment padding caused by
+/// known offset bits.
+///
+/// @param LogAlign log2(alignment)
+/// @param KnownBits Number of known low offset bits.
+static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
+  if (KnownBits < LogAlign)
+    return (1u << LogAlign) - (1u << KnownBits);
+  return 0;
+}
+
+namespace {
+  /// ARMConstantIslands - Due to limited PC-relative displacements, ARM
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+  class ARMConstantIslands : public MachineFunctionPass {
+    /// BasicBlockInfo - Information about the offset and size of a single
+    /// basic block.
+    struct BasicBlockInfo {
+      /// Offset - Distance from the beginning of the function to the beginning
+      /// of this basic block.
+      ///
+      /// Offsets are computed assuming worst case padding before an aligned
+      /// block. This means that subtracting basic block offsets always gives a
+      /// conservative estimate of the real distance which may be smaller.
+      ///
+      /// Because worst case padding is used, the computed offset of an aligned
+      /// block may not actually be aligned.
+      unsigned Offset;
+
+      /// Size - Size of the basic block in bytes.  If the block contains
+      /// inline assembly, this is a worst case estimate.
+      ///
+      /// The size does not include any alignment padding whether from the
+      /// beginning of the block, or from an aligned jump table at the end.
+      unsigned Size;
+
+      /// KnownBits - The number of low bits in Offset that are known to be
+      /// exact.  The remaining bits of Offset are an upper bound.
+      uint8_t KnownBits;
+
+      /// Unalign - When non-zero, the block contains instructions (inline asm)
+      /// of unknown size.  The real size may be smaller than Size bytes by a
+      /// multiple of 1 << Unalign.
+      uint8_t Unalign;
+
+      /// PostAlign - When non-zero, the block terminator contains a .align
+      /// directive, so the end of the block is aligned to 1 << PostAlign
+      /// bytes.
+      uint8_t PostAlign;
+
+      BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0),
+        PostAlign(0) {}
+
+      /// Compute the number of known offset bits internally to this block.
+      /// This number should be used to predict worst case padding when
+      /// splitting the block.
+      unsigned internalKnownBits() const {
+        unsigned Bits = Unalign ? Unalign : KnownBits;
+        // If the block size isn't a multiple of the known bits, assume the
+        // worst case padding.
+        if (Size & ((1u << Bits) - 1))
+          Bits = CountTrailingZeros_32(Size);
+        return Bits;
+      }
+
+      /// Compute the offset immediately following this block.  If LogAlign is
+      /// specified, return the offset the successor block will get if it has
+      /// this alignment.
+      unsigned postOffset(unsigned LogAlign = 0) const {
+        unsigned PO = Offset + Size;
+        unsigned LA = std::max(unsigned(PostAlign), LogAlign);
+        if (!LA)
+          return PO;
+        // Add alignment padding from the terminator.
+        return PO + UnknownPadding(LA, internalKnownBits());
+      }
+
+      /// Compute the number of known low bits of postOffset.  If this block
+      /// contains inline asm, the number of known bits drops to the
+      /// instruction alignment.  An aligned terminator may increase the number
+      /// of know bits.
+      /// If LogAlign is given, also consider the alignment of the next block.
+      unsigned postKnownBits(unsigned LogAlign = 0) const {
+        return std::max(std::max(unsigned(PostAlign), LogAlign),
+                        internalKnownBits());
+      }
+    };
+
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+    typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.  The HighWaterMark records the
+    /// highest basic block where a new CPEntry can be placed.  To ensure this
+    /// pass terminates, the CP entries are initially placed at the end of the
+    /// function and then move monotonically to lower addresses.  The
+    /// exception to this rule is when the current CP entry for a particular
+    /// CPUser is out of range, but there is another CP entry for the same
+    /// constant value in range.  We want to use the existing in-range CP
+    /// entry, but if it later moves out of range, the search for new water
+    /// should resume where it left off.  The HighWaterMark is used to record
+    /// that point.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      MachineBasicBlock *HighWaterMark;
+    private:
+      unsigned MaxDisp;
+    public:
+      bool NegOk;
+      bool IsSoImm;
+      bool KnownAlignment;
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+             bool neg, bool soimm)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm),
+          KnownAlignment(false) {
+        HighWaterMark = CPEMI->getParent();
+      }
+      /// getMaxDisp - Returns the maximum displacement supported by MI.
+      /// Correct for unknown alignment.
+      /// Conservatively subtract 2 bytes to handle weird alignment effects.
+      unsigned getMaxDisp() const {
+        return (KnownAlignment ? MaxDisp : MaxDisp - 2) - 2;
+      }
+    };
+
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+
+    /// CPEntry - One per constant pool entry, keeping the machine instruction
+    /// pointer, the constpool index, and the number of CPUser's which
+    /// reference this entry.
+    struct CPEntry {
+      MachineInstr *CPEMI;
+      unsigned CPI;
+      unsigned RefCount;
+      CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+        : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+    };
+
+    /// CPEntries - Keep track of all of the constant pool entry machine
+    /// instructions. For each original constpool index (i.e. those that
+    /// existed upon entry to this pass), it keeps a vector of entries.
+    /// Original elements are cloned as we go along; the clones are
+    /// put in the vector of the original element, but have distinct CPIs.
+    std::vector<std::vector<CPEntry> > CPEntries;
+
+    /// ImmBranch - One per immediate branch, keeping the machine instruction
+    /// pointer, conditional or unconditional, the max displacement,
+    /// and (if isCond is true) the corresponding unconditional branch
+    /// opcode.
+    struct ImmBranch {
+      MachineInstr *MI;
+      unsigned MaxDisp : 31;
+      bool isCond : 1;
+      int UncondBr;
+      ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+        : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+    };
+
+    /// ImmBranches - Keep track of all the immediate branch instructions.
+    ///
+    std::vector<ImmBranch> ImmBranches;
+
+    /// PushPopMIs - Keep track of all the Thumb push / pop instructions.
+    ///
+    SmallVector<MachineInstr*, 4> PushPopMIs;
+
+    /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions.
+    SmallVector<MachineInstr*, 4> T2JumpTables;
+
+    /// HasFarJump - True if any far jump instruction has been emitted during
+    /// the branch fix up pass.
+    bool HasFarJump;
+
+    MachineFunction *MF;
+    MachineConstantPool *MCP;
+    const ARMBaseInstrInfo *TII;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+    bool isThumb;
+    bool isThumb1;
+    bool isThumb2;
+  public:
+    static char ID;
+    ARMConstantIslands() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "ARM constant island placement and branch shortening pass";
+    }
+
+  private:
+    void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    unsigned getCPELogAlign(const MachineInstr *CPEMI);
+    void scanFunctionJumpTables();
+    void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+    void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+    bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+    int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    bool findAvailableWater(CPUser&U, unsigned UserOffset,
+                            water_iterator &WaterIter);
+    void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                        MachineBasicBlock *&NewMBB);
+    bool handleConstantPoolUser(unsigned CPUserIndex);
+    void removeDeadCPEMI(MachineInstr *CPEMI);
+    bool removeUnusedCPEntries();
+    bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                          MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+                          bool DoDump = false);
+    bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U, unsigned &Growth);
+    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool fixupImmediateBr(ImmBranch &Br);
+    bool fixupConditionalBr(ImmBranch &Br);
+    bool fixupUnconditionalBr(ImmBranch &Br);
+    bool undoLRSpillRestore();
+    bool mayOptimizeThumb2Instruction(const MachineInstr *MI) const;
+    bool optimizeThumb2Instructions();
+    bool optimizeThumb2Branches();
+    bool reorderThumb2JumpTables();
+    bool optimizeThumb2JumpTables();
+    MachineBasicBlock *adjustJTTargetBlockForward(MachineBasicBlock *BB,
+                                                  MachineBasicBlock *JTBB);
+
+    void computeBlockSize(MachineBasicBlock *MBB);
+    unsigned getOffsetOf(MachineInstr *MI) const;
+    unsigned getUserOffset(CPUser&) const;
+    void dumpBBs();
+    void verify();
+
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         unsigned Disp, bool NegativeOK, bool IsSoImm = false);
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         const CPUser &U) {
+      return isOffsetInRange(UserOffset, TrialOffset,
+                             U.getMaxDisp(), U.NegOk, U.IsSoImm);
+    }
+  };
+  char ARMConstantIslands::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void ARMConstantIslands::verify() {
+#ifndef NDEBUG
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock *MBB = MBBI;
+    unsigned MBBId = MBB->getNumber();
+    assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset);
+  }
+  DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n");
+  for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+    CPUser &U = CPUsers[i];
+    unsigned UserOffset = getUserOffset(U);
+    // Verify offset using the real max displacement without the safety
+    // adjustment.
+    if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, U.getMaxDisp()+2, U.NegOk,
+                         /* DoDump = */ true)) {
+      DEBUG(dbgs() << "OK\n");
+      continue;
+    }
+    DEBUG(dbgs() << "Out of range.\n");
+    dumpBBs();
+    DEBUG(MF->dump());
+    llvm_unreachable("Constant pool entry out of range!");
+  }
+#endif
+}
+
+/// print block size and offset information - debugging
+void ARMConstantIslands::dumpBBs() {
+  DEBUG({
+    for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
+      const BasicBlockInfo &BBI = BBInfo[J];
+      dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+             << " kb=" << unsigned(BBI.KnownBits)
+             << " ua=" << unsigned(BBI.Unalign)
+             << " pa=" << unsigned(BBI.PostAlign)
+             << format(" size=%#x\n", BBInfo[J].Size);
+    }
+  });
+}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+  return new ARMConstantIslands();
+}
+
+bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
+  MF = &mf;
+  MCP = mf.getConstantPool();
+
+  DEBUG(dbgs() << "***** ARMConstantIslands: "
+               << MCP->getConstants().size() << " CP entries, aligned to "
+               << MCP->getConstantPoolAlignment() << " bytes *****\n");
+
+  TII = (const ARMBaseInstrInfo*)MF->getTarget().getInstrInfo();
+  AFI = MF->getInfo<ARMFunctionInfo>();
+  STI = &MF->getTarget().getSubtarget<ARMSubtarget>();
+
+  isThumb = AFI->isThumbFunction();
+  isThumb1 = AFI->isThumb1OnlyFunction();
+  isThumb2 = AFI->isThumb2Function();
+
+  HasFarJump = false;
+
+  // This pass invalidates liveness information when it splits basic blocks.
+  MF->getRegInfo().invalidateLiveness();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  // Try to reorder and otherwise adjust the block layout to make good use
+  // of the TB[BH] instructions.
+  bool MadeChange = false;
+  if (isThumb2 && AdjustJumpTableBlocks) {
+    scanFunctionJumpTables();
+    MadeChange |= reorderThumb2JumpTables();
+    // Data is out of date, so clear it. It'll be re-computed later.
+    T2JumpTables.clear();
+    // Blocks may have shifted around. Keep the numbering up to date.
+    MF->RenumberBlocks();
+  }
+
+  // Thumb1 functions containing constant pools get 4-byte alignment.
+  // This is so we can keep exact track of where the alignment padding goes.
+
+  // ARM and Thumb2 functions need to be 4-byte aligned.
+  if (!isThumb1)
+    MF->ensureAlignment(2);  // 2 = log2(4)
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP->isEmpty())
+    doInitialPlacement(CPEMIs);
+
+  /// The next UID to take is the first unused one.
+  AFI->initPICLabelUId(CPEMIs.size());
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  initializeFunctionInfo(CPEMIs);
+  CPEMIs.clear();
+  DEBUG(dumpBBs());
+
+
+  /// Remove dead constant pool entries.
+  MadeChange |= removeUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  unsigned NoCPIters = 0, NoBRIters = 0;
+  while (true) {
+    DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+    bool CPChange = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      CPChange |= handleConstantPoolUser(i);
+    if (CPChange && ++NoCPIters > 30)
+      report_fatal_error("Constant Island pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
+
+    DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+    bool BRChange = false;
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= fixupImmediateBr(ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      report_fatal_error("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    if (!CPChange && !BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  // Shrink 32-bit Thumb2 branch, load, and store instructions.
+  if (isThumb2 && !STI->prefers32BitThumb())
+    MadeChange |= optimizeThumb2Instructions();
+
+  // After a while, this might be made debug-only, but it is not expensive.
+  verify();
+
+  // If LR has been forced spilled and no far jump (i.e. BL) has been issued,
+  // undo the spill / restore of LR if possible.
+  if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
+    MadeChange |= undoLRSpillRestore();
+
+  // Save the mapping between original and cloned constpool entries.
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+    for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) {
+      const CPEntry & CPE = CPEntries[i][j];
+      AFI->recordCPEClone(i, CPE.CPI);
+    }
+  }
+
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BBInfo.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  ImmBranches.clear();
+  PushPopMIs.clear();
+  T2JumpTables.clear();
+
+  return MadeChange;
+}
+
+/// doInitialPlacement - Perform the initial placement of the constant pool
+/// entries.  To start with, we put them all at the end of the function.
+void
+ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+  MF->push_back(BB);
+
+  // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+  unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+  // Mark the basic block as required by the const-pool.
+  // If AlignConstantIslands isn't set, use 4-byte alignment for everything.
+  BB->setAlignment(AlignConstantIslands ? MaxAlign : 2);
+
+  // The function needs to be as aligned as the basic blocks. The linker may
+  // move functions around based on their alignment.
+  MF->ensureAlignment(BB->getAlignment());
+
+  // Order the entries in BB by descending alignment.  That ensures correct
+  // alignment of all entries as long as BB is sufficiently aligned.  Keep
+  // track of the insertion point for each alignment.  We are going to bucket
+  // sort the entries as they are created.
+  SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+  const DataLayout &TD = *MF->getTarget().getDataLayout();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+    assert(Size >= 4 && "Too small constant pool entry");
+    unsigned Align = CPs[i].getAlignment();
+    assert(isPowerOf2_32(Align) && "Invalid alignment");
+    // Verify that all constant pool entries are a multiple of their alignment.
+    // If not, we would have to pad them out so that instructions stay aligned.
+    assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+
+    // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+    unsigned LogAlign = Log2_32(Align);
+    MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+    MachineInstr *CPEMI =
+      BuildMI(*BB, InsAt, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
+        .addImm(i).addConstantPoolIndex(i).addImm(Size);
+    CPEMIs.push_back(CPEMI);
+
+    // Ensure that future entries with higher alignment get inserted before
+    // CPEMI. This is bucket sort with iterators.
+    for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+      if (InsPoint[a] == InsAt)
+        InsPoint[a] = CPEMI;
+
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    std::vector<CPEntry> CPEs;
+    CPEs.push_back(CPEntry(CPEMI, i));
+    CPEntries.push_back(CPEs);
+    ++NumCPEs;
+    DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+                 << Size << ", align = " << Align <<'\n');
+  }
+  DEBUG(BB->dump());
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  // Can't fall off end of function.
+  if (llvm::next(MBBI) == MBB->getParent()->end())
+    return false;
+
+  MachineBasicBlock *NextBB = llvm::next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+
+  return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+ARMConstantIslands::CPEntry
+*ARMConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return NULL;
+}
+
+/// getCPELogAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.  Alignment is measured in log2(bytes) units.
+unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+  assert(CPEMI && CPEMI->getOpcode() == ARM::CONSTPOOL_ENTRY);
+
+  // Everything is 4-byte aligned unless AlignConstantIslands is set.
+  if (!AlignConstantIslands)
+    return 2;
+
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
+  unsigned Align = MCP->getConstants()[CPI].getAlignment();
+  assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
+  return Log2_32(Align);
+}
+
+/// scanFunctionJumpTables - Do a scan of the function, building up
+/// information about the sizes of each block and the locations of all
+/// the jump tables.
+void ARMConstantIslands::scanFunctionJumpTables() {
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I)
+      if (I->isBranch() && I->getOpcode() == ARM::t2BR_JT)
+        T2JumpTables.push_back(I);
+  }
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void ARMConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+  BBInfo.clear();
+  BBInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+    computeBlockSize(I);
+
+  // The known bits of the entry block offset are determined by the function
+  // alignment.
+  BBInfo.front().KnownBits = MF->getAlignment();
+
+  // Compute block offsets and known bits.
+  adjustBBOffsetsAfter(MF->begin());
+
+  // Now go back through the instructions and build up our data structures.
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' that a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      if (I->isDebugValue())
+        continue;
+
+      int Opc = I->getOpcode();
+      if (I->isBranch()) {
+        bool isCond = false;
+        unsigned Bits = 0;
+        unsigned Scale = 1;
+        int UOpc = Opc;
+        switch (Opc) {
+        default:
+          continue;  // Ignore other JT branches
+        case ARM::t2BR_JT:
+          T2JumpTables.push_back(I);
+          continue;   // Does not get an entry in ImmBranches
+        case ARM::Bcc:
+          isCond = true;
+          UOpc = ARM::B;
+          // Fallthrough
+        case ARM::B:
+          Bits = 24;
+          Scale = 4;
+          break;
+        case ARM::tBcc:
+          isCond = true;
+          UOpc = ARM::tB;
+          Bits = 8;
+          Scale = 2;
+          break;
+        case ARM::tB:
+          Bits = 11;
+          Scale = 2;
+          break;
+        case ARM::t2Bcc:
+          isCond = true;
+          UOpc = ARM::t2B;
+          Bits = 20;
+          Scale = 2;
+          break;
+        case ARM::t2B:
+          Bits = 24;
+          Scale = 2;
+          break;
+        }
+
+        // Record this immediate branch.
+        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+        ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
+        PushPopMIs.push_back(I);
+
+      if (Opc == ARM::CONSTPOOL_ENTRY)
+        continue;
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+        if (I->getOperand(op).isCPI()) {
+          // We found one.  The addressing mode tells us the max displacement
+          // from the PC that this instruction permits.
+
+          // Basic size info comes from the TSFlags field.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          bool NegOk = false;
+          bool IsSoImm = false;
+
+          switch (Opc) {
+          default:
+            llvm_unreachable("Unknown addressing mode for CP reference!");
+
+          // Taking the address of a CP entry.
+          case ARM::LEApcrel:
+            // This takes a SoImm, which is 8 bit immediate rotated. We'll
+            // pretend the maximum offset is 255 * 4. Since each instruction
+            // 4 byte wide, this is always correct. We'll check for other
+            // displacements that fits in a SoImm as well.
+            Bits = 8;
+            Scale = 4;
+            NegOk = true;
+            IsSoImm = true;
+            break;
+          case ARM::t2LEApcrel:
+            Bits = 12;
+            NegOk = true;
+            break;
+          case ARM::tLEApcrel:
+            Bits = 8;
+            Scale = 4;
+            break;
+
+          case ARM::LDRi12:
+          case ARM::LDRcp:
+          case ARM::t2LDRpci:
+            Bits = 12;  // +-offset_12
+            NegOk = true;
+            break;
+
+          case ARM::tLDRpci:
+            Bits = 8;
+            Scale = 4;  // +(offset_8*4)
+            break;
+
+          case ARM::VLDRD:
+          case ARM::VLDRS:
+            Bits = 8;
+            Scale = 4;  // +-(offset_8*4)
+            NegOk = true;
+            break;
+          }
+
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = I->getOperand(op).getIndex();
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+        }
+    }
+  }
+}
+
+/// computeBlockSize - Compute the size and some alignment information for MBB.
+/// This function updates BBInfo directly.
+void ARMConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+  BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+  BBI.Size = 0;
+  BBI.Unalign = 0;
+  BBI.PostAlign = 0;
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+       ++I) {
+    BBI.Size += TII->GetInstSizeInBytes(I);
+    // For inline asm, GetInstSizeInBytes returns a conservative estimate.
+    // The actual size may be smaller, but still a multiple of the instr size.
+    if (I->isInlineAsm())
+      BBI.Unalign = isThumb ? 1 : 2;
+    // Also consider instructions that may be shrunk later.
+    else if (isThumb && mayOptimizeThumb2Instruction(I))
+      BBI.Unalign = 1;
+  }
+
+  // tBR_JTr contains a .align 2 directive.
+  if (!MBB->empty() && MBB->back().getOpcode() == ARM::tBR_JTr) {
+    BBI.PostAlign = 2;
+    MBB->getParent()->ensureAlignment(2);
+  }
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned ARMConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->GetInstSizeInBytes(I);
+  }
+  return Offset;
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add NewMBB as having
+  // available water after it.
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B;
+  if (!isThumb)
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
+  else
+    BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB)
+            .addImm(ARMCC::AL).addReg(0);
+  ++NumSplit;
+
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as updateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add OrigMBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  MachineBasicBlock* WaterBB = *IP;
+  if (WaterBB == OrigBB)
+    WaterList.insert(llvm::next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(OrigBB);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+/// getUserOffset - Compute the offset of U.MI as seen by the hardware
+/// displacement computation.  Update U.KnownAlignment to match its current
+/// basic block location.
+unsigned ARMConstantIslands::getUserOffset(CPUser &U) const {
+  unsigned UserOffset = getOffsetOf(U.MI);
+  const BasicBlockInfo &BBI = BBInfo[U.MI->getParent()->getNumber()];
+  unsigned KnownBits = BBI.internalKnownBits();
+
+  // The value read from PC is offset from the actual instruction address.
+  UserOffset += (isThumb ? 4 : 8);
+
+  // Because of inline assembly, we may not know the alignment (mod 4) of U.MI.
+  // Make sure U.getMaxDisp() returns a constrained range.
+  U.KnownAlignment = (KnownBits >= 2);
+
+  // On Thumb, offsets==2 mod 4 are rounded down by the hardware for
+  // purposes of the displacement computation; compensate for that here.
+  // For unknown alignments, getMaxDisp() constrains the range instead.
+  if (isThumb && U.KnownAlignment)
+    UserOffset &= ~3u;
+
+  return UserOffset;
+}
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+/// UserOffset is computed by getUserOffset above to include PC adjustments. If
+/// the mod 4 alignment of UserOffset is not known, the uncertainty must be
+/// subtracted from MaxDisp instead. CPUser::getMaxDisp() does that.
+bool ARMConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                         unsigned TrialOffset, unsigned MaxDisp,
+                                         bool NegativeOK, bool IsSoImm) {
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset - UserOffset <= MaxDisp)
+      return true;
+    // FIXME: Make use full range of soimm values.
+  } else if (NegativeOK) {
+    if (UserOffset - TrialOffset <= MaxDisp)
+      return true;
+    // FIXME: Make use full range of soimm values.
+  }
+  return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool ARMConstantIslands::isWaterInRange(unsigned UserOffset,
+                                        MachineBasicBlock* Water, CPUser &U,
+                                        unsigned &Growth) {
+  unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+  unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+  unsigned NextBlockOffset, NextBlockAlignment;
+  MachineFunction::const_iterator NextBlock = Water;
+  if (++NextBlock == MF->end()) {
+    NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+    NextBlockAlignment = 0;
+  } else {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block. It may also cause more padding to be required if it is more aligned
+  // that the next block.
+  if (CPEEnd > NextBlockOffset) {
+    Growth = CPEEnd - NextBlockOffset;
+    // Compute the padding that would go at the end of the CPE to align the next
+    // block.
+    Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment);
+
+    // If the CPE is to be inserted before the instruction, that will raise
+    // the offset of the instruction. Also account for unknown alignment padding
+    // in blocks between CPE and the user.
+    if (CPEOffset < UserOffset)
+      UserOffset += Growth + UnknownPadding(MF->getAlignment(), CPELogAlign);
+  } else
+    // CPE fits in existing padding.
+    Growth = 0;
+
+  return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool ARMConstantIslands::isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                                      MachineInstr *CPEMI, unsigned MaxDisp,
+                                      bool NegOk, bool DoDump) {
+  unsigned CPEOffset  = getOffsetOf(CPEMI);
+
+  if (DoDump) {
+    DEBUG({
+      unsigned Block = MI->getParent()->getNumber();
+      const BasicBlockInfo &BBI = BBInfo[Block];
+      dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+             << " max delta=" << MaxDisp
+             << format(" insn address=%#x", UserOffset)
+             << " in BB#" << Block << ": "
+             << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+             << format("CPE address=%#x offset=%+d: ", CPEOffset,
+                       int(CPEOffset-UserOffset));
+    });
+  }
+
+  return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true of the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB
+      || PredMI->getOpcode() == ARM::t2B)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+#endif // NDEBUG
+
+void ARMConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  unsigned BBNum = BB->getNumber();
+  for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment();
+    unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
+    unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
+
+    // This is where block i begins.  Stop if the offset is already correct,
+    // and we have updated 2 blocks.  This is the maximum number of blocks
+    // changed before calling this function.
+    if (i > BBNum + 2 &&
+        BBInfo[i].Offset == Offset &&
+        BBInfo[i].KnownBits == KnownBits)
+      break;
+
+    BBInfo[i].Offset = Offset;
+    BBInfo[i].KnownBits = KnownBits;
+  }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed
+/// the entry, false if we didn't.
+
+bool ARMConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+                                                    MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    removeDeadCPEMI(CPEMI);
+    CPE->CPEMI = NULL;
+    --NumCPEs;
+    return true;
+  }
+  return false;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    return 1;
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == NULL)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
+                     U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  switch (Opc) {
+  case ARM::tB:
+    return ((1<<10)-1)*2;
+  case ARM::t2B:
+    return ((1<<23)-1)*2;
+  default:
+    break;
+  }
+
+  return ((1<<23)-1)*4;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not.  If it returns true, WaterIter
+/// is set to the WaterList entry.  For Thumb, prefer water that will not
+/// introduce padding to water that will.  To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
+bool ARMConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+                                      water_iterator &WaterIter) {
+  if (WaterList.empty())
+    return false;
+
+  unsigned BestGrowth = ~0u;
+  for (water_iterator IP = prior(WaterList.end()), B = WaterList.begin();;
+       --IP) {
+    MachineBasicBlock* WaterBB = *IP;
+    // Check if water is in range and is either at a lower address than the
+    // current "high water mark" or a new water block that was created since
+    // the previous iteration by inserting an unconditional branch.  In the
+    // latter case, we want to allow resetting the high water mark back to
+    // this new water since we haven't seen it before.  Inserting branches
+    // should be relatively uncommon and when it does happen, we want to be
+    // sure to take advantage of it for all the CPEs near that block, so that
+    // we don't insert more branches than necessary.
+    unsigned Growth;
+    if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+         NewWaterList.count(WaterBB)) && Growth < BestGrowth) {
+      // This is the least amount of required padding seen so far.
+      BestGrowth = Growth;
+      WaterIter = IP;
+      DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+                   << " Growth=" << Growth << '\n');
+
+      // Keep looking unless it is perfect.
+      if (BestGrowth == 0)
+        return true;
+    }
+    if (IP == B)
+      break;
+  }
+  return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
+                                        unsigned UserOffset,
+                                        MachineBasicBlock *&NewMBB) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPELogAlign = getCPELogAlign(CPEMI);
+  MachineBasicBlock *UserMBB = UserMI->getParent();
+  const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+  // If the block does not end in an unconditional branch already, and if the
+  // end of the block is within range, make new water there.  (The addition
+  // below is for the unconditional branch we will be adding: 4 bytes on ARM +
+  // Thumb2, 2 on Thumb1.
+  if (BBHasFallthrough(UserMBB)) {
+    // Size of branch to insert.
+    unsigned Delta = isThumb1 ? 2 : 4;
+    // Compute the offset where the CPE will begin.
+    unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+    if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+      DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+            << format(", expected CPE offset %#x\n", CPEOffset));
+      NewMBB = llvm::next(MachineFunction::iterator(UserMBB));
+      // Add an unconditional branch from UserMBB to fallthrough block.  Record
+      // it for branch lengthening; this new branch will not get out of range,
+      // but if the preceding conditional branch is out of range, the targets
+      // will be exchanged, and the altered branch may be out of range, so the
+      // machinery has to know about it.
+      int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B;
+      if (!isThumb)
+        BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+      else
+        BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB)
+          .addImm(ARMCC::AL).addReg(0);
+      unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+      ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+                                      MaxDisp, false, UncondBr));
+      BBInfo[UserMBB->getNumber()].Size += Delta;
+      adjustBBOffsetsAfter(UserMBB);
+      return;
+    }
+  }
+
+  // What a big block.  Find a place within the block to split it.  This is a
+  // little tricky on Thumb1 since instructions are 2 bytes and constant pool
+  // entries are 4 bytes: if instruction I references island CPE, and
+  // instruction I+1 references CPE', it will not work well to put CPE as far
+  // forward as possible, since then CPE' cannot immediately follow it (that
+  // location is 2 bytes farther away from I+1 than CPE was from I) and we'd
+  // need to create a new island.  So, we make a first guess, then walk through
+  // the instructions between the one currently being looked at and the
+  // possible insertion point, and make sure any other instructions that
+  // reference CPEs will be able to use the same island area; if not, we back
+  // up the insertion point.
+
+  // Try to split the block so it's fully aligned.  Compute the latest split
+  // point where we can add a 4-byte branch instruction, and then align to
+  // LogAlign which is the largest possible alignment in the function.
+  unsigned LogAlign = MF->getAlignment();
+  assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+  unsigned KnownBits = UserBBI.internalKnownBits();
+  unsigned UPad = UnknownPadding(LogAlign, KnownBits);
+  unsigned BaseInsertOffset = UserOffset + U.getMaxDisp() - UPad;
+  DEBUG(dbgs() << format("Split in middle of big block before %#x",
+                         BaseInsertOffset));
+
+  // The 4 in the following is for the unconditional branch we'll be inserting
+  // (allows for long branch on Thumb1).  Alignment of the island is handled
+  // inside isOffsetInRange.
+  BaseInsertOffset -= 4;
+
+  DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+               << " la=" << LogAlign
+               << " kb=" << KnownBits
+               << " up=" << UPad << '\n');
+
+  // This could point off the end of the block if we've already got constant
+  // pool entries following this block; only the last one is in the water list.
+  // Back past any possible branches (allow for a conditional and a maximally
+  // long unconditional).
+  if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+    BaseInsertOffset = UserBBI.postOffset() - UPad - 8;
+    DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+  }
+  unsigned EndInsertOffset = BaseInsertOffset + 4 + UPad +
+    CPEMI->getOperand(2).getImm();
+  MachineBasicBlock::iterator MI = UserMI;
+  ++MI;
+  unsigned CPUIndex = CPUserIndex+1;
+  unsigned NumCPUsers = CPUsers.size();
+  MachineInstr *LastIT = 0;
+  for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+       Offset < BaseInsertOffset;
+       Offset += TII->GetInstSizeInBytes(MI),
+       MI = llvm::next(MI)) {
+    assert(MI != UserMBB->end() && "Fell off end of block");
+    if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+      CPUser &U = CPUsers[CPUIndex];
+      if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift intertion point by one unit of alignment so it is within reach.
+        BaseInsertOffset -= 1u << LogAlign;
+        EndInsertOffset  -= 1u << LogAlign;
+      }
+      // This is overly conservative, as we don't account for CPEMIs being
+      // reused within the block, but it doesn't matter much.  Also assume CPEs
+      // are added in order with alignment padding.  We may eventually be able
+      // to pack the aligned CPEs better.
+      EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+      CPUIndex++;
+    }
+
+    // Remember the last IT instruction.
+    if (MI->getOpcode() == ARM::t2IT)
+      LastIT = MI;
+  }
+
+  --MI;
+
+  // Avoid splitting an IT block.
+  if (LastIT) {
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+    if (CC != ARMCC::AL)
+      MI = LastIT;
+  }
+  NewMBB = splitBlockBeforeInstr(MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range.  If so, pick up the constant pool value and move it some
+/// place in-range.  Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.
+  unsigned UserOffset = getUserOffset(U);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  int result = findInRangeCPEntry(U, UserOffset);
+  if (result==1) return false;
+  else if (result==2) return true;
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone.  Get a UID for it.
+  unsigned ID = AFI->createPICLabelUId();
+
+  // Look for water where we can place this CPE.
+  MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (findAvailableWater(U, UserOffset, IP)) {
+    DEBUG(dbgs() << "Found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.erase(WaterBB))
+      NewWaterList.insert(NewIsland);
+
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
+
+  } else {
+    // No water found.
+    DEBUG(dbgs() << "No water found\n");
+    createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // splitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB));
+    IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
+  }
+
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
+  // Okay, we know we can put an island before NewMBB now, do it!
+  MF->insert(NewMBB, NewIsland);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  updateForInsertedWaterBlock(NewIsland);
+
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  decrementCPEReferenceCount(CPI, CPEMI);
+
+  // Now that we have an island to add the CPE to, clone the original CPE and
+  // add it to the island.
+  U.HighWaterMark = NewIsland;
+  U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
+                .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+  CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+  ++NumCPEs;
+
+  // Mark the basic block as aligned as required by the const-pool entry.
+  NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+
+  // Increase the size of the island block to account for the new entry.
+  BBInfo[NewIsland->getNumber()].Size += Size;
+  adjustBBOffsetsAfter(llvm::prior(MachineFunction::iterator(NewIsland)));
+
+  // Finally, change the CPI in the instruction operand to be ID.
+  for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+    if (UserMI->getOperand(i).isCPI()) {
+      UserMI->getOperand(i).setIndex(ID);
+      break;
+    }
+
+  DEBUG(dbgs() << "  Moved CPE to #" << ID << " CPI=" << CPI
+        << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+  return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *CPEBB = CPEMI->getParent();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+  BBInfo[CPEBB->getNumber()].Size -= Size;
+  // All succeeding offsets have the current size value added in, fix this.
+  if (CPEBB->empty()) {
+    BBInfo[CPEBB->getNumber()].Size = 0;
+
+    // This block no longer needs to be aligned.
+    CPEBB->setAlignment(0);
+  } else
+    // Entries are sorted by descending alignment, so realign from the front.
+    CPEBB->setAlignment(getCPELogAlign(CPEBB->begin()));
+
+  adjustBBOffsetsAfter(CPEBB);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool ARMConstantIslands::removeUnusedCPEntries() {
+  unsigned MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+      std::vector<CPEntry> &CPEs = CPEntries[i];
+      for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+        if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+          removeDeadCPEMI(CPEs[j].CPEMI);
+          CPEs[j].CPEMI = NULL;
+          MadeChange = true;
+        }
+      }
+  }
+  return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool ARMConstantIslands::isBBInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
+                                     unsigned MaxDisp) {
+  unsigned PCAdj      = isThumb ? 4 : 8;
+  unsigned BrOffset   = getOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  if (BrOffset <= DestOffset) {
+    // Branch before the Dest.
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool ARMConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  if (!Br.isCond)
+    return fixupUnconditionalBr(Br);
+  return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to a branch.
+bool
+ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  if (!isThumb1)
+    llvm_unreachable("fixupUnconditionalBr is Thumb1 only!");
+
+  // Use BL to implement far jump.
+  Br.MaxDisp = (1 << 21) * 2;
+  MI->setDesc(TII->get(ARM::tBfar));
+  BBInfo[MBB->getNumber()].Size += 2;
+  adjustBBOffsetsAfter(MBB);
+  HasFarJump = true;
+  ++NumUBrFixed;
+
+  DEBUG(dbgs() << "  Changed B to long jump " << *MI);
+
+  return true;
+}
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // blt L1
+  // =>
+  // bge L2
+  // b   L1
+  // L2:
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm();
+  CC = ARMCC::getOppositeCondition(CC);
+  unsigned CCReg = MI->getOperand(2).getReg();
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  ++NumCBrFixed;
+  if (BMI != MI) {
+    if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+        BMI->getOpcode() == Br.UncondBr) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beq L1
+      // b   L2
+      // =>
+      // bne L2
+      // b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+      if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+        DEBUG(dbgs() << "  Invert Bcc condition and swap its destination with "
+                     << *BMI);
+        BMI->getOperand(0).setMBB(DestBB);
+        MI->getOperand(0).setMBB(NewDest);
+        MI->getOperand(1).setImm(CC);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    splitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->GetInstSizeInBytes(&MBB->back());
+    BBInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+  }
+  MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+
+  DEBUG(dbgs() << "  Insert B to BB#" << DestBB->getNumber()
+               << " also invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  // Also update the ImmBranch as well as adding a new entry for the new branch.
+  BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
+    .addMBB(NextBB).addImm(CC).addReg(CCReg);
+  Br.MI = &MBB->back();
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  if (isThumb)
+    BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
+            .addImm(ARMCC::AL).addReg(0);
+  else
+    BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+  ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+  MI->eraseFromParent();
+  adjustBBOffsetsAfter(MBB);
+  return true;
+}
+
+/// undoLRSpillRestore - Remove Thumb push / pop instructions that only spills
+/// LR / restores LR to pc. FIXME: This is done here because it's only possible
+/// to do this if tBfar is not used.
+bool ARMConstantIslands::undoLRSpillRestore() {
+  bool MadeChange = false;
+  for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
+    MachineInstr *MI = PushPopMIs[i];
+    // First two operands are predicates.
+    if (MI->getOpcode() == ARM::tPOP_RET &&
+        MI->getOperand(2).getReg() == ARM::PC &&
+        MI->getNumExplicitOperands() == 3) {
+      // Create the new insn and copy the predicate from the old.
+      BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET))
+        .addOperand(MI->getOperand(0))
+        .addOperand(MI->getOperand(1));
+      MI->eraseFromParent();
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+// mayOptimizeThumb2Instruction - Returns true if optimizeThumb2Instructions
+// below may shrink MI.
+bool
+ARMConstantIslands::mayOptimizeThumb2Instruction(const MachineInstr *MI) const {
+  switch(MI->getOpcode()) {
+    // optimizeThumb2Instructions.
+    case ARM::t2LEApcrel:
+    case ARM::t2LDRpci:
+    // optimizeThumb2Branches.
+    case ARM::t2B:
+    case ARM::t2Bcc:
+    case ARM::tBcc:
+    // optimizeThumb2JumpTables.
+    case ARM::t2BR_JT:
+      return true;
+  }
+  return false;
+}
+
+bool ARMConstantIslands::optimizeThumb2Instructions() {
+  bool MadeChange = false;
+
+  // Shrink ADR and LDR from constantpool.
+  for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+    CPUser &U = CPUsers[i];
+    unsigned Opcode = U.MI->getOpcode();
+    unsigned NewOpc = 0;
+    unsigned Scale = 1;
+    unsigned Bits = 0;
+    switch (Opcode) {
+    default: break;
+    case ARM::t2LEApcrel:
+      if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+        NewOpc = ARM::tLEApcrel;
+        Bits = 8;
+        Scale = 4;
+      }
+      break;
+    case ARM::t2LDRpci:
+      if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+        NewOpc = ARM::tLDRpci;
+        Bits = 8;
+        Scale = 4;
+      }
+      break;
+    }
+
+    if (!NewOpc)
+      continue;
+
+    unsigned UserOffset = getUserOffset(U);
+    unsigned MaxOffs = ((1 << Bits) - 1) * Scale;
+
+    // Be conservative with inline asm.
+    if (!U.KnownAlignment)
+      MaxOffs -= 2;
+
+    // FIXME: Check if offset is multiple of scale if scale is not 4.
+    if (isCPEntryInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) {
+      DEBUG(dbgs() << "Shrink: " << *U.MI);
+      U.MI->setDesc(TII->get(NewOpc));
+      MachineBasicBlock *MBB = U.MI->getParent();
+      BBInfo[MBB->getNumber()].Size -= 2;
+      adjustBBOffsetsAfter(MBB);
+      ++NumT2CPShrunk;
+      MadeChange = true;
+    }
+  }
+
+  MadeChange |= optimizeThumb2Branches();
+  MadeChange |= optimizeThumb2JumpTables();
+  return MadeChange;
+}
+
+bool ARMConstantIslands::optimizeThumb2Branches() {
+  bool MadeChange = false;
+
+  for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) {
+    ImmBranch &Br = ImmBranches[i];
+    unsigned Opcode = Br.MI->getOpcode();
+    unsigned NewOpc = 0;
+    unsigned Scale = 1;
+    unsigned Bits = 0;
+    switch (Opcode) {
+    default: break;
+    case ARM::t2B:
+      NewOpc = ARM::tB;
+      Bits = 11;
+      Scale = 2;
+      break;
+    case ARM::t2Bcc: {
+      NewOpc = ARM::tBcc;
+      Bits = 8;
+      Scale = 2;
+      break;
+    }
+    }
+    if (NewOpc) {
+      unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+      MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+      if (isBBInRange(Br.MI, DestBB, MaxOffs)) {
+        DEBUG(dbgs() << "Shrink branch: " << *Br.MI);
+        Br.MI->setDesc(TII->get(NewOpc));
+        MachineBasicBlock *MBB = Br.MI->getParent();
+        BBInfo[MBB->getNumber()].Size -= 2;
+        adjustBBOffsetsAfter(MBB);
+        ++NumT2BrShrunk;
+        MadeChange = true;
+      }
+    }
+
+    Opcode = Br.MI->getOpcode();
+    if (Opcode != ARM::tBcc)
+      continue;
+
+    // If the conditional branch doesn't kill CPSR, then CPSR can be liveout
+    // so this transformation is not safe.
+    if (!Br.MI->killsRegister(ARM::CPSR))
+      continue;
+
+    NewOpc = 0;
+    unsigned PredReg = 0;
+    ARMCC::CondCodes Pred = getInstrPredicate(Br.MI, PredReg);
+    if (Pred == ARMCC::EQ)
+      NewOpc = ARM::tCBZ;
+    else if (Pred == ARMCC::NE)
+      NewOpc = ARM::tCBNZ;
+    if (!NewOpc)
+      continue;
+    MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+    // Check if the distance is within 126. Subtract starting offset by 2
+    // because the cmp will be eliminated.
+    unsigned BrOffset = getOffsetOf(Br.MI) + 4 - 2;
+    unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+    if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) {
+      MachineBasicBlock::iterator CmpMI = Br.MI;
+      if (CmpMI != Br.MI->getParent()->begin()) {
+        --CmpMI;
+        if (CmpMI->getOpcode() == ARM::tCMPi8) {
+          unsigned Reg = CmpMI->getOperand(0).getReg();
+          Pred = getInstrPredicate(CmpMI, PredReg);
+          if (Pred == ARMCC::AL &&
+              CmpMI->getOperand(1).getImm() == 0 &&
+              isARMLowRegister(Reg)) {
+            MachineBasicBlock *MBB = Br.MI->getParent();
+            DEBUG(dbgs() << "Fold: " << *CmpMI << " and: " << *Br.MI);
+            MachineInstr *NewBR =
+              BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc))
+              .addReg(Reg).addMBB(DestBB,Br.MI->getOperand(0).getTargetFlags());
+            CmpMI->eraseFromParent();
+            Br.MI->eraseFromParent();
+            Br.MI = NewBR;
+            BBInfo[MBB->getNumber()].Size -= 2;
+            adjustBBOffsetsAfter(MBB);
+            ++NumCBZ;
+            MadeChange = true;
+          }
+        }
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+/// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
+/// jumptables when it's possible.
+bool ARMConstantIslands::optimizeThumb2JumpTables() {
+  bool MadeChange = false;
+
+  // FIXME: After the tables are shrunk, can we get rid some of the
+  // constantpool tables?
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (MJTI == 0) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const MCInstrDesc &MCID = MI->getDesc();
+    unsigned NumOps = MCID.getNumOperands();
+    unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 3 : 2);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    bool ByteOk = true;
+    bool HalfWordOk = true;
+    unsigned JTOffset = getOffsetOf(MI) + 4;
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      unsigned DstOffset = BBInfo[MBB->getNumber()].Offset;
+      // Negative offset is not ok. FIXME: We should change BB layout to make
+      // sure all the branches are forward.
+      if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2)
+        ByteOk = false;
+      unsigned TBHLimit = ((1<<16)-1)*2;
+      if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit)
+        HalfWordOk = false;
+      if (!ByteOk && !HalfWordOk)
+        break;
+    }
+
+    if (ByteOk || HalfWordOk) {
+      MachineBasicBlock *MBB = MI->getParent();
+      unsigned BaseReg = MI->getOperand(0).getReg();
+      bool BaseRegKill = MI->getOperand(0).isKill();
+      if (!BaseRegKill)
+        continue;
+      unsigned IdxReg = MI->getOperand(1).getReg();
+      bool IdxRegKill = MI->getOperand(1).isKill();
+
+      // Scan backwards to find the instruction that defines the base
+      // register. Due to post-RA scheduling, we can't count on it
+      // immediately preceding the branch instruction.
+      MachineBasicBlock::iterator PrevI = MI;
+      MachineBasicBlock::iterator B = MBB->begin();
+      while (PrevI != B && !PrevI->definesRegister(BaseReg))
+        --PrevI;
+
+      // If for some reason we didn't find it, we can't do anything, so
+      // just skip this one.
+      if (!PrevI->definesRegister(BaseReg))
+        continue;
+
+      MachineInstr *AddrMI = PrevI;
+      bool OptOk = true;
+      // Examine the instruction that calculates the jumptable entry address.
+      // Make sure it only defines the base register and kills any uses
+      // other than the index register.
+      for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) {
+        const MachineOperand &MO = AddrMI->getOperand(k);
+        if (!MO.isReg() || !MO.getReg())
+          continue;
+        if (MO.isDef() && MO.getReg() != BaseReg) {
+          OptOk = false;
+          break;
+        }
+        if (MO.isUse() && !MO.isKill() && MO.getReg() != IdxReg) {
+          OptOk = false;
+          break;
+        }
+      }
+      if (!OptOk)
+        continue;
+
+      // Now scan back again to find the tLEApcrel or t2LEApcrelJT instruction
+      // that gave us the initial base register definition.
+      for (--PrevI; PrevI != B && !PrevI->definesRegister(BaseReg); --PrevI)
+        ;
+
+      // The instruction should be a tLEApcrel or t2LEApcrelJT; we want
+      // to delete it as well.
+      MachineInstr *LeaMI = PrevI;
+      if ((LeaMI->getOpcode() != ARM::tLEApcrelJT &&
+           LeaMI->getOpcode() != ARM::t2LEApcrelJT) ||
+          LeaMI->getOperand(0).getReg() != BaseReg)
+        OptOk = false;
+
+      if (!OptOk)
+        continue;
+
+      DEBUG(dbgs() << "Shrink JT: " << *MI << "     addr: " << *AddrMI
+                   << "      lea: " << *LeaMI);
+      unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
+      MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc))
+        .addReg(IdxReg, getKillRegState(IdxRegKill))
+        .addJumpTableIndex(JTI, JTOP.getTargetFlags())
+        .addImm(MI->getOperand(JTOpIdx+1).getImm());
+      DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": " << *NewJTMI);
+      // FIXME: Insert an "ALIGN" instruction to ensure the next instruction
+      // is 2-byte aligned. For now, asm printer will fix it up.
+      unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
+      unsigned OrigSize = TII->GetInstSizeInBytes(AddrMI);
+      OrigSize += TII->GetInstSizeInBytes(LeaMI);
+      OrigSize += TII->GetInstSizeInBytes(MI);
+
+      AddrMI->eraseFromParent();
+      LeaMI->eraseFromParent();
+      MI->eraseFromParent();
+
+      int delta = OrigSize - NewSize;
+      BBInfo[MBB->getNumber()].Size -= delta;
+      adjustBBOffsetsAfter(MBB);
+
+      ++NumTBs;
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// reorderThumb2JumpTables - Adjust the function's block layout to ensure that
+/// jump tables always branch forwards, since that's what tbb and tbh need.
+bool ARMConstantIslands::reorderThumb2JumpTables() {
+  bool MadeChange = false;
+
+  MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (MJTI == 0) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const MCInstrDesc &MCID = MI->getDesc();
+    unsigned NumOps = MCID.getNumOperands();
+    unsigned JTOpIdx = NumOps - (MI->isPredicable() ? 3 : 2);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    // We prefer if target blocks for the jump table come after the jump
+    // instruction so we can use TB[BH]. Loop through the target blocks
+    // and try to adjust them such that that's true.
+    int JTNumber = MI->getParent()->getNumber();
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      int DTNumber = MBB->getNumber();
+
+      if (DTNumber < JTNumber) {
+        // The destination precedes the switch. Try to move the block forward
+        // so we have a positive offset.
+        MachineBasicBlock *NewBB =
+          adjustJTTargetBlockForward(MBB, MI->getParent());
+        if (NewBB)
+          MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB);
+        MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+MachineBasicBlock *ARMConstantIslands::
+adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
+  // If the destination block is terminated by an unconditional branch,
+  // try to move it; otherwise, create a new block following the jump
+  // table that branches back to the actual target. This is a very simple
+  // heuristic. FIXME: We can definitely improve it.
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  SmallVector<MachineOperand, 4> CondPrior;
+  MachineFunction::iterator BBi = BB;
+  MachineFunction::iterator OldPrior = prior(BBi);
+
+  // If the block terminator isn't analyzable, don't try to move the block
+  bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond);
+
+  // If the block ends in an unconditional branch, move it. The prior block
+  // has to have an analyzable terminator for us to move this one. Be paranoid
+  // and make sure we're not trying to move the entry block of the function.
+  if (!B && Cond.empty() && BB != MF->begin() &&
+      !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
+    BB->moveAfter(JTBB);
+    OldPrior->updateTerminator();
+    BB->updateTerminator();
+    // Update numbering to account for the block being moved.
+    MF->RenumberBlocks();
+    ++NumJTMoved;
+    return NULL;
+  }
+
+  // Create a new MBB for the code after the jump BB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(JTBB->getBasicBlock());
+  MachineFunction::iterator MBBI = JTBB; ++MBBI;
+  MF->insert(MBBI, NewBB);
+
+  // Add an unconditional branch from NewBB to BB.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond directly to anything in the source.
+  assert (isThumb2 && "Adjusting for TB[BH] but not in Thumb2?");
+  BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB)
+          .addImm(ARMCC::AL).addReg(0);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Update the CFG.
+  NewBB->addSuccessor(BB);
+  JTBB->removeSuccessor(BB);
+  JTBB->addSuccessor(NewBB);
+
+  ++NumJTInserted;
+  return NewBB;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
new file mode 100644
index 000000000000..4e703ec3c1a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -0,0 +1,306 @@
+//===-- ARMConstantPoolValue.cpp - ARM constantpool value -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolValue
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolValue::ARMConstantPoolValue(Type *Ty, unsigned id,
+                                           ARMCP::ARMCPKind kind,
+                                           unsigned char PCAdj,
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+  : MachineConstantPoolValue(Ty), LabelId(id), Kind(kind),
+    PCAdjust(PCAdj), Modifier(modifier),
+    AddCurrentAddress(addCurrentAddress) {}
+
+ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id,
+                                           ARMCP::ARMCPKind kind,
+                                           unsigned char PCAdj,
+                                           ARMCP::ARMCPModifier modifier,
+                                           bool addCurrentAddress)
+  : MachineConstantPoolValue((Type*)Type::getInt32Ty(C)),
+    LabelId(id), Kind(kind), PCAdjust(PCAdj), Modifier(modifier),
+    AddCurrentAddress(addCurrentAddress) {}
+
+ARMConstantPoolValue::~ARMConstantPoolValue() {}
+
+const char *ARMConstantPoolValue::getModifierText() const {
+  switch (Modifier) {
+    // FIXME: Are these case sensitive? It'd be nice to lower-case all the
+    // strings if that's legal.
+  case ARMCP::no_modifier: return "none";
+  case ARMCP::TLSGD:       return "tlsgd";
+  case ARMCP::GOT:         return "GOT";
+  case ARMCP::GOTOFF:      return "GOTOFF";
+  case ARMCP::GOTTPOFF:    return "gottpoff";
+  case ARMCP::TPOFF:       return "tpoff";
+  }
+  llvm_unreachable("Unknown modifier!");
+}
+
+int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                    unsigned Alignment) {
+  llvm_unreachable("Shouldn't be calling this directly!");
+}
+
+void
+ARMConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddInteger(LabelId);
+  ID.AddInteger(PCAdjust);
+}
+
+bool
+ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
+  if (ACPV->Kind == Kind &&
+      ACPV->PCAdjust == PCAdjust &&
+      ACPV->Modifier == Modifier) {
+    if (ACPV->LabelId == LabelId)
+      return true;
+    // Two PC relative constpool entries containing the same GV address or
+    // external symbols. FIXME: What about blockaddress?
+    if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol)
+      return true;
+  }
+  return false;
+}
+
+void ARMConstantPoolValue::dump() const {
+  errs() << "  " << *this;
+}
+
+void ARMConstantPoolValue::print(raw_ostream &O) const {
+  if (Modifier) O << "(" << getModifierText() << ")";
+  if (PCAdjust != 0) {
+    O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
+    if (AddCurrentAddress) O << "-.";
+    O << ")";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolConstant
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(Type *Ty,
+                                                 const Constant *C,
+                                                 unsigned ID,
+                                                 ARMCP::ARMCPKind Kind,
+                                                 unsigned char PCAdj,
+                                                 ARMCP::ARMCPModifier Modifier,
+                                                 bool AddCurrentAddress)
+  : ARMConstantPoolValue(Ty, ID, Kind, PCAdj, Modifier, AddCurrentAddress),
+    CVal(C) {}
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(const Constant *C,
+                                                 unsigned ID,
+                                                 ARMCP::ARMCPKind Kind,
+                                                 unsigned char PCAdj,
+                                                 ARMCP::ARMCPModifier Modifier,
+                                                 bool AddCurrentAddress)
+  : ARMConstantPoolValue((Type*)C->getType(), ID, Kind, PCAdj, Modifier,
+                         AddCurrentAddress),
+    CVal(C) {}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID) {
+  return new ARMConstantPoolConstant(C, ID, ARMCP::CPValue, 0,
+                                     ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const GlobalValue *GV,
+                                ARMCP::ARMCPModifier Modifier) {
+  return new ARMConstantPoolConstant((Type*)Type::getInt32Ty(GV->getContext()),
+                                     GV, 0, ARMCP::CPValue, 0,
+                                     Modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj) {
+  return new ARMConstantPoolConstant(C, ID, Kind, PCAdj,
+                                     ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+                                ARMCP::ARMCPKind Kind, unsigned char PCAdj,
+                                ARMCP::ARMCPModifier Modifier,
+                                bool AddCurrentAddress) {
+  return new ARMConstantPoolConstant(C, ID, Kind, PCAdj, Modifier,
+                                     AddCurrentAddress);
+}
+
+const GlobalValue *ARMConstantPoolConstant::getGV() const {
+  return dyn_cast_or_null<GlobalValue>(CVal);
+}
+
+const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
+  return dyn_cast_or_null<BlockAddress>(CVal);
+}
+
+int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                       unsigned Alignment) {
+  unsigned AlignMask = Alignment - 1;
+  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    if (Constants[i].isMachineConstantPoolEntry() &&
+        (Constants[i].getAlignment() & AlignMask) == 0) {
+      ARMConstantPoolValue *CPV =
+        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+      ARMConstantPoolConstant *APC = dyn_cast<ARMConstantPoolConstant>(CPV);
+      if (!APC) continue;
+      if (APC->CVal == CVal && equals(APC))
+        return i;
+    }
+  }
+
+  return -1;
+}
+
+bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolConstant *ACPC = dyn_cast<ARMConstantPoolConstant>(ACPV);
+  return ACPC && ACPC->CVal == CVal && ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(CVal);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolConstant::print(raw_ostream &O) const {
+  O << CVal->getName();
+  ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolSymbol
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, const char *s,
+                                             unsigned id,
+                                             unsigned char PCAdj,
+                                             ARMCP::ARMCPModifier Modifier,
+                                             bool AddCurrentAddress)
+  : ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier,
+                         AddCurrentAddress),
+    S(s) {}
+
+ARMConstantPoolSymbol *
+ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s,
+                              unsigned ID, unsigned char PCAdj) {
+  return new ARMConstantPoolSymbol(C, s, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                     unsigned Alignment) {
+  unsigned AlignMask = Alignment - 1;
+  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    if (Constants[i].isMachineConstantPoolEntry() &&
+        (Constants[i].getAlignment() & AlignMask) == 0) {
+      ARMConstantPoolValue *CPV =
+        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+      ARMConstantPoolSymbol *APS = dyn_cast<ARMConstantPoolSymbol>(CPV);
+      if (!APS) continue;
+
+      if (APS->S == S && equals(APS))
+        return i;
+    }
+  }
+
+  return -1;
+}
+
+bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolSymbol *ACPS = dyn_cast<ARMConstantPoolSymbol>(ACPV);
+  return ACPS && ACPS->S == S && ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddString(S);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolSymbol::print(raw_ostream &O) const {
+  O << S;
+  ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolMBB
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolMBB::ARMConstantPoolMBB(LLVMContext &C,
+                                       const MachineBasicBlock *mbb,
+                                       unsigned id, unsigned char PCAdj,
+                                       ARMCP::ARMCPModifier Modifier,
+                                       bool AddCurrentAddress)
+  : ARMConstantPoolValue(C, id, ARMCP::CPMachineBasicBlock, PCAdj,
+                         Modifier, AddCurrentAddress),
+    MBB(mbb) {}
+
+ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
+                                               const MachineBasicBlock *mbb,
+                                               unsigned ID,
+                                               unsigned char PCAdj) {
+  return new ARMConstantPoolMBB(C, mbb, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                  unsigned Alignment) {
+  unsigned AlignMask = Alignment - 1;
+  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    if (Constants[i].isMachineConstantPoolEntry() &&
+        (Constants[i].getAlignment() & AlignMask) == 0) {
+      ARMConstantPoolValue *CPV =
+        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+      ARMConstantPoolMBB *APMBB = dyn_cast<ARMConstantPoolMBB>(CPV);
+      if (!APMBB) continue;
+
+      if (APMBB->MBB == MBB && equals(APMBB))
+        return i;
+    }
+  }
+
+  return -1;
+}
+
+bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) {
+  const ARMConstantPoolMBB *ACPMBB = dyn_cast<ARMConstantPoolMBB>(ACPV);
+  return ACPMBB && ACPMBB->MBB == MBB &&
+    ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(MBB);
+  ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolMBB::print(raw_ostream &O) const {
+  O << "BB#" << MBB->getNumber();
+  ARMConstantPoolValue::print(O);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
new file mode 100644
index 000000000000..93812fe6bb37
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
@@ -0,0 +1,226 @@
+//===-- ARMConstantPoolValue.h - ARM constantpool value ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstddef>
+
+namespace llvm {
+
+class BlockAddress;
+class Constant;
+class GlobalValue;
+class LLVMContext;
+class MachineBasicBlock;
+
+namespace ARMCP {
+  enum ARMCPKind {
+    CPValue,
+    CPExtSymbol,
+    CPBlockAddress,
+    CPLSDA,
+    CPMachineBasicBlock
+  };
+
+  enum ARMCPModifier {
+    no_modifier,
+    TLSGD,
+    GOT,
+    GOTOFF,
+    GOTTPOFF,
+    TPOFF
+  };
+}
+
+/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
+/// represent PC-relative displacement between the address of the load
+/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)).
+class ARMConstantPoolValue : public MachineConstantPoolValue {
+  unsigned LabelId;        // Label id of the load.
+  ARMCP::ARMCPKind Kind;   // Kind of constant.
+  unsigned char PCAdjust;  // Extra adjustment if constantpool is pc-relative.
+                           // 8 for ARM, 4 for Thumb.
+  ARMCP::ARMCPModifier Modifier;   // GV modifier i.e. (&GV(modifier)-(LPIC+8))
+  bool AddCurrentAddress;
+
+protected:
+  ARMConstantPoolValue(Type *Ty, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
+
+  ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind,
+                       unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                       bool AddCurrentAddress);
+public:
+  virtual ~ARMConstantPoolValue();
+
+  ARMCP::ARMCPModifier getModifier() const { return Modifier; }
+  const char *getModifierText() const;
+  bool hasModifier() const { return Modifier != ARMCP::no_modifier; }
+
+  bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+
+  unsigned getLabelId() const { return LabelId; }
+  unsigned char getPCAdjustment() const { return PCAdjust; }
+
+  bool isGlobalValue() const { return Kind == ARMCP::CPValue; }
+  bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; }
+  bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; }
+  bool isLSDA() const { return Kind == ARMCP::CPLSDA; }
+  bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; }
+
+  virtual unsigned getRelocationInfo() const { return 2; }
+
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  bool equals(const ARMConstantPoolValue *A) const {
+    return this->LabelId == A->LabelId &&
+      this->PCAdjust == A->PCAdjust &&
+      this->Modifier == A->Modifier;
+  }
+
+  virtual void print(raw_ostream &O) const;
+  void print(raw_ostream *O) const { if (O) print(*O); }
+  void dump() const;
+};
+
+inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
+  V.print(O);
+  return O;
+}
+
+/// ARMConstantPoolConstant - ARM-specific constant pool values for Constants,
+/// Functions, and BlockAddresses.
+class ARMConstantPoolConstant : public ARMConstantPoolValue {
+  const Constant *CVal;         // Constant being loaded.
+
+  ARMConstantPoolConstant(const Constant *C,
+                          unsigned ID,
+                          ARMCP::ARMCPKind Kind,
+                          unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+  ARMConstantPoolConstant(Type *Ty, const Constant *C,
+                          unsigned ID,
+                          ARMCP::ARMCPKind Kind,
+                          unsigned char PCAdj,
+                          ARMCP::ARMCPModifier Modifier,
+                          bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID);
+  static ARMConstantPoolConstant *Create(const GlobalValue *GV,
+                                         ARMCP::ARMCPModifier Modifier);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj);
+  static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+                                         ARMCP::ARMCPKind Kind,
+                                         unsigned char PCAdj,
+                                         ARMCP::ARMCPModifier Modifier,
+                                         bool AddCurrentAddress);
+
+  const GlobalValue *getGV() const;
+  const BlockAddress *getBlockAddress() const;
+
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  virtual void print(raw_ostream &O) const;
+  static bool classof(const ARMConstantPoolValue *APV) {
+    return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA();
+  }
+};
+
+/// ARMConstantPoolSymbol - ARM-specific constantpool values for external
+/// symbols.
+class ARMConstantPoolSymbol : public ARMConstantPoolValue {
+  const std::string S;          // ExtSymbol being loaded.
+
+  ARMConstantPoolSymbol(LLVMContext &C, const char *s, unsigned id,
+                        unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                        bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolSymbol *Create(LLVMContext &C, const char *s,
+                                       unsigned ID, unsigned char PCAdj);
+
+  const char *getSymbol() const { return S.c_str(); }
+
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  virtual void print(raw_ostream &O) const;
+
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isExtSymbol();
+  }
+};
+
+/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
+/// block.
+class ARMConstantPoolMBB : public ARMConstantPoolValue {
+  const MachineBasicBlock *MBB; // Machine basic block.
+
+  ARMConstantPoolMBB(LLVMContext &C, const MachineBasicBlock *mbb, unsigned id,
+                     unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+                     bool AddCurrentAddress);
+
+public:
+  static ARMConstantPoolMBB *Create(LLVMContext &C,
+                                    const MachineBasicBlock *mbb,
+                                    unsigned ID, unsigned char PCAdj);
+
+  const MachineBasicBlock *getMBB() const { return MBB; }
+
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  /// hasSameValue - Return true if this ARM constpool value can share the same
+  /// constantpool entry as another ARM constpool value.
+  virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  virtual void print(raw_ostream &O) const;
+
+  static bool classof(const ARMConstantPoolValue *ACPV) {
+    return ACPV->isMachineBasicBlock();
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
new file mode 100644
index 000000000000..beb843ca9aa8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -0,0 +1,1249 @@
+//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, and other late
+// optimizations. This pass should be run after register allocation but before
+// the post-regalloc scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-pseudo"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static cl::opt<bool>
+VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden,
+                cl::desc("Verify machine code after expanding ARM pseudos"));
+
+namespace {
+  class ARMExpandPseudo : public MachineFunctionPass {
+  public:
+    static char ID;
+    ARMExpandPseudo() : MachineFunctionPass(ID) {}
+
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM pseudo instruction expansion pass";
+    }
+
+  private:
+    void TransferImpOps(MachineInstr &OldMI,
+                        MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI);
+    bool ExpandMI(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MBBI);
+    bool ExpandMBB(MachineBasicBlock &MBB);
+    void ExpandVLD(MachineBasicBlock::iterator &MBBI);
+    void ExpandVST(MachineBasicBlock::iterator &MBBI);
+    void ExpandLaneOp(MachineBasicBlock::iterator &MBBI);
+    void ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                    unsigned Opc, bool IsExt);
+    void ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator &MBBI);
+  };
+  char ARMExpandPseudo::ID = 0;
+}
+
+/// TransferImpOps - Transfer implicit operands on the pseudo instruction to
+/// the instructions created from the expansion.
+void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
+                                     MachineInstrBuilder &UseMI,
+                                     MachineInstrBuilder &DefMI) {
+  const MCInstrDesc &Desc = OldMI.getDesc();
+  for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands();
+       i != e; ++i) {
+    const MachineOperand &MO = OldMI.getOperand(i);
+    assert(MO.isReg() && MO.getReg());
+    if (MO.isUse())
+      UseMI.addOperand(MO);
+    else
+      DefMI.addOperand(MO);
+  }
+}
+
+namespace {
+  // Constants for register spacing in NEON load/store instructions.
+  // For quad-register load-lane and store-lane pseudo instructors, the
+  // spacing is initially assumed to be EvenDblSpc, and that is changed to
+  // OddDblSpc depending on the lane number operand.
+  enum NEONRegSpacing {
+    SingleSpc,
+    EvenDblSpc,
+    OddDblSpc
+  };
+
+  // Entries for NEON load/store information table.  The table is sorted by
+  // PseudoOpc for fast binary-search lookups.
+  struct NEONLdStTableEntry {
+    uint16_t PseudoOpc;
+    uint16_t RealOpc;
+    bool IsLoad;
+    bool isUpdating;
+    bool hasWritebackOperand;
+    uint8_t RegSpacing; // One of type NEONRegSpacing
+    uint8_t NumRegs; // D registers loaded or stored
+    uint8_t RegElts; // elements per D register; used for lane ops
+    // FIXME: Temporary flag to denote whether the real instruction takes
+    // a single register (like the encoding) or all of the registers in
+    // the list (like the asm syntax and the isel DAG). When all definitions
+    // are converted to take only the single encoded register, this will
+    // go away.
+    bool copyAllListRegs;
+
+    // Comparison methods for binary search of the table.
+    bool operator<(const NEONLdStTableEntry &TE) const {
+      return PseudoOpc < TE.PseudoOpc;
+    }
+    friend bool operator<(const NEONLdStTableEntry &TE, unsigned PseudoOpc) {
+      return TE.PseudoOpc < PseudoOpc;
+    }
+    friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned PseudoOpc,
+                                                const NEONLdStTableEntry &TE) {
+      return PseudoOpc < TE.PseudoOpc;
+    }
+  };
+}
+
+static const NEONLdStTableEntry NEONLdStTable[] = {
+{ ARM::VLD1LNq16Pseudo,     ARM::VLD1LNd16,     true, false, false, EvenDblSpc, 1, 4 ,true},
+{ ARM::VLD1LNq16Pseudo_UPD, ARM::VLD1LNd16_UPD, true, true, true,  EvenDblSpc, 1, 4 ,true},
+{ ARM::VLD1LNq32Pseudo,     ARM::VLD1LNd32,     true, false, false, EvenDblSpc, 1, 2 ,true},
+{ ARM::VLD1LNq32Pseudo_UPD, ARM::VLD1LNd32_UPD, true, true, true,  EvenDblSpc, 1, 2 ,true},
+{ ARM::VLD1LNq8Pseudo,      ARM::VLD1LNd8,      true, false, false, EvenDblSpc, 1, 8 ,true},
+{ ARM::VLD1LNq8Pseudo_UPD,  ARM::VLD1LNd8_UPD, true, true, true,  EvenDblSpc, 1, 8 ,true},
+
+{ ARM::VLD1d64QPseudo,      ARM::VLD1d64Q,     true,  false, false, SingleSpc,  4, 1 ,false},
+{ ARM::VLD1d64TPseudo,      ARM::VLD1d64T,     true,  false, false, SingleSpc,  3, 1 ,false},
+
+{ ARM::VLD2LNd16Pseudo,     ARM::VLD2LNd16,     true, false, false, SingleSpc,  2, 4 ,true},
+{ ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true,  SingleSpc,  2, 4 ,true},
+{ ARM::VLD2LNd32Pseudo,     ARM::VLD2LNd32,     true, false, false, SingleSpc,  2, 2 ,true},
+{ ARM::VLD2LNd32Pseudo_UPD, ARM::VLD2LNd32_UPD, true, true, true,  SingleSpc,  2, 2 ,true},
+{ ARM::VLD2LNd8Pseudo,      ARM::VLD2LNd8,      true, false, false, SingleSpc,  2, 8 ,true},
+{ ARM::VLD2LNd8Pseudo_UPD,  ARM::VLD2LNd8_UPD, true, true, true,  SingleSpc,  2, 8 ,true},
+{ ARM::VLD2LNq16Pseudo,     ARM::VLD2LNq16,     true, false, false, EvenDblSpc, 2, 4 ,true},
+{ ARM::VLD2LNq16Pseudo_UPD, ARM::VLD2LNq16_UPD, true, true, true,  EvenDblSpc, 2, 4 ,true},
+{ ARM::VLD2LNq32Pseudo,     ARM::VLD2LNq32,     true, false, false, EvenDblSpc, 2, 2 ,true},
+{ ARM::VLD2LNq32Pseudo_UPD, ARM::VLD2LNq32_UPD, true, true, true,  EvenDblSpc, 2, 2 ,true},
+
+{ ARM::VLD2q16Pseudo,       ARM::VLD2q16,      true,  false, false, SingleSpc,  4, 4 ,false},
+{ ARM::VLD2q16PseudoWB_fixed,   ARM::VLD2q16wb_fixed, true, true, false,  SingleSpc,  4, 4 ,false},
+{ ARM::VLD2q16PseudoWB_register,   ARM::VLD2q16wb_register, true, true, true,  SingleSpc,  4, 4 ,false},
+{ ARM::VLD2q32Pseudo,       ARM::VLD2q32,      true,  false, false, SingleSpc,  4, 2 ,false},
+{ ARM::VLD2q32PseudoWB_fixed,   ARM::VLD2q32wb_fixed, true, true, false,  SingleSpc,  4, 2 ,false},
+{ ARM::VLD2q32PseudoWB_register,   ARM::VLD2q32wb_register, true, true, true,  SingleSpc,  4, 2 ,false},
+{ ARM::VLD2q8Pseudo,        ARM::VLD2q8,       true,  false, false, SingleSpc,  4, 8 ,false},
+{ ARM::VLD2q8PseudoWB_fixed,    ARM::VLD2q8wb_fixed, true, true, false,  SingleSpc,  4, 8 ,false},
+{ ARM::VLD2q8PseudoWB_register,    ARM::VLD2q8wb_register, true, true, true,  SingleSpc,  4, 8 ,false},
+
+{ ARM::VLD3DUPd16Pseudo,     ARM::VLD3DUPd16,     true, false, false, SingleSpc, 3, 4,true},
+{ ARM::VLD3DUPd16Pseudo_UPD, ARM::VLD3DUPd16_UPD, true, true, true,  SingleSpc, 3, 4,true},
+{ ARM::VLD3DUPd32Pseudo,     ARM::VLD3DUPd32,     true, false, false, SingleSpc, 3, 2,true},
+{ ARM::VLD3DUPd32Pseudo_UPD, ARM::VLD3DUPd32_UPD, true, true, true,  SingleSpc, 3, 2,true},
+{ ARM::VLD3DUPd8Pseudo,      ARM::VLD3DUPd8,      true, false, false, SingleSpc, 3, 8,true},
+{ ARM::VLD3DUPd8Pseudo_UPD,  ARM::VLD3DUPd8_UPD, true, true, true,  SingleSpc, 3, 8,true},
+
+{ ARM::VLD3LNd16Pseudo,     ARM::VLD3LNd16,     true, false, false, SingleSpc,  3, 4 ,true},
+{ ARM::VLD3LNd16Pseudo_UPD, ARM::VLD3LNd16_UPD, true, true, true,  SingleSpc,  3, 4 ,true},
+{ ARM::VLD3LNd32Pseudo,     ARM::VLD3LNd32,     true, false, false, SingleSpc,  3, 2 ,true},
+{ ARM::VLD3LNd32Pseudo_UPD, ARM::VLD3LNd32_UPD, true, true, true,  SingleSpc,  3, 2 ,true},
+{ ARM::VLD3LNd8Pseudo,      ARM::VLD3LNd8,      true, false, false, SingleSpc,  3, 8 ,true},
+{ ARM::VLD3LNd8Pseudo_UPD,  ARM::VLD3LNd8_UPD, true, true, true,  SingleSpc,  3, 8 ,true},
+{ ARM::VLD3LNq16Pseudo,     ARM::VLD3LNq16,     true, false, false, EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3LNq16Pseudo_UPD, ARM::VLD3LNq16_UPD, true, true, true,  EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3LNq32Pseudo,     ARM::VLD3LNq32,     true, false, false, EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3LNq32Pseudo_UPD, ARM::VLD3LNq32_UPD, true, true, true,  EvenDblSpc, 3, 2 ,true},
+
+{ ARM::VLD3d16Pseudo,       ARM::VLD3d16,      true,  false, false, SingleSpc,  3, 4 ,true},
+{ ARM::VLD3d16Pseudo_UPD,   ARM::VLD3d16_UPD, true, true, true,  SingleSpc,  3, 4 ,true},
+{ ARM::VLD3d32Pseudo,       ARM::VLD3d32,      true,  false, false, SingleSpc,  3, 2 ,true},
+{ ARM::VLD3d32Pseudo_UPD,   ARM::VLD3d32_UPD, true, true, true,  SingleSpc,  3, 2 ,true},
+{ ARM::VLD3d8Pseudo,        ARM::VLD3d8,       true,  false, false, SingleSpc,  3, 8 ,true},
+{ ARM::VLD3d8Pseudo_UPD,    ARM::VLD3d8_UPD, true, true, true,  SingleSpc,  3, 8 ,true},
+
+{ ARM::VLD3q16Pseudo_UPD,    ARM::VLD3q16_UPD, true, true, true,  EvenDblSpc, 3, 4 ,true},
+{ ARM::VLD3q16oddPseudo,     ARM::VLD3q16,     true,  false, false, OddDblSpc,  3, 4 ,true},
+{ ARM::VLD3q16oddPseudo_UPD, ARM::VLD3q16_UPD, true, true, true,  OddDblSpc,  3, 4 ,true},
+{ ARM::VLD3q32Pseudo_UPD,    ARM::VLD3q32_UPD, true, true, true,  EvenDblSpc, 3, 2 ,true},
+{ ARM::VLD3q32oddPseudo,     ARM::VLD3q32,     true,  false, false, OddDblSpc,  3, 2 ,true},
+{ ARM::VLD3q32oddPseudo_UPD, ARM::VLD3q32_UPD, true, true, true,  OddDblSpc,  3, 2 ,true},
+{ ARM::VLD3q8Pseudo_UPD,     ARM::VLD3q8_UPD, true, true, true,  EvenDblSpc, 3, 8 ,true},
+{ ARM::VLD3q8oddPseudo,      ARM::VLD3q8,      true,  false, false, OddDblSpc,  3, 8 ,true},
+{ ARM::VLD3q8oddPseudo_UPD,  ARM::VLD3q8_UPD, true, true, true,  OddDblSpc,  3, 8 ,true},
+
+{ ARM::VLD4DUPd16Pseudo,     ARM::VLD4DUPd16,     true, false, false, SingleSpc, 4, 4,true},
+{ ARM::VLD4DUPd16Pseudo_UPD, ARM::VLD4DUPd16_UPD, true, true, true,  SingleSpc, 4, 4,true},
+{ ARM::VLD4DUPd32Pseudo,     ARM::VLD4DUPd32,     true, false, false, SingleSpc, 4, 2,true},
+{ ARM::VLD4DUPd32Pseudo_UPD, ARM::VLD4DUPd32_UPD, true, true, true,  SingleSpc, 4, 2,true},
+{ ARM::VLD4DUPd8Pseudo,      ARM::VLD4DUPd8,      true, false, false, SingleSpc, 4, 8,true},
+{ ARM::VLD4DUPd8Pseudo_UPD,  ARM::VLD4DUPd8_UPD, true, true, true,  SingleSpc, 4, 8,true},
+
+{ ARM::VLD4LNd16Pseudo,     ARM::VLD4LNd16,     true, false, false, SingleSpc,  4, 4 ,true},
+{ ARM::VLD4LNd16Pseudo_UPD, ARM::VLD4LNd16_UPD, true, true, true,  SingleSpc,  4, 4 ,true},
+{ ARM::VLD4LNd32Pseudo,     ARM::VLD4LNd32,     true, false, false, SingleSpc,  4, 2 ,true},
+{ ARM::VLD4LNd32Pseudo_UPD, ARM::VLD4LNd32_UPD, true, true, true,  SingleSpc,  4, 2 ,true},
+{ ARM::VLD4LNd8Pseudo,      ARM::VLD4LNd8,      true, false, false, SingleSpc,  4, 8 ,true},
+{ ARM::VLD4LNd8Pseudo_UPD,  ARM::VLD4LNd8_UPD, true, true, true,  SingleSpc,  4, 8 ,true},
+{ ARM::VLD4LNq16Pseudo,     ARM::VLD4LNq16,     true, false, false, EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4LNq16Pseudo_UPD, ARM::VLD4LNq16_UPD, true, true, true,  EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4LNq32Pseudo,     ARM::VLD4LNq32,     true, false, false, EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4LNq32Pseudo_UPD, ARM::VLD4LNq32_UPD, true, true, true,  EvenDblSpc, 4, 2 ,true},
+
+{ ARM::VLD4d16Pseudo,       ARM::VLD4d16,      true,  false, false, SingleSpc,  4, 4 ,true},
+{ ARM::VLD4d16Pseudo_UPD,   ARM::VLD4d16_UPD, true, true, true,  SingleSpc,  4, 4 ,true},
+{ ARM::VLD4d32Pseudo,       ARM::VLD4d32,      true,  false, false, SingleSpc,  4, 2 ,true},
+{ ARM::VLD4d32Pseudo_UPD,   ARM::VLD4d32_UPD, true, true, true,  SingleSpc,  4, 2 ,true},
+{ ARM::VLD4d8Pseudo,        ARM::VLD4d8,       true,  false, false, SingleSpc,  4, 8 ,true},
+{ ARM::VLD4d8Pseudo_UPD,    ARM::VLD4d8_UPD, true, true, true,  SingleSpc,  4, 8 ,true},
+
+{ ARM::VLD4q16Pseudo_UPD,    ARM::VLD4q16_UPD, true, true, true,  EvenDblSpc, 4, 4 ,true},
+{ ARM::VLD4q16oddPseudo,     ARM::VLD4q16,     true,  false, false, OddDblSpc,  4, 4 ,true},
+{ ARM::VLD4q16oddPseudo_UPD, ARM::VLD4q16_UPD, true, true, true,  OddDblSpc,  4, 4 ,true},
+{ ARM::VLD4q32Pseudo_UPD,    ARM::VLD4q32_UPD, true, true, true,  EvenDblSpc, 4, 2 ,true},
+{ ARM::VLD4q32oddPseudo,     ARM::VLD4q32,     true,  false, false, OddDblSpc,  4, 2 ,true},
+{ ARM::VLD4q32oddPseudo_UPD, ARM::VLD4q32_UPD, true, true, true,  OddDblSpc,  4, 2 ,true},
+{ ARM::VLD4q8Pseudo_UPD,     ARM::VLD4q8_UPD, true, true, true,  EvenDblSpc, 4, 8 ,true},
+{ ARM::VLD4q8oddPseudo,      ARM::VLD4q8,      true,  false, false, OddDblSpc,  4, 8 ,true},
+{ ARM::VLD4q8oddPseudo_UPD,  ARM::VLD4q8_UPD, true, true, true,  OddDblSpc,  4, 8 ,true},
+
+{ ARM::VST1LNq16Pseudo,     ARM::VST1LNd16,    false, false, false, EvenDblSpc, 1, 4 ,true},
+{ ARM::VST1LNq16Pseudo_UPD, ARM::VST1LNd16_UPD, false, true, true,  EvenDblSpc, 1, 4 ,true},
+{ ARM::VST1LNq32Pseudo,     ARM::VST1LNd32,    false, false, false, EvenDblSpc, 1, 2 ,true},
+{ ARM::VST1LNq32Pseudo_UPD, ARM::VST1LNd32_UPD, false, true, true,  EvenDblSpc, 1, 2 ,true},
+{ ARM::VST1LNq8Pseudo,      ARM::VST1LNd8,     false, false, false, EvenDblSpc, 1, 8 ,true},
+{ ARM::VST1LNq8Pseudo_UPD,  ARM::VST1LNd8_UPD, false, true, true,  EvenDblSpc, 1, 8 ,true},
+
+{ ARM::VST1d64QPseudo,      ARM::VST1d64Q,     false, false, false, SingleSpc,  4, 1 ,false},
+{ ARM::VST1d64QPseudoWB_fixed,  ARM::VST1d64Qwb_fixed, false, true, false,  SingleSpc,  4, 1 ,false},
+{ ARM::VST1d64QPseudoWB_register, ARM::VST1d64Qwb_register, false, true, true,  SingleSpc,  4, 1 ,false},
+{ ARM::VST1d64TPseudo,      ARM::VST1d64T,     false, false, false, SingleSpc,  3, 1 ,false},
+{ ARM::VST1d64TPseudoWB_fixed,  ARM::VST1d64Twb_fixed, false, true, false,  SingleSpc,  3, 1 ,false},
+{ ARM::VST1d64TPseudoWB_register,  ARM::VST1d64Twb_register, false, true, true,  SingleSpc,  3, 1 ,false},
+
+{ ARM::VST2LNd16Pseudo,     ARM::VST2LNd16,     false, false, false, SingleSpc, 2, 4 ,true},
+{ ARM::VST2LNd16Pseudo_UPD, ARM::VST2LNd16_UPD, false, true, true,  SingleSpc, 2, 4 ,true},
+{ ARM::VST2LNd32Pseudo,     ARM::VST2LNd32,     false, false, false, SingleSpc, 2, 2 ,true},
+{ ARM::VST2LNd32Pseudo_UPD, ARM::VST2LNd32_UPD, false, true, true,  SingleSpc, 2, 2 ,true},
+{ ARM::VST2LNd8Pseudo,      ARM::VST2LNd8,      false, false, false, SingleSpc, 2, 8 ,true},
+{ ARM::VST2LNd8Pseudo_UPD,  ARM::VST2LNd8_UPD, false, true, true,  SingleSpc, 2, 8 ,true},
+{ ARM::VST2LNq16Pseudo,     ARM::VST2LNq16,     false, false, false, EvenDblSpc, 2, 4,true},
+{ ARM::VST2LNq16Pseudo_UPD, ARM::VST2LNq16_UPD, false, true, true,  EvenDblSpc, 2, 4,true},
+{ ARM::VST2LNq32Pseudo,     ARM::VST2LNq32,     false, false, false, EvenDblSpc, 2, 2,true},
+{ ARM::VST2LNq32Pseudo_UPD, ARM::VST2LNq32_UPD, false, true, true,  EvenDblSpc, 2, 2,true},
+
+{ ARM::VST2q16Pseudo,       ARM::VST2q16,      false, false, false, SingleSpc,  4, 4 ,false},
+{ ARM::VST2q16PseudoWB_fixed,   ARM::VST2q16wb_fixed, false, true, false,  SingleSpc,  4, 4 ,false},
+{ ARM::VST2q16PseudoWB_register,   ARM::VST2q16wb_register, false, true, true,  SingleSpc,  4, 4 ,false},
+{ ARM::VST2q32Pseudo,       ARM::VST2q32,      false, false, false, SingleSpc,  4, 2 ,false},
+{ ARM::VST2q32PseudoWB_fixed,   ARM::VST2q32wb_fixed, false, true, false,  SingleSpc,  4, 2 ,false},
+{ ARM::VST2q32PseudoWB_register,   ARM::VST2q32wb_register, false, true, true,  SingleSpc,  4, 2 ,false},
+{ ARM::VST2q8Pseudo,        ARM::VST2q8,       false, false, false, SingleSpc,  4, 8 ,false},
+{ ARM::VST2q8PseudoWB_fixed,    ARM::VST2q8wb_fixed, false, true, false,  SingleSpc,  4, 8 ,false},
+{ ARM::VST2q8PseudoWB_register,    ARM::VST2q8wb_register, false, true, true,  SingleSpc,  4, 8 ,false},
+
+{ ARM::VST3LNd16Pseudo,     ARM::VST3LNd16,     false, false, false, SingleSpc, 3, 4 ,true},
+{ ARM::VST3LNd16Pseudo_UPD, ARM::VST3LNd16_UPD, false, true, true,  SingleSpc, 3, 4 ,true},
+{ ARM::VST3LNd32Pseudo,     ARM::VST3LNd32,     false, false, false, SingleSpc, 3, 2 ,true},
+{ ARM::VST3LNd32Pseudo_UPD, ARM::VST3LNd32_UPD, false, true, true,  SingleSpc, 3, 2 ,true},
+{ ARM::VST3LNd8Pseudo,      ARM::VST3LNd8,      false, false, false, SingleSpc, 3, 8 ,true},
+{ ARM::VST3LNd8Pseudo_UPD,  ARM::VST3LNd8_UPD, false, true, true,  SingleSpc, 3, 8 ,true},
+{ ARM::VST3LNq16Pseudo,     ARM::VST3LNq16,     false, false, false, EvenDblSpc, 3, 4,true},
+{ ARM::VST3LNq16Pseudo_UPD, ARM::VST3LNq16_UPD, false, true, true,  EvenDblSpc, 3, 4,true},
+{ ARM::VST3LNq32Pseudo,     ARM::VST3LNq32,     false, false, false, EvenDblSpc, 3, 2,true},
+{ ARM::VST3LNq32Pseudo_UPD, ARM::VST3LNq32_UPD, false, true, true,  EvenDblSpc, 3, 2,true},
+
+{ ARM::VST3d16Pseudo,       ARM::VST3d16,      false, false, false, SingleSpc,  3, 4 ,true},
+{ ARM::VST3d16Pseudo_UPD,   ARM::VST3d16_UPD, false, true, true,  SingleSpc,  3, 4 ,true},
+{ ARM::VST3d32Pseudo,       ARM::VST3d32,      false, false, false, SingleSpc,  3, 2 ,true},
+{ ARM::VST3d32Pseudo_UPD,   ARM::VST3d32_UPD, false, true, true,  SingleSpc,  3, 2 ,true},
+{ ARM::VST3d8Pseudo,        ARM::VST3d8,       false, false, false, SingleSpc,  3, 8 ,true},
+{ ARM::VST3d8Pseudo_UPD,    ARM::VST3d8_UPD, false, true, true,  SingleSpc,  3, 8 ,true},
+
+{ ARM::VST3q16Pseudo_UPD,    ARM::VST3q16_UPD, false, true, true,  EvenDblSpc, 3, 4 ,true},
+{ ARM::VST3q16oddPseudo,     ARM::VST3q16,     false, false, false, OddDblSpc,  3, 4 ,true},
+{ ARM::VST3q16oddPseudo_UPD, ARM::VST3q16_UPD, false, true, true,  OddDblSpc,  3, 4 ,true},
+{ ARM::VST3q32Pseudo_UPD,    ARM::VST3q32_UPD, false, true, true,  EvenDblSpc, 3, 2 ,true},
+{ ARM::VST3q32oddPseudo,     ARM::VST3q32,     false, false, false, OddDblSpc,  3, 2 ,true},
+{ ARM::VST3q32oddPseudo_UPD, ARM::VST3q32_UPD, false, true, true,  OddDblSpc,  3, 2 ,true},
+{ ARM::VST3q8Pseudo_UPD,     ARM::VST3q8_UPD, false, true, true,  EvenDblSpc, 3, 8 ,true},
+{ ARM::VST3q8oddPseudo,      ARM::VST3q8,      false, false, false, OddDblSpc,  3, 8 ,true},
+{ ARM::VST3q8oddPseudo_UPD,  ARM::VST3q8_UPD, false, true, true,  OddDblSpc,  3, 8 ,true},
+
+{ ARM::VST4LNd16Pseudo,     ARM::VST4LNd16,     false, false, false, SingleSpc, 4, 4 ,true},
+{ ARM::VST4LNd16Pseudo_UPD, ARM::VST4LNd16_UPD, false, true, true,  SingleSpc, 4, 4 ,true},
+{ ARM::VST4LNd32Pseudo,     ARM::VST4LNd32,     false, false, false, SingleSpc, 4, 2 ,true},
+{ ARM::VST4LNd32Pseudo_UPD, ARM::VST4LNd32_UPD, false, true, true,  SingleSpc, 4, 2 ,true},
+{ ARM::VST4LNd8Pseudo,      ARM::VST4LNd8,      false, false, false, SingleSpc, 4, 8 ,true},
+{ ARM::VST4LNd8Pseudo_UPD,  ARM::VST4LNd8_UPD, false, true, true,  SingleSpc, 4, 8 ,true},
+{ ARM::VST4LNq16Pseudo,     ARM::VST4LNq16,     false, false, false, EvenDblSpc, 4, 4,true},
+{ ARM::VST4LNq16Pseudo_UPD, ARM::VST4LNq16_UPD, false, true, true,  EvenDblSpc, 4, 4,true},
+{ ARM::VST4LNq32Pseudo,     ARM::VST4LNq32,     false, false, false, EvenDblSpc, 4, 2,true},
+{ ARM::VST4LNq32Pseudo_UPD, ARM::VST4LNq32_UPD, false, true, true,  EvenDblSpc, 4, 2,true},
+
+{ ARM::VST4d16Pseudo,       ARM::VST4d16,      false, false, false, SingleSpc,  4, 4 ,true},
+{ ARM::VST4d16Pseudo_UPD,   ARM::VST4d16_UPD, false, true, true,  SingleSpc,  4, 4 ,true},
+{ ARM::VST4d32Pseudo,       ARM::VST4d32,      false, false, false, SingleSpc,  4, 2 ,true},
+{ ARM::VST4d32Pseudo_UPD,   ARM::VST4d32_UPD, false, true, true,  SingleSpc,  4, 2 ,true},
+{ ARM::VST4d8Pseudo,        ARM::VST4d8,       false, false, false, SingleSpc,  4, 8 ,true},
+{ ARM::VST4d8Pseudo_UPD,    ARM::VST4d8_UPD, false, true, true,  SingleSpc,  4, 8 ,true},
+
+{ ARM::VST4q16Pseudo_UPD,    ARM::VST4q16_UPD, false, true, true,  EvenDblSpc, 4, 4 ,true},
+{ ARM::VST4q16oddPseudo,     ARM::VST4q16,     false, false, false, OddDblSpc,  4, 4 ,true},
+{ ARM::VST4q16oddPseudo_UPD, ARM::VST4q16_UPD, false, true, true,  OddDblSpc,  4, 4 ,true},
+{ ARM::VST4q32Pseudo_UPD,    ARM::VST4q32_UPD, false, true, true,  EvenDblSpc, 4, 2 ,true},
+{ ARM::VST4q32oddPseudo,     ARM::VST4q32,     false, false, false, OddDblSpc,  4, 2 ,true},
+{ ARM::VST4q32oddPseudo_UPD, ARM::VST4q32_UPD, false, true, true,  OddDblSpc,  4, 2 ,true},
+{ ARM::VST4q8Pseudo_UPD,     ARM::VST4q8_UPD, false, true, true,  EvenDblSpc, 4, 8 ,true},
+{ ARM::VST4q8oddPseudo,      ARM::VST4q8,      false, false, false, OddDblSpc,  4, 8 ,true},
+{ ARM::VST4q8oddPseudo_UPD,  ARM::VST4q8_UPD, false, true, true,  OddDblSpc,  4, 8 ,true}
+};
+
+/// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON
+/// load or store pseudo instruction.
+static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
+  const unsigned NumEntries = array_lengthof(NEONLdStTable);
+
+#ifndef NDEBUG
+  // Make sure the table is sorted.
+  static bool TableChecked = false;
+  if (!TableChecked) {
+    for (unsigned i = 0; i != NumEntries-1; ++i)
+      assert(NEONLdStTable[i] < NEONLdStTable[i+1] &&
+             "NEONLdStTable is not sorted!");
+    TableChecked = true;
+  }
+#endif
+
+  const NEONLdStTableEntry *I =
+    std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode);
+  if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode)
+    return I;
+  return NULL;
+}
+
+/// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register,
+/// corresponding to the specified register spacing.  Not all of the results
+/// are necessarily valid, e.g., a Q register only has 2 D subregisters.
+static void GetDSubRegs(unsigned Reg, NEONRegSpacing RegSpc,
+                        const TargetRegisterInfo *TRI, unsigned &D0,
+                        unsigned &D1, unsigned &D2, unsigned &D3) {
+  if (RegSpc == SingleSpc) {
+    D0 = TRI->getSubReg(Reg, ARM::dsub_0);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_1);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_2);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_3);
+  } else if (RegSpc == EvenDblSpc) {
+    D0 = TRI->getSubReg(Reg, ARM::dsub_0);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_2);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_4);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_6);
+  } else {
+    assert(RegSpc == OddDblSpc && "unknown register spacing");
+    D0 = TRI->getSubReg(Reg, ARM::dsub_1);
+    D1 = TRI->getSubReg(Reg, ARM::dsub_3);
+    D2 = TRI->getSubReg(Reg, ARM::dsub_5);
+    D3 = TRI->getSubReg(Reg, ARM::dsub_7);
+  }
+}
+
+/// ExpandVLD - Translate VLD pseudo instructions with Q, QQ or QQQQ register
+/// operands to real VLD instructions with D register operands.
+void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+
+  bool DstIsDead = MI.getOperand(OpIdx).isDead();
+  unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+  if (NumRegs > 1 && TableEntry->copyAllListRegs)
+    MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+  if (NumRegs > 2 && TableEntry->copyAllListRegs)
+    MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+  if (NumRegs > 3 && TableEntry->copyAllListRegs)
+    MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+
+  if (TableEntry->isUpdating)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->hasWritebackOperand)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // For an instruction writing double-spaced subregs, the pseudo instruction
+  // has an extra operand that is a use of the super-register.  Record the
+  // operand index and skip over it.
+  unsigned SrcOpIdx = 0;
+  if (RegSpc == EvenDblSpc || RegSpc == OddDblSpc)
+    SrcOpIdx = OpIdx++;
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the super-register source operand used for double-spaced subregs over
+  // to the new instruction as an implicit operand.
+  if (SrcOpIdx != 0) {
+    MachineOperand MO = MI.getOperand(SrcOpIdx);
+    MO.setImplicit(true);
+    MIB.addOperand(MO);
+  }
+  // Add an implicit def for the super-register.
+  MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+  TransferImpOps(MI, MIB, MIB);
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+}
+
+/// ExpandVST - Translate VST pseudo instructions with Q, QQ or QQQQ register
+/// operands to real VST instructions with D register operands.
+void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+  if (TableEntry->isUpdating)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->hasWritebackOperand)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+  bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
+  unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(SrcReg, RegSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0, getUndefRegState(SrcIsUndef));
+  if (NumRegs > 1 && TableEntry->copyAllListRegs)
+    MIB.addReg(D1, getUndefRegState(SrcIsUndef));
+  if (NumRegs > 2 && TableEntry->copyAllListRegs)
+    MIB.addReg(D2, getUndefRegState(SrcIsUndef));
+  if (NumRegs > 3 && TableEntry->copyAllListRegs)
+    MIB.addReg(D3, getUndefRegState(SrcIsUndef));
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg.
+    MIB->addRegisterKilled(SrcReg, TRI, true);
+  TransferImpOps(MI, MIB, MIB);
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+  MI.eraseFromParent();
+}
+
+/// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ
+/// register operands to real instructions with D register operands.
+void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
+  assert(TableEntry && "NEONLdStTable lookup failed");
+  NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
+  unsigned NumRegs = TableEntry->NumRegs;
+  unsigned RegElts = TableEntry->RegElts;
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                    TII->get(TableEntry->RealOpc));
+  unsigned OpIdx = 0;
+  // The lane operand is always the 3rd from last operand, before the 2
+  // predicate operands.
+  unsigned Lane = MI.getOperand(MI.getDesc().getNumOperands() - 3).getImm();
+
+  // Adjust the lane and spacing as needed for Q registers.
+  assert(RegSpc != OddDblSpc && "unexpected register spacing for VLD/VST-lane");
+  if (RegSpc == EvenDblSpc && Lane >= RegElts) {
+    RegSpc = OddDblSpc;
+    Lane -= RegElts;
+  }
+  assert(Lane < RegElts && "out of range lane for VLD/VST-lane");
+
+  unsigned D0 = 0, D1 = 0, D2 = 0, D3 = 0;
+  unsigned DstReg = 0;
+  bool DstIsDead = false;
+  if (TableEntry->IsLoad) {
+    DstIsDead = MI.getOperand(OpIdx).isDead();
+    DstReg = MI.getOperand(OpIdx++).getReg();
+    GetDSubRegs(DstReg, RegSpc, TRI, D0, D1, D2, D3);
+    MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 1)
+      MIB.addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 2)
+      MIB.addReg(D2, RegState::Define | getDeadRegState(DstIsDead));
+    if (NumRegs > 3)
+      MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
+  }
+
+  if (TableEntry->isUpdating)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the addrmode6 operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  // Copy the am6offset operand.
+  if (TableEntry->hasWritebackOperand)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Grab the super-register source.
+  MachineOperand MO = MI.getOperand(OpIdx++);
+  if (!TableEntry->IsLoad)
+    GetDSubRegs(MO.getReg(), RegSpc, TRI, D0, D1, D2, D3);
+
+  // Add the subregs as sources of the new instruction.
+  unsigned SrcFlags = (getUndefRegState(MO.isUndef()) |
+                       getKillRegState(MO.isKill()));
+  MIB.addReg(D0, SrcFlags);
+  if (NumRegs > 1)
+    MIB.addReg(D1, SrcFlags);
+  if (NumRegs > 2)
+    MIB.addReg(D2, SrcFlags);
+  if (NumRegs > 3)
+    MIB.addReg(D3, SrcFlags);
+
+  // Add the lane number operand.
+  MIB.addImm(Lane);
+  OpIdx += 1;
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the super-register source to be an implicit source.
+  MO.setImplicit(true);
+  MIB.addOperand(MO);
+  if (TableEntry->IsLoad)
+    // Add an implicit def for the super-register.
+    MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+  TransferImpOps(MI, MIB, MIB);
+  // Transfer memoperands.
+  MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  MI.eraseFromParent();
+}
+
+/// ExpandVTBL - Translate VTBL and VTBX pseudo instructions with Q or QQ
+/// register operands to real instructions with D register operands.
+void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
+                                 unsigned Opc, bool IsExt) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc));
+  unsigned OpIdx = 0;
+
+  // Transfer the destination register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  if (IsExt)
+    MIB.addOperand(MI.getOperand(OpIdx++));
+
+  bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+  unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+  unsigned D0, D1, D2, D3;
+  GetDSubRegs(SrcReg, SingleSpc, TRI, D0, D1, D2, D3);
+  MIB.addReg(D0);
+
+  // Copy the other source register operand.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  // Copy the predicate operands.
+  MIB.addOperand(MI.getOperand(OpIdx++));
+  MIB.addOperand(MI.getOperand(OpIdx++));
+
+  if (SrcIsKill)  // Add an implicit kill for the super-reg.
+    MIB->addRegisterKilled(SrcReg, TRI, true);
+  TransferImpOps(MI, MIB, MIB);
+  MI.eraseFromParent();
+}
+
+void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg);
+  unsigned DstReg = MI.getOperand(0).getReg();
+  bool DstIsDead = MI.getOperand(0).isDead();
+  bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
+  const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
+  MachineInstrBuilder LO16, HI16;
+
+  if (!STI->hasV6T2Ops() &&
+      (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) {
+    // Expand into a movi + orr.
+    LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg);
+    HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri))
+      .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+      .addReg(DstReg);
+
+    assert (MO.isImm() && "MOVi32imm w/ non-immediate source operand!");
+    unsigned ImmVal = (unsigned)MO.getImm();
+    unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
+    unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+    LO16 = LO16.addImm(SOImmValV1);
+    HI16 = HI16.addImm(SOImmValV2);
+    LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+    LO16.addImm(Pred).addReg(PredReg).addReg(0);
+    HI16.addImm(Pred).addReg(PredReg).addReg(0);
+    TransferImpOps(MI, LO16, HI16);
+    MI.eraseFromParent();
+    return;
+  }
+
+  unsigned LO16Opc = 0;
+  unsigned HI16Opc = 0;
+  if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) {
+    LO16Opc = ARM::t2MOVi16;
+    HI16Opc = ARM::t2MOVTi16;
+  } else {
+    LO16Opc = ARM::MOVi16;
+    HI16Opc = ARM::MOVTi16;
+  }
+
+  LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg);
+  HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc))
+    .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+    .addReg(DstReg);
+
+  if (MO.isImm()) {
+    unsigned Imm = MO.getImm();
+    unsigned Lo16 = Imm & 0xffff;
+    unsigned Hi16 = (Imm >> 16) & 0xffff;
+    LO16 = LO16.addImm(Lo16);
+    HI16 = HI16.addImm(Hi16);
+  } else {
+    const GlobalValue *GV = MO.getGlobal();
+    unsigned TF = MO.getTargetFlags();
+    LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
+    HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
+  }
+
+  LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+  LO16.addImm(Pred).addReg(PredReg);
+  HI16.addImm(Pred).addReg(PredReg);
+
+  TransferImpOps(MI, LO16, HI16);
+  MI.eraseFromParent();
+}
+
+bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  switch (Opcode) {
+    default:
+      return false;
+    case ARM::VMOVScc:
+    case ARM::VMOVDcc: {
+      unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
+              MI.getOperand(1).getReg())
+        .addReg(MI.getOperand(2).getReg(),
+                getKillRegState(MI.getOperand(2).isKill()))
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addReg(MI.getOperand(4).getReg());
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCCr:
+    case ARM::MOVCCr: {
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+              MI.getOperand(1).getReg())
+        .addReg(MI.getOperand(2).getReg(),
+                getKillRegState(MI.getOperand(2).isKill()))
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addReg(MI.getOperand(4).getReg())
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::MOVCCsi: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+              (MI.getOperand(1).getReg()))
+        .addReg(MI.getOperand(2).getReg(),
+                getKillRegState(MI.getOperand(2).isKill()))
+        .addImm(MI.getOperand(3).getImm())
+        .addImm(MI.getOperand(4).getImm()) // 'pred'
+        .addReg(MI.getOperand(5).getReg())
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::MOVCCsr: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
+              (MI.getOperand(1).getReg()))
+        .addReg(MI.getOperand(2).getReg(),
+                getKillRegState(MI.getOperand(2).isKill()))
+        .addReg(MI.getOperand(3).getReg(),
+                getKillRegState(MI.getOperand(3).isKill()))
+        .addImm(MI.getOperand(4).getImm())
+        .addImm(MI.getOperand(5).getImm()) // 'pred'
+        .addReg(MI.getOperand(6).getReg())
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::MOVCCi16: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi16),
+              MI.getOperand(1).getReg())
+        .addImm(MI.getOperand(2).getImm())
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addReg(MI.getOperand(4).getReg());
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::t2MOVCCi:
+    case ARM::MOVCCi: {
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
+              MI.getOperand(1).getReg())
+        .addImm(MI.getOperand(2).getImm())
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addReg(MI.getOperand(4).getReg())
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::MVNCCi: {
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi),
+              MI.getOperand(1).getReg())
+        .addImm(MI.getOperand(2).getImm())
+        .addImm(MI.getOperand(3).getImm()) // 'pred'
+        .addReg(MI.getOperand(4).getReg())
+        .addReg(0); // 's' bit
+
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::Int_eh_sjlj_dispatchsetup: {
+      MachineFunction &MF = *MI.getParent()->getParent();
+      const ARMBaseInstrInfo *AII =
+        static_cast<const ARMBaseInstrInfo*>(TII);
+      const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+      // For functions using a base pointer, we rematerialize it (via the frame
+      // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it
+      // for us. Otherwise, expand to nothing.
+      if (RI.hasBasePointer(MF)) {
+        int32_t NumBytes = AFI->getFramePtrSpillOffset();
+        unsigned FramePtr = RI.getFrameRegister(MF);
+        assert(MF.getTarget().getFrameLowering()->hasFP(MF) &&
+               "base pointer without frame pointer?");
+
+        if (AFI->isThumb2Function()) {
+          emitT2RegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                 FramePtr, -NumBytes, ARMCC::AL, 0, *TII);
+        } else if (AFI->isThumbFunction()) {
+          emitThumbRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                    FramePtr, -NumBytes, *TII, RI);
+        } else {
+          emitARMRegPlusImmediate(MBB, MBBI, MI.getDebugLoc(), ARM::R6,
+                                  FramePtr, -NumBytes, ARMCC::AL, 0,
+                                  *TII);
+        }
+        // If there's dynamic realignment, adjust for it.
+        if (RI.needsStackRealignment(MF)) {
+          MachineFrameInfo  *MFI = MF.getFrameInfo();
+          unsigned MaxAlign = MFI->getMaxAlignment();
+          assert (!AFI->isThumb1OnlyFunction());
+          // Emit bic r6, r6, MaxAlign
+          unsigned bicOpc = AFI->isThumbFunction() ?
+            ARM::t2BICri : ARM::BICri;
+          AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                              TII->get(bicOpc), ARM::R6)
+                                      .addReg(ARM::R6, RegState::Kill)
+                                      .addImm(MaxAlign-1)));
+        }
+
+      }
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::MOVsrl_flag:
+    case ARM::MOVsra_flag: {
+      // These are just fancy MOVs insructions.
+      AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+                             MI.getOperand(0).getReg())
+                     .addOperand(MI.getOperand(1))
+                     .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ?
+                                                  ARM_AM::lsr : ARM_AM::asr),
+                                                 1)))
+        .addReg(ARM::CPSR, RegState::Define);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::RRX: {
+      // This encodes as "MOVs Rd, Rm, rrx
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),TII->get(ARM::MOVsi),
+                               MI.getOperand(0).getReg())
+                       .addOperand(MI.getOperand(1))
+                       .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
+        .addReg(0);
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::tTPsoft:
+    case ARM::TPsoft: {
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == ARM::tTPsoft ? ARM::tBL : ARM::BL))
+        .addExternalSymbol("__aeabi_read_tp", 0);
+
+      MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::tLDRpci_pic:
+    case ARM::t2LDRpci_pic: {
+      unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
+        ? ARM::tLDRpci : ARM::t2LDRpci;
+      unsigned DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      MachineInstrBuilder MIB1 =
+        AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                               TII->get(NewLdOpc), DstReg)
+                       .addOperand(MI.getOperand(1)));
+      MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(ARM::tPICADD))
+        .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(DstReg)
+        .addOperand(MI.getOperand(2));
+      TransferImpOps(MI, MIB1, MIB2);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::MOV_ga_dyn:
+    case ARM::MOV_ga_pcrel:
+    case ARM::MOV_ga_pcrel_ldr:
+    case ARM::t2MOV_ga_dyn:
+    case ARM::t2MOV_ga_pcrel: {
+      // Expand into movw + movw. Also "add pc" / ldr [pc] in PIC mode.
+      unsigned LabelId = AFI->createPICLabelUId();
+      unsigned DstReg = MI.getOperand(0).getReg();
+      bool DstIsDead = MI.getOperand(0).isDead();
+      const MachineOperand &MO1 = MI.getOperand(1);
+      const GlobalValue *GV = MO1.getGlobal();
+      unsigned TF = MO1.getTargetFlags();
+      bool isARM = (Opcode != ARM::t2MOV_ga_pcrel && Opcode!=ARM::t2MOV_ga_dyn);
+      bool isPIC = (Opcode != ARM::MOV_ga_dyn && Opcode != ARM::t2MOV_ga_dyn);
+      unsigned LO16Opc = isARM ? ARM::MOVi16_ga_pcrel : ARM::t2MOVi16_ga_pcrel;
+      unsigned HI16Opc = isARM ? ARM::MOVTi16_ga_pcrel :ARM::t2MOVTi16_ga_pcrel;
+      unsigned LO16TF = isPIC
+        ? ARMII::MO_LO16_NONLAZY_PIC : ARMII::MO_LO16_NONLAZY;
+      unsigned HI16TF = isPIC
+        ? ARMII::MO_HI16_NONLAZY_PIC : ARMII::MO_HI16_NONLAZY;
+      unsigned PICAddOpc = isARM
+        ? (Opcode == ARM::MOV_ga_pcrel_ldr ? ARM::PICLDR : ARM::PICADD)
+        : ARM::tPICADD;
+      MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(LO16Opc), DstReg)
+        .addGlobalAddress(GV, MO1.getOffset(), TF | LO16TF)
+        .addImm(LabelId);
+      MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(HI16Opc), DstReg)
+        .addReg(DstReg)
+        .addGlobalAddress(GV, MO1.getOffset(), TF | HI16TF)
+        .addImm(LabelId);
+      if (!isPIC) {
+        TransferImpOps(MI, MIB1, MIB2);
+        MI.eraseFromParent();
+        return true;
+      }
+
+      MachineInstrBuilder MIB3 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                         TII->get(PICAddOpc))
+        .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(DstReg).addImm(LabelId);
+      if (isARM) {
+        AddDefaultPred(MIB3);
+        if (Opcode == ARM::MOV_ga_pcrel_ldr)
+          MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      }
+      TransferImpOps(MI, MIB1, MIB3);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::MOVi32imm:
+    case ARM::MOVCCi32imm:
+    case ARM::t2MOVi32imm:
+    case ARM::t2MOVCCi32imm:
+      ExpandMOV32BitImm(MBB, MBBI);
+      return true;
+
+    case ARM::VLDMQIA: {
+      unsigned NewOpc = ARM::VLDMDIA;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+
+      // Grab the Q register destination.
+      bool DstIsDead = MI.getOperand(OpIdx).isDead();
+      unsigned DstReg = MI.getOperand(OpIdx++).getReg();
+
+      // Copy the source register.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Copy the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Add the destination operands (D subregs).
+      unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
+      unsigned D1 = TRI->getSubReg(DstReg, ARM::dsub_1);
+      MIB.addReg(D0, RegState::Define | getDeadRegState(DstIsDead))
+        .addReg(D1, RegState::Define | getDeadRegState(DstIsDead));
+
+      // Add an implicit def for the super-register.
+      MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
+      TransferImpOps(MI, MIB, MIB);
+      MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::VSTMQIA: {
+      unsigned NewOpc = ARM::VSTMDIA;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+
+      // Grab the Q register source.
+      bool SrcIsKill = MI.getOperand(OpIdx).isKill();
+      unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
+
+      // Copy the destination register.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Copy the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      // Add the source operands (D subregs).
+      unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
+      unsigned D1 = TRI->getSubReg(SrcReg, ARM::dsub_1);
+      MIB.addReg(D0).addReg(D1);
+
+      if (SrcIsKill)      // Add an implicit kill for the Q register.
+        MIB->addRegisterKilled(SrcReg, TRI, true);
+
+      TransferImpOps(MI, MIB, MIB);
+      MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+      MI.eraseFromParent();
+      return true;
+    }
+    case ARM::VDUPfqf:
+    case ARM::VDUPfdf:{
+      unsigned NewOpc = Opcode == ARM::VDUPfqf ? ARM::VDUPLN32q :
+        ARM::VDUPLN32d;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc));
+      unsigned OpIdx = 0;
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned Lane = TRI->getEncodingValue(SrcReg) & 1;
+      unsigned DReg = TRI->getMatchingSuperReg(SrcReg,
+                            Lane & 1 ? ARM::ssub_1 : ARM::ssub_0,
+                            &ARM::DPR_VFP2RegClass);
+      // The lane is [0,1] for the containing DReg superregister.
+      // Copy the dst/src register operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addReg(DReg);
+      ++OpIdx;
+      // Add the lane select operand.
+      MIB.addImm(Lane);
+      // Add the predicate operands.
+      MIB.addOperand(MI.getOperand(OpIdx++));
+      MIB.addOperand(MI.getOperand(OpIdx++));
+
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    case ARM::VLD2q8Pseudo:
+    case ARM::VLD2q16Pseudo:
+    case ARM::VLD2q32Pseudo:
+    case ARM::VLD2q8PseudoWB_fixed:
+    case ARM::VLD2q16PseudoWB_fixed:
+    case ARM::VLD2q32PseudoWB_fixed:
+    case ARM::VLD2q8PseudoWB_register:
+    case ARM::VLD2q16PseudoWB_register:
+    case ARM::VLD2q32PseudoWB_register:
+    case ARM::VLD3d8Pseudo:
+    case ARM::VLD3d16Pseudo:
+    case ARM::VLD3d32Pseudo:
+    case ARM::VLD1d64TPseudo:
+    case ARM::VLD3d8Pseudo_UPD:
+    case ARM::VLD3d16Pseudo_UPD:
+    case ARM::VLD3d32Pseudo_UPD:
+    case ARM::VLD3q8Pseudo_UPD:
+    case ARM::VLD3q16Pseudo_UPD:
+    case ARM::VLD3q32Pseudo_UPD:
+    case ARM::VLD3q8oddPseudo:
+    case ARM::VLD3q16oddPseudo:
+    case ARM::VLD3q32oddPseudo:
+    case ARM::VLD3q8oddPseudo_UPD:
+    case ARM::VLD3q16oddPseudo_UPD:
+    case ARM::VLD3q32oddPseudo_UPD:
+    case ARM::VLD4d8Pseudo:
+    case ARM::VLD4d16Pseudo:
+    case ARM::VLD4d32Pseudo:
+    case ARM::VLD1d64QPseudo:
+    case ARM::VLD4d8Pseudo_UPD:
+    case ARM::VLD4d16Pseudo_UPD:
+    case ARM::VLD4d32Pseudo_UPD:
+    case ARM::VLD4q8Pseudo_UPD:
+    case ARM::VLD4q16Pseudo_UPD:
+    case ARM::VLD4q32Pseudo_UPD:
+    case ARM::VLD4q8oddPseudo:
+    case ARM::VLD4q16oddPseudo:
+    case ARM::VLD4q32oddPseudo:
+    case ARM::VLD4q8oddPseudo_UPD:
+    case ARM::VLD4q16oddPseudo_UPD:
+    case ARM::VLD4q32oddPseudo_UPD:
+    case ARM::VLD3DUPd8Pseudo:
+    case ARM::VLD3DUPd16Pseudo:
+    case ARM::VLD3DUPd32Pseudo:
+    case ARM::VLD3DUPd8Pseudo_UPD:
+    case ARM::VLD3DUPd16Pseudo_UPD:
+    case ARM::VLD3DUPd32Pseudo_UPD:
+    case ARM::VLD4DUPd8Pseudo:
+    case ARM::VLD4DUPd16Pseudo:
+    case ARM::VLD4DUPd32Pseudo:
+    case ARM::VLD4DUPd8Pseudo_UPD:
+    case ARM::VLD4DUPd16Pseudo_UPD:
+    case ARM::VLD4DUPd32Pseudo_UPD:
+      ExpandVLD(MBBI);
+      return true;
+
+    case ARM::VST2q8Pseudo:
+    case ARM::VST2q16Pseudo:
+    case ARM::VST2q32Pseudo:
+    case ARM::VST2q8PseudoWB_fixed:
+    case ARM::VST2q16PseudoWB_fixed:
+    case ARM::VST2q32PseudoWB_fixed:
+    case ARM::VST2q8PseudoWB_register:
+    case ARM::VST2q16PseudoWB_register:
+    case ARM::VST2q32PseudoWB_register:
+    case ARM::VST3d8Pseudo:
+    case ARM::VST3d16Pseudo:
+    case ARM::VST3d32Pseudo:
+    case ARM::VST1d64TPseudo:
+    case ARM::VST3d8Pseudo_UPD:
+    case ARM::VST3d16Pseudo_UPD:
+    case ARM::VST3d32Pseudo_UPD:
+    case ARM::VST1d64TPseudoWB_fixed:
+    case ARM::VST1d64TPseudoWB_register:
+    case ARM::VST3q8Pseudo_UPD:
+    case ARM::VST3q16Pseudo_UPD:
+    case ARM::VST3q32Pseudo_UPD:
+    case ARM::VST3q8oddPseudo:
+    case ARM::VST3q16oddPseudo:
+    case ARM::VST3q32oddPseudo:
+    case ARM::VST3q8oddPseudo_UPD:
+    case ARM::VST3q16oddPseudo_UPD:
+    case ARM::VST3q32oddPseudo_UPD:
+    case ARM::VST4d8Pseudo:
+    case ARM::VST4d16Pseudo:
+    case ARM::VST4d32Pseudo:
+    case ARM::VST1d64QPseudo:
+    case ARM::VST4d8Pseudo_UPD:
+    case ARM::VST4d16Pseudo_UPD:
+    case ARM::VST4d32Pseudo_UPD:
+    case ARM::VST1d64QPseudoWB_fixed:
+    case ARM::VST1d64QPseudoWB_register:
+    case ARM::VST4q8Pseudo_UPD:
+    case ARM::VST4q16Pseudo_UPD:
+    case ARM::VST4q32Pseudo_UPD:
+    case ARM::VST4q8oddPseudo:
+    case ARM::VST4q16oddPseudo:
+    case ARM::VST4q32oddPseudo:
+    case ARM::VST4q8oddPseudo_UPD:
+    case ARM::VST4q16oddPseudo_UPD:
+    case ARM::VST4q32oddPseudo_UPD:
+      ExpandVST(MBBI);
+      return true;
+
+    case ARM::VLD1LNq8Pseudo:
+    case ARM::VLD1LNq16Pseudo:
+    case ARM::VLD1LNq32Pseudo:
+    case ARM::VLD1LNq8Pseudo_UPD:
+    case ARM::VLD1LNq16Pseudo_UPD:
+    case ARM::VLD1LNq32Pseudo_UPD:
+    case ARM::VLD2LNd8Pseudo:
+    case ARM::VLD2LNd16Pseudo:
+    case ARM::VLD2LNd32Pseudo:
+    case ARM::VLD2LNq16Pseudo:
+    case ARM::VLD2LNq32Pseudo:
+    case ARM::VLD2LNd8Pseudo_UPD:
+    case ARM::VLD2LNd16Pseudo_UPD:
+    case ARM::VLD2LNd32Pseudo_UPD:
+    case ARM::VLD2LNq16Pseudo_UPD:
+    case ARM::VLD2LNq32Pseudo_UPD:
+    case ARM::VLD3LNd8Pseudo:
+    case ARM::VLD3LNd16Pseudo:
+    case ARM::VLD3LNd32Pseudo:
+    case ARM::VLD3LNq16Pseudo:
+    case ARM::VLD3LNq32Pseudo:
+    case ARM::VLD3LNd8Pseudo_UPD:
+    case ARM::VLD3LNd16Pseudo_UPD:
+    case ARM::VLD3LNd32Pseudo_UPD:
+    case ARM::VLD3LNq16Pseudo_UPD:
+    case ARM::VLD3LNq32Pseudo_UPD:
+    case ARM::VLD4LNd8Pseudo:
+    case ARM::VLD4LNd16Pseudo:
+    case ARM::VLD4LNd32Pseudo:
+    case ARM::VLD4LNq16Pseudo:
+    case ARM::VLD4LNq32Pseudo:
+    case ARM::VLD4LNd8Pseudo_UPD:
+    case ARM::VLD4LNd16Pseudo_UPD:
+    case ARM::VLD4LNd32Pseudo_UPD:
+    case ARM::VLD4LNq16Pseudo_UPD:
+    case ARM::VLD4LNq32Pseudo_UPD:
+    case ARM::VST1LNq8Pseudo:
+    case ARM::VST1LNq16Pseudo:
+    case ARM::VST1LNq32Pseudo:
+    case ARM::VST1LNq8Pseudo_UPD:
+    case ARM::VST1LNq16Pseudo_UPD:
+    case ARM::VST1LNq32Pseudo_UPD:
+    case ARM::VST2LNd8Pseudo:
+    case ARM::VST2LNd16Pseudo:
+    case ARM::VST2LNd32Pseudo:
+    case ARM::VST2LNq16Pseudo:
+    case ARM::VST2LNq32Pseudo:
+    case ARM::VST2LNd8Pseudo_UPD:
+    case ARM::VST2LNd16Pseudo_UPD:
+    case ARM::VST2LNd32Pseudo_UPD:
+    case ARM::VST2LNq16Pseudo_UPD:
+    case ARM::VST2LNq32Pseudo_UPD:
+    case ARM::VST3LNd8Pseudo:
+    case ARM::VST3LNd16Pseudo:
+    case ARM::VST3LNd32Pseudo:
+    case ARM::VST3LNq16Pseudo:
+    case ARM::VST3LNq32Pseudo:
+    case ARM::VST3LNd8Pseudo_UPD:
+    case ARM::VST3LNd16Pseudo_UPD:
+    case ARM::VST3LNd32Pseudo_UPD:
+    case ARM::VST3LNq16Pseudo_UPD:
+    case ARM::VST3LNq32Pseudo_UPD:
+    case ARM::VST4LNd8Pseudo:
+    case ARM::VST4LNd16Pseudo:
+    case ARM::VST4LNd32Pseudo:
+    case ARM::VST4LNq16Pseudo:
+    case ARM::VST4LNq32Pseudo:
+    case ARM::VST4LNd8Pseudo_UPD:
+    case ARM::VST4LNd16Pseudo_UPD:
+    case ARM::VST4LNd32Pseudo_UPD:
+    case ARM::VST4LNq16Pseudo_UPD:
+    case ARM::VST4LNq32Pseudo_UPD:
+      ExpandLaneOp(MBBI);
+      return true;
+
+    case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true;
+    case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
+    case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
+    case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
+  }
+}
+
+bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = llvm::next(MBBI);
+    Modified |= ExpandMI(MBB, MBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
+  TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+  TRI = TM.getRegisterInfo();
+  STI = &TM.getSubtarget<ARMSubtarget>();
+  AFI = MF.getInfo<ARMFunctionInfo>();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+       ++MFI)
+    Modified |= ExpandMBB(*MFI);
+  if (VerifyARMPseudo)
+    MF.verify(this, "After expanding ARM pseudo instructions.");
+  return Modified;
+}
+
+/// createARMExpandPseudoPass - returns an instance of the pseudo instruction
+/// expansion pass.
+FunctionPass *llvm::createARMExpandPseudoPass() {
+  return new ARMExpandPseudo();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
new file mode 100644
index 000000000000..29fcd4009af3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -0,0 +1,2973 @@
+//===-- ARMFastISel.cpp - ARM FastISel implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// ARMGenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMCallingConv.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern cl::opt<bool> EnableARMLongCalls;
+
+namespace {
+
+  // All possible address modes, plus some.
+  typedef struct Address {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    union {
+      unsigned Reg;
+      int FI;
+    } Base;
+
+    int Offset;
+
+    // Innocuous defaults for our address.
+    Address()
+     : BaseType(RegBase), Offset(0) {
+       Base.Reg = 0;
+     }
+  } Address;
+
+class ARMFastISel : public FastISel {
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const ARMSubtarget *Subtarget;
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  ARMFunctionInfo *AFI;
+
+  // Convenience variables to avoid some queries.
+  bool isThumb2;
+  LLVMContext *Context;
+
+  public:
+    explicit ARMFastISel(FunctionLoweringInfo &funcInfo,
+                         const TargetLibraryInfo *libInfo)
+    : FastISel(funcInfo, libInfo),
+      TM(funcInfo.MF->getTarget()),
+      TII(*TM.getInstrInfo()),
+      TLI(*TM.getTargetLowering()) {
+      Subtarget = &TM.getSubtarget<ARMSubtarget>();
+      AFI = funcInfo.MF->getInfo<ARMFunctionInfo>();
+      isThumb2 = AFI->isThumbFunction();
+      Context = &funcInfo.Fn->getContext();
+    }
+
+    // Code from FastISel.cpp.
+  private:
+    unsigned FastEmitInst_(unsigned MachineInstOpcode,
+                           const TargetRegisterClass *RC);
+    unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC,
+                            unsigned Op0, bool Op0IsKill);
+    unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             unsigned Op1, bool Op1IsKill);
+    unsigned FastEmitInst_rrr(unsigned MachineInstOpcode,
+                              const TargetRegisterClass *RC,
+                              unsigned Op0, bool Op0IsKill,
+                              unsigned Op1, bool Op1IsKill,
+                              unsigned Op2, bool Op2IsKill);
+    unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             uint64_t Imm);
+    unsigned FastEmitInst_rf(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             const ConstantFP *FPImm);
+    unsigned FastEmitInst_rri(unsigned MachineInstOpcode,
+                              const TargetRegisterClass *RC,
+                              unsigned Op0, bool Op0IsKill,
+                              unsigned Op1, bool Op1IsKill,
+                              uint64_t Imm);
+    unsigned FastEmitInst_i(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC,
+                            uint64_t Imm);
+    unsigned FastEmitInst_ii(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             uint64_t Imm1, uint64_t Imm2);
+
+    unsigned FastEmitInst_extractsubreg(MVT RetVT,
+                                        unsigned Op0, bool Op0IsKill,
+                                        uint32_t Idx);
+
+    // Backend specific FastISel code.
+  private:
+    virtual bool TargetSelectInstruction(const Instruction *I);
+    virtual unsigned TargetMaterializeConstant(const Constant *C);
+    virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI);
+    virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
+                               const LoadInst *LI);
+    virtual bool FastLowerArguments();
+  private:
+  #include "ARMGenFastISel.inc"
+
+    // Instruction selection routines.
+  private:
+    bool SelectLoad(const Instruction *I);
+    bool SelectStore(const Instruction *I);
+    bool SelectBranch(const Instruction *I);
+    bool SelectIndirectBr(const Instruction *I);
+    bool SelectCmp(const Instruction *I);
+    bool SelectFPExt(const Instruction *I);
+    bool SelectFPTrunc(const Instruction *I);
+    bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectIToFP(const Instruction *I, bool isSigned);
+    bool SelectFPToI(const Instruction *I, bool isSigned);
+    bool SelectDiv(const Instruction *I, bool isSigned);
+    bool SelectRem(const Instruction *I, bool isSigned);
+    bool SelectCall(const Instruction *I, const char *IntrMemName);
+    bool SelectIntrinsicCall(const IntrinsicInst &I);
+    bool SelectSelect(const Instruction *I);
+    bool SelectRet(const Instruction *I);
+    bool SelectTrunc(const Instruction *I);
+    bool SelectIntExt(const Instruction *I);
+    bool SelectShift(const Instruction *I, ARM_AM::ShiftOpc ShiftTy);
+
+    // Utility routines.
+  private:
+    bool isTypeLegal(Type *Ty, MVT &VT);
+    bool isLoadTypeLegal(Type *Ty, MVT &VT);
+    bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                    bool isZExt);
+    bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                     unsigned Alignment = 0, bool isZExt = true,
+                     bool allocReg = true);
+    bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                      unsigned Alignment = 0);
+    bool ARMComputeAddress(const Value *Obj, Address &Addr);
+    void ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3);
+    bool ARMIsMemCpySmall(uint64_t Len);
+    bool ARMTryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+                               unsigned Alignment);
+    unsigned ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+    unsigned ARMMaterializeFP(const ConstantFP *CFP, MVT VT);
+    unsigned ARMMaterializeInt(const Constant *C, MVT VT);
+    unsigned ARMMaterializeGV(const GlobalValue *GV, MVT VT);
+    unsigned ARMMoveToFPReg(MVT VT, unsigned SrcReg);
+    unsigned ARMMoveToIntReg(MVT VT, unsigned SrcReg);
+    unsigned ARMSelectCallOp(bool UseReg);
+    unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT);
+
+    // Call handling routines.
+  private:
+    CCAssignFn *CCAssignFnForCall(CallingConv::ID CC,
+                                  bool Return,
+                                  bool isVarArg);
+    bool ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool isVarArg);
+    unsigned getLibcallReg(const Twine &Name);
+    bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                    const Instruction *I, CallingConv::ID CC,
+                    unsigned &NumBytes, bool isVarArg);
+    bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
+
+    // OptionalDef handling routines.
+  private:
+    bool isARMNEONPred(const MachineInstr *MI);
+    bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR);
+    const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
+    void AddLoadStoreOperands(MVT VT, Address &Addr,
+                              const MachineInstrBuilder &MIB,
+                              unsigned Flags, bool useAM3);
+};
+
+} // end anonymous namespace
+
+#include "ARMGenCallingConv.inc"
+
+// DefinesOptionalPredicate - This is different from DefinesPredicate in that
+// we don't care about implicit defs here, just places we'll need to add a
+// default CCReg argument. Sets CPSR if we're setting CPSR instead of CCR.
+bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
+  if (!MI->hasOptionalDef())
+    return false;
+
+  // Look to see if our OptionalDef is defining CPSR or CCR.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef()) continue;
+    if (MO.getReg() == ARM::CPSR)
+      *CPSR = true;
+  }
+  return true;
+}
+
+bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  // If we're a thumb2 or not NEON function we were handled via isPredicable.
+  if ((MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON ||
+       AFI->isThumb2Function())
+    return false;
+
+  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
+    if (MCID.OpInfo[i].isPredicate())
+      return true;
+
+  return false;
+}
+
+// If the machine is predicable go ahead and add the predicate operands, if
+// it needs default CC operands add those.
+// TODO: If we want to support thumb1 then we'll need to deal with optional
+// CPSR defs that need to be added before the remaining operands. See s_cc_out
+// for descriptions why.
+const MachineInstrBuilder &
+ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
+  MachineInstr *MI = &*MIB;
+
+  // Do we use a predicate? or...
+  // Are we NEON in ARM mode and have a predicate operand? If so, I know
+  // we're not predicable but add it anyways.
+  if (TII.isPredicable(MI) || isARMNEONPred(MI))
+    AddDefaultPred(MIB);
+
+  // Do we optionally set a predicate?  Preds is size > 0 iff the predicate
+  // defines CPSR. All other OptionalDefines in ARM are the CCR register.
+  bool CPSR = false;
+  if (DefinesOptionalPredicate(MI, &CPSR)) {
+    if (CPSR)
+      AddDefaultT1CC(MIB);
+    else
+      AddDefaultCC(MIB);
+  }
+  return MIB;
+}
+
+unsigned ARMFastISel::FastEmitInst_(unsigned MachineInstOpcode,
+                                    const TargetRegisterClass* RC) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg));
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Op0, bool Op0IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+                   .addReg(Op0, Op0IsKill * RegState::Kill));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                   TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Op0, bool Op0IsKill,
+                                       unsigned Op1, bool Op1IsKill,
+                                       unsigned Op2, bool Op2IsKill) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill)
+                   .addReg(Op2, Op2IsKill * RegState::Kill));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill)
+                   .addReg(Op2, Op2IsKill * RegState::Kill));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_rf(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      const ConstantFP *FPImm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addFPImm(FPImm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addFPImm(FPImm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Op0, bool Op0IsKill,
+                                       unsigned Op1, bool Op1IsKill,
+                                       uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill)
+                   .addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+                   .addReg(Op0, Op0IsKill * RegState::Kill)
+                   .addReg(Op1, Op1IsKill * RegState::Kill)
+                   .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_i(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     uint64_t Imm) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+                   .addImm(Imm));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+                   .addImm(Imm));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                           TII.get(TargetOpcode::COPY), ResultReg)
+                   .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_ii(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      uint64_t Imm1, uint64_t Imm2) {
+  unsigned ResultReg = createResultReg(RC);
+  const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+  if (II.getNumDefs() >= 1) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
+                    .addImm(Imm1).addImm(Imm2));
+  } else {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
+                    .addImm(Imm1).addImm(Imm2));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(TargetOpcode::COPY),
+                            ResultReg)
+                    .addReg(II.ImplicitDefs[0]));
+  }
+  return ResultReg;
+}
+
+unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT,
+                                                 unsigned Op0, bool Op0IsKill,
+                                                 uint32_t Idx) {
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+  assert(TargetRegisterInfo::isVirtualRegister(Op0) &&
+         "Cannot yet extract from physregs");
+
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                          DL, TII.get(TargetOpcode::COPY), ResultReg)
+                  .addReg(Op0, getKillRegState(Op0IsKill), Idx));
+  return ResultReg;
+}
+
+// TODO: Don't worry about 64-bit now, but when this is fixed remove the
+// checks from the various callers.
+unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) {
+  if (VT == MVT::f64) return 0;
+
+  unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(ARM::VMOVSR), MoveReg)
+                  .addReg(SrcReg));
+  return MoveReg;
+}
+
+unsigned ARMFastISel::ARMMoveToIntReg(MVT VT, unsigned SrcReg) {
+  if (VT == MVT::i64) return 0;
+
+  unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(ARM::VMOVRS), MoveReg)
+                  .addReg(SrcReg));
+  return MoveReg;
+}
+
+// For double width floating point we need to materialize two constants
+// (the high and the low) into integer registers then use a move to get
+// the combined constant into an FP reg.
+unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
+  const APFloat Val = CFP->getValueAPF();
+  bool is64bit = VT == MVT::f64;
+
+  // This checks to see if we can use VFP3 instructions to materialize
+  // a constant, otherwise we have to go through the constant pool.
+  if (TLI.isFPImmLegal(Val, VT)) {
+    int Imm;
+    unsigned Opc;
+    if (is64bit) {
+      Imm = ARM_AM::getFP64Imm(Val);
+      Opc = ARM::FCONSTD;
+    } else {
+      Imm = ARM_AM::getFP32Imm(Val);
+      Opc = ARM::FCONSTS;
+    }
+    unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
+                            DestReg)
+                    .addImm(Imm));
+    return DestReg;
+  }
+
+  // Require VFP2 for loading fp constants.
+  if (!Subtarget->hasVFP2()) return false;
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = TD.getPrefTypeAlignment(CFP->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = TD.getTypeAllocSize(CFP->getType());
+  }
+  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+  unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS;
+
+  // The extra reg is for addrmode5.
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
+                          DestReg)
+                  .addConstantPoolIndex(Idx)
+                  .addReg(0));
+  return DestReg;
+}
+
+unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
+
+  if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
+    return false;
+
+  // If we can do this in a single instruction without a constant pool entry
+  // do so now.
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  if (Subtarget->hasV6T2Ops() && isUInt<16>(CI->getZExtValue())) {
+    unsigned Opc = isThumb2 ? ARM::t2MOVi16 : ARM::MOVi16;
+    const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass :
+      &ARM::GPRRegClass;
+    unsigned ImmReg = createResultReg(RC);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(Opc), ImmReg)
+                    .addImm(CI->getZExtValue()));
+    return ImmReg;
+  }
+
+  // Use MVN to emit negative constants.
+  if (VT == MVT::i32 && Subtarget->hasV6T2Ops() && CI->isNegative()) {
+    unsigned Imm = (unsigned)~(CI->getSExtValue());
+    bool UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+      (ARM_AM::getSOImmVal(Imm) != -1);
+    if (UseImm) {
+      unsigned Opc = isThumb2 ? ARM::t2MVNi : ARM::MVNi;
+      unsigned ImmReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(Opc), ImmReg)
+                      .addImm(Imm));
+      return ImmReg;
+    }
+  }
+
+  // Load from constant pool.  For now 32-bit only.
+  if (VT != MVT::i32)
+    return false;
+
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = TD.getPrefTypeAlignment(C->getType());
+  if (Align == 0) {
+    // TODO: Figure out if this is correct.
+    Align = TD.getTypeAllocSize(C->getType());
+  }
+  unsigned Idx = MCP.getConstantPoolIndex(C, Align);
+
+  if (isThumb2)
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(ARM::t2LDRpci), DestReg)
+                    .addConstantPoolIndex(Idx));
+  else
+    // The extra immediate is for addrmode2.
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(ARM::LDRcp), DestReg)
+                    .addConstantPoolIndex(Idx)
+                    .addImm(0));
+
+  return DestReg;
+}
+
+unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32) return 0;
+
+  Reloc::Model RelocM = TM.getRelocationModel();
+  bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
+  const TargetRegisterClass *RC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  unsigned DestReg = createResultReg(RC);
+
+  // Use movw+movt when possible, it avoids constant pool entries.
+  // Darwin targets don't support movt with Reloc::Static, see
+  // ARMTargetLowering::LowerGlobalAddressDarwin.  Other targets only support
+  // static movt relocations.
+  if (Subtarget->useMovt() &&
+      Subtarget->isTargetDarwin() == (RelocM != Reloc::Static)) {
+    unsigned Opc;
+    switch (RelocM) {
+    case Reloc::PIC_:
+      Opc = isThumb2 ? ARM::t2MOV_ga_pcrel : ARM::MOV_ga_pcrel;
+      break;
+    case Reloc::DynamicNoPIC:
+      Opc = isThumb2 ? ARM::t2MOV_ga_dyn : ARM::MOV_ga_dyn;
+      break;
+    default:
+      Opc = isThumb2 ? ARM::t2MOVi32imm : ARM::MOVi32imm;
+      break;
+    }
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
+                            DestReg).addGlobalAddress(GV));
+  } else {
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = TD.getPrefTypeAlignment(GV->getType());
+    if (Align == 0) {
+      // TODO: Figure out if this is correct.
+      Align = TD.getTypeAllocSize(GV->getType());
+    }
+
+    if (Subtarget->isTargetELF() && RelocM == Reloc::PIC_)
+      return ARMLowerPICELF(GV, Align, VT);
+
+    // Grab index.
+    unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 :
+      (Subtarget->isThumb() ? 4 : 8);
+    unsigned Id = AFI->createPICLabelUId();
+    ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
+                                                                ARMCP::CPValue,
+                                                                PCAdj);
+    unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+
+    // Load value.
+    MachineInstrBuilder MIB;
+    if (isThumb2) {
+      unsigned Opc = (RelocM!=Reloc::PIC_) ? ARM::t2LDRpci : ARM::t2LDRpci_pic;
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+        .addConstantPoolIndex(Idx);
+      if (RelocM == Reloc::PIC_)
+        MIB.addImm(Id);
+      AddOptionalDefs(MIB);
+    } else {
+      // The extra immediate is for addrmode2.
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp),
+                    DestReg)
+        .addConstantPoolIndex(Idx)
+        .addImm(0);
+      AddOptionalDefs(MIB);
+
+      if (RelocM == Reloc::PIC_) {
+        unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
+        unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+
+        MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                          DL, TII.get(Opc), NewDestReg)
+                                  .addReg(DestReg)
+                                  .addImm(Id);
+        AddOptionalDefs(MIB);
+        return NewDestReg;
+      }
+    }
+  }
+
+  if (IsIndirect) {
+    MachineInstrBuilder MIB;
+    unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+    if (isThumb2)
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                    TII.get(ARM::t2LDRi12), NewDestReg)
+            .addReg(DestReg)
+            .addImm(0);
+    else
+      MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRi12),
+                    NewDestReg)
+            .addReg(DestReg)
+            .addImm(0);
+    DestReg = NewDestReg;
+    AddOptionalDefs(MIB);
+  }
+
+  return DestReg;
+}
+
+unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple()) return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return ARMMaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return ARMMaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return ARMMaterializeInt(C, VT);
+
+  return 0;
+}
+
+// TODO: unsigned ARMFastISel::TargetMaterializeFloatZero(const ConstantFP *CF);
+
+unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
+
+  DenseMap<const AllocaInst*, int>::iterator SI =
+    FuncInfo.StaticAllocaMap.find(AI);
+
+  // This will get lowered later into the correct offsets and registers
+  // via rewriteXFrameIndex.
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    const TargetRegisterClass* RC = TLI.getRegClassFor(VT);
+    unsigned ResultReg = createResultReg(RC);
+    unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(Opc), ResultReg)
+                            .addFrameIndex(SI->second)
+                            .addImm(0));
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(Ty, true);
+
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple()) return false;
+  VT = evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+    return true;
+
+  return false;
+}
+
+// Computes the address to get to an object.
+bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
+  // Some boilerplate from the X86 FastISel.
+  const User *U = NULL;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  if (PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+    if (Ty->getAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+  switch (Opcode) {
+    default:
+    break;
+    case Instruction::BitCast: {
+      // Look through bitcasts.
+      return ARMComputeAddress(U->getOperand(0), Addr);
+    }
+    case Instruction::IntToPtr: {
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+        return ARMComputeAddress(U->getOperand(0), Addr);
+      break;
+    }
+    case Instruction::PtrToInt: {
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+        return ARMComputeAddress(U->getOperand(0), Addr);
+      break;
+    }
+    case Instruction::GetElementPtr: {
+      Address SavedAddr = Addr;
+      int TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+           i != e; ++i, ++GTI) {
+        const Value *Op = *i;
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+          const StructLayout *SL = TD.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+          for (;;) {
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+              break;
+            }
+            if (isa<AddOperator>(Op) &&
+                (!isa<Instruction>(Op) ||
+                 FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
+                 == FuncInfo.MBB) &&
+                isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
+              // An add (in the same block) with a constant operand. Fold the
+              // constant.
+              ConstantInt *CI =
+              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Iterate on the other operand.
+              Op = cast<AddOperator>(Op)->getOperand(0);
+              continue;
+            }
+            // Unsupported
+            goto unsupported_gep;
+          }
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (ARMComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
+      Addr = SavedAddr;
+
+      unsupported_gep:
+      break;
+    }
+    case Instruction::Alloca: {
+      const AllocaInst *AI = cast<AllocaInst>(Obj);
+      DenseMap<const AllocaInst*, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+      if (SI != FuncInfo.StaticAllocaMap.end()) {
+        Addr.BaseType = Address::FrameIndexBase;
+        Addr.Base.FI = SI->second;
+        return true;
+      }
+      break;
+    }
+  }
+
+  // Try to get this in a register if nothing else has worked.
+  if (Addr.Base.Reg == 0) Addr.Base.Reg = getRegForValue(Obj);
+  return Addr.Base.Reg != 0;
+}
+
+void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) {
+  bool needsLowering = false;
+  switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unhandled load/store type!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      if (!useAM3) {
+        // Integer loads/stores handle 12-bit offsets.
+        needsLowering = ((Addr.Offset & 0xfff) != Addr.Offset);
+        // Handle negative offsets.
+        if (needsLowering && isThumb2)
+          needsLowering = !(Subtarget->hasV6T2Ops() && Addr.Offset < 0 &&
+                            Addr.Offset > -256);
+      } else {
+        // ARM halfword load/stores and signed byte loads use +/-imm8 offsets.
+        needsLowering = (Addr.Offset > 255 || Addr.Offset < -255);
+      }
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      // Floating point operands handle 8-bit offsets.
+      needsLowering = ((Addr.Offset & 0xff) != Addr.Offset);
+      break;
+  }
+
+  // If this is a stack pointer and the offset needs to be simplified then
+  // put the alloca address into a register, set the base type back to
+  // register and continue. This should almost never happen.
+  if (needsLowering && Addr.BaseType == Address::FrameIndexBase) {
+    const TargetRegisterClass *RC = isThumb2 ?
+      (const TargetRegisterClass*)&ARM::tGPRRegClass :
+      (const TargetRegisterClass*)&ARM::GPRRegClass;
+    unsigned ResultReg = createResultReg(RC);
+    unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(Opc), ResultReg)
+                            .addFrameIndex(Addr.Base.FI)
+                            .addImm(0));
+    Addr.Base.Reg = ResultReg;
+    Addr.BaseType = Address::RegBase;
+  }
+
+  // Since the offset is too large for the load/store instruction
+  // get the reg+offset into a register.
+  if (needsLowering) {
+    Addr.Base.Reg = FastEmit_ri_(MVT::i32, ISD::ADD, Addr.Base.Reg,
+                                 /*Op0IsKill*/false, Addr.Offset, MVT::i32);
+    Addr.Offset = 0;
+  }
+}
+
+void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
+                                       const MachineInstrBuilder &MIB,
+                                       unsigned Flags, bool useAM3) {
+  // addrmode5 output depends on the selection dag addressing dividing the
+  // offset by 4 that it then later multiplies. Do this here as well.
+  if (VT.SimpleTy == MVT::f32 || VT.SimpleTy == MVT::f64)
+    Addr.Offset /= 4;
+
+  // Frame base works a bit differently. Handle it separately.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    int FI = Addr.Base.FI;
+    int Offset = Addr.Offset;
+    MachineMemOperand *MMO =
+          FuncInfo.MF->getMachineMemOperand(
+                                  MachinePointerInfo::getFixedStack(FI, Offset),
+                                  Flags,
+                                  MFI.getObjectSize(FI),
+                                  MFI.getObjectAlignment(FI));
+    // Now add the rest of the operands.
+    MIB.addFrameIndex(FI);
+
+    // ARM halfword load/stores and signed byte loads need an additional
+    // operand.
+    if (useAM3) {
+      signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+      MIB.addReg(0);
+      MIB.addImm(Imm);
+    } else {
+      MIB.addImm(Addr.Offset);
+    }
+    MIB.addMemOperand(MMO);
+  } else {
+    // Now add the rest of the operands.
+    MIB.addReg(Addr.Base.Reg);
+
+    // ARM halfword load/stores and signed byte loads need an additional
+    // operand.
+    if (useAM3) {
+      signed Imm = (Addr.Offset < 0) ? (0x100 | -Addr.Offset) : Addr.Offset;
+      MIB.addReg(0);
+      MIB.addImm(Imm);
+    } else {
+      MIB.addImm(Addr.Offset);
+    }
+  }
+  AddOptionalDefs(MIB);
+}
+
+bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                              unsigned Alignment, bool isZExt, bool allocReg) {
+  unsigned Opc;
+  bool useAM3 = false;
+  bool needVMOV = false;
+  const TargetRegisterClass *RC;
+  switch (VT.SimpleTy) {
+    // This is mostly going to be Neon/vector support.
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRBi8 : ARM::t2LDRSBi8;
+        else
+          Opc = isZExt ? ARM::t2LDRBi12 : ARM::t2LDRSBi12;
+      } else {
+        if (isZExt) {
+          Opc = ARM::LDRBi12;
+        } else {
+          Opc = ARM::LDRSB;
+          useAM3 = true;
+        }
+      }
+      RC = &ARM::GPRRegClass;
+      break;
+    case MVT::i16:
+      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = isZExt ? ARM::t2LDRHi8 : ARM::t2LDRSHi8;
+        else
+          Opc = isZExt ? ARM::t2LDRHi12 : ARM::t2LDRSHi12;
+      } else {
+        Opc = isZExt ? ARM::LDRH : ARM::LDRSH;
+        useAM3 = true;
+      }
+      RC = &ARM::GPRRegClass;
+      break;
+    case MVT::i32:
+      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          Opc = ARM::t2LDRi8;
+        else
+          Opc = ARM::t2LDRi12;
+      } else {
+        Opc = ARM::LDRi12;
+      }
+      RC = &ARM::GPRRegClass;
+      break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      // Unaligned loads need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        needVMOV = true;
+        VT = MVT::i32;
+        Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
+        RC = &ARM::GPRRegClass;
+      } else {
+        Opc = ARM::VLDRS;
+        RC = TLI.getRegClassFor(VT);
+      }
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned loads need special handling.  Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4)
+        return false;
+
+      Opc = ARM::VLDRD;
+      RC = TLI.getRegClassFor(VT);
+      break;
+  }
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT, useAM3);
+
+  // Create the base instruction, then add the operands.
+  if (allocReg)
+    ResultReg = createResultReg(RC);
+  assert (ResultReg > 255 && "Expected an allocated virtual register.");
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(Opc), ResultReg);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3);
+
+  // If we had an unaligned load of a float we've converted it to an regular
+  // load.  Now we must move from the GRP to the FP register.
+  if (needVMOV) {
+    unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(ARM::VMOVSR), MoveReg)
+                    .addReg(ResultReg));
+    ResultReg = MoveReg;
+  }
+  return true;
+}
+
+bool ARMFastISel::SelectLoad(const Instruction *I) {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(I->getOperand(0), Addr)) return false;
+
+  unsigned ResultReg;
+  if (!ARMEmitLoad(VT, ResultReg, Addr, cast<LoadInst>(I)->getAlignment()))
+    return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                               unsigned Alignment) {
+  unsigned StrOpc;
+  bool useAM3 = false;
+  switch (VT.SimpleTy) {
+    // This is mostly going to be Neon/vector support.
+    default: return false;
+    case MVT::i1: {
+      unsigned Res = createResultReg(isThumb2 ?
+        (const TargetRegisterClass*)&ARM::tGPRRegClass :
+        (const TargetRegisterClass*)&ARM::GPRRegClass);
+      unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(Opc), Res)
+                      .addReg(SrcReg).addImm(1));
+      SrcReg = Res;
+    } // Fallthrough here.
+    case MVT::i8:
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRBi8;
+        else
+          StrOpc = ARM::t2STRBi12;
+      } else {
+        StrOpc = ARM::STRBi12;
+      }
+      break;
+    case MVT::i16:
+      if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRHi8;
+        else
+          StrOpc = ARM::t2STRHi12;
+      } else {
+        StrOpc = ARM::STRH;
+        useAM3 = true;
+      }
+      break;
+    case MVT::i32:
+      if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+        return false;
+
+      if (isThumb2) {
+        if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
+          StrOpc = ARM::t2STRi8;
+        else
+          StrOpc = ARM::t2STRi12;
+      } else {
+        StrOpc = ARM::STRi12;
+      }
+      break;
+    case MVT::f32:
+      if (!Subtarget->hasVFP2()) return false;
+      // Unaligned stores need special handling. Floats require word-alignment.
+      if (Alignment && Alignment < 4) {
+        unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+        AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                TII.get(ARM::VMOVRS), MoveReg)
+                        .addReg(SrcReg));
+        SrcReg = MoveReg;
+        VT = MVT::i32;
+        StrOpc = isThumb2 ? ARM::t2STRi12 : ARM::STRi12;
+      } else {
+        StrOpc = ARM::VSTRS;
+      }
+      break;
+    case MVT::f64:
+      if (!Subtarget->hasVFP2()) return false;
+      // FIXME: Unaligned stores need special handling.  Doublewords require
+      // word-alignment.
+      if (Alignment && Alignment < 4)
+          return false;
+
+      StrOpc = ARM::VSTRD;
+      break;
+  }
+  // Simplify this down to something we can handle.
+  ARMSimplifyAddress(Addr, VT, useAM3);
+
+  // Create the base instruction, then add the operands.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(StrOpc))
+                            .addReg(SrcReg);
+  AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore, useAM3);
+  return true;
+}
+
+bool ARMFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // Atomic stores need special handling.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0) return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!ARMEmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+    return false;
+  return true;
+}
+
+static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // Needs two compares...
+    case CmpInst::FCMP_ONE:
+    case CmpInst::FCMP_UEQ:
+    default:
+      // AL is our "false" for now. The other two need more compares.
+      return ARMCC::AL;
+    case CmpInst::ICMP_EQ:
+    case CmpInst::FCMP_OEQ:
+      return ARMCC::EQ;
+    case CmpInst::ICMP_SGT:
+    case CmpInst::FCMP_OGT:
+      return ARMCC::GT;
+    case CmpInst::ICMP_SGE:
+    case CmpInst::FCMP_OGE:
+      return ARMCC::GE;
+    case CmpInst::ICMP_UGT:
+    case CmpInst::FCMP_UGT:
+      return ARMCC::HI;
+    case CmpInst::FCMP_OLT:
+      return ARMCC::MI;
+    case CmpInst::ICMP_ULE:
+    case CmpInst::FCMP_OLE:
+      return ARMCC::LS;
+    case CmpInst::FCMP_ORD:
+      return ARMCC::VC;
+    case CmpInst::FCMP_UNO:
+      return ARMCC::VS;
+    case CmpInst::FCMP_UGE:
+      return ARMCC::PL;
+    case CmpInst::ICMP_SLT:
+    case CmpInst::FCMP_ULT:
+      return ARMCC::LT;
+    case CmpInst::ICMP_SLE:
+    case CmpInst::FCMP_ULE:
+      return ARMCC::LE;
+    case CmpInst::FCMP_UNE:
+    case CmpInst::ICMP_NE:
+      return ARMCC::NE;
+    case CmpInst::ICMP_UGE:
+      return ARMCC::HS;
+    case CmpInst::ICMP_ULT:
+      return ARMCC::LO;
+  }
+}
+
+bool ARMFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // Simple branch support.
+
+  // If we can, avoid recomputing the compare - redoing it could lead to wonky
+  // behavior.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
+
+      // Get the compare predicate.
+      // Try to take advantage of fallthrough opportunities.
+      CmpInst::Predicate Predicate = CI->getPredicate();
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
+
+      ARMCC::CondCodes ARMPred = getComparePred(Predicate);
+
+      // We may not handle every CC for now.
+      if (ARMPred == ARMCC::AL) return false;
+
+      // Emit the compare.
+      if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+        return false;
+
+      unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
+      .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR);
+      FastEmitBranch(FBB, DL);
+      FuncInfo.MBB->addSuccessor(TBB);
+      return true;
+    }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    MVT SourceVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
+      unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+      unsigned OpReg = getRegForValue(TI->getOperand(0));
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(TstOpc))
+                      .addReg(OpReg).addImm(1));
+
+      unsigned CCMode = ARMCC::NE;
+      if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+        std::swap(TBB, FBB);
+        CCMode = ARMCC::EQ;
+      }
+
+      unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
+      .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+
+      FastEmitBranch(FBB, DL);
+      FuncInfo.MBB->addSuccessor(TBB);
+      return true;
+    }
+  } else if (const ConstantInt *CI =
+             dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    FastEmitBranch(Target, DL);
+    return true;
+  }
+
+  unsigned CmpReg = getRegForValue(BI->getCondition());
+  if (CmpReg == 0) return false;
+
+  // We've been divorced from our compare!  Our block was split, and
+  // now our compare lives in a predecessor block.  We musn't
+  // re-compare here, as the children of the compare aren't guaranteed
+  // live across the block boundary (we *could* check for this).
+  // Regardless, the compare has been done in the predecessor block,
+  // and it left a value for us in a virtual register.  Ergo, we test
+  // the one-bit value left in the virtual register.
+  unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc))
+                  .addReg(CmpReg).addImm(1));
+
+  unsigned CCMode = ARMCC::NE;
+  if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+    std::swap(TBB, FBB);
+    CCMode = ARMCC::EQ;
+  }
+
+  unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BrOpc))
+                  .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR);
+  FastEmitBranch(FBB, DL);
+  FuncInfo.MBB->addSuccessor(TBB);
+  return true;
+}
+
+bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
+  unsigned AddrReg = getRegForValue(I->getOperand(0));
+  if (AddrReg == 0) return false;
+
+  unsigned Opc = isThumb2 ? ARM::tBRIND : ARM::BX;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
+                  .addReg(AddrReg));
+
+  const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+  for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
+
+  return true;
+}
+
+bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                             bool isZExt) {
+  Type *Ty = Src1Value->getType();
+  EVT SrcEVT = TLI.getValueType(Ty, true);
+  if (!SrcEVT.isSimple()) return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  // Check to see if the 2nd operand is a constant that we can encode directly
+  // in the compare.
+  int Imm = 0;
+  bool UseImm = false;
+  bool isNegativeImm = false;
+  // FIXME: At -O0 we don't have anything that canonicalizes operand order.
+  // Thus, Src1Value may be a ConstantInt, but we're missing it.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
+    if (SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8 ||
+        SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = (isZExt) ? (int)CIVal.getZExtValue() : (int)CIVal.getSExtValue();
+      // For INT_MIN/LONG_MIN (i.e., 0x80000000) we need to use a cmp, rather
+      // then a cmn, because there is no way to represent 2147483648 as a 
+      // signed 32-bit int.
+      if (Imm < 0 && Imm != (int)0x80000000) {
+        isNegativeImm = true;
+        Imm = -Imm;
+      }
+      UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+        (ARM_AM::getSOImmVal(Imm) != -1);
+    }
+  } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
+    if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
+      if (ConstFP->isZero() && !ConstFP->isNegative())
+        UseImm = true;
+  }
+
+  unsigned CmpOpc;
+  bool isICmp = true;
+  bool needsExt = false;
+  switch (SrcVT.SimpleTy) {
+    default: return false;
+    // TODO: Verify compares.
+    case MVT::f32:
+      isICmp = false;
+      CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
+      break;
+    case MVT::f64:
+      isICmp = false;
+      CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
+      break;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      needsExt = true;
+    // Intentional fall-through.
+    case MVT::i32:
+      if (isThumb2) {
+        if (!UseImm)
+          CmpOpc = ARM::t2CMPrr;
+        else
+          CmpOpc = isNegativeImm ? ARM::t2CMNri : ARM::t2CMPri;
+      } else {
+        if (!UseImm)
+          CmpOpc = ARM::CMPrr;
+        else
+          CmpOpc = isNegativeImm ? ARM::CMNri : ARM::CMPri;
+      }
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(Src1Value);
+  if (SrcReg1 == 0) return false;
+
+  unsigned SrcReg2 = 0;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(Src2Value);
+    if (SrcReg2 == 0) return false;
+  }
+
+  // We have i1, i8, or i16, we need to either zero extend or sign extend.
+  if (needsExt) {
+    SrcReg1 = ARMEmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
+    if (SrcReg1 == 0) return false;
+    if (!UseImm) {
+      SrcReg2 = ARMEmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
+      if (SrcReg2 == 0) return false;
+    }
+  }
+
+  if (!UseImm) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(CmpOpc))
+                    .addReg(SrcReg1).addReg(SrcReg2));
+  } else {
+    MachineInstrBuilder MIB;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
+      .addReg(SrcReg1);
+
+    // Only add immediate for icmp as the immediate for fcmp is an implicit 0.0.
+    if (isICmp)
+      MIB.addImm(Imm);
+    AddOptionalDefs(MIB);
+  }
+
+  // For floating point we need to move the result to a comparison register
+  // that we can then use for branches.
+  if (Ty->isFloatTy() || Ty->isDoubleTy())
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(ARM::FMSTAT)));
+  return true;
+}
+
+bool ARMFastISel::SelectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  // Get the compare predicate.
+  ARMCC::CondCodes ARMPred = getComparePred(CI->getPredicate());
+
+  // We may not handle every CC for now.
+  if (ARMPred == ARMCC::AL) return false;
+
+  // Emit the compare.
+  if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+    return false;
+
+  // Now set a register based on the comparison. Explicitly set the predicates
+  // here.
+  unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
+  const TargetRegisterClass *RC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  unsigned DestReg = createResultReg(RC);
+  Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0);
+  unsigned ZeroReg = TargetMaterializeConstant(Zero);
+  // ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), DestReg)
+          .addReg(ZeroReg).addImm(1)
+          .addImm(ARMPred).addReg(ARM::CPSR);
+
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
+bool ARMFastISel::SelectFPExt(const Instruction *I) {
+  // Make sure we have VFP and that we're extending float to double.
+  if (!Subtarget->hasVFP2()) return false;
+
+  Value *V = I->getOperand(0);
+  if (!I->getType()->isDoubleTy() ||
+      !V->getType()->isFloatTy()) return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0) return false;
+
+  unsigned Result = createResultReg(&ARM::DPRRegClass);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(ARM::VCVTDS), Result)
+                  .addReg(Op));
+  UpdateValueMap(I, Result);
+  return true;
+}
+
+bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
+  // Make sure we have VFP and that we're truncating double to float.
+  if (!Subtarget->hasVFP2()) return false;
+
+  Value *V = I->getOperand(0);
+  if (!(I->getType()->isFloatTy() &&
+        V->getType()->isDoubleTy())) return false;
+
+  unsigned Op = getRegForValue(V);
+  if (Op == 0) return false;
+
+  unsigned Result = createResultReg(&ARM::SPRRegClass);
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(ARM::VCVTSD), Result)
+                  .addReg(Op));
+  UpdateValueMap(I, Result);
+  return true;
+}
+
+bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2()) return false;
+
+  MVT DstVT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, DstVT))
+    return false;
+
+  Value *Src = I->getOperand(0);
+  EVT SrcEVT = TLI.getValueType(Src->getType(), true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0) return false;
+
+  // Handle sign-extension.
+  if (SrcVT == MVT::i16 || SrcVT == MVT::i8) {
+    SrcReg = ARMEmitIntExt(SrcVT, SrcReg, MVT::i32,
+                                       /*isZExt*/!isSigned);
+    if (SrcReg == 0) return false;
+  }
+
+  // The conversion routine works on fp-reg to fp-reg and the operand above
+  // was an integer, move it to the fp registers if possible.
+  unsigned FP = ARMMoveToFPReg(MVT::f32, SrcReg);
+  if (FP == 0) return false;
+
+  unsigned Opc;
+  if (Ty->isFloatTy()) Opc = isSigned ? ARM::VSITOS : ARM::VUITOS;
+  else if (Ty->isDoubleTy()) Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
+  else return false;
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
+                          ResultReg)
+                  .addReg(FP));
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
+  // Make sure we have VFP.
+  if (!Subtarget->hasVFP2()) return false;
+
+  MVT DstVT;
+  Type *RetTy = I->getType();
+  if (!isTypeLegal(RetTy, DstVT))
+    return false;
+
+  unsigned Op = getRegForValue(I->getOperand(0));
+  if (Op == 0) return false;
+
+  unsigned Opc;
+  Type *OpTy = I->getOperand(0)->getType();
+  if (OpTy->isFloatTy()) Opc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS;
+  else if (OpTy->isDoubleTy()) Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
+  else return false;
+
+  // f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg.
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
+                          ResultReg)
+                  .addReg(Op));
+
+  // This result needs to be in an integer register, but the conversion only
+  // takes place in fp-regs.
+  unsigned IntReg = ARMMoveToIntReg(DstVT, ResultReg);
+  if (IntReg == 0) return false;
+
+  UpdateValueMap(I, IntReg);
+  return true;
+}
+
+bool ARMFastISel::SelectSelect(const Instruction *I) {
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  // Things need to be register sized for register moves.
+  if (VT != MVT::i32) return false;
+
+  unsigned CondReg = getRegForValue(I->getOperand(0));
+  if (CondReg == 0) return false;
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0) return false;
+
+  // Check to see if we can use an immediate in the conditional move.
+  int Imm = 0;
+  bool UseImm = false;
+  bool isNegativeImm = false;
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(2))) {
+    assert (VT == MVT::i32 && "Expecting an i32.");
+    Imm = (int)ConstInt->getValue().getZExtValue();
+    if (Imm < 0) {
+      isNegativeImm = true;
+      Imm = ~Imm;
+    }
+    UseImm = isThumb2 ? (ARM_AM::getT2SOImmVal(Imm) != -1) :
+      (ARM_AM::getSOImmVal(Imm) != -1);
+  }
+
+  unsigned Op2Reg = 0;
+  if (!UseImm) {
+    Op2Reg = getRegForValue(I->getOperand(2));
+    if (Op2Reg == 0) return false;
+  }
+
+  unsigned CmpOpc = isThumb2 ? ARM::t2CMPri : ARM::CMPri;
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
+                  .addReg(CondReg).addImm(0));
+
+  unsigned MovCCOpc;
+  const TargetRegisterClass *RC;
+  if (!UseImm) {
+    RC = isThumb2 ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
+    MovCCOpc = isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr;
+  } else {
+    RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass;
+    if (!isNegativeImm)
+      MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
+    else
+      MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi;
+  }
+  unsigned ResultReg = createResultReg(RC);
+  if (!UseImm)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
+    .addReg(Op2Reg).addReg(Op1Reg).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  else
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
+    .addReg(Op1Reg).addImm(Imm).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectDiv(const Instruction *I, bool isSigned) {
+  MVT VT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  // If we have integer div support we should have selected this automagically.
+  // In case we have a real miss go ahead and return false and we'll pick
+  // it up later.
+  if (Subtarget->hasDivide()) return false;
+
+  // Otherwise emit a libcall.
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = isSigned ? RTLIB::SDIV_I8 : RTLIB::UDIV_I8;
+  else if (VT == MVT::i16)
+    LC = isSigned ? RTLIB::SDIV_I16 : RTLIB::UDIV_I16;
+  else if (VT == MVT::i32)
+    LC = isSigned ? RTLIB::SDIV_I32 : RTLIB::UDIV_I32;
+  else if (VT == MVT::i64)
+    LC = isSigned ? RTLIB::SDIV_I64 : RTLIB::UDIV_I64;
+  else if (VT == MVT::i128)
+    LC = isSigned ? RTLIB::SDIV_I128 : RTLIB::UDIV_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectRem(const Instruction *I, bool isSigned) {
+  MVT VT;
+  Type *Ty = I->getType();
+  if (!isTypeLegal(Ty, VT))
+    return false;
+
+  RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+  if (VT == MVT::i8)
+    LC = isSigned ? RTLIB::SREM_I8 : RTLIB::UREM_I8;
+  else if (VT == MVT::i16)
+    LC = isSigned ? RTLIB::SREM_I16 : RTLIB::UREM_I16;
+  else if (VT == MVT::i32)
+    LC = isSigned ? RTLIB::SREM_I32 : RTLIB::UREM_I32;
+  else if (VT == MVT::i64)
+    LC = isSigned ? RTLIB::SREM_I64 : RTLIB::UREM_I64;
+  else if (VT == MVT::i128)
+    LC = isSigned ? RTLIB::SREM_I128 : RTLIB::UREM_I128;
+  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+  return ARMEmitLibcall(I, LC);
+}
+
+bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestVT  = TLI.getValueType(I->getType(), true);
+
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+    return false;
+
+  unsigned Opc;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::ADD:
+      Opc = isThumb2 ? ARM::t2ADDrr : ARM::ADDrr;
+      break;
+    case ISD::OR:
+      Opc = isThumb2 ? ARM::t2ORRrr : ARM::ORRrr;
+      break;
+    case ISD::SUB:
+      Opc = isThumb2 ? ARM::t2SUBrr : ARM::SUBrr;
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+  if (SrcReg1 == 0) return false;
+
+  // TODO: Often the 2nd operand is an immediate, which can be encoded directly
+  // in the instruction, rather then materializing the value in a register.
+  unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+  if (SrcReg2 == 0) return false;
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(Opc), ResultReg)
+                  .addReg(SrcReg1).addReg(SrcReg2));
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT FPVT = TLI.getValueType(I->getType(), true);
+  if (!FPVT.isSimple()) return false;
+  MVT VT = FPVT.getSimpleVT();
+
+  // We can get here in the case when we want to use NEON for our fp
+  // operations, but can't figure out how to. Just use the vfp instructions
+  // if we have them.
+  // FIXME: It'd be nice to use NEON instructions.
+  Type *Ty = I->getType();
+  bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
+  if (isFloat && !Subtarget->hasVFP2())
+    return false;
+
+  unsigned Opc;
+  bool is64bit = VT == MVT::f64 || VT == MVT::i64;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::FADD:
+      Opc = is64bit ? ARM::VADDD : ARM::VADDS;
+      break;
+    case ISD::FSUB:
+      Opc = is64bit ? ARM::VSUBD : ARM::VSUBS;
+      break;
+    case ISD::FMUL:
+      Opc = is64bit ? ARM::VMULD : ARM::VMULS;
+      break;
+  }
+  unsigned Op1 = getRegForValue(I->getOperand(0));
+  if (Op1 == 0) return false;
+
+  unsigned Op2 = getRegForValue(I->getOperand(1));
+  if (Op2 == 0) return false;
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(Opc), ResultReg)
+                  .addReg(Op1).addReg(Op2));
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Call Handling Code
+
+// This is largely taken directly from CCAssignFnForNode
+// TODO: We may not support all of this.
+CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC,
+                                           bool Return,
+                                           bool isVarArg) {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+    if (Subtarget->hasVFP2() && !isVarArg) {
+      if (!Subtarget->isAAPCS_ABI())
+        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+      // For AAPCS ABI targets, just use VFP variant of the calling convention.
+      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+    }
+    // Fallthrough
+  case CallingConv::C:
+    // Use target triple & subtarget features to do actual dispatch.
+    if (Subtarget->isAAPCS_ABI()) {
+      if (Subtarget->hasVFP2() &&
+          TM.Options.FloatABIType == FloatABI::Hard && !isVarArg)
+        return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+      else
+        return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+    } else
+        return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+  case CallingConv::ARM_AAPCS_VFP:
+    if (!isVarArg)
+      return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP);
+    // Fall through to soft float variant, variadic functions don't
+    // use hard floating point ABI.
+  case CallingConv::ARM_AAPCS:
+    return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS);
+  case CallingConv::ARM_APCS:
+    return (Return ? RetCC_ARM_APCS: CC_ARM_APCS);
+  case CallingConv::GHC:
+    if (Return)
+      llvm_unreachable("Can't return in GHC call convention");
+    else
+      return CC_ARM_APCS_GHC;
+  }
+}
+
+bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool isVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags,
+                             CCAssignFnForCall(CC, false, isVarArg));
+
+  // Check that we can handle all of the arguments. If we can't, then bail out
+  // now before we add code to the MBB.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // We don't handle NEON/vector parameters yet.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+      return false;
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      continue;
+    } else if (VA.needsCustom()) {
+      // TODO: We need custom lowering for vector (v2f64) args.
+      if (VA.getLocVT() != MVT::f64 ||
+          // TODO: Only handle register args for now.
+          !VA.isRegLoc() || !ArgLocs[++i].isRegLoc())
+        return false;
+    } else {
+      switch (static_cast<EVT>(ArgVT).getSimpleVT().SimpleTy) {
+      default:
+        return false;
+      case MVT::i1:
+      case MVT::i8:
+      case MVT::i16:
+      case MVT::i32:
+        break;
+      case MVT::f32:
+        if (!Subtarget->hasVFP2())
+          return false;
+        break;
+      case MVT::f64:
+        if (!Subtarget->hasVFP2())
+          return false;
+        break;
+      }
+    }
+  }
+
+  // At the point, we are able to handle the call's arguments in fast isel.
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(AdjStackDown))
+                  .addImm(NumBytes));
+
+  // Process the args.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    assert((!ArgVT.isVector() && ArgVT.getSizeInBits() <= 64) &&
+           "We don't handle NEON/vector parameters yet.");
+
+    // Handle arg promotion, etc.
+    switch (VA.getLocInfo()) {
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/false);
+        assert (Arg != 0 && "Failed to emit a sext");
+        ArgVT = DestVT;
+        break;
+      }
+      case CCValAssign::AExt:
+        // Intentional fall-through.  Handle AExt and ZExt.
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true);
+        assert (Arg != 0 && "Failed to emit a sext");
+        ArgVT = DestVT;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        unsigned BC = FastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, Arg,
+                                 /*TODO: Kill=*/false);
+        assert(BC != 0 && "Failed to emit a bitcast!");
+        Arg = BC;
+        ArgVT = VA.getLocVT();
+        break;
+      }
+      default: llvm_unreachable("Unknown arg promotion!");
+    }
+
+    // Now copy/store arg to correct locations.
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+              VA.getLocReg())
+        .addReg(Arg);
+      RegArgs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // TODO: We need custom lowering for vector (v2f64) args.
+      assert(VA.getLocVT() == MVT::f64 &&
+             "Custom lowering for v2f64 args not available");
+
+      CCValAssign &NextVA = ArgLocs[++i];
+
+      assert(VA.isRegLoc() && NextVA.isRegLoc() &&
+             "We only handle register args!");
+
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(ARM::VMOVRRD), VA.getLocReg())
+                      .addReg(NextVA.getLocReg(), RegState::Define)
+                      .addReg(Arg));
+      RegArgs.push_back(VA.getLocReg());
+      RegArgs.push_back(NextVA.getLocReg());
+    } else {
+      assert(VA.isMemLoc());
+      // Need to store on the stack.
+      Address Addr;
+      Addr.BaseType = Address::RegBase;
+      Addr.Base.Reg = ARM::SP;
+      Addr.Offset = VA.getLocMemOffset();
+
+      bool EmitRet = ARMEmitStore(ArgVT, Arg, Addr); (void)EmitRet;
+      assert(EmitRet && "Could not emit a store for argument!");
+    }
+  }
+
+  return true;
+}
+
+bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                             const Instruction *I, CallingConv::ID CC,
+                             unsigned &NumBytes, bool isVarArg) {
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                          TII.get(AdjStackUp))
+                  .addImm(NumBytes).addImm(0));
+
+  // Now the return value.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+
+    // Copy all of the result registers out of their specified physreg.
+    if (RVLocs.size() == 2 && RetVT == MVT::f64) {
+      // For this move we copy into two registers and then move into the
+      // double fp reg we want.
+      MVT DestVT = RVLocs[0].getValVT();
+      const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT);
+      unsigned ResultReg = createResultReg(DstRC);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(ARM::VMOVDRR), ResultReg)
+                      .addReg(RVLocs[0].getLocReg())
+                      .addReg(RVLocs[1].getLocReg()));
+
+      UsedRegs.push_back(RVLocs[0].getLocReg());
+      UsedRegs.push_back(RVLocs[1].getLocReg());
+
+      // Finally update the result.
+      UpdateValueMap(I, ResultReg);
+    } else {
+      assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!");
+      MVT CopyVT = RVLocs[0].getValVT();
+
+      // Special handling for extended integers.
+      if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16)
+        CopyVT = MVT::i32;
+
+      const TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
+
+      unsigned ResultReg = createResultReg(DstRC);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+              ResultReg).addReg(RVLocs[0].getLocReg());
+      UsedRegs.push_back(RVLocs[0].getLocReg());
+
+      // Finally update the result.
+      UpdateValueMap(I, ResultReg);
+    }
+  }
+
+  return true;
+}
+
+bool ARMFastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,I->getContext());
+    CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */,
+                                                 F.isVarArg()));
+
+    const Value *RV = Ret->getOperand(0);
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    EVT RVEVT = TLI.getValueType(RV->getType());
+    if (!RVEVT.isSimple()) return false;
+    MVT RVVT = RVEVT.getSimpleVT();
+    MVT DestVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (RVVT != DestVT) {
+      if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
+        return false;
+
+      assert(DestVT == MVT::i32 && "ARM should always ext to i32");
+
+      // Perform extension if flagged as either zext or sext.  Otherwise, do
+      // nothing.
+      if (Outs[0].Flags.isZExt() || Outs[0].Flags.isSExt()) {
+        SrcReg = ARMEmitIntExt(RVVT, SrcReg, DestVT, Outs[0].Flags.isZExt());
+        if (SrcReg == 0) return false;
+      }
+    }
+
+    // Make the copy.
+    unsigned DstReg = VA.getLocReg();
+    const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!SrcRC->contains(DstReg))
+      return false;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            DstReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+
+  unsigned RetOpc = isThumb2 ? ARM::tBX_RET : ARM::BX_RET;
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(RetOpc));
+  AddOptionalDefs(MIB);
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+  return true;
+}
+
+unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) {
+  if (UseReg)
+    return isThumb2 ? ARM::tBLXr : ARM::BLX;
+  else
+    return isThumb2 ? ARM::tBL : ARM::BL;
+}
+
+unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
+  GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false,
+                                       GlobalValue::ExternalLinkage, 0, Name);
+  EVT LCREVT = TLI.getValueType(GV->getType());
+  if (!LCREVT.isSimple()) return 0;
+  return ARMMaterializeGV(GV, LCREVT.getSimpleVT());
+}
+
+// A quick function that will emit a call for a named libcall in F with the
+// vector of passed arguments for the Instruction in I. We can assume that we
+// can emit a call for any libcall we can produce. This is an abridged version
+// of the full call infrastructure since we won't need to worry about things
+// like computed function pointers or strange arguments at call sites.
+// TODO: Try to unify this and the normal call bits for ARM, then try to unify
+// with X86.
+bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
+  CallingConv::ID CC = TLI.getLibcallCallingConv(Call);
+
+  // Handle *simple* calls for now.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  // Can't handle non-double multi-reg retvals.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i32) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, false));
+    if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+      return false;
+  }
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  Args.reserve(I->getNumOperands());
+  ArgRegs.reserve(I->getNumOperands());
+  ArgVTs.reserve(I->getNumOperands());
+  ArgFlags.reserve(I->getNumOperands());
+  for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+    Value *Op = I->getOperand(i);
+    unsigned Arg = getRegForValue(Op);
+    if (Arg == 0) return false;
+
+    Type *ArgTy = Op->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT)) return false;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(Op);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, false))
+    return false;
+
+  unsigned CalleeReg = 0;
+  if (EnableARMLongCalls) {
+    CalleeReg = getLibcallReg(TLI.getLibcallName(Call));
+    if (CalleeReg == 0) return false;
+  }
+
+  // Issue the call.
+  unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                    DL, TII.get(CallOpc));
+  // BL / BLX don't take a predicate, but tBL / tBLX do.
+  if (isThumb2)
+    AddDefaultPred(MIB);
+  if (EnableARMLongCalls)
+    MIB.addReg(CalleeReg);
+  else
+    MIB.addExternalSymbol(TLI.getLibcallName(Call));
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, false)) return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+bool ARMFastISel::SelectCall(const Instruction *I,
+                             const char *IntrMemName = 0) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm.
+  if (isa<InlineAsm>(Callee)) return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (CI->isTailCall()) return false;
+
+  // Check the calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  // TODO: Avoid some calling conventions?
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  bool isVarArg = FTy->isVarArg();
+
+  // Handle *simple* calls for now.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8  && RetVT != MVT::i1)
+    return false;
+
+  // Can't handle non-double multi-reg retvals.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i1 && RetVT != MVT::i8 &&
+      RetVT != MVT::i16 && RetVT != MVT::i32) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC, true, isVarArg));
+    if (RVLocs.size() >= 2 && RetVT != MVT::f64)
+      return false;
+  }
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  unsigned arg_size = CS.arg_size();
+  Args.reserve(arg_size);
+  ArgRegs.reserve(arg_size);
+  ArgVTs.reserve(arg_size);
+  ArgFlags.reserve(arg_size);
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    // If we're lowering a memory intrinsic instead of a regular call, skip the
+    // last two arguments, which shouldn't be passed to the underlying function.
+    if (IntrMemName && e-i <= 2)
+      break;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+    // FIXME: Only handle *easy* calls for now.
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
+        CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
+        CS.paramHasAttr(AttrInd, Attribute::ByVal))
+      return false;
+
+    Type *ArgTy = (*i)->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 &&
+        ArgVT != MVT::i1)
+      return false;
+
+    unsigned Arg = getRegForValue(*i);
+    if (Arg == 0)
+      return false;
+
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*i);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Handle the arguments now that we've gotten them.
+  SmallVector<unsigned, 4> RegArgs;
+  unsigned NumBytes;
+  if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, isVarArg))
+    return false;
+
+  bool UseReg = false;
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV || EnableARMLongCalls) UseReg = true;
+
+  unsigned CalleeReg = 0;
+  if (UseReg) {
+    if (IntrMemName)
+      CalleeReg = getLibcallReg(IntrMemName);
+    else
+      CalleeReg = getRegForValue(Callee);
+
+    if (CalleeReg == 0) return false;
+  }
+
+  // Issue the call.
+  unsigned CallOpc = ARMSelectCallOp(UseReg);
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                    DL, TII.get(CallOpc));
+
+  // ARM calls don't take a predicate, but tBL / tBLX do.
+  if(isThumb2)
+    AddDefaultPred(MIB);
+  if (UseReg)
+    MIB.addReg(CalleeReg);
+  else if (!IntrMemName)
+    MIB.addGlobalAddress(GV, 0, 0);
+  else
+    MIB.addExternalSymbol(IntrMemName, 0);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes, isVarArg))
+    return false;
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+bool ARMFastISel::ARMIsMemCpySmall(uint64_t Len) {
+  return Len <= 16;
+}
+
+bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
+                                        uint64_t Len, unsigned Alignment) {
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!ARMIsMemCpySmall(Len))
+    return false;
+
+  while (Len) {
+    MVT VT;
+    if (!Alignment || Alignment >= 4) {
+      if (Len >= 4)
+        VT = MVT::i32;
+      else if (Len >= 2)
+        VT = MVT::i16;
+      else {
+        assert (Len == 1 && "Expected a length of 1!");
+        VT = MVT::i8;
+      }
+    } else {
+      // Bound based on alignment.
+      if (Len >= 2 && Alignment == 2)
+        VT = MVT::i16;
+      else {
+        VT = MVT::i8;
+      }
+    }
+
+    bool RV;
+    unsigned ResultReg;
+    RV = ARMEmitLoad(VT, ResultReg, Src);
+    assert (RV == true && "Should be able to handle this load.");
+    RV = ARMEmitStore(VT, ResultReg, Dest);
+    assert (RV == true && "Should be able to handle this store.");
+    (void)RV;
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    Dest.Offset += Size;
+    Src.Offset += Size;
+  }
+
+  return true;
+}
+
+bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
+  // FIXME: Handle more intrinsics.
+  switch (I.getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::frameaddress: {
+    MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+    MFI->setFrameAddressIsTaken(true);
+
+    unsigned LdrOpc;
+    const TargetRegisterClass *RC;
+    if (isThumb2) {
+      LdrOpc =  ARM::t2LDRi12;
+      RC = (const TargetRegisterClass*)&ARM::tGPRRegClass;
+    } else {
+      LdrOpc =  ARM::LDRi12;
+      RC = (const TargetRegisterClass*)&ARM::GPRRegClass;
+    }
+
+    const ARMBaseRegisterInfo *RegInfo =
+          static_cast<const ARMBaseRegisterInfo*>(TM.getRegisterInfo());
+    unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+    unsigned SrcReg = FramePtr;
+
+    // Recursively load frame address
+    // ldr r0 [fp]
+    // ldr r0 [r0]
+    // ldr r0 [r0]
+    // ...
+    unsigned DestReg;
+    unsigned Depth = cast<ConstantInt>(I.getOperand(0))->getZExtValue();
+    while (Depth--) {
+      DestReg = createResultReg(RC);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(LdrOpc), DestReg)
+                      .addReg(SrcReg).addImm(0));
+      SrcReg = DestReg;
+    }
+    UpdateValueMap(&I, SrcReg);
+    return true;
+  }
+  case Intrinsic::memcpy:
+  case Intrinsic::memmove: {
+    const MemTransferInst &MTI = cast<MemTransferInst>(I);
+    // Don't handle volatile.
+    if (MTI.isVolatile())
+      return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress.  Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+      // Small memcpy's are common enough that we want to do them without a call
+      // if possible.
+      uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+      if (ARMIsMemCpySmall(Len)) {
+        Address Dest, Src;
+        if (!ARMComputeAddress(MTI.getRawDest(), Dest) ||
+            !ARMComputeAddress(MTI.getRawSource(), Src))
+          return false;
+        unsigned Alignment = MTI.getAlignment();
+        if (ARMTryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+          return true;
+      }
+    }
+
+    if (!MTI.getLength()->getType()->isIntegerTy(32))
+      return false;
+
+    if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+      return false;
+
+    const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+    return SelectCall(&I, IntrMemName);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+    // Don't handle volatile.
+    if (MSI.isVolatile())
+      return false;
+
+    if (!MSI.getLength()->getType()->isIntegerTy(32))
+      return false;
+
+    if (MSI.getDestAddressSpace() > 255)
+      return false;
+
+    return SelectCall(&I, "memset");
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(
+      Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP));
+    return true;
+  }
+  }
+}
+
+bool ARMFastISel::SelectTrunc(const Instruction *I) {
+  // The high bits for a type smaller than the register size are assumed to be
+  // undefined.
+  Value *Op = I->getOperand(0);
+
+  EVT SrcVT, DestVT;
+  SrcVT = TLI.getValueType(Op->getType(), true);
+  DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
+    return false;
+  if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Op);
+  if (!SrcReg) return false;
+
+  // Because the high bits are undefined, a truncate doesn't generate
+  // any code.
+  UpdateValueMap(I, SrcReg);
+  return true;
+}
+
+unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                    bool isZExt) {
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+    return 0;
+
+  unsigned Opc;
+  bool isBoolZext = false;
+  const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::i32);
+  switch (SrcVT.SimpleTy) {
+  default: return 0;
+  case MVT::i16:
+    if (!Subtarget->hasV6Ops()) return 0;
+    RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+    if (isZExt)
+      Opc = isThumb2 ? ARM::t2UXTH : ARM::UXTH;
+    else
+      Opc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
+    break;
+  case MVT::i8:
+    if (!Subtarget->hasV6Ops()) return 0;
+    RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
+    if (isZExt)
+      Opc = isThumb2 ? ARM::t2UXTB : ARM::UXTB;
+    else
+      Opc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
+    break;
+  case MVT::i1:
+    if (isZExt) {
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass;
+      Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
+      isBoolZext = true;
+      break;
+    }
+    return 0;
+  }
+
+  unsigned ResultReg = createResultReg(RC);
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+        .addReg(SrcReg);
+  if (isBoolZext)
+    MIB.addImm(1);
+  else
+    MIB.addImm(0);
+  AddOptionalDefs(MIB);
+  return ResultReg;
+}
+
+bool ARMFastISel::SelectIntExt(const Instruction *I) {
+  // On ARM, in general, integer casts don't involve legal types; this code
+  // handles promotable integers.
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool isZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg) return false;
+
+  EVT SrcEVT, DestEVT;
+  SrcEVT = TLI.getValueType(SrcTy, true);
+  DestEVT = TLI.getValueType(DestTy, true);
+  if (!SrcEVT.isSimple()) return false;
+  if (!DestEVT.isSimple()) return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+  unsigned ResultReg = ARMEmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
+  if (ResultReg == 0) return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool ARMFastISel::SelectShift(const Instruction *I,
+                              ARM_AM::ShiftOpc ShiftTy) {
+  // We handle thumb2 mode by target independent selector
+  // or SelectionDAG ISel.
+  if (isThumb2)
+    return false;
+
+  // Only handle i32 now.
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+  if (DestVT != MVT::i32)
+    return false;
+
+  unsigned Opc = ARM::MOVsr;
+  unsigned ShiftImm;
+  Value *Src2Value = I->getOperand(1);
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Src2Value)) {
+    ShiftImm = CI->getZExtValue();
+
+    // Fall back to selection DAG isel if the shift amount
+    // is zero or greater than the width of the value type.
+    if (ShiftImm == 0 || ShiftImm >=32)
+      return false;
+
+    Opc = ARM::MOVsi;
+  }
+
+  Value *Src1Value = I->getOperand(0);
+  unsigned Reg1 = getRegForValue(Src1Value);
+  if (Reg1 == 0) return false;
+
+  unsigned Reg2 = 0;
+  if (Opc == ARM::MOVsr) {
+    Reg2 = getRegForValue(Src2Value);
+    if (Reg2 == 0) return false;
+  }
+
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+  if(ResultReg == 0) return false;
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(Opc), ResultReg)
+                            .addReg(Reg1);
+
+  if (Opc == ARM::MOVsi)
+    MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, ShiftImm));
+  else if (Opc == ARM::MOVsr) {
+    MIB.addReg(Reg2);
+    MIB.addImm(ARM_AM::getSORegOpc(ShiftTy, 0));
+  }
+
+  AddOptionalDefs(MIB);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// TODO: SoftFP support.
+bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
+
+  switch (I->getOpcode()) {
+    case Instruction::Load:
+      return SelectLoad(I);
+    case Instruction::Store:
+      return SelectStore(I);
+    case Instruction::Br:
+      return SelectBranch(I);
+    case Instruction::IndirectBr:
+      return SelectIndirectBr(I);
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+      return SelectCmp(I);
+    case Instruction::FPExt:
+      return SelectFPExt(I);
+    case Instruction::FPTrunc:
+      return SelectFPTrunc(I);
+    case Instruction::SIToFP:
+      return SelectIToFP(I, /*isSigned*/ true);
+    case Instruction::UIToFP:
+      return SelectIToFP(I, /*isSigned*/ false);
+    case Instruction::FPToSI:
+      return SelectFPToI(I, /*isSigned*/ true);
+    case Instruction::FPToUI:
+      return SelectFPToI(I, /*isSigned*/ false);
+    case Instruction::Add:
+      return SelectBinaryIntOp(I, ISD::ADD);
+    case Instruction::Or:
+      return SelectBinaryIntOp(I, ISD::OR);
+    case Instruction::Sub:
+      return SelectBinaryIntOp(I, ISD::SUB);
+    case Instruction::FAdd:
+      return SelectBinaryFPOp(I, ISD::FADD);
+    case Instruction::FSub:
+      return SelectBinaryFPOp(I, ISD::FSUB);
+    case Instruction::FMul:
+      return SelectBinaryFPOp(I, ISD::FMUL);
+    case Instruction::SDiv:
+      return SelectDiv(I, /*isSigned*/ true);
+    case Instruction::UDiv:
+      return SelectDiv(I, /*isSigned*/ false);
+    case Instruction::SRem:
+      return SelectRem(I, /*isSigned*/ true);
+    case Instruction::URem:
+      return SelectRem(I, /*isSigned*/ false);
+    case Instruction::Call:
+      if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+        return SelectIntrinsicCall(*II);
+      return SelectCall(I);
+    case Instruction::Select:
+      return SelectSelect(I);
+    case Instruction::Ret:
+      return SelectRet(I);
+    case Instruction::Trunc:
+      return SelectTrunc(I);
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return SelectIntExt(I);
+    case Instruction::Shl:
+      return SelectShift(I, ARM_AM::lsl);
+    case Instruction::LShr:
+      return SelectShift(I, ARM_AM::lsr);
+    case Instruction::AShr:
+      return SelectShift(I, ARM_AM::asr);
+    default: break;
+  }
+  return false;
+}
+
+/// TryToFoldLoad - The specified machine instr operand is a vreg, and that
+/// vreg is being provided by the specified load instruction.  If possible,
+/// try to fold the load as an operand to the instruction, returning true if
+/// successful.
+bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
+                                const LoadInst *LI) {
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(LI->getType(), VT))
+    return false;
+
+  // Combine load followed by zero- or sign-extend.
+  // ldrb r1, [r0]       ldrb r1, [r0]
+  // uxtb r2, r1     =>
+  // mov  r3, r2         mov  r3, r1
+  bool isZExt = true;
+  switch(MI->getOpcode()) {
+    default: return false;
+    case ARM::SXTH:
+    case ARM::t2SXTH:
+      isZExt = false;
+    case ARM::UXTH:
+    case ARM::t2UXTH:
+      if (VT != MVT::i16)
+        return false;
+    break;
+    case ARM::SXTB:
+    case ARM::t2SXTB:
+      isZExt = false;
+    case ARM::UXTB:
+    case ARM::t2UXTB:
+      if (VT != MVT::i8)
+        return false;
+    break;
+  }
+  // See if we can handle this address.
+  Address Addr;
+  if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
+
+  unsigned ResultReg = MI->getOperand(0).getReg();
+  if (!ARMEmitLoad(VT, ResultReg, Addr, LI->getAlignment(), isZExt, false))
+    return false;
+  MI->eraseFromParent();
+  return true;
+}
+
+unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
+                                     unsigned Align, MVT VT) {
+  bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
+  ARMConstantPoolConstant *CPV =
+    ARMConstantPoolConstant::Create(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
+  unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+
+  unsigned Opc;
+  unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT));
+  // Load value.
+  if (isThumb2) {
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                            TII.get(ARM::t2LDRpci), DestReg1)
+                    .addConstantPoolIndex(Idx));
+    Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs;
+  } else {
+    // The extra immediate is for addrmode2.
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                            DL, TII.get(ARM::LDRcp), DestReg1)
+                    .addConstantPoolIndex(Idx).addImm(0));
+    Opc = UseGOTOFF ? ARM::ADDrr : ARM::LDRrs;
+  }
+
+  unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+  if (GlobalBaseReg == 0) {
+    GlobalBaseReg = MRI.createVirtualRegister(TLI.getRegClassFor(VT));
+    AFI->setGlobalBaseReg(GlobalBaseReg);
+  }
+
+  unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT));
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                                    DL, TII.get(Opc), DestReg2)
+                            .addReg(DestReg1)
+                            .addReg(GlobalBaseReg);
+  if (!UseGOTOFF)
+    MIB.addImm(0);
+  AddOptionalDefs(MIB);
+
+  return DestReg2;
+}
+
+bool ARMFastISel::FastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  CallingConv::ID CC = F->getCallingConv();
+  switch (CC) {
+  default:
+    return false;
+  case CallingConv::Fast:
+  case CallingConv::C:
+  case CallingConv::ARM_AAPCS_VFP:
+  case CallingConv::ARM_AAPCS:
+  case CallingConv::ARM_APCS:
+    break;
+  }
+
+  // Only handle simple cases. i.e. Up to 4 i8/i16/i32 scalar arguments
+  // which are passed in r0 - r3.
+  unsigned Idx = 1;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    if (Idx > 4)
+      return false;
+
+    if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::ByVal))
+      return false;
+
+    Type *ArgTy = I->getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(ArgTy);
+    if (!ArgVT.isSimple()) return false;
+    switch (ArgVT.getSimpleVT().SimpleTy) {
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      break;
+    default:
+      return false;
+    }
+  }
+
+
+  static const uint16_t GPRArgRegs[] = {
+    ARM::R0, ARM::R1, ARM::R2, ARM::R3
+  };
+
+  const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::i32);
+  Idx = 0;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    if (I->use_empty())
+      continue;
+    unsigned SrcReg = GPRArgRegs[Idx];
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(DstReg, getKillRegState(true));
+    UpdateValueMap(I, ResultReg);
+  }
+
+  return true;
+}
+
+namespace llvm {
+  FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
+                                const TargetLibraryInfo *libInfo) {
+    // Completely untested on non-iOS.
+    const TargetMachine &TM = funcInfo.MF->getTarget();
+
+    // Darwin and thumb1 only for now.
+    const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only())
+      return new ARMFastISel(funcInfo, libInfo);
+    return 0;
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
new file mode 100644
index 000000000000..7a02adf24633
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -0,0 +1,1430 @@
+//===-- ARMFrameLowering.cpp - ARM Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMFrameLowering.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true),
+                     cl::desc("Align ARM NEON spills in prolog and epilog"));
+
+static MachineBasicBlock::iterator
+skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
+                        unsigned NumAlignedDPRCS2Regs);
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+
+  // iOS requires FP not to be clobbered for backtracing purpose.
+  if (STI.isTargetIOS())
+    return true;
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Always eliminate non-leaf frame pointers.
+  return ((MF.getTarget().Options.DisableFramePointerElim(MF) &&
+           MFI->hasCalls()) ||
+          RegInfo->needsStackRealignment(MF) ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function.  This eliminates the need for
+/// add/sub sp brackets around call sites.  Returns true if the call frame is
+/// included as part of the stack frame.
+bool ARMFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offset to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even makes it impossible to scavenge a register.
+  if (CFSize >= ((1 << 12) - 1) / 2)  // Half of imm12
+    return false;
+
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified.  Unlike most targets, having a FP
+/// is not sufficient here since we still may reference some objects via SP
+/// even when FP is available in Thumb2 mode.
+bool
+ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+static bool isCSRestore(MachineInstr *MI,
+                        const ARMBaseInstrInfo &TII,
+                        const uint16_t *CSRegs) {
+  // Integer spill area is handled with "pop".
+  if (MI->getOpcode() == ARM::LDMIA_RET ||
+      MI->getOpcode() == ARM::t2LDMIA_RET ||
+      MI->getOpcode() == ARM::LDMIA_UPD ||
+      MI->getOpcode() == ARM::t2LDMIA_UPD ||
+      MI->getOpcode() == ARM::VLDMDIA_UPD) {
+    // The first two operands are predicates. The last two are
+    // imp-def and imp-use of SP. Check everything in between.
+    for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
+      if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+        return false;
+    return true;
+  }
+  if ((MI->getOpcode() == ARM::LDR_POST_IMM ||
+       MI->getOpcode() == ARM::LDR_POST_REG ||
+       MI->getOpcode() == ARM::t2LDR_POST) &&
+      isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) &&
+      MI->getOperand(1).getReg() == ARM::SP)
+    return true;
+
+  return false;
+}
+
+static void
+emitSPUpdate(bool isARM,
+             MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+             DebugLoc dl, const ARMBaseInstrInfo &TII,
+             int NumBytes, unsigned MIFlags = MachineInstr::NoFlags,
+             ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+  if (isARM)
+    emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+                            Pred, PredReg, TII, MIFlags);
+  else
+    emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+                           Pred, PredReg, TII, MIFlags);
+}
+
+void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitPrologue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  unsigned NumBytes = MFI->getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // Determine the sizes of each callee-save spill areas and record which frame
+  // belongs to which callee-save spill areas.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+  int D8SpillFI = 0;
+
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
+  // Allocate the vararg register save area. This is not counted in NumBytes.
+  if (VARegSaveSize)
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize,
+                 MachineInstr::FrameSetup);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
+                   MachineInstr::FrameSetup);
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      AFI->addGPRCalleeSavedArea1Frame(FI);
+      GPRCS1Size += 4;
+      break;
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      if (STI.isTargetIOS()) {
+        AFI->addGPRCalleeSavedArea2Frame(FI);
+        GPRCS2Size += 4;
+      } else {
+        AFI->addGPRCalleeSavedArea1Frame(FI);
+        GPRCS1Size += 4;
+      }
+      break;
+    default:
+      // This is a DPR. Exclude the aligned DPRCS2 spills.
+      if (Reg == ARM::D8)
+        D8SpillFI = FI;
+      if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) {
+        AFI->addDPRCalleeSavedAreaFrame(FI);
+        DPRCSSize += 8;
+      }
+    }
+  }
+
+  // Move past area 1.
+  if (GPRCS1Size > 0) MBBI++;
+
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For iOS, FP is R7, which has now been stored in spill area 1.
+  // Otherwise, if this is not iOS, all the callee-saved registers go
+  // into spill area 1, including the FP in R11.  In either case, it is
+  // now safe to emit this assignment.
+  bool HasFP = hasFP(MF);
+  if (HasFP) {
+    unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
+      .addFrameIndex(FramePtrSpillFI).addImm(0)
+      .setMIFlag(MachineInstr::FrameSetup);
+    AddDefaultCC(AddDefaultPred(MIB));
+  }
+
+  // Move past area 2.
+  if (GPRCS2Size > 0) MBBI++;
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  if (HasFP)
+    AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
+                                NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+  // Move past area 3.
+  if (DPRCSSize > 0) {
+    MBBI++;
+    // Since vpush register list cannot have gaps, there may be multiple vpush
+    // instructions in the prologue.
+    while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
+      MBBI++;
+  }
+
+  // Move past the aligned DPRCS2 area.
+  if (AFI->getNumAlignedDPRCS2Regs() > 0) {
+    MBBI = skipAlignedDPRCS2Spills(MBBI, AFI->getNumAlignedDPRCS2Regs());
+    // The code inserted by emitAlignedDPRCS2Spills realigns the stack, and
+    // leaves the stack pointer pointing to the DPRCS2 area.
+    //
+    // Adjust NumBytes to represent the stack slots below the DPRCS2 area.
+    NumBytes += MFI->getObjectOffset(D8SpillFI);
+  } else
+    NumBytes = DPRCSOffset;
+
+  if (NumBytes) {
+    // Adjust SP after all the callee-save spills.
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
+                 MachineInstr::FrameSetup);
+    if (HasFP && isARM)
+      // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
+      // Note it's not safe to do this in Thumb2 mode because it would have
+      // taken two instructions:
+      // mov sp, r7
+      // sub sp, #24
+      // If an interrupt is taken between the two instructions, then sp is in
+      // an inconsistent state (pointing to the middle of callee-saved area).
+      // The interrupt handler can end up clobbering the registers.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  if (STI.isTargetELF() && hasFP(MF))
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // If we need dynamic stack realignment, do it here. Be paranoid and make
+  // sure if we also have VLAs, we have a base pointer for frame access.
+  // If aligned NEON registers were spilled, the stack has already been
+  // realigned.
+  if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
+    unsigned MaxAlign = MFI->getMaxAlignment();
+    assert (!AFI->isThumb1OnlyFunction());
+    if (!AFI->isThumbFunction()) {
+      // Emit bic sp, sp, MaxAlign
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::BICri), ARM::SP)
+                                  .addReg(ARM::SP, RegState::Kill)
+                                  .addImm(MaxAlign-1)));
+    } else {
+      // We cannot use sp as source/dest register here, thus we're emitting the
+      // following sequence:
+      // mov r4, sp
+      // bic r4, r4, MaxAlign
+      // mov sp, r4
+      // FIXME: It will be better just to find spare register here.
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+        .addReg(ARM::SP, RegState::Kill));
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::t2BICri), ARM::R4)
+                                  .addReg(ARM::R4, RegState::Kill)
+                                  .addImm(MaxAlign-1)));
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+        .addReg(ARM::R4, RegState::Kill));
+    }
+
+    AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  // FIXME: Clarify FrameSetup flags here.
+  if (RegInfo->hasBasePointer(MF)) {
+    if (isARM)
+      BuildMI(MBB, MBBI, dl,
+              TII.get(ARM::MOVr), RegInfo->getBaseRegister())
+        .addReg(ARM::SP)
+        .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+    else
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                             RegInfo->getBaseRegister())
+        .addReg(ARM::SP));
+  }
+
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI->hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
+
+void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc dl = MBBI->getDebugLoc();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitEpilogue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  int NumBytes = (int)MFI->getStackSize();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // All calls are tail calls in GHC calling conv, and functions have no
+  // prologue/epilogue.
+  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
+    return;
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+      if (!isCSRestore(MBBI, TII, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize());
+
+    // Reset SP based on frame pointer only if the stack frame extends beyond
+    // frame pointer stack slot or target is ELF and the function has FP.
+    if (AFI->shouldRestoreSPFromFP()) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      if (NumBytes) {
+        if (isARM)
+          emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+                                  ARMCC::AL, 0, TII);
+        else {
+          // It's not possible to restore SP from FP in a single instruction.
+          // For iOS, this looks like:
+          // mov sp, r7
+          // sub sp, #24
+          // This is bad, if an interrupt is taken after the mov, sp is in an
+          // inconsistent state.
+          // Use the first callee-saved register as a scratch register.
+          assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+                 "No scratch register to restore SP from FP!");
+          emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
+                                 ARMCC::AL, 0, TII);
+          AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                                 ARM::SP)
+            .addReg(ARM::R4));
+        }
+      } else {
+        // Thumb2 or ARM.
+        if (isARM)
+          BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
+            .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+        else
+          AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                                 ARM::SP)
+            .addReg(FramePtr));
+      }
+    } else if (NumBytes)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+
+    // Increment past our save areas.
+    if (AFI->getDPRCalleeSavedAreaSize()) {
+      MBBI++;
+      // Since vpop register list cannot have gaps, there may be multiple vpop
+      // instructions in the epilogue.
+      while (MBBI->getOpcode() == ARM::VLDMDIA_UPD)
+        MBBI++;
+    }
+    if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
+    if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
+  }
+
+  if (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri) {
+    // Tail call return: adjust the stack pointer and jump to callee.
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+
+    // Jump to label or value in register.
+    if (RetOpcode == ARM::TCRETURNdi) {
+      unsigned TCOpcode = STI.isThumb() ?
+               (STI.isTargetIOS() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) :
+               ARM::TAILJMPd;
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode));
+      if (JumpTarget.isGlobal())
+        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+                             JumpTarget.getTargetFlags());
+      else {
+        assert(JumpTarget.isSymbol());
+        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+                              JumpTarget.getTargetFlags());
+      }
+
+      // Add the default predicate in Thumb mode.
+      if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0);
+    } else if (RetOpcode == ARM::TCRETURNri) {
+      BuildMI(MBB, MBBI, dl,
+              TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)).
+        addReg(JumpTarget.getReg(), RegState::Kill);
+    }
+
+    MachineInstr *NewMI = prior(MBBI);
+    for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i)
+      NewMI->addOperand(MBBI->getOperand(i));
+
+    // Delete the pseudo instruction TCRETURN.
+    MBB.erase(MBBI);
+    MBBI = NewMI;
+  }
+
+  if (VARegSaveSize)
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize);
+}
+
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info.  It's the same as what we use for resolving the code-gen
+/// references for now.  FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int
+ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                         unsigned &FrameReg) const {
+  return ResolveFrameIndexReference(MF, FI, FrameReg, 0);
+}
+
+int
+ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
+                                             int FI, unsigned &FrameReg,
+                                             int SPAdj) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
+  int FPOffset = Offset - AFI->getFramePtrSpillOffset();
+  bool isFixed = MFI->isFixedObjectIndex(FI);
+
+  FrameReg = ARM::SP;
+  Offset += SPAdj;
+  if (AFI->isGPRCalleeSavedArea1Frame(FI))
+    return Offset - AFI->getGPRCalleeSavedArea1Offset();
+  else if (AFI->isGPRCalleeSavedArea2Frame(FI))
+    return Offset - AFI->getGPRCalleeSavedArea2Offset();
+  else if (AFI->isDPRCalleeSavedAreaFrame(FI))
+    return Offset - AFI->getDPRCalleeSavedAreaOffset();
+
+  // SP can move around if there are allocas.  We may also lose track of SP
+  // when emergency spilling inside a non-reserved call frame setup.
+  bool hasMovingSP = !hasReservedCallFrame(MF);
+
+  // When dynamically realigning the stack, use the frame pointer for
+  // parameters, and the stack/base pointer for locals.
+  if (RegInfo->needsStackRealignment(MF)) {
+    assert (hasFP(MF) && "dynamic stack realignment without a FP!");
+    if (isFixed) {
+      FrameReg = RegInfo->getFrameRegister(MF);
+      Offset = FPOffset;
+    } else if (hasMovingSP) {
+      assert(RegInfo->hasBasePointer(MF) &&
+             "VLAs and dynamic stack alignment, but missing base pointer!");
+      FrameReg = RegInfo->getBaseRegister();
+    }
+    return Offset;
+  }
+
+  // If there is a frame pointer, use it when we can.
+  if (hasFP(MF) && AFI->hasStackFrame()) {
+    // Use frame pointer to reference fixed objects. Use it for locals if
+    // there are VLAs (and thus the SP isn't reliable as a base).
+    if (isFixed || (hasMovingSP && !RegInfo->hasBasePointer(MF))) {
+      FrameReg = RegInfo->getFrameRegister(MF);
+      return FPOffset;
+    } else if (hasMovingSP) {
+      assert(RegInfo->hasBasePointer(MF) && "missing base pointer!");
+      if (AFI->isThumb2Function()) {
+        // Try to use the frame pointer if we can, else use the base pointer
+        // since it's available. This is handy for the emergency spill slot, in
+        // particular.
+        if (FPOffset >= -255 && FPOffset < 0) {
+          FrameReg = RegInfo->getFrameRegister(MF);
+          return FPOffset;
+        }
+      }
+    } else if (AFI->isThumb2Function()) {
+      // Use  add <rd>, sp, #<imm8>
+      //      ldr <rd>, [sp, #<imm8>]
+      // if at all possible to save space.
+      if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020)
+        return Offset;
+      // In Thumb2 mode, the negative offset is very limited. Try to avoid
+      // out of range references. ldr <rt>,[<rn>, #-<imm8>]
+      if (FPOffset >= -255 && FPOffset < 0) {
+        FrameReg = RegInfo->getFrameRegister(MF);
+        return FPOffset;
+      }
+    } else if (Offset > (FPOffset < 0 ? -FPOffset : FPOffset)) {
+      // Otherwise, use SP or FP, whichever is closer to the stack slot.
+      FrameReg = RegInfo->getFrameRegister(MF);
+      return FPOffset;
+    }
+  }
+  // Use the base pointer if we have one.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  return Offset;
+}
+
+int ARMFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                          int FI) const {
+  unsigned FrameReg;
+  return getFrameIndexReference(MF, FI, FrameReg);
+}
+
+void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    const std::vector<CalleeSavedInfo> &CSI,
+                                    unsigned StmOpc, unsigned StrOpc,
+                                    bool NoGap,
+                                    bool(*Func)(unsigned, bool),
+                                    unsigned NumAlignedDPRCS2Regs,
+                                    unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  SmallVector<std::pair<unsigned,bool>, 4> Regs;
+  unsigned i = CSI.size();
+  while (i != 0) {
+    unsigned LastReg = 0;
+    for (; i != 0; --i) {
+      unsigned Reg = CSI[i-1].getReg();
+      if (!(Func)(Reg, STI.isTargetIOS())) continue;
+
+      // D-registers in the aligned area DPRCS2 are NOT spilled here.
+      if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
+        continue;
+
+      // Add the callee-saved register as live-in unless it's LR and
+      // @llvm.returnaddress is called. If LR is returned for
+      // @llvm.returnaddress then it's already added to the function and
+      // entry block live-in sets.
+      bool isKill = true;
+      if (Reg == ARM::LR) {
+        if (MF.getFrameInfo()->isReturnAddressTaken() &&
+            MF.getRegInfo().isLiveIn(Reg))
+          isKill = false;
+      }
+
+      if (isKill)
+        MBB.addLiveIn(Reg);
+
+      // If NoGap is true, push consecutive registers and then leave the rest
+      // for other instructions. e.g.
+      // vpush {d8, d10, d11} -> vpush {d8}, vpush {d10, d11}
+      if (NoGap && LastReg && LastReg != Reg-1)
+        break;
+      LastReg = Reg;
+      Regs.push_back(std::make_pair(Reg, isKill));
+    }
+
+    if (Regs.empty())
+      continue;
+    if (Regs.size() > 1 || StrOpc== 0) {
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
+                       .addReg(ARM::SP).setMIFlags(MIFlags));
+      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
+        MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second));
+    } else if (Regs.size() == 1) {
+      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc),
+                                        ARM::SP)
+        .addReg(Regs[0].first, getKillRegState(Regs[0].second))
+        .addReg(ARM::SP).setMIFlags(MIFlags)
+        .addImm(-4);
+      AddDefaultPred(MIB);
+    }
+    Regs.clear();
+  }
+}
+
+void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   unsigned LdmOpc, unsigned LdrOpc,
+                                   bool isVarArg, bool NoGap,
+                                   bool(*Func)(unsigned, bool),
+                                   unsigned NumAlignedDPRCS2Regs) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned RetOpcode = MI->getOpcode();
+  bool isTailCall = (RetOpcode == ARM::TCRETURNdi ||
+                     RetOpcode == ARM::TCRETURNri);
+
+  SmallVector<unsigned, 4> Regs;
+  unsigned i = CSI.size();
+  while (i != 0) {
+    unsigned LastReg = 0;
+    bool DeleteRet = false;
+    for (; i != 0; --i) {
+      unsigned Reg = CSI[i-1].getReg();
+      if (!(Func)(Reg, STI.isTargetIOS())) continue;
+
+      // The aligned reloads from area DPRCS2 are not inserted here.
+      if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
+        continue;
+
+      if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) {
+        Reg = ARM::PC;
+        LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
+        // Fold the return instruction into the LDM.
+        DeleteRet = true;
+      }
+
+      // If NoGap is true, pop consecutive registers and then leave the rest
+      // for other instructions. e.g.
+      // vpop {d8, d10, d11} -> vpop {d8}, vpop {d10, d11}
+      if (NoGap && LastReg && LastReg != Reg-1)
+        break;
+
+      LastReg = Reg;
+      Regs.push_back(Reg);
+    }
+
+    if (Regs.empty())
+      continue;
+    if (Regs.size() > 1 || LdrOpc == 0) {
+      MachineInstrBuilder MIB =
+        AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
+                       .addReg(ARM::SP));
+      for (unsigned i = 0, e = Regs.size(); i < e; ++i)
+        MIB.addReg(Regs[i], getDefRegState(true));
+      if (DeleteRet) {
+        MIB.copyImplicitOps(&*MI);
+        MI->eraseFromParent();
+      }
+      MI = MIB;
+    } else if (Regs.size() == 1) {
+      // If we adjusted the reg to PC from LR above, switch it back here. We
+      // only do that for LDM.
+      if (Regs[0] == ARM::PC)
+        Regs[0] = ARM::LR;
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0])
+          .addReg(ARM::SP, RegState::Define)
+          .addReg(ARM::SP);
+      // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
+      // that refactoring is complete (eventually).
+      if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) {
+        MIB.addReg(0);
+        MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
+      } else
+        MIB.addImm(4);
+      AddDefaultPred(MIB);
+    }
+    Regs.clear();
+  }
+}
+
+/// Emit aligned spill instructions for NumAlignedDPRCS2Regs D-registers
+/// starting from d8.  Also insert stack realignment code and leave the stack
+/// pointer pointing to the d8 spill slot.
+static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned NumAlignedDPRCS2Regs,
+                                    const std::vector<CalleeSavedInfo> &CSI,
+                                    const TargetRegisterInfo *TRI) {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  // Mark the D-register spill slots as properly aligned.  Since MFI computes
+  // stack slot layout backwards, this can actually mean that the d-reg stack
+  // slot offsets can be wrong. The offset for d8 will always be correct.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned DNum = CSI[i].getReg() - ARM::D8;
+    if (DNum >= 8)
+      continue;
+    int FI = CSI[i].getFrameIdx();
+    // The even-numbered registers will be 16-byte aligned, the odd-numbered
+    // registers will be 8-byte aligned.
+    MFI.setObjectAlignment(FI, DNum % 2 ? 8 : 16);
+
+    // The stack slot for D8 needs to be maximally aligned because this is
+    // actually the point where we align the stack pointer.  MachineFrameInfo
+    // computes all offsets relative to the incoming stack pointer which is a
+    // bit weird when realigning the stack.  Any extra padding for this
+    // over-alignment is not realized because the code inserted below adjusts
+    // the stack pointer by numregs * 8 before aligning the stack pointer.
+    if (DNum == 0)
+      MFI.setObjectAlignment(FI, MFI.getMaxAlignment());
+  }
+
+  // Move the stack pointer to the d8 spill slot, and align it at the same
+  // time. Leave the stack slot address in the scratch register r4.
+  //
+  //   sub r4, sp, #numregs * 8
+  //   bic r4, r4, #align - 1
+  //   mov sp, r4
+  //
+  bool isThumb = AFI->isThumbFunction();
+  assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
+  AFI->setShouldRestoreSPFromFP(true);
+
+  // sub r4, sp, #numregs * 8
+  // The immediate is <= 64, so it doesn't need any special encoding.
+  unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
+  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+                              .addReg(ARM::SP)
+                              .addImm(8 * NumAlignedDPRCS2Regs)));
+
+  // bic r4, r4, #align-1
+  Opc = isThumb ? ARM::t2BICri : ARM::BICri;
+  unsigned MaxAlign = MF.getFrameInfo()->getMaxAlignment();
+  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+                              .addReg(ARM::R4, RegState::Kill)
+                              .addImm(MaxAlign - 1)));
+
+  // mov sp, r4
+  // The stack pointer must be adjusted before spilling anything, otherwise
+  // the stack slots could be clobbered by an interrupt handler.
+  // Leave r4 live, it is used below.
+  Opc = isThumb ? ARM::tMOVr : ARM::MOVr;
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP)
+                            .addReg(ARM::R4);
+  MIB = AddDefaultPred(MIB);
+  if (!isThumb)
+    AddDefaultCC(MIB);
+
+  // Now spill NumAlignedDPRCS2Regs registers starting from d8.
+  // r4 holds the stack slot address.
+  unsigned NextReg = ARM::D8;
+
+  // 16-byte aligned vst1.64 with 4 d-regs and address writeback.
+  // The writeback is only needed when emitting two vst1.64 instructions.
+  if (NumAlignedDPRCS2Regs >= 6) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    MBB.addLiveIn(SupReg);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed),
+                           ARM::R4)
+                   .addReg(ARM::R4, RegState::Kill).addImm(16)
+                   .addReg(NextReg)
+                   .addReg(SupReg, RegState::ImplicitKill));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // We won't modify r4 beyond this point.  It currently points to the next
+  // register to be spilled.
+  unsigned R4BaseReg = NextReg;
+
+  // 16-byte aligned vst1.64 with 4 d-regs, no writeback.
+  if (NumAlignedDPRCS2Regs >= 4) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    MBB.addLiveIn(SupReg);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
+                   .addReg(ARM::R4).addImm(16).addReg(NextReg)
+                   .addReg(SupReg, RegState::ImplicitKill));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // 16-byte aligned vst1.64 with 2 d-regs.
+  if (NumAlignedDPRCS2Regs >= 2) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QPRRegClass);
+    MBB.addLiveIn(SupReg);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
+                   .addReg(ARM::R4).addImm(16).addReg(SupReg));
+    NextReg += 2;
+    NumAlignedDPRCS2Regs -= 2;
+  }
+
+  // Finally, use a vanilla vstr.64 for the odd last register.
+  if (NumAlignedDPRCS2Regs) {
+    MBB.addLiveIn(NextReg);
+    // vstr.64 uses addrmode5 which has an offset scale of 4.
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
+                   .addReg(NextReg)
+                   .addReg(ARM::R4).addImm((NextReg-R4BaseReg)*2));
+  }
+
+  // The last spill instruction inserted should kill the scratch register r4.
+  llvm::prior(MI)->addRegisterKilled(ARM::R4, TRI);
+}
+
+/// Skip past the code inserted by emitAlignedDPRCS2Spills, and return an
+/// iterator to the following instruction.
+static MachineBasicBlock::iterator
+skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
+                        unsigned NumAlignedDPRCS2Regs) {
+  //   sub r4, sp, #numregs * 8
+  //   bic r4, r4, #align - 1
+  //   mov sp, r4
+  ++MI; ++MI; ++MI;
+  assert(MI->mayStore() && "Expecting spill instruction");
+
+  // These switches all fall through.
+  switch(NumAlignedDPRCS2Regs) {
+  case 7:
+    ++MI;
+    assert(MI->mayStore() && "Expecting spill instruction");
+  default:
+    ++MI;
+    assert(MI->mayStore() && "Expecting spill instruction");
+  case 1:
+  case 2:
+  case 4:
+    assert(MI->killsRegister(ARM::R4) && "Missed kill flag");
+    ++MI;
+  }
+  return MI;
+}
+
+/// Emit aligned reload instructions for NumAlignedDPRCS2Regs D-registers
+/// starting from d8.  These instructions are assumed to execute while the
+/// stack is still aligned, unlike the code inserted by emitPopInst.
+static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      unsigned NumAlignedDPRCS2Regs,
+                                      const std::vector<CalleeSavedInfo> &CSI,
+                                      const TargetRegisterInfo *TRI) {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  // Find the frame index assigned to d8.
+  int D8SpillFI = 0;
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+    if (CSI[i].getReg() == ARM::D8) {
+      D8SpillFI = CSI[i].getFrameIdx();
+      break;
+    }
+
+  // Materialize the address of the d8 spill slot into the scratch register r4.
+  // This can be fairly complicated if the stack frame is large, so just use
+  // the normal frame index elimination mechanism to do it.  This code runs as
+  // the initial part of the epilog where the stack and base pointers haven't
+  // been changed yet.
+  bool isThumb = AFI->isThumbFunction();
+  assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
+
+  unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
+  AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+                              .addFrameIndex(D8SpillFI).addImm(0)));
+
+  // Now restore NumAlignedDPRCS2Regs registers starting from d8.
+  unsigned NextReg = ARM::D8;
+
+  // 16-byte aligned vld1.64 with 4 d-regs and writeback.
+  if (NumAlignedDPRCS2Regs >= 6) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
+                   .addReg(ARM::R4, RegState::Define)
+                   .addReg(ARM::R4, RegState::Kill).addImm(16)
+                   .addReg(SupReg, RegState::ImplicitDefine));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // We won't modify r4 beyond this point.  It currently points to the next
+  // register to be spilled.
+  unsigned R4BaseReg = NextReg;
+
+  // 16-byte aligned vld1.64 with 4 d-regs, no writeback.
+  if (NumAlignedDPRCS2Regs >= 4) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QQPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
+                   .addReg(ARM::R4).addImm(16)
+                   .addReg(SupReg, RegState::ImplicitDefine));
+    NextReg += 4;
+    NumAlignedDPRCS2Regs -= 4;
+  }
+
+  // 16-byte aligned vld1.64 with 2 d-regs.
+  if (NumAlignedDPRCS2Regs >= 2) {
+    unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
+                                               &ARM::QPRRegClass);
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
+                   .addReg(ARM::R4).addImm(16));
+    NextReg += 2;
+    NumAlignedDPRCS2Regs -= 2;
+  }
+
+  // Finally, use a vanilla vldr.64 for the remaining odd register.
+  if (NumAlignedDPRCS2Regs)
+    AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
+                   .addReg(ARM::R4).addImm(2*(NextReg-R4BaseReg)));
+
+  // Last store kills r4.
+  llvm::prior(MI)->addRegisterKilled(ARM::R4, TRI);
+}
+
+bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD;
+  unsigned PushOneOpc = AFI->isThumbFunction() ?
+    ARM::t2STR_PRE : ARM::STR_PRE_IMM;
+  unsigned FltOpc = ARM::VSTMDDB_UPD;
+  unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0,
+               MachineInstr::FrameSetup);
+  emitPushInst(MBB, MI, CSI, FltOpc, 0, true, &isARMArea3Register,
+               NumAlignedDPRCS2Regs, MachineInstr::FrameSetup);
+
+  // The code above does not insert spill code for the aligned DPRCS2 registers.
+  // The stack realignment code will be inserted between the push instructions
+  // and these spills.
+  if (NumAlignedDPRCS2Regs)
+    emitAlignedDPRCS2Spills(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);
+
+  return true;
+}
+
+bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
+  unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
+
+  // The emitPopInst calls below do not insert reloads for the aligned DPRCS2
+  // registers. Do that here instead.
+  if (NumAlignedDPRCS2Regs)
+    emitAlignedDPRCS2Restores(MBB, MI, NumAlignedDPRCS2Regs, CSI, TRI);
+
+  unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
+  unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM;
+  unsigned FltOpc = ARM::VLDMDIA_UPD;
+  emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register,
+              NumAlignedDPRCS2Regs);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+              &isARMArea2Register, 0);
+  emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
+              &isARMArea1Register, 0);
+
+  return true;
+}
+
+// FIXME: Make generic?
+static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
+                                       const ARMBaseInstrInfo &TII) {
+  unsigned FnSize = 0;
+  for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end();
+       MBBI != E; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+    for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end();
+         I != E; ++I)
+      FnSize += TII.GetInstSizeInBytes(I);
+  }
+  return FnSize;
+}
+
+/// estimateRSStackSizeLimit - Look at each instruction that references stack
+/// frames and return the stack size limit beyond which some of these
+/// instructions will require a scratch register during their expansion later.
+// FIXME: Move to TII?
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
+                                         const TargetFrameLowering *TFI) {
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned Limit = (1 << 12) - 1;
+  for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+        if (!I->getOperand(i).isFI()) continue;
+
+        // When using ADDri to get the address of a stack object, 255 is the
+        // largest offset guaranteed to fit in the immediate offset.
+        if (I->getOpcode() == ARM::ADDri) {
+          Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        }
+
+        // Otherwise check the addressing mode.
+        switch (I->getDesc().TSFlags & ARMII::AddrModeMask) {
+        case ARMII::AddrMode3:
+        case ARMII::AddrModeT2_i8:
+          Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        case ARMII::AddrMode5:
+        case ARMII::AddrModeT2_i8s4:
+          Limit = std::min(Limit, ((1U << 8) - 1) * 4);
+          break;
+        case ARMII::AddrModeT2_i12:
+          // i12 supports only positive offset so these will be converted to
+          // i8 opcodes. See llvm::rewriteT2FrameIndex.
+          if (TFI->hasFP(MF) && AFI->hasStackFrame())
+            Limit = std::min(Limit, (1U << 8) - 1);
+          break;
+        case ARMII::AddrMode4:
+        case ARMII::AddrMode6:
+          // Addressing modes 4 & 6 (load/store) instructions can't encode an
+          // immediate offset for stack references.
+          return 0;
+        default:
+          break;
+        }
+        break; // At most one FI per instruction
+      }
+    }
+  }
+
+  return Limit;
+}
+
+// In functions that realign the stack, it can be an advantage to spill the
+// callee-saved vector registers after realigning the stack. The vst1 and vld1
+// instructions take alignment hints that can improve performance.
+//
+static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
+  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(0);
+  if (!SpillAlignedNEONRegs)
+    return;
+
+  // Naked functions don't spill callee-saved registers.
+  if (MF.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                                     Attribute::Naked))
+    return;
+
+  // We are planning to use NEON instructions vst1 / vld1.
+  if (!MF.getTarget().getSubtarget<ARMSubtarget>().hasNEON())
+    return;
+
+  // Don't bother if the default stack alignment is sufficiently high.
+  if (MF.getTarget().getFrameLowering()->getStackAlignment() >= 8)
+    return;
+
+  // Aligned spills require stack realignment.
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  if (!RegInfo->canRealignStack(MF))
+    return;
+
+  // We always spill contiguous d-registers starting from d8. Count how many
+  // needs spilling.  The register allocator will almost always use the
+  // callee-saved registers in order, but it can happen that there are holes in
+  // the range.  Registers above the hole will be spilled to the standard DPRCS
+  // area.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned NumSpills = 0;
+  for (; NumSpills < 8; ++NumSpills)
+    if (!MRI.isPhysRegUsed(ARM::D8 + NumSpills))
+      break;
+
+  // Don't do this for just one d-register. It's not worth it.
+  if (NumSpills < 2)
+    return;
+
+  // Spill the first NumSpills D-registers after realigning the stack.
+  MF.getInfo<ARMFunctionInfo>()->setNumAlignedDPRCS2Regs(NumSpills);
+
+  // A scratch register is required for the vst1 / vld1 instructions.
+  MF.getRegInfo().setPhysRegUsed(ARM::R4);
+}
+
+void
+ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  // This tells PEI to spill the FP as if it is any other callee-save register
+  // to take advantage the eliminateFrameIndex machinery. This also ensures it
+  // is spilled in the order specified by getCalleeSavedRegs() to make it easier
+  // to combine multiple loads / stores.
+  bool CanEliminateFrame = true;
+  bool CS1Spilled = false;
+  bool LRSpilled = false;
+  unsigned NumGPRSpills = 0;
+  SmallVector<unsigned, 4> UnspilledCS1GPRs;
+  SmallVector<unsigned, 4> UnspilledCS2GPRs;
+  const ARMBaseRegisterInfo *RegInfo =
+    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  // Spill R4 if Thumb2 function requires stack realignment - it will be used as
+  // scratch register. Also spill R4 if Thumb2 function has varsized objects,
+  // since it's not always possible to restore sp from fp in a single
+  // instruction.
+  // FIXME: It will be better just to find spare register here.
+  if (AFI->isThumb2Function() &&
+      (MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
+    MRI.setPhysRegUsed(ARM::R4);
+
+  if (AFI->isThumb1OnlyFunction()) {
+    // Spill LR if Thumb1 function uses variable length argument lists.
+    if (AFI->getVarArgsRegSaveSize() > 0)
+      MRI.setPhysRegUsed(ARM::LR);
+
+    // Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
+    // for sure what the stack size will be, but for this, an estimate is good
+    // enough. If there anything changes it, it'll be a spill, which implies
+    // we've used all the registers and so R4 is already used, so not marking
+    // it here will be OK.
+    // FIXME: It will be better just to find spare register here.
+    unsigned StackSize = MFI->estimateStackSize(MF);
+    if (MFI->hasVarSizedObjects() || StackSize > 508)
+      MRI.setPhysRegUsed(ARM::R4);
+  }
+
+  // See if we can spill vector registers to aligned stack.
+  checkNumAlignedDPRCS2Regs(MF);
+
+  // Spill the BasePtr if it's used.
+  if (RegInfo->hasBasePointer(MF))
+    MRI.setPhysRegUsed(RegInfo->getBaseRegister());
+
+  // Don't spill FP if the frame can be eliminated. This is determined
+  // by scanning the callee-save registers to see if any is used.
+  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    bool Spilled = false;
+    if (MRI.isPhysRegUsed(Reg)) {
+      Spilled = true;
+      CanEliminateFrame = false;
+    }
+
+    if (!ARM::GPRRegClass.contains(Reg))
+      continue;
+
+    if (Spilled) {
+      NumGPRSpills++;
+
+      if (!STI.isTargetIOS()) {
+        if (Reg == ARM::LR)
+          LRSpilled = true;
+        CS1Spilled = true;
+        continue;
+      }
+
+      // Keep track if LR and any of R4, R5, R6, and R7 is spilled.
+      switch (Reg) {
+      case ARM::LR:
+        LRSpilled = true;
+        // Fallthrough
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+        CS1Spilled = true;
+        break;
+      default:
+        break;
+      }
+    } else {
+      if (!STI.isTargetIOS()) {
+        UnspilledCS1GPRs.push_back(Reg);
+        continue;
+      }
+
+      switch (Reg) {
+      case ARM::R4: case ARM::R5:
+      case ARM::R6: case ARM::R7:
+      case ARM::LR:
+        UnspilledCS1GPRs.push_back(Reg);
+        break;
+      default:
+        UnspilledCS2GPRs.push_back(Reg);
+        break;
+      }
+    }
+  }
+
+  bool ForceLRSpill = false;
+  if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
+    unsigned FnSize = GetFunctionSizeInBytes(MF, TII);
+    // Force LR to be spilled if the Thumb function size is > 2048. This enables
+    // use of BL to implement far jump. If it turns out that it's not needed
+    // then the branch fix up path will undo it.
+    if (FnSize >= (1 << 11)) {
+      CanEliminateFrame = false;
+      ForceLRSpill = true;
+    }
+  }
+
+  // If any of the stack slot references may be out of range of an immediate
+  // offset, make sure a register (or a spill slot) is available for the
+  // register scavenger. Note that if we're indexing off the frame pointer, the
+  // effective stack size is 4 bytes larger since the FP points to the stack
+  // slot of the previous FP. Also, if we have variable sized objects in the
+  // function, stack slot references will often be negative, and some of
+  // our instructions are positive-offset only, so conservatively consider
+  // that case to want a spill slot (or register) as well. Similarly, if
+  // the function adjusts the stack pointer during execution and the
+  // adjustments aren't already part of our stack size estimate, our offset
+  // calculations may be off, so be conservative.
+  // FIXME: We could add logic to be more precise about negative offsets
+  //        and which instructions will need a scratch register for them. Is it
+  //        worth the effort and added fragility?
+  bool BigStack =
+    (RS &&
+     (MFI->estimateStackSize(MF) +
+      ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >=
+      estimateRSStackSizeLimit(MF, this)))
+    || MFI->hasVarSizedObjects()
+    || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF));
+
+  bool ExtraCSSpill = false;
+  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
+    AFI->setHasStackFrame(true);
+
+    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
+    // Spill LR as well so we can fold BX_RET to the registers restore (LDM).
+    if (!LRSpilled && CS1Spilled) {
+      MRI.setPhysRegUsed(ARM::LR);
+      NumGPRSpills++;
+      UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
+                                    UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+      ForceLRSpill = false;
+      ExtraCSSpill = true;
+    }
+
+    if (hasFP(MF)) {
+      MRI.setPhysRegUsed(FramePtr);
+      NumGPRSpills++;
+    }
+
+    // If stack and double are 8-byte aligned and we are spilling an odd number
+    // of GPRs, spill one extra callee save GPR so we won't have to pad between
+    // the integer and double callee save areas.
+    unsigned TargetAlign = getStackAlignment();
+    if (TargetAlign == 8 && (NumGPRSpills & 1)) {
+      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+          unsigned Reg = UnspilledCS1GPRs[i];
+          // Don't spill high register if the function is thumb1
+          if (!AFI->isThumb1OnlyFunction() ||
+              isARMLowRegister(Reg) || Reg == ARM::LR) {
+            MRI.setPhysRegUsed(Reg);
+            if (!MRI.isReserved(Reg))
+              ExtraCSSpill = true;
+            break;
+          }
+        }
+      } else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
+        unsigned Reg = UnspilledCS2GPRs.front();
+        MRI.setPhysRegUsed(Reg);
+        if (!MRI.isReserved(Reg))
+          ExtraCSSpill = true;
+      }
+    }
+
+    // Estimate if we might need to scavenge a register at some point in order
+    // to materialize a stack offset. If so, either spill one additional
+    // callee-saved register or reserve a special spill slot to facilitate
+    // register scavenging. Thumb1 needs a spill slot for stack pointer
+    // adjustments also, even when the frame itself is small.
+    if (BigStack && !ExtraCSSpill) {
+      // If any non-reserved CS register isn't spilled, just spill one or two
+      // extra. That should take care of it!
+      unsigned NumExtras = TargetAlign / 4;
+      SmallVector<unsigned, 2> Extras;
+      while (NumExtras && !UnspilledCS1GPRs.empty()) {
+        unsigned Reg = UnspilledCS1GPRs.back();
+        UnspilledCS1GPRs.pop_back();
+        if (!MRI.isReserved(Reg) &&
+            (!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
+             Reg == ARM::LR)) {
+          Extras.push_back(Reg);
+          NumExtras--;
+        }
+      }
+      // For non-Thumb1 functions, also check for hi-reg CS registers
+      if (!AFI->isThumb1OnlyFunction()) {
+        while (NumExtras && !UnspilledCS2GPRs.empty()) {
+          unsigned Reg = UnspilledCS2GPRs.back();
+          UnspilledCS2GPRs.pop_back();
+          if (!MRI.isReserved(Reg)) {
+            Extras.push_back(Reg);
+            NumExtras--;
+          }
+        }
+      }
+      if (Extras.size() && NumExtras == 0) {
+        for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+          MRI.setPhysRegUsed(Extras[i]);
+        }
+      } else if (!AFI->isThumb1OnlyFunction()) {
+        // note: Thumb1 functions spill to R12, not the stack.  Reserve a slot
+        // closest to SP or frame pointer.
+        const TargetRegisterClass *RC = &ARM::GPRRegClass;
+        RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                           RC->getAlignment(),
+                                                           false));
+      }
+    }
+  }
+
+  if (ForceLRSpill) {
+    MRI.setPhysRegUsed(ARM::LR);
+    AFI->setLRIsSpilledForFarJump(true);
+  }
+}
+
+
+void ARMFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr *Old = I;
+    DebugLoc dl = Old->getDebugLoc();
+    unsigned Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      assert(!AFI->isThumb1OnlyFunction() &&
+             "This eliminateCallFramePseudoInstr does not support Thumb1!");
+      bool isARM = !AFI->isThumbFunction();
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old->getOpcode();
+      int PIdx = Old->findFirstPredOperandIdx();
+      ARMCC::CondCodes Pred = (PIdx == -1)
+        ? ARMCC::AL : (ARMCC::CondCodes)Old->getOperand(PIdx).getImm();
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+        unsigned PredReg = Old->getOperand(2).getReg();
+        emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
+                     Pred, PredReg);
+      } else {
+        // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+        unsigned PredReg = Old->getOperand(3).getReg();
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags,
+                     Pred, PredReg);
+      }
+    }
+  }
+  MBB.erase(I);
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
new file mode 100644
index 000000000000..efa255a5574a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -0,0 +1,82 @@
+//==-- ARMTargetFrameLowering.h - Define frame lowering for ARM --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_FRAMEINFO_H
+#define ARM_FRAMEINFO_H
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class ARMFrameLowering : public TargetFrameLowering {
+protected:
+  const ARMSubtarget &STI;
+
+public:
+  explicit ARMFrameLowering(const ARMSubtarget &sti)
+    : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4),
+      STI(sti) {
+  }
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
+  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const;
+  int ResolveFrameIndexReference(const MachineFunction &MF,
+                                 int FI,
+                                 unsigned &FrameReg, int SPAdj) const;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const;
+
+ private:
+  void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc,
+                    unsigned StrOpc, bool NoGap,
+                    bool(*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs,
+                    unsigned MIFlags = 0) const;
+  void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                   const std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc,
+                   unsigned LdrOpc, bool isVarArg, bool NoGap,
+                   bool(*Func)(unsigned, bool),
+                   unsigned NumAlignedDPRCS2Regs) const;
+
+  virtual void eliminateCallFramePseudoInstr(
+                                    MachineFunction &MF,
+                                    MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
new file mode 100644
index 000000000000..1240169e84ed
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -0,0 +1,98 @@
+//===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMHazardRecognizer.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
+                         const TargetRegisterInfo &TRI) {
+  // FIXME: Detect integer instructions properly.
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+  if (MI->mayStore())
+    return false;
+  unsigned Opcode = MCID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
+  return false;
+}
+
+ScheduleHazardRecognizer::HazardType
+ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  assert(Stalls == 0 && "ARM hazards don't support scoreboard lookahead");
+
+  MachineInstr *MI = SU->getInstr();
+
+  if (!MI->isDebugValue()) {
+    // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following
+    // a VMLA / VMLS will cause 4 cycle stall.
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {
+      MachineInstr *DefMI = LastMI;
+      const MCInstrDesc &LastMCID = LastMI->getDesc();
+      // Skip over one non-VFP / NEON instruction.
+      if (!LastMI->isBarrier() &&
+          // On A9, AGU and NEON/FPU are muxed.
+          !(STI.isLikeA9() && (LastMI->mayLoad() || LastMI->mayStore())) &&
+          (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
+        MachineBasicBlock::iterator I = LastMI;
+        if (I != LastMI->getParent()->begin()) {
+          I = llvm::prior(I);
+          DefMI = &*I;
+        }
+      }
+
+      if (TII.isFpMLxInstruction(DefMI->getOpcode()) &&
+          (TII.canCauseFpMLxStall(MI->getOpcode()) ||
+           hasRAWHazard(DefMI, MI, TRI))) {
+        // Try to schedule another instruction for the next 4 cycles.
+        if (FpMLxStalls == 0)
+          FpMLxStalls = 4;
+        return Hazard;
+      }
+    }
+  }
+
+  return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
+}
+
+void ARMHazardRecognizer::Reset() {
+  LastMI = 0;
+  FpMLxStalls = 0;
+  ScoreboardHazardRecognizer::Reset();
+}
+
+void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+  if (!MI->isDebugValue()) {
+    LastMI = MI;
+    FpMLxStalls = 0;
+  }
+
+  ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+void ARMHazardRecognizer::AdvanceCycle() {
+  if (FpMLxStalls && --FpMLxStalls == 0)
+    // Stalled for 4 cycles but still can't schedule any other instructions.
+    LastMI = 0;
+  ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void ARMHazardRecognizer::RecedeCycle() {
+  llvm_unreachable("reverse ARM hazard checking unsupported");
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
new file mode 100644
index 000000000000..98bfc4cf0cc5
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMHazardRecognizer.h
@@ -0,0 +1,56 @@
+//===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling ARM functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMHAZARDRECOGNIZER_H
+#define ARMHAZARDRECOGNIZER_H
+
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+
+namespace llvm {
+
+class ARMBaseInstrInfo;
+class ARMBaseRegisterInfo;
+class ARMSubtarget;
+class MachineInstr;
+
+/// ARMHazardRecognizer handles special constraints that are not expressed in
+/// the scheduling itinerary. This is only used during postRA scheduling. The
+/// ARM preRA scheduler uses an unspecialized instance of the
+/// ScoreboardHazardRecognizer.
+class ARMHazardRecognizer : public ScoreboardHazardRecognizer {
+  const ARMBaseInstrInfo &TII;
+  const ARMBaseRegisterInfo &TRI;
+  const ARMSubtarget &STI;
+
+  MachineInstr *LastMI;
+  unsigned FpMLxStalls;
+
+public:
+  ARMHazardRecognizer(const InstrItineraryData *ItinData,
+                      const ARMBaseInstrInfo &tii,
+                      const ARMBaseRegisterInfo &tri,
+                      const ARMSubtarget &sti,
+                      const ScheduleDAG *DAG) :
+    ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), TII(tii),
+    TRI(tri), STI(sti), LastMI(0) {}
+
+  virtual HazardType getHazardType(SUnit *SU, int Stalls);
+  virtual void Reset();
+  virtual void EmitInstruction(SUnit *SU);
+  virtual void AdvanceCycle();
+  virtual void RecedeCycle();
+};
+
+} // end namespace llvm
+
+#endif // ARMHAZARDRECOGNIZER_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
new file mode 100644
index 000000000000..2c51de23f7dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -0,0 +1,3603 @@
+//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-isel"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableShifterOp("disable-shifter-op", cl::Hidden,
+  cl::desc("Disable isel of shifter-op"),
+  cl::init(false));
+
+static cl::opt<bool>
+CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
+  cl::desc("Check fp vmla / vmls hazard at isel time"),
+  cl::init(true));
+
+//===--------------------------------------------------------------------===//
+/// ARMDAGToDAGISel - ARM specific code to select ARM machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+
+enum AddrMode2Type {
+  AM2_BASE, // Simple AM2 (+-imm12)
+  AM2_SHOP  // Shifter-op AM2
+};
+
+class ARMDAGToDAGISel : public SelectionDAGISel {
+  ARMBaseTargetMachine &TM;
+  const ARMBaseInstrInfo *TII;
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const ARMSubtarget *Subtarget;
+
+public:
+  explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
+                           CodeGenOpt::Level OptLevel)
+    : SelectionDAGISel(tm, OptLevel), TM(tm),
+      TII(static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo())),
+      Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
+  }
+
+  virtual const char *getPassName() const {
+    return "ARM Instruction Selection";
+  }
+
+  virtual void PreprocessISelDAG();
+
+  /// getI32Imm - Return a target constant of type i32 with the specified
+  /// value.
+  inline SDValue getI32Imm(unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  }
+
+  SDNode *Select(SDNode *N);
+
+
+  bool hasNoVMLxHazardUse(SDNode *N) const;
+  bool isShifterOpProfitable(const SDValue &Shift,
+                             ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
+  bool SelectRegShifterOperand(SDValue N, SDValue &A,
+                               SDValue &B, SDValue &C,
+                               bool CheckProfitability = true);
+  bool SelectImmShifterOperand(SDValue N, SDValue &A,
+                               SDValue &B, bool CheckProfitability = true);
+  bool SelectShiftRegShifterOperand(SDValue N, SDValue &A,
+                                    SDValue &B, SDValue &C) {
+    // Don't apply the profitability check
+    return SelectRegShifterOperand(N, A, B, C, false);
+  }
+  bool SelectShiftImmShifterOperand(SDValue N, SDValue &A,
+                                    SDValue &B) {
+    // Don't apply the profitability check
+    return SelectImmShifterOperand(N, A, B, false);
+  }
+
+  bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+  bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc);
+
+  AddrMode2Type SelectAddrMode2Worker(SDValue N, SDValue &Base,
+                                      SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2Base(SDValue N, SDValue &Base, SDValue &Offset,
+                           SDValue &Opc) {
+    return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_BASE;
+  }
+
+  bool SelectAddrMode2ShOp(SDValue N, SDValue &Base, SDValue &Offset,
+                           SDValue &Opc) {
+    return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_SHOP;
+  }
+
+  bool SelectAddrMode2(SDValue N, SDValue &Base, SDValue &Offset,
+                       SDValue &Opc) {
+    SelectAddrMode2Worker(N, Base, Offset, Opc);
+//    return SelectAddrMode2ShOp(N, Base, Offset, Opc);
+    // This always matches one way or another.
+    return true;
+  }
+
+  bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrOffsetNone(SDValue N, SDValue &Base);
+  bool SelectAddrMode3(SDValue N, SDValue &Base,
+                       SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode5(SDValue N, SDValue &Base,
+                       SDValue &Offset);
+  bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
+  bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
+
+  bool SelectAddrModePC(SDValue N, SDValue &Offset, SDValue &Label);
+
+  // Thumb Addressing Modes:
+  bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRI(SDValue N, SDValue &Base, SDValue &Offset,
+                             unsigned Scale);
+  bool SelectThumbAddrModeRI5S1(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRI5S2(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeRI5S4(SDValue N, SDValue &Base, SDValue &Offset);
+  bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base,
+                                SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+                                 SDValue &OffImm);
+  bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm);
+
+  // Thumb 2 Addressing Modes:
+  bool SelectT2ShifterOperandReg(SDValue N,
+                                 SDValue &BaseReg, SDValue &Opc);
+  bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+  bool SelectT2AddrModeImm8(SDValue N, SDValue &Base,
+                            SDValue &OffImm);
+  bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+                                 SDValue &OffImm);
+  bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
+                             SDValue &OffReg, SDValue &ShImm);
+
+  inline bool is_so_imm(unsigned Imm) const {
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }
+
+  inline bool is_so_imm_not(unsigned Imm) const {
+    return ARM_AM::getSOImmVal(~Imm) != -1;
+  }
+
+  inline bool is_t2_so_imm(unsigned Imm) const {
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  }
+
+  inline bool is_t2_so_imm_not(unsigned Imm) const {
+    return ARM_AM::getT2SOImmVal(~Imm) != -1;
+  }
+
+  // Include the pieces autogenerated from the target description.
+#include "ARMGenDAGISel.inc"
+
+private:
+  /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for
+  /// ARM.
+  SDNode *SelectARMIndexedLoad(SDNode *N);
+  SDNode *SelectT2IndexedLoad(SDNode *N);
+
+  /// SelectVLD - Select NEON load intrinsics.  NumVecs should be
+  /// 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// loads of D registers and even subregs and odd subregs of Q registers.
+  /// For NumVecs <= 2, QOpcodes1 is not used.
+  SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+
+  /// SelectVST - Select NEON store intrinsics.  NumVecs should
+  /// be 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// stores of D registers and even subregs and odd subregs of Q registers.
+  /// For NumVecs <= 2, QOpcodes1 is not used.
+  SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *DOpcodes,
+                    const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+
+  /// SelectVLDSTLane - Select NEON load/store lane intrinsics.  NumVecs should
+  /// be 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// load/store of D registers and Q registers.
+  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad,
+                          bool isUpdating, unsigned NumVecs,
+                          const uint16_t *DOpcodes, const uint16_t *QOpcodes);
+
+  /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
+  /// should be 2, 3 or 4.  The opcode array specifies the instructions used
+  /// for loading D registers.  (Q registers are not supported.)
+  SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                       const uint16_t *Opcodes);
+
+  /// SelectVTBL - Select NEON VTBL and VTBX intrinsics.  NumVecs should be 2,
+  /// 3 or 4.  These are custom-selected so that a REG_SEQUENCE can be
+  /// generated to force the table registers to be consecutive.
+  SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
+
+  /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
+  SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+
+  /// SelectCMOVOp - Select CMOV instructions for ARM.
+  SDNode *SelectCMOVOp(SDNode *N);
+  SDNode *SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                              ARMCC::CondCodes CCVal, SDValue CCR,
+                              SDValue InFlag);
+  SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                               ARMCC::CondCodes CCVal, SDValue CCR,
+                               SDValue InFlag);
+  SDNode *SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                              ARMCC::CondCodes CCVal, SDValue CCR,
+                              SDValue InFlag);
+  SDNode *SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                               ARMCC::CondCodes CCVal, SDValue CCR,
+                               SDValue InFlag);
+
+  // Select special operations if node forms integer ABS pattern
+  SDNode *SelectABSOp(SDNode *N);
+
+  SDNode *SelectInlineAsm(SDNode *N);
+
+  SDNode *SelectConcatVector(SDNode *N);
+
+  SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                            char ConstraintCode,
+                                            std::vector<SDValue> &OutOps);
+
+  // Form pairs of consecutive R, S, D, or Q registers.
+  SDNode *createGPRPairNode(EVT VT, SDValue V0, SDValue V1);
+  SDNode *createSRegPairNode(EVT VT, SDValue V0, SDValue V1);
+  SDNode *createDRegPairNode(EVT VT, SDValue V0, SDValue V1);
+  SDNode *createQRegPairNode(EVT VT, SDValue V0, SDValue V1);
+
+  // Form sequences of 4 consecutive S, D, or Q registers.
+  SDNode *createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+  SDNode *createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+  SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
+
+  // Get the alignment operand for a NEON VLD or VST instruction.
+  SDValue GetVLDSTAlign(SDValue Align, unsigned NumVecs, bool is64BitVector);
+};
+}
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isInt32Immediate - This method tests to see if a constant operand.
+// If so Imm will receive the 32 bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc &&
+         isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+/// \brief Check whether a particular node is a constant value representable as
+/// (N * Scale) where (N in [\p RangeMin, \p RangeMax).
+///
+/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
+static bool isScaledConstantInRange(SDValue Node, int Scale,
+                                    int RangeMin, int RangeMax,
+                                    int &ScaledConstant) {
+  assert(Scale > 0 && "Invalid scale!");
+
+  // Check that this is a constant.
+  const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Node);
+  if (!C)
+    return false;
+
+  ScaledConstant = (int) C->getZExtValue();
+  if ((ScaledConstant % Scale) != 0)
+    return false;
+
+  ScaledConstant /= Scale;
+  return ScaledConstant >= RangeMin && ScaledConstant < RangeMax;
+}
+
+void ARMDAGToDAGISel::PreprocessISelDAG() {
+  if (!Subtarget->hasV6T2Ops())
+    return;
+
+  bool isThumb2 = Subtarget->isThumb();
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
+
+    if (N->getOpcode() != ISD::ADD)
+      continue;
+
+    // Look for (add X1, (and (srl X2, c1), c2)) where c2 is constant with
+    // leading zeros, followed by consecutive set bits, followed by 1 or 2
+    // trailing zeros, e.g. 1020.
+    // Transform the expression to
+    // (add X1, (shl (and (srl X2, c1), (c2>>tz)), tz)) where tz is the number
+    // of trailing zeros of c2. The left shift would be folded as an shifter
+    // operand of 'add' and the 'and' and 'srl' would become a bits extraction
+    // node (UBFX).
+
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    unsigned And_imm = 0;
+    if (!isOpcWithIntImmediate(N1.getNode(), ISD::AND, And_imm)) {
+      if (isOpcWithIntImmediate(N0.getNode(), ISD::AND, And_imm))
+        std::swap(N0, N1);
+    }
+    if (!And_imm)
+      continue;
+
+    // Check if the AND mask is an immediate of the form: 000.....1111111100
+    unsigned TZ = CountTrailingZeros_32(And_imm);
+    if (TZ != 1 && TZ != 2)
+      // Be conservative here. Shifter operands aren't always free. e.g. On
+      // Swift, left shifter operand of 1 / 2 for free but others are not.
+      // e.g.
+      //  ubfx   r3, r1, #16, #8
+      //  ldr.w  r3, [r0, r3, lsl #2]
+      // vs.
+      //  mov.w  r9, #1020
+      //  and.w  r2, r9, r1, lsr #14
+      //  ldr    r2, [r0, r2]
+      continue;
+    And_imm >>= TZ;
+    if (And_imm & (And_imm + 1))
+      continue;
+
+    // Look for (and (srl X, c1), c2).
+    SDValue Srl = N1.getOperand(0);
+    unsigned Srl_imm = 0;
+    if (!isOpcWithIntImmediate(Srl.getNode(), ISD::SRL, Srl_imm) ||
+        (Srl_imm <= 2))
+      continue;
+
+    // Make sure first operand is not a shifter operand which would prevent
+    // folding of the left shift.
+    SDValue CPTmp0;
+    SDValue CPTmp1;
+    SDValue CPTmp2;
+    if (isThumb2) {
+      if (SelectT2ShifterOperandReg(N0, CPTmp0, CPTmp1))
+        continue;
+    } else {
+      if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) ||
+          SelectRegShifterOperand(N0, CPTmp0, CPTmp1, CPTmp2))
+        continue;
+    }
+
+    // Now make the transformation.
+    Srl = CurDAG->getNode(ISD::SRL, Srl.getDebugLoc(), MVT::i32,
+                          Srl.getOperand(0),
+                          CurDAG->getConstant(Srl_imm+TZ, MVT::i32));
+    N1 = CurDAG->getNode(ISD::AND, N1.getDebugLoc(), MVT::i32,
+                         Srl, CurDAG->getConstant(And_imm, MVT::i32));
+    N1 = CurDAG->getNode(ISD::SHL, N1.getDebugLoc(), MVT::i32,
+                         N1, CurDAG->getConstant(TZ, MVT::i32));
+    CurDAG->UpdateNodeOperands(N, N0, N1);
+  }  
+}
+
+/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
+/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
+/// least on current ARM implementations) which should be avoidded.
+bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
+  if (OptLevel == CodeGenOpt::None)
+    return true;
+
+  if (!CheckVMLxHazard)
+    return true;
+
+  if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9() &&
+      !Subtarget->isSwift())
+    return true;
+
+  if (!N->hasOneUse())
+    return false;
+
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() == ISD::CopyToReg)
+    return true;
+  if (Use->isMachineOpcode()) {
+    const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode());
+    if (MCID.mayStore())
+      return true;
+    unsigned Opcode = MCID.getOpcode();
+    if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return true;
+    // vmlx feeding into another vmlx. We actually want to unfold
+    // the use later in the MLxExpansion pass. e.g.
+    // vmla
+    // vmla (stall 8 cycles)
+    //
+    // vmul (5 cycles)
+    // vadd (5 cycles)
+    // vmla
+    // This adds up to about 18 - 19 cycles.
+    //
+    // vmla
+    // vmul (stall 4 cycles)
+    // vadd adds up to about 14 cycles.
+    return TII->isFpMLxInstruction(Opcode);
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
+                                            ARM_AM::ShiftOpc ShOpcVal,
+                                            unsigned ShAmt) {
+  if (!Subtarget->isLikeA9() && !Subtarget->isSwift())
+    return true;
+  if (Shift.hasOneUse())
+    return true;
+  // R << 2 is free.
+  return ShOpcVal == ARM_AM::lsl &&
+         (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
+}
+
+bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &Opc,
+                                              bool CheckProfitability) {
+  if (DisableShifterOp)
+    return false;
+
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!RHS) return false;
+  ShImmVal = RHS->getZExtValue() & 31;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &ShReg,
+                                              SDValue &Opc,
+                                              bool CheckProfitability) {
+  if (DisableShifterOp)
+    return false;
+
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (RHS) return false;
+
+  ShReg = N.getOperand(1);
+  if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal))
+    return false;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  MVT::i32);
+  return true;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &OffImm) {
+  // Match simple R + imm12 operands.
+
+  // Base only.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ISD::FrameIndex) {
+      // Match frame index.
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+      return true;
+    }
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        !(Subtarget->useMovt() &&
+                     N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      Base = N.getOperand(0);
+    } else
+      Base = N;
+    OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
+      Base   = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+
+
+bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
+                                      SDValue &Opc) {
+  if (N.getOpcode() == ISD::MUL &&
+      ((!Subtarget->isLikeA9() && !Subtarget->isSwift()) || N.hasOneUse())) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      // X * [3,5,9] -> X + X * [2,4,8] etc.
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC & 1) {
+        RHSC = RHSC & ~1;
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        if (isPowerOf2_32(RHSC)) {
+          unsigned ShAmt = Log2_32(RHSC);
+          Base = Offset = N.getOperand(0);
+          Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+                                                            ARM_AM::lsl),
+                                          MVT::i32);
+          return true;
+        }
+      }
+    }
+  }
+
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      // ISD::OR that is equivalent to an ISD::ADD.
+      !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Leave simple R +/- imm12 operands for LDRi12
+  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) {
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                                -0x1000+1, 0x1000, RHSC)) // 12 bits.
+      return false;
+  }
+
+  // Otherwise this is R +/- [possibly shifted] R.
+  ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add;
+  ARM_AM::ShiftOpc ShOpcVal =
+    ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
+  unsigned ShAmt = 0;
+
+  Base   = N.getOperand(0);
+  Offset = N.getOperand(1);
+
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh =
+           dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+        Offset = N.getOperand(1).getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  // Try matching (R shl C) + (R).
+  if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+      !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+        N.getOperand(0).hasOneUse())) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
+    if (ShOpcVal != ARM_AM::no_shift) {
+      // Check to see if the RHS of the shift is a constant, if not, we can't
+      // fold it.
+      if (ConstantSDNode *Sh =
+          dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+        ShAmt = Sh->getZExtValue();
+        if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
+          Offset = N.getOperand(0).getOperand(0);
+          Base = N.getOperand(1);
+        } else {
+          ShAmt = 0;
+          ShOpcVal = ARM_AM::no_shift;
+        }
+      } else {
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return true;
+}
+
+
+//-----
+
+AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
+                                                     SDValue &Base,
+                                                     SDValue &Offset,
+                                                     SDValue &Opc) {
+  if (N.getOpcode() == ISD::MUL &&
+      (!(Subtarget->isLikeA9() || Subtarget->isSwift()) || N.hasOneUse())) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      // X * [3,5,9] -> X + X * [2,4,8] etc.
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC & 1) {
+        RHSC = RHSC & ~1;
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        if (isPowerOf2_32(RHSC)) {
+          unsigned ShAmt = Log2_32(RHSC);
+          Base = Offset = N.getOperand(0);
+          Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+                                                            ARM_AM::lsl),
+                                          MVT::i32);
+          return AM2_SHOP;
+        }
+      }
+    }
+  }
+
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      // ISD::OR that is equivalent to an ADD.
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               !(Subtarget->useMovt() &&
+                 N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return AM2_BASE;
+  }
+
+  // Match simple R +/- imm12 operands.
+  if (N.getOpcode() != ISD::SUB) {
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                                -0x1000+1, 0x1000, RHSC)) { // 12 bits.
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      Offset = CurDAG->getRegister(0, MVT::i32);
+
+      ARM_AM::AddrOpc AddSub = ARM_AM::add;
+      if (RHSC < 0) {
+        AddSub = ARM_AM::sub;
+        RHSC = - RHSC;
+      }
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC,
+                                                        ARM_AM::no_shift),
+                                      MVT::i32);
+      return AM2_BASE;
+    }
+  }
+
+  if ((Subtarget->isLikeA9() || Subtarget->isSwift()) && !N.hasOneUse()) {
+    // Compute R +/- (R << N) and reuse it.
+    Base = N;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return AM2_BASE;
+  }
+
+  // Otherwise this is R +/- [possibly shifted] R.
+  ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub;
+  ARM_AM::ShiftOpc ShOpcVal =
+    ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
+  unsigned ShAmt = 0;
+
+  Base   = N.getOperand(0);
+  Offset = N.getOperand(1);
+
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh =
+           dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(Offset, ShOpcVal, ShAmt))
+        Offset = N.getOperand(1).getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  // Try matching (R shl C) + (R).
+  if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
+      !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+        N.getOperand(0).hasOneUse())) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
+    if (ShOpcVal != ARM_AM::no_shift) {
+      // Check to see if the RHS of the shift is a constant, if not, we can't
+      // fold it.
+      if (ConstantSDNode *Sh =
+          dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+        ShAmt = Sh->getZExtValue();
+        if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
+          Offset = N.getOperand(0).getOperand(0);
+          Base = N.getOperand(1);
+        } else {
+          ShAmt = 0;
+          ShOpcVal = ARM_AM::no_shift;
+        }
+      } else {
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return AM2_SHOP;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val))
+    return false;
+
+  Offset = N;
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+  unsigned ShAmt = 0;
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (isShifterOpProfitable(N, ShOpcVal, ShAmt))
+        Offset = N.getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    if (AddSub == ARM_AM::sub) Val *= -1;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(Val, MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) {
+  Base = N;
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
+                                      SDValue &Base, SDValue &Offset,
+                                      SDValue &Opc) {
+  if (N.getOpcode() == ISD::SUB) {
+    // X - C  is canonicalize to X + -C, no need to handle it here.
+    Base = N.getOperand(0);
+    Offset = N.getOperand(1);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32);
+    return true;
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/1,
+                              -256 + 1, 256, RHSC)) { // 8 bits.
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
+    }
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32);
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  int Val;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 256, Val)) { // 12 bits.
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32);
+    return true;
+  }
+
+  Offset = N;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
+                                      SDValue &Base, SDValue &Offset) {
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               !(Subtarget->useMovt() &&
+                 N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                       MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
+                              -256 + 1, 256, RHSC)) {
+    Base = N.getOperand(0);
+    if (Base.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    }
+
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (RHSC < 0) {
+      AddSub = ARM_AM::sub;
+      RHSC = -RHSC;
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+                                       MVT::i32);
+    return true;
+  }
+
+  Base = N;
+  Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                     MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
+                                      SDValue &Align) {
+  Addr = N;
+
+  unsigned Alignment = 0;
+  if (LSBaseSDNode *LSN = dyn_cast<LSBaseSDNode>(Parent)) {
+    // This case occurs only for VLD1-lane/dup and VST1-lane instructions.
+    // The maximum alignment is equal to the memory size being referenced.
+    unsigned LSNAlign = LSN->getAlignment();
+    unsigned MemSize = LSN->getMemoryVT().getSizeInBits() / 8;
+    if (LSNAlign >= MemSize && MemSize > 1)
+      Alignment = MemSize;
+  } else {
+    // All other uses of addrmode6 are for intrinsics.  For now just record
+    // the raw alignment value; it will be refined later based on the legal
+    // alignment operands for the intrinsic.
+    Alignment = cast<MemIntrinsicSDNode>(Parent)->getAlignment();
+  }
+
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrMode6Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset) {
+  LSBaseSDNode *LdSt = cast<LSBaseSDNode>(Op);
+  ISD::MemIndexedMode AM = LdSt->getAddressingMode();
+  if (AM != ISD::POST_INC)
+    return false;
+  Offset = N;
+  if (ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N)) {
+    if (NC->getZExtValue() * 8 == LdSt->getMemoryVT().getSizeInBits())
+      Offset = CurDAG->getRegister(0, MVT::i32);
+  }
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,
+                                       SDValue &Offset, SDValue &Label) {
+  if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
+    Offset = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
+                                      MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Thumb Addressing Modes
+//===----------------------------------------------------------------------===//
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N,
+                                            SDValue &Base, SDValue &Offset){
+  if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N)) {
+    ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
+    if (!NC || !NC->isNullValue())
+      return false;
+
+    Base = Offset = N;
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI(SDValue N, SDValue &Base,
+                                       SDValue &Offset, unsigned Scale) {
+  if (Scale == 4) {
+    SDValue TmpBase, TmpOffImm;
+    if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm))
+      return false;  // We want to select tLDRspi / tSTRspi instead.
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+      return false;  // We want to select tLDRpci instead.
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Thumb does not have [sp, r] address mode.
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
+  if ((LHSR && LHSR->getReg() == ARM::SP) ||
+      (RHSR && RHSR->getReg() == ARM::SP))
+    return false;
+
+  // FIXME: Why do we explicitly check for a match here and then return false?
+  // Presumably to allow something else to match, but shouldn't this be
+  // documented?
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC))
+    return false;
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5S1(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  return SelectThumbAddrModeRI(N, Base, Offset, 1);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5S2(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  return SelectThumbAddrModeRI(N, Base, Offset, 2);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5S4(SDValue N,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  return SelectThumbAddrModeRI(N, Base, Offset, 4);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
+                                          SDValue &Base, SDValue &OffImm) {
+  if (Scale == 4) {
+    SDValue TmpBase, TmpOffImm;
+    if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm))
+      return false;  // We want to select tLDRspi / tSTRspi instead.
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+      return false;  // We want to select tLDRpci instead.
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        !(Subtarget->useMovt() &&
+          N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      Base = N.getOperand(0);
+    } else {
+      Base = N;
+    }
+
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
+  if ((LHSR && LHSR->getReg() == ARM::SP) ||
+      (RHSR && RHSR->getReg() == ARM::SP)) {
+    ConstantSDNode *LHS = dyn_cast<ConstantSDNode>(N.getOperand(0));
+    ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    unsigned LHSC = LHS ? LHS->getZExtValue() : 0;
+    unsigned RHSC = RHS ? RHS->getZExtValue() : 0;
+
+    // Thumb does not have [sp, #imm5] address mode for non-zero imm5.
+    if (LHSC != 0 || RHSC != 0) return false;
+
+    Base = N;
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  // If the RHS is + imm5 * scale, fold into addr mode.
+  int RHSC;
+  if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) {
+    Base = N.getOperand(0);
+    OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S4(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 4, Base, OffImm);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S2(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 2, Base, OffImm);
+}
+
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base,
+                                           SDValue &OffImm) {
+  return SelectThumbAddrModeImm5S(N, 1, Base, OffImm);
+}
+
+bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
+                                            SDValue &Base, SDValue &OffImm) {
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
+      (LHSR && LHSR->getReg() == ARM::SP)) {
+    // If the RHS is + imm8 * scale, fold into addr mode.
+    int RHSC;
+    if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4, 0, 256, RHSC)) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                        Thumb 2 Addressing Modes
+//===----------------------------------------------------------------------===//
+
+
+bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg,
+                                                SDValue &Opc) {
+  if (DisableShifterOp)
+    return false;
+
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    ShImmVal = RHS->getZExtValue() & 31;
+    Opc = getI32Imm(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal));
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
+                                            SDValue &Base, SDValue &OffImm) {
+  // Match simple R + imm12 operands.
+
+  // Base only.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N)) {
+    if (N.getOpcode() == ISD::FrameIndex) {
+      // Match frame index.
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+      return true;
+    }
+
+    if (N.getOpcode() == ARMISD::Wrapper &&
+               !(Subtarget->useMovt() &&
+                 N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::TargetConstantPool)
+        return false;  // We want to select t2LDRpci instead.
+    } else
+      Base = N;
+    OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    if (SelectT2AddrModeImm8(N, Base, OffImm))
+      // Let t2LDRi8 handle (R - imm8).
+      return false;
+
+    int RHSC = (int)RHS->getZExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
+      Base   = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
+                                           SDValue &Base, SDValue &OffImm) {
+  // Match simple R - imm8 operands.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB &&
+      !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getSExtValue();
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative)
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+                                                 SDValue &OffImm){
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  int RHSC;
+  if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x100, RHSC)) { // 8 bits.
+    OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
+      ? CurDAG->getTargetConstant(RHSC, MVT::i32)
+      : CurDAG->getTargetConstant(-RHSC, MVT::i32);
+    return true;
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
+                                            SDValue &Base,
+                                            SDValue &OffReg, SDValue &ShImm) {
+  // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12.
+  if (N.getOpcode() != ISD::ADD && !CurDAG->isBaseWithConstantOffset(N))
+    return false;
+
+  // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned)
+      return false;
+    else if (RHSC < 0 && RHSC >= -255) // 8 bits
+      return false;
+  }
+
+  // Look for (R + R) or (R + (R << [1,2,3])).
+  unsigned ShAmt = 0;
+  Base   = N.getOperand(0);
+  OffReg = N.getOperand(1);
+
+  // Swap if it is ((R << c) + R).
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg.getOpcode());
+  if (ShOpcVal != ARM_AM::lsl) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(Base.getOpcode());
+    if (ShOpcVal == ARM_AM::lsl)
+      std::swap(Base, OffReg);
+  }
+
+  if (ShOpcVal == ARM_AM::lsl) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (ShAmt < 4 && isShifterOpProfitable(OffReg, ShOpcVal, ShAmt))
+        OffReg = OffReg.getOperand(0);
+      else {
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  ShImm = CurDAG->getTargetConstant(ShAmt, MVT::i32);
+
+  return true;
+}
+
+//===--------------------------------------------------------------------===//
+
+/// getAL - Returns a ARMCC::AL immediate node.
+static inline SDValue getAL(SelectionDAG *CurDAG) {
+  return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, MVT::i32);
+}
+
+SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM == ISD::UNINDEXED)
+    return NULL;
+
+  EVT LoadedVT = LD->getMemoryVT();
+  SDValue Offset, AMOpc;
+  bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+  unsigned Opcode = 0;
+  bool Match = false;
+  if (LoadedVT == MVT::i32 && isPre &&
+      SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = ARM::LDR_PRE_IMM;
+    Match = true;
+  } else if (LoadedVT == MVT::i32 && !isPre &&
+      SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = ARM::LDR_POST_IMM;
+    Match = true;
+  } else if (LoadedVT == MVT::i32 &&
+      SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = isPre ? ARM::LDR_PRE_REG : ARM::LDR_POST_REG;
+    Match = true;
+
+  } else if (LoadedVT == MVT::i16 &&
+             SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+    Match = true;
+    Opcode = (LD->getExtensionType() == ISD::SEXTLOAD)
+      ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST)
+      : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST);
+  } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) {
+    if (LD->getExtensionType() == ISD::SEXTLOAD) {
+      if (SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST;
+      }
+    } else {
+      if (isPre &&
+          SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = ARM::LDRB_PRE_IMM;
+      } else if (!isPre &&
+                  SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = ARM::LDRB_POST_IMM;
+      } else if (SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = isPre ? ARM::LDRB_PRE_REG : ARM::LDRB_POST_REG;
+      }
+    }
+  }
+
+  if (Match) {
+    if (Opcode == ARM::LDR_PRE_IMM || Opcode == ARM::LDRB_PRE_IMM) {
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[]= { Base, AMOpc, getAL(CurDAG),
+                       CurDAG->getRegister(0, MVT::i32), Chain };
+      return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+                                    MVT::i32, MVT::Other, Ops, 5);
+    } else {
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
+                       CurDAG->getRegister(0, MVT::i32), Chain };
+      return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+                                    MVT::i32, MVT::Other, Ops, 6);
+    }
+  }
+
+  return NULL;
+}
+
+SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM == ISD::UNINDEXED)
+    return NULL;
+
+  EVT LoadedVT = LD->getMemoryVT();
+  bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+  SDValue Offset;
+  bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+  unsigned Opcode = 0;
+  bool Match = false;
+  if (SelectT2AddrModeImm8Offset(N, LD->getOffset(), Offset)) {
+    switch (LoadedVT.getSimpleVT().SimpleTy) {
+    case MVT::i32:
+      Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST;
+      break;
+    case MVT::i16:
+      if (isSExtLd)
+        Opcode = isPre ? ARM::t2LDRSH_PRE : ARM::t2LDRSH_POST;
+      else
+        Opcode = isPre ? ARM::t2LDRH_PRE : ARM::t2LDRH_POST;
+      break;
+    case MVT::i8:
+    case MVT::i1:
+      if (isSExtLd)
+        Opcode = isPre ? ARM::t2LDRSB_PRE : ARM::t2LDRSB_POST;
+      else
+        Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
+      break;
+    default:
+      return NULL;
+    }
+    Match = true;
+  }
+
+  if (Match) {
+    SDValue Chain = LD->getChain();
+    SDValue Base = LD->getBasePtr();
+    SDValue Ops[]= { Base, Offset, getAL(CurDAG),
+                     CurDAG->getRegister(0, MVT::i32), Chain };
+    return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32,
+                                  MVT::Other, Ops, 5);
+  }
+
+  return NULL;
+}
+
+/// \brief Form a GPRPair pseudo register from a pair of GPR regs.
+SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
+  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::GPRPairRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::gsub_1, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5);
+}
+
+/// \brief Form a D register from a pair of S registers.
+SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5);
+}
+
+/// \brief Form a quad register from a pair of D registers.
+SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5);
+}
+
+/// \brief Form 4 consecutive D registers from a pair of Q registers.
+SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
+  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 5);
+}
+
+/// \brief Form 4 consecutive S registers.
+SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
+                                   SDValue V2, SDValue V3) {
+  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass =
+    CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::ssub_1, MVT::i32);
+  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::ssub_2, MVT::i32);
+  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::ssub_3, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9);
+}
+
+/// \brief Form 4 consecutive D registers.
+SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
+                                   SDValue V2, SDValue V3) {
+  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
+  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::dsub_2, MVT::i32);
+  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::dsub_3, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9);
+}
+
+/// \brief Form 4 consecutive Q registers.
+SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
+                                   SDValue V2, SDValue V3) {
+  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, MVT::i32);
+  SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
+  SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
+  SDValue SubReg2 = CurDAG->getTargetConstant(ARM::qsub_2, MVT::i32);
+  SDValue SubReg3 = CurDAG->getTargetConstant(ARM::qsub_3, MVT::i32);
+  const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1,
+                                    V2, SubReg2, V3, SubReg3 };
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VT, Ops, 9);
+}
+
+/// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand
+/// of a NEON VLD or VST instruction.  The supported values depend on the
+/// number of registers being loaded.
+SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs,
+                                       bool is64BitVector) {
+  unsigned NumRegs = NumVecs;
+  if (!is64BitVector && NumVecs < 3)
+    NumRegs *= 2;
+
+  unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+  if (Alignment >= 32 && NumRegs == 4)
+    Alignment = 32;
+  else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4))
+    Alignment = 16;
+  else if (Alignment >= 8)
+    Alignment = 8;
+  else
+    Alignment = 0;
+
+  return CurDAG->getTargetConstant(Alignment, MVT::i32);
+}
+
+// Get the register stride update opcode of a VLD/VST instruction that
+// is otherwise equivalent to the given fixed stride updating instruction.
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
+  switch (Opc) {
+  default: break;
+  case ARM::VLD1d8wb_fixed: return ARM::VLD1d8wb_register;
+  case ARM::VLD1d16wb_fixed: return ARM::VLD1d16wb_register;
+  case ARM::VLD1d32wb_fixed: return ARM::VLD1d32wb_register;
+  case ARM::VLD1d64wb_fixed: return ARM::VLD1d64wb_register;
+  case ARM::VLD1q8wb_fixed: return ARM::VLD1q8wb_register;
+  case ARM::VLD1q16wb_fixed: return ARM::VLD1q16wb_register;
+  case ARM::VLD1q32wb_fixed: return ARM::VLD1q32wb_register;
+  case ARM::VLD1q64wb_fixed: return ARM::VLD1q64wb_register;
+
+  case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register;
+  case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register;
+  case ARM::VST1d32wb_fixed: return ARM::VST1d32wb_register;
+  case ARM::VST1d64wb_fixed: return ARM::VST1d64wb_register;
+  case ARM::VST1q8wb_fixed: return ARM::VST1q8wb_register;
+  case ARM::VST1q16wb_fixed: return ARM::VST1q16wb_register;
+  case ARM::VST1q32wb_fixed: return ARM::VST1q32wb_register;
+  case ARM::VST1q64wb_fixed: return ARM::VST1q64wb_register;
+  case ARM::VST1d64TPseudoWB_fixed: return ARM::VST1d64TPseudoWB_register;
+  case ARM::VST1d64QPseudoWB_fixed: return ARM::VST1d64QPseudoWB_register;
+
+  case ARM::VLD2d8wb_fixed: return ARM::VLD2d8wb_register;
+  case ARM::VLD2d16wb_fixed: return ARM::VLD2d16wb_register;
+  case ARM::VLD2d32wb_fixed: return ARM::VLD2d32wb_register;
+  case ARM::VLD2q8PseudoWB_fixed: return ARM::VLD2q8PseudoWB_register;
+  case ARM::VLD2q16PseudoWB_fixed: return ARM::VLD2q16PseudoWB_register;
+  case ARM::VLD2q32PseudoWB_fixed: return ARM::VLD2q32PseudoWB_register;
+
+  case ARM::VST2d8wb_fixed: return ARM::VST2d8wb_register;
+  case ARM::VST2d16wb_fixed: return ARM::VST2d16wb_register;
+  case ARM::VST2d32wb_fixed: return ARM::VST2d32wb_register;
+  case ARM::VST2q8PseudoWB_fixed: return ARM::VST2q8PseudoWB_register;
+  case ARM::VST2q16PseudoWB_fixed: return ARM::VST2q16PseudoWB_register;
+  case ARM::VST2q32PseudoWB_fixed: return ARM::VST2q32PseudoWB_register;
+
+  case ARM::VLD2DUPd8wb_fixed: return ARM::VLD2DUPd8wb_register;
+  case ARM::VLD2DUPd16wb_fixed: return ARM::VLD2DUPd16wb_register;
+  case ARM::VLD2DUPd32wb_fixed: return ARM::VLD2DUPd32wb_register;
+  }
+  return Opc; // If not one we handle, return it unchanged.
+}
+
+SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                   const uint16_t *DOpcodes,
+                                   const uint16_t *QOpcodes0,
+                                   const uint16_t *QOpcodes1) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue MemAddr, Align;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+    return NULL;
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  bool is64BitVector = VT.is64BitVector();
+  Align = GetVLDSTAlign(Align, NumVecs, is64BitVector);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v1i64: OpcodeIndex = 3; break;
+    // Quad-register operations:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8i16: OpcodeIndex = 1; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v2i64: OpcodeIndex = 3;
+    assert(NumVecs == 1 && "v2i64 type only supported for VLD1");
+    break;
+  }
+
+  EVT ResTy;
+  if (NumVecs == 1)
+    ResTy = VT;
+  else {
+    unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+    if (!is64BitVector)
+      ResTyElts *= 2;
+    ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
+  }
+  std::vector<EVT> ResTys;
+  ResTys.push_back(ResTy);
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SDNode *VLd;
+  SmallVector<SDValue, 7> Ops;
+
+  // Double registers and VLD1/VLD2 quad registers are directly supported.
+  if (is64BitVector || NumVecs <= 2) {
+    unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                    QOpcodes0[OpcodeIndex]);
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
+      // case entirely when the rest are updated to that form, too.
+      if ((NumVecs == 1 || NumVecs == 2) && !isa<ConstantSDNode>(Inc.getNode()))
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+      // check for that explicitly too. Horribly hacky, but temporary.
+      if ((NumVecs != 1 && NumVecs != 2 && Opc != ARM::VLD1q64wb_fixed) ||
+          !isa<ConstantSDNode>(Inc.getNode()))
+        Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+    }
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
+
+  } else {
+    // Otherwise, quad registers are loaded with two separate instructions,
+    // where one loads the even registers and the other loads the odd registers.
+    EVT AddrTy = MemAddr.getValueType();
+
+    // Load the even subregs.  This is always an updating load, so that it
+    // provides the address to the second load for the odd subregs.
+    SDValue ImplDef =
+      SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
+    const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+                                          ResTy, AddrTy, MVT::Other, OpsA, 7);
+    Chain = SDValue(VLdA, 2);
+
+    // Load the odd subregs.
+    Ops.push_back(SDValue(VLdA, 1));
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      assert(isa<ConstantSDNode>(Inc.getNode()) &&
+             "only constant post-increment update allowed for VLD3/4");
+      (void)Inc;
+      Ops.push_back(Reg0);
+    }
+    Ops.push_back(SDValue(VLdA, 0));
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+                                 Ops.data(), Ops.size());
+  }
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+
+  if (NumVecs == 1)
+    return VLd;
+
+  // Extract out the subregisters.
+  SDValue SuperReg = SDValue(VLd, 0);
+  assert(ARM::dsub_7 == ARM::dsub_0+7 &&
+         ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+  unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
+  return NULL;
+}
+
+SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                   const uint16_t *DOpcodes,
+                                   const uint16_t *QOpcodes0,
+                                   const uint16_t *QOpcodes1) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue MemAddr, Align;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+    return NULL;
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+  Align = GetVLDSTAlign(Align, NumVecs, is64BitVector);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vst type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v1i64: OpcodeIndex = 3; break;
+    // Quad-register operations:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8i16: OpcodeIndex = 1; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  case MVT::v2i64: OpcodeIndex = 3;
+    assert(NumVecs == 1 && "v2i64 type only supported for VST1");
+    break;
+  }
+
+  std::vector<EVT> ResTys;
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SmallVector<SDValue, 7> Ops;
+
+  // Double registers and VST1/VST2 quad registers are directly supported.
+  if (is64BitVector || NumVecs <= 2) {
+    SDValue SrcReg;
+    if (NumVecs == 1) {
+      SrcReg = N->getOperand(Vec0Idx);
+    } else if (is64BitVector) {
+      // Form a REG_SEQUENCE to force register allocation.
+      SDValue V0 = N->getOperand(Vec0Idx + 0);
+      SDValue V1 = N->getOperand(Vec0Idx + 1);
+      if (NumVecs == 2)
+        SrcReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0);
+      else {
+        SDValue V2 = N->getOperand(Vec0Idx + 2);
+        // If it's a vst3, form a quad D-register and leave the last part as
+        // an undef.
+        SDValue V3 = (NumVecs == 3)
+          ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
+          : N->getOperand(Vec0Idx + 3);
+        SrcReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+      }
+    } else {
+      // Form a QQ register.
+      SDValue Q0 = N->getOperand(Vec0Idx);
+      SDValue Q1 = N->getOperand(Vec0Idx + 1);
+      SrcReg = SDValue(createQRegPairNode(MVT::v4i64, Q0, Q1), 0);
+    }
+
+    unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                    QOpcodes0[OpcodeIndex]);
+    Ops.push_back(MemAddr);
+    Ops.push_back(Align);
+    if (isUpdating) {
+      SDValue Inc = N->getOperand(AddrOpIdx + 1);
+      // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
+      // case entirely when the rest are updated to that form, too.
+      if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      // We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
+      // check for that explicitly too. Horribly hacky, but temporary.
+      if ((NumVecs > 2 && Opc != ARM::VST1q64wb_fixed) ||
+          !isa<ConstantSDNode>(Inc.getNode()))
+        Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+    }
+    Ops.push_back(SrcReg);
+    Ops.push_back(Pred);
+    Ops.push_back(Reg0);
+    Ops.push_back(Chain);
+    SDNode *VSt =
+      CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
+
+    // Transfer memoperands.
+    cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+    return VSt;
+  }
+
+  // Otherwise, quad registers are stored with two separate instructions,
+  // where one stores the even registers and the other stores the odd registers.
+
+  // Form the QQQQ REG_SEQUENCE.
+  SDValue V0 = N->getOperand(Vec0Idx + 0);
+  SDValue V1 = N->getOperand(Vec0Idx + 1);
+  SDValue V2 = N->getOperand(Vec0Idx + 2);
+  SDValue V3 = (NumVecs == 3)
+    ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+    : N->getOperand(Vec0Idx + 3);
+  SDValue RegSeq = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0);
+
+  // Store the even D registers.  This is always an updating store, so that it
+  // provides the address to the second store for the odd subregs.
+  const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain };
+  SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
+                                        MemAddr.getValueType(),
+                                        MVT::Other, OpsA, 7);
+  cast<MachineSDNode>(VStA)->setMemRefs(MemOp, MemOp + 1);
+  Chain = SDValue(VStA, 1);
+
+  // Store the odd D registers.
+  Ops.push_back(SDValue(VStA, 0));
+  Ops.push_back(Align);
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    assert(isa<ConstantSDNode>(Inc.getNode()) &&
+           "only constant post-increment update allowed for VST3/4");
+    (void)Inc;
+    Ops.push_back(Reg0);
+  }
+  Ops.push_back(RegSeq);
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+  SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+                                        Ops.data(), Ops.size());
+  cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1);
+  return VStB;
+}
+
+SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
+                                         bool isUpdating, unsigned NumVecs,
+                                         const uint16_t *DOpcodes,
+                                         const uint16_t *QOpcodes) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue MemAddr, Align;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+  if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
+    return NULL;
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  SDValue Chain = N->getOperand(0);
+  unsigned Lane =
+    cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+
+  unsigned Alignment = 0;
+  if (NumVecs != 3) {
+    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+    unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8;
+    if (Alignment > NumBytes)
+      Alignment = NumBytes;
+    if (Alignment < 8 && Alignment < NumBytes)
+      Alignment = 0;
+    // Alignment must be a power of two; make sure of that.
+    Alignment = (Alignment & -Alignment);
+    if (Alignment == 1)
+      Alignment = 0;
+  }
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld/vst lane type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+    // Quad-register operations:
+  case MVT::v8i16: OpcodeIndex = 0; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 1; break;
+  }
+
+  std::vector<EVT> ResTys;
+  if (IsLoad) {
+    unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+    if (!is64BitVector)
+      ResTyElts *= 2;
+    ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(),
+                                      MVT::i64, ResTyElts));
+  }
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(MemAddr);
+  Ops.push_back(Align);
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+  }
+
+  SDValue SuperReg;
+  SDValue V0 = N->getOperand(Vec0Idx + 0);
+  SDValue V1 = N->getOperand(Vec0Idx + 1);
+  if (NumVecs == 2) {
+    if (is64BitVector)
+      SuperReg = SDValue(createDRegPairNode(MVT::v2i64, V0, V1), 0);
+    else
+      SuperReg = SDValue(createQRegPairNode(MVT::v4i64, V0, V1), 0);
+  } else {
+    SDValue V2 = N->getOperand(Vec0Idx + 2);
+    SDValue V3 = (NumVecs == 3)
+      ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+      : N->getOperand(Vec0Idx + 3);
+    if (is64BitVector)
+      SuperReg = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+    else
+      SuperReg = SDValue(createQuadQRegsNode(MVT::v8i64, V0, V1, V2, V3), 0);
+  }
+  Ops.push_back(SuperReg);
+  Ops.push_back(getI32Imm(Lane));
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+
+  unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+                                  QOpcodes[OpcodeIndex]);
+  SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys,
+                                         Ops.data(), Ops.size());
+  cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
+  if (!IsLoad)
+    return VLdLn;
+
+  // Extract the subregisters.
+  SuperReg = SDValue(VLdLn, 0);
+  assert(ARM::dsub_7 == ARM::dsub_0+7 &&
+         ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+  unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
+  return NULL;
+}
+
+SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
+                                      unsigned NumVecs,
+                                      const uint16_t *Opcodes) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue MemAddr, Align;
+  if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
+    return NULL;
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  unsigned Alignment = 0;
+  if (NumVecs != 3) {
+    Alignment = cast<ConstantSDNode>(Align)->getZExtValue();
+    unsigned NumBytes = NumVecs * VT.getVectorElementType().getSizeInBits()/8;
+    if (Alignment > NumBytes)
+      Alignment = NumBytes;
+    if (Alignment < 8 && Alignment < NumBytes)
+      Alignment = 0;
+    // Alignment must be a power of two; make sure of that.
+    Alignment = (Alignment & -Alignment);
+    if (Alignment == 1)
+      Alignment = 0;
+  }
+  Align = CurDAG->getTargetConstant(Alignment, MVT::i32);
+
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld-dup type");
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  }
+
+  SDValue Pred = getAL(CurDAG);
+  SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+  SDValue SuperReg;
+  unsigned Opc = Opcodes[OpcodeIndex];
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(MemAddr);
+  Ops.push_back(Align);
+  if (isUpdating) {
+    // fixed-stride update instructions don't have an explicit writeback
+    // operand. It's implicit in the opcode itself.
+    SDValue Inc = N->getOperand(2);
+    if (!isa<ConstantSDNode>(Inc.getNode()))
+      Ops.push_back(Inc);
+    // FIXME: VLD3 and VLD4 haven't been updated to that form yet.
+    else if (NumVecs > 2)
+      Ops.push_back(Reg0);
+  }
+  Ops.push_back(Pred);
+  Ops.push_back(Reg0);
+  Ops.push_back(Chain);
+
+  unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+  std::vector<EVT> ResTys;
+  ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,ResTyElts));
+  if (isUpdating)
+    ResTys.push_back(MVT::i32);
+  ResTys.push_back(MVT::Other);
+  SDNode *VLdDup =
+    CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
+  cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+  SuperReg = SDValue(VLdDup, 0);
+
+  // Extract the subregisters.
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  unsigned SubIdx = ARM::dsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
+  return NULL;
+}
+
+SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
+                                    unsigned Opc) {
+  assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  unsigned FirstTblReg = IsExt ? 2 : 1;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SDValue RegSeq;
+  SDValue V0 = N->getOperand(FirstTblReg + 0);
+  SDValue V1 = N->getOperand(FirstTblReg + 1);
+  if (NumVecs == 2)
+    RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
+  else {
+    SDValue V2 = N->getOperand(FirstTblReg + 2);
+    // If it's a vtbl3, form a quad D-register and leave the last part as
+    // an undef.
+    SDValue V3 = (NumVecs == 3)
+      ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+      : N->getOperand(FirstTblReg + 3);
+    RegSeq = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
+  }
+
+  SmallVector<SDValue, 6> Ops;
+  if (IsExt)
+    Ops.push_back(N->getOperand(1));
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(FirstTblReg + NumVecs));
+  Ops.push_back(getAL(CurDAG)); // predicate
+  Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register
+  return CurDAG->getMachineNode(Opc, dl, VT, Ops.data(), Ops.size());
+}
+
+SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
+                                                     bool isSigned) {
+  if (!Subtarget->hasV6T2Ops())
+    return NULL;
+
+  unsigned Opc = isSigned
+    ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
+    : (Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX);
+
+  // For unsigned extracts, check for a shift right and mask
+  unsigned And_imm = 0;
+  if (N->getOpcode() == ISD::AND) {
+    if (isOpcWithIntImmediate(N, ISD::AND, And_imm)) {
+
+      // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+      if (And_imm & (And_imm + 1))
+        return NULL;
+
+      unsigned Srl_imm = 0;
+      if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL,
+                                Srl_imm)) {
+        assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+
+        // Note: The width operand is encoded as width-1.
+        unsigned Width = CountTrailingOnes_32(And_imm) - 1;
+        unsigned LSB = Srl_imm;
+
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+
+        if ((LSB + Width + 1) == N->getValueType(0).getSizeInBits()) {
+          // It's cheaper to use a right shift to extract the top bits.
+          if (Subtarget->isThumb()) {
+            Opc = isSigned ? ARM::t2ASRri : ARM::t2LSRri;
+            SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                              CurDAG->getTargetConstant(LSB, MVT::i32),
+                              getAL(CurDAG), Reg0, Reg0 };
+            return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+          }
+
+          // ARM models shift instructions as MOVsi with shifter operand.
+          ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(ISD::SRL);
+          SDValue ShOpc =
+            CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, LSB),
+                                      MVT::i32);
+          SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
+                            getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops, 5);
+        }
+
+        SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          CurDAG->getTargetConstant(LSB, MVT::i32),
+                          CurDAG->getTargetConstant(Width, MVT::i32),
+          getAL(CurDAG), Reg0 };
+        return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+      }
+    }
+    return NULL;
+  }
+
+  // Otherwise, we're looking for a shift of a shift
+  unsigned Shl_imm = 0;
+  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+    assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!");
+    unsigned Srl_imm = 0;
+    if (isInt32Immediate(N->getOperand(1), Srl_imm)) {
+      assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+      // Note: The width operand is encoded as width-1.
+      unsigned Width = 32 - Srl_imm - 1;
+      int LSB = Srl_imm - Shl_imm;
+      if (LSB < 0)
+        return NULL;
+      SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                        CurDAG->getTargetConstant(LSB, MVT::i32),
+                        CurDAG->getTargetConstant(Width, MVT::i32),
+                        getAL(CurDAG), Reg0 };
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+    }
+  }
+  return NULL;
+}
+
+SDNode *ARMDAGToDAGISel::
+SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                    ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+  SDValue CPTmp0;
+  SDValue CPTmp1;
+  if (SelectT2ShifterOperandReg(TrueVal, CPTmp0, CPTmp1)) {
+    unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue();
+    unsigned SOShOp = ARM_AM::getSORegShOp(SOVal);
+    unsigned Opc = 0;
+    switch (SOShOp) {
+    case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break;
+    case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break;
+    case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break;
+    case ARM_AM::ror: Opc = ARM::t2MOVCCror; break;
+    default:
+      llvm_unreachable("Unknown so_reg opcode!");
+    }
+    SDValue SOShImm =
+      CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32);
+    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+    SDValue Ops[] = { FalseVal, CPTmp0, SOShImm, CC, CCR, InFlag };
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32,Ops, 6);
+  }
+  return 0;
+}
+
+SDNode *ARMDAGToDAGISel::
+SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                     ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+  SDValue CPTmp0;
+  SDValue CPTmp1;
+  SDValue CPTmp2;
+  if (SelectImmShifterOperand(TrueVal, CPTmp0, CPTmp2)) {
+    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+    SDValue Ops[] = { FalseVal, CPTmp0, CPTmp2, CC, CCR, InFlag };
+    return CurDAG->SelectNodeTo(N, ARM::MOVCCsi, MVT::i32, Ops, 6);
+  }
+
+  if (SelectRegShifterOperand(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
+    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+    SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag };
+    return CurDAG->SelectNodeTo(N, ARM::MOVCCsr, MVT::i32, Ops, 7);
+  }
+  return 0;
+}
+
+SDNode *ARMDAGToDAGISel::
+SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                  ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+  ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
+  if (!T)
+    return 0;
+
+  unsigned Opc = 0;
+  unsigned TrueImm = T->getZExtValue();
+  if (is_t2_so_imm(TrueImm)) {
+    Opc = ARM::t2MOVCCi;
+  } else if (TrueImm <= 0xffff) {
+    Opc = ARM::t2MOVCCi16;
+  } else if (is_t2_so_imm_not(TrueImm)) {
+    TrueImm = ~TrueImm;
+    Opc = ARM::t2MVNCCi;
+  } else if (TrueVal.getNode()->hasOneUse() && Subtarget->hasV6T2Ops()) {
+    // Large immediate.
+    Opc = ARM::t2MOVCCi32imm;
+  }
+
+  if (Opc) {
+    SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
+    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+    SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+  }
+
+  return 0;
+}
+
+SDNode *ARMDAGToDAGISel::
+SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                   ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+  ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
+  if (!T)
+    return 0;
+
+  unsigned Opc = 0;
+  unsigned TrueImm = T->getZExtValue();
+  bool isSoImm = is_so_imm(TrueImm);
+  if (isSoImm) {
+    Opc = ARM::MOVCCi;
+  } else if (Subtarget->hasV6T2Ops() && TrueImm <= 0xffff) {
+    Opc = ARM::MOVCCi16;
+  } else if (is_so_imm_not(TrueImm)) {
+    TrueImm = ~TrueImm;
+    Opc = ARM::MVNCCi;
+  } else if (TrueVal.getNode()->hasOneUse() &&
+             (Subtarget->hasV6T2Ops() || ARM_AM::isSOImmTwoPartVal(TrueImm))) {
+    // Large immediate.
+    Opc = ARM::MOVCCi32imm;
+  }
+
+  if (Opc) {
+    SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
+    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+    SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+  }
+
+  return 0;
+}
+
+SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  SDValue FalseVal = N->getOperand(0);
+  SDValue TrueVal  = N->getOperand(1);
+  SDValue CC = N->getOperand(2);
+  SDValue CCR = N->getOperand(3);
+  SDValue InFlag = N->getOperand(4);
+  assert(CC.getOpcode() == ISD::Constant);
+  assert(CCR.getOpcode() == ISD::Register);
+  ARMCC::CondCodes CCVal =
+    (ARMCC::CondCodes)cast<ConstantSDNode>(CC)->getZExtValue();
+
+  if (!Subtarget->isThumb1Only() && VT == MVT::i32) {
+    // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+    // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+    // Pattern complexity = 18  cost = 1  size = 0
+    if (Subtarget->isThumb()) {
+      SDNode *Res = SelectT2CMOVShiftOp(N, FalseVal, TrueVal,
+                                        CCVal, CCR, InFlag);
+      if (!Res)
+        Res = SelectT2CMOVShiftOp(N, TrueVal, FalseVal,
+                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
+      if (Res)
+        return Res;
+    } else {
+      SDNode *Res = SelectARMCMOVShiftOp(N, FalseVal, TrueVal,
+                                         CCVal, CCR, InFlag);
+      if (!Res)
+        Res = SelectARMCMOVShiftOp(N, TrueVal, FalseVal,
+                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
+      if (Res)
+        return Res;
+    }
+
+    // Pattern: (ARMcmov:i32 GPR:i32:$false,
+    //             (imm:i32)<<P:Pred_so_imm>>:$true,
+    //             (imm:i32):$cc)
+    // Emits: (MOVCCi:i32 GPR:i32:$false,
+    //           (so_imm:i32 (imm:i32):$true), (imm:i32):$cc)
+    // Pattern complexity = 10  cost = 1  size = 0
+    if (Subtarget->isThumb()) {
+      SDNode *Res = SelectT2CMOVImmOp(N, FalseVal, TrueVal,
+                                        CCVal, CCR, InFlag);
+      if (!Res)
+        Res = SelectT2CMOVImmOp(N, TrueVal, FalseVal,
+                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
+      if (Res)
+        return Res;
+    } else {
+      SDNode *Res = SelectARMCMOVImmOp(N, FalseVal, TrueVal,
+                                         CCVal, CCR, InFlag);
+      if (!Res)
+        Res = SelectARMCMOVImmOp(N, TrueVal, FalseVal,
+                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
+      if (Res)
+        return Res;
+    }
+  }
+
+  // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+  // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+  // Pattern complexity = 6  cost = 1  size = 0
+  //
+  // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+  // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+  // Pattern complexity = 6  cost = 11  size = 0
+  //
+  // Also VMOVScc and VMOVDcc.
+  SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32);
+  SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag };
+  unsigned Opc = 0;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Illegal conditional move type!");
+  case MVT::i32:
+    Opc = Subtarget->isThumb()
+      ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo)
+      : ARM::MOVCCr;
+    break;
+  case MVT::f32:
+    Opc = ARM::VMOVScc;
+    break;
+  case MVT::f64:
+    Opc = ARM::VMOVDcc;
+    break;
+  }
+  return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5);
+}
+
+/// Target-specific DAG combining for ISD::XOR.
+/// Target-independent combining lowers SELECT_CC nodes of the form
+/// select_cc setg[ge] X,  0,  X, -X
+/// select_cc setgt    X, -1,  X, -X
+/// select_cc setl[te] X,  0, -X,  X
+/// select_cc setlt    X,  1, -X,  X
+/// which represent Integer ABS into:
+/// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
+/// ARM instruction selection detects the latter and matches it to
+/// ARM::ABS or ARM::t2ABS machine node.
+SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
+  SDValue XORSrc0 = N->getOperand(0);
+  SDValue XORSrc1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  if (Subtarget->isThumb1Only())
+    return NULL;
+
+  if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA)
+    return NULL;
+
+  SDValue ADDSrc0 = XORSrc0.getOperand(0);
+  SDValue ADDSrc1 = XORSrc0.getOperand(1);
+  SDValue SRASrc0 = XORSrc1.getOperand(0);
+  SDValue SRASrc1 = XORSrc1.getOperand(1);
+  ConstantSDNode *SRAConstant =  dyn_cast<ConstantSDNode>(SRASrc1);
+  EVT XType = SRASrc0.getValueType();
+  unsigned Size = XType.getSizeInBits() - 1;
+
+  if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 &&
+      XType.isInteger() && SRAConstant != NULL &&
+      Size == SRAConstant->getZExtValue()) {
+    unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
+    return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+  }
+
+  return NULL;
+}
+
+SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
+  // The only time a CONCAT_VECTORS operation can have legal types is when
+  // two 64-bit vectors are concatenated to a 128-bit vector.
+  EVT VT = N->getValueType(0);
+  if (!VT.is128BitVector() || N->getNumOperands() != 2)
+    llvm_unreachable("unexpected CONCAT_VECTORS");
+  return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1));
+}
+
+SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(Node->getOperand(1)); // Ptr
+  Ops.push_back(Node->getOperand(2)); // Low part of Val1
+  Ops.push_back(Node->getOperand(3)); // High part of Val1
+  if (Opc == ARM::ATOMCMPXCHG6432) {
+    Ops.push_back(Node->getOperand(4)); // Low part of Val2
+    Ops.push_back(Node->getOperand(5)); // High part of Val2
+  }
+  Ops.push_back(Node->getOperand(0)); // Chain
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+                                           MVT::i32, MVT::i32, MVT::Other,
+                                           Ops.data() ,Ops.size());
+  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+  return ResNode;
+}
+
+SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::INLINEASM: {
+    SDNode *ResNode = SelectInlineAsm(N);
+    if (ResNode)
+      return ResNode;
+    break;
+  }
+  case ISD::XOR: {
+    // Select special operations if XOR node forms integer ABS pattern
+    SDNode *ResNode = SelectABSOp(N);
+    if (ResNode)
+      return ResNode;
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::Constant: {
+    unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
+    bool UseCP = true;
+    if (Subtarget->hasThumb2())
+      // Thumb2-aware targets have the MOVT instruction, so all immediates can
+      // be done with MOV + MOVT, at worst.
+      UseCP = 0;
+    else {
+      if (Subtarget->isThumb()) {
+        UseCP = (Val > 255 &&                          // MOV
+                 ~Val > 255 &&                         // MOV + MVN
+                 !ARM_AM::isThumbImmShiftedVal(Val));  // MOV + LSL
+      } else
+        UseCP = (ARM_AM::getSOImmVal(Val) == -1 &&     // MOV
+                 ARM_AM::getSOImmVal(~Val) == -1 &&    // MVN
+                 !ARM_AM::isSOImmTwoPartVal(Val));     // two instrs.
+    }
+
+    if (UseCP) {
+      SDValue CPIdx =
+        CurDAG->getTargetConstantPool(ConstantInt::get(
+                                  Type::getInt32Ty(*CurDAG->getContext()), Val),
+                                      TLI.getPointerTy());
+
+      SDNode *ResNode;
+      if (Subtarget->isThumb1Only()) {
+        SDValue Pred = getAL(CurDAG);
+        SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+        SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
+        ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other,
+                                         Ops, 4);
+      } else {
+        SDValue Ops[] = {
+          CPIdx,
+          CurDAG->getTargetConstant(0, MVT::i32),
+          getAL(CurDAG),
+          CurDAG->getRegister(0, MVT::i32),
+          CurDAG->getEntryNode()
+        };
+        ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+                                       Ops, 5);
+      }
+      ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
+      return NULL;
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::FrameIndex: {
+    // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    if (Subtarget->isThumb1Only()) {
+      SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops, 4);
+    } else {
+      unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
+                      ARM::t2ADDri : ARM::ADDri);
+      SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+    }
+  }
+  case ISD::SRL:
+    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
+      return I;
+    break;
+  case ISD::SRA:
+    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true))
+      return I;
+    break;
+  case ISD::MUL:
+    if (Subtarget->isThumb1Only())
+      break;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+      unsigned RHSV = C->getZExtValue();
+      if (!RHSV) break;
+      if (isPowerOf2_32(RHSV-1)) {  // 2^n+1?
+        unsigned ShImm = Log2_32(RHSV-1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6);
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops, 7);
+        }
+      }
+      if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
+        unsigned ShImm = Log2_32(RHSV+1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 6);
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops, 7);
+        }
+      }
+    }
+    break;
+  case ISD::AND: {
+    // Check for unsigned bitfield extract
+    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
+      return I;
+
+    // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
+    // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
+    // are entirely contributed by c2 and lower 16-bits are entirely contributed
+    // by x. That's equal to (or (and x, 0xffff), (and c1, 0xffff0000)).
+    // Select it to: "movt x, ((c1 & 0xffff) >> 16)
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::i32)
+      break;
+    unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ? ARM::t2MOVTi16
+      : (Subtarget->hasV6T2Ops() ? ARM::MOVTi16 : 0);
+    if (!Opc)
+      break;
+    SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    if (!N1C)
+      break;
+    if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
+      SDValue N2 = N0.getOperand(1);
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+      if (!N2C)
+        break;
+      unsigned N1CVal = N1C->getZExtValue();
+      unsigned N2CVal = N2C->getZExtValue();
+      if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) &&
+          (N1CVal & 0xffffU) == 0xffffU &&
+          (N2CVal & 0xffffU) == 0x0U) {
+        SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16,
+                                                  MVT::i32);
+        SDValue Ops[] = { N0.getOperand(0), Imm16,
+                          getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+        return CurDAG->getMachineNode(Opc, dl, VT, Ops, 4);
+      }
+    }
+    break;
+  }
+  case ARMISD::VMOVRRD:
+    return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
+                                  N->getOperand(0), getAL(CurDAG),
+                                  CurDAG->getRegister(0, MVT::i32));
+  case ISD::UMUL_LOHI: {
+    if (Subtarget->isThumb1Only())
+      break;
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32,Ops,4);
+    } else {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                    ARM::UMULL : ARM::UMULLv5,
+                                    dl, MVT::i32, MVT::i32, Ops, 5);
+    }
+  }
+  case ISD::SMUL_LOHI: {
+    if (Subtarget->isThumb1Only())
+      break;
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32,Ops,4);
+    } else {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                    ARM::SMULL : ARM::SMULLv5,
+                                    dl, MVT::i32, MVT::i32, Ops, 5);
+    }
+  }
+  case ARMISD::UMLAL:{
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32)};
+      return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops, 6);
+    }else{
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                      ARM::UMLAL : ARM::UMLALv5,
+                                      dl, MVT::i32, MVT::i32, Ops, 7);
+    }
+  }
+  case ARMISD::SMLAL:{
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32)};
+      return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops, 6);
+    }else{
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                        N->getOperand(3), getAL(CurDAG),
+                        CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+                                      ARM::SMLAL : ARM::SMLALv5,
+                                      dl, MVT::i32, MVT::i32, Ops, 7);
+    }
+  }
+  case ISD::LOAD: {
+    SDNode *ResNode = 0;
+    if (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ResNode = SelectT2IndexedLoad(N);
+    else
+      ResNode = SelectARMIndexedLoad(N);
+    if (ResNode)
+      return ResNode;
+    // Other cases are autogenerated.
+    break;
+  }
+  case ARMISD::BRCOND: {
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    unsigned Opc = Subtarget->isThumb() ?
+      ((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
+    SDValue Chain = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    SDValue N3 = N->getOperand(3);
+    SDValue InFlag = N->getOperand(4);
+    assert(N1.getOpcode() == ISD::BasicBlock);
+    assert(N2.getOpcode() == ISD::Constant);
+    assert(N3.getOpcode() == ISD::Register);
+
+    SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N2)->getZExtValue()),
+                               MVT::i32);
+    SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag };
+    SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+                                             MVT::Glue, Ops, 5);
+    Chain = SDValue(ResNode, 0);
+    if (N->getNumValues() == 2) {
+      InFlag = SDValue(ResNode, 1);
+      ReplaceUses(SDValue(N, 1), InFlag);
+    }
+    ReplaceUses(SDValue(N, 0),
+                SDValue(Chain.getNode(), Chain.getResNo()));
+    return NULL;
+  }
+  case ARMISD::CMOV:
+    return SelectCMOVOp(N);
+  case ARMISD::VZIP: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return NULL;
+    case MVT::v8i8:  Opc = ARM::VZIPd8; break;
+    case MVT::v4i16: Opc = ARM::VZIPd16; break;
+    case MVT::v2f32:
+    // vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VZIPq8; break;
+    case MVT::v8i16: Opc = ARM::VZIPq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VZIPq32; break;
+    }
+    SDValue Pred = getAL(CurDAG);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4);
+  }
+  case ARMISD::VUZP: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return NULL;
+    case MVT::v8i8:  Opc = ARM::VUZPd8; break;
+    case MVT::v4i16: Opc = ARM::VUZPd16; break;
+    case MVT::v2f32:
+    // vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VUZPq8; break;
+    case MVT::v8i16: Opc = ARM::VUZPq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VUZPq32; break;
+    }
+    SDValue Pred = getAL(CurDAG);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4);
+  }
+  case ARMISD::VTRN: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return NULL;
+    case MVT::v8i8:  Opc = ARM::VTRNd8; break;
+    case MVT::v4i16: Opc = ARM::VTRNd16; break;
+    case MVT::v2f32:
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VTRNq8; break;
+    case MVT::v8i16: Opc = ARM::VTRNq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VTRNq32; break;
+    }
+    SDValue Pred = getAL(CurDAG);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4);
+  }
+  case ARMISD::BUILD_VECTOR: {
+    EVT VecVT = N->getValueType(0);
+    EVT EltVT = VecVT.getVectorElementType();
+    unsigned NumElts = VecVT.getVectorNumElements();
+    if (EltVT == MVT::f64) {
+      assert(NumElts == 2 && "unexpected type for BUILD_VECTOR");
+      return createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+    }
+    assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR");
+    if (NumElts == 2)
+      return createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+    assert(NumElts == 4 && "unexpected type for BUILD_VECTOR");
+    return createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
+                     N->getOperand(2), N->getOperand(3));
+  }
+
+  case ARMISD::VLD2DUP: {
+    static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
+                                        ARM::VLD2DUPd32 };
+    return SelectVLDDup(N, false, 2, Opcodes);
+  }
+
+  case ARMISD::VLD3DUP: {
+    static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
+                                        ARM::VLD3DUPd16Pseudo,
+                                        ARM::VLD3DUPd32Pseudo };
+    return SelectVLDDup(N, false, 3, Opcodes);
+  }
+
+  case ARMISD::VLD4DUP: {
+    static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
+                                        ARM::VLD4DUPd16Pseudo,
+                                        ARM::VLD4DUPd32Pseudo };
+    return SelectVLDDup(N, false, 4, Opcodes);
+  }
+
+  case ARMISD::VLD2DUP_UPD: {
+    static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
+                                        ARM::VLD2DUPd16wb_fixed,
+                                        ARM::VLD2DUPd32wb_fixed };
+    return SelectVLDDup(N, true, 2, Opcodes);
+  }
+
+  case ARMISD::VLD3DUP_UPD: {
+    static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
+                                        ARM::VLD3DUPd16Pseudo_UPD,
+                                        ARM::VLD3DUPd32Pseudo_UPD };
+    return SelectVLDDup(N, true, 3, Opcodes);
+  }
+
+  case ARMISD::VLD4DUP_UPD: {
+    static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
+                                        ARM::VLD4DUPd16Pseudo_UPD,
+                                        ARM::VLD4DUPd32Pseudo_UPD };
+    return SelectVLDDup(N, true, 4, Opcodes);
+  }
+
+  case ARMISD::VLD1_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD1d8wb_fixed,
+                                         ARM::VLD1d16wb_fixed,
+                                         ARM::VLD1d32wb_fixed,
+                                         ARM::VLD1d64wb_fixed };
+    static const uint16_t QOpcodes[] = { ARM::VLD1q8wb_fixed,
+                                         ARM::VLD1q16wb_fixed,
+                                         ARM::VLD1q32wb_fixed,
+                                         ARM::VLD1q64wb_fixed };
+    return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0);
+  }
+
+  case ARMISD::VLD2_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed,
+                                         ARM::VLD2d16wb_fixed,
+                                         ARM::VLD2d32wb_fixed,
+                                         ARM::VLD1q64wb_fixed};
+    static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
+                                         ARM::VLD2q16PseudoWB_fixed,
+                                         ARM::VLD2q32PseudoWB_fixed };
+    return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0);
+  }
+
+  case ARMISD::VLD3_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo_UPD,
+                                         ARM::VLD3d16Pseudo_UPD,
+                                         ARM::VLD3d32Pseudo_UPD,
+                                         ARM::VLD1q64wb_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+                                          ARM::VLD3q16Pseudo_UPD,
+                                          ARM::VLD3q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
+                                          ARM::VLD3q16oddPseudo_UPD,
+                                          ARM::VLD3q32oddPseudo_UPD };
+    return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VLD4_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD,
+                                         ARM::VLD4d16Pseudo_UPD,
+                                         ARM::VLD4d32Pseudo_UPD,
+                                         ARM::VLD1q64wb_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+                                          ARM::VLD4q16Pseudo_UPD,
+                                          ARM::VLD4q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
+                                          ARM::VLD4q16oddPseudo_UPD,
+                                          ARM::VLD4q32oddPseudo_UPD };
+    return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VLD2LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD,
+                                         ARM::VLD2LNd16Pseudo_UPD,
+                                         ARM::VLD2LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
+                                         ARM::VLD2LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VLD3LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD,
+                                         ARM::VLD3LNd16Pseudo_UPD,
+                                         ARM::VLD3LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
+                                         ARM::VLD3LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VLD4LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD,
+                                         ARM::VLD4LNd16Pseudo_UPD,
+                                         ARM::VLD4LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
+                                         ARM::VLD4LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST1_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST1d8wb_fixed,
+                                         ARM::VST1d16wb_fixed,
+                                         ARM::VST1d32wb_fixed,
+                                         ARM::VST1d64wb_fixed };
+    static const uint16_t QOpcodes[] = { ARM::VST1q8wb_fixed,
+                                         ARM::VST1q16wb_fixed,
+                                         ARM::VST1q32wb_fixed,
+                                         ARM::VST1q64wb_fixed };
+    return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0);
+  }
+
+  case ARMISD::VST2_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed,
+                                         ARM::VST2d16wb_fixed,
+                                         ARM::VST2d32wb_fixed,
+                                         ARM::VST1q64wb_fixed};
+    static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
+                                         ARM::VST2q16PseudoWB_fixed,
+                                         ARM::VST2q32PseudoWB_fixed };
+    return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0);
+  }
+
+  case ARMISD::VST3_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo_UPD,
+                                         ARM::VST3d16Pseudo_UPD,
+                                         ARM::VST3d32Pseudo_UPD,
+                                         ARM::VST1d64TPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+                                          ARM::VST3q16Pseudo_UPD,
+                                          ARM::VST3q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
+                                          ARM::VST3q16oddPseudo_UPD,
+                                          ARM::VST3q32oddPseudo_UPD };
+    return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VST4_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD,
+                                         ARM::VST4d16Pseudo_UPD,
+                                         ARM::VST4d32Pseudo_UPD,
+                                         ARM::VST1d64QPseudoWB_fixed};
+    static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+                                          ARM::VST4q16Pseudo_UPD,
+                                          ARM::VST4q32Pseudo_UPD };
+    static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
+                                          ARM::VST4q16oddPseudo_UPD,
+                                          ARM::VST4q32oddPseudo_UPD };
+    return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+  }
+
+  case ARMISD::VST2LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD,
+                                         ARM::VST2LNd16Pseudo_UPD,
+                                         ARM::VST2LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
+                                         ARM::VST2LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST3LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD,
+                                         ARM::VST3LNd16Pseudo_UPD,
+                                         ARM::VST3LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
+                                         ARM::VST3LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+  }
+
+  case ARMISD::VST4LN_UPD: {
+    static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD,
+                                         ARM::VST4LNd16Pseudo_UPD,
+                                         ARM::VST4LNd32Pseudo_UPD };
+    static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
+                                         ARM::VST4LNq32Pseudo_UPD };
+    return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+  }
+
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+
+    case Intrinsic::arm_ldrexd: {
+      SDValue MemAddr = N->getOperand(2);
+      DebugLoc dl = N->getDebugLoc();
+      SDValue Chain = N->getOperand(0);
+
+      bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+      unsigned NewOpc = isThumb ? ARM::t2LDREXD :ARM::LDREXD;
+
+      // arm_ldrexd returns a i64 value in {i32, i32}
+      std::vector<EVT> ResTys;
+      if (isThumb) {
+        ResTys.push_back(MVT::i32);
+        ResTys.push_back(MVT::i32);
+      } else
+        ResTys.push_back(MVT::Untyped);
+      ResTys.push_back(MVT::Other);
+
+      // Place arguments in the right order.
+      SmallVector<SDValue, 7> Ops;
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+      SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(),
+                                          Ops.size());
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+
+      // Remap uses.
+      SDValue OutChain = isThumb ? SDValue(Ld, 2) : SDValue(Ld, 1);
+      if (!SDValue(N, 0).use_empty()) {
+        SDValue Result;
+        if (isThumb)
+          Result = SDValue(Ld, 0);
+        else {
+          SDValue SubRegIdx = CurDAG->getTargetConstant(ARM::gsub_0, MVT::i32);
+          SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+              dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+          Result = SDValue(ResNode,0);
+        }
+        ReplaceUses(SDValue(N, 0), Result);
+      }
+      if (!SDValue(N, 1).use_empty()) {
+        SDValue Result;
+        if (isThumb)
+          Result = SDValue(Ld, 1);
+        else {
+          SDValue SubRegIdx = CurDAG->getTargetConstant(ARM::gsub_1, MVT::i32);
+          SDNode *ResNode = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+              dl, MVT::i32, SDValue(Ld, 0), SubRegIdx);
+          Result = SDValue(ResNode,0);
+        }
+        ReplaceUses(SDValue(N, 1), Result);
+      }
+      ReplaceUses(SDValue(N, 2), OutChain);
+      return NULL;
+    }
+
+    case Intrinsic::arm_strexd: {
+      DebugLoc dl = N->getDebugLoc();
+      SDValue Chain = N->getOperand(0);
+      SDValue Val0 = N->getOperand(2);
+      SDValue Val1 = N->getOperand(3);
+      SDValue MemAddr = N->getOperand(4);
+
+      // Store exclusive double return a i32 value which is the return status
+      // of the issued store.
+      EVT ResTys[] = { MVT::i32, MVT::Other };
+
+      bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+      // Place arguments in the right order.
+      SmallVector<SDValue, 7> Ops;
+      if (isThumb) {
+        Ops.push_back(Val0);
+        Ops.push_back(Val1);
+      } else
+        // arm_strexd uses GPRPair.
+        Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, Val0, Val1), 0));
+      Ops.push_back(MemAddr);
+      Ops.push_back(getAL(CurDAG));
+      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      Ops.push_back(Chain);
+
+      unsigned NewOpc = isThumb ? ARM::t2STREXD : ARM::STREXD;
+
+      SDNode *St = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops.data(),
+                                          Ops.size());
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+      return St;
+    }
+
+    case Intrinsic::arm_neon_vld1: {
+      static const uint16_t DOpcodes[] = { ARM::VLD1d8, ARM::VLD1d16,
+                                           ARM::VLD1d32, ARM::VLD1d64 };
+      static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
+                                           ARM::VLD1q32, ARM::VLD1q64};
+      return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0);
+    }
+
+    case Intrinsic::arm_neon_vld2: {
+      static const uint16_t DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
+                                           ARM::VLD2d32, ARM::VLD1q64 };
+      static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
+                                           ARM::VLD2q32Pseudo };
+      return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0);
+    }
+
+    case Intrinsic::arm_neon_vld3: {
+      static const uint16_t DOpcodes[] = { ARM::VLD3d8Pseudo,
+                                           ARM::VLD3d16Pseudo,
+                                           ARM::VLD3d32Pseudo,
+                                           ARM::VLD1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+                                            ARM::VLD3q16Pseudo_UPD,
+                                            ARM::VLD3q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo,
+                                            ARM::VLD3q16oddPseudo,
+                                            ARM::VLD3q32oddPseudo };
+      return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vld4: {
+      static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo,
+                                           ARM::VLD4d16Pseudo,
+                                           ARM::VLD4d32Pseudo,
+                                           ARM::VLD1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+                                            ARM::VLD4q16Pseudo_UPD,
+                                            ARM::VLD4q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo,
+                                            ARM::VLD4q16oddPseudo,
+                                            ARM::VLD4q32oddPseudo };
+      return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vld2lane: {
+      static const uint16_t DOpcodes[] = { ARM::VLD2LNd8Pseudo,
+                                           ARM::VLD2LNd16Pseudo,
+                                           ARM::VLD2LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo,
+                                           ARM::VLD2LNq32Pseudo };
+      return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vld3lane: {
+      static const uint16_t DOpcodes[] = { ARM::VLD3LNd8Pseudo,
+                                           ARM::VLD3LNd16Pseudo,
+                                           ARM::VLD3LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo,
+                                           ARM::VLD3LNq32Pseudo };
+      return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vld4lane: {
+      static const uint16_t DOpcodes[] = { ARM::VLD4LNd8Pseudo,
+                                           ARM::VLD4LNd16Pseudo,
+                                           ARM::VLD4LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo,
+                                           ARM::VLD4LNq32Pseudo };
+      return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vst1: {
+      static const uint16_t DOpcodes[] = { ARM::VST1d8, ARM::VST1d16,
+                                           ARM::VST1d32, ARM::VST1d64 };
+      static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
+                                           ARM::VST1q32, ARM::VST1q64 };
+      return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0);
+    }
+
+    case Intrinsic::arm_neon_vst2: {
+      static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
+                                           ARM::VST2d32, ARM::VST1q64 };
+      static uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+                                     ARM::VST2q32Pseudo };
+      return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0);
+    }
+
+    case Intrinsic::arm_neon_vst3: {
+      static const uint16_t DOpcodes[] = { ARM::VST3d8Pseudo,
+                                           ARM::VST3d16Pseudo,
+                                           ARM::VST3d32Pseudo,
+                                           ARM::VST1d64TPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+                                            ARM::VST3q16Pseudo_UPD,
+                                            ARM::VST3q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
+                                            ARM::VST3q16oddPseudo,
+                                            ARM::VST3q32oddPseudo };
+      return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vst4: {
+      static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo,
+                                           ARM::VST4d16Pseudo,
+                                           ARM::VST4d32Pseudo,
+                                           ARM::VST1d64QPseudo };
+      static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+                                            ARM::VST4q16Pseudo_UPD,
+                                            ARM::VST4q32Pseudo_UPD };
+      static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
+                                            ARM::VST4q16oddPseudo,
+                                            ARM::VST4q32oddPseudo };
+      return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vst2lane: {
+      static const uint16_t DOpcodes[] = { ARM::VST2LNd8Pseudo,
+                                           ARM::VST2LNd16Pseudo,
+                                           ARM::VST2LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
+                                           ARM::VST2LNq32Pseudo };
+      return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vst3lane: {
+      static const uint16_t DOpcodes[] = { ARM::VST3LNd8Pseudo,
+                                           ARM::VST3LNd16Pseudo,
+                                           ARM::VST3LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
+                                           ARM::VST3LNq32Pseudo };
+      return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+    }
+
+    case Intrinsic::arm_neon_vst4lane: {
+      static const uint16_t DOpcodes[] = { ARM::VST4LNd8Pseudo,
+                                           ARM::VST4LNd16Pseudo,
+                                           ARM::VST4LNd32Pseudo };
+      static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
+                                           ARM::VST4LNq32Pseudo };
+      return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+    }
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+
+    case Intrinsic::arm_neon_vtbl2:
+      return SelectVTBL(N, false, 2, ARM::VTBL2);
+    case Intrinsic::arm_neon_vtbl3:
+      return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+    case Intrinsic::arm_neon_vtbl4:
+      return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+
+    case Intrinsic::arm_neon_vtbx2:
+      return SelectVTBL(N, true, 2, ARM::VTBX2);
+    case Intrinsic::arm_neon_vtbx3:
+      return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+    case Intrinsic::arm_neon_vtbx4:
+      return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+    }
+    break;
+  }
+
+  case ARMISD::VTBL1: {
+    DebugLoc dl = N->getDebugLoc();
+    EVT VT = N->getValueType(0);
+    SmallVector<SDValue, 6> Ops;
+
+    Ops.push_back(N->getOperand(0));
+    Ops.push_back(N->getOperand(1));
+    Ops.push_back(getAL(CurDAG));                    // Predicate
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
+    return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops.data(), Ops.size());
+  }
+  case ARMISD::VTBL2: {
+    DebugLoc dl = N->getDebugLoc();
+    EVT VT = N->getValueType(0);
+
+    // Form a REG_SEQUENCE to force register allocation.
+    SDValue V0 = N->getOperand(0);
+    SDValue V1 = N->getOperand(1);
+    SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
+
+    SmallVector<SDValue, 6> Ops;
+    Ops.push_back(RegSeq);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(getAL(CurDAG));                    // Predicate
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
+    return CurDAG->getMachineNode(ARM::VTBL2, dl, VT,
+                                  Ops.data(), Ops.size());
+  }
+
+  case ISD::CONCAT_VECTORS:
+    return SelectConcatVector(N);
+
+  case ARMISD::ATOMOR64_DAG:
+    return SelectAtomic64(N, ARM::ATOMOR6432);
+  case ARMISD::ATOMXOR64_DAG:
+    return SelectAtomic64(N, ARM::ATOMXOR6432);
+  case ARMISD::ATOMADD64_DAG:
+    return SelectAtomic64(N, ARM::ATOMADD6432);
+  case ARMISD::ATOMSUB64_DAG:
+    return SelectAtomic64(N, ARM::ATOMSUB6432);
+  case ARMISD::ATOMNAND64_DAG:
+    return SelectAtomic64(N, ARM::ATOMNAND6432);
+  case ARMISD::ATOMAND64_DAG:
+    return SelectAtomic64(N, ARM::ATOMAND6432);
+  case ARMISD::ATOMSWAP64_DAG:
+    return SelectAtomic64(N, ARM::ATOMSWAP6432);
+  case ARMISD::ATOMCMPXCHG64_DAG:
+    return SelectAtomic64(N, ARM::ATOMCMPXCHG6432);
+
+  case ARMISD::ATOMMIN64_DAG:
+    return SelectAtomic64(N, ARM::ATOMMIN6432);
+  case ARMISD::ATOMUMIN64_DAG:
+    return SelectAtomic64(N, ARM::ATOMUMIN6432);
+  case ARMISD::ATOMMAX64_DAG:
+    return SelectAtomic64(N, ARM::ATOMMAX6432);
+  case ARMISD::ATOMUMAX64_DAG:
+    return SelectAtomic64(N, ARM::ATOMUMAX6432);
+  }
+
+  return SelectCode(N);
+}
+
+SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
+  std::vector<SDValue> AsmNodeOperands;
+  unsigned Flag, Kind;
+  bool Changed = false;
+  unsigned NumOps = N->getNumOperands();
+
+  ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(
+      N->getOperand(InlineAsm::Op_AsmString));
+  StringRef AsmString = StringRef(S->getSymbol());
+
+  // Normally, i64 data is bounded to two arbitrary GRPs for "%r" constraint.
+  // However, some instrstions (e.g. ldrexd/strexd in ARM mode) require
+  // (even/even+1) GPRs and use %n and %Hn to refer to the individual regs
+  // respectively. Since there is no constraint to explicitly specify a
+  // reg pair, we search %H operand inside the asm string. If it is found, the
+  // transformation below enforces a GPRPair reg class for "%r" for 64-bit data.
+  if (AsmString.find(":H}") == StringRef::npos)
+    return NULL;
+
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Glue = N->getOperand(NumOps-1);
+
+  // Glue node will be appended late.
+  for(unsigned i = 0; i < NumOps -1; ++i) {
+    SDValue op = N->getOperand(i);
+    AsmNodeOperands.push_back(op);
+
+    if (i < InlineAsm::Op_FirstOperand)
+      continue;
+
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) {
+      Flag = C->getZExtValue();
+      Kind = InlineAsm::getKind(Flag);
+    }
+    else
+      continue;
+
+    if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
+        && Kind != InlineAsm::Kind_RegDefEarlyClobber)
+      continue;
+
+    unsigned RegNum = InlineAsm::getNumOperandRegisters(Flag);
+    unsigned RC;
+    bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
+    if (!HasRC || RC != ARM::GPRRegClassID || RegNum != 2)
+      continue;
+
+    assert((i+2 < NumOps-1) && "Invalid number of operands in inline asm");
+    SDValue V0 = N->getOperand(i+1);
+    SDValue V1 = N->getOperand(i+2);
+    unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
+    unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
+    SDValue PairedReg;
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+
+    if (Kind == InlineAsm::Kind_RegDef ||
+        Kind == InlineAsm::Kind_RegDefEarlyClobber) {
+      // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to
+      // the original GPRs.
+
+      unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+      PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
+      SDValue Chain = SDValue(N,0);
+
+      SDNode *GU = N->getGluedUser();
+      SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::Untyped,
+                                               Chain.getValue(1));
+
+      // Extract values from a GPRPair reg and copy to the original GPR reg.
+      SDValue Sub0 = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32,
+                                                    RegCopy);
+      SDValue Sub1 = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32,
+                                                    RegCopy);
+      SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0,
+                                        RegCopy.getValue(1));
+      SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1));
+
+      // Update the original glue user.
+      std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
+      Ops.push_back(T1.getValue(1));
+      CurDAG->UpdateNodeOperands(GU, &Ops[0], Ops.size());
+      GU = T1.getNode();
+    }
+    else {
+      // For Kind  == InlineAsm::Kind_RegUse, we first copy two GPRs into a
+      // GPRPair and then pass the GPRPair to the inline asm.
+      SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain];
+
+      // As REG_SEQ doesn't take RegisterSDNode, we copy them first.
+      SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32,
+                                          Chain.getValue(1));
+      SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32,
+                                          T0.getValue(1));
+      SDValue Pair = SDValue(createGPRPairNode(MVT::Untyped, T0, T1), 0);
+
+      // Copy REG_SEQ into a GPRPair-typed VR and replace the original two
+      // i32 VRs of inline asm with it.
+      unsigned GPVR = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+      PairedReg = CurDAG->getRegister(GPVR, MVT::Untyped);
+      Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1));
+
+      AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+      Glue = Chain.getValue(1);
+    }
+
+    Changed = true;
+
+    if(PairedReg.getNode()) {
+      Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
+      Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
+      // Replace the current flag.
+      AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
+          Flag, MVT::i32);
+      // Add the new register node and skip the original two GPRs.
+      AsmNodeOperands.push_back(PairedReg);
+      // Skip the next two GPRs.
+      i += 2;
+    }
+  }
+
+  AsmNodeOperands.push_back(Glue);
+  if (!Changed)
+    return NULL;
+
+  SDValue New = CurDAG->getNode(ISD::INLINEASM, N->getDebugLoc(),
+      CurDAG->getVTList(MVT::Other, MVT::Glue), &AsmNodeOperands[0],
+                        AsmNodeOperands.size());
+  New->setNodeId(-1);
+  return New.getNode();
+}
+
+
+bool ARMDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+  // Require the address to be in a register.  That is safe for all ARM
+  // variants and it is hard to do anything much smarter without knowing
+  // how the operand is used.
+  OutOps.push_back(Op);
+  return false;
+}
+
+/// createARMISelDag - This pass converts a legalized DAG into a
+/// ARM-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel) {
+  return new ARMDAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
new file mode 100644
index 000000000000..bb26090d2d8d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -0,0 +1,10445 @@
+//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-isel"
+#include "ARMISelLowering.h"
+#include "ARM.h"
+#include "ARMCallingConv.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMPerfectShuffle.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
+STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
+
+// This option should go away when tail calls fully work.
+static cl::opt<bool>
+EnableARMTailCalls("arm-tail-calls", cl::Hidden,
+  cl::desc("Generate tail calls (TEMPORARY OPTION)."),
+  cl::init(false));
+
+cl::opt<bool>
+EnableARMLongCalls("arm-long-calls", cl::Hidden,
+  cl::desc("Generate calls via indirect call instructions"),
+  cl::init(false));
+
+static cl::opt<bool>
+ARMInterworking("arm-interworking", cl::Hidden,
+  cl::desc("Enable / disable ARM interworking (for debugging only)"),
+  cl::init(true));
+
+namespace {
+  class ARMCCState : public CCState {
+  public:
+    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+               const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs,
+               LLVMContext &C, ParmContext PC)
+        : CCState(CC, isVarArg, MF, TM, locs, C) {
+      assert(((PC == Call) || (PC == Prologue)) &&
+             "ARMCCState users must specify whether their context is call"
+             "or prologue generation.");
+      CallOrPrologue = PC;
+    }
+  };
+}
+
+// The APCS parameter registers.
+static const uint16_t GPRArgRegs[] = {
+  ARM::R0, ARM::R1, ARM::R2, ARM::R3
+};
+
+void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
+                                       MVT PromotedBitwiseVT) {
+  if (VT != PromotedLdStVT) {
+    setOperationAction(ISD::LOAD, VT, Promote);
+    AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
+
+    setOperationAction(ISD::STORE, VT, Promote);
+    AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
+  }
+
+  MVT ElemTy = VT.getVectorElementType();
+  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
+    setOperationAction(ISD::SETCC, VT, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+  if (ElemTy == MVT::i32) {
+    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+  } else {
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+  }
+  setOperationAction(ISD::BUILD_VECTOR,      VT, Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE,    VT, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS,    VT, Legal);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+  setOperationAction(ISD::SELECT,            VT, Expand);
+  setOperationAction(ISD::SELECT_CC,         VT, Expand);
+  setOperationAction(ISD::VSELECT,           VT, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+  if (VT.isInteger()) {
+    setOperationAction(ISD::SHL, VT, Custom);
+    setOperationAction(ISD::SRA, VT, Custom);
+    setOperationAction(ISD::SRL, VT, Custom);
+  }
+
+  // Promote all bit-wise operations.
+  if (VT.isInteger() && VT != PromotedBitwiseVT) {
+    setOperationAction(ISD::AND, VT, Promote);
+    AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT);
+    setOperationAction(ISD::OR,  VT, Promote);
+    AddPromotedToType (ISD::OR,  VT, PromotedBitwiseVT);
+    setOperationAction(ISD::XOR, VT, Promote);
+    AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT);
+  }
+
+  // Neon does not support vector divide/remainder operations.
+  setOperationAction(ISD::SDIV, VT, Expand);
+  setOperationAction(ISD::UDIV, VT, Expand);
+  setOperationAction(ISD::FDIV, VT, Expand);
+  setOperationAction(ISD::SREM, VT, Expand);
+  setOperationAction(ISD::UREM, VT, Expand);
+  setOperationAction(ISD::FREM, VT, Expand);
+}
+
+void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &ARM::DPRRegClass);
+  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
+}
+
+void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &ARM::QPRRegClass);
+  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
+}
+
+static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
+  if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
+    return new TargetLoweringObjectFileMachO();
+
+  return new ARMElfTargetObjectFile();
+}
+
+ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
+    : TargetLowering(TM, createTLOF(TM)) {
+  Subtarget = &TM.getSubtarget<ARMSubtarget>();
+  RegInfo = TM.getRegisterInfo();
+  Itins = TM.getInstrItineraryData();
+
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  if (Subtarget->isTargetDarwin()) {
+    // Uses VFP for Thumb libfuncs if available.
+    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
+      // Single-precision floating-point arithmetic.
+      setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
+      setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
+      setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
+      setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
+
+      // Double-precision floating-point arithmetic.
+      setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
+      setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
+      setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
+      setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
+
+      // Single-precision comparisons.
+      setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
+      setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
+      setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
+      setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
+      setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
+      setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
+      setLibcallName(RTLIB::UO_F32,  "__unordsf2vfp");
+      setLibcallName(RTLIB::O_F32,   "__unordsf2vfp");
+
+      setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
+      setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
+
+      // Double-precision comparisons.
+      setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
+      setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
+      setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
+      setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
+      setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
+      setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
+      setLibcallName(RTLIB::UO_F64,  "__unorddf2vfp");
+      setLibcallName(RTLIB::O_F64,   "__unorddf2vfp");
+
+      setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
+      setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
+
+      // Floating-point to integer conversions.
+      // i64 conversions are done via library routines even when generating VFP
+      // instructions, so use the same ones.
+      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
+      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
+      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
+      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
+
+      // Conversions between floating types.
+      setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
+      setLibcallName(RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp");
+
+      // Integer to floating-point conversions.
+      // i64 conversions are done via library routines even when generating VFP
+      // instructions, so use the same ones.
+      // FIXME: There appears to be some naming inconsistency in ARM libgcc:
+      // e.g., __floatunsidf vs. __floatunssidfvfp.
+      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
+      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
+      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
+      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
+    }
+  }
+
+  // These libcalls are not available in 32-bit.
+  setLibcallName(RTLIB::SHL_I128, 0);
+  setLibcallName(RTLIB::SRL_I128, 0);
+  setLibcallName(RTLIB::SRA_I128, 0);
+
+  if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetDarwin()) {
+    // Double-precision floating-point arithmetic helper functions
+    // RTABI chapter 4.1.2, Table 2
+    setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
+    setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv");
+    setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul");
+    setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub");
+    setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS);
+
+    // Double-precision floating-point comparison helper functions
+    // RTABI chapter 4.1.2, Table 3
+    setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq");
+    setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
+    setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq");
+    setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ);
+    setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt");
+    setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
+    setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple");
+    setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
+    setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge");
+    setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
+    setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt");
+    setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
+    setLibcallName(RTLIB::UO_F64,  "__aeabi_dcmpun");
+    setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
+    setLibcallName(RTLIB::O_F64,   "__aeabi_dcmpun");
+    setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
+    setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS);
+
+    // Single-precision floating-point arithmetic helper functions
+    // RTABI chapter 4.1.2, Table 4
+    setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd");
+    setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv");
+    setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul");
+    setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub");
+    setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS);
+
+    // Single-precision floating-point comparison helper functions
+    // RTABI chapter 4.1.2, Table 5
+    setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq");
+    setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
+    setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq");
+    setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ);
+    setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt");
+    setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
+    setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple");
+    setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
+    setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge");
+    setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
+    setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt");
+    setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
+    setLibcallName(RTLIB::UO_F32,  "__aeabi_fcmpun");
+    setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
+    setLibcallName(RTLIB::O_F32,   "__aeabi_fcmpun");
+    setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
+    setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS);
+
+    // Floating-point to integer conversions.
+    // RTABI chapter 4.1.2, Table 6
+    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz");
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz");
+    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz");
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz");
+    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz");
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz");
+    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz");
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz");
+    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS);
+
+    // Conversions between floating types.
+    // RTABI chapter 4.1.2, Table 7
+    setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f");
+    setLibcallName(RTLIB::FPEXT_F32_F64,   "__aeabi_f2d");
+    setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS);
+
+    // Integer to floating-point conversions.
+    // RTABI chapter 4.1.2, Table 8
+    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d");
+    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d");
+    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d");
+    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d");
+    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f");
+    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f");
+    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f");
+    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f");
+    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
+
+    // Long long helper functions
+    // RTABI chapter 4.2, Table 9
+    setLibcallName(RTLIB::MUL_I64,  "__aeabi_lmul");
+    setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl");
+    setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr");
+    setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr");
+    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS);
+
+    // Integer division functions
+    // RTABI chapter 4.3.1
+    setLibcallName(RTLIB::SDIV_I8,  "__aeabi_idiv");
+    setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv");
+    setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv");
+    setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod");
+    setLibcallName(RTLIB::UDIV_I8,  "__aeabi_uidiv");
+    setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv");
+    setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv");
+    setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod");
+    setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
+
+    // Memory operations
+    // RTABI chapter 4.3.4
+    setLibcallName(RTLIB::MEMCPY,  "__aeabi_memcpy");
+    setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove");
+    setLibcallName(RTLIB::MEMSET,  "__aeabi_memset");
+    setLibcallCallingConv(RTLIB::MEMCPY, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::MEMMOVE, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::MEMSET, CallingConv::ARM_AAPCS);
+  }
+
+  // Use divmod compiler-rt calls for iOS 5.0 and later.
+  if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
+      !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
+    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  }
+
+  if (Subtarget->isThumb1Only())
+    addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
+  else
+    addRegisterClass(MVT::i32, &ARM::GPRRegClass);
+  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    addRegisterClass(MVT::f32, &ARM::SPRRegClass);
+    if (!Subtarget->isFPOnlySP())
+      addRegisterClass(MVT::f64, &ARM::DPRRegClass);
+
+    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  }
+
+  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+      setTruncStoreAction((MVT::SimpleValueType)VT,
+                          (MVT::SimpleValueType)InnerVT, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+  }
+
+  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
+
+  if (Subtarget->hasNEON()) {
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
+
+    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
+    // neither Neon nor VFP support any arithmetic operations on it.
+    // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
+    // supported for v4f32.
+    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+    // FIXME: Code duplication: FDIV and FREM are expanded always, see
+    // ARMTargetLowering::addTypeForNEON method for details.
+    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
+    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+    // FIXME: Create unittest.
+    // In another words, find a way when "copysign" appears in DAG with vector
+    // operands.
+    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
+    // FIXME: Code duplication: SETCC has custom operation action, see
+    // ARMTargetLowering::addTypeForNEON method for details.
+    setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
+    // FIXME: Create unittest for FNEG and for FABS.
+    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
+    // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
+    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMA, MVT::v2f64, Expand);
+
+    setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
+    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
+
+    // Mark v2f32 intrinsics.
+    setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
+
+    // Neon does not support some operations on v1i64 and v2i64 types.
+    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+    // Custom handling for some quad-vector types to detect VMULL.
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+    // Custom handling for some vector types to avoid expensive expansions
+    setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
+    setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
+    setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
+    setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
+    setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
+    // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
+    // a destination type that is wider than the source, and nor does
+    // it have a FP_TO_[SU]INT instruction with a narrower destination than
+    // source.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
+
+    setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
+    setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
+
+    // Custom expand long extensions to vectors.
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64,  Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
+
+    // NEON does not have single instruction CTPOP for vectors with element
+    // types wider than 8-bits.  However, custom lowering can leverage the
+    // v8i8/v16i8 vcnt instruction.
+    setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
+
+    // NEON only has FMA instructions as of VFP4.
+    if (!Subtarget->hasVFP4()) {
+      setOperationAction(ISD::FMA, MVT::v2f32, Expand);
+      setOperationAction(ISD::FMA, MVT::v4f32, Expand);
+    }
+
+    setTargetDAGCombine(ISD::INTRINSIC_VOID);
+    setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+    setTargetDAGCombine(ISD::SHL);
+    setTargetDAGCombine(ISD::SRL);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::SIGN_EXTEND);
+    setTargetDAGCombine(ISD::ZERO_EXTEND);
+    setTargetDAGCombine(ISD::ANY_EXTEND);
+    setTargetDAGCombine(ISD::SELECT_CC);
+    setTargetDAGCombine(ISD::BUILD_VECTOR);
+    setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+    setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::STORE);
+    setTargetDAGCombine(ISD::FP_TO_SINT);
+    setTargetDAGCombine(ISD::FP_TO_UINT);
+    setTargetDAGCombine(ISD::FDIV);
+
+    // It is legal to extload from v4i8 to v4i16 or v4i32.
+    MVT Tys[6] = {MVT::v8i8, MVT::v4i8, MVT::v2i8,
+                  MVT::v4i16, MVT::v2i16,
+                  MVT::v2i32};
+    for (unsigned i = 0; i < 6; ++i) {
+      setLoadExtAction(ISD::EXTLOAD, Tys[i], Legal);
+      setLoadExtAction(ISD::ZEXTLOAD, Tys[i], Legal);
+      setLoadExtAction(ISD::SEXTLOAD, Tys[i], Legal);
+    }
+  }
+
+  // ARM and Thumb2 support UMLAL/SMLAL.
+  if (!Subtarget->isThumb1Only())
+    setTargetDAGCombine(ISD::ADDC);
+
+
+  computeRegisterProperties();
+
+  // ARM does not have f32 extending load.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+  // ARM does not have i1 sign extending load.
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  // ARM supports all 4 flavors of integer indexed load / store.
+  if (!Subtarget->isThumb1Only()) {
+    for (unsigned im = (unsigned)ISD::PRE_INC;
+         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+      setIndexedLoadAction(im,  MVT::i1,  Legal);
+      setIndexedLoadAction(im,  MVT::i8,  Legal);
+      setIndexedLoadAction(im,  MVT::i16, Legal);
+      setIndexedLoadAction(im,  MVT::i32, Legal);
+      setIndexedStoreAction(im, MVT::i1,  Legal);
+      setIndexedStoreAction(im, MVT::i8,  Legal);
+      setIndexedStoreAction(im, MVT::i16, Legal);
+      setIndexedStoreAction(im, MVT::i32, Legal);
+    }
+  }
+
+  // i64 operation support.
+  setOperationAction(ISD::MUL,     MVT::i64, Expand);
+  setOperationAction(ISD::MULHU,   MVT::i32, Expand);
+  if (Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  }
+  if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
+      || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP()))
+    setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL,       MVT::i64, Custom);
+  setOperationAction(ISD::SRA,       MVT::i64, Custom);
+
+  if (!Subtarget->isThumb1Only()) {
+    // FIXME: We should do this for Thumb1 as well.
+    setOperationAction(ISD::ADDC,    MVT::i32, Custom);
+    setOperationAction(ISD::ADDE,    MVT::i32, Custom);
+    setOperationAction(ISD::SUBC,    MVT::i32, Custom);
+    setOperationAction(ISD::SUBE,    MVT::i32, Custom);
+  }
+
+  // ARM does not have ROTL.
+  setOperationAction(ISD::ROTL,  MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
+    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+
+  // These just redirect to CTTZ and CTLZ on ARM.
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i32  , Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i32  , Expand);
+
+  // Only ARMv6 has BSWAP.
+  if (!Subtarget->hasV6Ops())
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+  if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
+      !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
+    // These are expanded into libcalls if the cpu doesn't have HW divider.
+    setOperationAction(ISD::SDIV,  MVT::i32, Expand);
+    setOperationAction(ISD::UDIV,  MVT::i32, Expand);
+  }
+  setOperationAction(ISD::SREM,  MVT::i32, Expand);
+  setOperationAction(ISD::UREM,  MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
+  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,              MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,              MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+
+  if (!Subtarget->isTargetDarwin()) {
+    // Non-Darwin platforms may return values in these registers via the
+    // personality function.
+    setOperationAction(ISD::EHSELECTION,      MVT::i32,   Expand);
+    setOperationAction(ISD::EXCEPTIONADDR,    MVT::i32,   Expand);
+    setExceptionPointerRegister(ARM::R0);
+    setExceptionSelectorRegister(ARM::R1);
+  }
+
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+  // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
+  // the default expansion.
+  // FIXME: This should be checking for v6k, not just v6.
+  if (Subtarget->hasDataBarrier() ||
+      (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
+    // membarrier needs custom lowering; the rest are legal and handled
+    // normally.
+    setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
+    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+    // Custom lowering for 64-bit ops
+    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
+    // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
+    setInsertFencesForAtomic(true);
+  } else {
+    // Set them all for expansion, which will force libcalls.
+    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
+    // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
+    // Unordered/Monotonic case.
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
+    // Since the libcalls include locking, fold in the fences
+    setShouldFoldAtomicFences(true);
+  }
+
+  setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
+
+  // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
+  if (!Subtarget->hasV6Ops()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
+  }
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
+    // iff target supports vfp2.
+    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  if (Subtarget->isTargetDarwin()) {
+    setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+    setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
+  }
+
+  setOperationAction(ISD::SETCC,     MVT::i32, Expand);
+  setOperationAction(ISD::SETCC,     MVT::f32, Expand);
+  setOperationAction(ISD::SETCC,     MVT::f64, Expand);
+  setOperationAction(ISD::SELECT,    MVT::i32, Custom);
+  setOperationAction(ISD::SELECT,    MVT::f32, Custom);
+  setOperationAction(ISD::SELECT,    MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
+  setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
+  setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
+  setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
+
+  // We don't support sin/cos/fmod/copysign/pow
+  setOperationAction(ISD::FSIN,      MVT::f64, Expand);
+  setOperationAction(ISD::FSIN,      MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,      MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,      MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
+  setOperationAction(ISD::FREM,      MVT::f64, Expand);
+  setOperationAction(ISD::FREM,      MVT::f32, Expand);
+  if (!TM.Options.UseSoftFloat && Subtarget->hasVFP2() &&
+      !Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+  }
+  setOperationAction(ISD::FPOW,      MVT::f64, Expand);
+  setOperationAction(ISD::FPOW,      MVT::f32, Expand);
+
+  if (!Subtarget->hasVFP4()) {
+    setOperationAction(ISD::FMA, MVT::f64, Expand);
+    setOperationAction(ISD::FMA, MVT::f32, Expand);
+  }
+
+  // Various VFP goodness
+  if (!TM.Options.UseSoftFloat && !Subtarget->isThumb1Only()) {
+    // int <-> fp are custom expanded into bit_convert + ARMISD ops.
+    if (Subtarget->hasVFP2()) {
+      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    }
+    // Special handling for half-precision FP.
+    if (!Subtarget->hasFP16()) {
+      setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand);
+      setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
+    }
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::XOR);
+
+  if (Subtarget->hasV6Ops())
+    setTargetDAGCombine(ISD::SRL);
+
+  setStackPointerRegisterToSaveRestore(ARM::SP);
+
+  if (TM.Options.UseSoftFloat || Subtarget->isThumb1Only() ||
+      !Subtarget->hasVFP2())
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Hybrid);
+
+  //// temporary - rewrite interface to use type
+  MaxStoresPerMemset = 8;
+  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+  MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2;
+
+  // On ARM arguments smaller than 4 bytes are extended, so all arguments
+  // are at least 4 bytes aligned.
+  setMinStackArgumentAlignment(4);
+
+  // Prefer likely predicted branches to selects on out-of-order cores.
+  PredictableSelectIsExpensive = Subtarget->isLikeA9();
+
+  setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
+}
+
+// FIXME: It might make sense to define the representative register class as the
+// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
+// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
+// SPR's representative would be DPR_VFP2. This should work well if register
+// pressure tracking were modified such that a register use would increment the
+// pressure of the register class's representative and all of it's super
+// classes' representatives transitively. We have not implemented this because
+// of the difficulty prior to coalescing of modeling operand register classes
+// due to the common occurrence of cross class copies and subregister insertions
+// and extractions.
+std::pair<const TargetRegisterClass*, uint8_t>
+ARMTargetLowering::findRepresentativeClass(MVT VT) const{
+  const TargetRegisterClass *RRC = 0;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(VT);
+  // Use DPR as representative register class for all floating point
+  // and vector types. Since there are 32 SPR registers and 32 DPR registers so
+  // the cost is 1 for both f32 and f64.
+  case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
+  case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
+    RRC = &ARM::DPRRegClass;
+    // When NEON is used for SP, only half of the register file is available
+    // because operations that define both SP and DP results will be constrained
+    // to the VFP2 class (D0-D15). We currently model this constraint prior to
+    // coalescing by double-counting the SP regs. See the FIXME above.
+    if (Subtarget->useNEONForSinglePrecisionFP())
+      Cost = 2;
+    break;
+  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 2;
+    break;
+  case MVT::v4i64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 4;
+    break;
+  case MVT::v8i64:
+    RRC = &ARM::DPRRegClass;
+    Cost = 8;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
+  case ARMISD::WrapperDYN:    return "ARMISD::WrapperDYN";
+  case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
+  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
+  case ARMISD::CALL:          return "ARMISD::CALL";
+  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
+  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
+  case ARMISD::tCALL:         return "ARMISD::tCALL";
+  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
+  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
+  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
+  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
+  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
+  case ARMISD::CMP:           return "ARMISD::CMP";
+  case ARMISD::CMN:           return "ARMISD::CMN";
+  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
+  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
+  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
+  case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
+  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
+
+  case ARMISD::CMOV:          return "ARMISD::CMOV";
+
+  case ARMISD::RBIT:          return "ARMISD::RBIT";
+
+  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
+  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
+  case ARMISD::SITOF:         return "ARMISD::SITOF";
+  case ARMISD::UITOF:         return "ARMISD::UITOF";
+
+  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
+  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
+  case ARMISD::RRX:           return "ARMISD::RRX";
+
+  case ARMISD::ADDC:          return "ARMISD::ADDC";
+  case ARMISD::ADDE:          return "ARMISD::ADDE";
+  case ARMISD::SUBC:          return "ARMISD::SUBC";
+  case ARMISD::SUBE:          return "ARMISD::SUBE";
+
+  case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
+  case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
+
+  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
+  case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
+
+  case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
+
+  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+
+  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
+
+  case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
+  case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
+
+  case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
+
+  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
+  case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
+  case ARMISD::VCGE:          return "ARMISD::VCGE";
+  case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
+  case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
+  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
+  case ARMISD::VCGT:          return "ARMISD::VCGT";
+  case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
+  case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
+  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
+  case ARMISD::VTST:          return "ARMISD::VTST";
+
+  case ARMISD::VSHL:          return "ARMISD::VSHL";
+  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
+  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
+  case ARMISD::VSHLLs:        return "ARMISD::VSHLLs";
+  case ARMISD::VSHLLu:        return "ARMISD::VSHLLu";
+  case ARMISD::VSHLLi:        return "ARMISD::VSHLLi";
+  case ARMISD::VSHRN:         return "ARMISD::VSHRN";
+  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
+  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
+  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
+  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
+  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
+  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
+  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
+  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
+  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
+  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
+  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
+  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
+  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
+  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
+  case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
+  case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
+  case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
+  case ARMISD::VDUP:          return "ARMISD::VDUP";
+  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
+  case ARMISD::VEXT:          return "ARMISD::VEXT";
+  case ARMISD::VREV64:        return "ARMISD::VREV64";
+  case ARMISD::VREV32:        return "ARMISD::VREV32";
+  case ARMISD::VREV16:        return "ARMISD::VREV16";
+  case ARMISD::VZIP:          return "ARMISD::VZIP";
+  case ARMISD::VUZP:          return "ARMISD::VUZP";
+  case ARMISD::VTRN:          return "ARMISD::VTRN";
+  case ARMISD::VTBL1:         return "ARMISD::VTBL1";
+  case ARMISD::VTBL2:         return "ARMISD::VTBL2";
+  case ARMISD::VMULLs:        return "ARMISD::VMULLs";
+  case ARMISD::VMULLu:        return "ARMISD::VMULLu";
+  case ARMISD::UMLAL:         return "ARMISD::UMLAL";
+  case ARMISD::SMLAL:         return "ARMISD::SMLAL";
+  case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
+  case ARMISD::FMAX:          return "ARMISD::FMAX";
+  case ARMISD::FMIN:          return "ARMISD::FMIN";
+  case ARMISD::BFI:           return "ARMISD::BFI";
+  case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
+  case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
+  case ARMISD::VBSL:          return "ARMISD::VBSL";
+  case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
+  case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
+  case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
+  case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
+  case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
+  case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
+  case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
+  case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
+  case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
+  case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
+  case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
+  case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
+  case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
+  case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
+  case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
+  case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
+  case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
+  case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
+  case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
+  case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
+  }
+}
+
+EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {
+  if (!VT.isVector()) return getPointerTy();
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// getRegClassFor - Return the register class that should be used for the
+/// specified value type.
+const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
+  // Map v4i64 to QQ registers but do not make the type legal. Similarly map
+  // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
+  // load / store 4 to 8 consecutive D registers.
+  if (Subtarget->hasNEON()) {
+    if (VT == MVT::v4i64)
+      return &ARM::QQPRRegClass;
+    if (VT == MVT::v8i64)
+      return &ARM::QQQQPRRegClass;
+  }
+  return TargetLowering::getRegClassFor(VT);
+}
+
+// Create a fast isel object.
+FastISel *
+ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  return ARM::createFastISel(funcInfo, libInfo);
+}
+
+/// getMaximalGlobalOffset - Returns the maximal possible offset which can
+/// be used for loads / stores from the global.
+unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
+  return (Subtarget->isThumb1Only() ? 127 : 4095);
+}
+
+Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
+  unsigned NumVals = N->getNumValues();
+  if (!NumVals)
+    return Sched::RegPressure;
+
+  for (unsigned i = 0; i != NumVals; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT == MVT::Glue || VT == MVT::Other)
+      continue;
+    if (VT.isFloatingPoint() || VT.isVector())
+      return Sched::ILP;
+  }
+
+  if (!N->isMachineOpcode())
+    return Sched::RegPressure;
+
+  // Load are scheduled for latency even if there instruction itinerary
+  // is not available.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+
+  if (MCID.getNumDefs() == 0)
+    return Sched::RegPressure;
+  if (!Itins->isEmpty() &&
+      Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
+    return Sched::ILP;
+
+  return Sched::RegPressure;
+}
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
+static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code!");
+  case ISD::SETNE:  return ARMCC::NE;
+  case ISD::SETEQ:  return ARMCC::EQ;
+  case ISD::SETGT:  return ARMCC::GT;
+  case ISD::SETGE:  return ARMCC::GE;
+  case ISD::SETLT:  return ARMCC::LT;
+  case ISD::SETLE:  return ARMCC::LE;
+  case ISD::SETUGT: return ARMCC::HI;
+  case ISD::SETUGE: return ARMCC::HS;
+  case ISD::SETULT: return ARMCC::LO;
+  case ISD::SETULE: return ARMCC::LS;
+  }
+}
+
+/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
+static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                        ARMCC::CondCodes &CondCode2) {
+  CondCode2 = ARMCC::AL;
+  switch (CC) {
+  default: llvm_unreachable("Unknown FP condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+  case ISD::SETGT:
+  case ISD::SETOGT: CondCode = ARMCC::GT; break;
+  case ISD::SETGE:
+  case ISD::SETOGE: CondCode = ARMCC::GE; break;
+  case ISD::SETOLT: CondCode = ARMCC::MI; break;
+  case ISD::SETOLE: CondCode = ARMCC::LS; break;
+  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
+  case ISD::SETO:   CondCode = ARMCC::VC; break;
+  case ISD::SETUO:  CondCode = ARMCC::VS; break;
+  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
+  case ISD::SETUGT: CondCode = ARMCC::HI; break;
+  case ISD::SETUGE: CondCode = ARMCC::PL; break;
+  case ISD::SETLT:
+  case ISD::SETULT: CondCode = ARMCC::LT; break;
+  case ISD::SETLE:
+  case ISD::SETULE: CondCode = ARMCC::LE; break;
+  case ISD::SETNE:
+  case ISD::SETUNE: CondCode = ARMCC::NE; break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "ARMGenCallingConv.inc"
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for a the
+/// given CallingConvention value.
+CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
+                                                 bool Return,
+                                                 bool isVarArg) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+    if (Subtarget->hasVFP2() && !isVarArg) {
+      if (!Subtarget->isAAPCS_ABI())
+        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
+      // For AAPCS ABI targets, just use VFP variant of the calling convention.
+      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+    }
+    // Fallthrough
+  case CallingConv::C: {
+    // Use target triple & subtarget features to do actual dispatch.
+    if (!Subtarget->isAAPCS_ABI())
+      return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+    else if (Subtarget->hasVFP2() &&
+             getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
+             !isVarArg)
+      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+  }
+  case CallingConv::ARM_AAPCS_VFP:
+    if (!isVarArg)
+      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+    // Fallthrough
+  case CallingConv::ARM_AAPCS:
+    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+  case CallingConv::ARM_APCS:
+    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+  case CallingConv::GHC:
+    return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
+  }
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   DebugLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
+  CCInfo.AnalyzeCallResult(Ins,
+                           CCAssignFnForNode(CallConv, /* Return*/ true,
+                                             isVarArg));
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign VA = RVLocs[i];
+
+    SDValue Val;
+    if (VA.needsCustom()) {
+      // Handle f64 or half of a v2f64.
+      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+                                      InFlag);
+      Chain = Lo.getValue(1);
+      InFlag = Lo.getValue(2);
+      VA = RVLocs[++i]; // skip ahead to next loc
+      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+                                      InFlag);
+      Chain = Hi.getValue(1);
+      InFlag = Hi.getValue(2);
+      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+
+      if (VA.getLocVT() == MVT::v2f64) {
+        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(0, MVT::i32));
+
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Lo.getValue(1);
+        InFlag = Lo.getValue(2);
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Hi.getValue(1);
+        InFlag = Hi.getValue(2);
+        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(1, MVT::i32));
+      }
+    } else {
+      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
+                               InFlag);
+      Chain = Val.getValue(1);
+      InFlag = Val.getValue(2);
+    }
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack.
+SDValue
+ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
+                                    SDValue StackPtr, SDValue Arg,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    ISD::ArgFlagsTy Flags) const {
+  unsigned LocMemOffset = VA.getLocMemOffset();
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+  return DAG.getStore(Chain, dl, Arg, PtrOff,
+                      MachinePointerInfo::getStack(LocMemOffset),
+                      false, false, 0);
+}
+
+void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
+                                         SDValue Chain, SDValue &Arg,
+                                         RegsToPassVector &RegsToPass,
+                                         CCValAssign &VA, CCValAssign &NextVA,
+                                         SDValue &StackPtr,
+                                         SmallVector<SDValue, 8> &MemOpChains,
+                                         ISD::ArgFlagsTy Flags) const {
+
+  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
+  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
+
+  if (NextVA.isRegLoc())
+    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
+  else {
+    assert(NextVA.isMemLoc());
+    if (StackPtr.getNode() == 0)
+      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1),
+                                           dl, DAG, NextVA,
+                                           Flags));
+  }
+}
+
+/// LowerCall - Lowering a call into a callseq_start <-
+/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
+/// nodes.
+SDValue
+ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool doesNotRet                       = CLI.DoesNotReturn;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+  bool IsSibCall = false;
+  // Disable tail calls if they're not supported.
+  if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
+    isTailCall = false;
+  if (isTailCall) {
+    // Check if it's really possible to do a tail call.
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
+                                                   Outs, OutVals, Ins, DAG);
+    // We don't support GuaranteedTailCallOpt for ARM, only automatically
+    // detected sibcalls.
+    if (isTailCall) {
+      ++NumTailCalls;
+      IsSibCall = true;
+    }
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
+  CCInfo.AnalyzeCallOperands(Outs,
+                             CCAssignFnForNode(CallConv, /* Return*/ false,
+                                               isVarArg));
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  // For tail calls, memory operands are available in our caller's stack.
+  if (IsSibCall)
+    NumBytes = 0;
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  if (!IsSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+  RegsToPassVector RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.  In the case
+  // of tail call optimization, arguments are handled later.
+  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+       i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[realArgIdx];
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+    bool isByVal = Flags.isByVal();
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
+    if (VA.needsCustom()) {
+      if (VA.getLocVT() == MVT::v2f64) {
+        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(0, MVT::i32));
+        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(1, MVT::i32));
+
+        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
+                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+
+        VA = ArgLocs[++i]; // skip ahead to next loc
+        if (VA.isRegLoc()) {
+          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
+                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+        } else {
+          assert(VA.isMemLoc());
+
+          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
+                                                 dl, DAG, VA, Flags));
+        }
+      } else {
+        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+                         StackPtr, MemOpChains, Flags);
+      }
+    } else if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else if (isByVal) {
+      assert(VA.isMemLoc());
+      unsigned offset = 0;
+
+      // True if this byval aggregate will be split between registers
+      // and memory.
+      if (CCInfo.isFirstByValRegValid()) {
+        EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+        unsigned int i, j;
+        for (i = 0, j = CCInfo.getFirstByValReg(); j < ARM::R4; i++, j++) {
+          SDValue Const = DAG.getConstant(4*i, MVT::i32);
+          SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(j, Load));
+        }
+        offset = ARM::R4 - CCInfo.getFirstByValReg();
+        CCInfo.clearFirstByValReg();
+      }
+
+      if (Flags.getByValSize() - 4*offset > 0) {
+        unsigned LocMemOffset = VA.getLocMemOffset();
+        SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset);
+        SDValue Dst = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+                                  StkPtrOff);
+        SDValue SrcOffset = DAG.getIntPtrConstant(4*offset);
+        SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
+        SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
+                                           MVT::i32);
+        SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), MVT::i32);
+
+        SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
+        MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
+                                          Ops, array_lengthof(Ops)));
+      }
+    } else if (!IsSibCall) {
+      assert(VA.isMemLoc());
+
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  // Tail call byval lowering might overwrite argument registers so in case of
+  // tail call optimization the copies to registers are lowered later.
+  if (!isTailCall)
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+
+  // For tail calls lower the arguments to the 'real' stack slot.
+  if (isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+    InFlag = SDValue();
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+    InFlag =SDValue();
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  bool isDirect = false;
+  bool isARMFunc = false;
+  bool isLocalARMFunc = false;
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  if (EnableARMLongCalls) {
+    assert (getTargetMachine().getRelocationModel() == Reloc::Static
+            && "long-calls with non-static relocation model!");
+    // Handle a global address or an external symbol. If it's not one of
+    // those, the target's already in a register, so we don't need to do
+    // anything extra.
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      const GlobalValue *GV = G->getGlobal();
+      // Create a constant pool entry for the callee address
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
+
+      // Get the address of the callee into a register
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), dl,
+                           DAG.getEntryNode(), CPAddr,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+    } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
+      const char *Sym = S->getSymbol();
+
+      // Create a constant pool entry for the callee address
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+                                      ARMPCLabelIndex, 0);
+      // Get the address of the callee into a register
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), dl,
+                           DAG.getEntryNode(), CPAddr,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+    }
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    isDirect = true;
+    bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
+    bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
+                   getTargetMachine().getRelocationModel() != Reloc::Static;
+    isARMFunc = !Subtarget->isThumb() || isStub;
+    // ARM call to a local ARM function is predicable.
+    isLocalARMFunc = !Subtarget->isThumb() && (!isExt || !ARMInterworking);
+    // tBX takes a register source operand.
+    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4);
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), dl,
+                           DAG.getEntryNode(), CPAddr,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
+                           getPointerTy(), Callee, PICLabel);
+    } else {
+      // On ELF targets for PIC code, direct calls should go through the PLT
+      unsigned OpFlags = 0;
+      if (Subtarget->isTargetELF() &&
+          getTargetMachine().getRelocationModel() == Reloc::PIC_)
+        OpFlags = ARMII::MO_PLT;
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+    }
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    isDirect = true;
+    bool isStub = Subtarget->isTargetDarwin() &&
+                  getTargetMachine().getRelocationModel() != Reloc::Static;
+    isARMFunc = !Subtarget->isThumb() || isStub;
+    // tBX takes a register source operand.
+    const char *Sym = S->getSymbol();
+    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+                                      ARMPCLabelIndex, 4);
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), dl,
+                           DAG.getEntryNode(), CPAddr,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, 0);
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
+                           getPointerTy(), Callee, PICLabel);
+    } else {
+      unsigned OpFlags = 0;
+      // On ELF targets for PIC code, direct calls should go through the PLT
+      if (Subtarget->isTargetELF() &&
+                  getTargetMachine().getRelocationModel() == Reloc::PIC_)
+        OpFlags = ARMII::MO_PLT;
+      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlags);
+    }
+  }
+
+  // FIXME: handle tail calls differently.
+  unsigned CallOpc;
+  bool HasMinSizeAttr = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+  if (Subtarget->isThumb()) {
+    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
+      CallOpc = ARMISD::CALL_NOLINK;
+    else
+      CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
+  } else {
+    if (!isDirect && !Subtarget->hasV5TOps())
+      CallOpc = ARMISD::CALL_NOLINK;
+    else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+               // Emit regular call when code size is the priority
+               !HasMinSizeAttr)
+      // "mov lr, pc; b _foo" to avoid confusing the RSP
+      CallOpc = ARMISD::CALL_NOLINK;
+    else
+      CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
+  }
+
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  if (isTailCall)
+    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+
+  // Returns a chain and a flag for retval copy to use.
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins,
+                         dl, DAG, InVals);
+}
+
+/// HandleByVal - Every parameter *after* a byval parameter is passed
+/// on the stack.  Remember the next parameter register to allocate,
+/// and then confiscate the rest of the parameter registers to insure
+/// this.
+void
+ARMTargetLowering::HandleByVal(
+    CCState *State, unsigned &size, unsigned Align) const {
+  unsigned reg = State->AllocateReg(GPRArgRegs, 4);
+  assert((State->getCallOrPrologue() == Prologue ||
+          State->getCallOrPrologue() == Call) &&
+         "unhandled ParmContext");
+  if ((!State->isFirstByValRegValid()) &&
+      (ARM::R0 <= reg) && (reg <= ARM::R3)) {
+    if (Subtarget->isAAPCS_ABI() && Align > 4) {
+      unsigned AlignInRegs = Align / 4;
+      unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
+      for (unsigned i = 0; i < Waste; ++i)
+        reg = State->AllocateReg(GPRArgRegs, 4);
+    }
+    if (reg != 0) {
+      State->setFirstByValReg(reg);
+      // At a call site, a byval parameter that is split between
+      // registers and memory needs its size truncated here.  In a
+      // function prologue, such byval parameters are reassembled in
+      // memory, and are not truncated.
+      if (State->getCallOrPrologue() == Call) {
+        unsigned excess = 4 * (ARM::R4 - reg);
+        assert(size >= excess && "expected larger existing stack allocation");
+        size -= excess;
+      }
+    }
+  }
+  // Confiscate any remaining parameter registers to preclude their
+  // assignment to subsequent parameters.
+  while (State->AllocateReg(GPRArgRegs, 4))
+    ;
+}
+
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+                         const TargetInstrInfo *TII) {
+  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(VR))
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(Def, FI))
+        return false;
+    } else {
+      return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI->isFixedObjectIndex(FI))
+    return false;
+  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                                     bool isCalleeStructRet,
+                                                     bool isCallerStructRet,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  const Function *CallerF = DAG.getMachineFunction().getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+  // Do not sibcall optimize vararg calls unless the call site is not passing
+  // any arguments.
+  if (isVarArg && !Outs.empty())
+    return false;
+
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // FIXME: Completely disable sibcall for Thumb1 since Thumb1RegisterInfo::
+  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
+  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
+  // support in the assembler and linker to be used. This would need to be
+  // fixed to fully support tail calls in Thumb1.
+  //
+  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
+  // LR.  This means if we need to reload LR, it takes an extra instructions,
+  // which outweighs the value of the tail call; but here we don't know yet
+  // whether LR is going to be used.  Probably the right approach is to
+  // generate the tail call here and turn it back into CALL/RET in
+  // emitEpilogue if LR is used.
+
+  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
+  // but we need to make sure there are enough registers; the only valid
+  // registers are the 4 used for parameters.  We don't currently do this
+  // case.
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  // If the calling conventions do not match, then we'd better make sure the
+  // results are returned in the same way as what the caller expects.
+  if (!CCMatch) {
+    SmallVector<CCValAssign, 16> RVLocs1;
+    ARMCCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+                       getTargetMachine(), RVLocs1, *DAG.getContext(), Call);
+    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC, true, isVarArg));
+
+    SmallVector<CCValAssign, 16> RVLocs2;
+    ARMCCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+                       getTargetMachine(), RVLocs2, *DAG.getContext(), Call);
+    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC, true, isVarArg));
+
+    if (RVLocs1.size() != RVLocs2.size())
+      return false;
+    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+        return false;
+      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+        return false;
+      if (RVLocs1[i].isRegLoc()) {
+        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+          return false;
+      } else {
+        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+          return false;
+      }
+    }
+  }
+
+  // If Caller's vararg or byval argument has been split between registers and
+  // stack, do not perform tail call, since part of the argument is in caller's
+  // local frame.
+  const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
+                                      getInfo<ARMFunctionInfo>();
+  if (AFI_Caller->getVarArgsRegSaveSize())
+    return false;
+
+  // If the callee takes no arguments then go on to check the results of the
+  // call.
+  if (!Outs.empty()) {
+    // Check if stack adjustment is needed. For now, do not do this if any
+    // argument is passed on the stack.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    ARMCCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                      getTargetMachine(), ArgLocs, *DAG.getContext(), Call);
+    CCInfo.AnalyzeCallOperands(Outs,
+                               CCAssignFnForNode(CalleeCC, false, isVarArg));
+    if (CCInfo.getNextStackOffset()) {
+      MachineFunction &MF = DAG.getMachineFunction();
+
+      // Check if the arguments are already laid out in the right way as
+      // the caller's fixed stack objects.
+      MachineFrameInfo *MFI = MF.getFrameInfo();
+      const MachineRegisterInfo *MRI = &MF.getRegInfo();
+      const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+           i != e;
+           ++i, ++realArgIdx) {
+        CCValAssign &VA = ArgLocs[i];
+        EVT RegVT = VA.getLocVT();
+        SDValue Arg = OutVals[realArgIdx];
+        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+        if (VA.getLocInfo() == CCValAssign::Indirect)
+          return false;
+        if (VA.needsCustom()) {
+          // f64 and vector types are split into multiple registers or
+          // register/stack-slot combinations.  The types will not match
+          // the registers; give up on memory f64 refs until we figure
+          // out what to do about this.
+          if (!VA.isRegLoc())
+            return false;
+          if (!ArgLocs[++i].isRegLoc())
+            return false;
+          if (RegVT == MVT::v2f64) {
+            if (!ArgLocs[++i].isRegLoc())
+              return false;
+            if (!ArgLocs[++i].isRegLoc())
+              return false;
+          }
+        } else if (!VA.isRegLoc()) {
+          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                   MFI, MRI, TII))
+            return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool
+ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, CCAssignFnForNode(CallConv, /*Return=*/true,
+                                                    isVarArg));
+}
+
+SDValue
+ARMTargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               DebugLoc dl, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of the return value to a location.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slots.
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext(), Call);
+
+  // Analyze outgoing return values.
+  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
+                                               isVarArg));
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0, realRVLocIdx = 0;
+       i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = OutVals[realRVLocIdx];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.needsCustom()) {
+      if (VA.getLocVT() == MVT::v2f64) {
+        // Extract the first half and return it in two registers.
+        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                   DAG.getConstant(0, MVT::i32));
+        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
+                                       DAG.getVTList(MVT::i32, MVT::i32), Half);
+
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
+        Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                                 HalfGPRs.getValue(1), Flag);
+        Flag = Chain.getValue(1);
+        RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+        VA = RVLocs[++i]; // skip ahead to next loc
+
+        // Extract the 2nd half and fall through to handle it as an f64 value.
+        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                          DAG.getConstant(1, MVT::i32));
+      }
+      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
+      // available.
+      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+                                  DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
+      Flag = Chain.getValue(1);
+      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+      VA = RVLocs[++i]; // skip ahead to next loc
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
+                               Flag);
+    } else
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+
+    // Guarantee that all emitted copies are
+    // stuck together, avoiding something bad.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // Update chain and glue.
+  RetOps[0] = Chain;
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
+                     RetOps.data(), RetOps.size());
+}
+
+bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+  if (N->getNumValues() != 1)
+    return false;
+  if (!N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
+    SDNode *VMov = Copy;
+    // f64 returned in a pair of GPRs.
+    SmallPtrSet<SDNode*, 2> Copies;
+    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
+         UI != UE; ++UI) {
+      if (UI->getOpcode() != ISD::CopyToReg)
+        return false;
+      Copies.insert(*UI);
+    }
+    if (Copies.size() > 2)
+      return false;
+
+    for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end();
+         UI != UE; ++UI) {
+      SDValue UseChain = UI->getOperand(0);
+      if (Copies.count(UseChain.getNode()))
+        // Second CopyToReg
+        Copy = *UI;
+      else
+        // First CopyToReg
+        TCChain = UseChain;
+    }
+  } else if (Copy->getOpcode() == ISD::BITCAST) {
+    // f32 returned in a single GPR.
+    if (!Copy->hasOneUse())
+      return false;
+    Copy = *Copy->use_begin();
+    if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
+      return false;
+    Chain = Copy->getOperand(0);
+  } else {
+    return false;
+  }
+
+  bool HasRet = false;
+  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() != ARMISD::RET_FLAG)
+      return false;
+    HasRet = true;
+  }
+
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
+    return false;
+
+  if (!CI->isTailCall())
+    return false;
+
+  return !Subtarget->isThumb1Only();
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing mode. These wrapped nodes will be selected
+// into MOVi.
+static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+  EVT PtrVT = Op.getValueType();
+  // FIXME there is no actual debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  SDValue Res;
+  if (CP->isMachineConstantPoolEntry())
+    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                    CP->getAlignment());
+  else
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                    CP->getAlignment());
+  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
+}
+
+unsigned ARMTargetLowering::getJumpTableEncoding() const {
+  return MachineJumpTableInfo::EK_Inline;
+}
+
+SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = 0;
+  DebugLoc DL = Op.getDebugLoc();
+  EVT PtrVT = getPointerTy();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+  SDValue CPAddr;
+  if (RelocM == Reloc::Static) {
+    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
+  } else {
+    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMPCLabelIndex = AFI->createPICLabelUId();
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
+                                      ARMCP::CPBlockAddress, PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  }
+  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
+  SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
+                               MachinePointerInfo::getConstantPool(),
+                               false, false, false, 0);
+  if (RelocM == Reloc::Static)
+    return Result;
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model
+SDValue
+ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                                 SelectionDAG &DAG) const {
+  DebugLoc dl = GA->getDebugLoc();
+  EVT PtrVT = getPointerTy();
+  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+                                    ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
+  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
+  Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
+                         MachinePointerInfo::getConstantPool(),
+                         false, false, false, 0);
+  SDValue Chain = Argument.getValue(1);
+
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
+
+  // call __tls_get_addr.
+  ArgListTy Args;
+  ArgListEntry Entry;
+  Entry.Node = Argument;
+  Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
+  Args.push_back(Entry);
+  // FIXME: is there useful debug info available here?
+  TargetLowering::CallLoweringInfo CLI(Chain,
+                (Type *) Type::getInt32Ty(*DAG.getContext()),
+                false, false, false, false,
+                0, CallingConv::C, /*isTailCall=*/false,
+                /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
+                DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.first;
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or
+// "local exec" model.
+SDValue
+ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                        SelectionDAG &DAG,
+                                        TLSModel::Model model) const {
+  const GlobalValue *GV = GA->getGlobal();
+  DebugLoc dl = GA->getDebugLoc();
+  SDValue Offset;
+  SDValue Chain = DAG.getEntryNode();
+  EVT PtrVT = getPointerTy();
+  // Get the Thread Pointer
+  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+
+  if (model == TLSModel::InitialExec) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+    // Initial exec model.
+    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+                                      ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
+                                      true);
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
+                         MachinePointerInfo::getConstantPool(),
+                         false, false, false, 0);
+    Chain = Offset.getValue(1);
+
+    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
+
+    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
+                         MachinePointerInfo::getConstantPool(),
+                         false, false, false, 0);
+  } else {
+    // local exec model
+    assert(model == TLSModel::LocalExec);
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
+                         MachinePointerInfo::getConstantPool(),
+                         false, false, false, 0);
+  }
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+  // TODO: implement the "local dynamic" model
+  assert(Subtarget->isTargetELF() &&
+         "TLS not implemented for non-ELF targets");
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
+
+  switch (model) {
+    case TLSModel::GeneralDynamic:
+    case TLSModel::LocalDynamic:
+      return LowerToTLSGeneralDynamicModel(GA, DAG);
+    case TLSModel::InitialExec:
+    case TLSModel::LocalExec:
+      return LowerToTLSExecModels(GA, DAG, model);
+  }
+  llvm_unreachable("bogus TLS model");
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy();
+  DebugLoc dl = Op.getDebugLoc();
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GV,
+                                      UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
+    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                                 CPAddr,
+                                 MachinePointerInfo::getConstantPool(),
+                                 false, false, false, 0);
+    SDValue Chain = Result.getValue(1);
+    SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
+    if (!UseGOTOFF)
+      Result = DAG.getLoad(PtrVT, dl, Chain, Result,
+                           MachinePointerInfo::getGOT(),
+                           false, false, false, 0);
+    return Result;
+  }
+
+  // If we have T2 ops, we can materialize the address directly via movt/movw
+  // pair. This is always cheaper.
+  if (Subtarget->useMovt()) {
+    ++NumMovwMovt;
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes.
+    return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+                       DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+  } else {
+    SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                       MachinePointerInfo::getConstantPool(),
+                       false, false, false, 0);
+  }
+}
+
+SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  EVT PtrVT = getPointerTy();
+  DebugLoc dl = Op.getDebugLoc();
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+
+  // FIXME: Enable this for static codegen when tool issues are fixed.  Also
+  // update ARMFastISel::ARMMaterializeGV.
+  if (Subtarget->useMovt() && RelocM != Reloc::Static) {
+    ++NumMovwMovt;
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes.
+    if (RelocM == Reloc::Static)
+      return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+                                 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+
+    unsigned Wrapper = (RelocM == Reloc::PIC_)
+      ? ARMISD::WrapperPIC : ARMISD::WrapperDYN;
+    SDValue Result = DAG.getNode(Wrapper, dl, PtrVT,
+                                 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
+    if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
+      Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
+                           MachinePointerInfo::getGOT(),
+                           false, false, false, 0);
+    return Result;
+  }
+
+  unsigned ARMPCLabelIndex = 0;
+  SDValue CPAddr;
+  if (RelocM == Reloc::Static) {
+    CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+  } else {
+    ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+    ARMPCLabelIndex = AFI->createPICLabelUId();
+    unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue,
+                                      PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  }
+  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+
+  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                               MachinePointerInfo::getConstantPool(),
+                               false, false, false, 0);
+  SDValue Chain = Result.getValue(1);
+
+  if (RelocM == Reloc::PIC_) {
+    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+    Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+  }
+
+  if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
+    Result = DAG.getLoad(PtrVT, dl, Chain, Result, MachinePointerInfo::getGOT(),
+                         false, false, false, 0);
+
+  return Result;
+}
+
+SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetELF() &&
+         "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+  EVT PtrVT = getPointerTy();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
+                                  ARMPCLabelIndex, PCAdj);
+  SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                               MachinePointerInfo::getConstantPool(),
+                               false, false, false, 0);
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+}
+
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Val = DAG.getConstant(0, MVT::i32);
+  return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
+                     DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
+                     Op.getOperand(1), Val);
+}
+
+SDValue
+ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(0, MVT::i32));
+}
+
+SDValue
+ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+                                          const ARMSubtarget *Subtarget) const {
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  DebugLoc dl = Op.getDebugLoc();
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+  case Intrinsic::arm_thread_pointer: {
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+  }
+  case Intrinsic::eh_sjlj_lsda: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+    EVT PtrVT = getPointerTy();
+    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+    SDValue CPAddr;
+    unsigned PCAdj = (RelocM != Reloc::PIC_)
+      ? 0 : (Subtarget->isThumb() ? 4 : 8);
+    ARMConstantPoolValue *CPV =
+      ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
+                                      ARMCP::CPLSDA, PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue Result =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                  MachinePointerInfo::getConstantPool(),
+                  false, false, false, 0);
+
+    if (RelocM == Reloc::PIC_) {
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+    }
+    return Result;
+  }
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
+      ? ARMISD::VMULLs : ARMISD::VMULLu;
+    return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+  }
+}
+
+static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
+                               const ARMSubtarget *Subtarget) {
+  DebugLoc dl = Op.getDebugLoc();
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
+           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
+    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  SDValue Op5 = Op.getOperand(5);
+  bool isDeviceBarrier = cast<ConstantSDNode>(Op5)->getZExtValue() != 0;
+  unsigned isLL = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  unsigned isLS = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  bool isOnlyStoreBarrier = (isLL == 0 && isLS == 0);
+
+  ARM_MB::MemBOpt DMBOpt;
+  if (isDeviceBarrier)
+    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ST : ARM_MB::SY;
+  else
+    DMBOpt = isOnlyStoreBarrier ? ARM_MB::ISHST : ARM_MB::ISH;
+  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(DMBOpt, MVT::i32));
+}
+
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
+                                 const ARMSubtarget *Subtarget) {
+  // FIXME: handle "fence singlethread" more efficiently.
+  DebugLoc dl = Op.getDebugLoc();
+  if (!Subtarget->hasDataBarrier()) {
+    // Some ARMv6 cpus can support data barriers with an mcr instruction.
+    // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+    // here.
+    assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
+           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
+    return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(ARM_MB::ISH, MVT::i32));
+}
+
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
+                             const ARMSubtarget *Subtarget) {
+  // ARM pre v5TE and Thumb1 does not have preload instructions.
+  if (!(Subtarget->isThumb2() ||
+        (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
+  if (!isRead &&
+      (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
+    // ARMv7 with MP extension has PLDW.
+    return Op.getOperand(0);
+
+  unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (Subtarget->isThumb()) {
+    // Invert the bits.
+    isRead = ~isRead & 1;
+    isData = ~isData & 1;
+  }
+
+  return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
+                     Op.getOperand(1), DAG.getConstant(isRead, MVT::i32),
+                     DAG.getConstant(isData, MVT::i32));
+}
+
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  DebugLoc dl = Op.getDebugLoc();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
+}
+
+SDValue
+ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+                                        SDValue &Root, SelectionDAG &DAG,
+                                        DebugLoc dl) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  const TargetRegisterClass *RC;
+  if (AFI->isThumb1OnlyFunction())
+    RC = &ARM::tGPRRegClass;
+  else
+    RC = &ARM::GPRRegClass;
+
+  // Transform the arguments stored in physical registers into virtual ones.
+  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+
+  SDValue ArgValue2;
+  if (NextVA.isMemLoc()) {
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    int FI = MFI->CreateFixedObject(4, NextVA.getLocMemOffset(), true);
+
+    // Create load node to retrieve arguments from the stack.
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+    ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
+                            MachinePointerInfo::getFixedStack(FI),
+                            false, false, false, 0);
+  } else {
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+  }
+
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
+}
+
+void
+ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
+                                  unsigned &VARegSize, unsigned &VARegSaveSize)
+  const {
+  unsigned NumGPRs;
+  if (CCInfo.isFirstByValRegValid())
+    NumGPRs = ARM::R4 - CCInfo.getFirstByValReg();
+  else {
+    unsigned int firstUnalloced;
+    firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs,
+                                                sizeof(GPRArgRegs) /
+                                                sizeof(GPRArgRegs[0]));
+    NumGPRs = (firstUnalloced <= 3) ? (4 - firstUnalloced) : 0;
+  }
+
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  VARegSize = NumGPRs * 4;
+  VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
+}
+
+// The remaining GPRs hold either the beginning of variable-argument
+// data, or the beginning of an aggregate passed by value (usually
+// byval).  Either way, we allocate stack slots adjacent to the data
+// provided by our caller, and store the unallocated registers there.
+// If this is a variadic function, the va_list pointer will begin with
+// these values; otherwise, this reassembles a (byval) structure that
+// was split between registers and memory.
+void
+ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                                        DebugLoc dl, SDValue &Chain,
+                                        const Value *OrigArg,
+                                        unsigned OffsetFromOrigArg,
+                                        unsigned ArgOffset,
+                                        bool ForceMutable) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned firstRegToSaveIndex;
+  if (CCInfo.isFirstByValRegValid())
+    firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0;
+  else {
+    firstRegToSaveIndex = CCInfo.getFirstUnallocated
+      (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
+  }
+
+  unsigned VARegSize, VARegSaveSize;
+  computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
+  if (VARegSaveSize) {
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by deferencing
+    // the result of va_next.
+    AFI->setVarArgsRegSaveSize(VARegSaveSize);
+    AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(VARegSaveSize,
+                                                     ArgOffset + VARegSaveSize
+                                                     - VARegSize,
+                                                     false));
+    SDValue FIN = DAG.getFrameIndex(AFI->getVarArgsFrameIndex(),
+                                    getPointerTy());
+
+    SmallVector<SDValue, 4> MemOps;
+    for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) {
+      const TargetRegisterClass *RC;
+      if (AFI->isThumb1OnlyFunction())
+        RC = &ARM::tGPRRegClass;
+      else
+        RC = &ARM::GPRRegClass;
+
+      unsigned VReg = MF.addLiveIn(GPRArgRegs[firstRegToSaveIndex], RC);
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+      SDValue Store =
+        DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                     MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i),
+                     false, false, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+                        DAG.getConstant(4, getPointerTy()));
+    }
+    if (!MemOps.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                          &MemOps[0], MemOps.size());
+  } else
+    // This will point to the next argument passed via stack.
+    AFI->setVarArgsFrameIndex(
+        MFI->CreateFixedObject(4, ArgOffset, !ForceMutable));
+}
+
+SDValue
+ARMTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), ArgLocs, *DAG.getContext(), Prologue);
+  CCInfo.AnalyzeFormalArguments(Ins,
+                                CCAssignFnForNode(CallConv, /* Return*/ false,
+                                                  isVarArg));
+
+  SmallVector<SDValue, 16> ArgValues;
+  int lastInsIndex = -1;
+  SDValue ArgValue;
+  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
+    // Arguments stored in registers.
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+
+      if (VA.needsCustom()) {
+        // f64 and vector types are split up into multiple registers or
+        // combinations of registers and stack slots.
+        if (VA.getLocVT() == MVT::v2f64) {
+          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                                   Chain, DAG, dl);
+          VA = ArgLocs[++i]; // skip ahead to next loc
+          SDValue ArgValue2;
+          if (VA.isMemLoc()) {
+            int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true);
+            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+            ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
+                                    MachinePointerInfo::getFixedStack(FI),
+                                    false, false, false, 0);
+          } else {
+            ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                             Chain, DAG, dl);
+          }
+          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
+        } else
+          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+
+      } else {
+        const TargetRegisterClass *RC;
+
+        if (RegVT == MVT::f32)
+          RC = &ARM::SPRRegClass;
+        else if (RegVT == MVT::f64)
+          RC = &ARM::DPRRegClass;
+        else if (RegVT == MVT::v2f64)
+          RC = &ARM::QPRRegClass;
+        else if (RegVT == MVT::i32)
+          RC = AFI->isThumb1OnlyFunction() ?
+            (const TargetRegisterClass*)&ARM::tGPRRegClass :
+            (const TargetRegisterClass*)&ARM::GPRRegClass;
+        else
+          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+        // Transform the arguments in physical registers into virtual ones.
+        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+      }
+
+      // If this is an 8 or 16-bit value, it is really passed promoted
+      // to 32 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::BCvt:
+        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::SExt:
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::ZExt:
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      }
+
+      InVals.push_back(ArgValue);
+
+    } else { // VA.isRegLoc()
+
+      // sanity check
+      assert(VA.isMemLoc());
+      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
+
+      int index = ArgLocs[i].getValNo();
+
+      // Some Ins[] entries become multiple ArgLoc[] entries.
+      // Process them only once.
+      if (index != lastInsIndex)
+        {
+          ISD::ArgFlagsTy Flags = Ins[index].Flags;
+          // FIXME: For now, all byval parameter objects are marked mutable.
+          // This can be changed with more analysis.
+          // In case of tail call optimization mark all arguments mutable.
+          // Since they could be overwritten by lowering of arguments in case of
+          // a tail call.
+          if (Flags.isByVal()) {
+            ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+            if (!AFI->getVarArgsFrameIndex()) {
+              VarArgStyleRegisters(CCInfo, DAG,
+                                   dl, Chain, CurOrigArg,
+                                   Ins[VA.getValNo()].PartOffset,
+                                   VA.getLocMemOffset(),
+                                   true /*force mutable frames*/);
+              int VAFrameIndex = AFI->getVarArgsFrameIndex();
+              InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy()));
+            } else {
+              int FI = MFI->CreateFixedObject(Flags.getByValSize(),
+                                              VA.getLocMemOffset(), false);
+              InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
+            }
+          } else {
+            int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
+                                            VA.getLocMemOffset(), true);
+
+            // Create load nodes to retrieve arguments from the stack.
+            SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+            InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+                                         MachinePointerInfo::getFixedStack(FI),
+                                         false, false, false, 0));
+          }
+          lastInsIndex = index;
+        }
+    }
+  }
+
+  // varargs
+  if (isVarArg)
+    VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0, 0,
+                         CCInfo.getNextStackOffset());
+
+  return Chain;
+}
+
+/// isFloatingPointZero - Return true if this is +0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isPosZero();
+  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
+      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
+      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
+        if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+          return CFP->getValueAPF().isPosZero();
+    }
+  }
+  return false;
+}
+
+/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
+/// the given operands.
+SDValue
+ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                             SDValue &ARMcc, SelectionDAG &DAG,
+                             DebugLoc dl) const {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    unsigned C = RHSC->getZExtValue();
+    if (!isLegalICmpImmediate(C)) {
+      // Constant does not fit, try adjusting it by one?
+      switch (CC) {
+      default: break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          RHS = DAG.getConstant(C-1, MVT::i32);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if (C != 0 && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          RHS = DAG.getConstant(C-1, MVT::i32);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          RHS = DAG.getConstant(C+1, MVT::i32);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          RHS = DAG.getConstant(C+1, MVT::i32);
+        }
+        break;
+      }
+    }
+  }
+
+  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+  ARMISD::NodeType CompareType;
+  switch (CondCode) {
+  default:
+    CompareType = ARMISD::CMP;
+    break;
+  case ARMCC::EQ:
+  case ARMCC::NE:
+    // Uses only Z Flag
+    CompareType = ARMISD::CMPZ;
+    break;
+  }
+  ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
+}
+
+/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
+SDValue
+ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+                             DebugLoc dl) const {
+  SDValue Cmp;
+  if (!isFloatingPointZero(RHS))
+    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
+  else
+    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
+  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
+}
+
+/// duplicateCmp - Glue values can have only one use, so this function
+/// duplicates a comparison node.
+SDValue
+ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
+  unsigned Opc = Cmp.getOpcode();
+  DebugLoc DL = Cmp.getDebugLoc();
+  if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
+    return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+
+  assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
+  Cmp = Cmp.getOperand(0);
+  Opc = Cmp.getOpcode();
+  if (Opc == ARMISD::CMPFP)
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+  else {
+    assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
+  }
+  return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
+}
+
+SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);
+  SDValue SelectTrue = Op.getOperand(1);
+  SDValue SelectFalse = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Convert:
+  //
+  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
+  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
+  //
+  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
+    const ConstantSDNode *CMOVTrue =
+      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
+    const ConstantSDNode *CMOVFalse =
+      dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+
+    if (CMOVTrue && CMOVFalse) {
+      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
+      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
+
+      SDValue True;
+      SDValue False;
+      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
+        True = SelectTrue;
+        False = SelectFalse;
+      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
+        True = SelectFalse;
+        False = SelectTrue;
+      }
+
+      if (True.getNode() && False.getNode()) {
+        EVT VT = Op.getValueType();
+        SDValue ARMcc = Cond.getOperand(2);
+        SDValue CCR = Cond.getOperand(3);
+        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
+        assert(True.getValueType() == VT);
+        return DAG.getNode(ARMISD::CMOV, dl, VT, True, False, ARMcc, CCR, Cmp);
+      }
+    }
+  }
+
+  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
+  // undefined bits before doing a full-word comparison with zero.
+  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
+                     DAG.getConstant(1, Cond.getValueType()));
+
+  return DAG.getSelectCC(dl, Cond,
+                         DAG.getConstant(0, Cond.getValueType()),
+                         SelectTrue, SelectFalse, ISD::SETNE);
+}
+
+SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (LHS.getValueType() == MVT::i32) {
+    SDValue ARMcc;
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
+  }
+
+  ARMCC::CondCodes CondCode, CondCode2;
+  FPCCToARMCC(CC, CondCode, CondCode2);
+
+  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
+                               ARMcc, CCR, Cmp);
+  if (CondCode2 != ARMCC::AL) {
+    SDValue ARMcc2 = DAG.getConstant(CondCode2, MVT::i32);
+    // FIXME: Needs another CMP because flag can have but one use.
+    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+    Result = DAG.getNode(ARMISD::CMOV, dl, VT,
+                         Result, TrueVal, ARMcc2, CCR, Cmp2);
+  }
+  return Result;
+}
+
+/// canChangeToInt - Given the fp compare operand, return true if it is suitable
+/// to morph to an integer compare sequence.
+static bool canChangeToInt(SDValue Op, bool &SeenZero,
+                           const ARMSubtarget *Subtarget) {
+  SDNode *N = Op.getNode();
+  if (!N->hasOneUse())
+    // Otherwise it requires moving the value from fp to integer registers.
+    return false;
+  if (!N->getNumValues())
+    return false;
+  EVT VT = Op.getValueType();
+  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
+    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
+    // vmrs are very slow, e.g. cortex-a8.
+    return false;
+
+  if (isFloatingPointZero(Op)) {
+    SeenZero = true;
+    return true;
+  }
+  return ISD::isNormalLoad(N);
+}
+
+static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
+  if (isFloatingPointZero(Op))
+    return DAG.getConstant(0, MVT::i32);
+
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
+    return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+                       Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
+                       Ld->isVolatile(), Ld->isNonTemporal(),
+                       Ld->isInvariant(), Ld->getAlignment());
+
+  llvm_unreachable("Unknown VFP cmp argument!");
+}
+
+static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
+                           SDValue &RetVal1, SDValue &RetVal2) {
+  if (isFloatingPointZero(Op)) {
+    RetVal1 = DAG.getConstant(0, MVT::i32);
+    RetVal2 = DAG.getConstant(0, MVT::i32);
+    return;
+  }
+
+  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
+    SDValue Ptr = Ld->getBasePtr();
+    RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+                          Ld->getChain(), Ptr,
+                          Ld->getPointerInfo(),
+                          Ld->isVolatile(), Ld->isNonTemporal(),
+                          Ld->isInvariant(), Ld->getAlignment());
+
+    EVT PtrType = Ptr.getValueType();
+    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
+    SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(),
+                                 PtrType, Ptr, DAG.getConstant(4, PtrType));
+    RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+                          Ld->getChain(), NewPtr,
+                          Ld->getPointerInfo().getWithOffset(4),
+                          Ld->isVolatile(), Ld->isNonTemporal(),
+                          Ld->isInvariant(), NewAlign);
+    return;
+  }
+
+  llvm_unreachable("Unknown VFP cmp argument!");
+}
+
+/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
+/// f32 and even f64 comparisons to integer ones.
+SDValue
+ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  DebugLoc dl = Op.getDebugLoc();
+
+  bool LHSSeenZero = false;
+  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
+  bool RHSSeenZero = false;
+  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
+  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
+    // If unsafe fp math optimization is enabled and there are no other uses of
+    // the CMP operands, and the condition code is EQ or NE, we can optimize it
+    // to an integer comparison.
+    if (CC == ISD::SETOEQ)
+      CC = ISD::SETEQ;
+    else if (CC == ISD::SETUNE)
+      CC = ISD::SETNE;
+
+    SDValue Mask = DAG.getConstant(0x7fffffff, MVT::i32);
+    SDValue ARMcc;
+    if (LHS.getValueType() == MVT::f32) {
+      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
+                        bitcastf32Toi32(LHS, DAG), Mask);
+      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
+                        bitcastf32Toi32(RHS, DAG), Mask);
+      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+                         Chain, Dest, ARMcc, CCR, Cmp);
+    }
+
+    SDValue LHS1, LHS2;
+    SDValue RHS1, RHS2;
+    expandf64Toi32(LHS, DAG, LHS1, LHS2);
+    expandf64Toi32(RHS, DAG, RHS1, RHS2);
+    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
+    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
+    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+    ARMcc = DAG.getConstant(CondCode, MVT::i32);
+    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
+    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7);
+  }
+
+  return SDValue();
+}
+
+SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (LHS.getValueType() == MVT::i32) {
+    SDValue ARMcc;
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+                       Chain, Dest, ARMcc, CCR, Cmp);
+  }
+
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  if (getTargetMachine().Options.UnsafeFPMath &&
+      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
+       CC == ISD::SETNE || CC == ISD::SETUNE)) {
+    SDValue Result = OptimizeVFPBrcond(Op, DAG);
+    if (Result.getNode())
+      return Result;
+  }
+
+  ARMCC::CondCodes CondCode, CondCode2;
+  FPCCToARMCC(CC, CondCode, CondCode2);
+
+  SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
+  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
+  if (CondCode2 != ARMCC::AL) {
+    ARMcc = DAG.getConstant(CondCode2, MVT::i32);
+    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
+    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
+  }
+  return Res;
+}
+
+SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+
+  EVT PTy = getPointerTy();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+  SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
+  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
+  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+  if (Subtarget->isThumb2()) {
+    // Thumb2 uses a two-level jump. That is, it jumps into the jump table
+    // which does another jump to the destination. This also makes it easier
+    // to translate it to TBB / TBH later.
+    // FIXME: This might not work if the function is extremely large.
+    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
+                       Addr, Op.getOperand(2), JTI, UId);
+  }
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
+                       MachinePointerInfo::getJumpTable(),
+                       false, false, false, 0);
+    Chain = Addr.getValue(1);
+    Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+  } else {
+    Addr = DAG.getLoad(PTy, dl, Chain, Addr,
+                       MachinePointerInfo::getJumpTable(),
+                       false, false, false, 0);
+    Chain = Addr.getValue(1);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+  }
+}
+
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Op.getValueType().getVectorElementType() == MVT::i32) {
+    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
+      return Op;
+    return DAG.UnrollVectorOp(Op.getNode());
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
+         "Invalid type for custom lowering!");
+  if (VT != MVT::v4i16)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
+  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
+}
+
+static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVectorFP_TO_INT(Op, DAG);
+
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Opc;
+
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode!");
+  case ISD::FP_TO_SINT:
+    Opc = ARMISD::FTOSI;
+    break;
+  case ISD::FP_TO_UINT:
+    Opc = ARMISD::FTOUI;
+    break;
+  }
+  Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+}
+
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
+    if (VT.getVectorElementType() == MVT::f32)
+      return Op;
+    return DAG.UnrollVectorOp(Op.getNode());
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
+         "Invalid type for custom lowering!");
+  if (VT != MVT::v4f32)
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  unsigned CastOpc;
+  unsigned Opc;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode!");
+  case ISD::SINT_TO_FP:
+    CastOpc = ISD::SIGN_EXTEND;
+    Opc = ISD::SINT_TO_FP;
+    break;
+  case ISD::UINT_TO_FP:
+    CastOpc = ISD::ZERO_EXTEND;
+    Opc = ISD::UINT_TO_FP;
+    break;
+  }
+
+  Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0));
+  return DAG.getNode(Opc, dl, VT, Op);
+}
+
+static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  if (VT.isVector())
+    return LowerVectorINT_TO_FP(Op, DAG);
+
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Opc;
+
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode!");
+  case ISD::SINT_TO_FP:
+    Opc = ARMISD::SITOF;
+    break;
+  case ISD::UINT_TO_FP:
+    Opc = ARMISD::UITOF;
+    break;
+  }
+
+  Op = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(Opc, dl, VT, Op);
+}
+
+SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  // Implement fcopysign with a fabs and a conditional fneg.
+  SDValue Tmp0 = Op.getOperand(0);
+  SDValue Tmp1 = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  EVT SrcVT = Tmp1.getValueType();
+  bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
+    Tmp0.getOpcode() == ARMISD::VMOVDRR;
+  bool UseNEON = !InGPR && Subtarget->hasNEON();
+
+  if (UseNEON) {
+    // Use VBSL to copy the sign bit.
+    unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
+    SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
+                               DAG.getTargetConstant(EncodedVal, MVT::i32));
+    EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
+    if (VT == MVT::f64)
+      Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                         DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
+                         DAG.getConstant(32, MVT::i32));
+    else /*if (VT == MVT::f32)*/
+      Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
+    if (SrcVT == MVT::f32) {
+      Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
+      if (VT == MVT::f64)
+        Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
+                           DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
+                           DAG.getConstant(32, MVT::i32));
+    } else if (VT == MVT::f32)
+      Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
+                         DAG.getConstant(32, MVT::i32));
+    Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
+    Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
+
+    SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
+                                            MVT::i32);
+    AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
+    SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
+                                  DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
+
+    SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
+                              DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
+    if (VT == MVT::f32) {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
+      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+                        DAG.getConstant(0, MVT::i32));
+    } else {
+      Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
+    }
+
+    return Res;
+  }
+
+  // Bitcast operand 1 to i32.
+  if (SrcVT == MVT::f64)
+    Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                       &Tmp1, 1).getValue(1);
+  Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
+
+  // Or in the signbit with integer operations.
+  SDValue Mask1 = DAG.getConstant(0x80000000, MVT::i32);
+  SDValue Mask2 = DAG.getConstant(0x7fffffff, MVT::i32);
+  Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
+  if (VT == MVT::f32) {
+    Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
+                       DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+                       DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
+  }
+
+  // f64: Or the high part with signbit and then combine two parts.
+  Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                     &Tmp0, 1);
+  SDValue Lo = Tmp0.getValue(0);
+  SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
+  Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+}
+
+SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(4, MVT::i32);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+}
+
+SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
+    ? ARM::R7 : ARM::R11;
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
+/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec),
+/// and size(DestVec) > 128-bits.
+/// This is achieved by doing the one extension from the SrcVec, splitting the
+/// result, extending these parts, and then concatenating these into the
+/// destination.
+static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  EVT SrcVT = Op.getValueType();
+  EVT DestVT = N->getValueType(0);
+
+  assert(DestVT.getSizeInBits() > 128 &&
+         "Custom sext/zext expansion needs >128-bit vector.");
+  // If this is a normal length extension, use the default expansion.
+  if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
+      SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
+    return SDValue();
+
+  DebugLoc dl = N->getDebugLoc();
+  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
+  unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
+  unsigned NumElts = SrcVT.getVectorNumElements();
+  LLVMContext &Ctx = *DAG.getContext();
+  SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
+
+  EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
+                               NumElts);
+  EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
+                                 NumElts/2);
+  EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
+                               NumElts/2);
+
+  Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
+  SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
+                        DAG.getIntPtrConstant(0));
+  SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
+                        DAG.getIntPtrConstant(NumElts/2));
+  ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
+  ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
+}
+
+/// ExpandBITCAST - If the target supports VFP, this function is called to
+/// expand a bit convert where either the source or destination type is i64 to
+/// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
+/// operand type is illegal (e.g., v2f32 for a target that doesn't support
+/// vectors), since the legalizer won't know what to do with that.
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Op = N->getOperand(0);
+
+  // This function is only supposed to be called for i64 types, either as the
+  // source or destination of the bit convert.
+  EVT SrcVT = Op.getValueType();
+  EVT DstVT = N->getValueType(0);
+  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
+         "ExpandBITCAST called for non-i64 type");
+
+  // Turn i64->f64 into VMOVDRR.
+  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+                             DAG.getConstant(0, MVT::i32));
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+                             DAG.getConstant(1, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, dl, DstVT,
+                       DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
+  }
+
+  // Turn f64->i64 into VMOVRRD.
+  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
+    SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+                              DAG.getVTList(MVT::i32, MVT::i32), &Op, 1);
+    // Merge the pieces into a single i64 value.
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
+  }
+
+  return SDValue();
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+/// Zero vectors are used to represent vector negation and in those cases
+/// will be implemented with the NEON VNEG instruction.  However, VNEG does
+/// not support i64 elements, so sometimes the zero vectors will need to be
+/// explicitly constructed.  Regardless, use a canonical VMOV to create the
+/// zero vector.
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+  // The canonical modified immediate encoding of a zero vector is....0!
+  SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
+  EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+  SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i32 values and take a 2 x i32 value to shift plus a shift amount.
+SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue ARMcc;
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i32));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
+                          ARMcc, DAG, dl);
+  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc,
+                           CCR, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i32 values and take a 2 x i32 value to shift plus a shift amount.
+SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue ARMcc;
+
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i32));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
+                          ARMcc, DAG, dl);
+  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMcc,
+                           CCR, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // The rounding mode is in bits 23:22 of the FPSCR.
+  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
+  // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
+  // so that the shift + and get folded into a bitfield extract.
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
+                              DAG.getConstant(Intrinsic::arm_get_fpscr,
+                                              MVT::i32));
+  SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
+                                  DAG.getConstant(1U << 22, MVT::i32));
+  SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
+                              DAG.getConstant(22, MVT::i32));
+  return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+                     DAG.getConstant(3, MVT::i32));
+}
+
+static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
+                         const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  if (!ST->hasV6T2Ops())
+    return SDValue();
+
+  SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
+  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
+}
+
+/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
+/// for each 16-bit element from operand, repeated.  The basic idea is to
+/// leverage vcnt to get the 8-bit counts, gather and add the results.
+///
+/// Trace for v4i16:
+/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
+/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
+/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
+/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
+///            [b0 b1 b2 b3 b4 b5 b6 b7]
+///           +[b1 b0 b3 b2 b5 b4 b7 b6]
+/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
+/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
+static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+  SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
+  SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
+  SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
+  return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
+}
+
+/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
+/// bit-count for each 16-bit element from the operand.  We need slightly
+/// different sequencing for v4i16 and v8i16 to stay within NEON's available
+/// 64/128-bit registers.
+///
+/// Trace for v4i16:
+/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
+/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
+/// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
+/// v4i16:Extracted = [k0    k1    k2    k3    ]
+static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
+                       DAG.getIntPtrConstant(0));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
+                                    BitCounts, DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
+  }
+}
+
+/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
+/// bit-count for each 32-bit element from the operand.  The idea here is
+/// to split the vector into 16-bit elements, leverage the 16-bit count
+/// routine, and then combine the results.
+///
+/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
+/// input    = [v0    v1    ] (vi: 32-bit elements)
+/// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
+/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
+/// vrev: N0 = [k1 k0 k3 k2 ]
+///            [k0 k1 k2 k3 ]
+///       N1 =+[k1 k0 k3 k2 ]
+///            [k0 k2 k1 k3 ]
+///       N2 =+[k1 k3 k0 k2 ]
+///            [k0    k2    k1    k3    ]
+/// Extended =+[k1    k3    k0    k2    ]
+///            [k0    k2    ]
+/// Extracted=+[k1    k3    ]
+///
+static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+
+  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
+  SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
+  SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
+  SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
+  SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+
+  if (VT.is64BitVector()) {
+    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
+                       DAG.getIntPtrConstant(0));
+  } else {
+    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
+                                    DAG.getIntPtrConstant(0));
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+  }
+}
+
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+
+  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
+          VT == MVT::v4i16 || VT == MVT::v8i16) &&
+         "Unexpected type for custom ctpop lowering");
+
+  if (VT.getVectorElementType() == MVT::i32)
+    return lowerCTPOP32BitElements(N, DAG);
+  else
+    return lowerCTPOP16BitElements(N, DAG);
+}
+
+static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  if (!VT.isVector())
+    return SDValue();
+
+  // Lower vector shifts on NEON to use VSHL.
+  assert(ST->hasNEON() && "unexpected vector shift");
+
+  // Left shifts translate directly to the vshiftu intrinsic.
+  if (N->getOpcode() == ISD::SHL)
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                       DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
+                       N->getOperand(0), N->getOperand(1));
+
+  assert((N->getOpcode() == ISD::SRA ||
+          N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
+
+  // NEON uses the same intrinsics for both left and right shifts.  For
+  // right shifts, the shift amounts are negative, so negate the vector of
+  // shift amounts.
+  EVT ShiftVT = N->getOperand(1).getValueType();
+  SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
+                                     getZeroVector(ShiftVT, DAG, dl),
+                                     N->getOperand(1));
+  Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
+                             Intrinsic::arm_neon_vshifts :
+                             Intrinsic::arm_neon_vshiftu);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                     DAG.getConstant(vshiftInt, MVT::i32),
+                     N->getOperand(0), NegatedCount);
+}
+
+static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
+                                const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // We can get here for a node like i32 = ISD::SHL i32, i64
+  if (VT != MVT::i64)
+    return SDValue();
+
+  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+         "Unknown shift to lower!");
+
+  // We only lower SRA, SRL of 1 here, all others use generic lowering.
+  if (!isa<ConstantSDNode>(N->getOperand(1)) ||
+      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
+    return SDValue();
+
+  // If we are in thumb mode, we don't have RRX.
+  if (ST->isThumb1Only()) return SDValue();
+
+  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+                           DAG.getConstant(0, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+                           DAG.getConstant(1, MVT::i32));
+
+  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
+  // captures the result into a carry flag.
+  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
+  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1);
+
+  // The low part is an ARMISD::RRX operand, which shifts the carry in.
+  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
+
+  // Merge the pieces into a single i64 value.
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+  SDValue TmpOp0, TmpOp1;
+  bool Invert = false;
+  bool Swap = false;
+  unsigned Opc = 0;
+
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  EVT VT = Op.getValueType();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Op.getOperand(1).getValueType().isFloatingPoint()) {
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Illegal FP comparison");
+    case ISD::SETUNE:
+    case ISD::SETNE:  Invert = true; // Fallthrough
+    case ISD::SETOEQ:
+    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+    case ISD::SETOLT:
+    case ISD::SETLT: Swap = true; // Fallthrough
+    case ISD::SETOGT:
+    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+    case ISD::SETOLE:
+    case ISD::SETLE:  Swap = true; // Fallthrough
+    case ISD::SETOGE:
+    case ISD::SETGE: Opc = ARMISD::VCGE; break;
+    case ISD::SETUGE: Swap = true; // Fallthrough
+    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
+    case ISD::SETUGT: Swap = true; // Fallthrough
+    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
+    case ISD::SETUEQ: Invert = true; // Fallthrough
+    case ISD::SETONE:
+      // Expand this to (OLT | OGT).
+      TmpOp0 = Op0;
+      TmpOp1 = Op1;
+      Opc = ISD::OR;
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
+      break;
+    case ISD::SETUO: Invert = true; // Fallthrough
+    case ISD::SETO:
+      // Expand this to (OLT | OGE).
+      TmpOp0 = Op0;
+      TmpOp1 = Op1;
+      Opc = ISD::OR;
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
+      break;
+    }
+  } else {
+    // Integer comparisons.
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Illegal integer comparison");
+    case ISD::SETNE:  Invert = true;
+    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+    case ISD::SETLT:  Swap = true;
+    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+    case ISD::SETLE:  Swap = true;
+    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
+    case ISD::SETULT: Swap = true;
+    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
+    case ISD::SETULE: Swap = true;
+    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
+    }
+
+    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
+    if (Opc == ARMISD::VCEQ) {
+
+      SDValue AndOp;
+      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+        AndOp = Op0;
+      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
+        AndOp = Op1;
+
+      // Ignore bitconvert.
+      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
+        AndOp = AndOp.getOperand(0);
+
+      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
+        Opc = ARMISD::VTST;
+        Op0 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(0));
+        Op1 = DAG.getNode(ISD::BITCAST, dl, VT, AndOp.getOperand(1));
+        Invert = !Invert;
+      }
+    }
+  }
+
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // If one of the operands is a constant vector zero, attempt to fold the
+  // comparison to a specialized compare-against-zero form.
+  SDValue SingleOp;
+  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+    SingleOp = Op0;
+  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
+    if (Opc == ARMISD::VCGE)
+      Opc = ARMISD::VCLEZ;
+    else if (Opc == ARMISD::VCGT)
+      Opc = ARMISD::VCLTZ;
+    SingleOp = Op1;
+  }
+
+  SDValue Result;
+  if (SingleOp.getNode()) {
+    switch (Opc) {
+    case ARMISD::VCEQ:
+      Result = DAG.getNode(ARMISD::VCEQZ, dl, VT, SingleOp); break;
+    case ARMISD::VCGE:
+      Result = DAG.getNode(ARMISD::VCGEZ, dl, VT, SingleOp); break;
+    case ARMISD::VCLEZ:
+      Result = DAG.getNode(ARMISD::VCLEZ, dl, VT, SingleOp); break;
+    case ARMISD::VCGT:
+      Result = DAG.getNode(ARMISD::VCGTZ, dl, VT, SingleOp); break;
+    case ARMISD::VCLTZ:
+      Result = DAG.getNode(ARMISD::VCLTZ, dl, VT, SingleOp); break;
+    default:
+      Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+    }
+  } else {
+     Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+  }
+
+  if (Invert)
+    Result = DAG.getNOT(dl, Result, VT);
+
+  return Result;
+}
+
+/// isNEONModifiedImm - Check if the specified splat value corresponds to a
+/// valid vector constant for a NEON instruction with a "modified immediate"
+/// operand (e.g., VMOV).  If so, return the encoded value.
+static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                                 unsigned SplatBitSize, SelectionDAG &DAG,
+                                 EVT &VT, bool is128Bits, NEONModImmType type) {
+  unsigned OpCmode, Imm;
+
+  // SplatBitSize is set to the smallest size that splats the vector, so a
+  // zero vector will always have SplatBitSize == 8.  However, NEON modified
+  // immediate instructions others than VMOV do not support the 8-bit encoding
+  // of a zero vector, and the default encoding of zero is supposed to be the
+  // 32-bit version.
+  if (SplatBits == 0)
+    SplatBitSize = 32;
+
+  switch (SplatBitSize) {
+  case 8:
+    if (type != VMOVModImm)
+      return SDValue();
+    // Any 1-byte value is OK.  Op=0, Cmode=1110.
+    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+    OpCmode = 0xe;
+    Imm = SplatBits;
+    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+    break;
+
+  case 16:
+    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
+    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x00nn: Op=x, Cmode=100x.
+      OpCmode = 0x8;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0xnn00: Op=x, Cmode=101x.
+      OpCmode = 0xa;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    return SDValue();
+
+  case 32:
+    // NEON's 32-bit VMOV supports splat values where:
+    // * only one byte is nonzero, or
+    // * the least significant byte is 0xff and the second byte is nonzero, or
+    // * the least significant 2 bytes are 0xff and the third is nonzero.
+    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x000000nn: Op=x, Cmode=000x.
+      OpCmode = 0;
+      Imm = SplatBits;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0x0000nn00: Op=x, Cmode=001x.
+      OpCmode = 0x2;
+      Imm = SplatBits >> 8;
+      break;
+    }
+    if ((SplatBits & ~0xff0000) == 0) {
+      // Value = 0x00nn0000: Op=x, Cmode=010x.
+      OpCmode = 0x4;
+      Imm = SplatBits >> 16;
+      break;
+    }
+    if ((SplatBits & ~0xff000000) == 0) {
+      // Value = 0xnn000000: Op=x, Cmode=011x.
+      OpCmode = 0x6;
+      Imm = SplatBits >> 24;
+      break;
+    }
+
+    // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
+    if (type == OtherModImm) return SDValue();
+
+    if ((SplatBits & ~0xffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
+      // Value = 0x0000nnff: Op=x, Cmode=1100.
+      OpCmode = 0xc;
+      Imm = SplatBits >> 8;
+      SplatBits |= 0xff;
+      break;
+    }
+
+    if ((SplatBits & ~0xffffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
+      // Value = 0x00nnffff: Op=x, Cmode=1101.
+      OpCmode = 0xd;
+      Imm = SplatBits >> 16;
+      SplatBits |= 0xffff;
+      break;
+    }
+
+    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
+    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
+    // VMOV.I32.  A (very) minor optimization would be to replicate the value
+    // and fall through here to test for a valid 64-bit splat.  But, then the
+    // caller would also need to check and handle the change in size.
+    return SDValue();
+
+  case 64: {
+    if (type != VMOVModImm)
+      return SDValue();
+    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
+    uint64_t BitMask = 0xff;
+    uint64_t Val = 0;
+    unsigned ImmMask = 1;
+    Imm = 0;
+    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
+        Val |= BitMask;
+        Imm |= ImmMask;
+      } else if ((SplatBits & BitMask) != 0) {
+        return SDValue();
+      }
+      BitMask <<= 8;
+      ImmMask <<= 1;
+    }
+    // Op=1, Cmode=1110.
+    OpCmode = 0x1e;
+    SplatBits = Val;
+    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
+    break;
+  }
+
+  default:
+    llvm_unreachable("unexpected size for isNEONModifiedImm");
+  }
+
+  unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
+  return DAG.getTargetConstant(EncodedVal, MVT::i32);
+}
+
+SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
+                                           const ARMSubtarget *ST) const {
+  if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16())
+    return SDValue();
+
+  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
+  assert(Op.getValueType() == MVT::f32 &&
+         "ConstantFP custom lowering should only occur for f32.");
+
+  // Try splatting with a VMOV.f32...
+  APFloat FPVal = CFP->getValueAPF();
+  int ImmVal = ARM_AM::getFP32Imm(FPVal);
+  if (ImmVal != -1) {
+    DebugLoc DL = Op.getDebugLoc();
+    SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
+    SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
+                                      NewVal);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  // If that fails, try a VMOV.i32
+  EVT VMovVT;
+  unsigned iVal = FPVal.bitcastToAPInt().getZExtValue();
+  SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false,
+                                     VMOVModImm);
+  if (NewVal != SDValue()) {
+    DebugLoc DL = Op.getDebugLoc();
+    SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
+                                      NewVal);
+    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
+                                       VecConstant);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  // Finally, try a VMVN.i32
+  NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false,
+                             VMVNModImm);
+  if (NewVal != SDValue()) {
+    DebugLoc DL = Op.getDebugLoc();
+    SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
+    SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
+                                       VecConstant);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
+                       DAG.getConstant(0, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+// check if an VEXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, just follow it
+    // back to index zero and keep going.
+    ++ExpectedElt;
+    if (ExpectedElt == NumElts)
+      ExpectedElt = 0;
+
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  return true;
+}
+
+
+static bool isVEXTMask(ArrayRef<int> M, EVT VT,
+                       bool &ReverseVEXT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+  ReverseVEXT = false;
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, it may still be
+    // a VEXT but the source vectors must be swapped.
+    ExpectedElt += 1;
+    if (ExpectedElt == NumElts * 2) {
+      ExpectedElt = 0;
+      ReverseVEXT = true;
+    }
+
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  // Adjust the index value if the source operands will be swapped.
+  if (ReverseVEXT)
+    Imm -= NumElts;
+
+  return true;
+}
+
+/// isVREVMask - Check if a vector shuffle corresponds to a VREV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
+         "Only possible block sizes for VREV are: 16, 32, 64");
+
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSz;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
+  // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
+  // range, then 0 is placed into the resulting vector. So pretty much any mask
+  // of 8 elements can work here.
+  return VT == MVT::v8i8 && M.size() == 8;
+}
+
+static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
+        (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) ||
+        (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult))
+      return false;
+  }
+  return true;
+}
+
+static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (M[i] < 0) continue; // ignore UNDEF indices
+    if ((unsigned) M[i] != 2 * i + WhichResult)
+      return false;
+  }
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned Half = VT.getVectorNumElements() / 2;
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned j = 0; j != 2; ++j) {
+    unsigned Idx = WhichResult;
+    for (unsigned i = 0; i != Half; ++i) {
+      int MIdx = M[i + j * Half];
+      if (MIdx >= 0 && (unsigned) MIdx != Idx)
+        return false;
+      Idx += 2;
+    }
+  }
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
+        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts))
+      return false;
+    Idx += 1;
+  }
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned) M[i] != Idx) ||
+        (M[i+1] >= 0 && (unsigned) M[i+1] != Idx))
+      return false;
+    Idx += 1;
+  }
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// \return true if this is a reverse operation on an vector.
+static bool isReverseMask(ArrayRef<int> M, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size())
+      return false;
+
+  // Look for <15, ..., 3, -1, 1, 0>.
+  for (unsigned i = 0; i != NumElts; ++i)
+    if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
+      return false;
+
+  return true;
+}
+
+// If N is an integer constant that can be moved into a register in one
+// instruction, return an SDValue of such a constant (will become a MOV
+// instruction).  Otherwise return null.
+static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
+                                     const ARMSubtarget *ST, DebugLoc dl) {
+  uint64_t Val;
+  if (!isa<ConstantSDNode>(N))
+    return SDValue();
+  Val = cast<ConstantSDNode>(N)->getZExtValue();
+
+  if (ST->isThumb1Only()) {
+    if (Val <= 255 || ~Val <= 255)
+      return DAG.getConstant(Val, MVT::i32);
+  } else {
+    if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
+      return DAG.getConstant(Val, MVT::i32);
+  }
+  return SDValue();
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.
+SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                                             const ARMSubtarget *ST) const {
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      // Check if an immediate VMOV works.
+      EVT VmovVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VmovVT, VT.is128BitVector(),
+                                      VMOVModImm);
+      if (Val.getNode()) {
+        SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+      }
+
+      // Try an immediate VMVN.
+      uint64_t NegatedImm = (~SplatBits).getZExtValue();
+      Val = isNEONModifiedImm(NegatedImm,
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VmovVT, VT.is128BitVector(),
+                                      VMVNModImm);
+      if (Val.getNode()) {
+        SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
+      }
+
+      // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
+      if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
+        int ImmVal = ARM_AM::getFP32Imm(SplatBits);
+        if (ImmVal != -1) {
+          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
+          return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
+        }
+      }
+    }
+  }
+
+  // Scan through the operands to see if only one value is used.
+  //
+  // As an optimisation, even if more than one value is used it may be more
+  // profitable to splat with one value then change some lanes.
+  //
+  // Heuristically we decide to do this if the vector has a "dominant" value,
+  // defined as splatted to more than half of the lanes.
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
+  bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
+      Value = V;
+    }
+  }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
+
+  if (ValueCounts.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  if (isOnlyLowElement)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+  // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
+  // i32 and try again.
+  if (hasDominantValue && EltSize <= 32) {
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are VDUPing a value that comes directly from a vector, that will
+      // cause an unnecessary move to and from a GPR, where instead we could
+      // just use VDUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the VDUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
+        // We need to create a new undef vector to use for the VDUPLANE if the
+        // size of the vector from which we get the value is different than the
+        // size of the vector that we need to create. We will insert the element
+        // such that the register coalescer will remove unnecessary copies.
+        if (VT != Value->getOperand(0).getValueType()) {
+          ConstantSDNode *constIndex;
+          constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
+          assert(constIndex && "The index is not a constant!");
+          unsigned index = constIndex->getAPIntValue().getLimitedValue() %
+                             VT.getVectorNumElements();
+          N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
+                        Value, DAG.getConstant(index, MVT::i32)),
+                           DAG.getConstant(index, MVT::i32));
+        } else
+          N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      } else
+        N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i32));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
+        }
+      }
+      return N;
+    }
+    if (VT.getVectorElementType().isFloatingPoint()) {
+      SmallVector<SDValue, 8> Ops;
+      for (unsigned i = 0; i < NumElts; ++i)
+        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+                                  Op.getOperand(i)));
+      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
+      Val = LowerBUILD_VECTOR(Val, DAG, ST);
+      if (Val.getNode())
+        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+    }
+    if (usesOnlyOneValue) {
+      SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+      if (isConstant && Val.getNode())
+        return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+    }
+  }
+
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+  if (NumElts >= 4) {
+    SDValue shuffle = ReconstructShuffle(Op, DAG);
+    if (shuffle != SDValue())
+      return shuffle;
+  }
+
+  // Vectors with 32- or 64-bit elements can be built by directly assigning
+  // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
+  // will be legalized.
+  if (EltSize >= 32) {
+    // Do the expansion with floating-point types, since that is what the VFP
+    // registers are defined to use, and since i64 is not legal.
+    EVT EltVT = EVT::getFloatingPointVT(EltSize);
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < NumElts; ++i)
+      Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
+    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+  }
+
+  return SDValue();
+}
+
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue, 2> SourceVecs;
+  SmallVector<unsigned, 2> MinElts;
+  SmallVector<unsigned, 2> MaxElts;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+      // A shuffle can only come from building a vector from various
+      // elements of other vectors.
+      return SDValue();
+    } else if (V.getOperand(0).getValueType().getVectorElementType() !=
+               VT.getVectorElementType()) {
+      // This code doesn't know how to handle shuffles where the vector
+      // element types do not match (this happens because type legalization
+      // promotes the return type of EXTRACT_VECTOR_ELT).
+      // FIXME: It might be appropriate to extend this code to handle
+      // mismatched types.
+      return SDValue();
+    }
+
+    // Record this extraction against the appropriate vector if possible...
+    SDValue SourceVec = V.getOperand(0);
+    // If the element number isn't a constant, we can't effectively
+    // analyze what's going on.
+    if (!isa<ConstantSDNode>(V.getOperand(1)))
+      return SDValue();
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    bool FoundSource = false;
+    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+      if (SourceVecs[j] == SourceVec) {
+        if (MinElts[j] > EltNo)
+          MinElts[j] = EltNo;
+        if (MaxElts[j] < EltNo)
+          MaxElts[j] = EltNo;
+        FoundSource = true;
+        break;
+      }
+    }
+
+    // Or record a new source if not...
+    if (!FoundSource) {
+      SourceVecs.push_back(SourceVec);
+      MinElts.push_back(EltNo);
+      MaxElts.push_back(EltNo);
+    }
+  }
+
+  // Currently only do something sane when at most two source vectors
+  // involved.
+  if (SourceVecs.size() > 2)
+    return SDValue();
+
+  SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
+  int VEXTOffsets[2] = {0, 0};
+
+  // This loop extracts the usage patterns of the source vectors
+  // and prepares appropriate SDValues for a shuffle if possible.
+  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+    if (SourceVecs[i].getValueType() == VT) {
+      // No VEXT necessary
+      ShuffleSrcs[i] = SourceVecs[i];
+      VEXTOffsets[i] = 0;
+      continue;
+    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+      // It probably isn't worth padding out a smaller vector just to
+      // break it down again in a shuffle.
+      return SDValue();
+    }
+
+    // Since only 64-bit and 128-bit vectors are legal on ARM and
+    // we've eliminated the other cases...
+    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts &&
+           "unexpected vector sizes in ReconstructShuffle");
+
+    if (MaxElts[i] - MinElts[i] >= NumElts) {
+      // Span too large for a VEXT to cope
+      return SDValue();
+    }
+
+    if (MinElts[i] >= NumElts) {
+      // The extraction can just take the second half
+      VEXTOffsets[i] = NumElts;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                   SourceVecs[i],
+                                   DAG.getIntPtrConstant(NumElts));
+    } else if (MaxElts[i] < NumElts) {
+      // The extraction can just take the first half
+      VEXTOffsets[i] = 0;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                   SourceVecs[i],
+                                   DAG.getIntPtrConstant(0));
+    } else {
+      // An actual VEXT is needed
+      VEXTOffsets[i] = MinElts[i];
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                     SourceVecs[i],
+                                     DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                     SourceVecs[i],
+                                     DAG.getIntPtrConstant(NumElts));
+      ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2,
+                                   DAG.getConstant(VEXTOffsets[i], MVT::i32));
+    }
+  }
+
+  SmallVector<int, 8> Mask;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Entry = Op.getOperand(i);
+    if (Entry.getOpcode() == ISD::UNDEF) {
+      Mask.push_back(-1);
+      continue;
+    }
+
+    SDValue ExtractVec = Entry.getOperand(0);
+    int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i)
+                                          .getOperand(1))->getSExtValue();
+    if (ExtractVec == SourceVecs[0]) {
+      Mask.push_back(ExtractElt - VEXTOffsets[0]);
+    } else {
+      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
+    }
+  }
+
+  // Final check before we try to produce nonsense...
+  if (isShuffleMaskLegal(Mask, VT))
+    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
+                                &Mask[0]);
+
+  return SDValue();
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                      EVT VT) const {
+  if (VT.getVectorNumElements() == 4 &&
+      (VT.is128BitVector() || VT.is64BitVector())) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (M[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = M[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return true;
+  }
+
+  bool ReverseVEXT;
+  unsigned Imm, WhichResult;
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  return (EltSize >= 32 ||
+          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+          isVREVMask(M, VT, 64) ||
+          isVREVMask(M, VT, 32) ||
+          isVREVMask(M, VT, 16) ||
+          isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+          isVTBLMask(M, VT) ||
+          isVTRNMask(M, VT, WhichResult) ||
+          isVUZPMask(M, VT, WhichResult) ||
+          isVZIPMask(M, VT, WhichResult) ||
+          isVTRN_v_undef_Mask(M, VT, WhichResult) ||
+          isVUZP_v_undef_Mask(M, VT, WhichResult) ||
+          isVZIP_v_undef_Mask(M, VT, WhichResult) ||
+          ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      DebugLoc dl) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+
+  enum {
+    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VREV,
+    OP_VDUP0,
+    OP_VDUP1,
+    OP_VDUP2,
+    OP_VDUP3,
+    OP_VEXT1,
+    OP_VEXT2,
+    OP_VEXT3,
+    OP_VUZPL, // VUZP, left result
+    OP_VUZPR, // VUZP, right result
+    OP_VZIPL, // VZIP, left result
+    OP_VZIPR, // VZIP, right result
+    OP_VTRNL, // VTRN, left result
+    OP_VTRNR  // VTRN, right result
+  };
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+  EVT VT = OpLHS.getValueType();
+
+  switch (OpNum) {
+  default: llvm_unreachable("Unknown shuffle opcode!");
+  case OP_VREV:
+    // VREV divides the vector in half and swaps within the half.
+    if (VT.getVectorElementType() == MVT::i32 ||
+        VT.getVectorElementType() == MVT::f32)
+      return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+    // vrev <4 x i16> -> VREV32
+    if (VT.getVectorElementType() == MVT::i16)
+      return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
+    // vrev <4 x i8> -> VREV16
+    assert(VT.getVectorElementType() == MVT::i8);
+    return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
+  case OP_VDUP0:
+  case OP_VDUP1:
+  case OP_VDUP2:
+  case OP_VDUP3:
+    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
+  case OP_VEXT1:
+  case OP_VEXT2:
+  case OP_VEXT3:
+    return DAG.getNode(ARMISD::VEXT, dl, VT,
+                       OpLHS, OpRHS,
+                       DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
+  case OP_VUZPL:
+  case OP_VUZPR:
+    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
+  case OP_VZIPL:
+  case OP_VZIPR:
+    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
+  case OP_VTRNL:
+  case OP_VTRNR:
+    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
+  }
+}
+
+static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
+                                       ArrayRef<int> ShuffleMask,
+                                       SelectionDAG &DAG) {
+  // Check to see if we can use the VTBL instruction.
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  DebugLoc DL = Op.getDebugLoc();
+
+  SmallVector<SDValue, 8> VTBLMask;
+  for (ArrayRef<int>::iterator
+         I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I)
+    VTBLMask.push_back(DAG.getConstant(*I, MVT::i32));
+
+  if (V2.getNode()->getOpcode() == ISD::UNDEF)
+    return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
+                                   &VTBLMask[0], 8));
+
+  return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
+                                 &VTBLMask[0], 8));
+}
+
+static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
+                                                      SelectionDAG &DAG) {
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue OpLHS = Op.getOperand(0);
+  EVT VT = OpLHS.getValueType();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
+         "Expect an v8i16/v16i8 type");
+  OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS);
+  // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now,
+  // extract the first 8 bytes into the top double word and the last 8 bytes
+  // into the bottom double word. The v8i16 case is similar.
+  unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4;
+  return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS,
+                     DAG.getConstant(ExtractNum, MVT::i32));
+}
+
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection.  This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  // FIXME: floating-point vectors should be canonicalized to integer vectors
+  // of the same time so that they get CSEd properly.
+  ArrayRef<int> ShuffleMask = SVN->getMask();
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize <= 32) {
+    if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+      int Lane = SVN->getSplatIndex();
+      // If this is undef splat, generate it via "just" vdup, if possible.
+      if (Lane == -1) Lane = 0;
+
+      // Test if V1 is a SCALAR_TO_VECTOR.
+      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      }
+      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
+      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
+      // reaches it).
+      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
+          !isa<ConstantSDNode>(V1.getOperand(0))) {
+        bool IsScalarToVector = true;
+        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
+          if (V1.getOperand(i).getOpcode() != ISD::UNDEF) {
+            IsScalarToVector = false;
+            break;
+          }
+        if (IsScalarToVector)
+          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+      }
+      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
+                         DAG.getConstant(Lane, MVT::i32));
+    }
+
+    bool ReverseVEXT;
+    unsigned Imm;
+    if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+      if (ReverseVEXT)
+        std::swap(V1, V2);
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
+                         DAG.getConstant(Imm, MVT::i32));
+    }
+
+    if (isVREVMask(ShuffleMask, VT, 64))
+      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 32))
+      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
+    if (isVREVMask(ShuffleMask, VT, 16))
+      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+
+    if (V2->getOpcode() == ISD::UNDEF &&
+        isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
+                         DAG.getConstant(Imm, MVT::i32));
+    }
+
+    // Check for Neon shuffles that modify both input vectors in place.
+    // If both results are used, i.e., if there are two shuffles with the same
+    // source operands and with masks corresponding to both results of one of
+    // these operations, DAG memoization will ensure that a single node is
+    // used for both shuffles.
+    unsigned WhichResult;
+    if (isVTRNMask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                         V1, V2).getValue(WhichResult);
+    if (isVUZPMask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                         V1, V2).getValue(WhichResult);
+    if (isVZIPMask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                         V1, V2).getValue(WhichResult);
+
+    if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                         V1, V1).getValue(WhichResult);
+    if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                         V1, V1).getValue(WhichResult);
+    if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+      return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                         V1, V1).getValue(WhichResult);
+  }
+
+  // If the shuffle is not directly supported and it has 4 elements, use
+  // the PerfectShuffle-generated table to synthesize it from other shuffles.
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts == 4) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (ShuffleMask[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = ShuffleMask[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
+  if (EltSize >= 32) {
+    // Do the expansion with floating-point types, since that is what the VFP
+    // registers are defined to use, and since i64 is not legal.
+    EVT EltVT = EVT::getFloatingPointVT(EltSize);
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
+    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < NumElts; ++i) {
+      if (ShuffleMask[i] < 0)
+        Ops.push_back(DAG.getUNDEF(EltVT));
+      else
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                  ShuffleMask[i] < (int)NumElts ? V1 : V2,
+                                  DAG.getConstant(ShuffleMask[i] & (NumElts-1),
+                                                  MVT::i32)));
+    }
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
+    return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+  }
+
+  if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
+    return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
+
+  if (VT == MVT::v8i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  // INSERT_VECTOR_ELT is legal only for immediate indexes.
+  SDValue Lane = Op.getOperand(2);
+  if (!isa<ConstantSDNode>(Lane))
+    return SDValue();
+
+  return Op;
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
+  SDValue Lane = Op.getOperand(1);
+  if (!isa<ConstantSDNode>(Lane))
+    return SDValue();
+
+  SDValue Vec = Op.getOperand(0);
+  if (Op.getValueType() == MVT::i32 &&
+      Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
+    DebugLoc dl = Op.getDebugLoc();
+    return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
+  }
+
+  return Op;
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  // The only time a CONCAT_VECTORS operation can have legal types is when
+  // two 64-bit vectors are concatenated to a 128-bit vector.
+  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
+         "unexpected CONCAT_VECTORS");
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Val = DAG.getUNDEF(MVT::v2f64);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  if (Op0.getOpcode() != ISD::UNDEF)
+    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
+                      DAG.getIntPtrConstant(0));
+  if (Op1.getOpcode() != ISD::UNDEF)
+    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+                      DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
+                      DAG.getIntPtrConstant(1));
+  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
+}
+
+/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
+/// element has been zero/sign-extended, depending on the isSigned parameter,
+/// from an integer type half its size.
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+                                   bool isSigned) {
+  // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
+    SDNode *BVN = N->getOperand(0).getNode();
+    if (BVN->getValueType(0) != MVT::v4i32 ||
+        BVN->getOpcode() != ISD::BUILD_VECTOR)
+      return false;
+    unsigned LoElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
+    unsigned HiElt = 1 - LoElt;
+    ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
+    ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
+    ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
+    ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
+    if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
+      return false;
+    if (isSigned) {
+      if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
+          Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
+        return true;
+    } else {
+      if (Hi0->isNullValue() && Hi1->isNullValue())
+        return true;
+    }
+    return false;
+  }
+
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      unsigned HalfSize = EltSize / 2;
+      if (isSigned) {
+        if (!isIntN(HalfSize, C->getSExtValue()))
+          return false;
+      } else {
+        if (!isUIntN(HalfSize, C->getZExtValue()))
+          return false;
+      }
+      continue;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+/// isSignExtended - Check if a node is a vector value that is sign-extended
+/// or a constant BUILD_VECTOR with sign-extended elements.
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, true))
+    return true;
+  return false;
+}
+
+/// isZeroExtended - Check if a node is a vector value that is zero-extended
+/// or a constant BUILD_VECTOR with zero-extended elements.
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N))
+    return true;
+  if (isExtendedBUILD_VECTOR(N, DAG, false))
+    return true;
+  return false;
+}
+
+/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
+/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
+/// We insert the required extension here to get the vector to fill a D register.
+static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
+                                            const EVT &OrigTy,
+                                            const EVT &ExtTy,
+                                            unsigned ExtOpcode) {
+  // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+  // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+  // 64-bits we need to insert a new extension so that it will be 64-bits.
+  assert(ExtTy.is128BitVector() && "Unexpected extension size");
+  if (OrigTy.getSizeInBits() >= 64)
+    return N;
+
+  // Must extend size to at least 64 bits to be used as an operand for VMULL.
+  MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy;
+  EVT NewVT;
+  switch (OrigSimpleTy) {
+  default: llvm_unreachable("Unexpected Orig Vector Type");
+  case MVT::v2i8:
+  case MVT::v2i16:
+    NewVT = MVT::v2i32;
+    break;
+  case MVT::v4i8:
+    NewVT = MVT::v4i16;
+    break;
+  }
+  return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N);
+}
+
+/// SkipLoadExtensionForVMULL - return a load of the original vector size that
+/// does not do any sign/zero extension. If the original vector is less
+/// than 64 bits, an appropriate extension will be added after the load to
+/// reach a total size of 64 bits. We have to add the extension separately
+/// because ARM does not have a sign/zero extending load for vectors.
+static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
+  SDValue NonExtendingLoad =
+    DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(),
+                LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
+                LD->isNonTemporal(), LD->isInvariant(),
+                LD->getAlignment());
+  unsigned ExtOp = 0;
+  switch (LD->getExtensionType()) {
+  default: llvm_unreachable("Unexpected LoadExtType");
+  case ISD::EXTLOAD:
+  case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break;
+  case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break;
+  }
+  MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy;
+  MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy;
+  return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG,
+                                      MemType, ExtType, ExtOp);
+}
+
+/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
+/// extending load, or BUILD_VECTOR with extended elements, return the
+/// unextended value. The unextended vector should be 64 bits so that it can
+/// be used as an operand to a VMULL instruction. If the original vector size
+/// before extension is less than 64 bits we add a an extension to resize
+/// the vector to 64 bits.
+static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+    return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
+                                        N->getOperand(0)->getValueType(0),
+                                        N->getValueType(0),
+                                        N->getOpcode());
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
+    return SkipLoadExtensionForVMULL(LD, DAG);
+
+  // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
+  // have been legalized as a BITCAST from v4i32.
+  if (N->getOpcode() == ISD::BITCAST) {
+    SDNode *BVN = N->getOperand(0).getNode();
+    assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
+           BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
+    unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
+    return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32,
+                       BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
+  }
+  // Construct a new BUILD_VECTOR with elements truncated to half the size.
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+  EVT VT = N->getValueType(0);
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+  unsigned NumElts = VT.getVectorNumElements();
+  MVT TruncVT = MVT::getIntegerVT(EltSize);
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+    const APInt &CInt = C->getAPIntValue();
+    // Element types smaller than 32 bits are not legal, so use i32 elements.
+    // The values are implicitly truncated so sext vs. zext doesn't matter.
+    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+                     MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+  }
+  return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opcode = N->getOpcode();
+  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+    SDNode *N0 = N->getOperand(0).getNode();
+    SDNode *N1 = N->getOperand(1).getNode();
+    return N0->hasOneUse() && N1->hasOneUse() &&
+      isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+  }
+  return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // Multiplications are only custom-lowered for 128-bit vectors so that
+  // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MUL");
+  SDNode *N0 = Op.getOperand(0).getNode();
+  SDNode *N1 = Op.getOperand(1).getNode();
+  unsigned NewOpc = 0;
+  bool isMLA = false;
+  bool isN0SExt = isSignExtended(N0, DAG);
+  bool isN1SExt = isSignExtended(N1, DAG);
+  if (isN0SExt && isN1SExt)
+    NewOpc = ARMISD::VMULLs;
+  else {
+    bool isN0ZExt = isZeroExtended(N0, DAG);
+    bool isN1ZExt = isZeroExtended(N1, DAG);
+    if (isN0ZExt && isN1ZExt)
+      NewOpc = ARMISD::VMULLu;
+    else if (isN1SExt || isN1ZExt) {
+      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+      if (isN1SExt && isAddSubSExt(N0, DAG)) {
+        NewOpc = ARMISD::VMULLs;
+        isMLA = true;
+      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+        NewOpc = ARMISD::VMULLu;
+        isMLA = true;
+      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+        std::swap(N0, N1);
+        NewOpc = ARMISD::VMULLu;
+        isMLA = true;
+      }
+    }
+
+    if (!NewOpc) {
+      if (VT == MVT::v2i64)
+        // Fall through to expand this.  It is not legal.
+        return SDValue();
+      else
+        // Other vector multiplications are legal.
+        return Op;
+    }
+  }
+
+  // Legalize to a VMULL instruction.
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue Op0;
+  SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
+  if (!isMLA) {
+    Op0 = SkipExtensionForVMULL(N0, DAG);
+    assert(Op0.getValueType().is64BitVector() &&
+           Op1.getValueType().is64BitVector() &&
+           "unexpected types for extended operands to VMULL");
+    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+  }
+
+  // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
+  // isel lowering to take advantage of no-stall back to back vmul + vmla.
+  //   vmull q0, d4, d6
+  //   vmlal q0, d5, d6
+  // is faster than
+  //   vaddl q0, d4, d5
+  //   vmovl q1, d6
+  //   vmul  q0, q0, q1
+  SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
+  SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
+  EVT Op1VT = Op1.getValueType();
+  return DAG.getNode(N0->getOpcode(), DL, VT,
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+                     DAG.getNode(NewOpc, DL, VT,
+                               DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
+
+static SDValue
+LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
+  // Convert to float
+  // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
+  // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
+  X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
+  Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
+  X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
+  Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
+  // Get reciprocal estimate.
+  // float4 recip = vrecpeq_f32(yf);
+  Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), Y);
+  // Because char has a smaller range than uchar, we can actually get away
+  // without any newton steps.  This requires that we use a weird bias
+  // of 0xb000, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
+  X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
+  Y = DAG.getConstant(0xb000, MVT::i32);
+  Y = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Y, Y, Y, Y);
+  X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
+  X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
+  // Convert back to short.
+  X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
+  X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
+  return X;
+}
+
+static SDValue
+LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
+  SDValue N2;
+  // Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_s16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_s16(x));
+  N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and one refinement step.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), N1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   N1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Because short has a smaller range than ushort, we can actually get away
+  // with only a single newton step.  This requires that we use a weird bias
+  // of 89, however (again, this has been exhaustively tested).
+  // float4 result = as_float4(as_int4(xf*recip) + 0x89);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(0x89, MVT::i32);
+  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_s32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::SDIV");
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0));
+
+    N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
+    return N0;
+  }
+  return LowerSDIV_v4i16(N0, N1, dl, DAG);
+}
+
+static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
+         "unexpected type for custom-lowering ISD::UDIV");
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2, N3;
+
+  if (VT == MVT::v8i8) {
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
+    N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
+
+    N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(4));
+    N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(4));
+    N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
+                     DAG.getIntPtrConstant(0));
+    N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
+                     DAG.getIntPtrConstant(0));
+
+    N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
+    N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
+
+    N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
+    N0 = LowerCONCAT_VECTORS(N0, DAG);
+
+    N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
+                     DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, MVT::i32),
+                     N0);
+    return N0;
+  }
+
+  // v4i16 sdiv ... Convert to float.
+  // float4 yf = vcvt_f32_s32(vmovl_u16(y));
+  // float4 xf = vcvt_f32_s32(vmovl_u16(x));
+  N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
+  N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
+  N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
+  SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
+
+  // Use reciprocal estimate and two refinement steps.
+  // float4 recip = vrecpeq_f32(yf);
+  // recip *= vrecpsq_f32(yf, recip);
+  // recip *= vrecpsq_f32(yf, recip);
+  N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecpe, MVT::i32), BN1);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   BN1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
+                   DAG.getConstant(Intrinsic::arm_neon_vrecps, MVT::i32),
+                   BN1, N2);
+  N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
+  // Simply multiplying by the reciprocal estimate can leave us a few ulps
+  // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
+  // and that it will never cause us to return an answer too large).
+  // float4 result = as_float4(as_int4(xf*recip) + 2);
+  N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
+  N1 = DAG.getConstant(2, MVT::i32);
+  N1 = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, N1, N1, N1, N1);
+  N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
+  N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
+  // Convert back to integer and return.
+  // return vmovn_u32(vcvt_s32_f32(result));
+  N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
+  N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
+  return N0;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getNode()->getValueType(0);
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid code");
+  case ISD::ADDC: Opc = ARMISD::ADDC; break;
+  case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
+  case ISD::SUBC: Opc = ARMISD::SUBC; break;
+  case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
+  }
+
+  if (!ExtraOp)
+    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+                       Op.getOperand(1));
+  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+                     Op.getOperand(1), Op.getOperand(2));
+}
+
+static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
+  // Monotonic load/store is legal for all targets
+  if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
+    return Op;
+
+  // Aquire/Release load/store is not legal for targets without a
+  // dmb or equivalent available.
+  return SDValue();
+}
+
+
+static void
+ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
+                    SelectionDAG &DAG, unsigned NewOp) {
+  DebugLoc dl = Node->getDebugLoc();
+  assert (Node->getValueType(0) == MVT::i64 &&
+          "Only know how to expand i64 atomics");
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(Node->getOperand(0)); // Chain
+  Ops.push_back(Node->getOperand(1)); // Ptr
+  // Low part of Val1
+  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            Node->getOperand(2), DAG.getIntPtrConstant(0)));
+  // High part of Val1
+  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            Node->getOperand(2), DAG.getIntPtrConstant(1)));
+  if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) {
+    // High part of Val1
+    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                              Node->getOperand(3), DAG.getIntPtrConstant(0)));
+    // High part of Val2
+    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                              Node->getOperand(3), DAG.getIntPtrConstant(1)));
+  }
+  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+  SDValue Result =
+    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64,
+                            cast<MemSDNode>(Node)->getMemOperand());
+  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+  Results.push_back(Result.getValue(2));
+}
+
+SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
+  case ISD::GlobalAddress:
+    return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
+      LowerGlobalAddressELF(Op, DAG);
+  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::SELECT:        return LowerSELECT(Op, DAG);
+  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
+  case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
+  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
+  case ISD::VASTART:       return LowerVASTART(Op, DAG);
+  case ISD::MEMBARRIER:    return LowerMEMBARRIER(Op, DAG, Subtarget);
+  case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
+  case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
+  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
+  case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+  case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
+                                                               Subtarget);
+  case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG);
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
+  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRL_PARTS:
+  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
+  case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+  case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
+  case ISD::SETCC:         return LowerVSETCC(Op, DAG);
+  case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
+  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
+  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
+  case ISD::MUL:           return LowerMUL(Op, DAG);
+  case ISD::SDIV:          return LowerSDIV(Op, DAG);
+  case ISD::UDIV:          return LowerUDIV(Op, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
+  }
+}
+
+/// ReplaceNodeResults - Replace the results of node with an illegal result
+/// type with new values built out of custom code.
+void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) const {
+  SDValue Res;
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this!");
+  case ISD::BITCAST:
+    Res = ExpandBITCAST(N, DAG);
+    break;
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    Res = ExpandVectorExtension(N, DAG);
+    break;
+  case ISD::SRL:
+  case ISD::SRA:
+    Res = Expand64BitShift(N, DAG, Subtarget);
+    break;
+  case ISD::ATOMIC_LOAD_ADD:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_AND:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_NAND:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_OR:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_SUB:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_XOR:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
+    return;
+  case ISD::ATOMIC_SWAP:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
+    return;
+  case ISD::ATOMIC_CMP_SWAP:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_MIN:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_UMIN:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_MAX:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
+    return;
+  case ISD::ATOMIC_LOAD_UMAX:
+    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
+    return;
+  }
+  if (Res.getNode())
+    Results.push_back(Res);
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
+                                     MachineBasicBlock *BB,
+                                     unsigned Size) const {
+  unsigned dest    = MI->getOperand(0).getReg();
+  unsigned ptr     = MI->getOperand(1).getReg();
+  unsigned oldval  = MI->getOperand(2).getReg();
+  unsigned newval  = MI->getOperand(3).getReg();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  unsigned scratch = MRI.createVirtualRegister(isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass);
+
+  if (isThumb2) {
+    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(oldval, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(newval, &ARM::rGPRRegClass);
+  }
+
+  unsigned ldrOpc, strOpc;
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    break;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It; // insert the new blocks after the current block
+
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loop1MBB
+  BB->addSuccessor(loop1MBB);
+
+  // loop1MBB:
+  //   ldrex dest, [ptr]
+  //   cmp dest, oldval
+  //   bne exitMBB
+  BB = loop1MBB;
+  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+  if (ldrOpc == ARM::t2LDREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                 .addReg(dest).addReg(oldval));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  BB->addSuccessor(loop2MBB);
+  BB->addSuccessor(exitMBB);
+
+  // loop2MBB:
+  //   strex scratch, newval, [ptr]
+  //   cmp scratch, #0
+  //   bne loop1MBB
+  BB = loop2MBB;
+  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
+  if (strOpc == ARM::t2STREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  BB->addSuccessor(loop1MBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                    unsigned Size, unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptr = MI->getOperand(1).getReg();
+  unsigned incr = MI->getOperand(2).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+  }
+
+  unsigned ldrOpc, strOpc;
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    break;
+  }
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  const TargetRegisterClass *TRC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  unsigned scratch = MRI.createVirtualRegister(TRC);
+  unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrex dest, ptr
+  //   <binop> scratch2, dest, incr
+  //   strex scratch, scratch2, ptr
+  //   cmp scratch, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+  if (ldrOpc == ARM::t2LDREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
+  if (BinOpcode) {
+    // operand order needs to go the other way for NAND
+    if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
+                     addReg(incr).addReg(dest)).addReg(0);
+    else
+      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
+                     addReg(dest).addReg(incr)).addReg(0);
+  }
+
+  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
+  if (strOpc == ARM::t2STREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned Size,
+                                          bool signExtend,
+                                          ARMCC::CondCodes Cond) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptr = MI->getOperand(1).getReg();
+  unsigned incr = MI->getOperand(2).getReg();
+  unsigned oldval = dest;
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+  }
+
+  unsigned ldrOpc, strOpc, extendOpc;
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
+    extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    extendOpc = 0;
+    break;
+  }
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  const TargetRegisterClass *TRC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::rGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  unsigned scratch = MRI.createVirtualRegister(TRC);
+  unsigned scratch2 = MRI.createVirtualRegister(TRC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrex dest, ptr
+  //   (sign extend dest, if required)
+  //   cmp dest, incr
+  //   cmov.cond scratch2, incr, dest
+  //   strex scratch, scratch2, ptr
+  //   cmp scratch, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+  if (ldrOpc == ARM::t2LDREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
+
+  // Sign extend the value, if necessary.
+  if (signExtend && extendOpc) {
+    oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
+                     .addReg(dest)
+                     .addImm(0));
+  }
+
+  // Build compare and cmov instructions.
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                 .addReg(oldval).addReg(incr));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
+         .addReg(incr).addReg(oldval).addImm(Cond).addReg(ARM::CPSR);
+
+  MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
+  if (strOpc == ARM::t2STREX)
+    MIB.addImm(0);
+  AddDefaultPred(MIB);
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
+                                      unsigned Op1, unsigned Op2,
+                                      bool NeedsCarry, bool IsCmpxchg,
+                                      bool IsMinMax, ARMCC::CondCodes CC) const {
+  // This also handles ATOMIC_SWAP, indicated by Op1==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned destlo = MI->getOperand(0).getReg();
+  unsigned desthi = MI->getOperand(1).getReg();
+  unsigned ptr = MI->getOperand(2).getReg();
+  unsigned vallo = MI->getOperand(3).getReg();
+  unsigned valhi = MI->getOperand(4).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+  }
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *contBB = 0, *cont2BB = 0;
+  if (IsCmpxchg || IsMinMax)
+    contBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  if (IsCmpxchg)
+    cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+  MF->insert(It, loopMBB);
+  if (IsCmpxchg || IsMinMax) MF->insert(It, contBB);
+  if (IsCmpxchg) MF->insert(It, cont2BB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  const TargetRegisterClass *TRC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  unsigned storesuccess = MRI.createVirtualRegister(TRC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrexd r2, r3, ptr
+  //   <binopa> r0, r2, incr
+  //   <binopb> r1, r3, incr
+  //   strexd storesuccess, r0, r1, ptr
+  //   cmp storesuccess, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+
+  // Load
+  if (isThumb2) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD))
+                   .addReg(destlo, RegState::Define)
+                   .addReg(desthi, RegState::Define)
+                   .addReg(ptr));
+  } else {
+    unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD))
+                   .addReg(GPRPair0, RegState::Define).addReg(ptr));
+    // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
+      .addReg(GPRPair0, 0, ARM::gsub_0);
+    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
+      .addReg(GPRPair0, 0, ARM::gsub_1);
+  }
+
+  unsigned StoreLo, StoreHi;
+  if (IsCmpxchg) {
+    // Add early exit
+    for (unsigned i = 0; i < 2; i++) {
+      AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr :
+                                                         ARM::CMPrr))
+                     .addReg(i == 0 ? destlo : desthi)
+                     .addReg(i == 0 ? vallo : valhi));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+        .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+      BB->addSuccessor(exitMBB);
+      BB->addSuccessor(i == 0 ? contBB : cont2BB);
+      BB = (i == 0 ? contBB : cont2BB);
+    }
+
+    // Copy to physregs for strexd
+    StoreLo = MI->getOperand(5).getReg();
+    StoreHi = MI->getOperand(6).getReg();
+  } else if (Op1) {
+    // Perform binary operation
+    unsigned tmpRegLo = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), tmpRegLo)
+                   .addReg(destlo).addReg(vallo))
+        .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
+    unsigned tmpRegHi = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), tmpRegHi)
+                   .addReg(desthi).addReg(valhi))
+        .addReg(IsMinMax ? ARM::CPSR : 0, getDefRegState(IsMinMax));
+
+    StoreLo = tmpRegLo;
+    StoreHi = tmpRegHi;
+  } else {
+    // Copy to physregs for strexd
+    StoreLo = vallo;
+    StoreHi = valhi;
+  }
+  if (IsMinMax) {
+    // Compare and branch to exit block.
+    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(exitMBB).addImm(CC).addReg(ARM::CPSR);
+    BB->addSuccessor(exitMBB);
+    BB->addSuccessor(contBB);
+    BB = contBB;
+    StoreLo = vallo;
+    StoreHi = valhi;
+  }
+
+  // Store
+  if (isThumb2) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess)
+                   .addReg(StoreLo).addReg(StoreHi).addReg(ptr));
+  } else {
+    // Marshal a pair...
+    unsigned StorePair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    unsigned UndefPair = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    unsigned r1 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    BuildMI(BB, dl, TII->get(TargetOpcode::IMPLICIT_DEF), UndefPair);
+    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), r1)
+      .addReg(UndefPair)
+      .addReg(StoreLo)
+      .addImm(ARM::gsub_0);
+    BuildMI(BB, dl, TII->get(TargetOpcode::INSERT_SUBREG), StorePair)
+      .addReg(r1)
+      .addReg(StoreHi)
+      .addImm(ARM::gsub_1);
+
+    // ...and store it
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess)
+                   .addReg(StorePair).addReg(ptr));
+  }
+  // Cmp+jump
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(storesuccess).addImm(0));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
+/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
+/// registers the function context.
+void ARMTargetLowering::
+SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
+                       MachineBasicBlock *DispatchBB, int FI) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  MachineConstantPool *MCP = MF->getConstantPool();
+  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+  const Function *F = MF->getFunction();
+
+  bool isThumb = Subtarget->isThumb();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  unsigned PCLabelId = AFI->createPICLabelUId();
+  unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
+  ARMConstantPoolValue *CPV =
+    ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
+  unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
+
+  const TargetRegisterClass *TRC = isThumb ?
+    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+
+  // Grab constant pool and fixed stack memory operands.
+  MachineMemOperand *CPMMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(),
+                             MachineMemOperand::MOLoad, 4, 4);
+
+  MachineMemOperand *FIMMOSt =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOStore, 4, 4);
+
+  // Load the address of the dispatch MBB into the jump buffer.
+  if (isThumb2) {
+    // Incoming value: jbuf
+    //   ldr.n  r5, LCPI1_1
+    //   orr    r5, r5, #1
+    //   add    r5, pc
+    //   str    r5, [$jbuf, #+4] ; &jbuf[1]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addMemOperand(CPMMO));
+    // Set the low bit because of thumb mode.
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
+                     .addReg(NewVReg1, RegState::Kill)
+                     .addImm(0x01)));
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
+      .addReg(NewVReg2, RegState::Kill)
+      .addImm(PCLabelId);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
+                   .addReg(NewVReg3, RegState::Kill)
+                   .addFrameIndex(FI)
+                   .addImm(36)  // &jbuf[1] :: pc
+                   .addMemOperand(FIMMOSt));
+  } else if (isThumb) {
+    // Incoming value: jbuf
+    //   ldr.n  r1, LCPI1_4
+    //   add    r1, pc
+    //   mov    r2, #1
+    //   orrs   r1, r2
+    //   add    r2, $jbuf, #+4 ; &jbuf[1]
+    //   str    r1, [r2]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addMemOperand(CPMMO));
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
+      .addReg(NewVReg1, RegState::Kill)
+      .addImm(PCLabelId);
+    // Set the low bit because of thumb mode.
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addImm(1));
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addReg(NewVReg3, RegState::Kill));
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
+                   .addFrameIndex(FI)
+                   .addImm(36)); // &jbuf[1] :: pc
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
+                   .addReg(NewVReg4, RegState::Kill)
+                   .addReg(NewVReg5, RegState::Kill)
+                   .addImm(0)
+                   .addMemOperand(FIMMOSt));
+  } else {
+    // Incoming value: jbuf
+    //   ldr  r1, LCPI1_1
+    //   add  r1, pc, r1
+    //   str  r1, [$jbuf, #+4] ; &jbuf[1]
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12),  NewVReg1)
+                   .addConstantPoolIndex(CPI)
+                   .addImm(0)
+                   .addMemOperand(CPMMO));
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
+                   .addReg(NewVReg1, RegState::Kill)
+                   .addImm(PCLabelId));
+    AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addFrameIndex(FI)
+                   .addImm(36)  // &jbuf[1] :: pc
+                   .addMemOperand(FIMMOSt));
+  }
+}
+
+MachineBasicBlock *ARMTargetLowering::
+EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+  ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+  MachineFrameInfo *MFI = MF->getFrameInfo();
+  int FI = MFI->getFunctionContextIndex();
+
+  const TargetRegisterClass *TRC = Subtarget->isThumb() ?
+    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRnopcRegClass;
+
+  // Get a mapping of the call site numbers to all of the landing pads they're
+  // associated with.
+  DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
+  unsigned MaxCSNum = 0;
+  MachineModuleInfo &MMI = MF->getMMI();
+  for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
+       ++BB) {
+    if (!BB->isLandingPad()) continue;
+
+    // FIXME: We should assert that the EH_LABEL is the first MI in the landing
+    // pad.
+    for (MachineBasicBlock::iterator
+           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
+      if (!II->isEHLabel()) continue;
+
+      MCSymbol *Sym = II->getOperand(0).getMCSymbol();
+      if (!MMI.hasCallSiteLandingPad(Sym)) continue;
+
+      SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
+      for (SmallVectorImpl<unsigned>::iterator
+             CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
+           CSI != CSE; ++CSI) {
+        CallSiteNumToLPad[*CSI].push_back(BB);
+        MaxCSNum = std::max(MaxCSNum, *CSI);
+      }
+      break;
+    }
+  }
+
+  // Get an ordered list of the machine basic blocks for the jump table.
+  std::vector<MachineBasicBlock*> LPadList;
+  SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
+  LPadList.reserve(CallSiteNumToLPad.size());
+  for (unsigned I = 1; I <= MaxCSNum; ++I) {
+    SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
+    for (SmallVectorImpl<MachineBasicBlock*>::iterator
+           II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
+      LPadList.push_back(*II);
+      InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
+    }
+  }
+
+  assert(!LPadList.empty() &&
+         "No landing pad destinations for the dispatch jump table!");
+
+  // Create the jump table and associated information.
+  MachineJumpTableInfo *JTI =
+    MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
+  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+  unsigned UId = AFI->createJumpTableUId();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+
+  // Create the MBBs for the dispatch code.
+
+  // Shove the dispatch's address into the return slot in the function context.
+  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+  DispatchBB->setIsLandingPad();
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+  unsigned trap_opcode;
+  if (Subtarget->isThumb())
+    trap_opcode = ARM::tTRAP;
+  else
+    trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
+
+  BuildMI(TrapBB, dl, TII->get(trap_opcode));
+  DispatchBB->addSuccessor(TrapBB);
+
+  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+  DispatchBB->addSuccessor(DispContBB);
+
+  // Insert and MBBs.
+  MF->insert(MF->end(), DispatchBB);
+  MF->insert(MF->end(), DispContBB);
+  MF->insert(MF->end(), TrapBB);
+
+  // Insert code into the entry block that creates and registers the function
+  // context.
+  SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
+
+  MachineMemOperand *FIMMOLd =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOLoad |
+                             MachineMemOperand::MOVolatile, 4, 4);
+
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+
+  const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+  const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+  // Add a register mask with no preserved registers.  This results in all
+  // registers being marked as clobbered.
+  MIB.addRegMask(RI.getNoPreservedMask());
+
+  unsigned NumLPads = LPadList.size();
+  if (Subtarget->isThumb2()) {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(4)
+                   .addMemOperand(FIMMOLd));
+
+    if (NumLPads < 256) {
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
+                     .addReg(NewVReg1)
+                     .addImm(LPadList.size()));
+    } else {
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
+                     .addImm(NumLPads & 0xFFFF));
+
+      unsigned VReg2 = VReg1;
+      if ((NumLPads & 0xFFFF0000) != 0) {
+        VReg2 = MRI->createVirtualRegister(TRC);
+        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
+                       .addReg(VReg1)
+                       .addImm(NumLPads >> 16));
+      }
+
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg2));
+    }
+
+    BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
+                   .addJumpTableIndex(MJTI)
+                   .addImm(UId));
+
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(
+        BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
+        .addReg(NewVReg3, RegState::Kill)
+        .addReg(NewVReg1)
+        .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+
+    BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
+      .addReg(NewVReg4, RegState::Kill)
+      .addReg(NewVReg1)
+      .addJumpTableIndex(MJTI)
+      .addImm(UId);
+  } else if (Subtarget->isThumb()) {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(1)
+                   .addMemOperand(FIMMOLd));
+
+    if (NumLPads < 256) {
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
+                     .addReg(NewVReg1)
+                     .addImm(NumLPads));
+    } else {
+      MachineConstantPool *ConstantPool = MF->getConstantPool();
+      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
+
+      // MachineConstantPool wants an explicit alignment.
+      unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
+      if (Align == 0)
+        Align = getDataLayout()->getTypeAllocSize(C->getType());
+      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
+                     .addReg(VReg1, RegState::Define)
+                     .addConstantPoolIndex(Idx));
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg1));
+    }
+
+    BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg1)
+                   .addImm(2));
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
+                   .addJumpTableIndex(MJTI)
+                   .addImm(UId));
+
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
+                   .addReg(ARM::CPSR, RegState::Define)
+                   .addReg(NewVReg2, RegState::Kill)
+                   .addReg(NewVReg3));
+
+    MachineMemOperand *JTMMOLd =
+      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
+                               MachineMemOperand::MOLoad, 4, 4);
+
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
+                   .addReg(NewVReg4, RegState::Kill)
+                   .addImm(0)
+                   .addMemOperand(JTMMOLd));
+
+    unsigned NewVReg6 = NewVReg5;
+    if (RelocM == Reloc::PIC_) {
+      NewVReg6 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+                     .addReg(ARM::CPSR, RegState::Define)
+                     .addReg(NewVReg5, RegState::Kill)
+                     .addReg(NewVReg3));
+    }
+
+    BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
+      .addReg(NewVReg6, RegState::Kill)
+      .addJumpTableIndex(MJTI)
+      .addImm(UId);
+  } else {
+    unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
+                   .addFrameIndex(FI)
+                   .addImm(4)
+                   .addMemOperand(FIMMOLd));
+
+    if (NumLPads < 256) {
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
+                     .addReg(NewVReg1)
+                     .addImm(NumLPads));
+    } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
+                     .addImm(NumLPads & 0xFFFF));
+
+      unsigned VReg2 = VReg1;
+      if ((NumLPads & 0xFFFF0000) != 0) {
+        VReg2 = MRI->createVirtualRegister(TRC);
+        AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
+                       .addReg(VReg1)
+                       .addImm(NumLPads >> 16));
+      }
+
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg2));
+    } else {
+      MachineConstantPool *ConstantPool = MF->getConstantPool();
+      Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+      const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
+
+      // MachineConstantPool wants an explicit alignment.
+      unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
+      if (Align == 0)
+        Align = getDataLayout()->getTypeAllocSize(C->getType());
+      unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+      unsigned VReg1 = MRI->createVirtualRegister(TRC);
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
+                     .addReg(VReg1, RegState::Define)
+                     .addConstantPoolIndex(Idx)
+                     .addImm(0));
+      AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+                     .addReg(NewVReg1)
+                     .addReg(VReg1, RegState::Kill));
+    }
+
+    BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
+      .addMBB(TrapBB)
+      .addImm(ARMCC::HI)
+      .addReg(ARM::CPSR);
+
+    unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+    AddDefaultCC(
+      AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
+                     .addReg(NewVReg1)
+                     .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+    unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
+                   .addJumpTableIndex(MJTI)
+                   .addImm(UId));
+
+    MachineMemOperand *JTMMOLd =
+      MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
+                               MachineMemOperand::MOLoad, 4, 4);
+    unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+    AddDefaultPred(
+      BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
+      .addReg(NewVReg3, RegState::Kill)
+      .addReg(NewVReg4)
+      .addImm(0)
+      .addMemOperand(JTMMOLd));
+
+    if (RelocM == Reloc::PIC_) {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
+        .addReg(NewVReg5, RegState::Kill)
+        .addReg(NewVReg4)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    } else {
+      BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
+        .addReg(NewVReg5, RegState::Kill)
+        .addJumpTableIndex(MJTI)
+        .addImm(UId);
+    }
+  }
+
+  // Add the jump table entries as successors to the MBB.
+  SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
+  for (std::vector<MachineBasicBlock*>::iterator
+         I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
+    MachineBasicBlock *CurMBB = *I;
+    if (SeenMBBs.insert(CurMBB))
+      DispContBB->addSuccessor(CurMBB);
+  }
+
+  // N.B. the order the invoke BBs are processed in doesn't matter here.
+  const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF);
+  SmallVector<MachineBasicBlock*, 64> MBBLPads;
+  for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
+         I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) {
+    MachineBasicBlock *BB = *I;
+
+    // Remove the landing pad successor from the invoke block and replace it
+    // with the new dispatch block.
+    SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
+                                                  BB->succ_end());
+    while (!Successors.empty()) {
+      MachineBasicBlock *SMBB = Successors.pop_back_val();
+      if (SMBB->isLandingPad()) {
+        BB->removeSuccessor(SMBB);
+        MBBLPads.push_back(SMBB);
+      }
+    }
+
+    BB->addSuccessor(DispatchBB);
+
+    // Find the invoke call and mark all of the callee-saved registers as
+    // 'implicit defined' so that they're spilled. This prevents code from
+    // moving instructions to before the EH block, where they will never be
+    // executed.
+    for (MachineBasicBlock::reverse_iterator
+           II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
+      if (!II->isCall()) continue;
+
+      DenseMap<unsigned, bool> DefRegs;
+      for (MachineInstr::mop_iterator
+             OI = II->operands_begin(), OE = II->operands_end();
+           OI != OE; ++OI) {
+        if (!OI->isReg()) continue;
+        DefRegs[OI->getReg()] = true;
+      }
+
+      MachineInstrBuilder MIB(*MF, &*II);
+
+      for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
+        unsigned Reg = SavedRegs[i];
+        if (Subtarget->isThumb2() &&
+            !ARM::tGPRRegClass.contains(Reg) &&
+            !ARM::hGPRRegClass.contains(Reg))
+          continue;
+        if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
+          continue;
+        if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
+          continue;
+        if (!DefRegs[Reg])
+          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
+      }
+
+      break;
+    }
+  }
+
+  // Mark all former landing pads as non-landing pads. The dispatch is the only
+  // landing pad now.
+  for (SmallVectorImpl<MachineBasicBlock*>::iterator
+         I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
+    (*I)->setIsLandingPad(false);
+
+  // The instruction is gone now.
+  MI->eraseFromParent();
+
+  return MBB;
+}
+
+static
+MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I != Succ)
+      return *I;
+  llvm_unreachable("Expecting a BB with two successors!");
+}
+
+MachineBasicBlock *ARMTargetLowering::
+EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
+  // This pseudo instruction has 3 operands: dst, src, size
+  // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
+  // Otherwise, we will generate unrolled scalar copies.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned src = MI->getOperand(1).getReg();
+  unsigned SizeVal = MI->getOperand(2).getImm();
+  unsigned Align = MI->getOperand(3).getImm();
+  DebugLoc dl = MI->getDebugLoc();
+
+  bool isThumb2 = Subtarget->isThumb2();
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned ldrOpc, strOpc, UnitSize = 0;
+
+  const TargetRegisterClass *TRC = isThumb2 ?
+    (const TargetRegisterClass*)&ARM::tGPRRegClass :
+    (const TargetRegisterClass*)&ARM::GPRRegClass;
+  const TargetRegisterClass *TRC_Vec = 0;
+
+  if (Align & 1) {
+    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+    UnitSize = 1;
+  } else if (Align & 2) {
+    ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
+    strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
+    UnitSize = 2;
+  } else {
+    // Check whether we can use NEON instructions.
+    if (!MF->getFunction()->getAttributes().
+          hasAttribute(AttributeSet::FunctionIndex,
+                       Attribute::NoImplicitFloat) &&
+        Subtarget->hasNEON()) {
+      if ((Align % 16 == 0) && SizeVal >= 16) {
+        ldrOpc = ARM::VLD1q32wb_fixed;
+        strOpc = ARM::VST1q32wb_fixed;
+        UnitSize = 16;
+        TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
+      }
+      else if ((Align % 8 == 0) && SizeVal >= 8) {
+        ldrOpc = ARM::VLD1d32wb_fixed;
+        strOpc = ARM::VST1d32wb_fixed;
+        UnitSize = 8;
+        TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
+      }
+    }
+    // Can't use NEON instructions.
+    if (UnitSize == 0) {
+      ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
+      strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
+      UnitSize = 4;
+    }
+  }
+
+  unsigned BytesLeft = SizeVal % UnitSize;
+  unsigned LoopSize = SizeVal - BytesLeft;
+
+  if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
+    // Use LDR and STR to copy.
+    // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
+    // [destOut] = STR_POST(scratch, destIn, UnitSize)
+    unsigned srcIn = src;
+    unsigned destIn = dest;
+    for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
+      unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      if (UnitSize >= 8) {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc), scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(destIn).addImm(0).addReg(scratch));
+      } else if (isThumb2) {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc), scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addImm(UnitSize));
+      } else {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc), scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
+          .addImm(UnitSize));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addReg(0).addImm(UnitSize));
+      }
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+
+    // Handle the leftover bytes with LDRB and STRB.
+    // [scratch, srcOut] = LDRB_POST(srcIn, 1)
+    // [destOut] = STRB_POST(scratch, destIn, 1)
+    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+    for (unsigned i = 0; i < BytesLeft; i++) {
+      unsigned scratch = MRI.createVirtualRegister(TRC);
+      unsigned srcOut = MRI.createVirtualRegister(TRC);
+      unsigned destOut = MRI.createVirtualRegister(TRC);
+      if (isThumb2) {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc),scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addReg(0).addImm(1));
+      } else {
+        AddDefaultPred(BuildMI(*BB, MI, dl,
+          TII->get(ldrOpc),scratch)
+          .addReg(srcOut, RegState::Define).addReg(srcIn)
+          .addReg(0).addImm(1));
+
+        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
+          .addReg(scratch).addReg(destIn)
+          .addReg(0).addImm(1));
+      }
+      srcIn = srcOut;
+      destIn = destOut;
+    }
+    MI->eraseFromParent();   // The instruction is gone now.
+    return BB;
+  }
+
+  // Expand the pseudo op to a loop.
+  // thisMBB:
+  //   ...
+  //   movw varEnd, # --> with thumb2
+  //   movt varEnd, #
+  //   ldrcp varEnd, idx --> without thumb2
+  //   fallthrough --> loopMBB
+  // loopMBB:
+  //   PHI varPhi, varEnd, varLoop
+  //   PHI srcPhi, src, srcLoop
+  //   PHI destPhi, dst, destLoop
+  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
+  //   subs varLoop, varPhi, #UnitSize
+  //   bne loopMBB
+  //   fallthrough --> exitMBB
+  // exitMBB:
+  //   epilogue to handle left-over bytes
+  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+  //   [destOut] = STRB_POST(scratch, destLoop, 1)
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Load an immediate to varEnd.
+  unsigned varEnd = MRI.createVirtualRegister(TRC);
+  if (isThumb2) {
+    unsigned VReg1 = varEnd;
+    if ((LoopSize & 0xFFFF0000) != 0)
+      VReg1 = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
+                   .addImm(LoopSize & 0xFFFF));
+
+    if ((LoopSize & 0xFFFF0000) != 0)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
+                     .addReg(VReg1)
+                     .addImm(LoopSize >> 16));
+  } else {
+    MachineConstantPool *ConstantPool = MF->getConstantPool();
+    Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
+    const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
+
+    // MachineConstantPool wants an explicit alignment.
+    unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
+    if (Align == 0)
+      Align = getDataLayout()->getTypeAllocSize(C->getType());
+    unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
+                   .addReg(varEnd, RegState::Define)
+                   .addConstantPoolIndex(Idx)
+                   .addImm(0));
+  }
+  BB->addSuccessor(loopMBB);
+
+  // Generate the loop body:
+  //   varPhi = PHI(varLoop, varEnd)
+  //   srcPhi = PHI(srcLoop, src)
+  //   destPhi = PHI(destLoop, dst)
+  MachineBasicBlock *entryBB = BB;
+  BB = loopMBB;
+  unsigned varLoop = MRI.createVirtualRegister(TRC);
+  unsigned varPhi = MRI.createVirtualRegister(TRC);
+  unsigned srcLoop = MRI.createVirtualRegister(TRC);
+  unsigned srcPhi = MRI.createVirtualRegister(TRC);
+  unsigned destLoop = MRI.createVirtualRegister(TRC);
+  unsigned destPhi = MRI.createVirtualRegister(TRC);
+
+  BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
+    .addReg(varLoop).addMBB(loopMBB)
+    .addReg(varEnd).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
+    .addReg(srcLoop).addMBB(loopMBB)
+    .addReg(src).addMBB(entryBB);
+  BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
+    .addReg(destLoop).addMBB(loopMBB)
+    .addReg(dest).addMBB(entryBB);
+
+  //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
+  //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
+  unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
+  if (UnitSize >= 8) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+      .addReg(destPhi).addImm(0).addReg(scratch));
+  } else if (isThumb2) {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+      .addReg(scratch).addReg(destPhi)
+      .addImm(UnitSize));
+  } else {
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
+      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
+      .addImm(UnitSize));
+
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
+      .addReg(scratch).addReg(destPhi)
+      .addReg(0).addImm(UnitSize));
+  }
+
+  // Decrement loop variable by UnitSize.
+  MachineInstrBuilder MIB = BuildMI(BB, dl,
+    TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+  AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+  MIB->getOperand(5).setReg(ARM::CPSR);
+  MIB->getOperand(5).setIsDef(true);
+
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  // loopMBB can loop back to loopMBB or fall through to exitMBB.
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  // Add epilogue to handle BytesLeft.
+  BB = exitMBB;
+  MachineInstr *StartOfExit = exitMBB->begin();
+  ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
+  strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
+
+  //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
+  //   [destOut] = STRB_POST(scratch, destLoop, 1)
+  unsigned srcIn = srcLoop;
+  unsigned destIn = destLoop;
+  for (unsigned i = 0; i < BytesLeft; i++) {
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    unsigned srcOut = MRI.createVirtualRegister(TRC);
+    unsigned destOut = MRI.createVirtualRegister(TRC);
+    if (isThumb2) {
+      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
+        TII->get(ldrOpc),scratch)
+        .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
+
+      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
+        .addReg(scratch).addReg(destIn)
+        .addImm(1));
+    } else {
+      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
+        TII->get(ldrOpc),scratch)
+        .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
+
+      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
+        .addReg(scratch).addReg(destIn)
+        .addReg(0).addImm(1));
+    }
+    srcIn = srcOut;
+    destIn = destOut;
+  }
+
+  MI->eraseFromParent();   // The instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+  switch (MI->getOpcode()) {
+  default: {
+    MI->dump();
+    llvm_unreachable("Unexpected instr type to insert");
+  }
+  // The Thumb2 pre-indexed stores have the same MI operands, they just
+  // define them differently in the .td files from the isel patterns, so
+  // they need pseudos.
+  case ARM::t2STR_preidx:
+    MI->setDesc(TII->get(ARM::t2STR_PRE));
+    return BB;
+  case ARM::t2STRB_preidx:
+    MI->setDesc(TII->get(ARM::t2STRB_PRE));
+    return BB;
+  case ARM::t2STRH_preidx:
+    MI->setDesc(TII->get(ARM::t2STRH_PRE));
+    return BB;
+
+  case ARM::STRi_preidx:
+  case ARM::STRBi_preidx: {
+    unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
+      ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
+    // Decode the offset.
+    unsigned Offset = MI->getOperand(4).getImm();
+    bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
+    Offset = ARM_AM::getAM2Offset(Offset);
+    if (isSub)
+      Offset = -Offset;
+
+    MachineMemOperand *MMO = *MI->memoperands_begin();
+    BuildMI(*BB, MI, dl, TII->get(NewOpc))
+      .addOperand(MI->getOperand(0))  // Rn_wb
+      .addOperand(MI->getOperand(1))  // Rt
+      .addOperand(MI->getOperand(2))  // Rn
+      .addImm(Offset)                 // offset (skip GPR==zero_reg)
+      .addOperand(MI->getOperand(5))  // pred
+      .addOperand(MI->getOperand(6))
+      .addMemOperand(MMO);
+    MI->eraseFromParent();
+    return BB;
+  }
+  case ARM::STRr_preidx:
+  case ARM::STRBr_preidx:
+  case ARM::STRH_preidx: {
+    unsigned NewOpc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
+    case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
+    case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
+    }
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
+    for (unsigned i = 0; i < MI->getNumOperands(); ++i)
+      MIB.addOperand(MI->getOperand(i));
+    MI->eraseFromParent();
+    return BB;
+  }
+  case ARM::ATOMIC_LOAD_ADD_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
+  case ARM::ATOMIC_LOAD_ADD_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
+  case ARM::ATOMIC_LOAD_ADD_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
+
+  case ARM::ATOMIC_LOAD_AND_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+  case ARM::ATOMIC_LOAD_AND_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+  case ARM::ATOMIC_LOAD_AND_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+
+  case ARM::ATOMIC_LOAD_OR_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+  case ARM::ATOMIC_LOAD_OR_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+  case ARM::ATOMIC_LOAD_OR_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+
+  case ARM::ATOMIC_LOAD_XOR_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+  case ARM::ATOMIC_LOAD_XOR_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+  case ARM::ATOMIC_LOAD_XOR_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+
+  case ARM::ATOMIC_LOAD_NAND_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
+  case ARM::ATOMIC_LOAD_NAND_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
+  case ARM::ATOMIC_LOAD_NAND_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
+
+  case ARM::ATOMIC_LOAD_SUB_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
+  case ARM::ATOMIC_LOAD_SUB_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
+  case ARM::ATOMIC_LOAD_SUB_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
+
+  case ARM::ATOMIC_LOAD_MIN_I8:
+     return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::LT);
+  case ARM::ATOMIC_LOAD_MIN_I16:
+     return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::LT);
+  case ARM::ATOMIC_LOAD_MIN_I32:
+     return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::LT);
+
+  case ARM::ATOMIC_LOAD_MAX_I8:
+     return EmitAtomicBinaryMinMax(MI, BB, 1, true, ARMCC::GT);
+  case ARM::ATOMIC_LOAD_MAX_I16:
+     return EmitAtomicBinaryMinMax(MI, BB, 2, true, ARMCC::GT);
+  case ARM::ATOMIC_LOAD_MAX_I32:
+     return EmitAtomicBinaryMinMax(MI, BB, 4, true, ARMCC::GT);
+
+  case ARM::ATOMIC_LOAD_UMIN_I8:
+     return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::LO);
+  case ARM::ATOMIC_LOAD_UMIN_I16:
+     return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::LO);
+  case ARM::ATOMIC_LOAD_UMIN_I32:
+     return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::LO);
+
+  case ARM::ATOMIC_LOAD_UMAX_I8:
+     return EmitAtomicBinaryMinMax(MI, BB, 1, false, ARMCC::HI);
+  case ARM::ATOMIC_LOAD_UMAX_I16:
+     return EmitAtomicBinaryMinMax(MI, BB, 2, false, ARMCC::HI);
+  case ARM::ATOMIC_LOAD_UMAX_I32:
+     return EmitAtomicBinaryMinMax(MI, BB, 4, false, ARMCC::HI);
+
+  case ARM::ATOMIC_SWAP_I8:  return EmitAtomicBinary(MI, BB, 1, 0);
+  case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0);
+  case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0);
+
+  case ARM::ATOMIC_CMP_SWAP_I8:  return EmitAtomicCmpSwap(MI, BB, 1);
+  case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
+  case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
+
+
+  case ARM::ATOMADD6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
+                              isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
+                              /*NeedsCarry*/ true);
+  case ARM::ATOMSUB6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true);
+  case ARM::ATOMOR6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
+                              isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+  case ARM::ATOMXOR6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
+                              isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+  case ARM::ATOMAND6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
+                              isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+  case ARM::ATOMSWAP6432:
+    return EmitAtomicBinary64(MI, BB, 0, 0, false);
+  case ARM::ATOMCMPXCHG6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ false, /*IsCmpxchg*/true);
+  case ARM::ATOMMIN6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+                              /*IsMinMax*/ true, ARMCC::LT);
+  case ARM::ATOMMAX6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+                              /*IsMinMax*/ true, ARMCC::GE);
+  case ARM::ATOMUMIN6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+                              /*IsMinMax*/ true, ARMCC::LO);
+  case ARM::ATOMUMAX6432:
+    return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+                              isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+                              /*NeedsCarry*/ true, /*IsCmpxchg*/false,
+                              /*IsMinMax*/ true, ARMCC::HS);
+
+  case ARM::tMOVCCr_pseudo: {
+    // To "insert" a SELECT_CC instruction, we actually have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // destination vreg to set, the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    MachineFunction::iterator It = BB;
+    ++It;
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB  = BB;
+    MachineFunction *F = BB->getParent();
+    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, copy0MBB);
+    F->insert(It, sinkMBB);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    sinkMBB->splice(sinkMBB->begin(), BB,
+                    llvm::next(MachineBasicBlock::iterator(MI)),
+                    BB->end());
+    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
+      .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(*BB, BB->begin(), dl,
+            TII->get(ARM::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+    MI->eraseFromParent();   // The pseudo instruction is gone now.
+    return BB;
+  }
+
+  case ARM::BCCi64:
+  case ARM::BCCZi64: {
+    // If there is an unconditional branch to the other successor, remove it.
+    BB->erase(llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+
+    // Compare both parts that make up the double comparison separately for
+    // equality.
+    bool RHSisZero = MI->getOpcode() == ARM::BCCZi64;
+
+    unsigned LHS1 = MI->getOperand(1).getReg();
+    unsigned LHS2 = MI->getOperand(2).getReg();
+    if (RHSisZero) {
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                     .addReg(LHS1).addImm(0));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+        .addReg(LHS2).addImm(0)
+        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    } else {
+      unsigned RHS1 = MI->getOperand(3).getReg();
+      unsigned RHS2 = MI->getOperand(4).getReg();
+      AddDefaultPred(BuildMI(BB, dl,
+                             TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                     .addReg(LHS1).addReg(RHS1));
+      BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+        .addReg(LHS2).addReg(RHS2)
+        .addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    }
+
+    MachineBasicBlock *destMBB = MI->getOperand(RHSisZero ? 3 : 5).getMBB();
+    MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
+    if (MI->getOperand(0).getImm() == ARMCC::NE)
+      std::swap(destMBB, exitMBB);
+
+    BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+    if (isThumb2)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
+    else
+      BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
+
+    MI->eraseFromParent();   // The pseudo instruction is gone now.
+    return BB;
+  }
+
+  case ARM::Int_eh_sjlj_setjmp:
+  case ARM::Int_eh_sjlj_setjmp_nofp:
+  case ARM::tInt_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp:
+  case ARM::t2Int_eh_sjlj_setjmp_nofp:
+    EmitSjLjDispatchBlock(MI, BB);
+    return BB;
+
+  case ARM::ABS:
+  case ARM::t2ABS: {
+    // To insert an ABS instruction, we have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // source vreg to test against 0, the destination vreg to set,
+    // the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    // It transforms
+    //     V1 = ABS V0
+    // into
+    //     V2 = MOVS V0
+    //     BCC                      (branch to SinkBB if V0 >= 0)
+    //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
+    //     SinkBB: V1 = PHI(V2, V3)
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    MachineFunction::iterator BBI = BB;
+    ++BBI;
+    MachineFunction *Fn = BB->getParent();
+    MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
+    Fn->insert(BBI, RSBBB);
+    Fn->insert(BBI, SinkBB);
+
+    unsigned int ABSSrcReg = MI->getOperand(1).getReg();
+    unsigned int ABSDstReg = MI->getOperand(0).getReg();
+    bool isThumb2 = Subtarget->isThumb2();
+    MachineRegisterInfo &MRI = Fn->getRegInfo();
+    // In Thumb mode S must not be specified if source register is the SP or
+    // PC and if destination register is the SP, so restrict register class
+    unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ?
+      (const TargetRegisterClass*)&ARM::rGPRRegClass :
+      (const TargetRegisterClass*)&ARM::GPRRegClass);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    SinkBB->splice(SinkBB->begin(), BB,
+      llvm::next(MachineBasicBlock::iterator(MI)),
+      BB->end());
+    SinkBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BB->addSuccessor(RSBBB);
+    BB->addSuccessor(SinkBB);
+
+    // fall through to SinkMBB
+    RSBBB->addSuccessor(SinkBB);
+
+    // insert a cmp at the end of BB
+    AddDefaultPred(BuildMI(BB, dl,
+                           TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                   .addReg(ABSSrcReg).addImm(0));
+
+    // insert a bcc with opposite CC to ARMCC::MI at the end of BB
+    BuildMI(BB, dl,
+      TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
+      .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
+
+    // insert rsbri in RSBBB
+    // Note: BCC and rsbri will be converted into predicated rsbmi
+    // by if-conversion pass
+    BuildMI(*RSBBB, RSBBB->begin(), dl,
+      TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
+      .addReg(ABSSrcReg, RegState::Kill)
+      .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+
+    // insert PHI in SinkBB,
+    // reuse ABSDstReg to not change uses of ABS instruction
+    BuildMI(*SinkBB, SinkBB->begin(), dl,
+      TII->get(ARM::PHI), ABSDstReg)
+      .addReg(NewRsbDstReg).addMBB(RSBBB)
+      .addReg(ABSSrcReg).addMBB(BB);
+
+    // remove ABS instruction
+    MI->eraseFromParent();
+
+    // return last added BB
+    return SinkBB;
+  }
+  case ARM::COPY_STRUCT_BYVAL_I32:
+    ++NumLoopByVals;
+    return EmitStructByval(MI, BB);
+  }
+}
+
+void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                                      SDNode *Node) const {
+  if (!MI->hasPostISelHook()) {
+    assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+           "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
+    return;
+  }
+
+  const MCInstrDesc *MCID = &MI->getDesc();
+  // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
+  // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
+  // operand is still set to noreg. If needed, set the optional operand's
+  // register to CPSR, and remove the redundant implicit def.
+  //
+  // e.g. ADCS (..., CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
+
+  // Rename pseudo opcodes.
+  unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
+  if (NewOpc) {
+    const ARMBaseInstrInfo *TII =
+      static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
+    MCID = &TII->get(NewOpc);
+
+    assert(MCID->getNumOperands() == MI->getDesc().getNumOperands() + 1 &&
+           "converted opcode should be the same except for cc_out");
+
+    MI->setDesc(*MCID);
+
+    // Add the optional cc_out operand
+    MI->addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
+  }
+  unsigned ccOutIdx = MCID->getNumOperands() - 1;
+
+  // Any ARM instruction that sets the 's' bit should specify an optional
+  // "cc_out" operand in the last operand position.
+  if (!MI->hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
+    assert(!NewOpc && "Optional cc_out operand required");
+    return;
+  }
+  // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
+  // since we already have an optional CPSR def.
+  bool definesCPSR = false;
+  bool deadCPSR = false;
+  for (unsigned i = MCID->getNumOperands(), e = MI->getNumOperands();
+       i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
+      definesCPSR = true;
+      if (MO.isDead())
+        deadCPSR = true;
+      MI->RemoveOperand(i);
+      break;
+    }
+  }
+  if (!definesCPSR) {
+    assert(!NewOpc && "Optional cc_out operand required");
+    return;
+  }
+  assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
+  if (deadCPSR) {
+    assert(!MI->getOperand(ccOutIdx).getReg() &&
+           "expect uninitialized optional cc_out operand");
+    return;
+  }
+
+  // If this instruction was defined with an optional CPSR def and its dag node
+  // had a live implicit CPSR def, then activate the optional CPSR def.
+  MachineOperand &MO = MI->getOperand(ccOutIdx);
+  MO.setReg(ARM::CPSR);
+  MO.setIsDef(true);
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+// Helper function that checks if N is a null or all ones constant.
+static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
+  if (!C)
+    return false;
+  return AllOnes ? C->isAllOnesValue() : C->isNullValue();
+}
+
+// Return true if N is conditionally 0 or all ones.
+// Detects these expressions where cc is an i1 value:
+//
+//   (select cc 0, y)   [AllOnes=0]
+//   (select cc y, 0)   [AllOnes=0]
+//   (zext cc)          [AllOnes=0]
+//   (sext cc)          [AllOnes=0/1]
+//   (select cc -1, y)  [AllOnes=1]
+//   (select cc y, -1)  [AllOnes=1]
+//
+// Invert is set when N is the null/all ones constant when CC is false.
+// OtherOp is set to the alternative value of N.
+static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
+                                       SDValue &CC, bool &Invert,
+                                       SDValue &OtherOp,
+                                       SelectionDAG &DAG) {
+  switch (N->getOpcode()) {
+  default: return false;
+  case ISD::SELECT: {
+    CC = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    if (isZeroOrAllOnes(N1, AllOnes)) {
+      Invert = false;
+      OtherOp = N2;
+      return true;
+    }
+    if (isZeroOrAllOnes(N2, AllOnes)) {
+      Invert = true;
+      OtherOp = N1;
+      return true;
+    }
+    return false;
+  }
+  case ISD::ZERO_EXTEND:
+    // (zext cc) can never be the all ones value.
+    if (AllOnes)
+      return false;
+    // Fall through.
+  case ISD::SIGN_EXTEND: {
+    EVT VT = N->getValueType(0);
+    CC = N->getOperand(0);
+    if (CC.getValueType() != MVT::i1)
+      return false;
+    Invert = !AllOnes;
+    if (AllOnes)
+      // When looking for an AllOnes constant, N is an sext, and the 'other'
+      // value is 0.
+      OtherOp = DAG.getConstant(0, VT);
+    else if (N->getOpcode() == ISD::ZERO_EXTEND)
+      // When looking for a 0 constant, N can be zext or sext.
+      OtherOp = DAG.getConstant(1, VT);
+    else
+      OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+    return true;
+  }
+  }
+}
+
+// Combine a constant select operand into its use:
+//
+//   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
+//   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
+//   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
+//   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
+//   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
+//
+// The transform is rejected if the select doesn't have a constant operand that
+// is null, or all ones when AllOnes is set.
+//
+// Also recognize sext/zext from i1:
+//
+//   (add (zext cc), x) -> (select cc (add x, 1), x)
+//   (add (sext cc), x) -> (select cc (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
+//
+// @param N       The node to transform.
+// @param Slct    The N operand that is a select.
+// @param OtherOp The other N operand (x above).
+// @param DCI     Context.
+// @param AllOnes Require the select constant to be all ones instead of null.
+// @returns The new node, or SDValue() on failure.
+static
+SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            bool AllOnes = false) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  SDValue NonConstantVal;
+  SDValue CCOp;
+  bool SwapSelectOps;
+  if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
+                                  NonConstantVal, DAG))
+    return SDValue();
+
+  // Slct is now know to be the desired identity constant when CC is true.
+  SDValue TrueVal = OtherOp;
+  SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
+                                 OtherOp, NonConstantVal);
+  // Unless SwapSelectOps says CC should be false.
+  if (SwapSelectOps)
+    std::swap(TrueVal, FalseVal);
+
+  return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
+                     CCOp, TrueVal, FalseVal);
+}
+
+// Attempt combineSelectAndUse on each operand of a commutative operator N.
+static
+SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
+    if (Result.getNode())
+      return Result;
+  }
+  if (N1.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
+    if (Result.getNode())
+      return Result;
+  }
+  return SDValue();
+}
+
+// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
+// (only after legalization).
+static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  // Only perform optimization if after legalize, and if NEON is available. We
+  // also expected both operands to be BUILD_VECTORs.
+  if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
+      || N0.getOpcode() != ISD::BUILD_VECTOR
+      || N1.getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  // Check output type since VPADDL operand elements can only be 8, 16, or 32.
+  EVT VT = N->getValueType(0);
+  if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Check that the vector operands are of the right form.
+  // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
+  // operands, where N is the size of the formed vector.
+  // Each EXTRACT_VECTOR should have the same input vector and odd or even
+  // index such that we have a pair wise add pattern.
+
+  // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
+  if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+  SDValue Vec = N0->getOperand(0)->getOperand(0);
+  SDNode *V = Vec.getNode();
+  unsigned nextIndex = 0;
+
+  // For each operands to the ADD which are BUILD_VECTORs,
+  // check to see if each of their operands are an EXTRACT_VECTOR with
+  // the same vector and appropriate index.
+  for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
+    if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT
+        && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+
+      SDValue ExtVec0 = N0->getOperand(i);
+      SDValue ExtVec1 = N1->getOperand(i);
+
+      // First operand is the vector, verify its the same.
+      if (V != ExtVec0->getOperand(0).getNode() ||
+          V != ExtVec1->getOperand(0).getNode())
+        return SDValue();
+
+      // Second is the constant, verify its correct.
+      ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
+      ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
+
+      // For the constant, we want to see all the even or all the odd.
+      if (!C0 || !C1 || C0->getZExtValue() != nextIndex
+          || C1->getZExtValue() != nextIndex+1)
+        return SDValue();
+
+      // Increment index.
+      nextIndex+=2;
+    } else
+      return SDValue();
+  }
+
+  // Create VPADDL node.
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Build operand list.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls,
+                                TLI.getPointerTy()));
+
+  // Input is the vector.
+  Ops.push_back(Vec);
+
+  // Get widened type and narrowed type.
+  MVT widenType;
+  unsigned numElem = VT.getVectorNumElements();
+  switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+    case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
+    case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
+    case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
+    default:
+      llvm_unreachable("Invalid vector element type for padd optimization.");
+  }
+
+  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+                            widenType, &Ops[0], Ops.size());
+  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
+}
+
+static SDValue findMUL_LOHI(SDValue V) {
+  if (V->getOpcode() == ISD::UMUL_LOHI ||
+      V->getOpcode() == ISD::SMUL_LOHI)
+    return V;
+  return SDValue();
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
+
+  if (Subtarget->isThumb1Only()) return SDValue();
+
+  // Only perform the checks after legalize when the pattern is available.
+  if (DCI.isBeforeLegalize()) return SDValue();
+
+  // Look for multiply add opportunities.
+  // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
+  // each add nodes consumes a value from ISD::UMUL_LOHI and there is
+  // a glue link from the first add to the second add.
+  // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
+  // a S/UMLAL instruction.
+  //          loAdd   UMUL_LOHI
+  //            \    / :lo    \ :hi
+  //             \  /          \          [no multiline comment]
+  //              ADDC         |  hiAdd
+  //                 \ :glue  /  /
+  //                  \      /  /
+  //                    ADDE
+  //
+  assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+  SDValue AddcOp0 = AddcNode->getOperand(0);
+  SDValue AddcOp1 = AddcNode->getOperand(1);
+
+  // Check if the two operands are from the same mul_lohi node.
+  if (AddcOp0.getNode() == AddcOp1.getNode())
+    return SDValue();
+
+  assert(AddcNode->getNumValues() == 2 &&
+         AddcNode->getValueType(0) == MVT::i32 &&
+         AddcNode->getValueType(1) == MVT::Glue &&
+         "Expect ADDC with two result values: i32, glue");
+
+  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+  if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
+      AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
+      AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
+      AddcOp1->getOpcode() != ISD::SMUL_LOHI)
+    return SDValue();
+
+  // Look for the glued ADDE.
+  SDNode* AddeNode = AddcNode->getGluedUser();
+  if (AddeNode == NULL)
+    return SDValue();
+
+  // Make sure it is really an ADDE.
+  if (AddeNode->getOpcode() != ISD::ADDE)
+    return SDValue();
+
+  assert(AddeNode->getNumOperands() == 3 &&
+         AddeNode->getOperand(2).getValueType() == MVT::Glue &&
+         "ADDE node has the wrong inputs");
+
+  // Check for the triangle shape.
+  SDValue AddeOp0 = AddeNode->getOperand(0);
+  SDValue AddeOp1 = AddeNode->getOperand(1);
+
+  // Make sure that the ADDE operands are not coming from the same node.
+  if (AddeOp0.getNode() == AddeOp1.getNode())
+    return SDValue();
+
+  // Find the MUL_LOHI node walking up ADDE's operands.
+  bool IsLeftOperandMUL = false;
+  SDValue MULOp = findMUL_LOHI(AddeOp0);
+  if (MULOp == SDValue())
+   MULOp = findMUL_LOHI(AddeOp1);
+  else
+    IsLeftOperandMUL = true;
+  if (MULOp == SDValue())
+     return SDValue();
+
+  // Figure out the right opcode.
+  unsigned Opc = MULOp->getOpcode();
+  unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
+
+  // Figure out the high and low input values to the MLAL node.
+  SDValue* HiMul = &MULOp;
+  SDValue* HiAdd = NULL;
+  SDValue* LoMul = NULL;
+  SDValue* LowAdd = NULL;
+
+  if (IsLeftOperandMUL)
+    HiAdd = &AddeOp1;
+  else
+    HiAdd = &AddeOp0;
+
+
+  if (AddcOp0->getOpcode() == Opc) {
+    LoMul = &AddcOp0;
+    LowAdd = &AddcOp1;
+  }
+  if (AddcOp1->getOpcode() == Opc) {
+    LoMul = &AddcOp1;
+    LowAdd = &AddcOp0;
+  }
+
+  if (LoMul == NULL)
+    return SDValue();
+
+  if (LoMul->getNode() != HiMul->getNode())
+    return SDValue();
+
+  // Create the merged node.
+  SelectionDAG &DAG = DCI.DAG;
+
+  // Build operand list.
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(LoMul->getOperand(0));
+  Ops.push_back(LoMul->getOperand(1));
+  Ops.push_back(*LowAdd);
+  Ops.push_back(*HiAdd);
+
+  SDValue MLALNode =  DAG.getNode(FinalOpc, AddcNode->getDebugLoc(),
+                                 DAG.getVTList(MVT::i32, MVT::i32),
+                                 &Ops[0], Ops.size());
+
+  // Replace the ADDs' nodes uses by the MLA node's values.
+  SDValue HiMLALResult(MLALNode.getNode(), 1);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+  SDValue LoMLALResult(MLALNode.getNode(), 0);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+
+  // Return original node to notify the driver to stop replacing.
+  SDValue resNode(AddcNode, 0);
+  return resNode;
+}
+
+/// PerformADDCCombine - Target-specific dag combine transform from
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
+static SDValue PerformADDCCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  return AddCombineTo64bitMLAL(N, DCI, Subtarget);
+
+}
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1.  This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          const ARMSubtarget *Subtarget){
+
+  // Attempt to create vpaddl for this add.
+  SDValue Result = AddCombineToVPADDL(N, N0, N1, DCI, Subtarget);
+  if (Result.getNode())
+    return Result;
+
+  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
+  if (N0.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
+    if (Result.getNode()) return Result;
+  }
+  return SDValue();
+}
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // First try with the default operand order.
+  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget);
+  if (Result.getNode())
+    return Result;
+
+  // If that didn't work, try again with the operands commuted.
+  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
+}
+
+/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
+///
+static SDValue PerformSUBCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+  if (N1.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
+    if (Result.getNode()) return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformVMULCombine
+/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
+/// special multiplier accumulator forwarding.
+///   vmul d3, d0, d2
+///   vmla d3, d1, d2
+/// is faster than
+///   vadd d3, d0, d1
+///   vmul d3, d3, d2
+static SDValue PerformVMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasVMLxForwarding())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  unsigned Opcode = N0.getOpcode();
+  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
+    Opcode = N1.getOpcode();
+    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+        Opcode != ISD::FADD && Opcode != ISD::FSUB)
+      return SDValue();
+    std::swap(N0, N1);
+  }
+
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  return DAG.getNode(Opcode, DL, VT,
+                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
+                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
+}
+
+static SDValue PerformMULCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+
+  if (Subtarget->isThumb1Only())
+    return SDValue();
+
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT.is64BitVector() || VT.is128BitVector())
+    return PerformVMULCombine(N, DCI, Subtarget);
+  if (VT != MVT::i32)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!C)
+    return SDValue();
+
+  int64_t MulAmt = C->getSExtValue();
+  unsigned ShiftAmt = CountTrailingZeros_64(MulAmt);
+
+  ShiftAmt = ShiftAmt & (32 - 1);
+  SDValue V = N->getOperand(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  SDValue Res;
+  MulAmt >>= ShiftAmt;
+
+  if (MulAmt >= 0) {
+    if (isPowerOf2_32(MulAmt - 1)) {
+      // (mul x, 2^N + 1) => (add (shl x, N), x)
+      Res = DAG.getNode(ISD::ADD, DL, VT,
+                        V,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmt - 1),
+                                                    MVT::i32)));
+    } else if (isPowerOf2_32(MulAmt + 1)) {
+      // (mul x, 2^N - 1) => (sub (shl x, N), x)
+      Res = DAG.getNode(ISD::SUB, DL, VT,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmt + 1),
+                                                    MVT::i32)),
+                        V);
+    } else
+      return SDValue();
+  } else {
+    uint64_t MulAmtAbs = -MulAmt;
+    if (isPowerOf2_32(MulAmtAbs + 1)) {
+      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
+      Res = DAG.getNode(ISD::SUB, DL, VT,
+                        V,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmtAbs + 1),
+                                                    MVT::i32)));
+    } else if (isPowerOf2_32(MulAmtAbs - 1)) {
+      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
+      Res = DAG.getNode(ISD::ADD, DL, VT,
+                        V,
+                        DAG.getNode(ISD::SHL, DL, VT,
+                                    V,
+                                    DAG.getConstant(Log2_32(MulAmtAbs-1),
+                                                    MVT::i32)));
+      Res = DAG.getNode(ISD::SUB, DL, VT,
+                        DAG.getConstant(0, MVT::i32),Res);
+
+    } else
+      return SDValue();
+  }
+
+  if (ShiftAmt != 0)
+    Res = DAG.getNode(ISD::SHL, DL, VT,
+                      Res, DAG.getConstant(ShiftAmt, MVT::i32));
+
+  // Do not add new nodes to DAG combiner worklist.
+  DCI.CombineTo(N, Res, false);
+  return SDValue();
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+
+  // Attempt to use immediate-form VBIC
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VbicVT;
+      SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VbicVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
+        SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
+      }
+    }
+  }
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
+    SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
+    if (Result.getNode())
+      return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
+static SDValue PerformORCombine(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const ARMSubtarget *Subtarget) {
+  // Attempt to use immediate-form VORR
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && Subtarget->hasNEON() &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VorrVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, VorrVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
+        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
+      }
+    }
+  }
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
+    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
+    if (Result.getNode())
+      return Result;
+  }
+
+  // The code below optimizes (or (and X, Y), Z).
+  // The AND operand needs to have a single user to make these optimizations
+  // profitable.
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+    return SDValue();
+  SDValue N1 = N->getOperand(1);
+
+  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    APInt SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+
+    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+    APInt SplatBits0;
+    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+                                  HasAnyUndefs) && !HasAnyUndefs) {
+      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+      APInt SplatBits1;
+      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                    HasAnyUndefs) && !HasAnyUndefs &&
+          SplatBits0 == ~SplatBits1) {
+        // Canonicalize the vector type to make instruction selection simpler.
+        EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+        SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                     N0->getOperand(1), N0->getOperand(0),
+                                     N1->getOperand(0));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+      }
+    }
+  }
+
+  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
+  // reasonable.
+
+  // BFI is only available on V6T2+
+  if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
+    return SDValue();
+
+  DebugLoc DL = N->getDebugLoc();
+  // 1) or (and A, mask), val => ARMbfi A, val, mask
+  //      iff (val & mask) == val
+  //
+  // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+  //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
+  //          && mask == ~mask2
+  //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
+  //          && ~mask == mask2
+  //  (i.e., copy a bitfield value into another bitfield of the same width)
+
+  if (VT != MVT::i32)
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+
+  // The value and the mask need to be constants so we can verify this is
+  // actually a bitfield set. If the mask is 0xffff, we can do better
+  // via a movt instruction, so don't use BFI in that case.
+  SDValue MaskOp = N0.getOperand(1);
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
+  if (!MaskC)
+    return SDValue();
+  unsigned Mask = MaskC->getZExtValue();
+  if (Mask == 0xffff)
+    return SDValue();
+  SDValue Res;
+  // Case (1): or (and A, mask), val => ARMbfi A, val, mask
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  if (N1C) {
+    unsigned Val = N1C->getZExtValue();
+    if ((Val & ~Mask) != Val)
+      return SDValue();
+
+    if (ARM::isBitFieldInvertedMask(Mask)) {
+      Val >>= CountTrailingZeros_32(~Mask);
+
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
+                        DAG.getConstant(Val, MVT::i32),
+                        DAG.getConstant(Mask, MVT::i32));
+
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    }
+  } else if (N1.getOpcode() == ISD::AND) {
+    // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C)
+      return SDValue();
+    unsigned Mask2 = N11C->getZExtValue();
+
+    // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
+    // as is to match.
+    if (ARM::isBitFieldInvertedMask(Mask) &&
+        (Mask == ~Mask2)) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask == 0xffff || Mask == 0xffff0000))
+        return SDValue();
+      // 2a
+      unsigned amt = CountTrailingZeros_32(Mask2);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
+                        DAG.getConstant(amt, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
+                        DAG.getConstant(Mask, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    } else if (ARM::isBitFieldInvertedMask(~Mask) &&
+               (~Mask == Mask2)) {
+      // The pack halfword instruction works better for masks that fit it,
+      // so use that when it's available.
+      if (Subtarget->hasT2ExtractPack() &&
+          (Mask2 == 0xffff || Mask2 == 0xffff0000))
+        return SDValue();
+      // 2b
+      unsigned lsb = CountTrailingZeros_32(Mask);
+      Res = DAG.getNode(ISD::SRL, DL, VT, N00,
+                        DAG.getConstant(lsb, MVT::i32));
+      Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
+                        DAG.getConstant(Mask2, MVT::i32));
+      // Do not add new nodes to DAG combiner worklist.
+      DCI.CombineTo(N, Res, false);
+      return SDValue();
+    }
+  }
+
+  if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
+      N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
+      ARM::isBitFieldInvertedMask(~Mask)) {
+    // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
+    // where lsb(mask) == #shamt and masked bits of B are known zero.
+    SDValue ShAmt = N00.getOperand(1);
+    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+    unsigned LSB = CountTrailingZeros_32(Mask);
+    if (ShAmtC != LSB)
+      return SDValue();
+
+    Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
+                      DAG.getConstant(~Mask, MVT::i32));
+
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, Res, false);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformXORCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
+    SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
+    if (Result.getNode())
+      return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
+/// the bits being cleared by the AND are not demanded by the BFI.
+static SDValue PerformBFICombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() == ISD::AND) {
+    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+    if (!N11C)
+      return SDValue();
+    unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned LSB = CountTrailingZeros_32(~InvMask);
+    unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB;
+    unsigned Mask = (1 << Width)-1;
+    unsigned Mask2 = N11C->getZExtValue();
+    if ((Mask & (~Mask2)) == 0)
+      return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0),
+                             N->getOperand(0), N1.getOperand(0),
+                             N->getOperand(2));
+  }
+  return SDValue();
+}
+
+/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
+/// ARMISD::VMOVRRD.
+static SDValue PerformVMOVRRDCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  // vmovrrd(vmovdrr x, y) -> x,y
+  SDValue InDouble = N->getOperand(0);
+  if (InDouble.getOpcode() == ARMISD::VMOVDRR)
+    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+
+  // vmovrrd(load f64) -> (load i32), (load i32)
+  SDNode *InNode = InDouble.getNode();
+  if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
+      InNode->getValueType(0) == MVT::f64 &&
+      InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
+      !cast<LoadSDNode>(InNode)->isVolatile()) {
+    // TODO: Should this be done for non-FrameIndex operands?
+    LoadSDNode *LD = cast<LoadSDNode>(InNode);
+
+    SelectionDAG &DAG = DCI.DAG;
+    DebugLoc DL = LD->getDebugLoc();
+    SDValue BasePtr = LD->getBasePtr();
+    SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
+                                 LD->getPointerInfo(), LD->isVolatile(),
+                                 LD->isNonTemporal(), LD->isInvariant(),
+                                 LD->getAlignment());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, MVT::i32));
+    SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
+                                 LD->getPointerInfo(), LD->isVolatile(),
+                                 LD->isNonTemporal(), LD->isInvariant(),
+                                 std::min(4U, LD->getAlignment() / 2));
+
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
+    SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
+    DCI.RemoveFromWorklist(LD);
+    DAG.DeleteNode(LD);
+    return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
+/// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
+static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
+  // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() == ISD::BITCAST)
+    Op0 = Op0.getOperand(0);
+  if (Op1.getOpcode() == ISD::BITCAST)
+    Op1 = Op1.getOperand(0);
+  if (Op0.getOpcode() == ARMISD::VMOVRRD &&
+      Op0.getNode() == Op1.getNode() &&
+      Op0.getResNo() == 0 && Op1.getResNo() == 1)
+    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+                       N->getValueType(0), Op0.getOperand(0));
+  return SDValue();
+}
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+static SDValue PerformSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  if (St->isVolatile())
+    return SDValue();
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
+  // pack all of the elements in one place.  Next, store to memory in fewer
+  // chunks.
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+  if (St->isTruncatingStore() && VT.isVector()) {
+    SelectionDAG &DAG = DCI.DAG;
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    EVT StVT = St->getMemoryVT();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromEltSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToEltSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
+
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
+
+    unsigned SizeRatio  = FromEltSz / ToEltSz;
+    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle.
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+                                     NumElems*SizeRatio);
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    DebugLoc DL = St->getDebugLoc();
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
+                                DAG.getUNDEF(WideVec.getValueType()),
+                                ShuffleVec.data());
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+        StoreType = Tp;
+    }
+    // Didn't find a legal store type.
+    if (!TLI.isTypeLegal(StoreType))
+      return SDValue();
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue BasePtr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
+    for (unsigned I = 0; I < E; I++) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(I));
+      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+                            Increment);
+      Chains.push_back(Ch);
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
+                       Chains.size());
+  }
+
+  if (!ISD::isNormalStore(St))
+    return SDValue();
+
+  // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
+  // ARM stores of arguments in the same cache line.
+  if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
+      StVal.getNode()->hasOneUse()) {
+    SelectionDAG  &DAG = DCI.DAG;
+    DebugLoc DL = St->getDebugLoc();
+    SDValue BasePtr = St->getBasePtr();
+    SDValue NewST1 = DAG.getStore(St->getChain(), DL,
+                                  StVal.getNode()->getOperand(0), BasePtr,
+                                  St->getPointerInfo(), St->isVolatile(),
+                                  St->isNonTemporal(), St->getAlignment());
+
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                    DAG.getConstant(4, MVT::i32));
+    return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
+                        OffsetPtr, St->getPointerInfo(), St->isVolatile(),
+                        St->isNonTemporal(),
+                        std::min(4U, St->getAlignment() / 2));
+  }
+
+  if (StVal.getValueType() != MVT::i64 ||
+      StVal.getNode()->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  // Bitcast an i64 store extracted from a vector to f64.
+  // Otherwise, the i64 value will be legalized to a pair of i32 values.
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc dl = StVal.getDebugLoc();
+  SDValue IntVec = StVal.getOperand(0);
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                 IntVec.getValueType().getVectorNumElements());
+  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
+  SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                               Vec, StVal.getOperand(1));
+  dl = N->getDebugLoc();
+  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  DCI.AddToWorklist(ExtElt.getNode());
+  DCI.AddToWorklist(V.getNode());
+  return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
+                      St->getPointerInfo(), St->isVolatile(),
+                      St->isNonTemporal(), St->getAlignment(),
+                      St->getTBAAInfo());
+}
+
+/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
+/// are normal, non-volatile loads.  If so, it is profitable to bitcast an
+/// i64 vector to have f64 elements, since the value can then be loaded
+/// directly into a VFP register.
+static bool hasNormalLoadOperand(SDNode *N) {
+  unsigned NumElts = N->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDNode *Elt = N->getOperand(i).getNode();
+    if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
+      return true;
+  }
+  return false;
+}
+
+/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
+/// ISD::BUILD_VECTOR.
+static SDValue PerformBUILD_VECTORCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI){
+  // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
+  // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
+  // into a pair of GPRs, which is fine when the value is used as a scalar,
+  // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
+  SelectionDAG &DAG = DCI.DAG;
+  if (N->getNumOperands() == 2) {
+    SDValue RV = PerformVMOVDRRCombine(N, DAG);
+    if (RV.getNode())
+      return RV;
+  }
+
+  // Load i64 elements as f64 values so that type legalization does not split
+  // them up into i32 values.
+  EVT VT = N->getValueType(0);
+  if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
+    return SDValue();
+  DebugLoc dl = N->getDebugLoc();
+  SmallVector<SDValue, 8> Ops;
+  unsigned NumElts = VT.getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
+    Ops.push_back(V);
+    // Make the DAGCombiner fold the bitcast.
+    DCI.AddToWorklist(V.getNode());
+  }
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
+  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts);
+  return DAG.getNode(ISD::BITCAST, dl, VT, BV);
+}
+
+/// PerformInsertEltCombine - Target-specific dag combine xforms for
+/// ISD::INSERT_VECTOR_ELT.
+static SDValue PerformInsertEltCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  // Bitcast an i64 load inserted into a vector to f64.
+  // Otherwise, the i64 value will be legalized to a pair of i32 values.
+  EVT VT = N->getValueType(0);
+  SDNode *Elt = N->getOperand(1).getNode();
+  if (VT.getVectorElementType() != MVT::i64 ||
+      !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc dl = N->getDebugLoc();
+  EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
+                                 VT.getVectorNumElements());
+  SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
+  SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  DCI.AddToWorklist(V.getNode());
+  SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
+                               Vec, V, N->getOperand(2));
+  return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
+}
+
+/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
+/// ISD::VECTOR_SHUFFLE.
+static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
+  // The LLVM shufflevector instruction does not require the shuffle mask
+  // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
+  // have that requirement.  When translating to ISD::VECTOR_SHUFFLE, if the
+  // operands do not match the mask length, they are extended by concatenating
+  // them with undef vectors.  That is probably the right thing for other
+  // targets, but for NEON it is better to concatenate two double-register
+  // size vector operands into a single quad-register size vector.  Do that
+  // transformation here:
+  //   shuffle(concat(v1, undef), concat(v2, undef)) ->
+  //   shuffle(concat(v1, v2), undef)
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op1.getOpcode() != ISD::CONCAT_VECTORS ||
+      Op0.getNumOperands() != 2 ||
+      Op1.getNumOperands() != 2)
+    return SDValue();
+  SDValue Concat0Op1 = Op0.getOperand(1);
+  SDValue Concat1Op1 = Op1.getOperand(1);
+  if (Concat0Op1.getOpcode() != ISD::UNDEF ||
+      Concat1Op1.getOpcode() != ISD::UNDEF)
+    return SDValue();
+  // Skip the transformation if any of the types are illegal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = N->getValueType(0);
+  if (!TLI.isTypeLegal(VT) ||
+      !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
+      !TLI.isTypeLegal(Concat1Op1.getValueType()))
+    return SDValue();
+
+  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+                                  Op0.getOperand(0), Op1.getOperand(0));
+  // Translate the shuffle mask.
+  SmallVector<int, 16> NewMask;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfElts = NumElts/2;
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+  for (unsigned n = 0; n < NumElts; ++n) {
+    int MaskElt = SVN->getMaskElt(n);
+    int NewElt = -1;
+    if (MaskElt < (int)HalfElts)
+      NewElt = MaskElt;
+    else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
+      NewElt = HalfElts + MaskElt - NumElts;
+    NewMask.push_back(NewElt);
+  }
+  return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat,
+                              DAG.getUNDEF(VT), NewMask.data());
+}
+
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
+/// NEON load/store intrinsics to merge base address updates.
+static SDValue CombineBaseUpdate(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoad = true;
+    bool isLaneOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    if (isIntrinsic) {
+      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      default: llvm_unreachable("unexpected intrinsic for Neon base update");
+      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
+        NumVecs = 1; break;
+      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+        NumVecs = 2; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+        NumVecs = 3; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+        NumVecs = 4; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
+        NumVecs = 1; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
+        NumVecs = 2; isLoad = false; break;
+      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
+        NumVecs = 3; isLoad = false; break;
+      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
+        NumVecs = 4; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+        NumVecs = 2; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+        NumVecs = 3; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+        NumVecs = 4; isLoad = false; isLaneOp = true; break;
+      }
+    } else {
+      isLaneOp = true;
+      switch (N->getOpcode()) {
+      default: llvm_unreachable("unexpected opcode for Neon base update");
+      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+      }
+    }
+
+    // Find the size of memory referenced by the load/store.
+    EVT VecTy;
+    if (isLoad)
+      VecTy = N->getValueType(0);
+    else
+      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+    if (isLaneOp)
+      NumBytes /= VecTy.getVectorNumElements();
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint64_t IncVal = CInc->getZExtValue();
+      if (IncVal != NumBytes)
+        continue;
+    } else if (NumBytes >= 3 * 16) {
+      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+      // separate instructions that make it harder to use a non-constant update.
+      continue;
+    }
+
+    // Create the new updating load/store node.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i32;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // incoming chain
+    Ops.push_back(N->getOperand(AddrOpIdx));
+    Ops.push_back(Inc);
+    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
+      Ops.push_back(N->getOperand(i));
+    }
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
+                                           Ops.data(), Ops.size(),
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
+/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
+/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
+/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
+/// return true.
+static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  // vldN-dup instructions only support 64-bit vectors for N > 1.
+  if (!VT.is64BitVector())
+    return false;
+
+  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
+  SDNode *VLD = N->getOperand(0).getNode();
+  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return false;
+  unsigned NumVecs = 0;
+  unsigned NewOpc = 0;
+  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_neon_vld2lane) {
+    NumVecs = 2;
+    NewOpc = ARMISD::VLD2DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+    NumVecs = 3;
+    NewOpc = ARMISD::VLD3DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+    NumVecs = 4;
+    NewOpc = ARMISD::VLD4DUP;
+  } else {
+    return false;
+  }
+
+  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+  // numbers match the load.
+  unsigned VLDLaneNo =
+    cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    // Ignore uses of the chain result.
+    if (UI.getUse().getResNo() == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    if (User->getOpcode() != ARMISD::VDUPLANE ||
+        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+      return false;
+  }
+
+  // Create the vldN-dup node.
+  EVT Tys[5];
+  unsigned n;
+  for (n = 0; n < NumVecs; ++n)
+    Tys[n] = VT;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
+  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
+                                           Ops, 2, VLDMemInt->getMemoryVT(),
+                                           VLDMemInt->getMemOperand());
+
+  // Update the uses.
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    unsigned ResNo = UI.getUse().getResNo();
+    // Ignore uses of the chain result.
+    if (ResNo == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+  }
+
+  // Now the vldN-lane intrinsic is dead except for its chain result.
+  // Update uses of the chain.
+  std::vector<SDValue> VLDDupResults;
+  for (unsigned n = 0; n < NumVecs; ++n)
+    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+  DCI.CombineTo(VLD, VLDDupResults);
+
+  return true;
+}
+
+/// PerformVDUPLANECombine - Target-specific dag combine xforms for
+/// ARMISD::VDUPLANE.
+static SDValue PerformVDUPLANECombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Op = N->getOperand(0);
+
+  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
+  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
+  if (CombineVLDDUP(N, DCI))
+    return SDValue(N, 0);
+
+  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
+  // redundant.  Ignore bit_converts for now; element sizes are checked below.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
+    return SDValue();
+
+  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
+  unsigned EltSize = Op.getValueType().getVectorElementType().getSizeInBits();
+  // The canonical VMOV for a zero vector uses a 32-bit element size.
+  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned EltBits;
+  if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0)
+    EltSize = 8;
+  EVT VT = N->getValueType(0);
+  if (EltSize > VT.getVectorElementType().getSizeInBits())
+    return SDValue();
+
+  return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+}
+
+// isConstVecPow2 - Return true if each vector element is a power of 2, all
+// elements are the same constant, C, and Log2(C) ranges from 1 to 32.
+static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C)
+{
+  integerPart cN;
+  integerPart c0 = 0;
+  for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements();
+       I != E; I++) {
+    ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I));
+    if (!C)
+      return false;
+
+    bool isExact;
+    APFloat APF = C->getValueAPF();
+    if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact)
+        != APFloat::opOK || !isExact)
+      return false;
+
+    c0 = (I == 0) ? cN : c0;
+    if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32)
+      return false;
+  }
+  C = c0;
+  return true;
+}
+
+/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
+/// can replace combinations of VMUL and VCVT (floating-point to integer)
+/// when the VMUL has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+///  vmul.f32        d16, d17, d16
+///  vcvt.s32.f32    d16, d16
+/// becomes:
+///  vcvt.s32.f32    d16, d16, #3
+static SDValue PerformVCVTCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+
+  if (!Subtarget->hasNEON() || !Op.getValueType().isVector() ||
+      Op.getOpcode() != ISD::FMUL)
+    return SDValue();
+
+  uint64_t C;
+  SDValue N0 = Op->getOperand(0);
+  SDValue ConstVec = Op->getOperand(1);
+  bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
+      !isConstVecPow2(ConstVec, isSigned, C))
+    return SDValue();
+
+  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
+    Intrinsic::arm_neon_vcvtfp2fxu;
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+                     N->getValueType(0),
+                     DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
+                     DAG.getConstant(Log2_64(C), MVT::i32));
+}
+
+/// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
+/// can replace combinations of VCVT (integer to floating-point) and VDIV
+/// when the VDIV has a constant operand that is a power of 2.
+///
+/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
+///  vcvt.f32.s32    d16, d16
+///  vdiv.f32        d16, d17, d16
+/// becomes:
+///  vcvt.f32.s32    d16, d16, #3
+static SDValue PerformVDIVCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Op = N->getOperand(0);
+  unsigned OpOpcode = Op.getNode()->getOpcode();
+
+  if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() ||
+      (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
+    return SDValue();
+
+  uint64_t C;
+  SDValue ConstVec = N->getOperand(1);
+  bool isSigned = OpOpcode == ISD::SINT_TO_FP;
+
+  if (ConstVec.getOpcode() != ISD::BUILD_VECTOR ||
+      !isConstVecPow2(ConstVec, isSigned, C))
+    return SDValue();
+
+  unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
+    Intrinsic::arm_neon_vcvtfxu2fp;
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+                     Op.getValueType(),
+                     DAG.getConstant(IntrinsicOpcode, MVT::i32),
+                     Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32));
+}
+
+/// Getvshiftimm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs, ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation.  That value must be in the range:
+///   0 <= Value < ElementBits for a left shift; or
+///   0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (! getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation.  For a shift opcode, the value
+/// is positive, but for an intrinsic the value count must be negative. The
+/// absolute value must be in the range:
+///   1 <= |Value| <= ElementBits for a right shift; or
+///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+                         int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (! getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  if (isIntrinsic)
+    Cnt = -Cnt;
+  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
+}
+
+/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
+static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default:
+    // Don't do anything for most intrinsics.
+    break;
+
+  // Vector shifts: check for immediate versions and lower them.
+  // Note: This is done during DAG combining instead of DAG legalizing because
+  // the build_vectors for 64-bit vector element shift counts are generally
+  // not legal, and it is hard to see their values after they get legalized to
+  // loads from a constant pool.
+  case Intrinsic::arm_neon_vshifts:
+  case Intrinsic::arm_neon_vshiftu:
+  case Intrinsic::arm_neon_vshiftls:
+  case Intrinsic::arm_neon_vshiftlu:
+  case Intrinsic::arm_neon_vshiftn:
+  case Intrinsic::arm_neon_vrshifts:
+  case Intrinsic::arm_neon_vrshiftu:
+  case Intrinsic::arm_neon_vrshiftn:
+  case Intrinsic::arm_neon_vqshifts:
+  case Intrinsic::arm_neon_vqshiftu:
+  case Intrinsic::arm_neon_vqshiftsu:
+  case Intrinsic::arm_neon_vqshiftns:
+  case Intrinsic::arm_neon_vqshiftnu:
+  case Intrinsic::arm_neon_vqshiftnsu:
+  case Intrinsic::arm_neon_vqrshiftns:
+  case Intrinsic::arm_neon_vqrshiftnu:
+  case Intrinsic::arm_neon_vqrshiftnsu: {
+    EVT VT = N->getOperand(1).getValueType();
+    int64_t Cnt;
+    unsigned VShiftOpc = 0;
+
+    switch (IntNo) {
+    case Intrinsic::arm_neon_vshifts:
+    case Intrinsic::arm_neon_vshiftu:
+      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
+        VShiftOpc = ARMISD::VSHL;
+        break;
+      }
+      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
+        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
+                     ARMISD::VSHRs : ARMISD::VSHRu);
+        break;
+      }
+      return SDValue();
+
+    case Intrinsic::arm_neon_vshiftls:
+    case Intrinsic::arm_neon_vshiftlu:
+      if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
+        break;
+      llvm_unreachable("invalid shift count for vshll intrinsic");
+
+    case Intrinsic::arm_neon_vrshifts:
+    case Intrinsic::arm_neon_vrshiftu:
+      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
+        break;
+      return SDValue();
+
+    case Intrinsic::arm_neon_vqshifts:
+    case Intrinsic::arm_neon_vqshiftu:
+      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+        break;
+      return SDValue();
+
+    case Intrinsic::arm_neon_vqshiftsu:
+      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
+        break;
+      llvm_unreachable("invalid shift count for vqshlu intrinsic");
+
+    case Intrinsic::arm_neon_vshiftn:
+    case Intrinsic::arm_neon_vrshiftn:
+    case Intrinsic::arm_neon_vqshiftns:
+    case Intrinsic::arm_neon_vqshiftnu:
+    case Intrinsic::arm_neon_vqshiftnsu:
+    case Intrinsic::arm_neon_vqrshiftns:
+    case Intrinsic::arm_neon_vqrshiftnu:
+    case Intrinsic::arm_neon_vqrshiftnsu:
+      // Narrowing shifts require an immediate right shift.
+      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
+        break;
+      llvm_unreachable("invalid shift count for narrowing vector shift "
+                       "intrinsic");
+
+    default:
+      llvm_unreachable("unhandled vector shift");
+    }
+
+    switch (IntNo) {
+    case Intrinsic::arm_neon_vshifts:
+    case Intrinsic::arm_neon_vshiftu:
+      // Opcode already set above.
+      break;
+    case Intrinsic::arm_neon_vshiftls:
+    case Intrinsic::arm_neon_vshiftlu:
+      if (Cnt == VT.getVectorElementType().getSizeInBits())
+        VShiftOpc = ARMISD::VSHLLi;
+      else
+        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
+                     ARMISD::VSHLLs : ARMISD::VSHLLu);
+      break;
+    case Intrinsic::arm_neon_vshiftn:
+      VShiftOpc = ARMISD::VSHRN; break;
+    case Intrinsic::arm_neon_vrshifts:
+      VShiftOpc = ARMISD::VRSHRs; break;
+    case Intrinsic::arm_neon_vrshiftu:
+      VShiftOpc = ARMISD::VRSHRu; break;
+    case Intrinsic::arm_neon_vrshiftn:
+      VShiftOpc = ARMISD::VRSHRN; break;
+    case Intrinsic::arm_neon_vqshifts:
+      VShiftOpc = ARMISD::VQSHLs; break;
+    case Intrinsic::arm_neon_vqshiftu:
+      VShiftOpc = ARMISD::VQSHLu; break;
+    case Intrinsic::arm_neon_vqshiftsu:
+      VShiftOpc = ARMISD::VQSHLsu; break;
+    case Intrinsic::arm_neon_vqshiftns:
+      VShiftOpc = ARMISD::VQSHRNs; break;
+    case Intrinsic::arm_neon_vqshiftnu:
+      VShiftOpc = ARMISD::VQSHRNu; break;
+    case Intrinsic::arm_neon_vqshiftnsu:
+      VShiftOpc = ARMISD::VQSHRNsu; break;
+    case Intrinsic::arm_neon_vqrshiftns:
+      VShiftOpc = ARMISD::VQRSHRNs; break;
+    case Intrinsic::arm_neon_vqrshiftnu:
+      VShiftOpc = ARMISD::VQRSHRNu; break;
+    case Intrinsic::arm_neon_vqrshiftnsu:
+      VShiftOpc = ARMISD::VQRSHRNsu; break;
+    }
+
+    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+  }
+
+  case Intrinsic::arm_neon_vshiftins: {
+    EVT VT = N->getOperand(1).getValueType();
+    int64_t Cnt;
+    unsigned VShiftOpc = 0;
+
+    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
+      VShiftOpc = ARMISD::VSLI;
+    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
+      VShiftOpc = ARMISD::VSRI;
+    else {
+      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
+    }
+
+    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2),
+                       DAG.getConstant(Cnt, MVT::i32));
+  }
+
+  case Intrinsic::arm_neon_vqrshifts:
+  case Intrinsic::arm_neon_vqrshiftu:
+    // No immediate versions of these to check for.
+    break;
+  }
+
+  return SDValue();
+}
+
+/// PerformShiftCombine - Checks for immediate versions of vector shifts and
+/// lowers them.  As with the vector shift intrinsics, this is done during DAG
+/// combining instead of DAG legalizing because the build_vectors for 64-bit
+/// vector element shift counts are generally not legal, and it is hard to see
+/// their values after they get legalized to loads from a constant pool.
+static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+                                   const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
+    // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
+    // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
+    SDValue N1 = N->getOperand(1);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+      SDValue N0 = N->getOperand(0);
+      if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
+          DAG.MaskedValueIsZero(N0.getOperand(0),
+                                APInt::getHighBitsSet(32, 16)))
+        return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, N0, N1);
+    }
+  }
+
+  // Nothing to be done for scalar shifts.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!VT.isVector() || !TLI.isTypeLegal(VT))
+    return SDValue();
+
+  assert(ST->hasNEON() && "unexpected vector shift");
+  int64_t Cnt;
+
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
+      return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    break;
+
+  case ISD::SRA:
+  case ISD::SRL:
+    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
+      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
+                            ARMISD::VSHRs : ARMISD::VSHRu);
+      return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    }
+  }
+  return SDValue();
+}
+
+/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
+/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
+static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+  SDValue N0 = N->getOperand(0);
+
+  // Check for sign- and zero-extensions of vector extract operations of 8-
+  // and 16-bit vector elements.  NEON supports these directly.  They are
+  // handled during DAG combining because type legalization will promote them
+  // to 32-bit types and it is messy to recognize the operations after that.
+  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue Vec = N0.getOperand(0);
+    SDValue Lane = N0.getOperand(1);
+    EVT VT = N->getValueType(0);
+    EVT EltVT = N0.getValueType();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+    if (VT == MVT::i32 &&
+        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
+        TLI.isTypeLegal(Vec.getValueType()) &&
+        isa<ConstantSDNode>(Lane)) {
+
+      unsigned Opc = 0;
+      switch (N->getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case ISD::SIGN_EXTEND:
+        Opc = ARMISD::VGETLANEs;
+        break;
+      case ISD::ZERO_EXTEND:
+      case ISD::ANY_EXTEND:
+        Opc = ARMISD::VGETLANEu;
+        break;
+      }
+      return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC
+/// to match f32 max/min patterns to use NEON vmax/vmin instructions.
+static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
+                                       const ARMSubtarget *ST) {
+  // If the target supports NEON, try to use vmax/vmin instructions for f32
+  // selects like "x < y ? x : y".  Unless the NoNaNsFPMath option is set,
+  // be careful about NaNs:  NEON's vmax/vmin return NaN if either operand is
+  // a NaN; only do the transformation when it matches that behavior.
+
+  // For now only do this when using NEON for FP operations; if using VFP, it
+  // is not obvious that the benefit outweighs the cost of switching to the
+  // NEON pipeline.
+  if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() ||
+      N->getValueType(0) != MVT::f32)
+    return SDValue();
+
+  SDValue CondLHS = N->getOperand(0);
+  SDValue CondRHS = N->getOperand(1);
+  SDValue LHS = N->getOperand(2);
+  SDValue RHS = N->getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+  unsigned Opcode = 0;
+  bool IsReversed;
+  if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) {
+    IsReversed = false; // x CC y ? x : y
+  } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) {
+    IsReversed = true ; // x CC y ? y : x
+  } else {
+    return SDValue();
+  }
+
+  bool IsUnordered;
+  switch (CC) {
+  default: break;
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETLT:
+  case ISD::SETLE:
+  case ISD::SETULT:
+  case ISD::SETULE:
+    // If LHS is NaN, an ordered comparison will be false and the result will
+    // be the RHS, but vmin(NaN, RHS) = NaN.  Avoid this by checking that LHS
+    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
+    IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE);
+    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
+      break;
+    // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin
+    // will return -0, so vmin can only be used for unsafe math or if one of
+    // the operands is known to be nonzero.
+    if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) &&
+        !DAG.getTarget().Options.UnsafeFPMath &&
+        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+      break;
+    Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN;
+    break;
+
+  case ISD::SETOGT:
+  case ISD::SETOGE:
+  case ISD::SETGT:
+  case ISD::SETGE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    // If LHS is NaN, an ordered comparison will be false and the result will
+    // be the RHS, but vmax(NaN, RHS) = NaN.  Avoid this by checking that LHS
+    // != NaN.  Likewise, for unordered comparisons, check for RHS != NaN.
+    IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE);
+    if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS))
+      break;
+    // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax
+    // will return +0, so vmax can only be used for unsafe math or if one of
+    // the operands is known to be nonzero.
+    if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) &&
+        !DAG.getTarget().Options.UnsafeFPMath &&
+        !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+      break;
+    Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX;
+    break;
+  }
+
+  if (!Opcode)
+    return SDValue();
+  return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS);
+}
+
+/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
+SDValue
+ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
+  SDValue Cmp = N->getOperand(4);
+  if (Cmp.getOpcode() != ARMISD::CMPZ)
+    // Only looking at EQ and NE cases.
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+  SDValue FalseVal = N->getOperand(0);
+  SDValue TrueVal = N->getOperand(1);
+  SDValue ARMcc = N->getOperand(2);
+  ARMCC::CondCodes CC =
+    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+
+  // Simplify
+  //   mov     r1, r0
+  //   cmp     r1, x
+  //   mov     r0, y
+  //   moveq   r0, x
+  // to
+  //   cmp     r0, x
+  //   movne   r0, y
+  //
+  //   mov     r1, r0
+  //   cmp     r1, x
+  //   mov     r0, x
+  //   movne   r0, y
+  // to
+  //   cmp     r0, x
+  //   movne   r0, y
+  /// FIXME: Turn this into a target neutral optimization?
+  SDValue Res;
+  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
+    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
+                      N->getOperand(3), Cmp);
+  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
+    SDValue ARMcc;
+    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
+    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
+                      N->getOperand(3), NewCmp);
+  }
+
+  if (Res.getNode()) {
+    APInt KnownZero, KnownOne;
+    DAG.ComputeMaskedBits(SDValue(N,0), KnownZero, KnownOne);
+    // Capture demanded bits information that would be otherwise lost.
+    if (KnownZero == 0xfffffffe)
+      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                        DAG.getValueType(MVT::i1));
+    else if (KnownZero == 0xffffff00)
+      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                        DAG.getValueType(MVT::i8));
+    else if (KnownZero == 0xffff0000)
+      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
+                        DAG.getValueType(MVT::i16));
+  }
+
+  return Res;
+}
+
+SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
+  case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
+  case ISD::SUB:        return PerformSUBCombine(N, DCI);
+  case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
+  case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
+  case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
+  case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
+  case ARMISD::BFI:     return PerformBFICombine(N, DCI);
+  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
+  case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
+  case ISD::STORE:      return PerformSTORECombine(N, DCI);
+  case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI);
+  case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
+  case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
+  case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget);
+  case ISD::FDIV:       return PerformVDIVCombine(N, DCI, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
+  case ISD::SELECT_CC:  return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
+  case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
+  case ARMISD::VLD2DUP:
+  case ARMISD::VLD3DUP:
+  case ARMISD::VLD4DUP:
+    return CombineBaseUpdate(N, DCI);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::arm_neon_vld1:
+    case Intrinsic::arm_neon_vld2:
+    case Intrinsic::arm_neon_vld3:
+    case Intrinsic::arm_neon_vld4:
+    case Intrinsic::arm_neon_vld2lane:
+    case Intrinsic::arm_neon_vld3lane:
+    case Intrinsic::arm_neon_vld4lane:
+    case Intrinsic::arm_neon_vst1:
+    case Intrinsic::arm_neon_vst2:
+    case Intrinsic::arm_neon_vst3:
+    case Intrinsic::arm_neon_vst4:
+    case Intrinsic::arm_neon_vst2lane:
+    case Intrinsic::arm_neon_vst3lane:
+    case Intrinsic::arm_neon_vst4lane:
+      return CombineBaseUpdate(N, DCI);
+    default: break;
+    }
+    break;
+  }
+  return SDValue();
+}
+
+bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
+                                                          EVT VT) const {
+  return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
+}
+
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+  // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus
+  bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    return false;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32: {
+    // Unaligned access can use (for example) LRDB, LRDH, LDR
+    if (AllowsUnaligned) {
+      if (Fast)
+        *Fast = Subtarget->hasV7Ops();
+      return true;
+    }
+    return false;
+  }
+  case MVT::f64:
+  case MVT::v2f64: {
+    // For any little-endian targets with neon, we can support unaligned ld/st
+    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+    // A big-endian target may also explictly support unaligned accesses
+    if (Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian())) {
+      if (Fast)
+        *Fast = true;
+      return true;
+    }
+    return false;
+  }
+  }
+}
+
+static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+                       unsigned AlignCheck) {
+  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
+          (DstAlign == 0 || DstAlign % AlignCheck == 0));
+}
+
+EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
+                                           unsigned DstAlign, unsigned SrcAlign,
+                                           bool IsMemset, bool ZeroMemset,
+                                           bool MemcpyStrSrc,
+                                           MachineFunction &MF) const {
+  const Function *F = MF.getFunction();
+
+  // See if we can use NEON instructions for this...
+  if ((!IsMemset || ZeroMemset) &&
+      Subtarget->hasNEON() &&
+      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::NoImplicitFloat)) {
+    bool Fast;
+    if (Size >= 16 &&
+        (memOpAlign(SrcAlign, DstAlign, 16) ||
+         (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) {
+      return MVT::v2f64;
+    } else if (Size >= 8 &&
+               (memOpAlign(SrcAlign, DstAlign, 8) ||
+                (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) {
+      return MVT::f64;
+    }
+  }
+
+  // Lowering to i32/i16 if the size permits.
+  if (Size >= 4)
+    return MVT::i32;
+  else if (Size >= 2)
+    return MVT::i16;
+
+  // Let the target-independent logic figure it out.
+  return MVT::Other;
+}
+
+bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  EVT VT1 = Val.getValueType();
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+    // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
+    return true;
+  }
+
+  return false;
+}
+
+static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
+  if (V < 0)
+    return false;
+
+  unsigned Scale = 1;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+    // Scale == 1;
+    break;
+  case MVT::i16:
+    // Scale == 2;
+    Scale = 2;
+    break;
+  case MVT::i32:
+    // Scale == 4;
+    Scale = 4;
+    break;
+  }
+
+  if ((V & (Scale - 1)) != 0)
+    return false;
+  V /= Scale;
+  return V == (V & ((1LL << 5) - 1));
+}
+
+static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
+                                      const ARMSubtarget *Subtarget) {
+  bool isNeg = false;
+  if (V < 0) {
+    isNeg = true;
+    V = - V;
+  }
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    // + imm12 or - imm8
+    if (isNeg)
+      return V == (V & ((1LL << 8) - 1));
+    return V == (V & ((1LL << 12) - 1));
+  case MVT::f32:
+  case MVT::f64:
+    // Same as ARM mode. FIXME: NEON?
+    if (!Subtarget->hasVFP2())
+      return false;
+    if ((V & 3) != 0)
+      return false;
+    V >>= 2;
+    return V == (V & ((1LL << 8) - 1));
+  }
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+static bool isLegalAddressImmediate(int64_t V, EVT VT,
+                                    const ARMSubtarget *Subtarget) {
+  if (V == 0)
+    return true;
+
+  if (!VT.isSimple())
+    return false;
+
+  if (Subtarget->isThumb1Only())
+    return isLegalT1AddressImmediate(V, VT);
+  else if (Subtarget->isThumb2())
+    return isLegalT2AddressImmediate(V, VT, Subtarget);
+
+  // ARM mode.
+  if (V < 0)
+    V = - V;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i32:
+    // +- imm12
+    return V == (V & ((1LL << 12) - 1));
+  case MVT::i16:
+    // +- imm8
+    return V == (V & ((1LL << 8) - 1));
+  case MVT::f32:
+  case MVT::f64:
+    if (!Subtarget->hasVFP2()) // FIXME: NEON?
+      return false;
+    if ((V & 3) != 0)
+      return false;
+    V >>= 2;
+    return V == (V & ((1LL << 8) - 1));
+  }
+}
+
+bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
+                                                      EVT VT) const {
+  int Scale = AM.Scale;
+  if (Scale < 0)
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    if (Scale == 1)
+      return true;
+    // r + r << imm
+    Scale = Scale & ~1;
+    return Scale == 2 || Scale == 4 || Scale == 8;
+  case MVT::i64:
+    // r + r
+    if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+      return true;
+    return false;
+  case MVT::isVoid:
+    // Note, we allow "void" uses (basically, uses that aren't loads or
+    // stores), because arm allows folding a scale into many arithmetic
+    // operations.  This should be made more precise and revisited later.
+
+    // Allow r << imm, but the imm has to be a multiple of two.
+    if (Scale & 1) return false;
+    return isPowerOf2_32(Scale);
+  }
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              Type *Ty) const {
+  EVT VT = getValueType(Ty, true);
+  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
+    return false;
+
+  // Can never fold addr of global into load/store.
+  if (AM.BaseGV)
+    return false;
+
+  switch (AM.Scale) {
+  case 0:  // no scale reg, must be "r+i" or "r", or "i".
+    break;
+  case 1:
+    if (Subtarget->isThumb1Only())
+      return false;
+    // FALL THROUGH.
+  default:
+    // ARM doesn't support any R+R*scale+imm addr modes.
+    if (AM.BaseOffs)
+      return false;
+
+    if (!VT.isSimple())
+      return false;
+
+    if (Subtarget->isThumb2())
+      return isLegalT2ScaledAddressingMode(AM, VT);
+
+    int Scale = AM.Scale;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return false;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i32:
+      if (Scale < 0) Scale = -Scale;
+      if (Scale == 1)
+        return true;
+      // r + r << imm
+      return isPowerOf2_32(Scale & ~1);
+    case MVT::i16:
+    case MVT::i64:
+      // r + r
+      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+        return true;
+      return false;
+
+    case MVT::isVoid:
+      // Note, we allow "void" uses (basically, uses that aren't loads or
+      // stores), because arm allows folding a scale into many arithmetic
+      // operations.  This should be made more precise and revisited later.
+
+      // Allow r << imm, but the imm has to be a multiple of two.
+      if (Scale & 1) return false;
+      return isPowerOf2_32(Scale);
+    }
+  }
+  return true;
+}
+
+/// isLegalICmpImmediate - Return true if the specified immediate is legal
+/// icmp immediate, that is the target has icmp instructions which can compare
+/// a register against the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  // Thumb2 and ARM modes can use cmn for negative immediates.
+  if (!Subtarget->isThumb())
+    return ARM_AM::getSOImmVal(llvm::abs64(Imm)) != -1;
+  if (Subtarget->isThumb2())
+    return ARM_AM::getT2SOImmVal(llvm::abs64(Imm)) != -1;
+  // Thumb1 doesn't have cmn, and only 8-bit immediates.
+  return Imm >= 0 && Imm <= 255;
+}
+
+/// isLegalAddImmediate - Return true if the specified immediate is a legal add
+/// *or sub* immediate, that is the target has add or sub instructions which can
+/// add a register with the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // Same encoding for add/sub, just flip the sign.
+  int64_t AbsImm = llvm::abs64(Imm);
+  if (!Subtarget->isThumb())
+    return ARM_AM::getSOImmVal(AbsImm) != -1;
+  if (Subtarget->isThumb2())
+    return ARM_AM::getT2SOImmVal(AbsImm) != -1;
+  // Thumb1 only has 8-bit unsigned immediate.
+  return AbsImm >= 0 && AbsImm <= 255;
+}
+
+static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
+                                      bool isSEXTLoad, SDValue &Base,
+                                      SDValue &Offset, bool &isInc,
+                                      SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+    return false;
+
+  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
+    // AddressingMode 3
+    Base = Ptr->getOperand(0);
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC < 0 && RHSC > -256) {
+        assert(Ptr->getOpcode() == ISD::ADD);
+        isInc = false;
+        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+        return true;
+      }
+    }
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Offset = Ptr->getOperand(1);
+    return true;
+  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
+    // AddressingMode 2
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC < 0 && RHSC > -0x1000) {
+        assert(Ptr->getOpcode() == ISD::ADD);
+        isInc = false;
+        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+        Base = Ptr->getOperand(0);
+        return true;
+      }
+    }
+
+    if (Ptr->getOpcode() == ISD::ADD) {
+      isInc = true;
+      ARM_AM::ShiftOpc ShOpcVal=
+        ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
+      if (ShOpcVal != ARM_AM::no_shift) {
+        Base = Ptr->getOperand(1);
+        Offset = Ptr->getOperand(0);
+      } else {
+        Base = Ptr->getOperand(0);
+        Offset = Ptr->getOperand(1);
+      }
+      return true;
+    }
+
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Base = Ptr->getOperand(0);
+    Offset = Ptr->getOperand(1);
+    return true;
+  }
+
+  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
+  return false;
+}
+
+static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
+                                     bool isSEXTLoad, SDValue &Base,
+                                     SDValue &Offset, bool &isInc,
+                                     SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+    return false;
+
+  Base = Ptr->getOperand(0);
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
+      assert(Ptr->getOpcode() == ISD::ADD);
+      isInc = false;
+      Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+      return true;
+    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
+      isInc = Ptr->getOpcode() == ISD::ADD;
+      Offset = DAG.getConstant(RHSC, RHS->getValueType(0));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// getPreIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address.
+bool
+ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                             SDValue &Offset,
+                                             ISD::MemIndexedMode &AM,
+                                             SelectionDAG &DAG) const {
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT  = LD->getMemoryVT();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getMemoryVT();
+  } else
+    return false;
+
+  bool isInc;
+  bool isLegal = false;
+  if (Subtarget->isThumb2())
+    isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+                                       Offset, isInc, DAG);
+  else
+    isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+                                        Offset, isInc, DAG);
+  if (!isLegal)
+    return false;
+
+  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
+  return true;
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                   SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   SelectionDAG &DAG) const {
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT  = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT  = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
+    return false;
+
+  bool isInc;
+  bool isLegal = false;
+  if (Subtarget->isThumb2())
+    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                       isInc, DAG);
+  else
+    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                        isInc, DAG);
+  if (!isLegal)
+    return false;
+
+  if (Ptr != Base) {
+    // Swap base ptr and offset to catch more post-index load / store when
+    // it's legal. In Thumb2 mode, offset must be an immediate.
+    if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
+        !Subtarget->isThumb2())
+      std::swap(Base, Offset);
+
+    // Post-indexed load / store update the base pointer.
+    if (Ptr != Base)
+      return false;
+  }
+
+  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+  return true;
+}
+
+void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                       APInt &KnownZero,
+                                                       APInt &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case ARMISD::CMOV: {
+    // Bits are known zero/one if known on the LHS and RHS.
+    DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    if (KnownZero == 0 && KnownOne == 0) return;
+
+    APInt KnownZeroRHS, KnownOneRHS;
+    DAG.ComputeMaskedBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
+    KnownZero &= KnownZeroRHS;
+    KnownOne  &= KnownOneRHS;
+    return;
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
+  // Looking for "rev" which is V6+.
+  if (!Subtarget->hasV6Ops())
+    return false;
+
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+  std::string AsmStr = IA->getAsmString();
+  SmallVector<StringRef, 4> AsmPieces;
+  SplitString(AsmStr, AsmPieces, ";\n");
+
+  switch (AsmPieces.size()) {
+  default: return false;
+  case 1:
+    AsmStr = AsmPieces[0];
+    AsmPieces.clear();
+    SplitString(AsmStr, AsmPieces, " \t,");
+
+    // rev $0, $1
+    if (AsmPieces.size() == 3 &&
+        AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
+        IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
+      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+      if (Ty && Ty->getBitWidth() == 32)
+        return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+    break;
+  }
+
+  return false;
+}
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+ARMTargetLowering::ConstraintType
+ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:  break;
+    case 'l': return C_RegisterClass;
+    case 'w': return C_RegisterClass;
+    case 'h': return C_RegisterClass;
+    case 'x': return C_RegisterClass;
+    case 't': return C_RegisterClass;
+    case 'j': return C_Other; // Constant for movw.
+      // An address with a single base register. Due to the way we
+      // currently handle addresses it is the same as an 'r' memory constraint.
+    case 'Q': return C_Memory;
+    }
+  } else if (Constraint.size() == 2) {
+    switch (Constraint[0]) {
+    default: break;
+    // All 'U+' constraints are addresses.
+    case 'U': return C_Memory;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ARMTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'l':
+    if (type->isIntegerTy()) {
+      if (Subtarget->isThumb())
+        weight = CW_SpecificReg;
+      else
+        weight = CW_Register;
+    }
+    break;
+  case 'w':
+    if (type->isFloatingPointTy())
+      weight = CW_Register;
+    break;
+  }
+  return weight;
+}
+
+typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
+RCPair
+ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                EVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC ARM Constraint Letters
+    switch (Constraint[0]) {
+    case 'l': // Low regs or general regs.
+      if (Subtarget->isThumb())
+        return RCPair(0U, &ARM::tGPRRegClass);
+      return RCPair(0U, &ARM::GPRRegClass);
+    case 'h': // High regs or no regs.
+      if (Subtarget->isThumb())
+        return RCPair(0U, &ARM::hGPRRegClass);
+      break;
+    case 'r':
+      return RCPair(0U, &ARM::GPRRegClass);
+    case 'w':
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPRRegClass);
+      if (VT.getSizeInBits() == 64)
+        return RCPair(0U, &ARM::DPRRegClass);
+      if (VT.getSizeInBits() == 128)
+        return RCPair(0U, &ARM::QPRRegClass);
+      break;
+    case 'x':
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPR_8RegClass);
+      if (VT.getSizeInBits() == 64)
+        return RCPair(0U, &ARM::DPR_8RegClass);
+      if (VT.getSizeInBits() == 128)
+        return RCPair(0U, &ARM::QPR_8RegClass);
+      break;
+    case 't':
+      if (VT == MVT::f32)
+        return RCPair(0U, &ARM::SPRRegClass);
+      break;
+    }
+  }
+  if (StringRef("{cc}").equals_lower(Constraint))
+    return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'j':
+  case 'I': case 'J': case 'K': case 'L':
+  case 'M': case 'N': case 'O':
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C)
+      return;
+
+    int64_t CVal64 = C->getSExtValue();
+    int CVal = (int) CVal64;
+    // None of these constraints allow values larger than 32 bits.  Check
+    // that the value fits in an int.
+    if (CVal != CVal64)
+      return;
+
+    switch (ConstraintLetter) {
+      case 'j':
+        // Constant suitable for movw, must be between 0 and
+        // 65535.
+        if (Subtarget->hasV6T2Ops())
+          if (CVal >= 0 && CVal <= 65535)
+            break;
+        return;
+      case 'I':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between 0 and 255, for ADD
+          // immediates.
+          if (CVal >= 0 && CVal <= 255)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getT2SOImmVal(CVal) != -1)
+            break;
+        } else {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getSOImmVal(CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'J':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a constant between -255 and -1, for negated ADD
+          // immediates. This can be used in GCC with an "n" modifier that
+          // prints the negated value, for use with SUB instructions. It is
+          // not useful otherwise but is implemented for compatibility.
+          if (CVal >= -255 && CVal <= -1)
+            break;
+        } else {
+          // This must be a constant between -4095 and 4095. It is not clear
+          // what this constraint is intended for. Implemented for
+          // compatibility with GCC.
+          if (CVal >= -4095 && CVal <= 4095)
+            break;
+        }
+        return;
+
+      case 'K':
+        if (Subtarget->isThumb1Only()) {
+          // A 32-bit value where only one byte has a nonzero value. Exclude
+          // zero to match GCC. This constraint is used by GCC internally for
+          // constants that can be loaded with a move/shift combination.
+          // It is not useful otherwise but is implemented for compatibility.
+          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
+            break;
+        } else {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getSOImmVal(~CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'L':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between -7 and 7,
+          // for 3-operand ADD/SUB immediate instructions.
+          if (CVal >= -7 && CVal < 7)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.
+          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
+            break;
+        } else {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.
+          if (ARM_AM::getSOImmVal(-CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'M':
+        if (Subtarget->isThumb()) { // FIXME thumb2
+          // This must be a multiple of 4 between 0 and 1020, for
+          // ADD sp + immediate.
+          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
+            break;
+        } else {
+          // A power of two or a constant between 0 and 32.  This is used in
+          // GCC for the shift amount on shifted register operands, but it is
+          // useful in general for any shift amounts.
+          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
+            break;
+        }
+        return;
+
+      case 'N':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a constant between 0 and 31, for shift amounts.
+          if (CVal >= 0 && CVal <= 31)
+            break;
+        }
+        return;
+
+      case 'O':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a multiple of 4 between -508 and 508, for
+          // ADD/SUB sp = sp + immediate.
+          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
+            break;
+        }
+        return;
+    }
+    Result = DAG.getTargetConstant(CVal, Op.getValueType());
+    break;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+bool
+ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The ARM target isn't yet aware of offsets.
+  return false;
+}
+
+bool ARM::isBitFieldInvertedMask(unsigned v) {
+  if (v == 0xffffffff)
+    return 0;
+  // there can be 1's on either or both "outsides", all the "inside"
+  // bits must be 0's
+  unsigned int lsb = 0, msb = 31;
+  while (v & (1 << msb)) --msb;
+  while (v & (1 << lsb)) ++lsb;
+  for (unsigned int i = lsb; i <= msb; ++i) {
+    if (v & (1 << i))
+      return 0;
+  }
+  return 1;
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (!Subtarget->hasVFP3())
+    return false;
+  if (VT == MVT::f32)
+    return ARM_AM::getFP32Imm(Imm) != -1;
+  if (VT == MVT::f64)
+    return ARM_AM::getFP64Imm(Imm) != -1;
+  return false;
+}
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                           const CallInst &I,
+                                           unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile loads with NEON intrinsics not supported
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    unsigned NumElts = 0;
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+      Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile stores with NEON intrinsics not supported
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_strexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::arm_ldrexd: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i64;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = 8;
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
new file mode 100644
index 000000000000..9ee17f0781b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -0,0 +1,578 @@
+//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMISELLOWERING_H
+#define ARMISELLOWERING_H
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <vector>
+
+namespace llvm {
+  class ARMConstantPoolValue;
+
+  namespace ARMISD {
+    // ARM Specific DAG Nodes
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      Wrapper,      // Wrapper - A wrapper node for TargetConstantPool,
+                    // TargetExternalSymbol, and TargetGlobalAddress.
+      WrapperDYN,   // WrapperDYN - A wrapper node for TargetGlobalAddress in
+                    // DYN mode.
+      WrapperPIC,   // WrapperPIC - A wrapper node for TargetGlobalAddress in
+                    // PIC mode.
+      WrapperJT,    // WrapperJT - A wrapper node for TargetJumpTable
+
+      // Add pseudo op to model memcpy for struct byval.
+      COPY_STRUCT_BYVAL,
+
+      CALL,         // Function call.
+      CALL_PRED,    // Function call that's predicable.
+      CALL_NOLINK,  // Function call with branch not branch-and-link.
+      tCALL,        // Thumb function call.
+      BRCOND,       // Conditional branch.
+      BR_JT,        // Jumptable branch.
+      BR2_JT,       // Jumptable branch (2 level - jumptable entry is a jump).
+      RET_FLAG,     // Return with a flag operand.
+
+      PIC_ADD,      // Add with a PC operand and a PIC label.
+
+      CMP,          // ARM compare instructions.
+      CMN,          // ARM CMN instructions.
+      CMPZ,         // ARM compare that sets only Z flag.
+      CMPFP,        // ARM VFP compare instruction, sets FPSCR.
+      CMPFPw0,      // ARM VFP compare against zero instruction, sets FPSCR.
+      FMSTAT,       // ARM fmstat instruction.
+
+      CMOV,         // ARM conditional move instructions.
+
+      BCC_i64,
+
+      RBIT,         // ARM bitreverse instruction
+
+      FTOSI,        // FP to sint within a FP register.
+      FTOUI,        // FP to uint within a FP register.
+      SITOF,        // sint to FP within a FP register.
+      UITOF,        // uint to FP within a FP register.
+
+      SRL_FLAG,     // V,Flag = srl_flag X -> srl X, 1 + save carry out.
+      SRA_FLAG,     // V,Flag = sra_flag X -> sra X, 1 + save carry out.
+      RRX,          // V = RRX X, Flag     -> srl X, 1 + shift in carry flag.
+
+      ADDC,         // Add with carry
+      ADDE,         // Add using carry
+      SUBC,         // Sub with carry
+      SUBE,         // Sub using carry
+
+      VMOVRRD,      // double to two gprs.
+      VMOVDRR,      // Two gprs to double.
+
+      EH_SJLJ_SETJMP,         // SjLj exception handling setjmp.
+      EH_SJLJ_LONGJMP,        // SjLj exception handling longjmp.
+
+      TC_RETURN,    // Tail call return pseudo.
+
+      THREAD_POINTER,
+
+      DYN_ALLOC,    // Dynamic allocation on the stack.
+
+      MEMBARRIER,   // Memory barrier (DMB)
+      MEMBARRIER_MCR, // Memory barrier (MCR)
+
+      PRELOAD,      // Preload
+
+      VCEQ,         // Vector compare equal.
+      VCEQZ,        // Vector compare equal to zero.
+      VCGE,         // Vector compare greater than or equal.
+      VCGEZ,        // Vector compare greater than or equal to zero.
+      VCLEZ,        // Vector compare less than or equal to zero.
+      VCGEU,        // Vector compare unsigned greater than or equal.
+      VCGT,         // Vector compare greater than.
+      VCGTZ,        // Vector compare greater than zero.
+      VCLTZ,        // Vector compare less than zero.
+      VCGTU,        // Vector compare unsigned greater than.
+      VTST,         // Vector test bits.
+
+      // Vector shift by immediate:
+      VSHL,         // ...left
+      VSHRs,        // ...right (signed)
+      VSHRu,        // ...right (unsigned)
+      VSHLLs,       // ...left long (signed)
+      VSHLLu,       // ...left long (unsigned)
+      VSHLLi,       // ...left long (with maximum shift count)
+      VSHRN,        // ...right narrow
+
+      // Vector rounding shift by immediate:
+      VRSHRs,       // ...right (signed)
+      VRSHRu,       // ...right (unsigned)
+      VRSHRN,       // ...right narrow
+
+      // Vector saturating shift by immediate:
+      VQSHLs,       // ...left (signed)
+      VQSHLu,       // ...left (unsigned)
+      VQSHLsu,      // ...left (signed to unsigned)
+      VQSHRNs,      // ...right narrow (signed)
+      VQSHRNu,      // ...right narrow (unsigned)
+      VQSHRNsu,     // ...right narrow (signed to unsigned)
+
+      // Vector saturating rounding shift by immediate:
+      VQRSHRNs,     // ...right narrow (signed)
+      VQRSHRNu,     // ...right narrow (unsigned)
+      VQRSHRNsu,    // ...right narrow (signed to unsigned)
+
+      // Vector shift and insert:
+      VSLI,         // ...left
+      VSRI,         // ...right
+
+      // Vector get lane (VMOV scalar to ARM core register)
+      // (These are used for 8- and 16-bit element types only.)
+      VGETLANEu,    // zero-extend vector extract element
+      VGETLANEs,    // sign-extend vector extract element
+
+      // Vector move immediate and move negated immediate:
+      VMOVIMM,
+      VMVNIMM,
+
+      // Vector move f32 immediate:
+      VMOVFPIMM,
+
+      // Vector duplicate:
+      VDUP,
+      VDUPLANE,
+
+      // Vector shuffles:
+      VEXT,         // extract
+      VREV64,       // reverse elements within 64-bit doublewords
+      VREV32,       // reverse elements within 32-bit words
+      VREV16,       // reverse elements within 16-bit halfwords
+      VZIP,         // zip (interleave)
+      VUZP,         // unzip (deinterleave)
+      VTRN,         // transpose
+      VTBL1,        // 1-register shuffle with mask
+      VTBL2,        // 2-register shuffle with mask
+
+      // Vector multiply long:
+      VMULLs,       // ...signed
+      VMULLu,       // ...unsigned
+
+      UMLAL,        // 64bit Unsigned Accumulate Multiply
+      SMLAL,        // 64bit Signed Accumulate Multiply
+
+      // Operands of the standard BUILD_VECTOR node are not legalized, which
+      // is fine if BUILD_VECTORs are always lowered to shuffles or other
+      // operations, but for ARM some BUILD_VECTORs are legal as-is and their
+      // operands need to be legalized.  Define an ARM-specific version of
+      // BUILD_VECTOR for this purpose.
+      BUILD_VECTOR,
+
+      // Floating-point max and min:
+      FMAX,
+      FMIN,
+
+      // Bit-field insert
+      BFI,
+
+      // Vector OR with immediate
+      VORRIMM,
+      // Vector AND with NOT of immediate
+      VBICIMM,
+
+      // Vector bitwise select
+      VBSL,
+
+      // Vector load N-element structure to all lanes:
+      VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      VLD3DUP,
+      VLD4DUP,
+
+      // NEON loads with post-increment base updates:
+      VLD1_UPD,
+      VLD2_UPD,
+      VLD3_UPD,
+      VLD4_UPD,
+      VLD2LN_UPD,
+      VLD3LN_UPD,
+      VLD4LN_UPD,
+      VLD2DUP_UPD,
+      VLD3DUP_UPD,
+      VLD4DUP_UPD,
+
+      // NEON stores with post-increment base updates:
+      VST1_UPD,
+      VST2_UPD,
+      VST3_UPD,
+      VST4_UPD,
+      VST2LN_UPD,
+      VST3LN_UPD,
+      VST4LN_UPD,
+
+      // 64-bit atomic ops (value split into two registers)
+      ATOMADD64_DAG,
+      ATOMSUB64_DAG,
+      ATOMOR64_DAG,
+      ATOMXOR64_DAG,
+      ATOMAND64_DAG,
+      ATOMNAND64_DAG,
+      ATOMSWAP64_DAG,
+      ATOMCMPXCHG64_DAG,
+      ATOMMIN64_DAG,
+      ATOMUMIN64_DAG,
+      ATOMMAX64_DAG,
+      ATOMUMAX64_DAG
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace ARM {
+    bool isBitFieldInvertedMask(unsigned v);
+  }
+
+  //===--------------------------------------------------------------------===//
+  //  ARMTargetLowering - ARM Implementation of the TargetLowering interface
+
+  class ARMTargetLowering : public TargetLowering {
+  public:
+    explicit ARMTargetLowering(TargetMachine &TM);
+
+    virtual unsigned getJumpTableEncoding() const;
+
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG) const;
+
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    virtual bool isSelectSupported(SelectSupportKind Kind) const {
+      // ARM does not support scalar condition selects on vectors.
+      return (Kind != ScalarCondVectorVal);
+    }
+
+    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+    virtual EVT getSetCCResultType(EVT VT) const;
+
+    virtual MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const;
+
+    virtual void
+    AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const;
+
+    SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const;
+
+    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// unaligned memory accesses of the specified type. Returns whether it
+    /// is "fast" by reference in the second argument.
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+
+    virtual EVT getOptimalMemOpType(uint64_t Size,
+                                    unsigned DstAlign, unsigned SrcAlign,
+                                    bool IsMemset, bool ZeroMemset,
+                                    bool MemcpyStrSrc,
+                                    MachineFunction &MF) const;
+
+    using TargetLowering::isZExtFree;
+    virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
+    bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    virtual bool isLegalICmpImmediate(int64_t Imm) const;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    virtual bool isLegalAddImmediate(int64_t Imm) const;
+
+    /// getPreIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if the node's address
+    /// can be legally represented as pre-indexed load / store address.
+    virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                           SDValue &Offset,
+                                           ISD::MemIndexedMode &AM,
+                                           SelectionDAG &DAG) const;
+
+    /// getPostIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if this node can be
+    /// combined with a load / store to form a post-indexed load / store.
+    virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                            SDValue &Base, SDValue &Offset,
+                                            ISD::MemIndexedMode &AM,
+                                            SelectionDAG &DAG) const;
+
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                APInt &KnownZero,
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth) const;
+
+
+    virtual bool ExpandInlineAsm(CallInst *CI) const;
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const;
+
+    std::pair<unsigned, const TargetRegisterClass*>
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   EVT VT) const;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    virtual void LowerAsmOperandForConstraint(SDValue Op,
+                                              std::string &Constraint,
+                                              std::vector<SDValue> &Ops,
+                                              SelectionDAG &DAG) const;
+
+    const ARMSubtarget* getSubtarget() const {
+      return Subtarget;
+    }
+
+    /// getRegClassFor - Return the register class that should be used for the
+    /// specified value type.
+    virtual const TargetRegisterClass *getRegClassFor(MVT VT) const;
+
+    /// getMaximalGlobalOffset - Returns the maximal possible offset which can
+    /// be used for loads / stores from the global.
+    virtual unsigned getMaximalGlobalOffset() const;
+
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                                     const TargetLibraryInfo *libInfo) const;
+
+    Sched::Preference getSchedulingPreference(SDNode *N) const;
+
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const;
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+    virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                    const CallInst &I,
+                                    unsigned Intrinsic) const;
+  protected:
+    std::pair<const TargetRegisterClass*, uint8_t>
+    findRepresentativeClass(MVT VT) const;
+
+  private:
+    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const ARMSubtarget *Subtarget;
+
+    const TargetRegisterInfo *RegInfo;
+
+    const InstrItineraryData *Itins;
+
+    /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
+    ///
+    unsigned ARMPCLabelIndex;
+
+    void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
+    void addDRTypeForNEON(MVT VT);
+    void addQRTypeForNEON(MVT VT);
+
+    typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
+    void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
+                          SDValue Chain, SDValue &Arg,
+                          RegsToPassVector &RegsToPass,
+                          CCValAssign &VA, CCValAssign &NextVA,
+                          SDValue &StackPtr,
+                          SmallVector<SDValue, 8> &MemOpChains,
+                          ISD::ArgFlagsTy Flags) const;
+    SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+                                 SDValue &Root, SelectionDAG &DAG,
+                                 DebugLoc dl) const;
+
+    CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
+                                  bool isVarArg) const;
+    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,
+                             ISD::ArgFlagsTy Flags) const;
+    SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+                                    const ARMSubtarget *Subtarget) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                            SelectionDAG &DAG) const;
+    SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                 SelectionDAG &DAG,
+                                 TLSModel::Model model) const;
+    SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG,
+                            const ARMSubtarget *ST) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                              const ARMSubtarget *ST) const;
+
+    SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+
+    void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
+                              DebugLoc dl, SDValue &Chain,
+                              const Value *OrigArg,
+                              unsigned OffsetFromOrigArg,
+                              unsigned ArgOffset,
+                              bool ForceMutable = false)
+      const;
+
+    void computeRegArea(CCState &CCInfo, MachineFunction &MF,
+                        unsigned &VARegSize, unsigned &VARegSaveSize) const;
+
+    virtual SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    /// HandleByVal - Target-specific cleanup for ByVal support.
+    virtual void HandleByVal(CCState *, unsigned &, unsigned) const;
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool IsEligibleForTailCallOptimization(SDValue Callee,
+                                           CallingConv::ID CalleeCC,
+                                           bool isVarArg,
+                                           bool isCalleeStructRet,
+                                           bool isCallerStructRet,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                           SelectionDAG& DAG) const;
+
+    virtual bool CanLowerReturn(CallingConv::ID CallConv,
+                                MachineFunction &MF, bool isVarArg,
+                                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                LLVMContext &Context) const;
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl, SelectionDAG &DAG) const;
+
+    virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
+
+    virtual bool mayBeEmittedAsTailCall(CallInst *CI) const;
+
+    SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                      SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const;
+    SDValue getVFPCmp(SDValue LHS, SDValue RHS,
+                      SelectionDAG &DAG, DebugLoc dl) const;
+    SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
+
+    SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
+
+    MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI,
+                                         MachineBasicBlock *BB,
+                                         unsigned Size) const;
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+                                        MachineBasicBlock *BB,
+                                        unsigned Size,
+                                        unsigned BinOpcode) const;
+    MachineBasicBlock *EmitAtomicBinary64(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned Op1,
+                                          unsigned Op2,
+                                          bool NeedsCarry = false,
+                                          bool IsCmpxchg = false,
+                                          bool IsMinMax = false,
+                                          ARMCC::CondCodes CC = ARMCC::AL) const;
+    MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI,
+                                               MachineBasicBlock *BB,
+                                               unsigned Size,
+                                               bool signExtend,
+                                               ARMCC::CondCodes Cond) const;
+
+    void SetupEntryBlockForSjLj(MachineInstr *MI,
+                                MachineBasicBlock *MBB,
+                                MachineBasicBlock *DispatchBB, int FI) const;
+
+    MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr *MI,
+                                             MachineBasicBlock *MBB) const;
+
+    bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitStructByval(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const;
+  };
+
+  enum NEONModImmType {
+    VMOVModImm,
+    VMVNModImm,
+    OtherModImm
+  };
+
+
+  namespace ARM {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  }
+}
+
+#endif  // ARMISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
new file mode 100644
index 000000000000..67a6820932fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -0,0 +1,2094 @@
+//===-- ARMInstrFormats.td - ARM Instruction Formats -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// ARM Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+  bits<6> Value = val;
+}
+
+def Pseudo        : Format<0>;
+def MulFrm        : Format<1>;
+def BrFrm         : Format<2>;
+def BrMiscFrm     : Format<3>;
+
+def DPFrm         : Format<4>;
+def DPSoRegRegFrm    : Format<5>;
+
+def LdFrm         : Format<6>;
+def StFrm         : Format<7>;
+def LdMiscFrm     : Format<8>;
+def StMiscFrm     : Format<9>;
+def LdStMulFrm    : Format<10>;
+
+def LdStExFrm     : Format<11>;
+
+def ArithMiscFrm  : Format<12>;
+def SatFrm        : Format<13>;
+def ExtFrm        : Format<14>;
+
+def VFPUnaryFrm   : Format<15>;
+def VFPBinaryFrm  : Format<16>;
+def VFPConv1Frm   : Format<17>;
+def VFPConv2Frm   : Format<18>;
+def VFPConv3Frm   : Format<19>;
+def VFPConv4Frm   : Format<20>;
+def VFPConv5Frm   : Format<21>;
+def VFPLdStFrm    : Format<22>;
+def VFPLdStMulFrm : Format<23>;
+def VFPMiscFrm    : Format<24>;
+
+def ThumbFrm      : Format<25>;
+def MiscFrm       : Format<26>;
+
+def NGetLnFrm     : Format<27>;
+def NSetLnFrm     : Format<28>;
+def NDupFrm       : Format<29>;
+def NLdStFrm      : Format<30>;
+def N1RegModImmFrm: Format<31>;
+def N2RegFrm      : Format<32>;
+def NVCVTFrm      : Format<33>;
+def NVDupLnFrm    : Format<34>;
+def N2RegVShLFrm  : Format<35>;
+def N2RegVShRFrm  : Format<36>;
+def N3RegFrm      : Format<37>;
+def N3RegVShFrm   : Format<38>;
+def NVExtFrm      : Format<39>;
+def NVMulSLFrm    : Format<40>;
+def NVTBLFrm      : Format<41>;
+def DPSoRegImmFrm  : Format<42>;
+
+// Misc flags.
+
+// The instruction has an Rn register operand.
+// UnaryDP - Indicates this is a unary data processing instruction, i.e.
+// it doesn't have a Rn operand.
+class UnaryDP    { bit isUnaryDataProc = 1; }
+
+// Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+// a 16-bit Thumb instruction if certain conditions are met.
+class Xform16Bit { bit canXformTo16Bit = 1; }
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction flags.  These need to match ARMBaseInstrInfo.h.
+//
+
+// FIXME: Once the JIT is MC-ized, these can go away.
+// Addressing mode.
+class AddrMode<bits<5> val> {
+  bits<5> Value = val;
+}
+def AddrModeNone    : AddrMode<0>;
+def AddrMode1       : AddrMode<1>;
+def AddrMode2       : AddrMode<2>;
+def AddrMode3       : AddrMode<3>;
+def AddrMode4       : AddrMode<4>;
+def AddrMode5       : AddrMode<5>;
+def AddrMode6       : AddrMode<6>;
+def AddrModeT1_1    : AddrMode<7>;
+def AddrModeT1_2    : AddrMode<8>;
+def AddrModeT1_4    : AddrMode<9>;
+def AddrModeT1_s    : AddrMode<10>;
+def AddrModeT2_i12  : AddrMode<11>;
+def AddrModeT2_i8   : AddrMode<12>;
+def AddrModeT2_so   : AddrMode<13>;
+def AddrModeT2_pc   : AddrMode<14>;
+def AddrModeT2_i8s4 : AddrMode<15>;
+def AddrMode_i12    : AddrMode<16>;
+
+// Load / store index mode.
+class IndexMode<bits<2> val> {
+  bits<2> Value = val;
+}
+def IndexModeNone : IndexMode<0>;
+def IndexModePre  : IndexMode<1>;
+def IndexModePost : IndexMode<2>;
+def IndexModeUpd  : IndexMode<3>;
+
+// Instruction execution domain.
+class Domain<bits<3> val> {
+  bits<3> Value = val;
+}
+def GenericDomain : Domain<0>;
+def VFPDomain     : Domain<1>; // Instructions in VFP domain only
+def NeonDomain    : Domain<2>; // Instructions in Neon domain only
+def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains
+def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8
+
+//===----------------------------------------------------------------------===//
+// ARM special operands.
+//
+
+// ARM imod and iflag operands, used only by the CPS instruction.
+def imod_op : Operand<i32> {
+  let PrintMethod = "printCPSIMod";
+}
+
+def ProcIFlagsOperand : AsmOperandClass {
+  let Name = "ProcIFlags";
+  let ParserMethod = "parseProcIFlagsOperand";
+}
+def iflags_op : Operand<i32> {
+  let PrintMethod = "printCPSIFlag";
+  let ParserMatchClass = ProcIFlagsOperand;
+}
+
+// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
+// register whose default is 0 (no register).
+def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
+                                     (ops (i32 14), (i32 zero_reg))> {
+  let PrintMethod = "printPredicateOperand";
+  let ParserMatchClass = CondCodeOperand;
+  let DecoderMethod = "DecodePredicateOperand";
+}
+
+// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
+def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
+  let EncoderMethod = "getCCOutOpValue";
+  let PrintMethod = "printSBitModifierOperand";
+  let ParserMatchClass = CCOutOperand;
+  let DecoderMethod = "DecodeCCOutOperand";
+}
+
+// Same as cc_out except it defaults to setting CPSR.
+def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
+  let EncoderMethod = "getCCOutOpValue";
+  let PrintMethod = "printSBitModifierOperand";
+  let ParserMatchClass = CCOutOperand;
+  let DecoderMethod = "DecodeCCOutOperand";
+}
+
+// ARM special operands for disassembly only.
+//
+def SetEndAsmOperand : ImmAsmOperand {
+  let Name = "SetEndImm";
+  let ParserMethod = "parseSetEndImm";
+}
+def setend_op : Operand<i32> {
+  let PrintMethod = "printSetendOperand";
+  let ParserMatchClass = SetEndAsmOperand;
+}
+
+def MSRMaskOperand : AsmOperandClass {
+  let Name = "MSRMask";
+  let ParserMethod = "parseMSRMaskOperand";
+}
+def msr_mask : Operand<i32> {
+  let PrintMethod = "printMSRMaskOperand";
+  let DecoderMethod = "DecodeMSRMask";
+  let ParserMatchClass = MSRMaskOperand;
+}
+
+// Shift Right Immediate - A shift right immediate is encoded differently from
+// other shift immediates. The imm6 field is encoded like so:
+//
+//    Offset    Encoding
+//     8        imm6<5:3> = '001', 8 - <imm> is encoded in imm6<2:0>
+//     16       imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0>
+//     32       imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0>
+//     64       64 - <imm> is encoded in imm6<5:0>
+def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; }
+def shr_imm8  : Operand<i32> {
+  let EncoderMethod = "getShiftRight8Imm";
+  let DecoderMethod = "DecodeShiftRight8Imm";
+  let ParserMatchClass = shr_imm8_asm_operand;
+}
+def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; }
+def shr_imm16 : Operand<i32> {
+  let EncoderMethod = "getShiftRight16Imm";
+  let DecoderMethod = "DecodeShiftRight16Imm";
+  let ParserMatchClass = shr_imm16_asm_operand;
+}
+def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; }
+def shr_imm32 : Operand<i32> {
+  let EncoderMethod = "getShiftRight32Imm";
+  let DecoderMethod = "DecodeShiftRight32Imm";
+  let ParserMatchClass = shr_imm32_asm_operand;
+}
+def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; }
+def shr_imm64 : Operand<i32> {
+  let EncoderMethod = "getShiftRight64Imm";
+  let DecoderMethod = "DecodeShiftRight64Imm";
+  let ParserMatchClass = shr_imm64_asm_operand;
+}
+
+//===----------------------------------------------------------------------===//
+// ARM Assembler alias templates.
+//
+class ARMInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[IsARM]>;
+class  tInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[IsThumb]>;
+class t2InstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>;
+class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>;
+class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>;
+class NEONInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasNEON]>;
+
+
+class VFP2MnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>,
+          Requires<[HasVFP2]>;
+class NEONMnemonicAlias<string src, string dst> : MnemonicAlias<src, dst>,
+          Requires<[HasNEON]>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction templates.
+//
+
+
+class InstTemplate<AddrMode am, int sz, IndexMode im,
+                   Format f, Domain d, string cstr, InstrItinClass itin>
+  : Instruction {
+  let Namespace = "ARM";
+
+  AddrMode AM = am;
+  int Size = sz;
+  IndexMode IM = im;
+  bits<2> IndexModeBits = IM.Value;
+  Format F = f;
+  bits<6> Form = F.Value;
+  Domain D = d;
+  bit isUnaryDataProc = 0;
+  bit canXformTo16Bit = 0;
+  // The instruction is a 16-bit flag setting Thumb instruction. Used
+  // by the parser to determine whether to require the 'S' suffix on the
+  // mnemonic (when not in an IT block) or preclude it (when in an IT block).
+  bit thumbArithFlagSetting = 0;
+
+  // If this is a pseudo instruction, mark it isCodeGenOnly.
+  let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+  // The layout of TSFlags should be kept in sync with ARMBaseInfo.h.
+  let TSFlags{4-0}   = AM.Value;
+  let TSFlags{6-5}   = IndexModeBits;
+  let TSFlags{12-7} = Form;
+  let TSFlags{13}    = isUnaryDataProc;
+  let TSFlags{14}    = canXformTo16Bit;
+  let TSFlags{17-15} = D.Value;
+  let TSFlags{18}    = thumbArithFlagSetting;
+
+  let Constraints = cstr;
+  let Itinerary = itin;
+}
+
+class Encoding {
+  field bits<32> Inst;
+  // Mask of bits that cause an encoding to be UNPREDICTABLE.
+  // If a bit is set, then if the corresponding bit in the
+  // target encoding differs from its value in the "Inst" field,
+  // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
+  field bits<32> Unpredictable = 0;
+  // SoftFail is the generic name for this field, but we alias it so
+  // as to make it more obvious what it means in ARM-land.
+  field bits<32> SoftFail = Unpredictable;
+}
+
+class InstARM<AddrMode am, int sz, IndexMode im,
+              Format f, Domain d, string cstr, InstrItinClass itin>
+  : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding {
+  let DecoderNamespace = "ARM";
+}
+
+// This Encoding-less class is used by Thumb1 to specify the encoding bits later
+// on by adding flavors to specific instructions.
+class InstThumb<AddrMode am, int sz, IndexMode im,
+                Format f, Domain d, string cstr, InstrItinClass itin>
+  : InstTemplate<am, sz, im, f, d, cstr, itin> {
+  let DecoderNamespace = "Thumb";
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class AsmPseudoInst<string asm, dag iops>
+  : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+                 "", NoItinerary> {
+  let OutOperandList = (outs);
+  let InOperandList = iops;
+  let Pattern = [];
+  let isCodeGenOnly = 0; // So we get asm matcher for it.
+  let AsmString = asm;
+  let isPseudo = 1;
+}
+
+class ARMAsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
+        Requires<[IsARM]>;
+class tAsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
+        Requires<[IsThumb]>;
+class t2AsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
+        Requires<[IsThumb2]>;
+class VFP2AsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
+        Requires<[HasVFP2]>;
+class NEONAsmPseudo<string asm, dag iops> : AsmPseudoInst<asm, iops>,
+        Requires<[HasNEON]>;
+
+// Pseudo instructions for the code generator.
+class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern>
+  : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo,
+                 GenericDomain, "", itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let Pattern = pattern;
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+// PseudoInst that's ARM-mode only.
+class ARMPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// PseudoInst that's Thumb-mode only.
+class tPseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// PseudoInst that's Thumb2-mode only.
+class t2PseudoInst<dag oops, dag iops, int sz, InstrItinClass itin,
+                    list<dag> pattern>
+  : PseudoInst<oops, iops, itin, pattern> {
+  let Size = sz;
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+class ARMPseudoExpand<dag oops, dag iops, int sz,
+                      InstrItinClass itin, list<dag> pattern,
+                      dag Result>
+  : ARMPseudoInst<oops, iops, sz, itin, pattern>,
+    PseudoInstExpansion<Result>;
+
+class tPseudoExpand<dag oops, dag iops, int sz,
+                    InstrItinClass itin, list<dag> pattern,
+                    dag Result>
+  : tPseudoInst<oops, iops, sz, itin, pattern>,
+    PseudoInstExpansion<Result>;
+
+class t2PseudoExpand<dag oops, dag iops, int sz,
+                    InstrItinClass itin, list<dag> pattern,
+                    dag Result>
+  : t2PseudoInst<oops, iops, sz, itin, pattern>,
+    PseudoInstExpansion<Result>;
+
+// Almost all ARM instructions are predicable.
+class I<dag oops, dag iops, AddrMode am, int sz,
+        IndexMode im, Format f, InstrItinClass itin,
+        string opc, string asm, string cstr,
+        list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// A few are not predicable
+class InoP<dag oops, dag iops, AddrMode am, int sz,
+           IndexMode im, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr,
+           list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = !strconcat(opc, asm);
+  let Pattern = pattern;
+  let isPredicable = 0;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Same as I except it can optionally modify CPSR. Note it's modeled as an input
+// operand since by default it's a zero register. It will become an implicit def
+// once it's "flipped".
+class sI<dag oops, dag iops, AddrMode am, int sz,
+         IndexMode im, Format f, InstrItinClass itin,
+         string opc, string asm, string cstr,
+         list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  bits<4> p; // Predicate operand
+  bits<1> s; // condition-code set flag ('1' if the insn should set the flags)
+  let Inst{31-28} = p;
+  let Inst{20} = s;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p, cc_out:$s));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Special cases
+class XI<dag oops, dag iops, AddrMode am, int sz,
+         IndexMode im, Format f, InstrItinClass itin,
+         string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+class AI<dag oops, dag iops, Format f, InstrItinClass itin,
+         string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern>;
+class AsI<dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+       opc, asm, "", pattern>;
+class AXI<dag oops, dag iops, Format f, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+       asm, "", pattern>;
+class AInoP<dag oops, dag iops, Format f, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : InoP<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+         opc, asm, "", pattern>;
+
+// Ctrl flow instructions
+class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-24} = opcod;
+}
+class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, 4, IndexModeNone, BrFrm, itin,
+       asm, "", pattern> {
+  let Inst{27-24} = opcod;
+}
+
+// BR_JT instructions
+class JTI<dag oops, dag iops, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, 0, IndexModeNone, BrMiscFrm, itin,
+       asm, "", pattern>;
+
+// Atomic load/store instructions
+class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = 1;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-0}  = 0b111110011111;
+}
+class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = 0;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rd;
+  let Inst{11-4}  = 0b11111001;
+  let Inst{3-0}   = Rt;
+}
+class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
+  : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> addr;
+  let Inst{27-23} = 0b00010;
+  let Inst{22} = b;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-4} = 0b00001001;
+  let Inst{3-0} = Rt2;
+
+  let Unpredictable{11-8} = 0b1111;
+  let DecoderMethod = "DecodeSwap";
+}
+
+// addrmode1 instructions
+class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = 0b00;
+}
+class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+       opc, asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = 0b00;
+}
+class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode1, 4, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = 0b00;
+}
+
+// loads
+
+// LDR/LDRB/STR/STRB/...
+class AI2ldst<bits<3> op, bit isLd, bit isByte, dag oops, dag iops, AddrMode am,
+             Format f, InstrItinClass itin, string opc, string asm,
+             list<dag> pattern>
+  : I<oops, iops, am, 4, IndexModeNone, f, itin, opc, asm,
+      "", pattern> {
+  let Inst{27-25} = op;
+  let Inst{24} = 1;  // 24 == P
+  // 23 == U
+  let Inst{22} = isByte;
+  let Inst{21} = 0;  // 21 == W
+  let Inst{20} = isLd;
+}
+// Indexed load/stores
+class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, 4, im, f, itin,
+      opc, asm, cstr, pattern> {
+  bits<4> Rt;
+  let Inst{27-26} = 0b01;
+  let Inst{24}    = isPre; // P bit
+  let Inst{22}    = isByte; // B bit
+  let Inst{21}    = isPre; // W bit
+  let Inst{20}    = isLd; // L bit
+  let Inst{15-12} = Rt;
+}
+class AI2stridx_reg<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> Rn;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{19-16} = Rn;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+}
+
+class AI2stridx_imm<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> Rn;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{19-16} = Rn;
+  let Inst{11-0} = offset{11-0};
+}
+
+
+// FIXME: Merge with the above class when addrmode2 gets used for STR, STRB
+// but for now use this class for STRT and STRBT.
+class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+               pattern> {
+  // AM2 store w/ two operands: (GPR, am2offset)
+  // {17-14}  Rn
+  // {13}     1 == Rm, 0 == imm12
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<18> addr;
+  let Inst{25} = addr{13};
+  let Inst{23} = addr{12};
+  let Inst{19-16} = addr{17-14};
+  let Inst{11-0} = addr{11-0};
+}
+
+// addrmode3 instructions
+class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
+            InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = 1;            // P bit
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21}    = 0;            // W bit
+  let Inst{20}    = op20;         // L bit
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+class AI3ldstidx<bits<4> op, bit op20, bit isPre, dag oops, dag iops,
+                IndexMode im, Format f, InstrItinClass itin, string opc,
+                string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, im, f, itin,
+      opc, asm, cstr, pattern> {
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = isPre;        // P bit
+  let Inst{21}    = isPre;        // W bit
+  let Inst{20}    = op20;         // L bit
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{7-4}   = op;
+}
+
+// FIXME: Merge with the above class when addrmode2 gets used for LDR, LDRB
+// but for now use this class for LDRSBT, LDRHT, LDSHT.
+class AI3ldstidxT<bits<4> op, bit isLoad, dag oops, dag iops,
+                  IndexMode im, Format f, InstrItinClass itin, string opc,
+                  string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, im, f, itin, opc, asm, cstr, pattern> {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = 0;            // P bit
+  let Inst{21}    = 1;
+  let Inst{20}    = isLoad;       // L bit
+  let Inst{19-16} = addr;         // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{7-4}   = op;
+}
+
+// stores
+class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, 4, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  bits<14> addr;
+  bits<4> Rt;
+  let Inst{27-25} = 0b000;
+  let Inst{24}    = 1;            // P bit
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{21}    = 0;            // W bit
+  let Inst{20}    = 0;            // L bit
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Rt;           // Rt
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{7-4}   = op;
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+// addrmode4 instructions
+class AXI4<dag oops, dag iops, IndexMode im, Format f, InstrItinClass itin,
+           string asm, string cstr, list<dag> pattern>
+  : XI<oops, iops, AddrMode4, 4, im, f, itin, asm, cstr, pattern> {
+  bits<4>  p;
+  bits<16> regs;
+  bits<4>  Rn;
+  let Inst{31-28} = p;
+  let Inst{27-25} = 0b100;
+  let Inst{22}    = 0; // S bit
+  let Inst{19-16} = Rn;
+  let Inst{15-0}  = regs;
+}
+
+// Unsigned multiply, multiply-accumulate instructions.
+class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b1001;
+  let Inst{20}    = 0; // S bit
+  let Inst{27-21} = opcod;
+}
+class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+       opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b1001;
+  let Inst{27-21} = opcod;
+}
+
+// Most significant word multiply
+class AMul2I<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+             InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{7-4}   = opc7_4;
+  let Inst{20}    = 1;
+  let Inst{27-21} = opcod;
+  let Inst{19-16} = Rd;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+// MSW multiple w/ Ra operand
+class AMul2Ia<bits<7> opcod, bits<4> opc7_4, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMul2I<opcod, opc7_4, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+
+// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y>
+class AMulxyIbase<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{4}     = 0;
+  let Inst{7}     = 1;
+  let Inst{20}    = 0;
+  let Inst{27-21} = opcod;
+  let Inst{6-5}   = bit6_5;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AMulxyI<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{19-16} = Rd;
+}
+
+// AMulxyI with Ra operand
+class AMulxyIa<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyI<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+// SMLAL*
+class AMulxyI64<bits<7> opcod, bits<2> bit6_5, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AMulxyIbase<opcod, bit6_5, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+}
+
+// Extend instructions.
+class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ExtFrm, itin,
+      opc, asm, "", pattern> {
+  // All AExtI instructions have Rd and Rm register operands.
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{15-12} = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{7-4}   = 0b0111;
+  let Inst{9-8}   = 0b00;
+  let Inst{27-20} = opcod;
+
+  let Unpredictable{9-8} = 0b11;
+}
+
+// Misc Arithmetic instructions.
+class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{27-20} = opcod;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-8}  = 0b1111;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+
+// Division instructions.
+class ADivA1I<bits<3> opcod, dag oops, dag iops,
+              InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{27-23} = 0b01110;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = 0b1111;
+  let Inst{11-8}  = Rm;
+  let Inst{7-4}   = 0b0001;
+  let Inst{3-0}   = Rn;
+}
+
+// PKH instructions
+def PKHLSLAsmOperand : ImmAsmOperand {
+  let Name = "PKHLSLImm";
+  let ParserMethod = "parsePKHLSLImm";
+}
+def pkh_lsl_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 32; }]>{
+  let PrintMethod = "printPKHLSLShiftImm";
+  let ParserMatchClass = PKHLSLAsmOperand;
+}
+def PKHASRAsmOperand : AsmOperandClass {
+  let Name = "PKHASRImm";
+  let ParserMethod = "parsePKHASRImm";
+}
+def pkh_asr_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]>{
+  let PrintMethod = "printPKHASRShiftImm";
+  let ParserMatchClass = PKHASRAsmOperand;
+}
+
+class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<5> sh;
+  let Inst{27-20} = opcod;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = sh;
+  let Inst{6}     = tb;
+  let Inst{5-4}   = 0b01;
+  let Inst{3-0}   = Rm;
+}
+
+//===----------------------------------------------------------------------===//
+
+// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode.
+class ARMPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM];
+}
+class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5T];
+}
+class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5TE];
+}
+// ARMV5MOPat - Same as ARMV5TEPat with UseMulOps.
+class ARMV5MOPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5TE, UseMulOps];
+}
+class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV6];
+}
+
+//===----------------------------------------------------------------------===//
+// Thumb Instruction Format Definitions.
+//
+
+class ThumbI<dag oops, dag iops, AddrMode am, int sz,
+             InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// TI - Thumb instruction.
+class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>;
+
+// Two-address instructions
+class TIt<dag oops, dag iops, InstrItinClass itin, string asm,
+          list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, 2, itin, asm, "$lhs = $dst",
+           pattern>;
+
+// tBL, tBX 32-bit instructions
+class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3,
+           dag oops, dag iops, InstrItinClass itin, string asm,
+           list<dag> pattern>
+    : ThumbI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>,
+      Encoding {
+  let Inst{31-27} = opcod1;
+  let Inst{15-14} = opcod2;
+  let Inst{12}    = opcod3;
+}
+
+// BR_JT instructions
+class TJTI<dag oops, dag iops, InstrItinClass itin, string asm,
+           list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
+
+// Thumb1 only
+class Thumb1I<dag oops, dag iops, AddrMode am, int sz,
+              InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+}
+
+class T1I<dag oops, dag iops, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, 2, itin, asm, "", pattern>;
+class T1Ix2<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>;
+
+// Two-address instructions
+class T1It<dag oops, dag iops, InstrItinClass itin,
+           string asm, string cstr, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, 2, itin,
+            asm, cstr, pattern>;
+
+// Thumb1 instruction that can either be predicated or set CPSR.
+class Thumb1sI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = !con(oops, (outs s_cc_out:$s));
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
+  let Pattern = pattern;
+  let thumbArithFlagSetting = 1;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  let DecoderNamespace = "ThumbSBit";
+}
+
+class T1sI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1sIt<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1sI<oops, iops, AddrModeNone, 2, itin, opc, asm,
+             "$Rn = $Rdn", pattern>;
+
+// Thumb1 instruction that can be predicated.
+class Thumb1pI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+}
+
+class T1pI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1pIt<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeNone, 2, itin, opc, asm,
+             "$Rn = $Rdn", pattern>;
+
+class T1pIs<dag oops, dag iops,
+            InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeT1_s, 2, itin, opc, asm, "", pattern>;
+
+class Encoding16 : Encoding {
+  let Inst{31-16} = 0x0000;
+}
+
+// A6.2 16-bit Thumb instruction encoding
+class T1Encoding<bits<6> opcode> : Encoding16 {
+  let Inst{15-10} = opcode;
+}
+
+// A6.2.1 Shift (immediate), add, subtract, move, and compare encoding.
+class T1General<bits<5> opcode> : Encoding16 {
+  let Inst{15-14} = 0b00;
+  let Inst{13-9} = opcode;
+}
+
+// A6.2.2 Data-processing encoding.
+class T1DataProcessing<bits<4> opcode> : Encoding16 {
+  let Inst{15-10} = 0b010000;
+  let Inst{9-6} = opcode;
+}
+
+// A6.2.3 Special data instructions and branch and exchange encoding.
+class T1Special<bits<4> opcode> : Encoding16 {
+  let Inst{15-10} = 0b010001;
+  let Inst{9-6}   = opcode;
+}
+
+// A6.2.4 Load/store single data item encoding.
+class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 {
+  let Inst{15-12} = opA;
+  let Inst{11-9}  = opB;
+}
+class T1LdStSP<bits<3> opB>   : T1LoadStore<0b1001, opB>; // SP relative
+
+class T1BranchCond<bits<4> opcode> : Encoding16 {
+  let Inst{15-12} = opcode;
+}
+
+// Helper classes to encode Thumb1 loads and stores. For immediates, the
+// following bits are used for "opA" (see A6.2.4):
+//
+//   0b0110 => Immediate, 4 bytes
+//   0b1000 => Immediate, 2 bytes
+//   0b0111 => Immediate, 1 byte
+class T1pILdStEncode<bits<3> opcode, dag oops, dag iops, AddrMode am,
+                     InstrItinClass itin, string opc, string asm,
+                     list<dag> pattern>
+  : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>,
+    T1LoadStore<0b0101, opcode> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{8-6} = addr{5-3};    // Rm
+  let Inst{5-3} = addr{2-0};    // Rn
+  let Inst{2-0} = Rt;
+}
+class T1pILdStEncodeImm<bits<4> opA, bit opB, dag oops, dag iops, AddrMode am,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : Thumb1pI<oops, iops, am, 2, itin, opc, asm, "", pattern>,
+    T1LoadStore<opA, {opB,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-6} = addr{7-3};   // imm5
+  let Inst{5-3}  = addr{2-0};   // Rn
+  let Inst{2-0}  = Rt;
+}
+
+// A6.2.5 Miscellaneous 16-bit instructions encoding.
+class T1Misc<bits<7> opcode> : Encoding16 {
+  let Inst{15-12} = 0b1011;
+  let Inst{11-5} = opcode;
+}
+
+// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable.
+class Thumb2I<dag oops, dag iops, AddrMode am, int sz,
+              InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+}
+
+// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an
+// input operand since by default it's a zero register. It will become an
+// implicit def once it's "flipped".
+//
+// FIXME: This uses unified syntax so {s} comes before {p}. We should make it
+// more consistent.
+class Thumb2sI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  bits<1> s; // condition-code set flag ('1' if the insn should set the flags)
+  let Inst{20} = s;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p, cc_out:$s));
+  let AsmString = !strconcat(opc, "${s}${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+}
+
+// Special cases
+class Thumb2XI<dag oops, dag iops, AddrMode am, int sz,
+               InstrItinClass itin,
+               string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+}
+
+class ThumbXI<dag oops, dag iops, AddrMode am, int sz,
+              InstrItinClass itin,
+              string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  let DecoderNamespace = "Thumb";
+}
+
+class T2I<dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>;
+class T2Ii12<dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i12, 4, itin, opc, asm, "",pattern>;
+class T2Ii8<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8, 4, itin, opc, asm, "", pattern>;
+class T2Iso<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_so, 4, itin, opc, asm, "", pattern>;
+class T2Ipc<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_pc, 4, itin, opc, asm, "", pattern>;
+class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
+            pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<13> addr;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24}    = P;
+  let Inst{23}    = addr{8};
+  let Inst{22}    = 1;
+  let Inst{21}    = W;
+  let Inst{20}    = isLoad;
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11-8}  = Rt2{3-0};
+  let Inst{7-0}   = addr{7-0};
+}
+class T2Ii8s4post<bit P, bit W, bit isLoad, dag oops, dag iops,
+                  InstrItinClass itin, string opc, string asm, string cstr,
+                  list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
+            pattern> {
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> addr;
+  bits<9> imm;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24}    = P;
+  let Inst{23}    = imm{8};
+  let Inst{22}    = 1;
+  let Inst{21}    = W;
+  let Inst{20}    = isLoad;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11-8}  = Rt2{3-0};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2sI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb2sI<oops, iops, AddrModeNone, 4, itin, opc, asm, "", pattern>;
+
+class T2XI<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, "", pattern>;
+class T2JTI<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
+
+// Move to/from coprocessor instructions
+class T2Cop<bits<4> opc, dag oops, dag iops, string asm, list<dag> pattern>
+  : T2XI <oops, iops, NoItinerary, asm, pattern>, Requires<[IsThumb2]> {
+  let Inst{31-28} = opc;
+}
+
+// Two-address instructions
+class T2XIt<dag oops, dag iops, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, cstr, pattern>;
+
+// T2Ipreldst - Thumb2 pre-indexed load / store instructions.
+class T2Ipreldst<bit signed, bits<2> opcod, bit load, bit pre,
+                 dag oops, dag iops,
+                 AddrMode am, IndexMode im, InstrItinClass itin,
+                 string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24}    = signed;
+  let Inst{23}    = 0;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = load;
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11}    = 1;
+  // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
+  let Inst{10}    = pre; // The P bit.
+  let Inst{9}     = addr{8}; // Sign bit
+  let Inst{8}     = 1; // The W bit.
+  let Inst{7-0}   = addr{7-0};
+
+  let DecoderMethod = "DecodeT2LdStPre";
+}
+
+// T2Ipostldst - Thumb2 post-indexed load / store instructions.
+class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre,
+                 dag oops, dag iops,
+                 AddrMode am, IndexMode im, InstrItinClass itin,
+                 string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let DecoderNamespace = "Thumb2";
+
+  bits<4> Rt;
+  bits<4> Rn;
+  bits<9> offset;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24}    = signed;
+  let Inst{23}    = 0;
+  let Inst{22-21} = opcod;
+  let Inst{20}    = load;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rt{3-0};
+  let Inst{11}    = 1;
+  // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
+  let Inst{10}    = pre; // The P bit.
+  let Inst{9}     = offset{8}; // Sign bit
+  let Inst{8}     = 1; // The W bit.
+  let Inst{7-0}   = offset{7-0};
+
+  let DecoderMethod = "DecodeT2LdStPre";
+}
+
+// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode.
+class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only, HasV5T];
+}
+
+// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
+class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+}
+
+// T2v6Pat - Same as Pat<>, but requires V6T2 Thumb2 mode.
+class T2v6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2, HasV6T2];
+}
+
+// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
+class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM VFP Instruction templates.
+//
+
+// Almost all VFP instructions are predicable.
+class VFPI<dag oops, dag iops, AddrMode am, int sz,
+           IndexMode im, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", asm);
+  let Pattern = pattern;
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+  let DecoderNamespace = "VFP";
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+// Special cases
+class VFPXI<dag oops, dag iops, AddrMode am, int sz,
+            IndexMode im, Format f, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  bits<4> p;
+  let Inst{31-28} = p;
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+  let DecoderNamespace = "VFP";
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrModeNone, 4, IndexModeNone, f, itin,
+         opc, asm, "", pattern> {
+  let PostEncoderMethod = "VFPThumb2PostEncoder";
+}
+
+// ARM VFP addrmode5 loads and stores
+class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+           InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+         VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // Instruction operands.
+  bits<5>  Dd;
+  bits<13> addr;
+
+  // Encode instruction operands.
+  let Inst{23}    = addr{8};      // U (add = (U == '1'))
+  let Inst{22}    = Dd{4};
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Dd{3-0};
+  let Inst{7-0}   = addr{7-0};    // imm8
+
+  // TODO: Mark the instructions with the appropriate subtarget info.
+  let Inst{27-24} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+
+  // Loads & stores operate on both NEON and VFP pipelines.
+  let D = VFPNeonDomain;
+}
+
+class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+           InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
+         VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // Instruction operands.
+  bits<5>  Sd;
+  bits<13> addr;
+
+  // Encode instruction operands.
+  let Inst{23}    = addr{8};      // U (add = (U == '1'))
+  let Inst{22}    = Sd{0};
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{15-12} = Sd{4-1};
+  let Inst{7-0}   = addr{7-0};    // imm8
+
+  // TODO: Mark the instructions with the appropriate subtarget info.
+  let Inst{27-24} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+
+  // Loads & stores operate on both NEON and VFP pipelines.
+  let D = VFPNeonDomain;
+}
+
+// VFP Load / store multiple pseudo instructions.
+class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr,
+                     list<dag> pattern>
+  : InstARM<AddrMode4, 4, IndexModeNone, Pseudo, VFPNeonDomain,
+            cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+// Load / store multiple
+class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode4, 4, im,
+          VFPLdStMulFrm, itin, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4>  Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = regs{12};
+  let Inst{15-12} = regs{11-8};
+  let Inst{7-0}   = regs{7-0};
+
+  // TODO: Mark the instructions with the appropriate subtarget info.
+  let Inst{27-25} = 0b110;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+}
+
+class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode4, 4, im,
+          VFPLdStMulFrm, itin, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4> Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = regs{8};
+  let Inst{15-12} = regs{12-9};
+  let Inst{7-0}   = regs{7-0};
+
+  // TODO: Mark the instructions with the appropriate subtarget info.
+  let Inst{27-25} = 0b110;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+}
+
+// Double precision, unary
+class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+           string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Double precision, binary
+class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+           dag iops, InstrItinClass itin, string opc, string asm,
+           list<dag> pattern>
+  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dn;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{19-16} = Dn{3-0};
+  let Inst{7}     = Dn{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+// Single precision, unary
+class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+           string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Single precision unary, if no NEON. Same as ASuI except not available if
+// NEON is enabled.
+class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+            string asm, list<dag> pattern>
+  : ASuI<opcod1, opcod2, opcod3, opcod4, opcod5, oops, iops, itin, opc, asm,
+         pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
+// Single precision, binary
+class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
+           InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+// Single precision binary, if no NEON. Same as ASbI except not available if
+// NEON is enabled.
+class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+            dag iops, InstrItinClass itin, string opc, string asm,
+            list<dag> pattern>
+  : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+// VFP conversion instructions
+class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+               dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+               list<dag> pattern>
+  : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> {
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-8}  = opcod4;
+  let Inst{6}     = 1;
+  let Inst{4}     = 0;
+}
+
+// VFP conversion between floating-point and fixed-point
+class AVConv1XI<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, bit op5,
+                dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+                list<dag> pattern>
+  : AVConv1I<op1, op2, op3, op4, oops, iops, itin, opc, asm, pattern> {
+  bits<5> fbits;
+  // size (fixed-point number): sx == 0 ? 16 : 32
+  let Inst{7} = op5; // sx
+  let Inst{5} = fbits{0};
+  let Inst{3-0} = fbits{4-1};
+}
+
+// VFP conversion instructions, if no NEON
+class AVConv1In<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
+class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f,
+               InstrItinClass itin,
+               string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, f, itin, opc, asm, pattern> {
+  let Inst{27-20} = opcod1;
+  let Inst{11-8}  = opcod2;
+  let Inst{4}     = 1;
+}
+
+class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>;
+
+class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>;
+
+class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>;
+
+class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>;
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM NEON Instruction templates.
+//
+
+class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+            InstrItinClass itin, string opc, string dt, string asm, string cstr,
+            list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+}
+
+// Same as NeonI except it does not have a "data type" specifier.
+class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+             InstrItinClass itin, string opc, string asm, string cstr,
+             list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+}
+
+class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NeonI<oops, iops, AddrMode6, IndexModeNone, NLdStFrm, itin, opc, dt, asm,
+          cstr, pattern> {
+  let Inst{31-24} = 0b11110100;
+  let Inst{23}    = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{7-4}   = op7_4;
+
+  let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder";
+  let DecoderNamespace = "NEONLoadStore";
+
+  bits<5> Vd;
+  bits<6> Rn;
+  bits<4> Rm;
+
+  let Inst{22}    = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{19-16} = Rn{3-0};
+  let Inst{3-0}   = Rm{3-0};
+}
+
+class NLdStLn<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NLdSt<op23, op21_20, op11_8, op7_4, oops, iops, itin, opc,
+          dt, asm, cstr, pattern> {
+  bits<3> lane;
+}
+
+class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr>
+  : InstARM<AddrMode6, 4, IndexModeNone, Pseudo, NeonDomain, cstr,
+            itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  list<Predicate> Predicates = [HasNEON];
+}
+
+class PseudoNeonI<dag oops, dag iops, InstrItinClass itin, string cstr,
+                  list<dag> pattern>
+  : InstARM<AddrModeNone, 4, IndexModeNone, Pseudo, NeonDomain, cstr,
+            itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+}
+
+class NDataI<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr,
+          pattern> {
+  let Inst{31-25} = 0b1111001;
+  let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+  let DecoderNamespace = "NEONData";
+}
+
+class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : NeonXI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, asm,
+           cstr, pattern> {
+  let Inst{31-25} = 0b1111001;
+  let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+  let DecoderNamespace = "NEONData";
+}
+
+// NEON "one register and a modified immediate" format.
+class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
+               bit op5, bit op4,
+               dag oops, dag iops, InstrItinClass itin,
+               string opc, string dt, string asm, string cstr,
+               list<dag> pattern>
+  : NDataI<oops, iops, N1RegModImmFrm, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{23}    = op23;
+  let Inst{21-19} = op21_19;
+  let Inst{11-8}  = op11_8;
+  let Inst{7}     = op7;
+  let Inst{6}     = op6;
+  let Inst{5}     = op5;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<13> SIMM;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{24}    = SIMM{7};
+  let Inst{18-16} = SIMM{6-4};
+  let Inst{3-0}   = SIMM{3-0};
+  let DecoderMethod = "DecodeNEONModImmInstruction";
+}
+
+// NEON 2 vector register format.
+class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+          bits<5> op11_7, bit op6, bit op4,
+          dag oops, dag iops, InstrItinClass itin,
+          string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, N2RegFrm, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11-7}  = op11_7;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+// Same as N2V except it doesn't have a datatype suffix.
+class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+           bits<5> op11_7, bit op6, bit op4,
+           dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : NDataXI<oops, iops, N2RegFrm, itin, opc, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11-7}  = op11_7;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+// NEON 2 vector register with immediate.
+class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24}   = op24;
+  let Inst{23}   = op23;
+  let Inst{11-8} = op11_8;
+  let Inst{7}    = op7;
+  let Inst{6}    = op6;
+  let Inst{4}    = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vm;
+  bits<6> SIMM;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{21-16} = SIMM{5-0};
+}
+
+// NEON 3 vector register format.
+
+class N3VCommon<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
+  : NDataI<oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24}    = op24;
+  let Inst{23}    = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
+class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
+          dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+              oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
+  : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+              oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  bit lane;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = lane;
+}
+
+class N3VLane16<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string dt, string asm, string cstr,
+                list<dag> pattern>
+  : N3VCommon<op24, op23, op21_20, op11_8, op6, op4,
+              oops, iops, f, itin, opc, dt, asm, cstr, pattern> {
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  bits<2> lane;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{2-0}   = Vm{2-0};
+  let Inst{5}     = lane{1};
+  let Inst{3}     = lane{0};
+}
+
+// Same as N3V except it doesn't have a data type suffix.
+class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
+           bit op4,
+           dag oops, dag iops, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : NDataXI<oops, iops, f, itin, opc, asm, cstr, pattern> {
+  let Inst{24}    = op24;
+  let Inst{23}    = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+
+  // Instruction operands.
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  let Inst{15-12} = Vd{3-0};
+  let Inst{22}    = Vd{4};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7}     = Vn{4};
+  let Inst{3-0}   = Vm{3-0};
+  let Inst{5}     = Vm{4};
+}
+
+// NEON VMOVs between scalar and core registers.
+class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+               dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string dt, string asm, list<dag> pattern>
+  : InstARM<AddrModeNone, 4, IndexModeNone, f, NeonDomain,
+            "", itin> {
+  let Inst{27-20} = opcod1;
+  let Inst{11-8}  = opcod2;
+  let Inst{6-5}   = opcod3;
+  let Inst{4}     = 1;
+  // A8.6.303, A8.6.328, A8.6.329
+  let Inst{3-0}   = 0b0000;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ins pred:$p));
+  let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+
+  let PostEncoderMethod = "NEONThumb2DupPostEncoder";
+  let DecoderNamespace = "NEONDup";
+
+  bits<5> V;
+  bits<4> R;
+  bits<4> p;
+  bits<4> lane;
+
+  let Inst{31-28} = p{3-0};
+  let Inst{7}     = V{4};
+  let Inst{19-16} = V{3-0};
+  let Inst{15-12} = R{3-0};
+}
+class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NGetLnFrm, itin,
+             opc, dt, asm, pattern>;
+class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NSetLnFrm, itin,
+             opc, dt, asm, pattern>;
+class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NDupFrm, itin,
+             opc, dt, asm, pattern>;
+
+// Vector Duplicate Lane (from scalar to all elements)
+class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops,
+                InstrItinClass itin, string opc, string dt, string asm,
+                list<dag> pattern>
+  : NDataI<oops, iops, NVDupLnFrm, itin, opc, dt, asm, "", pattern> {
+  let Inst{24-23} = 0b11;
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = op19_16;
+  let Inst{11-7}  = 0b11000;
+  let Inst{6}     = op6;
+  let Inst{4}     = 0;
+
+  bits<5> Vd;
+  bits<5> Vm;
+
+  let Inst{22}     = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{3-0} = Vm{3-0};
+}
+
+// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON
+// for single-precision FP.
+class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [HasNEON,UseNEONForFP];
+}
+
+// VFP/NEON Instruction aliases for type suffices.
+class VFPDataTypeInstAlias<string opc, string dt, string asm, dag Result> :
+  InstAlias<!strconcat(opc, dt, "\t", asm), Result>, Requires<[HasVFP2]>;
+
+multiclass VFPDTAnyInstAlias<string opc, string asm, dag Result> {
+  def : VFPDataTypeInstAlias<opc, ".8", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".16", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".32", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".64", asm, Result>;
+}
+
+multiclass NEONDTAnyInstAlias<string opc, string asm, dag Result> {
+  let Predicates = [HasNEON] in {
+  def : VFPDataTypeInstAlias<opc, ".8", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".16", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".32", asm, Result>;
+  def : VFPDataTypeInstAlias<opc, ".64", asm, Result>;
+}
+}
+
+// The same alias classes using AsmPseudo instead, for the more complex
+// stuff in NEON that InstAlias can't quite handle.
+// Note that we can't use anonymous defm references here like we can
+// above, as we care about the ultimate instruction enum names generated, unlike
+// for instalias defs.
+class NEONDataTypeAsmPseudoInst<string opc, string dt, string asm, dag iops> :
+  AsmPseudoInst<!strconcat(opc, dt, "\t", asm), iops>, Requires<[HasNEON]>;
+
+// Data type suffix token aliases. Implements Table A7-3 in the ARM ARM.
+def : TokenAlias<".s8", ".i8">;
+def : TokenAlias<".u8", ".i8">;
+def : TokenAlias<".s16", ".i16">;
+def : TokenAlias<".u16", ".i16">;
+def : TokenAlias<".s32", ".i32">;
+def : TokenAlias<".u32", ".i32">;
+def : TokenAlias<".s64", ".i64">;
+def : TokenAlias<".u64", ".i64">;
+
+def : TokenAlias<".i8", ".8">;
+def : TokenAlias<".i16", ".16">;
+def : TokenAlias<".i32", ".32">;
+def : TokenAlias<".i64", ".64">;
+
+def : TokenAlias<".p8", ".8">;
+def : TokenAlias<".p16", ".16">;
+
+def : TokenAlias<".f32", ".32">;
+def : TokenAlias<".f64", ".64">;
+def : TokenAlias<".f", ".f32">;
+def : TokenAlias<".d", ".f64">;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
new file mode 100644
index 000000000000..80f0ec74376a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -0,0 +1,148 @@
+//===-- ARMInstrInfo.cpp - ARM Instruction Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstrInfo.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+using namespace llvm;
+
+ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
+  : ARMBaseInstrInfo(STI), RI(*this, STI) {
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  if (hasNOP()) {
+    NopInst.setOpcode(ARM::HINT);
+    NopInst.addOperand(MCOperand::CreateImm(0));
+    NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    NopInst.addOperand(MCOperand::CreateReg(0));
+  } else {
+    NopInst.setOpcode(ARM::MOVr);
+    NopInst.addOperand(MCOperand::CreateReg(ARM::R0));
+    NopInst.addOperand(MCOperand::CreateReg(ARM::R0));
+    NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+    NopInst.addOperand(MCOperand::CreateReg(0));
+    NopInst.addOperand(MCOperand::CreateReg(0));
+  }
+}
+
+unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  switch (Opc) {
+  default: break;
+  case ARM::LDR_PRE_IMM:
+  case ARM::LDR_PRE_REG:
+  case ARM::LDR_POST_IMM:
+  case ARM::LDR_POST_REG:
+    return ARM::LDRi12;
+  case ARM::LDRH_PRE:
+  case ARM::LDRH_POST:
+    return ARM::LDRH;
+  case ARM::LDRB_PRE_IMM:
+  case ARM::LDRB_PRE_REG:
+  case ARM::LDRB_POST_IMM:
+  case ARM::LDRB_POST_REG:
+    return ARM::LDRBi12;
+  case ARM::LDRSH_PRE:
+  case ARM::LDRSH_POST:
+    return ARM::LDRSH;
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSB_POST:
+    return ARM::LDRSB;
+  case ARM::STR_PRE_IMM:
+  case ARM::STR_PRE_REG:
+  case ARM::STR_POST_IMM:
+  case ARM::STR_POST_REG:
+    return ARM::STRi12;
+  case ARM::STRH_PRE:
+  case ARM::STRH_POST:
+    return ARM::STRH;
+  case ARM::STRB_PRE_IMM:
+  case ARM::STRB_PRE_REG:
+  case ARM::STRB_POST_IMM:
+  case ARM::STRB_POST_REG:
+    return ARM::STRBi12;
+  }
+
+  return 0;
+}
+
+namespace {
+  /// ARMCGBR - Create Global Base Reg pass. This initializes the PIC
+  /// global base register for ARM ELF.
+  struct ARMCGBR : public MachineFunctionPass {
+    static char ID;
+    ARMCGBR() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      if (AFI->getGlobalBaseReg() == 0)
+        return false;
+
+      const ARMTargetMachine *TM =
+        static_cast<const ARMTargetMachine *>(&MF.getTarget());
+      if (TM->getRelocationModel() != Reloc::PIC_)
+        return false;
+
+      LLVMContext* Context = &MF.getFunction()->getContext();
+      GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false,
+                                           GlobalValue::ExternalLinkage, 0,
+                                           "_GLOBAL_OFFSET_TABLE_");
+      unsigned Id = AFI->createPICLabelUId();
+      ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id);
+      unsigned Align = TM->getDataLayout()->getPrefTypeAlignment(GV->getType());
+      unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align);
+
+      MachineBasicBlock &FirstMBB = MF.front();
+      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+      DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+      unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+      unsigned Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ?
+                     ARM::t2LDRpci : ARM::LDRcp;
+      const TargetInstrInfo &TII = *TM->getInstrInfo();
+      MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL,
+                                        TII.get(Opc), GlobalBaseReg)
+                                .addConstantPoolIndex(Idx);
+      if (Opc == ARM::LDRcp)
+        MIB.addImm(0);
+      AddDefaultPred(MIB);
+
+      return true;
+    }
+
+    virtual const char *getPassName() const {
+      return "ARM PIC Global Base Reg Initialization";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char ARMCGBR::ID = 0;
+FunctionPass*
+llvm::createARMGlobalBaseRegPass() { return new ARMCGBR(); }
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
new file mode 100644
index 000000000000..5d3e059b7038
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -0,0 +1,46 @@
+//===-- ARMInstrInfo.h - ARM Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTRUCTIONINFO_H
+#define ARMINSTRUCTIONINFO_H
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class ARMInstrInfo : public ARMBaseInstrInfo {
+  ARMRegisterInfo RI;
+public:
+  explicit ARMInstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const ARMRegisterInfo &getRegisterInfo() const { return RI; }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
new file mode 100644
index 000000000000..11550c5ae678
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -0,0 +1,5233 @@
+//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM specific DAG Nodes.
+//
+
+// Type profiles.
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
+def SDT_ARMStructByVal : SDTypeProfile<0, 4,
+                                       [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                        SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
+
+def SDT_ARMcall    : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+
+def SDT_ARMCMov    : SDTypeProfile<1, 3,
+                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                    SDTCisVT<3, i32>]>;
+
+def SDT_ARMBrcond  : SDTypeProfile<0, 2,
+                                   [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+
+def SDT_ARMBrJT    : SDTypeProfile<0, 3,
+                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+                                   SDTCisVT<2, i32>]>;
+
+def SDT_ARMBr2JT   : SDTypeProfile<0, 4,
+                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+                                   SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDT_ARMBCC_i64 : SDTypeProfile<0, 6,
+                                  [SDTCisVT<0, i32>,
+                                   SDTCisVT<1, i32>, SDTCisVT<2, i32>,
+                                   SDTCisVT<3, i32>, SDTCisVT<4, i32>,
+                                   SDTCisVT<5, OtherVT>]>;
+
+def SDT_ARMAnd     : SDTypeProfile<1, 2,
+                                   [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                    SDTCisVT<2, i32>]>;
+
+def SDT_ARMCmp     : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDT_ARMPICAdd  : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                          SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
+
+def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
+                                                 SDTCisInt<2>]>;
+def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
+
+def SDT_ARMMEMBARRIER     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>,
+                                           SDTCisInt<1>]>;
+
+def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                      SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
+
+def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                        SDTCisVT<2, i32>, SDTCisVT<3, i32>,
+                                        SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >;
+def ARMUmlal         : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>;
+def ARMSmlal         : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>;
+
+// Node definitions.
+def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
+def ARMWrapperDYN    : SDNode<"ARMISD::WrapperDYN",  SDTIntUnaryOp>;
+def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
+def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntBinOp>;
+
+def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
+                              [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def ARMcallseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_ARMCallSeqEnd,
+                              [SDNPHasChain, SDNPSideEffect,
+                               SDNPOptInGlue, SDNPOutGlue]>;
+def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
+                                SDT_ARMStructByVal,
+                                [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
+                                 SDNPMayStore, SDNPMayLoad]>;
+
+def ARMcall          : SDNode<"ARMISD::CALL", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+def ARMcall_pred    : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+def ARMcall_nolink   : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                               SDNPVariadic]>;
+
+def ARMretflag       : SDNode<"ARMISD::RET_FLAG", SDTNone,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
+                              [SDNPInGlue]>;
+
+def ARMbrcond        : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
+                              [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+
+def ARMbrjt          : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
+                              [SDNPHasChain]>;
+def ARMbr2jt         : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
+                              [SDNPHasChain]>;
+
+def ARMBcci64        : SDNode<"ARMISD::BCC_i64", SDT_ARMBCC_i64,
+                              [SDNPHasChain]>;
+
+def ARMcmp           : SDNode<"ARMISD::CMP", SDT_ARMCmp,
+                              [SDNPOutGlue]>;
+
+def ARMcmn           : SDNode<"ARMISD::CMN", SDT_ARMCmp,
+                              [SDNPOutGlue]>;
+
+def ARMcmpZ          : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
+                              [SDNPOutGlue, SDNPCommutative]>;
+
+def ARMpic_add       : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
+
+def ARMsrl_flag      : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMsra_flag      : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
+def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInGlue ]>;
+
+def ARMaddc          : SDNode<"ARMISD::ADDC",  SDTBinaryArithWithFlags,
+                              [SDNPCommutative]>;
+def ARMsubc          : SDNode<"ARMISD::SUBC",  SDTBinaryArithWithFlags>;
+def ARMadde          : SDNode<"ARMISD::ADDE",  SDTBinaryArithWithFlagsInOut>;
+def ARMsube          : SDNode<"ARMISD::SUBE",  SDTBinaryArithWithFlagsInOut>;
+
+def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
+def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
+                               SDT_ARMEH_SJLJ_Setjmp,
+                               [SDNPHasChain, SDNPSideEffect]>;
+def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
+                               SDT_ARMEH_SJLJ_Longjmp,
+                               [SDNPHasChain, SDNPSideEffect]>;
+
+def ARMMemBarrier     : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER,
+                               [SDNPHasChain, SDNPSideEffect]>;
+def ARMMemBarrierMCR  : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
+                               [SDNPHasChain, SDNPSideEffect]>;
+def ARMPreload        : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
+                               [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+
+def ARMrbit          : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>;
+
+def ARMtcret         : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+
+def ARMbfi           : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasV4T           : Predicate<"Subtarget->hasV4TOps()">,
+                                 AssemblerPredicate<"HasV4TOps", "armv4t">;
+def NoV4T            : Predicate<"!Subtarget->hasV4TOps()">;
+def HasV5T           : Predicate<"Subtarget->hasV5TOps()">;
+def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">,
+                                 AssemblerPredicate<"HasV5TEOps", "armv5te">;
+def HasV6            : Predicate<"Subtarget->hasV6Ops()">,
+                                 AssemblerPredicate<"HasV6Ops", "armv6">;
+def NoV6             : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6T2          : Predicate<"Subtarget->hasV6T2Ops()">,
+                                 AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
+def NoV6T2           : Predicate<"!Subtarget->hasV6T2Ops()">;
+def HasV7            : Predicate<"Subtarget->hasV7Ops()">,
+                                 AssemblerPredicate<"HasV7Ops", "armv7">;
+def NoVFP            : Predicate<"!Subtarget->hasVFP2()">;
+def HasVFP2          : Predicate<"Subtarget->hasVFP2()">,
+                                 AssemblerPredicate<"FeatureVFP2", "VFP2">;
+def HasVFP3          : Predicate<"Subtarget->hasVFP3()">,
+                                 AssemblerPredicate<"FeatureVFP3", "VFP3">;
+def HasVFP4          : Predicate<"Subtarget->hasVFP4()">,
+                                 AssemblerPredicate<"FeatureVFP4", "VFP4">;
+def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+                                 AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasFP16          : Predicate<"Subtarget->hasFP16()">,
+                                 AssemblerPredicate<"FeatureFP16","half-float">;
+def HasDivide        : Predicate<"Subtarget->hasDivide()">,
+                                 AssemblerPredicate<"FeatureHWDiv", "divide">;
+def HasDivideInARM   : Predicate<"Subtarget->hasDivideInARMMode()">,
+                                 AssemblerPredicate<"FeatureHWDivARM">;
+def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
+                                 AssemblerPredicate<"FeatureT2XtPk",
+                                                     "pack/extract">;
+def HasThumb2DSP     : Predicate<"Subtarget->hasThumb2DSP()">,
+                                 AssemblerPredicate<"FeatureDSPThumb2",
+                                                    "thumb2-dsp">;
+def HasDB            : Predicate<"Subtarget->hasDataBarrier()">,
+                                 AssemblerPredicate<"FeatureDB",
+                                                    "data-barriers">;
+def HasMP            : Predicate<"Subtarget->hasMPExtension()">,
+                                 AssemblerPredicate<"FeatureMP",
+                                                    "mp-extensions">;
+def UseNEONForFP     : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
+def IsThumb          : Predicate<"Subtarget->isThumb()">,
+                                 AssemblerPredicate<"ModeThumb", "thumb">;
+def IsThumb1Only     : Predicate<"Subtarget->isThumb1Only()">;
+def IsThumb2         : Predicate<"Subtarget->isThumb2()">,
+                                 AssemblerPredicate<"ModeThumb,FeatureThumb2",
+                                                    "thumb2">;
+def IsMClass         : Predicate<"Subtarget->isMClass()">,
+                                 AssemblerPredicate<"FeatureMClass", "armv7m">;
+def IsARClass        : Predicate<"!Subtarget->isMClass()">,
+                                 AssemblerPredicate<"!FeatureMClass",
+                                                    "armv7a/r">;
+def IsARM            : Predicate<"!Subtarget->isThumb()">,
+                                 AssemblerPredicate<"!ModeThumb", "arm-mode">;
+def IsIOS            : Predicate<"Subtarget->isTargetIOS()">;
+def IsNotIOS         : Predicate<"!Subtarget->isTargetIOS()">;
+def IsNaCl           : Predicate<"Subtarget->isTargetNaCl()">;
+def UseNaClTrap      : Predicate<"Subtarget->useNaClTrap()">,
+                                 AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
+def DontUseNaClTrap  : Predicate<"!Subtarget->useNaClTrap()">;
+
+// FIXME: Eventually this will be just "hasV6T2Ops".
+def UseMovt          : Predicate<"Subtarget->useMovt()">;
+def DontUseMovt      : Predicate<"!Subtarget->useMovt()">;
+def UseFPVMLx        : Predicate<"Subtarget->useFPVMLx()">;
+def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
+
+// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
+// But only select them if more precision in FP computation is allowed.
+// Do not use them for Darwin platforms.
+def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast) && "
+                                 "!Subtarget->isTargetDarwin()">;
+def DontUseFusedMAC  : Predicate<"!Subtarget->hasVFP4() || "
+                                 "Subtarget->isTargetDarwin()">;
+
+// VGETLNi32 is microcoded on Swift - prefer VMOV.
+def HasFastVGETLNi32 : Predicate<"!Subtarget->isSwift()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->isSwift()">;
+
+// VDUP.32 is microcoded on Swift - prefer VMOV.
+def HasFastVDUP32 : Predicate<"!Subtarget->isSwift()">;
+def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
+
+// Cortex-A9 prefers VMOVSR to VMOVDRR even when using NEON for scalar FP, as
+// this allows more effective execution domain optimization. See
+// setExecutionDomain().
+def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
+
+def IsLE             : Predicate<"TLI.isLittleEndian()">;
+def IsBE             : Predicate<"TLI.isBigEndian()">;
+
+//===----------------------------------------------------------------------===//
+// ARM Flag Definitions.
+
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+
+//===----------------------------------------------------------------------===//
+//  ARM specific transformation functions and pattern fragments.
+//
+
+// imm_neg_XFORM - Return the negation of an i32 immediate value.
+def imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
+}]>;
+
+// imm_not_XFORM - Return the complement of a i32 immediate value.
+def imm_not_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32);
+}]>;
+
+/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
+def imm16_31 : ImmLeaf<i32, [{
+  return (int32_t)Imm >= 16 && (int32_t)Imm < 32;
+}]>;
+
+def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; }
+def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+    unsigned Value = -(unsigned)N->getZExtValue();
+    return Value && ARM_AM::getSOImmVal(Value) != -1;
+  }], imm_neg_XFORM> {
+  let ParserMatchClass = so_imm_neg_asmoperand;
+}
+
+// Note: this pattern doesn't require an encoder method and such, as it's
+// only used on aliases (Pat<> and InstAlias<>). The actual encoding
+// is handled by the destination instructions, which use so_imm.
+def so_imm_not_asmoperand : AsmOperandClass { let Name = "ARMSOImmNot"; }
+def so_imm_not : Operand<i32>, PatLeaf<(imm), [{
+    return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
+  }], imm_not_XFORM> {
+  let ParserMatchClass = so_imm_not_asmoperand;
+}
+
+// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
+def sext_16_node : PatLeaf<(i32 GPR:$a), [{
+  return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
+}]>;
+
+/// Split a 32-bit immediate into two 16 bit parts.
+def hi16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32);
+}]>;
+
+def lo16AllZero : PatLeaf<(i32 imm), [{
+  // Returns true if all low 16-bits are 0.
+  return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
+}], hi16>;
+
+class BinOpWithFlagFrag<dag res> :
+      PatFrag<(ops node:$LHS, node:$RHS, node:$FLAG), res>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+
+// An 'xor' node with a single use.
+def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+
+// An 'fmul' node with a single use.
+def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
+  return N->hasOneUse();
+}]>;
+
+// An 'fadd' node which checks for single non-hazardous use.
+def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
+// An 'fsub' node which checks for single non-hazardous use.
+def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
+  return hasNoVMLxHazardUse(N);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// Immediate operands with a shared generic asm render method.
+class ImmAsmOperand : AsmOperandClass { let RenderMethod = "addImmOperands"; }
+
+// Branch target.
+// FIXME: rename brtarget to t2_brtarget
+def brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeT2BROperand";
+}
+
+// FIXME: get rid of this one?
+def uncondbrtarget : Operand<OtherVT> {
+  let EncoderMethod = "getUnconditionalBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Branch target for ARM. Handles conditional/unconditional
+def br_target : Operand<OtherVT> {
+  let EncoderMethod = "getARMBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Call target.
+// FIXME: rename bltarget to t2_bl_target?
+def bltarget : Operand<i32> {
+  // Encoded the same as branch targets.
+  let EncoderMethod = "getBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// Call target for ARM. Handles conditional/unconditional
+// FIXME: rename bl_target to t2_bltarget?
+def bl_target : Operand<i32> {
+  let EncoderMethod = "getARMBLTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+def blx_target : Operand<i32> {
+  let EncoderMethod = "getARMBLXTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+}
+
+// A list of registers separated by comma. Used by load/store multiple.
+def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
+def reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = RegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeRegListOperand";
+}
+
+def GPRPairOp : RegisterOperand<GPRPair, "printGPRPairOperand">;
+
+def DPRRegListAsmOperand : AsmOperandClass { let Name = "DPRRegList"; }
+def dpr_reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = DPRRegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeDPRRegListOperand";
+}
+
+def SPRRegListAsmOperand : AsmOperandClass { let Name = "SPRRegList"; }
+def spr_reglist : Operand<i32> {
+  let EncoderMethod = "getRegisterListOpValue";
+  let ParserMatchClass = SPRRegListAsmOperand;
+  let PrintMethod = "printRegisterList";
+  let DecoderMethod = "DecodeSPRRegListOperand";
+}
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  let PrintMethod = "printCPInstOperand";
+}
+
+// Local PC labels.
+def pclabel : Operand<i32> {
+  let PrintMethod = "printPCLabel";
+}
+
+// ADR instruction labels.
+def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; }
+def adrlabel : Operand<i32> {
+  let EncoderMethod = "getAdrLabelOpValue";
+  let ParserMatchClass = AdrLabelAsmOperand;
+  let PrintMethod = "printAdrLabelOperand";
+}
+
+def neon_vcvt_imm32 : Operand<i32> {
+  let EncoderMethod = "getNEONVcvtImm32OpValue";
+  let DecoderMethod = "DecodeVCVTImmOperand";
+}
+
+// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
+def rot_imm_XFORM: SDNodeXForm<imm, [{
+  switch (N->getZExtValue()){
+  default: assert(0);
+  case 0:  return CurDAG->getTargetConstant(0, MVT::i32);
+  case 8:  return CurDAG->getTargetConstant(1, MVT::i32);
+  case 16: return CurDAG->getTargetConstant(2, MVT::i32);
+  case 24: return CurDAG->getTargetConstant(3, MVT::i32);
+  }
+}]>;
+def RotImmAsmOperand : AsmOperandClass {
+  let Name = "RotImm";
+  let ParserMethod = "parseRotImm";
+}
+def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
+    int32_t v = N->getZExtValue();
+    return v == 8 || v == 16 || v == 24; }],
+    rot_imm_XFORM> {
+  let PrintMethod = "printRotImmOperand";
+  let ParserMatchClass = RotImmAsmOperand;
+}
+
+// shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+//    {5}     0 ==> lsl
+//            1     asr
+//    {4-0}   imm5 shift amount.
+//            asr #32 encoded as imm5 == 0.
+def ShifterImmAsmOperand : AsmOperandClass {
+  let Name = "ShifterImm";
+  let ParserMethod = "parseShifterImm";
+}
+def shift_imm : Operand<i32> {
+  let PrintMethod = "printShiftImmOperand";
+  let ParserMatchClass = ShifterImmAsmOperand;
+}
+
+// shifter_operand operands: so_reg_reg, so_reg_imm, and so_imm.
+def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
+def so_reg_reg : Operand<i32>,  // reg reg imm
+                 ComplexPattern<i32, 3, "SelectRegShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let ParserMatchClass = ShiftedRegAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm);
+}
+
+def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; }
+def so_reg_imm : Operand<i32>, // reg imm
+                 ComplexPattern<i32, 2, "SelectImmShifterOperand",
+                                [shl, srl, sra, rotr]> {
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_reg : Operand<i32>,    // reg reg imm
+                   ComplexPattern<i32, 3, "SelectShiftRegShifterOperand",
+                                  [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegRegOpValue";
+  let PrintMethod = "printSORegRegOperand";
+  let DecoderMethod = "DecodeSORegRegOperand";
+  let ParserMatchClass = ShiftedRegAsmOperand;
+  let MIOperandInfo = (ops GPR, GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_imm : Operand<i32>,    // reg reg imm
+                   ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
+                                  [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getSORegImmOpValue";
+  let PrintMethod = "printSORegImmOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+
+// so_imm - Match a 32-bit shifter_operand immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits.
+def SOImmAsmOperand: ImmAsmOperand { let Name = "ARMSOImm"; }
+def so_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  }]> {
+  let EncoderMethod = "getSOImmOpValue";
+  let ParserMatchClass = SOImmAsmOperand;
+  let DecoderMethod = "DecodeSOImmOperand";
+}
+
+// Break so_imm's up into two pieces.  This handles immediates with up to 16
+// bits set in them.  This uses so_imm2part to match and so_imm2part_[12] to
+// get the first/second pieces.
+def so_imm2part : PatLeaf<(imm), [{
+      return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
+}]>;
+
+/// arm_i32imm - True for +V6T2, or true only if so_imm2part is true.
+///
+def arm_i32imm : PatLeaf<(imm), [{
+  if (Subtarget->hasV6T2Ops())
+    return true;
+  return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
+}]>;
+
+/// imm0_1 predicate - Immediate in the range [0,1].
+def Imm0_1AsmOperand: ImmAsmOperand { let Name = "Imm0_1"; }
+def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
+
+/// imm0_3 predicate - Immediate in the range [0,3].
+def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
+def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
+
+/// imm0_7 predicate - Immediate in the range [0,7].
+def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
+def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 8;
+}]> {
+  let ParserMatchClass = Imm0_7AsmOperand;
+}
+
+/// imm8 predicate - Immediate is exactly 8.
+def Imm8AsmOperand: ImmAsmOperand { let Name = "Imm8"; }
+def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
+  let ParserMatchClass = Imm8AsmOperand;
+}
+
+/// imm16 predicate - Immediate is exactly 16.
+def Imm16AsmOperand: ImmAsmOperand { let Name = "Imm16"; }
+def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
+  let ParserMatchClass = Imm16AsmOperand;
+}
+
+/// imm32 predicate - Immediate is exactly 32.
+def Imm32AsmOperand: ImmAsmOperand { let Name = "Imm32"; }
+def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
+  let ParserMatchClass = Imm32AsmOperand;
+}
+
+/// imm1_7 predicate - Immediate in the range [1,7].
+def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; }
+def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
+  let ParserMatchClass = Imm1_7AsmOperand;
+}
+
+/// imm1_15 predicate - Immediate in the range [1,15].
+def Imm1_15AsmOperand: ImmAsmOperand { let Name = "Imm1_15"; }
+def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
+  let ParserMatchClass = Imm1_15AsmOperand;
+}
+
+/// imm1_31 predicate - Immediate in the range [1,31].
+def Imm1_31AsmOperand: ImmAsmOperand { let Name = "Imm1_31"; }
+def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
+  let ParserMatchClass = Imm1_31AsmOperand;
+}
+
+/// imm0_15 predicate - Immediate in the range [0,15].
+def Imm0_15AsmOperand: ImmAsmOperand {
+  let Name = "Imm0_15";
+  let DiagnosticType = "ImmRange0_15";
+}
+def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 16;
+}]> {
+  let ParserMatchClass = Imm0_15AsmOperand;
+}
+
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def Imm0_31AsmOperand: ImmAsmOperand { let Name = "Imm0_31"; }
+def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
+}]> {
+  let ParserMatchClass = Imm0_31AsmOperand;
+}
+
+/// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
+def Imm0_32AsmOperand: ImmAsmOperand { let Name = "Imm0_32"; }
+def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 32;
+}]> {
+  let ParserMatchClass = Imm0_32AsmOperand;
+}
+
+/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
+def Imm0_63AsmOperand: ImmAsmOperand { let Name = "Imm0_63"; }
+def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 64;
+}]> {
+  let ParserMatchClass = Imm0_63AsmOperand;
+}
+
+/// imm0_255 predicate - Immediate in the range [0,255].
+def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
+  let ParserMatchClass = Imm0_255AsmOperand;
+}
+
+/// imm0_65535 - An immediate is in the range [0.65535].
+def Imm0_65535AsmOperand: ImmAsmOperand { let Name = "Imm0_65535"; }
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 65536;
+}]> {
+  let ParserMatchClass = Imm0_65535AsmOperand;
+}
+
+// imm0_65535_neg - An immediate whose negative value is in the range [0.65535].
+def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
+  return -Imm >= 0 && -Imm < 65536;
+}]>;
+
+// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
+// a relocatable expression.
+//
+// FIXME: This really needs a Thumb version separate from the ARM version.
+// While the range is the same, and can thus use the same match class,
+// the encoding is different so it should have a different encoder method.
+def Imm0_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm0_65535Expr"; }
+def imm0_65535_expr : Operand<i32> {
+  let EncoderMethod = "getHiLo16ImmOpValue";
+  let ParserMatchClass = Imm0_65535ExprAsmOperand;
+}
+
+/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
+def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
+def imm24b : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm <= 0xffffff;
+}]> {
+  let ParserMatchClass = Imm24bitAsmOperand;
+}
+
+
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
+/// e.g., 0xf000ffff
+def BitfieldAsmOperand : AsmOperandClass {
+  let Name = "Bitfield";
+  let ParserMethod = "parseBitfield";
+}
+
+def bf_inv_mask_imm : Operand<i32>,
+                      PatLeaf<(imm), [{
+  return ARM::isBitFieldInvertedMask(N->getZExtValue());
+}] > {
+  let EncoderMethod = "getBitfieldInvertedMaskOpValue";
+  let PrintMethod = "printBitfieldInvMaskImmOperand";
+  let DecoderMethod = "DecodeBitfieldMaskOperand";
+  let ParserMatchClass = BitfieldAsmOperand;
+}
+
+def imm1_32_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32);
+}]>;
+def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
+   uint64_t Imm = N->getZExtValue();
+   return Imm > 0 && Imm <= 32;
+ }],
+    imm1_32_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_32AsmOperand;
+}
+
+def imm1_16_XFORM: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32);
+}]>;
+def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
+def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }],
+    imm1_16_XFORM> {
+  let PrintMethod = "printImmPlusOneOperand";
+  let ParserMatchClass = Imm1_16AsmOperand;
+}
+
+// Define ARM specific addressing modes.
+// addrmode_imm12 := reg +/- imm12
+//
+def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; }
+def addrmode_imm12 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
+  // 12-bit immediate operand. Note that instructions using this encode
+  // #0 and #-0 differently. We flag #-0 as the magic value INT32_MIN. All other
+  // immediate values are as normal.
+
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let PrintMethod = "printAddrModeImm12Operand";
+  let DecoderMethod = "DecodeAddrModeImm12Operand";
+  let ParserMatchClass = MemImm12OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+// ldst_so_reg := reg +/- reg shop imm
+//
+def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; }
+def ldst_so_reg : Operand<i32>,
+                  ComplexPattern<i32, 3, "SelectLdStSOReg", []> {
+  let EncoderMethod = "getLdStSORegOpValue";
+  // FIXME: Simplify the printer
+  let PrintMethod = "printAddrMode2Operand";
+  let DecoderMethod = "DecodeSORegMemOperand";
+  let ParserMatchClass = MemRegOffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift);
+}
+
+// postidx_imm8 := +/- [0,255]
+//
+// 9 bit value:
+//  {8}       1 is imm8 is non-negative. 0 otherwise.
+//  {7-0}     [0,255] imm8 value.
+def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
+def postidx_imm8 : Operand<i32> {
+  let PrintMethod = "printPostIdxImm8Operand";
+  let ParserMatchClass = PostIdxImm8AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
+
+// postidx_imm8s4 := +/- [0,1020]
+//
+// 9 bit value:
+//  {8}       1 is imm8 is non-negative. 0 otherwise.
+//  {7-0}     [0,255] imm8 value, scaled by 4.
+def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; }
+def postidx_imm8s4 : Operand<i32> {
+  let PrintMethod = "printPostIdxImm8s4Operand";
+  let ParserMatchClass = PostIdxImm8s4AsmOperand;
+  let MIOperandInfo = (ops i32imm);
+}
+
+
+// postidx_reg := +/- reg
+//
+def PostIdxRegAsmOperand : AsmOperandClass {
+  let Name = "PostIdxReg";
+  let ParserMethod = "parsePostIdxReg";
+}
+def postidx_reg : Operand<i32> {
+  let EncoderMethod = "getPostIdxRegOpValue";
+  let DecoderMethod = "DecodePostIdxReg";
+  let PrintMethod = "printPostIdxRegOperand";
+  let ParserMatchClass = PostIdxRegAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+
+// addrmode2 := reg +/- imm12
+//           := reg +/- reg shop imm
+//
+// FIXME: addrmode2 should be refactored the rest of the way to always
+// use explicit imm vs. reg versions above (addrmode_imm12 and ldst_so_reg).
+def AddrMode2AsmOperand : AsmOperandClass { let Name = "AddrMode2"; }
+def addrmode2 : Operand<i32>,
+                ComplexPattern<i32, 3, "SelectAddrMode2", []> {
+  let EncoderMethod = "getAddrMode2OpValue";
+  let PrintMethod = "printAddrMode2Operand";
+  let ParserMatchClass = AddrMode2AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def PostIdxRegShiftedAsmOperand : AsmOperandClass {
+  let Name = "PostIdxRegShifted";
+  let ParserMethod = "parsePostIdxReg";
+}
+def am2offset_reg : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode2OffsetReg",
+                [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode2OffsetOpValue";
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  // When using this for assembly, it's always as a post-index offset.
+  let ParserMatchClass = PostIdxRegShiftedAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+// FIXME: am2offset_imm should only need the immediate, not the GPR. Having
+// the GPR is purely vestigal at this point.
+def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; }
+def am2offset_imm : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode2OffsetImm",
+                [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode2OffsetOpValue";
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  let ParserMatchClass = AM2OffsetImmAsmOperand;
+  let MIOperandInfo = (ops GPRnopc, i32imm);
+}
+
+
+// addrmode3 := reg +/- reg
+// addrmode3 := reg +/- imm8
+//
+// FIXME: split into imm vs. reg versions.
+def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; }
+def addrmode3 : Operand<i32>,
+                ComplexPattern<i32, 3, "SelectAddrMode3", []> {
+  let EncoderMethod = "getAddrMode3OpValue";
+  let PrintMethod = "printAddrMode3Operand";
+  let ParserMatchClass = AddrMode3AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+// FIXME: split into imm vs. reg versions.
+// FIXME: parser method to handle +/- register.
+def AM3OffsetAsmOperand : AsmOperandClass {
+  let Name = "AM3Offset";
+  let ParserMethod = "parseAM3Offset";
+}
+def am3offset : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode3Offset",
+                               [], [SDNPWantRoot]> {
+  let EncoderMethod = "getAddrMode3OffsetOpValue";
+  let PrintMethod = "printAddrMode3OffsetOperand";
+  let ParserMatchClass = AM3OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// ldstm_mode := {ia, ib, da, db}
+//
+def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> {
+  let EncoderMethod = "getLdStmModeOpValue";
+  let PrintMethod = "printLdStmModeOperand";
+}
+
+// addrmode5 := reg +/- imm8*4
+//
+def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; }
+def addrmode5 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode5", []> {
+  let PrintMethod = "printAddrMode5Operand";
+  let EncoderMethod = "getAddrMode5OpValue";
+  let DecoderMethod = "DecodeAddrMode5Operand";
+  let ParserMatchClass = AddrMode5AsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm);
+}
+
+// addrmode6 := reg with optional alignment
+//
+def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; }
+def addrmode6 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
+  let EncoderMethod = "getAddrMode6AddressOpValue";
+  let DecoderMethod = "DecodeAddrMode6Operand";
+  let ParserMatchClass = AddrMode6AsmOperand;
+}
+
+def am6offset : Operand<i32>,
+                ComplexPattern<i32, 1, "SelectAddrMode6Offset",
+                               [], [SDNPWantRoot]> {
+  let PrintMethod = "printAddrMode6OffsetOperand";
+  let MIOperandInfo = (ops GPR);
+  let EncoderMethod = "getAddrMode6OffsetOpValue";
+  let DecoderMethod = "DecodeGPRRegisterClass";
+}
+
+// Special version of addrmode6 to handle alignment encoding for VST1/VLD1
+// (single element from one lane) for size 32.
+def addrmode6oneL32 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6OneLane32AddressOpValue";
+}
+
+// Special version of addrmode6 to handle alignment encoding for VLD-dup
+// instructions, specifically VLD4-dup.
+def addrmode6dup : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6DupAddressOpValue";
+  // FIXME: This is close, but not quite right. The alignment specifier is
+  // different.
+  let ParserMatchClass = AddrMode6AsmOperand;
+}
+
+// addrmodepc := pc + reg
+//
+def addrmodepc : Operand<i32>,
+                 ComplexPattern<i32, 2, "SelectAddrModePC", []> {
+  let PrintMethod = "printAddrModePCOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addr_offset_none := reg
+//
+def MemNoOffsetAsmOperand : AsmOperandClass { let Name = "MemNoOffset"; }
+def addr_offset_none : Operand<i32>,
+                       ComplexPattern<i32, 1, "SelectAddrOffsetNone", []> {
+  let PrintMethod = "printAddrMode7Operand";
+  let DecoderMethod = "DecodeAddrMode7Operand";
+  let ParserMatchClass = MemNoOffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base);
+}
+
+def nohash_imm : Operand<i32> {
+  let PrintMethod = "printNoHashImmediate";
+}
+
+def CoprocNumAsmOperand : AsmOperandClass {
+  let Name = "CoprocNum";
+  let ParserMethod = "parseCoprocNumOperand";
+}
+def p_imm : Operand<i32> {
+  let PrintMethod = "printPImmediate";
+  let ParserMatchClass = CoprocNumAsmOperand;
+  let DecoderMethod = "DecodeCoprocessor";
+}
+
+def pf_imm : Operand<i32> {
+  let PrintMethod = "printPImmediate";
+  let ParserMatchClass = CoprocNumAsmOperand;
+}
+
+def CoprocRegAsmOperand : AsmOperandClass {
+  let Name = "CoprocReg";
+  let ParserMethod = "parseCoprocRegOperand";
+}
+def c_imm : Operand<i32> {
+  let PrintMethod = "printCImmediate";
+  let ParserMatchClass = CoprocRegAsmOperand;
+}
+def CoprocOptionAsmOperand : AsmOperandClass {
+  let Name = "CoprocOption";
+  let ParserMethod = "parseCoprocOptionOperand";
+}
+def coproc_option_imm : Operand<i32> {
+  let PrintMethod = "printCoprocOptionImm";
+  let ParserMatchClass = CoprocOptionAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+/// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
+/// binop that produces a value.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AsI1_bin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                        PatFrag opnode, bit Commutable = 0> {
+  // The register-immediate version is re-materializable. This is useful
+  // in particular for taking the address of a local.
+  let isReMaterializable = 1 in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = imm;
+  }
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{25} = 0;
+    let isCommutable = Commutable;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+  }
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+}
+
+/// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of operands are
+/// reversed.  The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the AsI1_bin_irs counterpart.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                        PatFrag opnode, bit Commutable = 0> {
+  // The register-immediate version is re-materializable. This is useful
+  // in particular for taking the address of a local.
+  let isReMaterializable = 1 in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+               iii, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, (opnode so_imm:$imm, GPR:$Rn))]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-0} = imm;
+  }
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+               iir, opc, "\t$Rd, $Rn, $Rm",
+               [/* pattern left blank */]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+
+  def rsr : AsI1<opcod, (outs GPR:$Rd),
+               (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+               iis, opc, "\t$Rd, $Rn, $shift",
+               [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+}
+
+/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass AsI1_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
+                          InstrItinClass iis, PatFrag opnode,
+                          bit Commutable = 0> {
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm))]>,
+                         Sched<[WriteALU, ReadALU]>;
+
+  def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, pred:$p),
+                         4, iir,
+                         [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>,
+                         Sched<[WriteALU, ReadALU, ReadALU]> {
+    let isCommutable = Commutable;
+  }
+  def rsi : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
+                                                so_reg_imm:$shift))]>,
+                          Sched<[WriteALUsi, ReadALU]>;
+
+  def rsr : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode GPR:$Rn,
+                                                so_reg_reg:$shift))]>,
+                          Sched<[WriteALUSsr, ReadALUsr]>;
+}
+}
+
+/// AsI1_rbin_s_is - Same as AsI1_bin_s_irs, except selection DAG
+/// operands are reversed.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass AsI1_rbin_s_is<InstrItinClass iii, InstrItinClass iir,
+                          InstrItinClass iis, PatFrag opnode,
+                          bit Commutable = 0> {
+  def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn))]>,
+           Sched<[WriteALU, ReadALU]>;
+
+  def rsi : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_imm:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift,
+                                             GPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]>;
+
+  def rsr : ARMPseudoInst<(outs GPR:$Rd),
+                          (ins GPR:$Rn, so_reg_reg:$shift, pred:$p),
+                          4, iis,
+                          [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift,
+                                             GPR:$Rn))]>,
+            Sched<[WriteALUSsr, ReadALUsr]>;
+}
+}
+
+/// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to AsI1_bin_irs except the instruction does not produce
+/// a explicit result, only implicitly set CPSR.
+let isCompare = 1, Defs = [CPSR] in {
+multiclass AI1_cmp_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode, bit Commutable = 0> {
+  def ri : AI1<opcod, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, iii,
+               opc, "\t$Rn, $imm",
+               [(opnode GPR:$Rn, so_imm:$imm)]>,
+           Sched<[WriteCMP, ReadALU]> {
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-0} = imm;
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rr : AI1<opcod, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, iir,
+               opc, "\t$Rn, $Rm",
+               [(opnode GPR:$Rn, GPR:$Rm)]>,
+           Sched<[WriteCMP, ReadALU, ReadALU]> {
+    bits<4> Rn;
+    bits<4> Rm;
+    let isCommutable = Commutable;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-4} = 0b00000000;
+    let Inst{3-0} = Rm;
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rsi : AI1<opcod, (outs),
+               (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis,
+               opc, "\t$Rn, $shift",
+               [(opnode GPR:$Rn, so_reg_imm:$shift)]>,
+            Sched<[WriteCMPsi, ReadALU]> {
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+  def rsr : AI1<opcod, (outs),
+               (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis,
+               opc, "\t$Rn, $shift",
+               [(opnode GPRnopc:$Rn, so_reg_reg:$shift)]>,
+            Sched<[WriteCMPsr, ReadALU]> {
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{20} = 1;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = 0b0000;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+
+    let Unpredictable{15-12} = 0b1111;
+  }
+
+}
+}
+
+/// AI_ext_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
+class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+          [(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+       Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<2> rot;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = rot;
+  let Inst{3-0}   = Rm;
+}
+
+class AI_ext_rrot_np<bits<8> opcod, string opc>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>,
+       Requires<[IsARM, HasV6]> {
+  bits<2> rot;
+  let Inst{19-16} = 0b1111;
+  let Inst{11-10} = rot;
+}
+
+/// AI_exta_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot",
+          [(set GPRnopc:$Rd, (opnode GPR:$Rn,
+                                     (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+        Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  bits<2> rot;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = rot;
+  let Inst{9-4}   = 0b000111;
+  let Inst{3-0}   = Rm;
+}
+
+class AI_exta_rrot_np<bits<8> opcod, string opc>
+  : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+          IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+       Requires<[IsARM, HasV6]> {
+  bits<4> Rn;
+  bits<2> rot;
+  let Inst{19-16} = Rn;
+  let Inst{11-10} = rot;
+}
+
+/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+                             bit Commutable = 0> {
+  let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+                DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm, CPSR))]>,
+               Requires<[IsARM]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = imm;
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>,
+               Requires<[IsARM]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let isCommutable = Commutable;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+  def rsi : AsI1<opcod, (outs GPR:$Rd),
+                (ins GPR:$Rn, so_reg_imm:$shift),
+                DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+  def rsr : AsI1<opcod, (outs GPRnopc:$Rd),
+                (ins GPRnopc:$Rn, so_reg_reg:$shift),
+                DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPRnopc:$Rd, CPSR,
+                    (opnode GPRnopc:$Rn, so_reg_reg:$shift, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+  }
+}
+
+/// AI1_rsc_irs - Define instructions and patterns for rsc
+let TwoOperandAliasConstraint = "$Rn = $Rd" in
+multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode> {
+  let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+  def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+                DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]>,
+           Sched<[WriteALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> imm;
+    let Inst{25} = 1;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+    let Inst{11-0} = imm;
+  }
+  def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+               [/* pattern left blank */]>,
+           Sched<[WriteALU, ReadALU, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<4> Rm;
+    let Inst{11-4} = 0b00000000;
+    let Inst{25} = 0;
+    let Inst{3-0} = Rm;
+    let Inst{15-12} = Rd;
+    let Inst{19-16} = Rn;
+  }
+  def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift),
+                DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsi, ReadALU]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-5} = shift{11-5};
+    let Inst{4} = 0;
+    let Inst{3-0} = shift{3-0};
+  }
+  def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift),
+                DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+              [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>,
+               Requires<[IsARM]>,
+            Sched<[WriteALUsr, ReadALUsr]> {
+    bits<4> Rd;
+    bits<4> Rn;
+    bits<12> shift;
+    let Inst{25} = 0;
+    let Inst{19-16} = Rn;
+    let Inst{15-12} = Rd;
+    let Inst{11-8} = shift{11-8};
+    let Inst{7} = 0;
+    let Inst{6-5} = shift{6-5};
+    let Inst{4} = 1;
+    let Inst{3-0} = shift{3-0};
+  }
+  }
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12: AI2ldst<0b010, 1, isByte, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+                   AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+                  [(set GPR:$Rt, (opnode addrmode_imm12:$addr))]> {
+    bits<4>  Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 1, isByte, (outs GPR:$Rt), (ins ldst_so_reg:$shift),
+                  AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+                 [(set GPR:$Rt, (opnode ldst_so_reg:$shift))]> {
+    bits<4>  Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt),
+                   (ins addrmode_imm12:$addr),
+                   AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+                   [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> {
+    bits<4>  Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt),
+                   (ins ldst_so_reg:$shift),
+                   AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+                   [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> {
+    bits<4>  Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+}
+
+
+multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12 : AI2ldst<0b010, 0, isByte, (outs),
+                   (ins GPR:$Rt, addrmode_imm12:$addr),
+                   AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+                  [(opnode GPR:$Rt, addrmode_imm12:$addr)]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPR:$Rt, ldst_so_reg:$shift),
+                  AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+                 [(opnode GPR:$Rt, ldst_so_reg:$shift)]> {
+    bits<4> Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+
+multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
+           InstrItinClass iir, PatFrag opnode> {
+  // Note: We use the complex addrmode_imm12 rather than just an input
+  // GPR and a constrained immediate so that we can use this to match
+  // frame index references and avoid matching constant pool references.
+  def i12 : AI2ldst<0b010, 0, isByte, (outs),
+                   (ins GPRnopc:$Rt, addrmode_imm12:$addr),
+                   AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+                  [(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+  def rs : AI2ldst<0b011, 0, isByte, (outs),
+                   (ins GPRnopc:$Rt, ldst_so_reg:$shift),
+                   AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+                   [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> {
+    bits<4> Rt;
+    bits<17> shift;
+    let shift{4}    = 0;            // Inst{4} = 0
+    let Inst{23}    = shift{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = shift{11-0};
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+/// the function.  The first operand is the ID# for this instruction, the second
+/// is the index into the MachineConstantPool that this is, the third is the
+/// size in bytes of this constant pool entry.
+let neverHasSideEffects = 1, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                    i32imm:$size), NoItinerary, []>;
+
+// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
+// from removing one half of the matched pairs. That breaks PEI, which assumes
+// these will always be in pairs, and asserts if it finds otherwise. Better way?
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKUP :
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
+           [(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
+
+def ADJCALLSTACKDOWN :
+PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
+           [(ARMcallseq_start timm:$amt)]>;
+}
+
+// Atomic pseudo-insts which will be lowered to ldrexd/strexd loops.
+// (These pseudos use a hand-written selection code).
+let usesCustomInserter = 1, Defs = [CPSR], mayLoad = 1, mayStore = 1 in {
+def ATOMOR6432   : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMXOR6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMADD6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMSUB6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMNAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMAND6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMSWAP6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                                 (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
+                                      GPR:$set1, GPR:$set2),
+                                 NoItinerary, []>;
+def ATOMMIN6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMUMIN6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMMAX6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+def ATOMUMAX6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
+                              NoItinerary, []>;
+}
+
+def HINT : AI<(outs), (ins imm0_255:$imm), MiscFrm, NoItinerary,
+              "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> {
+  bits<8> imm;
+  let Inst{27-8} = 0b00110010000011110000;
+  let Inst{7-0} = imm;
+}
+
+def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+
+def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
+             "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{27-20} = 0b01101000;
+  let Inst{7-4} = 0b1011;
+  let Inst{11-8} = 0b1111;
+  let Unpredictable{11-8} = 0b1111;
+}
+
+// The 16-bit operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+              "bkpt", "\t$val", []>, Requires<[IsARM]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
+  let Inst{27-20} = 0b00010010;
+  let Inst{7-4} = 0b0111;
+}
+
+// Change Processor State
+// FIXME: We should use InstAlias to handle the optional operands.
+class CPS<dag iops, string asm_ops>
+  : AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops),
+        []>, Requires<[IsARM]> {
+  bits<2> imod;
+  bits<3> iflags;
+  bits<5> mode;
+  bit M;
+
+  let Inst{31-28} = 0b1111;
+  let Inst{27-20} = 0b00010000;
+  let Inst{19-18} = imod;
+  let Inst{17}    = M; // Enabled if mode is set;
+  let Inst{16-9}  = 0b00000000;
+  let Inst{8-6}   = iflags;
+  let Inst{5}     = 0;
+  let Inst{4-0}   = mode;
+}
+
+let DecoderMethod = "DecodeCPSInstruction" in {
+let M = 1 in
+  def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode),
+                  "$imod\t$iflags, $mode">;
+let mode = 0, M = 0 in
+  def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">;
+
+let imod = 0, iflags = 0, M = 1 in
+  def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">;
+}
+
+// Preload signals the memory system of possible future data/instruction access.
+multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
+
+  def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload,
+                !strconcat(opc, "\t$addr"),
+                [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{31-26} = 0b111101;
+    let Inst{25} = 0; // 0 for immediate form
+    let Inst{24} = data;
+    let Inst{23} = addr{12};        // U (add = ('U' == 1))
+    let Inst{22} = read;
+    let Inst{21-20} = 0b01;
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{15-12} = 0b1111;
+    let Inst{11-0}  = addr{11-0};   // imm12
+  }
+
+  def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload,
+               !strconcat(opc, "\t$shift"),
+               [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]> {
+    bits<17> shift;
+    let Inst{31-26} = 0b111101;
+    let Inst{25} = 1; // 1 for register form
+    let Inst{24} = data;
+    let Inst{23} = shift{12};    // U (add = ('U' == 1))
+    let Inst{22} = read;
+    let Inst{21-20} = 0b01;
+    let Inst{19-16} = shift{16-13}; // Rn
+    let Inst{15-12} = 0b1111;
+    let Inst{11-0}  = shift{11-0};
+    let Inst{4} = 0;
+  }
+}
+
+defm PLD  : APreLoad<1, 1, "pld">,  Requires<[IsARM]>;
+defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
+defm PLI  : APreLoad<1, 0, "pli">,  Requires<[IsARM,HasV7]>;
+
+def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
+                 "setend\t$end", []>, Requires<[IsARM]> {
+  bits<1> end;
+  let Inst{31-10} = 0b1111000100000001000000;
+  let Inst{9} = end;
+  let Inst{8-0} = 0;
+}
+
+def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt",
+             []>, Requires<[IsARM, HasV7]> {
+  bits<4> opt;
+  let Inst{27-4} = 0b001100100000111100001111;
+  let Inst{3-0} = opt;
+}
+
+/*
+ * A5.4 Permanently UNDEFINED instructions.
+ *
+ * For most targets use UDF #65006, for which the OS will generate SIGTRAP.
+ * Other UDF encodings generate SIGILL.
+ *
+ * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb.
+ * Encoding A1:
+ *  1110 0111 1111 iiii iiii iiii 1111 iiii
+ * Encoding T1:
+ *  1101 1110 iiii iiii
+ * It uses the following encoding:
+ *  1110 0111 1111 1110 1101 1110 1111 0000
+ *  - In ARM: UDF #60896;
+ *  - In Thumb: UDF #254 followed by a branch-to-self.
+ */
+let isBarrier = 1, isTerminator = 1 in
+def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary,
+               "trap", [(trap)]>,
+           Requires<[IsARM,UseNaClTrap]> {
+  let Inst = 0xe7fedef0;
+}
+let isBarrier = 1, isTerminator = 1 in
+def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
+               "trap", [(trap)]>,
+           Requires<[IsARM,DontUseNaClTrap]> {
+  let Inst = 0xe7ffdefe;
+}
+
+// Address computation and loads and stores in PIC mode.
+let isNotDuplicable = 1 in {
+def PICADD  : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
+                            4, IIC_iALUr,
+                            [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
+
+let AddedComplexity = 10 in {
+def PICLDR  : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_r,
+                            [(set GPR:$dst, (load addrmodepc:$addr))]>;
+
+def PICLDRH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (zextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (zextloadi8 addrmodepc:$addr))]>;
+
+def PICLDRSH : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (sextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRSB : ARMPseudoInst<(outs GPR:$Rt), (ins addrmodepc:$addr, pred:$p),
+                            4, IIC_iLoad_bh_r,
+                            [(set GPR:$Rt, (sextloadi8 addrmodepc:$addr))]>;
+}
+let AddedComplexity = 10 in {
+def PICSTR  : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      4, IIC_iStore_r, [(store GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRH : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      4, IIC_iStore_bh_r, [(truncstorei16 GPR:$src,
+                                                   addrmodepc:$addr)]>;
+
+def PICSTRB : ARMPseudoInst<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+      4, IIC_iStore_bh_r, [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
+}
+} // isNotDuplicable = 1
+
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+// The 'adr' mnemonic encodes differently if the label is before or after
+// the instruction. The {24-21} opcode bits are set by the fixup, as we don't
+// know until then which form of the instruction will be used.
+def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
+                 MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []>,
+                 Sched<[WriteALU, ReadALU]> {
+  bits<4> Rd;
+  bits<14> label;
+  let Inst{27-25} = 0b001;
+  let Inst{24} = 0;
+  let Inst{23-22} = label{13-12};
+  let Inst{21} = 0;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = label{11-0};
+}
+
+let hasSideEffects = 1 in {
+def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
+                    4, IIC_iALUi, []>;
+
+def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
+                      (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                      4, IIC_iALUi, []>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  // ARMV4T and above
+  def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+                  "bx", "\tlr", [(ARMretflag)]>,
+               Requires<[IsARM, HasV4T]> {
+    let Inst{27-0}  = 0b0001001011111111111100011110;
+  }
+
+  // ARMV4 only
+  def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br,
+                  "mov", "\tpc, lr", [(ARMretflag)]>,
+               Requires<[IsARM, NoV4T]> {
+    let Inst{27-0} = 0b0001101000001111000000001110;
+  }
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  // ARMV4T and above
+  def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
+                  [(brind GPR:$dst)]>,
+              Requires<[IsARM, HasV4T]> {
+    bits<4> dst;
+    let Inst{31-4} = 0b1110000100101111111111110001;
+    let Inst{3-0}  = dst;
+  }
+
+  def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br,
+                  "bx", "\t$dst", [/* pattern left blank */]>,
+              Requires<[IsARM, HasV4T]> {
+    bits<4> dst;
+    let Inst{27-4} = 0b000100101111111111110001;
+    let Inst{3-0}  = dst;
+  }
+}
+
+// SP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead.
+let isCall = 1,
+  // FIXME:  Do we really need a non-predicated version? If so, it should
+  // at least be a pseudo instruction expanding to the predicated version
+  // at MC lowering time.
+  Defs = [LR], Uses = [SP] in {
+  def BL  : ABXI<0b1011, (outs), (ins bl_target:$func),
+                IIC_Br, "bl\t$func",
+                [(ARMcall tglobaladdr:$func)]>,
+            Requires<[IsARM]> {
+    let Inst{31-28} = 0b1110;
+    bits<24> func;
+    let Inst{23-0} = func;
+    let DecoderMethod = "DecodeBranchImmInstruction";
+  }
+
+  def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func),
+                   IIC_Br, "bl", "\t$func",
+                   [(ARMcall_pred tglobaladdr:$func)]>,
+                Requires<[IsARM]> {
+    bits<24> func;
+    let Inst{23-0} = func;
+    let DecoderMethod = "DecodeBranchImmInstruction";
+  }
+
+  // ARMv5T and above
+  def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm,
+                IIC_Br, "blx\t$func",
+                [(ARMcall GPR:$func)]>,
+            Requires<[IsARM, HasV5T]> {
+    bits<4> func;
+    let Inst{31-4} = 0b1110000100101111111111110011;
+    let Inst{3-0}  = func;
+  }
+
+  def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm,
+                    IIC_Br, "blx", "\t$func",
+                    [(ARMcall_pred GPR:$func)]>,
+                 Requires<[IsARM, HasV5T]> {
+    bits<4> func;
+    let Inst{27-4} = 0b000100101111111111110011;
+    let Inst{3-0}  = func;
+  }
+
+  // ARMv4T
+  // Note: Restrict $func to the tGPR regclass to prevent it being in LR.
+  def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
+                   8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                   Requires<[IsARM, HasV4T]>;
+
+  // ARMv4
+  def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
+                   8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
+                   Requires<[IsARM, NoV4T]>;
+
+  // mov lr, pc; b if callee is marked noreturn to avoid confusing the
+  // return stack predictor.
+  def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func),
+                               8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
+                      Requires<[IsARM]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+  // FIXME: should be able to write a pattern for ARMBrcond, but can't use
+  // a two-value operand where a dag node expects two operands. :(
+  def Bcc : ABI<0b1010, (outs), (ins br_target:$target),
+               IIC_Br, "b", "\t$target",
+               [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]> {
+    bits<24> target;
+    let Inst{23-0} = target;
+    let DecoderMethod = "DecodeBranchImmInstruction";
+  }
+
+  let isBarrier = 1 in {
+    // B is "predicable" since it's just a Bcc with an 'always' condition.
+    let isPredicable = 1 in
+    // FIXME: We shouldn't need this pseudo at all. Just using Bcc directly
+    // should be sufficient.
+    // FIXME: Is B really a Barrier? That doesn't seem right.
+    def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br,
+                [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>;
+
+    let isNotDuplicable = 1, isIndirectBranch = 1 in {
+    def BR_JTr : ARMPseudoInst<(outs),
+                      (ins GPR:$target, i32imm:$jt, i32imm:$id),
+                      0, IIC_Br,
+                      [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>;
+    // FIXME: This shouldn't use the generic "addrmode2," but rather be split
+    // into i12 and rs suffixed versions.
+    def BR_JTm : ARMPseudoInst<(outs),
+                     (ins addrmode2:$target, i32imm:$jt, i32imm:$id),
+                     0, IIC_Br,
+                     [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
+                       imm:$id)]>;
+    def BR_JTadd : ARMPseudoInst<(outs),
+                   (ins GPR:$target, GPR:$idx, i32imm:$jt, i32imm:$id),
+                   0, IIC_Br,
+                   [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
+                     imm:$id)]>;
+    } // isNotDuplicable = 1, isIndirectBranch = 1
+  } // isBarrier = 1
+
+}
+
+// BLX (immediate)
+def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
+               "blx\t$target", []>,
+           Requires<[IsARM, HasV5T]> {
+  let Inst{31-25} = 0b1111101;
+  bits<25> target;
+  let Inst{23-0} = target{24-1};
+  let Inst{24} = target{0};
+}
+
+// Branch and Exchange Jazelle
+def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
+              [/* pattern left blank */]> {
+  bits<4> func;
+  let Inst{23-20} = 0b0010;
+  let Inst{19-8} = 0xfff;
+  let Inst{7-4} = 0b0010;
+  let Inst{3-0} = func;
+}
+
+// Tail calls.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+  def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>;
+
+  def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>;
+
+  def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst),
+                                 4, IIC_Br, [],
+                                 (Bcc br_target:$dst, (ops 14, zero_reg))>,
+                                 Requires<[IsARM]>;
+
+  def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
+                                 4, IIC_Br, [],
+                                 (BX GPR:$dst)>,
+                                 Requires<[IsARM]>;
+}
+
+// Secure Monitor Call is a system instruction.
+def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+              []> {
+  bits<4> opt;
+  let Inst{23-4} = 0b01100000000000000111;
+  let Inst{3-0} = opt;
+}
+
+// Supervisor Call (Software Interrupt)
+let isCall = 1, Uses = [SP] in {
+def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []> {
+  bits<24> svc;
+  let Inst{23-0} = svc;
+}
+}
+
+// Store Return State
+class SRSI<bit wb, string asm>
+  : XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<5> mode;
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 1;
+  let Inst{21} = wb;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1101;  // SP
+  let Inst{15-5} = 0b00000101000;
+  let Inst{4-0} = mode;
+}
+
+def SRSDA : SRSI<0, "srsda\tsp, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> {
+  let Inst{24-23} = 0;
+}
+def SRSDB : SRSI<0, "srsdb\tsp, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> {
+  let Inst{24-23} = 0b10;
+}
+def SRSIA : SRSI<0, "srsia\tsp, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> {
+  let Inst{24-23} = 0b01;
+}
+def SRSIB : SRSI<0, "srsib\tsp, $mode"> {
+  let Inst{24-23} = 0b11;
+}
+def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> {
+  let Inst{24-23} = 0b11;
+}
+
+def : ARMInstAlias<"srsda $mode", (SRSDA imm0_31:$mode)>;
+def : ARMInstAlias<"srsda $mode!", (SRSDA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsdb $mode", (SRSDB imm0_31:$mode)>;
+def : ARMInstAlias<"srsdb $mode!", (SRSDB_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsia $mode", (SRSIA imm0_31:$mode)>;
+def : ARMInstAlias<"srsia $mode!", (SRSIA_UPD imm0_31:$mode)>;
+
+def : ARMInstAlias<"srsib $mode", (SRSIB imm0_31:$mode)>;
+def : ARMInstAlias<"srsib $mode!", (SRSIB_UPD imm0_31:$mode)>;
+
+// Return From Exception
+class RFEI<bit wb, string asm>
+  : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm,
+       NoItinerary, asm, "", []> {
+  bits<4> Rn;
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b100;
+  let Inst{22} = 0;
+  let Inst{21} = wb;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-0} = 0xa00;
+}
+
+def RFEDA : RFEI<0, "rfeda\t$Rn"> {
+  let Inst{24-23} = 0;
+}
+def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> {
+  let Inst{24-23} = 0;
+}
+def RFEDB : RFEI<0, "rfedb\t$Rn"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> {
+  let Inst{24-23} = 0b10;
+}
+def RFEIA : RFEI<0, "rfeia\t$Rn"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> {
+  let Inst{24-23} = 0b01;
+}
+def RFEIB : RFEI<0, "rfeib\t$Rn"> {
+  let Inst{24-23} = 0b11;
+}
+def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> {
+  let Inst{24-23} = 0b11;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / Store Instructions.
+//
+
+// Load
+
+
+defm LDR  : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si,
+                    UnOpFrag<(load node:$Src)>>;
+defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
+                    UnOpFrag<(zextloadi8 node:$Src)>>;
+defm STR  : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si,
+                   BinOpFrag<(store node:$LHS, node:$RHS)>>;
+defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
+                   BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+
+// Special LDR for loads from non-pc-relative constpools.
+let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1,
+    isReMaterializable = 1, isCodeGenOnly = 1 in
+def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
+                 AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
+                 []> {
+  bits<4> Rt;
+  bits<17> addr;
+  let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = Rt;
+  let Inst{11-0}  = addr{11-0};   // imm12
+}
+
+// Loads with zero extension
+def LDRH  : AI3ld<0b1011, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                  IIC_iLoad_bh_r, "ldrh", "\t$Rt, $addr",
+                  [(set GPR:$Rt, (zextloadi16 addrmode3:$addr))]>;
+
+// Loads with sign extension
+def LDRSH : AI3ld<0b1111, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoad_bh_r, "ldrsh", "\t$Rt, $addr",
+                   [(set GPR:$Rt, (sextloadi16 addrmode3:$addr))]>;
+
+def LDRSB : AI3ld<0b1101, 1, (outs GPR:$Rt), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoad_bh_r, "ldrsb", "\t$Rt, $addr",
+                   [(set GPR:$Rt, (sextloadi8 addrmode3:$addr))]>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+// Load doubleword
+def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2),
+                 (ins addrmode3:$addr), LdMiscFrm,
+                 IIC_iLoad_d_r, "ldrd", "\t$Rd, $dst2, $addr",
+                 []>, Requires<[IsARM, HasV5TE]>;
+}
+
+// Indexed loads
+multiclass AI2_ldridx<bit isByte, string opc,
+                      InstrItinClass iii, InstrItinClass iir> {
+  def _PRE_IMM  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins addrmode_imm12:$addr), IndexModePre, LdFrm, iii,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 0;
+    let Inst{23} = addr{12};
+    let Inst{19-16} = addr{16-13};
+    let Inst{11-0} = addr{11-0};
+    let DecoderMethod = "DecodeLDRPreImm";
+    let AsmMatchConverter = "cvtLdWriteBackRegAddrModeImm12";
+  }
+
+  def _PRE_REG  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                      (ins ldst_so_reg:$addr), IndexModePre, LdFrm, iir,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 1;
+    let Inst{23} = addr{12};
+    let Inst{19-16} = addr{16-13};
+    let Inst{11-0} = addr{11-0};
+    let Inst{4} = 0;
+    let DecoderMethod = "DecodeLDRPreReg";
+    let AsmMatchConverter = "cvtLdWriteBackRegAddrMode2";
+  }
+
+  def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                       (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                       IndexModePost, LdFrm, iir,
+                       opc, "\t$Rt, $addr, $offset",
+                       "$addr.base = $Rn_wb", []> {
+     // {12}     isAdd
+     // {11-0}   imm12/Rm
+     bits<14> offset;
+     bits<4> addr;
+     let Inst{25} = 1;
+     let Inst{23} = offset{12};
+     let Inst{19-16} = addr;
+     let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+   }
+
+   def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                       (ins addr_offset_none:$addr, am2offset_imm:$offset),
+                      IndexModePost, LdFrm, iii,
+                      opc, "\t$Rt, $addr, $offset",
+                      "$addr.base = $Rn_wb", []> {
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 0;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+
+}
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+// FIXME: for LDR_PRE_REG etc. the itineray should be either IIC_iLoad_ru or
+// IIC_iLoad_siu depending on whether it the offset register is shifted.
+defm LDR  : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
+defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>;
+}
+
+multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
+  def _PRE  : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins addrmode3:$addr), IndexModePre,
+                        LdMiscFrm, itin,
+                        opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<14> addr;
+    let Inst{23}    = addr{8};      // U bit
+    let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+    let Inst{19-16} = addr{12-9};   // Rn
+    let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+    let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+    let AsmMatchConverter = "cvtLdWriteBackRegAddrMode3";
+    let DecoderMethod = "DecodeAddrMode3Instruction";
+  }
+  def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                        (ins addr_offset_none:$addr, am3offset:$offset),
+                        IndexModePost, LdMiscFrm, itin,
+                        opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+                        []> {
+    bits<10> offset;
+    bits<4> addr;
+    let Inst{23}    = offset{8};      // U bit
+    let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+    let Inst{19-16} = addr;
+    let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+    let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+    let DecoderMethod = "DecodeAddrMode3Instruction";
+  }
+}
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+defm LDRH  : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
+defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
+defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;
+let hasExtraDefRegAllocReq = 1 in {
+def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+                          (ins addrmode3:$addr), IndexModePre,
+                          LdMiscFrm, IIC_iLoad_d_ru,
+                          "ldrd", "\t$Rt, $Rt2, $addr!",
+                          "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+  let AsmMatchConverter = "cvtLdrdPre";
+}
+def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+                          (ins addr_offset_none:$addr, am3offset:$offset),
+                          IndexModePost, LdMiscFrm, IIC_iLoad_d_ru,
+                          "ldrd", "\t$Rt, $Rt2, $addr, $offset",
+                          "$addr.base = $Rn_wb", []> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+} // hasExtraDefRegAllocReq = 1
+} // mayLoad = 1, neverHasSideEffects = 1
+
+// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT.
+let mayLoad = 1, neverHasSideEffects = 1 in {
+def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                    (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                    IndexModePost, LdFrm, IIC_iLoad_ru,
+                    "ldrt", "\t$Rt, $addr, $offset",
+                    "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRT_POST_IMM : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                    (ins addr_offset_none:$addr, am2offset_imm:$offset),
+                   IndexModePost, LdFrm, IIC_iLoad_ru,
+                   "ldrt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                     (ins addr_offset_none:$addr, am2offset_reg:$offset),
+                     IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+                     "ldrbt", "\t$Rt, $addr, $offset",
+                     "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRBT_POST_IMM : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                     (ins addr_offset_none:$addr, am2offset_imm:$offset),
+                    IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+                    "ldrbt", "\t$Rt, $addr, $offset",
+                    "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+multiclass AI3ldrT<bits<4> op, string opc> {
+  def i : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
+                      (ins addr_offset_none:$addr, postidx_imm8:$offset),
+                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                      "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{23} = offset{8};
+    let Inst{22} = 1;
+    let Inst{11-8} = offset{7-4};
+    let Inst{3-0} = offset{3-0};
+    let AsmMatchConverter = "cvtLdExtTWriteBackImm";
+  }
+  def r : AI3ldstidxT<op, 1, (outs GPRnopc:$Rt, GPRnopc:$base_wb),
+                      (ins addr_offset_none:$addr, postidx_reg:$Rm),
+                      IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let Inst{23} = Rm{4};
+    let Inst{22} = 0;
+    let Inst{11-8} = 0;
+    let Unpredictable{11-8} = 0b1111;
+    let Inst{3-0} = Rm{3-0};
+    let AsmMatchConverter = "cvtLdExtTWriteBackReg";
+    let DecoderMethod = "DecodeLDR";
+  }
+}
+
+defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
+defm LDRHT  : AI3ldrT<0b1011, "ldrht">;
+defm LDRSHT : AI3ldrT<0b1111, "ldrsht">;
+}
+
+// Store
+
+// Stores with truncate
+def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm,
+               IIC_iStore_bh_r, "strh", "\t$Rt, $addr",
+               [(truncstorei16 GPR:$Rt, addrmode3:$addr)]>;
+
+// Store doubleword
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
+def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr),
+               StMiscFrm, IIC_iStore_d_r,
+               "strd", "\t$Rt, $src2, $addr", []>,
+           Requires<[IsARM, HasV5TE]> {
+  let Inst{21} = 0;
+}
+
+// Indexed stores
+multiclass AI2_stridx<bit isByte, string opc,
+                      InstrItinClass iii, InstrItinClass iir> {
+  def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                            (ins GPR:$Rt, addrmode_imm12:$addr), IndexModePre,
+                            StFrm, iii,
+                            opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 0;
+    let Inst{23}    = addr{12};     // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13};  // Rn
+    let Inst{11-0}  = addr{11-0};   // imm12
+    let AsmMatchConverter = "cvtStWriteBackRegAddrModeImm12";
+    let DecoderMethod = "DecodeSTRPreImm";
+  }
+
+  def _PRE_REG  : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+                      (ins GPR:$Rt, ldst_so_reg:$addr),
+                      IndexModePre, StFrm, iir,
+                      opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+    bits<17> addr;
+    let Inst{25} = 1;
+    let Inst{23}    = addr{12};    // U (add = ('U' == 1))
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{11-0}  = addr{11-0};
+    let Inst{4}     = 0;           // Inst{4} = 0
+    let AsmMatchConverter = "cvtStWriteBackRegAddrMode2";
+    let DecoderMethod = "DecodeSTRPreReg";
+  }
+  def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                IndexModePost, StFrm, iir,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb", []> {
+     // {12}     isAdd
+     // {11-0}   imm12/Rm
+     bits<14> offset;
+     bits<4> addr;
+     let Inst{25} = 1;
+     let Inst{23} = offset{12};
+     let Inst{19-16} = addr;
+     let Inst{11-0} = offset{11-0};
+     let Inst{4} = 0;
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+   }
+
+   def _POST_IMM : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+                (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                IndexModePost, StFrm, iii,
+                opc, "\t$Rt, $addr, $offset",
+                "$addr.base = $Rn_wb", []> {
+    // {12}     isAdd
+    // {11-0}   imm12/Rm
+    bits<14> offset;
+    bits<4> addr;
+    let Inst{25} = 0;
+    let Inst{23} = offset{12};
+    let Inst{19-16} = addr;
+    let Inst{11-0} = offset{11-0};
+
+    let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+  }
+}
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+// FIXME: for STR_PRE_REG etc. the itineray should be either IIC_iStore_ru or
+// IIC_iStore_siu depending on whether it the offset register is shifted.
+defm STR  : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>;
+defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>;
+}
+
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_reg:$offset),
+             (STR_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_reg:$offset)>;
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+                         am2offset_imm:$offset),
+             (STR_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                           am2offset_imm:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_reg:$offset),
+             (STRB_POST_REG GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_reg:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+                             am2offset_imm:$offset),
+             (STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+                            am2offset_imm:$offset)>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def STRi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_store GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRBi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRBr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+               (ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+            [(set GPR:$Rn_wb,
+                  (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
+}
+
+
+
+def STRH_PRE  : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
+                           (ins GPR:$Rt, addrmode3:$addr), IndexModePre,
+                           StMiscFrm, IIC_iStore_bh_ru,
+                           "strh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let AsmMatchConverter = "cvtStWriteBackRegAddrMode3";
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
+                       (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset),
+                       IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
+                       "strh", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+                   [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
+                                                      addr_offset_none:$addr,
+                                                      am3offset:$offset))]> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
+                          (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
+                          IndexModePre, StMiscFrm, IIC_iStore_d_ru,
+                          "strd", "\t$Rt, $Rt2, $addr!",
+                          "$addr.base = $Rn_wb", []> {
+  bits<14> addr;
+  let Inst{23}    = addr{8};      // U bit
+  let Inst{22}    = addr{13};     // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr{12-9};   // Rn
+  let Inst{11-8}  = addr{7-4};    // imm7_4/zero
+  let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+  let AsmMatchConverter = "cvtStrdPre";
+}
+
+def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
+                          (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr,
+                               am3offset:$offset),
+                          IndexModePost, StMiscFrm, IIC_iStore_d_ru,
+                          "strd", "\t$Rt, $Rt2, $addr, $offset",
+                          "$addr.base = $Rn_wb", []> {
+  bits<10> offset;
+  bits<4> addr;
+  let Inst{23}    = offset{8};      // U bit
+  let Inst{22}    = offset{9};      // 1 == imm8, 0 == Rm
+  let Inst{19-16} = addr;
+  let Inst{11-8}  = offset{7-4};    // imm7_4/zero
+  let Inst{3-0}   = offset{3-0};    // imm3_0/Rm
+  let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+
+// STRT, STRBT, and STRHT
+
+def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, IIC_iStore_bh_ru,
+                   "strbt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRBT_POST_IMM : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                   IndexModePost, StFrm, IIC_iStore_bh_ru,
+                   "strbt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+                   IndexModePost, StFrm, IIC_iStore_ru,
+                   "strt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 1;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-5} = offset{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = offset{3-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRT_POST_IMM : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+                   (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+                   IndexModePost, StFrm, IIC_iStore_ru,
+                   "strt", "\t$Rt, $addr, $offset",
+                   "$addr.base = $Rn_wb", []> {
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  bits<14> offset;
+  bits<4> addr;
+  let Inst{25} = 0;
+  let Inst{23} = offset{12};
+  let Inst{21} = 1; // overwrite
+  let Inst{19-16} = addr;
+  let Inst{11-0} = offset{11-0};
+  let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+}
+
+
+multiclass AI3strT<bits<4> op, string opc> {
+  def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                    (ins GPR:$Rt, addr_offset_none:$addr, postidx_imm8:$offset),
+                    IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                    "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+    bits<9> offset;
+    let Inst{23} = offset{8};
+    let Inst{22} = 1;
+    let Inst{11-8} = offset{7-4};
+    let Inst{3-0} = offset{3-0};
+    let AsmMatchConverter = "cvtStExtTWriteBackImm";
+  }
+  def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+                      (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
+                      IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+                      "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+    bits<5> Rm;
+    let Inst{23} = Rm{4};
+    let Inst{22} = 0;
+    let Inst{11-8} = 0;
+    let Inst{3-0} = Rm{3-0};
+    let AsmMatchConverter = "cvtStExtTWriteBackReg";
+  }
+}
+
+
+defm STRHT : AI3strT<0b1011, "strht">;
+
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+multiclass arm_ldst_mult<string asm, string sfx, bit L_bit, bit P_bit, Format f,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  // IA is the default, so no need for an explicit suffix on the
+  // mnemonic here. Without it is the canonical spelling.
+  def IA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def DA :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "da${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DA_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "da${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b00;       // Decrement After
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def DB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "db${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "db${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+  def IB :
+    AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeNone, f, itin,
+         !strconcat(asm, "ib${p}\t$Rn, $regs", sfx), "", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def IB_UPD :
+    AXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         IndexModeUpd, f, itin_upd,
+         !strconcat(asm, "ib${p}\t$Rn!, $regs", sfx), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b11;       // Increment Before
+    let Inst{22}    = P_bit;
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
+  }
+}
+
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm LDM : arm_ldst_mult<"ldm", "", 1, 0, LdStMulFrm, IIC_iLoad_m,
+                         IIC_iLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm STM : arm_ldst_mult<"stm", "", 0, 0, LdStMulFrm, IIC_iStore_m,
+                         IIC_iStore_mu>;
+
+} // neverHasSideEffects
+
+// FIXME: remove when we have a way to marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+def LDMIA_RET : ARMPseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+                                                 reglist:$regs, variable_ops),
+                     4, IIC_iLoad_mBr, [],
+                     (LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
+      RegConstraint<"$Rn = $wb">;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm sysLDM : arm_ldst_mult<"ldm", " ^", 1, 1, LdStMulFrm, IIC_iLoad_m,
+                               IIC_iLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m,
+                               IIC_iStore_mu>;
+
+
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+
+let neverHasSideEffects = 1 in
+def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
+                "mov", "\t$Rd, $Rm", []>, UnaryDP {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{19-16} = 0b0000;
+  let Inst{11-4} = 0b00000000;
+  let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+}
+
+// A version for the smaller set of tail call registers.
+let neverHasSideEffects = 1 in
+def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
+                IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-4} = 0b00000000;
+  let Inst{25} = 0;
+  let Inst{3-0} = Rm;
+  let Inst{15-12} = Rd;
+}
+
+def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
+                DPSoRegRegFrm, IIC_iMOVsr,
+                "mov", "\t$Rd, $src",
+                [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-8} = src{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = src{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = src{3-0};
+  let Inst{25} = 0;
+}
+
+def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
+                DPSoRegImmFrm, IIC_iMOVsr,
+                "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>,
+                UnaryDP {
+  bits<4> Rd;
+  bits<12> src;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-5} = src{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = src{3-0};
+  let Inst{25} = 0;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi,
+                "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP {
+  bits<4> Rd;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-0} = imm;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
+                 DPFrm, IIC_iMOVi,
+                 "movw", "\t$Rd, $imm",
+                 [(set GPR:$Rd, imm0_65535:$imm)]>,
+                 Requires<[IsARM, HasV6T2]>, UnaryDP {
+  bits<4> Rd;
+  bits<16> imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-0}  = imm{11-0};
+  let Inst{19-16} = imm{15-12};
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
+}
+
+def : InstAlias<"mov${p} $Rd, $imm",
+                (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p)>,
+        Requires<[IsARM]>;
+
+def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+
+let Constraints = "$src = $Rd" in {
+def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
+                  (ins GPR:$src, imm0_65535_expr:$imm),
+                  DPFrm, IIC_iMOVi,
+                  "movt", "\t$Rd, $imm",
+                  [(set GPRnopc:$Rd,
+                        (or (and GPR:$src, 0xffff),
+                            lo16AllZero:$imm))]>, UnaryDP,
+                  Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<16> imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-0}  = imm{11-0};
+  let Inst{19-16} = imm{15-12};
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+  let DecoderMethod = "DecodeArmMOVTWInstruction";
+}
+
+def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
+                      (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+
+} // Constraints
+
+def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
+      Requires<[IsARM, HasV6T2]>;
+
+let Uses = [CPSR] in
+def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
+                    [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP,
+                    Requires<[IsARM]>;
+
+// These aren't really mov instructions, but we have to define them this way
+// due to flag operands.
+
+let Defs = [CPSR] in {
+def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                      [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP,
+                      Requires<[IsARM]>;
+def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                      [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP,
+                      Requires<[IsARM]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Extend Instructions.
+//
+
+// Sign extenders
+
+def SXTB  : AI_ext_rrot<0b01101010,
+                         "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
+def SXTH  : AI_ext_rrot<0b01101011,
+                         "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
+
+def SXTAB : AI_exta_rrot<0b01101010,
+               "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+def SXTAH : AI_exta_rrot<0b01101011,
+               "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+
+def SXTB16  : AI_ext_rrot_np<0b01101000, "sxtb16">;
+
+def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
+
+// Zero extenders
+
+let AddedComplexity = 16 in {
+def UXTB   : AI_ext_rrot<0b01101110,
+                          "uxtb"  , UnOpFrag<(and node:$Src, 0x000000FF)>>;
+def UXTH   : AI_ext_rrot<0b01101111,
+                          "uxth"  , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+def UXTB16 : AI_ext_rrot<0b01101100,
+                          "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
+//        The transformation should probably be done as a combiner action
+//        instead so we can include a check for masking back in the upper
+//        eight bits of the source into the lower eight bits of the result.
+//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
+//               (UXTB16r_rot GPR:$Src, 3)>;
+def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
+               (UXTB16 GPR:$Src, 1)>;
+
+def UXTAB : AI_exta_rrot<0b01101110, "uxtab",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+def UXTAH : AI_exta_rrot<0b01101111, "uxtah",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+}
+
+// This isn't safe in general, the add is two 16-bit units, not a 32-bit add.
+def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
+
+
+def SBFX  : I<(outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
+               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
+               Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
+  let Inst{27-21} = 0b0111101;
+  let Inst{6-4}   = 0b101;
+  let Inst{20-16} = width;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = lsb;
+  let Inst{3-0}   = Rn;
+}
+
+def UBFX  : I<(outs GPR:$Rd),
+              (ins GPR:$Rn, imm0_31:$lsb, imm1_32:$width),
+               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
+               Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> lsb;
+  bits<5> width;
+  let Inst{27-21} = 0b0111111;
+  let Inst{6-4}   = 0b101;
+  let Inst{20-16} = width;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = lsb;
+  let Inst{3-0}   = Rn;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+defm ADD  : AsI1_bin_irs<0b0100, "add",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                         BinOpFrag<(add  node:$LHS, node:$RHS)>, 1>;
+defm SUB  : AsI1_bin_irs<0b0010, "sub",
+                         IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                         BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// ADD and SUB with 's' bit set.
+//
+// Currently, ADDS/SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real ADD/SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate ADDS/SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+defm ADDS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                           BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
+defm SUBS : AsI1_bin_s_irs<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                           BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+
+defm ADC : AI1_adde_sube_irs<0b0101, "adc",
+              BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
+defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
+              BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+
+defm RSB  : AsI1_rbin_irs<0b0011, "rsb",
+                          IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                          BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
+defm RSBS : AsI1_rbin_s_is<IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+                           BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+
+defm RSC : AI1_rsc_irs<0b0111, "rsc",
+                BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
+def : ARMPat<(add     GPR:$src, so_imm_neg:$imm),
+             (SUBri   GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm),
+             (SUBSri  GPR:$src, so_imm_neg:$imm)>;
+
+def : ARMPat<(add     GPR:$src, imm0_65535_neg:$imm),
+             (SUBrr   GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+             Requires<[IsARM, HasV6T2]>;
+def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
+             (SUBSrr  GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+             Requires<[IsARM, HasV6T2]>;
+
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
+def : ARMPat<(ARMadde GPR:$src, so_imm_not:$imm, CPSR),
+             (SBCri   GPR:$src, so_imm_not:$imm)>;
+def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR),
+             (SBCrr   GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>;
+
+// Note: These are implemented in C++ code, because they have to generate
+// ADD/SUBrs instructions, which use a complex pattern that a xform function
+// cannot produce.
+// (mul X, 2^n+1) -> (add (X << n), X)
+// (mul X, 2^n-1) -> (rsb X, (X << n))
+
+// ARM Arithmetic Instruction
+// GPR:$dst = GPR:$a op GPR:$b
+class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
+          list<dag> pattern = [],
+          dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
+          string asm = "\t$Rd, $Rn, $Rm">
+  : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{27-20} = op27_20;
+  let Inst{11-4} = op11_4;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{3-0}   = Rm;
+
+  let Unpredictable{11-8} = 0b1111;
+}
+
+// Saturating add/subtract
+
+def QADD    : AAI<0b00010000, 0b00000101, "qadd",
+                  [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+def QSUB    : AAI<0b00010010, 0b00000101, "qsub",
+                  [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+def QDADD   : AAI<0b00010100, 0b00000101, "qdadd", [],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn),
+                  "\t$Rd, $Rm, $Rn">;
+def QDSUB   : AAI<0b00010110, 0b00000101, "qdsub", [],
+                  (ins GPRnopc:$Rm, GPRnopc:$Rn),
+                  "\t$Rd, $Rm, $Rn">;
+
+def QADD16  : AAI<0b01100010, 0b11110001, "qadd16">;
+def QADD8   : AAI<0b01100010, 0b11111001, "qadd8">;
+def QASX    : AAI<0b01100010, 0b11110011, "qasx">;
+def QSAX    : AAI<0b01100010, 0b11110101, "qsax">;
+def QSUB16  : AAI<0b01100010, 0b11110111, "qsub16">;
+def QSUB8   : AAI<0b01100010, 0b11111111, "qsub8">;
+def UQADD16 : AAI<0b01100110, 0b11110001, "uqadd16">;
+def UQADD8  : AAI<0b01100110, 0b11111001, "uqadd8">;
+def UQASX   : AAI<0b01100110, 0b11110011, "uqasx">;
+def UQSAX   : AAI<0b01100110, 0b11110101, "uqsax">;
+def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">;
+def UQSUB8  : AAI<0b01100110, 0b11111111, "uqsub8">;
+
+// Signed/Unsigned add/subtract
+
+def SASX   : AAI<0b01100001, 0b11110011, "sasx">;
+def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">;
+def SADD8  : AAI<0b01100001, 0b11111001, "sadd8">;
+def SSAX   : AAI<0b01100001, 0b11110101, "ssax">;
+def SSUB16 : AAI<0b01100001, 0b11110111, "ssub16">;
+def SSUB8  : AAI<0b01100001, 0b11111111, "ssub8">;
+def UASX   : AAI<0b01100101, 0b11110011, "uasx">;
+def UADD16 : AAI<0b01100101, 0b11110001, "uadd16">;
+def UADD8  : AAI<0b01100101, 0b11111001, "uadd8">;
+def USAX   : AAI<0b01100101, 0b11110101, "usax">;
+def USUB16 : AAI<0b01100101, 0b11110111, "usub16">;
+def USUB8  : AAI<0b01100101, 0b11111111, "usub8">;
+
+// Signed/Unsigned halving add/subtract
+
+def SHASX   : AAI<0b01100011, 0b11110011, "shasx">;
+def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">;
+def SHADD8  : AAI<0b01100011, 0b11111001, "shadd8">;
+def SHSAX   : AAI<0b01100011, 0b11110101, "shsax">;
+def SHSUB16 : AAI<0b01100011, 0b11110111, "shsub16">;
+def SHSUB8  : AAI<0b01100011, 0b11111111, "shsub8">;
+def UHASX   : AAI<0b01100111, 0b11110011, "uhasx">;
+def UHADD16 : AAI<0b01100111, 0b11110001, "uhadd16">;
+def UHADD8  : AAI<0b01100111, 0b11111001, "uhadd8">;
+def UHSAX   : AAI<0b01100111, 0b11110101, "uhsax">;
+def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">;
+def UHSUB8  : AAI<0b01100111, 0b11111111, "uhsub8">;
+
+// Unsigned Sum of Absolute Differences [and Accumulate].
+
+def USAD8  : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                MulFrm /* for convenience */, NoItinerary, "usad8",
+                "\t$Rd, $Rn, $Rm", []>,
+             Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{27-20} = 0b01111000;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = 0b0001;
+  let Inst{19-16} = Rd;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
+}
+def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                MulFrm /* for convenience */, NoItinerary, "usada8",
+                "\t$Rd, $Rn, $Rm, $Ra", []>,
+             Requires<[IsARM, HasV6]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<4> Ra;
+  let Inst{27-20} = 0b01111000;
+  let Inst{7-4} = 0b0001;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+  let Inst{11-8} = Rm;
+  let Inst{3-0} = Rn;
+}
+
+// Signed/Unsigned saturate
+
+def SSAT : AI<(outs GPRnopc:$Rd),
+              (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+              SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+  bits<4> Rd;
+  bits<5> sat_imm;
+  bits<4> Rn;
+  bits<8> sh;
+  let Inst{27-21} = 0b0110101;
+  let Inst{5-4} = 0b01;
+  let Inst{20-16} = sat_imm;
+  let Inst{15-12} = Rd;
+  let Inst{11-7} = sh{4-0};
+  let Inst{6} = sh{5};
+  let Inst{3-0} = Rn;
+}
+
+def SSAT16 : AI<(outs GPRnopc:$Rd),
+                (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm,
+                NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> {
+  bits<4> Rd;
+  bits<4> sat_imm;
+  bits<4> Rn;
+  let Inst{27-20} = 0b01101010;
+  let Inst{11-4} = 0b11110011;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = sat_imm;
+  let Inst{3-0} = Rn;
+}
+
+def USAT : AI<(outs GPRnopc:$Rd),
+              (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+              SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+  bits<4> Rd;
+  bits<5> sat_imm;
+  bits<4> Rn;
+  bits<8> sh;
+  let Inst{27-21} = 0b0110111;
+  let Inst{5-4} = 0b01;
+  let Inst{15-12} = Rd;
+  let Inst{11-7} = sh{4-0};
+  let Inst{6} = sh{5};
+  let Inst{20-16} = sat_imm;
+  let Inst{3-0} = Rn;
+}
+
+def USAT16 : AI<(outs GPRnopc:$Rd),
+                (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm,
+                NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> {
+  bits<4> Rd;
+  bits<4> sat_imm;
+  bits<4> Rn;
+  let Inst{27-20} = 0b01101110;
+  let Inst{11-4} = 0b11110011;
+  let Inst{15-12} = Rd;
+  let Inst{19-16} = sat_imm;
+  let Inst{3-0} = Rn;
+}
+
+def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm:$pos),
+               (SSAT imm:$pos, GPRnopc:$a, 0)>;
+def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos),
+               (USAT imm:$pos, GPRnopc:$a, 0)>;
+
+//===----------------------------------------------------------------------===//
+//  Bitwise Instructions.
+//
+
+defm AND   : AsI1_bin_irs<0b0000, "and",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+                          BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm ORR   : AsI1_bin_irs<0b1100, "orr",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+                          BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
+defm EOR   : AsI1_bin_irs<0b0001, "eor",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+                          BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+defm BIC   : AsI1_bin_irs<0b1110, "bic",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsr,
+                          BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just
+// like in the actual instruction encoding. The complexity of mapping the mask
+// to the lsb/msb pair should be handled by ISel, not encapsulated in the
+// instruction description.
+def BFC    : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
+               AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "bfc", "\t$Rd, $imm", "$src = $Rd",
+               [(set GPR:$Rd, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
+               Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<10> imm;
+  let Inst{27-21} = 0b0111110;
+  let Inst{6-0}   = 0b0011111;
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = imm{4-0}; // lsb
+  let Inst{20-16} = imm{9-5}; // msb
+}
+
+// A8.6.18  BFI - Bitfield insert (Encoding A1)
+def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
+          AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+          "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
+          [(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn,
+                           bf_inv_mask_imm:$imm))]>,
+          Requires<[IsARM, HasV6T2]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<10> imm;
+  let Inst{27-21} = 0b0111110;
+  let Inst{6-4}   = 0b001; // Rn: Inst{3-0} != 15
+  let Inst{15-12} = Rd;
+  let Inst{11-7}  = imm{4-0}; // lsb
+  let Inst{20-16} = imm{9-5}; // width
+  let Inst{3-0}   = Rn;
+}
+
+def  MVNr  : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
+                  "mvn", "\t$Rd, $Rm",
+                  [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP {
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{11-4} = 0b00000000;
+  let Inst{15-12} = Rd;
+  let Inst{3-0} = Rm;
+}
+def  MVNsi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
+                  DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+                  [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP {
+  bits<4> Rd;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-5} = shift{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = shift{3-0};
+}
+def  MVNsr  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
+                  DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+                  [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP {
+  bits<4> Rd;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-8} = shift{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = shift{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = shift{3-0};
+}
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm,
+                  IIC_iMVNi, "mvn", "\t$Rd, $imm",
+                  [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP {
+  bits<4> Rd;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{19-16} = 0b0000;
+  let Inst{15-12} = Rd;
+  let Inst{11-0} = imm;
+}
+
+def : ARMPat<(and   GPR:$src, so_imm_not:$imm),
+             (BICri GPR:$src, so_imm_not:$imm)>;
+
+//===----------------------------------------------------------------------===//
+//  Multiply Instructions.
+//
+class AsMul1I32<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = Rd;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+
+// FIXME: The v5 pseudos are only necessary for the additional Constraint
+//        property. Remove them when it's possible to add those properties
+//        on an individual MachineInstr, not just an instruction description.
+let isCommutable = 1, TwoOperandAliasConstraint = "$Rn = $Rd" in {
+def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
+                    (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                    IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
+                  [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
+                  Requires<[IsARM, HasV6]> {
+  let Inst{15-12} = 0b0000;
+  let Unpredictable{15-12} = 0b1111;
+}
+
+let Constraints = "@earlyclobber $Rd" in
+def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
+                                                    pred:$p, cc_out:$s),
+                           4, IIC_iMUL32,
+               [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
+               (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
+               Requires<[IsARM, NoV6, UseMulOps]>;
+}
+
+def MLA  : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                     IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
+                   [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
+                   Requires<[IsARM, HasV6, UseMulOps]> {
+  bits<4> Ra;
+  let Inst{15-12} = Ra;
+}
+
+let Constraints = "@earlyclobber $Rd" in
+def MLAv5: ARMPseudoExpand<(outs GPR:$Rd),
+                           (ins GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s),
+                           4, IIC_iMAC32,
+                        [(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))],
+                  (MLA GPR:$Rd, GPR:$Rn, GPR:$Rm, GPR:$Ra, pred:$p, cc_out:$s)>,
+                        Requires<[IsARM, NoV6]>;
+
+def MLS  : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+                   IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
+                   [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
+                   Requires<[IsARM, HasV6T2, UseMulOps]> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<4> Rn;
+  bits<4> Ra;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+
+// Extra precision multiplies with low / high results
+let neverHasSideEffects = 1 in {
+let isCommutable = 1 in {
+def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
+                                 (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+                    "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]>;
+
+def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
+                                 (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
+                    "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]>;
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+                            4, IIC_iMUL64, [],
+          (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+
+def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                            (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+                            4, IIC_iMUL64, [],
+          (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+}
+}
+
+// Multiply + accumulate
+def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
+                        (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
+                    "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
+                        (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
+                    "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+
+def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
+                               (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+                    "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                    Requires<[IsARM, HasV6]> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = Rm;
+  let Inst{3-0}   = Rn;
+}
+
+let Constraints = "$RLo = $RdLo,$RHi = $RdHi" in {
+def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
+                              4, IIC_iMAC64, [],
+             (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+                           pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
+                              4, IIC_iMAC64, [],
+             (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+                           pred:$p, cc_out:$s)>,
+                           Requires<[IsARM, NoV6]>;
+}
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+def UMAALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
+                              (ins GPR:$Rn, GPR:$Rm, pred:$p),
+                              4, IIC_iMAC64, [],
+          (UMAAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p)>,
+                           Requires<[IsARM, NoV6]>;
+}
+
+} // neverHasSideEffects
+
+// Most significant word multiply
+def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+               IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm",
+               [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>,
+            Requires<[IsARM, HasV6]> {
+  let Inst{15-12} = 0b1111;
+}
+
+def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+               IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
+            Requires<[IsARM, HasV6]> {
+  let Inst{15-12} = 0b1111;
+}
+
+def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
+               [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
+            Requires<[IsARM, HasV6, UseMulOps]>;
+
+def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+            Requires<[IsARM, HasV6]>;
+
+def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
+            Requires<[IsARM, HasV6, UseMulOps]>;
+
+def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
+               (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+               IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+            Requires<[IsARM, HasV6]>;
+
+multiclass AI_smul<string opc, PatFrag opnode> {
+  def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+                                      (sext_inreg GPR:$Rm, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sext_inreg GPR:$Rn, i16),
+                                      (sra GPR:$Rm, (i32 16))))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+                                      (sext_inreg GPR:$Rm, i16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (opnode (sra GPR:$Rn, (i32 16)),
+                                      (sra GPR:$Rm, (i32 16))))]>,
+            Requires<[IsARM, HasV5TE]>;
+
+  def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (sra (opnode GPR:$Rn,
+                                    (sext_inreg GPR:$Rm, i16)), (i32 16)))]>,
+           Requires<[IsARM, HasV5TE]>;
+
+  def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+              IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
+              [(set GPR:$Rd, (sra (opnode GPR:$Rn,
+                                    (sra GPR:$Rm, (i32 16))), (i32 16)))]>,
+            Requires<[IsARM, HasV5TE]>;
+}
+
+
+multiclass AI_smla<string opc, PatFrag opnode> {
+  let DecoderMethod = "DecodeSMLAInstruction" in {
+  def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd, (add GPR:$Ra,
+                               (opnode (sext_inreg GPRnopc:$Rn, i16),
+                                       (sext_inreg GPRnopc:$Rm, i16))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16),
+                                          (sra GPRnopc:$Rm, (i32 16)))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+                                          (sext_inreg GPRnopc:$Rm, i16))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set GPRnopc:$Rd,
+                   (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+                                         (sra GPRnopc:$Rm, (i32 16)))))]>,
+            Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
+                                  (sext_inreg GPRnopc:$Rm, i16)), (i32 16))))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>;
+
+  def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
+              (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+              IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set GPRnopc:$Rd,
+                 (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
+                                    (sra GPRnopc:$Rm, (i32 16))), (i32 16))))]>,
+            Requires<[IsARM, HasV5TE, UseMulOps]>;
+  }
+}
+
+defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+
+// Halfword multiply accumulate long: SMLAL<x><y>.
+def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                      IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+              Requires<[IsARM, HasV5TE]>;
+
+// Helper class for AI_smld.
+class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
+                    InstrItinClass itin, string opc, string asm>
+  : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{27-23} = 0b01110;
+  let Inst{22}    = long;
+  let Inst{21-20} = 0b00;
+  let Inst{11-8}  = Rm;
+  let Inst{7}     = 0;
+  let Inst{6}     = sub;
+  let Inst{5}     = swap;
+  let Inst{4}     = 1;
+  let Inst{3-0}   = Rn;
+}
+class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
+                InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> Rd;
+  let Inst{15-12} = 0b1111;
+  let Inst{19-16} = Rd;
+}
+class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops,
+                InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> Ra;
+  bits<4> Rd;
+  let Inst{19-16} = Rd;
+  let Inst{15-12} = Ra;
+}
+class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
+                  InstrItinClass itin, string opc, string asm>
+  : AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  let Inst{19-16} = RdHi;
+  let Inst{15-12} = RdLo;
+}
+
+multiclass AI_smld<bit sub, string opc> {
+
+  def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
+
+  def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
+
+  def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
+                  !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
+
+  def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+                  (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
+                  !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
+
+}
+
+defm SMLA : AI_smld<0, "smla">;
+defm SMLS : AI_smld<1, "smls">;
+
+multiclass AI_sdml<bit sub, string opc> {
+
+  def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+  def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
+}
+
+defm SMUA : AI_sdml<0, "smua">;
+defm SMUS : AI_sdml<1, "smus">;
+
+//===----------------------------------------------------------------------===//
+//  Division Instructions (ARMv7-A with virtualization extension)
+//
+def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+                   "sdiv", "\t$Rd, $Rn, $Rm",
+                   [(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
+           Requires<[IsARM, HasDivideInARM]>;
+
+def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+                   "udiv", "\t$Rd, $Rn, $Rm",
+                   [(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
+           Requires<[IsARM, HasDivideInARM]>;
+
+//===----------------------------------------------------------------------===//
+//  Misc. Arithmetic Instructions.
+//
+
+def CLZ  : AMiscA1I<0b000010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "clz", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (ctlz GPR:$Rm))]>, Requires<[IsARM, HasV5T]>,
+           Sched<[WriteALU]>;
+
+def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "rbit", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>,
+           Requires<[IsARM, HasV6T2]>,
+           Sched<[WriteALU]>;
+
+def REV  : AMiscA1I<0b01101011, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm),
+              IIC_iUNAr, "rev", "\t$Rd, $Rm",
+              [(set GPR:$Rd, (bswap GPR:$Rm))]>, Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+let AddedComplexity = 5 in
+def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+               IIC_iUNAr, "rev16", "\t$Rd, $Rm",
+               [(set GPR:$Rd, (rotr (bswap GPR:$Rm), (i32 16)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+let AddedComplexity = 5 in
+def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
+               IIC_iUNAr, "revsh", "\t$Rd, $Rm",
+               [(set GPR:$Rd, (sra (bswap GPR:$Rm), (i32 16)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALU]>;
+
+def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)),
+                   (and (srl GPR:$Rm, (i32 8)), 0xFF)),
+               (REVSH GPR:$Rm)>;
+
+def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd),
+                              (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_lsl_amt:$sh),
+               IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+               [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF),
+                                      (and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh),
+                                           0xFFFF0000)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (shl GPRnopc:$Rm, imm16_31:$sh)),
+               (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, imm16_31:$sh)>;
+
+// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
+// will match the pattern below.
+def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
+                              (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_asr_amt:$sh),
+               IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+               [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000),
+                                      (and (sra GPRnopc:$Rm, pkh_asr_amt:$sh),
+                                           0xFFFF)))]>,
+               Requires<[IsARM, HasV6]>,
+           Sched<[WriteALUsi, ReadALU]>;
+
+// Alternate cases for PKHTB where identities eliminate some nodes.  Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (srl GPRnopc:$src2, imm16_31:$sh)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+
+defm CMP  : AI1_cmp_irs<0b1010, "cmp",
+                        IIC_iCMPi, IIC_iCMPr, IIC_iCMPsr,
+                        BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+
+// ARMcmpZ can re-use the above instruction definitions.
+def : ARMPat<(ARMcmpZ GPR:$src, so_imm:$imm),
+             (CMPri   GPR:$src, so_imm:$imm)>;
+def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs),
+             (CMPrr   GPR:$src, GPR:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs),
+             (CMPrsi   GPR:$src, so_reg_imm:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
+             (CMPrsr   GPR:$src, so_reg_reg:$rhs)>;
+
+// CMN register-integer
+let isCompare = 1, Defs = [CPSR] in {
+def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi,
+                "cmn", "\t$Rn, $imm",
+                [(ARMcmn GPR:$Rn, so_imm:$imm)]> {
+  bits<4> Rn;
+  bits<12> imm;
+  let Inst{25} = 1;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-0} = imm;
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+// CMN register-register/shift
+def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr,
+                 "cmn", "\t$Rn, $Rm",
+                 [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                   GPR:$Rn, GPR:$Rm)]> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let isCommutable = 1;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-4} = 0b00000000;
+  let Inst{3-0} = Rm;
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+def CMNzrsi : AI1<0b1011, (outs),
+                  (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr,
+                  "cmn", "\t$Rn, $shift",
+                  [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                    GPR:$Rn, so_reg_imm:$shift)]> {
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-5} = shift{11-5};
+  let Inst{4} = 0;
+  let Inst{3-0} = shift{3-0};
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+def CMNzrsr : AI1<0b1011, (outs),
+                  (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr,
+                  "cmn", "\t$Rn, $shift",
+                  [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                    GPRnopc:$Rn, so_reg_reg:$shift)]> {
+  bits<4> Rn;
+  bits<12> shift;
+  let Inst{25} = 0;
+  let Inst{20} = 1;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b0000;
+  let Inst{11-8} = shift{11-8};
+  let Inst{7} = 0;
+  let Inst{6-5} = shift{6-5};
+  let Inst{4} = 1;
+  let Inst{3-0} = shift{3-0};
+
+  let Unpredictable{15-12} = 0b1111;
+}
+
+}
+
+def : ARMPat<(ARMcmp  GPR:$src, so_imm_neg:$imm),
+             (CMNri   GPR:$src, so_imm_neg:$imm)>;
+
+def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm),
+             (CMNri   GPR:$src, so_imm_neg:$imm)>;
+
+// Note that TST/TEQ don't set all the same flags that CMP does!
+defm TST  : AI1_cmp_irs<0b1000, "tst",
+                        IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+                      BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1>;
+defm TEQ  : AI1_cmp_irs<0b1001, "teq",
+                        IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr,
+                      BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>;
+
+// Pseudo i64 compares for some floating point compares.
+let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
+    Defs = [CPSR] in {
+def BCCi64 : PseudoInst<(outs),
+    (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst),
+     IIC_Br,
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>;
+
+def BCCZi64 : PseudoInst<(outs),
+     (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br,
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>;
+} // usesCustomInserter
+
+
+// Conditional moves
+// FIXME: should be able to write a pattern for ARMcmov, but can't use
+// a two-value operand where a dag node expects two operands. :(
+let neverHasSideEffects = 1 in {
+
+let isCommutable = 1, isSelect = 1 in
+def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p),
+                           4, IIC_iCMOVr,
+  [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
+      RegConstraint<"$false = $Rd">;
+
+def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, so_reg_imm:$shift, pred:$p),
+                           4, IIC_iCMOVsr,
+  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_imm:$shift,
+                            imm:$cc, CCR:$ccr))*/]>,
+      RegConstraint<"$false = $Rd">;
+def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, so_reg_reg:$shift, pred:$p),
+                           4, IIC_iCMOVsr,
+  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
+                            imm:$cc, CCR:$ccr))*/]>,
+      RegConstraint<"$false = $Rd">;
+
+
+let isMoveImm = 1 in
+def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd),
+                             (ins GPR:$false, imm0_65535_expr:$imm, pred:$p),
+                             4, IIC_iMOVi,
+                             []>,
+      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
+
+let isMoveImm = 1 in
+def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, so_imm:$imm, pred:$p),
+                           4, IIC_iCMOVi,
+   [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
+      RegConstraint<"$false = $Rd">;
+
+// Two instruction predicate mov immediate.
+let isMoveImm = 1 in
+def MOVCCi32imm : ARMPseudoInst<(outs GPR:$Rd),
+                                (ins GPR:$false, i32imm:$src, pred:$p),
+                  8, IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">;
+
+let isMoveImm = 1 in
+def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, so_imm:$imm, pred:$p),
+                           4, IIC_iCMOVi,
+ [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $Rd">;
+
+} // neverHasSideEffects
+
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+def MemBarrierOptOperand : AsmOperandClass {
+  let Name = "MemBarrierOpt";
+  let ParserMethod = "parseMemBarrierOptOperand";
+}
+def memb_opt : Operand<i32> {
+  let PrintMethod = "printMemBOption";
+  let ParserMatchClass = MemBarrierOptOperand;
+  let DecoderMethod = "DecodeMemBarrierOption";
+}
+
+// memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff05;
+  let Inst{3-0} = opt;
+}
+}
+
+def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "dsb", "\t$opt", []>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff04;
+  let Inst{3-0} = opt;
+}
+
+// ISB has only full system option
+def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+                "isb", "\t$opt", []>,
+                Requires<[IsARM, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf57ff06;
+  let Inst{3-0} = opt;
+}
+
+// Pseudo instruction that combines movs + predicated rsbmi
+// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR] in
+def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
+
+let usesCustomInserter = 1 in {
+  let Defs = [CPSR] in {
+    def ATOMIC_LOAD_ADD_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_SUB_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_AND_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_OR_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_XOR_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_NAND_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_MIN_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_MAX_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMIN_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_umin_8 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMAX_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_umax_8 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_ADD_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_SUB_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_AND_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_OR_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_XOR_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_NAND_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_MIN_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_MAX_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMIN_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_umin_16 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMAX_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_umax_16 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_ADD_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_SUB_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_AND_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_OR_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_XOR_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_NAND_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_MIN_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_MAX_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMIN_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_umin_32 GPR:$ptr, GPR:$val))]>;
+    def ATOMIC_LOAD_UMAX_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
+      [(set GPR:$dst, (atomic_load_umax_32 GPR:$ptr, GPR:$val))]>;
+
+    def ATOMIC_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
+      [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>;
+    def ATOMIC_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
+      [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>;
+    def ATOMIC_SWAP_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
+      [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>;
+
+    def ATOMIC_CMP_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
+      [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>;
+    def ATOMIC_CMP_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
+      [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>;
+    def ATOMIC_CMP_SWAP_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
+      [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>;
+}
+}
+
+let usesCustomInserter = 1 in {
+    def COPY_STRUCT_BYVAL_I32 : PseudoInst<
+      (outs), (ins GPR:$dst, GPR:$src, i32imm:$size, i32imm:$alignment),
+      NoItinerary,
+      [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
+}
+
+let mayLoad = 1 in {
+def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary,
+                    "ldrexb", "\t$Rt, $addr", []>;
+def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrexh", "\t$Rt, $addr", []>;
+def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldrex", "\t$Rt, $addr", []>;
+let hasExtraDefRegAllocReq = 1 in
+def LDREXD: AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+                      NoItinerary, "ldrexd", "\t$Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegLoad";
+}
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
+def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strexb", "\t$Rd, $Rt, $addr", []>;
+def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>;
+def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>;
+let hasExtraSrcRegAllocReq = 1 in
+def STREXD : AIstrex<0b01, (outs GPR:$Rd),
+                    (ins GPRPairOp:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegStore";
+}
+}
+
+
+def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>,
+            Requires<[IsARM, HasV7]>  {
+  let Inst{31-0} = 0b11110101011111111111000000011111;
+}
+
+// SWP/SWPB are deprecated in V6/V7.
+let mayLoad = 1, mayStore = 1 in {
+def SWP : AIswp<0, (outs GPRnopc:$Rt),
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>;
+def SWPB: AIswp<1, (outs GPRnopc:$Rt),
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor Instructions.
+//
+
+def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+            c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+            NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+            [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                          imm:$CRm, imm:$opc2)]> {
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
+
+def CDP2 : ABXI<0b1110, (outs), (ins pf_imm:$cop, imm0_15:$opc1,
+               c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+               NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+               [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                              imm:$CRm, imm:$opc2)]> {
+  let Inst{31-28} = 0b1111;
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
+
+class ACI<dag oops, dag iops, string opc, string asm,
+          IndexMode im = IndexModeNone>
+  : I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+      opc, asm, "", []> {
+  let Inst{27-25} = 0b110;
+}
+class ACInoP<dag oops, dag iops, string opc, string asm,
+          IndexMode im = IndexModeNone>
+  : InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+         opc, asm, "", []> {
+  let Inst{31-28} = 0b1111;
+  let Inst{27-25} = 0b110;
+}
+multiclass LdStCop<bit load, bit Dbit, string asm> {
+  def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                    asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                 asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                              postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : ACI<(outs),
+                    (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                         coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
+  def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                       asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                    asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                                 postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : ACInoP<(outs),
+                       (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                            coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+
+defm LDC   : LdStCop <1, 0, "ldc">;
+defm LDCL  : LdStCop <1, 1, "ldcl">;
+defm STC   : LdStCop <0, 0, "stc">;
+defm STCL  : LdStCop <0, 1, "stcl">;
+defm LDC2  : LdSt2Cop<1, 0, "ldc2">;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l">;
+defm STC2  : LdSt2Cop<0, 0, "stc2">;
+defm STC2L : LdSt2Cop<0, 1, "stc2l">;
+
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register.
+//
+
+class MovRCopro<string opc, bit direction, dag oops, dag iops,
+                list<dag> pattern>
+  : ABI<0b1110, oops, iops, NoItinerary, opc,
+        "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2", pattern> {
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
+                    (outs),
+                    (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, imm0_7:$opc2),
+                    [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]>;
+def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                        c_imm:$CRm, 0, pred:$p)>;
+def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
+                    (outs GPR:$Rt),
+                    (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+                         imm0_7:$opc2), []>;
+def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MRC GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                        c_imm:$CRm, 0, pred:$p)>;
+
+def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+             (MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class MovRCopro2<string opc, bit direction, dag oops, dag iops,
+                 list<dag> pattern>
+  : ABXI<0b1110, oops, iops, NoItinerary,
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> {
+  let Inst{31-28} = 0b1111;
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
+                      (outs),
+                      (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                           c_imm:$CRm, imm0_7:$opc2),
+                      [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                                     imm:$CRm, imm:$opc2)]>;
+def : ARMInstAlias<"mcr2$ $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, 0)>;
+def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
+                      (outs GPR:$Rt),
+                      (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+                           imm0_7:$opc2), []>;
+def : ARMInstAlias<"mrc2$ $cop, $opc1, $Rt, $CRn, $CRm",
+                   (MRC2 GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                         c_imm:$CRm, 0)>;
+
+def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
+                              imm:$CRm, imm:$opc2),
+                (MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+class MovRRCopro<string opc, bit direction, list<dag> pattern = []>
+  : ABI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+        GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm),
+        NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+}
+
+def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
+                      [(int_arm_mcrr imm:$cop, imm:$opc1, GPRnopc:$Rt,
+                                     GPRnopc:$Rt2, imm:$CRm)]>;
+def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
+
+class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
+  : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+         GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary,
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
+  let Inst{31-28} = 0b1111;
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+
+  let DecoderMethod = "DecodeMRRC2";
+}
+
+def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
+                        [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPRnopc:$Rt,
+                                        GPRnopc:$Rt2, imm:$CRm)]>;
+def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register
+//
+
+// Move to ARM core register from Special Register
+def MRS : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
+              "mrs", "\t$Rd, apsr", []> {
+  bits<4> Rd;
+  let Inst{23-16} = 0b00001111;
+  let Unpredictable{19-17} = 0b111;
+
+  let Inst{15-12} = Rd;
+
+  let Inst{11-0} = 0b000000000000;
+  let Unpredictable{11-0} = 0b110100001111;
+}
+
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPRnopc:$Rd, pred:$p)>,
+         Requires<[IsARM]>;
+
+// The MRSsys instruction is the MRS instruction from the ARM ARM,
+// section B9.3.9, with the R bit set to 1.
+def MRSsys : ABI<0b0001, (outs GPRnopc:$Rd), (ins), NoItinerary,
+                 "mrs", "\t$Rd, spsr", []> {
+  bits<4> Rd;
+  let Inst{23-16} = 0b01001111;
+  let Unpredictable{19-16} = 0b1111;
+
+  let Inst{15-12} = Rd;
+
+  let Inst{11-0} = 0b000000000000;
+  let Unpredictable{11-0} = 0b110100001111;
+}
+
+// Move from ARM core register to Special Register
+//
+// No need to have both system and application versions, the encodings are the
+// same and the assembly parser has no way to distinguish between them. The mask
+// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
+// the mask with the fields to be accessed in the special register.
+def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
+              "msr", "\t$mask, $Rn", []> {
+  bits<5> mask;
+  bits<4> Rn;
+
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-4} = 0b00000000;
+  let Inst{3-0} = Rn;
+}
+
+def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask,  so_imm:$a), NoItinerary,
+               "msr", "\t$mask, $a", []> {
+  bits<5> mask;
+  bits<12> a;
+
+  let Inst{23} = 0;
+  let Inst{22} = mask{4}; // R bit
+  let Inst{21-20} = 0b10;
+  let Inst{19-16} = mask{3-0};
+  let Inst{15-12} = 0b1111;
+  let Inst{11-0} = a;
+}
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right,
+// complete with fixup for the aeabi_read_tp function.
+let isCall = 1,
+  Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
+  def TPsoft : PseudoInst<(outs), (ins), IIC_Br,
+               [(set R0, ARMthread_pointer)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved resgisters, which is exactly what we want.
+//   A constant value is passed in $val, and we use the location as a scratch.
+//
+// These are pseudo-instructions and are lowered to individual MC-insts, so
+// no encoding information is necessary.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 ],
+  hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def Int_eh_sjlj_setjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                               NoItinerary,
+                         [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
+                           Requires<[IsARM, HasVFP2]>;
+}
+
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
+  hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def Int_eh_sjlj_setjmp_nofp : PseudoInst<(outs), (ins GPR:$src, GPR:$val),
+                                   NoItinerary,
+                         [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>,
+                                Requires<[IsARM, NoVFP]>;
+}
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1,
+    Defs = [ R7, LR, SP ] in {
+def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
+                             NoItinerary,
+                         [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+                                Requires<[IsARM, IsIOS]>;
+}
+
+// eh.sjlj.dispatchsetup pseudo-instruction.
+// This pseudo is used for both ARM and Thumb. Any differences are handled when
+// the pseudo is expanded (which happens before any passes that need the
+// instruction size).
+let isBarrier = 1 in
+def Int_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>;
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// ARMv4 indirect branch using (MOVr PC, dst)
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
+  def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst),
+                    4, IIC_Br, [(brind GPR:$dst)],
+                    (MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>,
+                  Requires<[IsARM, NoV4T]>;
+
+// Large immediate handling.
+
+// 32-bit immediate using two piece so_imms or movw + movt.
+// This is a single pseudo instruction, the benefit is that it can be remat'd
+// as a single unit instead of having to handle reg inputs.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def MOVi32imm : PseudoInst<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                           [(set GPR:$dst, (arm_i32imm:$src))]>,
+                           Requires<[IsARM]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if PIC).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly the instructions.
+let isReMaterializable = 1 in {
+def MOV_ga_pcrel : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                              IIC_iMOVix2addpc,
+                        [(set GPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+                        Requires<[IsARM, UseMovt]>;
+
+def MOV_ga_dyn : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                             IIC_iMOVix2,
+                        [(set GPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>,
+                        Requires<[IsARM, UseMovt]>;
+
+let AddedComplexity = 10 in
+def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr),
+                                IIC_iMOVix2ld,
+                    [(set GPR:$dst, (load (ARMWrapperPIC tglobaladdr:$addr)))]>,
+                    Requires<[IsARM, UseMovt]>;
+} // isReMaterializable
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : ARMPat<(ARMWrapper  tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>,
+            Requires<[IsARM, DontUseMovt]>;
+def : ARMPat<(ARMWrapper  tconstpool  :$dst), (LEApcrel tconstpool  :$dst)>;
+def : ARMPat<(ARMWrapper  tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>,
+            Requires<[IsARM, UseMovt]>;
+def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+             (LEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// TODO: add,sub,and, 3-instr forms?
+
+// Tail calls. These patterns also apply to Thumb mode.
+def : Pat<(ARMtcret tcGPR:$dst), (TCRETURNri tcGPR:$dst)>;
+def : Pat<(ARMtcret (i32 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
+def : Pat<(ARMtcret (i32 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
+
+// Direct calls
+def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
+def : ARMPat<(ARMcall_nolink texternalsym:$func),
+             (BMOVPCB_CALL texternalsym:$func)>;
+
+// zextload i1 -> zextload i8
+def : ARMPat<(zextloadi1 addrmode_imm12:$addr), (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(zextloadi1 ldst_so_reg:$addr),    (LDRBrs ldst_so_reg:$addr)>;
+
+// extload -> zextload
+def : ARMPat<(extloadi1 addrmode_imm12:$addr),  (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(extloadi1 ldst_so_reg:$addr),     (LDRBrs ldst_so_reg:$addr)>;
+def : ARMPat<(extloadi8 addrmode_imm12:$addr),  (LDRBi12 addrmode_imm12:$addr)>;
+def : ARMPat<(extloadi8 ldst_so_reg:$addr),     (LDRBrs ldst_so_reg:$addr)>;
+
+def : ARMPat<(extloadi16 addrmode3:$addr),  (LDRH addrmode3:$addr)>;
+
+def : ARMPat<(extloadi8  addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>;
+def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
+
+// smul* and smla*
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                      (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                 (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
+                 (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                      (sra GPR:$b, (i32 16))),
+                 (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
+                 (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)),
+                      (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                 (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
+                (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                      (i32 16)),
+                 (SMULWB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)),
+                 (SMULWB GPR:$a, GPR:$b)>;
+
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                           (sra (shl GPR:$b, (i32 16)), (i32 16)))),
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, sext_16_node:$b)),
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                           (sra GPR:$b, (i32 16)))),
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul (sra GPR:$a, (i32 16)),
+                           (sra (shl GPR:$b, (i32 16)), (i32 16)))),
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                           (i32 16))),
+                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5MOPat<(add GPR:$acc,
+                      (sra (mul GPR:$a, sext_16_node:$b), (i32 16))),
+                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+
+
+// Pre-v7 uses MCR for synchronization barriers.
+def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>,
+         Requires<[IsARM, HasV6]>;
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : ARMV6Pat<(and GPR:$Src, 0x000000FF), (UXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x0000FFFF), (UXTH GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x00FF00FF), (UXTB16 GPR:$Src, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0x00FF)),
+               (UXTAB GPR:$Rn, GPR:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0xFFFF)),
+               (UXTAH GPR:$Rn, GPR:$Rm, 0)>;
+}
+
+def : ARMV6Pat<(sext_inreg GPR:$Src, i8),  (SXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(sext_inreg GPR:$Src, i16), (SXTH GPR:$Src, 0)>;
+
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i8)),
+               (SXTAB GPR:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i16)),
+               (SXTAH GPR:$Rn, GPRnopc:$Rm, 0)>;
+
+// Atomic load/store patterns
+def : ARMPat<(atomic_load_8 ldst_so_reg:$src),
+             (LDRBrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_8 addrmode_imm12:$src),
+             (LDRBi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_load_16 addrmode3:$src),
+             (LDRH addrmode3:$src)>;
+def : ARMPat<(atomic_load_32 ldst_so_reg:$src),
+             (LDRrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_32 addrmode_imm12:$src),
+             (LDRi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_store_8 ldst_so_reg:$ptr, GPR:$val),
+             (STRBrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_8 addrmode_imm12:$ptr, GPR:$val),
+             (STRBi12 GPR:$val, addrmode_imm12:$ptr)>;
+def : ARMPat<(atomic_store_16 addrmode3:$ptr, GPR:$val),
+             (STRH GPR:$val, addrmode3:$ptr)>;
+def : ARMPat<(atomic_store_32 ldst_so_reg:$ptr, GPR:$val),
+             (STRrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_32 addrmode_imm12:$ptr, GPR:$val),
+             (STRi12 GPR:$val, addrmode_imm12:$ptr)>;
+
+
+//===----------------------------------------------------------------------===//
+// Thumb Support
+//
+
+include "ARMInstrThumb.td"
+
+//===----------------------------------------------------------------------===//
+// Thumb2 Support
+//
+
+include "ARMInstrThumb2.td"
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//
+
+include "ARMInstrVFP.td"
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON) Support
+//
+
+include "ARMInstrNEON.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Memory barriers
+def : InstAlias<"dmb", (DMB 0xf)>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"dsb", (DSB 0xf)>, Requires<[IsARM, HasDB]>;
+def : InstAlias<"isb", (ISB 0xf)>, Requires<[IsARM, HasDB]>;
+
+// System instructions
+def : MnemonicAlias<"swi", "svc">;
+
+// Load / Store Multiple
+def : MnemonicAlias<"ldmfd", "ldm">;
+def : MnemonicAlias<"ldmia", "ldm">;
+def : MnemonicAlias<"ldmea", "ldmdb">;
+def : MnemonicAlias<"stmfd", "stmdb">;
+def : MnemonicAlias<"stmia", "stm">;
+def : MnemonicAlias<"stmea", "stm">;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
+// shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+                (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+        Requires<[IsARM, HasV6]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+                (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+        Requires<[IsARM, HasV6]>;
+
+// PUSH/POP aliases for STM/LDM
+def : ARMInstAlias<"push${p} $regs", (STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : ARMInstAlias<"pop${p} $regs", (LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// SSAT/USAT optional shift operand.
+def : ARMInstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+                (SSAT GPRnopc:$Rd, imm1_32:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+def : ARMInstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+                (USAT GPRnopc:$Rd, imm0_31:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+
+
+// Extend instruction optional rotate operand.
+def : ARMInstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+                (SXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+                (SXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+                (SXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb${p} $Rd, $Rm",
+                (SXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb16${p} $Rd, $Rm",
+                (SXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxth${p} $Rd, $Rm",
+                (SXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+def : ARMInstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+                (UXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+                (UXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+                (UXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb${p} $Rd, $Rm",
+                (UXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb16${p} $Rd, $Rm",
+                (UXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxth${p} $Rd, $Rm",
+                (UXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+
+// RFE aliases
+def : MnemonicAlias<"rfefa", "rfeda">;
+def : MnemonicAlias<"rfeea", "rfedb">;
+def : MnemonicAlias<"rfefd", "rfeia">;
+def : MnemonicAlias<"rfeed", "rfeib">;
+def : MnemonicAlias<"rfe", "rfeia">;
+
+// SRS aliases
+def : MnemonicAlias<"srsfa", "srsda">;
+def : MnemonicAlias<"srsea", "srsdb">;
+def : MnemonicAlias<"srsfd", "srsia">;
+def : MnemonicAlias<"srsed", "srsib">;
+def : MnemonicAlias<"srs", "srsia">;
+
+// QSAX == QSUBADDX
+def : MnemonicAlias<"qsubaddx", "qsax">;
+// SASX == SADDSUBX
+def : MnemonicAlias<"saddsubx", "sasx">;
+// SHASX == SHADDSUBX
+def : MnemonicAlias<"shaddsubx", "shasx">;
+// SHSAX == SHSUBADDX
+def : MnemonicAlias<"shsubaddx", "shsax">;
+// SSAX == SSUBADDX
+def : MnemonicAlias<"ssubaddx", "ssax">;
+// UASX == UADDSUBX
+def : MnemonicAlias<"uaddsubx", "uasx">;
+// UHASX == UHADDSUBX
+def : MnemonicAlias<"uhaddsubx", "uhasx">;
+// UHSAX == UHSUBADDX
+def : MnemonicAlias<"uhsubaddx", "uhsax">;
+// UQASX == UQADDSUBX
+def : MnemonicAlias<"uqaddsubx", "uqasx">;
+// UQSAX == UQSUBADDX
+def : MnemonicAlias<"uqsubaddx", "uqsax">;
+// USAX == USUBADDX
+def : MnemonicAlias<"usubaddx", "usax">;
+
+// "mov Rd, so_imm_not" can be handled via "mvn" in assembly, just like
+// for isel.
+def : ARMInstAlias<"mov${s}${p} $Rd, $imm",
+                   (MVNi rGPR:$Rd, so_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"mvn${s}${p} $Rd, $imm",
+                   (MOVi rGPR:$Rd, so_imm_not:$imm, pred:$p, cc_out:$s)>;
+// Same for AND <--> BIC
+def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+                   (ANDri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"bic${s}${p} $Rdn, $imm",
+                   (ANDri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm",
+                   (BICri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"and${s}${p} $Rdn, $imm",
+                   (BICri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                          pred:$p, cc_out:$s)>;
+
+// Likewise, "add Rd, so_imm_neg" -> sub
+def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm",
+                 (SUBri GPR:$Rd, GPR:$Rn, so_imm_neg:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstAlias<"add${s}${p} $Rd, $imm",
+                 (SUBri GPR:$Rd, GPR:$Rd, so_imm_neg:$imm, pred:$p, cc_out:$s)>;
+// Same for CMP <--> CMN via so_imm_neg
+def : ARMInstAlias<"cmp${p} $Rd, $imm",
+                   (CMNri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>;
+def : ARMInstAlias<"cmn${p} $Rd, $imm",
+                   (CMPri rGPR:$Rd, so_imm_neg:$imm, pred:$p)>;
+
+// The shifter forms of the MOV instruction are aliased to the ASR, LSL,
+// LSR, ROR, and RRX instructions.
+// FIXME: We need C++ parser hooks to map the alias to the MOV
+//        encoding. It seems we should be able to do that sort of thing
+//        in tblgen, but it could get ugly.
+let TwoOperandAliasConstraint = "$Rm = $Rd" in {
+def ASRi : ARMAsmPseudo<"asr${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
+                             cc_out:$s)>;
+def LSRi : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_32:$imm, pred:$p,
+                             cc_out:$s)>;
+def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
+                             cc_out:$s)>;
+def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm",
+                        (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p,
+                             cc_out:$s)>;
+}
+def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+let TwoOperandAliasConstraint = "$Rn = $Rd" in {
+def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+def LSRr : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm",
+                        (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
+                             cc_out:$s)>;
+}
+
+// "neg" is and alias for "rsb rd, rn, #0"
+def : ARMInstAlias<"neg${s}${p} $Rd, $Rm",
+                   (RSBri GPR:$Rd, GPR:$Rm, 0, pred:$p, cc_out:$s)>;
+
+// Pre-v6, 'mov r0, r0' was used as a NOP encoding.
+def : InstAlias<"nop${p}", (MOVr R0, R0, pred:$p, zero_reg)>,
+         Requires<[IsARM, NoV6]>;
+
+// UMULL/SMULL are available on all arches, but the instruction definitions
+// need difference constraints pre-v6. Use these aliases for the assembly
+// parsing on pre-v6.
+def : InstAlias<"smull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+         Requires<[IsARM, NoV6]>;
+def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
+            (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+         Requires<[IsARM, NoV6]>;
+
+// 'it' blocks in ARM mode just validate the predicates. The IT itself
+// is discarded.
+def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
new file mode 100644
index 000000000000..0411ac4e282a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -0,0 +1,6806 @@
+//===-- ARMInstrNEON.td - NEON support for ARM -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM NEON instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// NEON-specific Operands.
+//===----------------------------------------------------------------------===//
+def nModImm : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+}
+
+def nImmSplatI8AsmOperand : AsmOperandClass { let Name = "NEONi8splat"; }
+def nImmSplatI8 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI8AsmOperand;
+}
+def nImmSplatI16AsmOperand : AsmOperandClass { let Name = "NEONi16splat"; }
+def nImmSplatI16 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI16AsmOperand;
+}
+def nImmSplatI32AsmOperand : AsmOperandClass { let Name = "NEONi32splat"; }
+def nImmSplatI32 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI32AsmOperand;
+}
+def nImmVMOVI32AsmOperand : AsmOperandClass { let Name = "NEONi32vmov"; }
+def nImmVMOVI32 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI32AsmOperand;
+}
+def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
+def nImmVMOVI32Neg : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI32NegAsmOperand;
+}
+def nImmVMOVF32 : Operand<i32> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; }
+def nImmSplatI64 : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmSplatI64AsmOperand;
+}
+
+def VectorIndex8Operand  : AsmOperandClass { let Name = "VectorIndex8"; }
+def VectorIndex16Operand : AsmOperandClass { let Name = "VectorIndex16"; }
+def VectorIndex32Operand : AsmOperandClass { let Name = "VectorIndex32"; }
+def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = VectorIndex8Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 4;
+}]> {
+  let ParserMatchClass = VectorIndex16Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint64_t)Imm) < 2;
+}]> {
+  let ParserMatchClass = VectorIndex32Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i32imm);
+}
+
+// Register list of one D register.
+def VecListOneDAsmOperand : AsmOperandClass {
+  let Name = "VecListOneD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListOneD : RegisterOperand<DPR, "printVectorListOne"> {
+  let ParserMatchClass = VecListOneDAsmOperand;
+}
+// Register list of two sequential D registers.
+def VecListDPairAsmOperand : AsmOperandClass {
+  let Name = "VecListDPair";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPair : RegisterOperand<DPair, "printVectorListTwo"> {
+  let ParserMatchClass = VecListDPairAsmOperand;
+}
+// Register list of three sequential D registers.
+def VecListThreeDAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeD : RegisterOperand<DPR, "printVectorListThree"> {
+  let ParserMatchClass = VecListThreeDAsmOperand;
+}
+// Register list of four sequential D registers.
+def VecListFourDAsmOperand : AsmOperandClass {
+  let Name = "VecListFourD";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourD : RegisterOperand<DPR, "printVectorListFour"> {
+  let ParserMatchClass = VecListFourDAsmOperand;
+}
+// Register list of two D registers spaced by 2 (two sequential Q registers).
+def VecListDPairSpacedAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairSpaced";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairSpaced : RegisterOperand<DPair, "printVectorListTwoSpaced"> {
+  let ParserMatchClass = VecListDPairSpacedAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three Q registers).
+def VecListThreeQAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQ";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQ : RegisterOperand<DPR, "printVectorListThreeSpaced"> {
+  let ParserMatchClass = VecListThreeQAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three Q registers).
+def VecListFourQAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQ";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourQ : RegisterOperand<DPR, "printVectorListFourSpaced"> {
+  let ParserMatchClass = VecListFourQAsmOperand;
+}
+
+// Register list of one D register, with "all lanes" subscripting.
+def VecListOneDAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListOneDAllLanes : RegisterOperand<DPR, "printVectorListOneAllLanes"> {
+  let ParserMatchClass = VecListOneDAllLanesAsmOperand;
+}
+// Register list of two D registers, with "all lanes" subscripting.
+def VecListDPairAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairAllLanes : RegisterOperand<DPair,
+                                           "printVectorListTwoAllLanes"> {
+  let ParserMatchClass = VecListDPairAllLanesAsmOperand;
+}
+// Register list of two D registers spaced by 2 (two sequential Q registers).
+def VecListDPairSpacedAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListDPairSpacedAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListDPairSpacedAllLanes : RegisterOperand<DPair,
+                                         "printVectorListTwoSpacedAllLanes"> {
+  let ParserMatchClass = VecListDPairSpacedAllLanesAsmOperand;
+}
+// Register list of three D registers, with "all lanes" subscripting.
+def VecListThreeDAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeDAllLanes : RegisterOperand<DPR,
+                                            "printVectorListThreeAllLanes"> {
+  let ParserMatchClass = VecListThreeDAllLanesAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three sequential Q regs).
+def VecListThreeQAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQAllLanes : RegisterOperand<DPR,
+                                         "printVectorListThreeSpacedAllLanes"> {
+  let ParserMatchClass = VecListThreeQAllLanesAsmOperand;
+}
+// Register list of four D registers, with "all lanes" subscripting.
+def VecListFourDAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourDAllLanes : RegisterOperand<DPR, "printVectorListFourAllLanes"> {
+  let ParserMatchClass = VecListFourDAllLanesAsmOperand;
+}
+// Register list of four D registers spaced by 2 (four sequential Q regs).
+def VecListFourQAllLanesAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQAllLanes";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListOperands";
+}
+def VecListFourQAllLanes : RegisterOperand<DPR,
+                                         "printVectorListFourSpacedAllLanes"> {
+  let ParserMatchClass = VecListFourQAllLanesAsmOperand;
+}
+
+
+// Register list of one D register, with byte lane subscripting.
+def VecListOneDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListOneDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListOneDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListOneDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListOneDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListOneDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListOneDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListOneDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+// Register list of two D registers with byte lane subscripting.
+def VecListTwoDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListTwoDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListTwoDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of two Q registers with half-word lane subscripting.
+def VecListTwoQHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoQHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoQHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoQHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListTwoQWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListTwoQWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListTwoQWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListTwoQWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+
+// Register list of three D registers with byte lane subscripting.
+def VecListThreeDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListThreeDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListThreeDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of three Q registers with half-word lane subscripting.
+def VecListThreeQHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeQHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeQHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListThreeQWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListThreeQWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListThreeQWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListThreeQWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+// Register list of four D registers with byte lane subscripting.
+def VecListFourDByteIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDByteIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDByteIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourDByteIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with half-word lane subscripting.
+def VecListFourDHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourDHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListFourDWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourDWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourDWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourDWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// Register list of four Q registers with half-word lane subscripting.
+def VecListFourQHWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQHWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourQHWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourQHWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+// ...with word lane subscripting.
+def VecListFourQWordIndexAsmOperand : AsmOperandClass {
+  let Name = "VecListFourQWordIndexed";
+  let ParserMethod = "parseVectorList";
+  let RenderMethod = "addVecListIndexedOperands";
+}
+def VecListFourQWordIndexed : Operand<i32> {
+  let ParserMatchClass = VecListFourQWordIndexAsmOperand;
+  let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
+}
+
+def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 8;
+}]>;
+def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 4;
+}]>;
+def word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 4;
+}]>;
+def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 2;
+}]>;
+def hword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                 (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 2;
+}]>;
+def byte_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() == 1;
+}]>;
+def byte_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() == 1;
+}]>;
+def non_word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def non_word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                                    (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// NEON-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTARMVCMP    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+def SDTARMVCMPZ   : SDTypeProfile<1, 1, []>;
+
+def NEONvceq      : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
+def NEONvceqz     : SDNode<"ARMISD::VCEQZ", SDTARMVCMPZ>;
+def NEONvcge      : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
+def NEONvcgez     : SDNode<"ARMISD::VCGEZ", SDTARMVCMPZ>;
+def NEONvclez     : SDNode<"ARMISD::VCLEZ", SDTARMVCMPZ>;
+def NEONvcgeu     : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
+def NEONvcgt      : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
+def NEONvcgtz     : SDNode<"ARMISD::VCGTZ", SDTARMVCMPZ>;
+def NEONvcltz     : SDNode<"ARMISD::VCLTZ", SDTARMVCMPZ>;
+def NEONvcgtu     : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
+def NEONvtst      : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+
+// Types for vector shift by immediates.  The "SHX" version is for long and
+// narrow operations where the source and destination vectors have different
+// types.  The "SHINS" version is for shift and insert operations.
+def SDTARMVSH     : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisVT<2, i32>]>;
+def SDTARMVSHX    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                         SDTCisVT<2, i32>]>;
+def SDTARMVSHINS  : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+
+def NEONvshl      : SDNode<"ARMISD::VSHL", SDTARMVSH>;
+def NEONvshrs     : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
+def NEONvshru     : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
+def NEONvshlls    : SDNode<"ARMISD::VSHLLs", SDTARMVSHX>;
+def NEONvshllu    : SDNode<"ARMISD::VSHLLu", SDTARMVSHX>;
+def NEONvshlli    : SDNode<"ARMISD::VSHLLi", SDTARMVSHX>;
+def NEONvshrn     : SDNode<"ARMISD::VSHRN", SDTARMVSHX>;
+
+def NEONvrshrs    : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
+def NEONvrshru    : SDNode<"ARMISD::VRSHRu", SDTARMVSH>;
+def NEONvrshrn    : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>;
+
+def NEONvqshls    : SDNode<"ARMISD::VQSHLs", SDTARMVSH>;
+def NEONvqshlu    : SDNode<"ARMISD::VQSHLu", SDTARMVSH>;
+def NEONvqshlsu   : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>;
+def NEONvqshrns   : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>;
+def NEONvqshrnu   : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>;
+def NEONvqshrnsu  : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>;
+
+def NEONvqrshrns  : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>;
+def NEONvqrshrnu  : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>;
+def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>;
+
+def NEONvsli      : SDNode<"ARMISD::VSLI", SDTARMVSHINS>;
+def NEONvsri      : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
+
+def SDTARMVGETLN  : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+                                         SDTCisVT<2, i32>]>;
+def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
+def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
+
+def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+def NEONvmovImm   : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
+def NEONvmvnImm   : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
+def NEONvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;
+
+def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                           SDTCisVT<2, i32>]>;
+def NEONvorrImm   : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
+def NEONvbicImm   : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
+
+def NEONvbsl      : SDNode<"ARMISD::VBSL",
+                           SDTypeProfile<1, 3, [SDTCisVec<0>,
+                                                SDTCisSameAs<0, 1>,
+                                                SDTCisSameAs<0, 2>,
+                                                SDTCisSameAs<0, 3>]>>;
+
+def NEONvdup      : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
+
+// VDUPLANE can produce a quad-register result from a double-register source,
+// so the result is not constrained to match the source.
+def NEONvduplane  : SDNode<"ARMISD::VDUPLANE",
+                           SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                                SDTCisVT<2, i32>]>>;
+
+def SDTARMVEXT    : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+def NEONvext      : SDNode<"ARMISD::VEXT", SDTARMVEXT>;
+
+def SDTARMVSHUF   : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def NEONvrev64    : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
+def NEONvrev32    : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
+def NEONvrev16    : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
+
+def SDTARMVSHUF2  : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>,
+                                         SDTCisSameAs<0, 3>]>;
+def NEONzip       : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
+def NEONuzp       : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
+def NEONtrn       : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
+
+def SDTARMVMULL   : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                         SDTCisSameAs<1, 2>]>;
+def NEONvmulls    : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
+def NEONvmullu    : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+
+def SDTARMFMAX    : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>]>;
+def NEONfmax      : SDNode<"ARMISD::FMAX", SDTARMFMAX>;
+def NEONfmin      : SDNode<"ARMISD::FMIN", SDTARMFMAX>;
+
+def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+  unsigned EltBits = 0;
+  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  return (EltBits == 32 && EltVal == 0);
+}]>;
+
+def NEONimmAllOnesV: PatLeaf<(NEONvmovImm (i32 timm)), [{
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+  unsigned EltBits = 0;
+  uint64_t EltVal = ARM_AM::decodeNEONModImm(ConstVal->getZExtValue(), EltBits);
+  return (EltBits == 8 && EltVal == 0xff);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// NEON load / store instructions
+//===----------------------------------------------------------------------===//
+
+// Use VLDM to load a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VLDMD after reg alloc.
+def VLDMQIA
+  : PseudoVFPLdStM<(outs DPair:$dst), (ins GPR:$Rn),
+                    IIC_fpLoad_m, "",
+                   [(set DPair:$dst, (v2f64 (load GPR:$Rn)))]>;
+
+// Use VSTM to store a Q register as a D register pair.
+// This is a pseudo instruction that is expanded to VSTMD after reg alloc.
+def VSTMQIA
+  : PseudoVFPLdStM<(outs), (ins DPair:$src, GPR:$Rn),
+                    IIC_fpStore_m, "",
+                   [(store (v2f64 DPair:$src), GPR:$Rn)]>;
+
+// Classes for VLD* pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VLDQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst), (ins addrmode6:$addr), itin, "">;
+class VLDQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset), itin,
+                "$addr.addr = $wb">;
+class VLDQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr), itin,
+                "$addr.addr = $wb">;
+class VLDQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset), itin,
+                "$addr.addr = $wb">;
+
+class VLDQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst), (ins addrmode6:$addr), itin, "">;
+class VLDQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset), itin,
+                "$addr.addr = $wb">;
+class VLDQQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr), itin,
+                "$addr.addr = $wb">;
+class VLDQQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset), itin,
+                "$addr.addr = $wb">;
+
+
+class VLDQQQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin,
+                "$src = $dst">;
+class VLDQQQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
+                "$addr.addr = $wb, $src = $dst">;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+
+//   VLD1     : Vector Load (multiple single elements)
+class VLD1D<bits<4> op7_4, string Dt>
+  : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd),
+          (ins addrmode6:$Rn), IIC_VLD1,
+          "vld1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
+}
+class VLD1Q<bits<4> op7_4, string Dt>
+  : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd),
+          (ins addrmode6:$Rn), IIC_VLD1x2,
+          "vld1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
+}
+
+def  VLD1d8   : VLD1D<{0,0,0,?}, "8">;
+def  VLD1d16  : VLD1D<{0,1,0,?}, "16">;
+def  VLD1d32  : VLD1D<{1,0,0,?}, "32">;
+def  VLD1d64  : VLD1D<{1,1,0,?}, "64">;
+
+def  VLD1q8   : VLD1Q<{0,0,?,?}, "8">;
+def  VLD1q16  : VLD1Q<{0,1,?,?}, "16">;
+def  VLD1q32  : VLD1Q<{1,0,?,?}, "32">;
+def  VLD1q64  : VLD1Q<{1,1,?,?}, "64">;
+
+// ...with address register writeback:
+multiclass VLD1DWB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
+                     (ins addrmode6:$Rn), IIC_VLD1u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbFixed";
+  }
+  def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbRegister";
+  }
+}
+multiclass VLD1QWB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
+                    (ins addrmode6:$Rn), IIC_VLD1x2u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbFixed";
+  }
+  def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbRegister";
+  }
+}
+
+defm VLD1d8wb  : VLD1DWB<{0,0,0,?}, "8">;
+defm VLD1d16wb : VLD1DWB<{0,1,0,?}, "16">;
+defm VLD1d32wb : VLD1DWB<{1,0,0,?}, "32">;
+defm VLD1d64wb : VLD1DWB<{1,1,0,?}, "64">;
+defm VLD1q8wb  : VLD1QWB<{0,0,?,?}, "8">;
+defm VLD1q16wb : VLD1QWB<{0,1,?,?}, "16">;
+defm VLD1q32wb : VLD1QWB<{1,0,?,?}, "32">;
+defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64">;
+
+// ...with 3 registers
+class VLD1D3<bits<4> op7_4, string Dt>
+  : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd),
+          (ins addrmode6:$Rn), IIC_VLD1x3, "vld1", Dt,
+          "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
+}
+multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
+                    (ins addrmode6:$Rn), IIC_VLD1x2u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbFixed";
+  }
+  def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbRegister";
+  }
+}
+
+def VLD1d8T      : VLD1D3<{0,0,0,?}, "8">;
+def VLD1d16T     : VLD1D3<{0,1,0,?}, "16">;
+def VLD1d32T     : VLD1D3<{1,0,0,?}, "32">;
+def VLD1d64T     : VLD1D3<{1,1,0,?}, "64">;
+
+defm VLD1d8Twb  : VLD1D3WB<{0,0,0,?}, "8">;
+defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16">;
+defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32">;
+defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64">;
+
+def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>;
+
+// ...with 4 registers
+class VLD1D4<bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd),
+          (ins addrmode6:$Rn), IIC_VLD1x4, "vld1", Dt,
+          "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
+}
+multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
+                    (ins addrmode6:$Rn), IIC_VLD1x2u,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbFixed";
+  }
+  def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbRegister";
+  }
+}
+
+def VLD1d8Q      : VLD1D4<{0,0,?,?}, "8">;
+def VLD1d16Q     : VLD1D4<{0,1,?,?}, "16">;
+def VLD1d32Q     : VLD1D4<{1,0,?,?}, "32">;
+def VLD1d64Q     : VLD1D4<{1,1,?,?}, "64">;
+
+defm VLD1d8Qwb   : VLD1D4WB<{0,0,?,?}, "8">;
+defm VLD1d16Qwb  : VLD1D4WB<{0,1,?,?}, "16">;
+defm VLD1d32Qwb  : VLD1D4WB<{1,0,?,?}, "32">;
+defm VLD1d64Qwb  : VLD1D4WB<{1,1,?,?}, "64">;
+
+def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>;
+
+//   VLD2     : Vector Load (multiple 2-element structures)
+class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
+           InstrItinClass itin>
+  : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd),
+          (ins addrmode6:$Rn), itin,
+          "vld2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
+}
+
+def  VLD2d8   : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2>;
+def  VLD2d16  : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2>;
+def  VLD2d32  : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2>;
+
+def  VLD2q8   : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2>;
+def  VLD2q16  : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2>;
+def  VLD2q32  : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2>;
+
+def  VLD2q8Pseudo  : VLDQQPseudo<IIC_VLD2x2>;
+def  VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+def  VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+
+// ...with address register writeback:
+multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
+                  RegisterOperand VdTy, InstrItinClass itin> {
+  def _fixed : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
+                     (ins addrmode6:$Rn), itin,
+                     "vld2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbFixed";
+  }
+  def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm), itin,
+                        "vld2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVLDInstruction";
+    let AsmMatchConverter = "cvtVLDwbRegister";
+  }
+}
+
+defm VLD2d8wb  : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u>;
+defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u>;
+defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u>;
+
+defm VLD2q8wb  : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u>;
+defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u>;
+defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u>;
+
+def VLD2q8PseudoWB_fixed     : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q16PseudoWB_fixed    : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q32PseudoWB_fixed    : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
+def VLD2q8PseudoWB_register  : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+
+// ...with double-spaced registers
+def  VLD2b8    : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2>;
+def  VLD2b16   : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2>;
+def  VLD2b32   : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2>;
+defm VLD2b8wb  : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u>;
+defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u>;
+defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u>;
+
+//   VLD3     : Vector Load (multiple 3-element structures)
+class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn), IIC_VLD3,
+          "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
+}
+
+def  VLD3d8   : VLD3D<0b0100, {0,0,0,?}, "8">;
+def  VLD3d16  : VLD3D<0b0100, {0,1,0,?}, "16">;
+def  VLD3d32  : VLD3D<0b0100, {1,0,0,?}, "32">;
+
+def  VLD3d8Pseudo  : VLDQQPseudo<IIC_VLD3>;
+def  VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>;
+def  VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>;
+
+// ...with address register writeback:
+class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u,
+          "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLDInstruction";
+}
+
+def VLD3d8_UPD  : VLD3DWB<0b0100, {0,0,0,?}, "8">;
+def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">;
+def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">;
+
+def VLD3d8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+
+// ...with double-spaced registers:
+def VLD3q8      : VLD3D<0b0101, {0,0,0,?}, "8">;
+def VLD3q16     : VLD3D<0b0101, {0,1,0,?}, "16">;
+def VLD3q32     : VLD3D<0b0101, {1,0,0,?}, "32">;
+def VLD3q8_UPD  : VLD3DWB<0b0101, {0,0,0,?}, "8">;
+def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">;
+def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">;
+
+def VLD3q8Pseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VLD3q8oddPseudo   : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q16oddPseudo  : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q32oddPseudo  : VLDQQQQPseudo<IIC_VLD3>;
+
+def VLD3q8oddPseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+
+//   VLD4     : Vector Load (multiple 4-element structures)
+class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn), IIC_VLD4,
+          "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
+}
+
+def  VLD4d8   : VLD4D<0b0000, {0,0,?,?}, "8">;
+def  VLD4d16  : VLD4D<0b0000, {0,1,?,?}, "16">;
+def  VLD4d32  : VLD4D<0b0000, {1,0,?,?}, "32">;
+
+def  VLD4d8Pseudo  : VLDQQPseudo<IIC_VLD4>;
+def  VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>;
+def  VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>;
+
+// ...with address register writeback:
+class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u,
+          "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVLDInstruction";
+}
+
+def VLD4d8_UPD  : VLD4DWB<0b0000, {0,0,?,?}, "8">;
+def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">;
+def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">;
+
+def VLD4d8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+
+// ...with double-spaced registers:
+def VLD4q8      : VLD4D<0b0001, {0,0,?,?}, "8">;
+def VLD4q16     : VLD4D<0b0001, {0,1,?,?}, "16">;
+def VLD4q32     : VLD4D<0b0001, {1,0,?,?}, "32">;
+def VLD4q8_UPD  : VLD4DWB<0b0001, {0,0,?,?}, "8">;
+def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">;
+def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">;
+
+def VLD4q8Pseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VLD4q8oddPseudo   : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q16oddPseudo  : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q32oddPseudo  : VLDQQQQPseudo<IIC_VLD4>;
+
+def VLD4q8oddPseudo_UPD  : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
+// Classes for VLD*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VLDQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst),
+                (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+class VLDQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst),
+                (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+class VLDQQQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst),
+                (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane),
+                itin, "$src = $dst">;
+class VLDQQQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb, $src = $dst">;
+
+//   VLD1LN   : Vector Load (single element to one lane)
+class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag LoadOp>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+          (ins addrmode6:$Rn, DPR:$src, nohash_imm:$lane),
+          IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+          "$src = $Vd",
+          [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+                                         (i32 (LoadOp addrmode6:$Rn)),
+                                         imm:$lane))]> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD1LN";
+}
+class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag LoadOp>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd),
+          (ins addrmode6oneL32:$Rn, DPR:$src, nohash_imm:$lane),
+          IIC_VLD1ln, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn",
+          "$src = $Vd",
+          [(set DPR:$Vd, (vector_insert (Ty DPR:$src),
+                                         (i32 (LoadOp addrmode6oneL32:$Rn)),
+                                         imm:$lane))]> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD1LN";
+}
+class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
+  let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
+                                               (i32 (LoadOp addrmode6:$addr)),
+                                               imm:$lane))];
+}
+
+def VLD1LNd8  : VLD1LN<0b0000, {?,?,?,0}, "8", v8i8, extloadi8> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD1LNd16 : VLD1LN<0b0100, {?,?,0,?}, "16", v4i16, extloadi16> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{5-4} = Rn{5-4};
+}
+def VLD1LNd32 : VLD1LN32<0b1000, {?,0,?,?}, "32", v2i32, load> {
+  let Inst{7} = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VLD1LNq8Pseudo  : VLD1QLNPseudo<v16i8, extloadi8>;
+def VLD1LNq16Pseudo : VLD1QLNPseudo<v8i16, extloadi16>;
+def VLD1LNq32Pseudo : VLD1QLNPseudo<v4i32, load>;
+
+def : Pat<(vector_insert (v2f32 DPR:$src),
+                         (f32 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v4f32 QPR:$src),
+                         (f32 (load addrmode6:$addr)), imm:$lane),
+          (VLD1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+
+// ...with address register writeback:
+class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt,
+          "\\{$Vd[$lane]\\}, $Rn$Rm",
+          "$src = $Vd, $Rn.addr = $wb", []> {
+  let DecoderMethod = "DecodeVLD1LN";
+}
+
+def VLD1LNd8_UPD  : VLD1LNWB<0b0000, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD1LNd16_UPD : VLD1LNWB<0b0100, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{4};
+  let Inst{4} = Rn{4};
+}
+
+def VLD1LNq8Pseudo_UPD  : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+
+//   VLD2LN   : Vector Load (single 2-element structure to one lane)
+class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+          IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2", []> {
+  let Rm = 0b1111;
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD2LN";
+}
+
+def VLD2LNd8  : VLD2LN<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16 : VLD2LN<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNd8Pseudo  : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+
+// ...with double-spaced registers:
+def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+
+// ...with address register writeback:
+class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2lnu, "vld2", Dt,
+          "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm",
+          "$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> {
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD2LN";
+}
+
+def VLD2LNd8_UPD  : VLD2LNWB<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD2LNd16_UPD : VLD2LNWB<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNd8Pseudo_UPD  : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+
+def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+
+//   VLD3LN   : Vector Load (single 3-element structure to one lane)
+class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3,
+          nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVLD3LN";
+}
+
+def VLD3LNd8  : VLD3LN<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16 : VLD3LN<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VLD3LNd8Pseudo  : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+
+// ...with double-spaced registers:
+def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+
+// ...with address register writeback:
+class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane),
+          IIC_VLD3lnu, "vld3", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb",
+          []> {
+  let DecoderMethod = "DecodeVLD3LN";
+}
+
+def VLD3LNd8_UPD  : VLD3LNWB<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD3LNd16_UPD : VLD3LNWB<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD3LNd8Pseudo_UPD  : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+
+def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7} = lane{0};
+}
+
+def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+
+//   VLD4LN   : Vector Load (single 4-element structure to one lane)
+class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+          nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt,
+          "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn",
+          "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4LN";
+}
+
+def VLD4LNd8  : VLD4LN<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16 : VLD4LN<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNd8Pseudo  : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+
+// ...with double-spaced registers:
+def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+
+// ...with address register writeback:
+class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b10, op11_8, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
+          IIC_VLD4lnu, "vld4", Dt,
+"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn$Rm",
+"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb",
+          []> {
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVLD4LN"  ;
+}
+
+def VLD4LNd8_UPD  : VLD4LNWB<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VLD4LNd16_UPD : VLD4LNWB<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNd8Pseudo_UPD  : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+
+def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7} = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
+//   VLD1DUP  : Vector Load (single element to all lanes)
+class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd),
+          (ins addrmode6dup:$Rn),
+          IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
+          [(set VecListOneDAllLanes:$Vd,
+                (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
+}
+def VLD1DUPd8  : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8>;
+def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16>;
+def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load>;
+
+def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+          (VLD1DUPd32 addrmode6:$addr)>;
+
+class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp>
+  : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd),
+          (ins addrmode6dup:$Rn), IIC_VLD1dup,
+          "vld1", Dt, "$Vd, $Rn", "",
+          [(set VecListDPairAllLanes:$Vd,
+                (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD1DupInstruction";
+}
+
+def VLD1DUPq8  : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8>;
+def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16>;
+def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load>;
+
+def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+          (VLD1DUPq32 addrmode6:$addr)>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+// ...with address register writeback:
+multiclass VLD1DUPWB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
+                     (outs VecListOneDAllLanes:$Vd, GPR:$wb),
+                     (ins addrmode6dup:$Rn), IIC_VLD1dupu,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+    let AsmMatchConverter = "cvtVLDwbFixed";
+  }
+  def _register : NLdSt<1, 0b10, 0b1100, op7_4,
+                        (outs VecListOneDAllLanes:$Vd, GPR:$wb),
+                        (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD1dupu,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+    let AsmMatchConverter = "cvtVLDwbRegister";
+  }
+}
+multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
+                     (outs VecListDPairAllLanes:$Vd, GPR:$wb),
+                     (ins addrmode6dup:$Rn), IIC_VLD1dupu,
+                     "vld1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+    let AsmMatchConverter = "cvtVLDwbFixed";
+  }
+  def _register : NLdSt<1, 0b10, 0b1100, op7_4,
+                        (outs VecListDPairAllLanes:$Vd, GPR:$wb),
+                        (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD1dupu,
+                        "vld1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD1DupInstruction";
+    let AsmMatchConverter = "cvtVLDwbRegister";
+  }
+}
+
+defm VLD1DUPd8wb  : VLD1DUPWB<{0,0,0,0}, "8">;
+defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16">;
+defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32">;
+
+defm VLD1DUPq8wb  : VLD1QDUPWB<{0,0,1,0}, "8">;
+defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16">;
+defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32">;
+
+//   VLD2DUP  : Vector Load (single 2-element structure to all lanes)
+class VLD2DUP<bits<4> op7_4, string Dt, RegisterOperand VdTy>
+  : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd),
+          (ins addrmode6dup:$Rn), IIC_VLD2dup,
+          "vld2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD2DupInstruction";
+}
+
+def VLD2DUPd8  : VLD2DUP<{0,0,0,?}, "8",  VecListDPairAllLanes>;
+def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes>;
+def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes>;
+
+// ...with double-spaced registers
+def VLD2DUPd8x2  : VLD2DUP<{0,0,1,?}, "8",  VecListDPairSpacedAllLanes>;
+def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes>;
+def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes>;
+
+// ...with address register writeback:
+multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
+  def _fixed : NLdSt<1, 0b10, 0b1101, op7_4,
+                     (outs VdTy:$Vd, GPR:$wb),
+                     (ins addrmode6dup:$Rn), IIC_VLD2dupu,
+                     "vld2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD2DupInstruction";
+    let AsmMatchConverter = "cvtVLDwbFixed";
+  }
+  def _register : NLdSt<1, 0b10, 0b1101, op7_4,
+                        (outs VdTy:$Vd, GPR:$wb),
+                        (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD2dupu,
+                        "vld2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVLD2DupInstruction";
+    let AsmMatchConverter = "cvtVLDwbRegister";
+  }
+}
+
+defm VLD2DUPd8wb    : VLD2DUPWB<{0,0,0,0}, "8",  VecListDPairAllLanes>;
+defm VLD2DUPd16wb   : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes>;
+defm VLD2DUPd32wb   : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes>;
+
+defm VLD2DUPd8x2wb  : VLD2DUPWB<{0,0,1,0}, "8",  VecListDPairSpacedAllLanes>;
+defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes>;
+defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes>;
+
+//   VLD3DUP  : Vector Load (single 3-element structure to all lanes)
+class VLD3DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6dup:$Rn), IIC_VLD3dup,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = 0;
+  let DecoderMethod = "DecodeVLD3DupInstruction";
+}
+
+def VLD3DUPd8  : VLD3DUP<{0,0,0,?}, "8">;
+def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">;
+def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">;
+
+def VLD3DUPd8Pseudo  : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD3DUPq8  : VLD3DUP<{0,0,1,?}, "8">;
+def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
+def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;
+
+// ...with address register writeback:
+class VLD3DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD3dupu,
+          "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = 0;
+  let DecoderMethod = "DecodeVLD3DupInstruction";
+}
+
+def VLD3DUPd8_UPD  : VLD3DUPWB<{0,0,0,0}, "8">;
+def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16">;
+def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32">;
+
+def VLD3DUPq8_UPD  : VLD3DUPWB<{0,0,1,0}, "8">;
+def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16">;
+def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32">;
+
+def VLD3DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+
+//   VLD4DUP  : Vector Load (single 4-element structure to all lanes)
+class VLD4DUP<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6dup:$Rn), IIC_VLD4dup,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4DupInstruction";
+}
+
+def VLD4DUPd8  : VLD4DUP<{0,0,0,?}, "8">;
+def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">;
+def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPd8Pseudo  : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+
+// ...with double-spaced registers (not used for codegen):
+def VLD4DUPq8  : VLD4DUP<{0,0,1,?}, "8">;
+def VLD4DUPq16 : VLD4DUP<{0,1,1,?}, "16">;
+def VLD4DUPq32 : VLD4DUP<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+// ...with address register writeback:
+class VLD4DUPWB<bits<4> op7_4, string Dt>
+  : NLdSt<1, 0b10, 0b1111, op7_4,
+          (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu,
+          "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVLD4DupInstruction";
+}
+
+def VLD4DUPd8_UPD  : VLD4DUPWB<{0,0,0,0}, "8">;
+def VLD4DUPd16_UPD : VLD4DUPWB<{0,1,0,?}, "16">;
+def VLD4DUPd32_UPD : VLD4DUPWB<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPq8_UPD  : VLD4DUPWB<{0,0,1,0}, "8">;
+def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">;
+def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
+
+def VLD4DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+
+// Classes for VST* pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VSTQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src), itin, "">;
+class VSTQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, QPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset, QPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), itin, "">;
+class VSTQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQQWBfixedPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, QQPR:$src), itin,
+                "$addr.addr = $wb">;
+class VSTQQWBregisterPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, rGPR:$offset, QQPR:$src), itin,
+                "$addr.addr = $wb">;
+
+class VSTQQQQPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src), itin, "">;
+class VSTQQQQWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
+                "$addr.addr = $wb">;
+
+//   VST1     : Vector Store (multiple single elements)
+class VST1D<bits<4> op7_4, string Dt>
+  : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$Rn, VecListOneD:$Vd),
+          IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
+}
+class VST1Q<bits<4> op7_4, string Dt>
+  : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$Rn, VecListDPair:$Vd),
+          IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
+}
+
+def  VST1d8   : VST1D<{0,0,0,?}, "8">;
+def  VST1d16  : VST1D<{0,1,0,?}, "16">;
+def  VST1d32  : VST1D<{1,0,0,?}, "32">;
+def  VST1d64  : VST1D<{1,1,0,?}, "64">;
+
+def  VST1q8   : VST1Q<{0,0,?,?}, "8">;
+def  VST1q16  : VST1Q<{0,1,?,?}, "16">;
+def  VST1q32  : VST1Q<{1,0,?,?}, "32">;
+def  VST1q64  : VST1Q<{1,1,?,?}, "64">;
+
+// ...with address register writeback:
+multiclass VST1DWB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb),
+                     (ins addrmode6:$Rn, VecListOneD:$Vd), IIC_VLD1u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbFixed";
+  }
+  def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm, VecListOneD:$Vd),
+                        IIC_VLD1u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{4} = Rn{4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbRegister";
+  }
+}
+multiclass VST1QWB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
+                    (ins addrmode6:$Rn, VecListDPair:$Vd), IIC_VLD1x2u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbFixed";
+  }
+  def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm, VecListDPair:$Vd),
+                        IIC_VLD1x2u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbRegister";
+  }
+}
+
+defm VST1d8wb  : VST1DWB<{0,0,0,?}, "8">;
+defm VST1d16wb : VST1DWB<{0,1,0,?}, "16">;
+defm VST1d32wb : VST1DWB<{1,0,0,?}, "32">;
+defm VST1d64wb : VST1DWB<{1,1,0,?}, "64">;
+
+defm VST1q8wb  : VST1QWB<{0,0,?,?}, "8">;
+defm VST1q16wb : VST1QWB<{0,1,?,?}, "16">;
+defm VST1q32wb : VST1QWB<{1,0,?,?}, "32">;
+defm VST1q64wb : VST1QWB<{1,1,?,?}, "64">;
+
+// ...with 3 registers
+class VST1D3<bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, 0b0110, op7_4, (outs),
+          (ins addrmode6:$Rn, VecListThreeD:$Vd),
+          IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
+}
+multiclass VST1D3WB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
+                    (ins addrmode6:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbFixed";
+  }
+  def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
+                        IIC_VLD1x3u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbRegister";
+  }
+}
+
+def VST1d8T     : VST1D3<{0,0,0,?}, "8">;
+def VST1d16T    : VST1D3<{0,1,0,?}, "16">;
+def VST1d32T    : VST1D3<{1,0,0,?}, "32">;
+def VST1d64T    : VST1D3<{1,1,0,?}, "64">;
+
+defm VST1d8Twb  : VST1D3WB<{0,0,0,?}, "8">;
+defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16">;
+defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32">;
+defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64">;
+
+def VST1d64TPseudo            : VSTQQPseudo<IIC_VST1x3>;
+def VST1d64TPseudoWB_fixed    : VSTQQWBPseudo<IIC_VST1x3u>;
+def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>;
+
+// ...with 4 registers
+class VST1D4<bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, 0b0010, op7_4, (outs),
+          (ins addrmode6:$Rn, VecListFourD:$Vd),
+          IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "",
+          []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
+}
+multiclass VST1D4WB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
+                    (ins addrmode6:$Rn, VecListFourD:$Vd), IIC_VLD1x4u,
+                     "vst1", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbFixed";
+  }
+  def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+                        IIC_VLD1x4u,
+                        "vst1", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbRegister";
+  }
+}
+
+def VST1d8Q     : VST1D4<{0,0,?,?}, "8">;
+def VST1d16Q    : VST1D4<{0,1,?,?}, "16">;
+def VST1d32Q    : VST1D4<{1,0,?,?}, "32">;
+def VST1d64Q    : VST1D4<{1,1,?,?}, "64">;
+
+defm VST1d8Qwb  : VST1D4WB<{0,0,?,?}, "8">;
+defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16">;
+defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32">;
+defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64">;
+
+def VST1d64QPseudo            : VSTQQPseudo<IIC_VST1x4>;
+def VST1d64QPseudoWB_fixed    : VSTQQWBPseudo<IIC_VST1x4u>;
+def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>;
+
+//   VST2     : Vector Store (multiple 2-element structures)
+class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
+            InstrItinClass itin>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, VdTy:$Vd),
+          itin, "vst2", Dt, "$Vd, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
+}
+
+def  VST2d8   : VST2<0b1000, {0,0,?,?}, "8",  VecListDPair, IIC_VST2>;
+def  VST2d16  : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2>;
+def  VST2d32  : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2>;
+
+def  VST2q8   : VST2<0b0011, {0,0,?,?}, "8",  VecListFourD, IIC_VST2x2>;
+def  VST2q16  : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2>;
+def  VST2q32  : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2>;
+
+def  VST2q8Pseudo  : VSTQQPseudo<IIC_VST2x2>;
+def  VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>;
+def  VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>;
+
+// ...with address register writeback:
+multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
+                   RegisterOperand VdTy> {
+  def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+                     (ins addrmode6:$Rn, VdTy:$Vd), IIC_VLD1u,
+                     "vst2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbFixed";
+  }
+  def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
+                        "vst2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbRegister";
+  }
+}
+multiclass VST2QWB<bits<4> op7_4, string Dt> {
+  def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
+                     (ins addrmode6:$Rn, VecListFourD:$Vd), IIC_VLD1u,
+                     "vst2", Dt, "$Vd, $Rn!",
+                     "$Rn.addr = $wb", []> {
+    let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbFixed";
+  }
+  def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
+                        (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+                        IIC_VLD1u,
+                        "vst2", Dt, "$Vd, $Rn, $Rm",
+                        "$Rn.addr = $wb", []> {
+    let Inst{5-4} = Rn{5-4};
+    let DecoderMethod = "DecodeVSTInstruction";
+    let AsmMatchConverter = "cvtVSTwbRegister";
+  }
+}
+
+defm VST2d8wb    : VST2DWB<0b1000, {0,0,?,?}, "8",  VecListDPair>;
+defm VST2d16wb   : VST2DWB<0b1000, {0,1,?,?}, "16", VecListDPair>;
+defm VST2d32wb   : VST2DWB<0b1000, {1,0,?,?}, "32", VecListDPair>;
+
+defm VST2q8wb    : VST2QWB<{0,0,?,?}, "8">;
+defm VST2q16wb   : VST2QWB<{0,1,?,?}, "16">;
+defm VST2q32wb   : VST2QWB<{1,0,?,?}, "32">;
+
+def VST2q8PseudoWB_fixed     : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q16PseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q32PseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST2x2u>;
+def VST2q8PseudoWB_register  : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+
+// ...with double-spaced registers
+def VST2b8      : VST2<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced, IIC_VST2>;
+def VST2b16     : VST2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VST2>;
+def VST2b32     : VST2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VST2>;
+defm VST2b8wb   : VST2DWB<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced>;
+defm VST2b16wb  : VST2DWB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced>;
+defm VST2b32wb  : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced>;
+
+//   VST3     : Vector Store (multiple 3-element structures)
+class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3,
+          "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
+}
+
+def  VST3d8   : VST3D<0b0100, {0,0,0,?}, "8">;
+def  VST3d16  : VST3D<0b0100, {0,1,0,?}, "16">;
+def  VST3d32  : VST3D<0b0100, {1,0,0,?}, "32">;
+
+def  VST3d8Pseudo  : VSTQQPseudo<IIC_VST3>;
+def  VST3d16Pseudo : VSTQQPseudo<IIC_VST3>;
+def  VST3d32Pseudo : VSTQQPseudo<IIC_VST3>;
+
+// ...with address register writeback:
+class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u,
+          "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVSTInstruction";
+}
+
+def VST3d8_UPD  : VST3DWB<0b0100, {0,0,0,?}, "8">;
+def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">;
+def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">;
+
+def VST3d8Pseudo_UPD  : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+
+// ...with double-spaced registers:
+def VST3q8      : VST3D<0b0101, {0,0,0,?}, "8">;
+def VST3q16     : VST3D<0b0101, {0,1,0,?}, "16">;
+def VST3q32     : VST3D<0b0101, {1,0,0,?}, "32">;
+def VST3q8_UPD  : VST3DWB<0b0101, {0,0,0,?}, "8">;
+def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">;
+def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">;
+
+def VST3q8Pseudo_UPD  : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VST3q8oddPseudo   : VSTQQQQPseudo<IIC_VST3>;
+def VST3q16oddPseudo  : VSTQQQQPseudo<IIC_VST3>;
+def VST3q32oddPseudo  : VSTQQQQPseudo<IIC_VST3>;
+
+def VST3q8oddPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+
+//   VST4     : Vector Store (multiple 4-element structures)
+class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
+          IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
+}
+
+def  VST4d8   : VST4D<0b0000, {0,0,?,?}, "8">;
+def  VST4d16  : VST4D<0b0000, {0,1,?,?}, "16">;
+def  VST4d32  : VST4D<0b0000, {1,0,?,?}, "32">;
+
+def  VST4d8Pseudo  : VSTQQPseudo<IIC_VST4>;
+def  VST4d16Pseudo : VSTQQPseudo<IIC_VST4>;
+def  VST4d32Pseudo : VSTQQPseudo<IIC_VST4>;
+
+// ...with address register writeback:
+class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u,
+           "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{5-4} = Rn{5-4};
+  let DecoderMethod = "DecodeVSTInstruction";
+}
+
+def VST4d8_UPD  : VST4DWB<0b0000, {0,0,?,?}, "8">;
+def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">;
+def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">;
+
+def VST4d8Pseudo_UPD  : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+
+// ...with double-spaced registers:
+def VST4q8      : VST4D<0b0001, {0,0,?,?}, "8">;
+def VST4q16     : VST4D<0b0001, {0,1,?,?}, "16">;
+def VST4q32     : VST4D<0b0001, {1,0,?,?}, "32">;
+def VST4q8_UPD  : VST4DWB<0b0001, {0,0,?,?}, "8">;
+def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">;
+def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">;
+
+def VST4q8Pseudo_UPD  : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+
+// ...alternate versions to be allocated odd register numbers:
+def VST4q8oddPseudo   : VSTQQQQPseudo<IIC_VST4>;
+def VST4q16oddPseudo  : VSTQQQQPseudo<IIC_VST4>;
+def VST4q32oddPseudo  : VSTQQQQPseudo<IIC_VST4>;
+
+def VST4q8oddPseudo_UPD  : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+
+} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+
+// Classes for VST*LN pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+class VSTQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+class VSTQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+class VSTQQQQLNPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQQQPR:$src, nohash_imm:$lane),
+                itin, "">;
+class VSTQQQQLNWBPseudo<InstrItinClass itin>
+  : PseudoNLdSt<(outs GPR:$wb),
+                (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src,
+                 nohash_imm:$lane), itin, "$addr.addr = $wb">;
+
+//   VST1LN   : Vector Store (single element from one lane)
+class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+             PatFrag StoreOp, SDNode ExtractOp, Operand AddrMode>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane),
+          IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
+          [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVST1LN";
+}
+class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+  : VSTQLNPseudo<IIC_VST1ln> {
+  let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+                          addrmode6:$addr)];
+}
+
+def VST1LNd8  : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
+                       NEONvgetlaneu, addrmode6> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
+                       NEONvgetlaneu, addrmode6> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+
+def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
+                       addrmode6oneL32> {
+  let Inst{7}   = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VST1LNq8Pseudo  : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>;
+def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>;
+def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>;
+
+def : Pat<(store (extractelt (v2f32 DPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(store (extractelt (v4f32 QPR:$src), imm:$lane), addrmode6:$addr),
+          (VST1LNq32Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+
+// ...with address register writeback:
+class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
+               PatFrag StoreOp, SDNode ExtractOp, Operand AdrMode>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins AdrMode:$Rn, am6offset:$Rm,
+           DPR:$Vd, nohash_imm:$lane), IIC_VST1lnu, "vst1", Dt,
+          "\\{$Vd[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb",
+          [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
+                                  AdrMode:$Rn, am6offset:$Rm))]> {
+  let DecoderMethod = "DecodeVST1LN";
+}
+class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
+  : VSTQLNWBPseudo<IIC_VST1lnu> {
+  let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
+                                        addrmode6:$addr, am6offset:$offset))];
+}
+
+def VST1LNd8_UPD  : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
+                             NEONvgetlaneu, addrmode6> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
+                             NEONvgetlaneu, addrmode6> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
+                             extractelt, addrmode6oneL32> {
+  let Inst{7}   = lane{0};
+  let Inst{5-4} = Rn{5-4};
+}
+
+def VST1LNq8Pseudo_UPD  : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>;
+def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>;
+def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>;
+
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+
+//   VST2LN   : Vector Store (single 2-element structure from one lane)
+class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane),
+          IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVST2LN";
+}
+
+def VST2LNd8  : VST2LN<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16 : VST2LN<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNd8Pseudo  : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+
+// ...with double-spaced registers:
+def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+  let Inst{4}   = Rn{4};
+}
+def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{4}   = Rn{4};
+}
+
+def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+
+// ...with address register writeback:
+class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, nohash_imm:$lane), IIC_VST2lnu, "vst2", Dt,
+          "\\{$Vd[$lane], $src2[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4}   = Rn{4};
+  let DecoderMethod = "DecodeVST2LN";
+}
+
+def VST2LNd8_UPD  : VST2LNWB<0b0001, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST2LNd16_UPD : VST2LNWB<0b0101, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNd8Pseudo_UPD  : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+
+def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+
+//   VST3LN   : Vector Store (single 3-element structure from one lane)
+class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3,
+           nohash_imm:$lane), IIC_VST3ln, "vst3", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> {
+  let Rm = 0b1111;
+  let DecoderMethod = "DecodeVST3LN";
+}
+
+def VST3LNd8  : VST3LN<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16 : VST3LN<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNd8Pseudo  : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+
+// ...with double-spaced registers:
+def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32 : VST3LN<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>;
+def VST3LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST3ln>;
+
+// ...with address register writeback:
+class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane),
+          IIC_VST3lnu, "vst3", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let DecoderMethod = "DecodeVST3LN";
+}
+
+def VST3LNd8_UPD  : VST3LNWB<0b0010, {?,?,?,0}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST3LNd16_UPD : VST3LNWB<0b0110, {?,?,0,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNd8Pseudo_UPD  : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+
+def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> {
+  let Inst{7}   = lane{0};
+}
+
+def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+
+//   VST4LN   : Vector Store (single 4-element structure from one lane)
+class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs),
+          (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4,
+           nohash_imm:$lane), IIC_VST4ln, "vst4", Dt,
+          "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn",
+          "", []> {
+  let Rm = 0b1111;
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVST4LN";
+}
+
+def VST4LNd8  : VST4LN<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16 : VST4LN<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNd8Pseudo  : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+
+// ...with double-spaced registers:
+def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+
+// ...with address register writeback:
+class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
+  : NLdStLn<1, 0b00, op11_8, op7_4, (outs GPR:$wb),
+          (ins addrmode6:$Rn, am6offset:$Rm,
+           DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane),
+          IIC_VST4lnu, "vst4", Dt,
+  "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm",
+          "$Rn.addr = $wb", []> {
+  let Inst{4} = Rn{4};
+  let DecoderMethod = "DecodeVST4LN";
+}
+
+def VST4LNd8_UPD  : VST4LNWB<0b0011, {?,?,?,?}, "8"> {
+  let Inst{7-5} = lane{2-0};
+}
+def VST4LNd16_UPD : VST4LNWB<0b0111, {?,?,0,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNd8Pseudo_UPD  : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+
+def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> {
+  let Inst{7-6} = lane{1-0};
+}
+def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> {
+  let Inst{7}   = lane{0};
+  let Inst{5} = Rn{5};
+}
+
+def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+
+} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
+
+// Use vld1/vst1 for unaligned f64 load / store
+def : Pat<(f64 (hword_alignedload addrmode6:$addr)),
+          (VLD1d16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d16 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (byte_alignedload addrmode6:$addr)),
+          (VLD1d8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d8 addrmode6:$addr, DPR:$value)>, Requires<[IsLE]>;
+def : Pat<(f64 (non_word_alignedload addrmode6:$addr)),
+          (VLD1d64 addrmode6:$addr)>, Requires<[IsBE]>;
+def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
+          (VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+
+// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64
+// load / store if it's legal.
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
+          (VLD1q64 addrmode6:$addr)>;
+def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q64 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
+          (VLD1q32 addrmode6:$addr)>;
+def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q32 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+          (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+          (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+          (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+
+//===----------------------------------------------------------------------===//
+// NEON pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Extract D sub-registers of Q registers.
+def DSubReg_i8_reg  : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, MVT::i32);
+}]>;
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, MVT::i32);
+}]>;
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, MVT::i32);
+}]>;
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
+  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+  assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+  return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), MVT::i32);
+}]>;
+
+// Translate lane numbers from Q registers to D subregs.
+def SubReg_i8_lane  : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 7, MVT::i32);
+}]>;
+def SubReg_i16_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 3, MVT::i32);
+}]>;
+def SubReg_i32_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 1, MVT::i32);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Classes
+//===----------------------------------------------------------------------===//
+
+// Basic 2-register operations: double- and quad-register.
+class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+           string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VUNAD, OpcodeStr, Dt,"$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm))))]>;
+class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,
+           string Dt, ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VUNAQ, OpcodeStr, Dt,"$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm))))]>;
+
+// Basic 2-register intrinsics, both double- and quad-register.
+class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Narrow 2-register operations.
+class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyD, ValueType TyQ, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (OpNode (TyQ QPR:$Vm))))]>;
+
+// Narrow 2-register intrinsics.
+class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyD, ValueType TyQ, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$Vd),
+        (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vm))))]>;
+
+// Long 2-register operations (currently only used for VMOVL).
+class N2VL<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vm))))]>;
+
+// Long 2-register intrinsics.
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$Vd),
+        (ins DPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vm))))]>;
+
+// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
+class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt>
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$Vd, DPR:$Vm),
+        (ins DPR:$src1, DPR:$src2), IIC_VPERMD,
+        OpcodeStr, Dt, "$Vd, $Vm",
+        "$src1 = $Vd, $src2 = $Vm", []>;
+class N2VQShuffle<bits<2> op19_18, bits<5> op11_7,
+                  InstrItinClass itin, string OpcodeStr, string Dt>
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$Vd, QPR:$Vm),
+        (ins QPR:$src1, QPR:$src2), itin, OpcodeStr, Dt, "$Vd, $Vm",
+        "$src1 = $Vd, $src2 = $Vm", []>;
+
+// Basic 3-register operations: double- and quad-register.
+class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+// Same as N3VD but no data type.
+class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr,
+           ValueType ResTy, ValueType OpTy,
+           SDNode OpNode, bit Commutable>
+  : N3VX<op24, op23, op21_20, op11_8, 0, op4,
+         (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+         OpcodeStr, "$Vd, $Vn, $Vm", "",
+         [(set DPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>{
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+class N3VDSL<bits<2> op21_20, bits<4> op11_8,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType Ty, SDNode ShOp>
+  : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$Vn),
+                        (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+class N3VDSL16<bits<2> op21_20, bits<4> op11_8,
+               string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
+  : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane","",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$Vn),
+                        (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+
+class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode, bit Commutable>
+  : N3VX<op24, op23, op21_20, op11_8, 1, op4,
+         (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+         OpcodeStr, "$Vd, $Vn, $Vm", "",
+         [(set QPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>{
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+class N3VQSL<bits<2> op21_20, bits<4> op11_8,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, SDNode ShOp>
+  : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$Vn),
+                           (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+class N3VQSL16<bits<2> op21_20, bits<4> op11_8, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDNode ShOp>
+  : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$Vn),
+                           (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                imm:$lane)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = 0;
+}
+
+// Basic 3-register intrinsics, both double- and quad-register.
+class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
+  : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (IntOp (Ty DPR:$Vn),
+                         (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+                                           imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
+  : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (Ty DPR:$Vd),
+              (Ty (IntOp (Ty DPR:$Vn),
+                         (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$Vm, DPR:$Vn), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (OpTy DPR:$Vn))))]> {
+  let TwoOperandAliasConstraint = "$Vm = $Vd";
+  let isCommutable = 0;
+}
+
+class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$Vn),
+                            (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                 imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt,
+                  ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$Vn),
+                            (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                 imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VQIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              Format f, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$Vm, QPR:$Vn), f, itin,
+        OpcodeStr, Dt, "$Vd, $Vm, $Vn", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (OpTy QPR:$Vn))))]> {
+  let TwoOperandAliasConstraint = "$Vm = $Vd";
+  let isCommutable = 0;
+}
+
+// Multiply-Add/Sub operations: double- and quad-register.
+class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+                             (Ty (MulOp DPR:$Vn, DPR:$Vm)))))]>;
+
+class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt,
+                  ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp>
+  : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd),
+        (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (MulOp DPR:$Vn,
+                                   (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+                                                     imm:$lane)))))))]>;
+class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                    string OpcodeStr, string Dt,
+                    ValueType Ty, SDNode MulOp, SDNode ShOp>
+  : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$Vd),
+        (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (Ty DPR:$Vd),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (MulOp DPR:$Vn,
+                                   (Ty (NEONvduplane (Ty DPR_8:$Vm),
+                                                     imm:$lane)))))))]>;
+
+class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty,
+                SDPatternOperator MulOp, SDPatternOperator OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+                             (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>;
+class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+                  SDPatternOperator MulOp, SDPatternOperator ShOp>
+  : N3VLane32<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (MulOp QPR:$Vn,
+                                   (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                        imm:$lane)))))))]>;
+class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                    string OpcodeStr, string Dt,
+                    ValueType ResTy, ValueType OpTy,
+                    SDNode MulOp, SDNode ShOp>
+  : N3VLane16<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (MulOp QPR:$Vn,
+                                   (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                        imm:$lane)))))))]>;
+
+// Neon Intrinsic-Op instructions (VABA): double- and quad-register.
+class N3VDIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (Ty (OpNode DPR:$src1,
+                             (Ty (IntOp (Ty DPR:$Vn), (Ty DPR:$Vm))))))]>;
+class N3VQIntOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDPatternOperator IntOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (Ty (OpNode QPR:$src1,
+                             (Ty (IntOp (Ty QPR:$Vn), (Ty QPR:$Vm))))))]>;
+
+// Neon 3-argument intrinsics, both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$src1),
+                                      (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src1),
+                                      (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
+// Long Multiply-Add/Sub operations.
+class N3VLMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+                                (TyQ (MulOp (TyD DPR:$Vn),
+                                            (TyD DPR:$Vm)))))]>;
+class N3VLMulOpSL<bit op24, bits<2> op21_20, bits<4> op11_8,
+                  InstrItinClass itin, string OpcodeStr, string Dt,
+                  ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (OpNode (TyQ QPR:$src1),
+                  (TyQ (MulOp (TyD DPR:$Vn),
+                              (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),
+                                                 imm:$lane))))))]>;
+class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+                    InstrItinClass itin, string OpcodeStr, string Dt,
+                    ValueType TyQ, ValueType TyD, SDNode MulOp, SDNode OpNode>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (OpNode (TyQ QPR:$src1),
+                  (TyQ (MulOp (TyD DPR:$Vn),
+                              (TyD (NEONvduplane (TyD DPR_8:$Vm),
+                                                 imm:$lane))))))]>;
+
+// Long Intrinsic-Op vector operations with explicit extend (VABAL).
+class N3VLIntExtOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                   InstrItinClass itin, string OpcodeStr, string Dt,
+                   ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
+                   SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$src1),
+                                (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn),
+                                                        (TyD DPR:$Vm)))))))]>;
+
+// Neon Long 3-argument intrinsic.  The destination register is
+// a quad-register and is also used as the first source operand register.
+class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType TyQ, ValueType TyD, SDPatternOperator IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd,
+          (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$Vn), (TyD DPR:$Vm))))]>;
+class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                imm:$lane)))))]>;
+class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+                   InstrItinClass itin, string OpcodeStr, string Dt,
+                   ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd),
+        (ins QPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "$src1 = $Vd",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                imm:$lane)))))]>;
+
+// Narrowing 3-register intrinsics.
+class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ,
+              SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$Vd), (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINi4D,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (TyD (IntOp (TyQ QPR:$Vn), (TyQ QPR:$Vm))))]> {
+  let isCommutable = Commutable;
+}
+
+// Long 3-register operations.
+class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType TyQ, ValueType TyD, SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
+  let isCommutable = Commutable;
+}
+class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set QPR:$Vd,
+          (TyQ (OpNode (TyD DPR:$Vn),
+                       (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>;
+class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType TyQ, ValueType TyD, SDNode OpNode>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set QPR:$Vd,
+          (TyQ (OpNode (TyD DPR:$Vn),
+                       (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>;
+
+// Long 3-register operations with explicitly extended operands.
+class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDNode OpNode, SDNode ExtOp,
+              bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (OpNode (TyQ (ExtOp (TyD DPR:$Vn))),
+                                (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
+  let isCommutable = Commutable;
+}
+
+// Long 3-register intrinsics with explicit extend (VABDL).
+class N3VLIntExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                 InstrItinClass itin, string OpcodeStr, string Dt,
+                 ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, SDNode ExtOp,
+                 bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (ExtOp (TyD (IntOp (TyD DPR:$Vn),
+                                                (TyD DPR:$Vm))))))]> {
+  let isCommutable = Commutable;
+}
+
+// Long 3-register intrinsics.
+class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, SDPatternOperator IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
+  let isCommutable = Commutable;
+}
+class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane32<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+                                                imm:$lane)))))]>;
+class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
+                  InstrItinClass itin, string OpcodeStr, string Dt,
+                  ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N3VLane16<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$Vd), (ins DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane),
+        NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
+        [(set (ResTy QPR:$Vd),
+              (ResTy (IntOp (OpTy DPR:$Vn),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+                                                imm:$lane)))))]>;
+
+// Wide 3-register operations.
+class N3VW<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
+           SDNode OpNode, SDNode ExtOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$Vd), (ins QPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VSUBiD,
+        OpcodeStr, Dt, "$Vd, $Vn, $Vm", "",
+        [(set QPR:$Vd, (OpNode (TyQ QPR:$Vn),
+                                (TyQ (ExtOp (TyD DPR:$Vm)))))]> {
+  // All of these have a two-operand InstAlias.
+  let TwoOperandAliasConstraint = "$Vn = $Vd";
+  let isCommutable = Commutable;
+}
+
+// Pairwise long 2-register intrinsics, both double- and quad-register.
+class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                bits<2> op17_16, bits<5> op11_7, bit op4,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                bits<2> op17_16, bits<5> op11_7, bit op4,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Pairwise long 2-register accumulate intrinsics,
+// both double- and quad-register.
+// The destination register is also used as the first source operand register.
+class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                 bits<2> op17_16, bits<5> op11_7, bit op4,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+        (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vm), IIC_VPALiD,
+        OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
+        [(set DPR:$Vd, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$Vm))))]>;
+class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                 bits<2> op17_16, bits<5> op11_7, bit op4,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
+        (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vm), IIC_VPALiQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "$src1 = $Vd",
+        [(set QPR:$Vd, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$Vm))))]>;
+
+// Shift by immediate,
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             Format f, InstrItinClass itin, Operand ImmTy,
+             string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
+           (outs DPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), f, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (Ty (OpNode (Ty DPR:$Vm), (i32 imm:$SIMM))))]>;
+class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             Format f, InstrItinClass itin, Operand ImmTy,
+             string OpcodeStr, string Dt, ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
+           (outs QPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), f, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (Ty (OpNode (Ty QPR:$Vm), (i32 imm:$SIMM))))]>;
+}
+
+// Long shift by immediate.
+class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, Operand ImmTy, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
+           (outs QPR:$Vd), (ins DPR:$Vm, ImmTy:$SIMM), N2RegVShLFrm,
+           IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (ResTy (OpNode (OpTy DPR:$Vm),
+                                          (i32 imm:$SIMM))))]>;
+
+// Narrow shift by immediate.
+class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, Operand ImmTy, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
+           (outs DPR:$Vd), (ins QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, itin,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (ResTy (OpNode (OpTy QPR:$Vm),
+                                          (i32 imm:$SIMM))))]>;
+
+// Shift right by immediate and accumulate,
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, string OpcodeStr, string Dt,
+                ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+           (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set DPR:$Vd, (Ty (add DPR:$src1,
+                                (Ty (ShOp DPR:$Vm, (i32 imm:$SIMM))))))]>;
+class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, string OpcodeStr, string Dt,
+                ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+           (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), N2RegVShRFrm, IIC_VPALiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set QPR:$Vd, (Ty (add QPR:$src1,
+                                (Ty (ShOp QPR:$Vm, (i32 imm:$SIMM))))))]>;
+}
+
+// Shift by immediate and insert,
+// both double- and quad-register.
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, Format f, string OpcodeStr, string Dt,
+                ValueType Ty,SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$Vd),
+           (ins DPR:$src1, DPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiD,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set DPR:$Vd, (Ty (ShOp DPR:$src1, DPR:$Vm, (i32 imm:$SIMM))))]>;
+class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                Operand ImmTy, Format f, string OpcodeStr, string Dt,
+                ValueType Ty,SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$Vd),
+           (ins QPR:$src1, QPR:$Vm, ImmTy:$SIMM), f, IIC_VSHLiQ,
+           OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "$src1 = $Vd",
+           [(set QPR:$Vd, (Ty (ShOp QPR:$src1, QPR:$Vm, (i32 imm:$SIMM))))]>;
+}
+
+// Convert, with fractional bits immediate,
+// both double- and quad-register.
+class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+              SDPatternOperator IntOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
+           (outs DPR:$Vd), (ins DPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
+           IIC_VUNAD, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm), (i32 imm:$SIMM))))]>;
+class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+              SDPatternOperator IntOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
+           (outs QPR:$Vd), (ins QPR:$Vm, neon_vcvt_imm32:$SIMM), NVCVTFrm,
+           IIC_VUNAQ, OpcodeStr, Dt, "$Vd, $Vm, $SIMM", "",
+           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm), (i32 imm:$SIMM))))]>;
+
+//===----------------------------------------------------------------------===//
+// Multiclasses
+//===----------------------------------------------------------------------===//
+
+// Abbreviations used in multiclass suffixes:
+//   Q = quarter int (8 bit) elements
+//   H = half int (16 bit) elements
+//   S = single int (32 bit) elements
+//   D = double int (64 bit) elements
+
+// Neon 2-register vector operations and intrinsics.
+
+// Neon 2-register comparisons.
+//   source operand element sizes of 8, 16 and 32 bits:
+multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                       bits<5> op11_7, bit op4, string opc, string Dt,
+                       string asm, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "8"), asm, "",
+                  [(set DPR:$Vd, (v8i8 (OpNode (v8i8 DPR:$Vm))))]>;
+  def v4i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "16"), asm, "",
+                  [(set DPR:$Vd, (v4i16 (OpNode (v4i16 DPR:$Vm))))]>;
+  def v2i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "32"), asm, "",
+                  [(set DPR:$Vd, (v2i32 (OpNode (v2i32 DPR:$Vm))))]>;
+  def v2f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 0, op4,
+                  (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary,
+                  opc, "f32", asm, "",
+                  [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> {
+    let Inst{10} = 1; // overwrite F = 1
+  }
+
+  // 128-bit vector types.
+  def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "8"), asm, "",
+                  [(set QPR:$Vd, (v16i8 (OpNode (v16i8 QPR:$Vm))))]>;
+  def v8i16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "16"), asm, "",
+                  [(set QPR:$Vd, (v8i16 (OpNode (v8i16 QPR:$Vm))))]>;
+  def v4i32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, !strconcat(Dt, "32"), asm, "",
+                  [(set QPR:$Vd, (v4i32 (OpNode (v4i32 QPR:$Vm))))]>;
+  def v4f32 : N2V<op24_23, op21_20, 0b10, op17_16, op11_7, 1, op4,
+                  (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary,
+                  opc, "f32", asm, "",
+                  [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> {
+    let Inst{10} = 1; // overwrite F = 1
+  }
+}
+
+
+// Neon 2-register vector intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                      bits<5> op11_7, bit op4,
+                      InstrItinClass itinD, InstrItinClass itinQ,
+                      string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+  def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "16"),v4i16,v4i16,IntOp>;
+  def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "32"),v2i32,v2i32,IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8,v16i8,IntOp>;
+  def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "16"),v8i16,v8i16,IntOp>;
+  def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                      itinQ, OpcodeStr, !strconcat(Dt, "32"),v4i32,v4i32,IntOp>;
+}
+
+
+// Neon Narrowing 2-register vector operations,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VN_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                    bits<5> op11_7, bit op6, bit op4,
+                    InstrItinClass itin, string OpcodeStr, string Dt,
+                    SDNode OpNode> {
+  def v8i8  : N2VN<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "16"),
+                   v8i8, v8i16, OpNode>;
+  def v4i16 : N2VN<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "32"),
+                   v4i16, v4i32, OpNode>;
+  def v2i32 : N2VN<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+                   itin, OpcodeStr, !strconcat(Dt, "64"),
+                   v2i32, v2i64, OpNode>;
+}
+
+// Neon Narrowing 2-register vector intrinsics,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                       bits<5> op11_7, bit op6, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp> {
+  def v8i8  : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "16"),
+                      v8i8, v8i16, IntOp>;
+  def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "32"),
+                      v4i16, v4i32, IntOp>;
+  def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "64"),
+                      v2i32, v2i64, IntOp>;
+}
+
+
+// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N2VL_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
+                    string OpcodeStr, string Dt, SDNode OpNode> {
+  def v8i16 : N2VL<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode>;
+  def v4i32 : N2VL<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+  def v2i64 : N2VL<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                   OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+
+// Neon 3-register vector operations.
+
+// First with only element sizes of 8, 16 and 32 bits:
+multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                   InstrItinClass itinD16, InstrItinClass itinD32,
+                   InstrItinClass itinQ16, InstrItinClass itinQ32,
+                   string OpcodeStr, string Dt,
+                   SDNode OpNode, bit Commutable = 0> {
+  // 64-bit vector types.
+  def v8i8  : N3VD<op24, op23, 0b00, op11_8, op4, itinD16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i8, v8i8, OpNode, Commutable>;
+  def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i16, v4i16, OpNode, Commutable>;
+  def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i32, v2i32, OpNode, Commutable>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v16i8, v16i8, OpNode, Commutable>;
+  def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v8i16, v8i16, OpNode, Commutable>;
+  def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v4i32, v4i32, OpNode, Commutable>;
+}
+
+multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, SDNode ShOp> {
+  def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, "i16", v4i16, ShOp>;
+  def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, "i32", v2i32, ShOp>;
+  def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, "i16", v8i16, v4i16, ShOp>;
+  def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, "i32",
+                     v4i32, v2i32, ShOp>;
+}
+
+// ....then also with element size 64 bits:
+multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    InstrItinClass itinD, InstrItinClass itinQ,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, bit Commutable = 0>
+  : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ,
+            OpcodeStr, Dt, OpNode, Commutable> {
+  def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v1i64, v1i64, OpNode, Commutable>;
+  def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v2i64, v2i64, OpNode, Commutable>;
+}
+
+
+// Neon 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                     InstrItinClass itinD16, InstrItinClass itinD32,
+                     InstrItinClass itinQ16, InstrItinClass itinQ32,
+                     string OpcodeStr, string Dt,
+                     SDPatternOperator IntOp, bit Commutable = 0> {
+  // 64-bit vector types.
+  def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i16, v4i16, IntOp, Commutable>;
+  def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i32, v2i32, IntOp, Commutable>;
+
+  // 128-bit vector types.
+  def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i16, v8i16, IntOp, Commutable>;
+  def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i32, v4i32, IntOp, Commutable>;
+}
+multiclass N3VInt_HSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                     InstrItinClass itinD16, InstrItinClass itinD32,
+                     InstrItinClass itinQ16, InstrItinClass itinQ32,
+                     string OpcodeStr, string Dt,
+                     SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v4i16 : N3VDIntSh<op24, op23, 0b01, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i16, v4i16, IntOp>;
+  def v2i32 : N3VDIntSh<op24, op23, 0b10, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i32, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v8i16 : N3VQIntSh<op24, op23, 0b01, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i16, v8i16, IntOp>;
+  def v4i32 : N3VQIntSh<op24, op23, 0b10, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i32, v4i32, IntOp>;
+}
+
+multiclass N3VIntSL_HS<bits<4> op11_8,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16,
+                          OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>;
+  def v2i32 : N3VDIntSL<0b10, op11_8, itinD32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp>;
+  def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16,
+                          OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>;
+  def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                      InstrItinClass itinD16, InstrItinClass itinD32,
+                      InstrItinClass itinQ16, InstrItinClass itinQ32,
+                      string OpcodeStr, string Dt,
+                      SDPatternOperator IntOp, bit Commutable = 0>
+  : N3VInt_HS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+              OpcodeStr, Dt, IntOp, Commutable> {
+  def v8i8  : N3VDInt<op24, op23, 0b00, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i8, v8i8, IntOp, Commutable>;
+  def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v16i8, v16i8, IntOp, Commutable>;
+}
+multiclass N3VInt_QHSSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                      InstrItinClass itinD16, InstrItinClass itinD32,
+                      InstrItinClass itinQ16, InstrItinClass itinQ32,
+                      string OpcodeStr, string Dt,
+                      SDPatternOperator IntOp>
+  : N3VInt_HSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+              OpcodeStr, Dt, IntOp> {
+  def v8i8  : N3VDIntSh<op24, op23, 0b00, op11_8, op4, f, itinD16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i8, v8i8, IntOp>;
+  def v16i8 : N3VQIntSh<op24, op23, 0b00, op11_8, op4, f, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v16i8, v16i8, IntOp>;
+}
+
+
+// ....then also with element size of 64 bits:
+multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, bit Commutable = 0>
+  : N3VInt_QHS<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+               OpcodeStr, Dt, IntOp, Commutable> {
+  def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v1i64, v1i64, IntOp, Commutable>;
+  def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i64, v2i64, IntOp, Commutable>;
+}
+multiclass N3VInt_QHSDSh<bit op24, bit op23, bits<4> op11_8, bit op4, Format f,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp>
+  : N3VInt_QHSSh<op24, op23, op11_8, op4, f, itinD16, itinD32, itinQ16, itinQ32,
+               OpcodeStr, Dt, IntOp> {
+  def v1i64 : N3VDIntSh<op24, op23, 0b11, op11_8, op4, f, itinD32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v1i64, v1i64, IntOp>;
+  def v2i64 : N3VQIntSh<op24, op23, 0b11, op11_8, op4, f, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i64, v2i64, IntOp>;
+}
+
+// Neon Narrowing 3-register vector intrinsics,
+//   source operand element sizes of 16, 32 and 64 bits:
+multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, bit Commutable = 0> {
+  def v8i8  : N3VNInt<op24, op23, 0b00, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i8, v8i16, IntOp, Commutable>;
+  def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i16, v4i32, IntOp, Commutable>;
+  def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i32, v2i64, IntOp, Commutable>;
+}
+
+
+// Neon Long 3-register vector operations.
+
+multiclass N3VL_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    InstrItinClass itin16, InstrItinClass itin32,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, bit Commutable = 0> {
+  def v8i16 : N3VL<op24, op23, 0b00, op11_8, op4, itin16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i16, v8i8, OpNode, Commutable>;
+  def v4i32 : N3VL<op24, op23, 0b01, op11_8, op4, itin16,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i32, v4i16, OpNode, Commutable>;
+  def v2i64 : N3VL<op24, op23, 0b10, op11_8, op4, itin32,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i64, v2i32, OpNode, Commutable>;
+}
+
+multiclass N3VLSL_HS<bit op24, bits<4> op11_8,
+                     InstrItinClass itin, string OpcodeStr, string Dt,
+                     SDNode OpNode> {
+  def v4i16 : N3VLSL16<op24, 0b01, op11_8, itin, OpcodeStr,
+                       !strconcat(Dt, "16"), v4i32, v4i16, OpNode>;
+  def v2i32 : N3VLSL<op24, 0b10, op11_8, itin, OpcodeStr,
+                     !strconcat(Dt, "32"), v2i64, v2i32, OpNode>;
+}
+
+multiclass N3VLExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin16, InstrItinClass itin32,
+                       string OpcodeStr, string Dt,
+                       SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VLExt<op24, op23, 0b00, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i16, v8i8, OpNode, ExtOp, Commutable>;
+  def v4i32 : N3VLExt<op24, op23, 0b01, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i32, v4i16, OpNode, ExtOp, Commutable>;
+  def v2i64 : N3VLExt<op24, op23, 0b10, op11_8, op4, itin32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i64, v2i32, OpNode, ExtOp, Commutable>;
+}
+
+// Neon Long 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                      InstrItinClass itin16, InstrItinClass itin32,
+                      string OpcodeStr, string Dt,
+                      SDPatternOperator IntOp, bit Commutable = 0> {
+  def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i32, v4i16, IntOp, Commutable>;
+  def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i64, v2i32, IntOp, Commutable>;
+}
+
+multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
+                        InstrItinClass itin, string OpcodeStr, string Dt,
+                        SDPatternOperator IntOp> {
+  def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
+                          OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+  def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin16, InstrItinClass itin32,
+                       string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, bit Commutable = 0>
+  : N3VLInt_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt,
+               IntOp, Commutable> {
+  def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i16, v8i8, IntOp, Commutable>;
+}
+
+// ....with explicit extend (VABDL).
+multiclass N3VLIntExt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       SDPatternOperator IntOp, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VLIntExt<op24, op23, 0b00, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "8"),
+                         v8i16, v8i8, IntOp, ExtOp, Commutable>;
+  def v4i32 : N3VLIntExt<op24, op23, 0b01, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "16"),
+                         v4i32, v4i16, IntOp, ExtOp, Commutable>;
+  def v2i64 : N3VLIntExt<op24, op23, 0b10, op11_8, op4, itin,
+                         OpcodeStr, !strconcat(Dt, "32"),
+                         v2i64, v2i32, IntOp, ExtOp, Commutable>;
+}
+
+
+// Neon Wide 3-register vector intrinsics,
+//   source operand element sizes of 8, 16 and 32 bits:
+multiclass N3VW_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, SDNode ExtOp, bit Commutable = 0> {
+  def v8i16 : N3VW<op24, op23, 0b00, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i16, v8i8, OpNode, ExtOp, Commutable>;
+  def v4i32 : N3VW<op24, op23, 0b01, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "16"),
+                   v4i32, v4i16, OpNode, ExtOp, Commutable>;
+  def v2i64 : N3VW<op24, op23, 0b10, op11_8, op4,
+                   OpcodeStr, !strconcat(Dt, "32"),
+                   v2i64, v2i32, OpNode, ExtOp, Commutable>;
+}
+
+
+// Neon Multiply-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itinD16, InstrItinClass itinD32,
+                        InstrItinClass itinQ16, InstrItinClass itinQ32,
+                        string OpcodeStr, string Dt, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, mul, OpNode>;
+  def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, OpNode>;
+  def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, OpNode>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, mul, OpNode>;
+  def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, mul, OpNode>;
+  def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>;
+}
+
+multiclass N3VMulOpSL_HS<bits<4> op11_8,
+                         InstrItinClass itinD16, InstrItinClass itinD32,
+                         InstrItinClass itinQ16, InstrItinClass itinQ32,
+                         string OpcodeStr, string Dt, SDNode ShOp> {
+  def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16,
+                            OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>;
+  def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32,
+                          OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, ShOp>;
+  def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16,
+                            OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16,
+                            mul, ShOp>;
+  def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32,
+                          OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32,
+                          mul, ShOp>;
+}
+
+// Neon Intrinsic-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VIntOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itinD, InstrItinClass itinQ,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp,
+                        SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N3VDIntOp<op24, op23, 0b00, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, IntOp, OpNode>;
+  def v4i16 : N3VDIntOp<op24, op23, 0b01, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp, OpNode>;
+  def v2i32 : N3VDIntOp<op24, op23, 0b10, op11_8, op4, itinD,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp, OpNode>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQIntOp<op24, op23, 0b00, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, IntOp, OpNode>;
+  def v8i16 : N3VQIntOp<op24, op23, 0b01, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, IntOp, OpNode>;
+  def v4i32 : N3VQIntOp<op24, op23, 0b10, op11_8, op4, itinQ,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, IntOp, OpNode>;
+}
+
+// Neon 3-argument intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itinD, InstrItinClass itinQ,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD,
+                       OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+  def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD,
+                       OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>;
+  def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD,
+                       OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ,
+                       OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>;
+  def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ,
+                       OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>;
+  def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ,
+                       OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>;
+}
+
+
+// Neon Long Multiply-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N3VLMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                         InstrItinClass itin16, InstrItinClass itin32,
+                         string OpcodeStr, string Dt, SDNode MulOp,
+                         SDNode OpNode> {
+  def v8i16 : N3VLMulOp<op24, op23, 0b00, op11_8, op4, itin16, OpcodeStr,
+                        !strconcat(Dt, "8"), v8i16, v8i8, MulOp, OpNode>;
+  def v4i32 : N3VLMulOp<op24, op23, 0b01, op11_8, op4, itin16, OpcodeStr,
+                        !strconcat(Dt, "16"), v4i32, v4i16, MulOp, OpNode>;
+  def v2i64 : N3VLMulOp<op24, op23, 0b10, op11_8, op4, itin32, OpcodeStr,
+                        !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+multiclass N3VLMulOpSL_HS<bit op24, bits<4> op11_8, string OpcodeStr,
+                          string Dt, SDNode MulOp, SDNode OpNode> {
+  def v4i16 : N3VLMulOpSL16<op24, 0b01, op11_8, IIC_VMACi16D, OpcodeStr,
+                            !strconcat(Dt,"16"), v4i32, v4i16, MulOp, OpNode>;
+  def v2i32 : N3VLMulOpSL<op24, 0b10, op11_8, IIC_VMACi32D, OpcodeStr,
+                          !strconcat(Dt, "32"), v2i64, v2i32, MulOp, OpNode>;
+}
+
+
+// Neon Long 3-argument intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin16, InstrItinClass itin32,
+                       string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin16,
+                       OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+  def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin32,
+                       OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
+                         string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D,
+                           OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>;
+  def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D,
+                         OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itin16, InstrItinClass itin32,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp>
+  : N3VLInt3_HS<op24, op23, op11_8, op4, itin16, itin32, OpcodeStr, Dt, IntOp> {
+  def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin16,
+                       OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
+}
+
+// ....with explicit extend (VABAL).
+multiclass N3VLIntExtOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                            InstrItinClass itin, string OpcodeStr, string Dt,
+                            SDPatternOperator IntOp, SDNode ExtOp, SDNode OpNode> {
+  def v8i16 : N3VLIntExtOp<op24, op23, 0b00, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8,
+                           IntOp, ExtOp, OpNode>;
+  def v4i32 : N3VLIntExtOp<op24, op23, 0b01, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16,
+                           IntOp, ExtOp, OpNode>;
+  def v2i64 : N3VLIntExtOp<op24, op23, 0b10, op11_8, op4, itin,
+                           OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32,
+                           IntOp, ExtOp, OpNode>;
+}
+
+
+// Neon Pairwise long 2-register intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                        bits<5> op11_7, bit op4,
+                        string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+  def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+  def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+  def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+  def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon Pairwise long 2-register accumulate intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                         bits<5> op11_7, bit op4,
+                         string OpcodeStr, string Dt, SDPatternOperator IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+  def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+  def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+  def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+  def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon 2-register vector shift by immediate,
+//   with f of either N2RegVShLFrm or N2RegVShRFrm
+//   element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShLFrm, itin, i32imm,
+                     OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
+                             // imm6 = xxxxxx
+}
+multiclass N2VShR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       string baseOpc, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8,
+                     OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16,
+                     OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32,
+                     OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64,
+                     OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm8,
+                     OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm16,
+                     OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, N2RegVShRFrm, itin, shr_imm32,
+                     OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, N2RegVShRFrm, itin, shr_imm64,
+                     OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift-Accumulate vector operations,
+//   element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                         string OpcodeStr, string Dt, SDNode ShOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm8,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm16,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4, shr_imm32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4, shr_imm64,
+                        OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm8,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm16,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4, shr_imm32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4, shr_imm64,
+                        OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift-Insert vector operations,
+//   with f of either N2RegVShLFrm or N2RegVShRFrm
+//   element sizes of 8, 16, 32 and 64 bits:
+multiclass N2VShInsL_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                          string OpcodeStr> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "8", v8i8, NEONvsli> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "16", v4i16, NEONvsli> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "32", v2i32, NEONvsli> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "64", v1i64, NEONvsli>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "8", v16i8, NEONvsli> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "16", v8i16, NEONvsli> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "32", v4i32, NEONvsli> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, i32imm,
+                        N2RegVShLFrm, OpcodeStr, "64", v2i64, NEONvsli>;
+                             // imm6 = xxxxxx
+}
+multiclass N2VShInsR_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                          string OpcodeStr> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm8,
+                        N2RegVShRFrm, OpcodeStr, "8", v8i8, NEONvsri> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm16,
+                        N2RegVShRFrm, OpcodeStr, "16", v4i16, NEONvsri> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4, shr_imm32,
+                        N2RegVShRFrm, OpcodeStr, "32", v2i32, NEONvsri> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4, shr_imm64,
+                        N2RegVShRFrm, OpcodeStr, "64", v1i64, NEONvsri>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm8,
+                        N2RegVShRFrm, OpcodeStr, "8", v16i8, NEONvsri> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm16,
+                        N2RegVShRFrm, OpcodeStr, "16", v8i16, NEONvsri> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4, shr_imm32,
+                        N2RegVShRFrm, OpcodeStr, "32", v4i32, NEONvsri> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4, shr_imm64,
+                        N2RegVShRFrm, OpcodeStr, "64", v2i64, NEONvsri>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift Long operations,
+//   element sizes of 8, 16, 32 bits:
+multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, string OpcodeStr, string Dt, SDNode OpNode> {
+  def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+              OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, imm1_7, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+               OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, imm1_15, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+               OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, imm1_31, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+// Neon Shift Narrow operations,
+//   element sizes of 16, 32, 64 bits:
+multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, InstrItinClass itin, string OpcodeStr, string Dt,
+                      SDNode OpNode> {
+  def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                    OpcodeStr, !strconcat(Dt, "16"),
+                    v8i8, v8i16, shr_imm8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "32"),
+                     v4i16, v4i32, shr_imm16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "64"),
+                     v2i32, v2i64, shr_imm32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+//===----------------------------------------------------------------------===//
+
+// Vector Add Operations.
+
+//   VADD     : Vector Add (integer and floating-point)
+defm VADD     : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd", "i",
+                         add, 1>;
+def  VADDfd   : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32",
+                     v2f32, v2f32, fadd, 1>;
+def  VADDfq   : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
+                     v4f32, v4f32, fadd, 1>;
+//   VADDL    : Vector Add Long (Q = D + D)
+defm VADDLs   : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vaddl", "s", add, sext, 1>;
+defm VADDLu   : N3VLExt_QHS<1,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vaddl", "u", add, zext, 1>;
+//   VADDW    : Vector Add Wide (Q = Q + D)
+defm VADDWs   : N3VW_QHS<0,1,0b0001,0, "vaddw", "s", add, sext, 0>;
+defm VADDWu   : N3VW_QHS<1,1,0b0001,0, "vaddw", "u", add, zext, 0>;
+//   VHADD    : Vector Halving Add
+defm VHADDs   : N3VInt_QHS<0, 0, 0b0000, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vhadd", "s", int_arm_neon_vhadds, 1>;
+defm VHADDu   : N3VInt_QHS<1, 0, 0b0000, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vhadd", "u", int_arm_neon_vhaddu, 1>;
+//   VRHADD   : Vector Rounding Halving Add
+defm VRHADDs  : N3VInt_QHS<0, 0, 0b0001, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vrhadd", "s", int_arm_neon_vrhadds, 1>;
+defm VRHADDu  : N3VInt_QHS<1, 0, 0b0001, 0, N3RegFrm,
+                           IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vrhadd", "u", int_arm_neon_vrhaddu, 1>;
+//   VQADD    : Vector Saturating Add
+defm VQADDs   : N3VInt_QHSD<0, 0, 0b0000, 1, N3RegFrm,
+                            IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                            "vqadd", "s", int_arm_neon_vqadds, 1>;
+defm VQADDu   : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm,
+                            IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
+                            "vqadd", "u", int_arm_neon_vqaddu, 1>;
+//   VADDHN   : Vector Add and Narrow Returning High Half (D = Q + Q)
+defm VADDHN   : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i",
+                            int_arm_neon_vaddhn, 1>;
+//   VRADDHN  : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
+defm VRADDHN  : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i",
+                            int_arm_neon_vraddhn, 1>;
+
+// Vector Multiply Operations.
+
+//   VMUL     : Vector Multiply (integer, polynomial and floating-point)
+defm VMUL     : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D,
+                        IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>;
+def  VMULpd   : N3VDInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16D, "vmul",
+                        "p8", v8i8, v8i8, int_arm_neon_vmulp, 1>;
+def  VMULpq   : N3VQInt<1, 0, 0b00, 0b1001, 1, N3RegFrm, IIC_VMULi16Q, "vmul",
+                        "p8", v16i8, v16i8, int_arm_neon_vmulp, 1>;
+def  VMULfd   : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32",
+                     v2f32, v2f32, fmul, 1>;
+def  VMULfq   : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32",
+                     v4f32, v4f32, fmul, 1>;
+defm VMULsl   : N3VSL_HS<0b1000, "vmul", mul>;
+def  VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>;
+def  VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32,
+                       v2f32, fmul>;
+
+def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
+                      (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+          (v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                      (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
+                      (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+          (v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                      (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
+                       (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
+          (v4f32 (VMULslfq (v4f32 QPR:$src1),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src2,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+//   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
+defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
+                          IIC_VMULi16Q, IIC_VMULi32Q,
+                          "vqdmulh", "s", int_arm_neon_vqdmulh, 1>;
+defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
+                            IIC_VMULi16Q, IIC_VMULi32Q,
+                            "vqdmulh", "s",  int_arm_neon_vqdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
+                                       (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+                                                            imm:$lane)))),
+          (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
+                                 (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                         (DSubReg_i16_reg imm:$lane))),
+                                 (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
+                                       (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+                                                            imm:$lane)))),
+          (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
+                                 (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                         (DSubReg_i32_reg imm:$lane))),
+                                 (SubReg_i32_lane imm:$lane)))>;
+
+//   VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
+defm VQRDMULH   : N3VInt_HS<1, 0, 0b1011, 0, N3RegFrm,
+                            IIC_VMULi16D,IIC_VMULi32D,IIC_VMULi16Q,IIC_VMULi32Q,
+                            "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>;
+defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
+                              IIC_VMULi16Q, IIC_VMULi32Q,
+                              "vqrdmulh", "s",  int_arm_neon_vqrdmulh>;
+def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
+                                        (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+                                                             imm:$lane)))),
+          (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
+                                  (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                          (DSubReg_i16_reg imm:$lane))),
+                                  (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
+                                        (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+                                                             imm:$lane)))),
+          (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
+                                  (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                          (DSubReg_i32_reg imm:$lane))),
+                                  (SubReg_i32_lane imm:$lane)))>;
+
+//   VMULL    : Vector Multiply Long (integer and polynomial) (Q = D * D)
+defm VMULLs   : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                         "vmull", "s", NEONvmulls, 1>;
+defm VMULLu   : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                         "vmull", "u", NEONvmullu, 1>;
+def  VMULLp   : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
+                        v8i16, v8i8, int_arm_neon_vmullp, 1>;
+defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
+defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
+
+//   VQDMULL  : Vector Saturating Doubling Multiply Long (Q = D * D)
+defm VQDMULL  : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vqdmull", "s", int_arm_neon_vqdmull, 1>;
+defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D,
+                             "vqdmull", "s", int_arm_neon_vqdmull>;
+
+// Vector Multiply-Accumulate and Multiply-Subtract Operations.
+
+//   VMLA     : Vector Multiply Accumulate (integer and floating-point)
+defm VMLA     : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def  VMLAfd   : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
+                          v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLAfq   : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
+                          v4f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+defm VMLAsl   : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
+                              IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def  VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
+                            v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+def  VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
+                            v4f32, v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+
+def : Pat<(v8i16 (add (v8i16 QPR:$src1),
+                  (mul (v8i16 QPR:$src2),
+                       (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+          (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (add (v4i32 QPR:$src1),
+                  (mul (v4i32 QPR:$src2),
+                       (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+          (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
+                  (fmul_su (v4f32 QPR:$src2),
+                        (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+          (v4f32 (VMLAslfq (v4f32 QPR:$src1),
+                           (v4f32 QPR:$src2),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>,
+          Requires<[HasNEON, UseFPVMLx]>;
+
+//   VMLAL    : Vector Multiply Accumulate Long (Q += D * D)
+defm VMLALs   : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlal", "s", NEONvmulls, add>;
+defm VMLALu   : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlal", "u", NEONvmullu, add>;
+
+defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
+defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
+
+//   VQDMLAL  : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
+defm VQDMLAL  : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                            "vqdmlal", "s", int_arm_neon_vqdmlal>;
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
+
+//   VMLS     : Vector Multiply Subtract (integer and floating-point)
+defm VMLS     : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def  VMLSfd   : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
+                          v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+def  VMLSfq   : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
+                          v4f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+defm VMLSsl   : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
+                              IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def  VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
+                            v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+def  VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
+                            v4f32, v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasNEON, UseFPVMLx]>;
+
+def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
+                  (mul (v8i16 QPR:$src2),
+                       (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+          (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
+                  (mul (v4i32 QPR:$src2),
+                     (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+          (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
+                                      (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
+                  (fmul_su (v4f32 QPR:$src2),
+                        (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+          (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>,
+          Requires<[HasNEON, UseFPVMLx]>;
+
+//   VMLSL    : Vector Multiply Subtract Long (Q -= D * D)
+defm VMLSLs   : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLu   : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
+                              "vmlsl", "u", NEONvmullu, sub>;
+
+defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
+defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
+
+//   VQDMLSL  : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
+defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
+                            "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
+
+// Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
+def  VFMAfd   : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
+                          v2f32, fmul_su, fadd_mlx>,
+                Requires<[HasVFP4,UseFusedMAC]>;
+
+def  VFMAfq   : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32",
+                          v4f32, fmul_su, fadd_mlx>,
+                Requires<[HasVFP4,UseFusedMAC]>;
+
+//   Fused Vector Multiply Subtract (floating-point)
+def  VFMSfd   : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32",
+                          v2f32, fmul_su, fsub_mlx>,
+                Requires<[HasVFP4,UseFusedMAC]>;
+def  VFMSfq   : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
+                          v4f32, fmul_su, fsub_mlx>,
+                Requires<[HasVFP4,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
+          (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+          Requires<[HasVFP4]>;
+def : Pat<(v4f32 (fma QPR:$Vn, QPR:$Vm, QPR:$src1)),
+          (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+          Requires<[HasVFP4]>;
+def : Pat<(v2f32 (fma (fneg DPR:$Vn), DPR:$Vm, DPR:$src1)),
+          (VFMSfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
+          (VFMSfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+      Requires<[HasVFP4]>;
+
+// Vector Subtract Operations.
+
+//   VSUB     : Vector Subtract (integer and floating-point)
+defm VSUB     : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ,
+                         "vsub", "i", sub, 0>;
+def  VSUBfd   : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32",
+                     v2f32, v2f32, fsub, 0>;
+def  VSUBfq   : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
+                     v4f32, v4f32, fsub, 0>;
+//   VSUBL    : Vector Subtract Long (Q = D - D)
+defm VSUBLs   : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vsubl", "s", sub, sext, 0>;
+defm VSUBLu   : N3VLExt_QHS<1,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD,
+                            "vsubl", "u", sub, zext, 0>;
+//   VSUBW    : Vector Subtract Wide (Q = Q - D)
+defm VSUBWs   : N3VW_QHS<0,1,0b0011,0, "vsubw", "s", sub, sext, 0>;
+defm VSUBWu   : N3VW_QHS<1,1,0b0011,0, "vsubw", "u", sub, zext, 0>;
+//   VHSUB    : Vector Halving Subtract
+defm VHSUBs   : N3VInt_QHS<0, 0, 0b0010, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vhsub", "s", int_arm_neon_vhsubs, 0>;
+defm VHSUBu   : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vhsub", "u", int_arm_neon_vhsubu, 0>;
+//   VQSUB    : Vector Saturing Subtract
+defm VQSUBs   : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm,
+                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                            "vqsub", "s", int_arm_neon_vqsubs, 0>;
+defm VQSUBu   : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
+                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                            "vqsub", "u", int_arm_neon_vqsubu, 0>;
+//   VSUBHN   : Vector Subtract and Narrow Returning High Half (D = Q - Q)
+defm VSUBHN   : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i",
+                            int_arm_neon_vsubhn, 0>;
+//   VRSUBHN  : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
+defm VRSUBHN  : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
+                            int_arm_neon_vrsubhn, 0>;
+
+// Vector Comparisons.
+
+//   VCEQ     : Vector Compare Equal
+defm VCEQ     : N3V_QHS<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vceq", "i", NEONvceq, 1>;
+def  VCEQfd   : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
+                     NEONvceq, 1>;
+def  VCEQfq   : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
+                     NEONvceq, 1>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in
+defm VCEQz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i",
+                            "$Vd, $Vm, #0", NEONvceqz>;
+
+//   VCGE     : Vector Compare Greater Than or Equal
+defm VCGEs    : N3V_QHS<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcge", "s", NEONvcge, 0>;
+defm VCGEu    : N3V_QHS<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcge", "u", NEONvcgeu, 0>;
+def  VCGEfd   : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32,
+                     NEONvcge, 0>;
+def  VCGEfq   : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
+                     NEONvcge, 0>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+defm VCGEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s",
+                            "$Vd, $Vm, #0", NEONvcgez>;
+defm VCLEz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s",
+                            "$Vd, $Vm, #0", NEONvclez>;
+}
+
+//   VCGT     : Vector Compare Greater Than
+defm VCGTs    : N3V_QHS<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcgt", "s", NEONvcgt, 0>;
+defm VCGTu    : N3V_QHS<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q,
+                        IIC_VSUBi4Q, "vcgt", "u", NEONvcgtu, 0>;
+def  VCGTfd   : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
+                     NEONvcgt, 0>;
+def  VCGTfq   : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
+                     NEONvcgt, 0>;
+
+let TwoOperandAliasConstraint = "$Vm = $Vd" in {
+defm VCGTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s",
+                            "$Vd, $Vm, #0", NEONvcgtz>;
+defm VCLTz    : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s",
+                            "$Vd, $Vm, #0", NEONvcltz>;
+}
+
+//   VACGE    : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
+def  VACGEd   : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge",
+                        "f32", v2i32, v2f32, int_arm_neon_vacged, 0>;
+def  VACGEq   : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge",
+                        "f32", v4i32, v4f32, int_arm_neon_vacgeq, 0>;
+//   VACGT    : Vector Absolute Compare Greater Than (aka VCAGT)
+def  VACGTd   : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt",
+                        "f32", v2i32, v2f32, int_arm_neon_vacgtd, 0>;
+def  VACGTq   : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt",
+                        "f32", v4i32, v4f32, int_arm_neon_vacgtq, 0>;
+//   VTST     : Vector Test Bits
+defm VTST     : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                        IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
+
+// Vector Bitwise Operations.
+
+def vnotd : PatFrag<(ops node:$in),
+                    (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>;
+def vnotq : PatFrag<(ops node:$in),
+                    (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>;
+
+
+//   VAND     : Vector Bitwise AND
+def  VANDd    : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand",
+                      v2i32, v2i32, and, 1>;
+def  VANDq    : N3VQX<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand",
+                      v4i32, v4i32, and, 1>;
+
+//   VEOR     : Vector Bitwise Exclusive OR
+def  VEORd    : N3VDX<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor",
+                      v2i32, v2i32, xor, 1>;
+def  VEORq    : N3VQX<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor",
+                      v4i32, v4i32, xor, 1>;
+
+//   VORR     : Vector Bitwise OR
+def  VORRd    : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr",
+                      v2i32, v2i32, or, 1>;
+def  VORRq    : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr",
+                      v4i32, v4i32, or, 1>;
+
+def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+
+//   VBIC     : Vector Bitwise Bit Clear (AND NOT)
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
+def  VBICd    : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+                     "vbic", "$Vd, $Vn, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (and DPR:$Vn,
+                                                 (vnotd DPR:$Vm))))]>;
+def  VBICq    : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+                     "vbic", "$Vd, $Vn, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (and QPR:$Vn,
+                                                 (vnotq QPR:$Vm))))]>;
+}
+
+def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI16:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1,
+                          (outs DPR:$Vd), (ins nImmSplatI32:$SIMM, DPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set DPR:$Vd,
+                            (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI16:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1,
+                          (outs QPR:$Vd), (ins nImmSplatI32:$SIMM, QPR:$src),
+                          IIC_VMOVImm,
+                          "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
+                          [(set QPR:$Vd,
+                            (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+  let Inst{10-9} = SIMM{10-9};
+}
+
+//   VORN     : Vector Bitwise OR NOT
+def  VORNd    : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$Vn, DPR:$Vm), N3RegFrm, IIC_VBINiD,
+                     "vorn", "$Vd, $Vn, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (or DPR:$Vn,
+                                                (vnotd DPR:$Vm))))]>;
+def  VORNq    : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$Vn, QPR:$Vm), N3RegFrm, IIC_VBINiQ,
+                     "vorn", "$Vd, $Vn, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (or QPR:$Vn,
+                                                (vnotq QPR:$Vm))))]>;
+
+//   VMVN     : Vector Bitwise NOT (Immediate)
+
+let isReMaterializable = 1 in {
+
+def VMVNv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i16", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v4i16 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMVNv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i16", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v8i16 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMVNv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2i32 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMVNv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmvn", "i32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4i32 (NEONvmvnImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+}
+
+//   VMVN     : Vector Bitwise NOT
+def  VMVNd    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
+                     (outs DPR:$Vd), (ins DPR:$Vm), IIC_VSUBiD,
+                     "vmvn", "$Vd, $Vm", "",
+                     [(set DPR:$Vd, (v2i32 (vnotd DPR:$Vm)))]>;
+def  VMVNq    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
+                     (outs QPR:$Vd), (ins QPR:$Vm), IIC_VSUBiD,
+                     "vmvn", "$Vd, $Vm", "",
+                     [(set QPR:$Vd, (v4i32 (vnotq QPR:$Vm)))]>;
+def : Pat<(v2i32 (vnotd DPR:$src)), (VMVNd DPR:$src)>;
+def : Pat<(v4i32 (vnotq QPR:$src)), (VMVNq QPR:$src)>;
+
+//   VBSL     : Vector Bitwise Select
+def  VBSLd    : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
+                     (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                     N3RegFrm, IIC_VCNTiD,
+                     "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     [(set DPR:$Vd,
+                           (v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
+                                   (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
+                                    (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
+                                    (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1),
+                                    (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
+                                    (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
+          (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
+                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
+          (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
+                     (and DPR:$Vm, (vnotd DPR:$Vd)))),
+          (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def  VBSLq    : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
+                     (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                     N3RegFrm, IIC_VCNTiQ,
+                     "vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     [(set QPR:$Vd,
+                           (v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+
+def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
+                                   (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
+                                    (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
+                                    (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1),
+                                    (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
+                                    (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
+          (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
+                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
+          (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd),
+                     (and QPR:$Vm, (vnotq QPR:$Vd)))),
+          (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+        Requires<[HasNEON]>;
+
+//   VBIF     : Vector Bitwise Insert if False
+//              like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
+def  VBIFd    : N3VX<1, 0, 0b11, 0b0001, 0, 1,
+                     (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                     N3RegFrm, IIC_VBINiD,
+                     "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+def  VBIFq    : N3VX<1, 0, 0b11, 0b0001, 1, 1,
+                     (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                     N3RegFrm, IIC_VBINiQ,
+                     "vbif", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+
+//   VBIT     : Vector Bitwise Insert if True
+//              like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst",
+// FIXME: This instruction's encoding MAY NOT BE correct.
+def  VBITd    : N3VX<1, 0, 0b10, 0b0001, 0, 1,
+                     (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm),
+                     N3RegFrm, IIC_VBINiD,
+                     "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+def  VBITq    : N3VX<1, 0, 0b10, 0b0001, 1, 1,
+                     (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm),
+                     N3RegFrm, IIC_VBINiQ,
+                     "vbit", "$Vd, $Vn, $Vm", "$src1 = $Vd",
+                     []>;
+
+// VBIT/VBIF are not yet implemented.  The TwoAddress pass will not go looking
+// for equivalent operations with different register constraints; it just
+// inserts copies.
+
+// Vector Absolute Differences.
+
+//   VABD     : Vector Absolute Difference
+defm VABDs    : N3VInt_QHS<0, 0, 0b0111, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vabd", "s", int_arm_neon_vabds, 1>;
+defm VABDu    : N3VInt_QHS<1, 0, 0b0111, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vabd", "u", int_arm_neon_vabdu, 1>;
+def  VABDfd   : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND,
+                        "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>;
+def  VABDfq   : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ,
+                        "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>;
+
+//   VABDL    : Vector Absolute Difference Long (Q = | D - D |)
+defm VABDLs   : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q,
+                               "vabdl", "s", int_arm_neon_vabds, zext, 1>;
+defm VABDLu   : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q,
+                               "vabdl", "u", int_arm_neon_vabdu, zext, 1>;
+
+//   VABA     : Vector Absolute Difference and Accumulate
+defm VABAs    : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+                             "vaba", "s", int_arm_neon_vabds, add>;
+defm VABAu    : N3VIntOp_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ,
+                             "vaba", "u", int_arm_neon_vabdu, add>;
+
+//   VABAL    : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
+defm VABALs   : N3VLIntExtOp_QHS<0,1,0b0101,0, IIC_VABAD,
+                                 "vabal", "s", int_arm_neon_vabds, zext, add>;
+defm VABALu   : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD,
+                                 "vabal", "u", int_arm_neon_vabdu, zext, add>;
+
+// Vector Maximum and Minimum.
+
+//   VMAX     : Vector Maximum
+defm VMAXs    : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmax", "s", int_arm_neon_vmaxs, 1>;
+defm VMAXu    : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmax", "u", int_arm_neon_vmaxu, 1>;
+def  VMAXfd   : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND,
+                        "vmax", "f32",
+                        v2f32, v2f32, int_arm_neon_vmaxs, 1>;
+def  VMAXfq   : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+                        "vmax", "f32",
+                        v4f32, v4f32, int_arm_neon_vmaxs, 1>;
+
+//   VMIN     : Vector Minimum
+defm VMINs    : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmin", "s", int_arm_neon_vmins, 1>;
+defm VMINu    : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
+                           IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
+                           "vmin", "u", int_arm_neon_vminu, 1>;
+def  VMINfd   : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND,
+                        "vmin", "f32",
+                        v2f32, v2f32, int_arm_neon_vmins, 1>;
+def  VMINfq   : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
+                        "vmin", "f32",
+                        v4f32, v4f32, int_arm_neon_vmins, 1>;
+
+// Vector Pairwise Operations.
+
+//   VPADD    : Vector Pairwise Add
+def  VPADDi8  : N3VDInt<0, 0, 0b00, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+                        "vpadd", "i8",
+                        v8i8, v8i8, int_arm_neon_vpadd, 0>;
+def  VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+                        "vpadd", "i16",
+                        v4i16, v4i16, int_arm_neon_vpadd, 0>;
+def  VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD,
+                        "vpadd", "i32",
+                        v2i32, v2i32, int_arm_neon_vpadd, 0>;
+def  VPADDf   : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm,
+                        IIC_VPBIND, "vpadd", "f32",
+                        v2f32, v2f32, int_arm_neon_vpadd, 0>;
+
+//   VPADDL   : Vector Pairwise Add Long
+defm VPADDLs  : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s",
+                             int_arm_neon_vpaddls>;
+defm VPADDLu  : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl", "u",
+                             int_arm_neon_vpaddlu>;
+
+//   VPADAL   : Vector Pairwise Add and Accumulate Long
+defm VPADALs  : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01100, 0, "vpadal", "s",
+                              int_arm_neon_vpadals>;
+defm VPADALu  : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u",
+                              int_arm_neon_vpadalu>;
+
+//   VPMAX    : Vector Pairwise Maximum
+def  VPMAXs8  : N3VDInt<0, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "s8", v8i8, v8i8, int_arm_neon_vpmaxs, 0>;
+def  VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "s16", v4i16, v4i16, int_arm_neon_vpmaxs, 0>;
+def  VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "s32", v2i32, v2i32, int_arm_neon_vpmaxs, 0>;
+def  VPMAXu8  : N3VDInt<1, 0, 0b00, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "u8", v8i8, v8i8, int_arm_neon_vpmaxu, 0>;
+def  VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "u16", v4i16, v4i16, int_arm_neon_vpmaxu, 0>;
+def  VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax",
+                        "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>;
+def  VPMAXf   : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax",
+                        "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>;
+
+//   VPMIN    : Vector Pairwise Minimum
+def  VPMINs8  : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "s8", v8i8, v8i8, int_arm_neon_vpmins, 0>;
+def  VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "s16", v4i16, v4i16, int_arm_neon_vpmins, 0>;
+def  VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "s32", v2i32, v2i32, int_arm_neon_vpmins, 0>;
+def  VPMINu8  : N3VDInt<1, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "u8", v8i8, v8i8, int_arm_neon_vpminu, 0>;
+def  VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "u16", v4i16, v4i16, int_arm_neon_vpminu, 0>;
+def  VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin",
+                        "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>;
+def  VPMINf   : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin",
+                        "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>;
+
+// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
+
+//   VRECPE   : Vector Reciprocal Estimate
+def  VRECPEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+                        IIC_VUNAD, "vrecpe", "u32",
+                        v2i32, v2i32, int_arm_neon_vrecpe>;
+def  VRECPEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0,
+                        IIC_VUNAQ, "vrecpe", "u32",
+                        v4i32, v4i32, int_arm_neon_vrecpe>;
+def  VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+                        IIC_VUNAD, "vrecpe", "f32",
+                        v2f32, v2f32, int_arm_neon_vrecpe>;
+def  VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+                        IIC_VUNAQ, "vrecpe", "f32",
+                        v4f32, v4f32, int_arm_neon_vrecpe>;
+
+//   VRECPS   : Vector Reciprocal Step
+def  VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSD, "vrecps", "f32",
+                        v2f32, v2f32, int_arm_neon_vrecps, 1>;
+def  VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSQ, "vrecps", "f32",
+                        v4f32, v4f32, int_arm_neon_vrecps, 1>;
+
+//   VRSQRTE  : Vector Reciprocal Square Root Estimate
+def  VRSQRTEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+                         IIC_VUNAD, "vrsqrte", "u32",
+                         v2i32, v2i32, int_arm_neon_vrsqrte>;
+def  VRSQRTEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+                         IIC_VUNAQ, "vrsqrte", "u32",
+                         v4i32, v4i32, int_arm_neon_vrsqrte>;
+def  VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+                         IIC_VUNAD, "vrsqrte", "f32",
+                         v2f32, v2f32, int_arm_neon_vrsqrte>;
+def  VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+                         IIC_VUNAQ, "vrsqrte", "f32",
+                         v4f32, v4f32, int_arm_neon_vrsqrte>;
+
+//   VRSQRTS  : Vector Reciprocal Square Root Step
+def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSD, "vrsqrts", "f32",
+                        v2f32, v2f32, int_arm_neon_vrsqrts, 1>;
+def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1, N3RegFrm,
+                        IIC_VRECSQ, "vrsqrts", "f32",
+                        v4f32, v4f32, int_arm_neon_vrsqrts, 1>;
+
+// Vector Shifts.
+
+//   VSHL     : Vector Shift
+defm VSHLs    : N3VInt_QHSDSh<0, 0, 0b0100, 0, N3RegVShFrm,
+                            IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
+                            "vshl", "s", int_arm_neon_vshifts>;
+defm VSHLu    : N3VInt_QHSDSh<1, 0, 0b0100, 0, N3RegVShFrm,
+                            IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ, IIC_VSHLiQ,
+                            "vshl", "u", int_arm_neon_vshiftu>;
+
+//   VSHL     : Vector Shift Left (Immediate)
+defm VSHLi    : N2VShL_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>;
+
+//   VSHR     : Vector Shift Right (Immediate)
+defm VSHRs    : N2VShR_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", "VSHRs",
+                            NEONvshrs>;
+defm VSHRu    : N2VShR_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", "VSHRu",
+                            NEONvshru>;
+
+//   VSHLL    : Vector Shift Left Long
+defm VSHLLs   : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>;
+defm VSHLLu   : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u", NEONvshllu>;
+
+//   VSHLL    : Vector Shift Left Long (with maximum shift count)
+class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+                bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy,
+                ValueType OpTy, Operand ImmTy, SDNode OpNode>
+  : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt,
+           ResTy, OpTy, ImmTy, OpNode> {
+  let Inst{21-16} = op21_16;
+  let DecoderMethod = "DecodeVSHLMaxInstruction";
+}
+def  VSHLLi8  : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8",
+                          v8i16, v8i8, imm8, NEONvshlli>;
+def  VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16",
+                          v4i32, v4i16, imm16, NEONvshlli>;
+def  VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32",
+                          v2i64, v2i32, imm32, NEONvshlli>;
+
+//   VSHRN    : Vector Shift Right and Narrow
+defm VSHRN    : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i",
+                           NEONvshrn>;
+
+//   VRSHL    : Vector Rounding Shift
+defm VRSHLs   : N3VInt_QHSDSh<0, 0, 0b0101, 0, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vrshl", "s", int_arm_neon_vrshifts>;
+defm VRSHLu   : N3VInt_QHSDSh<1, 0, 0b0101, 0, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vrshl", "u", int_arm_neon_vrshiftu>;
+//   VRSHR    : Vector Rounding Shift Right
+defm VRSHRs   : N2VShR_QHSD<0,1,0b0010,1, IIC_VSHLi4D, "vrshr", "s", "VRSHRs",
+                            NEONvrshrs>;
+defm VRSHRu   : N2VShR_QHSD<1,1,0b0010,1, IIC_VSHLi4D, "vrshr", "u", "VRSHRu",
+                            NEONvrshru>;
+
+//   VRSHRN   : Vector Rounding Shift Right and Narrow
+defm VRSHRN   : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i",
+                           NEONvrshrn>;
+
+//   VQSHL    : Vector Saturating Shift
+defm VQSHLs   : N3VInt_QHSDSh<0, 0, 0b0100, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqshl", "s", int_arm_neon_vqshifts>;
+defm VQSHLu   : N3VInt_QHSDSh<1, 0, 0b0100, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqshl", "u", int_arm_neon_vqshiftu>;
+//   VQSHL    : Vector Saturating Shift Left (Immediate)
+defm VQSHLsi  : N2VShL_QHSD<0,1,0b0111,1, IIC_VSHLi4D, "vqshl", "s",NEONvqshls>;
+defm VQSHLui  : N2VShL_QHSD<1,1,0b0111,1, IIC_VSHLi4D, "vqshl", "u",NEONvqshlu>;
+
+//   VQSHLU   : Vector Saturating Shift Left (Immediate, Unsigned)
+defm VQSHLsu  : N2VShL_QHSD<1,1,0b0110,1, IIC_VSHLi4D,"vqshlu","s",NEONvqshlsu>;
+
+//   VQSHRN   : Vector Saturating Shift Right and Narrow
+defm VQSHRNs  : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s",
+                           NEONvqshrns>;
+defm VQSHRNu  : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u",
+                           NEONvqshrnu>;
+
+//   VQSHRUN  : Vector Saturating Shift Right and Narrow (Unsigned)
+defm VQSHRUN  : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s",
+                           NEONvqshrnsu>;
+
+//   VQRSHL   : Vector Saturating Rounding Shift
+defm VQRSHLs  : N3VInt_QHSDSh<0, 0, 0b0101, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqrshl", "s", int_arm_neon_vqrshifts>;
+defm VQRSHLu  : N3VInt_QHSDSh<1, 0, 0b0101, 1, N3RegVShFrm,
+                            IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q, IIC_VSHLi4Q,
+                            "vqrshl", "u", int_arm_neon_vqrshiftu>;
+
+//   VQRSHRN  : Vector Saturating Rounding Shift Right and Narrow
+defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s",
+                           NEONvqrshrns>;
+defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u",
+                           NEONvqrshrnu>;
+
+//   VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
+defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s",
+                           NEONvqrshrnsu>;
+
+//   VSRA     : Vector Shift Right and Accumulate
+defm VSRAs    : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>;
+defm VSRAu    : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>;
+//   VRSRA    : Vector Rounding Shift Right and Accumulate
+defm VRSRAs   : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>;
+defm VRSRAu   : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>;
+
+//   VSLI     : Vector Shift Left and Insert
+defm VSLI     : N2VShInsL_QHSD<1, 1, 0b0101, 1, "vsli">;
+
+//   VSRI     : Vector Shift Right and Insert
+defm VSRI     : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">;
+
+// Vector Absolute and Saturating Absolute.
+
+//   VABS     : Vector Absolute Value
+defm VABS     : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
+                           IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s",
+                           int_arm_neon_vabs>;
+def  VABSfd   : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                     "vabs", "f32",
+                     v2f32, v2f32, fabs>;
+def  VABSfq   : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                     "vabs", "f32",
+                      v4f32, v4f32, fabs>;
+
+def : Pat<(v2f32 (int_arm_neon_vabs (v2f32 DPR:$src))), (VABSfd DPR:$src)>;
+def : Pat<(v4f32 (int_arm_neon_vabs (v4f32 QPR:$src))), (VABSfq QPR:$src)>;
+
+//   VQABS    : Vector Saturating Absolute Value
+defm VQABS    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
+                           IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s",
+                           int_arm_neon_vqabs>;
+
+// Vector Negate.
+
+def vnegd  : PatFrag<(ops node:$in),
+                     (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>;
+def vnegq  : PatFrag<(ops node:$in),
+                     (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>;
+
+class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm),
+        IIC_VSHLiD, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (vnegd DPR:$Vm)))]>;
+class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm),
+        IIC_VSHLiQ, OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (vnegq QPR:$Vm)))]>;
+
+//   VNEG     : Vector Negate (integer)
+def  VNEGs8d  : VNEGD<0b00, "vneg", "s8", v8i8>;
+def  VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>;
+def  VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>;
+def  VNEGs8q  : VNEGQ<0b00, "vneg", "s8", v16i8>;
+def  VNEGs16q : VNEGQ<0b01, "vneg", "s16", v8i16>;
+def  VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>;
+
+//   VNEG     : Vector Negate (floating-point)
+def  VNEGfd   : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+                    (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD,
+                    "vneg", "f32", "$Vd, $Vm", "",
+                    [(set DPR:$Vd, (v2f32 (fneg DPR:$Vm)))]>;
+def  VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
+                    (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ,
+                    "vneg", "f32", "$Vd, $Vm", "",
+                    [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>;
+
+def : Pat<(v8i8  (vnegd  DPR:$src)), (VNEGs8d DPR:$src)>;
+def : Pat<(v4i16 (vnegd  DPR:$src)), (VNEGs16d DPR:$src)>;
+def : Pat<(v2i32 (vnegd  DPR:$src)), (VNEGs32d DPR:$src)>;
+def : Pat<(v16i8 (vnegq QPR:$src)), (VNEGs8q QPR:$src)>;
+def : Pat<(v8i16 (vnegq QPR:$src)), (VNEGs16q QPR:$src)>;
+def : Pat<(v4i32 (vnegq QPR:$src)), (VNEGs32q QPR:$src)>;
+
+//   VQNEG    : Vector Saturating Negate
+defm VQNEG    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0,
+                           IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s",
+                           int_arm_neon_vqneg>;
+
+// Vector Bit Counting Operations.
+
+//   VCLS     : Vector Count Leading Sign Bits
+defm VCLS     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0,
+                           IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s",
+                           int_arm_neon_vcls>;
+//   VCLZ     : Vector Count Leading Zeros
+defm VCLZ     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0,
+                           IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
+                           ctlz>;
+//   VCNT     : Vector Count One Bits
+def  VCNTd    : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+                        IIC_VCNTiD, "vcnt", "8",
+                        v8i8, v8i8, ctpop>;
+def  VCNTq    : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+                        IIC_VCNTiQ, "vcnt", "8",
+                        v16i8, v16i8, ctpop>;
+
+// Vector Swap
+def  VSWPd    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 0, 0,
+                     (outs DPR:$Vd, DPR:$Vm), (ins DPR:$in1, DPR:$in2),
+                     NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm",
+                     []>;
+def  VSWPq    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0,
+                     (outs QPR:$Vd, QPR:$Vm), (ins QPR:$in1, QPR:$in2),
+                     NoItinerary, "vswp", "$Vd, $Vm", "$in1 = $Vd, $in2 = $Vm",
+                     []>;
+
+// Vector Move Operations.
+
+//   VMOV     : Vector Move (Register)
+def : InstAlias<"vmov${p} $Vd, $Vm",
+                (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+def : InstAlias<"vmov${p} $Vd, $Vm",
+                (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+//   VMOV     : Vector Move (Immediate)
+
+let isReMaterializable = 1 in {
+def VMOVv8i8  : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+                         "vmov", "i8", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v8i8 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI8:$SIMM), IIC_VMOVImm,
+                         "vmov", "i8", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v16i8 (NEONvmovImm timm:$SIMM)))]>;
+
+def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v4i16 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{9} = SIMM{9};
+}
+
+def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,0}, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI16:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v8i16 (NEONvmovImm timm:$SIMM)))]> {
+ let Inst{9} = SIMM{9};
+}
+
+def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVI32:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4i32 (NEONvmovImm timm:$SIMM)))]> {
+  let Inst{11-8} = SIMM{11-8};
+}
+
+def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$Vd),
+                         (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v1i64 (NEONvmovImm timm:$SIMM)))]>;
+def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd),
+                         (ins nImmSplatI64:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>;
+
+def VMOVv2f32 : N1ModImm<1, 0b000, 0b1111, 0, 0, 0, 1, (outs DPR:$Vd),
+                         (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+                         "vmov", "f32", "$Vd, $SIMM", "",
+                         [(set DPR:$Vd, (v2f32 (NEONvmovFPImm timm:$SIMM)))]>;
+def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
+                         (ins nImmVMOVF32:$SIMM), IIC_VMOVImm,
+                         "vmov", "f32", "$Vd, $SIMM", "",
+                         [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>;
+} // isReMaterializable
+
+//   VMOV     : Vector Get Lane (move scalar to ARM core register)
+
+def VGETLNs8  : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+                          IIC_VMOVSI, "vmov", "s8", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+                          IIC_VMOVSI, "vmov", "s16", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VGETLNu8  : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
+                          IIC_VMOVSI, "vmov", "u8", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
+                          IIC_VMOVSI, "vmov", "u16", "$R, $V$lane",
+                          [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V),
+                                           imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
+                          (outs GPR:$R), (ins DPR:$V, VectorIndex32:$lane),
+                          IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
+                          [(set GPR:$R, (extractelt (v2i32 DPR:$V),
+                                           imm:$lane))]>,
+                Requires<[HasNEON, HasFastVGETLNi32]> {
+  let Inst{21} = lane{0};
+}
+// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
+def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
+          (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                     (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
+          (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
+          (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                     (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
+          (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i32_reg imm:$lane))),
+                     (SubReg_i32_lane imm:$lane))>,
+      Requires<[HasNEON, HasFastVGETLNi32]>;
+def : Pat<(extractelt (v2i32 DPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (COPY_TO_REGCLASS
+            (i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+      Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1),QPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
+//          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+
+
+//   VMOV     : Vector Set Lane (move ARM core register to scalar)
+
+let Constraints = "$src1 = $V" in {
+def VSETLNi8  : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex8:$lane),
+                          IIC_VMOVISL, "vmov", "8", "$V$lane, $R",
+                          [(set DPR:$V, (vector_insert (v8i8 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21}  = lane{2};
+  let Inst{6-5} = lane{1-0};
+}
+def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex16:$lane),
+                          IIC_VMOVISL, "vmov", "16", "$V$lane, $R",
+                          [(set DPR:$V, (vector_insert (v4i16 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21} = lane{1};
+  let Inst{6}  = lane{0};
+}
+def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
+                          (ins DPR:$src1, GPR:$R, VectorIndex32:$lane),
+                          IIC_VMOVISL, "vmov", "32", "$V$lane, $R",
+                          [(set DPR:$V, (insertelt (v2i32 DPR:$src1),
+                                           GPR:$R, imm:$lane))]> {
+  let Inst{21} = lane{0};
+}
+}
+def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
+          (v16i8 (INSERT_SUBREG QPR:$src1,
+                  (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
+                                   (DSubReg_i8_reg imm:$lane))),
+                            GPR:$src2, (SubReg_i8_lane imm:$lane))),
+                  (DSubReg_i8_reg imm:$lane)))>;
+def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
+          (v8i16 (INSERT_SUBREG QPR:$src1,
+                  (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+                                     (DSubReg_i16_reg imm:$lane))),
+                             GPR:$src2, (SubReg_i16_lane imm:$lane))),
+                  (DSubReg_i16_reg imm:$lane)))>;
+def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
+          (v4i32 (INSERT_SUBREG QPR:$src1,
+                  (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
+                                     (DSubReg_i32_reg imm:$lane))),
+                             GPR:$src2, (SubReg_i32_lane imm:$lane))),
+                  (DSubReg_i32_reg imm:$lane)))>;
+
+def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)),
+                                SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
+                                SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+
+//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+//          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+
+def : Pat<(v2f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
+def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+
+def : Pat<(v8i8 (scalar_to_vector GPR:$src)),
+          (VSETLNi8  (v8i8  (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v4i16 (scalar_to_vector GPR:$src)),
+          (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v2i32 (scalar_to_vector GPR:$src)),
+          (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+
+def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                         (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         dsub_0)>;
+def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+                         (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         dsub_0)>;
+def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                         (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         dsub_0)>;
+
+//   VDUP     : Vector Duplicate (from ARM core register to all elements)
+
+class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+  : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R),
+          IIC_VMOVIS, "vdup", Dt, "$V, $R",
+          [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+  : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R),
+          IIC_VMOVIS, "vdup", Dt, "$V, $R",
+          [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+
+def  VDUP8d   : VDUPD<0b11101100, 0b00, "8", v8i8>;
+def  VDUP16d  : VDUPD<0b11101000, 0b01, "16", v4i16>;
+def  VDUP32d  : VDUPD<0b11101000, 0b00, "32", v2i32>,
+                Requires<[HasNEON, HasFastVDUP32]>;
+def  VDUP8q   : VDUPQ<0b11101110, 0b00, "8", v16i8>;
+def  VDUP16q  : VDUPQ<0b11101010, 0b01, "16", v8i16>;
+def  VDUP32q  : VDUPQ<0b11101010, 0b00, "32", v4i32>;
+
+// NEONvdup patterns for uarchs with fast VDUP.32.
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
+      Requires<[HasNEON,HasFastVDUP32]>;
+def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
+
+// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
+def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
+      Requires<[HasNEON,HasSlowVDUP32]>;
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
+      Requires<[HasNEON,HasSlowVDUP32]>;
+
+//   VDUP     : Vector Duplicate Lane (from scalar to all elements)
+
+class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
+              ValueType Ty, Operand IdxTy>
+  : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+              IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane",
+              [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>;
+
+class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, Operand IdxTy>
+  : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+              IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane",
+              [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm),
+                                      VectorIndex32:$lane)))]>;
+
+// Inst{19-16} is partially specified depending on the element size.
+
+def VDUPLN8d  : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8, VectorIndex8> {
+  bits<3> lane;
+  let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16, VectorIndex16> {
+  bits<2> lane;
+  let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32, VectorIndex32> {
+  bits<1> lane;
+  let Inst{19} = lane{0};
+}
+def VDUPLN8q  : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8, VectorIndex8> {
+  bits<3> lane;
+  let Inst{19-17} = lane{2-0};
+}
+def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16, VectorIndex16> {
+  bits<2> lane;
+  let Inst{19-18} = lane{1-0};
+}
+def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> {
+  bits<1> lane;
+  let Inst{19} = lane{0};
+}
+
+def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+          (VDUPLN32d DPR:$Vm, imm:$lane)>;
+
+def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+          (VDUPLN32q DPR:$Vm, imm:$lane)>;
+
+def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+          (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
+                                  (DSubReg_i8_reg imm:$lane))),
+                           (SubReg_i8_lane imm:$lane)))>;
+def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+          (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i16_reg imm:$lane))),
+                            (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+          (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i32_reg imm:$lane))),
+                            (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+          (v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+def  VDUPfdf : PseudoNeonI<(outs DPR:$dst), (ins SPR:$src), IIC_VMOVD, "",
+                    [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>;
+def  VDUPfqf : PseudoNeonI<(outs QPR:$dst), (ins SPR:$src), IIC_VMOVD, "",
+                    [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>;
+
+//   VMOVN    : Vector Narrowing Move
+defm VMOVN    : N2VN_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVN,
+                         "vmovn", "i", trunc>;
+//   VQMOVN   : Vector Saturating Narrowing Move
+defm VQMOVNs  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD,
+                            "vqmovn", "s", int_arm_neon_vqmovns>;
+defm VQMOVNu  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD,
+                            "vqmovn", "u", int_arm_neon_vqmovnu>;
+defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD,
+                            "vqmovun", "s", int_arm_neon_vqmovnsu>;
+//   VMOVL    : Vector Lengthening Move
+defm VMOVLs   : N2VL_QHS<0b01,0b10100,0,1, "vmovl", "s", sext>;
+defm VMOVLu   : N2VL_QHS<0b11,0b10100,0,1, "vmovl", "u", zext>;
+def : Pat<(v8i16 (anyext (v8i8 DPR:$Vm))), (VMOVLuv8i16 DPR:$Vm)>;
+def : Pat<(v4i32 (anyext (v4i16 DPR:$Vm))), (VMOVLuv4i32 DPR:$Vm)>;
+def : Pat<(v2i64 (anyext (v2i32 DPR:$Vm))), (VMOVLuv2i64 DPR:$Vm)>;
+
+// Vector Conversions.
+
+//   VCVT     : Vector Convert Between Floating-Point and Integers
+def  VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+                     v2i32, v2f32, fp_to_sint>;
+def  VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+                     v2i32, v2f32, fp_to_uint>;
+def  VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+                     v2f32, v2i32, sint_to_fp>;
+def  VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+                     v2f32, v2i32, uint_to_fp>;
+
+def  VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+                     v4i32, v4f32, fp_to_sint>;
+def  VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+                     v4i32, v4f32, fp_to_uint>;
+def  VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+                     v4f32, v4i32, sint_to_fp>;
+def  VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+                     v4f32, v4i32, uint_to_fp>;
+
+//   VCVT     : Vector Convert Between Floating-Point and Fixed-Point.
+let DecoderMethod = "DecodeVCVTD" in {
+def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+                        v2i32, v2f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+                        v2i32, v2f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+                        v2f32, v2i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+                        v2f32, v2i32, int_arm_neon_vcvtfxu2fp>;
+}
+
+let DecoderMethod = "DecodeVCVTQ" in {
+def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+                        v4i32, v4f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+                        v4i32, v4f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+                        v4f32, v4i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+                        v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
+}
+
+//   VCVT     : Vector Convert Between Half-Precision and Single-Precision.
+def  VCVTf2h  : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0,
+                        IIC_VUNAQ, "vcvt", "f16.f32",
+                        v4i16, v4f32, int_arm_neon_vcvtfp2hf>,
+                Requires<[HasNEON, HasFP16]>;
+def  VCVTh2f  : N2VLInt<0b11, 0b11, 0b01, 0b10, 0b01110, 0, 0,
+                        IIC_VUNAQ, "vcvt", "f32.f16",
+                        v4f32, v4i16, int_arm_neon_vcvthf2fp>,
+                Requires<[HasNEON, HasFP16]>;
+
+// Vector Reverse.
+
+//   VREV64   : Vector Reverse elements within 64-bit doublewords
+
+class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>;
+
+def VREV64d8  : VREV64D<0b00, "vrev64", "8", v8i8>;
+def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>;
+def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>;
+def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>;
+
+def VREV64q8  : VREV64Q<0b00, "vrev64", "8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>;
+def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
+
+//   VREV32   : Vector Reverse elements within 32-bit words
+
+class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>;
+
+def VREV32d8  : VREV32D<0b00, "vrev32", "8", v8i8>;
+def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>;
+
+def VREV32q8  : VREV32Q<0b00, "vrev32", "8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>;
+
+//   VREV16   : Vector Reverse elements within 16-bit halfwords
+
+class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd),
+        (ins DPR:$Vm), IIC_VMOVD,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd),
+        (ins QPR:$Vm), IIC_VMOVQ,
+        OpcodeStr, Dt, "$Vd, $Vm", "",
+        [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>;
+
+def VREV16d8  : VREV16D<0b00, "vrev16", "8", v8i8>;
+def VREV16q8  : VREV16Q<0b00, "vrev16", "8", v16i8>;
+
+// Other Vector Shuffles.
+
+//  Aligned extractions: really just dropping registers
+
+class AlignedVEXTq<ValueType DestTy, ValueType SrcTy, SDNodeXForm LaneCVT>
+      : Pat<(DestTy (vector_extract_subvec (SrcTy QPR:$src), (i32 imm:$start))),
+             (EXTRACT_SUBREG (SrcTy QPR:$src), (LaneCVT imm:$start))>;
+
+def : AlignedVEXTq<v8i8, v16i8, DSubReg_i8_reg>;
+
+def : AlignedVEXTq<v4i16, v8i16, DSubReg_i16_reg>;
+
+def : AlignedVEXTq<v2i32, v4i32, DSubReg_i32_reg>;
+
+def : AlignedVEXTq<v1i64, v2i64, DSubReg_f64_reg>;
+
+def : AlignedVEXTq<v2f32, v4f32, DSubReg_i32_reg>;
+
+
+//   VEXT     : Vector Extract
+
+
+// All of these have a two-operand InstAlias.
+let TwoOperandAliasConstraint = "$Vn = $Vd" in {
+class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
+  : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$Vd),
+        (ins DPR:$Vn, DPR:$Vm, immTy:$index), NVExtFrm,
+        IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+        [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn),
+                                     (Ty DPR:$Vm), imm:$index)))]> {
+  bits<4> index;
+  let Inst{11-8} = index{3-0};
+}
+
+class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
+  : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$Vd),
+        (ins QPR:$Vn, QPR:$Vm, imm0_15:$index), NVExtFrm,
+        IIC_VEXTQ, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
+        [(set QPR:$Vd, (Ty (NEONvext (Ty QPR:$Vn),
+                                     (Ty QPR:$Vm), imm:$index)))]> {
+  bits<4> index;
+  let Inst{11-8} = index{3-0};
+}
+}
+
+def VEXTd8  : VEXTd<"vext", "8",  v8i8, imm0_7> {
+  let Inst{11-8} = index{3-0};
+}
+def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
+  let Inst{11-9} = index{2-0};
+  let Inst{8}    = 0b0;
+}
+def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
+  let Inst{11-10} = index{1-0};
+  let Inst{9-8}    = 0b00;
+}
+def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn),
+                           (v2f32 DPR:$Vm),
+                           (i32 imm:$index))),
+          (VEXTd32 DPR:$Vn, DPR:$Vm, imm:$index)>;
+
+def VEXTq8  : VEXTq<"vext", "8",  v16i8, imm0_15> {
+  let Inst{11-8} = index{3-0};
+}
+def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> {
+  let Inst{11-9} = index{2-0};
+  let Inst{8}    = 0b0;
+}
+def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> {
+  let Inst{11-10} = index{1-0};
+  let Inst{9-8}    = 0b00;
+}
+def VEXTq64 : VEXTq<"vext", "64", v2i64, imm0_1> {
+  let Inst{11} = index{0};
+  let Inst{10-8}    = 0b000;
+}
+def : Pat<(v4f32 (NEONvext (v4f32 QPR:$Vn),
+                           (v4f32 QPR:$Vm),
+                           (i32 imm:$index))),
+          (VEXTq32 QPR:$Vn, QPR:$Vm, imm:$index)>;
+
+//   VTRN     : Vector Transpose
+
+def  VTRNd8   : N2VDShuffle<0b00, 0b00001, "vtrn", "8">;
+def  VTRNd16  : N2VDShuffle<0b01, 0b00001, "vtrn", "16">;
+def  VTRNd32  : N2VDShuffle<0b10, 0b00001, "vtrn", "32">;
+
+def  VTRNq8   : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn", "8">;
+def  VTRNq16  : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn", "16">;
+def  VTRNq32  : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn", "32">;
+
+//   VUZP     : Vector Unzip (Deinterleave)
+
+def  VUZPd8   : N2VDShuffle<0b00, 0b00010, "vuzp", "8">;
+def  VUZPd16  : N2VDShuffle<0b01, 0b00010, "vuzp", "16">;
+// vuzp.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+def : NEONInstAlias<"vuzp${p}.32 $Dd, $Dm",
+                    (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def  VUZPq8   : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp", "8">;
+def  VUZPq16  : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp", "16">;
+def  VUZPq32  : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp", "32">;
+
+//   VZIP     : Vector Zip (Interleave)
+
+def  VZIPd8   : N2VDShuffle<0b00, 0b00011, "vzip", "8">;
+def  VZIPd16  : N2VDShuffle<0b01, 0b00011, "vzip", "16">;
+// vzip.32 Dd, Dm is a pseudo-instruction expanded to vtrn.32 Dd, Dm.
+def : NEONInstAlias<"vzip${p}.32 $Dd, $Dm",
+                    (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def  VZIPq8   : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip", "8">;
+def  VZIPq16  : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip", "16">;
+def  VZIPq32  : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">;
+
+// Vector Table Lookup and Table Extension.
+
+//   VTBL     : Vector Table Lookup
+let DecoderMethod = "DecodeTBLInstruction" in {
+def  VTBL1
+  : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd),
+        (ins VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "",
+        [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 VecListOneD:$Vn, DPR:$Vm)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def  VTBL2
+  : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd),
+        (ins VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB2,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+def  VTBL3
+  : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$Vd),
+        (ins VecListThreeD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB3,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+def  VTBL4
+  : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$Vd),
+        (ins VecListFourD:$Vn, DPR:$Vm),
+        NVTBLFrm, IIC_VTB4,
+        "vtbl", "8", "$Vd, $Vn, $Vm", "", []>;
+} // hasExtraSrcRegAllocReq = 1
+
+def  VTBL3Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB3, "", []>;
+def  VTBL4Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins QQPR:$tbl, DPR:$src), IIC_VTB4, "", []>;
+
+//   VTBX     : Vector Table Extension
+def  VTBX1
+  : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX1,
+        "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd",
+        [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbx1
+                               DPR:$orig, VecListOneD:$Vn, DPR:$Vm)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def  VTBX2
+  : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListDPair:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX2,
+        "vtbx", "8", "$Vd, $Vn, $Vm", "$orig = $Vd", []>;
+def  VTBX3
+  : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListThreeD:$Vn, DPR:$Vm),
+        NVTBLFrm, IIC_VTBX3,
+        "vtbx", "8", "$Vd, $Vn, $Vm",
+        "$orig = $Vd", []>;
+def  VTBX4
+  : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$Vd),
+        (ins DPR:$orig, VecListFourD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTBX4,
+        "vtbx", "8", "$Vd, $Vn, $Vm",
+        "$orig = $Vd", []>;
+} // hasExtraSrcRegAllocReq = 1
+
+def  VTBX3Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+                IIC_VTBX3, "$orig = $dst", []>;
+def  VTBX4Pseudo
+  : PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
+                IIC_VTBX4, "$orig = $dst", []>;
+} // DecoderMethod = "DecodeTBLInstruction"
+
+//===----------------------------------------------------------------------===//
+// NEON instructions for single-precision FP math
+//===----------------------------------------------------------------------===//
+
+class N2VSPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$a)),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+class N3VSPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+class N3VSMulOpPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
+              (EXTRACT_SUBREG
+               (v2f32 (COPY_TO_REGCLASS (Inst
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$acc, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$a, ssub_0),
+                (INSERT_SUBREG
+                 (v2f32 (COPY_TO_REGCLASS (v2f32 (IMPLICIT_DEF)), DPR_VFP2)),
+                 SPR:$b, ssub_0)), DPR_VFP2)), ssub_0)>;
+
+def : N3VSPat<fadd, VADDfd>;
+def : N3VSPat<fsub, VSUBfd>;
+def : N3VSPat<fmul, VMULfd>;
+def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
+      Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
+def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
+      Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
+def : N2VSPat<fabs, VABSfd>;
+def : N2VSPat<fneg, VNEGfd>;
+def : N3VSPat<NEONfmax, VMAXfd>;
+def : N3VSPat<NEONfmin, VMINfd>;
+def : N2VSPat<arm_ftosi, VCVTf2sd>;
+def : N2VSPat<arm_ftoui, VCVTf2ud>;
+def : N2VSPat<arm_sitof, VCVTs2fd>;
+def : N2VSPat<arm_uitof, VCVTu2fd>;
+
+// Prefer VMOVDRR for i32 -> f32 bitcasts, it can write all DPR registers.
+def : Pat<(f32 (bitconvert GPR:$a)),
+          (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
+        Requires<[HasNEON, DontUseVMOVSR]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// bit_convert
+def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8  DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (f64   DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (f64   DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (f64   DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (v1i64 DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (v2i32 DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (v4i16 DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (f64   DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (v2f32 DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(f64   (bitconvert (v1i64 DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(f64   (bitconvert (v2i32 DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(f64   (bitconvert (v4i16 DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(f64   (bitconvert (v8i8  DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(f64   (bitconvert (v2f32 DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (f64   DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8  DPR:$src))), (v2f32 DPR:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
+
+// Fold extracting an element out of a v2i32 into a vfp register.
+def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))),
+          (f32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane)))>;
+
+// Vector lengthening move with load, matching extending loads.
+
+// extload, zextload and sextload for a standard lengthening load. Example:
+// Lengthen_Single<"8", "i16", "8"> = 
+//     Pat<(v8i16 (extloadvi8 addrmode6:$addr))
+//         (VMOVLuv8i16 (VLD1d8 addrmode6:$addr,
+//                              (f64 (IMPLICIT_DEF)), (i32 0)))>;
+multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> {
+  let AddedComplexity = 10 in {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                    (!cast<PatFrag>("extloadvi" # SrcTy) addrmode6:$addr)),
+                  (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
+  def _Z : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                  (!cast<PatFrag>("zextloadvi" # SrcTy) addrmode6:$addr)),
+                (!cast<Instruction>("VMOVLuv" # DestLanes # DestTy)
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+
+  def _S : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                  (!cast<PatFrag>("sextloadvi" # SrcTy) addrmode6:$addr)),
+                (!cast<Instruction>("VMOVLsv" # DestLanes # DestTy)
+                    (!cast<Instruction>("VLD1d" # SrcTy) addrmode6:$addr))>;
+  }
+}
+
+// extload, zextload and sextload for a lengthening load which only uses
+// half the lanes available. Example:
+// Lengthen_HalfSingle<"4", "i16", "8", "i16", "i8"> =
+//     Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)),
+//         (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, 
+//                                      (f64 (IMPLICIT_DEF)), (i32 0))),
+//                         dsub_0)>;
+multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy,
+                               string InsnLanes, string InsnTy> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+         dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # InsnLanes # InsnTy)
+         (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+         dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+       (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # InsnLanes # InsnTy)
+         (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+         dsub_0)>;
+}
+
+// extload, zextload and sextload for a lengthening load followed by another
+// lengthening load, to quadruple the initial length.
+//
+// Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32"> =
+//     Pat<(v4i32 (extloadvi8 addrmode6oneL32:$addr))
+//         (EXTRACT_SUBREG (VMOVLuv4i32
+//           (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
+//                                                   (f64 (IMPLICIT_DEF)),
+//                                                   (i32 0))),
+//                           dsub_0)),
+//           dsub_0)>;
+multiclass Lengthen_Double<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0))>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0))>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6oneL32:$addr)),
+         (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd32 addrmode6oneL32:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0))>;
+}
+
+// extload, zextload and sextload for a lengthening load followed by another
+// lengthening load, to quadruple the initial length, but which ends up only
+// requiring half the available lanes (a 64-bit outcome instead of a 128-bit).
+//
+// Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32"> =
+// Pat<(v2i32 (extloadvi8 addrmode6:$addr))
+//     (EXTRACT_SUBREG (VMOVLuv4i32
+//       (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd16 addrmode6:$addr,
+//                                               (f64 (IMPLICIT_DEF)), (i32 0))),
+//                       dsub_0)),
+//       dsub_0)>;
+multiclass Lengthen_HalfDouble<string DestLanes, string DestTy, string SrcTy,
+                           string Insn1Lanes, string Insn1Ty, string Insn2Lanes,
+                           string Insn2Ty> {
+  def _Any : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("extloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0)),
+          dsub_0)>;
+  def _Z   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("zextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLuv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0)),
+          dsub_0)>;
+  def _S   : Pat<(!cast<ValueType>("v" # DestLanes # DestTy)
+                   (!cast<PatFrag>("sextloadv" # SrcTy) addrmode6:$addr)),
+         (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn2Lanes # Insn2Ty)
+           (EXTRACT_SUBREG (!cast<Instruction>("VMOVLsv" # Insn1Lanes # Insn1Ty)
+             (VLD1LNd16 addrmode6:$addr, (f64 (IMPLICIT_DEF)), (i32 0))),
+             dsub_0)),
+          dsub_0)>;
+}
+
+defm : Lengthen_Single<"8", "i16", "8">; // v8i8 -> v8i16
+defm : Lengthen_Single<"4", "i32", "16">; // v4i16 -> v4i32
+defm : Lengthen_Single<"2", "i64", "32">; // v2i32 -> v2i64
+
+defm : Lengthen_HalfSingle<"4", "i16", "i8", "8", "i16">; // v4i8 -> v4i16
+defm : Lengthen_HalfSingle<"2", "i32", "i16", "4", "i32">; // v2i16 -> v2i32
+
+// Double lengthening - v4i8 -> v4i16 -> v4i32
+defm : Lengthen_Double<"4", "i32", "i8", "8", "i16", "4", "i32">;
+// v2i8 -> v2i16 -> v2i32
+defm : Lengthen_HalfDouble<"2", "i32", "i8", "8", "i16", "4", "i32">;
+// v2i16 -> v2i32 -> v2i64
+defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
+
+// Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64
+def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
+      (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+         (VLD1LNd16 addrmode6:$addr, 
+                    (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
+      (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
+         (VLD1LNd16 addrmode6:$addr,
+                    (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+def : Pat<(v2i64 (sextloadvi8 addrmode6:$addr)),
+      (VMOVLsv2i64 (EXTRACT_SUBREG (VMOVLsv4i32 (EXTRACT_SUBREG (VMOVLsv8i16
+         (VLD1LNd16 addrmode6:$addr,
+                    (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+def : VFP2InstAlias<"fmdhr${p} $Dd, $Rn",
+                    (VSETLNi32 DPR:$Dd, GPR:$Rn, 1, pred:$p)>;
+def : VFP2InstAlias<"fmdlr${p} $Dd, $Rn",
+                    (VSETLNi32 DPR:$Dd, GPR:$Rn, 0, pred:$p)>;
+
+// VAND/VBIC/VEOR/VORR accept but do not require a type suffix.
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm",
+                         (VANDd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vd, $Vn, $Vm",
+                         (VANDq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm",
+                         (VBICd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbic${p}", "$Vd, $Vn, $Vm",
+                         (VBICq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm",
+                         (VEORd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vd, $Vn, $Vm",
+                         (VEORq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
+                         (VORRd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vd, $Vn, $Vm",
+                         (VORRq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+// ... two-operand aliases
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
+                         (VANDd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vand${p}", "$Vdn, $Vm",
+                         (VANDq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm",
+                         (VEORd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"veor${p}", "$Vdn, $Vm",
+                         (VEORq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
+                         (VORRd DPR:$Vdn, DPR:$Vdn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
+                         (VORRq QPR:$Vdn, QPR:$Vdn, QPR:$Vm, pred:$p)>;
+
+// VLD1 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr",
+                 (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr",
+                 (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+
+def VLD1LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr!",
+                 (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD1LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr!",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD1LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr!",
+                 (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD1LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListOneDByteIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD1LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListOneDHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD1LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListOneDWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST1 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr",
+                 (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr",
+                 (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+
+def VST1LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr!",
+                 (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST1LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr!",
+                 (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST1LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr!",
+                 (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST1LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListOneDByteIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST1LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListOneDHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST1LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListOneDWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD2 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+
+def VLD2LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr!",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD2LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD2LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD2LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD2LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD2LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListTwoDByteIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoDWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD2LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoQWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST2 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+
+def VST2LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr!",
+                 (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST2LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST2LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST2LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST2LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
+                 (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST2LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListTwoDByteIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
+                  (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoDWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
+                  (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST2LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListTwoQWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD3 all-lanes pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+
+def VLD3DUPdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeDAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeDAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeDAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeQAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VLD3 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+
+def VLD3LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD3LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD3LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD3LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD3LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD3LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeDByteIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeDWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD3 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VLD3dAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VLD3dAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VLD3dAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VLD3qAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD3qAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD3qAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+
+def VLD3dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VLD3dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VLD3dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD3qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD3dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD3qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VST3 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+               (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+
+def VST3LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST3LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST3LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST3LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST3LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST3LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeDByteIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeDWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST3 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VST3dAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VST3dAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VST3dAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VST3qAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VST3qAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VST3qAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+
+def VST3dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VST3dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VST3dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VST3qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
+               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+def VST3dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST3qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VLD4 all-lanes pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+
+def VLD4DUPdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD4DUPdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourDAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourDAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourDAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourQAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4DUPqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQAllLanes:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VLD4 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+
+def VLD4LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD4LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD4LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD4LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD4LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VLD4LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourDByteIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourDHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourDWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+
+// VLD4 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VLD4dAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VLD4dAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VLD4dAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VLD4qAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD4qAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD4qAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+
+def VLD4dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VLD4dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VLD4dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VLD4qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD4qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD4qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VLD4dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VLD4qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VST4 single-lane pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VST4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+
+def VST4LNdWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST4LNdWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST4LNdWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST4LNqWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST4LNqWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+def VST4LNdWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourDByteIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNdWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourDHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNdWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourDWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNqWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQHWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4LNqWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQWordIndexed:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+
+// VST4 multiple structure pseudo-instructions. These need special handling for
+// the vector operands that the normal instructions don't yet model.
+// FIXME: Remove these when the register classes and instructions are updated.
+def VST4dAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VST4dAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VST4dAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VST4qAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VST4qAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VST4qAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+
+def VST4dWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VST4dWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VST4dWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+def VST4qWB_fixed_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VST4qWB_fixed_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VST4qWB_fixed_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
+               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+def VST4dWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4dWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourD:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_8 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_16 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+def VST4qWB_register_Asm_32 :
+        NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
+                  (ins VecListFourQ:$list, addrmode6:$addr,
+                       rGPR:$Rm, pred:$p)>;
+
+// VMOV takes an optional datatype suffix
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+                         (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
+                         (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+
+// VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
+// D-register versions.
+def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm",
+                    (VCGEsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s16 $Dd, $Dn, $Dm",
+                    (VCGEsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s32 $Dd, $Dn, $Dm",
+                    (VCGEsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u8 $Dd, $Dn, $Dm",
+                    (VCGEuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u16 $Dd, $Dn, $Dm",
+                    (VCGEuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm",
+                    (VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm",
+                    (VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+// Q-register versions.
+def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm",
+                    (VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s16 $Qd, $Qn, $Qm",
+                    (VCGEsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.s32 $Qd, $Qn, $Qm",
+                    (VCGEsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u8 $Qd, $Qn, $Qm",
+                    (VCGEuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u16 $Qd, $Qn, $Qm",
+                    (VCGEuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm",
+                    (VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm",
+                    (VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+
+// VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
+// D-register versions.
+def : NEONInstAlias<"vclt${p}.s8 $Dd, $Dn, $Dm",
+                    (VCGTsv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s16 $Dd, $Dn, $Dm",
+                    (VCGTsv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s32 $Dd, $Dn, $Dm",
+                    (VCGTsv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u8 $Dd, $Dn, $Dm",
+                    (VCGTuv8i8 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u16 $Dd, $Dn, $Dm",
+                    (VCGTuv4i16 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm",
+                    (VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm",
+                    (VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>;
+// Q-register versions.
+def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm",
+                    (VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s16 $Qd, $Qn, $Qm",
+                    (VCGTsv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.s32 $Qd, $Qn, $Qm",
+                    (VCGTsv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u8 $Qd, $Qn, $Qm",
+                    (VCGTuv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u16 $Qd, $Qn, $Qm",
+                    (VCGTuv8i16 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm",
+                    (VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm",
+                    (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>;
+
+// VSWP allows, but does not require, a type suffix.
+defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
+                         (VSWPd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm",
+                         (VSWPq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
+// VBIF, VBIT, and VBSL allow, but do not require, a type suffix.
+defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm",
+                         (VBIFd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm",
+                         (VBITd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm",
+                         (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbif${p}", "$Vd, $Vn, $Vm",
+                         (VBIFq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbit${p}", "$Vd, $Vn, $Vm",
+                         (VBITq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vbsl${p}", "$Vd, $Vn, $Vm",
+                         (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm, pred:$p)>;
+
+// "vmov Rd, #-imm" can be handled via "vmvn".
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm",
+                    (VMVNv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $imm",
+                    (VMVNv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm",
+                    (VMOVv2i32 DPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $imm",
+                    (VMOVv4i32 QPR:$Vd, nImmVMOVI32Neg:$imm, pred:$p)>;
+
+// 'gas' compatibility aliases for quad-word instructions. Strictly speaking,
+// these should restrict to just the Q register variants, but the register
+// classes are enough to match correctly regardless, so we keep it simple
+// and just use MnemonicAlias.
+def : NEONMnemonicAlias<"vbicq", "vbic">;
+def : NEONMnemonicAlias<"vandq", "vand">;
+def : NEONMnemonicAlias<"veorq", "veor">;
+def : NEONMnemonicAlias<"vorrq", "vorr">;
+
+def : NEONMnemonicAlias<"vmovq", "vmov">;
+def : NEONMnemonicAlias<"vmvnq", "vmvn">;
+// Explicit versions for floating point so that the FPImm variants get
+// handled early. The parser gets confused otherwise.
+def : NEONMnemonicAlias<"vmovq.f32", "vmov.f32">;
+def : NEONMnemonicAlias<"vmovq.f64", "vmov.f64">;
+
+def : NEONMnemonicAlias<"vaddq", "vadd">;
+def : NEONMnemonicAlias<"vsubq", "vsub">;
+
+def : NEONMnemonicAlias<"vminq", "vmin">;
+def : NEONMnemonicAlias<"vmaxq", "vmax">;
+
+def : NEONMnemonicAlias<"vmulq", "vmul">;
+
+def : NEONMnemonicAlias<"vabsq", "vabs">;
+
+def : NEONMnemonicAlias<"vshlq", "vshl">;
+def : NEONMnemonicAlias<"vshrq", "vshr">;
+
+def : NEONMnemonicAlias<"vcvtq", "vcvt">;
+
+def : NEONMnemonicAlias<"vcleq", "vcle">;
+def : NEONMnemonicAlias<"vceqq", "vceq">;
+
+def : NEONMnemonicAlias<"vzipq", "vzip">;
+def : NEONMnemonicAlias<"vswpq", "vswp">;
+
+def : NEONMnemonicAlias<"vrecpeq.f32", "vrecpe.f32">;
+def : NEONMnemonicAlias<"vrecpeq.u32", "vrecpe.u32">;
+
+
+// Alias for loading floating point immediates that aren't representable
+// using the vmov.f32 encoding but the bitpattern is representable using
+// the .i32 encoding.
+def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
+                     (VMOVv4i32 QPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
+                     (VMOVv2i32 DPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
new file mode 100644
index 000000000000..ae7a5c00bd74
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -0,0 +1,1417 @@
+//===-- ARMInstrThumb.td - Thumb support for ARM -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Thumb specific DAG Nodes.
+//
+
+def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                       SDNPVariadic]>;
+
+def imm_sr_XFORM: SDNodeXForm<imm, [{
+  unsigned Imm = N->getZExtValue();
+  return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), MVT::i32);
+}]>;
+def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; }
+def imm_sr : Operand<i32>, PatLeaf<(imm), [{
+  uint64_t Imm = N->getZExtValue();
+  return Imm > 0 && Imm <= 32;
+}], imm_sr_XFORM> {
+  let PrintMethod = "printThumbSRImm";
+  let ParserMatchClass = ThumbSRImmAsmOperand;
+}
+
+def imm_comp_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
+}]>;
+
+def imm0_7_neg : PatLeaf<(i32 imm), [{
+  return (uint32_t)-N->getZExtValue() < 8;
+}], imm_neg_XFORM>;
+
+def imm0_255_comp : PatLeaf<(i32 imm), [{
+  return ~((uint32_t)N->getZExtValue()) < 256;
+}]>;
+
+def imm8_255 : ImmLeaf<i32, [{
+  return Imm >= 8 && Imm < 256;
+}]>;
+def imm8_255_neg : PatLeaf<(i32 imm), [{
+  unsigned Val = -N->getZExtValue();
+  return Val >= 8 && Val < 256;
+}], imm_neg_XFORM>;
+
+// Break imm's up into two pieces: an immediate + a left shift. This uses
+// thumb_immshifted to match and thumb_immshifted_val and thumb_immshifted_shamt
+// to get the val/shift pieces.
+def thumb_immshifted : PatLeaf<(imm), [{
+  return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
+}]>;
+
+def thumb_immshifted_val : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def thumb_immshifted_shamt : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+// ADR instruction labels.
+def t_adrlabel : Operand<i32> {
+  let EncoderMethod = "getThumbAdrLabelOpValue";
+}
+
+// Scaled 4 immediate.
+def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
+def t_imm0_1020s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_1020s4_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; }
+def t_imm0_508s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+  let ParserMatchClass = t_imm0_508s4_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// Alias use only, so no printer is necessary.
+def t_imm0_508s4_neg_asmoperand: AsmOperandClass { let Name = "Imm0_508s4Neg"; }
+def t_imm0_508s4_neg : Operand<i32> {
+  let ParserMatchClass = t_imm0_508s4_neg_asmoperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// Define Thumb specific addressing modes.
+
+let OperandType = "OPERAND_PCREL" in {
+def t_brtarget : Operand<OtherVT> {
+  let EncoderMethod = "getThumbBRTargetOpValue";
+  let DecoderMethod = "DecodeThumbBROperand";
+}
+
+def t_bcctarget : Operand<i32> {
+  let EncoderMethod = "getThumbBCCTargetOpValue";
+  let DecoderMethod = "DecodeThumbBCCTargetOperand";
+}
+
+def t_cbtarget : Operand<i32> {
+  let EncoderMethod = "getThumbCBTargetOpValue";
+  let DecoderMethod = "DecodeThumbCmpBROperand";
+}
+
+def t_bltarget : Operand<i32> {
+  let EncoderMethod = "getThumbBLTargetOpValue";
+  let DecoderMethod = "DecodeThumbBLTargetOperand";
+}
+
+def t_blxtarget : Operand<i32> {
+  let EncoderMethod = "getThumbBLXTargetOpValue";
+  let DecoderMethod = "DecodeThumbBLXOffset";
+}
+}
+
+// t_addrmode_rr := reg + reg
+//
+def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; }
+def t_addrmode_rr : Operand<i32>,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_rrs := reg + reg
+//
+// We use separate scaled versions because the Select* functions need
+// to explicitly check for a matching constant and return false here so that
+// the reg+imm forms will match instead. This is a horrible way to do that,
+// as it forces tight coupling between the methods, but it's how selectiondag
+// currently works.
+def t_addrmode_rrs1 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+def t_addrmode_rrs2 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+def t_addrmode_rrs4 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> {
+  let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeRR";
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let ParserMatchClass = t_addrmode_rr_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_is4 := reg + imm5 * 4
+//
+def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; }
+def t_addrmode_is4 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
+  let PrintMethod = "printThumbAddrModeImm5S4Operand";
+  let ParserMatchClass = t_addrmode_is4_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_is2 := reg + imm5 * 2
+//
+def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; }
+def t_addrmode_is2 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
+  let PrintMethod = "printThumbAddrModeImm5S2Operand";
+  let ParserMatchClass = t_addrmode_is2_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_is1 := reg + imm5
+//
+def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; }
+def t_addrmode_is1 : Operand<i32>,
+                     ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> {
+  let EncoderMethod = "getAddrModeISOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeIS";
+  let PrintMethod = "printThumbAddrModeImm5S1Operand";
+  let ParserMatchClass = t_addrmode_is1_asm_operand;
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_sp := sp + imm8 * 4
+//
+// FIXME: This really shouldn't have an explicit SP operand at all. It should
+// be implicit, just like in the instruction encoding itself.
+def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; }
+def t_addrmode_sp : Operand<i32>,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
+  let EncoderMethod = "getAddrModeThumbSPOpValue";
+  let DecoderMethod = "DecodeThumbAddrModeSP";
+  let PrintMethod = "printThumbAddrModeSPOperand";
+  let ParserMatchClass = t_addrmode_sp_asm_operand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t_addrmode_pc := <label> => pc + imm8 * 4
+//
+def t_addrmode_pc : Operand<i32> {
+  let EncoderMethod = "getAddrModePCOpValue";
+  let DecoderMethod = "DecodeThumbAddrModePC";
+  let PrintMethod = "printThumbLdrLabelOperand";
+}
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+// FIXME: Marking these as hasSideEffects is necessary to prevent machine DCE
+// from removing one half of the matched pairs. That breaks PEI, which assumes
+// these will always be in pairs, and asserts if it finds otherwise. Better way?
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def tADJCALLSTACKUP :
+  PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
+             [(ARMcallseq_end imm:$amt1, imm:$amt2)]>,
+            Requires<[IsThumb, IsThumb1Only]>;
+
+def tADJCALLSTACKDOWN :
+  PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
+             [(ARMcallseq_start imm:$amt)]>,
+            Requires<[IsThumb, IsThumb1Only]>;
+}
+
+class T1SystemEncoding<bits<8> opc>
+  : T1Encoding<0b101111> {
+  let Inst{9-8} = 0b11;
+  let Inst{7-0} = opc;
+}
+
+def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>,
+           T1SystemEncoding<0x00>, // A8.6.110
+        Requires<[IsThumb2]>;
+
+def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>,
+           T1SystemEncoding<0x10>, // A8.6.410
+           Requires<[IsThumb2]>;
+
+def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>,
+           T1SystemEncoding<0x20>, // A8.6.408
+           Requires<[IsThumb2]>;
+
+def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>,
+           T1SystemEncoding<0x30>, // A8.6.409
+           Requires<[IsThumb2]>;
+
+def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>,
+           T1SystemEncoding<0x40>, // A8.6.157
+           Requires<[IsThumb2]>;
+
+// The imm operand $val can be used by a debugger to store more information
+// about the breakpoint.
+def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
+                []>,
+           T1Encoding<0b101111> {
+  let Inst{9-8} = 0b10;
+  // A8.6.22
+  bits<8> val;
+  let Inst{7-0} = val;
+}
+
+def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
+                  []>, T1Encoding<0b101101> {
+  bits<1> end;
+  // A8.6.156
+  let Inst{9-5} = 0b10010;
+  let Inst{4}   = 1;
+  let Inst{3}   = end;
+  let Inst{2-0} = 0b000;
+}
+
+// Change Processor State is a system instruction -- for disassembly only.
+def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
+                NoItinerary, "cps$imod $iflags", []>,
+           T1Misc<0b0110011> {
+  // A8.6.38 & B6.1.1
+  bit imod;
+  bits<3> iflags;
+
+  let Inst{4}   = imod;
+  let Inst{3}   = 0;
+  let Inst{2-0} = iflags;
+  let DecoderMethod = "DecodeThumbCPS";
+}
+
+// For both thumb1 and thumb2.
+let isNotDuplicable = 1, isCodeGenOnly = 1 in
+def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
+                  [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
+              T1Special<{0,0,?,?}> {
+  // A8.6.6
+  bits<3> dst;
+  let Inst{6-3} = 0b1111; // Rm = pc
+  let Inst{2-0} = dst;
+}
+
+// ADD <Rd>, sp, #<imm8>
+// FIXME: This should not be marked as having side effects, and it should be
+// rematerializable. Clearing the side effect bit causes miscompilations,
+// probably because the instruction can be moved around.
+def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
+                    IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
+               T1Encoding<{1,0,1,0,1,?}> {
+  // A6.2 & A8.6.8
+  bits<3> dst;
+  bits<8> imm;
+  let Inst{10-8} = dst;
+  let Inst{7-0}  = imm;
+  let DecoderMethod = "DecodeThumbAddSpecialReg";
+}
+
+// ADD sp, sp, #<imm7>
+def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+                     IIC_iALUi, "add", "\t$Rdn, $imm", []>,
+              T1Misc<{0,0,0,0,0,?,?}> {
+  // A6.2.5 & A8.6.8
+  bits<7> imm;
+  let Inst{6-0} = imm;
+  let DecoderMethod = "DecodeThumbAddSPImm";
+}
+
+// SUB sp, sp, #<imm7>
+// FIXME: The encoding and the ASM string don't match up.
+def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+                    IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
+              T1Misc<{0,0,0,0,1,?,?}> {
+  // A6.2.5 & A8.6.214
+  bits<7> imm;
+  let Inst{6-0} = imm;
+  let DecoderMethod = "DecodeThumbAddSPImm";
+}
+
+def : tInstAlias<"add${p} sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
+def : tInstAlias<"add${p} sp, sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
+
+// Can optionally specify SP as a three operand instruction.
+def : tInstAlias<"add${p} sp, sp, $imm",
+                 (tADDspi SP, t_imm0_508s4:$imm, pred:$p)>;
+def : tInstAlias<"sub${p} sp, sp, $imm",
+                 (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>;
+
+// ADD <Rm>, sp
+def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
+                   "add", "\t$Rdn, $sp, $Rn", []>,
+              T1Special<{0,0,?,?}> {
+  // A8.6.9 Encoding T1
+  bits<4> Rdn;
+  let Inst{7}   = Rdn{3};
+  let Inst{6-3} = 0b1101;
+  let Inst{2-0} = Rdn{2-0};
+  let DecoderMethod = "DecodeThumbAddSPReg";
+}
+
+// ADD sp, <Rm>
+def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
+                  "add", "\t$Rdn, $Rm", []>,
+              T1Special<{0,0,?,?}> {
+  // A8.6.9 Encoding T2
+  bits<4> Rm;
+  let Inst{7} = 1;
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = 0b101;
+  let DecoderMethod = "DecodeThumbAddSPReg";
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
+            T1Special<{1,1,0,?}> {
+    // A6.2.3 & A8.6.25
+    bits<4> Rm;
+    let Inst{6-3} = Rm;
+    let Inst{2-0} = 0b000;
+    let Unpredictable{2-0} = 0b111;
+  }
+}
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
+                   [(ARMretflag)], (tBX LR, pred:$p)>;
+
+  // Alternative return instruction used by vararg functions.
+  def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
+                   2, IIC_Br, [],
+                   (tBX GPR:$Rm, pred:$p)>;
+}
+
+// All calls clobber the non-callee saved registers. SP is marked as a use to
+// prevent stack-pointer assignments that appear immediately before calls from
+// potentially appearing dead.
+let isCall = 1,
+  Defs = [LR], Uses = [SP] in {
+  // Also used for Thumb2
+  def tBL  : TIx2<0b11110, 0b11, 1,
+                  (outs), (ins pred:$p, t_bltarget:$func), IIC_Br,
+                  "bl${p}\t$func",
+                  [(ARMtcall tglobaladdr:$func)]>,
+             Requires<[IsThumb]> {
+    bits<24> func;
+    let Inst{26} = func{23};
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = func{22};
+    let Inst{11} = func{21};
+    let Inst{10-0} = func{10-0};
+  }
+
+  // ARMv5T and above, also used for Thumb2
+  def tBLXi : TIx2<0b11110, 0b11, 0,
+                 (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br,
+                   "blx${p}\t$func",
+                   [(ARMcall tglobaladdr:$func)]>,
+              Requires<[IsThumb, HasV5T]> {
+    bits<24> func;
+    let Inst{26} = func{23};
+    let Inst{25-16} = func{20-11};
+    let Inst{13} = func{22};
+    let Inst{11} = func{21};
+    let Inst{10-1} = func{10-1};
+    let Inst{0} = 0; // func{0} is assumed zero
+  }
+
+  // Also used for Thumb2
+  def tBLXr : TI<(outs), (ins pred:$p, GPR:$func), IIC_Br,
+                  "blx${p}\t$func",
+                  [(ARMtcall GPR:$func)]>,
+              Requires<[IsThumb, HasV5T]>,
+              T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24;
+    bits<4> func;
+    let Inst{6-3} = func;
+    let Inst{2-0} = 0b000;
+  }
+
+  // ARMv4T
+  def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
+                  4, IIC_Br,
+                  [(ARMcall_nolink tGPR:$func)]>,
+            Requires<[IsThumb, IsThumb1Only]>;
+}
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+  let isPredicable = 1 in
+  def tB   : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
+                 "b", "\t$target", [(br bb:$target)]>,
+             T1Encoding<{1,1,1,0,0,?}> {
+    bits<11> target;
+    let Inst{10-0} = target;
+  }
+
+  // Far jump
+  // Just a pseudo for a tBL instruction. Needed to let regalloc know about
+  // the clobber of LR.
+  let Defs = [LR] in
+  def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p),
+                          4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>;
+
+  def tBR_JTr : tPseudoInst<(outs),
+                      (ins tGPR:$target, i32imm:$jt, i32imm:$id),
+                      0, IIC_Br,
+                      [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]> {
+    list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+  }
+}
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects two operands. :(
+let isBranch = 1, isTerminator = 1 in
+  def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br,
+                 "b${p}\t$target",
+                 [/*(ARMbrcond bb:$target, imm:$cc)*/]>,
+             T1BranchCond<{1,1,0,1}> {
+  bits<4> p;
+  bits<8> target;
+  let Inst{11-8} = p;
+  let Inst{7-0} = target;
+}
+
+// Tail calls
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  // IOS versions.
+  let Uses = [SP] in {
+    def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst),
+                     4, IIC_Br, [],
+                     (tBX GPR:$dst, (ops 14, zero_reg))>,
+                     Requires<[IsThumb]>;
+  }
+  // tTAILJMPd: IOS version uses a Thumb2 branch (no Thumb1 tail calls
+  // on IOS), so it's in ARMInstrThumb2.td.
+  // Non-IOS version:
+  let Uses = [SP] in {
+    def tTAILJMPdND : tPseudoExpand<(outs),
+                   (ins t_brtarget:$dst, pred:$p),
+                   4, IIC_Br, [],
+                   (tB t_brtarget:$dst, pred:$p)>,
+                 Requires<[IsThumb, IsNotIOS]>;
+  }
+}
+
+
+// A8.6.218 Supervisor Call (Software Interrupt)
+// A8.6.16 B: Encoding T1
+// If Inst{11-8} == 0b1111 then SEE SVC
+let isCall = 1, Uses = [SP] in
+def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
+                "svc", "\t$imm", []>, Encoding16 {
+  bits<8> imm;
+  let Inst{15-12} = 0b1101;
+  let Inst{11-8}  = 0b1111;
+  let Inst{7-0}   = imm;
+}
+
+// The assembler uses 0xDEFE for a trap instruction.
+let isBarrier = 1, isTerminator = 1 in
+def tTRAP : TI<(outs), (ins), IIC_Br,
+               "trap", [(trap)]>, Encoding16 {
+  let Inst = 0xdefe;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load Store Instructions.
+//
+
+// Loads: reg/reg and reg/imm5
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+                              Operand AddrMode_r, Operand AddrMode_i,
+                              AddrMode am, InstrItinClass itin_r,
+                              InstrItinClass itin_i, string asm,
+                              PatFrag opnode> {
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs tGPR:$Rt), (ins AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>;
+  def i : // reg/imm5
+    T1pILdStEncodeImm<imm_opc, 1 /* Load */,
+                      (outs tGPR:$Rt), (ins AddrMode_i:$addr),
+                      am, itin_i, asm, "\t$Rt, $addr",
+                      [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>;
+}
+// Stores: reg/reg and reg/imm5
+multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc,
+                              Operand AddrMode_r, Operand AddrMode_i,
+                              AddrMode am, InstrItinClass itin_r,
+                              InstrItinClass itin_i, string asm,
+                              PatFrag opnode> {
+  def r : // reg/reg
+    T1pILdStEncode<reg_opc,
+                   (outs), (ins tGPR:$Rt, AddrMode_r:$addr),
+                   am, itin_r, asm, "\t$Rt, $addr",
+                   [(opnode tGPR:$Rt, AddrMode_r:$addr)]>;
+  def i : // reg/imm5
+    T1pILdStEncodeImm<imm_opc, 0 /* Store */,
+                      (outs), (ins tGPR:$Rt, AddrMode_i:$addr),
+                      am, itin_i, asm, "\t$Rt, $addr",
+                      [(opnode tGPR:$Rt, AddrMode_i:$addr)]>;
+}
+
+// A8.6.57 & A8.6.60
+defm tLDR  : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rrs4,
+                                t_addrmode_is4, AddrModeT1_4,
+                                IIC_iLoad_r, IIC_iLoad_i, "ldr",
+                                UnOpFrag<(load node:$Src)>>;
+
+// A8.6.64 & A8.6.61
+defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rrs1,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb",
+                                UnOpFrag<(zextloadi8 node:$Src)>>;
+
+// A8.6.76 & A8.6.73
+defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2,
+                                t_addrmode_is2, AddrModeT1_2,
+                                IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh",
+                                UnOpFrag<(zextloadi16 node:$Src)>>;
+
+let AddedComplexity = 10 in
+def tLDRSB :                    // A8.6.80
+  T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_1, IIC_iLoad_bh_r,
+                 "ldrsb", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>;
+
+let AddedComplexity = 10 in
+def tLDRSH :                    // A8.6.84
+  T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
+                 AddrModeT1_2, IIC_iLoad_bh_r,
+                 "ldrsh", "\t$Rt, $addr",
+                 [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>;
+
+let canFoldAsLoad = 1 in
+def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
+                    "ldr", "\t$Rt, $addr",
+                    [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>,
+              T1LdStSP<{1,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
+
+// Load tconstpool
+// FIXME: Use ldr.n to work around a darwin assembler bug.
+let canFoldAsLoad = 1, isReMaterializable = 1, isCodeGenOnly = 1 in
+def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
+                  "ldr", ".n\t$Rt, $addr",
+                  [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
+              T1Encoding<{0,1,0,0,1,?}> {
+  // A6.2 & A8.6.59
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0}  = addr;
+}
+
+// FIXME: Remove this entry when the above ldr.n workaround is fixed.
+// For assembly/disassembly use only.
+def tLDRpciASM : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
+                       "ldr", "\t$Rt, $addr", []>,
+                 T1Encoding<{0,1,0,0,1,?}> {
+  // A6.2 & A8.6.59
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0}  = addr;
+}
+
+// A8.6.194 & A8.6.192
+defm tSTR  : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4,
+                                t_addrmode_is4, AddrModeT1_4,
+                                IIC_iStore_r, IIC_iStore_i, "str",
+                                BinOpFrag<(store node:$LHS, node:$RHS)>>;
+
+// A8.6.197 & A8.6.195
+defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1,
+                                t_addrmode_is1, AddrModeT1_1,
+                                IIC_iStore_bh_r, IIC_iStore_bh_i, "strb",
+                                BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+
+// A8.6.207 & A8.6.205
+defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2,
+                               t_addrmode_is2, AddrModeT1_2,
+                               IIC_iStore_bh_r, IIC_iStore_bh_i, "strh",
+                               BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+
+
+def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
+                    "str", "\t$Rt, $addr",
+                    [(store tGPR:$Rt, t_addrmode_sp:$addr)]>,
+              T1LdStSP<{0,?,?}> {
+  bits<3> Rt;
+  bits<8> addr;
+  let Inst{10-8} = Rt;
+  let Inst{7-0} = addr;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+// These require base address to be written back or one of the loaded regs.
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+        IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
+  bits<3> Rn;
+  bits<8> regs;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = regs;
+}
+
+// Writeback version is just a pseudo, as there's no encoding difference.
+// Writeback happens iff the base register is not in the destination register
+// list.
+def tLDMIA_UPD :
+    InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+                 "$Rn = $wb", IIC_iLoad_mu>,
+    PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> {
+  let Size = 2;
+  let OutOperandList = (outs GPR:$wb);
+  let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops);
+  let Pattern = [];
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+// There is no non-writeback version of STM for Thumb.
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
+                         (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+                         AddrModeNone, 2, IIC_iStore_mu,
+                         "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>,
+                     T1Encoding<{1,1,0,0,0,?}> {
+  bits<3> Rn;
+  bits<8> regs;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = regs;
+}
+
+} // neverHasSideEffects
+
+def : InstAlias<"ldm${p} $Rn!, $regs",
+                (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)>,
+        Requires<[IsThumb, IsThumb1Only]>;
+
+let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
+def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+               IIC_iPop,
+               "pop${p}\t$regs", []>,
+           T1Misc<{1,1,0,?,?,?,?}> {
+  bits<16> regs;
+  let Inst{8}   = regs{15};
+  let Inst{7-0} = regs{7-0};
+}
+
+let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
+def tPUSH : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+                IIC_iStore_m,
+                "push${p}\t$regs", []>,
+            T1Misc<{0,1,0,?,?,?,?}> {
+  bits<16> regs;
+  let Inst{8}   = regs{14};
+  let Inst{7-0} = regs{7-0};
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+// Helper classes for encoding T1pI patterns:
+class T1pIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+    : T1pI<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rm;
+  bits<3> Rn;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rn;
+}
+class T1pIMiscEncode<bits<7> opA, dag oops, dag iops, InstrItinClass itin,
+                     string opc, string asm, list<dag> pattern>
+    : T1pI<oops, iops, itin, opc, asm, pattern>,
+      T1Misc<opA> {
+  bits<3> Rm;
+  bits<3> Rd;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sI patterns:
+class T1sIDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rd;
+  bits<3> Rn;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+}
+class T1sIGenEncode<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                    string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rm;
+  bits<3> Rn;
+  bits<3> Rd;
+  let Inst{8-6} = Rm;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+}
+class T1sIGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                       string opc, string asm, list<dag> pattern>
+    : T1sI<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rd;
+  bits<3> Rm;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rd;
+}
+
+// Helper classes for encoding T1sIt patterns:
+class T1sItDPEncode<bits<4> opA, dag oops, dag iops, InstrItinClass itin,
+                    string opc, string asm, list<dag> pattern>
+    : T1sIt<oops, iops, itin, opc, asm, pattern>,
+      T1DataProcessing<opA> {
+  bits<3> Rdn;
+  bits<3> Rm;
+  let Inst{5-3} = Rm;
+  let Inst{2-0} = Rdn;
+}
+class T1sItGenEncodeImm<bits<5> opA, dag oops, dag iops, InstrItinClass itin,
+                        string opc, string asm, list<dag> pattern>
+    : T1sIt<oops, iops, itin, opc, asm, pattern>,
+      T1General<opA> {
+  bits<3> Rdn;
+  bits<8> imm8;
+  let Inst{10-8} = Rdn;
+  let Inst{7-0}  = imm8;
+}
+
+// Add with carry register
+let isCommutable = 1, Uses = [CPSR] in
+def tADC :                      // A8.6.2
+  T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
+                "adc", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>;
+
+// Add immediate
+def tADDi3 :                    // A8.6.4 T1
+  T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+                   IIC_iALUi,
+                   "add", "\t$Rd, $Rm, $imm3",
+                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> {
+  bits<3> imm3;
+  let Inst{8-6} = imm3;
+}
+
+def tADDi8 :                    // A8.6.4 T2
+  T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
+                    (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
+                    "add", "\t$Rdn, $imm8",
+                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>;
+
+// Add register
+let isCommutable = 1 in
+def tADDrr :                    // A8.6.6 T1
+  T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "add", "\t$Rd, $Rn, $Rm",
+                [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>;
+
+let neverHasSideEffects = 1 in
+def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
+                     "add", "\t$Rdn, $Rm", []>,
+               T1Special<{0,0,?,?}> {
+  // A8.6.6 T2
+  bits<4> Rdn;
+  bits<4> Rm;
+  let Inst{7}   = Rdn{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rdn{2-0};
+}
+
+// AND register
+let isCommutable = 1 in
+def tAND :                      // A8.6.12
+  T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "and", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>;
+
+// ASR immediate
+def tASRri :                    // A8.6.14
+  T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
+                   IIC_iMOVsi,
+                   "asr", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
+
+// ASR register
+def tASRrr :                    // A8.6.15
+  T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "asr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>;
+
+// BIC register
+def tBIC :                      // A8.6.20
+  T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "bic", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>;
+
+// CMN register
+let isCompare = 1, Defs = [CPSR] in {
+//FIXME: Disable CMN, as CCodes are backwards from compare expectations
+//       Compare-to-zero still works out, just not the relationals
+//def tCMN :                     // A8.6.33
+//  T1pIDPEncode<0b1011, (outs), (ins tGPR:$lhs, tGPR:$rhs),
+//               IIC_iCMPr,
+//               "cmn", "\t$lhs, $rhs",
+//               [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>;
+
+def tCMNz :                     // A8.6.33
+  T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+               IIC_iCMPr,
+               "cmn", "\t$Rn, $Rm",
+               [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>;
+
+} // isCompare = 1, Defs = [CPSR]
+
+// CMP immediate
+let isCompare = 1, Defs = [CPSR] in {
+def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
+                  "cmp", "\t$Rn, $imm8",
+                  [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
+             T1General<{1,0,1,?,?}> {
+  // A8.6.35
+  bits<3> Rn;
+  bits<8> imm8;
+  let Inst{10-8} = Rn;
+  let Inst{7-0}  = imm8;
+}
+
+// CMP register
+def tCMPr :                     // A8.6.36 T1
+  T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm),
+               IIC_iCMPr,
+               "cmp", "\t$Rn, $Rm",
+               [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>;
+
+def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr,
+                   "cmp", "\t$Rn, $Rm", []>,
+              T1Special<{0,1,?,?}> {
+  // A8.6.36 T2
+  bits<4> Rm;
+  bits<4> Rn;
+  let Inst{7}   = Rn{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rn{2-0};
+}
+} // isCompare = 1, Defs = [CPSR]
+
+
+// XOR register
+let isCommutable = 1 in
+def tEOR :                      // A8.6.45
+  T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "eor", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>;
+
+// LSL immediate
+def tLSLri :                    // A8.6.88
+  T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
+                   IIC_iMOVsi,
+                   "lsl", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
+
+// LSL register
+def tLSLrr :                    // A8.6.89
+  T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "lsl", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>;
+
+// LSR immediate
+def tLSRri :                    // A8.6.90
+  T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
+                   IIC_iMOVsi,
+                   "lsr", "\t$Rd, $Rm, $imm5",
+                   [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
+  bits<5> imm5;
+  let Inst{10-6} = imm5;
+}
+
+// LSR register
+def tLSRrr :                    // A8.6.91
+  T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "lsr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>;
+
+// Move register
+let isMoveImm = 1 in
+def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
+                  "mov", "\t$Rd, $imm8",
+                  [(set tGPR:$Rd, imm0_255:$imm8)]>,
+             T1General<{1,0,0,?,?}> {
+  // A8.6.96
+  bits<3> Rd;
+  bits<8> imm8;
+  let Inst{10-8} = Rd;
+  let Inst{7-0}  = imm8;
+}
+// Because we have an explicit tMOVSr below, we need an alias to handle
+// the immediate "movs" form here. Blech.
+def : tInstAlias <"movs $Rdn, $imm",
+                 (tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>;
+
+// A7-73: MOV(2) - mov setting flag.
+
+let neverHasSideEffects = 1 in {
+def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
+                      2, IIC_iMOVr,
+                      "mov", "\t$Rd, $Rm", "", []>,
+                  T1Special<{1,0,?,?}> {
+  // A8.6.97
+  bits<4> Rd;
+  bits<4> Rm;
+  let Inst{7}   = Rd{3};
+  let Inst{6-3} = Rm;
+  let Inst{2-0} = Rd{2-0};
+}
+let Defs = [CPSR] in
+def tMOVSr      : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
+                      "movs\t$Rd, $Rm", []>, Encoding16 {
+  // A8.6.97
+  bits<3> Rd;
+  bits<3> Rm;
+  let Inst{15-6} = 0b0000000000;
+  let Inst{5-3}  = Rm;
+  let Inst{2-0}  = Rd;
+}
+} // neverHasSideEffects
+
+// Multiply register
+let isCommutable = 1 in
+def tMUL :                      // A8.6.105 T1
+  Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2,
+           IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd",
+           [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>,
+      T1DataProcessing<0b1101> {
+  bits<3> Rd;
+  bits<3> Rn;
+  let Inst{5-3} = Rn;
+  let Inst{2-0} = Rd;
+  let AsmMatchConverter = "cvtThumbMultiply";
+}
+
+def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
+                                               pred:$p)>;
+
+// Move inverse register
+def tMVN :                      // A8.6.107
+  T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr,
+               "mvn", "\t$Rd, $Rn",
+               [(set tGPR:$Rd, (not tGPR:$Rn))]>;
+
+// Bitwise or register
+let isCommutable = 1 in
+def tORR :                      // A8.6.114
+  T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iBITr,
+                "orr", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>;
+
+// Swaps
+def tREV :                      // A8.6.134
+  T1pIMiscEncode<{1,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "rev", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (bswap tGPR:$Rm))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def tREV16 :                    // A8.6.135
+  T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "rev16", "\t$Rd, $Rm",
+             [(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>,
+                Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def tREVSH :                    // A8.6.136
+  T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "revsh", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+// Rotate right register
+def tROR :                      // A8.6.139
+  T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iMOVsr,
+                "ror", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>;
+
+// Negate register
+def tRSB :                      // A8.6.141
+  T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn),
+               IIC_iALUi,
+               "rsb", "\t$Rd, $Rn, #0",
+               [(set tGPR:$Rd, (ineg tGPR:$Rn))]>;
+
+// Subtract with carry register
+let Uses = [CPSR] in
+def tSBC :                      // A8.6.151
+  T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "sbc", "\t$Rdn, $Rm",
+                [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>;
+
+// Subtract immediate
+def tSUBi3 :                    // A8.6.210 T1
+  T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+                   IIC_iALUi,
+                   "sub", "\t$Rd, $Rm, $imm3",
+                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]> {
+  bits<3> imm3;
+  let Inst{8-6} = imm3;
+}
+
+def tSUBi8 :                    // A8.6.210 T2
+  T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
+                    (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
+                    "sub", "\t$Rdn, $imm8",
+                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>;
+
+// Subtract register
+def tSUBrr :                    // A8.6.212
+  T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                IIC_iALUr,
+                "sub", "\t$Rd, $Rn, $Rm",
+                [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>;
+
+// Sign-extend byte
+def tSXTB :                     // A8.6.222
+  T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "sxtb", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+// Sign-extend short
+def tSXTH :                     // A8.6.224
+  T1pIMiscEncode<{0,0,1,0,0,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "sxth", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+// Test
+let isCompare = 1, isCommutable = 1, Defs = [CPSR] in
+def tTST :                      // A8.6.230
+  T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr,
+               "tst", "\t$Rn, $Rm",
+               [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>;
+
+// Zero-extend byte
+def tUXTB :                     // A8.6.262
+  T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "uxtb", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+// Zero-extend short
+def tUXTH :                     // A8.6.264
+  T1pIMiscEncode<{0,0,1,0,1,0,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
+                 IIC_iUNAr,
+                 "uxth", "\t$Rd, $Rm",
+                 [(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>,
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+// Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation.
+// Expanded after instruction selection into a branch sequence.
+let usesCustomInserter = 1 in  // Expanded after instruction selection.
+  def tMOVCCr_pseudo :
+  PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc),
+              NoItinerary,
+             [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
+
+// tLEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+
+def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
+               IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
+               T1Encoding<{1,0,1,0,0,?}> {
+  bits<3> Rd;
+  bits<8> addr;
+  let Inst{10-8} = Rd;
+  let Inst{7-0} = addr;
+  let DecoderMethod = "DecodeThumbAddSpecialReg";
+}
+
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def tLEApcrel   : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
+                              2, IIC_iALUi, []>;
+
+let hasSideEffects = 1 in
+def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
+                              (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                              2, IIC_iALUi, []>;
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// This is a pseudo inst so that we can get the encoding right,
+// complete with fixup for the aeabi_read_tp function.
+let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in
+def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
+                          [(set R0, ARMthread_pointer)]>;
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//
+
+// eh_sjlj_setjmp() is an instruction sequence to store the return address and
+// save #0 in R0 for the non-longjmp case.  Since by its nature we may be coming
+// from some other function to get here, and we're using the stack frame for the
+// containing function to save/restore registers, we can't keep anything live in
+// regs across the eh_sjlj_setjmp(), else it will almost certainly have been
+// tromped upon when we get here from a longjmp(). We force everything out of
+// registers except for our own input by listing the relevant registers in
+// Defs. By doing so, we also cause the prologue/epilogue code to actively
+// preserve all of the callee-saved resgisters, which is exactly what we want.
+// $val is a scratch register for our use.
+let Defs = [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12, CPSR ],
+    hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in
+def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
+                                  AddrModeNone, 0, NoItinerary, "","",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
+
+// FIXME: Non-IOS version(s)
+let isBarrier = 1, hasSideEffects = 1, isTerminator = 1, isCodeGenOnly = 1,
+    Defs = [ R7, LR, SP ] in
+def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
+                              AddrModeNone, 0, IndexModeNone,
+                              Pseudo, NoItinerary, "", "",
+                              [(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
+                             Requires<[IsThumb, IsIOS]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// Comparisons
+def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
+            (tCMPi8  tGPR:$Rn, imm0_255:$imm8)>;
+def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
+            (tCMPr   tGPR:$Rn, tGPR:$Rm)>;
+
+// Add with carry
+def : T1Pat<(addc   tGPR:$lhs, imm0_7:$rhs),
+            (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255:$rhs),
+            (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, tGPR:$rhs),
+            (tADDrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Subtract with carry
+def : T1Pat<(addc   tGPR:$lhs, imm0_7_neg:$rhs),
+            (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255_neg:$rhs),
+            (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
+def : T1Pat<(subc   tGPR:$lhs, tGPR:$rhs),
+            (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
+
+// ConstantPool, GlobalAddress
+def : T1Pat<(ARMWrapper  tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
+def : T1Pat<(ARMWrapper  tconstpool  :$dst), (tLEApcrel tconstpool  :$dst)>;
+
+// JumpTable
+def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+            (tLEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Direct calls
+def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>,
+      Requires<[IsThumb]>;
+
+def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>,
+      Requires<[IsThumb, HasV5T]>;
+
+// Indirect calls to ARM routines
+def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>,
+      Requires<[IsThumb, HasV5T]>;
+
+// zextload i1 -> zextload i8
+def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr),
+            (tLDRBr t_addrmode_rrs1:$addr)>;
+def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
+            (tLDRBi t_addrmode_is1:$addr)>;
+
+// extload -> zextload
+def : T1Pat<(extloadi1  t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
+def : T1Pat<(extloadi1  t_addrmode_is1:$addr),  (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_is1:$addr),  (tLDRBi t_addrmode_is1:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_rrs2:$addr), (tLDRHr t_addrmode_rrs2:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_is2:$addr),  (tLDRHi t_addrmode_is2:$addr)>;
+
+// If it's impossible to use [r,r] address mode for sextload, select to
+// ldr{b|h} + sxt{b|h} instead.
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+            (tSXTB (tLDRBi t_addrmode_is1:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr),
+            (tSXTB (tLDRBr t_addrmode_rrs1:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+            (tSXTH (tLDRHi t_addrmode_is2:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
+            (tSXTH (tLDRHr t_addrmode_rrs2:$addr))>,
+      Requires<[IsThumb, IsThumb1Only, HasV6]>;
+
+def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr),
+            (tASRri (tLSLri (tLDRBr t_addrmode_rrs1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
+            (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
+            (tASRri (tLSLri (tLDRHr t_addrmode_rrs2:$addr), 16), 16)>;
+def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
+            (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
+
+def : T1Pat<(atomic_load_8 t_addrmode_is1:$src),
+             (tLDRBi t_addrmode_is1:$src)>;
+def : T1Pat<(atomic_load_8 t_addrmode_rrs1:$src),
+             (tLDRBr t_addrmode_rrs1:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_is2:$src),
+             (tLDRHi t_addrmode_is2:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_rrs2:$src),
+             (tLDRHr t_addrmode_rrs2:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_is4:$src),
+             (tLDRi t_addrmode_is4:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_rrs4:$src),
+             (tLDRr t_addrmode_rrs4:$src)>;
+def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val),
+             (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>;
+def : T1Pat<(atomic_store_8 t_addrmode_rrs1:$ptr, tGPR:$val),
+             (tSTRBr tGPR:$val, t_addrmode_rrs1:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val),
+             (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_rrs2:$ptr, tGPR:$val),
+             (tSTRHr tGPR:$val, t_addrmode_rrs2:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val),
+             (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_rrs4:$ptr, tGPR:$val),
+             (tSTRr tGPR:$val, t_addrmode_rrs4:$ptr)>;
+
+// Large immediate handling.
+
+// Two piece imms.
+def : T1Pat<(i32 thumb_immshifted:$src),
+            (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
+                    (thumb_immshifted_shamt imm:$src))>;
+
+def : T1Pat<(i32 imm0_255_comp:$src),
+            (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let isReMaterializable = 1 in
+def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                             NoItinerary,
+               [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                           imm:$cp))]>,
+               Requires<[IsThumb, IsThumb1Only]>;
+
+// Pseudo-instruction for merged POP and return.
+// FIXME: remove when we have a way to marking a MI with these properties.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1 in
+def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops),
+                           2, IIC_iPop_Br, [],
+                           (tPOP pred:$p, reglist:$regs)>;
+
+// Indirect branch using "mov pc, $Rm"
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p),
+                  2, IIC_Br, [(brind GPR:$Rm)],
+                  (tMOVr PC, GPR:$Rm, pred:$p)>;
+}
+
+
+// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
+// encoding is available on ARMv6K, but we don't differentiate that finely.
+def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>;
+
+
+// For round-trip assembly/disassembly, we have to handle a CPS instruction
+// without any iflags. That's not, strictly speaking, valid syntax, but it's
+// a useful extension and assembles to defined behaviour (the insn does
+// nothing).
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+
+// "neg" is and alias for "rsb rd, rn, #0"
+def : tInstAlias<"neg${s}${p} $Rd, $Rm",
+                 (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>;
+
+
+// Implied destination operand forms for shifts.
+def : tInstAlias<"lsl${s}${p} $Rdm, $imm",
+             (tLSLri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm0_31:$imm, pred:$p)>;
+def : tInstAlias<"lsr${s}${p} $Rdm, $imm",
+             (tLSRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
+def : tInstAlias<"asr${s}${p} $Rdm, $imm",
+             (tASRri tGPR:$Rdm, cc_out:$s, tGPR:$Rdm, imm_sr:$imm, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
new file mode 100644
index 000000000000..c9d709eb5222
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -0,0 +1,4327 @@
+//===-- ARMInstrThumb2.td - Thumb2 support for ARM ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb2 instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// IT block predicate field
+def it_pred_asmoperand : AsmOperandClass {
+  let Name = "ITCondCode";
+  let ParserMethod = "parseITCondCode";
+}
+def it_pred : Operand<i32> {
+  let PrintMethod = "printMandatoryPredicateOperand";
+  let ParserMatchClass = it_pred_asmoperand;
+}
+
+// IT block condition mask
+def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; }
+def it_mask : Operand<i32> {
+  let PrintMethod = "printThumbITMask";
+  let ParserMatchClass = it_mask_asmoperand;
+}
+
+// t2_shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+//    {5}     0 ==> lsl
+//            1     asr
+//    {4-0}   imm5 shift amount.
+//            asr #32 not allowed
+def t2_shift_imm : Operand<i32> {
+  let PrintMethod = "printShiftImmOperand";
+  let ParserMatchClass = ShifterImmAsmOperand;
+  let DecoderMethod = "DecodeT2ShifterImmOperand";
+}
+
+// Shifted operands. No register controlled shifts for Thumb2.
+// Note: We do not support rrx shifted operands yet.
+def t2_so_reg : Operand<i32>,    // reg imm
+                ComplexPattern<i32, 2, "SelectT2ShifterOperandReg",
+                               [shl,srl,sra,rotr]> {
+  let EncoderMethod = "getT2SORegOpValue";
+  let PrintMethod = "printT2SOOperand";
+  let DecoderMethod = "DecodeSORegImmOperand";
+  let ParserMatchClass = ShiftedImmAsmOperand;
+  let MIOperandInfo = (ops rGPR, i32imm);
+}
+
+// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value
+def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
+}]>;
+
+// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value
+def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32);
+}]>;
+
+// so_imm_notSext_XFORM - Return a so_imm value packed into the format
+// described for so_imm_notSext def below, with sign extension from 16
+// bits.
+def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{
+  APInt apIntN = N->getAPIntValue();
+  unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+  return CurDAG->getTargetConstant(~N16bitSignExt, MVT::i32);
+}]>;
+
+// t2_so_imm - Match a 32-bit immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
+// immediate splatted into multiple bytes of the word.
+def t2_so_imm_asmoperand : ImmAsmOperand { let Name = "T2SOImm"; }
+def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  }]> {
+  let ParserMatchClass = t2_so_imm_asmoperand;
+  let EncoderMethod = "getT2SOImmOpValue";
+  let DecoderMethod = "DecodeT2SOImm";
+}
+
+// t2_so_imm_not - Match an immediate that is a complement
+// of a t2_so_imm.
+// Note: this pattern doesn't require an encoder method and such, as it's
+// only used on aliases (Pat<> and InstAlias<>). The actual encoding
+// is handled by the destination instructions, which use t2_so_imm.
+def t2_so_imm_not_asmoperand : AsmOperandClass { let Name = "T2SOImmNot"; }
+def t2_so_imm_not : Operand<i32>, PatLeaf<(imm), [{
+  return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1;
+}], t2_so_imm_not_XFORM> {
+  let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
+// t2_so_imm_notSext - match an immediate that is a complement of a t2_so_imm
+// if the upper 16 bits are zero.
+def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{
+    APInt apIntN = N->getAPIntValue();
+    if (!apIntN.isIntN(16)) return false;
+    unsigned N16bitSignExt = apIntN.trunc(16).sext(32).getZExtValue();
+    return ARM_AM::getT2SOImmVal(~N16bitSignExt) != -1;
+  }], t2_so_imm_notSext16_XFORM> {
+  let ParserMatchClass = t2_so_imm_not_asmoperand;
+}
+
+// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
+def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; }
+def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
+  int64_t Value = -(int)N->getZExtValue();
+  return Value && ARM_AM::getT2SOImmVal(Value) != -1;
+}], t2_so_imm_neg_XFORM> {
+  let ParserMatchClass = t2_so_imm_neg_asmoperand;
+}
+
+/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095].
+def imm0_4095_asmoperand: ImmAsmOperand { let Name = "Imm0_4095"; }
+def imm0_4095 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 0 && Imm < 4096;
+}]> {
+  let ParserMatchClass = imm0_4095_asmoperand;
+}
+
+def imm0_4095_neg_asmoperand: AsmOperandClass { let Name = "Imm0_4095Neg"; }
+def imm0_4095_neg : Operand<i32>, PatLeaf<(i32 imm), [{
+ return (uint32_t)(-N->getZExtValue()) < 4096;
+}], imm_neg_XFORM> {
+  let ParserMatchClass = imm0_4095_neg_asmoperand;
+}
+
+def imm1_255_neg : PatLeaf<(i32 imm), [{
+  uint32_t Val = -N->getZExtValue();
+  return (Val > 0 && Val < 255);
+}], imm_neg_XFORM>;
+
+def imm0_255_not : PatLeaf<(i32 imm), [{
+  return (uint32_t)(~N->getZExtValue()) < 255;
+}], imm_comp_XFORM>;
+
+def lo5AllOne : PatLeaf<(i32 imm), [{
+  // Returns true if all low 5-bits are 1.
+  return (((uint32_t)N->getZExtValue()) & 0x1FUL) == 0x1FUL;
+}]>;
+
+// Define Thumb2 specific addressing modes.
+
+// t2addrmode_imm12  := reg + imm12
+def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";}
+def t2addrmode_imm12 : Operand<i32>,
+                       ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> {
+  let PrintMethod = "printAddrModeImm12Operand";
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm12";
+  let ParserMatchClass = t2addrmode_imm12_asmoperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2ldrlabel  := imm12
+def t2ldrlabel : Operand<i32> {
+  let EncoderMethod = "getAddrModeImm12OpValue";
+  let PrintMethod = "printThumbLdrLabelOperand";
+}
+
+def t2ldr_pcrel_imm12_asmoperand : AsmOperandClass {let Name = "MemPCRelImm12";}
+def t2ldr_pcrel_imm12 : Operand<i32> {
+  let ParserMatchClass = t2ldr_pcrel_imm12_asmoperand;
+  // used for assembler pseudo instruction and maps to t2ldrlabel, so
+  // doesn't need encoder or print methods of its own.
+}
+
+// ADR instruction labels.
+def t2adrlabel : Operand<i32> {
+  let EncoderMethod = "getT2AdrLabelOpValue";
+  let PrintMethod = "printAdrLabelOperand";
+}
+
+
+// t2addrmode_posimm8  := reg + imm8
+def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
+def t2addrmode_posimm8 : Operand<i32> {
+  let PrintMethod = "printT2AddrModeImm8Operand";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemPosImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_negimm8  := reg - imm8
+def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
+def t2addrmode_negimm8 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+  let PrintMethod = "printT2AddrModeImm8Operand";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemNegImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_imm8  := reg +/- imm8
+def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
+def t2addrmode_imm8 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+  let PrintMethod = "printT2AddrModeImm8Operand";
+  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8";
+  let ParserMatchClass = MemImm8OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def t2am_imm8_offset : Operand<i32>,
+                       ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset",
+                                      [], [SDNPWantRoot]> {
+  let PrintMethod = "printT2AddrModeImm8OffsetOperand";
+  let EncoderMethod = "getT2AddrModeImm8OffsetOpValue";
+  let DecoderMethod = "DecodeT2Imm8";
+}
+
+// t2addrmode_imm8s4  := reg +/- (imm8 << 2)
+def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
+def t2addrmode_imm8s4 : Operand<i32> {
+  let PrintMethod = "printT2AddrModeImm8s4Operand";
+  let EncoderMethod = "getT2AddrModeImm8s4OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm8s4";
+  let ParserMatchClass = MemImm8s4OffsetAsmOperand;
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; }
+def t2am_imm8s4_offset : Operand<i32> {
+  let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
+  let EncoderMethod = "getT2Imm8s4OpValue";
+  let DecoderMethod = "DecodeT2Imm8S4";
+}
+
+// t2addrmode_imm0_1020s4  := reg + (imm8 << 2)
+def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass {
+  let Name = "MemImm0_1020s4Offset";
+}
+def t2addrmode_imm0_1020s4 : Operand<i32> {
+  let PrintMethod = "printT2AddrModeImm0_1020s4Operand";
+  let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue";
+  let DecoderMethod = "DecodeT2AddrModeImm0_1020s4";
+  let ParserMatchClass = MemImm0_1020s4OffsetAsmOperand;
+  let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_so_reg  := reg + (reg << imm2)
+def t2addrmode_so_reg_asmoperand : AsmOperandClass {let Name="T2MemRegOffset";}
+def t2addrmode_so_reg : Operand<i32>,
+                        ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> {
+  let PrintMethod = "printT2AddrModeSoRegOperand";
+  let EncoderMethod = "getT2AddrModeSORegOpValue";
+  let DecoderMethod = "DecodeT2AddrModeSOReg";
+  let ParserMatchClass = t2addrmode_so_reg_asmoperand;
+  let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm);
+}
+
+// Addresses for the TBB/TBH instructions.
+def addrmode_tbb_asmoperand : AsmOperandClass { let Name = "MemTBB"; }
+def addrmode_tbb : Operand<i32> {
+  let PrintMethod = "printAddrModeTBB";
+  let ParserMatchClass = addrmode_tbb_asmoperand;
+  let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
+}
+def addrmode_tbh_asmoperand : AsmOperandClass { let Name = "MemTBH"; }
+def addrmode_tbh : Operand<i32> {
+  let PrintMethod = "printAddrModeTBH";
+  let ParserMatchClass = addrmode_tbh_asmoperand;
+  let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+
+class T2OneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+
+class T2sOneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2OneRegCmpImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{19-16}  = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+
+class T2OneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2sOneRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2OneRegCmpShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2TwoReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2sTwoReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2TwoRegCmp<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+
+class T2TwoRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2sTwoRegImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+class T2TwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<5> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{14-12} = imm{4-2};
+  let Inst{7-6}   = imm{1-0};
+}
+
+class T2sTwoRegShiftImm<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rm;
+  bits<5> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+  let Inst{14-12} = imm{4-2};
+  let Inst{7-6}   = imm{1-0};
+}
+
+class T2ThreeReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2sThreeReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+class T2TwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2sTwoRegShiftedReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2sI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<12> ShiftedRm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = ShiftedRm{3-0};
+  let Inst{5-4}   = ShiftedRm{6-5};
+  let Inst{14-12} = ShiftedRm{11-9};
+  let Inst{7-6}   = ShiftedRm{8-7};
+}
+
+class T2FourReg<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+  bits<4> Ra;
+
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Ra;
+  let Inst{11-8}  = Rd;
+  let Inst{3-0}   = Rm;
+}
+
+class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-23} = 0b111110111;
+  let Inst{22-20} = opc22_20;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = RdHi;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> RdLo;
+  bits<4> RdHi;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-23} = 0b111110111;
+  let Inst{22-20} = opc22_20;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = RdLo;
+  let Inst{11-8}  = RdHi;
+  let Inst{7-4}   = opc7_4;
+  let Inst{3-0}   = Rm;
+}
+
+
+/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// binary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_bin_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode, bit Commutable = 0,
+                       string wide = ""> {
+   // shifted imm
+   def ri : T2sTwoRegImm<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii,
+                 opc, "\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir,
+                 opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"),
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis,
+                 opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"),
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+  // Assembly aliases for optional destination operand when it's the same
+  // as the source operand.
+  def : t2InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn,
+                                                    t2_so_imm:$imm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn,
+                                                    rGPR:$Rm, pred:$p,
+                                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn,
+                                                    t2_so_reg:$shift, pred:$p,
+                                                    cc_out:$s)>;
+}
+
+/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
+//  the ".w" suffix to indicate that they are wide.
+multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                         PatFrag opnode, bit Commutable = 0> :
+    T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, Commutable, ".w"> {
+  // Assembler aliases w/ the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rd, $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p,
+                                    cc_out:$s)>;
+  // Assembler aliases w/o the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rd, rGPR:$Rn, t2_so_reg:$shift,
+                                    pred:$p, cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}.w", " $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm,
+                                    pred:$p, cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $shift"),
+     (!cast<Instruction>(NAME#"rs") rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$shift,
+                                    pred:$p, cc_out:$s)>;
+}
+
+/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
+/// reversed.  The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the T2I_bin_irs counterpart.
+multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
+   // shifted imm
+   def ri : T2sTwoRegImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
+                 opc, ".w\t$Rd, $Rn, $imm",
+                 [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, "\t$Rd, $Rn, $Rm",
+                 [/* For disassembly only; pattern left blank */]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm",
+                 [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+}
+
+/// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the
+/// instruction modifies the CPSR register.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving then an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
+                         InstrItinClass iis, PatFrag opnode,
+                         bit Commutable = 0> {
+   // shifted imm
+   def ri : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p),
+                         4, iii,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                t2_so_imm:$imm))]>;
+   // register
+   def rr : t2PseudoInst<(outs rGPR:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm, pred:$p),
+                         4, iir,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                rGPR:$Rm))]> {
+     let isCommutable = Commutable;
+   }
+   // shifted register
+   def rs : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+                         4, iis,
+                         [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
+                                                t2_so_reg:$ShiftedRm))]>;
+}
+}
+
+/// T2I_rbin_s_is -  Same as T2I_bin_s_irs, except selection DAG
+/// operands are reversed.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+multiclass T2I_rbin_s_is<PatFrag opnode> {
+   // shifted imm
+   def ri : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
+                         4, IIC_iALUi,
+                         [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm,
+                                                rGPR:$Rn))]>;
+   // shifted register
+   def rs : t2PseudoInst<(outs rGPR:$Rd),
+                         (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
+                         4, IIC_iALUsi,
+                         [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm,
+                                                rGPR:$Rn))]>;
+}
+}
+
+/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg})
+/// patterns for a binary operation that produces a value.
+multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
+                          bit Commutable = 0> {
+   // shifted imm
+   // The register-immediate version is re-materializable. This is useful
+   // in particular for taking the address of a local.
+   let isReMaterializable = 1 in {
+   def ri : T2sTwoRegImm<
+               (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
+               opc, ".w\t$Rd, $Rn, $imm",
+               [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24} = 1;
+     let Inst{23-21} = op23_21;
+     let Inst{15} = 0;
+   }
+   }
+   // 12-bit imm
+   def ri12 : T2I<
+                  (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
+                  !strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
+                  [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> {
+     bits<4> Rd;
+     bits<4> Rn;
+     bits<12> imm;
+     let Inst{31-27} = 0b11110;
+     let Inst{26} = imm{11};
+     let Inst{25-24} = 0b10;
+     let Inst{23-21} = op23_21;
+     let Inst{20} = 0; // The S bit.
+     let Inst{19-16} = Rn;
+     let Inst{15} = 0;
+     let Inst{14-12} = imm{10-8};
+     let Inst{11-8} = Rd;
+     let Inst{7-0} = imm{7-0};
+   }
+   // register
+   def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
+                 IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24} = 1;
+     let Inst{23-21} = op23_21;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+              [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24} = 1;
+     let Inst{23-21} = op23_21;
+   }
+}
+
+/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
+/// for a binary operation that produces a value and use the carry
+/// bit. It's not predicable.
+let Defs = [CPSR], Uses = [CPSR] in {
+multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+                             bit Commutable = 0> {
+   // shifted imm
+   def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
+                 IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+               [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>,
+                 Requires<[IsThumb2]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{15} = 0;
+   }
+   // register
+   def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>,
+                 Requires<[IsThumb2]> {
+     let isCommutable = Commutable;
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2sTwoRegShiftedReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
+                 IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
+         [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>,
+                 Requires<[IsThumb2]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+   }
+}
+}
+
+/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
+//  rotate operation that produces a value.
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
+   // 5-bit imm
+   def ri : T2sTwoRegShiftImm<
+                 (outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
+                 opc, ".w\t$Rd, $Rm, $imm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-21} = 0b010010;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{5-4} = opcod;
+   }
+   // register
+   def rr : T2sThreeReg<
+                 (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr,
+                 opc, ".w\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
+     let Inst{31-27} = 0b11111;
+     let Inst{26-23} = 0b0100;
+     let Inst{22-21} = opcod;
+     let Inst{15-12} = 0b1111;
+     let Inst{7-4} = 0b0000;
+   }
+
+  // Optional destination register
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+
+  // Assembler aliases w/o the ".w" suffix.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rd, rGPR:$Rn, ty:$imm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+
+  // and with the optional destination operand, too.
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $imm"),
+     (!cast<Instruction>(NAME#"ri") rGPR:$Rdn, rGPR:$Rdn, ty:$imm, pred:$p,
+                                    cc_out:$s)>;
+  def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+     (!cast<Instruction>(NAME#"rr") rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p,
+                                    cc_out:$s)>;
+}
+
+/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to T2I_bin_irs except the instruction does not produce
+/// a explicit result, only implicitly set CPSR.
+multiclass T2I_cmp_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                       PatFrag opnode> {
+let isCompare = 1, Defs = [CPSR] in {
+   // shifted imm
+   def ri : T2OneRegCmpImm<
+                (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
+                opc, ".w\t$Rn, $imm",
+                [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{20} = 1; // The S bit.
+     let Inst{15} = 0;
+     let Inst{11-8} = 0b1111; // Rd
+   }
+   // register
+   def rr : T2TwoRegCmp<
+                (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
+                opc, ".w\t$Rn, $Rm",
+                [(opnode GPRnopc:$Rn, rGPR:$Rm)]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{20} = 1; // The S bit.
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{11-8} = 0b1111; // Rd
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def rs : T2OneRegCmpShiftedReg<
+                (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
+                opc, ".w\t$Rn, $ShiftedRm",
+                [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{20} = 1; // The S bit.
+     let Inst{11-8} = 0b1111; // Rd
+   }
+}
+
+  // Assembler aliases w/o the ".w" suffix.
+  // No alias here for 'rr' version as not all instantiations of this
+  // multiclass want one (CMP in particular, does not).
+  def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"),
+     (!cast<Instruction>(NAME#"ri") GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+  def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"),
+     (!cast<Instruction>(NAME#"rs") GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
+}
+
+/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
+multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
+                  InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+                  PatFrag opnode> {
+  def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> {
+    bits<4> Rt;
+    bits<17> addr;
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = signed;
+    let Inst{23} = 1;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11-0}  = addr{11-0};  // imm
+  }
+  def i8  : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
+                   opc, "\t$Rt, $addr",
+                   [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> {
+    bits<4> Rt;
+    bits<13> addr;
+    let Inst{31-27} = 0b11111;
+    let Inst{26-25} = 0b00;
+    let Inst{24} = signed;
+    let Inst{23} = 0;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{15-12} = Rt;
+    let Inst{11} = 1;
+    // Offset: index==TRUE, wback==FALSE
+    let Inst{10} = 1; // The P bit.
+    let Inst{9}     = addr{8};    // U
+    let Inst{8} = 0; // The W bit.
+    let Inst{7-0}   = addr{7-0};  // imm
+  }
+  def s   : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
+                   opc, ".w\t$Rt, $addr",
+                   [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-25} = 0b00;
+    let Inst{24} = signed;
+    let Inst{23} = 0;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{11-6} = 0b000000;
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<10> addr;
+    let Inst{19-16} = addr{9-6}; // Rn
+    let Inst{3-0}   = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm
+
+    let DecoderMethod = "DecodeT2LoadShift";
+  }
+
+  // pci variant is very similar to i12, but supports negative offsets
+  // from the PC.
+  def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
+    let isReMaterializable = 1;
+    let Inst{31-27} = 0b11111;
+    let Inst{26-25} = 0b00;
+    let Inst{24} = signed;
+    let Inst{23} = ?; // add = (U == '1')
+    let Inst{22-21} = opcod;
+    let Inst{20} = 1; // load
+    let Inst{19-16} = 0b1111; // Rn
+    bits<4> Rt;
+    bits<12> addr;
+    let Inst{15-12} = Rt{3-0};
+    let Inst{11-0}  = addr{11-0};
+  }
+}
+
+/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
+multiclass T2I_st<bits<2> opcod, string opc,
+                  InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+                  PatFrag opnode> {
+  def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii,
+                   opc, ".w\t$Rt, $addr",
+                   [(opnode target:$Rt, t2addrmode_imm12:$addr)]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0001;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 0; // !load
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<17> addr;
+    let addr{12}    = 1;           // add = TRUE
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{23}    = addr{12};    // U
+    let Inst{11-0}  = addr{11-0};  // imm
+  }
+  def i8  : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii,
+                   opc, "\t$Rt, $addr",
+                   [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0000;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 0; // !load
+    let Inst{11} = 1;
+    // Offset: index==TRUE, wback==FALSE
+    let Inst{10} = 1; // The P bit.
+    let Inst{8} = 0; // The W bit.
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<13> addr;
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{9}     = addr{8};    // U
+    let Inst{7-0}   = addr{7-0};  // imm
+  }
+  def s   : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis,
+                   opc, ".w\t$Rt, $addr",
+                   [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0000;
+    let Inst{22-21} = opcod;
+    let Inst{20} = 0; // !load
+    let Inst{11-6} = 0b000000;
+
+    bits<4> Rt;
+    let Inst{15-12} = Rt;
+
+    bits<10> addr;
+    let Inst{19-16}   = addr{9-6}; // Rn
+    let Inst{3-0} = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm
+  }
+}
+
+/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, ".w\t$Rd, $Rm$rot",
+             [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+             Requires<[IsThumb2]> {
+   let Inst{31-27} = 0b11111;
+   let Inst{26-23} = 0b0100;
+   let Inst{22-20} = opcod;
+   let Inst{19-16} = 0b1111; // Rn
+   let Inst{15-12} = 0b1111;
+   let Inst{7} = 1;
+
+   bits<2> rot;
+   let Inst{5-4} = rot{1-0}; // rotate
+}
+
+// UXTB16 - Requres T2ExtractPack, does not need the .w qualifier.
+class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
+             IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+            [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+          Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+// SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern
+// supported yet.
+class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc>
+  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+             opc, "\t$Rd, $Rm$rot", []>,
+          Requires<[IsThumb2, HasT2ExtractPack]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+  : T2ThreeReg<(outs rGPR:$Rd),
+               (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
+             [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>,
+           Requires<[HasT2ExtractPack, IsThumb2]> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+class T2I_exta_rrot_np<bits<3> opcod, string opc>
+  : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
+               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []> {
+  bits<2> rot;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0100;
+  let Inst{22-20} = opcod;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 1;
+  let Inst{5-4} = rot;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : T2XI<oops, iops, itin, asm, pattern> {
+  bits<4> Rd;
+  bits<12> label;
+
+  let Inst{11-8}  = Rd;
+  let Inst{26}    = label{11};
+  let Inst{14-12} = label{10-8};
+  let Inst{7-0}   = label{7-0};
+}
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
+              (ins t2adrlabel:$addr, pred:$p),
+              IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-24} = 0b10;
+  // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
+  let Inst{22} = 0;
+  let Inst{20} = 0;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<13> addr;
+  let Inst{11-8} = Rd;
+  let Inst{23}    = addr{12};
+  let Inst{21}    = addr{12};
+  let Inst{26}    = addr{11};
+  let Inst{14-12} = addr{10-8};
+  let Inst{7-0}   = addr{7-0};
+
+  let DecoderMethod = "DecodeT2Adr";
+}
+
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def t2LEApcrel   : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
+                                4, IIC_iALUi, []>;
+let hasSideEffects = 1 in
+def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
+                                (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                                4, IIC_iALUi,
+                                []>;
+
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+// Load
+let canFoldAsLoad = 1, isReMaterializable = 1  in
+defm t2LDR   : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR,
+                      UnOpFrag<(load node:$Src)>>;
+
+// Loads with zero extension
+defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      rGPR, UnOpFrag<(zextloadi16 node:$Src)>>;
+defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      rGPR, UnOpFrag<(zextloadi8  node:$Src)>>;
+
+// Loads with sign extension
+defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      rGPR, UnOpFrag<(sextloadi16 node:$Src)>>;
+defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
+                      rGPR, UnOpFrag<(sextloadi8  node:$Src)>>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
+// Load doubleword
+def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
+                        (ins t2addrmode_imm8s4:$addr),
+                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>;
+} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
+
+// zextload i1 -> zextload i8
+def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)),
+            (t2LDRBpci  tconstpool:$addr)>;
+
+// extload -> zextload
+// FIXME: Reduce the number of patterns by legalizing extload to zextload
+// earlier?
+def : T2Pat<(extloadi1  t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi1  t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi1  t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi1  (ARMWrapper tconstpool:$addr)),
+            (t2LDRBpci  tconstpool:$addr)>;
+
+def : T2Pat<(extloadi8  t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi8  t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi8  t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi8  (ARMWrapper tconstpool:$addr)),
+            (t2LDRBpci  tconstpool:$addr)>;
+
+def : T2Pat<(extloadi16 t2addrmode_imm12:$addr),
+            (t2LDRHi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_negimm8:$addr),
+            (t2LDRHi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr),
+            (t2LDRHs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
+            (t2LDRHpci  tconstpool:$addr)>;
+
+// FIXME: The destination register of the loads and stores can't be PC, but
+//        can be SP. We need another regclass (similar to rGPR) to represent
+//        that. Not a pressing issue since these are selected manually,
+//        not via pattern.
+
+// Indexed loads
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+def t2LDR_PRE  : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
+                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+
+def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
+                          "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+                            (ins t2addrmode_imm8:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
+                            "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+                            []> {
+  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+                          (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+                          AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+                          "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+} // mayLoad = 1, neverHasSideEffects = 1
+
+// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
+// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
+class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc,
+          "\t$Rt, $addr", []> {
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24} = signed;
+  let Inst{23} = 0;
+  let Inst{22-21} = type;
+  let Inst{20} = 1; // load
+  let Inst{19-16} = addr{12-9};
+  let Inst{15-12} = Rt;
+  let Inst{11} = 1;
+  let Inst{10-8} = 0b110; // PUW.
+  let Inst{7-0} = addr{7-0};
+}
+
+def t2LDRT   : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>;
+def t2LDRBT  : T2IldT<0, 0b00, "ldrbt", IIC_iLoad_bh_i>;
+def t2LDRHT  : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>;
+def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
+def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
+
+// Store
+defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR,
+                   BinOpFrag<(store node:$LHS, node:$RHS)>>;
+defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
+                   rGPR, BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
+                   rGPR, BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+
+// Store doubleword
+let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
+def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
+                       (ins GPR:$Rt, GPR:$Rt2, t2addrmode_imm8s4:$addr),
+               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
+
+// Indexed stores
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def t2STR_PRE  : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins GPRnopc:$Rt, t2addrmode_imm8:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+                            "str", "\t$Rt, $addr!",
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
+}
+def t2STRH_PRE  : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, t2addrmode_imm8:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
+                        "strh", "\t$Rt, $addr!",
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
+}
+
+def t2STRB_PRE  : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, t2addrmode_imm8:$addr),
+                            AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
+                        "strb", "\t$Rt, $addr!",
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
+}
+} // mayStore = 1, neverHasSideEffects = 1
+
+def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins GPRnopc:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
+                          "str", "\t$Rt, $Rn$offset",
+                          "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+             [(set GPRnopc:$Rn_wb,
+                  (post_store GPRnopc:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+                         "strh", "\t$Rt, $Rn$offset",
+                         "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+       [(set GPRnopc:$Rn_wb,
+             (post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
+                            (ins rGPR:$Rt, addr_offset_none:$Rn,
+                                 t2am_imm8_offset:$offset),
+                            AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+                         "strb", "\t$Rt, $Rn$offset",
+                         "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+        [(set GPRnopc:$Rn_wb,
+              (post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn,
+                              t2am_imm8_offset:$offset))]>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand like we represent the instructions themselves. The
+// pseudos map between the two.
+let usesCustomInserter = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+               (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+               4, IIC_iStore_ru,
+      [(set GPRnopc:$Rn_wb,
+            (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+}
+
+// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
+// only.
+// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
+class T2IstT<bits<2> type, string opc, InstrItinClass ii>
+  : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+          "\t$Rt, $addr", []> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24} = 0; // not signed
+  let Inst{23} = 0;
+  let Inst{22-21} = type;
+  let Inst{20} = 0; // store
+  let Inst{11} = 1;
+  let Inst{10-8} = 0b110; // PUW
+
+  bits<4> Rt;
+  bits<13> addr;
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = addr{12-9};
+  let Inst{7-0}   = addr{7-0};
+}
+
+def t2STRT   : T2IstT<0b10, "strt", IIC_iStore_i>;
+def t2STRBT  : T2IstT<0b00, "strbt", IIC_iStore_bh_i>;
+def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
+
+// ldrd / strd pre / post variants
+// For disassembly only.
+
+def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins t2addrmode_imm8s4:$addr), IIC_iLoad_d_ru,
+                 "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+  let AsmMatchConverter = "cvtT2LdrdPre";
+  let DecoderMethod = "DecodeT2LDRDPreInstruction";
+}
+
+def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+                 (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
+                 IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
+
+def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
+                 "$addr.base = $wb", []> {
+  let AsmMatchConverter = "cvtT2StrdPre";
+  let DecoderMethod = "DecodeT2STRDPreInstruction";
+}
+
+def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
+                 (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
+                      t2am_imm8s4_offset:$imm),
+                 IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
+                 "$addr.base = $wb", []>;
+
+// T2Ipl (Preload Data/Instruction) signals the memory system of possible future
+// data/instruction access.
+// instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0),
+// (prefetch 1) -> (preload 2),  (prefetch 2) -> (preload 1).
+multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
+
+  def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc,
+                "\t$addr",
+              [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]> {
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = instr;
+    let Inst{22} = 0;
+    let Inst{21} = write;
+    let Inst{20} = 1;
+    let Inst{15-12} = 0b1111;
+
+    bits<17> addr;
+    let addr{12}    = 1;           // add = TRUE
+    let Inst{19-16} = addr{16-13}; // Rn
+    let Inst{23}    = addr{12};    // U
+    let Inst{11-0}  = addr{11-0};  // imm12
+  }
+
+  def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc,
+                "\t$addr",
+            [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]> {
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = instr;
+    let Inst{23} = 0; // U = 0
+    let Inst{22} = 0;
+    let Inst{21} = write;
+    let Inst{20} = 1;
+    let Inst{15-12} = 0b1111;
+    let Inst{11-8} = 0b1100;
+
+    bits<13> addr;
+    let Inst{19-16} = addr{12-9}; // Rn
+    let Inst{7-0}   = addr{7-0};  // imm8
+  }
+
+  def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc,
+               "\t$addr",
+             [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]> {
+    let Inst{31-25} = 0b1111100;
+    let Inst{24} = instr;
+    let Inst{23} = 0; // add = TRUE for T1
+    let Inst{22} = 0;
+    let Inst{21} = write;
+    let Inst{20} = 1;
+    let Inst{15-12} = 0b1111;
+    let Inst{11-6} = 0000000;
+
+    bits<10> addr;
+    let Inst{19-16} = addr{9-6}; // Rn
+    let Inst{3-0}   = addr{5-2}; // Rm
+    let Inst{5-4}   = addr{1-0}; // imm2
+
+    let DecoderMethod = "DecodeT2LoadShift";
+  }
+  // FIXME: We should have a separate 'pci' variant here. As-is we represent
+  // it via the i12 variant, which it's related to, but that means we can
+  // represent negative immediates, which aren't legal for anything except
+  // the 'pci' case (Rn == 15).
+}
+
+defm t2PLD  : T2Ipl<0, 0, "pld">,  Requires<[IsThumb2]>;
+defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
+defm t2PLI  : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+multiclass thumb2_ld_mult<string asm, InstrItinClass itin,
+                            InstrItinClass itin_upd, bit L_bit> {
+  def IA :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def IA_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def DB :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+  def DB_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15-0}  = regs;
+  }
+}
+
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
+
+multiclass thumb2_st_mult<string asm, InstrItinClass itin,
+                            InstrItinClass itin_upd, bit L_bit> {
+  def IA :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def IA_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b01;     // Increment After
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def DB :
+    T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+         itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 0;        // No writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+  def DB_UPD :
+    T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+          itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    bits<4>  Rn;
+    bits<16> regs;
+
+    let Inst{31-27} = 0b11101;
+    let Inst{26-25} = 0b00;
+    let Inst{24-23} = 0b10;     // Decrement Before
+    let Inst{22}    = 0;
+    let Inst{21}    = 1;        // Writeback
+    let Inst{20}    = L_bit;
+    let Inst{19-16} = Rn;
+    let Inst{15}    = 0;
+    let Inst{14}    = regs{14};
+    let Inst{13}    = 0;
+    let Inst{12-0}  = regs{12-0};
+  }
+}
+
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
+
+} // neverHasSideEffects
+
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+
+let neverHasSideEffects = 1 in
+def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+                   "mov", ".w\t$Rd, $Rm", []> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{14-12} = 0b000;
+  let Inst{7-4} = 0b0000;
+}
+def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                                pred:$p, zero_reg)>;
+def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                                 pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+                                               pred:$p, CPSR)>;
+
+// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+    AddedComplexity = 1 in
+def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
+                   "mov", ".w\t$Rd, $imm",
+                   [(set rGPR:$Rd, t2_so_imm:$imm)]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 0;
+  let Inst{24-21} = 0b0010;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+}
+
+// cc_out is handled as part of the explicit mnemonic in the parser for 'mov'.
+// Use aliases to get that to play nice here.
+def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                pred:$p, CPSR)>;
+
+def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                                 pred:$p, zero_reg)>;
+def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+                                               pred:$p, zero_reg)>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
+def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
+                   "movw", "\t$Rd, $imm",
+                   [(set rGPR:$Rd, imm0_65535:$imm)]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+  let DecoderMethod = "DecodeT2MOVTWInstruction";
+}
+
+def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+
+let Constraints = "$src = $Rd" in {
+def t2MOVTi16 : T2I<(outs rGPR:$Rd),
+                    (ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi,
+                    "movt", "\t$Rd, $imm",
+                    [(set rGPR:$Rd,
+                          (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0110;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+  let DecoderMethod = "DecodeT2MOVTWInstruction";
+}
+
+def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
+                     (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+} // Constraints
+
+def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
+
+//===----------------------------------------------------------------------===//
+//  Extend Instructions.
+//
+
+// Sign extenders
+
+def t2SXTB  : T2I_ext_rrot<0b100, "sxtb",
+                              UnOpFrag<(sext_inreg node:$Src, i8)>>;
+def t2SXTH  : T2I_ext_rrot<0b000, "sxth",
+                              UnOpFrag<(sext_inreg node:$Src, i16)>>;
+def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
+
+def t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+def t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
+
+// Zero extenders
+
+let AddedComplexity = 16 in {
+def t2UXTB   : T2I_ext_rrot<0b101, "uxtb",
+                               UnOpFrag<(and node:$Src, 0x000000FF)>>;
+def t2UXTH   : T2I_ext_rrot<0b001, "uxth",
+                               UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
+                               UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
+//        The transformation should probably be done as a combiner action
+//        instead so we can include a check for masking back in the upper
+//        eight bits of the source into the lower eight bits of the result.
+//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
+//            (t2UXTB16 rGPR:$Src, 3)>,
+//          Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
+            (t2UXTB16 rGPR:$Src, 1)>,
+        Requires<[HasT2ExtractPack, IsThumb2]>;
+
+def t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
+                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
+                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+defm t2ADD  : T2I_bin_ii12rs<0b000, "add",
+                             BinOpFrag<(add  node:$LHS, node:$RHS)>, 1>;
+defm t2SUB  : T2I_bin_ii12rs<0b101, "sub",
+                             BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
+//
+// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+defm t2ADDS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi,
+                             BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
+defm t2SUBS : T2I_bin_s_irs <IIC_iALUi, IIC_iALUr, IIC_iALUsi,
+                             BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+
+let hasPostISelHook = 1 in {
+defm t2ADC  : T2I_adde_sube_irs<0b1010, "adc",
+              BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
+defm t2SBC  : T2I_adde_sube_irs<0b1011, "sbc",
+              BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+}
+
+// RSB
+defm t2RSB  : T2I_rbin_irs  <0b1110, "rsb",
+                             BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
+defm t2RSBS : T2I_rbin_s_is <BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+// The assume-no-carry-in form uses the negation of the input since add/sub
+// assume opposite meanings of the carry flag (i.e., carry == !borrow).
+// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
+// details.
+// The AddedComplexity preferences the first variant over the others since
+// it can be shrunk to a 16-bit wide encoding, while the others cannot.
+let AddedComplexity = 1 in
+def : T2Pat<(add        GPR:$src, imm1_255_neg:$imm),
+            (t2SUBri    GPR:$src, imm1_255_neg:$imm)>;
+def : T2Pat<(add        GPR:$src, t2_so_imm_neg:$imm),
+            (t2SUBri    GPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(add        GPR:$src, imm0_4095_neg:$imm),
+            (t2SUBri12  GPR:$src, imm0_4095_neg:$imm)>;
+def : T2Pat<(add        GPR:$src, imm0_65535_neg:$imm),
+            (t2SUBrr    GPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+
+let AddedComplexity = 1 in
+def : T2Pat<(ARMaddc    rGPR:$src, imm1_255_neg:$imm),
+            (t2SUBSri   rGPR:$src, imm1_255_neg:$imm)>;
+def : T2Pat<(ARMaddc    rGPR:$src, t2_so_imm_neg:$imm),
+            (t2SUBSri   rGPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(ARMaddc    rGPR:$src, imm0_65535_neg:$imm),
+            (t2SUBSrr   rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+// The with-carry-in form matches bitwise not instead of the negation.
+// Effectively, the inverse interpretation of the carry flag already accounts
+// for part of the negation.
+let AddedComplexity = 1 in
+def : T2Pat<(ARMadde    rGPR:$src, imm0_255_not:$imm, CPSR),
+            (t2SBCri    rGPR:$src, imm0_255_not:$imm)>;
+def : T2Pat<(ARMadde    rGPR:$src, t2_so_imm_not:$imm, CPSR),
+            (t2SBCri    rGPR:$src, t2_so_imm_not:$imm)>;
+def : T2Pat<(ARMadde    rGPR:$src, imm0_65535_neg:$imm, CPSR),
+            (t2SBCrr    rGPR:$src, (t2MOVi16 (imm_not_XFORM imm:$imm)))>;
+
+// Select Bytes -- for disassembly only
+
+def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+                NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b010;
+  let Inst{23} = 0b1;
+  let Inst{22-20} = 0b010;
+  let Inst{15-12} = 0b1111;
+  let Inst{7} = 0b1;
+  let Inst{6-4} = 0b000;
+}
+
+// A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned)
+// And Miscellaneous operations -- for disassembly only
+class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc,
+              list<dag> pat = [/* For disassembly only; pattern left blank */],
+              dag iops = (ins rGPR:$Rn, rGPR:$Rm),
+              string asm = "\t$Rd, $Rn, $Rm">
+  : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>,
+    Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0101;
+  let Inst{22-20} = op22_20;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = op7_4;
+
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
+// Saturating add/subtract -- for disassembly only
+
+def t2QADD    : T2I_pam<0b000, 0b1000, "qadd",
+                        [(set rGPR:$Rd, (int_arm_qadd rGPR:$Rn, rGPR:$Rm))],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QADD16  : T2I_pam<0b001, 0b0001, "qadd16">;
+def t2QADD8   : T2I_pam<0b000, 0b0001, "qadd8">;
+def t2QASX    : T2I_pam<0b010, 0b0001, "qasx">;
+def t2QDADD   : T2I_pam<0b000, 0b1001, "qdadd", [],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QDSUB   : T2I_pam<0b000, 0b1011, "qdsub", [],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QSAX    : T2I_pam<0b110, 0b0001, "qsax">;
+def t2QSUB    : T2I_pam<0b000, 0b1010, "qsub",
+                        [(set rGPR:$Rd, (int_arm_qsub rGPR:$Rn, rGPR:$Rm))],
+                        (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+def t2QSUB16  : T2I_pam<0b101, 0b0001, "qsub16">;
+def t2QSUB8   : T2I_pam<0b100, 0b0001, "qsub8">;
+def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">;
+def t2UQADD8  : T2I_pam<0b000, 0b0101, "uqadd8">;
+def t2UQASX   : T2I_pam<0b010, 0b0101, "uqasx">;
+def t2UQSAX   : T2I_pam<0b110, 0b0101, "uqsax">;
+def t2UQSUB16 : T2I_pam<0b101, 0b0101, "uqsub16">;
+def t2UQSUB8  : T2I_pam<0b100, 0b0101, "uqsub8">;
+
+// Signed/Unsigned add/subtract -- for disassembly only
+
+def t2SASX    : T2I_pam<0b010, 0b0000, "sasx">;
+def t2SADD16  : T2I_pam<0b001, 0b0000, "sadd16">;
+def t2SADD8   : T2I_pam<0b000, 0b0000, "sadd8">;
+def t2SSAX    : T2I_pam<0b110, 0b0000, "ssax">;
+def t2SSUB16  : T2I_pam<0b101, 0b0000, "ssub16">;
+def t2SSUB8   : T2I_pam<0b100, 0b0000, "ssub8">;
+def t2UASX    : T2I_pam<0b010, 0b0100, "uasx">;
+def t2UADD16  : T2I_pam<0b001, 0b0100, "uadd16">;
+def t2UADD8   : T2I_pam<0b000, 0b0100, "uadd8">;
+def t2USAX    : T2I_pam<0b110, 0b0100, "usax">;
+def t2USUB16  : T2I_pam<0b101, 0b0100, "usub16">;
+def t2USUB8   : T2I_pam<0b100, 0b0100, "usub8">;
+
+// Signed/Unsigned halving add/subtract -- for disassembly only
+
+def t2SHASX   : T2I_pam<0b010, 0b0010, "shasx">;
+def t2SHADD16 : T2I_pam<0b001, 0b0010, "shadd16">;
+def t2SHADD8  : T2I_pam<0b000, 0b0010, "shadd8">;
+def t2SHSAX   : T2I_pam<0b110, 0b0010, "shsax">;
+def t2SHSUB16 : T2I_pam<0b101, 0b0010, "shsub16">;
+def t2SHSUB8  : T2I_pam<0b100, 0b0010, "shsub8">;
+def t2UHASX   : T2I_pam<0b010, 0b0110, "uhasx">;
+def t2UHADD16 : T2I_pam<0b001, 0b0110, "uhadd16">;
+def t2UHADD8  : T2I_pam<0b000, 0b0110, "uhadd8">;
+def t2UHSAX   : T2I_pam<0b110, 0b0110, "uhsax">;
+def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">;
+def t2UHSUB8  : T2I_pam<0b100, 0b0110, "uhsub8">;
+
+// Helper class for disassembly only
+// A6.3.16 & A6.3.17
+// T2Imac - Thumb2 multiply [accumulate, and absolute difference] instructions.
+class T2ThreeReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+  dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b011;
+  let Inst{23}    = long;
+  let Inst{22-20} = op22_20;
+  let Inst{7-4}   = op7_4;
+}
+
+class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
+  dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2FourReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-24} = 0b011;
+  let Inst{23}    = long;
+  let Inst{22-20} = op22_20;
+  let Inst{7-4}   = op7_4;
+}
+
+// Unsigned Sum of Absolute Differences [and Accumulate].
+def t2USAD8   : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+                                           (ins rGPR:$Rn, rGPR:$Rm),
+                        NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2USADA8  : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
+                       (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary,
+                        "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+
+// Signed/Unsigned saturate.
+class T2SatI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<5> sat_imm;
+  bits<7> sh;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{4-0}   = sat_imm;
+  let Inst{21}    = sh{5};
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+def t2SSAT: T2SatI<
+              (outs rGPR:$Rd),
+              (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+              NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-22} = 0b1100;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+  let Inst{5}  = 0;
+}
+
+def t2SSAT16: T2SatI<
+                (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
+                "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-22} = 0b1100;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+  let Inst{21} = 1;        // sh = '1'
+  let Inst{14-12} = 0b000; // imm3 = '000'
+  let Inst{7-6} = 0b00;    // imm2 = '00'
+  let Inst{5-4} = 0b00;
+}
+
+def t2USAT: T2SatI<
+               (outs rGPR:$Rd),
+               (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+                NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25-22} = 0b1110;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+}
+
+def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
+                     NoItinerary,
+                     "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-22} = 0b1111001110;
+  let Inst{20} = 0;
+  let Inst{15} = 0;
+  let Inst{21} = 1;        // sh = '1'
+  let Inst{14-12} = 0b000; // imm3 = '000'
+  let Inst{7-6} = 0b00;    // imm2 = '00'
+  let Inst{5-4} = 0b00;
+}
+
+def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>;
+def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>;
+
+//===----------------------------------------------------------------------===//
+//  Shift and rotate Instructions.
+//
+
+defm t2LSL  : T2I_sh_ir<0b00, "lsl", imm0_31,
+                        BinOpFrag<(shl  node:$LHS, node:$RHS)>>;
+defm t2LSR  : T2I_sh_ir<0b01, "lsr", imm_sr,
+                        BinOpFrag<(srl  node:$LHS, node:$RHS)>>;
+defm t2ASR  : T2I_sh_ir<0b10, "asr", imm_sr,
+                        BinOpFrag<(sra  node:$LHS, node:$RHS)>>;
+defm t2ROR  : T2I_sh_ir<0b11, "ror", imm0_31,
+                        BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
+
+// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
+def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
+            (t2RORrr rGPR:$lhs, rGPR:$rhs)>;
+
+let Uses = [CPSR] in {
+def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                   "rrx", "\t$Rd, $Rm",
+                   [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{14-12} = 0b000;
+  let Inst{7-4} = 0b0011;
+}
+}
+
+let isCodeGenOnly = 1, Defs = [CPSR] in {
+def t2MOVsrl_flag : T2TwoRegShiftImm<
+                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                        "lsrs", ".w\t$Rd, $Rm, #1",
+                        [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 1; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = 0b01; // Shift type.
+  // Shift amount = Inst{14-12:7-6} = 1.
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+def t2MOVsra_flag : T2TwoRegShiftImm<
+                        (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
+                        "asrs", ".w\t$Rd, $Rm, #1",
+                        [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 1; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = 0b10; // Shift type.
+  // Shift amount = Inst{14-12:7-6} = 1.
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+}
+
+//===----------------------------------------------------------------------===//
+//  Bitwise Instructions.
+//
+
+defm t2AND  : T2I_bin_w_irs<0b0000, "and",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                            BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm t2ORR  : T2I_bin_w_irs<0b0010, "orr",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                            BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
+defm t2EOR  : T2I_bin_w_irs<0b0100, "eor",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                            BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+
+defm t2BIC  : T2I_bin_w_irs<0b0001, "bic",
+                            IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                            BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+class T2BitFI<dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+    : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  bits<5> msb;
+  bits<5> lsb;
+
+  let Inst{11-8}  = Rd;
+  let Inst{4-0}   = msb{4-0};
+  let Inst{14-12} = lsb{4-2};
+  let Inst{7-6}   = lsb{1-0};
+}
+
+class T2TwoRegBitFI<dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+    : T2BitFI<oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rn;
+
+  let Inst{19-16} = Rn;
+}
+
+let Constraints = "$src = $Rd" in
+def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
+                IIC_iUNAsi, "bfc", "\t$Rd, $imm",
+                [(set rGPR:$Rd, (and rGPR:$src, bf_inv_mask_imm:$imm))]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26} = 0; // should be 0.
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10110;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+  let Inst{5} = 0; // should be 0.
+
+  bits<10> imm;
+  let msb{4-0} = imm{9-5};
+  let lsb{4-0} = imm{4-0};
+}
+
+def t2SBFX: T2TwoRegBitFI<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
+                 IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10100;
+  let Inst{15} = 0;
+}
+
+def t2UBFX: T2TwoRegBitFI<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
+                 IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b11100;
+  let Inst{15} = 0;
+}
+
+// A8.6.18  BFI - Bitfield insert (Encoding T1)
+let Constraints = "$src = $Rd" in {
+  def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd),
+                  (ins rGPR:$src, rGPR:$Rn, bf_inv_mask_imm:$imm),
+                  IIC_iBITi, "bfi", "\t$Rd, $Rn, $imm",
+                  [(set rGPR:$Rd, (ARMbfi rGPR:$src, rGPR:$Rn,
+                                   bf_inv_mask_imm:$imm))]> {
+    let Inst{31-27} = 0b11110;
+    let Inst{26} = 0; // should be 0.
+    let Inst{25} = 1;
+    let Inst{24-20} = 0b10110;
+    let Inst{15} = 0;
+    let Inst{5} = 0; // should be 0.
+
+    bits<10> imm;
+    let msb{4-0} = imm{9-5};
+    let lsb{4-0} = imm{4-0};
+  }
+}
+
+defm t2ORN  : T2I_bin_irs<0b0011, "orn",
+                          IIC_iBITi, IIC_iBITr, IIC_iBITsi,
+                          BinOpFrag<(or node:$LHS, (not node:$RHS))>, 0, "">;
+
+/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// unary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_un_irs<bits<4> opcod, string opc,
+                     InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+                      PatFrag opnode,
+                      bit Cheap = 0, bit ReMat = 0, bit MoveImm = 0> {
+   // shifted imm
+   def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
+                opc, "\t$Rd, $imm",
+                [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
+     let isAsCheapAsAMove = Cheap;
+     let isReMaterializable = ReMat;
+     let isMoveImm = MoveImm;
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{15} = 0;
+   }
+   // register
+   def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
+                opc, ".w\t$Rd, $Rm",
+                [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
+                opc, ".w\t$Rd, $ShiftedRm",
+                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{19-16} = 0b1111; // Rn
+   }
+}
+
+// Prefer over of t2EORri ra, rb, -1 because mvn has 16-bit version
+let AddedComplexity = 1 in
+defm t2MVN  : T2I_un_irs <0b0011, "mvn",
+                          IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
+                          UnOpFrag<(not node:$Src)>, 1, 1, 1>;
+
+let AddedComplexity = 1 in
+def : T2Pat<(and     rGPR:$src, t2_so_imm_not:$imm),
+            (t2BICri rGPR:$src, t2_so_imm_not:$imm)>;
+
+// top16Zero - answer true if the upper 16 bits of $src are 0, false otherwise
+def top16Zero: PatLeaf<(i32 rGPR:$src), [{
+  return CurDAG->MaskedValueIsZero(SDValue(N,0), APInt::getHighBitsSet(32, 16));
+  }]>;
+
+// so_imm_notSext is needed instead of so_imm_not, as the value of imm
+// will match the extended, not the original bitWidth for $src.
+def : T2Pat<(and top16Zero:$src, t2_so_imm_notSext:$imm),
+            (t2BICri rGPR:$src, t2_so_imm_notSext:$imm)>;
+
+
+// FIXME: Disable this pattern on Darwin to workaround an assembler bug.
+def : T2Pat<(or      rGPR:$src, t2_so_imm_not:$imm),
+            (t2ORNri rGPR:$src, t2_so_imm_not:$imm)>,
+            Requires<[IsThumb2]>;
+
+def : T2Pat<(t2_so_imm_not:$src),
+            (t2MVNi t2_so_imm_not:$src)>;
+
+//===----------------------------------------------------------------------===//
+//  Multiply Instructions.
+//
+let isCommutable = 1 in
+def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                "mul", "\t$Rd, $Rn, $Rm",
+                [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = 0b0000; // Multiply
+}
+
+def t2MLA: T2FourReg<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "mla", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]>,
+           Requires<[IsThumb2, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{7-4} = 0b0000; // Multiply
+}
+
+def t2MLS: T2FourReg<
+                (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "mls", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]>,
+           Requires<[IsThumb2, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{7-4} = 0b0001; // Multiply and Subtract
+}
+
+// Extra precision multiplies with low / high results
+let neverHasSideEffects = 1 in {
+let isCommutable = 1 in {
+def t2SMULL : T2MulLong<0b000, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
+                   "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+
+def t2UMULL : T2MulLong<0b010, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
+                   "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+} // isCommutable
+
+// Multiply + accumulate
+def t2SMLAL : T2MlaLong<0b100, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+                  "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                  RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
+
+def t2UMLAL : T2MlaLong<0b110, 0b0000,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+                  "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+                  RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
+
+def t2UMAAL : T2MulLong<0b110, 0b0110,
+                  (outs rGPR:$RdLo, rGPR:$RdHi),
+                  (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
+                  "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+} // neverHasSideEffects
+
+// Rounding variants of the below included for disassembly only
+
+// Most significant word multiply
+def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                  "smmul", "\t$Rd, $Rn, $Rm",
+                  [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
+}
+
+def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
+                  "smmulr", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
+}
+
+def t2SMMLA : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "smmla", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>,
+              Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
+}
+
+def t2SMMLAR: T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                  "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
+}
+
+def t2SMMLS: T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "smmls", "\t$Rd, $Rn, $Rm, $Ra",
+                [(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>,
+             Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b110;
+  let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
+}
+
+def t2SMMLSR:T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
+                "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b110;
+  let Inst{7-4} = 0b0001; // Rounding (Inst{4} = 1)
+}
+
+multiclass T2I_smul<string opc, PatFrag opnode> {
+  def BB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
+                                      (sext_inreg rGPR:$Rm, i16)))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def BT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16),
+                                      (sra rGPR:$Rm, (i32 16))))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+
+  def TB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)),
+                                      (sext_inreg rGPR:$Rm, i16)))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b10;
+  }
+
+  def TT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)),
+                                      (sra rGPR:$Rm, (i32 16))))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b11;
+  }
+
+  def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (sra (opnode rGPR:$Rn,
+                                    (sext_inreg rGPR:$Rm, i16)), (i32 16)))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16,
+              !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
+              [(set rGPR:$Rd, (sra (opnode rGPR:$Rn,
+                                    (sra rGPR:$Rm, (i32 16))), (i32 16)))]>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+}
+
+
+multiclass T2I_smla<string opc, PatFrag opnode> {
+  def BB : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra,
+                               (opnode (sext_inreg rGPR:$Rn, i16),
+                                       (sext_inreg rGPR:$Rm, i16))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def BT : T2FourReg<
+       (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+             !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16),
+                                                 (sra rGPR:$Rm, (i32 16)))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+
+  def TB : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
+                                               (sext_inreg rGPR:$Rm, i16))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b10;
+  }
+
+  def TT : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
+                                                 (sra rGPR:$Rm, (i32 16)))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b11;
+  }
+
+  def WB : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
+                                    (sext_inreg rGPR:$Rm, i16)), (i32 16))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def WT : T2FourReg<
+        (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16,
+              !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
+              [(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
+                                      (sra rGPR:$Rm, (i32 16))), (i32 16))))]>,
+           Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+}
+
+defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+
+// Halfword multiple accumulate long: SMLAL<x><y>
+def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm",
+           [/* For disassembly only; pattern left blank */]>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm",
+           [/* For disassembly only; pattern left blank */]>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm",
+           [/* For disassembly only; pattern left blank */]>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd),
+         (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm",
+           [/* For disassembly only; pattern left blank */]>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+
+// Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+def t2SMUAD: T2ThreeReg_mac<
+            0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2SMUADX:T2ThreeReg_mac<
+            0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2SMUSD: T2ThreeReg_mac<
+            0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2SMUSDX:T2ThreeReg_mac<
+            0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
+            IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]> {
+  let Inst{15-12} = 0b1111;
+}
+def t2SMLAD   : T2FourReg_mac<
+            0, 0b010, 0b0000, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad",
+            "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLADX  : T2FourReg_mac<
+            0, 0b010, 0b0001, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx",
+            "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLSD   : T2FourReg_mac<0, 0b100, 0b0000, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd",
+            "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLSDX  : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd),
+            (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx",
+            "\t$Rd, $Rn, $Rm, $Ra", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALD  : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLSLD  : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
+                        (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx",
+                        "\t$Ra, $Rd, $Rn, $Rm", []>,
+          Requires<[IsThumb2, HasThumb2DSP]>;
+
+//===----------------------------------------------------------------------===//
+//  Division Instructions.
+//  Signed and unsigned division on v7-M
+//
+def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
+                 "sdiv", "\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
+                 Requires<[HasDivide, IsThumb2]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b011100;
+  let Inst{20} = 0b1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = 0b1111;
+}
+
+def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
+                 "udiv", "\t$Rd, $Rn, $Rm",
+                 [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
+                 Requires<[HasDivide, IsThumb2]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b011101;
+  let Inst{20} = 0b1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-4} = 0b1111;
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc. Arithmetic Instructions.
+//
+
+class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops,
+      InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : T2ThreeReg<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-22} = 0b01010;
+  let Inst{21-20} = op1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-6} = 0b10;
+  let Inst{5-4} = op2;
+  let Rn{3-0} = Rm;
+}
+
+def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                    "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>;
+
+def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                      "rbit", "\t$Rd, $Rm",
+                      [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>;
+
+def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                 "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>;
+
+def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                       "rev16", ".w\t$Rd, $Rm",
+                [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>;
+
+def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
+                       "revsh", ".w\t$Rd, $Rm",
+                 [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>;
+
+def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)),
+                (and (srl rGPR:$Rm, (i32 8)), 0xFF)),
+            (t2REVSH rGPR:$Rm)>;
+
+def t2PKHBT : T2ThreeReg<
+            (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_lsl_amt:$sh),
+                  IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
+                  [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
+                                      (and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
+                                           0xFFFF0000)))]>,
+                  Requires<[HasT2ExtractPack, IsThumb2]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = 0b01100;
+  let Inst{5} = 0; // BT form
+  let Inst{4} = 0;
+
+  bits<5> sh;
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
+            (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
+            (t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
+// will match the pattern below.
+def t2PKHTB : T2ThreeReg<
+                  (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_asr_amt:$sh),
+                  IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
+                  [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
+                                       (and (sra rGPR:$Rm, pkh_asr_amt:$sh),
+                                            0xFFFF)))]>,
+                  Requires<[HasT2ExtractPack, IsThumb2]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = 0b01100;
+  let Inst{5} = 1; // TB form
+  let Inst{4} = 0;
+
+  bits<5> sh;
+  let Inst{14-12} = sh{4-2};
+  let Inst{7-6}   = sh{1-0};
+}
+
+// Alternate cases for PKHTB where identities eliminate some nodes.  Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16_31:$sh)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
+                (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+defm t2CMP  : T2I_cmp_irs<0b1101, "cmp",
+                          IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
+                          BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, t2_so_imm:$imm),
+            (t2CMPri  GPRnopc:$lhs, t2_so_imm:$imm)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, rGPR:$rhs),
+            (t2CMPrr  GPRnopc:$lhs, rGPR:$rhs)>;
+def : T2Pat<(ARMcmpZ  GPRnopc:$lhs, t2_so_reg:$rhs),
+            (t2CMPrs  GPRnopc:$lhs, t2_so_reg:$rhs)>;
+
+let isCompare = 1, Defs = [CPSR] in {
+   // shifted imm
+   def t2CMNri : T2OneRegCmpImm<
+                (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi,
+                "cmn", ".w\t$Rn, $imm",
+                [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]> {
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = 0b1000;
+     let Inst{20} = 1; // The S bit.
+     let Inst{15} = 0;
+     let Inst{11-8} = 0b1111; // Rd
+   }
+   // register
+   def t2CMNzrr : T2TwoRegCmp<
+                (outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr,
+                "cmn", ".w\t$Rn, $Rm",
+                [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                  GPRnopc:$Rn, rGPR:$Rm)]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = 0b1000;
+     let Inst{20} = 1; // The S bit.
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{11-8} = 0b1111; // Rd
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register
+   def t2CMNzrs : T2OneRegCmpShiftedReg<
+                (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi,
+                "cmn", ".w\t$Rn, $ShiftedRm",
+                [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
+                  GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = 0b1000;
+     let Inst{20} = 1; // The S bit.
+     let Inst{11-8} = 0b1111; // Rd
+   }
+}
+
+// Assembler aliases w/o the ".w" suffix.
+// No alias here for 'rr' version as not all instantiations of this multiclass
+// want one (CMP in particular, does not).
+def : t2InstAlias<"cmn${p} $Rn, $imm",
+   (t2CMNri GPRnopc:$Rn, t2_so_imm:$imm, pred:$p)>;
+def : t2InstAlias<"cmn${p} $Rn, $shift",
+   (t2CMNzrs GPRnopc:$Rn, t2_so_reg:$shift, pred:$p)>;
+
+def : T2Pat<(ARMcmp  GPR:$src, t2_so_imm_neg:$imm),
+            (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>;
+
+def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm),
+            (t2CMNri GPRnopc:$src, t2_so_imm_neg:$imm)>;
+
+defm t2TST  : T2I_cmp_irs<0b0000, "tst",
+                          IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+                         BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>;
+defm t2TEQ  : T2I_cmp_irs<0b0100, "teq",
+                          IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
+                         BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
+
+// Conditional moves
+// FIXME: should be able to write a pattern for ARMcmov, but can't use
+// a two-value operand where a dag node expects two operands. :(
+let neverHasSideEffects = 1 in {
+
+let isCommutable = 1, isSelect = 1 in
+def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd),
+                            (ins rGPR:$false, rGPR:$Rm, pred:$p),
+                            4, IIC_iCMOVr,
+   [/*(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $Rd">;
+
+let isMoveImm = 1 in
+def t2MOVCCi : t2PseudoInst<(outs rGPR:$Rd),
+                            (ins rGPR:$false, t2_so_imm:$imm, pred:$p),
+                   4, IIC_iCMOVi,
+[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
+                   RegConstraint<"$false = $Rd">;
+
+// FIXME: Pseudo-ize these. For now, just mark codegen only.
+let isCodeGenOnly = 1 in {
+let isMoveImm = 1 in
+def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, imm0_65535_expr:$imm),
+                      IIC_iCMOVi,
+                      "movw", "\t$Rd, $imm", []>,
+                      RegConstraint<"$false = $Rd"> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+
+  bits<4> Rd;
+  bits<16> imm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = imm{15-12};
+  let Inst{26}    = imm{11};
+  let Inst{14-12} = imm{10-8};
+  let Inst{7-0}   = imm{7-0};
+}
+
+let isMoveImm = 1 in
+def t2MOVCCi32imm : PseudoInst<(outs rGPR:$dst),
+                               (ins rGPR:$false, i32imm:$src, pred:$p),
+                    IIC_iCMOVix2, []>, RegConstraint<"$false = $dst">;
+
+let isMoveImm = 1 in
+def t2MVNCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm),
+                   IIC_iCMOVi, "mvn", "\t$Rd, $imm",
+[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm_not:$imm,
+                   imm:$cc, CCR:$ccr))*/]>,
+                   RegConstraint<"$false = $Rd"> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 0;
+  let Inst{24-21} = 0b0011;
+  let Inst{20} = 0; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+}
+
+class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+  : T2TwoRegShiftImm<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = opcod; // Shift type.
+}
+def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$Rd),
+                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
+                             IIC_iCMOVsi, "lsl", ".w\t$Rd, $Rm, $imm", []>,
+                 RegConstraint<"$false = $Rd">;
+def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$Rd),
+                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
+                             IIC_iCMOVsi, "lsr", ".w\t$Rd, $Rm, $imm", []>,
+                 RegConstraint<"$false = $Rd">;
+def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$Rd),
+                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
+                             IIC_iCMOVsi, "asr", ".w\t$Rd, $Rm, $imm", []>,
+                 RegConstraint<"$false = $Rd">;
+def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
+                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
+                             IIC_iCMOVsi, "ror", ".w\t$Rd, $Rm, $imm", []>,
+                 RegConstraint<"$false = $Rd">;
+} // isCodeGenOnly = 1
+
+} // neverHasSideEffects
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+// memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def t2DMB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
+                  "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+                  Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f5;
+  let Inst{3-0} = opt;
+}
+}
+
+def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
+                  "dsb", "\t$opt", []>,
+                  Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f4;
+  let Inst{3-0} = opt;
+}
+
+def t2ISB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
+                  "isb", "\t$opt",
+                  []>, Requires<[IsThumb, HasDB]> {
+  bits<4> opt;
+  let Inst{31-4} = 0xf3bf8f6;
+  let Inst{3-0} = opt;
+}
+
+class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
+                InstrItinClass itin, string opc, string asm, string cstr,
+                list<dag> pattern, bits<4> rt2 = 0b1111>
+  : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{11-8} = rt2;
+  let Inst{7-6} = 0b01;
+  let Inst{5-4} = opcod;
+  let Inst{3-0} = 0b1111;
+
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
+                InstrItinClass itin, string opc, string asm, string cstr,
+                list<dag> pattern, bits<4> rt2 = 0b1111>
+  : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{11-8} = rt2;
+  let Inst{7-6} = 0b01;
+  let Inst{5-4} = opcod;
+
+  bits<4> Rd;
+  bits<4> addr;
+  bits<4> Rt;
+  let Inst{3-0}  = Rd;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+let mayLoad = 1 in {
+def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldrexb", "\t$Rt, $addr", "", []>;
+def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldrexh", "\t$Rt, $addr", "", []>;
+def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
+                       AddrModeNone, 4, NoItinerary,
+                       "ldrex", "\t$Rt, $addr", "", []> {
+  bits<4> Rt;
+  bits<12> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0000101;
+  let Inst{19-16} = addr{11-8};
+  let Inst{15-12} = Rt;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-0} = addr{7-0};
+}
+let hasExtraDefRegAllocReq = 1 in
+def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
+                         (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldrexd", "\t$Rt, $Rt2, $addr", "",
+                         [], {?, ?, ?, ?}> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
+def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "strexb", "\t$Rd, $Rt, $addr", "", []>;
+def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "strexh", "\t$Rd, $Rt, $addr", "", []>;
+def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+                             t2addrmode_imm0_1020s4:$addr),
+                  AddrModeNone, 4, NoItinerary,
+                  "strex", "\t$Rd, $Rt, $addr", "",
+                  []> {
+  bits<4> Rd;
+  bits<4> Rt;
+  bits<12> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0000100;
+  let Inst{19-16} = addr{11-8};
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = Rd;
+  let Inst{7-0} = addr{7-0};
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+                         {?, ?, ?, ?}> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
+}
+
+def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", []>,
+            Requires<[IsThumb2, HasV7]>  {
+  let Inst{31-16} = 0xf3bf;
+  let Inst{15-14} = 0b10;
+  let Inst{13} = 0;
+  let Inst{12} = 0;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-4} = 0b0010;
+  let Inst{3-0} = 0b1111;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved resgisters, which is exactly what we want.
+//   $val is a scratch register for our use.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR,
+    Q0, Q1, Q2, Q3, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+  usesCustomInserter = 1 in {
+  def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+                               AddrModeNone, 0, NoItinerary, "", "",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+                             Requires<[IsThumb2, HasVFP2]>;
+}
+
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR, CPSR ],
+  hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+  usesCustomInserter = 1 in {
+  def t2Int_eh_sjlj_setjmp_nofp : Thumb2XI<(outs), (ins tGPR:$src, tGPR:$val),
+                               AddrModeNone, 0, NoItinerary, "", "",
+                          [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>,
+                                  Requires<[IsThumb2, NoVFP]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Control-Flow Instructions
+//
+
+// FIXME: remove when we have a way to marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1, isCodeGenOnly = 1 in
+def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
+                                                   reglist:$regs, variable_ops),
+                              4, IIC_iLoad_mBr, [],
+            (t2LDMIA_UPD GPR:$wb, GPR:$Rn, pred:$p, reglist:$regs)>,
+                         RegConstraint<"$Rn = $wb">;
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+let isPredicable = 1 in
+def t2B   : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
+                 "b", ".w\t$target",
+                 [(br bb:$target)]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 1;
+
+  bits<24> target;
+  let Inst{26} = target{19};
+  let Inst{11} = target{18};
+  let Inst{13} = target{17};
+  let Inst{25-16} = target{20-11};
+  let Inst{10-0} = target{10-0};
+  let DecoderMethod = "DecodeT2BInstruction";
+}
+
+let isNotDuplicable = 1, isIndirectBranch = 1 in {
+def t2BR_JT : t2PseudoInst<(outs),
+          (ins GPR:$target, GPR:$index, i32imm:$jt, i32imm:$id),
+           0, IIC_Br,
+          [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>;
+
+// FIXME: Add a non-pc based case that can be predicated.
+def t2TBB_JT : t2PseudoInst<(outs),
+        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
+
+def t2TBH_JT : t2PseudoInst<(outs),
+        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
+
+def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
+                    "tbb", "\t$addr", []> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 0; // B form
+  let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
+}
+
+def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
+                   "tbh", "\t$addr", []> {
+  bits<4> Rn;
+  bits<4> Rm;
+  let Inst{31-20} = 0b111010001101;
+  let Inst{19-16} = Rn;
+  let Inst{15-5} = 0b11110000000;
+  let Inst{4} = 1; // H form
+  let Inst{3-0} = Rm;
+
+  let DecoderMethod = "DecodeThumbTableBranch";
+}
+} // isNotDuplicable, isIndirectBranch
+
+} // isBranch, isTerminator, isBarrier
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects ", "two operands. :(
+let isBranch = 1, isTerminator = 1 in
+def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
+                "b", ".w\t$target",
+                [/*(ARMbrcond bb:$target, imm:$cc)*/]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 0;
+
+  bits<4> p;
+  let Inst{25-22} = p;
+
+  bits<21> target;
+  let Inst{26} = target{20};
+  let Inst{11} = target{19};
+  let Inst{13} = target{18};
+  let Inst{21-16} = target{17-12};
+  let Inst{10-0} = target{11-1};
+
+  let DecoderMethod = "DecodeThumb2BCCInstruction";
+}
+
+// Tail calls. The IOS version of thumb tail calls uses a t2 branch, so
+// it goes here.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  // IOS version.
+  let Uses = [SP] in
+  def tTAILJMPd: tPseudoExpand<(outs),
+                   (ins uncondbrtarget:$dst, pred:$p),
+                   4, IIC_Br, [],
+                   (t2B uncondbrtarget:$dst, pred:$p)>,
+                 Requires<[IsThumb2, IsIOS]>;
+}
+
+// IT block
+let Defs = [ITSTATE] in
+def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
+                    AddrModeNone, 2,  IIC_iALUx,
+                    "it$mask\t$cc", "", []> {
+  // 16-bit instruction.
+  let Inst{31-16} = 0x0000;
+  let Inst{15-8} = 0b10111111;
+
+  bits<4> cc;
+  bits<4> mask;
+  let Inst{7-4} = cc;
+  let Inst{3-0} = mask;
+
+  let DecoderMethod = "DecodeIT";
+}
+
+// Branch and Exchange Jazelle -- for disassembly only
+// Rm = Inst{19-16}
+def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []> {
+  bits<4> func;
+  let Inst{31-27} = 0b11110;
+  let Inst{26} = 0;
+  let Inst{25-20} = 0b111100;
+  let Inst{19-16} = func;
+  let Inst{15-0} = 0b1000111100000000;
+}
+
+// Compare and branch on zero / non-zero
+let isBranch = 1, isTerminator = 1 in {
+  def tCBZ  : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+                  "cbz\t$Rn, $target", []>,
+              T1Misc<{0,0,?,1,?,?,?}>,
+              Requires<[IsThumb2]> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
+
+  def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+                  "cbnz\t$Rn, $target", []>,
+              T1Misc<{1,0,?,1,?,?,?}>,
+              Requires<[IsThumb2]> {
+    // A8.6.27
+    bits<6> target;
+    bits<3> Rn;
+    let Inst{9}   = target{5};
+    let Inst{7-3} = target{4-0};
+    let Inst{2-0} = Rn;
+  }
+}
+
+
+// Change Processor State is a system instruction.
+// FIXME: Since the asm parser has currently no clean way to handle optional
+// operands, create 3 versions of the same instruction. Once there's a clean
+// framework to represent optional operands, change this behavior.
+class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
+            !strconcat("cps", asm_op), []> {
+  bits<2> imod;
+  bits<3> iflags;
+  bits<5> mode;
+  bit M;
+
+  let Inst{31-27} = 0b11110;
+  let Inst{26}    = 0;
+  let Inst{25-20} = 0b111010;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-14} = 0b10;
+  let Inst{12}    = 0;
+  let Inst{10-9}  = imod;
+  let Inst{8}     = M;
+  let Inst{7-5}   = iflags;
+  let Inst{4-0}   = mode;
+  let DecoderMethod = "DecodeT2CPSInstruction";
+}
+
+let M = 1 in
+  def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
+                      "$imod.w\t$iflags, $mode">;
+let mode = 0, M = 0 in
+  def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
+                      "$imod.w\t$iflags">;
+let imod = 0, iflags = 0, M = 1 in
+  def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">;
+
+// A6.3.4 Branches and miscellaneous control
+// Table A6-14 Change Processor State, and hint instructions
+def t2HINT : T2I<(outs), (ins imm0_255:$imm), NoItinerary, "hint", "\t$imm",[]>{
+  bits<8> imm;
+  let Inst{31-8} = 0b111100111010111110000000;
+  let Inst{7-0} = imm;
+}
+
+def : t2InstAlias<"hint$p.w $imm", (t2HINT imm0_255:$imm, pred:$p)>;
+def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>;
+def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>;
+def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>;
+def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>;
+def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>;
+
+def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
+  bits<4> opt;
+  let Inst{31-20} = 0b111100111010;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-8} = 0b10000000;
+  let Inst{7-4} = 0b1111;
+  let Inst{3-0} = opt;
+}
+
+// Secure Monitor Call is a system instruction.
+// Option = Inst{19-16}
+def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26-20} = 0b1111111;
+  let Inst{15-12} = 0b1000;
+
+  bits<4> opt;
+  let Inst{19-16} = opt;
+}
+
+class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  bits<5> mode;
+  let Inst{31-25} = 0b1110100;
+  let Inst{24-23} = Op;
+  let Inst{22} = 0;
+  let Inst{21} = W;
+  let Inst{20-16} = 0b01101;
+  let Inst{15-5} = 0b11000000000;
+  let Inst{4-0} = mode{4-0};
+}
+
+// Store Return State is a system instruction.
+def t2SRSDB_UPD : T2SRS<0b00, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+                        "srsdb", "\tsp!, $mode", []>;
+def t2SRSDB  : T2SRS<0b00, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+                     "srsdb","\tsp, $mode", []>;
+def t2SRSIA_UPD : T2SRS<0b11, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+                        "srsia","\tsp!, $mode", []>;
+def t2SRSIA  : T2SRS<0b11, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+                     "srsia","\tsp, $mode", []>;
+
+
+def : t2InstAlias<"srsdb${p} $mode", (t2SRSDB imm0_31:$mode, pred:$p)>;
+def : t2InstAlias<"srsdb${p} $mode!", (t2SRSDB_UPD imm0_31:$mode, pred:$p)>;
+
+def : t2InstAlias<"srsia${p} $mode", (t2SRSIA imm0_31:$mode, pred:$p)>;
+def : t2InstAlias<"srsia${p} $mode!", (t2SRSIA_UPD imm0_31:$mode, pred:$p)>;
+
+// Return From Exception is a system instruction.
+class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-20} = op31_20{11-0};
+
+  bits<4> Rn;
+  let Inst{19-16} = Rn;
+  let Inst{15-0} = 0xc000;
+}
+
+def t2RFEDBW : T2RFE<0b111010000011,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn!",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEDB  : T2RFE<0b111010000001,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfedb", "\t$Rn",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEIAW : T2RFE<0b111010011011,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn!",
+                   [/* For disassembly only; pattern left blank */]>;
+def t2RFEIA  : T2RFE<0b111010011001,
+                   (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn",
+                   [/* For disassembly only; pattern left blank */]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1, isMoveImm = 1 in
+def t2MOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), IIC_iMOVix2,
+                            [(set rGPR:$dst, (i32 imm:$src))]>,
+                            Requires<[IsThumb, HasV6T2]>;
+
+// Pseudo instruction that combines movw + movt + add pc (if pic).
+// It also makes it possible to rematerialize the instructions.
+// FIXME: Remove this when we can do generalized remat and when machine licm
+// can properly the instructions.
+let isReMaterializable = 1 in {
+def t2MOV_ga_pcrel : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
+                                IIC_iMOVix2addpc,
+                          [(set rGPR:$dst, (ARMWrapperPIC tglobaladdr:$addr))]>,
+                          Requires<[IsThumb2, UseMovt]>;
+
+def t2MOV_ga_dyn : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr),
+                              IIC_iMOVix2,
+                          [(set rGPR:$dst, (ARMWrapperDYN tglobaladdr:$addr))]>,
+                          Requires<[IsThumb2, UseMovt]>;
+}
+
+// ConstantPool, GlobalAddress, and JumpTable
+def : T2Pat<(ARMWrapper  tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>,
+           Requires<[IsThumb2, DontUseMovt]>;
+def : T2Pat<(ARMWrapper  tconstpool  :$dst), (t2LEApcrel tconstpool  :$dst)>;
+def : T2Pat<(ARMWrapper  tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
+           Requires<[IsThumb2, UseMovt]>;
+
+def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+            (t2LEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                   IIC_iLoadiALU,
+              [(set rGPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                           imm:$cp))]>,
+               Requires<[IsThumb2]>;
+
+// Pseudo isntruction that combines movs + predicated rsbmi
+// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR] in {
+def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
+                       NoItinerary, []>, Requires<[IsThumb2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor load/store -- for disassembly only
+//
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm>
+  : T2I<oops, iops, NoItinerary, opc, asm, []> {
+  let Inst{31-28} = op31_28;
+  let Inst{27-25} = 0b110;
+}
+
+multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
+  def _OFFSET : T2CI<op31_28,
+                     (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                     asm, "\t$cop, $CRd, $addr"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _PRE : T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                  asm, "\t$cop, $CRd, $addr!"> {
+    bits<13> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 1; // P = 1
+    let Inst{23} = addr{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr{12-9};
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = addr{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _POST: T2CI<op31_28,
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                               postidx_imm8s4:$offset),
+                 asm, "\t$cop, $CRd, $addr, $offset"> {
+    bits<9> offset;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = offset{8};
+    let Inst{22} = Dbit;
+    let Inst{21} = 1; // W = 1
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = offset{7-0};
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+  def _OPTION : T2CI<op31_28, (outs),
+                     (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+                          coproc_option_imm:$option),
+      asm, "\t$cop, $CRd, $addr, $option"> {
+    bits<8> option;
+    bits<4> addr;
+    bits<4> cop;
+    bits<4> CRd;
+    let Inst{24} = 0; // P = 0
+    let Inst{23} = 1; // U = 1
+    let Inst{22} = Dbit;
+    let Inst{21} = 0; // W = 0
+    let Inst{20} = load;
+    let Inst{19-16} = addr;
+    let Inst{15-12} = CRd;
+    let Inst{11-8} = cop;
+    let Inst{7-0} = option;
+    let DecoderMethod = "DecodeCopMemInstruction";
+  }
+}
+
+defm t2LDC   : t2LdStCop<0b1110, 1, 0, "ldc">;
+defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl">;
+defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc">;
+defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl">;
+defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">;
+defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
+
+
+//===----------------------------------------------------------------------===//
+// Move between special register and ARM core register -- for disassembly only
+//
+// Move to ARM core register from Special Register
+
+// A/R class MRS.
+//
+// A/R class can only move from CPSR or SPSR.
+def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
+                  []>, Requires<[IsThumb2,IsARClass]> {
+  bits<4> Rd;
+  let Inst{31-12} = 0b11110011111011111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b0000;
+}
+
+def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
+
+def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
+                   []>, Requires<[IsThumb2,IsARClass]> {
+  bits<4> Rd;
+  let Inst{31-12} = 0b11110011111111111000;
+  let Inst{11-8} = Rd;
+  let Inst{7-0} = 0b0000;
+}
+
+// M class MRS.
+//
+// This MRS has a mask field in bits 7-0 and can take more values than
+// the A/R class (a full msr_mask).
+def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary,
+                  "mrs", "\t$Rd, $mask", []>,
+              Requires<[IsThumb,IsMClass]> {
+  bits<4> Rd;
+  bits<8> mask;
+  let Inst{31-12} = 0b11110011111011111000;
+  let Inst{11-8} = Rd;
+  let Inst{19-16} = 0b1111;
+  let Inst{7-0} = mask;
+}
+
+
+// Move from ARM core register to Special Register
+//
+// A/R class MSR.
+//
+// No need to have both system and application versions, the encodings are the
+// same and the assembly parser has no way to distinguish between them. The mask
+// operand contains the special register (R Bit) in bit 4 and bits 3-0 contains
+// the mask with the fields to be accessed in the special register.
+def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
+                   NoItinerary, "msr", "\t$mask, $Rn", []>,
+               Requires<[IsThumb2,IsARClass]> {
+  bits<5> mask;
+  bits<4> Rn;
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20}    = mask{4}; // R Bit
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-8}  = mask{3-0};
+  let Inst{7-0}   = 0;
+}
+
+// M class MSR.
+//
+// Move from ARM core register to Special Register
+def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
+                  NoItinerary, "msr", "\t$SYSm, $Rn", []>,
+              Requires<[IsThumb,IsMClass]> {
+  bits<12> SYSm;
+  bits<4> Rn;
+  let Inst{31-21} = 0b11110011100;
+  let Inst{20}    = 0b0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-0}  = SYSm;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Move between coprocessor and ARM core register
+//
+
+class t2MovRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
+                  list<dag> pattern>
+  : T2Cop<Op, oops, iops,
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
+          pattern> {
+  let Inst{27-24} = 0b1110;
+  let Inst{20} = direction;
+  let Inst{4} = 1;
+
+  bits<4> Rt;
+  bits<4> cop;
+  bits<3> opc1;
+  bits<3> opc2;
+  bits<4> CRm;
+  bits<4> CRn;
+
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = cop;
+  let Inst{23-21} = opc1;
+  let Inst{7-5}   = opc2;
+  let Inst{3-0}   = CRm;
+  let Inst{19-16} = CRn;
+}
+
+class t2MovRRCopro<bits<4> Op, string opc, bit direction,
+                   list<dag> pattern = []>
+  : T2Cop<Op, (outs),
+          (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
+          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
+  let Inst{27-24} = 0b1100;
+  let Inst{23-21} = 0b010;
+  let Inst{20} = direction;
+
+  bits<4> Rt;
+  bits<4> Rt2;
+  bits<4> cop;
+  bits<4> opc1;
+  bits<4> CRm;
+
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+  let Inst{11-8}  = cop;
+  let Inst{7-4}   = opc1;
+  let Inst{3-0}   = CRm;
+}
+
+/* from ARM core register to coprocessor */
+def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
+           (outs),
+           (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                c_imm:$CRm, imm0_7:$opc2),
+           [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                         imm:$CRm, imm:$opc2)]>;
+def : t2InstAlias<"mcr $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                         c_imm:$CRm, 0)>;
+def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
+             (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                          c_imm:$CRm, imm0_7:$opc2),
+             [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
+                            imm:$CRm, imm:$opc2)]>;
+def : t2InstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
+                          c_imm:$CRm, 0)>;
+
+/* from coprocessor to ARM core register */
+def t2MRC : t2MovRCopro<0b1110, "mrc", 1,
+             (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                                  c_imm:$CRm, imm0_7:$opc2), []>;
+def : t2InstAlias<"mrc $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                         c_imm:$CRm, 0)>;
+
+def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
+             (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                                  c_imm:$CRm, imm0_7:$opc2), []>;
+def : t2InstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC2 GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                          c_imm:$CRm, 0)>;
+
+def : T2v6Pat<(int_arm_mrc  imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+              (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+def : T2v6Pat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
+              (t2MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
+
+
+/* from ARM core register to coprocessor */
+def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0,
+                        [(int_arm_mcrr imm:$cop, imm:$opc1, GPR:$Rt, GPR:$Rt2,
+                                       imm:$CRm)]>;
+def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0,
+                           [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
+                                           GPR:$Rt2, imm:$CRm)]>;
+/* from coprocessor to ARM core register */
+def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1>;
+
+def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1>;
+
+//===----------------------------------------------------------------------===//
+// Other Coprocessor Instructions.
+//
+
+def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+                 c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+                 "cdp\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                 [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                               imm:$CRm, imm:$opc2)]> {
+  let Inst{27-24} = 0b1110;
+
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
+
+def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+                   c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
+                   "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                   [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
+                                  imm:$CRm, imm:$opc2)]> {
+  let Inst{27-24} = 0b1110;
+
+  bits<4> opc1;
+  bits<4> CRn;
+  bits<4> CRd;
+  bits<4> cop;
+  bits<3> opc2;
+  bits<4> CRm;
+
+  let Inst{3-0}   = CRm;
+  let Inst{4}     = 0;
+  let Inst{7-5}   = opc2;
+  let Inst{11-8}  = cop;
+  let Inst{15-12} = CRd;
+  let Inst{19-16} = CRn;
+  let Inst{23-20} = opc1;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)),
+            (t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)),
+            (t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+}
+
+def : T2Pat<(sext_inreg rGPR:$Src, i8),  (t2SXTB rGPR:$Src, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>,
+           Requires<[IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)),
+            (t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)),
+            (t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+           Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// Atomic load/store patterns
+def : T2Pat<(atomic_load_8   t2addrmode_imm12:$addr),
+            (t2LDRBi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_8   t2addrmode_negimm8:$addr),
+            (t2LDRBi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_8   t2addrmode_so_reg:$addr),
+            (t2LDRBs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_imm12:$addr),
+            (t2LDRHi12  t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_negimm8:$addr),
+            (t2LDRHi8   t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_16  t2addrmode_so_reg:$addr),
+            (t2LDRHs    t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_imm12:$addr),
+            (t2LDRi12   t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_negimm8:$addr),
+            (t2LDRi8    t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_32  t2addrmode_so_reg:$addr),
+            (t2LDRs     t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRBi12  GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRBi8   GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_8  t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRBs    GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRHi12  GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRHi8   GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRHs    GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_imm12:$addr, GPR:$val),
+            (t2STRi12   GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
+            (t2STRi8    GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
+            (t2STRs     GPR:$val, t2addrmode_so_reg:$addr)>;
+
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Aliases for ADC without the ".w" optional width specifier.
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $Rm",
+                  (t2ADCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2ADCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Aliases for SBC without the ".w" optional width specifier.
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $Rm",
+                  (t2SBCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2SBCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Aliases for ADD without the ".w" optional width specifier.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+        (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+           (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
+              (t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2ADDrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+      (t2ADDri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rdn, $imm",
+           (t2ADDri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $Rm",
+            (t2ADDrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm",
+                  (t2ADDrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// add w/ negative immediates is just a sub.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+                 cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+      (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+               cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rdn, $imm",
+           (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
+                 cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p}.w $Rdn, $imm",
+      (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
+               cc_out:$s)>;
+def : t2InstAlias<"addw${p} $Rdn, $imm",
+           (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
+
+
+// Aliases for SUB without the ".w" optional width specifier.
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm",
+        (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rd, $Rn, $imm",
+           (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm",
+              (t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm",
+                  (t2SUBrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+// ... and with the destination and source register combined.
+def : t2InstAlias<"sub${s}${p} $Rdn, $imm",
+      (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rdn, $imm",
+           (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p}.w $Rdn, $Rm",
+            (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rdn, $Rm",
+            (t2SUBrr GPRnopc:$Rdn, GPRnopc:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rdn, $ShiftedRm",
+                  (t2SUBrs GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_reg:$ShiftedRm,
+                           pred:$p, cc_out:$s)>;
+
+// Alias for compares without the ".w" optional width specifier.
+def : t2InstAlias<"cmn${p} $Rn, $Rm",
+                  (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"teq${p} $Rn, $Rm",
+                  (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"tst${p} $Rn, $Rm",
+                  (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+
+// Memory barriers
+def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb, HasDB]>;
+def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb, HasDB]>;
+def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb, HasDB]>;
+
+// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+                  (t2LDRpci GPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+                  (t2LDRBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+                  (t2LDRHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+                  (t2LDRSBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+                  (t2LDRSHpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+
+// Alias for MVN with(out) the ".w" optional width specifier.
+def : t2InstAlias<"mvn${s}${p}.w $Rd, $imm",
+           (t2MVNi rGPR:$Rd, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $Rm",
+           (t2MVNr rGPR:$Rd, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
+           (t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
+// shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+                (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+                (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// PUSH/POP aliases for STM/LDM
+def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"push${p} $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p}.w $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p} $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// STMIA/STMIA_UPD aliases w/o the optional .w suffix
+def : t2InstAlias<"stm${p} $Rn, $regs",
+                  (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"stm${p} $Rn!, $regs",
+                  (t2STMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// LDMIA/LDMIA_UPD aliases w/o the optional .w suffix
+def : t2InstAlias<"ldm${p} $Rn, $regs",
+                  (t2LDMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"ldm${p} $Rn!, $regs",
+                  (t2LDMIA_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// STMDB/STMDB_UPD aliases w/ the optional .w suffix
+def : t2InstAlias<"stmdb${p}.w $Rn, $regs",
+                  (t2STMDB GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"stmdb${p}.w $Rn!, $regs",
+                  (t2STMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// LDMDB/LDMDB_UPD aliases w/ the optional .w suffix
+def : t2InstAlias<"ldmdb${p}.w $Rn, $regs",
+                  (t2LDMDB GPR:$Rn, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"ldmdb${p}.w $Rn!, $regs",
+                  (t2LDMDB_UPD GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for REV/REV16/REVSH without the ".w" optional width specifier.
+def : t2InstAlias<"rev${p} $Rd, $Rm", (t2REV rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"rev16${p} $Rd, $Rm", (t2REV16 rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"revsh${p} $Rd, $Rm", (t2REVSH rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+
+
+// Alias for RSB without the ".w" optional width specifier, and with optional
+// implied destination register.
+def : t2InstAlias<"rsb${s}${p} $Rd, $Rn, $imm",
+           (t2RSBri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $imm",
+           (t2RSBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $Rm",
+           (t2RSBrr rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $ShiftedRm",
+           (t2RSBrs rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$ShiftedRm, pred:$p,
+                    cc_out:$s)>;
+
+// SSAT/USAT optional shift operand.
+def : t2InstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+                  (t2SSAT rGPR:$Rd, imm1_32:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+def : t2InstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+                  (t2USAT rGPR:$Rd, imm0_31:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+
+// STM w/o the .w suffix.
+def : t2InstAlias<"stm${p} $Rn, $regs",
+                  (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for STR, STRB, and STRH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"str${p} $Rt, $addr",
+                  (t2STRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+                  (t2STRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+                  (t2STRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"str${p} $Rt, $addr",
+                  (t2STRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+                  (t2STRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+                  (t2STRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+// Extend instruction optional rotate operand.
+def : t2InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+                (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+                (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+                (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm",
+                (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb16${p} $Rd, $Rm",
+                (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm",
+                (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb${p}.w $Rd, $Rm",
+                (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
+                (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+                (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+                (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+                (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb${p} $Rd, $Rm",
+                (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb16${p} $Rd, $Rm",
+                (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm",
+                (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"uxtb${p}.w $Rd, $Rm",
+                (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
+                (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+// Extend instruction w/o the ".w" optional width specifier.
+def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
+                  (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"uxtb16${p} $Rd, $Rm$rot",
+                  (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
+                  (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
+                  (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"sxtb16${p} $Rd, $Rm$rot",
+                  (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
+                  (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+
+// "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like
+// for isel.
+def : t2InstAlias<"mov${p} $Rd, $imm",
+                  (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+def : t2InstAlias<"mvn${p} $Rd, $imm",
+                  (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+// Same for AND <--> BIC
+def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+                  (t2ANDri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
+                  (t2ANDri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
+                  (t2BICri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"and${s}${p} $Rdn, $imm",
+                  (t2BICri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                           pred:$p, cc_out:$s)>;
+// Likewise, "add Rd, t2_so_imm_neg" -> sub
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+                  (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm,
+                           pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $imm",
+                  (t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm,
+                           pred:$p, cc_out:$s)>;
+// Same for CMP <--> CMN via t2_so_imm_neg
+def : t2InstAlias<"cmp${p} $Rd, $imm",
+                  (t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
+def : t2InstAlias<"cmn${p} $Rd, $imm",
+                  (t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
+
+
+// Wide 'mul' encoding can be specified with only two operands.
+def : t2InstAlias<"mul${p} $Rn, $Rm",
+                  (t2MUL rGPR:$Rn, rGPR:$Rm, rGPR:$Rn, pred:$p)>;
+
+// "neg" is and alias for "rsb rd, rn, #0"
+def : t2InstAlias<"neg${s}${p} $Rd, $Rm",
+                  (t2RSBri rGPR:$Rd, rGPR:$Rm, 0, pred:$p, cc_out:$s)>;
+
+// MOV so_reg assembler pseudos. InstAlias isn't expressive enough for
+// these, unfortunately.
+def t2MOVsi: t2AsmPseudo<"mov${p} $Rd, $shift",
+                         (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+def t2MOVSsi: t2AsmPseudo<"movs${p} $Rd, $shift",
+                          (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+
+def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift",
+                         (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift",
+                          (ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+
+// ADR w/o the .w suffix
+def : t2InstAlias<"adr${p} $Rd, $addr",
+                  (t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>;
+
+// LDR(literal) w/ alternate [pc, #imm] syntax.
+def t2LDRpcrel   : t2AsmPseudo<"ldr${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRBpcrel  : t2AsmPseudo<"ldrb${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRHpcrel  : t2AsmPseudo<"ldrh${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRSBpcrel  : t2AsmPseudo<"ldrsb${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def t2LDRSHpcrel  : t2AsmPseudo<"ldrsh${p} $Rt, $addr",
+                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+    // Version w/ the .w suffix.
+def : t2InstAlias<"ldr${p}.w $Rt, $addr",
+                  (t2LDRpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p}.w $Rt, $addr",
+                  (t2LDRBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p}.w $Rt, $addr",
+                  (t2LDRHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p}.w $Rt, $addr",
+                  (t2LDRSBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p}.w $Rt, $addr",
+                  (t2LDRSHpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"add${p} $Rd, pc, $imm",
+                  (t2ADR rGPR:$Rd, imm0_4095:$imm, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
new file mode 100644
index 000000000000..b5a896c69985
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -0,0 +1,1479 @@
+//===-- ARMInstrVFP.td - VFP support for ARM ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM VFP instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_FTOI    : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDT_ITOF    : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDT_CMPFP0  : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
+                                       SDTCisSameAs<1, 2>]>;
+
+def arm_ftoui  : SDNode<"ARMISD::FTOUI",   SDT_FTOI>;
+def arm_ftosi  : SDNode<"ARMISD::FTOSI",   SDT_FTOI>;
+def arm_sitof  : SDNode<"ARMISD::SITOF",   SDT_ITOF>;
+def arm_uitof  : SDNode<"ARMISD::UITOF",   SDT_ITOF>;
+def arm_fmstat : SDNode<"ARMISD::FMSTAT",  SDTNone, [SDNPInGlue, SDNPOutGlue]>;
+def arm_cmpfp  : SDNode<"ARMISD::CMPFP",   SDT_ARMCmp, [SDNPOutGlue]>;
+def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
+def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
+
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+  let Name = "FPImm";
+  let ParserMethod = "parseFPImm";
+}
+
+def vfp_f32imm : Operand<f32>,
+                 PatLeaf<(f32 fpimm), [{
+      return ARM_AM::getFP32Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = ARM_AM::getFP32Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+
+def vfp_f64imm : Operand<f64>,
+                 PatLeaf<(f64 fpimm), [{
+      return ARM_AM::getFP64Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = ARM_AM::getFP64Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let PrintMethod = "printFPImmOperand";
+  let ParserMatchClass = FPImmOperand;
+}
+
+def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+// The VCVT to/from fixed-point instructions encode the 'fbits' operand
+// (the number of fixed bits) differently than it appears in the assembly
+// source. It's encoded as "Size - fbits" where Size is the size of the
+// fixed-point representation (32 or 16) and fbits is the value appearing
+// in the assembly source, an integer in [0,16] or (0,32], depending on size.
+def fbits32_asm_operand : AsmOperandClass { let Name = "FBits32"; }
+def fbits32 : Operand<i32> {
+  let PrintMethod = "printFBits32";
+  let ParserMatchClass = fbits32_asm_operand;
+}
+
+def fbits16_asm_operand : AsmOperandClass { let Name = "FBits16"; }
+def fbits16 : Operand<i32> {
+  let PrintMethod = "printFBits16";
+  let ParserMatchClass = fbits16_asm_operand;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+
+def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr),
+                 IIC_fpLoad64, "vldr", "\t$Dd, $addr",
+                 [(set DPR:$Dd, (f64 (alignedload32 addrmode5:$addr)))]>;
+
+def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
+                 IIC_fpLoad32, "vldr", "\t$Sd, $addr",
+                 [(set SPR:$Sd, (load addrmode5:$addr))]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
+
+def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
+                 IIC_fpStore64, "vstr", "\t$Dd, $addr",
+                 [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>;
+
+def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
+                 IIC_fpStore32, "vstr", "\t$Sd, $addr",
+                 [(store SPR:$Sd, addrmode5:$addr)]> {
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+multiclass vfp_ldst_mult<string asm, bit L_bit,
+                         InstrItinClass itin, InstrItinClass itin_upd> {
+  // Double Precision
+  def DIA :
+    AXDI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def DIA_UPD :
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def DDB_UPD :
+    AXDI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+  }
+
+  // Single Precision
+  def SIA :
+    AXSI4<(outs), (ins GPR:$Rn, pred:$p, spr_reglist:$regs, variable_ops),
+          IndexModeNone, itin,
+          !strconcat(asm, "ia${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+  def SIA_UPD :
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+  def SDB_UPD :
+    AXSI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, spr_reglist:$regs,
+                               variable_ops),
+          IndexModeUpd, itin_upd,
+          !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;       // Decrement Before
+    let Inst{21}    = 1;          // Writeback
+    let Inst{20}    = L_bit;
+
+    // Some single precision VFP instructions may be executed on both NEON and
+    // VFP pipelines.
+    let D = VFPNeonDomain;
+  }
+}
+
+let neverHasSideEffects = 1 in {
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+defm VLDM : vfp_ldst_mult<"vldm", 1, IIC_fpLoad_m, IIC_fpLoad_mu>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+defm VSTM : vfp_ldst_mult<"vstm", 0, IIC_fpLoad_m, IIC_fpLoad_mu>;
+
+} // neverHasSideEffects
+
+def : MnemonicAlias<"vldm", "vldmia">;
+def : MnemonicAlias<"vstm", "vstmia">;
+
+def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpush${p} $r", (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r",  (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+def : InstAlias<"vpop${p} $r",  (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>,
+                Requires<[HasVFP2]>;
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+                         (VSTMSDB_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpush${p}", "$r",
+                         (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+                         (VLDMSIA_UPD SP, pred:$p, spr_reglist:$r)>;
+defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
+                         (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>;
+
+// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
+
+//===----------------------------------------------------------------------===//
+// FP Binary Operations.
+//
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VADDD  : ADbI<0b11100, 0b11, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VADDS  : ASbIn<0b11100, 0b11, 0, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VSUBD  : ADbI<0b11100, 0b11, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VSUBS  : ASbIn<0b11100, 0b11, 1, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VDIVD  : ADbI<0b11101, 0b00, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VDIVS  : ASbI<0b11101, 0b00, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+
+let TwoOperandAliasConstraint = "$Dn = $Dd" in
+def VMULD  : ADbI<0b11100, 0b10, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>;
+
+let TwoOperandAliasConstraint = "$Sn = $Sd" in
+def VMULS  : ASbIn<0b11100, 0b10, 0, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
+                   [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VNMULD : ADbI<0b11100, 0b10, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                  IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>;
+
+def VNMULS : ASbI<0b11100, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+// Match reassociated forms only if not sign dependent rounding.
+def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
+          (VNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+def : Pat<(fmul (fneg SPR:$a), SPR:$b),
+          (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+
+// These are encoded as unary instructions.
+let Defs = [FPSCR_NZCV] in {
+def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
+                  (outs), (ins DPR:$Dd, DPR:$Dm),
+                  IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
+                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
+
+def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
+                  [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCMPD  : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
+                  (outs), (ins DPR:$Dd, DPR:$Dm),
+                  IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
+                  [/* For disassembly only; pattern left blank */]>;
+
+def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
+                  (outs), (ins SPR:$Sd, SPR:$Sm),
+                  IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
+                  [/* For disassembly only; pattern left blank */]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+} // Defs = [FPSCR_NZCV]
+
+//===----------------------------------------------------------------------===//
+// FP Unary Operations.
+//
+
+def VABSD  : ADuI<0b11101, 0b11, 0b0000, 0b11, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vabs", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fabs (f64 DPR:$Dm)))]>;
+
+def VABSS  : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   IIC_fpUNA32, "vabs", ".f32\t$Sd, $Sm",
+                   [(set SPR:$Sd, (fabs SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+let Defs = [FPSCR_NZCV] in {
+def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
+                   (outs), (ins DPR:$Dd),
+                   IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
+                   [(arm_cmpfp0 (f64 DPR:$Dd))]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+}
+
+def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
+                   [(arm_cmpfp0 SPR:$Sd)]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
+                   (outs), (ins DPR:$Dd),
+                   IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
+                   [/* For disassembly only; pattern left blank */]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+}
+
+def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
+                   (outs), (ins SPR:$Sd),
+                   IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
+                   [/* For disassembly only; pattern left blank */]> {
+  let Inst{3-0} = 0b0000;
+  let Inst{5}   = 0;
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+} // Defs = [FPSCR_NZCV]
+
+def VCVTDS  : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm",
+                   [(set DPR:$Dd, (fextend SPR:$Sm))]> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+}
+
+// Special case encoding: bits 11-8 is 0b1011.
+def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
+                    IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm",
+                    [(set SPR:$Sd, (fround DPR:$Dm))]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = 0b11101;
+  let Inst{21-16} = 0b110111;
+  let Inst{11-8}  = 0b1011;
+  let Inst{7-6}   = 0b11;
+  let Inst{4}     = 0;
+}
+
+// Between half-precision and single-precision.  For disassembly only.
+
+// FIXME: Verify encoding after integrated assembler is working.
+def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def : Pat<(f32_to_f16 SPR:$a),
+          (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+
+def : Pat<(f16_to_f32 GPR:$a),
+          (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+
+def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+                 /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fneg (f64 DPR:$Dm)))]>;
+
+def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   IIC_fpUNA32, "vneg", ".f32\t$Sd, $Sm",
+                   [(set SPR:$Sd, (fneg SPR:$Sm))]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm",
+                  [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>;
+
+def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
+                  [(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
+
+let neverHasSideEffects = 1 in {
+def VMOVD  : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                  (outs DPR:$Dd), (ins DPR:$Dm),
+                  IIC_fpUNA64, "vmov", ".f64\t$Dd, $Dm", []>;
+
+def VMOVS  : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  IIC_fpUNA32, "vmov", ".f32\t$Sd, $Sm", []>;
+} // neverHasSideEffects
+
+//===----------------------------------------------------------------------===//
+// FP <-> GPR Copies.  Int <-> FP Conversions.
+//
+
+def VMOVRS : AVConv2I<0b11100001, 0b1010,
+                      (outs GPR:$Rt), (ins SPR:$Sn),
+                      IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
+                      [(set GPR:$Rt, (bitconvert SPR:$Sn))]> {
+  // Instruction operands.
+  bits<4> Rt;
+  bits<5> Sn;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+// Bitcast i32 -> f32.  NEON prefers to use VMOVDRR.
+def VMOVSR : AVConv4I<0b11100000, 0b1010,
+                      (outs SPR:$Sn), (ins GPR:$Rt),
+                      IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
+                      [(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
+             Requires<[HasVFP2, UseVMOVSR]> {
+  // Instruction operands.
+  bits<5> Sn;
+  bits<4> Rt;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Rt;
+
+  let Inst{6-5}   = 0b00;
+  let Inst{3-0}   = 0b0000;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+let neverHasSideEffects = 1 in {
+def VMOVRRD  : AVConv3I<0b11000101, 0b1011,
+                        (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
+                        IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
+                 [/* FIXME: Can't write pattern for multiple result instr*/]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
+                      (outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
+                 IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
+                 [/* For disassembly only; pattern left blank */]> {
+  bits<5> src1;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = src1{4-1};
+  let Inst{5}     = src1{0};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+  let DecoderMethod = "DecodeVMOVRRS";
+}
+} // neverHasSideEffects
+
+// FMDHR: GPR -> SPR
+// FMDLR: GPR -> SPR
+
+def VMOVDRR : AVConv5I<0b11000100, 0b1011,
+                      (outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2),
+                      IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2",
+                      [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> {
+  // Instruction operands.
+  bits<5> Dm;
+  bits<4> Rt;
+  bits<4> Rt2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Rt;
+  let Inst{19-16} = Rt2;
+
+  let Inst{7-6}   = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+}
+
+let neverHasSideEffects = 1 in
+def VMOVSRR : AVConv5I<0b11000100, 0b1010,
+                     (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
+                IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
+                [/* For disassembly only; pattern left blank */]> {
+  // Instruction operands.
+  bits<5> dst1;
+  bits<4> src1;
+  bits<4> src2;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = dst1{4-1};
+  let Inst{5}     = dst1{0};
+  let Inst{15-12} = src1;
+  let Inst{19-16} = src2;
+
+  let Inst{7-6} = 0b00;
+
+  // Some single precision VFP instructions may be executed on both NEON and VFP
+  // pipelines.
+  let D = VFPNeonDomain;
+
+  let DecoderMethod = "DecodeVMOVSRR";
+}
+
+// FMRDH: SPR -> GPR
+// FMRDL: SPR -> GPR
+// FMRRS: SPR -> GPR
+// FMRX:  SPR system reg -> GPR
+// FMSRR: GPR -> SPR
+// FMXR:  GPR -> VFP system reg
+
+
+// Int -> FP:
+
+class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                        bits<4> opcod4, dag oops, dag iops,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+}
+
+class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                         bits<4> opcod4, dag oops, dag iops,InstrItinClass itin,
+                         string opc, string asm, list<dag> pattern>
+  : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+              pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+                               (outs DPR:$Dd), (ins SPR:$Sm),
+                               IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
+                               [(set DPR:$Dd, (f64 (arm_sitof SPR:$Sm)))]> {
+  let Inst{7} = 1; // s32
+}
+
+def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+                                (outs SPR:$Sd),(ins SPR:$Sm),
+                                IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm",
+                                [(set SPR:$Sd, (arm_sitof SPR:$Sm))]> {
+  let Inst{7} = 1; // s32
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
+                               (outs DPR:$Dd), (ins SPR:$Sm),
+                               IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
+                               [(set DPR:$Dd, (f64 (arm_uitof SPR:$Sm)))]> {
+  let Inst{7} = 0; // u32
+}
+
+def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
+                                (outs SPR:$Sd), (ins SPR:$Sm),
+                                IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm",
+                                [(set SPR:$Sd, (arm_uitof SPR:$Sm))]> {
+  let Inst{7} = 0; // u32
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+// FP -> Int:
+
+class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                        bits<4> opcod4, dag oops, dag iops,
+                        InstrItinClass itin, string opc, string asm,
+                        list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
+                         bits<4> opcod4, dag oops, dag iops,
+                         InstrItinClass itin, string opc, string asm,
+                         list<dag> pattern>
+  : AVConv1In<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+              pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+}
+
+// Always set Z bit in the instruction, i.e. "round towards zero" variants.
+def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (arm_ftosi (f64 DPR:$Dm)))]> {
+  let Inst{7} = 1; // Z bit
+}
+
+def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (arm_ftosi SPR:$Sm))]> {
+  let Inst{7} = 1; // Z bit
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+                               (outs SPR:$Sd), (ins DPR:$Dm),
+                               IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
+                               [(set SPR:$Sd, (arm_ftoui (f64 DPR:$Dm)))]> {
+  let Inst{7} = 1; // Z bit
+}
+
+def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (arm_ftoui SPR:$Sm))]> {
+  let Inst{7} = 1; // Z bit
+
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
+let Uses = [FPSCR] in {
+// FIXME: Verify encoding after integrated assembler is working.
+def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> {
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
+                                (outs SPR:$Sd), (ins DPR:$Dm),
+                                IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm",
+                                [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
+                                 (outs SPR:$Sd), (ins SPR:$Sm),
+                                 IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm",
+                                 [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> {
+  let Inst{7} = 0; // Z bit
+}
+}
+
+// Convert between floating-point and fixed-point
+// Data type for fixed-point naming convention:
+//   S16 (U=0, sx=0) -> SH
+//   U16 (U=1, sx=0) -> UH
+//   S32 (U=0, sx=1) -> SL
+//   U32 (U=1, sx=1) -> UL
+
+let Constraints = "$a = $dst" in {
+
+// FP to Fixed-Point:
+
+// Single Precision register
+class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+                          bit op5, dag oops, dag iops, InstrItinClass itin,
+                          string opc, string asm, list<dag> pattern>
+  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
+  bits<5> dst;
+  // if dp_operation then UInt(D:Vd) else UInt(Vd:D);
+  let Inst{22} = dst{0};
+  let Inst{15-12} = dst{4-1};
+}
+
+// Double Precision register
+class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
+                          bit op5, dag oops, dag iops, InstrItinClass itin,
+                          string opc, string asm, list<dag> pattern>
+  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
+  bits<5> dst;
+  // if dp_operation then UInt(D:Vd) else UInt(Vd:D);
+  let Inst{22} = dst{4};
+  let Inst{15-12} = dst{3-0};
+}
+
+def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOUHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".u16.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOSLS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VTOSHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>;
+
+def VTOUHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>;
+
+def VTOSLD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>;
+
+def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>;
+
+// Fixed-Point to FP:
+
+def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0,
+                       (outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1,
+                       (outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
+                 IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def VSHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>;
+
+def VUHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 0,
+                       (outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>;
+
+def VSLTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>;
+
+def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,
+                       (outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
+                 IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>;
+
+} // End of 'let Constraints = "$a = $dst" in'
+
+//===----------------------------------------------------------------------===//
+// FP Multiply-Accumulate Operations.
+//
+
+def VMLAD : ADbI<0b11100, 0b00, 0, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLSD : ADbI<0b11100, 0b00, 1, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+
+def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+                RegConstraint<"$Ddin = $Dd">,
+                Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
+          (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
+          (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                           (f64 DPR:$Ddin)))]>,
+               RegConstraint<"$Ddin = $Dd">,
+               Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+
+def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
+             [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
+                         RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines on A8.
+  let D = VFPNeonA8Domain;
+}
+
+def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
+          (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
+          (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+
+//===----------------------------------------------------------------------===//
+// Fused FP Multiply-Accumulate Operations.
+//
+def VFMAD : ADbI<0b11101, 0b10, 0, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpFMAC64, "vfma", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP4,UseFusedMAC]>;
+
+def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfma", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,UseFusedMAC]>;
+def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma x, y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
+          (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
+          (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFMSD : ADbI<0b11101, 0b10, 1, 0,
+                 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                 IIC_fpFMAC64, "vfms", ".f64\t$Dd, $Dn, $Dm",
+                 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+              RegConstraint<"$Ddin = $Dd">,
+              Requires<[HasVFP4,UseFusedMAC]>;
+
+def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfms", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+              RegConstraint<"$Sdin = $Sd">,
+              Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
+          (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,UseFusedMAC]>;
+def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
+          (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fma (fneg x), y, z) -> (vfms z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
+          (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
+          (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fma x, (fneg y), z) -> (vfms z, x, y)
+def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
+          (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
+          (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpFMAC64, "vfnma", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
+                                          (f64 DPR:$Ddin)))]>,
+                RegConstraint<"$Ddin = $Dd">,
+                Requires<[HasVFP4,UseFusedMAC]>;
+
+def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfnma", ".f32\t$Sd, $Sn, $Sm",
+                  [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
+                                           SPR:$Sdin))]>,
+                RegConstraint<"$Sdin = $Sd">,
+                Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
+          (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,UseFusedMAC]>;
+def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
+          (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+// (fneg (fma x, y, z)) -> (vfnma z, x, y)
+def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
+          (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
+          (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
+def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
+          (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
+          (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
+                  (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
+                  IIC_fpFMAC64, "vfnms", ".f64\t$Dd, $Dn, $Dm",
+                  [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
+                                           (f64 DPR:$Ddin)))]>,
+               RegConstraint<"$Ddin = $Dd">,
+               Requires<[HasVFP4,UseFusedMAC]>;
+
+def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
+                  (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
+                  IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm",
+             [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
+                         RegConstraint<"$Sdin = $Sd">,
+                  Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+  // Some single precision VFP instructions may be executed on both NEON and
+  // VFP pipelines.
+}
+
+def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
+          (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
+          Requires<[HasVFP4,UseFusedMAC]>;
+def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
+          (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>,
+          Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
+
+// Match @llvm.fma.* intrinsics
+
+// (fma x, y, (fneg z)) -> (vfnms z, x, y))
+def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
+          (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
+          (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
+          (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
+          (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y)
+def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
+          (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
+      Requires<[HasVFP4]>;
+def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
+          (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
+      Requires<[HasVFP4]>;
+
+//===----------------------------------------------------------------------===//
+// FP Conditional moves.
+//
+
+let neverHasSideEffects = 1 in {
+def VMOVDcc  : ARMPseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, pred:$p),
+                    4, IIC_fpUNA64,
+                    [/*(set DPR:$Dd, (ARMcmov DPR:$Dn, DPR:$Dm, imm:$cc))*/]>,
+                 RegConstraint<"$Dn = $Dd">;
+
+def VMOVScc  : ARMPseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, pred:$p),
+                    4, IIC_fpUNA32,
+                    [/*(set SPR:$Sd, (ARMcmov SPR:$Sn, SPR:$Sm, imm:$cc))*/]>,
+                 RegConstraint<"$Sn = $Sd">;
+} // neverHasSideEffects
+
+//===----------------------------------------------------------------------===//
+// Move from VFP System Register to ARM core register.
+//
+
+class MovFromVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+                 list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> Rt;
+
+  let Inst{27-20} = 0b11101111;
+  let Inst{19-16} = opc19_16;
+  let Inst{15-12} = Rt;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{6-5}   = 0b00;
+  let Inst{4}     = 1;
+  let Inst{3-0}   = 0b0000;
+}
+
+// APSR is the application level alias of CPSR. This FPSCR N, Z, C, V flags
+// to APSR.
+let Defs = [CPSR], Uses = [FPSCR_NZCV], Rt = 0b1111 /* apsr_nzcv */ in
+def FMSTAT : MovFromVFP<0b0001 /* fpscr */, (outs), (ins),
+                        "vmrs", "\tAPSR_nzcv, fpscr", [(arm_fmstat)]>;
+
+// Application level FPSCR -> GPR
+let hasSideEffects = 1, Uses = [FPSCR] in
+def VMRS : MovFromVFP<0b0001 /* fpscr */, (outs GPR:$Rt), (ins),
+                      "vmrs", "\t$Rt, fpscr",
+                      [(set GPR:$Rt, (int_arm_get_fpscr))]>;
+
+// System level FPEXC, FPSID -> GPR
+let Uses = [FPSCR] in {
+  def VMRS_FPEXC : MovFromVFP<0b1000 /* fpexc */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpexc", []>;
+  def VMRS_FPSID : MovFromVFP<0b0000 /* fpsid */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpsid", []>;
+  def VMRS_MVFR0 : MovFromVFP<0b0111 /* mvfr0 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr0", []>;
+  def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr1", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move from ARM core register to VFP System Register.
+//
+
+class MovToVFP<bits<4> opc19_16, dag oops, dag iops, string opc, string asm,
+               list<dag> pattern>:
+  VFPAI<oops, iops, VFPMiscFrm, IIC_fpSTAT, opc, asm, pattern> {
+
+  // Instruction operand.
+  bits<4> src;
+
+  // Encode instruction operand.
+  let Inst{15-12} = src;
+
+  let Inst{27-20} = 0b11101110;
+  let Inst{19-16} = opc19_16;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{4}     = 1;
+}
+
+let Defs = [FPSCR] in {
+  // Application level GPR -> FPSCR
+  def VMSR : MovToVFP<0b0001 /* fpscr */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpscr, $src", [(int_arm_set_fpscr GPR:$src)]>;
+  // System level GPR -> FPEXC
+  def VMSR_FPEXC : MovToVFP<0b1000 /* fpexc */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpexc, $src", []>;
+  // System level GPR -> FPSID
+  def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src),
+                      "vmsr", "\tfpsid, $src", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc.
+//
+
+// Materialize FP immediates. VFP3 only.
+let isReMaterializable = 1 in {
+def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
+                    VFPMiscFrm, IIC_fpUNA64,
+                    "vmov", ".f64\t$Dd, $imm",
+                    [(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> {
+  bits<5> Dd;
+  bits<8> imm;
+
+  let Inst{27-23} = 0b11101;
+  let Inst{22}    = Dd{4};
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision.
+  let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
+}
+
+def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
+                     VFPMiscFrm, IIC_fpUNA32,
+                     "vmov", ".f32\t$Sd, $imm",
+                     [(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
+  bits<5> Sd;
+  bits<8> imm;
+
+  let Inst{27-23} = 0b11101;
+  let Inst{22}    = Sd{0};
+  let Inst{21-20} = 0b11;
+  let Inst{19-16} = imm{7-4};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision.
+  let Inst{7-4}   = 0b0000;
+  let Inst{3-0}   = imm{3-0};
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases.
+//
+// A few mnemnoic aliases for pre-unifixed syntax. We don't guarantee to
+// support them all, but supporting at least some of the basics is
+// good to be friendly.
+def : VFP2MnemonicAlias<"flds", "vldr">;
+def : VFP2MnemonicAlias<"fldd", "vldr">;
+def : VFP2MnemonicAlias<"fmrs", "vmov">;
+def : VFP2MnemonicAlias<"fmsr", "vmov">;
+def : VFP2MnemonicAlias<"fsqrts", "vsqrt">;
+def : VFP2MnemonicAlias<"fsqrtd", "vsqrt">;
+def : VFP2MnemonicAlias<"fadds", "vadd.f32">;
+def : VFP2MnemonicAlias<"faddd", "vadd.f64">;
+def : VFP2MnemonicAlias<"fmrdd", "vmov">;
+def : VFP2MnemonicAlias<"fmrds", "vmov">;
+def : VFP2MnemonicAlias<"fmrrd", "vmov">;
+def : VFP2MnemonicAlias<"fmdrr", "vmov">;
+def : VFP2MnemonicAlias<"fmuls", "vmul.f32">;
+def : VFP2MnemonicAlias<"fmuld", "vmul.f64">;
+def : VFP2MnemonicAlias<"fnegs", "vneg.f32">;
+def : VFP2MnemonicAlias<"fnegd", "vneg.f64">;
+def : VFP2MnemonicAlias<"ftosizd", "vcvt.s32.f64">;
+def : VFP2MnemonicAlias<"ftosid", "vcvtr.s32.f64">;
+def : VFP2MnemonicAlias<"ftosizs", "vcvt.s32.f32">;
+def : VFP2MnemonicAlias<"ftosis", "vcvtr.s32.f32">;
+def : VFP2MnemonicAlias<"ftouizd", "vcvt.u32.f64">;
+def : VFP2MnemonicAlias<"ftouid", "vcvtr.u32.f64">;
+def : VFP2MnemonicAlias<"ftouizs", "vcvt.u32.f32">;
+def : VFP2MnemonicAlias<"ftouis", "vcvtr.u32.f32">;
+def : VFP2MnemonicAlias<"fsitod", "vcvt.f64.s32">;
+def : VFP2MnemonicAlias<"fsitos", "vcvt.f32.s32">;
+def : VFP2MnemonicAlias<"fuitod", "vcvt.f64.u32">;
+def : VFP2MnemonicAlias<"fuitos", "vcvt.f32.u32">;
+def : VFP2MnemonicAlias<"fsts", "vstr">;
+def : VFP2MnemonicAlias<"fstd", "vstr">;
+def : VFP2MnemonicAlias<"fmacd", "vmla.f64">;
+def : VFP2MnemonicAlias<"fmacs", "vmla.f32">;
+def : VFP2MnemonicAlias<"fcpys", "vmov.f32">;
+def : VFP2MnemonicAlias<"fcpyd", "vmov.f64">;
+def : VFP2MnemonicAlias<"fcmps", "vcmp.f32">;
+def : VFP2MnemonicAlias<"fcmpd", "vcmp.f64">;
+def : VFP2MnemonicAlias<"fdivs", "vdiv.f32">;
+def : VFP2MnemonicAlias<"fdivd", "vdiv.f64">;
+def : VFP2MnemonicAlias<"fmrx", "vmrs">;
+def : VFP2MnemonicAlias<"fmxr", "vmsr">;
+
+// Be friendly and accept the old form of zero-compare
+def : VFP2InstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
+def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>;
+
+
+def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
+def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm",
+                    (VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2InstAlias<"faddd${p} $Dd, $Dn, $Dm",
+                    (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2InstAlias<"fsubs${p} $Sd, $Sn, $Sm",
+                    (VSUBS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
+def : VFP2InstAlias<"fsubd${p} $Dd, $Dn, $Dm",
+                    (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+
+// No need for the size suffix on VSQRT. It's implied by the register classes.
+def : VFP2InstAlias<"vsqrt${p} $Sd, $Sm", (VSQRTS SPR:$Sd, SPR:$Sm, pred:$p)>;
+def : VFP2InstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+// VLDR/VSTR accept an optional type suffix.
+def : VFP2InstAlias<"vldr${p}.32 $Sd, $addr",
+                    (VLDRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.32 $Sd, $addr",
+                    (VSTRS SPR:$Sd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vldr${p}.64 $Dd, $addr",
+                    (VLDRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+def : VFP2InstAlias<"vstr${p}.64 $Dd, $addr",
+                    (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>;
+
+// VMOV can accept optional 32-bit or less data type suffix suffix.
+def : VFP2InstAlias<"vmov${p}.8 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Rt, $Sn",
+                    (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.8 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.16 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.32 $Sn, $Rt",
+                    (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>;
+
+def : VFP2InstAlias<"vmov${p}.f64 $Rt, $Rt2, $Dn",
+                    (VMOVRRD GPR:$Rt, GPR:$Rt2, DPR:$Dn, pred:$p)>;
+def : VFP2InstAlias<"vmov${p}.f64 $Dn, $Rt, $Rt2",
+                    (VMOVDRR DPR:$Dn, GPR:$Rt, GPR:$Rt2, pred:$p)>;
+
+// VMOVS doesn't need the .f32 to disambiguate from the NEON encoding the way
+// VMOVD does.
+def : VFP2InstAlias<"vmov${p} $Sd, $Sm",
+                    (VMOVS SPR:$Sd, SPR:$Sm, pred:$p)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp
new file mode 100644
index 000000000000..351a290e2aa0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.cpp
@@ -0,0 +1,336 @@
+//===-- ARMJITInfo.cpp - Implement the JIT interfaces for the ARM target --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "ARMJITInfo.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMRelocations.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  report_fatal_error("ARMJITInfo::replaceMachineCodeForFunction");
+}
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. (We need
+// to preserve more context and manipulate the stack directly).  Instead,
+// write our own wrapper, which does things our way, so we have complete
+// control over register saving and restoring.
+extern "C" {
+#if defined(__arm__)
+  void ARMCompilationCallback();
+  asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl " ASMPREFIX "ARMCompilationCallback\n"
+    ASMPREFIX "ARMCompilationCallback:\n"
+    // Save caller saved registers since they may contain stuff
+    // for the real target function right now. We have to act as if this
+    // whole compilation callback doesn't exist as far as the caller is
+    // concerned, so we can't just preserve the callee saved regs.
+    "stmdb sp!, {r0, r1, r2, r3, lr}\n"
+#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
+    "vstmdb sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
+#endif
+    // The LR contains the address of the stub function on entry.
+    // pass it as the argument to the C part of the callback
+    "mov  r0, lr\n"
+    "sub  sp, sp, #4\n"
+    // Call the C portion of the callback
+    "bl   " ASMPREFIX "ARMCompilationCallbackC\n"
+    "add  sp, sp, #4\n"
+    // Restoring the LR to the return address of the function that invoked
+    // the stub and de-allocating the stack space for it requires us to
+    // swap the two saved LR values on the stack, as they're backwards
+    // for what we need since the pop instruction has a pre-determined
+    // order for the registers.
+    //      +--------+
+    //   0  | LR     | Original return address
+    //      +--------+
+    //   1  | LR     | Stub address (start of stub)
+    // 2-5  | R3..R0 | Saved registers (we need to preserve all regs)
+    // 6-20 | D0..D7 | Saved VFP registers
+    //      +--------+
+    //
+#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
+    // Restore VFP caller-saved registers.
+    "vldmia sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
+#endif
+    //
+    //      We need to exchange the values in slots 0 and 1 so we can
+    //      return to the address in slot 1 with the address in slot 0
+    //      restored to the LR.
+    "ldr  r0, [sp,#20]\n"
+    "ldr  r1, [sp,#16]\n"
+    "str  r1, [sp,#20]\n"
+    "str  r0, [sp,#16]\n"
+    // Return to the (newly modified) stub to invoke the real function.
+    // The above twiddling of the saved return addresses allows us to
+    // deallocate everything, including the LR the stub saved, with two
+    // updating load instructions.
+    "ldmia  sp!, {r0, r1, r2, r3, lr}\n"
+    "ldr    pc, [sp], #4\n"
+      );
+#else  // Not an ARM host
+  void ARMCompilationCallback() {
+    llvm_unreachable("Cannot call ARMCompilationCallback() on a non-ARM arch!");
+  }
+#endif
+}
+
+/// ARMCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
+/// it into the JIT compiler function.
+extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) {
+  // Get the address of the compiled code for this function.
+  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)StubAddr);
+
+  // Rewrite the call target... so that we don't end up here every time we
+  // execute the call. We're replacing the first two instructions of the
+  // stub with:
+  //   ldr pc, [pc,#-4]
+  //   <addr>
+  if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) {
+    llvm_unreachable("ERROR: Unable to mark stub writable");
+  }
+  *(intptr_t *)StubAddr = 0xe51ff004;  // ldr pc, [pc, #-4]
+  *(intptr_t *)(StubAddr+4) = NewVal;
+  if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) {
+    llvm_unreachable("ERROR: Unable to mark stub executable");
+  }
+}
+
+TargetJITInfo::LazyResolverFn
+ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return ARMCompilationCallback;
+}
+
+void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr,
+                                             JITCodeEmitter &JCE) {
+  uint8_t Buffer[4];
+  uint8_t *Cur = Buffer;
+  MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)Ptr);
+  void *PtrAddr = JCE.allocIndirectGV(
+      GV, Buffer, sizeof(Buffer), /*Alignment=*/4);
+  addIndirectSymAddr(Ptr, (intptr_t)PtrAddr);
+  return PtrAddr;
+}
+
+TargetJITInfo::StubLayout ARMJITInfo::getStubLayout() {
+  // The stub contains up to 3 4-byte instructions, aligned at 4 bytes, and a
+  // 4-byte address.  See emitFunctionStub for details.
+  StubLayout Result = {16, 4};
+  return Result;
+}
+
+void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE) {
+  void *Addr;
+  // If this is just a call to an external function, emit a branch instead of a
+  // call.  The code is the same except for one bit of the last instruction.
+  if (Fn != (void*)(intptr_t)ARMCompilationCallback) {
+    // Branch to the corresponding function addr.
+    if (IsPIC) {
+      // The stub is 16-byte size and 4-aligned.
+      intptr_t LazyPtr = getIndirectSymAddr(Fn);
+      if (!LazyPtr) {
+        // In PIC mode, the function stub is loading a lazy-ptr.
+        LazyPtr= (intptr_t)emitGlobalValueIndirectSym((const GlobalValue*)F, Fn, JCE);
+        DEBUG(if (F)
+                errs() << "JIT: Indirect symbol emitted at [" << LazyPtr
+                       << "] for GV '" << F->getName() << "'\n";
+              else
+                errs() << "JIT: Stub emitted at [" << LazyPtr
+                       << "] for external function at '" << Fn << "'\n");
+      }
+      JCE.emitAlignment(4);
+      Addr = (void*)JCE.getCurrentPCValue();
+      if (!sys::Memory::setRangeWritable(Addr, 16)) {
+        llvm_unreachable("ERROR: Unable to mark stub writable");
+      }
+      JCE.emitWordLE(0xe59fc004);            // ldr ip, [pc, #+4]
+      JCE.emitWordLE(0xe08fc00c);            // L_func$scv: add ip, pc, ip
+      JCE.emitWordLE(0xe59cf000);            // ldr pc, [ip]
+      JCE.emitWordLE(LazyPtr - (intptr_t(Addr)+4+8));  // func - (L_func$scv+8)
+      sys::Memory::InvalidateInstructionCache(Addr, 16);
+      if (!sys::Memory::setRangeExecutable(Addr, 16)) {
+        llvm_unreachable("ERROR: Unable to mark stub executable");
+      }
+    } else {
+      // The stub is 8-byte size and 4-aligned.
+      JCE.emitAlignment(4);
+      Addr = (void*)JCE.getCurrentPCValue();
+      if (!sys::Memory::setRangeWritable(Addr, 8)) {
+        llvm_unreachable("ERROR: Unable to mark stub writable");
+      }
+      JCE.emitWordLE(0xe51ff004);    // ldr pc, [pc, #-4]
+      JCE.emitWordLE((intptr_t)Fn);  // addr of function
+      sys::Memory::InvalidateInstructionCache(Addr, 8);
+      if (!sys::Memory::setRangeExecutable(Addr, 8)) {
+        llvm_unreachable("ERROR: Unable to mark stub executable");
+      }
+    }
+  } else {
+    // The compilation callback will overwrite the first two words of this
+    // stub with indirect branch instructions targeting the compiled code.
+    // This stub sets the return address to restart the stub, so that
+    // the new branch will be invoked when we come back.
+    //
+    // Branch and link to the compilation callback.
+    // The stub is 16-byte size and 4-byte aligned.
+    JCE.emitAlignment(4);
+    Addr = (void*)JCE.getCurrentPCValue();
+    if (!sys::Memory::setRangeWritable(Addr, 16)) {
+      llvm_unreachable("ERROR: Unable to mark stub writable");
+    }
+    // Save LR so the callback can determine which stub called it.
+    // The compilation callback is responsible for popping this prior
+    // to returning.
+    JCE.emitWordLE(0xe92d4000); // push {lr}
+    // Set the return address to go back to the start of this stub.
+    JCE.emitWordLE(0xe24fe00c); // sub lr, pc, #12
+    // Invoke the compilation callback.
+    JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4]
+    // The address of the compilation callback.
+    JCE.emitWordLE((intptr_t)ARMCompilationCallback);
+    sys::Memory::InvalidateInstructionCache(Addr, 16);
+    if (!sys::Memory::setRangeExecutable(Addr, 16)) {
+      llvm_unreachable("ERROR: Unable to mark stub executable");
+    }
+  }
+
+  return Addr;
+}
+
+intptr_t ARMJITInfo::resolveRelocDestAddr(MachineRelocation *MR) const {
+  ARM::RelocationType RT = (ARM::RelocationType)MR->getRelocationType();
+  switch (RT) {
+  default:
+    return (intptr_t)(MR->getResultPointer());
+  case ARM::reloc_arm_pic_jt:
+    // Destination address - jump table base.
+    return (intptr_t)(MR->getResultPointer()) - MR->getConstantVal();
+  case ARM::reloc_arm_jt_base:
+    // Jump table base address.
+    return getJumpTableBaseAddr(MR->getJumpTableIndex());
+  case ARM::reloc_arm_cp_entry:
+  case ARM::reloc_arm_vfp_cp_entry:
+    // Constant pool entry address.
+    return getConstantPoolEntryAddr(MR->getConstantPoolIndex());
+  case ARM::reloc_arm_machine_cp_entry: {
+    ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)MR->getConstantVal();
+    assert((!ACPV->hasModifier() && !ACPV->mustAddCurrentAddress()) &&
+           "Can't handle this machine constant pool entry yet!");
+    intptr_t Addr = (intptr_t)(MR->getResultPointer());
+    Addr -= getPCLabelAddr(ACPV->getLabelId()) + ACPV->getPCAdjustment();
+    return Addr;
+  }
+  }
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = resolveRelocDestAddr(MR);
+    switch ((ARM::RelocationType)MR->getRelocationType()) {
+    case ARM::reloc_arm_cp_entry:
+    case ARM::reloc_arm_vfp_cp_entry:
+    case ARM::reloc_arm_relative: {
+      // It is necessary to calculate the correct PC relative value. We
+      // subtract the base addr from the target addr to form a byte offset.
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      // If the result is positive, set bit U(23) to 1.
+      if (ResultPtr >= 0)
+        *((intptr_t*)RelocPos) |= 1 << ARMII::U_BitShift;
+      else {
+        // Otherwise, obtain the absolute value and set bit U(23) to 0.
+        *((intptr_t*)RelocPos) &= ~(1 << ARMII::U_BitShift);
+        ResultPtr = - ResultPtr;
+      }
+      // Set the immed value calculated.
+      // VFP immediate offset is multiplied by 4.
+      if (MR->getRelocationType() == ARM::reloc_arm_vfp_cp_entry)
+        ResultPtr = ResultPtr >> 2;
+      *((intptr_t*)RelocPos) |= ResultPtr;
+      // Set register Rn to PC (which is register 15 on all architectures).
+      // FIXME: This avoids the need for register info in the JIT class.
+      *((intptr_t*)RelocPos) |= 15 << ARMII::RegRnShift;
+      break;
+    }
+    case ARM::reloc_arm_pic_jt:
+    case ARM::reloc_arm_machine_cp_entry:
+    case ARM::reloc_arm_absolute: {
+      // These addresses have already been resolved.
+      *((intptr_t*)RelocPos) |= (intptr_t)ResultPtr;
+      break;
+    }
+    case ARM::reloc_arm_branch: {
+      // It is necessary to calculate the correct value of signed_immed_24
+      // field. We subtract the base addr from the target addr to form a
+      // byte offset, which must be inside the range -33554432 and +33554428.
+      // Then, we set the signed_immed_24 field of the instruction to bits
+      // [25:2] of the byte offset. More details ARM-ARM p. A4-11.
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      ResultPtr = (ResultPtr & 0x03FFFFFC) >> 2;
+      assert(ResultPtr >= -33554432 && ResultPtr <= 33554428);
+      *((intptr_t*)RelocPos) |= ResultPtr;
+      break;
+    }
+    case ARM::reloc_arm_jt_base: {
+      // JT base - (instruction addr + 8)
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      *((intptr_t*)RelocPos) |= ResultPtr;
+      break;
+    }
+    case ARM::reloc_arm_movw: {
+      ResultPtr = ResultPtr & 0xFFFF; 
+      *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF;
+      *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16;
+      break;
+    }
+    case ARM::reloc_arm_movt: {
+      ResultPtr = (ResultPtr >> 16) & 0xFFFF; 
+      *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF;
+      *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16;
+      break;
+    }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMJITInfo.h b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h
new file mode 100644
index 000000000000..23a6a9b512f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMJITInfo.h
@@ -0,0 +1,183 @@
+//===-- ARMJITInfo.h - ARM implementation of the JIT interface  -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMJITINFO_H
+#define ARMJITINFO_H
+
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class ARMTargetMachine;
+
+  class ARMJITInfo : public TargetJITInfo {
+    // ConstPoolId2AddrMap - A map from constant pool ids to the corresponding
+    // CONSTPOOL_ENTRY addresses.
+    SmallVector<intptr_t, 16> ConstPoolId2AddrMap;
+
+    // JumpTableId2AddrMap - A map from inline jumptable ids to the
+    // corresponding inline jump table bases.
+    SmallVector<intptr_t, 16> JumpTableId2AddrMap;
+
+    // PCLabelMap - A map from PC labels to addresses.
+    DenseMap<unsigned, intptr_t> PCLabelMap;
+
+    // Sym2IndirectSymMap - A map from symbol (GlobalValue and ExternalSymbol)
+    // addresses to their indirect symbol addresses.
+    DenseMap<void*, intptr_t> Sym2IndirectSymMap;
+
+    // IsPIC - True if the relocation model is PIC. This is used to determine
+    // how to codegen function stubs.
+    bool IsPIC;
+
+  public:
+    explicit ARMJITInfo() : IsPIC(false) { useGOT = false; }
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+    /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+    /// to emit an indirect symbol which contains the address of the specified
+    /// ptr.
+    virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+                                            JITCodeEmitter &JCE);
+
+    // getStubLayout - Returns the size and alignment of the largest call stub
+    // on ARM.
+    virtual StubLayout getStubLayout();
+
+    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    virtual void *emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE);
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+
+    /// hasCustomConstantPool - Allows a target to specify that constant
+    /// pool address resolution is handled by the target.
+    virtual bool hasCustomConstantPool() const { return true; }
+
+    /// hasCustomJumpTables - Allows a target to specify that jumptables
+    /// are emitted by the target.
+    virtual bool hasCustomJumpTables() const { return true; }
+
+    /// allocateSeparateGVMemory - If true, globals should be placed in
+    /// separately allocated heap memory rather than in the same
+    /// code memory allocated by JITCodeEmitter.
+    virtual bool allocateSeparateGVMemory() const {
+#ifdef __APPLE__
+      return true;
+#else
+      return false;
+#endif
+    }
+
+    /// Initialize - Initialize internal stage for the function being JITted.
+    /// Resize constant pool ids to CONSTPOOL_ENTRY addresses map; resize
+    /// jump table ids to jump table bases map; remember if codegen relocation
+    /// model is PIC.
+    void Initialize(const MachineFunction &MF, bool isPIC) {
+      const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      ConstPoolId2AddrMap.resize(AFI->getNumPICLabels());
+      JumpTableId2AddrMap.resize(AFI->getNumJumpTables());
+      IsPIC = isPIC;
+    }
+
+    /// getConstantPoolEntryAddr - The ARM target puts all constant
+    /// pool entries into constant islands. This returns the address of the
+    /// constant pool entry of the specified index.
+    intptr_t getConstantPoolEntryAddr(unsigned CPI) const {
+      assert(CPI < ConstPoolId2AddrMap.size());
+      return ConstPoolId2AddrMap[CPI];
+    }
+
+    /// addConstantPoolEntryAddr - Map a Constant Pool Index to the address
+    /// where its associated value is stored. When relocations are processed,
+    /// this value will be used to resolve references to the constant.
+    void addConstantPoolEntryAddr(unsigned CPI, intptr_t Addr) {
+      assert(CPI < ConstPoolId2AddrMap.size());
+      ConstPoolId2AddrMap[CPI] = Addr;
+    }
+
+    /// getJumpTableBaseAddr - The ARM target inline all jump tables within
+    /// text section of the function. This returns the address of the base of
+    /// the jump table of the specified index.
+    intptr_t getJumpTableBaseAddr(unsigned JTI) const {
+      assert(JTI < JumpTableId2AddrMap.size());
+      return JumpTableId2AddrMap[JTI];
+    }
+
+    /// addJumpTableBaseAddr - Map a jump table index to the address where
+    /// the corresponding inline jump table is emitted. When relocations are
+    /// processed, this value will be used to resolve references to the
+    /// jump table.
+    void addJumpTableBaseAddr(unsigned JTI, intptr_t Addr) {
+      assert(JTI < JumpTableId2AddrMap.size());
+      JumpTableId2AddrMap[JTI] = Addr;
+    }
+
+    /// getPCLabelAddr - Retrieve the address of the PC label of the
+    /// specified id.
+    intptr_t getPCLabelAddr(unsigned Id) const {
+      DenseMap<unsigned, intptr_t>::const_iterator I = PCLabelMap.find(Id);
+      assert(I != PCLabelMap.end());
+      return I->second;
+    }
+
+    /// addPCLabelAddr - Remember the address of the specified PC label.
+    void addPCLabelAddr(unsigned Id, intptr_t Addr) {
+      PCLabelMap.insert(std::make_pair(Id, Addr));
+    }
+
+    /// getIndirectSymAddr - Retrieve the address of the indirect symbol of the
+    /// specified symbol located at address. Returns 0 if the indirect symbol
+    /// has not been emitted.
+    intptr_t getIndirectSymAddr(void *Addr) const {
+      DenseMap<void*,intptr_t>::const_iterator I= Sym2IndirectSymMap.find(Addr);
+      if (I != Sym2IndirectSymMap.end())
+        return I->second;
+      return 0;
+    }
+
+    /// addIndirectSymAddr - Add a mapping from address of an emitted symbol to
+    /// its indirect symbol address.
+    void addIndirectSymAddr(void *SymAddr, intptr_t IndSymAddr) {
+      Sym2IndirectSymMap.insert(std::make_pair(SymAddr, IndSymAddr));
+    }
+
+  private:
+    /// resolveRelocDestAddr - Resolve the resulting address of the relocation
+    /// if it's not already solved. Constantpool entries must be resolved by
+    /// ARM target.
+    intptr_t resolveRelocDestAddr(MachineRelocation *MR) const;
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
new file mode 100644
index 000000000000..b7ac5d57c362
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -0,0 +1,2062 @@
+//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-ldst-opt"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+STATISTIC(NumLDMGened , "Number of ldm instructions generated");
+STATISTIC(NumSTMGened , "Number of stm instructions generated");
+STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
+STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
+STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
+STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
+STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
+STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
+STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
+STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
+STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
+
+/// ARMAllocLoadStoreOpt - Post- register allocation pass the combine
+/// load / store instructions to form ldm / stm instructions.
+
+namespace {
+  struct ARMLoadStoreOpt : public MachineFunctionPass {
+    static char ID;
+    ARMLoadStoreOpt() : MachineFunctionPass(ID) {}
+
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+    RegScavenger *RS;
+    bool isThumb2;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM load / store optimization pass";
+    }
+
+  private:
+    struct MemOpQueueEntry {
+      int Offset;
+      unsigned Reg;
+      bool isKill;
+      unsigned Position;
+      MachineBasicBlock::iterator MBBI;
+      bool Merged;
+      MemOpQueueEntry(int o, unsigned r, bool k, unsigned p,
+                      MachineBasicBlock::iterator i)
+        : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {}
+    };
+    class UnitRegsMap {
+    public:
+      UnitRegsMap(const TargetRegisterInfo* _TRI) : TRI(_TRI) {}
+      const SmallVector<unsigned, 4>& operator[](unsigned Reg) {
+        DenseMap<unsigned, SmallVector<unsigned, 4> >::iterator found =
+            Cache.find(Reg);
+        if (found != Cache.end())
+          return found->second;
+        else
+          return Cache.insert(std::make_pair(Reg, this->getUnitRegs(Reg)))
+                      .first->second;
+      }
+    private:
+      SmallVector<unsigned, 4> getUnitRegs(unsigned Reg) {
+        SmallVector<unsigned, 4> Res;
+
+        const TargetRegisterClass* TRC = TRI->getMinimalPhysRegClass(Reg);
+        if (TRC == &ARM::QPRRegClass) {
+          if (Reg > ARM::Q7) {
+            Res.push_back(TRI->getSubReg(Reg, ARM::dsub_0));
+            Res.push_back(TRI->getSubReg(Reg, ARM::dsub_1));
+            return Res;
+          }
+
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_0));
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_1));
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_2));
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_3));
+
+          return Res;
+        }
+
+        if (TRC == &ARM::DPRRegClass && Reg < ARM::D15) {
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_0));
+          Res.push_back(TRI->getSubReg(Reg, ARM::ssub_1));
+
+          return Res;
+        }
+
+        Res.push_back(Reg);
+
+        return Res;
+
+      }
+      const TargetRegisterInfo* TRI;
+      DenseMap<unsigned, SmallVector<unsigned, 4> > Cache;
+    };
+    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
+    typedef MemOpQueue::iterator MemOpQueueIter;
+
+    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  int Offset, unsigned Base, bool BaseKill, int Opcode,
+                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
+                  DebugLoc dl,
+                  ArrayRef<std::pair<unsigned, bool> > Regs,
+                  ArrayRef<unsigned> ImpDefs);
+    void MergeOpsUpdate(MachineBasicBlock &MBB,
+                        MemOpQueue &MemOps,
+                        unsigned memOpsBegin,
+                        unsigned memOpsEnd,
+                        unsigned insertAfter,
+                        int Offset,
+                        unsigned Base,
+                        bool BaseKill,
+                        int Opcode,
+                        ARMCC::CondCodes Pred,
+                        unsigned PredReg,
+                        unsigned Scratch,
+                        DebugLoc dl,
+                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
+    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
+                      int Opcode, unsigned Size,
+                      ARMCC::CondCodes Pred, unsigned PredReg,
+                      unsigned Scratch, MemOpQueue &MemOps,
+                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);
+
+    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI);
+    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  const TargetInstrInfo *TII,
+                                  bool &Advance,
+                                  MachineBasicBlock::iterator &I);
+    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   bool &Advance,
+                                   MachineBasicBlock::iterator &I);
+    unsigned AddMemOp(MemOpQueue& MemOps,
+                      const MemOpQueueEntry newEntry,
+                      UnitRegsMap& UnitRegsInfo,
+                      SmallSet<unsigned, 4>& UsedUnitRegs,
+                      unsigned At = -1U);
+    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+  };
+  char ARMLoadStoreOpt::ID = 0;
+}
+
+static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDRi12:
+    ++NumLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::LDMIA;
+    case ARM_AM::da: return ARM::LDMDA;
+    case ARM_AM::db: return ARM::LDMDB;
+    case ARM_AM::ib: return ARM::LDMIB;
+    }
+  case ARM::STRi12:
+    ++NumSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::STMIA;
+    case ARM_AM::da: return ARM::STMDA;
+    case ARM_AM::db: return ARM::STMDB;
+    case ARM_AM::ib: return ARM::STMIB;
+    }
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    ++NumLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2LDMIA;
+    case ARM_AM::db: return ARM::t2LDMDB;
+    }
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    ++NumSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2STMIA;
+    case ARM_AM::db: return ARM::t2STMDB;
+    }
+  case ARM::VLDRS:
+    ++NumVLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMSIA;
+    case ARM_AM::db: return 0; // Only VLDMSDB_UPD exists.
+    }
+  case ARM::VSTRS:
+    ++NumVSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMSIA;
+    case ARM_AM::db: return 0; // Only VSTMSDB_UPD exists.
+    }
+  case ARM::VLDRD:
+    ++NumVLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMDIA;
+    case ARM_AM::db: return 0; // Only VLDMDDB_UPD exists.
+    }
+  case ARM::VSTRD:
+    ++NumVSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMDIA;
+    case ARM_AM::db: return 0; // Only VSTMDDB_UPD exists.
+    }
+  }
+}
+
+namespace llvm {
+  namespace ARM_AM {
+
+AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDMIA_RET:
+  case ARM::LDMIA:
+  case ARM::LDMIA_UPD:
+  case ARM::STMIA:
+  case ARM::STMIA_UPD:
+  case ARM::t2LDMIA_RET:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2STMIA:
+  case ARM::t2STMIA_UPD:
+  case ARM::VLDMSIA:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VSTMSIA:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VLDMDIA:
+  case ARM::VLDMDIA_UPD:
+  case ARM::VSTMDIA:
+  case ARM::VSTMDIA_UPD:
+    return ARM_AM::ia;
+
+  case ARM::LDMDA:
+  case ARM::LDMDA_UPD:
+  case ARM::STMDA:
+  case ARM::STMDA_UPD:
+    return ARM_AM::da;
+
+  case ARM::LDMDB:
+  case ARM::LDMDB_UPD:
+  case ARM::STMDB:
+  case ARM::STMDB_UPD:
+  case ARM::t2LDMDB:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMDB:
+  case ARM::t2STMDB_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMSDB_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    return ARM_AM::db;
+
+  case ARM::LDMIB:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIB:
+  case ARM::STMIB_UPD:
+    return ARM_AM::ib;
+  }
+}
+
+  } // end namespace ARM_AM
+} // end namespace llvm
+
+static bool isT2i32Load(unsigned Opc) {
+  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
+}
+
+static bool isi32Load(unsigned Opc) {
+  return Opc == ARM::LDRi12 || isT2i32Load(Opc);
+}
+
+static bool isT2i32Store(unsigned Opc) {
+  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
+}
+
+static bool isi32Store(unsigned Opc) {
+  return Opc == ARM::STRi12 || isT2i32Store(Opc);
+}
+
+/// MergeOps - Create and insert a LDM or STM with Base as base register and
+/// registers in Regs as the register operands that would be loaded / stored.
+/// It returns true if the transformation is done.
+bool
+ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          int Offset, unsigned Base, bool BaseKill,
+                          int Opcode, ARMCC::CondCodes Pred,
+                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
+                          ArrayRef<std::pair<unsigned, bool> > Regs,
+                          ArrayRef<unsigned> ImpDefs) {
+  // Only a single register to load / store. Don't bother.
+  unsigned NumRegs = Regs.size();
+  if (NumRegs <= 1)
+    return false;
+
+  ARM_AM::AMSubMode Mode = ARM_AM::ia;
+  // VFP and Thumb2 do not support IB or DA modes.
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  bool haveIBAndDA = isNotVFP && !isThumb2;
+  if (Offset == 4 && haveIBAndDA)
+    Mode = ARM_AM::ib;
+  else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA)
+    Mode = ARM_AM::da;
+  else if (Offset == -4 * (int)NumRegs && isNotVFP)
+    // VLDM/VSTM do not support DB mode without also updating the base reg.
+    Mode = ARM_AM::db;
+  else if (Offset != 0) {
+    // Check if this is a supported opcode before we insert instructions to
+    // calculate a new base register.
+    if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;
+
+    // If starting offset isn't zero, insert a MI to materialize a new base.
+    // But only do so if it is cost effective, i.e. merging more than two
+    // loads / stores.
+    if (NumRegs <= 2)
+      return false;
+
+    unsigned NewBase;
+    if (isi32Load(Opcode))
+      // If it is a load, then just use one of the destination register to
+      // use as the new base.
+      NewBase = Regs[NumRegs-1].first;
+    else {
+      // Use the scratch register to use as a new base.
+      NewBase = Scratch;
+      if (NewBase == 0)
+        return false;
+    }
+    int BaseOpc = !isThumb2 ? ARM::ADDri : ARM::t2ADDri;
+    if (Offset < 0) {
+      BaseOpc = !isThumb2 ? ARM::SUBri : ARM::t2SUBri;
+      Offset = - Offset;
+    }
+    int ImmedOffset = isThumb2
+      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
+    if (ImmedOffset == -1)
+      // FIXME: Try t2ADDri12 or t2SUBri12?
+      return false;  // Probably not worth it then.
+
+    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
+      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+      .addImm(Pred).addReg(PredReg).addReg(0);
+    Base = NewBase;
+    BaseKill = true;  // New base is always killed right its use.
+  }
+
+  bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
+                Opcode == ARM::VLDRD);
+  Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
+  if (!Opcode) return false;
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode))
+    .addReg(Base, getKillRegState(BaseKill))
+    .addImm(Pred).addReg(PredReg);
+  for (unsigned i = 0; i != NumRegs; ++i)
+    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
+                     | getKillRegState(Regs[i].second));
+
+  // Add implicit defs for super-registers.
+  for (unsigned i = 0, e = ImpDefs.size(); i != e; ++i)
+    MIB.addReg(ImpDefs[i], RegState::ImplicitDefine);
+
+  return true;
+}
+
+// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
+// success.
+void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
+                                     MemOpQueue &memOps,
+                                     unsigned memOpsBegin, unsigned memOpsEnd,
+                                     unsigned insertAfter, int Offset,
+                                     unsigned Base, bool BaseKill,
+                                     int Opcode,
+                                     ARMCC::CondCodes Pred, unsigned PredReg,
+                                     unsigned Scratch,
+                                     DebugLoc dl,
+                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
+  // First calculate which of the registers should be killed by the merged
+  // instruction.
+  const unsigned insertPos = memOps[insertAfter].Position;
+  SmallSet<unsigned, 4> KilledRegs;
+  DenseMap<unsigned, unsigned> Killer;
+  for (unsigned i = 0, e = memOps.size(); i != e; ++i) {
+    if (i == memOpsBegin) {
+      i = memOpsEnd;
+      if (i == e)
+        break;
+    }
+    if (memOps[i].Position < insertPos && memOps[i].isKill) {
+      unsigned Reg = memOps[i].Reg;
+      KilledRegs.insert(Reg);
+      Killer[Reg] = i;
+    }
+  }
+
+  SmallVector<std::pair<unsigned, bool>, 8> Regs;
+  SmallVector<unsigned, 8> ImpDefs;
+  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
+    unsigned Reg = memOps[i].Reg;
+    // If we are inserting the merged operation after an operation that
+    // uses the same register, make sure to transfer any kill flag.
+    bool isKill = memOps[i].isKill || KilledRegs.count(Reg);
+    Regs.push_back(std::make_pair(Reg, isKill));
+
+    // Collect any implicit defs of super-registers. They must be preserved.
+    for (MIOperands MO(memOps[i].MBBI); MO.isValid(); ++MO) {
+      if (!MO->isReg() || !MO->isDef() || !MO->isImplicit() || MO->isDead())
+        continue;
+      unsigned DefReg = MO->getReg();
+      if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
+        ImpDefs.push_back(DefReg);
+    }
+  }
+
+  // Try to do the merge.
+  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
+  ++Loc;
+  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
+                Pred, PredReg, Scratch, dl, Regs, ImpDefs))
+    return;
+
+  // Merge succeeded, update records.
+  Merges.push_back(prior(Loc));
+  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
+    // Remove kill flags from any memops that come before insertPos.
+    if (Regs[i-memOpsBegin].second) {
+      unsigned Reg = Regs[i-memOpsBegin].first;
+      if (KilledRegs.count(Reg)) {
+        unsigned j = Killer[Reg];
+        int Idx = memOps[j].MBBI->findRegisterUseOperandIdx(Reg, true);
+        assert(Idx >= 0 && "Cannot find killing operand");
+        memOps[j].MBBI->getOperand(Idx).setIsKill(false);
+        memOps[j].isKill = false;
+      }
+      memOps[i].isKill = true;
+    }
+    MBB.erase(memOps[i].MBBI);
+    // Update this memop to refer to the merged instruction.
+    // We may need to move kill flags again.
+    memOps[i].Merged = true;
+    memOps[i].MBBI = Merges.back();
+    memOps[i].Position = insertPos;
+  }
+}
+
+/// MergeLDR_STR - Merge a number of load / store instructions into one or more
+/// load / store multiple instructions.
+void
+ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
+                          unsigned Base, int Opcode, unsigned Size,
+                          ARMCC::CondCodes Pred, unsigned PredReg,
+                          unsigned Scratch, MemOpQueue &MemOps,
+                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
+  bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
+  int Offset = MemOps[SIndex].Offset;
+  int SOffset = Offset;
+  unsigned insertAfter = SIndex;
+  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
+  DebugLoc dl = Loc->getDebugLoc();
+  const MachineOperand &PMO = Loc->getOperand(0);
+  unsigned PReg = PMO.getReg();
+  unsigned PRegNum = PMO.isUndef() ? UINT_MAX : TRI->getEncodingValue(PReg);
+  unsigned Count = 1;
+  unsigned Limit = ~0U;
+
+  // vldm / vstm limit are 32 for S variants, 16 for D variants.
+
+  switch (Opcode) {
+  default: break;
+  case ARM::VSTRS:
+    Limit = 32;
+    break;
+  case ARM::VSTRD:
+    Limit = 16;
+    break;
+  case ARM::VLDRD:
+    Limit = 16;
+    break;
+  case ARM::VLDRS:
+    Limit = 32;
+    break;
+  }
+
+  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
+    int NewOffset = MemOps[i].Offset;
+    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
+    unsigned Reg = MO.getReg();
+    unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+    // Register numbers must be in ascending order. For VFP / NEON load and
+    // store multiples, the registers must also be consecutive and within the
+    // limit on the number of registers per instruction.
+    if (Reg != ARM::SP &&
+        NewOffset == Offset + (int)Size &&
+        ((isNotVFP && RegNum > PRegNum) ||
+         ((Count < Limit) && RegNum == PRegNum+1))) {
+      Offset += Size;
+      PRegNum = RegNum;
+      ++Count;
+    } else {
+      // Can't merge this in. Try merge the earlier ones first.
+      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset,
+                     Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges);
+      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
+                   MemOps, Merges);
+      return;
+    }
+
+    if (MemOps[i].Position > MemOps[insertAfter].Position)
+      insertAfter = i;
+  }
+
+  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
+  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
+                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
+  return;
+}
+
+static bool definesCPSR(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead())
+      // If the instruction has live CPSR def, then it's not safe to fold it
+      // into load / store.
+      return true;
+  }
+
+  return false;
+}
+
+static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
+                                unsigned Bytes, unsigned Limit,
+                                ARMCC::CondCodes Pred, unsigned PredReg) {
+  unsigned MyPredReg = 0;
+  if (!MI)
+    return false;
+
+  bool CheckCPSRDef = false;
+  switch (MI->getOpcode()) {
+  default: return false;
+  case ARM::t2SUBri:
+  case ARM::SUBri:
+    CheckCPSRDef = true;
+  // fallthrough
+  case ARM::tSUBspi:
+    break;
+  }
+
+  // Make sure the offset fits in 8 bits.
+  if (Bytes == 0 || (Limit && Bytes >= Limit))
+    return false;
+
+  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
+  if (!(MI->getOperand(0).getReg() == Base &&
+        MI->getOperand(1).getReg() == Base &&
+        (MI->getOperand(2).getImm()*Scale) == Bytes &&
+        getInstrPredicate(MI, MyPredReg) == Pred &&
+        MyPredReg == PredReg))
+    return false;
+
+  return CheckCPSRDef ? !definesCPSR(MI) : true;
+}
+
+static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
+                                unsigned Bytes, unsigned Limit,
+                                ARMCC::CondCodes Pred, unsigned PredReg) {
+  unsigned MyPredReg = 0;
+  if (!MI)
+    return false;
+
+  bool CheckCPSRDef = false;
+  switch (MI->getOpcode()) {
+  default: return false;
+  case ARM::t2ADDri:
+  case ARM::ADDri:
+    CheckCPSRDef = true;
+  // fallthrough
+  case ARM::tADDspi:
+    break;
+  }
+
+  if (Bytes == 0 || (Limit && Bytes >= Limit))
+    // Make sure the offset fits in 8 bits.
+    return false;
+
+  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
+  if (!(MI->getOperand(0).getReg() == Base &&
+        MI->getOperand(1).getReg() == Base &&
+        (MI->getOperand(2).getImm()*Scale) == Bytes &&
+        getInstrPredicate(MI, MyPredReg) == Pred &&
+        MyPredReg == PredReg))
+    return false;
+
+  return CheckCPSRDef ? !definesCPSR(MI) : true;
+}
+
+static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return 4;
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return 8;
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+  case ARM::VLDMSIA:
+  case ARM::VSTMSIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 4;
+  case ARM::VLDMDIA:
+  case ARM::VSTMDIA:
+    return (MI->getNumOperands() - MI->getDesc().getNumOperands() + 1) * 8;
+  }
+}
+
+static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
+                                            ARM_AM::AMSubMode Mode) {
+  switch (Opc) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::LDMIA:
+  case ARM::LDMDA:
+  case ARM::LDMDB:
+  case ARM::LDMIB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::LDMIA_UPD;
+    case ARM_AM::ib: return ARM::LDMIB_UPD;
+    case ARM_AM::da: return ARM::LDMDA_UPD;
+    case ARM_AM::db: return ARM::LDMDB_UPD;
+    }
+  case ARM::STMIA:
+  case ARM::STMDA:
+  case ARM::STMDB:
+  case ARM::STMIB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::STMIA_UPD;
+    case ARM_AM::ib: return ARM::STMIB_UPD;
+    case ARM_AM::da: return ARM::STMDA_UPD;
+    case ARM_AM::db: return ARM::STMDB_UPD;
+    }
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2LDMIA_UPD;
+    case ARM_AM::db: return ARM::t2LDMDB_UPD;
+    }
+  case ARM::t2STMIA:
+  case ARM::t2STMDB:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::t2STMIA_UPD;
+    case ARM_AM::db: return ARM::t2STMDB_UPD;
+    }
+  case ARM::VLDMSIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMSIA_UPD;
+    case ARM_AM::db: return ARM::VLDMSDB_UPD;
+    }
+  case ARM::VLDMDIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VLDMDIA_UPD;
+    case ARM_AM::db: return ARM::VLDMDDB_UPD;
+    }
+  case ARM::VSTMSIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMSIA_UPD;
+    case ARM_AM::db: return ARM::VSTMSDB_UPD;
+    }
+  case ARM::VSTMDIA:
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::VSTMDIA_UPD;
+    case ARM_AM::db: return ARM::VSTMDDB_UPD;
+    }
+  }
+}
+
+/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base
+/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
+///
+/// stmia rn, <ra, rb, rc>
+/// rn := rn + 4 * 3;
+/// =>
+/// stmia rn!, <ra, rb, rc>
+///
+/// rn := rn - 4 * 3;
+/// ldmia rn, <ra, rb, rc>
+/// =>
+/// ldmdb rn!, <ra, rb, rc>
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               bool &Advance,
+                                               MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = MBBI;
+  unsigned Base = MI->getOperand(0).getReg();
+  bool BaseKill = MI->getOperand(0).isKill();
+  unsigned Bytes = getLSMultipleTransferSize(MI);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  int Opcode = MI->getOpcode();
+  DebugLoc dl = MI->getDebugLoc();
+
+  // Can't use an updating ld/st if the base register is also a dest
+  // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
+  for (unsigned i = 2, e = MI->getNumOperands(); i != e; ++i)
+    if (MI->getOperand(i).getReg() == Base)
+      return false;
+
+  bool DoMerge = false;
+  ARM_AM::AMSubMode Mode = ARM_AM::getLoadStoreMultipleSubMode(Opcode);
+
+  // Try merging with the previous instruction.
+  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  if (MBBI != BeginMBBI) {
+    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
+      --PrevMBBI;
+    if (Mode == ARM_AM::ia &&
+        isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
+      Mode = ARM_AM::db;
+      DoMerge = true;
+    } else if (Mode == ARM_AM::ib &&
+               isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
+      Mode = ARM_AM::da;
+      DoMerge = true;
+    }
+    if (DoMerge)
+      MBB.erase(PrevMBBI);
+  }
+
+  // Try merging with the next instruction.
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  if (!DoMerge && MBBI != EndMBBI) {
+    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
+    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+      ++NextMBBI;
+    if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
+        isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
+      DoMerge = true;
+    } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
+               isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
+      DoMerge = true;
+    }
+    if (DoMerge) {
+      if (NextMBBI == I) {
+        Advance = true;
+        ++I;
+      }
+      MBB.erase(NextMBBI);
+    }
+  }
+
+  if (!DoMerge)
+    return false;
+
+  unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode);
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+    .addReg(Base, getDefRegState(true)) // WB base register
+    .addReg(Base, getKillRegState(BaseKill))
+    .addImm(Pred).addReg(PredReg);
+
+  // Transfer the rest of operands.
+  for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
+    MIB.addOperand(MI->getOperand(OpNum));
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  MBB.erase(MBBI);
+  return true;
+}
+
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
+                                             ARM_AM::AddrOpc Mode) {
+  switch (Opc) {
+  case ARM::LDRi12:
+    return ARM::LDR_PRE_IMM;
+  case ARM::STRi12:
+    return ARM::STR_PRE_IMM;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_PRE;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_PRE;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+}
+
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
+                                              ARM_AM::AddrOpc Mode) {
+  switch (Opc) {
+  case ARM::LDRi12:
+    return ARM::LDR_POST_IMM;
+  case ARM::STRi12:
+    return ARM::STR_POST_IMM;
+  case ARM::VLDRS:
+    return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
+  case ARM::VLDRD:
+    return Mode == ARM_AM::add ? ARM::VLDMDIA_UPD : ARM::VLDMDDB_UPD;
+  case ARM::VSTRS:
+    return Mode == ARM_AM::add ? ARM::VSTMSIA_UPD : ARM::VSTMSDB_UPD;
+  case ARM::VSTRD:
+    return Mode == ARM_AM::add ? ARM::VSTMDIA_UPD : ARM::VSTMDDB_UPD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_POST;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_POST;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+}
+
+/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
+/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
+bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               const TargetInstrInfo *TII,
+                                               bool &Advance,
+                                               MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = MBBI;
+  unsigned Base = MI->getOperand(1).getReg();
+  bool BaseKill = MI->getOperand(1).isKill();
+  unsigned Bytes = getLSMultipleTransferSize(MI);
+  int Opcode = MI->getOpcode();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
+                Opcode == ARM::VSTRD || Opcode == ARM::VSTRS);
+  bool isAM2 = (Opcode == ARM::LDRi12 || Opcode == ARM::STRi12);
+  if (isi32Load(Opcode) || isi32Store(Opcode))
+    if (MI->getOperand(2).getImm() != 0)
+      return false;
+  if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
+    return false;
+
+  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
+  // Can't do the merge if the destination register is the same as the would-be
+  // writeback register.
+  if (MI->getOperand(0).getReg() == Base)
+    return false;
+
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool DoMerge = false;
+  ARM_AM::AddrOpc AddSub = ARM_AM::add;
+  unsigned NewOpc = 0;
+  // AM2 - 12 bits, thumb2 - 8 bits.
+  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
+
+  // Try merging with the previous instruction.
+  MachineBasicBlock::iterator BeginMBBI = MBB.begin();
+  if (MBBI != BeginMBBI) {
+    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+    while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue())
+      --PrevMBBI;
+    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
+      DoMerge = true;
+      AddSub = ARM_AM::sub;
+    } else if (!isAM5 &&
+               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
+      DoMerge = true;
+    }
+    if (DoMerge) {
+      NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub);
+      MBB.erase(PrevMBBI);
+    }
+  }
+
+  // Try merging with the next instruction.
+  MachineBasicBlock::iterator EndMBBI = MBB.end();
+  if (!DoMerge && MBBI != EndMBBI) {
+    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
+    while (NextMBBI != EndMBBI && NextMBBI->isDebugValue())
+      ++NextMBBI;
+    if (!isAM5 &&
+        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
+      DoMerge = true;
+      AddSub = ARM_AM::sub;
+    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
+      DoMerge = true;
+    }
+    if (DoMerge) {
+      NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub);
+      if (NextMBBI == I) {
+        Advance = true;
+        ++I;
+      }
+      MBB.erase(NextMBBI);
+    }
+  }
+
+  if (!DoMerge)
+    return false;
+
+  if (isAM5) {
+    // VLDM[SD}_UPD, VSTM[SD]_UPD
+    // (There are no base-updating versions of VLDR/VSTR instructions, but the
+    // updating load/store-multiple instructions can be used with only one
+    // register.)
+    MachineOperand &MO = MI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
+      .addReg(Base, getDefRegState(true)) // WB base register
+      .addReg(Base, getKillRegState(isLd ? BaseKill : false))
+      .addImm(Pred).addReg(PredReg)
+      .addReg(MO.getReg(), (isLd ? getDefRegState(true) :
+                            getKillRegState(MO.isKill())));
+  } else if (isLd) {
+    if (isAM2) {
+      // LDR_PRE, LDR_POST
+      if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
+        int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
+        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+      } else {
+        int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+        BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+          .addReg(Base, RegState::Define)
+          .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+      }
+    } else {
+      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
+      // t2LDR_PRE, t2LDR_POST
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+        .addReg(Base, RegState::Define)
+        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+    }
+  } else {
+    MachineOperand &MO = MI->getOperand(0);
+    // FIXME: post-indexed stores use am2offset_imm, which still encodes
+    // the vestigal zero-reg offset register. When that's fixed, this clause
+    // can be removed entirely.
+    if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
+      int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+      // STR_PRE, STR_POST
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+    } else {
+      int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
+      // t2STR_PRE, t2STR_POST
+      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
+        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+    }
+  }
+  MBB.erase(MBBI);
+
+  return true;
+}
+
+/// isMemoryOp - Returns true if instruction is a memory operation that this
+/// pass is capable of operating on.
+static bool isMemoryOp(const MachineInstr *MI) {
+  // When no memory operands are present, conservatively assume unaligned,
+  // volatile, unfoldable.
+  if (!MI->hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand *MMO = *MI->memoperands_begin();
+
+  // Don't touch volatile memory accesses - we may be changing their order.
+  if (MMO->isVolatile())
+    return false;
+
+  // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
+  // not.
+  if (MMO->getAlignment() < 4)
+    return false;
+
+  // str <undef> could probably be eliminated entirely, but for now we just want
+  // to avoid making a mess of it.
+  // FIXME: Use str <undef> as a wildcard to enable better stm folding.
+  if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() &&
+      MI->getOperand(0).isUndef())
+    return false;
+
+  // Likewise don't mess with references to undefined addresses.
+  if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() &&
+      MI->getOperand(1).isUndef())
+    return false;
+
+  int Opcode = MI->getOpcode();
+  switch (Opcode) {
+  default: break;
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return MI->getOperand(1).isReg();
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return MI->getOperand(1).isReg();
+  case ARM::LDRi12:
+  case ARM::STRi12:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return MI->getOperand(1).isReg();
+  }
+  return false;
+}
+
+/// AdvanceRS - Advance register scavenger to just before the earliest memory
+/// op that is being merged.
+void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
+  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
+  unsigned Position = MemOps[0].Position;
+  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
+    if (MemOps[i].Position < Position) {
+      Position = MemOps[i].Position;
+      Loc = MemOps[i].MBBI;
+    }
+  }
+
+  if (Loc != MBB.begin())
+    RS->forward(prior(Loc));
+}
+
+static int getMemoryOpOffset(const MachineInstr *MI) {
+  int Opcode = MI->getOpcode();
+  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  unsigned OffField = MI->getOperand(NumOperands-3).getImm();
+
+  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
+      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 ||
+      Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
+    return OffField;
+
+  int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
+    : ARM_AM::getAM5Offset(OffField) * 4;
+  if (isAM3) {
+    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
+      Offset = -Offset;
+  } else {
+    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
+      Offset = -Offset;
+  }
+  return Offset;
+}
+
+static void InsertLDR_STR(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MBBI,
+                          int Offset, bool isDef,
+                          DebugLoc dl, unsigned NewOpc,
+                          unsigned Reg, bool RegDeadKill, bool RegUndef,
+                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
+                          bool OffKill, bool OffUndef,
+                          ARMCC::CondCodes Pred, unsigned PredReg,
+                          const TargetInstrInfo *TII, bool isT2) {
+  if (isDef) {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
+      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  } else {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
+      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  }
+}
+
+bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator &MBBI) {
+  MachineInstr *MI = &*MBBI;
+  unsigned Opcode = MI->getOpcode();
+  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
+    const MachineOperand &BaseOp = MI->getOperand(2);
+    unsigned BaseReg = BaseOp.getReg();
+    unsigned EvenReg = MI->getOperand(0).getReg();
+    unsigned OddReg  = MI->getOperand(1).getReg();
+    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
+    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
+    // ARM errata 602117: LDRD with base in list may result in incorrect base
+    // register when interrupted or faulted.
+    bool Errata602117 = EvenReg == BaseReg && STI->isCortexM3();
+    if (!Errata602117 &&
+        ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum))
+      return false;
+
+    MachineBasicBlock::iterator NewBBI = MBBI;
+    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
+    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
+    bool EvenDeadKill = isLd ?
+      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
+    bool EvenUndef = MI->getOperand(0).isUndef();
+    bool OddDeadKill  = isLd ?
+      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
+    bool OddUndef = MI->getOperand(1).isUndef();
+    bool BaseKill = BaseOp.isKill();
+    bool BaseUndef = BaseOp.isUndef();
+    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
+    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
+    int OffImm = getMemoryOpOffset(MI);
+    unsigned PredReg = 0;
+    ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+
+    if (OddRegNum > EvenRegNum && OffImm == 0) {
+      // Ascending register numbers and no offset. It's safe to change it to a
+      // ldm or stm.
+      unsigned NewOpc = (isLd)
+        ? (isT2 ? ARM::t2LDMIA : ARM::LDMIA)
+        : (isT2 ? ARM::t2STMIA : ARM::STMIA);
+      if (isLd) {
+        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+          .addReg(BaseReg, getKillRegState(BaseKill))
+          .addImm(Pred).addReg(PredReg)
+          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
+          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
+        ++NumLDRD2LDM;
+      } else {
+        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
+          .addReg(BaseReg, getKillRegState(BaseKill))
+          .addImm(Pred).addReg(PredReg)
+          .addReg(EvenReg,
+                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
+          .addReg(OddReg,
+                  getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
+        ++NumSTRD2STM;
+      }
+      NewBBI = llvm::prior(MBBI);
+    } else {
+      // Split into two instructions.
+      unsigned NewOpc = (isLd)
+        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
+        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
+      // Be extra careful for thumb2. t2LDRi8 can't reference a zero offset,
+      // so adjust and use t2LDRi12 here for that.
+      unsigned NewOpc2 = (isLd)
+        ? (isT2 ? (OffImm+4 < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDRi12)
+        : (isT2 ? (OffImm+4 < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STRi12);
+      DebugLoc dl = MBBI->getDebugLoc();
+      // If this is a load and base register is killed, it may have been
+      // re-defed by the load, make sure the first load does not clobber it.
+      if (isLd &&
+          (BaseKill || OffKill) &&
+          (TRI->regsOverlap(EvenReg, BaseReg))) {
+        assert(!TRI->regsOverlap(OddReg, BaseReg));
+        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
+                      OddReg, OddDeadKill, false,
+                      BaseReg, false, BaseUndef, false, OffUndef,
+                      Pred, PredReg, TII, isT2);
+        NewBBI = llvm::prior(MBBI);
+        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+                      EvenReg, EvenDeadKill, false,
+                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
+                      Pred, PredReg, TII, isT2);
+      } else {
+        if (OddReg == EvenReg && EvenDeadKill) {
+          // If the two source operands are the same, the kill marker is
+          // probably on the first one. e.g.
+          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
+          EvenDeadKill = false;
+          OddDeadKill = true;
+        }
+        // Never kill the base register in the first instruction.
+        if (EvenReg == BaseReg)
+          EvenDeadKill = false;
+        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
+                      EvenReg, EvenDeadKill, EvenUndef,
+                      BaseReg, false, BaseUndef, false, OffUndef,
+                      Pred, PredReg, TII, isT2);
+        NewBBI = llvm::prior(MBBI);
+        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc2,
+                      OddReg, OddDeadKill, OddUndef,
+                      BaseReg, BaseKill, BaseUndef, OffKill, OffUndef,
+                      Pred, PredReg, TII, isT2);
+      }
+      if (isLd)
+        ++NumLDRD2LDR;
+      else
+        ++NumSTRD2STR;
+    }
+
+    MBB.erase(MI);
+    MBBI = NewBBI;
+    return true;
+  }
+  return false;
+}
+
+/// AddMemOp - helper for ARMLoadStoreOpt::LoadStoreMultipleOpti.
+/// It adds store mem ops with simple push_back/insert method,
+/// without any additional logic.
+/// For load operation it does the next:
+/// 1. Adds new load operation into MemOp collection at "At" position.
+/// 2. Removes any "load" operations from MemOps, that changes "Reg" register
+/// contents, prior to "At".
+/// UnitRegsInfo - Map of type Map< Register, UnitRegisters-vector >
+/// UsedUnitRegs - set of unit-registers currently in use.
+/// At - position at which it would added, and prior which the clean-up
+/// should be made (for load operation).
+/// FIXME: The clean-up also should be made for store operations,
+/// but the memory address should be analyzed instead of unit registers.
+unsigned ARMLoadStoreOpt::AddMemOp(MemOpQueue& MemOps,
+                                   const MemOpQueueEntry NewEntry,
+                                   UnitRegsMap& UnitRegsInfo,
+                                   SmallSet<unsigned, 4>& UsedUnitRegs,
+                                   unsigned At) {
+  unsigned Cleaned = 0;
+
+  if (At == -1U) {
+    At = MemOps.size();
+    MemOps.push_back(NewEntry);
+  } else
+    MemOps.insert(&MemOps[At], NewEntry);
+
+  // FIXME:
+  // If operation is not load, leave it as is by now,
+  // So 0 overridden ops would cleaned in this case.
+  if (!NewEntry.MBBI->mayLoad())
+    return 0;
+
+  const SmallVector<unsigned, 4>& NewEntryUnitRegs = UnitRegsInfo[NewEntry.Reg];
+
+  bool FoundOverriddenLoads = false;
+
+  for (unsigned i = 0, e = NewEntryUnitRegs.size(); i != e; ++i)
+    if (UsedUnitRegs.count(NewEntryUnitRegs[i])) {
+      FoundOverriddenLoads = true;
+      break;
+    }
+
+  // If we detect that this register is used by load operations that are
+  // predecessors for the new one, remove them from MemOps then.
+  if (FoundOverriddenLoads) {
+    MemOpQueue UpdatedMemOps;
+
+    // Scan through MemOps entries.
+    for (unsigned i = 0; i != At; ++i) {
+      MemOpQueueEntry& MemOpEntry = MemOps[i];
+
+      // FIXME: Skip non-load operations by now.
+      if (!MemOpEntry.MBBI->mayLoad())
+        continue;
+
+      const SmallVector<unsigned, 4>& MemOpUnitRegs =
+          UnitRegsInfo[MemOpEntry.Reg];
+
+      // Lookup entry that loads contents into register used by new entry.
+      bool ReleaseThisEntry = false;
+      for (unsigned m = 0, em = MemOpUnitRegs.size(); m != em; ++m) {
+        if (std::find(NewEntryUnitRegs.begin(), NewEntryUnitRegs.end(),
+                      MemOpUnitRegs[m]) != NewEntryUnitRegs.end()) {
+          ReleaseThisEntry = true;
+          ++Cleaned;
+          break;
+        }
+      }
+
+      if (ReleaseThisEntry) {
+        const SmallVector<unsigned, 4>& RelesedRegs = UnitRegsInfo[MemOpEntry.Reg];
+        for (unsigned r = 0, er = RelesedRegs.size(); r != er; ++r)
+          UsedUnitRegs.erase(RelesedRegs[r]);
+      } else
+        UpdatedMemOps.push_back(MemOpEntry);
+    }
+
+    // Keep anything without changes after At position.
+    for (unsigned i = At, e = MemOps.size(); i != e; ++i)
+      UpdatedMemOps.push_back(MemOps[i]);
+
+    MemOps.swap(UpdatedMemOps);
+  }
+
+  UsedUnitRegs.insert(NewEntryUnitRegs.begin(), NewEntryUnitRegs.end());
+
+  return Cleaned;
+}
+
+/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
+/// ops of the same base and incrementing offset into LDM / STM ops.
+bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
+  unsigned NumMerges = 0;
+  unsigned NumMemOps = 0;
+  MemOpQueue MemOps;
+  UnitRegsMap UnitRegsInfo(TRI);
+  SmallSet<unsigned, 4> UsedRegUnits;
+  unsigned CurrBase = 0;
+  int CurrOpc = -1;
+  unsigned CurrSize = 0;
+  ARMCC::CondCodes CurrPred = ARMCC::AL;
+  unsigned CurrPredReg = 0;
+  unsigned Position = 0;
+  SmallVector<MachineBasicBlock::iterator,4> Merges;
+
+  RS->enterBasicBlock(&MBB);
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    if (FixInvalidRegPairOp(MBB, MBBI))
+      continue;
+
+    bool Advance  = false;
+    bool TryMerge = false;
+    bool Clobber  = false;
+
+    bool isMemOp = isMemoryOp(MBBI);
+    if (isMemOp) {
+      int Opcode = MBBI->getOpcode();
+      unsigned Size = getLSMultipleTransferSize(MBBI);
+      const MachineOperand &MO = MBBI->getOperand(0);
+      unsigned Reg = MO.getReg();
+      bool isKill = MO.isDef() ? false : MO.isKill();
+      unsigned Base = MBBI->getOperand(1).getReg();
+      unsigned PredReg = 0;
+      ARMCC::CondCodes Pred = getInstrPredicate(MBBI, PredReg);
+      int Offset = getMemoryOpOffset(MBBI);
+      // Watch out for:
+      // r4 := ldr [r5]
+      // r5 := ldr [r5, #4]
+      // r6 := ldr [r5, #8]
+      //
+      // The second ldr has effectively broken the chain even though it
+      // looks like the later ldr(s) use the same base register. Try to
+      // merge the ldr's so far, including this one. But don't try to
+      // combine the following ldr(s).
+      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
+      if (CurrBase == 0 && !Clobber) {
+        // Start of a new chain.
+        CurrBase = Base;
+        CurrOpc  = Opcode;
+        CurrSize = Size;
+        CurrPred = Pred;
+        CurrPredReg = PredReg;
+
+        MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI));
+        ++NumMemOps;
+        const SmallVector<unsigned, 4>& EntryUnitRegs = UnitRegsInfo[Reg];
+        UsedRegUnits.insert(EntryUnitRegs.begin(), EntryUnitRegs.end());
+        Advance = true;
+      } else {
+        if (Clobber) {
+          TryMerge = true;
+          Advance = true;
+        }
+
+        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
+          // No need to match PredReg.
+          // Continue adding to the queue.
+          if (Offset > MemOps.back().Offset) {
+            unsigned OverridesCleaned =
+              AddMemOp(MemOps,
+                           MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI),
+                           UnitRegsInfo, UsedRegUnits) != 0;
+            NumMemOps += 1 - OverridesCleaned;
+            Advance = true;
+          } else {
+            for (unsigned I = 0; I != NumMemOps; ++I) {
+              if (Offset < MemOps[I].Offset) {
+                MemOpQueueEntry entry(Offset, Reg, isKill, Position, MBBI);
+                unsigned OverridesCleaned =
+                    AddMemOp(MemOps, entry, UnitRegsInfo,
+                                 UsedRegUnits, I) != 0;
+                NumMemOps += 1 - OverridesCleaned;
+
+                Advance = true;
+                break;
+              } else if (Offset == MemOps[I].Offset) {
+                // Collision! This can't be merged!
+                break;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    if (MBBI->isDebugValue()) {
+      ++MBBI;
+      if (MBBI == E)
+        // Reach the end of the block, try merging the memory instructions.
+        TryMerge = true;
+    } else if (Advance) {
+      ++Position;
+      ++MBBI;
+      if (MBBI == E)
+        // Reach the end of the block, try merging the memory instructions.
+        TryMerge = true;
+    } else
+      TryMerge = true;
+
+    if (TryMerge) {
+      if (NumMemOps > 1) {
+        // Try to find a free register to use as a new base in case it's needed.
+        // First advance to the instruction just before the start of the chain.
+        AdvanceRS(MBB, MemOps);
+        // Find a scratch register.
+        unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass);
+        // Process the load / store instructions.
+        RS->forward(prior(MBBI));
+
+        // Merge ops.
+        Merges.clear();
+        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
+                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);
+
+        // Try folding preceding/trailing base inc/dec into the generated
+        // LDM/STM ops.
+        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
+          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
+            ++NumMerges;
+        NumMerges += Merges.size();
+
+        // Try folding preceding/trailing base inc/dec into those load/store
+        // that were not merged to form LDM/STM ops.
+        for (unsigned i = 0; i != NumMemOps; ++i)
+          if (!MemOps[i].Merged)
+            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
+              ++NumMerges;
+
+        // RS may be pointing to an instruction that's deleted.
+        RS->skipTo(prior(MBBI));
+      } else if (NumMemOps == 1) {
+        // Try folding preceding/trailing base inc/dec into the single
+        // load/store.
+        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
+          ++NumMerges;
+          RS->forward(prior(MBBI));
+        }
+      }
+
+      CurrBase = 0;
+      CurrOpc = -1;
+      CurrSize = 0;
+      CurrPred = ARMCC::AL;
+      CurrPredReg = 0;
+      if (NumMemOps) {
+        MemOps.clear();
+        UsedRegUnits.clear();
+        NumMemOps = 0;
+      }
+
+      // If iterator hasn't been advanced and this is not a memory op, skip it.
+      // It can't start a new chain anyway.
+      if (!Advance && !isMemOp && MBBI != E) {
+        ++Position;
+        ++MBBI;
+      }
+    }
+  }
+  return NumMerges > 0;
+}
+
+/// MergeReturnIntoLDM - If this is a exit BB, try merging the return ops
+/// ("bx lr" and "mov pc, lr") into the preceding stack restore so it
+/// directly restore the value of LR into pc.
+///   ldmfd sp!, {..., lr}
+///   bx lr
+/// or
+///   ldmfd sp!, {..., lr}
+///   mov pc, lr
+/// =>
+///   ldmfd sp!, {..., pc}
+bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+  if (MBB.empty()) return false;
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  if (MBBI != MBB.begin() &&
+      (MBBI->getOpcode() == ARM::BX_RET ||
+       MBBI->getOpcode() == ARM::tBX_RET ||
+       MBBI->getOpcode() == ARM::MOVPCLR)) {
+    MachineInstr *PrevMI = prior(MBBI);
+    unsigned Opcode = PrevMI->getOpcode();
+    if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD ||
+        Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD ||
+        Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD) {
+      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
+      if (MO.getReg() != ARM::LR)
+        return false;
+      unsigned NewOpc = (isThumb2 ? ARM::t2LDMIA_RET : ARM::LDMIA_RET);
+      assert(((isThumb2 && Opcode == ARM::t2LDMIA_UPD) ||
+              Opcode == ARM::LDMIA_UPD) && "Unsupported multiple load-return!");
+      PrevMI->setDesc(TII->get(NewOpc));
+      MO.setReg(ARM::PC);
+      PrevMI->copyImplicitOps(*MBB.getParent(), &*MBBI);
+      MBB.erase(MBBI);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  STI = &TM.getSubtarget<ARMSubtarget>();
+  RS = new RegScavenger();
+  isThumb2 = AFI->isThumb2Function();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= LoadStoreMultipleOpti(MBB);
+    if (TM.getSubtarget<ARMSubtarget>().hasV5TOps())
+      Modified |= MergeReturnIntoLDM(MBB);
+  }
+
+  delete RS;
+  return Modified;
+}
+
+
+/// ARMPreAllocLoadStoreOpt - Pre- register allocation pass that move
+/// load / stores from consecutive locations close to make it more
+/// likely they will be combined later.
+
+namespace {
+  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
+    static char ID;
+    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
+
+    const DataLayout *TD;
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    const ARMSubtarget *STI;
+    MachineRegisterInfo *MRI;
+    MachineFunction *MF;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM pre- register allocation load / store optimization pass";
+    }
+
+  private:
+    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
+                          unsigned &NewOpc, unsigned &EvenReg,
+                          unsigned &OddReg, unsigned &BaseReg,
+                          int &Offset,
+                          unsigned &PredReg, ARMCC::CondCodes &Pred,
+                          bool &isT2);
+    bool RescheduleOps(MachineBasicBlock *MBB,
+                       SmallVector<MachineInstr*, 4> &Ops,
+                       unsigned Base, bool isLd,
+                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
+    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+  };
+  char ARMPreAllocLoadStoreOpt::ID = 0;
+}
+
+bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  TD  = Fn.getTarget().getDataLayout();
+  TII = Fn.getTarget().getInstrInfo();
+  TRI = Fn.getTarget().getRegisterInfo();
+  STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
+  MRI = &Fn.getRegInfo();
+  MF  = &Fn;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI)
+    Modified |= RescheduleLoadStoreInstrs(MFI);
+
+  return Modified;
+}
+
+static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
+                                      MachineBasicBlock::iterator I,
+                                      MachineBasicBlock::iterator E,
+                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
+                                      SmallSet<unsigned, 4> &MemRegs,
+                                      const TargetRegisterInfo *TRI) {
+  // Are there stores / loads / calls between them?
+  // FIXME: This is overly conservative. We should make use of alias information
+  // some day.
+  SmallSet<unsigned, 4> AddedRegPressure;
+  while (++I != E) {
+    if (I->isDebugValue() || MemOps.count(&*I))
+      continue;
+    if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
+      return false;
+    if (isLd && I->mayStore())
+      return false;
+    if (!isLd) {
+      if (I->mayLoad())
+        return false;
+      // It's not safe to move the first 'str' down.
+      // str r1, [r0]
+      // strh r5, [r0]
+      // str r4, [r0, #+4]
+      if (I->mayStore())
+        return false;
+    }
+    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
+      MachineOperand &MO = I->getOperand(j);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
+        return false;
+      if (Reg != Base && !MemRegs.count(Reg))
+        AddedRegPressure.insert(Reg);
+    }
+  }
+
+  // Estimate register pressure increase due to the transformation.
+  if (MemRegs.size() <= 4)
+    // Ok if we are moving small number of instructions.
+    return true;
+  return AddedRegPressure.size() <= MemRegs.size() * 2;
+}
+
+
+/// Copy Op0 and Op1 operands into a new array assigned to MI.
+static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
+                                   MachineInstr *Op1) {
+  assert(MI->memoperands_empty() && "expected a new machineinstr");
+  size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin())
+    + (Op1->memoperands_end() - Op1->memoperands_begin());
+
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs);
+  MachineSDNode::mmo_iterator MemEnd =
+    std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin);
+  MemEnd =
+    std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd);
+  MI->setMemRefs(MemBegin, MemEnd);
+}
+
+bool
+ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
+                                          DebugLoc &dl,
+                                          unsigned &NewOpc, unsigned &EvenReg,
+                                          unsigned &OddReg, unsigned &BaseReg,
+                                          int &Offset, unsigned &PredReg,
+                                          ARMCC::CondCodes &Pred,
+                                          bool &isT2) {
+  // Make sure we're allowed to generate LDRD/STRD.
+  if (!STI->hasV5TEOps())
+    return false;
+
+  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
+  unsigned Scale = 1;
+  unsigned Opcode = Op0->getOpcode();
+  if (Opcode == ARM::LDRi12)
+    NewOpc = ARM::LDRD;
+  else if (Opcode == ARM::STRi12)
+    NewOpc = ARM::STRD;
+  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
+    NewOpc = ARM::t2LDRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
+    NewOpc = ARM::t2STRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else
+    return false;
+
+  // Make sure the base address satisfies i64 ld / st alignment requirement.
+  if (!Op0->hasOneMemOperand() ||
+      !(*Op0->memoperands_begin())->getValue() ||
+      (*Op0->memoperands_begin())->isVolatile())
+    return false;
+
+  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+  const Function *Func = MF->getFunction();
+  unsigned ReqAlign = STI->hasV6Ops()
+    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
+    : 8;  // Pre-v6 need 8-byte align
+  if (Align < ReqAlign)
+    return false;
+
+  // Then make sure the immediate offset fits.
+  int OffImm = getMemoryOpOffset(Op0);
+  if (isT2) {
+    int Limit = (1 << 8) * Scale;
+    if (OffImm >= Limit || (OffImm <= -Limit) || (OffImm & (Scale-1)))
+      return false;
+    Offset = OffImm;
+  } else {
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (OffImm < 0) {
+      AddSub = ARM_AM::sub;
+      OffImm = - OffImm;
+    }
+    int Limit = (1 << 8) * Scale;
+    if (OffImm >= Limit || (OffImm & (Scale-1)))
+      return false;
+    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
+  }
+  EvenReg = Op0->getOperand(0).getReg();
+  OddReg  = Op1->getOperand(0).getReg();
+  if (EvenReg == OddReg)
+    return false;
+  BaseReg = Op0->getOperand(1).getReg();
+  Pred = getInstrPredicate(Op0, PredReg);
+  dl = Op0->getDebugLoc();
+  return true;
+}
+
+namespace {
+  struct OffsetCompare {
+    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
+      int LOffset = getMemoryOpOffset(LHS);
+      int ROffset = getMemoryOpOffset(RHS);
+      assert(LHS == RHS || LOffset != ROffset);
+      return LOffset > ROffset;
+    }
+  };
+}
+
+bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
+                                 SmallVector<MachineInstr*, 4> &Ops,
+                                 unsigned Base, bool isLd,
+                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
+  bool RetVal = false;
+
+  // Sort by offset (in reverse order).
+  std::sort(Ops.begin(), Ops.end(), OffsetCompare());
+
+  // The loads / stores of the same base are in order. Scan them from first to
+  // last and check for the following:
+  // 1. Any def of base.
+  // 2. Any gaps.
+  while (Ops.size() > 1) {
+    unsigned FirstLoc = ~0U;
+    unsigned LastLoc = 0;
+    MachineInstr *FirstOp = 0;
+    MachineInstr *LastOp = 0;
+    int LastOffset = 0;
+    unsigned LastOpcode = 0;
+    unsigned LastBytes = 0;
+    unsigned NumMove = 0;
+    for (int i = Ops.size() - 1; i >= 0; --i) {
+      MachineInstr *Op = Ops[i];
+      unsigned Loc = MI2LocMap[Op];
+      if (Loc <= FirstLoc) {
+        FirstLoc = Loc;
+        FirstOp = Op;
+      }
+      if (Loc >= LastLoc) {
+        LastLoc = Loc;
+        LastOp = Op;
+      }
+
+      unsigned LSMOpcode
+        = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
+      if (LastOpcode && LSMOpcode != LastOpcode)
+        break;
+
+      int Offset = getMemoryOpOffset(Op);
+      unsigned Bytes = getLSMultipleTransferSize(Op);
+      if (LastBytes) {
+        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
+          break;
+      }
+      LastOffset = Offset;
+      LastBytes = Bytes;
+      LastOpcode = LSMOpcode;
+      if (++NumMove == 8) // FIXME: Tune this limit.
+        break;
+    }
+
+    if (NumMove <= 1)
+      Ops.pop_back();
+    else {
+      SmallPtrSet<MachineInstr*, 4> MemOps;
+      SmallSet<unsigned, 4> MemRegs;
+      for (int i = NumMove-1; i >= 0; --i) {
+        MemOps.insert(Ops[i]);
+        MemRegs.insert(Ops[i]->getOperand(0).getReg());
+      }
+
+      // Be conservative, if the instructions are too far apart, don't
+      // move them. We want to limit the increase of register pressure.
+      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
+      if (DoMove)
+        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
+                                           MemOps, MemRegs, TRI);
+      if (!DoMove) {
+        for (unsigned i = 0; i != NumMove; ++i)
+          Ops.pop_back();
+      } else {
+        // This is the new location for the loads / stores.
+        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
+        while (InsertPos != MBB->end()
+               && (MemOps.count(InsertPos) || InsertPos->isDebugValue()))
+          ++InsertPos;
+
+        // If we are moving a pair of loads / stores, see if it makes sense
+        // to try to allocate a pair of registers that can form register pairs.
+        MachineInstr *Op0 = Ops.back();
+        MachineInstr *Op1 = Ops[Ops.size()-2];
+        unsigned EvenReg = 0, OddReg = 0;
+        unsigned BaseReg = 0, PredReg = 0;
+        ARMCC::CondCodes Pred = ARMCC::AL;
+        bool isT2 = false;
+        unsigned NewOpc = 0;
+        int Offset = 0;
+        DebugLoc dl;
+        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
+                                             EvenReg, OddReg, BaseReg,
+                                             Offset, PredReg, Pred, isT2)) {
+          Ops.pop_back();
+          Ops.pop_back();
+
+          const MCInstrDesc &MCID = TII->get(NewOpc);
+          const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+          MRI->constrainRegClass(EvenReg, TRC);
+          MRI->constrainRegClass(OddReg, TRC);
+
+          // Form the pair instruction.
+          if (isLd) {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(EvenReg, RegState::Define)
+              .addReg(OddReg, RegState::Define)
+              .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always by reg0 since we're transforming LDRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            concatenateMemOperands(MIB, Op0, Op1);
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumLDRDFormed;
+          } else {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos, dl, MCID)
+              .addReg(EvenReg)
+              .addReg(OddReg)
+              .addReg(BaseReg);
+            // FIXME: We're converting from LDRi12 to an insn that still
+            // uses addrmode2, so we need an explicit offset reg. It should
+            // always by reg0 since we're transforming STRi12s.
+            if (!isT2)
+              MIB.addReg(0);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            concatenateMemOperands(MIB, Op0, Op1);
+            DEBUG(dbgs() << "Formed " << *MIB << "\n");
+            ++NumSTRDFormed;
+          }
+          MBB->erase(Op0);
+          MBB->erase(Op1);
+
+          // Add register allocation hints to form register pairs.
+          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
+          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
+        } else {
+          for (unsigned i = 0; i != NumMove; ++i) {
+            MachineInstr *Op = Ops.back();
+            Ops.pop_back();
+            MBB->splice(InsertPos, MBB, Op);
+          }
+        }
+
+        NumLdStMoved += NumMove;
+        RetVal = true;
+      }
+    }
+  }
+
+  return RetVal;
+}
+
+bool
+ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
+  bool RetVal = false;
+
+  DenseMap<MachineInstr*, unsigned> MI2LocMap;
+  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
+  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
+  SmallVector<unsigned, 4> LdBases;
+  SmallVector<unsigned, 4> StBases;
+
+  unsigned Loc = 0;
+  MachineBasicBlock::iterator MBBI = MBB->begin();
+  MachineBasicBlock::iterator E = MBB->end();
+  while (MBBI != E) {
+    for (; MBBI != E; ++MBBI) {
+      MachineInstr *MI = MBBI;
+      if (MI->isCall() || MI->isTerminator()) {
+        // Stop at barriers.
+        ++MBBI;
+        break;
+      }
+
+      if (!MI->isDebugValue())
+        MI2LocMap[MI] = ++Loc;
+
+      if (!isMemoryOp(MI))
+        continue;
+      unsigned PredReg = 0;
+      if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
+        continue;
+
+      int Opc = MI->getOpcode();
+      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+      unsigned Base = MI->getOperand(1).getReg();
+      int Offset = getMemoryOpOffset(MI);
+
+      bool StopHere = false;
+      if (isLd) {
+        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+          Base2LdsMap.find(Base);
+        if (BI != Base2LdsMap.end()) {
+          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+            if (Offset == getMemoryOpOffset(BI->second[i])) {
+              StopHere = true;
+              break;
+            }
+          }
+          if (!StopHere)
+            BI->second.push_back(MI);
+        } else {
+          SmallVector<MachineInstr*, 4> MIs;
+          MIs.push_back(MI);
+          Base2LdsMap[Base] = MIs;
+          LdBases.push_back(Base);
+        }
+      } else {
+        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+          Base2StsMap.find(Base);
+        if (BI != Base2StsMap.end()) {
+          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+            if (Offset == getMemoryOpOffset(BI->second[i])) {
+              StopHere = true;
+              break;
+            }
+          }
+          if (!StopHere)
+            BI->second.push_back(MI);
+        } else {
+          SmallVector<MachineInstr*, 4> MIs;
+          MIs.push_back(MI);
+          Base2StsMap[Base] = MIs;
+          StBases.push_back(Base);
+        }
+      }
+
+      if (StopHere) {
+        // Found a duplicate (a base+offset combination that's seen earlier).
+        // Backtrack.
+        --Loc;
+        break;
+      }
+    }
+
+    // Re-schedule loads.
+    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
+      unsigned Base = LdBases[i];
+      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
+      if (Lds.size() > 1)
+        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
+    }
+
+    // Re-schedule stores.
+    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
+      unsigned Base = StBases[i];
+      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
+      if (Sts.size() > 1)
+        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
+    }
+
+    if (MBBI != E) {
+      Base2LdsMap.clear();
+      Base2StsMap.clear();
+      LdBases.clear();
+      StBases.clear();
+    }
+  }
+
+  return RetVal;
+}
+
+
+/// createARMLoadStoreOptimizationPass - returns an instance of the load / store
+/// optimization pass.
+FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
+  if (PreAlloc)
+    return new ARMPreAllocLoadStoreOpt();
+  return new ARMLoadStoreOpt();
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
new file mode 100644
index 000000000000..b6414832003d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -0,0 +1,125 @@
+//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower ARM MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAsmPrinter.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/Mangler.h"
+using namespace llvm;
+
+
+MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
+                                      const MCSymbol *Symbol) {
+  const MCExpr *Expr;
+  switch (MO.getTargetFlags()) {
+  default: {
+    Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+                                   OutContext);
+    switch (MO.getTargetFlags()) {
+    default: llvm_unreachable("Unknown target flag on symbol operand");
+    case 0:
+      break;
+    case ARMII::MO_LO16:
+      Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+                                     OutContext);
+      Expr = ARMMCExpr::CreateLower16(Expr, OutContext);
+      break;
+    case ARMII::MO_HI16:
+      Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+                                     OutContext);
+      Expr = ARMMCExpr::CreateUpper16(Expr, OutContext);
+      break;
+    }
+    break;
+  }
+
+  case ARMII::MO_PLT:
+    Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_ARM_PLT,
+                                   OutContext);
+    break;
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(),
+                                                          OutContext),
+                                   OutContext);
+  return MCOperand::CreateExpr(Expr);
+
+}
+
+bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                 MCOperand &MCOp) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all non-CPSR implicit register operands.
+    if (MO.isImplicit() && MO.getReg() != ARM::CPSR)
+      return false;
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    MCOp = MCOperand::CreateReg(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::CreateImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+        MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+   MCOp = GetSymbolRef(MO,
+                        GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    MCOp = GetSymbolRef(MO, GetJTISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    MCOp = GetSymbolRef(MO, GetCPISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_BlockAddress:
+    MCOp = GetSymbolRef(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
+    break;
+  case MachineOperand::MO_FPImmediate: {
+    APFloat Val = MO.getFPImm()->getValueAPF();
+    bool ignored;
+    Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
+    MCOp = MCOperand::CreateFPImm(Val.convertToDouble());
+    break;
+  }
+  case MachineOperand::MO_RegisterMask:
+    // Ignore call clobbers.
+    return false;
+  }
+  return true;
+}
+
+void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        ARMAsmPrinter &AP) {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    if (AP.lowerOperand(MO, MCOp))
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..af445e2f35aa
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- ARMMachineFuctionInfo.cpp - ARM machine function info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void ARMFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
new file mode 100644
index 000000000000..88d96c0be8a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -0,0 +1,275 @@
+//===-- ARMMachineFuctionInfo.h - ARM machine function info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMACHINEFUNCTIONINFO_H
+#define ARMMACHINEFUNCTIONINFO_H
+
+#include "ARMSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+
+/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM-specific information for each MachineFunction.
+class ARMFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// isThumb - True if this function is compiled under Thumb mode.
+  /// Used to initialized Align, so must precede it.
+  bool isThumb;
+
+  /// hasThumb2 - True if the target architecture supports Thumb2. Do not use
+  /// to determine if function is compiled under Thumb mode, for that use
+  /// 'isThumb'.
+  bool hasThumb2;
+
+  /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+  ///
+  unsigned VarArgsRegSaveSize;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// processFunctionBeforeCalleeSavedScan().
+  bool HasStackFrame;
+
+  /// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by
+  /// emitPrologue.
+  bool RestoreSPFromFP;
+
+  /// LRSpilledForFarJump - True if the LR register has been for spilled to
+  /// enable far jump.
+  bool LRSpilledForFarJump;
+
+  /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
+  /// spill stack offset.
+  unsigned FramePtrSpillOffset;
+
+  /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
+  /// register spills areas. For Mac OS X:
+  ///
+  /// GPR callee-saved (1) : r4, r5, r6, r7, lr
+  /// --------------------------------------------
+  /// GPR callee-saved (2) : r8, r10, r11
+  /// --------------------------------------------
+  /// DPR callee-saved : d8 - d15
+  ///
+  /// Also see AlignedDPRCSRegs below. Not all D-regs need to go in area 3.
+  /// Some may be spilled after the stack has been realigned.
+  unsigned GPRCS1Offset;
+  unsigned GPRCS2Offset;
+  unsigned DPRCSOffset;
+
+  /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
+  /// areas.
+  unsigned GPRCS1Size;
+  unsigned GPRCS2Size;
+  unsigned DPRCSSize;
+
+  /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices
+  /// which belong to these spill areas.
+  BitVector GPRCS1Frames;
+  BitVector GPRCS2Frames;
+  BitVector DPRCSFrames;
+
+  /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
+  /// the aligned portion of the stack frame.  This is always a contiguous
+  /// sequence of D-registers starting from d8.
+  ///
+  /// We do not keep track of the frame indices used for these registers - they
+  /// behave like any other frame index in the aligned stack frame.  These
+  /// registers also aren't included in DPRCSSize above.
+  unsigned NumAlignedDPRCS2Regs;
+
+  /// JumpTableUId - Unique id for jumptables.
+  ///
+  unsigned JumpTableUId;
+
+  unsigned PICLabelUId;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// HasITBlocks - True if IT blocks have been inserted.
+  bool HasITBlocks;
+
+  /// CPEClones - Track constant pool entries clones created by Constant Island
+  /// pass.
+  DenseMap<unsigned, unsigned> CPEClones;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+public:
+  ARMFunctionInfo() :
+    isThumb(false),
+    hasThumb2(false),
+    VarArgsRegSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false),
+    LRSpilledForFarJump(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
+    NumAlignedDPRCS2Regs(0),
+    JumpTableUId(0), PICLabelUId(0),
+    VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
+
+  explicit ARMFunctionInfo(MachineFunction &MF) :
+    isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
+    hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
+    VarArgsRegSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false),
+    LRSpilledForFarJump(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
+    JumpTableUId(0), PICLabelUId(0),
+    VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
+
+  bool isThumbFunction() const { return isThumb; }
+  bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
+  bool isThumb2Function() const { return isThumb && hasThumb2; }
+
+  unsigned getVarArgsRegSaveSize() const { return VarArgsRegSaveSize; }
+  void setVarArgsRegSaveSize(unsigned s) { VarArgsRegSaveSize = s; }
+
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+  bool shouldRestoreSPFromFP() const { return RestoreSPFromFP; }
+  void setShouldRestoreSPFromFP(bool s) { RestoreSPFromFP = s; }
+
+  bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
+  void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
+
+  unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
+  void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
+
+  unsigned getNumAlignedDPRCS2Regs() const { return NumAlignedDPRCS2Regs; }
+  void setNumAlignedDPRCS2Regs(unsigned n) { NumAlignedDPRCS2Regs = n; }
+
+  unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
+  unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
+  unsigned getDPRCalleeSavedAreaOffset()  const { return DPRCSOffset; }
+
+  void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
+  void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
+  void setDPRCalleeSavedAreaOffset(unsigned o)  { DPRCSOffset = o; }
+
+  unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
+  unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+  unsigned getDPRCalleeSavedAreaSize()  const { return DPRCSSize; }
+
+  void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
+  void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+  void setDPRCalleeSavedAreaSize(unsigned s)  { DPRCSSize = s; }
+
+  bool isGPRCalleeSavedArea1Frame(int fi) const {
+    if (fi < 0 || fi >= (int)GPRCS1Frames.size())
+      return false;
+    return GPRCS1Frames[fi];
+  }
+  bool isGPRCalleeSavedArea2Frame(int fi) const {
+    if (fi < 0 || fi >= (int)GPRCS2Frames.size())
+      return false;
+    return GPRCS2Frames[fi];
+  }
+  bool isDPRCalleeSavedAreaFrame(int fi) const {
+    if (fi < 0 || fi >= (int)DPRCSFrames.size())
+      return false;
+    return DPRCSFrames[fi];
+  }
+
+  void addGPRCalleeSavedArea1Frame(int fi) {
+    if (fi >= 0) {
+      int Size = GPRCS1Frames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        GPRCS1Frames.resize(Size);
+      }
+      GPRCS1Frames[fi] = true;
+    }
+  }
+  void addGPRCalleeSavedArea2Frame(int fi) {
+    if (fi >= 0) {
+      int Size = GPRCS2Frames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        GPRCS2Frames.resize(Size);
+      }
+      GPRCS2Frames[fi] = true;
+    }
+  }
+  void addDPRCalleeSavedAreaFrame(int fi) {
+    if (fi >= 0) {
+      int Size = DPRCSFrames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        DPRCSFrames.resize(Size);
+      }
+      DPRCSFrames[fi] = true;
+    }
+  }
+
+  unsigned createJumpTableUId() {
+    return JumpTableUId++;
+  }
+
+  unsigned getNumJumpTables() const {
+    return JumpTableUId;
+  }
+
+  void initPICLabelUId(unsigned UId) {
+    PICLabelUId = UId;
+  }
+
+  unsigned getNumPICLabels() const {
+    return PICLabelUId;
+  }
+
+  unsigned createPICLabelUId() {
+    return PICLabelUId++;
+  }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  bool hasITBlocks() const { return HasITBlocks; }
+  void setHasITBlocks(bool h) { HasITBlocks = h; }
+
+  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+  void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) {
+    if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second)
+      assert(0 && "Duplicate entries!");
+  }
+
+  unsigned getOriginalCPIdx(unsigned CloneIdx) const {
+    DenseMap<unsigned, unsigned>::const_iterator I = CPEClones.find(CloneIdx);
+    if (I != CPEClones.end())
+      return I->second;
+    else
+      return -1U;
+  }
+};
+} // End llvm namespace
+
+#endif // ARMMACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
new file mode 100644
index 000000000000..efa22fbed9f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using neon instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+  135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U, // <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+  835584U, // <0,1,2,3>: Cost 0 copy LHS
+  1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+  835584U, // <0,1,2,u>: Cost 0 copy LHS
+  2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+  835584U, // <0,1,u,3>: Cost 0 copy LHS
+  1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+  835584U, // <0,1,u,u>: Cost 0 copy LHS
+  2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U, // <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U, // <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U, // <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U, // <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U, // <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U, // <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U, // <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+  2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+  3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+  3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+  3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+  3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+  3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+  4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+  2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+  3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+  3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+  3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+  3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+  3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+  2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+  2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+  2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+  2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+  2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+  3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+  2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+  2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+  2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+  3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+  2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+  2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+  1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+  2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+  2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+  1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+  2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+  2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+  1500796718U, // <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+  2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+  2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+  3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+  2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+  2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+  2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+  3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+  2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+  3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+  2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+  3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+  2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+  3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+  3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+  2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+  2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+  2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+  1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+  3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+  2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+  2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+  2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+  2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+  3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+  1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+  3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+  3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+  3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+  3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+  3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+  3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+  2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+  2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+  3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+  2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+  2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+  3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+  3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+  2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+  3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+  2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+  3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+  3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+  3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+  3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+  2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+  3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+  2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+  2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+  2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+  3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+  2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+  3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+  3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+  3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+  2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+  3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+  2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+  2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+  3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+  3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+  3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+  3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+  3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+  3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+  3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+  2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+  2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+  2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+  1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+  2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+  2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+  2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+  2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  2669066421U, // <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+  2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+  1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+  135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS
+  1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+  2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+  1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+  2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+  3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+  135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS
+  1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+  1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS
+  1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+  1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS
+  2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+  2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS
+  1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+  1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+  1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+  835584U, // <0,u,2,3>: Cost 0 copy LHS
+  1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+  3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+  1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+  1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+  835584U, // <0,u,2,u>: Cost 0 copy LHS
+  2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+  2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+  2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+  2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+  2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+  2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+  2562706742U, // <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+  1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+  2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+  1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+  2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+  2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+  3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+  2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+  2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+  2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+  1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+  1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+  2262496983U, // <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+  2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+  2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+  2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+  2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+  2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+  1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+  1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+  2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+  2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+  2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+  2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+  2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+  2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+  2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+  2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+  2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+  135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS
+  1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+  835584U, // <0,u,u,3>: Cost 0 copy LHS
+  1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+  1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+  1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+  835584U, // <0,u,u,u>: Cost 0 copy LHS
+  2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+  1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+  2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+  2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+  2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+  2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+  3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+  3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+  1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+  2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+  2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+  1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+  2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+  2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+  2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+  3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+  1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+  2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+  2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+  2698297524U, // <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+  2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+  2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+  3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+  2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+  2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+  2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+  3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+  67944550U, // <1,0,3,2>: Cost 1 vrev LHS
+  2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+  2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+  4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+  3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+  2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+  68386972U, // <1,0,3,u>: Cost 1 vrev LHS
+  2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+  2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+  2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+  3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+  3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+  2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+  3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+  2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+  4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+  2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+  3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+  2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+  3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+  3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+  3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+  3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+  2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+  3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+  3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+  3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+  3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+  3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+  2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+  2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+  2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+  4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+  2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+  3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+  3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+  3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+  2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+  3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+  2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+  3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+  67985515U, // <1,0,u,2>: Cost 1 vrev LHS
+  2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+  2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+  2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  2669082807U, // <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+  2592674888U, // <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+  68427937U, // <1,0,u,u>: Cost 1 vrev LHS
+  1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+  1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+  2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+  2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+  2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+  2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+  3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+  3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+  1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+  1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS
+  2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+  2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+  1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+  2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+  2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS
+  2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+  2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+  2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+  2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+  2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+  3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+  2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+  3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+  2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+  2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+  3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+  4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+  2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+  2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+  2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+  3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+  2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+  2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+  2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+  2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+  3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+  3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+  2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+  1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+  3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+  1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+  2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+  2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+  4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+  2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+  2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+  2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+  2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+  2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+  3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+  2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+  2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+  3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+  3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+  4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+  2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+  2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+  2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+  2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+  2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+  4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+  2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+  3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+  3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+  2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+  2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS
+  2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+  2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+  1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+  2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS
+  2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+  1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+  2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+  2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+  2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+  3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+  2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+  2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+  1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+  2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+  2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+  2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+  2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+  2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+  2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+  2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+  3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+  2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+  2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+  2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+  2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+  2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+  3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+  3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+  2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+  2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+  2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+  403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS
+  1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+  403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS
+  1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS
+  2641226607U, // <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+  3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+  3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+  2598759198U, // <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+  2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+  1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+  2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+  1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+  2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+  2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+  3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+  2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+  2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+  2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+  2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+  2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+  2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+  2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+  2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+  2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+  3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+  3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+  2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+  2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+  2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+  1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+  2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+  3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+  2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+  2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+  3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+  3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+  2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+  1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+  403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS
+  1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+  403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS
+  1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS
+  2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+  1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+  2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+  2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+  2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+  3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+  3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+  3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+  1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+  2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+  2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+  2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+  1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+  2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+  2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+  3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+  2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+  1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+  2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+  2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+  2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+  2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+  1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+  2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+  2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+  1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+  2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2598826490U, // <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+  3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+  1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+  2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+  2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+  3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+  2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+  2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+  1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+  3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+  1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+  2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+  2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+  2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+  2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+  2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+  2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+  2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+  1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+  3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+  2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+  3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+  2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+  3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+  2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+  2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+  2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+  3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+  2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+  2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+  2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+  3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+  2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+  2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+  1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+  1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+  2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+  1745666717U, // <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+  1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+  1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+  1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+  1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+  2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+  2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+  2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+  1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+  2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+  3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+  1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+  3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+  2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+  3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+  3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+  3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+  2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+  3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+  2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+  3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+  3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+  3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+  2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+  3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+  2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+  3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+  2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+  2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+  2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+  2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+  3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+  2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+  3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+  3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+  3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+  3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+  2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+  3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+  3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+  3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+  2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+  2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+  2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+  3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+  2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+  2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+  2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+  2598913774U, // <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+  3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+  2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+  2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+  1616006472U, // <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+  2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+  3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+  2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+  2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+  3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+  3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+  2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+  2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+  3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+  3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+  3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+  3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+  3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+  2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+  3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+  2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+  2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+  2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+  2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+  2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+  1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+  1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+  1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+  3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+  1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+  2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+  3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+  1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+  2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+  2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+  2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+  2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+  2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+  2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+  3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+  2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+  3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+  2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+  2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+  2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+  3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+  3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+  2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+  4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+  2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+  2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+  2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+  3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+  2620017052U, // <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+  2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+  3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+  4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+  2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+  2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+  3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+  2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+  1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+  2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+  1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+  2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+  2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+  3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+  4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+  2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+  2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+  2667794530U, // <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+  2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+  2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+  2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+  2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+  2667794938U, // <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+  3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+  3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+  3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+  2667795256U, // <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+  2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+  2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+  2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+  2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+  2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+  2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+  2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+  2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+  3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+  2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+  2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+  2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+  1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+  2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+  1594054682U, // <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+  1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+  2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+  2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+  3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+  2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+  3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+  2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+  2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+  2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+  4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+  2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+  3692438262U, // <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+  2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+  3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+  3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+  2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+  3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+  3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+  2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+  2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+  3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+  3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+  3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+  3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+  3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+  2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+  2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+  2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+  3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+  2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+  3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+  2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+  4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+  2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+  3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+  3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+  3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+  3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+  3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+  3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+  2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+  4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+  2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+  3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+  3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+  3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+  3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+  3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+  3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+  2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+  2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+  2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+  2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+  3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+  3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+  3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+  3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+  3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+  2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+  2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+  2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+  1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+  2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+  2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+  3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+  2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+  2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+  2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+  4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+  1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+  1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+  2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+  3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+  2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+  2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+  3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+  2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+  2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+  3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+  2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+  3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+  2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+  2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+  2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+  2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+  2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+  3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+  2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+  2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+  2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+  2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+  2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+  2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+  2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+  3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+  3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+  2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+  2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+  3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+  2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+  3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+  2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+  1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+  2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+  2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+  2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+  1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+  1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+  2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+  2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+  1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+  2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+  3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+  3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+  3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+  3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+  2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+  3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+  3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+  2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+  2569273446U, // <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+  2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+  3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+  2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+  2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+  4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+  2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+  3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+  2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+  2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+  2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+  3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+  3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+  3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+  3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+  3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+  2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+  2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+  2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+  3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+  3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+  3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+  3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+  2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+  3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+  2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+  2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+  1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+  2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+  2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+  1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+  1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+  2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+  2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+  1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+  1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+  1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+  2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+  2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+  1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+  1658771190U, // <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+  2736789248U, // <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+  2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+  1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+  1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS
+  1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+  1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+  3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS
+  2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+  2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS
+  1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  115726126U, // <1,u,3,2>: Cost 1 vrev LHS
+  2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+  403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS
+  1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+  1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+  403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS
+  2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+  2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+  2263217967U, // <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+  2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+  2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+  1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+  2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+  1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+  2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+  2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+  3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+  2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+  1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+  2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+  2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+  2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+  2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+  2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+  3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+  2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+  2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+  2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+  2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+  2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+  2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+  2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+  2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+  2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+  1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+  403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS
+  202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS
+  115767091U, // <1,u,u,2>: Cost 1 vrev LHS
+  1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+  403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS
+  1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+  403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS
+  2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+  2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+  1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+  2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+  2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+  3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+  2621374968U, // <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+  4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+  1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+  2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+  2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+  1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+  2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+  3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+  2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+  3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+  1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+  2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+  2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+  2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+  1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+  2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+  2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+  2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+  1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+  2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+  4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+  3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+  3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+  4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+  3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+  2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+  2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+  2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+  2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+  3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+  2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+  2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+  2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+  3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+  3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+  2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+  3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+  2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+  3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+  3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+  2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+  4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+  2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+  2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+  3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+  2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+  3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+  3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+  3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+  2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+  2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+  2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+  3734934772U, // <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+  3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+  3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+  3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+  3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+  2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+  2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+  1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+  2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+  2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+  3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+  1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+  2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+  2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+  3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+  2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+  1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+  2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+  2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+  2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+  2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+  2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+  3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+  3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+  3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+  2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+  2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+  3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+  2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+  2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+  2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+  3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+  3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+  2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+  2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+  2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+  2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+  2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+  2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+  2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+  2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+  3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+  3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+  1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+  3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+  2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+  3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+  3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+  1148773319U, // <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+  2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+  2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+  3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+  2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+  2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+  3740913668U, // <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+  3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+  3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+  2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+  2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+  3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+  2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+  3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+  2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+  4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+  3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+  3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+  2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+  2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+  3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+  3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+  2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+  3740915046U, // <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+  3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+  3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+  3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+  2669827714U, // <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+  2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+  2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+  2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+  2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  2669828370U, // <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+  1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+  1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+  1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+  2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+  2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+  3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+  3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+  3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+  1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+  2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+  2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+  2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+  2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+  3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+  2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+  3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+  3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+  2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+  1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+  269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS
+  2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+  1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  2685699720U, // <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+  2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+  2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS
+  2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+  2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+  2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+  1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS
+  2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+  3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+  2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+  1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS
+  2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+  3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+  2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+  3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+  2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+  1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+  3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+  1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+  3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+  2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+  2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+  4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+  2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+  2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+  2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+  2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+  3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+  3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+  2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+  2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+  3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+  3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+  3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+  2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+  2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+  2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+  3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+  2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+  4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+  2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+  3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+  3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+  2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+  2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+  1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS
+  1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS
+  1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS
+  1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS
+  1548984484U, // <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+  1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+  470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+  1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+  2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+  1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+  2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+  2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+  1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+  1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+  1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+  1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+  1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+  2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+  2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+  1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+  1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+  2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+  1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+  470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+  470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS
+  2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+  1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+  2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+  1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+  2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+  1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+  2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+  1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+  1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659890261U, // <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+  2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+  2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+  1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS
+  1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+  1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS
+  1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+  470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS
+  2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+  2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+  2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+  2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+  2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+  1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+  1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+  3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+  2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+  2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+  3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+  3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+  3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+  3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+  2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+  2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+  3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+  2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+  2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+  2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+  2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+  3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+  3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+  2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+  2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+  4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+  3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+  3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+  3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+  2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+  2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+  3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+  3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+  3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+  2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+  2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+  2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+  4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+  2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+  2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+  2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+  2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+  2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+  2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+  3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+  1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+  2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+  2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+  2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+  1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+  2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+  2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+  1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+  2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+  3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+  3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+  3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+  2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+  2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+  3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+  2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+  2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+  2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+  1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+  2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+  2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+  2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+  2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+  2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+  3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+  3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+  3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+  2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+  2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+  2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+  3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+  3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+  2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+  2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+  3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+  4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+  2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+  3691841190U, // <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+  2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+  3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+  3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+  3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+  2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+  2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+  2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+  3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+  2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+  2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+  2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+  2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+  3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+  2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+  2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+  2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+  2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+  3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+  2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+  3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+  3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+  3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+  2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+  2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+  4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+  2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+  2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+  2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+  3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+  2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+  3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+  2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+  3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+  4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+  3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+  3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+  2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+  2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+  2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+  3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+  2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+  2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+  2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+  4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+  2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+  2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+  2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+  2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+  2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+  2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+  1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+  3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+  1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+  3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+  2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+  2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+  1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+  2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+  2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+  2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+  2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+  3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+  2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+  2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+  2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+  2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+  2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+  2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+  2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+  2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+  2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+  2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+  2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+  3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+  2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+  2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+  2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+  3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+  3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS
+  1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS
+  2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+  2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+  2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+  3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+  2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+  1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2667875700U, // <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+  4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+  1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+  3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+  2667876048U, // <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+  2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+  3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+  2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+  2667876356U, // <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+  2667876450U, // <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+  2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+  3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+  2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+  3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+  2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+  3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+  2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+  2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+  2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+  2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+  2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+  2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+  2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+  2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+  3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+  2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+  2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+  2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+  2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+  1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+  2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  1594136612U, // <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+  1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+  1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS
+  1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS
+  2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+  1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+  2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+  2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+  2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+  2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+  2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+  2599760953U, // <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+  1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+  2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+  3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+  3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+  3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+  3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+  2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+  3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+  2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+  3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+  2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+  3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+  3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+  2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+  2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+  3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+  2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+  1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+  2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+  2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+  2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+  1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+  2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+  1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+  2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+  1514100526U, // <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+  2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+  3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+  3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+  2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+  3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+  2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+  3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+  2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+  2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+  3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+  3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+  3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+  2599800118U, // <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+  3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+  3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+  2599802214U, // <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+  2599802670U, // <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+  2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+  3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+  2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+  3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+  2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+  2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+  2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+  3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+  2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+  2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+  3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+  3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+  3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+  2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+  3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+  2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+  2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+  2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+  1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+  1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+  2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+  2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+  1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+  2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+  2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+  1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+  1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1658631909U, // <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+  1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  1658853120U, // <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+  3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+  1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+  1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+  2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS
+  1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+  2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS
+  1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+  1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+  1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS
+  1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+  1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+  1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS
+  1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS
+  1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+  1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+  2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+  1190213513U, // <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+  1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+  470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS
+  1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS
+  2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+  1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+  2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+  1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+  2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+  1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+  1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+  2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+  1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+  2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+  2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+  1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+  470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS
+  269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS
+  1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS
+  1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+  470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS
+  1611962765U, // <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS
+  470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS
+  1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+  2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+  3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+  3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+  3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+  1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+  1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+  2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+  537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS
+  2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+  1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+  2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+  2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+  2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+  537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+  2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+  2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+  2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+  1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+  2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+  2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+  2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+  3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+  2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+  3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+  2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+  2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+  1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+  2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+  1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2666573172U, // <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+  3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+  1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+  2666573520U, // <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+  3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+  3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+  2666573766U, // <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+  2666573828U, // <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+  2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+  2666573992U, // <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+  3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+  2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+  2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+  2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+  3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+  3763577355U, // <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+  3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+  2666574648U, // <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+  2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+  2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+  2666574842U, // <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+  2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+  2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+  3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+  2666575206U, // <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+  3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+  3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+  2666575468U, // <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+  2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+  1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+  1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS
+  2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+  2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+  537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS
+  2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+  2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+  2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+  1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+  2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+  3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+  3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+  1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+  1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+  2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+  1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+  2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+  3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+  1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+  2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+  2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+  1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+  2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+  2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+  2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+  1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+  1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+  1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+  1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+  1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+  1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+  2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+  2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+  2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+  2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+  2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+  2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+  3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+  2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+  2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+  2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+  2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+  1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+  3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+  3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+  2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+  2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+  2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+  2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+  2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+  2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+  3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+  2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+  4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+  3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+  2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+  3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+  3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+  3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+  1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+  2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+  1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+  1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+  2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+  1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+  2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+  2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+  2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+  1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+  1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+  2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+  2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+  2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+  2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+  2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+  1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+  2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+  2689836640U, // <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+  1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+  2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+  3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+  1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+  1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+  2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+  2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+  2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+  1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+  2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+  2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+  2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+  1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+  2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+  2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+  2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+  2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+  1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+  2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+  1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+  2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+  2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+  2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+  2665263108U, // <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+  2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2665263272U, // <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+  1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+  2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+  2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+  2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+  1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+  2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2665263950U, // <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+  1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  2665264122U, // <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+  2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+  4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+  2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+  2665264486U, // <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+  2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+  2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+  2665264748U, // <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+  2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+  1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+  1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+  1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+  2665265408U, // <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+  1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+  2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+  1611450518U, // <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+  2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+  2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+  2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+  2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+  3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+  4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+  1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+  2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+  1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+  2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+  2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+  2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+  2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+  3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+  2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+  1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+  2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+  2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+  1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+  2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+  2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+  2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+  2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+  2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+  1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+  1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+  2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+  336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+  2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+  2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+  336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS
+  2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+  2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+  2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+  2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+  2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+  1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+  2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+  4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+  1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+  2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+  2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+  2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+  2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+  2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+  3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+  2665934946U, // <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+  2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+  3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+  2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+  2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+  2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+  3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+  2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+  2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+  2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+  2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+  2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+  2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+  2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+  2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+  2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+  3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+  3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+  2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+  1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+  1592858504U, // <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+  336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS
+  1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+  2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+  2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS
+  2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+  1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+  3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+  2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+  1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+  1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+  3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+  1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+  2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+  1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+  2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+  2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+  2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+  2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+  1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+  3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+  2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+  2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+  2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+  2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+  2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+  2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+  2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+  2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+  2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+  2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+  2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+  2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+  2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+  2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+  3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+  3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+  2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+  2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+  2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+  2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+  2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  1659227344U, // <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+  3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+  1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+  1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+  2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+  2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+  1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+  2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+  537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS
+  2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+  537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS
+  2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+  2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+  2666607098U, // <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+  2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+  1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+  2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+  1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+  2666607610U, // <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+  3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+  2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+  3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+  2666607974U, // <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+  2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+  2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+  2666608236U, // <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+  2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+  1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+  1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+  2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+  1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+  1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS
+  2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+  537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS
+  3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+  2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+  2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+  3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+  2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+  2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+  2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+  2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+  2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+  3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+  2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+  2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+  2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+  2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+  3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+  1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+  1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+  3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+  2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+  3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+  2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+  2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+  2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+  3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+  2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+  2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+  3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+  2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+  3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+  2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+  2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+  2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+  4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+  3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+  3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+  2600304742U, // <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+  3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+  2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+  3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+  2600308022U, // <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+  2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+  2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+  1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+  2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+  2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+  2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+  2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+  2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+  1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+  1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+  1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+  2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+  2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+  2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+  2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+  2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+  2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+  2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+  1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+  1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+  1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+  2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+  2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+  1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+  1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+  2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+  1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+  1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+  2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+  2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+  1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+  1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+  2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1646326023U, // <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+  1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+  2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+  2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+  2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+  3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+  2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+  2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+  2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+  2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+  3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+  3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+  2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+  3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+  2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+  3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+  2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+  2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+  2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+  2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+  2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+  2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+  3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+  2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+  2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+  2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+  1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+  1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+  3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+  3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+  3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+  2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+  2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+  3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+  3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+  2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+  3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+  2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+  2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+  3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+  2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+  2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+  2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+  2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+  2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+  2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+  3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+  2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+  2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+  3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+  4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+  2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+  2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+  2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+  2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+  2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+  3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+  2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+  2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+  1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+  1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+  1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+  2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+  2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+  2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+  1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+  2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+  2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+  2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+  1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+  1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+  2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+  2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+  2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+  1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+  2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+  1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+  1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+  2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+  1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+  3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+  2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+  2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+  2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+  2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+  1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+  2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+  2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+  2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+  2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+  1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+  2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+  2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+  1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+  2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+  3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+  2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+  2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+  2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+  2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+  1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+  2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+  1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+  2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+  2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+  2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+  2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+  2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+  2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+  2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+  2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+  2626816798U, // <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+  2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+  2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+  2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+  3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+  2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+  1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+  2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+  1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+  2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+  2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+  3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+  2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+  2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+  2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+  2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+  2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+  2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+  2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+  2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+  2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+  2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+  2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+  2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+  2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+  2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+  2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+  2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+  2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+  2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+  2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+  2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+  1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+  2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+  1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+  2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+  2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+  1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  1595545808U, // <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+  1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+  1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+  1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+  1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+  1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+  2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+  1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+  1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+  537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS
+  1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+  1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+  1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+  2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+  1659230043U, // <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+  537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+  1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+  1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+  1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+  1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+  1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+  1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+  1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+  1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+  2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+  336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS
+  1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+  1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+  2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS
+  2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+  1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+  1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+  1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+  1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+  1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+  1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+  1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+  2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+  1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+  1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+  1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS
+  1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+  537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS
+  2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+  2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+  2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+  1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+  1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+  1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+  1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+  1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+  2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+  3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+  1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+  1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+  1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+  1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+  537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS
+  336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS
+  1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+  1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+  537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS
+  1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+  537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS
+  2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+  2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+  2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+  3709862144U, // <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+  2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+  3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+  3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+  3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+  2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+  2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+  2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+  1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+  2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+  2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+  3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+  2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+  1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+  3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+  2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+  2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+  2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+  3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+  2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+  2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+  3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+  2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+  3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+  3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+  3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+  3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+  3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+  2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+  2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+  2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+  3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+  2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+  2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+  3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+  2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+  2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+  1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS
+  2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+  2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+  2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+  3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+  3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+  2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+  1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS
+  2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+  3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+  1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+  2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+  2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+  3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+  2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+  1973862556U, // <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+  3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+  3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+  4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+  3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+  3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+  3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+  3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+  3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+  4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+  2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+  1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS
+  1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+  2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+  2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+  1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+  2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+  3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+  2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+  2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+  3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+  3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+  3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+  2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+  3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+  2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+  2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+  2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+  3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+  3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+  3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+  3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+  2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+  2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+  2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+  3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+  1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+  2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+  3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+  3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+  3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+  1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+  2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+  2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+  2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+  2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+  2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+  2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+  2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+  2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+  3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+  3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+  3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+  3113877606U, // <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+  3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+  2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+  3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+  2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+  1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+  2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+  2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+  1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+  2600686074U, // <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+  2600686586U, // <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+  2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+  4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+  3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+  2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+  2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+  4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+  3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+  2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+  2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+  3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+  2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+  3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+  3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+  3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+  3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+  2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+  2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+  2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+  1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+  1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+  2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2600686586U, // <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+  2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+  2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+  2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+  3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+  2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+  3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+  2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+  3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+  2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+  3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+  3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+  3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+  3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+  2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+  3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+  3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+  2691909162U, // <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+  3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+  2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+  2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+  3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+  3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+  3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+  3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+  2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+  2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+  3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+  3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+  2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+  2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+  2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+  3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+  2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+  2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+  2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+  2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+  2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+  2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+  2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+  2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+  3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+  2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+  2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+  2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+  2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+  2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+  2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+  3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+  2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+  3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+  2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+  1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+  2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+  2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+  2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+  1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+  2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+  2600767994U, // <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+  2600768506U, // <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+  1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+  2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+  3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+  2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+  4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+  3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+  3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+  3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+  2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+  2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+  2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+  2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+  1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+  2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+  2600784890U, // <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+  1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+  3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+  2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+  2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+  3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+  3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+  2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+  3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+  3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+  2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+  2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+  3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+  2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+  2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+  3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+  3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+  3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+  3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+  2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+  3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+  2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+  3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+  2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+  3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+  2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+  3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+  3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+  2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+  3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+  3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+  3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+  2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+  3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+  3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+  3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+  2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+  2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+  2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+  3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+  2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+  2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+  2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+  3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+  2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+  2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+  2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+  2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+  2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+  4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+  3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+  2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+  2559031086U, // <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+  2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+  2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+  2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+  2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+  1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+  4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+  2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+  1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+  3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+  3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+  3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+  3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+  3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+  3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+  3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+  3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+  3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+  2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+  2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+  2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+  2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+  2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+  1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+  2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+  2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+  1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+  2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+  1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+  2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+  2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+  2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+  3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+  1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+  2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+  2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+  2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+  2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+  2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+  2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+  3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+  3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+  2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+  3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+  3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+  2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+  2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+  2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+  3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+  2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+  3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+  2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+  2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+  3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+  3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+  2637482406U, // <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+  3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+  3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+  3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+  2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+  2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+  2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+  2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+  161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS
+  1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+  2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+  2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS
+  2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+  2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+  3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+  2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+  2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+  1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS
+  1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+  1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+  3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+  2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+  2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+  2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+  2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+  1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+  1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+  2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+  3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+  3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+  3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+  2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+  2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+  2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+  2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+  1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+  161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS
+  1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS
+  1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+  161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS
+  2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+  1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+  2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+  2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+  3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+  2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+  2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+  1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631516918U, // <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+  2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+  2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+  2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+  3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+  2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+  3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+  2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+  2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+  2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+  3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+  2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+  1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+  2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+  3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+  2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+  4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+  1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+  2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+  3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+  2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+  2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+  2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+  2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+  2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+  2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+  2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+  2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+  3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+  2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+  2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+  2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+  1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+  2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+  1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+  1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+  2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+  2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+  2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+  1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+  2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+  2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+  2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+  1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+  1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+  2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+  2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+  1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+  1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+  3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+  2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+  27705344U, // <4,5,6,7>: Cost 0 copy RHS
+  27705344U, // <4,5,6,u>: Cost 0 copy RHS
+  2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+  2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+  2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+  2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+  2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+  2718748840U, // <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+  2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+  2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+  2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+  1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+  1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+  1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+  1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+  1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+  27705344U, // <4,5,u,7>: Cost 0 copy RHS
+  27705344U, // <4,5,u,u>: Cost 0 copy RHS
+  2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+  1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+  3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+  2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+  3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+  2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+  4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+  1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+  2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+  2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+  2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+  3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+  2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+  3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+  4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+  2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+  2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+  2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+  2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+  2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+  2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+  2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+  2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+  2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+  2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+  2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+  2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+  3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+  2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+  2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+  2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+  3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+  2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+  2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+  2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+  2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+  2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+  2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+  2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+  1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+  2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+  1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+  2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+  2645462736U, // <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+  2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+  3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+  2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+  2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+  2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+  2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+  2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+  1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+  2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+  2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+  2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+  1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+  2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+  2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+  2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+  1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+  2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+  2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+  3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+  2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+  2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+  2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+  2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+  1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+  1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+  2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+  1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+  1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+  2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+  1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+  2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+  3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+  3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+  2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+  3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+  3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+  2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+  2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+  3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+  2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+  3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+  3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+  3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+  3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+  3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+  2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+  3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+  3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+  3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+  2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+  3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+  2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+  3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+  3792491731U, // <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+  2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+  3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+  3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+  3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+  3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+  3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+  2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+  2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+  3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+  2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+  2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+  3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+  3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+  3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+  2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+  2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+  2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+  3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+  2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+  2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+  3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+  2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+  3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+  2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+  2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+  2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+  2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+  2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+  1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+  3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+  2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+  2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+  1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+  1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+  2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+  3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+  3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+  3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+  3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+  3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+  3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+  2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+  3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+  2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+  2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+  1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+  2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+  2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+  1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+  1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+  2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+  3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+  2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+  1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1545191630U, // <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+  2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+  2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+  2265397305U, // <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+  2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+  2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+  1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+  2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+  2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+  1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+  2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+  2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+  3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+  2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+  1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+  2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+  2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+  1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+  2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+  2733864859U, // <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+  2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+  2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+  1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+  2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+  2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+  2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+  2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+  2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+  2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+  2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+  2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+  1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+  2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+  2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+  2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+  161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS
+  1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+  2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+  161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS
+  1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS
+  2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+  3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+  1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS
+  1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+  1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+  2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+  1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+  1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+  1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+  1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+  1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+  27705344U, // <4,u,6,7>: Cost 0 copy RHS
+  27705344U, // <4,u,6,u>: Cost 0 copy RHS
+  2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+  2565383066U, // <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+  2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+  2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+  2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+  2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+  2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+  1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+  1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+  161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS
+  1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  27705344U, // <4,u,u,7>: Cost 0 copy RHS
+  27705344U, // <4,u,u,u>: Cost 0 copy RHS
+  2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+  2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+  2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+  3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+  2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+  3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+  3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+  3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+  2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+  2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+  2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+  1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+  2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+  2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+  3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+  3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+  1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+  2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+  2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+  2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+  2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+  2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+  2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+  3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+  3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+  2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+  3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+  2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+  3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+  3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+  3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+  2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+  2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+  1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+  3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+  2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+  2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  3761234303U, // <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+  2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+  2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+  3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+  3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+  3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+  2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+  3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+  3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+  2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+  4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+  2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+  3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+  3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+  3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+  3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+  2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+  2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+  2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+  2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+  2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+  3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+  2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+  2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+  2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+  4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+  2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+  2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+  1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+  1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+  2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+  2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+  2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+  1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+  2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+  2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+  2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+  3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+  3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+  1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+  2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+  2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+  2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+  2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+  2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+  2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+  3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+  3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+  2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+  3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+  3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+  2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+  2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+  2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+  2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+  2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+  3808199610U, // <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+  2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+  2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+  2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+  3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+  2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+  2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+  2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+  3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+  2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+  2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+  1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+  2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+  2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+  2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+  2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+  1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+  3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+  1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+  2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+  2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+  3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+  2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+  2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+  2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+  2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+  2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+  2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+  2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+  3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+  2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+  3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+  2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+  2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+  2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+  2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+  2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+  2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+  2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+  2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+  2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+  2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+  4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+  2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+  2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1591662326U, // <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+  1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+  2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+  2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+  1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646825168U, // <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+  2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+  2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+  3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+  2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+  3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+  3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+  3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+  3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+  3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+  2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+  2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+  3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+  3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+  2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+  3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+  3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+  3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+  3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+  2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+  3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+  3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+  2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+  2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+  2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+  3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+  3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+  3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+  2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+  2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+  2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+  2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+  3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+  1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+  2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+  3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+  3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+  1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+  2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+  3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+  2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+  2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+  2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+  2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+  3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+  2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+  2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+  3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+  3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+  2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+  2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+  2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+  3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+  3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+  2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+  3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+  3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+  2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+  2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+  2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+  3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+  3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+  3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+  2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+  2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+  2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+  2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+  2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+  2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+  4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+  3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+  3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+  2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+  2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+  2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+  2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+  1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+  2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+  3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+  1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+  3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+  2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+  3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+  3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+  2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+  3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+  3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+  2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+  2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+  3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+  3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+  2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+  2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+  3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+  2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+  3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+  2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+  2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+  3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+  3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+  3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+  2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+  2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+  2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+  3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+  3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+  2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+  3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+  2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+  3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+  2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+  2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+  2704673200U, // <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+  3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+  3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+  2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+  2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+  3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+  2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+  2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+  2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+  2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+  3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+  2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+  2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+  2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+  2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+  2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+  2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+  2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+  2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+  3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+  2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+  2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+  2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+  3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+  2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+  2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+  2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+  3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+  4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+  2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+  2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+  1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+  2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+  2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+  1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+  2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+  2601513466U, // <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+  3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+  1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+  1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+  2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+  2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+  1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+  2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+  2601521658U, // <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+  2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+  1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+  3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+  2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+  3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+  3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+  2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+  2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+  2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+  3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+  2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+  2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+  3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+  3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+  3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+  3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+  2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+  3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+  3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+  2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+  3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+  3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+  3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+  2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+  3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+  2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+  3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+  3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+  2666752099U, // <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+  3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+  3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+  2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+  3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+  2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+  3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+  2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+  3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+  2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+  2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+  2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+  3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+  3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+  2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+  1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+  2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+  2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+  1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+  2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+  2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+  2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+  3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+  2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+  2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+  1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+  1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+  2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+  2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+  3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+  2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+  2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+  2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+  3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+  2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+  2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+  2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+  2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+  2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+  2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+  2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+  3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+  94817590U, // <5,4,7,6>: Cost 1 vrev RHS
+  2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+  94965064U, // <5,4,7,u>: Cost 1 vrev RHS
+  2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+  2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+  2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+  2667419628U, // <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+  2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+  1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+  94825783U, // <5,4,u,6>: Cost 1 vrev RHS
+  2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+  94973257U, // <5,4,u,u>: Cost 1 vrev RHS
+  2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+  1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+  3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+  2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+  2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+  3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+  4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+  1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+  2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+  2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+  2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+  2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+  2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+  3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+  2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+  2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+  3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+  3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+  2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+  2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+  3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+  2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+  2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+  4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+  2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+  2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+  3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+  3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+  2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+  2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+  2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+  3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+  3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+  2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+  2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+  2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+  3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+  3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+  1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+  2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+  2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+  1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+  1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+  2583725672U, // <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+  2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+  1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS
+  2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+  2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+  229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS
+  2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+  3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+  2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+  3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+  2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+  2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+  2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+  2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+  2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+  2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+  3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+  2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+  2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+  2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+  3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+  4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+  2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+  2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+  1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+  2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+  1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS
+  2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+  2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+  229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS
+  2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+  1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+  2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+  2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+  3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+  3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+  4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+  1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+  2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+  2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+  2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+  2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+  2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+  2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+  3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+  2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+  3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+  3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+  2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+  2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+  2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+  2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+  2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+  2713965050U, // <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+  2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+  2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+  3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+  3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+  2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+  1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+  2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+  3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+  4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+  1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+  2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+  3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+  2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+  2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+  2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+  1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+  2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+  1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+  2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+  2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+  3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+  3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+  2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+  2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+  2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+  2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+  2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+  2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+  3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+  2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+  2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+  2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+  2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+  2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+  2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+  430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS
+  1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS
+  1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+  1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+  1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+  430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS
+  430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS
+  1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS
+  1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+  430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS
+  2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+  1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+  2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+  2625634642U, // <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+  2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+  3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+  2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+  1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+  2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+  2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+  1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+  2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+  2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+  2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+  3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+  1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+  2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+  3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+  2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+  2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+  2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+  2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+  2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+  3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+  2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+  2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+  2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+  3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+  2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+  2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+  2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+  3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+  2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+  2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+  2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+  2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+  3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+  2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+  2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+  1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+  1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+  2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+  2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+  2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+  2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+  2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+  2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+  2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+  1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+  1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+  2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+  3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+  2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+  2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+  2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+  2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+  2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+  2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+  1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+  2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+  2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+  2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+  1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+  1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+  2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+  2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+  1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+  1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+  1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+  1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+  1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+  1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+  1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+  1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+  1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+  2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+  2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+  2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+  2266134675U, // <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+  2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+  1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+  2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+  1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+  2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+  2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+  2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+  2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+  2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+  2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+  2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+  2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+  2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+  2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+  2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+  2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+  2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+  2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+  2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+  1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+  2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+  2734610422U, // <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+  2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+  1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+  1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+  1661163546U, // <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+  2734463012U, // <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+  2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+  1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+  2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646216144U, // <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+  1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+  1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+  2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+  2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+  1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS
+  1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+  229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS
+  2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+  2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+  2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+  2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+  2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+  2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+  430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS
+  1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+  1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+  430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS
+  1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  118708378U, // <5,u,7,6>: Cost 1 vrev RHS
+  2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+  430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS
+  430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS
+  1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+  430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS
+  229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS
+  118716571U, // <5,u,u,6>: Cost 1 vrev RHS
+  1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+  430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS
+  2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+  2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+  2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+  3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+  2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+  3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+  3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+  3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+  2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+  2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+  3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+  1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+  2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+  2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+  2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+  2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+  1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+  2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+  3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+  2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+  1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+  3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+  2686173406U, // <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+  3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+  1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+  3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+  2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+  2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+  3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+  2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+  3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+  3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+  3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+  2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+  2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+  2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+  1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+  3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+  2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+  2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+  2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+  1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+  3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+  2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+  3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+  3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+  2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+  3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+  2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+  2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+  2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+  3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+  3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+  2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+  3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+  2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+  2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+  2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+  2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+  2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+  2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+  2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+  2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+  4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+  3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+  4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+  2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+  2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+  2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+  1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+  1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+  2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+  2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+  2646229094U, // <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+  3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+  2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+  2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+  3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+  2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+  2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+  2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+  3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+  2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+  3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+  2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+  2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+  3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+  3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+  3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+  2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+  2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+  3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+  3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+  2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+  2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+  3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+  2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+  3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+  2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+  2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+  2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+  2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+  3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+  2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+  2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+  2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+  3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+  2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+  2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+  2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+  3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+  2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+  3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+  2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+  3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+  2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+  2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+  2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+  3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+  3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+  2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+  2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+  3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+  2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+  3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+  2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+  2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+  3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+  3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+  3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+  3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+  2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+  3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+  3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+  2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+  3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+  2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+  2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+  3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+  4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+  2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+  2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+  2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+  2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+  2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+  2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+  2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+  2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+  2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+  2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+  1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+  3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+  2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+  3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+  2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+  3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+  1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+  2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+  2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+  2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+  2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+  3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+  2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+  2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+  3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+  2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+  3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+  3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+  2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+  2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+  2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+  3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+  2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+  3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+  2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+  2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+  3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+  2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+  2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+  2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+  2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+  2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+  2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+  2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+  1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+  2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+  2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+  2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+  1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+  3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+  1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+  3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+  2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+  3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+  2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+  2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+  2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+  2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+  2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+  2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+  2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+  2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+  2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+  2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+  2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+  2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+  2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+  2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+  2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+  2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+  2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+  1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS
+  2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+  3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+  2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+  1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS
+  1591744256U, // <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+  1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+  1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS
+  2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+  1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+  2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS
+  3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+  2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+  2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+  3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+  2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+  2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+  3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+  2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+  2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+  3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+  3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+  3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+  3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+  2703280390U, // <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+  3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+  3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+  2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+  2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+  3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+  3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+  3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+  2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+  3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+  2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+  3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+  2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+  2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+  2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+  3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+  3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+  3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+  2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+  2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+  2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+  2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+  2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+  2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+  1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+  2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+  2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+  1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+  3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+  3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+  3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+  3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+  2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+  3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+  3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+  3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+  2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+  2602164326U, // <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+  2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+  2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+  3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+  2602167524U, // <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+  3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+  2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+  3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+  2602170158U, // <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+  1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+  2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+  1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+  2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+  1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+  2602176208U, // <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+  2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+  2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492604718U, // <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+  1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+  2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+  1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+  2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+  1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+  1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+  2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+  2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+  3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+  2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+  3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+  2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+  2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+  2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+  3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+  2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+  2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+  3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+  3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+  2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+  3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+  3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+  2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+  3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+  2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+  2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+  3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+  3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+  3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+  2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+  2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+  2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+  3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+  2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+  3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+  3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+  3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+  3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+  2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+  3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+  2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+  3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+  2668817222U, // <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+  2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+  3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+  2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+  3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+  2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+  2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+  1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+  3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+  2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+  2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+  2566401334U, // <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+  2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+  1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+  2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+  2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+  1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+  2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+  2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+  2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+  1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+  2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+  2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+  3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+  2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+  2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+  2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+  2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+  4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+  2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+  1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+  2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+  1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+  2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+  2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+  3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+  3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+  3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+  4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+  3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+  1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+  1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+  2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+  3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+  3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+  3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+  3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+  3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+  3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+  2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+  2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+  2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+  3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+  3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+  3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+  2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+  3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+  2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+  3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+  2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+  3708725398U, // <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+  3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+  3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+  3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+  2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+  3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+  3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+  2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+  2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+  2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+  3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+  3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+  2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+  2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+  2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+  1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+  1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+  2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+  3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+  3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+  3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+  2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+  2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+  2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+  2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+  2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+  2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+  3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+  3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+  2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+  2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+  3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+  2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+  2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+  2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+  2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+  2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+  3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+  2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+  2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+  2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+  2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+  2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+  2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+  2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+  2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+  2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+  1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+  2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+  1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+  3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+  2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+  3652194000U, // <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+  2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+  4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+  1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+  2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+  2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+  2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+  2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+  2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+  2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+  3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+  2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+  3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+  2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+  2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+  3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+  3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+  2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+  2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+  2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+  2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+  3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+  3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+  2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+  2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+  3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+  2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+  4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+  2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+  2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+  3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+  3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+  3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+  1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+  2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+  1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+  3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+  2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+  3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+  3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+  2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+  2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+  2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+  2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+  2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+  1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+  2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+  2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+  1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+  296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS
+  2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+  296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS
+  2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+  3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+  2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+  2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+  2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+  2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+  2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+  1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS
+  1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS
+  1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+  2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS
+  1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS
+  296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS
+  1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+  1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+  2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+  497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+  1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+  1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+  2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+  2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+  2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+  1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+  1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+  1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+  2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+  2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+  1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS
+  1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+  497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS
+  2644954696U, // <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+  2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+  1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+  2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+  1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+  2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+  2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+  1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+  1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+  1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+  1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+  1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+  497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS
+  1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS
+  1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+  1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+  2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  1193130221U, // <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+  497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS
+  1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+  1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+  1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+  2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+  2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+  1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+  1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644961503U, // <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+  1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+  2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+  2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  1661245476U, // <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+  2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+  1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS
+  1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1661614161U, // <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+  497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS
+  2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+  1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+  2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+  1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+  2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+  1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+  2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+  296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS
+  1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS
+  1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+  2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+  1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+  1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS
+  1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+  2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+  1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+  1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS
+  1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+  1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+  497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS
+  1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+  497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS
+  296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS
+  1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS
+  497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS
+  1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+  1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+  3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+  2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+  2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+  2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+  3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+  1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+  1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+  2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+  564576358U, // <7,0,1,2>: Cost 1 vext3 RHS, LHS
+  2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+  1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+  2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+  1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+  2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+  564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+  2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+  2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+  1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+  2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+  1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+  2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+  2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+  2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+  2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+  2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+  2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+  2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+  3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+  2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+  2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+  1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+  1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+  3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+  2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+  1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+  3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+  1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+  2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+  2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+  2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+  2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+  2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+  1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+  2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+  1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+  2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+  2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+  2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+  3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+  2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+  2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+  2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+  2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+  2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+  2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+  3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+  3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+  2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+  2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+  2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+  2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+  2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+  1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+  1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+  564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS
+  2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+  1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+  1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+  2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+  564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS
+  2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+  2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+  2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+  1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+  2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+  2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+  3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+  1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+  2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+  1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+  1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+  2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+  2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+  2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+  3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+  1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+  2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+  2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+  3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+  1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+  2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+  2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+  2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+  2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+  1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+  2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+  1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+  2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+  2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+  1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+  2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+  2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+  1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+  2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+  2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+  2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+  2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+  3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+  2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+  2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+  3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+  2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+  2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+  2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+  3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+  1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+  2572815670U, // <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+  3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+  2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+  2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+  1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+  3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+  2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+  2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+  3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+  2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+  3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+  2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+  2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+  2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+  2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+  3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+  3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+  3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+  3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+  2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+  3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+  1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+  2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+  1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+  2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+  1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+  2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+  2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+  1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+  2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+  2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+  2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+  2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+  2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+  2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+  2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+  3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+  2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+  2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+  3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+  3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+  2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+  2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+  3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+  2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+  2602718850U, // <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+  2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+  2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+  1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+  2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+  2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+  2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+  3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+  1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+  1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+  2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+  2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+  2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+  1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+  2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+  2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+  2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+  1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+  2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+  2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+  2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+  2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+  2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+  2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+  2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+  3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+  2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+  2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+  3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+  2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+  2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+  2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+  3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+  2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+  3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+  2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+  2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+  2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+  2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+  1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+  2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+  2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+  2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+  3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+  1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+  2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+  3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+  3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+  2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+  2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+  3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+  4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+  2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+  2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+  1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+  2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+  1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+  1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+  2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+  2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+  2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+  1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+  2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+  1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+  2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+  2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+  2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+  2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+  2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+  3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+  1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+  2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+  2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+  2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+  2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+  2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+  2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+  3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+  2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+  2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+  2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+  2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+  2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+  2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+  2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+  2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+  2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+  2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+  2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+  2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+  2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+  2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+  1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+  2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+  2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+  2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+  1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+  2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+  2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+  2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+  2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+  1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+  2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+  3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+  1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+  2602819686U, // <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+  1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+  2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+  2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+  2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+  2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+  2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+  2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+  1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+  2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+  2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+  1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+  2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+  2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+  3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+  2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+  2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+  1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+  2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+  2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+  2700561114U, // <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+  2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+  2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+  2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+  2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+  2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+  2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+  2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+  1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+  1593153452U, // <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+  1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+  1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+  2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+  2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+  1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+  2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+  1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+  3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+  2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+  1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+  2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+  2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+  2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+  2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+  2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+  2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+  2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+  3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+  2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+  3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+  3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+  2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+  2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+  3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+  2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+  3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+  2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+  2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+  3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+  3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+  2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+  2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+  2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+  2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+  3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+  2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+  2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+  3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+  3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+  3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+  1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+  1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+  1640459509U, // <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+  1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+  2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+  2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+  2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+  1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+  2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+  564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS
+  2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+  564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS
+  2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+  3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+  2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+  2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+  1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+  2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+  2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+  2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+  3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+  3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+  3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+  2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+  2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+  2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+  2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+  1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+  1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+  2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+  1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+  1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+  564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS
+  2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+  564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS
+  2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+  2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+  3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+  2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+  2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+  2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+  2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+  2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+  3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+  3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+  2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+  2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+  2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+  2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+  1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+  3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+  3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+  2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+  3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+  2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+  2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+  2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+  3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+  2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+  3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+  3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+  3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+  2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+  3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+  2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+  2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+  2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+  3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+  3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+  3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+  2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+  2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+  2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+  1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+  2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+  2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+  3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+  3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+  2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+  1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+  2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+  2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+  2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+  3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+  2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+  2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+  2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+  1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+  1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+  2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+  2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+  2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+  1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+  1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+  1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+  1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+  2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+  2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+  1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+  1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+  2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+  1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+  1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+  2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+  1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+  2255295336U, // <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+  2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+  2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+  2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+  2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+  1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+  1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+  2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+  2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+  2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+  2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+  2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+  2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+  1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+  2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+  2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+  2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+  2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+  2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+  2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+  2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+  1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+  2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+  2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+  2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+  2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+  2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+  3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+  2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+  2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+  2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+  2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+  2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+  2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+  2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+  1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+  2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+  1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+  2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+  2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+  2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+  3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+  1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+  2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+  2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+  2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+  1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+  2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+  2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+  2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+  3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+  2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+  2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+  1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+  1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+  2712064854U, // <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+  2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+  2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+  1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+  2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+  2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+  2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+  1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+  1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+  1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+  2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+  1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+  1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+  1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+  2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+  1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+  2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+  3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+  2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+  2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+  2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+  2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+  1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+  2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+  2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+  2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+  2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+  2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+  2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+  2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+  2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+  2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+  3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+  3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+  2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+  2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+  3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+  3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+  2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+  2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+  2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+  2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+  3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+  3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+  2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+  2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+  2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+  2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+  2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+  2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+  2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+  3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+  3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+  3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+  2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+  1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+  2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+  2591167846U, // <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+  1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+  2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+  2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+  3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+  2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+  2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+  1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+  2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+  2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+  1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+  2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+  2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+  2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+  2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+  2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+  2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+  1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+  2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+  1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+  1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+  2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+  2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+  1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+  2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+  363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS
+  1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+  2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+  2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+  1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+  1595840756U, // <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+  363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS
+  1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+  1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+  1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+  2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+  1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+  1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+  1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+  1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+  1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS
+  1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+  1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+  2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+  1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+  1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+  1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+  1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+  2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+  1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+  1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+  1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+  1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+  1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+  2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+  2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+  1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+  2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+  1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+  1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+  2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+  1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+  1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+  1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+  1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+  1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+  2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+  1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+  1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+  1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+  564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS
+  1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+  564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS
+  2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+  2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+  1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+  1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+  1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+  1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+  1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+  1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+  1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+  2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+  2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+  2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+  1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+  1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+  2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+  363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS
+  1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+  1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+  564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS
+  1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+  1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+  1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+  564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS
+  363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS
+  564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS
+  135053414U, // <u,0,0,0>: Cost 1 vdup0 LHS
+  1611489290U, // <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611489300U, // <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  2568054923U, // <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1481706806U, // <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+  2555449040U, // <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+  2591282078U, // <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+  2591945711U, // <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U, // <u,0,0,u>: Cost 1 vdup0 LHS
+  1493655654U, // <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+  1860550758U, // <u,0,1,1>: Cost 2 vzipl LHS, LHS
+  537747563U, // <u,0,1,2>: Cost 1 vext3 LHS, LHS
+  2625135576U, // <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+  1493658934U, // <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+  2625135760U, // <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+  1517548447U, // <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+  2591290362U, // <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+  537747612U, // <u,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611489444U, // <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685231276U, // <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  1994768486U, // <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2685231294U, // <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611489484U, // <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2712068310U, // <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2625136570U, // <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+  2591962097U, // <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1611489516U, // <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2954067968U, // <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2685231356U, // <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  72589981U, // <u,0,3,2>: Cost 1 vrev LHS
+  2625137052U, // <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+  2625137154U, // <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+  2639071848U, // <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+  2639735481U, // <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+  2597279354U, // <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+  73032403U, // <u,0,3,u>: Cost 1 vrev LHS
+  2687074636U, // <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+  1611489618U, // <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611489628U, // <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3629222038U, // <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+  2555481398U, // <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+  1551396150U, // <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  2651680116U, // <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+  2646150600U, // <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1611932050U, // <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2561458278U, // <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+  1863532646U, // <u,0,5,1>: Cost 2 vzipl RHS, LHS
+  2712068526U, // <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649689976U, // <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+  2220237489U, // <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+  2651680772U, // <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+  1577939051U, // <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+  2830077238U, // <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  1579266317U, // <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+  2555494502U, // <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+  2712068598U, // <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  1997750374U, // <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2655662673U, // <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+  2555497782U, // <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+  2651681459U, // <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+  2651681592U, // <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+  2651681614U, // <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+  1997750428U, // <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+  2567446630U, // <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+  2567447446U, // <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+  2567448641U, // <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+  2573421338U, // <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+  2567449910U, // <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+  2651682242U, // <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+  2591339429U, // <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+  2651682412U, // <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+  2567452462U, // <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+  135053414U, // <u,0,u,0>: Cost 1 vdup0 LHS
+  1611489938U, // <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537748125U, // <u,0,u,2>: Cost 1 vext3 LHS, LHS
+  2685674148U, // <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1611932338U, // <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551399066U, // <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  1517605798U, // <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+  2830077481U, // <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  537748179U, // <u,0,u,u>: Cost 1 vext3 LHS, LHS
+  1544101961U, // <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+  1558036582U, // <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+  2619171051U, // <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+  1611490038U, // <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2555522358U, // <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+  2712068871U, // <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2591355815U, // <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+  2597328512U, // <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+  1611490083U, // <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  1481785446U, // <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+  202162278U, // <u,1,1,1>: Cost 1 vdup1 LHS
+  2555528808U, // <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+  1611490120U, // <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  1481788726U, // <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+  2689876828U, // <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  2591364008U, // <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+  2592691274U, // <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U, // <u,1,1,u>: Cost 1 vdup1 LHS
+  1499709542U, // <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+  2689876871U, // <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2631116445U, // <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+  835584U, // <u,1,2,3>: Cost 0 copy LHS
+  1499712822U, // <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+  2689876907U, // <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  2631780282U, // <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+  1523603074U, // <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+  835584U, // <u,1,2,u>: Cost 0 copy LHS
+  1487773798U, // <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+  1611490264U, // <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685232094U, // <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2018746470U, // <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+  1487777078U, // <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+  1611490304U, // <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2685674505U, // <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2640407307U, // <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+  1611490327U, // <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  1567992749U, // <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+  2693121070U, // <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+  2693194807U, // <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+  1152386432U, // <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+  2555555126U, // <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+  1558039862U, // <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+  2645716371U, // <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+  2597361284U, // <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+  1152755117U, // <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+  1481818214U, // <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+  2555560694U, // <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+  2555561576U, // <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+  1611490448U, // <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  1481821494U, // <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+  2651025435U, // <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+  2651689068U, // <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+  2823966006U, // <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+  1611932861U, // <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  2555568230U, // <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+  2689877199U, // <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2712069336U, // <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2685232353U, // <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  2555571510U, // <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+  2689877235U, // <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  2657661765U, // <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+  1584583574U, // <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+  1585247207U, // <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+  2561548390U, // <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+  2561549681U, // <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+  2573493926U, // <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+  2042962022U, // <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2561551670U, // <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+  2226300309U, // <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+  2658325990U, // <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+  2658326124U, // <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+  2042962027U, // <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1481842790U, // <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+  202162278U, // <u,1,u,1>: Cost 1 vdup1 LHS
+  2685674867U, // <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  835584U, // <u,1,u,3>: Cost 0 copy LHS
+  1481846070U, // <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+  1611933077U, // <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685674910U, // <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  1523652232U, // <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+  835584U, // <u,1,u,u>: Cost 0 copy LHS
+  1544110154U, // <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+  1545437286U, // <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  1545437420U, // <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+  2685232589U, // <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2619179346U, // <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+  2712069606U, // <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+  2689877484U, // <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2659656273U, // <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+  1545437853U, // <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+  1550082851U, // <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+  2619179828U, // <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+  2619179926U, // <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+  2685232671U, // <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2555604278U, // <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+  2619180176U, // <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+  2689877564U, // <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  2602718850U, // <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  1158703235U, // <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+  1481867366U, // <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+  2555609846U, // <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+  269271142U, // <u,2,2,2>: Cost 1 vdup2 LHS
+  1611490930U, // <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  1481870646U, // <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+  2689877640U, // <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2619180986U, // <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+  2593436837U, // <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U, // <u,2,2,u>: Cost 1 vdup2 LHS
+  408134301U, // <u,2,3,0>: Cost 1 vext1 LHS, LHS
+  1481876214U, // <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1481877096U, // <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1880326246U, // <u,2,3,3>: Cost 2 vzipr LHS, LHS
+  408137014U, // <u,2,3,4>: Cost 1 vext1 LHS, RHS
+  1529654992U, // <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1529655802U, // <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1529656314U, // <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408139566U, // <u,2,3,u>: Cost 1 vext1 LHS, LHS
+  1567853468U, // <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  2561598362U, // <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+  2555627214U, // <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+  2685232918U, // <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2555628854U, // <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+  1545440566U, // <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1571982740U, // <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+  2592125957U, // <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1545440809U, // <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+  2555633766U, // <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+  2561606550U, // <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+  2689877856U, // <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685233000U, // <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1158441059U, // <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+  2645725188U, // <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+  2689877892U, // <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2823900470U, // <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+  1158736007U, // <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+  1481900134U, // <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+  2555642614U, // <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+  2555643496U, // <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+  1611491258U, // <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  1481903414U, // <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+  2689877964U, // <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689877973U, // <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2645726030U, // <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+  1611933671U, // <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  1585919033U, // <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+  2573566710U, // <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+  2567596115U, // <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+  1906901094U, // <u,2,7,3>: Cost 2 vzipr RHS, LHS
+  2555653430U, // <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+  2800080230U, // <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+  2980643164U, // <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2645726828U, // <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+  1906901099U, // <u,2,7,u>: Cost 2 vzipr RHS, LHS
+  408175266U, // <u,2,u,0>: Cost 1 vext1 LHS, LHS
+  1545443118U, // <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  269271142U, // <u,2,u,2>: Cost 1 vdup2 LHS
+  1611491416U, // <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  408177974U, // <u,2,u,4>: Cost 1 vext1 LHS, RHS
+  1545443482U, // <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1726339226U, // <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+  1529697274U, // <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408180526U, // <u,2,u,u>: Cost 1 vext1 LHS, LHS
+  1544781824U, // <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  471040156U, // <u,3,0,1>: Cost 1 vext2 LHS, LHS
+  1544781988U, // <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2618523900U, // <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+  1544782162U, // <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2238188352U, // <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+  2623169023U, // <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2238335826U, // <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+  471040669U, // <u,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544782582U, // <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544782644U, // <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544782742U, // <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544782808U, // <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618524733U, // <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544782992U, // <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618524897U, // <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2703517987U, // <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+  1544783213U, // <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  1529716838U, // <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+  1164167966U, // <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+  1544783464U, // <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544783526U, // <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1529720118U, // <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+  2618525544U, // <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544783802U, // <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2704181620U, // <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+  1544783931U, // <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1544784022U, // <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  1487922559U, // <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+  1493895256U, // <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+  336380006U, // <u,3,3,3>: Cost 1 vdup3 LHS
+  1544784386U, // <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2824054478U, // <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2238286668U, // <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+  2954069136U, // <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  336380006U, // <u,3,3,u>: Cost 1 vdup3 LHS
+  1487929446U, // <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+  1487930752U, // <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+  2623171644U, // <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2561673366U, // <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+  1487932726U, // <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+  471043382U, // <u,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592561012U, // <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2238368598U, // <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+  471043625U, // <u,3,4,u>: Cost 1 vext2 LHS, RHS
+  2555707494U, // <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+  1574645465U, // <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+  2567653106U, // <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+  2555709954U, // <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+  1592561606U, // <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592561668U, // <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592561762U, // <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1750314294U, // <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1750314295U, // <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2623172897U, // <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2561688962U, // <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+  1581281795U, // <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+  2706541204U, // <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+  2623173261U, // <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  1164495686U, // <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+  1592562488U, // <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592562510U, // <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1164716897U, // <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+  1487954022U, // <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+  1487955331U, // <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+  1493928028U, // <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+  2561697942U, // <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+  1487957302U, // <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+  2707352311U, // <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+  2655024623U, // <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+  1592563308U, // <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1487959854U, // <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+  1544787667U, // <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  471045934U, // <u,3,u,1>: Cost 1 vext2 LHS, LHS
+  1549432709U, // <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  336380006U, // <u,3,u,3>: Cost 1 vdup3 LHS
+  1544788031U, // <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  471046298U, // <u,3,u,5>: Cost 1 vext2 LHS, RHS
+  1549433040U, // <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1750314537U, // <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+  471046501U, // <u,3,u,u>: Cost 1 vext2 LHS, LHS
+  2625167360U, // <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+  1551425638U, // <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  2619195630U, // <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+  2619343104U, // <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2625167698U, // <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+  1638329234U, // <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638329244U, // <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787803556U, // <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1551426205U, // <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+  2555748454U, // <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+  2625168180U, // <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+  1551426503U, // <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+  2625168344U, // <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+  2555751734U, // <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+  1860554038U, // <u,4,1,5>: Cost 2 vzipl LHS, RHS
+  2689879022U, // <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2592248852U, // <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1555408301U, // <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+  2555756646U, // <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+  2625168943U, // <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+  2625169000U, // <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+  2619197134U, // <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+  2555759926U, // <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+  2712071222U, // <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  1994771766U, // <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U, // <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1994771784U, // <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+  2625169558U, // <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+  2567709594U, // <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+  2567710817U, // <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+  2625169820U, // <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+  2625169922U, // <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+  2954069710U, // <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2954068172U, // <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3903849472U, // <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+  2954068174U, // <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  1505919078U, // <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+  2567717831U, // <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+  2567719010U, // <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+  2570373542U, // <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  161926454U, // <u,4,4,4>: Cost 1 vdup0 RHS
+  1551428918U, // <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  1638329572U, // <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  2594927963U, // <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U, // <u,4,4,u>: Cost 1 vdup0 RHS
+  1493983334U, // <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+  2689879301U, // <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1493985379U, // <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+  2567727254U, // <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+  1493986614U, // <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+  1863535926U, // <u,4,5,5>: Cost 2 vzipl RHS, RHS
+  537750838U, // <u,4,5,6>: Cost 1 vext3 LHS, RHS
+  2830110006U, // <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537750856U, // <u,4,5,u>: Cost 1 vext3 LHS, RHS
+  1482047590U, // <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+  2555790070U, // <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+  2555790952U, // <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+  2555791510U, // <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+  1482050870U, // <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+  2689879422U, // <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  1997753654U, // <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2712071562U, // <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1482053422U, // <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+  2567741542U, // <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+  2567742362U, // <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+  2567743589U, // <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+  2573716286U, // <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+  2567744822U, // <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+  2712071624U, // <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  96808489U, // <u,4,7,6>: Cost 1 vrev RHS
+  2651715180U, // <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+  96955963U, // <u,4,7,u>: Cost 1 vrev RHS
+  1482063974U, // <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+  1551431470U, // <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  1494009958U, // <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+  2555807894U, // <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+  161926454U, // <u,4,u,4>: Cost 1 vdup0 RHS
+  1551431834U, // <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  537751081U, // <u,4,u,6>: Cost 1 vext3 LHS, RHS
+  2830110249U, // <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537751099U, // <u,4,u,u>: Cost 1 vext3 LHS, RHS
+  2631811072U, // <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+  1558069350U, // <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+  2619203823U, // <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+  2619867456U, // <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+  1546273106U, // <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2733010539U, // <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2597622682U, // <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+  1176539396U, // <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+  1558069917U, // <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+  1505968230U, // <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+  2624512887U, // <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+  2631811990U, // <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+  2618541056U, // <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+  1505971510U, // <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+  2627167419U, // <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+  2579714554U, // <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+  1638330064U, // <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638477529U, // <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  2561802342U, // <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+  2561803264U, // <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+  2631149217U, // <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+  1558071026U, // <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+  2561805622U, // <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+  2714062607U, // <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  2631813050U, // <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+  3092335926U, // <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+  1561389191U, // <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+  2561810534U, // <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+  2561811857U, // <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+  2631813474U, // <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+  2631813532U, // <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+  2619869698U, // <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+  3001847002U, // <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+  2954070530U, // <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2018749750U, // <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2018749751U, // <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2573762662U, // <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+  2620017634U, // <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  2573764338U, // <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+  2573765444U, // <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+  1570680053U, // <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+  1558072630U, // <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+  2645749143U, // <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+  1638330310U, // <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1558072873U, // <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+  1506000998U, // <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+  2561827984U, // <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+  2579744360U, // <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+  2579744918U, // <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+  1506004278U, // <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+  229035318U, // <u,5,5,5>: Cost 1 vdup1 RHS
+  2712072206U, // <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638330392U, // <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  229035318U, // <u,5,5,u>: Cost 1 vdup1 RHS
+  1500037222U, // <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+  2561836436U, // <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+  2567809133U, // <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+  1500040006U, // <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+  1500040502U, // <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+  2714062935U, // <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712072288U, // <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  27705344U, // <u,5,6,7>: Cost 0 copy RHS
+  27705344U, // <u,5,6,u>: Cost 0 copy RHS
+  1488101478U, // <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488102805U, // <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+  2561844840U, // <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+  2561845398U, // <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+  1488104758U, // <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+  1638330536U, // <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712072362U, // <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2042965302U, // <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+  1488107310U, // <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488109670U, // <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+  1488110998U, // <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+  2561853032U, // <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+  1500056392U, // <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+  1488112950U, // <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+  229035318U, // <u,5,u,5>: Cost 1 vdup1 RHS
+  2954111490U, // <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  27705344U, // <u,5,u,7>: Cost 0 copy RHS
+  27705344U, // <u,5,u,u>: Cost 0 copy RHS
+  2619211776U, // <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+  1545470054U, // <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1545470192U, // <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+  2255958969U, // <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+  1546797458U, // <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+  2720624971U, // <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+  2256180180U, // <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+  2960682294U, // <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+  1545470621U, // <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+  1182004127U, // <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+  2619212596U, // <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+  2619212694U, // <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+  2619212760U, // <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+  2626511979U, // <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+  2619212944U, // <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+  2714063264U, // <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2967326006U, // <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+  1182594023U, // <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+  1506050150U, // <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+  2579792630U, // <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+  2619213416U, // <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+  2619213478U, // <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+  1506053430U, // <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+  2633148309U, // <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+  2619213754U, // <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+  1638330874U, // <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638478339U, // <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2619213974U, // <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+  2255836074U, // <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+  2255909811U, // <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+  2619214236U, // <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+  1564715549U, // <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+  2639121006U, // <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+  3001847012U, // <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1880329526U, // <u,6,3,7>: Cost 2 vzipr LHS, RHS
+  1880329527U, // <u,6,3,u>: Cost 2 vzipr LHS, RHS
+  2567864422U, // <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+  2733011558U, // <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+  2567866484U, // <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+  2638458005U, // <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+  1570540772U, // <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1545473334U, // <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  1572015512U, // <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+  2960715062U, // <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+  1545473577U, // <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+  2567872614U, // <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+  2645757648U, // <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+  2567874490U, // <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+  2576501250U, // <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  1576660943U, // <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+  2645757956U, // <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+  2645758050U, // <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+  2824080694U, // <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+  1182626795U, // <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+  1506082918U, // <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+  2579825398U, // <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+  2645758458U, // <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+  2579826838U, // <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+  1506086198U, // <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+  2579828432U, // <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+  296144182U, // <u,6,6,6>: Cost 1 vdup2 RHS
+  1638331202U, // <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  296144182U, // <u,6,6,u>: Cost 1 vdup2 RHS
+  432349286U, // <u,6,7,0>: Cost 1 vext1 RHS, LHS
+  1506091766U, // <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1506092648U, // <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506093206U, // <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432352809U, // <u,6,7,4>: Cost 1 vext1 RHS, RHS
+  1506094800U, // <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  1506095610U, // <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1906904374U, // <u,6,7,7>: Cost 2 vzipr RHS, RHS
+  432355118U, // <u,6,7,u>: Cost 1 vext1 RHS, LHS
+  432357478U, // <u,6,u,0>: Cost 1 vext1 RHS, LHS
+  1545475886U, // <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1506100840U, // <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506101398U, // <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432361002U, // <u,6,u,4>: Cost 1 vext1 RHS, RHS
+  1545476250U, // <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  296144182U, // <u,6,u,6>: Cost 1 vdup2 RHS
+  1880370486U, // <u,6,u,7>: Cost 2 vzipr LHS, RHS
+  432363310U, // <u,6,u,u>: Cost 1 vext1 RHS, LHS
+  1571356672U, // <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497614950U, // <u,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571356836U, // <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2573880146U, // <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+  1571357010U, // <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1512083716U, // <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+  2621874741U, // <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+  2585826298U, // <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+  497615517U, // <u,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571357430U, // <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571357492U, // <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571357590U, // <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1552114715U, // <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+  2573888822U, // <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+  1553441981U, // <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+  2627847438U, // <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+  2727408775U, // <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+  1555432880U, // <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+  2629838337U, // <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+  1188058754U, // <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+  1571358312U, // <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571358374U, // <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2632492869U, // <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+  2633156502U, // <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+  1560078311U, // <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+  2728072408U, // <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+  1561405577U, // <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+  1571358870U, // <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2627184913U, // <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+  2633820523U, // <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+  1571359132U, // <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571359234U, // <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  1512108295U, // <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+  1518080992U, // <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+  2640456465U, // <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+  1571359518U, // <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571359634U, // <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2573911067U, // <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+  2645101622U, // <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2573912918U, // <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+  1571359952U, // <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497618248U, // <u,7,4,5>: Cost 1 vext2 RHS, RHS
+  1571360116U, // <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2645102024U, // <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+  497618473U, // <u,7,4,u>: Cost 1 vext2 RHS, RHS
+  2645102152U, // <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1571360464U, // <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2645102334U, // <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2645102447U, // <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571360710U, // <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571360772U, // <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571360866U, // <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1571360936U, // <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1571361017U, // <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  1530044518U, // <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+  2645103016U, // <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571361274U, // <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2645103154U, // <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  1530047798U, // <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+  1188386474U, // <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+  1571361592U, // <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571361614U, // <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1571361695U, // <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1571361786U, // <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2573935616U, // <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+  2645103781U, // <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+  2573937497U, // <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+  1571362150U, // <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  1512141067U, // <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+  1518113764U, // <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+  363253046U, // <u,7,7,7>: Cost 1 vdup3 RHS
+  363253046U, // <u,7,7,u>: Cost 1 vdup3 RHS
+  1571362515U, // <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497620782U, // <u,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571362693U, // <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571362748U, // <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571362879U, // <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497621146U, // <u,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571363024U, // <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  363253046U, // <u,7,u,7>: Cost 1 vdup3 RHS
+  497621349U, // <u,7,u,u>: Cost 1 vext2 RHS, LHS
+  135053414U, // <u,u,0,0>: Cost 1 vdup0 LHS
+  471081121U, // <u,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544822948U, // <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1616140005U, // <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  1544823122U, // <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  1512157453U, // <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+  1662220032U, // <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  1194457487U, // <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+  471081629U, // <u,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544823542U, // <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  202162278U, // <u,u,1,1>: Cost 1 vdup1 LHS
+  537753390U, // <u,u,1,2>: Cost 1 vext3 LHS, LHS
+  1544823768U, // <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  1494248758U, // <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+  1544823952U, // <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  1518138343U, // <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+  1640322907U, // <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  537753444U, // <u,u,1,u>: Cost 1 vext3 LHS, LHS
+  1482309734U, // <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+  1194031451U, // <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+  269271142U, // <u,u,2,2>: Cost 1 vdup2 LHS
+  835584U, // <u,u,2,3>: Cost 0 copy LHS
+  1482313014U, // <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+  2618566504U, // <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544824762U, // <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  1638479788U, // <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  835584U, // <u,u,2,u>: Cost 0 copy LHS
+  408576723U, // <u,u,3,0>: Cost 1 vext1 LHS, LHS
+  1482318582U, // <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  120371557U, // <u,u,3,2>: Cost 1 vrev LHS
+  336380006U, // <u,u,3,3>: Cost 1 vdup3 LHS
+  408579382U, // <u,u,3,4>: Cost 1 vext1 LHS, RHS
+  1616140271U, // <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  1530098170U, // <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1880329544U, // <u,u,3,7>: Cost 2 vzipr LHS, RHS
+  408581934U, // <u,u,3,u>: Cost 1 vext1 LHS, LHS
+  1488298086U, // <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+  1488299437U, // <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+  1659271204U, // <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  1194195311U, // <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+  161926454U, // <u,u,4,4>: Cost 1 vdup0 RHS
+  471084342U, // <u,u,4,5>: Cost 1 vext2 LHS, RHS
+  1571368308U, // <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1640323153U, // <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  471084585U, // <u,u,4,u>: Cost 1 vext2 LHS, RHS
+  1494278246U, // <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+  1571368656U, // <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  1494280327U, // <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+  1616140415U, // <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1494281526U, // <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+  229035318U, // <u,u,5,5>: Cost 1 vdup1 RHS
+  537753754U, // <u,u,5,6>: Cost 1 vext3 LHS, RHS
+  1750355254U, // <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+  537753772U, // <u,u,5,u>: Cost 1 vext3 LHS, RHS
+  1482342502U, // <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+  2556084982U, // <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+  1571369466U, // <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  1611938000U, // <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1482345782U, // <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+  1194359171U, // <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+  296144182U, // <u,u,6,6>: Cost 1 vdup2 RHS
+  27705344U, // <u,u,6,7>: Cost 0 copy RHS
+  27705344U, // <u,u,6,u>: Cost 0 copy RHS
+  432496742U, // <u,u,7,0>: Cost 1 vext1 RHS, LHS
+  1488324016U, // <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+  1494296713U, // <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+  1906901148U, // <u,u,7,3>: Cost 2 vzipr RHS, LHS
+  432500283U, // <u,u,7,4>: Cost 1 vext1 RHS, RHS
+  1506242256U, // <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  120699277U, // <u,u,7,6>: Cost 1 vrev RHS
+  363253046U, // <u,u,7,7>: Cost 1 vdup3 RHS
+  432502574U, // <u,u,7,u>: Cost 1 vext1 RHS, LHS
+  408617688U, // <u,u,u,0>: Cost 1 vext1 LHS, LHS
+  471086894U, // <u,u,u,1>: Cost 1 vext2 LHS, LHS
+  537753957U, // <u,u,u,2>: Cost 1 vext3 LHS, LHS
+  835584U, // <u,u,u,3>: Cost 0 copy LHS
+  408620342U, // <u,u,u,4>: Cost 1 vext1 LHS, RHS
+  471087258U, // <u,u,u,5>: Cost 1 vext2 LHS, RHS
+  537753997U, // <u,u,u,6>: Cost 1 vext3 LHS, RHS
+  27705344U, // <u,u,u,7>: Cost 0 copy RHS
+  835584U, // <u,u,u,u>: Cost 0 copy LHS
+  0
+};
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
new file mode 100644
index 000000000000..6f3819afd0a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -0,0 +1,24 @@
+//===-- ARMRegisterInfo.cpp - ARM Register Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMRegisterInfo.h"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+using namespace llvm;
+
+void ARMRegisterInfo::anchor() { }
+
+ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii,
+                                 const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(tii, sti) {
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h
new file mode 100644
index 000000000000..8a248425c33c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.h
@@ -0,0 +1,33 @@
+//===-- ARMRegisterInfo.h - ARM Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMREGISTERINFO_H
+#define ARMREGISTERINFO_H
+
+#include "ARM.h"
+#include "ARMBaseRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+
+struct ARMRegisterInfo : public ARMBaseRegisterInfo {
+  virtual void anchor();
+public:
+  ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
new file mode 100644
index 000000000000..b0f576bc2b6f
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -0,0 +1,403 @@
+//===-- ARMRegisterInfo.td - ARM Register defs -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the ARM register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 4-bit ID numbers.
+class ARMReg<bits<16> Enc, string n, list<Register> subregs = []> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "ARM";
+  let SubRegs = subregs;
+  // All bits of ARM registers with sub-registers are covered by sub-registers.
+  let CoveredBySubRegs = 1;
+}
+
+class ARMFReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "ARM";
+}
+
+// Subregister indices.
+let Namespace = "ARM" in {
+def qqsub_0 : SubRegIndex;
+def qqsub_1 : SubRegIndex;
+
+// Note: Code depends on these having consecutive numbers.
+def qsub_0 : SubRegIndex;
+def qsub_1 : SubRegIndex;
+def qsub_2 : SubRegIndex<[qqsub_1, qsub_0]>;
+def qsub_3 : SubRegIndex<[qqsub_1, qsub_1]>;
+
+def dsub_0 : SubRegIndex;
+def dsub_1 : SubRegIndex;
+def dsub_2 : SubRegIndex<[qsub_1, dsub_0]>;
+def dsub_3 : SubRegIndex<[qsub_1, dsub_1]>;
+def dsub_4 : SubRegIndex<[qsub_2, dsub_0]>;
+def dsub_5 : SubRegIndex<[qsub_2, dsub_1]>;
+def dsub_6 : SubRegIndex<[qsub_3, dsub_0]>;
+def dsub_7 : SubRegIndex<[qsub_3, dsub_1]>;
+
+def ssub_0  : SubRegIndex;
+def ssub_1  : SubRegIndex;
+def ssub_2  : SubRegIndex<[dsub_1, ssub_0]>;
+def ssub_3  : SubRegIndex<[dsub_1, ssub_1]>;
+
+def gsub_0  : SubRegIndex;
+def gsub_1  : SubRegIndex;
+// Let TableGen synthesize the remaining 12 ssub_* indices.
+// We don't need to name them.
+}
+
+// Integer registers
+def R0  : ARMReg< 0, "r0">,  DwarfRegNum<[0]>;
+def R1  : ARMReg< 1, "r1">,  DwarfRegNum<[1]>;
+def R2  : ARMReg< 2, "r2">,  DwarfRegNum<[2]>;
+def R3  : ARMReg< 3, "r3">,  DwarfRegNum<[3]>;
+def R4  : ARMReg< 4, "r4">,  DwarfRegNum<[4]>;
+def R5  : ARMReg< 5, "r5">,  DwarfRegNum<[5]>;
+def R6  : ARMReg< 6, "r6">,  DwarfRegNum<[6]>;
+def R7  : ARMReg< 7, "r7">,  DwarfRegNum<[7]>;
+// These require 32-bit instructions.
+let CostPerUse = 1 in {
+def R8  : ARMReg< 8, "r8">,  DwarfRegNum<[8]>;
+def R9  : ARMReg< 9, "r9">,  DwarfRegNum<[9]>;
+def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
+def SP  : ARMReg<13, "sp">,  DwarfRegNum<[13]>;
+def LR  : ARMReg<14, "lr">,  DwarfRegNum<[14]>;
+def PC  : ARMReg<15, "pc">,  DwarfRegNum<[15]>;
+}
+
+// Float registers
+def S0  : ARMFReg< 0, "s0">;  def S1  : ARMFReg< 1, "s1">;
+def S2  : ARMFReg< 2, "s2">;  def S3  : ARMFReg< 3, "s3">;
+def S4  : ARMFReg< 4, "s4">;  def S5  : ARMFReg< 5, "s5">;
+def S6  : ARMFReg< 6, "s6">;  def S7  : ARMFReg< 7, "s7">;
+def S8  : ARMFReg< 8, "s8">;  def S9  : ARMFReg< 9, "s9">;
+def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">;
+def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">;
+def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">;
+def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">;
+def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">;
+def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">;
+def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">;
+def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
+def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
+def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
+def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+let SubRegIndices = [ssub_0, ssub_1] in {
+def D0  : ARMReg< 0,  "d0", [S0,   S1]>, DwarfRegNum<[256]>;
+def D1  : ARMReg< 1,  "d1", [S2,   S3]>, DwarfRegNum<[257]>;
+def D2  : ARMReg< 2,  "d2", [S4,   S5]>, DwarfRegNum<[258]>;
+def D3  : ARMReg< 3,  "d3", [S6,   S7]>, DwarfRegNum<[259]>;
+def D4  : ARMReg< 4,  "d4", [S8,   S9]>, DwarfRegNum<[260]>;
+def D5  : ARMReg< 5,  "d5", [S10, S11]>, DwarfRegNum<[261]>;
+def D6  : ARMReg< 6,  "d6", [S12, S13]>, DwarfRegNum<[262]>;
+def D7  : ARMReg< 7,  "d7", [S14, S15]>, DwarfRegNum<[263]>;
+def D8  : ARMReg< 8,  "d8", [S16, S17]>, DwarfRegNum<[264]>;
+def D9  : ARMReg< 9,  "d9", [S18, S19]>, DwarfRegNum<[265]>;
+def D10 : ARMReg<10, "d10", [S20, S21]>, DwarfRegNum<[266]>;
+def D11 : ARMReg<11, "d11", [S22, S23]>, DwarfRegNum<[267]>;
+def D12 : ARMReg<12, "d12", [S24, S25]>, DwarfRegNum<[268]>;
+def D13 : ARMReg<13, "d13", [S26, S27]>, DwarfRegNum<[269]>;
+def D14 : ARMReg<14, "d14", [S28, S29]>, DwarfRegNum<[270]>;
+def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>;
+}
+
+// VFP3 defines 16 additional double registers
+def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; 
+def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>;
+def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>;
+def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>;
+def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>;
+def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>;
+def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; 
+def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>;
+def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>;
+def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>;
+def D26 : ARMFReg<26, "d26">, DwarfRegNum<[282]>;
+def D27 : ARMFReg<27, "d27">, DwarfRegNum<[283]>;
+def D28 : ARMFReg<28, "d28">, DwarfRegNum<[284]>;
+def D29 : ARMFReg<29, "d29">, DwarfRegNum<[285]>;
+def D30 : ARMFReg<30, "d30">, DwarfRegNum<[286]>;
+def D31 : ARMFReg<31, "d31">, DwarfRegNum<[287]>;
+
+// Advanced SIMD (NEON) defines 16 quad-word aliases
+let SubRegIndices = [dsub_0, dsub_1] in {
+def Q0  : ARMReg< 0,  "q0", [D0,   D1]>;
+def Q1  : ARMReg< 1,  "q1", [D2,   D3]>;
+def Q2  : ARMReg< 2,  "q2", [D4,   D5]>;
+def Q3  : ARMReg< 3,  "q3", [D6,   D7]>;
+def Q4  : ARMReg< 4,  "q4", [D8,   D9]>;
+def Q5  : ARMReg< 5,  "q5", [D10, D11]>;
+def Q6  : ARMReg< 6,  "q6", [D12, D13]>;
+def Q7  : ARMReg< 7,  "q7", [D14, D15]>;
+}
+let SubRegIndices = [dsub_0, dsub_1] in {
+def Q8  : ARMReg< 8,  "q8", [D16, D17]>;
+def Q9  : ARMReg< 9,  "q9", [D18, D19]>;
+def Q10 : ARMReg<10, "q10", [D20, D21]>;
+def Q11 : ARMReg<11, "q11", [D22, D23]>;
+def Q12 : ARMReg<12, "q12", [D24, D25]>;
+def Q13 : ARMReg<13, "q13", [D26, D27]>;
+def Q14 : ARMReg<14, "q14", [D28, D29]>;
+def Q15 : ARMReg<15, "q15", [D30, D31]>;
+}
+
+// Current Program Status Register.
+// We model fpscr with two registers: FPSCR models the control bits and will be
+// reserved. FPSCR_NZCV models the flag bits and will be unreserved. 
+def CPSR       : ARMReg<0, "cpsr">;
+def APSR       : ARMReg<1, "apsr">;
+def SPSR       : ARMReg<2, "spsr">;
+def FPSCR      : ARMReg<3, "fpscr">;
+def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> {
+  let Aliases = [FPSCR];
+}
+def ITSTATE    : ARMReg<4, "itstate">;
+
+// Special Registers - only available in privileged mode.
+def FPSID   : ARMReg<0, "fpsid">;
+def MVFR1   : ARMReg<6, "mvfr1">;
+def MVFR0   : ARMReg<7, "mvfr0">;
+def FPEXC   : ARMReg<8, "fpexc">;
+
+// Register classes.
+//
+// pc  == Program Counter
+// lr  == Link Register
+// sp  == Stack Pointer
+// r12 == ip (scratch)
+// r7  == Frame Pointer (thumb-style backtraces)
+// r9  == May be reserved as Thread Register
+// r11 == Frame Pointer (arm-style backtraces)
+// r10 == Stack Limit
+//
+def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
+                                               SP, LR, PC)> {
+  // Allocate LR as the first CSR since it is always saved anyway.
+  // For Thumb1 mode, we don't want to allocate hi regs at all, as we don't
+  // know how to spill them. If we make our prologue/epilogue code smarter at
+  // some point, we can go back to using the above allocation orders for the
+  // Thumb1 instructions that know how to use hi regs.
+  let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// GPRs without the PC.  Some ARM instructions do not allow the PC in
+// certain operand slots, particularly as the destination.  Primarily
+// useful for disassembly.
+def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
+// implied SP argument list.
+// FIXME: It would be better to not use this at all and refactor the
+// instructions to not have SP an an explicit argument. That makes
+// frame index resolution a bit trickier, though.
+def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)>;
+
+// restricted GPR register class. Many Thumb2 instructions allow the full
+// register range for operands, but have undefined behaviours when PC
+// or SP (R13 or R15) are used. The ARM ISA refers to these operands
+// via the BadReg() pseudo-code description.
+def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
+  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// Thumb registers are R0-R7 normally. Some instructions can still use
+// the general GPR register class above (MOV, e.g.)
+def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>;
+
+// The high registers in thumb mode, R8-R15.
+def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of
+// this class and the preceding one(!)  This is what we want.
+def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R9, R12)> {
+  let AltOrders = [(and tcGPR, tGPR)];
+  let AltOrderSelect = [{
+      return MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
+// Condition code registers.
+def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+  let isAllocatable = 0;
+}
+
+// Scalar single precision floating point register class..
+// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to
+// avoid partial-write dependencies on D registers (S registers are
+// renamed as portions of D registers).
+def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate
+                                                (sequence "S%u", 0, 31), 2),
+                                               (sequence "S%u", 0, 31))>;
+
+// Subset of SPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
+
+// Scalar double precision floating point / generic 64-bit vector register
+// class.
+// ARM requires only word alignment for double. It's more performant if it
+// is double-word alignment though.
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                        (sequence "D%u", 0, 31)> {
+  // Allocate non-VFP2 registers D16-D31 first.
+  let AltOrders = [(rotl DPR, 16)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Subset of DPR that are accessible with VFP2 (and so that also have
+// 32-bit SPR subregs).
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                             (trunc DPR, 16)>;
+
+// Subset of DPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                          (trunc DPR, 8)>;
+
+// Generic 128-bit vector register class.
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+                        (sequence "Q%u", 0, 15)> {
+  // Allocate non-VFP2 aliases Q8-Q15 first.
+  let AltOrders = [(rotl QPR, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Subset of QPR that have 32-bit SPR subregs.
+def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                             128, (trunc QPR, 8)>;
+
+// Subset of QPR that have DPR_8 and SPR_8 subregs.
+def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                           128, (trunc QPR, 4)>;
+
+// Pseudo-registers representing odd-even pairs of D registers. The even-odd
+// pairs are already represented by the Q registers.
+// These are needed by NEON instructions requiring two consecutive D registers.
+// There is no D31_D0 register as that is always an UNPREDICTABLE encoding.
+def TuplesOE2D : RegisterTuples<[dsub_0, dsub_1],
+                                [(decimate (shl DPR, 1), 2),
+                                 (decimate (shl DPR, 2), 2)]>;
+
+// Register class representing a pair of consecutive D registers.
+// Use the Q registers for the even-odd pairs.
+def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                          128, (interleave QPR, TuplesOE2D)> {
+  // Allocate starting at non-VFP2 registers D16-D31 first.
+  // Prefer even-odd pairs as they are easier to copy.
+  let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16))];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP.
+// These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs.
+def Tuples2R : RegisterTuples<[gsub_0, gsub_1],
+                              [(add R0, R2, R4, R6, R8, R10, R12),
+                               (add R1, R3, R5, R7, R9, R11, SP)]>;
+
+// Register class representing a pair of even-odd GPRs.
+def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> {
+  let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+}
+
+// Pseudo-registers representing 3 consecutive D registers.
+def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
+                              [(shl DPR, 0),
+                               (shl DPR, 1),
+                               (shl DPR, 2)]>;
+
+// 3 consecutive D registers.
+def DTriple : RegisterClass<"ARM", [untyped], 64, (add Tuples3D)> {
+  let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+// Pseudo 256-bit registers to represent pairs of Q registers. These should
+// never be present in the emitted code.
+// These are used for NEON load / store instructions, e.g., vld4, vst3.
+def Tuples2Q : RegisterTuples<[qsub_0, qsub_1], [(shl QPR, 0), (shl QPR, 1)]>;
+
+// Pseudo 256-bit vector register class to model pairs of Q registers
+// (4 consecutive D registers).
+def QQPR : RegisterClass<"ARM", [v4i64], 256, (add Tuples2Q)> {
+  // Allocate non-VFP2 aliases first.
+  let AltOrders = [(rotl QQPR, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+// Tuples of 4 D regs that isn't also a pair of Q regs.
+def TuplesOE4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
+                                [(decimate (shl DPR, 1), 2),
+                                 (decimate (shl DPR, 2), 2),
+                                 (decimate (shl DPR, 3), 2),
+                                 (decimate (shl DPR, 4), 2)]>;
+
+// 4 consecutive D registers.
+def DQuad : RegisterClass<"ARM", [v4i64], 256,
+                          (interleave Tuples2Q, TuplesOE4D)>;
+
+// Pseudo 512-bit registers to represent four consecutive Q registers.
+def Tuples2QQ : RegisterTuples<[qqsub_0, qqsub_1],
+                               [(shl QQPR, 0), (shl QQPR, 2)]>;
+
+// Pseudo 512-bit vector register class to model 4 consecutive Q registers
+// (8 consecutive D registers).
+def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (add Tuples2QQ)> {
+  // Allocate non-VFP2 aliases first.
+  let AltOrders = [(rotl QQQQPR, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+
+
+// Pseudo-registers representing 2-spaced consecutive D registers.
+def Tuples2DSpc : RegisterTuples<[dsub_0, dsub_2],
+                                 [(shl DPR, 0),
+                                  (shl DPR, 2)]>;
+
+// Spaced pairs of D registers.
+def DPairSpc : RegisterClass<"ARM", [v2i64], 64, (add Tuples2DSpc)>;
+
+def Tuples3DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4],
+                                 [(shl DPR, 0),
+                                  (shl DPR, 2),
+                                  (shl DPR, 4)]>;
+
+// Spaced triples of D registers.
+def DTripleSpc : RegisterClass<"ARM", [untyped], 64, (add Tuples3DSpc)> {
+  let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+def Tuples4DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4, dsub_6],
+                                 [(shl DPR, 0),
+                                  (shl DPR, 2),
+                                  (shl DPR, 4),
+                                  (shl DPR, 6)]>;
+
+// Spaced quads of D registers.
+def DQuadSpc : RegisterClass<"ARM", [v4i64], 64, (add Tuples3DSpc)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMRelocations.h b/contrib/llvm/lib/Target/ARM/ARMRelocations.h
new file mode 100644
index 000000000000..21877fd9af37
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRelocations.h
@@ -0,0 +1,62 @@
+//===-- ARMRelocations.h - ARM Code Relocations -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMRELOCATIONS_H
+#define ARMRELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace ARM {
+    enum RelocationType {
+      // reloc_arm_absolute - Absolute relocation, just add the relocated value
+      // to the value already in memory.
+      reloc_arm_absolute,
+
+      // reloc_arm_relative - PC relative relocation, add the relocated value to
+      // the value already in memory, after we adjust it for where the PC is.
+      reloc_arm_relative,
+
+      // reloc_arm_cp_entry - PC relative relocation for constpool_entry's whose
+      // addresses are kept locally in a map.
+      reloc_arm_cp_entry,
+
+      // reloc_arm_vfp_cp_entry - Same as reloc_arm_cp_entry except the offset
+      // should be divided by 4.
+      reloc_arm_vfp_cp_entry,
+
+      // reloc_arm_machine_cp_entry - Relocation of a ARM machine constantpool
+      // entry.
+      reloc_arm_machine_cp_entry,
+
+      // reloc_arm_jt_base - PC relative relocation for jump tables whose
+      // addresses are kept locally in a map.
+      reloc_arm_jt_base,
+
+      // reloc_arm_pic_jt - PIC jump table entry relocation: dest bb - jt base.
+      reloc_arm_pic_jt,
+
+      // reloc_arm_branch - Branch address relocation.
+      reloc_arm_branch,
+
+      // reloc_arm_movt  - MOVT immediate relocation.
+      reloc_arm_movt,
+
+      // reloc_arm_movw  - MOVW immediate relocation.
+      reloc_arm_movw
+    };
+  }
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
new file mode 100644
index 000000000000..2d088de96e27
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
@@ -0,0 +1,336 @@
+//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Instruction scheduling annotations for out-of-order CPUs.
+// These annotations are independent of the itinerary class defined below.
+// Here we define the subtarget independent read/write per-operand resources.
+// The subtarget schedule definitions will then map these to the subtarget's
+// resource usages.
+// For example:
+// The instruction cycle timings table might contain an entry for an operation
+// like the following:
+// Rd <- ADD Rn, Rm, <shift> Rs
+//  Uops | Latency from register | Uops - resource requirements - latency
+//  2    | Rn: 1 Rm: 4 Rs: 4     | uop T0, Rm, Rs - P01 - 3
+//       |                       | uopc Rd, Rn, T0 -  P01 - 1
+// This is telling us that the result will be available in destination register
+// Rd after a minimum of three cycles after the result in Rm and Rs is available
+// and one cycle after the result in Rn is available. The micro-ops can execute
+// on resource P01.
+// To model this, we need to express that we need to dispatch two micro-ops,
+// that the resource P01 is needed and that the latency to Rn is different than
+// the latency to Rm and Rs. The scheduler can decrease Rn's producer latency by
+// two.
+// We will do this by assigning (abstract) resources to register defs/uses.
+// ARMSchedule.td:
+//   def WriteALUsr : SchedWrite;
+//   def ReadAdvanceALUsr : ScheRead;
+//
+// ARMInstrInfo.td:
+//   def ADDrs : I<>, Sched<[WriteALUsr, ReadAdvanceALUsr, ReadDefault,
+//                           ReadDefault]> { ...}
+// ReadAdvance read resources allow us to define "pipeline by-passes" or
+// shorter latencies to certain registers as needed in the example above.
+// The "ReadDefault" can be omitted.
+// Next, the subtarget td file assigns resources to the abstract resources
+// defined here.
+// ARMScheduleSubtarget.td:
+//  // Resources.
+//  def P01 : ProcResource<3>; // ALU unit (3 of it).
+//  ...
+//  // Resource usages.
+//  def : WriteRes<WriteALUsr, [P01, P01]> {
+//    Latency = 4; // Latency of 4.
+//    NumMicroOps = 2; // Dispatch 2 micro-ops.
+//    // The two instances of resource P01 are occupied for one cycle. It is one
+//    // cycle because these resources happen to be pipelined.
+//    ResourceCycles = [1, 1];
+//  }
+//  def : ReadAdvance<ReadAdvanceALUsr, 3>;
+
+// Basic ALU operation.
+def WriteALU : SchedWrite;
+def ReadALU : SchedRead;
+
+// Basic ALU with shifts.
+def WriteALUsi : SchedWrite; // Shift by immediate.
+def WriteALUsr : SchedWrite; // Shift by register.
+def WriteALUSsr : SchedWrite; // Shift by register (flag setting).
+def ReadALUsr : SchedRead; // Some operands are read later.
+
+// Compares.
+def WriteCMP : SchedWrite;
+def WriteCMPsi : SchedWrite;
+def WriteCMPsr : SchedWrite;
+
+// Define TII for use in SchedVariant Predicates.
+def : PredicateProlog<[{
+  const ARMBaseInstrInfo *TII =
+    static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo());
+  (void)TII;
+}]>;
+
+def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(MI)}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for ARM
+//
+def IIC_iALUx      : InstrItinClass;
+def IIC_iALUi      : InstrItinClass;
+def IIC_iALUr      : InstrItinClass;
+def IIC_iALUsi     : InstrItinClass;
+def IIC_iALUsir    : InstrItinClass;
+def IIC_iALUsr     : InstrItinClass;
+def IIC_iBITi      : InstrItinClass;
+def IIC_iBITr      : InstrItinClass;
+def IIC_iBITsi     : InstrItinClass;
+def IIC_iBITsr     : InstrItinClass;
+def IIC_iUNAr      : InstrItinClass;
+def IIC_iUNAsi     : InstrItinClass;
+def IIC_iEXTr      : InstrItinClass;
+def IIC_iEXTAr     : InstrItinClass;
+def IIC_iEXTAsr    : InstrItinClass;
+def IIC_iCMPi      : InstrItinClass;
+def IIC_iCMPr      : InstrItinClass;
+def IIC_iCMPsi     : InstrItinClass;
+def IIC_iCMPsr     : InstrItinClass;
+def IIC_iTSTi      : InstrItinClass;
+def IIC_iTSTr      : InstrItinClass;
+def IIC_iTSTsi     : InstrItinClass;
+def IIC_iTSTsr     : InstrItinClass;
+def IIC_iMOVi      : InstrItinClass;
+def IIC_iMOVr      : InstrItinClass;
+def IIC_iMOVsi     : InstrItinClass;
+def IIC_iMOVsr     : InstrItinClass;
+def IIC_iMOVix2    : InstrItinClass;
+def IIC_iMOVix2addpc : InstrItinClass;
+def IIC_iMOVix2ld  : InstrItinClass;
+def IIC_iMVNi      : InstrItinClass;
+def IIC_iMVNr      : InstrItinClass;
+def IIC_iMVNsi     : InstrItinClass;
+def IIC_iMVNsr     : InstrItinClass;
+def IIC_iCMOVi     : InstrItinClass;
+def IIC_iCMOVr     : InstrItinClass;
+def IIC_iCMOVsi    : InstrItinClass;
+def IIC_iCMOVsr    : InstrItinClass;
+def IIC_iCMOVix2   : InstrItinClass;
+def IIC_iMUL16     : InstrItinClass;
+def IIC_iMAC16     : InstrItinClass;
+def IIC_iMUL32     : InstrItinClass;
+def IIC_iMAC32     : InstrItinClass;
+def IIC_iMUL64     : InstrItinClass;
+def IIC_iMAC64     : InstrItinClass;
+def IIC_iDIV     : InstrItinClass;
+def IIC_iLoad_i    : InstrItinClass;
+def IIC_iLoad_r    : InstrItinClass;
+def IIC_iLoad_si   : InstrItinClass;
+def IIC_iLoad_iu   : InstrItinClass;
+def IIC_iLoad_ru   : InstrItinClass;
+def IIC_iLoad_siu  : InstrItinClass;
+def IIC_iLoad_bh_i   : InstrItinClass;
+def IIC_iLoad_bh_r   : InstrItinClass;
+def IIC_iLoad_bh_si  : InstrItinClass;
+def IIC_iLoad_bh_iu  : InstrItinClass;
+def IIC_iLoad_bh_ru  : InstrItinClass;
+def IIC_iLoad_bh_siu : InstrItinClass;
+def IIC_iLoad_d_i  : InstrItinClass;
+def IIC_iLoad_d_r  : InstrItinClass;
+def IIC_iLoad_d_ru : InstrItinClass;
+def IIC_iLoad_m    : InstrItinClass;
+def IIC_iLoad_mu   : InstrItinClass;
+def IIC_iLoad_mBr  : InstrItinClass;
+def IIC_iPop       : InstrItinClass;
+def IIC_iPop_Br    : InstrItinClass;
+def IIC_iLoadiALU  : InstrItinClass;
+def IIC_iStore_i   : InstrItinClass;
+def IIC_iStore_r   : InstrItinClass;
+def IIC_iStore_si  : InstrItinClass;
+def IIC_iStore_iu  : InstrItinClass;
+def IIC_iStore_ru  : InstrItinClass;
+def IIC_iStore_siu : InstrItinClass;
+def IIC_iStore_bh_i   : InstrItinClass;
+def IIC_iStore_bh_r   : InstrItinClass;
+def IIC_iStore_bh_si  : InstrItinClass;
+def IIC_iStore_bh_iu  : InstrItinClass;
+def IIC_iStore_bh_ru  : InstrItinClass;
+def IIC_iStore_bh_siu : InstrItinClass;
+def IIC_iStore_d_i   : InstrItinClass;
+def IIC_iStore_d_r   : InstrItinClass;
+def IIC_iStore_d_ru  : InstrItinClass;
+def IIC_iStore_m   : InstrItinClass;
+def IIC_iStore_mu  : InstrItinClass;
+def IIC_Preload    : InstrItinClass;
+def IIC_Br         : InstrItinClass;
+def IIC_fpSTAT     : InstrItinClass;
+def IIC_fpUNA32    : InstrItinClass;
+def IIC_fpUNA64    : InstrItinClass;
+def IIC_fpCMP32    : InstrItinClass;
+def IIC_fpCMP64    : InstrItinClass;
+def IIC_fpCVTSD    : InstrItinClass;
+def IIC_fpCVTDS    : InstrItinClass;
+def IIC_fpCVTSH    : InstrItinClass;
+def IIC_fpCVTHS    : InstrItinClass;
+def IIC_fpCVTIS    : InstrItinClass;
+def IIC_fpCVTID    : InstrItinClass;
+def IIC_fpCVTSI    : InstrItinClass;
+def IIC_fpCVTDI    : InstrItinClass;
+def IIC_fpMOVIS    : InstrItinClass;
+def IIC_fpMOVID    : InstrItinClass;
+def IIC_fpMOVSI    : InstrItinClass;
+def IIC_fpMOVDI    : InstrItinClass;
+def IIC_fpALU32    : InstrItinClass;
+def IIC_fpALU64    : InstrItinClass;
+def IIC_fpMUL32    : InstrItinClass;
+def IIC_fpMUL64    : InstrItinClass;
+def IIC_fpMAC32    : InstrItinClass;
+def IIC_fpMAC64    : InstrItinClass;
+def IIC_fpFMAC32   : InstrItinClass;
+def IIC_fpFMAC64   : InstrItinClass;
+def IIC_fpDIV32    : InstrItinClass;
+def IIC_fpDIV64    : InstrItinClass;
+def IIC_fpSQRT32   : InstrItinClass;
+def IIC_fpSQRT64   : InstrItinClass;
+def IIC_fpLoad32   : InstrItinClass;
+def IIC_fpLoad64   : InstrItinClass;
+def IIC_fpLoad_m   : InstrItinClass;
+def IIC_fpLoad_mu  : InstrItinClass;
+def IIC_fpStore32  : InstrItinClass;
+def IIC_fpStore64  : InstrItinClass;
+def IIC_fpStore_m  : InstrItinClass;
+def IIC_fpStore_mu : InstrItinClass;
+def IIC_VLD1       : InstrItinClass;
+def IIC_VLD1x2     : InstrItinClass;
+def IIC_VLD1x3     : InstrItinClass;
+def IIC_VLD1x4     : InstrItinClass;
+def IIC_VLD1u      : InstrItinClass;
+def IIC_VLD1x2u    : InstrItinClass;
+def IIC_VLD1x3u    : InstrItinClass;
+def IIC_VLD1x4u    : InstrItinClass;
+def IIC_VLD1ln     : InstrItinClass;
+def IIC_VLD1lnu    : InstrItinClass;
+def IIC_VLD1dup    : InstrItinClass;
+def IIC_VLD1dupu   : InstrItinClass;
+def IIC_VLD2       : InstrItinClass;
+def IIC_VLD2x2     : InstrItinClass;
+def IIC_VLD2u      : InstrItinClass;
+def IIC_VLD2x2u    : InstrItinClass;
+def IIC_VLD2ln     : InstrItinClass;
+def IIC_VLD2lnu    : InstrItinClass;
+def IIC_VLD2dup    : InstrItinClass;
+def IIC_VLD2dupu   : InstrItinClass;
+def IIC_VLD3       : InstrItinClass;
+def IIC_VLD3ln     : InstrItinClass;
+def IIC_VLD3u      : InstrItinClass;
+def IIC_VLD3lnu    : InstrItinClass;
+def IIC_VLD3dup    : InstrItinClass;
+def IIC_VLD3dupu   : InstrItinClass;
+def IIC_VLD4       : InstrItinClass;
+def IIC_VLD4ln     : InstrItinClass;
+def IIC_VLD4u      : InstrItinClass;
+def IIC_VLD4lnu    : InstrItinClass;
+def IIC_VLD4dup    : InstrItinClass;
+def IIC_VLD4dupu   : InstrItinClass;
+def IIC_VST1       : InstrItinClass;
+def IIC_VST1x2     : InstrItinClass;
+def IIC_VST1x3     : InstrItinClass;
+def IIC_VST1x4     : InstrItinClass;
+def IIC_VST1u      : InstrItinClass;
+def IIC_VST1x2u    : InstrItinClass;
+def IIC_VST1x3u    : InstrItinClass;
+def IIC_VST1x4u    : InstrItinClass;
+def IIC_VST1ln     : InstrItinClass;
+def IIC_VST1lnu    : InstrItinClass;
+def IIC_VST2       : InstrItinClass;
+def IIC_VST2x2     : InstrItinClass;
+def IIC_VST2u      : InstrItinClass;
+def IIC_VST2x2u    : InstrItinClass;
+def IIC_VST2ln     : InstrItinClass;
+def IIC_VST2lnu    : InstrItinClass;
+def IIC_VST3       : InstrItinClass;
+def IIC_VST3u      : InstrItinClass;
+def IIC_VST3ln     : InstrItinClass;
+def IIC_VST3lnu    : InstrItinClass;
+def IIC_VST4       : InstrItinClass;
+def IIC_VST4u      : InstrItinClass;
+def IIC_VST4ln     : InstrItinClass;
+def IIC_VST4lnu    : InstrItinClass;
+def IIC_VUNAD      : InstrItinClass;
+def IIC_VUNAQ      : InstrItinClass;
+def IIC_VBIND      : InstrItinClass;
+def IIC_VBINQ      : InstrItinClass;
+def IIC_VPBIND     : InstrItinClass;
+def IIC_VFMULD     : InstrItinClass;
+def IIC_VFMULQ     : InstrItinClass;
+def IIC_VMOV       : InstrItinClass;
+def IIC_VMOVImm    : InstrItinClass;
+def IIC_VMOVD      : InstrItinClass;
+def IIC_VMOVQ      : InstrItinClass;
+def IIC_VMOVIS     : InstrItinClass;
+def IIC_VMOVID     : InstrItinClass;
+def IIC_VMOVISL    : InstrItinClass;
+def IIC_VMOVSI     : InstrItinClass;
+def IIC_VMOVDI     : InstrItinClass;
+def IIC_VMOVN      : InstrItinClass;
+def IIC_VPERMD     : InstrItinClass;
+def IIC_VPERMQ     : InstrItinClass;
+def IIC_VPERMQ3    : InstrItinClass;
+def IIC_VMACD      : InstrItinClass;
+def IIC_VMACQ      : InstrItinClass;
+def IIC_VFMACD     : InstrItinClass;
+def IIC_VFMACQ     : InstrItinClass;
+def IIC_VRECSD     : InstrItinClass;
+def IIC_VRECSQ     : InstrItinClass;
+def IIC_VCNTiD     : InstrItinClass;
+def IIC_VCNTiQ     : InstrItinClass;
+def IIC_VUNAiD     : InstrItinClass;
+def IIC_VUNAiQ     : InstrItinClass;
+def IIC_VQUNAiD    : InstrItinClass;
+def IIC_VQUNAiQ    : InstrItinClass;
+def IIC_VBINiD     : InstrItinClass;
+def IIC_VBINiQ     : InstrItinClass;
+def IIC_VSUBiD     : InstrItinClass;
+def IIC_VSUBiQ     : InstrItinClass;
+def IIC_VBINi4D    : InstrItinClass;
+def IIC_VBINi4Q    : InstrItinClass;
+def IIC_VSUBi4D    : InstrItinClass;
+def IIC_VSUBi4Q    : InstrItinClass;
+def IIC_VABAD      : InstrItinClass;
+def IIC_VABAQ      : InstrItinClass;
+def IIC_VSHLiD     : InstrItinClass;
+def IIC_VSHLiQ     : InstrItinClass;
+def IIC_VSHLi4D    : InstrItinClass;
+def IIC_VSHLi4Q    : InstrItinClass;
+def IIC_VPALiD     : InstrItinClass;
+def IIC_VPALiQ     : InstrItinClass;
+def IIC_VMULi16D   : InstrItinClass;
+def IIC_VMULi32D   : InstrItinClass;
+def IIC_VMULi16Q   : InstrItinClass;
+def IIC_VMULi32Q   : InstrItinClass;
+def IIC_VMACi16D   : InstrItinClass;
+def IIC_VMACi32D   : InstrItinClass;
+def IIC_VMACi16Q   : InstrItinClass;
+def IIC_VMACi32Q   : InstrItinClass;
+def IIC_VEXTD      : InstrItinClass;
+def IIC_VEXTQ      : InstrItinClass;
+def IIC_VTB1       : InstrItinClass;
+def IIC_VTB2       : InstrItinClass;
+def IIC_VTB3       : InstrItinClass;
+def IIC_VTB4       : InstrItinClass;
+def IIC_VTBX1      : InstrItinClass;
+def IIC_VTBX2      : InstrItinClass;
+def IIC_VTBX3      : InstrItinClass;
+def IIC_VTBX4      : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "ARMScheduleV6.td"
+include "ARMScheduleA8.td"
+include "ARMScheduleA9.td"
+include "ARMScheduleSwift.td"
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
new file mode 100644
index 000000000000..2c6382542ab9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA8.td
@@ -0,0 +1,1075 @@
+//=- ARMScheduleA8.td - ARM Cortex-A8 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A8 processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from "Cortex-A8 Technical Reference Manual".
+// Functional Units.
+def A8_Pipe0   : FuncUnit; // pipeline 0
+def A8_Pipe1   : FuncUnit; // pipeline 1
+def A8_LSPipe  : FuncUnit; // Load / store pipeline
+def A8_NPipe   : FuncUnit; // NEON ALU/MUL pipe
+def A8_NLSPipe : FuncUnit; // NEON LS pipe
+//
+// Dual issue pipeline represented by A8_Pipe0 | A8_Pipe1
+//
+def CortexA8Itineraries : ProcessorItineraries<
+  [A8_Pipe0, A8_Pipe1, A8_LSPipe, A8_NPipe, A8_NLSPipe],
+  [], [
+  // Two fully-pipelined integer ALU pipelines
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx    , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iALUr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
+  InstrItinData<IIC_iALUsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1,[A8_Pipe0, A8_Pipe1]>], [2, 1, 2]>,
+  InstrItinData<IIC_iALUsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iBITr ,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 2]>,
+  InstrItinData<IIC_iBITsi,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iBITsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+  InstrItinData<IIC_iUNAr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iUNAsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>],[2, 2, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iCMPr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iCMPsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMPsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iTSTr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iTSTsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iTSTsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
+  InstrItinData<IIC_iMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
+  InstrItinData<IIC_iMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                             InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                                  InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<1, [A8_LSPipe]>], [5]>,
+  //
+  // Move instructions, conditional
+  InstrItinData<IIC_iCMOVi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2]>,
+  InstrItinData<IIC_iCMOVr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMOVsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMOVsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2,[InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                              InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [3, 1]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1]>,
+  InstrItinData<IIC_iMVNr , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMVNsi, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMVNsr, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [1, 1, 1]>,
+
+  // Integer multiply pipeline
+  // Result written in E5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  //
+  InstrItinData<IIC_iMUL16   , [InstrStage<1, [A8_Pipe0]>], [5, 1, 1]>,
+  InstrItinData<IIC_iMAC16   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL32   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1]>,
+  InstrItinData<IIC_iMAC32   , [InstrStage<2, [A8_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL64   , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+  InstrItinData<IIC_iMAC64   , [InstrStage<3, [A8_Pipe0]>], [6, 6, 1, 1]>,
+
+  // Integer load pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iLoad_d_i,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  // FIXME: lsl by 2 takes 1 cycle.
+  InstrItinData<IIC_iLoad_si  , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [4, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iLoad_siu , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [4, 3, 1, 1]>,
+  //
+  // Load multiple, def is the 5th operand. Pipeline 0 only.
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>],
+                [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                              [1, 2, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>],
+                [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Push, def is the 3th operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [A8_Pipe0], 0>,
+                                InstrStage<3, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>],
+                               [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                InstrStage<1, [A8_LSPipe]>,
+                                InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [4, 1]>,
+
+
+  // Integer store pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iStore_si , [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<2, [A8_LSPipe]>], [3, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<1, [A8_LSPipe]>], [2, 3, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru  , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                  InstrStage<1, [A8_LSPipe]>], [2, 3, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iStore_siu, [InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                 InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [A8_Pipe0, A8_Pipe1], 0>,
+                                   InstrStage<2, [A8_LSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Store multiple. Pipeline 0 only.
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_iStore_m , [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<2, [A8_Pipe0], 0>,
+                                InstrStage<2, [A8_LSPipe]>],
+                [2], [], -1>, // dynamic uops
+  //
+  // Preload
+  InstrItinData<IIC_Preload, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>], [2, 2]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br      , [InstrStage<1, [A8_Pipe0, A8_Pipe1]>]>,
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit. We assume
+  // RunFast mode so that NFP pipeline is used for single-precision when
+  // possible.
+  //
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                              InstrStage<1, [A8_NLSPipe]>], [20]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NPipe], 0>,
+                               InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NPipe], 0>,
+                               InstrStage<4, [A8_NLSPipe]>], [4, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<7, [A8_NPipe], 0>,
+                               InstrStage<7, [A8_NLSPipe]>], [7, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NPipe], 0>,
+                               InstrStage<5, [A8_NLSPipe]>], [5, 1]>,
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<8, [A8_NPipe], 0>,
+                               InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<8, [A8_NPipe], 0>,
+                               InstrStage<8, [A8_NLSPipe]>], [8, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<9, [A8_NPipe], 0>,
+                               InstrStage<9, [A8_NLSPipe]>], [9, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<11, [A8_NPipe], 0>,
+                               InstrStage<11, [A8_NLSPipe]>], [11, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<20, [A8_NPipe], 0>,
+                               InstrStage<20, [A8_NLSPipe]>], [20, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<29, [A8_NPipe], 0>,
+                               InstrStage<29, [A8_NLSPipe]>], [29, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<29, [A8_NPipe], 0>,
+                               InstrStage<29, [A8_NLSPipe]>], [29, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [2, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [2, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [20, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>],
+                              [20, 20, 1]>,
+
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // Double-precision FP Load
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // FP Load Multiple
+  // FIXME: A8_LSPipe cycle time is dynamic, this assumes 3 to 4 registers.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [1, 1, 1, 2], [], -1>, // dynamic uops
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [2, 1, 1, 1, 2], [], -1>, // dynamic uops
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Store
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                              [1, 1]>,
+  //
+  // FP Store Multiple
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>,
+                               InstrStage<1, [A8_NLSPipe], 0>,
+                               InstrStage<1, [A8_LSPipe]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                                InstrStage<1, [A8_NLSPipe], 0>,
+                                InstrStage<1, [A8_LSPipe]>,
+                                InstrStage<1, [A8_NLSPipe], 0>,
+                                InstrStage<1, [A8_LSPipe]>],
+                [2, 1, 1, 1, 1], [], -1>, // dynamic uops
+  // NEON
+  // Issue through integer pipeline, and execute in NEON unit.
+  //
+  // VLD1
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  // VLD1x2
+  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD1x3
+  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD1x4
+  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD1u
+  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD1x2u
+  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1]>,
+  //
+  // VLD1x3u
+  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1]>,
+  //
+  // VLD1x4u
+  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1]>,
+  //
+  // VLD1ln
+  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 1, 1, 1]>,
+  //
+  // VLD1lnu
+  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 2, 1, 1, 1, 1]>,
+  //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1, 1]>,
+  //
+  // VLD2
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2x2
+  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD2ln
+  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 3, 1, 1, 1, 1]>,
+  //
+  // VLD2u
+  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1, 1, 1]>,
+  //
+  // VLD2x2u
+  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1]>,
+  //
+  // VLD2lnu
+  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [3, 3, 2, 1, 1, 1, 1, 1]>,
+  //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 2, 2, 1, 1]>,
+  //
+  // VLD3
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3ln
+  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3u
+  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 2, 1]>,
+  //
+  // VLD3lnu
+  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 2, 1, 1]>,
+  //
+  // VLD4
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 4, 1]>,
+  //
+  // VLD4ln
+  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4u
+  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [3, 3, 4, 4, 2, 1]>,
+  //
+  // VLD4lnu
+  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<5, [A8_NLSPipe], 0>,
+                               InstrStage<5, [A8_LSPipe]>],
+                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
+  // VST1
+  InstrItinData<IIC_VST1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1]>,
+  //
+  // VST1x2
+  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST1x3
+  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST1x4
+  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1u
+  InstrItinData<IIC_VST1u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST1x2u
+  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST1x3u
+  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST1x4u
+  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1ln
+  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1]>,
+  //
+  // VST1lnu
+  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST2
+  InstrItinData<IIC_VST2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2x2
+  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2u
+  InstrItinData<IIC_VST2u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST2x2u
+  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2ln
+  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2lnu
+  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<2, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST3
+  InstrItinData<IIC_VST3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3u
+  InstrItinData<IIC_VST3u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST3ln
+  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3lnu
+  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<3, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST4
+  InstrItinData<IIC_VST4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4u
+  InstrItinData<IIC_VST4u,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4ln
+  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4lnu
+  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<4, [A8_NLSPipe], 0>,
+                               InstrStage<4, [A8_LSPipe]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // Double-register FP Unary
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2]>,
+  //
+  // Quad-register FP Unary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 2]>,
+  //
+  // Double-register FP Binary
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 2]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [5, 2, 1]>,
+
+  //
+  // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 2, 2]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 2, 1]>,
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [1, 1]>,
+  //
+  // Move Immediate
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3]>,
+  //
+  // Double-register Permute Move
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
+  //
+  // Quad-register Permute Move
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [20, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [20, 20, 1]>,
+  //
+  // Integer to Lane Move
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
+  //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN   , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [2, 1]>,
+  //
+  // Double-register Permute
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 2, 1, 1]>,
+  //
+  // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [4, 4, 1, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
+  // Double-register Reciprical Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
+  //
+  // Quad-register Reciprical Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 2, 2]>,
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [4, 2, 2]>,
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 2]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 2, 1]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 2, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [3, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [4, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [5, 1, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 1]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 2, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [6, 3, 2, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 2, 2]>,
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 2, 1]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 2, 2]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_NPipe]>], [9, 2, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [6, 3, 2, 2]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 2, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [7, 3, 2, 2]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>,
+                               InstrStage<2, [A8_NLSPipe], 0>,
+                               InstrStage<3, [A8_NPipe]>], [9, 3, 2, 1]>,
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>], [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 1]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>],[4, 2, 2, 3, 3, 1]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NLSPipe]>], [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                               InstrStage<2, [A8_NLSPipe]>],[4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NLSPipe]>,
+                               InstrStage<1, [A8_NPipe], 0>,
+                            InstrStage<2, [A8_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// This following definitions describe the simple machine model which
+// will replace itineraries.
+
+// Cortex-A8 machine model for scheduling and other instruction cost heuristics.
+def CortexA8Model : SchedMachineModel {
+  let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 2; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 13; // Based on estimate of pipeline depth.
+
+  let Itineraries = CortexA8Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
new file mode 100644
index 000000000000..9739ed20ce2e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -0,0 +1,2496 @@
+//=- ARMScheduleA9.td - ARM Cortex-A9 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A9 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+//
+// Ad-hoc scheduling information derived from pretty vague "Cortex-A9 Technical
+// Reference Manual".
+//
+// Functional units
+def A9_Issue0  : FuncUnit; // Issue 0
+def A9_Issue1  : FuncUnit; // Issue 1
+def A9_Branch  : FuncUnit; // Branch
+def A9_ALU0    : FuncUnit; // ALU / MUL pipeline 0
+def A9_ALU1    : FuncUnit; // ALU pipeline 1
+def A9_AGU     : FuncUnit; // Address generation unit for ld / st
+def A9_NPipe   : FuncUnit; // NEON pipeline
+def A9_MUX0    : FuncUnit; // AGU + NEON/FPU multiplexer
+def A9_LSUnit  : FuncUnit; // L/S Unit
+def A9_DRegsVFP: FuncUnit; // FP register set, VFP side
+def A9_DRegsN  : FuncUnit; // FP register set, NEON side
+
+// Bypasses
+def A9_LdBypass : Bypass;
+
+def CortexA9Itineraries : ProcessorItineraries<
+  [A9_Issue0, A9_Issue1, A9_Branch, A9_ALU0, A9_ALU1, A9_AGU, A9_NPipe, A9_MUX0,
+   A9_LSUnit, A9_DRegsVFP, A9_DRegsN],
+  [A9_LdBypass], [
+  // Two fully-pipelined integer ALU pipelines
+
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iMOVr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iMOVix2 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                                  InstrStage<1, [A9_ALU0, A9_ALU1]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_AGU], 0>,
+                               InstrStage<1, [A9_LSUnit]>], [5]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                              [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iMVNsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                              [2, 1]>,
+  InstrItinData<IIC_iMVNsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                              [3, 1, 1]>,
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                            [1, 1], [NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                            [1, 1, 1], [NoBypass, A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                            [2, 1, 1], [NoBypass, A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                            [2, 1, 1], [NoBypass, NoBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iALUsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                            [3, 1, 1, 1],
+                            [NoBypass, A9_LdBypass, NoBypass, NoBypass]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iBITr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
+  InstrItinData<IIC_iBITsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iBITsr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+
+  // CLZ, RBIT, etc.
+  InstrItinData<IIC_iUNAr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+
+  // BFC, BFI, UBFX, SBFX
+  InstrItinData<IIC_iUNAsi, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<1, [A9_ALU0, A9_ALU1]>], [2, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<2, [A9_ALU0, A9_ALU1]>], [3, 1, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                             InstrStage<3, [A9_ALU0, A9_ALU1]>], [3, 1, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [1], [A9_LdBypass]>,
+  InstrItinData<IIC_iCMPr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [1, 1], [A9_LdBypass, A9_LdBypass]>,
+  InstrItinData<IIC_iCMPsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>],
+                                [1, 1], [A9_LdBypass, NoBypass]>,
+  InstrItinData<IIC_iCMPsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>],
+                              [1, 1, 1], [A9_LdBypass, NoBypass, NoBypass]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iTSTr   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iTSTsi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iTSTsr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0, A9_ALU1]>], [1, 1, 1]>,
+  //
+  // Move instructions, conditional
+  // FIXME: Correctly model the extra input dep on the destination.
+  InstrItinData<IIC_iCMOVi  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1]>,
+  InstrItinData<IIC_iCMOVr  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iCMOVsi , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [1, 1]>,
+  InstrItinData<IIC_iCMOVsr , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0, A9_ALU1]>], [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>,
+                               InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_ALU0, A9_ALU1]>], [2]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData<IIC_iMUL16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>], [3, 1, 1]>,
+  InstrItinData<IIC_iMAC16  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>],
+                              [3, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>], [4, 1, 1]>,
+  InstrItinData<IIC_iMAC32  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<2, [A9_ALU0]>],
+                              [4, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0]>], [4, 5, 1, 1]>,
+  InstrItinData<IIC_iMAC64  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<3, [A9_ALU0]>],
+                              [4, 5, 1, 1]>,
+  // Integer load pipeline
+  // FIXME: The timings are some rough approximations
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 1], [A9_LdBypass]>,
+  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1], [A9_LdBypass]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1, 1], [A9_LdBypass]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iLoad_si  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit], 0>],
+                                [4, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [5, 1, 1], [A9_LdBypass]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 2, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1], [A9_LdBypass]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 2, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [3, 3, 1, 1], [A9_LdBypass]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iLoad_siu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>],
+                                [4, 3, 1, 1], [A9_LdBypass]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [5, 4, 1, 1], [A9_LdBypass]>,
+  //
+  // Load multiple, def is the 5th operand.
+  // FIXME: This assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [1, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+                         -1>, // dynamic uops
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [2, 1, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+                         -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 2, 1, 1, 3],
+                         [NoBypass, NoBypass, NoBypass, NoBypass, A9_LdBypass],
+                         -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass],
+                               -1>, // dynamic uops
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<2, [A9_AGU], 1>,
+                                InstrStage<2, [A9_LSUnit]>,
+                                InstrStage<1, [A9_Branch]>],
+                               [1, 1, 3],
+                               [NoBypass, NoBypass, A9_LdBypass],
+                               -1>, // dynamic uops
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<1, [A9_LSUnit]>,
+                                InstrStage<1, [A9_ALU0, A9_ALU1]>],
+                               [2, 1]>,
+
+  // Integer store pipeline
+  ///
+  // Immediate offset
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  // FIXME: If address is 64-bit aligned, AGU cycles is 1.
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<1, [A9_AGU], 0>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                 InstrStage<1, [A9_MUX0], 0>,
+                                 InstrStage<2, [A9_AGU], 1>,
+                                 InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iStore_si ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [1, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>], [3, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<1, [A9_AGU], 0>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                  InstrStage<1, [A9_MUX0], 0>,
+                                  InstrStage<2, [A9_AGU], 1>,
+                                  InstrStage<1, [A9_LSUnit]>],
+                                 [3, 1, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iStore_siu,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<1, [A9_AGU], 0>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [2, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                    InstrStage<1, [A9_MUX0], 0>,
+                                    InstrStage<2, [A9_AGU], 1>,
+                                    InstrStage<1, [A9_LSUnit]>],
+                                   [3, 1, 1, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStore_m , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>],
+                [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_AGU], 0>,
+                                InstrStage<2, [A9_LSUnit]>],
+                [2], [], -1>, // dynamic uops
+  //
+  // Preload
+  InstrItinData<IIC_Preload,   [InstrStage<1, [A9_Issue0, A9_Issue1]>], [1, 1]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br       , [InstrStage<1, [A9_Issue0], 0>,
+                                InstrStage<1, [A9_Issue1], 0>,
+                                InstrStage<1, [A9_Branch]>]>,
+
+  // VFP and NEON shares the same register file. This means that every VFP
+  // instruction should wait for full completion of the consecutive NEON
+  // instruction and vice-versa. We model this behavior with two artificial FUs:
+  // DRegsVFP and DRegsVFP.
+  //
+  // Every VFP instruction:
+  //  - Acquires DRegsVFP resource for 1 cycle
+  //  - Reserves DRegsN resource for the whole duration (including time to
+  //    register file writeback!).
+  // Every NEON instruction does the same but with FUs swapped.
+  //
+  // Since the reserved FU cannot be acquired, this models precisely
+  // "cross-domain" stalls.
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit.
+
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                              InstrStage<1, [A9_MUX0], 0>,
+                              InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                              InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                              InstrStage<1, [A9_NPipe]>],
+                             [1]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 4 cycles
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra latency cycles since wbck is 4 cycles
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+
+  //
+  // Single to Half FP Convert
+  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Half to Single FP Convert
+  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<5, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<6, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<7, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [8, 1, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<2,  [A9_NPipe]>],
+                              [9, 1, 1, 1]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [8, 1, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<2,  [A9_NPipe]>],
+                              [9, 1, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<16, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<10, [A9_NPipe]>],
+                              [15, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<26, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<20, [A9_NPipe]>],
+                              [25, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<18, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<13, [A9_NPipe]>],
+                              [17, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<33, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<28, [A9_NPipe]>],
+                              [32, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra 1 latency cycle since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               // Extra 1 latency cycle since wbck is 2 cycles
+                               InstrStage<3, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>],
+                              [2, 1]>,
+  //
+  // Double-precision to Integer Move
+  //
+  // On A9 move-from-VFP is free to issue with no stall if other VFP
+  // operations are in flight. I assume it still can't dual-issue though.
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>],
+                              [2, 1, 1]>,
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Load
+  // FIXME: Result latency is 1 if address is 64-bit aligned.
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1]>,
+  //
+  // FP Load Multiple
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Load Multiple + update
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                [2, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Store
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  //
+  // FP Store Multiple
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                [1, 1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Store Multiple + update
+  // FIXME: assumes 2 doubles which requires 2 LS cycles.
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                                InstrStage<1, [A9_MUX0], 0>,
+                                InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                                InstrStage<2, [A9_DRegsN],   0, Reserved>,
+                                InstrStage<1, [A9_NPipe], 0>,
+                                InstrStage<2, [A9_LSUnit]>],
+                [2, 1, 1, 1], [], -1>, // dynamic uops
+  // NEON
+  // VLD1
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1]>,
+  // VLD1x2
+  InstrItinData<IIC_VLD1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  // VLD1x3
+  InstrItinData<IIC_VLD1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
+  // VLD1x4
+  InstrItinData<IIC_VLD1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
+  // VLD1u
+  InstrItinData<IIC_VLD1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 2, 1]>,
+  // VLD1x2u
+  InstrItinData<IIC_VLD1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 2, 1]>,
+  // VLD1x3u
+  InstrItinData<IIC_VLD1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 1]>,
+  // VLD1x4u
+  InstrItinData<IIC_VLD1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 2, 2, 2, 1]>,
+  //
+  // VLD1ln
+  InstrItinData<IIC_VLD1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 1, 1, 1]>,
+  //
+  // VLD1lnu
+  InstrItinData<IIC_VLD1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 2, 1, 1, 1, 1]>,
+  //
+  // VLD1dup
+  InstrItinData<IIC_VLD1dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1]>,
+  //
+  // VLD1dupu
+  InstrItinData<IIC_VLD1dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1, 1]>,
+  //
+  // VLD2
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2x2
+  InstrItinData<IIC_VLD2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 1]>,
+  //
+  // VLD2ln
+  InstrItinData<IIC_VLD2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 1, 1, 1, 1]>,
+  //
+  // VLD2u
+  InstrItinData<IIC_VLD2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1, 1]>,
+  //
+  // VLD2x2u
+  InstrItinData<IIC_VLD2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 3, 2, 3, 2, 1]>,
+  //
+  // VLD2lnu
+  InstrItinData<IIC_VLD2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [3, 3, 2, 1, 1, 1, 1, 1]>,
+  //
+  // VLD2dup
+  InstrItinData<IIC_VLD2dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 1]>,
+  //
+  // VLD2dupu
+  InstrItinData<IIC_VLD2dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 2, 2, 1, 1]>,
+  //
+  // VLD3
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3ln
+  InstrItinData<IIC_VLD3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3u
+  InstrItinData<IIC_VLD3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1]>,
+  //
+  // VLD3lnu
+  InstrItinData<IIC_VLD3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<11,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<5, [A9_NPipe], 0>,
+                               InstrStage<5, [A9_LSUnit]>],
+                              [5, 5, 6, 2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VLD3dup
+  InstrItinData<IIC_VLD3dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 1]>,
+  //
+  // VLD3dupu
+  InstrItinData<IIC_VLD3dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 2, 1, 1]>,
+  //
+  // VLD4
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 1]>,
+  //
+  // VLD4ln
+  InstrItinData<IIC_VLD4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4u
+  InstrItinData<IIC_VLD4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<9,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [3, 3, 4, 4, 2, 1]>,
+  //
+  // VLD4lnu
+  InstrItinData<IIC_VLD4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<10,[A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe], 0>,
+                               InstrStage<4, [A9_LSUnit]>],
+                              [4, 4, 5, 5, 2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VLD4dup
+  InstrItinData<IIC_VLD4dup,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 1]>,
+  //
+  // VLD4dupu
+  InstrItinData<IIC_VLD4dupu, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 2, 3, 3, 2, 1, 1]>,
+  //
+  // VST1
+  InstrItinData<IIC_VST1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1x2
+  InstrItinData<IIC_VST1x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST1x3
+  InstrItinData<IIC_VST1x3,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST1x4
+  InstrItinData<IIC_VST1x4,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1u
+  InstrItinData<IIC_VST1u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST1x2u
+  InstrItinData<IIC_VST1x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST1x3u
+  InstrItinData<IIC_VST1x3u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST1x4u
+  InstrItinData<IIC_VST1x4u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST1ln
+  InstrItinData<IIC_VST1ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1]>,
+  //
+  // VST1lnu
+  InstrItinData<IIC_VST1lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1]>,
+  //
+  // VST2
+  InstrItinData<IIC_VST2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2x2
+  InstrItinData<IIC_VST2x2,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2u
+  InstrItinData<IIC_VST2u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST2x2u
+  InstrItinData<IIC_VST2x2u,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST2ln
+  InstrItinData<IIC_VST2ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [1, 1, 1, 1]>,
+  //
+  // VST2lnu
+  InstrItinData<IIC_VST2lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe], 0>,
+                               InstrStage<1, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1]>,
+  //
+  // VST3
+  InstrItinData<IIC_VST3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3u
+  InstrItinData<IIC_VST3u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST3ln
+  InstrItinData<IIC_VST3ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2]>,
+  //
+  // VST3lnu
+  InstrItinData<IIC_VST3lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe], 0>,
+                               InstrStage<3, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2]>,
+  //
+  // VST4
+  InstrItinData<IIC_VST4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4u
+  InstrItinData<IIC_VST4u,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4ln
+  InstrItinData<IIC_VST4ln,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [1, 1, 1, 1, 2, 2]>,
+  //
+  // VST4lnu
+  InstrItinData<IIC_VST4lnu,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<2, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe], 0>,
+                               InstrStage<2, [A9_LSUnit]>],
+                              [2, 1, 1, 1, 1, 1, 2, 2]>,
+
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 2]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 2]>,
+  //
+  // Double-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 1]>,
+  //
+  // Quad-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [4, 2, 1]>,
+
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 2, 2]>,
+  //
+  // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [4, 2, 2]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 2, 2]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 2, 2]>,
+
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 2, 1]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [9, 2, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 3, 2, 2]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 3, 2, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [7, 3, 2, 2]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [9, 3, 2, 1]>,
+
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1,1]>,
+  //
+  // Move Immediate
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3]>,
+  //
+  // Double-register Permute Move
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+  //
+  // Quad-register Permute Move
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [1, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<3, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 2, 1]>,
+  //
+  // Integer to Lane Move
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               InstrStage<4, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 1]>,
+
+  //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [3, 1]>,
+  //
+  // Double-register FP Unary
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2]>,
+  //
+  // Quad-register FP Unary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 2]>,
+  //
+  // Double-register FP Binary
+  // FIXME: We're using this itin for many instructions and [2, 2] here is too
+  // optimistic.
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2, 2]>,
+
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 1, 1]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [5, 2, 1]>,
+  //
+  // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  // FIXME: We're using this itin for many instructions and [2, 2] here is too
+  // optimistic.
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 2, 2]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [6, 2, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [8, 4, 2, 1]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register Fused FP Multiple-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [8, 4, 2, 1]>,
+  //
+  // Double-register Reciprical Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 10 cycles
+                               InstrStage<11, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [9, 2, 2]>,
+  //
+  // Quad-register Reciprical Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 11 cycles
+                               InstrStage<12, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [10, 2, 2]>,
+  //
+  // Double-register Permute
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 2, 1, 1]>,
+  //
+  // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 3, 1, 1]>,
+  //
+  // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 4, 1, 1]>,
+
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 6 cycles
+                               InstrStage<7, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 2, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<2, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 2, 2, 3, 3, 1]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<3, [A9_NPipe]>],
+                              [4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 8 cycles
+                               InstrStage<9, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [4, 1, 2, 2, 3, 3, 1]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler and will eventually replace itineraries.
+
+
+// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
+def CortexA9Model : SchedMachineModel {
+  let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+  let MinLatency = 0; // Data dependencies are allowed within dispatch groups.
+  let LoadLatency = 2; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let ILPWindow = 10; // Don't reschedule small blocks to hide
+                      // latency. Minimum latency requirements are already
+                      // modeled strictly by reserving resources.
+  let MispredictPenalty = 8; // Based on estimate of pipeline depth.
+
+  let Itineraries = CortexA9Itineraries;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+let SchedModel = CortexA9Model in {
+
+def A9UnitALU : ProcResource<2>;
+def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
+def A9UnitAGU : ProcResource<1>;
+def A9UnitLS  : ProcResource<1>;
+def A9UnitFP  : ProcResource<1> { let Buffered = 0; }
+def A9UnitB   : ProcResource<1>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write types with their resources and latency on A9.
+
+// Consume an issue slot, but no processor resources. This is useful when all
+// other writes associated with the operand have NumMicroOps = 0.
+def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; }
+
+// Write an integer register.
+def A9WriteI : SchedWriteRes<[A9UnitALU]>;
+// Write an integer shifted-by register
+def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
+
+// Basic ALU.
+def A9WriteALU : SchedWriteRes<[A9UnitALU]>;
+// ALU with operand shifted by immediate.
+def : WriteRes<WriteALUsi, [A9UnitALU]> { let Latency = 2; }
+// ALU with operand shifted by register.
+def A9WriteALUsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; }
+
+// Multiplication
+def A9WriteM   : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; }
+def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
+                                              let NumMicroOps = 0; }
+def A9WriteM16   : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
+def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
+                                                let NumMicroOps = 0; }
+
+// Floating-point
+// Only one FP or AGU instruction may issue per cycle. We model this
+// by having FP instructions consume the AGU resource.
+def A9WriteF      : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteFMov   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteFMulS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteFMulD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteFMAS   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
+def A9WriteFMAD   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteFDivS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
+def A9WriteFDivD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
+def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; }
+def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; }
+
+// NEON has an odd mix of latencies. Simply name the write types by latency.
+def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; }
+def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; }
+def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
+def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
+
+// Reserve A9UnitFP for 2 consecutive cycles.
+def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+  let Latency = 4;
+  let ResourceCycles = [2];
+}
+def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+  let Latency = 7;
+  let ResourceCycles = [2];
+}
+def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+  let Latency = 9;
+  let ResourceCycles = [2];
+}
+
+// Branches don't have a def operand but still consume resources.
+def A9WriteB : SchedWriteRes<[A9UnitB]>;
+
+// Address generation.
+def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
+
+// Load Integer.
+def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
+// Load the upper 32-bits using the same micro-op.
+def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
+                                     let NumMicroOps = 0; }
+// Offset shifted by register.
+def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+// Load (and zero extend) a byte.
+def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; }
+
+// Load or Store Float, aligned.
+def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; }
+
+// Store Integer.
+def A9WriteS : SchedWriteRes<[A9UnitLS]>;
+
+//===----------------------------------------------------------------------===//
+// Define resources dynamically for load multiple variants.
+
+// Define helpers for extra latency without consuming resources.
+def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; }
+foreach NumCycles = 2-8 in {
+def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>;
+} // foreach NumCycles
+
+// Define address generation sequences and predicates for 8 flavors of LDMs.
+foreach NumAddr = 1-8 in {
+
+// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive
+// latency for instructions that generate multiple loads or stores.
+def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
+
+// Define a predicate to select the LDM based on number of memory addresses.
+def A9LMAdr#NumAddr#Pred :
+  SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+
+} // foreach NumAddr
+
+// Fall-back for unknown LDMs.
+def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 0">;
+
+// LDM/VLDM/VLDn address generation latency & resources.
+// Dynamically select the A9WriteAdrN sequence using a predicate.
+def A9WriteLMAdr : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, [A9WriteAdr1]>,
+  SchedVar<A9LMAdr2Pred, [A9WriteAdr2]>,
+  SchedVar<A9LMAdr3Pred, [A9WriteAdr3]>,
+  SchedVar<A9LMAdr4Pred, [A9WriteAdr4]>,
+  SchedVar<A9LMAdr5Pred, [A9WriteAdr5]>,
+  SchedVar<A9LMAdr6Pred, [A9WriteAdr6]>,
+  SchedVar<A9LMAdr7Pred, [A9WriteAdr7]>,
+  SchedVar<A9LMAdr8Pred, [A9WriteAdr8]>,
+  // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers.
+  SchedVar<A9LMUnknownPred, [A9WriteAdr2]>]>;
+
+// Define LDM Resources.
+// These take no issue resource, so they can be combined with other
+// writes like WriteB.
+// A9WriteLMLo takes a single LS resource and 2 cycles.
+def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2;
+                                              let NumMicroOps = 0; }
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2;
+                                      let NumMicroOps = 0; }
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+foreach NumAddr = 1-8 in {
+def A9WriteL#NumAddr : WriteSequence<
+  [A9WriteLMLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+def A9WriteL#NumAddr#Hi : WriteSequence<
+  [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// LDM: Load multiple into 32-bit integer registers.
+
+// A9WriteLM variants expand into a pair of writes for each 64-bit
+// value loaded. When the number of registers is odd, the last
+// A9WriteLnHi is naturally ignored because the instruction has no
+// following def operands.  These variants take no issue resource, so
+// they may need to be part of a WriteSequence that includes A9WriteIssue.
+def A9WriteLM : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, [A9WriteL1, A9WriteL1Hi]>,
+  SchedVar<A9LMAdr2Pred, [A9WriteL1, A9WriteL1Hi,
+                          A9WriteL2, A9WriteL2Hi]>,
+  SchedVar<A9LMAdr3Pred, [A9WriteL1, A9WriteL1Hi,
+                          A9WriteL2, A9WriteL2Hi,
+                          A9WriteL3, A9WriteL3Hi]>,
+  SchedVar<A9LMAdr4Pred, [A9WriteL1, A9WriteL1Hi,
+                          A9WriteL2, A9WriteL2Hi,
+                          A9WriteL3, A9WriteL3Hi,
+                          A9WriteL4, A9WriteL4Hi]>,
+  SchedVar<A9LMAdr5Pred, [A9WriteL1, A9WriteL1Hi,
+                          A9WriteL2, A9WriteL2Hi,
+                          A9WriteL3, A9WriteL3Hi,
+                          A9WriteL4, A9WriteL4Hi,
+                          A9WriteL5, A9WriteL5Hi]>,
+  SchedVar<A9LMAdr6Pred, [A9WriteL1, A9WriteL1Hi,
+                          A9WriteL2, A9WriteL2Hi,
+                          A9WriteL3, A9WriteL3Hi,
+                          A9WriteL4, A9WriteL4Hi,
+                          A9WriteL5, A9WriteL5Hi,
+                          A9WriteL6, A9WriteL6Hi]>,
+  SchedVar<A9LMAdr7Pred, [A9WriteL1, A9WriteL1Hi,
+                          A9WriteL2, A9WriteL2Hi,
+                          A9WriteL3, A9WriteL3Hi,
+                          A9WriteL4, A9WriteL4Hi,
+                          A9WriteL5, A9WriteL5Hi,
+                          A9WriteL6, A9WriteL6Hi,
+                          A9WriteL7, A9WriteL7Hi]>,
+  SchedVar<A9LMAdr8Pred, [A9WriteL1, A9WriteL1Hi,
+                          A9WriteL2, A9WriteL2Hi,
+                          A9WriteL3, A9WriteL3Hi,
+                          A9WriteL4, A9WriteL4Hi,
+                          A9WriteL5, A9WriteL5Hi,
+                          A9WriteL6, A9WriteL6Hi,
+                          A9WriteL7, A9WriteL7Hi,
+                          A9WriteL8, A9WriteL8Hi]>,
+  // For unknown LDMs, define the maximum number of writes, but only
+  // make the first two consume resources.
+  SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
+                             A9WriteL2, A9WriteL2Hi,
+                             A9WriteL3Hi, A9WriteL3Hi,
+                             A9WriteL4Hi, A9WriteL4Hi,
+                             A9WriteL5Hi, A9WriteL5Hi,
+                             A9WriteL6Hi, A9WriteL6Hi,
+                             A9WriteL7Hi, A9WriteL7Hi,
+                             A9WriteL8Hi, A9WriteL8Hi]>]> {
+  let Variadic = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support.
+
+// A9WriteLfpOp is the same as A9WriteLSfp but takes no issue resources
+// so can be used in WriteSequences for in single-issue instructions that
+// encapsulate multiple loads.
+def A9WriteLfpOp : SchedWriteRes<[A9UnitLS, A9UnitFP]> {
+  let Latency = 1;
+  let NumMicroOps = 0;
+}
+
+foreach NumAddr = 1-8 in {
+
+// Helper for A9WriteLfp1-8: A sequence of fp loads with no micro-ops.
+def A9WriteLfp#NumAddr#Seq : WriteSequence<[A9WriteLfpOp], NumAddr>;
+
+// A9WriteLfp1-8 definitions are statically expanded into a sequence of
+// A9WriteLfpOps with additive latency that takes a single issue slot.
+// Used directly to describe NEON VLDn.
+def A9WriteLfp#NumAddr : WriteSequence<
+  [A9WriteIssue, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+// A9WriteLfp1-8Mov adds a cycle of latency and FP resource for
+// permuting loaded values.
+def A9WriteLfp#NumAddr#Mov : WriteSequence<
+  [A9WriteF, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+} // foreach NumAddr
+
+// Define VLDM/VSTM PreRA resources.
+// A9WriteLMfpPreRA are dynamically expanded into the correct
+// A9WriteLfp1-8 sequence based on a predicate. This supports the
+// preRA VLDM variants in which all 64-bit loads are written to the
+// same tuple of either single or double precision registers.
+def A9WriteLMfpPreRA : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, [A9WriteLfp1]>,
+  SchedVar<A9LMAdr2Pred, [A9WriteLfp2]>,
+  SchedVar<A9LMAdr3Pred, [A9WriteLfp3]>,
+  SchedVar<A9LMAdr4Pred, [A9WriteLfp4]>,
+  SchedVar<A9LMAdr5Pred, [A9WriteLfp5]>,
+  SchedVar<A9LMAdr6Pred, [A9WriteLfp6]>,
+  SchedVar<A9LMAdr7Pred, [A9WriteLfp7]>,
+  SchedVar<A9LMAdr8Pred, [A9WriteLfp8]>,
+  // For unknown VLDM/VSTM PreRA, assume 2xS registers.
+  SchedVar<A9LMUnknownPred, [A9WriteLfp2]>]>;
+
+// Define VLDM/VSTM PostRA Resources.
+// A9WriteLMfpLo takes a LS and FP resource and one issue slot but no latency.
+def A9WriteLMfpLo : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 0; }
+
+foreach NumAddr = 1-8 in {
+
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+def A9WriteLMfp#NumAddr : WriteSequence<
+  [A9WriteLMfpLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMfp#NumAddr#Hi : WriteSequence<
+  [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+} // foreach NumAddr
+
+// VLDM PostRA Variants. These variants expand A9WriteLMfpPostRA into a
+// pair of writes for each 64-bit data loaded. When the number of
+// registers is odd, the last WriteLMfpnHi is naturally ignored because
+// the instruction has no following def operands.
+def A9WriteLMfpPostRA : SchedWriteVariant<[
+  SchedVar<A9LMAdr1Pred, [A9WriteLMfp1, A9WriteLMfp1Hi]>,
+  SchedVar<A9LMAdr2Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+                          A9WriteLMfp2, A9WriteLMfp2Hi]>,
+  SchedVar<A9LMAdr3Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+                          A9WriteLMfp2, A9WriteLMfp2Hi,
+                          A9WriteLMfp3, A9WriteLMfp3Hi]>,
+  SchedVar<A9LMAdr4Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+                          A9WriteLMfp2, A9WriteLMfp2Hi,
+                          A9WriteLMfp3, A9WriteLMfp3Hi,
+                          A9WriteLMfp4, A9WriteLMfp4Hi]>,
+  SchedVar<A9LMAdr5Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+                          A9WriteLMfp2, A9WriteLMfp2Hi,
+                          A9WriteLMfp3, A9WriteLMfp3Hi,
+                          A9WriteLMfp4, A9WriteLMfp4Hi,
+                          A9WriteLMfp5, A9WriteLMfp5Hi]>,
+  SchedVar<A9LMAdr6Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+                          A9WriteLMfp2, A9WriteLMfp2Hi,
+                          A9WriteLMfp3, A9WriteLMfp3Hi,
+                          A9WriteLMfp4, A9WriteLMfp4Hi,
+                          A9WriteLMfp5, A9WriteLMfp5Hi,
+                          A9WriteLMfp6, A9WriteLMfp6Hi]>,
+  SchedVar<A9LMAdr7Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+                          A9WriteLMfp2, A9WriteLMfp2Hi,
+                          A9WriteLMfp3, A9WriteLMfp3Hi,
+                          A9WriteLMfp4, A9WriteLMfp4Hi,
+                          A9WriteLMfp5, A9WriteLMfp5Hi,
+                          A9WriteLMfp6, A9WriteLMfp6Hi,
+                          A9WriteLMfp7, A9WriteLMfp7Hi]>,
+  SchedVar<A9LMAdr8Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+                          A9WriteLMfp2, A9WriteLMfp2Hi,
+                          A9WriteLMfp3, A9WriteLMfp3Hi,
+                          A9WriteLMfp4, A9WriteLMfp4Hi,
+                          A9WriteLMfp5, A9WriteLMfp5Hi,
+                          A9WriteLMfp6, A9WriteLMfp6Hi,
+                          A9WriteLMfp7, A9WriteLMfp7Hi,
+                          A9WriteLMfp8, A9WriteLMfp8Hi]>,
+  // For unknown LDMs, define the maximum number of writes, but only
+  // make the first two consume resources.
+  SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+                             A9WriteLMfp2, A9WriteLMfp2Hi,
+                             A9WriteLMfp3Hi, A9WriteLMfp3Hi,
+                             A9WriteLMfp4Hi, A9WriteLMfp4Hi,
+                             A9WriteLMfp5Hi, A9WriteLMfp5Hi,
+                             A9WriteLMfp6Hi, A9WriteLMfp6Hi,
+                             A9WriteLMfp7Hi, A9WriteLMfp7Hi,
+                             A9WriteLMfp8Hi, A9WriteLMfp8Hi]>]> {
+  let Variadic = 1;
+}
+
+// Distinguish between our multiple MI-level forms of the same
+// VLDM/VSTM instructions.
+def A9PreRA : SchedPredicate<
+  "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+def A9PostRA : SchedPredicate<
+  "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+
+// VLDM represents all destination registers as a single register
+// tuple, unlike LDM. So the number of write operands is not variadic.
+def A9WriteLMfp : SchedWriteVariant<[
+  SchedVar<A9PreRA, [A9WriteLMfpPreRA]>,
+  SchedVar<A9PostRA, [A9WriteLMfpPostRA]>]>;
+
+//===----------------------------------------------------------------------===//
+// Resources for other (non LDM/VLDM) Variants.
+
+// These mov immediate writers are unconditionally expanded with
+// additive latency.
+def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>;
+def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, WriteALU]>;
+def A9WriteI2ld  : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>;
+
+// Some ALU operations can read loaded integer values one cycle early.
+def A9ReadALU : SchedReadAdvance<1,
+  [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi,
+   A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4,
+   A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8,
+   A9WriteL1Hi, A9WriteL2Hi, A9WriteL3Hi, A9WriteL4Hi,
+   A9WriteL5Hi, A9WriteL6Hi, A9WriteL7Hi, A9WriteL8Hi]>;
+
+// Read types for operands that are unconditionally read in cycle N
+// after the instruction issues, decreases producer latency by N-1.
+def A9Read2 : SchedReadAdvance<1>;
+def A9Read3 : SchedReadAdvance<2>;
+def A9Read4 : SchedReadAdvance<3>;
+
+//===----------------------------------------------------------------------===//
+// Map itinerary classes to scheduler read/write resources per operand.
+//
+// For ARM, we piggyback scheduler resources on the Itinerary classes
+// to avoid perturbing the existing instruction definitions.
+
+// This table follows the ARM Cortex-A9 Technical Reference Manuals,
+// mostly in order.
+
+def :ItinRW<[A9WriteI], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
+                         IIC_iMVNi,IIC_iMVNsi,
+                         IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
+def :ItinRW<[A9WriteI,A9ReadALU],[IIC_iMVNr]>;
+def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>;
+
+def :ItinRW<[A9WriteI2],   [IIC_iMOVix2,IIC_iCMOVix2]>;
+def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>;
+def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>;
+
+def :ItinRW<[WriteALU], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>;
+def :ItinRW<[WriteALU, A9ReadALU], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>;
+def :ItinRW<[WriteALU, A9ReadALU, A9ReadALU],[IIC_iALUr,IIC_iCMPr]>;
+def :ItinRW<[WriteALUsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>;
+def :ItinRW<[WriteALUsi, A9ReadALU], [IIC_iALUsi]>;
+def :ItinRW<[WriteALUsi, ReadDefault, A9ReadALU], [IIC_iALUsir]>; // RSB
+def :ItinRW<[A9WriteALUsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>;
+def :ItinRW<[A9WriteALUsr, A9ReadALU], [IIC_iALUsr,IIC_iCMPsr]>;
+
+// A9WriteHi ignored for MUL32.
+def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32,
+                                     IIC_iMUL64,IIC_iMAC64]>;
+// FIXME: SMLALxx needs itin classes
+def :ItinRW<[A9WriteM16, A9WriteM16Hi], [IIC_iMUL16,IIC_iMAC16]>;
+
+// TODO: For floating-point ops, we model the pipeline forwarding
+// latencies here. WAW latencies are sometimes longer.
+
+def :ItinRW<[A9WriteFMov], [IIC_fpSTAT, IIC_fpMOVIS, IIC_fpMOVID, IIC_fpMOVSI,
+                            IIC_fpUNA32, IIC_fpUNA64,
+                            IIC_fpCMP32, IIC_fpCMP64]>;
+def :ItinRW<[A9WriteFMov, A9WriteFMov], [IIC_fpMOVDI]>;
+def :ItinRW<[A9WriteF], [IIC_fpCVTSD, IIC_fpCVTDS, IIC_fpCVTSH, IIC_fpCVTHS,
+                         IIC_fpCVTIS, IIC_fpCVTID, IIC_fpCVTSI, IIC_fpCVTDI,
+                         IIC_fpALU32, IIC_fpALU64]>;
+def :ItinRW<[A9WriteFMulS], [IIC_fpMUL32]>;
+def :ItinRW<[A9WriteFMulD], [IIC_fpMUL64]>;
+def :ItinRW<[A9WriteFMAS], [IIC_fpMAC32]>;
+def :ItinRW<[A9WriteFMAD], [IIC_fpMAC64]>;
+def :ItinRW<[A9WriteFDivS], [IIC_fpDIV32]>;
+def :ItinRW<[A9WriteFDivD], [IIC_fpDIV64]>;
+def :ItinRW<[A9WriteFSqrtS], [IIC_fpSQRT32]>;
+def :ItinRW<[A9WriteFSqrtD], [IIC_fpSQRT64]>;
+
+def :ItinRW<[A9WriteB], [IIC_Br]>;
+
+// A9 PLD is processed in a dedicated unit.
+def :ItinRW<[], [IIC_Preload]>;
+
+// Note: We must assume that loads are aligned, since the machine
+// model cannot know this statically and A9 ignores alignment hints.
+
+// A9WriteAdr consumes AGU regardless address writeback. But it's
+// latency is only relevant for users of an updated address.
+def :ItinRW<[A9WriteL, A9WriteAdr], [IIC_iLoad_i,IIC_iLoad_r,
+                                     IIC_iLoad_iu,IIC_iLoad_ru]>;
+def :ItinRW<[A9WriteLsi, A9WriteAdr], [IIC_iLoad_si,IIC_iLoad_siu]>;
+def :ItinRW<[A9WriteLb, A9WriteAdr2], [IIC_iLoad_bh_i,IIC_iLoad_bh_r,
+                                       IIC_iLoad_bh_iu,IIC_iLoad_bh_ru]>;
+def :ItinRW<[A9WriteLbsi, A9WriteAdr2], [IIC_iLoad_bh_si,IIC_iLoad_bh_siu]>;
+def :ItinRW<[A9WriteL, A9WriteLHi, A9WriteAdr], [IIC_iLoad_d_i,IIC_iLoad_d_r,
+                                            IIC_iLoad_d_ru]>;
+// Store either has no def operands, or the one def for address writeback.
+def :ItinRW<[A9WriteAdr, A9WriteS], [IIC_iStore_i, IIC_iStore_r,
+                                     IIC_iStore_iu, IIC_iStore_ru,
+                                     IIC_iStore_d_i, IIC_iStore_d_r,
+                                     IIC_iStore_d_ru]>;
+def :ItinRW<[A9WriteAdr2, A9WriteS], [IIC_iStore_si, IIC_iStore_siu,
+                                      IIC_iStore_bh_i, IIC_iStore_bh_r,
+                                      IIC_iStore_bh_iu, IIC_iStore_bh_ru]>;
+def :ItinRW<[A9WriteAdr3, A9WriteS], [IIC_iStore_bh_si, IIC_iStore_bh_siu]>;
+
+// A9WriteML will be expanded into a separate write for each def
+// operand. Address generation consumes resources, but A9WriteLMAdr
+// is listed after all def operands, so has no effective latency.
+//
+// Note: A9WriteLM expands into an even number of def operands. The
+// actual number of def operands may be less by one.
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteIssue], [IIC_iLoad_m, IIC_iPop]>;
+
+// Load multiple with address writeback has an extra def operand in
+// front of the loaded registers.
+//
+// Reuse the load-multiple variants for store-multiple because the
+// resources are identical, For stores only the address writeback
+// has a def operand so the WriteL latencies are unused.
+def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu,
+                                                      IIC_iStore_m,
+                                                      IIC_iStore_mu]>;
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>;
+def :ItinRW<[A9WriteL, A9WriteAdr, WriteALU], [IIC_iLoadiALU]>;
+
+def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>;
+
+def :ItinRW<[A9WriteLMfp, A9WriteLMAdr], [IIC_fpLoad_m]>;
+def :ItinRW<[A9WriteLMAdr, A9WriteLMfp], [IIC_fpLoad_mu]>;
+def :ItinRW<[A9WriteAdr, A9WriteLSfp], [IIC_fpStore32, IIC_fpStore64,
+                                        IIC_fpStore_m, IIC_fpStore_mu]>;
+
+// Note: Unlike VLDM, VLD1 expects the writeback operand after the
+// normal writes.
+def :ItinRW<[A9WriteLfp1, A9WriteAdr1], [IIC_VLD1, IIC_VLD1u,
+                                         IIC_VLD1x2, IIC_VLD1x2u]>;
+def :ItinRW<[A9WriteLfp2, A9WriteAdr2], [IIC_VLD1x3, IIC_VLD1x3u,
+                                         IIC_VLD1x4, IIC_VLD1x4u,
+                                         IIC_VLD4dup, IIC_VLD4dupu]>;
+def :ItinRW<[A9WriteLfp1Mov, A9WriteAdr1], [IIC_VLD1dup, IIC_VLD1dupu,
+                                            IIC_VLD2, IIC_VLD2u,
+                                            IIC_VLD2dup, IIC_VLD2dupu]>;
+def :ItinRW<[A9WriteLfp2Mov, A9WriteAdr1], [IIC_VLD1ln, IIC_VLD1lnu,
+                                            IIC_VLD2x2, IIC_VLD2x2u,
+                                            IIC_VLD2ln, IIC_VLD2lnu]>;
+def :ItinRW<[A9WriteLfp3Mov, A9WriteAdr3], [IIC_VLD3, IIC_VLD3u,
+                                            IIC_VLD3dup, IIC_VLD3dupu]>;
+def :ItinRW<[A9WriteLfp4Mov, A9WriteAdr4], [IIC_VLD4, IIC_VLD4u,
+                                            IIC_VLD4ln, IIC_VLD4lnu]>;
+def :ItinRW<[A9WriteLfp5Mov, A9WriteAdr5], [IIC_VLD3ln, IIC_VLD3lnu]>;
+
+// Vector stores use similar resources to vector loads, so use the
+// same write types. The address write must be first for stores with
+// address writeback.
+def :ItinRW<[A9WriteAdr1, A9WriteLfp1], [IIC_VST1, IIC_VST1u,
+                                         IIC_VST1x2, IIC_VST1x2u,
+                                         IIC_VST1ln, IIC_VST1lnu,
+                                         IIC_VST2, IIC_VST2u,
+                                         IIC_VST2x2, IIC_VST2x2u,
+                                         IIC_VST2ln, IIC_VST2lnu]>;
+def :ItinRW<[A9WriteAdr2, A9WriteLfp2], [IIC_VST1x3, IIC_VST1x3u,
+                                         IIC_VST1x4, IIC_VST1x4u,
+                                         IIC_VST3, IIC_VST3u,
+                                         IIC_VST3ln, IIC_VST3lnu,
+                                         IIC_VST4, IIC_VST4u,
+                                         IIC_VST4ln, IIC_VST4lnu]>;
+
+// NEON moves.
+def :ItinRW<[A9WriteV2], [IIC_VMOVSI, IIC_VMOVDI, IIC_VMOVD, IIC_VMOVQ]>;
+def :ItinRW<[A9WriteV1], [IIC_VMOV, IIC_VMOVIS, IIC_VMOVID]>;
+def :ItinRW<[A9WriteV3], [IIC_VMOVISL, IIC_VMOVN]>;
+
+// NEON integer arithmetic
+//
+// VADD/VAND/VORR/VEOR/VBIC/VORN/VBIT/VBIF/VBSL
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VBINiD, IIC_VBINiQ]>;
+// VSUB/VMVN/VCLSD/VCLZD/VCNTD
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>;
+// VADDL/VSUBL/VNEG are mapped later under IIC_SHLi.
+// ...
+// VHADD/VRHADD/VQADD/VTST/VADH/VRADH
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>;
+// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>;
+// VQNEG/VQABS
+def :ItinRW<[A9WriteV4], [IIC_VQUNAiD, IIC_VQUNAiQ]>;
+// VABS
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VUNAiD, IIC_VUNAiQ]>;
+// VPADD/VPADDL are mapped later under IIC_SHLi.
+// ...
+// VCLSQ/VCLZQ/VCNTQ, takes two cycles.
+def :ItinRW<[A9Write2V4, A9Read3], [IIC_VCNTiQ]>;
+// VMOVimm/VMVNimm/VORRimm/VBICimm
+def :ItinRW<[A9WriteV3], [IIC_VMOVImm]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2], [IIC_VABAD, IIC_VABAQ]>;
+def :ItinRW<[A9WriteV6, A9Read3], [IIC_VPALiD, IIC_VPALiQ]>;
+
+// NEON integer multiply
+//
+// Note: these don't quite match the timing docs, but they do match
+// the original A9 itinerary.
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VMULi16D]>;
+def :ItinRW<[A9WriteV7, A9Read2, A9Read2], [IIC_VMULi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read2], [IIC_VMULi32D]>;
+def :ItinRW<[A9Write2V9, A9Read2], [IIC_VMULi32Q]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2, A9Read2], [IIC_VMACi16D]>;
+def :ItinRW<[A9WriteV7, A9Read3, A9Read2, A9Read2], [IIC_VMACi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read3, A9Read2], [IIC_VMACi32D]>;
+def :ItinRW<[A9Write2V9, A9Read3, A9Read2], [IIC_VMACi32Q]>;
+
+// NEON integer shift
+// TODO: Q,Q,Q shifts should actually reserve FP for 2 cycles.
+def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>;
+def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>;
+
+// NEON permute
+def :ItinRW<[A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>;
+def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2],
+            [IIC_VPERMQ3, IIC_VEXTQ]>;
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>;
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VTB2]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3], [IIC_VTB3]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3, A9Read3], [IIC_VTB4]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2], [IIC_VTBX1]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2, A9Read2], [IIC_VTBX2]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3], [IIC_VTBX3]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3, A9Read3],
+            [IIC_VTBX4]>;
+
+// NEON floating-point
+def :ItinRW<[A9WriteV5, A9Read2, A9Read2], [IIC_VBIND]>;
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VBINQ]>;
+def :ItinRW<[A9WriteV5, A9Read2], [IIC_VUNAD, IIC_VFMULD]>;
+def :ItinRW<[A9WriteV6, A9Read2], [IIC_VUNAQ, IIC_VFMULQ]>;
+def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>;
+def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>;
+def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>;
+def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>;
+
+// Map SchedRWs that are identical for cortexa9 to existing resources.
+def : SchedAlias<WriteALU, A9WriteALU>;
+def : SchedAlias<WriteALUsr, A9WriteALUsr>;
+def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
+def : SchedAlias<ReadALU, A9ReadALU>;
+def : SchedAlias<ReadALUsr, A9ReadALU>;
+// FIXME: need to special case AND, ORR, EOR, BIC because they don't read
+// advance. But our instrinfo claims it does.
+
+def : SchedAlias<WriteCMP, A9WriteALU>;
+def : SchedAlias<WriteCMPsi, A9WriteALU>;
+def : SchedAlias<WriteCMPsr, A9WriteALU>;
+} // SchedModel = CortexA9Model
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
new file mode 100644
index 000000000000..7c6df410706e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -0,0 +1,1144 @@
+//=- ARMScheduleSwift.td - Swift Scheduling Definitions -*- tablegen -*----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Swift processor..
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+def SW_DIS0 : FuncUnit;
+def SW_DIS1 : FuncUnit;
+def SW_DIS2 : FuncUnit;
+
+def SW_ALU0 : FuncUnit;
+def SW_ALU1 : FuncUnit;
+def SW_LS   : FuncUnit;
+def SW_IDIV : FuncUnit;
+def SW_FDIV : FuncUnit;
+
+// FIXME: Need bypasses.
+// FIXME: Model the multiple stages of IIC_iMOVix2, IIC_iMOVix2addpc, and
+//        IIC_iMOVix2ld better.
+// FIXME: Model the special immediate shifts that are not microcoded.
+// FIXME: Do we need to model the fact that uses of r15 in a micro-op force it
+//        to issue on pipe 1?
+// FIXME: Model the pipelined behavior of CMP / TST instructions.
+// FIXME: Better model the microcode stages of multiply instructions, especially
+//        conditional variants.
+// FIXME: Add preload instruction when it is documented.
+// FIXME: Model non-pipelined nature of FP div / sqrt unit.
+
+def SwiftItineraries : ProcessorItineraries<
+  [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMOVr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMOVsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMOVsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMOVix2 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                 [3]>,
+  InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                               InstrStage<1, [SW_LS]>],
+                              [5]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iMVNsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1]>,
+  InstrItinData<IIC_iALUr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1, 1]>,
+  InstrItinData<IIC_iALUsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1]>,
+  InstrItinData<IIC_iALUsir,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1]>,
+  InstrItinData<IIC_iALUsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1, 1]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1]>,
+  InstrItinData<IIC_iBITr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1, 1]>,
+  InstrItinData<IIC_iBITsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1]>,
+  InstrItinData<IIC_iBITsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+
+  // CLZ, RBIT, etc.
+  InstrItinData<IIC_iUNAr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1]>,
+
+  // BFC, BFI, UBFX, SBFX
+  InstrItinData<IIC_iUNAsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [2, 1]>,
+
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1]>,
+  InstrItinData<IIC_iEXTAr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1, 1]>,
+  InstrItinData<IIC_iEXTAsr,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                            [1, 1, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iCMPr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iCMPsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iCMPsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
+                              [1, 1, 1]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iTSTr   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iTSTsi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iTSTsr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<2, [SW_ALU0, SW_ALU1]>],
+                              [1, 1, 1]>,
+  //
+  // Move instructions, conditional
+  // FIXME: Correctly model the extra input dep on the destination.
+  InstrItinData<IIC_iCMOVi  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1]>,
+  InstrItinData<IIC_iCMOVr  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iCMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [1, 1]>,
+  InstrItinData<IIC_iCMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [2, 1, 1]>,
+  InstrItinData<IIC_iCMOVix2, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [2]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData<IIC_iMUL16  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [3, 1, 1]>,
+  InstrItinData<IIC_iMAC16  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [3, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL32  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  InstrItinData<IIC_iMAC32  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1, 1]>,
+  InstrItinData<IIC_iMUL64  , [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0], 1>,
+                               InstrStage<1, [SW_ALU0], 3>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [5, 5, 1, 1]>,
+  InstrItinData<IIC_iMAC64  , [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0], 1>,
+                               InstrStage<1, [SW_ALU0], 1>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1], 3>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [5, 6, 1, 1]>,
+  //
+  // Integer divide
+  InstrItinData<IIC_iDIV  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                             InstrStage<1, [SW_ALU0], 0>,
+                             InstrStage<14, [SW_IDIV]>],
+                            [14, 1, 1]>,
+
+  // Integer load pipeline
+  // FIXME: The timings are some rough approximations
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1]>,
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_LS], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 4, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS], 1>,
+                                 InstrStage<1, [SW_LS], 3>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                [3, 4, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iLoad_si  , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                 InstrStage<1, [SW_LS]>],
+                                [5, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                 InstrStage<1, [SW_LS]>],
+                                [5, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu  , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru  , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_ALU0], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [3, 1, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+                                 InstrStage<1, [SW_LS], 3>,
+                                 InstrStage<1, [SW_LS], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                [3, 4, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iLoad_siu , [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                 InstrStage<1, [SW_LS], 3>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                [5, 3, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_DIS2], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                  InstrStage<1, [SW_LS], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                                [5, 3, 1, 1]>,
+  //
+  // Load multiple, def is the 5th operand.
+  // FIXME: This assumes 3 to 4 registers.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+                                InstrStage<1, [SW_LS], 3>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                               [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop  ,    [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [1, 1, 3], [], -1>, // dynamic uops
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [1, 1, 3], [], -1>, // dynamic uops
+
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                InstrStage<1, [SW_LS], 3>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                               [4, 1]>,
+
+  // Integer store pipeline
+  ///
+  // Immediate offset
+  InstrItinData<IIC_iStore_i  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1]>,
+  InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1]>,
+  InstrItinData<IIC_iStore_d_i, [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r  , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r, [InstrStage<1, [SW_DIS0], 0>,
+                                 InstrStage<1, [SW_DIS1], 0>,
+                                 InstrStage<1, [SW_DIS2], 0>,
+                                 InstrStage<1, [SW_LS], 0>,
+                                 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                 InstrStage<1, [SW_LS]>],
+                                [1, 1, 1]>,
+  //
+  // Scaled register offset
+  InstrItinData<IIC_iStore_si ,  [InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu ,  [InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru ,  [InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [SW_DIS0], 0>,
+                                  InstrStage<1, [SW_DIS1], 0>,
+                                  InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                  InstrStage<1, [SW_LS]>],
+                                 [1, 1, 1, 1]>,
+  //
+  // Scaled register offset with update
+  InstrItinData<IIC_iStore_siu,    [InstrStage<1, [SW_DIS0], 0>,
+                                    InstrStage<1, [SW_DIS1], 0>,
+                                    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                    InstrStage<1, [SW_LS], 0>,
+                                    InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+                                   [3, 1, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [SW_DIS0], 0>,
+                                    InstrStage<1, [SW_DIS1], 0>,
+                                    InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+                                    InstrStage<1, [SW_LS], 0>,
+                                    InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+                                   [3, 1, 1, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStore_m , [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS], 1>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS], 1>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                                [], [], -1>, // dynamic uops
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu, [InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_DIS2], 0>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS], 1>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS], 1>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+                                InstrStage<1, [SW_LS]>],
+                               [2], [], -1>, // dynamic uops
+
+  //
+  // Preload
+  InstrItinData<IIC_Preload,   [InstrStage<1, [SW_DIS0], 0>], [1, 1]>,
+
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br       , [InstrStage<1, [SW_DIS0], 0>]>,
+
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                              InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                             [1]>,
+  //
+  // Single-precision FP Unary
+  //
+  // Most floating-point moves get issued on ALU0.
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [1, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+
+  //
+  // Single to Half FP Convert
+  InstrItinData<IIC_fpCVTSH , [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_ALU1], 4>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [6, 1]>,
+  //
+  // Half to Single FP Convert
+  InstrItinData<IIC_fpCVTHS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [6, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [12, 1, 1]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [12, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 0>,
+                               InstrStage<15, [SW_FDIV]>],
+                              [17, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 0>,
+                               InstrStage<30, [SW_FDIV]>],
+                              [32, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 0>,
+                               InstrStage<15, [SW_FDIV]>],
+                              [17, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 0>,
+                               InstrStage<30, [SW_FDIV]>],
+                              [32, 1, 1]>,
+
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 4>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [6, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [4, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [3, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 3>,
+                               InstrStage<1, [SW_LS]>],
+                              [3, 4, 1]>,
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [4, 1]>,
+  //
+  // Double-precision FP Load
+  InstrItinData<IIC_fpLoad64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [4, 1]>,
+  //
+  // FP Load Multiple
+  // FIXME: Assumes a single Q register.
+  InstrItinData<IIC_fpLoad_m, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [1, 1, 1, 4], [], -1>, // dynamic uops
+  //
+  // FP Load Multiple + update
+  // FIXME: Assumes a single Q register.
+  InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 4>,
+                               InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                              [2, 1, 1, 1, 4], [], -1>, // dynamic uops
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [1, 1]>,
+  //
+  // Double-precision FP Store
+  InstrItinData<IIC_fpStore64,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [1, 1]>,
+  //
+  // FP Store Multiple
+  // FIXME: Assumes a single Q register.
+  InstrItinData<IIC_fpStore_m,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [1, 1, 1], [], -1>, // dynamic uops
+  //
+  // FP Store Multiple + update
+  // FIXME: Assumes a single Q register.
+  InstrItinData<IIC_fpStore_mu,[InstrStage<1, [SW_DIS0], 0>,
+                                InstrStage<1, [SW_DIS1], 0>,
+                                InstrStage<1, [SW_LS], 4>,
+                                InstrStage<1, [SW_ALU0, SW_ALU1]>],
+                               [2, 1, 1, 1], [], -1>, // dynamic uops
+  // NEON
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1]>,
+  //
+  // Quad-register Integer CountQ-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4D,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Subtract (4 cycle)
+  InstrItinData<IIC_VSUBi4Q,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register Integer Count
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1, 1]>,
+  //
+  // Double-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Quad-register Absolute Difference and Accumulate
+  InstrItinData<IIC_VABAQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1, 1]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1, 1]>,
+
+  //
+  // Move
+  InstrItinData<IIC_VMOV,     [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+  //
+  // Move Immediate
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2]>,
+  //
+  // Double-register Permute Move
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1]>,
+  //
+  // Quad-register Permute Move
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 4>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [6, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [4, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_LS]>],
+                              [3, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 3>,
+                               InstrStage<1, [SW_LS]>],
+                              [3, 4, 1]>,
+  //
+  // Integer to Lane Move
+  // FIXME: I think this is correct, but it is not clear from the tuning guide.
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_LS], 4>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [6, 1]>,
+
+  //
+  // Vector narrow move
+  InstrItinData<IIC_VMOVN,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1]>,
+  //
+  // Double-register FP Unary
+  // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
+  //        and they issue on a different pipeline.
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+  //
+  // Quad-register FP Unary
+  // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
+  //        and they issue on a different pipeline.
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [2, 1]>,
+  //
+  // Double-register FP Binary
+  // FIXME: We're using this itin for many instructions.
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+
+  //
+  // VPADD, etc.
+  InstrItinData<IIC_VPBIND,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register FP VMUL
+  InstrItinData<IIC_VFMULD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register FP Binary
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU0]>],
+                              [4, 1, 1]>,
+  //
+  // Quad-register FP VMUL
+  InstrItinData<IIC_VFMULQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 1]>,
+  //
+  // Double-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Quad-register FP Multiple-Accumulate
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-register Fused FP Multiple-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Quad-register FusedF P Multiple-Accumulate
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-register Reciprical Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Quad-register Reciprical Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 1]>,
+  //
+  // Double-register Permute
+  // FIXME: The latencies are unclear from the documentation.
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [3, 4, 3, 4]>,
+  //
+  // Quad-register Permute
+  // FIXME: The latencies are unclear from the documentation.
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [3, 4, 3, 4]>,
+  //
+  // Quad-register Permute (3 cycle issue on A9)
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [3, 4, 3, 4]>,
+
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1, 1]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 3, 3]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [6, 1, 3, 5, 5]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 3, 5, 7, 7]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [2, 1, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [4, 1, 3, 3]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [6, 1, 3, 5, 5]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [SW_DIS0], 0>,
+                               InstrStage<1, [SW_DIS1], 0>,
+                               InstrStage<1, [SW_DIS2], 0>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1], 2>,
+                               InstrStage<1, [SW_ALU1]>],
+                              [8, 1, 3, 5, 7, 7]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// This following definitions describe the simple machine model which
+// will replace itineraries.
+
+// Swift machine model for scheduling and other instruction cost heuristics.
+def SwiftModel : SchedMachineModel {
+  let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
+  let MinLatency = 0; // Data dependencies are allowed within dispatch groups.
+  let LoadLatency = 3;
+  let MispredictPenalty = 14; // A branch direction mispredict.
+
+  let Itineraries = SwiftItineraries;
+}
+
+// Swift predicates.
+def IsFastImmShiftSwiftPred : SchedPredicate<[{TII->isSwiftFastImmShift(MI)}]>;
+
+// Swift resource mapping.
+let SchedModel = SwiftModel in {
+  // Processor resources.
+  def SwiftUnitP01 : ProcResource<2>; // ALU unit.
+  def SwiftUnitP0 : ProcResource<1> { let Super = SwiftUnitP01; } // Mul unit.
+  def SwiftUnitP1 : ProcResource<1> { let Super = SwiftUnitP01; } // Br unit.
+  def SwiftUnitP2 : ProcResource<1>; // LS unit.
+  def SwiftUnitDiv : ProcResource<1>;
+
+  // Generic resource requirements.
+  def SwiftWriteP01TwoCycle : SchedWriteRes<[SwiftUnitP01]> { let Latency = 2; }
+  def SwiftWriteP01ThreeCycleTwoUops :
+    SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP0ThreeCycleThreeUops : SchedWriteRes<[SwiftUnitP0]> {
+    let Latency = 3;
+    let NumMicroOps = 3;
+    let ResourceCycles = [3];
+  }
+
+  // 4.2.4 Arithmetic and Logical.
+  // ALU operation register shifted by immediate variant.
+  def SwiftWriteALUsi : SchedWriteVariant<[
+    // lsl #2, lsl #1, or lsr #1.
+    SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01TwoCycle]>,
+    SchedVar<NoSchedPred,             [WriteALU]>
+  ]>;
+  def SwiftWriteALUsr : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [SwiftWriteP01ThreeCycleTwoUops]>,
+    SchedVar<NoSchedPred,      [SwiftWriteP01TwoCycle]>
+  ]>;
+  def SwiftWriteALUSsr : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [SwiftWriteP0ThreeCycleThreeUops]>,
+    SchedVar<NoSchedPred,      [SwiftWriteP01TwoCycle]>
+  ]>;
+  def SwiftReadAdvanceALUsr : SchedReadVariant<[
+    SchedVar<IsPredicatedPred, [SchedReadAdvance<2>]>,
+    SchedVar<NoSchedPred,      [NoReadAdvance]>
+  ]>;
+  // ADC,ADD,NEG,RSB,RSC,SBC,SUB,ADR
+  // AND,BIC,EOR,ORN,ORR
+  // CLZ,RBIT,REV,REV16,REVSH,PKH
+  def : WriteRes<WriteALU, [SwiftUnitP01]>;
+  def : SchedAlias<WriteALUsi, SwiftWriteALUsi>;
+  def : SchedAlias<WriteALUsr, SwiftWriteALUsr>;
+  def : SchedAlias<WriteALUSsr, SwiftWriteALUSsr>;
+  def : ReadAdvance<ReadALU, 0>;
+  def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>;
+
+  // 4.2.5 Integer comparison
+  def : WriteRes<WriteCMP, [SwiftUnitP01]>;
+  def : WriteRes<WriteCMPsi, [SwiftUnitP01]>;
+  def : WriteRes<WriteCMPsr, [SwiftUnitP01]>;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
new file mode 100644
index 000000000000..0ace9bc1796d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleV6.td
@@ -0,0 +1,300 @@
+//===-- ARMScheduleV6.td - ARM v6 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM v6 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Model based on ARM1176
+//
+// Functional Units
+def V6_Pipe : FuncUnit; // pipeline
+
+// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual"
+//
+def ARMV6Itineraries : ProcessorItineraries<
+  [V6_Pipe], [], [
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx    , [InstrStage<1, [V6_Pipe]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iALUr    , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  InstrItinData<IIC_iALUsi   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsr   , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
+  // Bitwise Instructions that produce a result
+  InstrItinData<IIC_iBITi    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iBITr    , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  InstrItinData<IIC_iBITsi   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iBITsr   , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
+  // Unary Instructions that produce a result
+  InstrItinData<IIC_iUNAr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iUNAsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  //
+  // Zero and sign extension instructions
+  InstrItinData<IIC_iEXTr    , [InstrStage<1, [V6_Pipe]>], [1, 1]>,
+  InstrItinData<IIC_iEXTAr   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iEXTAsr  , [InstrStage<2, [V6_Pipe]>], [3, 3, 2, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iCMPr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iCMPsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iCMPsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  //
+  // Test instructions
+  InstrItinData<IIC_iTSTi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iTSTr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iTSTsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iTSTsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMOVr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iMOVsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iMOVsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_iMOVix2  , [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [V6_Pipe]>,
+                                  InstrStage<1, [V6_Pipe]>,
+                                  InstrStage<1, [V6_Pipe]>], [3]>,
+  InstrItinData<IIC_iMOVix2ld , [InstrStage<1, [V6_Pipe]>,
+                                 InstrStage<1, [V6_Pipe]>,
+                                 InstrStage<1, [V6_Pipe]>], [5]>,
+  //
+  // Move instructions, conditional
+  InstrItinData<IIC_iCMOVi   , [InstrStage<1, [V6_Pipe]>], [3]>,
+  InstrItinData<IIC_iCMOVr   , [InstrStage<1, [V6_Pipe]>], [3, 2]>,
+  InstrItinData<IIC_iCMOVsi  , [InstrStage<1, [V6_Pipe]>], [3, 1]>,
+  InstrItinData<IIC_iCMOVsr  , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  InstrItinData<IIC_iCMOVix2 , [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [4]>,
+  //
+  // MVN instructions
+  InstrItinData<IIC_iMVNi    , [InstrStage<1, [V6_Pipe]>], [2]>,
+  InstrItinData<IIC_iMVNr    , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  InstrItinData<IIC_iMVNsi   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iMVNsr   , [InstrStage<2, [V6_Pipe]>], [3, 2, 1]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData<IIC_iMUL16   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iMAC16   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1, 2]>,
+  InstrItinData<IIC_iMUL32   , [InstrStage<2, [V6_Pipe]>], [5, 1, 1]>,
+  InstrItinData<IIC_iMAC32   , [InstrStage<2, [V6_Pipe]>], [5, 1, 1, 2]>,
+  InstrItinData<IIC_iMUL64   , [InstrStage<3, [V6_Pipe]>], [6, 1, 1]>,
+  InstrItinData<IIC_iMAC64   , [InstrStage<3, [V6_Pipe]>], [6, 1, 1, 2]>,
+  
+  // Integer load pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoad_i   , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [V6_Pipe]>], [4, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoad_r   , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [V6_Pipe]>], [4, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iLoad_si   , [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_si, [InstrStage<2, [V6_Pipe]>], [5, 2, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoad_iu   , [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_iu, [InstrStage<1, [V6_Pipe]>], [4, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoad_ru   , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_bh_ru, [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  InstrItinData<IIC_iLoad_d_ru , [InstrStage<1, [V6_Pipe]>], [4, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iLoad_siu,   [InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+  InstrItinData<IIC_iLoad_bh_siu,[InstrStage<2, [V6_Pipe]>], [5, 2, 2, 1]>,
+
+  //
+  // Load multiple, def is the 5th operand.
+  InstrItinData<IIC_iLoad_m  , [InstrStage<3, [V6_Pipe]>], [1, 1, 1, 1, 4]>,
+  //
+  // Load multiple + update, defs are the 1st and 5th operands.
+  InstrItinData<IIC_iLoad_mu , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 1, 4]>,
+  //
+  // Load multiple plus branch
+  InstrItinData<IIC_iLoad_mBr, [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 1, 1, 4]>,
+
+  //
+  // iLoadi + iALUr for t2LDRpci_pic.
+  InstrItinData<IIC_iLoadiALU, [InstrStage<1, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [3, 1]>,
+
+  //
+  // Pop, def is the 3rd operand.
+  InstrItinData<IIC_iPop     , [InstrStage<3, [V6_Pipe]>], [1, 1, 4]>,
+  //
+  // Pop + branch, def is the 3rd operand.
+  InstrItinData<IIC_iPop_Br,   [InstrStage<3, [V6_Pipe]>,
+                                InstrStage<1, [V6_Pipe]>], [1, 2, 4]>,
+
+  // Integer store pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iStore_i   , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iStore_bh_i, [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  InstrItinData<IIC_iStore_d_i , [InstrStage<1, [V6_Pipe]>], [2, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStore_r   , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_r, [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  InstrItinData<IIC_iStore_d_r , [InstrStage<1, [V6_Pipe]>], [2, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iStore_si   , [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_si, [InstrStage<2, [V6_Pipe]>], [2, 2, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStore_iu   , [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_iu, [InstrStage<1, [V6_Pipe]>], [2, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStore_ru,   [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [V6_Pipe]>], [2, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iStore_siu,   [InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+  InstrItinData<IIC_iStore_bh_siu,[InstrStage<2, [V6_Pipe]>], [2, 2, 2, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStore_m  , [InstrStage<3, [V6_Pipe]>]>,
+  //
+  // Store multiple + update
+  InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>,
+  
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br      , [InstrStage<1, [V6_Pipe]>]>,
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit. We assume
+  // RunFast mode so that NFP pipeline is used for single-precision when
+  // possible.
+  //
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [V6_Pipe]>], [3]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [V6_Pipe]>], [2, 2]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [V6_Pipe]>], [5, 2]>,
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [V6_Pipe]>], [9, 2]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [V6_Pipe]>], [34, 2, 2]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_fpMOVIS,  [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_fpMOVID,  [InstrStage<1, [V6_Pipe]>], [10, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_fpMOVSI,  [InstrStage<1, [V6_Pipe]>], [10, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_fpMOVDI,  [InstrStage<1, [V6_Pipe]>], [10, 10, 1]>,
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
+  //
+  // Double-precision FP Load
+  InstrItinData<IIC_fpLoad64 , [InstrStage<1, [V6_Pipe]>], [5, 2, 2]>,
+  //
+  // FP Load Multiple
+  InstrItinData<IIC_fpLoad_m , [InstrStage<3, [V6_Pipe]>], [2, 1, 1, 5]>,
+  //
+  // FP Load Multiple + update
+  InstrItinData<IIC_fpLoad_mu, [InstrStage<3, [V6_Pipe]>], [3, 2, 1, 1, 5]>,
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  //
+  // Double-precision FP Store
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpStore64 , [InstrStage<1, [V6_Pipe]>], [2, 2, 2]>,
+  //
+  // FP Store Multiple
+  InstrItinData<IIC_fpStore_m, [InstrStage<3, [V6_Pipe]>], [2, 2, 2, 2]>,
+  //
+  // FP Store Multiple + update
+  InstrItinData<IIC_fpStore_mu,[InstrStage<3, [V6_Pipe]>], [3, 2, 2, 2, 2]>
+]>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..41a7e0c2c8a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -0,0 +1,200 @@
+//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-selectiondag-info"
+#include "ARMTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM)
+  : TargetSelectionDAGInfo(TM),
+    Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
+}
+
+ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
+}
+
+SDValue
+ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                             SDValue Chain,
+                                             SDValue Dst, SDValue Src,
+                                             SDValue Size, unsigned Align,
+                                             bool isVolatile, bool AlwaysInline,
+                                             MachinePointerInfo DstPtrInfo,
+                                          MachinePointerInfo SrcPtrInfo) const {
+  // Do repeated 4-byte loads and stores. To be improved.
+  // This requires 4-byte alignment.
+  if ((Align & 3) != 0)
+    return SDValue();
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
+    return SDValue();
+
+  unsigned BytesLeft = SizeVal & 3;
+  unsigned NumMemOps = SizeVal >> 2;
+  unsigned EmittedNumMemOps = 0;
+  EVT VT = MVT::i32;
+  unsigned VTSize = 4;
+  unsigned i = 0;
+  const unsigned MAX_LOADS_IN_LDM = 6;
+  SDValue TFOps[MAX_LOADS_IN_LDM];
+  SDValue Loads[MAX_LOADS_IN_LDM];
+  uint64_t SrcOff = 0, DstOff = 0;
+
+  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
+  // same number of stores.  The loads and stores will get combined into
+  // ldm/stm later on.
+  while (EmittedNumMemOps < NumMemOps) {
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      Loads[i] = DAG.getLoad(VT, dl, Chain,
+                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+                                         DAG.getConstant(SrcOff, MVT::i32)),
+                             SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
+                             false, false, 0);
+      TFOps[i] = Loads[i].getValue(1);
+      SrcOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                              DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+                                          DAG.getConstant(DstOff, MVT::i32)),
+                              DstPtrInfo.getWithOffset(DstOff),
+                              isVolatile, false, 0);
+      DstOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+    EmittedNumMemOps += i;
+  }
+
+  if (BytesLeft == 0)
+    return Chain;
+
+  // Issue loads / stores for the trailing (1 - 3) bytes.
+  unsigned BytesLeftSave = BytesLeft;
+  i = 0;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    Loads[i] = DAG.getLoad(VT, dl, Chain,
+                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+                                       DAG.getConstant(SrcOff, MVT::i32)),
+                           SrcPtrInfo.getWithOffset(SrcOff),
+                           false, false, false, 0);
+    TFOps[i] = Loads[i].getValue(1);
+    ++i;
+    SrcOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+  i = 0;
+  BytesLeft = BytesLeftSave;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+                                        DAG.getConstant(DstOff, MVT::i32)),
+                            DstPtrInfo.getWithOffset(DstOff), false, false, 0);
+    ++i;
+    DstOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+}
+
+// Adjust parameters for memset, EABI uses format (ptr, size, value),
+// GNU library uses (ptr, value, size)
+// See RTABI section 4.3.4
+SDValue ARMSelectionDAGInfo::
+EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                        SDValue Chain, SDValue Dst,
+                        SDValue Src, SDValue Size,
+                        unsigned Align, bool isVolatile,
+                        MachinePointerInfo DstPtrInfo) const {
+  // Use default for non AAPCS (or Darwin) subtargets
+  if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetDarwin())
+    return SDValue();
+
+  const ARMTargetLowering &TLI =
+    *static_cast<const ARMTargetLowering*>(DAG.getTarget().getTargetLowering());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  // First argument: data pointer
+  Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
+  Entry.Node = Dst;
+  Entry.Ty = IntPtrTy;
+  Args.push_back(Entry);
+
+  // Second argument: buffer size
+  Entry.Node = Size;
+  Entry.Ty = IntPtrTy;
+  Entry.isSExt = false;
+  Args.push_back(Entry);
+
+  // Extend or truncate the argument to be an i32 value for the call.
+  if (Src.getValueType().bitsGT(MVT::i32))
+    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
+  else
+    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
+
+  // Third argument: value to fill
+  Entry.Node = Src;
+  Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+  Entry.isSExt = true;
+  Args.push_back(Entry);
+
+  // Emit __eabi_memset call
+  TargetLowering::CallLoweringInfo CLI(Chain,
+                    Type::getVoidTy(*DAG.getContext()), // return type
+                    false, // return sign ext
+                    false, // return zero ext
+                    false, // is var arg
+                    false, // is in regs
+                    0,     // number of fixed arguments
+                    TLI.getLibcallCallingConv(RTLIB::MEMSET), // call conv
+                    false, // is tail call
+                    false, // does not return
+                    false, // is return val used
+                    DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
+                                          TLI.getPointerTy()), // callee
+                    Args, DAG, dl);
+  std::pair<SDValue,SDValue> CallResult =
+    TLI.LowerCallTo(CLI);
+
+  return CallResult.second;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
new file mode 100644
index 000000000000..6419a737295a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -0,0 +1,68 @@
+//===-- ARMSelectionDAGInfo.h - ARM SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMSELECTIONDAGINFO_H
+#define ARMSELECTIONDAGINFO_H
+
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+namespace ARM_AM {
+  static inline ShiftOpc getShiftOpcForNode(unsigned Opcode) {
+    switch (Opcode) {
+    default:          return ARM_AM::no_shift;
+    case ISD::SHL:    return ARM_AM::lsl;
+    case ISD::SRL:    return ARM_AM::lsr;
+    case ISD::SRA:    return ARM_AM::asr;
+    case ISD::ROTR:   return ARM_AM::ror;
+    //case ISD::ROTL:  // Only if imm -> turn into ROTR.
+    // Can't handle RRX here, because it would require folding a flag into
+    // the addressing mode.  :(  This causes us to miss certain things.
+    //case ARMISD::RRX: return ARM_AM::rrx;
+    }
+  }
+}  // end namespace ARM_AM
+
+class ARMSelectionDAGInfo : public TargetSelectionDAGInfo {
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const ARMSubtarget *Subtarget;
+
+public:
+  explicit ARMSelectionDAGInfo(const TargetMachine &TM);
+  ~ARMSelectionDAGInfo();
+
+  virtual
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                  SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool isVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const;
+
+  // Adjust parameters for memset, see RTABI section 4.3.4
+  virtual
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                  SDValue Chain,
+                                  SDValue Op1, SDValue Op2,
+                                  SDValue Op3, unsigned Align,
+                                  bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
new file mode 100644
index 000000000000..739300e4eff9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -0,0 +1,241 @@
+//===-- ARMSubtarget.cpp - ARM Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "ARMGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ReserveR9("arm-reserve-r9", cl::Hidden,
+          cl::desc("Reserve R9, making it unavailable as GPR"));
+
+static cl::opt<bool>
+DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+UseFusedMulOps("arm-use-mulops",
+               cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+StrictAlign("arm-strict-align", cl::Hidden,
+            cl::desc("Disallow all unaligned memory accesses"));
+
+ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
+                           const std::string &FS, const TargetOptions &Options)
+  : ARMGenSubtargetInfo(TT, CPU, FS)
+  , ARMProcFamily(Others)
+  , stackAlignment(4)
+  , CPUString(CPU)
+  , TargetTriple(TT)
+  , Options(Options)
+  , TargetABI(ARM_ABI_APCS) {
+  initializeEnvironment();
+  resetSubtargetFeatures(CPU, FS);
+}
+
+void ARMSubtarget::initializeEnvironment() {
+  HasV4TOps = false;
+  HasV5TOps = false;
+  HasV5TEOps = false;
+  HasV6Ops = false;
+  HasV6T2Ops = false;
+  HasV7Ops = false;
+  HasVFPv2 = false;
+  HasVFPv3 = false;
+  HasVFPv4 = false;
+  HasNEON = false;
+  UseNEONForSinglePrecisionFP = false;
+  UseMulOps = UseFusedMulOps;
+  SlowFPVMLx = false;
+  HasVMLxForwarding = false;
+  SlowFPBrcc = false;
+  InThumbMode = false;
+  HasThumb2 = false;
+  IsMClass = false;
+  NoARM = false;
+  PostRAScheduler = false;
+  IsR9Reserved = ReserveR9;
+  UseMovt = false;
+  SupportsTailCall = false;
+  HasFP16 = false;
+  HasD16 = false;
+  HasHardwareDivide = false;
+  HasHardwareDivideInARM = false;
+  HasT2ExtractPack = false;
+  HasDataBarrier = false;
+  Pref32BitThumb = false;
+  AvoidCPSRPartialUpdate = false;
+  AvoidMOVsShifterOperand = false;
+  HasRAS = false;
+  HasMPExtension = false;
+  FPOnlySP = false;
+  AllowsUnalignedMem = false;
+  Thumb2DSP = false;
+  UseNaClTrap = false;
+  UnsafeFPMath = false;
+}
+
+void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
+  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+  Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                           "target-cpu");
+  Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                          "target-features");
+  std::string CPU =
+    !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : "";
+  std::string FS =
+    !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+  if (!FS.empty()) {
+    initializeEnvironment();
+    resetSubtargetFeatures(CPU, FS);
+  }
+}
+
+void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
+  if (CPUString.empty())
+    CPUString = "generic";
+
+  // Insert the architecture feature derived from the target triple into the
+  // feature string. This is important for setting features that are implied
+  // based on the architecture version.
+  std::string ArchFS = ARM_MC::ParseARMTriple(TargetTriple.getTriple(),
+                                              CPUString);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = ArchFS + "," + FS.str();
+    else
+      ArchFS = FS;
+  }
+  ParseSubtargetFeatures(CPUString, ArchFS);
+
+  // Thumb2 implies at least V6T2. FIXME: Fix tests to explicitly specify a
+  // ARM version or CPU and then remove this.
+  if (!HasV6T2Ops && hasThumb2())
+    HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6T2Ops = true;
+
+  // Keep a pointer to static instruction cost data for the specified CPU.
+  SchedModel = getSchedModelForCPU(CPUString);
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUString);
+
+  if ((TargetTriple.getTriple().find("eabi") != std::string::npos) ||
+      (isTargetIOS() && isMClass()))
+    // FIXME: We might want to separate AAPCS and EABI. Some systems, e.g.
+    // Darwin-EABI conforms to AACPS but not the rest of EABI.
+    TargetABI = ARM_ABI_AAPCS;
+
+  if (isAAPCS_ABI())
+    stackAlignment = 8;
+
+  if (!isTargetIOS())
+    UseMovt = hasV6T2Ops();
+  else {
+    IsR9Reserved = ReserveR9 | !HasV6Ops;
+    UseMovt = DarwinUseMOVT && hasV6T2Ops();
+    SupportsTailCall = !getTargetTriple().isOSVersionLT(5, 0);
+  }
+
+  if (!isThumb() || hasThumb2())
+    PostRAScheduler = true;
+
+  // v6+ may or may not support unaligned mem access depending on the system
+  // configuration.
+  if (!StrictAlign && hasV6Ops() && isTargetDarwin())
+    AllowsUnalignedMem = true;
+
+  // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
+  uint64_t Bits = getFeatureBits();
+  if ((Bits & ARM::ProcA5 || Bits & ARM::ProcA8) && // Where this matters
+      (Options.UnsafeFPMath || isTargetDarwin()))
+    UseNEONForSinglePrecisionFP = true;
+}
+
+/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
+bool
+ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV,
+                                 Reloc::Model RelocM) const {
+  if (RelocM == Reloc::Static)
+    return false;
+
+  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+  // load from stub.
+  bool isDecl = GV->hasAvailableExternallyLinkage();
+  if (GV->isDeclaration() && !GV->isMaterializable())
+    isDecl = true;
+
+  if (!isTargetDarwin()) {
+    // Extra load is needed for all externally visible.
+    if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+      return false;
+    return true;
+  } else {
+    if (RelocM == Reloc::PIC_) {
+      // If this is a strong reference to a definition, it is definitely not
+      // through a stub.
+      if (!isDecl && !GV->isWeakForLinker())
+        return false;
+
+      // Unless we have a symbol with hidden visibility, we have to go through a
+      // normal $non_lazy_ptr stub because this symbol might be resolved late.
+      if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+        return true;
+
+      // If symbol visibility is hidden, we have a stub for common symbol
+      // references and external declarations.
+      if (isDecl || GV->hasCommonLinkage())
+        // Hidden $non_lazy_ptr reference.
+        return true;
+
+      return false;
+    } else {
+      // If this is a strong reference to a definition, it is definitely not
+      // through a stub.
+      if (!isDecl && !GV->isWeakForLinker())
+        return false;
+
+      // Unless we have a symbol with hidden visibility, we have to go through a
+      // normal $non_lazy_ptr stub because this symbol might be resolved late.
+      if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+        return true;
+    }
+  }
+
+  return false;
+}
+
+unsigned ARMSubtarget::getMispredictionPenalty() const {
+  return SchedModel->MispredictPenalty;
+}
+
+bool ARMSubtarget::enablePostRAScheduler(
+           CodeGenOpt::Level OptLevel,
+           TargetSubtargetInfo::AntiDepBreakMode& Mode,
+           RegClassVector& CriticalPathRCs) const {
+  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(&ARM::GPRRegClass);
+  return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
new file mode 100644
index 000000000000..5b5ee6aeb865
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -0,0 +1,315 @@
+//===-- ARMSubtarget.h - Define Subtarget for the ARM ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMSUBTARGET_H
+#define ARMSUBTARGET_H
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "ARMGenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetOptions;
+
+class ARMSubtarget : public ARMGenSubtargetInfo {
+protected:
+  enum ARMProcFamilyEnum {
+    Others, CortexA5, CortexA8, CortexA9, CortexA15, CortexR5, Swift
+  };
+
+  /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
+  ARMProcFamilyEnum ARMProcFamily;
+
+  /// HasV4TOps, HasV5TOps, HasV5TEOps, HasV6Ops, HasV6T2Ops, HasV7Ops -
+  /// Specify whether target support specific ARM ISA variants.
+  bool HasV4TOps;
+  bool HasV5TOps;
+  bool HasV5TEOps;
+  bool HasV6Ops;
+  bool HasV6T2Ops;
+  bool HasV7Ops;
+
+  /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON - Specify what
+  /// floating point ISAs are supported.
+  bool HasVFPv2;
+  bool HasVFPv3;
+  bool HasVFPv4;
+  bool HasNEON;
+
+  /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
+  /// specified. Use the method useNEONForSinglePrecisionFP() to
+  /// determine if NEON should actually be used.
+  bool UseNEONForSinglePrecisionFP;
+
+  /// UseMulOps - True if non-microcoded fused integer multiply-add and
+  /// multiply-subtract instructions should be used.
+  bool UseMulOps;
+
+  /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
+  /// whether the FP VML[AS] instructions are slow (if so, don't use them).
+  bool SlowFPVMLx;
+
+  /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
+  /// forwarding to allow mul + mla being issued back to back.
+  bool HasVMLxForwarding;
+
+  /// SlowFPBrcc - True if floating point compare + branch is slow.
+  bool SlowFPBrcc;
+
+  /// InThumbMode - True if compiling for Thumb, false for ARM.
+  bool InThumbMode;
+
+  /// HasThumb2 - True if Thumb2 instructions are supported.
+  bool HasThumb2;
+
+  /// IsMClass - True if the subtarget belongs to the 'M' profile of CPUs -
+  /// v6m, v7m for example.
+  bool IsMClass;
+
+  /// NoARM - True if subtarget does not support ARM mode execution.
+  bool NoARM;
+
+  /// PostRAScheduler - True if using post-register-allocation scheduler.
+  bool PostRAScheduler;
+
+  /// IsR9Reserved - True if R9 is a not available as general purpose register.
+  bool IsR9Reserved;
+
+  /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit
+  /// imms (including global addresses).
+  bool UseMovt;
+
+  /// SupportsTailCall - True if the OS supports tail call. The dynamic linker
+  /// must be able to synthesize call stubs for interworking between ARM and
+  /// Thumb.
+  bool SupportsTailCall;
+
+  /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF
+  /// only so far)
+  bool HasFP16;
+
+  /// HasD16 - True if subtarget is limited to 16 double precision
+  /// FP registers for VFPv3.
+  bool HasD16;
+
+  /// HasHardwareDivide - True if subtarget supports [su]div
+  bool HasHardwareDivide;
+
+  /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
+  bool HasHardwareDivideInARM;
+
+  /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
+  /// instructions.
+  bool HasT2ExtractPack;
+
+  /// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
+  /// instructions.
+  bool HasDataBarrier;
+
+  /// Pref32BitThumb - If true, codegen would prefer 32-bit Thumb instructions
+  /// over 16-bit ones.
+  bool Pref32BitThumb;
+
+  /// AvoidCPSRPartialUpdate - If true, codegen would avoid using instructions
+  /// that partially update CPSR and add false dependency on the previous
+  /// CPSR setting instruction.
+  bool AvoidCPSRPartialUpdate;
+
+  /// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
+  /// movs with shifter operand (i.e. asr, lsl, lsr).
+  bool AvoidMOVsShifterOperand;
+
+  /// HasRAS - Some processors perform return stack prediction. CodeGen should
+  /// avoid issue "normal" call instructions to callees which do not return.
+  bool HasRAS;
+
+  /// HasMPExtension - True if the subtarget supports Multiprocessing
+  /// extension (ARMv7 only).
+  bool HasMPExtension;
+
+  /// FPOnlySP - If true, the floating point unit only supports single
+  /// precision.
+  bool FPOnlySP;
+
+  /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
+  /// accesses for some types.  For details, see
+  /// ARMTargetLowering::allowsUnalignedMemoryAccesses().
+  bool AllowsUnalignedMem;
+
+  /// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith
+  /// and such) instructions in Thumb2 code.
+  bool Thumb2DSP;
+
+  /// NaCl TRAP instruction is generated instead of the regular TRAP.
+  bool UseNaClTrap;
+
+  /// Target machine allowed unsafe FP math (such as use of NEON fp)
+  bool UnsafeFPMath;
+
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned stackAlignment;
+
+  /// CPUString - String name of used CPU.
+  std::string CPUString;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  /// SchedModel - Processor specific instruction costs.
+  const MCSchedModel *SchedModel;
+
+  /// Selected instruction itineraries (one entry per itinerary class.)
+  InstrItineraryData InstrItins;
+
+  /// Options passed via command line that could influence the target
+  const TargetOptions &Options;
+
+ public:
+  enum {
+    isELF, isDarwin
+  } TargetType;
+
+  enum {
+    ARM_ABI_APCS,
+    ARM_ABI_AAPCS // ARM EABI
+  } TargetABI;
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  ARMSubtarget(const std::string &TT, const std::string &CPU,
+               const std::string &FS, const TargetOptions &Options);
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const {
+    // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb1.
+    // Change this once Thumb1 ldmia / stmia support is added.
+    return isThumb1Only() ? 0 : 64;
+  }
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// \brief Reset the features for the ARM target.
+  virtual void resetSubtargetFeatures(const MachineFunction *MF);
+private:
+  void initializeEnvironment();
+  void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+public:
+  void computeIssueWidth();
+
+  bool hasV4TOps()  const { return HasV4TOps;  }
+  bool hasV5TOps()  const { return HasV5TOps;  }
+  bool hasV5TEOps() const { return HasV5TEOps; }
+  bool hasV6Ops()   const { return HasV6Ops;   }
+  bool hasV6T2Ops() const { return HasV6T2Ops; }
+  bool hasV7Ops()   const { return HasV7Ops;  }
+
+  bool isCortexA5() const { return ARMProcFamily == CortexA5; }
+  bool isCortexA8() const { return ARMProcFamily == CortexA8; }
+  bool isCortexA9() const { return ARMProcFamily == CortexA9; }
+  bool isCortexA15() const { return ARMProcFamily == CortexA15; }
+  bool isSwift()    const { return ARMProcFamily == Swift; }
+  bool isCortexM3() const { return CPUString == "cortex-m3"; }
+  bool isLikeA9() const { return isCortexA9() || isCortexA15(); }
+  bool isCortexR5() const { return ARMProcFamily == CortexR5; }
+
+  bool hasARMOps() const { return !NoARM; }
+
+  bool hasVFP2() const { return HasVFPv2; }
+  bool hasVFP3() const { return HasVFPv3; }
+  bool hasVFP4() const { return HasVFPv4; }
+  bool hasNEON() const { return HasNEON;  }
+  bool useNEONForSinglePrecisionFP() const {
+    return hasNEON() && UseNEONForSinglePrecisionFP; }
+
+  bool hasDivide() const { return HasHardwareDivide; }
+  bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
+  bool hasT2ExtractPack() const { return HasT2ExtractPack; }
+  bool hasDataBarrier() const { return HasDataBarrier; }
+  bool useMulOps() const { return UseMulOps; }
+  bool useFPVMLx() const { return !SlowFPVMLx; }
+  bool hasVMLxForwarding() const { return HasVMLxForwarding; }
+  bool isFPBrccSlow() const { return SlowFPBrcc; }
+  bool isFPOnlySP() const { return FPOnlySP; }
+  bool prefers32BitThumb() const { return Pref32BitThumb; }
+  bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
+  bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
+  bool hasRAS() const { return HasRAS; }
+  bool hasMPExtension() const { return HasMPExtension; }
+  bool hasThumb2DSP() const { return Thumb2DSP; }
+  bool useNaClTrap() const { return UseNaClTrap; }
+
+  bool hasFP16() const { return HasFP16; }
+  bool hasD16() const { return HasD16; }
+
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  bool isTargetIOS() const { return TargetTriple.getOS() == Triple::IOS; }
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetNaCl() const {
+    return TargetTriple.getOS() == Triple::NaCl;
+  }
+  bool isTargetELF() const { return !isTargetDarwin(); }
+
+  bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
+  bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; }
+
+  bool isThumb() const { return InThumbMode; }
+  bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
+  bool isThumb2() const { return InThumbMode && HasThumb2; }
+  bool hasThumb2() const { return HasThumb2; }
+  bool isMClass() const { return IsMClass; }
+  bool isARClass() const { return !IsMClass; }
+
+  bool isR9Reserved() const { return IsR9Reserved; }
+
+  bool useMovt() const { return UseMovt && hasV6T2Ops(); }
+  bool supportsTailCall() const { return SupportsTailCall; }
+
+  bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
+
+  const std::string & getCPUString() const { return CPUString; }
+
+  unsigned getMispredictionPenalty() const;
+
+  /// enablePostRAScheduler - True at 'More' optimization.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+
+  /// getInstrItins - Return the instruction itineraies based on subtarget
+  /// selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
+  /// symbol.
+  bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+};
+} // End llvm namespace
+
+#endif  // ARMSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
new file mode 100644
index 000000000000..42c7d2c437e0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -0,0 +1,225 @@
+//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetMachine.h"
+#include "ARM.h"
+#include "ARMFrameLowering.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableGlobalMerge("global-merge", cl::Hidden,
+                  cl::desc("Enable global merge pass"),
+                  cl::init(true));
+
+static cl::opt<bool>
+DisableA15SDOptimization("disable-a15-sd-optimization", cl::Hidden,
+                   cl::desc("Inhibit optimization of S->D register accesses on A15"),
+                   cl::init(false));
+
+extern "C" void LLVMInitializeARMTarget() {
+  // Register the target.
+  RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
+  RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget);
+}
+
+
+/// TargetMachine ctor - Create an ARM architecture model.
+///
+ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM, CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS, Options),
+    JITInfo(),
+    InstrItins(Subtarget.getInstrItineraryData()) {
+  // Default to soft float ABI
+  if (Options.FloatABIType == FloatABI::Default)
+    this->Options.FloatABIType = FloatABI::Soft;
+}
+
+void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our ARM pass. This
+  // allows the ARM pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+  PM.add(createARMTargetTransformInfoPass(this));
+}
+
+
+void ARMTargetMachine::anchor() { }
+
+ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL)
+  : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    InstrInfo(Subtarget),
+    DL(Subtarget.isAPCS_ABI() ?
+               std::string("e-p:32:32-f64:32:64-i64:32:64-"
+                           "v128:32:128-v64:32:64-n32-S32") :
+               Subtarget.isAAPCS_ABI() ?
+               std::string("e-p:32:32-f64:64:64-i64:64:64-"
+                           "v128:64:128-v64:64:64-n32-S64") :
+               std::string("e-p:32:32-f64:64:64-i64:64:64-"
+                           "v128:64:128-v64:64:64-n32-S32")),
+    TLInfo(*this),
+    TSInfo(*this),
+    FrameLowering(Subtarget) {
+  if (!Subtarget.hasARMOps())
+    report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
+                       "support ARM mode execution!");
+}
+
+void ThumbTargetMachine::anchor() { }
+
+ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+  : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    InstrInfo(Subtarget.hasThumb2()
+              ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
+              : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
+    DL(Subtarget.isAPCS_ABI() ?
+               std::string("e-p:32:32-f64:32:64-i64:32:64-"
+                           "i16:16:32-i8:8:32-i1:8:32-"
+                           "v128:32:128-v64:32:64-a:0:32-n32-S32") :
+               Subtarget.isAAPCS_ABI() ?
+               std::string("e-p:32:32-f64:64:64-i64:64:64-"
+                           "i16:16:32-i8:8:32-i1:8:32-"
+                           "v128:64:128-v64:64:64-a:0:32-n32-S64") :
+               std::string("e-p:32:32-f64:64:64-i64:64:64-"
+                           "i16:16:32-i8:8:32-i1:8:32-"
+                           "v128:64:128-v64:64:64-a:0:32-n32-S32")),
+    TLInfo(*this),
+    TSInfo(*this),
+    FrameLowering(Subtarget.hasThumb2()
+              ? new ARMFrameLowering(Subtarget)
+              : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) {
+}
+
+namespace {
+/// ARM Code Generator Pass Configuration Options.
+class ARMPassConfig : public TargetPassConfig {
+public:
+  ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  ARMBaseTargetMachine &getARMTargetMachine() const {
+    return getTM<ARMBaseTargetMachine>();
+  }
+
+  const ARMSubtarget &getARMSubtarget() const {
+    return *getARMTargetMachine().getSubtargetImpl();
+  }
+
+  virtual bool addPreISel();
+  virtual bool addInstSelector();
+  virtual bool addPreRegAlloc();
+  virtual bool addPreSched2();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new ARMPassConfig(this, PM);
+}
+
+bool ARMPassConfig::addPreISel() {
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge)
+    addPass(createGlobalMergePass(TM->getTargetLowering()));
+
+  return false;
+}
+
+bool ARMPassConfig::addInstSelector() {
+  addPass(createARMISelDag(getARMTargetMachine(), getOptLevel()));
+
+  const ARMSubtarget *Subtarget = &getARMSubtarget();
+  if (Subtarget->isTargetELF() && !Subtarget->isThumb1Only() &&
+      TM->Options.EnableFastISel)
+    addPass(createARMGlobalBaseRegPass());
+  return false;
+}
+
+bool ARMPassConfig::addPreRegAlloc() {
+  // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+  if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only())
+    addPass(createARMLoadStoreOptimizationPass(true));
+  if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isLikeA9())
+    addPass(createMLxExpansionPass());
+  // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
+  // enabled when NEON is available.
+  if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA15() &&
+    getARMSubtarget().hasNEON() && !DisableA15SDOptimization) {
+    addPass(createA15SDOptimizerPass());
+  }
+  return true;
+}
+
+bool ARMPassConfig::addPreSched2() {
+  // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+  if (getOptLevel() != CodeGenOpt::None) {
+    if (!getARMSubtarget().isThumb1Only()) {
+      addPass(createARMLoadStoreOptimizationPass());
+      printAndVerify("After ARM load / store optimizer");
+    }
+    if (getARMSubtarget().hasNEON())
+      addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
+  }
+
+  // Expand some pseudo instructions into multiple instructions to allow
+  // proper scheduling.
+  addPass(createARMExpandPseudoPass());
+
+  if (getOptLevel() != CodeGenOpt::None) {
+    if (!getARMSubtarget().isThumb1Only())
+      addPass(&IfConverterID);
+  }
+  if (getARMSubtarget().isThumb2())
+    addPass(createThumb2ITBlockPass());
+
+  return true;
+}
+
+bool ARMPassConfig::addPreEmitPass() {
+  if (getARMSubtarget().isThumb2()) {
+    if (!getARMSubtarget().prefers32BitThumb())
+      addPass(createThumb2SizeReductionPass());
+
+    // Constant island pass work on unbundled instructions.
+    addPass(&UnpackMachineBundlesID);
+  }
+
+  addPass(createARMConstantIslandPass());
+
+  return true;
+}
+
+bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                          JITCodeEmitter &JCE) {
+  // Machine code emitter pass for ARM.
+  PM.add(createARMJITCodeEmitterPass(*this, JCE));
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
new file mode 100644
index 000000000000..d4caf5ca6e19
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -0,0 +1,146 @@
+//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMTARGETMACHINE_H
+#define ARMTARGETMACHINE_H
+
+#include "ARMFrameLowering.h"
+#include "ARMISelLowering.h"
+#include "ARMInstrInfo.h"
+#include "ARMJITInfo.h"
+#include "ARMSelectionDAGInfo.h"
+#include "ARMSubtarget.h"
+#include "Thumb1FrameLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class ARMBaseTargetMachine : public LLVMTargetMachine {
+protected:
+  ARMSubtarget        Subtarget;
+private:
+  ARMJITInfo          JITInfo;
+  InstrItineraryData  InstrItins;
+
+public:
+  ARMBaseTargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+
+  virtual       ARMJITInfo       *getJITInfo()         { return &JITInfo; }
+  virtual const ARMSubtarget  *getSubtargetImpl() const { return &Subtarget; }
+  virtual const ARMTargetLowering *getTargetLowering() const {
+    // Implemented by derived classes
+    llvm_unreachable("getTargetLowering not implemented");
+  }
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return &InstrItins;
+  }
+
+  /// \brief Register ARM analysis passes with a pass manager.
+  virtual void addAnalysisPasses(PassManagerBase &PM);
+
+  // Pass Pipeline Configuration
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+  virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &MCE);
+};
+
+/// ARMTargetMachine - ARM target machine.
+///
+class ARMTargetMachine : public ARMBaseTargetMachine {
+  virtual void anchor();
+  ARMInstrInfo        InstrInfo;
+  const DataLayout    DL;       // Calculates type size & alignment
+  ARMTargetLowering   TLInfo;
+  ARMSelectionDAGInfo TSInfo;
+  ARMFrameLowering    FrameLowering;
+ public:
+  ARMTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS,
+                   const TargetOptions &Options,
+                   Reloc::Model RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL);
+
+  virtual const ARMRegisterInfo  *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  virtual const ARMTargetLowering *getTargetLowering() const {
+    return &TLInfo;
+  }
+
+  virtual const ARMSelectionDAGInfo* getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+  virtual const ARMFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  virtual const ARMInstrInfo     *getInstrInfo() const { return &InstrInfo; }
+  virtual const DataLayout       *getDataLayout() const { return &DL; }
+};
+
+/// ThumbTargetMachine - Thumb target machine.
+/// Due to the way architectures are handled, this represents both
+///   Thumb-1 and Thumb-2.
+///
+class ThumbTargetMachine : public ARMBaseTargetMachine {
+  virtual void anchor();
+  // Either Thumb1InstrInfo or Thumb2InstrInfo.
+  OwningPtr<ARMBaseInstrInfo> InstrInfo;
+  const DataLayout    DL;   // Calculates type size & alignment
+  ARMTargetLowering   TLInfo;
+  ARMSelectionDAGInfo TSInfo;
+  // Either Thumb1FrameLowering or ARMFrameLowering.
+  OwningPtr<ARMFrameLowering> FrameLowering;
+public:
+  ThumbTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS,
+                     const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+
+  /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo
+  virtual const ARMBaseRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo->getRegisterInfo();
+  }
+
+  virtual const ARMTargetLowering *getTargetLowering() const {
+    return &TLInfo;
+  }
+
+  virtual const ARMSelectionDAGInfo *getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+
+  /// returns either Thumb1InstrInfo or Thumb2InstrInfo
+  virtual const ARMBaseInstrInfo *getInstrInfo() const {
+    return InstrInfo.get();
+  }
+  /// returns either Thumb1FrameLowering or ARMFrameLowering
+  virtual const ARMFrameLowering *getFrameLowering() const {
+    return FrameLowering.get();
+  }
+  virtual const DataLayout       *getDataLayout() const { return &DL; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
new file mode 100644
index 000000000000..dfdf6ab356a3
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -0,0 +1,53 @@
+//===-- llvm/Target/ARMTargetObjectFile.cpp - ARM Object Info Impl --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetObjectFile.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+using namespace dwarf;
+
+//===----------------------------------------------------------------------===//
+//                               ELF Target
+//===----------------------------------------------------------------------===//
+
+void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
+                                        const TargetMachine &TM) {
+  bool isAAPCS_ABI = TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI();
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(isAAPCS_ABI);
+
+  if (isAAPCS_ABI) {
+    LSDASection = NULL;
+  }
+
+  AttributesSection =
+    getContext().getELFSection(".ARM.attributes",
+                               ELF::SHT_ARM_ATTRIBUTES,
+                               0,
+                               SectionKind::getMetadata());
+}
+
+const MCExpr *ARMElfTargetObjectFile::
+getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI, unsigned Encoding,
+                        MCStreamer &Streamer) const {
+  assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
+
+  return MCSymbolRefExpr::Create(Mang->getSymbol(GV),
+                                 MCSymbolRefExpr::VK_ARM_TARGET2,
+                                 getContext());
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
new file mode 100644
index 000000000000..7f60727e5305
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
@@ -0,0 +1,43 @@
+//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+#define LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+class MCContext;
+class TargetMachine;
+
+class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
+protected:
+  const MCSection *AttributesSection;
+public:
+  ARMElfTargetObjectFile() :
+    TargetLoweringObjectFileELF(),
+    AttributesSection(NULL)
+  {}
+
+  virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+  const MCExpr *
+  getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                          MachineModuleInfo *MMI, unsigned Encoding,
+                          MCStreamer &Streamer) const;
+  
+  virtual const MCSection *getAttributesSection() const {
+    return AttributesSection;
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
new file mode 100644
index 000000000000..1019b972e957
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -0,0 +1,458 @@
+//===-- ARMTargetTransformInfo.cpp - ARM specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// ARM target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "armtti"
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't havve a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeARMTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class ARMTTI : public ImmutablePass, public TargetTransformInfo {
+  const ARMBaseTargetMachine *TM;
+  const ARMSubtarget *ST;
+  const ARMTargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  ARMTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  ARMTTI(const ARMBaseTargetMachine *TM)
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
+    initializeARMTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() {
+    pushTTIStack(this);
+  }
+
+  virtual void finalizePass() {
+    popTTIStack();
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+
+  virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const;
+
+  /// @}
+
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) const {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 16;
+      return 0;
+    }
+
+    if (ST->isThumb1Only())
+      return 8;
+    return 16;
+  }
+
+  unsigned getRegisterBitWidth(bool Vector) const {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 128;
+      return 0;
+    }
+
+    return 32;
+  }
+
+  unsigned getMaximumUnrollFactor() const {
+    // These are out of order CPUs:
+    if (ST->isCortexA15() || ST->isSwift())
+      return 2;
+    return 1;
+  }
+
+  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                          int Index, Type *SubTp) const;
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                                      Type *Src) const;
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const;
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
+
+  unsigned getAddressComputationCost(Type *Val) const;
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(ARMTTI, TargetTransformInfo, "armtti",
+                   "ARM Target Transform Info", true, true, false)
+char ARMTTI::ID = 0;
+
+ImmutablePass *
+llvm::createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM) {
+  return new ARMTTI(TM);
+}
+
+
+unsigned ARMTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned Bits = Ty->getPrimitiveSizeInBits();
+  if (Bits == 0 || Bits > 32)
+    return 4;
+
+  int32_t SImmVal = Imm.getSExtValue();
+  uint32_t ZImmVal = Imm.getZExtValue();
+  if (!ST->isThumb()) {
+    if ((SImmVal >= 0 && SImmVal < 65536) ||
+        (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
+        (ARM_AM::getSOImmVal(~ZImmVal) != -1))
+      return 1;
+    return ST->hasV6T2Ops() ? 2 : 3;
+  } else if (ST->isThumb2()) {
+    if ((SImmVal >= 0 && SImmVal < 65536) ||
+        (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
+        (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
+      return 1;
+    return ST->hasV6T2Ops() ? 2 : 3;
+  } else /*Thumb1*/ {
+    if (SImmVal >= 0 && SImmVal < 256)
+      return 1;
+    if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
+      return 2;
+    // Load from constantpool.
+    return 3;
+  }
+  return 2;
+}
+
+unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // Single to/from double precision conversions.
+  static const CostTblEntry<MVT> NEONFltDblTbl[] = {
+    // Vector fptrunc/fpext conversions.
+    { ISD::FP_ROUND,   MVT::v2f64, 2 },
+    { ISD::FP_EXTEND,  MVT::v2f32, 2 },
+    { ISD::FP_EXTEND,  MVT::v4f32, 4 }
+  };
+
+  if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
+                                          ISD == ISD::FP_EXTEND)) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+    int Idx = CostTableLookup<MVT>(NEONFltDblTbl, array_lengthof(NEONFltDblTbl),
+                                ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * NEONFltDblTbl[Idx].Cost;
+  }
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+  // Some arithmetic, load and store operations have specific instructions
+  // to cast up/down their types automatically at no extra cost.
+  // TODO: Get these tables to know at least what the related operations are.
+  static const TypeConversionCostTblEntry<MVT> NEONVectorConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
+    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 0 },
+    { ISD::TRUNCATE,    MVT::v4i16, MVT::v4i32, 1 },
+
+    // The number of vmovl instructions for the extension.
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
+    { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+    { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
+    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
+
+    // Operations that we legalize using load/stores to the stack.
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i32, 4*1 + 16*2 + 2*1 },
+    { ISD::TRUNCATE,    MVT::v8i8, MVT::v8i32, 2*1 + 8*2 + 1 },
+
+    // Vector float <-> i32 conversions.
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i8, 3 },
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP,  MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i16, 8 },
+    { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
+    { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 4 },
+
+    { ISD::FP_TO_SINT,  MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT,  MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT,  MVT::v4i8, MVT::v4f32, 3 },
+    { ISD::FP_TO_UINT,  MVT::v4i8, MVT::v4f32, 3 },
+    { ISD::FP_TO_SINT,  MVT::v4i16, MVT::v4f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::v4i16, MVT::v4f32, 2 },
+
+    // Vector double <-> i32 conversions.
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i8, 4 },
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i32, 2 },
+
+    { ISD::FP_TO_SINT,  MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::v2i32, MVT::v2f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::v8i16, MVT::v8f32, 4 },
+    { ISD::FP_TO_UINT,  MVT::v8i16, MVT::v8f32, 4 },
+    { ISD::FP_TO_SINT,  MVT::v16i16, MVT::v16f32, 8 },
+    { ISD::FP_TO_UINT,  MVT::v16i16, MVT::v16f32, 8 }
+  };
+
+  if (SrcTy.isVector() && ST->hasNEON()) {
+    int Idx = ConvertCostTableLookup<MVT>(NEONVectorConversionTbl,
+                                array_lengthof(NEONVectorConversionTbl),
+                                ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return NEONVectorConversionTbl[Idx].Cost;
+  }
+
+  // Scalar float to integer conversions.
+  static const TypeConversionCostTblEntry<MVT> NEONFloatConversionTbl[] = {
+    { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i1, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i8, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i8, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i8, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i8, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i16, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i16, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i16, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i16, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i32, MVT::f32, 2 },
+    { ISD::FP_TO_UINT,  MVT::i32, MVT::f32, 2 },
+    { ISD::FP_TO_SINT,  MVT::i32, MVT::f64, 2 },
+    { ISD::FP_TO_UINT,  MVT::i32, MVT::f64, 2 },
+    { ISD::FP_TO_SINT,  MVT::i64, MVT::f32, 10 },
+    { ISD::FP_TO_UINT,  MVT::i64, MVT::f32, 10 },
+    { ISD::FP_TO_SINT,  MVT::i64, MVT::f64, 10 },
+    { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
+  };
+  if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
+    int Idx = ConvertCostTableLookup<MVT>(NEONFloatConversionTbl,
+                                        array_lengthof(NEONFloatConversionTbl),
+                                        ISD, DstTy.getSimpleVT(),
+                                        SrcTy.getSimpleVT());
+    if (Idx != -1)
+        return NEONFloatConversionTbl[Idx].Cost;
+  }
+
+  // Scalar integer to float conversions.
+  static const TypeConversionCostTblEntry<MVT> NEONIntegerConversionTbl[] = {
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i1, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i8, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i8, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i8, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i8, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i16, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i32, 2 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i32, 2 },
+    { ISD::SINT_TO_FP,  MVT::f32, MVT::i64, 10 },
+    { ISD::UINT_TO_FP,  MVT::f32, MVT::i64, 10 },
+    { ISD::SINT_TO_FP,  MVT::f64, MVT::i64, 10 },
+    { ISD::UINT_TO_FP,  MVT::f64, MVT::i64, 10 }
+  };
+
+  if (SrcTy.isInteger() && ST->hasNEON()) {
+    int Idx = ConvertCostTableLookup<MVT>(NEONIntegerConversionTbl,
+                                       array_lengthof(NEONIntegerConversionTbl),
+                                       ISD, DstTy.getSimpleVT(),
+                                       SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return NEONIntegerConversionTbl[Idx].Cost;
+  }
+
+  // Scalar integer conversion costs.
+  static const TypeConversionCostTblEntry<MVT> ARMIntegerConversionTbl[] = {
+    // i16 -> i64 requires two dependent operations.
+    { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
+
+    // Truncates on i64 are assumed to be free.
+    { ISD::TRUNCATE,    MVT::i32, MVT::i64, 0 },
+    { ISD::TRUNCATE,    MVT::i16, MVT::i64, 0 },
+    { ISD::TRUNCATE,    MVT::i8,  MVT::i64, 0 },
+    { ISD::TRUNCATE,    MVT::i1,  MVT::i64, 0 }
+  };
+
+  if (SrcTy.isInteger()) {
+    int Idx =
+      ConvertCostTableLookup<MVT>(ARMIntegerConversionTbl,
+                                  array_lengthof(ARMIntegerConversionTbl),
+                                  ISD, DstTy.getSimpleVT(),
+                                  SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return ARMIntegerConversionTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned ARMTTI::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                    unsigned Index) const {
+  // Penalize inserting into an D-subregister. We end up with a three times
+  // lower estimated throughput on swift.
+  if (ST->isSwift() &&
+      Opcode == Instruction::InsertElement &&
+      ValTy->isVectorTy() &&
+      ValTy->getScalarSizeInBits() <= 32)
+    return 3;
+
+  return TargetTransformInfo::getVectorInstrCost(Opcode, ValTy, Index);
+}
+
+unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                    Type *CondTy) const {
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // On NEON a a vector select gets lowered to vbsl.
+  if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    // Lowering of some vector selects is currently far from perfect.
+    static const TypeConversionCostTblEntry<MVT> NEONVectorSelectTbl[] = {
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 },
+      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
+    };
+
+    EVT SelCondTy = TLI->getValueType(CondTy);
+    EVT SelValTy = TLI->getValueType(ValTy);
+    int Idx = ConvertCostTableLookup<MVT>(NEONVectorSelectTbl,
+                                          array_lengthof(NEONVectorSelectTbl),
+                                          ISD, SelCondTy.getSimpleVT(),
+                                          SelValTy.getSimpleVT());
+    if (Idx != -1)
+      return NEONVectorSelectTbl[Idx].Cost;
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+    return LT.first;
+  }
+
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
+
+unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                Type *SubTp) const {
+  // We only handle costs of reverse shuffles for now.
+  if (Kind != SK_Reverse)
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  static const CostTblEntry<MVT> NEONShuffleTbl[] = {
+    // Reverse shuffle cost one instruction if we are shuffling within a double
+    // word (vrev) or two if we shuffle a quad word (vrev, vext).
+    { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
+    { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 },
+    { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 },
+    { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 },
+
+    { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 },
+    { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 },
+    { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 },
+    { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 }
+  };
+
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+  int Idx = CostTableLookup<MVT>(NEONShuffleTbl, array_lengthof(NEONShuffleTbl),
+                                 ISD::VECTOR_SHUFFLE, LT.second);
+  if (Idx == -1)
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  return LT.first * NEONShuffleTbl[Idx].Cost;
+}
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
new file mode 100644
index 000000000000..ed7b7ec9d2cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -0,0 +1,7874 @@
+//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class ARMOperand;
+
+enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
+
+class ARMAsmParser : public MCTargetAsmParser {
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  const MCRegisterInfo *MRI;
+
+  // Map of register aliases registers via the .req directive.
+  StringMap<unsigned> RegisterReqs;
+
+  struct {
+    ARMCC::CondCodes Cond;    // Condition for IT block.
+    unsigned Mask:4;          // Condition mask for instructions.
+                              // Starting at first 1 (from lsb).
+                              //   '1'  condition as indicated in IT.
+                              //   '0'  inverse of condition (else).
+                              // Count of instructions in IT block is
+                              // 4 - trailingzeroes(mask)
+
+    bool FirstCond;           // Explicit flag for when we're parsing the
+                              // First instruction in the IT block. It's
+                              // implied in the mask, so needs special
+                              // handling.
+
+    unsigned CurPosition;     // Current position in parsing of IT
+                              // block. In range [0,3]. Initialized
+                              // according to count of instructions in block.
+                              // ~0U if no active IT block.
+  } ITState;
+  bool inITBlock() { return ITState.CurPosition != ~0U;}
+  void forwardITPosition() {
+    if (!inITBlock()) return;
+    // Move to the next instruction in the IT block, if there is one. If not,
+    // mark the block as done.
+    unsigned TZ = CountTrailingZeros_32(ITState.Mask);
+    if (++ITState.CurPosition == 5 - TZ)
+      ITState.CurPosition = ~0U; // Done with the IT block after this.
+  }
+
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  bool Warning(SMLoc L, const Twine &Msg,
+               ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+    return Parser.Warning(L, Msg, Ranges);
+  }
+  bool Error(SMLoc L, const Twine &Msg,
+             ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+    return Parser.Error(L, Msg, Ranges);
+  }
+
+  int tryParseRegister();
+  bool tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &);
+  int tryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic);
+  bool parsePrefix(ARMMCExpr::VariantKind &RefKind);
+  bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
+                              unsigned &ShiftAmount);
+  bool parseDirectiveWord(unsigned Size, SMLoc L);
+  bool parseDirectiveThumb(SMLoc L);
+  bool parseDirectiveARM(SMLoc L);
+  bool parseDirectiveThumbFunc(SMLoc L);
+  bool parseDirectiveCode(SMLoc L);
+  bool parseDirectiveSyntax(SMLoc L);
+  bool parseDirectiveReq(StringRef Name, SMLoc L);
+  bool parseDirectiveUnreq(SMLoc L);
+  bool parseDirectiveArch(SMLoc L);
+  bool parseDirectiveEabiAttr(SMLoc L);
+
+  StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
+                          bool &CarrySetting, unsigned &ProcessorIMod,
+                          StringRef &ITMask);
+  void getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+                             bool &CanAcceptPredicationCode);
+
+  bool isThumb() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & ARM::ModeThumb) != 0;
+  }
+  bool isThumbOne() const {
+    return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) == 0;
+  }
+  bool isThumbTwo() const {
+    return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2);
+  }
+  bool hasV6Ops() const {
+    return STI.getFeatureBits() & ARM::HasV6Ops;
+  }
+  bool hasV7Ops() const {
+    return STI.getFeatureBits() & ARM::HasV7Ops;
+  }
+  void SwitchMode() {
+    unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
+    setAvailableFeatures(FB);
+  }
+  bool isMClass() const {
+    return STI.getFeatureBits() & ARM::FeatureMClass;
+  }
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "ARMGenAsmMatcher.inc"
+
+  /// }
+
+  OperandMatchResultTy parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseCoprocNumOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseCoprocRegOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseCoprocOptionOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseMemBarrierOptOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseProcIFlagsOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseMSRMaskOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &O,
+                                   StringRef Op, int Low, int High);
+  OperandMatchResultTy parsePKHLSLImm(SmallVectorImpl<MCParsedAsmOperand*> &O) {
+    return parsePKHImm(O, "lsl", 0, 31);
+  }
+  OperandMatchResultTy parsePKHASRImm(SmallVectorImpl<MCParsedAsmOperand*> &O) {
+    return parsePKHImm(O, "asr", 1, 32);
+  }
+  OperandMatchResultTy parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseRotImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseBitfield(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseFPImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseVectorList(SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index,
+                                       SMLoc &EndLoc);
+
+  // Asm Match Converter Methods
+  void cvtT2LdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtT2StrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtLdWriteBackRegAddrMode2(MCInst &Inst,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtStWriteBackRegAddrMode2(MCInst &Inst,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtStWriteBackRegAddrMode3(MCInst &Inst,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtLdExtTWriteBackImm(MCInst &Inst,
+                             const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtLdExtTWriteBackReg(MCInst &Inst,
+                             const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtStExtTWriteBackImm(MCInst &Inst,
+                             const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtStExtTWriteBackReg(MCInst &Inst,
+                             const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtLdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtStrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtLdWriteBackRegAddrMode3(MCInst &Inst,
+                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtThumbMultiply(MCInst &Inst,
+                        const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtVLDwbFixed(MCInst &Inst,
+                     const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtVLDwbRegister(MCInst &Inst,
+                        const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtVSTwbFixed(MCInst &Inst,
+                     const SmallVectorImpl<MCParsedAsmOperand*> &);
+  void cvtVSTwbRegister(MCInst &Inst,
+                        const SmallVectorImpl<MCParsedAsmOperand*> &);
+  bool validateInstruction(MCInst &Inst,
+                           const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+  bool processInstruction(MCInst &Inst,
+                          const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+  bool shouldOmitCCOutOperand(StringRef Mnemonic,
+                              SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+public:
+  enum ARMMatchResultTy {
+    Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
+    Match_RequiresNotITBlock,
+    Match_RequiresV6,
+    Match_RequiresThumb2,
+#define GET_OPERAND_DIAGNOSTIC_TYPES
+#include "ARMGenAsmMatcher.inc"
+
+  };
+
+  ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
+    : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+    MCAsmParserExtension::Initialize(_Parser);
+
+    // Cache the MCRegisterInfo.
+    MRI = &getContext().getRegisterInfo();
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+    // Not in an ITBlock to start with.
+    ITState.CurPosition = ~0U;
+
+    // Set ELF header flags.
+    // FIXME: This should eventually end up somewhere else where more
+    // intelligent flag decisions can be made. For now we are just maintaining
+    // the statu/parseDirects quo for ARM and setting EF_ARM_EABI_VER5 as the default.
+    if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&Parser.getStreamer()))
+      MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+  }
+
+  // Implementation of the MCTargetAsmParser interface:
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  bool ParseDirective(AsmToken DirectiveID);
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind);
+  unsigned checkTargetMatchPredicate(MCInst &Inst);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               MCStreamer &Out, unsigned &ErrorInfo,
+                               bool MatchingInlineAsm);
+};
+} // end anonymous namespace
+
+namespace {
+
+/// ARMOperand - Instances of this class represent a parsed ARM machine
+/// operand.
+class ARMOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    k_CondCode,
+    k_CCOut,
+    k_ITCondMask,
+    k_CoprocNum,
+    k_CoprocReg,
+    k_CoprocOption,
+    k_Immediate,
+    k_MemBarrierOpt,
+    k_Memory,
+    k_PostIndexRegister,
+    k_MSRMask,
+    k_ProcIFlags,
+    k_VectorIndex,
+    k_Register,
+    k_RegisterList,
+    k_DPRRegisterList,
+    k_SPRRegisterList,
+    k_VectorList,
+    k_VectorListAllLanes,
+    k_VectorListIndexed,
+    k_ShiftedRegister,
+    k_ShiftedImmediate,
+    k_ShifterImmediate,
+    k_RotateImmediate,
+    k_BitfieldDescriptor,
+    k_Token
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  SmallVector<unsigned, 8> Registers;
+
+  struct CCOp {
+    ARMCC::CondCodes Val;
+  };
+
+  struct CopOp {
+    unsigned Val;
+  };
+
+  struct CoprocOptionOp {
+    unsigned Val;
+  };
+
+  struct ITMaskOp {
+    unsigned Mask:4;
+  };
+
+  struct MBOptOp {
+    ARM_MB::MemBOpt Val;
+  };
+
+  struct IFlagsOp {
+    ARM_PROC::IFlags Val;
+  };
+
+  struct MMaskOp {
+    unsigned Val;
+  };
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  // A vector register list is a sequential list of 1 to 4 registers.
+  struct VectorListOp {
+    unsigned RegNum;
+    unsigned Count;
+    unsigned LaneIndex;
+    bool isDoubleSpaced;
+  };
+
+  struct VectorIndexOp {
+    unsigned Val;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  /// Combined record for all forms of ARM address expressions.
+  struct MemoryOp {
+    unsigned BaseRegNum;
+    // Offset is in OffsetReg or OffsetImm. If both are zero, no offset
+    // was specified.
+    const MCConstantExpr *OffsetImm;  // Offset immediate value
+    unsigned OffsetRegNum;    // Offset register num, when OffsetImm == NULL
+    ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg
+    unsigned ShiftImm;        // shift for OffsetReg.
+    unsigned Alignment;       // 0 = no alignment specified
+    // n = alignment in bytes (2, 4, 8, 16, or 32)
+    unsigned isNegative : 1;  // Negated OffsetReg? (~'U' bit)
+  };
+
+  struct PostIdxRegOp {
+    unsigned RegNum;
+    bool isAdd;
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned ShiftImm;
+  };
+
+  struct ShifterImmOp {
+    bool isASR;
+    unsigned Imm;
+  };
+
+  struct RegShiftedRegOp {
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned SrcReg;
+    unsigned ShiftReg;
+    unsigned ShiftImm;
+  };
+
+  struct RegShiftedImmOp {
+    ARM_AM::ShiftOpc ShiftTy;
+    unsigned SrcReg;
+    unsigned ShiftImm;
+  };
+
+  struct RotImmOp {
+    unsigned Imm;
+  };
+
+  struct BitfieldOp {
+    unsigned LSB;
+    unsigned Width;
+  };
+
+  union {
+    struct CCOp CC;
+    struct CopOp Cop;
+    struct CoprocOptionOp CoprocOption;
+    struct MBOptOp MBOpt;
+    struct ITMaskOp ITMask;
+    struct IFlagsOp IFlags;
+    struct MMaskOp MMask;
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct VectorListOp VectorList;
+    struct VectorIndexOp VectorIndex;
+    struct ImmOp Imm;
+    struct MemoryOp Memory;
+    struct PostIdxRegOp PostIdxReg;
+    struct ShifterImmOp ShifterImm;
+    struct RegShiftedRegOp RegShiftedReg;
+    struct RegShiftedImmOp RegShiftedImm;
+    struct RotImmOp RotImm;
+    struct BitfieldOp Bitfield;
+  };
+
+  ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+public:
+  ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    switch (Kind) {
+    case k_CondCode:
+      CC = o.CC;
+      break;
+    case k_ITCondMask:
+      ITMask = o.ITMask;
+      break;
+    case k_Token:
+      Tok = o.Tok;
+      break;
+    case k_CCOut:
+    case k_Register:
+      Reg = o.Reg;
+      break;
+    case k_RegisterList:
+    case k_DPRRegisterList:
+    case k_SPRRegisterList:
+      Registers = o.Registers;
+      break;
+    case k_VectorList:
+    case k_VectorListAllLanes:
+    case k_VectorListIndexed:
+      VectorList = o.VectorList;
+      break;
+    case k_CoprocNum:
+    case k_CoprocReg:
+      Cop = o.Cop;
+      break;
+    case k_CoprocOption:
+      CoprocOption = o.CoprocOption;
+      break;
+    case k_Immediate:
+      Imm = o.Imm;
+      break;
+    case k_MemBarrierOpt:
+      MBOpt = o.MBOpt;
+      break;
+    case k_Memory:
+      Memory = o.Memory;
+      break;
+    case k_PostIndexRegister:
+      PostIdxReg = o.PostIdxReg;
+      break;
+    case k_MSRMask:
+      MMask = o.MMask;
+      break;
+    case k_ProcIFlags:
+      IFlags = o.IFlags;
+      break;
+    case k_ShifterImmediate:
+      ShifterImm = o.ShifterImm;
+      break;
+    case k_ShiftedRegister:
+      RegShiftedReg = o.RegShiftedReg;
+      break;
+    case k_ShiftedImmediate:
+      RegShiftedImm = o.RegShiftedImm;
+      break;
+    case k_RotateImmediate:
+      RotImm = o.RotImm;
+      break;
+    case k_BitfieldDescriptor:
+      Bitfield = o.Bitfield;
+      break;
+    case k_VectorIndex:
+      VectorIndex = o.VectorIndex;
+      break;
+    }
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const { return EndLoc; }
+  /// getLocRange - Get the range between the first and last token of this
+  /// operand.
+  SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
+  ARMCC::CondCodes getCondCode() const {
+    assert(Kind == k_CondCode && "Invalid access!");
+    return CC.Val;
+  }
+
+  unsigned getCoproc() const {
+    assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!");
+    return Cop.Val;
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const {
+    assert((Kind == k_Register || Kind == k_CCOut) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const SmallVectorImpl<unsigned> &getRegList() const {
+    assert((Kind == k_RegisterList || Kind == k_DPRRegisterList ||
+            Kind == k_SPRRegisterList) && "Invalid access!");
+    return Registers;
+  }
+
+  const MCExpr *getImm() const {
+    assert(isImm() && "Invalid access!");
+    return Imm.Val;
+  }
+
+  unsigned getVectorIndex() const {
+    assert(Kind == k_VectorIndex && "Invalid access!");
+    return VectorIndex.Val;
+  }
+
+  ARM_MB::MemBOpt getMemBarrierOpt() const {
+    assert(Kind == k_MemBarrierOpt && "Invalid access!");
+    return MBOpt.Val;
+  }
+
+  ARM_PROC::IFlags getProcIFlags() const {
+    assert(Kind == k_ProcIFlags && "Invalid access!");
+    return IFlags.Val;
+  }
+
+  unsigned getMSRMask() const {
+    assert(Kind == k_MSRMask && "Invalid access!");
+    return MMask.Val;
+  }
+
+  bool isCoprocNum() const { return Kind == k_CoprocNum; }
+  bool isCoprocReg() const { return Kind == k_CoprocReg; }
+  bool isCoprocOption() const { return Kind == k_CoprocOption; }
+  bool isCondCode() const { return Kind == k_CondCode; }
+  bool isCCOut() const { return Kind == k_CCOut; }
+  bool isITMask() const { return Kind == k_ITCondMask; }
+  bool isITCondCode() const { return Kind == k_CondCode; }
+  bool isImm() const { return Kind == k_Immediate; }
+  bool isFPImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
+    return Val != -1;
+  }
+  bool isFBits16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value <= 16;
+  }
+  bool isFBits32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 1 && Value <= 32;
+  }
+  bool isImm8s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
+  }
+  bool isImm0_1020s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 1020;
+  }
+  bool isImm0_508s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 3) == 0) && Value >= 0 && Value <= 508;
+  }
+  bool isImm0_508s4Neg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = -CE->getValue();
+    // explicitly exclude zero. we want that to use the normal 0_508 version.
+    return ((Value & 3) == 0) && Value > 0 && Value <= 508;
+  }
+  bool isImm0_255() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 256;
+  }
+  bool isImm0_4095() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 4096;
+  }
+  bool isImm0_4095Neg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = -CE->getValue();
+    return Value > 0 && Value < 4096;
+  }
+  bool isImm0_1() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 2;
+  }
+  bool isImm0_3() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 4;
+  }
+  bool isImm0_7() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 8;
+  }
+  bool isImm0_15() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 16;
+  }
+  bool isImm0_31() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 32;
+  }
+  bool isImm0_63() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 64;
+  }
+  bool isImm8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 8;
+  }
+  bool isImm16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 16;
+  }
+  bool isImm32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 32;
+  }
+  bool isShrImm8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 8;
+  }
+  bool isShrImm16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 16;
+  }
+  bool isShrImm32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 32;
+  }
+  bool isShrImm64() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 64;
+  }
+  bool isImm1_7() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 8;
+  }
+  bool isImm1_15() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 16;
+  }
+  bool isImm1_31() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 32;
+  }
+  bool isImm1_16() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 17;
+  }
+  bool isImm1_32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 33;
+  }
+  bool isImm0_32() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 33;
+  }
+  bool isImm0_65535() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 65536;
+  }
+  bool isImm0_65535Expr() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // If it's not a constant expression, it'll generate a fixup and be
+    // handled later.
+    if (!CE) return true;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 65536;
+  }
+  bool isImm24bit() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value <= 0xffffff;
+  }
+  bool isImmThumbSR() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value < 33;
+  }
+  bool isPKHLSLImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 32;
+  }
+  bool isPKHASRImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= 32;
+  }
+  bool isAdrLabel() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, but it can't fit 
+    // into shift immediate encoding, we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm())) return true;
+    else return (isARMSOImm() || isARMSOImmNeg());
+  }
+  bool isARMSOImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getSOImmVal(Value) != -1;
+  }
+  bool isARMSOImmNot() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getSOImmVal(~Value) != -1;
+  }
+  bool isARMSOImmNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // Only use this when not representable as a plain so_imm.
+    return ARM_AM::getSOImmVal(Value) == -1 &&
+      ARM_AM::getSOImmVal(-Value) != -1;
+  }
+  bool isT2SOImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getT2SOImmVal(Value) != -1;
+  }
+  bool isT2SOImmNot() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ARM_AM::getT2SOImmVal(~Value) != -1;
+  }
+  bool isT2SOImmNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // Only use this when not representable as a plain so_imm.
+    return ARM_AM::getT2SOImmVal(Value) == -1 &&
+      ARM_AM::getT2SOImmVal(-Value) != -1;
+  }
+  bool isSetEndImm() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value == 1 || Value == 0;
+  }
+  bool isReg() const { return Kind == k_Register; }
+  bool isRegList() const { return Kind == k_RegisterList; }
+  bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
+  bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
+  bool isToken() const { return Kind == k_Token; }
+  bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
+  bool isMem() const { return Kind == k_Memory; }
+  bool isShifterImm() const { return Kind == k_ShifterImmediate; }
+  bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
+  bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
+  bool isRotImm() const { return Kind == k_RotateImmediate; }
+  bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
+  bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
+  bool isPostIdxReg() const {
+    return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift;
+  }
+  bool isMemNoOffset(bool alignOK = false) const {
+    if (!isMem())
+      return false;
+    // No offset of any kind.
+    return Memory.OffsetRegNum == 0 && Memory.OffsetImm == 0 &&
+     (alignOK || Memory.Alignment == 0);
+  }
+  bool isMemPCRelImm12() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Base register must be PC.
+    if (Memory.BaseRegNum != ARM::PC)
+      return false;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
+  }
+  bool isAlignedMemory() const {
+    return isMemNoOffset(true);
+  }
+  bool isAddrMode2() const {
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return true;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val > -4096 && Val < 4096;
+  }
+  bool isAM2OffsetImm() const {
+    if (!isImm()) return false;
+    // Immediate offset in range [-4095, 4095].
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return Val > -4096 && Val < 4096;
+  }
+  bool isAddrMode3() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // No shifts are legal for AM3.
+    if (Memory.ShiftType != ARM_AM::no_shift) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return true;
+    // Immediate offset in range [-255, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    // The #-0 offset is encoded as INT32_MIN, and we have to check 
+    // for this too.
+    return (Val > -256 && Val < 256) || Val == INT32_MIN;
+  }
+  bool isAM3Offset() const {
+    if (Kind != k_Immediate && Kind != k_PostIndexRegister)
+      return false;
+    if (Kind == k_PostIndexRegister)
+      return PostIdxReg.ShiftTy == ARM_AM::no_shift;
+    // Immediate offset in range [-255, 255].
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    // Special case, #-0 is INT32_MIN.
+    return (Val > -256 && Val < 256) || Val == INT32_MIN;
+  }
+  bool isAddrMode5() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.Alignment != 0) return false;
+    // Check for register offset.
+    if (Memory.OffsetRegNum) return false;
+    // Immediate offset in range [-1020, 1020] and a multiple of 4.
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
+      Val == INT32_MIN;
+  }
+  bool isMemTBB() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isMemTBH() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
+        Memory.Alignment != 0 )
+      return false;
+    return true;
+  }
+  bool isMemRegOffset() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
+      return false;
+    return true;
+  }
+  bool isT2MemRegOffset() const {
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.Alignment != 0)
+      return false;
+    // Only lsl #{0, 1, 2, 3} allowed.
+    if (Memory.ShiftType == ARM_AM::no_shift)
+      return true;
+    if (Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm > 3)
+      return false;
+    return true;
+  }
+  bool isMemThumbRR() const {
+    // Thumb reg+reg addressing is simple. Just two registers, a base and
+    // an offset. No shifts, negations or any other complicating factors.
+    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+      return false;
+    return isARMLowRegister(Memory.BaseRegNum) &&
+      (!Memory.OffsetRegNum || isARMLowRegister(Memory.OffsetRegNum));
+  }
+  bool isMemThumbRIs4() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 124].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 124 && (Val % 4) == 0;
+  }
+  bool isMemThumbRIs2() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 62].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 62 && (Val % 2) == 0;
+  }
+  bool isMemThumbRIs1() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 31].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 31;
+  }
+  bool isMemThumbSPI() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 ||
+        Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0)
+      return false;
+    // Immediate offset, multiple of 4 in range [0, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 1020 && (Val % 4) == 0;
+  }
+  bool isMemImm8s4Offset() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset a multiple of 4 in range [-1020, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    // Special case, #-0 is INT32_MIN.
+    return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) || Val == INT32_MIN;
+  }
+  bool isMemImm0_1020s4Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset a multiple of 4 in range [0, 1020].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val <= 1020 && (Val & 3) == 0;
+  }
+  bool isMemImm8Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Base reg of PC isn't allowed for these encodings.
+    if (Memory.BaseRegNum == ARM::PC) return false;
+    // Immediate offset in range [-255, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val == INT32_MIN) || (Val > -256 && Val < 256);
+  }
+  bool isMemPosImm8Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 255].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return Val >= 0 && Val < 256;
+  }
+  bool isMemNegImm8Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Base reg of PC isn't allowed for these encodings.
+    if (Memory.BaseRegNum == ARM::PC) return false;
+    // Immediate offset in range [-255, -1].
+    if (!Memory.OffsetImm) return false;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val == INT32_MIN) || (Val > -256 && Val < 0);
+  }
+  bool isMemUImm12Offset() const {
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [0, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val >= 0 && Val < 4096);
+  }
+  bool isMemImm12Offset() const {
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm() && !isa<MCConstantExpr>(getImm()))
+      return true;
+
+    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+    // Immediate offset in range [-4095, 4095].
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+    return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
+  }
+  bool isPostIdxImm8() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return (Val > -256 && Val < 256) || (Val == INT32_MIN);
+  }
+  bool isPostIdxImm8s4() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Val = CE->getValue();
+    return ((Val & 3) == 0 && Val >= -1020 && Val <= 1020) ||
+      (Val == INT32_MIN);
+  }
+
+  bool isMSRMask() const { return Kind == k_MSRMask; }
+  bool isProcIFlags() const { return Kind == k_ProcIFlags; }
+
+  // NEON operands.
+  bool isSingleSpacedVectorList() const {
+    return Kind == k_VectorList && !VectorList.isDoubleSpaced;
+  }
+  bool isDoubleSpacedVectorList() const {
+    return Kind == k_VectorList && VectorList.isDoubleSpaced;
+  }
+  bool isVecListOneD() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return VectorList.Count == 1;
+  }
+
+  bool isVecListDPair() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return (ARMMCRegisterClasses[ARM::DPairRegClassID]
+              .contains(VectorList.RegNum));
+  }
+
+  bool isVecListThreeD() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListFourD() const {
+    if (!isSingleSpacedVectorList()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isVecListDPairSpaced() const {
+    if (isSingleSpacedVectorList()) return false;
+    return (ARMMCRegisterClasses[ARM::DPairSpcRegClassID]
+              .contains(VectorList.RegNum));
+  }
+
+  bool isVecListThreeQ() const {
+    if (!isDoubleSpacedVectorList()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListFourQ() const {
+    if (!isDoubleSpacedVectorList()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isSingleSpacedVectorAllLanes() const {
+    return Kind == k_VectorListAllLanes && !VectorList.isDoubleSpaced;
+  }
+  bool isDoubleSpacedVectorAllLanes() const {
+    return Kind == k_VectorListAllLanes && VectorList.isDoubleSpaced;
+  }
+  bool isVecListOneDAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 1;
+  }
+
+  bool isVecListDPairAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return (ARMMCRegisterClasses[ARM::DPairRegClassID]
+              .contains(VectorList.RegNum));
+  }
+
+  bool isVecListDPairSpacedAllLanes() const {
+    if (!isDoubleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 2;
+  }
+
+  bool isVecListThreeDAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListThreeQAllLanes() const {
+    if (!isDoubleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 3;
+  }
+
+  bool isVecListFourDAllLanes() const {
+    if (!isSingleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isVecListFourQAllLanes() const {
+    if (!isDoubleSpacedVectorAllLanes()) return false;
+    return VectorList.Count == 4;
+  }
+
+  bool isSingleSpacedVectorIndexed() const {
+    return Kind == k_VectorListIndexed && !VectorList.isDoubleSpaced;
+  }
+  bool isDoubleSpacedVectorIndexed() const {
+    return Kind == k_VectorListIndexed && VectorList.isDoubleSpaced;
+  }
+  bool isVecListOneDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 1 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListOneDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 1 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListOneDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 1 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListTwoDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListTwoDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListTwoQWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListTwoQHWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListTwoDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 2 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListThreeDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListThreeDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListThreeQWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListThreeQHWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListThreeDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 3 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListFourDByteIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 7;
+  }
+
+  bool isVecListFourDHWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListFourQWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVecListFourQHWordIndexed() const {
+    if (!isDoubleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 3;
+  }
+
+  bool isVecListFourDWordIndexed() const {
+    if (!isSingleSpacedVectorIndexed()) return false;
+    return VectorList.Count == 4 && VectorList.LaneIndex <= 1;
+  }
+
+  bool isVectorIndex8() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 8;
+  }
+  bool isVectorIndex16() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 4;
+  }
+  bool isVectorIndex32() const {
+    if (Kind != k_VectorIndex) return false;
+    return VectorIndex.Val < 2;
+  }
+
+  bool isNEONi8splat() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // i8 value splatted across 8 bytes. The immediate is just the 8 byte
+    // value.
+    return Value >= 0 && Value < 256;
+  }
+
+  bool isNEONi16splat() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // i16 value in the range [0,255] or [0x0100, 0xff00]
+    return (Value >= 0 && Value < 256) || (Value >= 0x0100 && Value <= 0xff00);
+  }
+
+  bool isNEONi32splat() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X.
+    return (Value >= 0 && Value < 256) ||
+      (Value >= 0x0100 && Value <= 0xff00) ||
+      (Value >= 0x010000 && Value <= 0xff0000) ||
+      (Value >= 0x01000000 && Value <= 0xff000000);
+  }
+
+  bool isNEONi32vmov() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+    // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+    return (Value >= 0 && Value < 256) ||
+      (Value >= 0x0100 && Value <= 0xff00) ||
+      (Value >= 0x010000 && Value <= 0xff0000) ||
+      (Value >= 0x01000000 && Value <= 0xff000000) ||
+      (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
+      (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+  }
+  bool isNEONi32vmovNeg() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    int64_t Value = ~CE->getValue();
+    // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
+    // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
+    return (Value >= 0 && Value < 256) ||
+      (Value >= 0x0100 && Value <= 0xff00) ||
+      (Value >= 0x010000 && Value <= 0xff0000) ||
+      (Value >= 0x01000000 && Value <= 0xff000000) ||
+      (Value >= 0x01ff && Value <= 0xffff && (Value & 0xff) == 0xff) ||
+      (Value >= 0x01ffff && Value <= 0xffffff && (Value & 0xffff) == 0xffff);
+  }
+
+  bool isNEONi64splat() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE) return false;
+    uint64_t Value = CE->getValue();
+    // i64 value with each byte being either 0 or 0xff.
+    for (unsigned i = 0; i < 8; ++i)
+      if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff) return false;
+    return true;
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (Expr == 0)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode())));
+    unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR;
+    Inst.addOperand(MCOperand::CreateReg(RegNum));
+  }
+
+  void addCoprocNumOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getCoproc()));
+  }
+
+  void addCoprocRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getCoproc()));
+  }
+
+  void addCoprocOptionOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(CoprocOption.Val));
+  }
+
+  void addITMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(ITMask.Mask));
+  }
+
+  void addITCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode())));
+  }
+
+  void addCCOutOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addRegShiftedRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    assert(isRegShiftedReg() &&
+           "addRegShiftedRegOperands() on non RegShiftedReg!");
+    Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.SrcReg));
+    Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.ShiftReg));
+    Inst.addOperand(MCOperand::CreateImm(
+      ARM_AM::getSORegOpc(RegShiftedReg.ShiftTy, RegShiftedReg.ShiftImm)));
+  }
+
+  void addRegShiftedImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    assert(isRegShiftedImm() &&
+           "addRegShiftedImmOperands() on non RegShiftedImm!");
+    Inst.addOperand(MCOperand::CreateReg(RegShiftedImm.SrcReg));
+    // Shift of #32 is encoded as 0 where permitted
+    unsigned Imm = (RegShiftedImm.ShiftImm == 32 ? 0 : RegShiftedImm.ShiftImm);
+    Inst.addOperand(MCOperand::CreateImm(
+      ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, Imm)));
+  }
+
+  void addShifterImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm((ShifterImm.isASR << 5) |
+                                         ShifterImm.Imm));
+  }
+
+  void addRegListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ++I)
+      Inst.addOperand(MCOperand::CreateReg(*I));
+  }
+
+  void addDPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
+  void addSPRRegListOperands(MCInst &Inst, unsigned N) const {
+    addRegListOperands(Inst, N);
+  }
+
+  void addRotImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Encoded as val>>3. The printer handles display as 8, 16, 24.
+    Inst.addOperand(MCOperand::CreateImm(RotImm.Imm >> 3));
+  }
+
+  void addBitfieldOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // Munge the lsb/width into a bitfield mask.
+    unsigned lsb = Bitfield.LSB;
+    unsigned width = Bitfield.Width;
+    // Make a 32-bit mask w/ the referenced bits clear and all other bits set.
+    uint32_t Mask = ~(((uint32_t)0xffffffff >> lsb) << (32 - width) >>
+                      (32 - (lsb + width)));
+    Inst.addOperand(MCOperand::CreateImm(Mask));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addFBits16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(16 - CE->getValue()));
+  }
+
+  void addFBits32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(32 - CE->getValue()));
+  }
+
+  void addFPImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addImm8s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // FIXME: We really want to scale the value here, but the LDRD/STRD
+    // instruction don't encode operands that way yet.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+  }
+
+  void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4));
+  }
+
+  void addImm0_508s4NegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(-(CE->getValue() / 4)));
+  }
+
+  void addImm0_508s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate is scaled by four in the encoding and is stored
+    // in the MCInst as such. Lop off the low two bits here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4));
+  }
+
+  void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate-1, and we store in the instruction
+    // the bits as encoded, so subtract off one here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
+  }
+
+  void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate-1, and we store in the instruction
+    // the bits as encoded, so subtract off one here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
+  }
+
+  void addImmThumbSROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The constant encodes as the immediate, except for 32, which encodes as
+    // zero.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Imm = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm((Imm == 32 ? 0 : Imm)));
+  }
+
+  void addPKHASRImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // An ASR value of 32 encodes as 0, so that's how we want to add it to
+    // the instruction as well.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm(Val == 32 ? 0 : Val));
+  }
+
+  void addT2SOImmNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a t2_so_imm, but we have its bitwise
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(~CE->getValue()));
+  }
+
+  void addT2SOImmNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a t2_so_imm, but we have its
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(-CE->getValue()));
+  }
+
+  void addImm0_4095NegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually an imm0_4095, but we have its
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(-CE->getValue()));
+  }
+
+  void addARMSOImmNotOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a so_imm, but we have its bitwise
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(~CE->getValue()));
+  }
+
+  void addARMSOImmNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The operand is actually a so_imm, but we have its
+    // negation in the assembly source, so twiddle it here.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(-CE->getValue()));
+  }
+
+  void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt())));
+  }
+
+  void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+  }
+
+  void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    int32_t Imm = Memory.OffsetImm->getValue();
+    // FIXME: Handle #-0
+    if (Imm == INT32_MIN) Imm = 0;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(isImm() && "Not an immediate!");
+
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. 
+    if (!isa<MCConstantExpr>(getImm())) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      return;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    int Val = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAlignedMemoryOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Memory.Alignment));
+  }
+
+  void addAddrMode2Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    if (!Memory.OffsetRegNum) {
+      ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+      // Special case for #-0
+      if (Val == INT32_MIN) Val = 0;
+      if (Val < 0) Val = -Val;
+      Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+    } else {
+      // For register offset, we encode the shift type and negation flag
+      // here.
+      Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+                              Memory.ShiftImm, Memory.ShiftType);
+    }
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAM2OffsetImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant AM2OffsetImm operand!");
+    int32_t Val = CE->getValue();
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+    Inst.addOperand(MCOperand::CreateReg(0));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAddrMode3Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      Inst.addOperand(MCOperand::CreateReg(0));
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    if (!Memory.OffsetRegNum) {
+      ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+      // Special case for #-0
+      if (Val == INT32_MIN) Val = 0;
+      if (Val < 0) Val = -Val;
+      Val = ARM_AM::getAM3Opc(AddSub, Val);
+    } else {
+      // For register offset, we encode the shift type and negation flag
+      // here.
+      Val = ARM_AM::getAM3Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, 0);
+    }
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAM3OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    if (Kind == k_PostIndexRegister) {
+      int32_t Val =
+        ARM_AM::getAM3Opc(PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub, 0);
+      Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+      Inst.addOperand(MCOperand::CreateImm(Val));
+      return;
+    }
+
+    // Constant offset.
+    const MCConstantExpr *CE = static_cast<const MCConstantExpr*>(getImm());
+    int32_t Val = CE->getValue();
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM3Opc(AddSub, Val);
+    Inst.addOperand(MCOperand::CreateReg(0));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addAddrMode5Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    // The lower two bits are always zero and as such are not encoded.
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+    ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+    // Special case for #-0
+    if (Val == INT32_MIN) Val = 0;
+    if (Val < 0) Val = -Val;
+    Val = ARM_AM::getAM5Opc(AddSub, Val);
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If we have an immediate that's not a constant, treat it as a label
+    // reference needing a fixup. If it is a constant, it's something else
+    // and we reject it.
+    if (isImm()) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // The lower two bits are always zero and as such are not encoded.
+    int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    addMemImm8OffsetOperands(Inst, N);
+  }
+
+  void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+    addMemImm8OffsetOperands(Inst, N);
+  }
+
+  void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If this is an immediate, it's a label reference.
+    if (isImm()) {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    // Otherwise, it's a normal memory reg+offset.
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    // If this is an immediate, it's a label reference.
+    if (isImm()) {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::CreateImm(0));
+      return;
+    }
+
+    // Otherwise, it's a normal memory reg+offset.
+    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemTBBOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+  }
+
+  void addMemTBHOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+  }
+
+  void addMemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    unsigned Val =
+      ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+                        Memory.ShiftImm, Memory.ShiftType);
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addT2MemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Memory.ShiftImm));
+  }
+
+  void addMemThumbRROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+  }
+
+  void addMemThumbRIs4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemThumbRIs2Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 2) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemThumbRIs1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue()) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addMemThumbSPIOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+    Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::CreateImm(Val));
+  }
+
+  void addPostIdxImm8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant post-idx-imm8 operand!");
+    int Imm = CE->getValue();
+    bool isAdd = Imm >= 0;
+    if (Imm == INT32_MIN) Imm = 0;
+    Imm = (Imm < 0 ? -Imm : Imm) | (int)isAdd << 8;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addPostIdxImm8s4Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE && "non-constant post-idx-imm8s4 operand!");
+    int Imm = CE->getValue();
+    bool isAdd = Imm >= 0;
+    if (Imm == INT32_MIN) Imm = 0;
+    // Immediate is scaled by 4.
+    Imm = ((Imm < 0 ? -Imm : Imm) / 4) | (int)isAdd << 8;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addPostIdxRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+    Inst.addOperand(MCOperand::CreateImm(PostIdxReg.isAdd));
+  }
+
+  void addPostIdxRegShiftedOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+    // The sign, shift type, and shift amount are encoded in a single operand
+    // using the AM2 encoding helpers.
+    ARM_AM::AddrOpc opc = PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub;
+    unsigned Imm = ARM_AM::getAM2Opc(opc, PostIdxReg.ShiftImm,
+                                     PostIdxReg.ShiftTy);
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getMSRMask())));
+  }
+
+  void addProcIFlagsOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getProcIFlags())));
+  }
+
+  void addVecListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
+  }
+
+  void addVecListIndexedOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
+    Inst.addOperand(MCOperand::CreateImm(VectorList.LaneIndex));
+  }
+
+  void addVectorIndex8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndex16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addVectorIndex32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+  }
+
+  void addNEONi8splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    // Mask in that this is an i8 splat.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    Inst.addOperand(MCOperand::CreateImm(CE->getValue() | 0xe00));
+  }
+
+  void addNEONi16splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    if (Value >= 256)
+      Value = (Value >> 8) | 0xa00;
+    else
+      Value |= 0x800;
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONi32splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    if (Value >= 256 && Value <= 0xff00)
+      Value = (Value >> 8) | 0x200;
+    else if (Value > 0xffff && Value <= 0xff0000)
+      Value = (Value >> 16) | 0x400;
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    if (Value >= 256 && Value <= 0xffff)
+      Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+    else if (Value > 0xffff && Value <= 0xffffff)
+      Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = ~CE->getValue();
+    if (Value >= 256 && Value <= 0xffff)
+      Value = (Value >> 8) | ((Value & 0xff) ? 0xc00 : 0x200);
+    else if (Value > 0xffff && Value <= 0xffffff)
+      Value = (Value >> 16) | ((Value & 0xff) ? 0xd00 : 0x400);
+    else if (Value > 0xffffff)
+      Value = (Value >> 24) | 0x600;
+    Inst.addOperand(MCOperand::CreateImm(Value));
+  }
+
+  void addNEONi64splatOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+    unsigned Imm = 0;
+    for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
+      Imm |= (Value & 1) << i;
+    }
+    Inst.addOperand(MCOperand::CreateImm(Imm | 0x1e00));
+  }
+
+  virtual void print(raw_ostream &OS) const;
+
+  static ARMOperand *CreateITMask(unsigned Mask, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_ITCondMask);
+    Op->ITMask.Mask = Mask;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateCondCode(ARMCC::CondCodes CC, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_CondCode);
+    Op->CC.Val = CC;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateCoprocNum(unsigned CopVal, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_CoprocNum);
+    Op->Cop.Val = CopVal;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateCoprocReg(unsigned CopVal, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_CoprocReg);
+    Op->Cop.Val = CopVal;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_CoprocOption);
+    Op->Cop.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateCCOut(unsigned RegNum, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_CCOut);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateToken(StringRef Str, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_Register);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateShiftedRegister(ARM_AM::ShiftOpc ShTy,
+                                           unsigned SrcReg,
+                                           unsigned ShiftReg,
+                                           unsigned ShiftImm,
+                                           SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_ShiftedRegister);
+    Op->RegShiftedReg.ShiftTy = ShTy;
+    Op->RegShiftedReg.SrcReg = SrcReg;
+    Op->RegShiftedReg.ShiftReg = ShiftReg;
+    Op->RegShiftedReg.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy,
+                                            unsigned SrcReg,
+                                            unsigned ShiftImm,
+                                            SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_ShiftedImmediate);
+    Op->RegShiftedImm.ShiftTy = ShTy;
+    Op->RegShiftedImm.SrcReg = SrcReg;
+    Op->RegShiftedImm.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateShifterImm(bool isASR, unsigned Imm,
+                                   SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_ShifterImmediate);
+    Op->ShifterImm.isASR = isASR;
+    Op->ShifterImm.Imm = Imm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_RotateImmediate);
+    Op->RotImm.Imm = Imm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateBitfield(unsigned LSB, unsigned Width,
+                                    SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_BitfieldDescriptor);
+    Op->Bitfield.LSB = LSB;
+    Op->Bitfield.Width = Width;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *
+  CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs,
+                SMLoc StartLoc, SMLoc EndLoc) {
+    KindTy Kind = k_RegisterList;
+
+    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().first))
+      Kind = k_DPRRegisterList;
+    else if (ARMMCRegisterClasses[ARM::SPRRegClassID].
+             contains(Regs.front().first))
+      Kind = k_SPRRegisterList;
+
+    ARMOperand *Op = new ARMOperand(Kind);
+    for (SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator
+           I = Regs.begin(), E = Regs.end(); I != E; ++I)
+      Op->Registers.push_back(I->first);
+    array_pod_sort(Op->Registers.begin(), Op->Registers.end());
+    Op->StartLoc = StartLoc;
+    Op->EndLoc = EndLoc;
+    return Op;
+  }
+
+  static ARMOperand *CreateVectorList(unsigned RegNum, unsigned Count,
+                                      bool isDoubleSpaced, SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_VectorList);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.isDoubleSpaced = isDoubleSpaced;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateVectorListAllLanes(unsigned RegNum, unsigned Count,
+                                              bool isDoubleSpaced,
+                                              SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_VectorListAllLanes);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.isDoubleSpaced = isDoubleSpaced;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateVectorListIndexed(unsigned RegNum, unsigned Count,
+                                             unsigned Index,
+                                             bool isDoubleSpaced,
+                                             SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_VectorListIndexed);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.LaneIndex = Index;
+    Op->VectorList.isDoubleSpaced = isDoubleSpaced;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E,
+                                       MCContext &Ctx) {
+    ARMOperand *Op = new ARMOperand(k_VectorIndex);
+    Op->VectorIndex.Val = Idx;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateMem(unsigned BaseRegNum,
+                               const MCConstantExpr *OffsetImm,
+                               unsigned OffsetRegNum,
+                               ARM_AM::ShiftOpc ShiftType,
+                               unsigned ShiftImm,
+                               unsigned Alignment,
+                               bool isNegative,
+                               SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_Memory);
+    Op->Memory.BaseRegNum = BaseRegNum;
+    Op->Memory.OffsetImm = OffsetImm;
+    Op->Memory.OffsetRegNum = OffsetRegNum;
+    Op->Memory.ShiftType = ShiftType;
+    Op->Memory.ShiftImm = ShiftImm;
+    Op->Memory.Alignment = Alignment;
+    Op->Memory.isNegative = isNegative;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreatePostIdxReg(unsigned RegNum, bool isAdd,
+                                      ARM_AM::ShiftOpc ShiftTy,
+                                      unsigned ShiftImm,
+                                      SMLoc S, SMLoc E) {
+    ARMOperand *Op = new ARMOperand(k_PostIndexRegister);
+    Op->PostIdxReg.RegNum = RegNum;
+    Op->PostIdxReg.isAdd = isAdd;
+    Op->PostIdxReg.ShiftTy = ShiftTy;
+    Op->PostIdxReg.ShiftImm = ShiftImm;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static ARMOperand *CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_MemBarrierOpt);
+    Op->MBOpt.Val = Opt;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_ProcIFlags);
+    Op->IFlags.Val = IFlags;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static ARMOperand *CreateMSRMask(unsigned MMask, SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_MSRMask);
+    Op->MMask.Val = MMask;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+};
+
+} // end anonymous namespace.
+
+void ARMOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_CondCode:
+    OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
+    break;
+  case k_CCOut:
+    OS << "<ccout " << getReg() << ">";
+    break;
+  case k_ITCondMask: {
+    static const char *const MaskStr[] = {
+      "()", "(t)", "(e)", "(tt)", "(et)", "(te)", "(ee)", "(ttt)", "(ett)",
+      "(tet)", "(eet)", "(tte)", "(ete)", "(tee)", "(eee)"
+    };
+    assert((ITMask.Mask & 0xf) == ITMask.Mask);
+    OS << "<it-mask " << MaskStr[ITMask.Mask] << ">";
+    break;
+  }
+  case k_CoprocNum:
+    OS << "<coprocessor number: " << getCoproc() << ">";
+    break;
+  case k_CoprocReg:
+    OS << "<coprocessor register: " << getCoproc() << ">";
+    break;
+  case k_CoprocOption:
+    OS << "<coprocessor option: " << CoprocOption.Val << ">";
+    break;
+  case k_MSRMask:
+    OS << "<mask: " << getMSRMask() << ">";
+    break;
+  case k_Immediate:
+    getImm()->print(OS);
+    break;
+  case k_MemBarrierOpt:
+    OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">";
+    break;
+  case k_Memory:
+    OS << "<memory "
+       << " base:" << Memory.BaseRegNum;
+    OS << ">";
+    break;
+  case k_PostIndexRegister:
+    OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-")
+       << PostIdxReg.RegNum;
+    if (PostIdxReg.ShiftTy != ARM_AM::no_shift)
+      OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " "
+         << PostIdxReg.ShiftImm;
+    OS << ">";
+    break;
+  case k_ProcIFlags: {
+    OS << "<ARM_PROC::";
+    unsigned IFlags = getProcIFlags();
+    for (int i=2; i >= 0; --i)
+      if (IFlags & (1 << i))
+        OS << ARM_PROC::IFlagsToString(1 << i);
+    OS << ">";
+    break;
+  }
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_ShifterImmediate:
+    OS << "<shift " << (ShifterImm.isASR ? "asr" : "lsl")
+       << " #" << ShifterImm.Imm << ">";
+    break;
+  case k_ShiftedRegister:
+    OS << "<so_reg_reg "
+       << RegShiftedReg.SrcReg << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy)
+       << " " << RegShiftedReg.ShiftReg << ">";
+    break;
+  case k_ShiftedImmediate:
+    OS << "<so_reg_imm "
+       << RegShiftedImm.SrcReg << " "
+       << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy)
+       << " #" << RegShiftedImm.ShiftImm << ">";
+    break;
+  case k_RotateImmediate:
+    OS << "<ror " << " #" << (RotImm.Imm * 8) << ">";
+    break;
+  case k_BitfieldDescriptor:
+    OS << "<bitfield " << "lsb: " << Bitfield.LSB
+       << ", width: " << Bitfield.Width << ">";
+    break;
+  case k_RegisterList:
+  case k_DPRRegisterList:
+  case k_SPRRegisterList: {
+    OS << "<register_list ";
+
+    const SmallVectorImpl<unsigned> &RegList = getRegList();
+    for (SmallVectorImpl<unsigned>::const_iterator
+           I = RegList.begin(), E = RegList.end(); I != E; ) {
+      OS << *I;
+      if (++I < E) OS << ", ";
+    }
+
+    OS << ">";
+    break;
+  }
+  case k_VectorList:
+    OS << "<vector_list " << VectorList.Count << " * "
+       << VectorList.RegNum << ">";
+    break;
+  case k_VectorListAllLanes:
+    OS << "<vector_list(all lanes) " << VectorList.Count << " * "
+       << VectorList.RegNum << ">";
+    break;
+  case k_VectorListIndexed:
+    OS << "<vector_list(lane " << VectorList.LaneIndex << ") "
+       << VectorList.Count << " * " << VectorList.RegNum << ">";
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+bool ARMAsmParser::ParseRegister(unsigned &RegNo,
+                                 SMLoc &StartLoc, SMLoc &EndLoc) {
+  StartLoc = Parser.getTok().getLoc();
+  EndLoc = Parser.getTok().getEndLoc();
+  RegNo = tryParseRegister();
+
+  return (RegNo == (unsigned)-1);
+}
+
+/// Try to parse a register name.  The token must be an Identifier when called,
+/// and if it is a register name the token is eaten and the register number is
+/// returned.  Otherwise return -1.
+///
+int ARMAsmParser::tryParseRegister() {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) return -1;
+
+  std::string lowerCase = Tok.getString().lower();
+  unsigned RegNum = MatchRegisterName(lowerCase);
+  if (!RegNum) {
+    RegNum = StringSwitch<unsigned>(lowerCase)
+      .Case("r13", ARM::SP)
+      .Case("r14", ARM::LR)
+      .Case("r15", ARM::PC)
+      .Case("ip", ARM::R12)
+      // Additional register name aliases for 'gas' compatibility.
+      .Case("a1", ARM::R0)
+      .Case("a2", ARM::R1)
+      .Case("a3", ARM::R2)
+      .Case("a4", ARM::R3)
+      .Case("v1", ARM::R4)
+      .Case("v2", ARM::R5)
+      .Case("v3", ARM::R6)
+      .Case("v4", ARM::R7)
+      .Case("v5", ARM::R8)
+      .Case("v6", ARM::R9)
+      .Case("v7", ARM::R10)
+      .Case("v8", ARM::R11)
+      .Case("sb", ARM::R9)
+      .Case("sl", ARM::R10)
+      .Case("fp", ARM::R11)
+      .Default(0);
+  }
+  if (!RegNum) {
+    // Check for aliases registered via .req. Canonicalize to lower case.
+    // That's more consistent since register names are case insensitive, and
+    // it's how the original entry was passed in from MC/MCParser/AsmParser.
+    StringMap<unsigned>::const_iterator Entry = RegisterReqs.find(lowerCase);
+    // If no match, return failure.
+    if (Entry == RegisterReqs.end())
+      return -1;
+    Parser.Lex(); // Eat identifier token.
+    return Entry->getValue();
+  }
+
+  Parser.Lex(); // Eat identifier token.
+
+  return RegNum;
+}
+
+// Try to parse a shifter  (e.g., "lsl <amt>"). On success, return 0.
+// If a recoverable error occurs, return 1. If an irrecoverable error
+// occurs, return -1. An irrecoverable error is one where tokens have been
+// consumed in the process of trying to parse the shifter (i.e., when it is
+// indeed a shifter operand, but malformed).
+int ARMAsmParser::tryParseShiftRegister(
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  std::string lowerCase = Tok.getString().lower();
+  ARM_AM::ShiftOpc ShiftTy = StringSwitch<ARM_AM::ShiftOpc>(lowerCase)
+      .Case("asl", ARM_AM::lsl)
+      .Case("lsl", ARM_AM::lsl)
+      .Case("lsr", ARM_AM::lsr)
+      .Case("asr", ARM_AM::asr)
+      .Case("ror", ARM_AM::ror)
+      .Case("rrx", ARM_AM::rrx)
+      .Default(ARM_AM::no_shift);
+
+  if (ShiftTy == ARM_AM::no_shift)
+    return 1;
+
+  Parser.Lex(); // Eat the operator.
+
+  // The source register for the shift has already been added to the
+  // operand list, so we need to pop it off and combine it into the shifted
+  // register operand instead.
+  OwningPtr<ARMOperand> PrevOp((ARMOperand*)Operands.pop_back_val());
+  if (!PrevOp->isReg())
+    return Error(PrevOp->getStartLoc(), "shift must be of a register");
+  int SrcReg = PrevOp->getReg();
+
+  SMLoc EndLoc;
+  int64_t Imm = 0;
+  int ShiftReg = 0;
+  if (ShiftTy == ARM_AM::rrx) {
+    // RRX Doesn't have an explicit shift amount. The encoder expects
+    // the shift register to be the same as the source register. Seems odd,
+    // but OK.
+    ShiftReg = SrcReg;
+  } else {
+    // Figure out if this is shifted by a constant or a register (for non-RRX).
+    if (Parser.getTok().is(AsmToken::Hash) ||
+        Parser.getTok().is(AsmToken::Dollar)) {
+      Parser.Lex(); // Eat hash.
+      SMLoc ImmLoc = Parser.getTok().getLoc();
+      const MCExpr *ShiftExpr = 0;
+      if (getParser().parseExpression(ShiftExpr, EndLoc)) {
+        Error(ImmLoc, "invalid immediate shift value");
+        return -1;
+      }
+      // The expression must be evaluatable as an immediate.
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftExpr);
+      if (!CE) {
+        Error(ImmLoc, "invalid immediate shift value");
+        return -1;
+      }
+      // Range check the immediate.
+      // lsl, ror: 0 <= imm <= 31
+      // lsr, asr: 0 <= imm <= 32
+      Imm = CE->getValue();
+      if (Imm < 0 ||
+          ((ShiftTy == ARM_AM::lsl || ShiftTy == ARM_AM::ror) && Imm > 31) ||
+          ((ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr) && Imm > 32)) {
+        Error(ImmLoc, "immediate shift value out of range");
+        return -1;
+      }
+      // shift by zero is a nop. Always send it through as lsl.
+      // ('as' compatibility)
+      if (Imm == 0)
+        ShiftTy = ARM_AM::lsl;
+    } else if (Parser.getTok().is(AsmToken::Identifier)) {
+      SMLoc L = Parser.getTok().getLoc();
+      EndLoc = Parser.getTok().getEndLoc();
+      ShiftReg = tryParseRegister();
+      if (ShiftReg == -1) {
+        Error (L, "expected immediate or register in shift operand");
+        return -1;
+      }
+    } else {
+      Error (Parser.getTok().getLoc(),
+                    "expected immediate or register in shift operand");
+      return -1;
+    }
+  }
+
+  if (ShiftReg && ShiftTy != ARM_AM::rrx)
+    Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg,
+                                                         ShiftReg, Imm,
+                                                         S, EndLoc));
+  else
+    Operands.push_back(ARMOperand::CreateShiftedImmediate(ShiftTy, SrcReg, Imm,
+                                                          S, EndLoc));
+
+  return 0;
+}
+
+
+/// Try to parse a register name.  The token must be an Identifier when called.
+/// If it's a register, an AsmOperand is created. Another AsmOperand is created
+/// if there is a "writeback". 'true' if it's not a register.
+///
+/// TODO this is likely to change to allow different register types and or to
+/// parse for a specific register type.
+bool ARMAsmParser::
+tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  const AsmToken &RegTok = Parser.getTok();
+  int RegNo = tryParseRegister();
+  if (RegNo == -1)
+    return true;
+
+  Operands.push_back(ARMOperand::CreateReg(RegNo, RegTok.getLoc(),
+                                           RegTok.getEndLoc()));
+
+  const AsmToken &ExclaimTok = Parser.getTok();
+  if (ExclaimTok.is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(),
+                                               ExclaimTok.getLoc()));
+    Parser.Lex(); // Eat exclaim token
+    return false;
+  }
+
+  // Also check for an index operand. This is only legal for vector registers,
+  // but that'll get caught OK in operand matching, so we don't need to
+  // explicitly filter everything else out here.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return true;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE)
+      return TokError("immediate value expected for vector index");
+
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+
+    SMLoc E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(),
+                                                     SIdx, E,
+                                                     getContext()));
+  }
+
+  return false;
+}
+
+/// MatchCoprocessorOperandName - Try to parse an coprocessor related
+/// instruction with a symbolic operand name. Example: "p1", "p7", "c3",
+/// "c5", ...
+static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
+  // Use the same layout as the tablegen'erated register name matcher. Ugly,
+  // but efficient.
+  switch (Name.size()) {
+  default: return -1;
+  case 2:
+    if (Name[0] != CoprocOp)
+      return -1;
+    switch (Name[1]) {
+    default:  return -1;
+    case '0': return 0;
+    case '1': return 1;
+    case '2': return 2;
+    case '3': return 3;
+    case '4': return 4;
+    case '5': return 5;
+    case '6': return 6;
+    case '7': return 7;
+    case '8': return 8;
+    case '9': return 9;
+    }
+  case 3:
+    if (Name[0] != CoprocOp || Name[1] != '1')
+      return -1;
+    switch (Name[2]) {
+    default:  return -1;
+    case '0': return 10;
+    case '1': return 11;
+    case '2': return 12;
+    case '3': return 13;
+    case '4': return 14;
+    case '5': return 15;
+    }
+  }
+}
+
+/// parseITCondCode - Try to parse a condition code for an IT instruction.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  unsigned CC = StringSwitch<unsigned>(Tok.getString().lower())
+    .Case("eq", ARMCC::EQ)
+    .Case("ne", ARMCC::NE)
+    .Case("hs", ARMCC::HS)
+    .Case("cs", ARMCC::HS)
+    .Case("lo", ARMCC::LO)
+    .Case("cc", ARMCC::LO)
+    .Case("mi", ARMCC::MI)
+    .Case("pl", ARMCC::PL)
+    .Case("vs", ARMCC::VS)
+    .Case("vc", ARMCC::VC)
+    .Case("hi", ARMCC::HI)
+    .Case("ls", ARMCC::LS)
+    .Case("ge", ARMCC::GE)
+    .Case("lt", ARMCC::LT)
+    .Case("gt", ARMCC::GT)
+    .Case("le", ARMCC::LE)
+    .Case("al", ARMCC::AL)
+    .Default(~0U);
+  if (CC == ~0U)
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the token.
+
+  Operands.push_back(ARMOperand::CreateCondCode(ARMCC::CondCodes(CC), S));
+
+  return MatchOperand_Success;
+}
+
+/// parseCoprocNumOperand - Try to parse an coprocessor number operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int Num = MatchCoprocessorOperandName(Tok.getString(), 'p');
+  if (Num == -1)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocNum(Num, S));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocRegOperand - Try to parse an coprocessor register operand. The
+/// token must be an Identifier when called, and if it is a coprocessor
+/// number, the token is eaten and the operand is added to the operand list.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c');
+  if (Reg == -1)
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateCoprocReg(Reg, S));
+  return MatchOperand_Success;
+}
+
+/// parseCoprocOptionOperand - Try to parse an coprocessor option operand.
+/// coproc_option : '{' imm0_255 '}'
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseCoprocOptionOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+
+  // If this isn't a '{', this isn't a coprocessor immediate operand.
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the '{'
+
+  const MCExpr *Expr;
+  SMLoc Loc = Parser.getTok().getLoc();
+  if (getParser().parseExpression(Expr)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+  if (!CE || CE->getValue() < 0 || CE->getValue() > 255) {
+    Error(Loc, "coprocessor option must be an immediate in range [0, 255]");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+
+  // Check for and consume the closing '}'
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return MatchOperand_ParseFail;
+  SMLoc E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat the '}'
+
+  Operands.push_back(ARMOperand::CreateCoprocOption(Val, S, E));
+  return MatchOperand_Success;
+}
+
+// For register list parsing, we need to map from raw GPR register numbering
+// to the enumeration values. The enumeration values aren't sorted by
+// register number due to our using "sp", "lr" and "pc" as canonical names.
+static unsigned getNextRegister(unsigned Reg) {
+  // If this is a GPR, we need to do it manually, otherwise we can rely
+  // on the sort ordering of the enumeration since the other reg-classes
+  // are sane.
+  if (!ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+    return Reg + 1;
+  switch(Reg) {
+  default: llvm_unreachable("Invalid GPR number!");
+  case ARM::R0:  return ARM::R1;  case ARM::R1:  return ARM::R2;
+  case ARM::R2:  return ARM::R3;  case ARM::R3:  return ARM::R4;
+  case ARM::R4:  return ARM::R5;  case ARM::R5:  return ARM::R6;
+  case ARM::R6:  return ARM::R7;  case ARM::R7:  return ARM::R8;
+  case ARM::R8:  return ARM::R9;  case ARM::R9:  return ARM::R10;
+  case ARM::R10: return ARM::R11; case ARM::R11: return ARM::R12;
+  case ARM::R12: return ARM::SP;  case ARM::SP:  return ARM::LR;
+  case ARM::LR:  return ARM::PC;  case ARM::PC:  return ARM::R0;
+  }
+}
+
+// Return the low-subreg of a given Q register.
+static unsigned getDRegFromQReg(unsigned QReg) {
+  switch (QReg) {
+  default: llvm_unreachable("expected a Q register!");
+  case ARM::Q0:  return ARM::D0;
+  case ARM::Q1:  return ARM::D2;
+  case ARM::Q2:  return ARM::D4;
+  case ARM::Q3:  return ARM::D6;
+  case ARM::Q4:  return ARM::D8;
+  case ARM::Q5:  return ARM::D10;
+  case ARM::Q6:  return ARM::D12;
+  case ARM::Q7:  return ARM::D14;
+  case ARM::Q8:  return ARM::D16;
+  case ARM::Q9:  return ARM::D18;
+  case ARM::Q10: return ARM::D20;
+  case ARM::Q11: return ARM::D22;
+  case ARM::Q12: return ARM::D24;
+  case ARM::Q13: return ARM::D26;
+  case ARM::Q14: return ARM::D28;
+  case ARM::Q15: return ARM::D30;
+  }
+}
+
+/// Parse a register list.
+bool ARMAsmParser::
+parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  assert(Parser.getTok().is(AsmToken::LCurly) &&
+         "Token is not a Left Curly Brace");
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '{' token.
+  SMLoc RegLoc = Parser.getTok().getLoc();
+
+  // Check the first register in the list to see what register class
+  // this is a list of.
+  int Reg = tryParseRegister();
+  if (Reg == -1)
+    return Error(RegLoc, "register expected");
+
+  // The reglist instructions have at most 16 registers, so reserve
+  // space for that many.
+  SmallVector<std::pair<unsigned, SMLoc>, 16> Registers;
+
+  // Allow Q regs and just interpret them as the two D sub-registers.
+  if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+    Reg = getDRegFromQReg(Reg);
+    Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+    ++Reg;
+  }
+  const MCRegisterClass *RC;
+  if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::GPRRegClassID];
+  else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::DPRRegClassID];
+  else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg))
+    RC = &ARMMCRegisterClasses[ARM::SPRRegClassID];
+  else
+    return Error(RegLoc, "invalid register in register list");
+
+  // Store the register.
+  Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+
+  // This starts immediately after the first register token in the list,
+  // so we can see either a comma or a minus (range separator) as a legal
+  // next token.
+  while (Parser.getTok().is(AsmToken::Comma) ||
+         Parser.getTok().is(AsmToken::Minus)) {
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      Parser.Lex(); // Eat the minus.
+      SMLoc AfterMinusLoc = Parser.getTok().getLoc();
+      int EndReg = tryParseRegister();
+      if (EndReg == -1)
+        return Error(AfterMinusLoc, "register expected");
+      // Allow Q regs and just interpret them as the two D sub-registers.
+      if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
+        EndReg = getDRegFromQReg(EndReg) + 1;
+      // If the register is the same as the start reg, there's nothing
+      // more to do.
+      if (Reg == EndReg)
+        continue;
+      // The register must be in the same register class as the first.
+      if (!RC->contains(EndReg))
+        return Error(AfterMinusLoc, "invalid register in register list");
+      // Ranges must go from low to high.
+      if (MRI->getEncodingValue(Reg) > MRI->getEncodingValue(EndReg))
+        return Error(AfterMinusLoc, "bad range in register list");
+
+      // Add all the registers in the range to the register list.
+      while (Reg != EndReg) {
+        Reg = getNextRegister(Reg);
+        Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+      }
+      continue;
+    }
+    Parser.Lex(); // Eat the comma.
+    RegLoc = Parser.getTok().getLoc();
+    int OldReg = Reg;
+    const AsmToken RegTok = Parser.getTok();
+    Reg = tryParseRegister();
+    if (Reg == -1)
+      return Error(RegLoc, "register expected");
+    // Allow Q regs and just interpret them as the two D sub-registers.
+    bool isQReg = false;
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      Reg = getDRegFromQReg(Reg);
+      isQReg = true;
+    }
+    // The register must be in the same register class as the first.
+    if (!RC->contains(Reg))
+      return Error(RegLoc, "invalid register in register list");
+    // List must be monotonically increasing.
+    if (MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) {
+      if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+        Warning(RegLoc, "register list not in ascending order");
+      else
+        return Error(RegLoc, "register list not in ascending order");
+    }
+    if (MRI->getEncodingValue(Reg) == MRI->getEncodingValue(OldReg)) {
+      Warning(RegLoc, "duplicated register (" + RegTok.getString() +
+              ") in register list");
+      continue;
+    }
+    // VFP register lists must also be contiguous.
+    // It's OK to use the enumeration values directly here rather, as the
+    // VFP register classes have the enum sorted properly.
+    if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
+        Reg != OldReg + 1)
+      return Error(RegLoc, "non-contiguous register range");
+    Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+    if (isQReg)
+      Registers.push_back(std::pair<unsigned, SMLoc>(++Reg, RegLoc));
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return Error(Parser.getTok().getLoc(), "'}' expected");
+  SMLoc E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  // Push the register list operand.
+  Operands.push_back(ARMOperand::CreateRegList(Registers, S, E));
+
+  // The ARM system instruction variants for LDM/STM have a '^' token here.
+  if (Parser.getTok().is(AsmToken::Caret)) {
+    Operands.push_back(ARMOperand::CreateToken("^",Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat '^' token.
+  }
+
+  return false;
+}
+
+// Helper function to parse the lane index for vector lists.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) {
+  Index = 0; // Always return a defined index value.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    Parser.Lex(); // Eat the '['.
+    if (Parser.getTok().is(AsmToken::RBrac)) {
+      // "Dn[]" is the 'all lanes' syntax.
+      LaneKind = AllLanes;
+      EndLoc = Parser.getTok().getEndLoc();
+      Parser.Lex(); // Eat the ']'.
+      return MatchOperand_Success;
+    }
+
+    // There's an optional '#' token here. Normally there wouldn't be, but
+    // inline assemble puts one in, and it's friendly to accept that.
+    if (Parser.getTok().is(AsmToken::Hash))
+      Parser.Lex(); // Eat the '#'
+
+    const MCExpr *LaneIndex;
+    SMLoc Loc = Parser.getTok().getLoc();
+    if (getParser().parseExpression(LaneIndex)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LaneIndex);
+    if (!CE) {
+      Error(Loc, "lane index must be empty or an integer");
+      return MatchOperand_ParseFail;
+    }
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(Parser.getTok().getLoc(), "']' expected");
+      return MatchOperand_ParseFail;
+    }
+    EndLoc = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat the ']'.
+    int64_t Val = CE->getValue();
+
+    // FIXME: Make this range check context sensitive for .8, .16, .32.
+    if (Val < 0 || Val > 7) {
+      Error(Parser.getTok().getLoc(), "lane index out of range");
+      return MatchOperand_ParseFail;
+    }
+    Index = Val;
+    LaneKind = IndexedLane;
+    return MatchOperand_Success;
+  }
+  LaneKind = NoLanes;
+  return MatchOperand_Success;
+}
+
+// parse a vector register list
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  VectorLaneTy LaneKind;
+  unsigned LaneIndex;
+  SMLoc S = Parser.getTok().getLoc();
+  // As an extension (to match gas), support a plain D register or Q register
+  // (without encosing curly braces) as a single or double entry list,
+  // respectively.
+  if (Parser.getTok().is(AsmToken::Identifier)) {
+    SMLoc E = Parser.getTok().getEndLoc();
+    int Reg = tryParseRegister();
+    if (Reg == -1)
+      return MatchOperand_NoMatch;
+    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) {
+      OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E);
+      if (Res != MatchOperand_Success)
+        return Res;
+      switch (LaneKind) {
+      case NoLanes:
+        Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, false, S, E));
+        break;
+      case AllLanes:
+        Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 1, false,
+                                                                S, E));
+        break;
+      case IndexedLane:
+        Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 1,
+                                                               LaneIndex,
+                                                               false, S, E));
+        break;
+      }
+      return MatchOperand_Success;
+    }
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      Reg = getDRegFromQReg(Reg);
+      OperandMatchResultTy Res = parseVectorLane(LaneKind, LaneIndex, E);
+      if (Res != MatchOperand_Success)
+        return Res;
+      switch (LaneKind) {
+      case NoLanes:
+        Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
+                                   &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+        Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, false, S, E));
+        break;
+      case AllLanes:
+        Reg = MRI->getMatchingSuperReg(Reg, ARM::dsub_0,
+                                   &ARMMCRegisterClasses[ARM::DPairRegClassID]);
+        Operands.push_back(ARMOperand::CreateVectorListAllLanes(Reg, 2, false,
+                                                                S, E));
+        break;
+      case IndexedLane:
+        Operands.push_back(ARMOperand::CreateVectorListIndexed(Reg, 2,
+                                                               LaneIndex,
+                                                               false, S, E));
+        break;
+      }
+      return MatchOperand_Success;
+    }
+    Error(S, "vector register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  if (Parser.getTok().isNot(AsmToken::LCurly))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat '{' token.
+  SMLoc RegLoc = Parser.getTok().getLoc();
+
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    Error(RegLoc, "register expected");
+    return MatchOperand_ParseFail;
+  }
+  unsigned Count = 1;
+  int Spacing = 0;
+  unsigned FirstReg = Reg;
+  // The list is of D registers, but we also allow Q regs and just interpret
+  // them as the two D sub-registers.
+  if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+    FirstReg = Reg = getDRegFromQReg(Reg);
+    Spacing = 1; // double-spacing requires explicit D registers, otherwise
+                 // it's ambiguous with four-register single spaced.
+    ++Reg;
+    ++Count;
+  }
+
+  SMLoc E;
+  if (parseVectorLane(LaneKind, LaneIndex, E) != MatchOperand_Success)
+    return MatchOperand_ParseFail;
+
+  while (Parser.getTok().is(AsmToken::Comma) ||
+         Parser.getTok().is(AsmToken::Minus)) {
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      if (!Spacing)
+        Spacing = 1; // Register range implies a single spaced list.
+      else if (Spacing == 2) {
+        Error(Parser.getTok().getLoc(),
+              "sequential registers in double spaced list");
+        return MatchOperand_ParseFail;
+      }
+      Parser.Lex(); // Eat the minus.
+      SMLoc AfterMinusLoc = Parser.getTok().getLoc();
+      int EndReg = tryParseRegister();
+      if (EndReg == -1) {
+        Error(AfterMinusLoc, "register expected");
+        return MatchOperand_ParseFail;
+      }
+      // Allow Q regs and just interpret them as the two D sub-registers.
+      if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg))
+        EndReg = getDRegFromQReg(EndReg) + 1;
+      // If the register is the same as the start reg, there's nothing
+      // more to do.
+      if (Reg == EndReg)
+        continue;
+      // The register must be in the same register class as the first.
+      if (!ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg)) {
+        Error(AfterMinusLoc, "invalid register in register list");
+        return MatchOperand_ParseFail;
+      }
+      // Ranges must go from low to high.
+      if (Reg > EndReg) {
+        Error(AfterMinusLoc, "bad range in register list");
+        return MatchOperand_ParseFail;
+      }
+      // Parse the lane specifier if present.
+      VectorLaneTy NextLaneKind;
+      unsigned NextLaneIndex;
+      if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+          MatchOperand_Success)
+        return MatchOperand_ParseFail;
+      if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+        Error(AfterMinusLoc, "mismatched lane index in register list");
+        return MatchOperand_ParseFail;
+      }
+
+      // Add all the registers in the range to the register list.
+      Count += EndReg - Reg;
+      Reg = EndReg;
+      continue;
+    }
+    Parser.Lex(); // Eat the comma.
+    RegLoc = Parser.getTok().getLoc();
+    int OldReg = Reg;
+    Reg = tryParseRegister();
+    if (Reg == -1) {
+      Error(RegLoc, "register expected");
+      return MatchOperand_ParseFail;
+    }
+    // vector register lists must be contiguous.
+    // It's OK to use the enumeration values directly here rather, as the
+    // VFP register classes have the enum sorted properly.
+    //
+    // The list is of D registers, but we also allow Q regs and just interpret
+    // them as the two D sub-registers.
+    if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
+      if (!Spacing)
+        Spacing = 1; // Register range implies a single spaced list.
+      else if (Spacing == 2) {
+        Error(RegLoc,
+              "invalid register in double-spaced list (must be 'D' register')");
+        return MatchOperand_ParseFail;
+      }
+      Reg = getDRegFromQReg(Reg);
+      if (Reg != OldReg + 1) {
+        Error(RegLoc, "non-contiguous register range");
+        return MatchOperand_ParseFail;
+      }
+      ++Reg;
+      Count += 2;
+      // Parse the lane specifier if present.
+      VectorLaneTy NextLaneKind;
+      unsigned NextLaneIndex;
+      SMLoc LaneLoc = Parser.getTok().getLoc();
+      if (parseVectorLane(NextLaneKind, NextLaneIndex, E) !=
+          MatchOperand_Success)
+        return MatchOperand_ParseFail;
+      if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+        Error(LaneLoc, "mismatched lane index in register list");
+        return MatchOperand_ParseFail;
+      }
+      continue;
+    }
+    // Normal D register.
+    // Figure out the register spacing (single or double) of the list if
+    // we don't know it already.
+    if (!Spacing)
+      Spacing = 1 + (Reg == OldReg + 2);
+
+    // Just check that it's contiguous and keep going.
+    if (Reg != OldReg + Spacing) {
+      Error(RegLoc, "non-contiguous register range");
+      return MatchOperand_ParseFail;
+    }
+    ++Count;
+    // Parse the lane specifier if present.
+    VectorLaneTy NextLaneKind;
+    unsigned NextLaneIndex;
+    SMLoc EndLoc = Parser.getTok().getLoc();
+    if (parseVectorLane(NextLaneKind, NextLaneIndex, E) != MatchOperand_Success)
+      return MatchOperand_ParseFail;
+    if (NextLaneKind != LaneKind || LaneIndex != NextLaneIndex) {
+      Error(EndLoc, "mismatched lane index in register list");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly)) {
+    Error(Parser.getTok().getLoc(), "'}' expected");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  switch (LaneKind) {
+  case NoLanes:
+    // Two-register operands have been converted to the
+    // composite register classes.
+    if (Count == 2) {
+      const MCRegisterClass *RC = (Spacing == 1) ?
+        &ARMMCRegisterClasses[ARM::DPairRegClassID] :
+        &ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
+      FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
+    }
+
+    Operands.push_back(ARMOperand::CreateVectorList(FirstReg, Count,
+                                                    (Spacing == 2), S, E));
+    break;
+  case AllLanes:
+    // Two-register operands have been converted to the
+    // composite register classes.
+    if (Count == 2) {
+      const MCRegisterClass *RC = (Spacing == 1) ?
+        &ARMMCRegisterClasses[ARM::DPairRegClassID] :
+        &ARMMCRegisterClasses[ARM::DPairSpcRegClassID];
+      FirstReg = MRI->getMatchingSuperReg(FirstReg, ARM::dsub_0, RC);
+    }
+    Operands.push_back(ARMOperand::CreateVectorListAllLanes(FirstReg, Count,
+                                                            (Spacing == 2),
+                                                            S, E));
+    break;
+  case IndexedLane:
+    Operands.push_back(ARMOperand::CreateVectorListIndexed(FirstReg, Count,
+                                                           LaneIndex,
+                                                           (Spacing == 2),
+                                                           S, E));
+    break;
+  }
+  return MatchOperand_Success;
+}
+
+/// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  unsigned Opt;
+
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef OptStr = Tok.getString();
+
+    Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower())
+      .Case("sy",    ARM_MB::SY)
+      .Case("st",    ARM_MB::ST)
+      .Case("sh",    ARM_MB::ISH)
+      .Case("ish",   ARM_MB::ISH)
+      .Case("shst",  ARM_MB::ISHST)
+      .Case("ishst", ARM_MB::ISHST)
+      .Case("nsh",   ARM_MB::NSH)
+      .Case("un",    ARM_MB::NSH)
+      .Case("nshst", ARM_MB::NSHST)
+      .Case("unst",  ARM_MB::NSHST)
+      .Case("osh",   ARM_MB::OSH)
+      .Case("oshst", ARM_MB::OSHST)
+      .Default(~0U);
+
+    if (Opt == ~0U)
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+  } else if (Tok.is(AsmToken::Hash) ||
+             Tok.is(AsmToken::Dollar) ||
+             Tok.is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat the '#'.
+    SMLoc Loc = Parser.getTok().getLoc();
+
+    const MCExpr *MemBarrierID;
+    if (getParser().parseExpression(MemBarrierID)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+    
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(MemBarrierID);
+    if (!CE) {
+      Error(Loc, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+
+    int Val = CE->getValue();
+    if (Val & ~0xf) {
+      Error(Loc, "immediate value out of range");
+      return MatchOperand_ParseFail;
+    }
+
+    Opt = ARM_MB::RESERVED_0 + Val;
+  } else
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(ARMOperand::CreateMemBarrierOpt((ARM_MB::MemBOpt)Opt, S));
+  return MatchOperand_Success;
+}
+
+/// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier)) 
+    return MatchOperand_NoMatch;
+  StringRef IFlagsStr = Tok.getString();
+
+  // An iflags string of "none" is interpreted to mean that none of the AIF
+  // bits are set.  Not a terribly useful instruction, but a valid encoding.
+  unsigned IFlags = 0;
+  if (IFlagsStr != "none") {
+        for (int i = 0, e = IFlagsStr.size(); i != e; ++i) {
+      unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1))
+        .Case("a", ARM_PROC::A)
+        .Case("i", ARM_PROC::I)
+        .Case("f", ARM_PROC::F)
+        .Default(~0U);
+
+      // If some specific iflag is already set, it means that some letter is
+      // present more than once, this is not acceptable.
+      if (Flag == ~0U || (IFlags & Flag))
+        return MatchOperand_NoMatch;
+
+      IFlags |= Flag;
+    }
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateProcIFlags((ARM_PROC::IFlags)IFlags, S));
+  return MatchOperand_Success;
+}
+
+/// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef Mask = Tok.getString();
+
+  if (isMClass()) {
+    // See ARMv6-M 10.1.1
+    std::string Name = Mask.lower();
+    unsigned FlagsVal = StringSwitch<unsigned>(Name)
+      // Note: in the documentation:
+      //  ARM deprecates using MSR APSR without a _<bits> qualifier as an alias
+      //  for MSR APSR_nzcvq.
+      // but we do make it an alias here.  This is so to get the "mask encoding"
+      // bits correct on MSR APSR writes.
+      //
+      // FIXME: Note the 0xc00 "mask encoding" bits version of the registers
+      // should really only be allowed when writing a special register.  Note
+      // they get dropped in the MRS instruction reading a special register as
+      // the SYSm field is only 8 bits.
+      //
+      // FIXME: the _g and _nzcvqg versions are only allowed if the processor
+      // includes the DSP extension but that is not checked.
+      .Case("apsr", 0x800)
+      .Case("apsr_nzcvq", 0x800)
+      .Case("apsr_g", 0x400)
+      .Case("apsr_nzcvqg", 0xc00)
+      .Case("iapsr", 0x801)
+      .Case("iapsr_nzcvq", 0x801)
+      .Case("iapsr_g", 0x401)
+      .Case("iapsr_nzcvqg", 0xc01)
+      .Case("eapsr", 0x802)
+      .Case("eapsr_nzcvq", 0x802)
+      .Case("eapsr_g", 0x402)
+      .Case("eapsr_nzcvqg", 0xc02)
+      .Case("xpsr", 0x803)
+      .Case("xpsr_nzcvq", 0x803)
+      .Case("xpsr_g", 0x403)
+      .Case("xpsr_nzcvqg", 0xc03)
+      .Case("ipsr", 0x805)
+      .Case("epsr", 0x806)
+      .Case("iepsr", 0x807)
+      .Case("msp", 0x808)
+      .Case("psp", 0x809)
+      .Case("primask", 0x810)
+      .Case("basepri", 0x811)
+      .Case("basepri_max", 0x812)
+      .Case("faultmask", 0x813)
+      .Case("control", 0x814)
+      .Default(~0U);
+
+    if (FlagsVal == ~0U)
+      return MatchOperand_NoMatch;
+
+    if (!hasV7Ops() && FlagsVal >= 0x811 && FlagsVal <= 0x813)
+      // basepri, basepri_max and faultmask only valid for V7m.
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+    Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+    return MatchOperand_Success;
+  }
+
+  // Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf"
+  size_t Start = 0, Next = Mask.find('_');
+  StringRef Flags = "";
+  std::string SpecReg = Mask.slice(Start, Next).lower();
+  if (Next != StringRef::npos)
+    Flags = Mask.slice(Next+1, Mask.size());
+
+  // FlagsVal contains the complete mask:
+  // 3-0: Mask
+  // 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+  unsigned FlagsVal = 0;
+
+  if (SpecReg == "apsr") {
+    FlagsVal = StringSwitch<unsigned>(Flags)
+    .Case("nzcvq",  0x8) // same as CPSR_f
+    .Case("g",      0x4) // same as CPSR_s
+    .Case("nzcvqg", 0xc) // same as CPSR_fs
+    .Default(~0U);
+
+    if (FlagsVal == ~0U) {
+      if (!Flags.empty())
+        return MatchOperand_NoMatch;
+      else
+        FlagsVal = 8; // No flag
+    }
+  } else if (SpecReg == "cpsr" || SpecReg == "spsr") {
+    // cpsr_all is an alias for cpsr_fc, as is plain cpsr.
+    if (Flags == "all" || Flags == "")
+      Flags = "fc";
+    for (int i = 0, e = Flags.size(); i != e; ++i) {
+      unsigned Flag = StringSwitch<unsigned>(Flags.substr(i, 1))
+      .Case("c", 1)
+      .Case("x", 2)
+      .Case("s", 4)
+      .Case("f", 8)
+      .Default(~0U);
+
+      // If some specific flag is already set, it means that some letter is
+      // present more than once, this is not acceptable.
+      if (FlagsVal == ~0U || (FlagsVal & Flag))
+        return MatchOperand_NoMatch;
+      FlagsVal |= Flag;
+    }
+  } else // No match for special register.
+    return MatchOperand_NoMatch;
+
+  // Special register without flags is NOT equivalent to "fc" flags.
+  // NOTE: This is a divergence from gas' behavior.  Uncommenting the following
+  // two lines would enable gas compatibility at the expense of breaking
+  // round-tripping.
+  //
+  // if (!FlagsVal)
+  //  FlagsVal = 0x9;
+
+  // Bit 4: Special Reg (cpsr, apsr => 0; spsr => 1)
+  if (SpecReg == "spsr")
+    FlagsVal |= 16;
+
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op,
+            int Low, int High) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  std::string LowerOp = Op.lower();
+  std::string UpperOp = Op.upper();
+  if (ShiftName != LowerOp && ShiftName != UpperOp) {
+    Error(Parser.getTok().getLoc(), Op + " operand expected.");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat shift type token.
+
+  // There must be a '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *ShiftAmount;
+  SMLoc Loc = Parser.getTok().getLoc();
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(Loc, "illegal expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(Loc, "constant expression expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = CE->getValue();
+  if (Val < Low || Val > High) {
+    Error(Loc, "immediate value out of range");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateImm(CE, Loc, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(S, "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  int Val = StringSwitch<int>(Tok.getString())
+    .Case("be", 1)
+    .Case("le", 0)
+    .Default(-1);
+  Parser.Lex(); // Eat the token.
+
+  if (Val == -1) {
+    Error(S, "'be' or 'le' operand expected");
+    return MatchOperand_ParseFail;
+  }
+  Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::Create(Val,
+                                                                  getContext()),
+                                           S, Tok.getEndLoc()));
+  return MatchOperand_Success;
+}
+
+/// parseShifterImm - Parse the shifter immediate operand for SSAT/USAT
+/// instructions. Legal values are:
+///     lsl #n  'n' in [0,31]
+///     asr #n  'n' in [1,32]
+///             n == 32 encoded as n == 0.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier)) {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  StringRef ShiftName = Tok.getString();
+  bool isASR;
+  if (ShiftName == "lsl" || ShiftName == "LSL")
+    isASR = false;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    isASR = true;
+  else {
+    Error(S, "shift operator 'asr' or 'lsl' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a shift amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  SMLoc ExLoc = Parser.getTok().getLoc();
+
+  const MCExpr *ShiftAmount;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(ExLoc, "malformed shift expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(ExLoc, "shift amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  if (isASR) {
+    // Shift amount must be in [1,32]
+    if (Val < 1 || Val > 32) {
+      Error(ExLoc, "'asr' shift amount must be in range [1,32]");
+      return MatchOperand_ParseFail;
+    }
+    // asr #32 encoded as asr #0, but is not allowed in Thumb2 mode.
+    if (isThumb() && Val == 32) {
+      Error(ExLoc, "'asr #32' shift amount not allowed in Thumb mode");
+      return MatchOperand_ParseFail;
+    }
+    if (Val == 32) Val = 0;
+  } else {
+    // Shift amount must be in [1,32]
+    if (Val < 0 || Val > 31) {
+      Error(ExLoc, "'lsr' shift amount must be in range [0,31]");
+      return MatchOperand_ParseFail;
+    }
+  }
+
+  Operands.push_back(ARMOperand::CreateShifterImm(isASR, Val, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+/// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family
+/// of instructions. Legal values are:
+///     ror #n  'n' in {0, 8, 16, 24}
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName != "ror" && ShiftName != "ROR")
+    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the operator.
+
+  // A '#' and a rotate amount.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  SMLoc ExLoc = Parser.getTok().getLoc();
+
+  const MCExpr *ShiftAmount;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(ShiftAmount, EndLoc)) {
+    Error(ExLoc, "malformed rotate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+  if (!CE) {
+    Error(ExLoc, "rotate amount must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Val = CE->getValue();
+  // Shift amount must be in {0, 8, 16, 24} (0 is undocumented extension)
+  // normally, zero is represented in asm by omitting the rotate operand
+  // entirely.
+  if (Val != 8 && Val != 16 && Val != 24 && Val != 0) {
+    Error(ExLoc, "'ror' rotate amount must be 8, 16, or 24");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateRotImm(Val, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  // The bitfield descriptor is really two operands, the LSB and the width.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *LSBExpr;
+  SMLoc E = Parser.getTok().getLoc();
+  if (getParser().parseExpression(LSBExpr)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LSBExpr);
+  if (!CE) {
+    Error(E, "'lsb' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t LSB = CE->getValue();
+  // The LSB must be in the range [0,31]
+  if (LSB < 0 || LSB > 31) {
+    Error(E, "'lsb' operand must be in the range [0,31]");
+    return MatchOperand_ParseFail;
+  }
+  E = Parser.getTok().getLoc();
+
+  // Expect another immediate operand.
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Error(Parser.getTok().getLoc(), "too few operands");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    Error(Parser.getTok().getLoc(), "'#' expected");
+    return MatchOperand_ParseFail;
+  }
+  Parser.Lex(); // Eat hash token.
+
+  const MCExpr *WidthExpr;
+  SMLoc EndLoc;
+  if (getParser().parseExpression(WidthExpr, EndLoc)) {
+    Error(E, "malformed immediate expression");
+    return MatchOperand_ParseFail;
+  }
+  CE = dyn_cast<MCConstantExpr>(WidthExpr);
+  if (!CE) {
+    Error(E, "'width' operand must be an immediate");
+    return MatchOperand_ParseFail;
+  }
+
+  int64_t Width = CE->getValue();
+  // The LSB must be in the range [1,32-lsb]
+  if (Width < 1 || Width > 32 - LSB) {
+    Error(E, "'width' operand must be in the range [1,32-lsb]");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreateBitfield(LSB, Width, S, EndLoc));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // postidx_reg := '+' register {, shift}
+  //              | '-' register {, shift}
+  //              | register {, shift}
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+  bool haveEaten = false;
+  bool isAdd = true;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+
+  SMLoc E = Parser.getTok().getEndLoc();
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Parser.getTok().getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  ARM_AM::ShiftOpc ShiftTy = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftTy, ShiftImm))
+      return MatchOperand_ParseFail;
+
+    // FIXME: Only approximates end...may include intervening whitespace.
+    E = Parser.getTok().getLoc();
+  }
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ShiftTy,
+                                                  ShiftImm, S, E));
+
+  return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Check for a post-index addressing register operand. Specifically:
+  // am3offset := '+' register
+  //              | '-' register
+  //              | register
+  //              | # imm
+  //              | # + imm
+  //              | # - imm
+
+  // This method must return MatchOperand_NoMatch without consuming any tokens
+  // in the case where there is no match, as other alternatives take other
+  // parse methods.
+  AsmToken Tok = Parser.getTok();
+  SMLoc S = Tok.getLoc();
+
+  // Do immediates first, as we always parse those if we have a '#'.
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar)) {
+    Parser.Lex(); // Eat the '#'.
+    // Explicitly look for a '-', as we need to encode negative zero
+    // differently.
+    bool isNegative = Parser.getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    SMLoc E;
+    if (getParser().parseExpression(Offset, E))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE) {
+      Error(S, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+    // Negative zero is encoded as the flag value INT32_MIN.
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      Val = INT32_MIN;
+
+    Operands.push_back(
+      ARMOperand::CreateImm(MCConstantExpr::Create(Val, getContext()), S, E));
+
+    return MatchOperand_Success;
+  }
+
+
+  bool haveEaten = false;
+  bool isAdd = true;
+  if (Tok.is(AsmToken::Plus)) {
+    Parser.Lex(); // Eat the '+' token.
+    haveEaten = true;
+  } else if (Tok.is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the '-' token.
+    isAdd = false;
+    haveEaten = true;
+  }
+  
+  Tok = Parser.getTok();
+  int Reg = tryParseRegister();
+  if (Reg == -1) {
+    if (!haveEaten)
+      return MatchOperand_NoMatch;
+    Error(Tok.getLoc(), "register expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ARM_AM::no_shift,
+                                                  0, S, Tok.getEndLoc()));
+
+  return MatchOperand_Success;
+}
+
+/// cvtT2LdrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtT2LdrdPre(MCInst &Inst,
+             const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Rt, Rt2
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateReg(0));
+  // addr
+  ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtT2StrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtT2StrdPre(MCInst &Inst,
+             const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateReg(0));
+  // Rt, Rt2
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+  // addr
+  ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtLdWriteBackRegAddrMode2(MCInst &Inst,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+
+/// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtStWriteBackRegAddrMode2(MCInst &Inst,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtStWriteBackRegAddrMode3(MCInst &Inst,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtLdExtTWriteBackImm(MCInst &Inst,
+                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Rt
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // addr
+  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+  // offset
+  ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtLdExtTWriteBackReg(MCInst &Inst,
+                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Rt
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // addr
+  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+  // offset
+  ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtStExtTWriteBackImm - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtStExtTWriteBackImm(MCInst &Inst,
+                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // Rt
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  // addr
+  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+  // offset
+  ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtStExtTWriteBackReg - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtStExtTWriteBackReg(MCInst &Inst,
+                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // Rt
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  // addr
+  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+  // offset
+  ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtLdrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtLdrdPre(MCInst &Inst,
+           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Rt, Rt2
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // addr
+  ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtStrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtStrdPre(MCInst &Inst,
+           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // Rt, Rt2
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+  // addr
+  ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtLdWriteBackRegAddrMode3(MCInst &Inst,
+                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// cvtThumbMultiply - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't handle properly tied operands
+/// when they refer multiple MIOperands inside a single one.
+void ARMAsmParser::
+cvtThumbMultiply(MCInst &Inst,
+           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+  ((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1);
+  // If we have a three-operand form, make sure to set Rn to be the operand
+  // that isn't the same as Rd.
+  unsigned RegOp = 4;
+  if (Operands.size() == 6 &&
+      ((ARMOperand*)Operands[4])->getReg() ==
+        ((ARMOperand*)Operands[3])->getReg())
+    RegOp = 5;
+  ((ARMOperand*)Operands[RegOp])->addRegOperands(Inst, 1);
+  Inst.addOperand(Inst.getOperand(0));
+  ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
+}
+
+void ARMAsmParser::
+cvtVLDwbFixed(MCInst &Inst,
+              const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Vd
+  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // Vn
+  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+void ARMAsmParser::
+cvtVLDwbRegister(MCInst &Inst,
+                 const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Vd
+  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // Vn
+  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
+  // Vm
+  ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+void ARMAsmParser::
+cvtVSTwbFixed(MCInst &Inst,
+              const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // Vn
+  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
+  // Vt
+  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+void ARMAsmParser::
+cvtVSTwbRegister(MCInst &Inst,
+                 const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Create a writeback register dummy placeholder.
+  Inst.addOperand(MCOperand::CreateImm(0));
+  // Vn
+  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
+  // Vm
+  ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
+  // Vt
+  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
+  // pred
+  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+}
+
+/// Parse an ARM memory expression, return false if successful else return true
+/// or an error.  The first token must be a '[' when called.
+bool ARMAsmParser::
+parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S, E;
+  assert(Parser.getTok().is(AsmToken::LBrac) &&
+         "Token is not a Left Bracket");
+  S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat left bracket token.
+
+  const AsmToken &BaseRegTok = Parser.getTok();
+  int BaseRegNum = tryParseRegister();
+  if (BaseRegNum == -1)
+    return Error(BaseRegTok.getLoc(), "register expected");
+
+  // The next token must either be a comma, a colon or a closing bracket.
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Colon) && !Tok.is(AsmToken::Comma) &&
+      !Tok.is(AsmToken::RBrac))
+    return Error(Tok.getLoc(), "malformed memory operand");
+
+  if (Tok.is(AsmToken::RBrac)) {
+    E = Tok.getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0, ARM_AM::no_shift,
+                                             0, 0, false, S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand. It's rather odd, but syntactically valid.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  assert((Tok.is(AsmToken::Colon) || Tok.is(AsmToken::Comma)) &&
+         "Lost colon or comma in memory operand?!");
+  if (Tok.is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the comma.
+  }
+
+  // If we have a ':', it's an alignment specifier.
+  if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat the ':'.
+    E = Parser.getTok().getLoc();
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+     return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    unsigned Align = 0;
+    switch (CE->getValue()) {
+    default:
+      return Error(E,
+                   "alignment specifier must be 16, 32, 64, 128, or 256 bits");
+    case 16:  Align = 2; break;
+    case 32:  Align = 4; break;
+    case 64:  Align = 8; break;
+    case 128: Align = 16; break;
+    case 256: Align = 32; break;
+    }
+
+    // Now we should have the closing ']'
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+    E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0,
+                                             ARM_AM::no_shift, 0, Align,
+                                             false, S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  // If we have a '#', it's an immediate offset, else assume it's a register
+  // offset. Be friendly and also accept a plain integer (without a leading
+  // hash) for gas compatibility.
+  if (Parser.getTok().is(AsmToken::Hash) ||
+      Parser.getTok().is(AsmToken::Dollar) ||
+      Parser.getTok().is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat the '#'.
+    E = Parser.getTok().getLoc();
+
+    bool isNegative = getParser().getTok().is(AsmToken::Minus);
+    const MCExpr *Offset;
+    if (getParser().parseExpression(Offset))
+     return true;
+
+    // The expression has to be a constant. Memory references with relocations
+    // don't come through here, as they use the <label> forms of the relevant
+    // instructions.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+    if (!CE)
+      return Error (E, "constant expression expected");
+
+    // If the constant was #-0, represent it as INT32_MIN.
+    int32_t Val = CE->getValue();
+    if (isNegative && Val == 0)
+      CE = MCConstantExpr::Create(INT32_MIN, getContext());
+
+    // Now we should have the closing ']'
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return Error(Parser.getTok().getLoc(), "']' expected");
+    E = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat right bracket token.
+
+    // Don't worry about range checking the value here. That's handled by
+    // the is*() predicates.
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, CE, 0,
+                                             ARM_AM::no_shift, 0, 0,
+                                             false, S, E));
+
+    // If there's a pre-indexing writeback marker, '!', just add it as a token
+    // operand.
+    if (Parser.getTok().is(AsmToken::Exclaim)) {
+      Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+      Parser.Lex(); // Eat the '!'.
+    }
+
+    return false;
+  }
+
+  // The register offset is optionally preceded by a '+' or '-'
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex(); // Eat the '-'.
+  } else if (Parser.getTok().is(AsmToken::Plus)) {
+    // Nothing to do.
+    Parser.Lex(); // Eat the '+'.
+  }
+
+  E = Parser.getTok().getLoc();
+  int OffsetRegNum = tryParseRegister();
+  if (OffsetRegNum == -1)
+    return Error(E, "register expected");
+
+  // If there's a shift operator, handle it.
+  ARM_AM::ShiftOpc ShiftType = ARM_AM::no_shift;
+  unsigned ShiftImm = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the ','.
+    if (parseMemRegOffsetShift(ShiftType, ShiftImm))
+      return true;
+  }
+
+  // Now we should have the closing ']'
+  if (Parser.getTok().isNot(AsmToken::RBrac))
+    return Error(Parser.getTok().getLoc(), "']' expected");
+  E = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat right bracket token.
+
+  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, OffsetRegNum,
+                                           ShiftType, ShiftImm, 0, isNegative,
+                                           S, E));
+
+  // If there's a pre-indexing writeback marker, '!', just add it as a token
+  // operand.
+  if (Parser.getTok().is(AsmToken::Exclaim)) {
+    Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+    Parser.Lex(); // Eat the '!'.
+  }
+
+  return false;
+}
+
+/// parseMemRegOffsetShift - one of these two:
+///   ( lsl | lsr | asr | ror ) , # shift_amount
+///   rrx
+/// return true if it parses a shift otherwise it returns false.
+bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
+                                          unsigned &Amount) {
+  SMLoc Loc = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+  StringRef ShiftName = Tok.getString();
+  if (ShiftName == "lsl" || ShiftName == "LSL" ||
+      ShiftName == "asl" || ShiftName == "ASL")
+    St = ARM_AM::lsl;
+  else if (ShiftName == "lsr" || ShiftName == "LSR")
+    St = ARM_AM::lsr;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    St = ARM_AM::asr;
+  else if (ShiftName == "ror" || ShiftName == "ROR")
+    St = ARM_AM::ror;
+  else if (ShiftName == "rrx" || ShiftName == "RRX")
+    St = ARM_AM::rrx;
+  else
+    return Error(Loc, "illegal shift operator");
+  Parser.Lex(); // Eat shift type token.
+
+  // rrx stands alone.
+  Amount = 0;
+  if (St != ARM_AM::rrx) {
+    Loc = Parser.getTok().getLoc();
+    // A '#' and a shift amount.
+    const AsmToken &HashTok = Parser.getTok();
+    if (HashTok.isNot(AsmToken::Hash) &&
+        HashTok.isNot(AsmToken::Dollar))
+      return Error(HashTok.getLoc(), "'#' expected");
+    Parser.Lex(); // Eat hash token.
+
+    const MCExpr *Expr;
+    if (getParser().parseExpression(Expr))
+      return true;
+    // Range check the immediate.
+    // lsl, ror: 0 <= imm <= 31
+    // lsr, asr: 0 <= imm <= 32
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+    if (!CE)
+      return Error(Loc, "shift amount must be an immediate");
+    int64_t Imm = CE->getValue();
+    if (Imm < 0 ||
+        ((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) ||
+        ((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32))
+      return Error(Loc, "immediate shift value out of range");
+    // If <ShiftTy> #0, turn it into a no_shift.
+    if (Imm == 0)
+      St = ARM_AM::lsl;
+    // For consistency, treat lsr #32 and asr #32 as having immediate value 0.
+    if (Imm == 32)
+      Imm = 0;
+    Amount = Imm;
+  }
+
+  return false;
+}
+
+/// parseFPImm - A floating point immediate expression operand.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Anything that can accept a floating point constant as an operand
+  // needs to go through here, as the regular parseExpression is
+  // integer only.
+  //
+  // This routine still creates a generic Immediate operand, containing
+  // a bitcast of the 64-bit floating point value. The various operands
+  // that accept floats can check whether the value is valid for them
+  // via the standard is*() predicates.
+
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+
+  // Disambiguate the VMOV forms that can accept an FP immediate.
+  // vmov.f32 <sreg>, #imm
+  // vmov.f64 <dreg>, #imm
+  // vmov.f32 <dreg>, #imm  @ vector f32x2
+  // vmov.f32 <qreg>, #imm  @ vector f32x4
+  //
+  // There are also the NEON VMOV instructions which expect an
+  // integer constant. Make sure we don't try to parse an FPImm
+  // for these:
+  // vmov.i{8|16|32|64} <dreg|qreg>, #imm
+  ARMOperand *TyOp = static_cast<ARMOperand*>(Operands[2]);
+  if (!TyOp->isToken() || (TyOp->getToken() != ".f32" &&
+                           TyOp->getToken() != ".f64"))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat the '#'.
+
+  // Handle negation, as that still comes through as a separate token.
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex();
+  }
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc Loc = Tok.getLoc();
+  if (Tok.is(AsmToken::Real)) {
+    APFloat RealVal(APFloat::IEEEsingle, Tok.getString());
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    // If we had a '-' in front, toggle the sign bit.
+    IntVal ^= (uint64_t)isNegative << 31;
+    Parser.Lex(); // Eat the token.
+    Operands.push_back(ARMOperand::CreateImm(
+          MCConstantExpr::Create(IntVal, getContext()),
+          S, Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+  // Also handle plain integers. Instructions which allow floating point
+  // immediates also allow a raw encoded 8-bit value.
+  if (Tok.is(AsmToken::Integer)) {
+    int64_t Val = Tok.getIntVal();
+    Parser.Lex(); // Eat the token.
+    if (Val > 255 || Val < 0) {
+      Error(Loc, "encoded floating point value out of range");
+      return MatchOperand_ParseFail;
+    }
+    double RealVal = ARM_AM::getFPImmFloat(Val);
+    Val = APFloat(APFloat::IEEEdouble, RealVal).bitcastToAPInt().getZExtValue();
+    Operands.push_back(ARMOperand::CreateImm(
+        MCConstantExpr::Create(Val, getContext()), S,
+        Parser.getTok().getLoc()));
+    return MatchOperand_Success;
+  }
+
+  Error(Loc, "invalid floating point immediate");
+  return MatchOperand_ParseFail;
+}
+
+/// Parse a arm instruction operand.  For now this parses the operand regardless
+/// of the mnemonic.
+bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                                StringRef Mnemonic) {
+  SMLoc S, E;
+
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return true;
+  case AsmToken::Identifier: {
+    // If we've seen a branch mnemonic, the next operand must be a label.  This
+    // is true even if the label is a register name.  So "br r1" means branch to
+    // label "r1".
+    bool ExpectLabel = Mnemonic == "b" || Mnemonic == "bl";
+    if (!ExpectLabel) {
+      if (!tryParseRegisterWithWriteBack(Operands))
+        return false;
+      int Res = tryParseShiftRegister(Operands);
+      if (Res == 0) // success
+        return false;
+      else if (Res == -1) // irrecoverable error
+        return true;
+      // If this is VMRS, check for the apsr_nzcv operand.
+      if (Mnemonic == "vmrs" &&
+          Parser.getTok().getString().equals_lower("apsr_nzcv")) {
+        S = Parser.getTok().getLoc();
+        Parser.Lex();
+        Operands.push_back(ARMOperand::CreateToken("APSR_nzcv", S));
+        return false;
+      }
+    }
+
+    // Fall though for the Identifier case that is not a register or a
+    // special name.
+  }
+  case AsmToken::LParen:  // parenthesized expressions like (_strcmp-4)
+  case AsmToken::Integer: // things like 1f and 2b as a branch targets
+  case AsmToken::String:  // quoted label names.
+  case AsmToken::Dot: {   // . as a branch target
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    S = Parser.getTok().getLoc();
+    if (getParser().parseExpression(IdVal))
+      return true;
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(ARMOperand::CreateImm(IdVal, S, E));
+    return false;
+  }
+  case AsmToken::LBrac:
+    return parseMemory(Operands);
+  case AsmToken::LCurly:
+    return parseRegisterList(Operands);
+  case AsmToken::Dollar:
+  case AsmToken::Hash: {
+    // #42 -> immediate.
+    S = Parser.getTok().getLoc();
+    Parser.Lex();
+
+    if (Parser.getTok().isNot(AsmToken::Colon)) {
+      bool isNegative = Parser.getTok().is(AsmToken::Minus);
+      const MCExpr *ImmVal;
+      if (getParser().parseExpression(ImmVal))
+        return true;
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
+      if (CE) {
+        int32_t Val = CE->getValue();
+        if (isNegative && Val == 0)
+          ImmVal = MCConstantExpr::Create(INT32_MIN, getContext());
+      }
+      E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E));
+
+      // There can be a trailing '!' on operands that we want as a separate
+      // '!' Token operand. Handle that here. For example, the compatibilty
+      // alias for 'srsdb sp!, #imm' is 'srsdb #imm!'.
+      if (Parser.getTok().is(AsmToken::Exclaim)) {
+        Operands.push_back(ARMOperand::CreateToken(Parser.getTok().getString(),
+                                                   Parser.getTok().getLoc()));
+        Parser.Lex(); // Eat exclaim token
+      }
+      return false;
+    }
+    // w/ a ':' after the '#', it's just like a plain ':'.
+    // FALLTHROUGH
+  }
+  case AsmToken::Colon: {
+    // ":lower16:" and ":upper16:" expression prefixes
+    // FIXME: Check it's an expression prefix,
+    // e.g. (FOO - :lower16:BAR) isn't legal.
+    ARMMCExpr::VariantKind RefKind;
+    if (parsePrefix(RefKind))
+      return true;
+
+    const MCExpr *SubExprVal;
+    if (getParser().parseExpression(SubExprVal))
+      return true;
+
+    const MCExpr *ExprVal = ARMMCExpr::Create(RefKind, SubExprVal,
+                                              getContext());
+    E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E));
+    return false;
+  }
+  }
+}
+
+// parsePrefix - Parse ARM 16-bit relocations expression prefix, i.e.
+//  :lower16: and :upper16:.
+bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
+  RefKind = ARMMCExpr::VK_ARM_None;
+
+  // :lower16: and :upper16: modifiers
+  assert(getLexer().is(AsmToken::Colon) && "expected a :");
+  Parser.Lex(); // Eat ':'
+
+  if (getLexer().isNot(AsmToken::Identifier)) {
+    Error(Parser.getTok().getLoc(), "expected prefix identifier in operand");
+    return true;
+  }
+
+  StringRef IDVal = Parser.getTok().getIdentifier();
+  if (IDVal == "lower16") {
+    RefKind = ARMMCExpr::VK_ARM_LO16;
+  } else if (IDVal == "upper16") {
+    RefKind = ARMMCExpr::VK_ARM_HI16;
+  } else {
+    Error(Parser.getTok().getLoc(), "unexpected prefix in operand");
+    return true;
+  }
+  Parser.Lex();
+
+  if (getLexer().isNot(AsmToken::Colon)) {
+    Error(Parser.getTok().getLoc(), "unexpected token after prefix");
+    return true;
+  }
+  Parser.Lex(); // Eat the last ':'
+  return false;
+}
+
+/// \brief Given a mnemonic, split out possible predication code and carry
+/// setting letters to form a canonical mnemonic and flags.
+//
+// FIXME: Would be nice to autogen this.
+// FIXME: This is a bit of a maze of special cases.
+StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
+                                      unsigned &PredicationCode,
+                                      bool &CarrySetting,
+                                      unsigned &ProcessorIMod,
+                                      StringRef &ITMask) {
+  PredicationCode = ARMCC::AL;
+  CarrySetting = false;
+  ProcessorIMod = 0;
+
+  // Ignore some mnemonics we know aren't predicated forms.
+  //
+  // FIXME: Would be nice to autogen this.
+  if ((Mnemonic == "movs" && isThumb()) ||
+      Mnemonic == "teq"   || Mnemonic == "vceq"   || Mnemonic == "svc"   ||
+      Mnemonic == "mls"   || Mnemonic == "smmls"  || Mnemonic == "vcls"  ||
+      Mnemonic == "vmls"  || Mnemonic == "vnmls"  || Mnemonic == "vacge" ||
+      Mnemonic == "vcge"  || Mnemonic == "vclt"   || Mnemonic == "vacgt" ||
+      Mnemonic == "vcgt"  || Mnemonic == "vcle"   || Mnemonic == "smlal" ||
+      Mnemonic == "umaal" || Mnemonic == "umlal"  || Mnemonic == "vabal" ||
+      Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" ||
+      Mnemonic == "fmuls")
+    return Mnemonic;
+
+  // First, split out any predication code. Ignore mnemonics we know aren't
+  // predicated but do have a carry-set and so weren't caught above.
+  if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" &&
+      Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" &&
+      Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" &&
+      Mnemonic != "sbcs" && Mnemonic != "rscs") {
+    unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2))
+      .Case("eq", ARMCC::EQ)
+      .Case("ne", ARMCC::NE)
+      .Case("hs", ARMCC::HS)
+      .Case("cs", ARMCC::HS)
+      .Case("lo", ARMCC::LO)
+      .Case("cc", ARMCC::LO)
+      .Case("mi", ARMCC::MI)
+      .Case("pl", ARMCC::PL)
+      .Case("vs", ARMCC::VS)
+      .Case("vc", ARMCC::VC)
+      .Case("hi", ARMCC::HI)
+      .Case("ls", ARMCC::LS)
+      .Case("ge", ARMCC::GE)
+      .Case("lt", ARMCC::LT)
+      .Case("gt", ARMCC::GT)
+      .Case("le", ARMCC::LE)
+      .Case("al", ARMCC::AL)
+      .Default(~0U);
+    if (CC != ~0U) {
+      Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 2);
+      PredicationCode = CC;
+    }
+  }
+
+  // Next, determine if we have a carry setting bit. We explicitly ignore all
+  // the instructions we know end in 's'.
+  if (Mnemonic.endswith("s") &&
+      !(Mnemonic == "cps" || Mnemonic == "mls" ||
+        Mnemonic == "mrs" || Mnemonic == "smmls" || Mnemonic == "vabs" ||
+        Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vmrs" ||
+        Mnemonic == "vnmls" || Mnemonic == "vqabs" || Mnemonic == "vrecps" ||
+        Mnemonic == "vrsqrts" || Mnemonic == "srs" || Mnemonic == "flds" ||
+        Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
+        Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
+        Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
+        Mnemonic == "vfms" || Mnemonic == "vfnms" ||
+        (Mnemonic == "movs" && isThumb()))) {
+    Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
+    CarrySetting = true;
+  }
+
+  // The "cps" instruction can have a interrupt mode operand which is glued into
+  // the mnemonic. Check if this is the case, split it and parse the imod op
+  if (Mnemonic.startswith("cps")) {
+    // Split out any imod code.
+    unsigned IMod =
+      StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2, 2))
+      .Case("ie", ARM_PROC::IE)
+      .Case("id", ARM_PROC::ID)
+      .Default(~0U);
+    if (IMod != ~0U) {
+      Mnemonic = Mnemonic.slice(0, Mnemonic.size()-2);
+      ProcessorIMod = IMod;
+    }
+  }
+
+  // The "it" instruction has the condition mask on the end of the mnemonic.
+  if (Mnemonic.startswith("it")) {
+    ITMask = Mnemonic.slice(2, Mnemonic.size());
+    Mnemonic = Mnemonic.slice(0, 2);
+  }
+
+  return Mnemonic;
+}
+
+/// \brief Given a canonical mnemonic, determine if the instruction ever allows
+/// inclusion of carry set or predication code operands.
+//
+// FIXME: It would be nice to autogen this.
+void ARMAsmParser::
+getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+                      bool &CanAcceptPredicationCode) {
+  if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
+      Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
+      Mnemonic == "add" || Mnemonic == "adc" ||
+      Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" ||
+      Mnemonic == "orr" || Mnemonic == "mvn" ||
+      Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
+      Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" ||
+      Mnemonic == "vfm" || Mnemonic == "vfnm" ||
+      (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||
+                      Mnemonic == "mla" || Mnemonic == "smlal" ||
+                      Mnemonic == "umlal" || Mnemonic == "umull"))) {
+    CanAcceptCarrySet = true;
+  } else
+    CanAcceptCarrySet = false;
+
+  if (Mnemonic == "cbnz" || Mnemonic == "setend" || Mnemonic == "dmb" ||
+      Mnemonic == "cps" || Mnemonic == "mcr2" || Mnemonic == "it" ||
+      Mnemonic == "mcrr2" || Mnemonic == "cbz" || Mnemonic == "cdp2" ||
+      Mnemonic == "trap" || Mnemonic == "mrc2" || Mnemonic == "mrrc2" ||
+      Mnemonic == "dsb" || Mnemonic == "isb" || Mnemonic == "setend" ||
+      (Mnemonic == "clrex" && !isThumb()) ||
+      (Mnemonic == "nop" && isThumbOne()) ||
+      ((Mnemonic == "pld" || Mnemonic == "pli" || Mnemonic == "pldw" ||
+        Mnemonic == "ldc2" || Mnemonic == "ldc2l" ||
+        Mnemonic == "stc2" || Mnemonic == "stc2l") && !isThumb()) ||
+      ((Mnemonic.startswith("rfe") || Mnemonic.startswith("srs")) &&
+       !isThumb()) ||
+      Mnemonic.startswith("cps") || (Mnemonic == "movs" && isThumbOne())) {
+    CanAcceptPredicationCode = false;
+  } else
+    CanAcceptPredicationCode = true;
+
+  if (isThumb()) {
+    if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" ||
+        Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp")
+      CanAcceptPredicationCode = false;
+  }
+}
+
+bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // FIXME: This is all horribly hacky. We really need a better way to deal
+  // with optional operands like this in the matcher table.
+
+  // The 'mov' mnemonic is special. One variant has a cc_out operand, while
+  // another does not. Specifically, the MOVW instruction does not. So we
+  // special case it here and remove the defaulted (non-setting) cc_out
+  // operand if that's the instruction we're trying to match.
+  //
+  // We do this as post-processing of the explicit operands rather than just
+  // conditionally adding the cc_out in the first place because we need
+  // to check the type of the parsed immediate operand.
+  if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
+      !static_cast<ARMOperand*>(Operands[4])->isARMSOImm() &&
+      static_cast<ARMOperand*>(Operands[4])->isImm0_65535Expr() &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+    return true;
+
+  // Register-register 'add' for thumb does not have a cc_out operand
+  // when there are only two register operands.
+  if (isThumb() && Mnemonic == "add" && Operands.size() == 5 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+    return true;
+  // Register-register 'add' for thumb does not have a cc_out operand
+  // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do
+  // have to check the immediate range here since Thumb2 has a variant
+  // that can handle a different range and has a cc_out operand.
+  if (((isThumb() && Mnemonic == "add") ||
+       (isThumbTwo() && Mnemonic == "sub")) &&
+      Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::SP &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
+      ((Mnemonic == "add" &&static_cast<ARMOperand*>(Operands[5])->isReg()) ||
+       static_cast<ARMOperand*>(Operands[5])->isImm0_1020s4()))
+    return true;
+  // For Thumb2, add/sub immediate does not have a cc_out operand for the
+  // imm0_4095 variant. That's the least-preferred variant when
+  // selecting via the generic "add" mnemonic, so to know that we
+  // should remove the cc_out operand, we have to explicitly check that
+  // it's not one of the other variants. Ugh.
+  if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
+      Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[5])->isImm()) {
+    // Nest conditions rather than one big 'if' statement for readability.
+    //
+    // If either register is a high reg, it's either one of the SP
+    // variants (handled above) or a 32-bit encoding, so we just
+    // check against T3. If the second register is the PC, this is an
+    // alternate form of ADR, which uses encoding T4, so check for that too.
+    if ((!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
+         !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg())) &&
+        static_cast<ARMOperand*>(Operands[4])->getReg() != ARM::PC &&
+        static_cast<ARMOperand*>(Operands[5])->isT2SOImm())
+      return false;
+    // If both registers are low, we're in an IT block, and the immediate is
+    // in range, we should use encoding T1 instead, which has a cc_out.
+    if (inITBlock() &&
+        isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) &&
+        isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) &&
+        static_cast<ARMOperand*>(Operands[5])->isImm0_7())
+      return false;
+
+    // Otherwise, we use encoding T4, which does not have a cc_out
+    // operand.
+    return true;
+  }
+
+  // The thumb2 multiply instruction doesn't have a CCOut register, so
+  // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to
+  // use the 16-bit encoding or not.
+  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[5])->isReg() &&
+      // If the registers aren't low regs, the destination reg isn't the
+      // same as one of the source regs, or the cc_out operand is zero
+      // outside of an IT block, we have to use the 32-bit encoding, so
+      // remove the cc_out operand.
+      (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand*>(Operands[5])->getReg()) ||
+       !inITBlock() ||
+       (static_cast<ARMOperand*>(Operands[3])->getReg() !=
+        static_cast<ARMOperand*>(Operands[5])->getReg() &&
+        static_cast<ARMOperand*>(Operands[3])->getReg() !=
+        static_cast<ARMOperand*>(Operands[4])->getReg())))
+    return true;
+
+  // Also check the 'mul' syntax variant that doesn't specify an explicit
+  // destination register.
+  if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      // If the registers aren't low regs  or the cc_out operand is zero
+      // outside of an IT block, we have to use the 32-bit encoding, so
+      // remove the cc_out operand.
+      (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
+       !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) ||
+       !inITBlock()))
+    return true;
+
+
+
+  // Register-register 'add/sub' for thumb does not have a cc_out operand
+  // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also
+  // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't
+  // right, this will result in better diagnostics (which operand is off)
+  // anyway.
+  if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") &&
+      (Operands.size() == 5 || Operands.size() == 6) &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::SP &&
+      static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
+      (static_cast<ARMOperand*>(Operands[4])->isImm() ||
+       (Operands.size() == 6 &&
+        static_cast<ARMOperand*>(Operands[5])->isImm())))
+    return true;
+
+  return false;
+}
+
+static bool isDataTypeToken(StringRef Tok) {
+  return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
+    Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
+    Tok == ".u8" || Tok == ".u16" || Tok == ".u32" || Tok == ".u64" ||
+    Tok == ".s8" || Tok == ".s16" || Tok == ".s32" || Tok == ".s64" ||
+    Tok == ".p8" || Tok == ".p16" || Tok == ".f32" || Tok == ".f64" ||
+    Tok == ".f" || Tok == ".d";
+}
+
+// FIXME: This bit should probably be handled via an explicit match class
+// in the .td files that matches the suffix instead of having it be
+// a literal string token the way it is now.
+static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) {
+  return Mnemonic.startswith("vldm") || Mnemonic.startswith("vstm");
+}
+
+static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features);
+/// Parse an arm instruction mnemonic followed by its operands.
+bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                    SMLoc NameLoc,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Apply mnemonic aliases before doing anything else, as the destination
+  // mnemnonic may include suffices and we want to handle them normally.
+  // The generic tblgen'erated code does this later, at the start of
+  // MatchInstructionImpl(), but that's too late for aliases that include
+  // any sort of suffix.
+  unsigned AvailableFeatures = getAvailableFeatures();
+  applyMnemonicAliases(Name, AvailableFeatures);
+
+  // First check for the ARM-specific .req directive.
+  if (Parser.getTok().is(AsmToken::Identifier) &&
+      Parser.getTok().getIdentifier() == ".req") {
+    parseDirectiveReq(Name, NameLoc);
+    // We always return 'error' for this, as we're done with this
+    // statement and don't need to match the 'instruction."
+    return true;
+  }
+
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Mnemonic = Name.slice(Start, Next);
+
+  // Split out the predication code and carry setting flag from the mnemonic.
+  unsigned PredicationCode;
+  unsigned ProcessorIMod;
+  bool CarrySetting;
+  StringRef ITMask;
+  Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting,
+                           ProcessorIMod, ITMask);
+
+  // In Thumb1, only the branch (B) instruction can be predicated.
+  if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "conditional execution not supported in Thumb1");
+  }
+
+  Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc));
+
+  // Handle the IT instruction ITMask. Convert it to a bitmask. This
+  // is the mask as it will be for the IT encoding if the conditional
+  // encoding has a '1' as it's bit0 (i.e. 't' ==> '1'). In the case
+  // where the conditional bit0 is zero, the instruction post-processing
+  // will adjust the mask accordingly.
+  if (Mnemonic == "it") {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2);
+    if (ITMask.size() > 3) {
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "too many conditions on IT instruction");
+    }
+    unsigned Mask = 8;
+    for (unsigned i = ITMask.size(); i != 0; --i) {
+      char pos = ITMask[i - 1];
+      if (pos != 't' && pos != 'e') {
+        Parser.eatToEndOfStatement();
+        return Error(Loc, "illegal IT block condition mask '" + ITMask + "'");
+      }
+      Mask >>= 1;
+      if (ITMask[i - 1] == 't')
+        Mask |= 8;
+    }
+    Operands.push_back(ARMOperand::CreateITMask(Mask, Loc));
+  }
+
+  // FIXME: This is all a pretty gross hack. We should automatically handle
+  // optional operands like this via tblgen.
+
+  // Next, add the CCOut and ConditionCode operands, if needed.
+  //
+  // For mnemonics which can ever incorporate a carry setting bit or predication
+  // code, our matching model involves us always generating CCOut and
+  // ConditionCode operands to match the mnemonic "as written" and then we let
+  // the matcher deal with finding the right instruction or generating an
+  // appropriate error.
+  bool CanAcceptCarrySet, CanAcceptPredicationCode;
+  getMnemonicAcceptInfo(Mnemonic, CanAcceptCarrySet, CanAcceptPredicationCode);
+
+  // If we had a carry-set on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptCarrySet && CarrySetting) {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                 "' can not set flags, but 's' suffix specified");
+  }
+  // If we had a predication code on an instruction that can't do that, issue an
+  // error.
+  if (!CanAcceptPredicationCode && PredicationCode != ARMCC::AL) {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "instruction '" + Mnemonic +
+                 "' is not predicable, but condition code specified");
+  }
+
+  // Add the carry setting operand, if necessary.
+  if (CanAcceptCarrySet) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
+    Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
+                                               Loc));
+  }
+
+  // Add the predication code operand, if necessary.
+  if (CanAcceptPredicationCode) {
+    SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
+                                      CarrySetting);
+    Operands.push_back(ARMOperand::CreateCondCode(
+                         ARMCC::CondCodes(PredicationCode), Loc));
+  }
+
+  // Add the processor imod operand, if necessary.
+  if (ProcessorIMod) {
+    Operands.push_back(ARMOperand::CreateImm(
+          MCConstantExpr::Create(ProcessorIMod, getContext()),
+                                 NameLoc, NameLoc));
+  }
+
+  // Add the remaining tokens in the mnemonic.
+  while (Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    StringRef ExtraToken = Name.slice(Start, Next);
+
+    // Some NEON instructions have an optional datatype suffix that is
+    // completely ignored. Check for that.
+    if (isDataTypeToken(ExtraToken) &&
+        doesIgnoreDataTypeSuffix(Mnemonic, ExtraToken))
+      continue;
+
+    if (ExtraToken != ".n") {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc));
+    }
+  }
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (parseOperand(Operands, Mnemonic)) {
+      Parser.eatToEndOfStatement();
+      return true;
+    }
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();  // Eat the comma.
+
+      // Parse and remember the operand.
+      if (parseOperand(Operands, Mnemonic)) {
+        Parser.eatToEndOfStatement();
+        return true;
+      }
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, "unexpected token in argument list");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+
+  // Some instructions, mostly Thumb, have forms for the same mnemonic that
+  // do and don't have a cc_out optional-def operand. With some spot-checks
+  // of the operand list, we can figure out which variant we're trying to
+  // parse and adjust accordingly before actually matching. We shouldn't ever
+  // try to remove a cc_out operand that was explicitly set on the the
+  // mnemonic, of course (CarrySetting == true). Reason number #317 the
+  // table driven matcher doesn't fit well with the ARM instruction set.
+  if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) {
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+    Operands.erase(Operands.begin() + 1);
+    delete Op;
+  }
+
+  // ARM mode 'blx' need special handling, as the register operand version
+  // is predicable, but the label operand version is not. So, we can't rely
+  // on the Mnemonic based checking to correctly figure out when to put
+  // a k_CondCode operand in the list. If we're trying to match the label
+  // version, remove the k_CondCode operand here.
+  if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
+      static_cast<ARMOperand*>(Operands[2])->isImm()) {
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+    Operands.erase(Operands.begin() + 1);
+    delete Op;
+  }
+
+  // Adjust operands of ldrexd/strexd to MCK_GPRPair.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when parsing from asm, the two GRPs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (!isThumb() && Operands.size() > 4 &&
+      (Mnemonic == "ldrexd" || Mnemonic == "strexd")) {
+    bool isLoad = (Mnemonic == "ldrexd");
+    unsigned Idx = isLoad ? 2 : 3;
+    ARMOperand* Op1 = static_cast<ARMOperand*>(Operands[Idx]);
+    ARMOperand* Op2 = static_cast<ARMOperand*>(Operands[Idx+1]);
+
+    const MCRegisterClass& MRC = MRI->getRegClass(ARM::GPRRegClassID);
+    // Adjust only if Op1 and Op2 are GPRs.
+    if (Op1->isReg() && Op2->isReg() && MRC.contains(Op1->getReg()) &&
+        MRC.contains(Op2->getReg())) {
+      unsigned Reg1 = Op1->getReg();
+      unsigned Reg2 = Op2->getReg();
+      unsigned Rt = MRI->getEncodingValue(Reg1);
+      unsigned Rt2 = MRI->getEncodingValue(Reg2);
+
+      // Rt2 must be Rt + 1 and Rt must be even.
+      if (Rt + 1 != Rt2 || (Rt & 1)) {
+        Error(Op2->getStartLoc(), isLoad ?
+            "destination operands must be sequential" :
+            "source operands must be sequential");
+        return true;
+      }
+      unsigned NewReg = MRI->getMatchingSuperReg(Reg1, ARM::gsub_0,
+          &(MRI->getRegClass(ARM::GPRPairRegClassID)));
+      Operands.erase(Operands.begin() + Idx, Operands.begin() + Idx + 2);
+      Operands.insert(Operands.begin() + Idx, ARMOperand::CreateReg(
+            NewReg, Op1->getStartLoc(), Op2->getEndLoc()));
+      delete Op1;
+      delete Op2;
+    }
+  }
+
+  return false;
+}
+
+// Validate context-sensitive operand constraints.
+
+// return 'true' if register list contains non-low GPR registers,
+// 'false' otherwise. If Reg is in the register list or is HiReg, set
+// 'containsReg' to true.
+static bool checkLowRegisterList(MCInst Inst, unsigned OpNo, unsigned Reg,
+                                 unsigned HiReg, bool &containsReg) {
+  containsReg = false;
+  for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+    unsigned OpReg = Inst.getOperand(i).getReg();
+    if (OpReg == Reg)
+      containsReg = true;
+    // Anything other than a low register isn't legal here.
+    if (!isARMLowRegister(OpReg) && (!HiReg || OpReg != HiReg))
+      return true;
+  }
+  return false;
+}
+
+// Check if the specified regisgter is in the register list of the inst,
+// starting at the indicated operand number.
+static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) {
+  for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+    unsigned OpReg = Inst.getOperand(i).getReg();
+    if (OpReg == Reg)
+      return true;
+  }
+  return false;
+}
+
+// FIXME: We would really prefer to have MCInstrInfo (the wrapper around
+// the ARMInsts array) instead. Getting that here requires awkward
+// API changes, though. Better way?
+namespace llvm {
+extern const MCInstrDesc ARMInsts[];
+}
+static const MCInstrDesc &getInstDesc(unsigned Opcode) {
+  return ARMInsts[Opcode];
+}
+
+// FIXME: We would really like to be able to tablegen'erate this.
+bool ARMAsmParser::
+validateInstruction(MCInst &Inst,
+                    const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+  SMLoc Loc = Operands[0]->getStartLoc();
+  // Check the IT block state first.
+  // NOTE: BKPT instruction has the interesting property of being
+  // allowed in IT blocks, but not being predicable.  It just always
+  // executes.
+  if (inITBlock() && Inst.getOpcode() != ARM::tBKPT &&
+      Inst.getOpcode() != ARM::BKPT) {
+    unsigned bit = 1;
+    if (ITState.FirstCond)
+      ITState.FirstCond = false;
+    else
+      bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
+    // The instruction must be predicable.
+    if (!MCID.isPredicable())
+      return Error(Loc, "instructions in IT block must be predicable");
+    unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
+    unsigned ITCond = bit ? ITState.Cond :
+      ARMCC::getOppositeCondition(ITState.Cond);
+    if (Cond != ITCond) {
+      // Find the condition code Operand to get its SMLoc information.
+      SMLoc CondLoc;
+      for (unsigned i = 1; i < Operands.size(); ++i)
+        if (static_cast<ARMOperand*>(Operands[i])->isCondCode())
+          CondLoc = Operands[i]->getStartLoc();
+      return Error(CondLoc, "incorrect condition in IT block; got '" +
+                   StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
+                   "', but expected '" +
+                   ARMCondCodeToString(ARMCC::CondCodes(ITCond)) + "'");
+    }
+  // Check for non-'al' condition codes outside of the IT block.
+  } else if (isThumbTwo() && MCID.isPredicable() &&
+             Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
+             ARMCC::AL && Inst.getOpcode() != ARM::tB &&
+             Inst.getOpcode() != ARM::t2B)
+    return Error(Loc, "predicated instructions must be in IT block");
+
+  switch (Inst.getOpcode()) {
+  case ARM::LDRD:
+  case ARM::LDRD_PRE:
+  case ARM::LDRD_POST: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination operands must be sequential");
+    return false;
+  }
+  case ARM::STRD: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "source operands must be sequential");
+    return false;
+  }
+  case ARM::STRD_PRE:
+  case ARM::STRD_POST: {
+    // Rt2 must be Rt + 1.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(2).getReg());
+    if (Rt2 != Rt + 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "source operands must be sequential");
+    return false;
+  }
+  case ARM::SBFX:
+  case ARM::UBFX: {
+    // width must be in range [1, 32-lsb]
+    unsigned lsb = Inst.getOperand(2).getImm();
+    unsigned widthm1 = Inst.getOperand(3).getImm();
+    if (widthm1 >= 32 - lsb)
+      return Error(Operands[5]->getStartLoc(),
+                   "bitfield width must be in range [1,32-lsb]");
+    return false;
+  }
+  case ARM::tLDMIA: {
+    // If we're parsing Thumb2, the .w variant is available and handles
+    // most cases that are normally illegal for a Thumb1 LDM
+    // instruction. We'll make the transformation in processInstruction()
+    // if necessary.
+    //
+    // Thumb LDM instructions are writeback iff the base register is not
+    // in the register list.
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool hasWritebackToken =
+      (static_cast<ARMOperand*>(Operands[3])->isToken() &&
+       static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) && !isThumbTwo())
+      return Error(Operands[3 + hasWritebackToken]->getStartLoc(),
+                   "registers must be in range r0-r7");
+    // If we should have writeback, then there should be a '!' token.
+    if (!listContainsBase && !hasWritebackToken && !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "writeback operator '!' expected");
+    // If we should not have writeback, there must not be a '!'. This is
+    // true even for the 32-bit wide encodings.
+    if (listContainsBase && hasWritebackToken)
+      return Error(Operands[3]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
+
+    break;
+  }
+  case ARM::t2LDMIA_UPD: {
+    if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
+    break;
+  }
+  case ARM::tMUL: {
+    // The second source operand must be the same register as the destination
+    // operand.
+    //
+    // In this case, we must directly check the parsed operands because the
+    // cvtThumbMultiply() function is written in such a way that it guarantees
+    // this first statement is always true for the new Inst.  Essentially, the
+    // destination is unconditionally copied into the second source operand
+    // without checking to see if it matches what we actually parsed.
+    if (Operands.size() == 6 &&
+        (((ARMOperand*)Operands[3])->getReg() !=
+         ((ARMOperand*)Operands[5])->getReg()) &&
+        (((ARMOperand*)Operands[3])->getReg() !=
+         ((ARMOperand*)Operands[4])->getReg())) {
+      return Error(Operands[3]->getStartLoc(),
+                   "destination register must match source register");
+    }
+    break;
+  }
+  // Like for ldm/stm, push and pop have hi-reg handling version in Thumb2,
+  // so only issue a diagnostic for thumb1. The instructions will be
+  // switched to the t2 encodings in processInstruction() if necessary.
+  case ARM::tPOP: {
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::PC, listContainsBase) &&
+        !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or pc");
+    break;
+  }
+  case ARM::tPUSH: {
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::LR, listContainsBase) &&
+        !isThumbTwo())
+      return Error(Operands[2]->getStartLoc(),
+                   "registers must be in range r0-r7 or lr");
+    break;
+  }
+  case ARM::tSTMIA_UPD: {
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 4, 0, 0, listContainsBase) && !isThumbTwo())
+      return Error(Operands[4]->getStartLoc(),
+                   "registers must be in range r0-r7");
+    break;
+  }
+  case ARM::tADDrSP: {
+    // If the non-SP source operand and the destination operand are not the
+    // same, we need thumb2 (for the wide encoding), or we have an error.
+    if (!isThumbTwo() &&
+        Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+      return Error(Operands[4]->getStartLoc(),
+                   "source register must be the same as destination");
+    }
+    break;
+  }
+  }
+
+  return false;
+}
+
+static unsigned getRealVSTOpcode(unsigned Opc, unsigned &Spacing) {
+  switch(Opc) {
+  default: llvm_unreachable("unexpected opcode!");
+  // VST1LN
+  case ARM::VST1LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST1LNd8_UPD;
+  case ARM::VST1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD;
+  case ARM::VST1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD;
+  case ARM::VST1LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST1LNd8_UPD;
+  case ARM::VST1LNdWB_register_Asm_16: Spacing = 1; return ARM::VST1LNd16_UPD;
+  case ARM::VST1LNdWB_register_Asm_32: Spacing = 1; return ARM::VST1LNd32_UPD;
+  case ARM::VST1LNdAsm_8:  Spacing = 1; return ARM::VST1LNd8;
+  case ARM::VST1LNdAsm_16: Spacing = 1; return ARM::VST1LNd16;
+  case ARM::VST1LNdAsm_32: Spacing = 1; return ARM::VST1LNd32;
+
+  // VST2LN
+  case ARM::VST2LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST2LNd8_UPD;
+  case ARM::VST2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD;
+  case ARM::VST2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD;
+  case ARM::VST2LNqWB_fixed_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD;
+  case ARM::VST2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD;
+
+  case ARM::VST2LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST2LNd8_UPD;
+  case ARM::VST2LNdWB_register_Asm_16: Spacing = 1; return ARM::VST2LNd16_UPD;
+  case ARM::VST2LNdWB_register_Asm_32: Spacing = 1; return ARM::VST2LNd32_UPD;
+  case ARM::VST2LNqWB_register_Asm_16: Spacing = 2; return ARM::VST2LNq16_UPD;
+  case ARM::VST2LNqWB_register_Asm_32: Spacing = 2; return ARM::VST2LNq32_UPD;
+
+  case ARM::VST2LNdAsm_8:  Spacing = 1; return ARM::VST2LNd8;
+  case ARM::VST2LNdAsm_16: Spacing = 1; return ARM::VST2LNd16;
+  case ARM::VST2LNdAsm_32: Spacing = 1; return ARM::VST2LNd32;
+  case ARM::VST2LNqAsm_16: Spacing = 2; return ARM::VST2LNq16;
+  case ARM::VST2LNqAsm_32: Spacing = 2; return ARM::VST2LNq32;
+
+  // VST3LN
+  case ARM::VST3LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST3LNd8_UPD;
+  case ARM::VST3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD;
+  case ARM::VST3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD;
+  case ARM::VST3LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VST3LNq16_UPD;
+  case ARM::VST3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD;
+  case ARM::VST3LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST3LNd8_UPD;
+  case ARM::VST3LNdWB_register_Asm_16: Spacing = 1; return ARM::VST3LNd16_UPD;
+  case ARM::VST3LNdWB_register_Asm_32: Spacing = 1; return ARM::VST3LNd32_UPD;
+  case ARM::VST3LNqWB_register_Asm_16: Spacing = 2; return ARM::VST3LNq16_UPD;
+  case ARM::VST3LNqWB_register_Asm_32: Spacing = 2; return ARM::VST3LNq32_UPD;
+  case ARM::VST3LNdAsm_8:  Spacing = 1; return ARM::VST3LNd8;
+  case ARM::VST3LNdAsm_16: Spacing = 1; return ARM::VST3LNd16;
+  case ARM::VST3LNdAsm_32: Spacing = 1; return ARM::VST3LNd32;
+  case ARM::VST3LNqAsm_16: Spacing = 2; return ARM::VST3LNq16;
+  case ARM::VST3LNqAsm_32: Spacing = 2; return ARM::VST3LNq32;
+
+  // VST3
+  case ARM::VST3dWB_fixed_Asm_8:  Spacing = 1; return ARM::VST3d8_UPD;
+  case ARM::VST3dWB_fixed_Asm_16: Spacing = 1; return ARM::VST3d16_UPD;
+  case ARM::VST3dWB_fixed_Asm_32: Spacing = 1; return ARM::VST3d32_UPD;
+  case ARM::VST3qWB_fixed_Asm_8:  Spacing = 2; return ARM::VST3q8_UPD;
+  case ARM::VST3qWB_fixed_Asm_16: Spacing = 2; return ARM::VST3q16_UPD;
+  case ARM::VST3qWB_fixed_Asm_32: Spacing = 2; return ARM::VST3q32_UPD;
+  case ARM::VST3dWB_register_Asm_8:  Spacing = 1; return ARM::VST3d8_UPD;
+  case ARM::VST3dWB_register_Asm_16: Spacing = 1; return ARM::VST3d16_UPD;
+  case ARM::VST3dWB_register_Asm_32: Spacing = 1; return ARM::VST3d32_UPD;
+  case ARM::VST3qWB_register_Asm_8:  Spacing = 2; return ARM::VST3q8_UPD;
+  case ARM::VST3qWB_register_Asm_16: Spacing = 2; return ARM::VST3q16_UPD;
+  case ARM::VST3qWB_register_Asm_32: Spacing = 2; return ARM::VST3q32_UPD;
+  case ARM::VST3dAsm_8:  Spacing = 1; return ARM::VST3d8;
+  case ARM::VST3dAsm_16: Spacing = 1; return ARM::VST3d16;
+  case ARM::VST3dAsm_32: Spacing = 1; return ARM::VST3d32;
+  case ARM::VST3qAsm_8:  Spacing = 2; return ARM::VST3q8;
+  case ARM::VST3qAsm_16: Spacing = 2; return ARM::VST3q16;
+  case ARM::VST3qAsm_32: Spacing = 2; return ARM::VST3q32;
+
+  // VST4LN
+  case ARM::VST4LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VST4LNd8_UPD;
+  case ARM::VST4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD;
+  case ARM::VST4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD;
+  case ARM::VST4LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VST4LNq16_UPD;
+  case ARM::VST4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD;
+  case ARM::VST4LNdWB_register_Asm_8:  Spacing = 1; return ARM::VST4LNd8_UPD;
+  case ARM::VST4LNdWB_register_Asm_16: Spacing = 1; return ARM::VST4LNd16_UPD;
+  case ARM::VST4LNdWB_register_Asm_32: Spacing = 1; return ARM::VST4LNd32_UPD;
+  case ARM::VST4LNqWB_register_Asm_16: Spacing = 2; return ARM::VST4LNq16_UPD;
+  case ARM::VST4LNqWB_register_Asm_32: Spacing = 2; return ARM::VST4LNq32_UPD;
+  case ARM::VST4LNdAsm_8:  Spacing = 1; return ARM::VST4LNd8;
+  case ARM::VST4LNdAsm_16: Spacing = 1; return ARM::VST4LNd16;
+  case ARM::VST4LNdAsm_32: Spacing = 1; return ARM::VST4LNd32;
+  case ARM::VST4LNqAsm_16: Spacing = 2; return ARM::VST4LNq16;
+  case ARM::VST4LNqAsm_32: Spacing = 2; return ARM::VST4LNq32;
+
+  // VST4
+  case ARM::VST4dWB_fixed_Asm_8:  Spacing = 1; return ARM::VST4d8_UPD;
+  case ARM::VST4dWB_fixed_Asm_16: Spacing = 1; return ARM::VST4d16_UPD;
+  case ARM::VST4dWB_fixed_Asm_32: Spacing = 1; return ARM::VST4d32_UPD;
+  case ARM::VST4qWB_fixed_Asm_8:  Spacing = 2; return ARM::VST4q8_UPD;
+  case ARM::VST4qWB_fixed_Asm_16: Spacing = 2; return ARM::VST4q16_UPD;
+  case ARM::VST4qWB_fixed_Asm_32: Spacing = 2; return ARM::VST4q32_UPD;
+  case ARM::VST4dWB_register_Asm_8:  Spacing = 1; return ARM::VST4d8_UPD;
+  case ARM::VST4dWB_register_Asm_16: Spacing = 1; return ARM::VST4d16_UPD;
+  case ARM::VST4dWB_register_Asm_32: Spacing = 1; return ARM::VST4d32_UPD;
+  case ARM::VST4qWB_register_Asm_8:  Spacing = 2; return ARM::VST4q8_UPD;
+  case ARM::VST4qWB_register_Asm_16: Spacing = 2; return ARM::VST4q16_UPD;
+  case ARM::VST4qWB_register_Asm_32: Spacing = 2; return ARM::VST4q32_UPD;
+  case ARM::VST4dAsm_8:  Spacing = 1; return ARM::VST4d8;
+  case ARM::VST4dAsm_16: Spacing = 1; return ARM::VST4d16;
+  case ARM::VST4dAsm_32: Spacing = 1; return ARM::VST4d32;
+  case ARM::VST4qAsm_8:  Spacing = 2; return ARM::VST4q8;
+  case ARM::VST4qAsm_16: Spacing = 2; return ARM::VST4q16;
+  case ARM::VST4qAsm_32: Spacing = 2; return ARM::VST4q32;
+  }
+}
+
+static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
+  switch(Opc) {
+  default: llvm_unreachable("unexpected opcode!");
+  // VLD1LN
+  case ARM::VLD1LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD1LNd8_UPD;
+  case ARM::VLD1LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD;
+  case ARM::VLD1LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD;
+  case ARM::VLD1LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD1LNd8_UPD;
+  case ARM::VLD1LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD1LNd16_UPD;
+  case ARM::VLD1LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD1LNd32_UPD;
+  case ARM::VLD1LNdAsm_8:  Spacing = 1; return ARM::VLD1LNd8;
+  case ARM::VLD1LNdAsm_16: Spacing = 1; return ARM::VLD1LNd16;
+  case ARM::VLD1LNdAsm_32: Spacing = 1; return ARM::VLD1LNd32;
+
+  // VLD2LN
+  case ARM::VLD2LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD2LNd8_UPD;
+  case ARM::VLD2LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD;
+  case ARM::VLD2LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD;
+  case ARM::VLD2LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD2LNq16_UPD;
+  case ARM::VLD2LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD;
+  case ARM::VLD2LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD2LNd8_UPD;
+  case ARM::VLD2LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD2LNd16_UPD;
+  case ARM::VLD2LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD2LNd32_UPD;
+  case ARM::VLD2LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD2LNq16_UPD;
+  case ARM::VLD2LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD2LNq32_UPD;
+  case ARM::VLD2LNdAsm_8:  Spacing = 1; return ARM::VLD2LNd8;
+  case ARM::VLD2LNdAsm_16: Spacing = 1; return ARM::VLD2LNd16;
+  case ARM::VLD2LNdAsm_32: Spacing = 1; return ARM::VLD2LNd32;
+  case ARM::VLD2LNqAsm_16: Spacing = 2; return ARM::VLD2LNq16;
+  case ARM::VLD2LNqAsm_32: Spacing = 2; return ARM::VLD2LNq32;
+
+  // VLD3DUP
+  case ARM::VLD3DUPdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD3DUPd8_UPD;
+  case ARM::VLD3DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+  case ARM::VLD3DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPq8_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPq16_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+  case ARM::VLD3DUPdWB_register_Asm_8:  Spacing = 1; return ARM::VLD3DUPd8_UPD;
+  case ARM::VLD3DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+  case ARM::VLD3DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+  case ARM::VLD3DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD3DUPq8_UPD;
+  case ARM::VLD3DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD;
+  case ARM::VLD3DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+  case ARM::VLD3DUPdAsm_8:  Spacing = 1; return ARM::VLD3DUPd8;
+  case ARM::VLD3DUPdAsm_16: Spacing = 1; return ARM::VLD3DUPd16;
+  case ARM::VLD3DUPdAsm_32: Spacing = 1; return ARM::VLD3DUPd32;
+  case ARM::VLD3DUPqAsm_8: Spacing = 2; return ARM::VLD3DUPq8;
+  case ARM::VLD3DUPqAsm_16: Spacing = 2; return ARM::VLD3DUPq16;
+  case ARM::VLD3DUPqAsm_32: Spacing = 2; return ARM::VLD3DUPq32;
+
+  // VLD3LN
+  case ARM::VLD3LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD3LNd8_UPD;
+  case ARM::VLD3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD;
+  case ARM::VLD3LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD;
+  case ARM::VLD3LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3LNq16_UPD;
+  case ARM::VLD3LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD;
+  case ARM::VLD3LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD3LNd8_UPD;
+  case ARM::VLD3LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD;
+  case ARM::VLD3LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD3LNd32_UPD;
+  case ARM::VLD3LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD3LNq16_UPD;
+  case ARM::VLD3LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD3LNq32_UPD;
+  case ARM::VLD3LNdAsm_8:  Spacing = 1; return ARM::VLD3LNd8;
+  case ARM::VLD3LNdAsm_16: Spacing = 1; return ARM::VLD3LNd16;
+  case ARM::VLD3LNdAsm_32: Spacing = 1; return ARM::VLD3LNd32;
+  case ARM::VLD3LNqAsm_16: Spacing = 2; return ARM::VLD3LNq16;
+  case ARM::VLD3LNqAsm_32: Spacing = 2; return ARM::VLD3LNq32;
+
+  // VLD3
+  case ARM::VLD3dWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD3d8_UPD;
+  case ARM::VLD3dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD;
+  case ARM::VLD3dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD;
+  case ARM::VLD3qWB_fixed_Asm_8:  Spacing = 2; return ARM::VLD3q8_UPD;
+  case ARM::VLD3qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD;
+  case ARM::VLD3qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD;
+  case ARM::VLD3dWB_register_Asm_8:  Spacing = 1; return ARM::VLD3d8_UPD;
+  case ARM::VLD3dWB_register_Asm_16: Spacing = 1; return ARM::VLD3d16_UPD;
+  case ARM::VLD3dWB_register_Asm_32: Spacing = 1; return ARM::VLD3d32_UPD;
+  case ARM::VLD3qWB_register_Asm_8:  Spacing = 2; return ARM::VLD3q8_UPD;
+  case ARM::VLD3qWB_register_Asm_16: Spacing = 2; return ARM::VLD3q16_UPD;
+  case ARM::VLD3qWB_register_Asm_32: Spacing = 2; return ARM::VLD3q32_UPD;
+  case ARM::VLD3dAsm_8:  Spacing = 1; return ARM::VLD3d8;
+  case ARM::VLD3dAsm_16: Spacing = 1; return ARM::VLD3d16;
+  case ARM::VLD3dAsm_32: Spacing = 1; return ARM::VLD3d32;
+  case ARM::VLD3qAsm_8:  Spacing = 2; return ARM::VLD3q8;
+  case ARM::VLD3qAsm_16: Spacing = 2; return ARM::VLD3q16;
+  case ARM::VLD3qAsm_32: Spacing = 2; return ARM::VLD3q32;
+
+  // VLD4LN
+  case ARM::VLD4LNdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD4LNd8_UPD;
+  case ARM::VLD4LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
+  case ARM::VLD4LNdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD;
+  case ARM::VLD4LNqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4LNq16_UPD;
+  case ARM::VLD4LNqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD;
+  case ARM::VLD4LNdWB_register_Asm_8:  Spacing = 1; return ARM::VLD4LNd8_UPD;
+  case ARM::VLD4LNdWB_register_Asm_16: Spacing = 1; return ARM::VLD4LNd16_UPD;
+  case ARM::VLD4LNdWB_register_Asm_32: Spacing = 1; return ARM::VLD4LNd32_UPD;
+  case ARM::VLD4LNqWB_register_Asm_16: Spacing = 2; return ARM::VLD4LNq16_UPD;
+  case ARM::VLD4LNqWB_register_Asm_32: Spacing = 2; return ARM::VLD4LNq32_UPD;
+  case ARM::VLD4LNdAsm_8:  Spacing = 1; return ARM::VLD4LNd8;
+  case ARM::VLD4LNdAsm_16: Spacing = 1; return ARM::VLD4LNd16;
+  case ARM::VLD4LNdAsm_32: Spacing = 1; return ARM::VLD4LNd32;
+  case ARM::VLD4LNqAsm_16: Spacing = 2; return ARM::VLD4LNq16;
+  case ARM::VLD4LNqAsm_32: Spacing = 2; return ARM::VLD4LNq32;
+
+  // VLD4DUP
+  case ARM::VLD4DUPdWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD4DUPd8_UPD;
+  case ARM::VLD4DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD;
+  case ARM::VLD4DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD;
+  case ARM::VLD4DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD4DUPq8_UPD;
+  case ARM::VLD4DUPqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4DUPq16_UPD;
+  case ARM::VLD4DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD;
+  case ARM::VLD4DUPdWB_register_Asm_8:  Spacing = 1; return ARM::VLD4DUPd8_UPD;
+  case ARM::VLD4DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD4DUPd16_UPD;
+  case ARM::VLD4DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD4DUPd32_UPD;
+  case ARM::VLD4DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD4DUPq8_UPD;
+  case ARM::VLD4DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD4DUPq16_UPD;
+  case ARM::VLD4DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD4DUPq32_UPD;
+  case ARM::VLD4DUPdAsm_8:  Spacing = 1; return ARM::VLD4DUPd8;
+  case ARM::VLD4DUPdAsm_16: Spacing = 1; return ARM::VLD4DUPd16;
+  case ARM::VLD4DUPdAsm_32: Spacing = 1; return ARM::VLD4DUPd32;
+  case ARM::VLD4DUPqAsm_8: Spacing = 2; return ARM::VLD4DUPq8;
+  case ARM::VLD4DUPqAsm_16: Spacing = 2; return ARM::VLD4DUPq16;
+  case ARM::VLD4DUPqAsm_32: Spacing = 2; return ARM::VLD4DUPq32;
+
+  // VLD4
+  case ARM::VLD4dWB_fixed_Asm_8:  Spacing = 1; return ARM::VLD4d8_UPD;
+  case ARM::VLD4dWB_fixed_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD;
+  case ARM::VLD4dWB_fixed_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD;
+  case ARM::VLD4qWB_fixed_Asm_8:  Spacing = 2; return ARM::VLD4q8_UPD;
+  case ARM::VLD4qWB_fixed_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD;
+  case ARM::VLD4qWB_fixed_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD;
+  case ARM::VLD4dWB_register_Asm_8:  Spacing = 1; return ARM::VLD4d8_UPD;
+  case ARM::VLD4dWB_register_Asm_16: Spacing = 1; return ARM::VLD4d16_UPD;
+  case ARM::VLD4dWB_register_Asm_32: Spacing = 1; return ARM::VLD4d32_UPD;
+  case ARM::VLD4qWB_register_Asm_8:  Spacing = 2; return ARM::VLD4q8_UPD;
+  case ARM::VLD4qWB_register_Asm_16: Spacing = 2; return ARM::VLD4q16_UPD;
+  case ARM::VLD4qWB_register_Asm_32: Spacing = 2; return ARM::VLD4q32_UPD;
+  case ARM::VLD4dAsm_8:  Spacing = 1; return ARM::VLD4d8;
+  case ARM::VLD4dAsm_16: Spacing = 1; return ARM::VLD4d16;
+  case ARM::VLD4dAsm_32: Spacing = 1; return ARM::VLD4d32;
+  case ARM::VLD4qAsm_8:  Spacing = 2; return ARM::VLD4q8;
+  case ARM::VLD4qAsm_16: Spacing = 2; return ARM::VLD4q16;
+  case ARM::VLD4qAsm_32: Spacing = 2; return ARM::VLD4q32;
+  }
+}
+
+bool ARMAsmParser::
+processInstruction(MCInst &Inst,
+                   const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  switch (Inst.getOpcode()) {
+  // Alias for alternate form of 'ADR Rd, #imm' instruction.
+  case ARM::ADDri: {
+    if (Inst.getOperand(1).getReg() != ARM::PC ||
+        Inst.getOperand(5).getReg() != 0)
+      return false;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::ADR);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+  // Aliases for alternate PC+imm syntax of LDR instructions.
+  case ARM::t2LDRpcrel:
+    // Select the narrow version if the immediate will fit.
+    if (Inst.getOperand(1).getImm() > 0 &&
+        Inst.getOperand(1).getImm() <= 0xff)
+      Inst.setOpcode(ARM::tLDRpci);
+    else
+      Inst.setOpcode(ARM::t2LDRpci);
+    return true;
+  case ARM::t2LDRBpcrel:
+    Inst.setOpcode(ARM::t2LDRBpci);
+    return true;
+  case ARM::t2LDRHpcrel:
+    Inst.setOpcode(ARM::t2LDRHpci);
+    return true;
+  case ARM::t2LDRSBpcrel:
+    Inst.setOpcode(ARM::t2LDRSBpci);
+    return true;
+  case ARM::t2LDRSHpcrel:
+    Inst.setOpcode(ARM::t2LDRSHpci);
+    return true;
+  // Handle NEON VST complex aliases.
+  case ARM::VST1LNdWB_register_Asm_8:
+  case ARM::VST1LNdWB_register_Asm_16:
+  case ARM::VST1LNdWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST2LNdWB_register_Asm_8:
+  case ARM::VST2LNdWB_register_Asm_16:
+  case ARM::VST2LNdWB_register_Asm_32:
+  case ARM::VST2LNqWB_register_Asm_16:
+  case ARM::VST2LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3LNdWB_register_Asm_8:
+  case ARM::VST3LNdWB_register_Asm_16:
+  case ARM::VST3LNdWB_register_Asm_32:
+  case ARM::VST3LNqWB_register_Asm_16:
+  case ARM::VST3LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4LNdWB_register_Asm_8:
+  case ARM::VST4LNdWB_register_Asm_16:
+  case ARM::VST4LNdWB_register_Asm_32:
+  case ARM::VST4LNqWB_register_Asm_16:
+  case ARM::VST4LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST1LNdWB_fixed_Asm_8:
+  case ARM::VST1LNdWB_fixed_Asm_16:
+  case ARM::VST1LNdWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST2LNdWB_fixed_Asm_8:
+  case ARM::VST2LNdWB_fixed_Asm_16:
+  case ARM::VST2LNdWB_fixed_Asm_32:
+  case ARM::VST2LNqWB_fixed_Asm_16:
+  case ARM::VST2LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3LNdWB_fixed_Asm_8:
+  case ARM::VST3LNdWB_fixed_Asm_16:
+  case ARM::VST3LNdWB_fixed_Asm_32:
+  case ARM::VST3LNqWB_fixed_Asm_16:
+  case ARM::VST3LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4LNdWB_fixed_Asm_8:
+  case ARM::VST4LNdWB_fixed_Asm_16:
+  case ARM::VST4LNdWB_fixed_Asm_32:
+  case ARM::VST4LNqWB_fixed_Asm_16:
+  case ARM::VST4LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST1LNdAsm_8:
+  case ARM::VST1LNdAsm_16:
+  case ARM::VST1LNdAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST2LNdAsm_8:
+  case ARM::VST2LNdAsm_16:
+  case ARM::VST2LNdAsm_32:
+  case ARM::VST2LNqAsm_16:
+  case ARM::VST2LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3LNdAsm_8:
+  case ARM::VST3LNdAsm_16:
+  case ARM::VST3LNdAsm_32:
+  case ARM::VST3LNqAsm_16:
+  case ARM::VST3LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4LNdAsm_8:
+  case ARM::VST4LNdAsm_16:
+  case ARM::VST4LNdAsm_32:
+  case ARM::VST4LNqAsm_16:
+  case ARM::VST4LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // Handle NEON VLD complex aliases.
+  case ARM::VLD1LNdWB_register_Asm_8:
+  case ARM::VLD1LNdWB_register_Asm_16:
+  case ARM::VLD1LNdWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD2LNdWB_register_Asm_8:
+  case ARM::VLD2LNdWB_register_Asm_16:
+  case ARM::VLD2LNdWB_register_Asm_32:
+  case ARM::VLD2LNqWB_register_Asm_16:
+  case ARM::VLD2LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3LNdWB_register_Asm_8:
+  case ARM::VLD3LNdWB_register_Asm_16:
+  case ARM::VLD3LNdWB_register_Asm_32:
+  case ARM::VLD3LNqWB_register_Asm_16:
+  case ARM::VLD3LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4LNdWB_register_Asm_8:
+  case ARM::VLD4LNdWB_register_Asm_16:
+  case ARM::VLD4LNdWB_register_Asm_32:
+  case ARM::VLD4LNqWB_register_Asm_16:
+  case ARM::VLD4LNqWB_register_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(4)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(5)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(6));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD1LNdWB_fixed_Asm_8:
+  case ARM::VLD1LNdWB_fixed_Asm_16:
+  case ARM::VLD1LNdWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD2LNdWB_fixed_Asm_8:
+  case ARM::VLD2LNdWB_fixed_Asm_16:
+  case ARM::VLD2LNdWB_fixed_Asm_32:
+  case ARM::VLD2LNqWB_fixed_Asm_16:
+  case ARM::VLD2LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3LNdWB_fixed_Asm_8:
+  case ARM::VLD3LNdWB_fixed_Asm_16:
+  case ARM::VLD3LNdWB_fixed_Asm_32:
+  case ARM::VLD3LNqWB_fixed_Asm_16:
+  case ARM::VLD3LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4LNdWB_fixed_Asm_8:
+  case ARM::VLD4LNdWB_fixed_Asm_16:
+  case ARM::VLD4LNdWB_fixed_Asm_32:
+  case ARM::VLD4LNqWB_fixed_Asm_16:
+  case ARM::VLD4LNqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD1LNdAsm_8:
+  case ARM::VLD1LNdAsm_16:
+  case ARM::VLD1LNdAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD2LNdAsm_8:
+  case ARM::VLD2LNdAsm_16:
+  case ARM::VLD2LNdAsm_32:
+  case ARM::VLD2LNqAsm_16:
+  case ARM::VLD2LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3LNdAsm_8:
+  case ARM::VLD3LNdAsm_16:
+  case ARM::VLD3LNdAsm_32:
+  case ARM::VLD3LNqAsm_16:
+  case ARM::VLD3LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4LNdAsm_8:
+  case ARM::VLD4LNdAsm_16:
+  case ARM::VLD4LNdAsm_32:
+  case ARM::VLD4LNqAsm_16:
+  case ARM::VLD4LNqAsm_32: {
+    MCInst TmpInst;
+    // Shuffle the operands around so the lane index operand is in the
+    // right place.
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(2)); // Rn
+    TmpInst.addOperand(Inst.getOperand(3)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Tied operand src (== Vd)
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // lane
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD3DUP single 3-element structure to all lanes instructions.
+  case ARM::VLD3DUPdAsm_8:
+  case ARM::VLD3DUPdAsm_16:
+  case ARM::VLD3DUPdAsm_32:
+  case ARM::VLD3DUPqAsm_8:
+  case ARM::VLD3DUPqAsm_16:
+  case ARM::VLD3DUPqAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3DUPdWB_fixed_Asm_8:
+  case ARM::VLD3DUPdWB_fixed_Asm_16:
+  case ARM::VLD3DUPdWB_fixed_Asm_32:
+  case ARM::VLD3DUPqWB_fixed_Asm_8:
+  case ARM::VLD3DUPqWB_fixed_Asm_16:
+  case ARM::VLD3DUPqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3DUPdWB_register_Asm_8:
+  case ARM::VLD3DUPdWB_register_Asm_16:
+  case ARM::VLD3DUPdWB_register_Asm_32:
+  case ARM::VLD3DUPqWB_register_Asm_8:
+  case ARM::VLD3DUPqWB_register_Asm_16:
+  case ARM::VLD3DUPqWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD3 multiple 3-element structure instructions.
+  case ARM::VLD3dAsm_8:
+  case ARM::VLD3dAsm_16:
+  case ARM::VLD3dAsm_32:
+  case ARM::VLD3qAsm_8:
+  case ARM::VLD3qAsm_16:
+  case ARM::VLD3qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3dWB_fixed_Asm_8:
+  case ARM::VLD3dWB_fixed_Asm_16:
+  case ARM::VLD3dWB_fixed_Asm_32:
+  case ARM::VLD3qWB_fixed_Asm_8:
+  case ARM::VLD3qWB_fixed_Asm_16:
+  case ARM::VLD3qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD3dWB_register_Asm_8:
+  case ARM::VLD3dWB_register_Asm_16:
+  case ARM::VLD3dWB_register_Asm_32:
+  case ARM::VLD3qWB_register_Asm_8:
+  case ARM::VLD3qWB_register_Asm_16:
+  case ARM::VLD3qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD4DUP single 3-element structure to all lanes instructions.
+  case ARM::VLD4DUPdAsm_8:
+  case ARM::VLD4DUPdAsm_16:
+  case ARM::VLD4DUPdAsm_32:
+  case ARM::VLD4DUPqAsm_8:
+  case ARM::VLD4DUPqAsm_16:
+  case ARM::VLD4DUPqAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4DUPdWB_fixed_Asm_8:
+  case ARM::VLD4DUPdWB_fixed_Asm_16:
+  case ARM::VLD4DUPdWB_fixed_Asm_32:
+  case ARM::VLD4DUPqWB_fixed_Asm_8:
+  case ARM::VLD4DUPqWB_fixed_Asm_16:
+  case ARM::VLD4DUPqWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4DUPdWB_register_Asm_8:
+  case ARM::VLD4DUPdWB_register_Asm_16:
+  case ARM::VLD4DUPdWB_register_Asm_32:
+  case ARM::VLD4DUPqWB_register_Asm_8:
+  case ARM::VLD4DUPqWB_register_Asm_16:
+  case ARM::VLD4DUPqWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VLD4 multiple 4-element structure instructions.
+  case ARM::VLD4dAsm_8:
+  case ARM::VLD4dAsm_16:
+  case ARM::VLD4dAsm_32:
+  case ARM::VLD4qAsm_8:
+  case ARM::VLD4qAsm_16:
+  case ARM::VLD4qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4dWB_fixed_Asm_8:
+  case ARM::VLD4dWB_fixed_Asm_16:
+  case ARM::VLD4dWB_fixed_Asm_32:
+  case ARM::VLD4qWB_fixed_Asm_8:
+  case ARM::VLD4qWB_fixed_Asm_16:
+  case ARM::VLD4qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VLD4dWB_register_Asm_8:
+  case ARM::VLD4dWB_register_Asm_16:
+  case ARM::VLD4dWB_register_Asm_32:
+  case ARM::VLD4qWB_register_Asm_8:
+  case ARM::VLD4qWB_register_Asm_16:
+  case ARM::VLD4qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VST3 multiple 3-element structure instructions.
+  case ARM::VST3dAsm_8:
+  case ARM::VST3dAsm_16:
+  case ARM::VST3dAsm_32:
+  case ARM::VST3qAsm_8:
+  case ARM::VST3qAsm_16:
+  case ARM::VST3qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3dWB_fixed_Asm_8:
+  case ARM::VST3dWB_fixed_Asm_16:
+  case ARM::VST3dWB_fixed_Asm_32:
+  case ARM::VST3qWB_fixed_Asm_8:
+  case ARM::VST3qWB_fixed_Asm_16:
+  case ARM::VST3qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST3dWB_register_Asm_8:
+  case ARM::VST3dWB_register_Asm_16:
+  case ARM::VST3dWB_register_Asm_32:
+  case ARM::VST3qWB_register_Asm_8:
+  case ARM::VST3qWB_register_Asm_16:
+  case ARM::VST3qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // VST4 multiple 3-element structure instructions.
+  case ARM::VST4dAsm_8:
+  case ARM::VST4dAsm_16:
+  case ARM::VST4dAsm_32:
+  case ARM::VST4qAsm_8:
+  case ARM::VST4qAsm_16:
+  case ARM::VST4qAsm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4dWB_fixed_Asm_8:
+  case ARM::VST4dWB_fixed_Asm_16:
+  case ARM::VST4dWB_fixed_Asm_32:
+  case ARM::VST4qWB_fixed_Asm_8:
+  case ARM::VST4qWB_fixed_Asm_16:
+  case ARM::VST4qWB_fixed_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+
+  case ARM::VST4dWB_register_Asm_8:
+  case ARM::VST4dWB_register_Asm_16:
+  case ARM::VST4dWB_register_Asm_32:
+  case ARM::VST4qWB_register_Asm_8:
+  case ARM::VST4qWB_register_Asm_16:
+  case ARM::VST4qWB_register_Asm_32: {
+    MCInst TmpInst;
+    unsigned Spacing;
+    TmpInst.setOpcode(getRealVSTOpcode(Inst.getOpcode(), Spacing));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // alignment
+    TmpInst.addOperand(Inst.getOperand(3)); // Rm
+    TmpInst.addOperand(Inst.getOperand(0)); // Vd
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 2));
+    TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+                                            Spacing * 3));
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    Inst = TmpInst;
+    return true;
+  }
+
+  // Handle encoding choice for the shift-immediate instructions.
+  case ARM::t2LSLri:
+  case ARM::t2LSRri:
+  case ARM::t2ASRri: {
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+        !(static_cast<ARMOperand*>(Operands[3])->isToken() &&
+         static_cast<ARMOperand*>(Operands[3])->getToken() == ".w")) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case ARM::t2LSLri: NewOpc = ARM::tLSLri; break;
+      case ARM::t2LSRri: NewOpc = ARM::tLSRri; break;
+      case ARM::t2ASRri: NewOpc = ARM::tASRri; break;
+      }
+      // The Thumb1 operands aren't in the same order. Awesome, eh?
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+
+  // Handle the Thumb2 mode MOV complex aliases.
+  case ARM::t2MOVsr:
+  case ARM::t2MOVSsr: {
+    // Which instruction to expand to depends on the CCOut operand and
+    // whether we're in an IT block if the register operands are low
+    // registers.
+    bool isNarrow = false;
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        isARMLowRegister(Inst.getOperand(2).getReg()) &&
+        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr))
+      isNarrow = true;
+    MCInst TmpInst;
+    unsigned newOpc;
+    switch(ARM_AM::getSORegShOp(Inst.getOperand(3).getImm())) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRrr : ARM::t2ASRrr; break;
+    case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRrr : ARM::t2LSRrr; break;
+    case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLrr : ARM::t2LSLrr; break;
+    case ARM_AM::ror: newOpc = isNarrow ? ARM::tROR   : ARM::t2RORrr; break;
+    }
+    TmpInst.setOpcode(newOpc);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    if (isNarrow)
+      TmpInst.addOperand(MCOperand::CreateReg(
+          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // Rm
+    TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(5));
+    if (!isNarrow)
+      TmpInst.addOperand(MCOperand::CreateReg(
+          Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2MOVsi:
+  case ARM::t2MOVSsi: {
+    // Which instruction to expand to depends on the CCOut operand and
+    // whether we're in an IT block if the register operands are low
+    // registers.
+    bool isNarrow = false;
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi))
+      isNarrow = true;
+    MCInst TmpInst;
+    unsigned newOpc;
+    switch(ARM_AM::getSORegShOp(Inst.getOperand(2).getImm())) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
+    case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
+    case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
+    case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
+    case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
+    }
+    unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm());
+    if (Amount == 32) Amount = 0;
+    TmpInst.setOpcode(newOpc);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    if (isNarrow)
+      TmpInst.addOperand(MCOperand::CreateReg(
+          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    if (newOpc != ARM::t2RRX)
+      TmpInst.addOperand(MCOperand::CreateImm(Amount));
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    if (!isNarrow)
+      TmpInst.addOperand(MCOperand::CreateReg(
+          Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
+    Inst = TmpInst;
+    return true;
+  }
+  // Handle the ARM mode MOV complex aliases.
+  case ARM::ASRr:
+  case ARM::LSRr:
+  case ARM::LSLr:
+  case ARM::RORr: {
+    ARM_AM::ShiftOpc ShiftTy;
+    switch(Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ASRr: ShiftTy = ARM_AM::asr; break;
+    case ARM::LSRr: ShiftTy = ARM_AM::lsr; break;
+    case ARM::LSLr: ShiftTy = ARM_AM::lsl; break;
+    case ARM::RORr: ShiftTy = ARM_AM::ror; break;
+    }
+    unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, 0);
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::MOVsr);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(Inst.getOperand(2)); // Rm
+    TmpInst.addOperand(MCOperand::CreateImm(Shifter)); // Shift value and ty
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    TmpInst.addOperand(Inst.getOperand(5)); // cc_out
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::ASRi:
+  case ARM::LSRi:
+  case ARM::LSLi:
+  case ARM::RORi: {
+    ARM_AM::ShiftOpc ShiftTy;
+    switch(Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ASRi: ShiftTy = ARM_AM::asr; break;
+    case ARM::LSRi: ShiftTy = ARM_AM::lsr; break;
+    case ARM::LSLi: ShiftTy = ARM_AM::lsl; break;
+    case ARM::RORi: ShiftTy = ARM_AM::ror; break;
+    }
+    // A shift by zero is a plain MOVr, not a MOVsi.
+    unsigned Amt = Inst.getOperand(2).getImm();
+    unsigned Opc = Amt == 0 ? ARM::MOVr : ARM::MOVsi;
+    // A shift by 32 should be encoded as 0 when permitted
+    if (Amt == 32 && (ShiftTy == ARM_AM::lsr || ShiftTy == ARM_AM::asr))
+      Amt = 0;
+    unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, Amt);
+    MCInst TmpInst;
+    TmpInst.setOpcode(Opc);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    if (Opc == ARM::MOVsi)
+      TmpInst.addOperand(MCOperand::CreateImm(Shifter)); // Shift value and ty
+    TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(4));
+    TmpInst.addOperand(Inst.getOperand(5)); // cc_out
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::RRXi: {
+    unsigned Shifter = ARM_AM::getSORegOpc(ARM_AM::rrx, 0);
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::MOVsi);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rd
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(MCOperand::CreateImm(Shifter)); // Shift value and ty
+    TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4)); // cc_out
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2LDMIA_UPD: {
+    // If this is a load of a single register, then we should use
+    // a post-indexed LDR instruction instead, per the ARM ARM.
+    if (Inst.getNumOperands() != 5)
+      return false;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::t2LDR_POST);
+    TmpInst.addOperand(Inst.getOperand(4)); // Rt
+    TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(MCOperand::CreateImm(4));
+    TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2STMDB_UPD: {
+    // If this is a store of a single register, then we should use
+    // a pre-indexed STR instruction instead, per the ARM ARM.
+    if (Inst.getNumOperands() != 5)
+      return false;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::t2STR_PRE);
+    TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+    TmpInst.addOperand(Inst.getOperand(4)); // Rt
+    TmpInst.addOperand(Inst.getOperand(1)); // Rn
+    TmpInst.addOperand(MCOperand::CreateImm(-4));
+    TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+    TmpInst.addOperand(Inst.getOperand(3));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::LDMIA_UPD:
+    // If this is a load of a single register via a 'pop', then we should use
+    // a post-indexed LDR instruction instead, per the ARM ARM.
+    if (static_cast<ARMOperand*>(Operands[0])->getToken() == "pop" &&
+        Inst.getNumOperands() == 5) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::LDR_POST_IMM);
+      TmpInst.addOperand(Inst.getOperand(4)); // Rt
+      TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+      TmpInst.addOperand(Inst.getOperand(1)); // Rn
+      TmpInst.addOperand(MCOperand::CreateReg(0));  // am2offset
+      TmpInst.addOperand(MCOperand::CreateImm(4));
+      TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  case ARM::STMDB_UPD:
+    // If this is a store of a single register via a 'push', then we should use
+    // a pre-indexed STR instruction instead, per the ARM ARM.
+    if (static_cast<ARMOperand*>(Operands[0])->getToken() == "push" &&
+        Inst.getNumOperands() == 5) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::STR_PRE_IMM);
+      TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+      TmpInst.addOperand(Inst.getOperand(4)); // Rt
+      TmpInst.addOperand(Inst.getOperand(1)); // addrmode_imm12
+      TmpInst.addOperand(MCOperand::CreateImm(-4));
+      TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+    }
+    break;
+  case ARM::t2ADDri12:
+    // If the immediate fits for encoding T3 (t2ADDri) and the generic "add"
+    // mnemonic was used (not "addw"), encoding T3 is preferred.
+    if (static_cast<ARMOperand*>(Operands[0])->getToken() != "add" ||
+        ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
+      break;
+    Inst.setOpcode(ARM::t2ADDri);
+    Inst.addOperand(MCOperand::CreateReg(0)); // cc_out
+    break;
+  case ARM::t2SUBri12:
+    // If the immediate fits for encoding T3 (t2SUBri) and the generic "sub"
+    // mnemonic was used (not "subw"), encoding T3 is preferred.
+    if (static_cast<ARMOperand*>(Operands[0])->getToken() != "sub" ||
+        ARM_AM::getT2SOImmVal(Inst.getOperand(2).getImm()) == -1)
+      break;
+    Inst.setOpcode(ARM::t2SUBri);
+    Inst.addOperand(MCOperand::CreateReg(0)); // cc_out
+    break;
+  case ARM::tADDi8:
+    // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+    // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+    // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+    // to encoding T1 if <Rd> is omitted."
+    if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+      Inst.setOpcode(ARM::tADDi3);
+      return true;
+    }
+    break;
+  case ARM::tSUBi8:
+    // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+    // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+    // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+    // to encoding T1 if <Rd> is omitted."
+    if ((unsigned)Inst.getOperand(3).getImm() < 8 && Operands.size() == 6) {
+      Inst.setOpcode(ARM::tSUBi3);
+      return true;
+    }
+    break;
+  case ARM::t2ADDri:
+  case ARM::t2SUBri: {
+    // If the destination and first source operand are the same, and
+    // the flags are compatible with the current IT status, use encoding T2
+    // instead of T3. For compatibility with the system 'as'. Make sure the
+    // wide encoding wasn't explicit.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
+        !isARMLowRegister(Inst.getOperand(0).getReg()) ||
+        (unsigned)Inst.getOperand(2).getImm() > 255 ||
+        ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) ||
+        (inITBlock() && Inst.getOperand(5).getReg() != 0)) ||
+        (static_cast<ARMOperand*>(Operands[3])->isToken() &&
+         static_cast<ARMOperand*>(Operands[3])->getToken() == ".w"))
+      break;
+    MCInst TmpInst;
+    TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ?
+                      ARM::tADDi8 : ARM::tSUBi8);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(5));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::t2ADDrr: {
+    // If the destination and first source operand are the same, and
+    // there's no setting of the flags, use encoding T2 instead of T3.
+    // Note that this is only for ADD, not SUB. This mirrors the system
+    // 'as' behaviour. Make sure the wide encoding wasn't explicit.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
+        Inst.getOperand(5).getReg() != 0 ||
+        (static_cast<ARMOperand*>(Operands[3])->isToken() &&
+         static_cast<ARMOperand*>(Operands[3])->getToken() == ".w"))
+      break;
+    MCInst TmpInst;
+    TmpInst.setOpcode(ARM::tADDhirr);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(3));
+    TmpInst.addOperand(Inst.getOperand(4));
+    Inst = TmpInst;
+    return true;
+  }
+  case ARM::tADDrSP: {
+    // If the non-SP source operand and the destination operand are not the
+    // same, we need to use the 32-bit encoding if it's available.
+    if (Inst.getOperand(0).getReg() != Inst.getOperand(2).getReg()) {
+      Inst.setOpcode(ARM::t2ADDrr);
+      Inst.addOperand(MCOperand::CreateReg(0)); // cc_out
+      return true;
+    }
+    break;
+  }
+  case ARM::tB:
+    // A Thumb conditional branch outside of an IT block is a tBcc.
+    if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()) {
+      Inst.setOpcode(ARM::tBcc);
+      return true;
+    }
+    break;
+  case ARM::t2B:
+    // A Thumb2 conditional branch outside of an IT block is a t2Bcc.
+    if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock()){
+      Inst.setOpcode(ARM::t2Bcc);
+      return true;
+    }
+    break;
+  case ARM::t2Bcc:
+    // If the conditional is AL or we're in an IT block, we really want t2B.
+    if (Inst.getOperand(1).getImm() == ARMCC::AL || inITBlock()) {
+      Inst.setOpcode(ARM::t2B);
+      return true;
+    }
+    break;
+  case ARM::tBcc:
+    // If the conditional is AL, we really want tB.
+    if (Inst.getOperand(1).getImm() == ARMCC::AL) {
+      Inst.setOpcode(ARM::tB);
+      return true;
+    }
+    break;
+  case ARM::tLDMIA: {
+    // If the register list contains any high registers, or if the writeback
+    // doesn't match what tLDMIA can do, we need to use the 32-bit encoding
+    // instead if we're in Thumb2. Otherwise, this should have generated
+    // an error in validateInstruction().
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool hasWritebackToken =
+      (static_cast<ARMOperand*>(Operands[3])->isToken() &&
+       static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) ||
+        (!listContainsBase && !hasWritebackToken) ||
+        (listContainsBase && hasWritebackToken)) {
+      // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+      assert (isThumbTwo());
+      Inst.setOpcode(hasWritebackToken ? ARM::t2LDMIA_UPD : ARM::t2LDMIA);
+      // If we're switching to the updating version, we need to insert
+      // the writeback tied operand.
+      if (hasWritebackToken)
+        Inst.insert(Inst.begin(),
+                    MCOperand::CreateReg(Inst.getOperand(0).getReg()));
+      return true;
+    }
+    break;
+  }
+  case ARM::tSTMIA_UPD: {
+    // If the register list contains any high registers, we need to use
+    // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+    // should have generated an error in validateInstruction().
+    unsigned Rn = Inst.getOperand(0).getReg();
+    bool listContainsBase;
+    if (checkLowRegisterList(Inst, 4, Rn, 0, listContainsBase)) {
+      // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+      assert (isThumbTwo());
+      Inst.setOpcode(ARM::t2STMIA_UPD);
+      return true;
+    }
+    break;
+  }
+  case ARM::tPOP: {
+    bool listContainsBase;
+    // If the register list contains any high registers, we need to use
+    // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+    // should have generated an error in validateInstruction().
+    if (!checkLowRegisterList(Inst, 2, 0, ARM::PC, listContainsBase))
+      return false;
+    assert (isThumbTwo());
+    Inst.setOpcode(ARM::t2LDMIA_UPD);
+    // Add the base register and writeback operands.
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(ARM::SP));
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(ARM::SP));
+    return true;
+  }
+  case ARM::tPUSH: {
+    bool listContainsBase;
+    if (!checkLowRegisterList(Inst, 2, 0, ARM::LR, listContainsBase))
+      return false;
+    assert (isThumbTwo());
+    Inst.setOpcode(ARM::t2STMDB_UPD);
+    // Add the base register and writeback operands.
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(ARM::SP));
+    Inst.insert(Inst.begin(), MCOperand::CreateReg(ARM::SP));
+    return true;
+  }
+  case ARM::t2MOVi: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        (unsigned)Inst.getOperand(1).getImm() <= 255 &&
+        ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL &&
+         Inst.getOperand(4).getReg() == ARM::CPSR) ||
+        (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
+        (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
+         static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+      // The operands aren't in the same order for tMOVi8...
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::tMOVi8);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::t2MOVr: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        Inst.getOperand(2).getImm() == ARMCC::AL &&
+        Inst.getOperand(4).getReg() == ARM::CPSR &&
+        (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
+         static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+      // The operands aren't the same for tMOV[S]r... (no cc_out)
+      MCInst TmpInst;
+      TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::t2SXTH:
+  case ARM::t2SXTB:
+  case ARM::t2UXTH:
+  case ARM::t2UXTB: {
+    // If we can use the 16-bit encoding and the user didn't explicitly
+    // request the 32-bit variant, transform it here.
+    if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
+        Inst.getOperand(2).getImm() == 0 &&
+        (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
+         static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+      default: llvm_unreachable("Illegal opcode!");
+      case ARM::t2SXTH: NewOpc = ARM::tSXTH; break;
+      case ARM::t2SXTB: NewOpc = ARM::tSXTB; break;
+      case ARM::t2UXTH: NewOpc = ARM::tUXTH; break;
+      case ARM::t2UXTB: NewOpc = ARM::tUXTB; break;
+      }
+      // The operands aren't the same for thumb1 (no rotate operand).
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    break;
+  }
+  case ARM::MOVsi: {
+    ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm());
+    // rrx shifts and asr/lsr of #32 is encoded as 0
+    if (SOpc == ARM_AM::rrx || SOpc == ARM_AM::asr || SOpc == ARM_AM::lsr) 
+      return false;
+    if (ARM_AM::getSORegOffset(Inst.getOperand(2).getImm()) == 0) {
+      // Shifting by zero is accepted as a vanilla 'MOVr'
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVr);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(5));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::ANDrsi:
+  case ARM::ORRrsi:
+  case ARM::EORrsi:
+  case ARM::BICrsi:
+  case ARM::SUBrsi:
+  case ARM::ADDrsi: {
+    unsigned newOpc;
+    ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(Inst.getOperand(3).getImm());
+    if (SOpc == ARM_AM::rrx) return false;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("unexpected opcode!");
+    case ARM::ANDrsi: newOpc = ARM::ANDrr; break;
+    case ARM::ORRrsi: newOpc = ARM::ORRrr; break;
+    case ARM::EORrsi: newOpc = ARM::EORrr; break;
+    case ARM::BICrsi: newOpc = ARM::BICrr; break;
+    case ARM::SUBrsi: newOpc = ARM::SUBrr; break;
+    case ARM::ADDrsi: newOpc = ARM::ADDrr; break;
+    }
+    // If the shift is by zero, use the non-shifted instruction definition.
+    // The exception is for right shifts, where 0 == 32
+    if (ARM_AM::getSORegOffset(Inst.getOperand(3).getImm()) == 0 &&
+        !(SOpc == ARM_AM::lsr || SOpc == ARM_AM::asr)) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(newOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(4));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(6));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::ITasm:
+  case ARM::t2IT: {
+    // The mask bits for all but the first condition are represented as
+    // the low bit of the condition code value implies 't'. We currently
+    // always have 1 implies 't', so XOR toggle the bits if the low bit
+    // of the condition code is zero. 
+    MCOperand &MO = Inst.getOperand(1);
+    unsigned Mask = MO.getImm();
+    unsigned OrigMask = Mask;
+    unsigned TZ = CountTrailingZeros_32(Mask);
+    if ((Inst.getOperand(0).getImm() & 1) == 0) {
+      assert(Mask && TZ <= 3 && "illegal IT mask value!");
+      for (unsigned i = 3; i != TZ; --i)
+        Mask ^= 1 << i;
+    }
+    MO.setImm(Mask);
+
+    // Set up the IT block state according to the IT instruction we just
+    // matched.
+    assert(!inITBlock() && "nested IT blocks?!");
+    ITState.Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm());
+    ITState.Mask = OrigMask; // Use the original mask, not the updated one.
+    ITState.CurPosition = 0;
+    ITState.FirstCond = true;
+    break;
+  }
+  case ARM::t2LSLrr:
+  case ARM::t2LSRrr:
+  case ARM::t2ASRrr:
+  case ARM::t2SBCrr:
+  case ARM::t2RORrr:
+  case ARM::t2BICrr:
+  {
+    // Assemblers should use the narrow encodings of these instructions when permissible.
+    if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+         isARMLowRegister(Inst.getOperand(2).getReg())) &&
+        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && 
+        (!static_cast<ARMOperand*>(Operands[3])->isToken() ||
+         !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+        default: llvm_unreachable("unexpected opcode");
+        case ARM::t2LSLrr: NewOpc = ARM::tLSLrr; break;
+        case ARM::t2LSRrr: NewOpc = ARM::tLSRrr; break;
+        case ARM::t2ASRrr: NewOpc = ARM::tASRrr; break;
+        case ARM::t2SBCrr: NewOpc = ARM::tSBC; break;
+        case ARM::t2RORrr: NewOpc = ARM::tROR; break;
+        case ARM::t2BICrr: NewOpc = ARM::tBIC; break;
+      }
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(5));
+      TmpInst.addOperand(Inst.getOperand(1));
+      TmpInst.addOperand(Inst.getOperand(2));
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  case ARM::t2ANDrr:
+  case ARM::t2EORrr:
+  case ARM::t2ADCrr:
+  case ARM::t2ORRrr:
+  {
+    // Assemblers should use the narrow encodings of these instructions when permissible.
+    // These instructions are special in that they are commutable, so shorter encodings
+    // are available more often.
+    if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
+         isARMLowRegister(Inst.getOperand(2).getReg())) &&
+        (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
+         Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
+        ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
+         (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) && 
+        (!static_cast<ARMOperand*>(Operands[3])->isToken() ||
+         !static_cast<ARMOperand*>(Operands[3])->getToken().equals_lower(".w"))) {
+      unsigned NewOpc;
+      switch (Inst.getOpcode()) {
+        default: llvm_unreachable("unexpected opcode");
+        case ARM::t2ADCrr: NewOpc = ARM::tADC; break;
+        case ARM::t2ANDrr: NewOpc = ARM::tAND; break;
+        case ARM::t2EORrr: NewOpc = ARM::tEOR; break;
+        case ARM::t2ORRrr: NewOpc = ARM::tORR; break;
+      }
+      MCInst TmpInst;
+      TmpInst.setOpcode(NewOpc);
+      TmpInst.addOperand(Inst.getOperand(0));
+      TmpInst.addOperand(Inst.getOperand(5));
+      if (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) {
+        TmpInst.addOperand(Inst.getOperand(1));
+        TmpInst.addOperand(Inst.getOperand(2));
+      } else {
+        TmpInst.addOperand(Inst.getOperand(2));
+        TmpInst.addOperand(Inst.getOperand(1));
+      }
+      TmpInst.addOperand(Inst.getOperand(3));
+      TmpInst.addOperand(Inst.getOperand(4));
+      Inst = TmpInst;
+      return true;
+    }
+    return false;
+  }
+  }
+  return false;
+}
+
+unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+  // 16-bit thumb arithmetic instructions either require or preclude the 'S'
+  // suffix depending on whether they're in an IT block or not.
+  unsigned Opc = Inst.getOpcode();
+  const MCInstrDesc &MCID = getInstDesc(Opc);
+  if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
+    assert(MCID.hasOptionalDef() &&
+           "optionally flag setting instruction missing optional def operand");
+    assert(MCID.NumOperands == Inst.getNumOperands() &&
+           "operand count mismatch!");
+    // Find the optional-def operand (cc_out).
+    unsigned OpNo;
+    for (OpNo = 0;
+         !MCID.OpInfo[OpNo].isOptionalDef() && OpNo < MCID.NumOperands;
+         ++OpNo)
+      ;
+    // If we're parsing Thumb1, reject it completely.
+    if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR)
+      return Match_MnemonicFail;
+    // If we're parsing Thumb2, which form is legal depends on whether we're
+    // in an IT block.
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR &&
+        !inITBlock())
+      return Match_RequiresITBlock;
+    if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
+        inITBlock())
+      return Match_RequiresNotITBlock;
+  }
+  // Some high-register supporting Thumb1 encodings only allow both registers
+  // to be from r0-r7 when in Thumb2.
+  else if (Opc == ARM::tADDhirr && isThumbOne() &&
+           isARMLowRegister(Inst.getOperand(1).getReg()) &&
+           isARMLowRegister(Inst.getOperand(2).getReg()))
+    return Match_RequiresThumb2;
+  // Others only require ARMv6 or later.
+  else if (Opc == ARM::tMOVr && isThumbOne() && !hasV6Ops() &&
+           isARMLowRegister(Inst.getOperand(0).getReg()) &&
+           isARMLowRegister(Inst.getOperand(1).getReg()))
+    return Match_RequiresV6;
+  return Match_Success;
+}
+
+static const char *getSubtargetFeatureName(unsigned Val);
+bool ARMAsmParser::
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                        MCStreamer &Out, unsigned &ErrorInfo,
+                        bool MatchingInlineAsm) {
+  MCInst Inst;
+  unsigned MatchResult;
+
+  MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                     MatchingInlineAsm);
+  switch (MatchResult) {
+  default: break;
+  case Match_Success:
+    // Context sensitive operand constraints aren't handled by the matcher,
+    // so check them here.
+    if (validateInstruction(Inst, Operands)) {
+      // Still progress the IT block, otherwise one wrong condition causes
+      // nasty cascading errors.
+      forwardITPosition();
+      return true;
+    }
+
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected. Loop on it while changes happen so the
+    // individual transformations can chain off each other. E.g.,
+    // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
+    while (processInstruction(Inst, Operands))
+      ;
+
+    // Only move forward at the very end so that everything in validate
+    // and process gets a consistent answer about whether we're in an IT
+    // block.
+    forwardITPosition();
+
+    // ITasm is an ARM mode pseudo-instruction that just sets the ITblock and
+    // doesn't actually encode.
+    if (Inst.getOpcode() == ARM::ITasm)
+      return false;
+
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst);
+    return false;
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing (Thumb vs. ARM, e.g.).
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg);
+  }
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction",
+                 ((ARMOperand*)Operands[0])->getLocRange());
+  case Match_RequiresNotITBlock:
+    return Error(IDLoc, "flag setting instruction only valid outside IT block");
+  case Match_RequiresITBlock:
+    return Error(IDLoc, "instruction only valid inside IT block");
+  case Match_RequiresV6:
+    return Error(IDLoc, "instruction variant requires ARMv6 or later");
+  case Match_RequiresThumb2:
+    return Error(IDLoc, "instruction variant requires Thumb2");
+  case Match_ImmRange0_15: {
+    SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
+    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+/// parseDirective parses the arm specific directives
+bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".word")
+    return parseDirectiveWord(4, DirectiveID.getLoc());
+  else if (IDVal == ".thumb")
+    return parseDirectiveThumb(DirectiveID.getLoc());
+  else if (IDVal == ".arm")
+    return parseDirectiveARM(DirectiveID.getLoc());
+  else if (IDVal == ".thumb_func")
+    return parseDirectiveThumbFunc(DirectiveID.getLoc());
+  else if (IDVal == ".code")
+    return parseDirectiveCode(DirectiveID.getLoc());
+  else if (IDVal == ".syntax")
+    return parseDirectiveSyntax(DirectiveID.getLoc());
+  else if (IDVal == ".unreq")
+    return parseDirectiveUnreq(DirectiveID.getLoc());
+  else if (IDVal == ".arch")
+    return parseDirectiveArch(DirectiveID.getLoc());
+  else if (IDVal == ".eabi_attribute")
+    return parseDirectiveEabiAttr(DirectiveID.getLoc());
+  return true;
+}
+
+/// parseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool ARMAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// parseDirectiveThumb
+///  ::= .thumb
+bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+  Parser.Lex();
+
+  if (!isThumb())
+    SwitchMode();
+  getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+  return false;
+}
+
+/// parseDirectiveARM
+///  ::= .arm
+bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+  Parser.Lex();
+
+  if (isThumb())
+    SwitchMode();
+  getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+  return false;
+}
+
+/// parseDirectiveThumbFunc
+///  ::= .thumbfunc symbol_name
+bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
+  const MCAsmInfo &MAI = getParser().getStreamer().getContext().getAsmInfo();
+  bool isMachO = MAI.hasSubsectionsViaSymbols();
+  StringRef Name;
+  bool needFuncName = true;
+
+  // Darwin asm has (optionally) function name after .thumb_func direction
+  // ELF doesn't
+  if (isMachO) {
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.isNot(AsmToken::EndOfStatement)) {
+      if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
+        return Error(L, "unexpected token in .thumb_func directive");
+      Name = Tok.getIdentifier();
+      Parser.Lex(); // Consume the identifier token.
+      needFuncName = false;
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+
+  // Eat the end of statement and any blank lines that follow.
+  while (getLexer().is(AsmToken::EndOfStatement))
+    Parser.Lex();
+
+  // FIXME: assuming function name will be the line following .thumb_func
+  // We really should be checking the next symbol definition even if there's
+  // stuff in between.
+  if (needFuncName) {
+    Name = Parser.getTok().getIdentifier();
+  }
+
+  // Mark symbol as a thumb symbol.
+  MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name);
+  getParser().getStreamer().EmitThumbFunc(Func);
+  return false;
+}
+
+/// parseDirectiveSyntax
+///  ::= .syntax unified | divided
+bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return Error(L, "unexpected token in .syntax directive");
+  StringRef Mode = Tok.getString();
+  if (Mode == "unified" || Mode == "UNIFIED")
+    Parser.Lex();
+  else if (Mode == "divided" || Mode == "DIVIDED")
+    return Error(L, "'.syntax divided' arm asssembly not supported");
+  else
+    return Error(L, "unrecognized syntax mode in .syntax directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(Parser.getTok().getLoc(), "unexpected token in directive");
+  Parser.Lex();
+
+  // TODO tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// parseDirectiveCode
+///  ::= .code 16 | 32
+bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Integer))
+    return Error(L, "unexpected token in .code directive");
+  int64_t Val = Parser.getTok().getIntVal();
+  if (Val == 16)
+    Parser.Lex();
+  else if (Val == 32)
+    Parser.Lex();
+  else
+    return Error(L, "invalid operand to .code directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(Parser.getTok().getLoc(), "unexpected token in directive");
+  Parser.Lex();
+
+  if (Val == 16) {
+    if (!isThumb())
+      SwitchMode();
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+  } else {
+    if (isThumb())
+      SwitchMode();
+    getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+  }
+
+  return false;
+}
+
+/// parseDirectiveReq
+///  ::= name .req registername
+bool ARMAsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+  Parser.Lex(); // Eat the '.req' token.
+  unsigned Reg;
+  SMLoc SRegLoc, ERegLoc;
+  if (ParseRegister(Reg, SRegLoc, ERegLoc)) {
+    Parser.eatToEndOfStatement();
+    return Error(SRegLoc, "register name expected");
+  }
+
+  // Shouldn't be anything else.
+  if (Parser.getTok().isNot(AsmToken::EndOfStatement)) {
+    Parser.eatToEndOfStatement();
+    return Error(Parser.getTok().getLoc(),
+                 "unexpected input in .req directive.");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+
+  if (RegisterReqs.GetOrCreateValue(Name, Reg).getValue() != Reg)
+    return Error(SRegLoc, "redefinition of '" + Name +
+                          "' does not match original.");
+
+  return false;
+}
+
+/// parseDirectiveUneq
+///  ::= .unreq registername
+bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Parser.eatToEndOfStatement();
+    return Error(L, "unexpected input in .unreq directive.");
+  }
+  RegisterReqs.erase(Parser.getTok().getIdentifier());
+  Parser.Lex(); // Eat the identifier.
+  return false;
+}
+
+/// parseDirectiveArch
+///  ::= .arch token
+bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
+  return true;
+}
+
+/// parseDirectiveEabiAttr
+///  ::= .eabi_attribute int, int
+bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
+  return true;
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeARMAsmParser() {
+  RegisterMCAsmParser<ARMAsmParser> X(TheARMTarget);
+  RegisterMCAsmParser<ARMAsmParser> Y(TheThumbTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "ARMGenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+                                                  unsigned Kind) {
+  ARMOperand *Op = static_cast<ARMOperand*>(AsmOp);
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  if (Kind == MCK__35_0 && Op->isImm()) {
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+    if (!CE)
+      return Match_InvalidOperand;
+    if (CE->getValue() == 0)
+      return Match_Success;
+  }
+  return Match_InvalidOperand;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
new file mode 100644
index 000000000000..2e009e55e3b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -0,0 +1,4553 @@
+//===-- ARMDisassembler.cpp - Disassembler for ARM/Thumb ISA --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-disassembler"
+
+#include "llvm/MC/MCDisassembler.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+  // Handles the condition code status of instructions in IT blocks
+  class ITStatus
+  {
+    public:
+      // Returns the condition code for instruction in IT block
+      unsigned getITCC() {
+        unsigned CC = ARMCC::AL;
+        if (instrInITBlock())
+          CC = ITStates.back();
+        return CC;
+      }
+
+      // Advances the IT block state to the next T or E
+      void advanceITState() {
+        ITStates.pop_back();
+      }
+
+      // Returns true if the current instruction is in an IT block
+      bool instrInITBlock() {
+        return !ITStates.empty();
+      }
+
+      // Returns true if current instruction is the last instruction in an IT block
+      bool instrLastInITBlock() {
+        return ITStates.size() == 1;
+      }
+
+      // Called when decoding an IT instruction. Sets the IT state for the following
+      // instructions that for the IT block. Firstcond and Mask correspond to the 
+      // fields in the IT instruction encoding.
+      void setITState(char Firstcond, char Mask) {
+        // (3 - the number of trailing zeros) is the number of then / else.
+        unsigned CondBit0 = Firstcond & 1;
+        unsigned NumTZ = CountTrailingZeros_32(Mask);
+        unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf);
+        assert(NumTZ <= 3 && "Invalid IT mask!");
+        // push condition codes onto the stack the correct order for the pops
+        for (unsigned Pos = NumTZ+1; Pos <= 3; ++Pos) {
+          bool T = ((Mask >> Pos) & 1) == CondBit0;
+          if (T)
+            ITStates.push_back(CCBits);
+          else
+            ITStates.push_back(CCBits ^ 1);
+        }
+        ITStates.push_back(CCBits);
+      }
+
+    private:
+      std::vector<unsigned char> ITStates;
+  };
+}
+
+namespace {
+/// ARMDisassembler - ARM disassembler for all ARM platforms.
+class ARMDisassembler : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  ARMDisassembler(const MCSubtargetInfo &STI) :
+    MCDisassembler(STI) {
+  }
+
+  ~ARMDisassembler() {
+  }
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const;
+};
+
+/// ThumbDisassembler - Thumb disassembler for all Thumb platforms.
+class ThumbDisassembler : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  ThumbDisassembler(const MCSubtargetInfo &STI) :
+    MCDisassembler(STI) {
+  }
+
+  ~ThumbDisassembler() {
+  }
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const;
+
+private:
+  mutable ITStatus ITBlock;
+  DecodeStatus AddThumbPredicate(MCInst&) const;
+  void UpdateThumbVFPPredicate(MCInst&) const;
+};
+}
+
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+  switch (In) {
+    case MCDisassembler::Success:
+      // Out stays the same.
+      return true;
+    case MCDisassembler::SoftFail:
+      Out = In;
+      return true;
+    case MCDisassembler::Fail:
+      Out = In;
+      return false;
+  }
+  llvm_unreachable("Invalid DecodeStatus!");
+}
+
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_VFP2RegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
+                               unsigned RegNo, uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSOImmOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode2IdxInstruction(MCInst &Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode3Instruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst & Inst,
+                                                  unsigned Insn,
+                                                  uint64_t Adddress,
+                                                  const void *Decoder);
+static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeNEONModImmInstruction(MCInst &Inst,unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+
+
+static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBCCTargetOperand(MCInst &Inst,unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LDRDPreInstruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2STRDPreInstruction(MCInst &Inst,unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Adr(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder);
+#include "ARMGenDisassemblerTables.inc"
+
+static MCDisassembler *createARMDisassembler(const Target &T, const MCSubtargetInfo &STI) {
+  return new ARMDisassembler(STI);
+}
+
+static MCDisassembler *createThumbDisassembler(const Target &T, const MCSubtargetInfo &STI) {
+  return new ThumbDisassembler(STI);
+}
+
+DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                             const MemoryObject &Region,
+                                             uint64_t Address,
+                                             raw_ostream &os,
+                                             raw_ostream &cs) const {
+  CommentStream = &cs;
+
+  uint8_t bytes[4];
+
+  assert(!(STI.getFeatureBits() & ARM::ModeThumb) &&
+         "Asked to disassemble an ARM instruction but Subtarget is in Thumb mode!");
+
+  // We want to read exactly 4 bytes of data.
+  if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  // Encoded as a small-endian 32-bit word in the stream.
+  uint32_t insn = (bytes[3] << 24) |
+                  (bytes[2] << 16) |
+                  (bytes[1] <<  8) |
+                  (bytes[0] <<  0);
+
+  // Calling the auto-generated decoder function.
+  DecodeStatus result = decodeInstruction(DecoderTableARM32, MI, insn,
+                                          Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  // VFP and NEON instructions, similarly, are shared between ARM
+  // and Thumb modes.
+  MI.clear();
+  result = decodeInstruction(DecoderTableVFP32, MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableNEONData32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableNEONLoadStore32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableNEONDup32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    // Add a fake predicate operand, because we share these instruction
+    // definitions with Thumb2 where these instructions are predicable.
+    if (!DecodePredicateOperand(MI, 0xE, Address, this))
+      return MCDisassembler::Fail;
+    return result;
+  }
+
+  MI.clear();
+
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+namespace llvm {
+extern const MCInstrDesc ARMInsts[];
+}
+
+/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
+/// immediate Value in the MCInst.  The immediate Value has had any PC
+/// adjustment made by the caller.  If the instruction is a branch instruction
+/// then isBranch is true, else false.  If the getOpInfo() function was set as
+/// part of the setupForSymbolicDisassembly() call then that function is called
+/// to get any symbolic information at the Address for this instruction.  If
+/// that returns non-zero then the symbolic information it returns is used to
+/// create an MCExpr and that is added as an operand to the MCInst.  If
+/// getOpInfo() returns zero and isBranch is true then a symbol look up for
+/// Value is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with Value is created.  This function returns true if it adds an
+/// operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
+                                     bool isBranch, uint64_t InstSize,
+                                     MCInst &MI, const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
+  struct LLVMOpInfo1 SymbolicOp;
+  memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+  SymbolicOp.Value = Value;
+  void *DisInfo = Dis->getDisInfoBlock();
+
+  if (!getOpInfo ||
+      !getOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+    // Clear SymbolicOp.Value from above and also all other fields.
+    memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+    LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+    if (!SymbolLookUp)
+      return false;
+    uint64_t ReferenceType;
+    if (isBranch)
+       ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+    else
+       ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    const char *ReferenceName;
+    uint64_t SymbolValue = 0x00000000ffffffffULL & Value;
+    const char *Name = SymbolLookUp(DisInfo, SymbolValue, &ReferenceType,
+                                    Address, &ReferenceName);
+    if (Name) {
+      SymbolicOp.AddSymbol.Name = Name;
+      SymbolicOp.AddSymbol.Present = true;
+    }
+    // For branches always create an MCExpr so it gets printed as hex address.
+    else if (isBranch) {
+      SymbolicOp.Value = Value;
+    }
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+      (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
+    if (!Name && !isBranch)
+      return false;
+  }
+
+  MCContext *Ctx = Dis->getMCContext();
+  const MCExpr *Add = NULL;
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
+    }
+  }
+
+  const MCExpr *Sub = NULL;
+  if (SymbolicOp.SubtractSymbol.Present) {
+    if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
+    }
+  }
+
+  const MCExpr *Off = NULL;
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+  const MCExpr *Expr;
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+    else
+      LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off != 0)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::Create(0, *Ctx);
+  }
+
+  if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
+    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
+  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
+    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
+  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
+    MI.addOperand(MCOperand::CreateExpr(Expr));
+  else
+    llvm_unreachable("bad SymbolicOp.VariantKind");
+
+  return true;
+}
+
+/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the Pc.
+/// These can often be values in a literal pool near the Address of the
+/// instruction.  The Address of the instruction and its immediate Value are
+/// used as a possible literal pool entry.  The SymbolLookUp call back will
+/// return the name of a symbol referenced by the literal pool's entry if
+/// the referenced address is that of a symbol.  Or it will return a pointer to
+/// a literal 'C' string if the referenced address of the literal pool's entry
+/// is an address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
+                                            const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+  if (SymbolLookUp) {
+    void *DisInfo = Dis->getDisInfoBlock();
+    uint64_t ReferenceType;
+    ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
+    const char *ReferenceName;
+    (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr ||
+       ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+      (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
+  }
+}
+
+// Thumb1 instructions don't have explicit S bits.  Rather, they
+// implicitly set CPSR.  Since it's not represented in the encoding, the
+// auto-generated decoder won't inject the CPSR operand.  We need to fix
+// that as a post-pass.
+static void AddThumb1SBit(MCInst &MI, bool InITBlock) {
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  MCInst::iterator I = MI.begin();
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (I == MI.end()) break;
+    if (OpInfo[i].isOptionalDef() && OpInfo[i].RegClass == ARM::CCRRegClassID) {
+      if (i > 0 && OpInfo[i-1].isPredicate()) continue;
+      MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR));
+      return;
+    }
+  }
+
+  MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR));
+}
+
+// Most Thumb instructions don't have explicit predicates in the
+// encoding, but rather get their predicates from IT context.  We need
+// to fix up the predicate operands using this context information as a
+// post-pass.
+MCDisassembler::DecodeStatus
+ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
+  MCDisassembler::DecodeStatus S = Success;
+
+  // A few instructions actually have predicates encoded in them.  Don't
+  // try to overwrite it if we're seeing one of those.
+  switch (MI.getOpcode()) {
+    case ARM::tBcc:
+    case ARM::t2Bcc:
+    case ARM::tCBZ:
+    case ARM::tCBNZ:
+    case ARM::tCPS:
+    case ARM::t2CPS3p:
+    case ARM::t2CPS2p:
+    case ARM::t2CPS1p:
+    case ARM::tMOVSr:
+    case ARM::tSETEND:
+      // Some instructions (mostly conditional branches) are not
+      // allowed in IT blocks.
+      if (ITBlock.instrInITBlock())
+        S = SoftFail;
+      else
+        return Success;
+      break;
+    case ARM::tB:
+    case ARM::t2B:
+    case ARM::t2TBB:
+    case ARM::t2TBH:
+      // Some instructions (mostly unconditional branches) can
+      // only appears at the end of, or outside of, an IT.
+      if (ITBlock.instrInITBlock() && !ITBlock.instrLastInITBlock())
+        S = SoftFail;
+      break;
+    default:
+      break;
+  }
+
+  // If we're in an IT block, base the predicate on that.  Otherwise,
+  // assume a predicate of AL.
+  unsigned CC;
+  CC = ITBlock.getITCC();
+  if (CC == 0xF) 
+    CC = ARMCC::AL;
+  if (ITBlock.instrInITBlock())
+    ITBlock.advanceITState();
+
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  MCInst::iterator I = MI.begin();
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (I == MI.end()) break;
+    if (OpInfo[i].isPredicate()) {
+      I = MI.insert(I, MCOperand::CreateImm(CC));
+      ++I;
+      if (CC == ARMCC::AL)
+        MI.insert(I, MCOperand::CreateReg(0));
+      else
+        MI.insert(I, MCOperand::CreateReg(ARM::CPSR));
+      return S;
+    }
+  }
+
+  I = MI.insert(I, MCOperand::CreateImm(CC));
+  ++I;
+  if (CC == ARMCC::AL)
+    MI.insert(I, MCOperand::CreateReg(0));
+  else
+    MI.insert(I, MCOperand::CreateReg(ARM::CPSR));
+
+  return S;
+}
+
+// Thumb VFP instructions are a special case.  Because we share their
+// encodings between ARM and Thumb modes, and they are predicable in ARM
+// mode, the auto-generated decoder will give them an (incorrect)
+// predicate operand.  We need to rewrite these operands based on the IT
+// context as a post-pass.
+void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
+  unsigned CC;
+  CC = ITBlock.getITCC();
+  if (ITBlock.instrInITBlock())
+    ITBlock.advanceITState();
+
+  const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+  MCInst::iterator I = MI.begin();
+  unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+  for (unsigned i = 0; i < NumOps; ++i, ++I) {
+    if (OpInfo[i].isPredicate() ) {
+      I->setImm(CC);
+      ++I;
+      if (CC == ARMCC::AL)
+        I->setReg(0);
+      else
+        I->setReg(ARM::CPSR);
+      return;
+    }
+  }
+}
+
+DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                               const MemoryObject &Region,
+                                               uint64_t Address,
+                                               raw_ostream &os,
+                                               raw_ostream &cs) const {
+  CommentStream = &cs;
+
+  uint8_t bytes[4];
+
+  assert((STI.getFeatureBits() & ARM::ModeThumb) &&
+         "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
+
+  // We want to read exactly 2 bytes of data.
+  if (Region.readBytes(Address, 2, (uint8_t*)bytes, NULL) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint16_t insn16 = (bytes[1] << 8) | bytes[0];
+  DecodeStatus result = decodeInstruction(DecoderTableThumb16, MI, insn16,
+                                          Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 2;
+    Check(result, AddThumbPredicate(MI));
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableThumbSBit16, MI, insn16,
+                             Address, this, STI);
+  if (result) {
+    Size = 2;
+    bool InITBlock = ITBlock.instrInITBlock();
+    Check(result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableThumb216, MI, insn16,
+                             Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 2;
+
+    // Nested IT blocks are UNPREDICTABLE.  Must be checked before we add
+    // the Thumb predicate.
+    if (MI.getOpcode() == ARM::t2IT && ITBlock.instrInITBlock())
+      result = MCDisassembler::SoftFail;
+
+    Check(result, AddThumbPredicate(MI));
+
+    // If we find an IT instruction, we need to parse its condition
+    // code and mask operands so that we can apply them correctly
+    // to the subsequent instructions.
+    if (MI.getOpcode() == ARM::t2IT) {
+
+      unsigned Firstcond = MI.getOperand(0).getImm();
+      unsigned Mask = MI.getOperand(1).getImm();
+      ITBlock.setITState(Firstcond, Mask);
+    }
+
+    return result;
+  }
+
+  // We want to read exactly 4 bytes of data.
+  if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint32_t insn32 = (bytes[3] <<  8) |
+                    (bytes[2] <<  0) |
+                    (bytes[1] << 24) |
+                    (bytes[0] << 16);
+  MI.clear();
+  result = decodeInstruction(DecoderTableThumb32, MI, insn32, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    bool InITBlock = ITBlock.instrInITBlock();
+    Check(result, AddThumbPredicate(MI));
+    AddThumb1SBit(MI, InITBlock);
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableThumb232, MI, insn32, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    Check(result, AddThumbPredicate(MI));
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableVFP32, MI, insn32, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    UpdateThumbVFPPredicate(MI);
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTableNEONDup32, MI, insn32, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    Check(result, AddThumbPredicate(MI));
+    return result;
+  }
+
+  if (fieldFromInstruction(insn32, 24, 8) == 0xF9) {
+    MI.clear();
+    uint32_t NEONLdStInsn = insn32;
+    NEONLdStInsn &= 0xF0FFFFFF;
+    NEONLdStInsn |= 0x04000000;
+    result = decodeInstruction(DecoderTableNEONLoadStore32, MI, NEONLdStInsn,
+                               Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(result, AddThumbPredicate(MI));
+      return result;
+    }
+  }
+
+  if (fieldFromInstruction(insn32, 24, 4) == 0xF) {
+    MI.clear();
+    uint32_t NEONDataInsn = insn32;
+    NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONDataInsn |= 0x12000000; // Set bits 28 and 25
+    result = decodeInstruction(DecoderTableNEONData32, MI, NEONDataInsn,
+                               Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(result, AddThumbPredicate(MI));
+      return result;
+    }
+  }
+
+  Size = 0;
+  return MCDisassembler::Fail;
+}
+
+
+extern "C" void LLVMInitializeARMDisassembler() {
+  TargetRegistry::RegisterMCDisassembler(TheARMTarget,
+                                         createARMDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheThumbTarget,
+                                         createThumbDisassembler);
+}
+
+static const uint16_t GPRDecoderTable[] = {
+  ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+  ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+  ARM::R8, ARM::R9, ARM::R10, ARM::R11,
+  ARM::R12, ARM::SP, ARM::LR, ARM::PC
+};
+
+static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  unsigned Register = GPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  
+  if (RegNo == 15) 
+    S = MCDisassembler::SoftFail;
+
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+
+  return S;
+}
+
+static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned Register = 0;
+  switch (RegNo) {
+    case 0:
+      Register = ARM::R0;
+      break;
+    case 1:
+      Register = ARM::R1;
+      break;
+    case 2:
+      Register = ARM::R2;
+      break;
+    case 3:
+      Register = ARM::R3;
+      break;
+    case 9:
+      Register = ARM::R9;
+      break;
+    case 12:
+      Register = ARM::R12;
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo == 13 || RegNo == 15) return MCDisassembler::Fail;
+  return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const uint16_t SPRDecoderTable[] = {
+     ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
+     ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
+     ARM::S8,  ARM::S9, ARM::S10, ARM::S11,
+    ARM::S12, ARM::S13, ARM::S14, ARM::S15,
+    ARM::S16, ARM::S17, ARM::S18, ARM::S19,
+    ARM::S20, ARM::S21, ARM::S22, ARM::S23,
+    ARM::S24, ARM::S25, ARM::S26, ARM::S27,
+    ARM::S28, ARM::S29, ARM::S30, ARM::S31
+};
+
+static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Register = SPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const uint16_t DPRDecoderTable[] = {
+     ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
+     ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
+     ARM::D8,  ARM::D9, ARM::D10, ARM::D11,
+    ARM::D12, ARM::D13, ARM::D14, ARM::D15,
+    ARM::D16, ARM::D17, ARM::D18, ARM::D19,
+    ARM::D20, ARM::D21, ARM::D22, ARM::D23,
+    ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+    ARM::D28, ARM::D29, ARM::D30, ARM::D31
+};
+
+static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDPR_8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeDPR_VFP2RegisterClass(MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+  return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const uint16_t QPRDecoderTable[] = {
+     ARM::Q0,  ARM::Q1,  ARM::Q2,  ARM::Q3,
+     ARM::Q4,  ARM::Q5,  ARM::Q6,  ARM::Q7,
+     ARM::Q8,  ARM::Q9, ARM::Q10, ARM::Q11,
+    ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15
+};
+
+
+static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  RegNo >>= 1;
+
+  unsigned Register = QPRDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const uint16_t DPairDecoderTable[] = {
+  ARM::Q0,  ARM::D1_D2,   ARM::Q1,  ARM::D3_D4,   ARM::Q2,  ARM::D5_D6,
+  ARM::Q3,  ARM::D7_D8,   ARM::Q4,  ARM::D9_D10,  ARM::Q5,  ARM::D11_D12,
+  ARM::Q6,  ARM::D13_D14, ARM::Q7,  ARM::D15_D16, ARM::Q8,  ARM::D17_D18,
+  ARM::Q9,  ARM::D19_D20, ARM::Q10, ARM::D21_D22, ARM::Q11, ARM::D23_D24,
+  ARM::Q12, ARM::D25_D26, ARM::Q13, ARM::D27_D28, ARM::Q14, ARM::D29_D30,
+  ARM::Q15
+};
+
+static DecodeStatus DecodeDPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  if (RegNo > 30)
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPairDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static const uint16_t DPairSpacedDecoderTable[] = {
+  ARM::D0_D2,   ARM::D1_D3,   ARM::D2_D4,   ARM::D3_D5,
+  ARM::D4_D6,   ARM::D5_D7,   ARM::D6_D8,   ARM::D7_D9,
+  ARM::D8_D10,  ARM::D9_D11,  ARM::D10_D12, ARM::D11_D13,
+  ARM::D12_D14, ARM::D13_D15, ARM::D14_D16, ARM::D15_D17,
+  ARM::D16_D18, ARM::D17_D19, ARM::D18_D20, ARM::D19_D21,
+  ARM::D20_D22, ARM::D21_D23, ARM::D22_D24, ARM::D23_D25,
+  ARM::D24_D26, ARM::D25_D27, ARM::D26_D28, ARM::D27_D29,
+  ARM::D28_D30, ARM::D29_D31
+};
+
+static DecodeStatus DecodeDPairSpacedRegisterClass(MCInst &Inst,
+                                                   unsigned RegNo,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  if (RegNo > 29)
+    return MCDisassembler::Fail;
+
+  unsigned Register = DPairSpacedDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePredicateOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  if (Val == 0xF) return MCDisassembler::Fail;
+  // AL predicate is not allowed on Thumb1 branches.
+  if (Inst.getOpcode() == ARM::tBcc && Val == 0xE)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  if (Val == ARMCC::AL) {
+    Inst.addOperand(MCOperand::CreateReg(0));
+  } else
+    Inst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  if (Val)
+    Inst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+  else
+    Inst.addOperand(MCOperand::CreateReg(0));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSOImmOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  uint32_t imm = Val & 0xFF;
+  uint32_t rot = (Val & 0xF00) >> 7;
+  uint32_t rot_imm = (imm >> rot) | (imm << ((32-rot) & 0x1F));
+  Inst.addOperand(MCOperand::CreateImm(rot_imm));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  unsigned type = fieldFromInstruction(Val, 5, 2);
+  unsigned imm = fieldFromInstruction(Val, 7, 5);
+
+  // Register-immediate
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      Shift = ARM_AM::lsl;
+      break;
+    case 1:
+      Shift = ARM_AM::lsr;
+      break;
+    case 2:
+      Shift = ARM_AM::asr;
+      break;
+    case 3:
+      Shift = ARM_AM::ror;
+      break;
+  }
+
+  if (Shift == ARM_AM::ror && imm == 0)
+    Shift = ARM_AM::rrx;
+
+  unsigned Op = Shift | (imm << 3);
+  Inst.addOperand(MCOperand::CreateImm(Op));
+
+  return S;
+}
+
+static DecodeStatus DecodeSORegRegOperand(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  unsigned type = fieldFromInstruction(Val, 5, 2);
+  unsigned Rs = fieldFromInstruction(Val, 8, 4);
+
+  // Register-register
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rs, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      Shift = ARM_AM::lsl;
+      break;
+    case 1:
+      Shift = ARM_AM::lsr;
+      break;
+    case 2:
+      Shift = ARM_AM::asr;
+      break;
+    case 3:
+      Shift = ARM_AM::ror;
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(Shift));
+
+  return S;
+}
+
+static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  bool writebackLoad = false;
+  unsigned writebackReg = 0;
+  switch (Inst.getOpcode()) {
+    default:
+      break;
+    case ARM::LDMIA_UPD:
+    case ARM::LDMDB_UPD:
+    case ARM::LDMIB_UPD:
+    case ARM::LDMDA_UPD:
+    case ARM::t2LDMIA_UPD:
+    case ARM::t2LDMDB_UPD:
+      writebackLoad = true;
+      writebackReg = Inst.getOperand(0).getReg();
+      break;
+  }
+
+  // Empty register lists are not allowed.
+  if (CountPopulation_32(Val) == 0) return MCDisassembler::Fail;
+  for (unsigned i = 0; i < 16; ++i) {
+    if (Val & (1 << i)) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
+        return MCDisassembler::Fail;
+      // Writeback not allowed if Rn is in the target list.
+      if (writebackLoad && writebackReg == Inst.end()[-1].getReg())
+        Check(S, MCDisassembler::SoftFail);
+    }
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Vd = fieldFromInstruction(Val, 8, 5);
+  unsigned regs = fieldFromInstruction(Val, 0, 8);
+
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  for (unsigned i = 0; i < (regs - 1); ++i) {
+    if (!Check(S, DecodeSPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Vd = fieldFromInstruction(Val, 8, 5);
+  unsigned regs = fieldFromInstruction(Val, 0, 8);
+
+  regs = regs >> 1;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  for (unsigned i = 0; i < (regs - 1); ++i) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeBitfieldMaskOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder) {
+  // This operand encodes a mask of contiguous zeros between a specified MSB
+  // and LSB.  To decode it, we create the mask of all bits MSB-and-lower,
+  // the mask of all bits LSB-and-lower, and then xor them to create
+  // the mask of that's all ones on [msb, lsb].  Finally we not it to
+  // create the final mask.
+  unsigned msb = fieldFromInstruction(Val, 5, 5);
+  unsigned lsb = fieldFromInstruction(Val, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+  if (lsb > msb) {
+    Check(S, MCDisassembler::SoftFail);
+    // The check above will cause the warning for the "potentially undefined
+    // instruction encoding" but we can't build a bad MCOperand value here
+    // with a lsb > msb or else printing the MCInst will cause a crash.
+    lsb = msb;
+  }
+
+  uint32_t msb_mask = 0xFFFFFFFF;
+  if (msb != 31) msb_mask = (1U << (msb+1)) - 1;
+  uint32_t lsb_mask = (1U << lsb) - 1;
+
+  Inst.addOperand(MCOperand::CreateImm(~(msb_mask ^ lsb_mask)));
+  return S;
+}
+
+static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned CRd = fieldFromInstruction(Insn, 12, 4);
+  unsigned coproc = fieldFromInstruction(Insn, 8, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+
+  switch (Inst.getOpcode()) {
+    case ARM::LDC_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDC_POST:
+    case ARM::LDC_OPTION:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDCL_PRE:
+    case ARM::LDCL_POST:
+    case ARM::LDCL_OPTION:
+    case ARM::STC_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STC_POST:
+    case ARM::STC_OPTION:
+    case ARM::STCL_OFFSET:
+    case ARM::STCL_PRE:
+    case ARM::STCL_POST:
+    case ARM::STCL_OPTION:
+    case ARM::t2LDC_OFFSET:
+    case ARM::t2LDC_PRE:
+    case ARM::t2LDC_POST:
+    case ARM::t2LDC_OPTION:
+    case ARM::t2LDCL_OFFSET:
+    case ARM::t2LDCL_PRE:
+    case ARM::t2LDCL_POST:
+    case ARM::t2LDCL_OPTION:
+    case ARM::t2STC_OFFSET:
+    case ARM::t2STC_PRE:
+    case ARM::t2STC_POST:
+    case ARM::t2STC_OPTION:
+    case ARM::t2STCL_OFFSET:
+    case ARM::t2STCL_PRE:
+    case ARM::t2STCL_POST:
+    case ARM::t2STCL_OPTION:
+      if (coproc == 0xA || coproc == 0xB)
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(coproc));
+  Inst.addOperand(MCOperand::CreateImm(CRd));
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  switch (Inst.getOpcode()) {
+    case ARM::t2LDC2_OFFSET:
+    case ARM::t2LDC2L_OFFSET:
+    case ARM::t2LDC2_PRE:
+    case ARM::t2LDC2L_PRE:
+    case ARM::t2STC2_OFFSET:
+    case ARM::t2STC2L_OFFSET:
+    case ARM::t2STC2_PRE:
+    case ARM::t2STC2L_PRE:
+    case ARM::LDC2_OFFSET:
+    case ARM::LDC2L_OFFSET:
+    case ARM::LDC2_PRE:
+    case ARM::LDC2L_PRE:
+    case ARM::STC2_OFFSET:
+    case ARM::STC2L_OFFSET:
+    case ARM::STC2_PRE:
+    case ARM::STC2L_PRE:
+    case ARM::t2LDC_OFFSET:
+    case ARM::t2LDCL_OFFSET:
+    case ARM::t2LDC_PRE:
+    case ARM::t2LDCL_PRE:
+    case ARM::t2STC_OFFSET:
+    case ARM::t2STCL_OFFSET:
+    case ARM::t2STC_PRE:
+    case ARM::t2STCL_PRE:
+    case ARM::LDC_OFFSET:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDCL_PRE:
+    case ARM::STC_OFFSET:
+    case ARM::STCL_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STCL_PRE:
+      imm = ARM_AM::getAM5Opc(U ? ARM_AM::add : ARM_AM::sub, imm);
+      Inst.addOperand(MCOperand::CreateImm(imm));
+      break;
+    case ARM::t2LDC2_POST:
+    case ARM::t2LDC2L_POST:
+    case ARM::t2STC2_POST:
+    case ARM::t2STC2L_POST:
+    case ARM::LDC2_POST:
+    case ARM::LDC2L_POST:
+    case ARM::STC2_POST:
+    case ARM::STC2L_POST:
+    case ARM::t2LDC_POST:
+    case ARM::t2LDCL_POST:
+    case ARM::t2STC_POST:
+    case ARM::t2STCL_POST:
+    case ARM::LDC_POST:
+    case ARM::LDCL_POST:
+    case ARM::STC_POST:
+    case ARM::STCL_POST:
+      imm |= U << 8;
+      // fall through.
+    default:
+      // The 'option' variant doesn't encode 'U' in the immediate since
+      // the immediate is unsigned [0,255].
+      Inst.addOperand(MCOperand::CreateImm(imm));
+      break;
+  }
+
+  switch (Inst.getOpcode()) {
+    case ARM::LDC_OFFSET:
+    case ARM::LDC_PRE:
+    case ARM::LDC_POST:
+    case ARM::LDC_OPTION:
+    case ARM::LDCL_OFFSET:
+    case ARM::LDCL_PRE:
+    case ARM::LDCL_POST:
+    case ARM::LDCL_OPTION:
+    case ARM::STC_OFFSET:
+    case ARM::STC_PRE:
+    case ARM::STC_POST:
+    case ARM::STC_OPTION:
+    case ARM::STCL_OFFSET:
+    case ARM::STCL_PRE:
+    case ARM::STCL_POST:
+    case ARM::STCL_OPTION:
+      if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus
+DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn,
+                              uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned reg = fieldFromInstruction(Insn, 25, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+
+  // On stores, the writeback operand precedes Rt.
+  switch (Inst.getOpcode()) {
+    case ARM::STR_POST_IMM:
+    case ARM::STR_POST_REG:
+    case ARM::STRB_POST_IMM:
+    case ARM::STRB_POST_REG:
+    case ARM::STRT_POST_REG:
+    case ARM::STRT_POST_IMM:
+    case ARM::STRBT_POST_REG:
+    case ARM::STRBT_POST_IMM:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // On loads, the writeback operand comes after Rt.
+  switch (Inst.getOpcode()) {
+    case ARM::LDR_POST_IMM:
+    case ARM::LDR_POST_REG:
+    case ARM::LDRB_POST_IMM:
+    case ARM::LDRB_POST_REG:
+    case ARM::LDRBT_POST_REG:
+    case ARM::LDRBT_POST_IMM:
+    case ARM::LDRT_POST_REG:
+    case ARM::LDRT_POST_IMM:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  ARM_AM::AddrOpc Op = ARM_AM::add;
+  if (!fieldFromInstruction(Insn, 23, 1))
+    Op = ARM_AM::sub;
+
+  bool writeback = (P == 0) || (W == 1);
+  unsigned idx_mode = 0;
+  if (P && writeback)
+    idx_mode = ARMII::IndexModePre;
+  else if (!P && writeback)
+    idx_mode = ARMII::IndexModePost;
+
+  if (writeback && (Rn == 15 || Rn == Rt))
+    S = MCDisassembler::SoftFail; // UNPREDICTABLE
+
+  if (reg) {
+    if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+    ARM_AM::ShiftOpc Opc = ARM_AM::lsl;
+    switch( fieldFromInstruction(Insn, 5, 2)) {
+      case 0:
+        Opc = ARM_AM::lsl;
+        break;
+      case 1:
+        Opc = ARM_AM::lsr;
+        break;
+      case 2:
+        Opc = ARM_AM::asr;
+        break;
+      case 3:
+        Opc = ARM_AM::ror;
+        break;
+      default:
+        return MCDisassembler::Fail;
+    }
+    unsigned amt = fieldFromInstruction(Insn, 7, 5);
+    if (Opc == ARM_AM::ror && amt == 0)
+      Opc = ARM_AM::rrx;
+    unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode);
+
+    Inst.addOperand(MCOperand::CreateImm(imm));
+  } else {
+    Inst.addOperand(MCOperand::CreateReg(0));
+    unsigned tmp = ARM_AM::getAM2Opc(Op, imm, ARM_AM::lsl, idx_mode);
+    Inst.addOperand(MCOperand::CreateImm(tmp));
+  }
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+  unsigned Rm = fieldFromInstruction(Val,  0, 4);
+  unsigned type = fieldFromInstruction(Val, 5, 2);
+  unsigned imm = fieldFromInstruction(Val, 7, 5);
+  unsigned U = fieldFromInstruction(Val, 12, 1);
+
+  ARM_AM::ShiftOpc ShOp = ARM_AM::lsl;
+  switch (type) {
+    case 0:
+      ShOp = ARM_AM::lsl;
+      break;
+    case 1:
+      ShOp = ARM_AM::lsr;
+      break;
+    case 2:
+      ShOp = ARM_AM::asr;
+      break;
+    case 3:
+      ShOp = ARM_AM::ror;
+      break;
+  }
+
+  if (ShOp == ARM_AM::ror && imm == 0)
+    ShOp = ARM_AM::rrx;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  unsigned shift;
+  if (U)
+    shift = ARM_AM::getAM2Opc(ARM_AM::add, imm, ShOp);
+  else
+    shift = ARM_AM::getAM2Opc(ARM_AM::sub, imm, ShOp);
+  Inst.addOperand(MCOperand::CreateImm(shift));
+
+  return S;
+}
+
+static DecodeStatus
+DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned type = fieldFromInstruction(Insn, 22, 1);
+  unsigned imm = fieldFromInstruction(Insn, 8, 4);
+  unsigned U = ((~fieldFromInstruction(Insn, 23, 1)) & 1) << 8;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  unsigned Rt2 = Rt + 1;
+
+  bool writeback = (W == 1) | (P == 0);
+
+  // For {LD,ST}RD, Rt must be even, else undefined.
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (Rt & 0x1) S = MCDisassembler::SoftFail;
+      break;
+    default:
+      break;
+  }
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+      if (P == 0 && W == 1)
+        S = MCDisassembler::SoftFail;
+      
+      if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2))
+        S = MCDisassembler::SoftFail;
+      if (type && Rm == 15)
+        S = MCDisassembler::SoftFail;
+      if (Rt2 == 15)
+        S = MCDisassembler::SoftFail;
+      if (!type && fieldFromInstruction(Insn, 8, 4))
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::STRH:
+    case ARM::STRH_PRE:
+    case ARM::STRH_POST:
+      if (Rt == 15)
+        S = MCDisassembler::SoftFail;
+      if (writeback && (Rn == 15 || Rn == Rt))
+        S = MCDisassembler::SoftFail;
+      if (!type && Rm == 15)
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (type && Rn == 15){
+        if (Rt2 == 15)
+          S = MCDisassembler::SoftFail;
+        break;
+      }
+      if (P == 0 && W == 1)
+        S = MCDisassembler::SoftFail;
+      if (!type && (Rt2 == 15 || Rm == 15 || Rm == Rt || Rm == Rt2))
+        S = MCDisassembler::SoftFail;
+      if (!type && writeback && Rn == 15)
+        S = MCDisassembler::SoftFail;
+      if (writeback && (Rn == Rt || Rn == Rt2))
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::LDRH:
+    case ARM::LDRH_PRE:
+    case ARM::LDRH_POST:
+      if (type && Rn == 15){
+        if (Rt == 15)
+          S = MCDisassembler::SoftFail;
+        break;
+      }
+      if (Rt == 15)
+        S = MCDisassembler::SoftFail;
+      if (!type && Rm == 15)
+        S = MCDisassembler::SoftFail;
+      if (!type && writeback && (Rn == 15 || Rn == Rt))
+        S = MCDisassembler::SoftFail;
+      break;
+    case ARM::LDRSH:
+    case ARM::LDRSH_PRE:
+    case ARM::LDRSH_POST:
+    case ARM::LDRSB:
+    case ARM::LDRSB_PRE:
+    case ARM::LDRSB_POST:
+      if (type && Rn == 15){
+        if (Rt == 15)
+          S = MCDisassembler::SoftFail;
+        break;
+      }
+      if (type && (Rt == 15 || (writeback && Rn == Rt)))
+        S = MCDisassembler::SoftFail;
+      if (!type && (Rt == 15 || Rm == 15))
+        S = MCDisassembler::SoftFail;
+      if (!type && writeback && (Rn == 15 || Rn == Rt))
+        S = MCDisassembler::SoftFail;
+      break;
+    default:
+      break;
+  }
+
+  if (writeback) { // Writeback
+    if (P)
+      U |= ARMII::IndexModePre << 9;
+    else
+      U |= ARMII::IndexModePost << 9;
+
+    // On stores, the writeback operand precedes Rt.
+    switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::STRH:
+    case ARM::STRH_PRE:
+    case ARM::STRH_POST:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  switch (Inst.getOpcode()) {
+    case ARM::STRD:
+    case ARM::STRD_PRE:
+    case ARM::STRD_POST:
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  if (writeback) {
+    // On loads, the writeback operand comes after Rt.
+    switch (Inst.getOpcode()) {
+    case ARM::LDRD:
+    case ARM::LDRD_PRE:
+    case ARM::LDRD_POST:
+    case ARM::LDRH:
+    case ARM::LDRH_PRE:
+    case ARM::LDRH_POST:
+    case ARM::LDRSH:
+    case ARM::LDRSH_PRE:
+    case ARM::LDRSH_POST:
+    case ARM::LDRSB:
+    case ARM::LDRSB_PRE:
+    case ARM::LDRSB_POST:
+    case ARM::LDRHTr:
+    case ARM::LDRSBTr:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+    }
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (type) {
+    Inst.addOperand(MCOperand::CreateReg(0));
+    Inst.addOperand(MCOperand::CreateImm(U | (imm << 4) | Rm));
+  } else {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::CreateImm(U));
+  }
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned mode = fieldFromInstruction(Insn, 23, 2);
+
+  switch (mode) {
+    case 0:
+      mode = ARM_AM::da;
+      break;
+    case 1:
+      mode = ARM_AM::ia;
+      break;
+    case 2:
+      mode = ARM_AM::db;
+      break;
+    case 3:
+      mode = ARM_AM::ib;
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(mode));
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned reglist = fieldFromInstruction(Insn, 0, 16);
+
+  if (pred == 0xF) {
+    switch (Inst.getOpcode()) {
+      case ARM::LDMDA:
+        Inst.setOpcode(ARM::RFEDA);
+        break;
+      case ARM::LDMDA_UPD:
+        Inst.setOpcode(ARM::RFEDA_UPD);
+        break;
+      case ARM::LDMDB:
+        Inst.setOpcode(ARM::RFEDB);
+        break;
+      case ARM::LDMDB_UPD:
+        Inst.setOpcode(ARM::RFEDB_UPD);
+        break;
+      case ARM::LDMIA:
+        Inst.setOpcode(ARM::RFEIA);
+        break;
+      case ARM::LDMIA_UPD:
+        Inst.setOpcode(ARM::RFEIA_UPD);
+        break;
+      case ARM::LDMIB:
+        Inst.setOpcode(ARM::RFEIB);
+        break;
+      case ARM::LDMIB_UPD:
+        Inst.setOpcode(ARM::RFEIB_UPD);
+        break;
+      case ARM::STMDA:
+        Inst.setOpcode(ARM::SRSDA);
+        break;
+      case ARM::STMDA_UPD:
+        Inst.setOpcode(ARM::SRSDA_UPD);
+        break;
+      case ARM::STMDB:
+        Inst.setOpcode(ARM::SRSDB);
+        break;
+      case ARM::STMDB_UPD:
+        Inst.setOpcode(ARM::SRSDB_UPD);
+        break;
+      case ARM::STMIA:
+        Inst.setOpcode(ARM::SRSIA);
+        break;
+      case ARM::STMIA_UPD:
+        Inst.setOpcode(ARM::SRSIA_UPD);
+        break;
+      case ARM::STMIB:
+        Inst.setOpcode(ARM::SRSIB);
+        break;
+      case ARM::STMIB_UPD:
+        Inst.setOpcode(ARM::SRSIB_UPD);
+        break;
+      default:
+        if (!Check(S, MCDisassembler::Fail)) return MCDisassembler::Fail;
+    }
+
+    // For stores (which become SRS's, the only operand is the mode.
+    if (fieldFromInstruction(Insn, 20, 1) == 0) {
+      Inst.addOperand(
+          MCOperand::CreateImm(fieldFromInstruction(Insn, 0, 4)));
+      return S;
+    }
+
+    return DecodeRFEInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail; // Tied
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeRegListOperand(Inst, reglist, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 18, 2);
+  unsigned M = fieldFromInstruction(Insn, 17, 1);
+  unsigned iflags = fieldFromInstruction(Insn, 6, 3);
+  unsigned mode = fieldFromInstruction(Insn, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // imod == '01' --> UNPREDICTABLE
+  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+  // return failure here.  The '01' imod value is unprintable, so there's
+  // nothing useful we could do even if we returned UNPREDICTABLE.
+
+  if (imod == 1) return MCDisassembler::Fail;
+
+  if (imod && M) {
+    Inst.setOpcode(ARM::CPS3p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    Inst.addOperand(MCOperand::CreateImm(mode));
+  } else if (imod && !M) {
+    Inst.setOpcode(ARM::CPS2p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    if (mode) S = MCDisassembler::SoftFail;
+  } else if (!imod && M) {
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    if (iflags) S = MCDisassembler::SoftFail;
+  } else {
+    // imod == '00' && M == '0' --> UNPREDICTABLE
+    Inst.setOpcode(ARM::CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    S = MCDisassembler::SoftFail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 9, 2);
+  unsigned M = fieldFromInstruction(Insn, 8, 1);
+  unsigned iflags = fieldFromInstruction(Insn, 5, 3);
+  unsigned mode = fieldFromInstruction(Insn, 0, 5);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // imod == '01' --> UNPREDICTABLE
+  // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+  // return failure here.  The '01' imod value is unprintable, so there's
+  // nothing useful we could do even if we returned UNPREDICTABLE.
+
+  if (imod == 1) return MCDisassembler::Fail;
+
+  if (imod && M) {
+    Inst.setOpcode(ARM::t2CPS3p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    Inst.addOperand(MCOperand::CreateImm(mode));
+  } else if (imod && !M) {
+    Inst.setOpcode(ARM::t2CPS2p);
+    Inst.addOperand(MCOperand::CreateImm(imod));
+    Inst.addOperand(MCOperand::CreateImm(iflags));
+    if (mode) S = MCDisassembler::SoftFail;
+  } else if (!imod && M) {
+    Inst.setOpcode(ARM::t2CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    if (iflags) S = MCDisassembler::SoftFail;
+  } else {
+    // imod == '00' && M == '0' --> UNPREDICTABLE
+    Inst.setOpcode(ARM::t2CPS1p);
+    Inst.addOperand(MCOperand::CreateImm(mode));
+    S = MCDisassembler::SoftFail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeT2MOVTWInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 8, 4);
+  unsigned imm = 0;
+
+  imm |= (fieldFromInstruction(Insn, 0, 8) << 0);
+  imm |= (fieldFromInstruction(Insn, 12, 3) << 8);
+  imm |= (fieldFromInstruction(Insn, 16, 4) << 12);
+  imm |= (fieldFromInstruction(Insn, 26, 1) << 11);
+
+  if (Inst.getOpcode() == ARM::t2MOVTi16)
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeArmMOVTWInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned imm = 0;
+
+  imm |= (fieldFromInstruction(Insn, 0, 12) << 0);
+  imm |= (fieldFromInstruction(Insn, 16, 4) << 12);
+
+  if (Inst.getOpcode() == ARM::MOVTi16)
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSMLAInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 8, 4);
+  unsigned Ra = fieldFromInstruction(Insn, 12, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Ra, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned add = fieldFromInstruction(Val, 12, 1);
+  unsigned imm = fieldFromInstruction(Val, 0, 12);
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (!add) imm *= -1;
+  if (imm == 0 && !add) imm = INT32_MIN;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  if (Rn == 15)
+    tryAddingPcLoadReferenceComment(Address, Address + imm + 8, Decoder);
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrMode5Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  unsigned U = fieldFromInstruction(Val, 8, 1);
+  unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (U)
+    Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::add, imm)));
+  else
+    Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::sub, imm)));
+
+  return S;
+}
+
+static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  return DecodeGPRRegisterClass(Inst, Val, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
+                     uint64_t Address, const void *Decoder) {
+  DecodeStatus Status = MCDisassembler::Success;
+
+  // Note the J1 and J2 values are from the encoded instruction.  So here
+  // change them to I1 and I2 values via as documented:
+  // I1 = NOT(J1 EOR S);
+  // I2 = NOT(J2 EOR S);
+  // and build the imm32 with one trailing zero as documented:
+  // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32);
+  unsigned S = fieldFromInstruction(Insn, 26, 1);
+  unsigned J1 = fieldFromInstruction(Insn, 13, 1);
+  unsigned J2 = fieldFromInstruction(Insn, 11, 1);
+  unsigned I1 = !(J1 ^ S);
+  unsigned I2 = !(J2 ^ S);
+  unsigned imm10 = fieldFromInstruction(Insn, 16, 10);
+  unsigned imm11 = fieldFromInstruction(Insn, 0, 11);
+  unsigned tmp = (S << 23) | (I1 << 22) | (I2 << 21) | (imm10 << 11) | imm11;
+  int imm32 = SignExtend32<24>(tmp << 1);
+  if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm32));
+
+  return Status;
+}
+
+static DecodeStatus
+DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 24) << 2;
+
+  if (pred == 0xF) {
+    Inst.setOpcode(ARM::BLXi);
+    imm |= fieldFromInstruction(Insn, 24, 1) << 1;
+    if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8,
+                                  true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm)));
+    return S;
+  }
+
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm)));
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  unsigned align = fieldFromInstruction(Val, 4, 2);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!align)
+    Inst.addOperand(MCOperand::CreateImm(0));
+  else
+    Inst.addOperand(MCOperand::CreateImm(4 << align));
+
+  return S;
+}
+
+static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned wb = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 4, 2) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  // First output register
+  switch (Inst.getOpcode()) {
+  case ARM::VLD1q16: case ARM::VLD1q32: case ARM::VLD1q64: case ARM::VLD1q8:
+  case ARM::VLD1q16wb_fixed: case ARM::VLD1q16wb_register:
+  case ARM::VLD1q32wb_fixed: case ARM::VLD1q32wb_register:
+  case ARM::VLD1q64wb_fixed: case ARM::VLD1q64wb_register:
+  case ARM::VLD1q8wb_fixed: case ARM::VLD1q8wb_register:
+  case ARM::VLD2d16: case ARM::VLD2d32: case ARM::VLD2d8:
+  case ARM::VLD2d16wb_fixed: case ARM::VLD2d16wb_register:
+  case ARM::VLD2d32wb_fixed: case ARM::VLD2d32wb_register:
+  case ARM::VLD2d8wb_fixed: case ARM::VLD2d8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2b16:
+  case ARM::VLD2b32:
+  case ARM::VLD2b8:
+  case ARM::VLD2b16wb_fixed:
+  case ARM::VLD2b16wb_register:
+  case ARM::VLD2b32wb_fixed:
+  case ARM::VLD2b32wb_register:
+  case ARM::VLD2b8wb_fixed:
+  case ARM::VLD2b8wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Second output register
+  switch (Inst.getOpcode()) {
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD3q8:
+    case ARM::VLD3q16:
+    case ARM::VLD3q32:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+    default:
+      break;
+  }
+
+  // Third output register
+  switch(Inst.getOpcode()) {
+    case ARM::VLD3d8:
+    case ARM::VLD3d16:
+    case ARM::VLD3d32:
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD3q8:
+    case ARM::VLD3q16:
+    case ARM::VLD3q32:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Fourth output register
+  switch (Inst.getOpcode()) {
+    case ARM::VLD4d8:
+    case ARM::VLD4d16:
+    case ARM::VLD4d32:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VLD4q8:
+    case ARM::VLD4q16:
+    case ARM::VLD4q32:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Writeback operand
+  switch (Inst.getOpcode()) {
+    case ARM::VLD1d8wb_fixed:
+    case ARM::VLD1d16wb_fixed:
+    case ARM::VLD1d32wb_fixed:
+    case ARM::VLD1d64wb_fixed:
+    case ARM::VLD1d8wb_register:
+    case ARM::VLD1d16wb_register:
+    case ARM::VLD1d32wb_register:
+    case ARM::VLD1d64wb_register:
+    case ARM::VLD1q8wb_fixed:
+    case ARM::VLD1q16wb_fixed:
+    case ARM::VLD1q32wb_fixed:
+    case ARM::VLD1q64wb_fixed:
+    case ARM::VLD1q8wb_register:
+    case ARM::VLD1q16wb_register:
+    case ARM::VLD1q32wb_register:
+    case ARM::VLD1q64wb_register:
+    case ARM::VLD1d8Twb_fixed:
+    case ARM::VLD1d8Twb_register:
+    case ARM::VLD1d16Twb_fixed:
+    case ARM::VLD1d16Twb_register:
+    case ARM::VLD1d32Twb_fixed:
+    case ARM::VLD1d32Twb_register:
+    case ARM::VLD1d64Twb_fixed:
+    case ARM::VLD1d64Twb_register:
+    case ARM::VLD1d8Qwb_fixed:
+    case ARM::VLD1d8Qwb_register:
+    case ARM::VLD1d16Qwb_fixed:
+    case ARM::VLD1d16Qwb_register:
+    case ARM::VLD1d32Qwb_fixed:
+    case ARM::VLD1d32Qwb_register:
+    case ARM::VLD1d64Qwb_fixed:
+    case ARM::VLD1d64Qwb_register:
+    case ARM::VLD2d8wb_fixed:
+    case ARM::VLD2d16wb_fixed:
+    case ARM::VLD2d32wb_fixed:
+    case ARM::VLD2q8wb_fixed:
+    case ARM::VLD2q16wb_fixed:
+    case ARM::VLD2q32wb_fixed:
+    case ARM::VLD2d8wb_register:
+    case ARM::VLD2d16wb_register:
+    case ARM::VLD2d32wb_register:
+    case ARM::VLD2q8wb_register:
+    case ARM::VLD2q16wb_register:
+    case ARM::VLD2q32wb_register:
+    case ARM::VLD2b8wb_fixed:
+    case ARM::VLD2b16wb_fixed:
+    case ARM::VLD2b32wb_fixed:
+    case ARM::VLD2b8wb_register:
+    case ARM::VLD2b16wb_register:
+    case ARM::VLD2b32wb_register:
+      Inst.addOperand(MCOperand::CreateImm(0));
+      break;
+    case ARM::VLD3d8_UPD:
+    case ARM::VLD3d16_UPD:
+    case ARM::VLD3d32_UPD:
+    case ARM::VLD3q8_UPD:
+    case ARM::VLD3q16_UPD:
+    case ARM::VLD3q32_UPD:
+    case ARM::VLD4d8_UPD:
+    case ARM::VLD4d16_UPD:
+    case ARM::VLD4d32_UPD:
+    case ARM::VLD4q8_UPD:
+    case ARM::VLD4q16_UPD:
+    case ARM::VLD4q32_UPD:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // AddrMode6 Base (register+alignment)
+  if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // AddrMode6 Offset (register)
+  switch (Inst.getOpcode()) {
+  default:
+    // The below have been updated to have explicit am6offset split
+    // between fixed and register offset. For those instructions not
+    // yet updated, we need to add an additional reg0 operand for the
+    // fixed variant.
+    //
+    // The fixed offset encodes as Rm == 0xd, so we check for that.
+    if (Rm == 0xd) {
+      Inst.addOperand(MCOperand::CreateReg(0));
+      break;
+    }
+    // Fall through to handle the register offset variant.
+  case ARM::VLD1d8wb_fixed:
+  case ARM::VLD1d16wb_fixed:
+  case ARM::VLD1d32wb_fixed:
+  case ARM::VLD1d64wb_fixed:
+  case ARM::VLD1d8Twb_fixed:
+  case ARM::VLD1d16Twb_fixed:
+  case ARM::VLD1d32Twb_fixed:
+  case ARM::VLD1d64Twb_fixed:
+  case ARM::VLD1d8Qwb_fixed:
+  case ARM::VLD1d16Qwb_fixed:
+  case ARM::VLD1d32Qwb_fixed:
+  case ARM::VLD1d64Qwb_fixed:
+  case ARM::VLD1d8wb_register:
+  case ARM::VLD1d16wb_register:
+  case ARM::VLD1d32wb_register:
+  case ARM::VLD1d64wb_register:
+  case ARM::VLD1q8wb_fixed:
+  case ARM::VLD1q16wb_fixed:
+  case ARM::VLD1q32wb_fixed:
+  case ARM::VLD1q64wb_fixed:
+  case ARM::VLD1q8wb_register:
+  case ARM::VLD1q16wb_register:
+  case ARM::VLD1q32wb_register:
+  case ARM::VLD1q64wb_register:
+    // The fixed offset post-increment encodes Rm == 0xd. The no-writeback
+    // variant encodes Rm == 0xf. Anything else is a register offset post-
+    // increment and we need to add the register operand to the instruction.
+    if (Rm != 0xD && Rm != 0xF &&
+        !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2d8wb_fixed:
+  case ARM::VLD2d16wb_fixed:
+  case ARM::VLD2d32wb_fixed:
+  case ARM::VLD2b8wb_fixed:
+  case ARM::VLD2b16wb_fixed:
+  case ARM::VLD2b32wb_fixed:
+  case ARM::VLD2q8wb_fixed:
+  case ARM::VLD2q16wb_fixed:
+  case ARM::VLD2q32wb_fixed:
+    break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned wb = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 4, 2) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  // Writeback Operand
+  switch (Inst.getOpcode()) {
+    case ARM::VST1d8wb_fixed:
+    case ARM::VST1d16wb_fixed:
+    case ARM::VST1d32wb_fixed:
+    case ARM::VST1d64wb_fixed:
+    case ARM::VST1d8wb_register:
+    case ARM::VST1d16wb_register:
+    case ARM::VST1d32wb_register:
+    case ARM::VST1d64wb_register:
+    case ARM::VST1q8wb_fixed:
+    case ARM::VST1q16wb_fixed:
+    case ARM::VST1q32wb_fixed:
+    case ARM::VST1q64wb_fixed:
+    case ARM::VST1q8wb_register:
+    case ARM::VST1q16wb_register:
+    case ARM::VST1q32wb_register:
+    case ARM::VST1q64wb_register:
+    case ARM::VST1d8Twb_fixed:
+    case ARM::VST1d16Twb_fixed:
+    case ARM::VST1d32Twb_fixed:
+    case ARM::VST1d64Twb_fixed:
+    case ARM::VST1d8Twb_register:
+    case ARM::VST1d16Twb_register:
+    case ARM::VST1d32Twb_register:
+    case ARM::VST1d64Twb_register:
+    case ARM::VST1d8Qwb_fixed:
+    case ARM::VST1d16Qwb_fixed:
+    case ARM::VST1d32Qwb_fixed:
+    case ARM::VST1d64Qwb_fixed:
+    case ARM::VST1d8Qwb_register:
+    case ARM::VST1d16Qwb_register:
+    case ARM::VST1d32Qwb_register:
+    case ARM::VST1d64Qwb_register:
+    case ARM::VST2d8wb_fixed:
+    case ARM::VST2d16wb_fixed:
+    case ARM::VST2d32wb_fixed:
+    case ARM::VST2d8wb_register:
+    case ARM::VST2d16wb_register:
+    case ARM::VST2d32wb_register:
+    case ARM::VST2q8wb_fixed:
+    case ARM::VST2q16wb_fixed:
+    case ARM::VST2q32wb_fixed:
+    case ARM::VST2q8wb_register:
+    case ARM::VST2q16wb_register:
+    case ARM::VST2q32wb_register:
+    case ARM::VST2b8wb_fixed:
+    case ARM::VST2b16wb_fixed:
+    case ARM::VST2b32wb_fixed:
+    case ARM::VST2b8wb_register:
+    case ARM::VST2b16wb_register:
+    case ARM::VST2b32wb_register:
+      if (Rm == 0xF)
+        return MCDisassembler::Fail;
+      Inst.addOperand(MCOperand::CreateImm(0));
+      break;
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // AddrMode6 Base (register+alignment)
+  if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  // AddrMode6 Offset (register)
+  switch (Inst.getOpcode()) {
+    default:
+      if (Rm == 0xD)
+        Inst.addOperand(MCOperand::CreateReg(0));
+      else if (Rm != 0xF) {
+        if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+          return MCDisassembler::Fail;
+      }
+      break;
+    case ARM::VST1d8wb_fixed:
+    case ARM::VST1d16wb_fixed:
+    case ARM::VST1d32wb_fixed:
+    case ARM::VST1d64wb_fixed:
+    case ARM::VST1q8wb_fixed:
+    case ARM::VST1q16wb_fixed:
+    case ARM::VST1q32wb_fixed:
+    case ARM::VST1q64wb_fixed:
+    case ARM::VST1d8Twb_fixed:
+    case ARM::VST1d16Twb_fixed:
+    case ARM::VST1d32Twb_fixed:
+    case ARM::VST1d64Twb_fixed:
+    case ARM::VST1d8Qwb_fixed:
+    case ARM::VST1d16Qwb_fixed:
+    case ARM::VST1d32Qwb_fixed:
+    case ARM::VST1d64Qwb_fixed:
+    case ARM::VST2d8wb_fixed:
+    case ARM::VST2d16wb_fixed:
+    case ARM::VST2d32wb_fixed:
+    case ARM::VST2q8wb_fixed:
+    case ARM::VST2q16wb_fixed:
+    case ARM::VST2q32wb_fixed:
+    case ARM::VST2b8wb_fixed:
+    case ARM::VST2b16wb_fixed:
+    case ARM::VST2b32wb_fixed:
+      break;
+  }
+
+
+  // First input register
+  switch (Inst.getOpcode()) {
+  case ARM::VST1q16:
+  case ARM::VST1q32:
+  case ARM::VST1q64:
+  case ARM::VST1q8:
+  case ARM::VST1q16wb_fixed:
+  case ARM::VST1q16wb_register:
+  case ARM::VST1q32wb_fixed:
+  case ARM::VST1q32wb_register:
+  case ARM::VST1q64wb_fixed:
+  case ARM::VST1q64wb_register:
+  case ARM::VST1q8wb_fixed:
+  case ARM::VST1q8wb_register:
+  case ARM::VST2d16:
+  case ARM::VST2d32:
+  case ARM::VST2d8:
+  case ARM::VST2d16wb_fixed:
+  case ARM::VST2d16wb_register:
+  case ARM::VST2d32wb_fixed:
+  case ARM::VST2d32wb_register:
+  case ARM::VST2d8wb_fixed:
+  case ARM::VST2d8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VST2b16:
+  case ARM::VST2b32:
+  case ARM::VST2b8:
+  case ARM::VST2b16wb_fixed:
+  case ARM::VST2b16wb_register:
+  case ARM::VST2b32wb_fixed:
+  case ARM::VST2b32wb_register:
+  case ARM::VST2b8wb_fixed:
+  case ARM::VST2b8wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  // Second input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST3d8:
+    case ARM::VST3d16:
+    case ARM::VST3d32:
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST3q8:
+    case ARM::VST3q16:
+    case ARM::VST3q32:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Third input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST3d8:
+    case ARM::VST3d16:
+    case ARM::VST3d32:
+    case ARM::VST3d8_UPD:
+    case ARM::VST3d16_UPD:
+    case ARM::VST3d32_UPD:
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST3q8:
+    case ARM::VST3q16:
+    case ARM::VST3q32:
+    case ARM::VST3q8_UPD:
+    case ARM::VST3q16_UPD:
+    case ARM::VST3q32_UPD:
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  // Fourth input register
+  switch (Inst.getOpcode()) {
+    case ARM::VST4d8:
+    case ARM::VST4d16:
+    case ARM::VST4d32:
+    case ARM::VST4d8_UPD:
+    case ARM::VST4d16_UPD:
+    case ARM::VST4d32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VST4q8:
+    case ARM::VST4q16:
+    case ARM::VST4q32:
+    case ARM::VST4q8_UPD:
+    case ARM::VST4q16_UPD:
+    case ARM::VST4q32_UPD:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 1);
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+
+  if (size == 0 && align == 1)
+    return MCDisassembler::Fail;
+  align *= (1 << size);
+
+  switch (Inst.getOpcode()) {
+  case ARM::VLD1DUPq16: case ARM::VLD1DUPq32: case ARM::VLD1DUPq8:
+  case ARM::VLD1DUPq16wb_fixed: case ARM::VLD1DUPq16wb_register:
+  case ARM::VLD1DUPq32wb_fixed: case ARM::VLD1DUPq32wb_register:
+  case ARM::VLD1DUPq8wb_fixed: case ARM::VLD1DUPq8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  }
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+
+  // The fixed offset post-increment encodes Rm == 0xd. The no-writeback
+  // variant encodes Rm == 0xf. Anything else is a register offset post-
+  // increment and we need to add the register operand to the instruction.
+  if (Rm != 0xD && Rm != 0xF &&
+      !Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD2DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 1);
+  unsigned size = 1 << fieldFromInstruction(Insn, 6, 2);
+  align *= 2*size;
+
+  switch (Inst.getOpcode()) {
+  case ARM::VLD2DUPd16: case ARM::VLD2DUPd32: case ARM::VLD2DUPd8:
+  case ARM::VLD2DUPd16wb_fixed: case ARM::VLD2DUPd16wb_register:
+  case ARM::VLD2DUPd32wb_fixed: case ARM::VLD2DUPd32wb_register:
+  case ARM::VLD2DUPd8wb_fixed: case ARM::VLD2DUPd8wb_register:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  case ARM::VLD2DUPd16x2: case ARM::VLD2DUPd32x2: case ARM::VLD2DUPd8x2:
+  case ARM::VLD2DUPd16x2wb_fixed: case ARM::VLD2DUPd16x2wb_register:
+  case ARM::VLD2DUPd32x2wb_fixed: case ARM::VLD2DUPd32x2wb_register:
+  case ARM::VLD2DUPd8x2wb_fixed: case ARM::VLD2DUPd8x2wb_register:
+    if (!Check(S, DecodeDPairSpacedRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  }
+
+  if (Rm != 0xF)
+    Inst.addOperand(MCOperand::CreateImm(0));
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+
+  if (Rm != 0xD && Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD3DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned inc = fieldFromInstruction(Insn, 5, 1) + 1;
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(0));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  unsigned inc = fieldFromInstruction(Insn, 5, 1) + 1;
+  unsigned align = fieldFromInstruction(Insn, 4, 1);
+
+  if (size == 0x3) {
+    if (align == 0)
+      return MCDisassembler::Fail;
+    size = 4;
+    align = 16;
+  } else {
+    if (size == 2) {
+      size = 1 << size;
+      align *= 8;
+    } else {
+      size = 1 << size;
+      align *= 4*size;
+    }
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3*inc)%32, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+
+  if (Rm == 0xD)
+    Inst.addOperand(MCOperand::CreateReg(0));
+  else if (Rm != 0xF) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus
+DecodeNEONModImmInstruction(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned imm = fieldFromInstruction(Insn, 0, 4);
+  imm |= fieldFromInstruction(Insn, 16, 3) << 4;
+  imm |= fieldFromInstruction(Insn, 24, 1) << 7;
+  imm |= fieldFromInstruction(Insn, 8, 4) << 8;
+  imm |= fieldFromInstruction(Insn, 5, 1) << 12;
+  unsigned Q = fieldFromInstruction(Insn, 6, 1);
+
+  if (Q) {
+    if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  } else {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  switch (Inst.getOpcode()) {
+    case ARM::VORRiv4i16:
+    case ARM::VORRiv2i32:
+    case ARM::VBICiv4i16:
+    case ARM::VBICiv2i32:
+      if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    case ARM::VORRiv8i16:
+    case ARM::VORRiv4i32:
+    case ARM::VBICiv8i16:
+    case ARM::VBICiv4i32:
+      if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+        return MCDisassembler::Fail;
+      break;
+    default:
+      break;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeVSHLMaxInstruction(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  Rm |= fieldFromInstruction(Insn, 5, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 18, 2);
+
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(8 << size));
+
+  return S;
+}
+
+static DecodeStatus DecodeShiftRight8Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(8 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight16Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(16 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight32Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(32 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRight64Imm(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(64 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  Rn |= fieldFromInstruction(Insn, 7, 1) << 4;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  Rm |= fieldFromInstruction(Insn, 5, 1) << 4;
+  unsigned op = fieldFromInstruction(Insn, 6, 1);
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (op) {
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail; // Writeback
+  }
+
+  switch (Inst.getOpcode()) {
+  case ARM::VTBL2:
+  case ARM::VTBX2:
+    if (!Check(S, DecodeDPairRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
+  default:
+    if (!Check(S, DecodeDPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
+                                     uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned dst = fieldFromInstruction(Insn, 8, 3);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, dst, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  switch(Inst.getOpcode()) {
+    default:
+      return MCDisassembler::Fail;
+    case ARM::tADR:
+      break; // tADR does not explicitly represent the PC as an operand.
+    case ARM::tADDrSPi:
+      Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+      break;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  return S;
+}
+
+static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<12>(Val<<1) + 4,
+                                true, 2, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<12>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2BROperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<21>(Val) + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbCmpBROperand(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  if (!tryAddingSymbolicOperand(Address, Address + (Val<<1) + 4,
+                                true, 2, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(Val << 1));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddrModeRR(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 0, 3);
+  unsigned Rm = fieldFromInstruction(Val, 3, 3);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddrModeIS(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 0, 3);
+  unsigned imm = fieldFromInstruction(Val, 3, 5);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbAddrModePC(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  unsigned imm = Val << 2;
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  tryAddingPcLoadReferenceComment(Address, (Address & ~2u) + imm + 4, Decoder);
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddrModeSP(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  Inst.addOperand(MCOperand::CreateImm(Val));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 6, 4);
+  unsigned Rm = fieldFromInstruction(Val, 2, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 2);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
+                              uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  switch (Inst.getOpcode()) {
+    case ARM::t2PLDs:
+    case ARM::t2PLDWs:
+    case ARM::t2PLIs:
+      break;
+    default: {
+      unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+      if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+    }
+  }
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  if (Rn == 0xF) {
+    switch (Inst.getOpcode()) {
+      case ARM::t2LDRBs:
+        Inst.setOpcode(ARM::t2LDRBpci);
+        break;
+      case ARM::t2LDRHs:
+        Inst.setOpcode(ARM::t2LDRHpci);
+        break;
+      case ARM::t2LDRSHs:
+        Inst.setOpcode(ARM::t2LDRSHpci);
+        break;
+      case ARM::t2LDRSBs:
+        Inst.setOpcode(ARM::t2LDRSBpci);
+        break;
+      case ARM::t2PLDs:
+        Inst.setOpcode(ARM::t2PLDi12);
+        Inst.addOperand(MCOperand::CreateReg(ARM::PC));
+        break;
+      default:
+        return MCDisassembler::Fail;
+    }
+
+    int imm = fieldFromInstruction(Insn, 0, 12);
+    if (!fieldFromInstruction(Insn, 23, 1)) imm *= -1;
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+    return S;
+  }
+
+  unsigned addrmode = fieldFromInstruction(Insn, 4, 2);
+  addrmode |= fieldFromInstruction(Insn, 0, 4) << 2;
+  addrmode |= fieldFromInstruction(Insn, 16, 4) << 6;
+  if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
+                           uint64_t Address, const void *Decoder) {
+  if (Val == 0)
+    Inst.addOperand(MCOperand::CreateImm(INT32_MIN));
+  else {
+    int imm = Val & 0xFF;
+
+    if (!(Val & 0x100)) imm *= -1;
+    Inst.addOperand(MCOperand::CreateImm(imm * 4));
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 9);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm8S4(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 8, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
+                         uint64_t Address, const void *Decoder) {
+  int imm = Val & 0xFF;
+  if (Val == 0)
+    imm = INT32_MIN;
+  else if (!(Val & 0x100))
+    imm *= -1;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 9, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 9);
+
+  // Some instructions always use an additive offset.
+  switch (Inst.getOpcode()) {
+    case ARM::t2LDRT:
+    case ARM::t2LDRBT:
+    case ARM::t2LDRHT:
+    case ARM::t2LDRSBT:
+    case ARM::t2LDRSHT:
+    case ARM::t2STRT:
+    case ARM::t2STRBT:
+    case ARM::t2STRHT:
+      imm |= 0x100;
+      break;
+    default:
+      break;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  addr |= fieldFromInstruction(Insn, 9, 1) << 8;
+  addr |= Rn << 9;
+  unsigned load = fieldFromInstruction(Insn, 20, 1);
+
+  if (!load) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (load) {
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 13, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 12);
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned imm = fieldFromInstruction(Insn, 0, 7);
+
+  Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+  Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (Inst.getOpcode() == ARM::tADDrSP) {
+    unsigned Rdm = fieldFromInstruction(Insn, 0, 3);
+    Rdm |= fieldFromInstruction(Insn, 7, 1) << 3;
+
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  } else if (Inst.getOpcode() == ARM::tADDspr) {
+    unsigned Rm = fieldFromInstruction(Insn, 3, 4);
+
+    Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+    Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
+                           uint64_t Address, const void *Decoder) {
+  unsigned imod = fieldFromInstruction(Insn, 4, 1) | 0x2;
+  unsigned flags = fieldFromInstruction(Insn, 0, 3);
+
+  Inst.addOperand(MCOperand::CreateImm(imod));
+  Inst.addOperand(MCOperand::CreateImm(flags));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned add = fieldFromInstruction(Insn, 4, 1);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(add));
+
+  return S;
+}
+
+static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val,
+                                 uint64_t Address, const void *Decoder) {
+  // Val is passed in as S:J1:J2:imm10H:imm10L:'0'
+  // Note only one trailing zero not two.  Also the J1 and J2 values are from
+  // the encoded instruction.  So here change to I1 and I2 values via:
+  // I1 = NOT(J1 EOR S);
+  // I2 = NOT(J2 EOR S);
+  // and build the imm32 with two trailing zeros as documented:
+  // imm32 = SignExtend(S:I1:I2:imm10H:imm10L:'00', 32);
+  unsigned S = (Val >> 23) & 1;
+  unsigned J1 = (Val >> 22) & 1;
+  unsigned J2 = (Val >> 21) & 1;
+  unsigned I1 = !(J1 ^ S);
+  unsigned I2 = !(J2 ^ S);
+  unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+  int imm32 = SignExtend32<25>(tmp << 1);
+
+  if (!tryAddingSymbolicOperand(Address,
+                                (Address & ~2u) + imm32 + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm32));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
+                              uint64_t Address, const void *Decoder) {
+  if (Val == 0xA || Val == 0xB)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeThumbTableBranch(MCInst &Inst, unsigned Insn,
+                       uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  if (Rn == ARM::SP) S = MCDisassembler::SoftFail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus
+DecodeThumb2BCCInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned pred = fieldFromInstruction(Insn, 22, 4);
+  if (pred == 0xE || pred == 0xF) {
+    unsigned opc = fieldFromInstruction(Insn, 4, 28);
+    switch (opc) {
+      default:
+        return MCDisassembler::Fail;
+      case 0xf3bf8f4:
+        Inst.setOpcode(ARM::t2DSB);
+        break;
+      case 0xf3bf8f5:
+        Inst.setOpcode(ARM::t2DMB);
+        break;
+      case 0xf3bf8f6:
+        Inst.setOpcode(ARM::t2ISB);
+        break;
+    }
+
+    unsigned imm = fieldFromInstruction(Insn, 0, 4);
+    return DecodeMemBarrierOption(Inst, imm, Address, Decoder);
+  }
+
+  unsigned brtarget = fieldFromInstruction(Insn, 0, 11) << 1;
+  brtarget |= fieldFromInstruction(Insn, 11, 1) << 19;
+  brtarget |= fieldFromInstruction(Insn, 13, 1) << 18;
+  brtarget |= fieldFromInstruction(Insn, 16, 6) << 12;
+  brtarget |= fieldFromInstruction(Insn, 26, 1) << 20;
+
+  if (!Check(S, DecodeT2BROperand(Inst, brtarget, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+// Decode a shifted immediate operand.  These basically consist
+// of an 8-bit value, and a 4-bit directive that specifies either
+// a splat operation or a rotation.
+static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
+                          uint64_t Address, const void *Decoder) {
+  unsigned ctrl = fieldFromInstruction(Val, 10, 2);
+  if (ctrl == 0) {
+    unsigned byte = fieldFromInstruction(Val, 8, 2);
+    unsigned imm = fieldFromInstruction(Val, 0, 8);
+    switch (byte) {
+      case 0:
+        Inst.addOperand(MCOperand::CreateImm(imm));
+        break;
+      case 1:
+        Inst.addOperand(MCOperand::CreateImm((imm << 16) | imm));
+        break;
+      case 2:
+        Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 8)));
+        break;
+      case 3:
+        Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 16) |
+                                             (imm << 8)  |  imm));
+        break;
+    }
+  } else {
+    unsigned unrot = fieldFromInstruction(Val, 0, 7) | 0x80;
+    unsigned rot = fieldFromInstruction(Val, 7, 5);
+    unsigned imm = (unrot >> rot) | (unrot << ((32-rot)&31));
+    Inst.addOperand(MCOperand::CreateImm(imm));
+  }
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
+                            uint64_t Address, const void *Decoder){
+  if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4,
+                                true, 2, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(SignExtend32<9>(Val << 1)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
+                                       uint64_t Address, const void *Decoder){
+  // Val is passed in as S:J1:J2:imm10:imm11
+  // Note no trailing zero after imm11.  Also the J1 and J2 values are from
+  // the encoded instruction.  So here change to I1 and I2 values via:
+  // I1 = NOT(J1 EOR S);
+  // I2 = NOT(J2 EOR S);
+  // and build the imm32 with one trailing zero as documented:
+  // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32);
+  unsigned S = (Val >> 23) & 1;
+  unsigned J1 = (Val >> 22) & 1;
+  unsigned J2 = (Val >> 21) & 1;
+  unsigned I1 = !(J1 ^ S);
+  unsigned I2 = !(J2 ^ S);
+  unsigned tmp = (Val & ~0x600000) | (I1 << 22) | (I2 << 21);
+  int imm32 = SignExtend32<25>(tmp << 1);
+
+  if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
+                                true, 4, Inst, Decoder))
+    Inst.addOperand(MCOperand::CreateImm(imm32));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
+                                   uint64_t Address, const void *Decoder) {
+  if (Val & ~0xf)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
+                          uint64_t Address, const void *Decoder) {
+  if (!Val) return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
+                                        uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
+                                         uint64_t Address, const void *Decoder){
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
+  if (Rd == Rn || Rd == Rt || Rd == Rt+1) return MCDisassembler::Fail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeLDRPreImm(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+  if (Rm == 0xF) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+
+static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeSTRPreReg(MCInst &Inst, unsigned Insn,
+                            uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= fieldFromInstruction(Insn, 16, 4) << 13;
+  imm |= fieldFromInstruction(Insn, 23, 1) << 12;
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 6, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0 :
+          align = 0; break;
+        case 3:
+          align = 4; break;
+        default:
+          return MCDisassembler::Fail;
+      }
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 6, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0: 
+          align = 0; break;
+        case 3:
+          align = 4; break;
+        default:
+          return MCDisassembler::Fail;
+      }
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      index = fieldFromInstruction(Insn, 5, 3);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 1:
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 4, 1) != 0)
+        align = 8;
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      index = fieldFromInstruction(Insn, 5, 3);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 2;
+      break;
+    case 1:
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 5, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 4, 1) != 0)
+        align = 8;
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 4, 2))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      if (fieldFromInstruction(Insn, 4, 2))
+        return MCDisassembler::Fail; // UNDEFINED
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+
+static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 8;
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0:
+          align = 0; break;
+        case 3:
+          return MCDisassembler::Fail;
+        default:
+          align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+      }
+
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+        return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
+                         uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  Rd |= fieldFromInstruction(Insn, 22, 1) << 4;
+  unsigned size = fieldFromInstruction(Insn, 10, 2);
+
+  unsigned align = 0;
+  unsigned index = 0;
+  unsigned inc = 1;
+  switch (size) {
+    default:
+      return MCDisassembler::Fail;
+    case 0:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 4;
+      index = fieldFromInstruction(Insn, 5, 3);
+      break;
+    case 1:
+      if (fieldFromInstruction(Insn, 4, 1))
+        align = 8;
+      index = fieldFromInstruction(Insn, 6, 2);
+      if (fieldFromInstruction(Insn, 5, 1))
+        inc = 2;
+      break;
+    case 2:
+      switch (fieldFromInstruction(Insn, 4, 2)) {
+        case 0:
+          align = 0; break;
+        case 3:
+          return MCDisassembler::Fail;
+        default:
+          align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+      }
+
+      index = fieldFromInstruction(Insn, 7, 1);
+      if (fieldFromInstruction(Insn, 6, 1))
+        inc = 2;
+      break;
+  }
+
+  if (Rm != 0xF) { // Writeback
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  }
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(align));
+  if (Rm != 0xF) {
+    if (Rm != 0xD) {
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+    } else
+      Inst.addOperand(MCOperand::CreateReg(0));
+  }
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(index));
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVSRR(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt  = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm  = fieldFromInstruction(Insn,  5, 1);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  Rm |= fieldFromInstruction(Insn, 0, 4) << 1;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVMOVRRS(MCInst &Inst, unsigned Insn,
+                                  uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rt  = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rm  = fieldFromInstruction(Insn,  5, 1);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+  Rm |= fieldFromInstruction(Insn, 0, 4) << 1;
+
+  if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm  , Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned pred = fieldFromInstruction(Insn, 4, 4);
+  unsigned mask = fieldFromInstruction(Insn, 0, 4);
+
+  if (pred == 0xF) {
+    pred = 0xE;
+    S = MCDisassembler::SoftFail;
+  }
+
+  if (mask == 0x0) {
+    mask |= 0x8;
+    S = MCDisassembler::SoftFail;
+  }
+
+  Inst.addOperand(MCOperand::CreateImm(pred));
+  Inst.addOperand(MCOperand::CreateImm(mask));
+  return S;
+}
+
+static DecodeStatus
+DecodeT2LDRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+  if (Rt == Rt2)
+    Check(S, MCDisassembler::SoftFail);
+
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus
+DecodeT2STRDPreInstruction(MCInst &Inst, unsigned Insn,
+                           uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Insn, 8, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned addr = fieldFromInstruction(Insn, 0, 8);
+  unsigned W = fieldFromInstruction(Insn, 21, 1);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  unsigned P = fieldFromInstruction(Insn, 24, 1);
+  bool writeback = (W == 1) | (P == 0);
+
+  addr |= (U << 8) | (Rn << 9);
+
+  if (writeback && (Rn == Rt || Rn == Rt2))
+    Check(S, MCDisassembler::SoftFail);
+
+  // Writeback operand
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // Rt2
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // addr
+  if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2Adr(MCInst &Inst, uint32_t Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned sign1 = fieldFromInstruction(Insn, 21, 1);
+  unsigned sign2 = fieldFromInstruction(Insn, 23, 1);
+  if (sign1 != sign2) return MCDisassembler::Fail;
+
+  unsigned Val = fieldFromInstruction(Insn, 0, 8);
+  Val |= fieldFromInstruction(Insn, 12, 3) << 8;
+  Val |= fieldFromInstruction(Insn, 26, 1) << 11;
+  Val |= sign1 << 12;
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<13>(Val)));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, uint32_t Val,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  // Shift of "asr #32" is not allowed in Thumb2 mode.
+  if (Val == 0x20) S = MCDisassembler::SoftFail;
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return S;
+}
+
+static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  unsigned Rt   = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rt2  = fieldFromInstruction(Insn, 0,  4);
+  unsigned Rn   = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (Rt == Rn || Rn == Rt2)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
+  Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
+  unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
+  Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
+  unsigned imm = fieldFromInstruction(Insn, 16, 6);
+  unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // VMOVv2f32 is ambiguous with these decodings.
+  if (!(imm & 0x38) && cmode == 0xF) {
+    Inst.setOpcode(ARM::VMOVv2f32);
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!(imm & 0x20)) Check(S, MCDisassembler::SoftFail);
+
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeDPRRegisterClass(Inst, Vm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(64 - imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder) {
+  unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0);
+  Vd |= (fieldFromInstruction(Insn, 22, 1) << 4);
+  unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0);
+  Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
+  unsigned imm = fieldFromInstruction(Insn, 16, 6);
+  unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  // VMOVv4f32 is ambiguous with these decodings.
+  if (!(imm & 0x38) && cmode == 0xF) {
+    Inst.setOpcode(ARM::VMOVv4f32);
+    return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
+  }
+
+  if (!(imm & 0x20)) Check(S, MCDisassembler::SoftFail);
+
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Vd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeQPRRegisterClass(Inst, Vm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(64 - imm));
+
+  return S;
+}
+
+static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 16, 4);
+  unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  unsigned Rm = fieldFromInstruction(Val, 0, 4);
+  Rm |= (fieldFromInstruction(Val, 23, 1) << 4);
+  unsigned Cond = fieldFromInstruction(Val, 28, 4);
+ 
+  if (fieldFromInstruction(Val, 8, 4) != 0 || Rn == Rt)
+    S = MCDisassembler::SoftFail;
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeAddrMode7Operand(Inst, Rn, Address, Decoder))) 
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePostIdxReg(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, Cond, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
+                                uint64_t Address, const void *Decoder) {
+
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned CRm = fieldFromInstruction(Val, 0, 4);
+  unsigned opc1 = fieldFromInstruction(Val, 4, 4);
+  unsigned cop = fieldFromInstruction(Val, 8, 4);
+  unsigned Rt = fieldFromInstruction(Val, 12, 4);
+  unsigned Rt2 = fieldFromInstruction(Val, 16, 4);
+
+  if ((cop & ~0x1) == 0xa)
+    return MCDisassembler::Fail;
+
+  if (Rt == Rt2)
+    S = MCDisassembler::SoftFail;
+
+  Inst.addOperand(MCOperand::CreateImm(cop));
+  Inst.addOperand(MCOperand::CreateImm(opc1));
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rt2, Address, Decoder)))
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateImm(CRm));
+
+  return S;
+}
+
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
new file mode 100644
index 000000000000..2afb20d6686a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -0,0 +1,1449 @@
+//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#include "ARMGenAsmWriter.inc"
+
+/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
+///
+/// getSORegOffset returns an integer from 0-31, representing '32' as 0.
+static unsigned translateShiftImm(unsigned imm) {
+  // lsr #32 and asr #32 exist, but should be encoded as a 0.
+  assert((imm & ~0x1f) == 0 && "Invalid shift encoding");
+
+  if (imm == 0)
+    return 32;
+  return imm;
+}
+
+/// Prints the shift value with an immediate value.
+static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc,
+                          unsigned ShImm, bool UseMarkup) {
+  if (ShOpc == ARM_AM::no_shift || (ShOpc == ARM_AM::lsl && !ShImm))
+    return;
+  O << ", ";
+
+  assert (!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0");
+  O << getShiftOpcStr(ShOpc);
+
+  if (ShOpc != ARM_AM::rrx) {
+    O << " ";
+    if (UseMarkup)
+      O << "<imm:";
+    O << "#" << translateShiftImm(ShImm);
+    if (UseMarkup)
+      O << ">";
+  }
+}
+
+ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI,
+                               const MCInstrInfo &MII,
+                               const MCRegisterInfo &MRI,
+                               const MCSubtargetInfo &STI) :
+  MCInstPrinter(MAI, MII, MRI) {
+  // Initialize the set of available features.
+  setAvailableFeatures(STI.getFeatureBits());
+}
+
+void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << markup("<reg:")
+     << getRegisterName(RegNo)
+     << markup(">");
+}
+
+void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot) {
+  unsigned Opcode = MI->getOpcode();
+
+  // Check for HINT instructions w/ canonical names.
+  if (Opcode == ARM::HINT || Opcode == ARM::t2HINT) {
+    switch (MI->getOperand(0).getImm()) {
+    case 0: O << "\tnop"; break;
+    case 1: O << "\tyield"; break;
+    case 2: O << "\twfe"; break;
+    case 3: O << "\twfi"; break;
+    case 4: O << "\tsev"; break;
+    default:
+      // Anything else should just print normally.
+      printInstruction(MI, O);
+      printAnnotation(O, Annot);
+      return;
+    }
+    printPredicateOperand(MI, 1, O);
+    if (Opcode == ARM::t2HINT)
+      O << ".w";
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Check for MOVs and print canonical forms, instead.
+  if (Opcode == ARM::MOVsr) {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+    const MCOperand &MO3 = MI->getOperand(3);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()));
+    printSBitModifierOperand(MI, 6, O);
+    printPredicateOperand(MI, 4, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    O << ", ";
+    printRegName(O, MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (Opcode == ARM::MOVsi) {
+    // FIXME: Thumb variants?
+    const MCOperand &Dst = MI->getOperand(0);
+    const MCOperand &MO1 = MI->getOperand(1);
+    const MCOperand &MO2 = MI->getOperand(2);
+
+    O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm()));
+    printSBitModifierOperand(MI, 5, O);
+    printPredicateOperand(MI, 3, O);
+
+    O << '\t';
+    printRegName(O, Dst.getReg());
+    O << ", ";
+    printRegName(O, MO1.getReg());
+
+    if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) {
+      printAnnotation(O, Annot);
+      return;
+    }
+
+    O << ", "
+      << markup("<imm:")
+      << "#" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()))
+      << markup(">");
+    printAnnotation(O, Annot);
+    return;
+  }
+
+
+  // A8.6.123 PUSH
+  if ((Opcode == ARM::STMDB_UPD || Opcode == ARM::t2STMDB_UPD) &&
+      MI->getOperand(0).getReg() == ARM::SP &&
+      MI->getNumOperands() > 5) {
+    // Should only print PUSH if there are at least two registers in the list.
+    O << '\t' << "push";
+    printPredicateOperand(MI, 2, O);
+    if (Opcode == ARM::t2STMDB_UPD)
+      O << ".w";
+    O << '\t';
+    printRegisterList(MI, 4, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+  if (Opcode == ARM::STR_PRE_IMM && MI->getOperand(2).getReg() == ARM::SP &&
+      MI->getOperand(3).getImm() == -4) {
+    O << '\t' << "push";
+    printPredicateOperand(MI, 4, O);
+    O << "\t{";
+    printRegName(O, MI->getOperand(1).getReg());
+    O << "}";
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // A8.6.122 POP
+  if ((Opcode == ARM::LDMIA_UPD || Opcode == ARM::t2LDMIA_UPD) &&
+      MI->getOperand(0).getReg() == ARM::SP &&
+      MI->getNumOperands() > 5) {
+    // Should only print POP if there are at least two registers in the list.
+    O << '\t' << "pop";
+    printPredicateOperand(MI, 2, O);
+    if (Opcode == ARM::t2LDMIA_UPD)
+      O << ".w";
+    O << '\t';
+    printRegisterList(MI, 4, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+  if (Opcode == ARM::LDR_POST_IMM && MI->getOperand(2).getReg() == ARM::SP &&
+      MI->getOperand(4).getImm() == 4) {
+    O << '\t' << "pop";
+    printPredicateOperand(MI, 5, O);
+    O << "\t{";
+    printRegName(O, MI->getOperand(0).getReg());
+    O << "}";
+    printAnnotation(O, Annot);
+    return;
+  }
+
+
+  // A8.6.355 VPUSH
+  if ((Opcode == ARM::VSTMSDB_UPD || Opcode == ARM::VSTMDDB_UPD) &&
+      MI->getOperand(0).getReg() == ARM::SP) {
+    O << '\t' << "vpush";
+    printPredicateOperand(MI, 2, O);
+    O << '\t';
+    printRegisterList(MI, 4, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // A8.6.354 VPOP
+  if ((Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMDIA_UPD) &&
+      MI->getOperand(0).getReg() == ARM::SP) {
+    O << '\t' << "vpop";
+    printPredicateOperand(MI, 2, O);
+    O << '\t';
+    printRegisterList(MI, 4, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  if (Opcode == ARM::tLDMIA) {
+    bool Writeback = true;
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg)
+        Writeback = false;
+    }
+
+    O << "\tldm";
+
+    printPredicateOperand(MI, 1, O);
+    O << '\t';
+    printRegName(O, BaseReg);
+    if (Writeback) O << "!";
+    O << ", ";
+    printRegisterList(MI, 3, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Thumb1 NOP
+  if (Opcode == ARM::tMOVr && MI->getOperand(0).getReg() == ARM::R8 &&
+      MI->getOperand(1).getReg() == ARM::R8) {
+    O << "\tnop";
+    printPredicateOperand(MI, 2, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+
+  // Combine 2 GPRs from disassember into a GPRPair to match with instr def.
+  // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
+  // a single GPRPair reg operand is used in the .td file to replace the two
+  // GPRs. However, when decoding them, the two GRPs cannot be automatically
+  // expressed as a GPRPair, so we have to manually merge them.
+  // FIXME: We would really like to be able to tablegen'erate this.
+  if (Opcode == ARM::LDREXD || Opcode == ARM::STREXD) {
+    const MCRegisterClass& MRC = MRI.getRegClass(ARM::GPRRegClassID);
+    bool isStore = Opcode == ARM::STREXD;
+    unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
+    if (MRC.contains(Reg)) {
+      MCInst NewMI;
+      MCOperand NewReg;
+      NewMI.setOpcode(Opcode);
+
+      if (isStore)
+        NewMI.addOperand(MI->getOperand(0));
+      NewReg = MCOperand::CreateReg(MRI.getMatchingSuperReg(Reg, ARM::gsub_0,
+        &MRI.getRegClass(ARM::GPRPairRegClassID)));
+      NewMI.addOperand(NewReg);
+
+      // Copy the rest operands into NewMI.
+      for(unsigned i= isStore ? 3 : 2; i < MI->getNumOperands(); ++i)
+        NewMI.addOperand(MI->getOperand(i));
+      printInstruction(&NewMI, O);
+      return;
+    }
+  }
+
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    printRegName(O, Reg);
+  } else if (Op.isImm()) {
+    O << markup("<imm:")
+      << '#' << formatImm(Op.getImm())
+      << markup(">");
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    // If a symbolic branch target was added as a constant expression then print
+    // that address in hex. And only print 32 unsigned bits for the address.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+      O << "0x";
+      O.write_hex((uint32_t)Address);
+    }
+    else {
+      // Otherwise, just print the expression.
+      O << *Op.getExpr();
+    }
+  }
+}
+
+void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  if (MO1.isExpr())
+    O << *MO1.getExpr();
+  else if (MO1.isImm()) {
+    O << markup("<mem:") << "[pc, "
+      << markup("<imm:") << "#" << formatImm(MO1.getImm())
+      << markup(">]>", "]");
+  }
+  else
+    llvm_unreachable("Unknown LDR label operand?");
+}
+
+// so_reg is a 4-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0           - e.g. R5
+//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
+  ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm());
+  O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
+  if (ShOpc == ARM_AM::rrx)
+    return;
+
+  O << ' ';
+  printRegName(O, MO2.getReg());
+  assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+}
+
+void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  printRegName(O, MO1.getReg());
+
+  // Print the shift opc.
+  printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+                   ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup);
+}
+
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #2
+//===--------------------------------------------------------------------===//
+
+void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  if (!MO2.getReg()) {
+    if (ARM_AM::getAM2Offset(MO3.getImm())) { // Don't print +0.
+      O << ", "
+        << markup("<imm:")
+        << "#"
+        << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
+        << ARM_AM::getAM2Offset(MO3.getImm())
+        << markup(">");
+    }
+    O << "]" << markup(">");
+    return;
+  }
+
+  O << ", ";
+  O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()));
+  printRegName(O, MO2.getReg());
+
+  printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO3.getImm()),
+                   ARM_AM::getAM2Offset(MO3.getImm()), UseMarkup);
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << ", ";
+  printRegName(O, MO2.getReg());
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << ", ";
+  printRegName(O, MO2.getReg());
+  O << ", lsl " << markup("<imm:") << "#1" << markup(">") << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, O);
+    return;
+  }
+
+#ifndef NDEBUG
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+  unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm());
+  assert(IdxMode != ARMII::IndexModePost &&
+         "Should be pre or offset index op");
+#endif
+
+  printAM2PreOrOffsetIndexOp(MI, Op, O);
+}
+
+void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (!MO1.getReg()) {
+    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+    O << markup("<imm:")
+      << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()))
+      << ImmOffs
+      << markup(">");
+    return;
+  }
+
+  O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()));
+  printRegName(O, MO1.getReg());
+
+  printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO2.getImm()),
+                   ARM_AM::getAM2Offset(MO2.getImm()), UseMarkup);
+}
+
+//===--------------------------------------------------------------------===//
+// Addressing Mode #3
+//===--------------------------------------------------------------------===//
+
+void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op,
+                                         raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << "], " << markup(">");
+
+  if (MO2.getReg()) {
+    O << (char)ARM_AM::getAM3Op(MO3.getImm());
+    printRegName(O, MO2.getReg());
+    return;
+  }
+
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
+  O << markup("<imm:")
+    << '#'
+    << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
+    << ImmOffs
+    << markup(">");
+}
+
+void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
+                                                raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+
+  O << markup("<mem:") << '[';
+  printRegName(O, MO1.getReg());
+
+  if (MO2.getReg()) {
+    O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()));
+    printRegName(O, MO2.getReg());
+    O << ']' << markup(">");
+    return;
+  }
+
+  //If the op is sub we have to print the immediate even if it is 0
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
+  ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm());
+
+  if (ImmOffs || (op == ARM_AM::sub)) {
+    O << ", "
+      << markup("<imm:")
+      << "#"
+      << ARM_AM::getAddrOpcStr(op)
+      << ImmOffs
+      << markup(">");
+  }
+  O << ']' << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  if (!MO1.isReg()) {   //  For label symbolic references.
+    printOperand(MI, Op, O);
+    return;
+  }
+
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+  unsigned IdxMode = ARM_AM::getAM3IdxMode(MO3.getImm());
+
+  if (IdxMode == ARMII::IndexModePost) {
+    printAM3PostIndexOp(MI, Op, O);
+    return;
+  }
+  printAM3PreOrOffsetIndexOp(MI, Op, O);
+}
+
+void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (MO1.getReg()) {
+    O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()));
+    printRegName(O, MO1.getReg());
+    return;
+  }
+
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+  O << markup("<imm:")
+    << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) << ImmOffs
+    << markup(">");
+}
+
+void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI,
+                                             unsigned OpNum,
+                                             raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << markup("<imm:")
+    << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff)
+    << markup(">");
+}
+
+void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << (MO2.getImm() ? "" : "-");
+  printRegName(O, MO1.getReg());
+}
+
+void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI,
+                                             unsigned OpNum,
+                                             raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  unsigned Imm = MO.getImm();
+  O << markup("<imm:")
+    << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2)
+    << markup(">");
+}
+
+
+void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(OpNum)
+                                                 .getImm());
+  O << ARM_AM::getAMSubModeStr(Mode);
+}
+
+void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm());
+  unsigned Op = ARM_AM::getAM5Op(MO2.getImm());
+  if (ImmOffs || Op == ARM_AM::sub) {
+    O << ", "
+      << markup("<imm:")
+      << "#"
+      << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm()))
+      << ImmOffs * 4
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (MO2.getImm()) {
+    O << ":" << (MO2.getImm() << 3);
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.getReg() == 0)
+    O << "!";
+  else {
+    O << ", ";
+    printRegName(O, MO.getReg());
+  }
+}
+
+void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  uint32_t v = ~MO.getImm();
+  int32_t lsb = CountTrailingZeros_32(v);
+  int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb;
+  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
+  O << markup("<imm:") << '#' << lsb << markup(">")
+    << ", "
+    << markup("<imm:") << '#' << width << markup(">");
+}
+
+void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  unsigned val = MI->getOperand(OpNum).getImm();
+  O << ARM_MB::MemBOptToString(val);
+}
+
+void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  unsigned ShiftOp = MI->getOperand(OpNum).getImm();
+  bool isASR = (ShiftOp & (1 << 5)) != 0;
+  unsigned Amt = ShiftOp & 0x1f;
+  if (isASR) {
+    O << ", asr "
+      << markup("<imm:")
+      << "#" << (Amt == 0 ? 32 : Amt)
+      << markup(">");
+  }
+  else if (Amt) {
+    O << ", lsl "
+      << markup("<imm:")
+      << "#" << Amt
+      << markup(">");
+  }
+}
+
+void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
+    return;
+  assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!");
+  O << ", lsl " << markup("<imm:") << "#" << Imm << markup(">");
+}
+
+void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  // A shift amount of 32 is encoded as 0.
+  if (Imm == 0)
+    Imm = 32;
+  assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!");
+  O << ", asr " << markup("<imm:") << "#" << Imm << markup(">");
+}
+
+void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  O << "{";
+  for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != OpNum) O << ", ";
+    printRegName(O, MI->getOperand(i).getReg());
+  }
+  O << "}";
+}
+
+void ARMInstPrinter::printGPRPairOperand(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  printRegName(O, MRI.getSubReg(Reg, ARM::gsub_0));
+  O << ", ";
+  printRegName(O, MRI.getSubReg(Reg, ARM::gsub_1));
+}
+
+
+void ARMInstPrinter::printSetendOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  if (Op.getImm())
+    O << "be";
+  else
+    O << "le";
+}
+
+void ARMInstPrinter::printCPSIMod(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  O << ARM_PROC::IModToString(Op.getImm());
+}
+
+void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  unsigned IFlags = Op.getImm();
+  for (int i=2; i >= 0; --i)
+    if (IFlags & (1 << i))
+      O << ARM_PROC::IFlagsToString(1 << i);
+
+  if (IFlags == 0)
+    O << "none";
+}
+
+void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  unsigned SpecRegRBit = Op.getImm() >> 4;
+  unsigned Mask = Op.getImm() & 0xf;
+
+  if (getAvailableFeatures() & ARM::FeatureMClass) {
+    unsigned SYSm = Op.getImm();
+    unsigned Opcode = MI->getOpcode();
+    // For reads of the special registers ignore the "mask encoding" bits
+    // which are only for writes.
+    if (Opcode == ARM::t2MRS_M)
+      SYSm &= 0xff;
+    switch (SYSm) {
+    default: llvm_unreachable("Unexpected mask value!");
+    case     0:
+    case 0x800: O << "apsr"; return; // with _nzcvq bits is an alias for aspr
+    case 0x400: O << "apsr_g"; return;
+    case 0xc00: O << "apsr_nzcvqg"; return;
+    case     1:
+    case 0x801: O << "iapsr"; return; // with _nzcvq bits is an alias for iapsr
+    case 0x401: O << "iapsr_g"; return;
+    case 0xc01: O << "iapsr_nzcvqg"; return;
+    case     2:
+    case 0x802: O << "eapsr"; return; // with _nzcvq bits is an alias for eapsr
+    case 0x402: O << "eapsr_g"; return;
+    case 0xc02: O << "eapsr_nzcvqg"; return;
+    case     3:
+    case 0x803: O << "xpsr"; return; // with _nzcvq bits is an alias for xpsr
+    case 0x403: O << "xpsr_g"; return;
+    case 0xc03: O << "xpsr_nzcvqg"; return;
+    case     5:
+    case 0x805: O << "ipsr"; return;
+    case     6:
+    case 0x806: O << "epsr"; return;
+    case     7:
+    case 0x807: O << "iepsr"; return;
+    case     8:
+    case 0x808: O << "msp"; return;
+    case     9:
+    case 0x809: O << "psp"; return;
+    case  0x10:
+    case 0x810: O << "primask"; return;
+    case  0x11:
+    case 0x811: O << "basepri"; return;
+    case  0x12:
+    case 0x812: O << "basepri_max"; return;
+    case  0x13:
+    case 0x813: O << "faultmask"; return;
+    case  0x14:
+    case 0x814: O << "control"; return;
+    }
+  }
+
+  // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as
+  // APSR_nzcvq, APSR_g and APSRnzcvqg, respectively.
+  if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) {
+    O << "APSR_";
+    switch (Mask) {
+    default: llvm_unreachable("Unexpected mask value!");
+    case 4:  O << "g"; return;
+    case 8:  O << "nzcvq"; return;
+    case 12: O << "nzcvqg"; return;
+    }
+  }
+
+  if (SpecRegRBit)
+    O << "SPSR";
+  else
+    O << "CPSR";
+
+  if (Mask) {
+    O << '_';
+    if (Mask & 8) O << 'f';
+    if (Mask & 4) O << 's';
+    if (Mask & 2) O << 'x';
+    if (Mask & 1) O << 'c';
+  }
+}
+
+void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  // Handle the undefined 15 CC value here for printing so we don't abort().
+  if ((unsigned)CC == 15)
+    O << "<und>";
+  else if (CC != ARMCC::AL)
+    O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printMandatoryPredicateOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    raw_ostream &O) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  if (MI->getOperand(OpNum).getReg()) {
+    assert(MI->getOperand(OpNum).getReg() == ARM::CPSR &&
+           "Expect ARM CPSR register!");
+    O << 's';
+  }
+}
+
+void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  O << "p" << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  O << "c" << MI->getOperand(OpNum).getImm();
+}
+
+void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << "{" << MI->getOperand(OpNum).getImm() << "}";
+}
+
+void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
+}
+
+void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  if (MO.isExpr()) {
+    O << *MO.getExpr();
+    return;
+  }
+
+  int32_t OffImm = (int32_t)MO.getImm();
+
+  O << markup("<imm:");
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  O << markup("<imm:")
+    << "#" << formatImm(MI->getOperand(OpNum).getImm() * 4)
+    << markup(">");
+}
+
+void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  O << markup("<imm:")
+    << "#" << formatImm((Imm == 0 ? 32 : Imm))
+    << markup(">");
+}
+
+void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  // (3 - the number of trailing zeros) is the number of then / else.
+  unsigned Mask = MI->getOperand(OpNum).getImm();
+  unsigned Firstcond = MI->getOperand(OpNum-1).getImm();
+  unsigned CondBit0 = Firstcond & 1;
+  unsigned NumTZ = CountTrailingZeros_32(Mask);
+  assert(NumTZ <= 3 && "Invalid IT mask!");
+  for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
+    bool T = ((Mask >> Pos) & 1) == CondBit0;
+    if (T)
+      O << 't';
+    else
+      O << 'e';
+  }
+}
+
+void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (unsigned RegNum = MO2.getReg()) {
+    O << ", ";
+    printRegName(O, RegNum);
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI,
+                                                    unsigned Op,
+                                                    raw_ostream &O,
+                                                    unsigned Scale) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op + 1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (unsigned ImmOffs = MO2.getImm()) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << formatImm(ImmOffs * Scale)
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 1);
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5S2Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 2);
+}
+
+void ARMInstPrinter::printThumbAddrModeImm5S4Operand(const MCInst *MI,
+                                                     unsigned Op,
+                                                     raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 4);
+}
+
+void ARMInstPrinter::printThumbAddrModeSPOperand(const MCInst *MI, unsigned Op,
+                                                 raw_ostream &O) {
+  printThumbAddrModeImm5SOperand(MI, Op, O, 4);
+}
+
+// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2
+// register with shift forms.
+// REG 0   0           - e.g. R5
+// REG IMM, SH_OPC     - e.g. R5, LSL #3
+void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  unsigned Reg = MO1.getReg();
+  printRegName(O, Reg);
+
+  // Print the shift opc.
+  assert(MO2.isImm() && "Not a valid t2_so_reg value!");
+  printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+                   ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup);
+}
+
+void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << ", "
+      << markup("<imm:") 
+      << "#-" << -OffImm
+      << markup(">");
+  }
+  else if (OffImm > 0) {
+    O << ", "
+      << markup("<imm:") 
+      << "#" << OffImm
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+  // Don't print +0.
+  if (OffImm != 0)
+    O << ", ";
+  if (OffImm != 0 && UseMarkup)
+    O << "<imm:";
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else if (OffImm > 0)
+    O << "#" << OffImm;
+  if (OffImm != 0 && UseMarkup)
+    O << ">";
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  if (!MO1.isReg()) {   //  For label symbolic references.
+    printOperand(MI, OpNum, O);
+    return;
+  }
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+
+  assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
+  // Don't print +0.
+  if (OffImm != 0)
+    O << ", ";
+  if (OffImm != 0 && UseMarkup)
+    O << "<imm:";
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else if (OffImm > 0)
+    O << "#" << OffImm;
+  if (OffImm != 0 && UseMarkup)
+    O << ">";
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(const MCInst *MI,
+                                                       unsigned OpNum,
+                                                       raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  if (MO2.getImm()) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << formatImm(MO2.getImm() * 4)
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
+                                                      unsigned OpNum,
+                                                      raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  int32_t OffImm = (int32_t)MO1.getImm();
+  O << ", " << markup("<imm:");
+  if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else
+    O << "#" << OffImm;
+  O << markup(">");
+}
+
+void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
+                                                        unsigned OpNum,
+                                                        raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  int32_t OffImm = (int32_t)MO1.getImm();
+
+  assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
+
+  // Don't print +0.
+  if (OffImm != 0)
+    O << ", ";
+  if (OffImm != 0 && UseMarkup)
+    O << "<imm:";
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else if (OffImm > 0)
+    O << "#" << OffImm;
+  if (OffImm != 0 && UseMarkup)
+    O << ">";
+}
+
+void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
+                                                 unsigned OpNum,
+                                                 raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  assert(MO2.getReg() && "Invalid so_reg load / store address!");
+  O << ", ";
+  printRegName(O, MO2.getReg());
+
+  unsigned ShAmt = MO3.getImm();
+  if (ShAmt) {
+    assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
+    O << ", lsl "
+      << markup("<imm:")
+      << "#" << ShAmt
+      << markup(">");
+  }
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  O << markup("<imm:")
+    << '#' << ARM_AM::getFPImmFloat(MO.getImm())
+    << markup(">");
+}
+
+void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  unsigned EncodedImm = MI->getOperand(OpNum).getImm();
+  unsigned EltBits;
+  uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits);
+  O << markup("<imm:")
+    << "#0x";
+  O.write_hex(Val);
+  O << markup(">");
+}
+
+void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
+                                            raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  O << markup("<imm:")
+    << "#" << formatImm(Imm + 1)
+    << markup(">");
+}
+
+void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+  if (Imm == 0)
+    return;
+  O << ", ror "
+    << markup("<imm:")
+    << "#";
+  switch (Imm) {
+  default: assert (0 && "illegal ror immediate!");
+  case 1: O << "8"; break;
+  case 2: O << "16"; break;
+  case 3: O << "24"; break;
+  }
+  O << markup(">");
+}
+
+void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  O << markup("<imm:")
+    << "#" << 16 - MI->getOperand(OpNum).getImm()
+    << markup(">");
+}
+
+void ARMInstPrinter::printFBits32(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O) {
+  O << markup("<imm:")
+    << "#" << 32 - MI->getOperand(OpNum).getImm()
+    << markup(">");
+}
+
+void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListTwo(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1);
+  O << "{";
+  printRegName(O, Reg0);
+  O << ", ";
+  printRegName(O, Reg1);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI,
+                                              unsigned OpNum,
+                                              raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2);
+  O << "{";
+  printRegName(O, Reg0);
+  O << ", ";
+  printRegName(O, Reg1);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListThree(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 3);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1);
+  O << "{";
+  printRegName(O, Reg0);
+  O << "[], ";
+  printRegName(O, Reg1);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI,
+                                                  unsigned OpNum,
+                                                  raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 3);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListTwoSpacedAllLanes(const MCInst *MI,
+                                                      unsigned OpNum,
+                                                      raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
+  unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2);
+  O << "{";
+  printRegName(O, Reg0);
+  O << "[], ";
+  printRegName(O, Reg1);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListThreeSpacedAllLanes(const MCInst *MI,
+                                                        unsigned OpNum,
+                                                        raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O  << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListFourSpacedAllLanes(const MCInst *MI,
+                                                       unsigned OpNum,
+                                                       raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << "[], ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 6);
+  O << "[]}";
+}
+
+void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << "}";
+}
+
+void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI,
+                                                unsigned OpNum,
+                                                raw_ostream &O) {
+  // Normally, it's not safe to use register enum values directly with
+  // addition to get the next register, but for VFP registers, the
+  // sort order is guaranteed because they're all of the form D<n>.
+  O << "{";
+  printRegName(O, MI->getOperand(OpNum).getReg());
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+  O << ", ";
+  printRegName(O, MI->getOperand(OpNum).getReg() + 6);
+  O << "}";
+}
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
new file mode 100644
index 000000000000..edff75d886e9
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -0,0 +1,163 @@
+//===- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTPRINTER_H
+#define ARMINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class ARMInstPrinter : public MCInstPrinter {
+public:
+  ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printSORegRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printSORegImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printAddrModeTBB(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrModeTBH(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+
+  void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+  void printAM3PostIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,raw_ostream &O);
+  void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+  void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+
+  void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode6Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode7Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrMode6OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+
+  void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O);
+  void printMemBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShiftImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printAdrLabelOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbSRImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+  void printThumbAddrModeImm5SOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O, unsigned Scale);
+  void printThumbAddrModeImm5S1Operand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O);
+  void printThumbAddrModeImm5S2Operand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O);
+  void printThumbAddrModeImm5S4Operand(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O);
+  void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+
+  void printT2SOOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
+                                 raw_ostream &O);
+  void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum,
+                                    raw_ostream &O);
+  void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum,
+                                    raw_ostream &O);
+  void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O);
+  void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O);
+  void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+
+  void printSetendOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCPSIMod(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCPSIFlag(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMSRMaskOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMandatoryPredicateOperand(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O);
+  void printSBitModifierOperand(const MCInst *MI, unsigned OpNum,
+                                raw_ostream &O);
+  void printRegisterList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printPImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printCoprocOptionImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printRotImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printGPRPairOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+                                 raw_ostream &O);
+  void printFBits16(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printFBits32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListOne(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListTwo(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum,
+                               raw_ostream &O);
+  void printVectorListThree(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListFour(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printVectorListOneAllLanes(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  void printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  void printVectorListThreeAllLanes(const MCInst *MI, unsigned OpNum,
+                                    raw_ostream &O);
+  void printVectorListFourAllLanes(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+  void printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O);
+  void printVectorListThreeSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O);
+  void printVectorListFourSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O);
+  void printVectorListThreeSpaced(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+  void printVectorListFourSpaced(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/LICENSE.TXT b/contrib/llvm/lib/Target/ARM/LICENSE.TXT
new file mode 100755
index 000000000000..68afea12ed44
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/LICENSE.TXT
@@ -0,0 +1,47 @@
+ARM Limited
+
+Software Grant License Agreement ("Agreement")
+
+Except for the license granted herein to you, ARM Limited ("ARM") reserves all
+right, title, and interest in and to the Software (defined below).
+
+Definition
+
+"Software" means the code and documentation as well as any original work of
+authorship, including any modifications or additions to an existing work, that
+is intentionally submitted by ARM to llvm.org (http://llvm.org) ("LLVM") for
+inclusion in, or documentation of, any of the products owned or managed by LLVM
+(the "Work"). For the purposes of this definition, "submitted" means any form of
+electronic, verbal, or written communication sent to LLVM or its
+representatives, including but not limited to communication on electronic
+mailing lists, source code control systems, and issue tracking systems that are
+managed by, or on behalf of, LLVM for the purpose of discussing and improving
+the Work, but excluding communication that is conspicuously marked otherwise.
+
+1. Grant of Copyright License. Subject to the terms and conditions of this
+   Agreement, ARM hereby grants to you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable copyright license to reproduce, prepare derivative
+   works of, publicly display, publicly perform, sublicense, and distribute the
+   Software and such derivative works.
+
+2. Grant of Patent License. Subject to the terms and conditions of this
+   Agreement, ARM hereby grants you and to recipients of the Software
+   distributed by LLVM a perpetual, worldwide, non-exclusive, no-charge,
+   royalty-free, irrevocable (except as stated in this section) patent license
+   to make, have made, use, offer to sell, sell, import, and otherwise transfer
+   the Work, where such license applies only to those patent claims licensable
+   by ARM that are necessarily infringed by ARM's Software alone or by
+   combination of the Software with the Work to which such Software was
+   submitted. If any entity institutes patent litigation against ARM or any
+   other entity (including a cross-claim or counterclaim in a lawsuit) alleging
+   that ARM's Software, or the Work to which ARM has contributed constitutes
+   direct or contributory patent infringement, then any patent licenses granted
+   to that entity under this Agreement for the Software or Work shall terminate
+   as of the date such litigation is filed.
+
+Unless required by applicable law or agreed to in writing, the software is
+provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+either express or implied, including, without limitation, any warranties or
+conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+PARTICULAR PURPOSE.
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
new file mode 100644
index 000000000000..62473b2bfdee
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -0,0 +1,668 @@
+//===-- ARMAddressingModes.h - ARM Addressing Modes -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// ARM_AM - ARM Addressing Mode Stuff
+namespace ARM_AM {
+  enum ShiftOpc {
+    no_shift = 0,
+    asr,
+    lsl,
+    lsr,
+    ror,
+    rrx
+  };
+
+  enum AddrOpc {
+    sub = 0,
+    add
+  };
+
+  static inline const char *getAddrOpcStr(AddrOpc Op) {
+    return Op == sub ? "-" : "";
+  }
+
+  static inline const char *getShiftOpcStr(ShiftOpc Op) {
+    switch (Op) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::asr: return "asr";
+    case ARM_AM::lsl: return "lsl";
+    case ARM_AM::lsr: return "lsr";
+    case ARM_AM::ror: return "ror";
+    case ARM_AM::rrx: return "rrx";
+    }
+  }
+
+  static inline unsigned getShiftOpcEncoding(ShiftOpc Op) {
+    switch (Op) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::asr: return 2;
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::ror: return 3;
+    }
+  }
+
+  enum AMSubMode {
+    bad_am_submode = 0,
+    ia,
+    ib,
+    da,
+    db
+  };
+
+  static inline const char *getAMSubModeStr(AMSubMode Mode) {
+    switch (Mode) {
+    default: llvm_unreachable("Unknown addressing sub-mode!");
+    case ARM_AM::ia: return "ia";
+    case ARM_AM::ib: return "ib";
+    case ARM_AM::da: return "da";
+    case ARM_AM::db: return "db";
+    }
+  }
+
+  /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
+  ///
+  static inline unsigned rotr32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val >> Amt) | (Val << ((32-Amt)&31));
+  }
+
+  /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
+  ///
+  static inline unsigned rotl32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val << Amt) | (Val >> ((32-Amt)&31));
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #1: shift_operand with registers
+  //===--------------------------------------------------------------------===//
+  //
+  // This 'addressing mode' is used for arithmetic instructions.  It can
+  // represent things like:
+  //   reg
+  //   reg [asr|lsl|lsr|ror|rrx] reg
+  //   reg [asr|lsl|lsr|ror|rrx] imm
+  //
+  // This is stored three operands [rega, regb, opc].  The first is the base
+  // reg, the second is the shift amount (or reg0 if not present or imm).  The
+  // third operand encodes the shift opcode and the imm if a reg isn't present.
+  //
+  static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
+    return ShOp | (Imm << 3);
+  }
+  static inline unsigned getSORegOffset(unsigned Op) {
+    return Op >> 3;
+  }
+  static inline ShiftOpc getSORegShOp(unsigned Op) {
+    return (ShiftOpc)(Op & 7);
+  }
+
+  /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
+  /// the 8-bit imm value.
+  static inline unsigned getSOImmValImm(unsigned Imm) {
+    return Imm & 0xFF;
+  }
+  /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
+  /// the rotate amount.
+  static inline unsigned getSOImmValRot(unsigned Imm) {
+    return (Imm >> 8) * 2;
+  }
+
+  /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
+  /// computing the rotate amount to use.  If this immediate value cannot be
+  /// handled with a single shifter-op, determine a good rotate amount that will
+  /// take a maximal chunk of bits out of the immediate.
+  static inline unsigned getSOImmValRotate(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the rotate amount.
+    unsigned TZ = CountTrailingZeros_32(Imm);
+
+    // Rotate amount must be even.  Something like 0x200 must be rotated 8 bits,
+    // not 9.
+    unsigned RotAmt = TZ & ~1;
+
+    // If we can handle this spread, return it.
+    if ((rotr32(Imm, RotAmt) & ~255U) == 0)
+      return (32-RotAmt)&31;  // HW rotates right, not left.
+
+    // For values like 0xF000000F, we should ignore the low 6 bits, then
+    // retry the hunt.
+    if (Imm & 63U) {
+      unsigned TZ2 = CountTrailingZeros_32(Imm & ~63U);
+      unsigned RotAmt2 = TZ2 & ~1;
+      if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
+        return (32-RotAmt2)&31;  // HW rotates right, not left.
+    }
+
+    // Otherwise, we have no way to cover this span of bits with a single
+    // shifter_op immediate.  Return a chunk of bits that will be useful to
+    // handle.
+    return (32-RotAmt)&31;  // HW rotates right, not left.
+  }
+
+  /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into an shifter_operand immediate operand, return the 12-bit encoding for
+  /// it.  If not, return -1.
+  static inline int getSOImmVal(unsigned Arg) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Arg & ~255U) == 0) return Arg;
+
+    unsigned RotAmt = getSOImmValRotate(Arg);
+
+    // If this cannot be handled with a single shifter_op, bail out.
+    if (rotr32(~255U, RotAmt) & Arg)
+      return -1;
+
+    // Encode this correctly.
+    return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
+  }
+
+  /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
+  /// or'ing together two SOImmVal's.
+  static inline bool isSOImmTwoPartVal(unsigned V) {
+    // If this can be handled with a single shifter_op, bail out.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled with two shifter_op's, accept.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    return V == 0;
+  }
+
+  /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the first chunk of it.
+  static inline unsigned getSOImmTwoPartFirst(unsigned V) {
+    return rotr32(255U, getSOImmValRotate(V)) & V;
+  }
+
+  /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the second chunk of it.
+  static inline unsigned getSOImmTwoPartSecond(unsigned V) {
+    // Mask out the first hunk.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+
+    // Take what's left.
+    assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
+    return V;
+  }
+
+  /// getThumbImmValShift - Try to handle Imm with a 8-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImmValShift(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return CountTrailingZeros_32(Imm);
+  }
+
+  /// isThumbImmShiftedVal - Return true if the specified value can be obtained
+  /// by left shifting a 8-bit immediate.
+  static inline bool isThumbImmShiftedVal(unsigned V) {
+    // If this can be handled with
+    V = (~255U << getThumbImmValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImm16ValShift(unsigned Imm) {
+    // 16-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~65535U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return CountTrailingZeros_32(Imm);
+  }
+
+  /// isThumbImm16ShiftedVal - Return true if the specified value can be
+  /// obtained by left shifting a 16-bit immediate.
+  static inline bool isThumbImm16ShiftedVal(unsigned V) {
+    // If this can be handled with
+    V = (~65535U << getThumbImm16ValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImmNonShiftedVal - If V is a value that satisfies
+  /// isThumbImmShiftedVal, return the non-shiftd value.
+  static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
+    return V >> getThumbImmValShift(V);
+  }
+
+
+  /// getT2SOImmValSplat - Return the 12-bit encoded representation
+  /// if the specified value can be obtained by splatting the low 8 bits
+  /// into every other byte or every byte of a 32-bit value. i.e.,
+  ///     00000000 00000000 00000000 abcdefgh    control = 0
+  ///     00000000 abcdefgh 00000000 abcdefgh    control = 1
+  ///     abcdefgh 00000000 abcdefgh 00000000    control = 2
+  ///     abcdefgh abcdefgh abcdefgh abcdefgh    control = 3
+  /// Return -1 if none of the above apply.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValSplatVal(unsigned V) {
+    unsigned u, Vs, Imm;
+    // control = 0
+    if ((V & 0xffffff00) == 0)
+      return V;
+
+    // If the value is zeroes in the first byte, just shift those off
+    Vs = ((V & 0xff) == 0) ? V >> 8 : V;
+    // Any passing value only has 8 bits of payload, splatted across the word
+    Imm = Vs & 0xff;
+    // Likewise, any passing values have the payload splatted into the 3rd byte
+    u = Imm | (Imm << 16);
+
+    // control = 1 or 2
+    if (Vs == u)
+      return (((Vs == V) ? 1 : 2) << 8) | Imm;
+
+    // control = 3
+    if (Vs == (u | (u << 8)))
+      return (3 << 8) | Imm;
+
+    return -1;
+  }
+
+  /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
+  /// specified value is a rotated 8-bit value. Return -1 if no rotation
+  /// encoding is possible.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValRotateVal(unsigned V) {
+    unsigned RotAmt = CountLeadingZeros_32(V);
+    if (RotAmt >= 24)
+      return -1;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    if ((rotr32(0xff000000U, RotAmt) & V) == V)
+      return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
+
+    return -1;
+  }
+
+  /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
+  /// encoding for it.  If not, return -1.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmVal(unsigned Arg) {
+    // If 'Arg' is an 8-bit splat, then get the encoded value.
+    int Splat = getT2SOImmValSplatVal(Arg);
+    if (Splat != -1)
+      return Splat;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    int Rot = getT2SOImmValRotateVal(Arg);
+    if (Rot != -1)
+      return Rot;
+
+    return -1;
+  }
+
+  static inline unsigned getT2SOImmValRotate(unsigned V) {
+    if ((V & ~255U) == 0) return 0;
+    // Use CTZ to compute the rotate amount.
+    unsigned RotAmt = CountTrailingZeros_32(V);
+    return (32 - RotAmt) & 31;
+  }
+
+  static inline bool isT2SOImmTwoPartVal (unsigned Imm) {
+    unsigned V = Imm;
+    // Passing values can be any combination of splat values and shifter
+    // values. If this can be handled with a single shifter or splat, bail
+    // out. Those should be handled directly, not with a two-part val.
+    if (getT2SOImmValSplatVal(V) != -1)
+      return false;
+    V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Likewise, try masking out a splat value first.
+    V = Imm;
+    if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
+      V &= ~0xff00ff00U;
+    else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
+      V &= ~0x00ff00ffU;
+    // If what's left can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Otherwise, do not accept.
+    return false;
+  }
+
+  static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
+    assert (isT2SOImmTwoPartVal(Imm) &&
+            "Immedate cannot be encoded as two part immediate!");
+    // Try a shifter operand as one part
+    unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
+    // If the rest is encodable as an immediate, then return it.
+    if (getT2SOImmVal(V) != -1) return V;
+
+    // Try masking out a splat value first.
+    if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
+      return Imm & 0xff00ff00U;
+
+    // The other splat is all that's left as an option.
+    assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
+    return Imm & 0x00ff00ffU;
+  }
+
+  static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
+    // Mask out the first hunk
+    Imm ^= getT2SOImmTwoPartFirst(Imm);
+    // Return what's left
+    assert (getT2SOImmVal(Imm) != -1 &&
+            "Unable to encode second part of T2 two part SO immediate");
+    return Imm;
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #2
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for most simple load/store instructions.
+  //
+  // addrmode2 := reg +/- reg shop imm
+  // addrmode2 := reg +/- imm12
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 12, the immediate in bits 0-11, and the shift op in 13-15. The
+  // fourth operand 16-17 encodes the index mode.
+  //
+  // If this addressing mode is a frame index (before prolog/epilog insertion
+  // and code rewriting), this operand will have the form:  FI#, reg0, <offs>
+  // with no shift amount for the frame offset.
+  //
+  static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO,
+                                   unsigned IdxMode = 0) {
+    assert(Imm12 < (1 << 12) && "Imm too large!");
+    bool isSub = Opc == sub;
+    return Imm12 | ((int)isSub << 12) | (SO << 13) | (IdxMode << 16) ;
+  }
+  static inline unsigned getAM2Offset(unsigned AM2Opc) {
+    return AM2Opc & ((1 << 12)-1);
+  }
+  static inline AddrOpc getAM2Op(unsigned AM2Opc) {
+    return ((AM2Opc >> 12) & 1) ? sub : add;
+  }
+  static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
+    return (ShiftOpc)((AM2Opc >> 13) & 7);
+  }
+  static inline unsigned getAM2IdxMode(unsigned AM2Opc) {
+    return (AM2Opc >> 16);
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #3
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for sign-extending loads, and load/store-pair instructions.
+  //
+  // addrmode3 := reg +/- reg
+  // addrmode3 := reg +/- imm8
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 8, the immediate in bits 0-7. The fourth operand 9-10 encodes the
+  // index mode.
+
+  /// getAM3Opc - This function encodes the addrmode3 opc field.
+  static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset,
+                                   unsigned IdxMode = 0) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset | (IdxMode << 9);
+  }
+  static inline unsigned char getAM3Offset(unsigned AM3Opc) {
+    return AM3Opc & 0xFF;
+  }
+  static inline AddrOpc getAM3Op(unsigned AM3Opc) {
+    return ((AM3Opc >> 8) & 1) ? sub : add;
+  }
+  static inline unsigned getAM3IdxMode(unsigned AM3Opc) {
+    return (AM3Opc >> 9);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #4
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for load / store multiple instructions.
+  //
+  // addrmode4 := reg, <mode>
+  //
+  // The four modes are:
+  //    IA - Increment after
+  //    IB - Increment before
+  //    DA - Decrement after
+  //    DB - Decrement before
+  // For VFP instructions, only the IA and DB modes are valid.
+
+  static inline AMSubMode getAM4SubMode(unsigned Mode) {
+    return (AMSubMode)(Mode & 0x7);
+  }
+
+  static inline unsigned getAM4ModeImm(AMSubMode SubMode) {
+    return (int)SubMode;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #5
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for coprocessor instructions, such as FP load/stores.
+  //
+  // addrmode5 := reg +/- imm8*4
+  //
+  // The first operand is always a Reg.  The second operand encodes the
+  // operation in bit 8 and the immediate in bits 0-7.
+
+  /// getAM5Opc - This function encodes the addrmode5 opc field.
+  static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset;
+  }
+  static inline unsigned char getAM5Offset(unsigned AM5Opc) {
+    return AM5Opc & 0xFF;
+  }
+  static inline AddrOpc getAM5Op(unsigned AM5Opc) {
+    return ((AM5Opc >> 8) & 1) ? sub : add;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #6
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for NEON load / store instructions.
+  //
+  // addrmode6 := reg with optional alignment
+  //
+  // This is stored in two operands [regaddr, align].  The first is the
+  // address register.  The second operand is the value of the alignment
+  // specifier in bytes or zero if no explicit alignment.
+  // Valid alignments depend on the specific instruction.
+
+  //===--------------------------------------------------------------------===//
+  // NEON Modified Immediates
+  //===--------------------------------------------------------------------===//
+  //
+  // Several NEON instructions (e.g., VMOV) take a "modified immediate"
+  // vector operand, where a small immediate encoded in the instruction
+  // specifies a full NEON vector value.  These modified immediates are
+  // represented here as encoded integers.  The low 8 bits hold the immediate
+  // value; bit 12 holds the "Op" field of the instruction, and bits 11-8 hold
+  // the "Cmode" field of the instruction.  The interfaces below treat the
+  // Op and Cmode values as a single 5-bit value.
+
+  static inline unsigned createNEONModImm(unsigned OpCmode, unsigned Val) {
+    return (OpCmode << 8) | Val;
+  }
+  static inline unsigned getNEONModImmOpCmode(unsigned ModImm) {
+    return (ModImm >> 8) & 0x1f;
+  }
+  static inline unsigned getNEONModImmVal(unsigned ModImm) {
+    return ModImm & 0xff;
+  }
+
+  /// decodeNEONModImm - Decode a NEON modified immediate value into the
+  /// element value and the element size in bits.  (If the element size is
+  /// smaller than the vector, it is splatted into all the elements.)
+  static inline uint64_t decodeNEONModImm(unsigned ModImm, unsigned &EltBits) {
+    unsigned OpCmode = getNEONModImmOpCmode(ModImm);
+    unsigned Imm8 = getNEONModImmVal(ModImm);
+    uint64_t Val = 0;
+
+    if (OpCmode == 0xe) {
+      // 8-bit vector elements
+      Val = Imm8;
+      EltBits = 8;
+    } else if ((OpCmode & 0xc) == 0x8) {
+      // 16-bit vector elements
+      unsigned ByteNum = (OpCmode & 0x6) >> 1;
+      Val = Imm8 << (8 * ByteNum);
+      EltBits = 16;
+    } else if ((OpCmode & 0x8) == 0) {
+      // 32-bit vector elements, zero with one byte set
+      unsigned ByteNum = (OpCmode & 0x6) >> 1;
+      Val = Imm8 << (8 * ByteNum);
+      EltBits = 32;
+    } else if ((OpCmode & 0xe) == 0xc) {
+      // 32-bit vector elements, one byte with low bits set
+      unsigned ByteNum = 1 + (OpCmode & 0x1);
+      Val = (Imm8 << (8 * ByteNum)) | (0xffff >> (8 * (2 - ByteNum)));
+      EltBits = 32;
+    } else if (OpCmode == 0x1e) {
+      // 64-bit vector elements
+      for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+        if ((ModImm >> ByteNum) & 1)
+          Val |= (uint64_t)0xff << (8 * ByteNum);
+      }
+      EltBits = 64;
+    } else {
+      llvm_unreachable("Unsupported NEON immediate");
+    }
+    return Val;
+  }
+
+  AMSubMode getLoadStoreMultipleSubMode(int Opcode);
+
+  //===--------------------------------------------------------------------===//
+  // Floating-point Immediates
+  //
+  static inline float getFPImmFloat(unsigned Imm) {
+    // We expect an 8-bit binary encoding of a floating-point number here.
+    union {
+      uint32_t I;
+      float F;
+    } FPUnion;
+
+    uint8_t Sign = (Imm >> 7) & 0x1;
+    uint8_t Exp = (Imm >> 4) & 0x7;
+    uint8_t Mantissa = Imm & 0xf;
+
+    //   8-bit FP    iEEEE Float Encoding
+    //   abcd efgh   aBbbbbbc defgh000 00000000 00000000
+    //
+    // where B = NOT(b);
+
+    FPUnion.I = 0;
+    FPUnion.I |= Sign << 31;
+    FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+    FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+    FPUnion.I |= (Exp & 0x3) << 23;
+    FPUnion.I |= Mantissa << 19;
+    return FPUnion.F;
+  }
+
+  /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+  /// floating-point value. If the value cannot be represented as an 8-bit
+  /// floating-point value, then return -1.
+  static inline int getFP32Imm(const APInt &Imm) {
+    uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+    int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127;  // -126 to 127
+    int64_t Mantissa = Imm.getZExtValue() & 0x7fffff;  // 23 bits
+
+    // We can handle 4 bits of mantissa.
+    // mantissa = (16+UInt(e:f:g:h))/16.
+    if (Mantissa & 0x7ffff)
+      return -1;
+    Mantissa >>= 19;
+    if ((Mantissa & 0xf) != Mantissa)
+      return -1;
+
+    // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+    if (Exp < -3 || Exp > 4)
+      return -1;
+    Exp = ((Exp+3) & 0x7) ^ 4;
+
+    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+  }
+
+  static inline int getFP32Imm(const APFloat &FPImm) {
+    return getFP32Imm(FPImm.bitcastToAPInt());
+  }
+
+  /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+  /// floating-point value. If the value cannot be represented as an 8-bit
+  /// floating-point value, then return -1.
+  static inline int getFP64Imm(const APInt &Imm) {
+    uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+    int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
+    uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+    // We can handle 4 bits of mantissa.
+    // mantissa = (16+UInt(e:f:g:h))/16.
+    if (Mantissa & 0xffffffffffffULL)
+      return -1;
+    Mantissa >>= 48;
+    if ((Mantissa & 0xf) != Mantissa)
+      return -1;
+
+    // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+    if (Exp < -3 || Exp > 4)
+      return -1;
+    Exp = ((Exp+3) & 0x7) ^ 4;
+
+    return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+  }
+
+  static inline int getFP64Imm(const APFloat &FPImm) {
+    return getFP64Imm(FPImm.bitcastToAPInt());
+  }
+
+} // end namespace ARM_AM
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
new file mode 100644
index 000000000000..e66e98567873
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -0,0 +1,688 @@
+//===-- ARMAsmBackend.cpp - ARM Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class ARMELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  ARMELFObjectWriter(uint8_t OSABI)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_ARM,
+                              /*HasRelocationAddend*/ false) {}
+};
+
+class ARMAsmBackend : public MCAsmBackend {
+  const MCSubtargetInfo* STI;
+  bool isThumbMode;  // Currently emitting Thumb code.
+public:
+  ARMAsmBackend(const Target &T, const StringRef TT)
+    : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
+      isThumbMode(TT.startswith("thumb")) {}
+
+  ~ARMAsmBackend() {
+    delete STI;
+  }
+
+  unsigned getNumFixupKinds() const { return ARM::NumTargetFixupKinds; }
+
+  bool hasNOP() const {
+    return (STI->getFeatureBits() & ARM::HasV6T2Ops) != 0;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[ARM::NumTargetFixupKinds] = {
+// This table *must* be in the order that the fixup_* kinds are defined in
+// ARMFixupKinds.h.
+//
+// Name                      Offset (bits) Size (bits)     Flags
+{ "fixup_arm_ldst_pcrel_12", 0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_ldst_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_pcrel_10_unscaled", 0,        32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_pcrel_10",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_pcrel_10",       0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_thumb_adr_pcrel_10",0,            8,   MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_adr_pcrel_12",  0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_adr_pcrel_12",   0,            32,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_condbranch",    0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbranch",  0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_condbranch",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_uncondbranch",   0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_br",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_uncondbl",      0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_condbl",        0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_blx",           0,            24,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_bl",      0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_blx",     0,            32,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cb",      0,            16,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cp",      0,             8,  MCFixupKindInfo::FKF_IsPCRel |
+                                   MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+{ "fixup_arm_thumb_bcc",     0,             8,  MCFixupKindInfo::FKF_IsPCRel },
+// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
+{ "fixup_arm_movt_hi16",     0,            20,  0 },
+{ "fixup_arm_movw_lo16",     0,            20,  0 },
+{ "fixup_t2_movt_hi16",      0,            20,  0 },
+{ "fixup_t2_movw_lo16",      0,            20,  0 },
+{ "fixup_arm_movt_hi16_pcrel", 0,          20,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_movw_lo16_pcrel", 0,          20,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_movt_hi16_pcrel", 0,           20,  MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_t2_movw_lo16_pcrel", 0,           20,  MCFixupKindInfo::FKF_IsPCRel },
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  /// processFixupValue - Target hook to process the literal value of a fixup
+  /// if necessary.
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         MCValue &Target, uint64_t &Value,
+                         bool &IsResolved);
+
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const;
+
+  bool mayNeedRelaxation(const MCInst &Inst) const;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const;
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+
+  void handleAssemblerFlag(MCAssemblerFlag Flag) {
+    switch (Flag) {
+    default: break;
+    case MCAF_Code16:
+      setIsThumb(true);
+      break;
+    case MCAF_Code32:
+      setIsThumb(false);
+      break;
+    }
+  }
+
+  unsigned getPointerSize() const { return 4; }
+  bool isThumb() const { return isThumbMode; }
+  void setIsThumb(bool it) { isThumbMode = it; }
+};
+} // end anonymous namespace
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+  switch (Op) {
+  default: return Op;
+  case ARM::tBcc:       return ARM::t2Bcc;
+  case ARM::tLDRpciASM: return ARM::t2LDRpci;
+  case ARM::tADR:       return ARM::t2ADR;
+  case ARM::tB:         return ARM::t2B;
+  }
+}
+
+bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  if (getRelaxedOpcode(Inst.getOpcode()) != Inst.getOpcode())
+    return true;
+  return false;
+}
+
+bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                         uint64_t Value,
+                                         const MCRelaxableFragment *DF,
+                                         const MCAsmLayout &Layout) const {
+  switch ((unsigned)Fixup.getKind()) {
+  case ARM::fixup_arm_thumb_br: {
+    // Relaxing tB to t2B. tB has a signed 12-bit displacement with the
+    // low bit being an implied zero. There's an implied +4 offset for the
+    // branch, so we adjust the other way here to determine what's
+    // encodable.
+    //
+    // Relax if the value is too big for a (signed) i8.
+    int64_t Offset = int64_t(Value) - 4;
+    return Offset > 2046 || Offset < -2048;
+  }
+  case ARM::fixup_arm_thumb_bcc: {
+    // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the
+    // low bit being an implied zero. There's an implied +4 offset for the
+    // branch, so we adjust the other way here to determine what's
+    // encodable.
+    //
+    // Relax if the value is too big for a (signed) i8.
+    int64_t Offset = int64_t(Value) - 4;
+    return Offset > 254 || Offset < -256;
+  }
+  case ARM::fixup_thumb_adr_pcrel_10:
+  case ARM::fixup_arm_thumb_cp: {
+    // If the immediate is negative, greater than 1020, or not a multiple
+    // of four, the wide version of the instruction must be used.
+    int64_t Offset = int64_t(Value) - 4;
+    return Offset > 1020 || Offset < 0 || Offset & 3;
+  }
+  }
+  llvm_unreachable("Unexpected fixup kind in fixupNeedsRelaxation()!");
+}
+
+void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+  // Sanity check w/ diagnostic if we get here w/ a bogus instruction.
+  if (RelaxedOp == Inst.getOpcode()) {
+    SmallString<256> Tmp;
+    raw_svector_ostream OS(Tmp);
+    Inst.dump_pretty(OS);
+    OS << "\n";
+    report_fatal_error("unexpected instruction to relax: " + OS.str());
+  }
+
+  // The instructions we're relaxing have (so far) the same operands.
+  // We just need to update to the proper opcode.
+  Res = Inst;
+  Res.setOpcode(RelaxedOp);
+}
+
+bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
+  const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
+  const uint32_t ARMv4_NopEncoding = 0xe1a00000; // using MOV r0,r0
+  const uint32_t ARMv6T2_NopEncoding = 0xe320f000; // NOP
+  if (isThumb()) {
+    const uint16_t nopEncoding = hasNOP() ? Thumb2_16bitNopEncoding
+                                          : Thumb1_16bitNopEncoding;
+    uint64_t NumNops = Count / 2;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->Write16(nopEncoding);
+    if (Count & 1)
+      OW->Write8(0);
+    return true;
+  }
+  // ARM mode
+  const uint32_t nopEncoding = hasNOP() ? ARMv6T2_NopEncoding
+                                        : ARMv4_NopEncoding;
+  uint64_t NumNops = Count / 4;
+  for (uint64_t i = 0; i != NumNops; ++i)
+    OW->Write32(nopEncoding);
+  // FIXME: should this function return false when unable to write exactly
+  // 'Count' bytes with NOP encodings?
+  switch (Count % 4) {
+  default: break; // No leftover bytes to write
+  case 1: OW->Write8(0); break;
+  case 2: OW->Write16(0); break;
+  case 3: OW->Write16(0); OW->Write8(0xa0); break;
+  }
+
+  return true;
+}
+
+static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                                 MCContext *Ctx = NULL) {
+  unsigned Kind = Fixup.getKind();
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+    return Value;
+  case ARM::fixup_arm_movt_hi16:
+    Value >>= 16;
+    // Fallthrough
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+  case ARM::fixup_arm_movw_lo16_pcrel: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned Lo12 = Value & 0x0FFF;
+    // inst{19-16} = Hi4;
+    // inst{11-0} = Lo12;
+    Value = (Hi4 << 16) | (Lo12);
+    return Value;
+  }
+  case ARM::fixup_t2_movt_hi16:
+    Value >>= 16;
+    // Fallthrough
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16_pcrel:  //FIXME: Shouldn't this be shifted like
+                                       // the other hi16 fixup?
+  case ARM::fixup_t2_movw_lo16_pcrel: {
+    unsigned Hi4 = (Value & 0xF000) >> 12;
+    unsigned i = (Value & 0x800) >> 11;
+    unsigned Mid3 = (Value & 0x700) >> 8;
+    unsigned Lo8 = Value & 0x0FF;
+    // inst{19-16} = Hi4;
+    // inst{26} = i;
+    // inst{14-12} = Mid3;
+    // inst{7-0} = Lo8;
+    Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
+    uint64_t swapped = (Value & 0xFFFF0000) >> 16;
+    swapped |= (Value & 0x0000FFFF) << 16;
+    return swapped;
+  }
+  case ARM::fixup_arm_ldst_pcrel_12:
+    // ARM PC-relative values are offset by 8.
+    Value -= 4;
+    // FALLTHROUGH
+  case ARM::fixup_t2_ldst_pcrel_12: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value -= 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    if (Ctx && Value >= 4096)
+      Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10,
+    // but with 16-bit halfwords swapped.
+    if (Kind == ARM::fixup_t2_ldst_pcrel_12) {
+      uint64_t swapped = (Value & 0xFFFF0000) >> 16;
+      swapped |= (Value & 0x0000FFFF) << 16;
+      return swapped;
+    }
+
+    return Value;
+  }
+  case ARM::fixup_thumb_adr_pcrel_10:
+    return ((Value - 4) >> 2) & 0xff;
+  case ARM::fixup_arm_adr_pcrel_12: {
+    // ARM PC-relative values are offset by 8.
+    Value -= 8;
+    unsigned opc = 4; // bits {24-21}. Default to add: 0b0100
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 2; // 0b0010
+    }
+    if (Ctx && ARM_AM::getSOImmVal(Value) == -1)
+      Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    // Encode the immediate and shift the opcode into place.
+    return ARM_AM::getSOImmVal(Value) | (opc << 21);
+  }
+
+  case ARM::fixup_t2_adr_pcrel_12: {
+    Value -= 4;
+    unsigned opc = 0;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      opc = 5;
+    }
+
+    uint32_t out = (opc << 21);
+    out |= (Value & 0x800) << 15;
+    out |= (Value & 0x700) << 4;
+    out |= (Value & 0x0FF);
+
+    uint64_t swapped = (out & 0xFFFF0000) >> 16;
+    swapped |= (out & 0x0000FFFF) << 16;
+    return swapped;
+  }
+
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+    // These values don't encode the low two bits since they're always zero.
+    // Offset by 8 just as above.
+    return 0xffffff & ((Value - 8) >> 2);
+  case ARM::fixup_t2_uncondbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint32_t out = 0;
+    bool I =  Value & 0x800000;
+    bool J1 = Value & 0x400000;
+    bool J2 = Value & 0x200000;
+    J1 ^= I;
+    J2 ^= I;
+
+    out |= I  << 26; // S bit
+    out |= !J1 << 13; // J1 bit
+    out |= !J2 << 11; // J2 bit
+    out |= (Value & 0x1FF800)  << 5; // imm6 field
+    out |= (Value & 0x0007FF);        // imm11 field
+
+    uint64_t swapped = (out & 0xFFFF0000) >> 16;
+    swapped |= (out & 0x0000FFFF) << 16;
+    return swapped;
+  }
+  case ARM::fixup_t2_condbranch: {
+    Value = Value - 4;
+    Value >>= 1; // Low bit is not encoded.
+
+    uint64_t out = 0;
+    out |= (Value & 0x80000) << 7; // S bit
+    out |= (Value & 0x40000) >> 7; // J2 bit
+    out |= (Value & 0x20000) >> 4; // J1 bit
+    out |= (Value & 0x1F800) << 5; // imm6 field
+    out |= (Value & 0x007FF);      // imm11 field
+
+    uint32_t swapped = (out & 0xFFFF0000) >> 16;
+    swapped |= (out & 0x0000FFFF) << 16;
+    return swapped;
+  }
+  case ARM::fixup_arm_thumb_bl: {
+     // The value doesn't encode the low bit (always zero) and is offset by
+     // four. The 32-bit immediate value is encoded as
+     //   imm32 = SignExtend(S:I1:I2:imm10:imm11:0)
+     // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+     // The value is encoded into disjoint bit positions in the destination
+     // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
+     // J = either J1 or J2 bit
+     //
+     //   BL:  xxxxxSIIIIIIIIII xxJxJIIIIIIIIIII
+     //
+     // Note that the halfwords are stored high first, low second; so we need
+     // to transpose the fixup value here to map properly.
+     uint32_t offset = (Value - 4) >> 1;
+     uint32_t signBit = (offset & 0x800000) >> 23;
+     uint32_t I1Bit = (offset & 0x400000) >> 22;
+     uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+     uint32_t I2Bit = (offset & 0x200000) >> 21;
+     uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+     uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
+     uint32_t imm11Bits = (offset & 0x000007FF);
+ 
+     uint32_t Binary = 0;
+     uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
+     uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+                           (uint16_t)imm11Bits);
+     Binary |= secondHalf << 16;
+     Binary |= firstHalf;
+     return Binary;
+
+  }
+  case ARM::fixup_arm_thumb_blx: {
+     // The value doesn't encode the low two bits (always zero) and is offset by
+     // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
+     //   imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
+     // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
+     // The value is encoded into disjoint bit positions in the destination 
+     // opcode. x = unchanged, I = immediate value bit, S = sign extension bit, 
+     // J = either J1 or J2 bit, 0 = zero.
+     //
+     //   BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0
+     //
+     // Note that the halfwords are stored high first, low second; so we need
+     // to transpose the fixup value here to map properly.
+     uint32_t offset = (Value - 2) >> 2;
+     uint32_t signBit = (offset & 0x400000) >> 22;
+     uint32_t I1Bit = (offset & 0x200000) >> 21;
+     uint32_t J1Bit = (I1Bit ^ 0x1) ^ signBit;
+     uint32_t I2Bit = (offset & 0x100000) >> 20;
+     uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
+     uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
+     uint32_t imm10LBits = (offset & 0x3FF);
+ 
+     uint32_t Binary = 0;
+     uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
+     uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | 
+                           ((uint16_t)imm10LBits) << 1);
+     Binary |= secondHalf << 16;
+     Binary |= firstHalf;
+     return Binary;
+  }
+  case ARM::fixup_arm_thumb_cp:
+    // Offset by 4, and don't encode the low two bits. Two bytes of that
+    // 'off by 4' is implicitly handled by the half-word ordering of the
+    // Thumb encoding, so we only need to adjust by 2 here.
+    return ((Value - 2) >> 2) & 0xff;
+  case ARM::fixup_arm_thumb_cb: {
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    uint32_t Binary = (Value - 4) >> 1;
+    return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3);
+  }
+  case ARM::fixup_arm_thumb_br:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    return ((Value - 4) >> 1) & 0x7ff;
+  case ARM::fixup_arm_thumb_bcc:
+    // Offset by 4 and don't encode the lower bit, which is always 0.
+    return ((Value - 4) >> 1) & 0xff;
+  case ARM::fixup_arm_pcrel_10_unscaled: {
+    Value = Value - 8; // ARM fixups offset by an additional word and don't
+                       // need to adjust for the half-word ordering.
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8].
+    if (Ctx && Value >= 256)
+      Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    Value = (Value & 0xf) | ((Value & 0xf0) << 4);
+    return Value | (isAdd << 23);
+  }
+  case ARM::fixup_arm_pcrel_10:
+    Value = Value - 4; // ARM fixups offset by an additional word and don't
+                       // need to adjust for the half-word ordering.
+    // Fall through.
+  case ARM::fixup_t2_pcrel_10: {
+    // Offset by 4, adjusted by two due to the half-word ordering of thumb.
+    Value = Value - 4;
+    bool isAdd = true;
+    if ((int64_t)Value < 0) {
+      Value = -Value;
+      isAdd = false;
+    }
+    // These values don't encode the low two bits since they're always zero.
+    Value >>= 2;
+    if (Ctx && Value >= 256)
+      Ctx->FatalError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    Value |= isAdd << 23;
+
+    // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords
+    // swapped.
+    if (Kind == ARM::fixup_t2_pcrel_10) {
+      uint32_t swapped = (Value & 0xFFFF0000) >> 16;
+      swapped |= (Value & 0x0000FFFF) << 16;
+      return swapped;
+    }
+
+    return Value;
+  }
+  }
+}
+
+void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
+                                      const MCAsmLayout &Layout,
+                                      const MCFixup &Fixup,
+                                      const MCFragment *DF,
+                                      MCValue &Target, uint64_t &Value,
+                                      bool &IsResolved) {
+  const MCSymbolRefExpr *A = Target.getSymA();
+  // Some fixups to thumb function symbols need the low bit (thumb bit)
+  // twiddled.
+  if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_t2_ldst_pcrel_12 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_arm_adr_pcrel_12 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 &&
+      (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) {
+    if (A) {
+      const MCSymbol &Sym = A->getSymbol().AliasedSymbol();
+      if (Asm.isThumbFunc(&Sym))
+        Value |= 1;
+    }
+  }
+  // We must always generate a relocation for BL/BLX instructions if we have
+  // a symbol to reference, as the linker relies on knowing the destination
+  // symbol's thumb-ness to get interworking right.
+  if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_blx ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl ||
+            (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl))
+    IsResolved = false;
+
+  // Try to get the encoded value for the fixup as-if we're mapping it into
+  // the instruction. This allows adjustFixupValue() to issue a diagnostic
+  // if the value aren't invalid.
+  (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
+}
+
+/// getFixupKindNumBytes - The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+
+  case FK_Data_1:
+  case ARM::fixup_arm_thumb_bcc:
+  case ARM::fixup_arm_thumb_cp:
+  case ARM::fixup_thumb_adr_pcrel_10:
+    return 1;
+
+  case FK_Data_2:
+  case ARM::fixup_arm_thumb_br:
+  case ARM::fixup_arm_thumb_cb:
+    return 2;
+
+  case ARM::fixup_arm_pcrel_10_unscaled:
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+    return 3;
+
+  case FK_Data_4:
+  case ARM::fixup_t2_ldst_pcrel_12:
+  case ARM::fixup_t2_condbranch:
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_t2_pcrel_10:
+  case ARM::fixup_t2_adr_pcrel_12:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+  case ARM::fixup_arm_movw_lo16_pcrel:
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
+  case ARM::fixup_t2_movw_lo16_pcrel:
+    return 4;
+  }
+}
+
+void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                               unsigned DataSize, uint64_t Value) const {
+  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  Value = adjustFixupValue(Fixup, Value);
+  if (!Value) return;           // Doesn't change encoding.
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the bits from
+  // the fixup value. The Value has been "split up" into the appropriate
+  // bitfields above.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+namespace {
+
+// FIXME: This should be in a separate file.
+// ELF is an ELF of course...
+class ELFARMAsmBackend : public ARMAsmBackend {
+public:
+  uint8_t OSABI;
+  ELFARMAsmBackend(const Target &T, const StringRef TT,
+                   uint8_t _OSABI)
+    : ARMAsmBackend(T, TT), OSABI(_OSABI) { }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createARMELFObjectWriter(OS, OSABI);
+  }
+};
+
+// FIXME: This should be in a separate file.
+class DarwinARMAsmBackend : public ARMAsmBackend {
+public:
+  const object::mach::CPUSubtypeARM Subtype;
+  DarwinARMAsmBackend(const Target &T, const StringRef TT,
+                      object::mach::CPUSubtypeARM st)
+    : ARMAsmBackend(T, TT), Subtype(st) {
+      HasDataInCodeSupport = true;
+    }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
+                                     object::mach::CTM_ARM,
+                                     Subtype);
+  }
+
+  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+    return false;
+  }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin()) {
+    object::mach::CPUSubtypeARM CS =
+      StringSwitch<object::mach::CPUSubtypeARM>(TheTriple.getArchName())
+      .Cases("armv4t", "thumbv4t", object::mach::CSARM_V4T)
+      .Cases("armv5e", "thumbv5e",object::mach::CSARM_V5TEJ)
+      .Cases("armv6", "thumbv6", object::mach::CSARM_V6)
+      .Cases("armv6m", "thumbv6m", object::mach::CSARM_V6M)
+      .Cases("armv7em", "thumbv7em", object::mach::CSARM_V7EM)
+      .Cases("armv7f", "thumbv7f", object::mach::CSARM_V7F)
+      .Cases("armv7k", "thumbv7k", object::mach::CSARM_V7K)
+      .Cases("armv7m", "thumbv7m", object::mach::CSARM_V7M)
+      .Cases("armv7s", "thumbv7s", object::mach::CSARM_V7S)
+      .Default(object::mach::CSARM_V7);
+
+    return new DarwinARMAsmBackend(T, TT, CS);
+  }
+
+  if (TheTriple.isOSWindows())
+    assert(0 && "Windows not supported on ARM");
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
+  return new ELFARMAsmBackend(T, TT, OSABI);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
new file mode 100644
index 000000000000..de48a0e0f30a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -0,0 +1,421 @@
+//===-- ARMBaseInfo.h - Top level definitions for ARM -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the ARM target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEINFO_H
+#define ARMBASEINFO_H
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+// Enums corresponding to ARM condition codes
+namespace ARMCC {
+  // The CondCodes constants map directly to the 4-bit encoding of the
+  // condition field for predicated instructions.
+  enum CondCodes { // Meaning (integer)          Meaning (floating-point)
+    EQ,            // Equal                      Equal
+    NE,            // Not equal                  Not equal, or unordered
+    HS,            // Carry set                  >, ==, or unordered
+    LO,            // Carry clear                Less than
+    MI,            // Minus, negative            Less than
+    PL,            // Plus, positive or zero     >, ==, or unordered
+    VS,            // Overflow                   Unordered
+    VC,            // No overflow                Not unordered
+    HI,            // Unsigned higher            Greater than, or unordered
+    LS,            // Unsigned lower or same     Less than or equal
+    GE,            // Greater than or equal      Greater than or equal
+    LT,            // Less than                  Less than, or unordered
+    GT,            // Greater than               Greater than
+    LE,            // Less than or equal         <, ==, or unordered
+    AL             // Always (unconditional)     Always (unconditional)
+  };
+
+  inline static CondCodes getOppositeCondition(CondCodes CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case EQ: return NE;
+    case NE: return EQ;
+    case HS: return LO;
+    case LO: return HS;
+    case MI: return PL;
+    case PL: return MI;
+    case VS: return VC;
+    case VC: return VS;
+    case HI: return LS;
+    case LS: return HI;
+    case GE: return LT;
+    case LT: return GE;
+    case GT: return LE;
+    case LE: return GT;
+    }
+  }
+} // namespace ARMCC
+
+inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
+  switch (CC) {
+  case ARMCC::EQ:  return "eq";
+  case ARMCC::NE:  return "ne";
+  case ARMCC::HS:  return "hs";
+  case ARMCC::LO:  return "lo";
+  case ARMCC::MI:  return "mi";
+  case ARMCC::PL:  return "pl";
+  case ARMCC::VS:  return "vs";
+  case ARMCC::VC:  return "vc";
+  case ARMCC::HI:  return "hi";
+  case ARMCC::LS:  return "ls";
+  case ARMCC::GE:  return "ge";
+  case ARMCC::LT:  return "lt";
+  case ARMCC::GT:  return "gt";
+  case ARMCC::LE:  return "le";
+  case ARMCC::AL:  return "al";
+  }
+  llvm_unreachable("Unknown condition code");
+}
+
+namespace ARM_PROC {
+  enum IMod {
+    IE = 2,
+    ID = 3
+  };
+
+  enum IFlags {
+    F = 1,
+    I = 2,
+    A = 4
+  };
+
+  inline static const char *IFlagsToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown iflags operand");
+    case F: return "f";
+    case I: return "i";
+    case A: return "a";
+    }
+  }
+
+  inline static const char *IModToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown imod operand");
+    case IE: return "ie";
+    case ID: return "id";
+    }
+  }
+}
+
+namespace ARM_MB {
+  // The Memory Barrier Option constants map directly to the 4-bit encoding of
+  // the option field for memory barrier operations.
+  enum MemBOpt {
+    RESERVED_0 = 0,
+    RESERVED_1 = 1,
+    OSHST = 2,
+    OSH   = 3,
+    RESERVED_4 = 4,
+    RESERVED_5 = 5,
+    NSHST = 6,
+    NSH   = 7,
+    RESERVED_8 = 8,
+    RESERVED_9 = 9,
+    ISHST = 10,
+    ISH   = 11,
+    RESERVED_12 = 12,
+    RESERVED_13 = 13,
+    ST    = 14,
+    SY    = 15
+  };
+
+  inline static const char *MemBOptToString(unsigned val) {
+    switch (val) {
+    default: llvm_unreachable("Unknown memory operation");
+    case SY:    return "sy";
+    case ST:    return "st";
+    case RESERVED_13: return "#0xd";
+    case RESERVED_12: return "#0xc";
+    case ISH:   return "ish";
+    case ISHST: return "ishst";
+    case RESERVED_9: return "#0x9";
+    case RESERVED_8: return "#0x8";
+    case NSH:   return "nsh";
+    case NSHST: return "nshst";
+    case RESERVED_5: return "#0x5";
+    case RESERVED_4: return "#0x4";
+    case OSH:   return "osh";
+    case OSHST: return "oshst";
+    case RESERVED_1: return "#0x1";
+    case RESERVED_0: return "#0x0";
+    }
+  }
+} // namespace ARM_MB
+
+/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
+///
+static inline bool isARMLowRegister(unsigned Reg) {
+  using namespace ARM;
+  switch (Reg) {
+  case R0:  case R1:  case R2:  case R3:
+  case R4:  case R5:  case R6:  case R7:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+
+  /// ARM Index Modes
+  enum IndexMode {
+    IndexModeNone  = 0,
+    IndexModePre   = 1,
+    IndexModePost  = 2,
+    IndexModeUpd   = 3
+  };
+
+  /// ARM Addressing Modes
+  enum AddrMode {
+    AddrModeNone    = 0,
+    AddrMode1       = 1,
+    AddrMode2       = 2,
+    AddrMode3       = 3,
+    AddrMode4       = 4,
+    AddrMode5       = 5,
+    AddrMode6       = 6,
+    AddrModeT1_1    = 7,
+    AddrModeT1_2    = 8,
+    AddrModeT1_4    = 9,
+    AddrModeT1_s    = 10, // i8 * 4 for pc and sp relative data
+    AddrModeT2_i12  = 11,
+    AddrModeT2_i8   = 12,
+    AddrModeT2_so   = 13,
+    AddrModeT2_pc   = 14, // +/- i12 for pc relative data
+    AddrModeT2_i8s4 = 15, // i8 * 4
+    AddrMode_i12    = 16
+  };
+
+  inline static const char *AddrModeToString(AddrMode addrmode) {
+    switch (addrmode) {
+    case AddrModeNone:    return "AddrModeNone";
+    case AddrMode1:       return "AddrMode1";
+    case AddrMode2:       return "AddrMode2";
+    case AddrMode3:       return "AddrMode3";
+    case AddrMode4:       return "AddrMode4";
+    case AddrMode5:       return "AddrMode5";
+    case AddrMode6:       return "AddrMode6";
+    case AddrModeT1_1:    return "AddrModeT1_1";
+    case AddrModeT1_2:    return "AddrModeT1_2";
+    case AddrModeT1_4:    return "AddrModeT1_4";
+    case AddrModeT1_s:    return "AddrModeT1_s";
+    case AddrModeT2_i12:  return "AddrModeT2_i12";
+    case AddrModeT2_i8:   return "AddrModeT2_i8";
+    case AddrModeT2_so:   return "AddrModeT2_so";
+    case AddrModeT2_pc:   return "AddrModeT2_pc";
+    case AddrModeT2_i8s4: return "AddrModeT2_i8s4";
+    case AddrMode_i12:    return "AddrMode_i12";
+    }
+  }
+
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // ARM Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_LO16 - On a symbol operand, this represents a relocation containing
+    /// lower 16 bit of the address. Used only via movw instruction.
+    MO_LO16,
+
+    /// MO_HI16 - On a symbol operand, this represents a relocation containing
+    /// higher 16 bit of the address. Used only via movt instruction.
+    MO_HI16,
+
+    /// MO_LO16_NONLAZY - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
+    /// i.e. "FOO$non_lazy_ptr".
+    /// Used only via movw instruction.
+    MO_LO16_NONLAZY,
+
+    /// MO_HI16_NONLAZY - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the non-lazy-ptr indirect symbol,
+    /// i.e. "FOO$non_lazy_ptr". Used only via movt instruction.
+    MO_HI16_NONLAZY,
+
+    /// MO_LO16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the PC relative address of the
+    /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
+    /// Used only via movw instruction.
+    MO_LO16_NONLAZY_PIC,
+
+    /// MO_HI16_NONLAZY_PIC - On a symbol operand "FOO", this represents a
+    /// relocation containing lower 16 bit of the PC relative address of the
+    /// non-lazy-ptr indirect symbol, i.e. "FOO$non_lazy_ptr - LABEL".
+    /// Used only via movt instruction.
+    MO_HI16_NONLAZY_PIC,
+
+    /// MO_PLT - On a symbol operand, this represents an ELF PLT reference on a
+    /// call operand.
+    MO_PLT
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction Flags.
+
+    //===------------------------------------------------------------------===//
+    // This four-bit field describes the addressing mode used.
+    AddrModeMask  = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h
+
+    // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load
+    // and store ops only.  Generic "updating" flag is used for ld/st multiple.
+    // The index mode enums are declared in ARMBaseInfo.h
+    IndexModeShift = 5,
+    IndexModeMask  = 3 << IndexModeShift,
+
+    //===------------------------------------------------------------------===//
+    // Instruction encoding formats.
+    //
+    FormShift     = 7,
+    FormMask      = 0x3f << FormShift,
+
+    // Pseudo instructions
+    Pseudo        = 0  << FormShift,
+
+    // Multiply instructions
+    MulFrm        = 1  << FormShift,
+
+    // Branch instructions
+    BrFrm         = 2  << FormShift,
+    BrMiscFrm     = 3  << FormShift,
+
+    // Data Processing instructions
+    DPFrm         = 4  << FormShift,
+    DPSoRegFrm    = 5  << FormShift,
+
+    // Load and Store
+    LdFrm         = 6  << FormShift,
+    StFrm         = 7  << FormShift,
+    LdMiscFrm     = 8  << FormShift,
+    StMiscFrm     = 9  << FormShift,
+    LdStMulFrm    = 10 << FormShift,
+
+    LdStExFrm     = 11 << FormShift,
+
+    // Miscellaneous arithmetic instructions
+    ArithMiscFrm  = 12 << FormShift,
+    SatFrm        = 13 << FormShift,
+
+    // Extend instructions
+    ExtFrm        = 14 << FormShift,
+
+    // VFP formats
+    VFPUnaryFrm   = 15 << FormShift,
+    VFPBinaryFrm  = 16 << FormShift,
+    VFPConv1Frm   = 17 << FormShift,
+    VFPConv2Frm   = 18 << FormShift,
+    VFPConv3Frm   = 19 << FormShift,
+    VFPConv4Frm   = 20 << FormShift,
+    VFPConv5Frm   = 21 << FormShift,
+    VFPLdStFrm    = 22 << FormShift,
+    VFPLdStMulFrm = 23 << FormShift,
+    VFPMiscFrm    = 24 << FormShift,
+
+    // Thumb format
+    ThumbFrm      = 25 << FormShift,
+
+    // Miscelleaneous format
+    MiscFrm       = 26 << FormShift,
+
+    // NEON formats
+    NGetLnFrm     = 27 << FormShift,
+    NSetLnFrm     = 28 << FormShift,
+    NDupFrm       = 29 << FormShift,
+    NLdStFrm      = 30 << FormShift,
+    N1RegModImmFrm= 31 << FormShift,
+    N2RegFrm      = 32 << FormShift,
+    NVCVTFrm      = 33 << FormShift,
+    NVDupLnFrm    = 34 << FormShift,
+    N2RegVShLFrm  = 35 << FormShift,
+    N2RegVShRFrm  = 36 << FormShift,
+    N3RegFrm      = 37 << FormShift,
+    N3RegVShFrm   = 38 << FormShift,
+    NVExtFrm      = 39 << FormShift,
+    NVMulSLFrm    = 40 << FormShift,
+    NVTBLFrm      = 41 << FormShift,
+
+    //===------------------------------------------------------------------===//
+    // Misc flags.
+
+    // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+    // it doesn't have a Rn operand.
+    UnaryDP       = 1 << 13,
+
+    // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+    // a 16-bit Thumb instruction if certain conditions are met.
+    Xform16Bit    = 1 << 14,
+
+    // ThumbArithFlagSetting - The instruction is a 16-bit flag setting Thumb
+    // instruction. Used by the parser to determine whether to require the 'S'
+    // suffix on the mnemonic (when not in an IT block) or preclude it (when
+    // in an IT block).
+    ThumbArithFlagSetting = 1 << 18,
+
+    //===------------------------------------------------------------------===//
+    // Code domain.
+    DomainShift   = 15,
+    DomainMask    = 7 << DomainShift,
+    DomainGeneral = 0 << DomainShift,
+    DomainVFP     = 1 << DomainShift,
+    DomainNEON    = 2 << DomainShift,
+    DomainNEONA8  = 4 << DomainShift,
+
+    //===------------------------------------------------------------------===//
+    // Field shifts - such shifts are used to set field while generating
+    // machine instructions.
+    //
+    // FIXME: This list will need adjusting/fixing as the MC code emitter
+    // takes shape and the ARMCodeEmitter.cpp bits go away.
+    ShiftTypeShift = 4,
+
+    M_BitShift     = 5,
+    ShiftImmShift  = 5,
+    ShiftShift     = 7,
+    N_BitShift     = 7,
+    ImmHiShift     = 8,
+    SoRotImmShift  = 8,
+    RegRsShift     = 8,
+    ExtRotImmShift = 10,
+    RegRdLoShift   = 12,
+    RegRdShift     = 12,
+    RegRdHiShift   = 16,
+    RegRnShift     = 16,
+    S_BitShift     = 20,
+    W_BitShift     = 21,
+    AM3_I_BitShift = 22,
+    D_BitShift     = 22,
+    U_BitShift     = 23,
+    P_BitShift     = 24,
+    I_BitShift     = 25,
+    CondShift      = 28
+  };
+
+} // end namespace ARMII
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
new file mode 100644
index 000000000000..f98bbd204c7a
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -0,0 +1,289 @@
+//===-- ARMELFObjectWriter.cpp - ARM ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+  class ARMELFObjectWriter : public MCELFObjectTargetWriter {
+    enum { DefaultEABIVersion = 0x05000000U };
+    unsigned GetRelocTypeInner(const MCValue &Target,
+                               const MCFixup &Fixup,
+                               bool IsPCRel) const;
+
+
+  public:
+    ARMELFObjectWriter(uint8_t OSABI);
+
+    virtual ~ARMELFObjectWriter();
+
+    virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                                  bool IsPCRel, bool IsRelocWithSymbol,
+                                  int64_t Addend) const;
+    virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
+                                   const MCValue &Target,
+                                   const MCFragment &F,
+                                   const MCFixup &Fixup,
+                                   bool IsPCRel) const;
+  };
+}
+
+ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
+  : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI,
+                            ELF::EM_ARM,
+                            /*HasRelocationAddend*/ false) {}
+
+ARMELFObjectWriter::~ARMELFObjectWriter() {}
+
+// In ARM, _MergedGlobals and other most symbols get emitted directly.
+// I.e. not as an offset to a section symbol.
+// This code is an approximation of what ARM/gcc does.
+
+STATISTIC(PCRelCount, "Total number of PIC Relocations");
+STATISTIC(NonPCRelCount, "Total number of non-PIC relocations");
+
+const MCSymbol *ARMELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
+                                                   const MCValue &Target,
+                                                   const MCFragment &F,
+                                                   const MCFixup &Fixup,
+                                                   bool IsPCRel) const {
+  const MCSymbol &Symbol = Target.getSymA()->getSymbol().AliasedSymbol();
+  bool EmitThisSym = false;
+
+  const MCSectionELF &Section =
+    static_cast<const MCSectionELF&>(Symbol.getSection());
+  bool InNormalSection = true;
+  unsigned RelocType = 0;
+  RelocType = GetRelocTypeInner(Target, Fixup, IsPCRel);
+
+  DEBUG(
+      const MCSymbolRefExpr::VariantKind Kind = Target.getSymA()->getKind();
+      MCSymbolRefExpr::VariantKind Kind2;
+      Kind2 = Target.getSymB() ?  Target.getSymB()->getKind() :
+        MCSymbolRefExpr::VK_None;
+      dbgs() << "considering symbol "
+        << Section.getSectionName() << "/"
+        << Symbol.getName() << "/"
+        << " Rel:" << (unsigned)RelocType
+        << " Kind: " << (int)Kind << "/" << (int)Kind2
+        << " Tmp:"
+        << Symbol.isAbsolute() << "/" << Symbol.isDefined() << "/"
+        << Symbol.isVariable() << "/" << Symbol.isTemporary()
+        << " Counts:" << PCRelCount << "/" << NonPCRelCount << "\n");
+
+  if (IsPCRel) { ++PCRelCount;
+    switch (RelocType) {
+    default:
+      // Most relocation types are emitted as explicit symbols
+      InNormalSection =
+        StringSwitch<bool>(Section.getSectionName())
+        .Case(".data.rel.ro.local", false)
+        .Case(".data.rel", false)
+        .Case(".bss", false)
+        .Default(true);
+      EmitThisSym = true;
+      break;
+    case ELF::R_ARM_ABS32:
+      // But things get strange with R_ARM_ABS32
+      // In this case, most things that go in .rodata show up
+      // as section relative relocations
+      InNormalSection =
+        StringSwitch<bool>(Section.getSectionName())
+        .Case(".data.rel.ro.local", false)
+        .Case(".data.rel", false)
+        .Case(".rodata", false)
+        .Case(".bss", false)
+        .Default(true);
+      EmitThisSym = false;
+      break;
+    }
+  } else {
+    NonPCRelCount++;
+    InNormalSection =
+      StringSwitch<bool>(Section.getSectionName())
+      .Case(".data.rel.ro.local", false)
+      .Case(".rodata", false)
+      .Case(".data.rel", false)
+      .Case(".bss", false)
+      .Default(true);
+
+    switch (RelocType) {
+    default: EmitThisSym = true; break;
+    case ELF::R_ARM_ABS32: EmitThisSym = false; break;
+    case ELF::R_ARM_PREL31: EmitThisSym = false; break;
+    }
+  }
+
+  if (EmitThisSym)
+    return &Symbol;
+  if (! Symbol.isTemporary() && InNormalSection) {
+    return &Symbol;
+  }
+  return NULL;
+}
+
+// Need to examine the Fixup when determining whether to 
+// emit the relocation as an explicit symbol or as a section relative
+// offset
+unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel,
+                                          bool IsRelocWithSymbol,
+                                          int64_t Addend) const {
+  return GetRelocTypeInner(Target, Fixup, IsPCRel);
+}
+
+unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const  {
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+  unsigned Type = 0;
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    default: llvm_unreachable("Unimplemented");
+    case FK_Data_4:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_REL32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TLSGD:
+        llvm_unreachable("unimplemented");
+      case MCSymbolRefExpr::VK_ARM_GOTTPOFF:
+        Type = ELF::R_ARM_TLS_IE32;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_blx:
+    case ARM::fixup_arm_uncondbl:
+      switch (Modifier) {
+      case MCSymbolRefExpr::VK_ARM_PLT:
+        Type = ELF::R_ARM_PLT32;
+        break;
+      default:
+        Type = ELF::R_ARM_CALL;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_condbl:
+    case ARM::fixup_arm_condbranch:
+    case ARM::fixup_arm_uncondbranch:
+      Type = ELF::R_ARM_JUMP24;
+      break;
+    case ARM::fixup_t2_condbranch:
+    case ARM::fixup_t2_uncondbranch:
+      Type = ELF::R_ARM_THM_JUMP24;
+      break;
+    case ARM::fixup_arm_movt_hi16:
+    case ARM::fixup_arm_movt_hi16_pcrel:
+      Type = ELF::R_ARM_MOVT_PREL;
+      break;
+    case ARM::fixup_arm_movw_lo16:
+    case ARM::fixup_arm_movw_lo16_pcrel:
+      Type = ELF::R_ARM_MOVW_PREL_NC;
+      break;
+    case ARM::fixup_t2_movt_hi16:
+    case ARM::fixup_t2_movt_hi16_pcrel:
+      Type = ELF::R_ARM_THM_MOVT_PREL;
+      break;
+    case ARM::fixup_t2_movw_lo16:
+    case ARM::fixup_t2_movw_lo16_pcrel:
+      Type = ELF::R_ARM_THM_MOVW_PREL_NC;
+      break;
+    case ARM::fixup_arm_thumb_bl:
+    case ARM::fixup_arm_thumb_blx:
+      Type = ELF::R_ARM_THM_CALL;
+      break;
+    }
+  } else {
+    switch ((unsigned)Fixup.getKind()) {
+    default: llvm_unreachable("invalid fixup kind!");
+    case FK_Data_4:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_ARM_NONE:
+        Type = ELF::R_ARM_NONE;
+        break;
+      case MCSymbolRefExpr::VK_ARM_GOT:
+        Type = ELF::R_ARM_GOT_BREL;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TLSGD:
+        Type = ELF::R_ARM_TLS_GD32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TPOFF:
+        Type = ELF::R_ARM_TLS_LE32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_GOTTPOFF:
+        Type = ELF::R_ARM_TLS_IE32;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_ABS32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_GOTOFF:
+        Type = ELF::R_ARM_GOTOFF32;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TARGET1:
+        Type = ELF::R_ARM_TARGET1;
+        break;
+      case MCSymbolRefExpr::VK_ARM_TARGET2:
+        Type = ELF::R_ARM_TARGET2;
+        break;
+      case MCSymbolRefExpr::VK_ARM_PREL31:
+        Type = ELF::R_ARM_PREL31;
+        break;
+      }
+      break;
+    case ARM::fixup_arm_ldst_pcrel_12:
+    case ARM::fixup_arm_pcrel_10:
+    case ARM::fixup_arm_adr_pcrel_12:
+    case ARM::fixup_arm_thumb_bl:
+    case ARM::fixup_arm_thumb_cb:
+    case ARM::fixup_arm_thumb_cp:
+    case ARM::fixup_arm_thumb_br:
+      llvm_unreachable("Unimplemented");
+    case ARM::fixup_arm_condbranch:
+    case ARM::fixup_arm_uncondbranch:
+      Type = ELF::R_ARM_JUMP24;
+      break;
+    case ARM::fixup_arm_movt_hi16:
+      Type = ELF::R_ARM_MOVT_ABS;
+      break;
+    case ARM::fixup_arm_movw_lo16:
+      Type = ELF::R_ARM_MOVW_ABS_NC;
+      break;
+    case ARM::fixup_t2_movt_hi16:
+      Type = ELF::R_ARM_THM_MOVT_ABS;
+      break;
+    case ARM::fixup_t2_movw_lo16:
+      Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+      break;
+    }
+  }
+
+  return Type;
+}
+
+MCObjectWriter *llvm::createARMELFObjectWriter(raw_ostream &OS,
+                                               uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new ARMELFObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
new file mode 100644
index 000000000000..418971df3292
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -0,0 +1,418 @@
+//===- lib/MC/ARMELFStreamer.cpp - ELF Object Output for ARM --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits ARM ELF .o object files. Different
+// from generic ELF streamer in emitting mapping symbols ($a, $t and $d) to
+// delimit regions of data and code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMUnwindOp.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
+/// the appropriate points in the object files. These symbols are defined in the
+/// ARM ELF ABI: infocenter.arm.com/help/topic/com.arm.../IHI0044D_aaelf.pdf.
+///
+/// In brief: $a, $t or $d should be emitted at the start of each contiguous
+/// region of ARM code, Thumb code or data in a section. In practice, this
+/// emission does not rely on explicit assembler directives but on inherent
+/// properties of the directives doing the emission (e.g. ".byte" is data, "add
+/// r0, r0, r0" an instruction).
+///
+/// As a result this system is orthogonal to the DataRegion infrastructure used
+/// by MachO. Beware!
+class ARMELFStreamer : public MCELFStreamer {
+public:
+  ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+                 MCCodeEmitter *Emitter, bool IsThumb)
+      : MCELFStreamer(SK_ARMELFStreamer, Context, TAB, OS, Emitter),
+        IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None), ExTab(0),
+        FnStart(0), Personality(0), CantUnwind(false) {}
+
+  ~ARMELFStreamer() {}
+
+  // ARM exception handling directives
+  virtual void EmitFnStart();
+  virtual void EmitFnEnd();
+  virtual void EmitCantUnwind();
+  virtual void EmitPersonality(const MCSymbol *Per);
+  virtual void EmitHandlerData();
+  virtual void EmitSetFP(unsigned NewFpReg,
+                         unsigned NewSpReg,
+                         int64_t Offset = 0);
+  virtual void EmitPad(int64_t Offset);
+  virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                           bool isVector);
+
+  virtual void ChangeSection(const MCSection *Section) {
+    // We have to keep track of the mapping symbol state of any sections we
+    // use. Each one should start off as EMS_None, which is provided as the
+    // default constructor by DenseMap::lookup.
+    LastMappingSymbols[getPreviousSection()] = LastEMS;
+    LastEMS = LastMappingSymbols.lookup(Section);
+
+    MCELFStreamer::ChangeSection(Section);
+  }
+
+  /// This function is the one used to emit instruction data into the ELF
+  /// streamer. We override it to add the appropriate mapping symbol if
+  /// necessary.
+  virtual void EmitInstruction(const MCInst& Inst) {
+    if (IsThumb)
+      EmitThumbMappingSymbol();
+    else
+      EmitARMMappingSymbol();
+
+    MCELFStreamer::EmitInstruction(Inst);
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
+  /// necessary.
+  virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitBytes(Data, AddrSpace);
+  }
+
+  /// This is one of the functions used to emit data into an ELF section, so the
+  /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
+  /// necessary.
+  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                             unsigned AddrSpace) {
+    EmitDataMappingSymbol();
+    MCELFStreamer::EmitValueImpl(Value, Size, AddrSpace);
+  }
+
+  virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {
+    MCELFStreamer::EmitAssemblerFlag(Flag);
+
+    switch (Flag) {
+    case MCAF_SyntaxUnified:
+      return; // no-op here.
+    case MCAF_Code16:
+      IsThumb = true;
+      return; // Change to Thumb mode
+    case MCAF_Code32:
+      IsThumb = false;
+      return; // Change to ARM mode
+    case MCAF_Code64:
+      return;
+    case MCAF_SubsectionsViaSymbols:
+      return;
+    }
+  }
+
+  static bool classof(const MCStreamer *S) {
+    return S->getKind() == SK_ARMELFStreamer;
+  }
+
+private:
+  enum ElfMappingSymbol {
+    EMS_None,
+    EMS_ARM,
+    EMS_Thumb,
+    EMS_Data
+  };
+
+  void EmitDataMappingSymbol() {
+    if (LastEMS == EMS_Data) return;
+    EmitMappingSymbol("$d");
+    LastEMS = EMS_Data;
+  }
+
+  void EmitThumbMappingSymbol() {
+    if (LastEMS == EMS_Thumb) return;
+    EmitMappingSymbol("$t");
+    LastEMS = EMS_Thumb;
+  }
+
+  void EmitARMMappingSymbol() {
+    if (LastEMS == EMS_ARM) return;
+    EmitMappingSymbol("$a");
+    LastEMS = EMS_ARM;
+  }
+
+  void EmitMappingSymbol(StringRef Name) {
+    MCSymbol *Start = getContext().CreateTempSymbol();
+    EmitLabel(Start);
+
+    MCSymbol *Symbol =
+      getContext().GetOrCreateSymbol(Name + "." +
+                                     Twine(MappingSymbolCounter++));
+
+    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
+    MCELF::SetType(SD, ELF::STT_NOTYPE);
+    MCELF::SetBinding(SD, ELF::STB_LOCAL);
+    SD.setExternal(false);
+    Symbol->setSection(*getCurrentSection());
+
+    const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
+    Symbol->setVariableValue(Value);
+  }
+
+  void EmitThumbFunc(MCSymbol *Func) {
+    // FIXME: Anything needed here to flag the function as thumb?
+
+    getAssembler().setIsThumbFunc(Func);
+
+    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Func);
+    SD.setFlags(SD.getFlags() | ELF_Other_ThumbFunc);
+  }
+
+  // Helper functions for ARM exception handling directives
+  void Reset();
+
+  void EmitPersonalityFixup(StringRef Name);
+
+  void SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags,
+                         SectionKind Kind, const MCSymbol &Fn);
+  void SwitchToExTabSection(const MCSymbol &FnStart);
+  void SwitchToExIdxSection(const MCSymbol &FnStart);
+
+  bool IsThumb;
+  int64_t MappingSymbolCounter;
+
+  DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
+  ElfMappingSymbol LastEMS;
+
+  // ARM Exception Handling Frame Information
+  MCSymbol *ExTab;
+  MCSymbol *FnStart;
+  const MCSymbol *Personality;
+  bool CantUnwind;
+};
+}
+
+inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
+                                              unsigned Type,
+                                              unsigned Flags,
+                                              SectionKind Kind,
+                                              const MCSymbol &Fn) {
+  const MCSectionELF &FnSection =
+    static_cast<const MCSectionELF &>(Fn.getSection());
+
+  // Create the name for new section
+  StringRef FnSecName(FnSection.getSectionName());
+  SmallString<128> EHSecName(Prefix);
+  if (FnSecName != ".text") {
+    EHSecName += FnSecName;
+  }
+
+  // Get .ARM.extab or .ARM.exidx section
+  const MCSectionELF *EHSection = NULL;
+  if (const MCSymbol *Group = FnSection.getGroup()) {
+    EHSection = getContext().getELFSection(
+      EHSecName, Type, Flags | ELF::SHF_GROUP, Kind,
+      FnSection.getEntrySize(), Group->getName());
+  } else {
+    EHSection = getContext().getELFSection(EHSecName, Type, Flags, Kind);
+  }
+  assert(EHSection);
+
+  // Switch to .ARM.extab or .ARM.exidx section
+  SwitchSection(EHSection);
+  EmitCodeAlignment(4, 0);
+}
+
+inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
+  SwitchToEHSection(".ARM.extab",
+                    ELF::SHT_PROGBITS,
+                    ELF::SHF_ALLOC,
+                    SectionKind::getDataRel(),
+                    FnStart);
+}
+
+inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
+  SwitchToEHSection(".ARM.exidx",
+                    ELF::SHT_ARM_EXIDX,
+                    ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER,
+                    SectionKind::getDataRel(),
+                    FnStart);
+}
+
+void ARMELFStreamer::Reset() {
+  ExTab = NULL;
+  FnStart = NULL;
+  Personality = NULL;
+  CantUnwind = false;
+}
+
+// Add the R_ARM_NONE fixup at the same position
+void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
+  const MCSymbol *PersonalitySym = getContext().GetOrCreateSymbol(Name);
+
+  const MCSymbolRefExpr *PersonalityRef =
+    MCSymbolRefExpr::Create(PersonalitySym,
+                            MCSymbolRefExpr::VK_ARM_NONE,
+                            getContext());
+
+  AddValueSymbols(PersonalityRef);
+  MCDataFragment *DF = getOrCreateDataFragment();
+  DF->getFixups().push_back(
+    MCFixup::Create(DF->getContents().size(), PersonalityRef,
+                    MCFixup::getKindForSize(4, false)));
+}
+
+void ARMELFStreamer::EmitFnStart() {
+  assert(FnStart == 0);
+  FnStart = getContext().CreateTempSymbol();
+  EmitLabel(FnStart);
+}
+
+void ARMELFStreamer::EmitFnEnd() {
+  assert(FnStart && ".fnstart must preceeds .fnend");
+
+  // Emit unwind opcodes if there is no .handlerdata directive
+  int PersonalityIndex = -1;
+  if (!ExTab && !CantUnwind) {
+    // For __aeabi_unwind_cpp_pr1, we have to emit opcodes in .ARM.extab.
+    SwitchToExTabSection(*FnStart);
+
+    // Create .ARM.extab label for offset in .ARM.exidx
+    ExTab = getContext().CreateTempSymbol();
+    EmitLabel(ExTab);
+
+    PersonalityIndex = 1;
+
+    uint32_t Entry = 0;
+    uint32_t NumExtraEntryWords = 0;
+    Entry |= NumExtraEntryWords << 24;
+    Entry |= (EHT_COMPACT | PersonalityIndex) << 16;
+
+    // TODO: This should be generated according to .save, .vsave, .setfp
+    // directives.  Currently, we are simply generating FINISH opcode.
+    Entry |= UNWIND_OPCODE_FINISH << 8;
+    Entry |= UNWIND_OPCODE_FINISH;
+
+    EmitIntValue(Entry, 4, 0);
+  }
+
+  // Emit the exception index table entry
+  SwitchToExIdxSection(*FnStart);
+
+  if (PersonalityIndex == 1)
+    EmitPersonalityFixup("__aeabi_unwind_cpp_pr1");
+
+  const MCSymbolRefExpr *FnStartRef =
+    MCSymbolRefExpr::Create(FnStart,
+                            MCSymbolRefExpr::VK_ARM_PREL31,
+                            getContext());
+
+  EmitValue(FnStartRef, 4, 0);
+
+  if (CantUnwind) {
+    EmitIntValue(EXIDX_CANTUNWIND, 4, 0);
+  } else {
+    const MCSymbolRefExpr *ExTabEntryRef =
+      MCSymbolRefExpr::Create(ExTab,
+                              MCSymbolRefExpr::VK_ARM_PREL31,
+                              getContext());
+    EmitValue(ExTabEntryRef, 4, 0);
+  }
+
+  // Clean exception handling frame information
+  Reset();
+}
+
+void ARMELFStreamer::EmitCantUnwind() {
+  CantUnwind = true;
+}
+
+void ARMELFStreamer::EmitHandlerData() {
+  SwitchToExTabSection(*FnStart);
+
+  // Create .ARM.extab label for offset in .ARM.exidx
+  assert(!ExTab);
+  ExTab = getContext().CreateTempSymbol();
+  EmitLabel(ExTab);
+
+  // Emit Personality
+  assert(Personality && ".personality directive must preceed .handlerdata");
+
+  const MCSymbolRefExpr *PersonalityRef =
+    MCSymbolRefExpr::Create(Personality,
+                            MCSymbolRefExpr::VK_ARM_PREL31,
+                            getContext());
+
+  EmitValue(PersonalityRef, 4, 0);
+
+  // Emit unwind opcodes
+  uint32_t Entry = 0;
+  uint32_t NumExtraEntryWords = 0;
+
+  // TODO: This should be generated according to .save, .vsave, .setfp
+  // directives.  Currently, we are simply generating FINISH opcode.
+  Entry |= NumExtraEntryWords << 24;
+  Entry |= UNWIND_OPCODE_FINISH << 16;
+  Entry |= UNWIND_OPCODE_FINISH << 8;
+  Entry |= UNWIND_OPCODE_FINISH;
+
+  EmitIntValue(Entry, 4, 0);
+}
+
+void ARMELFStreamer::EmitPersonality(const MCSymbol *Per) {
+  Personality = Per;
+}
+
+void ARMELFStreamer::EmitSetFP(unsigned NewFpReg,
+                               unsigned NewSpReg,
+                               int64_t Offset) {
+  // TODO: Not implemented
+}
+
+void ARMELFStreamer::EmitPad(int64_t Offset) {
+  // TODO: Not implemented
+}
+
+void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                 bool IsVector) {
+  // TODO: Not implemented
+}
+
+namespace llvm {
+  MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                      raw_ostream &OS, MCCodeEmitter *Emitter,
+                                      bool RelaxAll, bool NoExecStack,
+                                      bool IsThumb) {
+    ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb);
+    if (RelaxAll)
+      S->getAssembler().setRelaxAll(true);
+    if (NoExecStack)
+      S->getAssembler().setNoExecStack(true);
+    return S;
+  }
+
+}
+
+
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h
new file mode 100644
index 000000000000..77ae5d23628e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h
@@ -0,0 +1,27 @@
+//===-- ARMELFStreamer.h - ELF Streamer for ARM ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF streamer information for the ARM backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_ELF_STREAMER_H
+#define ARM_ELF_STREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+
+  MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                      raw_ostream &OS, MCCodeEmitter *Emitter,
+                                      bool RelaxAll, bool NoExecStack,
+                                      bool IsThumb);
+}
+
+#endif // ARM_ELF_STREAMER_H
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
new file mode 100644
index 000000000000..0085feb82069
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -0,0 +1,119 @@
+//===-- ARMFixupKinds.h - ARM Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARM_ARMFIXUPKINDS_H
+#define LLVM_ARM_ARMFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace ARM {
+enum Fixups {
+  // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol
+  // addresses
+  fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind,
+
+  // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with
+  // the 16-bit halfwords reordered.
+  fixup_t2_ldst_pcrel_12,
+
+  // fixup_arm_pcrel_10_unscaled - 10-bit PC relative relocation for symbol
+  // addresses used in LDRD/LDRH/LDRB/etc. instructions. All bits are encoded.
+  fixup_arm_pcrel_10_unscaled,
+  // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses
+  // used in VFP instructions where the lower 2 bits are not encoded
+  // (so it's encoded as an 8-bit immediate).
+  fixup_arm_pcrel_10,
+  // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
+  // the short-swapped encoding of Thumb2 instructions.
+  fixup_t2_pcrel_10,
+  // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
+  // addresses where the lower 2 bits are not encoded (so it's encoded as an
+  // 8-bit immediate).
+  fixup_thumb_adr_pcrel_10,
+  // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_arm_adr_pcrel_12,
+  // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
+  // instruction.
+  fixup_t2_adr_pcrel_12,
+  // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch
+  // instructions. 
+  fixup_arm_condbranch,
+  // fixup_arm_uncondbranch - 24-bit PC relative relocation for 
+  // branch instructions. (unconditional)
+  fixup_arm_uncondbranch,
+  // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct
+  // uconditional branch instructions.
+  fixup_t2_condbranch,
+  // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct
+  // branch unconditional branch instructions.
+  fixup_t2_uncondbranch,
+
+  // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
+  fixup_arm_thumb_br,
+
+  // The following fixups handle the ARM BL instructions. These can be
+  // conditionalised; however, the ARM ELF ABI requires a different relocation
+  // in that case: R_ARM_JUMP24 instead of R_ARM_CALL. The difference is that
+  // R_ARM_CALL is allowed to change the instruction to a BLX inline, which has
+  // no conditional version; R_ARM_JUMP24 would have to insert a veneer.
+  //
+  // MachO does not draw a distinction between the two cases, so it will treat
+  // fixup_arm_uncondbl and fixup_arm_condbl as identical fixups.
+
+  // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions.
+  fixup_arm_uncondbl,
+
+  // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial
+  // conditionalisation.
+  fixup_arm_condbl,
+
+  // fixup_arm_blx - Fixup for ARM BLX instructions.
+  fixup_arm_blx,
+
+  // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
+  fixup_arm_thumb_bl,
+
+  // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
+  fixup_arm_thumb_blx,
+
+  // fixup_arm_thumb_cb - Fixup for Thumb branch instructions.
+  fixup_arm_thumb_cb,
+
+  // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs.
+  fixup_arm_thumb_cp,
+
+  // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions.
+  fixup_arm_thumb_bcc,
+
+  // The next two are for the movt/movw pair
+  // the 16bit imm field are split into imm{15-12} and imm{11-0}
+  fixup_arm_movt_hi16, // :upper16:
+  fixup_arm_movw_lo16, // :lower16:
+  fixup_t2_movt_hi16, // :upper16:
+  fixup_t2_movw_lo16, // :lower16:
+
+  // It is possible to create an "immediate" that happens to be pcrel.
+  // movw r0, :lower16:Foo-(Bar+8) and movt  r0, :upper16:Foo-(Bar+8)
+  // result in different reloc tags than the above two.
+  // Needed to support ELF::R_ARM_MOVT_PREL and ELF::R_ARM_MOVW_PREL_NC
+  fixup_arm_movt_hi16_pcrel, // :upper16:
+  fixup_arm_movw_lo16_pcrel, // :lower16:
+  fixup_t2_movt_hi16_pcrel, // :upper16:
+  fixup_t2_movw_lo16_pcrel, // :lower16:
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
new file mode 100644
index 000000000000..c1aab9c72cca
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -0,0 +1,60 @@
+//===-- ARMMCAsmInfo.cpp - ARM asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+cl::opt<bool>
+EnableARMEHABI("arm-enable-ehabi", cl::Hidden,
+  cl::desc("Generate ARM EHABI tables"),
+  cl::init(false));
+
+
+void ARMMCAsmInfoDarwin::anchor() { }
+
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
+  Data64bitsDirective = 0;
+  CommentString = "@";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+  UseDataRegionDirectives = true;
+
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::SjLj;
+}
+
+void ARMELFMCAsmInfo::anchor() { }
+
+ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  Data64bitsDirective = 0;
+  CommentString = "@";
+  PrivateGlobalPrefix = ".L";
+  Code16Directive = ".code\t16";
+  Code32Directive = ".code\t32";
+
+  WeakRefDirective = "\t.weak\t";
+
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  if (EnableARMEHABI)
+    ExceptionsType = ExceptionHandling::ARM;
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
new file mode 100644
index 000000000000..f0b289c6f3b6
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -0,0 +1,35 @@
+//===-- ARMMCAsmInfo.h - ARM asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARMTARGETASMINFO_H
+#define LLVM_ARMTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+
+  class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
+    virtual void anchor();
+  public:
+    explicit ARMMCAsmInfoDarwin();
+  };
+
+  class ARMELFMCAsmInfo : public MCAsmInfo {
+    virtual void anchor();
+  public:
+    explicit ARMELFMCAsmInfo();
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
new file mode 100644
index 000000000000..7a59a7dd5055
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -0,0 +1,1531 @@
+//===-- ARM/ARMMCCodeEmitter.cpp - Convert ARM code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARMMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
+
+namespace {
+class ARMMCCodeEmitter : public MCCodeEmitter {
+  ARMMCCodeEmitter(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  const MCSubtargetInfo &STI;
+  const MCContext &CTX;
+
+public:
+  ARMMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                   MCContext &ctx)
+    : MCII(mcii), STI(sti), CTX(ctx) {
+  }
+
+  ~ARMMCCodeEmitter() {}
+
+  bool isThumb() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & ARM::ModeThumb) != 0;
+  }
+  bool isThumb2() const {
+    return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) != 0;
+  }
+  bool isTargetDarwin() const {
+    Triple TT(STI.getTargetTriple());
+    Triple::OSType OS = TT.getOS();
+    return OS == Triple::Darwin || OS == Triple::MacOSX || OS == Triple::IOS;
+  }
+
+  unsigned getMachineSoImmOpValue(unsigned SoImm) const;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getHiLo16ImmOpValue - Return the encoding for the hi / low 16-bit of
+  /// the specified operand. This is used for operands with :lower16: and
+  /// :upper16: prefixes.
+  uint32_t getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  bool EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx,
+                              unsigned &Reg, unsigned &Imm,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBLTargetOpValue - Return encoding info for Thumb immediate
+  /// BL branch target.
+  uint32_t getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+  /// BLX branch target.
+  uint32_t getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+  uint32_t getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+  /// immediate Thumb2 direct branch target.
+  uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
+  /// branch target.
+  uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+  uint32_t getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+  uint32_t getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAdrLabelOpValue - Return encoding info for 12-bit immediate
+  /// ADR label target.
+  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  uint32_t getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  uint32_t getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  /// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getThumbAddrModeRegRegOpValue - Return encoding for 'reg + reg' operand.
+  uint32_t getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups)const;
+
+  /// getT2AddrModeImm8s4OpValue - Return encoding info for 'reg +/- imm8<<2'
+  /// operand.
+  uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2'
+  /// operand.
+  uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2'
+  /// operand.
+  uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+
+  /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
+  /// operand as needed by load/store instructions.
+  uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getLdStmModeOpValue - Return encoding for load/store multiple mode.
+  uint32_t getLdStmModeOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const {
+    ARM_AM::AMSubMode Mode = (ARM_AM::AMSubMode)MI.getOperand(OpIdx).getImm();
+    switch (Mode) {
+    default: llvm_unreachable("Unknown addressing sub-mode!");
+    case ARM_AM::da: return 0;
+    case ARM_AM::ia: return 1;
+    case ARM_AM::db: return 2;
+    case ARM_AM::ib: return 3;
+    }
+  }
+  /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+  ///
+  unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const {
+    switch (ShOpc) {
+    case ARM_AM::no_shift:
+    case ARM_AM::lsl: return 0;
+    case ARM_AM::lsr: return 1;
+    case ARM_AM::asr: return 2;
+    case ARM_AM::ror:
+    case ARM_AM::rrx: return 3;
+    }
+    llvm_unreachable("Invalid ShiftOpc!");
+  }
+
+  /// getAddrMode2OpValue - Return encoding for addrmode2 operands.
+  uint32_t getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
+  uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getPostIdxRegOpValue - Return encoding for postidx_reg operands.
+  uint32_t getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode3OffsetOpValue - Return encoding for am3offset operands.
+  uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode3OpValue - Return encoding for addrmode3 operands.
+  uint32_t getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrModeThumbSPOpValue - Return encoding info for 'reg +/- imm12'
+  /// operand.
+  uint32_t getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+  uint32_t getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+  uint32_t getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getAddrMode5OpValue - Return encoding info for 'reg +/- imm8' operand.
+  uint32_t getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getCCOutOpValue - Return encoding of the 's' bit.
+  unsigned getCCOutOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+    // The operand is either reg0 or CPSR. The 's' bit is encoded as '0' or
+    // '1' respectively.
+    return MI.getOperand(Op).getReg() == ARM::CPSR;
+  }
+
+  /// getSOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+    unsigned SoImm = MI.getOperand(Op).getImm();
+    int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+    assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+    // Encode rotate_imm.
+    unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+      << ARMII::SoRotImmShift;
+
+    // Encode immed_8.
+    Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
+    return Binary;
+  }
+
+  /// getT2SOImmOpValue - Return an encoded 12-bit shifted-immediate value.
+  unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+    unsigned SoImm = MI.getOperand(Op).getImm();
+    unsigned Encoded =  ARM_AM::getT2SOImmVal(SoImm);
+    assert(Encoded != ~0U && "Not a Thumb2 so_imm value?");
+    return Encoded;
+  }
+
+  unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
+    SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getSORegOpValue - Return an encoded so_reg shifted register value.
+  unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getSORegImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op,
+                                   SmallVectorImpl<MCFixup> &Fixups) const {
+    return 64 - MI.getOperand(Op).getImm();
+  }
+
+  unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                                      SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                                        SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getShiftRight8Imm(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRight16Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRight32Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                      unsigned EncodedValue) const;
+  unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                          unsigned EncodedValue) const;
+  unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
+                                    unsigned EncodedValue) const;
+
+  unsigned VFPThumb2PostEncoder(const MCInst &MI,
+                                unsigned EncodedValue) const;
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const {
+    OS << (char)C;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createARMMCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  return new ARMMCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DataIPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (isThumb2()) {
+    // NEON Thumb2 data-processsing encodings are very simple: bit 24 is moved
+    // to bit 12 of the high half-word (i.e. bit 28), and bits 27-24 are
+    // set to 1111.
+    unsigned Bit24 = EncodedValue & 0x01000000;
+    unsigned Bit28 = Bit24 << 4;
+    EncodedValue &= 0xEFFFFFFF;
+    EncodedValue |= Bit28;
+    EncodedValue |= 0x0F000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2LoadStorePostEncoder - Post-process encoded NEON load/store
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2LoadStorePostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (isThumb2()) {
+    EncodedValue &= 0xF0FFFFFF;
+    EncodedValue |= 0x09000000;
+  }
+
+  return EncodedValue;
+}
+
+/// NEONThumb2DupPostEncoder - Post-process encoded NEON vdup
+/// instructions, and rewrite them to their Thumb2 form if we are currently in
+/// Thumb2 mode.
+unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (isThumb2()) {
+    EncodedValue &= 0x00FFFFFF;
+    EncodedValue |= 0xEE000000;
+  }
+
+  return EncodedValue;
+}
+
+/// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
+/// them to their Thumb2 form if we are currently in Thumb2 mode.
+unsigned ARMMCCodeEmitter::
+VFPThumb2PostEncoder(const MCInst &MI, unsigned EncodedValue) const {
+  if (isThumb2()) {
+    EncodedValue &= 0x0FFFFFFF;
+    EncodedValue |= 0xE0000000;
+  }
+  return EncodedValue;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned ARMMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+    unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg);
+
+    // Q registers are encoded as 2x their register number.
+    switch (Reg) {
+    default:
+      return RegNo;
+    case ARM::Q0:  case ARM::Q1:  case ARM::Q2:  case ARM::Q3:
+    case ARM::Q4:  case ARM::Q5:  case ARM::Q6:  case ARM::Q7:
+    case ARM::Q8:  case ARM::Q9:  case ARM::Q10: case ARM::Q11:
+    case ARM::Q12: case ARM::Q13: case ARM::Q14: case ARM::Q15:
+      return 2 * RegNo;
+    }
+  } else if (MO.isImm()) {
+    return static_cast<unsigned>(MO.getImm());
+  } else if (MO.isFPImm()) {
+    return static_cast<unsigned>(APFloat(MO.getFPImm())
+                     .bitcastToAPInt().getHiBits(32).getLimitedValue());
+  }
+
+  llvm_unreachable("Unable to encode MCOperand!");
+}
+
+/// getAddrModeImmOpValue - Return encoding info for 'reg +/- imm' operand.
+bool ARMMCCodeEmitter::
+EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
+                       unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+
+  Reg = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+
+  int32_t SImm = MO1.getImm();
+  bool isAdd = true;
+
+  // Special value for #-0
+  if (SImm == INT32_MIN) {
+    SImm = 0;
+    isAdd = false;
+  }
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (SImm < 0) {
+    SImm = -SImm;
+    isAdd = false;
+  }
+
+  Imm = SImm;
+  return isAdd;
+}
+
+/// getBranchTargetOpValue - Helper function to get the branch target operand,
+/// which is either an immediate or requires a fixup.
+static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                       unsigned FixupKind,
+                                       SmallVectorImpl<MCFixup> &Fixups) {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm()) return MO.getImm();
+  assert(MO.isExpr() && "Unexpected branch target type!");
+  const MCExpr *Expr = MO.getExpr();
+  MCFixupKind Kind = MCFixupKind(FixupKind);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+// Thumb BL and BLX use a strange offset encoding where bits 22 and 21 are
+// determined by negating them and XOR'ing them with bit 23.
+static int32_t encodeThumbBLOffset(int32_t offset) {
+  offset >>= 1;
+  uint32_t S  = (offset & 0x800000) >> 23;
+  uint32_t J1 = (offset & 0x400000) >> 22;
+  uint32_t J2 = (offset & 0x200000) >> 21;
+  J1 = (~J1 & 0x1);
+  J2 = (~J2 & 0x1);
+  J1 ^= S;
+  J2 ^= S;
+
+  offset &= ~0x600000;
+  offset |= J1 << 22;
+  offset |= J2 << 21;
+
+  return offset;
+}
+
+/// getThumbBLTargetOpValue - Return encoding info for immediate branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl,
+                                    Fixups);
+  return encodeThumbBLOffset(MO.getImm());
+}
+
+/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
+/// BLX branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx,
+                                    Fixups);
+  return encodeThumbBLOffset(MO.getImm());
+}
+
+/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br,
+                                    Fixups);
+  return (MO.getImm() >> 1);
+}
+
+/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc,
+                                    Fixups);
+  return (MO.getImm() >> 1);
+}
+
+/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
+uint32_t ARMMCCodeEmitter::
+getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups);
+  return (MO.getImm() >> 1);
+}
+
+/// Return true if this branch has a non-always predication
+static bool HasConditionalBranch(const MCInst &MI) {
+  int NumOp = MI.getNumOperands();
+  if (NumOp >= 2) {
+    for (int i = 0; i < NumOp-1; ++i) {
+      const MCOperand &MCOp1 = MI.getOperand(i);
+      const MCOperand &MCOp2 = MI.getOperand(i + 1);
+      if (MCOp1.isImm() && MCOp2.isReg() &&
+          (MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) {
+        if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL)
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+  // FIXME: This really, really shouldn't use TargetMachine. We don't want
+  // coupling between MC and TM anywhere we can help it.
+  if (isThumb2())
+    return
+      ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_condbranch, Fixups);
+  return getARMBranchTargetOpValue(MI, OpIdx, Fixups);
+}
+
+/// getBranchTargetOpValue - Return encoding info for 24-bit immediate branch
+/// target.
+uint32_t ARMMCCodeEmitter::
+getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr()) {
+    if (HasConditionalBranch(MI))
+      return ::getBranchTargetOpValue(MI, OpIdx,
+                                      ARM::fixup_arm_condbranch, Fixups);
+    return ::getBranchTargetOpValue(MI, OpIdx,
+                                    ARM::fixup_arm_uncondbranch, Fixups);
+  }
+
+  return MO.getImm() >> 2;
+}
+
+uint32_t ARMMCCodeEmitter::
+getARMBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr()) {
+    if (HasConditionalBranch(MI))
+      return ::getBranchTargetOpValue(MI, OpIdx, 
+                                      ARM::fixup_arm_condbl, Fixups);
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_uncondbl, Fixups);
+  }
+
+  return MO.getImm() >> 2;
+}
+
+uint32_t ARMMCCodeEmitter::
+getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_blx, Fixups);
+
+  return MO.getImm() >> 1;
+}
+
+/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
+/// immediate branch target.
+uint32_t ARMMCCodeEmitter::
+getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+  unsigned Val =
+    ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+  bool I  = (Val & 0x800000);
+  bool J1 = (Val & 0x400000);
+  bool J2 = (Val & 0x200000);
+  if (I ^ J1)
+    Val &= ~0x400000;
+  else
+    Val |= 0x400000;
+
+  if (I ^ J2)
+    Val &= ~0x200000;
+  else
+    Val |= 0x200000;
+
+  return Val;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 12-bit shifted-immediate
+/// ADR label target.
+uint32_t ARMMCCodeEmitter::
+getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
+                                    Fixups);
+  int32_t offset = MO.getImm();
+  uint32_t Val = 0x2000;
+
+  int SoImmVal;
+  if (offset == INT32_MIN) {
+    Val = 0x1000;
+    SoImmVal = 0;
+  } else if (offset < 0) {
+    Val = 0x1000;
+    offset *= -1;
+    SoImmVal = ARM_AM::getSOImmVal(offset);
+    if(SoImmVal == -1) {
+      Val = 0x2000;
+      offset *= -1;
+      SoImmVal = ARM_AM::getSOImmVal(offset);
+    }
+  } else {
+    SoImmVal = ARM_AM::getSOImmVal(offset);
+    if(SoImmVal == -1) {
+      Val = 0x1000;
+      offset *= -1;
+      SoImmVal = ARM_AM::getSOImmVal(offset);
+    }
+  }
+
+  assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+  Val |= SoImmVal;
+  return Val;
+}
+
+/// getT2AdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
+                                    Fixups);
+  int32_t Val = MO.getImm();
+  if (Val == INT32_MIN)
+    Val = 0x1000;
+  else if (Val < 0) {
+    Val *= -1;
+    Val |= 0x1000;
+  }
+  return Val;
+}
+
+/// getThumbAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
+/// target.
+uint32_t ARMMCCodeEmitter::
+getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
+                                    Fixups);
+  return MO.getImm();
+}
+
+/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg'
+/// operand.
+uint32_t ARMMCCodeEmitter::
+getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &) const {
+  // [Rn, Rm]
+  //   {5-3} = Rm
+  //   {2-0} = Rn
+  const MCOperand &MO1 = MI.getOperand(OpIdx);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  unsigned Rm = CTX.getRegisterInfo().getEncodingValue(MO2.getReg());
+  return (Rm << 3) | Rn;
+}
+
+/// getAddrModeImm12OpValue - Return encoding info for 'reg +/- imm12' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  // {17-13} = reg
+  // {12}    = (U)nsigned (add == '1', sub == '0')
+  // {11-0}  = imm12
+  unsigned Reg, Imm12;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm12 = 0;
+    isAdd = false ; // 'U' bit is set as part of the fixup.
+
+    if (MO.isExpr()) {
+      const MCExpr *Expr = MO.getExpr();
+
+      MCFixupKind Kind;
+      if (isThumb2())
+        Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
+      else
+        Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
+      Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+      ++MCNumCPRelocations;
+    } else {
+      Reg = ARM::PC;
+      int32_t Offset = MO.getImm();
+      // FIXME: Handle #-0.
+      if (Offset < 0) {
+        Offset *= -1;
+        isAdd = false;
+      }
+      Imm12 = Offset;
+    }
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups);
+
+  uint32_t Binary = Imm12 & 0xfff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 12);
+  Binary |= (Reg << 13);
+  return Binary;
+}
+
+/// getT2Imm8s4OpValue - Return encoding info for
+/// '+/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  // FIXME: The immediate operand should have already been encoded like this
+  // before ever getting here. The encoder method should just need to combine
+  // the MI operands for the register and the offset into a single
+  // representation for the complex operand in the .td file. This isn't just
+  // style, unfortunately. As-is, we can't represent the distinct encoding
+  // for #-0.
+
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  int32_t Imm8 = MI.getOperand(OpIdx).getImm();
+  bool isAdd = Imm8 >= 0;
+
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (Imm8 < 0)
+    Imm8 = -(uint32_t)Imm8;
+
+  // Scaled by 4.
+  Imm8 /= 4;
+
+  uint32_t Binary = Imm8 & 0xff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  return Binary;
+}
+
+/// getT2AddrModeImm8s4OpValue - Return encoding info for
+/// 'reg +/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd = true;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false ; // 'U' bit is set as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+  } else
+    isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
+
+  // FIXME: The immediate operand should have already been encoded like this
+  // before ever getting here. The encoder method should just need to combine
+  // the MI operands for the register and the offset into a single
+  // representation for the complex operand in the .td file. This isn't just
+  // style, unfortunately. As-is, we can't represent the distinct encoding
+  // for #-0.
+  uint32_t Binary = (Imm8 >> 2) & 0xff;
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for
+/// 'reg + imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups) const {
+  // {11-8} = reg
+  // {7-0}  = imm8
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  unsigned Reg = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Imm8 = MO1.getImm();
+  return (Reg << 8) | Imm8;
+}
+
+// FIXME: This routine assumes that a binary
+// expression will always result in a PCRel expression
+// In reality, its only true if one or more subexpressions
+// is itself a PCRel (i.e. "." in asm or some other pcrel construct)
+// but this is good enough for now.
+static bool EvaluateAsPCRel(const MCExpr *Expr) {
+  switch (Expr->getKind()) {
+  default: llvm_unreachable("Unexpected expression type");
+  case MCExpr::SymbolRef: return false;
+  case MCExpr::Binary: return true;
+  }
+}
+
+uint32_t
+ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups) const {
+  // {20-16} = imm{15-12}
+  // {11-0}  = imm{11-0}
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (MO.isImm())
+    // Hi / lo 16 bits already extracted during earlier passes.
+    return static_cast<unsigned>(MO.getImm());
+
+  // Handle :upper16: and :lower16: assembly prefixes.
+  const MCExpr *E = MO.getExpr();
+  MCFixupKind Kind;
+  if (E->getKind() == MCExpr::Target) {
+    const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
+    E = ARM16Expr->getSubExpr();
+
+    switch (ARM16Expr->getKind()) {
+    default: llvm_unreachable("Unsupported ARMFixup");
+    case ARMMCExpr::VK_ARM_HI16:
+      if (!isTargetDarwin() && EvaluateAsPCRel(E))
+        Kind = MCFixupKind(isThumb2()
+                           ? ARM::fixup_t2_movt_hi16_pcrel
+                           : ARM::fixup_arm_movt_hi16_pcrel);
+      else
+        Kind = MCFixupKind(isThumb2()
+                           ? ARM::fixup_t2_movt_hi16
+                           : ARM::fixup_arm_movt_hi16);
+      break;
+    case ARMMCExpr::VK_ARM_LO16:
+      if (!isTargetDarwin() && EvaluateAsPCRel(E))
+        Kind = MCFixupKind(isThumb2()
+                           ? ARM::fixup_t2_movw_lo16_pcrel
+                           : ARM::fixup_arm_movw_lo16_pcrel);
+      else
+        Kind = MCFixupKind(isThumb2()
+                           ? ARM::fixup_t2_movw_lo16
+                           : ARM::fixup_arm_movw_lo16);
+      break;
+    }
+    Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc()));
+    return 0;
+  }
+  // If the expression doesn't have :upper16: or :lower16: on it,
+  // it's just a plain immediate expression, and those evaluate to
+  // the lower 16 bits of the expression regardless of whether
+  // we have a movt or a movw.
+  if (!isTargetDarwin() && EvaluateAsPCRel(E))
+    Kind = MCFixupKind(isThumb2()
+                       ? ARM::fixup_t2_movw_lo16_pcrel
+                       : ARM::fixup_arm_movw_lo16_pcrel);
+  else
+    Kind = MCFixupKind(isThumb2()
+                       ? ARM::fixup_t2_movw_lo16
+                       : ARM::fixup_arm_movw_lo16);
+  Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc()));
+  return 0;
+}
+
+uint32_t ARMMCCodeEmitter::
+getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Rm = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
+  bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
+  ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
+  unsigned SBits = getShiftOp(ShOp);
+
+  // While "lsr #32" and "asr #32" exist, they are encoded with a 0 in the shift
+  // amount. However, it would be an easy mistake to make so check here.
+  assert((ShImm & ~0x1f) == 0 && "Out of range shift amount");
+
+  // {16-13} = Rn
+  // {12}    = isAdd
+  // {11-0}  = shifter
+  //  {3-0}  = Rm
+  //  {4}    = 0
+  //  {6-5}  = type
+  //  {11-7} = imm
+  uint32_t Binary = Rm;
+  Binary |= Rn << 13;
+  Binary |= SBits << 5;
+  Binary |= ShImm << 7;
+  if (isAdd)
+    Binary |= 1 << 12;
+  return Binary;
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  // {17-14}  Rn
+  // {13}     1 == imm12, 0 == Rm
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups);
+  Binary |= Rn << 14;
+  return Binary;
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  // {13}     1 == imm12, 0 == Rm
+  // {12}     isAdd
+  // {11-0}   imm12/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM2Op(Imm) == ARM_AM::add;
+  bool isReg = MO.getReg() != 0;
+  uint32_t Binary = ARM_AM::getAM2Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm12
+  if (isReg) {
+    ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm);
+    Binary <<= 7;                    // Shift amount is bits [11:7]
+    Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5]
+    Binary |= CTX.getRegisterInfo().getEncodingValue(MO.getReg()); // Rm is bits [3:0]
+  }
+  return Binary | (isAdd << 12) | (isReg << 13);
+}
+
+uint32_t ARMMCCodeEmitter::
+getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  // {4}      isAdd
+  // {3-0}    Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  bool isAdd = MO1.getImm() != 0;
+  return CTX.getRegisterInfo().getEncodingValue(MO.getReg()) | (isAdd << 4);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  // {9}      1 == imm8, 0 == Rm
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  unsigned Imm = MO1.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  return Imm8 | (isAdd << 8) | (isImm << 9);
+}
+
+uint32_t ARMMCCodeEmitter::
+getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  // {13}     1 == imm8, 0 == Rm
+  // {12-9}   Rn
+  // {8}      isAdd
+  // {7-4}    imm7_4/zero
+  // {3-0}    imm3_0/Rm
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx+2);
+
+  // If The first operand isn't a register, we have a label reference.
+  if (!MO.isReg()) {
+    unsigned Rn = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = MCFixupKind(ARM::fixup_arm_pcrel_10_unscaled);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+    return (Rn << 9) | (1 << 13);
+  }
+  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Imm = MO2.getImm();
+  bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
+  bool isImm = MO1.getReg() == 0;
+  uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
+  // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
+  if (!isImm)
+    Imm8 = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
+}
+
+/// getAddrModeThumbSPOpValue - Encode the t_addrmode_sp operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeThumbSPOpValue(const MCInst &MI, unsigned OpIdx,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  // [SP, #imm]
+  //   {7-0} = imm8
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(MI.getOperand(OpIdx).getReg() == ARM::SP &&
+         "Unexpected base register!");
+
+  // The immediate is already shifted for the implicit zeroes, so no change
+  // here.
+  return MO1.getImm() & 0xff;
+}
+
+/// getAddrModeISOpValue - Encode the t_addrmode_is# operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  // [Rn, #imm]
+  //   {7-3} = imm5
+  //   {2-0} = Rn
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Imm5 = MO1.getImm();
+  return ((Imm5 & 0x1f) << 3) | Rn;
+}
+
+/// getAddrModePCOpValue - Return encoding for t_addrmode_pc operands.
+uint32_t ARMMCCodeEmitter::
+getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand MO = MI.getOperand(OpIdx);
+  if (MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups);
+  return (MO.getImm() >> 2);
+}
+
+/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand.
+uint32_t ARMMCCodeEmitter::
+getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  // {12-9} = reg
+  // {8}    = (U)nsigned (add == '1', sub == '0')
+  // {7-0}  = imm8
+  unsigned Reg, Imm8;
+  bool isAdd;
+  // If The first operand isn't a register, we have a label reference.
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  if (!MO.isReg()) {
+    Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    Imm8 = 0;
+    isAdd = false; // 'U' bit is handled as part of the fixup.
+
+    assert(MO.isExpr() && "Unexpected machine operand type!");
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind;
+    if (isThumb2())
+      Kind = MCFixupKind(ARM::fixup_t2_pcrel_10);
+    else
+      Kind = MCFixupKind(ARM::fixup_arm_pcrel_10);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+    ++MCNumCPRelocations;
+  } else {
+    EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
+    isAdd = ARM_AM::getAM5Op(Imm8) == ARM_AM::add;
+  }
+
+  uint32_t Binary = ARM_AM::getAM5Offset(Imm8);
+  // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+  if (isAdd)
+    Binary |= (1 << 8);
+  Binary |= (Reg << 9);
+  return Binary;
+}
+
+unsigned ARMMCCodeEmitter::
+getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  // Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is Rs, the amount to shift by, and the third specifies
+  // the type of the shift.
+  //
+  // {3-0} = Rm.
+  // {4}   = 1
+  // {6-5} = type
+  // {11-8} = Rs
+  // {7}    = 0
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  const MCOperand &MO2 = MI.getOperand(OpIdx + 2);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+  // Encode Rm.
+  unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  unsigned Rs = MO1.getReg();
+  if (Rs) {
+    // Set shift operand (bit[7:4]).
+    // LSL - 0001
+    // LSR - 0011
+    // ASR - 0101
+    // ROR - 0111
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x1; break;
+    case ARM_AM::lsr: SBits = 0x3; break;
+    case ARM_AM::asr: SBits = 0x5; break;
+    case ARM_AM::ror: SBits = 0x7; break;
+    }
+  }
+
+  Binary |= SBits << 4;
+
+  // Encode the shift operation Rs.
+  // Encode Rs bit[11:8].
+  assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+  return Binary | (CTX.getRegisterInfo().getEncodingValue(Rs) << ARMII::RegRsShift);
+}
+
+unsigned ARMMCCodeEmitter::
+getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 0
+  // {6-5} = type
+  // {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+  // Encode Rm.
+  unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+
+  // Set shift operand (bit[6:4]).
+  // LSL - 000
+  // LSR - 010
+  // ASR - 100
+  // ROR - 110
+  // RRX - 110 and bit[11:8] clear.
+  switch (SOpc) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::lsl: SBits = 0x0; break;
+  case ARM_AM::lsr: SBits = 0x2; break;
+  case ARM_AM::asr: SBits = 0x4; break;
+  case ARM_AM::ror: SBits = 0x6; break;
+  case ARM_AM::rrx:
+    Binary |= 0x60;
+    return Binary;
+  }
+
+  // Encode shift_imm bit[11:7].
+  Binary |= SBits << 4;
+  unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm());
+  assert(Offset < 32 && "Offset must be in range 0-31!");
+  return Binary | (Offset << 7);
+}
+
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+  const MCOperand &MO3 = MI.getOperand(OpNum+2);
+
+  // Encoded as [Rn, Rm, imm].
+  // FIXME: Needs fixup support.
+  unsigned Value = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  Value <<= 4;
+  Value |= CTX.getRegisterInfo().getEncodingValue(MO2.getReg());
+  Value <<= 2;
+  Value |= MO3.getImm();
+
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+  const MCOperand &MO2 = MI.getOperand(OpNum+1);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+
+  // Even though the immediate is 8 bits long, we need 9 bits in order
+  // to represent the (inverse of the) sign bit.
+  Value <<= 9;
+  int32_t tmp = (int32_t)MO2.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 256; // Set the ADD bit
+  Value |= tmp & 255;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2AddrModeImm12OffsetOpValue(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO1 = MI.getOperand(OpNum);
+
+  // FIXME: Needs fixup support.
+  unsigned Value = 0;
+  int32_t tmp = (int32_t)MO1.getImm();
+  if (tmp < 0)
+    tmp = abs(tmp);
+  else
+    Value |= 4096; // Set the ADD bit
+  Value |= tmp & 4095;
+  return Value;
+}
+
+unsigned ARMMCCodeEmitter::
+getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
+                SmallVectorImpl<MCFixup> &Fixups) const {
+  // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+  // shifted. The second is the amount to shift by.
+  //
+  // {3-0} = Rm.
+  // {4}   = 0
+  // {6-5} = type
+  // {11-7} = imm
+
+  const MCOperand &MO  = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+  // Encode Rm.
+  unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  // Set shift operand (bit[6:4]).
+  // LSL - 000
+  // LSR - 010
+  // ASR - 100
+  // ROR - 110
+  switch (SOpc) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::lsl: SBits = 0x0; break;
+  case ARM_AM::lsr: SBits = 0x2; break;
+  case ARM_AM::asr: SBits = 0x4; break;
+  case ARM_AM::rrx: // FALLTHROUGH
+  case ARM_AM::ror: SBits = 0x6; break;
+  }
+
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO1.getImm()) << 7;
+}
+
+unsigned ARMMCCodeEmitter::
+getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
+                               SmallVectorImpl<MCFixup> &Fixups) const {
+  // 10 bits. lower 5 bits are are the lsb of the mask, high five bits are the
+  // msb of the mask.
+  const MCOperand &MO = MI.getOperand(Op);
+  uint32_t v = ~MO.getImm();
+  uint32_t lsb = CountTrailingZeros_32(v);
+  uint32_t msb = (32 - CountLeadingZeros_32 (v)) - 1;
+  assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
+  return lsb | (msb << 5);
+}
+
+unsigned ARMMCCodeEmitter::
+getRegisterListOpValue(const MCInst &MI, unsigned Op,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+  // VLDM/VSTM:
+  //   {12-8} = Vd
+  //   {7-0}  = Number of registers
+  //
+  // LDM/STM:
+  //   {15-0}  = Bitfield of GPRs.
+  unsigned Reg = MI.getOperand(Op).getReg();
+  bool SPRRegs = ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg);
+  bool DPRRegs = ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg);
+
+  unsigned Binary = 0;
+
+  if (SPRRegs || DPRRegs) {
+    // VLDM/VSTM
+    unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg);
+    unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
+    Binary |= (RegNo & 0x1f) << 8;
+    if (SPRRegs)
+      Binary |= NumRegs;
+    else
+      Binary |= NumRegs * 2;
+  } else {
+    for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
+      unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(MI.getOperand(I).getReg());
+      Binary |= 1 << RegNo;
+    }
+  }
+
+  return Binary;
+}
+
+/// getAddrMode6AddressOpValue - Encode an addrmode6 register number along
+/// with the alignment operand.
+unsigned ARMMCCodeEmitter::
+getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x02; break;
+  case 32: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+/// getAddrMode6OneLane32AddressOpValue - Encode an addrmode6 register number
+/// along  with the alignment operand for use in VST1 and VLD1 with size 32.
+unsigned ARMMCCodeEmitter::
+getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
+                                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 8:
+  case 16:
+  case 32: // Default '0' value for invalid alignments of 8, 16, 32 bytes.
+  case 2: Align = 0x00; break;
+  case 4: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+
+/// getAddrMode6DupAddressOpValue - Encode an addrmode6 register number and
+/// alignment operand for use in VLD-dup instructions.  This is the same as
+/// getAddrMode6AddressOpValue except for the alignment encoding, which is
+/// different for VLD4-dup.
+unsigned ARMMCCodeEmitter::
+getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &Reg = MI.getOperand(Op);
+  const MCOperand &Imm = MI.getOperand(Op + 1);
+
+  unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
+  unsigned Align = 0;
+
+  switch (Imm.getImm()) {
+  default: break;
+  case 2:
+  case 4:
+  case 8:  Align = 0x01; break;
+  case 16: Align = 0x03; break;
+  }
+
+  return RegNo | (Align << 4);
+}
+
+unsigned ARMMCCodeEmitter::
+getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
+                          SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  if (MO.getReg() == 0) return 0x0D;
+  return CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight8Imm(const MCInst &MI, unsigned Op,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  return 8 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight16Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  return 16 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight32Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  return 32 - MI.getOperand(Op).getImm();
+}
+
+unsigned ARMMCCodeEmitter::
+getShiftRight64Imm(const MCInst &MI, unsigned Op,
+                   SmallVectorImpl<MCFixup> &Fixups) const {
+  return 64 - MI.getOperand(Op).getImm();
+}
+
+void ARMMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  // Pseudo instructions don't get encoded.
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+  if ((TSFlags & ARMII::FormMask) == ARMII::Pseudo)
+    return;
+
+  int Size;
+  if (Desc.getSize() == 2 || Desc.getSize() == 4)
+    Size = Desc.getSize();
+  else
+    llvm_unreachable("Unexpected instruction size!");
+
+  uint32_t Binary = getBinaryCodeForInstr(MI, Fixups);
+  // Thumb 32-bit wide instructions need to emit the high order halfword
+  // first.
+  if (isThumb() && Size == 4) {
+    EmitConstant(Binary >> 16, 2, OS);
+    EmitConstant(Binary & 0xffff, 2, OS);
+  } else
+    EmitConstant(Binary, Size, OS);
+  ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+}
+
+#include "ARMGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
new file mode 100644
index 000000000000..fc8505b052bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -0,0 +1,72 @@
+//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "armmcexpr"
+#include "ARMMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+using namespace llvm;
+
+const ARMMCExpr*
+ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+                       MCContext &Ctx) {
+  return new (Ctx) ARMMCExpr(Kind, Expr);
+}
+
+void ARMMCExpr::PrintImpl(raw_ostream &OS) const {
+  switch (Kind) {
+  default: llvm_unreachable("Invalid kind!");
+  case VK_ARM_HI16: OS << ":upper16:"; break;
+  case VK_ARM_LO16: OS << ":lower16:"; break;
+  }
+
+  const MCExpr *Expr = getSubExpr();
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << '(';
+  Expr->print(OS);
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    OS << ')';
+}
+
+bool
+ARMMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                     const MCAsmLayout *Layout) const {
+  return false;
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) {
+  switch (Value->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expr!");
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+    AddValueSymbols_(BE->getLHS(), Asm);
+    AddValueSymbols_(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef:
+    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+    break;
+
+  case MCExpr::Unary:
+    AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void ARMMCExpr::AddValueSymbols(MCAssembler *Asm) const {
+  AddValueSymbols_(getSubExpr(), Asm);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
new file mode 100644
index 000000000000..cd4067a52955
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -0,0 +1,76 @@
+//===-- ARMMCExpr.h - ARM specific MC expression classes --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMCEXPR_H
+#define ARMMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class ARMMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_ARM_None,
+    VK_ARM_HI16,  // The R_ARM_MOVT_ABS relocation (:upper16: in the .s file)
+    VK_ARM_LO16   // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+
+  explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr)
+    : Kind(_Kind), Expr(_Expr) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const ARMMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+                                      MCContext &Ctx);
+
+  static const ARMMCExpr *CreateUpper16(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_ARM_HI16, Expr, Ctx);
+  }
+
+  static const ARMMCExpr *CreateLower16(const MCExpr *Expr, MCContext &Ctx) {
+    return Create(VK_ARM_LO16, Expr, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// @}
+
+  void PrintImpl(raw_ostream &OS) const;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const;
+  void AddValueSymbols(MCAssembler *) const;
+  const MCSection *FindAssociatedSection() const {
+    return getSubExpr()->FindAssociatedSection();
+  }
+
+  // There are no TLS ARMMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
new file mode 100644
index 000000000000..f09fb5a94fd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -0,0 +1,298 @@
+//===-- ARMMCTargetDesc.cpp - ARM Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseInfo.h"
+#include "ARMELFStreamer.h"
+#include "ARMMCAsmInfo.h"
+#include "ARMMCTargetDesc.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_REGINFO_MC_DESC
+#include "ARMGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "ARMGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "ARMGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
+  Triple triple(TT);
+
+  // Set the boolean corresponding to the current target triple, or the default
+  // if one cannot be determined, to true.
+  unsigned Len = TT.size();
+  unsigned Idx = 0;
+
+  // FIXME: Enhance Triple helper class to extract ARM version.
+  bool isThumb = false;
+  if (Len >= 5 && TT.substr(0, 4) == "armv")
+    Idx = 4;
+  else if (Len >= 6 && TT.substr(0, 5) == "thumb") {
+    isThumb = true;
+    if (Len >= 7 && TT[5] == 'v')
+      Idx = 6;
+  }
+
+  bool NoCPU = CPU == "generic" || CPU.empty();
+  std::string ARMArchFeature;
+  if (Idx) {
+    unsigned SubVer = TT[Idx];
+    if (SubVer >= '7' && SubVer <= '9') {
+      if (Len >= Idx+2 && TT[Idx+1] == 'm') {
+        if (NoCPU)
+          // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
+          ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
+        else
+          // Use CPU to figure out the exact features.
+          ARMArchFeature = "+v7";
+      } else if (Len >= Idx+3 && TT[Idx+1] == 'e'&& TT[Idx+2] == 'm') {
+        if (NoCPU)
+          // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2,
+          //       FeatureT2XtPk, FeatureMClass
+          ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk,+mclass";
+        else
+          // Use CPU to figure out the exact features.
+          ARMArchFeature = "+v7";
+      } else if (Len >= Idx+2 && TT[Idx+1] == 's') {
+        if (NoCPU)
+          // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+          //      Swift
+          ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+t2xtpk";
+        else
+          // Use CPU to figure out the exact features.
+          ARMArchFeature = "+v7";
+      } else {
+        // v7 CPUs have lots of different feature sets. If no CPU is specified,
+        // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return
+        // the "minimum" feature set and use CPU string to figure out the exact
+        // features.
+        if (NoCPU)
+          // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+          ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk";
+        else
+          // Use CPU to figure out the exact features.
+          ARMArchFeature = "+v7";
+      }
+    } else if (SubVer == '6') {
+      if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
+        ARMArchFeature = "+v6t2";
+      else if (Len >= Idx+2 && TT[Idx+1] == 'm') {
+        if (NoCPU)
+          // v6m: FeatureNoARM, FeatureMClass
+          ARMArchFeature = "+v6,+noarm,+mclass";
+        else
+          ARMArchFeature = "+v6";
+      } else
+        ARMArchFeature = "+v6";
+    } else if (SubVer == '5') {
+      if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e')
+        ARMArchFeature = "+v5te";
+      else
+        ARMArchFeature = "+v5t";
+    } else if (SubVer == '4' && Len >= Idx+2 && TT[Idx+1] == 't')
+      ARMArchFeature = "+v4t";
+  }
+
+  if (isThumb) {
+    if (ARMArchFeature.empty())
+      ARMArchFeature = "+thumb-mode";
+    else
+      ARMArchFeature += ",+thumb-mode";
+  }
+
+  if (triple.isOSNaCl()) {
+    if (ARMArchFeature.empty())
+      ARMArchFeature = "+nacl-trap";
+    else
+      ARMArchFeature += ",+nacl-trap";
+  }
+
+  return ARMArchFeature;
+}
+
+MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                  StringRef FS) {
+  std::string ArchFS = ARM_MC::ParseARMTriple(TT, CPU);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = ArchFS + "," + FS.str();
+    else
+      ArchFS = FS;
+  }
+
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitARMMCSubtargetInfo(X, TT, CPU, ArchFS);
+  return X;
+}
+
+static MCInstrInfo *createARMMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitARMMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitARMMCRegisterInfo(X, ARM::LR, 0, 0, ARM::PC);
+  return X;
+}
+
+static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin())
+    return new ARMMCAsmInfoDarwin();
+
+  return new ARMELFMCAsmInfo();
+}
+
+static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (RM == Reloc::Default) {
+    Triple TheTriple(TT);
+    // Default relocation model on Darwin is PIC, not DynamicNoPIC.
+    RM = TheTriple.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC;
+  }
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+// This is duplicated code. Refactor this.
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &OS,
+                                    MCCodeEmitter *Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin())
+    return createMachOStreamer(Ctx, MAB, OS, Emitter, false);
+
+  if (TheTriple.isOSWindows()) {
+    llvm_unreachable("ARM does not support Windows COFF format");
+  }
+
+  return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, NoExecStack,
+                              TheTriple.getArch() == Triple::thumb);
+}
+
+static MCInstPrinter *createARMMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI,
+                                             const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new ARMInstPrinter(MAI, MII, MRI, STI);
+  return 0;
+}
+
+namespace {
+
+class ARMMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  ARMMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+  virtual bool isUnconditionalBranch(const MCInst &Inst) const {
+    // BCCs with the "always" predicate are unconditional branches.
+    if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+      return true;
+    return MCInstrAnalysis::isUnconditionalBranch(Inst);
+  }
+
+  virtual bool isConditionalBranch(const MCInst &Inst) const {
+    // BCCs with the "always" predicate are unconditional branches.
+    if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+      return false;
+    return MCInstrAnalysis::isConditionalBranch(Inst);
+  }
+
+  uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                          uint64_t Size) const {
+    // We only handle PCRel branches for now.
+    if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+      return -1ULL;
+
+    int64_t Imm = Inst.getOperand(0).getImm();
+    // FIXME: This is not right for thumb.
+    return Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
+  }
+};
+
+}
+
+static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
+  return new ARMMCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn A(TheARMTarget, createARMMCAsmInfo);
+  RegisterMCAsmInfoFn B(TheThumbTarget, createARMMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheARMTarget, createARMMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheThumbTarget, createARMMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheARMTarget, createARMMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheThumbTarget, createARMMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheARMTarget, createARMMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheThumbTarget, createARMMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheARMTarget,
+                                          ARM_MC::createARMMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheThumbTarget,
+                                          ARM_MC::createARMMCSubtargetInfo);
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(TheARMTarget,
+                                          createARMMCInstrAnalysis);
+  TargetRegistry::RegisterMCInstrAnalysis(TheThumbTarget,
+                                          createARMMCInstrAnalysis);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheARMTarget, createARMMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheThumbTarget, createARMMCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheARMTarget, createARMAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheThumbTarget, createARMAsmBackend);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheARMTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheThumbTarget, createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
new file mode 100644
index 000000000000..a89981e4f060
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -0,0 +1,77 @@
+//===-- ARMMCTargetDesc.h - ARM Target Descriptions -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides ARM specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMCTARGETDESC_H
+#define ARMMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+extern Target TheARMTarget, TheThumbTarget;
+
+namespace ARM_MC {
+  std::string ParseARMTriple(StringRef TT, StringRef CPU);
+
+  /// createARMMCSubtargetInfo - Create a ARM MCSubtargetInfo instance.
+  /// This is exposed so Asm parser, etc. do not need to go through
+  /// TargetRegistry.
+  MCSubtargetInfo *createARMMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                            StringRef FS);
+}
+
+MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      const MCSubtargetInfo &STI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU);
+
+/// createARMELFObjectWriter - Construct an ELF Mach-O object writer.
+MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS,
+                                         uint8_t OSABI);
+
+/// createARMMachObjectWriter - Construct an ARM Mach-O object writer.
+MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
+                                          bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
+} // End llvm namespace
+
+// Defines symbolic names for ARM registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "ARMGenRegisterInfo.inc"
+
+// Defines symbolic names for the ARM instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "ARMGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "ARMGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
new file mode 100644
index 000000000000..b9efe74b41e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -0,0 +1,497 @@
+//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachOSymbolFlags.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+using namespace llvm::object;
+
+namespace {
+class ARMMachObjectWriter : public MCMachObjectTargetWriter {
+  void RecordARMScatteredRelocation(MachObjectWriter *Writer,
+                                    const MCAssembler &Asm,
+                                    const MCAsmLayout &Layout,
+                                    const MCFragment *Fragment,
+                                    const MCFixup &Fixup,
+                                    MCValue Target,
+                                    unsigned Log2Size,
+                                    uint64_t &FixedValue);
+  void RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
+                                        const MCAssembler &Asm,
+                                        const MCAsmLayout &Layout,
+                                        const MCFragment *Fragment,
+                                        const MCFixup &Fixup, MCValue Target,
+                                        uint64_t &FixedValue);
+
+  bool requiresExternRelocation(MachObjectWriter *Writer,
+                                const MCAssembler &Asm,
+                                const MCFragment &Fragment,
+                                unsigned RelocType, const MCSymbolData *SD,
+                                uint64_t FixedValue);
+
+public:
+  ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+                      uint32_t CPUSubtype)
+    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                               /*UseAggressiveSymbolFolding=*/true) {}
+
+  void RecordRelocation(MachObjectWriter *Writer,
+                        const MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, uint64_t &FixedValue);
+};
+}
+
+static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
+                              unsigned &Log2Size) {
+  RelocType = unsigned(macho::RIT_Vanilla);
+  Log2Size = ~0U;
+
+  switch (Kind) {
+  default:
+    return false;
+
+  case FK_Data_1:
+    Log2Size = llvm::Log2_32(1);
+    return true;
+  case FK_Data_2:
+    Log2Size = llvm::Log2_32(2);
+    return true;
+  case FK_Data_4:
+    Log2Size = llvm::Log2_32(4);
+    return true;
+  case FK_Data_8:
+    Log2Size = llvm::Log2_32(8);
+    return true;
+
+    // Handle 24-bit branch kinds.
+  case ARM::fixup_arm_ldst_pcrel_12:
+  case ARM::fixup_arm_pcrel_10:
+  case ARM::fixup_arm_adr_pcrel_12:
+  case ARM::fixup_arm_condbranch:
+  case ARM::fixup_arm_uncondbranch:
+  case ARM::fixup_arm_uncondbl:
+  case ARM::fixup_arm_condbl:
+  case ARM::fixup_arm_blx:
+    RelocType = unsigned(macho::RIT_ARM_Branch24Bit);
+    // Report as 'long', even though that is not quite accurate.
+    Log2Size = llvm::Log2_32(4);
+    return true;
+
+    // Handle Thumb branches.
+  case ARM::fixup_arm_thumb_br:
+    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+    Log2Size = llvm::Log2_32(2);
+    return true;
+
+  case ARM::fixup_t2_uncondbranch:
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+    Log2Size = llvm::Log2_32(4);
+    return true;
+
+  // For movw/movt r_type relocations they always have a pair following them and
+  // the r_length bits are used differently.  The encoding of the r_length is as
+  // follows:
+  //   low bit of r_length:
+  //      0 - :lower16: for movw instructions
+  //      1 - :upper16: for movt instructions
+  //   high bit of r_length:
+  //      0 - arm instructions
+  //      1 - thumb instructions
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+    RelocType = unsigned(macho::RIT_ARM_Half);
+    Log2Size = 1;
+    return true;
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
+    RelocType = unsigned(macho::RIT_ARM_Half);
+    Log2Size = 3;
+    return true;
+
+  case ARM::fixup_arm_movw_lo16:
+  case ARM::fixup_arm_movw_lo16_pcrel:
+    RelocType = unsigned(macho::RIT_ARM_Half);
+    Log2Size = 0;
+    return true;
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movw_lo16_pcrel:
+    RelocType = unsigned(macho::RIT_ARM_Half);
+    Log2Size = 2;
+    return true;
+  }
+}
+
+void ARMMachObjectWriter::
+RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup,
+                                 MCValue Target,
+                                 uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = macho::RIT_ARM_Half;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                       "symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint32_t Value2 = 0;
+  uint64_t SecAddr =
+    Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                         "symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // Select the appropriate difference relocation type.
+    Type = macho::RIT_ARM_HalfDifference;
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  // ARM_RELOC_HALF and ARM_RELOC_HALF_SECTDIFF abuse the r_length field:
+  //
+  // For these two r_type relocations they always have a pair following them and
+  // the r_length bits are used differently.  The encoding of the r_length is as
+  // follows:
+  //   low bit of r_length:
+  //      0 - :lower16: for movw instructions
+  //      1 - :upper16: for movt instructions
+  //   high bit of r_length:
+  //      0 - arm instructions
+  //      1 - thumb instructions
+  // the other half of the relocated expression is in the following pair
+  // relocation entry in the low 16 bits of r_address field.
+  unsigned ThumbBit = 0;
+  unsigned MovtBit = 0;
+  switch ((unsigned)Fixup.getKind()) {
+  default: break;
+  case ARM::fixup_arm_movt_hi16:
+  case ARM::fixup_arm_movt_hi16_pcrel:
+    MovtBit = 1;
+    // The thumb bit shouldn't be set in the 'other-half' bit of the
+    // relocation, but it will be set in FixedValue if the base symbol
+    // is a thumb function. Clear it out here.
+    if (A_SD->getFlags() & SF_ThumbFunc)
+      FixedValue &= 0xfffffffe;
+    break;
+  case ARM::fixup_t2_movt_hi16:
+  case ARM::fixup_t2_movt_hi16_pcrel:
+    if (A_SD->getFlags() & SF_ThumbFunc)
+      FixedValue &= 0xfffffffe;
+    MovtBit = 1;
+    // Fallthrough
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movw_lo16_pcrel:
+    ThumbBit = 1;
+    break;
+  }
+
+  if (Type == macho::RIT_ARM_HalfDifference) {
+    uint32_t OtherHalf = MovtBit
+      ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16);
+
+    macho::RelocationEntry MRE;
+    MRE.Word0 = ((OtherHalf       <<  0) |
+                 (macho::RIT_Pair << 24) |
+                 (MovtBit         << 28) |
+                 (ThumbBit        << 29) |
+                 (IsPCRel         << 30) |
+                 macho::RF_Scattered);
+    MRE.Word1 = Value2;
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  }
+
+  macho::RelocationEntry MRE;
+  MRE.Word0 = ((FixupOffset <<  0) |
+               (Type        << 24) |
+               (MovtBit     << 28) |
+               (ThumbBit    << 29) |
+               (IsPCRel     << 30) |
+               macho::RF_Scattered);
+  MRE.Word1 = Value;
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
+                                                    const MCAssembler &Asm,
+                                                    const MCAsmLayout &Layout,
+                                                    const MCFragment *Fragment,
+                                                    const MCFixup &Fixup,
+                                                    MCValue Target,
+                                                    unsigned Log2Size,
+                                                    uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = macho::RIT_Vanilla;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                       "symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                         "symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // Select the appropriate difference relocation type.
+    Type = macho::RIT_Difference;
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == macho::RIT_Difference ||
+      Type == macho::RIT_Generic_LocalDifference) {
+    macho::RelocationEntry MRE;
+    MRE.Word0 = ((0         <<  0) |
+                 (macho::RIT_Pair  << 24) |
+                 (Log2Size  << 28) |
+                 (IsPCRel   << 30) |
+                 macho::RF_Scattered);
+    MRE.Word1 = Value2;
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  }
+
+  macho::RelocationEntry MRE;
+  MRE.Word0 = ((FixupOffset <<  0) |
+               (Type        << 24) |
+               (Log2Size    << 28) |
+               (IsPCRel     << 30) |
+               macho::RF_Scattered);
+  MRE.Word1 = Value;
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
+                                                   const MCAssembler &Asm,
+                                                   const MCFragment &Fragment,
+                                                   unsigned RelocType,
+                                                   const MCSymbolData *SD,
+                                                   uint64_t FixedValue) {
+  // Most cases can be identified purely from the symbol.
+  if (Writer->doesSymbolRequireExternRelocation(SD))
+    return true;
+  int64_t Value = (int64_t)FixedValue;  // The displacement is signed.
+  int64_t Range;
+  switch (RelocType) {
+  default:
+    return false;
+  case macho::RIT_ARM_Branch24Bit:
+    // PC pre-adjustment of 8 for these instructions.
+    Value -= 8;
+    // ARM BL/BLX has a 25-bit offset.
+    Range = 0x1ffffff;
+    break;
+  case macho::RIT_ARM_ThumbBranch22Bit:
+    // PC pre-adjustment of 4 for these instructions.
+    Value -= 4;
+    // Thumb BL/BLX has a 24-bit offset.
+    Range = 0xffffff;
+  }
+  // BL/BLX also use external relocations when an internal relocation
+  // would result in the target being out of range. This gives the linker
+  // enough information to generate a branch island.
+  const MCSectionData &SymSD = Asm.getSectionData(
+    SD->getSymbol().getSection());
+  Value += Writer->getSectionAddress(&SymSD);
+  Value -= Writer->getSectionAddress(Fragment.getParent());
+  // If the resultant value would be out of range for an internal relocation,
+  // use an external instead.
+  if (Value > Range || Value < -(Range + 1))
+    return true;
+  return false;
+}
+
+void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
+                                           const MCAssembler &Asm,
+                                           const MCAsmLayout &Layout,
+                                           const MCFragment *Fragment,
+                                           const MCFixup &Fixup,
+                                           MCValue Target,
+                                           uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Log2Size;
+  unsigned RelocType = macho::RIT_Vanilla;
+  if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size))
+    // If we failed to get fixup kind info, it's because there's no legal
+    // relocation type for the fixup kind. This happens when it's a fixup that's
+    // expected to always be resolvable at assembly time and not have any
+    // relocations needed.
+    Asm.getContext().FatalError(Fixup.getLoc(),
+                                "unsupported relocation on symbol");
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry.  Differences always require scattered
+  // relocations.
+  if (Target.getSymB()) {
+    if (RelocType == macho::RIT_ARM_Half)
+      return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment,
+                                              Fixup, Target, FixedValue);
+    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                        Target, Log2Size, FixedValue);
+  }
+
+  // Get the symbol data, if any.
+  MCSymbolData *SD = 0;
+  if (Target.getSymA())
+    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+  // FIXME: For other platforms, we need to use scattered relocations for
+  // internal relocations with offsets.  If this is an internal relocation with
+  // an offset, it also needs a scattered relocation entry.
+  //
+  // Is this right for ARM?
+  uint32_t Offset = Target.getConstant();
+  if (IsPCRel && RelocType == macho::RIT_Vanilla)
+    Offset += 1 << Log2Size;
+  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
+    return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                        Target, Log2Size, FixedValue);
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+
+  if (Target.isAbsolute()) { // constant
+    // FIXME!
+    report_fatal_error("FIXME: relocations to absolute targets "
+                       "not yet implemented");
+  } else {
+    // Resolve constant variables.
+    if (SD->getSymbol().isVariable()) {
+      int64_t Res;
+      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+            Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (requiresExternRelocation(Writer, Asm, *Fragment, RelocType, SD,
+                                 FixedValue)) {
+      IsExtern = 1;
+      Index = SD->getIndex();
+
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!SD->Symbol->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(SD);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD = Asm.getSectionData(
+        SD->getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&SymSD);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+    // The type is determined by the fixup kind.
+    Type = RelocType;
+  }
+
+  // struct relocation_info (8 bytes)
+  macho::RelocationEntry MRE;
+  MRE.Word0 = FixupOffset;
+  MRE.Word1 = ((Index     <<  0) |
+               (IsPCRel   << 24) |
+               (Log2Size  << 25) |
+               (IsExtern  << 27) |
+               (Type      << 28));
+
+  // Even when it's not a scattered relocation, movw/movt always uses
+  // a PAIR relocation.
+  if (Type == macho::RIT_ARM_Half) {
+    // The other-half value only gets populated for the movt and movw
+    // relocation entries.
+    uint32_t Value = 0;
+    switch ((unsigned)Fixup.getKind()) {
+    default: break;
+    case ARM::fixup_arm_movw_lo16:
+    case ARM::fixup_arm_movw_lo16_pcrel:
+    case ARM::fixup_t2_movw_lo16:
+    case ARM::fixup_t2_movw_lo16_pcrel:
+      Value = (FixedValue >> 16) & 0xffff;
+      break;
+    case ARM::fixup_arm_movt_hi16:
+    case ARM::fixup_arm_movt_hi16_pcrel:
+    case ARM::fixup_t2_movt_hi16:
+    case ARM::fixup_t2_movt_hi16_pcrel:
+      Value = FixedValue & 0xffff;
+      break;
+    }
+    macho::RelocationEntry MREPair;
+    MREPair.Word0 = Value;
+    MREPair.Word1 = ((0xffffff) |
+                     (Log2Size << 25) |
+                     (macho::RIT_Pair << 28));
+
+    Writer->addRelocation(Fragment->getParent(), MREPair);
+  }
+
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createARMMachObjectWriter(raw_ostream &OS,
+                                                bool Is64Bit,
+                                                uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(new ARMMachObjectWriter(Is64Bit,
+                                                        CPUType,
+                                                        CPUSubtype),
+                                OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h
new file mode 100644
index 000000000000..dad5576df4cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOp.h
@@ -0,0 +1,112 @@
+//===-- ARMUnwindOp.h - ARM Unwind Opcodes ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the constants for the ARM unwind opcodes and exception
+// handling table entry kinds.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_UNWIND_OP_H
+#define ARM_UNWIND_OP_H
+
+namespace llvm {
+
+  /// ARM exception handling table entry kinds
+  enum ARMEHTEntryKind {
+    EHT_GENERIC = 0x00,
+    EHT_COMPACT = 0x80
+  };
+
+  enum {
+    /// Special entry for the function never unwind
+    EXIDX_CANTUNWIND = 0x1
+  };
+
+  /// ARM-defined frame unwinding opcodes
+  enum ARMUnwindOpcodes {
+    // Format: 00xxxxxx
+    // Purpose: vsp = vsp + ((x << 2) + 4)
+    UNWIND_OPCODE_INC_VSP = 0x00,
+
+    // Format: 01xxxxxx
+    // Purpose: vsp = vsp - ((x << 2) + 4)
+    UNWIND_OPCODE_DEC_VSP = 0x40,
+
+    // Format: 10000000 00000000
+    // Purpose: refuse to unwind
+    UNWIND_OPCODE_REFUSE = 0x8000,
+
+    // Format: 1000xxxx xxxxxxxx
+    // Purpose: pop r[15:12], r[11:4]
+    // Constraint: x != 0
+    UNWIND_OPCODE_POP_REG_MASK_R4 = 0x8000,
+
+    // Format: 1001xxxx
+    // Purpose: vsp = r[x]
+    // Constraint: x != 13 && x != 15
+    UNWIND_OPCODE_SET_VSP = 0x90,
+
+    // Format: 10100xxx
+    // Purpose: pop r[(4+x):4]
+    UNWIND_OPCODE_POP_REG_RANGE_R4 = 0xa0,
+
+    // Format: 10101xxx
+    // Purpose: pop r14, r[(4+x):4]
+    UNWIND_OPCODE_POP_REG_RANGE_R4_R14 = 0xa8,
+
+    // Format: 10110000
+    // Purpose: finish
+    UNWIND_OPCODE_FINISH = 0xb0,
+
+    // Format: 10110001 0000xxxx
+    // Purpose: pop r[3:0]
+    // Constraint: x != 0
+    UNWIND_OPCODE_POP_REG_MASK = 0xb100,
+
+    // Format: 10110010 x(uleb128)
+    // Purpose: vsp = vsp + ((x << 2) + 0x204)
+    UNWIND_OPCODE_INC_VSP_ULEB128 = 0xb2,
+
+    // Format: 10110011 xxxxyyyy
+    // Purpose: pop d[(x+y):x]
+    UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDX = 0xb300,
+
+    // Format: 10111xxx
+    // Purpose: pop d[(8+x):8]
+    UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDX_D8 = 0xb8,
+
+    // Format: 11000xxx
+    // Purpose: pop wR[(10+x):10]
+    UNWIND_OPCODE_POP_WIRELESS_MMX_REG_RANGE_WR10 = 0xc0,
+
+    // Format: 11000110 xxxxyyyy
+    // Purpose: pop wR[(x+y):x]
+    UNWIND_OPCODE_POP_WIRELESS_MMX_REG_RANGE = 0xc600,
+
+    // Format: 11000111 0000xxxx
+    // Purpose: pop wCGR[3:0]
+    // Constraint: x != 0
+    UNWIND_OPCODE_POP_WIRELESS_MMX_REG_MASK = 0xc700,
+
+    // Format: 11001000 xxxxyyyy
+    // Purpose: pop d[(16+x+y):(16+x)]
+    UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 = 0xc800,
+
+    // Format: 11001001 xxxxyyyy
+    // Purpose: pop d[(x+y):x]
+    UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD = 0xc900,
+
+    // Format: 11010xxx
+    // Purpose: pop d[(8+x):8]
+    UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D8 = 0xd0
+  };
+
+}
+
+#endif // ARM_UNWIND_OP_H
diff --git a/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
new file mode 100644
index 000000000000..2e266c2e9624
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/MLxExpansionPass.cpp
@@ -0,0 +1,399 @@
+//===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
+// multiple and add / sub instructions) when special VMLx hazards are detected.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mlx-expansion"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+static cl::opt<bool>
+ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden);
+static cl::opt<unsigned>
+ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);
+
+STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");
+
+namespace {
+  struct MLxExpansion : public MachineFunctionPass {
+    static char ID;
+    MLxExpansion() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM MLA / MLS expansion pass";
+    }
+
+  private:
+    const ARMBaseInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    MachineRegisterInfo *MRI;
+
+    bool isLikeA9;
+    bool isSwift;
+    unsigned MIIdx;
+    MachineInstr* LastMIs[4];
+    SmallPtrSet<MachineInstr*, 4> IgnoreStall;
+
+    void clearStack();
+    void pushStack(MachineInstr *MI);
+    MachineInstr *getAccDefMI(MachineInstr *MI) const;
+    unsigned getDefReg(MachineInstr *MI) const;
+    bool hasLoopHazard(MachineInstr *MI) const;
+    bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
+    bool FindMLxHazard(MachineInstr *MI);
+    void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                unsigned MulOpc, unsigned AddSubOpc,
+                                bool NegAcc, bool HasLane);
+    bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
+  };
+  char MLxExpansion::ID = 0;
+}
+
+void MLxExpansion::clearStack() {
+  std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0);
+  MIIdx = 0;
+}
+
+void MLxExpansion::pushStack(MachineInstr *MI) {
+  LastMIs[MIIdx] = MI;
+  if (++MIIdx == 4)
+    MIIdx = 0;
+}
+
+MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
+  // Look past COPY and INSERT_SUBREG instructions to find the
+  // real definition MI. This is important for _sfp instructions.
+  unsigned Reg = MI->getOperand(1).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return 0;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *DefMI = MRI->getVRegDef(Reg);
+  while (true) {
+    if (DefMI->getParent() != MBB)
+      break;
+    if (DefMI->isCopyLike()) {
+      Reg = DefMI->getOperand(1).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    } else if (DefMI->isInsertSubreg()) {
+      Reg = DefMI->getOperand(2).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    }
+    break;
+  }
+  return DefMI;
+}
+
+unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+      !MRI->hasOneNonDBGUse(Reg))
+    return Reg;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg);
+  if (UseMI->getParent() != MBB)
+    return Reg;
+
+  while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
+    Reg = UseMI->getOperand(0).getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+        !MRI->hasOneNonDBGUse(Reg))
+      return Reg;
+    UseMI = &*MRI->use_nodbg_begin(Reg);
+    if (UseMI->getParent() != MBB)
+      return Reg;
+  }
+
+  return Reg;
+}
+
+/// hasLoopHazard - Check whether an MLx instruction is chained to itself across
+/// a single-MBB loop.
+bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const {
+  unsigned Reg = MI->getOperand(1).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return false;
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *DefMI = MRI->getVRegDef(Reg);
+  while (true) {
+outer_continue:
+    if (DefMI->getParent() != MBB)
+      break;
+
+    if (DefMI->isPHI()) {
+      for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
+        if (DefMI->getOperand(i + 1).getMBB() == MBB) {
+          unsigned SrcReg = DefMI->getOperand(i).getReg();
+          if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+            DefMI = MRI->getVRegDef(SrcReg);
+            goto outer_continue;
+          }
+        }
+      }
+    } else if (DefMI->isCopyLike()) {
+      Reg = DefMI->getOperand(1).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    } else if (DefMI->isInsertSubreg()) {
+      Reg = DefMI->getOperand(2).getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        DefMI = MRI->getVRegDef(Reg);
+        continue;
+      }
+    }
+
+    break;
+  }
+
+  return DefMI == MI;
+}
+
+bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
+  // FIXME: Detect integer instructions properly.
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+  if (MI->mayStore())
+    return false;
+  unsigned Opcode = MCID.getOpcode();
+  if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+    return false;
+  if ((Domain & ARMII::DomainVFP) || (Domain & ARMII::DomainNEON))
+    return MI->readsRegister(Reg, TRI);
+  return false;
+}
+
+static bool isFpMulInstruction(unsigned Opcode) {
+  switch (Opcode) {
+  case ARM::VMULS:
+  case ARM::VMULfd:
+  case ARM::VMULfq:
+  case ARM::VMULD:
+  case ARM::VMULslfd:
+  case ARM::VMULslfq:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
+  if (NumExpand >= ExpandLimit)
+    return false;
+
+  if (ForceExapnd)
+    return true;
+
+  MachineInstr *DefMI = getAccDefMI(MI);
+  if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
+    // r0 = vmla
+    // r3 = vmla r0, r1, r2
+    // takes 16 - 17 cycles
+    //
+    // r0 = vmla
+    // r4 = vmul r1, r2
+    // r3 = vadd r0, r4
+    // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
+    IgnoreStall.insert(DefMI);
+    return true;
+  }
+
+  // On Swift, we mostly care about hazards from multiplication instructions
+  // writing the accumulator and the pipelining of loop iterations by out-of-
+  // order execution. 
+  if (isSwift)
+    return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI);
+
+  if (IgnoreStall.count(MI))
+    return false;
+
+  // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
+  // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
+  // preserves the in-order retirement of the instructions.
+  // Look at the next few instructions, if *most* of them can cause hazards,
+  // then the scheduler can't *fix* this, we'd better break up the VMLA.
+  unsigned Limit1 = isLikeA9 ? 1 : 4;
+  unsigned Limit2 = isLikeA9 ? 1 : 4;
+  for (unsigned i = 1; i <= 4; ++i) {
+    int Idx = ((int)MIIdx - i + 4) % 4;
+    MachineInstr *NextMI = LastMIs[Idx];
+    if (!NextMI)
+      continue;
+
+    if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
+      if (i <= Limit1)
+        return true;
+    }
+
+    // Look for VMLx RAW hazard.
+    if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
+      return true;
+  }
+
+  return false;
+}
+
+/// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair
+/// of MUL + ADD / SUB instructions.
+void
+MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
+                                     unsigned MulOpc, unsigned AddSubOpc,
+                                     bool NegAcc, bool HasLane) {
+  unsigned DstReg = MI->getOperand(0).getReg();
+  bool DstDead = MI->getOperand(0).isDead();
+  unsigned AccReg = MI->getOperand(1).getReg();
+  unsigned Src1Reg = MI->getOperand(2).getReg();
+  unsigned Src2Reg = MI->getOperand(3).getReg();
+  bool Src1Kill = MI->getOperand(2).isKill();
+  bool Src2Kill = MI->getOperand(3).isKill();
+  unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
+  unsigned NextOp = HasLane ? 5 : 4;
+  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
+  unsigned PredReg = MI->getOperand(++NextOp).getReg();
+
+  const MCInstrDesc &MCID1 = TII->get(MulOpc);
+  const MCInstrDesc &MCID2 = TII->get(AddSubOpc);
+  const MachineFunction &MF = *MI->getParent()->getParent();
+  unsigned TmpReg = MRI->createVirtualRegister(
+                      TII->getRegClass(MCID1, 0, TRI, MF));
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID1, TmpReg)
+    .addReg(Src1Reg, getKillRegState(Src1Kill))
+    .addReg(Src2Reg, getKillRegState(Src2Kill));
+  if (HasLane)
+    MIB.addImm(LaneImm);
+  MIB.addImm(Pred).addReg(PredReg);
+
+  MIB = BuildMI(MBB, MI, MI->getDebugLoc(), MCID2)
+    .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead));
+
+  if (NegAcc) {
+    bool AccKill = MRI->hasOneNonDBGUse(AccReg);
+    MIB.addReg(TmpReg, getKillRegState(true))
+       .addReg(AccReg, getKillRegState(AccKill));
+  } else {
+    MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true));
+  }
+  MIB.addImm(Pred).addReg(PredReg);
+
+  DEBUG({
+      dbgs() << "Expanding: " << *MI;
+      dbgs() << "  to:\n";
+      MachineBasicBlock::iterator MII = MI;
+      MII = llvm::prior(MII);
+      MachineInstr &MI2 = *MII;
+      MII = llvm::prior(MII);
+      MachineInstr &MI1 = *MII;
+      dbgs() << "    " << MI1;
+      dbgs() << "    " << MI2;
+   });
+
+  MI->eraseFromParent();
+  ++NumExpand;
+}
+
+bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  clearStack();
+  IgnoreStall.clear();
+
+  unsigned Skip = 0;
+  MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
+  while (MII != E) {
+    MachineInstr *MI = &*MII;
+
+    if (MI->isLabel() || MI->isImplicitDef() || MI->isCopy()) {
+      ++MII;
+      continue;
+    }
+
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (MI->isBarrier()) {
+      clearStack();
+      Skip = 0;
+      ++MII;
+      continue;
+    }
+
+    unsigned Domain = MCID.TSFlags & ARMII::DomainMask;
+    if (Domain == ARMII::DomainGeneral) {
+      if (++Skip == 2)
+        // Assume dual issues of non-VFP / NEON instructions.
+        pushStack(0);
+    } else {
+      Skip = 0;
+
+      unsigned MulOpc, AddSubOpc;
+      bool NegAcc, HasLane;
+      if (!TII->isFpMLxInstruction(MCID.getOpcode(),
+                                   MulOpc, AddSubOpc, NegAcc, HasLane) ||
+          !FindMLxHazard(MI))
+        pushStack(MI);
+      else {
+        ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane);
+        E = MBB.rend(); // May have changed if MI was the 1st instruction.
+        Changed = true;
+        continue;
+      }
+    }
+
+    ++MII;
+  }
+
+  return Changed;
+}
+
+bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
+  TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
+  TRI = Fn.getTarget().getRegisterInfo();
+  MRI = &Fn.getRegInfo();
+  const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
+  isLikeA9 = STI->isLikeA9() || STI->isSwift();
+  isSwift = STI->isSwift();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= ExpandFPMLxInstructions(MBB);
+  }
+
+  return Modified;
+}
+
+FunctionPass *llvm::createMLxExpansionPass() {
+  return new MLxExpansion();
+}
diff --git a/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
new file mode 100644
index 000000000000..fa5681fb12bf
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheARMTarget, llvm::TheThumbTarget;
+
+extern "C" void LLVMInitializeARMTargetInfo() { 
+  RegisterTarget<Triple::arm, /*HasJIT=*/true>
+    X(TheARMTarget, "arm", "ARM");
+
+  RegisterTarget<Triple::thumb, /*HasJIT=*/true>
+    Y(TheThumbTarget, "thumb", "Thumb");
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
new file mode 100644
index 000000000000..2c3388cc452c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -0,0 +1,407 @@
+//===-- Thumb1FrameLowering.cpp - Thumb1 Frame Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb1 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1FrameLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offset to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even makes it impossible to scavenge a register.
+  if (CFSize >= ((1 << 8) - 1) * 4 / 2) // Half of imm8 * 4
+    return false;
+
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+static void
+emitSPUpdate(MachineBasicBlock &MBB,
+             MachineBasicBlock::iterator &MBBI,
+             const TargetInstrInfo &TII, DebugLoc dl,
+             const Thumb1RegisterInfo &MRI,
+             int NumBytes, unsigned MIFlags = MachineInstr::NoFlags)  {
+  emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes, TII,
+                            MRI, MIFlags);
+}
+
+
+void Thumb1FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const Thumb1InstrInfo &TII =
+    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+  const Thumb1RegisterInfo *RegInfo =
+    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr *Old = I;
+    DebugLoc dl = Old->getDebugLoc();
+    unsigned Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old->getOpcode();
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        emitSPUpdate(MBB, I, TII, dl, *RegInfo, -Amount);
+      } else {
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(MBB, I, TII, dl, *RegInfo, Amount);
+      }
+    }
+  }
+  MBB.erase(I);
+}
+
+void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const Thumb1RegisterInfo *RegInfo =
+    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const Thumb1InstrInfo &TII =
+    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  unsigned NumBytes = MFI->getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned BasePtr = RegInfo->getBaseRegister();
+
+  // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
+  NumBytes = (NumBytes + 3) & ~3;
+  MFI->setStackSize(NumBytes);
+
+  // Determine the sizes of each callee-save spill areas and record which frame
+  // belongs to which callee-save spill areas.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  if (VARegSaveSize)
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -VARegSaveSize,
+                 MachineInstr::FrameSetup);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                   MachineInstr::FrameSetup);
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      AFI->addGPRCalleeSavedArea1Frame(FI);
+      GPRCS1Size += 4;
+      break;
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      if (STI.isTargetIOS()) {
+        AFI->addGPRCalleeSavedArea2Frame(FI);
+        GPRCS2Size += 4;
+      } else {
+        AFI->addGPRCalleeSavedArea1Frame(FI);
+        GPRCS1Size += 4;
+      }
+      break;
+    default:
+      AFI->addDPRCalleeSavedAreaFrame(FI);
+      DPRCSSize += 8;
+    }
+  }
+
+  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+    ++MBBI;
+    if (MBBI != MBB.end())
+      dl = MBBI->getDebugLoc();
+  }
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  bool HasFP = hasFP(MF);
+  if (HasFP)
+    AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
+                                NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+  NumBytes = DPRCSOffset;
+
+  // Adjust FP so it point to the stack slot that contains the previous FP.
+  if (HasFP) {
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+      .addFrameIndex(FramePtrSpillFI).addImm(0)
+      .setMIFlags(MachineInstr::FrameSetup));
+    if (NumBytes > 508)
+      // If offset is > 508 then sp cannot be adjusted in a single instruction,
+      // try restoring from fp instead.
+      AFI->setShouldRestoreSPFromFP(true);
+  }
+
+  if (NumBytes)
+    // Insert it after all the callee-save spills.
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
+                 MachineInstr::FrameSetup);
+
+  if (STI.isTargetELF() && HasFP)
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // Thumb1 does not currently support dynamic stack realignment.  Report a
+  // fatal error rather then silently generate bad code.
+  if (RegInfo->needsStackRealignment(MF))
+      report_fatal_error("Dynamic stack realignment not supported for thumb1.");
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF))
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr)
+                   .addReg(ARM::SP));
+
+  // If the frame has variable sized objects then the epilogue must restore
+  // the sp from fp. We can assume there's an FP here since hasFP already
+  // checks for hasVarSizedObjects.
+  if (MFI->hasVarSizedObjects())
+    AFI->setShouldRestoreSPFromFP(true);
+}
+
+static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) {
+  if (MI->getOpcode() == ARM::tLDRspi &&
+      MI->getOperand(1).isFI() &&
+      isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs))
+    return true;
+  else if (MI->getOpcode() == ARM::tPOP) {
+    // The first two operands are predicates. The last two are
+    // imp-def and imp-use of SP. Check everything in between.
+    for (int i = 2, e = MI->getNumOperands() - 2; i != e; ++i)
+      if (!isCalleeSavedRegister(MI->getOperand(i).getReg(), CSRegs))
+        return false;
+    return true;
+  }
+  return false;
+}
+
+void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert((MBBI->getOpcode() == ARM::tBX_RET ||
+          MBBI->getOpcode() == ARM::tPOP_RET) &&
+         "Can only insert epilog into returning blocks");
+  DebugLoc dl = MBBI->getDebugLoc();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const Thumb1RegisterInfo *RegInfo =
+    static_cast<const Thumb1RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const Thumb1InstrInfo &TII =
+    *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  int NumBytes = (int)MFI->getStackSize();
+  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
+      if (!isCSRestore(MBBI, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize());
+
+    if (AFI->shouldRestoreSPFromFP()) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      // Reset SP based on frame pointer only if the stack frame extends beyond
+      // frame pointer stack slot, the target is ELF and the function has FP, or
+      // the target uses var sized objects.
+      if (NumBytes) {
+        assert(MF.getRegInfo().isPhysRegUsed(ARM::R4) &&
+               "No scratch register to restore SP from FP!");
+        emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
+                                  TII, *RegInfo);
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                               ARM::SP)
+          .addReg(ARM::R4));
+      } else
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
+                               ARM::SP)
+          .addReg(FramePtr));
+    } else {
+      if (MBBI->getOpcode() == ARM::tBX_RET &&
+          &MBB.front() != MBBI &&
+          prior(MBBI)->getOpcode() == ARM::tPOP) {
+        MachineBasicBlock::iterator PMBBI = prior(MBBI);
+        emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+      } else
+        emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
+    }
+  }
+
+  if (VARegSaveSize) {
+    // Unlike T2 and ARM mode, the T1 pop instruction cannot restore
+    // to LR, and we can't pop the value directly to the PC since
+    // we need to update the SP after popping the value. Therefore, we
+    // pop the old LR into R3 as a temporary.
+
+    // Move back past the callee-saved register restoration
+    while (MBBI != MBB.end() && isCSRestore(MBBI, CSRegs))
+      ++MBBI;
+    // Epilogue for vararg functions: pop LR to R3 and branch off it.
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+      .addReg(ARM::R3, RegState::Define);
+
+    emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, VARegSaveSize);
+
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
+      .addReg(ARM::R3, RegState::Kill);
+    AddDefaultPred(MIB);
+    MIB.copyImplicitOps(&*MBBI);
+    // erase the old tBX_RET instruction
+    MBB.erase(MBBI);
+  }
+}
+
+bool Thumb1FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
+  AddDefaultPred(MIB);
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    bool isKill = true;
+
+    // Add the callee-saved register as live-in unless it's LR and
+    // @llvm.returnaddress is called. If LR is returned for @llvm.returnaddress
+    // then it's already added to the function and entry block live-in sets.
+    if (Reg == ARM::LR) {
+      MachineFunction &MF = *MBB.getParent();
+      if (MF.getFrameInfo()->isReturnAddressTaken() &&
+          MF.getRegInfo().isLiveIn(Reg))
+        isKill = false;
+    }
+
+    if (isKill)
+      MBB.addLiveIn(Reg);
+
+    MIB.addReg(Reg, getKillRegState(isKill));
+  }
+  MIB.setMIFlags(MachineInstr::FrameSetup);
+  return true;
+}
+
+bool Thumb1FrameLowering::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
+  DebugLoc DL = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
+  AddDefaultPred(MIB);
+
+  bool NumRegs = false;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (Reg == ARM::LR) {
+      // Special epilogue for vararg functions. See emitEpilogue
+      if (isVarArg)
+        continue;
+      Reg = ARM::PC;
+      (*MIB).setDesc(TII.get(ARM::tPOP_RET));
+      MIB.copyImplicitOps(&*MI);
+      MI = MBB.erase(MI);
+    }
+    MIB.addReg(Reg, getDefRegState(true));
+    NumRegs = true;
+  }
+
+  // It's illegal to emit pop instruction without operands.
+  if (NumRegs)
+    MBB.insert(MI, &*MIB);
+  else
+    MF.DeleteMachineInstr(MIB);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
new file mode 100644
index 000000000000..5a300afd5d36
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -0,0 +1,56 @@
+//===-- Thumb1FrameLowering.h - Thumb1-specific frame info stuff --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __THUMB_FRAMEINFO_H_
+#define __THUMB_FRAMEINFO_H_
+
+#include "ARM.h"
+#include "ARMFrameLowering.h"
+#include "ARMSubtarget.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb1RegisterInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class Thumb1FrameLowering : public ARMFrameLowering {
+public:
+  explicit Thumb1FrameLowering(const ARMSubtarget &sti)
+    : ARMFrameLowering(sti) {
+  }
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
new file mode 100644
index 000000000000..095736d52a88
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -0,0 +1,104 @@
+//===-- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1InstrInfo.h"
+#include "ARM.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
+  : ARMBaseInstrInfo(STI), RI(*this, STI) {
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(ARM::tMOVr);
+  NopInst.addOperand(MCOperand::CreateReg(ARM::R8));
+  NopInst.addOperand(MCOperand::CreateReg(ARM::R8));
+  NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+  NopInst.addOperand(MCOperand::CreateReg(0));
+}
+
+unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  return 0;
+}
+
+void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc)));
+  assert(ARM::GPRRegClass.contains(DestReg, SrcReg) &&
+         "Thumb1 can only copy GPR registers");
+}
+
+void Thumb1InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  assert((RC == &ARM::tGPRRegClass ||
+          (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+           isARMLowRegister(SrcReg))) && "Unknown regclass!");
+
+  if (RC == &ARM::tGPRRegClass ||
+      (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+       isARMLowRegister(SrcReg))) {
+    DebugLoc DL;
+    if (I != MBB.end()) DL = I->getDebugLoc();
+
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                              MachineMemOperand::MOStore,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  }
+}
+
+void Thumb1InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  assert((RC == &ARM::tGPRRegClass ||
+          (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+           isARMLowRegister(DestReg))) && "Unknown regclass!");
+
+  if (RC == &ARM::tGPRRegClass ||
+      (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+       isARMLowRegister(DestReg))) {
+    DebugLoc DL;
+    if (I != MBB.end()) DL = I->getDebugLoc();
+
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                              MachineMemOperand::MOLoad,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  }
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
new file mode 100644
index 000000000000..36af20492d4e
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -0,0 +1,61 @@
+//===-- Thumb1InstrInfo.h - Thumb-1 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB1INSTRUCTIONINFO_H
+#define THUMB1INSTRUCTIONINFO_H
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "Thumb1RegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class Thumb1InstrInfo : public ARMBaseInstrInfo {
+  Thumb1RegisterInfo RI;
+public:
+  explicit Thumb1InstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const Thumb1RegisterInfo &getRegisterInfo() const { return RI; }
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const;
+
+};
+}
+
+#endif // THUMB1INSTRUCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp
new file mode 100644
index 000000000000..7452fb776ebd
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -0,0 +1,679 @@
+//===-- Thumb1RegisterInfo.cpp - Thumb-1 Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1RegisterInfo.h"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+extern cl::opt<bool> ReuseFrameIndexVals;
+}
+
+using namespace llvm;
+
+Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii,
+                                       const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(tii, sti) {
+}
+
+const TargetRegisterClass*
+Thumb1RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
+                                                                         const {
+  if (ARM::tGPRRegClass.hasSubClassEq(RC))
+    return &ARM::tGPRRegClass;
+  return ARMBaseRegisterInfo::getLargestLegalSuperClass(RC);
+}
+
+const TargetRegisterClass *
+Thumb1RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                         const {
+  return &ARM::tGPRRegClass;
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void
+Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      DebugLoc dl,
+                                      unsigned DestReg, unsigned SubIdx,
+                                      int Val,
+                                      ARMCC::CondCodes Pred, unsigned PredReg,
+                                      unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  const Constant *C = ConstantInt::get(
+          Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg)
+    .setMIFlags(MIFlags);
+}
+
+
+/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. Materialize the immediate
+/// in a register using mov / mvn sequences or load the immediate from a
+/// constpool entry.
+static
+void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              DebugLoc dl,
+                              unsigned DestReg, unsigned BaseReg,
+                              int NumBytes, bool CanChangeCC,
+                              const TargetInstrInfo &TII,
+                              const ARMBaseRegisterInfo& MRI,
+                              unsigned MIFlags = MachineInstr::NoFlags) {
+    MachineFunction &MF = *MBB.getParent();
+    bool isHigh = !isARMLowRegister(DestReg) ||
+                  (BaseReg != 0 && !isARMLowRegister(BaseReg));
+    bool isSub = false;
+    // Subtract doesn't have high register version. Load the negative value
+    // if either base or dest register is a high register. Also, if do not
+    // issue sub as part of the sequence if condition register is to be
+    // preserved.
+    if (NumBytes < 0 && !isHigh && CanChangeCC) {
+      isSub = true;
+      NumBytes = -NumBytes;
+    }
+    unsigned LdReg = DestReg;
+    if (DestReg == ARM::SP) {
+      assert(BaseReg == ARM::SP && "Unexpected!");
+      LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+    }
+
+    if (NumBytes <= 255 && NumBytes >= 0)
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+        .addImm(NumBytes).setMIFlags(MIFlags);
+    else if (NumBytes < 0 && NumBytes >= -255) {
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+        .addImm(NumBytes).setMIFlags(MIFlags);
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+        .addReg(LdReg, RegState::Kill).setMIFlags(MIFlags);
+    } else
+      MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes,
+                            ARMCC::AL, 0, MIFlags);
+
+    // Emit add / sub.
+    int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+    if (Opc != ARM::tADDhirr)
+      MIB = AddDefaultT1CC(MIB);
+    if (DestReg == ARM::SP || isSub)
+      MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
+    else
+      MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
+    AddDefaultPred(MIB);
+}
+
+/// calcNumMI - Returns the number of instructions required to materialize
+/// the specific add / sub r, c instruction.
+static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
+                          unsigned NumBits, unsigned Scale) {
+  unsigned NumMIs = 0;
+  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+
+  if (Opc == ARM::tADDrSPi) {
+    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+    Bytes -= ThisVal;
+    NumMIs++;
+    NumBits = 8;
+    Scale = 1;  // Followed by a number of tADDi8.
+    Chunk = ((1 << NumBits) - 1) * Scale;
+  }
+
+  NumMIs += Bytes / Chunk;
+  if ((Bytes % Chunk) != 0)
+    NumMIs++;
+  if (ExtraOpc)
+    NumMIs++;
+  return NumMIs;
+}
+
+/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code.
+void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator &MBBI,
+                                     DebugLoc dl,
+                                     unsigned DestReg, unsigned BaseReg,
+                                     int NumBytes, const TargetInstrInfo &TII,
+                                     const ARMBaseRegisterInfo& MRI,
+                                     unsigned MIFlags) {
+  bool isSub = NumBytes < 0;
+  unsigned Bytes = (unsigned)NumBytes;
+  if (isSub) Bytes = -NumBytes;
+  bool isMul4 = (Bytes & 3) == 0;
+  bool isTwoAddr = false;
+  bool DstNotEqBase = false;
+  unsigned NumBits = 1;
+  unsigned Scale = 1;
+  int Opc = 0;
+  int ExtraOpc = 0;
+  bool NeedCC = false;
+
+  if (DestReg == BaseReg && BaseReg == ARM::SP) {
+    assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+    NumBits = 7;
+    Scale = 4;
+    Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+    isTwoAddr = true;
+  } else if (!isSub && BaseReg == ARM::SP) {
+    // r1 = add sp, 403
+    // =>
+    // r1 = add sp, 100 * 4
+    // r1 = add r1, 3
+    if (!isMul4) {
+      Bytes &= ~3;
+      ExtraOpc = ARM::tADDi3;
+    }
+    NumBits = 8;
+    Scale = 4;
+    Opc = ARM::tADDrSPi;
+  } else {
+    // sp = sub sp, c
+    // r1 = sub sp, c
+    // r8 = sub sp, c
+    if (DestReg != BaseReg)
+      DstNotEqBase = true;
+    NumBits = 8;
+    if (DestReg == ARM::SP) {
+      Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+      assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+      NumBits = 7;
+      Scale = 4;
+    } else {
+      Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+      NumBits = 8;
+      NeedCC = true;
+    }
+    isTwoAddr = true;
+  }
+
+  unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
+  unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
+  if (NumMIs > Threshold) {
+    // This will expand into too many instructions. Load the immediate from a
+    // constpool entry.
+    emitThumbRegPlusImmInReg(MBB, MBBI, dl,
+                             DestReg, BaseReg, NumBytes, true,
+                             TII, MRI, MIFlags);
+    return;
+  }
+
+  if (DstNotEqBase) {
+    if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) {
+      // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7)
+      unsigned Chunk = (1 << 3) - 1;
+      unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+      Bytes -= ThisVal;
+      const MCInstrDesc &MCID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3);
+      const MachineInstrBuilder MIB =
+        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg)
+                         .setMIFlags(MIFlags));
+      AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal));
+    } else {
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+        .addReg(BaseReg, RegState::Kill))
+        .setMIFlags(MIFlags);
+    }
+    BaseReg = DestReg;
+  }
+
+  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+  while (Bytes) {
+    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+    Bytes -= ThisVal;
+    ThisVal /= Scale;
+    // Build the new tADD / tSUB.
+    if (isTwoAddr) {
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+      if (NeedCC)
+        MIB = AddDefaultT1CC(MIB);
+      MIB.addReg(DestReg).addImm(ThisVal);
+      MIB = AddDefaultPred(MIB);
+      MIB.setMIFlags(MIFlags);
+    } else {
+      bool isKill = BaseReg != ARM::SP;
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+      if (NeedCC)
+        MIB = AddDefaultT1CC(MIB);
+      MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
+      MIB = AddDefaultPred(MIB);
+      MIB.setMIFlags(MIFlags);
+
+      BaseReg = DestReg;
+      if (Opc == ARM::tADDrSPi) {
+        // r4 = add sp, imm
+        // r4 = add r4, imm
+        // ...
+        NumBits = 8;
+        Scale = 1;
+        Chunk = ((1 << NumBits) - 1) * Scale;
+        Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+        NeedCC = isTwoAddr = true;
+      }
+    }
+  }
+
+  if (ExtraOpc) {
+    const MCInstrDesc &MCID = TII.get(ExtraOpc);
+    AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg))
+                   .addReg(DestReg, RegState::Kill)
+                   .addImm(((unsigned)NumBytes) & 3)
+                   .setMIFlags(MIFlags));
+  }
+}
+
+/// emitThumbConstant - Emit a series of instructions to materialize a
+/// constant.
+static void emitThumbConstant(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned DestReg, int Imm,
+                              const TargetInstrInfo &TII,
+                              const Thumb1RegisterInfo& MRI,
+                              DebugLoc dl) {
+  bool isSub = Imm < 0;
+  if (isSub) Imm = -Imm;
+
+  int Chunk = (1 << 8) - 1;
+  int ThisVal = (Imm > Chunk) ? Chunk : Imm;
+  Imm -= ThisVal;
+  AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8),
+                                        DestReg))
+                 .addImm(ThisVal));
+  if (Imm > 0)
+    emitThumbRegPlusImmediate(MBB, MBBI, dl, DestReg, DestReg, Imm, TII, MRI);
+  if (isSub) {
+    const MCInstrDesc &MCID = TII.get(ARM::tRSB);
+    AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg))
+                   .addReg(DestReg, RegState::Kill));
+  }
+}
+
+static void removeOperands(MachineInstr &MI, unsigned i) {
+  unsigned Op = i;
+  for (unsigned e = MI.getNumOperands(); i != e; ++i)
+    MI.RemoveOperand(Op);
+}
+
+/// convertToNonSPOpcode - Change the opcode to the non-SP version, because
+/// we're replacing the frame index with a non-SP register.
+static unsigned convertToNonSPOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  case ARM::tLDRspi:
+    return ARM::tLDRi;
+
+  case ARM::tSTRspi:
+    return ARM::tSTRi;
+  }
+
+  return Opcode;
+}
+
+bool Thumb1RegisterInfo::
+rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
+                  unsigned FrameReg, int &Offset,
+                  const ARMBaseInstrInfo &TII) const {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+  MachineInstrBuilder MIB(*MBB.getParent(), &MI);
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+
+  if (Opcode == ARM::tADDrSPi) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+    // Can't use tADDrSPi if it's based off the frame pointer.
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    if (FrameReg != ARM::SP) {
+      Opcode = ARM::tADDi3;
+      NumBits = 3;
+    } else {
+      NumBits = 8;
+      Scale = 4;
+      assert((Offset & 3) == 0 &&
+             "Thumb add/sub sp, #imm immediate must be multiple of 4!");
+    }
+
+    unsigned PredReg;
+    if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::tMOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      // Remove offset
+      MI.RemoveOperand(FrameRegIdx+1);
+      return true;
+    }
+
+    // Common case: small offset, fits into instruction.
+    unsigned Mask = (1 << NumBits) - 1;
+    if (((Offset / Scale) & ~Mask) == 0) {
+      // Replace the FrameIndex with sp / fp
+      if (Opcode == ARM::tADDi3) {
+        MI.setDesc(TII.get(Opcode));
+        removeOperands(MI, FrameRegIdx);
+        AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg)
+                       .addImm(Offset / Scale));
+      } else {
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset / Scale);
+      }
+      return true;
+    }
+
+    unsigned DestReg = MI.getOperand(0).getReg();
+    unsigned Bytes = (Offset > 0) ? Offset : -Offset;
+    unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale);
+    // MI would expand into a large number of instructions. Don't try to
+    // simplify the immediate.
+    if (NumMIs > 2) {
+      emitThumbRegPlusImmediate(MBB, II, dl, DestReg, FrameReg, Offset, TII,
+                                *this);
+      MBB.erase(II);
+      return true;
+    }
+
+    if (Offset > 0) {
+      // Translate r0 = add sp, imm to
+      // r0 = add sp, 255*4
+      // r0 = add r0, (imm - 255*4)
+      if (Opcode == ARM::tADDi3) {
+        MI.setDesc(TII.get(Opcode));
+        removeOperands(MI, FrameRegIdx);
+        AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask));
+      } else {
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Mask);
+      }
+      Offset = (Offset - Mask * Scale);
+      MachineBasicBlock::iterator NII = llvm::next(II);
+      emitThumbRegPlusImmediate(MBB, NII, dl, DestReg, DestReg, Offset, TII,
+                                *this);
+    } else {
+      // Translate r0 = add sp, -imm to
+      // r0 = -imm (this is then translated into a series of instructons)
+      // r0 = add r0, sp
+      emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
+
+      MI.setDesc(TII.get(ARM::tADDhirr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(DestReg, false, false, true);
+      MI.getOperand(FrameRegIdx+1).ChangeToRegister(FrameReg, false);
+    }
+    return true;
+  } else {
+    if (AddrMode != ARMII::AddrModeT1_s)
+      llvm_unreachable("Unsupported addressing mode!");
+
+    unsigned ImmIdx = FrameRegIdx + 1;
+    int InstrOffs = MI.getOperand(ImmIdx).getImm();
+    unsigned NumBits = (FrameReg == ARM::SP) ? 8 : 5;
+    unsigned Scale = 4;
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale - 1)) == 0 && "Can't encode this offset!");
+
+    // Common case: small offset, fits into instruction.
+    MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+    int ImmedOffset = Offset / Scale;
+    unsigned Mask = (1 << NumBits) - 1;
+
+    if ((unsigned)Offset <= Mask * Scale) {
+      // Replace the FrameIndex with the frame register (e.g., sp).
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      ImmOp.ChangeToImmediate(ImmedOffset);
+
+      // If we're using a register where sp was stored, convert the instruction
+      // to the non-SP version.
+      unsigned NewOpc = convertToNonSPOpcode(Opcode);
+      if (NewOpc != Opcode && FrameReg != ARM::SP)
+        MI.setDesc(TII.get(NewOpc));
+
+      return true;
+    }
+
+    NumBits = 5;
+    Mask = (1 << NumBits) - 1;
+
+    // If this is a thumb spill / restore, we will be using a constpool load to
+    // materialize the offset.
+    if (Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) {
+      ImmOp.ChangeToImmediate(0);
+    } else {
+      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+      ImmedOffset = ImmedOffset & Mask;
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset &= ~(Mask * Scale);
+    }
+  }
+
+  return Offset == 0;
+}
+
+void
+Thumb1RegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
+                                      unsigned BaseReg, int64_t Offset) const {
+  MachineInstr &MI = *I;
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII);
+  assert (Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+/// saveScavengerRegister - Spill the register so it can be used by the
+/// register scavenger. Return true.
+bool
+Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator &UseMI,
+                                          const TargetRegisterClass *RC,
+                                          unsigned Reg) const {
+  // Thumb1 can't use the emergency spill slot on the stack because
+  // ldr/str immediate offsets must be positive, and if we're referencing
+  // off the frame pointer (if, for example, there are alloca() calls in
+  // the function, the offset will be negative. Use R12 instead since that's
+  // a call clobbered register that we know won't be used in Thumb1 mode.
+  DebugLoc DL;
+  AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
+    .addReg(ARM::R12, RegState::Define)
+    .addReg(Reg, RegState::Kill));
+
+  // The UseMI is where we would like to restore the register. If there's
+  // interference with R12 before then, however, we'll need to restore it
+  // before that instead and adjust the UseMI.
+  bool done = false;
+  for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
+    if (II->isDebugValue())
+      continue;
+    // If this instruction affects R12, adjust our restore point.
+    for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = II->getOperand(i);
+      if (MO.isRegMask() && MO.clobbersPhysReg(ARM::R12)) {
+        UseMI = II;
+        done = true;
+        break;
+      }
+      if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
+          TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+        continue;
+      if (MO.getReg() == ARM::R12) {
+        UseMI = II;
+        done = true;
+        break;
+      }
+    }
+  }
+  // Restore the register from R12
+  AddDefaultPred(BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr)).
+    addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill));
+
+  return true;
+}
+
+void
+Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                        int SPAdj, unsigned FIOperandNum,
+                                        RegScavenger *RS) const {
+  unsigned VReg = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc dl = MI.getDebugLoc();
+  MachineInstrBuilder MIB(*MBB.getParent(), &MI);
+
+  unsigned FrameReg = ARM::SP;
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MF.getFrameInfo()->getStackSize() + SPAdj;
+
+  if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
+    Offset -= AFI->getGPRCalleeSavedArea1Offset();
+  else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
+    Offset -= AFI->getGPRCalleeSavedArea2Offset();
+  else if (MF.getFrameInfo()->hasVarSizedObjects()) {
+    assert(SPAdj == 0 && MF.getTarget().getFrameLowering()->hasFP(MF) &&
+           "Unexpected");
+    // There are alloca()'s in this function, must reference off the frame
+    // pointer or base pointer instead.
+    if (!hasBasePointer(MF)) {
+      FrameReg = getFrameRegister(MF);
+      Offset -= AFI->getFramePtrSpillOffset();
+    } else
+      FrameReg = BasePtr;
+  }
+
+  // PEI::scavengeFrameVirtualRegs() cannot accurately track SPAdj because the
+  // call frame setup/destroy instructions have already been eliminated.  That
+  // means the stack pointer cannot be used to access the emergency spill slot
+  // when !hasReservedCallFrame().
+#ifndef NDEBUG
+  if (RS && FrameReg == ARM::SP && RS->isScavengingFrameIndex(FrameIndex)){
+    assert(MF.getTarget().getFrameLowering()->hasReservedCallFrame(MF) &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions without a reserved call frame");
+    assert(!MF.getFrameInfo()->hasVarSizedObjects() &&
+           "Cannot use SP to access the emergency spill slot in "
+           "functions with variable sized frame objects");
+  }
+#endif // NDEBUG
+
+  // Special handling of dbg_value instructions.
+  if (MI.isDebugValue()) {
+    MI.getOperand(FIOperandNum).  ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  assert(AFI->isThumbFunction() &&
+         "This eliminateFrameIndex only supports Thumb1!");
+  if (rewriteFrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+    return;
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above, handle the rest, providing a register that is
+  // SP+LargeImm.
+  assert(Offset && "This code isn't needed if offset already handled!");
+
+  unsigned Opcode = MI.getOpcode();
+
+  // Remove predicate first.
+  int PIdx = MI.findFirstPredOperandIdx();
+  if (PIdx != -1)
+    removeOperands(MI, PIdx);
+
+  if (MI.mayLoad()) {
+    // Use the destination register to materialize sp + offset.
+    unsigned TmpReg = MI.getOperand(0).getReg();
+    bool UseRR = false;
+    if (Opcode == ARM::tLDRspi) {
+      if (FrameReg == ARM::SP)
+        emitThumbRegPlusImmInReg(MBB, II, dl, TmpReg, FrameReg,
+                                 Offset, false, TII, *this);
+      else {
+        emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset);
+        UseRR = true;
+      }
+    } else {
+      emitThumbRegPlusImmediate(MBB, II, dl, TmpReg, FrameReg, Offset, TII,
+                                *this);
+    }
+
+    MI.setDesc(TII.get(UseRR ? ARM::tLDRr : ARM::tLDRi));
+    MI.getOperand(FIOperandNum).ChangeToRegister(TmpReg, false, false, true);
+    if (UseRR)
+      // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+      // register. The offset is already handled in the vreg value.
+      MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+                                                     false);
+  } else if (MI.mayStore()) {
+      VReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
+      bool UseRR = false;
+
+      if (Opcode == ARM::tSTRspi) {
+        if (FrameReg == ARM::SP)
+          emitThumbRegPlusImmInReg(MBB, II, dl, VReg, FrameReg,
+                                   Offset, false, TII, *this);
+        else {
+          emitLoadConstPool(MBB, II, dl, VReg, 0, Offset);
+          UseRR = true;
+        }
+      } else
+        emitThumbRegPlusImmediate(MBB, II, dl, VReg, FrameReg, Offset, TII,
+                                  *this);
+      MI.setDesc(TII.get(UseRR ? ARM::tSTRr : ARM::tSTRi));
+      MI.getOperand(FIOperandNum).ChangeToRegister(VReg, false, false, true);
+      if (UseRR)
+        // Use [reg, reg] addrmode. Replace the immediate operand w/ the frame
+        // register. The offset is already handled in the vreg value.
+        MI.getOperand(FIOperandNum+1).ChangeToRegister(FrameReg, false, false,
+                                                       false);
+  } else {
+    llvm_unreachable("Unexpected opcode!");
+  }
+
+  // Add predicate back if it's needed.
+  if (MI.isPredicable())
+    AddDefaultPred(MIB);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h
new file mode 100644
index 000000000000..ebbab36dd7b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -0,0 +1,65 @@
+//===- Thumb1RegisterInfo.h - Thumb-1 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB1REGISTERINFO_H
+#define THUMB1REGISTERINFO_H
+
+#include "ARM.h"
+#include "ARMBaseRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+
+struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
+public:
+  Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
+  const TargetRegisterClass*
+  getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+ void emitLoadConstPool(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator &MBBI,
+                        DebugLoc dl,
+                        unsigned DestReg, unsigned SubIdx, int Val,
+                        ARMCC::CondCodes Pred = ARMCC::AL,
+                        unsigned PredReg = 0,
+                        unsigned MIFlags = MachineInstr::NoFlags) const;
+
+  // rewrite MI to access 'Offset' bytes from the FP. Update Offset to be
+  // however much remains to be handled. Return 'true' if no further
+  // work is required.
+  bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
+                         unsigned FrameReg, int &Offset,
+                         const ARMBaseInstrInfo &TII) const;
+  void resolveFrameIndex(MachineBasicBlock::iterator I,
+                         unsigned BaseReg, int64_t Offset) const;
+  bool saveScavengerRegister(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I,
+                             MachineBasicBlock::iterator &UseMI,
+                             const TargetRegisterClass *RC,
+                             unsigned Reg) const;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const;
+};
+}
+
+#endif // THUMB1REGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
new file mode 100644
index 000000000000..97c254ce75a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -0,0 +1,274 @@
+//===-- Thumb2ITBlockPass.cpp - Insert Thumb-2 IT blocks ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "thumb2-it"
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+using namespace llvm;
+
+STATISTIC(NumITs,        "Number of IT blocks inserted");
+STATISTIC(NumMovedInsts, "Number of predicated instructions moved");
+
+namespace {
+  class Thumb2ITBlockPass : public MachineFunctionPass {
+  public:
+    static char ID;
+    Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
+
+    const Thumb2InstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    ARMFunctionInfo *AFI;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "Thumb IT blocks insertion pass";
+    }
+
+  private:
+    bool MoveCopyOutOfITBlock(MachineInstr *MI,
+                              ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
+                              SmallSet<unsigned, 4> &Defs,
+                              SmallSet<unsigned, 4> &Uses);
+    bool InsertITInstructions(MachineBasicBlock &MBB);
+  };
+  char Thumb2ITBlockPass::ID = 0;
+}
+
+/// TrackDefUses - Tracking what registers are being defined and used by
+/// instructions in the IT block. This also tracks "dependencies", i.e. uses
+/// in the IT block that are defined before the IT instruction.
+static void TrackDefUses(MachineInstr *MI,
+                         SmallSet<unsigned, 4> &Defs,
+                         SmallSet<unsigned, 4> &Uses,
+                         const TargetRegisterInfo *TRI) {
+  SmallVector<unsigned, 4> LocalDefs;
+  SmallVector<unsigned, 4> LocalUses;
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg || Reg == ARM::ITSTATE || Reg == ARM::SP)
+      continue;
+    if (MO.isUse())
+      LocalUses.push_back(Reg);
+    else
+      LocalDefs.push_back(Reg);
+  }
+
+  for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
+    unsigned Reg = LocalUses[i];
+    Uses.insert(Reg);
+    for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg)
+      Uses.insert(*Subreg);
+  }
+
+  for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+    unsigned Reg = LocalDefs[i];
+    Defs.insert(Reg);
+    for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg)
+      Defs.insert(*Subreg);
+    if (Reg == ARM::CPSR)
+      continue;
+  }
+}
+
+static bool isCopy(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case ARM::MOVr:
+  case ARM::MOVr_TC:
+  case ARM::tMOVr:
+  case ARM::t2MOVr:
+    return true;
+  }
+}
+
+bool
+Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
+                                      ARMCC::CondCodes CC, ARMCC::CondCodes OCC,
+                                        SmallSet<unsigned, 4> &Defs,
+                                        SmallSet<unsigned, 4> &Uses) {
+  if (!isCopy(MI))
+    return false;
+  // llvm models select's as two-address instructions. That means a copy
+  // is inserted before a t2MOVccr, etc. If the copy is scheduled in
+  // between selects we would end up creating multiple IT blocks.
+  assert(MI->getOperand(0).getSubReg() == 0 &&
+         MI->getOperand(1).getSubReg() == 0 &&
+         "Sub-register indices still around?");
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+
+  // First check if it's safe to move it.
+  if (Uses.count(DstReg) || Defs.count(SrcReg))
+    return false;
+
+  // If the CPSR is defined by this copy, then we don't want to move it. E.g.,
+  // if we have:
+  //
+  //   movs  r1, r1
+  //   rsb   r1, 0
+  //   movs  r2, r2
+  //   rsb   r2, 0
+  //
+  // we don't want this to be converted to:
+  //
+  //   movs  r1, r1
+  //   movs  r2, r2
+  //   itt   mi
+  //   rsb   r1, 0
+  //   rsb   r2, 0
+  //
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MI->hasOptionalDef() &&
+      MI->getOperand(MCID.getNumOperands() - 1).getReg() == ARM::CPSR)
+    return false;
+
+  // Then peek at the next instruction to see if it's predicated on CC or OCC.
+  // If not, then there is nothing to be gained by moving the copy.
+  MachineBasicBlock::iterator I = MI; ++I;
+  MachineBasicBlock::iterator E = MI->getParent()->end();
+  while (I != E && I->isDebugValue())
+    ++I;
+  if (I != E) {
+    unsigned NPredReg = 0;
+    ARMCC::CondCodes NCC = getITInstrPredicate(I, NPredReg);
+    if (NCC == CC || NCC == OCC)
+      return true;
+  }
+  return false;
+}
+
+bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  SmallSet<unsigned, 4> Defs;
+  SmallSet<unsigned, 4> Uses;
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineInstr *MI = &*MBBI;
+    DebugLoc dl = MI->getDebugLoc();
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getITInstrPredicate(MI, PredReg);
+    if (CC == ARMCC::AL) {
+      ++MBBI;
+      continue;
+    }
+
+    Defs.clear();
+    Uses.clear();
+    TrackDefUses(MI, Defs, Uses, TRI);
+
+    // Insert an IT instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT))
+      .addImm(CC);
+
+    // Add implicit use of ITSTATE to IT block instructions.
+    MI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+                                             true/*isImp*/, false/*isKill*/));
+
+    MachineInstr *LastITMI = MI;
+    MachineBasicBlock::iterator InsertPos = MIB;
+    ++MBBI;
+
+    // Form IT block.
+    ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
+    unsigned Mask = 0, Pos = 3;
+    // Branches, including tricky ones like LDM_RET, need to end an IT
+    // block so check the instruction we just put in the block.
+    for (; MBBI != E && Pos &&
+           (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
+      if (MBBI->isDebugValue())
+        continue;
+
+      MachineInstr *NMI = &*MBBI;
+      MI = NMI;
+
+      unsigned NPredReg = 0;
+      ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
+      if (NCC == CC || NCC == OCC) {
+        Mask |= (NCC & 1) << Pos;
+        // Add implicit use of ITSTATE.
+        NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+                                               true/*isImp*/, false/*isKill*/));
+        LastITMI = NMI;
+      } else {
+        if (NCC == ARMCC::AL &&
+            MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
+          --MBBI;
+          MBB.remove(NMI);
+          MBB.insert(InsertPos, NMI);
+          ++NumMovedInsts;
+          continue;
+        }
+        break;
+      }
+      TrackDefUses(NMI, Defs, Uses, TRI);
+      --Pos;
+    }
+
+    // Finalize IT mask.
+    Mask |= (1 << Pos);
+    // Tag along (firstcond[0] << 4) with the mask.
+    Mask |= (CC & 1) << 4;
+    MIB.addImm(Mask);
+
+    // Last instruction in IT block kills ITSTATE.
+    LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill();
+
+    // Finalize the bundle.
+    MachineBasicBlock::instr_iterator LI = LastITMI;
+    finalizeBundle(MBB, InsertPos.getInstrIterator(), llvm::next(LI));
+
+    Modified = true;
+    ++NumITs;
+  }
+
+  return Modified;
+}
+
+bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+  TRI = TM.getRegisterInfo();
+
+  if (!AFI->isThumbFunction())
+    return false;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ) {
+    MachineBasicBlock &MBB = *MFI;
+    ++MFI;
+    Modified |= InsertITInstructions(MBB);
+  }
+
+  if (Modified)
+    AFI->setHasITBlocks(true);
+
+  return Modified;
+}
+
+/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks
+/// insertion pass.
+FunctionPass *llvm::createThumb2ITBlockPass() {
+  return new Thumb2ITBlockPass();
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
new file mode 100644
index 000000000000..67e8ec7c5ff2
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -0,0 +1,572 @@
+//===-- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb2InstrInfo.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden,
+           cl::desc("Use old-style Thumb2 if-conversion heuristics"),
+           cl::init(false));
+
+Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
+  : ARMBaseInstrInfo(STI), RI(*this, STI) {
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(ARM::tNOP);
+  NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+  NopInst.addOperand(MCOperand::CreateReg(0));
+}
+
+unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  // FIXME
+  return 0;
+}
+
+void
+Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+                                         MachineBasicBlock *NewDest) const {
+  MachineBasicBlock *MBB = Tail->getParent();
+  ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
+  if (!AFI->hasITBlocks()) {
+    TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
+    return;
+  }
+
+  // If the first instruction of Tail is predicated, we may have to update
+  // the IT instruction.
+  unsigned PredReg = 0;
+  ARMCC::CondCodes CC = getInstrPredicate(Tail, PredReg);
+  MachineBasicBlock::iterator MBBI = Tail;
+  if (CC != ARMCC::AL)
+    // Expecting at least the t2IT instruction before it.
+    --MBBI;
+
+  // Actually replace the tail.
+  TargetInstrInfo::ReplaceTailWithBranchTo(Tail, NewDest);
+
+  // Fix up IT.
+  if (CC != ARMCC::AL) {
+    MachineBasicBlock::iterator E = MBB->begin();
+    unsigned Count = 4; // At most 4 instructions in an IT block.
+    while (Count && MBBI != E) {
+      if (MBBI->isDebugValue()) {
+        --MBBI;
+        continue;
+      }
+      if (MBBI->getOpcode() == ARM::t2IT) {
+        unsigned Mask = MBBI->getOperand(1).getImm();
+        if (Count == 4)
+          MBBI->eraseFromParent();
+        else {
+          unsigned MaskOn = 1 << Count;
+          unsigned MaskOff = ~(MaskOn - 1);
+          MBBI->getOperand(1).setImm((Mask & MaskOff) | MaskOn);
+        }
+        return;
+      }
+      --MBBI;
+      --Count;
+    }
+
+    // Ctrl flow can reach here if branch folding is run before IT block
+    // formation pass.
+  }
+}
+
+bool
+Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI) const {
+  while (MBBI->isDebugValue()) {
+    ++MBBI;
+    if (MBBI == MBB.end())
+      return false;
+  }
+
+  unsigned PredReg = 0;
+  return getITInstrPredicate(MBBI, PredReg) == ARMCC::AL;
+}
+
+void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  // Handle SPR, DPR, and QPR copies.
+  if (!ARM::GPRRegClass.contains(DestReg, SrcReg))
+    return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc);
+
+  AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc)));
+}
+
+void Thumb2InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  if (RC == &ARM::GPRRegClass   || RC == &ARM::tGPRRegClass ||
+      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+      RC == &ARM::GPRnopcRegClass) {
+    DebugLoc DL;
+    if (I != MBB.end()) DL = I->getDebugLoc();
+
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                              MachineMemOperand::MOStore,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    return;
+  }
+
+  ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC, TRI);
+}
+
+void Thumb2InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  if (RC == &ARM::GPRRegClass   || RC == &ARM::tGPRRegClass ||
+      RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
+      RC == &ARM::GPRnopcRegClass) {
+    DebugLoc DL;
+    if (I != MBB.end()) DL = I->getDebugLoc();
+
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                              MachineMemOperand::MOLoad,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    return;
+  }
+
+  ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
+}
+
+void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                               unsigned DestReg, unsigned BaseReg, int NumBytes,
+                               ARMCC::CondCodes Pred, unsigned PredReg,
+                               const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  // If profitable, use a movw or movt to materialize the offset.
+  // FIXME: Use the scavenger to grab a scratch register.
+  if (DestReg != ARM::SP && DestReg != BaseReg &&
+      NumBytes >= 4096 &&
+      ARM_AM::getT2SOImmVal(NumBytes) == -1) {
+    bool Fits = false;
+    if (NumBytes < 65536) {
+      // Use a movw to materialize the 16-bit constant.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg)
+        .addImm(NumBytes)
+        .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+      Fits = true;
+    } else if ((NumBytes & 0xffff) == 0) {
+      // Use a movt to materialize the 32-bit constant.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg)
+        .addReg(DestReg)
+        .addImm(NumBytes >> 16)
+        .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+      Fits = true;
+    }
+
+    if (Fits) {
+      if (isSub) {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
+          .addReg(BaseReg, RegState::Kill)
+          .addReg(DestReg, RegState::Kill)
+          .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+          .setMIFlags(MIFlags);
+      } else {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
+          .addReg(DestReg, RegState::Kill)
+          .addReg(BaseReg, RegState::Kill)
+          .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+          .setMIFlags(MIFlags);
+      }
+      return;
+    }
+  }
+
+  while (NumBytes) {
+    unsigned ThisVal = NumBytes;
+    unsigned Opc = 0;
+    if (DestReg == ARM::SP && BaseReg != ARM::SP) {
+      // mov sp, rn. Note t2MOVr cannot be used.
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),DestReg)
+        .addReg(BaseReg).setMIFlags(MIFlags));
+      BaseReg = ARM::SP;
+      continue;
+    }
+
+    bool HasCCOut = true;
+    if (BaseReg == ARM::SP) {
+      // sub sp, sp, #imm7
+      if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
+        assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+        Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+        AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+          .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags));
+        NumBytes = 0;
+        continue;
+      }
+
+      // sub rd, sp, so_imm
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    } else {
+      assert(DestReg != ARM::SP && BaseReg != ARM::SP);
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else if (ThisVal < 4096) {
+        Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+        HasCCOut = false;
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    }
+
+    // Build the new ADD / SUB.
+    MachineInstrBuilder MIB =
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+                     .addReg(BaseReg, RegState::Kill)
+                     .addImm(ThisVal)).setMIFlags(MIFlags);
+    if (HasCCOut)
+      AddDefaultCC(MIB);
+
+    BaseReg = DestReg;
+  }
+}
+
+static unsigned
+negativeOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRi12:   return ARM::t2LDRi8;
+  case ARM::t2LDRHi12:  return ARM::t2LDRHi8;
+  case ARM::t2LDRBi12:  return ARM::t2LDRBi8;
+  case ARM::t2LDRSHi12: return ARM::t2LDRSHi8;
+  case ARM::t2LDRSBi12: return ARM::t2LDRSBi8;
+  case ARM::t2STRi12:   return ARM::t2STRi8;
+  case ARM::t2STRBi12:  return ARM::t2STRBi8;
+  case ARM::t2STRHi12:  return ARM::t2STRHi8;
+
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+static unsigned
+positiveOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRi8:   return ARM::t2LDRi12;
+  case ARM::t2LDRHi8:  return ARM::t2LDRHi12;
+  case ARM::t2LDRBi8:  return ARM::t2LDRBi12;
+  case ARM::t2LDRSHi8: return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBi8: return ARM::t2LDRSBi12;
+  case ARM::t2STRi8:   return ARM::t2STRi12;
+  case ARM::t2STRBi8:  return ARM::t2STRBi12;
+  case ARM::t2STRHi8:  return ARM::t2STRHi12;
+
+  case ARM::t2LDRi12:
+  case ARM::t2LDRHi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSBi12:
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+static unsigned
+immediateOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRs:   return ARM::t2LDRi12;
+  case ARM::t2LDRHs:  return ARM::t2LDRHi12;
+  case ARM::t2LDRBs:  return ARM::t2LDRBi12;
+  case ARM::t2LDRSHs: return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBs: return ARM::t2LDRSBi12;
+  case ARM::t2STRs:   return ARM::t2STRi12;
+  case ARM::t2STRBs:  return ARM::t2STRBi12;
+  case ARM::t2STRHs:  return ARM::t2STRHi12;
+
+  case ARM::t2LDRi12:
+  case ARM::t2LDRHi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSBi12:
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                               unsigned FrameReg, int &Offset,
+                               const ARMBaseInstrInfo &TII) {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  // Memory operands in inline assembly always use AddrModeT2_i12.
+  if (Opcode == ARM::INLINEASM)
+    AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
+
+  if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+    unsigned PredReg;
+    if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::tMOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      // Remove offset and remaining explicit predicate operands.
+      do MI.RemoveOperand(FrameRegIdx+1);
+      while (MI.getNumOperands() > FrameRegIdx+1);
+      MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
+      AddDefaultPred(MIB);
+      return true;
+    }
+
+    bool HasCCOut = Opcode != ARM::t2ADDri12;
+
+    if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+      MI.setDesc(TII.get(ARM::t2SUBri));
+    } else {
+      MI.setDesc(TII.get(ARM::t2ADDri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    if (ARM_AM::getT2SOImmVal(Offset) != -1) {
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      // Add cc_out operand if the original instruction did not have one.
+      if (!HasCCOut)
+        MI.addOperand(MachineOperand::CreateReg(0, false));
+      Offset = 0;
+      return true;
+    }
+    // Another common case: imm12.
+    if (Offset < 4096 &&
+        (!HasCCOut || MI.getOperand(MI.getNumOperands()-1).getReg() == 0)) {
+      unsigned NewOpc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+      MI.setDesc(TII.get(NewOpc));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      // Remove the cc_out operand.
+      if (HasCCOut)
+        MI.RemoveOperand(MI.getNumOperands()-1);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, extract 8 adjacent bits from the immediate into this
+    // t2ADDri/t2SUBri.
+    unsigned RotAmt = CountLeadingZeros_32(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt);
+
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+
+    assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 &&
+           "Bit extraction didn't work?");
+    MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+    // Add cc_out operand if the original instruction did not have one.
+    if (!HasCCOut)
+      MI.addOperand(MachineOperand::CreateReg(0, false));
+
+  } else {
+
+    // AddrMode4 and AddrMode6 cannot handle any offset.
+    if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+      return false;
+
+    // AddrModeT2_so cannot handle any offset. If there is no offset
+    // register then we change to an immediate version.
+    unsigned NewOpc = Opcode;
+    if (AddrMode == ARMII::AddrModeT2_so) {
+      unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
+      if (OffsetReg != 0) {
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        return Offset == 0;
+      }
+
+      MI.RemoveOperand(FrameRegIdx+1);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0);
+      NewOpc = immediateOffsetOpcode(Opcode);
+      AddrMode = ARMII::AddrModeT2_i12;
+    }
+
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) {
+      // i8 supports only negative, and i12 supports only positive, so
+      // based on Offset sign convert Opcode to the appropriate
+      // instruction
+      Offset += MI.getOperand(FrameRegIdx+1).getImm();
+      if (Offset < 0) {
+        NewOpc = negativeOffsetOpcode(Opcode);
+        NumBits = 8;
+        isSub = true;
+        Offset = -Offset;
+      } else {
+        NewOpc = positiveOffsetOpcode(Opcode);
+        NumBits = 12;
+      }
+    } else if (AddrMode == ARMII::AddrMode5) {
+      // VFP address mode.
+      const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+      int InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+      if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      Offset += InstrOffs * 4;
+      assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+      if (Offset < 0) {
+        Offset = -Offset;
+        isSub = true;
+      }
+    } else {
+      llvm_unreachable("Unsupported addressing mode!");
+    }
+
+    if (NewOpc != Opcode)
+      MI.setDesc(TII.get(NewOpc));
+
+    MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
+
+    // Attempt to fold address computation
+    // Common case: small offset, fits into instruction.
+    int ImmedOffset = Offset / Scale;
+    unsigned Mask = (1 << NumBits) - 1;
+    if ((unsigned)Offset <= Mask * Scale) {
+      // Replace the FrameIndex with fp/sp
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      if (isSub) {
+        if (AddrMode == ARMII::AddrMode5)
+          // FIXME: Not consistent.
+          ImmedOffset |= 1 << NumBits;
+        else
+          ImmedOffset = -ImmedOffset;
+      }
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, offset doesn't fit. Pull in what we can to simplify
+    ImmedOffset = ImmedOffset & Mask;
+    if (isSub) {
+      if (AddrMode == ARMII::AddrMode5)
+        // FIXME: Not consistent.
+        ImmedOffset |= 1 << NumBits;
+      else {
+        ImmedOffset = -ImmedOffset;
+        if (ImmedOffset == 0)
+          // Change the opcode back if the encoded offset is zero.
+          MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc)));
+      }
+    }
+    ImmOp.ChangeToImmediate(ImmedOffset);
+    Offset &= ~(Mask*Scale);
+  }
+
+  Offset = (isSub) ? -Offset : Offset;
+  return Offset == 0;
+}
+
+ARMCC::CondCodes
+llvm::getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+  unsigned Opc = MI->getOpcode();
+  if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
+    return ARMCC::AL;
+  return getInstrPredicate(MI, PredReg);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
new file mode 100644
index 000000000000..2cdcd06e9350
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -0,0 +1,75 @@
+//===-- Thumb2InstrInfo.h - Thumb-2 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB2INSTRUCTIONINFO_H
+#define THUMB2INSTRUCTIONINFO_H
+
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "Thumb2RegisterInfo.h"
+
+namespace llvm {
+class ARMSubtarget;
+class ScheduleHazardRecognizer;
+
+class Thumb2InstrInfo : public ARMBaseInstrInfo {
+  Thumb2RegisterInfo RI;
+public:
+  explicit Thumb2InstrInfo(const ARMSubtarget &STI);
+
+  /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+  void getNoopForMachoTarget(MCInst &NopInst) const;
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const;
+
+  void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+                               MachineBasicBlock *NewDest) const;
+
+  bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI) const;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
+};
+
+/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
+/// to llvm::getInstrPredicate except it returns AL for conditional branch
+/// instructions which are "predicated", but are not in IT blocks.
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+
+}
+
+#endif // THUMB2INSTRUCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp
new file mode 100644
index 000000000000..1a7a4d450cfe
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -0,0 +1,52 @@
+//===-- Thumb2RegisterInfo.cpp - Thumb-2 Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb2RegisterInfo.h"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+using namespace llvm;
+
+Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii,
+                                       const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(tii, sti) {
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+void
+Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      DebugLoc dl,
+                                      unsigned DestReg, unsigned SubIdx,
+                                      int Val,
+                                      ARMCC::CondCodes Pred, unsigned PredReg,
+                                      unsigned MIFlags) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  const Constant *C = ConstantInt::get(
+           Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0)
+    .setMIFlags(MIFlags);
+}
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h
new file mode 100644
index 000000000000..6b397e869683
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2RegisterInfo.h
@@ -0,0 +1,42 @@
+//===- Thumb2RegisterInfo.h - Thumb-2 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB2REGISTERINFO_H
+#define THUMB2REGISTERINFO_H
+
+#include "ARM.h"
+#include "ARMBaseRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+
+struct Thumb2RegisterInfo : public ARMBaseRegisterInfo {
+public:
+  Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+  void emitLoadConstPool(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI,
+                         DebugLoc dl,
+                         unsigned DestReg, unsigned SubIdx, int Val,
+                         ARMCC::CondCodes Pred = ARMCC::AL,
+                         unsigned PredReg = 0,
+                         unsigned MIFlags = MachineInstr::NoFlags) const;
+};
+}
+
+#endif // THUMB2REGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
new file mode 100644
index 000000000000..d50f5d972232
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -0,0 +1,1037 @@
+//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "t2-reduce-size"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"        // To access Function attributes
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumNarrows,  "Number of 32-bit instrs reduced to 16-bit ones");
+STATISTIC(Num2Addrs,   "Number of 32-bit instrs reduced to 2addr 16-bit ones");
+STATISTIC(NumLdSts,    "Number of 32-bit load / store reduced to 16-bit ones");
+
+static cl::opt<int> ReduceLimit("t2-reduce-limit",
+                                cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2",
+                                     cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
+                                     cl::init(-1), cl::Hidden);
+
+namespace {
+  /// ReduceTable - A static table with information on mapping from wide
+  /// opcodes to narrow
+  struct ReduceEntry {
+    uint16_t WideOpc;      // Wide opcode
+    uint16_t NarrowOpc1;   // Narrow opcode to transform to
+    uint16_t NarrowOpc2;   // Narrow opcode when it's two-address
+    uint8_t  Imm1Limit;    // Limit of immediate field (bits)
+    uint8_t  Imm2Limit;    // Limit of immediate field when it's two-address
+    unsigned LowRegs1 : 1; // Only possible if low-registers are used
+    unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr)
+    unsigned PredCC1  : 2; // 0 - If predicated, cc is on and vice versa.
+                           // 1 - No cc field.
+                           // 2 - Always set CPSR.
+    unsigned PredCC2  : 2;
+    unsigned PartFlag : 1; // 16-bit instruction does partial flag update
+    unsigned Special  : 1; // Needs to be dealt with specially
+    unsigned AvoidMovs: 1; // Avoid movs with shifter operand (for Swift)
+  };
+
+  static const ReduceEntry ReduceTable[] = {
+  // Wide,        Narrow1,      Narrow2,     imm1,imm2, lo1, lo2, P/C,PF,S,AM
+  { ARM::t2ADCrr, 0,            ARM::tADC,     0,   0,   0,   1,  0,0, 0,0,0 },
+  { ARM::t2ADDri, ARM::tADDi3,  ARM::tADDi8,   3,   8,   1,   1,  0,0, 0,1,0 },
+  { ARM::t2ADDrr, ARM::tADDrr,  ARM::tADDhirr, 0,   0,   1,   0,  0,1, 0,0,0 },
+  { ARM::t2ADDSri,ARM::tADDi3,  ARM::tADDi8,   3,   8,   1,   1,  2,2, 0,1,0 },
+  { ARM::t2ADDSrr,ARM::tADDrr,  0,             0,   0,   1,   0,  2,0, 0,1,0 },
+  { ARM::t2ANDrr, 0,            ARM::tAND,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2ASRri, ARM::tASRri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
+  { ARM::t2ASRrr, 0,            ARM::tASRrr,   0,   0,   0,   1,  0,0, 1,0,1 },
+  { ARM::t2BICrr, 0,            ARM::tBIC,     0,   0,   0,   1,  0,0, 1,0,0 },
+  //FIXME: Disable CMN, as CCodes are backwards from compare expectations
+  //{ ARM::t2CMNrr, ARM::tCMN,  0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2CMNzrr, ARM::tCMNz,  0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2CMPri, ARM::tCMPi8,  0,             8,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2CMPrr, ARM::tCMPhir, 0,             0,   0,   0,   0,  2,0, 0,1,0 },
+  { ARM::t2EORrr, 0,            ARM::tEOR,     0,   0,   0,   1,  0,0, 1,0,0 },
+  // FIXME: adr.n immediate offset must be multiple of 4.
+  //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0,   0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2LSLri, ARM::tLSLri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
+  { ARM::t2LSLrr, 0,            ARM::tLSLrr,   0,   0,   0,   1,  0,0, 1,0,1 },
+  { ARM::t2LSRri, ARM::tLSRri,  0,             5,   0,   1,   0,  0,0, 1,0,1 },
+  { ARM::t2LSRrr, 0,            ARM::tLSRrr,   0,   0,   0,   1,  0,0, 1,0,1 },
+  { ARM::t2MOVi,  ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 1,0,0 },
+  { ARM::t2MOVi16,ARM::tMOVi8,  0,             8,   0,   1,   0,  0,0, 1,1,0 },
+  // FIXME: Do we need the 16-bit 'S' variant?
+  { ARM::t2MOVr,ARM::tMOVr,     0,             0,   0,   0,   0,  1,0, 0,0,0 },
+  { ARM::t2MUL,   0,            ARM::tMUL,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2MVNr,  ARM::tMVN,    0,             0,   0,   1,   0,  0,0, 0,0,0 },
+  { ARM::t2ORRrr, 0,            ARM::tORR,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2REV,   ARM::tREV,    0,             0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2REV16, ARM::tREV16,  0,             0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2REVSH, ARM::tREVSH,  0,             0,   0,   1,   0,  1,0, 0,0,0 },
+  { ARM::t2RORrr, 0,            ARM::tROR,     0,   0,   0,   1,  0,0, 1,0,0 },
+  { ARM::t2RSBri, ARM::tRSB,    0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2RSBSri,ARM::tRSB,    0,             0,   0,   1,   0,  2,0, 0,1,0 },
+  { ARM::t2SBCrr, 0,            ARM::tSBC,     0,   0,   0,   1,  0,0, 0,0,0 },
+  { ARM::t2SUBri, ARM::tSUBi3,  ARM::tSUBi8,   3,   8,   1,   1,  0,0, 0,0,0 },
+  { ARM::t2SUBrr, ARM::tSUBrr,  0,             0,   0,   1,   0,  0,0, 0,0,0 },
+  { ARM::t2SUBSri,ARM::tSUBi3,  ARM::tSUBi8,   3,   8,   1,   1,  2,2, 0,0,0 },
+  { ARM::t2SUBSrr,ARM::tSUBrr,  0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2SXTB,  ARM::tSXTB,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+  { ARM::t2SXTH,  ARM::tSXTH,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+  { ARM::t2TSTrr, ARM::tTST,    0,             0,   0,   1,   0,  2,0, 0,0,0 },
+  { ARM::t2UXTB,  ARM::tUXTB,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+  { ARM::t2UXTH,  ARM::tUXTH,   0,             0,   0,   1,   0,  1,0, 0,1,0 },
+
+  // FIXME: Clean this up after splitting each Thumb load / store opcode
+  // into multiple ones.
+  { ARM::t2LDRi12,ARM::tLDRi,   ARM::tLDRspi,  5,   8,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRs,  ARM::tLDRr,   0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRBi12,ARM::tLDRBi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRBs, ARM::tLDRBr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRHi12,ARM::tLDRHi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRHs, ARM::tLDRHr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRSBs,ARM::tLDRSB,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2LDRSHs,ARM::tLDRSH,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRi12,ARM::tSTRi,   ARM::tSTRspi,  5,   8,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRs,  ARM::tSTRr,   0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRBi12,ARM::tSTRBi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRBs, ARM::tSTRBr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRHi12,ARM::tSTRHi, 0,             5,   0,   1,   0,  0,0, 0,1,0 },
+  { ARM::t2STRHs, ARM::tSTRHr,  0,             0,   0,   1,   0,  0,0, 0,1,0 },
+
+  { ARM::t2LDMIA, ARM::tLDMIA,  0,             0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2LDMIA_RET,0,         ARM::tPOP_RET, 0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0,   0,   1,   1,  1,1, 0,1,0 },
+  // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent
+  { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0,       0,   0,   1,   1,  1,1, 0,1,0 },
+  { ARM::t2STMDB_UPD, 0,        ARM::tPUSH,    0,   0,   1,   1,  1,1, 0,1,0 }
+  };
+
+  class Thumb2SizeReduce : public MachineFunctionPass {
+  public:
+    static char ID;
+    Thumb2SizeReduce();
+
+    const Thumb2InstrInfo *TII;
+    const ARMSubtarget *STI;
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "Thumb2 instruction size reduction pass";
+    }
+
+  private:
+    /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
+    DenseMap<unsigned, unsigned> ReduceOpcodeMap;
+
+    bool canAddPseudoFlagDep(MachineInstr *Use, bool IsSelfLoop);
+
+    bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+                         bool is2Addr, ARMCC::CondCodes Pred,
+                         bool LiveCPSR, bool &HasCC, bool &CCDead);
+
+    bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+                         const ReduceEntry &Entry);
+
+    bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+                       const ReduceEntry &Entry, bool LiveCPSR, bool IsSelfLoop);
+
+    /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
+    /// instruction.
+    bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                       const ReduceEntry &Entry, bool LiveCPSR,
+                       bool IsSelfLoop);
+
+    /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
+    /// non-two-address instruction.
+    bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+                        const ReduceEntry &Entry, bool LiveCPSR,
+                        bool IsSelfLoop);
+
+    /// ReduceMI - Attempt to reduce MI, return true on success.
+    bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
+                  bool LiveCPSR, bool IsSelfLoop);
+
+    /// ReduceMBB - Reduce width of instructions in the specified basic block.
+    bool ReduceMBB(MachineBasicBlock &MBB);
+
+    bool OptimizeSize;
+    bool MinimizeSize;
+
+    // Last instruction to define CPSR in the current block.
+    MachineInstr *CPSRDef;
+    // Was CPSR last defined by a high latency instruction?
+    // When CPSRDef is null, this refers to CPSR defs in predecessors.
+    bool HighLatencyCPSR;
+
+    struct MBBInfo {
+      // The flags leaving this block have high latency.
+      bool HighLatencyCPSR;
+      // Has this block been visited yet?
+      bool Visited;
+
+      MBBInfo() : HighLatencyCPSR(false), Visited(false) {}
+    };
+
+    SmallVector<MBBInfo, 8> BlockInfo;
+  };
+  char Thumb2SizeReduce::ID = 0;
+}
+
+Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(ID) {
+  OptimizeSize = MinimizeSize = false;
+  for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
+    unsigned FromOpc = ReduceTable[i].WideOpc;
+    if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second)
+      assert(false && "Duplicated entries?");
+  }
+}
+
+static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) {
+  for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs)
+    if (*Regs == ARM::CPSR)
+      return true;
+  return false;
+}
+
+// Check for a likely high-latency flag def.
+static bool isHighLatencyCPSR(MachineInstr *Def) {
+  switch(Def->getOpcode()) {
+  case ARM::FMSTAT:
+  case ARM::tMUL:
+    return true;
+  }
+  return false;
+}
+
+/// canAddPseudoFlagDep - For A9 (and other out-of-order) implementations,
+/// the 's' 16-bit instruction partially update CPSR. Abort the
+/// transformation to avoid adding false dependency on last CPSR setting
+/// instruction which hurts the ability for out-of-order execution engine
+/// to do register renaming magic.
+/// This function checks if there is a read-of-write dependency between the
+/// last instruction that defines the CPSR and the current instruction. If there
+/// is, then there is no harm done since the instruction cannot be retired
+/// before the CPSR setting instruction anyway.
+/// Note, we are not doing full dependency analysis here for the sake of compile
+/// time. We're not looking for cases like:
+/// r0 = muls ...
+/// r1 = add.w r0, ...
+/// ...
+///    = mul.w r1
+/// In this case it would have been ok to narrow the mul.w to muls since there
+/// are indirect RAW dependency between the muls and the mul.w
+bool
+Thumb2SizeReduce::canAddPseudoFlagDep(MachineInstr *Use, bool FirstInSelfLoop) {
+  // Disable the check for -Oz (aka OptimizeForSizeHarder).
+  if (MinimizeSize || !STI->avoidCPSRPartialUpdate())
+    return false;
+
+  if (!CPSRDef)
+    // If this BB loops back to itself, conservatively avoid narrowing the
+    // first instruction that does partial flag update.
+    return HighLatencyCPSR || FirstInSelfLoop;
+
+  SmallSet<unsigned, 2> Defs;
+  for (unsigned i = 0, e = CPSRDef->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = CPSRDef->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    Defs.insert(Reg);
+  }
+
+  for (unsigned i = 0, e = Use->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = Use->getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Defs.count(Reg))
+      return false;
+  }
+
+  // If the current CPSR has high latency, try to avoid the false dependency.
+  if (HighLatencyCPSR)
+    return true;
+
+  // tMOVi8 usually doesn't start long dependency chains, and there are a lot
+  // of them, so always shrink them when CPSR doesn't have high latency.
+  if (Use->getOpcode() == ARM::t2MOVi ||
+      Use->getOpcode() == ARM::t2MOVi16)
+    return false;
+
+  // No read-after-write dependency. The narrowing will add false dependency.
+  return true;
+}
+
+bool
+Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+                                  bool is2Addr, ARMCC::CondCodes Pred,
+                                  bool LiveCPSR, bool &HasCC, bool &CCDead) {
+  if ((is2Addr  && Entry.PredCC2 == 0) ||
+      (!is2Addr && Entry.PredCC1 == 0)) {
+    if (Pred == ARMCC::AL) {
+      // Not predicated, must set CPSR.
+      if (!HasCC) {
+        // Original instruction was not setting CPSR, but CPSR is not
+        // currently live anyway. It's ok to set it. The CPSR def is
+        // dead though.
+        if (!LiveCPSR) {
+          HasCC = true;
+          CCDead = true;
+          return true;
+        }
+        return false;
+      }
+    } else {
+      // Predicated, must not set CPSR.
+      if (HasCC)
+        return false;
+    }
+  } else if ((is2Addr  && Entry.PredCC2 == 2) ||
+             (!is2Addr && Entry.PredCC1 == 2)) {
+    /// Old opcode has an optional def of CPSR.
+    if (HasCC)
+      return true;
+    // If old opcode does not implicitly define CPSR, then it's not ok since
+    // these new opcodes' CPSR def is not meant to be thrown away. e.g. CMP.
+    if (!HasImplicitCPSRDef(MI->getDesc()))
+      return false;
+    HasCC = true;
+  } else {
+    // 16-bit instruction does not set CPSR.
+    if (HasCC)
+      return false;
+  }
+
+  return true;
+}
+
+static bool VerifyLowRegs(MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+  bool isPCOk = (Opc == ARM::t2LDMIA_RET || Opc == ARM::t2LDMIA     ||
+                 Opc == ARM::t2LDMDB     || Opc == ARM::t2LDMIA_UPD ||
+                 Opc == ARM::t2LDMDB_UPD);
+  bool isLROk = (Opc == ARM::t2STMIA_UPD || Opc == ARM::t2STMDB_UPD);
+  bool isSPOk = isPCOk || isLROk;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (Reg == 0 || Reg == ARM::CPSR)
+      continue;
+    if (isPCOk && Reg == ARM::PC)
+      continue;
+    if (isLROk && Reg == ARM::LR)
+      continue;
+    if (Reg == ARM::SP) {
+      if (isSPOk)
+        continue;
+      if (i == 1 && (Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12))
+        // Special case for these ldr / str with sp as base register.
+        continue;
+    }
+    if (!isARMLowRegister(Reg))
+      return false;
+  }
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const ReduceEntry &Entry) {
+  if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt))
+    return false;
+
+  unsigned Scale = 1;
+  bool HasImmOffset = false;
+  bool HasShift = false;
+  bool HasOffReg = true;
+  bool isLdStMul = false;
+  unsigned Opc = Entry.NarrowOpc1;
+  unsigned OpNum = 3; // First 'rest' of operands.
+  uint8_t  ImmLimit = Entry.Imm1Limit;
+
+  switch (Entry.WideOpc) {
+  default:
+    llvm_unreachable("Unexpected Thumb2 load / store opcode!");
+  case ARM::t2LDRi12:
+  case ARM::t2STRi12:
+    if (MI->getOperand(1).getReg() == ARM::SP) {
+      Opc = Entry.NarrowOpc2;
+      ImmLimit = Entry.Imm2Limit;
+      HasOffReg = false;
+    }
+
+    Scale = 4;
+    HasImmOffset = true;
+    HasOffReg = false;
+    break;
+  case ARM::t2LDRBi12:
+  case ARM::t2STRBi12:
+    HasImmOffset = true;
+    HasOffReg = false;
+    break;
+  case ARM::t2LDRHi12:
+  case ARM::t2STRHi12:
+    Scale = 2;
+    HasImmOffset = true;
+    HasOffReg = false;
+    break;
+  case ARM::t2LDRs:
+  case ARM::t2LDRBs:
+  case ARM::t2LDRHs:
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRSHs:
+  case ARM::t2STRs:
+  case ARM::t2STRBs:
+  case ARM::t2STRHs:
+    HasShift = true;
+    OpNum = 4;
+    break;
+  case ARM::t2LDMIA:
+  case ARM::t2LDMDB: {
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    if (!isARMLowRegister(BaseReg) || Entry.WideOpc != ARM::t2LDMIA)
+      return false;
+
+    // For the non-writeback version (this one), the base register must be
+    // one of the registers being loaded.
+    bool isOK = false;
+    for (unsigned i = 4; i < MI->getNumOperands(); ++i) {
+      if (MI->getOperand(i).getReg() == BaseReg) {
+        isOK = true;
+        break;
+      }
+    }
+
+    if (!isOK)
+      return false;
+
+    OpNum = 0;
+    isLdStMul = true;
+    break;
+  }
+  case ARM::t2LDMIA_RET: {
+    unsigned BaseReg = MI->getOperand(1).getReg();
+    if (BaseReg != ARM::SP)
+      return false;
+    Opc = Entry.NarrowOpc2; // tPOP_RET
+    OpNum = 2;
+    isLdStMul = true;
+    break;
+  }
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
+    OpNum = 0;
+
+    unsigned BaseReg = MI->getOperand(1).getReg();
+    if (BaseReg == ARM::SP &&
+        (Entry.WideOpc == ARM::t2LDMIA_UPD ||
+         Entry.WideOpc == ARM::t2STMDB_UPD)) {
+      Opc = Entry.NarrowOpc2; // tPOP or tPUSH
+      OpNum = 2;
+    } else if (!isARMLowRegister(BaseReg) ||
+               (Entry.WideOpc != ARM::t2LDMIA_UPD &&
+                Entry.WideOpc != ARM::t2STMIA_UPD)) {
+      return false;
+    }
+
+    isLdStMul = true;
+    break;
+  }
+  }
+
+  unsigned OffsetReg = 0;
+  bool OffsetKill = false;
+  if (HasShift) {
+    OffsetReg  = MI->getOperand(2).getReg();
+    OffsetKill = MI->getOperand(2).isKill();
+
+    if (MI->getOperand(3).getImm())
+      // Thumb1 addressing mode doesn't support shift.
+      return false;
+  }
+
+  unsigned OffsetImm = 0;
+  if (HasImmOffset) {
+    OffsetImm = MI->getOperand(2).getImm();
+    unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale;
+
+    if ((OffsetImm & (Scale - 1)) || OffsetImm > MaxOffset)
+      // Make sure the immediate field fits.
+      return false;
+  }
+
+  // Add the 16-bit load / store instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc));
+  if (!isLdStMul) {
+    MIB.addOperand(MI->getOperand(0));
+    MIB.addOperand(MI->getOperand(1));
+
+    if (HasImmOffset)
+      MIB.addImm(OffsetImm / Scale);
+
+    assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
+
+    if (HasOffReg)
+      MIB.addReg(OffsetReg, getKillRegState(OffsetKill));
+  }
+
+  // Transfer the rest of operands.
+  for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
+    MIB.addOperand(MI->getOperand(OpNum));
+
+  // Transfer memoperands.
+  MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++NumLdSts;
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR, bool IsSelfLoop) {
+  unsigned Opc = MI->getOpcode();
+  if (Opc == ARM::t2ADDri) {
+    // If the source register is SP, try to reduce to tADDrSPi, otherwise
+    // it's a normal reduce.
+    if (MI->getOperand(1).getReg() != ARM::SP) {
+      if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+        return true;
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+    }
+    // Try to reduce to tADDrSPi.
+    unsigned Imm = MI->getOperand(2).getImm();
+    // The immediate must be in range, the destination register must be a low
+    // reg, the predicate must be "always" and the condition flags must not
+    // be being set.
+    if (Imm & 3 || Imm > 1020)
+      return false;
+    if (!isARMLowRegister(MI->getOperand(0).getReg()))
+      return false;
+    if (MI->getOperand(3).getImm() != ARMCC::AL)
+      return false;
+    const MCInstrDesc &MCID = MI->getDesc();
+    if (MCID.hasOptionalDef() &&
+        MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR)
+      return false;
+
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
+                                      TII->get(ARM::tADDrSPi))
+      .addOperand(MI->getOperand(0))
+      .addOperand(MI->getOperand(1))
+      .addImm(Imm / 4); // The tADDrSPi has an implied scale by four.
+    AddDefaultPred(MIB);
+
+    // Transfer MI flags.
+    MIB.setMIFlags(MI->getFlags());
+
+    DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " <<*MIB);
+
+    MBB.erase_instr(MI);
+    ++NumNarrows;
+    return true;
+  }
+
+  if (Entry.LowRegs1 && !VerifyLowRegs(MI))
+    return false;
+
+  if (MI->mayLoad() || MI->mayStore())
+    return ReduceLoadStore(MBB, MI, Entry);
+
+  switch (Opc) {
+  default: break;
+  case ARM::t2ADDSri:
+  case ARM::t2ADDSrr: {
+    unsigned PredReg = 0;
+    if (getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+      switch (Opc) {
+      default: break;
+      case ARM::t2ADDSri: {
+        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+          return true;
+        // fallthrough
+      }
+      case ARM::t2ADDSrr:
+        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+      }
+    }
+    break;
+  }
+  case ARM::t2RSBri:
+  case ARM::t2RSBSri:
+  case ARM::t2SXTB:
+  case ARM::t2SXTH:
+  case ARM::t2UXTB:
+  case ARM::t2UXTH:
+    if (MI->getOperand(2).getImm() == 0)
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+    break;
+  case ARM::t2MOVi16:
+    // Can convert only 'pure' immediate operands, not immediates obtained as
+    // globals' addresses.
+    if (MI->getOperand(1).isImm())
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+    break;
+  case ARM::t2CMPrr: {
+    // Try to reduce to the lo-reg only version first. Why there are two
+    // versions of the instruction is a mystery.
+    // It would be nice to just have two entries in the master table that
+    // are prioritized, but the table assumes a unique entry for each
+    // source insn opcode. So for now, we hack a local entry record to use.
+    static const ReduceEntry NarrowEntry =
+      { ARM::t2CMPrr,ARM::tCMPr, 0, 0, 0, 1, 1,2, 0, 0,1,0 };
+    if (ReduceToNarrow(MBB, MI, NarrowEntry, LiveCPSR, IsSelfLoop))
+      return true;
+    return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+  }
+  }
+  return false;
+}
+
+bool
+Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR, bool IsSelfLoop) {
+
+  if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
+    return false;
+
+  if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs &&
+      STI->avoidMOVsShifterOperand())
+    // Don't issue movs with shifter operand for some CPUs unless we
+    // are optimizing / minimizing for size.
+    return false;
+
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  // t2MUL is "special". The tied source operand is second, not first.
+  if (MI->getOpcode() == ARM::t2MUL) {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    // Early exit if the regs aren't all low regs.
+    if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
+        || !isARMLowRegister(Reg2))
+      return false;
+    if (Reg0 != Reg2) {
+      // If the other operand also isn't the same as the destination, we
+      // can't reduce.
+      if (Reg1 != Reg0)
+        return false;
+      // Try to commute the operands to make it a 2-address instruction.
+      MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+      if (!CommutedMI)
+        return false;
+    }
+  } else if (Reg0 != Reg1) {
+    // Try to commute the operands to make it a 2-address instruction.
+    unsigned CommOpIdx1, CommOpIdx2;
+    if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) ||
+        CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0)
+      return false;
+    MachineInstr *CommutedMI = TII->commuteInstruction(MI);
+    if (!CommutedMI)
+      return false;
+  }
+  if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
+    return false;
+  if (Entry.Imm2Limit) {
+    unsigned Imm = MI->getOperand(2).getImm();
+    unsigned Limit = (1 << Entry.Imm2Limit) - 1;
+    if (Imm > Limit)
+      return false;
+  } else {
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
+      return false;
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewMCID.isPredicable())
+      // Can't transfer predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewMCID.isPredicable();
+  }
+
+  bool HasCC = false;
+  bool CCDead = false;
+  const MCInstrDesc &MCID = MI->getDesc();
+  if (MCID.hasOptionalDef()) {
+    unsigned NumOps = MCID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Avoid adding a false dependency on partial flag update by some 16-bit
+  // instructions which has the 's' bit set.
+  if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
+      canAddPseudoFlagDep(MI, IsSelfLoop))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewMCID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of operands.
+  unsigned NumOps = MCID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
+      continue;
+    if (SkipPred && MCID.OpInfo[i].isPredicate())
+      continue;
+    MIB.addOperand(MI->getOperand(i));
+  }
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++Num2Addrs;
+  return true;
+}
+
+bool
+Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+                                 const ReduceEntry &Entry,
+                                 bool LiveCPSR, bool IsSelfLoop) {
+  if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
+    return false;
+
+  if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs &&
+      STI->avoidMOVsShifterOperand())
+    // Don't issue movs with shifter operand for some CPUs unless we
+    // are optimizing / minimizing for size.
+    return false;
+
+  unsigned Limit = ~0U;
+  if (Entry.Imm1Limit)
+    Limit = (1 << Entry.Imm1Limit) - 1;
+
+  const MCInstrDesc &MCID = MI->getDesc();
+  for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
+    if (MCID.OpInfo[i].isPredicate())
+      continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg()) {
+      unsigned Reg = MO.getReg();
+      if (!Reg || Reg == ARM::CPSR)
+        continue;
+      if (Entry.LowRegs1 && !isARMLowRegister(Reg))
+        return false;
+    } else if (MO.isImm() &&
+               !MCID.OpInfo[i].isPredicate()) {
+      if (((unsigned)MO.getImm()) > Limit)
+        return false;
+    }
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewMCID.isPredicable())
+      // Can't transfer predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewMCID.isPredicable();
+  }
+
+  bool HasCC = false;
+  bool CCDead = false;
+  if (MCID.hasOptionalDef()) {
+    unsigned NumOps = MCID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Avoid adding a false dependency on partial flag update by some 16-bit
+  // instructions which has the 's' bit set.
+  if (Entry.PartFlag && NewMCID.hasOptionalDef() && HasCC &&
+      canAddPseudoFlagDep(MI, IsSelfLoop))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewMCID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of operands.
+  unsigned NumOps = MCID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
+      continue;
+    if ((MCID.getOpcode() == ARM::t2RSBSri ||
+         MCID.getOpcode() == ARM::t2RSBri ||
+         MCID.getOpcode() == ARM::t2SXTB ||
+         MCID.getOpcode() == ARM::t2SXTH ||
+         MCID.getOpcode() == ARM::t2UXTB ||
+         MCID.getOpcode() == ARM::t2UXTH) && i == 2)
+      // Skip the zero immediate operand, it's now implicit.
+      continue;
+    bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate());
+    if (SkipPred && isPred)
+        continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR)
+      // Skip implicit def of CPSR. Either it's modeled as an optional
+      // def now or it's already an implicit def on the new instruction.
+      continue;
+    MIB.addOperand(MO);
+  }
+  if (!MCID.isPredicable() && NewMCID.isPredicable())
+    AddDefaultPred(MIB);
+
+  // Transfer MI flags.
+  MIB.setMIFlags(MI->getFlags());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase_instr(MI);
+  ++NumNarrows;
+  return true;
+}
+
+static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR, bool &DefCPSR) {
+  bool HasDef = false;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isUse())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+
+    DefCPSR = true;
+    if (!MO.isDead())
+      HasDef = true;
+  }
+
+  return HasDef || LiveCPSR;
+}
+
+static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isDef())
+      continue;
+    if (MO.getReg() != ARM::CPSR)
+      continue;
+    assert(LiveCPSR && "CPSR liveness tracking is wrong!");
+    if (MO.isKill()) {
+      LiveCPSR = false;
+      break;
+    }
+  }
+
+  return LiveCPSR;
+}
+
+bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI,
+                                bool LiveCPSR, bool IsSelfLoop) {
+  unsigned Opcode = MI->getOpcode();
+  DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
+  if (OPI == ReduceOpcodeMap.end())
+    return false;
+  const ReduceEntry &Entry = ReduceTable[OPI->second];
+
+  // Don't attempt normal reductions on "special" cases for now.
+  if (Entry.Special)
+    return ReduceSpecial(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
+
+  // Try to transform to a 16-bit two-address instruction.
+  if (Entry.NarrowOpc2 &&
+      ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+    return true;
+
+  // Try to transform to a 16-bit non-two-address instruction.
+  if (Entry.NarrowOpc1 &&
+      ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
+    return true;
+
+  return false;
+}
+
+bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  // Yes, CPSR could be livein.
+  bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
+  MachineInstr *BundleMI = 0;
+
+  CPSRDef = 0;
+  HighLatencyCPSR = false;
+
+  // Check predecessors for the latest CPSRDef.
+  bool HasBackEdges = false;
+  for (MachineBasicBlock::pred_iterator
+       I = MBB.pred_begin(), E = MBB.pred_end(); I != E; ++I) {
+    const MBBInfo &PInfo = BlockInfo[(*I)->getNumber()];
+    if (!PInfo.Visited) {
+      // Since blocks are visited in RPO, this must be a back-edge.
+      HasBackEdges = true;
+      continue;
+    }
+    if (PInfo.HighLatencyCPSR) {
+      HighLatencyCPSR = true;
+      break;
+    }
+  }
+
+  // If this BB loops back to itself, conservatively avoid narrowing the
+  // first instruction that does partial flag update.
+  bool IsSelfLoop = MBB.isSuccessor(&MBB);
+  MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),E = MBB.instr_end();
+  MachineBasicBlock::instr_iterator NextMII;
+  for (; MII != E; MII = NextMII) {
+    NextMII = llvm::next(MII);
+
+    MachineInstr *MI = &*MII;
+    if (MI->isBundle()) {
+      BundleMI = MI;
+      continue;
+    }
+    if (MI->isDebugValue())
+      continue;
+
+    LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
+
+    // Does NextMII belong to the same bundle as MI?
+    bool NextInSameBundle = NextMII != E && NextMII->isBundledWithPred();
+
+    if (ReduceMI(MBB, MI, LiveCPSR, IsSelfLoop)) {
+      Modified = true;
+      MachineBasicBlock::instr_iterator I = prior(NextMII);
+      MI = &*I;
+      // Removing and reinserting the first instruction in a bundle will break
+      // up the bundle. Fix the bundling if it was broken.
+      if (NextInSameBundle && !NextMII->isBundledWithPred())
+        NextMII->bundleWithPred();
+    }
+
+    if (!NextInSameBundle && MI->isInsideBundle()) {
+      // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill
+      // marker is only on the BUNDLE instruction. Process the BUNDLE
+      // instruction as we finish with the bundled instruction to work around
+      // the inconsistency.
+      if (BundleMI->killsRegister(ARM::CPSR))
+        LiveCPSR = false;
+      MachineOperand *MO = BundleMI->findRegisterDefOperand(ARM::CPSR);
+      if (MO && !MO->isDead())
+        LiveCPSR = true;
+    }
+
+    bool DefCPSR = false;
+    LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
+    if (MI->isCall()) {
+      // Calls don't really set CPSR.
+      CPSRDef = 0;
+      HighLatencyCPSR = false;
+      IsSelfLoop = false;
+    } else if (DefCPSR) {
+      // This is the last CPSR defining instruction.
+      CPSRDef = MI;
+      HighLatencyCPSR = isHighLatencyCPSR(CPSRDef);
+      IsSelfLoop = false;
+    }
+  }
+
+  MBBInfo &Info = BlockInfo[MBB.getNumber()];
+  Info.HighLatencyCPSR = HighLatencyCPSR;
+  Info.Visited = true;
+  return Modified;
+}
+
+bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
+  TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+  STI = &TM.getSubtarget<ARMSubtarget>();
+
+  // Optimizing / minimizing size?
+  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+  OptimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::OptimizeForSize);
+  MinimizeSize = FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::MinSize);
+
+  BlockInfo.clear();
+  BlockInfo.resize(MF.getNumBlockIDs());
+
+  // Visit blocks in reverse post-order so LastCPSRDef is known for all
+  // predecessors.
+  ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+  bool Modified = false;
+  for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator
+       I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
+    Modified |= ReduceMBB(**I);
+  return Modified;
+}
+
+/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size
+/// reduction pass.
+FunctionPass *llvm::createThumb2SizeReductionPass() {
+  return new Thumb2SizeReduce();
+}
diff --git a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp
new file mode 100644
index 000000000000..3e69098edcc3
--- /dev/null
+++ b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp
@@ -0,0 +1,2114 @@
+//===-- CPPBackend.cpp - Library for converting LLVM code to C++ code -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the writing of the LLVM IR as a set of C++ calls to the
+// LLVM IR interface. The input module is assumed to be verified.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CPPTargetMachine.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/config.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cstdio>
+#include <map>
+#include <set>
+using namespace llvm;
+
+static cl::opt<std::string>
+FuncName("cppfname", cl::desc("Specify the name of the generated function"),
+         cl::value_desc("function name"));
+
+enum WhatToGenerate {
+  GenProgram,
+  GenModule,
+  GenContents,
+  GenFunction,
+  GenFunctions,
+  GenInline,
+  GenVariable,
+  GenType
+};
+
+static cl::opt<WhatToGenerate> GenerationType("cppgen", cl::Optional,
+  cl::desc("Choose what kind of output to generate"),
+  cl::init(GenProgram),
+  cl::values(
+    clEnumValN(GenProgram,  "program",   "Generate a complete program"),
+    clEnumValN(GenModule,   "module",    "Generate a module definition"),
+    clEnumValN(GenContents, "contents",  "Generate contents of a module"),
+    clEnumValN(GenFunction, "function",  "Generate a function definition"),
+    clEnumValN(GenFunctions,"functions", "Generate all function definitions"),
+    clEnumValN(GenInline,   "inline",    "Generate an inline function"),
+    clEnumValN(GenVariable, "variable",  "Generate a variable definition"),
+    clEnumValN(GenType,     "type",      "Generate a type definition"),
+    clEnumValEnd
+  )
+);
+
+static cl::opt<std::string> NameToGenerate("cppfor", cl::Optional,
+  cl::desc("Specify the name of the thing to generate"),
+  cl::init("!bad!"));
+
+extern "C" void LLVMInitializeCppBackendTarget() {
+  // Register the target.
+  RegisterTargetMachine<CPPTargetMachine> X(TheCppBackendTarget);
+}
+
+namespace {
+  typedef std::vector<Type*> TypeList;
+  typedef std::map<Type*,std::string> TypeMap;
+  typedef std::map<const Value*,std::string> ValueMap;
+  typedef std::set<std::string> NameSet;
+  typedef std::set<Type*> TypeSet;
+  typedef std::set<const Value*> ValueSet;
+  typedef std::map<const Value*,std::string> ForwardRefMap;
+
+  /// CppWriter - This class is the main chunk of code that converts an LLVM
+  /// module to a C++ translation unit.
+  class CppWriter : public ModulePass {
+    formatted_raw_ostream &Out;
+    const Module *TheModule;
+    uint64_t uniqueNum;
+    TypeMap TypeNames;
+    ValueMap ValueNames;
+    NameSet UsedNames;
+    TypeSet DefinedTypes;
+    ValueSet DefinedValues;
+    ForwardRefMap ForwardRefs;
+    bool is_inline;
+    unsigned indent_level;
+
+  public:
+    static char ID;
+    explicit CppWriter(formatted_raw_ostream &o) :
+      ModulePass(ID), Out(o), uniqueNum(0), is_inline(false), indent_level(0){}
+
+    virtual const char *getPassName() const { return "C++ backend"; }
+
+    bool runOnModule(Module &M);
+
+    void printProgram(const std::string& fname, const std::string& modName );
+    void printModule(const std::string& fname, const std::string& modName );
+    void printContents(const std::string& fname, const std::string& modName );
+    void printFunction(const std::string& fname, const std::string& funcName );
+    void printFunctions();
+    void printInline(const std::string& fname, const std::string& funcName );
+    void printVariable(const std::string& fname, const std::string& varName );
+    void printType(const std::string& fname, const std::string& typeName );
+
+    void error(const std::string& msg);
+
+    
+    formatted_raw_ostream& nl(formatted_raw_ostream &Out, int delta = 0);
+    inline void in() { indent_level++; }
+    inline void out() { if (indent_level >0) indent_level--; }
+    
+  private:
+    void printLinkageType(GlobalValue::LinkageTypes LT);
+    void printVisibilityType(GlobalValue::VisibilityTypes VisTypes);
+    void printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM);
+    void printCallingConv(CallingConv::ID cc);
+    void printEscapedString(const std::string& str);
+    void printCFP(const ConstantFP* CFP);
+
+    std::string getCppName(Type* val);
+    inline void printCppName(Type* val);
+
+    std::string getCppName(const Value* val);
+    inline void printCppName(const Value* val);
+
+    void printAttributes(const AttributeSet &PAL, const std::string &name);
+    void printType(Type* Ty);
+    void printTypes(const Module* M);
+
+    void printConstant(const Constant *CPV);
+    void printConstants(const Module* M);
+
+    void printVariableUses(const GlobalVariable *GV);
+    void printVariableHead(const GlobalVariable *GV);
+    void printVariableBody(const GlobalVariable *GV);
+
+    void printFunctionUses(const Function *F);
+    void printFunctionHead(const Function *F);
+    void printFunctionBody(const Function *F);
+    void printInstruction(const Instruction *I, const std::string& bbname);
+    std::string getOpName(const Value*);
+
+    void printModuleBody();
+  };
+} // end anonymous namespace.
+
+formatted_raw_ostream &CppWriter::nl(formatted_raw_ostream &Out, int delta) {
+  Out << '\n';
+  if (delta >= 0 || indent_level >= unsigned(-delta))
+    indent_level += delta;
+  Out.indent(indent_level);
+  return Out;
+}
+
+static inline void sanitize(std::string &str) {
+  for (size_t i = 0; i < str.length(); ++i)
+    if (!isalnum(str[i]) && str[i] != '_')
+      str[i] = '_';
+}
+
+static std::string getTypePrefix(Type *Ty) {
+  switch (Ty->getTypeID()) {
+  case Type::VoidTyID:     return "void_";
+  case Type::IntegerTyID:
+    return "int" + utostr(cast<IntegerType>(Ty)->getBitWidth()) + "_";
+  case Type::FloatTyID:    return "float_";
+  case Type::DoubleTyID:   return "double_";
+  case Type::LabelTyID:    return "label_";
+  case Type::FunctionTyID: return "func_";
+  case Type::StructTyID:   return "struct_";
+  case Type::ArrayTyID:    return "array_";
+  case Type::PointerTyID:  return "ptr_";
+  case Type::VectorTyID:   return "packed_";
+  default:                 return "other_";
+  }
+}
+
+void CppWriter::error(const std::string& msg) {
+  report_fatal_error(msg);
+}
+
+static inline std::string ftostr(const APFloat& V) {
+  std::string Buf;
+  if (&V.getSemantics() == &APFloat::IEEEdouble) {
+    raw_string_ostream(Buf) << V.convertToDouble();
+    return Buf;
+  } else if (&V.getSemantics() == &APFloat::IEEEsingle) {
+    raw_string_ostream(Buf) << (double)V.convertToFloat();
+    return Buf;
+  }
+  return "<unknown format in ftostr>"; // error
+}
+
+// printCFP - Print a floating point constant .. very carefully :)
+// This makes sure that conversion to/from floating yields the same binary
+// result so that we don't lose precision.
+void CppWriter::printCFP(const ConstantFP *CFP) {
+  bool ignored;
+  APFloat APF = APFloat(CFP->getValueAPF());  // copy
+  if (CFP->getType() == Type::getFloatTy(CFP->getContext()))
+    APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
+  Out << "ConstantFP::get(mod->getContext(), ";
+  Out << "APFloat(";
+#if HAVE_PRINTF_A
+  char Buffer[100];
+  sprintf(Buffer, "%A", APF.convertToDouble());
+  if ((!strncmp(Buffer, "0x", 2) ||
+       !strncmp(Buffer, "-0x", 3) ||
+       !strncmp(Buffer, "+0x", 3)) &&
+      APF.bitwiseIsEqual(APFloat(atof(Buffer)))) {
+    if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
+      Out << "BitsToDouble(" << Buffer << ")";
+    else
+      Out << "BitsToFloat((float)" << Buffer << ")";
+    Out << ")";
+  } else {
+#endif
+    std::string StrVal = ftostr(CFP->getValueAPF());
+
+    while (StrVal[0] == ' ')
+      StrVal.erase(StrVal.begin());
+
+    // Check to make sure that the stringized number is not some string like
+    // "Inf" or NaN.  Check that the string matches the "[-+]?[0-9]" regex.
+    if (((StrVal[0] >= '0' && StrVal[0] <= '9') ||
+         ((StrVal[0] == '-' || StrVal[0] == '+') &&
+          (StrVal[1] >= '0' && StrVal[1] <= '9'))) &&
+        (CFP->isExactlyValue(atof(StrVal.c_str())))) {
+      if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
+        Out <<  StrVal;
+      else
+        Out << StrVal << "f";
+    } else if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
+      Out << "BitsToDouble(0x"
+          << utohexstr(CFP->getValueAPF().bitcastToAPInt().getZExtValue())
+          << "ULL) /* " << StrVal << " */";
+    else
+      Out << "BitsToFloat(0x"
+          << utohexstr((uint32_t)CFP->getValueAPF().
+                                      bitcastToAPInt().getZExtValue())
+          << "U) /* " << StrVal << " */";
+    Out << ")";
+#if HAVE_PRINTF_A
+  }
+#endif
+  Out << ")";
+}
+
+void CppWriter::printCallingConv(CallingConv::ID cc){
+  // Print the calling convention.
+  switch (cc) {
+  case CallingConv::C:     Out << "CallingConv::C"; break;
+  case CallingConv::Fast:  Out << "CallingConv::Fast"; break;
+  case CallingConv::Cold:  Out << "CallingConv::Cold"; break;
+  case CallingConv::FirstTargetCC: Out << "CallingConv::FirstTargetCC"; break;
+  default:                 Out << cc; break;
+  }
+}
+
+void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
+  switch (LT) {
+  case GlobalValue::InternalLinkage:
+    Out << "GlobalValue::InternalLinkage"; break;
+  case GlobalValue::PrivateLinkage:
+    Out << "GlobalValue::PrivateLinkage"; break;
+  case GlobalValue::LinkerPrivateLinkage:
+    Out << "GlobalValue::LinkerPrivateLinkage"; break;
+  case GlobalValue::LinkerPrivateWeakLinkage:
+    Out << "GlobalValue::LinkerPrivateWeakLinkage"; break;
+  case GlobalValue::AvailableExternallyLinkage:
+    Out << "GlobalValue::AvailableExternallyLinkage "; break;
+  case GlobalValue::LinkOnceAnyLinkage:
+    Out << "GlobalValue::LinkOnceAnyLinkage "; break;
+  case GlobalValue::LinkOnceODRLinkage:
+    Out << "GlobalValue::LinkOnceODRLinkage "; break;
+  case GlobalValue::LinkOnceODRAutoHideLinkage:
+    Out << "GlobalValue::LinkOnceODRAutoHideLinkage"; break;
+  case GlobalValue::WeakAnyLinkage:
+    Out << "GlobalValue::WeakAnyLinkage"; break;
+  case GlobalValue::WeakODRLinkage:
+    Out << "GlobalValue::WeakODRLinkage"; break;
+  case GlobalValue::AppendingLinkage:
+    Out << "GlobalValue::AppendingLinkage"; break;
+  case GlobalValue::ExternalLinkage:
+    Out << "GlobalValue::ExternalLinkage"; break;
+  case GlobalValue::DLLImportLinkage:
+    Out << "GlobalValue::DLLImportLinkage"; break;
+  case GlobalValue::DLLExportLinkage:
+    Out << "GlobalValue::DLLExportLinkage"; break;
+  case GlobalValue::ExternalWeakLinkage:
+    Out << "GlobalValue::ExternalWeakLinkage"; break;
+  case GlobalValue::CommonLinkage:
+    Out << "GlobalValue::CommonLinkage"; break;
+  }
+}
+
+void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) {
+  switch (VisType) {
+  case GlobalValue::DefaultVisibility:
+    Out << "GlobalValue::DefaultVisibility";
+    break;
+  case GlobalValue::HiddenVisibility:
+    Out << "GlobalValue::HiddenVisibility";
+    break;
+  case GlobalValue::ProtectedVisibility:
+    Out << "GlobalValue::ProtectedVisibility";
+    break;
+  }
+}
+
+void CppWriter::printThreadLocalMode(GlobalVariable::ThreadLocalMode TLM) {
+  switch (TLM) {
+    case GlobalVariable::NotThreadLocal:
+      Out << "GlobalVariable::NotThreadLocal";
+      break;
+    case GlobalVariable::GeneralDynamicTLSModel:
+      Out << "GlobalVariable::GeneralDynamicTLSModel";
+      break;
+    case GlobalVariable::LocalDynamicTLSModel:
+      Out << "GlobalVariable::LocalDynamicTLSModel";
+      break;
+    case GlobalVariable::InitialExecTLSModel:
+      Out << "GlobalVariable::InitialExecTLSModel";
+      break;
+    case GlobalVariable::LocalExecTLSModel:
+      Out << "GlobalVariable::LocalExecTLSModel";
+      break;
+  }
+}
+
+// printEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
+void CppWriter::printEscapedString(const std::string &Str) {
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    unsigned char C = Str[i];
+    if (isprint(C) && C != '"' && C != '\\') {
+      Out << C;
+    } else {
+      Out << "\\x"
+          << (char) ((C/16  < 10) ? ( C/16 +'0') : ( C/16 -10+'A'))
+          << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
+    }
+  }
+}
+
+std::string CppWriter::getCppName(Type* Ty) {
+  // First, handle the primitive types .. easy
+  if (Ty->isPrimitiveType() || Ty->isIntegerTy()) {
+    switch (Ty->getTypeID()) {
+    case Type::VoidTyID:   return "Type::getVoidTy(mod->getContext())";
+    case Type::IntegerTyID: {
+      unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
+      return "IntegerType::get(mod->getContext(), " + utostr(BitWidth) + ")";
+    }
+    case Type::X86_FP80TyID: return "Type::getX86_FP80Ty(mod->getContext())";
+    case Type::FloatTyID:    return "Type::getFloatTy(mod->getContext())";
+    case Type::DoubleTyID:   return "Type::getDoubleTy(mod->getContext())";
+    case Type::LabelTyID:    return "Type::getLabelTy(mod->getContext())";
+    case Type::X86_MMXTyID:  return "Type::getX86_MMXTy(mod->getContext())";
+    default:
+      error("Invalid primitive type");
+      break;
+    }
+    // shouldn't be returned, but make it sensible
+    return "Type::getVoidTy(mod->getContext())";
+  }
+
+  // Now, see if we've seen the type before and return that
+  TypeMap::iterator I = TypeNames.find(Ty);
+  if (I != TypeNames.end())
+    return I->second;
+
+  // Okay, let's build a new name for this type. Start with a prefix
+  const char* prefix = 0;
+  switch (Ty->getTypeID()) {
+  case Type::FunctionTyID:    prefix = "FuncTy_"; break;
+  case Type::StructTyID:      prefix = "StructTy_"; break;
+  case Type::ArrayTyID:       prefix = "ArrayTy_"; break;
+  case Type::PointerTyID:     prefix = "PointerTy_"; break;
+  case Type::VectorTyID:      prefix = "VectorTy_"; break;
+  default:                    prefix = "OtherTy_"; break; // prevent breakage
+  }
+
+  // See if the type has a name in the symboltable and build accordingly
+  std::string name;
+  if (StructType *STy = dyn_cast<StructType>(Ty))
+    if (STy->hasName())
+      name = STy->getName();
+  
+  if (name.empty())
+    name = utostr(uniqueNum++);
+  
+  name = std::string(prefix) + name;
+  sanitize(name);
+
+  // Save the name
+  return TypeNames[Ty] = name;
+}
+
+void CppWriter::printCppName(Type* Ty) {
+  printEscapedString(getCppName(Ty));
+}
+
+std::string CppWriter::getCppName(const Value* val) {
+  std::string name;
+  ValueMap::iterator I = ValueNames.find(val);
+  if (I != ValueNames.end() && I->first == val)
+    return  I->second;
+
+  if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(val)) {
+    name = std::string("gvar_") +
+      getTypePrefix(GV->getType()->getElementType());
+  } else if (isa<Function>(val)) {
+    name = std::string("func_");
+  } else if (const Constant* C = dyn_cast<Constant>(val)) {
+    name = std::string("const_") + getTypePrefix(C->getType());
+  } else if (const Argument* Arg = dyn_cast<Argument>(val)) {
+    if (is_inline) {
+      unsigned argNum = std::distance(Arg->getParent()->arg_begin(),
+                                      Function::const_arg_iterator(Arg)) + 1;
+      name = std::string("arg_") + utostr(argNum);
+      NameSet::iterator NI = UsedNames.find(name);
+      if (NI != UsedNames.end())
+        name += std::string("_") + utostr(uniqueNum++);
+      UsedNames.insert(name);
+      return ValueNames[val] = name;
+    } else {
+      name = getTypePrefix(val->getType());
+    }
+  } else {
+    name = getTypePrefix(val->getType());
+  }
+  if (val->hasName())
+    name += val->getName();
+  else
+    name += utostr(uniqueNum++);
+  sanitize(name);
+  NameSet::iterator NI = UsedNames.find(name);
+  if (NI != UsedNames.end())
+    name += std::string("_") + utostr(uniqueNum++);
+  UsedNames.insert(name);
+  return ValueNames[val] = name;
+}
+
+void CppWriter::printCppName(const Value* val) {
+  printEscapedString(getCppName(val));
+}
+
+void CppWriter::printAttributes(const AttributeSet &PAL,
+                                const std::string &name) {
+  Out << "AttributeSet " << name << "_PAL;";
+  nl(Out);
+  if (!PAL.isEmpty()) {
+    Out << '{'; in(); nl(Out);
+    Out << "SmallVector<AttributeSet, 4> Attrs;"; nl(Out);
+    Out << "AttributeSet PAS;"; in(); nl(Out);
+    for (unsigned i = 0; i < PAL.getNumSlots(); ++i) {
+      unsigned index = PAL.getSlotIndex(i);
+      AttrBuilder attrs(PAL.getSlotAttributes(i), index);
+      Out << "{"; in(); nl(Out);
+      Out << "AttrBuilder B;"; nl(Out);
+
+#define HANDLE_ATTR(X)                                                  \
+      if (attrs.contains(Attribute::X)) {                               \
+        Out << "B.addAttribute(Attribute::" #X ");"; nl(Out);           \
+        attrs.removeAttribute(Attribute::X);                            \
+      }
+
+      HANDLE_ATTR(SExt);
+      HANDLE_ATTR(ZExt);
+      HANDLE_ATTR(NoReturn);
+      HANDLE_ATTR(InReg);
+      HANDLE_ATTR(StructRet);
+      HANDLE_ATTR(NoUnwind);
+      HANDLE_ATTR(NoAlias);
+      HANDLE_ATTR(ByVal);
+      HANDLE_ATTR(Nest);
+      HANDLE_ATTR(ReadNone);
+      HANDLE_ATTR(ReadOnly);
+      HANDLE_ATTR(NoInline);
+      HANDLE_ATTR(AlwaysInline);
+      HANDLE_ATTR(OptimizeForSize);
+      HANDLE_ATTR(StackProtect);
+      HANDLE_ATTR(StackProtectReq);
+      HANDLE_ATTR(StackProtectStrong);
+      HANDLE_ATTR(NoCapture);
+      HANDLE_ATTR(NoRedZone);
+      HANDLE_ATTR(NoImplicitFloat);
+      HANDLE_ATTR(Naked);
+      HANDLE_ATTR(InlineHint);
+      HANDLE_ATTR(ReturnsTwice);
+      HANDLE_ATTR(UWTable);
+      HANDLE_ATTR(NonLazyBind);
+      HANDLE_ATTR(MinSize);
+#undef HANDLE_ATTR
+
+      if (attrs.contains(Attribute::StackAlignment)) {
+        Out << "B.addStackAlignmentAttr(" << attrs.getStackAlignment()<<')';
+        nl(Out);
+        attrs.removeAttribute(Attribute::StackAlignment);
+      }
+
+      Out << "PAS = AttributeSet::get(mod->getContext(), ";
+      if (index == ~0U)
+        Out << "~0U,";
+      else
+        Out << index << "U,";
+      Out << " B);"; out(); nl(Out);
+      Out << "}"; out(); nl(Out);
+      nl(Out);
+      Out << "Attrs.push_back(PAS);"; nl(Out);
+    }
+    Out << name << "_PAL = AttributeSet::get(mod->getContext(), Attrs);";
+    nl(Out);
+    out(); nl(Out);
+    Out << '}'; nl(Out);
+  }
+}
+
+void CppWriter::printType(Type* Ty) {
+  // We don't print definitions for primitive types
+  if (Ty->isPrimitiveType() || Ty->isIntegerTy())
+    return;
+
+  // If we already defined this type, we don't need to define it again.
+  if (DefinedTypes.find(Ty) != DefinedTypes.end())
+    return;
+
+  // Everything below needs the name for the type so get it now.
+  std::string typeName(getCppName(Ty));
+
+  // Print the type definition
+  switch (Ty->getTypeID()) {
+  case Type::FunctionTyID:  {
+    FunctionType* FT = cast<FunctionType>(Ty);
+    Out << "std::vector<Type*>" << typeName << "_args;";
+    nl(Out);
+    FunctionType::param_iterator PI = FT->param_begin();
+    FunctionType::param_iterator PE = FT->param_end();
+    for (; PI != PE; ++PI) {
+      Type* argTy = static_cast<Type*>(*PI);
+      printType(argTy);
+      std::string argName(getCppName(argTy));
+      Out << typeName << "_args.push_back(" << argName;
+      Out << ");";
+      nl(Out);
+    }
+    printType(FT->getReturnType());
+    std::string retTypeName(getCppName(FT->getReturnType()));
+    Out << "FunctionType* " << typeName << " = FunctionType::get(";
+    in(); nl(Out) << "/*Result=*/" << retTypeName;
+    Out << ",";
+    nl(Out) << "/*Params=*/" << typeName << "_args,";
+    nl(Out) << "/*isVarArg=*/" << (FT->isVarArg() ? "true" : "false") << ");";
+    out();
+    nl(Out);
+    break;
+  }
+  case Type::StructTyID: {
+    StructType* ST = cast<StructType>(Ty);
+    if (!ST->isLiteral()) {
+      Out << "StructType *" << typeName << " = mod->getTypeByName(\"";
+      printEscapedString(ST->getName());
+      Out << "\");";
+      nl(Out);
+      Out << "if (!" << typeName << ") {";
+      nl(Out);
+      Out << typeName << " = ";
+      Out << "StructType::create(mod->getContext(), \"";
+      printEscapedString(ST->getName());
+      Out << "\");";
+      nl(Out);
+      Out << "}";
+      nl(Out);
+      // Indicate that this type is now defined.
+      DefinedTypes.insert(Ty);
+    }
+
+    Out << "std::vector<Type*>" << typeName << "_fields;";
+    nl(Out);
+    StructType::element_iterator EI = ST->element_begin();
+    StructType::element_iterator EE = ST->element_end();
+    for (; EI != EE; ++EI) {
+      Type* fieldTy = static_cast<Type*>(*EI);
+      printType(fieldTy);
+      std::string fieldName(getCppName(fieldTy));
+      Out << typeName << "_fields.push_back(" << fieldName;
+      Out << ");";
+      nl(Out);
+    }
+
+    if (ST->isLiteral()) {
+      Out << "StructType *" << typeName << " = ";
+      Out << "StructType::get(" << "mod->getContext(), ";
+    } else {
+      Out << "if (" << typeName << "->isOpaque()) {";
+      nl(Out);
+      Out << typeName << "->setBody(";
+    }
+
+    Out << typeName << "_fields, /*isPacked=*/"
+        << (ST->isPacked() ? "true" : "false") << ");";
+    nl(Out);
+    if (!ST->isLiteral()) {
+      Out << "}";
+      nl(Out);
+    }
+    break;
+  }
+  case Type::ArrayTyID: {
+    ArrayType* AT = cast<ArrayType>(Ty);
+    Type* ET = AT->getElementType();
+    printType(ET);
+    if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
+      std::string elemName(getCppName(ET));
+      Out << "ArrayType* " << typeName << " = ArrayType::get("
+          << elemName
+          << ", " << utostr(AT->getNumElements()) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Type::PointerTyID: {
+    PointerType* PT = cast<PointerType>(Ty);
+    Type* ET = PT->getElementType();
+    printType(ET);
+    if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
+      std::string elemName(getCppName(ET));
+      Out << "PointerType* " << typeName << " = PointerType::get("
+          << elemName
+          << ", " << utostr(PT->getAddressSpace()) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Type::VectorTyID: {
+    VectorType* PT = cast<VectorType>(Ty);
+    Type* ET = PT->getElementType();
+    printType(ET);
+    if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
+      std::string elemName(getCppName(ET));
+      Out << "VectorType* " << typeName << " = VectorType::get("
+          << elemName
+          << ", " << utostr(PT->getNumElements()) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  default:
+    error("Invalid TypeID");
+  }
+
+  // Indicate that this type is now defined.
+  DefinedTypes.insert(Ty);
+
+  // Finally, separate the type definition from other with a newline.
+  nl(Out);
+}
+
+void CppWriter::printTypes(const Module* M) {
+  // Add all of the global variables to the value table.
+  for (Module::const_global_iterator I = TheModule->global_begin(),
+         E = TheModule->global_end(); I != E; ++I) {
+    if (I->hasInitializer())
+      printType(I->getInitializer()->getType());
+    printType(I->getType());
+  }
+
+  // Add all the functions to the table
+  for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
+       FI != FE; ++FI) {
+    printType(FI->getReturnType());
+    printType(FI->getFunctionType());
+    // Add all the function arguments
+    for (Function::const_arg_iterator AI = FI->arg_begin(),
+           AE = FI->arg_end(); AI != AE; ++AI) {
+      printType(AI->getType());
+    }
+
+    // Add all of the basic blocks and instructions
+    for (Function::const_iterator BB = FI->begin(),
+           E = FI->end(); BB != E; ++BB) {
+      printType(BB->getType());
+      for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
+           ++I) {
+        printType(I->getType());
+        for (unsigned i = 0; i < I->getNumOperands(); ++i)
+          printType(I->getOperand(i)->getType());
+      }
+    }
+  }
+}
+
+
+// printConstant - Print out a constant pool entry...
+void CppWriter::printConstant(const Constant *CV) {
+  // First, if the constant is actually a GlobalValue (variable or function)
+  // or its already in the constant list then we've printed it already and we
+  // can just return.
+  if (isa<GlobalValue>(CV) || ValueNames.find(CV) != ValueNames.end())
+    return;
+
+  std::string constName(getCppName(CV));
+  std::string typeName(getCppName(CV->getType()));
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+    std::string constValue = CI->getValue().toString(10, true);
+    Out << "ConstantInt* " << constName
+        << " = ConstantInt::get(mod->getContext(), APInt("
+        << cast<IntegerType>(CI->getType())->getBitWidth()
+        << ", StringRef(\"" <<  constValue << "\"), 10));";
+  } else if (isa<ConstantAggregateZero>(CV)) {
+    Out << "ConstantAggregateZero* " << constName
+        << " = ConstantAggregateZero::get(" << typeName << ");";
+  } else if (isa<ConstantPointerNull>(CV)) {
+    Out << "ConstantPointerNull* " << constName
+        << " = ConstantPointerNull::get(" << typeName << ");";
+  } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+    Out << "ConstantFP* " << constName << " = ";
+    printCFP(CFP);
+    Out << ";";
+  } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
+    Out << "std::vector<Constant*> " << constName << "_elems;";
+    nl(Out);
+    unsigned N = CA->getNumOperands();
+    for (unsigned i = 0; i < N; ++i) {
+      printConstant(CA->getOperand(i)); // recurse to print operands
+      Out << constName << "_elems.push_back("
+          << getCppName(CA->getOperand(i)) << ");";
+      nl(Out);
+    }
+    Out << "Constant* " << constName << " = ConstantArray::get("
+        << typeName << ", " << constName << "_elems);";
+  } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) {
+    Out << "std::vector<Constant*> " << constName << "_fields;";
+    nl(Out);
+    unsigned N = CS->getNumOperands();
+    for (unsigned i = 0; i < N; i++) {
+      printConstant(CS->getOperand(i));
+      Out << constName << "_fields.push_back("
+          << getCppName(CS->getOperand(i)) << ");";
+      nl(Out);
+    }
+    Out << "Constant* " << constName << " = ConstantStruct::get("
+        << typeName << ", " << constName << "_fields);";
+  } else if (const ConstantVector *CVec = dyn_cast<ConstantVector>(CV)) {
+    Out << "std::vector<Constant*> " << constName << "_elems;";
+    nl(Out);
+    unsigned N = CVec->getNumOperands();
+    for (unsigned i = 0; i < N; ++i) {
+      printConstant(CVec->getOperand(i));
+      Out << constName << "_elems.push_back("
+          << getCppName(CVec->getOperand(i)) << ");";
+      nl(Out);
+    }
+    Out << "Constant* " << constName << " = ConstantVector::get("
+        << typeName << ", " << constName << "_elems);";
+  } else if (isa<UndefValue>(CV)) {
+    Out << "UndefValue* " << constName << " = UndefValue::get("
+        << typeName << ");";
+  } else if (const ConstantDataSequential *CDS =
+               dyn_cast<ConstantDataSequential>(CV)) {
+    if (CDS->isString()) {
+      Out << "Constant *" << constName <<
+      " = ConstantDataArray::getString(mod->getContext(), \"";
+      StringRef Str = CDS->getAsString();
+      bool nullTerminate = false;
+      if (Str.back() == 0) {
+        Str = Str.drop_back();
+        nullTerminate = true;
+      }
+      printEscapedString(Str);
+      // Determine if we want null termination or not.
+      if (nullTerminate)
+        Out << "\", true);";
+      else
+        Out << "\", false);";// No null terminator
+    } else {
+      // TODO: Could generate more efficient code generating CDS calls instead.
+      Out << "std::vector<Constant*> " << constName << "_elems;";
+      nl(Out);
+      for (unsigned i = 0; i != CDS->getNumElements(); ++i) {
+        Constant *Elt = CDS->getElementAsConstant(i);
+        printConstant(Elt);
+        Out << constName << "_elems.push_back(" << getCppName(Elt) << ");";
+        nl(Out);
+      }
+      Out << "Constant* " << constName;
+      
+      if (isa<ArrayType>(CDS->getType()))
+        Out << " = ConstantArray::get(";
+      else
+        Out << " = ConstantVector::get(";
+      Out << typeName << ", " << constName << "_elems);";
+    }
+  } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+    if (CE->getOpcode() == Instruction::GetElementPtr) {
+      Out << "std::vector<Constant*> " << constName << "_indices;";
+      nl(Out);
+      printConstant(CE->getOperand(0));
+      for (unsigned i = 1; i < CE->getNumOperands(); ++i ) {
+        printConstant(CE->getOperand(i));
+        Out << constName << "_indices.push_back("
+            << getCppName(CE->getOperand(i)) << ");";
+        nl(Out);
+      }
+      Out << "Constant* " << constName
+          << " = ConstantExpr::getGetElementPtr("
+          << getCppName(CE->getOperand(0)) << ", "
+          << constName << "_indices);";
+    } else if (CE->isCast()) {
+      printConstant(CE->getOperand(0));
+      Out << "Constant* " << constName << " = ConstantExpr::getCast(";
+      switch (CE->getOpcode()) {
+      default: llvm_unreachable("Invalid cast opcode");
+      case Instruction::Trunc: Out << "Instruction::Trunc"; break;
+      case Instruction::ZExt:  Out << "Instruction::ZExt"; break;
+      case Instruction::SExt:  Out << "Instruction::SExt"; break;
+      case Instruction::FPTrunc:  Out << "Instruction::FPTrunc"; break;
+      case Instruction::FPExt:  Out << "Instruction::FPExt"; break;
+      case Instruction::FPToUI:  Out << "Instruction::FPToUI"; break;
+      case Instruction::FPToSI:  Out << "Instruction::FPToSI"; break;
+      case Instruction::UIToFP:  Out << "Instruction::UIToFP"; break;
+      case Instruction::SIToFP:  Out << "Instruction::SIToFP"; break;
+      case Instruction::PtrToInt:  Out << "Instruction::PtrToInt"; break;
+      case Instruction::IntToPtr:  Out << "Instruction::IntToPtr"; break;
+      case Instruction::BitCast:  Out << "Instruction::BitCast"; break;
+      }
+      Out << ", " << getCppName(CE->getOperand(0)) << ", "
+          << getCppName(CE->getType()) << ");";
+    } else {
+      unsigned N = CE->getNumOperands();
+      for (unsigned i = 0; i < N; ++i ) {
+        printConstant(CE->getOperand(i));
+      }
+      Out << "Constant* " << constName << " = ConstantExpr::";
+      switch (CE->getOpcode()) {
+      case Instruction::Add:    Out << "getAdd(";  break;
+      case Instruction::FAdd:   Out << "getFAdd(";  break;
+      case Instruction::Sub:    Out << "getSub("; break;
+      case Instruction::FSub:   Out << "getFSub("; break;
+      case Instruction::Mul:    Out << "getMul("; break;
+      case Instruction::FMul:   Out << "getFMul("; break;
+      case Instruction::UDiv:   Out << "getUDiv("; break;
+      case Instruction::SDiv:   Out << "getSDiv("; break;
+      case Instruction::FDiv:   Out << "getFDiv("; break;
+      case Instruction::URem:   Out << "getURem("; break;
+      case Instruction::SRem:   Out << "getSRem("; break;
+      case Instruction::FRem:   Out << "getFRem("; break;
+      case Instruction::And:    Out << "getAnd("; break;
+      case Instruction::Or:     Out << "getOr("; break;
+      case Instruction::Xor:    Out << "getXor("; break;
+      case Instruction::ICmp:
+        Out << "getICmp(ICmpInst::ICMP_";
+        switch (CE->getPredicate()) {
+        case ICmpInst::ICMP_EQ:  Out << "EQ"; break;
+        case ICmpInst::ICMP_NE:  Out << "NE"; break;
+        case ICmpInst::ICMP_SLT: Out << "SLT"; break;
+        case ICmpInst::ICMP_ULT: Out << "ULT"; break;
+        case ICmpInst::ICMP_SGT: Out << "SGT"; break;
+        case ICmpInst::ICMP_UGT: Out << "UGT"; break;
+        case ICmpInst::ICMP_SLE: Out << "SLE"; break;
+        case ICmpInst::ICMP_ULE: Out << "ULE"; break;
+        case ICmpInst::ICMP_SGE: Out << "SGE"; break;
+        case ICmpInst::ICMP_UGE: Out << "UGE"; break;
+        default: error("Invalid ICmp Predicate");
+        }
+        break;
+      case Instruction::FCmp:
+        Out << "getFCmp(FCmpInst::FCMP_";
+        switch (CE->getPredicate()) {
+        case FCmpInst::FCMP_FALSE: Out << "FALSE"; break;
+        case FCmpInst::FCMP_ORD:   Out << "ORD"; break;
+        case FCmpInst::FCMP_UNO:   Out << "UNO"; break;
+        case FCmpInst::FCMP_OEQ:   Out << "OEQ"; break;
+        case FCmpInst::FCMP_UEQ:   Out << "UEQ"; break;
+        case FCmpInst::FCMP_ONE:   Out << "ONE"; break;
+        case FCmpInst::FCMP_UNE:   Out << "UNE"; break;
+        case FCmpInst::FCMP_OLT:   Out << "OLT"; break;
+        case FCmpInst::FCMP_ULT:   Out << "ULT"; break;
+        case FCmpInst::FCMP_OGT:   Out << "OGT"; break;
+        case FCmpInst::FCMP_UGT:   Out << "UGT"; break;
+        case FCmpInst::FCMP_OLE:   Out << "OLE"; break;
+        case FCmpInst::FCMP_ULE:   Out << "ULE"; break;
+        case FCmpInst::FCMP_OGE:   Out << "OGE"; break;
+        case FCmpInst::FCMP_UGE:   Out << "UGE"; break;
+        case FCmpInst::FCMP_TRUE:  Out << "TRUE"; break;
+        default: error("Invalid FCmp Predicate");
+        }
+        break;
+      case Instruction::Shl:     Out << "getShl("; break;
+      case Instruction::LShr:    Out << "getLShr("; break;
+      case Instruction::AShr:    Out << "getAShr("; break;
+      case Instruction::Select:  Out << "getSelect("; break;
+      case Instruction::ExtractElement: Out << "getExtractElement("; break;
+      case Instruction::InsertElement:  Out << "getInsertElement("; break;
+      case Instruction::ShuffleVector:  Out << "getShuffleVector("; break;
+      default:
+        error("Invalid constant expression");
+        break;
+      }
+      Out << getCppName(CE->getOperand(0));
+      for (unsigned i = 1; i < CE->getNumOperands(); ++i)
+        Out << ", " << getCppName(CE->getOperand(i));
+      Out << ");";
+    }
+  } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) {
+    Out << "Constant* " << constName << " = ";
+    Out << "BlockAddress::get(" << getOpName(BA->getBasicBlock()) << ");";
+  } else {
+    error("Bad Constant");
+    Out << "Constant* " << constName << " = 0; ";
+  }
+  nl(Out);
+}
+
+void CppWriter::printConstants(const Module* M) {
+  // Traverse all the global variables looking for constant initializers
+  for (Module::const_global_iterator I = TheModule->global_begin(),
+         E = TheModule->global_end(); I != E; ++I)
+    if (I->hasInitializer())
+      printConstant(I->getInitializer());
+
+  // Traverse the LLVM functions looking for constants
+  for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
+       FI != FE; ++FI) {
+    // Add all of the basic blocks and instructions
+    for (Function::const_iterator BB = FI->begin(),
+           E = FI->end(); BB != E; ++BB) {
+      for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
+           ++I) {
+        for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+          if (Constant* C = dyn_cast<Constant>(I->getOperand(i))) {
+            printConstant(C);
+          }
+        }
+      }
+    }
+  }
+}
+
+void CppWriter::printVariableUses(const GlobalVariable *GV) {
+  nl(Out) << "// Type Definitions";
+  nl(Out);
+  printType(GV->getType());
+  if (GV->hasInitializer()) {
+    const Constant *Init = GV->getInitializer();
+    printType(Init->getType());
+    if (const Function *F = dyn_cast<Function>(Init)) {
+      nl(Out)<< "/ Function Declarations"; nl(Out);
+      printFunctionHead(F);
+    } else if (const GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) {
+      nl(Out) << "// Global Variable Declarations"; nl(Out);
+      printVariableHead(gv);
+      
+      nl(Out) << "// Global Variable Definitions"; nl(Out);
+      printVariableBody(gv);
+    } else  {
+      nl(Out) << "// Constant Definitions"; nl(Out);
+      printConstant(Init);
+    }
+  }
+}
+
+void CppWriter::printVariableHead(const GlobalVariable *GV) {
+  nl(Out) << "GlobalVariable* " << getCppName(GV);
+  if (is_inline) {
+    Out << " = mod->getGlobalVariable(mod->getContext(), ";
+    printEscapedString(GV->getName());
+    Out << ", " << getCppName(GV->getType()->getElementType()) << ",true)";
+    nl(Out) << "if (!" << getCppName(GV) << ") {";
+    in(); nl(Out) << getCppName(GV);
+  }
+  Out << " = new GlobalVariable(/*Module=*/*mod, ";
+  nl(Out) << "/*Type=*/";
+  printCppName(GV->getType()->getElementType());
+  Out << ",";
+  nl(Out) << "/*isConstant=*/" << (GV->isConstant()?"true":"false");
+  Out << ",";
+  nl(Out) << "/*Linkage=*/";
+  printLinkageType(GV->getLinkage());
+  Out << ",";
+  nl(Out) << "/*Initializer=*/0, ";
+  if (GV->hasInitializer()) {
+    Out << "// has initializer, specified below";
+  }
+  nl(Out) << "/*Name=*/\"";
+  printEscapedString(GV->getName());
+  Out << "\");";
+  nl(Out);
+
+  if (GV->hasSection()) {
+    printCppName(GV);
+    Out << "->setSection(\"";
+    printEscapedString(GV->getSection());
+    Out << "\");";
+    nl(Out);
+  }
+  if (GV->getAlignment()) {
+    printCppName(GV);
+    Out << "->setAlignment(" << utostr(GV->getAlignment()) << ");";
+    nl(Out);
+  }
+  if (GV->getVisibility() != GlobalValue::DefaultVisibility) {
+    printCppName(GV);
+    Out << "->setVisibility(";
+    printVisibilityType(GV->getVisibility());
+    Out << ");";
+    nl(Out);
+  }
+  if (GV->isThreadLocal()) {
+    printCppName(GV);
+    Out << "->setThreadLocalMode(";
+    printThreadLocalMode(GV->getThreadLocalMode());
+    Out << ");";
+    nl(Out);
+  }
+  if (is_inline) {
+    out(); Out << "}"; nl(Out);
+  }
+}
+
+void CppWriter::printVariableBody(const GlobalVariable *GV) {
+  if (GV->hasInitializer()) {
+    printCppName(GV);
+    Out << "->setInitializer(";
+    Out << getCppName(GV->getInitializer()) << ");";
+    nl(Out);
+  }
+}
+
+std::string CppWriter::getOpName(const Value* V) {
+  if (!isa<Instruction>(V) || DefinedValues.find(V) != DefinedValues.end())
+    return getCppName(V);
+
+  // See if its alread in the map of forward references, if so just return the
+  // name we already set up for it
+  ForwardRefMap::const_iterator I = ForwardRefs.find(V);
+  if (I != ForwardRefs.end())
+    return I->second;
+
+  // This is a new forward reference. Generate a unique name for it
+  std::string result(std::string("fwdref_") + utostr(uniqueNum++));
+
+  // Yes, this is a hack. An Argument is the smallest instantiable value that
+  // we can make as a placeholder for the real value. We'll replace these
+  // Argument instances later.
+  Out << "Argument* " << result << " = new Argument("
+      << getCppName(V->getType()) << ");";
+  nl(Out);
+  ForwardRefs[V] = result;
+  return result;
+}
+
+static StringRef ConvertAtomicOrdering(AtomicOrdering Ordering) {
+  switch (Ordering) {
+    case NotAtomic: return "NotAtomic";
+    case Unordered: return "Unordered";
+    case Monotonic: return "Monotonic";
+    case Acquire: return "Acquire";
+    case Release: return "Release";
+    case AcquireRelease: return "AcquireRelease";
+    case SequentiallyConsistent: return "SequentiallyConsistent";
+  }
+  llvm_unreachable("Unknown ordering");
+}
+
+static StringRef ConvertAtomicSynchScope(SynchronizationScope SynchScope) {
+  switch (SynchScope) {
+    case SingleThread: return "SingleThread";
+    case CrossThread: return "CrossThread";
+  }
+  llvm_unreachable("Unknown synch scope");
+}
+
+// printInstruction - This member is called for each Instruction in a function.
+void CppWriter::printInstruction(const Instruction *I,
+                                 const std::string& bbname) {
+  std::string iName(getCppName(I));
+
+  // Before we emit this instruction, we need to take care of generating any
+  // forward references. So, we get the names of all the operands in advance
+  const unsigned Ops(I->getNumOperands());
+  std::string* opNames = new std::string[Ops];
+  for (unsigned i = 0; i < Ops; i++)
+    opNames[i] = getOpName(I->getOperand(i));
+
+  switch (I->getOpcode()) {
+  default:
+    error("Invalid instruction");
+    break;
+
+  case Instruction::Ret: {
+    const ReturnInst* ret =  cast<ReturnInst>(I);
+    Out << "ReturnInst::Create(mod->getContext(), "
+        << (ret->getReturnValue() ? opNames[0] + ", " : "") << bbname << ");";
+    break;
+  }
+  case Instruction::Br: {
+    const BranchInst* br = cast<BranchInst>(I);
+    Out << "BranchInst::Create(" ;
+    if (br->getNumOperands() == 3) {
+      Out << opNames[2] << ", "
+          << opNames[1] << ", "
+          << opNames[0] << ", ";
+
+    } else if (br->getNumOperands() == 1) {
+      Out << opNames[0] << ", ";
+    } else {
+      error("Branch with 2 operands?");
+    }
+    Out << bbname << ");";
+    break;
+  }
+  case Instruction::Switch: {
+    const SwitchInst *SI = cast<SwitchInst>(I);
+    Out << "SwitchInst* " << iName << " = SwitchInst::Create("
+        << getOpName(SI->getCondition()) << ", "
+        << getOpName(SI->getDefaultDest()) << ", "
+        << SI->getNumCases() << ", " << bbname << ");";
+    nl(Out);
+    for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end();
+         i != e; ++i) {
+      const IntegersSubset CaseVal = i.getCaseValueEx();
+      const BasicBlock *BB = i.getCaseSuccessor();
+      Out << iName << "->addCase("
+          << getOpName(CaseVal) << ", "
+          << getOpName(BB) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Instruction::IndirectBr: {
+    const IndirectBrInst *IBI = cast<IndirectBrInst>(I);
+    Out << "IndirectBrInst *" << iName << " = IndirectBrInst::Create("
+        << opNames[0] << ", " << IBI->getNumDestinations() << ");";
+    nl(Out);
+    for (unsigned i = 1; i != IBI->getNumOperands(); ++i) {
+      Out << iName << "->addDestination(" << opNames[i] << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Instruction::Resume: {
+    Out << "ResumeInst::Create(mod->getContext(), " << opNames[0]
+        << ", " << bbname << ");";
+    break;
+  }
+  case Instruction::Invoke: {
+    const InvokeInst* inv = cast<InvokeInst>(I);
+    Out << "std::vector<Value*> " << iName << "_params;";
+    nl(Out);
+    for (unsigned i = 0; i < inv->getNumArgOperands(); ++i) {
+      Out << iName << "_params.push_back("
+          << getOpName(inv->getArgOperand(i)) << ");";
+      nl(Out);
+    }
+    // FIXME: This shouldn't use magic numbers -3, -2, and -1.
+    Out << "InvokeInst *" << iName << " = InvokeInst::Create("
+        << getOpName(inv->getCalledFunction()) << ", "
+        << getOpName(inv->getNormalDest()) << ", "
+        << getOpName(inv->getUnwindDest()) << ", "
+        << iName << "_params, \"";
+    printEscapedString(inv->getName());
+    Out << "\", " << bbname << ");";
+    nl(Out) << iName << "->setCallingConv(";
+    printCallingConv(inv->getCallingConv());
+    Out << ");";
+    printAttributes(inv->getAttributes(), iName);
+    Out << iName << "->setAttributes(" << iName << "_PAL);";
+    nl(Out);
+    break;
+  }
+  case Instruction::Unreachable: {
+    Out << "new UnreachableInst("
+        << "mod->getContext(), "
+        << bbname << ");";
+    break;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:{
+    Out << "BinaryOperator* " << iName << " = BinaryOperator::Create(";
+    switch (I->getOpcode()) {
+    case Instruction::Add: Out << "Instruction::Add"; break;
+    case Instruction::FAdd: Out << "Instruction::FAdd"; break;
+    case Instruction::Sub: Out << "Instruction::Sub"; break;
+    case Instruction::FSub: Out << "Instruction::FSub"; break;
+    case Instruction::Mul: Out << "Instruction::Mul"; break;
+    case Instruction::FMul: Out << "Instruction::FMul"; break;
+    case Instruction::UDiv:Out << "Instruction::UDiv"; break;
+    case Instruction::SDiv:Out << "Instruction::SDiv"; break;
+    case Instruction::FDiv:Out << "Instruction::FDiv"; break;
+    case Instruction::URem:Out << "Instruction::URem"; break;
+    case Instruction::SRem:Out << "Instruction::SRem"; break;
+    case Instruction::FRem:Out << "Instruction::FRem"; break;
+    case Instruction::And: Out << "Instruction::And"; break;
+    case Instruction::Or:  Out << "Instruction::Or";  break;
+    case Instruction::Xor: Out << "Instruction::Xor"; break;
+    case Instruction::Shl: Out << "Instruction::Shl"; break;
+    case Instruction::LShr:Out << "Instruction::LShr"; break;
+    case Instruction::AShr:Out << "Instruction::AShr"; break;
+    default: Out << "Instruction::BadOpCode"; break;
+    }
+    Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+    printEscapedString(I->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::FCmp: {
+    Out << "FCmpInst* " << iName << " = new FCmpInst(*" << bbname << ", ";
+    switch (cast<FCmpInst>(I)->getPredicate()) {
+    case FCmpInst::FCMP_FALSE: Out << "FCmpInst::FCMP_FALSE"; break;
+    case FCmpInst::FCMP_OEQ  : Out << "FCmpInst::FCMP_OEQ"; break;
+    case FCmpInst::FCMP_OGT  : Out << "FCmpInst::FCMP_OGT"; break;
+    case FCmpInst::FCMP_OGE  : Out << "FCmpInst::FCMP_OGE"; break;
+    case FCmpInst::FCMP_OLT  : Out << "FCmpInst::FCMP_OLT"; break;
+    case FCmpInst::FCMP_OLE  : Out << "FCmpInst::FCMP_OLE"; break;
+    case FCmpInst::FCMP_ONE  : Out << "FCmpInst::FCMP_ONE"; break;
+    case FCmpInst::FCMP_ORD  : Out << "FCmpInst::FCMP_ORD"; break;
+    case FCmpInst::FCMP_UNO  : Out << "FCmpInst::FCMP_UNO"; break;
+    case FCmpInst::FCMP_UEQ  : Out << "FCmpInst::FCMP_UEQ"; break;
+    case FCmpInst::FCMP_UGT  : Out << "FCmpInst::FCMP_UGT"; break;
+    case FCmpInst::FCMP_UGE  : Out << "FCmpInst::FCMP_UGE"; break;
+    case FCmpInst::FCMP_ULT  : Out << "FCmpInst::FCMP_ULT"; break;
+    case FCmpInst::FCMP_ULE  : Out << "FCmpInst::FCMP_ULE"; break;
+    case FCmpInst::FCMP_UNE  : Out << "FCmpInst::FCMP_UNE"; break;
+    case FCmpInst::FCMP_TRUE : Out << "FCmpInst::FCMP_TRUE"; break;
+    default: Out << "FCmpInst::BAD_ICMP_PREDICATE"; break;
+    }
+    Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+    printEscapedString(I->getName());
+    Out << "\");";
+    break;
+  }
+  case Instruction::ICmp: {
+    Out << "ICmpInst* " << iName << " = new ICmpInst(*" << bbname << ", ";
+    switch (cast<ICmpInst>(I)->getPredicate()) {
+    case ICmpInst::ICMP_EQ:  Out << "ICmpInst::ICMP_EQ";  break;
+    case ICmpInst::ICMP_NE:  Out << "ICmpInst::ICMP_NE";  break;
+    case ICmpInst::ICMP_ULE: Out << "ICmpInst::ICMP_ULE"; break;
+    case ICmpInst::ICMP_SLE: Out << "ICmpInst::ICMP_SLE"; break;
+    case ICmpInst::ICMP_UGE: Out << "ICmpInst::ICMP_UGE"; break;
+    case ICmpInst::ICMP_SGE: Out << "ICmpInst::ICMP_SGE"; break;
+    case ICmpInst::ICMP_ULT: Out << "ICmpInst::ICMP_ULT"; break;
+    case ICmpInst::ICMP_SLT: Out << "ICmpInst::ICMP_SLT"; break;
+    case ICmpInst::ICMP_UGT: Out << "ICmpInst::ICMP_UGT"; break;
+    case ICmpInst::ICMP_SGT: Out << "ICmpInst::ICMP_SGT"; break;
+    default: Out << "ICmpInst::BAD_ICMP_PREDICATE"; break;
+    }
+    Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+    printEscapedString(I->getName());
+    Out << "\");";
+    break;
+  }
+  case Instruction::Alloca: {
+    const AllocaInst* allocaI = cast<AllocaInst>(I);
+    Out << "AllocaInst* " << iName << " = new AllocaInst("
+        << getCppName(allocaI->getAllocatedType()) << ", ";
+    if (allocaI->isArrayAllocation())
+      Out << opNames[0] << ", ";
+    Out << "\"";
+    printEscapedString(allocaI->getName());
+    Out << "\", " << bbname << ");";
+    if (allocaI->getAlignment())
+      nl(Out) << iName << "->setAlignment("
+          << allocaI->getAlignment() << ");";
+    break;
+  }
+  case Instruction::Load: {
+    const LoadInst* load = cast<LoadInst>(I);
+    Out << "LoadInst* " << iName << " = new LoadInst("
+        << opNames[0] << ", \"";
+    printEscapedString(load->getName());
+    Out << "\", " << (load->isVolatile() ? "true" : "false" )
+        << ", " << bbname << ");";
+    if (load->getAlignment())
+      nl(Out) << iName << "->setAlignment("
+              << load->getAlignment() << ");";
+    if (load->isAtomic()) {
+      StringRef Ordering = ConvertAtomicOrdering(load->getOrdering());
+      StringRef CrossThread = ConvertAtomicSynchScope(load->getSynchScope());
+      nl(Out) << iName << "->setAtomic("
+              << Ordering << ", " << CrossThread << ");";
+    }
+    break;
+  }
+  case Instruction::Store: {
+    const StoreInst* store = cast<StoreInst>(I);
+    Out << "StoreInst* " << iName << " = new StoreInst("
+        << opNames[0] << ", "
+        << opNames[1] << ", "
+        << (store->isVolatile() ? "true" : "false")
+        << ", " << bbname << ");";
+    if (store->getAlignment())
+      nl(Out) << iName << "->setAlignment("
+              << store->getAlignment() << ");";
+    if (store->isAtomic()) {
+      StringRef Ordering = ConvertAtomicOrdering(store->getOrdering());
+      StringRef CrossThread = ConvertAtomicSynchScope(store->getSynchScope());
+      nl(Out) << iName << "->setAtomic("
+              << Ordering << ", " << CrossThread << ");";
+    }
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    const GetElementPtrInst* gep = cast<GetElementPtrInst>(I);
+    if (gep->getNumOperands() <= 2) {
+      Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create("
+          << opNames[0];
+      if (gep->getNumOperands() == 2)
+        Out << ", " << opNames[1];
+    } else {
+      Out << "std::vector<Value*> " << iName << "_indices;";
+      nl(Out);
+      for (unsigned i = 1; i < gep->getNumOperands(); ++i ) {
+        Out << iName << "_indices.push_back("
+            << opNames[i] << ");";
+        nl(Out);
+      }
+      Out << "Instruction* " << iName << " = GetElementPtrInst::Create("
+          << opNames[0] << ", " << iName << "_indices";
+    }
+    Out << ", \"";
+    printEscapedString(gep->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::PHI: {
+    const PHINode* phi = cast<PHINode>(I);
+
+    Out << "PHINode* " << iName << " = PHINode::Create("
+        << getCppName(phi->getType()) << ", "
+        << phi->getNumIncomingValues() << ", \"";
+    printEscapedString(phi->getName());
+    Out << "\", " << bbname << ");";
+    nl(Out);
+    for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i) {
+      Out << iName << "->addIncoming("
+          << opNames[PHINode::getOperandNumForIncomingValue(i)] << ", "
+          << getOpName(phi->getIncomingBlock(i)) << ");";
+      nl(Out);
+    }
+    break;
+  }
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast: {
+    const CastInst* cst = cast<CastInst>(I);
+    Out << "CastInst* " << iName << " = new ";
+    switch (I->getOpcode()) {
+    case Instruction::Trunc:    Out << "TruncInst"; break;
+    case Instruction::ZExt:     Out << "ZExtInst"; break;
+    case Instruction::SExt:     Out << "SExtInst"; break;
+    case Instruction::FPTrunc:  Out << "FPTruncInst"; break;
+    case Instruction::FPExt:    Out << "FPExtInst"; break;
+    case Instruction::FPToUI:   Out << "FPToUIInst"; break;
+    case Instruction::FPToSI:   Out << "FPToSIInst"; break;
+    case Instruction::UIToFP:   Out << "UIToFPInst"; break;
+    case Instruction::SIToFP:   Out << "SIToFPInst"; break;
+    case Instruction::PtrToInt: Out << "PtrToIntInst"; break;
+    case Instruction::IntToPtr: Out << "IntToPtrInst"; break;
+    case Instruction::BitCast:  Out << "BitCastInst"; break;
+    default: llvm_unreachable("Unreachable");
+    }
+    Out << "(" << opNames[0] << ", "
+        << getCppName(cst->getType()) << ", \"";
+    printEscapedString(cst->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::Call: {
+    const CallInst* call = cast<CallInst>(I);
+    if (const InlineAsm* ila = dyn_cast<InlineAsm>(call->getCalledValue())) {
+      Out << "InlineAsm* " << getCppName(ila) << " = InlineAsm::get("
+          << getCppName(ila->getFunctionType()) << ", \""
+          << ila->getAsmString() << "\", \""
+          << ila->getConstraintString() << "\","
+          << (ila->hasSideEffects() ? "true" : "false") << ");";
+      nl(Out);
+    }
+    if (call->getNumArgOperands() > 1) {
+      Out << "std::vector<Value*> " << iName << "_params;";
+      nl(Out);
+      for (unsigned i = 0; i < call->getNumArgOperands(); ++i) {
+        Out << iName << "_params.push_back(" << opNames[i] << ");";
+        nl(Out);
+      }
+      Out << "CallInst* " << iName << " = CallInst::Create("
+          << opNames[call->getNumArgOperands()] << ", "
+          << iName << "_params, \"";
+    } else if (call->getNumArgOperands() == 1) {
+      Out << "CallInst* " << iName << " = CallInst::Create("
+          << opNames[call->getNumArgOperands()] << ", " << opNames[0] << ", \"";
+    } else {
+      Out << "CallInst* " << iName << " = CallInst::Create("
+          << opNames[call->getNumArgOperands()] << ", \"";
+    }
+    printEscapedString(call->getName());
+    Out << "\", " << bbname << ");";
+    nl(Out) << iName << "->setCallingConv(";
+    printCallingConv(call->getCallingConv());
+    Out << ");";
+    nl(Out) << iName << "->setTailCall("
+        << (call->isTailCall() ? "true" : "false");
+    Out << ");";
+    nl(Out);
+    printAttributes(call->getAttributes(), iName);
+    Out << iName << "->setAttributes(" << iName << "_PAL);";
+    nl(Out);
+    break;
+  }
+  case Instruction::Select: {
+    const SelectInst* sel = cast<SelectInst>(I);
+    Out << "SelectInst* " << getCppName(sel) << " = SelectInst::Create(";
+    Out << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+    printEscapedString(sel->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::UserOp1:
+    /// FALL THROUGH
+  case Instruction::UserOp2: {
+    /// FIXME: What should be done here?
+    break;
+  }
+  case Instruction::VAArg: {
+    const VAArgInst* va = cast<VAArgInst>(I);
+    Out << "VAArgInst* " << getCppName(va) << " = new VAArgInst("
+        << opNames[0] << ", " << getCppName(va->getType()) << ", \"";
+    printEscapedString(va->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::ExtractElement: {
+    const ExtractElementInst* eei = cast<ExtractElementInst>(I);
+    Out << "ExtractElementInst* " << getCppName(eei)
+        << " = new ExtractElementInst(" << opNames[0]
+        << ", " << opNames[1] << ", \"";
+    printEscapedString(eei->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::InsertElement: {
+    const InsertElementInst* iei = cast<InsertElementInst>(I);
+    Out << "InsertElementInst* " << getCppName(iei)
+        << " = InsertElementInst::Create(" << opNames[0]
+        << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+    printEscapedString(iei->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::ShuffleVector: {
+    const ShuffleVectorInst* svi = cast<ShuffleVectorInst>(I);
+    Out << "ShuffleVectorInst* " << getCppName(svi)
+        << " = new ShuffleVectorInst(" << opNames[0]
+        << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+    printEscapedString(svi->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::ExtractValue: {
+    const ExtractValueInst *evi = cast<ExtractValueInst>(I);
+    Out << "std::vector<unsigned> " << iName << "_indices;";
+    nl(Out);
+    for (unsigned i = 0; i < evi->getNumIndices(); ++i) {
+      Out << iName << "_indices.push_back("
+          << evi->idx_begin()[i] << ");";
+      nl(Out);
+    }
+    Out << "ExtractValueInst* " << getCppName(evi)
+        << " = ExtractValueInst::Create(" << opNames[0]
+        << ", "
+        << iName << "_indices, \"";
+    printEscapedString(evi->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::InsertValue: {
+    const InsertValueInst *ivi = cast<InsertValueInst>(I);
+    Out << "std::vector<unsigned> " << iName << "_indices;";
+    nl(Out);
+    for (unsigned i = 0; i < ivi->getNumIndices(); ++i) {
+      Out << iName << "_indices.push_back("
+          << ivi->idx_begin()[i] << ");";
+      nl(Out);
+    }
+    Out << "InsertValueInst* " << getCppName(ivi)
+        << " = InsertValueInst::Create(" << opNames[0]
+        << ", " << opNames[1] << ", "
+        << iName << "_indices, \"";
+    printEscapedString(ivi->getName());
+    Out << "\", " << bbname << ");";
+    break;
+  }
+  case Instruction::Fence: {
+    const FenceInst *fi = cast<FenceInst>(I);
+    StringRef Ordering = ConvertAtomicOrdering(fi->getOrdering());
+    StringRef CrossThread = ConvertAtomicSynchScope(fi->getSynchScope());
+    Out << "FenceInst* " << iName
+        << " = new FenceInst(mod->getContext(), "
+        << Ordering << ", " << CrossThread << ", " << bbname
+        << ");";
+    break;
+  }
+  case Instruction::AtomicCmpXchg: {
+    const AtomicCmpXchgInst *cxi = cast<AtomicCmpXchgInst>(I);
+    StringRef Ordering = ConvertAtomicOrdering(cxi->getOrdering());
+    StringRef CrossThread = ConvertAtomicSynchScope(cxi->getSynchScope());
+    Out << "AtomicCmpXchgInst* " << iName
+        << " = new AtomicCmpXchgInst("
+        << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", "
+        << Ordering << ", " << CrossThread << ", " << bbname
+        << ");";
+    nl(Out) << iName << "->setName(\"";
+    printEscapedString(cxi->getName());
+    Out << "\");";
+    break;
+  }
+  case Instruction::AtomicRMW: {
+    const AtomicRMWInst *rmwi = cast<AtomicRMWInst>(I);
+    StringRef Ordering = ConvertAtomicOrdering(rmwi->getOrdering());
+    StringRef CrossThread = ConvertAtomicSynchScope(rmwi->getSynchScope());
+    StringRef Operation;
+    switch (rmwi->getOperation()) {
+      case AtomicRMWInst::Xchg: Operation = "AtomicRMWInst::Xchg"; break;
+      case AtomicRMWInst::Add:  Operation = "AtomicRMWInst::Add"; break;
+      case AtomicRMWInst::Sub:  Operation = "AtomicRMWInst::Sub"; break;
+      case AtomicRMWInst::And:  Operation = "AtomicRMWInst::And"; break;
+      case AtomicRMWInst::Nand: Operation = "AtomicRMWInst::Nand"; break;
+      case AtomicRMWInst::Or:   Operation = "AtomicRMWInst::Or"; break;
+      case AtomicRMWInst::Xor:  Operation = "AtomicRMWInst::Xor"; break;
+      case AtomicRMWInst::Max:  Operation = "AtomicRMWInst::Max"; break;
+      case AtomicRMWInst::Min:  Operation = "AtomicRMWInst::Min"; break;
+      case AtomicRMWInst::UMax: Operation = "AtomicRMWInst::UMax"; break;
+      case AtomicRMWInst::UMin: Operation = "AtomicRMWInst::UMin"; break;
+      case AtomicRMWInst::BAD_BINOP: llvm_unreachable("Bad atomic operation");
+    }
+    Out << "AtomicRMWInst* " << iName
+        << " = new AtomicRMWInst("
+        << Operation << ", "
+        << opNames[0] << ", " << opNames[1] << ", "
+        << Ordering << ", " << CrossThread << ", " << bbname
+        << ");";
+    nl(Out) << iName << "->setName(\"";
+    printEscapedString(rmwi->getName());
+    Out << "\");";
+    break;
+  }
+  }
+  DefinedValues.insert(I);
+  nl(Out);
+  delete [] opNames;
+}
+
+// Print out the types, constants and declarations needed by one function
+void CppWriter::printFunctionUses(const Function* F) {
+  nl(Out) << "// Type Definitions"; nl(Out);
+  if (!is_inline) {
+    // Print the function's return type
+    printType(F->getReturnType());
+
+    // Print the function's function type
+    printType(F->getFunctionType());
+
+    // Print the types of each of the function's arguments
+    for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+         AI != AE; ++AI) {
+      printType(AI->getType());
+    }
+  }
+
+  // Print type definitions for every type referenced by an instruction and
+  // make a note of any global values or constants that are referenced
+  SmallPtrSet<GlobalValue*,64> gvs;
+  SmallPtrSet<Constant*,64> consts;
+  for (Function::const_iterator BB = F->begin(), BE = F->end();
+       BB != BE; ++BB){
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      // Print the type of the instruction itself
+      printType(I->getType());
+
+      // Print the type of each of the instruction's operands
+      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+        Value* operand = I->getOperand(i);
+        printType(operand->getType());
+
+        // If the operand references a GVal or Constant, make a note of it
+        if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) {
+          gvs.insert(GV);
+          if (GenerationType != GenFunction)
+            if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+              if (GVar->hasInitializer())
+                consts.insert(GVar->getInitializer());
+        } else if (Constant* C = dyn_cast<Constant>(operand)) {
+          consts.insert(C);
+          for (unsigned j = 0; j < C->getNumOperands(); ++j) {
+            // If the operand references a GVal or Constant, make a note of it
+            Value* operand = C->getOperand(j);
+            printType(operand->getType());
+            if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) {
+              gvs.insert(GV);
+              if (GenerationType != GenFunction)
+                if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+                  if (GVar->hasInitializer())
+                    consts.insert(GVar->getInitializer());
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Print the function declarations for any functions encountered
+  nl(Out) << "// Function Declarations"; nl(Out);
+  for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+       I != E; ++I) {
+    if (Function* Fun = dyn_cast<Function>(*I)) {
+      if (!is_inline || Fun != F)
+        printFunctionHead(Fun);
+    }
+  }
+
+  // Print the global variable declarations for any variables encountered
+  nl(Out) << "// Global Variable Declarations"; nl(Out);
+  for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+       I != E; ++I) {
+    if (GlobalVariable* F = dyn_cast<GlobalVariable>(*I))
+      printVariableHead(F);
+  }
+
+  // Print the constants found
+  nl(Out) << "// Constant Definitions"; nl(Out);
+  for (SmallPtrSet<Constant*,64>::iterator I = consts.begin(),
+         E = consts.end(); I != E; ++I) {
+    printConstant(*I);
+  }
+
+  // Process the global variables definitions now that all the constants have
+  // been emitted. These definitions just couple the gvars with their constant
+  // initializers.
+  if (GenerationType != GenFunction) {
+    nl(Out) << "// Global Variable Definitions"; nl(Out);
+    for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+         I != E; ++I) {
+      if (GlobalVariable* GV = dyn_cast<GlobalVariable>(*I))
+        printVariableBody(GV);
+    }
+  }
+}
+
+void CppWriter::printFunctionHead(const Function* F) {
+  nl(Out) << "Function* " << getCppName(F);
+  Out << " = mod->getFunction(\"";
+  printEscapedString(F->getName());
+  Out << "\");";
+  nl(Out) << "if (!" << getCppName(F) << ") {";
+  nl(Out) << getCppName(F);
+
+  Out<< " = Function::Create(";
+  nl(Out,1) << "/*Type=*/" << getCppName(F->getFunctionType()) << ",";
+  nl(Out) << "/*Linkage=*/";
+  printLinkageType(F->getLinkage());
+  Out << ",";
+  nl(Out) << "/*Name=*/\"";
+  printEscapedString(F->getName());
+  Out << "\", mod); " << (F->isDeclaration()? "// (external, no body)" : "");
+  nl(Out,-1);
+  printCppName(F);
+  Out << "->setCallingConv(";
+  printCallingConv(F->getCallingConv());
+  Out << ");";
+  nl(Out);
+  if (F->hasSection()) {
+    printCppName(F);
+    Out << "->setSection(\"" << F->getSection() << "\");";
+    nl(Out);
+  }
+  if (F->getAlignment()) {
+    printCppName(F);
+    Out << "->setAlignment(" << F->getAlignment() << ");";
+    nl(Out);
+  }
+  if (F->getVisibility() != GlobalValue::DefaultVisibility) {
+    printCppName(F);
+    Out << "->setVisibility(";
+    printVisibilityType(F->getVisibility());
+    Out << ");";
+    nl(Out);
+  }
+  if (F->hasGC()) {
+    printCppName(F);
+    Out << "->setGC(\"" << F->getGC() << "\");";
+    nl(Out);
+  }
+  Out << "}";
+  nl(Out);
+  printAttributes(F->getAttributes(), getCppName(F));
+  printCppName(F);
+  Out << "->setAttributes(" << getCppName(F) << "_PAL);";
+  nl(Out);
+}
+
+void CppWriter::printFunctionBody(const Function *F) {
+  if (F->isDeclaration())
+    return; // external functions have no bodies.
+
+  // Clear the DefinedValues and ForwardRefs maps because we can't have
+  // cross-function forward refs
+  ForwardRefs.clear();
+  DefinedValues.clear();
+
+  // Create all the argument values
+  if (!is_inline) {
+    if (!F->arg_empty()) {
+      Out << "Function::arg_iterator args = " << getCppName(F)
+          << "->arg_begin();";
+      nl(Out);
+    }
+    for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+         AI != AE; ++AI) {
+      Out << "Value* " << getCppName(AI) << " = args++;";
+      nl(Out);
+      if (AI->hasName()) {
+        Out << getCppName(AI) << "->setName(\"";
+        printEscapedString(AI->getName());
+        Out << "\");";
+        nl(Out);
+      }
+    }
+  }
+
+  // Create all the basic blocks
+  nl(Out);
+  for (Function::const_iterator BI = F->begin(), BE = F->end();
+       BI != BE; ++BI) {
+    std::string bbname(getCppName(BI));
+    Out << "BasicBlock* " << bbname <<
+           " = BasicBlock::Create(mod->getContext(), \"";
+    if (BI->hasName())
+      printEscapedString(BI->getName());
+    Out << "\"," << getCppName(BI->getParent()) << ",0);";
+    nl(Out);
+  }
+
+  // Output all of its basic blocks... for the function
+  for (Function::const_iterator BI = F->begin(), BE = F->end();
+       BI != BE; ++BI) {
+    std::string bbname(getCppName(BI));
+    nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")";
+    nl(Out);
+
+    // Output all of the instructions in the basic block...
+    for (BasicBlock::const_iterator I = BI->begin(), E = BI->end();
+         I != E; ++I) {
+      printInstruction(I,bbname);
+    }
+  }
+
+  // Loop over the ForwardRefs and resolve them now that all instructions
+  // are generated.
+  if (!ForwardRefs.empty()) {
+    nl(Out) << "// Resolve Forward References";
+    nl(Out);
+  }
+
+  while (!ForwardRefs.empty()) {
+    ForwardRefMap::iterator I = ForwardRefs.begin();
+    Out << I->second << "->replaceAllUsesWith("
+        << getCppName(I->first) << "); delete " << I->second << ";";
+    nl(Out);
+    ForwardRefs.erase(I);
+  }
+}
+
+void CppWriter::printInline(const std::string& fname,
+                            const std::string& func) {
+  const Function* F = TheModule->getFunction(func);
+  if (!F) {
+    error(std::string("Function '") + func + "' not found in input module");
+    return;
+  }
+  if (F->isDeclaration()) {
+    error(std::string("Function '") + func + "' is external!");
+    return;
+  }
+  nl(Out) << "BasicBlock* " << fname << "(Module* mod, Function *"
+          << getCppName(F);
+  unsigned arg_count = 1;
+  for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+       AI != AE; ++AI) {
+    Out << ", Value* arg_" << arg_count;
+  }
+  Out << ") {";
+  nl(Out);
+  is_inline = true;
+  printFunctionUses(F);
+  printFunctionBody(F);
+  is_inline = false;
+  Out << "return " << getCppName(F->begin()) << ";";
+  nl(Out) << "}";
+  nl(Out);
+}
+
+void CppWriter::printModuleBody() {
+  // Print out all the type definitions
+  nl(Out) << "// Type Definitions"; nl(Out);
+  printTypes(TheModule);
+
+  // Functions can call each other and global variables can reference them so
+  // define all the functions first before emitting their function bodies.
+  nl(Out) << "// Function Declarations"; nl(Out);
+  for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+       I != E; ++I)
+    printFunctionHead(I);
+
+  // Process the global variables declarations. We can't initialze them until
+  // after the constants are printed so just print a header for each global
+  nl(Out) << "// Global Variable Declarations\n"; nl(Out);
+  for (Module::const_global_iterator I = TheModule->global_begin(),
+         E = TheModule->global_end(); I != E; ++I) {
+    printVariableHead(I);
+  }
+
+  // Print out all the constants definitions. Constants don't recurse except
+  // through GlobalValues. All GlobalValues have been declared at this point
+  // so we can proceed to generate the constants.
+  nl(Out) << "// Constant Definitions"; nl(Out);
+  printConstants(TheModule);
+
+  // Process the global variables definitions now that all the constants have
+  // been emitted. These definitions just couple the gvars with their constant
+  // initializers.
+  nl(Out) << "// Global Variable Definitions"; nl(Out);
+  for (Module::const_global_iterator I = TheModule->global_begin(),
+         E = TheModule->global_end(); I != E; ++I) {
+    printVariableBody(I);
+  }
+
+  // Finally, we can safely put out all of the function bodies.
+  nl(Out) << "// Function Definitions"; nl(Out);
+  for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+       I != E; ++I) {
+    if (!I->isDeclaration()) {
+      nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I)
+              << ")";
+      nl(Out) << "{";
+      nl(Out,1);
+      printFunctionBody(I);
+      nl(Out,-1) << "}";
+      nl(Out);
+    }
+  }
+}
+
+void CppWriter::printProgram(const std::string& fname,
+                             const std::string& mName) {
+  Out << "#include <llvm/Pass.h>\n";
+  Out << "#include <llvm/PassManager.h>\n";
+
+  Out << "#include <llvm/ADT/SmallVector.h>\n";
+  Out << "#include <llvm/Analysis/Verifier.h>\n";
+  Out << "#include <llvm/Assembly/PrintModulePass.h>\n";
+  Out << "#include <llvm/IR/BasicBlock.h>\n";
+  Out << "#include <llvm/IR/CallingConv.h>\n";
+  Out << "#include <llvm/IR/Constants.h>\n";
+  Out << "#include <llvm/IR/DerivedTypes.h>\n";
+  Out << "#include <llvm/IR/Function.h>\n";
+  Out << "#include <llvm/IR/GlobalVariable.h>\n";
+  Out << "#include <llvm/IR/InlineAsm.h>\n";
+  Out << "#include <llvm/IR/Instructions.h>\n";
+  Out << "#include <llvm/IR/LLVMContext.h>\n";
+  Out << "#include <llvm/IR/Module.h>\n";
+  Out << "#include <llvm/Support/FormattedStream.h>\n";
+  Out << "#include <llvm/Support/MathExtras.h>\n";
+  Out << "#include <algorithm>\n";
+  Out << "using namespace llvm;\n\n";
+  Out << "Module* " << fname << "();\n\n";
+  Out << "int main(int argc, char**argv) {\n";
+  Out << "  Module* Mod = " << fname << "();\n";
+  Out << "  verifyModule(*Mod, PrintMessageAction);\n";
+  Out << "  PassManager PM;\n";
+  Out << "  PM.add(createPrintModulePass(&outs()));\n";
+  Out << "  PM.run(*Mod);\n";
+  Out << "  return 0;\n";
+  Out << "}\n\n";
+  printModule(fname,mName);
+}
+
+void CppWriter::printModule(const std::string& fname,
+                            const std::string& mName) {
+  nl(Out) << "Module* " << fname << "() {";
+  nl(Out,1) << "// Module Construction";
+  nl(Out) << "Module* mod = new Module(\"";
+  printEscapedString(mName);
+  Out << "\", getGlobalContext());";
+  if (!TheModule->getTargetTriple().empty()) {
+    nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayout() << "\");";
+  }
+  if (!TheModule->getTargetTriple().empty()) {
+    nl(Out) << "mod->setTargetTriple(\"" << TheModule->getTargetTriple()
+            << "\");";
+  }
+
+  if (!TheModule->getModuleInlineAsm().empty()) {
+    nl(Out) << "mod->setModuleInlineAsm(\"";
+    printEscapedString(TheModule->getModuleInlineAsm());
+    Out << "\");";
+  }
+  nl(Out);
+
+  printModuleBody();
+  nl(Out) << "return mod;";
+  nl(Out,-1) << "}";
+  nl(Out);
+}
+
+void CppWriter::printContents(const std::string& fname,
+                              const std::string& mName) {
+  Out << "\nModule* " << fname << "(Module *mod) {\n";
+  Out << "\nmod->setModuleIdentifier(\"";
+  printEscapedString(mName);
+  Out << "\");\n";
+  printModuleBody();
+  Out << "\nreturn mod;\n";
+  Out << "\n}\n";
+}
+
+void CppWriter::printFunction(const std::string& fname,
+                              const std::string& funcName) {
+  const Function* F = TheModule->getFunction(funcName);
+  if (!F) {
+    error(std::string("Function '") + funcName + "' not found in input module");
+    return;
+  }
+  Out << "\nFunction* " << fname << "(Module *mod) {\n";
+  printFunctionUses(F);
+  printFunctionHead(F);
+  printFunctionBody(F);
+  Out << "return " << getCppName(F) << ";\n";
+  Out << "}\n";
+}
+
+void CppWriter::printFunctions() {
+  const Module::FunctionListType &funcs = TheModule->getFunctionList();
+  Module::const_iterator I  = funcs.begin();
+  Module::const_iterator IE = funcs.end();
+
+  for (; I != IE; ++I) {
+    const Function &func = *I;
+    if (!func.isDeclaration()) {
+      std::string name("define_");
+      name += func.getName();
+      printFunction(name, func.getName());
+    }
+  }
+}
+
+void CppWriter::printVariable(const std::string& fname,
+                              const std::string& varName) {
+  const GlobalVariable* GV = TheModule->getNamedGlobal(varName);
+
+  if (!GV) {
+    error(std::string("Variable '") + varName + "' not found in input module");
+    return;
+  }
+  Out << "\nGlobalVariable* " << fname << "(Module *mod) {\n";
+  printVariableUses(GV);
+  printVariableHead(GV);
+  printVariableBody(GV);
+  Out << "return " << getCppName(GV) << ";\n";
+  Out << "}\n";
+}
+
+void CppWriter::printType(const std::string &fname,
+                          const std::string &typeName) {
+  Type* Ty = TheModule->getTypeByName(typeName);
+  if (!Ty) {
+    error(std::string("Type '") + typeName + "' not found in input module");
+    return;
+  }
+  Out << "\nType* " << fname << "(Module *mod) {\n";
+  printType(Ty);
+  Out << "return " << getCppName(Ty) << ";\n";
+  Out << "}\n";
+}
+
+bool CppWriter::runOnModule(Module &M) {
+  TheModule = &M;
+
+  // Emit a header
+  Out << "// Generated by llvm2cpp - DO NOT MODIFY!\n\n";
+
+  // Get the name of the function we're supposed to generate
+  std::string fname = FuncName.getValue();
+
+  // Get the name of the thing we are to generate
+  std::string tgtname = NameToGenerate.getValue();
+  if (GenerationType == GenModule ||
+      GenerationType == GenContents ||
+      GenerationType == GenProgram ||
+      GenerationType == GenFunctions) {
+    if (tgtname == "!bad!") {
+      if (M.getModuleIdentifier() == "-")
+        tgtname = "<stdin>";
+      else
+        tgtname = M.getModuleIdentifier();
+    }
+  } else if (tgtname == "!bad!")
+    error("You must use the -for option with -gen-{function,variable,type}");
+
+  switch (WhatToGenerate(GenerationType)) {
+   case GenProgram:
+    if (fname.empty())
+      fname = "makeLLVMModule";
+    printProgram(fname,tgtname);
+    break;
+   case GenModule:
+    if (fname.empty())
+      fname = "makeLLVMModule";
+    printModule(fname,tgtname);
+    break;
+   case GenContents:
+    if (fname.empty())
+      fname = "makeLLVMModuleContents";
+    printContents(fname,tgtname);
+    break;
+   case GenFunction:
+    if (fname.empty())
+      fname = "makeLLVMFunction";
+    printFunction(fname,tgtname);
+    break;
+   case GenFunctions:
+    printFunctions();
+    break;
+   case GenInline:
+    if (fname.empty())
+      fname = "makeLLVMInline";
+    printInline(fname,tgtname);
+    break;
+   case GenVariable:
+    if (fname.empty())
+      fname = "makeLLVMVariable";
+    printVariable(fname,tgtname);
+    break;
+   case GenType:
+    if (fname.empty())
+      fname = "makeLLVMType";
+    printType(fname,tgtname);
+    break;
+  }
+
+  return false;
+}
+
+char CppWriter::ID = 0;
+
+//===----------------------------------------------------------------------===//
+//                       External Interface declaration
+//===----------------------------------------------------------------------===//
+
+bool CPPTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
+                                           formatted_raw_ostream &o,
+                                           CodeGenFileType FileType,
+                                           bool DisableVerify,
+                                           AnalysisID StartAfter,
+                                           AnalysisID StopAfter) {
+  if (FileType != TargetMachine::CGFT_AssemblyFile) return true;
+  PM.add(new CppWriter(o));
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h b/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h
new file mode 100644
index 000000000000..477e788ee2fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/CppBackend/CPPTargetMachine.h
@@ -0,0 +1,46 @@
+//===-- CPPTargetMachine.h - TargetMachine for the C++ backend --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TargetMachine that is used by the C++ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPPTARGETMACHINE_H
+#define CPPTARGETMACHINE_H
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class formatted_raw_ostream;
+
+struct CPPTargetMachine : public TargetMachine {
+  CPPTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS, const TargetOptions &Options,
+                   Reloc::Model RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL)
+    : TargetMachine(T, TT, CPU, FS, Options) {}
+
+  virtual bool addPassesToEmitFile(PassManagerBase &PM,
+                                   formatted_raw_ostream &Out,
+                                   CodeGenFileType FileType,
+                                   bool DisableVerify,
+                                   AnalysisID StartAfter,
+                                   AnalysisID StopAfter);
+
+  virtual const DataLayout *getDataLayout() const { return 0; }
+};
+
+extern Target TheCppBackendTarget;
+
+} // End llvm namespace
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp b/contrib/llvm/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
new file mode 100644
index 000000000000..1ca74a4895c4
--- /dev/null
+++ b/contrib/llvm/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
@@ -0,0 +1,28 @@
+//===-- CppBackendTargetInfo.cpp - CppBackend Target Implementation -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CPPTargetMachine.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheCppBackendTarget;
+
+static unsigned CppBackend_TripleMatchQuality(const std::string &TT) {
+  // This class always works, but shouldn't be the default in most cases.
+  return 1;
+}
+
+extern "C" void LLVMInitializeCppBackendTargetInfo() { 
+  TargetRegistry::RegisterTarget(TheCppBackendTarget, "cpp",    
+                                  "C++ backend",
+                                  &CppBackend_TripleMatchQuality);
+}
+
+extern "C" void LLVMInitializeCppBackendTargetMC() {}
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
new file mode 100644
index 000000000000..dfbefc864283
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.h
@@ -0,0 +1,79 @@
+//=-- Hexagon.h - Top-level interface for Hexagon representation --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Hexagon back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_Hexagon_H
+#define TARGET_Hexagon_H
+
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class FunctionPass;
+  class ModulePass;
+  class TargetMachine;
+  class MachineInstr;
+  class HexagonMCInst;
+  class HexagonAsmPrinter;
+  class HexagonTargetMachine;
+  class raw_ostream;
+
+  FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel);
+  FunctionPass *createHexagonDelaySlotFillerPass(TargetMachine &TM);
+  FunctionPass *createHexagonFPMoverPass(TargetMachine &TM);
+  FunctionPass *createHexagonRemoveExtendOps(HexagonTargetMachine &TM);
+  FunctionPass *createHexagonCFGOptimizer(HexagonTargetMachine &TM);
+
+  FunctionPass *createHexagonSplitTFRCondSets(HexagonTargetMachine &TM);
+  FunctionPass *createHexagonExpandPredSpillCode(HexagonTargetMachine &TM);
+
+  FunctionPass *createHexagonHardwareLoops();
+  FunctionPass *createHexagonPeephole();
+  FunctionPass *createHexagonFixupHwLoops();
+  FunctionPass *createHexagonPacketizer();
+  FunctionPass *createHexagonNewValueJump();
+
+
+/* TODO: object output.
+  MCCodeEmitter *createHexagonMCCodeEmitter(const Target &,
+                                            TargetMachine &TM,
+                                            MCContext &Ctx);
+*/
+/* TODO: assembler input.
+  TargetAsmBackend *createHexagonAsmBackend(const Target &,
+                                                  const std::string &);
+*/
+  void HexagonLowerToMC(const MachineInstr *MI, HexagonMCInst &MCI,
+                        HexagonAsmPrinter &AP);
+} // end namespace llvm;
+
+#define Hexagon_POINTER_SIZE 4
+
+#define Hexagon_PointerSize (Hexagon_POINTER_SIZE)
+#define Hexagon_PointerSize_Bits (Hexagon_POINTER_SIZE * 8)
+#define Hexagon_WordSize Hexagon_PointerSize
+#define Hexagon_WordSize_Bits Hexagon_PointerSize_Bits
+
+// allocframe saves LR and FP on stack before allocating
+// a new stack frame. This takes 8 bytes.
+#define HEXAGON_LRFP_SIZE 8
+
+// Normal instruction size (in bytes).
+#define HEXAGON_INSTR_SIZE 4
+
+// Maximum number of words and instructions in a packet.
+#define HEXAGON_PACKET_SIZE 4
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.td b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
new file mode 100644
index 000000000000..8a5ee40590bb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
@@ -0,0 +1,177 @@
+//===-- Hexagon.td - Describe the Hexagon Target Machine --*- tablegen -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the Hexagon target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Hexagon Subtarget features.
+//===----------------------------------------------------------------------===//
+
+// Hexagon Archtectures
+def ArchV2       : SubtargetFeature<"v2", "HexagonArchVersion", "V2",
+                                    "Hexagon v2">;
+def ArchV3       : SubtargetFeature<"v3", "HexagonArchVersion", "V3",
+                                    "Hexagon v3">;
+def ArchV4       : SubtargetFeature<"v4", "HexagonArchVersion", "V4",
+                                    "Hexagon v4">;
+def ArchV5       : SubtargetFeature<"v5", "HexagonArchVersion", "V5",
+                                    "Hexagon v5">;
+
+//===----------------------------------------------------------------------===//
+// Hexagon Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+def HasV2T                      : Predicate<"Subtarget.hasV2TOps()">;
+def HasV2TOnly                  : Predicate<"Subtarget.hasV2TOpsOnly()">;
+def NoV2T                       : Predicate<"!Subtarget.hasV2TOps()">;
+def HasV3T                      : Predicate<"Subtarget.hasV3TOps()">;
+def HasV3TOnly                  : Predicate<"Subtarget.hasV3TOpsOnly()">;
+def NoV3T                       : Predicate<"!Subtarget.hasV3TOps()">;
+def HasV4T                      : Predicate<"Subtarget.hasV4TOps()">;
+def NoV4T                       : Predicate<"!Subtarget.hasV4TOps()">;
+def HasV5T                      : Predicate<"Subtarget.hasV5TOps()">;
+def NoV5T                       : Predicate<"!Subtarget.hasV5TOps()">;
+def UseMEMOP                    : Predicate<"Subtarget.useMemOps()">;
+def IEEERndNearV5T              : Predicate<"Subtarget.modeIEEERndNear()">;
+
+//===----------------------------------------------------------------------===//
+// Classes used for relation maps.
+//===----------------------------------------------------------------------===//
+// PredRel - Filter class used to relate non-predicated instructions with their
+// predicated forms.
+class PredRel;
+// PredNewRel - Filter class used to relate predicated instructions with their
+// predicate-new forms.
+class PredNewRel: PredRel;
+// ImmRegRel - Filter class used to relate instructions having reg-reg form
+// with their reg-imm counterparts.
+class ImmRegRel;
+// NewValueRel - Filter class used to relate regular store instructions with
+// their new-value store form.
+class NewValueRel: PredNewRel;
+// NewValueRel - Filter class used to relate load/store instructions having
+// different addressing modes with each other.
+class AddrModeRel: NewValueRel;
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate non-predicate instructions with their
+// predicated formats - true and false.
+//
+
+def getPredOpcode : InstrMapping {
+  let FilterClass = "PredRel";
+  // Instructions with the same BaseOpcode and isNVStore values form a row.
+  let RowFields = ["BaseOpcode", "isNVStore", "PNewValue"];
+  // Instructions with the same predicate sense form a column.
+  let ColFields = ["PredSense"];
+  // The key column is the unpredicated instructions.
+  let KeyCol = [""];
+  // Value columns are PredSense=true and PredSense=false
+  let ValueCols = [["true"], ["false"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicated instructions with their .new
+// format.
+//
+def getPredNewOpcode : InstrMapping {
+  let FilterClass = "PredNewRel";
+  let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+  let ColFields = ["PNewValue"];
+  let KeyCol = [""];
+  let ValueCols = [["new"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate store instructions with their new-value
+// format.
+//
+def getNewValueOpcode : InstrMapping {
+  let FilterClass = "NewValueRel";
+  let RowFields = ["BaseOpcode", "PredSense", "PNewValue"];
+  let ColFields = ["isNVStore"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+def getBasedWithImmOffset : InstrMapping {
+  let FilterClass = "AddrModeRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore",
+                   "isMEMri", "isFloat"];
+  let ColFields = ["addrMode"];
+  let KeyCol = ["Absolute"];
+  let ValueCols = [["BaseImmOffset"]];
+}
+
+def getBaseWithRegOffset : InstrMapping {
+  let FilterClass = "AddrModeRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"];
+  let ColFields = ["addrMode"];
+  let KeyCol = ["BaseImmOffset"];
+  let ValueCols = [["BaseRegOffset"]];
+}
+
+def getRegForm : InstrMapping {
+  let FilterClass = "ImmRegRel";
+  let RowFields = ["CextOpcode", "PredSense", "PNewValue"];
+  let ColFields = ["InputType"];
+  let KeyCol = ["imm"];
+  let ValueCols = [["reg"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+include "HexagonSchedule.td"
+include "HexagonRegisterInfo.td"
+include "HexagonCallingConv.td"
+include "HexagonInstrInfo.td"
+include "HexagonIntrinsics.td"
+include "HexagonIntrinsicsDerived.td"
+
+def HexagonInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Hexagon processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, SchedMachineModel Model,
+           list<SubtargetFeature> Features>
+ : ProcessorModel<Name, Model, Features>;
+
+def : Proc<"hexagonv2", HexagonModel,   [ArchV2]>;
+def : Proc<"hexagonv3", HexagonModel,   [ArchV2, ArchV3]>;
+def : Proc<"hexagonv4", HexagonModelV4, [ArchV2, ArchV3, ArchV4]>;
+def : Proc<"hexagonv5", HexagonModelV4, [ArchV2, ArchV3, ArchV4, ArchV5]>;
+
+
+// Hexagon Uses the MC printer for assembler output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def HexagonAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Hexagon : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = HexagonInstrInfo;
+
+  let AssemblyWriters = [HexagonAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
new file mode 100644
index 000000000000..88cd3fbacea0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -0,0 +1,314 @@
+//===-- HexagonAsmPrinter.cpp - Print machine instrs to Hexagon assembly --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Hexagon assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Hexagon.h"
+#include "HexagonAsmPrinter.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonSubtarget.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "InstPrinter/HexagonInstPrinter.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+static cl::opt<bool> AlignCalls(
+         "hexagon-align-calls", cl::Hidden, cl::init(true),
+          cl::desc("Insert falign after call instruction for Hexagon target"));
+
+void HexagonAsmPrinter::EmitAlignment(unsigned NumBits,
+                                      const GlobalValue *GV) const {
+  // For basic block level alignment, use ".falign".
+  if (!GV) {
+    OutStreamer.EmitRawText(StringRef("\t.falign"));
+    return;
+  }
+
+  AsmPrinter::EmitAlignment(NumBits, GV);
+}
+
+void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+
+  switch (MO.getType()) {
+  default: llvm_unreachable ("<unknown operand type>");
+  case MachineOperand::MO_Register:
+    O << HexagonInstPrinter::getRegisterName(MO.getReg());
+    return;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_JumpTableIndex:
+    O << *GetJTISymbol(MO.getIndex());
+    // FIXME: PIC relocation model.
+    return;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << *GetCPISymbol(MO.getIndex());
+    return;
+  case MachineOperand::MO_ExternalSymbol:
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    // Computing the address of a global symbol, not calling it.
+    O << *Mang->getSymbol(MO.getGlobal());
+    printOffset(MO.getOffset(), O);
+    return;
+  }
+}
+
+//
+// isBlockOnlyReachableByFallthrough - We need to override this since the
+// default AsmPrinter does not print labels for any basic block that
+// is only reachable by a fall through. That works for all cases except
+// for the case in which the basic block is reachable by a fall through but
+// through an indirect from a jump table. In this case, the jump table
+// will contain a label not defined by AsmPrinter.
+//
+bool HexagonAsmPrinter::
+isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+  if (MBB->hasAddressTaken()) {
+    return false;
+  }
+  return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB);
+}
+
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                        unsigned AsmVariant,
+                                        const char *ExtraCode,
+                                        raw_ostream &OS) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS);
+    case 'c': // Don't print "$" before a global var name or constant.
+      // Hexagon never has a prefix.
+      printOperand(MI, OpNo, OS);
+      return false;
+    case 'L': // Write second word of DImode reference.
+      // Verify that this operand has two consecutive registers.
+      if (!MI->getOperand(OpNo).isReg() ||
+          OpNo+1 == MI->getNumOperands() ||
+          !MI->getOperand(OpNo+1).isReg())
+        return true;
+      ++OpNo;   // Return the high-part.
+      break;
+    case 'I':
+      // Write 'i' if an integer constant, otherwise nothing.  Used to print
+      // addi vs add, etc.
+      if (MI->getOperand(OpNo).isImm())
+        OS << "i";
+      return false;
+    }
+  }
+
+  printOperand(MI, OpNo, OS);
+  return false;
+}
+
+bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                            unsigned OpNo, unsigned AsmVariant,
+                                            const char *ExtraCode,
+                                            raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  const MachineOperand &Base  = MI->getOperand(OpNo);
+  const MachineOperand &Offset = MI->getOperand(OpNo+1);
+
+  if (Base.isReg())
+    printOperand(MI, OpNo, O);
+  else
+    llvm_unreachable("Unimplemented");
+
+  if (Offset.isImm()) {
+    if (Offset.getImm())
+      O << " + #" << Offset.getImm();
+  }
+  else
+    llvm_unreachable("Unimplemented");
+
+  return false;
+}
+
+void HexagonAsmPrinter::printPredicateOperand(const MachineInstr *MI,
+                                              unsigned OpNo,
+                                              raw_ostream &O) {
+  llvm_unreachable("Unimplemented");
+}
+
+
+/// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to
+/// the current output stream.
+///
+void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  if (MI->isBundle()) {
+    std::vector<const MachineInstr*> BundleMIs;
+
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_instr_iterator MII = MI;
+    ++MII;
+    unsigned int IgnoreCount = 0;
+    while (MII != MBB->end() && MII->isInsideBundle()) {
+      const MachineInstr *MInst = MII;
+      if (MInst->getOpcode() == TargetOpcode::DBG_VALUE ||
+          MInst->getOpcode() == TargetOpcode::IMPLICIT_DEF) {
+          IgnoreCount++;
+          ++MII;
+          continue;
+      }
+      //BundleMIs.push_back(&*MII);
+      BundleMIs.push_back(MInst);
+      ++MII;
+    }
+    unsigned Size = BundleMIs.size();
+    assert((Size+IgnoreCount) == MI->getBundleSize() && "Corrupt Bundle!");
+    for (unsigned Index = 0; Index < Size; Index++) {
+      HexagonMCInst MCI;
+      MCI.setPacketStart(Index == 0);
+      MCI.setPacketEnd(Index == (Size-1));
+
+      HexagonLowerToMC(BundleMIs[Index], MCI, *this);
+      OutStreamer.EmitInstruction(MCI);
+    }
+  }
+  else {
+    HexagonMCInst MCI;
+    if (MI->getOpcode() == Hexagon::ENDLOOP0) {
+      MCI.setPacketStart(true);
+      MCI.setPacketEnd(true);
+    }
+    HexagonLowerToMC(MI, MCI, *this);
+    OutStreamer.EmitInstruction(MCI);
+  }
+
+  return;
+}
+
+/// PrintUnmangledNameSafely - Print out the printable characters in the name.
+/// Don't print things like \n or \0.
+// static void PrintUnmangledNameSafely(const Value *V, raw_ostream &OS) {
+//   for (const char *Name = V->getNameStart(), *E = Name+V->getNameLen();
+//        Name != E; ++Name)
+//     if (isprint(*Name))
+//       OS << *Name;
+// }
+
+
+void HexagonAsmPrinter::printAddrModeBasePlusOffset(const MachineInstr *MI,
+                                                    int OpNo, raw_ostream &O) {
+  const MachineOperand &MO1 = MI->getOperand(OpNo);
+  const MachineOperand &MO2 = MI->getOperand(OpNo+1);
+
+  O << HexagonInstPrinter::getRegisterName(MO1.getReg())
+    << " + #"
+    << MO2.getImm();
+}
+
+
+void HexagonAsmPrinter::printGlobalOperand(const MachineInstr *MI, int OpNo,
+                                           raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  assert( (MO.getType() == MachineOperand::MO_GlobalAddress) &&
+         "Expecting global address");
+
+  O << *Mang->getSymbol(MO.getGlobal());
+  if (MO.getOffset() != 0) {
+    O << " + ";
+    O << MO.getOffset();
+  }
+}
+
+void HexagonAsmPrinter::printJumpTable(const MachineInstr *MI, int OpNo,
+                                       raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  assert( (MO.getType() == MachineOperand::MO_JumpTableIndex) && 
+           "Expecting jump table index");
+
+  // Hexagon_TODO: Do we need name mangling?
+  O << *GetJTISymbol(MO.getIndex());
+}
+
+void HexagonAsmPrinter::printConstantPool(const MachineInstr *MI, int OpNo,
+                                       raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  assert( (MO.getType() == MachineOperand::MO_ConstantPoolIndex) &&
+          "Expecting constant pool index");
+
+  // Hexagon_TODO: Do we need name mangling?
+  O << *GetCPISymbol(MO.getIndex());
+}
+
+static MCInstPrinter *createHexagonMCInstPrinter(const Target &T,
+                                                 unsigned SyntaxVariant,
+                                                 const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI,
+                                                 const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return(new HexagonInstPrinter(MAI, MII, MRI));
+  else
+   return NULL;
+}
+
+extern "C" void LLVMInitializeHexagonAsmPrinter() {
+  RegisterAsmPrinter<HexagonAsmPrinter> X(TheHexagonTarget);
+
+  TargetRegistry::RegisterMCInstPrinter(TheHexagonTarget,
+                                        createHexagonMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
new file mode 100644
index 000000000000..bc2af636124c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -0,0 +1,165 @@
+//===-- HexagonAsmPrinter.h - Print machine code to an Hexagon .s file ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hexagon Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONASMPRINTER_H
+#define HEXAGONASMPRINTER_H
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+  class HexagonAsmPrinter : public AsmPrinter {
+    const HexagonSubtarget *Subtarget;
+
+  public:
+    explicit HexagonAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer) {
+      Subtarget = &TM.getSubtarget<HexagonSubtarget>();
+    }
+
+    virtual const char *getPassName() const {
+      return "Hexagon Assembly Printer";
+    }
+
+    bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const;
+
+    virtual void EmitInstruction(const MachineInstr *MI);
+    virtual void EmitAlignment(unsigned NumBits,
+                               const GlobalValue *GV = 0) const;
+
+    void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &OS);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &OS);
+
+    /// printInstruction - This method is automatically generated by tablegen
+    /// from the instruction set description.  This method returns true if the
+    /// machine instruction was sufficiently described to print it, otherwise it
+    /// returns false.
+    void printInstruction(const MachineInstr *MI, raw_ostream &O);
+
+    //    void printMachineInstruction(const MachineInstr *MI);
+    void printOp(const MachineOperand &MO, raw_ostream &O);
+
+    /// printRegister - Print register according to target requirements.
+    ///
+    void printRegister(const MachineOperand &MO, bool R0AsZero,
+                       raw_ostream &O) {
+      unsigned RegNo = MO.getReg();
+      assert(TargetRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??");
+      O << getRegisterName(RegNo);
+    }
+
+    void printImmOperand(const MachineInstr *MI, unsigned OpNo,
+                                raw_ostream &O) {
+      int value = MI->getOperand(OpNo).getImm();
+      O << value;
+    }
+
+    void printNegImmOperand(const MachineInstr *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+      int value = MI->getOperand(OpNo).getImm();
+      O << -value;
+    }
+
+    void printMEMriOperand(const MachineInstr *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+      const MachineOperand &MO1 = MI->getOperand(OpNo);
+      const MachineOperand &MO2 = MI->getOperand(OpNo+1);
+
+      O << getRegisterName(MO1.getReg())
+        << " + #"
+        << (int) MO2.getImm();
+    }
+
+    void printFrameIndexOperand(const MachineInstr *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+      const MachineOperand &MO1 = MI->getOperand(OpNo);
+      const MachineOperand &MO2 = MI->getOperand(OpNo+1);
+
+      O << getRegisterName(MO1.getReg())
+        << ", #"
+        << MO2.getImm();
+    }
+
+    void printBranchOperand(const MachineInstr *MI, unsigned OpNo,
+                            raw_ostream &O) {
+      // Branches can take an immediate operand.  This is used by the branch
+      // selection pass to print $+8, an eight byte displacement from the PC.
+      if (MI->getOperand(OpNo).isImm()) {
+        O << "$+" << MI->getOperand(OpNo).getImm()*4;
+      } else {
+        printOp(MI->getOperand(OpNo), O);
+      }
+    }
+
+    void printCallOperand(const MachineInstr *MI, unsigned OpNo,
+                          raw_ostream &O) {
+    }
+
+    void printAbsAddrOperand(const MachineInstr *MI, unsigned OpNo,
+                             raw_ostream &O) {
+    }
+
+    void printSymbolHi(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) {
+      O << "#HI(";
+      if (MI->getOperand(OpNo).isImm()) {
+        printImmOperand(MI, OpNo, O);
+      }
+      else {
+        printOp(MI->getOperand(OpNo), O);
+      }
+      O << ")";
+    }
+
+    void printSymbolLo(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) {
+      O << "#HI(";
+      if (MI->getOperand(OpNo).isImm()) {
+        printImmOperand(MI, OpNo, O);
+      }
+      else {
+        printOp(MI->getOperand(OpNo), O);
+      }
+      O << ")";
+    }
+
+    void printPredicateOperand(const MachineInstr *MI, unsigned OpNo,
+                               raw_ostream &O);
+
+#if 0
+    void printModuleLevelGV(const GlobalVariable* GVar, raw_ostream &O);
+#endif
+
+    void printAddrModeBasePlusOffset(const MachineInstr *MI, int OpNo,
+                                     raw_ostream &O);
+
+    void printGlobalOperand(const MachineInstr *MI, int OpNo, raw_ostream &O);
+    void printJumpTable(const MachineInstr *MI, int OpNo, raw_ostream &O);
+    void printConstantPool(const MachineInstr *MI, int OpNo, raw_ostream &O);
+
+    static const char *getRegisterName(unsigned RegNo);
+
+#if 0
+    void EmitStartOfAsmFile(Module &M);
+#endif
+  };
+
+} // end of llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
new file mode 100644
index 000000000000..d4078ad28b60
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -0,0 +1,236 @@
+//===-- HexagonCFGOptimizer.cpp - CFG optimizations -----------------------===//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon_cfg"
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class HexagonCFGOptimizer : public MachineFunctionPass {
+
+private:
+  HexagonTargetMachine& QTM;
+  const HexagonSubtarget &QST;
+
+  void InvertAndChangeJumpTarget(MachineInstr*, MachineBasicBlock*);
+
+ public:
+  static char ID;
+  HexagonCFGOptimizer(HexagonTargetMachine& TM) : MachineFunctionPass(ID),
+                                                  QTM(TM),
+                                                  QST(*TM.getSubtargetImpl()) {}
+
+  const char *getPassName() const {
+    return "Hexagon CFG Optimizer";
+  }
+  bool runOnMachineFunction(MachineFunction &Fn);
+};
+
+
+char HexagonCFGOptimizer::ID = 0;
+
+static bool IsConditionalBranch(int Opc) {
+  return (Opc == Hexagon::JMP_c) || (Opc == Hexagon::JMP_cNot)
+    || (Opc == Hexagon::JMP_cdnPt) || (Opc == Hexagon::JMP_cdnNotPt);
+}
+
+
+static bool IsUnconditionalJump(int Opc) {
+  return (Opc == Hexagon::JMP);
+}
+
+
+void
+HexagonCFGOptimizer::InvertAndChangeJumpTarget(MachineInstr* MI,
+                                               MachineBasicBlock* NewTarget) {
+  const HexagonInstrInfo *QII = QTM.getInstrInfo();
+  int NewOpcode = 0;
+  switch(MI->getOpcode()) {
+  case Hexagon::JMP_c:
+    NewOpcode = Hexagon::JMP_cNot;
+    break;
+
+  case Hexagon::JMP_cNot:
+    NewOpcode = Hexagon::JMP_c;
+    break;
+
+  case Hexagon::JMP_cdnPt:
+    NewOpcode = Hexagon::JMP_cdnNotPt;
+    break;
+
+  case Hexagon::JMP_cdnNotPt:
+    NewOpcode = Hexagon::JMP_cdnPt;
+    break;
+
+  default:
+    llvm_unreachable("Cannot handle this case");
+  }
+
+  MI->setDesc(QII->get(NewOpcode));
+  MI->getOperand(1).setMBB(NewTarget);
+}
+
+
+bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+
+    // Traverse the basic block.
+    MachineBasicBlock::iterator MII = MBB->getFirstTerminator();
+    if (MII != MBB->end()) {
+      MachineInstr *MI = MII;
+      int Opc = MI->getOpcode();
+      if (IsConditionalBranch(Opc)) {
+
+        //
+        // (Case 1) Transform the code if the following condition occurs:
+        //   BB1: if (p0) jump BB3
+        //   ...falls-through to BB2 ...
+        //   BB2: jump BB4
+        //   ...next block in layout is BB3...
+        //   BB3: ...
+        //
+        //  Transform this to:
+        //  BB1: if (!p0) jump BB4
+        //  Remove BB2
+        //  BB3: ...
+        //
+        // (Case 2) A variation occurs when BB3 contains a JMP to BB4:
+        //   BB1: if (p0) jump BB3
+        //   ...falls-through to BB2 ...
+        //   BB2: jump BB4
+        //   ...other basic blocks ...
+        //   BB4:
+        //   ...not a fall-thru
+        //   BB3: ...
+        //     jump BB4
+        //
+        // Transform this to:
+        //   BB1: if (!p0) jump BB4
+        //   Remove BB2
+        //   BB3: ...
+        //   BB4: ...
+        //
+        unsigned NumSuccs = MBB->succ_size();
+        MachineBasicBlock::succ_iterator SI = MBB->succ_begin();
+        MachineBasicBlock* FirstSucc = *SI;
+        MachineBasicBlock* SecondSucc = *(++SI);
+        MachineBasicBlock* LayoutSucc = NULL;
+        MachineBasicBlock* JumpAroundTarget = NULL;
+
+        if (MBB->isLayoutSuccessor(FirstSucc)) {
+          LayoutSucc = FirstSucc;
+          JumpAroundTarget = SecondSucc;
+        } else if (MBB->isLayoutSuccessor(SecondSucc)) {
+          LayoutSucc = SecondSucc;
+          JumpAroundTarget = FirstSucc;
+        } else {
+          // Odd case...cannot handle.
+        }
+
+        // The target of the unconditional branch must be JumpAroundTarget.
+        // TODO: If not, we should not invert the unconditional branch.
+        MachineBasicBlock* CondBranchTarget = NULL;
+        if ((MI->getOpcode() == Hexagon::JMP_c) ||
+            (MI->getOpcode() == Hexagon::JMP_cNot)) {
+          CondBranchTarget = MI->getOperand(1).getMBB();
+        }
+
+        if (!LayoutSucc || (CondBranchTarget != JumpAroundTarget)) {
+          continue;
+        }
+
+        if ((NumSuccs == 2) && LayoutSucc && (LayoutSucc->pred_size() == 1)) {
+
+          // Ensure that BB2 has one instruction -- an unconditional jump.
+          if ((LayoutSucc->size() == 1) &&
+              IsUnconditionalJump(LayoutSucc->front().getOpcode())) {
+            MachineBasicBlock* UncondTarget =
+              LayoutSucc->front().getOperand(0).getMBB();
+            // Check if the layout successor of BB2 is BB3.
+            bool case1 = LayoutSucc->isLayoutSuccessor(JumpAroundTarget);
+            bool case2 = JumpAroundTarget->isSuccessor(UncondTarget) &&
+              JumpAroundTarget->size() >= 1 &&
+              IsUnconditionalJump(JumpAroundTarget->back().getOpcode()) &&
+              JumpAroundTarget->pred_size() == 1 &&
+              JumpAroundTarget->succ_size() == 1;
+
+            if (case1 || case2) {
+              InvertAndChangeJumpTarget(MI, UncondTarget);
+              MBB->removeSuccessor(JumpAroundTarget);
+              MBB->addSuccessor(UncondTarget);
+
+              // Remove the unconditional branch in LayoutSucc.
+              LayoutSucc->erase(LayoutSucc->begin());
+              LayoutSucc->removeSuccessor(UncondTarget);
+              LayoutSucc->addSuccessor(JumpAroundTarget);
+
+              // This code performs the conversion for case 2, which moves
+              // the block to the fall-thru case (BB3 in the code above).
+              if (case2 && !case1) {
+                JumpAroundTarget->moveAfter(LayoutSucc);
+                // only move a block if it doesn't have a fall-thru. otherwise
+                // the CFG will be incorrect.
+                if (!UncondTarget->canFallThrough()) {
+                  UncondTarget->moveAfter(JumpAroundTarget);
+                }
+              }
+
+              //
+              // Correct live-in information. Is used by post-RA scheduler
+              // The live-in to LayoutSucc is now all values live-in to
+              // JumpAroundTarget.
+              //
+              std::vector<unsigned> OrigLiveIn(LayoutSucc->livein_begin(),
+                                               LayoutSucc->livein_end());
+              std::vector<unsigned> NewLiveIn(JumpAroundTarget->livein_begin(),
+                                              JumpAroundTarget->livein_end());
+              for (unsigned i = 0; i < OrigLiveIn.size(); ++i) {
+                LayoutSucc->removeLiveIn(OrigLiveIn[i]);
+              }
+              for (unsigned i = 0; i < NewLiveIn.size(); ++i) {
+                LayoutSucc->addLiveIn(NewLiveIn[i]);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+}
+
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonCFGOptimizer(HexagonTargetMachine &TM) {
+  return new HexagonCFGOptimizer(TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
new file mode 100644
index 000000000000..e61b2a7a58ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -0,0 +1,35 @@
+//===- HexagonCallingConv.td - Calling Conventions Hexagon -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Hexagon architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Hexagon 32-bit C return-value convention.
+def RetCC_Hexagon32 : CallingConv<[
+  CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+  CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>,
+
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
+
+// Hexagon 32-bit C Calling convention.
+def CC_Hexagon32 : CallingConv<[
+  // All arguments get passed in integer registers if there is space.
+  CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
+  CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>,
+
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.cpp
new file mode 100644
index 000000000000..2c93d04f98e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.cpp
@@ -0,0 +1,204 @@
+//===-- llvm/CallingConvLower.cpp - Calling Convention lowering -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Hexagon_CCState class, used for lowering and
+// implementing calling conventions. Adapted from the machine independent
+// version of the class (CCState) but this handles calls to varargs functions
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonCallingConvLower.h"
+#include "Hexagon.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+Hexagon_CCState::Hexagon_CCState(CallingConv::ID CC, bool isVarArg,
+                                 const TargetMachine &tm,
+                                 SmallVector<CCValAssign, 16> &locs,
+                                 LLVMContext &c)
+  : CallingConv(CC), IsVarArg(isVarArg), TM(tm),
+    TRI(*TM.getRegisterInfo()), Locs(locs), Context(c) {
+  // No stack is used.
+  StackOffset = 0;
+
+  UsedRegs.resize((TRI.getNumRegs()+31)/32);
+}
+
+// HandleByVal - Allocate a stack slot large enough to pass an argument by
+// value. The size and alignment information of the argument is encoded in its
+// parameter attribute.
+void Hexagon_CCState::HandleByVal(unsigned ValNo, EVT ValVT,
+                                EVT LocVT, CCValAssign::LocInfo LocInfo,
+                                int MinSize, int MinAlign,
+                                ISD::ArgFlagsTy ArgFlags) {
+  unsigned Align = ArgFlags.getByValAlign();
+  unsigned Size  = ArgFlags.getByValSize();
+  if (MinSize > (int)Size)
+    Size = MinSize;
+  if (MinAlign > (int)Align)
+    Align = MinAlign;
+  unsigned Offset = AllocateStack(Size, Align);
+
+  addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset,
+                             LocVT.getSimpleVT(), LocInfo));
+}
+
+/// MarkAllocated - Mark a register and all of its aliases as allocated.
+void Hexagon_CCState::MarkAllocated(unsigned Reg) {
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    UsedRegs[*AI/32] |= 1 << (*AI&31);
+}
+
+/// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node,
+/// incorporating info about the formals into this state.
+void
+Hexagon_CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                        Hexagon_CCAssignFn Fn,
+                                        unsigned SretValueInRegs) {
+  unsigned NumArgs = Ins.size();
+  unsigned i = 0;
+
+  // If the function returns a small struct in registers, skip
+  // over the first (dummy) argument.
+  if (SretValueInRegs != 0) {
+    ++i;
+  }
+
+
+  for (; i != NumArgs; ++i) {
+    EVT ArgVT = Ins[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this, 0, 0, false)) {
+      dbgs() << "Formal argument #" << i << " has unhandled type "
+             << ArgVT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+/// AnalyzeReturn - Analyze the returned values of an ISD::RET node,
+/// incorporating info about the result values into this state.
+void
+Hexagon_CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               Hexagon_CCAssignFn Fn,
+                               unsigned SretValueInRegs) {
+
+  // For Hexagon, Return small structures in registers.
+  if (SretValueInRegs != 0) {
+    if (SretValueInRegs <= 32) {
+      unsigned Reg = Hexagon::R0;
+      addLoc(CCValAssign::getReg(0, MVT::i32, Reg, MVT::i32,
+                                 CCValAssign::Full));
+      return;
+    }
+    if (SretValueInRegs <= 64) {
+      unsigned Reg = Hexagon::D0;
+      addLoc(CCValAssign::getReg(0, MVT::i64, Reg, MVT::i64,
+                                 CCValAssign::Full));
+      return;
+    }
+  }
+
+
+  // Determine which register each value should be copied into.
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    EVT VT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this, -1, -1, false)){
+      dbgs() << "Return operand #" << i << " has unhandled type "
+           << VT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+
+/// AnalyzeCallOperands - Analyze an ISD::CALL node, incorporating info
+/// about the passed values into this state.
+void
+Hexagon_CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg>
+                                     &Outs,
+                                     Hexagon_CCAssignFn Fn,
+                                     int NonVarArgsParams,
+                                     unsigned SretValueSize) {
+  unsigned NumOps = Outs.size();
+
+  unsigned i = 0;
+  // If the called function returns a small struct in registers, skip
+  // the first actual parameter. We do not want to pass a pointer to
+  // the stack location.
+  if (SretValueSize != 0) {
+    ++i;
+  }
+
+  for (; i != NumOps; ++i) {
+    EVT ArgVT = Outs[i].VT;
+    ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this,
+           NonVarArgsParams, i+1, false)) {
+      dbgs() << "Call operand #" << i << " has unhandled type "
+           << ArgVT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+/// AnalyzeCallOperands - Same as above except it takes vectors of types
+/// and argument flags.
+void
+Hexagon_CCState::AnalyzeCallOperands(SmallVectorImpl<EVT> &ArgVTs,
+                                     SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                                     Hexagon_CCAssignFn Fn) {
+  unsigned NumOps = ArgVTs.size();
+  for (unsigned i = 0; i != NumOps; ++i) {
+    EVT ArgVT = ArgVTs[i];
+    ISD::ArgFlagsTy ArgFlags = Flags[i];
+    if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this, -1, -1,
+           false)) {
+      dbgs() << "Call operand #" << i << " has unhandled type "
+           << ArgVT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+/// AnalyzeCallResult - Analyze the return values of an ISD::CALL node,
+/// incorporating info about the passed values into this state.
+void
+Hexagon_CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   Hexagon_CCAssignFn Fn,
+                                   unsigned SretValueInRegs) {
+
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    EVT VT = Ins[i].VT;
+    ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+      if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this, -1, -1, false)) {
+        dbgs() << "Call result #" << i << " has unhandled type "
+               << VT.getEVTString() << "\n";
+      abort();
+    }
+  }
+}
+
+/// AnalyzeCallResult - Same as above except it's specialized for calls which
+/// produce a single value.
+void Hexagon_CCState::AnalyzeCallResult(EVT VT, Hexagon_CCAssignFn Fn) {
+  if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this, -1, -1,
+         false)) {
+    dbgs() << "Call result has unhandled type "
+         << VT.getEVTString() << "\n";
+    abort();
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.h b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.h
new file mode 100644
index 000000000000..489b3a3e5985
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConvLower.h
@@ -0,0 +1,189 @@
+//===-- HexagonCallingConvLower.h - Calling Conventions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon_CCState class, used for lowering
+// and implementing calling conventions. Adapted from the target independent
+// version but this handles calls to varargs functions
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_Hexagon_CODEGEN_CALLINGCONVLOWER_H
+#define LLVM_Hexagon_CODEGEN_CALLINGCONVLOWER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+//
+// Need to handle varargs.
+//
+namespace llvm {
+  class TargetRegisterInfo;
+  class TargetMachine;
+  class Hexagon_CCState;
+  class SDNode;
+
+
+/// Hexagon_CCAssignFn - This function assigns a location for Val, updating
+/// State to reflect the change.
+typedef bool Hexagon_CCAssignFn(unsigned ValNo, EVT ValVT,
+                              EVT LocVT, CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, Hexagon_CCState &State,
+                              int NonVarArgsParams,
+                              int CurrentParam,
+                              bool ForceMem);
+
+
+/// CCState - This class holds information needed while lowering arguments and
+/// return values.  It captures which registers are already assigned and which
+/// stack slots are used.  It provides accessors to allocate these values.
+class Hexagon_CCState {
+  CallingConv::ID CallingConv;
+  bool IsVarArg;
+  const TargetMachine &TM;
+  const TargetRegisterInfo &TRI;
+  SmallVector<CCValAssign, 16> &Locs;
+  LLVMContext &Context;
+
+  unsigned StackOffset;
+  SmallVector<uint32_t, 16> UsedRegs;
+public:
+  Hexagon_CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &TM,
+                SmallVector<CCValAssign, 16> &locs, LLVMContext &c);
+
+  void addLoc(const CCValAssign &V) {
+    Locs.push_back(V);
+  }
+
+  LLVMContext &getContext() const { return Context; }
+  const TargetMachine &getTarget() const { return TM; }
+  unsigned getCallingConv() const { return CallingConv; }
+  bool isVarArg() const { return IsVarArg; }
+
+  unsigned getNextStackOffset() const { return StackOffset; }
+
+  /// isAllocated - Return true if the specified register (or an alias) is
+  /// allocated.
+  bool isAllocated(unsigned Reg) const {
+    return UsedRegs[Reg/32] & (1 << (Reg&31));
+  }
+
+  /// AnalyzeFormalArguments - Analyze an ISD::FORMAL_ARGUMENTS node,
+  /// incorporating info about the formals into this state.
+  void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                              Hexagon_CCAssignFn Fn, unsigned SretValueInRegs);
+
+  /// AnalyzeReturn - Analyze the returned values of an ISD::RET node,
+  /// incorporating info about the result values into this state.
+  void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                     Hexagon_CCAssignFn Fn, unsigned SretValueInRegs);
+
+  /// AnalyzeCallOperands - Analyze an ISD::CALL node, incorporating info
+  /// about the passed values into this state.
+  void AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           Hexagon_CCAssignFn Fn, int NonVarArgsParams,
+                           unsigned SretValueSize);
+
+  /// AnalyzeCallOperands - Same as above except it takes vectors of types
+  /// and argument flags.
+  void AnalyzeCallOperands(SmallVectorImpl<EVT> &ArgVTs,
+                           SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+                           Hexagon_CCAssignFn Fn);
+
+  /// AnalyzeCallResult - Analyze the return values of an ISD::CALL node,
+  /// incorporating info about the passed values into this state.
+  void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                         Hexagon_CCAssignFn Fn, unsigned SretValueInRegs);
+
+  /// AnalyzeCallResult - Same as above except it's specialized for calls which
+  /// produce a single value.
+  void AnalyzeCallResult(EVT VT, Hexagon_CCAssignFn Fn);
+
+  /// getFirstUnallocated - Return the first unallocated register in the set, or
+  /// NumRegs if they are all allocated.
+  unsigned getFirstUnallocated(const unsigned *Regs, unsigned NumRegs) const {
+    for (unsigned i = 0; i != NumRegs; ++i)
+      if (!isAllocated(Regs[i]))
+        return i;
+    return NumRegs;
+  }
+
+  /// AllocateReg - Attempt to allocate one register.  If it is not available,
+  /// return zero.  Otherwise, return the register, marking it and any aliases
+  /// as allocated.
+  unsigned AllocateReg(unsigned Reg) {
+    if (isAllocated(Reg)) return 0;
+    MarkAllocated(Reg);
+    return Reg;
+  }
+
+  /// Version of AllocateReg with extra register to be shadowed.
+  unsigned AllocateReg(unsigned Reg, unsigned ShadowReg) {
+    if (isAllocated(Reg)) return 0;
+    MarkAllocated(Reg);
+    MarkAllocated(ShadowReg);
+    return Reg;
+  }
+
+  /// AllocateReg - Attempt to allocate one of the specified registers.  If none
+  /// are available, return zero.  Otherwise, return the first one available,
+  /// marking it and any aliases as allocated.
+  unsigned AllocateReg(const unsigned *Regs, unsigned NumRegs) {
+    unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs);
+    if (FirstUnalloc == NumRegs)
+      return 0;    // Didn't find the reg.
+
+    // Mark the register and any aliases as allocated.
+    unsigned Reg = Regs[FirstUnalloc];
+    MarkAllocated(Reg);
+    return Reg;
+  }
+
+  /// Version of AllocateReg with list of registers to be shadowed.
+  unsigned AllocateReg(const unsigned *Regs, const unsigned *ShadowRegs,
+                       unsigned NumRegs) {
+    unsigned FirstUnalloc = getFirstUnallocated(Regs, NumRegs);
+    if (FirstUnalloc == NumRegs)
+      return 0;    // Didn't find the reg.
+
+    // Mark the register and any aliases as allocated.
+    unsigned Reg = Regs[FirstUnalloc], ShadowReg = ShadowRegs[FirstUnalloc];
+    MarkAllocated(Reg);
+    MarkAllocated(ShadowReg);
+    return Reg;
+  }
+
+  /// AllocateStack - Allocate a chunk of stack space with the specified size
+  /// and alignment.
+  unsigned AllocateStack(unsigned Size, unsigned Align) {
+    assert(Align && ((Align-1) & Align) == 0); // Align is power of 2.
+    StackOffset = ((StackOffset + Align-1) & ~(Align-1));
+    unsigned Result = StackOffset;
+    StackOffset += Size;
+    return Result;
+  }
+
+  // HandleByVal - Allocate a stack slot large enough to pass an argument by
+  // value. The size and alignment information of the argument is encoded in its
+  // parameter attribute.
+  void HandleByVal(unsigned ValNo, EVT ValVT,
+                   EVT LocVT, CCValAssign::LocInfo LocInfo,
+                   int MinSize, int MinAlign, ISD::ArgFlagsTy ArgFlags);
+
+private:
+  /// MarkAllocated - Mark a register and all of its aliases as allocated.
+  void MarkAllocated(unsigned Reg);
+};
+
+
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
new file mode 100644
index 000000000000..08144217fd30
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
@@ -0,0 +1,180 @@
+//===-- HexagonExpandPredSpillCode.cpp - Expand Predicate Spill Code ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// The Hexagon processor has no instructions that load or store predicate
+// registers directly.  So, when these registers must be spilled a general
+// purpose register must be found and the value copied to/from it from/to
+// the predicate register.  This code currently does not use the register
+// scavenger mechanism available in the allocator.  There are two registers
+// reserved to allow spilling/restoring predicate registers.  One is used to
+// hold the predicate value.  The other is used when stack frame offsets are
+// too large.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+
+namespace {
+
+class HexagonExpandPredSpillCode : public MachineFunctionPass {
+    HexagonTargetMachine& QTM;
+    const HexagonSubtarget &QST;
+
+ public:
+    static char ID;
+    HexagonExpandPredSpillCode(HexagonTargetMachine& TM) :
+      MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {}
+
+    const char *getPassName() const {
+      return "Hexagon Expand Predicate Spill Code";
+    }
+    bool runOnMachineFunction(MachineFunction &Fn);
+};
+
+
+char HexagonExpandPredSpillCode::ID = 0;
+
+
+bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) {
+
+  const HexagonInstrInfo *TII = QTM.getInstrInfo();
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+    // Traverse the basic block.
+    for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
+         ++MII) {
+      MachineInstr *MI = MII;
+      int Opc = MI->getOpcode();
+      if (Opc == Hexagon::STriw_pred) {
+        // STriw_pred [R30], ofst, SrcReg;
+        unsigned FP = MI->getOperand(0).getReg();
+        assert(FP == QTM.getRegisterInfo()->getFrameRegister() &&
+               "Not a Frame Pointer, Nor a Spill Slot");
+        assert(MI->getOperand(1).isImm() && "Not an offset");
+        int Offset = MI->getOperand(1).getImm();
+        int SrcReg = MI->getOperand(2).getReg();
+        assert(Hexagon::PredRegsRegClass.contains(SrcReg) &&
+               "Not a predicate register");
+        if (!TII->isValidOffset(Hexagon::STriw_indexed, Offset)) {
+          if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::CONST32_Int_Real),
+                      HEXAGON_RESERVED_REG_1).addImm(Offset);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_rr),
+                    HEXAGON_RESERVED_REG_1)
+              .addReg(FP).addReg(HEXAGON_RESERVED_REG_1);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
+                      HEXAGON_RESERVED_REG_2).addReg(SrcReg);
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::STriw_indexed))
+              .addReg(HEXAGON_RESERVED_REG_1)
+              .addImm(0).addReg(HEXAGON_RESERVED_REG_2);
+          } else {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_ri),
+                      HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
+                      HEXAGON_RESERVED_REG_2).addReg(SrcReg);
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                          TII->get(Hexagon::STriw_indexed))
+              .addReg(HEXAGON_RESERVED_REG_1)
+              .addImm(0)
+              .addReg(HEXAGON_RESERVED_REG_2);
+          }
+        } else {
+          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_RsPd),
+                    HEXAGON_RESERVED_REG_2).addReg(SrcReg);
+          BuildMI(*MBB, MII, MI->getDebugLoc(),
+                        TII->get(Hexagon::STriw_indexed)).
+                    addReg(FP).addImm(Offset).addReg(HEXAGON_RESERVED_REG_2);
+        }
+        MII = MBB->erase(MI);
+        --MII;
+      } else if (Opc == Hexagon::LDriw_pred) {
+        // DstReg = LDriw_pred [R30], ofst.
+        int DstReg = MI->getOperand(0).getReg();
+        assert(Hexagon::PredRegsRegClass.contains(DstReg) &&
+               "Not a predicate register");
+        unsigned FP = MI->getOperand(1).getReg();
+        assert(FP == QTM.getRegisterInfo()->getFrameRegister() &&
+               "Not a Frame Pointer, Nor a Spill Slot");
+        assert(MI->getOperand(2).isImm() && "Not an offset");
+        int Offset = MI->getOperand(2).getImm();
+        if (!TII->isValidOffset(Hexagon::LDriw, Offset)) {
+          if (!TII->isValidOffset(Hexagon::ADD_ri, Offset)) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::CONST32_Int_Real),
+                      HEXAGON_RESERVED_REG_1).addImm(Offset);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_rr),
+                    HEXAGON_RESERVED_REG_1)
+              .addReg(FP)
+              .addReg(HEXAGON_RESERVED_REG_1);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw),
+                      HEXAGON_RESERVED_REG_2)
+              .addReg(HEXAGON_RESERVED_REG_1)
+              .addImm(0);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs),
+                      DstReg).addReg(HEXAGON_RESERVED_REG_2);
+          } else {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::ADD_ri),
+                      HEXAGON_RESERVED_REG_1).addReg(FP).addImm(Offset);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw),
+                      HEXAGON_RESERVED_REG_2)
+              .addReg(HEXAGON_RESERVED_REG_1)
+              .addImm(0);
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs),
+                      DstReg).addReg(HEXAGON_RESERVED_REG_2);
+          }
+        } else {
+          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::LDriw),
+                    HEXAGON_RESERVED_REG_2).addReg(FP).addImm(Offset);
+          BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Hexagon::TFR_PdRs),
+                    DstReg).addReg(HEXAGON_RESERVED_REG_2);
+        }
+        MII = MBB->erase(MI);
+        --MII;
+      }
+    }
+  }
+
+  return true;
+}
+
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonExpandPredSpillCode(HexagonTargetMachine &TM) {
+  return new HexagonExpandPredSpillCode(TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
new file mode 100644
index 000000000000..240cc9566648
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -0,0 +1,183 @@
+//===---- HexagonFixupHwLoops.cpp - Fixup HW loops too far from LOOPn. ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// The loop start address in the LOOPn instruction is encoded as a distance
+// from the LOOPn instruction itself.  If the start address is too far from
+// the LOOPn instruction, the loop needs to be set up manually, i.e. via
+// direct transfers to SAn and LCn.
+// This pass will identify and convert such LOOPn instructions to a proper
+// form.
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+
+using namespace llvm;
+
+namespace llvm {
+  void initializeHexagonFixupHwLoopsPass(PassRegistry&);
+}
+
+namespace {
+  struct HexagonFixupHwLoops : public MachineFunctionPass {
+  public:
+    static char ID;
+
+    HexagonFixupHwLoops() : MachineFunctionPass(ID) {
+      initializeHexagonFixupHwLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "Hexagon Hardware Loop Fixup"; }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    /// \brief Maximum distance between the loop instr and the basic block.
+    /// Just an estimate.
+    static const unsigned MAX_LOOP_DISTANCE = 200;
+
+    /// \brief Check the offset between each loop instruction and
+    /// the loop basic block to determine if we can use the LOOP instruction
+    /// or if we need to set the LC/SA registers explicitly.
+    bool fixupLoopInstrs(MachineFunction &MF);
+
+    /// \brief Add the instruction to set the LC and SA registers explicitly.
+    void convertLoopInstr(MachineFunction &MF,
+                          MachineBasicBlock::iterator &MII,
+                          RegScavenger &RS);
+
+  };
+
+  char HexagonFixupHwLoops::ID = 0;
+}
+
+INITIALIZE_PASS(HexagonFixupHwLoops, "hwloopsfixup",
+                "Hexagon Hardware Loops Fixup", false, false)
+
+FunctionPass *llvm::createHexagonFixupHwLoops() {
+  return new HexagonFixupHwLoops();
+}
+
+
+/// \brief Returns true if the instruction is a hardware loop instruction.
+static bool isHardwareLoop(const MachineInstr *MI) {
+  return MI->getOpcode() == Hexagon::LOOP0_r ||
+         MI->getOpcode() == Hexagon::LOOP0_i;
+}
+
+
+bool HexagonFixupHwLoops::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = fixupLoopInstrs(MF);
+  return Changed;
+}
+
+
+/// \brief For Hexagon, if the loop label is to far from the
+/// loop instruction then we need to set the LC0 and SA0 registers
+/// explicitly instead of using LOOP(start,count).  This function
+/// checks the distance, and generates register assignments if needed.
+///
+/// This function makes two passes over the basic blocks.  The first
+/// pass computes the offset of the basic block from the start.
+/// The second pass checks all the loop instructions.
+bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
+
+  // Offset of the current instruction from the start.
+  unsigned InstOffset = 0;
+  // Map for each basic block to it's first instruction.
+  DenseMap<MachineBasicBlock*, unsigned> BlockToInstOffset;
+
+  // First pass - compute the offset of each basic block.
+  for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end();
+       MBB != MBBe; ++MBB) {
+    BlockToInstOffset[MBB] = InstOffset;
+    InstOffset += (MBB->size() * 4);
+  }
+
+  // Second pass - check each loop instruction to see if it needs to
+  // be converted.
+  InstOffset = 0;
+  bool Changed = false;
+  RegScavenger RS;
+
+  // Loop over all the basic blocks.
+  for (MachineFunction::iterator MBB = MF.begin(), MBBe = MF.end();
+       MBB != MBBe; ++MBB) {
+    InstOffset = BlockToInstOffset[MBB];
+    RS.enterBasicBlock(MBB);
+
+    // Loop over all the instructions.
+    MachineBasicBlock::iterator MIE = MBB->end();
+    MachineBasicBlock::iterator MII = MBB->begin();
+    while (MII != MIE) {
+      if (isHardwareLoop(MII)) {
+        RS.forward(MII);
+        assert(MII->getOperand(0).isMBB() &&
+               "Expect a basic block as loop operand");
+        int Sub = InstOffset - BlockToInstOffset[MII->getOperand(0).getMBB()];
+        unsigned Dist = Sub > 0 ? Sub : -Sub;
+        if (Dist > MAX_LOOP_DISTANCE) {
+          // Convert to explicity setting LC0 and SA0.
+          convertLoopInstr(MF, MII, RS);
+          MII = MBB->erase(MII);
+          Changed = true;
+        } else {
+          ++MII;
+        }
+      } else {
+        ++MII;
+      }
+      InstOffset += 4;
+    }
+  }
+
+  return Changed;
+}
+
+
+/// \brief convert a loop instruction to a sequence of instructions that
+/// set the LC0 and SA0 register explicitly.
+void HexagonFixupHwLoops::convertLoopInstr(MachineFunction &MF,
+                                           MachineBasicBlock::iterator &MII,
+                                           RegScavenger &RS) {
+  const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+  MachineBasicBlock *MBB = MII->getParent();
+  DebugLoc DL = MII->getDebugLoc();
+  unsigned Scratch = RS.scavengeRegister(&Hexagon::IntRegsRegClass, MII, 0);
+
+  // First, set the LC0 with the trip count.
+  if (MII->getOperand(1).isReg()) {
+    // Trip count is a register
+    BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0)
+      .addReg(MII->getOperand(1).getReg());
+  } else {
+    // Trip count is an immediate.
+    BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFRI), Scratch)
+      .addImm(MII->getOperand(1).getImm());
+    BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::LC0)
+      .addReg(Scratch);
+  }
+  // Then, set the SA0 with the loop start address.
+  BuildMI(*MBB, MII, DL, TII->get(Hexagon::CONST32_Label), Scratch)
+    .addMBB(MII->getOperand(0).getMBB());
+  BuildMI(*MBB, MII, DL, TII->get(Hexagon::TFCR), Hexagon::SA0)
+    .addReg(Scratch);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
new file mode 100644
index 000000000000..d6a9329cd407
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -0,0 +1,348 @@
+//===-- HexagonFrameLowering.cpp - Define frame lowering ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonFrameLowering.h"
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisableDeallocRet(
+                       "disable-hexagon-dealloc-ret",
+                       cl::Hidden,
+                       cl::desc("Disable Dealloc Return for Hexagon target"));
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+void HexagonFrameLowering::determineFrameLayout(MachineFunction &MF) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  unsigned FrameSize = MFI->getStackSize();
+
+  // Get the alignments provided by the target.
+  unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  // Get the maximum call frame size of all the calls.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+
+  // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI->hasVarSizedObjects())
+    maxCallFrameSize = RoundUpToAlignment(maxCallFrameSize, TargetAlign);
+
+  // Update maximum call frame size.
+  MFI->setMaxCallFrameSize(maxCallFrameSize);
+
+  // Include call frame size in total.
+  FrameSize += maxCallFrameSize;
+
+  // Make sure the frame is aligned.
+  FrameSize = RoundUpToAlignment(FrameSize, TargetAlign);
+
+  // Update frame info.
+  MFI->setStackSize(FrameSize);
+}
+
+
+void HexagonFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  const HexagonRegisterInfo *QRI =
+    static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  determineFrameLayout(MF);
+
+  // Check if frame moves are needed for EH.
+  bool needsFrameMoves = MMI.hasDebugInfo() ||
+    !MF.getFunction()->needsUnwindTableEntry();
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  int NumBytes = (int) MFI->getStackSize();
+
+  // LLVM expects allocframe not to be the first instruction in the
+  // basic block.
+  MachineBasicBlock::iterator InsertPt = MBB.begin();
+
+  //
+  // ALLOCA adjust regs.  Iterate over ADJDYNALLOC nodes and change the offset.
+  //
+  HexagonMachineFunctionInfo *FuncInfo =
+    MF.getInfo<HexagonMachineFunctionInfo>();
+  const std::vector<MachineInstr*>& AdjustRegs =
+    FuncInfo->getAllocaAdjustInsts();
+  for (std::vector<MachineInstr*>::const_iterator i = AdjustRegs.begin(),
+         e = AdjustRegs.end();
+       i != e; ++i) {
+    MachineInstr* MI = *i;
+    assert((MI->getOpcode() == Hexagon::ADJDYNALLOC) &&
+           "Expected adjust alloca node");
+
+    MachineOperand& MO = MI->getOperand(2);
+    assert(MO.isImm() && "Expected immediate");
+    MO.setImm(MFI->getMaxCallFrameSize());
+  }
+
+ std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+
+ if (needsFrameMoves) {
+   // Advance CFA. DW_CFA_def_cfa
+   unsigned FPReg = QRI->getFrameRegister();
+   unsigned RAReg = QRI->getRARegister();
+
+   MachineLocation Dst(MachineLocation::VirtualFP);
+   MachineLocation Src(FPReg, -8);
+   Moves.push_back(MachineMove(0, Dst, Src));
+
+   // R31 = (R31 - #4)
+   MachineLocation LRDst(RAReg, -4);
+   MachineLocation LRSrc(RAReg);
+   Moves.push_back(MachineMove(0, LRDst, LRSrc));
+
+   // R30 = (R30 - #8)
+   MachineLocation SPDst(FPReg, -8);
+   MachineLocation SPSrc(FPReg);
+   Moves.push_back(MachineMove(0, SPDst, SPSrc));
+ }
+
+  //
+  // Only insert ALLOCFRAME if we need to.
+  //
+  if (hasFP(MF)) {
+    // Check for overflow.
+    // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used?
+    const int ALLOCFRAME_MAX = 16384;
+    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+    if (NumBytes >= ALLOCFRAME_MAX) {
+      // Emit allocframe(#0).
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::ALLOCFRAME)).addImm(0);
+
+      // Subtract offset from frame pointer.
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::CONST32_Int_Real),
+                                      HEXAGON_RESERVED_REG_1).addImm(NumBytes);
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::SUB_rr),
+                                      QRI->getStackRegister()).
+                                      addReg(QRI->getStackRegister()).
+                                      addReg(HEXAGON_RESERVED_REG_1);
+    } else {
+      BuildMI(MBB, InsertPt, dl, TII.get(Hexagon::ALLOCFRAME)).addImm(NumBytes);
+    }
+  }
+}
+// Returns true if MBB has a machine instructions that indicates a tail call
+// in the block.
+bool HexagonFrameLowering::hasTailCall(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  unsigned RetOpcode = MBBI->getOpcode();
+
+  return RetOpcode == Hexagon::TCRETURNtg || RetOpcode == Hexagon::TCRETURNtext;
+}
+
+void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  DebugLoc dl = MBBI->getDebugLoc();
+  //
+  // Only insert deallocframe if we need to.
+  //
+  if (hasFP(MF)) {
+    MachineBasicBlock::iterator MBBI = prior(MBB.end());
+    MachineBasicBlock::iterator MBBI_end = MBB.end();
+    //
+    // For Hexagon, we don't need the frame size.
+    //
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    int NumBytes = (int) MFI->getStackSize();
+
+    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+    // Replace 'jumpr r31' instruction with dealloc_return for V4 and higher
+    // versions.
+    if (STI.hasV4TOps() && MBBI->getOpcode() == Hexagon::JMPR
+                        && !DisableDeallocRet) {
+      // Remove jumpr node.
+      MBB.erase(MBBI);
+      // Add dealloc_return.
+      BuildMI(MBB, MBBI_end, dl, TII.get(Hexagon::DEALLOC_RET_V4))
+        .addImm(NumBytes);
+    } else { // Add deallocframe for V2 and V3.
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME)).addImm(NumBytes);
+    }
+  }
+}
+
+bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const HexagonMachineFunctionInfo *FuncInfo =
+    MF.getInfo<HexagonMachineFunctionInfo>();
+  return (MFI->hasCalls() || (MFI->getStackSize() > 0) ||
+          FuncInfo->hasClobberLR() );
+}
+
+static inline
+unsigned uniqueSuperReg(unsigned Reg, const TargetRegisterInfo *TRI) {
+  MCSuperRegIterator SRI(Reg, TRI);
+  assert(SRI.isValid() && "Expected a superreg");
+  unsigned SuperReg = *SRI;
+  ++SRI;
+  assert(!SRI.isValid() && "Expected exactly one superreg");
+  return SuperReg;
+}
+
+bool
+HexagonFrameLowering::spillCalleeSavedRegisters(
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+
+  if (CSI.empty()) {
+    return false;
+  }
+
+  // We can only schedule double loads if we spill contiguous callee-saved regs
+  // For instance, we cannot scheduled double-word loads if we spill r24,
+  // r26, and r27.
+  // Hexagon_TODO: We can try to double-word align odd registers for -O2 and
+  // above.
+  bool ContiguousRegs = true;
+
+  for (unsigned i = 0; i < CSI.size(); ++i) {
+    unsigned Reg = CSI[i].getReg();
+
+    //
+    // Check if we can use a double-word store.
+    //
+    unsigned SuperReg = uniqueSuperReg(Reg, TRI);
+    bool CanUseDblStore = false;
+    const TargetRegisterClass* SuperRegClass = 0;
+
+    if (ContiguousRegs && (i < CSI.size()-1)) {
+      unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI);
+      SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg);
+      CanUseDblStore = (SuperRegNext == SuperReg);
+    }
+
+
+    if (CanUseDblStore) {
+      TII.storeRegToStackSlot(MBB, MI, SuperReg, true,
+                              CSI[i+1].getFrameIdx(), SuperRegClass, TRI);
+      MBB.addLiveIn(SuperReg);
+      ++i;
+    } else {
+      // Cannot use a double-word store.
+      ContiguousRegs = false;
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i].getFrameIdx(), RC,
+                              TRI);
+      MBB.addLiveIn(Reg);
+    }
+  }
+  return true;
+}
+
+
+bool HexagonFrameLowering::restoreCalleeSavedRegisters(
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+
+  if (CSI.empty()) {
+    return false;
+  }
+
+  // We can only schedule double loads if we spill contiguous callee-saved regs
+  // For instance, we cannot scheduled double-word loads if we spill r24,
+  // r26, and r27.
+  // Hexagon_TODO: We can try to double-word align odd registers for -O2 and
+  // above.
+  bool ContiguousRegs = true;
+
+  for (unsigned i = 0; i < CSI.size(); ++i) {
+    unsigned Reg = CSI[i].getReg();
+
+    //
+    // Check if we can use a double-word load.
+    //
+    unsigned SuperReg = uniqueSuperReg(Reg, TRI);
+    const TargetRegisterClass* SuperRegClass = 0;
+    bool CanUseDblLoad = false;
+    if (ContiguousRegs && (i < CSI.size()-1)) {
+      unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI);
+      SuperRegClass = TRI->getMinimalPhysRegClass(SuperReg);
+      CanUseDblLoad = (SuperRegNext == SuperReg);
+    }
+
+
+    if (CanUseDblLoad) {
+      TII.loadRegFromStackSlot(MBB, MI, SuperReg, CSI[i+1].getFrameIdx(),
+                               SuperRegClass, TRI);
+      MBB.addLiveIn(SuperReg);
+      ++i;
+    } else {
+      // Cannot use a double-word load.
+      ContiguousRegs = false;
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI);
+      MBB.addLiveIn(Reg);
+    }
+  }
+  return true;
+}
+
+void HexagonFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  MachineInstr &MI = *I;
+
+  if (MI.getOpcode() == Hexagon::ADJCALLSTACKDOWN) {
+    // Hexagon_TODO: add code
+  } else if (MI.getOpcode() == Hexagon::ADJCALLSTACKUP) {
+    // Hexagon_TODO: add code
+  } else {
+    llvm_unreachable("Cannot handle this call frame pseudo instruction");
+  }
+  MBB.erase(I);
+}
+
+int HexagonFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                              int FI) const {
+  return MF.getFrameInfo()->getObjectOffset(FI);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
new file mode 100644
index 000000000000..a62c76aaf676
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -0,0 +1,55 @@
+//=- HexagonFrameLowering.h - Define frame lowering for Hexagon --*- C++ -*--=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGON_FRAMEINFO_H
+#define HEXAGON_FRAMEINFO_H
+
+#include "Hexagon.h"
+#include "HexagonSubtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+class HexagonFrameLowering : public TargetFrameLowering {
+private:
+  const HexagonSubtarget &STI;
+  void determineFrameLayout(MachineFunction &MF) const;
+
+public:
+  explicit HexagonFrameLowering(const HexagonSubtarget &sti)
+    : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) {
+  }
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  virtual bool
+  spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const std::vector<CalleeSavedInfo> &CSI,
+                            const TargetRegisterInfo *TRI) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  virtual bool
+  restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const std::vector<CalleeSavedInfo> &CSI,
+                              const TargetRegisterInfo *TRI) const;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+  bool hasFP(const MachineFunction &MF) const;
+  bool hasTailCall(MachineBasicBlock &MBB) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
new file mode 100644
index 000000000000..178662447a7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -0,0 +1,1554 @@
+//===-- HexagonHardwareLoops.cpp - Identify and generate hardware loops ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the Hexagon hardware
+// loop instruction.  The hardware loop can perform loop branches with a
+// zero-cycle overhead.
+//
+// The pattern that defines the induction variable can changed depending on
+// prior optimizations.  For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+// The pattern detected by this phase is due to running Strength Reduction.
+//
+// Criteria for hardware loops:
+//  - Countable loops (w/ ind. var for a trip count)
+//  - Assumes loops are normalized by IndVarSimplify
+//  - Try inner-most loops first
+//  - No nested hardware loops.
+//  - No function calls in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hwloops"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+
+#include <algorithm>
+#include <vector>
+
+using namespace llvm;
+
+#ifndef NDEBUG
+static cl::opt<int> HWLoopLimit("max-hwloop", cl::Hidden, cl::init(-1));
+#endif
+
+STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
+
+namespace llvm {
+  void initializeHexagonHardwareLoopsPass(PassRegistry&);
+}
+
+namespace {
+  class CountValue;
+  struct HexagonHardwareLoops : public MachineFunctionPass {
+    MachineLoopInfo            *MLI;
+    MachineRegisterInfo        *MRI;
+    MachineDominatorTree       *MDT;
+    const HexagonTargetMachine *TM;
+    const HexagonInstrInfo     *TII;
+    const HexagonRegisterInfo  *TRI;
+#ifndef NDEBUG
+    static int Counter;
+#endif
+
+  public:
+    static char ID;
+
+    HexagonHardwareLoops() : MachineFunctionPass(ID) {
+      initializeHexagonHardwareLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "Hexagon Hardware Loops"; }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    /// Kinds of comparisons in the compare instructions.
+    struct Comparison {
+      enum Kind {
+        EQ  = 0x01,
+        NE  = 0x02,
+        L   = 0x04, // Less-than property.
+        G   = 0x08, // Greater-than property.
+        U   = 0x40, // Unsigned property.
+        LTs = L,
+        LEs = L | EQ,
+        GTs = G,
+        GEs = G | EQ,
+        LTu = L      | U,
+        LEu = L | EQ | U,
+        GTu = G      | U,
+        GEu = G | EQ | U
+      };
+
+      static Kind getSwappedComparison(Kind Cmp) {
+        assert ((!((Cmp & L) && (Cmp & G))) && "Malformed comparison operator");
+        if ((Cmp & L) || (Cmp & G))
+          return (Kind)(Cmp ^ (L|G));
+        return Cmp;
+      }
+    };
+
+    /// \brief Find the register that contains the loop controlling
+    /// induction variable.
+    /// If successful, it will return true and set the \p Reg, \p IVBump
+    /// and \p IVOp arguments.  Otherwise it will return false.
+    /// The returned induction register is the register R that follows the
+    /// following induction pattern:
+    /// loop:
+    ///   R = phi ..., [ R.next, LatchBlock ]
+    ///   R.next = R + #bump
+    ///   if (R.next < #N) goto loop
+    /// IVBump is the immediate value added to R, and IVOp is the instruction
+    /// "R.next = R + #bump".
+    bool findInductionRegister(MachineLoop *L, unsigned &Reg,
+                               int64_t &IVBump, MachineInstr *&IVOp) const;
+
+    /// \brief Analyze the statements in a loop to determine if the loop
+    /// has a computable trip count and, if so, return a value that represents
+    /// the trip count expression.
+    CountValue *getLoopTripCount(MachineLoop *L,
+                                 SmallVector<MachineInstr*, 2> &OldInsts);
+
+    /// \brief Return the expression that represents the number of times
+    /// a loop iterates.  The function takes the operands that represent the
+    /// loop start value, loop end value, and induction value.  Based upon
+    /// these operands, the function attempts to compute the trip count.
+    /// If the trip count is not directly available (as an immediate value,
+    /// or a register), the function will attempt to insert computation of it
+    /// to the loop's preheader.
+    CountValue *computeCount(MachineLoop *Loop,
+                             const MachineOperand *Start,
+                             const MachineOperand *End,
+                             unsigned IVReg,
+                             int64_t IVBump,
+                             Comparison::Kind Cmp) const;
+
+    /// \brief Return true if the instruction is not valid within a hardware
+    /// loop.
+    bool isInvalidLoopOperation(const MachineInstr *MI) const;
+
+    /// \brief Return true if the loop contains an instruction that inhibits
+    /// using the hardware loop.
+    bool containsInvalidInstruction(MachineLoop *L) const;
+
+    /// \brief Given a loop, check if we can convert it to a hardware loop.
+    /// If so, then perform the conversion and return true.
+    bool convertToHardwareLoop(MachineLoop *L);
+
+    /// \brief Return true if the instruction is now dead.
+    bool isDead(const MachineInstr *MI,
+                SmallVector<MachineInstr*, 1> &DeadPhis) const;
+
+    /// \brief Remove the instruction if it is now dead.
+    void removeIfDead(MachineInstr *MI);
+
+    /// \brief Make sure that the "bump" instruction executes before the
+    /// compare.  We need that for the IV fixup, so that the compare
+    /// instruction would not use a bumped value that has not yet been
+    /// defined.  If the instructions are out of order, try to reorder them.
+    bool orderBumpCompare(MachineInstr *BumpI, MachineInstr *CmpI);
+
+    /// \brief Get the instruction that loads an immediate value into \p R,
+    /// or 0 if such an instruction does not exist.
+    MachineInstr *defWithImmediate(unsigned R);
+
+    /// \brief Get the immediate value referenced to by \p MO, either for
+    /// immediate operands, or for register operands, where the register
+    /// was defined with an immediate value.
+    int64_t getImmediate(MachineOperand &MO);
+
+    /// \brief Reset the given machine operand to now refer to a new immediate
+    /// value.  Assumes that the operand was already referencing an immediate
+    /// value, either directly, or via a register.
+    void setImmediate(MachineOperand &MO, int64_t Val);
+
+    /// \brief Fix the data flow of the induction varible.
+    /// The desired flow is: phi ---> bump -+-> comparison-in-latch.
+    ///                                     |
+    ///                                     +-> back to phi
+    /// where "bump" is the increment of the induction variable:
+    ///   iv = iv + #const.
+    /// Due to some prior code transformations, the actual flow may look
+    /// like this:
+    ///   phi -+-> bump ---> back to phi
+    ///        |
+    ///        +-> comparison-in-latch (against upper_bound-bump),
+    /// i.e. the comparison that controls the loop execution may be using
+    /// the value of the induction variable from before the increment.
+    ///
+    /// Return true if the loop's flow is the desired one (i.e. it's
+    /// either been fixed, or no fixing was necessary).
+    /// Otherwise, return false.  This can happen if the induction variable
+    /// couldn't be identified, or if the value in the latch's comparison
+    /// cannot be adjusted to reflect the post-bump value.
+    bool fixupInductionVariable(MachineLoop *L);
+
+    /// \brief Given a loop, if it does not have a preheader, create one.
+    /// Return the block that is the preheader.
+    MachineBasicBlock *createPreheaderForLoop(MachineLoop *L);
+  };
+
+  char HexagonHardwareLoops::ID = 0;
+#ifndef NDEBUG
+  int HexagonHardwareLoops::Counter = 0;
+#endif
+
+  /// \brief Abstraction for a trip count of a loop. A smaller vesrsion
+  /// of the MachineOperand class without the concerns of changing the
+  /// operand representation.
+  class CountValue {
+  public:
+    enum CountValueType {
+      CV_Register,
+      CV_Immediate
+    };
+  private:
+    CountValueType Kind;
+    union Values {
+      struct {
+        unsigned Reg;
+        unsigned Sub;
+      } R;
+      unsigned ImmVal;
+    } Contents;
+
+  public:
+    explicit CountValue(CountValueType t, unsigned v, unsigned u = 0) {
+      Kind = t;
+      if (Kind == CV_Register) {
+        Contents.R.Reg = v;
+        Contents.R.Sub = u;
+      } else {
+        Contents.ImmVal = v;
+      }
+    }
+    bool isReg() const { return Kind == CV_Register; }
+    bool isImm() const { return Kind == CV_Immediate; }
+
+    unsigned getReg() const {
+      assert(isReg() && "Wrong CountValue accessor");
+      return Contents.R.Reg;
+    }
+    unsigned getSubReg() const {
+      assert(isReg() && "Wrong CountValue accessor");
+      return Contents.R.Sub;
+    }
+    unsigned getImm() const {
+      assert(isImm() && "Wrong CountValue accessor");
+      return Contents.ImmVal;
+    }
+
+    void print(raw_ostream &OS, const TargetMachine *TM = 0) const {
+      const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : 0;
+      if (isReg()) { OS << PrintReg(Contents.R.Reg, TRI, Contents.R.Sub); }
+      if (isImm()) { OS << Contents.ImmVal; }
+    }
+  };
+} // end anonymous namespace
+
+
+INITIALIZE_PASS_BEGIN(HexagonHardwareLoops, "hwloops",
+                      "Hexagon Hardware Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops",
+                    "Hexagon Hardware Loops", false, false)
+
+
+/// \brief Returns true if the instruction is a hardware loop instruction.
+static bool isHardwareLoop(const MachineInstr *MI) {
+  return MI->getOpcode() == Hexagon::LOOP0_r ||
+    MI->getOpcode() == Hexagon::LOOP0_i;
+}
+
+FunctionPass *llvm::createHexagonHardwareLoops() {
+  return new HexagonHardwareLoops();
+}
+
+
+bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n");
+
+  bool Changed = false;
+
+  MLI = &getAnalysis<MachineLoopInfo>();
+  MRI = &MF.getRegInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
+  TM  = static_cast<const HexagonTargetMachine*>(&MF.getTarget());
+  TII = static_cast<const HexagonInstrInfo*>(TM->getInstrInfo());
+  TRI = static_cast<const HexagonRegisterInfo*>(TM->getRegisterInfo());
+
+  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+       I != E; ++I) {
+    MachineLoop *L = *I;
+    if (!L->getParentLoop())
+      Changed |= convertToHardwareLoop(L);
+  }
+
+  return Changed;
+}
+
+
+bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
+                                                 unsigned &Reg,
+                                                 int64_t &IVBump,
+                                                 MachineInstr *&IVOp
+                                                 ) const {
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  if (!Header || !Preheader || !Latch)
+    return false;
+
+  // This pair represents an induction register together with an immediate
+  // value that will be added to it in each loop iteration.
+  typedef std::pair<unsigned,int64_t> RegisterBump;
+
+  // Mapping:  R.next -> (R, bump), where R, R.next and bump are derived
+  // from an induction operation
+  //   R.next = R + bump
+  // where bump is an immediate value.
+  typedef std::map<unsigned,RegisterBump> InductionMap;
+
+  InductionMap IndMap;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+       I != E && I->isPHI(); ++I) {
+    MachineInstr *Phi = &*I;
+
+    // Have a PHI instruction.  Get the operand that corresponds to the
+    // latch block, and see if is a result of an addition of form "reg+imm",
+    // where the "reg" is defined by the PHI node we are looking at.
+    for (unsigned i = 1, n = Phi->getNumOperands(); i < n; i += 2) {
+      if (Phi->getOperand(i+1).getMBB() != Latch)
+        continue;
+
+      unsigned PhiOpReg = Phi->getOperand(i).getReg();
+      MachineInstr *DI = MRI->getVRegDef(PhiOpReg);
+      unsigned UpdOpc = DI->getOpcode();
+      bool isAdd = (UpdOpc == Hexagon::ADD_ri);
+
+      if (isAdd) {
+        // If the register operand to the add is the PHI we're
+        // looking at, this meets the induction pattern.
+        unsigned IndReg = DI->getOperand(1).getReg();
+        if (MRI->getVRegDef(IndReg) == Phi) {
+          unsigned UpdReg = DI->getOperand(0).getReg();
+          int64_t V = DI->getOperand(2).getImm();
+          IndMap.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V)));
+        }
+      }
+    }  // for (i)
+  }  // for (instr)
+
+  SmallVector<MachineOperand,2> Cond;
+  MachineBasicBlock *TB = 0, *FB = 0;
+  bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false);
+  if (NotAnalyzed)
+    return false;
+
+  unsigned CSz = Cond.size();
+  assert (CSz == 1 || CSz == 2);
+  unsigned PredR = Cond[CSz-1].getReg();
+
+  MachineInstr *PredI = MRI->getVRegDef(PredR);
+  if (!PredI->isCompare())
+    return false;
+
+  unsigned CmpReg1 = 0, CmpReg2 = 0;
+  int CmpImm = 0, CmpMask = 0;
+  bool CmpAnalyzed = TII->analyzeCompare(PredI, CmpReg1, CmpReg2,
+                                         CmpMask, CmpImm);
+  // Fail if the compare was not analyzed, or it's not comparing a register
+  // with an immediate value.  Not checking the mask here, since we handle
+  // the individual compare opcodes (including CMPb) later on.
+  if (!CmpAnalyzed)
+    return false;
+
+  // Exactly one of the input registers to the comparison should be among
+  // the induction registers.
+  InductionMap::iterator IndMapEnd = IndMap.end();
+  InductionMap::iterator F = IndMapEnd;
+  if (CmpReg1 != 0) {
+    InductionMap::iterator F1 = IndMap.find(CmpReg1);
+    if (F1 != IndMapEnd)
+      F = F1;
+  }
+  if (CmpReg2 != 0) {
+    InductionMap::iterator F2 = IndMap.find(CmpReg2);
+    if (F2 != IndMapEnd) {
+      if (F != IndMapEnd)
+        return false;
+      F = F2;
+    }
+  }
+  if (F == IndMapEnd)
+    return false;
+
+  Reg = F->second.first;
+  IVBump = F->second.second;
+  IVOp = MRI->getVRegDef(F->first);
+  return true;
+}
+
+
+/// \brief Analyze the statements in a loop to determine if the loop has
+/// a computable trip count and, if so, return a value that represents
+/// the trip count expression.
+///
+/// This function iterates over the phi nodes in the loop to check for
+/// induction variable patterns that are used in the calculation for
+/// the number of time the loop is executed.
+CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
+                                SmallVector<MachineInstr*, 2> &OldInsts) {
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+  MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
+  assert(PI != TopMBB->pred_end() &&
+         "Loop must have more than one incoming edge!");
+  MachineBasicBlock *Backedge = *PI++;
+  if (PI == TopMBB->pred_end())  // dead loop?
+    return 0;
+  MachineBasicBlock *Incoming = *PI++;
+  if (PI != TopMBB->pred_end())  // multiple backedges?
+    return 0;
+
+  // Make sure there is one incoming and one backedge and determine which
+  // is which.
+  if (L->contains(Incoming)) {
+    if (L->contains(Backedge))
+      return 0;
+    std::swap(Incoming, Backedge);
+  } else if (!L->contains(Backedge))
+    return 0;
+
+  // Look for the cmp instruction to determine if we can get a useful trip
+  // count.  The trip count can be either a register or an immediate.  The
+  // location of the value depends upon the type (reg or imm).
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  if (!Latch)
+    return 0;
+
+  unsigned IVReg = 0;
+  int64_t IVBump = 0;
+  MachineInstr *IVOp;
+  bool FoundIV = findInductionRegister(L, IVReg, IVBump, IVOp);
+  if (!FoundIV)
+    return 0;
+
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+
+  MachineOperand *InitialValue = 0;
+  MachineInstr *IV_Phi = MRI->getVRegDef(IVReg);
+  for (unsigned i = 1, n = IV_Phi->getNumOperands(); i < n; i += 2) {
+    MachineBasicBlock *MBB = IV_Phi->getOperand(i+1).getMBB();
+    if (MBB == Preheader)
+      InitialValue = &IV_Phi->getOperand(i);
+    else if (MBB == Latch)
+      IVReg = IV_Phi->getOperand(i).getReg();  // Want IV reg after bump.
+  }
+  if (!InitialValue)
+    return 0;
+
+  SmallVector<MachineOperand,2> Cond;
+  MachineBasicBlock *TB = 0, *FB = 0;
+  bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false);
+  if (NotAnalyzed)
+    return 0;
+
+  MachineBasicBlock *Header = L->getHeader();
+  // TB must be non-null.  If FB is also non-null, one of them must be
+  // the header.  Otherwise, branch to TB could be exiting the loop, and
+  // the fall through can go to the header.
+  assert (TB && "Latch block without a branch?");
+  assert ((!FB || TB == Header || FB == Header) && "Branches not to header?");
+  if (!TB || (FB && TB != Header && FB != Header))
+    return 0;
+
+  // Branches of form "if (!P) ..." cause HexagonInstrInfo::AnalyzeBranch
+  // to put imm(0), followed by P in the vector Cond.
+  // If TB is not the header, it means that the "not-taken" path must lead
+  // to the header.
+  bool Negated = (Cond.size() > 1) ^ (TB != Header);
+  unsigned PredReg = Cond[Cond.size()-1].getReg();
+  MachineInstr *CondI = MRI->getVRegDef(PredReg);
+  unsigned CondOpc = CondI->getOpcode();
+
+  unsigned CmpReg1 = 0, CmpReg2 = 0;
+  int Mask = 0, ImmValue = 0;
+  bool AnalyzedCmp = TII->analyzeCompare(CondI, CmpReg1, CmpReg2,
+                                         Mask, ImmValue);
+  if (!AnalyzedCmp)
+    return 0;
+
+  // The comparison operator type determines how we compute the loop
+  // trip count.
+  OldInsts.push_back(CondI);
+  OldInsts.push_back(IVOp);
+
+  // Sadly, the following code gets information based on the position
+  // of the operands in the compare instruction.  This has to be done
+  // this way, because the comparisons check for a specific relationship
+  // between the operands (e.g. is-less-than), rather than to find out
+  // what relationship the operands are in (as on PPC).
+  Comparison::Kind Cmp;
+  bool isSwapped = false;
+  const MachineOperand &Op1 = CondI->getOperand(1);
+  const MachineOperand &Op2 = CondI->getOperand(2);
+  const MachineOperand *EndValue = 0;
+
+  if (Op1.isReg()) {
+    if (Op2.isImm() || Op1.getReg() == IVReg)
+      EndValue = &Op2;
+    else {
+      EndValue = &Op1;
+      isSwapped = true;
+    }
+  }
+
+  if (!EndValue)
+    return 0;
+
+  switch (CondOpc) {
+    case Hexagon::CMPEQri:
+    case Hexagon::CMPEQrr:
+      Cmp = !Negated ? Comparison::EQ : Comparison::NE;
+      break;
+    case Hexagon::CMPLTrr:
+      Cmp = !Negated ? Comparison::LTs : Comparison::GEs;
+      break;
+    case Hexagon::CMPLTUrr:
+      Cmp = !Negated ? Comparison::LTu : Comparison::GEu;
+      break;
+    case Hexagon::CMPGTUri:
+    case Hexagon::CMPGTUrr:
+      Cmp = !Negated ? Comparison::GTu : Comparison::LEu;
+      break;
+    case Hexagon::CMPGTri:
+    case Hexagon::CMPGTrr:
+      Cmp = !Negated ? Comparison::GTs : Comparison::LEs;
+      break;
+    // Very limited support for byte/halfword compares.
+    case Hexagon::CMPbEQri_V4:
+    case Hexagon::CMPhEQri_V4: {
+      if (IVBump != 1)
+        return 0;
+
+      int64_t InitV, EndV;
+      // Since the comparisons are "ri", the EndValue should be an
+      // immediate.  Check it just in case.
+      assert(EndValue->isImm() && "Unrecognized latch comparison");
+      EndV = EndValue->getImm();
+      // Allow InitialValue to be a register defined with an immediate.
+      if (InitialValue->isReg()) {
+        if (!defWithImmediate(InitialValue->getReg()))
+          return 0;
+        InitV = getImmediate(*InitialValue);
+      } else {
+        assert(InitialValue->isImm());
+        InitV = InitialValue->getImm();
+      }
+      if (InitV >= EndV)
+        return 0;
+      if (CondOpc == Hexagon::CMPbEQri_V4) {
+        if (!isInt<8>(InitV) || !isInt<8>(EndV))
+          return 0;
+      } else {  // Hexagon::CMPhEQri_V4
+        if (!isInt<16>(InitV) || !isInt<16>(EndV))
+          return 0;
+      }
+      Cmp = !Negated ? Comparison::EQ : Comparison::NE;
+      break;
+    }
+    default:
+      return 0;
+  }
+
+  if (isSwapped)
+   Cmp = Comparison::getSwappedComparison(Cmp);
+
+  if (InitialValue->isReg()) {
+    unsigned R = InitialValue->getReg();
+    MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
+    if (!MDT->properlyDominates(DefBB, Header))
+      return 0;
+    OldInsts.push_back(MRI->getVRegDef(R));
+  }
+  if (EndValue->isReg()) {
+    unsigned R = EndValue->getReg();
+    MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
+    if (!MDT->properlyDominates(DefBB, Header))
+      return 0;
+  }
+
+  return computeCount(L, InitialValue, EndValue, IVReg, IVBump, Cmp);
+}
+
+/// \brief Helper function that returns the expression that represents the
+/// number of times a loop iterates.  The function takes the operands that
+/// represent the loop start value, loop end value, and induction value.
+/// Based upon these operands, the function attempts to compute the trip count.
+CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
+                                               const MachineOperand *Start,
+                                               const MachineOperand *End,
+                                               unsigned IVReg,
+                                               int64_t IVBump,
+                                               Comparison::Kind Cmp) const {
+  // Cannot handle comparison EQ, i.e. while (A == B).
+  if (Cmp == Comparison::EQ)
+    return 0;
+
+  // Check if either the start or end values are an assignment of an immediate.
+  // If so, use the immediate value rather than the register.
+  if (Start->isReg()) {
+    const MachineInstr *StartValInstr = MRI->getVRegDef(Start->getReg());
+    if (StartValInstr && StartValInstr->getOpcode() == Hexagon::TFRI)
+      Start = &StartValInstr->getOperand(1);
+  }
+  if (End->isReg()) {
+    const MachineInstr *EndValInstr = MRI->getVRegDef(End->getReg());
+    if (EndValInstr && EndValInstr->getOpcode() == Hexagon::TFRI)
+      End = &EndValInstr->getOperand(1);
+  }
+
+  assert (Start->isReg() || Start->isImm());
+  assert (End->isReg() || End->isImm());
+
+  bool CmpLess =     Cmp & Comparison::L;
+  bool CmpGreater =  Cmp & Comparison::G;
+  bool CmpHasEqual = Cmp & Comparison::EQ;
+
+  // Avoid certain wrap-arounds.  This doesn't detect all wrap-arounds.
+  // If loop executes while iv is "less" with the iv value going down, then
+  // the iv must wrap.
+  if (CmpLess && IVBump < 0)
+    return 0;
+  // If loop executes while iv is "greater" with the iv value going up, then
+  // the iv must wrap.
+  if (CmpGreater && IVBump > 0)
+    return 0;
+
+  if (Start->isImm() && End->isImm()) {
+    // Both, start and end are immediates.
+    int64_t StartV = Start->getImm();
+    int64_t EndV = End->getImm();
+    int64_t Dist = EndV - StartV;
+    if (Dist == 0)
+      return 0;
+
+    bool Exact = (Dist % IVBump) == 0;
+
+    if (Cmp == Comparison::NE) {
+      if (!Exact)
+        return 0;
+      if ((Dist < 0) ^ (IVBump < 0))
+        return 0;
+    }
+
+    // For comparisons that include the final value (i.e. include equality
+    // with the final value), we need to increase the distance by 1.
+    if (CmpHasEqual)
+      Dist = Dist > 0 ? Dist+1 : Dist-1;
+
+    // assert (CmpLess => Dist > 0);
+    assert ((!CmpLess || Dist > 0) && "Loop should never iterate!");
+    // assert (CmpGreater => Dist < 0);
+    assert ((!CmpGreater || Dist < 0) && "Loop should never iterate!");
+
+    // "Normalized" distance, i.e. with the bump set to +-1.
+    int64_t Dist1 = (IVBump > 0) ? (Dist +  (IVBump-1)) /   IVBump
+                               :  (-Dist + (-IVBump-1)) / (-IVBump);
+    assert (Dist1 > 0 && "Fishy thing.  Both operands have the same sign.");
+
+    uint64_t Count = Dist1;
+
+    if (Count > 0xFFFFFFFFULL)
+      return 0;
+
+    return new CountValue(CountValue::CV_Immediate, Count);
+  }
+
+  // A general case: Start and End are some values, but the actual
+  // iteration count may not be available.  If it is not, insert
+  // a computation of it into the preheader.
+
+  // If the induction variable bump is not a power of 2, quit.
+  // Othwerise we'd need a general integer division.
+  if (!isPowerOf2_64(abs64(IVBump)))
+    return 0;
+
+  MachineBasicBlock *PH = Loop->getLoopPreheader();
+  assert (PH && "Should have a preheader by now");
+  MachineBasicBlock::iterator InsertPos = PH->getFirstTerminator();
+  DebugLoc DL = (InsertPos != PH->end()) ? InsertPos->getDebugLoc()
+                                         : DebugLoc();
+
+  // If Start is an immediate and End is a register, the trip count
+  // will be "reg - imm".  Hexagon's "subtract immediate" instruction
+  // is actually "reg + -imm".
+
+  // If the loop IV is going downwards, i.e. if the bump is negative,
+  // then the iteration count (computed as End-Start) will need to be
+  // negated.  To avoid the negation, just swap Start and End.
+  if (IVBump < 0) {
+    std::swap(Start, End);
+    IVBump = -IVBump;
+  }
+  // Cmp may now have a wrong direction, e.g.  LEs may now be GEs.
+  // Signedness, and "including equality" are preserved.
+
+  bool RegToImm = Start->isReg() && End->isImm(); // for (reg..imm)
+  bool RegToReg = Start->isReg() && End->isReg(); // for (reg..reg)
+
+  int64_t StartV = 0, EndV = 0;
+  if (Start->isImm())
+    StartV = Start->getImm();
+  if (End->isImm())
+    EndV = End->getImm();
+
+  int64_t AdjV = 0;
+  // To compute the iteration count, we would need this computation:
+  //   Count = (End - Start + (IVBump-1)) / IVBump
+  // or, when CmpHasEqual:
+  //   Count = (End - Start + (IVBump-1)+1) / IVBump
+  // The "IVBump-1" part is the adjustment (AdjV).  We can avoid
+  // generating an instruction specifically to add it if we can adjust
+  // the immediate values for Start or End.
+
+  if (CmpHasEqual) {
+    // Need to add 1 to the total iteration count.
+    if (Start->isImm())
+      StartV--;
+    else if (End->isImm())
+      EndV++;
+    else
+      AdjV += 1;
+  }
+
+  if (Cmp != Comparison::NE) {
+    if (Start->isImm())
+      StartV -= (IVBump-1);
+    else if (End->isImm())
+      EndV += (IVBump-1);
+    else
+      AdjV += (IVBump-1);
+  }
+
+  unsigned R = 0, SR = 0;
+  if (Start->isReg()) {
+    R = Start->getReg();
+    SR = Start->getSubReg();
+  } else {
+    R = End->getReg();
+    SR = End->getSubReg();
+  }
+  const TargetRegisterClass *RC = MRI->getRegClass(R);
+  // Hardware loops cannot handle 64-bit registers.  If it's a double
+  // register, it has to have a subregister.
+  if (!SR && RC == &Hexagon::DoubleRegsRegClass)
+    return 0;
+  const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
+
+  // Compute DistR (register with the distance between Start and End).
+  unsigned DistR, DistSR;
+
+  // Avoid special case, where the start value is an imm(0).
+  if (Start->isImm() && StartV == 0) {
+    DistR = End->getReg();
+    DistSR = End->getSubReg();
+  } else {
+    const MCInstrDesc &SubD = RegToReg ? TII->get(Hexagon::SUB_rr) :
+                              (RegToImm ? TII->get(Hexagon::SUB_ri) :
+                                          TII->get(Hexagon::ADD_ri));
+    unsigned SubR = MRI->createVirtualRegister(IntRC);
+    MachineInstrBuilder SubIB =
+      BuildMI(*PH, InsertPos, DL, SubD, SubR);
+
+    if (RegToReg) {
+      SubIB.addReg(End->getReg(), 0, End->getSubReg())
+           .addReg(Start->getReg(), 0, Start->getSubReg());
+    } else if (RegToImm) {
+      SubIB.addImm(EndV)
+           .addReg(Start->getReg(), 0, Start->getSubReg());
+    } else { // ImmToReg
+      SubIB.addReg(End->getReg(), 0, End->getSubReg())
+           .addImm(-StartV);
+    }
+    DistR = SubR;
+    DistSR = 0;
+  }
+
+  // From DistR, compute AdjR (register with the adjusted distance).
+  unsigned AdjR, AdjSR;
+
+  if (AdjV == 0) {
+    AdjR = DistR;
+    AdjSR = DistSR;
+  } else {
+    // Generate CountR = ADD DistR, AdjVal
+    unsigned AddR = MRI->createVirtualRegister(IntRC);
+    const MCInstrDesc &AddD = TII->get(Hexagon::ADD_ri);
+    BuildMI(*PH, InsertPos, DL, AddD, AddR)
+      .addReg(DistR, 0, DistSR)
+      .addImm(AdjV);
+
+    AdjR = AddR;
+    AdjSR = 0;
+  }
+
+  // From AdjR, compute CountR (register with the final count).
+  unsigned CountR, CountSR;
+
+  if (IVBump == 1) {
+    CountR = AdjR;
+    CountSR = AdjSR;
+  } else {
+    // The IV bump is a power of two. Log_2(IV bump) is the shift amount.
+    unsigned Shift = Log2_32(IVBump);
+
+    // Generate NormR = LSR DistR, Shift.
+    unsigned LsrR = MRI->createVirtualRegister(IntRC);
+    const MCInstrDesc &LsrD = TII->get(Hexagon::LSR_ri);
+    BuildMI(*PH, InsertPos, DL, LsrD, LsrR)
+      .addReg(AdjR, 0, AdjSR)
+      .addImm(Shift);
+
+    CountR = LsrR;
+    CountSR = 0;
+  }
+
+  return new CountValue(CountValue::CV_Register, CountR, CountSR);
+}
+
+
+/// \brief Return true if the operation is invalid within hardware loop.
+bool HexagonHardwareLoops::isInvalidLoopOperation(
+      const MachineInstr *MI) const {
+
+  // call is not allowed because the callee may use a hardware loop
+  if (MI->getDesc().isCall())
+    return true;
+
+  // do not allow nested hardware loops
+  if (isHardwareLoop(MI))
+    return true;
+
+  // check if the instruction defines a hardware loop register
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+    unsigned R = MO.getReg();
+    if (R == Hexagon::LC0 || R == Hexagon::LC1 ||
+        R == Hexagon::SA0 || R == Hexagon::SA1)
+      return true;
+  }
+  return false;
+}
+
+
+/// \brief - Return true if the loop contains an instruction that inhibits
+/// the use of the hardware loop function.
+bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const {
+  const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = Blocks[i];
+    for (MachineBasicBlock::iterator
+           MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) {
+      const MachineInstr *MI = &*MII;
+      if (isInvalidLoopOperation(MI))
+        return true;
+    }
+  }
+  return false;
+}
+
+
+/// \brief Returns true if the instruction is dead.  This was essentially
+/// copied from DeadMachineInstructionElim::isDead, but with special cases
+/// for inline asm, physical registers and instructions with side effects
+/// removed.
+bool HexagonHardwareLoops::isDead(const MachineInstr *MI,
+                             SmallVector<MachineInstr*, 1> &DeadPhis) const {
+  // Examine each operand.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (MRI->use_nodbg_empty(Reg))
+      continue;
+
+    typedef MachineRegisterInfo::use_nodbg_iterator use_nodbg_iterator;
+
+    // This instruction has users, but if the only user is the phi node for the
+    // parent block, and the only use of that phi node is this instruction, then
+    // this instruction is dead: both it (and the phi node) can be removed.
+    use_nodbg_iterator I = MRI->use_nodbg_begin(Reg);
+    use_nodbg_iterator End = MRI->use_nodbg_end();
+    if (llvm::next(I) != End || !I.getOperand().getParent()->isPHI())
+      return false;
+
+    MachineInstr *OnePhi = I.getOperand().getParent();
+    for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) {
+      const MachineOperand &OPO = OnePhi->getOperand(j);
+      if (!OPO.isReg() || !OPO.isDef())
+        continue;
+
+      unsigned OPReg = OPO.getReg();
+      use_nodbg_iterator nextJ;
+      for (use_nodbg_iterator J = MRI->use_nodbg_begin(OPReg);
+           J != End; J = nextJ) {
+        nextJ = llvm::next(J);
+        MachineOperand &Use = J.getOperand();
+        MachineInstr *UseMI = Use.getParent();
+
+        // If the phi node has a user that is not MI, bail...
+        if (MI != UseMI)
+          return false;
+      }
+    }
+    DeadPhis.push_back(OnePhi);
+  }
+
+  // If there are no defs with uses, the instruction is dead.
+  return true;
+}
+
+void HexagonHardwareLoops::removeIfDead(MachineInstr *MI) {
+  // This procedure was essentially copied from DeadMachineInstructionElim.
+
+  SmallVector<MachineInstr*, 1> DeadPhis;
+  if (isDead(MI, DeadPhis)) {
+    DEBUG(dbgs() << "HW looping will remove: " << *MI);
+
+    // It is possible that some DBG_VALUE instructions refer to this
+    // instruction.  Examine each def operand for such references;
+    // if found, mark the DBG_VALUE as undef (but don't delete it).
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+      MachineRegisterInfo::use_iterator nextI;
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
+           E = MRI->use_end(); I != E; I = nextI) {
+        nextI = llvm::next(I);  // I is invalidated by the setReg
+        MachineOperand &Use = I.getOperand();
+        MachineInstr *UseMI = Use.getParent();
+        if (UseMI == MI)
+          continue;
+        if (Use.isDebug())
+          UseMI->getOperand(0).setReg(0U);
+        // This may also be a "instr -> phi -> instr" case which can
+        // be removed too.
+      }
+    }
+
+    MI->eraseFromParent();
+    for (unsigned i = 0; i < DeadPhis.size(); ++i)
+      DeadPhis[i]->eraseFromParent();
+  }
+}
+
+/// \brief Check if the loop is a candidate for converting to a hardware
+/// loop.  If so, then perform the transformation.
+///
+/// This function works on innermost loops first.  A loop can be converted
+/// if it is a counting loop; either a register value or an immediate.
+///
+/// The code makes several assumptions about the representation of the loop
+/// in llvm.
+bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) {
+  // This is just for sanity.
+  assert(L->getHeader() && "Loop without a header?");
+
+  bool Changed = false;
+  // Process nested loops first.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    Changed |= convertToHardwareLoop(*I);
+
+  // If a nested loop has been converted, then we can't convert this loop.
+  if (Changed)
+    return Changed;
+
+#ifndef NDEBUG
+  // Stop trying after reaching the limit (if any).
+  int Limit = HWLoopLimit;
+  if (Limit >= 0) {
+    if (Counter >= HWLoopLimit)
+      return false;
+    Counter++;
+  }
+#endif
+
+  // Does the loop contain any invalid instructions?
+  if (containsInvalidInstruction(L))
+    return false;
+
+  // Is the induction variable bump feeding the latch condition?
+  if (!fixupInductionVariable(L))
+    return false;
+
+  MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate hw loop if the loop has more than one exit.
+  if (LastMBB == 0)
+    return false;
+
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+  if (LastI == LastMBB->end())
+    return false;
+
+  // Ensure the loop has a preheader: the loop instruction will be
+  // placed there.
+  bool NewPreheader = false;
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) {
+    Preheader = createPreheaderForLoop(L);
+    if (!Preheader)
+      return false;
+    NewPreheader = true;
+  }
+  MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+  SmallVector<MachineInstr*, 2> OldInsts;
+  // Are we able to determine the trip count for the loop?
+  CountValue *TripCount = getLoopTripCount(L, OldInsts);
+  if (TripCount == 0)
+    return false;
+
+  // Is the trip count available in the preheader?
+  if (TripCount->isReg()) {
+    // There will be a use of the register inserted into the preheader,
+    // so make sure that the register is actually defined at that point.
+    MachineInstr *TCDef = MRI->getVRegDef(TripCount->getReg());
+    MachineBasicBlock *BBDef = TCDef->getParent();
+    if (!NewPreheader) {
+      if (!MDT->dominates(BBDef, Preheader))
+        return false;
+    } else {
+      // If we have just created a preheader, the dominator tree won't be
+      // aware of it.  Check if the definition of the register dominates
+      // the header, but is not the header itself.
+      if (!MDT->properlyDominates(BBDef, L->getHeader()))
+        return false;
+    }
+  }
+
+  // Determine the loop start.
+  MachineBasicBlock *LoopStart = L->getTopBlock();
+  if (L->getLoopLatch() != LastMBB) {
+    // When the exit and latch are not the same, use the latch block as the
+    // start.
+    // The loop start address is used only after the 1st iteration, and the
+    // loop latch may contains instrs. that need to be executed after the
+    // first iteration.
+    LoopStart = L->getLoopLatch();
+    // Make sure the latch is a successor of the exit, otherwise it won't work.
+    if (!LastMBB->isSuccessor(LoopStart))
+      return false;
+  }
+
+  // Convert the loop to a hardware loop.
+  DEBUG(dbgs() << "Change to hardware loop at "; L->dump());
+  DebugLoc DL;
+  if (InsertPos != Preheader->end())
+    DL = InsertPos->getDebugLoc();
+
+  if (TripCount->isReg()) {
+    // Create a copy of the loop count register.
+    unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+    BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg)
+      .addReg(TripCount->getReg(), 0, TripCount->getSubReg());
+    // Add the Loop instruction to the beginning of the loop.
+    BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_r))
+      .addMBB(LoopStart)
+      .addReg(CountReg);
+  } else {
+    assert(TripCount->isImm() && "Expecting immediate value for trip count");
+    // Add the Loop immediate instruction to the beginning of the loop,
+    // if the immediate fits in the instructions.  Otherwise, we need to
+    // create a new virtual register.
+    int64_t CountImm = TripCount->getImm();
+    if (!TII->isValidOffset(Hexagon::LOOP0_i, CountImm)) {
+      unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::TFRI), CountReg)
+        .addImm(CountImm);
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_r))
+        .addMBB(LoopStart).addReg(CountReg);
+    } else
+      BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::LOOP0_i))
+        .addMBB(LoopStart).addImm(CountImm);
+  }
+
+  // Make sure the loop start always has a reference in the CFG.  We need
+  // to create a BlockAddress operand to get this mechanism to work both the
+  // MachineBasicBlock and BasicBlock objects need the flag set.
+  LoopStart->setHasAddressTaken();
+  // This line is needed to set the hasAddressTaken flag on the BasicBlock
+  // object.
+  BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
+
+  // Replace the loop branch with an endloop instruction.
+  DebugLoc LastIDL = LastI->getDebugLoc();
+  BuildMI(*LastMBB, LastI, LastIDL,
+          TII->get(Hexagon::ENDLOOP0)).addMBB(LoopStart);
+
+  // The loop ends with either:
+  //  - a conditional branch followed by an unconditional branch, or
+  //  - a conditional branch to the loop start.
+  if (LastI->getOpcode() == Hexagon::JMP_c ||
+      LastI->getOpcode() == Hexagon::JMP_cNot) {
+    // Delete one and change/add an uncond. branch to out of the loop.
+    MachineBasicBlock *BranchTarget = LastI->getOperand(1).getMBB();
+    LastI = LastMBB->erase(LastI);
+    if (!L->contains(BranchTarget)) {
+      if (LastI != LastMBB->end())
+        LastI = LastMBB->erase(LastI);
+      SmallVector<MachineOperand, 0> Cond;
+      TII->InsertBranch(*LastMBB, BranchTarget, 0, Cond, LastIDL);
+    }
+  } else {
+    // Conditional branch to loop start; just delete it.
+    LastMBB->erase(LastI);
+  }
+  delete TripCount;
+
+  // The induction operation and the comparison may now be
+  // unneeded. If these are unneeded, then remove them.
+  for (unsigned i = 0; i < OldInsts.size(); ++i)
+    removeIfDead(OldInsts[i]);
+
+  ++NumHWLoops;
+  return true;
+}
+
+
+bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI,
+                                            MachineInstr *CmpI) {
+  assert (BumpI != CmpI && "Bump and compare in the same instruction?");
+
+  MachineBasicBlock *BB = BumpI->getParent();
+  if (CmpI->getParent() != BB)
+    return false;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  // Check if things are in order to begin with.
+  for (instr_iterator I = BumpI, E = BB->instr_end(); I != E; ++I)
+    if (&*I == CmpI)
+      return true;
+
+  // Out of order.
+  unsigned PredR = CmpI->getOperand(0).getReg();
+  bool FoundBump = false;
+  instr_iterator CmpIt = CmpI, NextIt = llvm::next(CmpIt);
+  for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) {
+    MachineInstr *In = &*I;
+    for (unsigned i = 0, n = In->getNumOperands(); i < n; ++i) {
+      MachineOperand &MO = In->getOperand(i);
+      if (MO.isReg() && MO.isUse()) {
+        if (MO.getReg() == PredR)  // Found an intervening use of PredR.
+          return false;
+      }
+    }
+
+    if (In == BumpI) {
+      instr_iterator After = BumpI;
+      instr_iterator From = CmpI;
+      BB->splice(llvm::next(After), BB, From);
+      FoundBump = true;
+      break;
+    }
+  }
+  assert (FoundBump && "Cannot determine instruction order");
+  return FoundBump;
+}
+
+
+MachineInstr *HexagonHardwareLoops::defWithImmediate(unsigned R) {
+  MachineInstr *DI = MRI->getVRegDef(R);
+  unsigned DOpc = DI->getOpcode();
+  switch (DOpc) {
+    case Hexagon::TFRI:
+    case Hexagon::TFRI64:
+    case Hexagon::CONST32_Int_Real:
+    case Hexagon::CONST64_Int_Real:
+      return DI;
+  }
+  return 0;
+}
+
+
+int64_t HexagonHardwareLoops::getImmediate(MachineOperand &MO) {
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isReg());
+  unsigned R = MO.getReg();
+  MachineInstr *DI = defWithImmediate(R);
+  assert(DI && "Need an immediate operand");
+  // All currently supported "define-with-immediate" instructions have the
+  // actual immediate value in the operand(1).
+  int64_t v = DI->getOperand(1).getImm();
+  return v;
+}
+
+
+void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) {
+  if (MO.isImm()) {
+    MO.setImm(Val);
+    return;
+  }
+
+  assert(MO.isReg());
+  unsigned R = MO.getReg();
+  MachineInstr *DI = defWithImmediate(R);
+  if (MRI->hasOneNonDBGUse(R)) {
+    // If R has only one use, then just change its defining instruction to
+    // the new immediate value.
+    DI->getOperand(1).setImm(Val);
+    return;
+  }
+
+  const TargetRegisterClass *RC = MRI->getRegClass(R);
+  unsigned NewR = MRI->createVirtualRegister(RC);
+  MachineBasicBlock &B = *DI->getParent();
+  DebugLoc DL = DI->getDebugLoc();
+  BuildMI(B, DI, DL, TII->get(DI->getOpcode()), NewR)
+    .addImm(Val);
+  MO.setReg(NewR);
+}
+
+
+bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+
+  if (!Header || !Preheader || !Latch)
+    return false;
+
+  // These data structures follow the same concept as the corresponding
+  // ones in findInductionRegister (where some comments are).
+  typedef std::pair<unsigned,int64_t> RegisterBump;
+  typedef std::pair<unsigned,RegisterBump> RegisterInduction;
+  typedef std::set<RegisterInduction> RegisterInductionSet;
+
+  // Register candidates for induction variables, with their associated bumps.
+  RegisterInductionSet IndRegs;
+
+  // Look for induction patterns:
+  //   vreg1 = PHI ..., [ latch, vreg2 ]
+  //   vreg2 = ADD vreg1, imm
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+  for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+       I != E && I->isPHI(); ++I) {
+    MachineInstr *Phi = &*I;
+
+    // Have a PHI instruction.
+    for (unsigned i = 1, n = Phi->getNumOperands(); i < n; i += 2) {
+      if (Phi->getOperand(i+1).getMBB() != Latch)
+        continue;
+
+      unsigned PhiReg = Phi->getOperand(i).getReg();
+      MachineInstr *DI = MRI->getVRegDef(PhiReg);
+      unsigned UpdOpc = DI->getOpcode();
+      bool isAdd = (UpdOpc == Hexagon::ADD_ri);
+
+      if (isAdd) {
+        // If the register operand to the add/sub is the PHI we are looking
+        // at, this meets the induction pattern.
+        unsigned IndReg = DI->getOperand(1).getReg();
+        if (MRI->getVRegDef(IndReg) == Phi) {
+          unsigned UpdReg = DI->getOperand(0).getReg();
+          int64_t V = DI->getOperand(2).getImm();
+          IndRegs.insert(std::make_pair(UpdReg, std::make_pair(IndReg, V)));
+        }
+      }
+    }  // for (i)
+  }  // for (instr)
+
+  if (IndRegs.empty())
+    return false;
+
+  MachineBasicBlock *TB = 0, *FB = 0;
+  SmallVector<MachineOperand,2> Cond;
+  // AnalyzeBranch returns true if it fails to analyze branch.
+  bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false);
+  if (NotAnalyzed)
+    return false;
+
+  // Check if the latch branch is unconditional.
+  if (Cond.empty())
+    return false;
+
+  if (TB != Header && FB != Header)
+    // The latch does not go back to the header.  Not a latch we know and love.
+    return false;
+
+  // Expecting a predicate register as a condition.  It won't be a hardware
+  // predicate register at this point yet, just a vreg.
+  // HexagonInstrInfo::AnalyzeBranch for negated branches inserts imm(0)
+  // into Cond, followed by the predicate register.  For non-negated branches
+  // it's just the register.
+  unsigned CSz = Cond.size();
+  if (CSz != 1 && CSz != 2)
+    return false;
+
+  unsigned P = Cond[CSz-1].getReg();
+  MachineInstr *PredDef = MRI->getVRegDef(P);
+
+  if (!PredDef->isCompare())
+    return false;
+
+  SmallSet<unsigned,2> CmpRegs;
+  MachineOperand *CmpImmOp = 0;
+
+  // Go over all operands to the compare and look for immediate and register
+  // operands.  Assume that if the compare has a single register use and a
+  // single immediate operand, then the register is being compared with the
+  // immediate value.
+  for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) {
+    MachineOperand &MO = PredDef->getOperand(i);
+    if (MO.isReg()) {
+      // Skip all implicit references.  In one case there was:
+      //   %vreg140<def> = FCMPUGT32_rr %vreg138, %vreg139, %USR<imp-use>
+      if (MO.isImplicit())
+        continue;
+      if (MO.isUse()) {
+        unsigned R = MO.getReg();
+        if (!defWithImmediate(R)) {
+          CmpRegs.insert(MO.getReg());
+          continue;
+        }
+        // Consider the register to be the "immediate" operand.
+        if (CmpImmOp)
+          return false;
+        CmpImmOp = &MO;
+      }
+    } else if (MO.isImm()) {
+      if (CmpImmOp)    // A second immediate argument?  Confusing.  Bail out.
+        return false;
+      CmpImmOp = &MO;
+    }
+  }
+
+  if (CmpRegs.empty())
+    return false;
+
+  // Check if the compared register follows the order we want.  Fix if needed.
+  for (RegisterInductionSet::iterator I = IndRegs.begin(), E = IndRegs.end();
+       I != E; ++I) {
+    // This is a success.  If the register used in the comparison is one that
+    // we have identified as a bumped (updated) induction register, there is
+    // nothing to do.
+    if (CmpRegs.count(I->first))
+      return true;
+
+    // Otherwise, if the register being compared comes out of a PHI node,
+    // and has been recognized as following the induction pattern, and is
+    // compared against an immediate, we can fix it.
+    const RegisterBump &RB = I->second;
+    if (CmpRegs.count(RB.first)) {
+      if (!CmpImmOp)
+        return false;
+
+      int64_t CmpImm = getImmediate(*CmpImmOp);
+      int64_t V = RB.second;
+      if (V > 0 && CmpImm+V < CmpImm)  // Overflow (64-bit).
+        return false;
+      if (V < 0 && CmpImm+V > CmpImm)  // Overflow (64-bit).
+        return false;
+      CmpImm += V;
+      // Some forms of cmp-immediate allow u9 and s10.  Assume the worst case
+      // scenario, i.e. an 8-bit value.
+      if (CmpImmOp->isImm() && !isInt<8>(CmpImm))
+        return false;
+
+      // Make sure that the compare happens after the bump.  Otherwise,
+      // after the fixup, the compare would use a yet-undefined register.
+      MachineInstr *BumpI = MRI->getVRegDef(I->first);
+      bool Order = orderBumpCompare(BumpI, PredDef);
+      if (!Order)
+        return false;
+
+      // Finally, fix the compare instruction.
+      setImmediate(*CmpImmOp, CmpImm);
+      for (unsigned i = 0, n = PredDef->getNumOperands(); i < n; ++i) {
+        MachineOperand &MO = PredDef->getOperand(i);
+        if (MO.isReg() && MO.getReg() == RB.first) {
+          MO.setReg(I->first);
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+
+/// \brief Create a preheader for a given loop.
+MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
+      MachineLoop *L) {
+  if (MachineBasicBlock *TmpPH = L->getLoopPreheader())
+    return TmpPH;
+
+  MachineBasicBlock *Header = L->getHeader();
+  MachineBasicBlock *Latch = L->getLoopLatch();
+  MachineFunction *MF = Header->getParent();
+  DebugLoc DL;
+
+  if (!Latch || Header->hasAddressTaken())
+    return 0;
+
+  typedef MachineBasicBlock::instr_iterator instr_iterator;
+
+  // Verify that all existing predecessors have analyzable branches
+  // (or no branches at all).
+  typedef std::vector<MachineBasicBlock*> MBBVector;
+  MBBVector Preds(Header->pred_begin(), Header->pred_end());
+  SmallVector<MachineOperand,2> Tmp1;
+  MachineBasicBlock *TB = 0, *FB = 0;
+
+  if (TII->AnalyzeBranch(*Latch, TB, FB, Tmp1, false))
+    return 0;
+
+  for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
+    MachineBasicBlock *PB = *I;
+    if (PB != Latch) {
+      bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp1, false);
+      if (NotAnalyzed)
+        return 0;
+    }
+  }
+
+  MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock();
+  MF->insert(Header, NewPH);
+
+  if (Header->pred_size() > 2) {
+    // Ensure that the header has only two predecessors: the preheader and
+    // the loop latch.  Any additional predecessors of the header should
+    // join at the newly created preheader.  Inspect all PHI nodes from the
+    // header and create appropriate corresponding PHI nodes in the preheader.
+
+    for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+         I != E && I->isPHI(); ++I) {
+      MachineInstr *PN = &*I;
+
+      const MCInstrDesc &PD = TII->get(TargetOpcode::PHI);
+      MachineInstr *NewPN = MF->CreateMachineInstr(PD, DL);
+      NewPH->insert(NewPH->end(), NewPN);
+
+      unsigned PR = PN->getOperand(0).getReg();
+      const TargetRegisterClass *RC = MRI->getRegClass(PR);
+      unsigned NewPR = MRI->createVirtualRegister(RC);
+      NewPN->addOperand(MachineOperand::CreateReg(NewPR, true));
+
+      // Copy all non-latch operands of a header's PHI node to the newly
+      // created PHI node in the preheader.
+      for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) {
+        unsigned PredR = PN->getOperand(i).getReg();
+        MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB();
+        if (PredB == Latch)
+          continue;
+
+        NewPN->addOperand(MachineOperand::CreateReg(PredR, false));
+        NewPN->addOperand(MachineOperand::CreateMBB(PredB));
+      }
+
+      // Remove copied operands from the old PHI node and add the value
+      // coming from the preheader's PHI.
+      for (int i = PN->getNumOperands()-2; i > 0; i -= 2) {
+        MachineBasicBlock *PredB = PN->getOperand(i+1).getMBB();
+        if (PredB != Latch) {
+          PN->RemoveOperand(i+1);
+          PN->RemoveOperand(i);
+        }
+      }
+      PN->addOperand(MachineOperand::CreateReg(NewPR, false));
+      PN->addOperand(MachineOperand::CreateMBB(NewPH));
+    }
+
+  } else {
+    assert(Header->pred_size() == 2);
+
+    // The header has only two predecessors, but the non-latch predecessor
+    // is not a preheader (e.g. it has other successors, etc.)
+    // In such a case we don't need any extra PHI nodes in the new preheader,
+    // all we need is to adjust existing PHIs in the header to now refer to
+    // the new preheader.
+    for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
+         I != E && I->isPHI(); ++I) {
+      MachineInstr *PN = &*I;
+      for (unsigned i = 1, n = PN->getNumOperands(); i < n; i += 2) {
+        MachineOperand &MO = PN->getOperand(i+1);
+        if (MO.getMBB() != Latch)
+          MO.setMBB(NewPH);
+      }
+    }
+  }
+
+  // "Reroute" the CFG edges to link in the new preheader.
+  // If any of the predecessors falls through to the header, insert a branch
+  // to the new preheader in that place.
+  SmallVector<MachineOperand,1> Tmp2;
+  SmallVector<MachineOperand,1> EmptyCond;
+
+  TB = FB = 0;
+
+  for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
+    MachineBasicBlock *PB = *I;
+    if (PB != Latch) {
+      Tmp2.clear();
+      bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp2, false);
+      (void)NotAnalyzed; // supress compiler warning
+      assert (!NotAnalyzed && "Should be analyzable!");
+      if (TB != Header && (Tmp2.empty() || FB != Header))
+        TII->InsertBranch(*PB, NewPH, 0, EmptyCond, DL);
+      PB->ReplaceUsesOfBlockWith(Header, NewPH);
+    }
+  }
+
+  // It can happen that the latch block will fall through into the header.
+  // Insert an unconditional branch to the header.
+  TB = FB = 0;
+  bool LatchNotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Tmp2, false);
+  (void)LatchNotAnalyzed; // supress compiler warning
+  assert (!LatchNotAnalyzed && "Should be analyzable!");
+  if (!TB && !FB)
+    TII->InsertBranch(*Latch, Header, 0, EmptyCond, DL);
+
+  // Finally, the branch from the preheader to the header.
+  TII->InsertBranch(*NewPH, Header, 0, EmptyCond, DL);
+  NewPH->addSuccessor(Header);
+
+  return NewPH;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
new file mode 100644
index 000000000000..8fc9ba1ee8cf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -0,0 +1,1663 @@
+//===-- HexagonISelDAGToDAG.cpp - A dag to dag inst selector for Hexagon --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Hexagon target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-isel"
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+static
+cl::opt<unsigned>
+MaxNumOfUsesForConstExtenders("ga-max-num-uses-for-constant-extenders",
+  cl::Hidden, cl::init(2),
+  cl::desc("Maximum number of uses of a global address such that we still us a"
+           "constant extended instruction"));
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+  void initializeHexagonDAGToDAGISelPass(PassRegistry&);
+}
+
+//===--------------------------------------------------------------------===//
+/// HexagonDAGToDAGISel - Hexagon specific code to select Hexagon machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class HexagonDAGToDAGISel : public SelectionDAGISel {
+  /// Subtarget - Keep a pointer to the Hexagon Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const HexagonSubtarget &Subtarget;
+
+  // Keep a reference to HexagonTargetMachine.
+  HexagonTargetMachine& TM;
+  const HexagonInstrInfo *TII;
+  DenseMap<const GlobalValue *, unsigned> GlobalAddressUseCountMap;
+public:
+  explicit HexagonDAGToDAGISel(HexagonTargetMachine &targetmachine,
+                               CodeGenOpt::Level OptLevel)
+    : SelectionDAGISel(targetmachine, OptLevel),
+      Subtarget(targetmachine.getSubtarget<HexagonSubtarget>()),
+      TM(targetmachine),
+      TII(static_cast<const HexagonInstrInfo*>(TM.getInstrInfo())) {
+    initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry());
+  }
+  bool hasNumUsesBelowThresGA(SDNode *N) const;
+
+  SDNode *Select(SDNode *N);
+
+  // Complex Pattern Selectors.
+  inline bool foldGlobalAddress(SDValue &N, SDValue &R);
+  inline bool foldGlobalAddressGP(SDValue &N, SDValue &R);
+  bool foldGlobalAddressImpl(SDValue &N, SDValue &R, bool ShouldLookForGP);
+  bool SelectADDRri(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriS11_0(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriS11_1(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriS11_2(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectMEMriS11_2(SDValue& Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRriS11_3(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRrr(SDValue &Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRriU6_0(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriU6_1(SDValue& N, SDValue &R1, SDValue &R2);
+  bool SelectADDRriU6_2(SDValue& N, SDValue &R1, SDValue &R2);
+
+  virtual const char *getPassName() const {
+    return "Hexagon DAG->DAG Pattern Instruction Selection";
+  }
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                            char ConstraintCode,
+                                            std::vector<SDValue> &OutOps);
+  bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset);
+
+  SDNode *SelectLoad(SDNode *N);
+  SDNode *SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl);
+  SDNode *SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl);
+  SDNode *SelectIndexedLoadZeroExtend64(LoadSDNode *LD, unsigned Opcode,
+                                        DebugLoc dl);
+  SDNode *SelectIndexedLoadSignExtend64(LoadSDNode *LD, unsigned Opcode,
+                                        DebugLoc dl);
+  SDNode *SelectBaseOffsetStore(StoreSDNode *ST, DebugLoc dl);
+  SDNode *SelectIndexedStore(StoreSDNode *ST, DebugLoc dl);
+  SDNode *SelectStore(SDNode *N);
+  SDNode *SelectSHL(SDNode *N);
+  SDNode *SelectSelect(SDNode *N);
+  SDNode *SelectTruncate(SDNode *N);
+  SDNode *SelectMul(SDNode *N);
+  SDNode *SelectZeroExtend(SDNode *N);
+  SDNode *SelectIntrinsicWOChain(SDNode *N);
+  SDNode *SelectIntrinsicWChain(SDNode *N);
+  SDNode *SelectConstant(SDNode *N);
+  SDNode *SelectConstantFP(SDNode *N);
+  SDNode *SelectAdd(SDNode *N);
+  bool isConstExtProfitable(SDNode *N) const;
+
+// XformMskToBitPosU5Imm - Returns the bit position which
+// the single bit 32 bit mask represents.
+// Used in Clr and Set bit immediate memops.
+SDValue XformMskToBitPosU5Imm(uint32_t Imm) {
+  int32_t bitPos;
+  bitPos = Log2_32(Imm);
+  assert(bitPos >= 0 && bitPos < 32 &&
+         "Constant out of range for 32 BitPos Memops");
+  return CurDAG->getTargetConstant(bitPos, MVT::i32);
+}
+
+// XformMskToBitPosU4Imm - Returns the bit position which the single bit 16 bit
+// mask represents. Used in Clr and Set bit immediate memops.
+SDValue XformMskToBitPosU4Imm(uint16_t Imm) {
+  return XformMskToBitPosU5Imm(Imm);
+}
+
+// XformMskToBitPosU3Imm - Returns the bit position which the single bit 8 bit
+// mask represents. Used in Clr and Set bit immediate memops.
+SDValue XformMskToBitPosU3Imm(uint8_t Imm) {
+  return XformMskToBitPosU5Imm(Imm);
+}
+
+// Return true if there is exactly one bit set in V, i.e., if V is one of the
+// following integers: 2^0, 2^1, ..., 2^31.
+bool ImmIsSingleBit(uint32_t v) const {
+  uint32_t c = CountPopulation_64(v);
+  // Only return true if we counted 1 bit.
+  return c == 1;
+}
+
+// XformM5ToU5Imm - Return a target constant with the specified value, of type
+// i32 where the negative literal is transformed into a positive literal for
+// use in -= memops.
+inline SDValue XformM5ToU5Imm(signed Imm) {
+   assert( (Imm >= -31 && Imm <= -1)  && "Constant out of range for Memops");
+   return CurDAG->getTargetConstant( - Imm, MVT::i32);
+}
+
+
+// XformU7ToU7M1Imm - Return a target constant decremented by 1, in range
+// [1..128], used in cmpb.gtu instructions.
+inline SDValue XformU7ToU7M1Imm(signed Imm) {
+  assert((Imm >= 1 && Imm <= 128) && "Constant out of range for cmpb op");
+  return CurDAG->getTargetConstant(Imm - 1, MVT::i8);
+}
+
+// Include the pieces autogenerated from the target description.
+#include "HexagonGenDAGISel.inc"
+};
+}  // end anonymous namespace
+
+
+/// createHexagonISelDag - This pass converts a legalized DAG into a
+/// Hexagon-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM,
+                                         CodeGenOpt::Level OptLevel) {
+  return new HexagonDAGToDAGISel(TM, OptLevel);
+}
+
+static void initializePassOnce(PassRegistry &Registry) {
+  const char *Name = "Hexagon DAG->DAG Pattern Instruction Selection";
+  PassInfo *PI = new PassInfo(Name, "hexagon-isel",
+                              &SelectionDAGISel::ID, 0, false, false);
+  Registry.registerPass(*PI, true);
+}
+
+void llvm::initializeHexagonDAGToDAGISelPass(PassRegistry &Registry) {
+  CALL_ONCE_INITIALIZATION(initializePassOnce)
+}
+
+
+static bool IsS11_0_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<11>(v);
+}
+
+
+static bool IsS11_1_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,1>(v);
+}
+
+
+static bool IsS11_2_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,2>(v);
+}
+
+
+static bool IsS11_3_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,3>(v);
+}
+
+
+static bool IsU6_0_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // u6 predicate - True if the immediate fits in a 6-bit unsigned extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<6>(v);
+}
+
+
+static bool IsU6_1_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // u6 predicate - True if the immediate fits in a 6-bit unsigned extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,1>(v);
+}
+
+
+static bool IsU6_2_Offset(SDNode * S) {
+    ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // u6 predicate - True if the immediate fits in a 6-bit unsigned extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,2>(v);
+}
+
+
+// Intrinsics that return a a predicate.
+static unsigned doesIntrinsicReturnPredicate(unsigned ID)
+{
+  switch (ID) {
+    default:
+      return 0;
+    case Intrinsic::hexagon_C2_cmpeq:
+    case Intrinsic::hexagon_C2_cmpgt:
+    case Intrinsic::hexagon_C2_cmpgtu:
+    case Intrinsic::hexagon_C2_cmpgtup:
+    case Intrinsic::hexagon_C2_cmpgtp:
+    case Intrinsic::hexagon_C2_cmpeqp:
+    case Intrinsic::hexagon_C2_bitsset:
+    case Intrinsic::hexagon_C2_bitsclr:
+    case Intrinsic::hexagon_C2_cmpeqi:
+    case Intrinsic::hexagon_C2_cmpgti:
+    case Intrinsic::hexagon_C2_cmpgtui:
+    case Intrinsic::hexagon_C2_cmpgei:
+    case Intrinsic::hexagon_C2_cmpgeui:
+    case Intrinsic::hexagon_C2_cmplt:
+    case Intrinsic::hexagon_C2_cmpltu:
+    case Intrinsic::hexagon_C2_bitsclri:
+    case Intrinsic::hexagon_C2_and:
+    case Intrinsic::hexagon_C2_or:
+    case Intrinsic::hexagon_C2_xor:
+    case Intrinsic::hexagon_C2_andn:
+    case Intrinsic::hexagon_C2_not:
+    case Intrinsic::hexagon_C2_orn:
+    case Intrinsic::hexagon_C2_pxfer_map:
+    case Intrinsic::hexagon_C2_any8:
+    case Intrinsic::hexagon_C2_all8:
+    case Intrinsic::hexagon_A2_vcmpbeq:
+    case Intrinsic::hexagon_A2_vcmpbgtu:
+    case Intrinsic::hexagon_A2_vcmpheq:
+    case Intrinsic::hexagon_A2_vcmphgt:
+    case Intrinsic::hexagon_A2_vcmphgtu:
+    case Intrinsic::hexagon_A2_vcmpweq:
+    case Intrinsic::hexagon_A2_vcmpwgt:
+    case Intrinsic::hexagon_A2_vcmpwgtu:
+    case Intrinsic::hexagon_C2_tfrrp:
+    case Intrinsic::hexagon_S2_tstbit_i:
+    case Intrinsic::hexagon_S2_tstbit_r:
+      return 1;
+  }
+}
+
+
+// Intrinsics that have predicate operands.
+static unsigned doesIntrinsicContainPredicate(unsigned ID)
+{
+  switch (ID) {
+    default:
+      return 0;
+    case Intrinsic::hexagon_C2_tfrpr:
+      return Hexagon::TFR_RsPd;
+    case Intrinsic::hexagon_C2_and:
+      return Hexagon::AND_pp;
+    case Intrinsic::hexagon_C2_xor:
+      return Hexagon::XOR_pp;
+    case Intrinsic::hexagon_C2_or:
+      return Hexagon::OR_pp;
+    case Intrinsic::hexagon_C2_not:
+      return Hexagon::NOT_p;
+    case Intrinsic::hexagon_C2_any8:
+      return Hexagon::ANY_pp;
+    case Intrinsic::hexagon_C2_all8:
+      return Hexagon::ALL_pp;
+    case Intrinsic::hexagon_C2_vitpack:
+      return Hexagon::VITPACK_pp;
+    case Intrinsic::hexagon_C2_mask:
+      return Hexagon::MASK_p;
+    case Intrinsic::hexagon_C2_mux:
+      return Hexagon::MUX_rr;
+
+      // Mapping hexagon_C2_muxir to MUX_pri.  This is pretty weird - but
+      // that's how it's mapped in q6protos.h.
+    case Intrinsic::hexagon_C2_muxir:
+      return Hexagon::MUX_ri;
+
+      // Mapping hexagon_C2_muxri to MUX_pir.  This is pretty weird - but
+      // that's how it's mapped in q6protos.h.
+    case Intrinsic::hexagon_C2_muxri:
+      return Hexagon::MUX_ir;
+
+    case Intrinsic::hexagon_C2_muxii:
+      return Hexagon::MUX_ii;
+    case Intrinsic::hexagon_C2_vmux:
+      return Hexagon::VMUX_prr64;
+    case Intrinsic::hexagon_S2_valignrb:
+      return Hexagon::VALIGN_rrp;
+    case Intrinsic::hexagon_S2_vsplicerb:
+      return Hexagon::VSPLICE_rrp;
+  }
+}
+
+
+static bool OffsetFitsS11(EVT MemType, int64_t Offset) {
+  if (MemType == MVT::i64 && isShiftedInt<11,3>(Offset)) {
+    return true;
+  }
+  if (MemType == MVT::i32 && isShiftedInt<11,2>(Offset)) {
+    return true;
+  }
+  if (MemType == MVT::i16 && isShiftedInt<11,1>(Offset)) {
+    return true;
+  }
+  if (MemType == MVT::i8 && isInt<11>(Offset)) {
+    return true;
+  }
+  return false;
+}
+
+
+//
+// Try to lower loads of GlobalAdresses into base+offset loads.  Custom
+// lowering for GlobalAddress nodes has already turned it into a
+// CONST32.
+//
+SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) {
+  SDValue Chain = LD->getChain();
+  SDNode* Const32 = LD->getBasePtr().getNode();
+  unsigned Opcode = 0;
+
+  if (Const32->getOpcode() == HexagonISD::CONST32 &&
+      ISD::isNormalLoad(LD)) {
+    SDValue Base = Const32->getOperand(0);
+    EVT LoadedVT = LD->getMemoryVT();
+    int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset();
+    if (Offset != 0 && OffsetFitsS11(LoadedVT, Offset)) {
+      MVT PointerTy = TLI.getPointerTy();
+      const GlobalValue* GV =
+        cast<GlobalAddressSDNode>(Base)->getGlobal();
+      SDValue TargAddr =
+        CurDAG->getTargetGlobalAddress(GV, dl, PointerTy, 0);
+      SDNode* NewBase = CurDAG->getMachineNode(Hexagon::CONST32_set,
+                                               dl, PointerTy,
+                                               TargAddr);
+      // Figure out base + offset opcode
+      if (LoadedVT == MVT::i64) Opcode = Hexagon::LDrid_indexed;
+      else if (LoadedVT == MVT::i32) Opcode = Hexagon::LDriw_indexed;
+      else if (LoadedVT == MVT::i16) Opcode = Hexagon::LDrih_indexed;
+      else if (LoadedVT == MVT::i8) Opcode = Hexagon::LDrib_indexed;
+      else llvm_unreachable("unknown memory type");
+
+      // Build indexed load.
+      SDValue TargetConstOff = CurDAG->getTargetConstant(Offset, PointerTy);
+      SDNode* Result = CurDAG->getMachineNode(Opcode, dl,
+                                              LD->getValueType(0),
+                                              MVT::Other,
+                                              SDValue(NewBase,0),
+                                              TargetConstOff,
+                                              Chain);
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = LD->getMemOperand();
+      cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+      ReplaceUses(LD, Result);
+      return Result;
+    }
+  }
+
+  return SelectCode(LD);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
+                                                           unsigned Opcode,
+                                                           DebugLoc dl)
+{
+  SDValue Chain = LD->getChain();
+  EVT LoadedVT = LD->getMemoryVT();
+  SDValue Base = LD->getBasePtr();
+  SDValue Offset = LD->getOffset();
+  SDNode *OffsetNode = Offset.getNode();
+  int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+  SDValue N1 = LD->getOperand(1);
+  SDValue CPTmpN1_0;
+  SDValue CPTmpN1_1;
+  if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
+      N1.getNode()->getValueType(0) == MVT::i32) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val)) {
+      SDValue TargetConst = CurDAG->getTargetConstant(Val, MVT::i32);
+      SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32,
+                                                MVT::Other, Base, TargetConst,
+                                                Chain);
+      SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::SXTW, dl, MVT::i64,
+                                                SDValue(Result_1, 0));
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = LD->getMemOperand();
+      cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+      const SDValue Froms[] = { SDValue(LD, 0),
+                                SDValue(LD, 1),
+                                SDValue(LD, 2)
+      };
+      const SDValue Tos[]   = { SDValue(Result_2, 0),
+                                SDValue(Result_1, 1),
+                                SDValue(Result_1, 2)
+      };
+      ReplaceUses(Froms, Tos, 3);
+      return Result_2;
+    }
+    SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+    SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+    SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
+                                              MVT::Other, Base, TargetConst0,
+                                              Chain);
+    SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::SXTW, dl,
+                                                MVT::i64, SDValue(Result_1, 0));
+    SDNode* Result_3 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl,
+                                              MVT::i32, Base, TargetConstVal,
+                                                SDValue(Result_1, 1));
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = LD->getMemOperand();
+    cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+    const SDValue Froms[] = { SDValue(LD, 0),
+                              SDValue(LD, 1),
+                              SDValue(LD, 2)
+    };
+    const SDValue Tos[]   = { SDValue(Result_2, 0),
+                              SDValue(Result_3, 0),
+                              SDValue(Result_1, 1)
+    };
+    ReplaceUses(Froms, Tos, 3);
+    return Result_2;
+  }
+  return SelectCode(LD);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
+                                                           unsigned Opcode,
+                                                           DebugLoc dl)
+{
+  SDValue Chain = LD->getChain();
+  EVT LoadedVT = LD->getMemoryVT();
+  SDValue Base = LD->getBasePtr();
+  SDValue Offset = LD->getOffset();
+  SDNode *OffsetNode = Offset.getNode();
+  int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+  SDValue N1 = LD->getOperand(1);
+  SDValue CPTmpN1_0;
+  SDValue CPTmpN1_1;
+  if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
+      N1.getNode()->getValueType(0) == MVT::i32) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val)) {
+      SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+      SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+      SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
+                                                MVT::i32, MVT::Other, Base,
+                                                TargetConstVal, Chain);
+      SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32,
+                                                TargetConst0);
+      SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl,
+                                                MVT::i64, MVT::Other,
+                                                SDValue(Result_2,0),
+                                                SDValue(Result_1,0));
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = LD->getMemOperand();
+      cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+      const SDValue Froms[] = { SDValue(LD, 0),
+                                SDValue(LD, 1),
+                                SDValue(LD, 2)
+      };
+      const SDValue Tos[]   = { SDValue(Result_3, 0),
+                                SDValue(Result_1, 1),
+                                SDValue(Result_1, 2)
+      };
+      ReplaceUses(Froms, Tos, 3);
+      return Result_3;
+    }
+
+    // Generate an indirect load.
+    SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+    SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+    SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
+                                              MVT::Other,
+                                              Base, TargetConst0, Chain);
+    SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32,
+                                              TargetConst0);
+    SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl,
+                                              MVT::i64, MVT::Other,
+                                              SDValue(Result_2,0),
+                                              SDValue(Result_1,0));
+    // Add offset to base.
+    SDNode* Result_4 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32,
+                                              Base, TargetConstVal,
+                                              SDValue(Result_1, 1));
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = LD->getMemOperand();
+    cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+    const SDValue Froms[] = { SDValue(LD, 0),
+                              SDValue(LD, 1),
+                              SDValue(LD, 2)
+    };
+    const SDValue Tos[]   = { SDValue(Result_3, 0), // Load value.
+                              SDValue(Result_4, 0), // New address.
+                              SDValue(Result_1, 1)
+    };
+    ReplaceUses(Froms, Tos, 3);
+    return Result_3;
+  }
+
+  return SelectCode(LD);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) {
+  SDValue Chain = LD->getChain();
+  SDValue Base = LD->getBasePtr();
+  SDValue Offset = LD->getOffset();
+  SDNode *OffsetNode = Offset.getNode();
+  // Get the constant value.
+  int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+  EVT LoadedVT = LD->getMemoryVT();
+  unsigned Opcode = 0;
+
+  // Check for zero ext loads.
+  bool zextval = (LD->getExtensionType() == ISD::ZEXTLOAD);
+
+  // Figure out the opcode.
+  if (LoadedVT == MVT::i64) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val))
+      Opcode = Hexagon::POST_LDrid;
+    else
+      Opcode = Hexagon::LDrid;
+  } else if (LoadedVT == MVT::i32) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val))
+      Opcode = Hexagon::POST_LDriw;
+    else
+      Opcode = Hexagon::LDriw;
+  } else if (LoadedVT == MVT::i16) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val))
+      Opcode = zextval ? Hexagon::POST_LDriuh : Hexagon::POST_LDrih;
+    else
+      Opcode = zextval ? Hexagon::LDriuh : Hexagon::LDrih;
+  } else if (LoadedVT == MVT::i8) {
+    if (TII->isValidAutoIncImm(LoadedVT, Val))
+      Opcode = zextval ? Hexagon::POST_LDriub : Hexagon::POST_LDrib;
+    else
+      Opcode = zextval ? Hexagon::LDriub : Hexagon::LDrib;
+  } else
+    llvm_unreachable("unknown memory type");
+
+  // For zero ext i64 loads, we need to add combine instructions.
+  if (LD->getValueType(0) == MVT::i64 &&
+      LD->getExtensionType() == ISD::ZEXTLOAD) {
+    return SelectIndexedLoadZeroExtend64(LD, Opcode, dl);
+  }
+  if (LD->getValueType(0) == MVT::i64 &&
+             LD->getExtensionType() == ISD::SEXTLOAD) {
+    // Handle sign ext i64 loads.
+    return SelectIndexedLoadSignExtend64(LD, Opcode, dl);
+  }
+  if (TII->isValidAutoIncImm(LoadedVT, Val)) {
+    SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+    SDNode* Result = CurDAG->getMachineNode(Opcode, dl,
+                                            LD->getValueType(0),
+                                            MVT::i32, MVT::Other, Base,
+                                            TargetConstVal, Chain);
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = LD->getMemOperand();
+    cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+    const SDValue Froms[] = { SDValue(LD, 0),
+                              SDValue(LD, 1),
+                              SDValue(LD, 2)
+    };
+    const SDValue Tos[]   = { SDValue(Result, 0),
+                              SDValue(Result, 1),
+                              SDValue(Result, 2)
+    };
+    ReplaceUses(Froms, Tos, 3);
+    return Result;
+  } else {
+    SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+    SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+    SDNode* Result_1 = CurDAG->getMachineNode(Opcode, dl,
+                                              LD->getValueType(0),
+                                              MVT::Other, Base, TargetConst0,
+                                              Chain);
+    SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32,
+                                              Base, TargetConstVal,
+                                              SDValue(Result_1, 1));
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = LD->getMemOperand();
+    cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+    const SDValue Froms[] = { SDValue(LD, 0),
+                              SDValue(LD, 1),
+                              SDValue(LD, 2)
+    };
+    const SDValue Tos[]   = { SDValue(Result_1, 0),
+                              SDValue(Result_2, 0),
+                              SDValue(Result_1, 1)
+    };
+    ReplaceUses(Froms, Tos, 3);
+    return Result_1;
+  }
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
+  SDNode *result;
+  DebugLoc dl = N->getDebugLoc();
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+
+  // Handle indexed loads.
+  if (AM != ISD::UNINDEXED) {
+    result = SelectIndexedLoad(LD, dl);
+  } else {
+    result = SelectBaseOffsetLoad(LD, dl);
+  }
+
+  return result;
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
+  SDValue Chain = ST->getChain();
+  SDValue Base = ST->getBasePtr();
+  SDValue Offset = ST->getOffset();
+  SDValue Value = ST->getValue();
+  SDNode *OffsetNode = Offset.getNode();
+  // Get the constant value.
+  int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue();
+  EVT StoredVT = ST->getMemoryVT();
+
+  // Offset value must be within representable range
+  // and must have correct alignment properties.
+  if (TII->isValidAutoIncImm(StoredVT, Val)) {
+    SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, MVT::i32), Value,
+                     Chain};
+    unsigned Opcode = 0;
+
+    // Figure out the post inc version of opcode.
+    if (StoredVT == MVT::i64) Opcode = Hexagon::POST_STdri;
+    else if (StoredVT == MVT::i32) Opcode = Hexagon::POST_STwri;
+    else if (StoredVT == MVT::i16) Opcode = Hexagon::POST_SThri;
+    else if (StoredVT == MVT::i8) Opcode = Hexagon::POST_STbri;
+    else llvm_unreachable("unknown memory type");
+
+    // Build post increment store.
+    SDNode* Result = CurDAG->getMachineNode(Opcode, dl, MVT::i32,
+                                            MVT::Other, Ops, 4);
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = ST->getMemOperand();
+    cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+
+    ReplaceUses(ST, Result);
+    ReplaceUses(SDValue(ST,1), SDValue(Result,1));
+    return Result;
+  }
+
+  // Note: Order of operands matches the def of instruction:
+  // def STrid : STInst<(outs), (ins MEMri:$addr, DoubleRegs:$src1), ...
+  // and it differs for POST_ST* for instance.
+  SDValue Ops[] = { Base, CurDAG->getTargetConstant(0, MVT::i32), Value,
+                    Chain};
+  unsigned Opcode = 0;
+
+  // Figure out the opcode.
+  if (StoredVT == MVT::i64) Opcode = Hexagon::STrid;
+  else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed;
+  else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih;
+  else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib;
+  else llvm_unreachable("unknown memory type");
+
+  // Build regular store.
+  SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
+  SDNode* Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops,
+                                            4);
+  // Build splitted incriment instruction.
+  SDNode* Result_2 = CurDAG->getMachineNode(Hexagon::ADD_ri, dl, MVT::i32,
+                                            Base,
+                                            TargetConstVal,
+                                            SDValue(Result_1, 0));
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = ST->getMemOperand();
+  cast<MachineSDNode>(Result_1)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceUses(SDValue(ST,0), SDValue(Result_2,0));
+  ReplaceUses(SDValue(ST,1), SDValue(Result_1,0));
+  return Result_2;
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
+                                                   DebugLoc dl) {
+  SDValue Chain = ST->getChain();
+  SDNode* Const32 = ST->getBasePtr().getNode();
+  SDValue Value = ST->getValue();
+  unsigned Opcode = 0;
+
+  // Try to lower stores of GlobalAdresses into indexed stores.  Custom
+  // lowering for GlobalAddress nodes has already turned it into a
+  // CONST32.  Avoid truncating stores for the moment.  Post-inc stores
+  // do the same.  Don't think there's a reason for it, so will file a
+  // bug to fix.
+  if ((Const32->getOpcode() == HexagonISD::CONST32) &&
+      !(Value.getValueType() == MVT::i64 && ST->isTruncatingStore())) {
+    SDValue Base = Const32->getOperand(0);
+    if (Base.getOpcode() == ISD::TargetGlobalAddress) {
+      EVT StoredVT = ST->getMemoryVT();
+      int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset();
+      if (Offset != 0 && OffsetFitsS11(StoredVT, Offset)) {
+        MVT PointerTy = TLI.getPointerTy();
+        const GlobalValue* GV =
+          cast<GlobalAddressSDNode>(Base)->getGlobal();
+        SDValue TargAddr =
+          CurDAG->getTargetGlobalAddress(GV, dl, PointerTy, 0);
+        SDNode* NewBase = CurDAG->getMachineNode(Hexagon::CONST32_set,
+                                                 dl, PointerTy,
+                                                 TargAddr);
+
+        // Figure out base + offset opcode
+        if (StoredVT == MVT::i64) Opcode = Hexagon::STrid_indexed;
+        else if (StoredVT == MVT::i32) Opcode = Hexagon::STriw_indexed;
+        else if (StoredVT == MVT::i16) Opcode = Hexagon::STrih_indexed;
+        else if (StoredVT == MVT::i8) Opcode = Hexagon::STrib_indexed;
+        else llvm_unreachable("unknown memory type");
+
+        SDValue Ops[] = {SDValue(NewBase,0),
+                         CurDAG->getTargetConstant(Offset,PointerTy),
+                         Value, Chain};
+        // build indexed store
+        SDNode* Result = CurDAG->getMachineNode(Opcode, dl,
+                                                MVT::Other, Ops, 4);
+        MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+        MemOp[0] = ST->getMemOperand();
+        cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
+        ReplaceUses(ST, Result);
+        return Result;
+      }
+    }
+  }
+
+  return SelectCode(ST);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  ISD::MemIndexedMode AM = ST->getAddressingMode();
+
+  // Handle indexed stores.
+  if (AM != ISD::UNINDEXED) {
+    return SelectIndexedStore(ST, dl);
+  }
+
+  return SelectBaseOffsetStore(ST, dl);
+}
+
+SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+
+  //
+  // %conv.i = sext i32 %tmp1 to i64
+  // %conv2.i = sext i32 %add to i64
+  // %mul.i = mul nsw i64 %conv2.i, %conv.i
+  //
+  //   --- match with the following ---
+  //
+  // %mul.i = mpy (%tmp1, %add)
+  //
+
+  if (N->getValueType(0) == MVT::i64) {
+    // Shifting a i64 signed multiply.
+    SDValue MulOp0 = N->getOperand(0);
+    SDValue MulOp1 = N->getOperand(1);
+
+    SDValue OP0;
+    SDValue OP1;
+
+    // Handle sign_extend and sextload.
+    if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
+      SDValue Sext0 = MulOp0.getOperand(0);
+      if (Sext0.getNode()->getValueType(0) != MVT::i32) {
+        return SelectCode(N);
+      }
+
+      OP0 = Sext0;
+    } else if (MulOp0.getOpcode() == ISD::LOAD) {
+      LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode());
+      if (LD->getMemoryVT() != MVT::i32 ||
+          LD->getExtensionType() != ISD::SEXTLOAD ||
+          LD->getAddressingMode() != ISD::UNINDEXED) {
+        return SelectCode(N);
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+      OP0 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+                                            MVT::Other,
+                                            LD->getBasePtr(), TargetConst0,
+                                            Chain), 0);
+    } else {
+      return SelectCode(N);
+    }
+
+    // Same goes for the second operand.
+    if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) {
+      SDValue Sext1 = MulOp1.getOperand(0);
+      if (Sext1.getNode()->getValueType(0) != MVT::i32) {
+        return SelectCode(N);
+      }
+
+      OP1 = Sext1;
+    } else if (MulOp1.getOpcode() == ISD::LOAD) {
+      LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode());
+      if (LD->getMemoryVT() != MVT::i32 ||
+          LD->getExtensionType() != ISD::SEXTLOAD ||
+          LD->getAddressingMode() != ISD::UNINDEXED) {
+        return SelectCode(N);
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+      OP1 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+                                            MVT::Other,
+                                            LD->getBasePtr(), TargetConst0,
+                                            Chain), 0);
+    } else {
+      return SelectCode(N);
+    }
+
+    // Generate a mpy instruction.
+    SDNode *Result = CurDAG->getMachineNode(Hexagon::MPY64, dl, MVT::i64,
+                                            OP0, OP1);
+    ReplaceUses(N, Result);
+    return Result;
+  }
+
+  return SelectCode(N);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() == ISD::SETCC) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+      SDValue N000 = N00.getOperand(0);
+      SDValue N001 = N00.getOperand(1);
+      if (cast<VTSDNode>(N001)->getVT() == MVT::i16) {
+        SDValue N01 = N0.getOperand(1);
+        SDValue N02 = N0.getOperand(2);
+
+        // Pattern: (select:i32 (setcc:i1 (sext_inreg:i32 IntRegs:i32:$src2,
+        // i16:Other),IntRegs:i32:$src1, SETLT:Other),IntRegs:i32:$src1,
+        // IntRegs:i32:$src2)
+        // Emits: (MAXh_rr:i32 IntRegs:i32:$src1, IntRegs:i32:$src2)
+        // Pattern complexity = 9  cost = 1  size = 0.
+        if (cast<CondCodeSDNode>(N02)->get() == ISD::SETLT) {
+          SDValue N1 = N->getOperand(1);
+          if (N01 == N1) {
+            SDValue N2 = N->getOperand(2);
+            if (N000 == N2 &&
+                N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 &&
+                N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) {
+              SDNode *SextNode = CurDAG->getMachineNode(Hexagon::SXTH, dl,
+                                                        MVT::i32, N000);
+              SDNode *Result = CurDAG->getMachineNode(Hexagon::MAXw_rr, dl,
+                                                      MVT::i32,
+                                                      SDValue(SextNode, 0),
+                                                      N1);
+              ReplaceUses(N, Result);
+              return Result;
+            }
+          }
+        }
+
+        // Pattern: (select:i32 (setcc:i1 (sext_inreg:i32 IntRegs:i32:$src2,
+        // i16:Other), IntRegs:i32:$src1, SETGT:Other), IntRegs:i32:$src1,
+        // IntRegs:i32:$src2)
+        // Emits: (MINh_rr:i32 IntRegs:i32:$src1, IntRegs:i32:$src2)
+        // Pattern complexity = 9  cost = 1  size = 0.
+        if (cast<CondCodeSDNode>(N02)->get() == ISD::SETGT) {
+          SDValue N1 = N->getOperand(1);
+          if (N01 == N1) {
+            SDValue N2 = N->getOperand(2);
+            if (N000 == N2 &&
+                N0.getNode()->getValueType(N0.getResNo()) == MVT::i1 &&
+                N00.getNode()->getValueType(N00.getResNo()) == MVT::i32) {
+              SDNode *SextNode = CurDAG->getMachineNode(Hexagon::SXTH, dl,
+                                                        MVT::i32, N000);
+              SDNode *Result = CurDAG->getMachineNode(Hexagon::MINw_rr, dl,
+                                                      MVT::i32,
+                                                      SDValue(SextNode, 0),
+                                                      N1);
+              ReplaceUses(N, Result);
+              return Result;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return SelectCode(N);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDValue Shift = N->getOperand(0);
+
+  //
+  // %conv.i = sext i32 %tmp1 to i64
+  // %conv2.i = sext i32 %add to i64
+  // %mul.i = mul nsw i64 %conv2.i, %conv.i
+  // %shr5.i = lshr i64 %mul.i, 32
+  // %conv3.i = trunc i64 %shr5.i to i32
+  //
+  //   --- match with the following ---
+  //
+  // %conv3.i = mpy (%tmp1, %add)
+  //
+  // Trunc to i32.
+  if (N->getValueType(0) == MVT::i32) {
+    // Trunc from i64.
+    if (Shift.getNode()->getValueType(0) == MVT::i64) {
+      // Trunc child is logical shift right.
+      if (Shift.getOpcode() != ISD::SRL) {
+        return SelectCode(N);
+      }
+
+      SDValue ShiftOp0 = Shift.getOperand(0);
+      SDValue ShiftOp1 = Shift.getOperand(1);
+
+      // Shift by const 32
+      if (ShiftOp1.getOpcode() != ISD::Constant) {
+        return SelectCode(N);
+      }
+
+      int32_t ShiftConst =
+        cast<ConstantSDNode>(ShiftOp1.getNode())->getSExtValue();
+      if (ShiftConst != 32) {
+        return SelectCode(N);
+      }
+
+      // Shifting a i64 signed multiply
+      SDValue Mul = ShiftOp0;
+      if (Mul.getOpcode() != ISD::MUL) {
+        return SelectCode(N);
+      }
+
+      SDValue MulOp0 = Mul.getOperand(0);
+      SDValue MulOp1 = Mul.getOperand(1);
+
+      SDValue OP0;
+      SDValue OP1;
+
+      // Handle sign_extend and sextload
+      if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
+        SDValue Sext0 = MulOp0.getOperand(0);
+        if (Sext0.getNode()->getValueType(0) != MVT::i32) {
+          return SelectCode(N);
+        }
+
+        OP0 = Sext0;
+      } else if (MulOp0.getOpcode() == ISD::LOAD) {
+        LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode());
+        if (LD->getMemoryVT() != MVT::i32 ||
+            LD->getExtensionType() != ISD::SEXTLOAD ||
+            LD->getAddressingMode() != ISD::UNINDEXED) {
+          return SelectCode(N);
+        }
+
+        SDValue Chain = LD->getChain();
+        SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+        OP0 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+                                              MVT::Other,
+                                              LD->getBasePtr(),
+                                              TargetConst0, Chain), 0);
+      } else {
+        return SelectCode(N);
+      }
+
+      // Same goes for the second operand.
+      if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) {
+        SDValue Sext1 = MulOp1.getOperand(0);
+        if (Sext1.getNode()->getValueType(0) != MVT::i32)
+          return SelectCode(N);
+
+        OP1 = Sext1;
+      } else if (MulOp1.getOpcode() == ISD::LOAD) {
+        LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode());
+        if (LD->getMemoryVT() != MVT::i32 ||
+            LD->getExtensionType() != ISD::SEXTLOAD ||
+            LD->getAddressingMode() != ISD::UNINDEXED) {
+          return SelectCode(N);
+        }
+
+        SDValue Chain = LD->getChain();
+        SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+        OP1 = SDValue (CurDAG->getMachineNode(Hexagon::LDriw, dl, MVT::i32,
+                                              MVT::Other,
+                                              LD->getBasePtr(),
+                                              TargetConst0, Chain), 0);
+      } else {
+        return SelectCode(N);
+      }
+
+      // Generate a mpy instruction.
+      SDNode *Result = CurDAG->getMachineNode(Hexagon::MPY, dl, MVT::i32,
+                                              OP0, OP1);
+      ReplaceUses(N, Result);
+      return Result;
+    }
+  }
+
+  return SelectCode(N);
+}
+
+
+SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  if (N->getValueType(0) == MVT::i32) {
+    SDValue Shl_0 = N->getOperand(0);
+    SDValue Shl_1 = N->getOperand(1);
+    // RHS is const.
+    if (Shl_1.getOpcode() == ISD::Constant) {
+      if (Shl_0.getOpcode() == ISD::MUL) {
+        SDValue Mul_0 = Shl_0.getOperand(0); // Val
+        SDValue Mul_1 = Shl_0.getOperand(1); // Const
+        // RHS of mul is const.
+        if (Mul_1.getOpcode() == ISD::Constant) {
+          int32_t ShlConst =
+            cast<ConstantSDNode>(Shl_1.getNode())->getSExtValue();
+          int32_t MulConst =
+            cast<ConstantSDNode>(Mul_1.getNode())->getSExtValue();
+          int32_t ValConst = MulConst << ShlConst;
+          SDValue Val = CurDAG->getTargetConstant(ValConst,
+                                                  MVT::i32);
+          if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val.getNode()))
+            if (isInt<9>(CN->getSExtValue())) {
+              SDNode* Result =
+                CurDAG->getMachineNode(Hexagon::MPYI_ri, dl,
+                                       MVT::i32, Mul_0, Val);
+              ReplaceUses(N, Result);
+              return Result;
+            }
+
+        }
+      } else if (Shl_0.getOpcode() == ISD::SUB) {
+        SDValue Sub_0 = Shl_0.getOperand(0); // Const 0
+        SDValue Sub_1 = Shl_0.getOperand(1); // Val
+        if (Sub_0.getOpcode() == ISD::Constant) {
+          int32_t SubConst =
+            cast<ConstantSDNode>(Sub_0.getNode())->getSExtValue();
+          if (SubConst == 0) {
+            if (Sub_1.getOpcode() == ISD::SHL) {
+              SDValue Shl2_0 = Sub_1.getOperand(0); // Val
+              SDValue Shl2_1 = Sub_1.getOperand(1); // Const
+              if (Shl2_1.getOpcode() == ISD::Constant) {
+                int32_t ShlConst =
+                  cast<ConstantSDNode>(Shl_1.getNode())->getSExtValue();
+                int32_t Shl2Const =
+                  cast<ConstantSDNode>(Shl2_1.getNode())->getSExtValue();
+                int32_t ValConst = 1 << (ShlConst+Shl2Const);
+                SDValue Val = CurDAG->getTargetConstant(-ValConst, MVT::i32);
+                if (ConstantSDNode *CN =
+                    dyn_cast<ConstantSDNode>(Val.getNode()))
+                  if (isInt<9>(CN->getSExtValue())) {
+                    SDNode* Result =
+                      CurDAG->getMachineNode(Hexagon::MPYI_ri, dl, MVT::i32,
+                                             Shl2_0, Val);
+                    ReplaceUses(N, Result);
+                    return Result;
+                  }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return SelectCode(N);
+}
+
+
+//
+// If there is an zero_extend followed an intrinsic in DAG (this means - the
+// result of the intrinsic is predicate); convert the zero_extend to
+// transfer instruction.
+//
+// Zero extend -> transfer is lowered here. Otherwise, zero_extend will be
+// converted into a MUX as predicate registers defined as 1 bit in the
+// compiler. Architecture defines them as 8-bit registers.
+// We want to preserve all the lower 8-bits and, not just 1 LSB bit.
+//
+SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  SDNode *IsIntrinsic = N->getOperand(0).getNode();
+  if ((IsIntrinsic->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
+    unsigned ID =
+      cast<ConstantSDNode>(IsIntrinsic->getOperand(0))->getZExtValue();
+    if (doesIntrinsicReturnPredicate(ID)) {
+      // Now we need to differentiate target data types.
+      if (N->getValueType(0) == MVT::i64) {
+        // Convert the zero_extend to Rs = Pd followed by COMBINE_rr(0,Rs).
+        SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
+        SDNode *Result_1 = CurDAG->getMachineNode(Hexagon::TFR_RsPd, dl,
+                                                  MVT::i32,
+                                                  SDValue(IsIntrinsic, 0));
+        SDNode *Result_2 = CurDAG->getMachineNode(Hexagon::TFRI, dl,
+                                                  MVT::i32,
+                                                  TargetConst0);
+        SDNode *Result_3 = CurDAG->getMachineNode(Hexagon::COMBINE_rr, dl,
+                                                  MVT::i64, MVT::Other,
+                                                  SDValue(Result_2, 0),
+                                                  SDValue(Result_1, 0));
+        ReplaceUses(N, Result_3);
+        return Result_3;
+      }
+      if (N->getValueType(0) == MVT::i32) {
+        // Convert the zero_extend to Rs = Pd
+        SDNode* RsPd = CurDAG->getMachineNode(Hexagon::TFR_RsPd, dl,
+                                              MVT::i32,
+                                              SDValue(IsIntrinsic, 0));
+        ReplaceUses(N, RsPd);
+        return RsPd;
+      }
+      llvm_unreachable("Unexpected value type");
+    }
+  }
+  return SelectCode(N);
+}
+
+
+//
+// Checking for intrinsics which have predicate registers as operand(s)
+// and lowering to the actual intrinsic.
+//
+SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  unsigned ID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  unsigned IntrinsicWithPred = doesIntrinsicContainPredicate(ID);
+
+  // We are concerned with only those intrinsics that have predicate registers
+  // as at least one of the operands.
+  if (IntrinsicWithPred) {
+    SmallVector<SDValue, 8> Ops;
+    const MCInstrDesc &MCID = TII->get(IntrinsicWithPred);
+    const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+
+    // Iterate over all the operands of the intrinsics.
+    // For PredRegs, do the transfer.
+    // For Double/Int Regs, just preserve the value
+    // For immediates, lower it.
+    for (unsigned i = 1; i < N->getNumOperands(); ++i) {
+      SDNode *Arg = N->getOperand(i).getNode();
+      const TargetRegisterClass *RC = TII->getRegClass(MCID, i, TRI, *MF);
+
+      if (RC == &Hexagon::IntRegsRegClass ||
+          RC == &Hexagon::DoubleRegsRegClass) {
+        Ops.push_back(SDValue(Arg, 0));
+      } else if (RC == &Hexagon::PredRegsRegClass) {
+        // Do the transfer.
+        SDNode *PdRs = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1,
+                                              SDValue(Arg, 0));
+        Ops.push_back(SDValue(PdRs,0));
+      } else if (RC == NULL && (dyn_cast<ConstantSDNode>(Arg) != NULL)) {
+        // This is immediate operand. Lower it here making sure that we DO have
+        // const SDNode for immediate value.
+        int32_t Val = cast<ConstantSDNode>(Arg)->getSExtValue();
+        SDValue SDVal = CurDAG->getTargetConstant(Val, MVT::i32);
+        Ops.push_back(SDVal);
+      } else {
+        llvm_unreachable("Unimplemented");
+      }
+    }
+    EVT ReturnValueVT = N->getValueType(0);
+    SDNode *Result = CurDAG->getMachineNode(IntrinsicWithPred, dl,
+                                            ReturnValueVT,
+                                            Ops.data(), Ops.size());
+    ReplaceUses(N, Result);
+    return Result;
+  }
+  return SelectCode(N);
+}
+
+//
+// Map floating point constant values.
+//
+SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+  APFloat APF = CN->getValueAPF();
+  if (N->getValueType(0) == MVT::f32) {
+    return CurDAG->getMachineNode(Hexagon::TFRI_f, dl, MVT::f32,
+              CurDAG->getTargetConstantFP(APF.convertToFloat(), MVT::f32));
+  }
+  else if (N->getValueType(0) == MVT::f64) {
+    return CurDAG->getMachineNode(Hexagon::CONST64_Float_Real, dl, MVT::f64,
+              CurDAG->getTargetConstantFP(APF.convertToDouble(), MVT::f64));
+  }
+
+  return SelectCode(N);
+}
+
+
+//
+// Map predicate true (encoded as -1 in LLVM) to a XOR.
+//
+SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  if (N->getValueType(0) == MVT::i1) {
+    SDNode* Result;
+    int32_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    if (Val == -1) {
+      // Create the IntReg = 1 node.
+      SDNode* IntRegTFR =
+        CurDAG->getMachineNode(Hexagon::TFRI, dl, MVT::i32,
+                               CurDAG->getTargetConstant(0, MVT::i32));
+
+      // Pd = IntReg
+      SDNode* Pd = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1,
+                                          SDValue(IntRegTFR, 0));
+
+      // not(Pd)
+      SDNode* NotPd = CurDAG->getMachineNode(Hexagon::NOT_p, dl, MVT::i1,
+                                             SDValue(Pd, 0));
+
+      // xor(not(Pd))
+      Result = CurDAG->getMachineNode(Hexagon::XOR_pp, dl, MVT::i1,
+                                      SDValue(Pd, 0), SDValue(NotPd, 0));
+
+      // We have just built:
+      // Rs = Pd
+      // Pd = xor(not(Pd), Pd)
+
+      ReplaceUses(N, Result);
+      return Result;
+    }
+  }
+
+  return SelectCode(N);
+}
+
+
+//
+// Map add followed by a asr -> asr +=.
+//
+SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  if (N->getValueType(0) != MVT::i32) {
+    return SelectCode(N);
+  }
+  // Identify nodes of the form: add(asr(...)).
+  SDNode* Src1 = N->getOperand(0).getNode();
+  if (Src1->getOpcode() != ISD::SRA || !Src1->hasOneUse()
+      || Src1->getValueType(0) != MVT::i32) {
+    return SelectCode(N);
+  }
+
+  // Build Rd = Rd' + asr(Rs, Rt). The machine constraints will ensure that
+  // Rd and Rd' are assigned to the same register
+  SDNode* Result = CurDAG->getMachineNode(Hexagon::ASR_ADD_rr, dl, MVT::i32,
+                                          N->getOperand(1),
+                                          Src1->getOperand(0),
+                                          Src1->getOperand(1));
+  ReplaceUses(N, Result);
+
+  return Result;
+}
+
+
+SDNode *HexagonDAGToDAGISel::Select(SDNode *N) {
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
+
+
+  switch (N->getOpcode()) {
+  case ISD::Constant:
+    return SelectConstant(N);
+
+  case ISD::ConstantFP:
+    return SelectConstantFP(N);
+
+  case ISD::ADD:
+    return SelectAdd(N);
+
+  case ISD::SHL:
+    return SelectSHL(N);
+
+  case ISD::LOAD:
+    return SelectLoad(N);
+
+  case ISD::STORE:
+    return SelectStore(N);
+
+  case ISD::SELECT:
+    return SelectSelect(N);
+
+  case ISD::TRUNCATE:
+    return SelectTruncate(N);
+
+  case ISD::MUL:
+    return SelectMul(N);
+
+  case ISD::ZERO_EXTEND:
+    return SelectZeroExtend(N);
+
+  case ISD::INTRINSIC_WO_CHAIN:
+    return SelectIntrinsicWOChain(N);
+  }
+
+  return SelectCode(N);
+}
+
+
+//
+// Hexagon_TODO: Five functions for ADDRri?! Surely there must be a better way
+// to define these instructions.
+//
+bool HexagonDAGToDAGISel::SelectADDRri(SDValue& Addr, SDValue &Base,
+                                       SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriS11_0(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsS11_0_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsS11_0_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriS11_1(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsS11_1_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsS11_1_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriS11_2(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsS11_2_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsS11_2_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriU6_0(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsU6_0_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsU6_0_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriU6_1(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsU6_1_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsU6_1_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriU6_2(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsU6_2_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsU6_2_Offset(Offset.getNode()));
+}
+
+
+bool HexagonDAGToDAGISel::SelectMEMriS11_2(SDValue& Addr, SDValue &Base,
+                                           SDValue &Offset) {
+
+  if (Addr.getOpcode() != ISD::ADD) {
+    return(SelectADDRriS11_2(Addr, Base, Offset));
+  }
+
+  return SelectADDRriS11_2(Addr, Base, Offset);
+}
+
+
+bool HexagonDAGToDAGISel::SelectADDRriS11_3(SDValue& Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return (IsS11_3_Offset(Offset.getNode()));
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return (IsS11_3_Offset(Offset.getNode()));
+}
+
+bool HexagonDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1,
+                                       SDValue &R2) {
+  if (Addr.getOpcode() == ISD::FrameIndex) return false;
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      if (isInt<13>(CN->getSExtValue()))
+        return false;  // Let the reg+imm pattern catch this!
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+    return true;
+  }
+
+  R1 = Addr;
+
+  return true;
+}
+
+
+// Handle generic address case. It is accessed from inlined asm =m constraints,
+// which could have any kind of pointer.
+bool HexagonDAGToDAGISel::SelectAddr(SDNode *Op, SDValue Addr,
+                                          SDValue &Base, SDValue &Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // Direct calls.
+
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    Base = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+
+bool HexagonDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+
+  switch (ConstraintCode) {
+  case 'o':   // Offsetable.
+  case 'v':   // Not offsetable.
+  default: return true;
+  case 'm':   // Memory.
+    if (!SelectAddr(Op.getNode(), Op, Op0, Op1))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+bool HexagonDAGToDAGISel::isConstExtProfitable(SDNode *N) const {
+  unsigned UseCount = 0;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    UseCount++;
+  }
+
+  return (UseCount <= 1);
+
+}
+
+//===--------------------------------------------------------------------===//
+// Return 'true' if use count of the global address is below threshold.
+//===--------------------------------------------------------------------===//
+bool HexagonDAGToDAGISel::hasNumUsesBelowThresGA(SDNode *N) const {
+  assert(N->getOpcode() == ISD::TargetGlobalAddress &&
+         "Expecting a target global address");
+
+  // Always try to fold the address.
+  if (TM.getOptLevel() == CodeGenOpt::Aggressive)
+    return true;
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+  DenseMap<const GlobalValue *, unsigned>::const_iterator GI =
+    GlobalAddressUseCountMap.find(GA->getGlobal());
+
+  if (GI == GlobalAddressUseCountMap.end())
+    return false;
+
+  return GI->second <= MaxNumOfUsesForConstExtenders;
+}
+
+//===--------------------------------------------------------------------===//
+// Return true if the non GP-relative global address can be folded.
+//===--------------------------------------------------------------------===//
+inline bool HexagonDAGToDAGISel::foldGlobalAddress(SDValue &N, SDValue &R) {
+  return foldGlobalAddressImpl(N, R, false);
+}
+
+//===--------------------------------------------------------------------===//
+// Return true if the GP-relative global address can be folded.
+//===--------------------------------------------------------------------===//
+inline bool HexagonDAGToDAGISel::foldGlobalAddressGP(SDValue &N, SDValue &R) {
+  return foldGlobalAddressImpl(N, R, true);
+}
+
+//===--------------------------------------------------------------------===//
+// Fold offset of the global address if number of uses are below threshold.
+//===--------------------------------------------------------------------===//
+bool HexagonDAGToDAGISel::foldGlobalAddressImpl(SDValue &N, SDValue &R,
+                                                bool ShouldLookForGP) {
+  if (N.getOpcode() == ISD::ADD) {
+    SDValue N0 = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    if ((ShouldLookForGP && (N0.getOpcode() == HexagonISD::CONST32_GP)) ||
+        (!ShouldLookForGP && (N0.getOpcode() == HexagonISD::CONST32))) {
+      ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N1);
+      GlobalAddressSDNode *GA =
+        dyn_cast<GlobalAddressSDNode>(N0.getOperand(0));
+
+      if (Const && GA &&
+          (GA->getOpcode() == ISD::TargetGlobalAddress)) {
+        if ((N0.getOpcode() == HexagonISD::CONST32) &&
+                !hasNumUsesBelowThresGA(GA))
+            return false;
+        R = CurDAG->getTargetGlobalAddress(GA->getGlobal(),
+                                          Const->getDebugLoc(),
+                                          N.getValueType(),
+                                          GA->getOffset() +
+                                          (uint64_t)Const->getSExtValue());
+        return true;
+      }
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
new file mode 100644
index 000000000000..15858a9368ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -0,0 +1,1687 @@
+//===-- HexagonISelLowering.cpp - Hexagon DAG Lowering Implementation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Hexagon uses to lower LLVM code
+// into a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonISelLowering.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+const unsigned Hexagon_MAX_RET_SIZE = 64;
+
+static cl::opt<bool>
+EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden,
+               cl::desc("Control jump table emission on Hexagon target"));
+
+int NumNamedVarArgParams = -1;
+
+// Implement calling convention for Hexagon.
+static bool
+CC_Hexagon(unsigned ValNo, MVT ValVT,
+           MVT LocVT, CCValAssign::LocInfo LocInfo,
+           ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_Hexagon32(unsigned ValNo, MVT ValVT,
+             MVT LocVT, CCValAssign::LocInfo LocInfo,
+             ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_Hexagon64(unsigned ValNo, MVT ValVT,
+             MVT LocVT, CCValAssign::LocInfo LocInfo,
+             ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_Hexagon(unsigned ValNo, MVT ValVT,
+              MVT LocVT, CCValAssign::LocInfo LocInfo,
+              ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
+                MVT LocVT, CCValAssign::LocInfo LocInfo,
+                ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
+                MVT LocVT, CCValAssign::LocInfo LocInfo,
+                ISD::ArgFlagsTy ArgFlags, CCState &State);
+
+static bool
+CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
+            MVT LocVT, CCValAssign::LocInfo LocInfo,
+            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  // NumNamedVarArgParams can not be zero for a VarArg function.
+  assert ( (NumNamedVarArgParams > 0) &&
+           "NumNamedVarArgParams is not bigger than zero.");
+
+  if ( (int)ValNo < NumNamedVarArgParams ) {
+    // Deal with named arguments.
+    return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
+  }
+
+  // Deal with un-named arguments.
+  unsigned ofst;
+  if (ArgFlags.isByVal()) {
+    // If pass-by-value, the size allocated on stack is decided
+    // by ArgFlags.getByValSize(), not by the size of LocVT.
+    assert ((ArgFlags.getByValSize() > 8) &&
+            "ByValSize must be bigger than 8 bytes");
+    ofst = State.AllocateStack(ArgFlags.getByValSize(), 4);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    ofst = State.AllocateStack(4, 4);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+    return false;
+  }
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    ofst = State.AllocateStack(8, 8);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
+    return false;
+  }
+  llvm_unreachable(0);
+}
+
+
+static bool
+CC_Hexagon (unsigned ValNo, MVT ValVT,
+            MVT LocVT, CCValAssign::LocInfo LocInfo,
+            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  if (ArgFlags.isByVal()) {
+    // Passed on stack.
+    assert ((ArgFlags.getByValSize() > 8) &&
+            "ByValSize must be bigger than 8 bytes");
+    unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 4);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+    return false;
+  }
+
+  if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    if (!CC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    if (!CC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+      return false;
+  }
+
+  return true;  // CC didn't match.
+}
+
+
+static bool CC_Hexagon32(unsigned ValNo, MVT ValVT,
+                         MVT LocVT, CCValAssign::LocInfo LocInfo,
+                         ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  static const uint16_t RegList[] = {
+    Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
+    Hexagon::R5
+  };
+  if (unsigned Reg = State.AllocateReg(RegList, 6)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return false;
+  }
+
+  unsigned Offset = State.AllocateStack(4, 4);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
+                         MVT LocVT, CCValAssign::LocInfo LocInfo,
+                         ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return false;
+  }
+
+  static const uint16_t RegList1[] = {
+    Hexagon::D1, Hexagon::D2
+  };
+  static const uint16_t RegList2[] = {
+    Hexagon::R1, Hexagon::R3
+  };
+  if (unsigned Reg = State.AllocateReg(RegList1, RegList2, 2)) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return false;
+  }
+
+  unsigned Offset = State.AllocateStack(8, 8, Hexagon::D2);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT,
+                          MVT LocVT, CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+
+  if (LocVT == MVT::i1 ||
+      LocVT == MVT::i8 ||
+      LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    ValVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+    return false;
+  }
+
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
+    return false;
+  }
+
+  return true;  // CC didn't match.
+}
+
+static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  if (LocVT == MVT::i32 || LocVT == MVT::f32) {
+    if (unsigned Reg = State.AllocateReg(Hexagon::R0)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+  }
+
+  unsigned Offset = State.AllocateStack(4, 4);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  if (LocVT == MVT::i64 || LocVT == MVT::f64) {
+    if (unsigned Reg = State.AllocateReg(Hexagon::D0)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return false;
+    }
+  }
+
+  unsigned Offset = State.AllocateStack(8, 8);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return false;
+}
+
+SDValue
+HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG)
+const {
+  return SDValue();
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" of size "Size".  Alignment information is
+/// specified by the specific parameter attribute. The copy will be passed as
+/// a byval function parameter.  Sometimes what we are copying is the end of a
+/// larger object, the part that does not fit in registers.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                          DebugLoc dl) {
+
+  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       /*isVolatile=*/false, /*AlwaysInline=*/false,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
+
+// LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is
+// passed by value, the function prototype is modified to return void and
+// the value is stored in memory pointed by a pointer passed by caller.
+SDValue
+HexagonTargetLowering::LowerReturn(SDValue Chain,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   DebugLoc dl, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analyze return values of ISD::RET
+  CCInfo.AnalyzeReturn(Outs, RetCC_Hexagon);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other,
+                     &RetOps[0], RetOps.size());
+}
+
+
+
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers.  This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. Returns a SDNode with the same number of values as the
+/// ISD::CALL.
+SDValue
+HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                       CallingConv::ID CallConv, bool isVarArg,
+                                       const
+                                       SmallVectorImpl<ISD::InputArg> &Ins,
+                                       DebugLoc dl, SelectionDAG &DAG,
+                                       SmallVectorImpl<SDValue> &InVals,
+                                       const SmallVectorImpl<SDValue> &OutVals,
+                                       SDValue Callee) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_Hexagon);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, dl,
+                               RVLocs[i].getLocReg(),
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+/// LowerCall - Functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+SDValue
+HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                 SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // Check for varargs.
+  NumNamedVarArgParams = -1;
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee))
+  {
+    const Function* CalleeFn = NULL;
+    Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, MVT::i32);
+    if ((CalleeFn = dyn_cast<Function>(GA->getGlobal())))
+    {
+      // If a function has zero args and is a vararg function, that's
+      // disallowed so it must be an undeclared function.  Do not assume
+      // varargs if the callee is undefined.
+      if (CalleeFn->isVarArg() &&
+          CalleeFn->getFunctionType()->getNumParams() != 0) {
+        NumNamedVarArgParams = CalleeFn->getFunctionType()->getNumParams();
+      }
+    }
+  }
+
+  if (NumNamedVarArgParams > 0)
+    CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
+  else
+    CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon);
+
+
+  if(isTailCall) {
+    bool StructAttrFlag =
+      DAG.getMachineFunction().getFunction()->hasStructRetAttr();
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                                                   isVarArg, IsStructRet,
+                                                   StructAttrFlag,
+                                                   Outs, OutVals, Ins, DAG);
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i){
+      CCValAssign &VA = ArgLocs[i];
+      if (VA.isMemLoc()) {
+        isTailCall = false;
+        break;
+      }
+    }
+    if (isTailCall) {
+      DEBUG(dbgs () << "Eligible for Tail Call\n");
+    } else {
+      DEBUG(dbgs () <<
+            "Argument must be passed on stack. Not eligible for Tail Call\n");
+    }
+  }
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  SDValue StackPtr =
+    DAG.getCopyFromReg(Chain, dl, TM.getRegisterInfo()->getStackRegister(),
+                       getPointerTy());
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default:
+        // Loc info must be one of Full, SExt, ZExt, or AExt.
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    if (VA.isMemLoc()) {
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      SDValue PtrOff = DAG.getConstant(LocMemOffset, StackPtr.getValueType());
+      PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+
+      if (Flags.isByVal()) {
+        // The argument is a struct passed by value. According to LLVM, "Arg"
+        // is is pointer.
+        MemOpChains.push_back(CreateCopyOfByValArgument(Arg, PtrOff, Chain,
+                                                        Flags, DAG, dl));
+      } else {
+        // The argument is not passed by value. "Arg" is a buildin type. It is
+        // not a pointer.
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                           MachinePointerInfo(),false, false,
+                                           0));
+      }
+      continue;
+    }
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector.
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    }
+  }
+
+  // Transform all store nodes into one single node because all store
+  // nodes are independent of each other.
+  if (!MemOpChains.empty()) {
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0],
+                        MemOpChains.size());
+  }
+
+  if (!isTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes,
+                                                        getPointerTy(), true));
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  if (!isTailCall) {
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+  }
+
+  // For tail calls lower the arguments to the 'real' stack slot.
+  if (isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+    //
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+    InFlag = SDValue();
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+    InFlag =SDValue();
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (flag_aligned_memcpy) {
+    const char *MemcpyName =
+      "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes";
+    Callee =
+      DAG.getTargetExternalSymbol(MemcpyName, getPointerTy());
+    flag_aligned_memcpy = false;
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy());
+  } else if (ExternalSymbolSDNode *S =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+  }
+
+  if (InFlag.getNode()) {
+    Ops.push_back(InFlag);
+  }
+
+  if (isTailCall)
+    return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+
+  Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+                         InVals, OutVals, Callee);
+}
+
+static bool getIndexedAddressParts(SDNode *Ptr, EVT VT,
+                                   bool isSEXTLoad, SDValue &Base,
+                                   SDValue &Offset, bool &isInc,
+                                   SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD)
+  return false;
+
+  if (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+    isInc = (Ptr->getOpcode() == ISD::ADD);
+    Base = Ptr->getOperand(0);
+    Offset = Ptr->getOperand(1);
+    // Ensure that Offset is a constant.
+    return (isa<ConstantSDNode>(Offset));
+  }
+
+  return false;
+}
+
+// TODO: Put this function along with the other isS* functions in
+// HexagonISelDAGToDAG.cpp into a common file. Or better still, use the
+// functions defined in HexagonOperands.td.
+static bool Is_PostInc_S4_Offset(SDNode * S, int ShiftAmount) {
+  ConstantSDNode *N = cast<ConstantSDNode>(S);
+
+  // immS4 predicate - True if the immediate fits in a 4-bit sign extended.
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  int64_t m = 0;
+  if (ShiftAmount > 0) {
+    m = v % ShiftAmount;
+    v = v >> ShiftAmount;
+  }
+  return (v <= 7) && (v >= -8) && (m == 0);
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                       SDValue &Base,
+                                                       SDValue &Offset,
+                                                       ISD::MemIndexedMode &AM,
+                                                       SelectionDAG &DAG) const
+{
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false;
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT  = LD->getMemoryVT();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT  = ST->getMemoryVT();
+    if (ST->getValue().getValueType() == MVT::i64 && ST->isTruncatingStore()) {
+      return false;
+    }
+  } else {
+    return false;
+  }
+
+  bool isInc = false;
+  bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                        isInc, DAG);
+  // ShiftAmount = number of left-shifted bits in the Hexagon instruction.
+  int ShiftAmount = VT.getSizeInBits() / 16;
+  if (isLegal && Is_PostInc_S4_Offset(Offset.getNode(), ShiftAmount)) {
+    AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+    return true;
+  }
+
+  return false;
+}
+
+SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  MachineFunction &MF = DAG.getMachineFunction();
+  HexagonMachineFunctionInfo *FuncInfo =
+    MF.getInfo<HexagonMachineFunctionInfo>();
+  switch (Node->getOpcode()) {
+    case ISD::INLINEASM: {
+      unsigned NumOps = Node->getNumOperands();
+      if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+        --NumOps;  // Ignore the flag operand.
+
+      for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+        if (FuncInfo->hasClobberLR())
+          break;
+        unsigned Flags =
+          cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+        unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+        ++i;  // Skip the ID value.
+
+        switch (InlineAsm::getKind(Flags)) {
+        default: llvm_unreachable("Bad flags!");
+          case InlineAsm::Kind_RegDef:
+          case InlineAsm::Kind_RegUse:
+          case InlineAsm::Kind_Imm:
+          case InlineAsm::Kind_Clobber:
+          case InlineAsm::Kind_Mem: {
+            for (; NumVals; --NumVals, ++i) {}
+            break;
+          }
+          case InlineAsm::Kind_RegDefEarlyClobber: {
+            for (; NumVals; --NumVals, ++i) {
+              unsigned Reg =
+                cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+
+              // Check it to be lr
+              if (Reg == TM.getRegisterInfo()->getRARegister()) {
+                FuncInfo->setHasClobberLR(true);
+                break;
+              }
+            }
+            break;
+          }
+        }
+      }
+    }
+  } // Node->getOpcode
+  return Op;
+}
+
+
+//
+// Taken from the XCore backend.
+//
+SDValue HexagonTargetLowering::
+LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  unsigned JTI = JT->getIndex();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
+
+  // Mark all jump table targets as address taken.
+  const std::vector<MachineJumpTableEntry> &JTE = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JTE[JTI].MBBs;
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    MBB->setHasAddressTaken();
+    // This line is needed to set the hasAddressTaken flag on the BasicBlock
+    // object.
+    BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock()));
+  }
+
+  SDValue JumpTableBase = DAG.getNode(HexagonISD::WrapperJT, dl,
+                                      getPointerTy(), TargetJT);
+  SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index,
+                                   DAG.getConstant(2, MVT::i32));
+  SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase,
+                                  ShiftIndex);
+  SDValue LoadTarget = DAG.getLoad(MVT::i32, dl, Chain, JTAddress,
+                                   MachinePointerInfo(), false, false, false,
+                                   0);
+  return DAG.getNode(HexagonISD::BR_JT, dl, MVT::Other, Chain, LoadTarget);
+}
+
+
+SDValue
+HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+
+  unsigned SPReg = getStackPointerRegisterToSaveRestore();
+
+  // Get a reference to the stack pointer.
+  SDValue StackPointer = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32);
+
+  // Subtract the dynamic size from the actual stack size to
+  // obtain the new stack size.
+  SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, StackPointer, Size);
+
+  //
+  // For Hexagon, the outgoing memory arguments area should be on top of the
+  // alloca area on the stack i.e., the outgoing memory arguments should be
+  // at a lower address than the alloca area. Move the alloca area down the
+  // stack by adding back the space reserved for outgoing arguments to SP
+  // here.
+  //
+  // We do not know what the size of the outgoing args is at this point.
+  // So, we add a pseudo instruction ADJDYNALLOC that will adjust the
+  // stack pointer. We patch this instruction with the correct, known
+  // offset in emitPrologue().
+  //
+  // Use a placeholder immediate (zero) for now. This will be patched up
+  // by emitPrologue().
+  SDValue ArgAdjust = DAG.getNode(HexagonISD::ADJDYNALLOC, dl,
+                                  MVT::i32,
+                                  Sub,
+                                  DAG.getConstant(0, MVT::i32));
+
+  // The Sub result contains the new stack start address, so it
+  // must be placed in the stack pointer register.
+  SDValue CopyChain = DAG.getCopyToReg(Chain, dl,
+                                       TM.getRegisterInfo()->getStackRegister(),
+                                       Sub);
+
+  SDValue Ops[2] = { ArgAdjust, CopyChain };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue
+HexagonTargetLowering::LowerFormalArguments(SDValue Chain,
+                                            CallingConv::ID CallConv,
+                                            bool isVarArg,
+                                            const
+                                            SmallVectorImpl<ISD::InputArg> &Ins,
+                                            DebugLoc dl, SelectionDAG &DAG,
+                                            SmallVectorImpl<SDValue> &InVals)
+const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  HexagonMachineFunctionInfo *FuncInfo =
+    MF.getInfo<HexagonMachineFunctionInfo>();
+
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
+
+  // For LLVM, in the case when returning a struct by value (>8byte),
+  // the first argument is a pointer that points to the location on caller's
+  // stack where the return value will be stored. For Hexagon, the location on
+  // caller's stack is passed only when the struct size is smaller than (and
+  // equal to) 8 bytes. If not, no address will be passed into callee and
+  // callee return the result direclty through R0/R1.
+
+  SmallVector<SDValue, 4> MemOps;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+    unsigned ObjSize;
+    unsigned StackLocation;
+    int FI;
+
+    if (   (VA.isRegLoc() && !Flags.isByVal())
+        || (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() > 8)) {
+      // Arguments passed in registers
+      // 1. int, long long, ptr args that get allocated in register.
+      // 2. Large struct that gets an register to put its address in.
+      EVT RegVT = VA.getLocVT();
+      if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
+          RegVT == MVT::i32 || RegVT == MVT::f32) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      } else if (RegVT == MVT::i64) {
+        unsigned VReg =
+          RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      } else {
+        assert (0);
+      }
+    } else if (VA.isRegLoc() && Flags.isByVal() && Flags.getByValSize() <= 8) {
+      assert (0 && "ByValSize must be bigger than 8 bytes");
+    } else {
+      // Sanity check.
+      assert(VA.isMemLoc());
+
+      if (Flags.isByVal()) {
+        // If it's a byval parameter, then we need to compute the
+        // "real" size, not the size of the pointer.
+        ObjSize = Flags.getByValSize();
+      } else {
+        ObjSize = VA.getLocVT().getStoreSizeInBits() >> 3;
+      }
+
+      StackLocation = HEXAGON_LRFP_SIZE + VA.getLocMemOffset();
+      // Create the frame index object for this incoming parameter...
+      FI = MFI->CreateFixedObject(ObjSize, StackLocation, true);
+
+      // Create the SelectionDAG nodes cordl, responding to a load
+      // from this parameter.
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+
+      if (Flags.isByVal()) {
+        // If it's a pass-by-value aggregate, then do not dereference the stack
+        // location. Instead, we should generate a reference to the stack
+        // location.
+        InVals.push_back(FIN);
+      } else {
+        InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                                     MachinePointerInfo(), false, false,
+                                     false, 0));
+      }
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0],
+                        MemOps.size());
+
+  if (isVarArg) {
+    // This will point to the next argument passed via stack.
+    int FrameIndex = MFI->CreateFixedObject(Hexagon_PointerSize,
+                                            HEXAGON_LRFP_SIZE +
+                                            CCInfo.getNextStackOffset(),
+                                            true);
+    FuncInfo->setVarArgsFrameIndex(FrameIndex);
+  }
+
+  return Chain;
+}
+
+SDValue
+HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  // VASTART stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  MachineFunction &MF = DAG.getMachineFunction();
+  HexagonMachineFunctionInfo *QFI = MF.getInfo<HexagonMachineFunctionInfo>();
+  SDValue Addr = DAG.getFrameIndex(QFI->getVarArgsFrameIndex(), MVT::i32);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), Op.getDebugLoc(), Addr,
+                      Op.getOperand(1), MachinePointerInfo(SV), false,
+                      false, 0);
+}
+
+SDValue
+HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue CC = Op.getOperand(4);
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  DebugLoc dl = Op.getDebugLoc();
+  SDNode* OpNode = Op.getNode();
+  EVT SVT = OpNode->getValueType(0);
+
+  SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i1, LHS, RHS, CC);
+  return DAG.getNode(ISD::SELECT, dl, SVT, Cond, TrueVal, FalseVal);
+}
+
+SDValue
+HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  EVT ValTy = Op.getValueType();
+
+  DebugLoc dl = Op.getDebugLoc();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  SDValue Res;
+  if (CP->isMachineConstantPoolEntry())
+    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), ValTy,
+                                    CP->getAlignment());
+  else
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), ValTy,
+                                    CP->getAlignment());
+  return DAG.getNode(HexagonISD::CONST32, dl, ValTy, Res);
+}
+
+SDValue
+HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(4, MVT::i32);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+}
+
+SDValue
+HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  const HexagonRegisterInfo  *TRI = TM.getRegisterInfo();
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                         TRI->getFrameRegister(), VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
+
+SDValue HexagonTargetLowering::LowerMEMBARRIER(SDValue Op,
+                                               SelectionDAG& DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other,  Op.getOperand(0));
+}
+
+
+SDValue HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op,
+                                                 SelectionDAG& DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+
+SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDValue Result;
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+  DebugLoc dl = Op.getDebugLoc();
+  Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+
+  const HexagonTargetObjectFile &TLOF =
+      static_cast<const HexagonTargetObjectFile &>(getObjFileLowering());
+  if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
+    return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), Result);
+  }
+
+  return DAG.getNode(HexagonISD::CONST32, dl, getPointerTy(), Result);
+}
+
+SDValue
+HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue BA_SD =  DAG.getTargetBlockAddress(BA, MVT::i32);
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), BA_SD);
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
+                                             &targetmachine)
+  : TargetLowering(targetmachine, new HexagonTargetObjectFile()),
+    TM(targetmachine) {
+
+    const HexagonRegisterInfo* QRI = TM.getRegisterInfo();
+
+    // Set up the register classes.
+    addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
+    addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
+
+    if (QRI->Subtarget.hasV5TOps()) {
+      addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
+      addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
+    }
+
+    addRegisterClass(MVT::i1, &Hexagon::PredRegsRegClass);
+
+    computeRegisterProperties();
+
+    // Align loop entry
+    setPrefLoopAlignment(4);
+
+    // Limits for inline expansion of memcpy/memmove
+    MaxStoresPerMemcpy = 6;
+    MaxStoresPerMemmove = 6;
+
+    //
+    // Library calls for unsupported operations
+    //
+
+    setLibcallName(RTLIB::SINTTOFP_I128_F64, "__hexagon_floattidf");
+    setLibcallName(RTLIB::SINTTOFP_I128_F32, "__hexagon_floattisf");
+
+    setLibcallName(RTLIB::FPTOUINT_F32_I128, "__hexagon_fixunssfti");
+    setLibcallName(RTLIB::FPTOUINT_F64_I128, "__hexagon_fixunsdfti");
+
+    setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti");
+    setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti");
+
+    setLibcallName(RTLIB::SDIV_I32, "__hexagon_divsi3");
+    setOperationAction(ISD::SDIV,  MVT::i32, Expand);
+    setLibcallName(RTLIB::SREM_I32, "__hexagon_umodsi3");
+    setOperationAction(ISD::SREM,  MVT::i32, Expand);
+
+    setLibcallName(RTLIB::SDIV_I64, "__hexagon_divdi3");
+    setOperationAction(ISD::SDIV,  MVT::i64, Expand);
+    setLibcallName(RTLIB::SREM_I64, "__hexagon_moddi3");
+    setOperationAction(ISD::SREM,  MVT::i64, Expand);
+
+    setLibcallName(RTLIB::UDIV_I32, "__hexagon_udivsi3");
+    setOperationAction(ISD::UDIV,  MVT::i32, Expand);
+
+    setLibcallName(RTLIB::UDIV_I64, "__hexagon_udivdi3");
+    setOperationAction(ISD::UDIV,  MVT::i64, Expand);
+
+    setLibcallName(RTLIB::UREM_I32, "__hexagon_umodsi3");
+    setOperationAction(ISD::UREM,  MVT::i32, Expand);
+
+    setLibcallName(RTLIB::UREM_I64, "__hexagon_umoddi3");
+    setOperationAction(ISD::UREM,  MVT::i64, Expand);
+
+    setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
+    setOperationAction(ISD::FDIV,  MVT::f32, Expand);
+
+    setLibcallName(RTLIB::DIV_F64, "__hexagon_divdf3");
+    setOperationAction(ISD::FDIV,  MVT::f64, Expand);
+
+    setOperationAction(ISD::FSQRT,  MVT::f32, Expand);
+    setOperationAction(ISD::FSQRT,  MVT::f64, Expand);
+    setOperationAction(ISD::FSIN,  MVT::f32, Expand);
+    setOperationAction(ISD::FSIN,  MVT::f64, Expand);
+
+    if (QRI->Subtarget.hasV5TOps()) {
+      // Hexagon V5 Support.
+      setOperationAction(ISD::FADD,       MVT::f32, Legal);
+      setOperationAction(ISD::FADD,       MVT::f64, Legal);
+      setOperationAction(ISD::FP_EXTEND,  MVT::f32, Legal);
+      setCondCodeAction(ISD::SETOEQ,      MVT::f32, Legal);
+      setCondCodeAction(ISD::SETOEQ,      MVT::f64, Legal);
+      setCondCodeAction(ISD::SETUEQ,      MVT::f32, Legal);
+      setCondCodeAction(ISD::SETUEQ,      MVT::f64, Legal);
+
+      setCondCodeAction(ISD::SETOGE,      MVT::f32, Legal);
+      setCondCodeAction(ISD::SETOGE,      MVT::f64, Legal);
+      setCondCodeAction(ISD::SETUGE,      MVT::f32, Legal);
+      setCondCodeAction(ISD::SETUGE,      MVT::f64, Legal);
+
+      setCondCodeAction(ISD::SETOGT,      MVT::f32, Legal);
+      setCondCodeAction(ISD::SETOGT,      MVT::f64, Legal);
+      setCondCodeAction(ISD::SETUGT,      MVT::f32, Legal);
+      setCondCodeAction(ISD::SETUGT,      MVT::f64, Legal);
+
+      setCondCodeAction(ISD::SETOLE,      MVT::f32, Legal);
+      setCondCodeAction(ISD::SETOLE,      MVT::f64, Legal);
+      setCondCodeAction(ISD::SETOLT,      MVT::f32, Legal);
+      setCondCodeAction(ISD::SETOLT,      MVT::f64, Legal);
+
+      setOperationAction(ISD::ConstantFP,  MVT::f32, Legal);
+      setOperationAction(ISD::ConstantFP,  MVT::f64, Legal);
+
+      setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
+      setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
+      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
+
+      setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+      setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+      setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+
+      setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+
+      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
+      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
+      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
+
+      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal);
+      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal);
+
+      setOperationAction(ISD::FABS,  MVT::f32, Legal);
+      setOperationAction(ISD::FABS,  MVT::f64, Expand);
+
+      setOperationAction(ISD::FNEG,  MVT::f32, Legal);
+      setOperationAction(ISD::FNEG,  MVT::f64, Expand);
+    } else {
+
+      // Expand fp<->uint.
+      setOperationAction(ISD::FP_TO_SINT,  MVT::i32, Expand);
+      setOperationAction(ISD::FP_TO_UINT,  MVT::i32, Expand);
+
+      setOperationAction(ISD::SINT_TO_FP,  MVT::i32, Expand);
+      setOperationAction(ISD::UINT_TO_FP,  MVT::i32, Expand);
+
+      setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
+      setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
+
+      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
+      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
+
+      setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
+      setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
+
+      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
+      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
+
+      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
+      setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
+
+      setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
+      setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
+
+      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
+      setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
+
+      setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
+      setOperationAction(ISD::FADD,  MVT::f64, Expand);
+
+      setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
+      setOperationAction(ISD::FADD,  MVT::f32, Expand);
+
+      setLibcallName(RTLIB::FPEXT_F32_F64, "__hexagon_extendsfdf2");
+      setOperationAction(ISD::FP_EXTEND,  MVT::f32, Expand);
+
+      setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
+      setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+
+      setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
+      setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
+      setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+
+      setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
+      setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
+      setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
+
+      setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
+      setCondCodeAction(ISD::SETOGT, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
+      setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
+      setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
+
+      setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
+      setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
+      setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+
+      setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
+      setCondCodeAction(ISD::SETOLT, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
+      setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+
+      setLibcallName(RTLIB::MUL_F64, "__hexagon_muldf3");
+      setOperationAction(ISD::FMUL, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
+      setOperationAction(ISD::MUL, MVT::f32, Expand);
+
+      setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
+      setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
+
+      setLibcallName(RTLIB::SUB_F64, "__hexagon_subdf3");
+      setOperationAction(ISD::SUB, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
+      setOperationAction(ISD::SUB, MVT::f32, Expand);
+
+      setLibcallName(RTLIB::FPROUND_F64_F32, "__hexagon_truncdfsf2");
+      setOperationAction(ISD::FP_ROUND, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::UO_F64, "__hexagon_unorddf2");
+      setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::O_F64, "__hexagon_unorddf2");
+      setCondCodeAction(ISD::SETO, MVT::f64, Expand);
+
+      setLibcallName(RTLIB::O_F32, "__hexagon_unordsf2");
+      setCondCodeAction(ISD::SETO, MVT::f32, Expand);
+
+      setLibcallName(RTLIB::UO_F32, "__hexagon_unordsf2");
+      setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
+
+      setOperationAction(ISD::FABS,  MVT::f32, Expand);
+      setOperationAction(ISD::FABS,  MVT::f64, Expand);
+      setOperationAction(ISD::FNEG,  MVT::f32, Expand);
+      setOperationAction(ISD::FNEG,  MVT::f64, Expand);
+    }
+
+    setLibcallName(RTLIB::SREM_I32, "__hexagon_modsi3");
+    setOperationAction(ISD::SREM, MVT::i32, Expand);
+
+    setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
+    setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
+    setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal);
+    setIndexedLoadAction(ISD::POST_INC, MVT::i64, Legal);
+
+    setIndexedStoreAction(ISD::POST_INC, MVT::i8, Legal);
+    setIndexedStoreAction(ISD::POST_INC, MVT::i16, Legal);
+    setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal);
+    setIndexedStoreAction(ISD::POST_INC, MVT::i64, Legal);
+
+    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+
+    // Turn FP extload into load/fextend.
+    setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+    // Hexagon has a i1 sign extending load.
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+    // Turn FP truncstore into trunc + store.
+    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+    // Custom legalize GlobalAddress nodes into CONST32.
+    setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+    setOperationAction(ISD::GlobalAddress, MVT::i8, Custom);
+    setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+    // Truncate action?
+    setOperationAction(ISD::TRUNCATE, MVT::i64, Expand);
+
+    // Hexagon doesn't have sext_inreg, replace them with shl/sra.
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+    // Hexagon has no REM or DIVREM operations.
+    setOperationAction(ISD::UREM, MVT::i32, Expand);
+    setOperationAction(ISD::SREM, MVT::i32, Expand);
+    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::SREM, MVT::i64, Expand);
+    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+
+    setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+    // Lower SELECT_CC to SETCC and SELECT.
+    setOperationAction(ISD::SELECT_CC, MVT::i32,   Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::i64,   Custom);
+
+    if (QRI->Subtarget.hasV5TOps()) {
+
+      // We need to make the operation type of SELECT node to be Custom,
+      // such that we don't go into the infinite loop of
+      // select ->  setcc -> select_cc -> select loop.
+      setOperationAction(ISD::SELECT, MVT::f32, Custom);
+      setOperationAction(ISD::SELECT, MVT::f64, Custom);
+
+      setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+      setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+      setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+    } else {
+
+      // Hexagon has no select or setcc: expand to SELECT_CC.
+      setOperationAction(ISD::SELECT, MVT::f32, Expand);
+      setOperationAction(ISD::SELECT, MVT::f64, Expand);
+
+      // This is a workaround documented in DAGCombiner.cpp:2892 We don't
+      // support SELECT_CC on every type.
+      setOperationAction(ISD::SELECT_CC, MVT::Other,   Expand);
+
+    }
+
+    setOperationAction(ISD::BRIND, MVT::Other, Expand);
+    if (EmitJumpTables) {
+      setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+    } else {
+      setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+    }
+    // Increase jump tables cutover to 5, was 4.
+    setMinimumJumpTableEntries(5);
+
+    setOperationAction(ISD::BR_CC, MVT::Other, Expand);
+    setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+    setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+    setOperationAction(ISD::BR_CC, MVT::i1,  Expand);
+    setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+    setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+
+    setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
+    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+    setOperationAction(ISD::FSIN , MVT::f64, Expand);
+    setOperationAction(ISD::FCOS , MVT::f64, Expand);
+    setOperationAction(ISD::FREM , MVT::f64, Expand);
+    setOperationAction(ISD::FSIN , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS , MVT::f32, Expand);
+    setOperationAction(ISD::FREM , MVT::f32, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+    // In V4, we have double word add/sub with carry. The problem with
+    // modelling this instruction is that it produces 2 results - Rdd and Px.
+    // To model update of Px, we will have to use Defs[p0..p3] which will
+    // cause any predicate live range to spill. So, we pretend we dont't
+    // have these instructions.
+    setOperationAction(ISD::ADDE, MVT::i8, Expand);
+    setOperationAction(ISD::ADDE, MVT::i16, Expand);
+    setOperationAction(ISD::ADDE, MVT::i32, Expand);
+    setOperationAction(ISD::ADDE, MVT::i64, Expand);
+    setOperationAction(ISD::SUBE, MVT::i8, Expand);
+    setOperationAction(ISD::SUBE, MVT::i16, Expand);
+    setOperationAction(ISD::SUBE, MVT::i32, Expand);
+    setOperationAction(ISD::SUBE, MVT::i64, Expand);
+    setOperationAction(ISD::ADDC, MVT::i8, Expand);
+    setOperationAction(ISD::ADDC, MVT::i16, Expand);
+    setOperationAction(ISD::ADDC, MVT::i32, Expand);
+    setOperationAction(ISD::ADDC, MVT::i64, Expand);
+    setOperationAction(ISD::SUBC, MVT::i8, Expand);
+    setOperationAction(ISD::SUBC, MVT::i16, Expand);
+    setOperationAction(ISD::SUBC, MVT::i32, Expand);
+    setOperationAction(ISD::SUBC, MVT::i64, Expand);
+
+    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+    setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+    setOperationAction(ISD::CTTZ , MVT::i64, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+    setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+    setOperationAction(ISD::CTLZ , MVT::i64, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+    setOperationAction(ISD::ROTL , MVT::i32, Expand);
+    setOperationAction(ISD::ROTR , MVT::i32, Expand);
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+    setOperationAction(ISD::FPOW , MVT::f64, Expand);
+    setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+    setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+    setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+    setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+    setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
+    setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+    setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+    setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+    setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+
+    setOperationAction(ISD::EH_RETURN,     MVT::Other, Expand);
+
+    if (TM.getSubtargetImpl()->isSubtargetV2()) {
+      setExceptionPointerRegister(Hexagon::R20);
+      setExceptionSelectorRegister(Hexagon::R21);
+    } else {
+      setExceptionPointerRegister(Hexagon::R0);
+      setExceptionSelectorRegister(Hexagon::R1);
+    }
+
+    // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+    setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+
+    // Use the default implementation.
+    setOperationAction(ISD::VAARG             , MVT::Other, Expand);
+    setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+    setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+    setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+    setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+
+
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+    setOperationAction(ISD::INLINEASM         , MVT::Other, Custom);
+
+    setMinFunctionAlignment(2);
+
+    // Needed for DYNAMIC_STACKALLOC expansion.
+    unsigned StackRegister = TM.getRegisterInfo()->getStackRegister();
+    setStackPointerRegisterToSaveRestore(StackRegister);
+    setSchedulingPreference(Sched::VLIW);
+}
+
+
+const char*
+HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+    default: return 0;
+    case HexagonISD::CONST32:     return "HexagonISD::CONST32";
+    case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP";
+    case HexagonISD::CONST32_Int_Real: return "HexagonISD::CONST32_Int_Real";
+    case HexagonISD::ADJDYNALLOC: return "HexagonISD::ADJDYNALLOC";
+    case HexagonISD::CMPICC:      return "HexagonISD::CMPICC";
+    case HexagonISD::CMPFCC:      return "HexagonISD::CMPFCC";
+    case HexagonISD::BRICC:       return "HexagonISD::BRICC";
+    case HexagonISD::BRFCC:       return "HexagonISD::BRFCC";
+    case HexagonISD::SELECT_ICC:  return "HexagonISD::SELECT_ICC";
+    case HexagonISD::SELECT_FCC:  return "HexagonISD::SELECT_FCC";
+    case HexagonISD::Hi:          return "HexagonISD::Hi";
+    case HexagonISD::Lo:          return "HexagonISD::Lo";
+    case HexagonISD::FTOI:        return "HexagonISD::FTOI";
+    case HexagonISD::ITOF:        return "HexagonISD::ITOF";
+    case HexagonISD::CALL:        return "HexagonISD::CALL";
+    case HexagonISD::RET_FLAG:    return "HexagonISD::RET_FLAG";
+    case HexagonISD::BR_JT:       return "HexagonISD::BR_JT";
+    case HexagonISD::TC_RETURN:   return "HexagonISD::TC_RETURN";
+  }
+}
+
+bool
+HexagonTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  EVT MTy1 = EVT::getEVT(Ty1);
+  EVT MTy2 = EVT::getEVT(Ty2);
+  if (!MTy1.isSimple() || !MTy2.isSimple()) {
+    return false;
+  }
+  return ((MTy1.getSimpleVT() == MVT::i64) && (MTy2.getSimpleVT() == MVT::i32));
+}
+
+bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isSimple() || !VT2.isSimple()) {
+    return false;
+  }
+  return ((VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32));
+}
+
+SDValue
+HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+    default: llvm_unreachable("Should not custom lower this!");
+    case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+      // Frame & Return address.  Currently unimplemented.
+    case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+    case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+    case ISD::GlobalTLSAddress:
+                          llvm_unreachable("TLS not implemented for Hexagon.");
+    case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op, DAG);
+    case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, DAG);
+    case ISD::GlobalAddress:      return LowerGLOBALADDRESS(Op, DAG);
+    case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+    case ISD::VASTART:            return LowerVASTART(Op, DAG);
+    case ISD::BR_JT:              return LowerBR_JT(Op, DAG);
+
+    case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+    case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+    case ISD::SELECT:             return Op;
+    case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+    case ISD::INLINEASM:          return LowerINLINEASM(Op, DAG);
+
+  }
+}
+
+
+
+//===----------------------------------------------------------------------===//
+//                           Hexagon Scheduler Hooks
+//===----------------------------------------------------------------------===//
+MachineBasicBlock *
+HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                   MachineBasicBlock *BB)
+const {
+  switch (MI->getOpcode()) {
+    case Hexagon::ADJDYNALLOC: {
+      MachineFunction *MF = BB->getParent();
+      HexagonMachineFunctionInfo *FuncInfo =
+        MF->getInfo<HexagonMachineFunctionInfo>();
+      FuncInfo->addAllocaAdjustInst(MI);
+      return BB;
+    }
+    default: llvm_unreachable("Unexpected instr type to insert");
+  } // switch
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+std::pair<unsigned, const TargetRegisterClass*>
+HexagonTargetLowering::getRegForInlineAsmConstraint(const
+                                                    std::string &Constraint,
+                                                    EVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':   // R0-R31
+       switch (VT.getSimpleVT().SimpleTy) {
+       default:
+         llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
+       case MVT::i32:
+       case MVT::i16:
+       case MVT::i8:
+       case MVT::f32:
+         return std::make_pair(0U, &Hexagon::IntRegsRegClass);
+       case MVT::i64:
+       case MVT::f64:
+         return std::make_pair(0U, &Hexagon::DoubleRegsRegClass);
+      }
+    default:
+      llvm_unreachable("Unknown asm register class");
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  const HexagonRegisterInfo* QRI = TM.getRegisterInfo();
+  return QRI->Subtarget.hasV5TOps();
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented by
+/// AM is legal for this target, for a load/store of the specified type.
+bool HexagonTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                                  Type *Ty) const {
+  // Allows a signed-extended 11-bit immediate field.
+  if (AM.BaseOffs <= -(1LL << 13) || AM.BaseOffs >= (1LL << 13)-1) {
+    return false;
+  }
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV) {
+    return false;
+  }
+
+  int Scale = AM.Scale;
+  if (Scale < 0) Scale = -Scale;
+  switch (Scale) {
+  case 0:  // No scale reg, "r+i", "r", or just "i".
+    break;
+  default: // No scaled addressing mode.
+    return false;
+  }
+  return true;
+}
+
+/// isLegalICmpImmediate - Return true if the specified immediate is legal
+/// icmp immediate, that is the target has icmp instructions which can compare
+/// a register against the immediate without having to materialize the
+/// immediate into a register.
+bool HexagonTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return Imm >= -512 && Imm <= 511;
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
+                                 SDValue Callee,
+                                 CallingConv::ID CalleeCC,
+                                 bool isVarArg,
+                                 bool isCalleeStructRet,
+                                 bool isCallerStructRet,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 const SmallVectorImpl<ISD::InputArg> &Ins,
+                                 SelectionDAG& DAG) const {
+  const Function *CallerF = DAG.getMachineFunction().getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  // ***************************************************************************
+  //  Look for obvious safe cases to perform tail call optimization that do not
+  //  require ABI changes.
+  // ***************************************************************************
+
+  // If this is a tail call via a function pointer, then don't do it!
+  if (!(dyn_cast<GlobalAddressSDNode>(Callee))
+      && !(dyn_cast<ExternalSymbolSDNode>(Callee))) {
+    return false;
+  }
+
+  // Do not optimize if the calling conventions do not match.
+  if (!CCMatch)
+    return false;
+
+  // Do not tail call optimize vararg calls.
+  if (isVarArg)
+    return false;
+
+  // Also avoid tail call optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // In addition to the cases above, we also disable Tail Call Optimization if
+  // the calling convention code that at least one outgoing argument needs to
+  // go on the stack. We cannot check that here because at this point that
+  // information is not available.
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
new file mode 100644
index 000000000000..3279cc652434
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -0,0 +1,173 @@
+//===-- HexagonISelLowering.h - Hexagon DAG Lowering Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Hexagon uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef Hexagon_ISELLOWERING_H
+#define Hexagon_ISELLOWERING_H
+
+#include "Hexagon.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace HexagonISD {
+    enum {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      CONST32,
+      CONST32_GP,  // For marking data present in GP.
+      CONST32_Int_Real,
+      FCONST32,
+      SETCC,
+      ADJDYNALLOC,
+      ARGEXTEND,
+
+      CMPICC,      // Compare two GPR operands, set icc.
+      CMPFCC,      // Compare two FP operands, set fcc.
+      BRICC,       // Branch to dest on icc condition
+      BRFCC,       // Branch to dest on fcc condition
+      SELECT_ICC,  // Select between two values using the current ICC flags.
+      SELECT_FCC,  // Select between two values using the current FCC flags.
+
+      Hi, Lo,      // Hi/Lo operations, typically on a global address.
+
+      FTOI,        // FP to Int within a FP register.
+      ITOF,        // Int to FP within a FP register.
+
+      CALL,        // A call instruction.
+      RET_FLAG,    // Return with a flag operand.
+      BR_JT,       // Jump table.
+      BARRIER,     // Memory barrier.
+      WrapperJT,
+      WrapperCP,
+      WrapperCombineII,
+      WrapperCombineRR,
+      WrapperCombineRI_V4,
+      WrapperCombineIR_V4,
+      WrapperPackhl,
+      WrapperSplatB,
+      WrapperSplatH,
+      WrapperShuffEB,
+      WrapperShuffEH,
+      WrapperShuffOB,
+      WrapperShuffOH,
+      TC_RETURN
+    };
+  }
+
+  class HexagonTargetLowering : public TargetLowering {
+    int VarArgsFrameOffset;   // Frame offset to start of varargs area.
+
+    bool CanReturnSmallStruct(const Function* CalleeFn,
+                              unsigned& RetSize) const;
+
+  public:
+    HexagonTargetMachine &TM;
+    explicit HexagonTargetLowering(HexagonTargetMachine &targetmachine);
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool
+    IsEligibleForTailCallOptimization(SDValue Callee,
+                                      CallingConv::ID CalleeCC,
+                                      bool isVarArg,
+                                      bool isCalleeStructRet,
+                                      bool isCallerStructRet,
+                                      const
+                                      SmallVectorImpl<ISD::OutputArg> &Outs,
+                                      const SmallVectorImpl<SDValue> &OutVals,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      SelectionDAG& DAG) const;
+
+    virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
+    virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+    SDValue  LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFormalArguments(SDValue Chain,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 const SmallVectorImpl<ISD::InputArg> &Ins,
+                                 DebugLoc dl, SelectionDAG &DAG,
+                                 SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                      SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals,
+                            const SmallVectorImpl<SDValue> &OutVals,
+                            SDValue Callee) const;
+
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const;
+    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerReturn(SDValue Chain,
+                        CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        DebugLoc dl, SelectionDAG &DAG) const;
+
+    virtual MachineBasicBlock
+    *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                 MachineBasicBlock *BB) const;
+
+    SDValue  LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue  LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    virtual EVT getSetCCResultType(EVT VT) const {
+      return MVT::i1;
+    }
+
+    virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                            SDValue &Base, SDValue &Offset,
+                                            ISD::MemIndexedMode &AM,
+                                            SelectionDAG &DAG) const;
+
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint,
+                                 EVT VT) const;
+
+    // Intrinsics
+    virtual SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                            SelectionDAG &DAG) const;
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    /// The type may be VoidTy, in which case only return true if the addressing
+    /// mode is legal for a load/store of any legal type.
+    /// TODO: Handle pre/postinc as well.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    virtual bool isLegalICmpImmediate(int64_t Imm) const;
+  };
+} // end namespace llvm
+
+#endif    // Hexagon_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
new file mode 100644
index 000000000000..587fa7d7f10e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -0,0 +1,379 @@
+//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//                         Hexagon Intruction Flags +
+//
+//                    *** Must match HexagonBaseInfo.h ***
+//===----------------------------------------------------------------------===//
+
+class IType<bits<5> t> {
+  bits<5> Value = t;
+}
+def TypePSEUDO : IType<0>;
+def TypeALU32  : IType<1>;
+def TypeCR     : IType<2>;
+def TypeJR     : IType<3>;
+def TypeJ      : IType<4>;
+def TypeLD     : IType<5>;
+def TypeST     : IType<6>;
+def TypeSYSTEM : IType<7>;
+def TypeXTYPE  : IType<8>;
+def TypeENDLOOP: IType<31>;
+
+// Maintain list of valid subtargets for each instruction.
+class SubTarget<bits<4> value> {
+  bits<4> Value = value;
+}
+
+def HasV2SubT     : SubTarget<0xf>;
+def HasV2SubTOnly : SubTarget<0x1>;
+def NoV2SubT      : SubTarget<0x0>;
+def HasV3SubT     : SubTarget<0xe>;
+def HasV3SubTOnly : SubTarget<0x2>;
+def NoV3SubT      : SubTarget<0x1>;
+def HasV4SubT     : SubTarget<0xc>;
+def NoV4SubT      : SubTarget<0x3>;
+def HasV5SubT     : SubTarget<0x8>;
+def NoV5SubT      : SubTarget<0x7>;
+
+// Addressing modes for load/store instructions
+class AddrModeType<bits<3> value> {
+  bits<3> Value = value;
+}
+
+def NoAddrMode     : AddrModeType<0>;  // No addressing mode
+def Absolute       : AddrModeType<1>;  // Absolute addressing mode
+def AbsoluteSet    : AddrModeType<2>;  // Absolute set addressing mode
+def BaseImmOffset  : AddrModeType<3>;  // Indirect with offset
+def BaseLongOffset : AddrModeType<4>;  // Indirect with long offset
+def BaseRegOffset  : AddrModeType<5>;  // Indirect with register offset
+
+class MemAccessSize<bits<3> value> {
+  bits<3> Value = value;
+}
+
+def NoMemAccess      : MemAccessSize<0>;// Not a memory acces instruction.
+def ByteAccess       : MemAccessSize<1>;// Byte access instruction (memb).
+def HalfWordAccess   : MemAccessSize<2>;// Half word access instruction (memh).
+def WordAccess       : MemAccessSize<3>;// Word access instrution (memw).
+def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
+
+
+//===----------------------------------------------------------------------===//
+//                         Intruction Class Declaration +
+//===----------------------------------------------------------------------===//
+
+class OpcodeHexagon {
+  field bits<32> Inst = ?; // Default to an invalid insn.
+  bits<4> IClass = 0; // ICLASS
+  bits<2> IParse = 0; // Parse bits.
+
+  let Inst{31-28} = IClass;
+  let Inst{15-14} = IParse;
+
+  bits<1> zero = 0;
+}
+
+class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
+                  string cstr, InstrItinClass itin, IType type>
+  : Instruction, OpcodeHexagon {
+  let Namespace = "Hexagon";
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+  let Constraints = cstr;
+  let Itinerary = itin;
+  let Size = 4;
+
+  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+
+  // Instruction type according to the ISA.
+  IType Type = type;
+  let TSFlags{4-0} = Type.Value;
+
+  // Solo instructions, i.e., those that cannot be in a packet with others.
+  bits<1> isSolo = 0;
+  let TSFlags{5} = isSolo;
+
+  // Predicated instructions.
+  bits<1> isPredicated = 0;
+  let TSFlags{6} = isPredicated;
+  bits<1> isPredicatedFalse = 0;
+  let TSFlags{7} = isPredicatedFalse;
+  bits<1> isPredicatedNew = 0;
+  let TSFlags{8} = isPredicatedNew;
+
+  // New-value insn helper fields.
+  bits<1> isNewValue = 0;
+  let TSFlags{9} = isNewValue; // New-value consumer insn.
+  bits<1> hasNewValue = 0;
+  let TSFlags{10} = hasNewValue; // New-value producer insn.
+  bits<3> opNewValue = 0;
+  let TSFlags{13-11} = opNewValue; // New-value produced operand.
+  bits<2> opNewBits = 0;
+  let TSFlags{15-14} = opNewBits; // New-value opcode bits location: 0, 8, 16.
+  bits<1> isNVStorable = 0;
+  let TSFlags{16} = isNVStorable; // Store that can become new-value store.
+  bits<1> isNVStore = 0;
+  let TSFlags{17} = isNVStore; // New-value store insn.
+
+  // Immediate extender helper fields.
+  bits<1> isExtendable = 0;
+  let TSFlags{18} = isExtendable; // Insn may be extended.
+  bits<1> isExtended = 0;
+  let TSFlags{19} = isExtended; // Insn must be extended.
+  bits<3> opExtendable = 0;
+  let TSFlags{22-20} = opExtendable; // Which operand may be extended.
+  bits<1> isExtentSigned = 0;
+  let TSFlags{23} = isExtentSigned; // Signed or unsigned range.
+  bits<5> opExtentBits = 0;
+  let TSFlags{28-24} = opExtentBits; //Number of bits of range before extending.
+
+  // If an instruction is valid on a subtarget (v2-v5), set the corresponding
+  // bit from validSubTargets. v2 is the least significant bit.
+  // By default, instruction is valid on all subtargets.
+  SubTarget validSubTargets = HasV2SubT;
+  let TSFlags{32-29} = validSubTargets.Value;
+
+  // Addressing mode for load/store instructions.
+  AddrModeType addrMode = NoAddrMode;
+  let TSFlags{35-33} = addrMode.Value;
+
+  // Memory access size for mem access instructions (load/store)
+  MemAccessSize accessSize = NoMemAccess;
+  let TSFlags{38-36} = accessSize.Value;
+
+  // Fields used for relation models.
+  string BaseOpcode = "";
+  string CextOpcode = "";
+  string PredSense = "";
+  string PNewValue = "";
+  string InputType = "";    // Input is "imm" or "reg" type.
+  string isMEMri = "false"; // Set to "true" for load/store with MEMri operand.
+  string isFloat = "false"; // Set to "true" for the floating-point load/store.
+  string isBrTaken = ""; // Set to "true"/"false" for jump instructions
+
+  let PredSense = !if(isPredicated, !if(isPredicatedFalse, "false", "true"),
+                                    "");
+  let PNewValue = !if(isPredicatedNew, "new", "");
+
+  // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+}
+
+//===----------------------------------------------------------------------===//
+//                         Intruction Classes Definitions +
+//===----------------------------------------------------------------------===//
+
+// LD Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, LD, TypeLD>;
+
+let mayLoad = 1 in
+class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                  string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+// LD Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                 string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayLoad = 1 in
+class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+// ST Instruction Class in V2/V3 can take SLOT0 only.
+// ST Instruction Class in V4    can take SLOT0 & SLOT1.
+// Definition of the instruction class CHANGED from V2/V3 to V4.
+let mayStore = 1 in
+class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, ST, TypeST>;
+
+class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayStore = 1 in
+class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, ST0, TypeST>;
+
+// ST Instruction Class in V2/V3 can take SLOT0 only.
+// ST Instruction Class in V4    can take SLOT0 & SLOT1.
+// Definition of the instruction class CHANGED from V2/V3 to V4.
+class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                 string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
+
+// SYSTEM Instruction Class in V4 can take SLOT0 only
+// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1.
+class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, SYS, TypeSYSTEM>;
+
+// ALU32 Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU32, TypeALU32>;
+
+// ALU64 Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
+class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU64, TypeXTYPE>;
+
+class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : ALU64Inst<outs, ins, asmstr, pattern, cstr>;
+
+
+// M Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
+class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, M, TypeXTYPE>;
+
+// M Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
+class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+    : MInst<outs, ins, asmstr, pattern, cstr>;
+
+// S Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
+class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, S, TypeXTYPE>;
+
+// S Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Definition of the instruction class NOT CHANGED.
+// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
+class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : SInst<outs, ins, asmstr, pattern, cstr>;
+
+// J Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class JInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, J, TypeJ>;
+
+// JR Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, JR, TypeJR>;
+
+// CR Instruction Class in V2/V3/V4.
+// Definition of the instruction class NOT CHANGED.
+class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, CR, TypeCR>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class Endloop<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, ENDLOOP, TypeENDLOOP>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDO, TypePSEUDO>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr="">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDOM, TypePSEUDO>;
+
+//===----------------------------------------------------------------------===//
+//                         Intruction Classes Definitions -
+//===----------------------------------------------------------------------===//
+
+
+//
+// ALU32 patterns
+//.
+class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
+
+class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
+
+class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
+
+class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
+
+//
+// ALU64 patterns.
+//
+class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+   : ALU64Inst<outs, ins, asmstr, pattern, cstr>;
+
+class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern,
+               string cstr = "">
+   : ALU64Inst<outs, ins, asmstr, pattern, cstr>;
+
+// Post increment ST Instruction.
+class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayStore = 1 in
+class STInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : STInst<outs, ins, asmstr, pattern, cstr>;
+
+// Post increment LD Instruction.
+class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayLoad = 1 in
+class LDInst2PI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : LDInst<outs, ins, asmstr, pattern, cstr>;
+
+//===----------------------------------------------------------------------===//
+// V4 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormatsV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Instruction Format Definitions +
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
new file mode 100644
index 000000000000..9fda0da91612
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -0,0 +1,66 @@
+//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V4 instruction classes in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//----------------------------------------------------------------------------//
+//                         Hexagon Intruction Flags +
+//
+//                        *** Must match BaseInfo.h ***
+//----------------------------------------------------------------------------//
+
+def TypeMEMOP  : IType<9>;
+def TypeNV     : IType<10>;
+def TypePREFIX : IType<30>;
+
+//----------------------------------------------------------------------------//
+//                         Intruction Classes Definitions +
+//----------------------------------------------------------------------------//
+
+//
+// NV type instructions.
+//
+class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4, TypeNV>;
+
+class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "">
+  : NVInst<outs, ins, asmstr, pattern, cstr>;
+
+// Definition of Post increment new value store.
+class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : NVInst<outs, ins, asmstr, pattern, cstr>;
+
+// Post increment ST Instruction.
+let mayStore = 1 in
+class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "">
+  : NVInst<outs, ins, asmstr, pattern, cstr>;
+
+// New-value conditional branch.
+class NCJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : NVInst<outs, ins, asmstr, pattern, cstr>;
+
+let mayLoad = 1, mayStore = 1 in
+class MEMInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "">
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, MEM_V4, TypeMEMOP>;
+
+class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                 string cstr = "">
+  : MEMInst<outs, ins, asmstr, pattern, cstr>;
+
+let isCodeGenOnly = 1 in
+class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
+  : InstHexagon<outs, ins, asmstr, pattern, "", PREFIX, TypePREFIX>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
new file mode 100644
index 000000000000..60b12ac01c9c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -0,0 +1,2576 @@
+//===-- HexagonInstrInfo.cpp - Hexagon Instruction Information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonInstrInfo.h"
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/MathExtras.h"
+#define GET_INSTRINFO_CTOR
+#define GET_INSTRMAP_INFO
+#include "HexagonGenInstrInfo.inc"
+#include "HexagonGenDFAPacketizer.inc"
+
+using namespace llvm;
+
+///
+/// Constants for Hexagon instructions.
+///
+const int Hexagon_MEMW_OFFSET_MAX = 4095;
+const int Hexagon_MEMW_OFFSET_MIN = -4096;
+const int Hexagon_MEMD_OFFSET_MAX = 8191;
+const int Hexagon_MEMD_OFFSET_MIN = -8192;
+const int Hexagon_MEMH_OFFSET_MAX = 2047;
+const int Hexagon_MEMH_OFFSET_MIN = -2048;
+const int Hexagon_MEMB_OFFSET_MAX = 1023;
+const int Hexagon_MEMB_OFFSET_MIN = -1024;
+const int Hexagon_ADDI_OFFSET_MAX = 32767;
+const int Hexagon_ADDI_OFFSET_MIN = -32768;
+const int Hexagon_MEMD_AUTOINC_MAX = 56;
+const int Hexagon_MEMD_AUTOINC_MIN = -64;
+const int Hexagon_MEMW_AUTOINC_MAX = 28;
+const int Hexagon_MEMW_AUTOINC_MIN = -32;
+const int Hexagon_MEMH_AUTOINC_MAX = 14;
+const int Hexagon_MEMH_AUTOINC_MIN = -16;
+const int Hexagon_MEMB_AUTOINC_MAX = 7;
+const int Hexagon_MEMB_AUTOINC_MIN = -8;
+
+
+HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
+  : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
+    RI(ST, *this), Subtarget(ST) {
+}
+
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+
+
+  switch (MI->getOpcode()) {
+  default: break;
+  case Hexagon::LDriw:
+  case Hexagon::LDrid:
+  case Hexagon::LDrih:
+  case Hexagon::LDrib:
+  case Hexagon::LDriub:
+    if (MI->getOperand(2).isFI() &&
+        MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                            int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case Hexagon::STriw:
+  case Hexagon::STrid:
+  case Hexagon::STrih:
+  case Hexagon::STrib:
+    if (MI->getOperand(2).isFI() &&
+        MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) {
+      FrameIndex = MI->getOperand(0).getIndex();
+      return MI->getOperand(2).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+
+unsigned
+HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond,
+                             DebugLoc DL) const{
+
+    int BOpc   = Hexagon::JMP;
+    int BccOpc = Hexagon::JMP_c;
+
+    assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+    int regPos = 0;
+    // Check if ReverseBranchCondition has asked to reverse this branch
+    // If we want to reverse the branch an odd number of times, we want
+    // JMP_cNot.
+    if (!Cond.empty() && Cond[0].isImm() && Cond[0].getImm() == 0) {
+      BccOpc = Hexagon::JMP_cNot;
+      regPos = 1;
+    }
+
+    if (FBB == 0) {
+      if (Cond.empty()) {
+        // Due to a bug in TailMerging/CFG Optimization, we need to add a
+        // special case handling of a predicated jump followed by an
+        // unconditional jump. If not, Tail Merging and CFG Optimization go
+        // into an infinite loop.
+        MachineBasicBlock *NewTBB, *NewFBB;
+        SmallVector<MachineOperand, 4> Cond;
+        MachineInstr *Term = MBB.getFirstTerminator();
+        if (isPredicated(Term) && !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond,
+                                                 false)) {
+          MachineBasicBlock *NextBB =
+            llvm::next(MachineFunction::iterator(&MBB));
+          if (NewTBB == NextBB) {
+            ReverseBranchCondition(Cond);
+            RemoveBranch(MBB);
+            return InsertBranch(MBB, TBB, 0, Cond, DL);
+          }
+        }
+        BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+      } else {
+        BuildMI(&MBB, DL,
+                get(BccOpc)).addReg(Cond[regPos].getReg()).addMBB(TBB);
+      }
+      return 1;
+    }
+
+    BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[regPos].getReg()).addMBB(TBB);
+    BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+
+    return 2;
+}
+
+
+bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  TBB = NULL;
+  FBB = NULL;
+
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+
+  // A basic block may looks like this:
+  //
+  //  [   insn
+  //     EH_LABEL
+  //      insn
+  //      insn
+  //      insn
+  //     EH_LABEL
+  //      insn     ]
+  //
+  // It has two succs but does not have a terminator
+  // Don't know how to handle it.
+  do {
+    --I;
+    if (I->isEHLabel())
+      return true;
+  } while (I != MBB.begin());
+
+  I = MBB.end();
+  --I;
+
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastInst->getOpcode() == Hexagon::JMP) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (LastInst->getOpcode() == Hexagon::JMP_c) {
+      // Block ends with fall-through true condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    if (LastInst->getOpcode() == Hexagon::JMP_cNot) {
+      // Block ends with fall-through false condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(0));
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with Hexagon::BRCOND and Hexagon:JMP, handle it.
+  if (((SecondLastInst->getOpcode() == Hexagon::BRCOND) ||
+      (SecondLastInst->getOpcode() == Hexagon::JMP_c)) &&
+      LastInst->getOpcode() == Hexagon::JMP) {
+    TBB =  SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with Hexagon::JMP_cNot and Hexagon:JMP, handle it.
+  if ((SecondLastInst->getOpcode() == Hexagon::JMP_cNot) &&
+      LastInst->getOpcode() == Hexagon::JMP) {
+    TBB =  SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(0));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two Hexagon:JMPs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == Hexagon::JMP &&
+      LastInst->getOpcode() == Hexagon::JMP) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+
+unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  int BOpc   = Hexagon::JMP;
+  int BccOpc = Hexagon::JMP_c;
+  int BccOpcNot = Hexagon::JMP_cNot;
+
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (I->getOpcode() != BOpc && I->getOpcode() != BccOpc &&
+      I->getOpcode() != BccOpcNot)
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (I->getOpcode() != BccOpc && I->getOpcode() != BccOpcNot)
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+
+/// \brief For a comparison instruction, return the source registers in
+/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it
+/// compares against in CmpValue. Return true if the comparison instruction
+/// can be analyzed.
+bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI,
+                                      unsigned &SrcReg, unsigned &SrcReg2,
+                                      int &Mask, int &Value) const {
+  unsigned Opc = MI->getOpcode();
+
+  // Set mask and the first source register.
+  switch (Opc) {
+    case Hexagon::CMPEHexagon4rr:
+    case Hexagon::CMPEQri:
+    case Hexagon::CMPEQrr:
+    case Hexagon::CMPGT64rr:
+    case Hexagon::CMPGTU64rr:
+    case Hexagon::CMPGTUri:
+    case Hexagon::CMPGTUrr:
+    case Hexagon::CMPGTri:
+    case Hexagon::CMPGTrr:
+    case Hexagon::CMPLTUrr:
+    case Hexagon::CMPLTrr:
+      SrcReg = MI->getOperand(1).getReg();
+      Mask = ~0;
+      break;
+    case Hexagon::CMPbEQri_V4:
+    case Hexagon::CMPbEQrr_sbsb_V4:
+    case Hexagon::CMPbEQrr_ubub_V4:
+    case Hexagon::CMPbGTUri_V4:
+    case Hexagon::CMPbGTUrr_V4:
+    case Hexagon::CMPbGTrr_V4:
+      SrcReg = MI->getOperand(1).getReg();
+      Mask = 0xFF;
+      break;
+    case Hexagon::CMPhEQri_V4:
+    case Hexagon::CMPhEQrr_shl_V4:
+    case Hexagon::CMPhEQrr_xor_V4:
+    case Hexagon::CMPhGTUri_V4:
+    case Hexagon::CMPhGTUrr_V4:
+    case Hexagon::CMPhGTrr_shl_V4:
+      SrcReg = MI->getOperand(1).getReg();
+      Mask = 0xFFFF;
+      break;
+  }
+
+  // Set the value/second source register.
+  switch (Opc) {
+    case Hexagon::CMPEHexagon4rr:
+    case Hexagon::CMPEQrr:
+    case Hexagon::CMPGT64rr:
+    case Hexagon::CMPGTU64rr:
+    case Hexagon::CMPGTUrr:
+    case Hexagon::CMPGTrr:
+    case Hexagon::CMPbEQrr_sbsb_V4:
+    case Hexagon::CMPbEQrr_ubub_V4:
+    case Hexagon::CMPbGTUrr_V4:
+    case Hexagon::CMPbGTrr_V4:
+    case Hexagon::CMPhEQrr_shl_V4:
+    case Hexagon::CMPhEQrr_xor_V4:
+    case Hexagon::CMPhGTUrr_V4:
+    case Hexagon::CMPhGTrr_shl_V4:
+    case Hexagon::CMPLTUrr:
+    case Hexagon::CMPLTrr:
+      SrcReg2 = MI->getOperand(2).getReg();
+      return true;
+
+    case Hexagon::CMPEQri:
+    case Hexagon::CMPGTUri:
+    case Hexagon::CMPGTri:
+    case Hexagon::CMPbEQri_V4:
+    case Hexagon::CMPbGTUri_V4:
+    case Hexagon::CMPhEQri_V4:
+    case Hexagon::CMPhGTUri_V4:
+      SrcReg2 = 0;
+      Value = MI->getOperand(2).getImm();
+      return true;
+  }
+
+  return false;
+}
+
+
+void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I, DebugLoc DL,
+                                 unsigned DestReg, unsigned SrcReg,
+                                 bool KillSrc) const {
+  if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFR), DestReg).addReg(SrcReg);
+    return;
+  }
+  if (Hexagon::DoubleRegsRegClass.contains(SrcReg, DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFR64), DestReg).addReg(SrcReg);
+    return;
+  }
+  if (Hexagon::PredRegsRegClass.contains(SrcReg, DestReg)) {
+    // Map Pd = Ps to Pd = or(Ps, Ps).
+    BuildMI(MBB, I, DL, get(Hexagon::OR_pp),
+            DestReg).addReg(SrcReg).addReg(SrcReg);
+    return;
+  }
+  if (Hexagon::DoubleRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg)) {
+    // We can have an overlap between single and double reg: r1:0 = r0.
+    if(SrcReg == RI.getSubReg(DestReg, Hexagon::subreg_loreg)) {
+        // r1:0 = r0
+        BuildMI(MBB, I, DL, get(Hexagon::TFRI), (RI.getSubReg(DestReg,
+                Hexagon::subreg_hireg))).addImm(0);
+    } else {
+        // r1:0 = r1 or no overlap.
+        BuildMI(MBB, I, DL, get(Hexagon::TFR), (RI.getSubReg(DestReg,
+                Hexagon::subreg_loreg))).addReg(SrcReg);
+        BuildMI(MBB, I, DL, get(Hexagon::TFRI), (RI.getSubReg(DestReg,
+                Hexagon::subreg_hireg))).addImm(0);
+    }
+    return;
+  }
+  if (Hexagon::CRRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFCR), DestReg).addReg(SrcReg);
+    return;
+  }
+  if (Hexagon::PredRegsRegClass.contains(SrcReg) &&
+      Hexagon::IntRegsRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFR_RsPd), DestReg).
+      addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  if (Hexagon::IntRegsRegClass.contains(SrcReg) &&
+      Hexagon::PredRegsRegClass.contains(DestReg)) {
+    BuildMI(MBB, I, DL, get(Hexagon::TFR_PdRs), DestReg).
+      addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  llvm_unreachable("Unimplemented");
+}
+
+
+void HexagonInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+
+  DebugLoc DL = MBB.findDebugLoc(I);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO =
+      MF.getMachineMemOperand(
+                      MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
+                      MachineMemOperand::MOStore,
+                      MFI.getObjectSize(FI),
+                      Align);
+
+  if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::STriw))
+          .addFrameIndex(FI).addImm(0)
+          .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  } else if (Hexagon::DoubleRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::STrid))
+          .addFrameIndex(FI).addImm(0)
+          .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  } else if (Hexagon::PredRegsRegClass.hasSubClassEq(RC)) {
+    BuildMI(MBB, I, DL, get(Hexagon::STriw_pred))
+          .addFrameIndex(FI).addImm(0)
+          .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  } else {
+    llvm_unreachable("Unimplemented");
+  }
+}
+
+
+void HexagonInstrInfo::storeRegToAddr(
+                                 MachineFunction &MF, unsigned SrcReg,
+                                 bool isKill,
+                                 SmallVectorImpl<MachineOperand> &Addr,
+                                 const TargetRegisterClass *RC,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const
+{
+  llvm_unreachable("Unimplemented");
+}
+
+
+void HexagonInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  DebugLoc DL = MBB.findDebugLoc(I);
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO =
+      MF.getMachineMemOperand(
+                      MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)),
+                      MachineMemOperand::MOLoad,
+                      MFI.getObjectSize(FI),
+                      Align);
+  if (RC == &Hexagon::IntRegsRegClass) {
+    BuildMI(MBB, I, DL, get(Hexagon::LDriw), DestReg)
+          .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (RC == &Hexagon::DoubleRegsRegClass) {
+    BuildMI(MBB, I, DL, get(Hexagon::LDrid), DestReg)
+          .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else if (RC == &Hexagon::PredRegsRegClass) {
+    BuildMI(MBB, I, DL, get(Hexagon::LDriw_pred), DestReg)
+          .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
+  } else {
+    llvm_unreachable("Can't store this register to stack slot");
+  }
+}
+
+
+void HexagonInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                                        SmallVectorImpl<MachineOperand> &Addr,
+                                        const TargetRegisterClass *RC,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  llvm_unreachable("Unimplemented");
+}
+
+
+MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                    MachineInstr* MI,
+                                          const SmallVectorImpl<unsigned> &Ops,
+                                                    int FI) const {
+  // Hexagon_TODO: Implement.
+  return(0);
+}
+
+MachineInstr*
+HexagonInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+                                           int FrameIx, uint64_t Offset,
+                                           const MDNode *MDPtr,
+                                           DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Hexagon::DBG_VALUE))
+    .addImm(0).addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
+unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
+
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *TRC;
+  if (VT == MVT::i1) {
+    TRC = &Hexagon::PredRegsRegClass;
+  } else if (VT == MVT::i32 || VT == MVT::f32) {
+    TRC = &Hexagon::IntRegsRegClass;
+  } else if (VT == MVT::i64 || VT == MVT::f64) {
+    TRC = &Hexagon::DoubleRegsRegClass;
+  } else {
+    llvm_unreachable("Cannot handle this register class");
+  }
+
+  unsigned NewReg = RegInfo.createVirtualRegister(TRC);
+  return NewReg;
+}
+
+bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const {
+  // Constant extenders are allowed only for V4 and above.
+  if (!Subtarget.hasV4TOps())
+    return false;
+
+  const MCInstrDesc &MID = MI->getDesc();
+  const uint64_t F = MID.TSFlags;
+  if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask)
+    return true;
+
+  // TODO: This is largely obsolete now. Will need to be removed
+  // in consecutive patches.
+  switch(MI->getOpcode()) {
+    // TFR_FI Remains a special case.
+    case Hexagon::TFR_FI:
+      return true;
+    default:
+      return false;
+  }
+  return  false;
+}
+
+// This returns true in two cases:
+// - The OP code itself indicates that this is an extended instruction.
+// - One of MOs has been marked with HMOTF_ConstExtended flag.
+bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const {
+  // First check if this is permanently extended op code.
+  const uint64_t F = MI->getDesc().TSFlags;
+  if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask)
+    return true;
+  // Use MO operand flags to determine if one of MI's operands
+  // has HMOTF_ConstExtended flag set.
+  for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+       E = MI->operands_end(); I != E; ++I) {
+    if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended)
+      return true;
+  }
+  return  false;
+}
+
+bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+    default: return false;
+    // JMP_EQri
+    case Hexagon::JMP_EQriPt_nv_V4:
+    case Hexagon::JMP_EQriPnt_nv_V4:
+    case Hexagon::JMP_EQriNotPt_nv_V4:
+    case Hexagon::JMP_EQriNotPnt_nv_V4:
+    case Hexagon::JMP_EQriPt_ie_nv_V4:
+    case Hexagon::JMP_EQriPnt_ie_nv_V4:
+    case Hexagon::JMP_EQriNotPt_ie_nv_V4:
+    case Hexagon::JMP_EQriNotPnt_ie_nv_V4:
+
+    // JMP_EQri - with -1
+    case Hexagon::JMP_EQriPtneg_nv_V4:
+    case Hexagon::JMP_EQriPntneg_nv_V4:
+    case Hexagon::JMP_EQriNotPtneg_nv_V4:
+    case Hexagon::JMP_EQriNotPntneg_nv_V4:
+    case Hexagon::JMP_EQriPtneg_ie_nv_V4:
+    case Hexagon::JMP_EQriPntneg_ie_nv_V4:
+    case Hexagon::JMP_EQriNotPtneg_ie_nv_V4:
+    case Hexagon::JMP_EQriNotPntneg_ie_nv_V4:
+
+    // JMP_EQrr
+    case Hexagon::JMP_EQrrPt_nv_V4:
+    case Hexagon::JMP_EQrrPnt_nv_V4:
+    case Hexagon::JMP_EQrrNotPt_nv_V4:
+    case Hexagon::JMP_EQrrNotPnt_nv_V4:
+    case Hexagon::JMP_EQrrPt_ie_nv_V4:
+    case Hexagon::JMP_EQrrPnt_ie_nv_V4:
+    case Hexagon::JMP_EQrrNotPt_ie_nv_V4:
+    case Hexagon::JMP_EQrrNotPnt_ie_nv_V4:
+
+    // JMP_GTri
+    case Hexagon::JMP_GTriPt_nv_V4:
+    case Hexagon::JMP_GTriPnt_nv_V4:
+    case Hexagon::JMP_GTriNotPt_nv_V4:
+    case Hexagon::JMP_GTriNotPnt_nv_V4:
+    case Hexagon::JMP_GTriPt_ie_nv_V4:
+    case Hexagon::JMP_GTriPnt_ie_nv_V4:
+    case Hexagon::JMP_GTriNotPt_ie_nv_V4:
+    case Hexagon::JMP_GTriNotPnt_ie_nv_V4:
+
+    // JMP_GTri - with -1
+    case Hexagon::JMP_GTriPtneg_nv_V4:
+    case Hexagon::JMP_GTriPntneg_nv_V4:
+    case Hexagon::JMP_GTriNotPtneg_nv_V4:
+    case Hexagon::JMP_GTriNotPntneg_nv_V4:
+    case Hexagon::JMP_GTriPtneg_ie_nv_V4:
+    case Hexagon::JMP_GTriPntneg_ie_nv_V4:
+    case Hexagon::JMP_GTriNotPtneg_ie_nv_V4:
+    case Hexagon::JMP_GTriNotPntneg_ie_nv_V4:
+
+    // JMP_GTrr
+    case Hexagon::JMP_GTrrPt_nv_V4:
+    case Hexagon::JMP_GTrrPnt_nv_V4:
+    case Hexagon::JMP_GTrrNotPt_nv_V4:
+    case Hexagon::JMP_GTrrNotPnt_nv_V4:
+    case Hexagon::JMP_GTrrPt_ie_nv_V4:
+    case Hexagon::JMP_GTrrPnt_ie_nv_V4:
+    case Hexagon::JMP_GTrrNotPt_ie_nv_V4:
+    case Hexagon::JMP_GTrrNotPnt_ie_nv_V4:
+
+    // JMP_GTrrdn
+    case Hexagon::JMP_GTrrdnPt_nv_V4:
+    case Hexagon::JMP_GTrrdnPnt_nv_V4:
+    case Hexagon::JMP_GTrrdnNotPt_nv_V4:
+    case Hexagon::JMP_GTrrdnNotPnt_nv_V4:
+    case Hexagon::JMP_GTrrdnPt_ie_nv_V4:
+    case Hexagon::JMP_GTrrdnPnt_ie_nv_V4:
+    case Hexagon::JMP_GTrrdnNotPt_ie_nv_V4:
+    case Hexagon::JMP_GTrrdnNotPnt_ie_nv_V4:
+
+    // JMP_GTUri
+    case Hexagon::JMP_GTUriPt_nv_V4:
+    case Hexagon::JMP_GTUriPnt_nv_V4:
+    case Hexagon::JMP_GTUriNotPt_nv_V4:
+    case Hexagon::JMP_GTUriNotPnt_nv_V4:
+    case Hexagon::JMP_GTUriPt_ie_nv_V4:
+    case Hexagon::JMP_GTUriPnt_ie_nv_V4:
+    case Hexagon::JMP_GTUriNotPt_ie_nv_V4:
+    case Hexagon::JMP_GTUriNotPnt_ie_nv_V4:
+
+    // JMP_GTUrr
+    case Hexagon::JMP_GTUrrPt_nv_V4:
+    case Hexagon::JMP_GTUrrPnt_nv_V4:
+    case Hexagon::JMP_GTUrrNotPt_nv_V4:
+    case Hexagon::JMP_GTUrrNotPnt_nv_V4:
+    case Hexagon::JMP_GTUrrPt_ie_nv_V4:
+    case Hexagon::JMP_GTUrrPnt_ie_nv_V4:
+    case Hexagon::JMP_GTUrrNotPt_ie_nv_V4:
+    case Hexagon::JMP_GTUrrNotPnt_ie_nv_V4:
+
+    // JMP_GTUrrdn
+    case Hexagon::JMP_GTUrrdnPt_nv_V4:
+    case Hexagon::JMP_GTUrrdnPnt_nv_V4:
+    case Hexagon::JMP_GTUrrdnNotPt_nv_V4:
+    case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
+    case Hexagon::JMP_GTUrrdnPt_ie_nv_V4:
+    case Hexagon::JMP_GTUrrdnPnt_ie_nv_V4:
+    case Hexagon::JMP_GTUrrdnNotPt_ie_nv_V4:
+    case Hexagon::JMP_GTUrrdnNotPnt_ie_nv_V4:
+      return true;
+  }
+}
+
+bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+    default: return false;
+    // Store Byte
+    case Hexagon::STrib_nv_V4:
+    case Hexagon::STrib_indexed_nv_V4:
+    case Hexagon::STrib_indexed_shl_nv_V4:
+    case Hexagon::STrib_shl_nv_V4:
+    case Hexagon::STb_GP_nv_V4:
+    case Hexagon::POST_STbri_nv_V4:
+    case Hexagon::STrib_cPt_nv_V4:
+    case Hexagon::STrib_cdnPt_nv_V4:
+    case Hexagon::STrib_cNotPt_nv_V4:
+    case Hexagon::STrib_cdnNotPt_nv_V4:
+    case Hexagon::STrib_indexed_cPt_nv_V4:
+    case Hexagon::STrib_indexed_cdnPt_nv_V4:
+    case Hexagon::STrib_indexed_cNotPt_nv_V4:
+    case Hexagon::STrib_indexed_cdnNotPt_nv_V4:
+    case Hexagon::STrib_indexed_shl_cPt_nv_V4:
+    case Hexagon::STrib_indexed_shl_cdnPt_nv_V4:
+    case Hexagon::STrib_indexed_shl_cNotPt_nv_V4:
+    case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4:
+    case Hexagon::POST_STbri_cPt_nv_V4:
+    case Hexagon::POST_STbri_cdnPt_nv_V4:
+    case Hexagon::POST_STbri_cNotPt_nv_V4:
+    case Hexagon::POST_STbri_cdnNotPt_nv_V4:
+    case Hexagon::STb_GP_cPt_nv_V4:
+    case Hexagon::STb_GP_cNotPt_nv_V4:
+    case Hexagon::STb_GP_cdnPt_nv_V4:
+    case Hexagon::STb_GP_cdnNotPt_nv_V4:
+    case Hexagon::STrib_abs_nv_V4:
+    case Hexagon::STrib_abs_cPt_nv_V4:
+    case Hexagon::STrib_abs_cdnPt_nv_V4:
+    case Hexagon::STrib_abs_cNotPt_nv_V4:
+    case Hexagon::STrib_abs_cdnNotPt_nv_V4:
+    case Hexagon::STrib_imm_abs_nv_V4:
+    case Hexagon::STrib_imm_abs_cPt_nv_V4:
+    case Hexagon::STrib_imm_abs_cdnPt_nv_V4:
+    case Hexagon::STrib_imm_abs_cNotPt_nv_V4:
+    case Hexagon::STrib_imm_abs_cdnNotPt_nv_V4:
+
+    // Store Halfword
+    case Hexagon::STrih_nv_V4:
+    case Hexagon::STrih_indexed_nv_V4:
+    case Hexagon::STrih_indexed_shl_nv_V4:
+    case Hexagon::STrih_shl_nv_V4:
+    case Hexagon::STh_GP_nv_V4:
+    case Hexagon::POST_SThri_nv_V4:
+    case Hexagon::STrih_cPt_nv_V4:
+    case Hexagon::STrih_cdnPt_nv_V4:
+    case Hexagon::STrih_cNotPt_nv_V4:
+    case Hexagon::STrih_cdnNotPt_nv_V4:
+    case Hexagon::STrih_indexed_cPt_nv_V4:
+    case Hexagon::STrih_indexed_cdnPt_nv_V4:
+    case Hexagon::STrih_indexed_cNotPt_nv_V4:
+    case Hexagon::STrih_indexed_cdnNotPt_nv_V4:
+    case Hexagon::STrih_indexed_shl_cPt_nv_V4:
+    case Hexagon::STrih_indexed_shl_cdnPt_nv_V4:
+    case Hexagon::STrih_indexed_shl_cNotPt_nv_V4:
+    case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4:
+    case Hexagon::POST_SThri_cPt_nv_V4:
+    case Hexagon::POST_SThri_cdnPt_nv_V4:
+    case Hexagon::POST_SThri_cNotPt_nv_V4:
+    case Hexagon::POST_SThri_cdnNotPt_nv_V4:
+    case Hexagon::STh_GP_cPt_nv_V4:
+    case Hexagon::STh_GP_cNotPt_nv_V4:
+    case Hexagon::STh_GP_cdnPt_nv_V4:
+    case Hexagon::STh_GP_cdnNotPt_nv_V4:
+    case Hexagon::STrih_abs_nv_V4:
+    case Hexagon::STrih_abs_cPt_nv_V4:
+    case Hexagon::STrih_abs_cdnPt_nv_V4:
+    case Hexagon::STrih_abs_cNotPt_nv_V4:
+    case Hexagon::STrih_abs_cdnNotPt_nv_V4:
+    case Hexagon::STrih_imm_abs_nv_V4:
+    case Hexagon::STrih_imm_abs_cPt_nv_V4:
+    case Hexagon::STrih_imm_abs_cdnPt_nv_V4:
+    case Hexagon::STrih_imm_abs_cNotPt_nv_V4:
+    case Hexagon::STrih_imm_abs_cdnNotPt_nv_V4:
+
+    // Store Word
+    case Hexagon::STriw_nv_V4:
+    case Hexagon::STriw_indexed_nv_V4:
+    case Hexagon::STriw_indexed_shl_nv_V4:
+    case Hexagon::STriw_shl_nv_V4:
+    case Hexagon::STw_GP_nv_V4:
+    case Hexagon::POST_STwri_nv_V4:
+    case Hexagon::STriw_cPt_nv_V4:
+    case Hexagon::STriw_cdnPt_nv_V4:
+    case Hexagon::STriw_cNotPt_nv_V4:
+    case Hexagon::STriw_cdnNotPt_nv_V4:
+    case Hexagon::STriw_indexed_cPt_nv_V4:
+    case Hexagon::STriw_indexed_cdnPt_nv_V4:
+    case Hexagon::STriw_indexed_cNotPt_nv_V4:
+    case Hexagon::STriw_indexed_cdnNotPt_nv_V4:
+    case Hexagon::STriw_indexed_shl_cPt_nv_V4:
+    case Hexagon::STriw_indexed_shl_cdnPt_nv_V4:
+    case Hexagon::STriw_indexed_shl_cNotPt_nv_V4:
+    case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4:
+    case Hexagon::POST_STwri_cPt_nv_V4:
+    case Hexagon::POST_STwri_cdnPt_nv_V4:
+    case Hexagon::POST_STwri_cNotPt_nv_V4:
+    case Hexagon::POST_STwri_cdnNotPt_nv_V4:
+    case Hexagon::STw_GP_cPt_nv_V4:
+    case Hexagon::STw_GP_cNotPt_nv_V4:
+    case Hexagon::STw_GP_cdnPt_nv_V4:
+    case Hexagon::STw_GP_cdnNotPt_nv_V4:
+    case Hexagon::STriw_abs_nv_V4:
+    case Hexagon::STriw_abs_cPt_nv_V4:
+    case Hexagon::STriw_abs_cdnPt_nv_V4:
+    case Hexagon::STriw_abs_cNotPt_nv_V4:
+    case Hexagon::STriw_abs_cdnNotPt_nv_V4:
+    case Hexagon::STriw_imm_abs_nv_V4:
+    case Hexagon::STriw_imm_abs_cPt_nv_V4:
+    case Hexagon::STriw_imm_abs_cdnPt_nv_V4:
+    case Hexagon::STriw_imm_abs_cNotPt_nv_V4:
+    case Hexagon::STriw_imm_abs_cdnNotPt_nv_V4:
+      return true;
+  }
+}
+
+bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const {
+  switch (MI->getOpcode())
+  {
+    default: return false;
+    // Load Byte
+    case Hexagon::POST_LDrib:
+    case Hexagon::POST_LDrib_cPt:
+    case Hexagon::POST_LDrib_cNotPt:
+    case Hexagon::POST_LDrib_cdnPt_V4:
+    case Hexagon::POST_LDrib_cdnNotPt_V4:
+
+    // Load unsigned byte
+    case Hexagon::POST_LDriub:
+    case Hexagon::POST_LDriub_cPt:
+    case Hexagon::POST_LDriub_cNotPt:
+    case Hexagon::POST_LDriub_cdnPt_V4:
+    case Hexagon::POST_LDriub_cdnNotPt_V4:
+
+    // Load halfword
+    case Hexagon::POST_LDrih:
+    case Hexagon::POST_LDrih_cPt:
+    case Hexagon::POST_LDrih_cNotPt:
+    case Hexagon::POST_LDrih_cdnPt_V4:
+    case Hexagon::POST_LDrih_cdnNotPt_V4:
+
+    // Load unsigned halfword
+    case Hexagon::POST_LDriuh:
+    case Hexagon::POST_LDriuh_cPt:
+    case Hexagon::POST_LDriuh_cNotPt:
+    case Hexagon::POST_LDriuh_cdnPt_V4:
+    case Hexagon::POST_LDriuh_cdnNotPt_V4:
+
+    // Load word
+    case Hexagon::POST_LDriw:
+    case Hexagon::POST_LDriw_cPt:
+    case Hexagon::POST_LDriw_cNotPt:
+    case Hexagon::POST_LDriw_cdnPt_V4:
+    case Hexagon::POST_LDriw_cdnNotPt_V4:
+
+    // Load double word
+    case Hexagon::POST_LDrid:
+    case Hexagon::POST_LDrid_cPt:
+    case Hexagon::POST_LDrid_cNotPt:
+    case Hexagon::POST_LDrid_cdnPt_V4:
+    case Hexagon::POST_LDrid_cdnNotPt_V4:
+
+    // Store byte
+    case Hexagon::POST_STbri:
+    case Hexagon::POST_STbri_cPt:
+    case Hexagon::POST_STbri_cNotPt:
+    case Hexagon::POST_STbri_cdnPt_V4:
+    case Hexagon::POST_STbri_cdnNotPt_V4:
+
+    // Store halfword
+    case Hexagon::POST_SThri:
+    case Hexagon::POST_SThri_cPt:
+    case Hexagon::POST_SThri_cNotPt:
+    case Hexagon::POST_SThri_cdnPt_V4:
+    case Hexagon::POST_SThri_cdnNotPt_V4:
+
+    // Store word
+    case Hexagon::POST_STwri:
+    case Hexagon::POST_STwri_cPt:
+    case Hexagon::POST_STwri_cNotPt:
+    case Hexagon::POST_STwri_cdnPt_V4:
+    case Hexagon::POST_STwri_cdnNotPt_V4:
+
+    // Store double word
+    case Hexagon::POST_STdri:
+    case Hexagon::POST_STdri_cPt:
+    case Hexagon::POST_STdri_cNotPt:
+    case Hexagon::POST_STdri_cdnPt_V4:
+    case Hexagon::POST_STdri_cdnNotPt_V4:
+      return true;
+  }
+}
+
+bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const {
+  if (isNewValueJump(MI))
+    return true;
+
+  if (isNewValueStore(MI))
+    return true;
+
+  return false;
+}
+
+bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const {
+  return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4;
+}
+
+bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
+  bool isPred = MI->getDesc().isPredicable();
+
+  if (!isPred)
+    return false;
+
+  const int Opc = MI->getOpcode();
+
+  switch(Opc) {
+  case Hexagon::TFRI:
+    return isInt<12>(MI->getOperand(1).getImm());
+
+  case Hexagon::STrid:
+  case Hexagon::STrid_indexed:
+    return isShiftedUInt<6,3>(MI->getOperand(1).getImm());
+
+  case Hexagon::STriw:
+  case Hexagon::STriw_indexed:
+  case Hexagon::STriw_nv_V4:
+    return isShiftedUInt<6,2>(MI->getOperand(1).getImm());
+
+  case Hexagon::STrih:
+  case Hexagon::STrih_indexed:
+  case Hexagon::STrih_nv_V4:
+    return isShiftedUInt<6,1>(MI->getOperand(1).getImm());
+
+  case Hexagon::STrib:
+  case Hexagon::STrib_indexed:
+  case Hexagon::STrib_nv_V4:
+    return isUInt<6>(MI->getOperand(1).getImm());
+
+  case Hexagon::LDrid:
+  case Hexagon::LDrid_indexed:
+    return isShiftedUInt<6,3>(MI->getOperand(2).getImm());
+
+  case Hexagon::LDriw:
+  case Hexagon::LDriw_indexed:
+    return isShiftedUInt<6,2>(MI->getOperand(2).getImm());
+
+  case Hexagon::LDrih:
+  case Hexagon::LDriuh:
+  case Hexagon::LDrih_indexed:
+  case Hexagon::LDriuh_indexed:
+    return isShiftedUInt<6,1>(MI->getOperand(2).getImm());
+
+  case Hexagon::LDrib:
+  case Hexagon::LDriub:
+  case Hexagon::LDrib_indexed:
+  case Hexagon::LDriub_indexed:
+    return isUInt<6>(MI->getOperand(2).getImm());
+
+  case Hexagon::POST_LDrid:
+    return isShiftedInt<4,3>(MI->getOperand(3).getImm());
+
+  case Hexagon::POST_LDriw:
+    return isShiftedInt<4,2>(MI->getOperand(3).getImm());
+
+  case Hexagon::POST_LDrih:
+  case Hexagon::POST_LDriuh:
+    return isShiftedInt<4,1>(MI->getOperand(3).getImm());
+
+  case Hexagon::POST_LDrib:
+  case Hexagon::POST_LDriub:
+    return isInt<4>(MI->getOperand(3).getImm());
+
+  case Hexagon::STrib_imm_V4:
+  case Hexagon::STrih_imm_V4:
+  case Hexagon::STriw_imm_V4:
+    return (isUInt<6>(MI->getOperand(1).getImm()) &&
+            isInt<6>(MI->getOperand(2).getImm()));
+
+  case Hexagon::ADD_ri:
+    return isInt<8>(MI->getOperand(2).getImm());
+
+  case Hexagon::ASLH:
+  case Hexagon::ASRH:
+  case Hexagon::SXTB:
+  case Hexagon::SXTH:
+  case Hexagon::ZXTB:
+  case Hexagon::ZXTH:
+    return Subtarget.hasV4TOps();
+
+  case Hexagon::JMPR:
+    return false;
+  }
+
+  return true;
+}
+
+// This function performs the following inversiones:
+//
+//  cPt    ---> cNotPt
+//  cNotPt ---> cPt
+//
+// however, these inversiones are NOT included:
+//
+//  cdnPt      -X-> cdnNotPt
+//  cdnNotPt   -X-> cdnPt
+//  cPt_nv     -X-> cNotPt_nv (new value stores)
+//  cNotPt_nv  -X-> cPt_nv    (new value stores)
+//
+// because only the following transformations are allowed:
+//
+//  cNotPt  ---> cdnNotPt
+//  cPt     ---> cdnPt
+//  cNotPt  ---> cNotPt_nv
+//  cPt     ---> cPt_nv
+unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
+  switch(Opc) {
+    default: llvm_unreachable("Unexpected predicated instruction");
+    case Hexagon::TFR_cPt:
+      return Hexagon::TFR_cNotPt;
+    case Hexagon::TFR_cNotPt:
+      return Hexagon::TFR_cPt;
+
+    case Hexagon::TFRI_cPt:
+      return Hexagon::TFRI_cNotPt;
+    case Hexagon::TFRI_cNotPt:
+      return Hexagon::TFRI_cPt;
+
+    case Hexagon::JMP_c:
+      return Hexagon::JMP_cNot;
+    case Hexagon::JMP_cNot:
+      return Hexagon::JMP_c;
+
+    case Hexagon::ADD_ri_cPt:
+      return Hexagon::ADD_ri_cNotPt;
+    case Hexagon::ADD_ri_cNotPt:
+      return Hexagon::ADD_ri_cPt;
+
+    case Hexagon::ADD_rr_cPt:
+      return Hexagon::ADD_rr_cNotPt;
+    case Hexagon::ADD_rr_cNotPt:
+      return Hexagon::ADD_rr_cPt;
+
+    case Hexagon::XOR_rr_cPt:
+      return Hexagon::XOR_rr_cNotPt;
+    case Hexagon::XOR_rr_cNotPt:
+      return Hexagon::XOR_rr_cPt;
+
+    case Hexagon::AND_rr_cPt:
+      return Hexagon::AND_rr_cNotPt;
+    case Hexagon::AND_rr_cNotPt:
+      return Hexagon::AND_rr_cPt;
+
+    case Hexagon::OR_rr_cPt:
+      return Hexagon::OR_rr_cNotPt;
+    case Hexagon::OR_rr_cNotPt:
+      return Hexagon::OR_rr_cPt;
+
+    case Hexagon::SUB_rr_cPt:
+      return Hexagon::SUB_rr_cNotPt;
+    case Hexagon::SUB_rr_cNotPt:
+      return Hexagon::SUB_rr_cPt;
+
+    case Hexagon::COMBINE_rr_cPt:
+      return Hexagon::COMBINE_rr_cNotPt;
+    case Hexagon::COMBINE_rr_cNotPt:
+      return Hexagon::COMBINE_rr_cPt;
+
+    case Hexagon::ASLH_cPt_V4:
+      return Hexagon::ASLH_cNotPt_V4;
+    case Hexagon::ASLH_cNotPt_V4:
+      return Hexagon::ASLH_cPt_V4;
+
+    case Hexagon::ASRH_cPt_V4:
+      return Hexagon::ASRH_cNotPt_V4;
+    case Hexagon::ASRH_cNotPt_V4:
+      return Hexagon::ASRH_cPt_V4;
+
+    case Hexagon::SXTB_cPt_V4:
+      return Hexagon::SXTB_cNotPt_V4;
+    case Hexagon::SXTB_cNotPt_V4:
+      return Hexagon::SXTB_cPt_V4;
+
+    case Hexagon::SXTH_cPt_V4:
+      return Hexagon::SXTH_cNotPt_V4;
+    case Hexagon::SXTH_cNotPt_V4:
+      return Hexagon::SXTH_cPt_V4;
+
+    case Hexagon::ZXTB_cPt_V4:
+      return Hexagon::ZXTB_cNotPt_V4;
+    case Hexagon::ZXTB_cNotPt_V4:
+      return Hexagon::ZXTB_cPt_V4;
+
+    case Hexagon::ZXTH_cPt_V4:
+      return Hexagon::ZXTH_cNotPt_V4;
+    case Hexagon::ZXTH_cNotPt_V4:
+      return Hexagon::ZXTH_cPt_V4;
+
+
+    case Hexagon::JMPR_cPt:
+      return Hexagon::JMPR_cNotPt;
+    case Hexagon::JMPR_cNotPt:
+      return Hexagon::JMPR_cPt;
+
+  // V4 indexed+scaled load.
+    case Hexagon::LDrid_indexed_shl_cPt_V4:
+      return Hexagon::LDrid_indexed_shl_cNotPt_V4;
+    case Hexagon::LDrid_indexed_shl_cNotPt_V4:
+      return Hexagon::LDrid_indexed_shl_cPt_V4;
+
+    case Hexagon::LDrib_indexed_shl_cPt_V4:
+      return Hexagon::LDrib_indexed_shl_cNotPt_V4;
+    case Hexagon::LDrib_indexed_shl_cNotPt_V4:
+      return Hexagon::LDrib_indexed_shl_cPt_V4;
+
+    case Hexagon::LDriub_indexed_shl_cPt_V4:
+      return Hexagon::LDriub_indexed_shl_cNotPt_V4;
+    case Hexagon::LDriub_indexed_shl_cNotPt_V4:
+      return Hexagon::LDriub_indexed_shl_cPt_V4;
+
+    case Hexagon::LDrih_indexed_shl_cPt_V4:
+      return Hexagon::LDrih_indexed_shl_cNotPt_V4;
+    case Hexagon::LDrih_indexed_shl_cNotPt_V4:
+      return Hexagon::LDrih_indexed_shl_cPt_V4;
+
+    case Hexagon::LDriuh_indexed_shl_cPt_V4:
+      return Hexagon::LDriuh_indexed_shl_cNotPt_V4;
+    case Hexagon::LDriuh_indexed_shl_cNotPt_V4:
+      return Hexagon::LDriuh_indexed_shl_cPt_V4;
+
+    case Hexagon::LDriw_indexed_shl_cPt_V4:
+      return Hexagon::LDriw_indexed_shl_cNotPt_V4;
+    case Hexagon::LDriw_indexed_shl_cNotPt_V4:
+      return Hexagon::LDriw_indexed_shl_cPt_V4;
+
+    // Byte.
+    case Hexagon::POST_STbri_cPt:
+      return Hexagon::POST_STbri_cNotPt;
+    case Hexagon::POST_STbri_cNotPt:
+      return Hexagon::POST_STbri_cPt;
+
+    case Hexagon::STrib_cPt:
+      return Hexagon::STrib_cNotPt;
+    case Hexagon::STrib_cNotPt:
+      return Hexagon::STrib_cPt;
+
+    case Hexagon::STrib_indexed_cPt:
+      return Hexagon::STrib_indexed_cNotPt;
+    case Hexagon::STrib_indexed_cNotPt:
+      return Hexagon::STrib_indexed_cPt;
+
+    case Hexagon::STrib_imm_cPt_V4:
+      return Hexagon::STrib_imm_cNotPt_V4;
+    case Hexagon::STrib_imm_cNotPt_V4:
+      return Hexagon::STrib_imm_cPt_V4;
+
+    case Hexagon::STrib_indexed_shl_cPt_V4:
+      return Hexagon::STrib_indexed_shl_cNotPt_V4;
+    case Hexagon::STrib_indexed_shl_cNotPt_V4:
+      return Hexagon::STrib_indexed_shl_cPt_V4;
+
+  // Halfword.
+    case Hexagon::POST_SThri_cPt:
+      return Hexagon::POST_SThri_cNotPt;
+    case Hexagon::POST_SThri_cNotPt:
+      return Hexagon::POST_SThri_cPt;
+
+    case Hexagon::STrih_cPt:
+      return Hexagon::STrih_cNotPt;
+    case Hexagon::STrih_cNotPt:
+      return Hexagon::STrih_cPt;
+
+    case Hexagon::STrih_indexed_cPt:
+      return Hexagon::STrih_indexed_cNotPt;
+    case Hexagon::STrih_indexed_cNotPt:
+      return Hexagon::STrih_indexed_cPt;
+
+    case Hexagon::STrih_imm_cPt_V4:
+      return Hexagon::STrih_imm_cNotPt_V4;
+    case Hexagon::STrih_imm_cNotPt_V4:
+      return Hexagon::STrih_imm_cPt_V4;
+
+    case Hexagon::STrih_indexed_shl_cPt_V4:
+      return Hexagon::STrih_indexed_shl_cNotPt_V4;
+    case Hexagon::STrih_indexed_shl_cNotPt_V4:
+      return Hexagon::STrih_indexed_shl_cPt_V4;
+
+  // Word.
+    case Hexagon::POST_STwri_cPt:
+      return Hexagon::POST_STwri_cNotPt;
+    case Hexagon::POST_STwri_cNotPt:
+      return Hexagon::POST_STwri_cPt;
+
+    case Hexagon::STriw_cPt:
+      return Hexagon::STriw_cNotPt;
+    case Hexagon::STriw_cNotPt:
+      return Hexagon::STriw_cPt;
+
+    case Hexagon::STriw_indexed_cPt:
+      return Hexagon::STriw_indexed_cNotPt;
+    case Hexagon::STriw_indexed_cNotPt:
+      return Hexagon::STriw_indexed_cPt;
+
+    case Hexagon::STriw_indexed_shl_cPt_V4:
+      return Hexagon::STriw_indexed_shl_cNotPt_V4;
+    case Hexagon::STriw_indexed_shl_cNotPt_V4:
+      return Hexagon::STriw_indexed_shl_cPt_V4;
+
+    case Hexagon::STriw_imm_cPt_V4:
+      return Hexagon::STriw_imm_cNotPt_V4;
+    case Hexagon::STriw_imm_cNotPt_V4:
+      return Hexagon::STriw_imm_cPt_V4;
+
+  // Double word.
+    case Hexagon::POST_STdri_cPt:
+      return Hexagon::POST_STdri_cNotPt;
+    case Hexagon::POST_STdri_cNotPt:
+      return Hexagon::POST_STdri_cPt;
+
+    case Hexagon::STrid_cPt:
+      return Hexagon::STrid_cNotPt;
+    case Hexagon::STrid_cNotPt:
+      return Hexagon::STrid_cPt;
+
+    case Hexagon::STrid_indexed_cPt:
+      return Hexagon::STrid_indexed_cNotPt;
+    case Hexagon::STrid_indexed_cNotPt:
+      return Hexagon::STrid_indexed_cPt;
+
+    case Hexagon::STrid_indexed_shl_cPt_V4:
+      return Hexagon::STrid_indexed_shl_cNotPt_V4;
+    case Hexagon::STrid_indexed_shl_cNotPt_V4:
+      return Hexagon::STrid_indexed_shl_cPt_V4;
+
+    // V4 Store to global address.
+    case Hexagon::STd_GP_cPt_V4:
+      return Hexagon::STd_GP_cNotPt_V4;
+    case Hexagon::STd_GP_cNotPt_V4:
+      return Hexagon::STd_GP_cPt_V4;
+
+    case Hexagon::STb_GP_cPt_V4:
+      return Hexagon::STb_GP_cNotPt_V4;
+    case Hexagon::STb_GP_cNotPt_V4:
+      return Hexagon::STb_GP_cPt_V4;
+
+    case Hexagon::STh_GP_cPt_V4:
+      return Hexagon::STh_GP_cNotPt_V4;
+    case Hexagon::STh_GP_cNotPt_V4:
+      return Hexagon::STh_GP_cPt_V4;
+
+    case Hexagon::STw_GP_cPt_V4:
+      return Hexagon::STw_GP_cNotPt_V4;
+    case Hexagon::STw_GP_cNotPt_V4:
+      return Hexagon::STw_GP_cPt_V4;
+
+  // Load.
+    case Hexagon::LDrid_cPt:
+      return Hexagon::LDrid_cNotPt;
+    case Hexagon::LDrid_cNotPt:
+      return Hexagon::LDrid_cPt;
+
+    case Hexagon::LDriw_cPt:
+      return Hexagon::LDriw_cNotPt;
+    case Hexagon::LDriw_cNotPt:
+      return Hexagon::LDriw_cPt;
+
+    case Hexagon::LDrih_cPt:
+      return Hexagon::LDrih_cNotPt;
+    case Hexagon::LDrih_cNotPt:
+      return Hexagon::LDrih_cPt;
+
+    case Hexagon::LDriuh_cPt:
+      return Hexagon::LDriuh_cNotPt;
+    case Hexagon::LDriuh_cNotPt:
+      return Hexagon::LDriuh_cPt;
+
+    case Hexagon::LDrib_cPt:
+      return Hexagon::LDrib_cNotPt;
+    case Hexagon::LDrib_cNotPt:
+      return Hexagon::LDrib_cPt;
+
+    case Hexagon::LDriub_cPt:
+      return Hexagon::LDriub_cNotPt;
+    case Hexagon::LDriub_cNotPt:
+      return Hexagon::LDriub_cPt;
+
+ // Load Indexed.
+    case Hexagon::LDrid_indexed_cPt:
+      return Hexagon::LDrid_indexed_cNotPt;
+    case Hexagon::LDrid_indexed_cNotPt:
+      return Hexagon::LDrid_indexed_cPt;
+
+    case Hexagon::LDriw_indexed_cPt:
+      return Hexagon::LDriw_indexed_cNotPt;
+    case Hexagon::LDriw_indexed_cNotPt:
+      return Hexagon::LDriw_indexed_cPt;
+
+    case Hexagon::LDrih_indexed_cPt:
+      return Hexagon::LDrih_indexed_cNotPt;
+    case Hexagon::LDrih_indexed_cNotPt:
+      return Hexagon::LDrih_indexed_cPt;
+
+    case Hexagon::LDriuh_indexed_cPt:
+      return Hexagon::LDriuh_indexed_cNotPt;
+    case Hexagon::LDriuh_indexed_cNotPt:
+      return Hexagon::LDriuh_indexed_cPt;
+
+    case Hexagon::LDrib_indexed_cPt:
+      return Hexagon::LDrib_indexed_cNotPt;
+    case Hexagon::LDrib_indexed_cNotPt:
+      return Hexagon::LDrib_indexed_cPt;
+
+    case Hexagon::LDriub_indexed_cPt:
+      return Hexagon::LDriub_indexed_cNotPt;
+    case Hexagon::LDriub_indexed_cNotPt:
+      return Hexagon::LDriub_indexed_cPt;
+
+  // Post Inc Load.
+    case Hexagon::POST_LDrid_cPt:
+      return Hexagon::POST_LDrid_cNotPt;
+    case Hexagon::POST_LDriw_cNotPt:
+      return Hexagon::POST_LDriw_cPt;
+
+    case Hexagon::POST_LDrih_cPt:
+      return Hexagon::POST_LDrih_cNotPt;
+    case Hexagon::POST_LDrih_cNotPt:
+      return Hexagon::POST_LDrih_cPt;
+
+    case Hexagon::POST_LDriuh_cPt:
+      return Hexagon::POST_LDriuh_cNotPt;
+    case Hexagon::POST_LDriuh_cNotPt:
+      return Hexagon::POST_LDriuh_cPt;
+
+    case Hexagon::POST_LDrib_cPt:
+      return Hexagon::POST_LDrib_cNotPt;
+    case Hexagon::POST_LDrib_cNotPt:
+      return Hexagon::POST_LDrib_cPt;
+
+    case Hexagon::POST_LDriub_cPt:
+      return Hexagon::POST_LDriub_cNotPt;
+    case Hexagon::POST_LDriub_cNotPt:
+      return Hexagon::POST_LDriub_cPt;
+
+  // Dealloc_return.
+    case Hexagon::DEALLOC_RET_cPt_V4:
+      return Hexagon::DEALLOC_RET_cNotPt_V4;
+    case Hexagon::DEALLOC_RET_cNotPt_V4:
+      return Hexagon::DEALLOC_RET_cPt_V4;
+
+   // New Value Jump.
+   // JMPEQ_ri - with -1.
+    case Hexagon::JMP_EQriPtneg_nv_V4:
+      return Hexagon::JMP_EQriNotPtneg_nv_V4;
+    case Hexagon::JMP_EQriNotPtneg_nv_V4:
+      return Hexagon::JMP_EQriPtneg_nv_V4;
+
+    case Hexagon::JMP_EQriPntneg_nv_V4:
+      return Hexagon::JMP_EQriNotPntneg_nv_V4;
+    case Hexagon::JMP_EQriNotPntneg_nv_V4:
+      return Hexagon::JMP_EQriPntneg_nv_V4;
+
+   // JMPEQ_ri.
+     case Hexagon::JMP_EQriPt_nv_V4:
+      return Hexagon::JMP_EQriNotPt_nv_V4;
+    case Hexagon::JMP_EQriNotPt_nv_V4:
+      return Hexagon::JMP_EQriPt_nv_V4;
+
+     case Hexagon::JMP_EQriPnt_nv_V4:
+      return Hexagon::JMP_EQriNotPnt_nv_V4;
+    case Hexagon::JMP_EQriNotPnt_nv_V4:
+      return Hexagon::JMP_EQriPnt_nv_V4;
+
+   // JMPEQ_rr.
+     case Hexagon::JMP_EQrrPt_nv_V4:
+      return Hexagon::JMP_EQrrNotPt_nv_V4;
+    case Hexagon::JMP_EQrrNotPt_nv_V4:
+      return Hexagon::JMP_EQrrPt_nv_V4;
+
+     case Hexagon::JMP_EQrrPnt_nv_V4:
+      return Hexagon::JMP_EQrrNotPnt_nv_V4;
+    case Hexagon::JMP_EQrrNotPnt_nv_V4:
+      return Hexagon::JMP_EQrrPnt_nv_V4;
+
+   // JMPGT_ri - with -1.
+    case Hexagon::JMP_GTriPtneg_nv_V4:
+      return Hexagon::JMP_GTriNotPtneg_nv_V4;
+    case Hexagon::JMP_GTriNotPtneg_nv_V4:
+      return Hexagon::JMP_GTriPtneg_nv_V4;
+
+    case Hexagon::JMP_GTriPntneg_nv_V4:
+      return Hexagon::JMP_GTriNotPntneg_nv_V4;
+    case Hexagon::JMP_GTriNotPntneg_nv_V4:
+      return Hexagon::JMP_GTriPntneg_nv_V4;
+
+   // JMPGT_ri.
+     case Hexagon::JMP_GTriPt_nv_V4:
+      return Hexagon::JMP_GTriNotPt_nv_V4;
+    case Hexagon::JMP_GTriNotPt_nv_V4:
+      return Hexagon::JMP_GTriPt_nv_V4;
+
+     case Hexagon::JMP_GTriPnt_nv_V4:
+      return Hexagon::JMP_GTriNotPnt_nv_V4;
+    case Hexagon::JMP_GTriNotPnt_nv_V4:
+      return Hexagon::JMP_GTriPnt_nv_V4;
+
+   // JMPGT_rr.
+     case Hexagon::JMP_GTrrPt_nv_V4:
+      return Hexagon::JMP_GTrrNotPt_nv_V4;
+    case Hexagon::JMP_GTrrNotPt_nv_V4:
+      return Hexagon::JMP_GTrrPt_nv_V4;
+
+     case Hexagon::JMP_GTrrPnt_nv_V4:
+      return Hexagon::JMP_GTrrNotPnt_nv_V4;
+    case Hexagon::JMP_GTrrNotPnt_nv_V4:
+      return Hexagon::JMP_GTrrPnt_nv_V4;
+
+   // JMPGT_rrdn.
+     case Hexagon::JMP_GTrrdnPt_nv_V4:
+      return Hexagon::JMP_GTrrdnNotPt_nv_V4;
+    case Hexagon::JMP_GTrrdnNotPt_nv_V4:
+      return Hexagon::JMP_GTrrdnPt_nv_V4;
+
+     case Hexagon::JMP_GTrrdnPnt_nv_V4:
+      return Hexagon::JMP_GTrrdnNotPnt_nv_V4;
+    case Hexagon::JMP_GTrrdnNotPnt_nv_V4:
+      return Hexagon::JMP_GTrrdnPnt_nv_V4;
+
+   // JMPGTU_ri.
+     case Hexagon::JMP_GTUriPt_nv_V4:
+      return Hexagon::JMP_GTUriNotPt_nv_V4;
+    case Hexagon::JMP_GTUriNotPt_nv_V4:
+      return Hexagon::JMP_GTUriPt_nv_V4;
+
+     case Hexagon::JMP_GTUriPnt_nv_V4:
+      return Hexagon::JMP_GTUriNotPnt_nv_V4;
+    case Hexagon::JMP_GTUriNotPnt_nv_V4:
+      return Hexagon::JMP_GTUriPnt_nv_V4;
+
+   // JMPGTU_rr.
+     case Hexagon::JMP_GTUrrPt_nv_V4:
+      return Hexagon::JMP_GTUrrNotPt_nv_V4;
+    case Hexagon::JMP_GTUrrNotPt_nv_V4:
+      return Hexagon::JMP_GTUrrPt_nv_V4;
+
+     case Hexagon::JMP_GTUrrPnt_nv_V4:
+      return Hexagon::JMP_GTUrrNotPnt_nv_V4;
+    case Hexagon::JMP_GTUrrNotPnt_nv_V4:
+      return Hexagon::JMP_GTUrrPnt_nv_V4;
+
+   // JMPGTU_rrdn.
+     case Hexagon::JMP_GTUrrdnPt_nv_V4:
+      return Hexagon::JMP_GTUrrdnNotPt_nv_V4;
+    case Hexagon::JMP_GTUrrdnNotPt_nv_V4:
+      return Hexagon::JMP_GTUrrdnPt_nv_V4;
+
+     case Hexagon::JMP_GTUrrdnPnt_nv_V4:
+      return Hexagon::JMP_GTUrrdnNotPnt_nv_V4;
+    case Hexagon::JMP_GTUrrdnNotPnt_nv_V4:
+      return Hexagon::JMP_GTUrrdnPnt_nv_V4;
+  }
+}
+
+
+int HexagonInstrInfo::
+getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
+  enum Hexagon::PredSense inPredSense;
+  inPredSense = invertPredicate ? Hexagon::PredSense_false :
+                                  Hexagon::PredSense_true;
+  int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense);
+  if (CondOpcode >= 0) // Valid Conditional opcode/instruction
+    return CondOpcode;
+
+  // This switch case will be removed once all the instructions have been
+  // modified to use relation maps.
+  switch(Opc) {
+  case Hexagon::TFR:
+    return !invertPredicate ? Hexagon::TFR_cPt :
+                              Hexagon::TFR_cNotPt;
+  case Hexagon::TFRI_f:
+    return !invertPredicate ? Hexagon::TFRI_cPt_f :
+                              Hexagon::TFRI_cNotPt_f;
+  case Hexagon::TFRI:
+    return !invertPredicate ? Hexagon::TFRI_cPt :
+                              Hexagon::TFRI_cNotPt;
+  case Hexagon::JMP:
+    return !invertPredicate ? Hexagon::JMP_c :
+                              Hexagon::JMP_cNot;
+  case Hexagon::JMP_EQrrPt_nv_V4:
+    return !invertPredicate ? Hexagon::JMP_EQrrPt_nv_V4 :
+                              Hexagon::JMP_EQrrNotPt_nv_V4;
+  case Hexagon::JMP_EQriPt_nv_V4:
+    return !invertPredicate ? Hexagon::JMP_EQriPt_nv_V4 :
+                              Hexagon::JMP_EQriNotPt_nv_V4;
+  case Hexagon::COMBINE_rr:
+    return !invertPredicate ? Hexagon::COMBINE_rr_cPt :
+                              Hexagon::COMBINE_rr_cNotPt;
+  case Hexagon::ASLH:
+    return !invertPredicate ? Hexagon::ASLH_cPt_V4 :
+                              Hexagon::ASLH_cNotPt_V4;
+  case Hexagon::ASRH:
+    return !invertPredicate ? Hexagon::ASRH_cPt_V4 :
+                              Hexagon::ASRH_cNotPt_V4;
+  case Hexagon::SXTB:
+    return !invertPredicate ? Hexagon::SXTB_cPt_V4 :
+                              Hexagon::SXTB_cNotPt_V4;
+  case Hexagon::SXTH:
+    return !invertPredicate ? Hexagon::SXTH_cPt_V4 :
+                              Hexagon::SXTH_cNotPt_V4;
+  case Hexagon::ZXTB:
+    return !invertPredicate ? Hexagon::ZXTB_cPt_V4 :
+                              Hexagon::ZXTB_cNotPt_V4;
+  case Hexagon::ZXTH:
+    return !invertPredicate ? Hexagon::ZXTH_cPt_V4 :
+                              Hexagon::ZXTH_cNotPt_V4;
+
+  case Hexagon::JMPR:
+    return !invertPredicate ? Hexagon::JMPR_cPt :
+                              Hexagon::JMPR_cNotPt;
+
+  // V4 indexed+scaled load.
+  case Hexagon::LDrid_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::LDrid_indexed_shl_cPt_V4 :
+                              Hexagon::LDrid_indexed_shl_cNotPt_V4;
+  case Hexagon::LDrib_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::LDrib_indexed_shl_cPt_V4 :
+                              Hexagon::LDrib_indexed_shl_cNotPt_V4;
+  case Hexagon::LDriub_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::LDriub_indexed_shl_cPt_V4 :
+                              Hexagon::LDriub_indexed_shl_cNotPt_V4;
+  case Hexagon::LDrih_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::LDrih_indexed_shl_cPt_V4 :
+                              Hexagon::LDrih_indexed_shl_cNotPt_V4;
+  case Hexagon::LDriuh_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::LDriuh_indexed_shl_cPt_V4 :
+                              Hexagon::LDriuh_indexed_shl_cNotPt_V4;
+  case Hexagon::LDriw_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::LDriw_indexed_shl_cPt_V4 :
+                              Hexagon::LDriw_indexed_shl_cNotPt_V4;
+
+  // V4 Load from global address
+  case Hexagon::LDd_GP_V4:
+    return !invertPredicate ? Hexagon::LDd_GP_cPt_V4 :
+                              Hexagon::LDd_GP_cNotPt_V4;
+  case Hexagon::LDb_GP_V4:
+    return !invertPredicate ? Hexagon::LDb_GP_cPt_V4 :
+                              Hexagon::LDb_GP_cNotPt_V4;
+  case Hexagon::LDub_GP_V4:
+    return !invertPredicate ? Hexagon::LDub_GP_cPt_V4 :
+                              Hexagon::LDub_GP_cNotPt_V4;
+  case Hexagon::LDh_GP_V4:
+    return !invertPredicate ? Hexagon::LDh_GP_cPt_V4 :
+                              Hexagon::LDh_GP_cNotPt_V4;
+  case Hexagon::LDuh_GP_V4:
+    return !invertPredicate ? Hexagon::LDuh_GP_cPt_V4 :
+                              Hexagon::LDuh_GP_cNotPt_V4;
+  case Hexagon::LDw_GP_V4:
+    return !invertPredicate ? Hexagon::LDw_GP_cPt_V4 :
+                              Hexagon::LDw_GP_cNotPt_V4;
+
+    // Byte.
+  case Hexagon::POST_STbri:
+    return !invertPredicate ? Hexagon::POST_STbri_cPt :
+                              Hexagon::POST_STbri_cNotPt;
+  case Hexagon::STrib:
+    return !invertPredicate ? Hexagon::STrib_cPt :
+                              Hexagon::STrib_cNotPt;
+  case Hexagon::STrib_indexed:
+    return !invertPredicate ? Hexagon::STrib_indexed_cPt :
+                              Hexagon::STrib_indexed_cNotPt;
+  case Hexagon::STrib_imm_V4:
+    return !invertPredicate ? Hexagon::STrib_imm_cPt_V4 :
+                              Hexagon::STrib_imm_cNotPt_V4;
+  case Hexagon::STrib_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::STrib_indexed_shl_cPt_V4 :
+                              Hexagon::STrib_indexed_shl_cNotPt_V4;
+  // Halfword.
+  case Hexagon::POST_SThri:
+    return !invertPredicate ? Hexagon::POST_SThri_cPt :
+                              Hexagon::POST_SThri_cNotPt;
+  case Hexagon::STrih:
+    return !invertPredicate ? Hexagon::STrih_cPt :
+                              Hexagon::STrih_cNotPt;
+  case Hexagon::STrih_indexed:
+    return !invertPredicate ? Hexagon::STrih_indexed_cPt :
+                              Hexagon::STrih_indexed_cNotPt;
+  case Hexagon::STrih_imm_V4:
+    return !invertPredicate ? Hexagon::STrih_imm_cPt_V4 :
+                              Hexagon::STrih_imm_cNotPt_V4;
+  case Hexagon::STrih_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::STrih_indexed_shl_cPt_V4 :
+                              Hexagon::STrih_indexed_shl_cNotPt_V4;
+  // Word.
+  case Hexagon::POST_STwri:
+    return !invertPredicate ? Hexagon::POST_STwri_cPt :
+                              Hexagon::POST_STwri_cNotPt;
+  case Hexagon::STriw:
+    return !invertPredicate ? Hexagon::STriw_cPt :
+                              Hexagon::STriw_cNotPt;
+  case Hexagon::STriw_indexed:
+    return !invertPredicate ? Hexagon::STriw_indexed_cPt :
+                              Hexagon::STriw_indexed_cNotPt;
+  case Hexagon::STriw_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::STriw_indexed_shl_cPt_V4 :
+                              Hexagon::STriw_indexed_shl_cNotPt_V4;
+  case Hexagon::STriw_imm_V4:
+    return !invertPredicate ? Hexagon::STriw_imm_cPt_V4 :
+                              Hexagon::STriw_imm_cNotPt_V4;
+  // Double word.
+  case Hexagon::POST_STdri:
+    return !invertPredicate ? Hexagon::POST_STdri_cPt :
+                              Hexagon::POST_STdri_cNotPt;
+  case Hexagon::STrid:
+    return !invertPredicate ? Hexagon::STrid_cPt :
+                              Hexagon::STrid_cNotPt;
+  case Hexagon::STrid_indexed:
+    return !invertPredicate ? Hexagon::STrid_indexed_cPt :
+                              Hexagon::STrid_indexed_cNotPt;
+  case Hexagon::STrid_indexed_shl_V4:
+    return !invertPredicate ? Hexagon::STrid_indexed_shl_cPt_V4 :
+                              Hexagon::STrid_indexed_shl_cNotPt_V4;
+
+  // V4 Store to global address
+  case Hexagon::STd_GP_V4:
+    return !invertPredicate ? Hexagon::STd_GP_cPt_V4 :
+                              Hexagon::STd_GP_cNotPt_V4;
+  case Hexagon::STb_GP_V4:
+    return !invertPredicate ? Hexagon::STb_GP_cPt_V4 :
+                              Hexagon::STb_GP_cNotPt_V4;
+  case Hexagon::STh_GP_V4:
+    return !invertPredicate ? Hexagon::STh_GP_cPt_V4 :
+                              Hexagon::STh_GP_cNotPt_V4;
+  case Hexagon::STw_GP_V4:
+    return !invertPredicate ? Hexagon::STw_GP_cPt_V4 :
+                              Hexagon::STw_GP_cNotPt_V4;
+
+  // Load.
+  case Hexagon::LDrid:
+    return !invertPredicate ? Hexagon::LDrid_cPt :
+                              Hexagon::LDrid_cNotPt;
+  case Hexagon::LDriw:
+    return !invertPredicate ? Hexagon::LDriw_cPt :
+                              Hexagon::LDriw_cNotPt;
+  case Hexagon::LDrih:
+    return !invertPredicate ? Hexagon::LDrih_cPt :
+                              Hexagon::LDrih_cNotPt;
+  case Hexagon::LDriuh:
+    return !invertPredicate ? Hexagon::LDriuh_cPt :
+                              Hexagon::LDriuh_cNotPt;
+  case Hexagon::LDrib:
+    return !invertPredicate ? Hexagon::LDrib_cPt :
+                              Hexagon::LDrib_cNotPt;
+  case Hexagon::LDriub:
+    return !invertPredicate ? Hexagon::LDriub_cPt :
+                              Hexagon::LDriub_cNotPt;
+ // Load Indexed.
+  case Hexagon::LDrid_indexed:
+    return !invertPredicate ? Hexagon::LDrid_indexed_cPt :
+                              Hexagon::LDrid_indexed_cNotPt;
+  case Hexagon::LDriw_indexed:
+    return !invertPredicate ? Hexagon::LDriw_indexed_cPt :
+                              Hexagon::LDriw_indexed_cNotPt;
+  case Hexagon::LDrih_indexed:
+    return !invertPredicate ? Hexagon::LDrih_indexed_cPt :
+                              Hexagon::LDrih_indexed_cNotPt;
+  case Hexagon::LDriuh_indexed:
+    return !invertPredicate ? Hexagon::LDriuh_indexed_cPt :
+                              Hexagon::LDriuh_indexed_cNotPt;
+  case Hexagon::LDrib_indexed:
+    return !invertPredicate ? Hexagon::LDrib_indexed_cPt :
+                              Hexagon::LDrib_indexed_cNotPt;
+  case Hexagon::LDriub_indexed:
+    return !invertPredicate ? Hexagon::LDriub_indexed_cPt :
+                              Hexagon::LDriub_indexed_cNotPt;
+  // Post Increment Load.
+  case Hexagon::POST_LDrid:
+    return !invertPredicate ? Hexagon::POST_LDrid_cPt :
+                              Hexagon::POST_LDrid_cNotPt;
+  case Hexagon::POST_LDriw:
+    return !invertPredicate ? Hexagon::POST_LDriw_cPt :
+                              Hexagon::POST_LDriw_cNotPt;
+  case Hexagon::POST_LDrih:
+    return !invertPredicate ? Hexagon::POST_LDrih_cPt :
+                              Hexagon::POST_LDrih_cNotPt;
+  case Hexagon::POST_LDriuh:
+    return !invertPredicate ? Hexagon::POST_LDriuh_cPt :
+                              Hexagon::POST_LDriuh_cNotPt;
+  case Hexagon::POST_LDrib:
+    return !invertPredicate ? Hexagon::POST_LDrib_cPt :
+                              Hexagon::POST_LDrib_cNotPt;
+  case Hexagon::POST_LDriub:
+    return !invertPredicate ? Hexagon::POST_LDriub_cPt :
+                              Hexagon::POST_LDriub_cNotPt;
+  // DEALLOC_RETURN.
+  case Hexagon::DEALLOC_RET_V4:
+    return !invertPredicate ? Hexagon::DEALLOC_RET_cPt_V4 :
+                              Hexagon::DEALLOC_RET_cNotPt_V4;
+  }
+  llvm_unreachable("Unexpected predicable instruction");
+}
+
+
+bool HexagonInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Cond) const {
+  int Opc = MI->getOpcode();
+  assert (isPredicable(MI) && "Expected predicable instruction");
+  bool invertJump = (!Cond.empty() && Cond[0].isImm() &&
+                     (Cond[0].getImm() == 0));
+
+  // This will change MI's opcode to its predicate version.
+  // However, its operand list is still the old one, i.e. the
+  // non-predicate one.
+  MI->setDesc(get(getMatchingCondBranchOpcode(Opc, invertJump)));
+
+  int oper = -1;
+  unsigned int GAIdx = 0;
+
+  // Indicates whether the current MI has a GlobalAddress operand
+  bool hasGAOpnd = false;
+  std::vector<MachineOperand> tmpOpnds;
+
+  // Indicates whether we need to shift operands to right.
+  bool needShift = true;
+
+  // The predicate is ALWAYS the FIRST input operand !!!
+  if (MI->getNumOperands() == 0) {
+    // The non-predicate version of MI does not take any operands,
+    // i.e. no outs and no ins. In this condition, the predicate
+    // operand will be directly placed at Operands[0]. No operand
+    // shift is needed.
+    // Example: BARRIER
+    needShift = false;
+    oper = -1;
+  }
+  else if (   MI->getOperand(MI->getNumOperands()-1).isReg()
+           && MI->getOperand(MI->getNumOperands()-1).isDef()
+           && !MI->getOperand(MI->getNumOperands()-1).isImplicit()) {
+    // The non-predicate version of MI does not have any input operands.
+    // In this condition, we extend the length of Operands[] by one and
+    // copy the original last operand to the newly allocated slot.
+    // At this moment, it is just a place holder. Later, we will put
+    // predicate operand directly into it. No operand shift is needed.
+    // Example: r0=BARRIER (this is a faked insn used here for illustration)
+    MI->addOperand(MI->getOperand(MI->getNumOperands()-1));
+    needShift = false;
+    oper = MI->getNumOperands() - 2;
+  }
+  else {
+    // We need to right shift all input operands by one. Duplicate the
+    // last operand into the newly allocated slot.
+    MI->addOperand(MI->getOperand(MI->getNumOperands()-1));
+  }
+
+  if (needShift)
+  {
+    // Operands[ MI->getNumOperands() - 2 ] has been copied into
+    // Operands[ MI->getNumOperands() - 1 ], so we start from
+    // Operands[ MI->getNumOperands() - 3 ].
+    // oper is a signed int.
+    // It is ok if "MI->getNumOperands()-3" is -3, -2, or -1.
+    for (oper = MI->getNumOperands() - 3; oper >= 0; --oper)
+    {
+      MachineOperand &MO = MI->getOperand(oper);
+
+      // Opnd[0] Opnd[1] Opnd[2] Opnd[3] Opnd[4]   Opnd[5]   Opnd[6]   Opnd[7]
+      // <Def0>  <Def1>  <Use0>  <Use1>  <ImpDef0> <ImpDef1> <ImpUse0> <ImpUse1>
+      //               /\~
+      //              /||\~
+      //               ||
+      //        Predicate Operand here
+      if (MO.isReg() && !MO.isUse() && !MO.isImplicit()) {
+        break;
+      }
+      if (MO.isReg()) {
+        MI->getOperand(oper+1).ChangeToRegister(MO.getReg(), MO.isDef(),
+                                                MO.isImplicit(), MO.isKill(),
+                                                MO.isDead(), MO.isUndef(),
+                                                MO.isDebug());
+      }
+      else if (MO.isImm()) {
+        MI->getOperand(oper+1).ChangeToImmediate(MO.getImm());
+      }
+      else if (MO.isGlobal()) {
+        // MI can not have more than one GlobalAddress operand.
+        assert(hasGAOpnd == false && "MI can only have one GlobalAddress opnd");
+
+        // There is no member function called "ChangeToGlobalAddress" in the
+        // MachineOperand class (not like "ChangeToRegister" and
+        // "ChangeToImmediate"). So we have to remove them from Operands[] list
+        // first, and then add them back after we have inserted the predicate
+        // operand. tmpOpnds[] is to remember these operands before we remove
+        // them.
+        tmpOpnds.push_back(MO);
+
+        // Operands[oper] is a GlobalAddress operand;
+        // Operands[oper+1] has been copied into Operands[oper+2];
+        hasGAOpnd = true;
+        GAIdx = oper;
+        continue;
+      }
+      else {
+        assert(false && "Unexpected operand type");
+      }
+    }
+  }
+
+  int regPos = invertJump ? 1 : 0;
+  MachineOperand PredMO = Cond[regPos];
+
+  // [oper] now points to the last explicit Def. Predicate operand must be
+  // located at [oper+1]. See diagram above.
+  // This assumes that the predicate is always the first operand,
+  // i.e. Operands[0+numResults], in the set of inputs
+  // It is better to have an assert here to check this. But I don't know how
+  // to write this assert because findFirstPredOperandIdx() would return -1
+  if (oper < -1) oper = -1;
+  MI->getOperand(oper+1).ChangeToRegister(PredMO.getReg(), PredMO.isDef(),
+                                          PredMO.isImplicit(), PredMO.isKill(),
+                                          PredMO.isDead(), PredMO.isUndef(),
+                                          PredMO.isDebug());
+
+  if (hasGAOpnd)
+  {
+    unsigned int i;
+
+    // Operands[GAIdx] is the original GlobalAddress operand, which is
+    // already copied into tmpOpnds[0].
+    // Operands[GAIdx] now stores a copy of Operands[GAIdx-1]
+    // Operands[GAIdx+1] has already been copied into Operands[GAIdx+2],
+    // so we start from [GAIdx+2]
+    for (i = GAIdx + 2; i < MI->getNumOperands(); ++i)
+      tmpOpnds.push_back(MI->getOperand(i));
+
+    // Remove all operands in range [ (GAIdx+1) ... (MI->getNumOperands()-1) ]
+    // It is very important that we always remove from the end of Operands[]
+    // MI->getNumOperands() is at least 2 if program goes to here.
+    for (i = MI->getNumOperands() - 1; i > GAIdx; --i)
+      MI->RemoveOperand(i);
+
+    for (i = 0; i < tmpOpnds.size(); ++i)
+      MI->addOperand(tmpOpnds[i]);
+  }
+
+  return true;
+}
+
+
+bool
+HexagonInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles,
+                    unsigned ExtraPredCycles,
+                    const BranchProbability &Probability) const {
+  return true;
+}
+
+
+bool
+HexagonInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned NumTCycles,
+                    unsigned ExtraTCycles,
+                    MachineBasicBlock &FMBB,
+                    unsigned NumFCycles,
+                    unsigned ExtraFCycles,
+                    const BranchProbability &Probability) const {
+  return true;
+}
+
+
+bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
+}
+
+bool HexagonInstrInfo::isPredicatedNew(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  assert(isPredicated(MI));
+  return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
+}
+
+bool
+HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                   std::vector<MachineOperand> &Pred) const {
+  for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) {
+    MachineOperand MO = MI->getOperand(oper);
+    if (MO.isReg() && MO.isDef()) {
+      const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg());
+      if (RC == &Hexagon::PredRegsRegClass) {
+        Pred.push_back(MO);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+
+bool
+HexagonInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                  const SmallVectorImpl<MachineOperand> &Pred2) const {
+  // TODO: Fix this
+  return false;
+}
+
+
+//
+// We indicate that we want to reverse the branch by
+// inserting a 0 at the beginning of the Cond vector.
+//
+bool HexagonInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  if (!Cond.empty() && Cond[0].isImm() && Cond[0].getImm() == 0) {
+    Cond.erase(Cond.begin());
+  } else {
+    Cond.insert(Cond.begin(), MachineOperand::CreateImm(0));
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::
+isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs,
+                          const BranchProbability &Probability) const {
+  return (NumInstrs <= 4);
+}
+
+bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default: return false;
+  case Hexagon::DEALLOC_RET_V4 :
+  case Hexagon::DEALLOC_RET_cPt_V4 :
+  case Hexagon::DEALLOC_RET_cNotPt_V4 :
+  case Hexagon::DEALLOC_RET_cdnPnt_V4 :
+  case Hexagon::DEALLOC_RET_cNotdnPnt_V4 :
+  case Hexagon::DEALLOC_RET_cdnPt_V4 :
+  case Hexagon::DEALLOC_RET_cNotdnPt_V4 :
+   return true;
+  }
+}
+
+
+bool HexagonInstrInfo::
+isValidOffset(const int Opcode, const int Offset) const {
+  // This function is to check whether the "Offset" is in the correct range of
+  // the given "Opcode". If "Offset" is not in the correct range, "ADD_ri" is
+  // inserted to calculate the final address. Due to this reason, the function
+  // assumes that the "Offset" has correct alignment.
+  // We used to assert if the offset was not properly aligned, however,
+  // there are cases where a misaligned pointer recast can cause this
+  // problem, and we need to allow for it. The front end warns of such
+  // misaligns with respect to load size.
+
+  switch(Opcode) {
+
+  case Hexagon::LDriw:
+  case Hexagon::LDriw_indexed:
+  case Hexagon::LDriw_f:
+  case Hexagon::STriw_indexed:
+  case Hexagon::STriw:
+  case Hexagon::STriw_f:
+    return (Offset >= Hexagon_MEMW_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMW_OFFSET_MAX);
+
+  case Hexagon::LDrid:
+  case Hexagon::LDrid_indexed:
+  case Hexagon::LDrid_f:
+  case Hexagon::STrid:
+  case Hexagon::STrid_indexed:
+  case Hexagon::STrid_f:
+    return (Offset >= Hexagon_MEMD_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMD_OFFSET_MAX);
+
+  case Hexagon::LDrih:
+  case Hexagon::LDriuh:
+  case Hexagon::STrih:
+    return (Offset >= Hexagon_MEMH_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMH_OFFSET_MAX);
+
+  case Hexagon::LDrib:
+  case Hexagon::STrib:
+  case Hexagon::LDriub:
+    return (Offset >= Hexagon_MEMB_OFFSET_MIN) &&
+      (Offset <= Hexagon_MEMB_OFFSET_MAX);
+
+  case Hexagon::ADD_ri:
+  case Hexagon::TFR_FI:
+    return (Offset >= Hexagon_ADDI_OFFSET_MIN) &&
+      (Offset <= Hexagon_ADDI_OFFSET_MAX);
+
+  case Hexagon::MemOPw_ADDi_V4 :
+  case Hexagon::MemOPw_SUBi_V4 :
+  case Hexagon::MemOPw_ADDr_V4 :
+  case Hexagon::MemOPw_SUBr_V4 :
+  case Hexagon::MemOPw_ANDr_V4 :
+  case Hexagon::MemOPw_ORr_V4 :
+    return (0 <= Offset && Offset <= 255);
+
+  case Hexagon::MemOPh_ADDi_V4 :
+  case Hexagon::MemOPh_SUBi_V4 :
+  case Hexagon::MemOPh_ADDr_V4 :
+  case Hexagon::MemOPh_SUBr_V4 :
+  case Hexagon::MemOPh_ANDr_V4 :
+  case Hexagon::MemOPh_ORr_V4 :
+    return (0 <= Offset && Offset <= 127);
+
+  case Hexagon::MemOPb_ADDi_V4 :
+  case Hexagon::MemOPb_SUBi_V4 :
+  case Hexagon::MemOPb_ADDr_V4 :
+  case Hexagon::MemOPb_SUBr_V4 :
+  case Hexagon::MemOPb_ANDr_V4 :
+  case Hexagon::MemOPb_ORr_V4 :
+    return (0 <= Offset && Offset <= 63);
+
+  // LDri_pred and STriw_pred are pseudo operations, so it has to take offset of
+  // any size. Later pass knows how to handle it.
+  case Hexagon::STriw_pred:
+  case Hexagon::LDriw_pred:
+    return true;
+
+  case Hexagon::LOOP0_i:
+    return isUInt<10>(Offset);
+
+  // INLINEASM is very special.
+  case Hexagon::INLINEASM:
+    return true;
+  }
+
+  llvm_unreachable("No offset range is defined for this opcode. "
+                   "Please define it in the above switch statement!");
+}
+
+
+//
+// Check if the Offset is a valid auto-inc imm by Load/Store Type.
+//
+bool HexagonInstrInfo::
+isValidAutoIncImm(const EVT VT, const int Offset) const {
+
+  if (VT == MVT::i64) {
+      return (Offset >= Hexagon_MEMD_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMD_AUTOINC_MAX &&
+              (Offset & 0x7) == 0);
+  }
+  if (VT == MVT::i32) {
+      return (Offset >= Hexagon_MEMW_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMW_AUTOINC_MAX &&
+              (Offset & 0x3) == 0);
+  }
+  if (VT == MVT::i16) {
+      return (Offset >= Hexagon_MEMH_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMH_AUTOINC_MAX &&
+              (Offset & 0x1) == 0);
+  }
+  if (VT == MVT::i8) {
+      return (Offset >= Hexagon_MEMB_AUTOINC_MIN &&
+              Offset <= Hexagon_MEMB_AUTOINC_MAX);
+  }
+  llvm_unreachable("Not an auto-inc opc!");
+}
+
+
+bool HexagonInstrInfo::
+isMemOp(const MachineInstr *MI) const {
+  switch (MI->getOpcode())
+  {
+    default: return false;
+    case Hexagon::MemOPw_ADDi_V4 :
+    case Hexagon::MemOPw_SUBi_V4 :
+    case Hexagon::MemOPw_ADDr_V4 :
+    case Hexagon::MemOPw_SUBr_V4 :
+    case Hexagon::MemOPw_ANDr_V4 :
+    case Hexagon::MemOPw_ORr_V4 :
+    case Hexagon::MemOPh_ADDi_V4 :
+    case Hexagon::MemOPh_SUBi_V4 :
+    case Hexagon::MemOPh_ADDr_V4 :
+    case Hexagon::MemOPh_SUBr_V4 :
+    case Hexagon::MemOPh_ANDr_V4 :
+    case Hexagon::MemOPh_ORr_V4 :
+    case Hexagon::MemOPb_ADDi_V4 :
+    case Hexagon::MemOPb_SUBi_V4 :
+    case Hexagon::MemOPb_ADDr_V4 :
+    case Hexagon::MemOPb_SUBr_V4 :
+    case Hexagon::MemOPb_ANDr_V4 :
+    case Hexagon::MemOPb_ORr_V4 :
+    case Hexagon::MemOPb_SETBITi_V4:
+    case Hexagon::MemOPh_SETBITi_V4:
+    case Hexagon::MemOPw_SETBITi_V4:
+    case Hexagon::MemOPb_CLRBITi_V4:
+    case Hexagon::MemOPh_CLRBITi_V4:
+    case Hexagon::MemOPw_CLRBITi_V4:
+    return true;
+  }
+  return false;
+}
+
+
+bool HexagonInstrInfo::
+isSpillPredRegOp(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+    default: return false;
+    case Hexagon::STriw_pred :
+    case Hexagon::LDriw_pred :
+      return true;
+  }
+}
+
+bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+    default: return false;
+    case Hexagon::CMPEQrr:
+    case Hexagon::CMPEQri:
+    case Hexagon::CMPLTrr:
+    case Hexagon::CMPGTrr:
+    case Hexagon::CMPGTri:
+    case Hexagon::CMPLTUrr:
+    case Hexagon::CMPGTUrr:
+    case Hexagon::CMPGTUri:
+    case Hexagon::CMPGEri:
+    case Hexagon::CMPGEUri:
+      return true;
+  }
+}
+
+bool HexagonInstrInfo::
+isConditionalTransfer (const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+    default: return false;
+    case Hexagon::TFR_cPt:
+    case Hexagon::TFR_cNotPt:
+    case Hexagon::TFRI_cPt:
+    case Hexagon::TFRI_cNotPt:
+    case Hexagon::TFR_cdnPt:
+    case Hexagon::TFR_cdnNotPt:
+    case Hexagon::TFRI_cdnPt:
+    case Hexagon::TFRI_cdnNotPt:
+      return true;
+  }
+}
+
+bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const {
+  const HexagonRegisterInfo& QRI = getRegisterInfo();
+  switch (MI->getOpcode())
+  {
+    default: return false;
+    case Hexagon::ADD_ri_cPt:
+    case Hexagon::ADD_ri_cNotPt:
+    case Hexagon::ADD_rr_cPt:
+    case Hexagon::ADD_rr_cNotPt:
+    case Hexagon::XOR_rr_cPt:
+    case Hexagon::XOR_rr_cNotPt:
+    case Hexagon::AND_rr_cPt:
+    case Hexagon::AND_rr_cNotPt:
+    case Hexagon::OR_rr_cPt:
+    case Hexagon::OR_rr_cNotPt:
+    case Hexagon::SUB_rr_cPt:
+    case Hexagon::SUB_rr_cNotPt:
+    case Hexagon::COMBINE_rr_cPt:
+    case Hexagon::COMBINE_rr_cNotPt:
+      return true;
+    case Hexagon::ASLH_cPt_V4:
+    case Hexagon::ASLH_cNotPt_V4:
+    case Hexagon::ASRH_cPt_V4:
+    case Hexagon::ASRH_cNotPt_V4:
+    case Hexagon::SXTB_cPt_V4:
+    case Hexagon::SXTB_cNotPt_V4:
+    case Hexagon::SXTH_cPt_V4:
+    case Hexagon::SXTH_cNotPt_V4:
+    case Hexagon::ZXTB_cPt_V4:
+    case Hexagon::ZXTB_cNotPt_V4:
+    case Hexagon::ZXTH_cPt_V4:
+    case Hexagon::ZXTH_cNotPt_V4:
+      return QRI.Subtarget.hasV4TOps();
+  }
+}
+
+bool HexagonInstrInfo::
+isConditionalLoad (const MachineInstr* MI) const {
+  const HexagonRegisterInfo& QRI = getRegisterInfo();
+  switch (MI->getOpcode())
+  {
+    default: return false;
+    case Hexagon::LDrid_cPt :
+    case Hexagon::LDrid_cNotPt :
+    case Hexagon::LDrid_indexed_cPt :
+    case Hexagon::LDrid_indexed_cNotPt :
+    case Hexagon::LDriw_cPt :
+    case Hexagon::LDriw_cNotPt :
+    case Hexagon::LDriw_indexed_cPt :
+    case Hexagon::LDriw_indexed_cNotPt :
+    case Hexagon::LDrih_cPt :
+    case Hexagon::LDrih_cNotPt :
+    case Hexagon::LDrih_indexed_cPt :
+    case Hexagon::LDrih_indexed_cNotPt :
+    case Hexagon::LDrib_cPt :
+    case Hexagon::LDrib_cNotPt :
+    case Hexagon::LDrib_indexed_cPt :
+    case Hexagon::LDrib_indexed_cNotPt :
+    case Hexagon::LDriuh_cPt :
+    case Hexagon::LDriuh_cNotPt :
+    case Hexagon::LDriuh_indexed_cPt :
+    case Hexagon::LDriuh_indexed_cNotPt :
+    case Hexagon::LDriub_cPt :
+    case Hexagon::LDriub_cNotPt :
+    case Hexagon::LDriub_indexed_cPt :
+    case Hexagon::LDriub_indexed_cNotPt :
+      return true;
+    case Hexagon::POST_LDrid_cPt :
+    case Hexagon::POST_LDrid_cNotPt :
+    case Hexagon::POST_LDriw_cPt :
+    case Hexagon::POST_LDriw_cNotPt :
+    case Hexagon::POST_LDrih_cPt :
+    case Hexagon::POST_LDrih_cNotPt :
+    case Hexagon::POST_LDrib_cPt :
+    case Hexagon::POST_LDrib_cNotPt :
+    case Hexagon::POST_LDriuh_cPt :
+    case Hexagon::POST_LDriuh_cNotPt :
+    case Hexagon::POST_LDriub_cPt :
+    case Hexagon::POST_LDriub_cNotPt :
+      return QRI.Subtarget.hasV4TOps();
+    case Hexagon::LDrid_indexed_shl_cPt_V4 :
+    case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDrib_indexed_shl_cPt_V4 :
+    case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDriub_indexed_shl_cPt_V4 :
+    case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDrih_indexed_shl_cPt_V4 :
+    case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDriuh_indexed_shl_cPt_V4 :
+    case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
+    case Hexagon::LDriw_indexed_shl_cPt_V4 :
+    case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
+      return QRI.Subtarget.hasV4TOps();
+  }
+}
+
+// Returns true if an instruction is a conditional store.
+//
+// Note: It doesn't include conditional new-value stores as they can't be
+// converted to .new predicate.
+//
+//               p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ]
+//                ^           ^
+//               /             \ (not OK. it will cause new-value store to be
+//              /               X conditional on p0.new while R2 producer is
+//             /                 \ on p0)
+//            /                   \.
+//     p.new store                 p.old NV store
+// [if(p0.new)memw(R0+#0)=R2]    [if(p0)memw(R0+#0)=R2.new]
+//            ^                  ^
+//             \                /
+//              \              /
+//               \            /
+//                 p.old store
+//             [if (p0)memw(R0+#0)=R2]
+//
+// The above diagram shows the steps involoved in the conversion of a predicated
+// store instruction to its .new predicated new-value form.
+//
+// The following set of instructions further explains the scenario where
+// conditional new-value store becomes invalid when promoted to .new predicate
+// form.
+//
+// { 1) if (p0) r0 = add(r1, r2)
+//   2) p0 = cmp.eq(r3, #0) }
+//
+//   3) if (p0) memb(r1+#0) = r0  --> this instruction can't be grouped with
+// the first two instructions because in instr 1, r0 is conditional on old value
+// of p0 but its use in instr 3 is conditional on p0 modified by instr 2 which
+// is not valid for new-value stores.
+bool HexagonInstrInfo::
+isConditionalStore (const MachineInstr* MI) const {
+  const HexagonRegisterInfo& QRI = getRegisterInfo();
+  switch (MI->getOpcode())
+  {
+    default: return false;
+    case Hexagon::STrib_imm_cPt_V4 :
+    case Hexagon::STrib_imm_cNotPt_V4 :
+    case Hexagon::STrib_indexed_shl_cPt_V4 :
+    case Hexagon::STrib_indexed_shl_cNotPt_V4 :
+    case Hexagon::STrib_cPt :
+    case Hexagon::STrib_cNotPt :
+    case Hexagon::POST_STbri_cPt :
+    case Hexagon::POST_STbri_cNotPt :
+    case Hexagon::STrid_indexed_cPt :
+    case Hexagon::STrid_indexed_cNotPt :
+    case Hexagon::STrid_indexed_shl_cPt_V4 :
+    case Hexagon::POST_STdri_cPt :
+    case Hexagon::POST_STdri_cNotPt :
+    case Hexagon::STrih_cPt :
+    case Hexagon::STrih_cNotPt :
+    case Hexagon::STrih_indexed_cPt :
+    case Hexagon::STrih_indexed_cNotPt :
+    case Hexagon::STrih_imm_cPt_V4 :
+    case Hexagon::STrih_imm_cNotPt_V4 :
+    case Hexagon::STrih_indexed_shl_cPt_V4 :
+    case Hexagon::STrih_indexed_shl_cNotPt_V4 :
+    case Hexagon::POST_SThri_cPt :
+    case Hexagon::POST_SThri_cNotPt :
+    case Hexagon::STriw_cPt :
+    case Hexagon::STriw_cNotPt :
+    case Hexagon::STriw_indexed_cPt :
+    case Hexagon::STriw_indexed_cNotPt :
+    case Hexagon::STriw_imm_cPt_V4 :
+    case Hexagon::STriw_imm_cNotPt_V4 :
+    case Hexagon::STriw_indexed_shl_cPt_V4 :
+    case Hexagon::STriw_indexed_shl_cNotPt_V4 :
+    case Hexagon::POST_STwri_cPt :
+    case Hexagon::POST_STwri_cNotPt :
+      return QRI.Subtarget.hasV4TOps();
+
+    // V4 global address store before promoting to dot new.
+    case Hexagon::STd_GP_cPt_V4 :
+    case Hexagon::STd_GP_cNotPt_V4 :
+    case Hexagon::STb_GP_cPt_V4 :
+    case Hexagon::STb_GP_cNotPt_V4 :
+    case Hexagon::STh_GP_cPt_V4 :
+    case Hexagon::STh_GP_cNotPt_V4 :
+    case Hexagon::STw_GP_cPt_V4 :
+    case Hexagon::STw_GP_cNotPt_V4 :
+      return QRI.Subtarget.hasV4TOps();
+
+    // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
+    // from the "Conditional Store" list. Because a predicated new value store
+    // would NOT be promoted to a double dot new store. See diagram below:
+    // This function returns yes for those stores that are predicated but not
+    // yet promoted to predicate dot new instructions.
+    //
+    //                          +---------------------+
+    //                    /-----| if (p0) memw(..)=r0 |---------\~
+    //                   ||     +---------------------+         ||
+    //          promote  ||       /\       /\                   ||  promote
+    //                   ||      /||\     /||\                  ||
+    //                  \||/    demote     ||                  \||/
+    //                   \/       ||       ||                   \/
+    //       +-------------------------+   ||   +-------------------------+
+    //       | if (p0.new) memw(..)=r0 |   ||   | if (p0) memw(..)=r0.new |
+    //       +-------------------------+   ||   +-------------------------+
+    //                        ||           ||         ||
+    //                        ||         demote      \||/
+    //                      promote        ||         \/ NOT possible
+    //                        ||           ||         /\~
+    //                       \||/          ||        /||\~
+    //                        \/           ||         ||
+    //                      +-----------------------------+
+    //                      | if (p0.new) memw(..)=r0.new |
+    //                      +-----------------------------+
+    //                           Double Dot New Store
+    //
+  }
+}
+
+// Returns true, if any one of the operands is a dot new
+// insn, whether it is predicated dot new or register dot new.
+bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const {
+  return (isNewValueInst(MI) ||
+     (isPredicated(MI) && isPredicatedNew(MI)));
+}
+
+unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return((F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask);
+}
+
+/// immediateExtend - Changes the instruction in place to one using an immediate
+/// extender.
+void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const {
+  assert((isExtendable(MI)||isConstExtended(MI)) &&
+                               "Instruction must be extendable");
+  // Find which operand is extendable.
+  short ExtOpNum = getCExtOpNum(MI);
+  MachineOperand &MO = MI->getOperand(ExtOpNum);
+  // This needs to be something we understand.
+  assert((MO.isMBB() || MO.isImm()) &&
+         "Branch with unknown extendable field type");
+  // Mark given operand as extended.
+  MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
+}
+
+DFAPacketizer *HexagonInstrInfo::
+CreateTargetScheduleState(const TargetMachine *TM,
+                           const ScheduleDAG *DAG) const {
+  const InstrItineraryData *II = TM->getInstrItineraryData();
+  return TM->getSubtarget<HexagonGenSubtargetInfo>().createDFAPacketizer(II);
+}
+
+bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  // Debug info is never a scheduling boundary. It's necessary to be explicit
+  // due to the special treatment of IT instructions below, otherwise a
+  // dbg_value followed by an IT will result in the IT instruction being
+  // considered a scheduling hazard, which is wrong. It should be the actual
+  // instruction preceding the dbg_value instruction(s), just like it is
+  // when debug info is not present.
+  if (MI->isDebugValue())
+    return false;
+
+  // Terminators and labels can't be scheduled around.
+  if (MI->getDesc().isTerminator() || MI->isLabel() || MI->isInlineAsm())
+    return true;
+
+  return false;
+}
+
+bool HexagonInstrInfo::isConstExtended(MachineInstr *MI) const {
+
+  // Constant extenders are allowed only for V4 and above.
+  if (!Subtarget.hasV4TOps())
+    return false;
+
+  const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+  if (isExtended) // Instruction must be extended.
+    return true;
+
+  unsigned isExtendable = (F >> HexagonII::ExtendablePos)
+                          & HexagonII::ExtendableMask;
+  if (!isExtendable)
+    return false;
+
+  short ExtOpNum = getCExtOpNum(MI);
+  const MachineOperand &MO = MI->getOperand(ExtOpNum);
+  // Use MO operand flags to determine if MO
+  // has the HMOTF_ConstExtended flag set.
+  if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended)
+    return true;
+  // If this is a Machine BB address we are talking about, and it is
+  // not marked as extended, say so.
+  if (MO.isMBB())
+    return false;
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a global address into it. If it is a global address it will be constant
+  // extended. We do this for COMBINE.
+  // We currently only handle isGlobal() because it is the only kind of
+  // object we are going to end up with here for now.
+  // In the future we probably should add isSymbol(), etc.
+  if (MO.isGlobal() || MO.isSymbol())
+    return true;
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int MinValue = getMinValue(MI);
+  int MaxValue = getMaxValue(MI);
+  int ImmValue = MO.getImm();
+
+  return (ImmValue < MinValue || ImmValue > MaxValue);
+}
+
+// Returns true if a particular operand is extendable for an instruction.
+bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI,
+                                         unsigned short OperandNum) const {
+  // Constant extenders are allowed only for V4 and above.
+  if (!Subtarget.hasV4TOps())
+    return false;
+
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
+          == OperandNum;
+}
+
+// Returns Operand Index for the constant extended instruction.
+unsigned short HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
+}
+
+// Returns the min value that doesn't need to be extended.
+int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return -1 << (bits - 1);
+  else
+    return 0;
+}
+
+// Returns the max value that doesn't need to be extended.
+int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return ~(-1 << (bits - 1));
+  else
+    return ~(-1 << bits);
+}
+
+// Returns true if an instruction can be converted into a non-extended
+// equivalent instruction.
+bool HexagonInstrInfo::NonExtEquivalentExists (const MachineInstr *MI) const {
+
+  short NonExtOpcode;
+  // Check if the instruction has a register form that uses register in place
+  // of the extended operand, if so return that as the non-extended form.
+  if (Hexagon::getRegForm(MI->getOpcode()) >= 0)
+    return true;
+
+  if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
+    // Check addressing mode and retreive non-ext equivalent instruction.
+
+    switch (getAddrMode(MI)) {
+    case HexagonII::Absolute :
+      // Load/store with absolute addressing mode can be converted into
+      // base+offset mode.
+      NonExtOpcode = Hexagon::getBasedWithImmOffset(MI->getOpcode());
+      break;
+    case HexagonII::BaseImmOffset :
+      // Load/store with base+offset addressing mode can be converted into
+      // base+register offset addressing mode. However left shift operand should
+      // be set to 0.
+      NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode());
+      break;
+    default:
+      return false;
+    }
+    if (NonExtOpcode < 0)
+      return false;
+    return true;
+  }
+  return false;
+}
+
+// Returns opcode of the non-extended equivalent instruction.
+short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const {
+
+  // Check if the instruction has a register form that uses register in place
+  // of the extended operand, if so return that as the non-extended form.
+  short NonExtOpcode = Hexagon::getRegForm(MI->getOpcode());
+    if (NonExtOpcode >= 0)
+      return NonExtOpcode;
+
+  if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) {
+    // Check addressing mode and retreive non-ext equivalent instruction.
+    switch (getAddrMode(MI)) {
+    case HexagonII::Absolute :
+      return Hexagon::getBasedWithImmOffset(MI->getOpcode());
+    case HexagonII::BaseImmOffset :
+      return Hexagon::getBaseWithRegOffset(MI->getOpcode());
+    default:
+      return -1;
+    }
+  }
+  return -1;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
new file mode 100644
index 000000000000..5df13a88b5d3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -0,0 +1,207 @@
+//===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonINSTRUCTIONINFO_H
+#define HexagonINSTRUCTIONINFO_H
+
+#include "HexagonRegisterInfo.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+
+#define GET_INSTRINFO_HEADER
+#include "HexagonGenInstrInfo.inc"
+
+namespace llvm {
+
+class HexagonInstrInfo : public HexagonGenInstrInfo {
+  const HexagonRegisterInfo RI;
+  const HexagonSubtarget& Subtarget;
+public:
+  explicit HexagonInstrInfo(HexagonSubtarget &ST);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+
+
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const;
+
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+
+  virtual bool analyzeCompare(const MachineInstr *MI,
+                              unsigned &SrcReg, unsigned &SrcReg2,
+                              int &Mask, int &Value) const;
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const;
+
+  virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+                              SmallVectorImpl<MachineOperand> &Addr,
+                              const TargetRegisterClass *RC,
+                              SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+
+  virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                               SmallVectorImpl<MachineOperand> &Addr,
+                               const TargetRegisterClass *RC,
+                               SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              int FrameIndex) const;
+
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              MachineInstr* LoadMI) const {
+    return 0;
+  }
+
+  unsigned createVR(MachineFunction* MF, MVT VT) const;
+
+  virtual bool isPredicable(MachineInstr *MI) const;
+  virtual bool
+  PredicateInstruction(MachineInstr *MI,
+                       const SmallVectorImpl<MachineOperand> &Cond) const;
+
+  virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                                   unsigned ExtraPredCycles,
+                                   const BranchProbability &Probability) const;
+
+  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                   unsigned NumTCycles, unsigned ExtraTCycles,
+                                   MachineBasicBlock &FMBB,
+                                   unsigned NumFCycles, unsigned ExtraFCycles,
+                                   const BranchProbability &Probability) const;
+
+  virtual bool isPredicated(const MachineInstr *MI) const;
+  virtual bool isPredicatedNew(const MachineInstr *MI) const;
+  virtual bool DefinesPredicate(MachineInstr *MI,
+                                std::vector<MachineOperand> &Pred) const;
+  virtual bool
+  SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                    const SmallVectorImpl<MachineOperand> &Pred2) const;
+
+  virtual bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  virtual bool
+  isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumCycles,
+                            const BranchProbability &Probability) const;
+
+  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+                                                 int FrameIx,
+                                                 uint64_t Offset,
+                                                 const MDNode *MDPtr,
+                                                 DebugLoc DL) const;
+  virtual DFAPacketizer*
+  CreateTargetScheduleState(const TargetMachine *TM,
+                            const ScheduleDAG *DAG) const;
+
+  virtual bool isSchedulingBoundary(const MachineInstr *MI,
+                                    const MachineBasicBlock *MBB,
+                                    const MachineFunction &MF) const;
+  bool isValidOffset(const int Opcode, const int Offset) const;
+  bool isValidAutoIncImm(const EVT VT, const int Offset) const;
+  bool isMemOp(const MachineInstr *MI) const;
+  bool isSpillPredRegOp(const MachineInstr *MI) const;
+  bool isU6_3Immediate(const int value) const;
+  bool isU6_2Immediate(const int value) const;
+  bool isU6_1Immediate(const int value) const;
+  bool isU6_0Immediate(const int value) const;
+  bool isS4_3Immediate(const int value) const;
+  bool isS4_2Immediate(const int value) const;
+  bool isS4_1Immediate(const int value) const;
+  bool isS4_0Immediate(const int value) const;
+  bool isS12_Immediate(const int value) const;
+  bool isU6_Immediate(const int value) const;
+  bool isS8_Immediate(const int value) const;
+  bool isS6_Immediate(const int value) const;
+
+  bool isSaveCalleeSavedRegsCall(const MachineInstr* MI) const;
+  bool isConditionalTransfer(const MachineInstr* MI) const;
+  bool isConditionalALU32 (const MachineInstr* MI) const;
+  bool isConditionalLoad (const MachineInstr* MI) const;
+  bool isConditionalStore(const MachineInstr* MI) const;
+  bool isNewValueInst(const MachineInstr* MI) const;
+  bool isDotNewInst(const MachineInstr* MI) const;
+  bool isDeallocRet(const MachineInstr *MI) const;
+  unsigned getInvertedPredicatedOpcode(const int Opc) const;
+  bool isExtendable(const MachineInstr* MI) const;
+  bool isExtended(const MachineInstr* MI) const;
+  bool isPostIncrement(const MachineInstr* MI) const;
+  bool isNewValueStore(const MachineInstr* MI) const;
+  bool isNewValueJump(const MachineInstr* MI) const;
+  bool isNewValueJumpCandidate(const MachineInstr *MI) const;
+
+
+  void immediateExtend(MachineInstr *MI) const;
+  bool isConstExtended(MachineInstr *MI) const;
+  unsigned getAddrMode(const MachineInstr* MI) const;
+  bool isOperandExtended(const MachineInstr *MI,
+                         unsigned short OperandNum) const;
+  unsigned short getCExtOpNum(const MachineInstr *MI) const;
+  int getMinValue(const MachineInstr *MI) const;
+  int getMaxValue(const MachineInstr *MI) const;
+  bool NonExtEquivalentExists (const MachineInstr *MI) const;
+  short getNonExtOpcode(const MachineInstr *MI) const;
+private:
+  int getMatchingCondBranchOpcode(int Opc, bool sense) const;
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
new file mode 100644
index 000000000000..74dc0ca72a04
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -0,0 +1,2765 @@
+//==- HexagonInstrInfo.td - Target Description for Hexagon -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrFormats.td"
+include "HexagonOperands.td"
+
+// Multi-class for logical operators.
+multiclass ALU32_rr_ri<string OpcStr, SDNode OpNode> {
+  def rr : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                 [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$b),
+                                                   (i32 IntRegs:$c)))]>;
+  def ri : ALU32_ri<(outs IntRegs:$dst), (ins s10Imm:$b, IntRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "(#$b, $c)")),
+                 [(set (i32 IntRegs:$dst), (OpNode s10Imm:$b,
+                                                   (i32 IntRegs:$c)))]>;
+}
+
+// Multi-class for compare ops.
+let isCompare = 1 in {
+multiclass CMP64_rr<string OpcStr, PatFrag OpNode> {
+  def rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                 [(set (i1 PredRegs:$dst),
+                       (OpNode (i64 DoubleRegs:$b), (i64 DoubleRegs:$c)))]>;
+}
+multiclass CMP32_rr<string OpcStr, PatFrag OpNode> {
+  def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                 [(set (i1 PredRegs:$dst),
+                       (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
+}
+
+multiclass CMP32_rr_ri_s10<string OpcStr, string CextOp, PatFrag OpNode> {
+  let CextOpcode = CextOp in {
+    let InputType = "reg" in
+    def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                   [(set (i1 PredRegs:$dst),
+                         (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
+
+    let isExtendable = 1, opExtendable = 2, isExtentSigned = 1,
+    opExtentBits = 10, InputType = "imm" in
+    def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s10Ext:$c),
+                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
+                   [(set (i1 PredRegs:$dst),
+                         (OpNode (i32 IntRegs:$b), s10ExtPred:$c))]>;
+  }
+}
+
+multiclass CMP32_rr_ri_u9<string OpcStr, string CextOp, PatFrag OpNode> {
+  let CextOpcode = CextOp in {
+    let InputType = "reg" in
+    def rr : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                   [(set (i1 PredRegs:$dst),
+                         (OpNode (i32 IntRegs:$b), (i32 IntRegs:$c)))]>;
+
+    let isExtendable = 1, opExtendable = 2, isExtentSigned = 0,
+    opExtentBits = 9, InputType = "imm" in
+    def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u9Ext:$c),
+                   !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
+                   [(set (i1 PredRegs:$dst),
+                         (OpNode (i32 IntRegs:$b), u9ExtPred:$c))]>;
+  }
+}
+
+multiclass CMP32_ri_u8<string OpcStr, PatFrag OpNode> {
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 8 in
+  def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, u8Ext:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
+                 [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b),
+                                                   u8ExtPred:$c))]>;
+}
+
+multiclass CMP32_ri_s8<string OpcStr, PatFrag OpNode> {
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8 in
+  def ri : ALU32_ri<(outs PredRegs:$dst), (ins IntRegs:$b, s8Ext:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, #$c)")),
+                 [(set (i1 PredRegs:$dst), (OpNode (i32 IntRegs:$b),
+                                                   s8ExtPred:$c))]>;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// ALU32/ALU (Instructions with register-register form)
+//===----------------------------------------------------------------------===//
+multiclass ALU32_Pbase<string mnemonic, bit isNot,
+                       bit isPredNew> {
+
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : ALU32_rr<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs: $src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
+            ") $dst = ")#mnemonic#"($src2, $src3)",
+            []>;
+}
+
+multiclass ALU32_Pred<string mnemonic, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ALU32_Pbase<mnemonic, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ALU32_Pbase<mnemonic, PredNot, 1>;
+  }
+}
+
+let InputType = "reg" in
+multiclass ALU32_base<string mnemonic, string CextOp, SDNode OpNode> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_rr in {
+    let isPredicable = 1 in
+    def NAME : ALU32_rr<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = "#mnemonic#"($src1, $src2)",
+            [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$src1),
+                                              (i32 IntRegs:$src2)))]>;
+
+    let neverHasSideEffects = 1, isPredicated = 1 in {
+      defm Pt : ALU32_Pred<mnemonic, 0>;
+      defm NotPt : ALU32_Pred<mnemonic, 1>;
+    }
+  }
+}
+
+let isCommutable = 1 in {
+  defm ADD_rr : ALU32_base<"add", "ADD", add>, ImmRegRel, PredNewRel;
+  defm AND_rr : ALU32_base<"and", "AND", and>, ImmRegRel, PredNewRel;
+  defm XOR_rr : ALU32_base<"xor", "XOR", xor>, ImmRegRel, PredNewRel;
+  defm OR_rr  : ALU32_base<"or", "OR", or>, ImmRegRel, PredNewRel;
+}
+
+defm SUB_rr : ALU32_base<"sub", "SUB", sub>, ImmRegRel, PredNewRel;
+
+//===----------------------------------------------------------------------===//
+// ALU32/ALU (ADD with register-immediate form)
+//===----------------------------------------------------------------------===//
+multiclass ALU32ri_Pbase<string mnemonic, bit isNot, bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : ALU32_ri<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, s8Ext: $src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
+            ") $dst = ")#mnemonic#"($src2, #$src3)",
+            []>;
+}
+
+multiclass ALU32ri_Pred<string mnemonic, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ALU32ri_Pbase<mnemonic, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ALU32ri_Pbase<mnemonic, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, InputType = "imm" in
+multiclass ALU32ri_base<string mnemonic, string CextOp, SDNode OpNode> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_ri in {
+    let opExtendable = 2, isExtentSigned = 1, opExtentBits = 16,
+    isPredicable = 1 in
+    def NAME : ALU32_ri<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s16Ext:$src2),
+            "$dst = "#mnemonic#"($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$src1),
+                                              (s16ExtPred:$src2)))]>;
+
+    let opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
+    neverHasSideEffects = 1, isPredicated = 1 in {
+      defm Pt : ALU32ri_Pred<mnemonic, 0>;
+      defm NotPt : ALU32ri_Pred<mnemonic, 1>;
+    }
+  }
+}
+
+defm ADD_ri : ALU32ri_base<"add", "ADD", add>, ImmRegRel, PredNewRel;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
+CextOpcode = "OR", InputType = "imm" in
+def OR_ri : ALU32_ri<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s10Ext:$src2),
+            "$dst = or($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
+                                          s10ExtPred:$src2))]>, ImmRegRel;
+
+def NOT_rr : ALU32_rr<(outs IntRegs:$dst),
+            (ins IntRegs:$src1),
+            "$dst = not($src1)",
+            [(set (i32 IntRegs:$dst), (not (i32 IntRegs:$src1)))]>;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
+InputType = "imm", CextOpcode = "AND" in
+def AND_ri : ALU32_ri<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s10Ext:$src2),
+            "$dst = and($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
+                                           s10ExtPred:$src2))]>, ImmRegRel;
+// Negate.
+def NEG : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
+          "$dst = neg($src1)",
+          [(set (i32 IntRegs:$dst), (ineg (i32 IntRegs:$src1)))]>;
+// Nop.
+let neverHasSideEffects = 1 in
+def NOP : ALU32_rr<(outs), (ins),
+          "nop",
+          []>;
+
+// Rd32=sub(#s10,Rs32)
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 10,
+CextOpcode = "SUB", InputType = "imm" in
+def SUB_ri : ALU32_ri<(outs IntRegs:$dst),
+            (ins s10Ext:$src1, IntRegs:$src2),
+            "$dst = sub(#$src1, $src2)",
+            [(set IntRegs:$dst, (sub s10ExtPred:$src1, IntRegs:$src2))]>,
+            ImmRegRel;
+
+
+multiclass TFR_Pred<bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    def _c#NAME : ALU32_rr<(outs IntRegs:$dst),
+                           (ins PredRegs:$src1, IntRegs:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#") $dst = $src2",
+            []>;
+    // Predicate new
+    let PNewValue = "new" in
+    def _cdn#NAME : ALU32_rr<(outs IntRegs:$dst),
+                             (ins PredRegs:$src1, IntRegs:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#".new) $dst = $src2",
+            []>;
+  }
+}
+
+let InputType = "reg", neverHasSideEffects = 1 in
+multiclass TFR_base<string CextOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    let isPredicable = 1 in
+    def NAME : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
+            "$dst = $src1",
+            []>;
+
+    let  isPredicated = 1 in {
+      defm Pt : TFR_Pred<0>;
+      defm NotPt : TFR_Pred<1>;
+    }
+  }
+}
+
+class T_TFR64_Pred<bit PredNot, bit isPredNew>
+            : ALU32_rr<(outs DoubleRegs:$dst),
+                       (ins PredRegs:$src1, DoubleRegs:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#
+            !if(isPredNew, ".new) ", ") ")#"$dst = $src2", []>
+{
+    bits<5> dst;
+    bits<2> src1;
+    bits<5> src2;
+
+    let IClass = 0b1111;
+    let Inst{27-24} = 0b1101;
+    let Inst{13} = isPredNew;
+    let Inst{7} = PredNot;
+    let Inst{4-0} = dst;
+    let Inst{6-5} = src1;
+    let Inst{20-17} = src2{4-1};
+    let Inst{16} = 0b1;
+    let Inst{12-9} = src2{4-1};
+    let Inst{8} = 0b0;
+}
+
+multiclass TFR64_Pred<bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    def _c#NAME : T_TFR64_Pred<PredNot, 0>;
+
+    let PNewValue = "new" in
+    def _cdn#NAME : T_TFR64_Pred<PredNot, 1>; // Predicate new
+  }
+}
+
+let neverHasSideEffects = 1 in
+multiclass TFR64_base<string BaseName> {
+  let BaseOpcode = BaseName in {
+    let isPredicable = 1 in
+    def NAME : ALU32Inst <(outs DoubleRegs:$dst),
+                          (ins DoubleRegs:$src1),
+                          "$dst = $src1" > {
+        bits<5> dst;
+        bits<5> src1;
+
+        let IClass = 0b1111;
+        let Inst{27-23} = 0b01010;
+        let Inst{4-0} = dst;
+        let Inst{20-17} = src1{4-1};
+        let Inst{16} = 0b1;
+        let Inst{12-9} = src1{4-1};
+        let Inst{8} = 0b0;
+    }
+
+    let  isPredicated = 1 in {
+      defm Pt : TFR64_Pred<0>;
+      defm NotPt : TFR64_Pred<1>;
+    }
+  }
+}
+
+multiclass TFRI_Pred<bit PredNot> {
+  let isMoveImm = 1, PredSense = !if(PredNot, "false", "true") in {
+    def _c#NAME : ALU32_ri<(outs IntRegs:$dst),
+                           (ins PredRegs:$src1, s12Ext:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#") $dst = #$src2",
+            []>;
+
+    // Predicate new
+    let PNewValue = "new" in
+    def _cdn#NAME : ALU32_rr<(outs IntRegs:$dst),
+                             (ins PredRegs:$src1, s12Ext:$src2),
+            !if(PredNot, "if (!$src1", "if ($src1")#".new) $dst = #$src2",
+            []>;
+  }
+}
+
+let InputType = "imm", isExtendable = 1, isExtentSigned = 1 in
+multiclass TFRI_base<string CextOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#I in {
+    let isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16,
+    isMoveImm = 1, isPredicable = 1, isReMaterializable = 1 in
+    def NAME : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1),
+            "$dst = #$src1",
+            [(set (i32 IntRegs:$dst), s16ExtPred:$src1)]>;
+
+    let opExtendable = 2,  opExtentBits = 12, neverHasSideEffects = 1,
+    isPredicated = 1 in {
+      defm Pt    : TFRI_Pred<0>;
+      defm NotPt : TFRI_Pred<1>;
+    }
+  }
+}
+
+defm TFRI : TFRI_base<"TFR">, ImmRegRel, PredNewRel;
+defm TFR : TFR_base<"TFR">, ImmRegRel, PredNewRel;
+defm TFR64 : TFR64_base<"TFR64">, PredNewRel;
+
+// Transfer control register.
+let neverHasSideEffects = 1 in
+def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1),
+           "$dst = $src1",
+           []>;
+//===----------------------------------------------------------------------===//
+// ALU32/ALU -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+
+// Combine.
+
+def SDTHexagonI64I32I32 : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+
+def HexagonWrapperCombineII :
+  SDNode<"HexagonISD::WrapperCombineII", SDTHexagonI64I32I32>;
+def HexagonWrapperCombineRR :
+  SDNode<"HexagonISD::WrapperCombineRR", SDTHexagonI64I32I32>;
+
+// Combines the two integer registers SRC1 and SRC2 into a double register.
+let isPredicable = 1 in
+def COMBINE_rr : ALU32_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1,
+                                                       IntRegs:$src2),
+  "$dst = combine($src1, $src2)",
+  [(set (i64 DoubleRegs:$dst),
+        (i64 (HexagonWrapperCombineRR (i32 IntRegs:$src1),
+                                      (i32 IntRegs:$src2))))]>;
+
+// Rd=combine(Rt.[HL], Rs.[HL])
+class COMBINE_halves<string A, string B>: ALU32_rr<(outs IntRegs:$dst),
+                                                   (ins IntRegs:$src1,
+                                                        IntRegs:$src2),
+  "$dst = combine($src1."# A #", $src2."# B #")", []>;
+
+let isPredicable = 1 in {
+  def COMBINE_hh : COMBINE_halves<"H", "H">;
+  def COMBINE_hl : COMBINE_halves<"H", "L">;
+  def COMBINE_lh : COMBINE_halves<"L", "H">;
+  def COMBINE_ll : COMBINE_halves<"L", "L">;
+}
+
+def : Pat<(i32 (trunc (i64 (srl (i64 DoubleRegs:$a), (i32 16))))),
+  (COMBINE_lh (EXTRACT_SUBREG (i64 DoubleRegs:$a), subreg_hireg),
+              (EXTRACT_SUBREG (i64 DoubleRegs:$a), subreg_loreg))>;
+
+// Combines the two immediates SRC1 and SRC2 into a double register.
+class COMBINE_imm<Operand imm1, Operand imm2, PatLeaf pat1, PatLeaf pat2> :
+  ALU32_ii<(outs DoubleRegs:$dst), (ins imm1:$src1, imm2:$src2),
+  "$dst = combine(#$src1, #$src2)",
+  [(set (i64 DoubleRegs:$dst),
+        (i64 (HexagonWrapperCombineII (i32 pat1:$src1), (i32 pat2:$src2))))]>;
+
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 8 in
+def COMBINE_Ii : COMBINE_imm<s8Ext, s8Imm, s8ExtPred, s8ImmPred>;
+
+// Mux.
+def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
+                                                   DoubleRegs:$src2,
+                                                   DoubleRegs:$src3),
+            "$dst = vmux($src1, $src2, $src3)",
+            []>;
+
+let CextOpcode = "MUX", InputType = "reg" in
+def MUX_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
+                                            IntRegs:$src2, IntRegs:$src3),
+             "$dst = mux($src1, $src2, $src3)",
+             [(set (i32 IntRegs:$dst),
+                   (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
+                                (i32 IntRegs:$src3))))]>, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8,
+CextOpcode = "MUX", InputType = "imm" in
+def MUX_ir : ALU32_ir<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Ext:$src2,
+                                                IntRegs:$src3),
+             "$dst = mux($src1, #$src2, $src3)",
+             [(set (i32 IntRegs:$dst),
+                   (i32 (select (i1 PredRegs:$src1), s8ExtPred:$src2,
+                                (i32 IntRegs:$src3))))]>, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
+CextOpcode = "MUX", InputType = "imm" in
+def MUX_ri : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2,
+                                                s8Ext:$src3),
+             "$dst = mux($src1, $src2, #$src3)",
+             [(set (i32 IntRegs:$dst),
+                   (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
+                                 s8ExtPred:$src3)))]>, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8 in
+def MUX_ii : ALU32_ii<(outs IntRegs:$dst), (ins PredRegs:$src1, s8Ext:$src2,
+                                                s8Imm:$src3),
+             "$dst = mux($src1, #$src2, #$src3)",
+             [(set (i32 IntRegs:$dst), (i32 (select (i1 PredRegs:$src1),
+                                                    s8ExtPred:$src2,
+                                                    s8ImmPred:$src3)))]>;
+
+// ALU32 - aslh, asrh, sxtb, sxth, zxtb, zxth
+multiclass ALU32_2op_Pbase<string mnemonic, bit isNot, bit isPredNew> {
+  let isPredicatedNew = isPredNew in
+  def NAME : ALU32Inst<(outs IntRegs:$dst),
+                       (ins PredRegs:$src1, IntRegs:$src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
+            ") $dst = ")#mnemonic#"($src2)">,
+            Requires<[HasV4T]>;
+}
+
+multiclass ALU32_2op_Pred<string mnemonic, bit PredNot> {
+  let isPredicatedFalse = PredNot in {
+    defm _c#NAME : ALU32_2op_Pbase<mnemonic, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ALU32_2op_Pbase<mnemonic, PredNot, 1>;
+  }
+}
+
+multiclass ALU32_2op_base<string mnemonic> {
+  let BaseOpcode = mnemonic in {
+    let isPredicable = 1, neverHasSideEffects = 1 in
+    def NAME : ALU32Inst<(outs IntRegs:$dst),
+                         (ins IntRegs:$src1),
+            "$dst = "#mnemonic#"($src1)">;
+
+    let Predicates = [HasV4T], validSubTargets = HasV4SubT, isPredicated = 1,
+    neverHasSideEffects = 1 in {
+      defm Pt_V4    : ALU32_2op_Pred<mnemonic, 0>;
+      defm NotPt_V4 : ALU32_2op_Pred<mnemonic, 1>;
+    }
+  }
+}
+
+defm ASLH : ALU32_2op_base<"aslh">, PredNewRel;
+defm ASRH : ALU32_2op_base<"asrh">, PredNewRel;
+defm SXTB : ALU32_2op_base<"sxtb">, PredNewRel;
+defm SXTH : ALU32_2op_base<"sxth">,  PredNewRel;
+defm ZXTB : ALU32_2op_base<"zxtb">, PredNewRel;
+defm ZXTH : ALU32_2op_base<"zxth">,  PredNewRel;
+
+def : Pat <(shl (i32 IntRegs:$src1), (i32 16)),
+           (ASLH IntRegs:$src1)>;
+
+def : Pat <(sra (i32 IntRegs:$src1), (i32 16)),
+           (ASRH IntRegs:$src1)>;
+
+def : Pat <(sext_inreg (i32 IntRegs:$src1), i8),
+           (SXTB IntRegs:$src1)>;
+
+def : Pat <(sext_inreg (i32 IntRegs:$src1), i16),
+           (SXTH IntRegs:$src1)>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PRED +
+//===----------------------------------------------------------------------===//
+
+// Conditional combine.
+let neverHasSideEffects = 1, isPredicated = 1 in {
+def COMBINE_rr_cPt : ALU32_rr<(outs DoubleRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "if ($src1) $dst = combine($src2, $src3)",
+            []>;
+
+let isPredicatedFalse = 1 in
+def COMBINE_rr_cNotPt : ALU32_rr<(outs DoubleRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "if (!$src1) $dst = combine($src2, $src3)",
+            []>;
+
+let isPredicatedNew = 1 in
+def COMBINE_rr_cdnPt : ALU32_rr<(outs DoubleRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "if ($src1.new) $dst = combine($src2, $src3)",
+            []>;
+
+let isPredicatedNew = 1, isPredicatedFalse = 1 in
+def COMBINE_rr_cdnNotPt : ALU32_rr<(outs DoubleRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "if (!$src1.new) $dst = combine($src2, $src3)",
+            []>;
+}
+
+// Compare.
+defm CMPGTU : CMP32_rr_ri_u9<"cmp.gtu", "CMPGTU", setugt>, ImmRegRel;
+defm CMPGT : CMP32_rr_ri_s10<"cmp.gt", "CMPGT", setgt>, ImmRegRel;
+defm CMPLT : CMP32_rr<"cmp.lt", setlt>;
+defm CMPLTU : CMP32_rr<"cmp.ltu", setult>;
+defm CMPEQ : CMP32_rr_ri_s10<"cmp.eq", "CMPEQ", seteq>, ImmRegRel;
+defm CMPGE : CMP32_ri_s8<"cmp.ge", setge>;
+defm CMPGEU : CMP32_ri_u8<"cmp.geu", setuge>;
+
+def CTLZ_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+    "$dst = cl0($src1)",
+    [(set (i32 IntRegs:$dst), (ctlz (i32 IntRegs:$src1)))]>;
+
+def CTTZ_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+    "$dst = ct0($src1)",
+    [(set (i32 IntRegs:$dst), (cttz (i32 IntRegs:$src1)))]>;
+
+def CTLZ64_rr : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+    "$dst = cl0($src1)",
+    [(set (i32 IntRegs:$dst), (i32 (trunc (ctlz (i64 DoubleRegs:$src1)))))]>;
+
+def CTTZ64_rr : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+    "$dst = ct0($src1)",
+    [(set (i32 IntRegs:$dst), (i32 (trunc (cttz (i64 DoubleRegs:$src1)))))]>;
+
+def TSTBIT_rr : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+    "$dst = tstbit($src1, $src2)",
+    [(set (i1 PredRegs:$dst),
+          (setne (and (shl 1, (i32 IntRegs:$src2)), (i32 IntRegs:$src1)), 0))]>;
+
+def TSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+    "$dst = tstbit($src1, $src2)",
+    [(set (i1 PredRegs:$dst),
+          (setne (and (shl 1, (u5ImmPred:$src2)), (i32 IntRegs:$src1)), 0))]>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/PRED -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU +
+//===----------------------------------------------------------------------===//
+// Add.
+def ADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = add($src1, $src2)",
+               [(set (i64 DoubleRegs:$dst), (add (i64 DoubleRegs:$src1),
+                                                 (i64 DoubleRegs:$src2)))]>;
+
+// Add halfword.
+
+// Compare.
+defm CMPEHexagon4 : CMP64_rr<"cmp.eq", seteq>;
+defm CMPGT64 : CMP64_rr<"cmp.gt", setgt>;
+defm CMPGTU64 : CMP64_rr<"cmp.gtu", setugt>;
+
+// Logical operations.
+def AND_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = and($src1, $src2)",
+               [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1),
+                                                 (i64 DoubleRegs:$src2)))]>;
+
+def OR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = or($src1, $src2)",
+              [(set (i64 DoubleRegs:$dst), (or (i64 DoubleRegs:$src1),
+                                               (i64 DoubleRegs:$src2)))]>;
+
+def XOR_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = xor($src1, $src2)",
+               [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1),
+                                                 (i64 DoubleRegs:$src2)))]>;
+
+// Maximum.
+def MAXw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = max($src2, $src1)",
+              [(set (i32 IntRegs:$dst),
+                    (i32 (select (i1 (setlt (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src1))),
+                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MAXUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = maxu($src2, $src1)",
+              [(set (i32 IntRegs:$dst),
+                    (i32 (select (i1 (setult (i32 IntRegs:$src2),
+                                             (i32 IntRegs:$src1))),
+                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MAXd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = max($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setlt (i64 DoubleRegs:$src2),
+                                            (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>;
+
+def MAXUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+              "$dst = maxu($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setult (i64 DoubleRegs:$src2),
+                                             (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>;
+
+// Minimum.
+def MINw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = min($src2, $src1)",
+              [(set (i32 IntRegs:$dst),
+                    (i32 (select (i1 (setgt (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src1))),
+                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MINUw_rr : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = minu($src2, $src1)",
+              [(set (i32 IntRegs:$dst),
+                    (i32 (select (i1 (setugt (i32 IntRegs:$src2),
+                                             (i32 IntRegs:$src1))),
+                                 (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>;
+
+def MINd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = min($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setgt (i64 DoubleRegs:$src2),
+                                            (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>;
+
+def MINUd_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+              "$dst = minu($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setugt (i64 DoubleRegs:$src2),
+                                             (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>;
+
+// Subtract.
+def SUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = sub($src1, $src2)",
+               [(set (i64 DoubleRegs:$dst), (sub (i64 DoubleRegs:$src1),
+                                                 (i64 DoubleRegs:$src2)))]>;
+
+// Subtract halfword.
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/BIT +
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+// ALU64/BIT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/PERM +
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+// ALU64/PERM -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// CR +
+//===----------------------------------------------------------------------===//
+// Logical reductions on predicates.
+
+// Looping instructions.
+
+// Pipelined looping instructions.
+
+// Logical operations on predicates.
+def AND_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
+             "$dst = and($src1, $src2)",
+             [(set (i1 PredRegs:$dst), (and (i1 PredRegs:$src1),
+                                            (i1 PredRegs:$src2)))]>;
+
+let neverHasSideEffects = 1 in
+def AND_pnotp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1,
+                                                 PredRegs:$src2),
+                "$dst = and($src1, !$src2)",
+                []>;
+
+def ANY_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
+             "$dst = any8($src1)",
+             []>;
+
+def ALL_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
+             "$dst = all8($src1)",
+             []>;
+
+def VITPACK_pp : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1,
+                                                 PredRegs:$src2),
+             "$dst = vitpack($src1, $src2)",
+             []>;
+
+def VALIGN_rrp : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2,
+                                                    PredRegs:$src3),
+             "$dst = valignb($src1, $src2, $src3)",
+             []>;
+
+def VSPLICE_rrp : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2,
+                                                     PredRegs:$src3),
+             "$dst = vspliceb($src1, $src2, $src3)",
+             []>;
+
+def MASK_p : SInst<(outs DoubleRegs:$dst), (ins PredRegs:$src1),
+             "$dst = mask($src1)",
+             []>;
+
+def NOT_p : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1),
+             "$dst = not($src1)",
+             [(set (i1 PredRegs:$dst), (not (i1 PredRegs:$src1)))]>;
+
+def OR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
+            "$dst = or($src1, $src2)",
+            [(set (i1 PredRegs:$dst), (or (i1 PredRegs:$src1),
+                                          (i1 PredRegs:$src2)))]>;
+
+def XOR_pp : SInst<(outs PredRegs:$dst), (ins PredRegs:$src1, PredRegs:$src2),
+             "$dst = xor($src1, $src2)",
+             [(set (i1 PredRegs:$dst), (xor (i1 PredRegs:$src1),
+                                            (i1 PredRegs:$src2)))]>;
+
+
+// User control register transfer.
+//===----------------------------------------------------------------------===//
+// CR -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// J +
+//===----------------------------------------------------------------------===//
+// Jump to address.
+let isBranch = 1, isTerminator=1, isBarrier = 1, isPredicable = 1 in {
+  def JMP : JInst< (outs),
+            (ins brtarget:$offset),
+            "jump $offset",
+            [(br bb:$offset)]>;
+}
+
+// if (p0) jump
+let isBranch = 1, isTerminator=1, Defs = [PC],
+    isPredicated = 1 in {
+  def JMP_c : JInst< (outs),
+                 (ins PredRegs:$src, brtarget:$offset),
+                 "if ($src) jump $offset",
+                 [(brcond (i1 PredRegs:$src), bb:$offset)]>;
+}
+
+// if (!p0) jump
+let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC],
+    isPredicated = 1 in {
+  def JMP_cNot : JInst< (outs),
+                    (ins PredRegs:$src, brtarget:$offset),
+                    "if (!$src) jump $offset",
+                    []>;
+}
+
+let isTerminator = 1, isBranch = 1, neverHasSideEffects = 1, Defs = [PC],
+    isPredicated = 1 in {
+  def BRCOND : JInst < (outs), (ins PredRegs:$pred, brtarget:$dst),
+               "if ($pred) jump $dst",
+               []>;
+}
+
+// Jump to address conditioned on new predicate.
+// if (p0) jump:t
+let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC],
+    isPredicated = 1 in {
+  def JMP_cdnPt : JInst< (outs),
+                   (ins PredRegs:$src, brtarget:$offset),
+                   "if ($src.new) jump:t $offset",
+                   []>;
+}
+
+// if (!p0) jump:t
+let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC],
+    isPredicated = 1 in {
+  def JMP_cdnNotPt : JInst< (outs),
+                      (ins PredRegs:$src, brtarget:$offset),
+                      "if (!$src.new) jump:t $offset",
+                      []>;
+}
+
+// Not taken.
+let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC],
+    isPredicated = 1 in {
+  def JMP_cdnPnt : JInst< (outs),
+                    (ins PredRegs:$src, brtarget:$offset),
+                    "if ($src.new) jump:nt $offset",
+                    []>;
+}
+
+// Not taken.
+let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC],
+    isPredicated = 1 in {
+  def JMP_cdnNotPnt : JInst< (outs),
+                       (ins PredRegs:$src, brtarget:$offset),
+                       "if (!$src.new) jump:nt $offset",
+                       []>;
+}
+//===----------------------------------------------------------------------===//
+// J -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// JR +
+//===----------------------------------------------------------------------===//
+def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone,
+                               [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// Jump to address from register.
+let isPredicable =1, isReturn = 1, isTerminator = 1, isBarrier = 1,
+  Defs = [PC], Uses = [R31] in {
+  def JMPR: JRInst<(outs), (ins),
+                   "jumpr r31",
+                   [(retflag)]>;
+}
+
+// Jump to address from register.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicated = 1,
+  Defs = [PC], Uses = [R31] in {
+  def JMPR_cPt: JRInst<(outs), (ins PredRegs:$src1),
+                       "if ($src1) jumpr r31",
+                       []>;
+}
+
+// Jump to address from register.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicated = 1,
+  Defs = [PC], Uses = [R31] in {
+  def JMPR_cNotPt: JRInst<(outs), (ins PredRegs:$src1),
+                          "if (!$src1) jumpr r31",
+                          []>;
+}
+
+//===----------------------------------------------------------------------===//
+// JR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LD +
+//===----------------------------------------------------------------------===//
+///
+// Load -- MEMri operand
+multiclass LD_MEMri_Pbase<string mnemonic, RegisterClass RC,
+                          bit isNot, bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : LDInst2<(outs RC:$dst),
+                       (ins PredRegs:$src1, MEMri:$addr),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"($addr)",
+            []>;
+}
+
+multiclass LD_MEMri_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : LD_MEMri_Pbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : LD_MEMri_Pbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, neverHasSideEffects = 1 in
+multiclass LD_MEMri<string mnemonic, string CextOp, RegisterClass RC,
+                    bits<5> ImmBits, bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    let opExtendable = 2, isExtentSigned = 1, opExtentBits = ImmBits,
+        isPredicable = 1 in
+      def NAME : LDInst2<(outs RC:$dst), (ins MEMri:$addr),
+                   "$dst = "#mnemonic#"($addr)",
+                   []>;
+
+    let opExtendable = 3, isExtentSigned = 0, opExtentBits = PredImmBits,
+        isPredicated = 1 in {
+      defm Pt : LD_MEMri_Pred<mnemonic, RC, 0 >;
+      defm NotPt : LD_MEMri_Pred<mnemonic, RC, 1 >;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, isMEMri = "true" in {
+  defm LDrib: LD_MEMri < "memb", "LDrib", IntRegs, 11, 6>, AddrModeRel;
+  defm LDriub: LD_MEMri < "memub" , "LDriub", IntRegs, 11, 6>, AddrModeRel;
+  defm LDrih: LD_MEMri < "memh", "LDrih", IntRegs, 12, 7>, AddrModeRel;
+  defm LDriuh: LD_MEMri < "memuh", "LDriuh", IntRegs, 12, 7>, AddrModeRel;
+  defm LDriw: LD_MEMri < "memw", "LDriw", IntRegs, 13, 8>, AddrModeRel;
+  defm LDrid: LD_MEMri < "memd", "LDrid", DoubleRegs, 14, 9>, AddrModeRel;
+}
+
+def : Pat < (i32 (sextloadi8 ADDRriS11_0:$addr)),
+            (LDrib ADDRriS11_0:$addr) >;
+
+def : Pat < (i32 (zextloadi8 ADDRriS11_0:$addr)),
+            (LDriub ADDRriS11_0:$addr) >;
+
+def : Pat < (i32 (sextloadi16 ADDRriS11_1:$addr)),
+            (LDrih ADDRriS11_1:$addr) >;
+
+def : Pat < (i32 (zextloadi16 ADDRriS11_1:$addr)),
+            (LDriuh ADDRriS11_1:$addr) >;
+
+def : Pat < (i32 (load ADDRriS11_2:$addr)),
+            (LDriw ADDRriS11_2:$addr) >;
+
+def : Pat < (i64 (load ADDRriS11_3:$addr)),
+            (LDrid ADDRriS11_3:$addr) >;
+
+
+// Load - Base with Immediate offset addressing mode
+multiclass LD_Idxd_Pbase<string mnemonic, RegisterClass RC, Operand predImmOp,
+                        bit isNot, bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : LDInst2<(outs RC:$dst),
+                     (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"($src2+#$src3)",
+            []>;
+}
+
+multiclass LD_Idxd_Pred<string mnemonic, RegisterClass RC, Operand predImmOp,
+                        bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : LD_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : LD_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, neverHasSideEffects = 1 in
+multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+                   Operand ImmOp, Operand predImmOp, bits<5> ImmBits,
+                   bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    let opExtendable = 2, isExtentSigned = 1, opExtentBits = ImmBits,
+        isPredicable = 1, AddedComplexity = 20 in
+      def NAME : LDInst2<(outs RC:$dst), (ins IntRegs:$src1, ImmOp:$offset),
+                   "$dst = "#mnemonic#"($src1+#$offset)",
+                   []>;
+
+    let opExtendable = 3, isExtentSigned = 0, opExtentBits = PredImmBits,
+        isPredicated = 1 in {
+      defm Pt : LD_Idxd_Pred<mnemonic, RC, predImmOp, 0 >;
+      defm NotPt : LD_Idxd_Pred<mnemonic, RC, predImmOp, 1 >;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset in {
+  defm LDrib_indexed: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext,
+                               11, 6>, AddrModeRel;
+  defm LDriub_indexed: LD_Idxd <"memub" , "LDriub", IntRegs, s11_0Ext, u6_0Ext,
+                                11, 6>, AddrModeRel;
+  defm LDrih_indexed: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext,
+                               12, 7>, AddrModeRel;
+  defm LDriuh_indexed: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext,
+                                12, 7>, AddrModeRel;
+  defm LDriw_indexed: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext,
+                               13, 8>, AddrModeRel;
+  defm LDrid_indexed: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext,
+                               14, 9>, AddrModeRel;
+}
+
+let AddedComplexity = 20 in {
+def : Pat < (i32 (sextloadi8 (add IntRegs:$src1, s11_0ExtPred:$offset))),
+            (LDrib_indexed IntRegs:$src1, s11_0ExtPred:$offset) >;
+
+def : Pat < (i32 (zextloadi8 (add IntRegs:$src1, s11_0ExtPred:$offset))),
+            (LDriub_indexed IntRegs:$src1, s11_0ExtPred:$offset) >;
+
+def : Pat < (i32 (sextloadi16 (add IntRegs:$src1, s11_1ExtPred:$offset))),
+            (LDrih_indexed IntRegs:$src1, s11_1ExtPred:$offset) >;
+
+def : Pat < (i32 (zextloadi16 (add IntRegs:$src1, s11_1ExtPred:$offset))),
+            (LDriuh_indexed IntRegs:$src1, s11_1ExtPred:$offset) >;
+
+def : Pat < (i32 (load (add IntRegs:$src1, s11_2ExtPred:$offset))),
+            (LDriw_indexed IntRegs:$src1, s11_2ExtPred:$offset) >;
+
+def : Pat < (i64 (load (add IntRegs:$src1, s11_3ExtPred:$offset))),
+            (LDrid_indexed IntRegs:$src1, s11_3ExtPred:$offset) >;
+}
+
+//===----------------------------------------------------------------------===//
+// Post increment load
+// Make sure that in post increment load, the first operand is always the post
+// increment operand.
+//===----------------------------------------------------------------------===//
+
+multiclass LD_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
+                            bit isNot, bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : LDInst2PI<(outs RC:$dst, IntRegs:$dst2),
+                       (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"($src2++#$offset)",
+            [],
+            "$src2 = $dst2">;
+}
+
+multiclass LD_PostInc_Pred<string mnemonic, RegisterClass RC,
+                           Operand ImmOp, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : LD_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
+    // Predicate new
+    let Predicates = [HasV4T], validSubTargets = HasV4SubT in
+    defm _cdn#NAME#_V4 : LD_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 1>;
+  }
+}
+
+multiclass LD_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
+                      Operand ImmOp> {
+
+  let BaseOpcode = "POST_"#BaseOp in {
+    let isPredicable = 1 in
+    def NAME : LDInst2PI<(outs RC:$dst, IntRegs:$dst2),
+                         (ins IntRegs:$src1, ImmOp:$offset),
+                 "$dst = "#mnemonic#"($src1++#$offset)",
+                 [],
+                 "$src1 = $dst2">;
+
+    let isPredicated = 1 in {
+      defm Pt : LD_PostInc_Pred<mnemonic, RC, ImmOp, 0 >;
+      defm NotPt : LD_PostInc_Pred<mnemonic, RC, ImmOp, 1 >;
+    }
+  }
+}
+
+let hasCtrlDep = 1, neverHasSideEffects = 1 in {
+  defm POST_LDrib : LD_PostInc<"memb", "LDrib", IntRegs, s4_0Imm>,
+                    PredNewRel;
+  defm POST_LDriub : LD_PostInc<"memub", "LDriub", IntRegs, s4_0Imm>,
+                    PredNewRel;
+  defm POST_LDrih : LD_PostInc<"memh", "LDrih", IntRegs, s4_1Imm>,
+                    PredNewRel;
+  defm POST_LDriuh : LD_PostInc<"memuh", "LDriuh", IntRegs, s4_1Imm>,
+                    PredNewRel;
+  defm POST_LDriw : LD_PostInc<"memw", "LDriw", IntRegs, s4_2Imm>,
+                    PredNewRel;
+  defm POST_LDrid : LD_PostInc<"memd", "LDrid", DoubleRegs, s4_3Imm>,
+                    PredNewRel;
+}
+
+def : Pat< (i32 (extloadi1 ADDRriS11_0:$addr)),
+           (i32 (LDrib ADDRriS11_0:$addr)) >;
+
+// Load byte any-extend.
+def : Pat < (i32 (extloadi8 ADDRriS11_0:$addr)),
+            (i32 (LDrib ADDRriS11_0:$addr)) >;
+
+// Indexed load byte any-extend.
+let AddedComplexity = 20 in
+def : Pat < (i32 (extloadi8 (add IntRegs:$src1, s11_0ImmPred:$offset))),
+            (i32 (LDrib_indexed IntRegs:$src1, s11_0ImmPred:$offset)) >;
+
+def : Pat < (i32 (extloadi16 ADDRriS11_1:$addr)),
+            (i32 (LDrih ADDRriS11_1:$addr))>;
+
+let AddedComplexity = 20 in
+def : Pat < (i32 (extloadi16 (add IntRegs:$src1, s11_1ImmPred:$offset))),
+            (i32 (LDrih_indexed IntRegs:$src1, s11_1ImmPred:$offset)) >;
+
+let AddedComplexity = 10 in
+def : Pat < (i32 (zextloadi1 ADDRriS11_0:$addr)),
+            (i32 (LDriub ADDRriS11_0:$addr))>;
+
+let AddedComplexity = 20 in
+def : Pat < (i32 (zextloadi1 (add IntRegs:$src1, s11_0ImmPred:$offset))),
+            (i32 (LDriub_indexed IntRegs:$src1, s11_0ImmPred:$offset))>;
+
+// Load predicate.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+isPseudo = 1, Defs = [R10,R11,D5], neverHasSideEffects = 1 in
+def LDriw_pred : LDInst2<(outs PredRegs:$dst),
+            (ins MEMri:$addr),
+            "Error; should not emit",
+            []>;
+
+// Deallocate stack frame.
+let Defs = [R29, R30, R31], Uses = [R29], neverHasSideEffects = 1 in {
+  def DEALLOCFRAME : LDInst2<(outs), (ins),
+                     "deallocframe",
+                     []>;
+}
+
+// Load and unpack bytes to halfwords.
+//===----------------------------------------------------------------------===//
+// LD -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/ALU +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/COMPLEX +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/COMPLEX -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYH +
+//===----------------------------------------------------------------------===//
+// Multiply and use lower result.
+// Rd=+mpyi(Rs,#u8)
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 8 in
+def MPYI_riu : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Ext:$src2),
+              "$dst =+ mpyi($src1, #$src2)",
+              [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+                                             u8ExtPred:$src2))]>;
+
+// Rd=-mpyi(Rs,#u8)
+def MPYI_rin : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2),
+              "$dst =- mpyi($src1, #$src2)",
+              [(set (i32 IntRegs:$dst), (ineg (mul (i32 IntRegs:$src1),
+                                                   u8ImmPred:$src2)))]>;
+
+// Rd=mpyi(Rs,#m9)
+// s9 is NOT the same as m9 - but it works.. so far.
+// Assembler maps to either Rd=+mpyi(Rs,#u8 or Rd=-mpyi(Rs,#u8)
+// depending on the value of m9. See Arch Spec.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9,
+CextOpcode = "MPYI", InputType = "imm" in
+def MPYI_ri : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Ext:$src2),
+              "$dst = mpyi($src1, #$src2)",
+              [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+                                             s9ExtPred:$src2))]>, ImmRegRel;
+
+// Rd=mpyi(Rs,Rt)
+let CextOpcode = "MPYI", InputType = "reg" in
+def MPYI : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+           "$dst = mpyi($src1, $src2)",
+           [(set (i32 IntRegs:$dst), (mul (i32 IntRegs:$src1),
+                                          (i32 IntRegs:$src2)))]>, ImmRegRel;
+
+// Rx+=mpyi(Rs,#u8)
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 8,
+CextOpcode = "MPYI_acc", InputType = "imm" in
+def MPYI_acc_ri : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u8Ext:$src3),
+            "$dst += mpyi($src2, #$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (add (mul (i32 IntRegs:$src2), u8ExtPred:$src3),
+                       (i32 IntRegs:$src1)))],
+            "$src1 = $dst">, ImmRegRel;
+
+// Rx+=mpyi(Rs,Rt)
+let CextOpcode = "MPYI_acc", InputType = "reg" in
+def MPYI_acc_rr : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst += mpyi($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
+                       (i32 IntRegs:$src1)))],
+            "$src1 = $dst">, ImmRegRel;
+
+// Rx-=mpyi(Rs,#u8)
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 8 in
+def MPYI_sub_ri : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u8Ext:$src3),
+            "$dst -= mpyi($src2, #$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (sub (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
+                                                 u8ExtPred:$src3)))],
+            "$src1 = $dst">;
+
+// Multiply and use upper result.
+// Rd=mpy(Rs,Rt.H):<<1:rnd:sat
+// Rd=mpy(Rs,Rt.L):<<1:rnd:sat
+// Rd=mpy(Rs,Rt)
+def MPY : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          "$dst = mpy($src1, $src2)",
+          [(set (i32 IntRegs:$dst), (mulhs (i32 IntRegs:$src1),
+                                           (i32 IntRegs:$src2)))]>;
+
+// Rd=mpy(Rs,Rt):rnd
+// Rd=mpyu(Rs,Rt)
+def MPYU : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+           "$dst = mpyu($src1, $src2)",
+           [(set (i32 IntRegs:$dst), (mulhu (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+// Multiply and use full result.
+// Rdd=mpyu(Rs,Rt)
+def MPYU64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = mpyu($src1, $src2)",
+             [(set (i64 DoubleRegs:$dst),
+                   (mul (i64 (anyext (i32 IntRegs:$src1))),
+                        (i64 (anyext (i32 IntRegs:$src2)))))]>;
+
+// Rdd=mpy(Rs,Rt)
+def MPY64 : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = mpy($src1, $src2)",
+             [(set (i64 DoubleRegs:$dst),
+                   (mul (i64 (sext (i32 IntRegs:$src1))),
+                        (i64 (sext (i32 IntRegs:$src2)))))]>;
+
+// Multiply and accumulate, use full result.
+// Rxx[+-]=mpy(Rs,Rt)
+// Rxx+=mpy(Rs,Rt)
+def MPY64_acc : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst += mpy($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+            (add (mul (i64 (sext (i32 IntRegs:$src2))),
+                      (i64 (sext (i32 IntRegs:$src3)))),
+                 (i64 DoubleRegs:$src1)))],
+            "$src1 = $dst">;
+
+// Rxx-=mpy(Rs,Rt)
+def MPY64_sub : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst -= mpy($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (sub (i64 DoubleRegs:$src1),
+                       (mul (i64 (sext (i32 IntRegs:$src2))),
+                            (i64 (sext (i32 IntRegs:$src3))))))],
+            "$src1 = $dst">;
+
+// Rxx[+-]=mpyu(Rs,Rt)
+// Rxx+=mpyu(Rs,Rt)
+def MPYU64_acc : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                            IntRegs:$src2, IntRegs:$src3),
+             "$dst += mpyu($src2, $src3)",
+             [(set (i64 DoubleRegs:$dst),
+                   (add (mul (i64 (anyext (i32 IntRegs:$src2))),
+                             (i64 (anyext (i32 IntRegs:$src3)))),
+                        (i64 DoubleRegs:$src1)))], "$src1 = $dst">;
+
+// Rxx-=mpyu(Rs,Rt)
+def MPYU64_sub : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst -= mpyu($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (sub (i64 DoubleRegs:$src1),
+                       (mul (i64 (anyext (i32 IntRegs:$src2))),
+                            (i64 (anyext (i32 IntRegs:$src3))))))],
+            "$src1 = $dst">;
+
+
+let InputType = "reg", CextOpcode = "ADD_acc" in
+def ADDrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+                            IntRegs:$src2, IntRegs:$src3),
+             "$dst += add($src2, $src3)",
+             [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2),
+                                                 (i32 IntRegs:$src3)),
+                                            (i32 IntRegs:$src1)))],
+             "$src1 = $dst">, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
+InputType = "imm", CextOpcode = "ADD_acc" in
+def ADDri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+                            IntRegs:$src2, s8Ext:$src3),
+             "$dst += add($src2, #$src3)",
+             [(set (i32 IntRegs:$dst), (add (add (i32 IntRegs:$src2),
+                                                 s8_16ExtPred:$src3),
+                                            (i32 IntRegs:$src1)))],
+             "$src1 = $dst">, ImmRegRel;
+
+let CextOpcode = "SUB_acc", InputType = "reg" in
+def SUBrr_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+                            IntRegs:$src2, IntRegs:$src3),
+             "$dst -= add($src2, $src3)",
+             [(set (i32 IntRegs:$dst),
+                   (sub (i32 IntRegs:$src1), (add (i32 IntRegs:$src2),
+                                                  (i32 IntRegs:$src3))))],
+             "$src1 = $dst">, ImmRegRel;
+
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 8,
+CextOpcode = "SUB_acc", InputType = "imm" in
+def SUBri_acc : MInst_acc<(outs IntRegs: $dst), (ins IntRegs:$src1,
+                            IntRegs:$src2, s8Ext:$src3),
+             "$dst -= add($src2, #$src3)",
+             [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1),
+                                            (add (i32 IntRegs:$src2),
+                                                 s8_16ExtPred:$src3)))],
+             "$src1 = $dst">, ImmRegRel;
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/MPYS -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VB +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VB -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MTYPE/VH  +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MTYPE/VH  -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ST +
+//===----------------------------------------------------------------------===//
+///
+// Store doubleword.
+
+//===----------------------------------------------------------------------===//
+// Post increment store
+//===----------------------------------------------------------------------===//
+
+multiclass ST_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
+                            bit isNot, bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : STInst2PI<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2++#$offset) = $src3",
+            [],
+            "$src2 = $dst">;
+}
+
+multiclass ST_PostInc_Pred<string mnemonic, RegisterClass RC,
+                           Operand ImmOp, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME# : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
+    // Predicate new
+    let Predicates = [HasV4T], validSubTargets = HasV4SubT in
+    defm _cdn#NAME#_V4 : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 1>;
+  }
+}
+
+let hasCtrlDep = 1, isNVStorable = 1, neverHasSideEffects = 1 in
+multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
+                      Operand ImmOp> {
+
+  let hasCtrlDep = 1, BaseOpcode = "POST_"#BaseOp in {
+    let isPredicable = 1 in
+    def NAME : STInst2PI<(outs IntRegs:$dst),
+                (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
+                #mnemonic#"($src1++#$offset) = $src2",
+                [],
+                "$src1 = $dst">;
+
+    let isPredicated = 1 in {
+      defm Pt : ST_PostInc_Pred<mnemonic, RC, ImmOp, 0 >;
+      defm NotPt : ST_PostInc_Pred<mnemonic, RC, ImmOp, 1 >;
+    }
+  }
+}
+
+defm POST_STbri: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel;
+defm POST_SThri: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel;
+defm POST_STwri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
+
+let isNVStorable = 0 in
+defm POST_STdri: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm>, AddrModeRel;
+
+def : Pat<(post_truncsti8 (i32 IntRegs:$src1), IntRegs:$src2,
+                           s4_3ImmPred:$offset),
+          (POST_STbri IntRegs:$src2, s4_0ImmPred:$offset, IntRegs:$src1)>;
+
+def : Pat<(post_truncsti16 (i32 IntRegs:$src1), IntRegs:$src2,
+                            s4_3ImmPred:$offset),
+          (POST_SThri IntRegs:$src2, s4_1ImmPred:$offset, IntRegs:$src1)>;
+
+def : Pat<(post_store (i32 IntRegs:$src1), IntRegs:$src2, s4_2ImmPred:$offset),
+          (POST_STwri IntRegs:$src2, s4_1ImmPred:$offset, IntRegs:$src1)>;
+
+def : Pat<(post_store (i64 DoubleRegs:$src1), IntRegs:$src2,
+                       s4_3ImmPred:$offset),
+          (POST_STdri IntRegs:$src2, s4_3ImmPred:$offset, DoubleRegs:$src1)>;
+
+//===----------------------------------------------------------------------===//
+// multiclass for the store instructions with MEMri operand.
+//===----------------------------------------------------------------------===//
+multiclass ST_MEMri_Pbase<string mnemonic, RegisterClass RC, bit isNot,
+                          bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : STInst2<(outs),
+            (ins PredRegs:$src1, MEMri:$addr, RC: $src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($addr) = $src2",
+            []>;
+}
+
+multiclass ST_MEMri_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ST_MEMri_Pbase<mnemonic, RC, PredNot, 0>;
+
+    // Predicate new
+    let validSubTargets = HasV4SubT, Predicates = [HasV4T] in
+    defm _cdn#NAME#_V4 : ST_MEMri_Pbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, isNVStorable = 1, neverHasSideEffects = 1 in
+multiclass ST_MEMri<string mnemonic, string CextOp, RegisterClass RC,
+                    bits<5> ImmBits, bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
+         isPredicable = 1 in
+    def NAME : STInst2<(outs),
+            (ins MEMri:$addr, RC:$src),
+            mnemonic#"($addr) = $src",
+            []>;
+
+    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits,
+        isPredicated = 1 in {
+      defm Pt : ST_MEMri_Pred<mnemonic, RC, 0>;
+      defm NotPt : ST_MEMri_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, isMEMri = "true" in {
+  defm STrib: ST_MEMri < "memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+  defm STrih: ST_MEMri < "memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
+  defm STriw: ST_MEMri < "memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+
+  let isNVStorable = 0 in
+  defm STrid: ST_MEMri < "memd", "STrid", DoubleRegs, 14, 9>, AddrModeRel;
+}
+
+def : Pat<(truncstorei8 (i32 IntRegs:$src1), ADDRriS11_0:$addr),
+          (STrib ADDRriS11_0:$addr, (i32 IntRegs:$src1))>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src1), ADDRriS11_1:$addr),
+          (STrih ADDRriS11_1:$addr, (i32 IntRegs:$src1))>;
+
+def : Pat<(store (i32 IntRegs:$src1), ADDRriS11_2:$addr),
+          (STriw ADDRriS11_2:$addr, (i32 IntRegs:$src1))>;
+
+def : Pat<(store (i64 DoubleRegs:$src1), ADDRriS11_3:$addr),
+          (STrid ADDRriS11_3:$addr, (i64 DoubleRegs:$src1))>;
+
+
+//===----------------------------------------------------------------------===//
+// multiclass for the store instructions with base+immediate offset
+// addressing mode
+//===----------------------------------------------------------------------===//
+multiclass ST_Idxd_Pbase<string mnemonic, RegisterClass RC, Operand predImmOp,
+                        bit isNot, bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : STInst2<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC: $src4),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+#$src3) = $src4",
+            []>;
+}
+
+multiclass ST_Idxd_Pred<string mnemonic, RegisterClass RC, Operand predImmOp,
+                        bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true"), isPredicated = 1 in {
+    defm _c#NAME : ST_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 0>;
+
+    // Predicate new
+    let validSubTargets = HasV4SubT, Predicates = [HasV4T] in
+    defm _cdn#NAME#_V4 : ST_Idxd_Pbase<mnemonic, RC, predImmOp, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, isNVStorable = 1, neverHasSideEffects = 1 in
+multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
+                   Operand ImmOp, Operand predImmOp, bits<5> ImmBits,
+                   bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
+         isPredicable = 1 in
+    def NAME : STInst2<(outs),
+            (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+            mnemonic#"($src1+#$src2) = $src3",
+            []>;
+
+    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits in {
+      defm Pt : ST_Idxd_Pred<mnemonic, RC, predImmOp, 0>;
+      defm NotPt : ST_Idxd_Pred<mnemonic, RC, predImmOp, 1>;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, InputType = "reg" in {
+  defm STrib_indexed: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext,
+                                u6_0Ext, 11, 6>, AddrModeRel, ImmRegRel;
+  defm STrih_indexed: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext,
+                                u6_1Ext, 12, 7>, AddrModeRel, ImmRegRel;
+  defm STriw_indexed: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext,
+                                u6_2Ext, 13, 8>, AddrModeRel, ImmRegRel;
+  let isNVStorable = 0 in
+  defm STrid_indexed: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
+                                u6_3Ext, 14, 9>, AddrModeRel;
+}
+
+let AddedComplexity = 10 in {
+def : Pat<(truncstorei8 (i32 IntRegs:$src1), (add IntRegs:$src2,
+                                                  s11_0ExtPred:$offset)),
+          (STrib_indexed IntRegs:$src2, s11_0ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src1), (add IntRegs:$src2,
+                                                   s11_1ExtPred:$offset)),
+          (STrih_indexed IntRegs:$src2, s11_1ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+def : Pat<(store (i32 IntRegs:$src1), (add IntRegs:$src2,
+                                           s11_2ExtPred:$offset)),
+          (STriw_indexed IntRegs:$src2, s11_2ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+def : Pat<(store (i64 DoubleRegs:$src1), (add IntRegs:$src2,
+                                              s11_3ExtPred:$offset)),
+          (STrid_indexed IntRegs:$src2, s11_3ImmPred:$offset,
+                         (i64 DoubleRegs:$src1))>;
+}
+
+// memh(Rx++#s4:1)=Rt.H
+
+// Store word.
+// Store predicate.
+let Defs = [R10,R11,D5], neverHasSideEffects = 1 in
+def STriw_pred : STInst2<(outs),
+            (ins MEMri:$addr, PredRegs:$src1),
+            "Error; should not emit",
+            []>;
+
+// Allocate stack frame.
+let Defs = [R29, R30], Uses = [R31, R30], neverHasSideEffects = 1 in {
+  def ALLOCFRAME : STInst2<(outs),
+             (ins i32imm:$amt),
+             "allocframe(#$amt)",
+             []>;
+}
+//===----------------------------------------------------------------------===//
+// ST -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/ALU +
+//===----------------------------------------------------------------------===//
+// Logical NOT.
+def NOT_rr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+               "$dst = not($src1)",
+               [(set (i64 DoubleRegs:$dst), (not (i64 DoubleRegs:$src1)))]>;
+
+
+// Sign extend word to doubleword.
+def SXTW : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+           "$dst = sxtw($src1)",
+           [(set (i64 DoubleRegs:$dst), (sext (i32 IntRegs:$src1)))]>;
+//===----------------------------------------------------------------------===//
+// STYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/BIT +
+//===----------------------------------------------------------------------===//
+// clrbit.
+def CLRBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = clrbit($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
+                                           (not
+                                              (shl 1, u5ImmPred:$src2))))]>;
+
+def CLRBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = clrbit($src1, #$src2)",
+            []>;
+
+// Map from r0 = and(r1, 2147483647) to r0 = clrbit(r1, #31).
+def : Pat <(and (i32 IntRegs:$src1), 2147483647),
+      (CLRBIT_31 (i32 IntRegs:$src1), 31)>;
+
+// setbit.
+def SETBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = setbit($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
+                                          (shl 1, u5ImmPred:$src2)))]>;
+
+// Map from r0 = or(r1, -2147483648) to r0 = setbit(r1, #31).
+def SETBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = setbit($src1, #$src2)",
+            []>;
+
+def : Pat <(or (i32 IntRegs:$src1), -2147483648),
+      (SETBIT_31 (i32 IntRegs:$src1), 31)>;
+
+// togglebit.
+def TOGBIT : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = setbit($src1, #$src2)",
+            [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1),
+                                          (shl 1, u5ImmPred:$src2)))]>;
+
+// Map from r0 = xor(r1, -2147483648) to r0 = togglebit(r1, #31).
+def TOGBIT_31 : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+            "$dst = togglebit($src1, #$src2)",
+            []>;
+
+def : Pat <(xor (i32 IntRegs:$src1), -2147483648),
+      (TOGBIT_31 (i32 IntRegs:$src1), 31)>;
+
+// Predicate transfer.
+let neverHasSideEffects = 1 in
+def TFR_RsPd : SInst<(outs IntRegs:$dst), (ins PredRegs:$src1),
+               "$dst = $src1  /* Should almost never emit this. */",
+               []>;
+
+def TFR_PdRs : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1),
+               "$dst = $src1  /* Should almost never emit this. */",
+               [(set (i1 PredRegs:$dst), (trunc (i32 IntRegs:$src1)))]>;
+//===----------------------------------------------------------------------===//
+// STYPE/PRED -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/SHIFT +
+//===----------------------------------------------------------------------===//
+// Shift by immediate.
+def ASR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+             "$dst = asr($src1, #$src2)",
+             [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1),
+                                            u5ImmPred:$src2))]>;
+
+def ASRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+              "$dst = asr($src1, #$src2)",
+              [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1),
+                                                u6ImmPred:$src2))]>;
+
+def ASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          "$dst = asl($src1, #$src2)",
+          [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
+                                         u5ImmPred:$src2))]>;
+
+def ASLd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+              "$dst = asl($src1, #$src2)",
+              [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
+                                                u6ImmPred:$src2))]>;
+
+def LSR_ri : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+             "$dst = lsr($src1, #$src2)",
+             [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1),
+                                            u5ImmPred:$src2))]>;
+
+def LSRd_ri : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+              "$dst = lsr($src1, #$src2)",
+              [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1),
+                                                u6ImmPred:$src2))]>;
+
+// Shift by immediate and add.
+let AddedComplexity = 100 in
+def ADDASL : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                             u3Imm:$src3),
+             "$dst = addasl($src1, $src2, #$src3)",
+             [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1),
+                                       (shl (i32 IntRegs:$src2),
+                                            u3ImmPred:$src3)))]>;
+
+// Shift by register.
+def ASL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = asl($src1, $src2)",
+             [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+def ASR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = asr($src1, $src2)",
+             [(set (i32 IntRegs:$dst), (sra (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+def LSL_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = lsl($src1, $src2)",
+             [(set (i32 IntRegs:$dst), (shl (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+def LSR_rr : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             "$dst = lsr($src1, $src2)",
+             [(set (i32 IntRegs:$dst), (srl (i32 IntRegs:$src1),
+                                            (i32 IntRegs:$src2)))]>;
+
+def ASLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+           "$dst = asl($src1, $src2)",
+           [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
+                                             (i32 IntRegs:$src2)))]>;
+
+def LSLd : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+           "$dst = lsl($src1, $src2)",
+           [(set (i64 DoubleRegs:$dst), (shl (i64 DoubleRegs:$src1),
+                                             (i32 IntRegs:$src2)))]>;
+
+def ASRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                 IntRegs:$src2),
+              "$dst = asr($src1, $src2)",
+              [(set (i64 DoubleRegs:$dst), (sra (i64 DoubleRegs:$src1),
+                                                (i32 IntRegs:$src2)))]>;
+
+def LSRd_rr : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                 IntRegs:$src2),
+              "$dst = lsr($src1, $src2)",
+              [(set (i64 DoubleRegs:$dst), (srl (i64 DoubleRegs:$src1),
+                                                (i32 IntRegs:$src2)))]>;
+
+//===----------------------------------------------------------------------===//
+// STYPE/SHIFT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/VH +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/VH -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// STYPE/VW +
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// STYPE/VW -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/SUPER +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/USER +
+//===----------------------------------------------------------------------===//
+def SDHexagonBARRIER: SDTypeProfile<0, 0, []>;
+def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDHexagonBARRIER,
+                           [SDNPHasChain]>;
+
+let hasSideEffects = 1, isSolo = 1 in
+def BARRIER : SYSInst<(outs), (ins),
+                     "barrier",
+                     [(HexagonBARRIER)]>;
+
+//===----------------------------------------------------------------------===//
+// SYSTEM/SUPER -
+//===----------------------------------------------------------------------===//
+
+// TFRI64 - assembly mapped.
+let isReMaterializable = 1 in
+def TFRI64 : ALU64_rr<(outs DoubleRegs:$dst), (ins s8Imm64:$src1),
+             "$dst = #$src1",
+             [(set (i64 DoubleRegs:$dst), s8Imm64Pred:$src1)]>;
+
+// Pseudo instruction to encode a set of conditional transfers.
+// This instruction is used instead of a mux and trades-off codesize
+// for performance. We conduct this transformation optimistically in
+// the hope that these instructions get promoted to dot-new transfers.
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_rr : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
+                                                        IntRegs:$src2,
+                                                        IntRegs:$src3),
+                     "Error; should not emit",
+                     [(set (i32 IntRegs:$dst),
+                           (i32 (select (i1 PredRegs:$src1),
+                                        (i32 IntRegs:$src2),
+                                        (i32 IntRegs:$src3))))]>;
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ri : ALU32_rr<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, s12Imm:$src3),
+            "Error; should not emit",
+            [(set (i32 IntRegs:$dst),
+             (i32 (select (i1 PredRegs:$src1), (i32 IntRegs:$src2),
+                          s12ImmPred:$src3)))]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ir : ALU32_rr<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, s12Imm:$src2, IntRegs:$src3),
+            "Error; should not emit",
+            [(set (i32 IntRegs:$dst),
+             (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2,
+                          (i32 IntRegs:$src3))))]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ii : ALU32_rr<(outs IntRegs:$dst),
+                              (ins PredRegs:$src1, s12Imm:$src2, s12Imm:$src3),
+                     "Error; should not emit",
+                     [(set (i32 IntRegs:$dst),
+                           (i32 (select (i1 PredRegs:$src1), s12ImmPred:$src2,
+                                        s12ImmPred:$src3)))]>;
+
+// Generate frameindex addresses.
+let isReMaterializable = 1 in
+def TFR_FI : ALU32_ri<(outs IntRegs:$dst), (ins FrameIndex:$src1),
+             "$dst = add($src1)",
+             [(set (i32 IntRegs:$dst), ADDRri:$src1)]>;
+
+//
+// CR - Type.
+//
+let neverHasSideEffects = 1, Defs = [SA0, LC0] in {
+def LOOP0_i : CRInst<(outs), (ins brtarget:$offset, u10Imm:$src2),
+                      "loop0($offset, #$src2)",
+                      []>;
+}
+
+let neverHasSideEffects = 1, Defs = [SA0, LC0] in {
+def LOOP0_r : CRInst<(outs), (ins brtarget:$offset, IntRegs:$src2),
+                      "loop0($offset, $src2)",
+                      []>;
+}
+
+let isBranch = 1, isTerminator = 1, neverHasSideEffects = 1,
+    Defs = [PC, LC0], Uses = [SA0, LC0] in {
+def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset),
+                       ":endloop0",
+                       []>;
+}
+
+// Support for generating global address.
+// Taken from X86InstrInfo.td.
+def SDTHexagonCONST32 : SDTypeProfile<1, 1, [
+                                            SDTCisVT<0, i32>,
+                                            SDTCisVT<1, i32>,
+                                            SDTCisPtrTy<0>]>;
+def HexagonCONST32 : SDNode<"HexagonISD::CONST32",     SDTHexagonCONST32>;
+def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP",     SDTHexagonCONST32>;
+
+// HI/LO Instructions
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LO : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global),
+                  "$dst.l = #LO($global)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def HI : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$global),
+                  "$dst.h = #HI($global)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LOi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value),
+                  "$dst.l = #LO($imm_value)",
+                  []>;
+
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def HIi : ALU32_ri<(outs IntRegs:$dst), (ins i32imm:$imm_value),
+                  "$dst.h = #HI($imm_value)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LO_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+                  "$dst.l = #LO($jt)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def HI_jt : ALU32_ri<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+                  "$dst.h = #HI($jt)",
+                  []>;
+
+
+let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in
+def LO_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
+                  "$dst.l = #LO($label)",
+                  []>;
+
+let isReMaterializable = 1, isMoveImm = 1 , neverHasSideEffects = 1 in
+def HI_label : ALU32_ri<(outs IntRegs:$dst), (ins bblabel:$label),
+                  "$dst.h = #HI($label)",
+                  []>;
+
+// This pattern is incorrect. When we add small data, we should change
+// this pattern to use memw(#foo).
+// This is for sdata.
+let isMoveImm = 1 in
+def CONST32 : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
+              "$dst = CONST32(#$global)",
+              [(set (i32 IntRegs:$dst),
+                    (load (HexagonCONST32 tglobaltlsaddr:$global)))]>;
+
+// This is for non-sdata.
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global),
+                  "$dst = CONST32(#$global)",
+                  [(set (i32 IntRegs:$dst),
+                        (HexagonCONST32 tglobaladdr:$global))]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_set_jt : LDInst2<(outs IntRegs:$dst), (ins jumptablebase:$jt),
+                     "$dst = CONST32(#$jt)",
+                     [(set (i32 IntRegs:$dst),
+                           (HexagonCONST32 tjumptable:$jt))]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32GP_set : LDInst2<(outs IntRegs:$dst), (ins globaladdress:$global),
+                    "$dst = CONST32(#$global)",
+                    [(set (i32 IntRegs:$dst),
+                          (HexagonCONST32_GP tglobaladdr:$global))]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_Int_Real : LDInst2<(outs IntRegs:$dst), (ins i32imm:$global),
+                       "$dst = CONST32(#$global)",
+                       [(set (i32 IntRegs:$dst), imm:$global) ]>;
+
+// Map BlockAddress lowering to CONST32_Int_Real
+def : Pat<(HexagonCONST32_GP tblockaddress:$addr),
+          (CONST32_Int_Real tblockaddress:$addr)>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_Label : LDInst2<(outs IntRegs:$dst), (ins bblabel:$label),
+                    "$dst = CONST32($label)",
+                    [(set (i32 IntRegs:$dst), (HexagonCONST32 bbl:$label))]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST64_Int_Real : LDInst2<(outs DoubleRegs:$dst), (ins i64imm:$global),
+                       "$dst = CONST64(#$global)",
+                       [(set (i64 DoubleRegs:$dst), imm:$global) ]>;
+
+def TFR_PdFalse : SInst<(outs PredRegs:$dst), (ins),
+                  "$dst = xor($dst, $dst)",
+                  [(set (i1 PredRegs:$dst), 0)]>;
+
+def MPY_trsext : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+       "$dst = mpy($src1, $src2)",
+       [(set (i32 IntRegs:$dst),
+             (trunc (i64 (srl (i64 (mul (i64 (sext (i32 IntRegs:$src1))),
+                                        (i64 (sext (i32 IntRegs:$src2))))),
+                              (i32 32)))))]>;
+
+// Pseudo instructions.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+
+def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_end : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeqEnd,
+                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+                    [SDNPHasChain, SDNPOutGlue]>;
+
+def SDT_SPCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def call : SDNode<"HexagonISD::CALL", SDT_SPCall,
+           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
+
+// For tailcalls a HexagonTCRet SDNode has 3 SDNode Properties - a chain,
+// Optional Flag and Variable Arguments.
+// Its 1 Operand has pointer type.
+def HexagonTCRet    : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
+                     [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+let Defs = [R29, R30], Uses = [R31, R30, R29] in {
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                        "Should never be emitted",
+                        [(callseq_start timm:$amt)]>;
+}
+
+let Defs = [R29, R30, R31], Uses = [R29] in {
+ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                      "Should never be emitted",
+                      [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+// Call subroutine.
+let isCall = 1, neverHasSideEffects = 1,
+  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
+          R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+  def CALL : JInst<(outs), (ins calltarget:$dst),
+             "call $dst", []>;
+}
+
+// Call subroutine from register.
+let isCall = 1, neverHasSideEffects = 1,
+  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10,
+          R22, R23, R28, R31, P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+  def CALLR : JRInst<(outs), (ins IntRegs:$dst),
+              "callr $dst",
+              []>;
+ }
+
+// Tail Calls.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1 in {
+  def TCRETURNtg : JInst<(outs), (ins calltarget:$dst),
+             "jump $dst // TAILCALL", []>;
+}
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1 in {
+  def TCRETURNtext : JInst<(outs), (ins calltarget:$dst),
+             "jump $dst // TAILCALL", []>;
+}
+
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1 in {
+  def TCRETURNR : JInst<(outs), (ins IntRegs:$dst),
+             "jumpr $dst // TAILCALL", []>;
+}
+// Map call instruction.
+def : Pat<(call (i32 IntRegs:$dst)),
+      (CALLR (i32 IntRegs:$dst))>, Requires<[HasV2TOnly]>;
+def : Pat<(call tglobaladdr:$dst),
+      (CALL tglobaladdr:$dst)>, Requires<[HasV2TOnly]>;
+def : Pat<(call texternalsym:$dst),
+      (CALL texternalsym:$dst)>, Requires<[HasV2TOnly]>;
+//Tail calls.
+def : Pat<(HexagonTCRet tglobaladdr:$dst),
+      (TCRETURNtg tglobaladdr:$dst)>;
+def : Pat<(HexagonTCRet texternalsym:$dst),
+      (TCRETURNtext texternalsym:$dst)>;
+def : Pat<(HexagonTCRet (i32 IntRegs:$dst)),
+      (TCRETURNR (i32 IntRegs:$dst))>;
+
+// Atomic load and store support
+// 8 bit atomic load
+def : Pat<(atomic_load_8 ADDRriS11_0:$src1),
+          (i32 (LDriub ADDRriS11_0:$src1))>;
+
+def : Pat<(atomic_load_8 (add (i32 IntRegs:$src1), s11_0ImmPred:$offset)),
+          (i32 (LDriub_indexed (i32 IntRegs:$src1), s11_0ImmPred:$offset))>;
+
+// 16 bit atomic load
+def : Pat<(atomic_load_16 ADDRriS11_1:$src1),
+          (i32 (LDriuh ADDRriS11_1:$src1))>;
+
+def : Pat<(atomic_load_16 (add (i32 IntRegs:$src1), s11_1ImmPred:$offset)),
+          (i32 (LDriuh_indexed (i32 IntRegs:$src1), s11_1ImmPred:$offset))>;
+
+def : Pat<(atomic_load_32 ADDRriS11_2:$src1),
+          (i32 (LDriw ADDRriS11_2:$src1))>;
+
+def : Pat<(atomic_load_32 (add (i32 IntRegs:$src1), s11_2ImmPred:$offset)),
+          (i32 (LDriw_indexed (i32 IntRegs:$src1), s11_2ImmPred:$offset))>;
+
+// 64 bit atomic load
+def : Pat<(atomic_load_64 ADDRriS11_3:$src1),
+          (i64 (LDrid ADDRriS11_3:$src1))>;
+
+def : Pat<(atomic_load_64 (add (i32 IntRegs:$src1), s11_3ImmPred:$offset)),
+          (i64 (LDrid_indexed (i32 IntRegs:$src1), s11_3ImmPred:$offset))>;
+
+
+def : Pat<(atomic_store_8 ADDRriS11_0:$src2, (i32 IntRegs:$src1)),
+          (STrib ADDRriS11_0:$src2, (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_8 (add (i32 IntRegs:$src2), s11_0ImmPred:$offset),
+                          (i32 IntRegs:$src1)),
+          (STrib_indexed (i32 IntRegs:$src2), s11_0ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+
+def : Pat<(atomic_store_16 ADDRriS11_1:$src2, (i32 IntRegs:$src1)),
+          (STrih ADDRriS11_1:$src2, (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_16 (i32 IntRegs:$src1),
+                          (add (i32 IntRegs:$src2), s11_1ImmPred:$offset)),
+          (STrih_indexed (i32 IntRegs:$src2), s11_1ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_32 ADDRriS11_2:$src2, (i32 IntRegs:$src1)),
+          (STriw ADDRriS11_2:$src2, (i32 IntRegs:$src1))>;
+
+def : Pat<(atomic_store_32 (add (i32 IntRegs:$src2), s11_2ImmPred:$offset),
+                           (i32 IntRegs:$src1)),
+          (STriw_indexed (i32 IntRegs:$src2), s11_2ImmPred:$offset,
+                         (i32 IntRegs:$src1))>;
+
+
+
+
+def : Pat<(atomic_store_64 ADDRriS11_3:$src2, (i64 DoubleRegs:$src1)),
+          (STrid ADDRriS11_3:$src2, (i64 DoubleRegs:$src1))>;
+
+def : Pat<(atomic_store_64 (add (i32 IntRegs:$src2), s11_3ImmPred:$offset),
+                           (i64 DoubleRegs:$src1)),
+          (STrid_indexed (i32 IntRegs:$src2), s11_3ImmPred:$offset,
+                         (i64 DoubleRegs:$src1))>;
+
+// Map from r0 = and(r1, 65535) to r0 = zxth(r1)
+def : Pat <(and (i32 IntRegs:$src1), 65535),
+      (ZXTH (i32 IntRegs:$src1))>;
+
+// Map from r0 = and(r1, 255) to r0 = zxtb(r1).
+def : Pat <(and (i32 IntRegs:$src1), 255),
+      (ZXTB (i32 IntRegs:$src1))>;
+
+// Map Add(p1, true) to p1 = not(p1).
+//     Add(p1, false) should never be produced,
+//     if it does, it got to be mapped to NOOP.
+def : Pat <(add (i1 PredRegs:$src1), -1),
+      (NOT_p (i1 PredRegs:$src1))>;
+
+// Map from p0 = setlt(r0, r1) r2 = mux(p0, r3, r4) =>
+//   p0 = cmp.lt(r0, r1), r0 = mux(p0, r2, r1).
+def : Pat <(select (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                   (i32 IntRegs:$src3),
+                   (i32 IntRegs:$src4)),
+      (i32 (TFR_condset_rr (CMPLTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+                           (i32 IntRegs:$src4), (i32 IntRegs:$src3)))>,
+      Requires<[HasV2TOnly]>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
+def : Pat <(select (not (i1 PredRegs:$src1)), s8ImmPred:$src2, s8ImmPred:$src3),
+      (i32 (TFR_condset_ii (i1 PredRegs:$src1), s8ImmPred:$src3,
+                           s8ImmPred:$src2))>;
+
+// Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
+// => r0 = TFR_condset_ri(p0, r1, #i)
+def : Pat <(select (not (i1 PredRegs:$src1)), s12ImmPred:$src2,
+                   (i32 IntRegs:$src3)),
+      (i32 (TFR_condset_ri (i1 PredRegs:$src1), (i32 IntRegs:$src3),
+                           s12ImmPred:$src2))>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
+// => r0 = TFR_condset_ir(p0, #i, r1)
+def : Pat <(select (not PredRegs:$src1), IntRegs:$src2, s12ImmPred:$src3),
+      (i32 (TFR_condset_ir (i1 PredRegs:$src1), s12ImmPred:$src3,
+                           (i32 IntRegs:$src2)))>;
+
+// Map from p0 = pnot(p0); if (p0) jump => if (!p0) jump.
+def : Pat <(brcond (not PredRegs:$src1), bb:$offset),
+      (JMP_cNot (i1 PredRegs:$src1), bb:$offset)>;
+
+// Map from p2 = pnot(p2); p1 = and(p0, p2) => p1 = and(p0, !p2).
+def : Pat <(and PredRegs:$src1, (not PredRegs:$src2)),
+      (i1 (AND_pnotp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>;
+
+// Map from i1 loads to 32 bits. This assumes that the i1* is byte aligned.
+let AddedComplexity = 10 in
+def : Pat <(i32 (zextloadi1 ADDRriS11_0:$addr)),
+      (i32 (AND_rr (i32 (LDrib ADDRriS11_0:$addr)), (TFRI 0x1)))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i32) -> Rdd = SXTW(Rss.lo).
+def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i32)),
+      (i64 (SXTW (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg))))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i16) -> Rdd = SXTW(SXTH(Rss.lo)).
+def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i16)),
+      (i64 (SXTW (i32 (SXTH (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                                 subreg_loreg))))))>;
+
+// Map from Rdd = sign_extend_inreg(Rss, i8) -> Rdd = SXTW(SXTB(Rss.lo)).
+def : Pat <(i64 (sext_inreg (i64 DoubleRegs:$src1), i8)),
+      (i64 (SXTW (i32 (SXTB (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                                 subreg_loreg))))))>;
+
+// We want to prevent emitting pnot's as much as possible.
+// Map brcond with an unsupported setcc to a JMP_cNot.
+def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                        bb:$offset),
+      (JMP_cNot (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+                bb:$offset)>;
+
+def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)),
+                        bb:$offset),
+      (JMP_cNot (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2), bb:$offset)>;
+
+def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 -1))), bb:$offset),
+      (JMP_cNot (i1 PredRegs:$src1), bb:$offset)>;
+
+def : Pat <(brcond (i1 (setne (i1 PredRegs:$src1), (i1 0))), bb:$offset),
+      (JMP_c (i1 PredRegs:$src1), bb:$offset)>;
+
+def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)),
+                        bb:$offset),
+      (JMP_cNot (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2), bb:$offset)>;
+
+def : Pat <(brcond (i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                        bb:$offset),
+      (JMP_c (CMPLTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)), bb:$offset)>;
+
+def : Pat <(brcond (i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+                   bb:$offset),
+      (JMP_cNot (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)),
+                   bb:$offset)>;
+
+def : Pat <(brcond (i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                        bb:$offset),
+      (JMP_cNot (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)),
+                bb:$offset)>;
+
+def : Pat <(brcond (i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+                   bb:$offset),
+      (JMP_cNot (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+                bb:$offset)>;
+
+// Map from a 64-bit select to an emulated 64-bit mux.
+// Hexagon does not support 64-bit MUXes; so emulate with combines.
+def : Pat <(select (i1 PredRegs:$src1), (i64 DoubleRegs:$src2),
+                   (i64 DoubleRegs:$src3)),
+      (i64 (COMBINE_rr (i32 (MUX_rr (i1 PredRegs:$src1),
+                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                                         subreg_hireg)),
+                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3),
+                                                         subreg_hireg)))),
+                       (i32 (MUX_rr (i1 PredRegs:$src1),
+                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                                         subreg_loreg)),
+                                    (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src3),
+                                                         subreg_loreg))))))>;
+
+// Map from a 1-bit select to logical ops.
+// From LegalizeDAG.cpp: (B1 ? B2 : B3) <=> (B1 & B2)|(!B1&B3).
+def : Pat <(select (i1 PredRegs:$src1), (i1 PredRegs:$src2),
+                   (i1 PredRegs:$src3)),
+      (OR_pp (AND_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)),
+             (AND_pp (NOT_p (i1 PredRegs:$src1)), (i1 PredRegs:$src3)))>;
+
+// Map Pd = load(addr) -> Rs = load(addr); Pd = Rs.
+def : Pat<(i1 (load ADDRriS11_2:$addr)),
+      (i1 (TFR_PdRs (i32 (LDrib ADDRriS11_2:$addr))))>;
+
+// Map for truncating from 64 immediates to 32 bit immediates.
+def : Pat<(i32 (trunc (i64 DoubleRegs:$src))),
+      (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src), subreg_loreg))>;
+
+// Map for truncating from i64 immediates to i1 bit immediates.
+def :  Pat<(i1 (trunc (i64 DoubleRegs:$src))),
+       (i1 (TFR_PdRs (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                          subreg_loreg))))>;
+
+// Map memb(Rs) = Rdd -> memb(Rs) = Rt.
+def : Pat<(truncstorei8 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+      (STrib ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                                     subreg_loreg)))>;
+
+// Map memh(Rs) = Rdd -> memh(Rs) = Rt.
+def : Pat<(truncstorei16 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+      (STrih ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                                     subreg_loreg)))>;
+// Map memw(Rs) = Rdd -> memw(Rs) = Rt
+def : Pat<(truncstorei32 (i64  DoubleRegs:$src), ADDRriS11_0:$addr),
+      (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                                     subreg_loreg)))>;
+
+// Map memw(Rs) = Rdd -> memw(Rs) = Rt.
+def : Pat<(truncstorei32 (i64 DoubleRegs:$src), ADDRriS11_0:$addr),
+      (STriw ADDRriS11_0:$addr, (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src),
+                                                     subreg_loreg)))>;
+
+// Map from i1 = constant<-1>; memw(addr) = i1 -> r0 = 1; memw(addr) = r0.
+def : Pat<(store (i1 -1), ADDRriS11_2:$addr),
+      (STrib ADDRriS11_2:$addr, (TFRI 1))>;
+
+
+// Map from i1 = constant<-1>; store i1 -> r0 = 1; store r0.
+def : Pat<(store (i1 -1), ADDRriS11_2:$addr),
+      (STrib ADDRriS11_2:$addr, (TFRI 1))>;
+
+// Map from memb(Rs) = Pd -> Rt = mux(Pd, #0, #1); store Rt.
+def : Pat<(store (i1 PredRegs:$src1), ADDRriS11_2:$addr),
+      (STrib ADDRriS11_2:$addr, (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0)) )>;
+
+// Map Rdd = anyext(Rs) -> Rdd = sxtw(Rs).
+// Hexagon_TODO: We can probably use combine but that will cost 2 instructions.
+// Better way to do this?
+def : Pat<(i64 (anyext (i32 IntRegs:$src1))),
+      (i64 (SXTW (i32 IntRegs:$src1)))>;
+
+// Map cmple -> cmpgt.
+// rs <= rt -> !(rs > rt).
+def : Pat<(i1 (setle (i32 IntRegs:$src1), s10ImmPred:$src2)),
+      (i1 (NOT_p (CMPGTri (i32 IntRegs:$src1), s10ImmPred:$src2)))>;
+
+// rs <= rt -> !(rs > rt).
+def : Pat<(i1 (setle (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (CMPGTrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>;
+
+// Rss <= Rtt -> !(Rss > Rtt).
+def : Pat<(i1 (setle (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (CMPGT64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>;
+
+// Map cmpne -> cmpeq.
+// Hexagon_TODO: We should improve on this.
+// rs != rt -> !(rs == rt).
+def : Pat <(i1 (setne (i32 IntRegs:$src1), s10ImmPred:$src2)),
+      (i1 (NOT_p(i1 (CMPEQri (i32 IntRegs:$src1), s10ImmPred:$src2))))>;
+
+// Map cmpne(Rs) -> !cmpeqe(Rs).
+// rs != rt -> !(rs == rt).
+def : Pat <(i1 (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (i1 (CMPEQrr (i32 IntRegs:$src1), (i32 IntRegs:$src2)))))>;
+
+// Convert setne back to xor for hexagon since we compute w/ pred registers.
+def : Pat <(i1 (setne (i1 PredRegs:$src1), (i1 PredRegs:$src2))),
+      (i1 (XOR_pp (i1 PredRegs:$src1), (i1 PredRegs:$src2)))>;
+
+// Map cmpne(Rss) -> !cmpew(Rss).
+// rs != rt -> !(rs == rt).
+def : Pat <(i1 (setne (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (i1 (CMPEHexagon4rr (i64 DoubleRegs:$src1),
+                                     (i64 DoubleRegs:$src2)))))>;
+
+// Map cmpge(Rs, Rt) -> !(cmpgt(Rs, Rt).
+// rs >= rt -> !(rt > rs).
+def : Pat <(i1 (setge (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))))>;
+
+def : Pat <(i1 (setge (i32 IntRegs:$src1), s8ImmPred:$src2)),
+      (i1 (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2))>;
+
+// Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss).
+// rss >= rtt -> !(rtt > rss).
+def : Pat <(i1 (setge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (i1 (CMPGT64rr (i64 DoubleRegs:$src2),
+                                (i64 DoubleRegs:$src1)))))>;
+
+// Map cmplt(Rs, Imm) -> !cmpge(Rs, Imm).
+// rs < rt -> !(rs >= rt).
+def : Pat <(i1 (setlt (i32 IntRegs:$src1), s8ImmPred:$src2)),
+      (i1 (NOT_p (CMPGEri (i32 IntRegs:$src1), s8ImmPred:$src2)))>;
+
+// Map cmplt(Rs, Rt) -> cmpgt(Rt, Rs).
+// rs < rt -> rt > rs.
+// We can let assembler map it, or we can do in the compiler itself.
+def : Pat <(i1 (setlt (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (CMPGTrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>;
+
+// Map cmplt(Rss, Rtt) -> cmpgt(Rtt, Rss).
+// rss < rtt -> (rtt > rss).
+def : Pat <(i1 (setlt (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (CMPGT64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>;
+
+// Map from cmpltu(Rs, Rd) -> cmpgtu(Rd, Rs)
+// rs < rt -> rt > rs.
+// We can let assembler map it, or we can do in the compiler itself.
+def : Pat <(i1 (setult (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1)))>;
+
+// Map from cmpltu(Rss, Rdd) -> cmpgtu(Rdd, Rss).
+// rs < rt -> rt > rs.
+def : Pat <(i1 (setult (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1)))>;
+
+// Generate cmpgeu(Rs, #u8)
+def : Pat <(i1 (setuge (i32 IntRegs:$src1), u8ImmPred:$src2)),
+      (i1 (CMPGEUri (i32 IntRegs:$src1), u8ImmPred:$src2))>;
+
+// Generate cmpgtu(Rs, #u9)
+def : Pat <(i1 (setugt (i32 IntRegs:$src1), u9ImmPred:$src2)),
+      (i1 (CMPGTUri (i32 IntRegs:$src1), u9ImmPred:$src2))>;
+
+// Map from Rs >= Rt -> !(Rt > Rs).
+// rs >= rt -> !(rt > rs).
+def : Pat <(i1 (setuge (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src2), (i32 IntRegs:$src1))))>;
+
+// Map from Rs >= Rt -> !(Rt > Rs).
+// rs >= rt -> !(rt > rs).
+def : Pat <(i1 (setuge (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src2), (i64 DoubleRegs:$src1))))>;
+
+// Map from cmpleu(Rs, Rs) -> !cmpgtu(Rs, Rs).
+// Map from (Rs <= Rt) -> !(Rs > Rt).
+def : Pat <(i1 (setule (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+      (i1 (NOT_p (CMPGTUrr (i32 IntRegs:$src1), (i32 IntRegs:$src2))))>;
+
+// Map from cmpleu(Rss, Rtt) -> !cmpgtu(Rss, Rtt-1).
+// Map from (Rs <= Rt) -> !(Rs > Rt).
+def : Pat <(i1 (setule (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))),
+      (i1 (NOT_p (CMPGTU64rr (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2))))>;
+
+// Sign extends.
+// i1 -> i32
+def : Pat <(i32 (sext (i1 PredRegs:$src1))),
+      (i32 (MUX_ii (i1 PredRegs:$src1), -1, 0))>;
+
+// i1 -> i64
+def : Pat <(i64 (sext (i1 PredRegs:$src1))),
+      (i64 (COMBINE_rr (TFRI -1), (MUX_ii (i1 PredRegs:$src1), -1, 0)))>;
+
+// Convert sign-extended load back to load and sign extend.
+// i8 -> i64
+def:  Pat <(i64 (sextloadi8 ADDRriS11_0:$src1)),
+      (i64 (SXTW (LDrib ADDRriS11_0:$src1)))>;
+
+// Convert any-extended load back to load and sign extend.
+// i8 -> i64
+def:  Pat <(i64 (extloadi8 ADDRriS11_0:$src1)),
+      (i64 (SXTW (LDrib ADDRriS11_0:$src1)))>;
+
+// Convert sign-extended load back to load and sign extend.
+// i16 -> i64
+def:  Pat <(i64 (sextloadi16 ADDRriS11_1:$src1)),
+      (i64 (SXTW (LDrih ADDRriS11_1:$src1)))>;
+
+// Convert sign-extended load back to load and sign extend.
+// i32 -> i64
+def:  Pat <(i64 (sextloadi32 ADDRriS11_2:$src1)),
+      (i64 (SXTW (LDriw ADDRriS11_2:$src1)))>;
+
+
+// Zero extends.
+// i1 -> i32
+def : Pat <(i32 (zext (i1 PredRegs:$src1))),
+      (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
+
+// i1 -> i64
+def : Pat <(i64 (zext (i1 PredRegs:$src1))),
+      (i64 (COMBINE_rr (TFRI 0), (MUX_ii (i1 PredRegs:$src1), 1, 0)))>,
+      Requires<[NoV4T]>;
+
+// i32 -> i64
+def : Pat <(i64 (zext (i32 IntRegs:$src1))),
+      (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>,
+      Requires<[NoV4T]>;
+
+// i8 -> i64
+def:  Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// i1 -> i64
+def:  Pat <(i64 (zextloadi1 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi1 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// i16 -> i64
+def:  Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriuh ADDRriS11_1:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi16 (add (i32 IntRegs:$src1),
+                                  s11_1ExtPred:$offset))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriuh_indexed IntRegs:$src1,
+                                  s11_1ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// i32 -> i64
+def:  Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>,
+      Requires<[NoV4T]>;
+
+def:  Pat <(i32 (zextloadi1 ADDRriS11_0:$src1)),
+      (i32 (LDriw ADDRriS11_0:$src1))>;
+
+// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
+def : Pat <(i32 (zext (i1 PredRegs:$src1))),
+      (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
+
+// Map from Rs = Pd to Pd = mux(Pd, #1, #0)
+def : Pat <(i32 (anyext (i1 PredRegs:$src1))),
+      (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))>;
+
+// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
+def : Pat <(i64 (anyext (i1 PredRegs:$src1))),
+      (i64 (SXTW (i32 (MUX_ii (i1 PredRegs:$src1), 1, 0))))>;
+
+
+// Any extended 64-bit load.
+// anyext i32 -> i64
+def:  Pat <(i64 (extloadi32 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDriw ADDRriS11_2:$src1)))>,
+      Requires<[NoV4T]>;
+
+// When there is an offset we should prefer the pattern below over the pattern above.
+// The complexity of the above is 13 (gleaned from HexagonGenDAGIsel.inc)
+// So this complexity below is comfortably higher to allow for choosing the below.
+// If this is not done then we generate addresses such as
+// ********************************************
+//        r1 = add (r0, #4)
+//        r1 = memw(r1 + #0)
+//  instead of
+//        r1 = memw(r0 + #4)
+// ********************************************
+let AddedComplexity = 100 in
+def:  Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
+      (i64 (COMBINE_rr (TFRI 0), (LDriw_indexed IntRegs:$src1,
+                                  s11_2ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// anyext i16 -> i64.
+def:  Pat <(i64 (extloadi16 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_rr (TFRI 0), (LDrih ADDRriS11_2:$src1)))>,
+      Requires<[NoV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (extloadi16 (add (i32 IntRegs:$src1),
+                                  s11_1ExtPred:$offset))),
+      (i64 (COMBINE_rr (TFRI 0), (LDrih_indexed IntRegs:$src1,
+                                  s11_1ExtPred:$offset)))>,
+      Requires<[NoV4T]>;
+
+// Map from Rdd = zxtw(Rs) -> Rdd = combine(0, Rs).
+def : Pat<(i64 (zext (i32 IntRegs:$src1))),
+      (i64 (COMBINE_rr (TFRI 0), (i32 IntRegs:$src1)))>,
+      Requires<[NoV4T]>;
+
+// Multiply 64-bit unsigned and use upper result.
+def : Pat <(mulhu (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+      (i64
+       (MPYU64_acc
+        (i64
+         (COMBINE_rr
+          (TFRI 0),
+           (i32
+            (EXTRACT_SUBREG
+             (i64
+              (LSRd_ri
+               (i64
+                (MPYU64_acc
+                 (i64
+                  (MPYU64_acc
+                   (i64
+                    (COMBINE_rr (TFRI 0),
+                     (i32
+                      (EXTRACT_SUBREG
+                       (i64
+                        (LSRd_ri
+                         (i64
+                          (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                                       subreg_loreg)),
+                                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                                       subreg_loreg)))), 32)),
+                       subreg_loreg)))),
+                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))),
+                 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+                 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))),
+               32)), subreg_loreg)))),
+        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>;
+
+// Multiply 64-bit signed and use upper result.
+def : Pat <(mulhs (i64 DoubleRegs:$src1), (i64 DoubleRegs:$src2)),
+      (i64
+       (MPY64_acc
+        (i64
+         (COMBINE_rr (TFRI 0),
+          (i32
+           (EXTRACT_SUBREG
+            (i64
+             (LSRd_ri
+              (i64
+               (MPY64_acc
+                (i64
+                 (MPY64_acc
+                  (i64
+                   (COMBINE_rr (TFRI 0),
+                    (i32
+                     (EXTRACT_SUBREG
+                      (i64
+                       (LSRd_ri
+                        (i64
+                         (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                                      subreg_loreg)),
+                                 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                                      subreg_loreg)))), 32)),
+                      subreg_loreg)))),
+                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+                  (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)))),
+                (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+                (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg)))),
+              32)), subreg_loreg)))),
+        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg)),
+        (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))))>;
+
+// Hexagon specific ISD nodes.
+//def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>]>;
+def SDTHexagonADJDYNALLOC : SDTypeProfile<1, 2,
+                                  [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def Hexagon_ADJDYNALLOC : SDNode<"HexagonISD::ADJDYNALLOC",
+                                  SDTHexagonADJDYNALLOC>;
+// Needed to tag these instructions for stack layout.
+let usesCustomInserter = 1 in
+def ADJDYNALLOC : ALU32_ri<(outs IntRegs:$dst), (ins IntRegs:$src1,
+                                                     s16Imm:$src2),
+                  "$dst = add($src1, #$src2)",
+                  [(set (i32 IntRegs:$dst),
+                        (Hexagon_ADJDYNALLOC (i32 IntRegs:$src1),
+                                             s16ImmPred:$src2))]>;
+
+def SDTHexagonARGEXTEND : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>;
+def Hexagon_ARGEXTEND : SDNode<"HexagonISD::ARGEXTEND", SDTHexagonARGEXTEND>;
+def ARGEXTEND : ALU32_rr <(outs IntRegs:$dst), (ins IntRegs:$src1),
+                "$dst = $src1",
+                [(set (i32 IntRegs:$dst),
+                      (Hexagon_ARGEXTEND (i32 IntRegs:$src1)))]>;
+
+let AddedComplexity = 100 in
+def : Pat<(i32 (sext_inreg (Hexagon_ARGEXTEND (i32 IntRegs:$src1)), i16)),
+      (COPY (i32 IntRegs:$src1))>;
+
+def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BR_JT : JRInst<(outs), (ins IntRegs:$src),
+                   "jumpr $src",
+                   [(HexagonBR_JT (i32 IntRegs:$src))]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1 in
+def BRIND : JRInst<(outs), (ins IntRegs:$src),
+                   "jumpr $src",
+                   [(brind (i32 IntRegs:$src))]>;
+
+def HexagonWrapperJT: SDNode<"HexagonISD::WrapperJT", SDTIntUnaryOp>;
+
+def : Pat<(HexagonWrapperJT tjumptable:$dst),
+          (i32 (CONST32_set_jt tjumptable:$dst))>;
+
+// XTYPE/SHIFT
+
+// Multi-class for logical operators :
+// Shift by immediate/register and accumulate/logical
+multiclass xtype_imm<string OpcStr, SDNode OpNode1, SDNode OpNode2> {
+  def _ri : SInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u5Imm:$src3),
+            !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")),
+            [(set (i32 IntRegs:$dst),
+                  (OpNode2 (i32 IntRegs:$src1),
+                           (OpNode1 (i32 IntRegs:$src2),
+                                    u5ImmPred:$src3)))],
+            "$src1 = $dst">;
+
+  def d_ri : SInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, u6Imm:$src3),
+            !strconcat("$dst ", !strconcat(OpcStr, "($src2, #$src3)")),
+            [(set (i64 DoubleRegs:$dst), (OpNode2 (i64 DoubleRegs:$src1),
+                          (OpNode1 (i64 DoubleRegs:$src2), u6ImmPred:$src3)))],
+            "$src1 = $dst">;
+}
+
+// Multi-class for logical operators :
+// Shift by register and accumulate/logical (32/64 bits)
+multiclass xtype_reg<string OpcStr, SDNode OpNode1, SDNode OpNode2> {
+  def _rr : SInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")),
+            [(set (i32 IntRegs:$dst),
+                  (OpNode2 (i32 IntRegs:$src1),
+                           (OpNode1 (i32 IntRegs:$src2),
+                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">;
+
+  def d_rr : SInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            !strconcat("$dst ", !strconcat(OpcStr, "($src2, $src3)")),
+            [(set (i64 DoubleRegs:$dst),
+                  (OpNode2 (i64 DoubleRegs:$src1),
+                           (OpNode1 (i64 DoubleRegs:$src2),
+                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">;
+
+}
+
+multiclass basic_xtype_imm<string OpcStr, SDNode OpNode> {
+let AddedComplexity = 100 in
+  defm _ADD : xtype_imm< !strconcat("+= ", OpcStr), OpNode, add>;
+  defm _SUB : xtype_imm< !strconcat("-= ", OpcStr), OpNode, sub>;
+  defm _AND : xtype_imm< !strconcat("&= ", OpcStr), OpNode, and>;
+  defm _OR  : xtype_imm< !strconcat("|= ", OpcStr), OpNode, or>;
+}
+
+multiclass basic_xtype_reg<string OpcStr, SDNode OpNode> {
+let AddedComplexity = 100 in
+  defm _ADD : xtype_reg< !strconcat("+= ", OpcStr), OpNode, add>;
+  defm _SUB : xtype_reg< !strconcat("-= ", OpcStr), OpNode, sub>;
+  defm _AND : xtype_reg< !strconcat("&= ", OpcStr), OpNode, and>;
+  defm _OR  : xtype_reg< !strconcat("|= ", OpcStr), OpNode, or>;
+}
+
+multiclass xtype_xor_imm<string OpcStr, SDNode OpNode> {
+let AddedComplexity = 100 in
+  defm _XOR : xtype_imm< !strconcat("^= ", OpcStr), OpNode, xor>;
+}
+
+defm ASL : basic_xtype_imm<"asl", shl>, basic_xtype_reg<"asl", shl>,
+           xtype_xor_imm<"asl", shl>;
+
+defm LSR : basic_xtype_imm<"lsr", srl>, basic_xtype_reg<"lsr", srl>,
+           xtype_xor_imm<"lsr", srl>;
+
+defm ASR : basic_xtype_imm<"asr", sra>, basic_xtype_reg<"asr", sra>;
+defm LSL : basic_xtype_reg<"lsl", shl>;
+
+// Change the sign of the immediate for Rd=-mpyi(Rs,#u8)
+def : Pat <(mul (i32 IntRegs:$src1), (ineg n8ImmPred:$src2)),
+      (i32 (MPYI_rin (i32 IntRegs:$src1), u8ImmPred:$src2))>;
+
+//===----------------------------------------------------------------------===//
+// V3 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV3.td"
+
+//===----------------------------------------------------------------------===//
+// V3 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V4 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Instructions -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions +
+//===----------------------------------------------------------------------===//
+
+include "HexagonInstrInfoV5.td"
+
+//===----------------------------------------------------------------------===//
+// V5 Instructions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
new file mode 100644
index 000000000000..157ab3d0e38c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
@@ -0,0 +1,137 @@
+//=- HexagonInstrInfoV3.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V3 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// J +
+//===----------------------------------------------------------------------===//
+// Call subroutine.
+let isCall = 1, neverHasSideEffects = 1,
+  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31,
+                P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+  def CALLv3 : JInst<(outs), (ins calltarget:$dst),
+             "call $dst", []>, Requires<[HasV3T]>;
+}
+
+//===----------------------------------------------------------------------===//
+// J -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// JR +
+//===----------------------------------------------------------------------===//
+// Call subroutine from register.
+let isCall = 1, neverHasSideEffects = 1,
+  Defs = [D0, D1, D2, D3, D4, D5, D6, D7, R28, R31,
+                P0, P1, P2, P3, LC0, LC1, SA0, SA1] in {
+  def CALLRv3 : JRInst<(outs), (ins IntRegs:$dst),
+              "callr $dst",
+              []>, Requires<[HasV3TOnly]>;
+ }
+
+
+// Jump to address from register
+// if(p?.new) jumpr:t r?
+let isReturn = 1, isTerminator = 1, isBarrier = 1,
+  Defs = [PC], Uses = [R31] in {
+  def JMPR_cdnPt_V3: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
+                       "if ($src1.new) jumpr:t $src2",
+                       []>, Requires<[HasV3T]>;
+}
+
+// if (!p?.new) jumpr:t r?
+let isReturn = 1, isTerminator = 1, isBarrier = 1,
+  Defs = [PC], Uses = [R31] in {
+  def JMPR_cdnNotPt_V3: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
+                       "if (!$src1.new) jumpr:t $src2",
+                       []>, Requires<[HasV3T]>;
+}
+
+// Not taken.
+// if(p?.new) jumpr:nt r?
+let isReturn = 1, isTerminator = 1, isBarrier = 1,
+  Defs = [PC], Uses = [R31] in {
+  def JMPR_cdnPnt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
+                       "if ($src1.new) jumpr:nt $src2",
+                       []>, Requires<[HasV3T]>;
+}
+
+// if (!p?.new) jumpr:nt r?
+let isReturn = 1, isTerminator = 1, isBarrier = 1,
+  Defs = [PC], Uses = [R31] in {
+  def JMPR_cdnNotPnt: JRInst<(outs), (ins PredRegs:$src1, IntRegs:$src2),
+                       "if (!$src1.new) jumpr:nt $src2",
+                       []>, Requires<[HasV3T]>;
+}
+
+//===----------------------------------------------------------------------===//
+// JR -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU +
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 200 in
+def MAXw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = max($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setlt (i64 DoubleRegs:$src2),
+                                            (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>,
+Requires<[HasV3T]>;
+
+let AddedComplexity = 200 in
+def MINw_dd : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                    DoubleRegs:$src2),
+              "$dst = min($src2, $src1)",
+              [(set (i64 DoubleRegs:$dst),
+                    (i64 (select (i1 (setgt (i64 DoubleRegs:$src2),
+                                            (i64 DoubleRegs:$src1))),
+                                 (i64 DoubleRegs:$src1),
+                                 (i64 DoubleRegs:$src2))))]>,
+Requires<[HasV3T]>;
+
+//===----------------------------------------------------------------------===//
+// ALU64/ALU -
+//===----------------------------------------------------------------------===//
+
+
+
+
+//def : Pat <(brcond (i1 (seteq (i32 IntRegs:$src1), 0)), bb:$offset),
+//      (JMP_RegEzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+//def : Pat <(brcond (i1 (setne (i32 IntRegs:$src1), 0)), bb:$offset),
+//      (JMP_RegNzt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+//def : Pat <(brcond (i1 (setle (i32 IntRegs:$src1), 0)), bb:$offset),
+//      (JMP_RegLezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+//def : Pat <(brcond (i1 (setge (i32 IntRegs:$src1), 0)), bb:$offset),
+//      (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+//def : Pat <(brcond (i1 (setgt (i32 IntRegs:$src1), -1)), bb:$offset),
+//      (JMP_RegGezt (i32 IntRegs:$src1), bb:$offset)>, Requires<[HasV3T]>;
+
+
+// Map call instruction
+def : Pat<(call (i32 IntRegs:$dst)),
+      (CALLRv3 (i32 IntRegs:$dst))>, Requires<[HasV3T]>;
+def : Pat<(call tglobaladdr:$dst),
+      (CALLv3 tglobaladdr:$dst)>, Requires<[HasV3T]>;
+def : Pat<(call texternalsym:$dst),
+      (CALLv3 texternalsym:$dst)>, Requires<[HasV3T]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
new file mode 100644
index 000000000000..cd0e4758968c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -0,0 +1,3531 @@
+//=- HexagonInstrInfoV4.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V4 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+let neverHasSideEffects = 1 in
+class T_Immext<dag ins> :
+  EXTENDERInst<(outs), ins, "immext(#$imm)", []>,
+  Requires<[HasV4T]>;
+
+def IMMEXT_b : T_Immext<(ins brtarget:$imm)>;
+def IMMEXT_c : T_Immext<(ins calltarget:$imm)>;
+def IMMEXT_g : T_Immext<(ins globaladdress:$imm)>;
+def IMMEXT_i : T_Immext<(ins u26_6Imm:$imm)>;
+
+// Fold (add (CONST32 tglobaladdr:$addr) <offset>) into a global address.
+def FoldGlobalAddr : ComplexPattern<i32, 1, "foldGlobalAddress", [], []>;
+
+// Fold (add (CONST32_GP tglobaladdr:$addr) <offset>) into a global address.
+def FoldGlobalAddrGP : ComplexPattern<i32, 1, "foldGlobalAddressGP", [], []>;
+
+def NumUsesBelowThresCONST32 : PatFrag<(ops node:$addr),
+                                       (HexagonCONST32 node:$addr), [{
+  return hasNumUsesBelowThresGA(N->getOperand(0).getNode());
+}]>;
+
+// Hexagon V4 Architecture spec defines 8 instruction classes:
+// LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM(system is not implemented in the
+// compiler)
+
+// LD Instructions:
+// ========================================
+// Loads (8/16/32/64 bit)
+// Deallocframe
+
+// ST Instructions:
+// ========================================
+// Stores (8/16/32/64 bit)
+// Allocframe
+
+// ALU32 Instructions:
+// ========================================
+// Arithmetic / Logical (32 bit)
+// Vector Halfword
+
+// XTYPE Instructions (32/64 bit):
+// ========================================
+// Arithmetic, Logical, Bit Manipulation
+// Multiply (Integer, Fractional, Complex)
+// Permute / Vector Permute Operations
+// Predicate Operations
+// Shift / Shift with Add/Sub/Logical
+// Vector Byte ALU
+// Vector Halfword (ALU, Shift, Multiply)
+// Vector Word (ALU, Shift)
+
+// J Instructions:
+// ========================================
+// Jump/Call PC-relative
+
+// JR Instructions:
+// ========================================
+// Jump/Call Register
+
+// MEMOP Instructions:
+// ========================================
+// Operation on memory (8/16/32 bit)
+
+// NV Instructions:
+// ========================================
+// New-value Jumps
+// New-value Stores
+
+// CR Instructions:
+// ========================================
+// Control-Register Transfers
+// Hardware Loop Setup
+// Predicate Logicals & Reductions
+
+// SYSTEM Instructions (not implemented in the compiler):
+// ========================================
+// Prefetch
+// Cache Maintenance
+// Bus Operations
+
+
+//===----------------------------------------------------------------------===//
+// ALU32 +
+//===----------------------------------------------------------------------===//
+// Generate frame index addresses.
+let neverHasSideEffects = 1, isReMaterializable = 1,
+isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT in
+def TFR_FI_immext_V4 : ALU32_ri<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s32Imm:$offset),
+            "$dst = add($src1, ##$offset)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=cmp.eq(Rs,#s8)
+let validSubTargets = HasV4SubT, isExtendable = 1, opExtendable = 2,
+isExtentSigned = 1, opExtentBits = 8 in
+def V4_A4_rcmpeqi : ALU32_ri<(outs IntRegs:$Rd),
+                    (ins IntRegs:$Rs, s8Ext:$s8),
+                    "$Rd = cmp.eq($Rs, #$s8)",
+                    [(set (i32 IntRegs:$Rd),
+                          (i32 (zext (i1 (seteq (i32 IntRegs:$Rs),
+                                                s8ExtPred:$s8)))))]>,
+                    Requires<[HasV4T]>;
+
+// Preserve the TSTBIT generation
+def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 (shl 1, (i32 IntRegs:$src2))),
+                                           (i32 IntRegs:$src1))), 0)))),
+      (i32 (MUX_ii (i1 (TSTBIT_rr (i32 IntRegs:$src1), (i32 IntRegs:$src2))),
+                   1, 0))>;
+
+// Interfered with tstbit generation, above pattern preserves, see : tstbit.ll
+// Rd=cmp.ne(Rs,#s8)
+let validSubTargets = HasV4SubT, isExtendable = 1, opExtendable = 2,
+isExtentSigned = 1, opExtentBits = 8 in
+def V4_A4_rcmpneqi : ALU32_ri<(outs IntRegs:$Rd),
+                     (ins IntRegs:$Rs, s8Ext:$s8),
+                     "$Rd = !cmp.eq($Rs, #$s8)",
+                     [(set (i32 IntRegs:$Rd),
+                           (i32 (zext (i1 (setne (i32 IntRegs:$Rs),
+                                                 s8ExtPred:$s8)))))]>,
+                     Requires<[HasV4T]>;
+
+// Rd=cmp.eq(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def V4_A4_rcmpeq : ALU32_ri<(outs IntRegs:$Rd),
+                   (ins IntRegs:$Rs, IntRegs:$Rt),
+                   "$Rd = cmp.eq($Rs, $Rt)",
+                   [(set (i32 IntRegs:$Rd),
+                         (i32 (zext (i1 (seteq (i32 IntRegs:$Rs),
+                                               IntRegs:$Rt)))))]>,
+                   Requires<[HasV4T]>;
+
+// Rd=cmp.ne(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def V4_A4_rcmpneq : ALU32_ri<(outs IntRegs:$Rd),
+                    (ins IntRegs:$Rs, IntRegs:$Rt),
+                    "$Rd = !cmp.eq($Rs, $Rt)",
+                    [(set (i32 IntRegs:$Rd),
+                          (i32 (zext (i1 (setne (i32 IntRegs:$Rs),
+                                               IntRegs:$Rt)))))]>,
+                    Requires<[HasV4T]>;
+
+//===----------------------------------------------------------------------===//
+// ALU32 -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+
+// Combine
+// Rdd=combine(Rs, #s8)
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8,
+    neverHasSideEffects = 1, validSubTargets = HasV4SubT in
+def COMBINE_rI_V4 : ALU32_ri<(outs DoubleRegs:$dst),
+            (ins IntRegs:$src1, s8Ext:$src2),
+            "$dst = combine($src1, #$src2)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rdd=combine(#s8, Rs)
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 8,
+    neverHasSideEffects = 1, validSubTargets = HasV4SubT in
+def COMBINE_Ir_V4 : ALU32_ir<(outs DoubleRegs:$dst),
+            (ins s8Ext:$src1, IntRegs:$src2),
+            "$dst = combine(#$src1, $src2)",
+            []>,
+            Requires<[HasV4T]>;
+
+def HexagonWrapperCombineRI_V4 :
+  SDNode<"HexagonISD::WrapperCombineRI_V4", SDTHexagonI64I32I32>;
+def HexagonWrapperCombineIR_V4 :
+  SDNode<"HexagonISD::WrapperCombineIR_V4", SDTHexagonI64I32I32>;
+
+def : Pat <(HexagonWrapperCombineRI_V4 IntRegs:$r, s8ExtPred:$i),
+           (COMBINE_rI_V4 IntRegs:$r, s8ExtPred:$i)>,
+          Requires<[HasV4T]>;
+
+def : Pat <(HexagonWrapperCombineIR_V4 s8ExtPred:$i, IntRegs:$r),
+           (COMBINE_Ir_V4 s8ExtPred:$i, IntRegs:$r)>,
+          Requires<[HasV4T]>;
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 6,
+    neverHasSideEffects = 1, validSubTargets = HasV4SubT in
+def COMBINE_iI_V4 : ALU32_ii<(outs DoubleRegs:$dst),
+            (ins s8Imm:$src1, u6Ext:$src2),
+            "$dst = combine(#$src1, #$src2)",
+            []>,
+            Requires<[HasV4T]>;
+
+//===----------------------------------------------------------------------===//
+// ALU32/PERM +
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LD +
+//===----------------------------------------------------------------------===//
+//
+// These absolute set addressing mode instructions accept immediate as
+// an operand. We have duplicated these patterns to take global address.
+
+let isExtended = 1, opExtendable = 2, neverHasSideEffects = 1,
+validSubTargets = HasV4SubT in {
+def LDrid_abs_setimm_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2),
+            (ins u0AlwaysExt:$addr),
+            "$dst1 = memd($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memb(Re=#U6)
+def LDrib_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins u0AlwaysExt:$addr),
+            "$dst1 = memb($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memh(Re=#U6)
+def LDrih_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins u0AlwaysExt:$addr),
+            "$dst1 = memh($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memub(Re=#U6)
+def LDriub_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins u0AlwaysExt:$addr),
+            "$dst1 = memub($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memuh(Re=#U6)
+def LDriuh_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins u0AlwaysExt:$addr),
+            "$dst1 = memuh($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memw(Re=#U6)
+def LDriw_abs_setimm_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins u0AlwaysExt:$addr),
+            "$dst1 = memw($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// Following patterns are defined for absolute set addressing mode
+// instruction which take global address as operand.
+let isExtended = 1, opExtendable = 2, neverHasSideEffects = 1,
+validSubTargets = HasV4SubT in {
+def LDrid_abs_set_V4 : LDInst2<(outs DoubleRegs:$dst1, IntRegs:$dst2),
+            (ins globaladdressExt:$addr),
+            "$dst1 = memd($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memb(Re=#U6)
+def LDrib_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins globaladdressExt:$addr),
+            "$dst1 = memb($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memh(Re=#U6)
+def LDrih_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins globaladdressExt:$addr),
+            "$dst1 = memh($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memub(Re=#U6)
+def LDriub_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins globaladdressExt:$addr),
+            "$dst1 = memub($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memuh(Re=#U6)
+def LDriuh_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins globaladdressExt:$addr),
+            "$dst1 = memuh($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+
+// Rd=memw(Re=#U6)
+def LDriw_abs_set_V4 : LDInst2<(outs IntRegs:$dst1, IntRegs:$dst2),
+            (ins globaladdressExt:$addr),
+            "$dst1 = memw($dst2=##$addr)",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// multiclass for load instructions with base + register offset
+// addressing mode
+multiclass ld_idxd_shl_pbase<string mnemonic, RegisterClass RC, bit isNot,
+                             bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : LDInst2<(outs RC:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$offset),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"($src2+$src3<<#$offset)",
+            []>, Requires<[HasV4T]>;
+}
+
+multiclass ld_idxd_shl_pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ld_idxd_shl_pbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ld_idxd_shl_pbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let neverHasSideEffects  = 1 in
+multiclass ld_idxd_shl<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    let isPredicable = 1 in
+    def NAME#_V4 : LDInst2<(outs RC:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$offset),
+            "$dst = "#mnemonic#"($src1+$src2<<#$offset)",
+            []>, Requires<[HasV4T]>;
+
+    let isPredicated = 1 in {
+      defm Pt_V4 : ld_idxd_shl_pred<mnemonic, RC, 0 >;
+      defm NotPt_V4 : ld_idxd_shl_pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = BaseRegOffset in {
+  defm LDrib_indexed_shl: ld_idxd_shl<"memb", "LDrib", IntRegs>, AddrModeRel;
+  defm LDriub_indexed_shl: ld_idxd_shl<"memub", "LDriub", IntRegs>, AddrModeRel;
+  defm LDrih_indexed_shl: ld_idxd_shl<"memh", "LDrih", IntRegs>, AddrModeRel;
+  defm LDriuh_indexed_shl: ld_idxd_shl<"memuh", "LDriuh", IntRegs>, AddrModeRel;
+  defm LDriw_indexed_shl: ld_idxd_shl<"memw", "LDriw", IntRegs>, AddrModeRel;
+  defm LDrid_indexed_shl: ld_idxd_shl<"memd", "LDrid", DoubleRegs>, AddrModeRel;
+}
+
+// 'def pats' for load instructions with base + register offset and non-zero
+// immediate value. Immediate value is used to left-shift the second
+// register operand.
+let AddedComplexity = 40 in {
+def : Pat <(i32 (sextloadi8 (add IntRegs:$src1,
+                                 (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDrib_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (zextloadi8 (add IntRegs:$src1,
+                                 (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriub_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (extloadi8 (add IntRegs:$src1,
+                                (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriub_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (sextloadi16 (add IntRegs:$src1,
+                                  (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDrih_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (zextloadi16 (add IntRegs:$src1,
+                                  (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriuh_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (extloadi16 (add IntRegs:$src1,
+                                 (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriuh_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (load (add IntRegs:$src1,
+                           (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDriw_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i64 (load (add IntRegs:$src1,
+                           (shl IntRegs:$src2, u2ImmPred:$offset)))),
+           (LDrid_indexed_shl_V4 IntRegs:$src1,
+            IntRegs:$src2, u2ImmPred:$offset)>,
+            Requires<[HasV4T]>;
+}
+
+
+// 'def pats' for load instruction base + register offset and
+// zero immediate value.
+let AddedComplexity = 10 in {
+def : Pat <(i64 (load (add IntRegs:$src1, IntRegs:$src2))),
+           (LDrid_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (sextloadi8 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDrib_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (zextloadi8 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriub_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (extloadi8 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriub_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (sextloadi16 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDrih_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (zextloadi16 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriuh_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (extloadi16 (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriuh_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+
+def : Pat <(i32 (load (add IntRegs:$src1, IntRegs:$src2))),
+           (LDriw_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2, 0)>,
+            Requires<[HasV4T]>;
+}
+
+// zext i1->i64
+def : Pat <(i64 (zext (i1 PredRegs:$src1))),
+      (i64 (COMBINE_Ir_V4 0, (MUX_ii (i1 PredRegs:$src1), 1, 0)))>,
+      Requires<[HasV4T]>;
+
+// zext i32->i64
+def : Pat <(i64 (zext (i32 IntRegs:$src1))),
+      (i64 (COMBINE_Ir_V4 0, (i32 IntRegs:$src1)))>,
+      Requires<[HasV4T]>;
+// zext i8->i64
+def:  Pat <(i64 (zextloadi8 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi8 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_Ir_V4 0, (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// zext i1->i64
+def:  Pat <(i64 (zextloadi1 ADDRriS11_0:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriub ADDRriS11_0:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi1 (add (i32 IntRegs:$src1),
+                                s11_0ExtPred:$offset))),
+      (i64 (COMBINE_Ir_V4 0, (LDriub_indexed IntRegs:$src1,
+                                  s11_0ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// zext i16->i64
+def:  Pat <(i64 (zextloadi16 ADDRriS11_1:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriuh ADDRriS11_1:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (zextloadi16 (add (i32 IntRegs:$src1),
+                                  s11_1ExtPred:$offset))),
+      (i64 (COMBINE_Ir_V4 0, (LDriuh_indexed IntRegs:$src1,
+                                  s11_1ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// anyext i16->i64
+def:  Pat <(i64 (extloadi16 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDrih ADDRriS11_2:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 20 in
+def:  Pat <(i64 (extloadi16 (add (i32 IntRegs:$src1),
+                                  s11_1ExtPred:$offset))),
+      (i64 (COMBINE_Ir_V4 0, (LDrih_indexed IntRegs:$src1,
+                                  s11_1ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// zext i32->i64
+def:  Pat <(i64 (zextloadi32 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriw ADDRriS11_2:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 100 in
+def:  Pat <(i64 (zextloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_indexed IntRegs:$src1,
+                                  s11_2ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+// anyext i32->i64
+def:  Pat <(i64 (extloadi32 ADDRriS11_2:$src1)),
+      (i64 (COMBINE_Ir_V4 0, (LDriw ADDRriS11_2:$src1)))>,
+      Requires<[HasV4T]>;
+
+let AddedComplexity = 100 in
+def:  Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_indexed IntRegs:$src1,
+                                  s11_2ExtPred:$offset)))>,
+      Requires<[HasV4T]>;
+
+
+
+//===----------------------------------------------------------------------===//
+// LD -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ST +
+//===----------------------------------------------------------------------===//
+///
+/// Assumptions::: ****** DO NOT IGNORE ********
+/// 1. Make sure that in post increment store, the zero'th operand is always the
+///    post increment operand.
+/// 2. Make sure that the store value operand(Rt/Rtt) in a store is always the
+///    last operand.
+///
+
+// memd(Re=#U)=Rtt
+let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT in {
+def STrid_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1),
+            (ins DoubleRegs:$src1, u0AlwaysExt:$src2),
+            "memd($dst1=##$src2) = $src1",
+            []>,
+            Requires<[HasV4T]>;
+
+// memb(Re=#U)=Rs
+def STrib_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1),
+            (ins IntRegs:$src1, u0AlwaysExt:$src2),
+            "memb($dst1=##$src2) = $src1",
+            []>,
+            Requires<[HasV4T]>;
+
+// memh(Re=#U)=Rs
+def STrih_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1),
+            (ins IntRegs:$src1, u0AlwaysExt:$src2),
+            "memh($dst1=##$src2) = $src1",
+            []>,
+            Requires<[HasV4T]>;
+
+// memw(Re=#U)=Rs
+def STriw_abs_setimm_V4 : STInst2<(outs IntRegs:$dst1),
+            (ins IntRegs:$src1, u0AlwaysExt:$src2),
+            "memw($dst1=##$src2) = $src1",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// memd(Re=#U)=Rtt
+let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT in {
+def STrid_abs_set_V4 : STInst2<(outs IntRegs:$dst1),
+            (ins DoubleRegs:$src1, globaladdressExt:$src2),
+            "memd($dst1=##$src2) = $src1",
+            []>,
+            Requires<[HasV4T]>;
+
+// memb(Re=#U)=Rs
+def STrib_abs_set_V4 : STInst2<(outs IntRegs:$dst1),
+            (ins IntRegs:$src1, globaladdressExt:$src2),
+            "memb($dst1=##$src2) = $src1",
+            []>,
+            Requires<[HasV4T]>;
+
+// memh(Re=#U)=Rs
+def STrih_abs_set_V4 : STInst2<(outs IntRegs:$dst1),
+            (ins IntRegs:$src1, globaladdressExt:$src2),
+            "memh($dst1=##$src2) = $src1",
+            []>,
+            Requires<[HasV4T]>;
+
+// memw(Re=#U)=Rs
+def STriw_abs_set_V4 : STInst2<(outs IntRegs:$dst1),
+            (ins IntRegs:$src1, globaladdressExt:$src2),
+            "memw($dst1=##$src2) = $src1",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// multiclass for store instructions with base + register offset addressing
+// mode
+multiclass ST_Idxd_shl_Pbase<string mnemonic, RegisterClass RC, bit isNot,
+                             bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : STInst2<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
+                 RC:$src5),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+$src3<<#$src4) = $src5",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Idxd_shl_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ST_Idxd_shl_Pbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Idxd_shl_Pbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isNVStorable = 1 in
+multiclass ST_Idxd_shl<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    let isPredicable = 1 in
+    def NAME#_V4 : STInst2<(outs),
+            (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, RC:$src4),
+            mnemonic#"($src1+$src2<<#$src3) = $src4",
+            []>,
+            Requires<[HasV4T]>;
+
+    let isPredicated = 1 in {
+      defm Pt_V4 : ST_Idxd_shl_Pred<mnemonic, RC, 0 >;
+      defm NotPt_V4 : ST_Idxd_shl_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+// multiclass for new-value store instructions with base + register offset
+// addressing mode.
+multiclass ST_Idxd_shl_Pbase_nv<string mnemonic, RegisterClass RC, bit isNot,
+                             bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2Imm:$src4,
+                 RC:$src5),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+$src3<<#$src4) = $src5.new",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Idxd_shl_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ST_Idxd_shl_Pbase_nv<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Idxd_shl_Pbase_nv<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1 in
+multiclass ST_Idxd_shl_nv<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
+    let isPredicable = 1 in
+    def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, IntRegs:$src2, u2Imm:$src3, RC:$src4),
+            mnemonic#"($src1+$src2<<#$src3) = $src4.new",
+            []>,
+            Requires<[HasV4T]>;
+
+    let isPredicated = 1 in {
+      defm Pt : ST_Idxd_shl_Pred_nv<mnemonic, RC, 0 >;
+      defm NotPt : ST_Idxd_shl_Pred_nv<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = BaseRegOffset, neverHasSideEffects = 1,
+validSubTargets = HasV4SubT in {
+  defm STrib_indexed_shl: ST_Idxd_shl<"memb", "STrib", IntRegs>,
+                          ST_Idxd_shl_nv<"memb", "STrib", IntRegs>, AddrModeRel;
+
+  defm STrih_indexed_shl: ST_Idxd_shl<"memh", "STrih", IntRegs>,
+                          ST_Idxd_shl_nv<"memh", "STrih", IntRegs>, AddrModeRel;
+
+  defm STriw_indexed_shl: ST_Idxd_shl<"memw", "STriw", IntRegs>,
+                          ST_Idxd_shl_nv<"memw", "STriw", IntRegs>, AddrModeRel;
+
+  let isNVStorable = 0 in
+  defm STrid_indexed_shl: ST_Idxd_shl<"memd", "STrid", DoubleRegs>, AddrModeRel;
+}
+
+let Predicates = [HasV4T], AddedComplexity = 10 in {
+def : Pat<(truncstorei8 (i32 IntRegs:$src4),
+                       (add IntRegs:$src1, (shl IntRegs:$src2,
+                                                u2ImmPred:$src3))),
+          (STrib_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
+                                u2ImmPred:$src3, IntRegs:$src4)>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src4),
+                        (add IntRegs:$src1, (shl IntRegs:$src2,
+                                                 u2ImmPred:$src3))),
+          (STrih_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
+                                u2ImmPred:$src3, IntRegs:$src4)>;
+
+def : Pat<(store (i32 IntRegs:$src4),
+                 (add IntRegs:$src1, (shl IntRegs:$src2, u2ImmPred:$src3))),
+          (STriw_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
+                                u2ImmPred:$src3, IntRegs:$src4)>;
+
+def : Pat<(store (i64 DoubleRegs:$src4),
+                (add IntRegs:$src1, (shl IntRegs:$src2, u2ImmPred:$src3))),
+          (STrid_indexed_shl_V4 IntRegs:$src1, IntRegs:$src2,
+                                u2ImmPred:$src3, DoubleRegs:$src4)>;
+}
+
+// memd(Ru<<#u2+#U6)=Rtt
+let isExtended = 1, opExtendable = 2, AddedComplexity = 10,
+validSubTargets = HasV4SubT in
+def STrid_shl_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, DoubleRegs:$src4),
+            "memd($src1<<#$src2+#$src3) = $src4",
+            [(store (i64 DoubleRegs:$src4),
+                    (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+                         u0AlwaysExtPred:$src3))]>,
+            Requires<[HasV4T]>;
+
+// memd(Rx++#s4:3)=Rtt
+// memd(Rx++#s4:3:circ(Mu))=Rtt
+// memd(Rx++I:circ(Mu))=Rtt
+// memd(Rx++Mu)=Rtt
+// memd(Rx++Mu:brev)=Rtt
+// memd(gp+#u16:3)=Rtt
+
+// Store doubleword conditionally.
+// if ([!]Pv[.new]) memd(#u6)=Rtt
+// TODO: needs to be implemented.
+
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with base + immediate offset
+// addressing mode and immediate stored value.
+// mem[bhw](Rx++#s4:3)=#s8
+// if ([!]Pv[.new]) mem[bhw](Rx++#s4:3)=#s6
+//===----------------------------------------------------------------------===//
+multiclass ST_Imm_Pbase<string mnemonic, Operand OffsetOp, bit isNot,
+                        bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : STInst2<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, OffsetOp:$src3, s6Ext:$src4),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+#$src3) = #$src4",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Imm_Pred<string mnemonic, Operand OffsetOp, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ST_Imm_Pbase<mnemonic, OffsetOp, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Imm_Pbase<mnemonic, OffsetOp, PredNot, 1>;
+  }
+}
+
+let isExtendable = 1, isExtentSigned = 1, neverHasSideEffects = 1 in
+multiclass ST_Imm<string mnemonic, string CextOp, Operand OffsetOp> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_imm in {
+    let opExtendable = 2, opExtentBits = 8, isPredicable = 1 in
+    def NAME#_V4 : STInst2<(outs),
+            (ins IntRegs:$src1, OffsetOp:$src2, s8Ext:$src3),
+            mnemonic#"($src1+#$src2) = #$src3",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 3, opExtentBits = 6, isPredicated = 1 in {
+      defm Pt_V4 : ST_Imm_Pred<mnemonic, OffsetOp, 0>;
+      defm NotPt_V4 : ST_Imm_Pred<mnemonic, OffsetOp, 1 >;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, InputType = "imm",
+    validSubTargets = HasV4SubT in {
+  defm STrib_imm : ST_Imm<"memb", "STrib", u6_0Imm>, ImmRegRel, PredNewRel;
+  defm STrih_imm : ST_Imm<"memh", "STrih", u6_1Imm>, ImmRegRel, PredNewRel;
+  defm STriw_imm : ST_Imm<"memw", "STriw", u6_2Imm>, ImmRegRel, PredNewRel;
+}
+
+let Predicates = [HasV4T], AddedComplexity = 10 in {
+def: Pat<(truncstorei8 s8ExtPred:$src3, (add IntRegs:$src1, u6_0ImmPred:$src2)),
+            (STrib_imm_V4 IntRegs:$src1, u6_0ImmPred:$src2, s8ExtPred:$src3)>;
+
+def: Pat<(truncstorei16 s8ExtPred:$src3, (add IntRegs:$src1,
+                                              u6_1ImmPred:$src2)),
+            (STrih_imm_V4 IntRegs:$src1, u6_1ImmPred:$src2, s8ExtPred:$src3)>;
+
+def: Pat<(store s8ExtPred:$src3, (add IntRegs:$src1, u6_2ImmPred:$src2)),
+            (STriw_imm_V4 IntRegs:$src1, u6_2ImmPred:$src2, s8ExtPred:$src3)>;
+}
+
+let AddedComplexity = 6 in
+def : Pat <(truncstorei8 s8ExtPred:$src2, (i32 IntRegs:$src1)),
+           (STrib_imm_V4 IntRegs:$src1, 0, s8ExtPred:$src2)>,
+           Requires<[HasV4T]>;
+
+// memb(Ru<<#u2+#U6)=Rt
+let isExtended = 1, opExtendable = 2, AddedComplexity = 10, isNVStorable = 1,
+validSubTargets = HasV4SubT in
+def STrib_shl_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4),
+            "memb($src1<<#$src2+#$src3) = $src4",
+            [(truncstorei8 (i32 IntRegs:$src4),
+                           (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+                                u0AlwaysExtPred:$src3))]>,
+            Requires<[HasV4T]>;
+
+// memb(Rx++#s4:0:circ(Mu))=Rt
+// memb(Rx++I:circ(Mu))=Rt
+// memb(Rx++Mu)=Rt
+// memb(Rx++Mu:brev)=Rt
+// memb(gp+#u16:0)=Rt
+
+
+// Store halfword.
+// TODO: needs to be implemented
+// memh(Re=#U6)=Rt.H
+// memh(Rs+#s11:1)=Rt.H
+let AddedComplexity = 6 in
+def : Pat <(truncstorei16 s8ExtPred:$src2, (i32 IntRegs:$src1)),
+           (STrih_imm_V4 IntRegs:$src1, 0, s8ExtPred:$src2)>,
+           Requires<[HasV4T]>;
+
+// memh(Rs+Ru<<#u2)=Rt.H
+// TODO: needs to be implemented.
+
+// memh(Ru<<#u2+#U6)=Rt.H
+// memh(Ru<<#u2+#U6)=Rt
+let isExtended = 1, opExtendable = 2, AddedComplexity = 10, isNVStorable = 1,
+validSubTargets = HasV4SubT in
+def STrih_shl_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4),
+            "memh($src1<<#$src2+#$src3) = $src4",
+            [(truncstorei16 (i32 IntRegs:$src4),
+                            (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+                                 u0AlwaysExtPred:$src3))]>,
+            Requires<[HasV4T]>;
+
+// memh(Rx++#s4:1:circ(Mu))=Rt.H
+// memh(Rx++#s4:1:circ(Mu))=Rt
+// memh(Rx++I:circ(Mu))=Rt.H
+// memh(Rx++I:circ(Mu))=Rt
+// memh(Rx++Mu)=Rt.H
+// memh(Rx++Mu)=Rt
+// memh(Rx++Mu:brev)=Rt.H
+// memh(Rx++Mu:brev)=Rt
+// memh(gp+#u16:1)=Rt
+// if ([!]Pv[.new]) memh(#u6)=Rt.H
+// if ([!]Pv[.new]) memh(#u6)=Rt
+
+
+// if ([!]Pv[.new]) memh(Rs+#u6:1)=Rt.H
+// TODO: needs to be implemented.
+
+// if ([!]Pv[.new]) memh(Rx++#s4:1)=Rt.H
+// TODO: Needs to be implemented.
+
+// Store word.
+// memw(Re=#U6)=Rt
+// TODO: Needs to be implemented.
+
+// Store predicate:
+let neverHasSideEffects = 1 in
+def STriw_pred_V4 : STInst2<(outs),
+            (ins MEMri:$addr, PredRegs:$src1),
+            "Error; should not emit",
+            []>,
+            Requires<[HasV4T]>;
+
+let AddedComplexity = 6 in
+def : Pat <(store s8ExtPred:$src2, (i32 IntRegs:$src1)),
+           (STriw_imm_V4 IntRegs:$src1, 0, s8ExtPred:$src2)>,
+           Requires<[HasV4T]>;
+
+// memw(Ru<<#u2+#U6)=Rt
+let isExtended = 1, opExtendable = 2, AddedComplexity = 10, isNVStorable = 1,
+validSubTargets = HasV4SubT in
+def STriw_shl_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4),
+            "memw($src1<<#$src2+#$src3) = $src4",
+            [(store (i32 IntRegs:$src4),
+                    (add (shl (i32 IntRegs:$src1), u2ImmPred:$src2),
+                              u0AlwaysExtPred:$src3))]>,
+            Requires<[HasV4T]>;
+
+// memw(Rx++#s4:2)=Rt
+// memw(Rx++#s4:2:circ(Mu))=Rt
+// memw(Rx++I:circ(Mu))=Rt
+// memw(Rx++Mu)=Rt
+// memw(Rx++Mu:brev)=Rt
+
+//===----------------------------------------------------------------------===
+// ST -
+//===----------------------------------------------------------------------===
+
+
+//===----------------------------------------------------------------------===//
+// NV/ST +
+//===----------------------------------------------------------------------===//
+
+// multiclass for new-value store instructions with base + immediate offset.
+//
+multiclass ST_Idxd_Pbase_nv<string mnemonic, RegisterClass RC,
+                            Operand predImmOp, bit isNot, bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC: $src4),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2+#$src3) = $src4.new",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Idxd_Pred_nv<string mnemonic, RegisterClass RC, Operand predImmOp,
+                           bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ST_Idxd_Pbase_nv<mnemonic, RC, predImmOp, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Idxd_Pbase_nv<mnemonic, RC, predImmOp, PredNot, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1, neverHasSideEffects = 1, isExtendable = 1 in
+multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
+                   Operand ImmOp, Operand predImmOp, bits<5> ImmBits,
+                   bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
+    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
+    isPredicable = 1 in
+    def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
+            mnemonic#"($src1+#$src2) = $src3.new",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits,
+    isPredicated = 1 in {
+      defm Pt : ST_Idxd_Pred_nv<mnemonic, RC, predImmOp, 0>;
+      defm NotPt : ST_Idxd_Pred_nv<mnemonic, RC, predImmOp, 1>;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, validSubTargets = HasV4SubT in {
+  defm STrib_indexed: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
+                                 u6_0Ext, 11, 6>, AddrModeRel;
+  defm STrih_indexed: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
+                                 u6_1Ext, 12, 7>, AddrModeRel;
+  defm STriw_indexed: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
+                                 u6_2Ext, 13, 8>, AddrModeRel;
+}
+
+// multiclass for new-value store instructions with base + immediate offset.
+// and MEMri operand.
+multiclass ST_MEMri_Pbase_nv<string mnemonic, RegisterClass RC, bit isNot,
+                          bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, MEMri:$addr, RC: $src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($addr) = $src2.new",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_MEMri_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ST_MEMri_Pbase_nv<mnemonic, RC, PredNot, 0>;
+
+    // Predicate new
+    defm _cdn#NAME : ST_MEMri_Pbase_nv<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1, isExtendable = 1, neverHasSideEffects = 1 in
+multiclass ST_MEMri_nv<string mnemonic, string CextOp, RegisterClass RC,
+                    bits<5> ImmBits, bits<5> PredImmBits> {
+
+  let CextOpcode = CextOp, BaseOpcode = CextOp in {
+    let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits,
+         isPredicable = 1 in
+    def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins MEMri:$addr, RC:$src),
+            mnemonic#"($addr) = $src.new",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits,
+        neverHasSideEffects = 1, isPredicated = 1 in {
+      defm Pt : ST_MEMri_Pred_nv<mnemonic, RC, 0>;
+      defm NotPt : ST_MEMri_Pred_nv<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = BaseImmOffset, isMEMri = "true", validSubTargets = HasV4SubT,
+mayStore = 1 in {
+  defm STrib: ST_MEMri_nv<"memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+  defm STrih: ST_MEMri_nv<"memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
+  defm STriw: ST_MEMri_nv<"memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+}
+
+// memb(Ru<<#u2+#U6)=Nt.new
+let isExtended = 1, opExtendable = 2, mayStore = 1, AddedComplexity = 10,
+isNVStore = 1, validSubTargets = HasV4SubT in
+def STrib_shl_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4),
+            "memb($src1<<#$src2+#$src3) = $src4.new",
+            []>,
+            Requires<[HasV4T]>;
+
+//===----------------------------------------------------------------------===//
+// Post increment store
+// mem[bhwd](Rx++#s4:[0123])=Nt.new
+//===----------------------------------------------------------------------===//
+
+multiclass ST_PostInc_Pbase_nv<string mnemonic, RegisterClass RC, Operand ImmOp,
+                            bit isNot, bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME#_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"($src2++#$offset) = $src3.new",
+            [],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_PostInc_Pred_nv<string mnemonic, RegisterClass RC,
+                           Operand ImmOp, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ST_PostInc_Pbase_nv<mnemonic, RC, ImmOp, PredNot, 0>;
+    // Predicate new
+    let Predicates = [HasV4T], validSubTargets = HasV4SubT in
+    defm _cdn#NAME : ST_PostInc_Pbase_nv<mnemonic, RC, ImmOp, PredNot, 1>;
+  }
+}
+
+let hasCtrlDep = 1, isNVStore = 1, neverHasSideEffects = 1 in
+multiclass ST_PostInc_nv<string mnemonic, string BaseOp, RegisterClass RC,
+                      Operand ImmOp> {
+
+  let BaseOpcode = "POST_"#BaseOp in {
+    let isPredicable = 1 in
+    def NAME#_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst),
+                (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
+                mnemonic#"($src1++#$offset) = $src2.new",
+                [],
+                "$src1 = $dst">,
+                Requires<[HasV4T]>;
+
+    let isPredicated = 1 in {
+      defm Pt : ST_PostInc_Pred_nv<mnemonic, RC, ImmOp, 0 >;
+      defm NotPt : ST_PostInc_Pred_nv<mnemonic, RC, ImmOp, 1 >;
+    }
+  }
+}
+
+let validSubTargets = HasV4SubT in {
+defm POST_STbri: ST_PostInc_nv <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel;
+defm POST_SThri: ST_PostInc_nv <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel;
+defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
+}
+
+// memb(Rx++#s4:0:circ(Mu))=Nt.new
+// memb(Rx++I:circ(Mu))=Nt.new
+// memb(Rx++Mu)=Nt.new
+// memb(Rx++Mu:brev)=Nt.new
+// memh(Ru<<#u2+#U6)=Nt.new
+let isExtended = 1, opExtendable = 2, mayStore = 1, AddedComplexity = 10,
+isNVStore = 1, validSubTargets = HasV4SubT in
+def STrih_shl_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4),
+            "memh($src1<<#$src2+#$src3) = $src4.new",
+            []>,
+            Requires<[HasV4T]>;
+
+// memh(Rx++#s4:1:circ(Mu))=Nt.new
+// memh(Rx++I:circ(Mu))=Nt.new
+// memh(Rx++Mu)=Nt.new
+// memh(Rx++Mu:brev)=Nt.new
+
+// memw(Ru<<#u2+#U6)=Nt.new
+let isExtended = 1, opExtendable = 2, mayStore = 1, AddedComplexity = 10,
+isNVStore = 1, validSubTargets = HasV4SubT in
+def STriw_shl_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, u0AlwaysExt:$src3, IntRegs:$src4),
+            "memw($src1<<#$src2+#$src3) = $src4.new",
+            []>,
+            Requires<[HasV4T]>;
+
+// memw(Rx++#s4:2:circ(Mu))=Nt.new
+// memw(Rx++I:circ(Mu))=Nt.new
+// memw(Rx++Mu)=Nt.new
+// memw(Rx++Mu:brev)=Nt.new
+
+//===----------------------------------------------------------------------===//
+// NV/ST -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NV/J +
+//===----------------------------------------------------------------------===//
+
+multiclass NVJ_type_basic_reg<string NotStr, string OpcStr, string TakenStr> {
+  def _ie_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1.new, $src2)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1.new, $src2)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass NVJ_type_basic_2ndDotNew<string NotStr, string OpcStr,
+                                                   string TakenStr> {
+  def _ie_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1, $src2.new)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1, $src2.new)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass NVJ_type_basic_imm<string NotStr, string OpcStr, string TakenStr> {
+  def _ie_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1.new, #$src2)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1.new, #$src2)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass NVJ_type_basic_neg<string NotStr, string OpcStr, string TakenStr> {
+  def _ie_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, nOneImm:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1.new, #$src2)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, nOneImm:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1.new, #$src2)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass NVJ_type_basic_tstbit<string NotStr, string OpcStr,
+                                                string TakenStr> {
+  def _ie_nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, u1Imm:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1.new, #$src2)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _nv_V4 : NVInst_V4<(outs),
+            (ins IntRegs:$src1, u1Imm:$src2, brtarget:$offset),
+            !strconcat("if (", !strconcat(NotStr, !strconcat(OpcStr,
+            !strconcat("($src1.new, #$src2)) jump:",
+            !strconcat(TakenStr, " $offset"))))),
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// Multiclass for regular dot new of Ist operand register.
+multiclass NVJ_type_br_pred_reg<string NotStr, string OpcStr> {
+  defm Pt  : NVJ_type_basic_reg<NotStr, OpcStr, "t">;
+  defm Pnt : NVJ_type_basic_reg<NotStr, OpcStr, "nt">;
+}
+
+// Multiclass for dot new of 2nd operand register.
+multiclass NVJ_type_br_pred_2ndDotNew<string NotStr, string OpcStr> {
+  defm Pt  : NVJ_type_basic_2ndDotNew<NotStr, OpcStr, "t">;
+  defm Pnt : NVJ_type_basic_2ndDotNew<NotStr, OpcStr, "nt">;
+}
+
+// Multiclass for 2nd operand immediate, including -1.
+multiclass NVJ_type_br_pred_imm<string NotStr, string OpcStr> {
+  defm Pt     : NVJ_type_basic_imm<NotStr, OpcStr, "t">;
+  defm Pnt    : NVJ_type_basic_imm<NotStr, OpcStr, "nt">;
+  defm Ptneg  : NVJ_type_basic_neg<NotStr, OpcStr, "t">;
+  defm Pntneg : NVJ_type_basic_neg<NotStr, OpcStr, "nt">;
+}
+
+// Multiclass for 2nd operand immediate, excluding -1.
+multiclass NVJ_type_br_pred_imm_only<string NotStr, string OpcStr> {
+  defm Pt     : NVJ_type_basic_imm<NotStr, OpcStr, "t">;
+  defm Pnt    : NVJ_type_basic_imm<NotStr, OpcStr, "nt">;
+}
+
+// Multiclass for tstbit, where 2nd operand is always #0.
+multiclass NVJ_type_br_pred_tstbit<string NotStr, string OpcStr> {
+  defm Pt     : NVJ_type_basic_tstbit<NotStr, OpcStr, "t">;
+  defm Pnt    : NVJ_type_basic_tstbit<NotStr, OpcStr, "nt">;
+}
+
+// Multiclass for GT.
+multiclass NVJ_type_rr_ri<string OpcStr> {
+  defm rrNot   : NVJ_type_br_pred_reg<"!", OpcStr>;
+  defm rr      : NVJ_type_br_pred_reg<"",  OpcStr>;
+  defm rrdnNot : NVJ_type_br_pred_2ndDotNew<"!", OpcStr>;
+  defm rrdn    : NVJ_type_br_pred_2ndDotNew<"",  OpcStr>;
+  defm riNot   : NVJ_type_br_pred_imm<"!", OpcStr>;
+  defm ri      : NVJ_type_br_pred_imm<"",  OpcStr>;
+}
+
+// Multiclass for EQ.
+multiclass NVJ_type_rr_ri_no_2ndDotNew<string OpcStr> {
+  defm rrNot   : NVJ_type_br_pred_reg<"!", OpcStr>;
+  defm rr      : NVJ_type_br_pred_reg<"",  OpcStr>;
+  defm riNot   : NVJ_type_br_pred_imm<"!", OpcStr>;
+  defm ri      : NVJ_type_br_pred_imm<"",  OpcStr>;
+}
+
+// Multiclass for GTU.
+multiclass NVJ_type_rr_ri_no_nOne<string OpcStr> {
+  defm rrNot   : NVJ_type_br_pred_reg<"!", OpcStr>;
+  defm rr      : NVJ_type_br_pred_reg<"",  OpcStr>;
+  defm rrdnNot : NVJ_type_br_pred_2ndDotNew<"!", OpcStr>;
+  defm rrdn    : NVJ_type_br_pred_2ndDotNew<"",  OpcStr>;
+  defm riNot   : NVJ_type_br_pred_imm_only<"!", OpcStr>;
+  defm ri      : NVJ_type_br_pred_imm_only<"",  OpcStr>;
+}
+
+// Multiclass for tstbit.
+multiclass NVJ_type_r0<string OpcStr> {
+  defm r0Not : NVJ_type_br_pred_tstbit<"!", OpcStr>;
+  defm r0    : NVJ_type_br_pred_tstbit<"",  OpcStr>;
+ }
+
+// Base Multiclass for New Value Jump.
+multiclass NVJ_type {
+  defm GT     : NVJ_type_rr_ri<"cmp.gt">;
+  defm EQ     : NVJ_type_rr_ri_no_2ndDotNew<"cmp.eq">;
+  defm GTU    : NVJ_type_rr_ri_no_nOne<"cmp.gtu">;
+  defm TSTBIT : NVJ_type_r0<"tstbit">;
+}
+
+let isBranch = 1, isTerminator=1, neverHasSideEffects = 1, Defs = [PC] in {
+  defm JMP_ : NVJ_type;
+}
+
+//===----------------------------------------------------------------------===//
+// NV/J -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XTYPE/ALU +
+//===----------------------------------------------------------------------===//
+
+//  Add and accumulate.
+//  Rd=add(Rs,add(Ru,#s6))
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 6,
+validSubTargets = HasV4SubT in
+def ADDr_ADDri_V4 : MInst<(outs IntRegs:$dst),
+          (ins IntRegs:$src1, IntRegs:$src2, s6Ext:$src3),
+          "$dst = add($src1, add($src2, #$src3))",
+          [(set (i32 IntRegs:$dst),
+           (add (i32 IntRegs:$src1), (add (i32 IntRegs:$src2),
+                                          s6_16ExtPred:$src3)))]>,
+          Requires<[HasV4T]>;
+
+//  Rd=add(Rs,sub(#s6,Ru))
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 6,
+validSubTargets = HasV4SubT in
+def ADDr_SUBri_V4 : MInst<(outs IntRegs:$dst),
+          (ins IntRegs:$src1, s6Ext:$src2, IntRegs:$src3),
+          "$dst = add($src1, sub(#$src2, $src3))",
+          [(set (i32 IntRegs:$dst),
+           (add (i32 IntRegs:$src1), (sub s6_10ExtPred:$src2,
+                                          (i32 IntRegs:$src3))))]>,
+          Requires<[HasV4T]>;
+
+// Generates the same instruction as ADDr_SUBri_V4 but matches different
+// pattern.
+//  Rd=add(Rs,sub(#s6,Ru))
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 6,
+validSubTargets = HasV4SubT in
+def ADDri_SUBr_V4 : MInst<(outs IntRegs:$dst),
+          (ins IntRegs:$src1, s6Ext:$src2, IntRegs:$src3),
+          "$dst = add($src1, sub(#$src2, $src3))",
+          [(set (i32 IntRegs:$dst),
+                (sub (add (i32 IntRegs:$src1), s6_10ExtPred:$src2),
+                     (i32 IntRegs:$src3)))]>,
+          Requires<[HasV4T]>;
+
+
+//  Add or subtract doublewords with carry.
+//TODO:
+//  Rdd=add(Rss,Rtt,Px):carry
+//TODO:
+//  Rdd=sub(Rss,Rtt,Px):carry
+
+
+//  Logical doublewords.
+//  Rdd=and(Rtt,~Rss)
+let validSubTargets = HasV4SubT in
+def ANDd_NOTd_V4 : MInst<(outs DoubleRegs:$dst),
+          (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          "$dst = and($src1, ~$src2)",
+          [(set (i64 DoubleRegs:$dst), (and (i64 DoubleRegs:$src1),
+                                      (not (i64 DoubleRegs:$src2))))]>,
+          Requires<[HasV4T]>;
+
+//  Rdd=or(Rtt,~Rss)
+let validSubTargets = HasV4SubT in
+def ORd_NOTd_V4 : MInst<(outs DoubleRegs:$dst),
+          (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          "$dst = or($src1, ~$src2)",
+          [(set (i64 DoubleRegs:$dst),
+           (or (i64 DoubleRegs:$src1), (not (i64 DoubleRegs:$src2))))]>,
+          Requires<[HasV4T]>;
+
+
+//  Logical-logical doublewords.
+//  Rxx^=xor(Rss,Rtt)
+let validSubTargets = HasV4SubT in
+def XORd_XORdd: MInst_acc<(outs DoubleRegs:$dst),
+          (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+          "$dst ^= xor($src2, $src3)",
+          [(set (i64 DoubleRegs:$dst),
+           (xor (i64 DoubleRegs:$src1), (xor (i64 DoubleRegs:$src2),
+                                             (i64 DoubleRegs:$src3))))],
+          "$src1 = $dst">,
+          Requires<[HasV4T]>;
+
+
+// Logical-logical words.
+// Rx=or(Ru,and(Rx,#s10))
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10,
+validSubTargets = HasV4SubT in
+def ORr_ANDri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3),
+            "$dst = or($src1, and($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                s10ExtPred:$src3)))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=and(Rs,Rt)
+// Rx&=and(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def ANDr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= and($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                 (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=and(Rs,Rt)
+let validSubTargets = HasV4SubT, CextOpcode = "ORr_ANDr", InputType = "reg" in
+def ORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= and($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rx^=and(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def XORr_ANDrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= and($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+             (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=and(Rs,~Rt)
+// Rx&=and(Rs,~Rt)
+let validSubTargets = HasV4SubT in
+def ANDr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= and($src2, ~$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                 (not (i32 IntRegs:$src3)))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=and(Rs,~Rt)
+let validSubTargets = HasV4SubT in
+def ORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= and($src2, ~$src3)",
+            [(set (i32 IntRegs:$dst),
+             (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                           (not (i32 IntRegs:$src3)))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx^=and(Rs,~Rt)
+let validSubTargets = HasV4SubT in
+def XORr_ANDr_NOTr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= and($src2, ~$src3)",
+            [(set (i32 IntRegs:$dst),
+             (xor (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                            (not (i32 IntRegs:$src3)))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=or(Rs,Rt)
+// Rx&=or(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def ANDr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= or($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+                                                (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=or(Rs,Rt)
+let validSubTargets = HasV4SubT, CextOpcode = "ORr_ORr", InputType = "reg" in
+def ORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= or($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+                                               (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rx^=or(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def XORr_ORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= or($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+             (xor (i32 IntRegs:$src1), (or (i32 IntRegs:$src2),
+                                           (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx[&|^]=xor(Rs,Rt)
+// Rx&=xor(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def ANDr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst &= xor($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
+                                                 (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=xor(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def ORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst |= xor($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+                  (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
+                                                 (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx^=xor(Rs,Rt)
+let validSubTargets = HasV4SubT in
+def XORr_XORrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, IntRegs:$src3),
+            "$dst ^= xor($src2, $src3)",
+            [(set (i32 IntRegs:$dst),
+             (and (i32 IntRegs:$src1), (xor (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx|=and(Rs,#s10)
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10,
+validSubTargets = HasV4SubT, CextOpcode = "ORr_ANDr", InputType = "imm" in
+def ORr_ANDri2_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3),
+            "$dst |= and($src2, #$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                s10ExtPred:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rx|=or(Rs,#s10)
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 1, opExtentBits = 10,
+validSubTargets = HasV4SubT, CextOpcode = "ORr_ORr", InputType = "imm" in
+def ORr_ORri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs: $src2, s10Ext:$src3),
+            "$dst |= or($src2, #$src3)",
+            [(set (i32 IntRegs:$dst),
+                  (or (i32 IntRegs:$src1), (and (i32 IntRegs:$src2),
+                                                s10ExtPred:$src3)))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+
+//    Modulo wrap
+//        Rd=modwrap(Rs,Rt)
+//    Round
+//        Rd=cround(Rs,#u5)
+//        Rd=cround(Rs,Rt)
+//        Rd=round(Rs,#u5)[:sat]
+//        Rd=round(Rs,Rt)[:sat]
+//    Vector reduce add unsigned halfwords
+//        Rd=vraddh(Rss,Rtt)
+//    Vector add bytes
+//        Rdd=vaddb(Rss,Rtt)
+//    Vector conditional negate
+//        Rdd=vcnegh(Rss,Rt)
+//        Rxx+=vrcnegh(Rss,Rt)
+//    Vector maximum bytes
+//        Rdd=vmaxb(Rtt,Rss)
+//    Vector reduce maximum halfwords
+//        Rxx=vrmaxh(Rss,Ru)
+//        Rxx=vrmaxuh(Rss,Ru)
+//    Vector reduce maximum words
+//        Rxx=vrmaxuw(Rss,Ru)
+//        Rxx=vrmaxw(Rss,Ru)
+//    Vector minimum bytes
+//        Rdd=vminb(Rtt,Rss)
+//    Vector reduce minimum halfwords
+//        Rxx=vrminh(Rss,Ru)
+//        Rxx=vrminuh(Rss,Ru)
+//    Vector reduce minimum words
+//        Rxx=vrminuw(Rss,Ru)
+//        Rxx=vrminw(Rss,Ru)
+//    Vector subtract bytes
+//        Rdd=vsubb(Rss,Rtt)
+
+//===----------------------------------------------------------------------===//
+// XTYPE/ALU -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY +
+//===----------------------------------------------------------------------===//
+
+// Multiply and user lower result.
+// Rd=add(#u6,mpyi(Rs,#U6))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 6,
+validSubTargets = HasV4SubT in
+def ADDi_MPYri_V4 : MInst<(outs IntRegs:$dst),
+            (ins u6Ext:$src1, IntRegs:$src2, u6Imm:$src3),
+            "$dst = add(#$src1, mpyi($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (mul (i32 IntRegs:$src2), u6ImmPred:$src3),
+                       u6ExtPred:$src1))]>,
+            Requires<[HasV4T]>;
+
+// Rd=add(##,mpyi(Rs,#U6))
+def : Pat <(add (mul (i32 IntRegs:$src2), u6ImmPred:$src3),
+                     (HexagonCONST32 tglobaladdr:$src1)),
+           (i32 (ADDi_MPYri_V4 tglobaladdr:$src1, IntRegs:$src2,
+                               u6ImmPred:$src3))>;
+
+// Rd=add(#u6,mpyi(Rs,Rt))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 6,
+validSubTargets = HasV4SubT, InputType = "imm", CextOpcode = "ADD_MPY" in
+def ADDi_MPYrr_V4 : MInst<(outs IntRegs:$dst),
+            (ins u6Ext:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst = add(#$src1, mpyi($src2, $src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
+                       u6ExtPred:$src1))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rd=add(##,mpyi(Rs,Rt))
+def : Pat <(add (mul (i32 IntRegs:$src2), (i32 IntRegs:$src3)),
+                     (HexagonCONST32 tglobaladdr:$src1)),
+           (i32 (ADDi_MPYrr_V4 tglobaladdr:$src1, IntRegs:$src2,
+                               IntRegs:$src3))>;
+
+// Rd=add(Ru,mpyi(#u6:2,Rs))
+let validSubTargets = HasV4SubT in
+def ADDr_MPYir_V4 : MInst<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, u6Imm:$src2, IntRegs:$src3),
+            "$dst = add($src1, mpyi(#$src2, $src3))",
+            [(set (i32 IntRegs:$dst),
+             (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src3),
+                                            u6_2ImmPred:$src2)))]>,
+            Requires<[HasV4T]>;
+
+// Rd=add(Ru,mpyi(Rs,#u6))
+let isExtendable = 1, opExtendable = 3, isExtentSigned = 0, opExtentBits = 6,
+validSubTargets = HasV4SubT, InputType = "imm", CextOpcode = "ADD_MPY" in
+def ADDr_MPYri_V4 : MInst<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, u6Ext:$src3),
+            "$dst = add($src1, mpyi($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
+                                                 u6ExtPred:$src3)))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Rx=add(Ru,mpyi(Rx,Rs))
+let validSubTargets = HasV4SubT, InputType = "reg", CextOpcode = "ADD_MPY" in
+def ADDr_MPYrr_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+            "$dst = add($src1, mpyi($src2, $src3))",
+            [(set (i32 IntRegs:$dst),
+             (add (i32 IntRegs:$src1), (mul (i32 IntRegs:$src2),
+                                            (i32 IntRegs:$src3))))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>, ImmRegRel;
+
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+// Rxx^=pmpyw(Rs,Rt)
+
+// Vector reduce multiply word by signed half (32x16)
+// Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+// Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+// Rxx+=vrmpyweh(Rss,Rtt)[:<<1]
+// Rxx+=vrmpywoh(Rss,Rtt)[:<<1]
+
+// Multiply and use upper result
+// Rd=mpy(Rs,Rt.H):<<1:sat
+// Rd=mpy(Rs,Rt.L):<<1:sat
+// Rd=mpy(Rs,Rt):<<1
+// Rd=mpy(Rs,Rt):<<1:sat
+// Rd=mpysu(Rs,Rt)
+// Rx+=mpy(Rs,Rt):<<1:sat
+// Rx-=mpy(Rs,Rt):<<1:sat
+
+// Vector multiply bytes
+// Rdd=vmpybsu(Rs,Rt)
+// Rdd=vmpybu(Rs,Rt)
+// Rxx+=vmpybsu(Rs,Rt)
+// Rxx+=vmpybu(Rs,Rt)
+
+// Vector polynomial multiply halfwords
+// Rdd=vpmpyh(Rs,Rt)
+// Rxx^=vpmpyh(Rs,Rt)
+
+//===----------------------------------------------------------------------===//
+// XTYPE/MPY -
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// XTYPE/SHIFT +
+//===----------------------------------------------------------------------===//
+
+// Shift by immediate and accumulate.
+// Rx=add(#u8,asl(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def ADDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = add(#$src1, asl($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx=add(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def ADDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = add(#$src1, lsr($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (add (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx=sub(#u8,asl(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def SUBi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = sub(#$src1, asl($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (sub (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+// Rx=sub(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def SUBi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = sub(#$src1, lsr($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (sub (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+
+//Shift by immediate and logical.
+//Rx=and(#u8,asl(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def ANDi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = and(#$src1, asl($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (and (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rx=and(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+validSubTargets = HasV4SubT in
+def ANDi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = and(#$src1, lsr($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (and (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                       u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rx=or(#u8,asl(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+AddedComplexity = 30, validSubTargets = HasV4SubT in
+def ORi_ASLri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = or(#$src1, asl($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (or (shl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                      u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rx=or(#u8,lsr(Rx,#U5))
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
+AddedComplexity = 30, validSubTargets = HasV4SubT in
+def ORi_LSRri_V4 : MInst_acc<(outs IntRegs:$dst),
+            (ins u8Ext:$src1, IntRegs:$src2, u5Imm:$src3),
+            "$dst = or(#$src1, lsr($src2, #$src3))",
+            [(set (i32 IntRegs:$dst),
+                  (or (srl (i32 IntRegs:$src2), u5ImmPred:$src3),
+                      u8ExtPred:$src1))],
+            "$src2 = $dst">,
+            Requires<[HasV4T]>;
+
+
+//Shift by register.
+//Rd=lsl(#s6,Rt)
+let validSubTargets = HasV4SubT in {
+def LSLi_V4 : MInst<(outs IntRegs:$dst), (ins s6Imm:$src1, IntRegs:$src2),
+            "$dst = lsl(#$src1, $src2)",
+            [(set (i32 IntRegs:$dst), (shl s6ImmPred:$src1,
+                                           (i32 IntRegs:$src2)))]>,
+            Requires<[HasV4T]>;
+
+
+//Shift by register and logical.
+//Rxx^=asl(Rss,Rt)
+def ASLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            "$dst ^= asl($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (xor (i64 DoubleRegs:$src1), (shl (i64 DoubleRegs:$src2),
+                                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rxx^=asr(Rss,Rt)
+def ASRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            "$dst ^= asr($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (xor (i64 DoubleRegs:$src1), (sra (i64 DoubleRegs:$src2),
+                                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rxx^=lsl(Rss,Rt)
+def LSLd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            "$dst ^= lsl($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst), (xor (i64 DoubleRegs:$src1),
+                                              (shl (i64 DoubleRegs:$src2),
+                                                   (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+
+//Rxx^=lsr(Rss,Rt)
+def LSRd_rr_xor_V4 : MInst_acc<(outs DoubleRegs:$dst),
+            (ins DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+            "$dst ^= lsr($src2, $src3)",
+            [(set (i64 DoubleRegs:$dst),
+                  (xor (i64 DoubleRegs:$src1), (srl (i64 DoubleRegs:$src2),
+                                                    (i32 IntRegs:$src3))))],
+            "$src1 = $dst">,
+            Requires<[HasV4T]>;
+}
+
+//===----------------------------------------------------------------------===//
+// XTYPE/SHIFT -
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MEMOP: Word, Half, Byte
+//===----------------------------------------------------------------------===//
+
+def MEMOPIMM : SDNodeXForm<imm, [{
+  // Call the transformation function XformM5ToU5Imm to get the negative
+  // immediate's positive counterpart.
+  int32_t imm = N->getSExtValue();
+  return XformM5ToU5Imm(imm);
+}]>;
+
+def MEMOPIMM_HALF : SDNodeXForm<imm, [{
+  // -1 .. -31 represented as 65535..65515
+  // assigning to a short restores our desired signed value.
+  // Call the transformation function XformM5ToU5Imm to get the negative
+  // immediate's positive counterpart.
+  int16_t imm = N->getSExtValue();
+  return XformM5ToU5Imm(imm);
+}]>;
+
+def MEMOPIMM_BYTE : SDNodeXForm<imm, [{
+  // -1 .. -31 represented as 255..235
+  // assigning to a char restores our desired signed value.
+  // Call the transformation function XformM5ToU5Imm to get the negative
+  // immediate's positive counterpart.
+  int8_t imm = N->getSExtValue();
+  return XformM5ToU5Imm(imm);
+}]>;
+
+def SETMEMIMM : SDNodeXForm<imm, [{
+   // Return the bit position we will set [0-31].
+   // As an SDNode.
+   int32_t imm = N->getSExtValue();
+   return XformMskToBitPosU5Imm(imm);
+}]>;
+
+def CLRMEMIMM : SDNodeXForm<imm, [{
+   // Return the bit position we will clear [0-31].
+   // As an SDNode.
+   // we bit negate the value first
+   int32_t imm = ~(N->getSExtValue());
+   return XformMskToBitPosU5Imm(imm);
+}]>;
+
+def SETMEMIMM_SHORT : SDNodeXForm<imm, [{
+   // Return the bit position we will set [0-15].
+   // As an SDNode.
+   int16_t imm = N->getSExtValue();
+   return XformMskToBitPosU4Imm(imm);
+}]>;
+
+def CLRMEMIMM_SHORT : SDNodeXForm<imm, [{
+   // Return the bit position we will clear [0-15].
+   // As an SDNode.
+   // we bit negate the value first
+   int16_t imm = ~(N->getSExtValue());
+   return XformMskToBitPosU4Imm(imm);
+}]>;
+
+def SETMEMIMM_BYTE : SDNodeXForm<imm, [{
+   // Return the bit position we will set [0-7].
+   // As an SDNode.
+   int8_t imm =  N->getSExtValue();
+   return XformMskToBitPosU3Imm(imm);
+}]>;
+
+def CLRMEMIMM_BYTE : SDNodeXForm<imm, [{
+   // Return the bit position we will clear [0-7].
+   // As an SDNode.
+   // we bit negate the value first
+   int8_t imm = ~(N->getSExtValue());
+   return XformMskToBitPosU3Imm(imm);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Template class for MemOp instructions with the register value.
+//===----------------------------------------------------------------------===//
+class MemOp_rr_base <string opc, bits<2> opcBits, Operand ImmOp,
+                     string memOp, bits<2> memOpBits> :
+      MEMInst_V4<(outs),
+                 (ins IntRegs:$base, ImmOp:$offset, IntRegs:$delta),
+                 opc#"($base+#$offset)"#memOp#"$delta",
+                 []>,
+                 Requires<[HasV4T, UseMEMOP]> {
+
+    bits<5> base;
+    bits<5> delta;
+    bits<32> offset;
+    bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
+
+    let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
+                     !if (!eq(opcBits, 0b01), offset{6-1},
+                     !if (!eq(opcBits, 0b10), offset{7-2},0)));
+
+    let IClass = 0b0011;
+    let Inst{27-24} = 0b1110;
+    let Inst{22-21} = opcBits;
+    let Inst{20-16} = base;
+    let Inst{13} = 0b0;
+    let Inst{12-7} = offsetBits;
+    let Inst{6-5} = memOpBits;
+    let Inst{4-0} = delta;
+}
+
+//===----------------------------------------------------------------------===//
+// Template class for MemOp instructions with the immediate value.
+//===----------------------------------------------------------------------===//
+class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
+                     string memOp, bits<2> memOpBits> :
+      MEMInst_V4 <(outs),
+                  (ins IntRegs:$base, ImmOp:$offset, u5Imm:$delta),
+                  opc#"($base+#$offset)"#memOp#"#$delta"
+                  #!if(memOpBits{1},")", ""), // clrbit, setbit - include ')'
+                  []>,
+                  Requires<[HasV4T, UseMEMOP]> {
+
+    bits<5> base;
+    bits<5> delta;
+    bits<32> offset;
+    bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
+
+    let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
+                     !if (!eq(opcBits, 0b01), offset{6-1},
+                     !if (!eq(opcBits, 0b10), offset{7-2},0)));
+
+    let IClass = 0b0011;
+    let Inst{27-24} = 0b1111;
+    let Inst{22-21} = opcBits;
+    let Inst{20-16} = base;
+    let Inst{13} = 0b0;
+    let Inst{12-7} = offsetBits;
+    let Inst{6-5} = memOpBits;
+    let Inst{4-0} = delta;
+}
+
+// multiclass to define MemOp instructions with register operand.
+multiclass MemOp_rr<string opc, bits<2> opcBits, Operand ImmOp> {
+  def _ADD#NAME#_V4 : MemOp_rr_base <opc, opcBits, ImmOp, " += ", 0b00>; // add
+  def _SUB#NAME#_V4 : MemOp_rr_base <opc, opcBits, ImmOp, " -= ", 0b01>; // sub
+  def _AND#NAME#_V4 : MemOp_rr_base <opc, opcBits, ImmOp, " &= ", 0b10>; // and
+  def _OR#NAME#_V4  : MemOp_rr_base <opc, opcBits, ImmOp, " |= ", 0b11>; // or
+}
+
+// multiclass to define MemOp instructions with immediate Operand.
+multiclass MemOp_ri<string opc, bits<2> opcBits, Operand ImmOp> {
+  def _ADD#NAME#_V4 : MemOp_ri_base <opc, opcBits, ImmOp, " += ", 0b00 >;
+  def _SUB#NAME#_V4 : MemOp_ri_base <opc, opcBits, ImmOp, " -= ", 0b01 >;
+  def _CLRBIT#NAME#_V4 : MemOp_ri_base<opc, opcBits, ImmOp, " =clrbit(", 0b10>;
+  def _SETBIT#NAME#_V4 : MemOp_ri_base<opc, opcBits, ImmOp, " =setbit(", 0b11>;
+}
+
+multiclass MemOp_base <string opc, bits<2> opcBits, Operand ImmOp> {
+  defm r : MemOp_rr <opc, opcBits, ImmOp>;
+  defm i : MemOp_ri <opc, opcBits, ImmOp>;
+}
+
+// Define MemOp instructions.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 0,
+validSubTargets =HasV4SubT in {
+  let opExtentBits = 6, accessSize = ByteAccess in
+  defm MemOPb : MemOp_base <"memb", 0b00, u6_0Ext>;
+
+  let opExtentBits = 7, accessSize = HalfWordAccess in
+  defm MemOPh : MemOp_base <"memh", 0b01, u6_1Ext>;
+
+  let opExtentBits = 8, accessSize = WordAccess in
+  defm MemOPw : MemOp_base <"memw", 0b10, u6_2Ext>;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'Def Pats' for ALU operations on the memory
+// Here value used for the ALU operation is an immediate value.
+// mem[bh](Rs+#0) += #U5
+// mem[bh](Rs+#u6) += #U5
+//===----------------------------------------------------------------------===//
+
+multiclass MemOpi_u5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf ExtPred,
+                          InstHexagon MI, SDNode OpNode> {
+  let AddedComplexity = 180 in
+  def : Pat < (stOp (OpNode (ldOp IntRegs:$addr), u5ImmPred:$addend),
+                    IntRegs:$addr),
+              (MI IntRegs:$addr, #0, u5ImmPred:$addend )>;
+
+  let AddedComplexity = 190 in
+  def : Pat <(stOp (OpNode (ldOp (add IntRegs:$base, ExtPred:$offset)),
+                     u5ImmPred:$addend),
+             (add IntRegs:$base, ExtPred:$offset)),
+       (MI IntRegs:$base, ExtPred:$offset, u5ImmPred:$addend)>;
+}
+
+multiclass MemOpi_u5ALUOp<PatFrag ldOp, PatFrag stOp, PatLeaf ExtPred,
+                          InstHexagon addMI, InstHexagon subMI> {
+  defm : MemOpi_u5Pats<ldOp, stOp, ExtPred, addMI, add>;
+  defm : MemOpi_u5Pats<ldOp, stOp, ExtPred, subMI, sub>;
+}
+
+multiclass MemOpi_u5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
+  // Half Word
+  defm : MemOpi_u5ALUOp <ldOpHalf, truncstorei16, u6_1ExtPred,
+                         MemOPh_ADDi_V4, MemOPh_SUBi_V4>;
+  // Byte
+  defm : MemOpi_u5ALUOp <ldOpByte, truncstorei8, u6ExtPred,
+                         MemOPb_ADDi_V4, MemOPb_SUBi_V4>;
+}
+
+let Predicates = [HasV4T, UseMEMOP] in {
+  defm : MemOpi_u5ExtType<zextloadi8, zextloadi16>; // zero extend
+  defm : MemOpi_u5ExtType<sextloadi8, sextloadi16>; // sign extend
+  defm : MemOpi_u5ExtType<extloadi8,  extloadi16>;  // any extend
+
+  // Word
+  defm : MemOpi_u5ALUOp <load, store, u6_2ExtPred, MemOPw_ADDi_V4,
+                         MemOPw_SUBi_V4>;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass to define 'Def Pats' for ALU operations on the memory.
+// Here value used for the ALU operation is a negative value.
+// mem[bh](Rs+#0) += #m5
+// mem[bh](Rs+#u6) += #m5
+//===----------------------------------------------------------------------===//
+
+multiclass MemOpi_m5Pats <PatFrag ldOp, PatFrag stOp, PatLeaf extPred,
+                          PatLeaf immPred, ComplexPattern addrPred,
+                          SDNodeXForm xformFunc, InstHexagon MI> {
+  let AddedComplexity = 190 in
+  def : Pat <(stOp (add (ldOp IntRegs:$addr), immPred:$subend),
+                   IntRegs:$addr),
+             (MI IntRegs:$addr, #0, (xformFunc immPred:$subend) )>;
+
+  let AddedComplexity = 195 in
+  def : Pat<(stOp (add (ldOp (add IntRegs:$base, extPred:$offset)),
+                       immPred:$subend),
+                  (add IntRegs:$base, extPred:$offset)),
+            (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$subend))>;
+}
+
+multiclass MemOpi_m5ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
+  // Half Word
+  defm : MemOpi_m5Pats <ldOpHalf, truncstorei16, u6_1ExtPred, m5HImmPred,
+                        ADDRriU6_1, MEMOPIMM_HALF, MemOPh_SUBi_V4>;
+  // Byte
+  defm : MemOpi_m5Pats <ldOpByte, truncstorei8, u6ExtPred, m5BImmPred,
+                        ADDRriU6_0, MEMOPIMM_BYTE, MemOPb_SUBi_V4>;
+}
+
+let Predicates = [HasV4T, UseMEMOP] in {
+  defm : MemOpi_m5ExtType<zextloadi8, zextloadi16>; // zero extend
+  defm : MemOpi_m5ExtType<sextloadi8, sextloadi16>; // sign extend
+  defm : MemOpi_m5ExtType<extloadi8,  extloadi16>;  // any extend
+
+  // Word
+  defm : MemOpi_m5Pats <load, store, u6_2ExtPred, m5ImmPred,
+                          ADDRriU6_2, MEMOPIMM, MemOPw_SUBi_V4>;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'def Pats' for bit operations on the memory.
+// mem[bhw](Rs+#0) = [clrbit|setbit](#U5)
+// mem[bhw](Rs+#u6) = [clrbit|setbit](#U5)
+//===----------------------------------------------------------------------===//
+
+multiclass MemOpi_bitPats <PatFrag ldOp, PatFrag stOp, PatLeaf immPred,
+                     PatLeaf extPred, ComplexPattern addrPred,
+                     SDNodeXForm xformFunc, InstHexagon MI, SDNode OpNode> {
+
+  // mem[bhw](Rs+#u6:[012]) = [clrbit|setbit](#U5)
+  let AddedComplexity = 250 in
+  def : Pat<(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
+                          immPred:$bitend),
+                  (add IntRegs:$base, extPred:$offset)),
+            (MI IntRegs:$base, extPred:$offset, (xformFunc immPred:$bitend))>;
+
+  // mem[bhw](Rs+#0) = [clrbit|setbit](#U5)
+  let AddedComplexity = 225 in
+  def : Pat <(stOp (OpNode (ldOp addrPred:$addr), immPred:$bitend),
+                   addrPred:$addr),
+             (MI IntRegs:$addr, #0, (xformFunc immPred:$bitend))>;
+}
+
+multiclass MemOpi_bitExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
+  // Byte - clrbit
+  defm : MemOpi_bitPats<ldOpByte, truncstorei8, Clr3ImmPred, u6ExtPred,
+                       ADDRriU6_0, CLRMEMIMM_BYTE, MemOPb_CLRBITi_V4, and>;
+  // Byte - setbit
+  defm : MemOpi_bitPats<ldOpByte, truncstorei8, Set3ImmPred,  u6ExtPred,
+                       ADDRriU6_0, SETMEMIMM_BYTE, MemOPb_SETBITi_V4, or>;
+  // Half Word - clrbit
+  defm : MemOpi_bitPats<ldOpHalf, truncstorei16, Clr4ImmPred, u6_1ExtPred,
+                       ADDRriU6_1, CLRMEMIMM_SHORT, MemOPh_CLRBITi_V4, and>;
+  // Half Word - setbit
+  defm : MemOpi_bitPats<ldOpHalf, truncstorei16, Set4ImmPred, u6_1ExtPred,
+                       ADDRriU6_1, SETMEMIMM_SHORT, MemOPh_SETBITi_V4, or>;
+}
+
+let Predicates = [HasV4T, UseMEMOP] in {
+  // mem[bh](Rs+#0) = [clrbit|setbit](#U5)
+  // mem[bh](Rs+#u6:[01]) = [clrbit|setbit](#U5)
+  defm : MemOpi_bitExtType<zextloadi8, zextloadi16>; // zero extend
+  defm : MemOpi_bitExtType<sextloadi8, sextloadi16>; // sign extend
+  defm : MemOpi_bitExtType<extloadi8,  extloadi16>;  // any extend
+
+  // memw(Rs+#0) = [clrbit|setbit](#U5)
+  // memw(Rs+#u6:2) = [clrbit|setbit](#U5)
+  defm : MemOpi_bitPats<load, store, Clr5ImmPred, u6_2ExtPred, ADDRriU6_2,
+                       CLRMEMIMM, MemOPw_CLRBITi_V4, and>;
+  defm : MemOpi_bitPats<load, store, Set5ImmPred, u6_2ExtPred, ADDRriU6_2,
+                       SETMEMIMM, MemOPw_SETBITi_V4, or>;
+}
+
+//===----------------------------------------------------------------------===//
+// Multiclass to define 'def Pats' for ALU operations on the memory
+// where addend is a register.
+// mem[bhw](Rs+#0) [+-&|]= Rt
+// mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
+//===----------------------------------------------------------------------===//
+
+multiclass MemOpr_Pats <PatFrag ldOp, PatFrag stOp, ComplexPattern addrPred,
+                     PatLeaf extPred, InstHexagon MI, SDNode OpNode> {
+  let AddedComplexity = 141 in
+  // mem[bhw](Rs+#0) [+-&|]= Rt
+  def : Pat <(stOp (OpNode (ldOp addrPred:$addr), (i32 IntRegs:$addend)),
+                   addrPred:$addr),
+             (MI IntRegs:$addr, #0, (i32 IntRegs:$addend) )>;
+
+  // mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
+  let AddedComplexity = 150 in
+  def : Pat <(stOp (OpNode (ldOp (add IntRegs:$base, extPred:$offset)),
+                           (i32 IntRegs:$orend)),
+                   (add IntRegs:$base, extPred:$offset)),
+             (MI IntRegs:$base, extPred:$offset, (i32 IntRegs:$orend) )>;
+}
+
+multiclass MemOPr_ALUOp<PatFrag ldOp, PatFrag stOp,
+                        ComplexPattern addrPred, PatLeaf extPred,
+                        InstHexagon addMI, InstHexagon subMI,
+                        InstHexagon andMI, InstHexagon orMI > {
+
+  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, addMI, add>;
+  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, subMI, sub>;
+  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, andMI, and>;
+  defm : MemOpr_Pats <ldOp, stOp, addrPred, extPred, orMI,  or>;
+}
+
+multiclass MemOPr_ExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
+  // Half Word
+  defm : MemOPr_ALUOp <ldOpHalf, truncstorei16, ADDRriU6_1, u6_1ExtPred,
+                       MemOPh_ADDr_V4, MemOPh_SUBr_V4,
+                       MemOPh_ANDr_V4, MemOPh_ORr_V4>;
+  // Byte
+  defm : MemOPr_ALUOp <ldOpByte, truncstorei8, ADDRriU6_0, u6ExtPred,
+                       MemOPb_ADDr_V4, MemOPb_SUBr_V4,
+                       MemOPb_ANDr_V4, MemOPb_ORr_V4>;
+}
+
+// Define 'def Pats' for MemOps with register addend.
+let Predicates = [HasV4T, UseMEMOP] in {
+  // Byte, Half Word
+  defm : MemOPr_ExtType<zextloadi8, zextloadi16>; // zero extend
+  defm : MemOPr_ExtType<sextloadi8, sextloadi16>; // sign extend
+  defm : MemOPr_ExtType<extloadi8,  extloadi16>;  // any extend
+  // Word
+  defm : MemOPr_ALUOp <load, store, ADDRriU6_2, u6_2ExtPred, MemOPw_ADDr_V4,
+                       MemOPw_SUBr_V4, MemOPw_ANDr_V4, MemOPw_ORr_V4 >;
+}
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PRED +
+//===----------------------------------------------------------------------===//
+
+// Hexagon V4 only supports these flavors of byte/half compare instructions:
+// EQ/GT/GTU. Other flavors like GE/GEU/LT/LTU/LE/LEU are not supported by
+// hardware. However, compiler can still implement these patterns through
+// appropriate patterns combinations based on current implemented patterns.
+// The implemented patterns are: EQ/GT/GTU.
+// Missing patterns are: GE/GEU/LT/LTU/LE/LEU.
+
+// Following instruction is not being extended as it results into the
+// incorrect code for negative numbers.
+// Pd=cmpb.eq(Rs,#u8)
+
+// p=!cmp.eq(r1,r2)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotEQ_rr : ALU32_rr<(outs PredRegs:$dst),
+                           (ins IntRegs:$src1, IntRegs:$src2),
+      "$dst = !cmp.eq($src1, $src2)",
+      [(set (i1 PredRegs:$dst),
+            (setne (i32 IntRegs:$src1), (i32 IntRegs:$src2)))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.eq(r1,#s10)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotEQ_ri : ALU32_ri<(outs PredRegs:$dst),
+                           (ins IntRegs:$src1, s10Ext:$src2),
+      "$dst = !cmp.eq($src1, #$src2)",
+      [(set (i1 PredRegs:$dst),
+            (setne (i32 IntRegs:$src1), s10ImmPred:$src2))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.gt(r1,r2)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotGT_rr : ALU32_rr<(outs PredRegs:$dst),
+                           (ins IntRegs:$src1, IntRegs:$src2),
+      "$dst = !cmp.gt($src1, $src2)",
+      [(set (i1 PredRegs:$dst),
+            (not (setgt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.gt(r1,#s10)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotGT_ri : ALU32_ri<(outs PredRegs:$dst),
+                           (ins IntRegs:$src1, s10Ext:$src2),
+      "$dst = !cmp.gt($src1, #$src2)",
+      [(set (i1 PredRegs:$dst),
+            (not (setgt (i32 IntRegs:$src1), s10ImmPred:$src2)))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.gtu(r1,r2)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotGTU_rr : ALU32_rr<(outs PredRegs:$dst),
+                            (ins IntRegs:$src1, IntRegs:$src2),
+      "$dst = !cmp.gtu($src1, $src2)",
+      [(set (i1 PredRegs:$dst),
+            (not (setugt (i32 IntRegs:$src1), (i32 IntRegs:$src2))))]>,
+      Requires<[HasV4T]>;
+
+// p=!cmp.gtu(r1,#u9)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPnotGTU_ri : ALU32_ri<(outs PredRegs:$dst),
+                            (ins IntRegs:$src1, u9Ext:$src2),
+      "$dst = !cmp.gtu($src1, #$src2)",
+      [(set (i1 PredRegs:$dst),
+            (not (setugt (i32 IntRegs:$src1), u9ImmPred:$src2)))]>,
+      Requires<[HasV4T]>;
+
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPbEQri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, u8Imm:$src2),
+            "$dst = cmpb.eq($src1, #$src2)",
+            [(set (i1 PredRegs:$dst),
+                  (seteq (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2))]>,
+            Requires<[HasV4T]>;
+
+def : Pat <(brcond (i1 (setne (and (i32 IntRegs:$src1), 255), u8ImmPred:$src2)),
+                       bb:$offset),
+      (JMP_cNot (CMPbEQri_V4 (i32 IntRegs:$src1), u8ImmPred:$src2),
+                bb:$offset)>,
+      Requires<[HasV4T]>;
+
+// Pd=cmpb.eq(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPbEQrr_ubub_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmpb.eq($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (seteq (and (xor (i32 IntRegs:$src1),
+                                   (i32 IntRegs:$src2)), 255), 0))]>,
+            Requires<[HasV4T]>;
+
+// Pd=cmpb.eq(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPbEQrr_sbsb_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmpb.eq($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (seteq (shl (i32 IntRegs:$src1), (i32 24)),
+                         (shl (i32 IntRegs:$src2), (i32 24))))]>,
+            Requires<[HasV4T]>;
+
+// Pd=cmpb.gt(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPbGTrr_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmpb.gt($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (setgt (shl (i32 IntRegs:$src1), (i32 24)),
+                         (shl (i32 IntRegs:$src2), (i32 24))))]>,
+            Requires<[HasV4T]>;
+
+// Pd=cmpb.gtu(Rs,#u7)
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 7,
+isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPbGTU", InputType = "imm" in
+def CMPbGTUri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, u7Ext:$src2),
+            "$dst = cmpb.gtu($src1, #$src2)",
+            [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255),
+                                              u7ExtPred:$src2))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// SDNode for converting immediate C to C-1.
+def DEC_CONST_BYTE : SDNodeXForm<imm, [{
+   // Return the byte immediate const-1 as an SDNode.
+   int32_t imm = N->getSExtValue();
+   return XformU7ToU7M1Imm(imm);
+}]>;
+
+// For the sequence
+//   zext( seteq ( and(Rs, 255), u8))
+// Generate
+//   Pd=cmpb.eq(Rs, #u8)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (seteq (i32 (and (i32 IntRegs:$Rs), 255)),
+                                           u8ExtPred:$u8)))),
+           (i32 (TFR_condset_ii (i1 (CMPbEQri_V4 (i32 IntRegs:$Rs),
+                                                 (u8ExtPred:$u8))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setne ( and(Rs, 255), u8))
+// Generate
+//   Pd=cmpb.eq(Rs, #u8)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+def : Pat <(i32 (zext (i1 (setne (i32 (and (i32 IntRegs:$Rs), 255)),
+                                           u8ExtPred:$u8)))),
+           (i32 (TFR_condset_ii (i1 (CMPbEQri_V4 (i32 IntRegs:$Rs),
+                                                 (u8ExtPred:$u8))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( seteq (Rs, and(Rt, 255)))
+// Generate
+//   Pd=cmpb.eq(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (seteq (i32 IntRegs:$Rt),
+                                 (i32 (and (i32 IntRegs:$Rs), 255)))))),
+           (i32 (TFR_condset_ii (i1 (CMPbEQrr_ubub_V4 (i32 IntRegs:$Rs),
+                                                      (i32 IntRegs:$Rt))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setne (Rs, and(Rt, 255)))
+// Generate
+//   Pd=cmpb.eq(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+def : Pat <(i32 (zext (i1 (setne (i32 IntRegs:$Rt),
+                                 (i32 (and (i32 IntRegs:$Rs), 255)))))),
+           (i32 (TFR_condset_ii (i1 (CMPbEQrr_ubub_V4 (i32 IntRegs:$Rs),
+                                                      (i32 IntRegs:$Rt))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setugt ( and(Rs, 255), u8))
+// Generate
+//   Pd=cmpb.gtu(Rs, #u8)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 255)),
+                                            u8ExtPred:$u8)))),
+           (i32 (TFR_condset_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$Rs),
+                                                  (u8ExtPred:$u8))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setugt ( and(Rs, 254), u8))
+// Generate
+//   Pd=cmpb.gtu(Rs, #u8)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (setugt (i32 (and (i32 IntRegs:$Rs), 254)),
+                                            u8ExtPred:$u8)))),
+           (i32 (TFR_condset_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$Rs),
+                                                  (u8ExtPred:$u8))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setult ( Rs, Rt))
+// Generate
+//   Pd=cmp.ltu(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+// cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs)
+def : Pat <(i32 (zext (i1 (setult (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rt),
+                                              (i32 IntRegs:$Rs))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setlt ( Rs, Rt))
+// Generate
+//   Pd=cmp.lt(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+// cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs)
+def : Pat <(i32 (zext (i1 (setlt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rt),
+                                             (i32 IntRegs:$Rs))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setugt ( Rs, Rt))
+// Generate
+//   Pd=cmp.gtu(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+def : Pat <(i32 (zext (i1 (setugt (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rs),
+                                              (i32 IntRegs:$Rt))),
+                                1, 0))>,
+           Requires<[HasV4T]>;
+
+// This pattern interefers with coremark performance, not implementing at this
+// time.
+// For the sequence
+//   zext( setgt ( Rs, Rt))
+// Generate
+//   Pd=cmp.gt(Rs, Rt)
+//   if (Pd.new) Rd=#1
+//   if (!Pd.new) Rd=#0
+
+// For the sequence
+//   zext( setuge ( Rs, Rt))
+// Generate
+//   Pd=cmp.ltu(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+// cmp.ltu(Rs, Rt) -> cmp.gtu(Rt, Rs)
+def : Pat <(i32 (zext (i1 (setuge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rt),
+                                              (i32 IntRegs:$Rs))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setge ( Rs, Rt))
+// Generate
+//   Pd=cmp.lt(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+// cmp.lt(Rs, Rt) -> cmp.gt(Rt, Rs)
+def : Pat <(i32 (zext (i1 (setge (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rt),
+                                             (i32 IntRegs:$Rs))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setule ( Rs, Rt))
+// Generate
+//   Pd=cmp.gtu(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+def : Pat <(i32 (zext (i1 (setule (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTUrr (i32 IntRegs:$Rs),
+                                              (i32 IntRegs:$Rt))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setle ( Rs, Rt))
+// Generate
+//   Pd=cmp.gt(Rs, Rt)
+//   if (Pd.new) Rd=#0
+//   if (!Pd.new) Rd=#1
+def : Pat <(i32 (zext (i1 (setle (i32 IntRegs:$Rs), (i32 IntRegs:$Rt))))),
+           (i32 (TFR_condset_ii (i1 (CMPGTrr (i32 IntRegs:$Rs),
+                                             (i32 IntRegs:$Rt))),
+                                0, 1))>,
+           Requires<[HasV4T]>;
+
+// For the sequence
+//   zext( setult ( and(Rs, 255), u8))
+// Use the isdigit transformation below
+
+// Generate code of the form 'mux_ii(cmpbgtu(Rdd, C-1),0,1)'
+// for C code of the form r = ((c>='0') & (c<='9')) ? 1 : 0;.
+// The isdigit transformation relies on two 'clever' aspects:
+// 1) The data type is unsigned which allows us to eliminate a zero test after
+//    biasing the expression by 48. We are depending on the representation of
+//    the unsigned types, and semantics.
+// 2) The front end has converted <= 9 into < 10 on entry to LLVM
+//
+// For the C code:
+//   retval = ((c>='0') & (c<='9')) ? 1 : 0;
+// The code is transformed upstream of llvm into
+//   retval = (c-48) < 10 ? 1 : 0;
+let AddedComplexity = 139 in
+def : Pat <(i32 (zext (i1 (setult (i32 (and (i32 IntRegs:$src1), 255)),
+                                  u7StrictPosImmPred:$src2)))),
+  (i32 (MUX_ii (i1 (CMPbGTUri_V4 (i32 IntRegs:$src1),
+                                 (DEC_CONST_BYTE u7StrictPosImmPred:$src2))),
+                   0, 1))>,
+                   Requires<[HasV4T]>;
+
+// Pd=cmpb.gtu(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPbGTU",
+InputType = "reg" in
+def CMPbGTUrr_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmpb.gtu($src1, $src2)",
+            [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 255),
+                                             (and (i32 IntRegs:$src2), 255)))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Following instruction is not being extended as it results into the incorrect
+// code for negative numbers.
+
+// Signed half compare(.eq) ri.
+// Pd=cmph.eq(Rs,#s8)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhEQri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, s8Imm:$src2),
+            "$dst = cmph.eq($src1, #$src2)",
+            [(set (i1 PredRegs:$dst), (seteq (and (i32 IntRegs:$src1), 65535),
+                                             s8ImmPred:$src2))]>,
+            Requires<[HasV4T]>;
+
+// Signed half compare(.eq) rr.
+// Case 1: xor + and, then compare:
+//   r0=xor(r0,r1)
+//   r0=and(r0,#0xffff)
+//   p0=cmp.eq(r0,#0)
+// Pd=cmph.eq(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhEQrr_xor_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmph.eq($src1, $src2)",
+            [(set (i1 PredRegs:$dst), (seteq (and (xor (i32 IntRegs:$src1),
+                                                       (i32 IntRegs:$src2)),
+                                                  65535), 0))]>,
+            Requires<[HasV4T]>;
+
+// Signed half compare(.eq) rr.
+// Case 2: shift left 16 bits then compare:
+//   r0=asl(r0,16)
+//   r1=asl(r1,16)
+//   p0=cmp.eq(r0,r1)
+// Pd=cmph.eq(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhEQrr_shl_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmph.eq($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (seteq (shl (i32 IntRegs:$src1), (i32 16)),
+                         (shl (i32 IntRegs:$src2), (i32 16))))]>,
+            Requires<[HasV4T]>;
+
+/* Incorrect Pattern -- immediate should be right shifted before being
+used in the cmph.gt instruction.
+// Signed half compare(.gt) ri.
+// Pd=cmph.gt(Rs,#s8)
+
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 8,
+isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhGTri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, s8Ext:$src2),
+            "$dst = cmph.gt($src1, #$src2)",
+            [(set (i1 PredRegs:$dst),
+                  (setgt (shl (i32 IntRegs:$src1), (i32 16)),
+                         s8ExtPred:$src2))]>,
+            Requires<[HasV4T]>;
+*/
+
+// Signed half compare(.gt) rr.
+// Pd=cmph.gt(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT in
+def CMPhGTrr_shl_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmph.gt($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (setgt (shl (i32 IntRegs:$src1), (i32 16)),
+                         (shl (i32 IntRegs:$src2), (i32 16))))]>,
+            Requires<[HasV4T]>;
+
+// Unsigned half compare rr (.gtu).
+// Pd=cmph.gtu(Rs,Rt)
+let isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPhGTU",
+InputType = "reg" in
+def CMPhGTUrr_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = cmph.gtu($src1, $src2)",
+            [(set (i1 PredRegs:$dst),
+                  (setugt (and (i32 IntRegs:$src1), 65535),
+                          (and (i32 IntRegs:$src2), 65535)))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+// Unsigned half compare ri (.gtu).
+// Pd=cmph.gtu(Rs,#u7)
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 0, opExtentBits = 7,
+isCompare = 1, validSubTargets = HasV4SubT, CextOpcode = "CMPhGTU",
+InputType = "imm" in
+def CMPhGTUri_V4 : MInst<(outs PredRegs:$dst),
+            (ins IntRegs:$src1, u7Ext:$src2),
+            "$dst = cmph.gtu($src1, #$src2)",
+            [(set (i1 PredRegs:$dst), (setugt (and (i32 IntRegs:$src1), 65535),
+                                              u7ExtPred:$src2))]>,
+            Requires<[HasV4T]>, ImmRegRel;
+
+let validSubTargets = HasV4SubT in
+def NTSTBIT_rr : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+    "$dst = !tstbit($src1, $src2)",
+    [(set (i1 PredRegs:$dst),
+          (seteq (and (shl 1, (i32 IntRegs:$src2)), (i32 IntRegs:$src1)), 0))]>,
+    Requires<[HasV4T]>;
+
+let validSubTargets = HasV4SubT in
+def NTSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+    "$dst = !tstbit($src1, $src2)",
+    [(set (i1 PredRegs:$dst),
+          (seteq (and (shl 1, u5ImmPred:$src2), (i32 IntRegs:$src1)), 0))]>,
+    Requires<[HasV4T]>;
+
+//===----------------------------------------------------------------------===//
+// XTYPE/PRED -
+//===----------------------------------------------------------------------===//
+
+//Deallocate frame and return.
+//    dealloc_return
+let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1,
+  Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in {
+  def DEALLOC_RET_V4 : NVInst_V4<(outs), (ins i32imm:$amt1),
+            "dealloc_return",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// Restore registers and dealloc return function call.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+  Defs = [R29, R30, R31, PC] in {
+  def RESTORE_DEALLOC_RET_JMP_V4 : JInst<(outs),
+                                   (ins calltarget:$dst),
+             "jump $dst // Restore_and_dealloc_return",
+             []>,
+             Requires<[HasV4T]>;
+}
+
+// Restore registers and dealloc frame before a tail call.
+let isCall = 1, isBarrier = 1,
+  Defs = [R29, R30, R31, PC] in {
+  def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : JInst<(outs),
+                                           (ins calltarget:$dst),
+             "call $dst // Restore_and_dealloc_before_tailcall",
+             []>,
+             Requires<[HasV4T]>;
+}
+
+// Save registers function call.
+let isCall = 1, isBarrier = 1,
+  Uses = [R29, R31] in {
+  def SAVE_REGISTERS_CALL_V4 : JInst<(outs),
+                               (ins calltarget:$dst),
+             "call $dst // Save_calle_saved_registers",
+             []>,
+             Requires<[HasV4T]>;
+}
+
+//    if (Ps) dealloc_return
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+    isPredicated = 1 in {
+  def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs),
+                           (ins PredRegs:$src1, i32imm:$amt1),
+            "if ($src1) dealloc_return",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+//    if (!Ps) dealloc_return
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+    isPredicated = 1 in {
+  def DEALLOC_RET_cNotPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
+                                                     i32imm:$amt1),
+            "if (!$src1) dealloc_return",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+//    if (Ps.new) dealloc_return:nt
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+    isPredicated = 1 in {
+  def DEALLOC_RET_cdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
+                                                     i32imm:$amt1),
+            "if ($src1.new) dealloc_return:nt",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+//    if (!Ps.new) dealloc_return:nt
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+    isPredicated = 1 in {
+  def DEALLOC_RET_cNotdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
+                                                        i32imm:$amt1),
+            "if (!$src1.new) dealloc_return:nt",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+//    if (Ps.new) dealloc_return:t
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+    isPredicated = 1 in {
+  def DEALLOC_RET_cdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
+                                                    i32imm:$amt1),
+            "if ($src1.new) dealloc_return:t",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+//    if (!Ps.new) dealloc_return:nt
+let isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+    isPredicated = 1 in {
+  def DEALLOC_RET_cNotdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
+                                                       i32imm:$amt1),
+            "if (!$src1.new) dealloc_return:t",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+// Load/Store with absolute addressing mode
+// memw(#u6)=Rt
+
+multiclass ST_Abs_Predbase<string mnemonic, RegisterClass RC, bit isNot,
+                           bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME#_V4 : STInst2<(outs),
+            (ins PredRegs:$src1, globaladdressExt:$absaddr, RC: $src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"(##$absaddr) = $src2",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Abs_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ST_Abs_Predbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Abs_Predbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isNVStorable = 1, isExtended = 1, neverHasSideEffects = 1 in
+multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let opExtendable = 0, isPredicable = 1 in
+    def NAME#_V4 : STInst2<(outs),
+            (ins globaladdressExt:$absaddr, RC:$src),
+            mnemonic#"(##$absaddr) = $src",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 1, isPredicated = 1 in {
+      defm Pt : ST_Abs_Pred<mnemonic, RC, 0>;
+      defm NotPt : ST_Abs_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+multiclass ST_Abs_Predbase_nv<string mnemonic, RegisterClass RC, bit isNot,
+                           bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, globaladdressExt:$absaddr, RC: $src2),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#mnemonic#"(##$absaddr) = $src2.new",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass ST_Abs_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : ST_Abs_Predbase_nv<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : ST_Abs_Predbase_nv<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1, isExtended = 1, neverHasSideEffects = 1 in
+multiclass ST_Abs_nv<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let opExtendable = 0, isPredicable = 1 in
+    def NAME#_nv_V4 : NVInst_V4<(outs),
+            (ins globaladdressExt:$absaddr, RC:$src),
+            mnemonic#"(##$absaddr) = $src.new",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 1, isPredicated = 1 in {
+      defm Pt : ST_Abs_Pred_nv<mnemonic, RC, 0>;
+      defm NotPt : ST_Abs_Pred_nv<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = Absolute in {
+    defm STrib_abs : ST_Abs<"memb", "STrib", IntRegs>,
+                     ST_Abs_nv<"memb", "STrib", IntRegs>, AddrModeRel;
+
+    defm STrih_abs : ST_Abs<"memh", "STrih", IntRegs>,
+                     ST_Abs_nv<"memh", "STrih", IntRegs>, AddrModeRel;
+
+    defm STriw_abs : ST_Abs<"memw", "STriw", IntRegs>,
+                     ST_Abs_nv<"memw", "STriw", IntRegs>, AddrModeRel;
+
+  let isNVStorable = 0 in
+    defm STrid_abs : ST_Abs<"memd", "STrid", DoubleRegs>, AddrModeRel;
+}
+
+let Predicates = [HasV4T], AddedComplexity = 30 in {
+def : Pat<(truncstorei8 (i32 IntRegs:$src1),
+                        (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STrib_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src1),
+                          (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STrih_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>;
+
+def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STriw_abs_V4 tglobaladdr: $absaddr, IntRegs: $src1)>;
+
+def : Pat<(store (i64 DoubleRegs:$src1),
+                 (HexagonCONST32 tglobaladdr:$absaddr)),
+          (STrid_abs_V4 tglobaladdr: $absaddr, DoubleRegs: $src1)>;
+}
+
+//===----------------------------------------------------------------------===//
+// multiclass for store instructions with GP-relative addressing mode.
+// mem[bhwd](#global)=Rt
+// if ([!]Pv[.new]) mem[bhwd](##global) = Rt
+//===----------------------------------------------------------------------===//
+multiclass ST_GP<string mnemonic, string BaseOp, RegisterClass RC> {
+  let BaseOpcode = BaseOp, isPredicable = 1 in
+  def NAME#_V4 : STInst2<(outs),
+          (ins globaladdress:$global, RC:$src),
+          mnemonic#"(#$global) = $src",
+          []>;
+
+  // When GP-relative instructions are predicated, their addressing mode is
+  // changed to absolute and they are always constant extended.
+  let BaseOpcode = BaseOp, isExtended = 1, opExtendable = 1,
+  isPredicated = 1 in {
+    defm Pt : ST_Abs_Pred <mnemonic, RC, 0>;
+    defm NotPt : ST_Abs_Pred <mnemonic, RC, 1>;
+  }
+}
+
+let mayStore = 1, isNVStore = 1 in
+multiclass ST_GP_nv<string mnemonic, string BaseOp, RegisterClass RC> {
+  let BaseOpcode = BaseOp, isPredicable = 1 in
+  def NAME#_nv_V4 : NVInst_V4<(outs),
+          (ins u0AlwaysExt:$global, RC:$src),
+          mnemonic#"(#$global) = $src.new",
+          []>,
+          Requires<[HasV4T]>;
+
+  // When GP-relative instructions are predicated, their addressing mode is
+  // changed to absolute and they are always constant extended.
+  let BaseOpcode = BaseOp, isExtended = 1, opExtendable = 1,
+  isPredicated = 1 in {
+    defm Pt : ST_Abs_Pred_nv<mnemonic, RC, 0>;
+    defm NotPt : ST_Abs_Pred_nv<mnemonic, RC, 1>;
+  }
+}
+
+let validSubTargets = HasV4SubT,  validSubTargets = HasV4SubT in {
+defm STd_GP : ST_GP <"memd", "STd_GP", DoubleRegs>,
+              ST_GP_nv<"memd", "STd_GP", DoubleRegs>, NewValueRel ;
+defm STb_GP : ST_GP<"memb",  "STb_GP", IntRegs>,
+              ST_GP_nv<"memb", "STb_GP", IntRegs>, NewValueRel ;
+defm STh_GP : ST_GP<"memh",  "STh_GP", IntRegs>,
+              ST_GP_nv<"memh", "STh_GP", IntRegs>, NewValueRel ;
+defm STw_GP : ST_GP<"memw",  "STw_GP", IntRegs>,
+              ST_GP_nv<"memw", "STw_GP", IntRegs>, NewValueRel ;
+}
+
+// 64 bit atomic store
+def : Pat <(atomic_store_64 (HexagonCONST32_GP tglobaladdr:$global),
+                            (i64 DoubleRegs:$src1)),
+           (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>,
+           Requires<[HasV4T]>;
+
+// Map from store(globaladdress) -> memd(#foo)
+let AddedComplexity = 100 in
+def : Pat <(store (i64 DoubleRegs:$src1),
+                  (HexagonCONST32_GP tglobaladdr:$global)),
+           (STd_GP_V4 tglobaladdr:$global, (i64 DoubleRegs:$src1))>;
+
+// 8 bit atomic store
+def : Pat < (atomic_store_8 (HexagonCONST32_GP tglobaladdr:$global),
+                            (i32 IntRegs:$src1)),
+            (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// Map from store(globaladdress) -> memb(#foo)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei8 (i32 IntRegs:$src1),
+          (HexagonCONST32_GP tglobaladdr:$global)),
+          (STb_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// Map from "i1 = constant<-1>; memw(CONST32(#foo)) = i1"
+//       to "r0 = 1; memw(#foo) = r0"
+let AddedComplexity = 100 in
+def : Pat<(store (i1 -1), (HexagonCONST32_GP tglobaladdr:$global)),
+          (STb_GP_V4 tglobaladdr:$global, (TFRI 1))>;
+
+def : Pat<(atomic_store_16 (HexagonCONST32_GP tglobaladdr:$global),
+                           (i32 IntRegs:$src1)),
+          (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// Map from store(globaladdress) -> memh(#foo)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei16 (i32 IntRegs:$src1),
+                         (HexagonCONST32_GP tglobaladdr:$global)),
+          (STh_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// 32 bit atomic store
+def : Pat<(atomic_store_32 (HexagonCONST32_GP tglobaladdr:$global),
+                           (i32 IntRegs:$src1)),
+          (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+// Map from store(globaladdress) -> memw(#foo)
+let AddedComplexity = 100 in
+def : Pat<(store (i32 IntRegs:$src1), (HexagonCONST32_GP tglobaladdr:$global)),
+          (STw_GP_V4 tglobaladdr:$global, (i32 IntRegs:$src1))>;
+
+//===----------------------------------------------------------------------===//
+// Multiclass for the load instructions with absolute addressing mode.
+//===----------------------------------------------------------------------===//
+multiclass LD_Abs_Predbase<string mnemonic, RegisterClass RC, bit isNot,
+                           bit isPredNew> {
+  let PNewValue = !if(isPredNew, "new", "") in
+  def NAME : LDInst2<(outs RC:$dst),
+            (ins PredRegs:$src1, globaladdressExt:$absaddr),
+            !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
+            ") ")#"$dst = "#mnemonic#"(##$absaddr)",
+            []>,
+            Requires<[HasV4T]>;
+}
+
+multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bit PredNot> {
+  let PredSense = !if(PredNot, "false", "true") in {
+    defm _c#NAME : LD_Abs_Predbase<mnemonic, RC, PredNot, 0>;
+    // Predicate new
+    defm _cdn#NAME : LD_Abs_Predbase<mnemonic, RC, PredNot, 1>;
+  }
+}
+
+let isExtended = 1, neverHasSideEffects = 1 in
+multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC> {
+  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
+    let  opExtendable = 1, isPredicable = 1 in
+    def NAME#_V4 : LDInst2<(outs RC:$dst),
+            (ins globaladdressExt:$absaddr),
+            "$dst = "#mnemonic#"(##$absaddr)",
+            []>,
+            Requires<[HasV4T]>;
+
+    let opExtendable = 2, isPredicated = 1 in {
+      defm Pt_V4 : LD_Abs_Pred<mnemonic, RC, 0>;
+      defm NotPt_V4 : LD_Abs_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+let addrMode = Absolute in {
+    defm LDrib_abs  : LD_Abs<"memb", "LDrib", IntRegs>, AddrModeRel;
+    defm LDriub_abs : LD_Abs<"memub", "LDriub", IntRegs>, AddrModeRel;
+    defm LDrih_abs  : LD_Abs<"memh", "LDrih", IntRegs>, AddrModeRel;
+    defm LDriuh_abs : LD_Abs<"memuh", "LDriuh", IntRegs>, AddrModeRel;
+    defm LDriw_abs  : LD_Abs<"memw", "LDriw", IntRegs>, AddrModeRel;
+    defm LDrid_abs : LD_Abs<"memd",  "LDrid", DoubleRegs>, AddrModeRel;
+}
+
+let Predicates = [HasV4T], AddedComplexity  = 30 in
+def : Pat<(i32 (load (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDriw_abs_V4 tglobaladdr: $absaddr)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (sextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDrib_abs_V4 tglobaladdr:$absaddr)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (zextloadi8 (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDriub_abs_V4 tglobaladdr:$absaddr)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (sextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDrih_abs_V4 tglobaladdr:$absaddr)>;
+
+let Predicates = [HasV4T], AddedComplexity=30 in
+def : Pat<(i32 (zextloadi16 (HexagonCONST32 tglobaladdr:$absaddr))),
+          (LDriuh_abs_V4 tglobaladdr:$absaddr)>;
+
+//===----------------------------------------------------------------------===//
+// multiclass for load instructions with GP-relative addressing mode.
+// Rx=mem[bhwd](##global)
+// if ([!]Pv[.new]) Rx=mem[bhwd](##global)
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1, validSubTargets = HasV4SubT in
+multiclass LD_GP<string mnemonic, string BaseOp, RegisterClass RC> {
+  let BaseOpcode = BaseOp in {
+    let isPredicable = 1 in
+    def NAME#_V4 : LDInst2<(outs RC:$dst),
+            (ins globaladdress:$global),
+            "$dst = "#mnemonic#"(#$global)",
+            []>;
+
+    let isExtended = 1, opExtendable = 2, isPredicated = 1 in {
+      defm Pt_V4 : LD_Abs_Pred<mnemonic, RC, 0>;
+      defm NotPt_V4 : LD_Abs_Pred<mnemonic, RC, 1>;
+    }
+  }
+}
+
+defm LDd_GP  : LD_GP<"memd",  "LDd_GP",  DoubleRegs>;
+defm LDb_GP  : LD_GP<"memb",  "LDb_GP",  IntRegs>;
+defm LDub_GP : LD_GP<"memub", "LDub_GP", IntRegs>;
+defm LDh_GP  : LD_GP<"memh",  "LDh_GP",  IntRegs>;
+defm LDuh_GP : LD_GP<"memuh", "LDuh_GP", IntRegs>;
+defm LDw_GP  : LD_GP<"memw",  "LDw_GP",  IntRegs>;
+
+def : Pat <(atomic_load_64 (HexagonCONST32_GP tglobaladdr:$global)),
+           (i64 (LDd_GP_V4 tglobaladdr:$global))>;
+
+def : Pat <(atomic_load_32 (HexagonCONST32_GP tglobaladdr:$global)),
+           (i32 (LDw_GP_V4 tglobaladdr:$global))>;
+
+def : Pat <(atomic_load_16 (HexagonCONST32_GP tglobaladdr:$global)),
+           (i32 (LDuh_GP_V4 tglobaladdr:$global))>;
+
+def : Pat <(atomic_load_8 (HexagonCONST32_GP tglobaladdr:$global)),
+           (i32 (LDub_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memw(#foo + 0)
+let AddedComplexity = 100 in
+def : Pat <(i64 (load (HexagonCONST32_GP tglobaladdr:$global))),
+           (i64 (LDd_GP_V4 tglobaladdr:$global))>;
+
+// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd
+let AddedComplexity = 100 in
+def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))),
+           (i1 (TFR_PdRs (i32 (LDb_GP_V4 tglobaladdr:$global))))>;
+
+// When the Interprocedural Global Variable optimizer realizes that a certain
+// global variable takes only two constant values, it shrinks the global to
+// a boolean. Catch those loads here in the following 3 patterns.
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memb(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memb(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDb_GP_V4 tglobaladdr:$global))>;
+
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi1 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDub_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memub(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi8 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDub_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memh(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (extloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDh_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memh(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (sextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDh_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memuh(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (zextloadi16 (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDuh_GP_V4 tglobaladdr:$global))>;
+
+// Map from load(globaladdress) -> memw(#foo)
+let AddedComplexity = 100 in
+def : Pat <(i32 (load (HexagonCONST32_GP tglobaladdr:$global))),
+           (i32 (LDw_GP_V4 tglobaladdr:$global))>;
+
+
+// Transfer global address into a register
+let AddedComplexity=50, isMoveImm = 1, isReMaterializable = 1 in
+def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins globaladdress:$src1),
+           "$dst = ##$src1",
+           [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>,
+           Requires<[HasV4T]>;
+
+// Transfer a block address into a register
+def : Pat<(HexagonCONST32_GP tblockaddress:$src1),
+          (TFRI_V4 tblockaddress:$src1)>,
+          Requires<[HasV4T]>;
+
+let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+def TFRI_cPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+                           (ins PredRegs:$src1, globaladdress:$src2),
+           "if($src1) $dst = ##$src2",
+           []>,
+           Requires<[HasV4T]>;
+
+let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+def TFRI_cNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+                              (ins PredRegs:$src1, globaladdress:$src2),
+           "if(!$src1) $dst = ##$src2",
+           []>,
+           Requires<[HasV4T]>;
+
+let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+def TFRI_cdnPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+                             (ins PredRegs:$src1, globaladdress:$src2),
+           "if($src1.new) $dst = ##$src2",
+           []>,
+           Requires<[HasV4T]>;
+
+let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+def TFRI_cdnNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
+                                (ins PredRegs:$src1, globaladdress:$src2),
+           "if(!$src1.new) $dst = ##$src2",
+           []>,
+           Requires<[HasV4T]>;
+
+let AddedComplexity = 50, Predicates = [HasV4T] in
+def : Pat<(HexagonCONST32_GP tglobaladdr:$src1),
+           (TFRI_V4 tglobaladdr:$src1)>;
+
+
+// Load - Indirect with long offset: These instructions take global address
+// as an operand
+let AddedComplexity = 10 in
+def LDrid_ind_lo_V4 : LDInst<(outs DoubleRegs:$dst),
+            (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$offset),
+            "$dst=memd($src1<<#$src2+##$offset)",
+            [(set (i64 DoubleRegs:$dst),
+                  (load (add (shl IntRegs:$src1, u2ImmPred:$src2),
+                        (HexagonCONST32 tglobaladdr:$offset))))]>,
+            Requires<[HasV4T]>;
+
+let AddedComplexity = 10 in
+multiclass LD_indirect_lo<string OpcStr, PatFrag OpNode> {
+  def _lo_V4 : LDInst<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$offset),
+            !strconcat("$dst = ",
+            !strconcat(OpcStr, "($src1<<#$src2+##$offset)")),
+            [(set IntRegs:$dst,
+                  (i32 (OpNode (add (shl IntRegs:$src1, u2ImmPred:$src2),
+                          (HexagonCONST32 tglobaladdr:$offset)))))]>,
+            Requires<[HasV4T]>;
+}
+
+defm LDrib_ind : LD_indirect_lo<"memb", sextloadi8>;
+defm LDriub_ind : LD_indirect_lo<"memub", zextloadi8>;
+defm LDrih_ind : LD_indirect_lo<"memh", sextloadi16>;
+defm LDriuh_ind : LD_indirect_lo<"memuh", zextloadi16>;
+defm LDriw_ind : LD_indirect_lo<"memw", load>;
+
+// Store - Indirect with long offset: These instructions take global address
+// as an operand
+let AddedComplexity = 10 in
+def STrid_ind_lo_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$src3,
+                 DoubleRegs:$src4),
+            "memd($src1<<#$src2+#$src3) = $src4",
+            [(store (i64 DoubleRegs:$src4),
+                 (add (shl IntRegs:$src1, u2ImmPred:$src2),
+                      (HexagonCONST32 tglobaladdr:$src3)))]>,
+             Requires<[HasV4T]>;
+
+let AddedComplexity = 10 in
+multiclass ST_indirect_lo<string OpcStr, PatFrag OpNode> {
+  def _lo_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u2Imm:$src2, globaladdress:$src3,
+                 IntRegs:$src4),
+            !strconcat(OpcStr, "($src1<<#$src2+##$src3) = $src4"),
+            [(OpNode (i32 IntRegs:$src4),
+                 (add (shl IntRegs:$src1, u2ImmPred:$src2),
+                      (HexagonCONST32 tglobaladdr:$src3)))]>,
+             Requires<[HasV4T]>;
+}
+
+defm STrib_ind : ST_indirect_lo<"memb", truncstorei8>;
+defm STrih_ind : ST_indirect_lo<"memh", truncstorei16>;
+defm STriw_ind : ST_indirect_lo<"memw", store>;
+
+// Store - absolute addressing mode: These instruction take constant
+// value as the extended operand.
+multiclass ST_absimm<string OpcStr> {
+let isExtended = 1, opExtendable = 0, isPredicable = 1,
+validSubTargets = HasV4SubT in
+  def _abs_V4 : STInst2<(outs),
+            (ins u0AlwaysExt:$src1, IntRegs:$src2),
+            !strconcat(OpcStr, "(##$src1) = $src2"),
+            []>,
+            Requires<[HasV4T]>;
+
+let isExtended = 1, opExtendable = 1, isPredicated = 1,
+validSubTargets = HasV4SubT in {
+  def _abs_cPt_V4 : STInst2<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3),
+            !strconcat("if ($src1)", !strconcat(OpcStr, "(##$src2) = $src3")),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _abs_cNotPt_V4 : STInst2<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3),
+            !strconcat("if (!$src1)", !strconcat(OpcStr, "(##$src2) = $src3")),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _abs_cdnPt_V4 : STInst2<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3),
+            !strconcat("if ($src1.new)",
+            !strconcat(OpcStr, "(##$src2) = $src3")),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _abs_cdnNotPt_V4 : STInst2<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3),
+            !strconcat("if (!$src1.new)",
+            !strconcat(OpcStr, "(##$src2) = $src3")),
+            []>,
+            Requires<[HasV4T]>;
+}
+
+let isExtended = 1, opExtendable = 0, mayStore = 1, isNVStore = 1,
+validSubTargets = HasV4SubT in
+  def _abs_nv_V4 : NVInst_V4<(outs),
+            (ins u0AlwaysExt:$src1, IntRegs:$src2),
+            !strconcat(OpcStr, "(##$src1) = $src2.new"),
+            []>,
+            Requires<[HasV4T]>;
+
+let isExtended = 1, opExtendable = 1, mayStore = 1, isPredicated = 1,
+isNVStore = 1, validSubTargets = HasV4SubT in {
+  def _abs_cPt_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3),
+            !strconcat("if ($src1)",
+            !strconcat(OpcStr, "(##$src2) = $src3.new")),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _abs_cNotPt_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3),
+            !strconcat("if (!$src1)",
+            !strconcat(OpcStr, "(##$src2) = $src3.new")),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _abs_cdnPt_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3),
+            !strconcat("if ($src1.new)",
+            !strconcat(OpcStr, "(##$src2) = $src3.new")),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _abs_cdnNotPt_nv_V4 : NVInst_V4<(outs),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2, IntRegs:$src3),
+            !strconcat("if (!$src1.new)",
+            !strconcat(OpcStr, "(##$src2) = $src3.new")),
+            []>,
+            Requires<[HasV4T]>;
+}
+}
+
+defm STrib_imm : ST_absimm<"memb">;
+defm STrih_imm : ST_absimm<"memh">;
+defm STriw_imm : ST_absimm<"memw">;
+
+let Predicates = [HasV4T], AddedComplexity  = 30 in {
+def : Pat<(truncstorei8 (i32 IntRegs:$src1), u0AlwaysExtPred:$src2),
+          (STrib_imm_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>;
+
+def : Pat<(truncstorei16 (i32 IntRegs:$src1), u0AlwaysExtPred:$src2),
+          (STrih_imm_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>;
+
+def : Pat<(store (i32 IntRegs:$src1), u0AlwaysExtPred:$src2),
+          (STriw_imm_abs_V4 u0AlwaysExtPred:$src2, IntRegs: $src1)>;
+}
+
+// Load - absolute addressing mode: These instruction take constant
+// value as the extended operand
+
+multiclass LD_absimm<string OpcStr> {
+let isExtended = 1, opExtendable = 1, isPredicable = 1,
+validSubTargets = HasV4SubT in
+  def _abs_V4 : LDInst2<(outs IntRegs:$dst),
+            (ins u0AlwaysExt:$src),
+            !strconcat("$dst = ",
+            !strconcat(OpcStr, "(##$src)")),
+            []>,
+            Requires<[HasV4T]>;
+
+let isExtended = 1, opExtendable = 2, isPredicated = 1,
+validSubTargets = HasV4SubT in {
+  def _abs_cPt_V4 : LDInst2<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2),
+            !strconcat("if ($src1) $dst = ",
+            !strconcat(OpcStr, "(##$src2)")),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _abs_cNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2),
+            !strconcat("if (!$src1) $dst = ",
+            !strconcat(OpcStr, "(##$src2)")),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _abs_cdnPt_V4 : LDInst2<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2),
+            !strconcat("if ($src1.new) $dst = ",
+            !strconcat(OpcStr, "(##$src2)")),
+            []>,
+            Requires<[HasV4T]>;
+
+  def _abs_cdnNotPt_V4 : LDInst2<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, u0AlwaysExt:$src2),
+            !strconcat("if (!$src1.new) $dst = ",
+            !strconcat(OpcStr, "(##$src2)")),
+            []>,
+            Requires<[HasV4T]>;
+}
+}
+
+defm LDrib_imm  : LD_absimm<"memb">;
+defm LDriub_imm : LD_absimm<"memub">;
+defm LDrih_imm  : LD_absimm<"memh">;
+defm LDriuh_imm : LD_absimm<"memuh">;
+defm LDriw_imm  : LD_absimm<"memw">;
+
+let Predicates = [HasV4T], AddedComplexity  = 30 in {
+def : Pat<(i32 (load u0AlwaysExtPred:$src)),
+          (LDriw_imm_abs_V4 u0AlwaysExtPred:$src)>;
+
+def : Pat<(i32 (sextloadi8 u0AlwaysExtPred:$src)),
+          (LDrib_imm_abs_V4 u0AlwaysExtPred:$src)>;
+
+def : Pat<(i32 (zextloadi8 u0AlwaysExtPred:$src)),
+          (LDriub_imm_abs_V4 u0AlwaysExtPred:$src)>;
+
+def : Pat<(i32 (sextloadi16 u0AlwaysExtPred:$src)),
+          (LDrih_imm_abs_V4 u0AlwaysExtPred:$src)>;
+
+def : Pat<(i32 (zextloadi16 u0AlwaysExtPred:$src)),
+          (LDriuh_imm_abs_V4 u0AlwaysExtPred:$src)>;
+}
+
+// Indexed store double word - global address.
+// memw(Rs+#u6:2)=#S8
+let AddedComplexity = 10 in
+def STriw_offset_ext_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u6_2Imm:$src2, globaladdress:$src3),
+            "memw($src1+#$src2) = ##$src3",
+            [(store (HexagonCONST32 tglobaladdr:$src3),
+                    (add IntRegs:$src1, u6_2ImmPred:$src2))]>,
+            Requires<[HasV4T]>;
+
+
+// Indexed store double word - global address.
+// memw(Rs+#u6:2)=#S8
+let AddedComplexity = 10 in
+def STrih_offset_ext_V4 : STInst<(outs),
+            (ins IntRegs:$src1, u6_1Imm:$src2, globaladdress:$src3),
+            "memh($src1+#$src2) = ##$src3",
+            [(truncstorei16 (HexagonCONST32 tglobaladdr:$src3),
+                    (add IntRegs:$src1, u6_1ImmPred:$src2))]>,
+            Requires<[HasV4T]>;
+// Map from store(globaladdress + x) -> memd(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(store (i64 DoubleRegs:$src1),
+                 FoldGlobalAddrGP:$addr),
+          (STrid_abs_V4 FoldGlobalAddrGP:$addr, (i64 DoubleRegs:$src1))>,
+          Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_64 FoldGlobalAddrGP:$addr,
+                           (i64 DoubleRegs:$src1)),
+          (STrid_abs_V4 FoldGlobalAddrGP:$addr, (i64 DoubleRegs:$src1))>,
+          Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memb(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei8 (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr),
+          (STrib_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_8 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)),
+          (STrib_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(truncstorei16 (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr),
+          (STrih_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_16 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)),
+          (STrih_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+// Map from store(globaladdress + x) -> memw(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(store (i32 IntRegs:$src1), FoldGlobalAddrGP:$addr),
+          (STriw_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_store_32 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1)),
+          (STriw_abs_V4 FoldGlobalAddrGP:$addr, (i32 IntRegs:$src1))>,
+            Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memd(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i64 (load FoldGlobalAddrGP:$addr)),
+          (i64 (LDrid_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_load_64 FoldGlobalAddrGP:$addr),
+          (i64 (LDrid_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memb(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (extloadi8 FoldGlobalAddrGP:$addr)),
+          (i32 (LDrib_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memb(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (sextloadi8 FoldGlobalAddrGP:$addr)),
+          (i32 (LDrib_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+//let AddedComplexity = 100 in
+let AddedComplexity = 100 in
+def : Pat<(i32 (extloadi16 FoldGlobalAddrGP:$addr)),
+          (i32 (LDrih_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (sextloadi16 FoldGlobalAddrGP:$addr)),
+          (i32 (LDrih_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memuh(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (zextloadi16 FoldGlobalAddrGP:$addr)),
+          (i32 (LDriuh_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_load_16 FoldGlobalAddrGP:$addr),
+          (i32 (LDriuh_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memub(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (zextloadi8 FoldGlobalAddrGP:$addr)),
+          (i32 (LDriub_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_load_8 FoldGlobalAddrGP:$addr),
+          (i32 (LDriub_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+// Map from load(globaladdress + x) -> memw(#foo + x)
+let AddedComplexity = 100 in
+def : Pat<(i32 (load FoldGlobalAddrGP:$addr)),
+          (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
+def : Pat<(atomic_load_32 FoldGlobalAddrGP:$addr),
+          (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>,
+           Requires<[HasV4T]>;
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
new file mode 100644
index 000000000000..92d098cc0446
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
@@ -0,0 +1,626 @@
+def SDTHexagonFCONST32 : SDTypeProfile<1, 1, [
+                                            SDTCisVT<0, f32>,
+                                            SDTCisPtrTy<1>]>;
+def HexagonFCONST32 : SDNode<"HexagonISD::FCONST32",     SDTHexagonFCONST32>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def FCONST32_nsdata : LDInst<(outs IntRegs:$dst), (ins globaladdress:$global),
+              "$dst = CONST32(#$global)",
+              [(set (f32 IntRegs:$dst),
+              (HexagonFCONST32 tglobaladdr:$global))]>,
+               Requires<[HasV5T]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST64_Float_Real : LDInst<(outs DoubleRegs:$dst), (ins f64imm:$src1),
+                       "$dst = CONST64(#$src1)",
+                       [(set DoubleRegs:$dst, fpimm:$src1)]>,
+          Requires<[HasV5T]>;
+
+let isReMaterializable = 1, isMoveImm = 1 in
+def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1),
+                       "$dst = CONST32(#$src1)",
+                       [(set IntRegs:$dst, fpimm:$src1)]>,
+          Requires<[HasV5T]>;
+
+// Transfer immediate float.
+// Only works with single precision fp value.
+// For double precision, use CONST64_float_real, as 64bit transfer
+// can only hold 40-bit values - 32 from const ext + 8 bit immediate.
+let isMoveImm = 1, isReMaterializable = 1, isPredicable = 1 in
+def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32imm:$src1),
+           "$dst = ##$src1",
+           [(set IntRegs:$dst, fpimm:$src1)]>,
+          Requires<[HasV5T]>;
+
+def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst),
+                          (ins PredRegs:$src1, f32imm:$src2),
+           "if ($src1) $dst = ##$src2",
+           []>,
+          Requires<[HasV5T]>;
+
+let isPredicated = 1 in
+def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst),
+                             (ins PredRegs:$src1, f32imm:$src2),
+           "if (!$src1) $dst = ##$src2",
+           []>,
+          Requires<[HasV5T]>;
+
+// Convert single precision to double precision and vice-versa.
+def CONVERT_sf2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+                "$dst = convert_sf2df($src)",
+                [(set DoubleRegs:$dst, (fextend IntRegs:$src))]>,
+          Requires<[HasV5T]>;
+
+def CONVERT_df2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+                "$dst = convert_df2sf($src)",
+                [(set IntRegs:$dst, (fround DoubleRegs:$src))]>,
+          Requires<[HasV5T]>;
+
+
+// Load.
+def LDrid_f : LDInst<(outs DoubleRegs:$dst),
+            (ins MEMri:$addr),
+            "$dst = memd($addr)",
+            [(set DoubleRegs:$dst, (f64 (load ADDRriS11_3:$addr)))]>,
+          Requires<[HasV5T]>;
+
+
+let AddedComplexity = 20 in
+def LDrid_indexed_f : LDInst<(outs DoubleRegs:$dst),
+            (ins IntRegs:$src1, s11_3Imm:$offset),
+            "$dst = memd($src1+#$offset)",
+            [(set DoubleRegs:$dst, (f64 (load (add IntRegs:$src1,
+                                              s11_3ImmPred:$offset))))]>,
+          Requires<[HasV5T]>;
+
+def LDriw_f : LDInst<(outs IntRegs:$dst),
+            (ins MEMri:$addr), "$dst = memw($addr)",
+            [(set IntRegs:$dst, (f32 (load ADDRriS11_2:$addr)))]>,
+          Requires<[HasV5T]>;
+
+
+let AddedComplexity = 20 in
+def LDriw_indexed_f : LDInst<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, s11_2Imm:$offset),
+            "$dst = memw($src1+#$offset)",
+            [(set IntRegs:$dst, (f32 (load (add IntRegs:$src1,
+                                           s11_2ImmPred:$offset))))]>,
+          Requires<[HasV5T]>;
+
+// Store.
+def STriw_f : STInst<(outs),
+            (ins MEMri:$addr, IntRegs:$src1),
+            "memw($addr) = $src1",
+            [(store (f32 IntRegs:$src1), ADDRriS11_2:$addr)]>,
+          Requires<[HasV5T]>;
+
+let AddedComplexity = 10 in
+def STriw_indexed_f : STInst<(outs),
+            (ins IntRegs:$src1, s11_2Imm:$src2, IntRegs:$src3),
+            "memw($src1+#$src2) = $src3",
+            [(store (f32 IntRegs:$src3),
+                (add IntRegs:$src1, s11_2ImmPred:$src2))]>,
+          Requires<[HasV5T]>;
+
+def STrid_f : STInst<(outs),
+            (ins MEMri:$addr, DoubleRegs:$src1),
+            "memd($addr) = $src1",
+            [(store (f64 DoubleRegs:$src1), ADDRriS11_2:$addr)]>,
+          Requires<[HasV5T]>;
+
+// Indexed store double word.
+let AddedComplexity = 10 in
+def STrid_indexed_f : STInst<(outs),
+            (ins IntRegs:$src1, s11_3Imm:$src2,  DoubleRegs:$src3),
+            "memd($src1+#$src2) = $src3",
+            [(store (f64 DoubleRegs:$src3),
+                                (add IntRegs:$src1, s11_3ImmPred:$src2))]>,
+          Requires<[HasV5T]>;
+
+
+// Add
+let isCommutable = 1 in
+def fADD_rr : ALU64_rr<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = sfadd($src1, $src2)",
+            [(set IntRegs:$dst, (fadd IntRegs:$src1, IntRegs:$src2))]>,
+          Requires<[HasV5T]>;
+
+let isCommutable = 1 in
+def fADD64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = dfadd($src1, $src2)",
+               [(set DoubleRegs:$dst, (fadd DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>,
+          Requires<[HasV5T]>;
+
+def fSUB_rr : ALU64_rr<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = sfsub($src1, $src2)",
+            [(set IntRegs:$dst, (fsub IntRegs:$src1, IntRegs:$src2))]>,
+          Requires<[HasV5T]>;
+
+def fSUB64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = dfsub($src1, $src2)",
+               [(set DoubleRegs:$dst, (fsub DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>,
+               Requires<[HasV5T]>;
+
+let isCommutable = 1 in
+def fMUL_rr : ALU64_rr<(outs IntRegs:$dst),
+            (ins IntRegs:$src1, IntRegs:$src2),
+            "$dst = sfmpy($src1, $src2)",
+            [(set IntRegs:$dst, (fmul IntRegs:$src1, IntRegs:$src2))]>,
+            Requires<[HasV5T]>;
+
+let isCommutable = 1 in
+def fMUL64_rr : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1,
+                                                     DoubleRegs:$src2),
+               "$dst = dfmpy($src1, $src2)",
+               [(set DoubleRegs:$dst, (fmul DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>,
+               Requires<[HasV5T]>;
+
+// Compare.
+let isCompare = 1 in {
+multiclass FCMP64_rr<string OpcStr, PatFrag OpNode> {
+  def _rr : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$b, DoubleRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                 [(set PredRegs:$dst,
+                        (OpNode (f64 DoubleRegs:$b), (f64 DoubleRegs:$c)))]>,
+                 Requires<[HasV5T]>;
+}
+
+multiclass FCMP32_rr<string OpcStr, PatFrag OpNode> {
+  def _rr : ALU64_rr<(outs PredRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                 !strconcat("$dst = ", !strconcat(OpcStr, "($b, $c)")),
+                 [(set PredRegs:$dst,
+                        (OpNode (f32 IntRegs:$b), (f32 IntRegs:$c)))]>,
+                 Requires<[HasV5T]>;
+}
+}
+
+defm FCMPOEQ64 : FCMP64_rr<"dfcmp.eq", setoeq>;
+defm FCMPUEQ64 : FCMP64_rr<"dfcmp.eq", setueq>;
+defm FCMPOGT64 : FCMP64_rr<"dfcmp.gt", setogt>;
+defm FCMPUGT64 : FCMP64_rr<"dfcmp.gt", setugt>;
+defm FCMPOGE64 : FCMP64_rr<"dfcmp.ge", setoge>;
+defm FCMPUGE64 : FCMP64_rr<"dfcmp.ge", setuge>;
+
+defm FCMPOEQ32 : FCMP32_rr<"sfcmp.eq", setoeq>;
+defm FCMPUEQ32 : FCMP32_rr<"sfcmp.eq", setueq>;
+defm FCMPOGT32 : FCMP32_rr<"sfcmp.gt", setogt>;
+defm FCMPUGT32 : FCMP32_rr<"sfcmp.gt", setugt>;
+defm FCMPOGE32 : FCMP32_rr<"sfcmp.ge", setoge>;
+defm FCMPUGE32 : FCMP32_rr<"sfcmp.ge", setuge>;
+
+// olt.
+def : Pat <(i1 (setolt (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (FCMPOGT32_rr IntRegs:$src2, IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setolt (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPOGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (FCMPOGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setolt (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPOGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+                        (f64 DoubleRegs:$src1)))>,
+      Requires<[HasV5T]>;
+
+// gt.
+def : Pat <(i1 (setugt (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGT64_rr (f64 DoubleRegs:$src1),
+                        (f64 (CONST64_Float_Real fpimm:$src2))))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setugt (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGT32_rr (f32 IntRegs:$src1), (f32 (TFRI_f fpimm:$src2))))>,
+      Requires<[HasV5T]>;
+
+// ult.
+def : Pat <(i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setult (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGT32_rr (f32 (TFRI_f fpimm:$src2)), (f32 IntRegs:$src1)))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat <(i1 (setult (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGT64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+                        (f64 DoubleRegs:$src1)))>,
+      Requires<[HasV5T]>;
+
+// le.
+// rs <= rt -> rt >= rs.
+def : Pat<(i1 (setole (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (FCMPOGE32_rr IntRegs:$src2, IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setole (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPOGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+
+// Rss <= Rtt -> Rtt >= Rss.
+def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (FCMPOGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setole (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPOGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+                                DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+// rs <= rt -> rt >= rs.
+def : Pat<(i1 (setule (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (FCMPUGE32_rr IntRegs:$src2, IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setule (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGE32_rr (f32 (TFRI_f fpimm:$src2)), IntRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+// Rss <= Rtt -> Rtt >= Rss.
+def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (FCMPUGE64_rr DoubleRegs:$src2, DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setule (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (FCMPUGE64_rr (f64 (CONST64_Float_Real fpimm:$src2)),
+                                DoubleRegs:$src1))>,
+      Requires<[HasV5T]>;
+
+// ne.
+def : Pat<(i1 (setone (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, IntRegs:$src2)))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+      (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1, IntRegs:$src2)))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+      (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1, DoubleRegs:$src2)))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setone (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (NOT_p (FCMPOEQ32_rr IntRegs:$src1, (f32 (TFRI_f fpimm:$src2)))))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setone (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (NOT_p (FCMPOEQ64_rr DoubleRegs:$src1,
+                              (f64 (CONST64_Float_Real fpimm:$src2)))))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f32 IntRegs:$src1), (fpimm:$src2))),
+      (i1 (NOT_p (FCMPUEQ32_rr IntRegs:$src1,  (f32 (TFRI_f fpimm:$src2)))))>,
+      Requires<[HasV5T]>;
+
+def : Pat<(i1 (setune (f64 DoubleRegs:$src1), (fpimm:$src2))),
+      (i1 (NOT_p (FCMPUEQ64_rr DoubleRegs:$src1,
+                              (f64 (CONST64_Float_Real fpimm:$src2)))))>,
+      Requires<[HasV5T]>;
+
+// Convert Integer to Floating Point.
+def CONVERT_d2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_d2sf($src)",
+              [(set (f32 IntRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_ud2sf : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_ud2sf($src)",
+              [(set (f32 IntRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_uw2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_uw2sf($src)",
+              [(set (f32 IntRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_w2sf : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_w2sf($src)",
+              [(set (f32 IntRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_d2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_d2df($src)",
+              [(set (f64 DoubleRegs:$dst), (sint_to_fp (i64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_ud2df : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_ud2df($src)",
+              [(set (f64 DoubleRegs:$dst), (uint_to_fp (i64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_uw2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_uw2df($src)",
+              [(set (f64 DoubleRegs:$dst), (uint_to_fp (i32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_w2df : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_w2df($src)",
+              [(set (f64 DoubleRegs:$dst), (sint_to_fp (i32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+// Convert Floating Point to Integer - default.
+def CONVERT_df2uw : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2uw($src):chop",
+              [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_df2w : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2w($src):chop",
+              [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_sf2uw : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2uw($src):chop",
+              [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_sf2w : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2w($src):chop",
+              [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_df2d : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2d($src):chop",
+              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_df2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2ud($src):chop",
+              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_sf2d : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2d($src):chop",
+              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+def CONVERT_sf2ud : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2ud($src):chop",
+              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T]>;
+
+// Convert Floating Point to Integer: non-chopped.
+let AddedComplexity = 20 in
+def CONVERT_df2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2uw($src)",
+              [(set (i32 IntRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_df2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2w($src)",
+              [(set (i32 IntRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2uw_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2uw($src)",
+              [(set (i32 IntRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2w_nchop : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2w($src)",
+              [(set (i32 IntRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_df2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2d($src)",
+              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_df2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+              "$dst = convert_df2ud($src)",
+              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f64 DoubleRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2d_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2d($src)",
+              [(set (i64 DoubleRegs:$dst), (fp_to_sint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+let AddedComplexity = 20 in
+def CONVERT_sf2ud_nchop : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+              "$dst = convert_sf2ud($src)",
+              [(set (i64 DoubleRegs:$dst), (fp_to_uint (f32 IntRegs:$src)))]>,
+              Requires<[HasV5T, IEEERndNearV5T]>;
+
+
+
+// Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
+def : Pat <(i32 (bitconvert (f32 IntRegs:$src))),
+           (i32 (TFR IntRegs:$src))>,
+          Requires<[HasV5T]>;
+
+def : Pat <(f32 (bitconvert (i32 IntRegs:$src))),
+           (f32 (TFR IntRegs:$src))>,
+          Requires<[HasV5T]>;
+
+def : Pat <(i64 (bitconvert (f64 DoubleRegs:$src))),
+           (i64 (TFR64 DoubleRegs:$src))>,
+          Requires<[HasV5T]>;
+
+def : Pat <(f64 (bitconvert (i64 DoubleRegs:$src))),
+           (f64 (TFR64 DoubleRegs:$src))>,
+          Requires<[HasV5T]>;
+
+// Floating point fused multiply-add.
+def FMADD_dp : ALU64_acc<(outs DoubleRegs:$dst),
+                  (ins DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+              "$dst += dfmpy($src2, $src3)",
+              [(set (f64 DoubleRegs:$dst),
+                  (fma DoubleRegs:$src2, DoubleRegs:$src3, DoubleRegs:$src1))],
+                  "$src1 = $dst">,
+              Requires<[HasV5T]>;
+
+def FMADD_sp : ALU64_acc<(outs IntRegs:$dst),
+                  (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+              "$dst += sfmpy($src2, $src3)",
+              [(set (f32 IntRegs:$dst),
+                  (fma IntRegs:$src2, IntRegs:$src3, IntRegs:$src1))],
+                  "$src1 = $dst">,
+              Requires<[HasV5T]>;
+
+
+// Floating point max/min.
+let AddedComplexity = 100 in
+def FMAX_dp : ALU64_rr<(outs DoubleRegs:$dst),
+                  (ins DoubleRegs:$src1, DoubleRegs:$src2),
+              "$dst = dfmax($src1, $src2)",
+              [(set DoubleRegs:$dst, (f64 (select (i1 (setolt DoubleRegs:$src2,
+                                                        DoubleRegs:$src1)),
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100 in
+def FMAX_sp : ALU64_rr<(outs IntRegs:$dst),
+                  (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = sfmax($src1, $src2)",
+              [(set IntRegs:$dst, (f32 (select (i1 (setolt IntRegs:$src2,
+                                                        IntRegs:$src1)),
+                                             IntRegs:$src1,
+                                             IntRegs:$src2)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100 in
+def FMIN_dp : ALU64_rr<(outs DoubleRegs:$dst),
+                  (ins DoubleRegs:$src1, DoubleRegs:$src2),
+              "$dst = dfmin($src1, $src2)",
+              [(set DoubleRegs:$dst, (f64 (select (i1 (setogt DoubleRegs:$src2,
+                                                        DoubleRegs:$src1)),
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100 in
+def FMIN_sp : ALU64_rr<(outs IntRegs:$dst),
+                  (ins IntRegs:$src1, IntRegs:$src2),
+              "$dst = sfmin($src1, $src2)",
+              [(set IntRegs:$dst, (f32 (select (i1 (setogt IntRegs:$src2,
+                                                        IntRegs:$src1)),
+                                             IntRegs:$src1,
+                                             IntRegs:$src2)))]>,
+               Requires<[HasV5T]>;
+
+// Pseudo instruction to encode a set of conditional transfers.
+// This instruction is used instead of a mux and trades-off codesize
+// for performance. We conduct this transformation optimistically in
+// the hope that these instructions get promoted to dot-new transfers.
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_rr_f : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1,
+                                                        IntRegs:$src2,
+                                                        IntRegs:$src3),
+                     "Error; should not emit",
+                     [(set IntRegs:$dst, (f32 (select PredRegs:$src1,
+                                                 IntRegs:$src2,
+                                                 IntRegs:$src3)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_rr64_f : ALU32_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
+                                                        DoubleRegs:$src2,
+                                                        DoubleRegs:$src3),
+                     "Error; should not emit",
+                     [(set DoubleRegs:$dst, (f64 (select PredRegs:$src1,
+                                                 DoubleRegs:$src2,
+                                                 DoubleRegs:$src3)))]>,
+               Requires<[HasV5T]>;
+
+
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ri_f : ALU32_rr<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, IntRegs:$src2, f32imm:$src3),
+            "Error; should not emit",
+            [(set IntRegs:$dst,
+             (f32 (select PredRegs:$src1, IntRegs:$src2, fpimm:$src3)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ir_f : ALU32_rr<(outs IntRegs:$dst),
+            (ins PredRegs:$src1, f32imm:$src2, IntRegs:$src3),
+            "Error; should not emit",
+            [(set IntRegs:$dst,
+             (f32 (select PredRegs:$src1, fpimm:$src2, IntRegs:$src3)))]>,
+               Requires<[HasV5T]>;
+
+let AddedComplexity = 100, isPredicated = 1 in
+def TFR_condset_ii_f : ALU32_rr<(outs IntRegs:$dst),
+                              (ins PredRegs:$src1, f32imm:$src2, f32imm:$src3),
+                     "Error; should not emit",
+                     [(set IntRegs:$dst, (f32 (select PredRegs:$src1,
+                                                 fpimm:$src2,
+                                                 fpimm:$src3)))]>,
+               Requires<[HasV5T]>;
+
+
+def : Pat <(select (i1 (setult (f32 IntRegs:$src1), (f32 IntRegs:$src2))),
+                   (f32 IntRegs:$src3),
+                   (f32 IntRegs:$src4)),
+    (TFR_condset_rr_f (FCMPUGT32_rr IntRegs:$src2, IntRegs:$src1), IntRegs:$src4,
+                      IntRegs:$src3)>, Requires<[HasV5T]>;
+
+def : Pat <(select (i1 (setult (f64 DoubleRegs:$src1), (f64 DoubleRegs:$src2))),
+                   (f64 DoubleRegs:$src3),
+                   (f64 DoubleRegs:$src4)),
+      (TFR_condset_rr64_f (FCMPUGT64_rr DoubleRegs:$src2, DoubleRegs:$src1),
+                DoubleRegs:$src4, DoubleRegs:$src3)>, Requires<[HasV5T]>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, #i, #j) => r0 = mux(p0, #j, #i).
+def : Pat <(select (not PredRegs:$src1), fpimm:$src2, fpimm:$src3),
+      (TFR_condset_ii_f PredRegs:$src1, fpimm:$src3, fpimm:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = select(p0, #i, r1)
+// => r0 = TFR_condset_ri(p0, r1, #i)
+def : Pat <(select (not PredRegs:$src1), fpimm:$src2, IntRegs:$src3),
+      (TFR_condset_ri_f PredRegs:$src1, IntRegs:$src3, fpimm:$src2)>;
+
+// Map from p0 = pnot(p0); r0 = mux(p0, r1, #i)
+// => r0 = TFR_condset_ir(p0, #i, r1)
+def : Pat <(select (not PredRegs:$src1), IntRegs:$src2, fpimm:$src3),
+      (TFR_condset_ir_f PredRegs:$src1, fpimm:$src3, IntRegs:$src2)>;
+
+def : Pat <(i32 (fp_to_sint (f64 DoubleRegs:$src1))),
+          (i32 (EXTRACT_SUBREG (i64 (CONVERT_df2d (f64 DoubleRegs:$src1))), subreg_loreg))>,
+          Requires<[HasV5T]>;
+
+def : Pat <(fabs (f32 IntRegs:$src1)),
+           (CLRBIT_31 (f32 IntRegs:$src1), 31)>,
+          Requires<[HasV5T]>;
+
+def : Pat <(fneg (f32 IntRegs:$src1)),
+           (TOGBIT_31 (f32 IntRegs:$src1), 31)>,
+          Requires<[HasV5T]>;
+
+/*
+def : Pat <(fabs (f64 DoubleRegs:$src1)),
+          (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>,
+          Requires<[HasV5T]>;
+
+def : Pat <(fabs (f64 DoubleRegs:$src1)),
+          (CLRBIT_31 (f32 (EXTRACT_SUBREG DoubleRegs:$src1, subreg_hireg)), 31)>,
+          Requires<[HasV5T]>;
+          */
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
new file mode 100644
index 000000000000..99f59d5ea669
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -0,0 +1,3503 @@
+//===-- HexagonIntrinsics.td - Instruction intrinsics ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is populated based on the following specs:
+// Hexagon V2 Architecture
+// Application-Level Specification
+// 80-V9418-8 Rev. B
+// March 4, 2008
+//===----------------------------------------------------------------------===//
+
+//
+// ALU 32 types.
+//
+
+class qi_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_ALU32_sis10<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_ALU32_sis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_ALU32_siu8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_ALU32_siu9<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u9Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_qisisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                      IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_ALU32_qis8si<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2,
+                                       IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_ALU32_qisis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                       s8Imm:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                        imm:$src3))]>;
+
+class si_ALU32_qis8s8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2, s8Imm:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, imm:$src3))]>;
+
+class si_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU32_sisi_sat<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU32_sisi_rnd<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU32_sis16<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s16Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_sis10<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_s10si<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins s10Imm:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "(#$src1, $src2)")),
+             [(set IntRegs:$dst, (IntID imm:$src1, IntRegs:$src2))]>;
+
+class si_lo_ALU32_siu16<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u16Imm:$src2),
+             !strconcat("$dst.l = ", !strconcat(opc , "#$src2")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_hi_ALU32_siu16<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, u16Imm:$src2),
+             !strconcat("$dst.h = ", !strconcat(opc , "#$src2")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_s16<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins s16Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1")),
+             [(set IntRegs:$dst, (IntID imm:$src1))]>;
+
+class di_ALU32_s8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs DoubleRegs:$dst), (ins s8Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
+
+class di_ALU64_di<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "$src")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class si_ALU32_si<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_ALU32_si_tfr<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "$src")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+//
+// ALU 64 types.
+//
+
+class si_ALU64_si_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_ALU64_didi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_ALU64_sidi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_ALU64_didi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_qididi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2,
+                                          DoubleRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2,
+                                           DoubleRegs:$src3))]>;
+
+class di_ALU64_sisi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_ALU64_didi_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_didi_rnd<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_didi_crnd<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):crnd")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_didi_rnd_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_ALU64_didi_crnd_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):crnd:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class qi_ALU64_didi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class si_ALU64_sisi<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_sat_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_sat_hh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_sat_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_sat_hl<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_sat_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_hh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_hl<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_l16_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_sat_hh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1.H, $src2.H):sat:<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_sat_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1.L, $src2.H):sat:<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_sat_hl<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1.H, $src2.L):sat:<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_sat_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1.L, $src2.L):sat:<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_hh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_hl<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_h16_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<16")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_lh<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_ll<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_ALU64_sisi_sat<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+//
+// SInst classes.
+//
+
+class qi_SInst_qi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src))]>;
+
+class qi_SInst_qi_pxfer<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "$src")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src))]>;
+
+class qi_SInst_qiqi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_qiqi_neg<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, !$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_SInst_di<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class di_SInst_di_sat<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src),
+             !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class si_SInst_di<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src)")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class si_SInst_di_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src))]>;
+
+class di_SInst_disi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class di_SInst_didi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+          !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+          [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class si_SInst_sisiu3<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, u3Imm:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                     imm:$src3))]>;
+
+class si_SInst_diu5<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_SInst_disi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class si_SInst_sidi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, DoubleRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_SInst_disisi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2,
+                                       IntRegs:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class di_SInst_sisi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_siu5<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_SInst_siu6<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u6Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_SInst_sisi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_SInst_si_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src):sat")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class di_SInst_qi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "($src)")),
+          [(set DoubleRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_SInst_qi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "$src")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_SInst_qiqi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src),
+          !strconcat("$dst = ", !strconcat(opc , "$src")),
+          [(set PredRegs:$dst, (IntID IntRegs:$src))]>;
+
+class si_SInst_sisi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_SInst_diu6<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_SInst_siu5<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_SInst_siu5_rnd<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_SInst_siu5u5<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2, u5Imm:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2, imm:$src3))]>;
+
+class si_SInst_sisisi_acc<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+              !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         IntRegs:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisisi_nac<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+              !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         IntRegs:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_didisi_acc<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_SInst_didisi_nac<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+          !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                        DoubleRegs:$src1, IntRegs:$src2))],
+          "$dst2 = $dst">;
+
+class si_SInst_sisiu5u5<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2, u5Imm:$src3),
+              !strconcat("$dst = ", !strconcat(opc ,
+                                               "($src1, #$src2, #$src3)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2, imm:$src3))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisidi<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        DoubleRegs:$src2),
+              !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         DoubleRegs:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_didiu6u6<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u6Imm:$src2, u6Imm:$src3),
+              !strconcat("$dst = ", !strconcat(opc ,
+                                               "($src1, #$src2, #$src3)")),
+              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                            imm:$src2, imm:$src3))],
+              "$dst2 = $dst">;
+
+class di_SInst_dididi<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+              !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                            DoubleRegs:$src1,
+                                            DoubleRegs:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_diu6u6<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2,
+                                       u6Imm:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2, #$src3)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2,
+                                        imm:$src3))]>;
+
+class di_SInst_didiqi<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                       IntRegs:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, $src3)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class di_SInst_didiu3<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                       u3Imm:$src3),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2, #$src3)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2,
+                                        imm:$src3))]>;
+
+class di_SInst_didisi_or<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+          !strconcat("$dst |= ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        IntRegs:$src2))],
+          "$dst2 = $dst">;
+
+class di_SInst_didisi_and<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+          !strconcat("$dst &= ", !strconcat(opc , "($src1, $src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        IntRegs:$src2))],
+          "$dst2 = $dst">;
+
+class di_SInst_didiu6_and<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u6Imm:$src2),
+          !strconcat("$dst &= ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        imm:$src2))],
+          "$dst2 = $dst">;
+
+class di_SInst_didiu6_or<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u6Imm:$src2),
+          !strconcat("$dst |= ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        imm:$src2))],
+          "$dst2 = $dst">;
+
+class di_SInst_didiu6_xor<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u6Imm:$src2),
+          !strconcat("$dst ^= ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        imm:$src2))],
+          "$dst2 = $dst">;
+
+class si_SInst_sisisi_and<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+              !strconcat("$dst &= ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         IntRegs:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisisi_or<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+              !strconcat("$dst |= ", !strconcat(opc , "($src1, $src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         IntRegs:$src2))],
+              "$dst2 = $dst">;
+
+
+class si_SInst_sisiu5_and<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst &= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisiu5_or<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst |= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisiu5_xor<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst ^= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisiu5_acc<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst += ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class si_SInst_sisiu5_nac<string opc, Intrinsic IntID>
+  : SInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u5Imm:$src2),
+              !strconcat("$dst -= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                         imm:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_didiu6_acc<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u5Imm:$src2),
+              !strconcat("$dst += ", !strconcat(opc , "($src1, #$src2)")),
+              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                            DoubleRegs:$src1, imm:$src2))],
+              "$dst2 = $dst">;
+
+class di_SInst_didiu6_nac<string opc, Intrinsic IntID>
+  : SInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           u5Imm:$src2),
+              !strconcat("$dst -= ", !strconcat(opc , "($src1, #$src2)")),
+              [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                            imm:$src2))],
+              "$dst2 = $dst">;
+
+
+//
+// MInst classes.
+//
+
+class di_MInst_sisi_rnd_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):<<1:rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_hh<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.L):<<1:rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_hl<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.L):rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.H):<<1:rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_lh<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.H):rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.L):<<1:rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_rnd_ll<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.L):rnd")),
+               [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_disisi_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst += ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_sat_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst += ", !strconcat(opc , "($src1, $src2*):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_sat_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc , "($src1, $src2*):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc ,
+                                               "($src1, $src2):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_s1_sat_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst += ", !strconcat(opc ,
+                                               "($src1, $src2*):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_s1_sat_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+             !strconcat("$dst -= ", !strconcat(opc ,
+                                               "($src1, $src2*):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2))],
+             "$dst2 = $dst">;
+
+class di_MInst_s8s8<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins s8Imm:$src1, s8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "(#$src1, #$src2)")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1, imm:$src2))]>;
+
+class si_MInst_sis9<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_MInst_sisi<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_hh<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<1")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_lh<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<1")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_hl<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<1")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_ll<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<1")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+
+class si_MInst_sisi_hh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):<<1")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_lh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):<<1")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_hl<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):<<1")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_ll<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):<<1")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_up<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_didi<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_MInst_didi_conj<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2*)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_MInst_sisi_s1_sat_conj<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2*):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_didi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):<<1:rnd:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_MInst_didi_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class di_MInst_didi_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):rnd:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class si_SInst_sisi_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_SInst_didi_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class si_SInst_disi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_l_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2.L):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_h_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2.H):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_sat_conj<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2*):rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_s1_rnd_sat_conj<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2*):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):rnd:sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisisi_xacc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3),
+             !strconcat("$dst ^= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3),
+             !strconcat("$dst += ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3),
+             !strconcat("$dst -= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        IntRegs:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisis8_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        s8Imm:$src3),
+             !strconcat("$dst += ", !strconcat(opc , "($src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        imm:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisis8_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        s8Imm:$src3),
+             !strconcat("$dst -= ", !strconcat(opc , "($src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                        imm:$src3))],
+             "$dst2 = $dst">;
+
+class si_MInst_sisiu4u5<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        u4Imm:$src2, u5Imm:$src3),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1, #$src2, #$src3)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          imm:$src2, imm:$src3))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisiu8_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        u8Imm:$src3),
+               !strconcat("$dst += ", !strconcat(opc , "($src2, #$src3)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                          imm:$src3))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisiu8_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src2,
+                                        u8Imm:$src3),
+               !strconcat("$dst -= ", !strconcat(opc , "($src2, #$src3)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src2,
+                                          imm:$src3))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.H)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.H)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_sat_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.L)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.L)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.H)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.H)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.L)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_acc_sat_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.L)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hh_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hh_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hl_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_hl_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.H, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_lh_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_lh_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_ll_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_nac_ll_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1.L, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                          IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):sat")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_sat_conj<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2*):sat")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_sisi_s1_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_didi_s1_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                           DoubleRegs:$src2))]>;
+
+class si_MInst_didi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, $src2):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class si_MInst_didi_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):rnd:sat")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class si_MInst_sisi_sat_hh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_hl<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.H, $src2.L):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_lh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.H):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.H):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_ll<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1.L, $src2.L):sat")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.L, $src2.L):<<1:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_hh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_hh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ", !strconcat(opc ,
+                                                "($src1.H, $src2.H):<<1:rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_hh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc ,
+                                     "($src1.H, $src2.H):<<1:rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_hl<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.H, $src2.L):rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.H, $src2.L):<<1:rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_hl<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.H, $src2.L):rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_hl_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.H, $src2.L):<<1:rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_lh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.H):rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_lh<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.H):rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.H):<<1:rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_lh_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.H):<<1:rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_ll<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.L):rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_sat_rnd_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.L):<<1:rnd:sat")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_ll<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.L):rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_sisi_rnd_ll_s1<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+               !strconcat("$dst = ",
+                          !strconcat(opc , "($src1.L, $src2.L):<<1:rnd")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_dididi_acc_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
+                                           DoubleRegs:$src1, DoubleRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2):sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc_rnd_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):rnd:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
+                                           DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+
+class di_MInst_dididi_acc_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2,
+                                           DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1:rnd:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_dididi_acc_conj<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1, $src2*)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.H)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.H, $src2.L)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.H)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ", !strconcat(opc , "($src1.L, $src2.L)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1.H, $src2.H):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1.H, $src2.L):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1.L, $src2.H):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1.L, $src2.L):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_hh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.H)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_hl<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.H, $src2.L)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_lh<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.H)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_ll<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ", !strconcat(opc , "($src1.L, $src2.L)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_hh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ",
+                          !strconcat(opc , "($src1.H, $src2.H):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_hl_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ",
+                          !strconcat(opc , "($src1.H, $src2.L):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_lh_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ",
+                          !strconcat(opc , "($src1.L, $src2.H):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_nac_ll_s1<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst -= ",
+                          !strconcat(opc , "($src1.L, $src2.L):<<1")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disisi_acc_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, IntRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, IntRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class di_MInst_disi_s1_sat<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2):<<1:sat")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class di_MInst_didisi_acc_s1_sat<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           IntRegs:$src2),
+               !strconcat("$dst += ",
+                          !strconcat(opc , "($src1, $src2):<<1:sat")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2,
+                                             DoubleRegs:$src1,
+                                             IntRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_disi_s1_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ",
+                        !strconcat(opc , "($src1, $src2):<<1:rnd:sat")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, IntRegs:$src2))]>;
+
+class si_MInst_didi<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+//
+// LDInst classes.
+//
+let mayLoad = 1, neverHasSideEffects = 1 in
+class di_LDInstPI_diu4<string opc, Intrinsic IntID>
+  : LDInstPI<(outs IntRegs:$dst, DoubleRegs:$dst2),
+           (ins IntRegs:$src1, IntRegs:$src2, CRRegs:$src3, s4Imm:$offset),
+           "$dst2 = memd($src1++#$offset:circ($src3))",
+           [],
+           "$src1 = $dst">;
+
+/********************************************************************
+*            ALU32/ALU                                              *
+*********************************************************************/
+
+// ALU32 / ALU / Add.
+def HEXAGON_A2_add:
+  si_ALU32_sisi                   <"add",      int_hexagon_A2_add>;
+def HEXAGON_A2_addi:
+  si_ALU32_sis16                  <"add",      int_hexagon_A2_addi>;
+
+// ALU32 / ALU / Logical operations.
+def HEXAGON_A2_and:
+  si_ALU32_sisi                   <"and",      int_hexagon_A2_and>;
+def HEXAGON_A2_andir:
+  si_ALU32_sis10                  <"and",      int_hexagon_A2_andir>;
+def HEXAGON_A2_not:
+  si_ALU32_si                     <"not",      int_hexagon_A2_not>;
+def HEXAGON_A2_or:
+  si_ALU32_sisi                   <"or",       int_hexagon_A2_or>;
+def HEXAGON_A2_orir:
+  si_ALU32_sis10                  <"or",       int_hexagon_A2_orir>;
+def HEXAGON_A2_xor:
+  si_ALU32_sisi                   <"xor",      int_hexagon_A2_xor>;
+
+// ALU32 / ALU / Negate.
+def HEXAGON_A2_neg:
+  si_ALU32_si                     <"neg",      int_hexagon_A2_neg>;
+
+// ALU32 / ALU / Subtract.
+def HEXAGON_A2_sub:
+  si_ALU32_sisi                   <"sub",      int_hexagon_A2_sub>;
+def HEXAGON_A2_subri:
+  si_ALU32_s10si                  <"sub",      int_hexagon_A2_subri>;
+
+// ALU32 / ALU / Transfer Immediate.
+def HEXAGON_A2_tfril:
+  si_lo_ALU32_siu16               <"",         int_hexagon_A2_tfril>;
+def HEXAGON_A2_tfrih:
+  si_hi_ALU32_siu16               <"",         int_hexagon_A2_tfrih>;
+def HEXAGON_A2_tfrsi:
+  si_ALU32_s16                    <"",         int_hexagon_A2_tfrsi>;
+def HEXAGON_A2_tfrpi:
+  di_ALU32_s8                     <"",         int_hexagon_A2_tfrpi>;
+
+// ALU32 / ALU / Transfer Register.
+def HEXAGON_A2_tfr:
+  si_ALU32_si_tfr                  <"",        int_hexagon_A2_tfr>;
+
+/********************************************************************
+*            ALU32/PERM                                             *
+*********************************************************************/
+
+// ALU32 / PERM / Combine.
+def HEXAGON_A2_combinew:
+  di_ALU32_sisi                   <"combine",  int_hexagon_A2_combinew>;
+def HEXAGON_A2_combine_hh:
+  si_MInst_sisi_hh                <"combine",  int_hexagon_A2_combine_hh>;
+def HEXAGON_A2_combine_lh:
+  si_MInst_sisi_lh                <"combine",  int_hexagon_A2_combine_lh>;
+def HEXAGON_A2_combine_hl:
+  si_MInst_sisi_hl                <"combine",  int_hexagon_A2_combine_hl>;
+def HEXAGON_A2_combine_ll:
+  si_MInst_sisi_ll                <"combine",  int_hexagon_A2_combine_ll>;
+def HEXAGON_A2_combineii:
+  di_MInst_s8s8                   <"combine",  int_hexagon_A2_combineii>;
+
+// ALU32 / PERM / Mux.
+def HEXAGON_C2_mux:
+  si_ALU32_qisisi                 <"mux",      int_hexagon_C2_mux>;
+def HEXAGON_C2_muxri:
+  si_ALU32_qis8si                 <"mux",      int_hexagon_C2_muxri>;
+def HEXAGON_C2_muxir:
+  si_ALU32_qisis8                 <"mux",      int_hexagon_C2_muxir>;
+def HEXAGON_C2_muxii:
+  si_ALU32_qis8s8                 <"mux",      int_hexagon_C2_muxii>;
+
+// ALU32 / PERM / Shift halfword.
+def HEXAGON_A2_aslh:
+  si_ALU32_si                     <"aslh",     int_hexagon_A2_aslh>;
+def HEXAGON_A2_asrh:
+  si_ALU32_si                     <"asrh",     int_hexagon_A2_asrh>;
+def SI_to_SXTHI_asrh:
+  si_ALU32_si                     <"asrh",     int_hexagon_SI_to_SXTHI_asrh>;
+
+// ALU32 / PERM / Sign/zero extend.
+def HEXAGON_A2_sxth:
+  si_ALU32_si                     <"sxth",     int_hexagon_A2_sxth>;
+def HEXAGON_A2_sxtb:
+  si_ALU32_si                     <"sxtb",     int_hexagon_A2_sxtb>;
+def HEXAGON_A2_zxth:
+  si_ALU32_si                     <"zxth",     int_hexagon_A2_zxth>;
+def HEXAGON_A2_zxtb:
+  si_ALU32_si                     <"zxtb",     int_hexagon_A2_zxtb>;
+
+/********************************************************************
+*            ALU32/PRED                                             *
+*********************************************************************/
+
+// ALU32 / PRED / Compare.
+def HEXAGON_C2_cmpeq:
+  qi_ALU32_sisi                   <"cmp.eq",   int_hexagon_C2_cmpeq>;
+def HEXAGON_C2_cmpeqi:
+  qi_ALU32_sis10                  <"cmp.eq",   int_hexagon_C2_cmpeqi>;
+def HEXAGON_C2_cmpgei:
+  qi_ALU32_sis8                   <"cmp.ge",   int_hexagon_C2_cmpgei>;
+def HEXAGON_C2_cmpgeui:
+  qi_ALU32_siu8                   <"cmp.geu",  int_hexagon_C2_cmpgeui>;
+def HEXAGON_C2_cmpgt:
+  qi_ALU32_sisi                   <"cmp.gt",   int_hexagon_C2_cmpgt>;
+def HEXAGON_C2_cmpgti:
+  qi_ALU32_sis10                  <"cmp.gt",   int_hexagon_C2_cmpgti>;
+def HEXAGON_C2_cmpgtu:
+  qi_ALU32_sisi                   <"cmp.gtu",  int_hexagon_C2_cmpgtu>;
+def HEXAGON_C2_cmpgtui:
+  qi_ALU32_siu9                   <"cmp.gtu",  int_hexagon_C2_cmpgtui>;
+def HEXAGON_C2_cmplt:
+  qi_ALU32_sisi                   <"cmp.lt",   int_hexagon_C2_cmplt>;
+def HEXAGON_C2_cmpltu:
+  qi_ALU32_sisi                   <"cmp.ltu",  int_hexagon_C2_cmpltu>;
+
+/********************************************************************
+*            ALU32/VH                                               *
+*********************************************************************/
+
+// ALU32 / VH / Vector add halfwords.
+// Rd32=vadd[u]h(Rs32,Rt32:sat]
+def HEXAGON_A2_svaddh:
+  si_ALU32_sisi                   <"vaddh",    int_hexagon_A2_svaddh>;
+def HEXAGON_A2_svaddhs:
+  si_ALU32_sisi_sat               <"vaddh",    int_hexagon_A2_svaddhs>;
+def HEXAGON_A2_svadduhs:
+  si_ALU32_sisi_sat               <"vadduh",   int_hexagon_A2_svadduhs>;
+
+// ALU32 / VH / Vector average halfwords.
+def HEXAGON_A2_svavgh:
+  si_ALU32_sisi                   <"vavgh",    int_hexagon_A2_svavgh>;
+def HEXAGON_A2_svavghs:
+  si_ALU32_sisi_rnd               <"vavgh",    int_hexagon_A2_svavghs>;
+def HEXAGON_A2_svnavgh:
+  si_ALU32_sisi                   <"vnavgh",   int_hexagon_A2_svnavgh>;
+
+// ALU32 / VH / Vector subtract halfwords.
+def HEXAGON_A2_svsubh:
+  si_ALU32_sisi                   <"vsubh",    int_hexagon_A2_svsubh>;
+def HEXAGON_A2_svsubhs:
+  si_ALU32_sisi_sat               <"vsubh",    int_hexagon_A2_svsubhs>;
+def HEXAGON_A2_svsubuhs:
+  si_ALU32_sisi_sat               <"vsubuh",   int_hexagon_A2_svsubuhs>;
+
+/********************************************************************
+*            ALU64/ALU                                              *
+*********************************************************************/
+
+// ALU64 / ALU / Add.
+def HEXAGON_A2_addp:
+  di_ALU64_didi                   <"add",      int_hexagon_A2_addp>;
+def HEXAGON_A2_addsat:
+  si_ALU64_sisi_sat               <"add",      int_hexagon_A2_addsat>;
+
+// ALU64 / ALU / Add halfword.
+// Even though the definition says hl, it should be lh -
+//so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits.
+def HEXAGON_A2_addh_l16_hl:
+  si_ALU64_sisi_l16_lh            <"add",      int_hexagon_A2_addh_l16_hl>;
+def HEXAGON_A2_addh_l16_ll:
+  si_ALU64_sisi_l16_ll            <"add",      int_hexagon_A2_addh_l16_ll>;
+
+def HEXAGON_A2_addh_l16_sat_hl:
+  si_ALU64_sisi_l16_sat_lh        <"add",      int_hexagon_A2_addh_l16_sat_hl>;
+def HEXAGON_A2_addh_l16_sat_ll:
+  si_ALU64_sisi_l16_sat_ll        <"add",      int_hexagon_A2_addh_l16_sat_ll>;
+
+def HEXAGON_A2_addh_h16_hh:
+  si_ALU64_sisi_h16_hh            <"add",      int_hexagon_A2_addh_h16_hh>;
+def HEXAGON_A2_addh_h16_hl:
+  si_ALU64_sisi_h16_hl            <"add",      int_hexagon_A2_addh_h16_hl>;
+def HEXAGON_A2_addh_h16_lh:
+  si_ALU64_sisi_h16_lh            <"add",      int_hexagon_A2_addh_h16_lh>;
+def HEXAGON_A2_addh_h16_ll:
+  si_ALU64_sisi_h16_ll            <"add",      int_hexagon_A2_addh_h16_ll>;
+
+def HEXAGON_A2_addh_h16_sat_hh:
+  si_ALU64_sisi_h16_sat_hh        <"add",      int_hexagon_A2_addh_h16_sat_hh>;
+def HEXAGON_A2_addh_h16_sat_hl:
+  si_ALU64_sisi_h16_sat_hl        <"add",      int_hexagon_A2_addh_h16_sat_hl>;
+def HEXAGON_A2_addh_h16_sat_lh:
+  si_ALU64_sisi_h16_sat_lh        <"add",      int_hexagon_A2_addh_h16_sat_lh>;
+def HEXAGON_A2_addh_h16_sat_ll:
+  si_ALU64_sisi_h16_sat_ll        <"add",      int_hexagon_A2_addh_h16_sat_ll>;
+
+// ALU64 / ALU / Compare.
+def HEXAGON_C2_cmpeqp:
+  qi_ALU64_didi                   <"cmp.eq",   int_hexagon_C2_cmpeqp>;
+def HEXAGON_C2_cmpgtp:
+  qi_ALU64_didi                   <"cmp.gt",   int_hexagon_C2_cmpgtp>;
+def HEXAGON_C2_cmpgtup:
+  qi_ALU64_didi                   <"cmp.gtu",  int_hexagon_C2_cmpgtup>;
+
+// ALU64 / ALU / Logical operations.
+def HEXAGON_A2_andp:
+  di_ALU64_didi                   <"and",      int_hexagon_A2_andp>;
+def HEXAGON_A2_orp:
+  di_ALU64_didi                   <"or",       int_hexagon_A2_orp>;
+def HEXAGON_A2_xorp:
+  di_ALU64_didi                   <"xor",      int_hexagon_A2_xorp>;
+
+// ALU64 / ALU / Maximum.
+def HEXAGON_A2_max:
+  si_ALU64_sisi                   <"max",      int_hexagon_A2_max>;
+def HEXAGON_A2_maxu:
+  si_ALU64_sisi                   <"maxu",     int_hexagon_A2_maxu>;
+
+// ALU64 / ALU / Minimum.
+def HEXAGON_A2_min:
+  si_ALU64_sisi                   <"min",      int_hexagon_A2_min>;
+def HEXAGON_A2_minu:
+  si_ALU64_sisi                   <"minu",     int_hexagon_A2_minu>;
+
+// ALU64 / ALU / Subtract.
+def HEXAGON_A2_subp:
+  di_ALU64_didi                   <"sub",      int_hexagon_A2_subp>;
+def HEXAGON_A2_subsat:
+  si_ALU64_sisi_sat               <"sub",      int_hexagon_A2_subsat>;
+
+// ALU64 / ALU / Subtract halfword.
+// Even though the definition says hl, it should be lh -
+//so DON'T change the class " si_ALU64_sisi_l16_lh " it inherits.
+def HEXAGON_A2_subh_l16_hl:
+  si_ALU64_sisi_l16_lh            <"sub",      int_hexagon_A2_subh_l16_hl>;
+def HEXAGON_A2_subh_l16_ll:
+  si_ALU64_sisi_l16_ll            <"sub",      int_hexagon_A2_subh_l16_ll>;
+
+def HEXAGON_A2_subh_l16_sat_hl:
+  si_ALU64_sisi_l16_sat_lh        <"sub",      int_hexagon_A2_subh_l16_sat_hl>;
+def HEXAGON_A2_subh_l16_sat_ll:
+  si_ALU64_sisi_l16_sat_ll        <"sub",      int_hexagon_A2_subh_l16_sat_ll>;
+
+def HEXAGON_A2_subh_h16_hh:
+  si_ALU64_sisi_h16_hh            <"sub",      int_hexagon_A2_subh_h16_hh>;
+def HEXAGON_A2_subh_h16_hl:
+  si_ALU64_sisi_h16_hl            <"sub",      int_hexagon_A2_subh_h16_hl>;
+def HEXAGON_A2_subh_h16_lh:
+  si_ALU64_sisi_h16_lh            <"sub",      int_hexagon_A2_subh_h16_lh>;
+def HEXAGON_A2_subh_h16_ll:
+  si_ALU64_sisi_h16_ll            <"sub",      int_hexagon_A2_subh_h16_ll>;
+
+def HEXAGON_A2_subh_h16_sat_hh:
+  si_ALU64_sisi_h16_sat_hh        <"sub",      int_hexagon_A2_subh_h16_sat_hh>;
+def HEXAGON_A2_subh_h16_sat_hl:
+  si_ALU64_sisi_h16_sat_hl        <"sub",      int_hexagon_A2_subh_h16_sat_hl>;
+def HEXAGON_A2_subh_h16_sat_lh:
+  si_ALU64_sisi_h16_sat_lh        <"sub",      int_hexagon_A2_subh_h16_sat_lh>;
+def HEXAGON_A2_subh_h16_sat_ll:
+  si_ALU64_sisi_h16_sat_ll        <"sub",      int_hexagon_A2_subh_h16_sat_ll>;
+
+// ALU64 / ALU / Transfer register.
+def HEXAGON_A2_tfrp:
+  di_ALU64_di                     <"",         int_hexagon_A2_tfrp>;
+
+/********************************************************************
+*            ALU64/BIT                                              *
+*********************************************************************/
+
+// ALU64 / BIT / Masked parity.
+def HEXAGON_S2_parityp:
+  si_ALU64_didi                   <"parity",   int_hexagon_S2_parityp>;
+
+/********************************************************************
+*            ALU64/PERM                                             *
+*********************************************************************/
+
+// ALU64 / PERM / Vector pack high and low halfwords.
+def HEXAGON_S2_packhl:
+  di_ALU64_sisi                   <"packhl",   int_hexagon_S2_packhl>;
+
+/********************************************************************
+*            ALU64/VB                                               *
+*********************************************************************/
+
+// ALU64 / VB / Vector add unsigned bytes.
+def HEXAGON_A2_vaddub:
+  di_ALU64_didi                   <"vaddub",   int_hexagon_A2_vaddub>;
+def HEXAGON_A2_vaddubs:
+  di_ALU64_didi_sat               <"vaddub",   int_hexagon_A2_vaddubs>;
+
+// ALU64 / VB / Vector average unsigned bytes.
+def HEXAGON_A2_vavgub:
+  di_ALU64_didi                   <"vavgub",   int_hexagon_A2_vavgub>;
+def HEXAGON_A2_vavgubr:
+  di_ALU64_didi_rnd               <"vavgub",   int_hexagon_A2_vavgubr>;
+
+// ALU64 / VB / Vector compare unsigned bytes.
+def HEXAGON_A2_vcmpbeq:
+  qi_ALU64_didi                   <"vcmpb.eq", int_hexagon_A2_vcmpbeq>;
+def HEXAGON_A2_vcmpbgtu:
+  qi_ALU64_didi                   <"vcmpb.gtu",int_hexagon_A2_vcmpbgtu>;
+
+// ALU64 / VB / Vector maximum/minimum unsigned bytes.
+def HEXAGON_A2_vmaxub:
+  di_ALU64_didi                   <"vmaxub",   int_hexagon_A2_vmaxub>;
+def HEXAGON_A2_vminub:
+  di_ALU64_didi                   <"vminub",   int_hexagon_A2_vminub>;
+
+// ALU64 / VB / Vector subtract unsigned bytes.
+def HEXAGON_A2_vsubub:
+  di_ALU64_didi                   <"vsubub",   int_hexagon_A2_vsubub>;
+def HEXAGON_A2_vsububs:
+  di_ALU64_didi_sat               <"vsubub",   int_hexagon_A2_vsububs>;
+
+// ALU64 / VB / Vector mux.
+def HEXAGON_C2_vmux:
+  di_ALU64_qididi                 <"vmux",     int_hexagon_C2_vmux>;
+
+
+/********************************************************************
+*            ALU64/VH                                               *
+*********************************************************************/
+
+// ALU64 / VH / Vector add halfwords.
+// Rdd64=vadd[u]h(Rss64,Rtt64:sat]
+def HEXAGON_A2_vaddh:
+  di_ALU64_didi                   <"vaddh",    int_hexagon_A2_vaddh>;
+def HEXAGON_A2_vaddhs:
+  di_ALU64_didi_sat               <"vaddh",    int_hexagon_A2_vaddhs>;
+def HEXAGON_A2_vadduhs:
+  di_ALU64_didi_sat               <"vadduh",   int_hexagon_A2_vadduhs>;
+
+// ALU64 / VH / Vector average halfwords.
+// Rdd64=v[n]avg[u]h(Rss64,Rtt64:rnd/:crnd][:sat]
+def HEXAGON_A2_vavgh:
+  di_ALU64_didi                   <"vavgh",    int_hexagon_A2_vavgh>;
+def HEXAGON_A2_vavghcr:
+  di_ALU64_didi_crnd              <"vavgh",    int_hexagon_A2_vavghcr>;
+def HEXAGON_A2_vavghr:
+  di_ALU64_didi_rnd               <"vavgh",    int_hexagon_A2_vavghr>;
+def HEXAGON_A2_vavguh:
+  di_ALU64_didi                   <"vavguh",   int_hexagon_A2_vavguh>;
+def HEXAGON_A2_vavguhr:
+  di_ALU64_didi_rnd               <"vavguh",   int_hexagon_A2_vavguhr>;
+def HEXAGON_A2_vnavgh:
+  di_ALU64_didi                   <"vnavgh",   int_hexagon_A2_vnavgh>;
+def HEXAGON_A2_vnavghcr:
+  di_ALU64_didi_crnd_sat          <"vnavgh",   int_hexagon_A2_vnavghcr>;
+def HEXAGON_A2_vnavghr:
+  di_ALU64_didi_rnd_sat           <"vnavgh",   int_hexagon_A2_vnavghr>;
+
+// ALU64 / VH / Vector compare halfwords.
+def HEXAGON_A2_vcmpheq:
+  qi_ALU64_didi                   <"vcmph.eq", int_hexagon_A2_vcmpheq>;
+def HEXAGON_A2_vcmphgt:
+  qi_ALU64_didi                   <"vcmph.gt", int_hexagon_A2_vcmphgt>;
+def HEXAGON_A2_vcmphgtu:
+  qi_ALU64_didi                   <"vcmph.gtu",int_hexagon_A2_vcmphgtu>;
+
+// ALU64 / VH / Vector maximum halfwords.
+def HEXAGON_A2_vmaxh:
+  di_ALU64_didi                   <"vmaxh",    int_hexagon_A2_vmaxh>;
+def HEXAGON_A2_vmaxuh:
+  di_ALU64_didi                   <"vmaxuh",   int_hexagon_A2_vmaxuh>;
+
+// ALU64 / VH / Vector minimum halfwords.
+def HEXAGON_A2_vminh:
+  di_ALU64_didi                   <"vminh",    int_hexagon_A2_vminh>;
+def HEXAGON_A2_vminuh:
+  di_ALU64_didi                   <"vminuh",   int_hexagon_A2_vminuh>;
+
+// ALU64 / VH / Vector subtract halfwords.
+def HEXAGON_A2_vsubh:
+  di_ALU64_didi                   <"vsubh",    int_hexagon_A2_vsubh>;
+def HEXAGON_A2_vsubhs:
+  di_ALU64_didi_sat               <"vsubh",    int_hexagon_A2_vsubhs>;
+def HEXAGON_A2_vsubuhs:
+  di_ALU64_didi_sat               <"vsubuh",   int_hexagon_A2_vsubuhs>;
+
+
+/********************************************************************
+*            ALU64/VW                                               *
+*********************************************************************/
+
+// ALU64 / VW / Vector add words.
+// Rdd32=vaddw(Rss32,Rtt32)[:sat]
+def HEXAGON_A2_vaddw:
+  di_ALU64_didi                   <"vaddw",    int_hexagon_A2_vaddw>;
+def HEXAGON_A2_vaddws:
+  di_ALU64_didi_sat               <"vaddw",   int_hexagon_A2_vaddws>;
+
+// ALU64 / VW / Vector average words.
+def HEXAGON_A2_vavguw:
+  di_ALU64_didi                   <"vavguw",   int_hexagon_A2_vavguw>;
+def HEXAGON_A2_vavguwr:
+  di_ALU64_didi_rnd               <"vavguw",   int_hexagon_A2_vavguwr>;
+def HEXAGON_A2_vavgw:
+  di_ALU64_didi                   <"vavgw",    int_hexagon_A2_vavgw>;
+def HEXAGON_A2_vavgwcr:
+  di_ALU64_didi_crnd              <"vavgw",    int_hexagon_A2_vavgwcr>;
+def HEXAGON_A2_vavgwr:
+  di_ALU64_didi_rnd               <"vavgw",    int_hexagon_A2_vavgwr>;
+def HEXAGON_A2_vnavgw:
+  di_ALU64_didi                   <"vnavgw",   int_hexagon_A2_vnavgw>;
+def HEXAGON_A2_vnavgwcr:
+  di_ALU64_didi_crnd_sat          <"vnavgw",   int_hexagon_A2_vnavgwcr>;
+def HEXAGON_A2_vnavgwr:
+  di_ALU64_didi_rnd_sat           <"vnavgw",   int_hexagon_A2_vnavgwr>;
+
+// ALU64 / VW / Vector compare words.
+def HEXAGON_A2_vcmpweq:
+  qi_ALU64_didi                   <"vcmpw.eq", int_hexagon_A2_vcmpweq>;
+def HEXAGON_A2_vcmpwgt:
+  qi_ALU64_didi                   <"vcmpw.gt", int_hexagon_A2_vcmpwgt>;
+def HEXAGON_A2_vcmpwgtu:
+  qi_ALU64_didi                   <"vcmpw.gtu",int_hexagon_A2_vcmpwgtu>;
+
+// ALU64 / VW / Vector maximum words.
+def HEXAGON_A2_vmaxw:
+  di_ALU64_didi                   <"vmaxw",    int_hexagon_A2_vmaxw>;
+def HEXAGON_A2_vmaxuw:
+  di_ALU64_didi                   <"vmaxuw",   int_hexagon_A2_vmaxuw>;
+
+// ALU64 / VW / Vector minimum words.
+def HEXAGON_A2_vminw:
+  di_ALU64_didi                   <"vminw",    int_hexagon_A2_vminw>;
+def HEXAGON_A2_vminuw:
+  di_ALU64_didi                   <"vminuw",   int_hexagon_A2_vminuw>;
+
+// ALU64 / VW / Vector subtract words.
+def HEXAGON_A2_vsubw:
+  di_ALU64_didi                   <"vsubw",    int_hexagon_A2_vsubw>;
+def HEXAGON_A2_vsubws:
+  di_ALU64_didi_sat               <"vsubw",    int_hexagon_A2_vsubws>;
+
+
+/********************************************************************
+*            CR                                                     *
+*********************************************************************/
+
+// CR / Logical reductions on predicates.
+def HEXAGON_C2_all8:
+  qi_SInst_qi                     <"all8",     int_hexagon_C2_all8>;
+def HEXAGON_C2_any8:
+  qi_SInst_qi                     <"any8",     int_hexagon_C2_any8>;
+
+// CR / Logical operations on predicates.
+def HEXAGON_C2_pxfer_map:
+  qi_SInst_qi_pxfer               <"",         int_hexagon_C2_pxfer_map>;
+def HEXAGON_C2_and:
+  qi_SInst_qiqi                   <"and",      int_hexagon_C2_and>;
+def HEXAGON_C2_andn:
+  qi_SInst_qiqi_neg               <"and",      int_hexagon_C2_andn>;
+def HEXAGON_C2_not:
+  qi_SInst_qi                     <"not",      int_hexagon_C2_not>;
+def HEXAGON_C2_or:
+  qi_SInst_qiqi                   <"or",       int_hexagon_C2_or>;
+def HEXAGON_C2_orn:
+  qi_SInst_qiqi_neg               <"or",       int_hexagon_C2_orn>;
+def HEXAGON_C2_xor:
+  qi_SInst_qiqi                   <"xor",      int_hexagon_C2_xor>;
+
+
+/********************************************************************
+*            MTYPE/ALU                                              *
+*********************************************************************/
+
+// MTYPE / ALU / Add and accumulate.
+def HEXAGON_M2_acci:
+  si_MInst_sisisi_acc             <"add",      int_hexagon_M2_acci>;
+def HEXAGON_M2_accii:
+  si_MInst_sisis8_acc             <"add",      int_hexagon_M2_accii>;
+def HEXAGON_M2_nacci:
+  si_MInst_sisisi_nac             <"add",      int_hexagon_M2_nacci>;
+def HEXAGON_M2_naccii:
+  si_MInst_sisis8_nac             <"add",      int_hexagon_M2_naccii>;
+
+// MTYPE / ALU / Subtract and accumulate.
+def HEXAGON_M2_subacc:
+  si_MInst_sisisi_acc             <"sub",      int_hexagon_M2_subacc>;
+
+// MTYPE / ALU / Vector absolute difference.
+def HEXAGON_M2_vabsdiffh:
+  di_MInst_didi                   <"vabsdiffh",int_hexagon_M2_vabsdiffh>;
+def HEXAGON_M2_vabsdiffw:
+  di_MInst_didi                   <"vabsdiffw",int_hexagon_M2_vabsdiffw>;
+
+// MTYPE / ALU / XOR and xor with destination.
+def HEXAGON_M2_xor_xacc:
+  si_MInst_sisisi_xacc            <"xor",      int_hexagon_M2_xor_xacc>;
+
+
+/********************************************************************
+*            MTYPE/COMPLEX                                          *
+*********************************************************************/
+
+// MTYPE / COMPLEX / Complex multiply.
+// Rdd[-+]=cmpy(Rs, Rt:<<1]:sat
+def HEXAGON_M2_cmpys_s1:
+  di_MInst_sisi_s1_sat            <"cmpy",     int_hexagon_M2_cmpys_s1>;
+def HEXAGON_M2_cmpys_s0:
+  di_MInst_sisi_sat               <"cmpy",     int_hexagon_M2_cmpys_s0>;
+def HEXAGON_M2_cmpysc_s1:
+  di_MInst_sisi_s1_sat_conj       <"cmpy",     int_hexagon_M2_cmpysc_s1>;
+def HEXAGON_M2_cmpysc_s0:
+  di_MInst_sisi_sat_conj          <"cmpy",     int_hexagon_M2_cmpysc_s0>;
+
+def HEXAGON_M2_cmacs_s1:
+  di_MInst_disisi_acc_s1_sat      <"cmpy",     int_hexagon_M2_cmacs_s1>;
+def HEXAGON_M2_cmacs_s0:
+  di_MInst_disisi_acc_sat         <"cmpy",     int_hexagon_M2_cmacs_s0>;
+def HEXAGON_M2_cmacsc_s1:
+  di_MInst_disisi_acc_s1_sat_conj <"cmpy",     int_hexagon_M2_cmacsc_s1>;
+def HEXAGON_M2_cmacsc_s0:
+  di_MInst_disisi_acc_sat_conj    <"cmpy",     int_hexagon_M2_cmacsc_s0>;
+
+def HEXAGON_M2_cnacs_s1:
+  di_MInst_disisi_nac_s1_sat      <"cmpy",     int_hexagon_M2_cnacs_s1>;
+def HEXAGON_M2_cnacs_s0:
+  di_MInst_disisi_nac_sat         <"cmpy",     int_hexagon_M2_cnacs_s0>;
+def HEXAGON_M2_cnacsc_s1:
+  di_MInst_disisi_nac_s1_sat_conj <"cmpy",     int_hexagon_M2_cnacsc_s1>;
+def HEXAGON_M2_cnacsc_s0:
+  di_MInst_disisi_nac_sat_conj    <"cmpy",     int_hexagon_M2_cnacsc_s0>;
+
+// MTYPE / COMPLEX / Complex multiply real or imaginary.
+def HEXAGON_M2_cmpyr_s0:
+  di_MInst_sisi                   <"cmpyr",    int_hexagon_M2_cmpyr_s0>;
+def HEXAGON_M2_cmacr_s0:
+  di_MInst_disisi_acc             <"cmpyr",    int_hexagon_M2_cmacr_s0>;
+
+def HEXAGON_M2_cmpyi_s0:
+  di_MInst_sisi                   <"cmpyi",    int_hexagon_M2_cmpyi_s0>;
+def HEXAGON_M2_cmaci_s0:
+  di_MInst_disisi_acc             <"cmpyi",    int_hexagon_M2_cmaci_s0>;
+
+// MTYPE / COMPLEX / Complex multiply with round and pack.
+// Rxx32+=cmpy(Rs32,[*]Rt32:<<1]:rnd:sat
+def HEXAGON_M2_cmpyrs_s0:
+  si_MInst_sisi_rnd_sat           <"cmpy",     int_hexagon_M2_cmpyrs_s0>;
+def HEXAGON_M2_cmpyrs_s1:
+  si_MInst_sisi_s1_rnd_sat        <"cmpy",     int_hexagon_M2_cmpyrs_s1>;
+
+def HEXAGON_M2_cmpyrsc_s0:
+  si_MInst_sisi_rnd_sat_conj      <"cmpy",     int_hexagon_M2_cmpyrsc_s0>;
+def HEXAGON_M2_cmpyrsc_s1:
+  si_MInst_sisi_s1_rnd_sat_conj   <"cmpy",     int_hexagon_M2_cmpyrsc_s1>;
+
+//MTYPE / COMPLEX / Vector complex multiply real or imaginary.
+def HEXAGON_M2_vcmpy_s0_sat_i:
+  di_MInst_didi_sat               <"vcmpyi",   int_hexagon_M2_vcmpy_s0_sat_i>;
+def HEXAGON_M2_vcmpy_s1_sat_i:
+  di_MInst_didi_s1_sat            <"vcmpyi",   int_hexagon_M2_vcmpy_s1_sat_i>;
+
+def HEXAGON_M2_vcmpy_s0_sat_r:
+  di_MInst_didi_sat               <"vcmpyr",   int_hexagon_M2_vcmpy_s0_sat_r>;
+def HEXAGON_M2_vcmpy_s1_sat_r:
+  di_MInst_didi_s1_sat            <"vcmpyr",   int_hexagon_M2_vcmpy_s1_sat_r>;
+
+def HEXAGON_M2_vcmac_s0_sat_i:
+  di_MInst_dididi_acc_sat         <"vcmpyi",   int_hexagon_M2_vcmac_s0_sat_i>;
+def HEXAGON_M2_vcmac_s0_sat_r:
+  di_MInst_dididi_acc_sat         <"vcmpyr",   int_hexagon_M2_vcmac_s0_sat_r>;
+
+//MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary.
+def HEXAGON_M2_vrcmpyi_s0:
+  di_MInst_didi                   <"vrcmpyi",  int_hexagon_M2_vrcmpyi_s0>;
+def HEXAGON_M2_vrcmpyr_s0:
+  di_MInst_didi                   <"vrcmpyr",  int_hexagon_M2_vrcmpyr_s0>;
+
+def HEXAGON_M2_vrcmpyi_s0c:
+  di_MInst_didi_conj              <"vrcmpyi",  int_hexagon_M2_vrcmpyi_s0c>;
+def HEXAGON_M2_vrcmpyr_s0c:
+  di_MInst_didi_conj              <"vrcmpyr",  int_hexagon_M2_vrcmpyr_s0c>;
+
+def HEXAGON_M2_vrcmaci_s0:
+  di_MInst_dididi_acc             <"vrcmpyi",  int_hexagon_M2_vrcmaci_s0>;
+def HEXAGON_M2_vrcmacr_s0:
+  di_MInst_dididi_acc             <"vrcmpyr",  int_hexagon_M2_vrcmacr_s0>;
+
+def HEXAGON_M2_vrcmaci_s0c:
+  di_MInst_dididi_acc_conj        <"vrcmpyi",  int_hexagon_M2_vrcmaci_s0c>;
+def HEXAGON_M2_vrcmacr_s0c:
+  di_MInst_dididi_acc_conj        <"vrcmpyr",  int_hexagon_M2_vrcmacr_s0c>;
+
+
+/********************************************************************
+*            MTYPE/MPYH                                             *
+*********************************************************************/
+
+// MTYPE / MPYH / Multiply and use lower result.
+//def HEXAGON_M2_mpysmi:
+//FIXME: Hexagon_M2_mpysmi should really by of the type si_MInst_sim9,
+// not si_MInst_sis9 - but for now, we will use s9.
+// def Hexagon_M2_mpysmi:
+//  si_MInst_sim9                   <"mpyi",     int_hexagon_M2_mpysmi>;
+def Hexagon_M2_mpysmi:
+  si_MInst_sis9                   <"mpyi",     int_hexagon_M2_mpysmi>;
+def HEXAGON_M2_mpyi:
+  si_MInst_sisi                   <"mpyi",     int_hexagon_M2_mpyi>;
+def HEXAGON_M2_mpyui:
+  si_MInst_sisi                   <"mpyui",    int_hexagon_M2_mpyui>;
+def HEXAGON_M2_macsip:
+  si_MInst_sisiu8_acc             <"mpyi",     int_hexagon_M2_macsip>;
+def HEXAGON_M2_maci:
+  si_MInst_sisisi_acc             <"mpyi",     int_hexagon_M2_maci>;
+def HEXAGON_M2_macsin:
+  si_MInst_sisiu8_nac             <"mpyi",     int_hexagon_M2_macsin>;
+
+// MTYPE / MPYH / Multiply word by half (32x16).
+//Rdd[+]=vmpywoh(Rss,Rtt)[:<<1][:rnd][:sat]
+//Rdd[+]=vmpyweh(Rss,Rtt)[:<<1][:rnd][:sat]
+def HEXAGON_M2_mmpyl_rs1:
+  di_MInst_didi_s1_rnd_sat        <"vmpyweh",  int_hexagon_M2_mmpyl_rs1>;
+def HEXAGON_M2_mmpyl_s1:
+  di_MInst_didi_s1_sat            <"vmpyweh",  int_hexagon_M2_mmpyl_s1>;
+def HEXAGON_M2_mmpyl_rs0:
+  di_MInst_didi_rnd_sat           <"vmpyweh",  int_hexagon_M2_mmpyl_rs0>;
+def HEXAGON_M2_mmpyl_s0:
+  di_MInst_didi_sat               <"vmpyweh",  int_hexagon_M2_mmpyl_s0>;
+def HEXAGON_M2_mmpyh_rs1:
+  di_MInst_didi_s1_rnd_sat        <"vmpywoh",  int_hexagon_M2_mmpyh_rs1>;
+def HEXAGON_M2_mmpyh_s1:
+  di_MInst_didi_s1_sat            <"vmpywoh",  int_hexagon_M2_mmpyh_s1>;
+def HEXAGON_M2_mmpyh_rs0:
+  di_MInst_didi_rnd_sat           <"vmpywoh",  int_hexagon_M2_mmpyh_rs0>;
+def HEXAGON_M2_mmpyh_s0:
+  di_MInst_didi_sat               <"vmpywoh",  int_hexagon_M2_mmpyh_s0>;
+def HEXAGON_M2_mmacls_rs1:
+  di_MInst_dididi_acc_s1_rnd_sat  <"vmpyweh",  int_hexagon_M2_mmacls_rs1>;
+def HEXAGON_M2_mmacls_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpyweh",  int_hexagon_M2_mmacls_s1>;
+def HEXAGON_M2_mmacls_rs0:
+  di_MInst_dididi_acc_rnd_sat     <"vmpyweh",  int_hexagon_M2_mmacls_rs0>;
+def HEXAGON_M2_mmacls_s0:
+  di_MInst_dididi_acc_sat         <"vmpyweh",  int_hexagon_M2_mmacls_s0>;
+def HEXAGON_M2_mmachs_rs1:
+  di_MInst_dididi_acc_s1_rnd_sat  <"vmpywoh",  int_hexagon_M2_mmachs_rs1>;
+def HEXAGON_M2_mmachs_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpywoh",  int_hexagon_M2_mmachs_s1>;
+def HEXAGON_M2_mmachs_rs0:
+  di_MInst_dididi_acc_rnd_sat     <"vmpywoh",  int_hexagon_M2_mmachs_rs0>;
+def HEXAGON_M2_mmachs_s0:
+  di_MInst_dididi_acc_sat         <"vmpywoh",  int_hexagon_M2_mmachs_s0>;
+
+// MTYPE / MPYH / Multiply word by unsigned half (32x16).
+//Rdd[+]=vmpywouh(Rss,Rtt)[:<<1][:rnd][:sat]
+//Rdd[+]=vmpyweuh(Rss,Rtt)[:<<1][:rnd][:sat]
+def HEXAGON_M2_mmpyul_rs1:
+  di_MInst_didi_s1_rnd_sat        <"vmpyweuh", int_hexagon_M2_mmpyul_rs1>;
+def HEXAGON_M2_mmpyul_s1:
+  di_MInst_didi_s1_sat            <"vmpyweuh", int_hexagon_M2_mmpyul_s1>;
+def HEXAGON_M2_mmpyul_rs0:
+  di_MInst_didi_rnd_sat           <"vmpyweuh", int_hexagon_M2_mmpyul_rs0>;
+def HEXAGON_M2_mmpyul_s0:
+  di_MInst_didi_sat               <"vmpyweuh", int_hexagon_M2_mmpyul_s0>;
+def HEXAGON_M2_mmpyuh_rs1:
+  di_MInst_didi_s1_rnd_sat        <"vmpywouh", int_hexagon_M2_mmpyuh_rs1>;
+def HEXAGON_M2_mmpyuh_s1:
+  di_MInst_didi_s1_sat            <"vmpywouh", int_hexagon_M2_mmpyuh_s1>;
+def HEXAGON_M2_mmpyuh_rs0:
+  di_MInst_didi_rnd_sat           <"vmpywouh", int_hexagon_M2_mmpyuh_rs0>;
+def HEXAGON_M2_mmpyuh_s0:
+  di_MInst_didi_sat               <"vmpywouh", int_hexagon_M2_mmpyuh_s0>;
+def HEXAGON_M2_mmaculs_rs1:
+  di_MInst_dididi_acc_s1_rnd_sat  <"vmpyweuh", int_hexagon_M2_mmaculs_rs1>;
+def HEXAGON_M2_mmaculs_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpyweuh", int_hexagon_M2_mmaculs_s1>;
+def HEXAGON_M2_mmaculs_rs0:
+  di_MInst_dididi_acc_rnd_sat     <"vmpyweuh", int_hexagon_M2_mmaculs_rs0>;
+def HEXAGON_M2_mmaculs_s0:
+  di_MInst_dididi_acc_sat         <"vmpyweuh", int_hexagon_M2_mmaculs_s0>;
+def HEXAGON_M2_mmacuhs_rs1:
+  di_MInst_dididi_acc_s1_rnd_sat  <"vmpywouh", int_hexagon_M2_mmacuhs_rs1>;
+def HEXAGON_M2_mmacuhs_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpywouh", int_hexagon_M2_mmacuhs_s1>;
+def HEXAGON_M2_mmacuhs_rs0:
+  di_MInst_dididi_acc_rnd_sat     <"vmpywouh", int_hexagon_M2_mmacuhs_rs0>;
+def HEXAGON_M2_mmacuhs_s0:
+  di_MInst_dididi_acc_sat         <"vmpywouh", int_hexagon_M2_mmacuhs_s0>;
+
+// MTYPE / MPYH / Multiply and use upper result.
+def HEXAGON_M2_hmmpyh_rs1:
+  si_MInst_sisi_h_s1_rnd_sat      <"mpy",      int_hexagon_M2_hmmpyh_rs1>;
+def HEXAGON_M2_hmmpyl_rs1:
+  si_MInst_sisi_l_s1_rnd_sat      <"mpy",      int_hexagon_M2_hmmpyl_rs1>;
+def HEXAGON_M2_mpy_up:
+  si_MInst_sisi                   <"mpy",      int_hexagon_M2_mpy_up>;
+def HEXAGON_M2_dpmpyss_rnd_s0:
+  si_MInst_sisi_rnd               <"mpy",      int_hexagon_M2_dpmpyss_rnd_s0>;
+def HEXAGON_M2_mpyu_up:
+  si_MInst_sisi                   <"mpyu",     int_hexagon_M2_mpyu_up>;
+
+// MTYPE / MPYH / Multiply and use full result.
+def HEXAGON_M2_dpmpyuu_s0:
+  di_MInst_sisi                   <"mpyu",     int_hexagon_M2_dpmpyuu_s0>;
+def HEXAGON_M2_dpmpyuu_acc_s0:
+  di_MInst_disisi_acc             <"mpyu",     int_hexagon_M2_dpmpyuu_acc_s0>;
+def HEXAGON_M2_dpmpyuu_nac_s0:
+  di_MInst_disisi_nac             <"mpyu",     int_hexagon_M2_dpmpyuu_nac_s0>;
+def HEXAGON_M2_dpmpyss_s0:
+  di_MInst_sisi                   <"mpy",      int_hexagon_M2_dpmpyss_s0>;
+def HEXAGON_M2_dpmpyss_acc_s0:
+  di_MInst_disisi_acc             <"mpy",      int_hexagon_M2_dpmpyss_acc_s0>;
+def HEXAGON_M2_dpmpyss_nac_s0:
+  di_MInst_disisi_nac             <"mpy",      int_hexagon_M2_dpmpyss_nac_s0>;
+
+
+/********************************************************************
+*            MTYPE/MPYS                                             *
+*********************************************************************/
+
+// MTYPE / MPYS / Scalar 16x16 multiply signed.
+//Rd=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]|
+//          [:<<0[:rnd|:sat|:rnd:sat]|:<<1[:rnd|:sat|:rnd:sat]]]
+def HEXAGON_M2_mpy_hh_s0:
+  si_MInst_sisi_hh                <"mpy",     int_hexagon_M2_mpy_hh_s0>;
+def HEXAGON_M2_mpy_hh_s1:
+  si_MInst_sisi_hh_s1             <"mpy",     int_hexagon_M2_mpy_hh_s1>;
+def HEXAGON_M2_mpy_rnd_hh_s1:
+  si_MInst_sisi_rnd_hh_s1         <"mpy",     int_hexagon_M2_mpy_rnd_hh_s1>;
+def HEXAGON_M2_mpy_sat_rnd_hh_s1:
+  si_MInst_sisi_sat_rnd_hh_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_hh_s1>;
+def HEXAGON_M2_mpy_sat_hh_s1:
+  si_MInst_sisi_sat_hh_s1         <"mpy",     int_hexagon_M2_mpy_sat_hh_s1>;
+def HEXAGON_M2_mpy_rnd_hh_s0:
+  si_MInst_sisi_rnd_hh            <"mpy",     int_hexagon_M2_mpy_rnd_hh_s0>;
+def HEXAGON_M2_mpy_sat_rnd_hh_s0:
+  si_MInst_sisi_sat_rnd_hh        <"mpy",     int_hexagon_M2_mpy_sat_rnd_hh_s0>;
+def HEXAGON_M2_mpy_sat_hh_s0:
+  si_MInst_sisi_sat_hh            <"mpy",     int_hexagon_M2_mpy_sat_hh_s0>;
+
+def HEXAGON_M2_mpy_hl_s0:
+  si_MInst_sisi_hl                <"mpy",     int_hexagon_M2_mpy_hl_s0>;
+def HEXAGON_M2_mpy_hl_s1:
+  si_MInst_sisi_hl_s1             <"mpy",     int_hexagon_M2_mpy_hl_s1>;
+def HEXAGON_M2_mpy_rnd_hl_s1:
+  si_MInst_sisi_rnd_hl_s1         <"mpy",     int_hexagon_M2_mpy_rnd_hl_s1>;
+def HEXAGON_M2_mpy_sat_rnd_hl_s1:
+  si_MInst_sisi_sat_rnd_hl_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_hl_s1>;
+def HEXAGON_M2_mpy_sat_hl_s1:
+  si_MInst_sisi_sat_hl_s1         <"mpy",     int_hexagon_M2_mpy_sat_hl_s1>;
+def HEXAGON_M2_mpy_rnd_hl_s0:
+  si_MInst_sisi_rnd_hl            <"mpy",     int_hexagon_M2_mpy_rnd_hl_s0>;
+def HEXAGON_M2_mpy_sat_rnd_hl_s0:
+  si_MInst_sisi_sat_rnd_hl        <"mpy",     int_hexagon_M2_mpy_sat_rnd_hl_s0>;
+def HEXAGON_M2_mpy_sat_hl_s0:
+  si_MInst_sisi_sat_hl            <"mpy",     int_hexagon_M2_mpy_sat_hl_s0>;
+
+def HEXAGON_M2_mpy_lh_s0:
+  si_MInst_sisi_lh                <"mpy",     int_hexagon_M2_mpy_lh_s0>;
+def HEXAGON_M2_mpy_lh_s1:
+  si_MInst_sisi_lh_s1             <"mpy",     int_hexagon_M2_mpy_lh_s1>;
+def HEXAGON_M2_mpy_rnd_lh_s1:
+  si_MInst_sisi_rnd_lh_s1         <"mpy",     int_hexagon_M2_mpy_rnd_lh_s1>;
+def HEXAGON_M2_mpy_sat_rnd_lh_s1:
+  si_MInst_sisi_sat_rnd_lh_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_lh_s1>;
+def HEXAGON_M2_mpy_sat_lh_s1:
+  si_MInst_sisi_sat_lh_s1         <"mpy",     int_hexagon_M2_mpy_sat_lh_s1>;
+def HEXAGON_M2_mpy_rnd_lh_s0:
+  si_MInst_sisi_rnd_lh            <"mpy",     int_hexagon_M2_mpy_rnd_lh_s0>;
+def HEXAGON_M2_mpy_sat_rnd_lh_s0:
+  si_MInst_sisi_sat_rnd_lh        <"mpy",     int_hexagon_M2_mpy_sat_rnd_lh_s0>;
+def HEXAGON_M2_mpy_sat_lh_s0:
+  si_MInst_sisi_sat_lh            <"mpy",     int_hexagon_M2_mpy_sat_lh_s0>;
+
+def HEXAGON_M2_mpy_ll_s0:
+  si_MInst_sisi_ll                <"mpy",     int_hexagon_M2_mpy_ll_s0>;
+def HEXAGON_M2_mpy_ll_s1:
+  si_MInst_sisi_ll_s1             <"mpy",     int_hexagon_M2_mpy_ll_s1>;
+def HEXAGON_M2_mpy_rnd_ll_s1:
+  si_MInst_sisi_rnd_ll_s1         <"mpy",     int_hexagon_M2_mpy_rnd_ll_s1>;
+def HEXAGON_M2_mpy_sat_rnd_ll_s1:
+  si_MInst_sisi_sat_rnd_ll_s1     <"mpy",     int_hexagon_M2_mpy_sat_rnd_ll_s1>;
+def HEXAGON_M2_mpy_sat_ll_s1:
+  si_MInst_sisi_sat_ll_s1         <"mpy",     int_hexagon_M2_mpy_sat_ll_s1>;
+def HEXAGON_M2_mpy_rnd_ll_s0:
+  si_MInst_sisi_rnd_ll            <"mpy",     int_hexagon_M2_mpy_rnd_ll_s0>;
+def HEXAGON_M2_mpy_sat_rnd_ll_s0:
+  si_MInst_sisi_sat_rnd_ll        <"mpy",     int_hexagon_M2_mpy_sat_rnd_ll_s0>;
+def HEXAGON_M2_mpy_sat_ll_s0:
+  si_MInst_sisi_sat_ll            <"mpy",     int_hexagon_M2_mpy_sat_ll_s0>;
+
+//Rdd=mpy(Rs.[H|L],Rt.[H|L])[[:<<0|:<<1]|[:<<0:rnd|:<<1:rnd]]
+def HEXAGON_M2_mpyd_hh_s0:
+  di_MInst_sisi_hh                <"mpy",     int_hexagon_M2_mpyd_hh_s0>;
+def HEXAGON_M2_mpyd_hh_s1:
+  di_MInst_sisi_hh_s1             <"mpy",     int_hexagon_M2_mpyd_hh_s1>;
+def HEXAGON_M2_mpyd_rnd_hh_s1:
+  di_MInst_sisi_rnd_hh_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_hh_s1>;
+def HEXAGON_M2_mpyd_rnd_hh_s0:
+  di_MInst_sisi_rnd_hh            <"mpy",     int_hexagon_M2_mpyd_rnd_hh_s0>;
+
+def HEXAGON_M2_mpyd_hl_s0:
+  di_MInst_sisi_hl                <"mpy",     int_hexagon_M2_mpyd_hl_s0>;
+def HEXAGON_M2_mpyd_hl_s1:
+  di_MInst_sisi_hl_s1             <"mpy",     int_hexagon_M2_mpyd_hl_s1>;
+def HEXAGON_M2_mpyd_rnd_hl_s1:
+  di_MInst_sisi_rnd_hl_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_hl_s1>;
+def HEXAGON_M2_mpyd_rnd_hl_s0:
+  di_MInst_sisi_rnd_hl            <"mpy",     int_hexagon_M2_mpyd_rnd_hl_s0>;
+
+def HEXAGON_M2_mpyd_lh_s0:
+  di_MInst_sisi_lh                <"mpy",     int_hexagon_M2_mpyd_lh_s0>;
+def HEXAGON_M2_mpyd_lh_s1:
+  di_MInst_sisi_lh_s1             <"mpy",     int_hexagon_M2_mpyd_lh_s1>;
+def HEXAGON_M2_mpyd_rnd_lh_s1:
+  di_MInst_sisi_rnd_lh_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_lh_s1>;
+def HEXAGON_M2_mpyd_rnd_lh_s0:
+  di_MInst_sisi_rnd_lh            <"mpy",     int_hexagon_M2_mpyd_rnd_lh_s0>;
+
+def HEXAGON_M2_mpyd_ll_s0:
+  di_MInst_sisi_ll                <"mpy",     int_hexagon_M2_mpyd_ll_s0>;
+def HEXAGON_M2_mpyd_ll_s1:
+  di_MInst_sisi_ll_s1             <"mpy",     int_hexagon_M2_mpyd_ll_s1>;
+def HEXAGON_M2_mpyd_rnd_ll_s1:
+  di_MInst_sisi_rnd_ll_s1         <"mpy",     int_hexagon_M2_mpyd_rnd_ll_s1>;
+def HEXAGON_M2_mpyd_rnd_ll_s0:
+  di_MInst_sisi_rnd_ll            <"mpy",     int_hexagon_M2_mpyd_rnd_ll_s0>;
+
+//Rx+=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]]
+def HEXAGON_M2_mpy_acc_hh_s0:
+  si_MInst_sisisi_acc_hh            <"mpy",  int_hexagon_M2_mpy_acc_hh_s0>;
+def HEXAGON_M2_mpy_acc_hh_s1:
+  si_MInst_sisisi_acc_hh_s1         <"mpy",  int_hexagon_M2_mpy_acc_hh_s1>;
+def HEXAGON_M2_mpy_acc_sat_hh_s1:
+  si_MInst_sisisi_acc_sat_hh_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_hh_s1>;
+def HEXAGON_M2_mpy_acc_sat_hh_s0:
+  si_MInst_sisisi_acc_sat_hh        <"mpy",  int_hexagon_M2_mpy_acc_sat_hh_s0>;
+
+def HEXAGON_M2_mpy_acc_hl_s0:
+  si_MInst_sisisi_acc_hl            <"mpy",  int_hexagon_M2_mpy_acc_hl_s0>;
+def HEXAGON_M2_mpy_acc_hl_s1:
+  si_MInst_sisisi_acc_hl_s1         <"mpy",  int_hexagon_M2_mpy_acc_hl_s1>;
+def HEXAGON_M2_mpy_acc_sat_hl_s1:
+  si_MInst_sisisi_acc_sat_hl_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_hl_s1>;
+def HEXAGON_M2_mpy_acc_sat_hl_s0:
+  si_MInst_sisisi_acc_sat_hl        <"mpy",  int_hexagon_M2_mpy_acc_sat_hl_s0>;
+
+def HEXAGON_M2_mpy_acc_lh_s0:
+  si_MInst_sisisi_acc_lh            <"mpy",  int_hexagon_M2_mpy_acc_lh_s0>;
+def HEXAGON_M2_mpy_acc_lh_s1:
+  si_MInst_sisisi_acc_lh_s1         <"mpy",  int_hexagon_M2_mpy_acc_lh_s1>;
+def HEXAGON_M2_mpy_acc_sat_lh_s1:
+  si_MInst_sisisi_acc_sat_lh_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_lh_s1>;
+def HEXAGON_M2_mpy_acc_sat_lh_s0:
+  si_MInst_sisisi_acc_sat_lh        <"mpy",  int_hexagon_M2_mpy_acc_sat_lh_s0>;
+
+def HEXAGON_M2_mpy_acc_ll_s0:
+  si_MInst_sisisi_acc_ll            <"mpy",  int_hexagon_M2_mpy_acc_ll_s0>;
+def HEXAGON_M2_mpy_acc_ll_s1:
+  si_MInst_sisisi_acc_ll_s1         <"mpy",  int_hexagon_M2_mpy_acc_ll_s1>;
+def HEXAGON_M2_mpy_acc_sat_ll_s1:
+  si_MInst_sisisi_acc_sat_ll_s1     <"mpy",  int_hexagon_M2_mpy_acc_sat_ll_s1>;
+def HEXAGON_M2_mpy_acc_sat_ll_s0:
+  si_MInst_sisisi_acc_sat_ll        <"mpy",  int_hexagon_M2_mpy_acc_sat_ll_s0>;
+
+//Rx-=mpy(Rs.[H|L],Rt.[H|L])[[[:<<0|:<<1]|[:<<0:sat|:<<1:sat]]
+def HEXAGON_M2_mpy_nac_hh_s0:
+  si_MInst_sisisi_nac_hh            <"mpy",  int_hexagon_M2_mpy_nac_hh_s0>;
+def HEXAGON_M2_mpy_nac_hh_s1:
+  si_MInst_sisisi_nac_hh_s1         <"mpy",  int_hexagon_M2_mpy_nac_hh_s1>;
+def HEXAGON_M2_mpy_nac_sat_hh_s1:
+  si_MInst_sisisi_nac_sat_hh_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_hh_s1>;
+def HEXAGON_M2_mpy_nac_sat_hh_s0:
+  si_MInst_sisisi_nac_sat_hh        <"mpy",  int_hexagon_M2_mpy_nac_sat_hh_s0>;
+
+def HEXAGON_M2_mpy_nac_hl_s0:
+  si_MInst_sisisi_nac_hl            <"mpy",  int_hexagon_M2_mpy_nac_hl_s0>;
+def HEXAGON_M2_mpy_nac_hl_s1:
+  si_MInst_sisisi_nac_hl_s1         <"mpy",  int_hexagon_M2_mpy_nac_hl_s1>;
+def HEXAGON_M2_mpy_nac_sat_hl_s1:
+  si_MInst_sisisi_nac_sat_hl_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_hl_s1>;
+def HEXAGON_M2_mpy_nac_sat_hl_s0:
+  si_MInst_sisisi_nac_sat_hl        <"mpy",  int_hexagon_M2_mpy_nac_sat_hl_s0>;
+
+def HEXAGON_M2_mpy_nac_lh_s0:
+  si_MInst_sisisi_nac_lh            <"mpy",  int_hexagon_M2_mpy_nac_lh_s0>;
+def HEXAGON_M2_mpy_nac_lh_s1:
+  si_MInst_sisisi_nac_lh_s1         <"mpy",  int_hexagon_M2_mpy_nac_lh_s1>;
+def HEXAGON_M2_mpy_nac_sat_lh_s1:
+  si_MInst_sisisi_nac_sat_lh_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_lh_s1>;
+def HEXAGON_M2_mpy_nac_sat_lh_s0:
+  si_MInst_sisisi_nac_sat_lh        <"mpy",  int_hexagon_M2_mpy_nac_sat_lh_s0>;
+
+def HEXAGON_M2_mpy_nac_ll_s0:
+  si_MInst_sisisi_nac_ll            <"mpy",  int_hexagon_M2_mpy_nac_ll_s0>;
+def HEXAGON_M2_mpy_nac_ll_s1:
+  si_MInst_sisisi_nac_ll_s1         <"mpy",  int_hexagon_M2_mpy_nac_ll_s1>;
+def HEXAGON_M2_mpy_nac_sat_ll_s1:
+  si_MInst_sisisi_nac_sat_ll_s1     <"mpy",  int_hexagon_M2_mpy_nac_sat_ll_s1>;
+def HEXAGON_M2_mpy_nac_sat_ll_s0:
+  si_MInst_sisisi_nac_sat_ll        <"mpy",  int_hexagon_M2_mpy_nac_sat_ll_s0>;
+
+//Rx+=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]
+def HEXAGON_M2_mpyd_acc_hh_s0:
+  di_MInst_disisi_acc_hh          <"mpy",    int_hexagon_M2_mpyd_acc_hh_s0>;
+def HEXAGON_M2_mpyd_acc_hh_s1:
+  di_MInst_disisi_acc_hh_s1       <"mpy",    int_hexagon_M2_mpyd_acc_hh_s1>;
+
+def HEXAGON_M2_mpyd_acc_hl_s0:
+  di_MInst_disisi_acc_hl          <"mpy",    int_hexagon_M2_mpyd_acc_hl_s0>;
+def HEXAGON_M2_mpyd_acc_hl_s1:
+  di_MInst_disisi_acc_hl_s1       <"mpy",    int_hexagon_M2_mpyd_acc_hl_s1>;
+
+def HEXAGON_M2_mpyd_acc_lh_s0:
+  di_MInst_disisi_acc_lh          <"mpy",    int_hexagon_M2_mpyd_acc_lh_s0>;
+def HEXAGON_M2_mpyd_acc_lh_s1:
+  di_MInst_disisi_acc_lh_s1       <"mpy",    int_hexagon_M2_mpyd_acc_lh_s1>;
+
+def HEXAGON_M2_mpyd_acc_ll_s0:
+  di_MInst_disisi_acc_ll          <"mpy",    int_hexagon_M2_mpyd_acc_ll_s0>;
+def HEXAGON_M2_mpyd_acc_ll_s1:
+  di_MInst_disisi_acc_ll_s1       <"mpy",    int_hexagon_M2_mpyd_acc_ll_s1>;
+
+//Rx-=mpy(Rs.[H|L],Rt.[H|L:<<0|:<<1]
+def HEXAGON_M2_mpyd_nac_hh_s0:
+  di_MInst_disisi_nac_hh          <"mpy",    int_hexagon_M2_mpyd_nac_hh_s0>;
+def HEXAGON_M2_mpyd_nac_hh_s1:
+  di_MInst_disisi_nac_hh_s1       <"mpy",    int_hexagon_M2_mpyd_nac_hh_s1>;
+
+def HEXAGON_M2_mpyd_nac_hl_s0:
+  di_MInst_disisi_nac_hl          <"mpy",    int_hexagon_M2_mpyd_nac_hl_s0>;
+def HEXAGON_M2_mpyd_nac_hl_s1:
+  di_MInst_disisi_nac_hl_s1       <"mpy",    int_hexagon_M2_mpyd_nac_hl_s1>;
+
+def HEXAGON_M2_mpyd_nac_lh_s0:
+  di_MInst_disisi_nac_lh          <"mpy",    int_hexagon_M2_mpyd_nac_lh_s0>;
+def HEXAGON_M2_mpyd_nac_lh_s1:
+  di_MInst_disisi_nac_lh_s1       <"mpy",    int_hexagon_M2_mpyd_nac_lh_s1>;
+
+def HEXAGON_M2_mpyd_nac_ll_s0:
+  di_MInst_disisi_nac_ll          <"mpy",    int_hexagon_M2_mpyd_nac_ll_s0>;
+def HEXAGON_M2_mpyd_nac_ll_s1:
+  di_MInst_disisi_nac_ll_s1       <"mpy",    int_hexagon_M2_mpyd_nac_ll_s1>;
+
+// MTYPE / MPYS / Scalar 16x16 multiply unsigned.
+//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyu_hh_s0:
+  si_MInst_sisi_hh                <"mpyu",    int_hexagon_M2_mpyu_hh_s0>;
+def HEXAGON_M2_mpyu_hh_s1:
+  si_MInst_sisi_hh_s1             <"mpyu",    int_hexagon_M2_mpyu_hh_s1>;
+def HEXAGON_M2_mpyu_hl_s0:
+  si_MInst_sisi_hl                <"mpyu",    int_hexagon_M2_mpyu_hl_s0>;
+def HEXAGON_M2_mpyu_hl_s1:
+  si_MInst_sisi_hl_s1             <"mpyu",    int_hexagon_M2_mpyu_hl_s1>;
+def HEXAGON_M2_mpyu_lh_s0:
+  si_MInst_sisi_lh                <"mpyu",    int_hexagon_M2_mpyu_lh_s0>;
+def HEXAGON_M2_mpyu_lh_s1:
+  si_MInst_sisi_lh_s1             <"mpyu",    int_hexagon_M2_mpyu_lh_s1>;
+def HEXAGON_M2_mpyu_ll_s0:
+  si_MInst_sisi_ll                <"mpyu",    int_hexagon_M2_mpyu_ll_s0>;
+def HEXAGON_M2_mpyu_ll_s1:
+  si_MInst_sisi_ll_s1             <"mpyu",    int_hexagon_M2_mpyu_ll_s1>;
+
+//Rdd=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyud_hh_s0:
+  di_MInst_sisi_hh                <"mpyu",    int_hexagon_M2_mpyud_hh_s0>;
+def HEXAGON_M2_mpyud_hh_s1:
+  di_MInst_sisi_hh_s1             <"mpyu",    int_hexagon_M2_mpyud_hh_s1>;
+def HEXAGON_M2_mpyud_hl_s0:
+  di_MInst_sisi_hl                <"mpyu",    int_hexagon_M2_mpyud_hl_s0>;
+def HEXAGON_M2_mpyud_hl_s1:
+  di_MInst_sisi_hl_s1             <"mpyu",    int_hexagon_M2_mpyud_hl_s1>;
+def HEXAGON_M2_mpyud_lh_s0:
+  di_MInst_sisi_lh                <"mpyu",    int_hexagon_M2_mpyud_lh_s0>;
+def HEXAGON_M2_mpyud_lh_s1:
+  di_MInst_sisi_lh_s1             <"mpyu",    int_hexagon_M2_mpyud_lh_s1>;
+def HEXAGON_M2_mpyud_ll_s0:
+  di_MInst_sisi_ll                <"mpyu",    int_hexagon_M2_mpyud_ll_s0>;
+def HEXAGON_M2_mpyud_ll_s1:
+  di_MInst_sisi_ll_s1             <"mpyu",    int_hexagon_M2_mpyud_ll_s1>;
+
+//Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyu_acc_hh_s0:
+  si_MInst_sisisi_acc_hh            <"mpyu",    int_hexagon_M2_mpyu_acc_hh_s0>;
+def HEXAGON_M2_mpyu_acc_hh_s1:
+  si_MInst_sisisi_acc_hh_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_hh_s1>;
+def HEXAGON_M2_mpyu_acc_hl_s0:
+  si_MInst_sisisi_acc_hl            <"mpyu",    int_hexagon_M2_mpyu_acc_hl_s0>;
+def HEXAGON_M2_mpyu_acc_hl_s1:
+  si_MInst_sisisi_acc_hl_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_hl_s1>;
+def HEXAGON_M2_mpyu_acc_lh_s0:
+  si_MInst_sisisi_acc_lh            <"mpyu",    int_hexagon_M2_mpyu_acc_lh_s0>;
+def HEXAGON_M2_mpyu_acc_lh_s1:
+  si_MInst_sisisi_acc_lh_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_lh_s1>;
+def HEXAGON_M2_mpyu_acc_ll_s0:
+  si_MInst_sisisi_acc_ll            <"mpyu",    int_hexagon_M2_mpyu_acc_ll_s0>;
+def HEXAGON_M2_mpyu_acc_ll_s1:
+  si_MInst_sisisi_acc_ll_s1         <"mpyu",    int_hexagon_M2_mpyu_acc_ll_s1>;
+
+//Rd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyu_nac_hh_s0:
+  si_MInst_sisisi_nac_hh            <"mpyu",    int_hexagon_M2_mpyu_nac_hh_s0>;
+def HEXAGON_M2_mpyu_nac_hh_s1:
+  si_MInst_sisisi_nac_hh_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_hh_s1>;
+def HEXAGON_M2_mpyu_nac_hl_s0:
+  si_MInst_sisisi_nac_hl            <"mpyu",    int_hexagon_M2_mpyu_nac_hl_s0>;
+def HEXAGON_M2_mpyu_nac_hl_s1:
+  si_MInst_sisisi_nac_hl_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_hl_s1>;
+def HEXAGON_M2_mpyu_nac_lh_s0:
+  si_MInst_sisisi_nac_lh            <"mpyu",    int_hexagon_M2_mpyu_nac_lh_s0>;
+def HEXAGON_M2_mpyu_nac_lh_s1:
+  si_MInst_sisisi_nac_lh_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_lh_s1>;
+def HEXAGON_M2_mpyu_nac_ll_s0:
+  si_MInst_sisisi_nac_ll            <"mpyu",    int_hexagon_M2_mpyu_nac_ll_s0>;
+def HEXAGON_M2_mpyu_nac_ll_s1:
+  si_MInst_sisisi_nac_ll_s1         <"mpyu",    int_hexagon_M2_mpyu_nac_ll_s1>;
+
+//Rdd+=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyud_acc_hh_s0:
+  di_MInst_disisi_acc_hh            <"mpyu", int_hexagon_M2_mpyud_acc_hh_s0>;
+def HEXAGON_M2_mpyud_acc_hh_s1:
+  di_MInst_disisi_acc_hh_s1         <"mpyu", int_hexagon_M2_mpyud_acc_hh_s1>;
+def HEXAGON_M2_mpyud_acc_hl_s0:
+  di_MInst_disisi_acc_hl            <"mpyu", int_hexagon_M2_mpyud_acc_hl_s0>;
+def HEXAGON_M2_mpyud_acc_hl_s1:
+  di_MInst_disisi_acc_hl_s1         <"mpyu", int_hexagon_M2_mpyud_acc_hl_s1>;
+def HEXAGON_M2_mpyud_acc_lh_s0:
+  di_MInst_disisi_acc_lh            <"mpyu", int_hexagon_M2_mpyud_acc_lh_s0>;
+def HEXAGON_M2_mpyud_acc_lh_s1:
+  di_MInst_disisi_acc_lh_s1         <"mpyu", int_hexagon_M2_mpyud_acc_lh_s1>;
+def HEXAGON_M2_mpyud_acc_ll_s0:
+  di_MInst_disisi_acc_ll            <"mpyu", int_hexagon_M2_mpyud_acc_ll_s0>;
+def HEXAGON_M2_mpyud_acc_ll_s1:
+  di_MInst_disisi_acc_ll_s1         <"mpyu", int_hexagon_M2_mpyud_acc_ll_s1>;
+
+//Rdd-=mpyu(Rs.[H|L],Rt.[H|L])[:<<0|:<<1]
+def HEXAGON_M2_mpyud_nac_hh_s0:
+  di_MInst_disisi_nac_hh            <"mpyu", int_hexagon_M2_mpyud_nac_hh_s0>;
+def HEXAGON_M2_mpyud_nac_hh_s1:
+  di_MInst_disisi_nac_hh_s1         <"mpyu", int_hexagon_M2_mpyud_nac_hh_s1>;
+def HEXAGON_M2_mpyud_nac_hl_s0:
+  di_MInst_disisi_nac_hl            <"mpyu", int_hexagon_M2_mpyud_nac_hl_s0>;
+def HEXAGON_M2_mpyud_nac_hl_s1:
+  di_MInst_disisi_nac_hl_s1         <"mpyu", int_hexagon_M2_mpyud_nac_hl_s1>;
+def HEXAGON_M2_mpyud_nac_lh_s0:
+  di_MInst_disisi_nac_lh            <"mpyu", int_hexagon_M2_mpyud_nac_lh_s0>;
+def HEXAGON_M2_mpyud_nac_lh_s1:
+  di_MInst_disisi_nac_lh_s1         <"mpyu", int_hexagon_M2_mpyud_nac_lh_s1>;
+def HEXAGON_M2_mpyud_nac_ll_s0:
+  di_MInst_disisi_nac_ll            <"mpyu", int_hexagon_M2_mpyud_nac_ll_s0>;
+def HEXAGON_M2_mpyud_nac_ll_s1:
+  di_MInst_disisi_nac_ll_s1         <"mpyu", int_hexagon_M2_mpyud_nac_ll_s1>;
+
+
+/********************************************************************
+*            MTYPE/VB                                               *
+*********************************************************************/
+
+// MTYPE / VB / Vector reduce add unsigned bytes.
+def HEXAGON_A2_vraddub:
+  di_MInst_didi                   <"vraddub", int_hexagon_A2_vraddub>;
+def HEXAGON_A2_vraddub_acc:
+  di_MInst_dididi_acc             <"vraddub", int_hexagon_A2_vraddub_acc>;
+
+// MTYPE / VB / Vector sum of absolute differences unsigned bytes.
+def HEXAGON_A2_vrsadub:
+  di_MInst_didi                   <"vrsadub", int_hexagon_A2_vrsadub>;
+def HEXAGON_A2_vrsadub_acc:
+  di_MInst_dididi_acc             <"vrsadub", int_hexagon_A2_vrsadub_acc>;
+
+/********************************************************************
+*            MTYPE/VH                                               *
+*********************************************************************/
+
+// MTYPE / VH / Vector dual multiply.
+def HEXAGON_M2_vdmpys_s1:
+  di_MInst_didi_s1_sat            <"vdmpy",   int_hexagon_M2_vdmpys_s1>;
+def HEXAGON_M2_vdmpys_s0:
+  di_MInst_didi_sat               <"vdmpy",   int_hexagon_M2_vdmpys_s0>;
+def HEXAGON_M2_vdmacs_s1:
+  di_MInst_dididi_acc_s1_sat      <"vdmpy",   int_hexagon_M2_vdmacs_s1>;
+def HEXAGON_M2_vdmacs_s0:
+  di_MInst_dididi_acc_sat         <"vdmpy",   int_hexagon_M2_vdmacs_s0>;
+
+// MTYPE / VH / Vector dual multiply with round and pack.
+def HEXAGON_M2_vdmpyrs_s0:
+  si_MInst_didi_rnd_sat           <"vdmpy",   int_hexagon_M2_vdmpyrs_s0>;
+def HEXAGON_M2_vdmpyrs_s1:
+  si_MInst_didi_s1_rnd_sat        <"vdmpy",   int_hexagon_M2_vdmpyrs_s1>;
+
+// MTYPE / VH / Vector multiply even halfwords.
+def HEXAGON_M2_vmpy2es_s1:
+  di_MInst_didi_s1_sat            <"vmpyeh",  int_hexagon_M2_vmpy2es_s1>;
+def HEXAGON_M2_vmpy2es_s0:
+  di_MInst_didi_sat               <"vmpyeh",  int_hexagon_M2_vmpy2es_s0>;
+def HEXAGON_M2_vmac2es:
+  di_MInst_dididi_acc             <"vmpyeh",  int_hexagon_M2_vmac2es>;
+def HEXAGON_M2_vmac2es_s1:
+  di_MInst_dididi_acc_s1_sat      <"vmpyeh",  int_hexagon_M2_vmac2es_s1>;
+def HEXAGON_M2_vmac2es_s0:
+  di_MInst_dididi_acc_sat         <"vmpyeh",  int_hexagon_M2_vmac2es_s0>;
+
+// MTYPE / VH / Vector multiply halfwords.
+def HEXAGON_M2_vmpy2s_s0:
+  di_MInst_sisi_sat               <"vmpyh",   int_hexagon_M2_vmpy2s_s0>;
+def HEXAGON_M2_vmpy2s_s1:
+  di_MInst_sisi_s1_sat            <"vmpyh",   int_hexagon_M2_vmpy2s_s1>;
+def HEXAGON_M2_vmac2:
+  di_MInst_disisi_acc             <"vmpyh",   int_hexagon_M2_vmac2>;
+def HEXAGON_M2_vmac2s_s0:
+  di_MInst_disisi_acc_sat         <"vmpyh",   int_hexagon_M2_vmac2s_s0>;
+def HEXAGON_M2_vmac2s_s1:
+  di_MInst_disisi_acc_s1_sat      <"vmpyh",   int_hexagon_M2_vmac2s_s1>;
+
+// MTYPE / VH / Vector multiply halfwords with round and pack.
+def HEXAGON_M2_vmpy2s_s0pack:
+  si_MInst_sisi_rnd_sat           <"vmpyh",   int_hexagon_M2_vmpy2s_s0pack>;
+def HEXAGON_M2_vmpy2s_s1pack:
+  si_MInst_sisi_s1_rnd_sat        <"vmpyh",   int_hexagon_M2_vmpy2s_s1pack>;
+
+// MTYPE / VH / Vector reduce multiply halfwords.
+// Rxx32+=vrmpyh(Rss32,Rtt32)
+def HEXAGON_M2_vrmpy_s0:
+  di_MInst_didi                   <"vrmpyh",  int_hexagon_M2_vrmpy_s0>;
+def HEXAGON_M2_vrmac_s0:
+  di_MInst_dididi_acc             <"vrmpyh",  int_hexagon_M2_vrmac_s0>;
+
+
+/********************************************************************
+*            STYPE/ALU                                              *
+*********************************************************************/
+
+// STYPE / ALU / Absolute value.
+def HEXAGON_A2_abs:
+  si_SInst_si                     <"abs",     int_hexagon_A2_abs>;
+def HEXAGON_A2_absp:
+  di_SInst_di                     <"abs",     int_hexagon_A2_absp>;
+def HEXAGON_A2_abssat:
+  si_SInst_si_sat                 <"abs",     int_hexagon_A2_abssat>;
+
+// STYPE / ALU / Negate.
+def HEXAGON_A2_negp:
+  di_SInst_di                     <"neg",     int_hexagon_A2_negp>;
+def HEXAGON_A2_negsat:
+  si_SInst_si_sat                 <"neg",     int_hexagon_A2_negsat>;
+
+// STYPE / ALU / Logical Not.
+def HEXAGON_A2_notp:
+  di_SInst_di                     <"not",     int_hexagon_A2_notp>;
+
+// STYPE / ALU / Sign extend word to doubleword.
+def HEXAGON_A2_sxtw:
+  di_SInst_si                     <"sxtw",     int_hexagon_A2_sxtw>;
+
+
+/********************************************************************
+*            STYPE/BIT                                              *
+*********************************************************************/
+
+// STYPE / BIT / Count leading.
+def HEXAGON_S2_cl0:
+  si_SInst_si                     <"cl0",     int_hexagon_S2_cl0>;
+def HEXAGON_S2_cl0p:
+  si_SInst_di                     <"cl0",     int_hexagon_S2_cl0p>;
+def HEXAGON_S2_cl1:
+  si_SInst_si                     <"cl1",     int_hexagon_S2_cl1>;
+def HEXAGON_S2_cl1p:
+  si_SInst_di                     <"cl1",     int_hexagon_S2_cl1p>;
+def HEXAGON_S2_clb:
+  si_SInst_si                     <"clb",     int_hexagon_S2_clb>;
+def HEXAGON_S2_clbp:
+  si_SInst_di                     <"clb",     int_hexagon_S2_clbp>;
+def HEXAGON_S2_clbnorm:
+  si_SInst_si                     <"normamt", int_hexagon_S2_clbnorm>;
+
+// STYPE / BIT / Count trailing.
+def HEXAGON_S2_ct0:
+  si_SInst_si                     <"ct0",     int_hexagon_S2_ct0>;
+def HEXAGON_S2_ct1:
+  si_SInst_si                     <"ct1",     int_hexagon_S2_ct1>;
+
+// STYPE / BIT / Compare bit mask.
+def Hexagon_C2_bitsclr:
+  qi_SInst_sisi                   <"bitsclr", int_hexagon_C2_bitsclr>;
+def Hexagon_C2_bitsclri:
+  qi_SInst_siu6                   <"bitsclr", int_hexagon_C2_bitsclri>;
+def Hexagon_C2_bitsset:
+  qi_SInst_sisi                   <"bitsset", int_hexagon_C2_bitsset>;
+
+// STYPE / BIT / Extract unsigned.
+// Rd[d][32/64]=extractu(Rs[s],Rt[t],[imm])
+def HEXAGON_S2_extractu:
+  si_SInst_siu5u5                 <"extractu",int_hexagon_S2_extractu>;
+def HEXAGON_S2_extractu_rp:
+  si_SInst_sidi                   <"extractu",int_hexagon_S2_extractu_rp>;
+def HEXAGON_S2_extractup:
+  di_SInst_diu6u6                 <"extractu",int_hexagon_S2_extractup>;
+def HEXAGON_S2_extractup_rp:
+  di_SInst_didi                   <"extractu",int_hexagon_S2_extractup_rp>;
+
+// STYPE / BIT / Insert bitfield.
+def Hexagon_S2_insert:
+  si_SInst_sisiu5u5               <"insert",  int_hexagon_S2_insert>;
+def Hexagon_S2_insert_rp:
+  si_SInst_sisidi                 <"insert",  int_hexagon_S2_insert_rp>;
+def Hexagon_S2_insertp:
+  di_SInst_didiu6u6               <"insert",  int_hexagon_S2_insertp>;
+def Hexagon_S2_insertp_rp:
+  di_SInst_dididi                 <"insert",  int_hexagon_S2_insertp_rp>;
+
+// STYPE / BIT / Innterleave/deinterleave.
+def Hexagon_S2_interleave:
+  di_SInst_di                     <"interleave", int_hexagon_S2_interleave>;
+def Hexagon_S2_deinterleave:
+  di_SInst_di                     <"deinterleave", int_hexagon_S2_deinterleave>;
+
+// STYPE / BIT / Linear feedback-shift Iteration.
+def Hexagon_S2_lfsp:
+  di_SInst_didi                   <"lfs",     int_hexagon_S2_lfsp>;
+
+// STYPE / BIT / Bit reverse.
+def Hexagon_S2_brev:
+  si_SInst_si                     <"brev",    int_hexagon_S2_brev>;
+
+// STYPE / BIT / Set/Clear/Toggle Bit.
+def HEXAGON_S2_setbit_i:
+  si_SInst_siu5                   <"setbit",  int_hexagon_S2_setbit_i>;
+def HEXAGON_S2_togglebit_i:
+  si_SInst_siu5                   <"togglebit", int_hexagon_S2_togglebit_i>;
+def HEXAGON_S2_clrbit_i:
+  si_SInst_siu5                   <"clrbit",  int_hexagon_S2_clrbit_i>;
+def HEXAGON_S2_setbit_r:
+  si_SInst_sisi                   <"setbit",  int_hexagon_S2_setbit_r>;
+def HEXAGON_S2_togglebit_r:
+  si_SInst_sisi                   <"togglebit", int_hexagon_S2_togglebit_r>;
+def HEXAGON_S2_clrbit_r:
+  si_SInst_sisi                   <"clrbit",  int_hexagon_S2_clrbit_r>;
+
+// STYPE / BIT / Test Bit.
+def HEXAGON_S2_tstbit_i:
+  qi_SInst_siu5                   <"tstbit",  int_hexagon_S2_tstbit_i>;
+def HEXAGON_S2_tstbit_r:
+  qi_SInst_sisi                   <"tstbit",  int_hexagon_S2_tstbit_r>;
+
+
+/********************************************************************
+*            STYPE/COMPLEX                                          *
+*********************************************************************/
+
+// STYPE / COMPLEX / Vector Complex conjugate.
+def HEXAGON_A2_vconj:
+  di_SInst_di_sat                 <"vconj",   int_hexagon_A2_vconj>;
+
+// STYPE / COMPLEX / Vector Complex rotate.
+def HEXAGON_S2_vcrotate:
+  di_SInst_disi                   <"vcrotate",int_hexagon_S2_vcrotate>;
+
+
+/********************************************************************
+*            STYPE/PERM                                             *
+*********************************************************************/
+
+// STYPE / PERM / Saturate.
+def HEXAGON_A2_sat:
+  si_SInst_di                     <"sat",     int_hexagon_A2_sat>;
+def HEXAGON_A2_satb:
+  si_SInst_si                     <"satb",    int_hexagon_A2_satb>;
+def HEXAGON_A2_sath:
+  si_SInst_si                     <"sath",    int_hexagon_A2_sath>;
+def HEXAGON_A2_satub:
+  si_SInst_si                     <"satub",   int_hexagon_A2_satub>;
+def HEXAGON_A2_satuh:
+  si_SInst_si                     <"satuh",   int_hexagon_A2_satuh>;
+
+// STYPE / PERM / Swizzle bytes.
+def HEXAGON_A2_swiz:
+  si_SInst_si                     <"swiz",    int_hexagon_A2_swiz>;
+
+// STYPE / PERM / Vector align.
+// Need custom lowering
+def HEXAGON_S2_valignib:
+  di_SInst_didiu3                 <"valignb", int_hexagon_S2_valignib>;
+def HEXAGON_S2_valignrb:
+  di_SInst_didiqi                 <"valignb", int_hexagon_S2_valignrb>;
+
+// STYPE / PERM / Vector round and pack.
+def HEXAGON_S2_vrndpackwh:
+  si_SInst_di                     <"vrndwh",  int_hexagon_S2_vrndpackwh>;
+def HEXAGON_S2_vrndpackwhs:
+  si_SInst_di_sat                 <"vrndwh",  int_hexagon_S2_vrndpackwhs>;
+
+// STYPE / PERM / Vector saturate and pack.
+def HEXAGON_S2_svsathb:
+  si_SInst_si                     <"vsathb",  int_hexagon_S2_svsathb>;
+def HEXAGON_S2_vsathb:
+  si_SInst_di                     <"vsathb",  int_hexagon_S2_vsathb>;
+def HEXAGON_S2_svsathub:
+  si_SInst_si                     <"vsathub", int_hexagon_S2_svsathub>;
+def HEXAGON_S2_vsathub:
+  si_SInst_di                     <"vsathub", int_hexagon_S2_vsathub>;
+def HEXAGON_S2_vsatwh:
+  si_SInst_di                     <"vsatwh",  int_hexagon_S2_vsatwh>;
+def HEXAGON_S2_vsatwuh:
+  si_SInst_di                     <"vsatwuh", int_hexagon_S2_vsatwuh>;
+
+// STYPE / PERM / Vector saturate without pack.
+def HEXAGON_S2_vsathb_nopack:
+  di_SInst_di                     <"vsathb",  int_hexagon_S2_vsathb_nopack>;
+def HEXAGON_S2_vsathub_nopack:
+  di_SInst_di                     <"vsathub", int_hexagon_S2_vsathub_nopack>;
+def HEXAGON_S2_vsatwh_nopack:
+  di_SInst_di                     <"vsatwh",  int_hexagon_S2_vsatwh_nopack>;
+def HEXAGON_S2_vsatwuh_nopack:
+  di_SInst_di                     <"vsatwuh", int_hexagon_S2_vsatwuh_nopack>;
+
+// STYPE / PERM / Vector shuffle.
+def HEXAGON_S2_shuffeb:
+  di_SInst_didi                   <"shuffeb", int_hexagon_S2_shuffeb>;
+def HEXAGON_S2_shuffeh:
+  di_SInst_didi                   <"shuffeh", int_hexagon_S2_shuffeh>;
+def HEXAGON_S2_shuffob:
+  di_SInst_didi                   <"shuffob", int_hexagon_S2_shuffob>;
+def HEXAGON_S2_shuffoh:
+  di_SInst_didi                   <"shuffoh", int_hexagon_S2_shuffoh>;
+
+// STYPE / PERM / Vector splat bytes.
+def HEXAGON_S2_vsplatrb:
+  si_SInst_si                     <"vsplatb", int_hexagon_S2_vsplatrb>;
+
+// STYPE / PERM / Vector splat halfwords.
+def HEXAGON_S2_vsplatrh:
+  di_SInst_si                     <"vsplath", int_hexagon_S2_vsplatrh>;
+
+// STYPE / PERM / Vector splice.
+def Hexagon_S2_vsplicerb:
+  di_SInst_didiqi                 <"vspliceb",int_hexagon_S2_vsplicerb>;
+def Hexagon_S2_vspliceib:
+  di_SInst_didiu3                 <"vspliceb",int_hexagon_S2_vspliceib>;
+
+// STYPE / PERM / Sign extend.
+def HEXAGON_S2_vsxtbh:
+  di_SInst_si                     <"vsxtbh",  int_hexagon_S2_vsxtbh>;
+def HEXAGON_S2_vsxthw:
+  di_SInst_si                     <"vsxthw",  int_hexagon_S2_vsxthw>;
+
+// STYPE / PERM / Truncate.
+def HEXAGON_S2_vtrunehb:
+  si_SInst_di                     <"vtrunehb",int_hexagon_S2_vtrunehb>;
+def HEXAGON_S2_vtrunohb:
+  si_SInst_di                     <"vtrunohb",int_hexagon_S2_vtrunohb>;
+def HEXAGON_S2_vtrunewh:
+  di_SInst_didi                   <"vtrunewh",int_hexagon_S2_vtrunewh>;
+def HEXAGON_S2_vtrunowh:
+  di_SInst_didi                   <"vtrunowh",int_hexagon_S2_vtrunowh>;
+
+// STYPE / PERM / Zero extend.
+def HEXAGON_S2_vzxtbh:
+  di_SInst_si                     <"vzxtbh",  int_hexagon_S2_vzxtbh>;
+def HEXAGON_S2_vzxthw:
+  di_SInst_si                     <"vzxthw",  int_hexagon_S2_vzxthw>;
+
+
+/********************************************************************
+*            STYPE/PRED                                             *
+*********************************************************************/
+
+// STYPE / PRED / Mask generate from predicate.
+def HEXAGON_C2_mask:
+  di_SInst_qi                     <"mask",   int_hexagon_C2_mask>;
+
+// STYPE / PRED / Predicate transfer.
+def HEXAGON_C2_tfrpr:
+  si_SInst_qi                     <"",       int_hexagon_C2_tfrpr>;
+def HEXAGON_C2_tfrrp:
+  qi_SInst_si                     <"",       int_hexagon_C2_tfrrp>;
+
+// STYPE / PRED / Viterbi pack even and odd predicate bits.
+def HEXAGON_C2_vitpack:
+  si_SInst_qiqi                   <"vitpack",int_hexagon_C2_vitpack>;
+
+
+/********************************************************************
+*            STYPE/SHIFT                                            *
+*********************************************************************/
+
+// STYPE / SHIFT / Shift by immediate.
+def HEXAGON_S2_asl_i_r:
+  si_SInst_siu5                   <"asl",     int_hexagon_S2_asl_i_r>;
+def HEXAGON_S2_asr_i_r:
+  si_SInst_siu5                   <"asr",     int_hexagon_S2_asr_i_r>;
+def HEXAGON_S2_lsr_i_r:
+  si_SInst_siu5                   <"lsr",     int_hexagon_S2_lsr_i_r>;
+def HEXAGON_S2_asl_i_p:
+  di_SInst_diu6                   <"asl",     int_hexagon_S2_asl_i_p>;
+def HEXAGON_S2_asr_i_p:
+  di_SInst_diu6                   <"asr",     int_hexagon_S2_asr_i_p>;
+def HEXAGON_S2_lsr_i_p:
+  di_SInst_diu6                   <"lsr",     int_hexagon_S2_lsr_i_p>;
+
+// STYPE / SHIFT / Shift by immediate and accumulate.
+def HEXAGON_S2_asl_i_r_acc:
+  si_SInst_sisiu5_acc             <"asl",     int_hexagon_S2_asl_i_r_acc>;
+def HEXAGON_S2_asr_i_r_acc:
+  si_SInst_sisiu5_acc             <"asr",     int_hexagon_S2_asr_i_r_acc>;
+def HEXAGON_S2_lsr_i_r_acc:
+  si_SInst_sisiu5_acc             <"lsr",     int_hexagon_S2_lsr_i_r_acc>;
+def HEXAGON_S2_asl_i_r_nac:
+  si_SInst_sisiu5_nac             <"asl",     int_hexagon_S2_asl_i_r_nac>;
+def HEXAGON_S2_asr_i_r_nac:
+  si_SInst_sisiu5_nac             <"asr",     int_hexagon_S2_asr_i_r_nac>;
+def HEXAGON_S2_lsr_i_r_nac:
+  si_SInst_sisiu5_nac             <"lsr",     int_hexagon_S2_lsr_i_r_nac>;
+def HEXAGON_S2_asl_i_p_acc:
+  di_SInst_didiu6_acc             <"asl",     int_hexagon_S2_asl_i_p_acc>;
+def HEXAGON_S2_asr_i_p_acc:
+  di_SInst_didiu6_acc             <"asr",     int_hexagon_S2_asr_i_p_acc>;
+def HEXAGON_S2_lsr_i_p_acc:
+  di_SInst_didiu6_acc             <"lsr",     int_hexagon_S2_lsr_i_p_acc>;
+def HEXAGON_S2_asl_i_p_nac:
+  di_SInst_didiu6_nac             <"asl",     int_hexagon_S2_asl_i_p_nac>;
+def HEXAGON_S2_asr_i_p_nac:
+  di_SInst_didiu6_nac             <"asr",     int_hexagon_S2_asr_i_p_nac>;
+def HEXAGON_S2_lsr_i_p_nac:
+  di_SInst_didiu6_nac             <"lsr",     int_hexagon_S2_lsr_i_p_nac>;
+
+// STYPE / SHIFT / Shift by immediate and add.
+def HEXAGON_S2_addasl_rrri:
+  si_SInst_sisiu3                 <"addasl",  int_hexagon_S2_addasl_rrri>;
+
+// STYPE / SHIFT / Shift by immediate and logical.
+def HEXAGON_S2_asl_i_r_and:
+  si_SInst_sisiu5_and             <"asl",     int_hexagon_S2_asl_i_r_and>;
+def HEXAGON_S2_asr_i_r_and:
+  si_SInst_sisiu5_and             <"asr",     int_hexagon_S2_asr_i_r_and>;
+def HEXAGON_S2_lsr_i_r_and:
+  si_SInst_sisiu5_and             <"lsr",     int_hexagon_S2_lsr_i_r_and>;
+
+def HEXAGON_S2_asl_i_r_xacc:
+  si_SInst_sisiu5_xor             <"asl",     int_hexagon_S2_asl_i_r_xacc>;
+def HEXAGON_S2_lsr_i_r_xacc:
+  si_SInst_sisiu5_xor             <"lsr",     int_hexagon_S2_lsr_i_r_xacc>;
+
+def HEXAGON_S2_asl_i_r_or:
+  si_SInst_sisiu5_or              <"asl",     int_hexagon_S2_asl_i_r_or>;
+def HEXAGON_S2_asr_i_r_or:
+  si_SInst_sisiu5_or              <"asr",     int_hexagon_S2_asr_i_r_or>;
+def HEXAGON_S2_lsr_i_r_or:
+  si_SInst_sisiu5_or              <"lsr",     int_hexagon_S2_lsr_i_r_or>;
+
+def HEXAGON_S2_asl_i_p_and:
+  di_SInst_didiu6_and             <"asl",     int_hexagon_S2_asl_i_p_and>;
+def HEXAGON_S2_asr_i_p_and:
+  di_SInst_didiu6_and             <"asr",     int_hexagon_S2_asr_i_p_and>;
+def HEXAGON_S2_lsr_i_p_and:
+  di_SInst_didiu6_and             <"lsr",     int_hexagon_S2_lsr_i_p_and>;
+
+def HEXAGON_S2_asl_i_p_xacc:
+  di_SInst_didiu6_xor             <"asl",     int_hexagon_S2_asl_i_p_xacc>;
+def HEXAGON_S2_lsr_i_p_xacc:
+  di_SInst_didiu6_xor             <"lsr",     int_hexagon_S2_lsr_i_p_xacc>;
+
+def HEXAGON_S2_asl_i_p_or:
+  di_SInst_didiu6_or              <"asl",     int_hexagon_S2_asl_i_p_or>;
+def HEXAGON_S2_asr_i_p_or:
+  di_SInst_didiu6_or              <"asr",     int_hexagon_S2_asr_i_p_or>;
+def HEXAGON_S2_lsr_i_p_or:
+  di_SInst_didiu6_or              <"lsr",     int_hexagon_S2_lsr_i_p_or>;
+
+// STYPE / SHIFT / Shift right by immediate with rounding.
+def HEXAGON_S2_asr_i_r_rnd:
+  si_SInst_siu5_rnd               <"asr",     int_hexagon_S2_asr_i_r_rnd>;
+def HEXAGON_S2_asr_i_r_rnd_goodsyntax:
+  si_SInst_siu5              <"asrrnd",  int_hexagon_S2_asr_i_r_rnd_goodsyntax>;
+
+// STYPE / SHIFT / Shift left by immediate with saturation.
+def HEXAGON_S2_asl_i_r_sat:
+  si_SInst_sisi_sat               <"asl",     int_hexagon_S2_asl_i_r_sat>;
+
+// STYPE / SHIFT / Shift by register.
+def HEXAGON_S2_asl_r_r:
+  si_SInst_sisi                   <"asl",     int_hexagon_S2_asl_r_r>;
+def HEXAGON_S2_asr_r_r:
+  si_SInst_sisi                   <"asr",     int_hexagon_S2_asr_r_r>;
+def HEXAGON_S2_lsl_r_r:
+  si_SInst_sisi                   <"lsl",     int_hexagon_S2_lsl_r_r>;
+def HEXAGON_S2_lsr_r_r:
+  si_SInst_sisi                   <"lsr",     int_hexagon_S2_lsr_r_r>;
+def HEXAGON_S2_asl_r_p:
+  di_SInst_disi                   <"asl",     int_hexagon_S2_asl_r_p>;
+def HEXAGON_S2_asr_r_p:
+  di_SInst_disi                   <"asr",     int_hexagon_S2_asr_r_p>;
+def HEXAGON_S2_lsl_r_p:
+  di_SInst_disi                   <"lsl",     int_hexagon_S2_lsl_r_p>;
+def HEXAGON_S2_lsr_r_p:
+  di_SInst_disi                   <"lsr",     int_hexagon_S2_lsr_r_p>;
+
+// STYPE / SHIFT / Shift by register and accumulate.
+def HEXAGON_S2_asl_r_r_acc:
+  si_SInst_sisisi_acc             <"asl",     int_hexagon_S2_asl_r_r_acc>;
+def HEXAGON_S2_asr_r_r_acc:
+  si_SInst_sisisi_acc             <"asr",     int_hexagon_S2_asr_r_r_acc>;
+def HEXAGON_S2_lsl_r_r_acc:
+  si_SInst_sisisi_acc             <"lsl",     int_hexagon_S2_lsl_r_r_acc>;
+def HEXAGON_S2_lsr_r_r_acc:
+  si_SInst_sisisi_acc             <"lsr",     int_hexagon_S2_lsr_r_r_acc>;
+def HEXAGON_S2_asl_r_p_acc:
+  di_SInst_didisi_acc             <"asl",     int_hexagon_S2_asl_r_p_acc>;
+def HEXAGON_S2_asr_r_p_acc:
+  di_SInst_didisi_acc             <"asr",     int_hexagon_S2_asr_r_p_acc>;
+def HEXAGON_S2_lsl_r_p_acc:
+  di_SInst_didisi_acc             <"lsl",     int_hexagon_S2_lsl_r_p_acc>;
+def HEXAGON_S2_lsr_r_p_acc:
+  di_SInst_didisi_acc             <"lsr",     int_hexagon_S2_lsr_r_p_acc>;
+
+def HEXAGON_S2_asl_r_r_nac:
+  si_SInst_sisisi_nac             <"asl",     int_hexagon_S2_asl_r_r_nac>;
+def HEXAGON_S2_asr_r_r_nac:
+  si_SInst_sisisi_nac             <"asr",     int_hexagon_S2_asr_r_r_nac>;
+def HEXAGON_S2_lsl_r_r_nac:
+  si_SInst_sisisi_nac             <"lsl",     int_hexagon_S2_lsl_r_r_nac>;
+def HEXAGON_S2_lsr_r_r_nac:
+  si_SInst_sisisi_nac             <"lsr",     int_hexagon_S2_lsr_r_r_nac>;
+def HEXAGON_S2_asl_r_p_nac:
+  di_SInst_didisi_nac             <"asl",     int_hexagon_S2_asl_r_p_nac>;
+def HEXAGON_S2_asr_r_p_nac:
+  di_SInst_didisi_nac             <"asr",     int_hexagon_S2_asr_r_p_nac>;
+def HEXAGON_S2_lsl_r_p_nac:
+  di_SInst_didisi_nac             <"lsl",     int_hexagon_S2_lsl_r_p_nac>;
+def HEXAGON_S2_lsr_r_p_nac:
+  di_SInst_didisi_nac             <"lsr",     int_hexagon_S2_lsr_r_p_nac>;
+
+// STYPE / SHIFT / Shift by register and logical.
+def HEXAGON_S2_asl_r_r_and:
+  si_SInst_sisisi_and             <"asl",     int_hexagon_S2_asl_r_r_and>;
+def HEXAGON_S2_asr_r_r_and:
+  si_SInst_sisisi_and             <"asr",     int_hexagon_S2_asr_r_r_and>;
+def HEXAGON_S2_lsl_r_r_and:
+  si_SInst_sisisi_and             <"lsl",     int_hexagon_S2_lsl_r_r_and>;
+def HEXAGON_S2_lsr_r_r_and:
+  si_SInst_sisisi_and             <"lsr",     int_hexagon_S2_lsr_r_r_and>;
+
+def HEXAGON_S2_asl_r_r_or:
+  si_SInst_sisisi_or              <"asl",     int_hexagon_S2_asl_r_r_or>;
+def HEXAGON_S2_asr_r_r_or:
+  si_SInst_sisisi_or              <"asr",     int_hexagon_S2_asr_r_r_or>;
+def HEXAGON_S2_lsl_r_r_or:
+  si_SInst_sisisi_or              <"lsl",     int_hexagon_S2_lsl_r_r_or>;
+def HEXAGON_S2_lsr_r_r_or:
+  si_SInst_sisisi_or              <"lsr",     int_hexagon_S2_lsr_r_r_or>;
+
+def HEXAGON_S2_asl_r_p_and:
+  di_SInst_didisi_and             <"asl",     int_hexagon_S2_asl_r_p_and>;
+def HEXAGON_S2_asr_r_p_and:
+  di_SInst_didisi_and             <"asr",     int_hexagon_S2_asr_r_p_and>;
+def HEXAGON_S2_lsl_r_p_and:
+  di_SInst_didisi_and             <"lsl",     int_hexagon_S2_lsl_r_p_and>;
+def HEXAGON_S2_lsr_r_p_and:
+  di_SInst_didisi_and             <"lsr",     int_hexagon_S2_lsr_r_p_and>;
+
+def HEXAGON_S2_asl_r_p_or:
+  di_SInst_didisi_or              <"asl",     int_hexagon_S2_asl_r_p_or>;
+def HEXAGON_S2_asr_r_p_or:
+  di_SInst_didisi_or              <"asr",     int_hexagon_S2_asr_r_p_or>;
+def HEXAGON_S2_lsl_r_p_or:
+  di_SInst_didisi_or              <"lsl",     int_hexagon_S2_lsl_r_p_or>;
+def HEXAGON_S2_lsr_r_p_or:
+  di_SInst_didisi_or              <"lsr",     int_hexagon_S2_lsr_r_p_or>;
+
+// STYPE / SHIFT / Shift by register with saturation.
+def HEXAGON_S2_asl_r_r_sat:
+  si_SInst_sisi_sat               <"asl",     int_hexagon_S2_asl_r_r_sat>;
+def HEXAGON_S2_asr_r_r_sat:
+  si_SInst_sisi_sat               <"asr",     int_hexagon_S2_asr_r_r_sat>;
+
+// STYPE / SHIFT / Table Index.
+def Hexagon_S2_tableidxb_goodsyntax:
+  si_MInst_sisiu4u5          <"tableidxb",int_hexagon_S2_tableidxb_goodsyntax>;
+def Hexagon_S2_tableidxd_goodsyntax:
+  si_MInst_sisiu4u5          <"tableidxd",int_hexagon_S2_tableidxd_goodsyntax>;
+def Hexagon_S2_tableidxh_goodsyntax:
+  si_MInst_sisiu4u5          <"tableidxh",int_hexagon_S2_tableidxh_goodsyntax>;
+def Hexagon_S2_tableidxw_goodsyntax:
+  si_MInst_sisiu4u5          <"tableidxw",int_hexagon_S2_tableidxw_goodsyntax>;
+
+
+/********************************************************************
+*            STYPE/VH                                               *
+*********************************************************************/
+
+// STYPE / VH / Vector absolute value halfwords.
+// Rdd64=vabsh(Rss64)
+def HEXAGON_A2_vabsh:
+  di_SInst_di                     <"vabsh",   int_hexagon_A2_vabsh>;
+def HEXAGON_A2_vabshsat:
+  di_SInst_di_sat                 <"vabsh",   int_hexagon_A2_vabshsat>;
+
+// STYPE / VH / Vector shift halfwords by immediate.
+// Rdd64=v[asl/asr/lsr]h(Rss64,Rt32)
+def HEXAGON_S2_asl_i_vh:
+  di_SInst_disi                   <"vaslh",   int_hexagon_S2_asl_i_vh>;
+def HEXAGON_S2_asr_i_vh:
+  di_SInst_disi                   <"vasrh",   int_hexagon_S2_asr_i_vh>;
+def HEXAGON_S2_lsr_i_vh:
+  di_SInst_disi                   <"vlsrh",   int_hexagon_S2_lsr_i_vh>;
+
+// STYPE / VH / Vector shift halfwords by register.
+// Rdd64=v[asl/asr/lsl/lsr]w(Rss64,Rt32)
+def HEXAGON_S2_asl_r_vh:
+  di_SInst_disi                   <"vaslh",   int_hexagon_S2_asl_r_vh>;
+def HEXAGON_S2_asr_r_vh:
+  di_SInst_disi                   <"vasrh",   int_hexagon_S2_asr_r_vh>;
+def HEXAGON_S2_lsl_r_vh:
+  di_SInst_disi                   <"vlslh",   int_hexagon_S2_lsl_r_vh>;
+def HEXAGON_S2_lsr_r_vh:
+  di_SInst_disi                   <"vlsrh",   int_hexagon_S2_lsr_r_vh>;
+
+
+/********************************************************************
+*            STYPE/VW                                               *
+*********************************************************************/
+
+// STYPE / VW / Vector absolute value words.
+def HEXAGON_A2_vabsw:
+  di_SInst_di                     <"vabsw",   int_hexagon_A2_vabsw>;
+def HEXAGON_A2_vabswsat:
+  di_SInst_di_sat                 <"vabsw",   int_hexagon_A2_vabswsat>;
+
+// STYPE / VW / Vector shift words by immediate.
+// Rdd64=v[asl/vsl]w(Rss64,Rt32)
+def HEXAGON_S2_asl_i_vw:
+  di_SInst_disi                   <"vaslw",   int_hexagon_S2_asl_i_vw>;
+def HEXAGON_S2_asr_i_vw:
+  di_SInst_disi                   <"vasrw",   int_hexagon_S2_asr_i_vw>;
+def HEXAGON_S2_lsr_i_vw:
+  di_SInst_disi                   <"vlsrw",   int_hexagon_S2_lsr_i_vw>;
+
+// STYPE / VW / Vector shift words by register.
+// Rdd64=v[asl/vsl]w(Rss64,Rt32)
+def HEXAGON_S2_asl_r_vw:
+  di_SInst_disi                   <"vaslw",   int_hexagon_S2_asl_r_vw>;
+def HEXAGON_S2_asr_r_vw:
+  di_SInst_disi                   <"vasrw",   int_hexagon_S2_asr_r_vw>;
+def HEXAGON_S2_lsl_r_vw:
+  di_SInst_disi                   <"vlslw",   int_hexagon_S2_lsl_r_vw>;
+def HEXAGON_S2_lsr_r_vw:
+  di_SInst_disi                   <"vlsrw",   int_hexagon_S2_lsr_r_vw>;
+
+// STYPE / VW / Vector shift words with truncate and pack.
+def HEXAGON_S2_asr_r_svw_trun:
+  si_SInst_disi                   <"vasrw",   int_hexagon_S2_asr_r_svw_trun>;
+def HEXAGON_S2_asr_i_svw_trun:
+  si_SInst_diu5                   <"vasrw",   int_hexagon_S2_asr_i_svw_trun>;
+
+// LD / Circular loads.
+def HEXAGON_circ_ldd:
+  di_LDInstPI_diu4                <"circ_ldd", int_hexagon_circ_ldd>;
+
+include "HexagonIntrinsicsV3.td"
+include "HexagonIntrinsicsV4.td"
+include "HexagonIntrinsicsV5.td"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
new file mode 100644
index 000000000000..2788101d5a66
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsDerived.td
@@ -0,0 +1,39 @@
+//===-- HexagonIntrinsicsDerived.td - Derived intrinsics ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Multiply 64-bit and use lower result
+//
+// Optimized with intrinisics accumulates
+//
+def : Pat <(mul DoubleRegs:$src1, DoubleRegs:$src2),
+      (i64
+       (COMBINE_rr
+        (HEXAGON_M2_maci
+         (HEXAGON_M2_maci
+          (i32
+           (EXTRACT_SUBREG
+            (i64
+             (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1),
+                                          subreg_loreg)),
+                     (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                          subreg_loreg)))),
+            subreg_hireg)),
+          (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+          (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_hireg))),
+         (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2), subreg_loreg)),
+         (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_hireg))),
+        (i32
+         (EXTRACT_SUBREG
+          (i64
+           (MPYU64 (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src1), subreg_loreg)),
+                   (i32 (EXTRACT_SUBREG (i64 DoubleRegs:$src2),
+                                        subreg_loreg)))), subreg_loreg))))>;
+
+
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td
new file mode 100644
index 000000000000..2a54e62d20ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV3.td
@@ -0,0 +1,50 @@
+//=- HexagonIntrinsicsV3.td - Target Description for Hexagon -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Hexagon V3 Compiler Intrinsics in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+
+// MTYPE / COMPLEX / Vector reduce complex multiply real or imaginary.
+def Hexagon_M2_vrcmpys_s1:
+  di_MInst_disi_s1_sat            <"vrcmpys",  int_hexagon_M2_vrcmpys_s1>;
+def Hexagon_M2_vrcmpys_acc_s1:
+  di_MInst_didisi_acc_s1_sat      <"vrcmpys",  int_hexagon_M2_vrcmpys_acc_s1>;
+def Hexagon_M2_vrcmpys_s1rp:
+  si_MInst_disi_s1_rnd_sat        <"vrcmpys",  int_hexagon_M2_vrcmpys_s1rp>;
+
+
+
+
+/********************************************************************
+*            MTYPE/VB                                               *
+*********************************************************************/
+
+// MTYPE / VB / Vector reduce add unsigned bytes.
+def Hexagon_M2_vradduh:
+  si_MInst_didi                   <"vradduh",  int_hexagon_M2_vradduh>;
+
+
+/********************************************************************
+*            ALU64/ALU                                              *
+*********************************************************************/
+
+// ALU64 / ALU / Add.
+def Hexagon_A2_addsp:
+  di_ALU64_sidi                   <"add",      int_hexagon_A2_addsp>;
+def Hexagon_A2_addpsat:
+  di_ALU64_didi                   <"add",      int_hexagon_A2_addpsat>;
+
+def Hexagon_A2_maxp:
+  di_ALU64_didi                   <"max",      int_hexagon_A2_maxp>;
+def Hexagon_A2_maxup:
+  di_ALU64_didi                   <"maxu",     int_hexagon_A2_maxup>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td
new file mode 100644
index 000000000000..dd28ebb57231
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV4.td
@@ -0,0 +1,369 @@
+//===- HexagonIntrinsicsV4.td - V4 Instruction intrinsics --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is populated based on the following specs:
+// Hexagon V4 Architecture Extensions
+// Application-Level Specification
+// 80-V9418-12 Rev. A
+// June 15, 2010
+
+
+//
+// ALU 32 types.
+//
+
+class si_ALU32_sisi_not<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, ~$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class di_ALU32_s8si<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs DoubleRegs:$dst), (ins s8Imm:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "(#$src1, $src2)")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1, IntRegs:$src2))]>;
+
+class di_ALU32_sis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs DoubleRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_neg_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_neg_ALU32_sis10<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, s10Imm:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class qi_neg_ALU32_siu9<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs PredRegs:$dst), (ins IntRegs:$src1, u9Imm:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_neg_ALU32_sisi<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class si_neg_ALU32_sis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class si_ALU32_sis8<string opc, Intrinsic IntID>
+  : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1, s8Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+
+//
+// SInst Classes.
+//
+class qi_neg_SInst_qiqi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = !", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_qi_andqiqi_neg<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                     IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, and($src2, !$src3)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                         IntRegs:$src3))]>;
+
+class qi_SInst_qi_andqiqi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                     IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, and($src2, $src3)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                         IntRegs:$src3))]>;
+
+class qi_SInst_qi_orqiqi_neg<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                     IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, or($src2, !$src3)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                         IntRegs:$src3))]>;
+
+class qi_SInst_qi_orqiqi<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                     IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, or($src2, $src3)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                         IntRegs:$src3))]>;
+
+class si_SInst_si_addsis6<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s6Imm:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, add($src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                        imm:$src3))]>;
+
+class si_SInst_si_subs6si<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s6Imm:$src2, IntRegs:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, sub(#$src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2,
+                                        IntRegs:$src3))]>;
+
+class di_ALU64_didi_neg<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, ~$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class di_MInst_dididi_xacc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                           DoubleRegs:$src2),
+               !strconcat("$dst ^= ", !strconcat(opc , "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                             DoubleRegs:$src2))],
+               "$dst2 = $dst">;
+
+class si_MInst_sisisi_and<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst &= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_MInst_sisisi_andn<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst &= ", !strconcat(opc , "($src2, ~$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_SInst_sisis10_andi<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, s10Imm:$src3),
+             !strconcat("$dst = ", !strconcat(opc ,
+                                              "($src1, and($src2, #$src3))")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2,
+                                        imm:$src3))]>;
+
+class si_MInst_sisisi_xor<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst ^= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_MInst_sisisi_xorn<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst ^= ", !strconcat(opc , "($src2, ~$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_SInst_sisis10_or<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2, s10Imm:$src3),
+             !strconcat("$dst |= ", !strconcat(opc , "($src2, #$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        imm:$src3))]>;
+
+class si_MInst_sisisi_or<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst |= ", !strconcat(opc , "($src2, $src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_MInst_sisisi_orn<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$dst1, IntRegs:$src2,
+                                    IntRegs:$src3),
+             !strconcat("$dst |= ", !strconcat(opc , "($src2, ~$src3)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$dst1, IntRegs:$src2,
+                                        IntRegs:$src3))]>;
+
+class si_SInst_siu5_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")),
+          [(set IntRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+
+/********************************************************************
+*            ALU32/ALU                                              *
+*********************************************************************/
+
+// ALU32 / ALU / Logical Operations.
+def Hexagon_A4_orn  : si_ALU32_sisi_not <"or",  int_hexagon_A4_orn>;
+def Hexagon_A4_andn : si_ALU32_sisi_not <"and", int_hexagon_A4_andn>;
+
+
+/********************************************************************
+*            ALU32/PERM                                             *
+*********************************************************************/
+
+// ALU32 / PERM / Combine Words Into Doublewords.
+def Hexagon_A4_combineir : di_ALU32_s8si  <"combine", int_hexagon_A4_combineir>;
+def Hexagon_A4_combineri : di_ALU32_sis8  <"combine", int_hexagon_A4_combineri>;
+
+
+/********************************************************************
+*            ALU32/PRED                                             *
+*********************************************************************/
+
+// ALU32 / PRED / Conditional Shift Halfword.
+// ALU32 / PRED / Conditional Sign Extend.
+// ALU32 / PRED / Conditional Zero Extend.
+// ALU32 / PRED / Compare.
+def Hexagon_C4_cmpneq  : qi_neg_ALU32_sisi  <"cmp.eq", int_hexagon_C4_cmpneq>;
+def Hexagon_C4_cmpneqi : qi_neg_ALU32_sis10 <"cmp.eq", int_hexagon_C4_cmpneqi>;
+def Hexagon_C4_cmplte  : qi_neg_ALU32_sisi  <"cmp.gt", int_hexagon_C4_cmplte>;
+def Hexagon_C4_cmpltei : qi_neg_ALU32_sis10 <"cmp.gt", int_hexagon_C4_cmpltei>;
+def Hexagon_C4_cmplteu : qi_neg_ALU32_sisi  <"cmp.gtu",int_hexagon_C4_cmplteu>;
+def Hexagon_C4_cmplteui: qi_neg_ALU32_siu9  <"cmp.gtu",int_hexagon_C4_cmplteui>;
+
+// ALU32 / PRED / cmpare To General Register.
+def Hexagon_A4_rcmpneq : si_neg_ALU32_sisi <"cmp.eq", int_hexagon_A4_rcmpneq>;
+def Hexagon_A4_rcmpneqi: si_neg_ALU32_sis8 <"cmp.eq", int_hexagon_A4_rcmpneqi>;
+def Hexagon_A4_rcmpeq  : si_ALU32_sisi     <"cmp.eq", int_hexagon_A4_rcmpeq>;
+def Hexagon_A4_rcmpeqi : si_ALU32_sis8     <"cmp.eq", int_hexagon_A4_rcmpeqi>;
+
+
+/********************************************************************
+*            CR                                                     *
+*********************************************************************/
+
+// CR / Corner Detection Acceleration.
+def Hexagon_C4_fastcorner9:
+  qi_SInst_qiqi<"fastcorner9", int_hexagon_C4_fastcorner9>;
+def Hexagon_C4_fastcorner9_not:
+  qi_neg_SInst_qiqi<"fastcorner9",int_hexagon_C4_fastcorner9_not>;
+
+// CR / Logical Operations On Predicates.
+def Hexagon_C4_and_andn:
+  qi_SInst_qi_andqiqi_neg         <"and",      int_hexagon_C4_and_andn>;
+def Hexagon_C4_and_and:
+  qi_SInst_qi_andqiqi             <"and",      int_hexagon_C4_and_and>;
+def Hexagon_C4_and_orn:
+  qi_SInst_qi_orqiqi_neg          <"and",      int_hexagon_C4_and_orn>;
+def Hexagon_C4_and_or:
+  qi_SInst_qi_orqiqi              <"and",      int_hexagon_C4_and_or>;
+def Hexagon_C4_or_andn:
+  qi_SInst_qi_andqiqi_neg         <"or",       int_hexagon_C4_or_andn>;
+def Hexagon_C4_or_and:
+  qi_SInst_qi_andqiqi             <"or",       int_hexagon_C4_or_and>;
+def Hexagon_C4_or_orn:
+  qi_SInst_qi_orqiqi_neg          <"or",       int_hexagon_C4_or_orn>;
+def Hexagon_C4_or_or:
+  qi_SInst_qi_orqiqi              <"or",       int_hexagon_C4_or_or>;
+
+
+/********************************************************************
+*            XTYPE/ALU                                              *
+*********************************************************************/
+
+// XTYPE / ALU / Add And Accumulate.
+def Hexagon_S4_addaddi:
+  si_SInst_si_addsis6             <"add",      int_hexagon_S4_addaddi>;
+def Hexagon_S4_subaddi:
+  si_SInst_si_subs6si             <"add",      int_hexagon_S4_subaddi>;
+
+// XTYPE / ALU / Logical Doublewords.
+def Hexagon_S4_andnp:
+  di_ALU64_didi_neg               <"and",      int_hexagon_A4_andnp>;
+def Hexagon_S4_ornp:
+  di_ALU64_didi_neg               <"or",       int_hexagon_A4_ornp>;
+
+// XTYPE / ALU / Logical-logical Doublewords.
+def Hexagon_M4_xor_xacc:
+  di_MInst_dididi_xacc            <"xor",      int_hexagon_M4_xor_xacc>;
+
+// XTYPE / ALU / Logical-logical Words.
+def HEXAGON_M4_and_and:
+  si_MInst_sisisi_and             <"and",      int_hexagon_M4_and_and>;
+def HEXAGON_M4_and_or:
+  si_MInst_sisisi_and             <"or",       int_hexagon_M4_and_or>;
+def HEXAGON_M4_and_xor:
+  si_MInst_sisisi_and             <"xor",      int_hexagon_M4_and_xor>;
+def HEXAGON_M4_and_andn:
+  si_MInst_sisisi_andn            <"and",      int_hexagon_M4_and_andn>;
+def HEXAGON_M4_xor_and:
+  si_MInst_sisisi_xor             <"and",      int_hexagon_M4_xor_and>;
+def HEXAGON_M4_xor_or:
+  si_MInst_sisisi_xor             <"or",       int_hexagon_M4_xor_or>;
+def HEXAGON_M4_xor_andn:
+  si_MInst_sisisi_xorn            <"and",      int_hexagon_M4_xor_andn>;
+def HEXAGON_M4_or_and:
+  si_MInst_sisisi_or              <"and",      int_hexagon_M4_or_and>;
+def HEXAGON_M4_or_or:
+  si_MInst_sisisi_or              <"or",       int_hexagon_M4_or_or>;
+def HEXAGON_M4_or_xor:
+  si_MInst_sisisi_or              <"xor",      int_hexagon_M4_or_xor>;
+def HEXAGON_M4_or_andn:
+  si_MInst_sisisi_orn             <"and",      int_hexagon_M4_or_andn>;
+def HEXAGON_S4_or_andix:
+  si_SInst_sisis10_andi           <"or",       int_hexagon_S4_or_andix>;
+def HEXAGON_S4_or_andi:
+  si_SInst_sisis10_or             <"and",      int_hexagon_S4_or_andi>;
+def HEXAGON_S4_or_ori:
+  si_SInst_sisis10_or             <"or",       int_hexagon_S4_or_ori>;
+
+// XTYPE / ALU / Modulo wrap.
+def HEXAGON_A4_modwrapu:
+  si_ALU64_sisi                   <"modwrap",  int_hexagon_A4_modwrapu>;
+
+// XTYPE / ALU / Round.
+def HEXAGON_A4_cround_ri:
+  si_SInst_siu5                   <"cround",   int_hexagon_A4_cround_ri>;
+def HEXAGON_A4_cround_rr:
+  si_SInst_sisi                   <"cround",   int_hexagon_A4_cround_rr>;
+def HEXAGON_A4_round_ri:
+  si_SInst_siu5                   <"round",    int_hexagon_A4_round_ri>;
+def HEXAGON_A4_round_rr:
+  si_SInst_sisi                   <"round",    int_hexagon_A4_round_rr>;
+def HEXAGON_A4_round_ri_sat:
+  si_SInst_siu5_sat               <"round",    int_hexagon_A4_round_ri_sat>;
+def HEXAGON_A4_round_rr_sat:
+  si_SInst_sisi_sat               <"round",    int_hexagon_A4_round_rr_sat>;
+
+// XTYPE / ALU / Vector reduce add unsigned halfwords.
+// XTYPE / ALU / Vector add bytes.
+// XTYPE / ALU / Vector conditional negate.
+// XTYPE / ALU / Vector maximum bytes.
+// XTYPE / ALU / Vector reduce maximum halfwords.
+// XTYPE / ALU / Vector reduce maximum words.
+// XTYPE / ALU / Vector minimum bytes.
+// XTYPE / ALU / Vector reduce minimum halfwords.
+// XTYPE / ALU / Vector reduce minimum words.
+// XTYPE / ALU / Vector subtract bytes.
+
+
+/********************************************************************
+*            XTYPE/BIT                                              *
+*********************************************************************/
+
+// XTYPE / BIT / Count leading.
+// XTYPE / BIT / Count trailing.
+// XTYPE / BIT / Extract bitfield.
+// XTYPE / BIT / Masked parity.
+// XTYPE / BIT / Bit reverse.
+// XTYPE / BIT / Split bitfield.
+
+
+/********************************************************************
+*            XTYPE/COMPLEX                                          *
+*********************************************************************/
+
+// XTYPE / COMPLEX / Complex add/sub halfwords.
+// XTYPE / COMPLEX / Complex add/sub words.
+// XTYPE / COMPLEX / Complex multiply 32x16.
+// XTYPE / COMPLEX / Vector reduce complex rotate.
+
+
+/********************************************************************
+*            XTYPE/MPY                                              *
+*********************************************************************/
+
+// XTYPE / COMPLEX / Complex add/sub halfwords.
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
new file mode 100644
index 000000000000..1d44b526d298
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -0,0 +1,395 @@
+class sf_SInst_sf<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class si_SInst_sf<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class sf_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class sf_SInst_di<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class sf_SInst_df<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class si_SInst_df<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set IntRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class df_SInst_sf<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class di_SInst_sf<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class df_SInst_si<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins IntRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID IntRegs:$src1))]>;
+
+class df_SInst_df<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class di_SInst_df<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+
+class df_SInst_di<string opc, Intrinsic IntID>
+  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "($src1)")),
+             [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1))]>;
+
+class sf_MInst_sfsf<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set IntRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class df_MInst_dfdf<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+           !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+           [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class qi_ALU64_dfdf<string opc, Intrinsic IntID>
+  : ALU64_rr<(outs PredRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2),
+           !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+           [(set PredRegs:$dst, (IntID DoubleRegs:$src1, DoubleRegs:$src2))]>;
+
+class qi_ALU64_dfu5<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs PredRegs:$dst), (ins DoubleRegs:$src1, u5Imm:$src2),
+           !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+           [(set PredRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+
+class sf_MInst_sfsfsf_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$dst2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1,
+                                          IntRegs:$src2, IntRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class sf_MInst_sfsfsf_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$dst2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1, $src2)")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1,
+                                          IntRegs:$src2, IntRegs:$dst2))],
+               "$dst2 = $dst">;
+
+
+class sf_MInst_sfsfsfsi_sc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2, IntRegs:$src3),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2, $src3):scale")),
+               [(set IntRegs:$dst, (IntID IntRegs:$dst2, IntRegs:$src1,
+                                        IntRegs:$src2, IntRegs:$src3))],
+               "$dst2 = $dst">;
+
+class sf_MInst_sfsfsf_acc_lib<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$dst2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2):lib")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1,
+                                          IntRegs:$src2, IntRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class sf_MInst_sfsfsf_nac_lib<string opc, Intrinsic IntID>
+  : MInst_acc<(outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2,
+                                        IntRegs:$dst2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1, $src2):lib")),
+               [(set IntRegs:$dst, (IntID IntRegs:$src1,
+                                          IntRegs:$src2, IntRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_acc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                        DoubleRegs:$dst2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_nac<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                        DoubleRegs:$dst2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1, $src2)")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
+               "$dst2 = $dst">;
+
+
+class df_MInst_dfdfdfsi_sc<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        DoubleRegs:$src2, IntRegs:$src3),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2, $src3):scale")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$dst2, DoubleRegs:$src1,
+                                        DoubleRegs:$src2, IntRegs:$src3))],
+               "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_acc_lib<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                        DoubleRegs:$dst2),
+               !strconcat("$dst += ", !strconcat(opc ,
+                                                 "($src1, $src2):lib")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class df_MInst_dfdfdf_nac_lib<string opc, Intrinsic IntID>
+  : MInst_acc<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, DoubleRegs:$src2,
+                                        DoubleRegs:$dst2),
+               !strconcat("$dst -= ", !strconcat(opc ,
+                                                 "($src1, $src2):lib")),
+               [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1,
+                                          DoubleRegs:$src2, DoubleRegs:$dst2))],
+               "$dst2 = $dst">;
+
+class qi_SInst_sfsf<string opc, Intrinsic IntID>
+  : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, $src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, IntRegs:$src2))]>;
+
+class qi_SInst_sfu5<string opc, Intrinsic IntID>
+  : MInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
+             !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+             [(set PredRegs:$dst, (IntID IntRegs:$src1, imm:$src2))]>;
+
+class sf_ALU64_u10_pos<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")),
+             [(set IntRegs:$dst, (IntID imm:$src1))]>;
+
+class sf_ALU64_u10_neg<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs IntRegs:$dst), (ins u10Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")),
+             [(set IntRegs:$dst, (IntID imm:$src1))]>;
+
+class df_ALU64_u10_pos<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1):pos")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
+
+class df_ALU64_u10_neg<string opc, Intrinsic IntID>
+  : ALU64_ri<(outs DoubleRegs:$dst), (ins u10Imm:$src1),
+             !strconcat("$dst = ", !strconcat(opc , "#$src1):neg")),
+             [(set DoubleRegs:$dst, (IntID imm:$src1))]>;
+
+class di_MInst_diu6<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2)")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class di_MInst_diu4_rnd<string opc, Intrinsic IntID>
+  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd")),
+          [(set DoubleRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_MInst_diu4_rnd_sat<string opc, Intrinsic IntID>
+  : MInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):rnd:sat")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+class si_SInst_diu4_sat<string opc, Intrinsic IntID>
+  : SInst<(outs IntRegs:$dst), (ins DoubleRegs:$src1, u4Imm:$src2),
+          !strconcat("$dst = ", !strconcat(opc , "($src1, #$src2):sat")),
+          [(set IntRegs:$dst, (IntID DoubleRegs:$src1, imm:$src2))]>;
+
+
+def HEXAGON_C4_fastcorner9:
+    qi_SInst_qiqi       <"fastcorner9", int_hexagon_C4_fastcorner9>;
+def HEXAGON_C4_fastcorner9_not:
+    qi_SInst_qiqi <"!fastcorner9", int_hexagon_C4_fastcorner9_not>;
+def HEXAGON_M5_vrmpybuu:
+    di_MInst_didi <"vrmpybu", int_hexagon_M5_vrmpybuu>;
+def HEXAGON_M5_vrmacbuu:
+    di_MInst_dididi_acc <"vrmpybu", int_hexagon_M5_vrmacbuu>;
+def HEXAGON_M5_vrmpybsu:
+    di_MInst_didi <"vrmpybsu", int_hexagon_M5_vrmpybsu>;
+def HEXAGON_M5_vrmacbsu:
+    di_MInst_dididi_acc <"vrmpybsu", int_hexagon_M5_vrmacbsu>;
+def HEXAGON_M5_vmpybuu:
+    di_MInst_sisi <"vmpybu", int_hexagon_M5_vmpybuu>;
+def HEXAGON_M5_vmpybsu:
+    di_MInst_sisi <"vmpybsu", int_hexagon_M5_vmpybsu>;
+def HEXAGON_M5_vmacbuu:
+    di_MInst_disisi_acc <"vmpybu", int_hexagon_M5_vmacbuu>;
+def HEXAGON_M5_vmacbsu:
+    di_MInst_disisi_acc <"vmpybsu", int_hexagon_M5_vmacbsu>;
+def HEXAGON_M5_vdmpybsu:
+    di_MInst_didi_sat <"vdmpybsu", int_hexagon_M5_vdmpybsu>;
+def HEXAGON_M5_vdmacbsu:
+    di_MInst_dididi_acc_sat <"vdmpybsu", int_hexagon_M5_vdmacbsu>;
+def HEXAGON_A5_vaddhubs:
+    si_SInst_didi_sat <"vaddhub", int_hexagon_A5_vaddhubs>;
+def HEXAGON_S5_popcountp:
+    si_SInst_di <"popcount", int_hexagon_S5_popcountp>;
+def HEXAGON_S5_asrhub_rnd_sat_goodsyntax:
+    si_MInst_diu4_rnd_sat <"vasrhub", int_hexagon_S5_asrhub_rnd_sat_goodsyntax>;
+def HEXAGON_S5_asrhub_sat:
+    si_SInst_diu4_sat <"vasrhub", int_hexagon_S5_asrhub_sat>;
+def HEXAGON_S5_vasrhrnd_goodsyntax:
+    di_MInst_diu4_rnd <"vasrh", int_hexagon_S5_vasrhrnd_goodsyntax>;
+def HEXAGON_S2_asr_i_p_rnd:
+    di_SInst_diu6 <"asr", int_hexagon_S2_asr_i_p_rnd>;
+def HEXAGON_S2_asr_i_p_rnd_goodsyntax:
+    di_MInst_diu6 <"asrrnd", int_hexagon_S2_asr_i_p_rnd_goodsyntax>;
+def HEXAGON_F2_sfadd:
+    sf_MInst_sfsf <"sfadd", int_hexagon_F2_sfadd>;
+def HEXAGON_F2_sfsub:
+    sf_MInst_sfsf <"sfsub", int_hexagon_F2_sfsub>;
+def HEXAGON_F2_sfmpy:
+    sf_MInst_sfsf <"sfmpy", int_hexagon_F2_sfmpy>;
+def HEXAGON_F2_sffma:
+    sf_MInst_sfsfsf_acc <"sfmpy", int_hexagon_F2_sffma>;
+def HEXAGON_F2_sffma_sc:
+    sf_MInst_sfsfsfsi_sc <"sfmpy", int_hexagon_F2_sffma_sc>;
+def HEXAGON_F2_sffms:
+    sf_MInst_sfsfsf_nac <"sfmpy", int_hexagon_F2_sffms>;
+def HEXAGON_F2_sffma_lib:
+    sf_MInst_sfsfsf_acc_lib <"sfmpy", int_hexagon_F2_sffma_lib>;
+def HEXAGON_F2_sffms_lib:
+    sf_MInst_sfsfsf_nac_lib <"sfmpy", int_hexagon_F2_sffms_lib>;
+def HEXAGON_F2_sfcmpeq:
+    qi_SInst_sfsf <"sfcmp.eq", int_hexagon_F2_sfcmpeq>;
+def HEXAGON_F2_sfcmpgt:
+    qi_SInst_sfsf <"sfcmp.gt", int_hexagon_F2_sfcmpgt>;
+def HEXAGON_F2_sfcmpge:
+    qi_SInst_sfsf <"sfcmp.ge", int_hexagon_F2_sfcmpge>;
+def HEXAGON_F2_sfcmpuo:
+    qi_SInst_sfsf <"sfcmp.uo", int_hexagon_F2_sfcmpuo>;
+def HEXAGON_F2_sfmax:
+    sf_MInst_sfsf <"sfmax", int_hexagon_F2_sfmax>;
+def HEXAGON_F2_sfmin:
+    sf_MInst_sfsf <"sfmin", int_hexagon_F2_sfmin>;
+def HEXAGON_F2_sfclass:
+    qi_SInst_sfu5 <"sfclass", int_hexagon_F2_sfclass>;
+def HEXAGON_F2_sfimm_p:
+    sf_ALU64_u10_pos <"sfmake", int_hexagon_F2_sfimm_p>;
+def HEXAGON_F2_sfimm_n:
+    sf_ALU64_u10_neg <"sfmake", int_hexagon_F2_sfimm_n>;
+def HEXAGON_F2_sffixupn:
+    sf_MInst_sfsf <"sffixupn", int_hexagon_F2_sffixupn>;
+def HEXAGON_F2_sffixupd:
+    sf_MInst_sfsf <"sffixupd", int_hexagon_F2_sffixupd>;
+def HEXAGON_F2_sffixupr:
+    sf_SInst_sf <"sffixupr", int_hexagon_F2_sffixupr>;
+def HEXAGON_F2_dfadd:
+    df_MInst_dfdf <"dfadd", int_hexagon_F2_dfadd>;
+def HEXAGON_F2_dfsub:
+    df_MInst_dfdf <"dfsub", int_hexagon_F2_dfsub>;
+def HEXAGON_F2_dfmpy:
+    df_MInst_dfdf <"dfmpy", int_hexagon_F2_dfmpy>;
+def HEXAGON_F2_dffma:
+    df_MInst_dfdfdf_acc <"dfmpy", int_hexagon_F2_dffma>;
+def HEXAGON_F2_dffms:
+    df_MInst_dfdfdf_nac <"dfmpy", int_hexagon_F2_dffms>;
+def HEXAGON_F2_dffma_lib:
+    df_MInst_dfdfdf_acc_lib <"dfmpy", int_hexagon_F2_dffma_lib>;
+def HEXAGON_F2_dffms_lib:
+    df_MInst_dfdfdf_nac_lib <"dfmpy", int_hexagon_F2_dffms_lib>;
+def HEXAGON_F2_dffma_sc:
+    df_MInst_dfdfdfsi_sc <"dfmpy", int_hexagon_F2_dffma_sc>;
+def HEXAGON_F2_dfmax:
+    df_MInst_dfdf <"dfmax", int_hexagon_F2_dfmax>;
+def HEXAGON_F2_dfmin:
+    df_MInst_dfdf <"dfmin", int_hexagon_F2_dfmin>;
+def HEXAGON_F2_dfcmpeq:
+    qi_ALU64_dfdf <"dfcmp.eq", int_hexagon_F2_dfcmpeq>;
+def HEXAGON_F2_dfcmpgt:
+    qi_ALU64_dfdf <"dfcmp.gt", int_hexagon_F2_dfcmpgt>;
+def HEXAGON_F2_dfcmpge:
+    qi_ALU64_dfdf <"dfcmp.ge", int_hexagon_F2_dfcmpge>;
+def HEXAGON_F2_dfcmpuo:
+    qi_ALU64_dfdf <"dfcmp.uo", int_hexagon_F2_dfcmpuo>;
+def HEXAGON_F2_dfclass:
+    qi_ALU64_dfu5 <"dfclass", int_hexagon_F2_dfclass>;
+def HEXAGON_F2_dfimm_p:
+    df_ALU64_u10_pos <"dfmake", int_hexagon_F2_dfimm_p>;
+def HEXAGON_F2_dfimm_n:
+    df_ALU64_u10_neg <"dfmake", int_hexagon_F2_dfimm_n>;
+def HEXAGON_F2_dffixupn:
+    df_MInst_dfdf <"dffixupn", int_hexagon_F2_dffixupn>;
+def HEXAGON_F2_dffixupd:
+    df_MInst_dfdf <"dffixupd", int_hexagon_F2_dffixupd>;
+def HEXAGON_F2_dffixupr:
+    df_SInst_df <"dffixupr", int_hexagon_F2_dffixupr>;
+def HEXAGON_F2_conv_sf2df:
+    df_SInst_sf <"convert_sf2df", int_hexagon_F2_conv_sf2df>;
+def HEXAGON_F2_conv_df2sf:
+    sf_SInst_df <"convert_df2sf", int_hexagon_F2_conv_df2sf>;
+def HEXAGON_F2_conv_uw2sf:
+    sf_SInst_si <"convert_uw2sf", int_hexagon_F2_conv_uw2sf>;
+def HEXAGON_F2_conv_uw2df:
+    df_SInst_si <"convert_uw2df", int_hexagon_F2_conv_uw2df>;
+def HEXAGON_F2_conv_w2sf:
+    sf_SInst_si <"convert_w2sf", int_hexagon_F2_conv_w2sf>;
+def HEXAGON_F2_conv_w2df:
+    df_SInst_si <"convert_w2df", int_hexagon_F2_conv_w2df>;
+def HEXAGON_F2_conv_ud2sf:
+    sf_SInst_di <"convert_ud2sf", int_hexagon_F2_conv_ud2sf>;
+def HEXAGON_F2_conv_ud2df:
+    df_SInst_di <"convert_ud2df", int_hexagon_F2_conv_ud2df>;
+def HEXAGON_F2_conv_d2sf:
+    sf_SInst_di <"convert_d2sf", int_hexagon_F2_conv_d2sf>;
+def HEXAGON_F2_conv_d2df:
+    df_SInst_di <"convert_d2df", int_hexagon_F2_conv_d2df>;
+def HEXAGON_F2_conv_sf2uw:
+    si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw>;
+def HEXAGON_F2_conv_sf2w:
+    si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w>;
+def HEXAGON_F2_conv_sf2ud:
+    di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud>;
+def HEXAGON_F2_conv_sf2d:
+    di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d>;
+def HEXAGON_F2_conv_df2uw:
+    si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw>;
+def HEXAGON_F2_conv_df2w:
+    si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w>;
+def HEXAGON_F2_conv_df2ud:
+    di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud>;
+def HEXAGON_F2_conv_df2d:
+    di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d>;
+def HEXAGON_F2_conv_sf2uw_chop:
+    si_SInst_sf <"convert_sf2uw", int_hexagon_F2_conv_sf2uw_chop>;
+def HEXAGON_F2_conv_sf2w_chop:
+    si_SInst_sf <"convert_sf2w", int_hexagon_F2_conv_sf2w_chop>;
+def HEXAGON_F2_conv_sf2ud_chop:
+    di_SInst_sf <"convert_sf2ud", int_hexagon_F2_conv_sf2ud_chop>;
+def HEXAGON_F2_conv_sf2d_chop:
+    di_SInst_sf <"convert_sf2d", int_hexagon_F2_conv_sf2d_chop>;
+def HEXAGON_F2_conv_df2uw_chop:
+    si_SInst_df <"convert_df2uw", int_hexagon_F2_conv_df2uw_chop>;
+def HEXAGON_F2_conv_df2w_chop:
+    si_SInst_df <"convert_df2w", int_hexagon_F2_conv_df2w_chop>;
+def HEXAGON_F2_conv_df2ud_chop:
+    di_SInst_df <"convert_df2ud", int_hexagon_F2_conv_df2ud_chop>;
+def HEXAGON_F2_conv_df2d_chop:
+    di_SInst_df <"convert_df2d", int_hexagon_F2_conv_df2d_chop>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
new file mode 100644
index 000000000000..f011d51bd61a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -0,0 +1,95 @@
+//===- HexagonMCInstLower.cpp - Convert Hexagon MachineInstr to an MCInst -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Hexagon MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonAsmPrinter.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+
+static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol,
+                              HexagonAsmPrinter& Printer) {
+  MCContext &MC = Printer.OutContext;
+  const MCExpr *ME;
+
+  ME = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, MC);
+
+  if (!MO.isJTI() && MO.getOffset())
+    ME = MCBinaryExpr::CreateAdd(ME, MCConstantExpr::Create(MO.getOffset(), MC),
+                                 MC);
+
+  return (MCOperand::CreateExpr(ME));
+}
+
+// Create an MCInst from a MachineInstr
+void llvm::HexagonLowerToMC(const MachineInstr* MI, HexagonMCInst& MCI,
+                            HexagonAsmPrinter& AP) {
+  MCI.setOpcode(MI->getOpcode());
+  MCI.setDesc(MI->getDesc());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCO;
+
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCO = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_FPImmediate: {
+      APFloat Val = MO.getFPImm()->getValueAPF();
+      // FP immediates are used only when setting GPRs, so they may be dealt
+      // with like regular immediates from this point on.
+      MCO = MCOperand::CreateImm(*Val.bitcastToAPInt().getRawData());
+      break;
+    }
+    case MachineOperand::MO_Immediate:
+      MCO = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCO = MCOperand::CreateExpr
+              (MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(),
+               AP.OutContext));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCO = GetSymbolRef(MO, AP.Mang->getSymbol(MO.getGlobal()), AP);
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCO = GetSymbolRef(MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()),
+                         AP);
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP);
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP);
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP);
+      break;
+    }
+
+    MCI.addOperand(MCO);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
new file mode 100644
index 000000000000..0318c519e453
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -0,0 +1,75 @@
+//=- HexagonMachineFuctionInfo.h - Hexagon machine function info --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonMACHINEFUNCTIONINFO_H
+#define HexagonMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+  namespace Hexagon {
+    const unsigned int StartPacket = 0x1;
+    const unsigned int EndPacket = 0x2;
+  }
+
+
+/// Hexagon target-specific information for each MachineFunction.
+class HexagonMachineFunctionInfo : public MachineFunctionInfo {
+  // SRetReturnReg - Some subtargets require that sret lowering includes
+  // returning the value of the returned struct in a register. This field
+  // holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+  std::vector<MachineInstr*> AllocaAdjustInsts;
+  int VarArgsFrameIndex;
+  bool HasClobberLR;
+
+  std::map<const MachineInstr*, unsigned> PacketInfo;
+
+
+public:
+  HexagonMachineFunctionInfo() : SRetReturnReg(0), HasClobberLR(0) {}
+
+  HexagonMachineFunctionInfo(MachineFunction &MF) : SRetReturnReg(0),
+                                                    HasClobberLR(0) {}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  void addAllocaAdjustInst(MachineInstr* MI) {
+    AllocaAdjustInsts.push_back(MI);
+  }
+  const std::vector<MachineInstr*>& getAllocaAdjustInsts() {
+    return AllocaAdjustInsts;
+  }
+
+  void setVarArgsFrameIndex(int v) { VarArgsFrameIndex = v; }
+  int getVarArgsFrameIndex() { return VarArgsFrameIndex; }
+
+  void setStartPacket(MachineInstr* MI) {
+    PacketInfo[MI] |= Hexagon::StartPacket;
+  }
+  void setEndPacket(MachineInstr* MI)   {
+    PacketInfo[MI] |= Hexagon::EndPacket;
+  }
+  bool isStartPacket(const MachineInstr* MI) const {
+    return (PacketInfo.count(MI) &&
+            (PacketInfo.find(MI)->second & Hexagon::StartPacket));
+  }
+  bool isEndPacket(const MachineInstr* MI) const {
+    return (PacketInfo.count(MI) &&
+            (PacketInfo.find(MI)->second & Hexagon::EndPacket));
+  }
+  void setHasClobberLR(bool v) { HasClobberLR = v;  }
+  bool hasClobberLR() const { return HasClobberLR; }
+
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
new file mode 100644
index 000000000000..1388ad4f167d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -0,0 +1,692 @@
+//===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "misched"
+
+#include "HexagonMachineScheduler.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+/// Platform specific modifications to DAG.
+void VLIWMachineScheduler::postprocessDAG() {
+  SUnit* LastSequentialCall = NULL;
+  // Currently we only catch the situation when compare gets scheduled
+  // before preceding call.
+  for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
+    // Remember the call.
+    if (SUnits[su].getInstr()->isCall())
+      LastSequentialCall = &(SUnits[su]);
+    // Look for a compare that defines a predicate.
+    else if (SUnits[su].getInstr()->isCompare() && LastSequentialCall)
+      SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
+  }
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+/// It is _not_ precise (statefull), it is more like
+/// another heuristic. Many corner cases are figured
+/// empirically.
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
+  if (!SU || !SU->getInstr())
+    return false;
+
+  // First see if the pipeline could receive this instruction
+  // in the current cycle.
+  switch (SU->getInstr()->getOpcode()) {
+  default:
+    if (!ResourcesModel->canReserveResources(SU->getInstr()))
+      return false;
+  case TargetOpcode::EXTRACT_SUBREG:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+  case TargetOpcode::REG_SEQUENCE:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::COPY:
+  case TargetOpcode::INLINEASM:
+    break;
+  }
+
+  // Now see if there are no other dependencies to instructions already
+  // in the packet.
+  for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+    if (Packet[i]->Succs.size() == 0)
+      continue;
+    for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
+         E = Packet[i]->Succs.end(); I != E; ++I) {
+      // Since we do not add pseudos to packets, might as well
+      // ignore order dependencies.
+      if (I->isCtrl())
+        continue;
+
+      if (I->getSUnit() == SU)
+        return false;
+    }
+  }
+  return true;
+}
+
+/// Keep track of available resources.
+bool VLIWResourceModel::reserveResources(SUnit *SU) {
+  bool startNewCycle = false;
+  // Artificially reset state.
+  if (!SU) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+    TotalPackets++;
+    return false;
+  }
+  // If this SU does not fit in the packet
+  // start a new one.
+  if (!isResourceAvailable(SU)) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+    TotalPackets++;
+    startNewCycle = true;
+  }
+
+  switch (SU->getInstr()->getOpcode()) {
+  default:
+    ResourcesModel->reserveResources(SU->getInstr());
+    break;
+  case TargetOpcode::EXTRACT_SUBREG:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+  case TargetOpcode::REG_SEQUENCE:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::PROLOG_LABEL:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::COPY:
+  case TargetOpcode::INLINEASM:
+    break;
+  }
+  Packet.push_back(SU);
+
+#ifndef NDEBUG
+  DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+  for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+    DEBUG(dbgs() << "\t[" << i << "] SU(");
+    DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+    DEBUG(Packet[i]->getInstr()->dump());
+  }
+#endif
+
+  // If packet is now full, reset the state so in the next cycle
+  // we start fresh.
+  if (Packet.size() >= SchedModel->getIssueWidth()) {
+    ResourcesModel->clearResources();
+    Packet.clear();
+    TotalPackets++;
+    startNewCycle = true;
+  }
+
+  return startNewCycle;
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+void VLIWMachineScheduler::schedule() {
+  DEBUG(dbgs()
+        << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber()
+        << " " << BB->getName()
+        << " in_func " << BB->getParent()->getFunction()->getName()
+        << " at loop depth "  << MLI.getLoopDepth(BB)
+        << " \n");
+
+  buildDAGWithRegPressure();
+
+  // Postprocess the DAG to add platform specific artificial dependencies.
+  postprocessDAG();
+
+  SmallVector<SUnit*, 8> TopRoots, BotRoots;
+  findRootsAndBiasEdges(TopRoots, BotRoots);
+
+  // Initialize the strategy before modifying the DAG.
+  SchedImpl->initialize(this);
+
+  // To view Height/Depth correctly, they should be accessed at least once.
+  //
+  // FIXME: SUnit::dumpAll always recompute depth and height now. The max
+  // depth/height could be computed directly from the roots and leaves.
+  DEBUG(unsigned maxH = 0;
+        for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          if (SUnits[su].getHeight() > maxH)
+            maxH = SUnits[su].getHeight();
+        dbgs() << "Max Height " << maxH << "\n";);
+  DEBUG(unsigned maxD = 0;
+        for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          if (SUnits[su].getDepth() > maxD)
+            maxD = SUnits[su].getDepth();
+        dbgs() << "Max Depth " << maxD << "\n";);
+  DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+          SUnits[su].dumpAll(this));
+
+  initQueues(TopRoots, BotRoots);
+
+  bool IsTopNode = false;
+  while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+    if (!checkSchedLimit())
+      break;
+
+    scheduleMI(SU, IsTopNode);
+
+    updateQueues(SU, IsTopNode);
+  }
+  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+  placeDebugValues();
+}
+
+void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
+  DAG = static_cast<VLIWMachineScheduler*>(dag);
+  SchedModel = DAG->getSchedModel();
+  TRI = DAG->TRI;
+
+  Top.init(DAG, SchedModel);
+  Bot.init(DAG, SchedModel);
+
+  // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+  // are disabled, then these HazardRecs will be disabled.
+  const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries();
+  const TargetMachine &TM = DAG->MF.getTarget();
+  delete Top.HazardRec;
+  delete Bot.HazardRec;
+  Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+  Top.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
+  Bot.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
+
+  assert((!llvm::ForceTopDown || !llvm::ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+}
+
+void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+    unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+    Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
+#endif
+    if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+      SU->TopReadyCycle = PredReadyCycle + MinLatency;
+  }
+  Top.releaseNode(SU, SU->TopReadyCycle);
+}
+
+void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
+  if (SU->isScheduled)
+    return;
+
+  assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+  for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+    unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+    Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
+#endif
+    if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+      SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+  }
+  Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingVLIWScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+  if (HazardRec->isEnabled())
+    return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+  unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+  if (IssueCount + uops > SchedModel->getIssueWidth())
+    return true;
+
+  return false;
+}
+
+void ConvergingVLIWScheduler::SchedBoundary::releaseNode(SUnit *SU,
+                                                     unsigned ReadyCycle) {
+  if (ReadyCycle < MinReadyCycle)
+    MinReadyCycle = ReadyCycle;
+
+  // Check for interlocks first. For the purpose of other heuristics, an
+  // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+
+    Pending.push(SU);
+  else
+    Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingVLIWScheduler::SchedBoundary::bumpCycle() {
+  unsigned Width = SchedModel->getIssueWidth();
+  IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+  assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+  unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+  if (!HazardRec->isEnabled()) {
+    // Bypass HazardRec virtual calls.
+    CurrCycle = NextCycle;
+  } else {
+    // Bypass getHazardType calls in case of long latency.
+    for (; CurrCycle != NextCycle; ++CurrCycle) {
+      if (isTop())
+        HazardRec->AdvanceCycle();
+      else
+        HazardRec->RecedeCycle();
+    }
+  }
+  CheckPending = true;
+
+  DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+        << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingVLIWScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+  bool startNewCycle = false;
+
+  // Update the reservation table.
+  if (HazardRec->isEnabled()) {
+    if (!isTop() && SU->isCall) {
+      // Calls are scheduled with their preceding instructions. For bottom-up
+      // scheduling, clear the pipeline state before emitting.
+      HazardRec->Reset();
+    }
+    HazardRec->EmitInstruction(SU);
+  }
+
+  // Update DFA model.
+  startNewCycle = ResourceModel->reserveResources(SU);
+
+  // Check the instruction group dispatch limit.
+  // TODO: Check if this SU must end a dispatch group.
+  IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
+  if (startNewCycle) {
+    DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+    bumpCycle();
+  }
+  else
+    DEBUG(dbgs() << "*** IssueCount " << IssueCount
+          << " at cycle " << CurrCycle << '\n');
+}
+
+/// Release pending ready nodes in to the available queue. This makes them
+/// visible to heuristics.
+void ConvergingVLIWScheduler::SchedBoundary::releasePending() {
+  // If the available queue is empty, it is safe to reset MinReadyCycle.
+  if (Available.empty())
+    MinReadyCycle = UINT_MAX;
+
+  // Check to see if any of the pending instructions are ready to issue.  If
+  // so, add them to the available queue.
+  for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+    SUnit *SU = *(Pending.begin()+i);
+    unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+    if (ReadyCycle < MinReadyCycle)
+      MinReadyCycle = ReadyCycle;
+
+    if (ReadyCycle > CurrCycle)
+      continue;
+
+    if (checkHazard(SU))
+      continue;
+
+    Available.push(SU);
+    Pending.remove(Pending.begin()+i);
+    --i; --e;
+  }
+  CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingVLIWScheduler::SchedBoundary::removeReady(SUnit *SU) {
+  if (Available.isInQueue(SU))
+    Available.remove(Available.find(SU));
+  else {
+    assert(Pending.isInQueue(SU) && "bad ready count");
+    Pending.remove(Pending.find(SU));
+  }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return NULL.
+SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() {
+  if (CheckPending)
+    releasePending();
+
+  for (unsigned i = 0; Available.empty(); ++i) {
+    assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+           "permanent hazard"); (void)i;
+    ResourceModel->reserveResources(0);
+    bumpCycle();
+    releasePending();
+  }
+  if (Available.size() == 1)
+    return *Available.begin();
+  return NULL;
+}
+
+#ifndef NDEBUG
+void ConvergingVLIWScheduler::traceCandidate(const char *Label,
+                                             const ReadyQueue &Q,
+                                             SUnit *SU, PressureElement P) {
+  dbgs() << Label << " " << Q.getName() << " ";
+  if (P.isValid())
+    dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+           << " ";
+  else
+    dbgs() << "     ";
+  SU->dump(DAG);
+}
+#endif
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledPred(SUnit *SU) {
+  SUnit *OnlyAvailablePred = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+       I != E; ++I) {
+    SUnit &Pred = *I->getSUnit();
+    if (!Pred.isScheduled) {
+      // We found an available, but not scheduled, predecessor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+        return 0;
+      OnlyAvailablePred = &Pred;
+    }
+  }
+  return OnlyAvailablePred;
+}
+
+/// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
+  SUnit *OnlyAvailableSucc = 0;
+  for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+       I != E; ++I) {
+    SUnit &Succ = *I->getSUnit();
+    if (!Succ.isScheduled) {
+      // We found an available, but not scheduled, successor.  If it's the
+      // only one we have found, keep track of it... otherwise give up.
+      if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
+        return 0;
+      OnlyAvailableSucc = &Succ;
+    }
+  }
+  return OnlyAvailableSucc;
+}
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+static const unsigned PriorityOne = 200;
+static const unsigned PriorityTwo = 100;
+static const unsigned PriorityThree = 50;
+static const unsigned PriorityFour = 20;
+static const unsigned ScaleTwo = 10;
+static const unsigned FactorOne = 2;
+
+/// Single point to compute overall scheduling cost.
+/// TODO: More heuristics will be used soon.
+int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
+                                            SchedCandidate &Candidate,
+                                            RegPressureDelta &Delta,
+                                            bool verbose) {
+  // Initial trivial priority.
+  int ResCount = 1;
+
+  // Do not waste time on a node that is already scheduled.
+  if (!SU || SU->isScheduled)
+    return ResCount;
+
+  // Forced priority is high.
+  if (SU->isScheduleHigh)
+    ResCount += PriorityOne;
+
+  // Critical path first.
+  if (Q.getID() == TopQID) {
+    ResCount += (SU->getHeight() * ScaleTwo);
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Top.ResourceModel->isResourceAvailable(SU))
+      ResCount <<= FactorOne;
+  } else {
+    ResCount += (SU->getDepth() * ScaleTwo);
+
+    // If resources are available for it, multiply the
+    // chance of scheduling.
+    if (Bot.ResourceModel->isResourceAvailable(SU))
+      ResCount <<= FactorOne;
+  }
+
+  unsigned NumNodesBlocking = 0;
+  if (Q.getID() == TopQID) {
+    // How many SUs does it block from scheduling?
+    // Look at all of the successors of this node.
+    // Count the number of nodes that
+    // this node is the sole unscheduled node for.
+    for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+         I != E; ++I)
+      if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+        ++NumNodesBlocking;
+  } else {
+    // How many unscheduled predecessors block this node?
+    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+         I != E; ++I)
+      if (getSingleUnscheduledSucc(I->getSUnit()) == SU)
+        ++NumNodesBlocking;
+  }
+  ResCount += (NumNodesBlocking * ScaleTwo);
+
+  // Factor in reg pressure as a heuristic.
+  ResCount -= (Delta.Excess.UnitIncrease*PriorityThree);
+  ResCount -= (Delta.CriticalMax.UnitIncrease*PriorityThree);
+
+  DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")");
+
+  return ResCount;
+}
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+                  SchedCandidate &Candidate) {
+  DEBUG(Q.dump());
+
+  // getMaxPressureDelta temporarily modifies the tracker.
+  RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  // BestSU remains NULL if no top candidates beat the best existing candidate.
+  CandResult FoundCandidate = NoCand;
+  for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+    RegPressureDelta RPDelta;
+    TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+                                    DAG->getRegionCriticalPSets(),
+                                    DAG->getRegPressure().MaxSetPressure);
+
+    int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false);
+
+    // Initialize the candidate if needed.
+    if (!Candidate.SU) {
+      Candidate.SU = *I;
+      Candidate.RPDelta = RPDelta;
+      Candidate.SCost = CurrentCost;
+      FoundCandidate = NodeOrder;
+      continue;
+    }
+
+    // Best cost.
+    if (CurrentCost > Candidate.SCost) {
+      DEBUG(traceCandidate("CCAND", Q, *I));
+      Candidate.SU = *I;
+      Candidate.RPDelta = RPDelta;
+      Candidate.SCost = CurrentCost;
+      FoundCandidate = BestCost;
+      continue;
+    }
+
+    // Fall through to original instruction order.
+    // Only consider node order if Candidate was chosen from this Q.
+    if (FoundCandidate == NoCand)
+      continue;
+  }
+  return FoundCandidate;
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
+  // Schedule as far as possible in the direction of no choice. This is most
+  // efficient, but also provides the best heuristics for CriticalPSets.
+  if (SUnit *SU = Bot.pickOnlyChoice()) {
+    IsTopNode = false;
+    return SU;
+  }
+  if (SUnit *SU = Top.pickOnlyChoice()) {
+    IsTopNode = true;
+    return SU;
+  }
+  SchedCandidate BotCand;
+  // Prefer bottom scheduling when heuristics are silent.
+  CandResult BotResult = pickNodeFromQueue(Bot.Available,
+                                           DAG->getBotRPTracker(), BotCand);
+  assert(BotResult != NoCand && "failed to find the first candidate");
+
+  // If either Q has a single candidate that provides the least increase in
+  // Excess pressure, we can immediately schedule from that Q.
+  //
+  // RegionCriticalPSets summarizes the pressure within the scheduled region and
+  // affects picking from either Q. If scheduling in one direction must
+  // increase pressure for one of the excess PSets, then schedule in that
+  // direction first to provide more freedom in the other direction.
+  if (BotResult == SingleExcess || BotResult == SingleCritical) {
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  // Check if the top Q has a better candidate.
+  SchedCandidate TopCand;
+  CandResult TopResult = pickNodeFromQueue(Top.Available,
+                                           DAG->getTopRPTracker(), TopCand);
+  assert(TopResult != NoCand && "failed to find the first candidate");
+
+  if (TopResult == SingleExcess || TopResult == SingleCritical) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // If either Q has a single candidate that minimizes pressure above the
+  // original region's pressure pick it.
+  if (BotResult == SingleMax) {
+    IsTopNode = false;
+    return BotCand.SU;
+  }
+  if (TopResult == SingleMax) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  if (TopCand.SCost > BotCand.SCost) {
+    IsTopNode = true;
+    return TopCand.SU;
+  }
+  // Otherwise prefer the bottom candidate in node order.
+  IsTopNode = false;
+  return BotCand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
+  if (DAG->top() == DAG->bottom()) {
+    assert(Top.Available.empty() && Top.Pending.empty() &&
+           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+    return NULL;
+  }
+  SUnit *SU;
+  if (llvm::ForceTopDown) {
+    SU = Top.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate TopCand;
+      CandResult TopResult =
+        pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+      assert(TopResult != NoCand && "failed to find the first candidate");
+      (void)TopResult;
+      SU = TopCand.SU;
+    }
+    IsTopNode = true;
+  } else if (llvm::ForceBottomUp) {
+    SU = Bot.pickOnlyChoice();
+    if (!SU) {
+      SchedCandidate BotCand;
+      CandResult BotResult =
+        pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+      assert(BotResult != NoCand && "failed to find the first candidate");
+      (void)BotResult;
+      SU = BotCand.SU;
+    }
+    IsTopNode = false;
+  } else {
+    SU = pickNodeBidrectional(IsTopNode);
+  }
+  if (SU->isTopReady())
+    Top.removeReady(SU);
+  if (SU->isBottomReady())
+    Bot.removeReady(SU);
+
+  DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+        << " Scheduling Instruction in cycle "
+        << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+        SU->dump(DAG));
+  return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, VLIWMachineScheduler needs
+/// to update it's state based on the current cycle before MachineSchedStrategy
+/// does.
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+  if (IsTopNode) {
+    SU->TopReadyCycle = Top.CurrCycle;
+    Top.bumpNode(SU);
+  } else {
+    SU->BotReadyCycle = Bot.CurrCycle;
+    Bot.bumpNode(SU);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
new file mode 100644
index 000000000000..f68dadf29210
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -0,0 +1,244 @@
+//===-- HexagonMachineScheduler.h - Custom Hexagon MI scheduler.      ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom Hexagon MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONASMPRINTER_H
+#define HEXAGONASMPRINTER_H
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+//===----------------------------------------------------------------------===//
+// ConvergingVLIWScheduler - Implementation of the standard
+// MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+class VLIWResourceModel {
+  /// ResourcesModel - Represents VLIW state.
+  /// Not limited to VLIW targets per say, but assumes
+  /// definition of DFA by a target.
+  DFAPacketizer *ResourcesModel;
+
+  const TargetSchedModel *SchedModel;
+
+  /// Local packet/bundle model. Purely
+  /// internal to the MI schedulre at the time.
+  std::vector<SUnit*> Packet;
+
+  /// Total packets created.
+  unsigned TotalPackets;
+
+public:
+VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
+    SchedModel(SM), TotalPackets(0) {
+    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM,NULL);
+
+    // This hard requirement could be relaxed,
+    // but for now do not let it proceed.
+    assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+    Packet.resize(SchedModel->getIssueWidth());
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  ~VLIWResourceModel() {
+    delete ResourcesModel;
+  }
+
+  void resetPacketState() {
+    Packet.clear();
+  }
+
+  void resetDFA() {
+    ResourcesModel->clearResources();
+  }
+
+  void reset() {
+    Packet.clear();
+    ResourcesModel->clearResources();
+  }
+
+  bool isResourceAvailable(SUnit *SU);
+  bool reserveResources(SUnit *SU);
+  unsigned getTotalPackets() const { return TotalPackets; }
+};
+
+/// Extend the standard ScheduleDAGMI to provide more context and override the
+/// top-level schedule() driver.
+class VLIWMachineScheduler : public ScheduleDAGMI {
+public:
+  VLIWMachineScheduler(MachineSchedContext *C, MachineSchedStrategy *S):
+    ScheduleDAGMI(C, S) {}
+
+  /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
+  /// time to do some work.
+  virtual void schedule();
+  /// Perform platform specific DAG postprocessing.
+  void postprocessDAG();
+};
+
+/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
+/// to balance the schedule.
+class ConvergingVLIWScheduler : public MachineSchedStrategy {
+
+  /// Store the state used by ConvergingVLIWScheduler heuristics, required
+  ///  for the lifetime of one invocation of pickNode().
+  struct SchedCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    // Register pressure values for the best candidate.
+    RegPressureDelta RPDelta;
+
+    // Best scheduling cost.
+    int SCost;
+
+    SchedCandidate(): SU(NULL), SCost(0) {}
+  };
+  /// Represent the type of SchedCandidate found within a single queue.
+  enum CandResult {
+    NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
+    BestCost};
+
+  /// Each Scheduling boundary is associated with ready queues. It tracks the
+  /// current cycle in whichever direction at has moved, and maintains the state
+  /// of "hazards" and other interlocks at the current cycle.
+  struct SchedBoundary {
+    VLIWMachineScheduler *DAG;
+    const TargetSchedModel *SchedModel;
+
+    ReadyQueue Available;
+    ReadyQueue Pending;
+    bool CheckPending;
+
+    ScheduleHazardRecognizer *HazardRec;
+    VLIWResourceModel *ResourceModel;
+
+    unsigned CurrCycle;
+    unsigned IssueCount;
+
+    /// MinReadyCycle - Cycle of the soonest available instruction.
+    unsigned MinReadyCycle;
+
+    // Remember the greatest min operand latency.
+    unsigned MaxMinLatency;
+
+    /// Pending queues extend the ready queues with the same ID and the
+    /// PendingFlag set.
+    SchedBoundary(unsigned ID, const Twine &Name):
+      DAG(0), SchedModel(0), Available(ID, Name+".A"),
+      Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
+      CheckPending(false), HazardRec(0), ResourceModel(0),
+      CurrCycle(0), IssueCount(0),
+      MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+    ~SchedBoundary() {
+      delete ResourceModel;
+      delete HazardRec;
+    }
+
+    void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) {
+      DAG = dag;
+      SchedModel = smodel;
+    }
+
+    bool isTop() const {
+      return Available.getID() == ConvergingVLIWScheduler::TopQID;
+    }
+
+    bool checkHazard(SUnit *SU);
+
+    void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+    void bumpCycle();
+
+    void bumpNode(SUnit *SU);
+
+    void releasePending();
+
+    void removeReady(SUnit *SU);
+
+    SUnit *pickOnlyChoice();
+  };
+
+  VLIWMachineScheduler *DAG;
+  const TargetSchedModel *SchedModel;
+  const TargetRegisterInfo *TRI;
+
+  // State of the top and bottom scheduled instruction boundaries.
+  SchedBoundary Top;
+  SchedBoundary Bot;
+
+public:
+  /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+  enum {
+    TopQID = 1,
+    BotQID = 2,
+    LogMaxQID = 2
+  };
+
+  ConvergingVLIWScheduler():
+    DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+  virtual void initialize(ScheduleDAGMI *dag);
+
+  virtual SUnit *pickNode(bool &IsTopNode);
+
+  virtual void schedNode(SUnit *SU, bool IsTopNode);
+
+  virtual void releaseTopNode(SUnit *SU);
+
+  virtual void releaseBottomNode(SUnit *SU);
+
+  unsigned ReportPackets() {
+    return Top.ResourceModel->getTotalPackets() +
+           Bot.ResourceModel->getTotalPackets();
+  }
+
+protected:
+  SUnit *pickNodeBidrectional(bool &IsTopNode);
+
+  int SchedulingCost(ReadyQueue &Q,
+                     SUnit *SU, SchedCandidate &Candidate,
+                     RegPressureDelta &Delta, bool verbose);
+
+  CandResult pickNodeFromQueue(ReadyQueue &Q,
+                               const RegPressureTracker &RPTracker,
+                               SchedCandidate &Candidate);
+#ifndef NDEBUG
+  void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+                      PressureElement P = PressureElement());
+#endif
+};
+
+} // namespace
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
new file mode 100644
index 000000000000..5e80e48b01d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -0,0 +1,645 @@
+//===----- HexagonNewValueJump.cpp - Hexagon Backend New Value Jump -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements NewValueJump pass in Hexagon.
+// Ideally, we should merge this as a Peephole pass prior to register
+// allocation, but because we have a spill in between the feeder and new value
+// jump instructions, we are forced to write after register allocation.
+// Having said that, we should re-attempt to pull this earlier at some point
+// in future.
+
+// The basic approach looks for sequence of predicated jump, compare instruciton
+// that genereates the predicate and, the feeder to the predicate. Once it finds
+// all, it collapses compare and jump instruction into a new valu jump
+// intstructions.
+//
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon-nvj"
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <map>
+using namespace llvm;
+
+STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
+
+static cl::opt<int>
+DbgNVJCount("nvj-count", cl::init(-1), cl::Hidden, cl::desc(
+  "Maximum number of predicated jumps to be converted to New Value Jump"));
+
+static cl::opt<bool> DisableNewValueJumps("disable-nvjump", cl::Hidden,
+    cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable New Value Jumps"));
+
+namespace {
+  struct HexagonNewValueJump : public MachineFunctionPass {
+    const HexagonInstrInfo    *QII;
+    const HexagonRegisterInfo *QRI;
+
+  public:
+    static char ID;
+
+    HexagonNewValueJump() : MachineFunctionPass(ID) { }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    const char *getPassName() const {
+      return "Hexagon NewValueJump";
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+  private:
+
+  };
+
+} // end of anonymous namespace
+
+char HexagonNewValueJump::ID = 0;
+
+// We have identified this II could be feeder to NVJ,
+// verify that it can be.
+static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
+                                      const TargetRegisterInfo *TRI,
+                                      MachineBasicBlock::iterator II,
+                                      MachineBasicBlock::iterator end,
+                                      MachineBasicBlock::iterator skip,
+                                      MachineFunction &MF) {
+
+  // Predicated instruction can not be feeder to NVJ.
+  if (QII->isPredicated(II))
+    return false;
+
+  // Bail out if feederReg is a paired register (double regs in
+  // our case). One would think that we can check to see if a given
+  // register cmpReg1 or cmpReg2 is a sub register of feederReg
+  // using -- if (QRI->isSubRegister(feederReg, cmpReg1) logic
+  // before the callsite of this function
+  // But we can not as it comes in the following fashion.
+  //    %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+  //    %R0<def> = KILL %R0, %D0<imp-use,kill>
+  //    %P0<def> = CMPEQri %R0<kill>, 0
+  // Hence, we need to check if it's a KILL instruction.
+  if (II->getOpcode() == TargetOpcode::KILL)
+    return false;
+
+
+  // Make sure there there is no 'def' or 'use' of any of the uses of
+  // feeder insn between it's definition, this MI and jump, jmpInst
+  // skipping compare, cmpInst.
+  // Here's the example.
+  //    r21=memub(r22+r24<<#0)
+  //    p0 = cmp.eq(r21, #0)
+  //    r4=memub(r3+r21<<#0)
+  //    if (p0.new) jump:t .LBB29_45
+  // Without this check, it will be converted into
+  //    r4=memub(r3+r21<<#0)
+  //    r21=memub(r22+r24<<#0)
+  //    p0 = cmp.eq(r21, #0)
+  //    if (p0.new) jump:t .LBB29_45
+  // and result WAR hazards if converted to New Value Jump.
+
+  for (unsigned i = 0; i < II->getNumOperands(); ++i) {
+    if (II->getOperand(i).isReg() &&
+        (II->getOperand(i).isUse() || II->getOperand(i).isDef())) {
+      MachineBasicBlock::iterator localII = II;
+      ++localII;
+      unsigned Reg = II->getOperand(i).getReg();
+      for (MachineBasicBlock::iterator localBegin = localII;
+                        localBegin != end; ++localBegin) {
+        if (localBegin == skip ) continue;
+        // Check for Subregisters too.
+        if (localBegin->modifiesRegister(Reg, TRI) ||
+            localBegin->readsRegister(Reg, TRI))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+// These are the common checks that need to performed
+// to determine if
+// 1. compare instruction can be moved before jump.
+// 2. feeder to the compare instruction can be moved before jump.
+static bool commonChecksToProhibitNewValueJump(bool afterRA,
+                          MachineBasicBlock::iterator MII) {
+
+  // If store in path, bail out.
+  if (MII->getDesc().mayStore())
+    return false;
+
+  // if call in path, bail out.
+  if (MII->getOpcode() == Hexagon::CALLv3)
+    return false;
+
+  // if NVJ is running prior to RA, do the following checks.
+  if (!afterRA) {
+    // The following Target Opcode instructions are spurious
+    // to new value jump. If they are in the path, bail out.
+    // KILL sets kill flag on the opcode. It also sets up a
+    // single register, out of pair.
+    //    %D0<def> = Hexagon_S2_lsr_r_p %D0<kill>, %R2<kill>
+    //    %R0<def> = KILL %R0, %D0<imp-use,kill>
+    //    %P0<def> = CMPEQri %R0<kill>, 0
+    // PHI can be anything after RA.
+    // COPY can remateriaze things in between feeder, compare and nvj.
+    if (MII->getOpcode() == TargetOpcode::KILL ||
+        MII->getOpcode() == TargetOpcode::PHI  ||
+        MII->getOpcode() == TargetOpcode::COPY)
+      return false;
+
+    // The following pseudo Hexagon instructions sets "use" and "def"
+    // of registers by individual passes in the backend. At this time,
+    // we don't know the scope of usage and definitions of these
+    // instructions.
+    if (MII->getOpcode() == Hexagon::TFR_condset_rr ||
+        MII->getOpcode() == Hexagon::TFR_condset_ii ||
+        MII->getOpcode() == Hexagon::TFR_condset_ri ||
+        MII->getOpcode() == Hexagon::TFR_condset_ir ||
+        MII->getOpcode() == Hexagon::LDriw_pred     ||
+        MII->getOpcode() == Hexagon::STriw_pred)
+      return false;
+  }
+
+  return true;
+}
+
+static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
+                                     const TargetRegisterInfo *TRI,
+                                     MachineBasicBlock::iterator II,
+                                     unsigned pReg,
+                                     bool secondReg,
+                                     bool optLocation,
+                                     MachineBasicBlock::iterator end,
+                                     MachineFunction &MF) {
+
+  MachineInstr *MI = II;
+
+  // If the second operand of the compare is an imm, make sure it's in the
+  // range specified by the arch.
+  if (!secondReg) {
+    int64_t v = MI->getOperand(2).getImm();
+    if (MI->getOpcode() == Hexagon::CMPGEri ||
+       (MI->getOpcode() == Hexagon::CMPGEUri && v > 0))
+      --v;
+
+    if (!(isUInt<5>(v) ||
+         ((MI->getOpcode() == Hexagon::CMPEQri ||
+           MI->getOpcode() == Hexagon::CMPGTri ||
+           MI->getOpcode() == Hexagon::CMPGEri) &&
+          (v == -1))))
+      return false;
+  }
+
+  unsigned cmpReg1, cmpOp2;
+  cmpReg1 = MI->getOperand(1).getReg();
+
+  if (secondReg) {
+    cmpOp2 = MI->getOperand(2).getReg();
+
+    // Make sure that that second register is not from COPY
+    // At machine code level, we don't need this, but if we decide
+    // to move new value jump prior to RA, we would be needing this.
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    if (secondReg && !TargetRegisterInfo::isPhysicalRegister(cmpOp2)) {
+      MachineInstr *def = MRI.getVRegDef(cmpOp2);
+      if (def->getOpcode() == TargetOpcode::COPY)
+        return false;
+    }
+  }
+
+  // Walk the instructions after the compare (predicate def) to the jump,
+  // and satisfy the following conditions.
+  ++II ;
+  for (MachineBasicBlock::iterator localII = II; localII != end;
+       ++localII) {
+
+    // Check 1.
+    // If "common" checks fail, bail out.
+    if (!commonChecksToProhibitNewValueJump(optLocation, localII))
+      return false;
+
+    // Check 2.
+    // If there is a def or use of predicate (result of compare), bail out.
+    if (localII->modifiesRegister(pReg, TRI) ||
+        localII->readsRegister(pReg, TRI))
+      return false;
+
+    // Check 3.
+    // If there is a def of any of the use of the compare (operands of compare),
+    // bail out.
+    // Eg.
+    //    p0 = cmp.eq(r2, r0)
+    //    r2 = r4
+    //    if (p0.new) jump:t .LBB28_3
+    if (localII->modifiesRegister(cmpReg1, TRI) ||
+        (secondReg && localII->modifiesRegister(cmpOp2, TRI)))
+      return false;
+  }
+  return true;
+}
+
+// Given a compare operator, return a matching New Value Jump
+// compare operator. Make sure that MI here is included in
+// HexagonInstrInfo.cpp::isNewValueJumpCandidate
+static unsigned getNewValueJumpOpcode(const MachineInstr *MI, int reg,
+                                      bool secondRegNewified) {
+  switch (MI->getOpcode()) {
+    case Hexagon::CMPEQrr:
+      return Hexagon::JMP_EQrrPt_nv_V4;
+
+    case Hexagon::CMPEQri: {
+      if (reg >= 0)
+        return Hexagon::JMP_EQriPt_nv_V4;
+      else
+        return Hexagon::JMP_EQriPtneg_nv_V4;
+    }
+
+    case Hexagon::CMPLTrr:
+    case Hexagon::CMPGTrr: {
+      if (secondRegNewified)
+        return Hexagon::JMP_GTrrdnPt_nv_V4;
+      else
+        return Hexagon::JMP_GTrrPt_nv_V4;
+    }
+
+    case Hexagon::CMPGEri: {
+      if (reg >= 1)
+        return Hexagon::JMP_GTriPt_nv_V4;
+      else
+        return Hexagon::JMP_GTriPtneg_nv_V4;
+    }
+
+    case Hexagon::CMPGTri: {
+      if (reg >= 0)
+        return Hexagon::JMP_GTriPt_nv_V4;
+      else
+        return Hexagon::JMP_GTriPtneg_nv_V4;
+    }
+
+    case Hexagon::CMPLTUrr:
+    case Hexagon::CMPGTUrr: {
+      if (secondRegNewified)
+        return Hexagon::JMP_GTUrrdnPt_nv_V4;
+      else
+        return Hexagon::JMP_GTUrrPt_nv_V4;
+    }
+
+    case Hexagon::CMPGTUri:
+      return Hexagon::JMP_GTUriPt_nv_V4;
+
+    case Hexagon::CMPGEUri: {
+      if (reg == 0)
+        return Hexagon::JMP_EQrrPt_nv_V4;
+      else
+        return Hexagon::JMP_GTUriPt_nv_V4;
+    }
+
+    default:
+       llvm_unreachable("Could not find matching New Value Jump instruction.");
+  }
+  // return *some value* to avoid compiler warning
+  return 0;
+}
+
+bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
+
+  DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
+               << "********** Function: "
+               << MF.getName() << "\n");
+
+#if 0
+  // for now disable this, if we move NewValueJump before register
+  // allocation we need this information.
+  LiveVariables &LVs = getAnalysis<LiveVariables>();
+#endif
+
+  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
+  QRI =
+    static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
+
+  if (!QRI->Subtarget.hasV4TOps() ||
+      DisableNewValueJumps) {
+    return false;
+  }
+
+  int nvjCount = DbgNVJCount;
+  int nvjGenerated = 0;
+
+  // Loop through all the bb's of the function
+  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+        MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+
+    DEBUG(dbgs() << "** dumping bb ** "
+                 << MBB->getNumber() << "\n");
+    DEBUG(MBB->dump());
+    DEBUG(dbgs() << "\n" << "********** dumping instr bottom up **********\n");
+    bool foundJump    = false;
+    bool foundCompare = false;
+    bool invertPredicate = false;
+    unsigned predReg = 0; // predicate reg of the jump.
+    unsigned cmpReg1 = 0;
+    int cmpOp2 = 0;
+    bool MO1IsKill = false;
+    bool MO2IsKill = false;
+    MachineBasicBlock::iterator jmpPos;
+    MachineBasicBlock::iterator cmpPos;
+    MachineInstr *cmpInstr = NULL, *jmpInstr = NULL;
+    MachineBasicBlock *jmpTarget = NULL;
+    bool afterRA = false;
+    bool isSecondOpReg = false;
+    bool isSecondOpNewified = false;
+    // Traverse the basic block - bottom up
+    for (MachineBasicBlock::iterator MII = MBB->end(), E = MBB->begin();
+             MII != E;) {
+      MachineInstr *MI = --MII;
+      if (MI->isDebugValue()) {
+        continue;
+      }
+
+      if ((nvjCount == 0) || (nvjCount > -1 && nvjCount <= nvjGenerated))
+        break;
+
+      DEBUG(dbgs() << "Instr: "; MI->dump(); dbgs() << "\n");
+
+      if (!foundJump &&
+         (MI->getOpcode() == Hexagon::JMP_c ||
+          MI->getOpcode() == Hexagon::JMP_cNot ||
+          MI->getOpcode() == Hexagon::JMP_cdnPt ||
+          MI->getOpcode() == Hexagon::JMP_cdnPnt ||
+          MI->getOpcode() == Hexagon::JMP_cdnNotPt ||
+          MI->getOpcode() == Hexagon::JMP_cdnNotPnt)) {
+        // This is where you would insert your compare and
+        // instr that feeds compare
+        jmpPos = MII;
+        jmpInstr = MI;
+        predReg = MI->getOperand(0).getReg();
+        afterRA = TargetRegisterInfo::isPhysicalRegister(predReg);
+
+        // If ifconverter had not messed up with the kill flags of the
+        // operands, the following check on the kill flag would suffice.
+        // if(!jmpInstr->getOperand(0).isKill()) break;
+
+        // This predicate register is live out out of BB
+        // this would only work if we can actually use Live
+        // variable analysis on phy regs - but LLVM does not
+        // provide LV analysis on phys regs.
+        //if(LVs.isLiveOut(predReg, *MBB)) break;
+
+        // Get all the successors of this block - which will always
+        // be 2. Check if the predicate register is live in in those
+        // successor. If yes, we can not delete the predicate -
+        // I am doing this only because LLVM does not provide LiveOut
+        // at the BB level.
+        bool predLive = false;
+        for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+                            SIE = MBB->succ_end(); SI != SIE; ++SI) {
+          MachineBasicBlock* succMBB = *SI;
+         if (succMBB->isLiveIn(predReg)) {
+            predLive = true;
+          }
+        }
+        if (predLive)
+          break;
+
+        jmpTarget = MI->getOperand(1).getMBB();
+        foundJump = true;
+        if (MI->getOpcode() == Hexagon::JMP_cNot ||
+            MI->getOpcode() == Hexagon::JMP_cdnNotPt ||
+            MI->getOpcode() == Hexagon::JMP_cdnNotPnt) {
+          invertPredicate = true;
+        }
+        continue;
+      }
+
+      // No new value jump if there is a barrier. A barrier has to be in its
+      // own packet. A barrier has zero operands. We conservatively bail out
+      // here if we see any instruction with zero operands.
+      if (foundJump && MI->getNumOperands() == 0)
+        break;
+
+      if (foundJump &&
+         !foundCompare &&
+          MI->getOperand(0).isReg() &&
+          MI->getOperand(0).getReg() == predReg) {
+
+        // Not all compares can be new value compare. Arch Spec: 7.6.1.1
+        if (QII->isNewValueJumpCandidate(MI)) {
+
+          assert((MI->getDesc().isCompare()) &&
+              "Only compare instruction can be collapsed into New Value Jump");
+          isSecondOpReg = MI->getOperand(2).isReg();
+
+          if (!canCompareBeNewValueJump(QII, QRI, MII, predReg, isSecondOpReg,
+                                        afterRA, jmpPos, MF))
+            break;
+
+          cmpInstr = MI;
+          cmpPos = MII;
+          foundCompare = true;
+
+          // We need cmpReg1 and cmpOp2(imm or reg) while building
+          // new value jump instruction.
+          cmpReg1 = MI->getOperand(1).getReg();
+          if (MI->getOperand(1).isKill())
+            MO1IsKill = true;
+
+          if (isSecondOpReg) {
+            cmpOp2 = MI->getOperand(2).getReg();
+            if (MI->getOperand(2).isKill())
+              MO2IsKill = true;
+          } else
+            cmpOp2 = MI->getOperand(2).getImm();
+          continue;
+        }
+      }
+
+      if (foundCompare && foundJump) {
+
+        // If "common" checks fail, bail out on this BB.
+        if (!commonChecksToProhibitNewValueJump(afterRA, MII))
+          break;
+
+        bool foundFeeder = false;
+        MachineBasicBlock::iterator feederPos = MII;
+        if (MI->getOperand(0).isReg() &&
+            MI->getOperand(0).isDef() &&
+           (MI->getOperand(0).getReg() == cmpReg1 ||
+            (isSecondOpReg &&
+             MI->getOperand(0).getReg() == (unsigned) cmpOp2))) {
+
+          unsigned feederReg = MI->getOperand(0).getReg();
+
+          // First try to see if we can get the feeder from the first operand
+          // of the compare. If we can not, and if secondOpReg is true
+          // (second operand of the compare is also register), try that one.
+          // TODO: Try to come up with some heuristic to figure out which
+          // feeder would benefit.
+
+          if (feederReg == cmpReg1) {
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF)) {
+              if (!isSecondOpReg)
+                break;
+              else
+                continue;
+            } else
+              foundFeeder = true;
+          }
+
+          if (!foundFeeder &&
+               isSecondOpReg &&
+               feederReg == (unsigned) cmpOp2)
+            if (!canBeFeederToNewValueJump(QII, QRI, MII, jmpPos, cmpPos, MF))
+              break;
+
+          if (isSecondOpReg) {
+            // In case of CMPLT, or CMPLTU, or EQ with the second register
+            // to newify, swap the operands.
+            if (cmpInstr->getOpcode() == Hexagon::CMPLTrr  ||
+                cmpInstr->getOpcode() == Hexagon::CMPLTUrr ||
+                (cmpInstr->getOpcode() == Hexagon::CMPEQrr &&
+                                     feederReg == (unsigned) cmpOp2)) {
+              unsigned tmp = cmpReg1;
+              bool tmpIsKill = MO1IsKill;
+              cmpReg1 = cmpOp2;
+              MO1IsKill = MO2IsKill;
+              cmpOp2 = tmp;
+              MO2IsKill = tmpIsKill;
+            }
+
+            // Now we have swapped the operands, all we need to check is,
+            // if the second operand (after swap) is the feeder.
+            // And if it is, make a note.
+            if (feederReg == (unsigned)cmpOp2)
+              isSecondOpNewified = true;
+          }
+
+          // Now that we are moving feeder close the jump,
+          // make sure we are respecting the kill values of
+          // the operands of the feeder.
+
+          bool updatedIsKill = false;
+          for (unsigned i = 0; i < MI->getNumOperands(); i++) {
+            MachineOperand &MO = MI->getOperand(i);
+            if (MO.isReg() && MO.isUse()) {
+              unsigned feederReg = MO.getReg();
+              for (MachineBasicBlock::iterator localII = feederPos,
+                   end = jmpPos; localII != end; localII++) {
+                MachineInstr *localMI = localII;
+                for (unsigned j = 0; j < localMI->getNumOperands(); j++) {
+                  MachineOperand &localMO = localMI->getOperand(j);
+                  if (localMO.isReg() && localMO.isUse() &&
+                      localMO.isKill() && feederReg == localMO.getReg()) {
+                    // We found that there is kill of a use register
+                    // Set up a kill flag on the register
+                    localMO.setIsKill(false);
+                    MO.setIsKill();
+                    updatedIsKill = true;
+                    break;
+                  }
+                }
+                if (updatedIsKill) break;
+              }
+            }
+            if (updatedIsKill) break;
+          }
+
+          MBB->splice(jmpPos, MI->getParent(), MI);
+          MBB->splice(jmpPos, MI->getParent(), cmpInstr);
+          DebugLoc dl = MI->getDebugLoc();
+          MachineInstr *NewMI;
+
+           assert((QII->isNewValueJumpCandidate(cmpInstr)) &&
+                      "This compare is not a New Value Jump candidate.");
+          unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2,
+                                               isSecondOpNewified);
+          if (invertPredicate)
+            opc = QII->getInvertedPredicatedOpcode(opc);
+
+          // Manage the conversions from CMPGEUri to either CMPEQrr
+          // or CMPGTUri properly. See Arch spec for CMPGEUri instructions.
+          // This has to be after the getNewValueJumpOpcode function call as
+          // second operand of the compare could be modified in this logic.
+          if (cmpInstr->getOpcode() == Hexagon::CMPGEUri) {
+            if (cmpOp2 == 0) {
+              cmpOp2 = cmpReg1;
+              MO2IsKill = MO1IsKill;
+              isSecondOpReg = true;
+            } else
+              --cmpOp2;
+          }
+
+          // Manage the conversions from CMPGEri to CMPGTUri properly.
+          // See Arch spec for CMPGEri instructions.
+          if (cmpInstr->getOpcode() == Hexagon::CMPGEri)
+            --cmpOp2;
+
+          if (isSecondOpReg) {
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                                  QII->get(opc))
+                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                                    .addReg(cmpOp2, getKillRegState(MO2IsKill))
+                                    .addMBB(jmpTarget);
+          }
+          else {
+            NewMI = BuildMI(*MBB, jmpPos, dl,
+                                  QII->get(opc))
+                                    .addReg(cmpReg1, getKillRegState(MO1IsKill))
+                                    .addImm(cmpOp2)
+                                    .addMBB(jmpTarget);
+          }
+
+          assert(NewMI && "New Value Jump Instruction Not created!");
+          if (cmpInstr->getOperand(0).isReg() &&
+              cmpInstr->getOperand(0).isKill())
+            cmpInstr->getOperand(0).setIsKill(false);
+          if (cmpInstr->getOperand(1).isReg() &&
+              cmpInstr->getOperand(1).isKill())
+            cmpInstr->getOperand(1).setIsKill(false);
+          cmpInstr->eraseFromParent();
+          jmpInstr->eraseFromParent();
+          ++nvjGenerated;
+          ++NumNVJGenerated;
+          break;
+        }
+      }
+    }
+  }
+
+  return true;
+
+}
+
+FunctionPass *llvm::createHexagonNewValueJump() {
+  return new HexagonNewValueJump();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
new file mode 100644
index 000000000000..c79d78f21080
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
@@ -0,0 +1,858 @@
+//===- HexagonOperands.td - Hexagon immediate processing -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illnois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Immediate operands.
+
+let PrintMethod = "printImmOperand" in {
+  // f32Ext type is used to identify constant extended floating point immediates.
+  def f32Ext : Operand<f32>;
+  def s32Imm : Operand<i32>;
+  def s26_6Imm : Operand<i32>;
+  def s16Imm : Operand<i32>;
+  def s12Imm : Operand<i32>;
+  def s11Imm : Operand<i32>;
+  def s11_0Imm : Operand<i32>;
+  def s11_1Imm : Operand<i32>;
+  def s11_2Imm : Operand<i32>;
+  def s11_3Imm : Operand<i32>;
+  def s10Imm : Operand<i32>;
+  def s9Imm : Operand<i32>;
+  def m9Imm : Operand<i32>;
+  def s8Imm : Operand<i32>;
+  def s8Imm64 : Operand<i64>;
+  def s6Imm : Operand<i32>;
+  def s4Imm : Operand<i32>;
+  def s4_0Imm : Operand<i32>;
+  def s4_1Imm : Operand<i32>;
+  def s4_2Imm : Operand<i32>;
+  def s4_3Imm : Operand<i32>;
+  def u64Imm : Operand<i64>;
+  def u32Imm : Operand<i32>;
+  def u26_6Imm : Operand<i32>;
+  def u16Imm : Operand<i32>;
+  def u16_0Imm : Operand<i32>;
+  def u16_1Imm : Operand<i32>;
+  def u16_2Imm : Operand<i32>;
+  def u11_3Imm : Operand<i32>;
+  def u10Imm : Operand<i32>;
+  def u9Imm : Operand<i32>;
+  def u8Imm : Operand<i32>;
+  def u7Imm : Operand<i32>;
+  def u6Imm : Operand<i32>;
+  def u6_0Imm : Operand<i32>;
+  def u6_1Imm : Operand<i32>;
+  def u6_2Imm : Operand<i32>;
+  def u6_3Imm : Operand<i32>;
+  def u5Imm : Operand<i32>;
+  def u4Imm : Operand<i32>;
+  def u3Imm : Operand<i32>;
+  def u2Imm : Operand<i32>;
+  def u1Imm : Operand<i32>;
+  def n8Imm : Operand<i32>;
+  def m6Imm : Operand<i32>;
+}
+
+let PrintMethod = "printNOneImmOperand" in
+def nOneImm : Operand<i32>;
+
+//
+// Immediate predicates
+//
+def s32ImmPred  : PatLeaf<(i32 imm), [{
+  // s32ImmPred predicate - True if the immediate fits in a 32-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<32>(v);
+}]>;
+
+def s32_24ImmPred  : PatLeaf<(i32 imm), [{
+  // s32_24ImmPred predicate - True if the immediate fits in a 32-bit sign
+  // extended field that is a multiple of 0x1000000.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<32,24>(v);
+}]>;
+
+def s32_16s8ImmPred  : PatLeaf<(i32 imm), [{
+  // s32_16s8ImmPred predicate - True if the immediate fits in a 32-bit sign
+  // extended field that is a multiple of 0x10000.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<24,16>(v);
+}]>;
+
+def s26_6ImmPred  : PatLeaf<(i32 imm), [{
+  // s26_6ImmPred predicate - True if the immediate fits in a 32-bit
+  // sign extended field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<26,6>(v);
+}]>;
+
+
+def s16ImmPred  : PatLeaf<(i32 imm), [{
+  // s16ImmPred predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<16>(v);
+}]>;
+
+
+def s13ImmPred  : PatLeaf<(i32 imm), [{
+  // s13ImmPred predicate - True if the immediate fits in a 13-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<13>(v);
+}]>;
+
+
+def s12ImmPred  : PatLeaf<(i32 imm), [{
+  // s12ImmPred predicate - True if the immediate fits in a 12-bit
+  // sign extended field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<12>(v);
+}]>;
+
+def s11_0ImmPred  : PatLeaf<(i32 imm), [{
+  // s11_0ImmPred predicate - True if the immediate fits in a 11-bit
+  // sign extended field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<11>(v);
+}]>;
+
+
+def s11_1ImmPred  : PatLeaf<(i32 imm), [{
+  // s11_1ImmPred predicate - True if the immediate fits in a 12-bit
+  // sign extended field and is a multiple of 2.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,1>(v);
+}]>;
+
+
+def s11_2ImmPred  : PatLeaf<(i32 imm), [{
+  // s11_2ImmPred predicate - True if the immediate fits in a 13-bit
+  // sign extended field and is a multiple of 4.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,2>(v);
+}]>;
+
+
+def s11_3ImmPred  : PatLeaf<(i32 imm), [{
+  // s11_3ImmPred predicate - True if the immediate fits in a 14-bit
+  // sign extended field and is a multiple of 8.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<11,3>(v);
+}]>;
+
+
+def s10ImmPred  : PatLeaf<(i32 imm), [{
+  // s10ImmPred predicate - True if the immediate fits in a 10-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<10>(v);
+}]>;
+
+
+def s9ImmPred  : PatLeaf<(i32 imm), [{
+  // s9ImmPred predicate - True if the immediate fits in a 9-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<9>(v);
+}]>;
+
+def m9ImmPred  : PatLeaf<(i32 imm), [{
+  // m9ImmPred predicate - True if the immediate fits in a 9-bit magnitude
+  // field. The range of m9 is -255 to 255.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<9>(v) && (v != -256);
+}]>;
+
+def s8ImmPred  : PatLeaf<(i32 imm), [{
+  // s8ImmPred predicate - True if the immediate fits in a 8-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<8>(v);
+}]>;
+
+
+def s8Imm64Pred  : PatLeaf<(i64 imm), [{
+  // s8ImmPred predicate - True if the immediate fits in a 8-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<8>(v);
+}]>;
+
+
+def s6ImmPred  : PatLeaf<(i32 imm), [{
+  // s6ImmPred predicate - True if the immediate fits in a 6-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<6>(v);
+}]>;
+
+
+def s4_0ImmPred  : PatLeaf<(i32 imm), [{
+  // s4_0ImmPred predicate - True if the immediate fits in a 4-bit sign extended
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isInt<4>(v);
+}]>;
+
+
+def s4_1ImmPred  : PatLeaf<(i32 imm), [{
+  // s4_1ImmPred predicate - True if the immediate fits in a 4-bit sign extended
+  // field of 2.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<4,1>(v);
+}]>;
+
+
+def s4_2ImmPred  : PatLeaf<(i32 imm), [{
+  // s4_2ImmPred predicate - True if the immediate fits in a 4-bit sign extended
+  // field that is a multiple of 4.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<4,2>(v);
+}]>;
+
+
+def s4_3ImmPred  : PatLeaf<(i32 imm), [{
+  // s4_3ImmPred predicate - True if the immediate fits in a 4-bit sign extended
+  // field that is a multiple of 8.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedInt<4,3>(v);
+}]>;
+
+
+def u64ImmPred  : PatLeaf<(i64 imm), [{
+  // Adding "N ||" to suppress gcc unused warning.
+  return (N || true);
+}]>;
+
+def u32ImmPred  : PatLeaf<(i32 imm), [{
+  // u32ImmPred predicate - True if the immediate fits in a 32-bit field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<32>(v);
+}]>;
+
+def u26_6ImmPred  : PatLeaf<(i32 imm), [{
+  // u26_6ImmPred - True if the immediate fits in a 32-bit field and
+  // is a multiple of 64.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<26,6>(v);
+}]>;
+
+def u16ImmPred  : PatLeaf<(i32 imm), [{
+  // u16ImmPred predicate - True if the immediate fits in a 16-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<16>(v);
+}]>;
+
+def u16_s8ImmPred  : PatLeaf<(i32 imm), [{
+  // u16_s8ImmPred predicate - True if the immediate fits in a 16-bit sign
+  // extended s8 field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<16,8>(v);
+}]>;
+
+def u9ImmPred  : PatLeaf<(i32 imm), [{
+  // u9ImmPred predicate - True if the immediate fits in a 9-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<9>(v);
+}]>;
+
+
+def u8ImmPred  : PatLeaf<(i32 imm), [{
+  // u8ImmPred predicate - True if the immediate fits in a 8-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<8>(v);
+}]>;
+
+def u7StrictPosImmPred : ImmLeaf<i32, [{
+  // u7StrictPosImmPred predicate - True if the immediate fits in an 7-bit
+  // unsigned field and is strictly greater than 0.
+  return isUInt<7>(Imm) && Imm > 0;
+}]>;
+
+def u7ImmPred  : PatLeaf<(i32 imm), [{
+  // u7ImmPred predicate - True if the immediate fits in a 7-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<7>(v);
+}]>;
+
+
+def u6ImmPred  : PatLeaf<(i32 imm), [{
+  // u6ImmPred predicate - True if the immediate fits in a 6-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<6>(v);
+}]>;
+
+def u6_0ImmPred  : PatLeaf<(i32 imm), [{
+  // u6_0ImmPred predicate - True if the immediate fits in a 6-bit unsigned
+  // field. Same as u6ImmPred.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<6>(v);
+}]>;
+
+def u6_1ImmPred  : PatLeaf<(i32 imm), [{
+  // u6_1ImmPred predicate - True if the immediate fits in a 7-bit unsigned
+  // field that is 1 bit alinged - multiple of 2.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,1>(v);
+}]>;
+
+def u6_2ImmPred  : PatLeaf<(i32 imm), [{
+  // u6_2ImmPred predicate - True if the immediate fits in a 8-bit unsigned
+  // field that is 2 bits alinged - multiple of 4.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,2>(v);
+}]>;
+
+def u6_3ImmPred  : PatLeaf<(i32 imm), [{
+  // u6_3ImmPred predicate - True if the immediate fits in a 9-bit unsigned
+  // field that is 3 bits alinged - multiple of 8.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isShiftedUInt<6,3>(v);
+}]>;
+
+def u5ImmPred  : PatLeaf<(i32 imm), [{
+  // u5ImmPred predicate - True if the immediate fits in a 5-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<5>(v);
+}]>;
+
+
+def u3ImmPred  : PatLeaf<(i32 imm), [{
+  // u3ImmPred predicate - True if the immediate fits in a 3-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<3>(v);
+}]>;
+
+
+def u2ImmPred  : PatLeaf<(i32 imm), [{
+  // u2ImmPred predicate - True if the immediate fits in a 2-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<2>(v);
+}]>;
+
+
+def u1ImmPred  : PatLeaf<(i1 imm), [{
+  // u1ImmPred predicate - True if the immediate fits in a 1-bit unsigned
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return isUInt<1>(v);
+}]>;
+
+def m5BImmPred  : PatLeaf<(i32 imm), [{
+  // m5BImmPred predicate - True if the (char) number is in range -1 .. -31
+  // and will fit in a 5 bit field when made positive, for use in memops.
+  // this is specific to the zero extending of a negative by CombineInstr
+  int8_t v = (int8_t)N->getSExtValue();
+  return (-31 <= v && v <= -1);
+}]>;
+
+def m5HImmPred  : PatLeaf<(i32 imm), [{
+  // m5HImmPred predicate - True if the (short) number is in range -1 .. -31
+  // and will fit in a 5 bit field when made positive, for use in memops.
+  // this is specific to the zero extending of a negative by CombineInstr
+  int16_t v = (int16_t)N->getSExtValue();
+  return (-31 <= v && v <= -1);
+}]>;
+
+def m5ImmPred  : PatLeaf<(i32 imm), [{
+  // m5ImmPred predicate - True if the number is in range -1 .. -31
+  // and will fit in a 5 bit field when made positive, for use in memops.
+  int64_t v = (int64_t)N->getSExtValue();
+  return (-31 <= v && v <= -1);
+}]>;
+
+//InN means negative integers in [-(2^N - 1), 0]
+def n8ImmPred  : PatLeaf<(i32 imm), [{
+  // n8ImmPred predicate - True if the immediate fits in a 8-bit signed
+  // field.
+  int64_t v = (int64_t)N->getSExtValue();
+  return (-255 <= v && v <= 0);
+}]>;
+
+def nOneImmPred  : PatLeaf<(i32 imm), [{
+  // nOneImmPred predicate - True if the immediate is -1.
+  int64_t v = (int64_t)N->getSExtValue();
+  return (-1 == v);
+}]>;
+
+def Set5ImmPred : PatLeaf<(i32 imm), [{
+  // Set5ImmPred predicate - True if the number is in the series of values.
+  // [ 2^0, 2^1, ... 2^31 ]
+  // For use in setbit immediate.
+  uint32_t v = (int32_t)N->getSExtValue();
+  // Constrain to 32 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def Clr5ImmPred : PatLeaf<(i32 imm), [{
+  // Clr5ImmPred predicate - True if the number is in the series of
+  // bit negated values.
+  // [ 2^0, 2^1, ... 2^31 ]
+  // For use in clrbit immediate.
+  // Note: we are bit NOTing the value.
+  uint32_t v = ~ (int32_t)N->getSExtValue();
+  // Constrain to 32 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def SetClr5ImmPred : PatLeaf<(i32 imm), [{
+  // SetClr5ImmPred predicate - True if the immediate is in range 0..31.
+  int32_t v = (int32_t)N->getSExtValue();
+  return (v >= 0 && v <= 31);
+}]>;
+
+def Set4ImmPred : PatLeaf<(i32 imm), [{
+  // Set4ImmPred predicate - True if the number is in the series of values:
+  // [ 2^0, 2^1, ... 2^15 ].
+  // For use in setbit immediate.
+  uint16_t v = (int16_t)N->getSExtValue();
+  // Constrain to 16 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def Clr4ImmPred : PatLeaf<(i32 imm), [{
+  // Clr4ImmPred predicate - True if the number is in the series of
+  // bit negated values:
+  // [ 2^0, 2^1, ... 2^15 ].
+  // For use in setbit and clrbit immediate.
+  uint16_t v = ~ (int16_t)N->getSExtValue();
+  // Constrain to 16 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def SetClr4ImmPred : PatLeaf<(i32 imm), [{
+  // SetClr4ImmPred predicate - True if the immediate is in the range 0..15.
+  int16_t v = (int16_t)N->getSExtValue();
+  return (v >= 0 && v <= 15);
+}]>;
+
+def Set3ImmPred : PatLeaf<(i32 imm), [{
+  // Set3ImmPred predicate - True if the number is in the series of values:
+  // [ 2^0, 2^1, ... 2^7 ].
+  // For use in setbit immediate.
+  uint8_t v = (int8_t)N->getSExtValue();
+  // Constrain to 8 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def Clr3ImmPred : PatLeaf<(i32 imm), [{
+  // Clr3ImmPred predicate - True if the number is in the series of
+  // bit negated values:
+  // [ 2^0, 2^1, ... 2^7 ].
+  // For use in setbit and clrbit immediate.
+  uint8_t v = ~ (int8_t)N->getSExtValue();
+  // Constrain to 8 bits, and then check for single bit.
+  return ImmIsSingleBit(v);
+}]>;
+
+def SetClr3ImmPred : PatLeaf<(i32 imm), [{
+  // SetClr3ImmPred predicate - True if the immediate is in the range  0..7.
+  int8_t v = (int8_t)N->getSExtValue();
+  return (v >= 0 && v <= 7);
+}]>;
+
+
+// Extendable immediate operands.
+
+let PrintMethod = "printExtOperand" in {
+  def s16Ext : Operand<i32>;
+  def s12Ext : Operand<i32>;
+  def s10Ext : Operand<i32>;
+  def s9Ext : Operand<i32>;
+  def s8Ext : Operand<i32>;
+  def s6Ext : Operand<i32>;
+  def s11_0Ext : Operand<i32>;
+  def s11_1Ext : Operand<i32>;
+  def s11_2Ext : Operand<i32>;
+  def s11_3Ext : Operand<i32>;
+  def u6Ext : Operand<i32>;
+  def u7Ext : Operand<i32>;
+  def u8Ext : Operand<i32>;
+  def u9Ext : Operand<i32>;
+  def u10Ext : Operand<i32>;
+  def u6_0Ext : Operand<i32>;
+  def u6_1Ext : Operand<i32>;
+  def u6_2Ext : Operand<i32>;
+  def u6_3Ext : Operand<i32>;
+}
+
+let PrintMethod = "printImmOperand" in
+def u0AlwaysExt : Operand<i32>;
+
+// Predicates for constant extendable operands
+def s16ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 16-bit sign extended field.
+    return isInt<16>(v);
+  else {
+    if (isInt<16>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit signed field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s10ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 10-bit sign extended field.
+    return isInt<10>(v);
+  else {
+    if (isInt<10>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit signed field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s9ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 9-bit sign extended field.
+    return isInt<9>(v);
+  else {
+    if (isInt<9>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s8ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 8-bit sign extended field.
+    return isInt<8>(v);
+  else {
+    if (isInt<8>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit signed field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s8_16ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate fits in a 8-bit sign extended field.
+    return isInt<8>(v);
+  else {
+    if (isInt<8>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can't fit in a 16-bit signed field. This is required to avoid
+    // unnecessary constant extenders.
+    return isConstExtProfitable(Node) && !isInt<16>(v);
+  }
+}]>;
+
+def s6ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 6-bit sign extended field.
+    return isInt<6>(v);
+  else {
+    if (isInt<6>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s6_16ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate fits in a 6-bit sign extended field.
+    return isInt<6>(v);
+  else {
+    if (isInt<6>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can't fit in a 16-bit signed field. This is required to avoid
+    // unnecessary constant extenders.
+    return isConstExtProfitable(Node) && !isInt<16>(v);
+  }
+}]>;
+
+def s6_10ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 6-bit sign extended field.
+    return isInt<6>(v);
+  else {
+    if (isInt<6>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can't fit in a 10-bit signed field. This is required to avoid
+    // unnecessary constant extenders.
+    return isConstExtProfitable(Node) && !isInt<10>(v);
+  }
+}]>;
+
+def s11_0ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 11-bit sign extended field.
+    return isShiftedInt<11,0>(v);
+  else {
+    if (isInt<11>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit signed field.
+    return isConstExtProfitable(Node) && isInt<32>(v);
+  }
+}]>;
+
+def s11_1ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 12-bit sign extended field and
+    // is 2 byte aligned.
+    return isShiftedInt<11,1>(v);
+  else {
+    if (isInt<12>(v))
+      return isShiftedInt<11,1>(v);
+
+    // Return true if extending this immediate is profitable and the low 1 bit
+    // is zero (2-byte aligned).
+    return isConstExtProfitable(Node) && isInt<32>(v) && ((v % 2) == 0);
+  }
+}]>;
+
+def s11_2ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 13-bit sign extended field and
+    // is 4-byte aligned.
+    return isShiftedInt<11,2>(v);
+  else {
+    if (isInt<13>(v))
+      return isShiftedInt<11,2>(v);
+
+    // Return true if extending this immediate is profitable and the low 2-bits
+    // are zero (4-byte aligned).
+    return isConstExtProfitable(Node)  && isInt<32>(v) && ((v % 4) == 0);
+  }
+}]>;
+
+def s11_3ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 14-bit sign extended field and
+    // is 8-byte aligned.
+    return isShiftedInt<11,3>(v);
+  else {
+    if (isInt<14>(v))
+     return isShiftedInt<11,3>(v);
+
+    // Return true if extending this immediate is profitable and the low 3-bits
+    // are zero (8-byte aligned).
+    return isConstExtProfitable(Node)  && isInt<32>(v) && ((v % 8) == 0);
+  }
+}]>;
+
+def u0AlwaysExtPred : PatLeaf<(i32 imm), [{
+  // Predicate for an unsigned 32-bit value that always needs to be extended.
+  if (Subtarget.hasV4TOps()) {
+    if (isConstExtProfitable(Node)) {
+      int64_t v = (int64_t)N->getSExtValue();
+      return isUInt<32>(v);
+    }
+  }
+  return false;
+}]>;
+
+def u6ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 6-bit unsigned field.
+    return isUInt<6>(v);
+  else {
+    if (isUInt<6>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v);
+  }
+}]>;
+
+def u7ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 7-bit unsigned field.
+    return isUInt<7>(v);
+  else {
+    if (isUInt<7>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v);
+  }
+}]>;
+
+def u8ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 8-bit unsigned field.
+    return isUInt<8>(v);
+  else {
+    if (isUInt<8>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v);
+  }
+}]>;
+
+def u9ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 9-bit unsigned field.
+    return isUInt<9>(v);
+  else {
+    if (isUInt<9>(v))
+      return true;
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v);
+  }
+}]>;
+
+def u6_1ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 7-bit unsigned field and
+    // is 2-byte aligned.
+    return isShiftedUInt<6,1>(v);
+  else {
+    if (isUInt<7>(v))
+      return isShiftedUInt<6,1>(v);
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 2) == 0);
+  }
+}]>;
+
+def u6_2ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 8-bit unsigned field and
+    // is 4-byte aligned.
+    return isShiftedUInt<6,2>(v);
+  else {
+    if (isUInt<8>(v))
+      return isShiftedUInt<6,2>(v);
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 4) == 0);
+  }
+}]>;
+
+def u6_3ExtPred  : PatLeaf<(i32 imm), [{
+  int64_t v = (int64_t)N->getSExtValue();
+  if (!Subtarget.hasV4TOps())
+    // Return true if the immediate can fit in a 9-bit unsigned field and
+    // is 8-byte aligned.
+    return isShiftedUInt<6,3>(v);
+  else {
+    if (isUInt<9>(v))
+      return isShiftedUInt<6,3>(v);
+
+    // Return true if extending this immediate is profitable and the value
+    // can fit in a 32-bit unsigned field.
+    return isConstExtProfitable(Node) && isUInt<32>(v) && ((v % 8) == 0);
+  }
+}]>;
+
+// Addressing modes.
+
+def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], []>;
+def ADDRriS11_0 : ComplexPattern<i32, 2, "SelectADDRriS11_0", [frameindex], []>;
+def ADDRriS11_1 : ComplexPattern<i32, 2, "SelectADDRriS11_1", [frameindex], []>;
+def ADDRriS11_2 : ComplexPattern<i32, 2, "SelectADDRriS11_2", [frameindex], []>;
+def ADDRriS11_3 : ComplexPattern<i32, 2, "SelectADDRriS11_3", [frameindex], []>;
+def ADDRriU6_0 : ComplexPattern<i32, 2, "SelectADDRriU6_0", [frameindex], []>;
+def ADDRriU6_1 : ComplexPattern<i32, 2, "SelectADDRriU6_1", [frameindex], []>;
+def ADDRriU6_2 : ComplexPattern<i32, 2, "SelectADDRriU6_2", [frameindex], []>;
+
+// Address operands.
+
+def MEMrr : Operand<i32> {
+  let PrintMethod = "printMEMrrOperand";
+  let MIOperandInfo = (ops IntRegs, IntRegs);
+}
+
+def MEMri : Operand<i32> {
+  let PrintMethod = "printMEMriOperand";
+  let MIOperandInfo = (ops IntRegs, IntRegs);
+}
+
+def MEMri_s11_2 : Operand<i32>,
+  ComplexPattern<i32, 2, "SelectMEMriS11_2", []> {
+  let PrintMethod = "printMEMriOperand";
+  let MIOperandInfo = (ops IntRegs, s11Imm);
+}
+
+def FrameIndex : Operand<i32> {
+  let PrintMethod = "printFrameIndexOperand";
+  let MIOperandInfo = (ops IntRegs, s11Imm);
+}
+
+let PrintMethod = "printGlobalOperand" in {
+  def globaladdress : Operand<i32>;
+  def globaladdressExt : Operand<i32>;
+}
+
+let PrintMethod = "printJumpTable" in
+def jumptablebase : Operand<i32>;
+
+def brtarget : Operand<OtherVT>;
+def brtargetExt : Operand<OtherVT>;
+def calltarget : Operand<i32>;
+
+def bblabel : Operand<i32>;
+def bbl   : SDNode<"ISD::BasicBlock", SDTPtrLeaf   , [], "BasicBlockSDNode">;
+
+def symbolHi32 : Operand<i32> {
+  let PrintMethod = "printSymbolHi";
+}
+def symbolLo32 : Operand<i32> {
+  let PrintMethod = "printSymbolLo";
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
new file mode 100644
index 000000000000..576f1d7d0790
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -0,0 +1,323 @@
+//===-- HexagonPeephole.cpp - Hexagon Peephole Optimiztions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This peephole pass optimizes in the following cases.
+// 1. Optimizes redundant sign extends for the following case
+//    Transform the following pattern
+//    %vreg170<def> = SXTW %vreg166
+//    ...
+//    %vreg176<def> = COPY %vreg170:subreg_loreg
+//
+//    Into
+//    %vreg176<def> = COPY vreg166
+//
+//  2. Optimizes redundant negation of predicates.
+//     %vreg15<def> = CMPGTrr %vreg6, %vreg2
+//     ...
+//     %vreg16<def> = NOT_p %vreg15<kill>
+//     ...
+//     JMP_c %vreg16<kill>, <BB#1>, %PC<imp-def,dead>
+//
+//     Into
+//     %vreg15<def> = CMPGTrr %vreg6, %vreg2;
+//     ...
+//     JMP_cNot %vreg15<kill>, <BB#1>, %PC<imp-def,dead>;
+//
+// Note: The peephole pass makes the instrucstions like
+// %vreg170<def> = SXTW %vreg166 or %vreg16<def> = NOT_p %vreg15<kill>
+// redundant and relies on some form of dead removal instrucions, like
+// DCE or DIE to actually eliminate them.
+
+
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-peephole"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+static cl::opt<bool> DisableHexagonPeephole("disable-hexagon-peephole",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Peephole Optimization"));
+
+static cl::opt<int>
+DbgPNPCount("pnp-count", cl::init(-1), cl::Hidden,
+  cl::desc("Maximum number of P=NOT(P) to be optimized"));
+
+static cl::opt<bool> DisablePNotP("disable-hexagon-pnotp",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Optimization of PNotP"));
+
+static cl::opt<bool> DisableOptSZExt("disable-hexagon-optszext",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Optimization of Sign/Zero Extends"));
+
+namespace {
+  struct HexagonPeephole : public MachineFunctionPass {
+    const HexagonInstrInfo    *QII;
+    const HexagonRegisterInfo *QRI;
+    const MachineRegisterInfo *MRI;
+
+  public:
+    static char ID;
+    HexagonPeephole() : MachineFunctionPass(ID) { }
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const {
+      return "Hexagon optimize redundant zero and size extends";
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src);
+  };
+}
+
+char HexagonPeephole::ID = 0;
+
+bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
+
+  QII = static_cast<const HexagonInstrInfo *>(MF.getTarget().
+                                        getInstrInfo());
+  QRI = static_cast<const HexagonRegisterInfo *>(MF.getTarget().
+                                       getRegisterInfo());
+  MRI = &MF.getRegInfo();
+
+  DenseMap<unsigned, unsigned> PeepholeMap;
+  DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap;
+
+  if (DisableHexagonPeephole) return false;
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+    PeepholeMap.clear();
+    PeepholeDoubleRegsMap.clear();
+
+    // Traverse the basic block.
+    for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
+                                     ++MII) {
+      MachineInstr *MI = MII;
+      // Look for sign extends:
+      // %vreg170<def> = SXTW %vreg166
+      if (!DisableOptSZExt && MI->getOpcode() == Hexagon::SXTW) {
+        assert (MI->getNumOperands() == 2);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src  = MI->getOperand(1);
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        // Just handle virtual registers.
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Map the following:
+          // %vreg170<def> = SXTW %vreg166
+          // PeepholeMap[170] = vreg166
+          PeepholeMap[DstReg] = SrcReg;
+        }
+      }
+
+      // Look for this sequence below
+      // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
+      // %vregIntReg = COPY %vregDoubleReg1:subreg_loreg.
+      // and convert into
+      // %vregIntReg = COPY %vregDoubleReg0:subreg_hireg.
+      if (MI->getOpcode() == Hexagon::LSRd_ri) {
+        assert(MI->getNumOperands() == 3);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src1 = MI->getOperand(1);
+        MachineOperand &Src2 = MI->getOperand(2);
+        if (Src2.getImm() != 32)
+          continue;
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src1.getReg();
+        PeepholeDoubleRegsMap[DstReg] =
+          std::make_pair(*&SrcReg, 1/*Hexagon::subreg_hireg*/);
+      }
+
+      // Look for P=NOT(P).
+      if (!DisablePNotP &&
+          (MI->getOpcode() == Hexagon::NOT_p)) {
+        assert (MI->getNumOperands() == 2);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src  = MI->getOperand(1);
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        // Just handle virtual registers.
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Map the following:
+          // %vreg170<def> = NOT_xx %vreg166
+          // PeepholeMap[170] = vreg166
+          PeepholeMap[DstReg] = SrcReg;
+        }
+      }
+
+      // Look for copy:
+      // %vreg176<def> = COPY %vreg170:subreg_loreg
+      if (!DisableOptSZExt && MI->isCopy()) {
+        assert (MI->getNumOperands() == 2);
+        MachineOperand &Dst = MI->getOperand(0);
+        MachineOperand &Src  = MI->getOperand(1);
+
+        // Make sure we are copying the lower 32 bits.
+        if (Src.getSubReg() != Hexagon::subreg_loreg)
+          continue;
+
+        unsigned DstReg = Dst.getReg();
+        unsigned SrcReg = Src.getReg();
+        if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+            TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+          // Try to find in the map.
+          if (unsigned PeepholeSrc = PeepholeMap.lookup(SrcReg)) {
+            // Change the 1st operand.
+            MI->RemoveOperand(1);
+            MI->addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
+          } else  {
+            DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
+              PeepholeDoubleRegsMap.find(SrcReg);
+            if (DI != PeepholeDoubleRegsMap.end()) {
+              std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
+              MI->RemoveOperand(1);
+              MI->addOperand(MachineOperand::CreateReg(PeepholeSrc.first,
+                                                       false /*isDef*/,
+                                                       false /*isImp*/,
+                                                       false /*isKill*/,
+                                                       false /*isDead*/,
+                                                       false /*isUndef*/,
+                                                       false /*isEarlyClobber*/,
+                                                       PeepholeSrc.second));
+            }
+          }
+        }
+      }
+
+      // Look for Predicated instructions.
+      if (!DisablePNotP) {
+        bool Done = false;
+        if (QII->isPredicated(MI)) {
+          MachineOperand &Op0 = MI->getOperand(0);
+          unsigned Reg0 = Op0.getReg();
+          const TargetRegisterClass *RC0 = MRI->getRegClass(Reg0);
+          if (RC0->getID() == Hexagon::PredRegsRegClassID) {
+            // Handle instructions that have a prediate register in op0
+            // (most cases of predicable instructions).
+            if (TargetRegisterInfo::isVirtualRegister(Reg0)) {
+              // Try to find in the map.
+              if (unsigned PeepholeSrc = PeepholeMap.lookup(Reg0)) {
+                // Change the 1st operand and, flip the opcode.
+                MI->getOperand(0).setReg(PeepholeSrc);
+                int NewOp = QII->getInvertedPredicatedOpcode(MI->getOpcode());
+                MI->setDesc(QII->get(NewOp));
+                Done = true;
+              }
+            }
+          }
+        }
+
+        if (!Done) {
+          // Handle special instructions.
+          unsigned Op = MI->getOpcode();
+          unsigned NewOp = 0;
+          unsigned PR = 1, S1 = 2, S2 = 3;   // Operand indices.
+
+          switch (Op) {
+            case Hexagon::TFR_condset_rr:
+            case Hexagon::TFR_condset_ii:
+            case Hexagon::MUX_ii:
+            case Hexagon::MUX_rr:
+              NewOp = Op;
+              break;
+            case Hexagon::TFR_condset_ri:
+              NewOp = Hexagon::TFR_condset_ir;
+              break;
+            case Hexagon::TFR_condset_ir:
+              NewOp = Hexagon::TFR_condset_ri;
+              break;
+            case Hexagon::MUX_ri:
+              NewOp = Hexagon::MUX_ir;
+              break;
+            case Hexagon::MUX_ir:
+              NewOp = Hexagon::MUX_ri;
+              break;
+          }
+          if (NewOp) {
+            unsigned PSrc = MI->getOperand(PR).getReg();
+            if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
+              MI->getOperand(PR).setReg(POrig);
+              MI->setDesc(QII->get(NewOp));
+              // Swap operands S1 and S2.
+              MachineOperand Op1 = MI->getOperand(S1);
+              MachineOperand Op2 = MI->getOperand(S2);
+              ChangeOpInto(MI->getOperand(S1), Op2);
+              ChangeOpInto(MI->getOperand(S2), Op1);
+            }
+          } // if (NewOp)
+        } // if (!Done)
+
+      } // if (!DisablePNotP)
+
+    } // Instruction
+  } // Basic Block
+  return true;
+}
+
+void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) {
+  assert (&Dst != &Src && "Cannot duplicate into itself");
+  switch (Dst.getType()) {
+    case MachineOperand::MO_Register:
+      if (Src.isReg()) {
+        Dst.setReg(Src.getReg());
+      } else if (Src.isImm()) {
+        Dst.ChangeToImmediate(Src.getImm());
+      } else {
+        llvm_unreachable("Unexpected src operand type");
+      }
+      break;
+
+    case MachineOperand::MO_Immediate:
+      if (Src.isImm()) {
+        Dst.setImm(Src.getImm());
+      } else if (Src.isReg()) {
+        Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(),
+                             Src.isKill(), Src.isDead(), Src.isUndef(),
+                             Src.isDebug());
+      } else {
+        llvm_unreachable("Unexpected src operand type");
+      }
+      break;
+
+    default:
+      llvm_unreachable("Unexpected dst operand type");
+      break;
+  }
+}
+
+FunctionPass *llvm::createHexagonPeephole() {
+  return new HexagonPeephole();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
new file mode 100644
index 000000000000..d8b4e2fcb368
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -0,0 +1,317 @@
+//===-- HexagonRegisterInfo.cpp - Hexagon Register Information ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonRegisterInfo.h"
+#include "Hexagon.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+
+HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st,
+                                     const HexagonInstrInfo &tii)
+  : HexagonGenRegisterInfo(Hexagon::R31),
+    Subtarget(st),
+   TII(tii) {
+}
+
+const uint16_t* HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction
+                                                        *MF)
+  const {
+  static const uint16_t CalleeSavedRegsV2[] = {
+    Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
+  };
+  static const uint16_t CalleeSavedRegsV3[] = {
+    Hexagon::R16,   Hexagon::R17,   Hexagon::R18,   Hexagon::R19,
+    Hexagon::R20,   Hexagon::R21,   Hexagon::R22,   Hexagon::R23,
+    Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
+  };
+
+  switch(Subtarget.getHexagonArchVersion()) {
+  case HexagonSubtarget::V1:
+    break;
+  case HexagonSubtarget::V2:
+    return CalleeSavedRegsV2;
+  case HexagonSubtarget::V3:
+  case HexagonSubtarget::V4:
+  case HexagonSubtarget::V5:
+    return CalleeSavedRegsV3;
+  }
+  llvm_unreachable("Callee saved registers requested for unknown architecture "
+                   "version");
+}
+
+BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
+  const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(HEXAGON_RESERVED_REG_1);
+  Reserved.set(HEXAGON_RESERVED_REG_2);
+  Reserved.set(Hexagon::R29);
+  Reserved.set(Hexagon::R30);
+  Reserved.set(Hexagon::R31);
+  Reserved.set(Hexagon::D14);
+  Reserved.set(Hexagon::D15);
+  Reserved.set(Hexagon::LC0);
+  Reserved.set(Hexagon::LC1);
+  Reserved.set(Hexagon::SA0);
+  Reserved.set(Hexagon::SA1);
+  return Reserved;
+}
+
+
+const TargetRegisterClass* const*
+HexagonRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClassesV2[] = {
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    };
+  static const TargetRegisterClass * const CalleeSavedRegClassesV3[] = {
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+    &Hexagon::IntRegsRegClass,     &Hexagon::IntRegsRegClass,
+  };
+
+  switch(Subtarget.getHexagonArchVersion()) {
+  case HexagonSubtarget::V1:
+    break;
+  case HexagonSubtarget::V2:
+    return CalleeSavedRegClassesV2;
+  case HexagonSubtarget::V3:
+  case HexagonSubtarget::V4:
+  case HexagonSubtarget::V5:
+    return CalleeSavedRegClassesV3;
+  }
+  llvm_unreachable("Callee saved register classes requested for unknown "
+                   "architecture version");
+}
+
+void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                              int SPAdj, unsigned FIOperandNum,
+                                              RegScavenger *RS) const {
+  //
+  // Hexagon_TODO: Do we need to enforce this for Hexagon?
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  // Addressable stack objects are accessed using neg. offsets from %fp.
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  unsigned FrameReg = getFrameRegister(MF);
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  if (!TFI->hasFP(MF)) {
+    // We will not reserve space on the stack for the lr and fp registers.
+    Offset -= 2 * Hexagon_WordSize;
+  }
+
+  const unsigned FrameSize = MFI.getStackSize();
+
+  if (!MFI.hasVarSizedObjects() &&
+      TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset)) &&
+      !TII.isSpillPredRegOp(&MI)) {
+    // Replace frame index with a stack pointer reference.
+    MI.getOperand(FIOperandNum).ChangeToRegister(getStackRegister(), false,
+                                                 false, true);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(FrameSize+Offset);
+  } else {
+    // Replace frame index with a frame pointer reference.
+    if (!TII.isValidOffset(MI.getOpcode(), Offset)) {
+
+      // If the offset overflows, then correct it.
+      //
+      // For loads, we do not need a reserved register
+      // r0 = memw(r30 + #10000) to:
+      //
+      // r0 = add(r30, #10000)
+      // r0 = memw(r0)
+      if ( (MI.getOpcode() == Hexagon::LDriw)  ||
+           (MI.getOpcode() == Hexagon::LDrid)   ||
+           (MI.getOpcode() == Hexagon::LDrih)   ||
+           (MI.getOpcode() == Hexagon::LDriuh)  ||
+           (MI.getOpcode() == Hexagon::LDrib)   ||
+           (MI.getOpcode() == Hexagon::LDriub)  ||
+           (MI.getOpcode() == Hexagon::LDriw_f) ||
+           (MI.getOpcode() == Hexagon::LDrid_f)) {
+        unsigned dstReg = (MI.getOpcode() == Hexagon::LDrid) ?
+          getSubReg(MI.getOperand(0).getReg(), Hexagon::subreg_loreg) :
+          MI.getOperand(0).getReg();
+
+        // Check if offset can fit in addi.
+        if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) {
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::CONST32_Int_Real), dstReg).addImm(Offset);
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::ADD_rr),
+                  dstReg).addReg(FrameReg).addReg(dstReg);
+        } else {
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::ADD_ri),
+                  dstReg).addReg(FrameReg).addImm(Offset);
+        }
+
+        MI.getOperand(FIOperandNum).ChangeToRegister(dstReg, false, false,true);
+        MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+      } else if ((MI.getOpcode() == Hexagon::STriw_indexed) ||
+                 (MI.getOpcode() == Hexagon::STriw) ||
+                 (MI.getOpcode() == Hexagon::STrid) ||
+                 (MI.getOpcode() == Hexagon::STrih) ||
+                 (MI.getOpcode() == Hexagon::STrib) ||
+                 (MI.getOpcode() == Hexagon::STrid_f) ||
+                 (MI.getOpcode() == Hexagon::STriw_f)) {
+        // For stores, we need a reserved register. Change
+        // memw(r30 + #10000) = r0 to:
+        //
+        // rs = add(r30, #10000);
+        // memw(rs) = r0
+        unsigned resReg = HEXAGON_RESERVED_REG_1;
+
+        // Check if offset can fit in addi.
+        if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) {
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::CONST32_Int_Real), resReg).addImm(Offset);
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::ADD_rr),
+                  resReg).addReg(FrameReg).addReg(resReg);
+        } else {
+          BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                  TII.get(Hexagon::ADD_ri),
+                  resReg).addReg(FrameReg).addImm(Offset);
+        }
+        MI.getOperand(FIOperandNum).ChangeToRegister(resReg, false, false,true);
+        MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+      } else if (TII.isMemOp(&MI)) {
+        // use the constant extender if the instruction provides it
+        // and we are V4TOps.
+        if (Subtarget.hasV4TOps()) {
+          if (TII.isConstExtended(&MI)) {
+            MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+            MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+            TII.immediateExtend(&MI);
+          } else {
+            llvm_unreachable("Need to implement for memops");
+          }
+        } else {
+          // Only V3 and older instructions here.
+          unsigned ResReg = HEXAGON_RESERVED_REG_1;
+          if (!MFI.hasVarSizedObjects() &&
+              TII.isValidOffset(MI.getOpcode(), (FrameSize+Offset))) {
+            MI.getOperand(FIOperandNum).ChangeToRegister(getStackRegister(),
+                                                         false, false, false);
+            MI.getOperand(FIOperandNum+1).ChangeToImmediate(FrameSize+Offset);
+          } else if (!TII.isValidOffset(Hexagon::ADD_ri, Offset)) {
+            BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                    TII.get(Hexagon::CONST32_Int_Real), ResReg).addImm(Offset);
+            BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                    TII.get(Hexagon::ADD_rr), ResReg).addReg(FrameReg).
+              addReg(ResReg);
+            MI.getOperand(FIOperandNum).ChangeToRegister(ResReg, false, false,
+                                                         true);
+            MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+          } else {
+            BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                    TII.get(Hexagon::ADD_ri), ResReg).addReg(FrameReg).
+              addImm(Offset);
+            MI.getOperand(FIOperandNum).ChangeToRegister(ResReg, false, false,
+                                                         true);
+            MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+          }
+        }
+      } else {
+        unsigned dstReg = MI.getOperand(0).getReg();
+        BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                TII.get(Hexagon::CONST32_Int_Real), dstReg).addImm(Offset);
+        BuildMI(*MI.getParent(), II, MI.getDebugLoc(),
+                TII.get(Hexagon::ADD_rr),
+                dstReg).addReg(FrameReg).addReg(dstReg);
+        // Can we delete MI??? r2 = add (r2, #0).
+        MI.getOperand(FIOperandNum).ChangeToRegister(dstReg, false, false,true);
+        MI.getOperand(FIOperandNum+1).ChangeToImmediate(0);
+      }
+    } else {
+      // If the offset is small enough to fit in the immediate field, directly
+      // encode it.
+      MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FIOperandNum+1).ChangeToImmediate(Offset);
+    }
+  }
+
+}
+
+unsigned HexagonRegisterInfo::getRARegister() const {
+  return Hexagon::R31;
+}
+
+unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction
+                                               &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  if (TFI->hasFP(MF)) {
+    return Hexagon::R30;
+  }
+
+  return Hexagon::R29;
+}
+
+unsigned HexagonRegisterInfo::getFrameRegister() const {
+  return Hexagon::R30;
+}
+
+unsigned HexagonRegisterInfo::getStackRegister() const {
+  return Hexagon::R29;
+}
+
+void HexagonRegisterInfo::getInitialFrameState(std::vector<MachineMove>
+                                               &Moves)  const
+{
+  // VirtualFP = (R30 + #0).
+  unsigned FPReg = getFrameRegister();
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(FPReg, 0);
+  Moves.push_back(MachineMove(0, Dst, Src));
+}
+
+unsigned HexagonRegisterInfo::getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+}
+
+unsigned HexagonRegisterInfo::getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+}
+
+#define GET_REGINFO_TARGET_DESC
+#include "HexagonGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
new file mode 100644
index 000000000000..8a3f94a3fd12
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -0,0 +1,91 @@
+//==- HexagonRegisterInfo.h - Hexagon Register Information Impl --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Hexagon implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonREGISTERINFO_H
+#define HexagonREGISTERINFO_H
+
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "HexagonGenRegisterInfo.inc"
+
+//
+//  We try not to hard code the reserved registers in our code,
+//  so the following two macros were defined. However, there
+//  are still a few places that R11 and R10 are hard wired.
+//  See below. If, in the future, we decided to change the reserved
+//  register. Don't forget changing the following places.
+//
+//  1. the "Defs" set of STriw_pred in HexagonInstrInfo.td
+//  2. the "Defs" set of LDri_pred in HexagonInstrInfo.td
+//  3. the definition of "IntRegs" in HexagonRegisterInfo.td
+//  4. the definition of "DoubleRegs" in HexagonRegisterInfo.td
+//
+#define HEXAGON_RESERVED_REG_1 Hexagon::R10
+#define HEXAGON_RESERVED_REG_2 Hexagon::R11
+
+namespace llvm {
+
+class HexagonSubtarget;
+class HexagonInstrInfo;
+class Type;
+
+struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
+  HexagonSubtarget &Subtarget;
+  const HexagonInstrInfo &TII;
+
+  HexagonRegisterInfo(HexagonSubtarget &st, const HexagonInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const;
+
+  /// determineFrameLayout - Determine the size of the frame and maximum call
+  /// frame size.
+  void determineFrameLayout(MachineFunction &MF) const;
+
+  /// requiresRegisterScavenging - returns true since we may need scavenging for
+  /// a temporary register when generating hardware loop instructions.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const {
+    return true;
+  }
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+    return true;
+  }
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getFrameRegister() const;
+  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+  unsigned getStackRegister() const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
new file mode 100644
index 000000000000..fe41fc3bc60c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -0,0 +1,167 @@
+//===-- HexagonRegisterInfo.td - Hexagon Register defs -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Hexagon register file.
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Hexagon" in {
+
+  class HexagonReg<string n> : Register<n> {
+    field bits<5> Num;
+  }
+
+  class HexagonDoubleReg<string n, list<Register> subregs> :
+        RegisterWithSubRegs<n, subregs> {
+    field bits<5> Num;
+  }
+
+  // Registers are identified with 5-bit ID numbers.
+  // Ri - 32-bit integer registers.
+  class Ri<bits<5> num, string n> : HexagonReg<n> {
+    let Num = num;
+  }
+
+  // Rf - 32-bit floating-point registers.
+  class Rf<bits<5> num, string n> : HexagonReg<n> {
+    let Num = num;
+  }
+
+
+  // Rd - 64-bit registers.
+  class Rd<bits<5> num, string n, list<Register> subregs> :
+        HexagonDoubleReg<n, subregs> {
+    let Num = num;
+    let SubRegs = subregs;
+  }
+
+  // Rp - predicate registers
+  class Rp<bits<5> num, string n> : HexagonReg<n> {
+    let Num = num;
+  }
+
+  // Rc - control registers
+  class Rc<bits<5> num, string n> : HexagonReg<n> {
+    let Num = num;
+  }
+
+  // Rj - aliased integer registers
+  class Rj<string n, Ri R>: HexagonReg<n> {
+    let Num = R.Num;
+    let Aliases = [R];
+  }
+
+  def subreg_loreg  : SubRegIndex;
+  def subreg_hireg  : SubRegIndex;
+
+  // Integer registers.
+  def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
+  def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>;
+  def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
+  def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>;
+  def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>;
+  def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
+  def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>;
+  def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>;
+  def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>;
+  def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
+  def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+  def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+  def R12 : Ri<12, "r12">, DwarfRegNum<[12]>;
+  def R13 : Ri<13, "r13">, DwarfRegNum<[13]>;
+  def R14 : Ri<14, "r14">, DwarfRegNum<[14]>;
+  def R15 : Ri<15, "r15">, DwarfRegNum<[15]>;
+  def R16 : Ri<16, "r16">, DwarfRegNum<[16]>;
+  def R17 : Ri<17, "r17">, DwarfRegNum<[17]>;
+  def R18 : Ri<18, "r18">, DwarfRegNum<[18]>;
+  def R19 : Ri<19, "r19">, DwarfRegNum<[19]>;
+  def R20 : Ri<20, "r20">, DwarfRegNum<[20]>;
+  def R21 : Ri<21, "r21">, DwarfRegNum<[21]>;
+  def R22 : Ri<22, "r22">, DwarfRegNum<[22]>;
+  def R23 : Ri<23, "r23">, DwarfRegNum<[23]>;
+  def R24 : Ri<24, "r24">, DwarfRegNum<[24]>;
+  def R25 : Ri<25, "r25">, DwarfRegNum<[25]>;
+  def R26 : Ri<26, "r26">, DwarfRegNum<[26]>;
+  def R27 : Ri<27, "r27">, DwarfRegNum<[27]>;
+  def R28 : Ri<28, "r28">, DwarfRegNum<[28]>;
+  def R29 : Ri<29, "r29">, DwarfRegNum<[29]>;
+  def R30 : Ri<30, "r30">, DwarfRegNum<[30]>;
+  def R31 : Ri<31, "r31">, DwarfRegNum<[31]>;
+
+  def SP : Rj<"sp", R29>, DwarfRegNum<[29]>;
+  def FP : Rj<"fp", R30>, DwarfRegNum<[30]>;
+  def LR : Rj<"lr", R31>, DwarfRegNum<[31]>;
+
+  // Aliases of the R* registers used to hold 64-bit int values (doubles).
+  let SubRegIndices = [subreg_loreg, subreg_hireg], CoveredBySubRegs = 1 in {
+  def D0  : Rd< 0,  "r1:0",  [R0,  R1]>, DwarfRegNum<[32]>;
+  def D1  : Rd< 2,  "r3:2",  [R2,  R3]>, DwarfRegNum<[34]>;
+  def D2  : Rd< 4,  "r5:4",  [R4,  R5]>, DwarfRegNum<[36]>;
+  def D3  : Rd< 6,  "r7:6",  [R6,  R7]>, DwarfRegNum<[38]>;
+  def D4  : Rd< 8,  "r9:8",  [R8,  R9]>, DwarfRegNum<[40]>;
+  def D5  : Rd<10, "r11:10", [R10, R11]>, DwarfRegNum<[42]>;
+  def D6  : Rd<12, "r13:12", [R12, R13]>, DwarfRegNum<[44]>;
+  def D7  : Rd<14, "r15:14", [R14, R15]>, DwarfRegNum<[46]>;
+  def D8  : Rd<16, "r17:16", [R16, R17]>, DwarfRegNum<[48]>;
+  def D9  : Rd<18, "r19:18", [R18, R19]>, DwarfRegNum<[50]>;
+  def D10 : Rd<20, "r21:20", [R20, R21]>, DwarfRegNum<[52]>;
+  def D11 : Rd<22, "r23:22", [R22, R23]>, DwarfRegNum<[54]>;
+  def D12 : Rd<24, "r25:24", [R24, R25]>, DwarfRegNum<[56]>;
+  def D13 : Rd<26, "r27:26", [R26, R27]>, DwarfRegNum<[58]>;
+  def D14 : Rd<28, "r29:28", [R28, R29]>, DwarfRegNum<[60]>;
+  def D15 : Rd<30, "r31:30", [R30, R31]>, DwarfRegNum<[62]>;
+  }
+
+  // Predicate registers.
+  def P0 : Rp<0, "p0">, DwarfRegNum<[63]>;
+  def P1 : Rp<1, "p1">, DwarfRegNum<[64]>;
+  def P2 : Rp<2, "p2">, DwarfRegNum<[65]>;
+  def P3 : Rp<3, "p3">, DwarfRegNum<[66]>;
+
+  // Control registers.
+  def SA0 : Rc<0, "sa0">, DwarfRegNum<[67]>;
+  def LC0 : Rc<1, "lc0">, DwarfRegNum<[68]>;
+
+  def SA1 : Rc<2, "sa1">, DwarfRegNum<[69]>;
+  def LC1 : Rc<3, "lc1">, DwarfRegNum<[70]>;
+
+  def M0 : Rc<6, "m0">, DwarfRegNum<[71]>;
+  def M1 : Rc<7, "m1">, DwarfRegNum<[72]>;
+
+  def PC : Rc<9,  "pc">, DwarfRegNum<[32]>; // is the Dwarf number correct?
+  def GP : Rc<11, "gp">, DwarfRegNum<[33]>; // is the Dwarf number correct?
+}
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+def IntRegs : RegisterClass<"Hexagon", [i32,f32], 32,
+                            (add (sequence "R%u", 0, 9),
+                                 (sequence "R%u", 12, 28),
+                                 R10, R11, R29, R30, R31)> {
+}
+
+def DoubleRegs : RegisterClass<"Hexagon", [i64,f64], 64,
+                               (add (sequence "D%u", 0, 4),
+                                    (sequence "D%u", 6, 13), D5, D14, D15)>;
+
+
+def PredRegs : RegisterClass<"Hexagon", [i1], 32, (add (sequence "P%u", 0, 3))>
+{
+  let Size = 32;
+}
+
+def CRRegs : RegisterClass<"Hexagon", [i32], 32,
+                           (add (sequence "LC%u", 0, 1),
+                                (sequence "SA%u", 0, 1),
+                                (sequence "M%u", 0, 1), PC, GP)> {
+  let Size = 32;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
new file mode 100644
index 000000000000..34bf4eacfdc0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -0,0 +1,83 @@
+//===- HexagonRemoveExtendArgs.cpp - Remove unnecessary argument sign extends //
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pass that removes sign extends for function parameters. These parameters
+// are already sign extended by the caller per Hexagon's ABI
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+namespace {
+  struct HexagonRemoveExtendArgs : public FunctionPass {
+  public:
+    static char ID;
+    HexagonRemoveExtendArgs() : FunctionPass(ID) {}
+    virtual bool runOnFunction(Function &F);
+
+    const char *getPassName() const {
+      return "Remove sign extends";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineFunctionAnalysis>();
+      AU.addPreserved<MachineFunctionAnalysis>();
+      FunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char HexagonRemoveExtendArgs::ID = 0;
+RegisterPass<HexagonRemoveExtendArgs> X("reargs",
+                                        "Remove Sign and Zero Extends for Args"
+                                        );
+
+
+
+bool HexagonRemoveExtendArgs::runOnFunction(Function &F) {
+  unsigned Idx = 1;
+  for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE;
+       ++AI, ++Idx) {
+    if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) {
+      Argument* Arg = AI;
+      if (!isa<PointerType>(Arg->getType())) {
+        for (Instruction::use_iterator UI = Arg->use_begin();
+             UI != Arg->use_end();) {
+          if (isa<SExtInst>(*UI)) {
+            Instruction* Use = cast<Instruction>(*UI);
+            SExtInst* SI = new SExtInst(Arg, Use->getType());
+            assert (EVT::getEVT(SI->getType()) ==
+                    (EVT::getEVT(Use->getType())));
+            ++UI;
+            Use->replaceAllUsesWith(SI);
+            Instruction* First = F.getEntryBlock().begin();
+            SI->insertBefore(First);
+            Use->eraseFromParent();
+          } else {
+            ++UI;
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+
+
+
+FunctionPass *llvm::createHexagonRemoveExtendOps(HexagonTargetMachine &TM) {
+  return new HexagonRemoveExtendArgs();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
new file mode 100644
index 000000000000..c2cfbb9710a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -0,0 +1,69 @@
+//===- HexagonSchedule.td - Hexagon Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Functional Units
+def LSUNIT    : FuncUnit; // SLOT0
+def LUNIT     : FuncUnit; // SLOT1
+def MUNIT     : FuncUnit; // SLOT2
+def SUNIT     : FuncUnit; // SLOT3
+def LOOPUNIT  : FuncUnit;
+
+// Itinerary classes
+def ALU32     : InstrItinClass;
+def ALU64     : InstrItinClass;
+def CR        : InstrItinClass;
+def J         : InstrItinClass;
+def JR        : InstrItinClass;
+def LD        : InstrItinClass;
+def LD0       : InstrItinClass;
+def M         : InstrItinClass;
+def ST        : InstrItinClass;
+def ST0       : InstrItinClass;
+def S         : InstrItinClass;
+def SYS       : InstrItinClass;
+def ENDLOOP   : InstrItinClass;
+def PSEUDO    : InstrItinClass;
+def PSEUDOM   : InstrItinClass;
+
+def HexagonItineraries :
+      ProcessorItineraries<[LSUNIT, LUNIT, MUNIT, SUNIT, LOOPUNIT], [], [
+        InstrItinData<ALU32  , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>,
+        InstrItinData<ALU64  , [InstrStage<1, [MUNIT, SUNIT]>]>,
+        InstrItinData<CR     , [InstrStage<1, [SUNIT]>]>,
+        InstrItinData<J      , [InstrStage<1, [SUNIT, MUNIT]>]>,
+        InstrItinData<JR     , [InstrStage<1, [MUNIT]>]>,
+        InstrItinData<LD     , [InstrStage<1, [LUNIT, LSUNIT]>]>,
+        InstrItinData<LD0    , [InstrStage<1, [LSUNIT]>]>,
+        InstrItinData<M      , [InstrStage<1, [MUNIT, SUNIT]>]>,
+        InstrItinData<ST     , [InstrStage<1, [LSUNIT]>]>,
+        InstrItinData<ST0    , [InstrStage<1, [LSUNIT]>]>,
+        InstrItinData<S      , [InstrStage<1, [SUNIT, MUNIT]>]>,
+        InstrItinData<SYS    , [InstrStage<1, [LSUNIT]>]>,
+        InstrItinData<ENDLOOP, [InstrStage<1, [LOOPUNIT]>]>,
+        InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>,
+        InstrItinData<PSEUDOM, [InstrStage<1, [MUNIT, SUNIT], 0>,
+                                InstrStage<1, [MUNIT, SUNIT]>]>
+      ]>;
+
+def HexagonModel : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItineraries;
+  let LoadLatency = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// V4 Machine Info +
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV4.td"
+
+//===----------------------------------------------------------------------===//
+// V4 Machine Info -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
new file mode 100644
index 000000000000..ef72cf4068bf
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -0,0 +1,74 @@
+//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// There are four SLOTS (four parallel pipelines) in Hexagon V4 machine.
+// This file describes that machine information.
+
+//
+//    |===========|==================================================|
+//    | PIPELINE  |              Instruction Classes                 |
+//    |===========|==================================================|
+//    | SLOT0     |  LD       ST    ALU32     MEMOP     NV    SYSTEM |
+//    |-----------|--------------------------------------------------|
+//    | SLOT1     |  LD       ST    ALU32                            |
+//    |-----------|--------------------------------------------------|
+//    | SLOT2     |  XTYPE          ALU32     J         JR           |
+//    |-----------|--------------------------------------------------|
+//    | SLOT3     |  XTYPE          ALU32     J         CR           |
+//    |===========|==================================================|
+
+// Functional Units.
+def SLOT0       : FuncUnit;
+def SLOT1       : FuncUnit;
+def SLOT2       : FuncUnit;
+def SLOT3       : FuncUnit;
+// Endloop is a pseudo instruction that is encoded with 2 bits in a packet
+// rather than taking an execution slot. This special unit is needed
+// to schedule an ENDLOOP with 4 other instructions.
+def SLOT_ENDLOOP: FuncUnit;
+
+// Itinerary classes.
+def NV_V4       : InstrItinClass;
+def MEM_V4      : InstrItinClass;
+// ALU64/M/S Instruction classes of V2 are collectively knownn as XTYPE in V4.
+def PREFIX      : InstrItinClass;
+
+def HexagonItinerariesV4 :
+      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
+        InstrItinData<ALU32  , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64  , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<CR     , [InstrStage<1, [SLOT3]>]>,
+        InstrItinData<J      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<JR     , [InstrStage<1, [SLOT2]>]>,
+        InstrItinData<LD     , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<LD0    , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<M      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ST     , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<ST0    , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<S      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<SYS    , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<NV_V4  , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<ENDLOOP, [InstrStage<1, [SLOT_ENDLOOP]>]>,
+        InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                InstrStage<1, [SLOT2, SLOT3]>]>
+      ]>;
+
+def HexagonModelV4 : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItinerariesV4;
+  let LoadLatency = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V4 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectCCInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonSelectCCInfo.td
new file mode 100644
index 000000000000..d8feb89c0ab5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectCCInfo.td
@@ -0,0 +1,121 @@
+//===-- HexagoSelectCCInfo.td - Selectcc mappings ----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+
+//
+// selectcc mappings.
+//
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETEQ)),
+      (i32 (MUX_rr (i1 (CMPEQrr IntRegs:$lhs, IntRegs:$rhs)),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETNE)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPEQrr IntRegs:$lhs, IntRegs:$rhs))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETGT)),
+      (i32 (MUX_rr (i1 (CMPGTrr IntRegs:$lhs, IntRegs:$rhs)),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETUGT)),
+      (i32 (MUX_rr (i1 (CMPGTUrr IntRegs:$lhs, IntRegs:$rhs)),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETULT)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPGTUrr IntRegs:$lhs,
+                                         (ADD_ri IntRegs:$rhs, -1)))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETLT)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPGTrr IntRegs:$lhs,
+                                        (ADD_ri IntRegs:$rhs, -1)))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETLE)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPGTrr IntRegs:$lhs, IntRegs:$rhs))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETULE)),
+      (i32 (MUX_rr (i1 (NOT_p (CMPGTUrr IntRegs:$lhs, IntRegs:$rhs))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+
+//
+// selectcc mappings for greater-equal-to Rs => greater-than Rs-1.
+//
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETGE)),
+      (i32 (MUX_rr (i1 (CMPGTrr IntRegs:$lhs, (ADD_ri IntRegs:$rhs, -1))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc IntRegs:$lhs, IntRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETUGE)),
+      (i32 (MUX_rr (i1 (CMPGTUrr IntRegs:$lhs, (ADD_ri IntRegs:$rhs, -1))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+
+
+//
+// selectcc mappings for predicate comparisons.
+//
+// Convert Rd = selectcc(p0, p1, true_val, false_val, SETEQ) into:
+//  pt = not(p1 xor p2)
+//  Rd = mux(pt, true_val, false_val)
+// and similarly for SETNE
+//
+def : Pat <(i32 (selectcc PredRegs:$lhs, PredRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETNE)),
+      (i32 (MUX_rr (i1 (XOR_pp PredRegs:$lhs, PredRegs:$rhs)), IntRegs:$tval,
+                   IntRegs:$fval))>;
+
+def : Pat <(i32 (selectcc PredRegs:$lhs, PredRegs:$rhs, IntRegs:$tval,
+                          IntRegs:$fval, SETEQ)),
+      (i32 (MUX_rr (i1 (NOT_p (XOR_pp PredRegs:$lhs, PredRegs:$rhs))),
+                   IntRegs:$tval, IntRegs:$fval))>;
+
+
+//
+// selectcc mappings for 64-bit operands are messy. Hexagon does not have a
+// MUX64 o, use this:
+// selectcc(Rss, Rdd, tval, fval, cond) ->
+//   combine(mux(cmp_cond(Rss, Rdd), tval.hi, fval.hi),
+//           mux(cmp_cond(Rss, Rdd), tval.lo, fval.lo))
+
+// setgt-64.
+def : Pat<(i64 (selectcc DoubleRegs:$lhs, DoubleRegs:$rhs, DoubleRegs:$tval,
+                         DoubleRegs:$fval, SETGT)),
+      (COMBINE_rr (MUX_rr (CMPGT64rr DoubleRegs:$lhs, DoubleRegs:$rhs),
+                           (EXTRACT_SUBREG DoubleRegs:$tval, subreg_hireg),
+                           (EXTRACT_SUBREG DoubleRegs:$fval, subreg_hireg)),
+                   (MUX_rr (CMPGT64rr DoubleRegs:$lhs, DoubleRegs:$rhs),
+                           (EXTRACT_SUBREG DoubleRegs:$tval, subreg_loreg),
+                           (EXTRACT_SUBREG DoubleRegs:$fval, subreg_loreg)))>;
+
+
+// setlt-64 -> setgt-64.
+def : Pat<(i64 (selectcc DoubleRegs:$lhs, DoubleRegs:$rhs, DoubleRegs:$tval,
+                         DoubleRegs:$fval, SETLT)),
+      (COMBINE_rr (MUX_rr (CMPGT64rr DoubleRegs:$lhs,
+                                     (ADD64_rr DoubleRegs:$rhs, (TFRI64 -1))),
+                           (EXTRACT_SUBREG DoubleRegs:$tval, subreg_hireg),
+                           (EXTRACT_SUBREG DoubleRegs:$fval, subreg_hireg)),
+                   (MUX_rr (CMPGT64rr DoubleRegs:$lhs,
+                                      (ADD64_rr DoubleRegs:$rhs, (TFRI64 -1))),
+                           (EXTRACT_SUBREG DoubleRegs:$tval, subreg_loreg),
+                           (EXTRACT_SUBREG DoubleRegs:$fval, subreg_loreg)))>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..a52c604505b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -0,0 +1,46 @@
+//===-- HexagonSelectionDAGInfo.cpp - Hexagon SelectionDAG Info -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the HexagonSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-selectiondag-info"
+#include "HexagonTargetMachine.h"
+using namespace llvm;
+
+bool llvm::flag_aligned_memcpy;
+
+HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const HexagonTargetMachine
+                                                 &TM)
+  : TargetSelectionDAGInfo(TM) {
+}
+
+HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() {
+}
+
+SDValue
+HexagonSelectionDAGInfo::
+EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, SDValue Chain,
+                        SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
+                        bool isVolatile, bool AlwaysInline,
+                        MachinePointerInfo DstPtrInfo,
+                        MachinePointerInfo SrcPtrInfo) const {
+  flag_aligned_memcpy = false;
+  if ((Align & 0x3) == 0) {
+    ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+    if (ConstantSize) {
+      uint64_t SizeVal = ConstantSize->getZExtValue();
+      if ((SizeVal > 32) && ((SizeVal % 8) == 0))
+        flag_aligned_memcpy = true;
+    }
+  }
+
+  return SDValue();
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
new file mode 100644
index 000000000000..0673e4d35472
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -0,0 +1,40 @@
+//===-- HexagonSelectionDAGInfo.h - Hexagon SelectionDAG Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Hexagon subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonSELECTIONDAGINFO_H
+#define HexagonSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class HexagonTargetMachine;
+
+class HexagonSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit HexagonSelectionDAGInfo(const HexagonTargetMachine &TM);
+  ~HexagonSelectionDAGInfo();
+
+  virtual
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                  SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool isVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
new file mode 100644
index 000000000000..814249fa6832
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
@@ -0,0 +1,216 @@
+//===-- HexagonSplitTFRCondSets.cpp - split TFR condsets into xfers -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//
+//===----------------------------------------------------------------------===//
+// This pass tries to provide opportunities for better optimization of muxes.
+// The default code generated for something like: flag = (a == b) ? 1 : 3;
+// would be:
+//
+//   {p0 = cmp.eq(r0,r1)}
+//   {r3 = mux(p0,#1,#3)}
+//
+// This requires two packets.  If we use .new predicated immediate transfers,
+// then we can do this in a single packet, e.g.:
+//
+//   {p0 = cmp.eq(r0,r1)
+//    if (p0.new) r3 = #1
+//    if (!p0.new) r3 = #3}
+//
+// Note that the conditional assignments are not generated in .new form here.
+// We assume opptimisically that they will be formed later.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "xfer"
+#include "Hexagon.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class HexagonSplitTFRCondSets : public MachineFunctionPass {
+    HexagonTargetMachine& QTM;
+    const HexagonSubtarget &QST;
+
+ public:
+    static char ID;
+    HexagonSplitTFRCondSets(HexagonTargetMachine& TM) :
+      MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {}
+
+    const char *getPassName() const {
+      return "Hexagon Split TFRCondSets";
+    }
+    bool runOnMachineFunction(MachineFunction &Fn);
+};
+
+
+char HexagonSplitTFRCondSets::ID = 0;
+
+
+bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
+
+  const TargetInstrInfo *TII = QTM.getInstrInfo();
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+    // Traverse the basic block.
+    for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
+         ++MII) {
+      MachineInstr *MI = MII;
+      int Opc1, Opc2;
+      switch(MI->getOpcode()) {
+        case Hexagon::TFR_condset_rr:
+        case Hexagon::TFR_condset_rr_f:
+        case Hexagon::TFR_condset_rr64_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(2).getReg();
+          int SrcReg2 = MI->getOperand(3).getReg();
+
+          if (MI->getOpcode() == Hexagon::TFR_condset_rr ||
+              MI->getOpcode() == Hexagon::TFR_condset_rr_f) {
+            Opc1 = Hexagon::TFR_cPt;
+            Opc2 = Hexagon::TFR_cNotPt;
+          }
+          else if (MI->getOpcode() == Hexagon::TFR_condset_rr64_f) {
+            Opc1 = Hexagon::TFR64_cPt;
+            Opc2 = Hexagon::TFR64_cNotPt;
+          }
+
+          // Minor optimization: do not emit the predicated copy if the source
+          // and the destination is the same register.
+          if (DestReg != SrcReg1) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc1),
+                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+          }
+          if (DestReg != SrcReg2) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(), TII->get(Opc2),
+                    DestReg).addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ri:
+        case Hexagon::TFR_condset_ri_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(2).getReg();
+
+          //  Do not emit the predicated copy if the source and the destination
+          // is the same register.
+          if (DestReg != SrcReg1) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFR_cPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).addReg(SrcReg1);
+          }
+          if (MI->getOpcode() ==  Hexagon::TFR_condset_ri ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFRI_cNotPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addImm(MI->getOperand(3).getImm());
+          } else if (MI->getOpcode() ==  Hexagon::TFR_condset_ri_f ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addFPImm(MI->getOperand(3).getFPImm());
+          }
+
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ir:
+        case Hexagon::TFR_condset_ir_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg2 = MI->getOperand(3).getReg();
+
+          if (MI->getOpcode() ==  Hexagon::TFR_condset_ir ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFRI_cPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addImm(MI->getOperand(2).getImm());
+          } else if (MI->getOpcode() ==  Hexagon::TFR_condset_ir_f ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFRI_cPt_f), DestReg).
+              addReg(MI->getOperand(1).getReg()).
+              addFPImm(MI->getOperand(2).getFPImm());
+          }
+
+          // Do not emit the predicated copy if the source and
+          // the destination is the same register.
+          if (DestReg != SrcReg2) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+              TII->get(Hexagon::TFR_cNotPt), DestReg).
+              addReg(MI->getOperand(1).getReg()).addReg(SrcReg2);
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+        case Hexagon::TFR_condset_ii:
+        case Hexagon::TFR_condset_ii_f: {
+          int DestReg = MI->getOperand(0).getReg();
+          int SrcReg1 = MI->getOperand(1).getReg();
+
+          if (MI->getOpcode() ==  Hexagon::TFR_condset_ii ) {
+            int Immed1 = MI->getOperand(2).getImm();
+            int Immed2 = MI->getOperand(3).getImm();
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt),
+                    DestReg).addReg(SrcReg1).addImm(Immed1);
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt),
+                    DestReg).addReg(SrcReg1).addImm(Immed2);
+          } else if (MI->getOpcode() ==  Hexagon::TFR_condset_ii_f ) {
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cPt_f), DestReg).
+                    addReg(SrcReg1).
+                    addFPImm(MI->getOperand(2).getFPImm());
+            BuildMI(*MBB, MII, MI->getDebugLoc(),
+                    TII->get(Hexagon::TFRI_cNotPt_f), DestReg).
+                    addReg(SrcReg1).
+                    addFPImm(MI->getOperand(3).getFPImm());
+          }
+          MII = MBB->erase(MI);
+          --MII;
+          break;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonSplitTFRCondSets(HexagonTargetMachine &TM) {
+  return new HexagonSplitTFRCondSets(TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
new file mode 100644
index 000000000000..07d5ce1d8ab0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -0,0 +1,88 @@
+//===-- HexagonSubtarget.cpp - Hexagon Subtarget Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Hexagon specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonSubtarget.h"
+#include "Hexagon.h"
+#include "HexagonRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "HexagonGenSubtargetInfo.inc"
+
+static cl::opt<bool>
+EnableV3("enable-hexagon-v3", cl::Hidden,
+         cl::desc("Enable Hexagon V3 instructions."));
+
+static cl::opt<bool>
+EnableMemOps(
+    "enable-hexagon-memops",
+    cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true),
+    cl::desc(
+      "Generate V4 MEMOP in code generation for Hexagon target"));
+
+static cl::opt<bool>
+DisableMemOps(
+    "disable-hexagon-memops",
+    cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false),
+    cl::desc(
+      "Do not generate V4 MEMOP in code generation for Hexagon target"));
+
+static cl::opt<bool>
+EnableIEEERndNear(
+    "enable-hexagon-ieee-rnd-near",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Generate non-chopped conversion from fp to int."));
+
+HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
+  HexagonGenSubtargetInfo(TT, CPU, FS),
+  CPUString(CPU.str()) {
+
+  // If the programmer has not specified a Hexagon version, default to -mv4.
+  if (CPUString.empty())
+    CPUString = "hexagonv4";
+
+  if (CPUString == "hexagonv2") {
+    HexagonArchVersion = V2;
+  } else if (CPUString == "hexagonv3") {
+    EnableV3 = true;
+    HexagonArchVersion = V3;
+  } else if (CPUString == "hexagonv4") {
+    HexagonArchVersion = V4;
+  } else if (CPUString == "hexagonv5") {
+    HexagonArchVersion = V5;
+  } else {
+    llvm_unreachable("Unrecognized Hexagon processor version");
+  }
+
+  ParseSubtargetFeatures(CPUString, FS);
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUString);
+
+  // UseMemOps on by default unless disabled explicitly
+  if (DisableMemOps)
+    UseMemOps = false;
+  else if (EnableMemOps)
+    UseMemOps = true;
+  else
+    UseMemOps = false;
+
+  if (EnableIEEERndNear)
+    ModeIEEERndNear = true;
+  else
+    ModeIEEERndNear = false;
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
new file mode 100644
index 000000000000..76a8fba195f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -0,0 +1,80 @@
+//===-- HexagonSubtarget.h - Define Subtarget for the Hexagon ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef Hexagon_SUBTARGET_H
+#define Hexagon_SUBTARGET_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "HexagonGenSubtargetInfo.inc"
+
+#define Hexagon_SMALL_DATA_THRESHOLD 8
+#define Hexagon_SLOTS 4
+
+namespace llvm {
+
+class HexagonSubtarget : public HexagonGenSubtargetInfo {
+
+  bool UseMemOps;
+  bool ModeIEEERndNear;
+
+public:
+  enum HexagonArchEnum {
+    V1, V2, V3, V4, V5
+  };
+
+  HexagonArchEnum HexagonArchVersion;
+  std::string CPUString;
+  InstrItineraryData InstrItins;
+
+public:
+  HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS);
+
+  /// getInstrItins - Return the instruction itineraies based on subtarget
+  /// selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool hasV2TOps () const { return HexagonArchVersion >= V2; }
+  bool hasV2TOpsOnly () const { return HexagonArchVersion == V2; }
+  bool hasV3TOps () const { return HexagonArchVersion >= V3; }
+  bool hasV3TOpsOnly () const { return HexagonArchVersion == V3; }
+  bool hasV4TOps () const { return HexagonArchVersion >= V4; }
+  bool hasV4TOpsOnly () const { return HexagonArchVersion == V4; }
+  bool useMemOps () const { return HexagonArchVersion >= V4 && UseMemOps; }
+  bool hasV5TOps () const { return HexagonArchVersion >= V5; }
+  bool hasV5TOpsOnly () const { return HexagonArchVersion == V5; }
+  bool modeIEEERndNear () const { return ModeIEEERndNear; }
+
+  bool isSubtargetV2() const { return HexagonArchVersion == V2;}
+  const std::string &getCPUString () const { return CPUString; }
+
+  // Threshold for small data section
+  unsigned getSmallDataThreshold() const {
+    return Hexagon_SMALL_DATA_THRESHOLD;
+  }
+  const HexagonArchEnum &getHexagonArchVersion() const {
+    return  HexagonArchVersion;
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
new file mode 100644
index 000000000000..ce45c626f799
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -0,0 +1,180 @@
+//===-- HexagonTargetMachine.cpp - Define TargetMachine for Hexagon -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Hexagon target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetMachine.h"
+#include "Hexagon.h"
+#include "HexagonISelLowering.h"
+#include "HexagonMachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+static cl::
+opt<bool> DisableHardwareLoops(
+                        "disable-hexagon-hwloops", cl::Hidden,
+                        cl::desc("Disable Hardware Loops for Hexagon target"));
+
+static cl::
+opt<bool> DisableHexagonMISched("disable-hexagon-misched",
+                                cl::Hidden, cl::ZeroOrMore, cl::init(false),
+                                cl::desc("Disable Hexagon MI Scheduling"));
+
+static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt",
+    cl::Hidden, cl::ZeroOrMore, cl::init(false),
+    cl::desc("Disable Hexagon CFG Optimization"));
+
+/// HexagonTargetMachineModule - Note that this is used on hosts that
+/// cannot link in a library unless there are references into the
+/// library.  In particular, it seems that it is not possible to get
+/// things to work on Win32 without this.  Though it is unused, do not
+/// remove it.
+extern "C" int HexagonTargetMachineModule;
+int HexagonTargetMachineModule = 0;
+
+extern "C" void LLVMInitializeHexagonTarget() {
+  // Register the target.
+  RegisterTargetMachine<HexagonTargetMachine> X(TheHexagonTarget);
+}
+
+static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
+  return new VLIWMachineScheduler(C, new ConvergingVLIWScheduler());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
+                    createVLIWMachineSched);
+
+/// HexagonTargetMachine ctor - Create an ILP32 architecture model.
+///
+
+/// Hexagon_TODO: Do I need an aggregate alignment?
+///
+HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
+                                           StringRef CPU, StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    DL("e-p:32:32:32-"
+                "i64:64:64-i32:32:32-i16:16:16-i1:32:32-"
+                "f64:64:64-f32:32:32-a0:0-n32") ,
+    Subtarget(TT, CPU, FS), InstrInfo(Subtarget), TLInfo(*this),
+    TSInfo(*this),
+    FrameLowering(Subtarget),
+    InstrItins(&Subtarget.getInstrItineraryData()) {
+    setMCUseCFI(false);
+}
+
+// addPassesForOptimizations - Allow the backend (target) to add Target
+// Independent Optimization passes to the Pass Manager.
+bool HexagonTargetMachine::addPassesForOptimizations(PassManagerBase &PM) {
+  if (getOptLevel() != CodeGenOpt::None) {
+    PM.add(createConstantPropagationPass());
+    PM.add(createLoopSimplifyPass());
+    PM.add(createDeadCodeEliminationPass());
+    PM.add(createConstantPropagationPass());
+    PM.add(createLoopUnrollPass());
+    PM.add(createLoopStrengthReducePass());
+  }
+  return true;
+}
+
+namespace {
+/// Hexagon Code Generator Pass Configuration Options.
+class HexagonPassConfig : public TargetPassConfig {
+public:
+  HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {
+    // Enable MI scheduler.
+    if (!DisableHexagonMISched) {
+      enablePass(&MachineSchedulerID);
+      MachineSchedRegistry::setDefault(createVLIWMachineSched);
+    }
+  }
+
+  HexagonTargetMachine &getHexagonTargetMachine() const {
+    return getTM<HexagonTargetMachine>();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addPreRegAlloc();
+  virtual bool addPostRegAlloc();
+  virtual bool addPreSched2();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new HexagonPassConfig(this, PM);
+}
+
+bool HexagonPassConfig::addInstSelector() {
+
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createHexagonRemoveExtendOps(getHexagonTargetMachine()));
+
+  addPass(createHexagonISelDag(getHexagonTargetMachine(), getOptLevel()));
+
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createHexagonPeephole());
+
+  return false;
+}
+
+
+bool HexagonPassConfig::addPreRegAlloc() {
+  if (!DisableHardwareLoops && getOptLevel() != CodeGenOpt::None)
+    addPass(createHexagonHardwareLoops());
+  return false;
+}
+
+bool HexagonPassConfig::addPostRegAlloc() {
+  if (!DisableHexagonCFGOpt && getOptLevel() != CodeGenOpt::None)
+    addPass(createHexagonCFGOptimizer(getHexagonTargetMachine()));
+  return true;
+}
+
+
+bool HexagonPassConfig::addPreSched2() {
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(&IfConverterID);
+  return true;
+}
+
+bool HexagonPassConfig::addPreEmitPass() {
+
+  if (!DisableHardwareLoops && getOptLevel() != CodeGenOpt::None)
+    addPass(createHexagonFixupHwLoops());
+
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createHexagonNewValueJump());
+
+  // Expand Spill code for predicate registers.
+  addPass(createHexagonExpandPredSpillCode(getHexagonTargetMachine()));
+
+  // Split up TFRcondsets into conditional transfers.
+  addPass(createHexagonSplitTFRCondSets(getHexagonTargetMachine()));
+
+  // Create Packets.
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createHexagonPacketizer());
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
new file mode 100644
index 000000000000..cf8f9aa3612f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -0,0 +1,83 @@
+//=-- HexagonTargetMachine.h - Define TargetMachine for Hexagon ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Hexagon specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonTARGETMACHINE_H
+#define HexagonTARGETMACHINE_H
+
+#include "HexagonFrameLowering.h"
+#include "HexagonISelLowering.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonSelectionDAGInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class Module;
+
+class HexagonTargetMachine : public LLVMTargetMachine {
+  const DataLayout DL;       // Calculates type size & alignment.
+  HexagonSubtarget Subtarget;
+  HexagonInstrInfo InstrInfo;
+  HexagonTargetLowering TLInfo;
+  HexagonSelectionDAGInfo TSInfo;
+  HexagonFrameLowering FrameLowering;
+  const InstrItineraryData* InstrItins;
+
+public:
+  HexagonTargetMachine(const Target &T, StringRef TT,StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+
+  virtual const HexagonInstrInfo *getInstrInfo() const {
+    return &InstrInfo;
+  }
+  virtual const HexagonSubtarget *getSubtargetImpl() const {
+    return &Subtarget;
+  }
+  virtual const HexagonRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  virtual const InstrItineraryData* getInstrItineraryData() const {
+    return InstrItins;
+  }
+
+
+  virtual const HexagonTargetLowering* getTargetLowering() const {
+    return &TLInfo;
+  }
+
+  virtual const HexagonFrameLowering* getFrameLowering() const {
+    return &FrameLowering;
+  }
+
+  virtual const HexagonSelectionDAGInfo* getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+
+  virtual const DataLayout       *getDataLayout() const { return &DL; }
+  static unsigned getModuleMatchQuality(const Module &M);
+
+  // Pass Pipeline Configuration.
+  virtual bool addPassesForOptimizations(PassManagerBase &PM);
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+};
+
+extern bool flag_aligned_memcpy;
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
new file mode 100644
index 000000000000..993fcfaed43e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -0,0 +1,94 @@
+//===-- HexagonTargetObjectFile.cpp - Hexagon asm properties --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the HexagonTargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonTargetObjectFile.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+
+static cl::opt<int> SmallDataThreshold("hexagon-small-data-threshold",
+                                cl::init(8), cl::Hidden);
+
+void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
+                                         const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+
+  SmallDataSection =
+    getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
+                               ELF::SHF_WRITE | ELF::SHF_ALLOC,
+                               SectionKind::getDataRel());
+  SmallBSSSection =
+    getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                               ELF::SHF_WRITE | ELF::SHF_ALLOC,
+                               SectionKind::getBSS());
+}
+
+// sdata/sbss support taken largely from the MIPS Backend.
+static bool IsInSmallSection(uint64_t Size) {
+  return Size > 0 && Size <= (uint64_t)SmallDataThreshold;
+}
+/// IsGlobalInSmallSection - Return true if this global value should be
+/// placed into small data/bss section.
+bool HexagonTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
+                                                const TargetMachine &TM) const {
+  // If the primary definition of this global value is outside the current
+  // translation unit or the global value is available for inspection but not
+  // emission, then do nothing.
+  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+    return false;
+
+  // Otherwise, Check if GV should be in sdata/sbss, when normally it would end
+  // up in getKindForGlobal(GV, TM).
+  return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
+}
+
+/// IsGlobalInSmallSection - Return true if this global value should be
+/// placed into small data/bss section.
+bool HexagonTargetObjectFile::
+IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
+                       SectionKind Kind) const {
+  // Only global variables, not functions.
+  const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+  if (!GVA)
+    return false;
+
+  if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) {
+    Type *Ty = GV->getType()->getElementType();
+    return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
+  }
+
+  return false;
+}
+
+const MCSection *HexagonTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+
+  // Handle Small Section classification here.
+  if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallBSSSection;
+  if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
new file mode 100644
index 000000000000..693345081ee3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -0,0 +1,40 @@
+//===-- HexagonTargetAsmInfo.h - Hexagon asm properties --------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonTARGETOBJECTFILE_H
+#define HexagonTARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+
+  class HexagonTargetObjectFile : public TargetLoweringObjectFileELF {
+    const MCSectionELF *SmallDataSection;
+    const MCSectionELF *SmallBSSSection;
+  public:
+    virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+    /// IsGlobalInSmallSection - Return true if this global address should be
+    /// placed into small data/bss section.
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM,
+                                SectionKind Kind) const;
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM) const;
+
+    const MCSection* SelectSectionForGlobal(const GlobalValue *GV,
+                                            SectionKind Kind,
+                                            Mangler *Mang,
+                                            const TargetMachine &TM) const;
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
new file mode 100644
index 000000000000..c0d86da1c05e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -0,0 +1,3086 @@
+//===----- HexagonPacketizer.cpp - vliw packetizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a simple VLIW packetizer using DFA. The packetizer works on
+// machine basic blocks. For each instruction I in BB, the packetizer consults
+// the DFA to see if machine resources are available to execute I. If so, the
+// packetizer checks if I depends on any instruction J in the current packet.
+// If no dependency is found, I is added to current packet and machine resource
+// is marked as taken. If any dependency is found, a target API call is made to
+// prune the dependence.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "packets"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "Hexagon.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonMachineFunctionInfo.h"
+
+#include <map>
+
+using namespace llvm;
+
+namespace {
+  class HexagonPacketizer : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    HexagonPacketizer() : MachineFunctionPass(ID) {}
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    const char *getPassName() const {
+      return "Hexagon Packetizer";
+    }
+
+    bool runOnMachineFunction(MachineFunction &Fn);
+  };
+  char HexagonPacketizer::ID = 0;
+
+  class HexagonPacketizerList : public VLIWPacketizerList {
+
+  private:
+
+    // Has the instruction been promoted to a dot-new instruction.
+    bool PromotedToDotNew;
+
+    // Has the instruction been glued to allocframe.
+    bool GlueAllocframeStore;
+
+    // Has the feeder instruction been glued to new value jump.
+    bool GlueToNewValueJump;
+
+    // Check if there is a dependence between some instruction already in this
+    // packet and this instruction.
+    bool Dependence;
+
+    // Only check for dependence if there are resources available to
+    // schedule this instruction.
+    bool FoundSequentialDependence;
+
+  public:
+    // Ctor.
+    HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
+                          MachineDominatorTree &MDT);
+
+    // initPacketizerState - initialize some internal flags.
+    void initPacketizerState();
+
+    // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+    bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB);
+
+    // isSoloInstruction - return true if instruction MI can not be packetized
+    // with any other instruction, which means that MI itself is a packet.
+    bool isSoloInstruction(MachineInstr *MI);
+
+    // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
+    // together.
+    bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ);
+
+    // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
+    // and SUJ.
+    bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ);
+
+    MachineBasicBlock::iterator addToPacket(MachineInstr *MI);
+  private:
+    bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg);
+    bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
+                    MachineBasicBlock::iterator &MII,
+                    const TargetRegisterClass* RC);
+    bool CanPromoteToDotNew(MachineInstr* MI, SUnit* PacketSU,
+                    unsigned DepReg,
+                    std::map <MachineInstr*, SUnit*> MIToSUnit,
+                    MachineBasicBlock::iterator &MII,
+                    const TargetRegisterClass* RC);
+    bool CanPromoteToNewValue(MachineInstr* MI, SUnit* PacketSU,
+                    unsigned DepReg,
+                    std::map <MachineInstr*, SUnit*> MIToSUnit,
+                    MachineBasicBlock::iterator &MII);
+    bool CanPromoteToNewValueStore(MachineInstr* MI, MachineInstr* PacketMI,
+                    unsigned DepReg,
+                    std::map <MachineInstr*, SUnit*> MIToSUnit);
+    bool DemoteToDotOld(MachineInstr* MI);
+    bool ArePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2,
+                    std::map <MachineInstr*, SUnit*> MIToSUnit);
+    bool RestrictingDepExistInPacket(MachineInstr*,
+                    unsigned, std::map <MachineInstr*, SUnit*>);
+    bool isNewifiable(MachineInstr* MI);
+    bool isCondInst(MachineInstr* MI);
+    bool IsNewifyStore (MachineInstr* MI);
+    bool tryAllocateResourcesForConstExt(MachineInstr* MI);
+    bool canReserveResourcesForConstExt(MachineInstr *MI);
+    void reserveResourcesForConstExt(MachineInstr* MI);
+    bool isNewValueInst(MachineInstr* MI);
+  };
+}
+
+// HexagonPacketizerList Ctor.
+HexagonPacketizerList::HexagonPacketizerList(
+  MachineFunction &MF, MachineLoopInfo &MLI,MachineDominatorTree &MDT)
+  : VLIWPacketizerList(MF, MLI, MDT, true){
+}
+
+bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+
+  // Instantiate the packetizer.
+  HexagonPacketizerList Packetizer(Fn, MLI, MDT);
+
+  // DFA state table should not be empty.
+  assert(Packetizer.getResourceTracker() && "Empty DFA table!");
+
+  //
+  // Loop over all basic blocks and remove KILL pseudo-instructions
+  // These instructions confuse the dependence analysis. Consider:
+  // D0 = ...   (Insn 0)
+  // R0 = KILL R0, D0 (Insn 1)
+  // R0 = ... (Insn 2)
+  // Here, Insn 1 will result in the dependence graph not emitting an output
+  // dependence between Insn 0 and Insn 2. This can lead to incorrect
+  // packetization
+  //
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock::iterator End = MBB->end();
+    MachineBasicBlock::iterator MI = MBB->begin();
+    while (MI != End) {
+      if (MI->isKill()) {
+        MachineBasicBlock::iterator DeleteMI = MI;
+        ++MI;
+        MBB->erase(DeleteMI);
+        End = MBB->end();
+        continue;
+      }
+      ++MI;
+    }
+  }
+
+  // Loop over all of the basic blocks.
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    // Find scheduling regions and schedule / packetize each region.
+    unsigned RemainingCount = MBB->size();
+    for(MachineBasicBlock::iterator RegionEnd = MBB->end();
+        RegionEnd != MBB->begin();) {
+      // The next region starts above the previous region. Look backward in the
+      // instruction stream until we find the nearest boundary.
+      MachineBasicBlock::iterator I = RegionEnd;
+      for(;I != MBB->begin(); --I, --RemainingCount) {
+        if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn))
+          break;
+      }
+      I = MBB->begin();
+
+      // Skip empty scheduling regions.
+      if (I == RegionEnd) {
+        RegionEnd = llvm::prior(RegionEnd);
+        --RemainingCount;
+        continue;
+      }
+      // Skip regions with one instruction.
+      if (I == llvm::prior(RegionEnd)) {
+        RegionEnd = llvm::prior(RegionEnd);
+        continue;
+      }
+
+      Packetizer.PacketizeMIs(MBB, I, RegionEnd);
+      RegionEnd = I;
+    }
+  }
+
+  return true;
+}
+
+
+static bool IsIndirectCall(MachineInstr* MI) {
+  return ((MI->getOpcode() == Hexagon::CALLR) ||
+          (MI->getOpcode() == Hexagon::CALLRv3));
+}
+
+// Reserve resources for constant extender. Trigure an assertion if
+// reservation fail.
+void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
+                                                  MI->getDebugLoc());
+
+  if (ResourceTracker->canReserveResources(PseudoMI)) {
+    ResourceTracker->reserveResources(PseudoMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+  } else {
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+    llvm_unreachable("can not reserve resources for constant extender.");
+  }
+  return;
+}
+
+bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  assert((QII->isExtended(MI) || QII->isConstExtended(MI)) &&
+         "Should only be called for constant extended instructions");
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
+                                                  MI->getDebugLoc());
+  bool CanReserve = ResourceTracker->canReserveResources(PseudoMI);
+  MF->DeleteMachineInstr(PseudoMI);
+  return CanReserve;
+}
+
+// Allocate resources (i.e. 4 bytes) for constant extender. If succeed, return
+// true, otherwise, return false.
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  MachineFunction *MF = MI->getParent()->getParent();
+  MachineInstr *PseudoMI = MF->CreateMachineInstr(QII->get(Hexagon::IMMEXT_i),
+                                                  MI->getDebugLoc());
+
+  if (ResourceTracker->canReserveResources(PseudoMI)) {
+    ResourceTracker->reserveResources(PseudoMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+    return true;
+  } else {
+    MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI);
+    return false;
+  }
+}
+
+
+bool HexagonPacketizerList::IsCallDependent(MachineInstr* MI,
+                                          SDep::Kind DepType,
+                                          unsigned DepReg) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  const HexagonRegisterInfo* QRI =
+              (const HexagonRegisterInfo *) TM.getRegisterInfo();
+
+  // Check for lr dependence
+  if (DepReg == QRI->getRARegister()) {
+    return true;
+  }
+
+  if (QII->isDeallocRet(MI)) {
+    if (DepReg == QRI->getFrameRegister() ||
+        DepReg == QRI->getStackRegister())
+      return true;
+  }
+
+  // Check if this is a predicate dependence
+  const TargetRegisterClass* RC = QRI->getMinimalPhysRegClass(DepReg);
+  if (RC == &Hexagon::PredRegsRegClass) {
+    return true;
+  }
+
+  //
+  // Lastly check for an operand used in an indirect call
+  // If we had an attribute for checking if an instruction is an indirect call,
+  // then we could have avoided this relatively brittle implementation of
+  // IsIndirectCall()
+  //
+  // Assumes that the first operand of the CALLr is the function address
+  //
+  if (IsIndirectCall(MI) && (DepType == SDep::Data)) {
+    MachineOperand MO = MI->getOperand(0);
+    if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static bool IsRegDependence(const SDep::Kind DepType) {
+  return (DepType == SDep::Data || DepType == SDep::Anti ||
+          DepType == SDep::Output);
+}
+
+static bool IsDirectJump(MachineInstr* MI) {
+  return (MI->getOpcode() == Hexagon::JMP);
+}
+
+static bool IsSchedBarrier(MachineInstr* MI) {
+  switch (MI->getOpcode()) {
+  case Hexagon::BARRIER:
+    return true;
+  }
+  return false;
+}
+
+static bool IsControlFlow(MachineInstr* MI) {
+  return (MI->getDesc().isTerminator() || MI->getDesc().isCall());
+}
+
+// Function returns true if an instruction can be promoted to the new-value
+// store. It will always return false for v2 and v3.
+// It lists all the conditional and unconditional stores that can be promoted
+// to the new-value stores.
+
+bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) {
+  const HexagonRegisterInfo* QRI =
+                          (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  switch (MI->getOpcode())
+  {
+    // store byte
+    case Hexagon::STrib:
+    case Hexagon::STrib_indexed:
+    case Hexagon::STrib_indexed_shl_V4:
+    case Hexagon::STrib_shl_V4:
+    case Hexagon::STb_GP_V4:
+    case Hexagon::POST_STbri:
+    case Hexagon::STrib_cPt:
+    case Hexagon::STrib_cdnPt_V4:
+    case Hexagon::STrib_cNotPt:
+    case Hexagon::STrib_cdnNotPt_V4:
+    case Hexagon::STrib_indexed_cPt:
+    case Hexagon::STrib_indexed_cdnPt_V4:
+    case Hexagon::STrib_indexed_cNotPt:
+    case Hexagon::STrib_indexed_cdnNotPt_V4:
+    case Hexagon::STrib_indexed_shl_cPt_V4:
+    case Hexagon::STrib_indexed_shl_cdnPt_V4:
+    case Hexagon::STrib_indexed_shl_cNotPt_V4:
+    case Hexagon::STrib_indexed_shl_cdnNotPt_V4:
+    case Hexagon::POST_STbri_cPt:
+    case Hexagon::POST_STbri_cdnPt_V4:
+    case Hexagon::POST_STbri_cNotPt:
+    case Hexagon::POST_STbri_cdnNotPt_V4:
+    case Hexagon::STb_GP_cPt_V4:
+    case Hexagon::STb_GP_cNotPt_V4:
+    case Hexagon::STb_GP_cdnPt_V4:
+    case Hexagon::STb_GP_cdnNotPt_V4:
+
+    // store halfword
+    case Hexagon::STrih:
+    case Hexagon::STrih_indexed:
+    case Hexagon::STrih_indexed_shl_V4:
+    case Hexagon::STrih_shl_V4:
+    case Hexagon::STh_GP_V4:
+    case Hexagon::POST_SThri:
+    case Hexagon::STrih_cPt:
+    case Hexagon::STrih_cdnPt_V4:
+    case Hexagon::STrih_cNotPt:
+    case Hexagon::STrih_cdnNotPt_V4:
+    case Hexagon::STrih_indexed_cPt:
+    case Hexagon::STrih_indexed_cdnPt_V4:
+    case Hexagon::STrih_indexed_cNotPt:
+    case Hexagon::STrih_indexed_cdnNotPt_V4:
+    case Hexagon::STrih_indexed_shl_cPt_V4:
+    case Hexagon::STrih_indexed_shl_cdnPt_V4:
+    case Hexagon::STrih_indexed_shl_cNotPt_V4:
+    case Hexagon::STrih_indexed_shl_cdnNotPt_V4:
+    case Hexagon::POST_SThri_cPt:
+    case Hexagon::POST_SThri_cdnPt_V4:
+    case Hexagon::POST_SThri_cNotPt:
+    case Hexagon::POST_SThri_cdnNotPt_V4:
+    case Hexagon::STh_GP_cPt_V4:
+    case Hexagon::STh_GP_cNotPt_V4:
+    case Hexagon::STh_GP_cdnPt_V4:
+    case Hexagon::STh_GP_cdnNotPt_V4:
+
+    // store word
+    case Hexagon::STriw:
+    case Hexagon::STriw_indexed:
+    case Hexagon::STriw_indexed_shl_V4:
+    case Hexagon::STriw_shl_V4:
+    case Hexagon::STw_GP_V4:
+    case Hexagon::POST_STwri:
+    case Hexagon::STriw_cPt:
+    case Hexagon::STriw_cdnPt_V4:
+    case Hexagon::STriw_cNotPt:
+    case Hexagon::STriw_cdnNotPt_V4:
+    case Hexagon::STriw_indexed_cPt:
+    case Hexagon::STriw_indexed_cdnPt_V4:
+    case Hexagon::STriw_indexed_cNotPt:
+    case Hexagon::STriw_indexed_cdnNotPt_V4:
+    case Hexagon::STriw_indexed_shl_cPt_V4:
+    case Hexagon::STriw_indexed_shl_cdnPt_V4:
+    case Hexagon::STriw_indexed_shl_cNotPt_V4:
+    case Hexagon::STriw_indexed_shl_cdnNotPt_V4:
+    case Hexagon::POST_STwri_cPt:
+    case Hexagon::POST_STwri_cdnPt_V4:
+    case Hexagon::POST_STwri_cNotPt:
+    case Hexagon::POST_STwri_cdnNotPt_V4:
+    case Hexagon::STw_GP_cPt_V4:
+    case Hexagon::STw_GP_cNotPt_V4:
+    case Hexagon::STw_GP_cdnPt_V4:
+    case Hexagon::STw_GP_cdnNotPt_V4:
+        return QRI->Subtarget.hasV4TOps();
+  }
+  return false;
+}
+
+static bool IsLoopN(MachineInstr *MI) {
+  return (MI->getOpcode() == Hexagon::LOOP0_i ||
+          MI->getOpcode() == Hexagon::LOOP0_r);
+}
+
+/// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a
+/// callee-saved register.
+static bool DoesModifyCalleeSavedReg(MachineInstr *MI,
+                                     const TargetRegisterInfo *TRI) {
+  for (const uint16_t *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) {
+    unsigned CalleeSavedReg = *CSR;
+    if (MI->modifiesRegister(CalleeSavedReg, TRI))
+      return true;
+  }
+  return false;
+}
+
+// Return the new value instruction for a given store.
+static int GetDotNewOp(const int opc) {
+  switch (opc) {
+  default: llvm_unreachable("Unknown .new type");
+  // store new value byte
+  case Hexagon::STrib:
+    return Hexagon::STrib_nv_V4;
+
+  case Hexagon::STrib_indexed:
+    return Hexagon::STrib_indexed_nv_V4;
+
+  case Hexagon::STrib_indexed_shl_V4:
+    return Hexagon::STrib_indexed_shl_nv_V4;
+
+  case Hexagon::STrib_shl_V4:
+    return Hexagon::STrib_shl_nv_V4;
+
+  case Hexagon::STb_GP_V4:
+    return Hexagon::STb_GP_nv_V4;
+
+  case Hexagon::POST_STbri:
+    return Hexagon::POST_STbri_nv_V4;
+
+  case Hexagon::STrib_cPt:
+    return Hexagon::STrib_cPt_nv_V4;
+
+  case Hexagon::STrib_cdnPt_V4:
+    return Hexagon::STrib_cdnPt_nv_V4;
+
+  case Hexagon::STrib_cNotPt:
+    return Hexagon::STrib_cNotPt_nv_V4;
+
+  case Hexagon::STrib_cdnNotPt_V4:
+    return Hexagon::STrib_cdnNotPt_nv_V4;
+
+  case Hexagon::STrib_indexed_cPt:
+    return Hexagon::STrib_indexed_cPt_nv_V4;
+
+  case Hexagon::STrib_indexed_cdnPt_V4:
+    return Hexagon::STrib_indexed_cdnPt_nv_V4;
+
+  case Hexagon::STrib_indexed_cNotPt:
+    return Hexagon::STrib_indexed_cNotPt_nv_V4;
+
+  case Hexagon::STrib_indexed_cdnNotPt_V4:
+    return Hexagon::STrib_indexed_cdnNotPt_nv_V4;
+
+  case Hexagon::STrib_indexed_shl_cPt_V4:
+    return Hexagon::STrib_indexed_shl_cPt_nv_V4;
+
+  case Hexagon::STrib_indexed_shl_cdnPt_V4:
+    return Hexagon::STrib_indexed_shl_cdnPt_nv_V4;
+
+  case Hexagon::STrib_indexed_shl_cNotPt_V4:
+    return Hexagon::STrib_indexed_shl_cNotPt_nv_V4;
+
+  case Hexagon::STrib_indexed_shl_cdnNotPt_V4:
+    return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4;
+
+  case Hexagon::POST_STbri_cPt:
+    return Hexagon::POST_STbri_cPt_nv_V4;
+
+  case Hexagon::POST_STbri_cdnPt_V4:
+    return Hexagon::POST_STbri_cdnPt_nv_V4;
+
+  case Hexagon::POST_STbri_cNotPt:
+    return Hexagon::POST_STbri_cNotPt_nv_V4;
+
+  case Hexagon::POST_STbri_cdnNotPt_V4:
+    return Hexagon::POST_STbri_cdnNotPt_nv_V4;
+
+  case Hexagon::STb_GP_cPt_V4:
+    return Hexagon::STb_GP_cPt_nv_V4;
+
+  case Hexagon::STb_GP_cNotPt_V4:
+    return Hexagon::STb_GP_cNotPt_nv_V4;
+
+  case Hexagon::STb_GP_cdnPt_V4:
+    return Hexagon::STb_GP_cdnPt_nv_V4;
+
+  case Hexagon::STb_GP_cdnNotPt_V4:
+    return Hexagon::STb_GP_cdnNotPt_nv_V4;
+
+  // store new value halfword
+  case Hexagon::STrih:
+    return Hexagon::STrih_nv_V4;
+
+  case Hexagon::STrih_indexed:
+    return Hexagon::STrih_indexed_nv_V4;
+
+  case Hexagon::STrih_indexed_shl_V4:
+    return Hexagon::STrih_indexed_shl_nv_V4;
+
+  case Hexagon::STrih_shl_V4:
+    return Hexagon::STrih_shl_nv_V4;
+
+  case Hexagon::STh_GP_V4:
+    return Hexagon::STh_GP_nv_V4;
+
+  case Hexagon::POST_SThri:
+    return Hexagon::POST_SThri_nv_V4;
+
+  case Hexagon::STrih_cPt:
+    return Hexagon::STrih_cPt_nv_V4;
+
+  case Hexagon::STrih_cdnPt_V4:
+    return Hexagon::STrih_cdnPt_nv_V4;
+
+  case Hexagon::STrih_cNotPt:
+    return Hexagon::STrih_cNotPt_nv_V4;
+
+  case Hexagon::STrih_cdnNotPt_V4:
+    return Hexagon::STrih_cdnNotPt_nv_V4;
+
+  case Hexagon::STrih_indexed_cPt:
+    return Hexagon::STrih_indexed_cPt_nv_V4;
+
+  case Hexagon::STrih_indexed_cdnPt_V4:
+    return Hexagon::STrih_indexed_cdnPt_nv_V4;
+
+  case Hexagon::STrih_indexed_cNotPt:
+    return Hexagon::STrih_indexed_cNotPt_nv_V4;
+
+  case Hexagon::STrih_indexed_cdnNotPt_V4:
+    return Hexagon::STrih_indexed_cdnNotPt_nv_V4;
+
+  case Hexagon::STrih_indexed_shl_cPt_V4:
+    return Hexagon::STrih_indexed_shl_cPt_nv_V4;
+
+  case Hexagon::STrih_indexed_shl_cdnPt_V4:
+    return Hexagon::STrih_indexed_shl_cdnPt_nv_V4;
+
+  case Hexagon::STrih_indexed_shl_cNotPt_V4:
+    return Hexagon::STrih_indexed_shl_cNotPt_nv_V4;
+
+  case Hexagon::STrih_indexed_shl_cdnNotPt_V4:
+    return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4;
+
+  case Hexagon::POST_SThri_cPt:
+    return Hexagon::POST_SThri_cPt_nv_V4;
+
+  case Hexagon::POST_SThri_cdnPt_V4:
+    return Hexagon::POST_SThri_cdnPt_nv_V4;
+
+  case Hexagon::POST_SThri_cNotPt:
+    return Hexagon::POST_SThri_cNotPt_nv_V4;
+
+  case Hexagon::POST_SThri_cdnNotPt_V4:
+    return Hexagon::POST_SThri_cdnNotPt_nv_V4;
+
+  case Hexagon::STh_GP_cPt_V4:
+    return Hexagon::STh_GP_cPt_nv_V4;
+
+  case Hexagon::STh_GP_cNotPt_V4:
+    return Hexagon::STh_GP_cNotPt_nv_V4;
+
+  case Hexagon::STh_GP_cdnPt_V4:
+    return Hexagon::STh_GP_cdnPt_nv_V4;
+
+  case Hexagon::STh_GP_cdnNotPt_V4:
+    return Hexagon::STh_GP_cdnNotPt_nv_V4;
+
+  // store new value word
+  case Hexagon::STriw:
+    return Hexagon::STriw_nv_V4;
+
+  case Hexagon::STriw_indexed:
+    return Hexagon::STriw_indexed_nv_V4;
+
+  case Hexagon::STriw_indexed_shl_V4:
+    return Hexagon::STriw_indexed_shl_nv_V4;
+
+  case Hexagon::STriw_shl_V4:
+    return Hexagon::STriw_shl_nv_V4;
+
+  case Hexagon::STw_GP_V4:
+    return Hexagon::STw_GP_nv_V4;
+
+  case Hexagon::POST_STwri:
+    return Hexagon::POST_STwri_nv_V4;
+
+  case Hexagon::STriw_cPt:
+    return Hexagon::STriw_cPt_nv_V4;
+
+  case Hexagon::STriw_cdnPt_V4:
+    return Hexagon::STriw_cdnPt_nv_V4;
+
+  case Hexagon::STriw_cNotPt:
+    return Hexagon::STriw_cNotPt_nv_V4;
+
+  case Hexagon::STriw_cdnNotPt_V4:
+    return Hexagon::STriw_cdnNotPt_nv_V4;
+
+  case Hexagon::STriw_indexed_cPt:
+    return Hexagon::STriw_indexed_cPt_nv_V4;
+
+  case Hexagon::STriw_indexed_cdnPt_V4:
+    return Hexagon::STriw_indexed_cdnPt_nv_V4;
+
+  case Hexagon::STriw_indexed_cNotPt:
+    return Hexagon::STriw_indexed_cNotPt_nv_V4;
+
+  case Hexagon::STriw_indexed_cdnNotPt_V4:
+    return Hexagon::STriw_indexed_cdnNotPt_nv_V4;
+
+  case Hexagon::STriw_indexed_shl_cPt_V4:
+    return Hexagon::STriw_indexed_shl_cPt_nv_V4;
+
+  case Hexagon::STriw_indexed_shl_cdnPt_V4:
+    return Hexagon::STriw_indexed_shl_cdnPt_nv_V4;
+
+  case Hexagon::STriw_indexed_shl_cNotPt_V4:
+    return Hexagon::STriw_indexed_shl_cNotPt_nv_V4;
+
+  case Hexagon::STriw_indexed_shl_cdnNotPt_V4:
+    return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4;
+
+  case Hexagon::POST_STwri_cPt:
+    return Hexagon::POST_STwri_cPt_nv_V4;
+
+  case Hexagon::POST_STwri_cdnPt_V4:
+    return Hexagon::POST_STwri_cdnPt_nv_V4;
+
+  case Hexagon::POST_STwri_cNotPt:
+    return Hexagon::POST_STwri_cNotPt_nv_V4;
+
+  case Hexagon::POST_STwri_cdnNotPt_V4:
+    return Hexagon::POST_STwri_cdnNotPt_nv_V4;
+
+  case Hexagon::STw_GP_cPt_V4:
+    return Hexagon::STw_GP_cPt_nv_V4;
+
+  case Hexagon::STw_GP_cNotPt_V4:
+    return Hexagon::STw_GP_cNotPt_nv_V4;
+
+  case Hexagon::STw_GP_cdnPt_V4:
+    return Hexagon::STw_GP_cdnPt_nv_V4;
+
+  case Hexagon::STw_GP_cdnNotPt_V4:
+    return Hexagon::STw_GP_cdnNotPt_nv_V4;
+
+  }
+}
+
+// Return .new predicate version for an instruction
+static int GetDotNewPredOp(const int opc) {
+  switch (opc) {
+  default: llvm_unreachable("Unknown .new type");
+  // Conditional stores
+  // Store byte conditionally
+  case Hexagon::STrib_cPt :
+    return Hexagon::STrib_cdnPt_V4;
+
+  case Hexagon::STrib_cNotPt :
+    return Hexagon::STrib_cdnNotPt_V4;
+
+  case Hexagon::STrib_indexed_cPt :
+    return Hexagon::STrib_indexed_cdnPt_V4;
+
+  case Hexagon::STrib_indexed_cNotPt :
+    return Hexagon::STrib_indexed_cdnNotPt_V4;
+
+  case Hexagon::STrib_imm_cPt_V4 :
+    return Hexagon::STrib_imm_cdnPt_V4;
+
+  case Hexagon::STrib_imm_cNotPt_V4 :
+    return Hexagon::STrib_imm_cdnNotPt_V4;
+
+  case Hexagon::POST_STbri_cPt :
+    return Hexagon::POST_STbri_cdnPt_V4;
+
+  case Hexagon::POST_STbri_cNotPt :
+    return Hexagon::POST_STbri_cdnNotPt_V4;
+
+  case Hexagon::STrib_indexed_shl_cPt_V4 :
+    return Hexagon::STrib_indexed_shl_cdnPt_V4;
+
+  case Hexagon::STrib_indexed_shl_cNotPt_V4 :
+    return Hexagon::STrib_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::STb_GP_cPt_V4 :
+    return Hexagon::STb_GP_cdnPt_V4;
+
+  case Hexagon::STb_GP_cNotPt_V4 :
+    return Hexagon::STb_GP_cdnNotPt_V4;
+
+  // Store doubleword conditionally
+  case Hexagon::STrid_cPt :
+    return Hexagon::STrid_cdnPt_V4;
+
+  case Hexagon::STrid_cNotPt :
+    return Hexagon::STrid_cdnNotPt_V4;
+
+  case Hexagon::STrid_indexed_cPt :
+    return Hexagon::STrid_indexed_cdnPt_V4;
+
+  case Hexagon::STrid_indexed_cNotPt :
+    return Hexagon::STrid_indexed_cdnNotPt_V4;
+
+  case Hexagon::STrid_indexed_shl_cPt_V4 :
+    return Hexagon::STrid_indexed_shl_cdnPt_V4;
+
+  case Hexagon::STrid_indexed_shl_cNotPt_V4 :
+    return Hexagon::STrid_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::POST_STdri_cPt :
+    return Hexagon::POST_STdri_cdnPt_V4;
+
+  case Hexagon::POST_STdri_cNotPt :
+    return Hexagon::POST_STdri_cdnNotPt_V4;
+
+  case Hexagon::STd_GP_cPt_V4 :
+    return Hexagon::STd_GP_cdnPt_V4;
+
+  case Hexagon::STd_GP_cNotPt_V4 :
+    return Hexagon::STd_GP_cdnNotPt_V4;
+
+  // Store halfword conditionally
+  case Hexagon::STrih_cPt :
+    return Hexagon::STrih_cdnPt_V4;
+
+  case Hexagon::STrih_cNotPt :
+    return Hexagon::STrih_cdnNotPt_V4;
+
+  case Hexagon::STrih_indexed_cPt :
+    return Hexagon::STrih_indexed_cdnPt_V4;
+
+  case Hexagon::STrih_indexed_cNotPt :
+    return Hexagon::STrih_indexed_cdnNotPt_V4;
+
+  case Hexagon::STrih_imm_cPt_V4 :
+    return Hexagon::STrih_imm_cdnPt_V4;
+
+  case Hexagon::STrih_imm_cNotPt_V4 :
+    return Hexagon::STrih_imm_cdnNotPt_V4;
+
+  case Hexagon::STrih_indexed_shl_cPt_V4 :
+    return Hexagon::STrih_indexed_shl_cdnPt_V4;
+
+  case Hexagon::STrih_indexed_shl_cNotPt_V4 :
+    return Hexagon::STrih_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::POST_SThri_cPt :
+    return Hexagon::POST_SThri_cdnPt_V4;
+
+  case Hexagon::POST_SThri_cNotPt :
+    return Hexagon::POST_SThri_cdnNotPt_V4;
+
+  case Hexagon::STh_GP_cPt_V4 :
+    return Hexagon::STh_GP_cdnPt_V4;
+
+  case Hexagon::STh_GP_cNotPt_V4 :
+    return Hexagon::STh_GP_cdnNotPt_V4;
+
+  // Store word conditionally
+  case Hexagon::STriw_cPt :
+    return Hexagon::STriw_cdnPt_V4;
+
+  case Hexagon::STriw_cNotPt :
+    return Hexagon::STriw_cdnNotPt_V4;
+
+  case Hexagon::STriw_indexed_cPt :
+    return Hexagon::STriw_indexed_cdnPt_V4;
+
+  case Hexagon::STriw_indexed_cNotPt :
+    return Hexagon::STriw_indexed_cdnNotPt_V4;
+
+  case Hexagon::STriw_imm_cPt_V4 :
+    return Hexagon::STriw_imm_cdnPt_V4;
+
+  case Hexagon::STriw_imm_cNotPt_V4 :
+    return Hexagon::STriw_imm_cdnNotPt_V4;
+
+  case Hexagon::STriw_indexed_shl_cPt_V4 :
+    return Hexagon::STriw_indexed_shl_cdnPt_V4;
+
+  case Hexagon::STriw_indexed_shl_cNotPt_V4 :
+    return Hexagon::STriw_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::POST_STwri_cPt :
+    return Hexagon::POST_STwri_cdnPt_V4;
+
+  case Hexagon::POST_STwri_cNotPt :
+    return Hexagon::POST_STwri_cdnNotPt_V4;
+
+  case Hexagon::STw_GP_cPt_V4 :
+    return Hexagon::STw_GP_cdnPt_V4;
+
+  case Hexagon::STw_GP_cNotPt_V4 :
+    return Hexagon::STw_GP_cdnNotPt_V4;
+
+  // Condtional Jumps
+  case Hexagon::JMP_c:
+    return Hexagon::JMP_cdnPt;
+
+  case Hexagon::JMP_cNot:
+    return Hexagon::JMP_cdnNotPt;
+
+  case Hexagon::JMPR_cPt:
+    return Hexagon::JMPR_cdnPt_V3;
+
+  case Hexagon::JMPR_cNotPt:
+    return Hexagon::JMPR_cdnNotPt_V3;
+
+  // Conditional Transfers
+  case Hexagon::TFR_cPt:
+    return Hexagon::TFR_cdnPt;
+
+  case Hexagon::TFR_cNotPt:
+    return Hexagon::TFR_cdnNotPt;
+
+  case Hexagon::TFRI_cPt:
+    return Hexagon::TFRI_cdnPt;
+
+  case Hexagon::TFRI_cNotPt:
+    return Hexagon::TFRI_cdnNotPt;
+
+  // Load double word
+  case Hexagon::LDrid_cPt :
+    return Hexagon::LDrid_cdnPt;
+
+  case Hexagon::LDrid_cNotPt :
+    return Hexagon::LDrid_cdnNotPt;
+
+  case Hexagon::LDrid_indexed_cPt :
+    return Hexagon::LDrid_indexed_cdnPt;
+
+  case Hexagon::LDrid_indexed_cNotPt :
+    return Hexagon::LDrid_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDrid_cPt :
+    return Hexagon::POST_LDrid_cdnPt_V4;
+
+  case Hexagon::POST_LDrid_cNotPt :
+    return Hexagon::POST_LDrid_cdnNotPt_V4;
+
+  // Load word
+  case Hexagon::LDriw_cPt :
+    return Hexagon::LDriw_cdnPt;
+
+  case Hexagon::LDriw_cNotPt :
+    return Hexagon::LDriw_cdnNotPt;
+
+  case Hexagon::LDriw_indexed_cPt :
+    return Hexagon::LDriw_indexed_cdnPt;
+
+  case Hexagon::LDriw_indexed_cNotPt :
+    return Hexagon::LDriw_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDriw_cPt :
+    return Hexagon::POST_LDriw_cdnPt_V4;
+
+  case Hexagon::POST_LDriw_cNotPt :
+    return Hexagon::POST_LDriw_cdnNotPt_V4;
+
+  // Load halfword
+  case Hexagon::LDrih_cPt :
+    return Hexagon::LDrih_cdnPt;
+
+  case Hexagon::LDrih_cNotPt :
+    return Hexagon::LDrih_cdnNotPt;
+
+  case Hexagon::LDrih_indexed_cPt :
+    return Hexagon::LDrih_indexed_cdnPt;
+
+  case Hexagon::LDrih_indexed_cNotPt :
+    return Hexagon::LDrih_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDrih_cPt :
+    return Hexagon::POST_LDrih_cdnPt_V4;
+
+  case Hexagon::POST_LDrih_cNotPt :
+    return Hexagon::POST_LDrih_cdnNotPt_V4;
+
+  // Load byte
+  case Hexagon::LDrib_cPt :
+    return Hexagon::LDrib_cdnPt;
+
+  case Hexagon::LDrib_cNotPt :
+    return Hexagon::LDrib_cdnNotPt;
+
+  case Hexagon::LDrib_indexed_cPt :
+    return Hexagon::LDrib_indexed_cdnPt;
+
+  case Hexagon::LDrib_indexed_cNotPt :
+    return Hexagon::LDrib_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDrib_cPt :
+    return Hexagon::POST_LDrib_cdnPt_V4;
+
+  case Hexagon::POST_LDrib_cNotPt :
+    return Hexagon::POST_LDrib_cdnNotPt_V4;
+
+  // Load unsigned halfword
+  case Hexagon::LDriuh_cPt :
+    return Hexagon::LDriuh_cdnPt;
+
+  case Hexagon::LDriuh_cNotPt :
+    return Hexagon::LDriuh_cdnNotPt;
+
+  case Hexagon::LDriuh_indexed_cPt :
+    return Hexagon::LDriuh_indexed_cdnPt;
+
+  case Hexagon::LDriuh_indexed_cNotPt :
+    return Hexagon::LDriuh_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDriuh_cPt :
+    return Hexagon::POST_LDriuh_cdnPt_V4;
+
+  case Hexagon::POST_LDriuh_cNotPt :
+    return Hexagon::POST_LDriuh_cdnNotPt_V4;
+
+  // Load unsigned byte
+  case Hexagon::LDriub_cPt :
+    return Hexagon::LDriub_cdnPt;
+
+  case Hexagon::LDriub_cNotPt :
+    return Hexagon::LDriub_cdnNotPt;
+
+  case Hexagon::LDriub_indexed_cPt :
+    return Hexagon::LDriub_indexed_cdnPt;
+
+  case Hexagon::LDriub_indexed_cNotPt :
+    return Hexagon::LDriub_indexed_cdnNotPt;
+
+  case Hexagon::POST_LDriub_cPt :
+    return Hexagon::POST_LDriub_cdnPt_V4;
+
+  case Hexagon::POST_LDriub_cNotPt :
+    return Hexagon::POST_LDriub_cdnNotPt_V4;
+
+  // V4 indexed+scaled load
+
+  case Hexagon::LDrid_indexed_shl_cPt_V4 :
+    return Hexagon::LDrid_indexed_shl_cdnPt_V4;
+
+  case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
+    return Hexagon::LDrid_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::LDrib_indexed_shl_cPt_V4 :
+    return Hexagon::LDrib_indexed_shl_cdnPt_V4;
+
+  case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
+    return Hexagon::LDrib_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::LDriub_indexed_shl_cPt_V4 :
+    return Hexagon::LDriub_indexed_shl_cdnPt_V4;
+
+  case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
+    return Hexagon::LDriub_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::LDrih_indexed_shl_cPt_V4 :
+    return Hexagon::LDrih_indexed_shl_cdnPt_V4;
+
+  case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
+    return Hexagon::LDrih_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::LDriuh_indexed_shl_cPt_V4 :
+    return Hexagon::LDriuh_indexed_shl_cdnPt_V4;
+
+  case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
+    return Hexagon::LDriuh_indexed_shl_cdnNotPt_V4;
+
+  case Hexagon::LDriw_indexed_shl_cPt_V4 :
+    return Hexagon::LDriw_indexed_shl_cdnPt_V4;
+
+  case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
+    return Hexagon::LDriw_indexed_shl_cdnNotPt_V4;
+
+  // V4 global address load
+
+  case Hexagon::LDd_GP_cPt_V4:
+    return Hexagon::LDd_GP_cdnPt_V4;
+
+  case Hexagon::LDd_GP_cNotPt_V4:
+    return Hexagon::LDd_GP_cdnNotPt_V4;
+
+  case Hexagon::LDb_GP_cPt_V4:
+    return Hexagon::LDb_GP_cdnPt_V4;
+
+  case Hexagon::LDb_GP_cNotPt_V4:
+    return Hexagon::LDb_GP_cdnNotPt_V4;
+
+  case Hexagon::LDub_GP_cPt_V4:
+    return Hexagon::LDub_GP_cdnPt_V4;
+
+  case Hexagon::LDub_GP_cNotPt_V4:
+    return Hexagon::LDub_GP_cdnNotPt_V4;
+
+  case Hexagon::LDh_GP_cPt_V4:
+    return Hexagon::LDh_GP_cdnPt_V4;
+
+  case Hexagon::LDh_GP_cNotPt_V4:
+    return Hexagon::LDh_GP_cdnNotPt_V4;
+
+  case Hexagon::LDuh_GP_cPt_V4:
+    return Hexagon::LDuh_GP_cdnPt_V4;
+
+  case Hexagon::LDuh_GP_cNotPt_V4:
+    return Hexagon::LDuh_GP_cdnNotPt_V4;
+
+  case Hexagon::LDw_GP_cPt_V4:
+    return Hexagon::LDw_GP_cdnPt_V4;
+
+  case Hexagon::LDw_GP_cNotPt_V4:
+    return Hexagon::LDw_GP_cdnNotPt_V4;
+
+  // Conditional store new-value byte
+  case Hexagon::STrib_cPt_nv_V4 :
+    return Hexagon::STrib_cdnPt_nv_V4;
+  case Hexagon::STrib_cNotPt_nv_V4 :
+    return Hexagon::STrib_cdnNotPt_nv_V4;
+
+  case Hexagon::STrib_indexed_cPt_nv_V4 :
+    return Hexagon::STrib_indexed_cdnPt_nv_V4;
+  case Hexagon::STrib_indexed_cNotPt_nv_V4 :
+    return Hexagon::STrib_indexed_cdnNotPt_nv_V4;
+
+  case Hexagon::STrib_indexed_shl_cPt_nv_V4 :
+    return Hexagon::STrib_indexed_shl_cdnPt_nv_V4;
+  case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 :
+    return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4;
+
+  case Hexagon::POST_STbri_cPt_nv_V4 :
+    return Hexagon::POST_STbri_cdnPt_nv_V4;
+  case Hexagon::POST_STbri_cNotPt_nv_V4 :
+    return Hexagon::POST_STbri_cdnNotPt_nv_V4;
+
+  case Hexagon::STb_GP_cPt_nv_V4 :
+    return Hexagon::STb_GP_cdnPt_nv_V4;
+
+  case Hexagon::STb_GP_cNotPt_nv_V4 :
+    return Hexagon::STb_GP_cdnNotPt_nv_V4;
+
+  // Conditional store new-value halfword
+  case Hexagon::STrih_cPt_nv_V4 :
+    return Hexagon::STrih_cdnPt_nv_V4;
+  case Hexagon::STrih_cNotPt_nv_V4 :
+    return Hexagon::STrih_cdnNotPt_nv_V4;
+
+  case Hexagon::STrih_indexed_cPt_nv_V4 :
+    return Hexagon::STrih_indexed_cdnPt_nv_V4;
+  case Hexagon::STrih_indexed_cNotPt_nv_V4 :
+    return Hexagon::STrih_indexed_cdnNotPt_nv_V4;
+
+  case Hexagon::STrih_indexed_shl_cPt_nv_V4 :
+    return Hexagon::STrih_indexed_shl_cdnPt_nv_V4;
+  case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 :
+    return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4;
+
+  case Hexagon::POST_SThri_cPt_nv_V4 :
+    return Hexagon::POST_SThri_cdnPt_nv_V4;
+  case Hexagon::POST_SThri_cNotPt_nv_V4 :
+    return Hexagon::POST_SThri_cdnNotPt_nv_V4;
+
+  case Hexagon::STh_GP_cPt_nv_V4 :
+    return Hexagon::STh_GP_cdnPt_nv_V4;
+
+  case Hexagon::STh_GP_cNotPt_nv_V4 :
+    return Hexagon::STh_GP_cdnNotPt_nv_V4;
+
+  // Conditional store new-value word
+  case Hexagon::STriw_cPt_nv_V4 :
+    return  Hexagon::STriw_cdnPt_nv_V4;
+  case Hexagon::STriw_cNotPt_nv_V4 :
+    return Hexagon::STriw_cdnNotPt_nv_V4;
+
+  case Hexagon::STriw_indexed_cPt_nv_V4 :
+    return Hexagon::STriw_indexed_cdnPt_nv_V4;
+  case Hexagon::STriw_indexed_cNotPt_nv_V4 :
+    return Hexagon::STriw_indexed_cdnNotPt_nv_V4;
+
+  case Hexagon::STriw_indexed_shl_cPt_nv_V4 :
+    return Hexagon::STriw_indexed_shl_cdnPt_nv_V4;
+  case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 :
+    return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4;
+
+  case Hexagon::POST_STwri_cPt_nv_V4 :
+    return Hexagon::POST_STwri_cdnPt_nv_V4;
+  case Hexagon::POST_STwri_cNotPt_nv_V4:
+    return Hexagon::POST_STwri_cdnNotPt_nv_V4;
+
+  case Hexagon::STw_GP_cPt_nv_V4 :
+    return Hexagon::STw_GP_cdnPt_nv_V4;
+
+  case Hexagon::STw_GP_cNotPt_nv_V4 :
+    return Hexagon::STw_GP_cdnNotPt_nv_V4;
+
+  // Conditional add
+  case Hexagon::ADD_ri_cPt :
+    return Hexagon::ADD_ri_cdnPt;
+  case Hexagon::ADD_ri_cNotPt :
+    return Hexagon::ADD_ri_cdnNotPt;
+
+  case Hexagon::ADD_rr_cPt :
+    return Hexagon::ADD_rr_cdnPt;
+  case Hexagon::ADD_rr_cNotPt :
+    return Hexagon::ADD_rr_cdnNotPt;
+
+  // Conditional logical Operations
+  case Hexagon::XOR_rr_cPt :
+    return Hexagon::XOR_rr_cdnPt;
+  case Hexagon::XOR_rr_cNotPt :
+    return Hexagon::XOR_rr_cdnNotPt;
+
+  case Hexagon::AND_rr_cPt :
+    return Hexagon::AND_rr_cdnPt;
+  case Hexagon::AND_rr_cNotPt :
+    return Hexagon::AND_rr_cdnNotPt;
+
+  case Hexagon::OR_rr_cPt :
+    return Hexagon::OR_rr_cdnPt;
+  case Hexagon::OR_rr_cNotPt :
+    return Hexagon::OR_rr_cdnNotPt;
+
+  // Conditional Subtract
+  case Hexagon::SUB_rr_cPt :
+    return Hexagon::SUB_rr_cdnPt;
+  case Hexagon::SUB_rr_cNotPt :
+    return Hexagon::SUB_rr_cdnNotPt;
+
+  // Conditional combine
+  case Hexagon::COMBINE_rr_cPt :
+    return Hexagon::COMBINE_rr_cdnPt;
+  case Hexagon::COMBINE_rr_cNotPt :
+    return Hexagon::COMBINE_rr_cdnNotPt;
+
+  case Hexagon::ASLH_cPt_V4 :
+    return Hexagon::ASLH_cdnPt_V4;
+  case Hexagon::ASLH_cNotPt_V4 :
+    return Hexagon::ASLH_cdnNotPt_V4;
+
+  case Hexagon::ASRH_cPt_V4 :
+    return Hexagon::ASRH_cdnPt_V4;
+  case Hexagon::ASRH_cNotPt_V4 :
+    return Hexagon::ASRH_cdnNotPt_V4;
+
+  case Hexagon::SXTB_cPt_V4 :
+    return Hexagon::SXTB_cdnPt_V4;
+  case Hexagon::SXTB_cNotPt_V4 :
+    return Hexagon::SXTB_cdnNotPt_V4;
+
+  case Hexagon::SXTH_cPt_V4 :
+    return Hexagon::SXTH_cdnPt_V4;
+  case Hexagon::SXTH_cNotPt_V4 :
+    return Hexagon::SXTH_cdnNotPt_V4;
+
+  case Hexagon::ZXTB_cPt_V4 :
+    return Hexagon::ZXTB_cdnPt_V4;
+  case Hexagon::ZXTB_cNotPt_V4 :
+    return Hexagon::ZXTB_cdnNotPt_V4;
+
+  case Hexagon::ZXTH_cPt_V4 :
+    return Hexagon::ZXTH_cdnPt_V4;
+  case Hexagon::ZXTH_cNotPt_V4 :
+    return Hexagon::ZXTH_cdnNotPt_V4;
+  }
+}
+
+// Returns true if an instruction can be promoted to .new predicate
+// or new-value store.
+bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) {
+  if ( isCondInst(MI) || IsNewifyStore(MI))
+    return true;
+  else
+    return false;
+}
+
+bool HexagonPacketizerList::isCondInst (MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  const MCInstrDesc& TID = MI->getDesc();
+                                    // bug 5670: until that is fixed,
+                                    // this portion is disabled.
+  if (   TID.isConditionalBranch()  // && !IsRegisterJump(MI)) ||
+      || QII->isConditionalTransfer(MI)
+      || QII->isConditionalALU32(MI)
+      || QII->isConditionalLoad(MI)
+      || QII->isConditionalStore(MI)) {
+    return true;
+  }
+  return false;
+}
+
+
+// Promote an instructiont to its .new form.
+// At this time, we have already made a call to CanPromoteToDotNew
+// and made sure that it can *indeed* be promoted.
+bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI,
+                        SDep::Kind DepType, MachineBasicBlock::iterator &MII,
+                        const TargetRegisterClass* RC) {
+
+  assert (DepType == SDep::Data);
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  int NewOpcode;
+  if (RC == &Hexagon::PredRegsRegClass)
+    NewOpcode = GetDotNewPredOp(MI->getOpcode());
+  else
+    NewOpcode = GetDotNewOp(MI->getOpcode());
+  MI->setDesc(QII->get(NewOpcode));
+
+  return true;
+}
+
+// Returns the most basic instruction for the .new predicated instructions and
+// new-value stores.
+// For example, all of the following instructions will be converted back to the
+// same instruction:
+// 1) if (p0.new) memw(R0+#0) = R1.new  --->
+// 2) if (p0) memw(R0+#0)= R1.new      -------> if (p0) memw(R0+#0) = R1
+// 3) if (p0.new) memw(R0+#0) = R1      --->
+//
+// To understand the translation of instruction 1 to its original form, consider
+// a packet with 3 instructions.
+// { p0 = cmp.eq(R0,R1)
+//   if (p0.new) R2 = add(R3, R4)
+//   R5 = add (R3, R1)
+//   }
+// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet
+//
+// This instruction can be part of the previous packet only if both p0 and R2
+// are promoted to .new values. This promotion happens in steps, first
+// predicate register is promoted to .new and in the next iteration R2 is
+// promoted. Therefore, in case of dependence check failure (due to R5) during
+// next iteration, it should be converted back to its most basic form.
+
+static int GetDotOldOp(const int opc) {
+  switch (opc) {
+  default: llvm_unreachable("Unknown .old type");
+  case Hexagon::TFR_cdnPt:
+    return Hexagon::TFR_cPt;
+
+  case Hexagon::TFR_cdnNotPt:
+    return Hexagon::TFR_cNotPt;
+
+  case Hexagon::TFRI_cdnPt:
+    return Hexagon::TFRI_cPt;
+
+  case Hexagon::TFRI_cdnNotPt:
+    return Hexagon::TFRI_cNotPt;
+
+  case Hexagon::JMP_cdnPt:
+    return Hexagon::JMP_c;
+
+  case Hexagon::JMP_cdnNotPt:
+    return Hexagon::JMP_cNot;
+
+  case Hexagon::JMPR_cdnPt_V3:
+    return Hexagon::JMPR_cPt;
+
+  case Hexagon::JMPR_cdnNotPt_V3:
+    return Hexagon::JMPR_cNotPt;
+
+  // Load double word
+
+  case Hexagon::LDrid_cdnPt :
+    return Hexagon::LDrid_cPt;
+
+  case Hexagon::LDrid_cdnNotPt :
+    return Hexagon::LDrid_cNotPt;
+
+  case Hexagon::LDrid_indexed_cdnPt :
+    return Hexagon::LDrid_indexed_cPt;
+
+  case Hexagon::LDrid_indexed_cdnNotPt :
+    return Hexagon::LDrid_indexed_cNotPt;
+
+  case Hexagon::POST_LDrid_cdnPt_V4 :
+    return Hexagon::POST_LDrid_cPt;
+
+  case Hexagon::POST_LDrid_cdnNotPt_V4 :
+    return Hexagon::POST_LDrid_cNotPt;
+
+  // Load word
+
+  case Hexagon::LDriw_cdnPt :
+    return Hexagon::LDriw_cPt;
+
+  case Hexagon::LDriw_cdnNotPt :
+    return Hexagon::LDriw_cNotPt;
+
+  case Hexagon::LDriw_indexed_cdnPt :
+    return Hexagon::LDriw_indexed_cPt;
+
+  case Hexagon::LDriw_indexed_cdnNotPt :
+    return Hexagon::LDriw_indexed_cNotPt;
+
+  case Hexagon::POST_LDriw_cdnPt_V4 :
+    return Hexagon::POST_LDriw_cPt;
+
+  case Hexagon::POST_LDriw_cdnNotPt_V4 :
+    return Hexagon::POST_LDriw_cNotPt;
+
+  // Load half
+
+  case Hexagon::LDrih_cdnPt :
+    return Hexagon::LDrih_cPt;
+
+  case Hexagon::LDrih_cdnNotPt :
+    return Hexagon::LDrih_cNotPt;
+
+  case Hexagon::LDrih_indexed_cdnPt :
+    return Hexagon::LDrih_indexed_cPt;
+
+  case Hexagon::LDrih_indexed_cdnNotPt :
+    return Hexagon::LDrih_indexed_cNotPt;
+
+  case Hexagon::POST_LDrih_cdnPt_V4 :
+    return Hexagon::POST_LDrih_cPt;
+
+  case Hexagon::POST_LDrih_cdnNotPt_V4 :
+    return Hexagon::POST_LDrih_cNotPt;
+
+  // Load byte
+
+  case Hexagon::LDrib_cdnPt :
+    return Hexagon::LDrib_cPt;
+
+  case Hexagon::LDrib_cdnNotPt :
+    return Hexagon::LDrib_cNotPt;
+
+  case Hexagon::LDrib_indexed_cdnPt :
+    return Hexagon::LDrib_indexed_cPt;
+
+  case Hexagon::LDrib_indexed_cdnNotPt :
+    return Hexagon::LDrib_indexed_cNotPt;
+
+  case Hexagon::POST_LDrib_cdnPt_V4 :
+    return Hexagon::POST_LDrib_cPt;
+
+  case Hexagon::POST_LDrib_cdnNotPt_V4 :
+    return Hexagon::POST_LDrib_cNotPt;
+
+  // Load unsigned half
+
+  case Hexagon::LDriuh_cdnPt :
+    return Hexagon::LDriuh_cPt;
+
+  case Hexagon::LDriuh_cdnNotPt :
+    return Hexagon::LDriuh_cNotPt;
+
+  case Hexagon::LDriuh_indexed_cdnPt :
+    return Hexagon::LDriuh_indexed_cPt;
+
+  case Hexagon::LDriuh_indexed_cdnNotPt :
+    return Hexagon::LDriuh_indexed_cNotPt;
+
+  case Hexagon::POST_LDriuh_cdnPt_V4 :
+    return Hexagon::POST_LDriuh_cPt;
+
+  case Hexagon::POST_LDriuh_cdnNotPt_V4 :
+    return Hexagon::POST_LDriuh_cNotPt;
+
+  // Load unsigned byte
+  case Hexagon::LDriub_cdnPt :
+    return Hexagon::LDriub_cPt;
+
+  case Hexagon::LDriub_cdnNotPt :
+    return Hexagon::LDriub_cNotPt;
+
+  case Hexagon::LDriub_indexed_cdnPt :
+    return Hexagon::LDriub_indexed_cPt;
+
+  case Hexagon::LDriub_indexed_cdnNotPt :
+    return Hexagon::LDriub_indexed_cNotPt;
+
+  case Hexagon::POST_LDriub_cdnPt_V4 :
+    return Hexagon::POST_LDriub_cPt;
+
+  case Hexagon::POST_LDriub_cdnNotPt_V4 :
+    return Hexagon::POST_LDriub_cNotPt;
+
+  // V4 indexed+scaled Load
+
+  case Hexagon::LDrid_indexed_shl_cdnPt_V4 :
+    return Hexagon::LDrid_indexed_shl_cPt_V4;
+
+  case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::LDrid_indexed_shl_cNotPt_V4;
+
+  case Hexagon::LDrib_indexed_shl_cdnPt_V4 :
+    return Hexagon::LDrib_indexed_shl_cPt_V4;
+
+  case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::LDrib_indexed_shl_cNotPt_V4;
+
+  case Hexagon::LDriub_indexed_shl_cdnPt_V4 :
+    return Hexagon::LDriub_indexed_shl_cPt_V4;
+
+  case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::LDriub_indexed_shl_cNotPt_V4;
+
+  case Hexagon::LDrih_indexed_shl_cdnPt_V4 :
+    return Hexagon::LDrih_indexed_shl_cPt_V4;
+
+  case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::LDrih_indexed_shl_cNotPt_V4;
+
+  case Hexagon::LDriuh_indexed_shl_cdnPt_V4 :
+    return Hexagon::LDriuh_indexed_shl_cPt_V4;
+
+  case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::LDriuh_indexed_shl_cNotPt_V4;
+
+  case Hexagon::LDriw_indexed_shl_cdnPt_V4 :
+    return Hexagon::LDriw_indexed_shl_cPt_V4;
+
+  case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::LDriw_indexed_shl_cNotPt_V4;
+
+  // V4 global address load
+
+  case Hexagon::LDd_GP_cdnPt_V4:
+    return Hexagon::LDd_GP_cPt_V4;
+
+  case Hexagon::LDd_GP_cdnNotPt_V4:
+    return Hexagon::LDd_GP_cNotPt_V4;
+
+  case Hexagon::LDb_GP_cdnPt_V4:
+    return Hexagon::LDb_GP_cPt_V4;
+
+  case Hexagon::LDb_GP_cdnNotPt_V4:
+    return Hexagon::LDb_GP_cNotPt_V4;
+
+  case Hexagon::LDub_GP_cdnPt_V4:
+    return Hexagon::LDub_GP_cPt_V4;
+
+  case Hexagon::LDub_GP_cdnNotPt_V4:
+    return Hexagon::LDub_GP_cNotPt_V4;
+
+  case Hexagon::LDh_GP_cdnPt_V4:
+    return Hexagon::LDh_GP_cPt_V4;
+
+  case Hexagon::LDh_GP_cdnNotPt_V4:
+    return Hexagon::LDh_GP_cNotPt_V4;
+
+  case Hexagon::LDuh_GP_cdnPt_V4:
+    return Hexagon::LDuh_GP_cPt_V4;
+
+  case Hexagon::LDuh_GP_cdnNotPt_V4:
+    return Hexagon::LDuh_GP_cNotPt_V4;
+
+  case Hexagon::LDw_GP_cdnPt_V4:
+    return Hexagon::LDw_GP_cPt_V4;
+
+  case Hexagon::LDw_GP_cdnNotPt_V4:
+    return Hexagon::LDw_GP_cNotPt_V4;
+
+  // Conditional add
+
+  case Hexagon::ADD_ri_cdnPt :
+    return Hexagon::ADD_ri_cPt;
+  case Hexagon::ADD_ri_cdnNotPt :
+    return Hexagon::ADD_ri_cNotPt;
+
+  case Hexagon::ADD_rr_cdnPt :
+    return Hexagon::ADD_rr_cPt;
+  case Hexagon::ADD_rr_cdnNotPt:
+    return Hexagon::ADD_rr_cNotPt;
+
+  // Conditional logical Operations
+
+  case Hexagon::XOR_rr_cdnPt :
+    return Hexagon::XOR_rr_cPt;
+  case Hexagon::XOR_rr_cdnNotPt :
+    return Hexagon::XOR_rr_cNotPt;
+
+  case Hexagon::AND_rr_cdnPt :
+    return Hexagon::AND_rr_cPt;
+  case Hexagon::AND_rr_cdnNotPt :
+    return Hexagon::AND_rr_cNotPt;
+
+  case Hexagon::OR_rr_cdnPt :
+    return Hexagon::OR_rr_cPt;
+  case Hexagon::OR_rr_cdnNotPt :
+    return Hexagon::OR_rr_cNotPt;
+
+  // Conditional Subtract
+
+  case Hexagon::SUB_rr_cdnPt :
+    return Hexagon::SUB_rr_cPt;
+  case Hexagon::SUB_rr_cdnNotPt :
+    return Hexagon::SUB_rr_cNotPt;
+
+  // Conditional combine
+
+  case Hexagon::COMBINE_rr_cdnPt :
+    return Hexagon::COMBINE_rr_cPt;
+  case Hexagon::COMBINE_rr_cdnNotPt :
+    return Hexagon::COMBINE_rr_cNotPt;
+
+// Conditional shift operations
+
+  case Hexagon::ASLH_cdnPt_V4 :
+    return Hexagon::ASLH_cPt_V4;
+  case Hexagon::ASLH_cdnNotPt_V4 :
+    return Hexagon::ASLH_cNotPt_V4;
+
+  case Hexagon::ASRH_cdnPt_V4 :
+    return Hexagon::ASRH_cPt_V4;
+  case Hexagon::ASRH_cdnNotPt_V4 :
+    return Hexagon::ASRH_cNotPt_V4;
+
+  case Hexagon::SXTB_cdnPt_V4 :
+    return Hexagon::SXTB_cPt_V4;
+  case Hexagon::SXTB_cdnNotPt_V4 :
+    return Hexagon::SXTB_cNotPt_V4;
+
+  case Hexagon::SXTH_cdnPt_V4 :
+    return Hexagon::SXTH_cPt_V4;
+  case Hexagon::SXTH_cdnNotPt_V4 :
+    return Hexagon::SXTH_cNotPt_V4;
+
+  case Hexagon::ZXTB_cdnPt_V4 :
+    return Hexagon::ZXTB_cPt_V4;
+  case Hexagon::ZXTB_cdnNotPt_V4 :
+    return Hexagon::ZXTB_cNotPt_V4;
+
+  case Hexagon::ZXTH_cdnPt_V4 :
+    return Hexagon::ZXTH_cPt_V4;
+  case Hexagon::ZXTH_cdnNotPt_V4 :
+    return Hexagon::ZXTH_cNotPt_V4;
+
+  // Store byte
+
+  case Hexagon::STrib_imm_cdnPt_V4 :
+    return Hexagon::STrib_imm_cPt_V4;
+
+  case Hexagon::STrib_imm_cdnNotPt_V4 :
+    return Hexagon::STrib_imm_cNotPt_V4;
+
+  case Hexagon::STrib_cdnPt_nv_V4 :
+  case Hexagon::STrib_cPt_nv_V4 :
+  case Hexagon::STrib_cdnPt_V4 :
+    return Hexagon::STrib_cPt;
+
+  case Hexagon::STrib_cdnNotPt_nv_V4 :
+  case Hexagon::STrib_cNotPt_nv_V4 :
+  case Hexagon::STrib_cdnNotPt_V4 :
+    return Hexagon::STrib_cNotPt;
+
+  case Hexagon::STrib_indexed_cdnPt_V4 :
+  case Hexagon::STrib_indexed_cPt_nv_V4 :
+  case Hexagon::STrib_indexed_cdnPt_nv_V4 :
+    return Hexagon::STrib_indexed_cPt;
+
+  case Hexagon::STrib_indexed_cdnNotPt_V4 :
+  case Hexagon::STrib_indexed_cNotPt_nv_V4 :
+  case Hexagon::STrib_indexed_cdnNotPt_nv_V4 :
+    return Hexagon::STrib_indexed_cNotPt;
+
+  case Hexagon::STrib_indexed_shl_cdnPt_nv_V4:
+  case Hexagon::STrib_indexed_shl_cPt_nv_V4 :
+  case Hexagon::STrib_indexed_shl_cdnPt_V4 :
+    return Hexagon::STrib_indexed_shl_cPt_V4;
+
+  case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4:
+  case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 :
+  case Hexagon::STrib_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::STrib_indexed_shl_cNotPt_V4;
+
+  case Hexagon::POST_STbri_cdnPt_nv_V4 :
+  case Hexagon::POST_STbri_cPt_nv_V4 :
+  case Hexagon::POST_STbri_cdnPt_V4 :
+    return Hexagon::POST_STbri_cPt;
+
+  case Hexagon::POST_STbri_cdnNotPt_nv_V4 :
+  case Hexagon::POST_STbri_cNotPt_nv_V4:
+  case Hexagon::POST_STbri_cdnNotPt_V4 :
+    return Hexagon::POST_STbri_cNotPt;
+
+  case Hexagon::STb_GP_cdnPt_nv_V4:
+  case Hexagon::STb_GP_cdnPt_V4:
+  case Hexagon::STb_GP_cPt_nv_V4:
+    return Hexagon::STb_GP_cPt_V4;
+
+  case Hexagon::STb_GP_cdnNotPt_nv_V4:
+  case Hexagon::STb_GP_cdnNotPt_V4:
+  case Hexagon::STb_GP_cNotPt_nv_V4:
+    return Hexagon::STb_GP_cNotPt_V4;
+
+  // Store new-value byte - unconditional
+  case Hexagon::STrib_nv_V4:
+    return Hexagon::STrib;
+
+  case Hexagon::STrib_indexed_nv_V4:
+    return Hexagon::STrib_indexed;
+
+  case Hexagon::STrib_indexed_shl_nv_V4:
+    return Hexagon::STrib_indexed_shl_V4;
+
+  case Hexagon::STrib_shl_nv_V4:
+    return Hexagon::STrib_shl_V4;
+
+  case Hexagon::STb_GP_nv_V4:
+    return Hexagon::STb_GP_V4;
+
+  case Hexagon::POST_STbri_nv_V4:
+    return Hexagon::POST_STbri;
+
+  // Store halfword
+  case Hexagon::STrih_imm_cdnPt_V4 :
+    return Hexagon::STrih_imm_cPt_V4;
+
+  case Hexagon::STrih_imm_cdnNotPt_V4 :
+    return Hexagon::STrih_imm_cNotPt_V4;
+
+  case Hexagon::STrih_cdnPt_nv_V4 :
+  case Hexagon::STrih_cPt_nv_V4 :
+  case Hexagon::STrih_cdnPt_V4 :
+    return Hexagon::STrih_cPt;
+
+  case Hexagon::STrih_cdnNotPt_nv_V4 :
+  case Hexagon::STrih_cNotPt_nv_V4 :
+  case Hexagon::STrih_cdnNotPt_V4 :
+    return Hexagon::STrih_cNotPt;
+
+  case Hexagon::STrih_indexed_cdnPt_nv_V4:
+  case Hexagon::STrih_indexed_cPt_nv_V4 :
+  case Hexagon::STrih_indexed_cdnPt_V4 :
+    return Hexagon::STrih_indexed_cPt;
+
+  case Hexagon::STrih_indexed_cdnNotPt_nv_V4:
+  case Hexagon::STrih_indexed_cNotPt_nv_V4 :
+  case Hexagon::STrih_indexed_cdnNotPt_V4 :
+    return Hexagon::STrih_indexed_cNotPt;
+
+  case Hexagon::STrih_indexed_shl_cdnPt_nv_V4 :
+  case Hexagon::STrih_indexed_shl_cPt_nv_V4 :
+  case Hexagon::STrih_indexed_shl_cdnPt_V4 :
+    return Hexagon::STrih_indexed_shl_cPt_V4;
+
+  case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4 :
+  case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 :
+  case Hexagon::STrih_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::STrih_indexed_shl_cNotPt_V4;
+
+  case Hexagon::POST_SThri_cdnPt_nv_V4 :
+  case Hexagon::POST_SThri_cPt_nv_V4 :
+  case Hexagon::POST_SThri_cdnPt_V4 :
+    return Hexagon::POST_SThri_cPt;
+
+  case Hexagon::POST_SThri_cdnNotPt_nv_V4 :
+  case Hexagon::POST_SThri_cNotPt_nv_V4 :
+  case Hexagon::POST_SThri_cdnNotPt_V4 :
+    return Hexagon::POST_SThri_cNotPt;
+
+  case Hexagon::STh_GP_cdnPt_nv_V4:
+  case Hexagon::STh_GP_cdnPt_V4:
+  case Hexagon::STh_GP_cPt_nv_V4:
+    return Hexagon::STh_GP_cPt_V4;
+
+  case Hexagon::STh_GP_cdnNotPt_nv_V4:
+  case Hexagon::STh_GP_cdnNotPt_V4:
+  case Hexagon::STh_GP_cNotPt_nv_V4:
+    return Hexagon::STh_GP_cNotPt_V4;
+
+  // Store new-value halfword - unconditional
+
+  case Hexagon::STrih_nv_V4:
+    return Hexagon::STrih;
+
+  case Hexagon::STrih_indexed_nv_V4:
+    return Hexagon::STrih_indexed;
+
+  case Hexagon::STrih_indexed_shl_nv_V4:
+    return Hexagon::STrih_indexed_shl_V4;
+
+  case Hexagon::STrih_shl_nv_V4:
+    return Hexagon::STrih_shl_V4;
+
+  case Hexagon::STh_GP_nv_V4:
+    return Hexagon::STh_GP_V4;
+
+  case Hexagon::POST_SThri_nv_V4:
+    return Hexagon::POST_SThri;
+
+   // Store word
+
+   case Hexagon::STriw_imm_cdnPt_V4 :
+    return Hexagon::STriw_imm_cPt_V4;
+
+  case Hexagon::STriw_imm_cdnNotPt_V4 :
+    return Hexagon::STriw_imm_cNotPt_V4;
+
+  case Hexagon::STriw_cdnPt_nv_V4 :
+  case Hexagon::STriw_cPt_nv_V4 :
+  case Hexagon::STriw_cdnPt_V4 :
+    return Hexagon::STriw_cPt;
+
+  case Hexagon::STriw_cdnNotPt_nv_V4 :
+  case Hexagon::STriw_cNotPt_nv_V4 :
+  case Hexagon::STriw_cdnNotPt_V4 :
+    return Hexagon::STriw_cNotPt;
+
+  case Hexagon::STriw_indexed_cdnPt_nv_V4 :
+  case Hexagon::STriw_indexed_cPt_nv_V4 :
+  case Hexagon::STriw_indexed_cdnPt_V4 :
+    return Hexagon::STriw_indexed_cPt;
+
+  case Hexagon::STriw_indexed_cdnNotPt_nv_V4 :
+  case Hexagon::STriw_indexed_cNotPt_nv_V4 :
+  case Hexagon::STriw_indexed_cdnNotPt_V4 :
+    return Hexagon::STriw_indexed_cNotPt;
+
+  case Hexagon::STriw_indexed_shl_cdnPt_nv_V4 :
+  case Hexagon::STriw_indexed_shl_cPt_nv_V4 :
+  case Hexagon::STriw_indexed_shl_cdnPt_V4 :
+    return Hexagon::STriw_indexed_shl_cPt_V4;
+
+  case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4 :
+  case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 :
+  case Hexagon::STriw_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::STriw_indexed_shl_cNotPt_V4;
+
+  case Hexagon::POST_STwri_cdnPt_nv_V4 :
+  case Hexagon::POST_STwri_cPt_nv_V4 :
+  case Hexagon::POST_STwri_cdnPt_V4 :
+    return Hexagon::POST_STwri_cPt;
+
+  case Hexagon::POST_STwri_cdnNotPt_nv_V4 :
+  case Hexagon::POST_STwri_cNotPt_nv_V4 :
+  case Hexagon::POST_STwri_cdnNotPt_V4 :
+    return Hexagon::POST_STwri_cNotPt;
+
+  case Hexagon::STw_GP_cdnPt_nv_V4:
+  case Hexagon::STw_GP_cdnPt_V4:
+  case Hexagon::STw_GP_cPt_nv_V4:
+    return Hexagon::STw_GP_cPt_V4;
+
+  case Hexagon::STw_GP_cdnNotPt_nv_V4:
+  case Hexagon::STw_GP_cdnNotPt_V4:
+  case Hexagon::STw_GP_cNotPt_nv_V4:
+    return Hexagon::STw_GP_cNotPt_V4;
+
+  // Store new-value word - unconditional
+
+  case Hexagon::STriw_nv_V4:
+    return Hexagon::STriw;
+
+  case Hexagon::STriw_indexed_nv_V4:
+    return Hexagon::STriw_indexed;
+
+  case Hexagon::STriw_indexed_shl_nv_V4:
+    return Hexagon::STriw_indexed_shl_V4;
+
+  case Hexagon::STriw_shl_nv_V4:
+    return Hexagon::STriw_shl_V4;
+
+  case Hexagon::STw_GP_nv_V4:
+    return Hexagon::STw_GP_V4;
+
+  case Hexagon::POST_STwri_nv_V4:
+    return Hexagon::POST_STwri;
+
+ // Store doubleword
+
+  case Hexagon::STrid_cdnPt_V4 :
+    return Hexagon::STrid_cPt;
+
+  case Hexagon::STrid_cdnNotPt_V4 :
+    return Hexagon::STrid_cNotPt;
+
+  case Hexagon::STrid_indexed_cdnPt_V4 :
+    return Hexagon::STrid_indexed_cPt;
+
+  case Hexagon::STrid_indexed_cdnNotPt_V4 :
+    return Hexagon::STrid_indexed_cNotPt;
+
+  case Hexagon::STrid_indexed_shl_cdnPt_V4 :
+    return Hexagon::STrid_indexed_shl_cPt_V4;
+
+  case Hexagon::STrid_indexed_shl_cdnNotPt_V4 :
+    return Hexagon::STrid_indexed_shl_cNotPt_V4;
+
+  case Hexagon::POST_STdri_cdnPt_V4 :
+    return Hexagon::POST_STdri_cPt;
+
+  case Hexagon::POST_STdri_cdnNotPt_V4 :
+    return Hexagon::POST_STdri_cNotPt;
+
+  case Hexagon::STd_GP_cdnPt_V4 :
+    return Hexagon::STd_GP_cPt_V4;
+
+  case Hexagon::STd_GP_cdnNotPt_V4 :
+    return Hexagon::STd_GP_cNotPt_V4;
+
+  }
+}
+
+bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  int NewOpcode = GetDotOldOp(MI->getOpcode());
+  MI->setDesc(QII->get(NewOpcode));
+  return true;
+}
+
+// Returns true if an instruction is predicated on p0 and false if it's
+// predicated on !p0.
+
+static bool GetPredicateSense(MachineInstr* MI,
+                              const HexagonInstrInfo *QII) {
+
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown predicate sense of the instruction");
+  case Hexagon::TFR_cPt:
+  case Hexagon::TFR_cdnPt:
+  case Hexagon::TFRI_cPt:
+  case Hexagon::TFRI_cdnPt:
+  case Hexagon::STrib_cPt :
+  case Hexagon::STrib_cdnPt_V4 :
+  case Hexagon::STrib_indexed_cPt :
+  case Hexagon::STrib_indexed_cdnPt_V4 :
+  case Hexagon::STrib_indexed_shl_cPt_V4 :
+  case Hexagon::STrib_indexed_shl_cdnPt_V4 :
+  case Hexagon::POST_STbri_cPt :
+  case Hexagon::POST_STbri_cdnPt_V4 :
+  case Hexagon::STrih_cPt :
+  case Hexagon::STrih_cdnPt_V4 :
+  case Hexagon::STrih_indexed_cPt :
+  case Hexagon::STrih_indexed_cdnPt_V4 :
+  case Hexagon::STrih_indexed_shl_cPt_V4 :
+  case Hexagon::STrih_indexed_shl_cdnPt_V4 :
+  case Hexagon::POST_SThri_cPt :
+  case Hexagon::POST_SThri_cdnPt_V4 :
+  case Hexagon::STriw_cPt :
+  case Hexagon::STriw_cdnPt_V4 :
+  case Hexagon::STriw_indexed_cPt :
+  case Hexagon::STriw_indexed_cdnPt_V4 :
+  case Hexagon::STriw_indexed_shl_cPt_V4 :
+  case Hexagon::STriw_indexed_shl_cdnPt_V4 :
+  case Hexagon::POST_STwri_cPt :
+  case Hexagon::POST_STwri_cdnPt_V4 :
+  case Hexagon::STrib_imm_cPt_V4 :
+  case Hexagon::STrib_imm_cdnPt_V4 :
+  case Hexagon::STrid_cPt :
+  case Hexagon::STrid_cdnPt_V4 :
+  case Hexagon::STrid_indexed_cPt :
+  case Hexagon::STrid_indexed_cdnPt_V4 :
+  case Hexagon::STrid_indexed_shl_cPt_V4 :
+  case Hexagon::STrid_indexed_shl_cdnPt_V4 :
+  case Hexagon::POST_STdri_cPt :
+  case Hexagon::POST_STdri_cdnPt_V4 :
+  case Hexagon::STrih_imm_cPt_V4 :
+  case Hexagon::STrih_imm_cdnPt_V4 :
+  case Hexagon::STriw_imm_cPt_V4 :
+  case Hexagon::STriw_imm_cdnPt_V4 :
+  case Hexagon::JMP_cdnPt :
+  case Hexagon::LDrid_cPt :
+  case Hexagon::LDrid_cdnPt :
+  case Hexagon::LDrid_indexed_cPt :
+  case Hexagon::LDrid_indexed_cdnPt :
+  case Hexagon::POST_LDrid_cPt :
+  case Hexagon::POST_LDrid_cdnPt_V4 :
+  case Hexagon::LDriw_cPt :
+  case Hexagon::LDriw_cdnPt :
+  case Hexagon::LDriw_indexed_cPt :
+  case Hexagon::LDriw_indexed_cdnPt :
+  case Hexagon::POST_LDriw_cPt :
+  case Hexagon::POST_LDriw_cdnPt_V4 :
+  case Hexagon::LDrih_cPt :
+  case Hexagon::LDrih_cdnPt :
+  case Hexagon::LDrih_indexed_cPt :
+  case Hexagon::LDrih_indexed_cdnPt :
+  case Hexagon::POST_LDrih_cPt :
+  case Hexagon::POST_LDrih_cdnPt_V4 :
+  case Hexagon::LDrib_cPt :
+  case Hexagon::LDrib_cdnPt :
+  case Hexagon::LDrib_indexed_cPt :
+  case Hexagon::LDrib_indexed_cdnPt :
+  case Hexagon::POST_LDrib_cPt :
+  case Hexagon::POST_LDrib_cdnPt_V4 :
+  case Hexagon::LDriuh_cPt :
+  case Hexagon::LDriuh_cdnPt :
+  case Hexagon::LDriuh_indexed_cPt :
+  case Hexagon::LDriuh_indexed_cdnPt :
+  case Hexagon::POST_LDriuh_cPt :
+  case Hexagon::POST_LDriuh_cdnPt_V4 :
+  case Hexagon::LDriub_cPt :
+  case Hexagon::LDriub_cdnPt :
+  case Hexagon::LDriub_indexed_cPt :
+  case Hexagon::LDriub_indexed_cdnPt :
+  case Hexagon::POST_LDriub_cPt :
+  case Hexagon::POST_LDriub_cdnPt_V4 :
+  case Hexagon::LDrid_indexed_shl_cPt_V4 :
+  case Hexagon::LDrid_indexed_shl_cdnPt_V4 :
+  case Hexagon::LDrib_indexed_shl_cPt_V4 :
+  case Hexagon::LDrib_indexed_shl_cdnPt_V4 :
+  case Hexagon::LDriub_indexed_shl_cPt_V4 :
+  case Hexagon::LDriub_indexed_shl_cdnPt_V4 :
+  case Hexagon::LDrih_indexed_shl_cPt_V4 :
+  case Hexagon::LDrih_indexed_shl_cdnPt_V4 :
+  case Hexagon::LDriuh_indexed_shl_cPt_V4 :
+  case Hexagon::LDriuh_indexed_shl_cdnPt_V4 :
+  case Hexagon::LDriw_indexed_shl_cPt_V4 :
+  case Hexagon::LDriw_indexed_shl_cdnPt_V4 :
+  case Hexagon::ADD_ri_cPt :
+  case Hexagon::ADD_ri_cdnPt :
+  case Hexagon::ADD_rr_cPt :
+  case Hexagon::ADD_rr_cdnPt :
+  case Hexagon::XOR_rr_cPt :
+  case Hexagon::XOR_rr_cdnPt :
+  case Hexagon::AND_rr_cPt :
+  case Hexagon::AND_rr_cdnPt :
+  case Hexagon::OR_rr_cPt :
+  case Hexagon::OR_rr_cdnPt :
+  case Hexagon::SUB_rr_cPt :
+  case Hexagon::SUB_rr_cdnPt :
+  case Hexagon::COMBINE_rr_cPt :
+  case Hexagon::COMBINE_rr_cdnPt :
+  case Hexagon::ASLH_cPt_V4 :
+  case Hexagon::ASLH_cdnPt_V4 :
+  case Hexagon::ASRH_cPt_V4 :
+  case Hexagon::ASRH_cdnPt_V4 :
+  case Hexagon::SXTB_cPt_V4 :
+  case Hexagon::SXTB_cdnPt_V4 :
+  case Hexagon::SXTH_cPt_V4 :
+  case Hexagon::SXTH_cdnPt_V4 :
+  case Hexagon::ZXTB_cPt_V4 :
+  case Hexagon::ZXTB_cdnPt_V4 :
+  case Hexagon::ZXTH_cPt_V4 :
+  case Hexagon::ZXTH_cdnPt_V4 :
+  case Hexagon::LDd_GP_cPt_V4 :
+  case Hexagon::LDb_GP_cPt_V4 :
+  case Hexagon::LDub_GP_cPt_V4 :
+  case Hexagon::LDh_GP_cPt_V4 :
+  case Hexagon::LDuh_GP_cPt_V4 :
+  case Hexagon::LDw_GP_cPt_V4 :
+  case Hexagon::STd_GP_cPt_V4 :
+  case Hexagon::STb_GP_cPt_V4 :
+  case Hexagon::STh_GP_cPt_V4 :
+  case Hexagon::STw_GP_cPt_V4 :
+  case Hexagon::LDd_GP_cdnPt_V4 :
+  case Hexagon::LDb_GP_cdnPt_V4 :
+  case Hexagon::LDub_GP_cdnPt_V4 :
+  case Hexagon::LDh_GP_cdnPt_V4 :
+  case Hexagon::LDuh_GP_cdnPt_V4 :
+  case Hexagon::LDw_GP_cdnPt_V4 :
+  case Hexagon::STd_GP_cdnPt_V4 :
+  case Hexagon::STb_GP_cdnPt_V4 :
+  case Hexagon::STh_GP_cdnPt_V4 :
+  case Hexagon::STw_GP_cdnPt_V4 :
+    return true;
+
+  case Hexagon::TFR_cNotPt:
+  case Hexagon::TFR_cdnNotPt:
+  case Hexagon::TFRI_cNotPt:
+  case Hexagon::TFRI_cdnNotPt:
+  case Hexagon::STrib_cNotPt :
+  case Hexagon::STrib_cdnNotPt_V4 :
+  case Hexagon::STrib_indexed_cNotPt :
+  case Hexagon::STrib_indexed_cdnNotPt_V4 :
+  case Hexagon::STrib_indexed_shl_cNotPt_V4 :
+  case Hexagon::STrib_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::POST_STbri_cNotPt :
+  case Hexagon::POST_STbri_cdnNotPt_V4 :
+  case Hexagon::STrih_cNotPt :
+  case Hexagon::STrih_cdnNotPt_V4 :
+  case Hexagon::STrih_indexed_cNotPt :
+  case Hexagon::STrih_indexed_cdnNotPt_V4 :
+  case Hexagon::STrih_indexed_shl_cNotPt_V4 :
+  case Hexagon::STrih_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::POST_SThri_cNotPt :
+  case Hexagon::POST_SThri_cdnNotPt_V4 :
+  case Hexagon::STriw_cNotPt :
+  case Hexagon::STriw_cdnNotPt_V4 :
+  case Hexagon::STriw_indexed_cNotPt :
+  case Hexagon::STriw_indexed_cdnNotPt_V4 :
+  case Hexagon::STriw_indexed_shl_cNotPt_V4 :
+  case Hexagon::STriw_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::POST_STwri_cNotPt :
+  case Hexagon::POST_STwri_cdnNotPt_V4 :
+  case Hexagon::STrib_imm_cNotPt_V4 :
+  case Hexagon::STrib_imm_cdnNotPt_V4 :
+  case Hexagon::STrid_cNotPt :
+  case Hexagon::STrid_cdnNotPt_V4 :
+  case Hexagon::STrid_indexed_cdnNotPt_V4 :
+  case Hexagon::STrid_indexed_cNotPt :
+  case Hexagon::STrid_indexed_shl_cNotPt_V4 :
+  case Hexagon::STrid_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::POST_STdri_cNotPt :
+  case Hexagon::POST_STdri_cdnNotPt_V4 :
+  case Hexagon::STrih_imm_cNotPt_V4 :
+  case Hexagon::STrih_imm_cdnNotPt_V4 :
+  case Hexagon::STriw_imm_cNotPt_V4 :
+  case Hexagon::STriw_imm_cdnNotPt_V4 :
+  case Hexagon::JMP_cdnNotPt :
+  case Hexagon::LDrid_cNotPt :
+  case Hexagon::LDrid_cdnNotPt :
+  case Hexagon::LDrid_indexed_cNotPt :
+  case Hexagon::LDrid_indexed_cdnNotPt :
+  case Hexagon::POST_LDrid_cNotPt :
+  case Hexagon::POST_LDrid_cdnNotPt_V4 :
+  case Hexagon::LDriw_cNotPt :
+  case Hexagon::LDriw_cdnNotPt :
+  case Hexagon::LDriw_indexed_cNotPt :
+  case Hexagon::LDriw_indexed_cdnNotPt :
+  case Hexagon::POST_LDriw_cNotPt :
+  case Hexagon::POST_LDriw_cdnNotPt_V4 :
+  case Hexagon::LDrih_cNotPt :
+  case Hexagon::LDrih_cdnNotPt :
+  case Hexagon::LDrih_indexed_cNotPt :
+  case Hexagon::LDrih_indexed_cdnNotPt :
+  case Hexagon::POST_LDrih_cNotPt :
+  case Hexagon::POST_LDrih_cdnNotPt_V4 :
+  case Hexagon::LDrib_cNotPt :
+  case Hexagon::LDrib_cdnNotPt :
+  case Hexagon::LDrib_indexed_cNotPt :
+  case Hexagon::LDrib_indexed_cdnNotPt :
+  case Hexagon::POST_LDrib_cNotPt :
+  case Hexagon::POST_LDrib_cdnNotPt_V4 :
+  case Hexagon::LDriuh_cNotPt :
+  case Hexagon::LDriuh_cdnNotPt :
+  case Hexagon::LDriuh_indexed_cNotPt :
+  case Hexagon::LDriuh_indexed_cdnNotPt :
+  case Hexagon::POST_LDriuh_cNotPt :
+  case Hexagon::POST_LDriuh_cdnNotPt_V4 :
+  case Hexagon::LDriub_cNotPt :
+  case Hexagon::LDriub_cdnNotPt :
+  case Hexagon::LDriub_indexed_cNotPt :
+  case Hexagon::LDriub_indexed_cdnNotPt :
+  case Hexagon::POST_LDriub_cNotPt :
+  case Hexagon::POST_LDriub_cdnNotPt_V4 :
+  case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
+  case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
+  case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
+  case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
+  case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
+  case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
+  case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 :
+  case Hexagon::ADD_ri_cNotPt :
+  case Hexagon::ADD_ri_cdnNotPt :
+  case Hexagon::ADD_rr_cNotPt :
+  case Hexagon::ADD_rr_cdnNotPt :
+  case Hexagon::XOR_rr_cNotPt :
+  case Hexagon::XOR_rr_cdnNotPt :
+  case Hexagon::AND_rr_cNotPt :
+  case Hexagon::AND_rr_cdnNotPt :
+  case Hexagon::OR_rr_cNotPt :
+  case Hexagon::OR_rr_cdnNotPt :
+  case Hexagon::SUB_rr_cNotPt :
+  case Hexagon::SUB_rr_cdnNotPt :
+  case Hexagon::COMBINE_rr_cNotPt :
+  case Hexagon::COMBINE_rr_cdnNotPt :
+  case Hexagon::ASLH_cNotPt_V4 :
+  case Hexagon::ASLH_cdnNotPt_V4 :
+  case Hexagon::ASRH_cNotPt_V4 :
+  case Hexagon::ASRH_cdnNotPt_V4 :
+  case Hexagon::SXTB_cNotPt_V4 :
+  case Hexagon::SXTB_cdnNotPt_V4 :
+  case Hexagon::SXTH_cNotPt_V4 :
+  case Hexagon::SXTH_cdnNotPt_V4 :
+  case Hexagon::ZXTB_cNotPt_V4 :
+  case Hexagon::ZXTB_cdnNotPt_V4 :
+  case Hexagon::ZXTH_cNotPt_V4 :
+  case Hexagon::ZXTH_cdnNotPt_V4 :
+
+  case Hexagon::LDd_GP_cNotPt_V4 :
+  case Hexagon::LDb_GP_cNotPt_V4 :
+  case Hexagon::LDub_GP_cNotPt_V4 :
+  case Hexagon::LDh_GP_cNotPt_V4 :
+  case Hexagon::LDuh_GP_cNotPt_V4 :
+  case Hexagon::LDw_GP_cNotPt_V4 :
+  case Hexagon::STd_GP_cNotPt_V4 :
+  case Hexagon::STb_GP_cNotPt_V4 :
+  case Hexagon::STh_GP_cNotPt_V4 :
+  case Hexagon::STw_GP_cNotPt_V4 :
+  case Hexagon::LDd_GP_cdnNotPt_V4 :
+  case Hexagon::LDb_GP_cdnNotPt_V4 :
+  case Hexagon::LDub_GP_cdnNotPt_V4 :
+  case Hexagon::LDh_GP_cdnNotPt_V4 :
+  case Hexagon::LDuh_GP_cdnNotPt_V4 :
+  case Hexagon::LDw_GP_cdnNotPt_V4 :
+  case Hexagon::STd_GP_cdnNotPt_V4 :
+  case Hexagon::STb_GP_cdnNotPt_V4 :
+  case Hexagon::STh_GP_cdnNotPt_V4 :
+  case Hexagon::STw_GP_cdnNotPt_V4 :
+    return false;
+  }
+  // return *some value* to avoid compiler warning
+  return false;
+}
+
+static MachineOperand& GetPostIncrementOperand(MachineInstr *MI,
+                                               const HexagonInstrInfo *QII) {
+  assert(QII->isPostIncrement(MI) && "Not a post increment operation.");
+#ifndef NDEBUG
+  // Post Increment means duplicates. Use dense map to find duplicates in the
+  // list. Caution: Densemap initializes with the minimum of 64 buckets,
+  // whereas there are at most 5 operands in the post increment.
+  DenseMap<unsigned,  unsigned> DefRegsSet;
+  for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++)
+    if (MI->getOperand(opNum).isReg() &&
+        MI->getOperand(opNum).isDef()) {
+      DefRegsSet[MI->getOperand(opNum).getReg()] = 1;
+    }
+
+  for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++)
+    if (MI->getOperand(opNum).isReg() &&
+        MI->getOperand(opNum).isUse()) {
+      if (DefRegsSet[MI->getOperand(opNum).getReg()]) {
+        return MI->getOperand(opNum);
+      }
+    }
+#else
+  if (MI->getDesc().mayLoad()) {
+    // The 2nd operand is always the post increment operand in load.
+    assert(MI->getOperand(1).isReg() &&
+                "Post increment operand has be to a register.");
+    return (MI->getOperand(1));
+  }
+  if (MI->getDesc().mayStore()) {
+    // The 1st operand is always the post increment operand in store.
+    assert(MI->getOperand(0).isReg() &&
+                "Post increment operand has be to a register.");
+    return (MI->getOperand(0));
+  }
+#endif
+  // we should never come here.
+  llvm_unreachable("mayLoad or mayStore not set for Post Increment operation");
+}
+
+// get the value being stored
+static MachineOperand& GetStoreValueOperand(MachineInstr *MI) {
+  // value being stored is always the last operand.
+  return (MI->getOperand(MI->getNumOperands()-1));
+}
+
+// can be new value store?
+// Following restrictions are to be respected in convert a store into
+// a new value store.
+// 1. If an instruction uses auto-increment, its address register cannot
+//    be a new-value register. Arch Spec 5.4.2.1
+// 2. If an instruction uses absolute-set addressing mode,
+//    its address register cannot be a new-value register.
+//    Arch Spec 5.4.2.1.TODO: This is not enabled as
+//    as absolute-set address mode patters are not implemented.
+// 3. If an instruction produces a 64-bit result, its registers cannot be used
+//    as new-value registers. Arch Spec 5.4.2.2.
+// 4. If the instruction that sets a new-value register is conditional, then
+//    the instruction that uses the new-value register must also be conditional,
+//    and both must always have their predicates evaluate identically.
+//    Arch Spec 5.4.2.3.
+// 5. There is an implied restriction of a packet can not have another store,
+//    if there is a  new value store in the packet. Corollary, if there is
+//    already a store in a packet, there can not be a new value store.
+//    Arch Spec: 3.4.4.2
+bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
+                MachineInstr *PacketMI, unsigned DepReg,
+                std::map <MachineInstr*, SUnit*> MIToSUnit)
+{
+  // Make sure we are looking at the store
+  if (!IsNewifyStore(MI))
+    return false;
+
+  // Make sure there is dependency and can be new value'ed
+  if (GetStoreValueOperand(MI).isReg() &&
+      GetStoreValueOperand(MI).getReg() != DepReg)
+    return false;
+
+  const HexagonRegisterInfo* QRI = 
+                            (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const MCInstrDesc& MCID = PacketMI->getDesc();
+  // first operand is always the result
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF);
+
+  // if there is already an store in the packet, no can do new value store
+  // Arch Spec 3.4.4.2.
+  for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(),
+         VE = CurrentPacketMIs.end();
+       (VI != VE); ++VI) {
+    SUnit* PacketSU = MIToSUnit[*VI];
+    if (PacketSU->getInstr()->getDesc().mayStore() ||
+        // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME,
+        // then we don't need this
+        PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
+        PacketSU->getInstr()->getOpcode() == Hexagon::DEALLOCFRAME)
+      return false;
+  }
+
+  if (PacketRC == &Hexagon::DoubleRegsRegClass) {
+    // new value store constraint: double regs can not feed into new value store
+    // arch spec section: 5.4.2.2
+    return false;
+  }
+
+  // Make sure it's NOT the post increment register that we are going to
+  // new value.
+  if (QII->isPostIncrement(MI) &&
+      MI->getDesc().mayStore() &&
+      GetPostIncrementOperand(MI, QII).getReg() == DepReg) {
+    return false;
+  }
+
+  if (QII->isPostIncrement(PacketMI) &&
+      PacketMI->getDesc().mayLoad() &&
+      GetPostIncrementOperand(PacketMI, QII).getReg() == DepReg) {
+    // if source is post_inc, or absolute-set addressing,
+    // it can not feed into new value store
+    //  r3 = memw(r2++#4)
+    //  memw(r30 + #-1404) = r2.new -> can not be new value store
+    // arch spec section: 5.4.2.1
+    return false;
+  }
+
+  // If the source that feeds the store is predicated, new value store must
+  // also be also predicated.
+  if (QII->isPredicated(PacketMI)) {
+    if (!QII->isPredicated(MI))
+      return false;
+
+    // Check to make sure that they both will have their predicates
+    // evaluate identically
+    unsigned predRegNumSrc = 0;
+    unsigned predRegNumDst = 0;
+    const TargetRegisterClass* predRegClass = NULL;
+
+    // Get predicate register used in the source instruction
+    for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
+      if ( PacketMI->getOperand(opNum).isReg())
+      predRegNumSrc = PacketMI->getOperand(opNum).getReg();
+      predRegClass = QRI->getMinimalPhysRegClass(predRegNumSrc);
+      if (predRegClass == &Hexagon::PredRegsRegClass) {
+        break;
+      }
+    }
+    assert ((predRegClass == &Hexagon::PredRegsRegClass ) &&
+        ("predicate register not found in a predicated PacketMI instruction"));
+
+    // Get predicate register used in new-value store instruction
+    for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) {
+      if ( MI->getOperand(opNum).isReg())
+      predRegNumDst = MI->getOperand(opNum).getReg();
+      predRegClass = QRI->getMinimalPhysRegClass(predRegNumDst);
+      if (predRegClass == &Hexagon::PredRegsRegClass) {
+        break;
+      }
+    }
+    assert ((predRegClass == &Hexagon::PredRegsRegClass ) &&
+            ("predicate register not found in a predicated MI instruction"));
+
+    // New-value register producer and user (store) need to satisfy these
+    // constraints:
+    // 1) Both instructions should be predicated on the same register.
+    // 2) If producer of the new-value register is .new predicated then store
+    // should also be .new predicated and if producer is not .new predicated
+    // then store should not be .new predicated.
+    // 3) Both new-value register producer and user should have same predicate
+    // sense, i.e, either both should be negated or both should be none negated.
+
+    if (( predRegNumDst != predRegNumSrc) ||
+          QII->isDotNewInst(PacketMI) != QII->isDotNewInst(MI)  ||
+          GetPredicateSense(MI, QII) != GetPredicateSense(PacketMI, QII)) {
+      return false;
+    }
+  }
+
+  // Make sure that other than the new-value register no other store instruction
+  // register has been modified in the same packet. Predicate registers can be
+  // modified by they should not be modified between the producer and the store
+  // instruction as it will make them both conditional on different values.
+  // We already know this to be true for all the instructions before and
+  // including PacketMI. Howerver, we need to perform the check for the
+  // remaining instructions in the packet.
+
+  std::vector<MachineInstr*>::iterator VI;
+  std::vector<MachineInstr*>::iterator VE;
+  unsigned StartCheck = 0;
+
+  for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end();
+      (VI != VE); ++VI) {
+    SUnit* TempSU = MIToSUnit[*VI];
+    MachineInstr* TempMI = TempSU->getInstr();
+
+    // Following condition is true for all the instructions until PacketMI is
+    // reached (StartCheck is set to 0 before the for loop).
+    // StartCheck flag is 1 for all the instructions after PacketMI.
+    if (TempMI != PacketMI && !StartCheck) // start processing only after
+      continue;                            // encountering PacketMI
+
+    StartCheck = 1;
+    if (TempMI == PacketMI) // We don't want to check PacketMI for dependence
+      continue;
+
+    for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) {
+      if (MI->getOperand(opNum).isReg() &&
+          TempSU->getInstr()->modifiesRegister(MI->getOperand(opNum).getReg(),
+                                               QRI))
+        return false;
+    }
+  }
+
+  // Make sure that for non POST_INC stores:
+  // 1. The only use of reg is DepReg and no other registers.
+  //    This handles V4 base+index registers.
+  //    The following store can not be dot new.
+  //    Eg.   r0 = add(r0, #3)a
+  //          memw(r1+r0<<#2) = r0
+  if (!QII->isPostIncrement(MI) &&
+      GetStoreValueOperand(MI).isReg() &&
+      GetStoreValueOperand(MI).getReg() == DepReg) {
+    for(unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) {
+      if (MI->getOperand(opNum).isReg() &&
+          MI->getOperand(opNum).getReg() == DepReg) {
+        return false;
+      }
+    }
+    // 2. If data definition is because of implicit definition of the register,
+    //    do not newify the store. Eg.
+    //    %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
+    //    STrih_indexed %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
+    for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
+      if (PacketMI->getOperand(opNum).isReg() &&
+          PacketMI->getOperand(opNum).getReg() == DepReg &&
+          PacketMI->getOperand(opNum).isDef() &&
+          PacketMI->getOperand(opNum).isImplicit()) {
+        return false;
+      }
+    }
+  }
+
+  // Can be dot new store.
+  return true;
+}
+
+// can this MI to promoted to either
+// new value store or new value jump
+bool HexagonPacketizerList::CanPromoteToNewValue( MachineInstr *MI,
+                SUnit *PacketSU, unsigned DepReg,
+                std::map <MachineInstr*, SUnit*> MIToSUnit,
+                MachineBasicBlock::iterator &MII)
+{
+
+  const HexagonRegisterInfo* QRI =
+                            (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  if (!QRI->Subtarget.hasV4TOps() ||
+      !IsNewifyStore(MI))
+    return false;
+
+  MachineInstr *PacketMI = PacketSU->getInstr();
+
+  // Check to see the store can be new value'ed.
+  if (CanPromoteToNewValueStore(MI, PacketMI, DepReg, MIToSUnit))
+    return true;
+
+  // Check to see the compare/jump can be new value'ed.
+  // This is done as a pass on its own. Don't need to check it here.
+  return false;
+}
+
+// Check to see if an instruction can be dot new
+// There are three kinds.
+// 1. dot new on predicate - V2/V3/V4
+// 2. dot new on stores NV/ST - V4
+// 3. dot new on jump NV/J - V4 -- This is generated in a pass.
+bool HexagonPacketizerList::CanPromoteToDotNew( MachineInstr *MI,
+                              SUnit *PacketSU, unsigned DepReg,
+                              std::map <MachineInstr*, SUnit*> MIToSUnit,
+                              MachineBasicBlock::iterator &MII,
+                              const TargetRegisterClass* RC )
+{
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  // Already a dot new instruction.
+  if (QII->isDotNewInst(MI) && !IsNewifyStore(MI))
+    return false;
+
+  if (!isNewifiable(MI))
+    return false;
+
+  // predicate .new
+  if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI))
+      return true;
+  else if (RC != &Hexagon::PredRegsRegClass &&
+      !IsNewifyStore(MI)) // MI is not a new-value store
+    return false;
+  else {
+    // Create a dot new machine instruction to see if resources can be
+    // allocated. If not, bail out now.
+    int NewOpcode = GetDotNewOp(MI->getOpcode());
+    const MCInstrDesc &desc = QII->get(NewOpcode);
+    DebugLoc dl;
+    MachineInstr *NewMI =
+                    MI->getParent()->getParent()->CreateMachineInstr(desc, dl);
+    bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI);
+    MI->getParent()->getParent()->DeleteMachineInstr(NewMI);
+
+    if (!ResourcesAvailable)
+      return false;
+
+    // new value store only
+    // new new value jump generated as a passes
+    if (!CanPromoteToNewValue(MI, PacketSU, DepReg, MIToSUnit, MII)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Go through the packet instructions and search for anti dependency
+// between them and DepReg from MI
+// Consider this case:
+// Trying to add
+// a) %R1<def> = TFRI_cdNotPt %P3, 2
+// to this packet:
+// {
+//   b) %P0<def> = OR_pp %P3<kill>, %P0<kill>
+//   c) %P3<def> = TFR_PdRs %R23
+//   d) %R1<def> = TFRI_cdnPt %P3, 4
+//  }
+// The P3 from a) and d) will be complements after
+// a)'s P3 is converted to .new form
+// Anti Dep between c) and b) is irrelevant for this case
+bool HexagonPacketizerList::RestrictingDepExistInPacket (MachineInstr* MI,
+      unsigned DepReg,
+      std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  SUnit* PacketSUDep = MIToSUnit[MI];
+
+  for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+       VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+    // We only care for dependencies to predicated instructions
+    if(!QII->isPredicated(*VIN)) continue;
+
+    // Scheduling Unit for current insn in the packet
+    SUnit* PacketSU = MIToSUnit[*VIN];
+
+    // Look at dependencies between current members of the packet
+    // and predicate defining instruction MI.
+    // Make sure that dependency is on the exact register
+    // we care about.
+    if (PacketSU->isSucc(PacketSUDep)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        if ((PacketSU->Succs[i].getSUnit() == PacketSUDep) &&
+            (PacketSU->Succs[i].getKind() == SDep::Anti) &&
+            (PacketSU->Succs[i].getReg() == DepReg)) {
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+
+// Given two predicated instructions, this function detects whether
+// the predicates are complements
+bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
+     MachineInstr* MI2, std::map <MachineInstr*, SUnit*> MIToSUnit) {
+
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  // Currently can only reason about conditional transfers
+  if (!QII->isConditionalTransfer(MI1) || !QII->isConditionalTransfer(MI2)) {
+    return false;
+  }
+
+  // Scheduling unit for candidate
+  SUnit* SU = MIToSUnit[MI1];
+
+  // One corner case deals with the following scenario:
+  // Trying to add
+  // a) %R24<def> = TFR_cPt %P0, %R25
+  // to this packet:
+  //
+  // {
+  //   b) %R25<def> = TFR_cNotPt %P0, %R24
+  //   c) %P0<def> = CMPEQri %R26, 1
+  // }
+  //
+  // On general check a) and b) are complements, but
+  // presence of c) will convert a) to .new form, and
+  // then it is not a complement
+  // We attempt to detect it by analyzing  existing
+  // dependencies in the packet
+
+  // Analyze relationships between all existing members of the packet.
+  // Look for Anti dependecy on the same predicate reg
+  // as used in the candidate
+  for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(),
+       VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) {
+
+    // Scheduling Unit for current insn in the packet
+    SUnit* PacketSU = MIToSUnit[*VIN];
+
+    // If this instruction in the packet is succeeded by the candidate...
+    if (PacketSU->isSucc(SU)) {
+      for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) {
+        // The corner case exist when there is true data
+        // dependency between candidate and one of current
+        // packet members, this dep is on predicate reg, and
+        // there already exist anti dep on the same pred in
+        // the packet.
+        if (PacketSU->Succs[i].getSUnit() == SU &&
+            Hexagon::PredRegsRegClass.contains(
+              PacketSU->Succs[i].getReg()) &&
+            PacketSU->Succs[i].getKind() == SDep::Data &&
+            // Here I know that *VIN is predicate setting instruction
+            // with true data dep to candidate on the register
+            // we care about - c) in the above example.
+            // Now I need to see if there is an anti dependency
+            // from c) to any other instruction in the
+            // same packet on the pred reg of interest
+            RestrictingDepExistInPacket(*VIN,PacketSU->Succs[i].getReg(),
+                                        MIToSUnit)) {
+           return false;
+        }
+      }
+    }
+  }
+
+  // If the above case does not apply, check regular
+  // complement condition.
+  // Check that the predicate register is the same and
+  // that the predicate sense is different
+  // We also need to differentiate .old vs. .new:
+  // !p0 is not complimentary to p0.new
+  return ((MI1->getOperand(1).getReg() == MI2->getOperand(1).getReg()) &&
+          (GetPredicateSense(MI1, QII) != GetPredicateSense(MI2, QII)) &&
+          (QII->isDotNewInst(MI1) == QII->isDotNewInst(MI2)));
+}
+
+// initPacketizerState - Initialize packetizer flags
+void HexagonPacketizerList::initPacketizerState() {
+
+  Dependence = false;
+  PromotedToDotNew = false;
+  GlueToNewValueJump = false;
+  GlueAllocframeStore = false;
+  FoundSequentialDependence = false;
+
+  return;
+}
+
+// ignorePseudoInstruction - Ignore bundling of pseudo instructions.
+bool HexagonPacketizerList::ignorePseudoInstruction(MachineInstr *MI,
+                                                    MachineBasicBlock *MBB) {
+  if (MI->isDebugValue())
+    return true;
+
+  // We must print out inline assembly
+  if (MI->isInlineAsm())
+    return false;
+
+  // We check if MI has any functional units mapped to it.
+  // If it doesn't, we ignore the instruction.
+  const MCInstrDesc& TID = MI->getDesc();
+  unsigned SchedClass = TID.getSchedClass();
+  const InstrStage* IS =
+                    ResourceTracker->getInstrItins()->beginStage(SchedClass);
+  unsigned FuncUnits = IS->getUnits();
+  return !FuncUnits;
+}
+
+// isSoloInstruction: - Returns true for instructions that must be
+// scheduled in their own packet.
+bool HexagonPacketizerList::isSoloInstruction(MachineInstr *MI) {
+
+  if (MI->isInlineAsm())
+    return true;
+
+  if (MI->isEHLabel())
+    return true;
+
+  // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints:
+  // trap, pause, barrier, icinva, isync, and syncht are solo instructions.
+  // They must not be grouped with other instructions in a packet.
+  if (IsSchedBarrier(MI))
+    return true;
+
+  return false;
+}
+
+// isLegalToPacketizeTogether:
+// SUI is the current instruction that is out side of the current packet.
+// SUJ is the current instruction inside the current packet against which that
+// SUI will be packetized.
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  MachineInstr *J = SUJ->getInstr();
+  assert(I && J && "Unable to packetize null instruction!");
+
+  const MCInstrDesc &MCIDI = I->getDesc();
+  const MCInstrDesc &MCIDJ = J->getDesc();
+
+  MachineBasicBlock::iterator II = I;
+
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+  const HexagonRegisterInfo* QRI =
+                      (const HexagonRegisterInfo *) TM.getRegisterInfo();
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  // Inline asm cannot go in the packet.
+  if (I->getOpcode() == Hexagon::INLINEASM)
+    llvm_unreachable("Should not meet inline asm here!");
+
+  if (isSoloInstruction(I))
+    llvm_unreachable("Should not meet solo instr here!");
+
+  // A save callee-save register function call can only be in a packet
+  // with instructions that don't write to the callee-save registers.
+  if ((QII->isSaveCalleeSavedRegsCall(I) &&
+       DoesModifyCalleeSavedReg(J, QRI)) ||
+      (QII->isSaveCalleeSavedRegsCall(J) &&
+       DoesModifyCalleeSavedReg(I, QRI))) {
+    Dependence = true;
+    return false;
+  }
+
+  // Two control flow instructions cannot go in the same packet.
+  if (IsControlFlow(I) && IsControlFlow(J)) {
+    Dependence = true;
+    return false;
+  }
+
+  // A LoopN instruction cannot appear in the same packet as a jump or call.
+  if (IsLoopN(I) && (   IsDirectJump(J)
+                     || MCIDJ.isCall()
+                     || QII->isDeallocRet(J))) {
+    Dependence = true;
+    return false;
+  }
+  if (IsLoopN(J) && (   IsDirectJump(I)
+                     || MCIDI.isCall()
+                     || QII->isDeallocRet(I))) {
+    Dependence = true;
+    return false;
+  }
+
+  // dealloc_return cannot appear in the same packet as a conditional or
+  // unconditional jump.
+  if (QII->isDeallocRet(I) && (   MCIDJ.isBranch()
+                               || MCIDJ.isCall()
+                               || MCIDJ.isBarrier())) {
+    Dependence = true;
+    return false;
+  }
+
+
+  // V4 allows dual store. But does not allow second store, if the
+  // first store is not in SLOT0. New value store, new value jump,
+  // dealloc_return and memop always take SLOT0.
+  // Arch spec 3.4.4.2
+  if (QRI->Subtarget.hasV4TOps()) {
+    if (MCIDI.mayStore() && MCIDJ.mayStore() &&
+       (QII->isNewValueInst(J) || QII->isMemOp(J) || QII->isMemOp(I))) {
+      Dependence = true;
+      return false;
+    }
+
+    if ((QII->isMemOp(J) && MCIDI.mayStore())
+        || (MCIDJ.mayStore() && QII->isMemOp(I))
+        || (QII->isMemOp(J) && QII->isMemOp(I))) {
+      Dependence = true;
+      return false;
+    }
+
+    //if dealloc_return
+    if (MCIDJ.mayStore() && QII->isDeallocRet(I)){
+      Dependence = true;
+      return false;
+    }
+
+    // If an instruction feeds new value jump, glue it.
+    MachineBasicBlock::iterator NextMII = I;
+    ++NextMII;
+    MachineInstr *NextMI = NextMII;
+
+    if (QII->isNewValueJump(NextMI)) {
+
+      bool secondRegMatch = false;
+      bool maintainNewValueJump = false;
+
+      if (NextMI->getOperand(1).isReg() &&
+          I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) {
+        secondRegMatch = true;
+        maintainNewValueJump = true;
+      }
+
+      if (!secondRegMatch &&
+           I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) {
+        maintainNewValueJump = true;
+      }
+
+      for (std::vector<MachineInstr*>::iterator
+            VI = CurrentPacketMIs.begin(),
+             VE = CurrentPacketMIs.end();
+           (VI != VE && maintainNewValueJump); ++VI) {
+        SUnit* PacketSU = MIToSUnit[*VI];
+
+        // NVJ can not be part of the dual jump - Arch Spec: section 7.8
+        if (PacketSU->getInstr()->getDesc().isCall()) {
+          Dependence = true;
+          break;
+        }
+        // Validate
+        // 1. Packet does not have a store in it.
+        // 2. If the first operand of the nvj is newified, and the second
+        //    operand is also a reg, it (second reg) is not defined in
+        //    the same packet.
+        // 3. If the second operand of the nvj is newified, (which means
+        //    first operand is also a reg), first reg is not defined in
+        //    the same packet.
+        if (PacketSU->getInstr()->getDesc().mayStore()               ||
+            PacketSU->getInstr()->getOpcode() == Hexagon::ALLOCFRAME ||
+            // Check #2.
+            (!secondRegMatch && NextMI->getOperand(1).isReg() &&
+             PacketSU->getInstr()->modifiesRegister(
+                               NextMI->getOperand(1).getReg(), QRI)) ||
+            // Check #3.
+            (secondRegMatch &&
+             PacketSU->getInstr()->modifiesRegister(
+                               NextMI->getOperand(0).getReg(), QRI))) {
+          Dependence = true;
+          break;
+        }
+      }
+      if (!Dependence)
+        GlueToNewValueJump = true;
+      else
+        return false;
+    }
+  }
+
+  if (SUJ->isSucc(SUI)) {
+    for (unsigned i = 0;
+         (i < SUJ->Succs.size()) && !FoundSequentialDependence;
+         ++i) {
+
+      if (SUJ->Succs[i].getSUnit() != SUI) {
+        continue;
+      }
+
+      SDep::Kind DepType = SUJ->Succs[i].getKind();
+
+      // For direct calls:
+      // Ignore register dependences for call instructions for
+      // packetization purposes except for those due to r31 and
+      // predicate registers.
+      //
+      // For indirect calls:
+      // Same as direct calls + check for true dependences to the register
+      // used in the indirect call.
+      //
+      // We completely ignore Order dependences for call instructions
+      //
+      // For returns:
+      // Ignore register dependences for return instructions like jumpr,
+      // dealloc return unless we have dependencies on the explicit uses
+      // of the registers used by jumpr (like r31) or dealloc return
+      // (like r29 or r30).
+      //
+      // TODO: Currently, jumpr is handling only return of r31. So, the
+      // following logic (specificaly IsCallDependent) is working fine.
+      // We need to enable jumpr for register other than r31 and then,
+      // we need to rework the last part, where it handles indirect call
+      // of that (IsCallDependent) function. Bug 6216 is opened for this.
+      //
+      unsigned DepReg = 0;
+      const TargetRegisterClass* RC = NULL;
+      if (DepType == SDep::Data) {
+        DepReg = SUJ->Succs[i].getReg();
+        RC = QRI->getMinimalPhysRegClass(DepReg);
+      }
+      if ((MCIDI.isCall() || MCIDI.isReturn()) &&
+          (!IsRegDependence(DepType) ||
+            !IsCallDependent(I, DepType, SUJ->Succs[i].getReg()))) {
+        /* do nothing */
+      }
+
+      // For instructions that can be promoted to dot-new, try to promote.
+      else if ((DepType == SDep::Data) &&
+               CanPromoteToDotNew(I, SUJ, DepReg, MIToSUnit, II, RC) &&
+               PromoteToDotNew(I, DepType, II, RC)) {
+        PromotedToDotNew = true;
+        /* do nothing */
+      }
+
+      else if ((DepType == SDep::Data) &&
+               (QII->isNewValueJump(I))) {
+        /* do nothing */
+      }
+
+      // For predicated instructions, if the predicates are complements
+      // then there can be no dependence.
+      else if (QII->isPredicated(I) &&
+               QII->isPredicated(J) &&
+          ArePredicatesComplements(I, J, MIToSUnit)) {
+        /* do nothing */
+
+      }
+      else if (IsDirectJump(I) &&
+               !MCIDJ.isBranch() &&
+               !MCIDJ.isCall() &&
+               (DepType == SDep::Order)) {
+        // Ignore Order dependences between unconditional direct branches
+        // and non-control-flow instructions
+        /* do nothing */
+      }
+      else if (MCIDI.isConditionalBranch() && (DepType != SDep::Data) &&
+               (DepType != SDep::Output)) {
+        // Ignore all dependences for jumps except for true and output
+        // dependences
+        /* do nothing */
+      }
+
+      // Ignore output dependences due to superregs. We can
+      // write to two different subregisters of R1:0 for instance
+      // in the same cycle
+      //
+
+      //
+      // Let the
+      // If neither I nor J defines DepReg, then this is a
+      // superfluous output dependence. The dependence must be of the
+      // form:
+      //  R0 = ...
+      //  R1 = ...
+      // and there is an output dependence between the two instructions
+      // with
+      // DepReg = D0
+      // We want to ignore these dependences.
+      // Ideally, the dependence constructor should annotate such
+      // dependences. We can then avoid this relatively expensive check.
+      //
+      else if (DepType == SDep::Output) {
+        // DepReg is the register that's responsible for the dependence.
+        unsigned DepReg = SUJ->Succs[i].getReg();
+
+        // Check if I and J really defines DepReg.
+        if (I->definesRegister(DepReg) ||
+            J->definesRegister(DepReg)) {
+          FoundSequentialDependence = true;
+          break;
+        }
+      }
+
+      // We ignore Order dependences for
+      // 1. Two loads unless they are volatile.
+      // 2. Two stores in V4 unless they are volatile.
+      else if ((DepType == SDep::Order) &&
+               !I->hasOrderedMemoryRef() &&
+               !J->hasOrderedMemoryRef()) {
+        if (QRI->Subtarget.hasV4TOps() &&
+            // hexagonv4 allows dual store.
+            MCIDI.mayStore() && MCIDJ.mayStore()) {
+          /* do nothing */
+        }
+        // store followed by store-- not OK on V2
+        // store followed by load -- not OK on all (OK if addresses
+        // are not aliased)
+        // load followed by store -- OK on all
+        // load followed by load  -- OK on all
+        else if ( !MCIDJ.mayStore()) {
+          /* do nothing */
+        }
+        else {
+          FoundSequentialDependence = true;
+          break;
+        }
+      }
+
+      // For V4, special case ALLOCFRAME. Even though there is dependency
+      // between ALLOCAFRAME and subsequent store, allow it to be
+      // packetized in a same packet. This implies that the store is using
+      // caller's SP. Hense, offset needs to be updated accordingly.
+      else if (DepType == SDep::Data
+               && QRI->Subtarget.hasV4TOps()
+               && J->getOpcode() == Hexagon::ALLOCFRAME
+               && (I->getOpcode() == Hexagon::STrid
+                   || I->getOpcode() == Hexagon::STriw
+                   || I->getOpcode() == Hexagon::STrib)
+               && I->getOperand(0).getReg() == QRI->getStackRegister()
+               && QII->isValidOffset(I->getOpcode(),
+                                     I->getOperand(1).getImm() -
+                                     (FrameSize + HEXAGON_LRFP_SIZE)))
+      {
+        GlueAllocframeStore = true;
+        // Since this store is to be glued with allocframe in the same
+        // packet, it will use SP of the previous stack frame, i.e
+        // caller's SP. Therefore, we need to recalculate offset according
+        // to this change.
+        I->getOperand(1).setImm(I->getOperand(1).getImm() -
+                                        (FrameSize + HEXAGON_LRFP_SIZE));
+      }
+
+      //
+      // Skip over anti-dependences. Two instructions that are
+      // anti-dependent can share a packet
+      //
+      else if (DepType != SDep::Anti) {
+        FoundSequentialDependence = true;
+        break;
+      }
+    }
+
+    if (FoundSequentialDependence) {
+      Dependence = true;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// isLegalToPruneDependencies
+bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
+  MachineInstr *I = SUI->getInstr();
+  assert(I && SUJ->getInstr() && "Unable to packetize null instruction!");
+
+  const unsigned FrameSize = MF.getFrameInfo()->getStackSize();
+
+  if (Dependence) {
+
+    // Check if the instruction was promoted to a dot-new. If so, demote it
+    // back into a dot-old.
+    if (PromotedToDotNew) {
+      DemoteToDotOld(I);
+    }
+
+    // Check if the instruction (must be a store) was glued with an Allocframe
+    // instruction. If so, restore its offset to its original value, i.e. use
+    // curent SP instead of caller's SP.
+    if (GlueAllocframeStore) {
+      I->getOperand(1).setImm(I->getOperand(1).getImm() +
+                                             FrameSize + HEXAGON_LRFP_SIZE);
+    }
+
+    return false;
+  }
+  return true;
+}
+
+MachineBasicBlock::iterator
+HexagonPacketizerList::addToPacket(MachineInstr *MI) {
+
+    MachineBasicBlock::iterator MII = MI;
+    MachineBasicBlock *MBB = MI->getParent();
+
+    const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+    if (GlueToNewValueJump) {
+
+      ++MII;
+      MachineInstr *nvjMI = MII;
+      assert(ResourceTracker->canReserveResources(MI));
+      ResourceTracker->reserveResources(MI);
+      if ((QII->isExtended(MI) || QII->isConstExtended(MI)) &&
+          !tryAllocateResourcesForConstExt(MI)) {
+        endPacket(MBB, MI);
+        ResourceTracker->reserveResources(MI);
+        assert(canReserveResourcesForConstExt(MI) &&
+               "Ensure that there is a slot");
+        reserveResourcesForConstExt(MI);
+        // Reserve resources for new value jump constant extender.
+        assert(canReserveResourcesForConstExt(MI) &&
+               "Ensure that there is a slot");
+        reserveResourcesForConstExt(nvjMI);
+        assert(ResourceTracker->canReserveResources(nvjMI) &&
+               "Ensure that there is a slot");
+
+      } else if (   // Extended instruction takes two slots in the packet.
+        // Try reserve and allocate 4-byte in the current packet first.
+        (QII->isExtended(nvjMI)
+            && (!tryAllocateResourcesForConstExt(nvjMI)
+                || !ResourceTracker->canReserveResources(nvjMI)))
+        || // For non-extended instruction, no need to allocate extra 4 bytes.
+        (!QII->isExtended(nvjMI) &&
+              !ResourceTracker->canReserveResources(nvjMI)))
+      {
+        endPacket(MBB, MI);
+        // A new and empty packet starts.
+        // We are sure that the resources requirements can be satisfied.
+        // Therefore, do not need to call "canReserveResources" anymore.
+        ResourceTracker->reserveResources(MI);
+        if (QII->isExtended(nvjMI))
+          reserveResourcesForConstExt(nvjMI);
+      }
+      // Here, we are sure that "reserveResources" would succeed.
+      ResourceTracker->reserveResources(nvjMI);
+      CurrentPacketMIs.push_back(MI);
+      CurrentPacketMIs.push_back(nvjMI);
+    } else {
+      if (   (QII->isExtended(MI) || QII->isConstExtended(MI))
+          && (   !tryAllocateResourcesForConstExt(MI)
+              || !ResourceTracker->canReserveResources(MI)))
+      {
+        endPacket(MBB, MI);
+        // Check if the instruction was promoted to a dot-new. If so, demote it
+        // back into a dot-old
+        if (PromotedToDotNew) {
+          DemoteToDotOld(MI);
+        }
+        reserveResourcesForConstExt(MI);
+      }
+      // In case that "MI" is not an extended insn,
+      // the resource availability has already been checked.
+      ResourceTracker->reserveResources(MI);
+      CurrentPacketMIs.push_back(MI);
+    }
+    return MII;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *llvm::createHexagonPacketizer() {
+  return new HexagonPacketizer();
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVarargsCallingConvention.h b/contrib/llvm/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
new file mode 100644
index 000000000000..c607b5d35649
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
@@ -0,0 +1,141 @@
+//===-- HexagonVarargsCallingConvention.h - Calling Conventions -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the functions that assign locations to outgoing function
+// arguments. Adapted from the target independent version but this handles
+// calls to varargs functions
+//
+//===----------------------------------------------------------------------===//
+//
+
+
+
+
+static bool RetCC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
+                                    EVT LocVT, CCValAssign::LocInfo LocInfo,
+                                    ISD::ArgFlagsTy ArgFlags,
+                                    Hexagon_CCState &State,
+                                    int NonVarArgsParams,
+                                    int CurrentParam,
+                                    bool ForceMem);
+
+
+static bool CC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
+                                 EVT LocVT, CCValAssign::LocInfo LocInfo,
+                                 ISD::ArgFlagsTy ArgFlags,
+                                 Hexagon_CCState &State,
+                                 int NonVarArgsParams,
+                                 int CurrentParam,
+                                 bool ForceMem) {
+  unsigned ByValSize = 0;
+  if (ArgFlags.isByVal() &&
+      ((ByValSize = ArgFlags.getByValSize()) >
+       (MVT(MVT::i64).getSizeInBits() / 8))) {
+    ForceMem = true;
+  }
+
+
+  // Only assign registers for named (non varargs) arguments
+  if ( !ForceMem && ((NonVarArgsParams == -1) || (CurrentParam <=
+                                                  NonVarArgsParams))) {
+
+    if (LocVT == MVT::i32 ||
+        LocVT == MVT::i16 ||
+        LocVT == MVT::i8 ||
+        LocVT == MVT::f32) {
+      static const unsigned RegList1[] = {
+        Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
+        Hexagon::R5
+      };
+      if (unsigned Reg = State.AllocateReg(RegList1, 6)) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
+                                         LocVT.getSimpleVT(), LocInfo));
+        return false;
+      }
+    }
+
+    if (LocVT == MVT::i64 ||
+        LocVT == MVT::f64) {
+      static const unsigned RegList2[] = {
+        Hexagon::D0, Hexagon::D1, Hexagon::D2
+      };
+      if (unsigned Reg = State.AllocateReg(RegList2, 3)) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
+                                         LocVT.getSimpleVT(), LocInfo));
+        return false;
+      }
+    }
+  }
+
+  const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
+  unsigned Alignment =
+    State.getTarget().getDataLayout()->getABITypeAlignment(ArgTy);
+  unsigned Size =
+    State.getTarget().getDataLayout()->getTypeSizeInBits(ArgTy) / 8;
+
+  // If it's passed by value, then we need the size of the aggregate not of
+  // the pointer.
+  if (ArgFlags.isByVal()) {
+    Size = ByValSize;
+
+    // Hexagon_TODO: Get the alignment of the contained type here.
+    Alignment = 8;
+  }
+
+  unsigned Offset3 = State.AllocateStack(Size, Alignment);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3,
+                                   LocVT.getSimpleVT(), LocInfo));
+  return false;
+}
+
+
+static bool RetCC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
+                                    EVT LocVT, CCValAssign::LocInfo LocInfo,
+                                    ISD::ArgFlagsTy ArgFlags,
+                                    Hexagon_CCState &State,
+                                    int NonVarArgsParams,
+                                    int CurrentParam,
+                                    bool ForceMem) {
+
+  if (LocVT == MVT::i32 ||
+      LocVT == MVT::f32) {
+    static const unsigned RegList1[] = {
+      Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
+      Hexagon::R5
+    };
+    if (unsigned Reg = State.AllocateReg(RegList1, 6)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
+                                       LocVT.getSimpleVT(), LocInfo));
+      return false;
+    }
+  }
+
+  if (LocVT == MVT::i64 ||
+      LocVT == MVT::f64) {
+    static const unsigned RegList2[] = {
+      Hexagon::D0, Hexagon::D1, Hexagon::D2
+    };
+    if (unsigned Reg = State.AllocateReg(RegList2, 3)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT.getSimpleVT(), Reg,
+                                       LocVT.getSimpleVT(), LocInfo));
+      return false;
+    }
+  }
+
+  const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
+  unsigned Alignment =
+    State.getTarget().getDataLayout()->getABITypeAlignment(ArgTy);
+  unsigned Size =
+    State.getTarget().getDataLayout()->getTypeSizeInBits(ArgTy) / 8;
+
+  unsigned Offset3 = State.AllocateStack(Size, Alignment);
+  State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3,
+                                   LocVT.getSimpleVT(), LocInfo));
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
new file mode 100644
index 000000000000..36da6dfcc3d0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -0,0 +1,210 @@
+//===- HexagonInstPrinter.cpp - Convert Hexagon MCInst to assembly syntax -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Hexagon MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "HexagonAsmPrinter.h"
+#include "Hexagon.h"
+#include "HexagonInstPrinter.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdio>
+
+using namespace llvm;
+
+#define GET_INSTRUCTION_NAME
+#include "HexagonGenAsmWriter.inc"
+
+const char HexagonInstPrinter::PacketPadding = '\t';
+
+StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const {
+  return MII.getName(Opcode);
+}
+
+StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const {
+  return getRegisterName(RegNo);
+}
+
+void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  printInst((const HexagonMCInst*)(MI), O, Annot);
+}
+
+void HexagonInstPrinter::printInst(const HexagonMCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  const char startPacket = '{',
+             endPacket = '}';
+  // TODO: add outer HW loop when it's supported too.
+  if (MI->getOpcode() == Hexagon::ENDLOOP0) {
+    // Ending a harware loop is different from ending an regular packet.
+    assert(MI->isPacketEnd() && "Loop-end must also end the packet");
+
+    if (MI->isPacketStart()) {
+      // There must be a packet to end a loop.
+      // FIXME: when shuffling is always run, this shouldn't be needed.
+      HexagonMCInst Nop;
+      StringRef NoAnnot;
+
+      Nop.setOpcode (Hexagon::NOP);
+      Nop.setPacketStart (MI->isPacketStart());
+      printInst (&Nop, O, NoAnnot);
+    }
+
+    // Close the packet.
+    if (MI->isPacketEnd())
+      O << PacketPadding << endPacket;
+
+    printInstruction(MI, O);
+  }
+  else {
+    // Prefix the insn opening the packet.
+    if (MI->isPacketStart())
+      O << PacketPadding << startPacket << '\n';
+
+    printInstruction(MI, O);
+
+    // Suffix the insn closing the packet.
+    if (MI->isPacketEnd())
+      // Suffix the packet in a new line always, since the GNU assembler has
+      // issues with a closing brace on the same line as CONST{32,64}.
+      O << '\n' << PacketPadding << endPacket;
+  }
+
+  printAnnotation(O, Annot);
+}
+
+void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) const {
+  const MCOperand& MO = MI->getOperand(OpNo);
+
+  if (MO.isReg()) {
+    O << getRegisterName(MO.getReg());
+  } else if(MO.isExpr()) {
+    O << *MO.getExpr();
+  } else if(MO.isImm()) {
+    printImmOperand(MI, OpNo, O);
+  } else {
+    llvm_unreachable("Unknown operand");
+  }
+}
+
+void HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) const {
+  const MCOperand& MO = MI->getOperand(OpNo);
+
+  if(MO.isExpr()) {
+    O << *MO.getExpr();
+  } else if(MO.isImm()) {
+    O << MI->getOperand(OpNo).getImm();
+  } else {
+    llvm_unreachable("Unknown operand");
+  }
+}
+
+void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) const {
+  const HexagonMCInst *HMCI = static_cast<const HexagonMCInst*>(MI);
+  if (HMCI->isConstExtended())
+    O << "#";
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI,
+                                    unsigned OpNo, raw_ostream &O) const {
+  O << MI->getOperand(OpNo).getImm();
+}
+
+void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  O << -MI->getOperand(OpNo).getImm();
+}
+
+void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+  O << -1;
+}
+
+void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) const {
+  const MCOperand& MO0 = MI->getOperand(OpNo);
+  const MCOperand& MO1 = MI->getOperand(OpNo + 1);
+
+  O << getRegisterName(MO0.getReg());
+  O << " + #" << MO1.getImm();
+}
+
+void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) const {
+  const MCOperand& MO0 = MI->getOperand(OpNo);
+  const MCOperand& MO1 = MI->getOperand(OpNo + 1);
+
+  O << getRegisterName(MO0.getReg()) << ", #" << MO1.getImm();
+}
+
+void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) const {
+  assert(MI->getOperand(OpNo).isExpr() && "Expecting expression");
+
+  printOperand(MI, OpNo, O);
+}
+
+void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) const {
+  // Branches can take an immediate operand.  This is used by the branch
+  // selection pass to print $+8, an eight byte displacement from the PC.
+  assert("Unknown branch operand.");
+}
+
+void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) const {
+}
+
+void HexagonInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) const {
+}
+
+void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                                               raw_ostream &O) const {
+}
+
+void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O, bool hi) const {
+  const MCOperand& MO = MI->getOperand(OpNo);
+
+  O << '#' << (hi? "HI": "LO") << '(';
+  if (MO.isImm()) {
+    O << '#';
+    printOperand(MI, OpNo, O);
+  } else {
+    assert("Unknown symbol operand");
+    printOperand(MI, OpNo, O);
+  }
+  O << ')';
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
new file mode 100644
index 000000000000..d0cef683da95
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
@@ -0,0 +1,87 @@
+//===-- HexagonInstPrinter.h - Convert Hexagon MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Hexagon MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONINSTPRINTER_H
+#define HEXAGONINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+namespace llvm {
+  class HexagonMCInst;
+
+  class HexagonInstPrinter : public MCInstPrinter {
+  public:
+    explicit HexagonInstPrinter(const MCAsmInfo &MAI,
+                                const MCInstrInfo &MII,
+                                const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI), MII(MII) {}
+
+    virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+    void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot);
+    virtual StringRef getOpcodeName(unsigned Opcode) const;
+    void printInstruction(const MCInst *MI, raw_ostream &O);
+    StringRef getRegName(unsigned RegNo) const;
+    static const char *getRegisterName(unsigned RegNo);
+
+    void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+    void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+    void printExtOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+    void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) const;
+    void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printFrameIndexOperand(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O) const;
+    void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O)
+           const;
+    void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const;
+
+    void printConstantPool(const MCInst *MI, unsigned OpNo,
+                           raw_ostream &O) const;
+
+    void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
+      { printSymbol(MI, OpNo, O, true); }
+    void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const
+      { printSymbol(MI, OpNo, O, false); }
+
+    const MCInstrInfo &getMII() const {
+      return MII;
+    }
+
+  protected:
+    void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi)
+           const;
+
+    static const char PacketPadding;
+
+  private:
+    const MCInstrInfo &MII;
+
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
new file mode 100644
index 000000000000..d4a93b5c87a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -0,0 +1,183 @@
+//===-- HexagonBaseInfo.h - Top level definitions for Hexagon --*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Hexagon target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONBASEINFO_H
+#define HEXAGONBASEINFO_H
+
+#include "HexagonMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// HexagonII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace HexagonII {
+  // *** The code below must match HexagonInstrFormat*.td *** //
+
+  // Insn types.
+  // *** Must match HexagonInstrFormat*.td ***
+  enum Type {
+    TypePSEUDO  = 0,
+    TypeALU32   = 1,
+    TypeCR      = 2,
+    TypeJR      = 3,
+    TypeJ       = 4,
+    TypeLD      = 5,
+    TypeST      = 6,
+    TypeSYSTEM  = 7,
+    TypeXTYPE   = 8,
+    TypeMEMOP   = 9,
+    TypeNV      = 10,
+    TypePREFIX  = 30, // Such as extenders.
+    TypeENDLOOP = 31  // Such as end of a HW loop.
+  };
+
+  enum SubTarget {
+    HasV2SubT     = 0xf,
+    HasV2SubTOnly = 0x1,
+    NoV2SubT      = 0x0,
+    HasV3SubT     = 0xe,
+    HasV3SubTOnly = 0x2,
+    NoV3SubT      = 0x1,
+    HasV4SubT     = 0xc,
+    NoV4SubT      = 0x3,
+    HasV5SubT     = 0x8,
+    NoV5SubT      = 0x7
+  };
+
+  enum AddrMode {
+    NoAddrMode     = 0,  // No addressing mode
+    Absolute       = 1,  // Absolute addressing mode
+    AbsoluteSet    = 2,  // Absolute set addressing mode
+    BaseImmOffset  = 3,  // Indirect with offset
+    BaseLongOffset = 4,  // Indirect with long offset
+    BaseRegOffset  = 5   // Indirect with register offset
+  };
+
+  enum MemAccessSize {
+    NoMemAccess = 0,            // Not a memory acces instruction.
+    ByteAccess = 1,             // Byte access instruction (memb).
+    HalfWordAccess = 2,         // Half word access instruction (memh).
+    WordAccess = 3,             // Word access instrution (memw).
+    DoubleWordAccess = 4        // Double word access instruction (memd)
+  };
+
+  // MCInstrDesc TSFlags
+  // *** Must match HexagonInstrFormat*.td ***
+  enum {
+    // This 5-bit field describes the insn type.
+    TypePos  = 0,
+    TypeMask = 0x1f,
+
+    // Solo instructions.
+    SoloPos  = 5,
+    SoloMask = 0x1,
+
+    // Predicated instructions.
+    PredicatedPos  = 6,
+    PredicatedMask = 0x1,
+    PredicatedFalsePos  = 7,
+    PredicatedFalseMask = 0x1,
+    PredicatedNewPos  = 8,
+    PredicatedNewMask = 0x1,
+
+    // New-Value consumer instructions.
+    NewValuePos  = 9,
+    NewValueMask = 0x1,
+
+    // New-Value producer instructions.
+    hasNewValuePos  = 10,
+    hasNewValueMask = 0x1,
+
+    // Which operand consumes or produces a new value.
+    NewValueOpPos  = 11,
+    NewValueOpMask = 0x7,
+
+    // Which bits encode the new value.
+    NewValueBitsPos  = 14,
+    NewValueBitsMask = 0x3,
+
+    // Stores that can become new-value stores.
+    mayNVStorePos  = 16,
+    mayNVStoreMask = 0x1,
+
+    // New-value store instructions.
+    NVStorePos  = 17,
+    NVStoreMask = 0x1,
+
+    // Extendable insns.
+    ExtendablePos  = 18,
+    ExtendableMask = 0x1,
+
+    // Insns must be extended.
+    ExtendedPos  = 19,
+    ExtendedMask = 0x1,
+
+    // Which operand may be extended.
+    ExtendableOpPos  = 20,
+    ExtendableOpMask = 0x7,
+
+    // Signed or unsigned range.
+    ExtentSignedPos = 23,
+    ExtentSignedMask = 0x1,
+
+    // Number of bits of range before extending operand.
+    ExtentBitsPos  = 24,
+    ExtentBitsMask = 0x1f,
+
+    // Valid subtargets
+    validSubTargetPos = 29,
+    validSubTargetMask = 0xf,
+
+    // Addressing mode for load/store instructions.
+    AddrModePos = 33,
+    AddrModeMask = 0x7,
+
+    // Access size of memory access instructions (load/store).
+    MemAccessSizePos = 36,
+    MemAccesSizeMask = 0x7
+  };
+
+  // *** The code above must match HexagonInstrFormat*.td *** //
+
+  // Hexagon specific MO operand flag mask.
+  enum HexagonMOTargetFlagVal {
+    //===------------------------------------------------------------------===//
+    // Hexagon Specific MachineOperand flags.
+    MO_NO_FLAG,
+
+    HMOTF_ConstExtended = 1,
+
+    /// MO_PCREL - On a symbol operand, indicates a PC-relative relocation
+    /// Used for computing a global address for PIC compilations
+    MO_PCREL,
+
+    /// MO_GOT - Indicates a GOT-relative relocation
+    MO_GOT,
+
+    // Low or high part of a symbol.
+    MO_LO16, MO_HI16,
+
+    // Offset from the base of the SDA.
+    MO_GPREL
+  };
+
+} // End namespace HexagonII.
+
+} // End namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
new file mode 100644
index 000000000000..3deb8d1deb42
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -0,0 +1,37 @@
+//===-- HexagonMCAsmInfo.cpp - Hexagon asm properties ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the HexagonMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCAsmInfo.h"
+
+using namespace llvm;
+
+HexagonMCAsmInfo::HexagonMCAsmInfo(const Target &T, StringRef TT) {
+  Data16bitsDirective = "\t.half\t";
+  Data32bitsDirective = "\t.word\t";
+  Data64bitsDirective = 0;  // .xword is only supported by V9.
+  ZeroDirective = "\t.skip\t";
+  CommentString = "//";
+  HasLEB128 = true;
+
+  PrivateGlobalPrefix = ".L";
+  LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
+  InlineAsmStart = "# InlineAsm Start";
+  InlineAsmEnd = "# InlineAsm End";
+  ZeroDirective = "\t.space\t";
+  AscizDirective = "\t.string\t";
+  WeakRefDirective = "\t.weak\t";
+
+  SupportsDebugInformation = true;
+  UsesELFSectionDirectiveForBSS  = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
new file mode 100644
index 000000000000..d336cd5be917
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -0,0 +1,30 @@
+//===-- HexagonTargetAsmInfo.h - Hexagon asm properties --------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the HexagonMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HexagonMCASMINFO_H
+#define HexagonMCASMINFO_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+
+  class HexagonMCAsmInfo : public MCAsmInfo {
+  public:
+    explicit HexagonMCAsmInfo(const Target &T, StringRef TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
new file mode 100644
index 000000000000..9260b4a27661
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.cpp
@@ -0,0 +1,175 @@
+//===- HexagonMCInst.cpp - Hexagon sub-class of MCInst --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class extends MCInst to allow some Hexagon VLIW annotations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonInstrInfo.h"
+#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "MCTargetDesc/HexagonMCInst.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+
+using namespace llvm;
+
+// Return the slots used by the insn.
+unsigned HexagonMCInst::getUnits(const HexagonTargetMachine* TM) const {
+  const HexagonInstrInfo* QII = TM->getInstrInfo();
+  const InstrItineraryData* II = TM->getInstrItineraryData();
+  const InstrStage*
+    IS = II->beginStage(QII->get(this->getOpcode()).getSchedClass());
+
+  return (IS->getUnits());
+}
+
+// Return the Hexagon ISA class for the insn.
+unsigned HexagonMCInst::getType() const {
+  const uint64_t F = MCID->TSFlags;
+
+  return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
+}
+
+// Return whether the insn is an actual insn.
+bool HexagonMCInst::isCanon() const {
+  return (!MCID->isPseudo() &&
+          !isPrefix() &&
+          getType() != HexagonII::TypeENDLOOP);
+}
+
+// Return whether the insn is a prefix.
+bool HexagonMCInst::isPrefix() const {
+  return (getType() == HexagonII::TypePREFIX);
+}
+
+// Return whether the insn is solo, i.e., cannot be in a packet.
+bool HexagonMCInst::isSolo() const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
+}
+
+// Return whether the insn is a new-value consumer.
+bool HexagonMCInst::isNewValue() const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
+}
+
+// Return whether the instruction is a legal new-value producer.
+bool HexagonMCInst::hasNewValue() const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask);
+}
+
+// Return the operand that consumes or produces a new value.
+const MCOperand& HexagonMCInst::getNewValue() const {
+  const uint64_t F = MCID->TSFlags;
+  const unsigned O = (F >> HexagonII::NewValueOpPos) &
+                     HexagonII::NewValueOpMask;
+  const MCOperand& MCO = getOperand(O);
+
+  assert ((isNewValue() || hasNewValue()) && MCO.isReg());
+  return (MCO);
+}
+
+// Return whether the instruction needs to be constant extended.
+// 1) Always return true if the instruction has 'isExtended' flag set.
+//
+// isExtendable:
+// 2) For immediate extended operands, return true only if the value is
+//    out-of-range.
+// 3) For global address, always return true.
+
+bool HexagonMCInst::isConstExtended(void) const {
+  if (isExtended())
+    return true;
+
+  if (!isExtendable())
+    return false;
+
+  short ExtOpNum = getCExtOpNum();
+  int MinValue   = getMinValue();
+  int MaxValue   = getMaxValue();
+  const MCOperand& MO = getOperand(ExtOpNum);
+
+  // We could be using an instruction with an extendable immediate and shoehorn
+  // a global address into it. If it is a global address it will be constant
+  // extended. We do this for COMBINE.
+  // We currently only handle isGlobal() because it is the only kind of
+  // object we are going to end up with here for now.
+  // In the future we probably should add isSymbol(), etc.
+  if (MO.isExpr())
+    return true;
+
+  // If the extendable operand is not 'Immediate' type, the instruction should
+  // have 'isExtended' flag set.
+  assert(MO.isImm() && "Extendable operand must be Immediate type");
+
+  int ImmValue = MO.getImm();
+  return (ImmValue < MinValue || ImmValue > MaxValue);
+}
+
+// Return whether the instruction must be always extended.
+bool HexagonMCInst::isExtended(void) const {
+  const uint64_t F = MCID->TSFlags;
+  return (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask;
+}
+
+// Return true if the instruction may be extended based on the operand value.
+bool HexagonMCInst::isExtendable(void) const {
+  const uint64_t F = MCID->TSFlags;
+  return (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask;
+}
+
+// Return number of bits in the constant extended operand.
+unsigned HexagonMCInst::getBitCount(void) const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
+}
+
+// Return constant extended operand number.
+unsigned short HexagonMCInst::getCExtOpNum(void) const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
+}
+
+// Return whether the operand can be constant extended.
+bool HexagonMCInst::isOperandExtended(const unsigned short OperandNum) const {
+  const uint64_t F = MCID->TSFlags;
+  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask)
+          == OperandNum;
+}
+
+// Return the min value that a constant extendable operand can have
+// without being extended.
+int HexagonMCInst::getMinValue(void) const {
+  const uint64_t F = MCID->TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return -1 << (bits - 1);
+  else
+    return 0;
+}
+
+// Return the max value that a constant extendable operand can have
+// without being extended.
+int HexagonMCInst::getMaxValue(void) const {
+  const uint64_t F = MCID->TSFlags;
+  unsigned isSigned = (F >> HexagonII::ExtentSignedPos)
+                    & HexagonII::ExtentSignedMask;
+  unsigned bits =  (F >> HexagonII::ExtentBitsPos)
+                    & HexagonII::ExtentBitsMask;
+
+  if (isSigned) // if value is signed
+    return ~(-1 << (bits - 1));
+  else
+    return ~(-1 << bits);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
new file mode 100644
index 000000000000..3ca71f00b241
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
@@ -0,0 +1,100 @@
+//===- HexagonMCInst.h - Hexagon sub-class of MCInst ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class extends MCInst to allow some VLIW annotations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCINST_H
+#define HEXAGONMCINST_H
+
+#include "HexagonTargetMachine.h"
+#include "llvm/MC/MCInst.h"
+
+namespace llvm {
+  class MCOperand;
+
+  class HexagonMCInst: public MCInst {
+    // MCID is set during instruction lowering.
+    // It is needed in order to access TSFlags for
+    // use in checking MC instruction properties.
+    const MCInstrDesc *MCID;
+
+    // Packet start and end markers
+    unsigned packetStart: 1, packetEnd: 1;
+
+  public:
+    explicit HexagonMCInst():
+      MCInst(), MCID(0), packetStart(0), packetEnd(0) {};
+    HexagonMCInst(const MCInstrDesc& mcid):
+      MCInst(), MCID(&mcid), packetStart(0), packetEnd(0) {};
+
+    bool isPacketStart() const { return (packetStart); };
+    bool isPacketEnd() const { return (packetEnd); };
+    void setPacketStart(bool Y) { packetStart = Y; };
+    void setPacketEnd(bool Y) { packetEnd = Y; };
+    void resetPacket() { setPacketStart(false); setPacketEnd(false); };
+
+    // Return the slots used by the insn.
+    unsigned getUnits(const HexagonTargetMachine* TM) const;
+
+    // Return the Hexagon ISA class for the insn.
+    unsigned getType() const;
+
+    void setDesc(const MCInstrDesc& mcid) { MCID = &mcid; };
+    const MCInstrDesc& getDesc(void) const { return *MCID; };
+
+    // Return whether the insn is an actual insn.
+    bool isCanon() const;
+
+    // Return whether the insn is a prefix.
+    bool isPrefix() const;
+
+    // Return whether the insn is solo, i.e., cannot be in a packet.
+    bool isSolo() const;
+
+    // Return whether the instruction needs to be constant extended.
+    bool isConstExtended() const;
+
+    // Return constant extended operand number.
+    unsigned short getCExtOpNum(void) const;
+
+    // Return whether the insn is a new-value consumer.
+    bool isNewValue() const;
+
+    // Return whether the instruction is a legal new-value producer.
+    bool hasNewValue() const;
+
+    // Return the operand that consumes or produces a new value.
+    const MCOperand& getNewValue() const;
+
+    // Return number of bits in the constant extended operand.
+    unsigned getBitCount(void) const;
+
+  private:
+    // Return whether the instruction must be always extended.
+    bool isExtended() const;
+
+    // Return true if the insn may be extended based on the operand value.
+    bool isExtendable() const;
+
+    // Return true if the operand can be constant extended.
+    bool isOperandExtended(const unsigned short OperandNum) const;
+
+    // Return the min value that a constant extendable operand can have
+    // without being extended.
+    int getMinValue() const;
+
+    // Return the max value that a constant extendable operand can have
+    // without being extended.
+    int getMaxValue() const;
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
new file mode 100644
index 000000000000..6b1d2d161958
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -0,0 +1,97 @@
+//===-- HexagonMCTargetDesc.cpp - Hexagon Target Descriptions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Hexagon specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonMCTargetDesc.h"
+#include "HexagonMCAsmInfo.h"
+#include "InstPrinter/HexagonInstPrinter.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "HexagonGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "HexagonGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "HexagonGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createHexagonMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitHexagonMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitHexagonMCRegisterInfo(X, Hexagon::R0);
+  return X;
+}
+
+static MCSubtargetInfo *createHexagonMCSubtargetInfo(StringRef TT,
+                                                     StringRef CPU,
+                                                     StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitHexagonMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCAsmInfo *createHexagonMCAsmInfo(const Target &T, StringRef TT) {
+  MCAsmInfo *MAI = new HexagonMCAsmInfo(T, TT);
+
+  // VirtualFP = (R30 + #0).
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(Hexagon::R30, 0);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createHexagonMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  // For the time being, use static relocations, since there's really no
+  // support for PIC yet.
+  X->InitMCCodeGenInfo(Reloc::Static, CM, OL);
+  return X;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeHexagonTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheHexagonTarget, createHexagonMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheHexagonTarget,
+                                        createHexagonMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheHexagonTarget, createHexagonMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheHexagonTarget,
+                                    createHexagonMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheHexagonTarget,
+                                          createHexagonMCSubtargetInfo);
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
new file mode 100644
index 000000000000..2238b1ae5f35
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -0,0 +1,39 @@
+//===-- HexagonMCTargetDesc.h - Hexagon Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Hexagon specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMCTARGETDESC_H
+#define HEXAGONMCTARGETDESC_H
+
+namespace llvm {
+class MCSubtargetInfo;
+class Target;
+
+extern Target TheHexagonTarget;
+
+} // End llvm namespace
+
+// Define symbolic names for Hexagon registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "HexagonGenRegisterInfo.inc"
+
+// Defines symbolic names for the Hexagon instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "HexagonGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "HexagonGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
new file mode 100644
index 000000000000..40f6c8d23ea8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- HexagonTargetInfo.cpp - Hexagon Target Implementation ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheHexagonTarget;
+
+extern "C" void LLVMInitializeHexagonTargetInfo() {
+  RegisterTarget<Triple::hexagon, /*HasJIT=*/false>  X(TheHexagonTarget, "hexagon", "Hexagon");
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/contrib/llvm/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
new file mode 100644
index 000000000000..dda6e247ac4f
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
@@ -0,0 +1,572 @@
+//===-- MBlazeAsmParser.cpp - Parse MBlaze asm to MCInst instructions -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MBlazeBaseInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+struct MBlazeOperand;
+
+class MBlazeAsmParser : public MCTargetAsmParser {
+  MCAsmParser &Parser;
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+
+  MBlazeOperand *ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  MBlazeOperand *ParseRegister();
+  MBlazeOperand *ParseRegister(SMLoc &StartLoc, SMLoc &EndLoc);
+  MBlazeOperand *ParseImmediate();
+  MBlazeOperand *ParseFsl();
+  MBlazeOperand* ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+
+  bool ParseDirectiveWord(unsigned Size, SMLoc L);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               MCStreamer &Out, unsigned &ErrorInfo,
+                               bool MatchingInlineAsm);
+
+  /// @name Auto-generated Match Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "MBlazeGenAsmMatcher.inc"
+
+  /// }
+
+public:
+  MBlazeAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
+    : MCTargetAsmParser(), Parser(_Parser) {}
+
+  virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                SMLoc NameLoc,
+                                SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  virtual bool ParseDirective(AsmToken DirectiveID);
+};
+
+/// MBlazeOperand - Instances of this class represent a parsed MBlaze machine
+/// instruction.
+struct MBlazeOperand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Immediate,
+    Register,
+    Memory,
+    Fsl
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned Base;
+    unsigned OffReg;
+    const MCExpr *Off;
+  };
+
+  struct FslImmOp {
+    const MCExpr *Val;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+    struct FslImmOp FslImm;
+  };
+
+  MBlazeOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+public:
+  MBlazeOperand(const MBlazeOperand &o) : MCParsedAsmOperand() {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    switch (Kind) {
+    case Register:
+      Reg = o.Reg;
+      break;
+    case Immediate:
+      Imm = o.Imm;
+      break;
+    case Token:
+      Tok = o.Tok;
+      break;
+    case Memory:
+      Mem = o.Mem;
+      break;
+    case Fsl:
+      FslImm = o.FslImm;
+      break;
+    }
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const { return StartLoc; }
+
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const { return EndLoc; }
+
+  unsigned getReg() const {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getFslImm() const {
+    assert(Kind == Fsl && "Invalid access!");
+    return FslImm.Val;
+  }
+
+  unsigned getMemBase() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Base;
+  }
+
+  const MCExpr* getMemOff() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Off;
+  }
+
+  unsigned getMemOffReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.OffReg;
+  }
+
+  bool isToken() const { return Kind == Token; }
+  bool isImm() const { return Kind == Immediate; }
+  bool isMem() const { return Kind == Memory; }
+  bool isFsl() const { return Kind == Fsl; }
+  bool isReg() const { return Kind == Register; }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (Expr == 0)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addFslOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getFslImm());
+  }
+
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+
+    unsigned RegOff = getMemOffReg();
+    if (RegOff)
+      Inst.addOperand(MCOperand::CreateReg(RegOff));
+    else
+      addExpr(Inst, getMemOff());
+  }
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  virtual void print(raw_ostream &OS) const;
+
+  static MBlazeOperand *CreateToken(StringRef Str, SMLoc S) {
+    MBlazeOperand *Op = new MBlazeOperand(Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static MBlazeOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
+    MBlazeOperand *Op = new MBlazeOperand(Register);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static MBlazeOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    MBlazeOperand *Op = new MBlazeOperand(Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static MBlazeOperand *CreateFslImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    MBlazeOperand *Op = new MBlazeOperand(Fsl);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static MBlazeOperand *CreateMem(unsigned Base, const MCExpr *Off, SMLoc S,
+                                  SMLoc E) {
+    MBlazeOperand *Op = new MBlazeOperand(Memory);
+    Op->Mem.Base = Base;
+    Op->Mem.Off = Off;
+    Op->Mem.OffReg = 0;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static MBlazeOperand *CreateMem(unsigned Base, unsigned Off, SMLoc S,
+                                  SMLoc E) {
+    MBlazeOperand *Op = new MBlazeOperand(Memory);
+    Op->Mem.Base = Base;
+    Op->Mem.OffReg = Off;
+    Op->Mem.Off = 0;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+};
+
+} // end anonymous namespace.
+
+void MBlazeOperand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case Immediate:
+    getImm()->print(OS);
+    break;
+  case Register:
+    OS << "<register R";
+    OS << getMBlazeRegisterNumbering(getReg()) << ">";
+    break;
+  case Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case Memory: {
+    OS << "<memory R";
+    OS << getMBlazeRegisterNumbering(getMemBase());
+    OS << ", ";
+
+    unsigned RegOff = getMemOffReg();
+    if (RegOff)
+      OS << "R" << getMBlazeRegisterNumbering(RegOff);
+    else
+      OS << getMemOff();
+    OS << ">";
+    }
+    break;
+  case Fsl:
+    getFslImm()->print(OS);
+    break;
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+//
+bool MBlazeAsmParser::
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                        MCStreamer &Out, unsigned &ErrorInfo,
+                        bool MatchingInlineAsm) {
+  MCInst Inst;
+  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                               MatchingInlineAsm)) {
+  default: break;
+  case Match_Success:
+    Out.EmitInstruction(Inst);
+    return false;
+  case Match_MissingFeature:
+    return Error(IDLoc, "instruction use requires an option to be enabled");
+  case Match_MnemonicFail:
+      return Error(IDLoc, "unrecognized instruction mnemonic");
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((MBlazeOperand*)Operands[ErrorInfo])->getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+}
+
+MBlazeOperand *MBlazeAsmParser::
+ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  if (Operands.size() != 4)
+    return 0;
+
+  MBlazeOperand &Base = *(MBlazeOperand*)Operands[2];
+  MBlazeOperand &Offset = *(MBlazeOperand*)Operands[3];
+
+  SMLoc S = Base.getStartLoc();
+  SMLoc O = Offset.getStartLoc();
+  SMLoc E = Offset.getEndLoc();
+
+  if (!Base.isReg()) {
+    Error(S, "base address must be a register");
+    return 0;
+  }
+
+  if (!Offset.isReg() && !Offset.isImm()) {
+    Error(O, "offset must be a register or immediate");
+    return 0;
+  }
+
+  MBlazeOperand *Op;
+  if (Offset.isReg())
+    Op = MBlazeOperand::CreateMem(Base.getReg(), Offset.getReg(), S, E);
+  else
+    Op = MBlazeOperand::CreateMem(Base.getReg(), Offset.getImm(), S, E);
+
+  delete Operands.pop_back_val();
+  delete Operands.pop_back_val();
+  Operands.push_back(Op);
+
+  return Op;
+}
+
+bool MBlazeAsmParser::ParseRegister(unsigned &RegNo,
+                                    SMLoc &StartLoc, SMLoc &EndLoc) {
+  MBlazeOperand *Reg = ParseRegister(StartLoc, EndLoc);
+  if (!Reg)
+    return true;
+  RegNo = Reg->getReg();
+  return false;
+}
+
+MBlazeOperand *MBlazeAsmParser::ParseRegister() {
+  SMLoc S, E;
+  return ParseRegister(S, E);
+}
+
+MBlazeOperand *MBlazeAsmParser::ParseRegister(SMLoc &StartLoc, SMLoc &EndLoc) {
+  StartLoc = Parser.getTok().getLoc();
+  EndLoc = Parser.getTok().getEndLoc();
+
+  if (getLexer().getKind() != AsmToken::Identifier)
+    return 0;
+
+  unsigned RegNo = MatchRegisterName(getLexer().getTok().getIdentifier());
+  if (RegNo == 0)
+    return 0;
+
+  getLexer().Lex();
+  return MBlazeOperand::CreateReg(RegNo, StartLoc, EndLoc);
+}
+
+static unsigned MatchFslRegister(StringRef String) {
+  if (!String.startswith("rfsl"))
+    return -1;
+
+  unsigned regNum;
+  if (String.substr(4).getAsInteger(10,regNum))
+    return -1;
+
+  return regNum;
+}
+
+MBlazeOperand *MBlazeAsmParser::ParseFsl() {
+  SMLoc S = Parser.getTok().getLoc();
+  SMLoc E = Parser.getTok().getEndLoc();
+
+  switch (getLexer().getKind()) {
+  default: return 0;
+  case AsmToken::Identifier:
+    unsigned reg = MatchFslRegister(getLexer().getTok().getIdentifier());
+    if (reg >= 16)
+      return 0;
+
+    getLexer().Lex();
+    const MCExpr *EVal = MCConstantExpr::Create(reg,getContext());
+    return MBlazeOperand::CreateFslImm(EVal,S,E);
+  }
+}
+
+MBlazeOperand *MBlazeAsmParser::ParseImmediate() {
+  SMLoc S = Parser.getTok().getLoc();
+  SMLoc E = Parser.getTok().getEndLoc();
+
+  const MCExpr *EVal;
+  switch (getLexer().getKind()) {
+  default: return 0;
+  case AsmToken::LParen:
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+  case AsmToken::Identifier:
+    if (getParser().parseExpression(EVal))
+      return 0;
+
+    return MBlazeOperand::CreateImm(EVal, S, E);
+  }
+}
+
+MBlazeOperand *MBlazeAsmParser::
+ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  MBlazeOperand *Op;
+
+  // Attempt to parse the next token as a register name
+  Op = ParseRegister();
+
+  // Attempt to parse the next token as an FSL immediate
+  if (!Op)
+    Op = ParseFsl();
+
+  // Attempt to parse the next token as an immediate
+  if (!Op)
+    Op = ParseImmediate();
+
+  // If the token could not be parsed then fail
+  if (!Op) {
+    Error(Parser.getTok().getLoc(), "unknown operand");
+    return 0;
+  }
+
+  // Push the parsed operand into the list of operands
+  Operands.push_back(Op);
+  return Op;
+}
+
+/// Parse an mblaze instruction mnemonic followed by its operands.
+bool MBlazeAsmParser::
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+                 SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // The first operands is the token for the instruction name
+  size_t dotLoc = Name.find('.');
+  Operands.push_back(MBlazeOperand::CreateToken(Name.substr(0,dotLoc),NameLoc));
+  if (dotLoc < Name.size())
+    Operands.push_back(MBlazeOperand::CreateToken(Name.substr(dotLoc),NameLoc));
+
+  // If there are no more operands then finish
+  if (getLexer().is(AsmToken::EndOfStatement))
+    return false;
+
+  // Parse the first operand
+  if (!ParseOperand(Operands))
+    return true;
+
+  while (getLexer().isNot(AsmToken::EndOfStatement) &&
+         getLexer().is(AsmToken::Comma)) {
+    // Consume the comma token
+    getLexer().Lex();
+
+    // Parse the next operand
+    if (!ParseOperand(Operands))
+      return true;
+  }
+
+  // If the instruction requires a memory operand then we need to
+  // replace the last two operands (base+offset) with a single
+  // memory operand.
+  if (Name.startswith("lw") || Name.startswith("sw") ||
+      Name.startswith("lh") || Name.startswith("sh") ||
+      Name.startswith("lb") || Name.startswith("sb"))
+    return (ParseMemory(Operands) == NULL);
+
+  return false;
+}
+
+/// ParseDirective parses the MBlaze specific directives
+bool MBlazeAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".word")
+    return ParseDirectiveWord(2, DirectiveID.getLoc());
+  return true;
+}
+
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool MBlazeAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeMBlazeAsmParser() {
+  RegisterMCAsmParser<MBlazeAsmParser> X(TheMBlazeTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MBlazeGenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/contrib/llvm/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
new file mode 100644
index 000000000000..c03ab3803b60
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
@@ -0,0 +1,719 @@
+//===-- MBlazeDisassembler.cpp - Disassembler for MicroBlaze  -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the MBlaze Disassembler. It contains code to translate
+// the data produced by the decoder into MCInsts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeDisassembler.h"
+#include "MBlaze.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+// #include "MBlazeGenDecoderTables.inc"
+// #include "MBlazeGenRegisterNames.inc"
+
+namespace llvm {
+extern const MCInstrDesc MBlazeInsts[];
+}
+
+using namespace llvm;
+
+const uint16_t UNSUPPORTED = -1;
+
+static const uint16_t mblazeBinary2Opcode[] = {
+  MBlaze::ADD,   MBlaze::RSUB,   MBlaze::ADDC,   MBlaze::RSUBC,   //00,01,02,03
+  MBlaze::ADDK,  MBlaze::RSUBK,  MBlaze::ADDKC,  MBlaze::RSUBKC,  //04,05,06,07
+  MBlaze::ADDI,  MBlaze::RSUBI,  MBlaze::ADDIC,  MBlaze::RSUBIC,  //08,09,0A,0B
+  MBlaze::ADDIK, MBlaze::RSUBIK, MBlaze::ADDIKC, MBlaze::RSUBIKC, //0C,0D,0E,0F
+
+  MBlaze::MUL,   MBlaze::BSRL,   MBlaze::IDIV,   MBlaze::GETD,    //10,11,12,13
+  UNSUPPORTED,   UNSUPPORTED,    MBlaze::FADD,   UNSUPPORTED,     //14,15,16,17
+  MBlaze::MULI,  MBlaze::BSRLI,  UNSUPPORTED,    MBlaze::GET,     //18,19,1A,1B
+  UNSUPPORTED,   UNSUPPORTED,    UNSUPPORTED,    UNSUPPORTED,     //1C,1D,1E,1F
+
+  MBlaze::OR,    MBlaze::AND,    MBlaze::XOR,    MBlaze::ANDN,    //20,21,22,23
+  MBlaze::SEXT8, MBlaze::MFS,    MBlaze::BR,     MBlaze::BEQ,     //24,25,26,27
+  MBlaze::ORI,   MBlaze::ANDI,   MBlaze::XORI,   MBlaze::ANDNI,   //28,29,2A,2B
+  MBlaze::IMM,   MBlaze::RTSD,   MBlaze::BRI,    MBlaze::BEQI,    //2C,2D,2E,2F
+
+  MBlaze::LBU,   MBlaze::LHU,    MBlaze::LW,     UNSUPPORTED,     //30,31,32,33
+  MBlaze::SB,    MBlaze::SH,     MBlaze::SW,     UNSUPPORTED,     //34,35,36,37
+  MBlaze::LBUI,  MBlaze::LHUI,   MBlaze::LWI,    UNSUPPORTED,     //38,39,3A,3B
+  MBlaze::SBI,   MBlaze::SHI,    MBlaze::SWI,    UNSUPPORTED,     //3C,3D,3E,3F
+};
+
+static unsigned getRD(uint32_t insn) {
+  if (!isMBlazeRegister((insn>>21)&0x1F))
+    return UNSUPPORTED;
+  return getMBlazeRegisterFromNumbering((insn>>21)&0x1F);
+}
+
+static unsigned getRA(uint32_t insn) {
+  if (!getMBlazeRegisterFromNumbering((insn>>16)&0x1F))
+    return UNSUPPORTED;
+  return getMBlazeRegisterFromNumbering((insn>>16)&0x1F);
+}
+
+static unsigned getRB(uint32_t insn) {
+  if (!getMBlazeRegisterFromNumbering((insn>>11)&0x1F))
+    return UNSUPPORTED;
+  return getMBlazeRegisterFromNumbering((insn>>11)&0x1F);
+}
+
+static int64_t getRS(uint32_t insn) {
+  if (!isSpecialMBlazeRegister(insn&0x3FFF))
+    return UNSUPPORTED;
+  return getSpecialMBlazeRegisterFromNumbering(insn&0x3FFF);
+}
+
+static int64_t getIMM(uint32_t insn) {
+    int16_t val = (insn & 0xFFFF);
+    return val;
+}
+
+static int64_t getSHT(uint32_t insn) {
+    int16_t val = (insn & 0x1F);
+    return val;
+}
+
+static unsigned getFLAGS(int32_t insn) {
+    return (insn & 0x7FF);
+}
+
+static int64_t getFSL(uint32_t insn) {
+    int16_t val = (insn & 0xF);
+    return val;
+}
+
+static unsigned decodeMUL(uint32_t insn) {
+    switch (getFLAGS(insn)) {
+    default: return UNSUPPORTED;
+    case 0:  return MBlaze::MUL;
+    case 1:  return MBlaze::MULH;
+    case 2:  return MBlaze::MULHSU;
+    case 3:  return MBlaze::MULHU;
+    }
+}
+
+static unsigned decodeSEXT(uint32_t insn) {
+    switch (insn&0x7FF) {
+    default:   return UNSUPPORTED;
+    case 0x60: return MBlaze::SEXT8;
+    case 0x68: return MBlaze::WIC;
+    case 0x64: return MBlaze::WDC;
+    case 0x66: return MBlaze::WDCC;
+    case 0x74: return MBlaze::WDCF;
+    case 0x61: return MBlaze::SEXT16;
+    case 0x41: return MBlaze::SRL;
+    case 0x21: return MBlaze::SRC;
+    case 0x01: return MBlaze::SRA;
+    case 0xE0: return MBlaze::CLZ;
+    }
+}
+
+static unsigned decodeBEQ(uint32_t insn) {
+    switch ((insn>>21)&0x1F) {
+    default:    return UNSUPPORTED;
+    case 0x00:  return MBlaze::BEQ;
+    case 0x10:  return MBlaze::BEQD;
+    case 0x05:  return MBlaze::BGE;
+    case 0x15:  return MBlaze::BGED;
+    case 0x04:  return MBlaze::BGT;
+    case 0x14:  return MBlaze::BGTD;
+    case 0x03:  return MBlaze::BLE;
+    case 0x13:  return MBlaze::BLED;
+    case 0x02:  return MBlaze::BLT;
+    case 0x12:  return MBlaze::BLTD;
+    case 0x01:  return MBlaze::BNE;
+    case 0x11:  return MBlaze::BNED;
+    }
+}
+
+static unsigned decodeBEQI(uint32_t insn) {
+    switch ((insn>>21)&0x1F) {
+    default:    return UNSUPPORTED;
+    case 0x00:  return MBlaze::BEQI;
+    case 0x10:  return MBlaze::BEQID;
+    case 0x05:  return MBlaze::BGEI;
+    case 0x15:  return MBlaze::BGEID;
+    case 0x04:  return MBlaze::BGTI;
+    case 0x14:  return MBlaze::BGTID;
+    case 0x03:  return MBlaze::BLEI;
+    case 0x13:  return MBlaze::BLEID;
+    case 0x02:  return MBlaze::BLTI;
+    case 0x12:  return MBlaze::BLTID;
+    case 0x01:  return MBlaze::BNEI;
+    case 0x11:  return MBlaze::BNEID;
+    }
+}
+
+static unsigned decodeBR(uint32_t insn) {
+    switch ((insn>>16)&0x1F) {
+    default:   return UNSUPPORTED;
+    case 0x00: return MBlaze::BR;
+    case 0x08: return MBlaze::BRA;
+    case 0x0C: return MBlaze::BRK;
+    case 0x10: return MBlaze::BRD;
+    case 0x14: return MBlaze::BRLD;
+    case 0x18: return MBlaze::BRAD;
+    case 0x1C: return MBlaze::BRALD;
+    }
+}
+
+static unsigned decodeBRI(uint32_t insn) {
+    switch (insn&0x3FFFFFF) {
+    default:        break;
+    case 0x0020004: return MBlaze::IDMEMBAR;
+    case 0x0220004: return MBlaze::DMEMBAR;
+    case 0x0420004: return MBlaze::IMEMBAR;
+    }
+
+    switch ((insn>>16)&0x1F) {
+    default:   return UNSUPPORTED;
+    case 0x00: return MBlaze::BRI;
+    case 0x08: return MBlaze::BRAI;
+    case 0x0C: return MBlaze::BRKI;
+    case 0x10: return MBlaze::BRID;
+    case 0x14: return MBlaze::BRLID;
+    case 0x18: return MBlaze::BRAID;
+    case 0x1C: return MBlaze::BRALID;
+    }
+}
+
+static unsigned decodeBSRL(uint32_t insn) {
+    switch ((insn>>9)&0x3) {
+    default:  return UNSUPPORTED;
+    case 0x2: return MBlaze::BSLL;
+    case 0x1: return MBlaze::BSRA;
+    case 0x0: return MBlaze::BSRL;
+    }
+}
+
+static unsigned decodeBSRLI(uint32_t insn) {
+    switch ((insn>>9)&0x3) {
+    default:  return UNSUPPORTED;
+    case 0x2: return MBlaze::BSLLI;
+    case 0x1: return MBlaze::BSRAI;
+    case 0x0: return MBlaze::BSRLI;
+    }
+}
+
+static unsigned decodeRSUBK(uint32_t insn) {
+    switch (getFLAGS(insn)) {
+    default:  return UNSUPPORTED;
+    case 0x0: return MBlaze::RSUBK;
+    case 0x1: return MBlaze::CMP;
+    case 0x3: return MBlaze::CMPU;
+    }
+}
+
+static unsigned decodeFADD(uint32_t insn) {
+    switch (getFLAGS(insn)) {
+    default:    return UNSUPPORTED;
+    case 0x000: return MBlaze::FADD;
+    case 0x080: return MBlaze::FRSUB;
+    case 0x100: return MBlaze::FMUL;
+    case 0x180: return MBlaze::FDIV;
+    case 0x200: return MBlaze::FCMP_UN;
+    case 0x210: return MBlaze::FCMP_LT;
+    case 0x220: return MBlaze::FCMP_EQ;
+    case 0x230: return MBlaze::FCMP_LE;
+    case 0x240: return MBlaze::FCMP_GT;
+    case 0x250: return MBlaze::FCMP_NE;
+    case 0x260: return MBlaze::FCMP_GE;
+    case 0x280: return MBlaze::FLT;
+    case 0x300: return MBlaze::FINT;
+    case 0x380: return MBlaze::FSQRT;
+    }
+}
+
+static unsigned decodeGET(uint32_t insn) {
+    switch ((insn>>10)&0x3F) {
+    default:   return UNSUPPORTED;
+    case 0x00: return MBlaze::GET;
+    case 0x01: return MBlaze::EGET;
+    case 0x02: return MBlaze::AGET;
+    case 0x03: return MBlaze::EAGET;
+    case 0x04: return MBlaze::TGET;
+    case 0x05: return MBlaze::TEGET;
+    case 0x06: return MBlaze::TAGET;
+    case 0x07: return MBlaze::TEAGET;
+    case 0x08: return MBlaze::CGET;
+    case 0x09: return MBlaze::ECGET;
+    case 0x0A: return MBlaze::CAGET;
+    case 0x0B: return MBlaze::ECAGET;
+    case 0x0C: return MBlaze::TCGET;
+    case 0x0D: return MBlaze::TECGET;
+    case 0x0E: return MBlaze::TCAGET;
+    case 0x0F: return MBlaze::TECAGET;
+    case 0x10: return MBlaze::NGET;
+    case 0x11: return MBlaze::NEGET;
+    case 0x12: return MBlaze::NAGET;
+    case 0x13: return MBlaze::NEAGET;
+    case 0x14: return MBlaze::TNGET;
+    case 0x15: return MBlaze::TNEGET;
+    case 0x16: return MBlaze::TNAGET;
+    case 0x17: return MBlaze::TNEAGET;
+    case 0x18: return MBlaze::NCGET;
+    case 0x19: return MBlaze::NECGET;
+    case 0x1A: return MBlaze::NCAGET;
+    case 0x1B: return MBlaze::NECAGET;
+    case 0x1C: return MBlaze::TNCGET;
+    case 0x1D: return MBlaze::TNECGET;
+    case 0x1E: return MBlaze::TNCAGET;
+    case 0x1F: return MBlaze::TNECAGET;
+    case 0x20: return MBlaze::PUT;
+    case 0x22: return MBlaze::APUT;
+    case 0x24: return MBlaze::TPUT;
+    case 0x26: return MBlaze::TAPUT;
+    case 0x28: return MBlaze::CPUT;
+    case 0x2A: return MBlaze::CAPUT;
+    case 0x2C: return MBlaze::TCPUT;
+    case 0x2E: return MBlaze::TCAPUT;
+    case 0x30: return MBlaze::NPUT;
+    case 0x32: return MBlaze::NAPUT;
+    case 0x34: return MBlaze::TNPUT;
+    case 0x36: return MBlaze::TNAPUT;
+    case 0x38: return MBlaze::NCPUT;
+    case 0x3A: return MBlaze::NCAPUT;
+    case 0x3C: return MBlaze::TNCPUT;
+    case 0x3E: return MBlaze::TNCAPUT;
+    }
+}
+
+static unsigned decodeGETD(uint32_t insn) {
+    switch ((insn>>5)&0x3F) {
+    default:   return UNSUPPORTED;
+    case 0x00: return MBlaze::GETD;
+    case 0x01: return MBlaze::EGETD;
+    case 0x02: return MBlaze::AGETD;
+    case 0x03: return MBlaze::EAGETD;
+    case 0x04: return MBlaze::TGETD;
+    case 0x05: return MBlaze::TEGETD;
+    case 0x06: return MBlaze::TAGETD;
+    case 0x07: return MBlaze::TEAGETD;
+    case 0x08: return MBlaze::CGETD;
+    case 0x09: return MBlaze::ECGETD;
+    case 0x0A: return MBlaze::CAGETD;
+    case 0x0B: return MBlaze::ECAGETD;
+    case 0x0C: return MBlaze::TCGETD;
+    case 0x0D: return MBlaze::TECGETD;
+    case 0x0E: return MBlaze::TCAGETD;
+    case 0x0F: return MBlaze::TECAGETD;
+    case 0x10: return MBlaze::NGETD;
+    case 0x11: return MBlaze::NEGETD;
+    case 0x12: return MBlaze::NAGETD;
+    case 0x13: return MBlaze::NEAGETD;
+    case 0x14: return MBlaze::TNGETD;
+    case 0x15: return MBlaze::TNEGETD;
+    case 0x16: return MBlaze::TNAGETD;
+    case 0x17: return MBlaze::TNEAGETD;
+    case 0x18: return MBlaze::NCGETD;
+    case 0x19: return MBlaze::NECGETD;
+    case 0x1A: return MBlaze::NCAGETD;
+    case 0x1B: return MBlaze::NECAGETD;
+    case 0x1C: return MBlaze::TNCGETD;
+    case 0x1D: return MBlaze::TNECGETD;
+    case 0x1E: return MBlaze::TNCAGETD;
+    case 0x1F: return MBlaze::TNECAGETD;
+    case 0x20: return MBlaze::PUTD;
+    case 0x22: return MBlaze::APUTD;
+    case 0x24: return MBlaze::TPUTD;
+    case 0x26: return MBlaze::TAPUTD;
+    case 0x28: return MBlaze::CPUTD;
+    case 0x2A: return MBlaze::CAPUTD;
+    case 0x2C: return MBlaze::TCPUTD;
+    case 0x2E: return MBlaze::TCAPUTD;
+    case 0x30: return MBlaze::NPUTD;
+    case 0x32: return MBlaze::NAPUTD;
+    case 0x34: return MBlaze::TNPUTD;
+    case 0x36: return MBlaze::TNAPUTD;
+    case 0x38: return MBlaze::NCPUTD;
+    case 0x3A: return MBlaze::NCAPUTD;
+    case 0x3C: return MBlaze::TNCPUTD;
+    case 0x3E: return MBlaze::TNCAPUTD;
+    }
+}
+
+static unsigned decodeIDIV(uint32_t insn) {
+    switch (insn&0x3) {
+    default:  return UNSUPPORTED;
+    case 0x0: return MBlaze::IDIV;
+    case 0x2: return MBlaze::IDIVU;
+    }
+}
+
+static unsigned decodeLBU(uint32_t insn) {
+    switch ((insn>>9)&0x1) {
+    default:  return UNSUPPORTED;
+    case 0x0: return MBlaze::LBU;
+    case 0x1: return MBlaze::LBUR;
+    }
+}
+
+static unsigned decodeLHU(uint32_t insn) {
+    switch ((insn>>9)&0x1) {
+    default:  return UNSUPPORTED;
+    case 0x0: return MBlaze::LHU;
+    case 0x1: return MBlaze::LHUR;
+    }
+}
+
+static unsigned decodeLW(uint32_t insn) {
+    switch ((insn>>9)&0x3) {
+    default:  return UNSUPPORTED;
+    case 0x0: return MBlaze::LW;
+    case 0x1: return MBlaze::LWR;
+    case 0x2: return MBlaze::LWX;
+    }
+}
+
+static unsigned decodeSB(uint32_t insn) {
+    switch ((insn>>9)&0x1) {
+    default:  return UNSUPPORTED;
+    case 0x0: return MBlaze::SB;
+    case 0x1: return MBlaze::SBR;
+    }
+}
+
+static unsigned decodeSH(uint32_t insn) {
+    switch ((insn>>9)&0x1) {
+    default:  return UNSUPPORTED;
+    case 0x0: return MBlaze::SH;
+    case 0x1: return MBlaze::SHR;
+    }
+}
+
+static unsigned decodeSW(uint32_t insn) {
+    switch ((insn>>9)&0x3) {
+    default:  return UNSUPPORTED;
+    case 0x0: return MBlaze::SW;
+    case 0x1: return MBlaze::SWR;
+    case 0x2: return MBlaze::SWX;
+    }
+}
+
+static unsigned decodeMFS(uint32_t insn) {
+    switch ((insn>>15)&0x1) {
+    default:   return UNSUPPORTED;
+    case 0x0:
+      switch ((insn>>16)&0x1) {
+      default:   return UNSUPPORTED;
+      case 0x0: return MBlaze::MSRSET;
+      case 0x1: return MBlaze::MSRCLR;
+      }
+    case 0x1:
+      switch ((insn>>14)&0x1) {
+      default:   return UNSUPPORTED;
+      case 0x0: return MBlaze::MFS;
+      case 0x1: return MBlaze::MTS;
+      }
+    }
+}
+
+static unsigned decodeOR(uint32_t insn) {
+    switch (getFLAGS(insn)) {
+    default:    return UNSUPPORTED;
+    case 0x000: return MBlaze::OR;
+    case 0x400: return MBlaze::PCMPBF;
+    }
+}
+
+static unsigned decodeXOR(uint32_t insn) {
+    switch (getFLAGS(insn)) {
+    default:    return UNSUPPORTED;
+    case 0x000: return MBlaze::XOR;
+    case 0x400: return MBlaze::PCMPEQ;
+    }
+}
+
+static unsigned decodeANDN(uint32_t insn) {
+    switch (getFLAGS(insn)) {
+    default:    return UNSUPPORTED;
+    case 0x000: return MBlaze::ANDN;
+    case 0x400: return MBlaze::PCMPNE;
+    }
+}
+
+static unsigned decodeRTSD(uint32_t insn) {
+    switch ((insn>>21)&0x1F) {
+    default:   return UNSUPPORTED;
+    case 0x10: return MBlaze::RTSD;
+    case 0x11: return MBlaze::RTID;
+    case 0x12: return MBlaze::RTBD;
+    case 0x14: return MBlaze::RTED;
+    }
+}
+
+static unsigned getOPCODE(uint32_t insn) {
+  unsigned opcode = mblazeBinary2Opcode[ (insn>>26)&0x3F ];
+  switch (opcode) {
+  case MBlaze::MUL:     return decodeMUL(insn);
+  case MBlaze::SEXT8:   return decodeSEXT(insn);
+  case MBlaze::BEQ:     return decodeBEQ(insn);
+  case MBlaze::BEQI:    return decodeBEQI(insn);
+  case MBlaze::BR:      return decodeBR(insn);
+  case MBlaze::BRI:     return decodeBRI(insn);
+  case MBlaze::BSRL:    return decodeBSRL(insn);
+  case MBlaze::BSRLI:   return decodeBSRLI(insn);
+  case MBlaze::RSUBK:   return decodeRSUBK(insn);
+  case MBlaze::FADD:    return decodeFADD(insn);
+  case MBlaze::GET:     return decodeGET(insn);
+  case MBlaze::GETD:    return decodeGETD(insn);
+  case MBlaze::IDIV:    return decodeIDIV(insn);
+  case MBlaze::LBU:     return decodeLBU(insn);
+  case MBlaze::LHU:     return decodeLHU(insn);
+  case MBlaze::LW:      return decodeLW(insn);
+  case MBlaze::SB:      return decodeSB(insn);
+  case MBlaze::SH:      return decodeSH(insn);
+  case MBlaze::SW:      return decodeSW(insn);
+  case MBlaze::MFS:     return decodeMFS(insn);
+  case MBlaze::OR:      return decodeOR(insn);
+  case MBlaze::XOR:     return decodeXOR(insn);
+  case MBlaze::ANDN:    return decodeANDN(insn);
+  case MBlaze::RTSD:    return decodeRTSD(insn);
+  default:              return opcode;
+  }
+}
+
+//
+// Public interface for the disassembler
+//
+
+MCDisassembler::DecodeStatus MBlazeDisassembler::getInstruction(MCInst &instr,
+                                        uint64_t &size,
+                                        const MemoryObject &region,
+                                        uint64_t address,
+                                        raw_ostream &vStream,
+                                        raw_ostream &cStream) const {
+  // The machine instruction.
+  uint32_t insn;
+  uint64_t read;
+  uint8_t bytes[4];
+
+  // By default we consume 1 byte on failure
+  size = 1;
+
+  // We want to read exactly 4 bytes of data.
+  if (region.readBytes(address, 4, (uint8_t*)bytes, &read) == -1 || read < 4)
+    return Fail;
+
+  // Encoded as a big-endian 32-bit word in the stream.
+  insn = (bytes[0]<<24) | (bytes[1]<<16) | (bytes[2]<< 8) | (bytes[3]<<0);
+
+  // Get the MCInst opcode from the binary instruction and make sure
+  // that it is a valid instruction.
+  unsigned opcode = getOPCODE(insn);
+  if (opcode == UNSUPPORTED)
+    return Fail;
+
+  instr.setOpcode(opcode);
+
+  unsigned RD = getRD(insn);
+  unsigned RA = getRA(insn);
+  unsigned RB = getRB(insn);
+  unsigned RS = getRS(insn);
+
+  uint64_t tsFlags = MBlazeInsts[opcode].TSFlags;
+  switch ((tsFlags & MBlazeII::FormMask)) {
+  default: 
+    return Fail;
+
+  case MBlazeII::FC:
+    break;
+
+  case MBlazeII::FRRRR:
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RB));
+    instr.addOperand(MCOperand::CreateReg(RA));
+    break;
+
+  case MBlazeII::FRRR:
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RA));
+    instr.addOperand(MCOperand::CreateReg(RB));
+    break;
+
+  case MBlazeII::FRR:
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RA));
+    break;
+
+  case MBlazeII::FRI:
+    switch (opcode) {
+    default: 
+      return Fail;
+    case MBlaze::MFS:
+      if (RD == UNSUPPORTED)
+        return Fail;
+      instr.addOperand(MCOperand::CreateReg(RD));
+      instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
+      break;
+    case MBlaze::MTS:
+      if (RA == UNSUPPORTED)
+        return Fail;
+      instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
+      instr.addOperand(MCOperand::CreateReg(RA));
+      break;
+    case MBlaze::MSRSET:
+    case MBlaze::MSRCLR:
+      if (RD == UNSUPPORTED)
+        return Fail;
+      instr.addOperand(MCOperand::CreateReg(RD));
+      instr.addOperand(MCOperand::CreateImm(insn&0x7FFF));
+      break;
+    }
+    break;
+
+  case MBlazeII::FRRI:
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RA));
+    switch (opcode) {
+    default:
+      instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
+      break;
+    case MBlaze::BSRLI:
+    case MBlaze::BSRAI:
+    case MBlaze::BSLLI:
+      instr.addOperand(MCOperand::CreateImm(insn&0x1F));
+      break;
+    }
+    break;
+
+  case MBlazeII::FCRR:
+    if (RA == UNSUPPORTED || RB == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RA));
+    instr.addOperand(MCOperand::CreateReg(RB));
+    break;
+
+  case MBlazeII::FCRI:
+    if (RA == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RA));
+    instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
+    break;
+
+  case MBlazeII::FRCR:
+    if (RD == UNSUPPORTED || RB == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RB));
+    break;
+
+  case MBlazeII::FRCI:
+    if (RD == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
+    break;
+
+  case MBlazeII::FCCR:
+    if (RB == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RB));
+    break;
+
+  case MBlazeII::FCCI:
+    instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
+    break;
+
+  case MBlazeII::FRRCI:
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RA));
+    instr.addOperand(MCOperand::CreateImm(getSHT(insn)));
+    break;
+
+  case MBlazeII::FRRC:
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RA));
+    break;
+
+  case MBlazeII::FRCX:
+    if (RD == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
+    break;
+
+  case MBlazeII::FRCS:
+    if (RD == UNSUPPORTED || RS == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateReg(RS));
+    break;
+
+  case MBlazeII::FCRCS:
+    if (RS == UNSUPPORTED || RA == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RS));
+    instr.addOperand(MCOperand::CreateReg(RA));
+    break;
+
+  case MBlazeII::FCRCX:
+    if (RA == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RA));
+    instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
+    break;
+
+  case MBlazeII::FCX:
+    instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
+    break;
+
+  case MBlazeII::FCR:
+    if (RB == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RB));
+    break;
+
+  case MBlazeII::FRIR:
+    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
+      return Fail;
+    instr.addOperand(MCOperand::CreateReg(RD));
+    instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
+    instr.addOperand(MCOperand::CreateReg(RA));
+    break;
+  }
+
+  // We always consume 4 bytes of data on success
+  size = 4;
+
+  return Success;
+}
+
+static MCDisassembler *createMBlazeDisassembler(const Target &T,
+                                                const MCSubtargetInfo &STI) {
+  return new MBlazeDisassembler(STI);
+}
+
+extern "C" void LLVMInitializeMBlazeDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheMBlazeTarget,
+                                         createMBlazeDisassembler);
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h b/contrib/llvm/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
new file mode 100644
index 000000000000..b8ff8f607265
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
@@ -0,0 +1,49 @@
+//===-- MBlazeDisassembler.h - Disassembler for MicroBlaze  -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the MBlaze Disassembler. It it the header for
+// MBlazeDisassembler, a subclass of MCDisassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZEDISASSEMBLER_H
+#define MBLAZEDISASSEMBLER_H
+
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+  
+class MCInst;
+class MemoryObject;
+class raw_ostream;
+
+/// MBlazeDisassembler - Disassembler for all MBlaze platforms.
+class MBlazeDisassembler : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  MBlazeDisassembler(const MCSubtargetInfo &STI) :
+    MCDisassembler(STI) {
+  }
+
+  ~MBlazeDisassembler() {
+  }
+
+  /// getInstruction - See MCDisassembler.
+  MCDisassembler::DecodeStatus getInstruction(MCInst &instr,
+                      uint64_t &size,
+                      const MemoryObject &region,
+                      uint64_t address,
+                      raw_ostream &vStream,
+                      raw_ostream &cStream) const;
+};
+
+} // namespace llvm
+  
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp b/contrib/llvm/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
new file mode 100644
index 000000000000..fc2b3d51b44c
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
@@ -0,0 +1,71 @@
+//===-- MBlazeInstPrinter.cpp - Convert MBlaze MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an MBlaze MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "MBlazeInstPrinter.h"
+#include "MBlaze.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+
+// Include the auto-generated portion of the assembly writer.
+#include "MBlazeGenAsmWriter.inc"
+
+void MBlazeInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                  StringRef Annot) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void MBlazeInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O, const char *Modifier) {
+  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+  } else if (Op.isImm()) {
+    O << (int32_t)Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+void MBlazeInstPrinter::printFSLImm(const MCInst *MI, int OpNo,
+                                    raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNo);
+  if (MO.isImm())
+    O << "rfsl" << MO.getImm();
+  else
+    printOperand(MI, OpNo, O, NULL);
+}
+
+void MBlazeInstPrinter::printUnsignedImm(const MCInst *MI, int OpNo,
+                                        raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNo);
+  if (MO.isImm())
+    O << (uint32_t)MO.getImm();
+  else
+    printOperand(MI, OpNo, O, NULL);
+}
+
+void MBlazeInstPrinter::printMemOperand(const MCInst *MI, int OpNo,
+                                        raw_ostream &O, const char *Modifier) {
+  printOperand(MI, OpNo, O, NULL);
+  O << ", ";
+  printOperand(MI, OpNo+1, O, NULL);
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h b/contrib/llvm/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
new file mode 100644
index 000000000000..51ba7c359a1b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
@@ -0,0 +1,43 @@
+//= MBlazeInstPrinter.h - Convert MBlaze MCInst to assembly syntax -*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a MBlaze MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZEINSTPRINTER_H
+#define MBLAZEINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+  class MCOperand;
+
+  class MBlazeInstPrinter : public MCInstPrinter {
+  public:
+    MBlazeInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                      const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+    virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+
+    // Autogenerated by tblgen.
+    void printInstruction(const MCInst *MI, raw_ostream &O);
+    static const char *getRegisterName(unsigned RegNo);
+
+    void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                      const char *Modifier = 0);
+    void printFSLImm(const MCInst *MI, int OpNo, raw_ostream &O);
+    void printUnsignedImm(const MCInst *MI, int OpNo, raw_ostream &O);
+    void printMemOperand(const MCInst *MI, int OpNo,raw_ostream &O,
+                         const char *Modifier = 0);
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlaze.h b/contrib/llvm/lib/Target/MBlaze/MBlaze.h
new file mode 100644
index 000000000000..1399b85d34d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlaze.h
@@ -0,0 +1,32 @@
+//===-- MBlaze.h - Top-level interface for MBlaze ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM MBlaze back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_MBLAZE_H
+#define TARGET_MBLAZE_H
+
+#include "MCTargetDesc/MBlazeBaseInfo.h"
+#include "MCTargetDesc/MBlazeMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class MBlazeTargetMachine;
+  class FunctionPass;
+  class MachineCodeEmitter;
+
+  FunctionPass *createMBlazeISelDag(MBlazeTargetMachine &TM);
+  FunctionPass *createMBlazeDelaySlotFillerPass(MBlazeTargetMachine &TM);
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlaze.td b/contrib/llvm/lib/Target/MBlaze/MBlaze.td
new file mode 100644
index 000000000000..c2888553c5e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlaze.td
@@ -0,0 +1,73 @@
+//===-- MBlaze.td - Describe the MBlaze Target Machine -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the MBlaze target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MBlazeRegisterInfo.td"
+include "MBlazeSchedule.td"
+include "MBlazeIntrinsics.td"
+include "MBlazeInstrInfo.td"
+include "MBlazeCallingConv.td"
+
+def MBlazeInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Microblaze Subtarget features                                              //
+//===----------------------------------------------------------------------===//
+
+def FeatureBarrel      : SubtargetFeature<"barrel", "HasBarrel", "true",
+                                "Implements barrel shifter">;
+def FeatureDiv         : SubtargetFeature<"div", "HasDiv", "true",
+                                "Implements hardware divider">;
+def FeatureMul         : SubtargetFeature<"mul", "HasMul", "true",
+                                "Implements hardware multiplier">;
+def FeaturePatCmp      : SubtargetFeature<"patcmp", "HasPatCmp", "true",
+                                "Implements pattern compare instruction">;
+def FeatureFPU         : SubtargetFeature<"fpu", "HasFPU", "true",
+                                "Implements floating point unit">;
+def FeatureMul64       : SubtargetFeature<"mul64", "HasMul64", "true",
+                                "Implements multiplier with 64-bit result">;
+def FeatureSqrt        : SubtargetFeature<"sqrt", "HasSqrt", "true",
+                                "Implements sqrt and floating point convert">;
+
+//===----------------------------------------------------------------------===//
+// MBlaze processors supported.
+//===----------------------------------------------------------------------===//
+
+def : Processor<"mblaze",  NoItineraries, []>;
+def : Processor<"mblaze3", MBlazePipe3Itineraries, []>;
+def : Processor<"mblaze5", MBlazePipe5Itineraries, []>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+def MBlazeAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def MBlaze : Target {
+  let InstructionSet = MBlazeInstrInfo;
+  let AssemblyWriters = [MBlazeAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
new file mode 100644
index 000000000000..7dafaef0af08
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
@@ -0,0 +1,326 @@
+//===-- MBlazeAsmPrinter.cpp - MBlaze LLVM assembly writer ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format MBlaze assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mblaze-asm-printer"
+
+#include "MBlaze.h"
+#include "InstPrinter/MBlazeInstPrinter.h"
+#include "MBlazeInstrInfo.h"
+#include "MBlazeMCInstLower.h"
+#include "MBlazeMachineFunction.h"
+#include "MBlazeSubtarget.h"
+#include "MBlazeTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cctype>
+
+using namespace llvm;
+
+namespace {
+  class MBlazeAsmPrinter : public AsmPrinter {
+    const MBlazeSubtarget *Subtarget;
+  public:
+    explicit MBlazeAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer) {
+      Subtarget = &TM.getSubtarget<MBlazeSubtarget>();
+    }
+
+    virtual const char *getPassName() const {
+      return "MBlaze Assembly Printer";
+    }
+
+    void printSavedRegsBitmask();
+    void emitFrameDirective();
+    virtual void EmitFunctionBodyStart();
+    virtual void EmitFunctionBodyEnd();
+    virtual void EmitFunctionEntryLabel();
+
+    virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
+      const;
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O);
+    void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+    void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O);
+    void printFSLImm(const MachineInstr *MI, int opNum, raw_ostream &O);
+    void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                         const char *Modifier = 0);
+
+    void EmitInstruction(const MachineInstr *MI);
+  };
+} // end of anonymous namespace
+
+// #include "MBlazeGenAsmWriter.inc"
+
+//===----------------------------------------------------------------------===//
+//
+//  MBlaze Asm Directives
+//
+//  -- Frame directive "frame Stackpointer, Stacksize, RARegister"
+//  Describe the stack frame.
+//
+//  -- Mask directives "mask  bitmask, offset"
+//  Tells the assembler which registers are saved and where.
+//  bitmask - contain a little endian bitset indicating which registers are
+//            saved on function prologue (e.g. with a 0x80000000 mask, the
+//            assembler knows the register 31 (RA) is saved at prologue.
+//  offset  - the position before stack pointer subtraction indicating where
+//            the first saved register on prologue is located. (e.g. with a
+//
+//  Consider the following function prologue:
+//
+//    .frame  R19,48,R15
+//    .mask   0xc0000000,-8
+//       addiu R1, R1, -48
+//       sw R15, 40(R1)
+//       sw R19, 36(R1)
+//
+//    With a 0xc0000000 mask, the assembler knows the register 15 (R15) and
+//    19 (R19) are saved at prologue. As the save order on prologue is from
+//    left to right, R15 is saved first. A -8 offset means that after the
+//    stack pointer subtration, the first register in the mask (R15) will be
+//    saved at address 48-8=40.
+//
+//===----------------------------------------------------------------------===//
+
+// Print a 32 bit hex number with all numbers.
+static void printHex32(unsigned int Value, raw_ostream &O) {
+  O << "0x";
+  for (int i = 7; i >= 0; i--)
+    O.write_hex((Value & (0xF << (i*4))) >> (i*4));
+}
+
+// Create a bitmask with all callee saved registers for CPU or Floating Point
+// registers. For CPU registers consider RA, GP and FP for saving if necessary.
+void MBlazeAsmPrinter::printSavedRegsBitmask() {
+  const TargetFrameLowering *TFI = TM.getFrameLowering();
+  const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+
+  // CPU Saved Registers Bitmasks
+  unsigned int CPUBitmask = 0;
+
+  // Set the CPU Bitmasks
+  const MachineFrameInfo *MFI = MF->getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    unsigned RegNum = getMBlazeRegisterNumbering(Reg);
+    if (MBlaze::GPRRegClass.contains(Reg))
+      CPUBitmask |= (1 << RegNum);
+  }
+
+  // Return Address and Frame registers must also be set in CPUBitmask.
+  if (TFI->hasFP(*MF))
+    CPUBitmask |= (1 <<  getMBlazeRegisterNumbering(RI.getFrameRegister(*MF)));
+
+  if (MFI->adjustsStack())
+    CPUBitmask |= (1 << getMBlazeRegisterNumbering(RI.getRARegister()));
+
+  // Print CPUBitmask
+  OutStreamer.EmitRawText("\t.mask\t0x" + Twine::utohexstr(CPUBitmask));
+}
+
+/// Frame Directive
+void MBlazeAsmPrinter::emitFrameDirective() {
+  if (!OutStreamer.hasRawTextSupport())
+    return;
+
+  const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+  unsigned stkReg = RI.getFrameRegister(*MF);
+  unsigned retReg = RI.getRARegister();
+  unsigned stkSze = MF->getFrameInfo()->getStackSize();
+
+  OutStreamer.EmitRawText("\t.frame\t" +
+                          Twine(MBlazeInstPrinter::getRegisterName(stkReg)) +
+                          "," + Twine(stkSze) + "," +
+                          Twine(MBlazeInstPrinter::getRegisterName(retReg)));
+}
+
+void MBlazeAsmPrinter::EmitFunctionEntryLabel() {
+  if (OutStreamer.hasRawTextSupport())
+    OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName()));
+  AsmPrinter::EmitFunctionEntryLabel();
+}
+
+void MBlazeAsmPrinter::EmitFunctionBodyStart() {
+  if (!OutStreamer.hasRawTextSupport())
+    return;
+
+  emitFrameDirective();
+  printSavedRegsBitmask();
+}
+
+void MBlazeAsmPrinter::EmitFunctionBodyEnd() {
+  if (OutStreamer.hasRawTextSupport())
+    OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName()));
+}
+
+//===----------------------------------------------------------------------===//
+void MBlazeAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MBlazeMCInstLower MCInstLowering(OutContext, *this);
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
+// Print out an operand for an inline asm expression.
+bool MBlazeAsmPrinter::
+PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0])
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    }
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+void MBlazeAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                    raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << MBlazeInstPrinter::getRegisterName(MO.getReg());
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << (int32_t)MO.getImm();
+    break;
+
+  case MachineOperand::MO_FPImmediate: {
+    const ConstantFP *fp = MO.getFPImm();
+    printHex32(fp->getValueAPF().bitcastToAPInt().getZExtValue(), O);
+    O << ";\t# immediate = " << *fp;
+    break;
+  }
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *Mang->getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << MAI->getPrivateGlobalPrefix() << "CPI"
+      << getFunctionNumber() << "_" << MO.getIndex();
+    if (MO.getOffset())
+      O << "+" << MO.getOffset();
+    break;
+
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+}
+
+void MBlazeAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum,
+                                        raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (uint32_t)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
+void MBlazeAsmPrinter::printFSLImm(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << "rfsl" << (unsigned int)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
+void MBlazeAsmPrinter::
+printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                const char *Modifier) {
+  printOperand(MI, opNum, O);
+  O << ", ";
+  printOperand(MI, opNum+1, O);
+}
+
+/// isBlockOnlyReachableByFallthough - Return true if the basic block has
+/// exactly one predecessor and the control transfer mechanism between
+/// the predecessor and this block is a fall-through.
+bool MBlazeAsmPrinter::
+isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+  // If this is a landing pad, it isn't a fall through.  If it has no preds,
+  // then nothing falls through to it.
+  if (MBB->isLandingPad() || MBB->pred_empty())
+    return false;
+
+  // If there isn't exactly one predecessor, it can't be a fall through.
+  MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
+  ++PI2;
+  if (PI2 != MBB->pred_end())
+    return false;
+
+  // The predecessor has to be immediately before this block.
+  const MachineBasicBlock *Pred = *PI;
+
+  if (!Pred->isLayoutSuccessor(MBB))
+    return false;
+
+  // If the block is completely empty, then it definitely does fall through.
+  if (Pred->empty())
+    return true;
+
+  // Check if the last terminator is an unconditional branch.
+  MachineBasicBlock::const_iterator I = Pred->end();
+  while (I != Pred->begin() && !(--I)->isTerminator())
+    ; // Noop
+  return I == Pred->end() || !I->isBarrier();
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMBlazeAsmPrinter() {
+  RegisterAsmPrinter<MBlazeAsmPrinter> X(TheMBlazeTarget);
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeCallingConv.td b/contrib/llvm/lib/Target/MBlaze/MBlazeCallingConv.td
new file mode 100644
index 000000000000..00a4219d047b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeCallingConv.td
@@ -0,0 +1,24 @@
+//===- MBlazeCallingConv.td - Calling Conventions for MBlaze -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for MBlaze architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MBlaze ABI Calling Convention
+//===----------------------------------------------------------------------===//
+
+def RetCC_MBlaze : CallingConv<[
+  // i32 are returned in registers R3, R4
+  CCIfType<[i32,f32], CCAssignToReg<[R3, R4]>>
+]>;
+
+def CC_MBlaze : CallingConv<[
+  CCIfType<[i32,f32], CCCustom<"CC_MBlaze_AssignReg">>,
+  CCIfType<[i32,f32], CCAssignToStack<4, 4>>
+]>;
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
new file mode 100644
index 000000000000..3d0d1cecd1f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
@@ -0,0 +1,254 @@
+//===-- DelaySlotFiller.cpp - MBlaze delay slot filler --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass that attempts to fill instructions with delay slots. If no
+// instructions can be moved into the delay slot then a NOP is placed there.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+#include "MBlaze.h"
+#include "MBlazeTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+static cl::opt<bool> MBDisableDelaySlotFiller(
+  "disable-mblaze-delay-filler",
+  cl::init(false),
+  cl::desc("Disable the MBlaze delay slot filter."),
+  cl::Hidden);
+
+namespace {
+  struct Filler : public MachineFunctionPass {
+
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    Filler(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "MBlaze Delay Slot Filler";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F) {
+      bool Changed = false;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+      return Changed;
+    }
+
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+static bool hasImmInstruction(MachineBasicBlock::iterator &candidate) {
+    // Any instruction with an immediate mode operand greater than
+    // 16-bits requires an implicit IMM instruction.
+    unsigned numOper = candidate->getNumOperands();
+    for (unsigned op = 0; op < numOper; ++op) {
+        MachineOperand &mop = candidate->getOperand(op);
+
+        // The operand requires more than 16-bits to represent.
+        if (mop.isImm() && (mop.getImm() < -0x8000 || mop.getImm() > 0x7fff))
+          return true;
+
+        // We must assume that unknown immediate values require more than
+        // 16-bits to represent.
+        if (mop.isGlobal() || mop.isSymbol() || mop.isJTI() || mop.isCPI())
+          return true;
+
+        // FIXME: we could probably check to see if the FP value happens
+        //        to not need an IMM instruction. For now we just always
+        //        assume that FP values do.
+        if (mop.isFPImm())
+          return true;
+    }
+
+    return false;
+}
+
+static unsigned getLastRealOperand(MachineBasicBlock::iterator &instr) {
+  switch (instr->getOpcode()) {
+  default: return instr->getNumOperands();
+
+  // These instructions have a variable number of operands but the first two
+  // are the "real" operands that we care about during hazard detection.
+  case MBlaze::BRLID:
+  case MBlaze::BRALID:
+  case MBlaze::BRLD:
+  case MBlaze::BRALD:
+    return 2;
+  }
+}
+
+static bool delayHasHazard(MachineBasicBlock::iterator &candidate,
+                           MachineBasicBlock::iterator &slot) {
+  // Hazard check
+  MachineBasicBlock::iterator a = candidate;
+  MachineBasicBlock::iterator b = slot;
+
+  // MBB layout:-
+  //    candidate := a0 = operation(a1, a2)
+  //    ...middle bit...
+  //    slot := b0 = operation(b1, b2)
+
+  // Possible hazards:-/
+  // 1. a1 or a2 was written during the middle bit
+  // 2. a0 was read or written during the middle bit
+  // 3. a0 is one or more of {b0, b1, b2}
+  // 4. b0 is one or more of {a1, a2}
+  // 5. a accesses memory, and the middle bit
+  //    contains a store operation.
+  bool a_is_memory = candidate->mayLoad() || candidate->mayStore();
+
+  // Determine the number of operands in the slot instruction and in the
+  // candidate instruction.
+  const unsigned aend = getLastRealOperand(a);
+  const unsigned bend = getLastRealOperand(b);
+
+  // Check hazards type 1, 2 and 5 by scanning the middle bit
+  MachineBasicBlock::iterator m = a;
+  for (++m; m != b; ++m) {
+    for (unsigned aop = 0; aop<aend; ++aop) {
+      bool aop_is_reg = a->getOperand(aop).isReg();
+      if (!aop_is_reg) continue;
+
+      bool aop_is_def = a->getOperand(aop).isDef();
+      unsigned aop_reg = a->getOperand(aop).getReg();
+
+      const unsigned mend = getLastRealOperand(m);
+      for (unsigned mop = 0; mop<mend; ++mop) {
+        bool mop_is_reg = m->getOperand(mop).isReg();
+        if (!mop_is_reg) continue;
+
+        bool mop_is_def = m->getOperand(mop).isDef();
+        unsigned mop_reg = m->getOperand(mop).getReg();
+
+        if (aop_is_def && (mop_reg == aop_reg))
+            return true; // Hazard type 2, because aop = a0
+        else if (mop_is_def && (mop_reg == aop_reg))
+            return true; // Hazard type 1, because aop in {a1, a2}
+      }
+    }
+
+    // Check hazard type 5
+    if (a_is_memory && m->mayStore())
+      return true;
+  }
+
+  // Check hazard type 3 & 4
+  for (unsigned aop = 0; aop<aend; ++aop) {
+    if (a->getOperand(aop).isReg()) {
+      unsigned aop_reg = a->getOperand(aop).getReg();
+
+      for (unsigned bop = 0; bop<bend; ++bop) {
+        if (b->getOperand(bop).isReg() && !b->getOperand(bop).isImplicit()) {
+          unsigned bop_reg = b->getOperand(bop).getReg();
+          if (aop_reg == bop_reg)
+            return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+static bool isDelayFiller(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator candidate) {
+  if (candidate == MBB.begin())
+    return false;
+
+  --candidate;
+  return (candidate->hasDelaySlot());
+}
+
+static bool hasUnknownSideEffects(MachineBasicBlock::iterator &I) {
+  if (!I->hasUnmodeledSideEffects())
+    return false;
+
+  unsigned op = I->getOpcode();
+  if (op == MBlaze::ADDK || op == MBlaze::ADDIK ||
+      op == MBlaze::ADDC || op == MBlaze::ADDIC ||
+      op == MBlaze::ADDKC || op == MBlaze::ADDIKC ||
+      op == MBlaze::RSUBK || op == MBlaze::RSUBIK ||
+      op == MBlaze::RSUBC || op == MBlaze::RSUBIC ||
+      op == MBlaze::RSUBKC || op == MBlaze::RSUBIKC)
+    return false;
+
+  return true;
+}
+
+static MachineBasicBlock::iterator
+findDelayInstr(MachineBasicBlock &MBB,MachineBasicBlock::iterator slot) {
+  MachineBasicBlock::iterator I = slot;
+  while (true) {
+    if (I == MBB.begin())
+      break;
+
+    --I;
+    if (I->hasDelaySlot() || I->isBranch() || isDelayFiller(MBB,I) ||
+        I->isCall() || I->isReturn() || I->isBarrier() ||
+        hasUnknownSideEffects(I))
+      break;
+
+    if (hasImmInstruction(I) || delayHasHazard(I,slot))
+      continue;
+
+    return I;
+  }
+
+  return MBB.end();
+}
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// Currently, we fill delay slots with NOPs. We assume there is only one
+/// delay slot per delayed instruction.
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+    if (I->hasDelaySlot()) {
+      MachineBasicBlock::iterator D = MBB.end();
+      MachineBasicBlock::iterator J = I;
+
+      if (!MBDisableDelaySlotFiller)
+        D = findDelayInstr(MBB,I);
+
+      ++FilledSlots;
+      Changed = true;
+
+      if (D == MBB.end())
+        BuildMI(MBB, ++J, I->getDebugLoc(), TII->get(MBlaze::NOP));
+      else
+        MBB.splice(++J, &MBB, D);
+    }
+  return Changed;
+}
+
+/// createMBlazeDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in MBlaze MachineFunctions
+FunctionPass *llvm::createMBlazeDelaySlotFillerPass(MBlazeTargetMachine &tm) {
+  return new Filler(tm);
+}
+
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeFrameLowering.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeFrameLowering.cpp
new file mode 100644
index 000000000000..172304bd5b45
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeFrameLowering.cpp
@@ -0,0 +1,488 @@
+//===-- MBlazeFrameLowering.cpp - MBlaze Frame Information ---------------====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MBlaze implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mblaze-frame-lowering"
+
+#include "MBlazeFrameLowering.h"
+#include "InstPrinter/MBlazeInstPrinter.h"
+#include "MBlazeInstrInfo.h"
+#include "MBlazeMachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+static cl::opt<bool> MBDisableStackAdjust(
+  "disable-mblaze-stack-adjust",
+  cl::init(false),
+  cl::desc("Disable MBlaze stack layout adjustment."),
+  cl::Hidden);
+
+static void replaceFrameIndexes(MachineFunction &MF, 
+                                SmallVector<std::pair<int,int64_t>, 16> &FR) {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
+  const SmallVector<std::pair<int,int64_t>, 16>::iterator FRB = FR.begin();
+  const SmallVector<std::pair<int,int64_t>, 16>::iterator FRE = FR.end();
+
+  SmallVector<std::pair<int,int64_t>, 16>::iterator FRI = FRB;
+  for (; FRI != FRE; ++FRI) {
+    MFI->RemoveStackObject(FRI->first);
+    int NFI = MFI->CreateFixedObject(4, FRI->second, true);
+    MBlazeFI->recordReplacement(FRI->first, NFI);
+
+    for (MachineFunction::iterator MB=MF.begin(), ME=MF.end(); MB!=ME; ++MB) {
+      MachineBasicBlock::iterator MBB = MB->begin();
+      const MachineBasicBlock::iterator MBE = MB->end();
+
+      for (; MBB != MBE; ++MBB) {
+        MachineInstr::mop_iterator MIB = MBB->operands_begin();
+        const MachineInstr::mop_iterator MIE = MBB->operands_end();
+
+        for (MachineInstr::mop_iterator MII = MIB; MII != MIE; ++MII) {
+          if (!MII->isFI() || MII->getIndex() != FRI->first) continue;
+          DEBUG(dbgs() << "FOUND FI#" << MII->getIndex() << "\n");
+          MII->setIndex(NFI);
+        }
+      }
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Stack Frame Processing methods
+// +----------------------------+
+//
+// The stack is allocated decrementing the stack pointer on
+// the first instruction of a function prologue. Once decremented,
+// all stack references are are done through a positive offset
+// from the stack/frame pointer, so the stack is considered
+// to grow up.
+//
+//===----------------------------------------------------------------------===//
+
+static void analyzeFrameIndexes(MachineFunction &MF) {
+  if (MBDisableStackAdjust) return;
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  MachineRegisterInfo::livein_iterator LII = MRI.livein_begin();
+  MachineRegisterInfo::livein_iterator LIE = MRI.livein_end();
+  const SmallVector<int, 16> &LiveInFI = MBlazeFI->getLiveIn();
+  SmallVector<MachineInstr*, 16> EraseInstr;
+  SmallVector<std::pair<int,int64_t>, 16> FrameRelocate;
+
+  MachineBasicBlock *MBB = MF.getBlockNumbered(0);
+  MachineBasicBlock::iterator MIB = MBB->begin();
+  MachineBasicBlock::iterator MIE = MBB->end();
+
+  int StackAdjust = 0;
+  int StackOffset = -28;
+
+  // In this loop we are searching frame indexes that corrospond to incoming
+  // arguments that are already in the stack. We look for instruction sequences
+  // like the following:
+  //    
+  //    LWI REG, FI1, 0
+  //    ...
+  //    SWI REG, FI2, 0
+  //
+  // As long as there are no defs of REG in the ... part, we can eliminate
+  // the SWI instruction because the value has already been stored to the
+  // stack by the caller. All we need to do is locate FI at the correct
+  // stack location according to the calling convensions.
+  //
+  // Additionally, if the SWI operation kills the def of REG then we don't
+  // need the LWI operation so we can erase it as well.
+  for (unsigned i = 0, e = LiveInFI.size(); i < e; ++i) {
+    for (MachineBasicBlock::iterator I=MIB; I != MIE; ++I) {
+      if (I->getOpcode() != MBlaze::LWI || I->getNumOperands() != 3 ||
+          !I->getOperand(1).isFI() || !I->getOperand(0).isReg() ||
+          I->getOperand(1).getIndex() != LiveInFI[i]) continue;
+
+      unsigned FIReg = I->getOperand(0).getReg();
+      MachineBasicBlock::iterator SI = I;
+      for (SI++; SI != MIE; ++SI) {
+        if (!SI->getOperand(0).isReg() ||
+            !SI->getOperand(1).isFI() ||
+            SI->getOpcode() != MBlaze::SWI) continue;
+
+        int FI = SI->getOperand(1).getIndex();
+        if (SI->getOperand(0).getReg() != FIReg ||
+            MFI->isFixedObjectIndex(FI) ||
+            MFI->getObjectSize(FI) != 4) continue;
+
+        if (SI->getOperand(0).isDef()) break;
+
+        if (SI->getOperand(0).isKill()) {
+          DEBUG(dbgs() << "LWI for FI#" << I->getOperand(1).getIndex() 
+                       << " removed\n");
+          EraseInstr.push_back(I);
+        }
+
+        EraseInstr.push_back(SI);
+        DEBUG(dbgs() << "SWI for FI#" << FI << " removed\n");
+
+        FrameRelocate.push_back(std::make_pair(FI,StackOffset));
+        DEBUG(dbgs() << "FI#" << FI << " relocated to " << StackOffset << "\n");
+
+        StackOffset -= 4;
+        StackAdjust += 4;
+        break;
+      }
+    }
+  }
+
+  // In this loop we are searching for frame indexes that corrospond to
+  // incoming arguments that are in registers. We look for instruction
+  // sequences like the following:
+  //    
+  //    ...  SWI REG, FI, 0
+  // 
+  // As long as the ... part does not define REG and if REG is an incoming
+  // parameter register then we know that, according to ABI convensions, the
+  // caller has allocated stack space for it already.  Instead of allocating
+  // stack space on our frame, we record the correct location in the callers
+  // frame.
+  for (MachineRegisterInfo::livein_iterator LI = LII; LI != LIE; ++LI) {
+    for (MachineBasicBlock::iterator I=MIB; I != MIE; ++I) {
+      if (I->definesRegister(LI->first))
+        break;
+
+      if (I->getOpcode() != MBlaze::SWI || I->getNumOperands() != 3 ||
+          !I->getOperand(1).isFI() || !I->getOperand(0).isReg() ||
+          I->getOperand(1).getIndex() < 0) continue;
+
+      if (I->getOperand(0).getReg() == LI->first) {
+        int FI = I->getOperand(1).getIndex();
+        MBlazeFI->recordLiveIn(FI);
+
+        int FILoc = 0;
+        switch (LI->first) {
+        default: llvm_unreachable("invalid incoming parameter!");
+        case MBlaze::R5:  FILoc = -4; break;
+        case MBlaze::R6:  FILoc = -8; break;
+        case MBlaze::R7:  FILoc = -12; break;
+        case MBlaze::R8:  FILoc = -16; break;
+        case MBlaze::R9:  FILoc = -20; break;
+        case MBlaze::R10: FILoc = -24; break;
+        }
+
+        StackAdjust += 4;
+        FrameRelocate.push_back(std::make_pair(FI,FILoc));
+        DEBUG(dbgs() << "FI#" << FI << " relocated to " << FILoc << "\n");
+        break;
+      }
+    }
+  }
+
+  // Go ahead and erase all of the instructions that we determined were
+  // no longer needed.
+  for (int i = 0, e = EraseInstr.size(); i < e; ++i)
+    MBB->erase(EraseInstr[i]);
+
+  // Replace all of the frame indexes that we have relocated with new
+  // fixed object frame indexes.
+  replaceFrameIndexes(MF, FrameRelocate);
+}
+
+static void interruptFrameLayout(MachineFunction &MF) {
+  const Function *F = MF.getFunction();
+  CallingConv::ID CallConv = F->getCallingConv();
+
+  // If this function is not using either the interrupt_handler
+  // calling convention or the save_volatiles calling convention
+  // then we don't need to do any additional frame layout.
+  if (CallConv != CallingConv::MBLAZE_INTR &&
+      CallConv != CallingConv::MBLAZE_SVOL)
+      return;
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MBlazeInstrInfo &TII =
+    *static_cast<const MBlazeInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  // Determine if the calling convention is the interrupt_handler
+  // calling convention. Some pieces of the prologue and epilogue
+  // only need to be emitted if we are lowering and interrupt handler.
+  bool isIntr = CallConv == CallingConv::MBLAZE_INTR;
+
+  // Determine where to put prologue and epilogue additions
+  MachineBasicBlock &MENT   = MF.front();
+  MachineBasicBlock &MEXT   = MF.back();
+
+  MachineBasicBlock::iterator MENTI = MENT.begin();
+  MachineBasicBlock::iterator MEXTI = prior(MEXT.end());
+
+  DebugLoc ENTDL = MENTI != MENT.end() ? MENTI->getDebugLoc() : DebugLoc();
+  DebugLoc EXTDL = MEXTI != MEXT.end() ? MEXTI->getDebugLoc() : DebugLoc();
+
+  // Store the frame indexes generated during prologue additions for use
+  // when we are generating the epilogue additions.
+  SmallVector<int, 10> VFI;
+
+  // Build the prologue SWI for R3 - R12 if needed. Note that R11 must
+  // always have a SWI because it is used when processing RMSR.
+  for (unsigned r = MBlaze::R3; r <= MBlaze::R12; ++r) {
+    if (!MRI.isPhysRegUsed(r) && !(isIntr && r == MBlaze::R11)) continue;
+    
+    int FI = MFI->CreateStackObject(4,4,false,false);
+    VFI.push_back(FI);
+
+    BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::SWI), r)
+      .addFrameIndex(FI).addImm(0);
+  }
+    
+  // Build the prologue SWI for R17, R18
+  int R17FI = MFI->CreateStackObject(4,4,false,false);
+  int R18FI = MFI->CreateStackObject(4,4,false,false);
+
+  BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::SWI), MBlaze::R17)
+    .addFrameIndex(R17FI).addImm(0);
+    
+  BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::SWI), MBlaze::R18)
+    .addFrameIndex(R18FI).addImm(0);
+
+  // Buid the prologue SWI and the epilogue LWI for RMSR if needed
+  if (isIntr) {
+    int MSRFI = MFI->CreateStackObject(4,4,false,false);
+    BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::MFS), MBlaze::R11)
+      .addReg(MBlaze::RMSR);
+    BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::SWI), MBlaze::R11)
+      .addFrameIndex(MSRFI).addImm(0);
+
+    BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::LWI), MBlaze::R11)
+      .addFrameIndex(MSRFI).addImm(0);
+    BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::MTS), MBlaze::RMSR)
+      .addReg(MBlaze::R11);
+  }
+
+  // Build the epilogue LWI for R17, R18
+  BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::LWI), MBlaze::R18)
+    .addFrameIndex(R18FI).addImm(0);
+
+  BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::LWI), MBlaze::R17)
+    .addFrameIndex(R17FI).addImm(0);
+
+  // Build the epilogue LWI for R3 - R12 if needed
+  for (unsigned r = MBlaze::R12, i = VFI.size(); r >= MBlaze::R3; --r) {
+    if (!MRI.isPhysRegUsed(r)) continue;
+    BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::LWI), r)
+      .addFrameIndex(VFI[--i]).addImm(0);
+  }
+}
+
+static void determineFrameLayout(MachineFunction &MF) {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
+
+  // Replace the dummy '0' SPOffset by the negative offsets, as explained on
+  // LowerFORMAL_ARGUMENTS. Leaving '0' for while is necessary to avoid
+  // the approach done by calculateFrameObjectOffsets to the stack frame.
+  MBlazeFI->adjustLoadArgsFI(MFI);
+  MBlazeFI->adjustStoreVarArgsFI(MFI);
+
+  // Get the number of bytes to allocate from the FrameInfo
+  unsigned FrameSize = MFI->getStackSize();
+  DEBUG(dbgs() << "Original Frame Size: " << FrameSize << "\n" );
+
+  // Get the alignments provided by the target, and the maximum alignment
+  // (if any) of the fixed frame objects.
+  // unsigned MaxAlign = MFI->getMaxAlignment();
+  unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned AlignMask = TargetAlign - 1;
+
+  // Make sure the frame is aligned.
+  FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+  MFI->setStackSize(FrameSize);
+  DEBUG(dbgs() << "Aligned Frame Size: " << FrameSize << "\n" );
+}
+
+int MBlazeFrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) 
+  const {
+  const MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
+  if (MBlazeFI->hasReplacement(FI))
+    FI = MBlazeFI->getReplacement(FI);
+  return TargetFrameLowering::getFrameIndexOffset(MF,FI);
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool MBlazeFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+         MFI->hasVarSizedObjects();
+}
+
+void MBlazeFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB   = MF.front();
+  MachineFrameInfo *MFI    = MF.getFrameInfo();
+  const MBlazeInstrInfo &TII =
+    *static_cast<const MBlazeInstrInfo*>(MF.getTarget().getInstrInfo());
+  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  CallingConv::ID CallConv = MF.getFunction()->getCallingConv();
+  bool requiresRA = CallConv == CallingConv::MBLAZE_INTR;
+
+  // Determine the correct frame layout
+  determineFrameLayout(MF);
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  unsigned StackSize = MFI->getStackSize();
+
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI->adjustsStack() && !requiresRA) return;
+
+  int FPOffset = MBlazeFI->getFPStackOffset();
+  int RAOffset = MBlazeFI->getRAStackOffset();
+
+  // Adjust stack : addi R1, R1, -imm
+  BuildMI(MBB, MBBI, DL, TII.get(MBlaze::ADDIK), MBlaze::R1)
+      .addReg(MBlaze::R1).addImm(-StackSize);
+
+  // swi  R15, R1, stack_loc
+  if (MFI->adjustsStack() || requiresRA) {
+    BuildMI(MBB, MBBI, DL, TII.get(MBlaze::SWI))
+        .addReg(MBlaze::R15).addReg(MBlaze::R1).addImm(RAOffset);
+  }
+
+  if (hasFP(MF)) {
+    // swi  R19, R1, stack_loc
+    BuildMI(MBB, MBBI, DL, TII.get(MBlaze::SWI))
+      .addReg(MBlaze::R19).addReg(MBlaze::R1).addImm(FPOffset);
+
+    // add R19, R1, R0
+    BuildMI(MBB, MBBI, DL, TII.get(MBlaze::ADD), MBlaze::R19)
+      .addReg(MBlaze::R1).addReg(MBlaze::R0);
+  }
+}
+
+void MBlazeFrameLowering::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MachineFrameInfo *MFI            = MF.getFrameInfo();
+  MBlazeFunctionInfo *MBlazeFI     = MF.getInfo<MBlazeFunctionInfo>();
+  const MBlazeInstrInfo &TII =
+    *static_cast<const MBlazeInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  DebugLoc dl = MBBI->getDebugLoc();
+
+  CallingConv::ID CallConv = MF.getFunction()->getCallingConv();
+  bool requiresRA = CallConv == CallingConv::MBLAZE_INTR;
+
+  // Get the FI's where RA and FP are saved.
+  int FPOffset = MBlazeFI->getFPStackOffset();
+  int RAOffset = MBlazeFI->getRAStackOffset();
+
+  if (hasFP(MF)) {
+    // add R1, R19, R0
+    BuildMI(MBB, MBBI, dl, TII.get(MBlaze::ADD), MBlaze::R1)
+      .addReg(MBlaze::R19).addReg(MBlaze::R0);
+
+    // lwi  R19, R1, stack_loc
+    BuildMI(MBB, MBBI, dl, TII.get(MBlaze::LWI), MBlaze::R19)
+      .addReg(MBlaze::R1).addImm(FPOffset);
+  }
+
+  // lwi R15, R1, stack_loc
+  if (MFI->adjustsStack() || requiresRA) {
+    BuildMI(MBB, MBBI, dl, TII.get(MBlaze::LWI), MBlaze::R15)
+      .addReg(MBlaze::R1).addImm(RAOffset);
+  }
+
+  // Get the number of bytes from FrameInfo
+  int StackSize = (int) MFI->getStackSize();
+
+  // addi R1, R1, imm
+  if (StackSize) {
+    BuildMI(MBB, MBBI, dl, TII.get(MBlaze::ADDIK), MBlaze::R1)
+      .addReg(MBlaze::R1).addImm(StackSize);
+  }
+}
+
+// Eliminate ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudo instructions
+void MBlazeFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const MBlazeInstrInfo &TII =
+    *static_cast<const MBlazeInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // If we have a frame pointer, turn the adjcallstackup instruction into a
+    // 'addi r1, r1, -<amt>' and the adjcallstackdown instruction into
+    // 'addi r1, r1, <amt>'
+    MachineInstr *Old = I;
+    int Amount = Old->getOperand(0).getImm() + 4;
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      MachineInstr *New;
+      if (Old->getOpcode() == MBlaze::ADJCALLSTACKDOWN) {
+        New = BuildMI(MF,Old->getDebugLoc(), TII.get(MBlaze::ADDIK),MBlaze::R1)
+                .addReg(MBlaze::R1).addImm(-Amount);
+      } else {
+        assert(Old->getOpcode() == MBlaze::ADJCALLSTACKUP);
+        New = BuildMI(MF,Old->getDebugLoc(), TII.get(MBlaze::ADDIK),MBlaze::R1)
+                .addReg(MBlaze::R1).addImm(Amount);
+      }
+
+      // Replace the pseudo instruction with a new instruction...
+      MBB.insert(I, New);
+    }
+  }
+
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
+
+
+void MBlazeFrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
+  CallingConv::ID CallConv = MF.getFunction()->getCallingConv();
+  bool requiresRA = CallConv == CallingConv::MBLAZE_INTR;
+
+  if (MFI->adjustsStack() || requiresRA) {
+    MBlazeFI->setRAStackOffset(0);
+    MFI->CreateFixedObject(4,0,true);
+  }
+
+  if (hasFP(MF)) {
+    MBlazeFI->setFPStackOffset(4);
+    MFI->CreateFixedObject(4,4,true);
+  }
+
+  interruptFrameLayout(MF);
+  analyzeFrameIndexes(MF);
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeFrameLowering.h b/contrib/llvm/lib/Target/MBlaze/MBlazeFrameLowering.h
new file mode 100644
index 000000000000..f4228c5f0890
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeFrameLowering.h
@@ -0,0 +1,56 @@
+//=- MBlazeFrameLowering.h - Define frame lowering for MicroBlaze -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZE_FRAMEINFO_H
+#define MBLAZE_FRAMEINFO_H
+
+#include "MBlaze.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class MBlazeSubtarget;
+
+class MBlazeFrameLowering : public TargetFrameLowering {
+protected:
+  const MBlazeSubtarget &STI;
+
+public:
+  explicit MBlazeFrameLowering(const MBlazeSubtarget &sti)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 4, 0), STI(sti) {
+  }
+
+  /// targetHandlesStackFrameRounding - Returns true if the target is
+  /// responsible for rounding up the stack frame (probably at emitPrologue
+  /// time).
+  bool targetHandlesStackFrameRounding() const { return true; }
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+
+  virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                    RegScavenger *RS) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp
new file mode 100644
index 000000000000..78ad24debb1b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp
@@ -0,0 +1,277 @@
+//===-- MBlazeISelDAGToDAG.cpp - A dag to dag inst selector for MBlaze ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MBlaze target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mblaze-isel"
+#include "MBlaze.h"
+#include "MBlazeMachineFunction.h"
+#include "MBlazeRegisterInfo.h"
+#include "MBlazeSubtarget.h"
+#include "MBlazeTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MBlazeDAGToDAGISel - MBlaze specific code to select MBlaze machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace {
+
+class MBlazeDAGToDAGISel : public SelectionDAGISel {
+
+  /// TM - Keep a reference to MBlazeTargetMachine.
+  MBlazeTargetMachine &TM;
+
+  /// Subtarget - Keep a pointer to the MBlazeSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const MBlazeSubtarget &Subtarget;
+
+public:
+  explicit MBlazeDAGToDAGISel(MBlazeTargetMachine &tm) :
+  SelectionDAGISel(tm),
+  TM(tm), Subtarget(tm.getSubtarget<MBlazeSubtarget>()) {}
+
+  // Pass Name
+  virtual const char *getPassName() const {
+    return "MBlaze DAG->DAG Pattern Instruction Selection";
+  }
+private:
+  // Include the pieces autogenerated from the target description.
+  #include "MBlazeGenDAGISel.inc"
+
+  /// getTargetMachine - Return a reference to the TargetMachine, casted
+  /// to the target-specific type.
+  const MBlazeTargetMachine &getTargetMachine() {
+    return static_cast<const MBlazeTargetMachine &>(TM);
+  }
+
+  /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
+  /// to the target-specific type.
+  const MBlazeInstrInfo *getInstrInfo() {
+    return getTargetMachine().getInstrInfo();
+  }
+
+  SDNode *getGlobalBaseReg();
+  SDNode *Select(SDNode *N);
+
+  // Address Selection
+  bool SelectAddrRegReg(SDValue N, SDValue &Base, SDValue &Index);
+  bool SelectAddrRegImm(SDValue N, SDValue &Disp, SDValue &Base);
+
+  // getI32Imm - Return a target constant with the specified value, of type i32.
+  inline SDValue getI32Imm(unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  }
+};
+
+}
+
+/// isIntS32Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 32-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS32Immediate(SDNode *N, int32_t &Imm) {
+  unsigned Opc = N->getOpcode();
+  if (Opc != ISD::Constant)
+    return false;
+
+  Imm = (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+
+static bool isIntS32Immediate(SDValue Op, int32_t &Imm) {
+  return isIntS32Immediate(Op.getNode(), Imm);
+}
+
+
+/// SelectAddressRegReg - Given the specified addressed, check to see if it
+/// can be represented as an indexed [r+r] operation.  Returns false if it
+/// can be more efficiently represented with [r+imm].
+bool MBlazeDAGToDAGISel::
+SelectAddrRegReg(SDValue N, SDValue &Base, SDValue &Index) {
+  if (N.getOpcode() == ISD::FrameIndex) return false;
+  if (N.getOpcode() == ISD::TargetExternalSymbol ||
+      N.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // direct calls.
+
+  int32_t imm = 0;
+  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) {
+    if (isIntS32Immediate(N.getOperand(1), imm))
+      return false;    // r+i
+
+    if (N.getOperand(0).getOpcode() == ISD::TargetJumpTable ||
+        N.getOperand(1).getOpcode() == ISD::TargetJumpTable)
+      return false; // jump tables.
+
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  }
+
+  return false;
+}
+
+/// Returns true if the address N can be represented by a base register plus
+/// a signed 32-bit displacement [r+imm], and if it is not better
+/// represented as reg+reg.
+bool MBlazeDAGToDAGISel::
+SelectAddrRegImm(SDValue N, SDValue &Base, SDValue &Disp) {
+  // If this can be more profitably realized as r+r, fail.
+  if (SelectAddrRegReg(N, Base, Disp))
+    return false;
+
+  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) {
+    int32_t imm = 0;
+    if (isIntS32Immediate(N.getOperand(1), imm)) {
+      Disp = CurDAG->getTargetConstant(imm, MVT::i32);
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+        Base = CurDAG->getTargetFrameIndex(FI->getIndex(), N.getValueType());
+      } else {
+        Base = N.getOperand(0);
+      }
+      return true; // [r+i]
+    }
+  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    // Loading from a constant address.
+    uint32_t Imm = CN->getZExtValue();
+    Disp = CurDAG->getTargetConstant(Imm, CN->getValueType(0));
+    Base = CurDAG->getRegister(MBlaze::R0, CN->getValueType(0));
+    return true;
+  }
+
+  Disp = CurDAG->getTargetConstant(0, TM.getTargetLowering()->getPointerTy());
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
+    Base = CurDAG->getTargetFrameIndex(FI->getIndex(), N.getValueType());
+  else
+    Base = N;
+  return true;      // [r+0]
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// GOT address into a register.
+SDNode *MBlazeDAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions
+SDNode* MBlazeDAGToDAGISel::Select(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+  DebugLoc dl = Node->getDebugLoc();
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode())
+    return NULL;
+
+  ///
+  // Instruction Selection not handled by the auto-generated
+  // tablegen selection should be handled here.
+  ///
+  switch (Opcode) {
+    default: break;
+
+    // Get target GOT address.
+    case ISD::GLOBAL_OFFSET_TABLE:
+      return getGlobalBaseReg();
+
+    case ISD::FrameIndex: {
+        SDValue imm = CurDAG->getTargetConstant(0, MVT::i32);
+        int FI = dyn_cast<FrameIndexSDNode>(Node)->getIndex();
+        EVT VT = Node->getValueType(0);
+        SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
+        unsigned Opc = MBlaze::ADDIK;
+        if (Node->hasOneUse())
+          return CurDAG->SelectNodeTo(Node, Opc, VT, TFI, imm);
+        return CurDAG->getMachineNode(Opc, dl, VT, TFI, imm);
+    }
+
+
+    /// Handle direct and indirect calls when using PIC. On PIC, when
+    /// GOT is smaller than about 64k (small code) the GA target is
+    /// loaded with only one instruction. Otherwise GA's target must
+    /// be loaded with 3 instructions.
+    case MBlazeISD::JmpLink: {
+      if (TM.getRelocationModel() == Reloc::PIC_) {
+        SDValue Chain  = Node->getOperand(0);
+        SDValue Callee = Node->getOperand(1);
+        SDValue R20Reg = CurDAG->getRegister(MBlaze::R20, MVT::i32);
+        SDValue InFlag(0, 0);
+
+        if ((isa<GlobalAddressSDNode>(Callee)) ||
+            (isa<ExternalSymbolSDNode>(Callee)))
+        {
+          /// Direct call for global addresses and external symbols
+          SDValue GPReg = CurDAG->getRegister(MBlaze::R15, MVT::i32);
+
+          // Use load to get GOT target
+          SDValue Ops[] = { Callee, GPReg, Chain };
+          SDValue Load = SDValue(CurDAG->getMachineNode(MBlaze::LW, dl,
+                                 MVT::i32, MVT::Other, Ops, 3), 0);
+          Chain = Load.getValue(1);
+
+          // Call target must be on T9
+          Chain = CurDAG->getCopyToReg(Chain, dl, R20Reg, Load, InFlag);
+        } else
+          /// Indirect call
+          Chain = CurDAG->getCopyToReg(Chain, dl, R20Reg, Callee, InFlag);
+
+        // Emit Jump and Link Register
+        SDNode *ResNode = CurDAG->getMachineNode(MBlaze::BRLID, dl, MVT::Other,
+                                                 MVT::Glue, R20Reg, Chain);
+        Chain  = SDValue(ResNode, 0);
+        InFlag = SDValue(ResNode, 1);
+        ReplaceUses(SDValue(Node, 0), Chain);
+        ReplaceUses(SDValue(Node, 1), InFlag);
+        return ResNode;
+      }
+    }
+  }
+
+  // Select the default instruction
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(errs() << "=> ");
+  if (ResNode == NULL || ResNode == Node)
+    DEBUG(Node->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DEBUG(errs() << "\n");
+  return ResNode;
+}
+
+/// createMBlazeISelDag - This pass converts a legalized DAG into a
+/// MBlaze-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createMBlazeISelDag(MBlazeTargetMachine &TM) {
+  return new MBlazeDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeISelLowering.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeISelLowering.cpp
new file mode 100644
index 000000000000..d4f943297acb
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -0,0 +1,1154 @@
+//===-- MBlazeISelLowering.cpp - MBlaze DAG Lowering Implementation -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that MBlaze uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mblaze-lower"
+#include "MBlazeISelLowering.h"
+#include "MBlazeMachineFunction.h"
+#include "MBlazeSubtarget.h"
+#include "MBlazeTargetMachine.h"
+#include "MBlazeTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static bool CC_MBlaze_AssignReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                CCValAssign::LocInfo &LocInfo,
+                                ISD::ArgFlagsTy &ArgFlags,
+                                CCState &State);
+
+const char *MBlazeTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+    case MBlazeISD::JmpLink    : return "MBlazeISD::JmpLink";
+    case MBlazeISD::GPRel      : return "MBlazeISD::GPRel";
+    case MBlazeISD::Wrap       : return "MBlazeISD::Wrap";
+    case MBlazeISD::ICmp       : return "MBlazeISD::ICmp";
+    case MBlazeISD::Ret        : return "MBlazeISD::Ret";
+    case MBlazeISD::Select_CC  : return "MBlazeISD::Select_CC";
+    default                    : return NULL;
+  }
+}
+
+MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
+  : TargetLowering(TM, new MBlazeTargetObjectFile()) {
+  Subtarget = &TM.getSubtarget<MBlazeSubtarget>();
+
+  // MBlaze does not have i1 type, so use i32 for
+  // setcc operations results (slt, sgt, ...).
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // Set up the register classes
+  addRegisterClass(MVT::i32, &MBlaze::GPRRegClass);
+  if (Subtarget->hasFPU()) {
+    addRegisterClass(MVT::f32, &MBlaze::GPRRegClass);
+    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+  }
+
+  // Floating point operations which are not supported
+  setOperationAction(ISD::FREM,       MVT::f32, Expand);
+  setOperationAction(ISD::FMA,        MVT::f32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+  setOperationAction(ISD::FP_ROUND,   MVT::f32, Expand);
+  setOperationAction(ISD::FP_ROUND,   MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN,  MVT::f32, Expand);
+  setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
+  setOperationAction(ISD::FSIN,       MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,       MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS,    MVT::f32, Expand);
+  setOperationAction(ISD::FPOWI,      MVT::f32, Expand);
+  setOperationAction(ISD::FPOW,       MVT::f32, Expand);
+  setOperationAction(ISD::FLOG,       MVT::f32, Expand);
+  setOperationAction(ISD::FLOG2,      MVT::f32, Expand);
+  setOperationAction(ISD::FLOG10,     MVT::f32, Expand);
+  setOperationAction(ISD::FEXP,       MVT::f32, Expand);
+
+  // Load extented operations for i1 types must be promoted
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+
+  // Sign extended loads must be expanded
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
+
+  // MBlaze has no REM or DIVREM operations.
+  setOperationAction(ISD::UREM,    MVT::i32, Expand);
+  setOperationAction(ISD::SREM,    MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+  // If the processor doesn't support multiply then expand it
+  if (!Subtarget->hasMul()) {
+    setOperationAction(ISD::MUL, MVT::i32, Expand);
+  }
+
+  // If the processor doesn't support 64-bit multiply then expand
+  if (!Subtarget->hasMul() || !Subtarget->hasMul64()) {
+    setOperationAction(ISD::MULHS, MVT::i32, Expand);
+    setOperationAction(ISD::MULHS, MVT::i64, Expand);
+    setOperationAction(ISD::MULHU, MVT::i32, Expand);
+    setOperationAction(ISD::MULHU, MVT::i64, Expand);
+  }
+
+  // If the processor doesn't support division then expand
+  if (!Subtarget->hasDiv()) {
+    setOperationAction(ISD::UDIV, MVT::i32, Expand);
+    setOperationAction(ISD::SDIV, MVT::i32, Expand);
+  }
+
+  // Expand unsupported conversions
+  setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+  setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+
+  // Expand SELECT_CC
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+  // MBlaze doesn't have MUL_LOHI
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
+  // Used by legalize types to correctly generate the setcc result.
+  // Without this, every float setcc comes with a AND/OR with the result,
+  // we don't want this, since the fpcmp result goes to a flag register,
+  // which is used implicitly by brcond and select operations.
+  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
+  AddPromotedToType(ISD::SELECT, MVT::i1, MVT::i32);
+  AddPromotedToType(ISD::SELECT_CC, MVT::i1, MVT::i32);
+
+  // MBlaze Custom Operations
+  setOperationAction(ISD::GlobalAddress,      MVT::i32,   Custom);
+  setOperationAction(ISD::GlobalTLSAddress,   MVT::i32,   Custom);
+  setOperationAction(ISD::JumpTable,          MVT::i32,   Custom);
+  setOperationAction(ISD::ConstantPool,       MVT::i32,   Custom);
+
+  // Variable Argument support
+  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
+  setOperationAction(ISD::VAEND,              MVT::Other, Expand);
+  setOperationAction(ISD::VAARG,              MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
+
+
+  // Operations not directly supported by MBlaze.
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,   Expand);
+  setOperationAction(ISD::BR_JT,              MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,              MVT::f32,   Expand);
+  setOperationAction(ISD::BR_CC,              MVT::i32,   Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG,  MVT::i1,    Expand);
+  setOperationAction(ISD::ROTL,               MVT::i32,   Expand);
+  setOperationAction(ISD::ROTR,               MVT::i32,   Expand);
+  setOperationAction(ISD::SHL_PARTS,          MVT::i32,   Expand);
+  setOperationAction(ISD::SRA_PARTS,          MVT::i32,   Expand);
+  setOperationAction(ISD::SRL_PARTS,          MVT::i32,   Expand);
+  setOperationAction(ISD::CTLZ,               MVT::i32,   Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF,    MVT::i32,   Expand);
+  setOperationAction(ISD::CTTZ,               MVT::i32,   Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::i32,   Expand);
+  setOperationAction(ISD::CTPOP,              MVT::i32,   Expand);
+  setOperationAction(ISD::BSWAP,              MVT::i32,   Expand);
+
+  // We don't have line number support yet.
+  setOperationAction(ISD::EH_LABEL,          MVT::Other, Expand);
+
+  // Use the default for now
+  setOperationAction(ISD::STACKSAVE,         MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,      MVT::Other, Expand);
+
+  // MBlaze doesn't have extending float->double load/store
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  setMinFunctionAlignment(2);
+
+  setStackPointerRegisterToSaveRestore(MBlaze::R1);
+  computeRegisterProperties();
+}
+
+EVT MBlazeTargetLowering::getSetCCResultType(EVT VT) const {
+  return MVT::i32;
+}
+
+SDValue MBlazeTargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch (Op.getOpcode())
+  {
+    case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+    case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+    case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+    case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+    case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+    case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Lower helper functions
+//===----------------------------------------------------------------------===//
+MachineBasicBlock*
+MBlazeTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *MBB)
+                                                  const {
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unexpected instr type to insert");
+
+  case MBlaze::ShiftRL:
+  case MBlaze::ShiftRA:
+  case MBlaze::ShiftL:
+    return EmitCustomShift(MI, MBB);
+
+  case MBlaze::Select_FCC:
+  case MBlaze::Select_CC:
+    return EmitCustomSelect(MI, MBB);
+
+  case MBlaze::CAS32:
+  case MBlaze::SWP32:
+  case MBlaze::LAA32:
+  case MBlaze::LAS32:
+  case MBlaze::LAD32:
+  case MBlaze::LAO32:
+  case MBlaze::LAX32:
+  case MBlaze::LAN32:
+    return EmitCustomAtomic(MI, MBB);
+
+  case MBlaze::MEMBARRIER:
+    // The Microblaze does not need memory barriers. Just delete the pseudo
+    // instruction and finish.
+    MI->eraseFromParent();
+    return MBB;
+  }
+}
+
+MachineBasicBlock*
+MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI,
+                                      MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  // To "insert" a shift left instruction, we actually have to insert a
+  // simple loop.  The incoming instruction knows the destination vreg to
+  // set, the source vreg to operate over and the shift amount.
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction::iterator It = MBB;
+  ++It;
+
+  // start:
+  //   andi     samt, samt, 31
+  //   beqid    samt, finish
+  //   add      dst, src, r0
+  // loop:
+  //   addik    samt, samt, -1
+  //   sra      dst, dst
+  //   bneid    samt, loop
+  //   nop
+  // finish:
+  MachineFunction *F = MBB->getParent();
+  MachineRegisterInfo &R = F->getRegInfo();
+  MachineBasicBlock *loop = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *finish = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loop);
+  F->insert(It, finish);
+
+  // Update machine-CFG edges by transferring adding all successors and
+  // remaining instructions from the current block to the new block which
+  // will contain the Phi node for the select.
+  finish->splice(finish->begin(), MBB,
+                 llvm::next(MachineBasicBlock::iterator(MI)),
+                 MBB->end());
+  finish->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // Add the true and fallthrough blocks as its successors.
+  MBB->addSuccessor(loop);
+  MBB->addSuccessor(finish);
+
+  // Next, add the finish block as a successor of the loop block
+  loop->addSuccessor(finish);
+  loop->addSuccessor(loop);
+
+  unsigned IAMT = R.createVirtualRegister(&MBlaze::GPRRegClass);
+  BuildMI(MBB, dl, TII->get(MBlaze::ANDI), IAMT)
+    .addReg(MI->getOperand(2).getReg())
+    .addImm(31);
+
+  unsigned IVAL = R.createVirtualRegister(&MBlaze::GPRRegClass);
+  BuildMI(MBB, dl, TII->get(MBlaze::ADDIK), IVAL)
+    .addReg(MI->getOperand(1).getReg())
+    .addImm(0);
+
+  BuildMI(MBB, dl, TII->get(MBlaze::BEQID))
+    .addReg(IAMT)
+    .addMBB(finish);
+
+  unsigned DST = R.createVirtualRegister(&MBlaze::GPRRegClass);
+  unsigned NDST = R.createVirtualRegister(&MBlaze::GPRRegClass);
+  BuildMI(loop, dl, TII->get(MBlaze::PHI), DST)
+    .addReg(IVAL).addMBB(MBB)
+    .addReg(NDST).addMBB(loop);
+
+  unsigned SAMT = R.createVirtualRegister(&MBlaze::GPRRegClass);
+  unsigned NAMT = R.createVirtualRegister(&MBlaze::GPRRegClass);
+  BuildMI(loop, dl, TII->get(MBlaze::PHI), SAMT)
+    .addReg(IAMT).addMBB(MBB)
+    .addReg(NAMT).addMBB(loop);
+
+  if (MI->getOpcode() == MBlaze::ShiftL)
+    BuildMI(loop, dl, TII->get(MBlaze::ADD), NDST).addReg(DST).addReg(DST);
+  else if (MI->getOpcode() == MBlaze::ShiftRA)
+    BuildMI(loop, dl, TII->get(MBlaze::SRA), NDST).addReg(DST);
+  else if (MI->getOpcode() == MBlaze::ShiftRL)
+    BuildMI(loop, dl, TII->get(MBlaze::SRL), NDST).addReg(DST);
+  else
+    llvm_unreachable("Cannot lower unknown shift instruction");
+
+  BuildMI(loop, dl, TII->get(MBlaze::ADDIK), NAMT)
+    .addReg(SAMT)
+    .addImm(-1);
+
+  BuildMI(loop, dl, TII->get(MBlaze::BNEID))
+    .addReg(NAMT)
+    .addMBB(loop);
+
+  BuildMI(*finish, finish->begin(), dl,
+          TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
+    .addReg(IVAL).addMBB(MBB)
+    .addReg(NDST).addMBB(loop);
+
+  // The pseudo instruction is no longer needed so remove it
+  MI->eraseFromParent();
+  return finish;
+}
+
+MachineBasicBlock*
+MBlazeTargetLowering::EmitCustomSelect(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction::iterator It = MBB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineFunction *F = MBB->getParent();
+  MachineBasicBlock *flsBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *dneBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+  unsigned Opc;
+  switch (MI->getOperand(4).getImm()) {
+  default: llvm_unreachable("Unknown branch condition");
+  case MBlazeCC::EQ: Opc = MBlaze::BEQID; break;
+  case MBlazeCC::NE: Opc = MBlaze::BNEID; break;
+  case MBlazeCC::GT: Opc = MBlaze::BGTID; break;
+  case MBlazeCC::LT: Opc = MBlaze::BLTID; break;
+  case MBlazeCC::GE: Opc = MBlaze::BGEID; break;
+  case MBlazeCC::LE: Opc = MBlaze::BLEID; break;
+  }
+
+  F->insert(It, flsBB);
+  F->insert(It, dneBB);
+
+  // Transfer the remainder of MBB and its successor edges to dneBB.
+  dneBB->splice(dneBB->begin(), MBB,
+                llvm::next(MachineBasicBlock::iterator(MI)),
+                MBB->end());
+  dneBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  MBB->addSuccessor(flsBB);
+  MBB->addSuccessor(dneBB);
+  flsBB->addSuccessor(dneBB);
+
+  BuildMI(MBB, dl, TII->get(Opc))
+    .addReg(MI->getOperand(3).getReg())
+    .addMBB(dneBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  //BuildMI(dneBB, dl, TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
+  //  .addReg(MI->getOperand(1).getReg()).addMBB(flsBB)
+  //  .addReg(MI->getOperand(2).getReg()).addMBB(BB);
+
+  BuildMI(*dneBB, dneBB->begin(), dl,
+          TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(flsBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return dneBB;
+}
+
+MachineBasicBlock*
+MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  // All atomic instructions on the Microblaze are implemented using the
+  // load-linked / store-conditional style atomic instruction sequences.
+  // Thus, all operations will look something like the following:
+  //
+  //  start:
+  //    lwx     RV, RP, 0
+  //    <do stuff>
+  //    swx     RV, RP, 0
+  //    addic   RC, R0, 0
+  //    bneid   RC, start
+  //
+  //  exit:
+  //
+  // To "insert" a shift left instruction, we actually have to insert a
+  // simple loop.  The incoming instruction knows the destination vreg to
+  // set, the source vreg to operate over and the shift amount.
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction::iterator It = MBB;
+  ++It;
+
+  // start:
+  //   andi     samt, samt, 31
+  //   beqid    samt, finish
+  //   add      dst, src, r0
+  // loop:
+  //   addik    samt, samt, -1
+  //   sra      dst, dst
+  //   bneid    samt, loop
+  //   nop
+  // finish:
+  MachineFunction *F = MBB->getParent();
+  MachineRegisterInfo &R = F->getRegInfo();
+
+  // Create the start and exit basic blocks for the atomic operation
+  MachineBasicBlock *start = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exit = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, start);
+  F->insert(It, exit);
+
+  // Update machine-CFG edges by transferring adding all successors and
+  // remaining instructions from the current block to the new block which
+  // will contain the Phi node for the select.
+  exit->splice(exit->begin(), MBB, llvm::next(MachineBasicBlock::iterator(MI)),
+               MBB->end());
+  exit->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // Add the fallthrough block as its successors.
+  MBB->addSuccessor(start);
+
+  BuildMI(start, dl, TII->get(MBlaze::LWX), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg())
+    .addReg(MBlaze::R0);
+
+  MachineBasicBlock *final = start;
+  unsigned finalReg = 0;
+
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Cannot lower unknown atomic instruction!");
+
+  case MBlaze::SWP32:
+    finalReg = MI->getOperand(2).getReg();
+    start->addSuccessor(exit);
+    start->addSuccessor(start);
+    break;
+
+  case MBlaze::LAN32:
+  case MBlaze::LAX32:
+  case MBlaze::LAO32:
+  case MBlaze::LAD32:
+  case MBlaze::LAS32:
+  case MBlaze::LAA32: {
+    unsigned opcode = 0;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Cannot lower unknown atomic load!");
+    case MBlaze::LAA32: opcode = MBlaze::ADDIK; break;
+    case MBlaze::LAS32: opcode = MBlaze::RSUBIK; break;
+    case MBlaze::LAD32: opcode = MBlaze::AND; break;
+    case MBlaze::LAO32: opcode = MBlaze::OR; break;
+    case MBlaze::LAX32: opcode = MBlaze::XOR; break;
+    case MBlaze::LAN32: opcode = MBlaze::AND; break;
+    }
+
+    finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass);
+    start->addSuccessor(exit);
+    start->addSuccessor(start);
+
+    BuildMI(start, dl, TII->get(opcode), finalReg)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(2).getReg());
+
+    if (MI->getOpcode() == MBlaze::LAN32) {
+      unsigned tmp = finalReg;
+      finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass);
+      BuildMI(start, dl, TII->get(MBlaze::XORI), finalReg)
+        .addReg(tmp)
+        .addImm(-1);
+    }
+    break;
+  }
+
+  case MBlaze::CAS32: {
+    finalReg = MI->getOperand(3).getReg();
+    final = F->CreateMachineBasicBlock(LLVM_BB);
+
+    F->insert(It, final);
+    start->addSuccessor(exit);
+    start->addSuccessor(final);
+    final->addSuccessor(exit);
+    final->addSuccessor(start);
+
+    unsigned CMP = R.createVirtualRegister(&MBlaze::GPRRegClass);
+    BuildMI(start, dl, TII->get(MBlaze::CMP), CMP)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(2).getReg());
+
+    BuildMI(start, dl, TII->get(MBlaze::BNEID))
+      .addReg(CMP)
+      .addMBB(exit);
+
+    final->moveAfter(start);
+    exit->moveAfter(final);
+    break;
+  }
+  }
+
+  unsigned CHK = R.createVirtualRegister(&MBlaze::GPRRegClass);
+  BuildMI(final, dl, TII->get(MBlaze::SWX))
+    .addReg(finalReg)
+    .addReg(MI->getOperand(1).getReg())
+    .addReg(MBlaze::R0);
+
+  BuildMI(final, dl, TII->get(MBlaze::ADDIC), CHK)
+    .addReg(MBlaze::R0)
+    .addImm(0);
+
+  BuildMI(final, dl, TII->get(MBlaze::BNEID))
+    .addReg(CHK)
+    .addMBB(start);
+
+  // The pseudo instruction is no longer needed so remove it
+  MI->eraseFromParent();
+  return exit;
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+//
+
+SDValue MBlazeTargetLowering::LowerSELECT_CC(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Opc;
+
+  SDValue CompareFlag;
+  if (LHS.getValueType() == MVT::i32) {
+    Opc = MBlazeISD::Select_CC;
+    CompareFlag = DAG.getNode(MBlazeISD::ICmp, dl, MVT::i32, LHS, RHS)
+                    .getValue(1);
+  } else {
+    llvm_unreachable("Cannot lower select_cc with unknown type");
+  }
+
+  return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
+                     CompareFlag);
+}
+
+SDValue MBlazeTargetLowering::
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  // FIXME there isn't actually debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
+
+  return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, GA);
+}
+
+SDValue MBlazeTargetLowering::
+LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+  llvm_unreachable("TLS not implemented for MicroBlaze.");
+}
+
+SDValue MBlazeTargetLowering::
+LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  SDValue ResNode;
+  SDValue HiPart;
+  // FIXME there isn't actually debug info here
+  DebugLoc dl = Op.getDebugLoc();
+
+  EVT PtrVT = Op.getValueType();
+  JumpTableSDNode *JT  = cast<JumpTableSDNode>(Op);
+
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 0);
+  return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, JTI);
+}
+
+SDValue MBlazeTargetLowering::
+LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  SDValue ResNode;
+  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+  const Constant *C = N->getConstVal();
+  DebugLoc dl = Op.getDebugLoc();
+
+  SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+                                         N->getOffset(), 0);
+  return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, CP);
+}
+
+SDValue MBlazeTargetLowering::LowerVASTART(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MBlazeFunctionInfo *FuncInfo = MF.getInfo<MBlazeFunctionInfo>();
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                 getPointerTy());
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), dl, FI, Op.getOperand(1),
+                      MachinePointerInfo(SV),
+                      false, false, 0);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeGenCallingConv.inc"
+
+static bool CC_MBlaze_AssignReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                CCValAssign::LocInfo &LocInfo,
+                                ISD::ArgFlagsTy &ArgFlags,
+                                CCState &State) {
+  static const uint16_t ArgRegs[] = {
+    MBlaze::R5, MBlaze::R6, MBlaze::R7,
+    MBlaze::R8, MBlaze::R9, MBlaze::R10
+  };
+
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+  unsigned Reg = State.AllocateReg(ArgRegs, NumArgRegs);
+  if (!Reg) return false;
+
+  unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
+  State.AllocateStack(SizeInBytes, SizeInBytes);
+  State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// LowerCall - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+/// TODO: isVarArg, isTailCall.
+SDValue MBlazeTargetLowering::
+LowerCall(TargetLowering::CallLoweringInfo &CLI,
+          SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // MBlaze does not yet support tail call optimization
+  isTailCall = false;
+
+  // The MBlaze requires stack slots for arguments passed to var arg
+  // functions even if they are passed in registers.
+  bool needsRegArgSlots = isVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(Outs, CC_MBlaze);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  // Variable argument function calls require a minimum of 24-bytes of stack
+  if (isVarArg && NumBytes < 24) NumBytes = 24;
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    MVT RegVT = VA.getLocVT();
+    SDValue Arg = OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+      break;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      // Register can't get to this point...
+      assert(VA.isMemLoc());
+
+      // Since we are alread passing values on the stack we don't
+      // need to worry about creating additional slots for the
+      // values passed via registers.
+      needsRegArgSlots = false;
+
+      // Create the frame index object for this incoming parameter
+      unsigned ArgSize = VA.getValVT().getSizeInBits()/8;
+      unsigned StackLoc = VA.getLocMemOffset() + 4;
+      int FI = MFI->CreateFixedObject(ArgSize, StackLoc, true);
+
+      SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy());
+
+      // emit ISD::STORE whichs stores the
+      // parameter value to a stack Location
+      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                         MachinePointerInfo(),
+                                         false, false, 0));
+    }
+  }
+
+  // If we need to reserve stack space for the arguments passed via registers
+  // then create a fixed stack object at the beginning of the stack.
+  if (needsRegArgSlots && TFI.hasReservedCallFrame(MF))
+    MFI->CreateFixedObject(28,0,true);
+
+  // Transform all store nodes into one single node because all store
+  // nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                getPointerTy(), 0, 0);
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(),
+                                getPointerTy(), 0);
+
+  // MBlazeJmpLink = #chain, #target_address, #opt_in_flags...
+  //             = Chain, Callee, Reg#1, Reg#2, ...
+  //
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+  }
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain  = DAG.getNode(MBlazeISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue MBlazeTargetLowering::
+LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv,
+                bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals) const {
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_MBlaze);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// LowerFormalArguments - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments places on the stack.
+SDValue MBlazeTargetLowering::
+LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                     DebugLoc dl, SelectionDAG &DAG,
+                     SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
+
+  unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF);
+  MBlazeFI->setVarArgsFrameIndex(0);
+
+  // Used with vargs to acumulate store chains.
+  std::vector<SDValue> OutChains;
+
+  // Keep track of the last register used for arguments
+  unsigned ArgRegEnd = 0;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_MBlaze);
+  SDValue StackPtr;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    // Arguments stored on registers
+    if (VA.isRegLoc()) {
+      MVT RegVT = VA.getLocVT();
+      ArgRegEnd = VA.getLocReg();
+      const TargetRegisterClass *RC;
+
+      if (RegVT == MVT::i32)
+        RC = &MBlaze::GPRRegClass;
+      else if (RegVT == MVT::f32)
+        RC = &MBlaze::GPRRegClass;
+      else
+        llvm_unreachable("RegVT not supported by LowerFormalArguments");
+
+      // Transform the arguments stored on
+      // physical registers into virtual ones
+      unsigned Reg = MF.addLiveIn(ArgRegEnd, RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+      // If this is an 8 or 16-bit value, it has been passed promoted
+      // to 32 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size. If if is a floating point value
+      // then convert to the correct type.
+      if (VA.getLocInfo() != CCValAssign::Full) {
+        unsigned Opcode = 0;
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          Opcode = ISD::AssertSext;
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          Opcode = ISD::AssertZext;
+        if (Opcode)
+          ArgValue = DAG.getNode(Opcode, dl, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+      }
+
+      InVals.push_back(ArgValue);
+    } else { // VA.isRegLoc()
+      // sanity check
+      assert(VA.isMemLoc());
+
+      // The last argument is not a register
+      ArgRegEnd = 0;
+
+      // The stack pointer offset is relative to the caller stack frame.
+      // Since the real stack size is unknown here, a negative SPOffset
+      // is used so there's a way to adjust these offsets when the stack
+      // size get known (on EliminateFrameIndex). A dummy SPOffset is
+      // used instead of a direct negative address (which is recorded to
+      // be used on emitPrologue) to avoid mis-calc of the first stack
+      // offset on PEI::calculateFrameObjectOffsets.
+      // Arguments are always 32-bit.
+      unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
+      unsigned StackLoc = VA.getLocMemOffset() + 4;
+      int FI = MFI->CreateFixedObject(ArgSize, 0, true);
+      MBlazeFI->recordLoadArgsFI(FI, -StackLoc);
+      MBlazeFI->recordLiveIn(FI);
+
+      // Create load nodes to retrieve arguments from the stack
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+                                   MachinePointerInfo::getFixedStack(FI),
+                                   false, false, false, 0));
+    }
+  }
+
+  // To meet ABI, when VARARGS are passed on registers, the registers
+  // must have their values written to the caller stack frame. If the last
+  // argument was placed in the stack, there's no need to save any register.
+  if ((isVarArg) && ArgRegEnd) {
+    if (StackPtr.getNode() == 0)
+      StackPtr = DAG.getRegister(StackReg, getPointerTy());
+
+    // The last register argument that must be saved is MBlaze::R10
+    const TargetRegisterClass *RC = &MBlaze::GPRRegClass;
+
+    unsigned Begin = getMBlazeRegisterNumbering(MBlaze::R5);
+    unsigned Start = getMBlazeRegisterNumbering(ArgRegEnd+1);
+    unsigned End   = getMBlazeRegisterNumbering(MBlaze::R10);
+    unsigned StackLoc = Start - Begin + 1;
+
+    for (; Start <= End; ++Start, ++StackLoc) {
+      unsigned Reg = getMBlazeRegisterFromNumbering(Start);
+      unsigned LiveReg = MF.addLiveIn(Reg, RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, LiveReg, MVT::i32);
+
+      int FI = MFI->CreateFixedObject(4, 0, true);
+      MBlazeFI->recordStoreVarArgsFI(FI, -(StackLoc*4));
+      SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
+      OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff,
+                                       MachinePointerInfo(),
+                                       false, false, 0));
+
+      // Record the frame index of the first variable argument
+      // which is a value necessary to VASTART.
+      if (!MBlazeFI->getVarArgsFrameIndex())
+        MBlazeFI->setVarArgsFrameIndex(FI);
+    }
+  }
+
+  // All stores are grouped in one node to allow the matching between
+  // the size of Ins and InVals. This only happens when on varg functions
+  if (!OutChains.empty()) {
+    OutChains.push_back(Chain);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &OutChains[0], OutChains.size());
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+SDValue MBlazeTargetLowering::
+LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+            const SmallVectorImpl<ISD::OutputArg> &Outs,
+            const SmallVectorImpl<SDValue> &OutVals,
+            DebugLoc dl, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analize return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // If this function is using the interrupt_handler calling convention
+  // then use "rtid r14, 0" otherwise use "rtsd r15, 8"
+  unsigned Ret = (CallConv == CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet
+                                                        : MBlazeISD::Ret;
+  unsigned Reg = (CallConv == CallingConv::MBLAZE_INTR) ? MBlaze::R14
+                                                        : MBlaze::R15;
+  RetOps.push_back(DAG.getRegister(Reg, MVT::i32));
+
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             OutVals[i], Flag);
+
+    // guarantee that all emitted copies are
+    // stuck together, avoiding something bad
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(Ret, dl, MVT::Other, &RetOps[0], RetOps.size());
+}
+
+//===----------------------------------------------------------------------===//
+//                           MBlaze Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+MBlazeTargetLowering::ConstraintType MBlazeTargetLowering::
+getConstraintType(const std::string &Constraint) const
+{
+  // MBlaze specific constrainy
+  //
+  // 'd' : An address register. Equivalent to r.
+  // 'y' : Equivalent to r; retained for
+  //       backwards compatibility.
+  // 'f' : Floating Point registers.
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+      default : break;
+      case 'd':
+      case 'y':
+      case 'f':
+        return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+MBlazeTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'd':
+  case 'y':
+    if (type->isIntegerTy())
+      weight = CW_Register;
+    break;
+  case 'f':
+    if (type->isFloatTy())
+      weight = CW_Register;
+    break;
+  }
+  return weight;
+}
+
+/// Given a register class constraint, like 'r', if this corresponds directly
+/// to an LLVM register class, return a register of 0 and the register class
+/// pointer.
+std::pair<unsigned, const TargetRegisterClass*> MBlazeTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      return std::make_pair(0U, &MBlaze::GPRRegClass);
+      // TODO: These can't possibly be right, but match what was in
+      // getRegClassForInlineAsmConstraint.
+    case 'd':
+    case 'y':
+    case 'f':
+      if (VT == MVT::f32)
+        return std::make_pair(0U, &MBlaze::GPRRegClass);
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+bool MBlazeTargetLowering::
+isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The MBlaze target isn't yet aware of offsets.
+  return false;
+}
+
+bool MBlazeTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  return VT != MVT::f32;
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeISelLowering.h b/contrib/llvm/lib/Target/MBlaze/MBlazeISelLowering.h
new file mode 100644
index 000000000000..f6b4095a93dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeISelLowering.h
@@ -0,0 +1,179 @@
+//===-- MBlazeISelLowering.h - MBlaze DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that MBlaze uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBlazeISELLOWERING_H
+#define MBlazeISELLOWERING_H
+
+#include "MBlaze.h"
+#include "MBlazeSubtarget.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace MBlazeCC {
+    enum CC {
+      FIRST = 0,
+      EQ,
+      NE,
+      GT,
+      LT,
+      GE,
+      LE
+    };
+
+    inline static CC getOppositeCondition(CC cc) {
+      switch (cc) {
+      default: llvm_unreachable("Unknown condition code");
+      case EQ: return NE;
+      case NE: return EQ;
+      case GT: return LE;
+      case LT: return GE;
+      case GE: return LT;
+      case LE: return GE;
+      }
+    }
+
+    inline static const char *MBlazeCCToString(CC cc) {
+      switch (cc) {
+      default: llvm_unreachable("Unknown condition code");
+      case EQ: return "eq";
+      case NE: return "ne";
+      case GT: return "gt";
+      case LT: return "lt";
+      case GE: return "ge";
+      case LE: return "le";
+      }
+    }
+  }
+
+  namespace MBlazeISD {
+    enum NodeType {
+      // Start the numbering from where ISD NodeType finishes.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      // Jump and link (call)
+      JmpLink,
+
+      // Handle gp_rel (small data/bss sections) relocation.
+      GPRel,
+
+      // Select CC Pseudo Instruction
+      Select_CC,
+
+      // Wrap up multiple types of instructions
+      Wrap,
+
+      // Integer Compare
+      ICmp,
+
+      // Return from subroutine
+      Ret,
+
+      // Return from interrupt
+      IRet
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+
+  class MBlazeTargetLowering : public TargetLowering  {
+  public:
+    explicit MBlazeTargetLowering(MBlazeTargetMachine &TM);
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    //  DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getSetCCResultType - get the ISD::SETCC result ValueType
+    EVT getSetCCResultType(EVT VT) const;
+
+  private:
+    // Subtarget Info
+    const MBlazeSubtarget *Subtarget;
+
+
+    // Lower Operand helpers
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+
+    // Lower Operand specifics
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl, SelectionDAG &DAG) const;
+
+    virtual MachineBasicBlock*
+      EmitCustomShift(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+    virtual MachineBasicBlock*
+      EmitCustomSelect(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+    virtual MachineBasicBlock*
+            EmitCustomAtomic(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+    virtual MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const;
+
+    // Inline asm support
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const;
+
+    std::pair<unsigned, const TargetRegisterClass*>
+              getRegForInlineAsmConstraint(const std::string &Constraint,
+              EVT VT) const;
+
+    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+  };
+}
+
+#endif // MBlazeISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeInstrFPU.td b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrFPU.td
new file mode 100644
index 000000000000..3f145938728c
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrFPU.td
@@ -0,0 +1,219 @@
+//===-- MBlazeInstrFPU.td - MBlaze FPU Instruction defs ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MBlaze profiles and nodes
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MBlaze Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Memory Access Instructions
+//===----------------------------------------------------------------------===//
+class LoadFM<bits<6> op, string instr_asm, PatFrag OpNode> :
+             TA<op, 0x000, (outs GPR:$dst), (ins memrr:$addr),
+                !strconcat(instr_asm, "   $dst, $addr"),
+                [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IIC_MEMl>;
+
+class LoadFMI<bits<6> op, string instr_asm, PatFrag OpNode> :
+              TB<op, (outs GPR:$dst), (ins memri:$addr),
+                 !strconcat(instr_asm, "   $dst, $addr"),
+                 [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>;
+
+class StoreFM<bits<6> op, string instr_asm, PatFrag OpNode> :
+              TA<op, 0x000, (outs), (ins GPR:$dst, memrr:$addr),
+                 !strconcat(instr_asm, "   $dst, $addr"),
+                 [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIC_MEMs>;
+
+class StoreFMI<bits<6> op, string instr_asm, PatFrag OpNode> :
+               TB<op, (outs), (ins GPR:$dst, memrr:$addr),
+                  !strconcat(instr_asm, "   $dst, $addr"),
+                  [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIC_MEMs>;
+
+class ArithF<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
+             InstrItinClass itin> :
+             TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
+                !strconcat(instr_asm, "   $dst, $b, $c"),
+                [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>;
+
+class CmpFN<bits<6> op, bits<11> flags, string instr_asm,
+            InstrItinClass itin> :
+            TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
+               !strconcat(instr_asm, "   $dst, $b, $c"),
+               [], itin>;
+
+class ArithFR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
+             InstrItinClass itin> :
+             TAR<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
+                 !strconcat(instr_asm, "   $dst, $c, $b"),
+                 [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>;
+
+class LogicFI<bits<6> op, string instr_asm> :
+             TB<op, (outs GPR:$dst), (ins GPR:$b, fimm:$c),
+                !strconcat(instr_asm, "   $dst, $b, $c"),
+                [], IIC_ALU>;
+
+let rb=0 in {
+  class ArithF2<bits<6> op, bits<11> flags, string instr_asm,
+                InstrItinClass itin> :
+                TA<op, flags, (outs GPR:$dst), (ins GPR:$b),
+                   !strconcat(instr_asm, "   $dst, $b"),
+                   [], itin>;
+
+  class ArithIF<bits<6> op, bits<11> flags, string instr_asm,
+                InstrItinClass itin> :
+                TA<op, flags, (outs GPR:$dst), (ins GPR:$b),
+                   !strconcat(instr_asm, "   $dst, $b"),
+                   [], itin>;
+
+  class ArithFI<bits<6> op, bits<11> flags, string instr_asm,
+                InstrItinClass itin> :
+                TA<op, flags, (outs GPR:$dst), (ins GPR:$b),
+                   !strconcat(instr_asm, "   $dst, $b"),
+                   [], itin>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPU Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+let Predicates=[HasFPU] in {
+  def FORI   : LogicFI<0x28, "ori    ">;
+  def FADD   :  ArithF<0x16, 0x000, "fadd   ", fadd, IIC_FPU>;
+  def FRSUB  : ArithFR<0x16, 0x080, "frsub  ", fsub, IIC_FPU>;
+  def FMUL   :  ArithF<0x16, 0x100, "fmul   ", fmul, IIC_FPU>;
+  def FDIV   :  ArithF<0x16, 0x180, "fdiv   ", fdiv, IIC_FPUd>;
+}
+
+let Predicates=[HasFPU], isCodeGenOnly=1 in {
+  def LWF    :   LoadFM<0x32, "lw      ", load>;
+  def LWFI   :  LoadFMI<0x3A, "lwi     ", load>;
+
+  def SWF    :  StoreFM<0x36, "sw      ", store>;
+  def SWFI   : StoreFMI<0x3E, "swi     ", store>;
+}
+
+let Predicates=[HasFPU,HasSqrt] in {
+  def FLT    : ArithIF<0x16, 0x280, "flt    ", IIC_FPUf>;
+  def FINT   : ArithFI<0x16, 0x300, "fint   ", IIC_FPUi>;
+  def FSQRT  : ArithF2<0x16, 0x380, "fsqrt  ", IIC_FPUs>;
+}
+
+let isAsCheapAsAMove = 1 in {
+  def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", IIC_FPUc>;
+  def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIC_FPUc>;
+  def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIC_FPUc>;
+  def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIC_FPUc>;
+  def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIC_FPUc>;
+  def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIC_FPUc>;
+  def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIC_FPUc>;
+}
+
+
+let usesCustomInserter = 1 in {
+  def Select_FCC : MBlazePseudo<(outs GPR:$dst),
+    (ins GPR:$T, GPR:$F, GPR:$CMP, i32imm:$CC),
+    "; SELECT_FCC PSEUDO!",
+    []>;
+}
+
+// Floating point conversions
+let Predicates=[HasFPU] in {
+  def : Pat<(sint_to_fp GPR:$V), (FLT GPR:$V)>;
+  def : Pat<(fp_to_sint GPR:$V), (FINT GPR:$V)>;
+  def : Pat<(fsqrt GPR:$V), (FSQRT GPR:$V)>;
+}
+
+// SET_CC operations
+let Predicates=[HasFPU] in {
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETEQ),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_EQ GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETNE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_EQ GPR:$L, GPR:$R), 1)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOEQ),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_EQ GPR:$L, GPR:$R), 2)>;
+ def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETONE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (XOR (FCMP_UN GPR:$L, GPR:$R),
+                            (FCMP_EQ GPR:$L, GPR:$R)), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETONE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (OR (FCMP_UN GPR:$L, GPR:$R),
+                           (FCMP_EQ GPR:$L, GPR:$R)), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETGT),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_GT GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETLT),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_LT GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETGE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_GE GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETLE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_LE GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOGT),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_GT GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOLT),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_LT GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOGE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_GE GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOLE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_LE GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUEQ),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (OR (FCMP_UN GPR:$L, GPR:$R),
+                           (FCMP_EQ GPR:$L, GPR:$R)), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUNE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_NE GPR:$L, GPR:$R), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUGT),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (OR (FCMP_UN GPR:$L, GPR:$R),
+                           (FCMP_GT GPR:$L, GPR:$R)), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETULT),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (OR (FCMP_UN GPR:$L, GPR:$R),
+                           (FCMP_LT GPR:$L, GPR:$R)), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUGE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (OR (FCMP_UN GPR:$L, GPR:$R),
+                           (FCMP_GE GPR:$L, GPR:$R)), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETULE),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (OR (FCMP_UN GPR:$L, GPR:$R),
+                           (FCMP_LE GPR:$L, GPR:$R)), 2)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETO),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_UN GPR:$L, GPR:$R), 1)>;
+  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUO),
+            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                       (FCMP_UN GPR:$L, GPR:$R), 2)>;
+}
+
+// SELECT operations
+def : Pat<(select (i32 GPR:$C), (f32 GPR:$T), (f32 GPR:$F)),
+          (Select_FCC GPR:$T, GPR:$F, GPR:$C, 2)>;
+
+//===----------------------------------------------------------------------===//
+// Patterns for Floating Point Instructions
+//===----------------------------------------------------------------------===//
+def : Pat<(f32 fpimm:$imm), (FORI (i32 R0), fpimm:$imm)>;
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeInstrFSL.td b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrFSL.td
new file mode 100644
index 000000000000..91b69de05102
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrFSL.td
@@ -0,0 +1,229 @@
+//===-- MBlazeInstrFSL.td - MBlaze FSL Instruction defs ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FSL Instruction Formats
+//===----------------------------------------------------------------------===//
+class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> :
+             MBlazeInst<op, FRCX, (outs GPR:$dst), (ins fslimm:$b),
+                        !strconcat(instr_asm, " $dst, $b"),
+                        [(set GPR:$dst, (OpNode immZExt4:$b))],IIC_FSLg>
+{
+    bits<5> rd;
+    bits<4> fslno;
+
+    let Inst{6-10}  = rd;
+    let Inst{11-15} = 0x0;
+    let Inst{16}    = 0x0;
+    let Inst{17-21} = flags; // NCTAE
+    let Inst{22-27} = 0x0;
+    let Inst{28-31} = fslno;
+}
+
+class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> :
+              MBlazeInst<op, FRCR, (outs GPR:$dst), (ins GPR:$b),
+                         !strconcat(instr_asm, " $dst, $b"),
+                         [(set GPR:$dst, (OpNode GPR:$b))], IIC_FSLg>
+{
+    bits<5> rd;
+    bits<5> rb;
+
+    let Inst{6-10}  = rd;
+    let Inst{11-15} = 0x0;
+    let Inst{16-20} = rb;
+    let Inst{21}    = 0x0;
+    let Inst{22-26} = flags; // NCTAE
+    let Inst{27-31} = 0x0;
+}
+
+class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
+             MBlazeInst<op, FCRCX, (outs), (ins GPR:$v, fslimm:$b),
+                        !strconcat(instr_asm, " $v, $b"),
+                        [(OpNode GPR:$v, immZExt4:$b)], IIC_FSLp>
+{
+    bits<5> ra;
+    bits<4> fslno;
+
+    let Inst{6-10}  = 0x0;
+    let Inst{11-15} = ra;
+    let Inst{16}    = 0x1;
+    let Inst{17-20} = flags; // NCTA
+    let Inst{21-27} = 0x0;
+    let Inst{28-31} = fslno;
+}
+
+class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
+              MBlazeInst<op, FCRR, (outs), (ins GPR:$v, GPR:$b),
+                         !strconcat(instr_asm, " $v, $b"),
+                         [(OpNode GPR:$v, GPR:$b)], IIC_FSLp>
+{
+    bits<5> ra;
+    bits<5> rb;
+
+    let Inst{6-10}  = 0x0;
+    let Inst{11-15} = ra;
+    let Inst{16-20} = rb;
+    let Inst{21}    = 0x1;
+    let Inst{22-25} = flags; // NCTA
+    let Inst{26-31} = 0x0;
+}
+
+class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
+              MBlazeInst<op, FCX, (outs), (ins fslimm:$b),
+                         !strconcat(instr_asm, " $b"),
+                         [(OpNode immZExt4:$b)], IIC_FSLp>
+{
+    bits<4> fslno;
+
+    let Inst{6-10}  = 0x0;
+    let Inst{11-15} = 0x0;
+    let Inst{16}    = 0x1;
+    let Inst{17-20} = flags; // NCTA
+    let Inst{21-27} = 0x0;
+    let Inst{28-31} = fslno;
+}
+
+class FSLPutTD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
+               MBlazeInst<op, FCR, (outs), (ins GPR:$b),
+                          !strconcat(instr_asm, " $b"),
+                          [(OpNode GPR:$b)], IIC_FSLp>
+{
+    bits<5> rb;
+
+    let Inst{6-10}  = 0x0;
+    let Inst{11-15} = 0x0;
+    let Inst{16-20} = rb;
+    let Inst{21}    = 0x1;
+    let Inst{22-25} = flags; // NCTA
+    let Inst{26-31} = 0x0;
+}
+
+//===----------------------------------------------------------------------===//
+// FSL Get Instructions
+//===----------------------------------------------------------------------===//
+def GET      : FSLGet<0x1B, 0x00, "get      ", int_mblaze_fsl_get>;
+def AGET     : FSLGet<0x1B, 0x02, "aget     ", int_mblaze_fsl_aget>;
+def CGET     : FSLGet<0x1B, 0x08, "cget     ", int_mblaze_fsl_cget>;
+def CAGET    : FSLGet<0x1B, 0x0A, "caget    ", int_mblaze_fsl_caget>;
+def EGET     : FSLGet<0x1B, 0x01, "eget     ", int_mblaze_fsl_eget>;
+def EAGET    : FSLGet<0x1B, 0x03, "eaget    ", int_mblaze_fsl_eaget>;
+def ECGET    : FSLGet<0x1B, 0x09, "ecget    ", int_mblaze_fsl_ecget>;
+def ECAGET   : FSLGet<0x1B, 0x0B, "ecaget   ", int_mblaze_fsl_ecaget>;
+def TGET     : FSLGet<0x1B, 0x04, "tget     ", int_mblaze_fsl_tget>;
+def TAGET    : FSLGet<0x1B, 0x06, "taget    ", int_mblaze_fsl_taget>;
+def TCGET    : FSLGet<0x1B, 0x0C, "tcget    ", int_mblaze_fsl_tcget>;
+def TCAGET   : FSLGet<0x1B, 0x0E, "tcaget   ", int_mblaze_fsl_tcaget>;
+def TEGET    : FSLGet<0x1B, 0x05, "teget    ", int_mblaze_fsl_teget>;
+def TEAGET   : FSLGet<0x1B, 0x07, "teaget   ", int_mblaze_fsl_teaget>;
+def TECGET   : FSLGet<0x1B, 0x0D, "tecget   ", int_mblaze_fsl_tecget>;
+def TECAGET  : FSLGet<0x1B, 0x0F, "tecaget  ", int_mblaze_fsl_tecaget>;
+
+let Defs = [CARRY] in {
+  def NGET     : FSLGet<0x1B, 0x10, "nget     ", int_mblaze_fsl_nget>;
+  def NAGET    : FSLGet<0x1B, 0x12, "naget    ", int_mblaze_fsl_naget>;
+  def NCGET    : FSLGet<0x1B, 0x18, "ncget    ", int_mblaze_fsl_ncget>;
+  def NCAGET   : FSLGet<0x1B, 0x1A, "ncaget   ", int_mblaze_fsl_ncaget>;
+  def NEGET    : FSLGet<0x1B, 0x11, "neget    ", int_mblaze_fsl_neget>;
+  def NEAGET   : FSLGet<0x1B, 0x13, "neaget   ", int_mblaze_fsl_neaget>;
+  def NECGET   : FSLGet<0x1B, 0x19, "necget   ", int_mblaze_fsl_necget>;
+  def NECAGET  : FSLGet<0x1B, 0x1B, "necaget  ", int_mblaze_fsl_necaget>;
+  def TNGET    : FSLGet<0x1B, 0x14, "tnget    ", int_mblaze_fsl_tnget>;
+  def TNAGET   : FSLGet<0x1B, 0x16, "tnaget   ", int_mblaze_fsl_tnaget>;
+  def TNCGET   : FSLGet<0x1B, 0x1C, "tncget   ", int_mblaze_fsl_tncget>;
+  def TNCAGET  : FSLGet<0x1B, 0x1E, "tncaget  ", int_mblaze_fsl_tncaget>;
+  def TNEGET   : FSLGet<0x1B, 0x15, "tneget   ", int_mblaze_fsl_tneget>;
+  def TNEAGET  : FSLGet<0x1B, 0x17, "tneaget  ", int_mblaze_fsl_tneaget>;
+  def TNECGET  : FSLGet<0x1B, 0x1D, "tnecget  ", int_mblaze_fsl_tnecget>;
+  def TNECAGET : FSLGet<0x1B, 0x1F, "tnecaget ", int_mblaze_fsl_tnecaget>;
+}
+
+//===----------------------------------------------------------------------===//
+// FSL Dynamic Get Instructions
+//===----------------------------------------------------------------------===//
+def GETD      : FSLGetD<0x13, 0x00, "getd     ", int_mblaze_fsl_get>;
+def AGETD     : FSLGetD<0x13, 0x02, "agetd    ", int_mblaze_fsl_aget>;
+def CGETD     : FSLGetD<0x13, 0x08, "cgetd    ", int_mblaze_fsl_cget>;
+def CAGETD    : FSLGetD<0x13, 0x0A, "cagetd   ", int_mblaze_fsl_caget>;
+def EGETD     : FSLGetD<0x13, 0x01, "egetd    ", int_mblaze_fsl_eget>;
+def EAGETD    : FSLGetD<0x13, 0x03, "eagetd   ", int_mblaze_fsl_eaget>;
+def ECGETD    : FSLGetD<0x13, 0x09, "ecgetd   ", int_mblaze_fsl_ecget>;
+def ECAGETD   : FSLGetD<0x13, 0x0B, "ecagetd  ", int_mblaze_fsl_ecaget>;
+def TGETD     : FSLGetD<0x13, 0x04, "tgetd    ", int_mblaze_fsl_tget>;
+def TAGETD    : FSLGetD<0x13, 0x06, "tagetd   ", int_mblaze_fsl_taget>;
+def TCGETD    : FSLGetD<0x13, 0x0C, "tcgetd   ", int_mblaze_fsl_tcget>;
+def TCAGETD   : FSLGetD<0x13, 0x0E, "tcagetd  ", int_mblaze_fsl_tcaget>;
+def TEGETD    : FSLGetD<0x13, 0x05, "tegetd   ", int_mblaze_fsl_teget>;
+def TEAGETD   : FSLGetD<0x13, 0x07, "teagetd  ", int_mblaze_fsl_teaget>;
+def TECGETD   : FSLGetD<0x13, 0x0D, "tecgetd  ", int_mblaze_fsl_tecget>;
+def TECAGETD  : FSLGetD<0x13, 0x0F, "tecagetd ", int_mblaze_fsl_tecaget>;
+
+let Defs = [CARRY] in {
+  def NGETD     : FSLGetD<0x13, 0x10, "ngetd    ", int_mblaze_fsl_nget>;
+  def NAGETD    : FSLGetD<0x13, 0x12, "nagetd   ", int_mblaze_fsl_naget>;
+  def NCGETD    : FSLGetD<0x13, 0x18, "ncgetd   ", int_mblaze_fsl_ncget>;
+  def NCAGETD   : FSLGetD<0x13, 0x1A, "ncagetd  ", int_mblaze_fsl_ncaget>;
+  def NEGETD    : FSLGetD<0x13, 0x11, "negetd   ", int_mblaze_fsl_neget>;
+  def NEAGETD   : FSLGetD<0x13, 0x13, "neagetd  ", int_mblaze_fsl_neaget>;
+  def NECGETD   : FSLGetD<0x13, 0x19, "necgetd  ", int_mblaze_fsl_necget>;
+  def NECAGETD  : FSLGetD<0x13, 0x1B, "necagetd ", int_mblaze_fsl_necaget>;
+  def TNGETD    : FSLGetD<0x13, 0x14, "tngetd   ", int_mblaze_fsl_tnget>;
+  def TNAGETD   : FSLGetD<0x13, 0x16, "tnagetd  ", int_mblaze_fsl_tnaget>;
+  def TNCGETD   : FSLGetD<0x13, 0x1C, "tncgetd  ", int_mblaze_fsl_tncget>;
+  def TNCAGETD  : FSLGetD<0x13, 0x1E, "tncagetd ", int_mblaze_fsl_tncaget>;
+  def TNEGETD   : FSLGetD<0x13, 0x15, "tnegetd  ", int_mblaze_fsl_tneget>;
+  def TNEAGETD  : FSLGetD<0x13, 0x17, "tneagetd ", int_mblaze_fsl_tneaget>;
+  def TNECGETD  : FSLGetD<0x13, 0x1D, "tnecgetd ", int_mblaze_fsl_tnecget>;
+  def TNECAGETD : FSLGetD<0x13, 0x1F, "tnecagetd", int_mblaze_fsl_tnecaget>;
+}
+
+//===----------------------------------------------------------------------===//
+// FSL Put Instructions
+//===----------------------------------------------------------------------===//
+def PUT     :  FSLPut<0x1B, 0x0, "put      ", int_mblaze_fsl_put>;
+def APUT    :  FSLPut<0x1B, 0x1, "aput     ", int_mblaze_fsl_aput>;
+def CPUT    :  FSLPut<0x1B, 0x4, "cput     ", int_mblaze_fsl_cput>;
+def CAPUT   :  FSLPut<0x1B, 0x5, "caput    ", int_mblaze_fsl_caput>;
+def TPUT    : FSLPutT<0x1B, 0x2, "tput     ", int_mblaze_fsl_tput>;
+def TAPUT   : FSLPutT<0x1B, 0x3, "taput    ", int_mblaze_fsl_taput>;
+def TCPUT   : FSLPutT<0x1B, 0x6, "tcput    ", int_mblaze_fsl_tcput>;
+def TCAPUT  : FSLPutT<0x1B, 0x7, "tcaput   ", int_mblaze_fsl_tcaput>;
+
+let Defs = [CARRY] in {
+  def NPUT    :  FSLPut<0x1B, 0x8, "nput     ", int_mblaze_fsl_nput>;
+  def NAPUT   :  FSLPut<0x1B, 0x9, "naput    ", int_mblaze_fsl_naput>;
+  def NCPUT   :  FSLPut<0x1B, 0xC, "ncput    ", int_mblaze_fsl_ncput>;
+  def NCAPUT  :  FSLPut<0x1B, 0xD, "ncaput   ", int_mblaze_fsl_ncaput>;
+  def TNPUT   : FSLPutT<0x1B, 0xA, "tnput    ", int_mblaze_fsl_tnput>;
+  def TNAPUT  : FSLPutT<0x1B, 0xB, "tnaput   ", int_mblaze_fsl_tnaput>;
+  def TNCPUT  : FSLPutT<0x1B, 0xE, "tncput   ", int_mblaze_fsl_tncput>;
+  def TNCAPUT : FSLPutT<0x1B, 0xF, "tncaput  ", int_mblaze_fsl_tncaput>;
+}
+
+//===----------------------------------------------------------------------===//
+// FSL Dynamic Put Instructions
+//===----------------------------------------------------------------------===//
+def PUTD     :  FSLPutD<0x13, 0x0, "putd     ", int_mblaze_fsl_put>;
+def APUTD    :  FSLPutD<0x13, 0x1, "aputd    ", int_mblaze_fsl_aput>;
+def CPUTD    :  FSLPutD<0x13, 0x4, "cputd    ", int_mblaze_fsl_cput>;
+def CAPUTD   :  FSLPutD<0x13, 0x5, "caputd   ", int_mblaze_fsl_caput>;
+def TPUTD    : FSLPutTD<0x13, 0x2, "tputd    ", int_mblaze_fsl_tput>;
+def TAPUTD   : FSLPutTD<0x13, 0x3, "taputd   ", int_mblaze_fsl_taput>;
+def TCPUTD   : FSLPutTD<0x13, 0x6, "tcputd   ", int_mblaze_fsl_tcput>;
+def TCAPUTD  : FSLPutTD<0x13, 0x7, "tcaputd  ", int_mblaze_fsl_tcaput>;
+
+let Defs = [CARRY] in {
+  def NPUTD    :  FSLPutD<0x13, 0x8, "nputd    ", int_mblaze_fsl_nput>;
+  def NAPUTD   :  FSLPutD<0x13, 0x9, "naputd   ", int_mblaze_fsl_naput>;
+  def NCPUTD   :  FSLPutD<0x13, 0xC, "ncputd   ", int_mblaze_fsl_ncput>;
+  def NCAPUTD  :  FSLPutD<0x13, 0xD, "ncaputd  ", int_mblaze_fsl_ncaput>;
+  def TNPUTD   : FSLPutTD<0x13, 0xA, "tnputd   ", int_mblaze_fsl_tnput>;
+  def TNAPUTD  : FSLPutTD<0x13, 0xB, "tnaputd  ", int_mblaze_fsl_tnaput>;
+  def TNCPUTD  : FSLPutTD<0x13, 0xE, "tncputd  ", int_mblaze_fsl_tncput>;
+  def TNCAPUTD : FSLPutTD<0x13, 0xF, "tncaputd ", int_mblaze_fsl_tncaput>;
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeInstrFormats.td b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrFormats.td
new file mode 100644
index 000000000000..e40432a1b9a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrFormats.td
@@ -0,0 +1,228 @@
+//===-- MBlazeInstrFormats.td - MB Instruction defs --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+      bits<6> Value = val;
+}
+
+def FPseudo : Format<0>;
+def FRRR    : Format<1>;  // ADD, OR, etc.
+def FRRI    : Format<2>;  // ADDI, ORI, etc.
+def FCRR    : Format<3>;  // PUTD, WDC, WIC, BEQ, BNE, BGE, etc.
+def FCRI    : Format<4>;  // RTID, RTED, RTSD, BEQI, BNEI, BGEI, etc.
+def FRCR    : Format<5>;  // BRLD, BRALD, GETD
+def FRCI    : Format<6>;  // BRLID, BRALID, MSRCLR, MSRSET
+def FCCR    : Format<7>;  // BR, BRA, BRD, etc.
+def FCCI    : Format<8>;  // IMM, BRI, BRAI, BRID, etc.
+def FRRCI   : Format<9>;  // BSRLI, BSRAI, BSLLI
+def FRRC    : Format<10>; // SEXT8, SEXT16, SRA, SRC, SRL, FLT, FINT, FSQRT
+def FRCX    : Format<11>; // GET
+def FRCS    : Format<12>; // MFS
+def FCRCS   : Format<13>; // MTS
+def FCRCX   : Format<14>; // PUT
+def FCX     : Format<15>; // TPUT
+def FCR     : Format<16>; // TPUTD
+def FRIR    : Format<17>; // RSUBI
+def FRRRR   : Format<18>; // RSUB, FRSUB
+def FRI     : Format<19>; // RSUB, FRSUB
+def FC      : Format<20>; // NOP
+def FRR     : Format<21>; // CLZ
+
+//===----------------------------------------------------------------------===//
+//  Describe MBlaze instructions format
+//
+//  CPU INSTRUCTION FORMATS
+//
+//  opcode  - operation code.
+//  rd      - dst reg.
+//  ra      - first src. reg.
+//  rb      - second src. reg.
+//  imm16   - 16-bit immediate value.
+//
+//===----------------------------------------------------------------------===//
+
+// Generic MBlaze Format
+class MBlazeInst<bits<6> op, Format form, dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin> : Instruction {
+  let Namespace = "MBlaze";
+  field bits<32> Inst;
+
+  bits<6> opcode = op;
+  Format Form = form;
+  bits<6> FormBits = Form.Value;
+
+  // Top 6 bits are the 'opcode' field
+  let Inst{0-5} = opcode;
+
+  // If the instruction is marked as a pseudo, set isCodeGenOnly so that the
+  // assembler and disassmbler ignore it.
+  let isCodeGenOnly = !eq(!cast<string>(form), "FPseudo");
+
+  dag OutOperandList = outs;
+  dag InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+
+  // TSFlags layout should be kept in sync with MBlazeInstrInfo.h.
+  let TSFlags{5-0}   = FormBits;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instruction class
+//===----------------------------------------------------------------------===//
+class MBlazePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>:
+      MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIC_Pseudo>;
+
+//===----------------------------------------------------------------------===//
+// Type A instruction class in MBlaze : <|opcode|rd|ra|rb|flags|>
+//===----------------------------------------------------------------------===//
+
+class TA<bits<6> op, bits<11> flags, dag outs, dag ins, string asmstr,
+         list<dag> pattern, InstrItinClass itin> :
+         MBlazeInst<op,FRRR,outs, ins, asmstr, pattern, itin>
+{
+  bits<5> rd;
+  bits<5> ra;
+  bits<5> rb;
+
+  let Inst{6-10}  = rd;
+  let Inst{11-15} = ra;
+  let Inst{16-20} = rb;
+  let Inst{21-31} = flags;
+}
+
+//===----------------------------------------------------------------------===//
+// Type B instruction class in MBlaze : <|opcode|rd|ra|immediate|>
+//===----------------------------------------------------------------------===//
+
+class TB<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+         InstrItinClass itin> :
+         MBlazeInst<op, FRRI, outs, ins, asmstr, pattern, itin>
+{
+  bits<5>  rd;
+  bits<5>  ra;
+  bits<16> imm16;
+
+  let Inst{6-10}  = rd;
+  let Inst{11-15} = ra;
+  let Inst{16-31} = imm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Type A instruction class in MBlaze but with the operands reversed
+// in the LLVM DAG : <|opcode|rd|ra|rb|flags|>
+//===----------------------------------------------------------------------===//
+
+class TAR<bits<6> op, bits<11> flags, dag outs, dag ins, string asmstr,
+          list<dag> pattern, InstrItinClass itin> :
+          TA<op, flags, outs, ins, asmstr, pattern, itin>
+{
+  bits<5> rrd;
+  bits<5> rrb;
+  bits<5> rra;
+
+  let Form = FRRRR;
+
+  let rd = rrd;
+  let ra = rra;
+  let rb = rrb;
+}
+
+//===----------------------------------------------------------------------===//
+// Type B instruction class in MBlaze but with the operands reversed in
+// the LLVM DAG : <|opcode|rd|ra|immediate|>
+//===----------------------------------------------------------------------===//
+class TBR<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+         InstrItinClass itin> :
+         TB<op, outs, ins, asmstr, pattern, itin> {
+  bits<5>  rrd;
+  bits<16> rimm16;
+  bits<5>  rra;
+
+  let Form = FRIR;
+
+  let rd = rrd;
+  let ra = rra;
+  let imm16 = rimm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Shift immediate instruction class in MBlaze : <|opcode|rd|ra|immediate|>
+//===----------------------------------------------------------------------===//
+class SHT<bits<6> op, bits<2> flags, dag outs, dag ins, string asmstr,
+          list<dag> pattern, InstrItinClass itin> :
+          MBlazeInst<op, FRRI, outs, ins, asmstr, pattern, itin> {
+  bits<5>  rd;
+  bits<5>  ra;
+  bits<5>  imm5;
+
+  let Inst{6-10}  = rd;
+  let Inst{11-15} = ra;
+  let Inst{16-20} = 0x0;
+  let Inst{21-22} = flags;
+  let Inst{23-26} = 0x0;
+  let Inst{27-31} = imm5;
+}
+
+//===----------------------------------------------------------------------===//
+// Special instruction class in MBlaze : <|opcode|rd|imm14|>
+//===----------------------------------------------------------------------===//
+class SPC<bits<6> op, bits<2> flags, dag outs, dag ins, string asmstr,
+          list<dag> pattern, InstrItinClass itin> :
+          MBlazeInst<op, FRI, outs, ins, asmstr, pattern, itin> {
+  bits<5>  rd;
+  bits<14> imm14;
+
+  let Inst{6-10}  = rd;
+  let Inst{11-15} = 0x0;
+  let Inst{16-17} = flags;
+  let Inst{18-31} = imm14;
+}
+
+//===----------------------------------------------------------------------===//
+// MSR instruction class in MBlaze : <|opcode|rd|imm15|>
+//===----------------------------------------------------------------------===//
+class MSR<bits<6> op, bits<6> flags, dag outs, dag ins, string asmstr,
+          list<dag> pattern, InstrItinClass itin> :
+          MBlazeInst<op, FRI, outs, ins, asmstr, pattern, itin> {
+  bits<5>  rd;
+  bits<15> imm15;
+
+  let Inst{6-10}  = rd;
+  let Inst{11-16} = flags;
+  let Inst{17-31} = imm15;
+}
+
+//===----------------------------------------------------------------------===//
+// TCLZ instruction class in MBlaze : <|opcode|rd|imm15|>
+//===----------------------------------------------------------------------===//
+class TCLZ<bits<6> op, bits<16> flags, dag outs, dag ins, string asmstr,
+           list<dag> pattern, InstrItinClass itin> :
+           MBlazeInst<op, FRR, outs, ins, asmstr, pattern, itin> {
+  bits<5>  rd;
+  bits<5>  ra;
+
+  let Inst{6-10}  = rd;
+  let Inst{11-15}  = ra;
+  let Inst{16-31}  = flags;
+}
+
+//===----------------------------------------------------------------------===//
+// MBAR instruction class in MBlaze : <|opcode|rd|imm15|>
+//===----------------------------------------------------------------------===//
+class MBAR<bits<6> op, bits<26> flags, dag outs, dag ins, string asmstr,
+           list<dag> pattern, InstrItinClass itin> :
+           MBlazeInst<op, FC, outs, ins, asmstr, pattern, itin> {
+  let Inst{6-31}  = flags;
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrInfo.cpp
new file mode 100644
index 000000000000..79449f73f74e
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrInfo.cpp
@@ -0,0 +1,297 @@
+//===-- MBlazeInstrInfo.cpp - MBlaze Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MBlaze implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeInstrInfo.h"
+#include "MBlazeMachineFunction.h"
+#include "MBlazeTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_CTOR
+#include "MBlazeGenInstrInfo.inc"
+
+using namespace llvm;
+
+MBlazeInstrInfo::MBlazeInstrInfo(MBlazeTargetMachine &tm)
+  : MBlazeGenInstrInfo(MBlaze::ADJCALLSTACKDOWN, MBlaze::ADJCALLSTACKUP),
+    TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
+
+static bool isZeroImm(const MachineOperand &op) {
+  return op.isImm() && op.getImm() == 0;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned MBlazeInstrInfo::
+isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const {
+  if (MI->getOpcode() == MBlaze::LWI) {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2)))) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned MBlazeInstrInfo::
+isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const {
+  if (MI->getOpcode() == MBlaze::SWI) {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2)))) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+/// insertNoop - If data hazard condition is found insert the target nop
+/// instruction.
+void MBlazeInstrInfo::
+insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const {
+  DebugLoc DL;
+  BuildMI(MBB, MI, DL, get(MBlaze::NOP));
+}
+
+void MBlazeInstrInfo::
+copyPhysReg(MachineBasicBlock &MBB,
+            MachineBasicBlock::iterator I, DebugLoc DL,
+            unsigned DestReg, unsigned SrcReg,
+            bool KillSrc) const {
+  llvm::BuildMI(MBB, I, DL, get(MBlaze::ADDK), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc)).addReg(MBlaze::R0);
+}
+
+void MBlazeInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  BuildMI(MBB, I, DL, get(MBlaze::SWI)).addReg(SrcReg,getKillRegState(isKill))
+    .addFrameIndex(FI).addImm(0); //.addFrameIndex(FI);
+}
+
+void MBlazeInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  BuildMI(MBB, I, DL, get(MBlaze::LWI), DestReg)
+      .addFrameIndex(FI).addImm(0); //.addFrameIndex(FI);
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+bool MBlazeInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (MBlaze::isUncondBranchOpcode(LastOpc)) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (MBlaze::isCondBranchOpcode(LastOpc)) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with something like BEQID then BRID, handle it.
+  if (MBlaze::isCondBranchOpcode(SecondLastInst->getOpcode()) &&
+      MBlaze::isUncondBranchOpcode(LastInst->getOpcode())) {
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two unconditional branches, handle it.
+  // The second one is not executed, so remove it.
+  if (MBlaze::isUncondBranchOpcode(SecondLastInst->getOpcode()) &&
+      MBlaze::isUncondBranchOpcode(LastInst->getOpcode())) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned MBlazeInstrInfo::
+InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+             MachineBasicBlock *FBB,
+             const SmallVectorImpl<MachineOperand> &Cond,
+             DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "MBlaze branch conditions have two components!");
+
+  unsigned Opc = MBlaze::BRID;
+  if (!Cond.empty())
+    Opc = (unsigned)Cond[0].getImm();
+
+  if (FBB == 0) {
+    if (Cond.empty()) // Unconditional branch
+      BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
+    else              // Conditional branch
+      BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg()).addMBB(TBB);
+    return 1;
+  }
+
+  BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg()).addMBB(TBB);
+  BuildMI(&MBB, DL, get(MBlaze::BRID)).addMBB(FBB);
+  return 2;
+}
+
+unsigned MBlazeInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+
+  if (!MBlaze::isUncondBranchOpcode(I->getOpcode()) &&
+      !MBlaze::isCondBranchOpcode(I->getOpcode()))
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (!MBlaze::isCondBranchOpcode(I->getOpcode()))
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+bool MBlazeInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand>
+                                               &Cond) const {
+  assert(Cond.size() == 2 && "Invalid MBlaze branch opcode!");
+  switch (Cond[0].getImm()) {
+  default:            return true;
+  case MBlaze::BEQ:   Cond[0].setImm(MBlaze::BNE); return false;
+  case MBlaze::BNE:   Cond[0].setImm(MBlaze::BEQ); return false;
+  case MBlaze::BGT:   Cond[0].setImm(MBlaze::BLE); return false;
+  case MBlaze::BGE:   Cond[0].setImm(MBlaze::BLT); return false;
+  case MBlaze::BLT:   Cond[0].setImm(MBlaze::BGE); return false;
+  case MBlaze::BLE:   Cond[0].setImm(MBlaze::BGT); return false;
+  case MBlaze::BEQI:  Cond[0].setImm(MBlaze::BNEI); return false;
+  case MBlaze::BNEI:  Cond[0].setImm(MBlaze::BEQI); return false;
+  case MBlaze::BGTI:  Cond[0].setImm(MBlaze::BLEI); return false;
+  case MBlaze::BGEI:  Cond[0].setImm(MBlaze::BLTI); return false;
+  case MBlaze::BLTI:  Cond[0].setImm(MBlaze::BGEI); return false;
+  case MBlaze::BLEI:  Cond[0].setImm(MBlaze::BGTI); return false;
+  case MBlaze::BEQD:  Cond[0].setImm(MBlaze::BNED); return false;
+  case MBlaze::BNED:  Cond[0].setImm(MBlaze::BEQD); return false;
+  case MBlaze::BGTD:  Cond[0].setImm(MBlaze::BLED); return false;
+  case MBlaze::BGED:  Cond[0].setImm(MBlaze::BLTD); return false;
+  case MBlaze::BLTD:  Cond[0].setImm(MBlaze::BGED); return false;
+  case MBlaze::BLED:  Cond[0].setImm(MBlaze::BGTD); return false;
+  case MBlaze::BEQID: Cond[0].setImm(MBlaze::BNEID); return false;
+  case MBlaze::BNEID: Cond[0].setImm(MBlaze::BEQID); return false;
+  case MBlaze::BGTID: Cond[0].setImm(MBlaze::BLEID); return false;
+  case MBlaze::BGEID: Cond[0].setImm(MBlaze::BLTID); return false;
+  case MBlaze::BLTID: Cond[0].setImm(MBlaze::BGEID); return false;
+  case MBlaze::BLEID: Cond[0].setImm(MBlaze::BGTID); return false;
+  }
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the
+/// the global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+///
+unsigned MBlazeInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  MBlazeFunctionInfo *MBlazeFI = MF->getInfo<MBlazeFunctionInfo>();
+  unsigned GlobalBaseReg = MBlazeFI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Insert the set of GlobalBaseReg into the first MBB of the function
+  MachineBasicBlock &FirstMBB = MF->front();
+  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+
+  GlobalBaseReg = RegInfo.createVirtualRegister(&MBlaze::GPRRegClass);
+  BuildMI(FirstMBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),
+          GlobalBaseReg).addReg(MBlaze::R20);
+  RegInfo.addLiveIn(MBlaze::R20);
+
+  MBlazeFI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeInstrInfo.h b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrInfo.h
new file mode 100644
index 000000000000..5252147b48e6
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrInfo.h
@@ -0,0 +1,240 @@
+//===-- MBlazeInstrInfo.h - MBlaze Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MBlaze implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZEINSTRUCTIONINFO_H
+#define MBLAZEINSTRUCTIONINFO_H
+
+#include "MBlaze.h"
+#include "MBlazeRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "MBlazeGenInstrInfo.inc"
+
+namespace llvm {
+
+namespace MBlaze {
+
+  // MBlaze Branch Codes
+  enum FPBranchCode {
+    BRANCH_F,
+    BRANCH_T,
+    BRANCH_FL,
+    BRANCH_TL,
+    BRANCH_INVALID
+  };
+
+  // MBlaze Condition Codes
+  enum CondCode {
+    // To be used with float branch True
+    FCOND_F,
+    FCOND_UN,
+    FCOND_EQ,
+    FCOND_UEQ,
+    FCOND_OLT,
+    FCOND_ULT,
+    FCOND_OLE,
+    FCOND_ULE,
+    FCOND_SF,
+    FCOND_NGLE,
+    FCOND_SEQ,
+    FCOND_NGL,
+    FCOND_LT,
+    FCOND_NGE,
+    FCOND_LE,
+    FCOND_NGT,
+
+    // To be used with float branch False
+    // This conditions have the same mnemonic as the
+    // above ones, but are used with a branch False;
+    FCOND_T,
+    FCOND_OR,
+    FCOND_NEQ,
+    FCOND_OGL,
+    FCOND_UGE,
+    FCOND_OGE,
+    FCOND_UGT,
+    FCOND_OGT,
+    FCOND_ST,
+    FCOND_GLE,
+    FCOND_SNE,
+    FCOND_GL,
+    FCOND_NLT,
+    FCOND_GE,
+    FCOND_NLE,
+    FCOND_GT,
+
+    // Only integer conditions
+    COND_EQ,
+    COND_GT,
+    COND_GE,
+    COND_LT,
+    COND_LE,
+    COND_NE,
+    COND_INVALID
+  };
+
+  // Turn condition code into conditional branch opcode.
+  inline static unsigned GetCondBranchFromCond(CondCode CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case COND_EQ: return MBlaze::BEQID;
+    case COND_NE: return MBlaze::BNEID;
+    case COND_GT: return MBlaze::BGTID;
+    case COND_GE: return MBlaze::BGEID;
+    case COND_LT: return MBlaze::BLTID;
+    case COND_LE: return MBlaze::BLEID;
+    }
+  }
+
+  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+  /// e.g. turning COND_E to COND_NE.
+  // CondCode GetOppositeBranchCondition(MBlaze::CondCode CC);
+
+  /// MBlazeCCToString - Map each FP condition code to its string
+  inline static const char *MBlazeFCCToString(MBlaze::CondCode CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case FCOND_F:
+    case FCOND_T:   return "f";
+    case FCOND_UN:
+    case FCOND_OR:  return "un";
+    case FCOND_EQ:
+    case FCOND_NEQ: return "eq";
+    case FCOND_UEQ:
+    case FCOND_OGL: return "ueq";
+    case FCOND_OLT:
+    case FCOND_UGE: return "olt";
+    case FCOND_ULT:
+    case FCOND_OGE: return "ult";
+    case FCOND_OLE:
+    case FCOND_UGT: return "ole";
+    case FCOND_ULE:
+    case FCOND_OGT: return "ule";
+    case FCOND_SF:
+    case FCOND_ST:  return "sf";
+    case FCOND_NGLE:
+    case FCOND_GLE: return "ngle";
+    case FCOND_SEQ:
+    case FCOND_SNE: return "seq";
+    case FCOND_NGL:
+    case FCOND_GL:  return "ngl";
+    case FCOND_LT:
+    case FCOND_NLT: return "lt";
+    case FCOND_NGE:
+    case FCOND_GE:  return "ge";
+    case FCOND_LE:
+    case FCOND_NLE: return "nle";
+    case FCOND_NGT:
+    case FCOND_GT:  return "gt";
+    }
+  }
+
+  inline static bool isUncondBranchOpcode(int Opc) {
+    switch (Opc) {
+    default: return false;
+    case MBlaze::BRI:
+    case MBlaze::BRAI:
+    case MBlaze::BRID:
+    case MBlaze::BRAID:
+      return true;
+    }
+  }
+
+  inline static bool isCondBranchOpcode(int Opc) {
+    switch (Opc) {
+    default: return false;
+    case MBlaze::BEQI: case MBlaze::BEQID:
+    case MBlaze::BNEI: case MBlaze::BNEID:
+    case MBlaze::BGTI: case MBlaze::BGTID:
+    case MBlaze::BGEI: case MBlaze::BGEID:
+    case MBlaze::BLTI: case MBlaze::BLTID:
+    case MBlaze::BLEI: case MBlaze::BLEID:
+      return true;
+    }
+  }
+}
+
+class MBlazeInstrInfo : public MBlazeGenInstrInfo {
+  MBlazeTargetMachine &TM;
+  const MBlazeRegisterInfo RI;
+public:
+  explicit MBlazeInstrInfo(MBlazeTargetMachine &TM);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MBlazeRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+
+  /// Branch Analysis
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
+    const;
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+
+  /// Insert nop instruction when hazard condition is found
+  virtual void insertNoop(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI) const;
+
+  /// getGlobalBaseReg - Return a virtual register initialized with the
+  /// the global base register value. Output instructions required to
+  /// initialize the register in the function entry block, if necessary.
+  ///
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeInstrInfo.td b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrInfo.td
new file mode 100644
index 000000000000..f86bc0b0b5a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeInstrInfo.td
@@ -0,0 +1,1052 @@
+//===-- MBlazeInstrInfo.td - MBlaze Instruction defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+include "MBlazeInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// MBlaze type profiles
+//===----------------------------------------------------------------------===//
+
+// def SDTMBlazeSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>]>;
+def SDT_MBlazeRet     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_MBlazeIRet    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_MBlazeJmpLink : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+def SDT_MBCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MBCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+//===----------------------------------------------------------------------===//
+// MBlaze specific nodes
+//===----------------------------------------------------------------------===//
+
+def MBlazeRet     : SDNode<"MBlazeISD::Ret", SDT_MBlazeRet,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def MBlazeIRet    : SDNode<"MBlazeISD::IRet", SDT_MBlazeIRet,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def MBlazeJmpLink : SDNode<"MBlazeISD::JmpLink",SDT_MBlazeJmpLink,
+                           [SDNPHasChain,SDNPOptInGlue,SDNPOutGlue,
+                            SDNPVariadic]>;
+
+def MBWrapper   : SDNode<"MBlazeISD::Wrap", SDTIntUnaryOp>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MBCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+
+def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MBCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+//===----------------------------------------------------------------------===//
+// MBlaze Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+// def HasPipe3     : Predicate<"Subtarget.hasPipe3()">;
+def HasBarrel    : Predicate<"Subtarget.hasBarrel()">;
+// def NoBarrel     : Predicate<"!Subtarget.hasBarrel()">;
+def HasDiv       : Predicate<"Subtarget.hasDiv()">;
+def HasMul       : Predicate<"Subtarget.hasMul()">;
+// def HasFSL       : Predicate<"Subtarget.hasFSL()">;
+// def HasEFSL      : Predicate<"Subtarget.hasEFSL()">;
+// def HasMSRSet    : Predicate<"Subtarget.hasMSRSet()">;
+// def HasException : Predicate<"Subtarget.hasException()">;
+def HasPatCmp    : Predicate<"Subtarget.hasPatCmp()">;
+def HasFPU       : Predicate<"Subtarget.hasFPU()">;
+// def HasESR       : Predicate<"Subtarget.hasESR()">;
+// def HasPVR       : Predicate<"Subtarget.hasPVR()">;
+def HasMul64     : Predicate<"Subtarget.hasMul64()">;
+def HasSqrt      : Predicate<"Subtarget.hasSqrt()">;
+// def HasMMU       : Predicate<"Subtarget.hasMMU()">;
+
+//===----------------------------------------------------------------------===//
+// MBlaze Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+def MBlazeMemAsmOperand : AsmOperandClass {
+  let Name = "Mem";
+  let SuperClasses = [];
+}
+
+def MBlazeFslAsmOperand : AsmOperandClass {
+  let Name = "Fsl";
+  let SuperClasses = [];
+}
+
+// Instruction operand types
+def brtarget    : Operand<OtherVT>;
+def calltarget  : Operand<i32>;
+def simm16      : Operand<i32>;
+def uimm5       : Operand<i32>;
+def uimm15      : Operand<i32>;
+def fimm        : Operand<f32>;
+
+// Unsigned Operand
+def uimm16      : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+// FSL Operand
+def fslimm      : Operand<i32> {
+  let PrintMethod = "printFSLImm";
+  let ParserMatchClass = MBlazeFslAsmOperand;
+}
+
+// Address operand
+def memri : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPR, simm16);
+  let ParserMatchClass = MBlazeMemAsmOperand;
+}
+
+def memrr : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPR, GPR);
+  let ParserMatchClass = MBlazeMemAsmOperand;
+}
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+def immSExt16  : PatLeaf<(imm), [{
+  return (N->getZExtValue() >> 16) == 0;
+}]>;
+
+// Node immediate fits as 16-bit zero extended on target immediate.
+// The LO16 param means that only the lower 16 bits of the node
+// immediate are caught.
+// e.g. addiu, sltiu
+def immZExt16  : PatLeaf<(imm), [{
+  return (N->getZExtValue() >> 16) == 0;
+}]>;
+
+// FSL immediate field must fit in 4 bits.
+def immZExt4 : PatLeaf<(imm), [{
+  return N->getZExtValue() == ((N->getZExtValue()) & 0xf) ;
+}]>;
+
+// shamt field must fit in 5 bits.
+def immZExt5 : PatLeaf<(imm), [{
+  return N->getZExtValue() == ((N->getZExtValue()) & 0x1f) ;
+}]>;
+
+// MBlaze Address Mode. SDNode frameindex could possibily be a match
+// since load and store instructions from stack used it.
+def iaddr : ComplexPattern<i32, 2, "SelectAddrRegImm", [frameindex], []>;
+def xaddr : ComplexPattern<i32, 2, "SelectAddrRegReg", [], []>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+// As stack alignment is always done with addiu, we need a 16-bit immediate
+let Defs = [R1], Uses = [R1] in {
+def ADJCALLSTACKDOWN : MBlazePseudo<(outs), (ins simm16:$amt),
+                                  "#ADJCALLSTACKDOWN $amt",
+                                  [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : MBlazePseudo<(outs),
+                                  (ins uimm16:$amt1, simm16:$amt2),
+                                  "#ADJCALLSTACKUP $amt1",
+                                  [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+class Arith<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
+            InstrItinClass itin> :
+            TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
+               !strconcat(instr_asm, "   $dst, $b, $c"),
+               [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>;
+
+class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
+             Operand Od, PatLeaf imm_type> :
+             TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c),
+                !strconcat(instr_asm, "   $dst, $b, $c"),
+                [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_ALU>;
+
+class ArithI32<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
+               TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c),
+                  !strconcat(instr_asm, "   $dst, $b, $c"),
+                  [], IIC_ALU>;
+
+class ShiftI<bits<6> op, bits<2> flags, string instr_asm, SDNode OpNode,
+             Operand Od, PatLeaf imm_type> :
+             SHT<op, flags, (outs GPR:$dst), (ins GPR:$b, Od:$c),
+                 !strconcat(instr_asm, "   $dst, $b, $c"),
+                 [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_SHT>;
+
+class ArithR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
+            InstrItinClass itin> :
+            TAR<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
+                !strconcat(instr_asm, "   $dst, $c, $b"),
+                [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>;
+
+class ArithRI<bits<6> op, string instr_asm, SDNode OpNode,
+             Operand Od, PatLeaf imm_type> :
+             TBR<op, (outs GPR:$dst), (ins Od:$b, GPR:$c),
+                 !strconcat(instr_asm, "   $dst, $c, $b"),
+                 [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIC_ALU>;
+
+class ArithN<bits<6> op, bits<11> flags, string instr_asm,
+            InstrItinClass itin> :
+            TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
+               !strconcat(instr_asm, "   $dst, $b, $c"),
+               [], itin>;
+
+class ArithNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
+             TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c),
+                !strconcat(instr_asm, "   $dst, $b, $c"),
+                [], IIC_ALU>;
+
+class ArithRN<bits<6> op, bits<11> flags, string instr_asm,
+            InstrItinClass itin> :
+            TAR<op, flags, (outs GPR:$dst), (ins GPR:$c, GPR:$b),
+                !strconcat(instr_asm, "   $dst, $b, $c"),
+                [], itin>;
+
+class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
+             TBR<op, (outs GPR:$dst), (ins Od:$c, GPR:$b),
+                 !strconcat(instr_asm, "   $dst, $b, $c"),
+                 [], IIC_ALU>;
+
+//===----------------------------------------------------------------------===//
+// Misc Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+class Logic<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode> :
+            TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
+               !strconcat(instr_asm, "   $dst, $b, $c"),
+               [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIC_ALU>;
+
+class LogicI<bits<6> op, string instr_asm, SDNode OpNode> :
+             TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c),
+                !strconcat(instr_asm, "   $dst, $b, $c"),
+                [(set GPR:$dst, (OpNode GPR:$b, immZExt16:$c))],
+                IIC_ALU>;
+
+class LogicI32<bits<6> op, string instr_asm> :
+               TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c),
+                  !strconcat(instr_asm, "   $dst, $b, $c"),
+                  [], IIC_ALU>;
+
+class PatCmp<bits<6> op, bits<11> flags, string instr_asm> :
+             TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
+                !strconcat(instr_asm, "   $dst, $b, $c"),
+                 [], IIC_ALU>;
+
+//===----------------------------------------------------------------------===//
+// Memory Access Instructions
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1 in {
+class LoadM<bits<6> op, bits<11> flags, string instr_asm> :
+            TA<op, flags, (outs GPR:$dst), (ins memrr:$addr),
+               !strconcat(instr_asm, "   $dst, $addr"),
+               [], IIC_MEMl>;
+}
+
+class LoadMI<bits<6> op, string instr_asm, PatFrag OpNode> :
+             TB<op, (outs GPR:$dst), (ins memri:$addr),
+                !strconcat(instr_asm, "   $dst, $addr"),
+                [(set (i32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>;
+
+let mayStore = 1 in {
+class StoreM<bits<6> op, bits<11> flags, string instr_asm> :
+             TA<op, flags, (outs), (ins GPR:$dst, memrr:$addr),
+                !strconcat(instr_asm, "   $dst, $addr"),
+                [], IIC_MEMs>;
+}
+
+class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> :
+              TB<op, (outs), (ins GPR:$dst, memri:$addr),
+                 !strconcat(instr_asm, "   $dst, $addr"),
+                 [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIC_MEMs>;
+
+//===----------------------------------------------------------------------===//
+// Branch Instructions
+//===----------------------------------------------------------------------===//
+class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
+             TA<op, flags, (outs), (ins GPR:$target),
+                !strconcat(instr_asm, "   $target"),
+                [], IIC_BR> {
+  let rd = 0x0;
+  let ra = br;
+  let Form = FCCR;
+}
+
+class BranchI<bits<6> op, bits<5> br, string instr_asm> :
+              TB<op, (outs), (ins brtarget:$target),
+                 !strconcat(instr_asm, "   $target"),
+                 [], IIC_BR> {
+  let rd = 0;
+  let ra = br;
+  let Form = FCCI;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch and Link Instructions
+//===----------------------------------------------------------------------===//
+class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
+              TA<op, flags, (outs), (ins GPR:$link, GPR:$target),
+                 !strconcat(instr_asm, "   $link, $target"),
+                 [], IIC_BRl> {
+  let ra = br;
+  let Form = FRCR;
+}
+
+class BranchLI<bits<6> op, bits<5> br, string instr_asm> :
+               TB<op, (outs), (ins GPR:$link, calltarget:$target),
+                  !strconcat(instr_asm, "   $link, $target"),
+                  [], IIC_BRl> {
+  let ra = br;
+  let Form = FRCI;
+}
+
+//===----------------------------------------------------------------------===//
+// Conditional Branch Instructions
+//===----------------------------------------------------------------------===//
+class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
+              TA<op, flags, (outs),
+                 (ins GPR:$a, GPR:$b),
+                 !strconcat(instr_asm, "   $a, $b"),
+                 [], IIC_BRc> {
+  let rd = br;
+  let Form = FCRR;
+}
+
+class BranchCI<bits<6> op, bits<5> br, string instr_asm> :
+               TB<op, (outs), (ins GPR:$a, brtarget:$offset),
+                  !strconcat(instr_asm, "   $a, $offset"),
+                  [], IIC_BRc> {
+  let rd = br;
+  let Form = FCRI;
+}
+
+//===----------------------------------------------------------------------===//
+// MBlaze arithmetic instructions
+//===----------------------------------------------------------------------===//
+
+let isCommutable = 1, isAsCheapAsAMove = 1 in {
+  def ADDK   :  Arith<0x04, 0x000, "addk   ", add,  IIC_ALU>;
+  def AND    :  Logic<0x21, 0x000, "and    ", and>;
+  def OR     :  Logic<0x20, 0x000, "or     ", or>;
+  def XOR    :  Logic<0x22, 0x000, "xor    ", xor>;
+
+  let Predicates=[HasPatCmp] in {
+    def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">;
+    def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">;
+    def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">;
+  }
+
+  let Defs = [CARRY] in {
+    def ADD    :  Arith<0x00, 0x000, "add    ", addc, IIC_ALU>;
+
+    let Uses = [CARRY] in {
+      def ADDC   :  Arith<0x02, 0x000, "addc   ", adde, IIC_ALU>;
+    }
+  }
+
+  let Uses = [CARRY] in {
+    def ADDKC  : ArithN<0x06, 0x000, "addkc  ", IIC_ALU>;
+  }
+}
+
+let isAsCheapAsAMove = 1 in {
+  def ANDN   :  ArithN<0x23, 0x000, "andn   ", IIC_ALU>;
+  def CMP    :  ArithN<0x05, 0x001, "cmp    ", IIC_ALU>;
+  def CMPU   :  ArithN<0x05, 0x003, "cmpu   ", IIC_ALU>;
+  def RSUBK  :  ArithR<0x05, 0x000, "rsubk  ", sub,  IIC_ALU>;
+
+  let Defs = [CARRY] in {
+    def RSUB   :  ArithR<0x01, 0x000, "rsub   ", subc, IIC_ALU>;
+
+    let Uses = [CARRY] in {
+      def RSUBC  :  ArithR<0x03, 0x000, "rsubc  ", sube, IIC_ALU>;
+    }
+  }
+
+  let Uses = [CARRY] in {
+    def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIC_ALU>;
+  }
+}
+
+let isCommutable = 1, Predicates=[HasMul] in {
+  def MUL    : Arith<0x10, 0x000, "mul    ", mul,   IIC_ALUm>;
+}
+
+let isCommutable = 1, Predicates=[HasMul,HasMul64] in {
+  def MULH   : Arith<0x10, 0x001, "mulh   ", mulhs, IIC_ALUm>;
+  def MULHU  : Arith<0x10, 0x003, "mulhu  ", mulhu, IIC_ALUm>;
+}
+
+let Predicates=[HasMul,HasMul64] in {
+  def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIC_ALUm>;
+}
+
+let Predicates=[HasBarrel] in {
+  def BSRL   :   Arith<0x11, 0x000, "bsrl   ", srl, IIC_SHT>;
+  def BSRA   :   Arith<0x11, 0x200, "bsra   ", sra, IIC_SHT>;
+  def BSLL   :   Arith<0x11, 0x400, "bsll   ", shl, IIC_SHT>;
+  def BSRLI  :  ShiftI<0x19, 0x0, "bsrli  ", srl, uimm5, immZExt5>;
+  def BSRAI  :  ShiftI<0x19, 0x1, "bsrai  ", sra, uimm5, immZExt5>;
+  def BSLLI  :  ShiftI<0x19, 0x2, "bslli  ", shl, uimm5, immZExt5>;
+}
+
+let Predicates=[HasDiv] in {
+  def IDIV   :  ArithR<0x12, 0x000, "idiv   ", sdiv, IIC_ALUd>;
+  def IDIVU  :  ArithR<0x12, 0x002, "idivu  ", udiv, IIC_ALUd>;
+}
+
+//===----------------------------------------------------------------------===//
+// MBlaze immediate mode arithmetic instructions
+//===----------------------------------------------------------------------===//
+
+let isAsCheapAsAMove = 1 in {
+  def ADDIK   :   ArithI<0x0C, "addik  ", add,  simm16, immSExt16>;
+  def RSUBIK  :  ArithRI<0x0D, "rsubik ", sub, simm16, immSExt16>;
+  def ANDNI   :  ArithNI<0x2B, "andni  ", uimm16, immZExt16>;
+  def ANDI    :   LogicI<0x29, "andi   ", and>;
+  def ORI     :   LogicI<0x28, "ori    ", or>;
+  def XORI    :   LogicI<0x2A, "xori   ", xor>;
+
+  let Defs = [CARRY] in {
+    def ADDI    :   ArithI<0x08, "addi   ", addc, simm16, immSExt16>;
+    def RSUBI   :  ArithRI<0x09, "rsubi  ", subc,  simm16, immSExt16>;
+
+    let Uses = [CARRY] in {
+      def ADDIC   :   ArithI<0x0A, "addic  ", adde, simm16, immSExt16>;
+      def RSUBIC  :  ArithRI<0x0B, "rsubic ", sube, simm16, immSExt16>;
+    }
+  }
+
+  let Uses = [CARRY] in {
+    def ADDIKC  :  ArithNI<0x0E, "addikc ", simm16, immSExt16>;
+    def RSUBIKC : ArithRNI<0x0F, "rsubikc", simm16, immSExt16>;
+  }
+}
+
+let Predicates=[HasMul] in {
+  def MULI    :   ArithI<0x18, "muli   ", mul, simm16, immSExt16>;
+}
+
+//===----------------------------------------------------------------------===//
+// MBlaze memory access instructions
+//===----------------------------------------------------------------------===//
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+  let neverHasSideEffects = 1 in {
+    def LBU  :  LoadM<0x30, 0x000, "lbu    ">;
+    def LBUR :  LoadM<0x30, 0x200, "lbur   ">;
+
+    def LHU  :  LoadM<0x31, 0x000, "lhu    ">;
+    def LHUR :  LoadM<0x31, 0x200, "lhur   ">;
+
+    def LW   :  LoadM<0x32, 0x000, "lw     ">;
+    def LWR  :  LoadM<0x32, 0x200, "lwr    ">;
+
+    let Defs = [CARRY] in {
+      def LWX  :  LoadM<0x32, 0x400, "lwx    ">;
+    }
+  }
+
+  def LBUI : LoadMI<0x38, "lbui   ", zextloadi8>;
+  def LHUI : LoadMI<0x39, "lhui   ", zextloadi16>;
+  def LWI  : LoadMI<0x3A, "lwi    ", load>;
+}
+
+def SB  :  StoreM<0x34, 0x000, "sb     ">;
+def SBR :  StoreM<0x34, 0x200, "sbr    ">;
+
+def SH  :  StoreM<0x35, 0x000, "sh     ">;
+def SHR :  StoreM<0x35, 0x200, "shr    ">;
+
+def SW  :  StoreM<0x36, 0x000, "sw     ">;
+def SWR :  StoreM<0x36, 0x200, "swr    ">;
+
+let Defs = [CARRY] in {
+  def SWX :  StoreM<0x36, 0x400, "swx    ">;
+}
+
+def SBI : StoreMI<0x3C, "sbi    ", truncstorei8>;
+def SHI : StoreMI<0x3D, "shi    ", truncstorei16>;
+def SWI : StoreMI<0x3E, "swi    ", store>;
+
+//===----------------------------------------------------------------------===//
+// MBlaze branch instructions
+//===----------------------------------------------------------------------===//
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+  def BRI    :  BranchI<0x2E, 0x00, "bri    ">;
+  def BRAI   :  BranchI<0x2E, 0x08, "brai   ">;
+}
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+  def BEQI   : BranchCI<0x2F, 0x00, "beqi   ">;
+  def BNEI   : BranchCI<0x2F, 0x01, "bnei   ">;
+  def BLTI   : BranchCI<0x2F, 0x02, "blti   ">;
+  def BLEI   : BranchCI<0x2F, 0x03, "blei   ">;
+  def BGTI   : BranchCI<0x2F, 0x04, "bgti   ">;
+  def BGEI   : BranchCI<0x2F, 0x05, "bgei   ">;
+}
+
+let isBranch = 1, isIndirectBranch = 1, isTerminator = 1, hasCtrlDep = 1,
+    isBarrier = 1 in {
+  def BR     :   Branch<0x26, 0x00, 0x000, "br     ">;
+  def BRA    :   Branch<0x26, 0x08, 0x000, "bra    ">;
+}
+
+let isBranch = 1, isIndirectBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+  def BEQ    :  BranchC<0x27, 0x00, 0x000, "beq    ">;
+  def BNE    :  BranchC<0x27, 0x01, 0x000, "bne    ">;
+  def BLT    :  BranchC<0x27, 0x02, 0x000, "blt    ">;
+  def BLE    :  BranchC<0x27, 0x03, 0x000, "ble    ">;
+  def BGT    :  BranchC<0x27, 0x04, 0x000, "bgt    ">;
+  def BGE    :  BranchC<0x27, 0x05, 0x000, "bge    ">;
+}
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1, hasCtrlDep = 1,
+    isBarrier = 1 in {
+  def BRID   :  BranchI<0x2E, 0x10, "brid   ">;
+  def BRAID  :  BranchI<0x2E, 0x18, "braid  ">;
+}
+
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 1, hasCtrlDep = 1 in {
+  def BEQID  : BranchCI<0x2F, 0x10, "beqid  ">;
+  def BNEID  : BranchCI<0x2F, 0x11, "bneid  ">;
+  def BLTID  : BranchCI<0x2F, 0x12, "bltid  ">;
+  def BLEID  : BranchCI<0x2F, 0x13, "bleid  ">;
+  def BGTID  : BranchCI<0x2F, 0x14, "bgtid  ">;
+  def BGEID  : BranchCI<0x2F, 0x15, "bgeid  ">;
+}
+
+let isBranch = 1, isIndirectBranch = 1, isTerminator = 1,
+    hasDelaySlot = 1, hasCtrlDep = 1, isBarrier = 1 in {
+  def BRD    :   Branch<0x26, 0x10, 0x000, "brd    ">;
+  def BRAD   :   Branch<0x26, 0x18, 0x000, "brad   ">;
+}
+
+let isBranch = 1, isIndirectBranch = 1, isTerminator = 1,
+    hasDelaySlot = 1, hasCtrlDep = 1 in {
+  def BEQD   :  BranchC<0x27, 0x10, 0x000, "beqd   ">;
+  def BNED   :  BranchC<0x27, 0x11, 0x000, "bned   ">;
+  def BLTD   :  BranchC<0x27, 0x12, 0x000, "bltd   ">;
+  def BLED   :  BranchC<0x27, 0x13, 0x000, "bled   ">;
+  def BGTD   :  BranchC<0x27, 0x14, 0x000, "bgtd   ">;
+  def BGED   :  BranchC<0x27, 0x15, 0x000, "bged   ">;
+}
+
+let isCall =1, hasDelaySlot = 1,
+    Defs = [R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,CARRY],
+    Uses = [R1] in {
+  def BRLID  : BranchLI<0x2E, 0x14, "brlid  ">;
+  def BRALID : BranchLI<0x2E, 0x1C, "bralid ">;
+}
+
+let isCall = 1, hasDelaySlot = 1,
+    Defs = [R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,CARRY],
+    Uses = [R1] in {
+  def BRLD   : BranchL<0x26, 0x14, 0x000, "brld   ">;
+  def BRALD  : BranchL<0x26, 0x1C, 0x000, "brald  ">;
+}
+
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
+    rd=0x10, Form=FCRI in {
+  def RTSD   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
+                  "rtsd      $target, $imm",
+                  [],
+                  IIC_BR>;
+}
+
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
+    rd=0x11, Form=FCRI in {
+  def RTID   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
+                  "rtid      $target, $imm",
+                  [],
+                  IIC_BR>;
+}
+
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
+    rd=0x12, Form=FCRI in {
+  def RTBD   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
+                  "rtbd      $target, $imm",
+                  [],
+                  IIC_BR>;
+}
+
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
+    rd=0x14, Form=FCRI in {
+  def RTED   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
+                  "rted      $target, $imm",
+                  [],
+                  IIC_BR>;
+}
+
+//===----------------------------------------------------------------------===//
+// MBlaze misc instructions
+//===----------------------------------------------------------------------===//
+
+let neverHasSideEffects = 1 in {
+  def NOP :  MBlazeInst<0x20, FC, (outs), (ins), "nop    ", [], IIC_ALU>;
+}
+
+let Predicates=[HasPatCmp] in {
+  def CLZ :  TCLZ<0x24, 0x00E0, (outs GPR:$dst), (ins GPR:$src),
+                  "clz    $dst, $src", [], IIC_ALU>;
+}
+
+def IMEMBAR  : MBAR<0x2E, 0x0420004, (outs), (ins), "mbar   2", [], IIC_ALU>;
+def DMEMBAR  : MBAR<0x2E, 0x0220004, (outs), (ins), "mbar   1", [], IIC_ALU>;
+def IDMEMBAR : MBAR<0x2E, 0x0020004, (outs), (ins), "mbar   0", [], IIC_ALU>;
+
+let usesCustomInserter = 1 in {
+  def Select_CC : MBlazePseudo<(outs GPR:$dst),
+    (ins GPR:$T, GPR:$F, GPR:$CMP, i32imm:$CC), // F T reversed
+    "; SELECT_CC PSEUDO!",
+    []>;
+
+  def ShiftL : MBlazePseudo<(outs GPR:$dst),
+    (ins GPR:$L, GPR:$R),
+    "; ShiftL PSEUDO!",
+    []>;
+
+  def ShiftRA : MBlazePseudo<(outs GPR:$dst),
+    (ins GPR:$L, GPR:$R),
+    "; ShiftRA PSEUDO!",
+    []>;
+
+  def ShiftRL : MBlazePseudo<(outs GPR:$dst),
+    (ins GPR:$L, GPR:$R),
+    "; ShiftRL PSEUDO!",
+    []>;
+}
+
+let rb = 0 in {
+  def SEXT16 : TA<0x24, 0x061, (outs GPR:$dst), (ins GPR:$src),
+                  "sext16    $dst, $src", [], IIC_ALU>;
+  def SEXT8  : TA<0x24, 0x060, (outs GPR:$dst), (ins GPR:$src),
+                  "sext8     $dst, $src", [], IIC_ALU>;
+  let Defs = [CARRY] in {
+    def SRL    : TA<0x24, 0x041, (outs GPR:$dst), (ins GPR:$src),
+                    "srl       $dst, $src", [], IIC_ALU>;
+    def SRA    : TA<0x24, 0x001, (outs GPR:$dst), (ins GPR:$src),
+                    "sra       $dst, $src", [], IIC_ALU>;
+    let Uses = [CARRY] in {
+      def SRC    : TA<0x24, 0x021, (outs GPR:$dst), (ins GPR:$src),
+                      "src       $dst, $src", [], IIC_ALU>;
+    }
+  }
+}
+
+let isCodeGenOnly=1 in {
+  def ADDIK32 : ArithI32<0x08, "addik  ", simm16, immSExt16>;
+  def ORI32   : LogicI32<0x28, "ori    ">;
+  def BRLID32 : BranchLI<0x2E, 0x14, "brlid  ">;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc. instructions
+//===----------------------------------------------------------------------===//
+let Form=FRCS in {
+  def MFS : SPC<0x25, 0x2, (outs GPR:$dst), (ins SPR:$src),
+                "mfs       $dst, $src", [], IIC_ALU>;
+}
+
+let Form=FCRCS in {
+  def MTS : SPC<0x25, 0x3, (outs SPR:$dst), (ins GPR:$src),
+                "mts       $dst, $src", [], IIC_ALU>;
+}
+
+def MSRSET : MSR<0x25, 0x20, (outs GPR:$dst), (ins uimm15:$set),
+                 "msrset    $dst, $set", [], IIC_ALU>;
+
+def MSRCLR : MSR<0x25, 0x22, (outs GPR:$dst), (ins uimm15:$clr),
+                 "msrclr    $dst, $clr", [], IIC_ALU>;
+
+let rd=0x0, Form=FCRR in {
+  def WDC  : TA<0x24, 0x64, (outs), (ins GPR:$a, GPR:$b),
+                "wdc       $a, $b", [], IIC_WDC>;
+  def WDCF : TA<0x24, 0x74, (outs), (ins GPR:$a, GPR:$b),
+                "wdc.flush $a, $b", [], IIC_WDC>;
+  def WDCC : TA<0x24, 0x66, (outs), (ins GPR:$a, GPR:$b),
+                "wdc.clear $a, $b", [], IIC_WDC>;
+  def WIC  : TA<0x24, 0x68, (outs), (ins GPR:$a, GPR:$b),
+                "wic       $a, $b", [], IIC_WDC>;
+}
+
+def BRK  :  BranchL<0x26, 0x0C, 0x000, "brk    ">;
+def BRKI : BranchLI<0x2E, 0x0C, "brki   ">;
+
+def IMM : MBlazeInst<0x2C, FCCI, (outs), (ins simm16:$imm),
+                     "imm       $imm", [], IIC_ALU>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions for atomic operations
+//===----------------------------------------------------------------------===//
+let usesCustomInserter=1 in {
+  def CAS32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$cmp, GPR:$swp),
+    "# atomic compare and swap",
+    [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$cmp, GPR:$swp))]>;
+
+  def SWP32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$swp),
+    "# atomic swap",
+    [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$swp))]>;
+
+  def LAA32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
+    "# atomic load and add",
+    [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$val))]>;
+
+  def LAS32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
+    "# atomic load and sub",
+    [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$val))]>;
+
+  def LAD32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
+    "# atomic load and and",
+    [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$val))]>;
+
+  def LAO32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
+    "# atomic load and or",
+    [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$val))]>;
+
+  def LAX32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
+    "# atomic load and xor",
+    [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$val))]>;
+
+  def LAN32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
+    "# atomic load and nand",
+    [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$val))]>;
+
+  def MEMBARRIER : MBlazePseudo<(outs), (ins),
+    "# memory barrier",
+    [(membarrier (i32 imm), (i32 imm), (i32 imm), (i32 imm), (i32 imm))]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Small immediates
+def : Pat<(i32 0), (ADDK (i32 R0), (i32 R0))>;
+def : Pat<(i32 immSExt16:$imm), (ADDIK (i32 R0), imm:$imm)>;
+def : Pat<(i32 immZExt16:$imm), (ORI (i32 R0), imm:$imm)>;
+
+// Arbitrary immediates
+def : Pat<(i32 imm:$imm), (ADDIK (i32 R0), imm:$imm)>;
+
+// In register sign extension
+def : Pat<(sext_inreg GPR:$src, i16), (SEXT16 GPR:$src)>;
+def : Pat<(sext_inreg GPR:$src, i8),  (SEXT8 GPR:$src)>;
+
+// Call
+def : Pat<(MBlazeJmpLink (i32 tglobaladdr:$dst)),
+          (BRLID (i32 R15), tglobaladdr:$dst)>;
+
+def : Pat<(MBlazeJmpLink (i32 texternalsym:$dst)),
+          (BRLID (i32 R15), texternalsym:$dst)>;
+
+def : Pat<(MBlazeJmpLink GPR:$dst),
+          (BRALD (i32 R15), GPR:$dst)>;
+
+// Shift Instructions
+def : Pat<(shl GPR:$L, GPR:$R), (ShiftL GPR:$L, GPR:$R)>;
+def : Pat<(sra GPR:$L, GPR:$R), (ShiftRA GPR:$L, GPR:$R)>;
+def : Pat<(srl GPR:$L, GPR:$R), (ShiftRL GPR:$L, GPR:$R)>;
+
+// SET_CC operations
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETEQ),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 1)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETNE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 2)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETGT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 3)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETLT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 4)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETGE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 5)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETLE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 6)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETUGT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU (i32 R0), GPR:$L), 3)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETULT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU (i32 R0), GPR:$L), 4)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETUGE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU (i32 R0), GPR:$L), 5)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 0), SETULE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU (i32 R0), GPR:$L), 6)>;
+
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETEQ),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 1)>;
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETNE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 2)>;
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETGT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 3)>;
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETLT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 4)>;
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETGE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 5)>;
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETLE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 6)>;
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETUGT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU GPR:$R, (i32 R0)), 3)>;
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETULT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU GPR:$R, (i32 R0)), 4)>;
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETUGE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU GPR:$R, (i32 R0)), 5)>;
+def : Pat<(setcc (i32 0), (i32 GPR:$R), SETULE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU GPR:$R, (i32 R0)), 6)>;
+
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETEQ),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMP GPR:$R, GPR:$L), 1)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETNE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMP GPR:$R, GPR:$L), 2)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETGT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMP GPR:$R, GPR:$L), 3)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETLT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMP GPR:$R, GPR:$L), 4)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETGE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMP GPR:$R, GPR:$L), 5)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETLE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMP GPR:$R, GPR:$L), 6)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETUGT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU GPR:$R, GPR:$L), 3)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETULT),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU GPR:$R, GPR:$L), 4)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETUGE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU GPR:$R, GPR:$L), 5)>;
+def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETULE),
+          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
+                     (CMPU GPR:$R, GPR:$L), 6)>;
+
+// SELECT operations
+def : Pat<(select (i32 GPR:$C), (i32 GPR:$T), (i32 GPR:$F)),
+          (Select_CC GPR:$T, GPR:$F, GPR:$C, 2)>;
+
+// SELECT_CC
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETEQ),
+          (Select_CC GPR:$T, GPR:$F, GPR:$L, 1)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETNE),
+          (Select_CC GPR:$T, GPR:$F, GPR:$L, 2)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETGT),
+          (Select_CC GPR:$T, GPR:$F, GPR:$L, 3)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETLT),
+          (Select_CC GPR:$T, GPR:$F, GPR:$L, 4)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETGE),
+          (Select_CC GPR:$T, GPR:$F, GPR:$L, 5)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETLE),
+          (Select_CC GPR:$T, GPR:$F, GPR:$L, 6)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETUGT),
+          (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 3)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETULT),
+          (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 4)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETUGE),
+          (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 5)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 0),
+                    (i32 GPR:$T), (i32 GPR:$F), SETULE),
+          (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 6)>;
+
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETEQ),
+          (Select_CC GPR:$T, GPR:$F, GPR:$R, 1)>;
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETNE),
+          (Select_CC GPR:$T, GPR:$F, GPR:$R, 2)>;
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETGT),
+          (Select_CC GPR:$T, GPR:$F, GPR:$R, 3)>;
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETLT),
+          (Select_CC GPR:$T, GPR:$F, GPR:$R, 4)>;
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETGE),
+          (Select_CC GPR:$T, GPR:$F, GPR:$R, 5)>;
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETLE),
+          (Select_CC GPR:$T, GPR:$F, GPR:$R, 6)>;
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETUGT),
+          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 3)>;
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETULT),
+          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 4)>;
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETUGE),
+          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 5)>;
+def : Pat<(selectcc (i32 0), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETULE),
+          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 6)>;
+
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETEQ),
+          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 1)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETNE),
+          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 2)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETGT),
+          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 3)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETLT),
+          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 4)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETGE),
+          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 5)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETLE),
+          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 6)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETUGT),
+          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, GPR:$L), 3)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETULT),
+          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, GPR:$L), 4)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETUGE),
+          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, GPR:$L), 5)>;
+def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
+                    (i32 GPR:$T), (i32 GPR:$F), SETULE),
+          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, GPR:$L), 6)>;
+
+// Ret instructions
+def : Pat<(MBlazeRet GPR:$target), (RTSD GPR:$target, 0x8)>;
+def : Pat<(MBlazeIRet GPR:$target), (RTID GPR:$target, 0x0)>;
+
+// BR instructions
+def : Pat<(br bb:$T), (BRID bb:$T)>;
+def : Pat<(brind GPR:$T), (BRAD GPR:$T)>;
+
+// BRCOND instructions
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETEQ), bb:$T),
+          (BEQID GPR:$L, bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETNE), bb:$T),
+          (BNEID GPR:$L, bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETGT), bb:$T),
+          (BGTID GPR:$L, bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETLT), bb:$T),
+          (BLTID GPR:$L, bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETGE), bb:$T),
+          (BGEID GPR:$L, bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETLE), bb:$T),
+          (BLEID GPR:$L, bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETUGT), bb:$T),
+          (BGTID (CMPU (i32 R0), GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETULT), bb:$T),
+          (BLTID (CMPU (i32 R0), GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETUGE), bb:$T),
+          (BGEID (CMPU (i32 R0), GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETULE), bb:$T),
+          (BLEID (CMPU (i32 R0), GPR:$L), bb:$T)>;
+
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETEQ), bb:$T),
+          (BEQID GPR:$R, bb:$T)>;
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETNE), bb:$T),
+          (BNEID GPR:$R, bb:$T)>;
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETGT), bb:$T),
+          (BGTID GPR:$R, bb:$T)>;
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETLT), bb:$T),
+          (BLTID GPR:$R, bb:$T)>;
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETGE), bb:$T),
+          (BGEID GPR:$R, bb:$T)>;
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETLE), bb:$T),
+          (BLEID GPR:$R, bb:$T)>;
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETUGT), bb:$T),
+          (BGTID (CMPU GPR:$R, (i32 R0)), bb:$T)>;
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETULT), bb:$T),
+          (BLTID (CMPU GPR:$R, (i32 R0)), bb:$T)>;
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETUGE), bb:$T),
+          (BGEID (CMPU GPR:$R, (i32 R0)), bb:$T)>;
+def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETULE), bb:$T),
+          (BLEID (CMPU GPR:$R, (i32 R0)), bb:$T)>;
+
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETEQ), bb:$T),
+          (BEQID (CMP GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETNE), bb:$T),
+          (BNEID (CMP GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETGT), bb:$T),
+          (BGTID (CMP GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETLT), bb:$T),
+          (BLTID (CMP GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETGE), bb:$T),
+          (BGEID (CMP GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETLE), bb:$T),
+          (BLEID (CMP GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETUGT), bb:$T),
+          (BGTID (CMPU GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETULT), bb:$T),
+          (BLTID (CMPU GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETUGE), bb:$T),
+          (BGEID (CMPU GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETULE), bb:$T),
+          (BLEID (CMPU GPR:$R, GPR:$L), bb:$T)>;
+def : Pat<(brcond (i32 GPR:$C), bb:$T),
+          (BNEID GPR:$C, bb:$T)>;
+
+// Jump tables, global addresses, and constant pools
+def : Pat<(MBWrapper tglobaladdr:$in), (ORI (i32 R0), tglobaladdr:$in)>;
+def : Pat<(MBWrapper tjumptable:$in),  (ORI (i32 R0), tjumptable:$in)>;
+def : Pat<(MBWrapper tconstpool:$in),  (ORI (i32 R0), tconstpool:$in)>;
+
+// Misc instructions
+def : Pat<(and (i32 GPR:$lh), (not (i32 GPR:$rh))),(ANDN GPR:$lh, GPR:$rh)>;
+
+// Convert any extend loads into zero extend loads
+def : Pat<(extloadi8  iaddr:$src), (i32 (LBUI iaddr:$src))>;
+def : Pat<(extloadi16 iaddr:$src), (i32 (LHUI iaddr:$src))>;
+def : Pat<(extloadi8  xaddr:$src), (i32 (LBU xaddr:$src))>;
+def : Pat<(extloadi16 xaddr:$src), (i32 (LHU xaddr:$src))>;
+
+// 32-bit load and store
+def : Pat<(store (i32 GPR:$dst), xaddr:$addr), (SW GPR:$dst, xaddr:$addr)>;
+def : Pat<(load xaddr:$addr), (i32 (LW xaddr:$addr))>;
+
+// 16-bit load and store
+def : Pat<(truncstorei16 (i32 GPR:$dst), xaddr:$ad), (SH GPR:$dst, xaddr:$ad)>;
+def : Pat<(zextloadi16 xaddr:$addr), (i32 (LHU xaddr:$addr))>;
+
+// 8-bit load and store
+def : Pat<(truncstorei8 (i32 GPR:$dst), xaddr:$ad), (SB GPR:$dst, xaddr:$ad)>;
+def : Pat<(zextloadi8 xaddr:$addr), (i32 (LBU xaddr:$addr))>;
+
+// Peepholes
+def : Pat<(store (i32 0), iaddr:$dst), (SWI (i32 R0), iaddr:$dst)>;
+
+// Atomic fence
+def : Pat<(atomic_fence (imm), (imm)), (MEMBARRIER)>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//===----------------------------------------------------------------------===//
+include "MBlazeInstrFSL.td"
+include "MBlazeInstrFPU.td"
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
new file mode 100644
index 000000000000..8d262a01e706
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
@@ -0,0 +1,112 @@
+//===-- MBlazeIntrinsicInfo.cpp - Intrinsic Information -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MBlaze implementation of TargetIntrinsicInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeIntrinsicInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+
+using namespace llvm;
+
+namespace mblazeIntrinsic {
+
+  enum ID {
+    last_non_mblaze_intrinsic = Intrinsic::num_intrinsics-1,
+#define GET_INTRINSIC_ENUM_VALUES
+#include "MBlazeGenIntrinsics.inc"
+#undef GET_INTRINSIC_ENUM_VALUES
+    , num_mblaze_intrinsics
+  };
+
+#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+#include "MBlazeGenIntrinsics.inc"
+#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+}
+
+std::string MBlazeIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+                                         unsigned numTys) const {
+  static const char *const names[] = {
+#define GET_INTRINSIC_NAME_TABLE
+#include "MBlazeGenIntrinsics.inc"
+#undef GET_INTRINSIC_NAME_TABLE
+  };
+
+  assert(!isOverloaded(IntrID) && "MBlaze intrinsics are not overloaded");
+  if (IntrID < Intrinsic::num_intrinsics)
+    return 0;
+  assert(IntrID < mblazeIntrinsic::num_mblaze_intrinsics &&
+         "Invalid intrinsic ID");
+
+  std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
+  return Result;
+}
+
+unsigned MBlazeIntrinsicInfo::
+lookupName(const char *Name, unsigned Len) const {
+  if (Len < 5 || Name[4] != '.' || Name[0] != 'l' || Name[1] != 'l'
+      || Name[2] != 'v' || Name[3] != 'm')
+    return 0;  // All intrinsics start with 'llvm.'
+
+#define GET_FUNCTION_RECOGNIZER
+#include "MBlazeGenIntrinsics.inc"
+#undef GET_FUNCTION_RECOGNIZER
+  return 0;
+}
+
+unsigned MBlazeIntrinsicInfo::
+lookupGCCName(const char *Name) const {
+    return mblazeIntrinsic::getIntrinsicForGCCBuiltin("mblaze",Name);
+}
+
+bool MBlazeIntrinsicInfo::isOverloaded(unsigned IntrID) const {
+  if (IntrID == 0)
+    return false;
+
+  unsigned id = IntrID - Intrinsic::num_intrinsics + 1;
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "MBlazeGenIntrinsics.inc"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+}
+
+/// This defines the "getAttributes(LLVMContext &C, ID id)" method.
+#define GET_INTRINSIC_ATTRIBUTES
+#include "MBlazeGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+
+static FunctionType *getType(LLVMContext &Context, unsigned id) {
+  Type *ResultTy = NULL;
+  SmallVector<Type*, 8> ArgTys;
+  bool IsVarArg = false;
+
+#define GET_INTRINSIC_GENERATOR
+#include "MBlazeGenIntrinsics.inc"
+#undef GET_INTRINSIC_GENERATOR
+
+  return FunctionType::get(ResultTy, ArgTys, IsVarArg);
+}
+
+Function *MBlazeIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                                Type **Tys,
+                                                unsigned numTy) const {
+  assert(!isOverloaded(IntrID) && "MBlaze intrinsics are not overloaded");
+  AttributeSet AList = getAttributes(M->getContext(),
+                                    (mblazeIntrinsic::ID) IntrID);
+  return cast<Function>(M->getOrInsertFunction(getName(IntrID),
+                                               getType(M->getContext(), IntrID),
+                                               AList));
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeIntrinsicInfo.h b/contrib/llvm/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
new file mode 100644
index 000000000000..34f379230def
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
@@ -0,0 +1,33 @@
+//===-- MBlazeIntrinsicInfo.h - MBlaze Intrinsic Information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MBlaze implementation of TargetIntrinsicInfo.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MBLAZEINTRINSICS_H
+#define MBLAZEINTRINSICS_H
+
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+namespace llvm {
+
+  class MBlazeIntrinsicInfo : public TargetIntrinsicInfo {
+  public:
+    std::string getName(unsigned IntrID, Type **Tys = 0,
+                        unsigned numTys = 0) const;
+    unsigned lookupName(const char *Name, unsigned Len) const;
+    unsigned lookupGCCName(const char *Name) const;
+    bool isOverloaded(unsigned IID) const;
+    Function *getDeclaration(Module *M, unsigned ID, Type **Tys = 0,
+                             unsigned numTys = 0) const;
+  };
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeIntrinsics.td b/contrib/llvm/lib/Target/MBlaze/MBlazeIntrinsics.td
new file mode 100644
index 000000000000..b5dc59547bbf
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeIntrinsics.td
@@ -0,0 +1,131 @@
+//===-- IntrinsicsMBlaze.td - Defines MBlaze intrinsics ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the MicroBlaze-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Definitions for all MBlaze intrinsics.
+//
+
+// MBlaze intrinsic classes.
+let TargetPrefix = "mblaze", isTarget = 1 in {
+  class MBFSL_Get_Intrinsic : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
+
+  class MBFSL_Put_Intrinsic : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], []>;
+
+  class MBFSL_PutT_Intrinsic : Intrinsic<[], [llvm_i32_ty], []>;
+}
+
+//===----------------------------------------------------------------------===//
+// MicroBlaze FSL Get Intrinsic Definitions.
+//
+
+def int_mblaze_fsl_get      : GCCBuiltin<"__builtin_mblaze_fsl_get">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_aget     : GCCBuiltin<"__builtin_mblaze_fsl_aget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_cget     : GCCBuiltin<"__builtin_mblaze_fsl_cget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_caget    : GCCBuiltin<"__builtin_mblaze_fsl_caget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_eget     : GCCBuiltin<"__builtin_mblaze_fsl_eget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_eaget    : GCCBuiltin<"__builtin_mblaze_fsl_eaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_ecget    : GCCBuiltin<"__builtin_mblaze_fsl_ecget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_ecaget   : GCCBuiltin<"__builtin_mblaze_fsl_ecaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_nget     : GCCBuiltin<"__builtin_mblaze_fsl_nget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_naget    : GCCBuiltin<"__builtin_mblaze_fsl_naget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_ncget    : GCCBuiltin<"__builtin_mblaze_fsl_ncget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_ncaget   : GCCBuiltin<"__builtin_mblaze_fsl_ncaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_neget    : GCCBuiltin<"__builtin_mblaze_fsl_neget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_neaget   : GCCBuiltin<"__builtin_mblaze_fsl_neaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_necget   : GCCBuiltin<"__builtin_mblaze_fsl_necget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_necaget  : GCCBuiltin<"__builtin_mblaze_fsl_necaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tget     : GCCBuiltin<"__builtin_mblaze_fsl_tget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_taget    : GCCBuiltin<"__builtin_mblaze_fsl_taget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tcget    : GCCBuiltin<"__builtin_mblaze_fsl_tcget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tcaget   : GCCBuiltin<"__builtin_mblaze_fsl_tcaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_teget    : GCCBuiltin<"__builtin_mblaze_fsl_teget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_teaget   : GCCBuiltin<"__builtin_mblaze_fsl_teaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tecget   : GCCBuiltin<"__builtin_mblaze_fsl_tecget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tecaget  : GCCBuiltin<"__builtin_mblaze_fsl_tecaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tnget    : GCCBuiltin<"__builtin_mblaze_fsl_tnget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tnaget   : GCCBuiltin<"__builtin_mblaze_fsl_tnaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tncget   : GCCBuiltin<"__builtin_mblaze_fsl_tncget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tncaget  : GCCBuiltin<"__builtin_mblaze_fsl_tncaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tneget   : GCCBuiltin<"__builtin_mblaze_fsl_tneget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tneaget  : GCCBuiltin<"__builtin_mblaze_fsl_tneaget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tnecget  : GCCBuiltin<"__builtin_mblaze_fsl_tnecget">,
+                              MBFSL_Get_Intrinsic;
+def int_mblaze_fsl_tnecaget : GCCBuiltin<"__builtin_mblaze_fsl_tnecaget">,
+                              MBFSL_Get_Intrinsic;
+
+//===----------------------------------------------------------------------===//
+// MicroBlaze FSL Put Intrinsic Definitions.
+//
+
+def int_mblaze_fsl_put     : GCCBuiltin<"__builtin_mblaze_fsl_put">,
+                             MBFSL_Put_Intrinsic;
+def int_mblaze_fsl_aput    : GCCBuiltin<"__builtin_mblaze_fsl_aput">,
+                             MBFSL_Put_Intrinsic;
+def int_mblaze_fsl_cput    : GCCBuiltin<"__builtin_mblaze_fsl_cput">,
+                             MBFSL_Put_Intrinsic;
+def int_mblaze_fsl_caput   : GCCBuiltin<"__builtin_mblaze_fsl_caput">,
+                             MBFSL_Put_Intrinsic;
+def int_mblaze_fsl_nput    : GCCBuiltin<"__builtin_mblaze_fsl_nput">,
+                             MBFSL_Put_Intrinsic;
+def int_mblaze_fsl_naput   : GCCBuiltin<"__builtin_mblaze_fsl_naput">,
+                             MBFSL_Put_Intrinsic;
+def int_mblaze_fsl_ncput   : GCCBuiltin<"__builtin_mblaze_fsl_ncput">,
+                             MBFSL_Put_Intrinsic;
+def int_mblaze_fsl_ncaput  : GCCBuiltin<"__builtin_mblaze_fsl_ncaput">,
+                             MBFSL_Put_Intrinsic;
+def int_mblaze_fsl_tput    : GCCBuiltin<"__builtin_mblaze_fsl_tput">,
+                             MBFSL_PutT_Intrinsic;
+def int_mblaze_fsl_taput   : GCCBuiltin<"__builtin_mblaze_fsl_taput">,
+                             MBFSL_PutT_Intrinsic;
+def int_mblaze_fsl_tcput   : GCCBuiltin<"__builtin_mblaze_fsl_tcput">,
+                             MBFSL_PutT_Intrinsic;
+def int_mblaze_fsl_tcaput  : GCCBuiltin<"__builtin_mblaze_fsl_tcaput">,
+                             MBFSL_PutT_Intrinsic;
+def int_mblaze_fsl_tnput   : GCCBuiltin<"__builtin_mblaze_fsl_tnput">,
+                             MBFSL_PutT_Intrinsic;
+def int_mblaze_fsl_tnaput  : GCCBuiltin<"__builtin_mblaze_fsl_tnaput">,
+                             MBFSL_PutT_Intrinsic;
+def int_mblaze_fsl_tncput  : GCCBuiltin<"__builtin_mblaze_fsl_tncput">,
+                             MBFSL_PutT_Intrinsic;
+def int_mblaze_fsl_tncaput : GCCBuiltin<"__builtin_mblaze_fsl_tncaput">,
+                             MBFSL_PutT_Intrinsic;
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeMCInstLower.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeMCInstLower.cpp
new file mode 100644
index 000000000000..ad414ac40fd7
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeMCInstLower.cpp
@@ -0,0 +1,167 @@
+//===-- MBlazeMCInstLower.cpp - Convert MBlaze MachineInstr to an MCInst---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower MBlaze MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeMCInstLower.h"
+#include "MBlazeInstrInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+using namespace llvm;
+
+MCSymbol *MBlazeMCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0:  break;
+  }
+
+  return Printer.Mang->getSymbol(MO.getGlobal());
+}
+
+MCSymbol *MBlazeMCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0:  break;
+  }
+
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCSymbol *MBlazeMCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0:  break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *MBlazeMCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default:
+      llvm_unreachable("Unknown target flag on GV operand");
+
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *MBlazeMCInstLower::
+GetBlockAddressSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+}
+
+MCOperand MBlazeMCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+
+  switch (MO.getTargetFlags()) {
+  default:
+      llvm_unreachable("Unknown target flag on GV operand");
+
+  case 0: break;
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+void MBlazeMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default: llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                         MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO, GetBlockAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_FPImmediate: {
+      bool ignored;
+      APFloat FVal = MO.getFPImm()->getValueAPF();
+      FVal.convert(APFloat::IEEEsingle, APFloat::rmTowardZero, &ignored);
+
+      APInt IVal = FVal.bitcastToAPInt();
+      uint64_t Val = *IVal.getRawData();
+      MCOp = MCOperand::CreateImm(Val);
+      break;
+    }
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeMCInstLower.h b/contrib/llvm/lib/Target/MBlaze/MBlazeMCInstLower.h
new file mode 100644
index 000000000000..8ab2c9a3a633
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeMCInstLower.h
@@ -0,0 +1,47 @@
+//===-- MBlazeMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZE_MCINSTLOWER_H
+#define MBLAZE_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class AsmPrinter;
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+
+  /// MBlazeMCInstLower - This class is used to lower an MachineInstr
+  /// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY MBlazeMCInstLower {
+  MCContext &Ctx;
+
+  AsmPrinter &Printer;
+public:
+  MBlazeMCInstLower(MCContext &ctx, AsmPrinter &printer)
+    : Ctx(ctx), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeMachineFunction.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeMachineFunction.cpp
new file mode 100644
index 000000000000..2217b5477d6b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeMachineFunction.cpp
@@ -0,0 +1,14 @@
+//===-- MBlazeMachineFunctionInfo.cpp - Private data ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeMachineFunction.h"
+
+using namespace llvm;
+
+void MBlazeFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeMachineFunction.h b/contrib/llvm/lib/Target/MBlaze/MBlazeMachineFunction.h
new file mode 100644
index 000000000000..10d507f37bbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeMachineFunction.h
@@ -0,0 +1,169 @@
+//===-- MBlazeMachineFunctionInfo.h - Private data --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MBlaze specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZE_MACHINE_FUNCTION_INFO_H
+#define MBLAZE_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// MBlazeFunctionInfo - This class is derived from MachineFunction private
+/// MBlaze target-specific information for each MachineFunction.
+class MBlazeFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// Holds for each function where on the stack the Frame Pointer must be
+  /// saved. This is used on Prologue and Epilogue to emit FP save/restore
+  int FPStackOffset;
+
+  /// Holds for each function where on the stack the Return Address must be
+  /// saved. This is used on Prologue and Epilogue to emit RA save/restore
+  int RAStackOffset;
+
+  /// MBlazeFIHolder - Holds a FrameIndex and it's Stack Pointer Offset
+  struct MBlazeFIHolder {
+
+    int FI;
+    int SPOffset;
+
+    MBlazeFIHolder(int FrameIndex, int StackPointerOffset)
+      : FI(FrameIndex), SPOffset(StackPointerOffset) {}
+  };
+
+  /// When PIC is used the GP must be saved on the stack on the function
+  /// prologue and must be reloaded from this stack location after every
+  /// call. A reference to its stack location and frame index must be kept
+  /// to be used on emitPrologue and processFunctionBeforeFrameFinalized.
+  MBlazeFIHolder GPHolder;
+
+  /// On LowerFormalArguments the stack size is unknown, so the Stack
+  /// Pointer Offset calculation of "not in register arguments" must be
+  /// postponed to emitPrologue.
+  SmallVector<MBlazeFIHolder, 16> FnLoadArgs;
+  bool HasLoadArgs;
+
+  // When VarArgs, we must write registers back to caller stack, preserving
+  // on register arguments. Since the stack size is unknown on
+  // LowerFormalArguments, the Stack Pointer Offset calculation must be
+  // postponed to emitPrologue.
+  SmallVector<MBlazeFIHolder, 4> FnStoreVarArgs;
+  bool HasStoreVarArgs;
+
+  // When determining the final stack layout some of the frame indexes may
+  // be replaced by new frame indexes that reside in the caller's stack
+  // frame. The replacements are recorded in this structure.
+  DenseMap<int,int> FIReplacements;
+
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  // VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// LiveInFI - keeps track of the frame indexes in a callers stack
+  /// frame that are live into a function.
+  SmallVector<int, 16> LiveInFI;
+
+public:
+  MBlazeFunctionInfo(MachineFunction& MF)
+  : FPStackOffset(0), RAStackOffset(0), GPHolder(-1,-1), HasLoadArgs(false),
+    HasStoreVarArgs(false), SRetReturnReg(0), GlobalBaseReg(0),
+    VarArgsFrameIndex(0), LiveInFI()
+  {}
+
+  int getFPStackOffset() const { return FPStackOffset; }
+  void setFPStackOffset(int Off) { FPStackOffset = Off; }
+
+  int getRAStackOffset() const { return RAStackOffset; }
+  void setRAStackOffset(int Off) { RAStackOffset = Off; }
+
+  int getGPStackOffset() const { return GPHolder.SPOffset; }
+  int getGPFI() const { return GPHolder.FI; }
+  void setGPStackOffset(int Off) { GPHolder.SPOffset = Off; }
+  void setGPFI(int FI) { GPHolder.FI = FI; }
+  bool needGPSaveRestore() const { return GPHolder.SPOffset != -1; }
+
+  bool hasLoadArgs() const { return HasLoadArgs; }
+  bool hasStoreVarArgs() const { return HasStoreVarArgs; }
+
+  void recordLiveIn(int FI) {
+    LiveInFI.push_back(FI);
+  }
+
+  bool isLiveIn(int FI) {
+    for (unsigned i = 0, e = LiveInFI.size(); i < e; ++i)
+      if (FI == LiveInFI[i]) return true;
+
+    return false;
+  }
+
+  const SmallVector<int, 16>& getLiveIn() const { return LiveInFI; }
+
+  void recordReplacement(int OFI, int NFI) {
+    FIReplacements.insert(std::make_pair(OFI,NFI));
+  }
+
+  bool hasReplacement(int OFI) const {
+    return FIReplacements.find(OFI) != FIReplacements.end();
+  }
+
+  int getReplacement(int OFI) const {
+    return FIReplacements.lookup(OFI);
+  }
+
+  void recordLoadArgsFI(int FI, int SPOffset) {
+    if (!HasLoadArgs) HasLoadArgs=true;
+    FnLoadArgs.push_back(MBlazeFIHolder(FI, SPOffset));
+  }
+
+  void recordStoreVarArgsFI(int FI, int SPOffset) {
+    if (!HasStoreVarArgs) HasStoreVarArgs=true;
+    FnStoreVarArgs.push_back(MBlazeFIHolder(FI, SPOffset));
+  }
+
+  void adjustLoadArgsFI(MachineFrameInfo *MFI) const {
+    if (!hasLoadArgs()) return;
+    for (unsigned i = 0, e = FnLoadArgs.size(); i != e; ++i)
+      MFI->setObjectOffset(FnLoadArgs[i].FI, FnLoadArgs[i].SPOffset);
+  }
+
+  void adjustStoreVarArgsFI(MachineFrameInfo *MFI) const {
+    if (!hasStoreVarArgs()) return;
+    for (unsigned i = 0, e = FnStoreVarArgs.size(); i != e; ++i)
+      MFI->setObjectOffset(FnStoreVarArgs[i].FI, FnStoreVarArgs[i].SPOffset);
+  }
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+};
+
+} // end of namespace llvm
+
+#endif // MBLAZE_MACHINE_FUNCTION_INFO_H
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
new file mode 100644
index 000000000000..bd83afc1cc83
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -0,0 +1,145 @@
+//===-- MBlazeRegisterInfo.cpp - MBlaze Register Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MBlaze implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mblaze-frame-info"
+
+#include "MBlazeRegisterInfo.h"
+#include "MBlaze.h"
+#include "MBlazeMachineFunction.h"
+#include "MBlazeSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "MBlazeGenRegisterInfo.inc"
+
+using namespace llvm;
+
+MBlazeRegisterInfo::
+MBlazeRegisterInfo(const MBlazeSubtarget &ST, const TargetInstrInfo &tii)
+  : MBlazeGenRegisterInfo(MBlaze::R15), Subtarget(ST), TII(tii) {}
+
+unsigned MBlazeRegisterInfo::getPICCallReg() {
+  return MBlaze::R20;
+}
+
+//===----------------------------------------------------------------------===//
+// Callee Saved Registers methods
+//===----------------------------------------------------------------------===//
+
+/// MBlaze Callee Saved Registers
+const uint16_t* MBlazeRegisterInfo::
+getCalleeSavedRegs(const MachineFunction *MF) const {
+  // MBlaze callee-save register range is R20 - R31
+  static const uint16_t CalleeSavedRegs[] = {
+    MBlaze::R20, MBlaze::R21, MBlaze::R22, MBlaze::R23,
+    MBlaze::R24, MBlaze::R25, MBlaze::R26, MBlaze::R27,
+    MBlaze::R28, MBlaze::R29, MBlaze::R30, MBlaze::R31,
+    0
+  };
+
+  return CalleeSavedRegs;
+}
+
+BitVector MBlazeRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(MBlaze::R0);
+  Reserved.set(MBlaze::R1);
+  Reserved.set(MBlaze::R2);
+  Reserved.set(MBlaze::R13);
+  Reserved.set(MBlaze::R14);
+  Reserved.set(MBlaze::R15);
+  Reserved.set(MBlaze::R16);
+  Reserved.set(MBlaze::R17);
+  Reserved.set(MBlaze::R18);
+  Reserved.set(MBlaze::R19);
+  return Reserved;
+}
+
+// FrameIndex represent objects inside a abstract stack.
+// We must replace FrameIndex with an stack/frame pointer
+// direct reference.
+void MBlazeRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                    unsigned FIOperandNum, RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned OFIOperandNum = FIOperandNum == 2 ? 1 : 2;
+
+  DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n";
+        dbgs() << "<--------->\n" << MI);
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  int stackSize  = MFI->getStackSize();
+  int spOffset   = MFI->getObjectOffset(FrameIndex);
+
+  DEBUG(MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
+        dbgs() << "FrameIndex : " << FrameIndex << "\n"
+               << "spOffset   : " << spOffset << "\n"
+               << "stackSize  : " << stackSize << "\n"
+               << "isFixed    : " << MFI->isFixedObjectIndex(FrameIndex) << "\n"
+               << "isLiveIn   : " << MBlazeFI->isLiveIn(FrameIndex) << "\n"
+               << "isSpill    : " << MFI->isSpillSlotObjectIndex(FrameIndex)
+               << "\n" );
+
+  // as explained on LowerFormalArguments, detect negative offsets
+  // and adjust SPOffsets considering the final stack size.
+  int Offset = (spOffset < 0) ? (stackSize - spOffset) : spOffset;
+  Offset += MI.getOperand(OFIOperandNum).getImm();
+
+  DEBUG(dbgs() << "Offset     : " << Offset << "\n" << "<--------->\n");
+
+  MI.getOperand(OFIOperandNum).ChangeToImmediate(Offset);
+  MI.getOperand(FIOperandNum).ChangeToRegister(getFrameRegister(MF), false);
+}
+
+void MBlazeRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *) const {
+  // Set the stack offset where GP must be saved/loaded from.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
+  if (MBlazeFI->needGPSaveRestore())
+    MFI->setObjectOffset(MBlazeFI->getGPFI(), MBlazeFI->getGPStackOffset());
+}
+
+unsigned MBlazeRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  return TFI->hasFP(MF) ? MBlaze::R19 : MBlaze::R1;
+}
+
+unsigned MBlazeRegisterInfo::getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+}
+
+unsigned MBlazeRegisterInfo::getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeRegisterInfo.h b/contrib/llvm/lib/Target/MBlaze/MBlazeRegisterInfo.h
new file mode 100644
index 000000000000..497f3866c9ca
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeRegisterInfo.h
@@ -0,0 +1,71 @@
+//===-- MBlazeRegisterInfo.h - MBlaze Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MBlaze implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZEREGISTERINFO_H
+#define MBLAZEREGISTERINFO_H
+
+#include "MBlaze.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "MBlazeGenRegisterInfo.inc"
+
+namespace llvm {
+class MBlazeSubtarget;
+class TargetInstrInfo;
+class Type;
+
+namespace MBlaze {
+  /// SubregIndex - The index of various sized subregister classes. Note that
+  /// these indices must be kept in sync with the class indices in the
+  /// MBlazeRegisterInfo.td file.
+  enum SubregIndex {
+    SUBREG_FPEVEN = 1, SUBREG_FPODD = 2
+  };
+}
+
+struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
+  const MBlazeSubtarget &Subtarget;
+  const TargetInstrInfo &TII;
+
+  MBlazeRegisterInfo(const MBlazeSubtarget &Subtarget,
+                     const TargetInstrInfo &tii);
+
+  /// Get PIC indirect call register
+  static unsigned getPICCallReg();
+
+  /// Code Generation virtual methods...
+  const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  /// Stack Frame Processing Methods
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                           RegScavenger *RS = NULL) const;
+
+  /// Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  /// Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeRegisterInfo.td b/contrib/llvm/lib/Target/MBlaze/MBlazeRegisterInfo.td
new file mode 100644
index 000000000000..64cae5cff85d
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeRegisterInfo.td
@@ -0,0 +1,148 @@
+//===-- MBlazeRegisterInfo.td - MBlaze Register defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MicroBlaze register file
+//===----------------------------------------------------------------------===//
+
+// We have banks of 32 registers each.
+class MBlazeReg<string n> : Register<n> {
+  field bits<5> Num;
+  let Namespace = "MBlaze";
+}
+
+// Special purpose registers have 15-bit values
+class MBlazeSReg<string n> : Register<n> {
+  field bits<15> Num;
+  let Namespace = "MBlaze";
+}
+
+// MBlaze general purpose registers
+class MBlazeGPRReg<bits<5> num, string n> : MBlazeReg<n> {
+  let Num = num;
+}
+
+// MBlaze special purpose registers
+class MBlazeSPRReg<bits<15> num, string n> : MBlazeSReg<n> {
+  let Num = num;
+}
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+let Namespace = "MBlaze" in {
+  // General Purpose Registers
+  def R0  : MBlazeGPRReg< 0,  "r0">,   DwarfRegNum<[0]>;
+  def R1  : MBlazeGPRReg< 1,  "r1">,   DwarfRegNum<[1]>;
+  def R2  : MBlazeGPRReg< 2,  "r2">,   DwarfRegNum<[2]>;
+  def R3  : MBlazeGPRReg< 3,  "r3">,   DwarfRegNum<[3]>;
+  def R4  : MBlazeGPRReg< 4,  "r4">,   DwarfRegNum<[4]>;
+  def R5  : MBlazeGPRReg< 5,  "r5">,   DwarfRegNum<[5]>;
+  def R6  : MBlazeGPRReg< 6,  "r6">,   DwarfRegNum<[6]>;
+  def R7  : MBlazeGPRReg< 7,  "r7">,   DwarfRegNum<[7]>;
+  def R8  : MBlazeGPRReg< 8,  "r8">,   DwarfRegNum<[8]>;
+  def R9  : MBlazeGPRReg< 9,  "r9">,   DwarfRegNum<[9]>;
+  def R10 : MBlazeGPRReg< 10, "r10">,  DwarfRegNum<[10]>;
+  def R11 : MBlazeGPRReg< 11, "r11">,  DwarfRegNum<[11]>;
+  def R12 : MBlazeGPRReg< 12, "r12">,  DwarfRegNum<[12]>;
+  def R13 : MBlazeGPRReg< 13, "r13">,  DwarfRegNum<[13]>;
+  def R14 : MBlazeGPRReg< 14, "r14">,  DwarfRegNum<[14]>;
+  def R15 : MBlazeGPRReg< 15, "r15">,  DwarfRegNum<[15]>;
+  def R16 : MBlazeGPRReg< 16, "r16">,  DwarfRegNum<[16]>;
+  def R17 : MBlazeGPRReg< 17, "r17">,  DwarfRegNum<[17]>;
+  def R18 : MBlazeGPRReg< 18, "r18">,  DwarfRegNum<[18]>;
+  def R19 : MBlazeGPRReg< 19, "r19">,  DwarfRegNum<[19]>;
+  def R20 : MBlazeGPRReg< 20, "r20">,  DwarfRegNum<[20]>;
+  def R21 : MBlazeGPRReg< 21, "r21">,  DwarfRegNum<[21]>;
+  def R22 : MBlazeGPRReg< 22, "r22">,  DwarfRegNum<[22]>;
+  def R23 : MBlazeGPRReg< 23, "r23">,  DwarfRegNum<[23]>;
+  def R24 : MBlazeGPRReg< 24, "r24">,  DwarfRegNum<[24]>;
+  def R25 : MBlazeGPRReg< 25, "r25">,  DwarfRegNum<[25]>;
+  def R26 : MBlazeGPRReg< 26, "r26">,  DwarfRegNum<[26]>;
+  def R27 : MBlazeGPRReg< 27, "r27">,  DwarfRegNum<[27]>;
+  def R28 : MBlazeGPRReg< 28, "r28">,  DwarfRegNum<[28]>;
+  def R29 : MBlazeGPRReg< 29, "r29">,  DwarfRegNum<[29]>;
+  def R30 : MBlazeGPRReg< 30, "r30">,  DwarfRegNum<[30]>;
+  def R31 : MBlazeGPRReg< 31, "r31">,  DwarfRegNum<[31]>;
+
+  // Special Purpose Registers
+  def RPC    : MBlazeSPRReg<0x0000, "rpc">,    DwarfRegNum<[32]>;
+  def RMSR   : MBlazeSPRReg<0x0001, "rmsr">,   DwarfRegNum<[33]>;
+  def REAR   : MBlazeSPRReg<0x0003, "rear">,   DwarfRegNum<[34]>;
+  def RESR   : MBlazeSPRReg<0x0005, "resr">,   DwarfRegNum<[35]>;
+  def RFSR   : MBlazeSPRReg<0x0007, "rfsr">,   DwarfRegNum<[36]>;
+  def RBTR   : MBlazeSPRReg<0x000B, "rbtr">,   DwarfRegNum<[37]>;
+  def REDR   : MBlazeSPRReg<0x000D, "redr">,   DwarfRegNum<[38]>;
+  def RPID   : MBlazeSPRReg<0x1000, "rpid">,   DwarfRegNum<[39]>;
+  def RZPR   : MBlazeSPRReg<0x1001, "rzpr">,   DwarfRegNum<[40]>;
+  def RTLBX  : MBlazeSPRReg<0x1002, "rtlbx">,  DwarfRegNum<[41]>;
+  def RTLBLO : MBlazeSPRReg<0x1003, "rtlblo">, DwarfRegNum<[42]>;
+  def RTLBHI : MBlazeSPRReg<0x1004, "rtlbhi">, DwarfRegNum<[43]>;
+  def RTLBSX : MBlazeSPRReg<0x1004, "rtlbsx">, DwarfRegNum<[44]>;
+  def RPVR0  : MBlazeSPRReg<0x2000, "rpvr0">,  DwarfRegNum<[45]>;
+  def RPVR1  : MBlazeSPRReg<0x2001, "rpvr1">,  DwarfRegNum<[46]>;
+  def RPVR2  : MBlazeSPRReg<0x2002, "rpvr2">,  DwarfRegNum<[47]>;
+  def RPVR3  : MBlazeSPRReg<0x2003, "rpvr3">,  DwarfRegNum<[48]>;
+  def RPVR4  : MBlazeSPRReg<0x2004, "rpvr4">,  DwarfRegNum<[49]>;
+  def RPVR5  : MBlazeSPRReg<0x2005, "rpvr5">,  DwarfRegNum<[50]>;
+  def RPVR6  : MBlazeSPRReg<0x2006, "rpvr6">,  DwarfRegNum<[51]>;
+  def RPVR7  : MBlazeSPRReg<0x2007, "rpvr7">,  DwarfRegNum<[52]>;
+  def RPVR8  : MBlazeSPRReg<0x2008, "rpvr8">,  DwarfRegNum<[53]>;
+  def RPVR9  : MBlazeSPRReg<0x2009, "rpvr9">,  DwarfRegNum<[54]>;
+  def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[55]>;
+  def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, DwarfRegNum<[56]>;
+
+  // The carry bit. In the Microblaze this is really bit 29 of the
+  // MSR register but this is the only bit of that register that we
+  // are interested in modeling.
+  def CARRY  : MBlazeSPRReg<0x0000, "rmsr[c]">;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+def GPR : RegisterClass<"MBlaze", [i32,f32], 32, (sequence "R%u", 0, 31)>;
+
+def SPR : RegisterClass<"MBlaze", [i32], 32, (add
+  // Reserved
+  RPC,
+  RMSR,
+  REAR,
+  RESR,
+  RFSR,
+  RBTR,
+  REDR,
+  RPID,
+  RZPR,
+  RTLBX,
+  RTLBLO,
+  RTLBHI,
+  RPVR0,
+  RPVR1,
+  RPVR2,
+  RPVR3,
+  RPVR4,
+  RPVR5,
+  RPVR6,
+  RPVR7,
+  RPVR8,
+  RPVR9,
+  RPVR10,
+  RPVR11
+  )>
+{
+  // None of the special purpose registers are allocatable.
+  let isAllocatable = 0;
+}
+
+def CRC : RegisterClass<"MBlaze", [i32], 32, (add CARRY)> {
+  let CopyCost = -1;
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeRelocations.h b/contrib/llvm/lib/Target/MBlaze/MBlazeRelocations.h
new file mode 100644
index 000000000000..6387ee23ec9b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeRelocations.h
@@ -0,0 +1,47 @@
+//===-- MBlazeRelocations.h - MBlaze Code Relocations -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MBlaze target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZERELOCATIONS_H
+#define MBLAZERELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace MBlaze {
+    enum RelocationType {
+      /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+      /// the value already in memory, after we adjust it for where the PC is.
+      reloc_pcrel_word = 0,
+
+      /// reloc_picrel_word - PIC base relative relocation, add the relocated
+      /// value to the value already in memory, after we adjust it for where the
+      /// PIC base is.
+      reloc_picrel_word = 1,
+
+      /// reloc_absolute_word - absolute relocation, just add the relocated
+      /// value to the value already in memory.
+      reloc_absolute_word = 2,
+
+      /// reloc_absolute_word_sext - absolute relocation, just add the relocated
+      /// value to the value already in memory. In object files, it represents a
+      /// value which must be sign-extended when resolving the relocation.
+      reloc_absolute_word_sext = 3,
+
+      /// reloc_absolute_dword - absolute relocation, just add the relocated
+      /// value to the value already in memory.
+      reloc_absolute_dword = 4
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeSchedule.td b/contrib/llvm/lib/Target/MBlaze/MBlazeSchedule.td
new file mode 100644
index 000000000000..cd5691ce644f
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeSchedule.td
@@ -0,0 +1,50 @@
+//===-- MBlazeSchedule.td - MBlaze Scheduling Definitions --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MBlaze functional units.
+//===----------------------------------------------------------------------===//
+def IF : FuncUnit;
+def ID : FuncUnit;
+def EX : FuncUnit;
+def MA : FuncUnit;
+def WB : FuncUnit;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for MBlaze
+//===----------------------------------------------------------------------===//
+def IIC_ALU    : InstrItinClass;
+def IIC_ALUm   : InstrItinClass;
+def IIC_ALUd   : InstrItinClass;
+def IIC_SHT    : InstrItinClass;
+def IIC_FSLg   : InstrItinClass;
+def IIC_FSLp   : InstrItinClass;
+def IIC_MEMs   : InstrItinClass;
+def IIC_MEMl   : InstrItinClass;
+def IIC_FPU    : InstrItinClass;
+def IIC_FPUd   : InstrItinClass;
+def IIC_FPUf   : InstrItinClass;
+def IIC_FPUi   : InstrItinClass;
+def IIC_FPUs   : InstrItinClass;
+def IIC_FPUc   : InstrItinClass;
+def IIC_BR     : InstrItinClass;
+def IIC_BRc    : InstrItinClass;
+def IIC_BRl    : InstrItinClass;
+def IIC_WDC    : InstrItinClass;
+def IIC_Pseudo : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// MBlaze instruction itineraries for three stage pipeline.
+//===----------------------------------------------------------------------===//
+include "MBlazeSchedule3.td"
+
+//===----------------------------------------------------------------------===//
+// MBlaze instruction itineraries for five stage pipeline.
+//===----------------------------------------------------------------------===//
+include "MBlazeSchedule5.td"
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeSchedule3.td b/contrib/llvm/lib/Target/MBlaze/MBlazeSchedule3.td
new file mode 100644
index 000000000000..20257a60a0fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeSchedule3.td
@@ -0,0 +1,236 @@
+//===-- MBlazeSchedule3.td - MBlaze Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MBlaze instruction itineraries for the three stage pipeline.
+//===----------------------------------------------------------------------===//
+def MBlazePipe3Itineraries : ProcessorItineraries<
+  [IF,ID,EX], [], [
+
+  // ALU instruction with one destination register and either two register
+  // source operands or one register source operand and one immediate operand.
+  // The instruction takes one cycle to execute in each of the stages. The
+  // two source operands are read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_ALU,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>], // one cycle in execute stage
+               [ 2                    // result ready after two cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // ALU multiply instruction with one destination register and either two
+  // register source operands or one register source operand and one immediate
+  // operand.  The instruction takes one cycle to execute in each of the
+  // pipeline stages except the execute stage, which takes three cycles. The
+  // two source operands are read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_ALUm,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<3,[EX]>], // three cycles in execute stage
+               [ 4                    // result ready after four cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // ALU divide instruction with one destination register two register source
+  // operands. The instruction takes one cycle to execute in each the pipeline
+  // stages except the execute stage, which takes 34 cycles. The two
+  // source operands are read during the decode stage and the result is ready
+  // after the execute stage.
+  InstrItinData< IIC_ALUd,
+               [ InstrStage<1,[IF]>    // one cycle in fetch stage
+               , InstrStage<1,[ID]>    // one cycle in decode stage
+               , InstrStage<34,[EX]>], // 34 cycles in execute stage
+               [ 35                    // result ready after 35 cycles
+               , 1                     // first operand read after one cycle
+               , 1 ]>,                 // second operand read after one cycle
+
+  // Shift instruction with one destination register and either two register
+  // source operands or one register source operand and one immediate operand.
+  // The instruction takes one cycle to execute in each of the pipeline stages
+  // except the execute stage, which takes two cycles.  The two source operands
+  // are read during the decode stage and the result is ready after the execute
+  // stage.
+  InstrItinData< IIC_SHT,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 3                    // result ready after three cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Branch instruction with one source operand register. The instruction takes
+  // one cycle to execute in each of the pipeline stages. The source operand is
+  // read during the decode stage.
+  InstrItinData< IIC_BR,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>], // one cycle in execute stage
+               [ 1 ]>,                // first operand read after one cycle
+
+  // Conditional branch instruction with two source operand registers. The
+  // instruction takes one cycle to execute in each of the pipeline stages. The
+  // two source operands are read during the decode stage.
+  InstrItinData< IIC_BRc,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>], // one cycle in execute stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Branch and link instruction with one destination register and one source
+  // operand register. The instruction takes one cycle to execute in each of
+  // the pipeline stages. The source operand is read during the decode stage
+  // and the destination register is ready after the execute stage.
+  InstrItinData< IIC_BRl,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>], // one cycle in execute stage
+               [ 2                    // result ready after two cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Cache control instruction with two source operand registers. The
+  // instruction takes one cycle to execute in each of the pipeline stages
+  // except the memory access stage, which takes two cycles. The source
+  // operands are read during the decode stage.
+  InstrItinData< IIC_WDC,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Floating point instruction with one destination register and two source
+  // operand registers. The instruction takes one cycle to execute in each of
+  // the pipeline stages except the execute stage, which takes six cycles. The
+  // source operands are read during the decode stage and the results are ready
+  // after the execute stage.
+  InstrItinData< IIC_FPU,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<6,[EX]>], // six cycles in execute stage
+               [ 7                    // result ready after seven cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Floating point divide instruction with one destination register and two
+  // source operand registers. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the execute stage, which takes 30
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the execute stage.
+  InstrItinData< IIC_FPUd,
+               [ InstrStage<1,[IF]>    // one cycle in fetch stage
+               , InstrStage<1,[ID]>    // one cycle in decode stage
+               , InstrStage<30,[EX]>], // one cycle in execute stage
+               [ 31                    // result ready after 31 cycles
+               , 1                     // first operand read after one cycle
+               , 1 ]>,                 // second operand read after one cycle
+
+  // Convert floating point to integer instruction with one destination
+  // register and one source operand register. The instruction takes one cycle
+  // to execute in each of the pipeline stages except the execute stage,
+  // which takes seven cycles. The source operands are read during the decode
+  // stage and the results are ready after the execute stage.
+  InstrItinData< IIC_FPUi,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<7,[EX]>], // seven cycles in execute stage
+               [ 8                    // result ready after eight cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Convert integer to floating point instruction with one destination
+  // register and one source operand register. The instruction takes one cycle
+  // to execute in each of the pipeline stages except the execute stage,
+  // which takes six cycles. The source operands are read during the decode
+  // stage and the results are ready after the execute stage.
+  InstrItinData< IIC_FPUf,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<6,[EX]>], // six cycles in execute stage
+               [ 7                    // result ready after seven cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Floating point square root instruction with one destination register and
+  // one source operand register. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the execute stage, which takes 29
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the execute stage.
+  InstrItinData< IIC_FPUs,
+               [ InstrStage<1,[IF]>    // one cycle in fetch stage
+               , InstrStage<1,[ID]>    // one cycle in decode stage
+               , InstrStage<29,[EX]>], // 29 cycles in execute stage
+               [ 30                    // result ready after 30 cycles
+               , 1 ]>,                 // first operand read after one cycle
+
+  // Floating point comparison instruction with one destination register and
+  // two source operand registers. The instruction takes one cycle to execute
+  // in each of the pipeline stages except the execute stage, which takes three
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the execute stage.
+  InstrItinData< IIC_FPUc,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<3,[EX]>], // three cycles in execute stage
+               [ 4                    // result ready after four cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // FSL get instruction with one register or immediate source operand and one
+  // destination register. The instruction takes one cycle to execute in each
+  // of the pipeline stages except the execute stage, which takes two cycles.
+  // The one source operand is read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_FSLg,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 3                    // result ready after two cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // FSL put instruction with either two register source operands or one
+  // register source operand and one immediate operand. There is no result
+  // produced by the instruction. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the execute stage, which takes two
+  // cycles. The two source operands are read during the decode stage.
+  InstrItinData< IIC_FSLp,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Memory store instruction with either three register source operands or two
+  // register source operands and one immediate operand. There is no result
+  // produced by the instruction. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the execute stage, which takes two
+  // cycles. All of the source operands are read during the decode stage.
+  InstrItinData< IIC_MEMs,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 1                    // first operand read after one cycle
+               , 1                    // second operand read after one cycle
+               , 1 ]>,                // third operand read after one cycle
+
+  // Memory load instruction with one destination register and either two
+  // register source operands or one register source operand and one immediate
+  // operand. The instruction takes one cycle to execute in each of the
+  // pipeline stages except the execute stage, which takes two cycles. All of
+  // the source operands are read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_MEMl,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<2,[EX]>], // two cycles in execute stage
+               [ 3                    // result ready after four cycles
+               , 1                    // second operand read after one cycle
+               , 1 ]>                 // third operand read after one cycle
+]>;
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeSchedule5.td b/contrib/llvm/lib/Target/MBlaze/MBlazeSchedule5.td
new file mode 100644
index 000000000000..ab53b424ded3
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeSchedule5.td
@@ -0,0 +1,267 @@
+//===-- MBlazeSchedule5.td - MBlaze Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MBlaze instruction itineraries for the five stage pipeline.
+//===----------------------------------------------------------------------===//
+def MBlazePipe5Itineraries : ProcessorItineraries<
+  [IF,ID,EX,MA,WB], [], [
+
+  // ALU instruction with one destination register and either two register
+  // source operands or one register source operand and one immediate operand.
+  // The instruction takes one cycle to execute in each of the stages. The
+  // two source operands are read during the decode stage and the result is
+  // ready after the execute stage.
+  InstrItinData< IIC_ALU,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 2                    // result ready after two cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // ALU multiply instruction with one destination register and either two
+  // register source operands or one register source operand and one immediate
+  // operand.  The instruction takes one cycle to execute in each of the
+  // pipeline stages. The two source operands are read during the decode stage
+  // and the result is ready after the execute stage.
+  InstrItinData< IIC_ALUm,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 2                    // result ready after two cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // ALU divide instruction with one destination register two register source
+  // operands. The instruction takes one cycle to execute in each the pipeline
+  // stages except the memory access stage, which takes 31 cycles. The two
+  // source operands are read during the decode stage and the result is ready
+  // after the memory access stage.
+  InstrItinData< IIC_ALUd,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<31,[MA]>  // 31 cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 33                   // result ready after 33 cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Shift instruction with one destination register and either two register
+  // source operands or one register source operand and one immediate operand.
+  // The instruction takes one cycle to execute in each of the pipeline stages.
+  // The two source operands are read during the decode stage and the result is
+  // ready after the memory access stage.
+  InstrItinData< IIC_SHT,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 3                    // result ready after three cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Branch instruction with one source operand register. The instruction takes
+  // one cycle to execute in each of the pipeline stages. The source operand is
+  // read during the decode stage.
+  InstrItinData< IIC_BR,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1 ]>,                // first operand read after one cycle
+
+  // Conditional branch instruction with two source operand registers. The
+  // instruction takes one cycle to execute in each of the pipeline stages. The
+  // two source operands are read during the decode stage.
+  InstrItinData< IIC_BRc,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Branch and link instruction with one destination register and one source
+  // operand register. The instruction takes one cycle to execute in each of
+  // the pipeline stages. The source operand is read during the decode stage
+  // and the destination register is ready after the writeback stage.
+  InstrItinData< IIC_BRl,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 4                    // result ready after four cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Cache control instruction with two source operand registers. The
+  // instruction takes one cycle to execute in each of the pipeline stages
+  // except the memory access stage, which takes two cycles. The source
+  // operands are read during the decode stage.
+  InstrItinData< IIC_WDC,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<2,[MA]>   // two cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Floating point instruction with one destination register and two source
+  // operand registers. The instruction takes one cycle to execute in each of
+  // the pipeline stages except the memory access stage, which takes two
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the writeback stage.
+  InstrItinData< IIC_FPU,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<2,[MA]>   // two cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 5                    // result ready after five cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Floating point divide instruction with one destination register and two
+  // source operand registers. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the memory access stage, which takes 26
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the writeback stage.
+  InstrItinData< IIC_FPUd,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<26,[MA]>  // 26 cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 29                   // result ready after 29 cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Convert floating point to integer instruction with one destination
+  // register and one source operand register. The instruction takes one cycle
+  // to execute in each of the pipeline stages except the memory access stage,
+  // which takes three cycles. The source operands are read during the decode
+  // stage and the results are ready after the writeback stage.
+  InstrItinData< IIC_FPUi,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<3,[MA]>   // three cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 6                   // result ready after six cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Convert integer to floating point instruction with one destination
+  // register and one source operand register. The instruction takes one cycle
+  // to execute in each of the pipeline stages except the memory access stage,
+  // which takes two cycles. The source operands are read during the decode
+  // stage and the results are ready after the writeback stage.
+  InstrItinData< IIC_FPUf,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<2,[MA]>   // two cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 5                    // result ready after five cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Floating point square root instruction with one destination register and
+  // one source operand register. The instruction takes one cycle to execute in
+  // each of the pipeline stages except the memory access stage, which takes 25
+  // cycles. The source operands are read during the decode stage and the
+  // results are ready after the writeback stage.
+  InstrItinData< IIC_FPUs,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<25,[MA]>  // 25 cycles in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 28                   // result ready after 28 cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // Floating point comparison instruction with one destination register and
+  // two source operand registers. The instruction takes one cycle to execute
+  // in each of the pipeline stages. The source operands are read during the
+  // decode stage and the results are ready after the execute stage.
+  InstrItinData< IIC_FPUc,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 2                    // result ready after two cycles
+               , 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // FSL get instruction with one register or immediate source operand and one
+  // destination register. The instruction takes one cycle to execute in each
+  // of the pipeline stages. The one source operand is read during the decode
+  // stage and the result is ready after the execute stage.
+  InstrItinData< IIC_FSLg,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 2                    // result ready after two cycles
+               , 1 ]>,                // first operand read after one cycle
+
+  // FSL put instruction with either two register source operands or one
+  // register source operand and one immediate operand. There is no result
+  // produced by the instruction. The instruction takes one cycle to execute in
+  // each of the pipeline stages. The two source operands are read during the
+  // decode stage.
+  InstrItinData< IIC_FSLp,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1                    // first operand read after one cycle
+               , 1 ]>,                // second operand read after one cycle
+
+  // Memory store instruction with either three register source operands or two
+  // register source operands and one immediate operand. There is no result
+  // produced by the instruction. The instruction takes one cycle to execute in
+  // each of the pipeline stages. All of the source operands are read during
+  // the decode stage.
+  InstrItinData< IIC_MEMs,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 1                    // first operand read after one cycle
+               , 1                    // second operand read after one cycle
+               , 1 ]>,                // third operand read after one cycle
+
+  // Memory load instruction with one destination register and either two
+  // register source operands or one register source operand and one immediate
+  // operand. The instruction takes one cycle to execute in each of the
+  // pipeline stages. All of the source operands are read during the decode
+  // stage and the result is ready after the writeback stage.
+  InstrItinData< IIC_MEMl,
+               [ InstrStage<1,[IF]>   // one cycle in fetch stage
+               , InstrStage<1,[ID]>   // one cycle in decode stage
+               , InstrStage<1,[EX]>   // one cycle in execute stage
+               , InstrStage<1,[MA]>   // one cycle in memory access stage
+               , InstrStage<1,[WB]>], // one cycle in write back stage
+               [ 4                    // result ready after four cycles
+               , 1                    // second operand read after one cycle
+               , 1 ]>                 // third operand read after one cycle
+]>;
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..6a115b22bd4a
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeSelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- MBlazeSelectionDAGInfo.cpp - MBlaze SelectionDAG Info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MBlazeSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mblaze-selectiondag-info"
+#include "MBlazeTargetMachine.h"
+using namespace llvm;
+
+MBlazeSelectionDAGInfo::MBlazeSelectionDAGInfo(const MBlazeTargetMachine &TM)
+  : TargetSelectionDAGInfo(TM) {
+}
+
+MBlazeSelectionDAGInfo::~MBlazeSelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeSelectionDAGInfo.h b/contrib/llvm/lib/Target/MBlaze/MBlazeSelectionDAGInfo.h
new file mode 100644
index 000000000000..9f8e2aad4b20
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeSelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- MBlazeSelectionDAGInfo.h - MBlaze SelectionDAG Info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MBlaze subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZESELECTIONDAGINFO_H
+#define MBLAZESELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class MBlazeTargetMachine;
+
+class MBlazeSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit MBlazeSelectionDAGInfo(const MBlazeTargetMachine &TM);
+  ~MBlazeSelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeSubtarget.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeSubtarget.cpp
new file mode 100644
index 000000000000..dc2ad29be2a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeSubtarget.cpp
@@ -0,0 +1,56 @@
+//===-- MBlazeSubtarget.cpp - MBlaze Subtarget Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MBlaze specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeSubtarget.h"
+#include "MBlaze.h"
+#include "MBlazeRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "MBlazeGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+MBlazeSubtarget::MBlazeSubtarget(const std::string &TT,
+                                 const std::string &CPU,
+                                 const std::string &FS):
+  MBlazeGenSubtargetInfo(TT, CPU, FS),
+  HasBarrel(false), HasDiv(false), HasMul(false), HasPatCmp(false),
+  HasFPU(false), HasMul64(false), HasSqrt(false)
+{
+  // Parse features string.
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "mblaze";
+  ParseSubtargetFeatures(CPUName, FS);
+
+  // Only use instruction scheduling if the selected CPU has an instruction
+  // itinerary (the default CPU is the only one that doesn't).
+  HasItin = CPUName != "mblaze";
+  DEBUG(dbgs() << "CPU " << CPUName << "(" << HasItin << ")\n");
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUName);
+}
+
+bool MBlazeSubtarget::
+enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                      TargetSubtargetInfo::AntiDepBreakMode& Mode,
+                      RegClassVector& CriticalPathRCs) const {
+  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(&MBlaze::GPRRegClass);
+  return HasItin && OptLevel >= CodeGenOpt::Default;
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeSubtarget.h b/contrib/llvm/lib/Target/MBlaze/MBlazeSubtarget.h
new file mode 100644
index 000000000000..ed43d21f30c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeSubtarget.h
@@ -0,0 +1,75 @@
+//===-- MBlazeSubtarget.h - Define Subtarget for the MBlaze ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MBlaze specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZESUBTARGET_H
+#define MBLAZESUBTARGET_H
+
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "MBlazeGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class MBlazeSubtarget : public MBlazeGenSubtargetInfo {
+
+protected:
+  bool HasBarrel;
+  bool HasDiv;
+  bool HasMul;
+  bool HasPatCmp;
+  bool HasFPU;
+  bool HasMul64;
+  bool HasSqrt;
+  bool HasItin;
+
+  InstrItineraryData InstrItins;
+
+public:
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  MBlazeSubtarget(const std::string &TT, const std::string &CPU,
+                  const std::string &FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// Compute the number of maximum number of issues per cycle for the
+  /// MBlaze scheduling itineraries.
+  void computeIssueWidth();
+
+  /// enablePostRAScheduler - True at 'More' optimization.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+
+  /// getInstrItins - Return the instruction itineraies based on subtarget.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  bool hasItin()   const { return HasItin; }
+  bool hasPCMP()   const { return HasPatCmp; }
+  bool hasFPU()    const { return HasFPU; }
+  bool hasSqrt()   const { return HasSqrt; }
+  bool hasMul()    const { return HasMul; }
+  bool hasMul64()  const { return HasMul64; }
+  bool hasDiv()    const { return HasDiv; }
+  bool hasBarrel() const { return HasBarrel; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeTargetMachine.cpp
new file mode 100644
index 000000000000..bcdd32fed947
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeTargetMachine.cpp
@@ -0,0 +1,81 @@
+//===-- MBlazeTargetMachine.cpp - Define TargetMachine for MBlaze ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about MBlaze target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeTargetMachine.h"
+#include "MBlaze.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeMBlazeTarget() {
+  // Register the target.
+  RegisterTargetMachine<MBlazeTargetMachine> X(TheMBlazeTarget);
+}
+
+// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
+// The stack is always 8 byte aligned
+// On function prologue, the stack is created by decrementing
+// its pointer. Once decremented, all references are done with positive
+// offset from the stack/frame pointer, using StackGrowsUp enables
+// an easier handling.
+MBlazeTargetMachine::
+MBlazeTargetMachine(const Target &T, StringRef TT,
+                    StringRef CPU, StringRef FS, const TargetOptions &Options,
+                    Reloc::Model RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS),
+    DL("E-p:32:32:32-i8:8:8-i16:16:16"),
+    InstrInfo(*this),
+    FrameLowering(Subtarget),
+    TLInfo(*this), TSInfo(*this),
+    InstrItins(Subtarget.getInstrItineraryData()) {
+}
+
+namespace {
+/// MBlaze Code Generator Pass Configuration Options.
+class MBlazePassConfig : public TargetPassConfig {
+public:
+  MBlazePassConfig(MBlazeTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  MBlazeTargetMachine &getMBlazeTargetMachine() const {
+    return getTM<MBlazeTargetMachine>();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *MBlazeTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new MBlazePassConfig(this, PM);
+}
+
+// Install an instruction selector pass using
+// the ISelDag to gen MBlaze code.
+bool MBlazePassConfig::addInstSelector() {
+  addPass(createMBlazeISelDag(getMBlazeTargetMachine()));
+  return false;
+}
+
+// Implemented by targets that want to run passes immediately before
+// machine code is emitted. return true if -print-machineinstrs should
+// print out the code after the passes.
+bool MBlazePassConfig::addPreEmitPass() {
+  addPass(createMBlazeDelaySlotFillerPass(getMBlazeTargetMachine()));
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeTargetMachine.h b/contrib/llvm/lib/Target/MBlaze/MBlazeTargetMachine.h
new file mode 100644
index 000000000000..956794dddaf9
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeTargetMachine.h
@@ -0,0 +1,80 @@
+//===-- MBlazeTargetMachine.h - Define TargetMachine for MBlaze -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MBlaze specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZE_TARGETMACHINE_H
+#define MBLAZE_TARGETMACHINE_H
+
+#include "MBlazeFrameLowering.h"
+#include "MBlazeISelLowering.h"
+#include "MBlazeInstrInfo.h"
+#include "MBlazeIntrinsicInfo.h"
+#include "MBlazeSelectionDAGInfo.h"
+#include "MBlazeSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class formatted_raw_ostream;
+
+  class MBlazeTargetMachine : public LLVMTargetMachine {
+    MBlazeSubtarget        Subtarget;
+    const DataLayout       DL; // Calculates type size & alignment
+    MBlazeInstrInfo        InstrInfo;
+    MBlazeFrameLowering    FrameLowering;
+    MBlazeTargetLowering   TLInfo;
+    MBlazeSelectionDAGInfo TSInfo;
+    MBlazeIntrinsicInfo    IntrinsicInfo;
+    InstrItineraryData     InstrItins;
+
+  public:
+    MBlazeTargetMachine(const Target &T, StringRef TT,
+                        StringRef CPU, StringRef FS,
+                        const TargetOptions &Options,
+                        Reloc::Model RM, CodeModel::Model CM,
+                        CodeGenOpt::Level OL);
+
+    virtual const MBlazeInstrInfo *getInstrInfo() const
+    { return &InstrInfo; }
+
+    virtual const InstrItineraryData *getInstrItineraryData() const
+    {  return &InstrItins; }
+
+    virtual const TargetFrameLowering *getFrameLowering() const
+    { return &FrameLowering; }
+
+    virtual const MBlazeSubtarget *getSubtargetImpl() const
+    { return &Subtarget; }
+
+    virtual const DataLayout *getDataLayout() const
+    { return &DL;}
+
+    virtual const MBlazeRegisterInfo *getRegisterInfo() const
+    { return &InstrInfo.getRegisterInfo(); }
+
+    virtual const MBlazeTargetLowering *getTargetLowering() const
+    { return &TLInfo; }
+
+    virtual const MBlazeSelectionDAGInfo* getSelectionDAGInfo() const
+    { return &TSInfo; }
+
+    const TargetIntrinsicInfo *getIntrinsicInfo() const
+    { return &IntrinsicInfo; }
+
+    // Pass Pipeline Configuration
+    virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  };
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp b/contrib/llvm/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
new file mode 100644
index 000000000000..a7a0a68b1612
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
@@ -0,0 +1,90 @@
+//===-- MBlazeTargetObjectFile.cpp - MBlaze object files ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeTargetObjectFile.h"
+#include "MBlazeSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+void MBlazeTargetObjectFile::
+Initialize(MCContext &Ctx, const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+  SmallDataSection =
+    getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
+                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
+                               SectionKind::getDataRel());
+
+  SmallBSSSection =
+    getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
+                               SectionKind::getBSS());
+
+}
+
+// A address must be loaded from a small section if its size is less than the
+// small section size threshold. Data in this section must be addressed using
+// gp_rel operator.
+static bool IsInSmallSection(uint64_t Size) {
+  return Size > 0 && Size <= 8;
+}
+
+bool MBlazeTargetObjectFile::
+IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM) const {
+  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+    return false;
+
+  return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
+}
+
+/// IsGlobalInSmallSection - Return true if this global address should be
+/// placed into small data/bss section.
+bool MBlazeTargetObjectFile::
+IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
+                       SectionKind Kind) const {
+  // Only global variables, not functions.
+  const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+  if (!GVA)
+    return false;
+
+  // We can only do this for datarel or BSS objects for now.
+  if (!Kind.isBSS() && !Kind.isDataRel())
+    return false;
+
+  // If this is a internal constant string, there is a special
+  // section for it, but not in small data/bss.
+  if (Kind.isMergeable1ByteCString())
+    return false;
+
+  Type *Ty = GV->getType()->getElementType();
+  return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
+}
+
+const MCSection *MBlazeTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+  // TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*"
+  // sections?
+
+  // Handle Small Section classification here.
+  if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallBSSSection;
+  if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MBlazeTargetObjectFile.h b/contrib/llvm/lib/Target/MBlaze/MBlazeTargetObjectFile.h
new file mode 100644
index 000000000000..c313722427db
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MBlazeTargetObjectFile.h
@@ -0,0 +1,40 @@
+//===-- llvm/Target/MBlazeTargetObjectFile.h - MBlaze Obj. Info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MBLAZE_TARGETOBJECTFILE_H
+#define LLVM_TARGET_MBLAZE_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+  class MBlazeTargetObjectFile : public TargetLoweringObjectFileELF {
+    const MCSection *SmallDataSection;
+    const MCSection *SmallBSSSection;
+  public:
+
+    void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+    /// IsGlobalInSmallSection - Return true if this global address should be
+    /// placed into small data/bss section.
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM,
+                                SectionKind Kind) const;
+
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM) const;
+
+    const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+                                            SectionKind Kind,
+                                            Mangler *Mang,
+                                            const TargetMachine &TM) const;
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
new file mode 100644
index 000000000000..6f9752c42951
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
@@ -0,0 +1,171 @@
+//===-- MBlazeAsmBackend.cpp - MBlaze Assembler Backend -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MBlazeMCTargetDesc.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static unsigned getFixupKindSize(unsigned Kind) {
+  switch (Kind) {
+  default: llvm_unreachable("invalid fixup kind!");
+  case FK_Data_1: return 1;
+  case FK_PCRel_2:
+  case FK_Data_2: return 2;
+  case FK_PCRel_4:
+  case FK_Data_4: return 4;
+  case FK_Data_8: return 8;
+  }
+}
+
+
+namespace {
+
+class MBlazeAsmBackend : public MCAsmBackend {
+public:
+  MBlazeAsmBackend(const Target &T)
+    : MCAsmBackend() {
+  }
+
+  unsigned getNumFixupKinds() const {
+    return 2;
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const;
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+
+  unsigned getPointerSize() const {
+    return 4;
+  }
+};
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+    switch (Op) {
+    default:            return Op;
+    case MBlaze::ADDIK: return MBlaze::ADDIK32;
+    case MBlaze::ORI:   return MBlaze::ORI32;
+    case MBlaze::BRLID: return MBlaze::BRLID32;
+    }
+}
+
+bool MBlazeAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  if (getRelaxedOpcode(Inst.getOpcode()) == Inst.getOpcode())
+    return false;
+
+  bool hasExprOrImm = false;
+  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
+    hasExprOrImm |= Inst.getOperand(i).isExpr();
+
+  return hasExprOrImm;
+}
+
+bool MBlazeAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                            uint64_t Value,
+                                            const MCRelaxableFragment *DF,
+                                            const MCAsmLayout &Layout) const {
+  // FIXME: Is this right? It's what the "generic" code was doing before,
+  // but is X86 specific. Is it actually true for MBlaze also, or was it
+  // just close enough to not be a big deal?
+  //
+  // Relax if the value is too big for a (signed) i8.
+  return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+void MBlazeAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  Res = Inst;
+  Res.setOpcode(getRelaxedOpcode(Inst.getOpcode()));
+}
+
+bool MBlazeAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  if ((Count % 4) != 0)
+    return false;
+
+  for (uint64_t i = 0; i < Count; i += 4)
+      OW->Write32(0x00000000);
+
+  return true;
+}
+} // end anonymous namespace
+
+namespace {
+class ELFMBlazeAsmBackend : public MBlazeAsmBackend {
+public:
+  uint8_t OSABI;
+  ELFMBlazeAsmBackend(const Target &T, uint8_t _OSABI)
+    : MBlazeAsmBackend(T), OSABI(_OSABI) { }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const;
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createMBlazeELFObjectWriter(OS, OSABI);
+  }
+};
+
+void ELFMBlazeAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                     unsigned DataSize, uint64_t Value) const {
+  unsigned Size = getFixupKindSize(Fixup.getKind());
+
+  assert(Fixup.getOffset() + Size <= DataSize &&
+         "Invalid fixup offset!");
+
+  char *data = Data + Fixup.getOffset();
+  switch (Size) {
+  default: llvm_unreachable("Cannot fixup unknown value.");
+  case 1:  llvm_unreachable("Cannot fixup 1 byte value.");
+  case 8:  llvm_unreachable("Cannot fixup 8 byte value.");
+
+  case 4:
+    *(data+7) = uint8_t(Value);
+    *(data+6) = uint8_t(Value >> 8);
+    *(data+3) = uint8_t(Value >> 16);
+    *(data+2) = uint8_t(Value >> 24);
+    break;
+
+  case 2:
+    *(data+3) = uint8_t(Value >> 0);
+    *(data+2) = uint8_t(Value >> 8);
+  }
+}
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, StringRef TT,
+                                           StringRef CPU) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin())
+    assert(0 && "Mac not supported on MBlaze");
+
+  if (TheTriple.isOSWindows())
+    assert(0 && "Windows not supported on MBlaze");
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+  return new ELFMBlazeAsmBackend(T, OSABI);
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h
new file mode 100644
index 000000000000..437026e7bbc0
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h
@@ -0,0 +1,237 @@
+//===-- MBlazeBaseInfo.h - Top level definitions for MBlaze -- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the MBlaze target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBlazeBASEINFO_H
+#define MBlazeBASEINFO_H
+
+#include "MBlazeMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// MBlazeII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MBlazeII {
+  enum {
+    // PseudoFrm - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    FPseudo = 0,
+    FRRR,
+    FRRI,
+    FCRR,
+    FCRI,
+    FRCR,
+    FRCI,
+    FCCR,
+    FCCI,
+    FRRCI,
+    FRRC,
+    FRCX,
+    FRCS,
+    FCRCS,
+    FCRCX,
+    FCX,
+    FCR,
+    FRIR,
+    FRRRR,
+    FRI,
+    FC,
+    FRR,
+    FormMask = 63
+
+    //===------------------------------------------------------------------===//
+    // MBlaze Specific MachineOperand flags.
+    // MO_NO_FLAG,
+
+    /// MO_GOT - Represents the offset into the global offset table at which
+    /// the address the relocation entry symbol resides during execution.
+    // MO_GOT,
+
+    /// MO_GOT_CALL - Represents the offset into the global offset table at
+    /// which the address of a call site relocation entry symbol resides
+    /// during execution. This is different from the above since this flag
+    /// can only be present in call instructions.
+    // MO_GOT_CALL,
+
+    /// MO_GPREL - Represents the offset from the current gp value to be used
+    /// for the relocatable object file being produced.
+    // MO_GPREL,
+
+    /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol
+    /// address.
+    // MO_ABS_HILO
+
+  };
+}
+
+static inline bool isMBlazeRegister(unsigned Reg) {
+  return Reg <= 31;
+}
+
+static inline bool isSpecialMBlazeRegister(unsigned Reg) {
+  switch (Reg) {
+    case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 : 
+    case 0x0007 : case 0x000B : case 0x000D : case 0x1000 : 
+    case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 : 
+    case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 : 
+    case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 : 
+    case 0x2008 : case 0x2009 : case 0x200A : case 0x200B : 
+      return true;
+
+    default:
+      return false;
+  }
+}
+
+/// getMBlazeRegisterNumbering - Given the enum value for some register, e.g.
+/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
+static inline unsigned getMBlazeRegisterNumbering(unsigned RegEnum) {
+  switch (RegEnum) {
+    case MBlaze::R0     : return 0;
+    case MBlaze::R1     : return 1;
+    case MBlaze::R2     : return 2;
+    case MBlaze::R3     : return 3;
+    case MBlaze::R4     : return 4;
+    case MBlaze::R5     : return 5;
+    case MBlaze::R6     : return 6;
+    case MBlaze::R7     : return 7;
+    case MBlaze::R8     : return 8;
+    case MBlaze::R9     : return 9;
+    case MBlaze::R10    : return 10;
+    case MBlaze::R11    : return 11;
+    case MBlaze::R12    : return 12;
+    case MBlaze::R13    : return 13;
+    case MBlaze::R14    : return 14;
+    case MBlaze::R15    : return 15;
+    case MBlaze::R16    : return 16;
+    case MBlaze::R17    : return 17;
+    case MBlaze::R18    : return 18;
+    case MBlaze::R19    : return 19;
+    case MBlaze::R20    : return 20;
+    case MBlaze::R21    : return 21;
+    case MBlaze::R22    : return 22;
+    case MBlaze::R23    : return 23;
+    case MBlaze::R24    : return 24;
+    case MBlaze::R25    : return 25;
+    case MBlaze::R26    : return 26;
+    case MBlaze::R27    : return 27;
+    case MBlaze::R28    : return 28;
+    case MBlaze::R29    : return 29;
+    case MBlaze::R30    : return 30;
+    case MBlaze::R31    : return 31;
+    case MBlaze::RPC    : return 0x0000;
+    case MBlaze::RMSR   : return 0x0001;
+    case MBlaze::REAR   : return 0x0003;
+    case MBlaze::RESR   : return 0x0005;
+    case MBlaze::RFSR   : return 0x0007;
+    case MBlaze::RBTR   : return 0x000B;
+    case MBlaze::REDR   : return 0x000D;
+    case MBlaze::RPID   : return 0x1000;
+    case MBlaze::RZPR   : return 0x1001;
+    case MBlaze::RTLBX  : return 0x1002;
+    case MBlaze::RTLBLO : return 0x1003;
+    case MBlaze::RTLBHI : return 0x1004;
+    case MBlaze::RPVR0  : return 0x2000;
+    case MBlaze::RPVR1  : return 0x2001;
+    case MBlaze::RPVR2  : return 0x2002;
+    case MBlaze::RPVR3  : return 0x2003;
+    case MBlaze::RPVR4  : return 0x2004;
+    case MBlaze::RPVR5  : return 0x2005;
+    case MBlaze::RPVR6  : return 0x2006;
+    case MBlaze::RPVR7  : return 0x2007;
+    case MBlaze::RPVR8  : return 0x2008;
+    case MBlaze::RPVR9  : return 0x2009;
+    case MBlaze::RPVR10 : return 0x200A;
+    case MBlaze::RPVR11 : return 0x200B;
+    default: llvm_unreachable("Unknown register number!");
+  }
+}
+
+/// getRegisterFromNumbering - Given the enum value for some register, e.g.
+/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
+static inline unsigned getMBlazeRegisterFromNumbering(unsigned Reg) {
+  switch (Reg) {
+    case 0  : return MBlaze::R0;
+    case 1  : return MBlaze::R1;
+    case 2  : return MBlaze::R2;
+    case 3  : return MBlaze::R3;
+    case 4  : return MBlaze::R4;
+    case 5  : return MBlaze::R5;
+    case 6  : return MBlaze::R6;
+    case 7  : return MBlaze::R7;
+    case 8  : return MBlaze::R8;
+    case 9  : return MBlaze::R9;
+    case 10 : return MBlaze::R10;
+    case 11 : return MBlaze::R11;
+    case 12 : return MBlaze::R12;
+    case 13 : return MBlaze::R13;
+    case 14 : return MBlaze::R14;
+    case 15 : return MBlaze::R15;
+    case 16 : return MBlaze::R16;
+    case 17 : return MBlaze::R17;
+    case 18 : return MBlaze::R18;
+    case 19 : return MBlaze::R19;
+    case 20 : return MBlaze::R20;
+    case 21 : return MBlaze::R21;
+    case 22 : return MBlaze::R22;
+    case 23 : return MBlaze::R23;
+    case 24 : return MBlaze::R24;
+    case 25 : return MBlaze::R25;
+    case 26 : return MBlaze::R26;
+    case 27 : return MBlaze::R27;
+    case 28 : return MBlaze::R28;
+    case 29 : return MBlaze::R29;
+    case 30 : return MBlaze::R30;
+    case 31 : return MBlaze::R31;
+    default: llvm_unreachable("Unknown register number!");
+  }
+}
+
+static inline unsigned getSpecialMBlazeRegisterFromNumbering(unsigned Reg) {
+  switch (Reg) {
+    case 0x0000 : return MBlaze::RPC;
+    case 0x0001 : return MBlaze::RMSR;
+    case 0x0003 : return MBlaze::REAR;
+    case 0x0005 : return MBlaze::RESR;
+    case 0x0007 : return MBlaze::RFSR;
+    case 0x000B : return MBlaze::RBTR;
+    case 0x000D : return MBlaze::REDR;
+    case 0x1000 : return MBlaze::RPID;
+    case 0x1001 : return MBlaze::RZPR;
+    case 0x1002 : return MBlaze::RTLBX;
+    case 0x1003 : return MBlaze::RTLBLO;
+    case 0x1004 : return MBlaze::RTLBHI;
+    case 0x2000 : return MBlaze::RPVR0;
+    case 0x2001 : return MBlaze::RPVR1;
+    case 0x2002 : return MBlaze::RPVR2;
+    case 0x2003 : return MBlaze::RPVR3;
+    case 0x2004 : return MBlaze::RPVR4;
+    case 0x2005 : return MBlaze::RPVR5;
+    case 0x2006 : return MBlaze::RPVR6;
+    case 0x2007 : return MBlaze::RPVR7;
+    case 0x2008 : return MBlaze::RPVR8;
+    case 0x2009 : return MBlaze::RPVR9;
+    case 0x200A : return MBlaze::RPVR10;
+    case 0x200B : return MBlaze::RPVR11;
+    default: llvm_unreachable("Unknown register number!");
+  }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeELFObjectWriter.cpp b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeELFObjectWriter.cpp
new file mode 100644
index 000000000000..2824b3c35cf1
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeELFObjectWriter.cpp
@@ -0,0 +1,77 @@
+//===-- MBlazeELFObjectWriter.cpp - MBlaze ELF Writer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MBlazeMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+  class MBlazeELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    MBlazeELFObjectWriter(uint8_t OSABI);
+
+    virtual ~MBlazeELFObjectWriter();
+  protected:
+    virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                                  bool IsPCRel, bool IsRelocWithSymbol,
+                                  int64_t Addend) const;
+  };
+}
+
+MBlazeELFObjectWriter::MBlazeELFObjectWriter(uint8_t OSABI)
+  : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_MBLAZE,
+                            /*HasRelocationAddend*/ false) {}
+
+MBlazeELFObjectWriter::~MBlazeELFObjectWriter() {
+}
+
+unsigned MBlazeELFObjectWriter::GetRelocType(const MCValue &Target,
+                                             const MCFixup &Fixup,
+                                             bool IsPCRel,
+                                             bool IsRelocWithSymbol,
+                                             int64_t Addend) const {
+  // determine the type of the relocation
+  unsigned Type;
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    default:
+      llvm_unreachable("Unimplemented");
+    case FK_PCRel_4:
+      Type = ELF::R_MICROBLAZE_64_PCREL;
+      break;
+    case FK_PCRel_2:
+      Type = ELF::R_MICROBLAZE_32_PCREL;
+      break;
+    }
+  } else {
+    switch ((unsigned)Fixup.getKind()) {
+    default: llvm_unreachable("invalid fixup kind!");
+    case FK_Data_4:
+      Type = ((IsRelocWithSymbol || Addend !=0)
+              ? ELF::R_MICROBLAZE_32
+              : ELF::R_MICROBLAZE_64);
+      break;
+    case FK_Data_2:
+      Type = ELF::R_MICROBLAZE_32;
+      break;
+    }
+  }
+  return Type;
+}
+
+
+
+MCObjectWriter *llvm::createMBlazeELFObjectWriter(raw_ostream &OS,
+                                                  uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new MBlazeELFObjectWriter(OSABI);
+  return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/ false);
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.cpp b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.cpp
new file mode 100644
index 000000000000..8231f07dfa80
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.cpp
@@ -0,0 +1,26 @@
+//===-- MBlazeMCAsmInfo.cpp - MBlaze asm properties -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MBlazeMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeMCAsmInfo.h"
+using namespace llvm;
+
+void MBlazeMCAsmInfo::anchor() { }
+
+MBlazeMCAsmInfo::MBlazeMCAsmInfo() {
+  IsLittleEndian              = false;
+  StackGrowsUp                = false;
+  SupportsDebugInformation    = true;
+  AlignmentIsInBytes          = false;
+  PrivateGlobalPrefix         = "$";
+  GPRel32Directive            = "\t.gpword\t";
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h
new file mode 100644
index 000000000000..977f9a6866d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h
@@ -0,0 +1,30 @@
+//===-- MBlazeMCAsmInfo.h - MBlaze asm properties --------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MBlazeMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZETARGETASMINFO_H
+#define MBLAZETARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+
+  class MBlazeMCAsmInfo : public MCAsmInfo {
+    virtual void anchor();
+  public:
+    explicit MBlazeMCAsmInfo();
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
new file mode 100644
index 000000000000..8faff6ade441
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
@@ -0,0 +1,222 @@
+//===-- MBlazeMCCodeEmitter.cpp - Convert MBlaze code to machine code -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MBlazeMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/MBlazeMCTargetDesc.h"
+#include "MCTargetDesc/MBlazeBaseInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class MBlazeMCCodeEmitter : public MCCodeEmitter {
+  MBlazeMCCodeEmitter(const MBlazeMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const MBlazeMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+
+public:
+  MBlazeMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                      MCContext &ctx)
+    : MCII(mcii) {
+  }
+
+  ~MBlazeMCCodeEmitter() {}
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO) const;
+  unsigned getMachineOpValue(const MCInst &MI, unsigned OpIdx) const {
+    return getMachineOpValue(MI, MI.getOperand(OpIdx));
+  }
+
+  static unsigned GetMBlazeRegNum(const MCOperand &MO) {
+    // FIXME: getMBlazeRegisterNumbering() is sufficient?
+    llvm_unreachable("MBlazeMCCodeEmitter::GetMBlazeRegNum() not yet "
+                     "implemented.");
+  }
+
+  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+    // The MicroBlaze uses a bit reversed format so we need to reverse the
+    // order of the bits. Taken from:
+    // http://graphics.stanford.edu/~seander/bithacks.html
+    C = ((C * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+
+    OS << (char)C;
+    ++CurByte;
+  }
+
+  void EmitRawByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+    OS << (char)C;
+    ++CurByte;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) const {
+    assert(Size <= 8 && "size too big in emit constant");
+
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, CurByte, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EmitIMM(const MCOperand &imm, unsigned &CurByte, raw_ostream &OS) const;
+  void EmitIMM(const MCInst &MI, unsigned &CurByte, raw_ostream &OS) const;
+
+  void EmitImmediate(const MCInst &MI, unsigned opNo, bool pcrel,
+                     unsigned &CurByte, raw_ostream &OS,
+                     SmallVectorImpl<MCFixup> &Fixups) const;
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+};
+
+} // end anonymous namespace
+
+
+MCCodeEmitter *llvm::createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
+                                               const MCRegisterInfo &MRI,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new MBlazeMCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned MBlazeMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                             const MCOperand &MO) const {
+  if (MO.isReg())
+    return getMBlazeRegisterNumbering(MO.getReg());
+  if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  if (MO.isExpr())
+    return 0; // The relocation has already been recorded at this point.
+#ifndef NDEBUG
+  errs() << MO;
+#endif
+  llvm_unreachable(0);
+}
+
+void MBlazeMCCodeEmitter::
+EmitIMM(const MCOperand &imm, unsigned &CurByte, raw_ostream &OS) const {
+  int32_t val = (int32_t)imm.getImm();
+  if (val > 32767 || val < -32768) {
+    EmitByte(0x0D, CurByte, OS);
+    EmitByte(0x00, CurByte, OS);
+    EmitRawByte((val >> 24) & 0xFF, CurByte, OS);
+    EmitRawByte((val >> 16) & 0xFF, CurByte, OS);
+  }
+}
+
+void MBlazeMCCodeEmitter::
+EmitIMM(const MCInst &MI, unsigned &CurByte,raw_ostream &OS) const {
+  switch (MI.getOpcode()) {
+  default: break;
+
+  case MBlaze::ADDIK32:
+  case MBlaze::ORI32:
+  case MBlaze::BRLID32:
+    EmitByte(0x0D, CurByte, OS);
+    EmitByte(0x00, CurByte, OS);
+    EmitRawByte(0, CurByte, OS);
+    EmitRawByte(0, CurByte, OS);
+  }
+}
+
+void MBlazeMCCodeEmitter::
+EmitImmediate(const MCInst &MI, unsigned opNo, bool pcrel, unsigned &CurByte,
+              raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const {
+  assert(MI.getNumOperands()>opNo && "Not enought operands for instruction");
+
+  MCOperand oper = MI.getOperand(opNo);
+
+  if (oper.isImm()) {
+    EmitIMM(oper, CurByte, OS);
+  } else if (oper.isExpr()) {
+    MCFixupKind FixupKind;
+    switch (MI.getOpcode()) {
+    default:
+      FixupKind = pcrel ? FK_PCRel_2 : FK_Data_2;
+      Fixups.push_back(MCFixup::Create(0,oper.getExpr(),FixupKind));
+      break;
+    case MBlaze::ORI32:
+    case MBlaze::ADDIK32:
+    case MBlaze::BRLID32:
+      FixupKind = pcrel ? FK_PCRel_4 : FK_Data_4;
+      Fixups.push_back(MCFixup::Create(0,oper.getExpr(),FixupKind));
+      break;
+    }
+  }
+}
+
+
+
+void MBlazeMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MCII.get(Opcode);
+  uint64_t TSFlags = Desc.TSFlags;
+  // Keep track of the current byte being emitted.
+  unsigned CurByte = 0;
+
+  // Emit an IMM instruction if the instruction we are encoding requires it
+  EmitIMM(MI,CurByte,OS);
+
+  switch ((TSFlags & MBlazeII::FormMask)) {
+  default: break;
+  case MBlazeII::FPseudo:
+    // Pseudo instructions don't get encoded.
+    return;
+  case MBlazeII::FRRI:
+    EmitImmediate(MI, 2, false, CurByte, OS, Fixups);
+    break;
+  case MBlazeII::FRIR:
+    EmitImmediate(MI, 1, false, CurByte, OS, Fixups);
+    break;
+  case MBlazeII::FCRI:
+    EmitImmediate(MI, 1, true, CurByte, OS, Fixups);
+    break;
+  case MBlazeII::FRCI:
+    EmitImmediate(MI, 1, true, CurByte, OS, Fixups);
+  case MBlazeII::FCCI:
+    EmitImmediate(MI, 0, true, CurByte, OS, Fixups);
+    break;
+  }
+
+  ++MCNumEmitted;  // Keep track of the # of mi's emitted
+  unsigned Value = getBinaryCodeForInstr(MI);
+  EmitConstant(Value, 4, CurByte, OS);
+}
+
+// FIXME: These #defines shouldn't be necessary. Instead, tblgen should
+// be able to generate code emitter helpers for either variant, like it
+// does for the AsmWriter.
+#define MBlazeCodeEmitter MBlazeMCCodeEmitter
+#define MachineInstr MCInst
+#include "MBlazeGenCodeEmitter.inc"
+#undef MBlazeCodeEmitter
+#undef MachineInstr
diff --git a/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
new file mode 100644
index 000000000000..380750d50f4c
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
@@ -0,0 +1,141 @@
+//===-- MBlazeMCTargetDesc.cpp - MBlaze Target Descriptions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MBlaze specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlazeMCTargetDesc.h"
+#include "InstPrinter/MBlazeInstPrinter.h"
+#include "MBlazeMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "MBlazeGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "MBlazeGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "MBlazeGenRegisterInfo.inc"
+
+using namespace llvm;
+
+
+static MCInstrInfo *createMBlazeMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitMBlazeMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createMBlazeMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMBlazeMCRegisterInfo(X, MBlaze::R15);
+  return X;
+}
+
+static MCSubtargetInfo *createMBlazeMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                    StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitMBlazeMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+  switch (TheTriple.getOS()) {
+  default:
+    return new MBlazeMCAsmInfo();
+  }
+}
+
+static MCCodeGenInfo *createMBlazeMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                CodeModel::Model CM,
+                                                CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (RM == Reloc::Default)
+    RM = Reloc::Static;
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &_OS,
+                                    MCCodeEmitter *_Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin()) {
+    llvm_unreachable("MBlaze does not support Darwin MACH-O format");
+  }
+
+  if (TheTriple.isOSWindows()) {
+    llvm_unreachable("MBlaze does not support Windows COFF format");
+  }
+
+  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCInstrInfo &MII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new MBlazeInstPrinter(MAI, MII, MRI);
+  return 0;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMBlazeTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheMBlazeTarget, createMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheMBlazeTarget,
+                                        createMBlazeMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheMBlazeTarget, createMBlazeMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheMBlazeTarget,
+                                    createMBlazeMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheMBlazeTarget,
+                                          createMBlazeMCSubtargetInfo);
+
+  // Register the MC code emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheMBlazeTarget,
+                                        llvm::createMBlazeMCCodeEmitter);
+
+  // Register the asm backend
+  TargetRegistry::RegisterMCAsmBackend(TheMBlazeTarget,
+                                       createMBlazeAsmBackend);
+
+  // Register the object streamer
+  TargetRegistry::RegisterMCObjectStreamer(TheMBlazeTarget,
+                                           createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheMBlazeTarget,
+                                        createMBlazeMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
new file mode 100644
index 000000000000..7bc7d8f20724
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
@@ -0,0 +1,56 @@
+//===-- MBlazeMCTargetDesc.h - MBlaze Target Descriptions -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MBlaze specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBLAZEMCTARGETDESC_H
+#define MBLAZEMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCContext;
+class MCCodeEmitter;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class StringRef;
+class raw_ostream;
+
+extern Target TheMBlazeTarget;
+
+MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx);
+
+MCAsmBackend *createMBlazeAsmBackend(const Target &T, StringRef TT,
+                                     StringRef CPU);
+
+MCObjectWriter *createMBlazeELFObjectWriter(raw_ostream &OS, uint8_t OSABI);
+} // End llvm namespace
+
+// Defines symbolic names for MBlaze registers.  This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "MBlazeGenRegisterInfo.inc"
+
+// Defines symbolic names for the MBlaze instructions.
+#define GET_INSTRINFO_ENUM
+#include "MBlazeGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "MBlazeGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp b/contrib/llvm/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
new file mode 100644
index 000000000000..323a7f647d56
--- /dev/null
+++ b/contrib/llvm/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- MBlazeTargetInfo.cpp - MBlaze Target Implementation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MBlaze.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMBlazeTarget;
+
+extern "C" void LLVMInitializeMBlazeTargetInfo() {
+  RegisterTarget<Triple::mblaze> X(TheMBlazeTarget, "mblaze", "MBlaze");
+}
diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
new file mode 100644
index 000000000000..4b12aeadd3e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -0,0 +1,114 @@
+//===-- MSP430InstPrinter.cpp - Convert MSP430 MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "MSP430InstPrinter.h"
+#include "MSP430.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+
+// Include the auto-generated portion of the assembly writer.
+#include "MSP430GenAsmWriter.inc"
+
+void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                  StringRef Annot) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
+                                             raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << Op.getImm();
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    O << *Op.getExpr();
+  }
+}
+
+void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O, const char *Modifier) {
+  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+  } else if (Op.isImm()) {
+    O << '#' << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << '#' << *Op.getExpr();
+  }
+}
+
+void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O,
+                                           const char *Modifier) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  const MCOperand &Disp = MI->getOperand(OpNo+1);
+
+  // Print displacement first
+
+  // If the global address expression is a part of displacement field with a
+  // register base, we should not emit any prefix symbol here, e.g.
+  //   mov.w &foo, r1
+  // vs
+  //   mov.w glb(r1), r2
+  // Otherwise (!) msp430-as will silently miscompile the output :(
+  if (!Base.getReg())
+    O << '&';
+
+  if (Disp.isExpr())
+    O << *Disp.getExpr();
+  else {
+    assert(Disp.isImm() && "Expected immediate in displacement field");
+    O << Disp.getImm();
+  }
+
+  // Print register base field
+  if (Base.getReg())
+    O << '(' << getRegisterName(Base.getReg()) << ')';
+}
+
+void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned CC = MI->getOperand(OpNo).getImm();
+
+  switch (CC) {
+  default:
+   llvm_unreachable("Unsupported CC code");
+  case MSP430CC::COND_E:
+   O << "eq";
+   break;
+  case MSP430CC::COND_NE:
+   O << "ne";
+   break;
+  case MSP430CC::COND_HS:
+   O << "hs";
+   break;
+  case MSP430CC::COND_LO:
+   O << "lo";
+   break;
+  case MSP430CC::COND_GE:
+   O << "ge";
+   break;
+  case MSP430CC::COND_L:
+   O << 'l';
+   break;
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
new file mode 100644
index 000000000000..d32eb3a21a37
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -0,0 +1,44 @@
+//= MSP430InstPrinter.h - Convert MSP430 MCInst to assembly syntax -*- C++ -*-//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430INSTPRINTER_H
+#define MSP430INSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+  class MCOperand;
+
+  class MSP430InstPrinter : public MCInstPrinter {
+  public:
+    MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                      const MCRegisterInfo &MRI)
+      : MCInstPrinter(MAI, MII, MRI) {}
+
+    virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+
+    // Autogenerated by tblgen.
+    void printInstruction(const MCInst *MI, raw_ostream &O);
+    static const char *getRegisterName(unsigned RegNo);
+
+    void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                      const char *Modifier = 0);
+    void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+    void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+                            const char *Modifier = 0);
+    void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
new file mode 100644
index 000000000000..3c9576056946
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -0,0 +1,31 @@
+//===-- MSP430MCAsmInfo.cpp - MSP430 asm properties -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MSP430MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCAsmInfo.h"
+#include "llvm/ADT/StringRef.h"
+using namespace llvm;
+
+void MSP430MCAsmInfo::anchor() { }
+
+MSP430MCAsmInfo::MSP430MCAsmInfo(const Target &T, StringRef TT) {
+  PointerSize = CalleeSaveStackSlotSize = 2;
+
+  PrivateGlobalPrefix = ".L";
+  WeakRefDirective ="\t.weak\t";
+  PCSymbol=".";
+  CommentString = ";";
+
+  AlignmentIsInBytes = false;
+  AllowNameToStartWithDigit = true;
+  UsesELFSectionDirectiveForBSS = true;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
new file mode 100644
index 000000000000..e5c2fc283b17
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- MSP430MCAsmInfo.h - MSP430 asm properties --------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MSP430MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430TARGETASMINFO_H
+#define MSP430TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class StringRef;
+  class Target;
+
+  class MSP430MCAsmInfo : public MCAsmInfo {
+    virtual void anchor();
+  public:
+    explicit MSP430MCAsmInfo(const Target &T, StringRef TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
new file mode 100644
index 000000000000..530e6aae92fd
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -0,0 +1,94 @@
+//===-- MSP430MCTargetDesc.cpp - MSP430 Target Descriptions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCTargetDesc.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430MCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "MSP430GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "MSP430GenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "MSP430GenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createMSP430MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitMSP430MCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMSP430MCRegisterInfo(X, MSP430::PCW);
+  return X;
+}
+
+static MCSubtargetInfo *createMSP430MCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                    StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitMSP430MCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createMSP430MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                                CodeModel::Model CM,
+                                                CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createMSP430MCInstPrinter(const Target &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCInstrInfo &MII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new MSP430InstPrinter(MAI, MII, MRI);
+  return 0;
+}
+
+extern "C" void LLVMInitializeMSP430TargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<MSP430MCAsmInfo> X(TheMSP430Target);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheMSP430Target,
+                                        createMSP430MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheMSP430Target, createMSP430MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheMSP430Target,
+                                    createMSP430MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheMSP430Target,
+                                          createMSP430MCSubtargetInfo);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheMSP430Target,
+                                        createMSP430MCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
new file mode 100644
index 000000000000..7f3505ca5514
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -0,0 +1,36 @@
+//===-- MSP430MCTargetDesc.h - MSP430 Target Descriptions -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430MCTARGETDESC_H
+#define MSP430MCTARGETDESC_H
+
+namespace llvm {
+class Target;
+
+extern Target TheMSP430Target;
+
+} // End llvm namespace
+
+// Defines symbolic names for MSP430 registers.
+// This defines a mapping from register name to register number.
+#define GET_REGINFO_ENUM
+#include "MSP430GenRegisterInfo.inc"
+
+// Defines symbolic names for the MSP430 instructions.
+#define GET_INSTRINFO_ENUM
+#include "MSP430GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "MSP430GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.h b/contrib/llvm/lib/Target/MSP430/MSP430.h
new file mode 100644
index 000000000000..4574ce5f98b7
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430.h
@@ -0,0 +1,47 @@
+//==-- MSP430.h - Top-level interface for MSP430 representation --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM MSP430 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_H
+#define LLVM_TARGET_MSP430_H
+
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace MSP430CC {
+  // MSP430 specific condition code.
+  enum CondCodes {
+    COND_E  = 0,  // aka COND_Z
+    COND_NE = 1,  // aka COND_NZ
+    COND_HS = 2,  // aka COND_C
+    COND_LO = 3,  // aka COND_NC
+    COND_GE = 4,
+    COND_L  = 5,
+
+    COND_INVALID = -1
+  };
+}
+
+namespace llvm {
+  class MSP430TargetMachine;
+  class FunctionPass;
+  class formatted_raw_ostream;
+
+  FunctionPass *createMSP430ISelDag(MSP430TargetMachine &TM,
+                                    CodeGenOpt::Level OptLevel);
+
+  FunctionPass *createMSP430BranchSelectionPass();
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.td b/contrib/llvm/lib/Target/MSP430/MSP430.td
new file mode 100644
index 000000000000..c6796b3789ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430.td
@@ -0,0 +1,66 @@
+//===-- MSP430.td - Describe the MSP430 Target Machine -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the MSP430 target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features. 
+//===----------------------------------------------------------------------===//
+def FeatureX
+ : SubtargetFeature<"ext", "ExtendedInsts", "true",
+                    "Enable MSP430-X extensions">;
+
+//===----------------------------------------------------------------------===//
+// MSP430 supported processors.
+//===----------------------------------------------------------------------===//
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430CallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrInfo.td"
+
+def MSP430InstrInfo : InstrInfo;
+
+def MSP430InstPrinter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def MSP430 : Target {
+  let InstructionSet = MSP430InstrInfo;
+  let AssemblyWriters = [MSP430InstPrinter];
+}
+
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
new file mode 100644
index 000000000000..0a04e5ddb75d
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -0,0 +1,167 @@
+//===-- MSP430AsmPrinter.cpp - MSP430 LLVM assembly writer ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the MSP430 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "MSP430.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MCInstLower.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+using namespace llvm;
+
+namespace {
+  class MSP430AsmPrinter : public AsmPrinter {
+  public:
+    MSP430AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer) {}
+
+    virtual const char *getPassName() const {
+      return "MSP430 Assembly Printer";
+    }
+
+    void printOperand(const MachineInstr *MI, int OpNum,
+                      raw_ostream &O, const char* Modifier = 0);
+    void printSrcMemOperand(const MachineInstr *MI, int OpNum,
+                            raw_ostream &O);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI,
+                               unsigned OpNo, unsigned AsmVariant,
+                               const char *ExtraCode, raw_ostream &O);
+    void EmitInstruction(const MachineInstr *MI);
+  };
+} // end of anonymous namespace
+
+
+void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                    raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  default: llvm_unreachable("Not implemented yet!");
+  case MachineOperand::MO_Register:
+    O << MSP430InstPrinter::getRegisterName(MO.getReg());
+    return;
+  case MachineOperand::MO_Immediate:
+    if (!Modifier || strcmp(Modifier, "nohash"))
+      O << '#';
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    uint64_t Offset = MO.getOffset();
+
+    // If the global address expression is a part of displacement field with a
+    // register base, we should not emit any prefix symbol here, e.g.
+    //   mov.w &foo, r1
+    // vs
+    //   mov.w glb(r1), r2
+    // Otherwise (!) msp430-as will silently miscompile the output :(
+    if (!Modifier || strcmp(Modifier, "nohash"))
+      O << (isMemOp ? '&' : '#');
+    if (Offset)
+      O << '(' << Offset << '+';
+
+    O << *Mang->getSymbol(MO.getGlobal());
+
+    if (Offset)
+      O << ')';
+
+    return;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    O << (isMemOp ? '&' : '#');
+    O << MAI->getGlobalPrefix() << MO.getSymbolName();
+    return;
+  }
+  }
+}
+
+void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum,
+                                          raw_ostream &O) {
+  const MachineOperand &Base = MI->getOperand(OpNum);
+  const MachineOperand &Disp = MI->getOperand(OpNum+1);
+
+  // Print displacement first
+
+  // Imm here is in fact global address - print extra modifier.
+  if (Disp.isImm() && !Base.getReg())
+    O << '&';
+  printOperand(MI, OpNum+1, O, "nohash");
+
+  // Print register base field
+  if (Base.getReg()) {
+    O << '(';
+    printOperand(MI, OpNum, O);
+    O << ')';
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool MSP430AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                             unsigned OpNo, unsigned AsmVariant,
+                                             const char *ExtraCode,
+                                             raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    return true; // Unknown modifier.
+  }
+  printSrcMemOperand(MI, OpNo, O);
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MSP430MCInstLower MCInstLowering(OutContext, *this);
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMSP430AsmPrinter() {
+  RegisterAsmPrinter<MSP430AsmPrinter> X(TheMSP430Target);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
new file mode 100644
index 000000000000..f128427f8066
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -0,0 +1,180 @@
+//===-- MSP430BranchSelector.cpp - Emit long conditional branches ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 10 bits of displacement to reach their
+// target basic block.  It does this in two passes; a calculation of basic block
+// positions pass, and a branch pseudo op to machine branch opcode pass.  This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msp430-branch-select"
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace {
+  struct MSP430BSel : public MachineFunctionPass {
+    static char ID;
+    MSP430BSel() : MachineFunctionPass(ID) {}
+
+    /// BlockSizes - The sizes of the basic blocks in the function.
+    std::vector<unsigned> BlockSizes;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "MSP430 Branch Selector";
+    }
+  };
+  char MSP430BSel::ID = 0;
+}
+
+/// createMSP430BranchSelectionPass - returns an instance of the Branch
+/// Selection Pass
+///
+FunctionPass *llvm::createMSP430BranchSelectionPass() {
+  return new MSP430BSel();
+}
+
+bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) {
+  const MSP430InstrInfo *TII =
+             static_cast<const MSP430InstrInfo*>(Fn.getTarget().getInstrInfo());
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = 0;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+
+    unsigned BlockSize = 0;
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+         MBBI != EE; ++MBBI)
+      BlockSize += TII->GetInstSizeInBytes(MBBI);
+
+    BlockSizes[MBB->getNumber()] = BlockSize;
+    FuncSize += BlockSize;
+  }
+
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to shrink any branches in this function.  This is a
+  // common case.
+  if (FuncSize < (1 << 9)) {
+    BlockSizes.clear();
+    return false;
+  }
+
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+6
+  //     b MBB
+  //
+  bool MadeChange = true;
+  bool EverMadeChange = false;
+  while (MadeChange) {
+    // Iteratively expand branches until we reach a fixed point.
+    MadeChange = false;
+
+    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+         ++MFI) {
+      MachineBasicBlock &MBB = *MFI;
+      unsigned MBBStartOffset = 0;
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+           I != E; ++I) {
+        if ((I->getOpcode() != MSP430::JCC || I->getOperand(0).isImm()) &&
+            I->getOpcode() != MSP430::JMP) {
+          MBBStartOffset += TII->GetInstSizeInBytes(I);
+          continue;
+        }
+
+        // Determine the offset from the current branch to the destination
+        // block.
+        MachineBasicBlock *Dest = I->getOperand(0).getMBB();
+
+        int BranchSize;
+        if (Dest->getNumber() <= MBB.getNumber()) {
+          // If this is a backwards branch, the delta is the offset from the
+          // start of this block to this branch, plus the sizes of all blocks
+          // from this block to the dest.
+          BranchSize = MBBStartOffset;
+
+          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        } else {
+          // Otherwise, add the size of the blocks between this block and the
+          // dest to the number of bytes left in this block.
+          BranchSize = -MBBStartOffset;
+
+          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        }
+
+        // If this branch is in range, ignore it.
+        if (isInt<10>(BranchSize)) {
+          MBBStartOffset += 2;
+          continue;
+        }
+
+        // Otherwise, we have to expand it to a long branch.
+        unsigned NewSize;
+        MachineInstr *OldBranch = I;
+        DebugLoc dl = OldBranch->getDebugLoc();
+
+        if (I->getOpcode() == MSP430::JMP) {
+          NewSize = 4;
+        } else {
+          // The BCC operands are:
+          // 0. MSP430 branch predicate
+          // 1. Target MBB
+          SmallVector<MachineOperand, 1> Cond;
+          Cond.push_back(I->getOperand(1));
+
+          // Jump over the uncond branch inst (i.e. $+6) on opposite condition.
+          TII->ReverseBranchCondition(Cond);
+          BuildMI(MBB, I, dl, TII->get(MSP430::JCC))
+            .addImm(4).addOperand(Cond[0]);
+
+          NewSize = 6;
+        }
+        // Uncond branch to the real destination.
+        I = BuildMI(MBB, I, dl, TII->get(MSP430::Bi)).addMBB(Dest);
+
+        // Remove the old branch from the function.
+        OldBranch->eraseFromParent();
+
+        // Remember that this instruction is NewSize bytes, increase the size of the
+        // block by NewSize-2, remember to iterate.
+        BlockSizes[MBB.getNumber()] += NewSize-2;
+        MBBStartOffset += NewSize;
+
+        ++NumExpanded;
+        MadeChange = true;
+      }
+    }
+    EverMadeChange |= MadeChange;
+  }
+
+  BlockSizes.clear();
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
new file mode 100644
index 000000000000..b448cc4ed9b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
@@ -0,0 +1,40 @@
+//==- MSP430CallingConv.td - Calling Conventions for MSP430 -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for MSP430 architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MSP430 Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_MSP430 : CallingConv<[
+  // i8 are returned in registers R15B, R14B, R13B, R12B
+  CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
+
+  // i16 are returned in registers R15, R14, R13, R12
+  CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_MSP430 : CallingConv<[
+  // Pass by value if the byval attribute is given
+  CCIfByVal<CCPassByVal<2, 2>>,
+
+  // Promote i8 arguments to i16.
+  CCIfType<[i8], CCPromoteToType<i16>>,
+
+  // The first 4 integer arguments of non-varargs functions are passed in
+  // integer registers.
+  CCIfNotVarArg<CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>>,
+
+  // Integer values get stored in stack slots that are 2 bytes in
+  // size and 2-byte aligned.
+  CCIfType<[i16], CCAssignToStack<2, 2>>
+]>;
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
new file mode 100644
index 000000000000..e504011dfdc8
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -0,0 +1,297 @@
+//===-- MSP430FrameLowering.cpp - MSP430 Frame Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430FrameLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+bool MSP430FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+          MF.getFrameInfo()->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+bool MSP430FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void MSP430FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+  const MSP430InstrInfo &TII =
+    *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  uint64_t StackSize = MFI->getStackSize();
+
+  uint64_t NumBytes = 0;
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - 2;
+    NumBytes = FrameSize - MSP430FI->getCalleeSavedFrameSize();
+
+    // Get the offset of the stack slot for the EBP register... which is
+    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+    // Update the frame offset adjustment.
+    MFI->setOffsetAdjustment(-NumBytes);
+
+    // Save FPW into the appropriate stack slot...
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::PUSH16r))
+      .addReg(MSP430::FPW, RegState::Kill);
+
+    // Update FPW with the new base value...
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::MOV16rr), MSP430::FPW)
+      .addReg(MSP430::SPW);
+
+    // Mark the FramePtr as live-in in every block except the entry.
+    for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+         I != E; ++I)
+      I->addLiveIn(MSP430::FPW);
+
+  } else
+    NumBytes = StackSize - MSP430FI->getCalleeSavedFrameSize();
+
+  // Skip the callee-saved push instructions.
+  while (MBBI != MBB.end() && (MBBI->getOpcode() == MSP430::PUSH16r))
+    ++MBBI;
+
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+
+  if (NumBytes) { // adjust stack pointer: SPW -= numbytes
+    // If there is an SUB16ri of SPW immediately before this instruction, merge
+    // the two.
+    //NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+    // If there is an ADD16ri or SUB16ri of SPW immediately after this
+    // instruction, merge the two instructions.
+    // mergeSPUpdatesDown(MBB, MBBI, &NumBytes);
+
+    if (NumBytes) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::SUB16ri), MSP430::SPW)
+        .addReg(MSP430::SPW).addImm(NumBytes);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  }
+}
+
+void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+  const MSP430InstrInfo &TII =
+    *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  switch (RetOpcode) {
+  case MSP430::RET:
+  case MSP430::RETI: break;  // These are ok
+  default:
+    llvm_unreachable("Can only insert epilog into returning blocks");
+  }
+
+  // Get the number of bytes to allocate from the FrameInfo
+  uint64_t StackSize = MFI->getStackSize();
+  unsigned CSSize = MSP430FI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = 0;
+
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - 2;
+    NumBytes = FrameSize - CSSize;
+
+    // pop FPW.
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::POP16r), MSP430::FPW);
+  } else
+    NumBytes = StackSize - CSSize;
+
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = prior(MBBI);
+    unsigned Opc = PI->getOpcode();
+    if (Opc != MSP430::POP16r && !PI->isTerminator())
+      break;
+    --MBBI;
+  }
+
+  DL = MBBI->getDebugLoc();
+
+  // If there is an ADD16ri or SUB16ri of SPW immediately before this
+  // instruction, merge the two instructions.
+  //if (NumBytes || MFI->hasVarSizedObjects())
+  //  mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+  if (MFI->hasVarSizedObjects()) {
+    BuildMI(MBB, MBBI, DL,
+            TII.get(MSP430::MOV16rr), MSP430::SPW).addReg(MSP430::FPW);
+    if (CSSize) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL,
+                TII.get(MSP430::SUB16ri), MSP430::SPW)
+        .addReg(MSP430::SPW).addImm(CSSize);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  } else {
+    // adjust stack pointer back: SPW += numbytes
+    if (NumBytes) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::ADD16ri), MSP430::SPW)
+        .addReg(MSP430::SPW).addImm(NumBytes);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  }
+}
+
+// FIXME: Can we eleminate these in favour of generic code?
+bool
+MSP430FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  MSP430MachineFunctionInfo *MFI = MF.getInfo<MSP430MachineFunctionInfo>();
+  MFI->setCalleeSavedFrameSize(CSI.size() * 2);
+
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    BuildMI(MBB, MI, DL, TII.get(MSP430::PUSH16r))
+      .addReg(Reg, RegState::Kill);
+  }
+  return true;
+}
+
+bool
+MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                 MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+    BuildMI(MBB, MI, DL, TII.get(MSP430::POP16r), CSI[i].getReg());
+
+  return true;
+}
+
+void MSP430FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const MSP430InstrInfo &TII =
+    *static_cast<const MSP430InstrInfo*>(MF.getTarget().getInstrInfo());
+  unsigned StackAlign = getStackAlignment();
+
+  if (!hasReservedCallFrame(MF)) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackup instruction into a 'sub SPW, <amt>' and the
+    // adjcallstackdown instruction into 'add SPW, <amt>'
+    // TODO: consider using push / pop instead of sub + store / add
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
+
+      MachineInstr *New = 0;
+      if (Old->getOpcode() == TII.getCallFrameSetupOpcode()) {
+        New = BuildMI(MF, Old->getDebugLoc(),
+                      TII.get(MSP430::SUB16ri), MSP430::SPW)
+          .addReg(MSP430::SPW).addImm(Amount);
+      } else {
+        assert(Old->getOpcode() == TII.getCallFrameDestroyOpcode());
+        // factor out the amount the callee already popped.
+        uint64_t CalleeAmt = Old->getOperand(1).getImm();
+        Amount -= CalleeAmt;
+        if (Amount)
+          New = BuildMI(MF, Old->getDebugLoc(),
+                        TII.get(MSP430::ADD16ri), MSP430::SPW)
+            .addReg(MSP430::SPW).addImm(Amount);
+      }
+
+      if (New) {
+        // The SRW implicit def is dead.
+        New->getOperand(3).setIsDead();
+
+        // Replace the pseudo instruction with a new instruction...
+        MBB.insert(I, New);
+      }
+    }
+  } else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.
+    if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+      MachineInstr *Old = I;
+      MachineInstr *New =
+        BuildMI(MF, Old->getDebugLoc(), TII.get(MSP430::SUB16ri),
+                MSP430::SPW).addReg(MSP430::SPW).addImm(CalleeAmt);
+      // The SRW implicit def is dead.
+      New->getOperand(3).setIsDead();
+
+      MBB.insert(I, New);
+    }
+  }
+
+  MBB.erase(I);
+}
+
+void
+MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                         RegScavenger *) const {
+  // Create a frame entry for the FPW register that must be saved.
+  if (hasFP(MF)) {
+    int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true);
+    (void)FrameIdx;
+    assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+           "Slot for FPW register must be last in order to be found!");
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h
new file mode 100644
index 000000000000..c673f59b5efc
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.h
@@ -0,0 +1,59 @@
+//==- MSP430FrameLowering.h - Define frame lowering for MSP430 --*- C++ -*--==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430_FRAMEINFO_H
+#define MSP430_FRAMEINFO_H
+
+#include "MSP430.h"
+#include "MSP430Subtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class MSP430Subtarget;
+
+class MSP430FrameLowering : public TargetFrameLowering {
+protected:
+  const MSP430Subtarget &STI;
+
+public:
+  explicit MSP430FrameLowering(const MSP430Subtarget &sti)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), STI(sti) {
+  }
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = NULL) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..1566c096037e
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -0,0 +1,492 @@
+//===-- MSP430ISelDAGToDAG.cpp - A dag to dag inst selector for MSP430 ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+namespace {
+  struct MSP430ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    struct {            // This is really a union, discriminated by BaseType!
+      SDValue Reg;
+      int FrameIndex;
+    } Base;
+
+    int16_t Disp;
+    const GlobalValue *GV;
+    const Constant *CP;
+    const BlockAddress *BlockAddr;
+    const char *ES;
+    int JT;
+    unsigned Align;    // CP alignment.
+
+    MSP430ISelAddressMode()
+      : BaseType(RegBase), Disp(0), GV(0), CP(0), BlockAddr(0),
+        ES(0), JT(-1), Align(0) {
+    }
+
+    bool hasSymbolicDisplacement() const {
+      return GV != 0 || CP != 0 || ES != 0 || JT != -1;
+    }
+
+    void dump() {
+      errs() << "MSP430ISelAddressMode " << this << '\n';
+      if (BaseType == RegBase && Base.Reg.getNode() != 0) {
+        errs() << "Base.Reg ";
+        Base.Reg.getNode()->dump();
+      } else if (BaseType == FrameIndexBase) {
+        errs() << " Base.FrameIndex " << Base.FrameIndex << '\n';
+      }
+      errs() << " Disp " << Disp << '\n';
+      if (GV) {
+        errs() << "GV ";
+        GV->dump();
+      } else if (CP) {
+        errs() << " CP ";
+        CP->dump();
+        errs() << " Align" << Align << '\n';
+      } else if (ES) {
+        errs() << "ES ";
+        errs() << ES << '\n';
+      } else if (JT != -1)
+        errs() << " JT" << JT << " Align" << Align << '\n';
+    }
+  };
+}
+
+/// MSP430DAGToDAGISel - MSP430 specific code to select MSP430 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+  class MSP430DAGToDAGISel : public SelectionDAGISel {
+    const MSP430TargetLowering &Lowering;
+    const MSP430Subtarget &Subtarget;
+
+  public:
+    MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel),
+        Lowering(*TM.getTargetLowering()),
+        Subtarget(*TM.getSubtargetImpl()) { }
+
+    virtual const char *getPassName() const {
+      return "MSP430 DAG->DAG Pattern Instruction Selection";
+    }
+
+    bool MatchAddress(SDValue N, MSP430ISelAddressMode &AM);
+    bool MatchWrapper(SDValue N, MSP430ISelAddressMode &AM);
+    bool MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM);
+
+    virtual bool
+    SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                                 std::vector<SDValue> &OutOps);
+
+    // Include the pieces autogenerated from the target description.
+  #include "MSP430GenDAGISel.inc"
+
+  private:
+    SDNode *Select(SDNode *N);
+    SDNode *SelectIndexedLoad(SDNode *Op);
+    SDNode *SelectIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
+                               unsigned Opc8, unsigned Opc16);
+
+    bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Disp);
+  };
+}  // end anonymous namespace
+
+/// createMSP430ISelDag - This pass converts a legalized DAG into a
+/// MSP430-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createMSP430ISelDag(MSP430TargetMachine &TM,
+                                        CodeGenOpt::Level OptLevel) {
+  return new MSP430DAGToDAGISel(TM, OptLevel);
+}
+
+
+/// MatchWrapper - Try to match MSP430ISD::Wrapper node into an addressing mode.
+/// These wrap things that will resolve down into a symbol reference.  If no
+/// match is possible, this returns true, otherwise it returns false.
+bool MSP430DAGToDAGISel::MatchWrapper(SDValue N, MSP430ISelAddressMode &AM) {
+  // If the addressing mode already has a symbol as the displacement, we can
+  // never match another symbol.
+  if (AM.hasSymbolicDisplacement())
+    return true;
+
+  SDValue N0 = N.getOperand(0);
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+    AM.GV = G->getGlobal();
+    AM.Disp += G->getOffset();
+    //AM.SymbolFlags = G->getTargetFlags();
+  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+    AM.CP = CP->getConstVal();
+    AM.Align = CP->getAlignment();
+    AM.Disp += CP->getOffset();
+    //AM.SymbolFlags = CP->getTargetFlags();
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+    AM.ES = S->getSymbol();
+    //AM.SymbolFlags = S->getTargetFlags();
+  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+    AM.JT = J->getIndex();
+    //AM.SymbolFlags = J->getTargetFlags();
+  } else {
+    AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
+    //AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
+  }
+  return false;
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool MSP430DAGToDAGISel::MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM) {
+  // Is the base register already occupied?
+  if (AM.BaseType != MSP430ISelAddressMode::RegBase || AM.Base.Reg.getNode()) {
+    // If so, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = MSP430ISelAddressMode::RegBase;
+  AM.Base.Reg = N;
+  return false;
+}
+
+bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) {
+  DEBUG(errs() << "MatchAddress: "; AM.dump());
+
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    AM.Disp += Val;
+    return false;
+  }
+
+  case MSP430ISD::Wrapper:
+    if (!MatchWrapper(N, AM))
+      return false;
+    break;
+
+  case ISD::FrameIndex:
+    if (AM.BaseType == MSP430ISelAddressMode::RegBase
+        && AM.Base.Reg.getNode() == 0) {
+      AM.BaseType = MSP430ISelAddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::ADD: {
+    MSP430ISelAddressMode Backup = AM;
+    if (!MatchAddress(N.getNode()->getOperand(0), AM) &&
+        !MatchAddress(N.getNode()->getOperand(1), AM))
+      return false;
+    AM = Backup;
+    if (!MatchAddress(N.getNode()->getOperand(1), AM) &&
+        !MatchAddress(N.getNode()->getOperand(0), AM))
+      return false;
+    AM = Backup;
+
+    break;
+  }
+
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      MSP430ISelAddressMode Backup = AM;
+      uint64_t Offset = CN->getSExtValue();
+      // Start with the LHS as an addr mode.
+      if (!MatchAddress(N.getOperand(0), AM) &&
+          // Address could not have picked a GV address for the displacement.
+          AM.GV == NULL &&
+          // Check to see if the LHS & C is zero.
+          CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+        AM.Disp += Offset;
+        return false;
+      }
+      AM = Backup;
+    }
+    break;
+  }
+
+  return MatchAddressBase(N, AM);
+}
+
+/// SelectAddr - returns true if it is able pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
+                                    SDValue &Base, SDValue &Disp) {
+  MSP430ISelAddressMode AM;
+
+  if (MatchAddress(N, AM))
+    return false;
+
+  EVT VT = N.getValueType();
+  if (AM.BaseType == MSP430ISelAddressMode::RegBase) {
+    if (!AM.Base.Reg.getNode())
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  Base  = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) ?
+    CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+    AM.Base.Reg;
+
+  if (AM.GV)
+    Disp = CurDAG->getTargetGlobalAddress(AM.GV, N->getDebugLoc(),
+                                          MVT::i16, AM.Disp,
+                                          0/*AM.SymbolFlags*/);
+  else if (AM.CP)
+    Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i16,
+                                         AM.Align, AM.Disp, 0/*AM.SymbolFlags*/);
+  else if (AM.ES)
+    Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.JT != -1)
+    Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.BlockAddr)
+    Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, 0,
+                                         0/*AM.SymbolFlags*/);
+  else
+    Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i16);
+
+  return true;
+}
+
+bool MSP430DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default: return true;
+  case 'm':   // memory
+    if (!SelectAddr(Op, Op0, Op1))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+static bool isValidIndexedLoad(const LoadSDNode *LD) {
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM != ISD::POST_INC || LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  EVT VT = LD->getMemoryVT();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    // Sanity check
+    if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 1)
+      return false;
+
+    break;
+  case MVT::i16:
+    // Sanity check
+    if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 2)
+      return false;
+
+    break;
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (!isValidIndexedLoad(LD))
+    return NULL;
+
+  MVT VT = LD->getMemoryVT().getSimpleVT();
+
+  unsigned Opcode = 0;
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+    Opcode = MSP430::MOV8rm_POST;
+    break;
+  case MVT::i16:
+    Opcode = MSP430::MOV16rm_POST;
+    break;
+  default:
+    return NULL;
+  }
+
+   return CurDAG->getMachineNode(Opcode, N->getDebugLoc(),
+                                 VT, MVT::i16, MVT::Other,
+                                 LD->getBasePtr(), LD->getChain());
+}
+
+SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
+                                               SDValue N1, SDValue N2,
+                                               unsigned Opc8, unsigned Opc16) {
+  if (N1.getOpcode() == ISD::LOAD &&
+      N1.hasOneUse() &&
+      IsLegalToFold(N1, Op, Op, OptLevel)) {
+    LoadSDNode *LD = cast<LoadSDNode>(N1);
+    if (!isValidIndexedLoad(LD))
+      return NULL;
+
+    MVT VT = LD->getMemoryVT().getSimpleVT();
+    unsigned Opc = (VT == MVT::i16 ? Opc16 : Opc8);
+    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+    MemRefs0[0] = cast<MemSDNode>(N1)->getMemOperand();
+    SDValue Ops0[] = { N2, LD->getBasePtr(), LD->getChain() };
+    SDNode *ResNode =
+      CurDAG->SelectNodeTo(Op, Opc,
+                           VT, MVT::i16, MVT::Other,
+                           Ops0, 3);
+    cast<MachineSDNode>(ResNode)->setMemRefs(MemRefs0, MemRefs0 + 1);
+    // Transfer chain.
+    ReplaceUses(SDValue(N1.getNode(), 2), SDValue(ResNode, 2));
+    // Transfer writeback.
+    ReplaceUses(SDValue(N1.getNode(), 1), SDValue(ResNode, 1));
+    return ResNode;
+  }
+
+  return NULL;
+}
+
+
+SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
+  DebugLoc dl = Node->getDebugLoc();
+
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== ";
+          Node->dump(CurDAG);
+          errs() << "\n");
+    return NULL;
+  }
+
+  // Few custom selection stuff.
+  switch (Node->getOpcode()) {
+  default: break;
+  case ISD::FrameIndex: {
+    assert(Node->getValueType(0) == MVT::i16);
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
+    if (Node->hasOneUse())
+      return CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16,
+                                  TFI, CurDAG->getTargetConstant(0, MVT::i16));
+    return CurDAG->getMachineNode(MSP430::ADD16ri, dl, MVT::i16,
+                                  TFI, CurDAG->getTargetConstant(0, MVT::i16));
+  }
+  case ISD::LOAD:
+    if (SDNode *ResNode = SelectIndexedLoad(Node))
+      return ResNode;
+    // Other cases are autogenerated.
+    break;
+  case ISD::ADD:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::SUB:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::SUB8rm_POST, MSP430::SUB16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::AND:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::OR:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::XOR:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  }
+
+  // Select the default instruction
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(errs() << "=> ");
+  if (ResNode == NULL || ResNode == Node)
+    DEBUG(Node->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  return ResNode;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
new file mode 100644
index 000000000000..09cdf3268553
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -0,0 +1,1247 @@
+//===-- MSP430ISelLowering.cpp - MSP430 DAG Lowering Implementation  ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msp430-lower"
+
+#include "MSP430ISelLowering.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430Subtarget.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+typedef enum {
+  NoHWMult,
+  HWMultIntr,
+  HWMultNoIntr
+} HWMultUseMode;
+
+static cl::opt<HWMultUseMode>
+HWMultMode("msp430-hwmult-mode",
+           cl::desc("Hardware multiplier use mode"),
+           cl::init(HWMultNoIntr),
+           cl::values(
+             clEnumValN(NoHWMult, "no",
+                "Do not use hardware multiplier"),
+             clEnumValN(HWMultIntr, "interrupts",
+                "Assume hardware multiplier can be used inside interrupts"),
+             clEnumValN(HWMultNoIntr, "use",
+                "Assume hardware multiplier cannot be used inside interrupts"),
+             clEnumValEnd));
+
+MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
+  TargetLowering(tm, new TargetLoweringObjectFileELF()),
+  Subtarget(*tm.getSubtargetImpl()) {
+
+  TD = getDataLayout();
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i8,  &MSP430::GR8RegClass);
+  addRegisterClass(MVT::i16, &MSP430::GR16RegClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+
+  // Provide all sorts of operation actions
+
+  // Division is expensive
+  setIntDivIsCheap(false);
+
+  setStackPointerRegisterToSaveRestore(MSP430::SPW);
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // We have post-incremented loads / stores.
+  setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
+
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8,  Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
+
+  // We don't have any truncstores
+  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+  setOperationAction(ISD::SRA,              MVT::i8,    Custom);
+  setOperationAction(ISD::SHL,              MVT::i8,    Custom);
+  setOperationAction(ISD::SRL,              MVT::i8,    Custom);
+  setOperationAction(ISD::SRA,              MVT::i16,   Custom);
+  setOperationAction(ISD::SHL,              MVT::i16,   Custom);
+  setOperationAction(ISD::SRL,              MVT::i16,   Custom);
+  setOperationAction(ISD::ROTL,             MVT::i8,    Expand);
+  setOperationAction(ISD::ROTR,             MVT::i8,    Expand);
+  setOperationAction(ISD::ROTL,             MVT::i16,   Expand);
+  setOperationAction(ISD::ROTR,             MVT::i16,   Expand);
+  setOperationAction(ISD::GlobalAddress,    MVT::i16,   Custom);
+  setOperationAction(ISD::ExternalSymbol,   MVT::i16,   Custom);
+  setOperationAction(ISD::BlockAddress,     MVT::i16,   Custom);
+  setOperationAction(ISD::BR_JT,            MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,            MVT::i8,    Custom);
+  setOperationAction(ISD::BR_CC,            MVT::i16,   Custom);
+  setOperationAction(ISD::BRCOND,           MVT::Other, Expand);
+  setOperationAction(ISD::SETCC,            MVT::i8,    Custom);
+  setOperationAction(ISD::SETCC,            MVT::i16,   Custom);
+  setOperationAction(ISD::SELECT,           MVT::i8,    Expand);
+  setOperationAction(ISD::SELECT,           MVT::i16,   Expand);
+  setOperationAction(ISD::SELECT_CC,        MVT::i8,    Custom);
+  setOperationAction(ISD::SELECT_CC,        MVT::i16,   Custom);
+  setOperationAction(ISD::SIGN_EXTEND,      MVT::i16,   Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
+
+  setOperationAction(ISD::CTTZ,             MVT::i8,    Expand);
+  setOperationAction(ISD::CTTZ,             MVT::i16,   Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::i8,    Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::i16,   Expand);
+  setOperationAction(ISD::CTLZ,             MVT::i8,    Expand);
+  setOperationAction(ISD::CTLZ,             MVT::i16,   Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::i8,    Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF,  MVT::i16,   Expand);
+  setOperationAction(ISD::CTPOP,            MVT::i8,    Expand);
+  setOperationAction(ISD::CTPOP,            MVT::i16,   Expand);
+
+  setOperationAction(ISD::SHL_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SHL_PARTS,        MVT::i16,   Expand);
+  setOperationAction(ISD::SRL_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SRL_PARTS,        MVT::i16,   Expand);
+  setOperationAction(ISD::SRA_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SRA_PARTS,        MVT::i16,   Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,   Expand);
+
+  // FIXME: Implement efficiently multiplication by a constant
+  setOperationAction(ISD::MUL,              MVT::i8,    Expand);
+  setOperationAction(ISD::MULHS,            MVT::i8,    Expand);
+  setOperationAction(ISD::MULHU,            MVT::i8,    Expand);
+  setOperationAction(ISD::SMUL_LOHI,        MVT::i8,    Expand);
+  setOperationAction(ISD::UMUL_LOHI,        MVT::i8,    Expand);
+  setOperationAction(ISD::MUL,              MVT::i16,   Expand);
+  setOperationAction(ISD::MULHS,            MVT::i16,   Expand);
+  setOperationAction(ISD::MULHU,            MVT::i16,   Expand);
+  setOperationAction(ISD::SMUL_LOHI,        MVT::i16,   Expand);
+  setOperationAction(ISD::UMUL_LOHI,        MVT::i16,   Expand);
+
+  setOperationAction(ISD::UDIV,             MVT::i8,    Expand);
+  setOperationAction(ISD::UDIVREM,          MVT::i8,    Expand);
+  setOperationAction(ISD::UREM,             MVT::i8,    Expand);
+  setOperationAction(ISD::SDIV,             MVT::i8,    Expand);
+  setOperationAction(ISD::SDIVREM,          MVT::i8,    Expand);
+  setOperationAction(ISD::SREM,             MVT::i8,    Expand);
+  setOperationAction(ISD::UDIV,             MVT::i16,   Expand);
+  setOperationAction(ISD::UDIVREM,          MVT::i16,   Expand);
+  setOperationAction(ISD::UREM,             MVT::i16,   Expand);
+  setOperationAction(ISD::SDIV,             MVT::i16,   Expand);
+  setOperationAction(ISD::SDIVREM,          MVT::i16,   Expand);
+  setOperationAction(ISD::SREM,             MVT::i16,   Expand);
+
+  // varargs support
+  setOperationAction(ISD::VASTART,          MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,            MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,            MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,           MVT::Other, Expand);
+
+  // Libcalls names.
+  if (HWMultMode == HWMultIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
+  } else if (HWMultMode == HWMultNoIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw_noint");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+  }
+
+  setMinFunctionAlignment(1);
+  setPrefFunctionAlignment(2);
+}
+
+SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::SHL: // FALLTHROUGH
+  case ISD::SRL:
+  case ISD::SRA:              return LowerShifts(Op, DAG);
+  case ISD::GlobalAddress:    return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:     return LowerBlockAddress(Op, DAG);
+  case ISD::ExternalSymbol:   return LowerExternalSymbol(Op, DAG);
+  case ISD::SETCC:            return LowerSETCC(Op, DAG);
+  case ISD::BR_CC:            return LowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC:        return LowerSELECT_CC(Op, DAG);
+  case ISD::SIGN_EXTEND:      return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::RETURNADDR:       return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:        return LowerFRAMEADDR(Op, DAG);
+  case ISD::VASTART:          return LowerVASTART(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                       MSP430 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+TargetLowering::ConstraintType
+MSP430TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      return C_RegisterClass;
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+MSP430TargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+                             EVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+    case 'r':   // GENERAL_REGS
+      if (VT == MVT::i8)
+        return std::make_pair(0U, &MSP430::GR8RegClass);
+
+      return std::make_pair(0U, &MSP430::GR16RegClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "MSP430GenCallingConv.inc"
+
+SDValue
+MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
+                                           CallingConv::ID CallConv,
+                                           bool isVarArg,
+                                           const SmallVectorImpl<ISD::InputArg>
+                                             &Ins,
+                                           DebugLoc dl,
+                                           SelectionDAG &DAG,
+                                           SmallVectorImpl<SDValue> &InVals)
+                                             const {
+
+  switch (CallConv) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+  case CallingConv::MSP430_INTR:
+    if (Ins.empty())
+      return Chain;
+    report_fatal_error("ISRs cannot have arguments");
+  }
+}
+
+SDValue
+MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // MSP430 target does not yet support tail call optimization.
+  isTailCall = false;
+
+  switch (CallConv) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::Fast:
+  case CallingConv::C:
+    return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+                          Outs, OutVals, Ins, dl, DAG, InVals);
+  case CallingConv::MSP430_INTR:
+    report_fatal_error("ISRs cannot be called directly");
+  }
+}
+
+/// LowerCCCArguments - transform physical registers into virtual registers and
+/// generate load operations for arguments places on the stack.
+// FIXME: struct return stuff
+SDValue
+MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
+                                        CallingConv::ID CallConv,
+                                        bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        DebugLoc dl,
+                                        SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430);
+
+  // Create frame index for the start of the first vararg value
+  if (isVarArg) {
+    unsigned Offset = CCInfo.getNextStackOffset();
+    FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, Offset, true));
+  }
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default:
+        {
+#ifndef NDEBUG
+          errs() << "LowerFormalArguments Unhandled argument type: "
+               << RegVT.getSimpleVT().SimpleTy << "\n";
+#endif
+          llvm_unreachable(0);
+        }
+      case MVT::i16:
+        unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+
+        // If this is an 8-bit value, it is really passed promoted to 16
+        // bits. Insert an assert[sz]ext to capture this, then truncate to the
+        // right size.
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+
+        if (VA.getLocInfo() != CCValAssign::Full)
+          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+        InVals.push_back(ArgValue);
+      }
+    } else {
+      // Sanity check
+      assert(VA.isMemLoc());
+
+      SDValue InVal;
+      ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+      if (Flags.isByVal()) {
+        int FI = MFI->CreateFixedObject(Flags.getByValSize(),
+                                        VA.getLocMemOffset(), true);
+        InVal = DAG.getFrameIndex(FI, getPointerTy());
+      } else {
+        // Load the argument to a virtual register
+        unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+        if (ObjSize > 2) {
+            errs() << "LowerFormalArguments Unhandled argument type: "
+                << EVT(VA.getLocVT()).getEVTString()
+                << "\n";
+        }
+        // Create the frame index object for this incoming parameter...
+        int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset(), true);
+
+        // Create the SelectionDAG nodes corresponding to a load
+        //from this parameter
+        SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
+        InVal = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                            MachinePointerInfo::getFixedStack(FI),
+                            false, false, false, 0);
+      }
+
+      InVals.push_back(InVal);
+    }
+  }
+
+  return Chain;
+}
+
+SDValue
+MSP430TargetLowering::LowerReturn(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  const SmallVectorImpl<SDValue> &OutVals,
+                                  DebugLoc dl, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // ISRs cannot return any value.
+  if (CallConv == CallingConv::MSP430_INTR && !Outs.empty())
+    report_fatal_error("ISRs cannot return any value");
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analize return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_MSP430);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together,
+    // avoiding something bad.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  unsigned Opc = (CallConv == CallingConv::MSP430_INTR ?
+                  MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG);
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(Opc, dl, MVT::Other, &RetOps[0], RetOps.size());
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+/// TODO: sret.
+SDValue
+MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                                     CallingConv::ID CallConv, bool isVarArg,
+                                     bool isTailCall,
+                                     const SmallVectorImpl<ISD::OutputArg>
+                                       &Outs,
+                                     const SmallVectorImpl<SDValue> &OutVals,
+                                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                                     DebugLoc dl, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &InVals) const {
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_MSP430);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain ,DAG.getConstant(NumBytes,
+                                                      getPointerTy(), true));
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    SDValue Arg = OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      if (StackPtr.getNode() == 0)
+        StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SPW, getPointerTy());
+
+      SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   StackPtr,
+                                   DAG.getIntPtrConstant(VA.getLocMemOffset()));
+
+      SDValue MemOp;
+      ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+      if (Flags.isByVal()) {
+        SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i16);
+        MemOp = DAG.getMemcpy(Chain, dl, PtrOff, Arg, SizeNode,
+                              Flags.getByValAlign(),
+                              /*isVolatile*/false,
+                              /*AlwaysInline=*/true,
+                              MachinePointerInfo(),
+                              MachinePointerInfo());
+      } else {
+        MemOp = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo(),
+                             false, false, 0);
+      }
+
+      MemOpChains.push_back(MemOp);
+    }
+  }
+
+  // Transform all store nodes into one single node because all store nodes are
+  // independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag in
+  // necessary since all emitted instructions must be stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i16);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i16);
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, getPointerTy(), true),
+                             DAG.getConstant(0, getPointerTy(), true),
+                             InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl,
+                         DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  unsigned Opc = Op.getOpcode();
+  SDNode* N = Op.getNode();
+  EVT VT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Expand non-constant shifts to loops:
+  if (!isa<ConstantSDNode>(N->getOperand(1)))
+    switch (Opc) {
+    default: llvm_unreachable("Invalid shift opcode!");
+    case ISD::SHL:
+      return DAG.getNode(MSP430ISD::SHL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRA:
+      return DAG.getNode(MSP430ISD::SRA, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRL:
+      return DAG.getNode(MSP430ISD::SRL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    }
+
+  uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+
+  // Expand the stuff into sequence of shifts.
+  // FIXME: for some shift amounts this might be done better!
+  // E.g.: foo >> (8 + N) => sxt(swpb(foo)) >> N
+  SDValue Victim = N->getOperand(0);
+
+  if (Opc == ISD::SRL && ShiftAmount) {
+    // Emit a special goodness here:
+    // srl A, 1 => clrc; rrc A
+    Victim = DAG.getNode(MSP430ISD::RRC, dl, VT, Victim);
+    ShiftAmount -= 1;
+  }
+
+  while (ShiftAmount--)
+    Victim = DAG.getNode((Opc == ISD::SHL ? MSP430ISD::RLA : MSP430ISD::RRA),
+                         dl, VT, Victim);
+
+  return Victim;
+}
+
+SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+  // Create the TargetGlobalAddress node, folding in the constant offset.
+  SDValue Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+                                              getPointerTy(), Offset);
+  return DAG.getNode(MSP430ISD::Wrapper, Op.getDebugLoc(),
+                     getPointerTy(), Result);
+}
+
+SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+
+  return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
+}
+
+SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
+
+  return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
+}
+
+static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
+                       ISD::CondCode CC,
+                       DebugLoc dl, SelectionDAG &DAG) {
+  // FIXME: Handle bittests someday
+  assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
+
+  // FIXME: Handle jump negative someday
+  MSP430CC::CondCodes TCC = MSP430CC::COND_INVALID;
+  switch (CC) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case ISD::SETEQ:
+    TCC = MSP430CC::COND_E;     // aka COND_Z
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETNE:
+    TCC = MSP430CC::COND_NE;    // aka COND_NZ
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETULE:
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETUGE:
+    // Turn lhs u>= rhs with lhs constant into rhs u< lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_LO;
+      break;
+    }
+    TCC = MSP430CC::COND_HS;    // aka COND_C
+    break;
+  case ISD::SETUGT:
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETULT:
+    // Turn lhs u< rhs with lhs constant into rhs u>= lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_HS;
+      break;
+    }
+    TCC = MSP430CC::COND_LO;    // aka COND_NC
+    break;
+  case ISD::SETLE:
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETGE:
+    // Turn lhs >= rhs with lhs constant into rhs < lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_L;
+      break;
+    }
+    TCC = MSP430CC::COND_GE;
+    break;
+  case ISD::SETGT:
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETLT:
+    // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_GE;
+      break;
+    }
+    TCC = MSP430CC::COND_L;
+    break;
+  }
+
+  TargetCC = DAG.getConstant(TCC, MVT::i8);
+  return DAG.getNode(MSP430ISD::CMP, dl, MVT::Glue, LHS, RHS);
+}
+
+
+SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS   = Op.getOperand(2);
+  SDValue RHS   = Op.getOperand(3);
+  SDValue Dest  = Op.getOperand(4);
+  DebugLoc dl   = Op.getDebugLoc();
+
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  return DAG.getNode(MSP430ISD::BR_CC, dl, Op.getValueType(),
+                     Chain, Dest, TargetCC, Flag);
+}
+
+SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS   = Op.getOperand(0);
+  SDValue RHS   = Op.getOperand(1);
+  DebugLoc dl   = Op.getDebugLoc();
+
+  // If we are doing an AND and testing against zero, then the CMP
+  // will not be generated.  The AND (or BIT) will generate the condition codes,
+  // but they are different from CMP.
+  // FIXME: since we're doing a post-processing, use a pseudoinstr here, so
+  // lowering & isel wouldn't diverge.
+  bool andCC = false;
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+    if (RHSC->isNullValue() && LHS.hasOneUse() &&
+        (LHS.getOpcode() == ISD::AND ||
+         (LHS.getOpcode() == ISD::TRUNCATE &&
+          LHS.getOperand(0).getOpcode() == ISD::AND))) {
+      andCC = true;
+    }
+  }
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  // Get the condition codes directly from the status register, if its easy.
+  // Otherwise a branch will be generated.  Note that the AND and BIT
+  // instructions generate different flags than CMP, the carry bit can be used
+  // for NE/EQ.
+  bool Invert = false;
+  bool Shift = false;
+  bool Convert = true;
+  switch (cast<ConstantSDNode>(TargetCC)->getZExtValue()) {
+   default:
+    Convert = false;
+    break;
+   case MSP430CC::COND_HS:
+     // Res = SRW & 1, no processing is required
+     break;
+   case MSP430CC::COND_LO:
+     // Res = ~(SRW & 1)
+     Invert = true;
+     break;
+   case MSP430CC::COND_NE:
+     if (andCC) {
+       // C = ~Z, thus Res = SRW & 1, no processing is required
+     } else {
+       // Res = ~((SRW >> 1) & 1)
+       Shift = true;
+       Invert = true;
+     }
+     break;
+   case MSP430CC::COND_E:
+     Shift = true;
+     // C = ~Z for AND instruction, thus we can put Res = ~(SRW & 1), however,
+     // Res = (SRW >> 1) & 1 is 1 word shorter.
+     break;
+  }
+  EVT VT = Op.getValueType();
+  SDValue One  = DAG.getConstant(1, VT);
+  if (Convert) {
+    SDValue SR = DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::SRW,
+                                    MVT::i16, Flag);
+    if (Shift)
+      // FIXME: somewhere this is turned into a SRL, lower it MSP specific?
+      SR = DAG.getNode(ISD::SRA, dl, MVT::i16, SR, One);
+    SR = DAG.getNode(ISD::AND, dl, MVT::i16, SR, One);
+    if (Invert)
+      SR = DAG.getNode(ISD::XOR, dl, MVT::i16, SR, One);
+    return SR;
+  } else {
+    SDValue Zero = DAG.getConstant(0, VT);
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+    SmallVector<SDValue, 4> Ops;
+    Ops.push_back(One);
+    Ops.push_back(Zero);
+    Ops.push_back(TargetCC);
+    Ops.push_back(Flag);
+    return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size());
+  }
+}
+
+SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDValue LHS    = Op.getOperand(0);
+  SDValue RHS    = Op.getOperand(1);
+  SDValue TrueV  = Op.getOperand(2);
+  SDValue FalseV = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  DebugLoc dl    = Op.getDebugLoc();
+
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SmallVector<SDValue, 4> Ops;
+  Ops.push_back(TrueV);
+  Ops.push_back(FalseV);
+  Ops.push_back(TargetCC);
+  Ops.push_back(Flag);
+
+  return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size());
+}
+
+SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Val = Op.getOperand(0);
+  EVT VT      = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  assert(VT == MVT::i16 && "Only support i16 for now!");
+
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT,
+                     DAG.getNode(ISD::ANY_EXTEND, dl, VT, Val),
+                     DAG.getValueType(Val.getValueType()));
+}
+
+SDValue
+MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+  int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+  if (ReturnAddrIndex == 0) {
+    // Set up a frame object for the return address.
+    uint64_t SlotSize = TD->getPointerSize();
+    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+                                                           true);
+    FuncInfo->setRAIndex(ReturnAddrIndex);
+  }
+
+  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset =
+      DAG.getConstant(TD->getPointerSize(), MVT::i16);
+    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Just load the return address.
+  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                         MSP430::FPW, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
+SDValue MSP430TargetLowering::LowerVASTART(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+
+  // Frame index of first vararg argument
+  SDValue FrameIndex = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                         getPointerTy());
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  // Create a store of the frame index to the location operand
+  return DAG.getStore(Op.getOperand(0), Op.getDebugLoc(), FrameIndex,
+                      Op.getOperand(1), MachinePointerInfo(SV),
+                      false, false, 0);
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool MSP430TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                      SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  EVT VT = LD->getMemoryVT();
+  if (VT != MVT::i8 && VT != MVT::i16)
+    return false;
+
+  if (Op->getOpcode() != ISD::ADD)
+    return false;
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+    uint64_t RHSC = RHS->getZExtValue();
+    if ((VT == MVT::i16 && RHSC != 2) ||
+        (VT == MVT::i8 && RHSC != 1))
+      return false;
+
+    Base = Op->getOperand(0);
+    Offset = DAG.getConstant(RHSC, VT);
+    AM = ISD::POST_INC;
+    return true;
+  }
+
+  return false;
+}
+
+
+const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return NULL;
+  case MSP430ISD::RET_FLAG:           return "MSP430ISD::RET_FLAG";
+  case MSP430ISD::RETI_FLAG:          return "MSP430ISD::RETI_FLAG";
+  case MSP430ISD::RRA:                return "MSP430ISD::RRA";
+  case MSP430ISD::RLA:                return "MSP430ISD::RLA";
+  case MSP430ISD::RRC:                return "MSP430ISD::RRC";
+  case MSP430ISD::CALL:               return "MSP430ISD::CALL";
+  case MSP430ISD::Wrapper:            return "MSP430ISD::Wrapper";
+  case MSP430ISD::BR_CC:              return "MSP430ISD::BR_CC";
+  case MSP430ISD::CMP:                return "MSP430ISD::CMP";
+  case MSP430ISD::SELECT_CC:          return "MSP430ISD::SELECT_CC";
+  case MSP430ISD::SHL:                return "MSP430ISD::SHL";
+  case MSP430ISD::SRA:                return "MSP430ISD::SRA";
+  }
+}
+
+bool MSP430TargetLowering::isTruncateFree(Type *Ty1,
+                                          Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  return (Ty1->getPrimitiveSizeInBits() > Ty2->getPrimitiveSizeInBits());
+}
+
+bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+
+  return (VT1.getSizeInBits() > VT2.getSizeInBits());
+}
+
+bool MSP430TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  return 0 && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16);
+}
+
+bool MSP430TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  return 0 && VT1 == MVT::i8 && VT2 == MVT::i16;
+}
+
+bool MSP430TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  return isZExtFree(Val.getValueType(), VT2);
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock*
+MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const {
+  MachineFunction *F = BB->getParent();
+  MachineRegisterInfo &RI = F->getRegInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+
+  unsigned Opc;
+  const TargetRegisterClass * RC;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Invalid shift opcode!");
+  case MSP430::Shl8:
+   Opc = MSP430::SHL8r1;
+   RC = &MSP430::GR8RegClass;
+   break;
+  case MSP430::Shl16:
+   Opc = MSP430::SHL16r1;
+   RC = &MSP430::GR16RegClass;
+   break;
+  case MSP430::Sra8:
+   Opc = MSP430::SAR8r1;
+   RC = &MSP430::GR8RegClass;
+   break;
+  case MSP430::Sra16:
+   Opc = MSP430::SAR16r1;
+   RC = &MSP430::GR16RegClass;
+   break;
+  case MSP430::Srl8:
+   Opc = MSP430::SAR8r1c;
+   RC = &MSP430::GR8RegClass;
+   break;
+  case MSP430::Srl16:
+   Opc = MSP430::SAR16r1c;
+   RC = &MSP430::GR16RegClass;
+   break;
+  }
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB;
+  ++I;
+
+  // Create loop block
+  MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *RemBB  = F->CreateMachineBasicBlock(LLVM_BB);
+
+  F->insert(I, LoopBB);
+  F->insert(I, RemBB);
+
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the block containing instructions after shift.
+  RemBB->splice(RemBB->begin(), BB,
+                llvm::next(MachineBasicBlock::iterator(MI)),
+                BB->end());
+  RemBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
+  BB->addSuccessor(LoopBB);
+  BB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  unsigned ShiftAmtReg = RI.createVirtualRegister(&MSP430::GR8RegClass);
+  unsigned ShiftAmtReg2 = RI.createVirtualRegister(&MSP430::GR8RegClass);
+  unsigned ShiftReg = RI.createVirtualRegister(RC);
+  unsigned ShiftReg2 = RI.createVirtualRegister(RC);
+  unsigned ShiftAmtSrcReg = MI->getOperand(2).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  unsigned DstReg = MI->getOperand(0).getReg();
+
+  // BB:
+  // cmp 0, N
+  // je RemBB
+  BuildMI(BB, dl, TII.get(MSP430::CMP8ri))
+    .addReg(ShiftAmtSrcReg).addImm(0);
+  BuildMI(BB, dl, TII.get(MSP430::JCC))
+    .addMBB(RemBB)
+    .addImm(MSP430CC::COND_E);
+
+  // LoopBB:
+  // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+  // ShiftAmt = phi [%N, BB],      [%ShiftAmt2, LoopBB]
+  // ShiftReg2 = shift ShiftReg
+  // ShiftAmt2 = ShiftAmt - 1;
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftAmtReg)
+    .addReg(ShiftAmtSrcReg).addMBB(BB)
+    .addReg(ShiftAmtReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+    .addReg(ShiftReg);
+  BuildMI(LoopBB, dl, TII.get(MSP430::SUB8ri), ShiftAmtReg2)
+    .addReg(ShiftAmtReg).addImm(1);
+  BuildMI(LoopBB, dl, TII.get(MSP430::JCC))
+    .addMBB(LoopBB)
+    .addImm(MSP430CC::COND_NE);
+
+  // RemBB:
+  // DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
+  BuildMI(*RemBB, RemBB->begin(), dl, TII.get(MSP430::PHI), DstReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return RemBB;
+}
+
+MachineBasicBlock*
+MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *BB) const {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == MSP430::Shl8 || Opc == MSP430::Shl16 ||
+      Opc == MSP430::Sra8 || Opc == MSP430::Sra16 ||
+      Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
+    return EmitShiftInstr(MI, BB);
+
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  assert((Opc == MSP430::Select16 || Opc == MSP430::Select8) &&
+         "Unexpected instr type to insert");
+
+  // To "insert" a SELECT instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB;
+  ++I;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   jCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(I, copy0MBB);
+  F->insert(I, copy1MBB);
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  copy1MBB->splice(copy1MBB->begin(), BB,
+                   llvm::next(MachineBasicBlock::iterator(MI)),
+                   BB->end());
+  copy1MBB->transferSuccessorsAndUpdatePHIs(BB);
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(copy1MBB);
+
+  BuildMI(BB, dl, TII.get(MSP430::JCC))
+    .addMBB(copy1MBB)
+    .addImm(MI->getOperand(3).getImm());
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to copy1MBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(copy1MBB);
+
+  //  copy1MBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = copy1MBB;
+  BuildMI(*BB, BB->begin(), dl, TII.get(MSP430::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
new file mode 100644
index 000000000000..e0ed870f5653
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
@@ -0,0 +1,178 @@
+//===-- MSP430ISelLowering.h - MSP430 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that MSP430 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_ISELLOWERING_H
+#define LLVM_TARGET_MSP430_ISELLOWERING_H
+
+#include "MSP430.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace MSP430ISD {
+    enum {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// Return with a flag operand. Operand 0 is the chain operand.
+      RET_FLAG,
+
+      /// Same as RET_FLAG, but used for returning from ISRs.
+      RETI_FLAG,
+
+      /// Y = R{R,L}A X, rotate right (left) arithmetically
+      RRA, RLA,
+
+      /// Y = RRC X, rotate right via carry
+      RRC,
+
+      /// CALL - These operations represent an abstract call
+      /// instruction, which includes a bunch of information.
+      CALL,
+
+      /// Wrapper - A wrapper node for TargetConstantPool, TargetExternalSymbol,
+      /// and TargetGlobalAddress.
+      Wrapper,
+
+      /// CMP - Compare instruction.
+      CMP,
+
+      /// SetCC - Operand 0 is condition code, and operand 1 is the flag
+      /// operand produced by a CMP instruction.
+      SETCC,
+
+      /// MSP430 conditional branches. Operand 0 is the chain operand, operand 1
+      /// is the block to branch if condition is true, operand 2 is the
+      /// condition code, and operand 3 is the flag operand produced by a CMP
+      /// instruction.
+      BR_CC,
+
+      /// SELECT_CC - Operand 0 and operand 1 are selection variable, operand 3
+      /// is condition code and operand 4 is flag operand.
+      SELECT_CC,
+
+      /// SHL, SRA, SRL - Non-constant shifts.
+      SHL, SRA, SRL
+    };
+  }
+
+  class MSP430Subtarget;
+  class MSP430TargetMachine;
+
+  class MSP430TargetLowering : public TargetLowering {
+  public:
+    explicit MSP430TargetLowering(MSP430TargetMachine &TM);
+
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+    TargetLowering::ConstraintType
+    getConstraintType(const std::string &Constraint) const;
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of type
+    /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in
+    /// register R15W to i8 by referencing its sub-register R15B.
+    virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
+    virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+
+    /// isZExtFree - Return true if any actual instruction that defines a value
+    /// of type Ty1 implicit zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in unknown
+    /// ways, such as incoming arguments, or copies from unknown virtual
+    /// registers. Also, if isTruncateFree(Ty2, Ty1) is true, this does not
+    /// necessarily apply to truncate instructions. e.g. on msp430, all
+    /// instructions that define 8-bit values implicit zero-extend the result
+    /// out to 16 bits.
+    virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
+    virtual bool isZExtFree(EVT VT1, EVT VT2) const;
+    virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+
+    MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                   MachineBasicBlock *BB) const;
+    MachineBasicBlock* EmitShiftInstr(MachineInstr *MI,
+                                      MachineBasicBlock *BB) const;
+
+  private:
+    SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           bool isTailCall,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue LowerCCCArguments(SDValue Chain,
+                              CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              DebugLoc dl,
+                              SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+    virtual SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl, SelectionDAG &DAG) const;
+
+    virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                            SDValue &Base,
+                                            SDValue &Offset,
+                                            ISD::MemIndexedMode &AM,
+                                            SelectionDAG &DAG) const;
+
+    const MSP430Subtarget &Subtarget;
+    const DataLayout *TD;
+  };
+} // namespace llvm
+
+#endif // LLVM_TARGET_MSP430_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td b/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td
new file mode 100644
index 000000000000..a9e87dad0cd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrFormats.td
@@ -0,0 +1,211 @@
+//===-- MSP430InstrFormats.td - MSP430 Instruction Formats -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MSP430 instructions format here
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def PseudoFrm   : Format<0>;
+def SingleOpFrm : Format<1>;
+def DoubleOpFrm : Format<2>;
+def CondJumpFrm : Format<3>;
+
+class SourceMode<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def SrcReg      : SourceMode<0>;
+def SrcMem      : SourceMode<1>;
+def SrcIndReg   : SourceMode<2>;
+def SrcPostInc  : SourceMode<3>;
+def SrcImm      : SourceMode<3>;
+
+class DestMode<bit val> {
+  bit Value = val;
+}
+
+def DstReg      : DestMode<0>;
+def DstMem      : DestMode<1>;
+
+class SizeVal<bits<3> val> {
+  bits<3> Value = val;
+}
+
+def SizeUnknown : SizeVal<0>; // Unknown / unset size
+def SizeSpecial : SizeVal<1>; // Special instruction, e.g. pseudo
+def Size2Bytes  : SizeVal<2>;
+def Size4Bytes  : SizeVal<3>;
+def Size6Bytes  : SizeVal<4>;
+
+// Generic MSP430 Format
+class MSP430Inst<dag outs, dag ins, SizeVal sz, Format f,
+                 string asmstr> : Instruction {
+  field bits<16> Inst;
+
+  let Namespace = "MSP430";
+
+  dag OutOperandList = outs;
+  dag InOperandList  = ins;
+
+  Format Form = f;
+  SizeVal Sz = sz;
+
+  // Define how we want to layout our TargetSpecific information field... This
+  // should be kept up-to-date with the fields in the MSP430InstrInfo.h file.
+  let TSFlags{1-0} = Form.Value;
+  let TSFlags{4-2} = Sz.Value;
+
+  let AsmString   = asmstr;
+}
+
+// FIXME: Create different classes for different addressing modes.
+
+// MSP430 Double Operand (Format I) Instructions
+class IForm<bits<4> opcode, DestMode dest, bit bw, SourceMode src, SizeVal sz,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, sz, DoubleOpFrm, asmstr> {
+  let Pattern = pattern;
+
+  DestMode ad = dest;
+  SourceMode as = src;
+  
+  let Inst{12-15} = opcode;
+  let Inst{7}     = ad.Value;
+  let Inst{6}     = bw;
+  let Inst{4-5}   = as.Value;
+}
+
+// 8 bit IForm instructions
+class IForm8<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm<opcode, dest, 1, src, sz, outs, ins, asmstr, pattern>;
+
+class I8rr<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I8ri<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8rm<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mr<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mi<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I8mm<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// 16 bit IForm instructions
+class IForm16<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+              dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm<opcode, dest, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class I16rr<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I16ri<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16rm<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mr<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mi<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I16mm<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Single Operand (Format II) Instructions
+class IIForm<bits<9> opcode, bit bw, SourceMode src, SizeVal sz,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, sz, SingleOpFrm, asmstr> {
+  let Pattern = pattern;
+  
+  SourceMode as = src;
+
+  let Inst{7-15} = opcode;
+  let Inst{6}    = bw;
+  let Inst{4-5}  = as.Value;
+}
+
+// 8 bit IIForm instructions
+class IIForm8<bits<9> opcode, SourceMode src, SizeVal sz,
+              dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm<opcode, 1, src, sz, outs, ins, asmstr, pattern>;
+
+class II8r<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II8m<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II8i<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// 16 bit IIForm instructions
+class IIForm16<bits<9> opcode, SourceMode src, SizeVal sz,
+               dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm<opcode, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class II16r<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II16m<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II16i<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Conditional Jumps Instructions
+class CJForm<bits<3> opcode, bits<3> cond,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, Size2Bytes, CondJumpFrm, asmstr> {
+  let Pattern = pattern;
+  
+  let Inst{13-15} = opcode;
+  let Inst{10-12} = cond;
+}
+
+// Pseudo instructions
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, SizeSpecial, PseudoFrm, asmstr> {
+  let Pattern = pattern;
+  let Inst{15-0} = 0;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
new file mode 100644
index 000000000000..a6b5f2f6d0bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -0,0 +1,326 @@
+//===-- MSP430InstrInfo.cpp - MSP430 Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430InstrInfo.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_CTOR
+#include "MSP430GenInstrInfo.inc"
+
+using namespace llvm;
+
+MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm)
+  : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
+    RI(tm, *this) {}
+
+void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                    unsigned SrcReg, bool isKill, int FrameIdx,
+                                          const TargetRegisterClass *RC,
+                                          const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOStore,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+
+  if (RC == &MSP430::GR16RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV16mr))
+      .addFrameIndex(FrameIdx).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &MSP430::GR8RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV8mr))
+      .addFrameIndex(FrameIdx).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else
+    llvm_unreachable("Cannot store this register to stack slot!");
+}
+
+void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                           unsigned DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC,
+                                           const TargetRegisterInfo *TRI) const{
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOLoad,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+
+  if (RC == &MSP430::GR16RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV16rm))
+      .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0).addMemOperand(MMO);
+  else if (RC == &MSP430::GR8RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV8rm))
+      .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0).addMemOperand(MMO);
+  else
+    llvm_unreachable("Cannot store this register to stack slot!");
+}
+
+void MSP430InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  unsigned Opc;
+  if (MSP430::GR16RegClass.contains(DestReg, SrcReg))
+    Opc = MSP430::MOV16rr;
+  else if (MSP430::GR8RegClass.contains(DestReg, SrcReg))
+    Opc = MSP430::MOV8rr;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  BuildMI(MBB, I, DL, get(Opc), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+unsigned MSP430InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (I->getOpcode() != MSP430::JMP &&
+        I->getOpcode() != MSP430::JCC &&
+        I->getOpcode() != MSP430::Br &&
+        I->getOpcode() != MSP430::Bm)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+bool MSP430InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid Xbranch condition!");
+
+  MSP430CC::CondCodes CC = static_cast<MSP430CC::CondCodes>(Cond[0].getImm());
+
+  switch (CC) {
+  default: llvm_unreachable("Invalid branch condition!");
+  case MSP430CC::COND_E:
+    CC = MSP430CC::COND_NE;
+    break;
+  case MSP430CC::COND_NE:
+    CC = MSP430CC::COND_E;
+    break;
+  case MSP430CC::COND_L:
+    CC = MSP430CC::COND_GE;
+    break;
+  case MSP430CC::COND_GE:
+    CC = MSP430CC::COND_L;
+    break;
+  case MSP430CC::COND_HS:
+    CC = MSP430CC::COND_LO;
+    break;
+  case MSP430CC::COND_LO:
+    CC = MSP430CC::COND_HS;
+    break;
+  }
+
+  Cond[0].setImm(CC);
+  return false;
+}
+
+bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  if (!MI->isTerminator()) return false;
+
+  // Conditional branch is a special case.
+  if (MI->isBranch() && !MI->isBarrier())
+    return true;
+  if (!MI->isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Cannot handle indirect branches.
+    if (I->getOpcode() == MSP430::Br ||
+        I->getOpcode() == MSP430::Bm)
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == MSP430::JMP) {
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (llvm::next(I) != MBB.end())
+        llvm::next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = 0;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditinal destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    assert(I->getOpcode() == MSP430::JCC && "Invalid conditional branch");
+    MSP430CC::CondCodes BranchCode =
+      static_cast<MSP430CC::CondCodes>(I->getOperand(1).getImm());
+    if (BranchCode == MSP430CC::COND_INVALID)
+      return true;  // Can't handle weird stuff.
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // Only handle the case where all conditional branches branch to
+    // the same destination.
+    if (TBB != I->getOperand(0).getMBB())
+      return true;
+
+    MSP430CC::CondCodes OldBranchCode = (MSP430CC::CondCodes)Cond[0].getImm();
+    // If the conditions are the same, we can leave them alone.
+    if (OldBranchCode == BranchCode)
+      continue;
+
+    return true;
+  }
+
+  return false;
+}
+
+unsigned
+MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                              MachineBasicBlock *FBB,
+                              const SmallVectorImpl<MachineOperand> &Cond,
+                              DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "MSP430 branch conditions have one component!");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(MSP430::JMP)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  BuildMI(&MBB, DL, get(MSP430::JCC)).addMBB(TBB).addImm(Cond[0].getImm());
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(MSP430::JMP)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+///
+unsigned MSP430InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+
+  switch (Desc.TSFlags & MSP430II::SizeMask) {
+  default:
+    switch (Desc.getOpcode()) {
+    default: llvm_unreachable("Unknown instruction size!");
+    case TargetOpcode::PROLOG_LABEL:
+    case TargetOpcode::EH_LABEL:
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+    case TargetOpcode::DBG_VALUE:
+      return 0;
+    case TargetOpcode::INLINEASM: {
+      const MachineFunction *MF = MI->getParent()->getParent();
+      const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+      return TII.getInlineAsmLength(MI->getOperand(0).getSymbolName(),
+                                    *MF->getTarget().getMCAsmInfo());
+    }
+    }
+  case MSP430II::SizeSpecial:
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Unknown instruction size!");
+    case MSP430::SAR8r1c:
+    case MSP430::SAR16r1c:
+      return 4;
+    }
+  case MSP430II::Size2Bytes:
+    return 2;
+  case MSP430II::Size4Bytes:
+    return 4;
+  case MSP430II::Size6Bytes:
+    return 6;
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
new file mode 100644
index 000000000000..d79f99245e71
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
@@ -0,0 +1,91 @@
+//===-- MSP430InstrInfo.h - MSP430 Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430INSTRINFO_H
+#define LLVM_TARGET_MSP430INSTRINFO_H
+
+#include "MSP430RegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "MSP430GenInstrInfo.inc"
+
+namespace llvm {
+
+class MSP430TargetMachine;
+
+/// MSP430II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MSP430II {
+  enum {
+    SizeShift   = 2,
+    SizeMask    = 7 << SizeShift,
+
+    SizeUnknown = 0 << SizeShift,
+    SizeSpecial = 1 << SizeShift,
+    Size2Bytes  = 2 << SizeShift,
+    Size4Bytes  = 3 << SizeShift,
+    Size6Bytes  = 4 << SizeShift
+  };
+}
+
+class MSP430InstrInfo : public MSP430GenInstrInfo {
+  const MSP430RegisterInfo RI;
+public:
+  explicit MSP430InstrInfo(MSP430TargetMachine &TM);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const;
+
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned SrcReg, bool isKill,
+                                   int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const;
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned DestReg, int FrameIdx,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  // Branch folding goodness
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+  bool isUnpredicatedTerminator(const MachineInstr *MI) const;
+  bool AnalyzeBranch(MachineBasicBlock &MBB,
+                     MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const;
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
new file mode 100644
index 000000000000..e45780d05803
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
@@ -0,0 +1,1211 @@
+//===-- MSP430InstrInfo.td - MSP430 Instruction defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the MSP430 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Type Constraints.
+//===----------------------------------------------------------------------===//
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>;
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
+
+//===----------------------------------------------------------------------===//
+// Type Profiles.
+//===----------------------------------------------------------------------===//
+def SDT_MSP430Call         : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDT_MSP430Wrapper      : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                                  SDTCisPtrTy<0>]>;
+def SDT_MSP430Cmp          : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_MSP430BrCC         : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+                                                  SDTCisVT<1, i8>]>;
+def SDT_MSP430SelectCC     : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
+                                                  SDTCisSameAs<1, 2>, 
+                                                  SDTCisVT<3, i8>]>;
+def SDT_MSP430Shift        : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                                  SDTCisI8<2>]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+def MSP430retflag  : SDNode<"MSP430ISD::RET_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def MSP430retiflag : SDNode<"MSP430ISD::RETI_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def MSP430rra     : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>;
+def MSP430rla     : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
+def MSP430rrc     : SDNode<"MSP430ISD::RRC", SDTIntUnaryOp, []>;
+
+def MSP430call    : SDNode<"MSP430ISD::CALL", SDT_MSP430Call,
+                     [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+def MSP430callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_MSP430CallSeqStart,
+                        [SDNPHasChain, SDNPOutGlue]>;
+def MSP430callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_MSP430CallSeqEnd,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def MSP430Wrapper : SDNode<"MSP430ISD::Wrapper", SDT_MSP430Wrapper>;
+def MSP430cmp     : SDNode<"MSP430ISD::CMP", SDT_MSP430Cmp, [SDNPOutGlue]>;
+def MSP430brcc    : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC,
+                            [SDNPHasChain, SDNPInGlue]>;
+def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC,
+                            [SDNPInGlue]>;
+def MSP430shl     : SDNode<"MSP430ISD::SHL", SDT_MSP430Shift, []>;
+def MSP430sra     : SDNode<"MSP430ISD::SRA", SDT_MSP430Shift, []>;
+def MSP430srl     : SDNode<"MSP430ISD::SRL", SDT_MSP430Shift, []>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+// Address operands
+def memsrc : Operand<i16> {
+  let PrintMethod = "printSrcMemOperand";
+  let MIOperandInfo = (ops GR16, i16imm);
+}
+
+def memdst : Operand<i16> {
+  let PrintMethod = "printSrcMemOperand";
+  let MIOperandInfo = (ops GR16, i16imm);
+}
+
+// Short jump targets have OtherVT type and are printed as pcrel imm values.
+def jmptarget : Operand<OtherVT> {
+  let PrintMethod = "printPCRelImmOperand";
+}
+
+// Operand for printing out a condition code.
+def cc : Operand<i8> {
+  let PrintMethod = "printCCOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// MSP430 Complex Pattern Definitions.
+//===----------------------------------------------------------------------===//
+
+def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], []>;
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def  extloadi16i8 : PatFrag<(ops node:$ptr), (i16 ( extloadi8 node:$ptr))>;
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+//===----------------------------------------------------------------------===//
+// Instruction list..
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SRW.
+let Defs = [SPW, SRW], Uses = [SPW] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+                              "#ADJCALLSTACKDOWN",
+                              [(MSP430callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
+                              "#ADJCALLSTACKUP",
+                              [(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+  def Select8  : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc),
+                        "# Select8 PSEUDO",
+                        [(set GR8:$dst,
+                          (MSP430selectcc GR8:$src, GR8:$src2, imm:$cc))]>;
+  def Select16 : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR16:$src2, i8imm:$cc),
+                        "# Select16 PSEUDO",
+                        [(set GR16:$dst,
+                          (MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>;
+  let Defs = [SRW] in {
+  def Shl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Shl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430shl GR8:$src, GR8:$cnt))]>;
+  def Shl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Shl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430shl GR16:$src, GR8:$cnt))]>;
+  def Sra8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Sra8 PSEUDO",
+                        [(set GR8:$dst, (MSP430sra GR8:$src, GR8:$cnt))]>;
+  def Sra16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Sra16 PSEUDO",
+                        [(set GR16:$dst, (MSP430sra GR16:$src, GR8:$cnt))]>;
+  def Srl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Srl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430srl GR8:$src, GR8:$cnt))]>;
+  def Srl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Srl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430srl GR16:$src, GR8:$cnt))]>;
+
+  }
+}
+
+let neverHasSideEffects = 1 in
+def NOP : Pseudo<(outs), (ins), "nop", []>;
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions...
+//
+
+// FIXME: Provide proper encoding!
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  def RET  : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                     (outs), (ins), "ret",  [(MSP430retflag)]>;
+  def RETI : II16r<0x0, (outs), (ins), "reti", [(MSP430retiflag)]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+
+// FIXME: expand opcode & cond field for branches!
+
+// Direct branch
+let isBarrier = 1 in {
+  // Short branch
+  def JMP : CJForm<0, 0, (outs), (ins jmptarget:$dst),
+                   "jmp\t$dst",
+                   [(br bb:$dst)]>;
+  let isIndirectBranch = 1 in {
+    // Long branches
+    def Bi  : I16ri<0, (outs), (ins i16imm:$brdst),
+                    "br\t$brdst",
+                    [(brind tblockaddress:$brdst)]>;
+    def Br  : I16rr<0, (outs), (ins GR16:$brdst),
+                    "mov.w\t{$brdst, pc}",
+                    [(brind GR16:$brdst)]>;
+    def Bm  : I16rm<0, (outs), (ins memsrc:$brdst),
+                    "mov.w\t{$brdst, pc}",
+                    [(brind (load addr:$brdst))]>;
+  }
+}
+
+// Conditional branches
+let Uses = [SRW] in
+  def JCC : CJForm<0, 0,
+                   (outs), (ins jmptarget:$dst, cc:$cc),
+                   "j$cc\t$dst",
+                   [(MSP430brcc bb:$dst, imm:$cc)]>;
+} // isBranch, isTerminator
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. SPW is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Defs = [R12W, R13W, R14W, R15W, SRW],
+      Uses = [SPW] in {
+    def CALLi     : II16i<0x0,
+                          (outs), (ins i16imm:$dst),
+                          "call\t$dst", [(MSP430call imm:$dst)]>;
+    def CALLr     : II16r<0x0,
+                          (outs), (ins GR16:$dst),
+                          "call\t$dst", [(MSP430call GR16:$dst)]>;
+    def CALLm     : II16m<0x0,
+                          (outs), (ins memsrc:$dst),
+                          "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>;
+  }
+
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+let Defs = [SPW], Uses = [SPW], neverHasSideEffects=1 in {
+let mayLoad = 1 in
+def POP16r   : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                       (outs GR16:$reg), (ins), "pop.w\t$reg", []>;
+
+let mayStore = 1 in
+def PUSH16r  : II16r<0x0,
+                     (outs), (ins GR16:$reg), "push.w\t$reg",[]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+// FIXME: Provide proper encoding!
+let neverHasSideEffects = 1 in {
+def MOV8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "mov.b\t{$src, $dst}",
+                   []>;
+def MOV16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "mov.w\t{$src, $dst}",
+                    []>;
+}
+
+// FIXME: Provide proper encoding!
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins i8imm:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins i16imm:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(set GR16:$dst, imm:$src)]>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+def MOV8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins memsrc:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins memsrc:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(set GR16:$dst, (load addr:$src))]>;
+}
+
+def MOVZX16rr8 : I8rr<0x0,
+                      (outs GR16:$dst), (ins GR8:$src),
+                      "mov.b\t{$src, $dst}",
+                      [(set GR16:$dst, (zext GR8:$src))]>;
+def MOVZX16rm8 : I8rm<0x0,
+                      (outs GR16:$dst), (ins memsrc:$src),
+                      "mov.b\t{$src, $dst}",
+                      [(set GR16:$dst, (zextloadi16i8 addr:$src))]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, Constraints = "$base = $base_wb" in {
+def MOV8rm_POST  : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb), (ins GR16:$base),
+                         "mov.b\t{@$base+, $dst}", []>;
+def MOV16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb), (ins GR16:$base),
+                           "mov.w\t{@$base+, $dst}", []>;
+}
+
+// Any instruction that defines a 8-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg may
+// be copying from a truncate, but any other 8-bit operation will zero-extend
+// up to 16 bits.
+def def8 : PatLeaf<(i8 GR8:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 8-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i16 (zext def8:$src)),
+          (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+def MOV8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store (i16 imm:$src), addr:$dst)]>;
+
+def MOV8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store GR16:$src, addr:$dst)]>;
+
+def MOV8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store (i8 (load addr:$src)), addr:$dst)]>;
+def MOV16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store (i16 (load addr:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+let Constraints = "$src = $dst" in {
+
+let Defs = [SRW] in {
+
+let isCommutable = 1 in { // X = ADD Y, Z  == X = ADD Z, Y
+
+def ADD8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def ADD16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+}
+
+def ADD8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def ADD16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def ADD8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "add.b\t{@$base+, $dst}", []>;
+def ADD16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src, GR16:$base),
+                          "add.w\t{@$base+, $dst}", []>;
+}
+
+
+def ADD8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def ADD16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+let Constraints = "" in {
+def ADD8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def ADD16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADD8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def ADD16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADD8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def ADD16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), 
+                                  (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+let Uses = [SRW] in {
+
+let isCommutable = 1 in { // X = ADDC Y, Z  == X = ADDC Z, Y
+def ADC8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def ADC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+} // isCommutable
+
+def ADC8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def ADC16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def ADC8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def ADC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let Constraints = "" in {
+def ADC8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def ADC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "addc.w\t{$src, $dst}",
+                    [(store (adde (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADC8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def ADC16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "addc.w\t{$src, $dst}",
+                    [(store (adde (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADC8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), 
+                                 (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def ADC16mm : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "addc.w\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+}
+
+} // Uses = [SRW]
+
+let isCommutable = 1 in { // X = AND Y, Z  == X = AND Z, Y
+def AND8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def AND16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+}
+
+def AND8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def AND16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def AND8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def AND16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def AND8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "and.b\t{@$base+, $dst}", []>;
+def AND16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src, GR16:$base),
+                           "and.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def AND8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def AND16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def AND8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def AND16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def AND8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def AND16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+let isCommutable = 1 in { // X = OR Y, Z  == X = OR Z, Y
+def OR8rr  : I8rr<0x0,
+                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src, GR8:$src2))]>;
+def OR16rr : I16rr<0x0,
+                   (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src, GR16:$src2))]>;
+}
+
+def OR8ri  : I8ri<0x0,
+                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src, imm:$src2))]>;
+def OR16ri : I16ri<0x0,
+                   (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src, imm:$src2))]>;
+
+def OR8rm  : I8rm<0x0,
+                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src, (load addr:$src2)))]>;
+def OR16rm : I16rm<0x0,
+                   (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src, (load addr:$src2)))]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def OR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                        (outs GR8:$dst, GR16:$base_wb),
+                        (ins GR8:$src, GR16:$base),
+                        "bis.b\t{@$base+, $dst}", []>;
+def OR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                          (outs GR16:$dst, GR16:$base_wb),
+                          (ins GR16:$src, GR16:$base),
+                          "bis.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def OR8mr  : I8mr<0x0,
+                  (outs), (ins memdst:$dst, GR8:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
+def OR16mr : I16mr<0x0,
+                   (outs), (ins memdst:$dst, GR16:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>;
+
+def OR8mi  : I8mi<0x0, 
+                  (outs), (ins memdst:$dst, i8imm:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def OR16mi : I16mi<0x0,
+                   (outs), (ins memdst:$dst, i16imm:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst)]>;
+
+def OR8mm  : I8mm<0x0,
+                  (outs), (ins memdst:$dst, memsrc:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (i8 (load addr:$dst)),
+                              (i8 (load addr:$src))), addr:$dst)]>;
+def OR16mm : I16mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (i16 (load addr:$dst)),
+                               (i16 (load addr:$src))), addr:$dst)]>;
+}
+
+// bic does not modify condition codes
+def BIC8rr :  I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "bic.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src, (not GR8:$src2)))]>;
+def BIC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "bic.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, (not GR16:$src2)))]>;
+
+def BIC8rm :  I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "bic.b\t{$src2, $dst}",
+                    [(set GR8:$dst, (and GR8:$src, (not (i8 (load addr:$src2)))))]>;
+def BIC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "bic.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src, (not (i16 (load addr:$src2)))))]>;
+
+let Constraints = "" in {
+def BIC8mr :  I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "bic.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), (not GR8:$src)), addr:$dst)]>;
+def BIC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "bic.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), (not GR16:$src)), addr:$dst)]>;
+
+def BIC8mm :  I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "bic.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst),
+                                (not (i8 (load addr:$src)))), addr:$dst)]>;
+def BIC16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "bic.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst),
+                                 (not (i16 (load addr:$src)))), addr:$dst)]>;
+}
+
+let isCommutable = 1 in { // X = XOR Y, Z  == X = XOR Z, Y
+def XOR8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def XOR16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+}
+
+def XOR8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def XOR16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def XOR8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def XOR16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def XOR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "xor.b\t{@$base+, $dst}", []>;
+def XOR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src, GR16:$base),
+                           "xor.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def XOR8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def XOR16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def XOR8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def XOR16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def XOR8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def XOR16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+
+def SUB8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def SUB16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+
+def SUB8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def SUB16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def SUB8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def SUB16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src = $dst" in {
+def SUB8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src, GR16:$base),
+                         "sub.b\t{@$base+, $dst}", []>;
+def SUB16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                          (outs GR16:$dst, GR16:$base_wb),
+                          (ins GR16:$src, GR16:$base),
+                          "sub.w\t{@$base+, $dst}", []>;
+}
+
+let Constraints = "" in {
+def SUB8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def SUB16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def SUB8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def SUB16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def SUB8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def SUB16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+let Uses = [SRW] in {
+def SBC8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src, GR8:$src2)),
+                    (implicit SRW)]>;
+def SBC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src, GR16:$src2)),
+                     (implicit SRW)]>;
+
+def SBC8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src, imm:$src2)),
+                    (implicit SRW)]>;
+def SBC16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src, imm:$src2)),
+                     (implicit SRW)]>;
+
+def SBC8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src, (load addr:$src2))),
+                    (implicit SRW)]>;
+def SBC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+let Constraints = "" in {
+def SBC8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "subc.b\t{$src, $dst}",
+                  [(store (sube (load addr:$dst), GR8:$src), addr:$dst),
+                   (implicit SRW)]>;
+def SBC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def SBC8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "subc.b\t{$src, $dst}",
+                   [(store (sube (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def SBC16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def SBC8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "subc.b\t{$src, $dst}",
+                   [(store (sube (load addr:$dst),
+                                 (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def SBC16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst),
+                            (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+} // Uses = [SRW]
+
+// FIXME: memory variant!
+def SAR8r1  : II8r<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "rra.b\t$dst",
+                   [(set GR8:$dst, (MSP430rra GR8:$src)),
+                    (implicit SRW)]>;
+def SAR16r1 : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "rra.w\t$dst",
+                    [(set GR16:$dst, (MSP430rra GR16:$src)),
+                     (implicit SRW)]>;
+
+def SHL8r1  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "rla.b\t$dst",
+                   [(set GR8:$dst, (MSP430rla GR8:$src)),
+                    (implicit SRW)]>;
+def SHL16r1 : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "rla.w\t$dst",
+                    [(set GR16:$dst, (MSP430rla GR16:$src)),
+                     (implicit SRW)]>;
+
+def SAR8r1c  : Pseudo<(outs GR8:$dst), (ins GR8:$src),
+                      "clrc\n\t"
+                      "rrc.b\t$dst",
+                      [(set GR8:$dst, (MSP430rrc GR8:$src)),
+                       (implicit SRW)]>;
+def SAR16r1c : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+                      "clrc\n\t"
+                      "rrc.w\t$dst",
+                      [(set GR16:$dst, (MSP430rrc GR16:$src)),
+                       (implicit SRW)]>;
+
+// FIXME: Memory sext's ?
+def SEXT16r : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "sxt\t$dst",
+                    [(set GR16:$dst, (sext_inreg GR16:$src, i8)),
+                     (implicit SRW)]>;
+
+} // Defs = [SRW]
+
+def ZEXT16r : I8rr<0x0,
+                   (outs GR16:$dst), (ins GR16:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR16:$dst, (zext (trunc GR16:$src)))]>;
+
+// FIXME: Memory bitswaps?
+def SWPB16r : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "swpb\t$dst",
+                    [(set GR16:$dst, (bswap GR16:$src))]>;
+
+} // Constraints = "$src = $dst"
+
+// Integer comparisons
+let Defs = [SRW] in {
+def CMP8rr  : I8rr<0x0,
+                   (outs), (ins GR8:$src, GR8:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp GR8:$src, GR8:$src2), (implicit SRW)]>;
+def CMP16rr : I16rr<0x0,
+                    (outs), (ins GR16:$src, GR16:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, GR16:$src2), (implicit SRW)]>;
+
+def CMP8ri  : I8ri<0x0,
+                   (outs), (ins GR8:$src, i8imm:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp GR8:$src, imm:$src2), (implicit SRW)]>;
+def CMP16ri : I16ri<0x0,
+                    (outs), (ins GR16:$src, i16imm:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, imm:$src2), (implicit SRW)]>;
+
+def CMP8mi  : I8mi<0x0,
+                   (outs), (ins memsrc:$src, i8imm:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp (load addr:$src),
+                               (i8 imm:$src2)), (implicit SRW)]>;
+def CMP16mi : I16mi<0x0,
+                    (outs), (ins memsrc:$src, i16imm:$src2),
+                    "cmp.w\t{$src2, $src}",
+                     [(MSP430cmp (load addr:$src),
+                                 (i16 imm:$src2)), (implicit SRW)]>;
+
+def CMP8rm  : I8rm<0x0,
+                   (outs), (ins GR8:$src, memsrc:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp GR8:$src, (load addr:$src2)), 
+                    (implicit SRW)]>;
+def CMP16rm : I16rm<0x0,
+                    (outs), (ins GR16:$src, memsrc:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp GR16:$src, (load addr:$src2)),
+                     (implicit SRW)]>;
+
+def CMP8mr  : I8mr<0x0,
+                   (outs), (ins memsrc:$src, GR8:$src2),
+                   "cmp.b\t{$src2, $src}",
+                   [(MSP430cmp (load addr:$src), GR8:$src2),
+                    (implicit SRW)]>;
+def CMP16mr : I16mr<0x0,
+                    (outs), (ins memsrc:$src, GR16:$src2),
+                    "cmp.w\t{$src2, $src}",
+                    [(MSP430cmp (load addr:$src), GR16:$src2), 
+                     (implicit SRW)]>;
+
+
+// BIT TESTS, just sets condition codes
+// Note that the C condition is set differently than when using CMP.
+let isCommutable = 1 in {
+def BIT8rr  : I8rr<0x0,
+                   (outs), (ins GR8:$src, GR8:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su GR8:$src, GR8:$src2), 0),
+                    (implicit SRW)]>;
+def BIT16rr : I16rr<0x0,
+                    (outs), (ins GR16:$src, GR16:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src, GR16:$src2), 0),
+                     (implicit SRW)]>;
+}
+def BIT8ri  : I8ri<0x0,
+                   (outs), (ins GR8:$src, i8imm:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su GR8:$src, imm:$src2), 0),
+                    (implicit SRW)]>;
+def BIT16ri : I16ri<0x0,
+                    (outs), (ins GR16:$src, i16imm:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src, imm:$src2), 0),
+                     (implicit SRW)]>;
+
+def BIT8rm  : I8rm<0x0,
+                   (outs), (ins GR8:$src, memdst:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su GR8:$src,  (load addr:$src2)), 0),
+                    (implicit SRW)]>;
+def BIT16rm : I16rm<0x0,
+                    (outs), (ins GR16:$src, memdst:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su GR16:$src,  (load addr:$src2)), 0),
+                     (implicit SRW)]>;
+
+def BIT8mr  : I8mr<0x0,
+                  (outs), (ins memsrc:$src, GR8:$src2),
+                  "bit.b\t{$src2, $src}",
+                  [(MSP430cmp (and_su (load addr:$src), GR8:$src2), 0),
+                   (implicit SRW)]>;
+def BIT16mr : I16mr<0x0,
+                    (outs), (ins memsrc:$src, GR16:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (load addr:$src), GR16:$src2), 0),
+                     (implicit SRW)]>;
+
+def BIT8mi  : I8mi<0x0,
+                   (outs), (ins memsrc:$src, i8imm:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su (load addr:$src), (i8 imm:$src2)), 0),
+                    (implicit SRW)]>;
+def BIT16mi : I16mi<0x0,
+                    (outs), (ins memsrc:$src, i16imm:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (load addr:$src), (i16 imm:$src2)), 0),
+                     (implicit SRW)]>;
+
+def BIT8mm  : I8mm<0x0,
+                   (outs), (ins memsrc:$src, memsrc:$src2),
+                   "bit.b\t{$src2, $src}",
+                   [(MSP430cmp (and_su (i8 (load addr:$src)),
+                                       (load addr:$src2)),
+                                 0),
+                      (implicit SRW)]>;
+def BIT16mm : I16mm<0x0,
+                    (outs), (ins memsrc:$src, memsrc:$src2),
+                    "bit.w\t{$src2, $src}",
+                    [(MSP430cmp (and_su (i16 (load addr:$src)),
+                                        (load addr:$src2)),
+                                 0),
+                     (implicit SRW)]>;
+} // Defs = [SRW]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+
+// extload
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+
+// anyext
+def : Pat<(i16 (anyext GR8:$src)),
+          (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+// truncs
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG GR16:$src, subreg_8bit)>;
+
+// GlobalAddress, ExternalSymbol
+def : Pat<(i16 (MSP430Wrapper tglobaladdr:$dst)), (MOV16ri tglobaladdr:$dst)>;
+def : Pat<(i16 (MSP430Wrapper texternalsym:$dst)), (MOV16ri texternalsym:$dst)>;
+def : Pat<(i16 (MSP430Wrapper tblockaddress:$dst)), (MOV16ri tblockaddress:$dst)>;
+
+def : Pat<(add GR16:$src, (MSP430Wrapper tglobaladdr :$src2)),
+          (ADD16ri GR16:$src, tglobaladdr:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper texternalsym:$src2)),
+          (ADD16ri GR16:$src, texternalsym:$src2)>;
+def : Pat<(add GR16:$src, (MSP430Wrapper tblockaddress:$src2)),
+          (ADD16ri GR16:$src, tblockaddress:$src2)>;
+
+def : Pat<(store (i16 (MSP430Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV16mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper texternalsym:$src)), addr:$dst),
+          (MOV16mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV16mi addr:$dst, tblockaddress:$src)>;
+
+// calls
+def : Pat<(MSP430call (i16 tglobaladdr:$dst)),
+          (CALLi tglobaladdr:$dst)>;
+def : Pat<(MSP430call (i16 texternalsym:$dst)),
+          (CALLi texternalsym:$dst)>;
+
+// add and sub always produce carry
+def : Pat<(addc GR16:$src, GR16:$src2),
+          (ADD16rr GR16:$src, GR16:$src2)>;
+def : Pat<(addc GR16:$src, (load addr:$src2)),
+          (ADD16rm GR16:$src, addr:$src2)>;
+def : Pat<(addc GR16:$src, imm:$src2),
+          (ADD16ri GR16:$src, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR16:$src), addr:$dst),
+          (ADD16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+          (ADD16mm addr:$dst, addr:$src)>;
+
+def : Pat<(addc GR8:$src, GR8:$src2),
+          (ADD8rr GR8:$src, GR8:$src2)>;
+def : Pat<(addc GR8:$src, (load addr:$src2)),
+          (ADD8rm GR8:$src, addr:$src2)>;
+def : Pat<(addc GR8:$src, imm:$src2),
+          (ADD8ri GR8:$src, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR8:$src), addr:$dst),
+          (ADD8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+          (ADD8mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR16:$src, GR16:$src2),
+          (SUB16rr GR16:$src, GR16:$src2)>;
+def : Pat<(subc GR16:$src, (load addr:$src2)),
+          (SUB16rm GR16:$src, addr:$src2)>;
+def : Pat<(subc GR16:$src, imm:$src2),
+          (SUB16ri GR16:$src, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR16:$src), addr:$dst),
+          (SUB16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+          (SUB16mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR8:$src, GR8:$src2),
+          (SUB8rr GR8:$src, GR8:$src2)>;
+def : Pat<(subc GR8:$src, (load addr:$src2)),
+          (SUB8rm GR8:$src, addr:$src2)>;
+def : Pat<(subc GR8:$src, imm:$src2),
+          (SUB8ri GR8:$src, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR8:$src), addr:$dst),
+          (SUB8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+          (SUB8mm addr:$dst, addr:$src)>;
+
+// peephole patterns
+def : Pat<(and GR16:$src, 255), (ZEXT16r GR16:$src)>;
+def : Pat<(MSP430cmp (trunc (and_su GR16:$src, GR16:$src2)), 0),
+          (BIT8rr (EXTRACT_SUBREG GR16:$src, subreg_8bit),
+                  (EXTRACT_SUBREG GR16:$src2, subreg_8bit))>;
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
new file mode 100644
index 000000000000..043e5becadbb
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -0,0 +1,153 @@
+//===-- MSP430MCInstLower.cpp - Convert MSP430 MachineInstr to an MCInst --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower MSP430 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCInstLower.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+using namespace llvm;
+
+MCSymbol *MSP430MCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.Mang->getSymbol(MO.getGlobal());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *MSP430MCInstLower::
+GetBlockAddressSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+}
+
+MCOperand MSP430MCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                         MO.getMBB()->getSymbol(), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO, GetBlockAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h
new file mode 100644
index 000000000000..794aa56bf04c
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.h
@@ -0,0 +1,47 @@
+//===-- MSP430MCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430_MCINSTLOWER_H
+#define MSP430_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class AsmPrinter;
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+
+  /// MSP430MCInstLower - This class is used to lower an MachineInstr
+  /// into an MCInst.
+class LLVM_LIBRARY_VISIBILITY MSP430MCInstLower {
+  MCContext &Ctx;
+
+  AsmPrinter &Printer;
+public:
+  MSP430MCInstLower(MCContext &ctx, AsmPrinter &printer)
+    : Ctx(ctx), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..0f7539908458
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- MSP430MachineFuctionInfo.cpp - MSP430 machine function info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void MSP430MachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
new file mode 100644
index 000000000000..d1697f478cc2
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -0,0 +1,54 @@
+//===- MSP430MachineFuctionInfo.h - MSP430 machine function info -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares MSP430-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430MACHINEFUNCTIONINFO_H
+#define MSP430MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// MSP430MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private MSP430 target-specific information for each MachineFunction.
+class MSP430MachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+public:
+  MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0) {}
+
+  explicit MSP430MachineFunctionInfo(MachineFunction &MF)
+    : CalleeSavedFrameSize(0), ReturnAddrIndex(0) {}
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex;}
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
new file mode 100644
index 000000000000..0b3e9e259649
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -0,0 +1,162 @@
+//===-- MSP430RegisterInfo.cpp - MSP430 Register Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msp430-reg-info"
+
+#include "MSP430RegisterInfo.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "MSP430GenRegisterInfo.inc"
+
+using namespace llvm;
+
+// FIXME: Provide proper call frame setup / destroy opcodes.
+MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm,
+                                       const TargetInstrInfo &tii)
+  : MSP430GenRegisterInfo(MSP430::PCW), TM(tm), TII(tii) {
+  StackAlign = TM.getFrameLowering()->getStackAlignment();
+}
+
+const uint16_t*
+MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const TargetFrameLowering *TFI = MF->getTarget().getFrameLowering();
+  const Function* F = MF->getFunction();
+  static const uint16_t CalleeSavedRegs[] = {
+    MSP430::FPW, MSP430::R5W, MSP430::R6W, MSP430::R7W,
+    MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
+    0
+  };
+  static const uint16_t CalleeSavedRegsFP[] = {
+    MSP430::R5W, MSP430::R6W, MSP430::R7W,
+    MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
+    0
+  };
+  static const uint16_t CalleeSavedRegsIntr[] = {
+    MSP430::FPW,  MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
+    MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
+    MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
+    0
+  };
+  static const uint16_t CalleeSavedRegsIntrFP[] = {
+    MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
+    MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
+    MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
+    0
+  };
+
+  if (TFI->hasFP(*MF))
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegsIntrFP : CalleeSavedRegsFP);
+  else
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegsIntr : CalleeSavedRegs);
+
+}
+
+BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // Mark 4 special registers with subregisters as reserved.
+  Reserved.set(MSP430::PCB);
+  Reserved.set(MSP430::SPB);
+  Reserved.set(MSP430::SRB);
+  Reserved.set(MSP430::CGB);
+  Reserved.set(MSP430::PCW);
+  Reserved.set(MSP430::SPW);
+  Reserved.set(MSP430::SRW);
+  Reserved.set(MSP430::CGW);
+
+  // Mark frame pointer as reserved if needed.
+  if (TFI->hasFP(MF))
+    Reserved.set(MSP430::FPW);
+
+  return Reserved;
+}
+
+const TargetRegisterClass *
+MSP430RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                         const {
+  return &MSP430::GR16RegClass;
+}
+
+void
+MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                        int SPAdj, unsigned FIOperandNum,
+                                        RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  DebugLoc dl = MI.getDebugLoc();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  unsigned BasePtr = (TFI->hasFP(MF) ? MSP430::FPW : MSP430::SPW);
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+  // Skip the saved PC
+  Offset += 2;
+
+  if (!TFI->hasFP(MF))
+    Offset += MF.getFrameInfo()->getStackSize();
+  else
+    Offset += 2; // Skip the saved FPW
+
+  // Fold imm into offset
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+
+  if (MI.getOpcode() == MSP430::ADD16ri) {
+    // This is actually "load effective address" of the stack slot
+    // instruction. We have only two-address instructions, thus we need to
+    // expand it into mov + add
+
+    MI.setDesc(TII.get(MSP430::MOV16rr));
+    MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+
+    if (Offset == 0)
+      return;
+
+    // We need to materialize the offset via add instruction.
+    unsigned DstReg = MI.getOperand(0).getReg();
+    if (Offset < 0)
+      BuildMI(MBB, llvm::next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
+        .addReg(DstReg).addImm(-Offset);
+    else
+      BuildMI(MBB, llvm::next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
+        .addReg(DstReg).addImm(Offset);
+
+    return;
+  }
+
+  MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  return TFI->hasFP(MF) ? MSP430::FPW : MSP430::SPW;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h
new file mode 100644
index 000000000000..69cccb275259
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -0,0 +1,55 @@
+//===-- MSP430RegisterInfo.h - MSP430 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430REGISTERINFO_H
+#define LLVM_TARGET_MSP430REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "MSP430GenRegisterInfo.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class MSP430TargetMachine;
+
+struct MSP430RegisterInfo : public MSP430GenRegisterInfo {
+private:
+  MSP430TargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+  /// StackAlign - Default stack alignment.
+  ///
+  unsigned StackAlign;
+public:
+  MSP430RegisterInfo(MSP430TargetMachine &tm, const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+  const TargetRegisterClass*
+  getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_MSP430REGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
new file mode 100644
index 000000000000..07619d0675b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -0,0 +1,81 @@
+//===-- MSP430RegisterInfo.td - MSP430 Register defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MSP430 register file
+//===----------------------------------------------------------------------===//
+
+class MSP430Reg<bits<4> num, string n> : Register<n> {
+  field bits<4> Num = num;
+  let Namespace = "MSP430";
+}
+
+class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs> 
+  : RegisterWithSubRegs<n, subregs> {
+  field bits<4> Num = num;
+  let Namespace = "MSP430";
+}
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+def PCB  : MSP430Reg<0,  "r0">;
+def SPB  : MSP430Reg<1,  "r1">;
+def SRB  : MSP430Reg<2,  "r2">;
+def CGB  : MSP430Reg<3,  "r3">;
+def FPB  : MSP430Reg<4,  "r4">;
+def R5B  : MSP430Reg<5,  "r5">;
+def R6B  : MSP430Reg<6,  "r6">;
+def R7B  : MSP430Reg<7,  "r7">;
+def R8B  : MSP430Reg<8,  "r8">;
+def R9B  : MSP430Reg<9,  "r9">;
+def R10B : MSP430Reg<10, "r10">;
+def R11B : MSP430Reg<11, "r11">;
+def R12B : MSP430Reg<12, "r12">;
+def R13B : MSP430Reg<13, "r13">;
+def R14B : MSP430Reg<14, "r14">;
+def R15B : MSP430Reg<15, "r15">;
+
+def subreg_8bit : SubRegIndex { let Namespace = "MSP430"; }
+
+let SubRegIndices = [subreg_8bit] in {
+def PCW  : MSP430RegWithSubregs<0,  "r0",  [PCB]>;
+def SPW  : MSP430RegWithSubregs<1,  "r1",  [SPB]>;
+def SRW  : MSP430RegWithSubregs<2,  "r2",  [SRB]>;
+def CGW  : MSP430RegWithSubregs<3,  "r3",  [CGB]>;
+def FPW  : MSP430RegWithSubregs<4,  "r4",  [FPB]>;
+def R5W  : MSP430RegWithSubregs<5,  "r5",  [R5B]>;
+def R6W  : MSP430RegWithSubregs<6,  "r6",  [R6B]>;
+def R7W  : MSP430RegWithSubregs<7,  "r7",  [R7B]>;
+def R8W  : MSP430RegWithSubregs<8,  "r8",  [R8B]>;
+def R9W  : MSP430RegWithSubregs<9,  "r9",  [R9B]>;
+def R10W : MSP430RegWithSubregs<10, "r10", [R10B]>;
+def R11W : MSP430RegWithSubregs<11, "r11", [R11B]>;
+def R12W : MSP430RegWithSubregs<12, "r12", [R12B]>;
+def R13W : MSP430RegWithSubregs<13, "r13", [R13B]>;
+def R14W : MSP430RegWithSubregs<14, "r14", [R14B]>;
+def R15W : MSP430RegWithSubregs<15, "r15", [R15B]>;
+}
+
+def GR8 : RegisterClass<"MSP430", [i8], 8,
+   // Volatile registers
+  (add R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B,
+   // Frame pointer, sometimes allocable
+   FPB,
+   // Volatile, but not allocable
+   PCB, SPB, SRB, CGB)>;
+
+def GR16 : RegisterClass<"MSP430", [i16], 16,
+   // Volatile registers
+  (add R12W, R13W, R14W, R15W, R11W, R10W, R9W, R8W, R7W, R6W, R5W,
+   // Frame pointer, sometimes allocable
+   FPW,
+   // Volatile, but not allocable
+   PCW, SPW, SRW, CGW)>;
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
new file mode 100644
index 000000000000..24f45fa3881b
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- MSP430SelectionDAGInfo.cpp - MSP430 SelectionDAG Info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msp430-selectiondag-info"
+#include "MSP430TargetMachine.h"
+using namespace llvm;
+
+MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const MSP430TargetMachine &TM)
+  : TargetSelectionDAGInfo(TM) {
+}
+
+MSP430SelectionDAGInfo::~MSP430SelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h
new file mode 100644
index 000000000000..fa8194830ff8
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430SelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- MSP430SelectionDAGInfo.h - MSP430 SelectionDAG Info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MSP430 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430SELECTIONDAGINFO_H
+#define MSP430SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class MSP430TargetMachine;
+
+class MSP430SelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit MSP430SelectionDAGInfo(const MSP430TargetMachine &TM);
+  ~MSP430SelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
new file mode 100644
index 000000000000..edeaf34676bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -0,0 +1,34 @@
+//===-- MSP430Subtarget.cpp - MSP430 Subtarget Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430Subtarget.h"
+#include "MSP430.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "MSP430GenSubtargetInfo.inc"
+
+using namespace llvm;
+
+void MSP430Subtarget::anchor() { }
+
+MSP430Subtarget::MSP430Subtarget(const std::string &TT,
+                                 const std::string &CPU,
+                                 const std::string &FS) :
+  MSP430GenSubtargetInfo(TT, CPU, FS) {
+  std::string CPUName = "generic";
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
new file mode 100644
index 000000000000..4d8792eede7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
@@ -0,0 +1,42 @@
+//===-- MSP430Subtarget.h - Define Subtarget for the MSP430 ----*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_SUBTARGET_H
+#define LLVM_TARGET_MSP430_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "MSP430GenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class MSP430Subtarget : public MSP430GenSubtargetInfo {
+  virtual void anchor();
+  bool ExtendedInsts;
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  MSP430Subtarget(const std::string &TT, const std::string &CPU,
+                  const std::string &FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+};
+} // End llvm namespace
+
+#endif  // LLVM_TARGET_MSP430_SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
new file mode 100644
index 000000000000..164e351df952
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -0,0 +1,71 @@
+//===-- MSP430TargetMachine.cpp - Define TargetMachine for MSP430 ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430TargetMachine.h"
+#include "MSP430.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeMSP430Target() {
+  // Register the target.
+  RegisterTargetMachine<MSP430TargetMachine> X(TheMSP430Target);
+}
+
+MSP430TargetMachine::MSP430TargetMachine(const Target &T,
+                                         StringRef TT,
+                                         StringRef CPU,
+                                         StringRef FS,
+                                         const TargetOptions &Options,
+                                         Reloc::Model RM, CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS),
+    // FIXME: Check DataLayout string.
+    DL("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"),
+    InstrInfo(*this), TLInfo(*this), TSInfo(*this),
+    FrameLowering(Subtarget) { }
+
+namespace {
+/// MSP430 Code Generator Pass Configuration Options.
+class MSP430PassConfig : public TargetPassConfig {
+public:
+  MSP430PassConfig(MSP430TargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  MSP430TargetMachine &getMSP430TargetMachine() const {
+    return getTM<MSP430TargetMachine>();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new MSP430PassConfig(this, PM);
+}
+
+bool MSP430PassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel()));
+  return false;
+}
+
+bool MSP430PassConfig::addPreEmitPass() {
+  // Must run branch selection immediately preceding the asm printer.
+  addPass(createMSP430BranchSelectionPass());
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h
new file mode 100644
index 000000000000..be695a211109
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.h
@@ -0,0 +1,69 @@
+//===-- MSP430TargetMachine.h - Define TargetMachine for MSP430 -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_TARGET_MSP430_TARGETMACHINE_H
+#define LLVM_TARGET_MSP430_TARGETMACHINE_H
+
+#include "MSP430FrameLowering.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430RegisterInfo.h"
+#include "MSP430SelectionDAGInfo.h"
+#include "MSP430Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// MSP430TargetMachine
+///
+class MSP430TargetMachine : public LLVMTargetMachine {
+  MSP430Subtarget        Subtarget;
+  const DataLayout       DL;       // Calculates type size & alignment
+  MSP430InstrInfo        InstrInfo;
+  MSP430TargetLowering   TLInfo;
+  MSP430SelectionDAGInfo TSInfo;
+  MSP430FrameLowering    FrameLowering;
+
+public:
+  MSP430TargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
+                      Reloc::Model RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+
+  virtual const TargetFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  virtual const MSP430InstrInfo *getInstrInfo() const  { return &InstrInfo; }
+  virtual const DataLayout *getDataLayout() const     { return &DL;}
+  virtual const MSP430Subtarget *getSubtargetImpl() const { return &Subtarget; }
+
+  virtual const TargetRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  virtual const MSP430TargetLowering *getTargetLowering() const {
+    return &TLInfo;
+  }
+
+  virtual const MSP430SelectionDAGInfo* getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+}; // MSP430TargetMachine.
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_MSP430_TARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
new file mode 100644
index 000000000000..0d71d04ebe22
--- /dev/null
+++ b/contrib/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- MSP430TargetInfo.cpp - MSP430 Target Implementation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMSP430Target;
+
+extern "C" void LLVMInitializeMSP430TargetInfo() { 
+  RegisterTarget<Triple::msp430> 
+    X(TheMSP430Target, "msp430", "MSP430 [experimental]");
+}
diff --git a/contrib/llvm/lib/Target/Mangler.cpp b/contrib/llvm/lib/Target/Mangler.cpp
new file mode 100644
index 000000000000..edfd421d8532
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mangler.cpp
@@ -0,0 +1,237 @@
+//===-- Mangler.cpp - Self-contained c/asm llvm name mangler --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Unified name mangler for assembly backends.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/Mangler.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static bool isAcceptableChar(char C, bool AllowPeriod, bool AllowUTF8) {
+  if ((C < 'a' || C > 'z') &&
+      (C < 'A' || C > 'Z') &&
+      (C < '0' || C > '9') &&
+      C != '_' && C != '$' && C != '@' &&
+      !(AllowPeriod && C == '.') &&
+      !(AllowUTF8 && (C & 0x80)))
+    return false;
+  return true;
+}
+
+static char HexDigit(int V) {
+  return V < 10 ? V+'0' : V+'A'-10;
+}
+
+static void MangleLetter(SmallVectorImpl<char> &OutName, unsigned char C) {
+  OutName.push_back('_');
+  OutName.push_back(HexDigit(C >> 4));
+  OutName.push_back(HexDigit(C & 15));
+  OutName.push_back('_');
+}
+
+/// NameNeedsEscaping - Return true if the identifier \p Str needs quotes
+/// for this assembler.
+static bool NameNeedsEscaping(StringRef Str, const MCAsmInfo &MAI) {
+  assert(!Str.empty() && "Cannot create an empty MCSymbol");
+  
+  // If the first character is a number and the target does not allow this, we
+  // need quotes.
+  if (!MAI.doesAllowNameToStartWithDigit() && Str[0] >= '0' && Str[0] <= '9')
+    return true;
+  
+  // If any of the characters in the string is an unacceptable character, force
+  // quotes.
+  bool AllowPeriod = MAI.doesAllowPeriodsInName();
+  bool AllowUTF8 = MAI.doesAllowUTF8();
+  for (unsigned i = 0, e = Str.size(); i != e; ++i)
+    if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8))
+      return true;
+  return false;
+}
+
+/// appendMangledName - Add the specified string in mangled form if it uses
+/// any unusual characters.
+static void appendMangledName(SmallVectorImpl<char> &OutName, StringRef Str,
+                              const MCAsmInfo &MAI) {
+  // The first character is not allowed to be a number unless the target
+  // explicitly allows it.
+  if (!MAI.doesAllowNameToStartWithDigit() && Str[0] >= '0' && Str[0] <= '9') {
+    MangleLetter(OutName, Str[0]);
+    Str = Str.substr(1);
+  }
+
+  bool AllowPeriod = MAI.doesAllowPeriodsInName();
+  bool AllowUTF8 = MAI.doesAllowUTF8();
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8))
+      MangleLetter(OutName, Str[i]);
+    else
+      OutName.push_back(Str[i]);
+  }
+}
+
+
+/// appendMangledQuotedName - On systems that support quoted symbols, we still
+/// have to escape some (obscure) characters like " and \n which would break the
+/// assembler's lexing.
+static void appendMangledQuotedName(SmallVectorImpl<char> &OutName,
+                                   StringRef Str) {
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    if (Str[i] == '"' || Str[i] == '\n')
+      MangleLetter(OutName, Str[i]);
+    else
+      OutName.push_back(Str[i]);
+  }
+}
+
+
+/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
+/// and the specified name as the global variable name.  GVName must not be
+/// empty.
+void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
+                                const Twine &GVName, ManglerPrefixTy PrefixTy) {
+  SmallString<256> TmpData;
+  StringRef Name = GVName.toStringRef(TmpData);
+  assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
+  
+  const MCAsmInfo &MAI = Context.getAsmInfo();
+  
+  // If the global name is not led with \1, add the appropriate prefixes.
+  if (Name[0] == '\1') {
+    Name = Name.substr(1);
+  } else {
+    if (PrefixTy == Mangler::Private) {
+      const char *Prefix = MAI.getPrivateGlobalPrefix();
+      OutName.append(Prefix, Prefix+strlen(Prefix));
+    } else if (PrefixTy == Mangler::LinkerPrivate) {
+      const char *Prefix = MAI.getLinkerPrivateGlobalPrefix();
+      OutName.append(Prefix, Prefix+strlen(Prefix));
+    }
+
+    const char *Prefix = MAI.getGlobalPrefix();
+    if (Prefix[0] == 0)
+      ; // Common noop, no prefix.
+    else if (Prefix[1] == 0)
+      OutName.push_back(Prefix[0]);  // Common, one character prefix.
+    else
+      OutName.append(Prefix, Prefix+strlen(Prefix)); // Arbitrary length prefix.
+  }
+  
+  // If this is a simple string that doesn't need escaping, just append it.
+  if (!NameNeedsEscaping(Name, MAI) ||
+      // If quotes are supported, they can be used unless the string contains
+      // a quote or newline.
+      (MAI.doesAllowQuotesInName() &&
+       Name.find_first_of("\n\"") == StringRef::npos)) {
+    OutName.append(Name.begin(), Name.end());
+    return;
+  }
+  
+  // On systems that do not allow quoted names, we need to mangle most
+  // strange characters.
+  if (!MAI.doesAllowQuotesInName())
+    return appendMangledName(OutName, Name, MAI);
+  
+  // Okay, the system allows quoted strings.  We can quote most anything, the
+  // only characters that need escaping are " and \n.
+  assert(Name.find_first_of("\n\"") != StringRef::npos);
+  return appendMangledQuotedName(OutName, Name);
+}
+
+/// AddFastCallStdCallSuffix - Microsoft fastcall and stdcall functions require
+/// a suffix on their name indicating the number of words of arguments they
+/// take.
+static void AddFastCallStdCallSuffix(SmallVectorImpl<char> &OutName,
+                                     const Function *F, const DataLayout &TD) {
+  // Calculate arguments size total.
+  unsigned ArgWords = 0;
+  for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+       AI != AE; ++AI) {
+    Type *Ty = AI->getType();
+    // 'Dereference' type in case of byval parameter attribute
+    if (AI->hasByValAttr())
+      Ty = cast<PointerType>(Ty)->getElementType();
+    // Size should be aligned to DWORD boundary
+    ArgWords += ((TD.getTypeAllocSize(Ty) + 3)/4)*4;
+  }
+  
+  raw_svector_ostream(OutName) << '@' << ArgWords;
+}
+
+
+/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
+/// and the specified global variable's name.  If the global variable doesn't
+/// have a name, this fills in a unique name for the global.
+void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
+                                const GlobalValue *GV,
+                                bool isImplicitlyPrivate) {
+  ManglerPrefixTy PrefixTy = Mangler::Default;
+  if (GV->hasPrivateLinkage() || isImplicitlyPrivate)
+    PrefixTy = Mangler::Private;
+  else if (GV->hasLinkerPrivateLinkage() || GV->hasLinkerPrivateWeakLinkage())
+    PrefixTy = Mangler::LinkerPrivate;
+  
+  // If this global has a name, handle it simply.
+  if (GV->hasName()) {
+    getNameWithPrefix(OutName, GV->getName(), PrefixTy);
+  } else {
+    // Get the ID for the global, assigning a new one if we haven't got one
+    // already.
+    unsigned &ID = AnonGlobalIDs[GV];
+    if (ID == 0) ID = NextAnonGlobalID++;
+  
+    // Must mangle the global into a unique ID.
+    getNameWithPrefix(OutName, "__unnamed_" + Twine(ID), PrefixTy);
+  }
+  
+  // If we are supposed to add a microsoft-style suffix for stdcall/fastcall,
+  // add it.
+  if (Context.getAsmInfo().hasMicrosoftFastStdCallMangling()) {
+    if (const Function *F = dyn_cast<Function>(GV)) {
+      CallingConv::ID CC = F->getCallingConv();
+    
+      // fastcall functions need to start with @.
+      // FIXME: This logic seems unlikely to be right.
+      if (CC == CallingConv::X86_FastCall) {
+        if (OutName[0] == '_')
+          OutName[0] = '@';
+        else
+          OutName.insert(OutName.begin(), '@');
+      }
+    
+      // fastcall and stdcall functions usually need @42 at the end to specify
+      // the argument info.
+      FunctionType *FT = F->getFunctionType();
+      if ((CC == CallingConv::X86_FastCall || CC == CallingConv::X86_StdCall) &&
+          // "Pure" variadic functions do not receive @0 suffix.
+          (!FT->isVarArg() || FT->getNumParams() == 0 ||
+           (FT->getNumParams() == 1 && F->hasStructRetAttr())))
+        AddFastCallStdCallSuffix(OutName, F, TD);
+    }
+  }
+}
+
+/// getSymbol - Return the MCSymbol for the specified global value.  This
+/// symbol is the main label that is the address of the global.
+MCSymbol *Mangler::getSymbol(const GlobalValue *GV) {
+  SmallString<60> NameStr;
+  getNameWithPrefix(NameStr, GV, false);
+  return Context.GetOrCreateSymbol(NameStr.str());
+}
+
+
diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
new file mode 100644
index 000000000000..c403f216b0d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -0,0 +1,1775 @@
+//===-- MipsAsmParser.cpp - Parse Mips assembly to MCInst instructions ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace {
+class MipsAssemblerOptions {
+public:
+  MipsAssemblerOptions():
+    aTReg(1), reorder(true), macro(true) {
+  }
+
+  unsigned getATRegNum() {return aTReg;}
+  bool setATReg(unsigned Reg);
+
+  bool isReorder() {return reorder;}
+  void setReorder() {reorder = true;}
+  void setNoreorder() {reorder = false;}
+
+  bool isMacro() {return macro;}
+  void setMacro() {macro = true;}
+  void setNomacro() {macro = false;}
+
+private:
+  unsigned aTReg;
+  bool reorder;
+  bool macro;
+};
+}
+
+namespace {
+class MipsAsmParser : public MCTargetAsmParser {
+
+  enum FpFormatTy {
+    FP_FORMAT_NONE = -1,
+    FP_FORMAT_S,
+    FP_FORMAT_D,
+    FP_FORMAT_L,
+    FP_FORMAT_W
+  } FpFormat;
+
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  MipsAssemblerOptions Options;
+
+
+#define GET_ASSEMBLER_HEADER
+#include "MipsGenAsmMatcher.inc"
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               MCStreamer &Out, unsigned &ErrorInfo,
+                               bool MatchingInlineAsm);
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  bool parseMathOperation(StringRef Name, SMLoc NameLoc,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  bool ParseDirective(AsmToken DirectiveID);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseCPURegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseCPU64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseHW64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  bool searchSymbolAlias(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                         unsigned RegisterClass);
+
+  bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &,
+                    StringRef Mnemonic);
+
+  int tryParseRegister(bool is64BitReg);
+
+  bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               bool is64BitReg);
+
+  bool needsExpansion(MCInst &Inst);
+
+  void expandInstruction(MCInst &Inst, SMLoc IDLoc,
+                         SmallVectorImpl<MCInst> &Instructions);
+  void expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+                     SmallVectorImpl<MCInst> &Instructions);
+  void expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+                            SmallVectorImpl<MCInst> &Instructions);
+  void expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+                            SmallVectorImpl<MCInst> &Instructions);
+  void expandMemInst(MCInst &Inst, SMLoc IDLoc,
+                     SmallVectorImpl<MCInst> &Instructions,
+                     bool isLoad,bool isImmOpnd);
+  bool reportParseError(StringRef ErrorMsg);
+
+  bool parseMemOffset(const MCExpr *&Res);
+  bool parseRelocOperand(const MCExpr *&Res);
+
+  bool parseDirectiveSet();
+
+  bool parseSetAtDirective();
+  bool parseSetNoAtDirective();
+  bool parseSetMacroDirective();
+  bool parseSetNoMacroDirective();
+  bool parseSetReorderDirective();
+  bool parseSetNoReorderDirective();
+
+  bool parseSetAssignment();
+
+  bool parseDirectiveWord(unsigned Size, SMLoc L);
+
+  MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
+
+  bool isMips64() const {
+    return (STI.getFeatureBits() & Mips::FeatureMips64) != 0;
+  }
+
+  bool isFP64() const {
+    return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
+  }
+
+  int matchRegisterName(StringRef Symbol, bool is64BitReg);
+
+  int matchCPURegisterName(StringRef Symbol);
+
+  int matchRegisterByNumber(unsigned RegNum, unsigned RegClass);
+
+  void setFpFormat(FpFormatTy Format) {
+    FpFormat = Format;
+  }
+
+  void setDefaultFpFormat();
+
+  void setFpFormat(StringRef Format);
+
+  FpFormatTy getFpFormat() {return FpFormat;}
+
+  bool requestsDoubleOperand(StringRef Mnemonic);
+
+  unsigned getReg(int RC,int RegNo);
+
+  int getATReg();
+
+  bool processInstruction(MCInst &Inst, SMLoc IDLoc,
+                        SmallVectorImpl<MCInst> &Instructions);
+public:
+  MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
+    : MCTargetAsmParser(), STI(sti), Parser(parser) {
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+};
+}
+
+namespace {
+
+/// MipsOperand - Instances of this class represent a parsed Mips machine
+/// instruction.
+class MipsOperand : public MCParsedAsmOperand {
+
+public:
+  enum RegisterKind {
+    Kind_None,
+    Kind_CPURegs,
+    Kind_CPU64Regs,
+    Kind_HWRegs,
+    Kind_HW64Regs,
+    Kind_FGR32Regs,
+    Kind_FGR64Regs,
+    Kind_AFGR64Regs,
+    Kind_CCRRegs
+  };
+
+private:
+  enum KindTy {
+    k_CondCode,
+    k_CoprocNum,
+    k_Immediate,
+    k_Memory,
+    k_PostIndexRegister,
+    k_Register,
+    k_Token
+  } Kind;
+
+  MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+  struct Token {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNum;
+    RegisterKind Kind;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned Base;
+    const MCExpr *Off;
+  };
+
+  union {
+    struct Token Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+
+  SMLoc StartLoc, EndLoc;
+
+public:
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const{
+    // Add as immediate when possible.  Null MCExpr = 0.
+    if (Expr == 0)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCExpr *Expr = getImm();
+    addExpr(Inst,Expr);
+  }
+
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+
+    const MCExpr *Expr = getMemOff();
+    addExpr(Inst,Expr);
+  }
+
+  bool isReg() const { return Kind == k_Register; }
+  bool isImm() const { return Kind == k_Immediate; }
+  bool isToken() const { return Kind == k_Token; }
+  bool isMem() const { return Kind == k_Memory; }
+
+  StringRef getToken() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const {
+    assert((Kind == k_Register) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  void setRegKind(RegisterKind RegKind) {
+    assert((Kind == k_Register) && "Invalid access!");
+    Reg.Kind = RegKind;
+  }
+
+  const MCExpr *getImm() const {
+    assert((Kind == k_Immediate) && "Invalid access!");
+    return Imm.Val;
+  }
+
+  unsigned getMemBase() const {
+    assert((Kind == k_Memory) && "Invalid access!");
+    return Mem.Base;
+  }
+
+  const MCExpr *getMemOff() const {
+    assert((Kind == k_Memory) && "Invalid access!");
+    return Mem.Off;
+  }
+
+  static MipsOperand *CreateToken(StringRef Str, SMLoc S) {
+    MipsOperand *Op = new MipsOperand(k_Token);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
+  static MipsOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_Register);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_Immediate);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  static MipsOperand *CreateMem(unsigned Base, const MCExpr *Off,
+                                 SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_Memory);
+    Op->Mem.Base = Base;
+    Op->Mem.Off = Off;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
+  bool isCPURegsAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_CPURegs;
+  }
+  void addCPURegsAsmOperands(MCInst &Inst, unsigned N) const {
+    Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
+  }
+
+  bool isCPU64RegsAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_CPU64Regs;
+  }
+  void addCPU64RegsAsmOperands(MCInst &Inst, unsigned N) const {
+    Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
+  }
+
+  bool isHWRegsAsm() const {
+    assert((Kind == k_Register) && "Invalid access!");
+    return Reg.Kind == Kind_HWRegs;
+  }
+  void addHWRegsAsmOperands(MCInst &Inst, unsigned N) const {
+    Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
+  }
+
+  bool isHW64RegsAsm() const {
+    assert((Kind == k_Register) && "Invalid access!");
+    return Reg.Kind == Kind_HW64Regs;
+  }
+  void addHW64RegsAsmOperands(MCInst &Inst, unsigned N) const {
+    Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
+  }
+
+  void addCCRAsmOperands(MCInst &Inst, unsigned N) const {
+    Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
+  }
+
+  bool isCCRAsm() const {
+    assert((Kind == k_Register) && "Invalid access!");
+    return Reg.Kind == Kind_CCRRegs;
+  }
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const { return EndLoc; }
+
+  virtual void print(raw_ostream &OS) const {
+    llvm_unreachable("unimplemented!");
+  }
+};
+}
+
+namespace llvm {
+extern const MCInstrDesc MipsInsts[];
+}
+static const MCInstrDesc &getInstDesc(unsigned Opcode) {
+  return MipsInsts[Opcode];
+}
+
+bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
+                        SmallVectorImpl<MCInst> &Instructions) {
+  const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+  Inst.setLoc(IDLoc);
+  if (MCID.mayLoad() || MCID.mayStore()) {
+    // Check the offset of memory operand, if it is a symbol
+    // reference or immediate we may have to expand instructions
+    for (unsigned i=0;i<MCID.getNumOperands();i++) {
+      const MCOperandInfo &OpInfo = MCID.OpInfo[i];
+      if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+          (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+        MCOperand &Op = Inst.getOperand(i);
+        if (Op.isImm()) {
+          int MemOffset = Op.getImm();
+          if (MemOffset < -32768 || MemOffset > 32767) {
+            // Offset can't exceed 16bit value
+            expandMemInst(Inst,IDLoc,Instructions,MCID.mayLoad(),true);
+            return false;
+          }
+        } else if (Op.isExpr()) {
+          const MCExpr *Expr = Op.getExpr();
+          if (Expr->getKind() == MCExpr::SymbolRef){
+            const MCSymbolRefExpr *SR =
+                    static_cast<const MCSymbolRefExpr*>(Expr);
+            if (SR->getKind() == MCSymbolRefExpr::VK_None) {
+              // Expand symbol
+              expandMemInst(Inst,IDLoc,Instructions,MCID.mayLoad(),false);
+              return false;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (needsExpansion(Inst))
+    expandInstruction(Inst, IDLoc, Instructions);
+  else
+    Instructions.push_back(Inst);
+
+  return false;
+}
+
+bool MipsAsmParser::needsExpansion(MCInst &Inst) {
+
+  switch(Inst.getOpcode()) {
+    case Mips::LoadImm32Reg:
+    case Mips::LoadAddr32Imm:
+    case Mips::LoadAddr32Reg:
+      return true;
+    default:
+      return false;
+  }
+}
+
+void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
+                        SmallVectorImpl<MCInst> &Instructions){
+  switch(Inst.getOpcode()) {
+    case Mips::LoadImm32Reg:
+      return expandLoadImm(Inst, IDLoc, Instructions);
+    case Mips::LoadAddr32Imm:
+      return expandLoadAddressImm(Inst,IDLoc,Instructions);
+    case Mips::LoadAddr32Reg:
+      return expandLoadAddressReg(Inst,IDLoc,Instructions);
+    }
+}
+
+void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+                                  SmallVectorImpl<MCInst> &Instructions){
+  MCInst tmpInst;
+  const MCOperand &ImmOp = Inst.getOperand(1);
+  assert(ImmOp.isImm() && "expected immediate operand kind");
+  const MCOperand &RegOp = Inst.getOperand(0);
+  assert(RegOp.isReg() && "expected register operand kind");
+
+  int ImmValue = ImmOp.getImm();
+  tmpInst.setLoc(IDLoc);
+  if ( 0 <= ImmValue && ImmValue <= 65535) {
+    // for 0 <= j <= 65535.
+    // li d,j => ori d,$zero,j
+    tmpInst.setOpcode(Mips::ORi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(
+              MCOperand::CreateReg(Mips::ZERO));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+    Instructions.push_back(tmpInst);
+  } else if ( ImmValue < 0 && ImmValue >= -32768) {
+    // for -32768 <= j < 0.
+    // li d,j => addiu d,$zero,j
+    tmpInst.setOpcode(Mips::ADDiu);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(
+              MCOperand::CreateReg(Mips::ZERO));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+    Instructions.push_back(tmpInst);
+  } else {
+    // for any other value of j that is representable as a 32-bit integer.
+    // li d,j => lui d,hi16(j)
+    //           ori d,d,lo16(j)
+    tmpInst.setOpcode(Mips::LUi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
+    Instructions.push_back(tmpInst);
+    tmpInst.clear();
+    tmpInst.setOpcode(Mips::ORi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
+    tmpInst.setLoc(IDLoc);
+    Instructions.push_back(tmpInst);
+  }
+}
+
+void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+                                         SmallVectorImpl<MCInst> &Instructions){
+  MCInst tmpInst;
+  const MCOperand &ImmOp = Inst.getOperand(2);
+  assert(ImmOp.isImm() && "expected immediate operand kind");
+  const MCOperand &SrcRegOp = Inst.getOperand(1);
+  assert(SrcRegOp.isReg() && "expected register operand kind");
+  const MCOperand &DstRegOp = Inst.getOperand(0);
+  assert(DstRegOp.isReg() && "expected register operand kind");
+  int ImmValue = ImmOp.getImm();
+  if ( -32768 <= ImmValue && ImmValue <= 65535) {
+    //for -32768 <= j <= 65535.
+    //la d,j(s) => addiu d,s,j
+    tmpInst.setOpcode(Mips::ADDiu);
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+    Instructions.push_back(tmpInst);
+  } else {
+    //for any other value of j that is representable as a 32-bit integer.
+    //la d,j(s) => lui d,hi16(j)
+    //             ori d,d,lo16(j)
+    //             addu d,d,s
+    tmpInst.setOpcode(Mips::LUi);
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
+    Instructions.push_back(tmpInst);
+    tmpInst.clear();
+    tmpInst.setOpcode(Mips::ORi);
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
+    Instructions.push_back(tmpInst);
+    tmpInst.clear();
+    tmpInst.setOpcode(Mips::ADDu);
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg()));
+    Instructions.push_back(tmpInst);
+  }
+}
+
+void MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+                                         SmallVectorImpl<MCInst> &Instructions){
+  MCInst tmpInst;
+  const MCOperand &ImmOp = Inst.getOperand(1);
+  assert(ImmOp.isImm() && "expected immediate operand kind");
+  const MCOperand &RegOp = Inst.getOperand(0);
+  assert(RegOp.isReg() && "expected register operand kind");
+  int ImmValue = ImmOp.getImm();
+  if ( -32768 <= ImmValue && ImmValue <= 65535) {
+    //for -32768 <= j <= 65535.
+    //la d,j => addiu d,$zero,j
+    tmpInst.setOpcode(Mips::ADDiu);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(
+              MCOperand::CreateReg(Mips::ZERO));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+    Instructions.push_back(tmpInst);
+  } else {
+    //for any other value of j that is representable as a 32-bit integer.
+    //la d,j => lui d,hi16(j)
+    //          ori d,d,lo16(j)
+    tmpInst.setOpcode(Mips::LUi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
+    Instructions.push_back(tmpInst);
+    tmpInst.clear();
+    tmpInst.setOpcode(Mips::ORi);
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+    tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
+    Instructions.push_back(tmpInst);
+  }
+}
+
+void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
+                     SmallVectorImpl<MCInst> &Instructions,
+                     bool isLoad,bool isImmOpnd) {
+  const MCSymbolRefExpr *SR;
+  MCInst TempInst;
+  unsigned ImmOffset,HiOffset,LoOffset;
+  const MCExpr *ExprOffset;
+  unsigned TmpRegNum;
+  unsigned AtRegNum = getReg((isMips64()) ? Mips::CPU64RegsRegClassID:
+                                            Mips::CPURegsRegClassID,
+                                            getATReg());
+  // 1st operand is either source or dst register
+  assert(Inst.getOperand(0).isReg() && "expected register operand kind");
+  unsigned RegOpNum = Inst.getOperand(0).getReg();
+  // 2nd operand is base register
+  assert(Inst.getOperand(1).isReg() && "expected register operand kind");
+  unsigned BaseRegNum = Inst.getOperand(1).getReg();
+  // 3rd operand is either immediate or expression
+  if (isImmOpnd) {
+    assert(Inst.getOperand(2).isImm() && "expected immediate operand kind");
+    ImmOffset = Inst.getOperand(2).getImm();
+    LoOffset = ImmOffset & 0x0000ffff;
+    HiOffset = (ImmOffset & 0xffff0000) >> 16;
+    // If msb of LoOffset is 1(negative number) we must increment HiOffset
+    if (LoOffset & 0x8000)
+      HiOffset++;
+  }
+  else
+    ExprOffset = Inst.getOperand(2).getExpr();
+  // All instructions will have the same location
+  TempInst.setLoc(IDLoc);
+  // 1st instruction in expansion is LUi. For load instruction we can use
+  // the dst register as a temporary if base and dst are different,
+  // but for stores we must use $at
+  TmpRegNum = (isLoad && (BaseRegNum != RegOpNum))?RegOpNum:AtRegNum;
+  TempInst.setOpcode(Mips::LUi);
+  TempInst.addOperand(MCOperand::CreateReg(TmpRegNum));
+  if (isImmOpnd)
+    TempInst.addOperand(MCOperand::CreateImm(HiOffset));
+  else {
+    if (ExprOffset->getKind() == MCExpr::SymbolRef) {
+      SR = static_cast<const MCSymbolRefExpr*>(ExprOffset);
+      const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::
+                                        Create(SR->getSymbol().getName(),
+                                        MCSymbolRefExpr::VK_Mips_ABS_HI,
+                                        getContext());
+      TempInst.addOperand(MCOperand::CreateExpr(HiExpr));
+    }
+  }
+  // Add the instruction to the list
+  Instructions.push_back(TempInst);
+  // and prepare TempInst for next instruction
+  TempInst.clear();
+  // which is add temp register to base
+  TempInst.setOpcode(Mips::ADDu);
+  TempInst.addOperand(MCOperand::CreateReg(TmpRegNum));
+  TempInst.addOperand(MCOperand::CreateReg(TmpRegNum));
+  TempInst.addOperand(MCOperand::CreateReg(BaseRegNum));
+  Instructions.push_back(TempInst);
+  TempInst.clear();
+  // and finaly, create original instruction with low part
+  // of offset and new base
+  TempInst.setOpcode(Inst.getOpcode());
+  TempInst.addOperand(MCOperand::CreateReg(RegOpNum));
+  TempInst.addOperand(MCOperand::CreateReg(TmpRegNum));
+  if (isImmOpnd)
+    TempInst.addOperand(MCOperand::CreateImm(LoOffset));
+  else {
+    if (ExprOffset->getKind() == MCExpr::SymbolRef) {
+      const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::
+                                      Create(SR->getSymbol().getName(),
+                                      MCSymbolRefExpr::VK_Mips_ABS_LO,
+                                      getContext());
+      TempInst.addOperand(MCOperand::CreateExpr(LoExpr));
+    }
+  }
+  Instructions.push_back(TempInst);
+  TempInst.clear();
+}
+
+bool MipsAsmParser::
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                        MCStreamer &Out, unsigned &ErrorInfo,
+                        bool MatchingInlineAsm) {
+  MCInst Inst;
+  SmallVector<MCInst, 8> Instructions;
+  unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+                                              MatchingInlineAsm);
+
+  switch (MatchResult) {
+  default: break;
+  case Match_Success: {
+    if (processInstruction(Inst,IDLoc,Instructions))
+      return true;
+    for(unsigned i =0; i < Instructions.size(); i++)
+      Out.EmitInstruction(Instructions[i]);
+    return false;
+  }
+  case Match_MissingFeature:
+    Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+    return true;
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((MipsOperand*)Operands[ErrorInfo])->getStartLoc();
+      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    }
+
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  case Match_MnemonicFail:
+    return Error(IDLoc, "invalid instruction");
+  }
+  return true;
+}
+
+int MipsAsmParser::matchCPURegisterName(StringRef Name) {
+   int CC;
+
+  if (Name == "at")
+    return getATReg();
+
+    CC = StringSwitch<unsigned>(Name)
+    .Case("zero", 0)
+    .Case("a0",   4)
+    .Case("a1",   5)
+    .Case("a2",   6)
+    .Case("a3",   7)
+    .Case("v0",   2)
+    .Case("v1",   3)
+    .Case("s0",  16)
+    .Case("s1",  17)
+    .Case("s2",  18)
+    .Case("s3",  19)
+    .Case("s4",  20)
+    .Case("s5",  21)
+    .Case("s6",  22)
+    .Case("s7",  23)
+    .Case("k0",  26)
+    .Case("k1",  27)
+    .Case("sp",  29)
+    .Case("fp",  30)
+    .Case("gp",  28)
+    .Case("ra",  31)
+    .Case("t0",   8)
+    .Case("t1",   9)
+    .Case("t2",  10)
+    .Case("t3",  11)
+    .Case("t4",  12)
+    .Case("t5",  13)
+    .Case("t6",  14)
+    .Case("t7",  15)
+    .Case("t8",  24)
+    .Case("t9",  25)
+    .Default(-1);
+
+  // Although SGI documentation just cut out t0-t3 for n32/n64,
+  // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
+  // We are supporting both cases, so for t0-t3 we'll just push them to t4-t7.
+  if (isMips64() && 8 <= CC  && CC <= 11)
+    CC += 4;
+
+  if (CC == -1 && isMips64())
+    CC = StringSwitch<unsigned>(Name)
+      .Case("a4",   8)
+      .Case("a5",   9)
+      .Case("a6",  10)
+      .Case("a7",  11)
+      .Case("kt0", 26)
+      .Case("kt1", 27)
+      .Case("s8",  30)
+      .Default(-1);
+
+  return CC;
+}
+int MipsAsmParser::matchRegisterName(StringRef Name, bool is64BitReg) {
+
+  int CC;
+  CC = matchCPURegisterName(Name);
+  if (CC != -1)
+    return matchRegisterByNumber(CC,is64BitReg?Mips::CPU64RegsRegClassID:
+                               Mips::CPURegsRegClassID);
+
+  if (Name[0] == 'f') {
+    StringRef NumString = Name.substr(1);
+    unsigned IntVal;
+    if( NumString.getAsInteger(10, IntVal))
+      return -1; // not integer
+    if (IntVal > 31)
+      return -1;
+
+    FpFormatTy Format = getFpFormat();
+
+    if (Format == FP_FORMAT_S || Format == FP_FORMAT_W)
+      return getReg(Mips::FGR32RegClassID, IntVal);
+    if (Format == FP_FORMAT_D) {
+      if(isFP64()) {
+        return getReg(Mips::FGR64RegClassID, IntVal);
+      }
+      // only even numbers available as register pairs
+      if (( IntVal > 31) || (IntVal%2 !=  0))
+        return -1;
+      return getReg(Mips::AFGR64RegClassID, IntVal/2);
+    }
+  }
+
+  return -1;
+}
+void MipsAsmParser::setDefaultFpFormat() {
+
+  if (isMips64() || isFP64())
+    FpFormat = FP_FORMAT_D;
+  else
+    FpFormat = FP_FORMAT_S;
+}
+
+bool MipsAsmParser::requestsDoubleOperand(StringRef Mnemonic){
+
+  bool IsDouble = StringSwitch<bool>(Mnemonic.lower())
+    .Case("ldxc1", true)
+    .Case("ldc1",  true)
+    .Case("sdxc1", true)
+    .Case("sdc1",  true)
+    .Default(false);
+
+  return IsDouble;
+}
+void MipsAsmParser::setFpFormat(StringRef Format) {
+
+  FpFormat = StringSwitch<FpFormatTy>(Format.lower())
+    .Case(".s",  FP_FORMAT_S)
+    .Case(".d",  FP_FORMAT_D)
+    .Case(".l",  FP_FORMAT_L)
+    .Case(".w",  FP_FORMAT_W)
+    .Default(FP_FORMAT_NONE);
+}
+
+bool MipsAssemblerOptions::setATReg(unsigned Reg) {
+  if (Reg > 31)
+    return false;
+
+  aTReg = Reg;
+  return true;
+}
+
+int MipsAsmParser::getATReg() {
+  return Options.getATRegNum();
+}
+
+unsigned MipsAsmParser::getReg(int RC,int RegNo) {
+  return *(getContext().getRegisterInfo().getRegClass(RC).begin() + RegNo);
+}
+
+int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) {
+
+  if (RegNum > 31)
+    return -1;
+
+  return getReg(RegClass, RegNum);
+}
+
+int MipsAsmParser::tryParseRegister(bool is64BitReg) {
+  const AsmToken &Tok = Parser.getTok();
+  int RegNum = -1;
+
+  if (Tok.is(AsmToken::Identifier)) {
+    std::string lowerCase = Tok.getString().lower();
+    RegNum = matchRegisterName(lowerCase, is64BitReg);
+  } else if (Tok.is(AsmToken::Integer))
+    RegNum = matchRegisterByNumber(static_cast<unsigned>(Tok.getIntVal()),
+                                   is64BitReg ? Mips::CPU64RegsRegClassID
+                                              : Mips::CPURegsRegClassID);
+  return RegNum;
+}
+
+bool MipsAsmParser::
+  tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                          bool is64BitReg){
+
+  SMLoc S = Parser.getTok().getLoc();
+  int RegNo = -1;
+
+  RegNo = tryParseRegister(is64BitReg);
+  if (RegNo == -1)
+    return true;
+
+  Operands.push_back(MipsOperand::CreateReg(RegNo, S,
+      Parser.getTok().getLoc()));
+  Parser.Lex(); // Eat register token.
+  return false;
+}
+
+bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
+                                 StringRef Mnemonic) {
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
+    return false;
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
+
+  switch (getLexer().getKind()) {
+  default:
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return true;
+  case AsmToken::Dollar: {
+    // parse register
+    SMLoc S = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat dollar token.
+    // parse register operand
+    if (!tryParseRegisterOperand(Operands, isMips64())) {
+      if (getLexer().is(AsmToken::LParen)) {
+        // check if it is indexed addressing operand
+        Operands.push_back(MipsOperand::CreateToken("(", S));
+        Parser.Lex(); // eat parenthesis
+        if (getLexer().isNot(AsmToken::Dollar))
+          return true;
+
+        Parser.Lex(); // eat dollar
+        if (tryParseRegisterOperand(Operands, isMips64()))
+          return true;
+
+        if (!getLexer().is(AsmToken::RParen))
+          return true;
+
+        S = Parser.getTok().getLoc();
+        Operands.push_back(MipsOperand::CreateToken(")", S));
+        Parser.Lex();
+      }
+      return false;
+    }
+    // maybe it is a symbol reference
+    StringRef Identifier;
+    if (Parser.parseIdentifier(Identifier))
+      return true;
+
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
+
+    // Otherwise create a symbol ref.
+    const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+                                                getContext());
+
+    Operands.push_back(MipsOperand::CreateImm(Res, S, E));
+    return false;
+  }
+  case AsmToken::Identifier:
+    // Look for the existing symbol, we should check if
+    // we need to assigne the propper RegisterKind
+   if (searchSymbolAlias(Operands,MipsOperand::Kind_None))
+     return false;
+    //else drop to expression parsing
+  case AsmToken::LParen:
+  case AsmToken::Minus:
+  case AsmToken::Plus:
+  case AsmToken::Integer:
+  case AsmToken::String: {
+     // quoted label names
+    const MCExpr *IdVal;
+    SMLoc S = Parser.getTok().getLoc();
+    if (getParser().parseExpression(IdVal))
+      return true;
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+    Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+    return false;
+  }
+  case AsmToken::Percent: {
+    // it is a symbol reference or constant expression
+    const MCExpr *IdVal;
+    SMLoc S = Parser.getTok().getLoc(); // start location of the operand
+    if (parseRelocOperand(IdVal))
+      return true;
+
+    SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+    Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+    return false;
+  } // case AsmToken::Percent
+  } // switch(getLexer().getKind())
+  return true;
+}
+
+bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
+
+  Parser.Lex(); // eat % token
+  const AsmToken &Tok = Parser.getTok(); // get next token, operation
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+
+  std::string Str = Tok.getIdentifier().str();
+
+  Parser.Lex(); // eat identifier
+  // now make expression from the rest of the operand
+  const MCExpr *IdVal;
+  SMLoc EndLoc;
+
+  if (getLexer().getKind() == AsmToken::LParen) {
+    while (1) {
+      Parser.Lex(); // eat '(' token
+      if (getLexer().getKind() == AsmToken::Percent) {
+        Parser.Lex(); // eat % token
+        const AsmToken &nextTok = Parser.getTok();
+        if (nextTok.isNot(AsmToken::Identifier))
+          return true;
+        Str += "(%";
+        Str += nextTok.getIdentifier();
+        Parser.Lex(); // eat identifier
+        if (getLexer().getKind() != AsmToken::LParen)
+          return true;
+      } else
+        break;
+    }
+    if (getParser().parseParenExpression(IdVal,EndLoc))
+      return true;
+
+    while (getLexer().getKind() == AsmToken::RParen)
+      Parser.Lex(); // eat ')' token
+
+  } else
+    return true; // parenthesis must follow reloc operand
+
+  // Check the type of the expression
+  if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal)) {
+    // It's a constant, evaluate lo or hi value
+    if (Str == "lo") {
+      short Val = MCE->getValue();
+      Res = MCConstantExpr::Create(Val, getContext());
+    } else if (Str == "hi") {
+      int Val = MCE->getValue();
+      int LoSign = Val & 0x8000;
+      Val = (Val & 0xffff0000) >> 16;
+      // Lower part is treated as a signed int, so if it is negative
+      // we must add 1 to the hi part to compensate
+      if (LoSign)
+        Val++;
+      Res = MCConstantExpr::Create(Val, getContext());
+    }
+    return false;
+  }
+
+  if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(IdVal)) {
+    // It's a symbol, create symbolic expression from symbol
+    StringRef Symbol = MSRE->getSymbol().getName();
+    MCSymbolRefExpr::VariantKind VK = getVariantKind(Str);
+    Res = MCSymbolRefExpr::Create(Symbol,VK,getContext());
+    return false;
+  }
+  return true;
+}
+
+bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                  SMLoc &EndLoc) {
+
+  StartLoc = Parser.getTok().getLoc();
+  RegNo = tryParseRegister(isMips64());
+  EndLoc = Parser.getTok().getLoc();
+  return (RegNo == (unsigned)-1);
+}
+
+bool MipsAsmParser::parseMemOffset(const MCExpr *&Res) {
+
+  SMLoc S;
+
+  switch(getLexer().getKind()) {
+  default:
+    return true;
+  case AsmToken::Identifier:
+  case AsmToken::Integer:
+  case AsmToken::Minus:
+  case AsmToken::Plus:
+    return (getParser().parseExpression(Res));
+  case AsmToken::Percent:
+    return parseRelocOperand(Res);
+  case AsmToken::LParen:
+    return false;  // it's probably assuming 0
+  }
+  return true;
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
+               SmallVectorImpl<MCParsedAsmOperand*>&Operands) {
+
+  const MCExpr *IdVal = 0;
+  SMLoc S;
+  // first operand is the offset
+  S = Parser.getTok().getLoc();
+
+  if (parseMemOffset(IdVal))
+    return MatchOperand_ParseFail;
+
+  const AsmToken &Tok = Parser.getTok(); // get next token
+  if (Tok.isNot(AsmToken::LParen)) {
+    MipsOperand *Mnemonic = static_cast<MipsOperand*>(Operands[0]);
+    if (Mnemonic->getToken() == "la") {
+      SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() -1);
+      Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+      return MatchOperand_Success;
+    }
+    Error(Parser.getTok().getLoc(), "'(' expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Parser.Lex(); // Eat '(' token.
+
+  const AsmToken &Tok1 = Parser.getTok(); // get next token
+  if (Tok1.is(AsmToken::Dollar)) {
+    Parser.Lex(); // Eat '$' token.
+    if (tryParseRegisterOperand(Operands, isMips64())) {
+      Error(Parser.getTok().getLoc(), "unexpected token in operand");
+      return MatchOperand_ParseFail;
+    }
+
+  } else {
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+    return MatchOperand_ParseFail;
+  }
+
+  const AsmToken &Tok2 = Parser.getTok(); // get next token
+  if (Tok2.isNot(AsmToken::RParen)) {
+    Error(Parser.getTok().getLoc(), "')' expected");
+    return MatchOperand_ParseFail;
+  }
+
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+  Parser.Lex(); // Eat ')' token.
+
+  if (IdVal == 0)
+    IdVal = MCConstantExpr::Create(0, getContext());
+
+  // now replace register operand with the mem operand
+  MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
+  int RegNo = op->getReg();
+  // remove register from operands
+  Operands.pop_back();
+  // and add memory operand
+  Operands.push_back(MipsOperand::CreateMem(RegNo, IdVal, S, E));
+  delete op;
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseCPU64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+
+  if (!isMips64())
+    return MatchOperand_NoMatch;
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    if (searchSymbolAlias(Operands,MipsOperand::Kind_CPU64Regs))
+      return MatchOperand_Success;
+    return MatchOperand_NoMatch;
+  }
+  // if the first token is not '$' we have an error
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat $
+  if(!tryParseRegisterOperand(Operands, true)) {
+    // set the proper register kind
+    MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
+    op->setRegKind(MipsOperand::Kind_CPU64Regs);
+    return MatchOperand_Success;
+  }
+  return MatchOperand_NoMatch;
+}
+
+bool MipsAsmParser::
+searchSymbolAlias(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                  unsigned RegisterKind) {
+
+  MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
+  if (Sym) {
+    SMLoc S = Parser.getTok().getLoc();
+    const MCExpr *Expr;
+    if (Sym->isVariable())
+      Expr = Sym->getVariableValue();
+    else
+      return false;
+    if (Expr->getKind() == MCExpr::SymbolRef) {
+      const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+      const StringRef DefSymbol = Ref->getSymbol().getName();
+      if (DefSymbol.startswith("$")) {
+        // Lookup for the register with corresponding name
+        int RegNum = matchRegisterName(DefSymbol.substr(1),isMips64());
+        if (RegNum > -1) {
+          Parser.Lex();
+          MipsOperand *op = MipsOperand::CreateReg(RegNum,S,
+                                         Parser.getTok().getLoc());
+          op->setRegKind((MipsOperand::RegisterKind)RegisterKind);
+          Operands.push_back(op);
+          return true;
+        }
+      }
+    } else if (Expr->getKind() == MCExpr::Constant) {
+      Parser.Lex();
+      const MCConstantExpr *Const = static_cast<const MCConstantExpr*>(Expr);
+      MipsOperand *op = MipsOperand::CreateImm(Const,S,
+                                     Parser.getTok().getLoc());
+      Operands.push_back(op);
+      return true;
+    }
+  }
+  return false;
+}
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseCPURegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    if (searchSymbolAlias(Operands,MipsOperand::Kind_CPURegs))
+      return MatchOperand_Success;
+    return MatchOperand_NoMatch;
+  }
+  // if the first token is not '$' we have an error
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex(); // Eat $
+  if(!tryParseRegisterOperand(Operands, false)) {
+    // set the propper register kind
+    MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
+    op->setRegKind(MipsOperand::Kind_CPURegs);
+    return MatchOperand_Success;
+  }
+  return MatchOperand_NoMatch;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+
+  if (isMips64())
+    return MatchOperand_NoMatch;
+
+  // if the first token is not '$' we have error
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat $
+
+  const AsmToken &Tok = Parser.getTok(); // get next token
+  if (Tok.isNot(AsmToken::Integer))
+    return MatchOperand_NoMatch;
+
+  unsigned RegNum = Tok.getIntVal();
+  // at the moment only hwreg29 is supported
+  if (RegNum != 29)
+    return MatchOperand_ParseFail;
+
+  MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29, S,
+        Parser.getTok().getLoc());
+  op->setRegKind(MipsOperand::Kind_HWRegs);
+  Operands.push_back(op);
+
+  Parser.Lex(); // Eat reg number
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseHW64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+
+  if (!isMips64())
+    return MatchOperand_NoMatch;
+    //if the first token is not '$' we have error
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat $
+
+  const AsmToken &Tok = Parser.getTok(); // get next token
+  if (Tok.isNot(AsmToken::Integer))
+    return MatchOperand_NoMatch;
+
+  unsigned RegNum = Tok.getIntVal();
+  // at the moment only hwreg29 is supported
+  if (RegNum != 29)
+    return MatchOperand_ParseFail;
+
+  MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29_64, S,
+        Parser.getTok().getLoc());
+  op->setRegKind(MipsOperand::Kind_HW64Regs);
+  Operands.push_back(op);
+
+  Parser.Lex(); // Eat reg number
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  unsigned RegNum;
+  //if the first token is not '$' we have error
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat $
+
+  const AsmToken &Tok = Parser.getTok(); // get next token
+  if (Tok.is(AsmToken::Integer)) {
+    RegNum = Tok.getIntVal();
+    // at the moment only fcc0 is supported
+    if (RegNum != 0)
+      return MatchOperand_ParseFail;
+  } else if (Tok.is(AsmToken::Identifier)) {
+    // at the moment only fcc0 is supported
+    if (Tok.getIdentifier() != "fcc0")
+      return MatchOperand_ParseFail;
+  } else
+    return MatchOperand_NoMatch;
+
+  MipsOperand *op = MipsOperand::CreateReg(Mips::FCC0, S,
+        Parser.getTok().getLoc());
+  op->setRegKind(MipsOperand::Kind_CCRRegs);
+  Operands.push_back(op);
+
+  Parser.Lex(); // Eat reg number
+  return MatchOperand_Success;
+}
+
+MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
+
+  MCSymbolRefExpr::VariantKind VK
+                   = StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
+    .Case("hi",          MCSymbolRefExpr::VK_Mips_ABS_HI)
+    .Case("lo",          MCSymbolRefExpr::VK_Mips_ABS_LO)
+    .Case("gp_rel",      MCSymbolRefExpr::VK_Mips_GPREL)
+    .Case("call16",      MCSymbolRefExpr::VK_Mips_GOT_CALL)
+    .Case("got",         MCSymbolRefExpr::VK_Mips_GOT)
+    .Case("tlsgd",       MCSymbolRefExpr::VK_Mips_TLSGD)
+    .Case("tlsldm",      MCSymbolRefExpr::VK_Mips_TLSLDM)
+    .Case("dtprel_hi",   MCSymbolRefExpr::VK_Mips_DTPREL_HI)
+    .Case("dtprel_lo",   MCSymbolRefExpr::VK_Mips_DTPREL_LO)
+    .Case("gottprel",    MCSymbolRefExpr::VK_Mips_GOTTPREL)
+    .Case("tprel_hi",    MCSymbolRefExpr::VK_Mips_TPREL_HI)
+    .Case("tprel_lo",    MCSymbolRefExpr::VK_Mips_TPREL_LO)
+    .Case("got_disp",    MCSymbolRefExpr::VK_Mips_GOT_DISP)
+    .Case("got_page",    MCSymbolRefExpr::VK_Mips_GOT_PAGE)
+    .Case("got_ofst",    MCSymbolRefExpr::VK_Mips_GOT_OFST)
+    .Case("hi(%neg(%gp_rel",    MCSymbolRefExpr::VK_Mips_GPOFF_HI)
+    .Case("lo(%neg(%gp_rel",    MCSymbolRefExpr::VK_Mips_GPOFF_LO)
+    .Default(MCSymbolRefExpr::VK_None);
+
+  return VK;
+}
+
+static int ConvertCcString(StringRef CondString) {
+  int CC = StringSwitch<unsigned>(CondString)
+      .Case(".f",    0)
+      .Case(".un",   1)
+      .Case(".eq",   2)
+      .Case(".ueq",  3)
+      .Case(".olt",  4)
+      .Case(".ult",  5)
+      .Case(".ole",  6)
+      .Case(".ule",  7)
+      .Case(".sf",   8)
+      .Case(".ngle", 9)
+      .Case(".seq",  10)
+      .Case(".ngl",  11)
+      .Case(".lt",   12)
+      .Case(".nge",  13)
+      .Case(".le",   14)
+      .Case(".ngt",  15)
+      .Default(-1);
+
+  return CC;
+}
+
+bool MipsAsmParser::
+parseMathOperation(StringRef Name, SMLoc NameLoc,
+                   SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // split the format
+  size_t Start = Name.find('.'), Next = Name.rfind('.');
+  StringRef Format1 = Name.slice(Start, Next);
+  // and add the first format to the operands
+  Operands.push_back(MipsOperand::CreateToken(Format1, NameLoc));
+  // now for the second format
+  StringRef Format2 = Name.slice(Next, StringRef::npos);
+  Operands.push_back(MipsOperand::CreateToken(Format2, NameLoc));
+
+  // set the format for the first register
+  setFpFormat(Format1);
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (ParseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+
+    if (getLexer().isNot(AsmToken::Comma)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+
+    }
+    Parser.Lex();  // Eat the comma.
+
+    //set the format for the first register
+    setFpFormat(Format2);
+
+    // Parse and remember the operand.
+    if (ParseOperand(Operands, Name)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, "unexpected token in argument list");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+bool MipsAsmParser::
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+                 SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  StringRef Mnemonic;
+  // floating point instructions: should register be treated as double?
+  if (requestsDoubleOperand(Name)) {
+    setFpFormat(FP_FORMAT_D);
+  Operands.push_back(MipsOperand::CreateToken(Name, NameLoc));
+  Mnemonic = Name;
+  }
+  else {
+    setDefaultFpFormat();
+    // Create the leading tokens for the mnemonic, split by '.' characters.
+    size_t Start = 0, Next = Name.find('.');
+    Mnemonic = Name.slice(Start, Next);
+
+    Operands.push_back(MipsOperand::CreateToken(Mnemonic, NameLoc));
+
+    if (Next != StringRef::npos) {
+      // there is a format token in mnemonic
+      // StringRef Rest = Name.slice(Next, StringRef::npos);
+      size_t Dot = Name.find('.', Next+1);
+      StringRef Format = Name.slice(Next, Dot);
+      if (Dot == StringRef::npos) //only one '.' in a string, it's a format
+        Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
+      else {
+        if (Name.startswith("c.")){
+          // floating point compare, add '.' and immediate represent for cc
+          Operands.push_back(MipsOperand::CreateToken(".", NameLoc));
+          int Cc = ConvertCcString(Format);
+          if (Cc == -1) {
+            return Error(NameLoc, "Invalid conditional code");
+          }
+          SMLoc E = SMLoc::getFromPointer(
+              Parser.getTok().getLoc().getPointer() -1 );
+          Operands.push_back(MipsOperand::CreateImm(
+              MCConstantExpr::Create(Cc, getContext()), NameLoc, E));
+        } else {
+          // trunc, ceil, floor ...
+          return parseMathOperation(Name, NameLoc, Operands);
+        }
+
+        // the rest is a format
+        Format = Name.slice(Dot, StringRef::npos);
+        Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
+      }
+
+      setFpFormat(Format);
+    }
+  }
+
+  // Read the remaining operands.
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    // Read the first operand.
+    if (ParseOperand(Operands, Mnemonic)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+
+    while (getLexer().is(AsmToken::Comma) ) {
+      Parser.Lex();  // Eat the comma.
+
+      // Parse and remember the operand.
+      if (ParseOperand(Operands, Name)) {
+        SMLoc Loc = getLexer().getLoc();
+        Parser.eatToEndOfStatement();
+        return Error(Loc, "unexpected token in argument list");
+      }
+    }
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    Parser.eatToEndOfStatement();
+    return Error(Loc, "unexpected token in argument list");
+  }
+
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+bool MipsAsmParser::reportParseError(StringRef ErrorMsg) {
+   SMLoc Loc = getLexer().getLoc();
+   Parser.eatToEndOfStatement();
+   return Error(Loc, ErrorMsg);
+}
+
+bool MipsAsmParser::parseSetNoAtDirective() {
+  // Line should look like:
+  //  .set noat
+  // set at reg to 0
+  Options.setATReg(0);
+  // eat noat
+  Parser.Lex();
+  // If this is not the end of the statement, report error
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+bool MipsAsmParser::parseSetAtDirective() {
+  // line can be
+  //  .set at - defaults to $1
+  // or .set at=$reg
+  int AtRegNo;
+  getParser().Lex();
+  if (getLexer().is(AsmToken::EndOfStatement)) {
+    Options.setATReg(1);
+    Parser.Lex(); // Consume the EndOfStatement
+    return false;
+  } else if (getLexer().is(AsmToken::Equal)) {
+    getParser().Lex(); // eat '='
+    if (getLexer().isNot(AsmToken::Dollar)) {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+    Parser.Lex(); // Eat '$'
+    const AsmToken &Reg = Parser.getTok();
+    if (Reg.is(AsmToken::Identifier)) {
+      AtRegNo = matchCPURegisterName(Reg.getIdentifier());
+    } else if (Reg.is(AsmToken::Integer)) {
+      AtRegNo = Reg.getIntVal();
+    } else {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+
+    if ( AtRegNo < 1 || AtRegNo > 31) {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+
+    if (!Options.setATReg(AtRegNo)) {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+    getParser().Lex(); // Eat reg
+
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token in statement");
+      return false;
+     }
+    Parser.Lex(); // Consume the EndOfStatement
+    return false;
+  } else {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+}
+
+bool MipsAsmParser::parseSetReorderDirective() {
+  Parser.Lex();
+  // If this is not the end of the statement, report error
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  Options.setReorder();
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoReorderDirective() {
+    Parser.Lex();
+    // if this is not the end of the statement, report error
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      reportParseError("unexpected token in statement");
+      return false;
+    }
+    Options.setNoreorder();
+    Parser.Lex(); // Consume the EndOfStatement
+    return false;
+}
+
+bool MipsAsmParser::parseSetMacroDirective() {
+  Parser.Lex();
+  // if this is not the end of the statement, report error
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+    return false;
+  }
+  Options.setMacro();
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+bool MipsAsmParser::parseSetNoMacroDirective() {
+  Parser.Lex();
+  // if this is not the end of the statement, report error
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("`noreorder' must be set before `nomacro'");
+    return false;
+  }
+  if (Options.isReorder()) {
+    reportParseError("`noreorder' must be set before `nomacro'");
+    return false;
+  }
+  Options.setNomacro();
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+bool MipsAsmParser::parseSetAssignment() {
+  StringRef Name;
+  const MCExpr *Value;
+
+  if (Parser.parseIdentifier(Name))
+    reportParseError("expected identifier after .set");
+
+  if (getLexer().isNot(AsmToken::Comma))
+    return reportParseError("unexpected token in .set directive");
+  Lex(); //eat comma
+
+  if (Parser.parseExpression(Value))
+    reportParseError("expected valid expression after comma");
+
+  // check if the Name already exists as a symbol
+  MCSymbol *Sym = getContext().LookupSymbol(Name);
+  if (Sym) {
+    return reportParseError("symbol already defined");
+  }
+  Sym = getContext().GetOrCreateSymbol(Name);
+  Sym->setVariableValue(Value);
+
+  return false;
+}
+bool MipsAsmParser::parseDirectiveSet() {
+
+  // get next token
+  const AsmToken &Tok = Parser.getTok();
+
+  if (Tok.getString() == "noat") {
+    return parseSetNoAtDirective();
+  } else if (Tok.getString() == "at") {
+    return parseSetAtDirective();
+  } else if (Tok.getString() == "reorder") {
+    return parseSetReorderDirective();
+  } else if (Tok.getString() == "noreorder") {
+    return parseSetNoReorderDirective();
+  } else if (Tok.getString() == "macro") {
+    return parseSetMacroDirective();
+  } else if (Tok.getString() == "nomacro") {
+    return parseSetNoMacroDirective();
+  } else if (Tok.getString() == "nomips16") {
+    // ignore this directive for now
+    Parser.eatToEndOfStatement();
+    return false;
+  } else if (Tok.getString() == "nomicromips") {
+    // ignore this directive for now
+    Parser.eatToEndOfStatement();
+    return false;
+  } else {
+    // it is just an identifier, look for assignment
+    parseSetAssignment();
+    return false;
+  }
+
+  return true;
+}
+
+/// parseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool MipsAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
+
+  StringRef IDVal = DirectiveID.getString();
+
+  if ( IDVal == ".ent") {
+    // ignore this directive for now
+    Parser.Lex();
+    return false;
+  }
+
+  if (IDVal == ".end") {
+    // ignore this directive for now
+    Parser.Lex();
+    return false;
+  }
+
+  if (IDVal == ".frame") {
+    // ignore this directive for now
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  if (IDVal == ".set") {
+    return parseDirectiveSet();
+  }
+
+  if (IDVal == ".fmask") {
+    // ignore this directive for now
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  if (IDVal == ".mask") {
+    // ignore this directive for now
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  if (IDVal == ".gpword") {
+    // ignore this directive for now
+    Parser.eatToEndOfStatement();
+    return false;
+  }
+
+  if (IDVal == ".word") {
+    parseDirectiveWord(4, DirectiveID.getLoc());
+    return false;
+  }
+
+  return true;
+}
+
+extern "C" void LLVMInitializeMipsAsmParser() {
+  RegisterMCAsmParser<MipsAsmParser> X(TheMipsTarget);
+  RegisterMCAsmParser<MipsAsmParser> Y(TheMipselTarget);
+  RegisterMCAsmParser<MipsAsmParser> A(TheMips64Target);
+  RegisterMCAsmParser<MipsAsmParser> B(TheMips64elTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MipsGenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
new file mode 100644
index 000000000000..59e49d8ddc6c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -0,0 +1,556 @@
+//===- MipsDisassembler.cpp - Disassembler for Mips -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Mips Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// MipsDisassemblerBase - a disasembler class for Mips.
+class MipsDisassemblerBase : public MCDisassembler {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  MipsDisassemblerBase(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
+                       bool bigEndian) :
+    MCDisassembler(STI), RegInfo(Info), isBigEndian(bigEndian) {}
+
+  virtual ~MipsDisassemblerBase() {}
+
+  const MCRegisterInfo *getRegInfo() const { return RegInfo; }
+
+private:
+  const MCRegisterInfo *RegInfo;
+protected:
+  bool isBigEndian;
+};
+
+/// MipsDisassembler - a disasembler class for Mips32.
+class MipsDisassembler : public MipsDisassemblerBase {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  MipsDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
+                   bool bigEndian) :
+    MipsDisassemblerBase(STI, Info, bigEndian) {}
+
+  /// getInstruction - See MCDisassembler.
+  virtual DecodeStatus getInstruction(MCInst &instr,
+                                      uint64_t &size,
+                                      const MemoryObject &region,
+                                      uint64_t address,
+                                      raw_ostream &vStream,
+                                      raw_ostream &cStream) const;
+};
+
+
+/// Mips64Disassembler - a disasembler class for Mips64.
+class Mips64Disassembler : public MipsDisassemblerBase {
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  Mips64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
+                     bool bigEndian) :
+    MipsDisassemblerBase(STI, Info, bigEndian) {}
+
+  /// getInstruction - See MCDisassembler.
+  virtual DecodeStatus getInstruction(MCInst &instr,
+                                      uint64_t &size,
+                                      const MemoryObject &region,
+                                      uint64_t address,
+                                      raw_ostream &vStream,
+                                      raw_ostream &cStream) const;
+};
+
+} // end anonymous namespace
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+
+static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+
+static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst,
+                                                unsigned Insn,
+                                                uint64_t Address,
+                                                const void *Decoder);
+
+static DecodeStatus DecodeACRegsDSPRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget(MCInst &Inst,
+                                       unsigned Offset,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
+static DecodeStatus DecodeBC1(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder);
+
+
+static DecodeStatus DecodeJumpTarget(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeMem(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder);
+
+static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder);
+
+static DecodeStatus DecodeSimm16(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder);
+
+static DecodeStatus DecodeCondCode(MCInst &Inst,
+                                   unsigned Insn,
+                                   uint64_t Address,
+                                   const void *Decoder);
+
+static DecodeStatus DecodeInsSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder);
+
+static DecodeStatus DecodeExtSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder);
+
+namespace llvm {
+extern Target TheMipselTarget, TheMipsTarget, TheMips64Target,
+              TheMips64elTarget;
+}
+
+static MCDisassembler *createMipsDisassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI) {
+  return new MipsDisassembler(STI, T.createMCRegInfo(""), true);
+}
+
+static MCDisassembler *createMipselDisassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI) {
+  return new MipsDisassembler(STI, T.createMCRegInfo(""), false);
+}
+
+static MCDisassembler *createMips64Disassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI) {
+  return new Mips64Disassembler(STI, T.createMCRegInfo(""), true);
+}
+
+static MCDisassembler *createMips64elDisassembler(
+                       const Target &T,
+                       const MCSubtargetInfo &STI) {
+  return new Mips64Disassembler(STI, T.createMCRegInfo(""), false);
+}
+
+extern "C" void LLVMInitializeMipsDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheMipsTarget,
+                                         createMipsDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheMipselTarget,
+                                         createMipselDisassembler);
+  TargetRegistry::RegisterMCDisassembler(TheMips64Target,
+                                         createMips64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheMips64elTarget,
+                                         createMips64elDisassembler);
+}
+
+
+#include "MipsGenDisassemblerTables.inc"
+
+  /// readInstruction - read four bytes from the MemoryObject
+  /// and return 32 bit word sorted according to the given endianess
+static DecodeStatus readInstruction32(const MemoryObject &region,
+                                      uint64_t address,
+                                      uint64_t &size,
+                                      uint32_t &insn,
+                                      bool isBigEndian) {
+  uint8_t Bytes[4];
+
+  // We want to read exactly 4 Bytes of data.
+  if (region.readBytes(address, 4, (uint8_t*)Bytes, NULL) == -1) {
+    size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  if (isBigEndian) {
+    // Encoded as a big-endian 32-bit word in the stream.
+    insn = (Bytes[3] <<  0) |
+           (Bytes[2] <<  8) |
+           (Bytes[1] << 16) |
+           (Bytes[0] << 24);
+  }
+  else {
+    // Encoded as a small-endian 32-bit word in the stream.
+    insn = (Bytes[0] <<  0) |
+           (Bytes[1] <<  8) |
+           (Bytes[2] << 16) |
+           (Bytes[3] << 24);
+  }
+
+  return MCDisassembler::Success;
+}
+
+DecodeStatus
+MipsDisassembler::getInstruction(MCInst &instr,
+                                 uint64_t &Size,
+                                 const MemoryObject &Region,
+                                 uint64_t Address,
+                                 raw_ostream &vStream,
+                                 raw_ostream &cStream) const {
+  uint32_t Insn;
+
+  DecodeStatus Result = readInstruction32(Region, Address, Size,
+                                          Insn, isBigEndian);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Calling the auto-generated decoder function.
+  Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
+                             this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+DecodeStatus
+Mips64Disassembler::getInstruction(MCInst &instr,
+                                   uint64_t &Size,
+                                   const MemoryObject &Region,
+                                   uint64_t Address,
+                                   raw_ostream &vStream,
+                                   raw_ostream &cStream) const {
+  uint32_t Insn;
+
+  DecodeStatus Result = readInstruction32(Region, Address, Size,
+                                          Insn, isBigEndian);
+  if (Result == MCDisassembler::Fail)
+    return MCDisassembler::Fail;
+
+  // Calling the auto-generated decoder function.
+  Result = decodeInstruction(DecoderTableMips6432, instr, Insn, Address,
+                             this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+  // If we fail to decode in Mips64 decoder space we can try in Mips32
+  Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
+                             this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  return MCDisassembler::Fail;
+}
+
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const MipsDisassemblerBase *Dis = static_cast<const MipsDisassemblerBase*>(D);
+  return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo);
+}
+
+static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+
+  return MCDisassembler::Fail;
+
+}
+
+static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::CPU64RegsRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::CPURegsRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return DecodeCPURegsRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGR64RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGR32RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateReg(RegNo));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMem(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::CPURegsRegClassID, Reg);
+  Base = getReg(Decoder, Mips::CPURegsRegClassID, Base);
+
+  if(Inst.getOpcode() == Mips::SC){
+    Inst.addOperand(MCOperand::CreateReg(Reg));
+  }
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFMem(MCInst &Inst,
+                               unsigned Insn,
+                               uint64_t Address,
+                               const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 16, 5);
+  unsigned Base = fieldFromInstruction(Insn, 21, 5);
+
+  Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg);
+  Base = getReg(Decoder, Mips::CPURegsRegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  // Currently only hardware register 29 is supported.
+  if (RegNo != 29)
+    return  MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateReg(Mips::HWR29));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCondCode(MCInst &Inst,
+                                   unsigned Insn,
+                                   uint64_t Address,
+                                   const void *Decoder) {
+  int CondCode = Insn & 0xf;
+  Inst.addOperand(MCOperand::CreateImm(CondCode));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo > 30 || RegNo %2)
+    return MCDisassembler::Fail;
+
+  ;
+  unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo /2);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  //Currently only hardware register 29 is supported
+  if (RegNo != 29)
+    return  MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateReg(Mips::HWR29_64));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeACRegsDSPRegisterClass(MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::ACRegsDSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget(MCInst &Inst,
+                                       unsigned Offset,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  unsigned BranchOffset = Offset & 0xffff;
+  BranchOffset = SignExtend32<18>(BranchOffset << 2) + 4;
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBC1(MCInst &Inst,
+                              unsigned Insn,
+                              uint64_t Address,
+                              const void *Decoder) {
+  unsigned BranchOffset = Insn & 0xffff;
+  BranchOffset = SignExtend32<18>(BranchOffset << 2) + 4;
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJumpTarget(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+
+  unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2;
+  Inst.addOperand(MCOperand::CreateImm(JumpOffset));
+  return MCDisassembler::Success;
+}
+
+
+static DecodeStatus DecodeSimm16(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Insn)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeInsSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  // First we need to grab the pos(lsb) from MCInst.
+  int Pos = Inst.getOperand(2).getImm();
+  int Size = (int) Insn - Pos + 1;
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Size)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeExtSize(MCInst &Inst,
+                                  unsigned Insn,
+                                  uint64_t Address,
+                                  const void *Decoder) {
+  int Size = (int) Insn  + 1;
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Size)));
+  return MCDisassembler::Success;
+}
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
new file mode 100644
index 000000000000..fc23cd380352
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -0,0 +1,211 @@
+//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an Mips MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "MipsInstPrinter.h"
+#include "MipsInstrInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define PRINT_ALIAS_INSTR
+#include "MipsGenAsmWriter.inc"
+
+const char* Mips::MipsFCCToString(Mips::CondCode CC) {
+  switch (CC) {
+  case FCOND_F:
+  case FCOND_T:   return "f";
+  case FCOND_UN:
+  case FCOND_OR:  return "un";
+  case FCOND_OEQ:
+  case FCOND_UNE: return "eq";
+  case FCOND_UEQ:
+  case FCOND_ONE: return "ueq";
+  case FCOND_OLT:
+  case FCOND_UGE: return "olt";
+  case FCOND_ULT:
+  case FCOND_OGE: return "ult";
+  case FCOND_OLE:
+  case FCOND_UGT: return "ole";
+  case FCOND_ULE:
+  case FCOND_OGT: return "ule";
+  case FCOND_SF:
+  case FCOND_ST:  return "sf";
+  case FCOND_NGLE:
+  case FCOND_GLE: return "ngle";
+  case FCOND_SEQ:
+  case FCOND_SNE: return "seq";
+  case FCOND_NGL:
+  case FCOND_GL:  return "ngl";
+  case FCOND_LT:
+  case FCOND_NLT: return "lt";
+  case FCOND_NGE:
+  case FCOND_GE:  return "nge";
+  case FCOND_LE:
+  case FCOND_NLE: return "le";
+  case FCOND_NGT:
+  case FCOND_GT:  return "ngt";
+  }
+  llvm_unreachable("Impossible condition code!");
+}
+
+void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << '$' << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                StringRef Annot) {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::RDHWR:
+  case Mips::RDHWR64:
+    O << "\t.set\tpush\n";
+    O << "\t.set\tmips32r2\n";
+  }
+
+  // Try to print any aliases first.
+  if (!printAliasInstr(MI, O))
+    printInstruction(MI, O);
+  printAnnotation(O, Annot);
+
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case Mips::RDHWR:
+  case Mips::RDHWR64:
+    O << "\n\t.set\tpop";
+  }
+}
+
+static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
+  int Offset = 0;
+  const MCSymbolRefExpr *SRE;
+
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
+    SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
+    assert(SRE && CE && "Binary expression must be sym+const.");
+    Offset = CE->getValue();
+  }
+  else if (!(SRE = dyn_cast<MCSymbolRefExpr>(Expr)))
+    assert(false && "Unexpected MCExpr type.");
+
+  MCSymbolRefExpr::VariantKind Kind = SRE->getKind();
+
+  switch (Kind) {
+  default:                                 llvm_unreachable("Invalid kind!");
+  case MCSymbolRefExpr::VK_None:           break;
+  case MCSymbolRefExpr::VK_Mips_GPREL:     OS << "%gp_rel("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_CALL:  OS << "%call16("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT16:     OS << "%got(";    break;
+  case MCSymbolRefExpr::VK_Mips_GOT:       OS << "%got(";    break;
+  case MCSymbolRefExpr::VK_Mips_ABS_HI:    OS << "%hi(";     break;
+  case MCSymbolRefExpr::VK_Mips_ABS_LO:    OS << "%lo(";     break;
+  case MCSymbolRefExpr::VK_Mips_TLSGD:     OS << "%tlsgd(";  break;
+  case MCSymbolRefExpr::VK_Mips_TLSLDM:    OS << "%tlsldm(";  break;
+  case MCSymbolRefExpr::VK_Mips_DTPREL_HI: OS << "%dtprel_hi(";  break;
+  case MCSymbolRefExpr::VK_Mips_DTPREL_LO: OS << "%dtprel_lo(";  break;
+  case MCSymbolRefExpr::VK_Mips_GOTTPREL:  OS << "%gottprel("; break;
+  case MCSymbolRefExpr::VK_Mips_TPREL_HI:  OS << "%tprel_hi("; break;
+  case MCSymbolRefExpr::VK_Mips_TPREL_LO:  OS << "%tprel_lo("; break;
+  case MCSymbolRefExpr::VK_Mips_GPOFF_HI:  OS << "%hi(%neg(%gp_rel("; break;
+  case MCSymbolRefExpr::VK_Mips_GPOFF_LO:  OS << "%lo(%neg(%gp_rel("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_DISP:  OS << "%got_disp("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_PAGE:  OS << "%got_page("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_OFST:  OS << "%got_ofst("; break;
+  case MCSymbolRefExpr::VK_Mips_HIGHER:    OS << "%higher("; break;
+  case MCSymbolRefExpr::VK_Mips_HIGHEST:   OS << "%highest("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_HI16:  OS << "%got_hi("; break;
+  case MCSymbolRefExpr::VK_Mips_GOT_LO16:  OS << "%got_lo("; break;
+  case MCSymbolRefExpr::VK_Mips_CALL_HI16: OS << "%call_hi("; break;
+  case MCSymbolRefExpr::VK_Mips_CALL_LO16: OS << "%call_lo("; break;
+  }
+
+  OS << SRE->getSymbol();
+
+  if (Offset) {
+    if (Offset > 0)
+      OS << '+';
+    OS << Offset;
+  }
+
+  if ((Kind == MCSymbolRefExpr::VK_Mips_GPOFF_HI) ||
+      (Kind == MCSymbolRefExpr::VK_Mips_GPOFF_LO))
+    OS << ")))";
+  else if (Kind != MCSymbolRefExpr::VK_None)
+    OS << ')';
+}
+
+void MipsInstPrinter::printCPURegs(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  printRegName(O, MI->getOperand(OpNo).getReg());
+}
+
+void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+    return;
+  }
+
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  printExpr(Op.getExpr(), O);
+}
+
+void MipsInstPrinter::printUnsignedImm(const MCInst *MI, int opNum,
+                                       raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
+void MipsInstPrinter::
+printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  // Load/Store memory operands -- imm($reg)
+  // If PIC target the target is loaded as the
+  // pattern lw $25,%call16($28)
+  printOperand(MI, opNum+1, O);
+  O << "(";
+  printOperand(MI, opNum, O);
+  O << ")";
+}
+
+void MipsInstPrinter::
+printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O) {
+  // when using stack locations for not load/store instructions
+  // print the same way as all normal 3 operand instructions.
+  printOperand(MI, opNum, O);
+  O << ", ";
+  printOperand(MI, opNum+1, O);
+  return;
+}
+
+void MipsInstPrinter::
+printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  const MCOperand& MO = MI->getOperand(opNum);
+  O << MipsFCCToString((Mips::CondCode)MO.getImm());
+}
diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
new file mode 100644
index 000000000000..d1b561f9764e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -0,0 +1,103 @@
+//=== MipsInstPrinter.h - Convert Mips MCInst to assembly syntax -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a Mips MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSINSTPRINTER_H
+#define MIPSINSTPRINTER_H
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+// These enumeration declarations were originally in MipsInstrInfo.h but
+// had to be moved here to avoid circular dependencies between
+// LLVMMipsCodeGen and LLVMMipsAsmPrinter.
+namespace Mips {
+// Mips Branch Codes
+enum FPBranchCode {
+  BRANCH_F,
+  BRANCH_T,
+  BRANCH_FL,
+  BRANCH_TL,
+  BRANCH_INVALID
+};
+
+// Mips Condition Codes
+enum CondCode {
+  // To be used with float branch True
+  FCOND_F,
+  FCOND_UN,
+  FCOND_OEQ,
+  FCOND_UEQ,
+  FCOND_OLT,
+  FCOND_ULT,
+  FCOND_OLE,
+  FCOND_ULE,
+  FCOND_SF,
+  FCOND_NGLE,
+  FCOND_SEQ,
+  FCOND_NGL,
+  FCOND_LT,
+  FCOND_NGE,
+  FCOND_LE,
+  FCOND_NGT,
+
+  // To be used with float branch False
+  // This conditions have the same mnemonic as the
+  // above ones, but are used with a branch False;
+  FCOND_T,
+  FCOND_OR,
+  FCOND_UNE,
+  FCOND_ONE,
+  FCOND_UGE,
+  FCOND_OGE,
+  FCOND_UGT,
+  FCOND_OGT,
+  FCOND_ST,
+  FCOND_GLE,
+  FCOND_SNE,
+  FCOND_GL,
+  FCOND_NLT,
+  FCOND_GE,
+  FCOND_NLE,
+  FCOND_GT
+};
+
+const char *MipsFCCToString(Mips::CondCode CC);
+} // end namespace Mips
+
+class TargetMachine;
+
+class MipsInstPrinter : public MCInstPrinter {
+public:
+  MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                  const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  void printCPURegs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+
+private:
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printUnsignedImm(const MCInst *MI, int opNum, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
+  void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
new file mode 100644
index 000000000000..0b13607a572d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -0,0 +1,279 @@
+//===-- MipsASMBackend.cpp - Mips Asm Backend  ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsAsmBackend and MipsELFObjectWriter classes.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+// Prepare value for the target space for it
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+
+  // Add/subtract and shift
+  switch (Kind) {
+  default:
+    return 0;
+  case FK_GPRel_4:
+  case FK_Data_4:
+  case FK_Data_8:
+  case Mips::fixup_Mips_LO16:
+  case Mips::fixup_Mips_GPREL16:
+  case Mips::fixup_Mips_GPOFF_HI:
+  case Mips::fixup_Mips_GPOFF_LO:
+  case Mips::fixup_Mips_GOT_PAGE:
+  case Mips::fixup_Mips_GOT_OFST:
+  case Mips::fixup_Mips_GOT_DISP:
+  case Mips::fixup_Mips_GOT_LO16:
+  case Mips::fixup_Mips_CALL_LO16:
+    break;
+  case Mips::fixup_Mips_PC16:
+    // So far we are only using this type for branches.
+    // For branches we start 1 instruction after the branch
+    // so the displacement will be one instruction size less.
+    Value -= 4;
+    // The displacement is then divided by 4 to give us an 18 bit
+    // address range.
+    Value >>= 2;
+    break;
+  case Mips::fixup_Mips_26:
+    // So far we are only using this type for jumps.
+    // The displacement is then divided by 4 to give us an 28 bit
+    // address range.
+    Value >>= 2;
+    break;
+  case Mips::fixup_Mips_HI16:
+  case Mips::fixup_Mips_GOT_Local:
+  case Mips::fixup_Mips_GOT_HI16:
+  case Mips::fixup_Mips_CALL_HI16:
+    // Get the 2nd 16-bits. Also add 1 if bit 15 is 1.
+    Value = ((Value + 0x8000) >> 16) & 0xffff;
+    break;
+  case Mips::fixup_Mips_HIGHER:
+    // Get the 3rd 16-bits.
+    Value = ((Value + 0x80008000LL) >> 32) & 0xffff;
+    break;
+  case Mips::fixup_Mips_HIGHEST:
+    // Get the 4th 16-bits.
+    Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
+    break;
+  }
+
+  return Value;
+}
+
+namespace {
+class MipsAsmBackend : public MCAsmBackend {
+  Triple::OSType OSType;
+  bool IsLittle; // Big or little endian
+  bool Is64Bit;  // 32 or 64 bit words
+
+public:
+  MipsAsmBackend(const Target &T,  Triple::OSType _OSType,
+                 bool _isLittle, bool _is64Bit)
+    :MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle), Is64Bit(_is64Bit) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createMipsELFObjectWriter(OS,
+      MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit);
+  }
+
+  /// ApplyFixup - Apply the \p Value for given \p Fixup into the provided
+  /// data fragment, at the offset specified by the fixup and following the
+  /// fixup kind as appropriate.
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const {
+    MCFixupKind Kind = Fixup.getKind();
+    Value = adjustFixupValue((unsigned)Kind, Value);
+
+    if (!Value)
+      return; // Doesn't change encoding.
+
+    // Where do we start in the object
+    unsigned Offset = Fixup.getOffset();
+    // Number of bytes we need to fixup
+    unsigned NumBytes = (getFixupKindInfo(Kind).TargetSize + 7) / 8;
+    // Used to point to big endian bytes
+    unsigned FullSize;
+
+    switch ((unsigned)Kind) {
+    case Mips::fixup_Mips_16:
+      FullSize = 2;
+      break;
+    case Mips::fixup_Mips_64:
+      FullSize = 8;
+      break;
+    default:
+      FullSize = 4;
+      break;
+    }
+
+    // Grab current value, if any, from bits.
+    uint64_t CurVal = 0;
+
+    for (unsigned i = 0; i != NumBytes; ++i) {
+      unsigned Idx = IsLittle ? i : (FullSize - 1 - i);
+      CurVal |= (uint64_t)((uint8_t)Data[Offset + Idx]) << (i*8);
+    }
+
+    uint64_t Mask = ((uint64_t)(-1) >>
+                     (64 - getFixupKindInfo(Kind).TargetSize));
+    CurVal |= Value & Mask;
+
+    // Write out the fixed up bytes back to the code/data bits.
+    for (unsigned i = 0; i != NumBytes; ++i) {
+      unsigned Idx = IsLittle ? i : (FullSize - 1 - i);
+      Data[Offset + Idx] = (uint8_t)((CurVal >> (i*8)) & 0xff);
+    }
+  }
+
+  unsigned getNumFixupKinds() const { return Mips::NumTargetFixupKinds; }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[Mips::NumTargetFixupKinds] = {
+      // This table *must* be in same the order of fixup_* kinds in
+      // MipsFixupKinds.h.
+      //
+      // name                    offset  bits  flags
+      { "fixup_Mips_16",           0,     16,   0 },
+      { "fixup_Mips_32",           0,     32,   0 },
+      { "fixup_Mips_REL32",        0,     32,   0 },
+      { "fixup_Mips_26",           0,     26,   0 },
+      { "fixup_Mips_HI16",         0,     16,   0 },
+      { "fixup_Mips_LO16",         0,     16,   0 },
+      { "fixup_Mips_GPREL16",      0,     16,   0 },
+      { "fixup_Mips_LITERAL",      0,     16,   0 },
+      { "fixup_Mips_GOT_Global",   0,     16,   0 },
+      { "fixup_Mips_GOT_Local",    0,     16,   0 },
+      { "fixup_Mips_PC16",         0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Mips_CALL16",       0,     16,   0 },
+      { "fixup_Mips_GPREL32",      0,     32,   0 },
+      { "fixup_Mips_SHIFT5",       6,      5,   0 },
+      { "fixup_Mips_SHIFT6",       6,      5,   0 },
+      { "fixup_Mips_64",           0,     64,   0 },
+      { "fixup_Mips_TLSGD",        0,     16,   0 },
+      { "fixup_Mips_GOTTPREL",     0,     16,   0 },
+      { "fixup_Mips_TPREL_HI",     0,     16,   0 },
+      { "fixup_Mips_TPREL_LO",     0,     16,   0 },
+      { "fixup_Mips_TLSLDM",       0,     16,   0 },
+      { "fixup_Mips_DTPREL_HI",    0,     16,   0 },
+      { "fixup_Mips_DTPREL_LO",    0,     16,   0 },
+      { "fixup_Mips_Branch_PCRel", 0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Mips_GPOFF_HI",     0,     16,   0 },
+      { "fixup_Mips_GPOFF_LO",     0,     16,   0 },
+      { "fixup_Mips_GOT_PAGE",     0,     16,   0 },
+      { "fixup_Mips_GOT_OFST",     0,     16,   0 },
+      { "fixup_Mips_GOT_DISP",     0,     16,   0 },
+      { "fixup_Mips_HIGHER",       0,     16,   0 },
+      { "fixup_Mips_HIGHEST",      0,     16,   0 },
+      { "fixup_Mips_GOT_HI16",     0,     16,   0 },
+      { "fixup_Mips_GOT_LO16",     0,     16,   0 },
+      { "fixup_Mips_CALL_HI16",    0,     16,   0 },
+      { "fixup_Mips_CALL_LO16",    0,     16,   0 }
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  /// @name Target Relaxation Interfaces
+  /// @{
+
+  /// MayNeedRelaxation - Check whether the given instruction may need
+  /// relaxation.
+  ///
+  /// \param Inst - The instruction to test.
+  bool mayNeedRelaxation(const MCInst &Inst) const {
+    return false;
+  }
+
+  /// fixupNeedsRelaxation - Target specific predicate for whether a given
+  /// fixup requires the associated instruction to be relaxed.
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const {
+    // FIXME.
+    assert(0 && "RelaxInstruction() unimplemented");
+    return false;
+  }
+
+  /// RelaxInstruction - Relax the instruction in the given fragment
+  /// to the next wider instruction.
+  ///
+  /// \param Inst - The instruction to relax, which may be the same
+  /// as the output.
+  /// \param [out] Res On return, the relaxed instruction.
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  }
+
+  /// @}
+
+  /// WriteNopData - Write an (optimal) nop sequence of Count bytes
+  /// to the given output. If the target cannot generate such a sequence,
+  /// it should return an error.
+  ///
+  /// \return - True on success.
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+    // Check for a less than instruction size number of bytes
+    // FIXME: 16 bit instructions are not handled yet here.
+    // We shouldn't be using a hard coded number for instruction size.
+    if (Count % 4) return false;
+
+    uint64_t NumNops = Count / 4;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->Write32(0);
+    return true;
+  }
+}; // class MipsAsmBackend
+
+} // namespace
+
+// MCAsmBackend
+MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, StringRef TT,
+                                             StringRef CPU) {
+  return new MipsAsmBackend(T, Triple(TT).getOS(),
+                            /*IsLittle*/true, /*Is64Bit*/false);
+}
+
+MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, StringRef TT,
+                                             StringRef CPU) {
+  return new MipsAsmBackend(T, Triple(TT).getOS(),
+                            /*IsLittle*/false, /*Is64Bit*/false);
+}
+
+MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, StringRef TT,
+                                             StringRef CPU) {
+  return new MipsAsmBackend(T, Triple(TT).getOS(),
+                            /*IsLittle*/true, /*Is64Bit*/true);
+}
+
+MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, StringRef TT,
+                                             StringRef CPU) {
+  return new MipsAsmBackend(T, Triple(TT).getOS(),
+                            /*IsLittle*/false, /*Is64Bit*/true);
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
new file mode 100644
index 000000000000..7a55efd5c330
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -0,0 +1,153 @@
+//===-- MipsBaseInfo.h - Top level definitions for MIPS MC ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Mips target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MIPSBASEINFO_H
+#define MIPSBASEINFO_H
+
+#include "MipsFixupKinds.h"
+#include "MipsMCTargetDesc.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// MipsII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MipsII {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // Mips Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_GOT16 - Represents the offset into the global offset table at which
+    /// the address the relocation entry symbol resides during execution.
+    MO_GOT16,
+    MO_GOT,
+
+    /// MO_GOT_CALL - Represents the offset into the global offset table at
+    /// which the address of a call site relocation entry symbol resides
+    /// during execution. This is different from the above since this flag
+    /// can only be present in call instructions.
+    MO_GOT_CALL,
+
+    /// MO_GPREL - Represents the offset from the current gp value to be used
+    /// for the relocatable object file being produced.
+    MO_GPREL,
+
+    /// MO_ABS_HI/LO - Represents the hi or low part of an absolute symbol
+    /// address.
+    MO_ABS_HI,
+    MO_ABS_LO,
+
+    /// MO_TLSGD - Represents the offset into the global offset table at which
+    // the module ID and TSL block offset reside during execution (General
+    // Dynamic TLS).
+    MO_TLSGD,
+
+    /// MO_TLSLDM - Represents the offset into the global offset table at which
+    // the module ID and TSL block offset reside during execution (Local
+    // Dynamic TLS).
+    MO_TLSLDM,
+    MO_DTPREL_HI,
+    MO_DTPREL_LO,
+
+    /// MO_GOTTPREL - Represents the offset from the thread pointer (Initial
+    // Exec TLS).
+    MO_GOTTPREL,
+
+    /// MO_TPREL_HI/LO - Represents the hi and low part of the offset from
+    // the thread pointer (Local Exec TLS).
+    MO_TPREL_HI,
+    MO_TPREL_LO,
+
+    // N32/64 Flags.
+    MO_GPOFF_HI,
+    MO_GPOFF_LO,
+    MO_GOT_DISP,
+    MO_GOT_PAGE,
+    MO_GOT_OFST,
+
+    /// MO_HIGHER/HIGHEST - Represents the highest or higher half word of a
+    /// 64-bit symbol address.
+    MO_HIGHER,
+    MO_HIGHEST,
+
+    /// MO_GOT_HI16/LO16, MO_CALL_HI16/LO16 - Relocations used for large GOTs.
+    MO_GOT_HI16,
+    MO_GOT_LO16,
+    MO_CALL_HI16,
+    MO_CALL_LO16
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction encodings.  These are the standard/most common forms for
+    // Mips instructions.
+    //
+
+    // Pseudo - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo   = 0,
+
+    /// FrmR - This form is for instructions of the format R.
+    FrmR  = 1,
+    /// FrmI - This form is for instructions of the format I.
+    FrmI  = 2,
+    /// FrmJ - This form is for instructions of the format J.
+    FrmJ  = 3,
+    /// FrmFR - This form is for instructions of the format FR.
+    FrmFR = 4,
+    /// FrmFI - This form is for instructions of the format FI.
+    FrmFI = 5,
+    /// FrmOther - This form is for instructions that have no specific format.
+    FrmOther = 6,
+
+    FormMask = 15
+  };
+}
+
+inline static std::pair<const MCSymbolRefExpr*, int64_t>
+MipsGetSymAndOffset(const MCFixup &Fixup) {
+  MCFixupKind FixupKind = Fixup.getKind();
+
+  if ((FixupKind < FirstTargetFixupKind) ||
+      (FixupKind >= MCFixupKind(Mips::LastTargetFixupKind)))
+    return std::make_pair((const MCSymbolRefExpr*)0, (int64_t)0);
+
+  const MCExpr *Expr = Fixup.getValue();
+  MCExpr::ExprKind Kind = Expr->getKind();
+
+  if (Kind == MCExpr::Binary) {
+    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr*>(Expr);
+    const MCExpr *LHS = BE->getLHS();
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
+
+    if ((LHS->getKind() != MCExpr::SymbolRef) || !CE)
+      return std::make_pair((const MCSymbolRefExpr*)0, (int64_t)0);
+
+    return std::make_pair(cast<MCSymbolRefExpr>(LHS), CE->getValue());
+  }
+
+  if (Kind != MCExpr::SymbolRef)
+    return std::make_pair((const MCSymbolRefExpr*)0, (int64_t)0);
+
+  return std::make_pair(cast<MCSymbolRefExpr>(Expr), 0);
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp
new file mode 100644
index 000000000000..15c4282030db
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp
@@ -0,0 +1,81 @@
+//===-- MipsDirectObjLower.cpp - Mips LLVM direct object lowering -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Mips MCInst records that are normally
+// left to the assembler to lower such as large shifts.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsInstrInfo.h"
+#include "MCTargetDesc/MipsDirectObjLower.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+// If the D<shift> instruction has a shift amount that is greater
+// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
+void Mips::LowerLargeShift(MCInst& Inst) {
+
+  assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
+  assert(Inst.getOperand(2).isImm());
+
+  int64_t Shift = Inst.getOperand(2).getImm();
+  if (Shift <= 31)
+    return; // Do nothing
+  Shift -= 32;
+
+  // saminus32
+  Inst.getOperand(2).setImm(Shift);
+
+  switch (Inst.getOpcode()) {
+  default:
+    // Calling function is not synchronized
+    llvm_unreachable("Unexpected shift instruction");
+  case Mips::DSLL:
+    Inst.setOpcode(Mips::DSLL32);
+    return;
+  case Mips::DSRL:
+    Inst.setOpcode(Mips::DSRL32);
+    return;
+  case Mips::DSRA:
+    Inst.setOpcode(Mips::DSRA32);
+    return;
+  }
+}
+
+// Pick a DEXT or DINS instruction variant based on the pos and size operands
+void Mips::LowerDextDins(MCInst& InstIn) {
+  int Opcode = InstIn.getOpcode();
+
+  if (Opcode == Mips::DEXT)
+    assert(InstIn.getNumOperands() == 4 &&
+           "Invalid no. of machine operands for DEXT!");
+  else // Only DEXT and DINS are possible
+    assert(InstIn.getNumOperands() == 5 &&
+           "Invalid no. of machine operands for DINS!");
+
+  assert(InstIn.getOperand(2).isImm());
+  int64_t pos = InstIn.getOperand(2).getImm();
+  assert(InstIn.getOperand(3).isImm());
+  int64_t size = InstIn.getOperand(3).getImm();
+
+  if (size <= 32) {
+    if (pos < 32)  // DEXT/DINS, do nothing
+      return;
+    // DEXTU/DINSU
+    InstIn.getOperand(2).setImm(pos - 32);
+    InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
+    return;
+  }
+  // DEXTM/DINSM
+  assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
+  InstIn.getOperand(3).setImm(size - 32);
+  InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
+  return;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h
new file mode 100644
index 000000000000..8813cc9ac7a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h
@@ -0,0 +1,28 @@
+//===-- MipsDirectObjLower.h - Mips LLVM direct object lowering *- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSDIRECTOBJLOWER_H
+#define MIPSDIRECTOBJLOWER_H
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class MCInst;
+  class MCStreamer;
+
+  namespace Mips {
+  /// MipsDirectObjLower - This name space is used to lower MCInstr in cases
+  //                       where the assembler usually finishes the lowering
+  //                       such as large shifts.
+    void LowerLargeShift(MCInst &Inst);
+    void LowerDextDins(MCInst &Inst);
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
new file mode 100644
index 000000000000..6471b51583ce
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -0,0 +1,285 @@
+//===-- MipsELFObjectWriter.cpp - Mips ELF Writer -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <list>
+
+using namespace llvm;
+
+namespace {
+  struct RelEntry {
+    RelEntry(const ELFRelocationEntry &R, const MCSymbol *S, int64_t O) :
+      Reloc(R), Sym(S), Offset(O) {}
+    ELFRelocationEntry Reloc;
+    const MCSymbol *Sym;
+    int64_t Offset;
+  };
+
+  typedef std::list<RelEntry> RelLs;
+  typedef RelLs::iterator RelLsIter;
+
+  class MipsELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
+                        bool _isN64, bool IsLittleEndian);
+
+    virtual ~MipsELFObjectWriter();
+
+    virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                                  bool IsPCRel, bool IsRelocWithSymbol,
+                                  int64_t Addend) const;
+    virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
+                                           const MCValue &Target,
+                                           const MCFragment &F,
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const;
+    virtual void sortRelocs(const MCAssembler &Asm,
+                            std::vector<ELFRelocationEntry> &Relocs);
+  };
+}
+
+MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
+                                         bool _isN64, bool IsLittleEndian)
+  : MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
+                            /*HasRelocationAddend*/ (_isN64) ? true : false,
+                            /*IsN64*/ _isN64) {}
+
+MipsELFObjectWriter::~MipsELFObjectWriter() {}
+
+const MCSymbol *MipsELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
+                                                    const MCValue &Target,
+                                                    const MCFragment &F,
+                                                    const MCFixup &Fixup,
+                                                    bool IsPCRel) const {
+  assert(Target.getSymA() && "SymA cannot be 0.");
+  const MCSymbol &Sym = Target.getSymA()->getSymbol().AliasedSymbol();
+
+  if (Sym.getSection().getKind().isMergeableCString() ||
+      Sym.getSection().getKind().isMergeableConst())
+    return &Sym;
+
+  return NULL;
+}
+
+unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel,
+                                           bool IsRelocWithSymbol,
+                                           int64_t Addend) const {
+  // determine the type of the relocation
+  unsigned Type = (unsigned)ELF::R_MIPS_NONE;
+  unsigned Kind = (unsigned)Fixup.getKind();
+
+  switch (Kind) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_Data_4:
+    Type = ELF::R_MIPS_32;
+    break;
+  case FK_Data_8:
+    Type = ELF::R_MIPS_64;
+    break;
+  case FK_GPRel_4:
+    if (isN64()) {
+      Type = setRType((unsigned)ELF::R_MIPS_GPREL32, Type);
+      Type = setRType2((unsigned)ELF::R_MIPS_64, Type);
+      Type = setRType3((unsigned)ELF::R_MIPS_NONE, Type);
+    }
+    else
+      Type = ELF::R_MIPS_GPREL32;
+    break;
+  case Mips::fixup_Mips_GPREL16:
+    Type = ELF::R_MIPS_GPREL16;
+    break;
+  case Mips::fixup_Mips_26:
+    Type = ELF::R_MIPS_26;
+    break;
+  case Mips::fixup_Mips_CALL16:
+    Type = ELF::R_MIPS_CALL16;
+    break;
+  case Mips::fixup_Mips_GOT_Global:
+  case Mips::fixup_Mips_GOT_Local:
+    Type = ELF::R_MIPS_GOT16;
+    break;
+  case Mips::fixup_Mips_HI16:
+    Type = ELF::R_MIPS_HI16;
+    break;
+  case Mips::fixup_Mips_LO16:
+    Type = ELF::R_MIPS_LO16;
+    break;
+  case Mips::fixup_Mips_TLSGD:
+    Type = ELF::R_MIPS_TLS_GD;
+    break;
+  case Mips::fixup_Mips_GOTTPREL:
+    Type = ELF::R_MIPS_TLS_GOTTPREL;
+    break;
+  case Mips::fixup_Mips_TPREL_HI:
+    Type = ELF::R_MIPS_TLS_TPREL_HI16;
+    break;
+  case Mips::fixup_Mips_TPREL_LO:
+    Type = ELF::R_MIPS_TLS_TPREL_LO16;
+    break;
+  case Mips::fixup_Mips_TLSLDM:
+    Type = ELF::R_MIPS_TLS_LDM;
+    break;
+  case Mips::fixup_Mips_DTPREL_HI:
+    Type = ELF::R_MIPS_TLS_DTPREL_HI16;
+    break;
+  case Mips::fixup_Mips_DTPREL_LO:
+    Type = ELF::R_MIPS_TLS_DTPREL_LO16;
+    break;
+  case Mips::fixup_Mips_Branch_PCRel:
+  case Mips::fixup_Mips_PC16:
+    Type = ELF::R_MIPS_PC16;
+    break;
+  case Mips::fixup_Mips_GOT_PAGE:
+    Type = ELF::R_MIPS_GOT_PAGE;
+    break;
+  case Mips::fixup_Mips_GOT_OFST:
+    Type = ELF::R_MIPS_GOT_OFST;
+    break;
+  case Mips::fixup_Mips_GOT_DISP:
+    Type = ELF::R_MIPS_GOT_DISP;
+    break;
+  case Mips::fixup_Mips_GPOFF_HI:
+    Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+    Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+    Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type);
+    break;
+  case Mips::fixup_Mips_GPOFF_LO:
+    Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
+    Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
+    Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type);
+    break;
+  case Mips::fixup_Mips_HIGHER:
+    Type = ELF::R_MIPS_HIGHER;
+    break;
+  case Mips::fixup_Mips_HIGHEST:
+    Type = ELF::R_MIPS_HIGHEST;
+    break;
+  case Mips::fixup_Mips_GOT_HI16:
+    Type = ELF::R_MIPS_GOT_HI16;
+    break;
+  case Mips::fixup_Mips_GOT_LO16:
+    Type = ELF::R_MIPS_GOT_LO16;
+    break;
+  case Mips::fixup_Mips_CALL_HI16:
+    Type = ELF::R_MIPS_CALL_HI16;
+    break;
+  case Mips::fixup_Mips_CALL_LO16:
+    Type = ELF::R_MIPS_CALL_LO16;
+    break;
+  }
+  return Type;
+}
+
+// Return true if R is either a GOT16 against a local symbol or HI16.
+static bool NeedsMatchingLo(const MCAssembler &Asm, const RelEntry &R) {
+  if (!R.Sym)
+    return false;
+
+  MCSymbolData &SD = Asm.getSymbolData(R.Sym->AliasedSymbol());
+
+  return ((R.Reloc.Type == ELF::R_MIPS_GOT16) && !SD.isExternal()) ||
+    (R.Reloc.Type == ELF::R_MIPS_HI16);
+}
+
+static bool HasMatchingLo(const MCAssembler &Asm, RelLsIter I, RelLsIter Last) {
+  if (I == Last)
+    return false;
+
+  RelLsIter Hi = I++;
+
+  return (I->Reloc.Type == ELF::R_MIPS_LO16) && (Hi->Sym == I->Sym) &&
+    (Hi->Offset == I->Offset);
+}
+
+static bool HasSameSymbol(const RelEntry &R0, const RelEntry &R1) {
+  return R0.Sym == R1.Sym;
+}
+
+static int CompareOffset(const RelEntry &R0, const RelEntry &R1) {
+  return (R0.Offset > R1.Offset) ? 1 : ((R0.Offset == R1.Offset) ? 0 : -1);
+}
+
+void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
+                                     std::vector<ELFRelocationEntry> &Relocs) {
+  // Call the default function first. Relocations are sorted in descending
+  // order of r_offset.
+  MCELFObjectTargetWriter::sortRelocs(Asm, Relocs);
+
+  RelLs RelocLs;
+  std::vector<RelLsIter> Unmatched;
+
+  // Fill RelocLs. Traverse Relocs backwards so that relocations in RelocLs
+  // are in ascending order of r_offset.
+  for (std::vector<ELFRelocationEntry>::reverse_iterator R = Relocs.rbegin();
+       R != Relocs.rend(); ++R) {
+     std::pair<const MCSymbolRefExpr*, int64_t> P =
+       MipsGetSymAndOffset(*R->Fixup);
+     RelocLs.push_back(RelEntry(*R, P.first ? &P.first->getSymbol() : 0,
+                                P.second));
+  }
+
+  // Get list of unmatched HI16 and GOT16.
+  for (RelLsIter R = RelocLs.begin(); R != RelocLs.end(); ++R)
+    if (NeedsMatchingLo(Asm, *R) && !HasMatchingLo(Asm, R, --RelocLs.end()))
+      Unmatched.push_back(R);
+
+  // Insert unmatched HI16 and GOT16 immediately before their matching LO16.
+  for (std::vector<RelLsIter>::iterator U = Unmatched.begin();
+       U != Unmatched.end(); ++U) {
+    RelLsIter LoPos = RelocLs.end(), HiPos = *U;
+    bool MatchedLo = false;
+
+    for (RelLsIter R = RelocLs.begin(); R != RelocLs.end(); ++R) {
+      if ((R->Reloc.Type == ELF::R_MIPS_LO16) && HasSameSymbol(*HiPos, *R) &&
+          (CompareOffset(*R, *HiPos) >= 0) &&
+          ((LoPos == RelocLs.end()) || ((CompareOffset(*R, *LoPos) < 0)) ||
+           (!MatchedLo && !CompareOffset(*R, *LoPos))))
+        LoPos = R;
+
+      MatchedLo = NeedsMatchingLo(Asm, *R) &&
+        HasMatchingLo(Asm, R, --RelocLs.end());
+    }
+
+    // If a matching LoPos was found, move HiPos and insert it before LoPos.
+    // Make the offsets of HiPos and LoPos match.
+    if (LoPos != RelocLs.end()) {
+      HiPos->Offset = LoPos->Offset;
+      RelocLs.insert(LoPos, *HiPos);
+      RelocLs.erase(HiPos);
+    }
+  }
+
+  // Put the sorted list back in reverse order.
+  assert(Relocs.size() == RelocLs.size());
+  unsigned I = RelocLs.size();
+
+  for (RelLsIter R = RelocLs.begin(); R != RelocLs.end(); ++R)
+    Relocs[--I] = R->Reloc;
+}
+
+MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS,
+                                                uint8_t OSABI,
+                                                bool IsLittleEndian,
+                                                bool Is64Bit) {
+  MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI,
+                                                (Is64Bit) ? true : false,
+                                                IsLittleEndian);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
new file mode 100644
index 000000000000..c33bc9ae3034
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -0,0 +1,89 @@
+//===-- MipsELFStreamer.cpp - MipsELFStreamer ---------------------------===//
+//
+//                       The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===-------------------------------------------------------------------===//
+#include "MCTargetDesc/MipsELFStreamer.h"
+#include "MipsSubtarget.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFSymbolFlags.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+  MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                       raw_ostream &OS, MCCodeEmitter *Emitter,
+                                       bool RelaxAll, bool NoExecStack) {
+    MipsELFStreamer *S = new MipsELFStreamer(Context, TAB, OS, Emitter,
+                                             RelaxAll, NoExecStack);
+    return S;
+  }
+
+  // For llc. Set a group of ELF header flags
+  void
+  MipsELFStreamer::emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget) {
+
+    if (hasRawTextSupport())
+      return;
+
+    // Update e_header flags
+    MCAssembler& MCA = getAssembler();
+    unsigned EFlags = MCA.getELFHeaderEFlags();
+
+    if (Subtarget.inMips16Mode())
+      EFlags |= ELF::EF_MIPS_ARCH_ASE_M16;
+    else
+      EFlags |= ELF::EF_MIPS_NOREORDER;
+
+    // Architecture
+    if (Subtarget.hasMips64r2())
+      EFlags |= ELF::EF_MIPS_ARCH_64R2;
+    else if (Subtarget.hasMips64())
+      EFlags |= ELF::EF_MIPS_ARCH_64;
+    else if (Subtarget.hasMips32r2())
+      EFlags |= ELF::EF_MIPS_ARCH_32R2;
+    else
+      EFlags |= ELF::EF_MIPS_ARCH_32;
+
+    if (Subtarget.inMicroMipsMode())
+      EFlags |= ELF::EF_MIPS_MICROMIPS;
+
+    // ABI
+    if (Subtarget.isABI_O32())
+      EFlags |= ELF::EF_MIPS_ABI_O32;
+
+    // Relocation Model
+    Reloc::Model RM = Subtarget.getRelocationModel();
+    if (RM == Reloc::PIC_ || RM == Reloc::Default)
+      EFlags |= ELF::EF_MIPS_PIC;
+    else if (RM == Reloc::Static)
+      ; // Do nothing for Reloc::Static
+    else
+      llvm_unreachable("Unsupported relocation model for e_flags");
+
+    MCA.setELFHeaderEFlags(EFlags);
+  }
+
+  // For llc. Set a symbol's STO flags
+  void
+  MipsELFStreamer::emitMipsSTOCG(const MipsSubtarget &Subtarget,
+                                 MCSymbol *Sym,
+                                 unsigned Val) {
+
+    if (hasRawTextSupport())
+      return;
+
+    MCSymbolData &Data = getOrCreateSymbolData(Sym);
+    // The "other" values are stored in the last 6 bits of the second byte
+    // The traditional defines for STO values assume the full byte and thus
+    // the shift to pack it.
+    MCELF::setOther(Data, Val >> 2);
+  }
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
new file mode 100644
index 000000000000..b10ccc78e665
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -0,0 +1,43 @@
+//=== MipsELFStreamer.h - MipsELFStreamer ------------------------------===//
+//
+//                    The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENCE.TXT for details.
+//
+//===-------------------------------------------------------------------===//
+#ifndef MIPSELFSTREAMER_H_
+#define MIPSELFSTREAMER_H_
+
+#include "llvm/MC/MCELFStreamer.h"
+
+namespace llvm {
+class MipsAsmPrinter;
+class MipsSubtarget;
+class MCSymbol;
+
+class MipsELFStreamer : public MCELFStreamer {
+public:
+  MipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                  raw_ostream &OS, MCCodeEmitter *Emitter,
+                  bool RelaxAll, bool NoExecStack)
+    : MCELFStreamer(SK_MipsELFStreamer, Context, TAB, OS, Emitter) {
+  }
+
+  ~MipsELFStreamer() {}
+  void emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget);
+  void emitMipsSTOCG(const MipsSubtarget &Subtarget,
+                     MCSymbol *Sym,
+                     unsigned Val);
+
+  static bool classof(const MCStreamer *S) {
+    return S->getKind() == SK_MipsELFStreamer;
+  }
+};
+
+  MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                       raw_ostream &OS, MCCodeEmitter *Emitter,
+                                       bool RelaxAll, bool NoExecStack);
+}
+
+#endif /* MIPSELFSTREAMER_H_ */
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
new file mode 100644
index 000000000000..f96390043a3b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -0,0 +1,139 @@
+//===-- MipsFixupKinds.h - Mips Specific Fixup Entries ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MIPS_MIPSFIXUPKINDS_H
+#define LLVM_MIPS_MIPSFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Mips {
+  // Although most of the current fixup types reflect a unique relocation
+  // one can have multiple fixup types for a given relocation and thus need
+  // to be uniquely named.
+  //
+  // This table *must* be in the save order of
+  // MCFixupKindInfo Infos[Mips::NumTargetFixupKinds]
+  // in MipsAsmBackend.cpp.
+  //
+  enum Fixups {
+    // Branch fixups resulting in R_MIPS_16.
+    fixup_Mips_16 = FirstTargetFixupKind,
+
+    // Pure 32 bit data fixup resulting in - R_MIPS_32.
+    fixup_Mips_32,
+
+    // Full 32 bit data relative data fixup resulting in - R_MIPS_REL32.
+    fixup_Mips_REL32,
+
+    // Jump 26 bit fixup resulting in - R_MIPS_26.
+    fixup_Mips_26,
+
+    // Pure upper 16 bit fixup resulting in - R_MIPS_HI16.
+    fixup_Mips_HI16,
+
+    // Pure lower 16 bit fixup resulting in - R_MIPS_LO16.
+    fixup_Mips_LO16,
+
+    // 16 bit fixup for GP offest resulting in - R_MIPS_GPREL16.
+    fixup_Mips_GPREL16,
+
+    // 16 bit literal fixup resulting in - R_MIPS_LITERAL.
+    fixup_Mips_LITERAL,
+
+    // Global symbol fixup resulting in - R_MIPS_GOT16.
+    fixup_Mips_GOT_Global,
+
+    // Local symbol fixup resulting in - R_MIPS_GOT16.
+    fixup_Mips_GOT_Local,
+
+    // PC relative branch fixup resulting in - R_MIPS_PC16.
+    fixup_Mips_PC16,
+
+    // resulting in - R_MIPS_CALL16.
+    fixup_Mips_CALL16,
+
+    // resulting in - R_MIPS_GPREL32.
+    fixup_Mips_GPREL32,
+
+    // resulting in - R_MIPS_SHIFT5.
+    fixup_Mips_SHIFT5,
+
+    // resulting in - R_MIPS_SHIFT6.
+    fixup_Mips_SHIFT6,
+
+    // Pure 64 bit data fixup resulting in - R_MIPS_64.
+    fixup_Mips_64,
+
+    // resulting in - R_MIPS_TLS_GD.
+    fixup_Mips_TLSGD,
+
+    // resulting in - R_MIPS_TLS_GOTTPREL.
+    fixup_Mips_GOTTPREL,
+
+    // resulting in - R_MIPS_TLS_TPREL_HI16.
+    fixup_Mips_TPREL_HI,
+
+    // resulting in - R_MIPS_TLS_TPREL_LO16.
+    fixup_Mips_TPREL_LO,
+
+    // resulting in - R_MIPS_TLS_LDM.
+    fixup_Mips_TLSLDM,
+
+    // resulting in - R_MIPS_TLS_DTPREL_HI16.
+    fixup_Mips_DTPREL_HI,
+
+    // resulting in - R_MIPS_TLS_DTPREL_LO16.
+    fixup_Mips_DTPREL_LO,
+
+    // PC relative branch fixup resulting in - R_MIPS_PC16
+    fixup_Mips_Branch_PCRel,
+
+    // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_HI16
+    fixup_Mips_GPOFF_HI,
+
+    // resulting in - R_MIPS_GPREL16/R_MIPS_SUB/R_MIPS_LO16
+    fixup_Mips_GPOFF_LO,
+
+    // resulting in - R_MIPS_PAGE
+    fixup_Mips_GOT_PAGE,
+
+    // resulting in - R_MIPS_GOT_OFST
+    fixup_Mips_GOT_OFST,
+
+    // resulting in - R_MIPS_GOT_DISP
+    fixup_Mips_GOT_DISP,
+
+    // resulting in - R_MIPS_GOT_HIGHER
+    fixup_Mips_HIGHER,
+
+    // resulting in - R_MIPS_HIGHEST
+    fixup_Mips_HIGHEST,
+
+    // resulting in - R_MIPS_GOT_HI16
+    fixup_Mips_GOT_HI16,
+
+    // resulting in - R_MIPS_GOT_LO16
+    fixup_Mips_GOT_LO16,
+
+    // resulting in - R_MIPS_CALL_HI16
+    fixup_Mips_CALL_HI16,
+
+    // resulting in - R_MIPS_CALL_LO16
+    fixup_Mips_CALL_LO16,
+
+    // Marker
+    LastTargetFixupKind,
+    NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+  };
+} // namespace Mips
+} // namespace llvm
+
+
+#endif // LLVM_MIPS_MIPSFIXUPKINDS_H
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
new file mode 100644
index 000000000000..5d4b32d30578
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -0,0 +1,47 @@
+//===-- MipsMCAsmInfo.cpp - Mips Asm Properties ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MipsMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void MipsMCAsmInfo::anchor() { }
+
+MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+  if ((TheTriple.getArch() == Triple::mips) ||
+      (TheTriple.getArch() == Triple::mips64))
+    IsLittleEndian = false;
+
+  if ((TheTriple.getArch() == Triple::mips64el) ||
+      (TheTriple.getArch() == Triple::mips64)) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+
+  AlignmentIsInBytes          = false;
+  Data16bitsDirective         = "\t.2byte\t";
+  Data32bitsDirective         = "\t.4byte\t";
+  Data64bitsDirective         = "\t.8byte\t";
+  PrivateGlobalPrefix         = "$";
+  CommentString               = "#";
+  ZeroDirective               = "\t.space\t";
+  GPRel32Directive            = "\t.gpword\t";
+  GPRel64Directive            = "\t.gpdword\t";
+  WeakRefDirective            = "\t.weak\t";
+  DebugLabelSuffix            = "=.";
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+  HasLEB128 = true;
+  DwarfRegNumForCFI = true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
new file mode 100644
index 000000000000..e1d878936f31
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- MipsMCAsmInfo.h - Mips Asm Info ------------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETASMINFO_H
+#define MIPSTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class StringRef;
+  class Target;
+
+  class MipsMCAsmInfo : public MCAsmInfo {
+    virtual void anchor();
+  public:
+    explicit MipsMCAsmInfo(const Target &T, StringRef TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
new file mode 100644
index 000000000000..e198a7c983f0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -0,0 +1,344 @@
+//===-- MipsMCCodeEmitter.cpp - Convert Mips Code to Machine Code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsDirectObjLower.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class MipsMCCodeEmitter : public MCCodeEmitter {
+  MipsMCCodeEmitter(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  MCContext &Ctx;
+  bool IsLittleEndian;
+
+public:
+  MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_,
+                    const MCSubtargetInfo &sti, bool IsLittle) :
+    MCII(mcii), Ctx(Ctx_), IsLittleEndian(IsLittle) {}
+
+  ~MipsMCCodeEmitter() {}
+
+  void EmitByte(unsigned char C, raw_ostream &OS) const {
+    OS << (char)C;
+  }
+
+  void EmitInstruction(uint64_t Val, unsigned Size, raw_ostream &OS) const {
+    // Output the instruction encoding in little endian byte order.
+    for (unsigned i = 0; i < Size; ++i) {
+      unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+      EmitByte((Val >> Shift) & 0xff, OS);
+    }
+  }
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+  // getBranchJumpOpValue - Return binary encoding of the jump
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+   unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+   // getBranchTargetOpValue - Return binary encoding of the branch
+   // target operand. If the machine operand requires relocation,
+   // record the relocation and return zero.
+  unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+
+   // getMachineOpValue - Return binary encoding of operand. If the machin
+   // operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
+                          SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+}; // class MipsMCCodeEmitter
+}  // namespace
+
+MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+                                               const MCRegisterInfo &MRI,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx)
+{
+  return new MipsMCCodeEmitter(MCII, Ctx, STI, false);
+}
+
+MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+                                               const MCRegisterInfo &MRI,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx)
+{
+  return new MipsMCCodeEmitter(MCII, Ctx, STI, true);
+}
+
+/// EncodeInstruction - Emit the instruction.
+/// Size the instruction (currently only 4 bytes
+void MipsMCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const
+{
+
+  // Non-pseudo instructions that get changed for direct object
+  // only based on operand values.
+  // If this list of instructions get much longer we will move
+  // the check to a function call. Until then, this is more efficient.
+  MCInst TmpInst = MI;
+  switch (MI.getOpcode()) {
+  // If shift amount is >= 32 it the inst needs to be lowered further
+  case Mips::DSLL:
+  case Mips::DSRL:
+  case Mips::DSRA:
+    Mips::LowerLargeShift(TmpInst);
+    break;
+    // Double extract instruction is chosen by pos and size operands
+  case Mips::DEXT:
+  case Mips::DINS:
+    Mips::LowerDextDins(TmpInst);
+  }
+
+  uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups);
+
+  // Check for unimplemented opcodes.
+  // Unfortunately in MIPS both NOP and SLL will come in with Binary == 0
+  // so we have to special check for them.
+  unsigned Opcode = TmpInst.getOpcode();
+  if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) && !Binary)
+    llvm_unreachable("unimplemented opcode in EncodeInstruction()");
+
+  const MCInstrDesc &Desc = MCII.get(TmpInst.getOpcode());
+
+  // Get byte count of instruction
+  unsigned Size = Desc.getSize();
+  if (!Size)
+    llvm_unreachable("Desc.getSize() returns 0");
+
+  EmitInstruction(Binary, Size, OS);
+}
+
+/// getBranchTargetOpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValue expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_Mips_PC16)));
+  return 0;
+}
+
+/// getJumpTargetOpValue - Return binary encoding of the jump
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm()>>2;
+
+  assert(MO.isExpr() &&
+         "getJumpTargetOpValue expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_Mips_26)));
+  return 0;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+    unsigned RegNo = Ctx.getRegisterInfo().getEncodingValue(Reg);
+    return RegNo;
+  } else if (MO.isImm()) {
+    return static_cast<unsigned>(MO.getImm());
+  } else if (MO.isFPImm()) {
+    return static_cast<unsigned>(APFloat(MO.getFPImm())
+        .bitcastToAPInt().getHiBits(32).getLimitedValue());
+  }
+
+  // MO must be an Expr.
+  assert(MO.isExpr());
+
+  const MCExpr *Expr = MO.getExpr();
+  MCExpr::ExprKind Kind = Expr->getKind();
+
+  if (Kind == MCExpr::Binary) {
+    Expr = static_cast<const MCBinaryExpr*>(Expr)->getLHS();
+    Kind = Expr->getKind();
+  }
+
+  assert (Kind == MCExpr::SymbolRef);
+
+  Mips::Fixups FixupKind = Mips::Fixups(0);
+
+  switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
+  default: llvm_unreachable("Unknown fixup kind!");
+    break;
+  case MCSymbolRefExpr::VK_Mips_GPOFF_HI :
+    FixupKind = Mips::fixup_Mips_GPOFF_HI;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GPOFF_LO :
+    FixupKind = Mips::fixup_Mips_GPOFF_LO;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GOT_PAGE :
+    FixupKind = Mips::fixup_Mips_GOT_PAGE;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GOT_OFST :
+    FixupKind = Mips::fixup_Mips_GOT_OFST;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GOT_DISP :
+    FixupKind = Mips::fixup_Mips_GOT_DISP;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GPREL:
+    FixupKind = Mips::fixup_Mips_GPREL16;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GOT_CALL:
+    FixupKind = Mips::fixup_Mips_CALL16;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GOT16:
+    FixupKind = Mips::fixup_Mips_GOT_Global;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GOT:
+    FixupKind = Mips::fixup_Mips_GOT_Local;
+    break;
+  case MCSymbolRefExpr::VK_Mips_ABS_HI:
+    FixupKind = Mips::fixup_Mips_HI16;
+    break;
+  case MCSymbolRefExpr::VK_Mips_ABS_LO:
+    FixupKind = Mips::fixup_Mips_LO16;
+    break;
+  case MCSymbolRefExpr::VK_Mips_TLSGD:
+    FixupKind = Mips::fixup_Mips_TLSGD;
+    break;
+  case MCSymbolRefExpr::VK_Mips_TLSLDM:
+    FixupKind = Mips::fixup_Mips_TLSLDM;
+    break;
+  case MCSymbolRefExpr::VK_Mips_DTPREL_HI:
+    FixupKind = Mips::fixup_Mips_DTPREL_HI;
+    break;
+  case MCSymbolRefExpr::VK_Mips_DTPREL_LO:
+    FixupKind = Mips::fixup_Mips_DTPREL_LO;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GOTTPREL:
+    FixupKind = Mips::fixup_Mips_GOTTPREL;
+    break;
+  case MCSymbolRefExpr::VK_Mips_TPREL_HI:
+    FixupKind = Mips::fixup_Mips_TPREL_HI;
+    break;
+  case MCSymbolRefExpr::VK_Mips_TPREL_LO:
+    FixupKind = Mips::fixup_Mips_TPREL_LO;
+    break;
+  case MCSymbolRefExpr::VK_Mips_HIGHER:
+    FixupKind = Mips::fixup_Mips_HIGHER;
+    break;
+  case MCSymbolRefExpr::VK_Mips_HIGHEST:
+    FixupKind = Mips::fixup_Mips_HIGHEST;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GOT_HI16:
+    FixupKind = Mips::fixup_Mips_GOT_HI16;
+    break;
+  case MCSymbolRefExpr::VK_Mips_GOT_LO16:
+    FixupKind = Mips::fixup_Mips_GOT_LO16;
+    break;
+  case MCSymbolRefExpr::VK_Mips_CALL_HI16:
+    FixupKind = Mips::fixup_Mips_CALL_HI16;
+    break;
+  case MCSymbolRefExpr::VK_Mips_CALL_LO16:
+    FixupKind = Mips::fixup_Mips_CALL_LO16;
+    break;
+  } // switch
+
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), MCFixupKind(FixupKind)));
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getMemEncoding - Return binary encoding of memory related operand.
+/// If the offset operand requires relocation, record the relocation.
+unsigned
+MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo),Fixups) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups);
+
+  return (OffBits & 0xFFFF) | RegBits;
+}
+
+unsigned
+MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups) const {
+  assert(MI.getOperand(OpNo).isImm());
+  unsigned SizeEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups);
+  return SizeEncoding - 1;
+}
+
+// FIXME: should be called getMSBEncoding
+//
+unsigned
+MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
+                                      SmallVectorImpl<MCFixup> &Fixups) const {
+  assert(MI.getOperand(OpNo-1).isImm());
+  assert(MI.getOperand(OpNo).isImm());
+  unsigned Position = getMachineOpValue(MI, MI.getOperand(OpNo-1), Fixups);
+  unsigned Size = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups);
+
+  return Position + Size - 1;
+}
+
+#include "MipsGenMCCodeEmitter.inc"
+
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
new file mode 100644
index 000000000000..be83b54b6124
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -0,0 +1,215 @@
+//===-- MipsMCTargetDesc.cpp - Mips Target Descriptions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsELFStreamer.h"
+#include "MipsMCTargetDesc.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "MipsGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "MipsGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "MipsGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static std::string ParseMipsTriple(StringRef TT, StringRef CPU) {
+  std::string MipsArchFeature;
+  size_t DashPosition = 0;
+  StringRef TheTriple;
+
+  // Let's see if there is a dash, like mips-unknown-linux.
+  DashPosition = TT.find('-');
+
+  if (DashPosition == StringRef::npos) {
+    // No dash, we check the string size.
+    TheTriple = TT.substr(0);
+  } else {
+    // We are only interested in substring before dash.
+    TheTriple = TT.substr(0,DashPosition);
+  }
+
+  if (TheTriple == "mips" || TheTriple == "mipsel") {
+    if (CPU.empty() || CPU == "mips32") {
+      MipsArchFeature = "+mips32";
+    } else if (CPU == "mips32r2") {
+      MipsArchFeature = "+mips32r2";
+    }
+  } else {
+      if (CPU.empty() || CPU == "mips64") {
+        MipsArchFeature = "+mips64";
+      } else if (CPU == "mips64r2") {
+        MipsArchFeature = "+mips64r2";
+      }
+  }
+  return MipsArchFeature;
+}
+
+static MCInstrInfo *createMipsMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitMipsMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitMipsMCRegisterInfo(X, Mips::RA);
+  return X;
+}
+
+static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                  StringRef FS) {
+  std::string ArchFS = ParseMipsTriple(TT,CPU);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = ArchFS + "," + FS.str();
+    else
+      ArchFS = FS;
+  }
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitMipsMCSubtargetInfo(X, TT, CPU, ArchFS);
+  return X;
+}
+
+static MCAsmInfo *createMipsMCAsmInfo(const Target &T, StringRef TT) {
+  MCAsmInfo *MAI = new MipsMCAsmInfo(T, TT);
+
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(Mips::SP, 0);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                              CodeModel::Model CM,
+                                              CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  if (CM == CodeModel::JITDefault)
+    RM = Reloc::Static;
+  else if (RM == Reloc::Default)
+    RM = Reloc::PIC_;
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
+                                              unsigned SyntaxVariant,
+                                              const MCAsmInfo &MAI,
+                                              const MCInstrInfo &MII,
+                                              const MCRegisterInfo &MRI,
+                                              const MCSubtargetInfo &STI) {
+  return new MipsInstPrinter(MAI, MII, MRI);
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &_OS,
+                                    MCCodeEmitter *_Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  return createMipsELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+extern "C" void LLVMInitializeMipsTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheMipsTarget, createMipsMCAsmInfo);
+  RegisterMCAsmInfoFn Y(TheMipselTarget, createMipsMCAsmInfo);
+  RegisterMCAsmInfoFn A(TheMips64Target, createMipsMCAsmInfo);
+  RegisterMCAsmInfoFn B(TheMips64elTarget, createMipsMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheMipsTarget,
+                                        createMipsMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheMipselTarget,
+                                        createMipsMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheMips64Target,
+                                        createMipsMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheMips64elTarget,
+                                        createMipsMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheMipsTarget, createMipsMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheMipselTarget, createMipsMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheMips64Target, createMipsMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheMips64elTarget,
+                                      createMipsMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheMipsTarget, createMipsMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheMipselTarget, createMipsMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheMips64Target, createMipsMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheMips64elTarget,
+                                    createMipsMCRegisterInfo);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(TheMipsTarget,
+                                        createMipsMCCodeEmitterEB);
+  TargetRegistry::RegisterMCCodeEmitter(TheMipselTarget,
+                                        createMipsMCCodeEmitterEL);
+  TargetRegistry::RegisterMCCodeEmitter(TheMips64Target,
+                                        createMipsMCCodeEmitterEB);
+  TargetRegistry::RegisterMCCodeEmitter(TheMips64elTarget,
+                                        createMipsMCCodeEmitterEL);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheMipsTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheMipselTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheMips64Target, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheMips64elTarget,
+                                           createMCStreamer);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheMipsTarget,
+                                       createMipsAsmBackendEB32);
+  TargetRegistry::RegisterMCAsmBackend(TheMipselTarget,
+                                       createMipsAsmBackendEL32);
+  TargetRegistry::RegisterMCAsmBackend(TheMips64Target,
+                                       createMipsAsmBackendEB64);
+  TargetRegistry::RegisterMCAsmBackend(TheMips64elTarget,
+                                       createMipsAsmBackendEL64);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheMipsTarget,
+                                          createMipsMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheMipselTarget,
+                                          createMipsMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheMips64Target,
+                                          createMipsMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheMips64elTarget,
+                                          createMipsMCSubtargetInfo);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheMipsTarget,
+                                        createMipsMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheMipselTarget,
+                                        createMipsMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheMips64Target,
+                                        createMipsMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheMips64elTarget,
+                                        createMipsMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
new file mode 100644
index 000000000000..71954a4bd862
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -0,0 +1,72 @@
+//===-- MipsMCTargetDesc.h - Mips Target Descriptions -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSMCTARGETDESC_H
+#define MIPSMCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class StringRef;
+class Target;
+class raw_ostream;
+
+extern Target TheMipsTarget;
+extern Target TheMipselTarget;
+extern Target TheMips64Target;
+extern Target TheMips64elTarget;
+
+MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx);
+MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx);
+
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T, StringRef TT,
+                                       StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T, StringRef TT,
+                                       StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T, StringRef TT,
+                                       StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T, StringRef TT,
+                                       StringRef CPU);
+
+MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS,
+                                          uint8_t OSABI,
+                                          bool IsLittleEndian,
+                                          bool Is64Bit);
+} // End llvm namespace
+
+// Defines symbolic names for Mips registers.  This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "MipsGenRegisterInfo.inc"
+
+// Defines symbolic names for the Mips instructions.
+#define GET_INSTRINFO_ENUM
+#include "MipsGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "MipsGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp
new file mode 100644
index 000000000000..1dc9bcb36a5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsReginfo.cpp
@@ -0,0 +1,80 @@
+//===-- MipsReginfo.cpp - Registerinfo handling  --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// .reginfo
+//    Elf32_Word ri_gprmask
+//    Elf32_Word ri_cprmask[4]
+//    Elf32_Word ri_gp_value
+//
+// .MIPS.options - N64
+//    Elf64_Byte    kind (ODK_REGINFO)
+//    Elf64_Byte    size (40 bytes)
+//    Elf64_Section section (0)
+//    Elf64_Word    info (unused)
+//    Elf64_Word    ri_gprmask ()
+//    Elf64_Word    ri_pad ()
+//    Elf64_Word[4] ri_cprmask ()
+//    Elf64_Addr    ri_gp_value ()
+//
+// .MIPS.options - N32
+//    Elf32_Byte    kind (ODK_REGINFO)
+//    Elf32_Byte    size (36 bytes)
+//    Elf32_Section section (0)
+//    Elf32_Word    info (unused)
+//    Elf32_Word    ri_gprmask ()
+//    Elf32_Word    ri_pad ()
+//    Elf32_Word[4] ri_cprmask ()
+//    Elf32_Addr    ri_gp_value ()
+//
+//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/MipsReginfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetObjectFile.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+// Integrated assembler version
+void
+MipsReginfo::emitMipsReginfoSectionCG(MCStreamer &OS,
+    const TargetLoweringObjectFile &TLOF,
+    const MipsSubtarget &MST) const
+{
+
+  if (OS.hasRawTextSupport())
+    return;
+
+  const MipsTargetObjectFile &TLOFELF =
+      static_cast<const MipsTargetObjectFile &>(TLOF);
+  OS.SwitchSection(TLOFELF.getReginfoSection());
+
+  // .reginfo
+  if (MST.isABI_O32()) {
+    OS.EmitIntValue(0, 4); // ri_gprmask
+    OS.EmitIntValue(0, 4); // ri_cpr[0]mask
+    OS.EmitIntValue(0, 4); // ri_cpr[1]mask
+    OS.EmitIntValue(0, 4); // ri_cpr[2]mask
+    OS.EmitIntValue(0, 4); // ri_cpr[3]mask
+    OS.EmitIntValue(0, 4); // ri_gp_value
+  }
+  // .MIPS.options
+  else if (MST.isABI_N64()) {
+    OS.EmitIntValue(1, 1); // kind
+    OS.EmitIntValue(40, 1); // size
+    OS.EmitIntValue(0, 2); // section
+    OS.EmitIntValue(0, 4); // info
+    OS.EmitIntValue(0, 4); // ri_gprmask
+    OS.EmitIntValue(0, 4); // pad
+    OS.EmitIntValue(0, 4); // ri_cpr[0]mask
+    OS.EmitIntValue(0, 4); // ri_cpr[1]mask
+    OS.EmitIntValue(0, 4); // ri_cpr[2]mask
+    OS.EmitIntValue(0, 4); // ri_cpr[3]mask
+    OS.EmitIntValue(0, 8); // ri_gp_value
+  }
+  else llvm_unreachable("Unsupported abi for reginfo");
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsReginfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsReginfo.h
new file mode 100644
index 000000000000..039b8eaaf287
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsReginfo.h
@@ -0,0 +1,31 @@
+//=== MipsReginfo.h - MipsReginfo -----------------------------------------===//
+//
+//                    The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENCE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSREGINFO_H
+#define MIPSREGINFO_H
+
+namespace llvm {
+  class MCStreamer;
+  class TargetLoweringObjectFile;
+  class MipsSubtarget;
+
+  class MipsReginfo {
+    void anchor();
+  public:
+    MipsReginfo() {}
+
+    void emitMipsReginfoSectionCG(MCStreamer &OS,
+        const TargetLoweringObjectFile &TLOF,
+        const MipsSubtarget &MST) const;
+  };
+
+} // namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips.h b/contrib/llvm/lib/Target/Mips/Mips.h
new file mode 100644
index 000000000000..8c65bb4020b5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips.h
@@ -0,0 +1,34 @@
+//===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM Mips back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_MIPS_H
+#define TARGET_MIPS_H
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class MipsTargetMachine;
+  class FunctionPass;
+
+  FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
+  FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
+  FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
+                                             JITCodeEmitter &JCE);
+  FunctionPass *createMipsConstantIslandPass(MipsTargetMachine &tm);
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td
new file mode 100644
index 000000000000..eefb02a494ca
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips.td
@@ -0,0 +1,118 @@
+//===-- Mips.td - Describe the Mips Target Machine ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the Mips target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MipsRegisterInfo.td"
+include "MipsSchedule.td"
+include "MipsInstrInfo.td"
+include "MipsCallingConv.td"
+
+def MipsInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Mips Subtarget features                                                    //
+//===----------------------------------------------------------------------===//
+
+def FeatureGP64Bit     : SubtargetFeature<"gp64", "IsGP64bit", "true",
+                                "General Purpose Registers are 64-bit wide.">;
+def FeatureFP64Bit     : SubtargetFeature<"fp64", "IsFP64bit", "true",
+                                "Support 64-bit FP registers.">;
+def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
+                                "true", "Only supports single precision float">;
+def FeatureO32         : SubtargetFeature<"o32", "MipsABI", "O32",
+                                "Enable o32 ABI">;
+def FeatureN32         : SubtargetFeature<"n32", "MipsABI", "N32",
+                                "Enable n32 ABI">;
+def FeatureN64         : SubtargetFeature<"n64", "MipsABI", "N64",
+                                "Enable n64 ABI">;
+def FeatureEABI        : SubtargetFeature<"eabi", "MipsABI", "EABI",
+                                "Enable eabi ABI">;
+def FeatureVFPU        : SubtargetFeature<"vfpu", "HasVFPU",
+                                "true", "Enable vector FPU instructions.">;
+def FeatureSEInReg     : SubtargetFeature<"seinreg", "HasSEInReg", "true",
+                                "Enable 'signext in register' instructions.">;
+def FeatureCondMov     : SubtargetFeature<"condmov", "HasCondMov", "true",
+                                "Enable 'conditional move' instructions.">;
+def FeatureSwap        : SubtargetFeature<"swap", "HasSwap", "true",
+                                "Enable 'byte/half swap' instructions.">;
+def FeatureBitCount    : SubtargetFeature<"bitcount", "HasBitCount", "true",
+                                "Enable 'count leading bits' instructions.">;
+def FeatureFPIdx       : SubtargetFeature<"FPIdx", "HasFPIdx", "true",
+                                "Enable 'FP indexed load/store' instructions.">;
+def FeatureMips32      : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32",
+                                "Mips32 ISA Support",
+                                [FeatureCondMov, FeatureBitCount]>;
+def FeatureMips32r2    : SubtargetFeature<"mips32r2", "MipsArchVersion",
+                                "Mips32r2", "Mips32r2 ISA Support",
+                                [FeatureMips32, FeatureSEInReg, FeatureSwap,
+                                 FeatureFPIdx]>;
+def FeatureMips64      : SubtargetFeature<"mips64", "MipsArchVersion",
+                                "Mips64", "Mips64 ISA Support",
+                                [FeatureGP64Bit, FeatureFP64Bit,
+                                 FeatureMips32, FeatureFPIdx]>;
+def FeatureMips64r2    : SubtargetFeature<"mips64r2", "MipsArchVersion",
+                                "Mips64r2", "Mips64r2 ISA Support",
+                                [FeatureMips64, FeatureMips32r2]>;
+
+def FeatureMips16  : SubtargetFeature<"mips16", "InMips16Mode", "true",
+                                      "Mips16 mode">;
+
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
+def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
+                                    "Mips DSP-R2 ASE", [FeatureDSP]>;
+
+def FeatureMicroMips  : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
+                                         "microMips mode">;
+
+//===----------------------------------------------------------------------===//
+// Mips processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, MipsGenericItineraries, Features>;
+
+def : Proc<"mips32", [FeatureMips32]>;
+def : Proc<"mips32r2", [FeatureMips32r2]>;
+def : Proc<"mips64", [FeatureMips64]>;
+def : Proc<"mips64r2", [FeatureMips64r2]>;
+def : Proc<"mips16", [FeatureMips16]>;
+
+def MipsAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+def MipsAsmParser : AsmParser {
+  let ShouldEmitMatchRegisterName = 0;
+}
+
+def MipsAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "$";
+}
+
+def Mips : Target {
+  let InstructionSet = MipsInstrInfo;
+  let AssemblyParsers = [MipsAsmParser];
+  let AssemblyWriters = [MipsAsmWriter];
+  let AssemblyParserVariants = [MipsAsmParserVariant];
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
new file mode 100644
index 000000000000..1bb6fe46295b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -0,0 +1,180 @@
+//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16FrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Mips16InstrInfo &TII =
+    *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  uint64_t StackSize = MFI->getStackSize();
+
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI->adjustsStack()) return;
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  MachineLocation DstML, SrcML;
+
+  // Adjust stack.
+  TII.makeFrame(Mips::SP, StackSize, MBB, MBBI);
+
+  // emit ".cfi_def_cfa_offset StackSize"
+  MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
+  BuildMI(MBB, MBBI, dl,
+          TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
+  DstML = MachineLocation(MachineLocation::VirtualFP);
+  SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
+  Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
+
+  MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
+  BuildMI(MBB, MBBI, dl,
+          TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
+  DstML = MachineLocation(MachineLocation::VirtualFP, -8);
+  SrcML = MachineLocation(Mips::S1);
+  Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+
+  DstML = MachineLocation(MachineLocation::VirtualFP, -12);
+  SrcML = MachineLocation(Mips::S0);
+  Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+
+  DstML = MachineLocation(MachineLocation::VirtualFP, -4);
+  SrcML = MachineLocation(Mips::RA);
+  Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+
+  if (hasFP(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
+      .addReg(Mips::SP);
+
+}
+
+void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
+                                 MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Mips16InstrInfo &TII =
+    *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+  DebugLoc dl = MBBI->getDebugLoc();
+  uint64_t StackSize = MFI->getStackSize();
+
+  if (!StackSize)
+    return;
+
+  if (hasFP(MF))
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::Move32R16), Mips::SP)
+      .addReg(Mips::S0);
+
+  // Adjust stack.
+  // assumes stacksize multiple of 8
+  TII.restoreFrame(Mips::SP, StackSize, MBB, MBBI);
+}
+
+bool Mips16FrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *EntryBlock = MF->begin();
+
+  //
+  // Registers RA, S0,S1 are the callee saved registers and they
+  // will be saved with the "save" instruction
+  // during emitPrologue
+  //
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    // Add the callee-saved register as live-in. Do not add if the register is
+    // RA and return address is taken, because it has already been added in
+    // method MipsTargetLowering::LowerRETURNADDR.
+    // It's killed at the spill, unless the register is RA and return address
+    // is taken.
+    unsigned Reg = CSI[i].getReg();
+    bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA)
+      && MF->getFrameInfo()->isReturnAddressTaken();
+    if (!IsRAAndRetAddrIsTaken)
+      EntryBlock->addLiveIn(Reg);
+  }
+
+  return true;
+}
+
+bool Mips16FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                       const std::vector<CalleeSavedInfo> &CSI,
+                                       const TargetRegisterInfo *TRI) const {
+  //
+  // Registers RA,S0,S1 are the callee saved registers and they will be restored
+  // with the restore instruction during emitEpilogue.
+  // We need to override this virtual function, otherwise llvm will try and
+  // restore the registers on it's on from the stack.
+  //
+
+  return true;
+}
+
+// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
+void Mips16FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    int64_t Amount = I->getOperand(0).getImm();
+
+    if (I->getOpcode() == Mips::ADJCALLSTACKDOWN)
+      Amount = -Amount;
+
+    const Mips16InstrInfo &TII =
+      *static_cast<const Mips16InstrInfo*>(MF.getTarget().getInstrInfo());
+
+    TII.adjustStackPtr(Mips::SP, Amount, MBB, I);
+  }
+
+  MBB.erase(I);
+}
+
+bool
+Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Reserve call frame if the size of the maximum call frame fits into 15-bit
+  // immediate field and there are no variable sized objects on the stack.
+  return isInt<15>(MFI->getMaxCallFrameSize()) && !MFI->hasVarSizedObjects();
+}
+
+void Mips16FrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  MF.getRegInfo().setPhysRegUsed(Mips::RA);
+  MF.getRegInfo().setPhysRegUsed(Mips::S0);
+  MF.getRegInfo().setPhysRegUsed(Mips::S1);
+}
+
+const MipsFrameLowering *
+llvm::createMips16FrameLowering(const MipsSubtarget &ST) {
+  return new Mips16FrameLowering(ST);
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h
new file mode 100644
index 000000000000..54fdb7871466
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.h
@@ -0,0 +1,52 @@
+//===-- Mips16FrameLowering.h - Mips16 frame lowering  ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16_FRAMEINFO_H
+#define MIPS16_FRAMEINFO_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+class Mips16FrameLowering : public MipsFrameLowering {
+public:
+  explicit Mips16FrameLowering(const MipsSubtarget &STI)
+    : MipsFrameLowering(STI, 8) {}
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..00b3449300c5
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -0,0 +1,308 @@
+//===-- Mips16ISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips16 ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-isel"
+#include "Mips16ISelDAGToDAG.h"
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+/// Select multiply instructions.
+std::pair<SDNode*, SDNode*>
+Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, DebugLoc DL, EVT Ty,
+                               bool HasLo, bool HasHi) {
+  SDNode *Lo = 0, *Hi = 0;
+  SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
+                                       N->getOperand(1));
+  SDValue InFlag = SDValue(Mul, 0);
+
+  if (HasLo) {
+    unsigned Opcode = Mips::Mflo16;
+    Lo = CurDAG->getMachineNode(Opcode, DL, Ty, MVT::Glue, InFlag);
+    InFlag = SDValue(Lo, 1);
+  }
+  if (HasHi) {
+    unsigned Opcode = Mips::Mfhi16;
+    Hi = CurDAG->getMachineNode(Opcode, DL, Ty, InFlag);
+  }
+  return std::make_pair(Lo, Hi);
+}
+
+void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->globalBaseRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+  const TargetRegisterClass *RC =
+    (const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+
+  V0 = RegInfo.createVirtualRegister(RC);
+  V1 = RegInfo.createVirtualRegister(RC);
+  V2 = RegInfo.createVirtualRegister(RC);
+
+  BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0)
+    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+  BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), V1)
+    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+  BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
+  BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
+    .addReg(V1).addReg(V2);
+}
+
+// Insert instructions to initialize the Mips16 SP Alias register in the
+// first MBB of the function.
+//
+void Mips16DAGToDAGISel::initMips16SPAliasReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->mips16SPAliasRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned Mips16SPAliasReg = MipsFI->getMips16SPAliasReg();
+
+  BuildMI(MBB, I, DL, TII.get(Mips::MoveR3216), Mips16SPAliasReg)
+    .addReg(Mips::SP);
+}
+
+void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  initGlobalBaseReg(MF);
+  initMips16SPAliasReg(MF);
+}
+
+/// getMips16SPAliasReg - Output the instructions required to put the
+/// SP into a Mips16 accessible aliased register.
+SDValue Mips16DAGToDAGISel::getMips16SPAliasReg() {
+  unsigned Mips16SPAliasReg =
+    MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg();
+  return CurDAG->getRegister(Mips16SPAliasReg, TLI.getPointerTy());
+}
+
+void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
+  SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, TLI.getPointerTy());
+  if (Parent) {
+    switch (Parent->getOpcode()) {
+      case ISD::LOAD: {
+        LoadSDNode *SD = dyn_cast<LoadSDNode>(Parent);
+        switch (SD->getMemoryVT().getSizeInBits()) {
+        case 8:
+        case 16:
+          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
+            AliasFPReg: getMips16SPAliasReg();
+          return;
+        }
+        break;
+      }
+      case ISD::STORE: {
+        StoreSDNode *SD = dyn_cast<StoreSDNode>(Parent);
+        switch (SD->getMemoryVT().getSizeInBits()) {
+        case 8:
+        case 16:
+          AliasReg = TM.getFrameLowering()->hasFP(*MF)?
+            AliasFPReg: getMips16SPAliasReg();
+          return;
+        }
+        break;
+      }
+    }
+  }
+  AliasReg = CurDAG->getRegister(Mips::SP, TLI.getPointerTy());
+  return;
+
+}
+
+bool Mips16DAGToDAGISel::selectAddr16(
+  SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset,
+  SDValue &Alias) {
+  EVT ValTy = Addr.getValueType();
+
+  Alias = CurDAG->getTargetConstant(0, ValTy);
+
+  // if Address is FI, get the TargetFrameIndex.
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+    Offset = CurDAG->getTargetConstant(0, ValTy);
+    getMips16SPRefReg(Parent, Alias);
+    return true;
+  }
+  // on PIC code Load GA
+  if (Addr.getOpcode() == MipsISD::Wrapper) {
+    Base   = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+  if (TM.getRelocationModel() != Reloc::PIC_) {
+    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+        Addr.getOpcode() == ISD::TargetGlobalAddress))
+      return false;
+  }
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<16>(CN->getSExtValue())) {
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0))) {
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+        getMips16SPRefReg(Parent, Alias);
+      }
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+      return true;
+    }
+  }
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // When loading from constant pools, load the lower address part in
+    // the instruction itself. Example, instead of:
+    //  lui $2, %hi($CPI1_0)
+    //  addiu $2, $2, %lo($CPI1_0)
+    //  lwc1 $f0, 0($2)
+    // Generate:
+    //  lui $2, %hi($CPI1_0)
+    //  lwc1 $f0, %lo($CPI1_0)($2)
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+          isa<JumpTableSDNode>(Opnd0)) {
+        Base = Addr.getOperand(0);
+        Offset = Opnd0;
+        return true;
+      }
+    }
+
+    // If an indexed floating point load/store can be emitted, return false.
+    const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent);
+
+    if (LS &&
+        (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) &&
+        Subtarget.hasFPIdx())
+      return false;
+  }
+  Base   = Addr;
+  Offset = CurDAG->getTargetConstant(0, ValTy);
+  return true;
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions
+std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+  DebugLoc DL = Node->getDebugLoc();
+
+  ///
+  // Instruction Selection not handled by the auto-generated
+  // tablegen selection should be handled here.
+  ///
+  EVT NodeTy = Node->getValueType(0);
+  unsigned MultOpc;
+
+  switch(Opcode) {
+  default: break;
+
+  case ISD::SUBE:
+  case ISD::ADDE: {
+    SDValue InFlag = Node->getOperand(2), CmpLHS;
+    unsigned Opc = InFlag.getOpcode(); (void)Opc;
+    assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+            (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+           "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+    unsigned MOp;
+    if (Opcode == ISD::ADDE) {
+      CmpLHS = InFlag.getValue(0);
+      MOp = Mips::AdduRxRyRz16;
+    } else {
+      CmpLHS = InFlag.getOperand(0);
+      MOp = Mips::SubuRxRyRz16;
+    }
+
+    SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+
+    SDValue LHS = Node->getOperand(0);
+    SDValue RHS = Node->getOperand(1);
+
+    EVT VT = LHS.getValueType();
+
+    unsigned Sltu_op = Mips::SltuRxRyRz16;
+    SDNode *Carry = CurDAG->getMachineNode(Sltu_op, DL, VT, Ops, 2);
+    unsigned Addu_op = Mips::AdduRxRyRz16;
+    SDNode *AddCarry = CurDAG->getMachineNode(Addu_op, DL, VT,
+                                              SDValue(Carry,0), RHS);
+
+    SDNode *Result = CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
+                                          SDValue(AddCarry,0));
+    return std::make_pair(true, Result);
+  }
+
+  /// Mul with two results
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+    std::pair<SDNode*, SDNode*> LoHi = selectMULT(Node, MultOpc, DL, NodeTy,
+                                                  true, true);
+    if (!SDValue(Node, 0).use_empty())
+      ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
+
+    if (!SDValue(Node, 1).use_empty())
+      ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
+
+    return std::make_pair(true, (SDNode*)NULL);
+  }
+
+  case ISD::MULHS:
+  case ISD::MULHU: {
+    MultOpc = (Opcode == ISD::MULHU ? Mips::MultuRxRy16 : Mips::MultRxRy16);
+    SDNode *Result = selectMULT(Node, MultOpc, DL, NodeTy, false, true).second;
+    return std::make_pair(true, Result);
+  }
+  }
+
+  return std::make_pair(false, (SDNode*)NULL);
+}
+
+FunctionPass *llvm::createMips16ISelDag(MipsTargetMachine &TM) {
+  return new Mips16DAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h
new file mode 100644
index 000000000000..baa85877d957
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -0,0 +1,51 @@
+//===---- Mips16ISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16ISELDAGTODAG_H
+#define MIPS16ISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+class Mips16DAGToDAGISel : public MipsDAGToDAGISel {
+public:
+  explicit Mips16DAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
+
+private:
+  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, DebugLoc DL,
+                                         EVT Ty, bool HasLo, bool HasHi);
+
+  SDValue getMips16SPAliasReg();
+
+  void getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg);
+
+  virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                            SDValue &Offset, SDValue &Alias);
+
+  virtual std::pair<bool, SDNode*> selectNode(SDNode *Node);
+
+  virtual void processFunctionAfterISel(MachineFunction &MF);
+
+  // Insert instructions to initialize the global base register in the
+  // first MBB of the function.
+  void initGlobalBaseReg(MachineFunction &MF);
+
+  void initMips16SPAliasReg(MachineFunction &MF);
+};
+
+FunctionPass *createMips16ISelDag(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
new file mode 100644
index 000000000000..23eb5375ac94
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -0,0 +1,689 @@
+//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "mips-lower"
+#include "Mips16ISelLowering.h"
+#include "MipsRegisterInfo.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <set>
+
+using namespace llvm;
+
+static cl::opt<bool>
+Mips16HardFloat("mips16-hard-float", cl::NotHidden,
+                cl::desc("MIPS: mips16 hard float enable."),
+                cl::init(false));
+
+static cl::opt<bool> DontExpandCondPseudos16(
+  "mips16-dont-expand-cond-pseudo",
+  cl::init(false),
+  cl::desc("Dont expand conditional move related "
+           "pseudos for Mips 16"),
+  cl::Hidden);
+
+namespace {
+  std::set<const char*, MipsTargetLowering::LTStr> NoHelperNeeded;
+}
+
+Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
+  : MipsTargetLowering(TM) {
+  //
+  // set up as if mips32 and then revert so we can test the mechanism
+  // for switching
+  addRegisterClass(MVT::i32, &Mips::CPURegsRegClass);
+  addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
+  computeRegisterProperties();
+  clearRegisterClasses();
+
+  // Set up the register classes
+  addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
+
+  if (Mips16HardFloat)
+    setMips16HardFloatLibCalls();
+
+  setOperationAction(ISD::MEMBARRIER,         MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_SWAP,        MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_ADD,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_AND,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_OR,     MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_XOR,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_NAND,   MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_MIN,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_MAX,    MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_UMIN,   MVT::i32,   Expand);
+  setOperationAction(ISD::ATOMIC_LOAD_UMAX,   MVT::i32,   Expand);
+
+  computeRegisterProperties();
+}
+
+const MipsTargetLowering *
+llvm::createMips16TargetLowering(MipsTargetMachine &TM) {
+  return new Mips16TargetLowering(TM);
+}
+
+bool
+Mips16TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+  return false;
+}
+
+MachineBasicBlock *
+Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default:
+    return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case Mips::SelBeqZ:
+    return emitSel16(Mips::BeqzRxImm16, MI, BB);
+  case Mips::SelBneZ:
+    return emitSel16(Mips::BnezRxImm16, MI, BB);
+  case Mips::SelTBteqZCmpi:
+    return emitSeliT16(Mips::BteqzX16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::SelTBteqZSlti:
+    return emitSeliT16(Mips::BteqzX16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SelTBteqZSltiu:
+    return emitSeliT16(Mips::BteqzX16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SelTBtneZCmpi:
+    return emitSeliT16(Mips::BtnezX16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::SelTBtneZSlti:
+    return emitSeliT16(Mips::BtnezX16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SelTBtneZSltiu:
+    return emitSeliT16(Mips::BtnezX16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SelTBteqZCmp:
+    return emitSelT16(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+  case Mips::SelTBteqZSlt:
+    return emitSelT16(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+  case Mips::SelTBteqZSltu:
+    return emitSelT16(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+  case Mips::SelTBtneZCmp:
+    return emitSelT16(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+  case Mips::SelTBtneZSlt:
+    return emitSelT16(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+  case Mips::SelTBtneZSltu:
+    return emitSelT16(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BteqzT8CmpX16:
+    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+  case Mips::BteqzT8SltX16:
+    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+  case Mips::BteqzT8SltuX16:
+    // TBD: figure out a way to get this or remove the instruction
+    // altogether.
+    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BtnezT8CmpX16:
+    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+  case Mips::BtnezT8SltX16:
+    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+  case Mips::BtnezT8SltuX16:
+    // TBD: figure out a way to get this or remove the instruction
+    // altogether.
+    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+  case Mips::BteqzT8CmpiX16: return emitFEXT_T8I8I16_ins(
+    Mips::BteqzX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::BteqzT8SltiX16: return emitFEXT_T8I8I16_ins(
+    Mips::BteqzX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::BteqzT8SltiuX16: return emitFEXT_T8I8I16_ins(
+    Mips::BteqzX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::BtnezT8CmpiX16: return emitFEXT_T8I8I16_ins(
+    Mips::BtnezX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+  case Mips::BtnezT8SltiX16: return emitFEXT_T8I8I16_ins(
+    Mips::BtnezX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::BtnezT8SltiuX16: return emitFEXT_T8I8I16_ins(
+    Mips::BtnezX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+    break;
+  case Mips::SltCCRxRy16:
+    return emitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
+    break;
+  case Mips::SltiCCRxImmX16:
+    return emitFEXT_CCRXI16_ins
+      (Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+  case Mips::SltiuCCRxImmX16:
+    return emitFEXT_CCRXI16_ins
+      (Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+  case Mips::SltuCCRxRy16:
+    return emitFEXT_CCRX16_ins
+      (Mips::SltuRxRy16, MI, BB);
+  }
+}
+
+bool Mips16TargetLowering::
+isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                  unsigned NextStackOffset,
+                                  const MipsFunctionInfo& FI) const {
+  // No tail call optimization for mips16.
+  return false;
+}
+
+void Mips16TargetLowering::setMips16LibcallName
+  (RTLIB::Libcall L, const char *Name) {
+  setLibcallName(L, Name);
+  NoHelperNeeded.insert(Name);
+}
+
+void Mips16TargetLowering::setMips16HardFloatLibCalls() {
+  setMips16LibcallName(RTLIB::ADD_F32, "__mips16_addsf3");
+  setMips16LibcallName(RTLIB::ADD_F64, "__mips16_adddf3");
+  setMips16LibcallName(RTLIB::SUB_F32, "__mips16_subsf3");
+  setMips16LibcallName(RTLIB::SUB_F64, "__mips16_subdf3");
+  setMips16LibcallName(RTLIB::MUL_F32, "__mips16_mulsf3");
+  setMips16LibcallName(RTLIB::MUL_F64, "__mips16_muldf3");
+  setMips16LibcallName(RTLIB::DIV_F32, "__mips16_divsf3");
+  setMips16LibcallName(RTLIB::DIV_F64, "__mips16_divdf3");
+  setMips16LibcallName(RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2");
+  setMips16LibcallName(RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2");
+  setMips16LibcallName(RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi");
+  setMips16LibcallName(RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi");
+  setMips16LibcallName(RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf");
+  setMips16LibcallName(RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf");
+  setMips16LibcallName(RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf");
+  setMips16LibcallName(RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf");
+  setMips16LibcallName(RTLIB::OEQ_F32, "__mips16_eqsf2");
+  setMips16LibcallName(RTLIB::OEQ_F64, "__mips16_eqdf2");
+  setMips16LibcallName(RTLIB::UNE_F32, "__mips16_nesf2");
+  setMips16LibcallName(RTLIB::UNE_F64, "__mips16_nedf2");
+  setMips16LibcallName(RTLIB::OGE_F32, "__mips16_gesf2");
+  setMips16LibcallName(RTLIB::OGE_F64, "__mips16_gedf2");
+  setMips16LibcallName(RTLIB::OLT_F32, "__mips16_ltsf2");
+  setMips16LibcallName(RTLIB::OLT_F64, "__mips16_ltdf2");
+  setMips16LibcallName(RTLIB::OLE_F32, "__mips16_lesf2");
+  setMips16LibcallName(RTLIB::OLE_F64, "__mips16_ledf2");
+  setMips16LibcallName(RTLIB::OGT_F32, "__mips16_gtsf2");
+  setMips16LibcallName(RTLIB::OGT_F64, "__mips16_gtdf2");
+  setMips16LibcallName(RTLIB::UO_F32, "__mips16_unordsf2");
+  setMips16LibcallName(RTLIB::UO_F64, "__mips16_unorddf2");
+  setMips16LibcallName(RTLIB::O_F32, "__mips16_unordsf2");
+  setMips16LibcallName(RTLIB::O_F64, "__mips16_unorddf2");
+}
+
+
+//
+// The Mips16 hard float is a crazy quilt inherited from gcc. I have a much
+// cleaner way to do all of this but it will have to wait until the traditional
+// gcc mechanism is completed.
+//
+// For Pic, in order for Mips16 code to call Mips32 code which according the abi
+// have either arguments or returned values placed in floating point registers,
+// we use a set of helper functions. (This includes functions which return type
+//  complex which on Mips are returned in a pair of floating point registers).
+//
+// This is an encoding that we inherited from gcc.
+// In Mips traditional O32, N32 ABI, floating point numbers are passed in
+// floating point argument registers 1,2 only when the first and optionally
+// the second arguments are float (sf) or double (df).
+// For Mips16 we are only concerned with the situations where floating point
+// arguments are being passed in floating point registers by the ABI, because
+// Mips16 mode code cannot execute floating point instructions to load those
+// values and hence helper functions are needed.
+// The possibilities are (), (sf), (sf, sf), (sf, df), (df), (df, sf), (df, df)
+// the helper function suffixs for these are:
+//                        0,  1,    5,        9,         2,   6,        10
+// this suffix can then be calculated as follows:
+// for a given argument Arg:
+//     Arg1x, Arg2x = 1 :  Arg is sf
+//                    2 :  Arg is df
+//                    0:   Arg is neither sf or df
+// So this stub is the string for number Arg1x + Arg2x*4.
+// However not all numbers between 0 and 10 are possible, we check anyway and
+// assert if the impossible exists.
+//
+
+unsigned int Mips16TargetLowering::getMips16HelperFunctionStubNumber
+  (ArgListTy &Args) const {
+  unsigned int resultNum = 0;
+  if (Args.size() >= 1) {
+    Type *t = Args[0].Ty;
+    if (t->isFloatTy()) {
+      resultNum = 1;
+    }
+    else if (t->isDoubleTy()) {
+      resultNum = 2;
+    }
+  }
+  if (resultNum) {
+    if (Args.size() >=2) {
+      Type *t = Args[1].Ty;
+      if (t->isFloatTy()) {
+        resultNum += 4;
+      }
+      else if (t->isDoubleTy()) {
+        resultNum += 8;
+      }
+    }
+  }
+  return resultNum;
+}
+
+//
+// prefixs are attached to stub numbers depending on the return type .
+// return type: float  sf_
+//              double df_
+//              single complex sc_
+//              double complext dc_
+//              others  NO PREFIX
+//
+//
+// The full name of a helper function is__mips16_call_stub +
+//    return type dependent prefix + stub number
+//
+//
+// This is something that probably should be in a different source file and
+// perhaps done differently but my main purpose is to not waste runtime
+// on something that we can enumerate in the source. Another possibility is
+// to have a python script to generate these mapping tables. This will do
+// for now. There are a whole series of helper function mapping arrays, one
+// for each return type class as outlined above. There there are 11 possible
+//  entries. Ones with 0 are ones which should never be selected
+//
+// All the arrays are similar except for ones which return neither
+// sf, df, sc, dc, in which only care about ones which have sf or df as a
+// first parameter.
+//
+#define P_ "__mips16_call_stub_"
+#define MAX_STUB_NUMBER 10
+#define T1 P "1", P "2", 0, 0, P "5", P "6", 0, 0, P "9", P "10"
+#define T P "0" , T1
+#define P P_
+static char const * vMips16Helper[MAX_STUB_NUMBER+1] =
+  {0, T1 };
+#undef P
+#define P P_ "sf_"
+static char const * sfMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "df_"
+static char const * dfMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "sc_"
+static char const * scMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#define P P_ "dc_"
+static char const * dcMips16Helper[MAX_STUB_NUMBER+1] =
+  { T };
+#undef P
+#undef P_
+
+
+const char* Mips16TargetLowering::
+  getMips16HelperFunction
+    (Type* RetTy, ArgListTy &Args, bool &needHelper) const {
+  const unsigned int stubNum = getMips16HelperFunctionStubNumber(Args);
+#ifndef NDEBUG
+  const unsigned int maxStubNum = 10;
+  assert(stubNum <= maxStubNum);
+  const bool validStubNum[maxStubNum+1] =
+    {true, true, true, false, false, true, true, false, false, true, true};
+  assert(validStubNum[stubNum]);
+#endif
+  const char *result;
+  if (RetTy->isFloatTy()) {
+    result = sfMips16Helper[stubNum];
+  }
+  else if (RetTy ->isDoubleTy()) {
+    result = dfMips16Helper[stubNum];
+  }
+  else if (RetTy->isStructTy()) {
+    // check if it's complex
+    if (RetTy->getNumContainedTypes() == 2) {
+      if ((RetTy->getContainedType(0)->isFloatTy()) &&
+          (RetTy->getContainedType(1)->isFloatTy())) {
+        result = scMips16Helper[stubNum];
+      }
+      else if ((RetTy->getContainedType(0)->isDoubleTy()) &&
+               (RetTy->getContainedType(1)->isDoubleTy())) {
+        result = dcMips16Helper[stubNum];
+      }
+      else {
+        llvm_unreachable("Uncovered condition");
+      }
+    }
+    else {
+      llvm_unreachable("Uncovered condition");
+    }
+  }
+  else {
+    if (stubNum == 0) {
+      needHelper = false;
+      return "";
+    }
+    result = vMips16Helper[stubNum];
+  }
+  needHelper = true;
+  return result;
+}
+
+void Mips16TargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  SelectionDAG &DAG = CLI.DAG;
+  const char* Mips16HelperFunction = 0;
+  bool NeedMips16Helper = false;
+
+  if (getTargetMachine().Options.UseSoftFloat && Mips16HardFloat) {
+    //
+    // currently we don't have symbols tagged with the mips16 or mips32
+    // qualifier so we will assume that we don't know what kind it is.
+    // and generate the helper
+    //
+    bool LookupHelper = true;
+    if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
+      if (NoHelperNeeded.find(S->getSymbol()) != NoHelperNeeded.end()) {
+        LookupHelper = false;
+      }
+    }
+    if (LookupHelper) Mips16HelperFunction =
+      getMips16HelperFunction(CLI.RetTy, CLI.Args, NeedMips16Helper);
+
+  }
+
+  SDValue JumpTarget = Callee;
+
+  // T9 should contain the address of the callee function if
+  // -reloction-model=pic or it is an indirect call.
+  if (IsPICCall || !GlobalOrExternal) {
+    unsigned V0Reg = Mips::V0;
+    if (NeedMips16Helper) {
+      RegsToPass.push_front(std::make_pair(V0Reg, Callee));
+      JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy());
+      JumpTarget = getAddrGlobal(JumpTarget, DAG, MipsII::MO_GOT);
+    } else
+      RegsToPass.push_front(std::make_pair((unsigned)Mips::T9, Callee));
+  }
+
+  Ops.push_back(JumpTarget);
+
+  MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
+                                  InternalLinkage, CLI, Callee, Chain);
+}
+
+MachineBasicBlock *Mips16TargetLowering::
+emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc)).addReg(MI->getOperand(3).getReg())
+    .addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitSelT16
+  (unsigned Opc1, unsigned Opc2,
+   MachineInstr *MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
+    .addReg(MI->getOperand(4).getReg());
+  BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitSeliT16
+  (unsigned Opc1, unsigned Opc2,
+   MachineInstr *MI, MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   setcc r1, r2, r3
+  //   bNE   r1, r0, copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB  = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, DL, TII->get(Opc2)).addReg(MI->getOperand(3).getReg())
+    .addImm(MI->getOperand(4).getImm());
+  BuildMI(BB, DL, TII->get(Opc1)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copy0MBB ]
+  //  ...
+  BB = sinkMBB;
+
+  BuildMI(*BB, BB->begin(), DL,
+          TII->get(Mips::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+
+}
+
+MachineBasicBlock
+  *Mips16TargetLowering::emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+                                             MachineInstr *MI,
+                                             MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned regX = MI->getOperand(0).getReg();
+  unsigned regY = MI->getOperand(1).getReg();
+  MachineBasicBlock *target = MI->getOperand(2).getMBB();
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addReg(regY);
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
+  unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
+  MachineInstr *MI,  MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned regX = MI->getOperand(0).getReg();
+  int64_t imm = MI->getOperand(1).getImm();
+  MachineBasicBlock *target = MI->getOperand(2).getMBB();
+  unsigned CmpOpc;
+  if (isUInt<8>(imm))
+    CmpOpc = CmpiOpc;
+  else if (isUInt<16>(imm))
+    CmpOpc = CmpiXOpc;
+  else
+    llvm_unreachable("immediate field not usable");
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(CmpOpc)).addReg(regX).addImm(imm);
+  BuildMI(*BB, MI, MI->getDebugLoc(), TII->get(BtOpc)).addMBB(target);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+static unsigned Mips16WhichOp8uOr16simm
+  (unsigned shortOp, unsigned longOp, int64_t Imm) {
+  if (isUInt<8>(Imm))
+    return shortOp;
+  else if (isInt<16>(Imm))
+    return longOp;
+  else
+    llvm_unreachable("immediate field not usable");
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRX16_ins(
+  unsigned SltOpc,
+  MachineInstr *MI,  MachineBasicBlock *BB) const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned CC = MI->getOperand(0).getReg();
+  unsigned regX = MI->getOperand(1).getReg();
+  unsigned regY = MI->getOperand(2).getReg();
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+		  TII->get(SltOpc)).addReg(regX).addReg(regY);
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+          TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *Mips16TargetLowering::emitFEXT_CCRXI16_ins(
+  unsigned SltiOpc, unsigned SltiXOpc,
+  MachineInstr *MI,  MachineBasicBlock *BB )const {
+  if (DontExpandCondPseudos16)
+    return BB;
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  unsigned CC = MI->getOperand(0).getReg();
+  unsigned regX = MI->getOperand(1).getReg();
+  int64_t Imm = MI->getOperand(2).getImm();
+  unsigned SltOpc = Mips16WhichOp8uOr16simm(SltiOpc, SltiXOpc, Imm);
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+          TII->get(SltOpc)).addReg(regX).addImm(Imm);
+  BuildMI(*BB, MI, MI->getDebugLoc(),
+          TII->get(Mips::MoveR3216), CC).addReg(Mips::T8);
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h
new file mode 100644
index 000000000000..b23e2a1f37db
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.h
@@ -0,0 +1,80 @@
+//===-- Mips16ISelLowering.h - Mips16 DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips16.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef Mips16ISELLOWERING_H
+#define Mips16ISELLOWERING_H
+
+#include "MipsISelLowering.h"
+
+namespace llvm {
+  class Mips16TargetLowering : public MipsTargetLowering  {
+  public:
+    explicit Mips16TargetLowering(MipsTargetMachine &TM);
+
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+
+    virtual MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+  private:
+    virtual bool
+    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                      unsigned NextStackOffset,
+                                      const MipsFunctionInfo& FI) const;
+
+    void setMips16LibcallName(RTLIB::Libcall, const char *Name);
+
+    void setMips16HardFloatLibCalls();
+
+    unsigned int
+      getMips16HelperFunctionStubNumber(ArgListTy &Args) const;
+
+    const char *getMips16HelperFunction
+      (Type* RetTy, ArgListTy &Args, bool &needHelper) const;
+
+    virtual void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+
+    MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr *MI,
+                                 MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitSeliT16(unsigned Opc1, unsigned Opc2,
+                                   MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitSelT16(unsigned Opc1, unsigned Opc2,
+                                  MachineInstr *MI,
+                                  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_T8I816_ins(unsigned BtOpc, unsigned CmpOpc,
+                                           MachineInstr *MI,
+                                           MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_T8I8I16_ins(
+      unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
+      MachineInstr *MI,  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_CCRX16_ins(
+      unsigned SltOpc,
+      MachineInstr *MI,  MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitFEXT_CCRXI16_ins(
+      unsigned SltiOpc, unsigned SltiXOpc,
+      MachineInstr *MI,  MachineBasicBlock *BB )const;
+  };
+}
+
+#endif // Mips16ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td b/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td
new file mode 100644
index 000000000000..4ff62ef3b6f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrFormats.td
@@ -0,0 +1,626 @@
+//===- Mips16InstrFormats.td - Mips Instruction Formats ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MIPS instructions format
+//
+//  CPU INSTRUCTION FORMATS
+//
+//  funct or f      Function field
+//
+//  immediate       4-,5-,8- or 11-bit immediate, branch displacement, or
+//  or imm          address displacement
+//
+//  op              5-bit major operation code
+//
+//  rx              3-bit source or destination register
+//
+//  ry              3-bit source or destination register
+//
+//  rz              3-bit source or destination register
+//
+//  sa              3- or 5-bit shift amount
+//
+//===----------------------------------------------------------------------===//
+
+
+// Base class for Mips 16 Format
+// This class does not depend on the instruction size
+//
+class MipsInst16_Base<dag outs, dag ins, string asmstr, list<dag> pattern,
+                      InstrItinClass itin>: Instruction
+{
+
+  let Namespace = "Mips";
+
+  let OutOperandList = outs;
+  let InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+
+  let Predicates = [InMips16Mode];
+}
+
+//
+// Generic Mips 16 Format
+//
+class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
+                 InstrItinClass itin>:
+  MipsInst16_Base<outs, ins, asmstr, pattern, itin>
+{
+  field bits<16> Inst;
+  bits<5> Opcode = 0;
+
+  // Top 5 bits are the 'opcode' field
+  let Inst{15-11} = Opcode;
+  
+  let Size=2;
+  field bits<16> SoftFail = 0;
+}
+
+//
+// For 32 bit extended instruction forms.
+//
+class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern,
+                    InstrItinClass itin>:
+  MipsInst16_Base<outs, ins, asmstr, pattern, itin>
+{
+  field bits<32> Inst;
+  
+  let Size=4;
+  field bits<32> SoftFail = 0;
+}
+
+class MipsInst16_EXTEND<dag outs, dag ins, string asmstr, list<dag> pattern,
+                        InstrItinClass itin>:
+  MipsInst16_32<outs, ins, asmstr, pattern, itin>
+{
+  let Inst{31-27} = 0b11110;
+}
+
+
+
+// Mips Pseudo Instructions Format
+class MipsPseudo16<dag outs, dag ins, string asmstr, list<dag> pattern>:
+  MipsInst16<outs, ins, asmstr, pattern, IIPseudo> {
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|imm11|>
+//===----------------------------------------------------------------------===//
+
+class FI16<bits<5> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+           InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<11> imm11;
+
+  let Opcode = op;
+
+  let Inst{10-0}  = imm11;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RI instruction class in Mips : <|opcode|rx|imm8|>
+//===----------------------------------------------------------------------===//
+
+class FRI16<bits<5> op, dag outs, dag ins, string asmstr,
+            list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<8>   imm8;
+
+  let Opcode = op;
+
+  let Inst{10-8} = rx;
+  let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RR instruction class in Mips : <|opcode|rx|ry|funct|>
+//===----------------------------------------------------------------------===//
+
+class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr,
+            list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<5>  funct;
+
+  let Opcode = 0b11101;
+  let funct  = _funct;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-0}   = funct;
+}
+
+//
+// For conversion functions.
+//
+class FRR_SF16<bits<5> _funct, bits<3> _subfunct, dag outs, dag ins,
+               string asmstr, list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  subfunct;
+  bits<5>  funct;
+
+  let Opcode = 0b11101; // RR
+  let funct  = _funct;
+  let subfunct = _subfunct;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = subfunct;
+  let Inst{4-0}   = funct;
+}
+
+//
+// just used for breakpoint (hardware and software) instructions.
+//
+class FC16<bits<5> _funct, dag outs, dag ins, string asmstr,
+           list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<6>  _code;  // code is a keyword in tablegen
+  bits<5>  funct;
+
+  let Opcode = 0b11101; // RR
+  let funct  = _funct;
+
+  let Inst{10-5} = _code;
+  let Inst{4-0}   = funct;
+}
+
+//
+// J(AL)R(C) subformat
+//
+class FRR16_JALRC<bits<1> _nd, bits<1> _l, bits<1> r_a,
+                  dag outs, dag ins, string asmstr,
+                  list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<1>  nd;
+  bits<1>  l;
+  bits<1>  ra;
+
+  let nd = _nd;
+  let l = _l;
+  let ra = r_a;
+
+  let Opcode = 0b11101;
+
+  let Inst{10-8} = rx;
+  let Inst{7} = nd;
+  let Inst{6} = l;
+  let Inst{5} = ra;
+  let Inst{4-0} = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI instruction class in Mips : <|opcode|rx|ry|imm5|>
+//===----------------------------------------------------------------------===//
+
+class FRRI16<bits<5> op, dag outs, dag ins, string asmstr,
+             list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<5>  imm5;
+
+  let Opcode = op;
+
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-0}   = imm5;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRR instruction class in Mips : <|opcode|rx|ry|rz|f|>
+//===----------------------------------------------------------------------===//
+
+class FRRR16<bits<2> _f, dag outs, dag ins, string asmstr,
+             list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<3>  rz;
+  bits<2>  f;
+
+  let Opcode = 0b11100;
+  let f  = _f;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-2} = rz;
+  let Inst{1-0}   = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format RRI-A instruction class in Mips : <|opcode|rx|ry|f|imm4|>
+//===----------------------------------------------------------------------===//
+
+class FRRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+               list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<1>  f;
+  bits<4>  imm4;
+
+  let Opcode = 0b01000;
+  let  f = _f;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4} = f;
+  let Inst{3-0}   = imm4;
+}
+
+//===----------------------------------------------------------------------===//
+// Format Shift instruction class in Mips : <|opcode|rx|ry|sa|f|>
+//===----------------------------------------------------------------------===//
+
+class FSHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+               list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  rx;
+  bits<3>  ry;
+  bits<3>  sa;
+  bits<2>  f;
+
+  let Opcode = 0b00110;
+  let f  = _f;
+
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-2} = sa;
+  let Inst{1-0}   = f;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8 instruction class in Mips : <|opcode|funct|imm8>
+//===----------------------------------------------------------------------===//
+
+class FI816<bits<3> _func, dag outs, dag ins, string asmstr,
+            list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<3>  func;
+  bits<8>   imm8;
+
+  let Opcode = 0b01100;
+  let func  = _func;
+
+  let Inst{10-8} = func;
+  let Inst{7-0} = imm8;
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOVR32 instruction class in Mips : <|opcode|func|ry|r32>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOVR3216<dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+
+  bits<4> ry;
+  bits<4> r32;
+
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b111;
+  let Inst{7-4} = ry;
+  let Inst{3-0} = r32;
+
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Format i8_MOV32R instruction class in Mips : <|opcode|func|r32|rz>
+//===----------------------------------------------------------------------===//
+
+class FI8_MOV32R16<dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+
+  bits<3>  func;
+  bits<5> r32;
+  bits<3> rz;
+
+
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b101;
+  let Inst{7-5} = r32{2-0};
+  let Inst{4-3} = r32{4-3};
+  let Inst{2-0} = rz;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format i8_SVRS instruction class in Mips :
+//    <|opcode|svrs|s|ra|s0|s1|framesize>
+//===----------------------------------------------------------------------===//
+
+class FI8_SVRS16<bits<1> _s, dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<1> s;
+  bits<1> ra = 0;
+  bits<1> s0 = 0;
+  bits<1> s1 = 0;
+  bits<4> framesize = 0;
+
+  let s =_s;
+  let Opcode = 0b01100;
+
+  let Inst{10-8} = 0b100;
+  let Inst{7} = s;
+  let Inst{6} = ra;
+  let Inst{5} = s0;
+  let Inst{4} = s1;
+  let Inst{3-0} = framesize;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format JAL instruction class in Mips16 :
+//    <|opcode|svrs|s|ra|s0|s1|framesize>
+//===----------------------------------------------------------------------===//
+
+class FJAL16<bits<1> _X, dag outs, dag ins, string asmstr,
+             list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_32<outs, ins, asmstr, pattern, itin>
+{
+  bits<1> X;
+  bits<26> imm26;
+
+
+  let X = _X;
+
+  let Inst{31-27} = 0b00011;
+  let Inst{26} = X;
+  let Inst{25-21} = imm26{20-16};
+  let Inst{20-16} = imm26{25-21};
+  let Inst{15-0}  = imm26{15-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I instruction class in Mips16 :
+//     <|EXTEND|imm10:5|imm15:11|op|0|0|0|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I16<bits<5> _eop, dag outs, dag ins, string asmstr,
+               list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> eop;
+
+  let eop = _eop;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = eop;
+  let Inst{10-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format ASMACRO instruction class in Mips16 :
+//    <EXTEND|select|p4|p3|RRR|p2|p1|p0>
+//===----------------------------------------------------------------------===//
+
+class FASMACRO16<dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<3> select;
+  bits<3> p4;
+  bits<5> p3;
+  bits<5> RRR = 0b11100;
+  bits<3> p2;
+  bits<3> p1;
+  bits<5> p0;
+
+
+  let Inst{26-24} = select;
+  let Inst{23-21} = p4;
+  let Inst{20-16} = p3;
+  let Inst{15-11} = RRR;
+  let Inst{10-8} = p2;
+  let Inst{7-5} = p1;
+  let Inst{4-0} = p0;
+
+}
+
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RI instruction class in Mips16 :
+//    <|EXTEND|imm10:5|imm15:11|op|rx|0|0|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RI16<bits<5> _op, dag outs, dag ins, string asmstr,
+                list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> op;
+  bits<3> rx;
+
+  let op = _op;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = op;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI instruction class in Mips16 :
+//     <|EXTEND|imm10:5|imm15:11|op|rx|ry|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI16<bits<5> _op, dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<5> op;
+  bits<16> imm16;
+  bits<3> rx;
+  bits<3> ry;
+
+  let op=_op;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = op;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-RRI-A instruction class in Mips16 :
+//    <|EXTEND|imm10:4|imm14:11|RRI-A|rx|ry|f|imm3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_RRI_A16<bits<1> _f, dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<15> imm15;
+  bits<3> rx;
+  bits<3> ry;
+  bits<1> f;
+
+  let f = _f;
+
+  let Inst{26-20} = imm15{10-4};
+  let Inst{19-16} = imm15{14-11};
+  let Inst{15-11} = 0b01000;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4} = f;
+  let Inst{3-0} = imm15{3-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-SHIFT instruction class in Mips16 :
+//    <|EXTEND|sa 4:0|s5|0|SHIFT|rx|ry|0|f>
+//===----------------------------------------------------------------------===//
+
+class FEXT_SHIFT16<bits<2> _f, dag outs, dag ins, string asmstr,
+                   list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<6> sa6;
+  bits<3> rx;
+  bits<3> ry;
+  bits<2> f;
+
+  let f = _f;
+
+  let Inst{26-22} = sa6{4-0};
+  let Inst{21} = sa6{5};
+  let Inst{20-16} = 0;
+  let Inst{15-11} = 0b00110;
+  let Inst{10-8} = rx;
+  let Inst{7-5} = ry;
+  let Inst{4-2} = 0;
+  let Inst{1-0} = f;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8 instruction class in Mips16 :
+//    <|EXTEND|imm10:5|imm15:11|I8|funct|0|imm4:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I816<bits<3> _funct, dag outs, dag ins, string asmstr,
+                list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<16> imm16;
+  bits<5> I8;
+  bits<3> funct;
+
+  let funct = _funct;
+  let I8 = 0b0110;
+
+  let Inst{26-21} = imm16{10-5};
+  let Inst{20-16} = imm16{15-11};
+  let Inst{15-11} = I8;
+  let Inst{10-8} = funct;
+  let Inst{7-5} = 0;
+  let Inst{4-0} = imm16{4-0};
+
+}
+
+//===----------------------------------------------------------------------===//
+// Format EXT-I8_SVRS instruction class in Mips16 :
+//    <|EXTEND|xsregs|framesize7:4|aregs|I8|SVRS|s|ra|s0|s1|framesize3:0>
+//===----------------------------------------------------------------------===//
+
+class FEXT_I8_SVRS16<bits<1> s_, dag outs, dag ins, string asmstr,
+                     list<dag> pattern, InstrItinClass itin>:
+  MipsInst16_EXTEND<outs, ins, asmstr, pattern, itin>
+{
+  bits<3> xsregs =0;
+  bits<8> framesize =0;
+  bits<3> aregs =0;
+  bits<5> I8 = 0b01100;
+  bits<3> SVRS = 0b100;
+  bits<1> s;
+  bits<1> ra = 0;
+  bits<1> s0 = 0;
+  bits<1> s1 = 0;
+
+  let s= s_;
+
+  let Inst{26-24} = xsregs;
+  let Inst{23-20} = framesize{7-4};
+  let Inst{19} = 0;
+  let Inst{18-16} = aregs;
+  let Inst{15-11} = I8;
+  let Inst{10-8} = SVRS;
+  let Inst{7} = s;
+  let Inst{6} = ra;
+  let Inst{5} = s0;
+  let Inst{4} = s1;
+  let Inst{3-0} = framesize{3-0};
+
+
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
new file mode 100644
index 000000000000..17dd2c07967a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -0,0 +1,417 @@
+//===-- Mips16InstrInfo.cpp - Mips16 Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16InstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+static cl::opt<bool> NeverUseSaveRestore(
+  "mips16-never-use-save-restore",
+  cl::init(false),
+  cl::desc("For testing ability to adjust stack pointer "
+           "without save/restore instruction"),
+  cl::Hidden);
+
+
+Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
+  : MipsInstrInfo(tm, Mips::BimmX16),
+    RI(*tm.getSubtargetImpl(), *this) {}
+
+const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned Mips16InstrInfo::
+isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned Mips16InstrInfo::
+isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+  return 0;
+}
+
+void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  unsigned Opc = 0;
+
+  if (Mips::CPU16RegsRegClass.contains(DestReg) &&
+      Mips::CPURegsRegClass.contains(SrcReg))
+    Opc = Mips::MoveR3216;
+  else if (Mips::CPURegsRegClass.contains(DestReg) &&
+           Mips::CPU16RegsRegClass.contains(SrcReg))
+    Opc = Mips::Move32R16;
+  else if ((SrcReg == Mips::HI) &&
+           (Mips::CPU16RegsRegClass.contains(DestReg)))
+    Opc = Mips::Mfhi16, SrcReg = 0;
+
+  else if ((SrcReg == Mips::LO) &&
+           (Mips::CPU16RegsRegClass.contains(DestReg)))
+    Opc = Mips::Mflo16, SrcReg = 0;
+
+
+  assert(Opc && "Cannot copy registers");
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+  if (DestReg)
+    MIB.addReg(DestReg, RegState::Define);
+
+  if (SrcReg)
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+void Mips16InstrInfo::
+storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                unsigned SrcReg, bool isKill, int FI,
+                const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
+                int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+  unsigned Opc = 0;
+  if (Mips::CPU16RegsRegClass.hasSubClassEq(RC))
+    Opc = Mips::SwRxSpImmX16;
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+}
+
+void Mips16InstrInfo::
+loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                 unsigned DestReg, int FI, const TargetRegisterClass *RC,
+                 const TargetRegisterInfo *TRI, int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+  unsigned Opc = 0;
+
+  if (Mips::CPU16RegsRegClass.hasSubClassEq(RC))
+    Opc = Mips::LwRxSpImmX16;
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset)
+    .addMemOperand(MMO);
+}
+
+bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  MachineBasicBlock &MBB = *MI->getParent();
+  switch(MI->getDesc().getOpcode()) {
+  default:
+    return false;
+  case Mips::RetRA16:
+    ExpandRetRA16(MBB, MI, Mips::JrcRa16);
+    break;
+  }
+
+  MBB.erase(MI);
+  return true;
+}
+
+/// GetOppositeBranchOpc - Return the inverse of the specified
+/// opcode, e.g. turning BEQ to BNE.
+unsigned Mips16InstrInfo::GetOppositeBranchOpc(unsigned Opc) const {
+  switch (Opc) {
+  default:  llvm_unreachable("Illegal opcode!");
+  case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
+  case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16;
+  case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16;
+  case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16;
+  case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16;
+  case Mips::BtnezX16: return Mips::BteqzX16;
+  case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16;
+  case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16;
+  case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16;
+  case Mips::BteqzX16: return Mips::BtnezX16;
+  case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16;
+  case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16;
+  case Mips::BteqzT8SltiuX16: return Mips::BtnezT8SltiuX16;
+  case Mips::BtnezT8CmpX16: return Mips::BteqzT8CmpX16;
+  case Mips::BtnezT8SltX16: return Mips::BteqzT8SltX16;
+  case Mips::BtnezT8SltiX16: return Mips::BteqzT8SltiX16;
+  }
+  assert(false && "Implement this function.");
+  return 0;
+}
+
+// Adjust SP by FrameSize bytes. Save RA, S0, S1
+void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize,
+                    MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator I) const {
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  if (!NeverUseSaveRestore) {
+    if (isUInt<11>(FrameSize))
+      BuildMI(MBB, I, DL, get(Mips::SaveRaF16)).addImm(FrameSize);
+    else {
+      int Base = 2040; // should create template function like isUInt that
+                       // returns largest possible n bit unsigned integer
+      int64_t Remainder = FrameSize - Base;
+      BuildMI(MBB, I, DL, get(Mips::SaveRaF16)). addImm(Base);
+      if (isInt<16>(-Remainder))
+        BuildAddiuSpImm(MBB, I, -Remainder);
+      else
+        adjustStackPtrBig(SP, -Remainder, MBB, I, Mips::V0, Mips::V1);
+    }
+
+  }
+  else {
+    //
+    // sw ra, -4[sp]
+    // sw s1, -8[sp]
+    // sw s0, -12[sp]
+
+    MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16),
+                                       Mips::RA);
+    MIB1.addReg(Mips::SP);
+    MIB1.addImm(-4);
+    MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16),
+                                       Mips::S1);
+    MIB2.addReg(Mips::SP);
+    MIB2.addImm(-8);
+    MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::SwRxSpImmX16),
+                                       Mips::S0);
+    MIB3.addReg(Mips::SP);
+    MIB3.addImm(-12);
+    adjustStackPtrBig(SP, -FrameSize, MBB, I, Mips::V0, Mips::V1);
+  }
+}
+
+// Adjust SP by FrameSize bytes. Restore RA, S0, S1
+void Mips16InstrInfo::restoreFrame(unsigned SP, int64_t FrameSize,
+                                   MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I) const {
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  if (!NeverUseSaveRestore) {
+    if (isUInt<11>(FrameSize))
+      BuildMI(MBB, I, DL, get(Mips::RestoreRaF16)).addImm(FrameSize);
+    else {
+      int Base = 2040; // should create template function like isUInt that
+                       // returns largest possible n bit unsigned integer
+      int64_t Remainder = FrameSize - Base;
+      if (isInt<16>(Remainder))
+        BuildAddiuSpImm(MBB, I, Remainder);
+      else
+        adjustStackPtrBig(SP, Remainder, MBB, I, Mips::A0, Mips::A1);
+      BuildMI(MBB, I, DL, get(Mips::RestoreRaF16)). addImm(Base);
+    }
+  }
+  else {
+    adjustStackPtrBig(SP, FrameSize, MBB, I, Mips::A0, Mips::A1);
+    // lw ra, -4[sp]
+    // lw s1, -8[sp]
+    // lw s0, -12[sp]
+    MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16),
+                                       Mips::A0);
+    MIB1.addReg(Mips::SP);
+    MIB1.addImm(-4);
+    MachineInstrBuilder MIB0 = BuildMI(MBB, I, DL, get(Mips::Move32R16),
+                                       Mips::RA);
+     MIB0.addReg(Mips::A0);
+    MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16),
+                                       Mips::S1);
+    MIB2.addReg(Mips::SP);
+    MIB2.addImm(-8);
+    MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::LwRxSpImmX16),
+                                       Mips::S0);
+    MIB3.addReg(Mips::SP);
+    MIB3.addImm(-12);
+  }
+
+}
+
+// Adjust SP by Amount bytes where bytes can be up to 32bit number.
+// This can only be called at times that we know that there is at least one free
+// register.
+// This is clearly safe at prologue and epilogue.
+//
+void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount,
+                                        MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned Reg1, unsigned Reg2) const {
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+//  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+//  unsigned Reg1 = RegInfo.createVirtualRegister(&Mips::CPU16RegsRegClass);
+//  unsigned Reg2 = RegInfo.createVirtualRegister(&Mips::CPU16RegsRegClass);
+  //
+  // li reg1, constant
+  // move reg2, sp
+  // add reg1, reg1, reg2
+  // move sp, reg1
+  //
+  //
+  MachineInstrBuilder MIB1 = BuildMI(MBB, I, DL, get(Mips::LwConstant32), Reg1);
+  MIB1.addImm(Amount);
+  MachineInstrBuilder MIB2 = BuildMI(MBB, I, DL, get(Mips::MoveR3216), Reg2);
+  MIB2.addReg(Mips::SP, RegState::Kill);
+  MachineInstrBuilder MIB3 = BuildMI(MBB, I, DL, get(Mips::AdduRxRyRz16), Reg1);
+  MIB3.addReg(Reg1);
+  MIB3.addReg(Reg2, RegState::Kill);
+  MachineInstrBuilder MIB4 = BuildMI(MBB, I, DL, get(Mips::Move32R16),
+                                                     Mips::SP);
+  MIB4.addReg(Reg1, RegState::Kill);
+}
+
+void Mips16InstrInfo::adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount,
+                    MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator I) const {
+   assert(false && "adjust stack pointer amount exceeded");
+}
+
+/// Adjust SP by Amount bytes.
+void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  if (isInt<16>(Amount))  // need to change to addiu sp, ....and isInt<16>
+    BuildAddiuSpImm(MBB, I, Amount);
+  else
+    adjustStackPtrBigUnrestricted(SP, Amount, MBB, I);
+}
+
+/// This function generates the sequence of instructions needed to get the
+/// result of adding register REG and immediate IMM.
+unsigned
+Mips16InstrInfo::loadImmediate(unsigned FrameReg,
+                               int64_t Imm, MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator II, DebugLoc DL,
+                               unsigned &NewImm) const {
+  //
+  // given original instruction is:
+  // Instr rx, T[offset] where offset is too big.
+  //
+  // lo = offset & 0xFFFF
+  // hi = ((offset >> 16) + (lo >> 15)) & 0xFFFF;
+  //
+  // let T = temporary register
+  // li T, hi
+  // shl T, 16
+  // add T, Rx, T
+  //
+  RegScavenger rs;
+  int32_t lo = Imm & 0xFFFF;
+  int32_t hi = ((Imm >> 16) + (lo >> 15)) & 0xFFFF;
+  NewImm = lo;
+  unsigned Reg =0;
+  unsigned SpReg = 0;
+  rs.enterBasicBlock(&MBB);
+  rs.forward(II);
+  //
+  // we use T0 for the first register, if we need to save something away.
+  // we use T1 for the second register, if we need to save something away.
+  //
+  unsigned FirstRegSaved =0, SecondRegSaved=0;
+  unsigned FirstRegSavedTo = 0, SecondRegSavedTo = 0;
+
+  Reg = rs.FindUnusedReg(&Mips::CPU16RegsRegClass);
+  if (Reg == 0) {
+    FirstRegSaved = Reg = Mips::V0;
+    FirstRegSavedTo = Mips::T0;
+    copyPhysReg(MBB, II, DL, FirstRegSavedTo, FirstRegSaved, true);
+  }
+  else
+    rs.setUsed(Reg);
+  BuildMI(MBB, II, DL, get(Mips::LiRxImmX16), Reg).addImm(hi);
+  BuildMI(MBB, II, DL, get(Mips::SllX16), Reg).addReg(Reg).
+    addImm(16);
+  if (FrameReg == Mips::SP) {
+    SpReg = rs.FindUnusedReg(&Mips::CPU16RegsRegClass);
+    if (SpReg == 0) {
+      if (Reg != Mips::V1) {
+        SecondRegSaved = SpReg = Mips::V1;
+        SecondRegSavedTo = Mips::T1;
+      }
+      else {
+        SecondRegSaved = SpReg = Mips::V0;
+        SecondRegSavedTo = Mips::T0;
+      }
+      copyPhysReg(MBB, II, DL, SecondRegSavedTo, SecondRegSaved, true);
+    }
+    else
+      rs.setUsed(SpReg);
+
+    copyPhysReg(MBB, II, DL, SpReg, Mips::SP, false);
+    BuildMI(MBB, II, DL, get(Mips::  AdduRxRyRz16), Reg).addReg(SpReg)
+      .addReg(Reg);
+  }
+  else
+    BuildMI(MBB, II, DL, get(Mips::  AdduRxRyRz16), Reg).addReg(FrameReg)
+      .addReg(Reg, RegState::Kill);
+  if (FirstRegSaved || SecondRegSaved) {
+    II = llvm::next(II);
+    if (FirstRegSaved)
+      copyPhysReg(MBB, II, DL, FirstRegSaved, FirstRegSavedTo, true);
+    if (SecondRegSaved)
+      copyPhysReg(MBB, II, DL, SecondRegSaved, SecondRegSavedTo, true);
+  }
+  return Reg;
+}
+
+unsigned Mips16InstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
+  return (Opc == Mips::BeqzRxImmX16   || Opc == Mips::BimmX16  ||
+          Opc == Mips::BnezRxImmX16   || Opc == Mips::BteqzX16 ||
+          Opc == Mips::BteqzT8CmpX16  || Opc == Mips::BteqzT8CmpiX16 ||
+          Opc == Mips::BteqzT8SltX16  || Opc == Mips::BteqzT8SltuX16  ||
+          Opc == Mips::BteqzT8SltiX16 || Opc == Mips::BteqzT8SltiuX16 ||
+          Opc == Mips::BtnezX16       || Opc == Mips::BtnezT8CmpX16 ||
+          Opc == Mips::BtnezT8CmpiX16 || Opc == Mips::BtnezT8SltX16 ||
+          Opc == Mips::BtnezT8SltuX16 || Opc == Mips::BtnezT8SltiX16 ||
+          Opc == Mips::BtnezT8SltiuX16 ) ? Opc : 0;
+}
+
+void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned Opc) const {
+  BuildMI(MBB, I, I->getDebugLoc(), get(Opc));
+}
+
+
+const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const {
+  if (validSpImm8(Imm))
+    return get(Mips::AddiuSpImm16);
+  else
+    return get(Mips::AddiuSpImmX16);
+}
+
+void Mips16InstrInfo::BuildAddiuSpImm
+  (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const {
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm);
+}
+
+const MipsInstrInfo *llvm::createMips16InstrInfo(MipsTargetMachine &TM) {
+  return new Mips16InstrInfo(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
new file mode 100644
index 000000000000..a77a9043bb17
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.h
@@ -0,0 +1,124 @@
+//===-- Mips16InstrInfo.h - Mips16 Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16INSTRUCTIONINFO_H
+#define MIPS16INSTRUCTIONINFO_H
+
+#include "Mips16RegisterInfo.h"
+#include "MipsInstrInfo.h"
+
+namespace llvm {
+
+class Mips16InstrInfo : public MipsInstrInfo {
+  const Mips16RegisterInfo RI;
+
+public:
+  explicit Mips16InstrInfo(MipsTargetMachine &TM);
+
+  virtual const MipsRegisterInfo &getRegisterInfo() const;
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  virtual void storeRegToStack(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               unsigned SrcReg, bool isKill, int FrameIndex,
+                               const TargetRegisterClass *RC,
+                               const TargetRegisterInfo *TRI,
+                               int64_t Offset) const;
+
+  virtual void loadRegFromStack(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                unsigned DestReg, int FrameIndex,
+                                const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI,
+                                int64_t Offset) const;
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+  virtual unsigned GetOppositeBranchOpc(unsigned Opc) const;
+
+  // Adjust SP by FrameSize bytes. Save RA, S0, S1
+  void makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+
+  // Adjust SP by FrameSize bytes. Restore RA, S0, S1
+  void restoreFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+
+
+  /// Adjust SP by Amount bytes.
+  void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+
+  /// Emit a series of instructions to load an immediate.
+  // This is to adjust some FrameReg. We return the new register to be used
+  // in place of FrameReg and the adjusted immediate field (&NewImm)
+  //
+  unsigned loadImmediate(unsigned FrameReg,
+                         int64_t Imm, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator II, DebugLoc DL,
+                         unsigned &NewImm) const;
+
+  static bool validSpImm8(int offset) {
+    return ((offset & 7) == 0) && isInt<11>(offset);
+  }
+
+  //
+  // build the proper one based on the Imm field
+  //
+
+  const MCInstrDesc& AddiuSpImm(int64_t Imm) const;
+
+  void BuildAddiuSpImm
+    (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const;
+
+private:
+  virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
+
+  void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   unsigned Opc) const;
+
+  // Adjust SP by Amount bytes where bytes can be up to 32bit number.
+  void adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator I,
+                         unsigned Reg1, unsigned Reg2) const;
+
+  // Adjust SP by Amount bytes where bytes can be up to 32bit number.
+  void adjustStackPtrBigUnrestricted(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
new file mode 100644
index 000000000000..aa51aaf46565
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -0,0 +1,1791 @@
+//===- Mips16InstrInfo.td - Target Description for Mips16  -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips16 instructions.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Mips Address
+//
+def addr16 :
+  ComplexPattern<iPTR, 3, "selectAddr16", [frameindex], [SDNPWantParent]>;
+
+//
+// Address operand
+def mem16 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops CPU16Regs, simm16, CPU16Regs);
+  let EncoderMethod = "getMemEncoding";
+}
+
+def mem16_ea : Operand<i32> {
+  let PrintMethod = "printMemOperandEA";
+  let MIOperandInfo = (ops CPU16Regs, simm16);
+  let EncoderMethod = "getMemEncoding";
+}
+
+//
+//
+// I8 instruction format
+//
+
+class FI816_ins_base<bits<3> _func, string asmstr,
+                     string asmstr2, InstrItinClass itin>:
+  FI816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
+        [], itin>;
+
+
+class FI816_SP_ins<bits<3> _func, string asmstr,
+                   InstrItinClass itin>:
+  FI816_ins_base<_func, asmstr, "\t$$sp, $imm # 16 bit inst", itin>;
+
+//
+// RI instruction format
+//
+
+
+class FRI16_ins_base<bits<5> op, string asmstr, string asmstr2,
+                     InstrItinClass itin>:
+  FRI16<op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+        !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16_ins<bits<5> op, string asmstr,
+                InstrItinClass itin>:
+  FRI16_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class FRI16R_ins_base<bits<5> op, string asmstr, string asmstr2,
+                     InstrItinClass itin>:
+  FRI16<op, (outs), (ins CPU16Regs:$rx, simm16:$imm),
+        !strconcat(asmstr, asmstr2), [], itin>;
+
+class FRI16R_ins<bits<5> op, string asmstr,
+                InstrItinClass itin>:
+  FRI16R_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
+
+class F2RI16_ins<bits<5> _op, string asmstr,
+                     InstrItinClass itin>:
+  FRI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+        !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin> {
+  let Constraints = "$rx_ = $rx";
+}
+
+class FRI16_B_ins<bits<5> _op, string asmstr,
+                  InstrItinClass itin>:
+  FRI16<_op, (outs), (ins  CPU16Regs:$rx, brtarget:$imm),
+        !strconcat(asmstr, "\t$rx, $imm  # 16 bit inst"), [], itin>;
+//
+// Compare a register and immediate and place result in CC
+// Implicit use of T8
+//
+// EXT-CCRR Instruction format
+//
+class FEXT_CCRXI16_ins<string asmstr>:
+  MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, simm16:$imm),
+               !strconcat(asmstr, "\t$rx, $imm\n\tmove\t$cc, $$t8"), []> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+
+// JAL and JALX instruction format
+//
+class FJAL16_ins<bits<1> _X, string asmstr,
+                 InstrItinClass itin>:
+  FJAL16<_X, (outs), (ins simm20:$imm),
+         !strconcat(asmstr, "\t$imm\n\tnop"),[],
+         itin>  {
+  let isCodeGenOnly=1;
+}
+//
+// EXT-I instruction format
+//
+class FEXT_I16_ins<bits<5> eop, string asmstr, InstrItinClass itin> :
+  FEXT_I16<eop, (outs), (ins brtarget:$imm16),
+           !strconcat(asmstr, "\t$imm16"),[], itin>;
+
+//
+// EXT-I8 instruction format
+//
+
+class FEXT_I816_ins_base<bits<3> _func, string asmstr,
+                         string asmstr2, InstrItinClass itin>:
+  FEXT_I816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
+            [], itin>;
+
+class FEXT_I816_ins<bits<3> _func, string asmstr,
+                    InstrItinClass itin>:
+  FEXT_I816_ins_base<_func, asmstr, "\t$imm", itin>;
+
+class FEXT_I816_SP_ins<bits<3> _func, string asmstr,
+                       InstrItinClass itin>:
+      FEXT_I816_ins_base<_func, asmstr, "\t$$sp, $imm", itin>;
+
+//
+// Assembler formats in alphabetical order.
+// Natural and pseudos are mixed together.
+//
+// Compare two registers and place result in CC
+// Implicit use of T8
+//
+// CC-RR Instruction format
+//
+class FCCRR16_ins<string asmstr> :
+  MipsPseudo16<(outs CPU16Regs:$cc), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$cc, $$t8"), []> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+
+//
+// EXT-RI instruction format
+//
+
+class FEXT_RI16_ins_base<bits<5> _op, string asmstr, string asmstr2,
+                         InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins simm16:$imm),
+                  !strconcat(asmstr, asmstr2), [], itin>;
+
+class FEXT_RI16_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $imm", itin>;
+
+class FEXT_RI16R_ins_base<bits<5> _op, string asmstr, string asmstr2,
+                         InstrItinClass itin>:
+  FEXT_RI16<_op, (outs ), (ins CPU16Regs:$rx, simm16:$imm),
+                  !strconcat(asmstr, asmstr2), [], itin>;
+
+class FEXT_RI16R_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FEXT_RI16R_ins_base<_op, asmstr, "\t$rx, $imm", itin>;
+
+class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>:
+  FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>;
+
+class FEXT_RI16_B_ins<bits<5> _op, string asmstr,
+                      InstrItinClass itin>:
+  FEXT_RI16<_op, (outs), (ins  CPU16Regs:$rx, brtarget:$imm),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
+class FEXT_2RI16_ins<bits<5> _op, string asmstr,
+                     InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin> {
+  let Constraints = "$rx_ = $rx";
+}
+
+
+// this has an explicit sp argument that we ignore to work around a problem
+// in the compiler
+class FEXT_RI16_SP_explicit_ins<bits<5> _op, string asmstr,
+                                InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPUSPReg:$ry, simm16:$imm),
+            !strconcat(asmstr, "\t$rx, $imm ( $ry ); "), [], itin>;
+
+//
+// EXT-RRI instruction format
+//
+
+class FEXT_RRI16_mem_ins<bits<5> op, string asmstr, Operand MemOpnd,
+                         InstrItinClass itin>:
+  FEXT_RRI16<op, (outs CPU16Regs:$ry), (ins  MemOpnd:$addr),
+             !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+class FEXT_RRI16_mem2_ins<bits<5> op, string asmstr, Operand MemOpnd,
+                          InstrItinClass itin>:
+  FEXT_RRI16<op, (outs ), (ins  CPU16Regs:$ry, MemOpnd:$addr),
+             !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+//
+// EXT-RRI-A instruction format
+//
+
+class FEXT_RRI_A16_mem_ins<bits<1> op, string asmstr, Operand MemOpnd,
+                           InstrItinClass itin>:
+  FEXT_RRI_A16<op, (outs CPU16Regs:$ry), (ins  MemOpnd:$addr),
+               !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+// EXT-SHIFT instruction format
+//
+class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
+  FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, shamt:$sa),
+               !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
+
+//
+// EXT-T8I8
+//
+class FEXT_T8I816_ins<string asmstr, string asmstr2>:
+  MipsPseudo16<(outs),
+               (ins CPU16Regs:$rx, CPU16Regs:$ry, brtarget:$imm),
+               !strconcat(asmstr2, !strconcat("\t$rx, $ry\n\t",
+               !strconcat(asmstr, "\t$imm"))),[]> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+
+//
+// EXT-T8I8I
+//
+class FEXT_T8I8I16_ins<string asmstr, string asmstr2>:
+  MipsPseudo16<(outs),
+               (ins CPU16Regs:$rx, simm16:$imm, brtarget:$targ),
+               !strconcat(asmstr2, !strconcat("\t$rx, $imm\n\t",
+               !strconcat(asmstr, "\t$targ"))), []> {
+  let isCodeGenOnly=1;
+  let usesCustomInserter = 1;
+}
+//
+
+
+//
+// I8_MOVR32 instruction format (used only by the MOVR32 instructio
+//
+class FI8_MOVR3216_ins<string asmstr, InstrItinClass itin>:
+       FI8_MOVR3216<(outs CPU16Regs:$rz), (ins CPURegs:$r32),
+       !strconcat(asmstr,  "\t$rz, $r32"), [], itin>;
+
+//
+// I8_MOV32R instruction format (used only by MOV32R instruction)
+//
+
+class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
+  FI8_MOV32R16<(outs CPURegs:$r32), (ins CPU16Regs:$rz),
+               !strconcat(asmstr,  "\t$r32, $rz"), [], itin>;
+
+//
+// This are pseudo formats for multiply
+// This first one can be changed to non pseudo now.
+//
+// MULT
+//
+class FMULT16_ins<string asmstr, InstrItinClass itin> :
+  MipsPseudo16<(outs), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry"), []>;
+
+//
+// MULT-LO
+//
+class FMULT16_LO_ins<string asmstr, InstrItinClass itin> :
+  MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry\n\tmflo\t$rz"), []> {
+  let isCodeGenOnly=1;
+}
+
+//
+// RR-type instruction format
+//
+
+class FRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
+}
+
+class FRR16R_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs), (ins  CPU16Regs:$rx, CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
+}
+
+class FRRTR16_ins<string asmstr> :
+  MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+               !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$rz, $$t8"), []> ;
+
+//
+// maybe refactor but need a $zero as a dummy first parameter
+//
+class FRR16_div_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs ), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$$zero, $rx, $ry"), [], itin> ;
+
+class FUnaryRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+        !strconcat(asmstr, "\t$rx, $ry"), [], itin> ;
+
+
+class FRR16_M_ins<bits<5> f, string asmstr,
+                  InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rx), (ins),
+        !strconcat(asmstr, "\t$rx"), [], itin>;
+
+class FRxRxRy16_ins<bits<5> f, string asmstr,
+                    InstrItinClass itin> :
+  FRR16<f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+            !strconcat(asmstr, "\t$rz, $ry"),
+            [], itin> {
+  let Constraints = "$rx = $rz";
+}
+
+let rx=0 in
+class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_,
+                              string asmstr, InstrItinClass itin>:
+  FRR16_JALRC<nd_, l_, 1, (outs), (ins), !strconcat(asmstr, "\t $$ra"),
+              [], itin> ;
+
+
+class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
+                      string asmstr, InstrItinClass itin>:
+  FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
+              !strconcat(asmstr, "\t $rx"), [], itin> ;
+
+//
+// RRR-type instruction format
+//
+
+class FRRR16_ins<bits<2> _f, string asmstr,  InstrItinClass itin> :
+  FRRR16<_f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+         !strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>;
+
+//
+// These Sel patterns support the generation of conditional move
+// pseudo instructions.
+//
+// The nomenclature uses the components making up the pseudo and may
+// be a bit counter intuitive when compared with the end result we seek.
+// For example using a bqez in the example directly below results in the
+// conditional move being done if the tested register is not zero.
+// I considered in easier to check by keeping the pseudo consistent with
+// it's components but it could have been done differently.
+//
+// The simplest case is when can test and operand directly and do the
+// conditional move based on a simple mips16 conditional
+//  branch instruction.
+// for example:
+// if $op == beqz or bnez:
+//
+// $op1 $rt, .+4
+// move $rd, $rs
+//
+// if $op == beqz, then if $rt != 0, then the conditional assignment
+// $rd = $rs is done.
+
+// if $op == bnez, then if $rt == 0, then the conditional assignment
+// $rd = $rs is done.
+//
+// So this pseudo class only has one operand, i.e. op
+//
+class Sel<string op>:
+  MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+               CPU16Regs:$rt),
+               !strconcat(op, "\t$rt, .+4\n\t\n\tmove $rd, $rs"), []> {
+  //let isCodeGenOnly=1;
+  let Constraints = "$rd = $rd_";
+  let usesCustomInserter = 1;
+}
+
+//
+// The next two instruction classes allow for an operand which tests
+// two operands and returns a value in register T8 and
+//then does a conditional branch based on the value of T8
+//
+
+// op2 can be cmpi or slti/sltiu
+// op1 can bteqz or btnez
+// the operands for op2 are a register and a signed constant
+//
+// $op2 $t, $imm  ;test register t and branch conditionally
+// $op1 .+4       ;op1 is a conditional branch
+// move $rd, $rs
+//
+//
+class SeliT<string op1, string op2>:
+  MipsPseudo16<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+                                       CPU16Regs:$rl, simm16:$imm),
+               !strconcat(op2,
+               !strconcat("\t$rl, $imm\n\t",
+               !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> {
+  let isCodeGenOnly=1;
+  let Constraints = "$rd = $rd_";
+  let usesCustomInserter = 1;
+}
+
+//
+// op2 can be cmp or slt/sltu
+// op1 can be bteqz or btnez
+// the operands for op2 are two registers
+// op1 is a conditional branch
+//
+//
+// $op2 $rl, $rr  ;test registers rl,rr
+// $op1 .+4       ;op2 is a conditional branch
+// move $rd, $rs
+//
+//
+class SelT<string op1, string op2>:
+  MipsPseudo16<(outs CPU16Regs:$rd_),
+               (ins CPU16Regs:$rd, CPU16Regs:$rs,
+                CPU16Regs:$rl, CPU16Regs:$rr),
+               !strconcat(op2,
+               !strconcat("\t$rl, $rr\n\t",
+               !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), []> {
+  let isCodeGenOnly=1;
+  let Constraints = "$rd = $rd_";
+  let usesCustomInserter = 1;
+}
+
+//
+// 32 bit constant
+//
+def imm32: Operand<i32>;
+
+def Constant32:
+  MipsPseudo16<(outs), (ins imm32:$imm), "\t.word $imm", []>;
+
+def LwConstant32:
+  MipsPseudo16<(outs), (ins CPU16Regs:$rx, imm32:$imm),
+    "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
+
+
+//
+// Some general instruction class info
+//
+//
+
+class ArithLogic16Defs<bit isCom=0> {
+  bits<5> shamt = 0;
+  bit isCommutable = isCom;
+  bit isReMaterializable = 1;
+  bit neverHasSideEffects = 1;
+}
+
+class branch16 {
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit isBarrier = 1;
+}
+
+class cbranch16 {
+  bit isBranch = 1;
+  bit isTerminator = 1;
+}
+
+class MayLoad {
+  bit mayLoad = 1;
+}
+
+class MayStore {
+  bit mayStore = 1;
+}
+//
+
+
+// Format: ADDIU rx, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (2-Operand, Extended)
+// To add a constant to a 32-bit integer.
+//
+def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIAlu>;
+
+def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIAlu>,
+  ArithLogic16Defs<0> {
+  let AddedComplexity = 5;
+}
+def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIAlu>,
+  ArithLogic16Defs<0> {
+  let isCodeGenOnly = 1;
+}
+
+def AddiuRxRyOffMemX16:
+  FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIAlu>;
+
+//
+
+// Format: ADDIU rx, pc, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended)
+// To add a constant to the program counter.
+//
+def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>;
+
+//
+// Format: ADDIU sp, immediate MIPS16e
+// Purpose: Add Immediate Unsigned Word (2-Operand, SP-Relative, Extended)
+// To add a constant to the stack pointer.
+//
+def AddiuSpImm16
+  : FI816_SP_ins<0b011, "addiu", IIAlu> {
+  let Defs = [SP];
+  let Uses = [SP];
+  let AddedComplexity = 5;
+}
+
+def AddiuSpImmX16
+  : FEXT_I816_SP_ins<0b011, "addiu", IIAlu> {
+  let Defs = [SP];
+  let Uses = [SP];
+}
+
+//
+// Format: ADDU rz, rx, ry MIPS16e
+// Purpose: Add Unsigned Word (3-Operand)
+// To add 32-bit integers.
+//
+
+def AdduRxRyRz16: FRRR16_ins<01, "addu", IIAlu>, ArithLogic16Defs<1>;
+
+//
+// Format: AND rx, ry MIPS16e
+// Purpose: AND
+// To do a bitwise logical AND.
+
+def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>;
+
+
+//
+// Format: BEQZ rx, offset MIPS16e
+// Purpose: Branch on Equal to Zero
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
+
+
+//
+// Format: BEQZ rx, offset MIPS16e
+// Purpose: Branch on Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
+
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch
+// To do an unconditional PC-relative branch.
+//
+def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16;
+
+//
+// Format: BNEZ rx, offset MIPS16e
+// Purpose: Branch on Not Equal to Zero
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16;
+
+//
+// Format: BNEZ rx, offset MIPS16e
+// Purpose: Branch on Not Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16;
+
+//
+// Format: BTEQZ offset MIPS16e
+// Purpose: Branch on T Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
+  let Uses = [T8];
+}
+
+def BteqzT8CmpX16: FEXT_T8I816_ins<"bteqz", "cmp">, cbranch16;
+
+def BteqzT8CmpiX16: FEXT_T8I8I16_ins<"bteqz", "cmpi">,
+  cbranch16;
+
+def BteqzT8SltX16: FEXT_T8I816_ins<"bteqz", "slt">, cbranch16;
+
+def BteqzT8SltuX16: FEXT_T8I816_ins<"bteqz", "sltu">, cbranch16;
+
+def BteqzT8SltiX16: FEXT_T8I8I16_ins<"bteqz", "slti">, cbranch16;
+
+def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">,
+  cbranch16;
+
+//
+// Format: BTNEZ offset MIPS16e
+// Purpose: Branch on T Not Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 {
+  let Uses = [T8];
+}
+
+def BtnezT8CmpX16: FEXT_T8I816_ins<"btnez", "cmp">, cbranch16;
+
+def BtnezT8CmpiX16: FEXT_T8I8I16_ins<"btnez", "cmpi">, cbranch16;
+
+def BtnezT8SltX16: FEXT_T8I816_ins<"btnez", "slt">, cbranch16;
+
+def BtnezT8SltuX16: FEXT_T8I816_ins<"btnez", "sltu">, cbranch16;
+
+def BtnezT8SltiX16: FEXT_T8I8I16_ins<"btnez", "slti">, cbranch16;
+
+def BtnezT8SltiuX16: FEXT_T8I8I16_ins<"btnez", "sltiu">,
+  cbranch16;
+
+//
+// Format: CMP rx, ry MIPS16e
+// Purpose: Compare
+// To compare the contents of two GPRs.
+//
+def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> {
+  let Defs = [T8];
+}
+
+//
+// Format: CMPI rx, immediate MIPS16e
+// Purpose: Compare Immediate
+// To compare a constant with the contents of a GPR.
+//
+def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> {
+  let Defs = [T8];
+}
+
+//
+// Format: CMPI rx, immediate MIPS16e
+// Purpose: Compare Immediate (Extended)
+// To compare a constant with the contents of a GPR.
+//
+def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> {
+  let Defs = [T8];
+}
+
+
+//
+// Format: DIV rx, ry MIPS16e
+// Purpose: Divide Word
+// To divide 32-bit signed integers.
+//
+def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
+  let Defs = [HI, LO];
+}
+
+//
+// Format: DIVU rx, ry MIPS16e
+// Purpose: Divide Unsigned Word
+// To divide 32-bit unsigned integers.
+//
+def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
+  let Defs = [HI, LO];
+}
+//
+// Format: JAL target MIPS16e
+// Purpose: Jump and Link
+// To execute a procedure call within the current 256 MB-aligned
+// region and preserve the current ISA.
+//
+
+def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> {
+  let isBranch = 1;
+  let hasDelaySlot = 0;  // not true, but we add the nop for now
+  let isTerminator=1;
+  let isBarrier=1;
+}
+
+//
+// Format: JR ra MIPS16e
+// Purpose: Jump Register Through Register ra
+// To execute a branch to the instruction address in the return
+// address register.
+//
+
+def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  let hasDelaySlot = 1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+
+def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIAlu> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+
+def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIAlu> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+  let isTerminator=1;
+  let isBarrier=1;
+}
+//
+// Format: LB ry, offset(rx) MIPS16e
+// Purpose: Load Byte (Extended)
+// To load a byte from memory as a signed value.
+//
+def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, IILoad>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LBU ry, offset(rx) MIPS16e
+// Purpose: Load Byte Unsigned (Extended)
+// To load a byte from memory as a unsigned value.
+//
+def LbuRxRyOffMemX16:
+  FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, IILoad>, MayLoad {
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LH ry, offset(rx) MIPS16e
+// Purpose: Load Halfword signed (Extended)
+// To load a halfword from memory as a signed value.
+//
+def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, IILoad>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LHU ry, offset(rx) MIPS16e
+// Purpose: Load Halfword unsigned (Extended)
+// To load a halfword from memory as an unsigned value.
+//
+def LhuRxRyOffMemX16:
+  FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, IILoad>, MayLoad {
+  let isCodeGenOnly = 1;
+}
+
+//
+// Format: LI rx, immediate MIPS16e
+// Purpose: Load Immediate
+// To load a constant into a GPR.
+//
+def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>;
+
+//
+// Format: LI rx, immediate MIPS16e
+// Purpose: Load Immediate (Extended)
+// To load a constant into a GPR.
+//
+def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>;
+
+//
+// Format: LW ry, offset(rx) MIPS16e
+// Purpose: Load Word (Extended)
+// To load a word from memory as a signed value.
+//
+def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IILoad>, MayLoad{
+  let isCodeGenOnly = 1;
+}
+
+// Format: LW rx, offset(sp) MIPS16e
+// Purpose: Load Word (SP-Relative, Extended)
+// To load an SP-relative word from memory as a signed value.
+//
+def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10110, "lw", IILoad>, MayLoad{
+  let Uses = [SP];
+}
+
+//
+// Format: MOVE r32, rz MIPS16e
+// Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def Move32R16: FI8_MOV32R16_ins<"move", IIAlu>;
+
+//
+// Format: MOVE ry, r32 MIPS16e
+//Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>;
+
+//
+// Format: MFHI rx MIPS16e
+// Purpose: Move From HI Register
+// To copy the special purpose HI register to a GPR.
+//
+def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
+  let Uses = [HI];
+  let neverHasSideEffects = 1;
+}
+
+//
+// Format: MFLO rx MIPS16e
+// Purpose: Move From LO Register
+// To copy the special purpose LO register to a GPR.
+//
+def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
+  let Uses = [LO];
+  let neverHasSideEffects = 1;
+}
+
+//
+// Pseudo Instruction for mult
+//
+def MultRxRy16:  FMULT16_ins<"mult",  IIAlu> {
+  let isCommutable = 1;
+  let neverHasSideEffects = 1;
+  let Defs = [HI, LO];
+}
+
+def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
+  let isCommutable = 1;
+  let neverHasSideEffects = 1;
+  let Defs = [HI, LO];
+}
+
+//
+// Format: MULT rx, ry MIPS16e
+// Purpose: Multiply Word
+// To multiply 32-bit signed integers.
+//
+def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
+  let isCommutable = 1;
+  let neverHasSideEffects = 1;
+  let Defs = [HI, LO];
+}
+
+//
+// Format: MULTU rx, ry MIPS16e
+// Purpose: Multiply Unsigned Word
+// To multiply 32-bit unsigned integers.
+//
+def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> {
+  let isCommutable = 1;
+  let neverHasSideEffects = 1;
+  let Defs = [HI, LO];
+}
+
+//
+// Format: NEG rx, ry MIPS16e
+// Purpose: Negate
+// To negate an integer value.
+//
+def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIAlu>;
+
+//
+// Format: NOT rx, ry MIPS16e
+// Purpose: Not
+// To complement an integer value
+//
+def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIAlu>;
+
+//
+// Format: OR rx, ry MIPS16e
+// Purpose: Or
+// To do a bitwise logical OR.
+//
+def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>;
+
+//
+// Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize}
+// (All args are optional) MIPS16e
+// Purpose: Restore Registers and Deallocate Stack Frame
+// To deallocate a stack frame before exit from a subroutine,
+// restoring return address and static registers, and adjusting
+// stack
+//
+
+// fixed form for restoring RA and the frame
+// for direct object emitter, encoding needs to be adjusted for the
+// frame size
+//
+let ra=1, s=0,s0=1,s1=1 in
+def RestoreRaF16:
+  FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
+             "restore\t$$ra,  $$s0, $$s1, $frame_size", [], IILoad >, MayLoad {
+  let isCodeGenOnly = 1;
+  let Defs = [S0, S1, RA, SP];
+  let Uses = [SP];
+}
+
+// Use Restore to increment SP since SP is not a Mip 16 register, this
+// is an easy way to do that which does not require a register.
+//
+let ra=0, s=0,s0=0,s1=0 in
+def RestoreIncSpF16:
+  FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
+             "restore\t$frame_size", [], IILoad >, MayLoad {
+  let isCodeGenOnly = 1;
+  let Defs = [SP];
+  let Uses = [SP];
+}
+
+//
+// Format: SAVE {ra,}{s0/s1/s0-1,}{framesize} (All arguments are optional)
+// MIPS16e
+// Purpose: Save Registers and Set Up Stack Frame
+// To set up a stack frame on entry to a subroutine,
+// saving return address and static registers, and adjusting stack
+//
+let ra=1, s=1,s0=1,s1=1 in
+def SaveRaF16:
+  FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
+             "save\t$$ra, $$s0, $$s1, $frame_size", [], IIStore >, MayStore {
+  let isCodeGenOnly = 1;
+  let Uses = [RA, SP, S0, S1];
+  let Defs = [SP];
+}
+
+//
+// Use Save to decrement the SP by a constant since SP is not
+// a Mips16 register.
+//
+let ra=0, s=0,s0=0,s1=0 in
+def SaveDecSpF16:
+  FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
+             "save\t$frame_size", [], IIStore >, MayStore {
+  let isCodeGenOnly = 1;
+  let Uses = [SP];
+  let Defs = [SP];
+}
+//
+// Format: SB ry, offset(rx) MIPS16e
+// Purpose: Store Byte (Extended)
+// To store a byte to memory.
+//
+def SbRxRyOffMemX16:
+  FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, IIStore>, MayStore;
+
+//
+// The Sel(T) instructions are pseudos
+// T means that they use T8 implicitly.
+//
+//
+// Format: SelBeqZ rd, rs, rt
+// Purpose: if rt==0, do nothing
+//          else rs = rt
+//
+def SelBeqZ: Sel<"beqz">;
+
+//
+// Format:  SelTBteqZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZCmp: SelT<"bteqz", "cmp">;
+
+//
+// Format:  SelTBteqZCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZCmpi: SeliT<"bteqz", "cmpi">;
+
+//
+// Format:  SelTBteqZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSlt: SelT<"bteqz", "slt">;
+
+//
+// Format:  SelTBteqZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSlti: SeliT<"bteqz", "slti">;
+
+//
+// Format:  SelTBteqZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSltu: SelT<"bteqz", "sltu">;
+
+//
+// Format:  SelTBteqZSltiu rd, rs, rl, rr
+// Purpose: b = Sltiu rl, imm.
+//          If b==0 then do nothing.
+//          if b!=0 then rd = rs
+//
+def SelTBteqZSltiu: SeliT<"bteqz", "sltiu">;
+
+//
+// Format: SelBnez rd, rs, rt
+// Purpose: if rt!=0, do nothing
+//          else rs = rt
+//
+def SelBneZ: Sel<"bnez">;
+
+//
+// Format:  SelTBtneZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+//          If b!=0 then do nothing.
+//          if b0=0 then rd = rs
+//
+def SelTBtneZCmp: SelT<"btnez", "cmp">;
+
+//
+// Format:  SelTBtnezCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZCmpi: SeliT<"btnez", "cmpi">;
+
+//
+// Format:  SelTBtneZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSlt: SelT<"btnez", "slt">;
+
+//
+// Format:  SelTBtneZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSlti: SeliT<"btnez", "slti">;
+
+//
+// Format:  SelTBtneZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSltu: SelT<"btnez", "sltu">;
+
+//
+// Format:  SelTBtneZSltiu rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+//          If b!=0 then do nothing.
+//          if b==0 then rd = rs
+//
+def SelTBtneZSltiu: SeliT<"btnez", "sltiu">;
+//
+//
+// Format: SH ry, offset(rx) MIPS16e
+// Purpose: Store Halfword (Extended)
+// To store a halfword to memory.
+//
+def ShRxRyOffMemX16:
+  FEXT_RRI16_mem2_ins<0b11001, "sh", mem16, IIStore>, MayStore;
+
+//
+// Format: SLL rx, ry, sa MIPS16e
+// Purpose: Shift Word Left Logical (Extended)
+// To execute a left-shift of a word by a fixed number of bits—0 to 31 bits.
+//
+def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>;
+
+//
+// Format: SLLV ry, rx MIPS16e
+// Purpose: Shift Word Left Logical Variable
+// To execute a left-shift of a word by a variable number of bits.
+//
+def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIAlu>;
+
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> {
+  let Defs = [T8];
+}
+
+//
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIAlu> {
+  let Defs = [T8];
+}
+
+def SltiCCRxImmX16: FEXT_CCRXI16_ins<"slti">;
+
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> {
+  let Defs = [T8];
+}
+
+//
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+//
+def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIAlu> {
+  let Defs = [T8];
+}
+//
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+def SltiuCCRxImmX16: FEXT_CCRXI16_ins<"sltiu">;
+
+//
+// Format: SLT rx, ry MIPS16e
+// Purpose: Set on Less Than
+// To record the result of a less-than comparison.
+//
+def SltRxRy16: FRR16R_ins<0b00010, "slt", IIAlu>{
+  let Defs = [T8];
+}
+
+def SltCCRxRy16: FCCRR16_ins<"slt">;
+
+// Format: SLTU rx, ry MIPS16e
+// Purpose: Set on Less Than Unsigned
+// To record the result of an unsigned less-than comparison.
+//
+def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIAlu>{
+  let Defs = [T8];
+}
+
+def SltuRxRyRz16: FRRTR16_ins<"sltu"> {
+  let isCodeGenOnly=1;
+  let Defs = [T8];
+}
+
+
+def SltuCCRxRy16: FCCRR16_ins<"sltu">;
+//
+// Format: SRAV ry, rx MIPS16e
+// Purpose: Shift Word Right Arithmetic Variable
+// To execute an arithmetic right-shift of a word by a variable
+// number of bits.
+//
+def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>;
+
+
+//
+// Format: SRA rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Arithmetic (Extended)
+// To execute an arithmetic right-shift of a word by a fixed
+// number of bits—1 to 8 bits.
+//
+def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>;
+
+
+//
+// Format: SRLV ry, rx MIPS16e
+// Purpose: Shift Word Right Logical Variable
+// To execute a logical right-shift of a word by a variable
+// number of bits.
+//
+def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>;
+
+
+//
+// Format: SRL rx, ry, sa MIPS16e
+// Purpose: Shift Word Right Logical (Extended)
+// To execute a logical right-shift of a word by a fixed
+// number of bits—1 to 31 bits.
+//
+def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>;
+
+//
+// Format: SUBU rz, rx, ry MIPS16e
+// Purpose: Subtract Unsigned Word
+// To subtract 32-bit integers
+//
+def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIAlu>, ArithLogic16Defs<0>;
+
+//
+// Format: SW ry, offset(rx) MIPS16e
+// Purpose: Store Word (Extended)
+// To store a word to memory.
+//
+def SwRxRyOffMemX16:
+  FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, IIStore>, MayStore;
+
+//
+// Format: SW rx, offset(sp) MIPS16e
+// Purpose: Store Word rx (SP-Relative)
+// To store an SP-relative word to memory.
+//
+def SwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b11010, "sw", IIStore>, MayStore;
+
+//
+//
+// Format: XOR rx, ry MIPS16e
+// Purpose: Xor
+// To do a bitwise logical XOR.
+//
+def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIAlu>, ArithLogic16Defs<1>;
+
+class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> {
+  let Predicates = [InMips16Mode];
+}
+
+// Unary Arith/Logic
+//
+class ArithLogicU_pat<PatFrag OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$r),
+            (I CPU16Regs:$r)>;
+
+def: ArithLogicU_pat<not, NotRxRy16>;
+def: ArithLogicU_pat<ineg, NegRxRy16>;
+
+class ArithLogic16_pat<SDNode OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$l, CPU16Regs:$r),
+            (I CPU16Regs:$l, CPU16Regs:$r)>;
+
+def: ArithLogic16_pat<add, AdduRxRyRz16>;
+def: ArithLogic16_pat<and, AndRxRxRy16>;
+def: ArithLogic16_pat<mul, MultRxRyRz16>;
+def: ArithLogic16_pat<or, OrRxRxRy16>;
+def: ArithLogic16_pat<sub, SubuRxRyRz16>;
+def: ArithLogic16_pat<xor, XorRxRxRy16>;
+
+// Arithmetic and logical instructions with 2 register operands.
+
+class ArithLogicI16_pat<SDNode OpNode, PatFrag imm_type, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$in, imm_type:$imm),
+            (I CPU16Regs:$in, imm_type:$imm)>;
+
+def: ArithLogicI16_pat<add, immSExt8, AddiuRxRxImm16>;
+def: ArithLogicI16_pat<add, immSExt16, AddiuRxRxImmX16>;
+def: ArithLogicI16_pat<shl, immZExt5, SllX16>;
+def: ArithLogicI16_pat<srl, immZExt5, SrlX16>;
+def: ArithLogicI16_pat<sra, immZExt5, SraX16>;
+
+class shift_rotate_reg16_pat<SDNode OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$r, CPU16Regs:$ra),
+            (I CPU16Regs:$r, CPU16Regs:$ra)>;
+
+def: shift_rotate_reg16_pat<shl, SllvRxRy16>;
+def: shift_rotate_reg16_pat<sra, SravRxRy16>;
+def: shift_rotate_reg16_pat<srl, SrlvRxRy16>;
+
+class LoadM16_pat<PatFrag OpNode, Instruction I> :
+  Mips16Pat<(OpNode addr16:$addr), (I addr16:$addr)>;
+
+def: LoadM16_pat<sextloadi8, LbRxRyOffMemX16>;
+def: LoadM16_pat<zextloadi8, LbuRxRyOffMemX16>;
+def: LoadM16_pat<sextloadi16, LhRxRyOffMemX16>;
+def: LoadM16_pat<zextloadi16, LhuRxRyOffMemX16>;
+def: LoadM16_pat<load, LwRxRyOffMemX16>;
+
+class StoreM16_pat<PatFrag OpNode, Instruction I> :
+  Mips16Pat<(OpNode CPU16Regs:$r, addr16:$addr),
+            (I CPU16Regs:$r, addr16:$addr)>;
+
+def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16>;
+def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16>;
+def: StoreM16_pat<store, SwRxRyOffMemX16>;
+
+// Unconditional branch
+class UncondBranch16_pat<SDNode OpNode, Instruction I>:
+  Mips16Pat<(OpNode bb:$imm16), (I bb:$imm16)> {
+    let Predicates = [InMips16Mode];
+  }
+
+def : Mips16Pat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+                (Jal16 tglobaladdr:$dst)>;
+
+def : Mips16Pat<(MipsJmpLink (i32 texternalsym:$dst)),
+                (Jal16 texternalsym:$dst)>;
+
+// Indirect branch
+def: Mips16Pat<
+  (brind CPU16Regs:$rs),
+  (JrcRx16 CPU16Regs:$rs)>;
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=0 in
+def JumpLinkReg16:
+  FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs),
+              "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch>;
+
+// Mips16 pseudos
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1,
+  hasExtraSrcRegAllocReq = 1 in
+def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>;
+
+
+// setcc patterns
+
+class SetCC_R16<PatFrag cond_op, Instruction I>:
+  Mips16Pat<(cond_op CPU16Regs:$rx, CPU16Regs:$ry),
+            (I CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+class SetCC_I16<PatFrag cond_op, PatLeaf imm_type, Instruction I>:
+  Mips16Pat<(cond_op CPU16Regs:$rx, imm_type:$imm16),
+            (I CPU16Regs:$rx, imm_type:$imm16)>;
+
+
+def: Mips16Pat<(i32  addr16:$addr),
+               (AddiuRxRyOffMemX16  addr16:$addr)>;
+
+
+// Large (>16 bit) immediate loads
+def : Mips16Pat<(i32 imm:$imm),
+                (OrRxRxRy16 (SllX16 (LiRxImmX16 (HI16 imm:$imm)), 16),
+                (LiRxImmX16 (LO16 imm:$imm)))>;
+
+// Carry MipsPatterns
+def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
+                (SubuRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc CPU16Regs:$lhs, CPU16Regs:$rhs),
+                (AdduRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc  CPU16Regs:$src, immSExt16:$imm),
+                (AddiuRxRxImmX16 CPU16Regs:$src, imm:$imm)>;
+
+//
+// Some branch conditional patterns are not generated by llvm at this time.
+// Some are for seemingly arbitrary reasons not used: i.e. with signed number
+// comparison they are used and for unsigned a different pattern is used.
+// I am pushing upstream from the full mips16 port and it seemed that I needed
+// these earlier and the mips32 port has these but now I cannot create test
+// cases that use these patterns. While I sort this all out I will leave these
+// extra patterns commented out and if I can be sure they are really not used,
+// I will delete the code. I don't want to check the code in uncommented without
+// a valid test case. In some cases, the compiler is generating patterns with
+// setcc instead and earlier I had implemented setcc first so may have masked
+// the problem. The setcc variants are suboptimal for mips16 so I may wantto
+// figure out how to enable the brcond patterns or else possibly new
+// combinations of of brcond and setcc.
+//
+//
+// bcond-seteq
+//
+def: Mips16Pat
+  <(brcond (i32 (seteq CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BteqzT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+
+def: Mips16Pat
+  <(brcond (i32 (seteq CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+   (BteqzT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$targ16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16),
+   (BeqzRxImmX16 CPU16Regs:$rx, bb:$targ16)
+  >;
+
+//
+// bcond-setgt (do we need to have this pair of setlt, setgt??)
+//
+def: Mips16Pat
+  <(brcond (i32 (setgt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BtnezT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx,  bb:$imm16)
+  >;
+
+//
+// bcond-setge
+//
+def: Mips16Pat
+  <(brcond (i32 (setge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BteqzT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+//
+// never called because compiler transforms a >= k to a > (k-1)
+def: Mips16Pat
+  <(brcond (i32 (setge CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+   (BteqzT8SltiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$imm16)
+  >;
+
+//
+// bcond-setlt
+//
+def: Mips16Pat
+  <(brcond (i32 (setlt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BtnezT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (setlt CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+   (BtnezT8SltiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$imm16)
+  >;
+
+//
+// bcond-setle
+//
+def: Mips16Pat
+  <(brcond (i32 (setle CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BteqzT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx,  bb:$imm16)
+  >;
+
+//
+// bcond-setne
+//
+def: Mips16Pat
+  <(brcond (i32 (setne CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+   (BtnezT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (setne CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+   (BtnezT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm,  bb:$targ16)
+  >;
+
+def: Mips16Pat
+  <(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16),
+   (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+  >;
+
+//
+// This needs to be there but I forget which code will generate it
+//
+def: Mips16Pat
+  <(brcond CPU16Regs:$rx, bb:$targ16),
+   (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+  >;
+
+//
+
+//
+// bcond-setugt
+//
+//def: Mips16Pat
+//  <(brcond (i32 (setugt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+//   (BtnezT8SltuX16 CPU16Regs:$ry, CPU16Regs:$rx,  bb:$imm16)
+//  >;
+
+//
+// bcond-setuge
+//
+//def: Mips16Pat
+//  <(brcond (i32 (setuge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+//   (BteqzT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+//  >;
+
+
+//
+// bcond-setult
+//
+//def: Mips16Pat
+//  <(brcond (i32 (setult CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+//   (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
+//  >;
+
+def: UncondBranch16_pat<br, BimmX16>;
+
+// Small immediates
+def: Mips16Pat<(i32 immSExt16:$in),
+               (AddiuRxRxImmX16 (Move32R16 ZERO), immSExt16:$in)>;
+
+def: Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>;
+
+//
+// MipsDivRem
+//
+def: Mips16Pat
+  <(MipsDivRem16 CPU16Regs:$rx, CPU16Regs:$ry),
+   (DivRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+//
+// MipsDivRemU
+//
+def: Mips16Pat
+  <(MipsDivRemU16 CPU16Regs:$rx, CPU16Regs:$ry),
+   (DivuRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+//  signed a,b
+//  x = (a>=b)?x:y
+//
+//  if !(a < b) x = y
+//
+def : Mips16Pat<(select (i32 (setge CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a, CPU16Regs:$b)>;
+
+//  signed a,b
+//  x = (a>b)?x:y
+//
+//  if  (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setgt CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZSlt CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+// unsigned a,b
+// x = (a>=b)?x:y
+//
+// if !(a < b) x = y;
+//
+def : Mips16Pat<
+  (select (i32 (setuge CPU16Regs:$a, CPU16Regs:$b)),
+   CPU16Regs:$x, CPU16Regs:$y),
+  (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+   CPU16Regs:$a, CPU16Regs:$b)>;
+
+//  unsigned a,b
+//  x = (a>b)?x:y
+//
+//  if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setugt CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZSltu CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+// signed
+// x = (a >= k)?x:y
+// due to an llvm optimization, i don't think that this will ever
+// be used. This is transformed into x = (a > k-1)?x:y
+//
+//
+
+//def : Mips16Pat<
+//  (select (i32 (setge CPU16Regs:$lhs, immSExt16:$rhs)),
+//   CPU16Regs:$T, CPU16Regs:$F),
+//  (SelTBteqZSlti CPU16Regs:$T, CPU16Regs:$F,
+//   CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+//def : Mips16Pat<
+//  (select (i32 (setuge CPU16Regs:$lhs, immSExt16:$rhs)),
+//   CPU16Regs:$T, CPU16Regs:$F),
+//  (SelTBteqZSltiu CPU16Regs:$T, CPU16Regs:$F,
+//   CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+// signed
+// x = (a < k)?x:y
+//
+// if !(a < k) x = y;
+//
+def : Mips16Pat<
+  (select (i32 (setlt CPU16Regs:$a, immSExt16:$b)),
+   CPU16Regs:$x, CPU16Regs:$y),
+  (SelTBtneZSlti CPU16Regs:$x, CPU16Regs:$y,
+   CPU16Regs:$a, immSExt16:$b)>;
+
+
+//
+//
+// signed
+// x = (a <= b)? x : y
+//
+// if  (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setle CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// unnsigned
+// x = (a <= b)? x : y
+//
+// if  (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setule CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == b)? x : y
+//
+// if (a != b) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZCmp CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == 0)? x : y
+//
+// if (a != 0) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, 0)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelBeqZ CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a == k)? x : y
+//
+// if (a != k) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, immZExt16:$k)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBteqZCmpi CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a, immZExt16:$k)>;
+
+
+//
+// signed/unsigned
+// x = (a != b)? x : y
+//
+// if (a == b) x = y
+//
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, CPU16Regs:$b)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZCmp CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a != 0)? x : y
+//
+// if (a == 0) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, 0)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a)>;
+
+// signed/unsigned
+// x = (a)? x : y
+//
+// if (!a) x = y
+//
+def : Mips16Pat<(select  CPU16Regs:$a,
+                 CPU16Regs:$x, CPU16Regs:$y),
+      (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+       CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a != k)? x : y
+//
+// if (a == k) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, immZExt16:$k)),
+                 CPU16Regs:$x, CPU16Regs:$y),
+                (SelTBtneZCmpi CPU16Regs:$x, CPU16Regs:$y,
+                 CPU16Regs:$a, immZExt16:$k)>;
+
+//
+// When writing C code to test setxx these patterns,
+// some will be transformed into
+// other things. So we test using C code but using -O3 and -O0
+//
+// seteq
+//
+def : Mips16Pat
+  <(seteq CPU16Regs:$lhs,CPU16Regs:$rhs),
+   (SltiuCCRxImmX16 (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs), 1)>;
+
+def : Mips16Pat
+  <(seteq CPU16Regs:$lhs, 0),
+   (SltiuCCRxImmX16 CPU16Regs:$lhs, 1)>;
+
+
+//
+// setge
+//
+
+def: Mips16Pat
+  <(setge CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (LiRxImmX16 1))>;
+
+//
+// For constants, llvm transforms this to:
+// x > (k -1) and then reverses the operands to use setlt. So this pattern
+// is not used now by the compiler. (Presumably checking that k-1 does not
+// overflow). The compiler never uses this at a the current time, due to
+// other optimizations.
+//
+//def: Mips16Pat
+//  <(setge CPU16Regs:$lhs, immSExt16:$rhs),
+//   (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, immSExt16:$rhs),
+//   (LiRxImmX16 1))>;
+
+// This catches the x >= -32768 case by transforming it to  x > -32769
+//
+def: Mips16Pat
+  <(setgt CPU16Regs:$lhs, -32769),
+   (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, -32768),
+   (LiRxImmX16 1))>;
+
+//
+// setgt
+//
+//
+
+def: Mips16Pat
+  <(setgt CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setle
+//
+def: Mips16Pat
+  <(setle CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImm16 1))>;
+
+//
+// setlt
+//
+def: SetCC_R16<setlt, SltCCRxRy16>;
+
+def: SetCC_I16<setlt, immSExt16, SltiCCRxImmX16>;
+
+//
+// setne
+//
+def : Mips16Pat
+  <(setne CPU16Regs:$lhs,CPU16Regs:$rhs),
+   (SltuCCRxRy16 (LiRxImmX16 0),
+   (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs))>;
+
+
+//
+// setuge
+//
+def: Mips16Pat
+  <(setuge CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (LiRxImmX16 1))>;
+
+// this pattern will never be used because the compiler will transform
+// x >= k to x > (k - 1) and then use SLT
+//
+//def: Mips16Pat
+//  <(setuge CPU16Regs:$lhs, immZExt16:$rhs),
+//   (XorRxRxRy16 (SltiuCCRxImmX16 CPU16Regs:$lhs, immZExt16:$rhs),
+//   (LiRxImmX16 1))>;
+
+//
+// setugt
+//
+def: Mips16Pat
+  <(setugt CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setule
+//
+def: Mips16Pat
+  <(setule CPU16Regs:$lhs, CPU16Regs:$rhs),
+   (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImmX16 1))>;
+
+//
+// setult
+//
+def: SetCC_R16<setult, SltuCCRxRy16>;
+
+def: SetCC_I16<setult, immSExt16, SltiuCCRxImmX16>;
+
+def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)),
+               (AddiuRxRxImmX16 CPU16Regs:$hi, tglobaladdr:$lo)>;
+
+// hi/lo relocs
+
+def : Mips16Pat<(MipsHi tglobaladdr:$in),
+                (SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>;
+def : Mips16Pat<(MipsHi tjumptable:$in),
+                (SllX16 (LiRxImmX16 tjumptable:$in), 16)>;
+def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
+                (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
+
+// wrapper_pic
+class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+  Mips16Pat<(MipsWrapper RC:$gp, node:$in),
+            (ADDiuOp RC:$gp, node:$in)>;
+
+
+def : Wrapper16Pat<tglobaladdr, AddiuRxRxImmX16, CPU16Regs>;
+def : Wrapper16Pat<tglobaltlsaddr, AddiuRxRxImmX16, CPU16Regs>;
+
+def : Mips16Pat<(i32 (extloadi8   addr16:$src)),
+                (LbuRxRyOffMemX16  addr16:$src)>;
+def : Mips16Pat<(i32 (extloadi16  addr16:$src)),
+                (LhuRxRyOffMemX16  addr16:$src)>;
diff --git a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
new file mode 100644
index 000000000000..6cca2276856d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -0,0 +1,151 @@
+
+//===-- Mips16RegisterInfo.cpp - MIPS16 Register Information -== ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips16RegisterInfo.h"
+#include "Mips16InstrInfo.h"
+#include "Mips.h"
+#include "Mips16InstrInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST,
+    const Mips16InstrInfo &I)
+  : MipsRegisterInfo(ST), TII(I) {}
+
+bool Mips16RegisterInfo::requiresRegisterScavenging
+  (const MachineFunction &MF) const {
+  return true;
+}
+bool Mips16RegisterInfo::requiresFrameIndexScavenging
+  (const MachineFunction &MF) const {
+  return true;
+}
+
+bool Mips16RegisterInfo::useFPForScavengingIndex
+  (const MachineFunction &MF) const {
+  return false;
+}
+
+bool Mips16RegisterInfo::saveScavengerRegister
+  (MachineBasicBlock &MBB,
+   MachineBasicBlock::iterator I,
+   MachineBasicBlock::iterator &UseMI,
+   const TargetRegisterClass *RC,
+   unsigned Reg) const {
+  DebugLoc DL;
+  TII.copyPhysReg(MBB, I, DL, Mips::T0, Reg, true);
+  TII.copyPhysReg(MBB, UseMI, DL, Reg, Mips::T0, true);
+  return true;
+}
+
+const TargetRegisterClass *
+Mips16RegisterInfo::intRegClass(unsigned Size) const {
+  assert(Size == 4);
+  return &Mips::CPU16RegsRegClass;
+}
+
+void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+                                     unsigned OpNo, int FrameIndex,
+                                     uint64_t StackSize,
+                                     int64_t SPOffset) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
+
+  // The following stack frame objects are always
+  // referenced relative to $sp:
+  //  1. Outgoing arguments.
+  //  2. Pointer to dynamically allocated stack space.
+  //  3. Locations for callee-saved registers.
+  // Everything else is referenced relative to whatever register
+  // getFrameRegister() returns.
+  unsigned FrameReg;
+
+  if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)
+    FrameReg = Mips::SP;
+  else {
+    const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+    if (TFI->hasFP(MF)) {
+      FrameReg = Mips::S0;
+    }
+    else {
+      if ((MI.getNumOperands()> OpNo+2) && MI.getOperand(OpNo+2).isReg())
+        FrameReg = MI.getOperand(OpNo+2).getReg();
+      else
+        FrameReg = Mips::SP;
+    }
+  }
+  // Calculate final offset.
+  // - There is no need to change the offset if the frame object
+  //   is one of the
+  //   following: an outgoing argument, pointer to a dynamically allocated
+  //   stack space or a $gp restore location,
+  // - If the frame object is any of the following,
+  //   its offset must be adjusted
+  //   by adding the size of the stack:
+  //   incoming argument, callee-saved register location or local variable.
+  int64_t Offset;
+  bool IsKill = false;
+  Offset = SPOffset + (int64_t)StackSize;
+  Offset += MI.getOperand(OpNo + 1).getImm();
+
+
+  DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
+
+  if (!MI.isDebugValue() && ( ((FrameReg != Mips::SP) && !isInt<16>(Offset)) ||
+      ((FrameReg == Mips::SP) && !isInt<15>(Offset)) )) {
+    MachineBasicBlock &MBB = *MI.getParent();
+    DebugLoc DL = II->getDebugLoc();
+    unsigned NewImm;
+    FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm);
+    Offset = SignExtend64<16>(NewImm);
+    IsKill = true;
+  }
+  MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
+  MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+
+
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h
new file mode 100644
index 000000000000..2b3d2b1a4ecb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips16RegisterInfo.h
@@ -0,0 +1,50 @@
+//===-- Mips16RegisterInfo.h - Mips16 Register Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS16REGISTERINFO_H
+#define MIPS16REGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+class Mips16InstrInfo;
+
+class Mips16RegisterInfo : public MipsRegisterInfo {
+  const Mips16InstrInfo &TII;
+public:
+  Mips16RegisterInfo(const MipsSubtarget &Subtarget,
+                     const Mips16InstrInfo &TII);
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
+
+  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+
+  bool saveScavengerRegister(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     MachineBasicBlock::iterator &UseMI,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Reg) const;
+
+  virtual const TargetRegisterClass *intRegClass(unsigned Size) const;
+
+private:
+  virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                           int FrameIndex, uint64_t StackSize,
+                           int64_t SPOffset) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
new file mode 100644
index 000000000000..846a8224af35
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
@@ -0,0 +1,392 @@
+//===- Mips64InstrInfo.td - Mips64 Instruction Information -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// Instruction operand types
+def shamt_64       : Operand<i64>;
+
+// Unsigned Operand
+def uimm16_64      : Operand<i64> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+// Transformation Function - get Imm - 32.
+def Subtract32 : SDNodeXForm<imm, [{
+  return getImm(N, (unsigned)N->getZExtValue() - 32);
+}]>;
+
+// shamt must fit in 6 bits.
+def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+let DecoderNamespace = "Mips64" in {
+
+multiclass Atomic2Ops64<PatFrag Op> {
+  def NAME : Atomic2Ops<Op, CPU64Regs, CPURegs>,
+             Requires<[NotN64, HasStdEnc]>;
+  def _P8  : Atomic2Ops<Op, CPU64Regs, CPU64Regs>,
+             Requires<[IsN64, HasStdEnc]> {
+    let isCodeGenOnly = 1;
+  }
+}
+
+multiclass AtomicCmpSwap64<PatFrag Op>  {
+  def NAME : AtomicCmpSwap<Op, CPU64Regs, CPURegs>,
+             Requires<[NotN64, HasStdEnc]>;
+  def _P8  : AtomicCmpSwap<Op, CPU64Regs, CPU64Regs>,
+             Requires<[IsN64, HasStdEnc]> {
+    let isCodeGenOnly = 1;
+  }
+}
+}
+let usesCustomInserter = 1, Predicates = [HasStdEnc],
+  DecoderNamespace = "Mips64" in {
+  defm ATOMIC_LOAD_ADD_I64  : Atomic2Ops64<atomic_load_add_64>;
+  defm ATOMIC_LOAD_SUB_I64  : Atomic2Ops64<atomic_load_sub_64>;
+  defm ATOMIC_LOAD_AND_I64  : Atomic2Ops64<atomic_load_and_64>;
+  defm ATOMIC_LOAD_OR_I64   : Atomic2Ops64<atomic_load_or_64>;
+  defm ATOMIC_LOAD_XOR_I64  : Atomic2Ops64<atomic_load_xor_64>;
+  defm ATOMIC_LOAD_NAND_I64 : Atomic2Ops64<atomic_load_nand_64>;
+  defm ATOMIC_SWAP_I64      : Atomic2Ops64<atomic_swap_64>;
+  defm ATOMIC_CMP_SWAP_I64  : AtomicCmpSwap64<atomic_cmp_swap_64>;
+}
+
+/// Pseudo instructions for loading, storing and copying accumulator registers.
+let isPseudo = 1 in {
+  defm LOAD_AC128  : LoadM<"load_ac128", ACRegs128>;
+  defm STORE_AC128 : StoreM<"store_ac128", ACRegs128>;
+}
+
+def COPY_AC128 : PseudoSE<(outs ACRegs128:$dst), (ins ACRegs128:$src), []>;
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+let DecoderNamespace = "Mips64" in {
+/// Arithmetic Instructions (ALU Immediate)
+def DADDi   : ArithLogicI<"daddi", simm16_64, CPU64RegsOpnd>, ADDI_FM<0x18>;
+def DADDiu  : ArithLogicI<"daddiu", simm16_64, CPU64RegsOpnd, immSExt16, add>,
+              ADDI_FM<0x19>, IsAsCheapAsAMove;
+def DANDi   : ArithLogicI<"andi", uimm16_64, CPU64RegsOpnd, immZExt16, and>,
+              ADDI_FM<0xc>;
+def SLTi64  : SetCC_I<"slti", setlt, simm16_64, immSExt16, CPU64Regs>,
+              SLTI_FM<0xa>;
+def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, CPU64Regs>,
+              SLTI_FM<0xb>;
+def ORi64   : ArithLogicI<"ori", uimm16_64, CPU64RegsOpnd, immZExt16, or>,
+              ADDI_FM<0xd>;
+def XORi64  : ArithLogicI<"xori", uimm16_64, CPU64RegsOpnd, immZExt16, xor>,
+              ADDI_FM<0xe>;
+def LUi64   : LoadUpper<"lui", CPU64Regs, uimm16_64>, LUI_FM;
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+def DADD   : ArithLogicR<"dadd", CPU64RegsOpnd>, ADD_FM<0, 0x2c>;
+def DADDu  : ArithLogicR<"daddu", CPU64RegsOpnd, 1, IIAlu, add>,
+                              ADD_FM<0, 0x2d>;
+def DSUBu  : ArithLogicR<"dsubu", CPU64RegsOpnd, 0, IIAlu, sub>,
+                              ADD_FM<0, 0x2f>;
+def SLT64  : SetCC_R<"slt", setlt, CPU64Regs>, ADD_FM<0, 0x2a>;
+def SLTu64 : SetCC_R<"sltu", setult, CPU64Regs>, ADD_FM<0, 0x2b>;
+def AND64  : ArithLogicR<"and", CPU64RegsOpnd, 1, IIAlu, and>, ADD_FM<0, 0x24>;
+def OR64   : ArithLogicR<"or", CPU64RegsOpnd, 1, IIAlu, or>, ADD_FM<0, 0x25>;
+def XOR64  : ArithLogicR<"xor", CPU64RegsOpnd, 1, IIAlu, xor>, ADD_FM<0, 0x26>;
+def NOR64  : LogicNOR<"nor", CPU64RegsOpnd>, ADD_FM<0, 0x27>;
+
+/// Shift Instructions
+def DSLL   : shift_rotate_imm<"dsll", shamt, CPU64RegsOpnd, shl, immZExt6>,
+             SRA_FM<0x38, 0>;
+def DSRL   : shift_rotate_imm<"dsrl", shamt, CPU64RegsOpnd, srl, immZExt6>,
+             SRA_FM<0x3a, 0>;
+def DSRA   : shift_rotate_imm<"dsra", shamt, CPU64RegsOpnd, sra, immZExt6>,
+             SRA_FM<0x3b, 0>;
+def DSLLV  : shift_rotate_reg<"dsllv", CPU64RegsOpnd, shl>, SRLV_FM<0x14, 0>;
+def DSRLV  : shift_rotate_reg<"dsrlv", CPU64RegsOpnd, srl>, SRLV_FM<0x16, 0>;
+def DSRAV  : shift_rotate_reg<"dsrav", CPU64RegsOpnd, sra>, SRLV_FM<0x17, 0>;
+def DSLL32 : shift_rotate_imm<"dsll32", shamt, CPU64RegsOpnd>, SRA_FM<0x3c, 0>;
+def DSRL32 : shift_rotate_imm<"dsrl32", shamt, CPU64RegsOpnd>, SRA_FM<0x3e, 0>;
+def DSRA32 : shift_rotate_imm<"dsra32", shamt, CPU64RegsOpnd>, SRA_FM<0x3f, 0>;
+}
+// Rotate Instructions
+let Predicates = [HasMips64r2, HasStdEnc],
+    DecoderNamespace = "Mips64" in {
+  def DROTR  : shift_rotate_imm<"drotr", shamt, CPU64RegsOpnd, rotr, immZExt6>,
+               SRA_FM<0x3a, 1>;
+  def DROTRV : shift_rotate_reg<"drotrv", CPU64RegsOpnd, rotr>,
+               SRLV_FM<0x16, 1>;
+}
+
+let DecoderNamespace = "Mips64" in {
+/// Load and Store Instructions
+///  aligned
+defm LB64  : LoadM<"lb", CPU64Regs, sextloadi8>, LW_FM<0x20>;
+defm LBu64 : LoadM<"lbu", CPU64Regs, zextloadi8>, LW_FM<0x24>;
+defm LH64  : LoadM<"lh", CPU64Regs, sextloadi16>, LW_FM<0x21>;
+defm LHu64 : LoadM<"lhu", CPU64Regs, zextloadi16>, LW_FM<0x25>;
+defm LW64  : LoadM<"lw", CPU64Regs, sextloadi32>, LW_FM<0x23>;
+defm LWu64 : LoadM<"lwu", CPU64Regs, zextloadi32>, LW_FM<0x27>;
+defm SB64  : StoreM<"sb", CPU64Regs, truncstorei8>, LW_FM<0x28>;
+defm SH64  : StoreM<"sh", CPU64Regs, truncstorei16>, LW_FM<0x29>;
+defm SW64  : StoreM<"sw", CPU64Regs, truncstorei32>, LW_FM<0x2b>;
+defm LD    : LoadM<"ld", CPU64Regs, load>, LW_FM<0x37>;
+defm SD    : StoreM<"sd", CPU64Regs, store>, LW_FM<0x3f>;
+
+/// load/store left/right
+defm LWL64 : LoadLeftRightM<"lwl", MipsLWL, CPU64Regs>, LW_FM<0x22>;
+defm LWR64 : LoadLeftRightM<"lwr", MipsLWR, CPU64Regs>, LW_FM<0x26>;
+defm SWL64 : StoreLeftRightM<"swl", MipsSWL, CPU64Regs>, LW_FM<0x2a>;
+defm SWR64 : StoreLeftRightM<"swr", MipsSWR, CPU64Regs>, LW_FM<0x2e>;
+
+defm LDL   : LoadLeftRightM<"ldl", MipsLDL, CPU64Regs>, LW_FM<0x1a>;
+defm LDR   : LoadLeftRightM<"ldr", MipsLDR, CPU64Regs>, LW_FM<0x1b>;
+defm SDL   : StoreLeftRightM<"sdl", MipsSDL, CPU64Regs>, LW_FM<0x2c>;
+defm SDR   : StoreLeftRightM<"sdr", MipsSDR, CPU64Regs>, LW_FM<0x2d>;
+
+/// Load-linked, Store-conditional
+let Predicates = [NotN64, HasStdEnc] in {
+  def LLD : LLBase<"lld", CPU64RegsOpnd, mem>, LW_FM<0x34>;
+  def SCD : SCBase<"scd", CPU64RegsOpnd, mem>, LW_FM<0x3c>;
+}
+
+let Predicates = [IsN64, HasStdEnc], isCodeGenOnly = 1 in {
+  def LLD_P8 : LLBase<"lld", CPU64RegsOpnd, mem64>, LW_FM<0x34>;
+  def SCD_P8 : SCBase<"scd", CPU64RegsOpnd, mem64>, LW_FM<0x3c>;
+}
+
+/// Jump and Branch Instructions
+def JR64   : IndirectBranch<CPU64Regs>, MTLO_FM<8>;
+def BEQ64  : CBranch<"beq", seteq, CPU64Regs>, BEQ_FM<4>;
+def BNE64  : CBranch<"bne", setne, CPU64Regs>, BEQ_FM<5>;
+def BGEZ64 : CBranchZero<"bgez", setge, CPU64Regs>, BGEZ_FM<1, 1>;
+def BGTZ64 : CBranchZero<"bgtz", setgt, CPU64Regs>, BGEZ_FM<7, 0>;
+def BLEZ64 : CBranchZero<"blez", setle, CPU64Regs>, BGEZ_FM<6, 0>;
+def BLTZ64 : CBranchZero<"bltz", setlt, CPU64Regs>, BGEZ_FM<1, 0>;
+}
+let DecoderNamespace = "Mips64" in
+def JALR64 : JumpLinkReg<"jalr", CPU64Regs>, JALR_FM;
+def JALR64Pseudo : JumpLinkRegPseudo<CPU64Regs, JALR64, RA_64>;
+def TAILCALL64_R : JumpFR<CPU64Regs, MipsTailCall>, MTLO_FM<8>, IsTailCall;
+
+let DecoderNamespace = "Mips64" in {
+/// Multiply and Divide Instructions.
+def DMULT  : Mult<"dmult", IIImul, CPU64RegsOpnd, [HI64, LO64]>,
+             MULT_FM<0, 0x1c>;
+def DMULTu : Mult<"dmultu", IIImul, CPU64RegsOpnd, [HI64, LO64]>,
+             MULT_FM<0, 0x1d>;
+def PseudoDMULT  : MultDivPseudo<DMULT, ACRegs128, CPU64RegsOpnd, MipsMult,
+                                 IIImul>;
+def PseudoDMULTu : MultDivPseudo<DMULTu, ACRegs128, CPU64RegsOpnd, MipsMultu,
+                                 IIImul>;
+def DSDIV : Div<"ddiv", IIIdiv, CPU64RegsOpnd, [HI64, LO64]>, MULT_FM<0, 0x1e>;
+def DUDIV : Div<"ddivu", IIIdiv, CPU64RegsOpnd, [HI64, LO64]>, MULT_FM<0, 0x1f>;
+def PseudoDSDIV : MultDivPseudo<DSDIV, ACRegs128, CPU64RegsOpnd, MipsDivRem,
+                                IIIdiv, 0>;
+def PseudoDUDIV : MultDivPseudo<DUDIV, ACRegs128, CPU64RegsOpnd, MipsDivRemU,
+                                IIIdiv, 0>;
+
+def MTHI64 : MoveToLOHI<"mthi", CPU64Regs, [HI64]>, MTLO_FM<0x11>;
+def MTLO64 : MoveToLOHI<"mtlo", CPU64Regs, [LO64]>, MTLO_FM<0x13>;
+def MFHI64 : MoveFromLOHI<"mfhi", CPU64Regs, [HI64]>, MFLO_FM<0x10>;
+def MFLO64 : MoveFromLOHI<"mflo", CPU64Regs, [LO64]>, MFLO_FM<0x12>;
+
+/// Sign Ext In Register Instructions.
+def SEB64 : SignExtInReg<"seb", i8, CPU64Regs>, SEB_FM<0x10, 0x20>;
+def SEH64 : SignExtInReg<"seh", i16, CPU64Regs>, SEB_FM<0x18, 0x20>;
+
+/// Count Leading
+def DCLZ : CountLeading0<"dclz", CPU64RegsOpnd>, CLO_FM<0x24>;
+def DCLO : CountLeading1<"dclo", CPU64RegsOpnd>, CLO_FM<0x25>;
+
+/// Double Word Swap Bytes/HalfWords
+def DSBH : SubwordSwap<"dsbh", CPU64RegsOpnd>, SEB_FM<2, 0x24>;
+def DSHD : SubwordSwap<"dshd", CPU64RegsOpnd>, SEB_FM<5, 0x24>;
+
+def LEA_ADDiu64 : EffectiveAddress<"daddiu", CPU64Regs, mem_ea_64>, LW_FM<0x19>;
+
+}
+let DecoderNamespace = "Mips64" in {
+def RDHWR64 : ReadHardware<CPU64Regs, HW64RegsOpnd>, RDHWR_FM;
+
+def DEXT : ExtBase<"dext", CPU64RegsOpnd>, EXT_FM<3>;
+let Pattern = []<dag> in {
+  def DEXTU : ExtBase<"dextu", CPU64RegsOpnd>, EXT_FM<2>;
+  def DEXTM : ExtBase<"dextm", CPU64RegsOpnd>, EXT_FM<1>;
+}
+def DINS : InsBase<"dins", CPU64RegsOpnd>, EXT_FM<7>;
+let Pattern = []<dag> in {
+  def DINSU : InsBase<"dinsu", CPU64RegsOpnd>, EXT_FM<6>;
+  def DINSM : InsBase<"dinsm", CPU64RegsOpnd>, EXT_FM<5>;
+}
+
+let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
+  def DSLL64_32 : FR<0x00, 0x3c, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
+                     "dsll\t$rd, $rt, 32", [], IIAlu>;
+  def SLL64_32 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
+                    "sll\t$rd, $rt, 0", [], IIAlu>;
+  def SLL64_64 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPU64Regs:$rt),
+                    "sll\t$rd, $rt, 0", [], IIAlu>;
+}
+}
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// extended loads
+let Predicates = [NotN64, HasStdEnc] in {
+  def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64 addr:$src)>;
+  def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64 addr:$src)>;
+  def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
+  def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
+}
+let Predicates = [IsN64, HasStdEnc] in {
+  def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64_P8 addr:$src)>;
+  def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64_P8 addr:$src)>;
+  def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64_P8 addr:$src)>;
+  def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64_P8 addr:$src)>;
+}
+
+// hi/lo relocs
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+
+def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
+def : MipsPat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>;
+def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
+def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+              (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsLo texternalsym:$in), (DADDiu ZERO_64, texternalsym:$in)>;
+
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)),
+              (DADDiu CPU64Regs:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tblockaddress:$lo)),
+              (DADDiu CPU64Regs:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)),
+              (DADDiu CPU64Regs:$hi, tjumptable:$lo)>;
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)),
+              (DADDiu CPU64Regs:$hi, tconstpool:$lo)>;
+def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaltlsaddr:$lo)),
+              (DADDiu CPU64Regs:$hi, tglobaltlsaddr:$lo)>;
+
+def : WrapperPat<tglobaladdr, DADDiu, CPU64Regs>;
+def : WrapperPat<tconstpool, DADDiu, CPU64Regs>;
+def : WrapperPat<texternalsym, DADDiu, CPU64Regs>;
+def : WrapperPat<tblockaddress, DADDiu, CPU64Regs>;
+def : WrapperPat<tjumptable, DADDiu, CPU64Regs>;
+def : WrapperPat<tglobaltlsaddr, DADDiu, CPU64Regs>;
+
+defm : BrcondPats<CPU64Regs, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
+                  ZERO_64>;
+
+// setcc patterns
+defm : SeteqPats<CPU64Regs, SLTiu64, XOR64, SLTu64, ZERO_64>;
+defm : SetlePats<CPU64Regs, SLT64, SLTu64>;
+defm : SetgtPats<CPU64Regs, SLT64, SLTu64>;
+defm : SetgePats<CPU64Regs, SLT64, SLTu64>;
+defm : SetgeImmPats<CPU64Regs, SLTi64, SLTiu64>;
+
+// truncate
+def : MipsPat<(i32 (trunc CPU64Regs:$src)),
+              (SLL (EXTRACT_SUBREG CPU64Regs:$src, sub_32), 0)>,
+      Requires<[IsN64, HasStdEnc]>;
+
+// 32-to-64-bit extension
+def : MipsPat<(i64 (anyext CPURegs:$src)), (SLL64_32 CPURegs:$src)>;
+def : MipsPat<(i64 (zext CPURegs:$src)), (DSRL (DSLL64_32 CPURegs:$src), 32)>;
+def : MipsPat<(i64 (sext CPURegs:$src)), (SLL64_32 CPURegs:$src)>;
+
+// Sign extend in register
+def : MipsPat<(i64 (sext_inreg CPU64Regs:$src, i32)),
+              (SLL64_64 CPU64Regs:$src)>;
+
+// bswap MipsPattern
+def : MipsPat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>;
+
+// mflo/hi patterns.
+def : MipsPat<(i64 (ExtractLOHI ACRegs128:$ac, imm:$lohi_idx)),
+              (EXTRACT_SUBREG ACRegs128:$ac, imm:$lohi_idx)>;
+
+//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : InstAlias<"move $dst, $src",
+                (DADDu CPU64RegsOpnd:$dst,  CPU64RegsOpnd:$src, ZERO_64), 1>,
+      Requires<[HasMips64]>;
+def : InstAlias<"move $dst, $src",
+                (OR64 CPU64RegsOpnd:$dst, CPU64RegsOpnd:$src, ZERO_64), 1>,
+      Requires<[HasMips64]>;
+def : InstAlias<"and $rs, $rt, $imm",
+                (DANDi CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm),
+                1>,
+      Requires<[HasMips64]>;
+def : InstAlias<"slt $rs, $rt, $imm",
+                (SLTi64 CPURegsOpnd:$rs, CPU64Regs:$rt, simm16_64:$imm), 1>,
+      Requires<[HasMips64]>;
+def : InstAlias<"xor $rs, $rt, $imm",
+                (XORi64 CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm),
+                1>,
+      Requires<[HasMips64]>;
+def : InstAlias<"not $rt, $rs",
+                (NOR64 CPU64RegsOpnd:$rt, CPU64RegsOpnd:$rs, ZERO_64), 1>,
+      Requires<[HasMips64]>;
+def : InstAlias<"j $rs", (JR64 CPU64Regs:$rs), 0>, Requires<[HasMips64]>;
+def : InstAlias<"jalr $rs", (JALR64 RA_64, CPU64Regs:$rs)>,
+      Requires<[HasMips64]>;
+def : InstAlias<"jal $rs", (JALR64 RA_64, CPU64Regs:$rs), 0>,
+                 Requires<[HasMips64]>;
+def : InstAlias<"jal $rd,$rs", (JALR64 CPU64Regs:$rd, CPU64Regs:$rs), 0>,
+                 Requires<[HasMips64]>;
+def : InstAlias<"daddu $rs, $rt, $imm",
+                (DADDiu CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, simm16_64:$imm),
+                1>;
+def : InstAlias<"dadd $rs, $rt, $imm",
+                (DADDi CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, simm16_64:$imm),
+                1>;
+def : InstAlias<"or $rs, $rt, $imm",
+                (ORi64 CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm),
+                1>, Requires<[HasMips64]>;
+/// Move between CPU and coprocessor registers
+
+let DecoderNamespace = "Mips64" in {
+def DMFC0_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rt),
+                         (ins CPU64RegsOpnd:$rd, uimm16:$sel),
+                         "dmfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 1>;
+def DMTC0_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rd, uimm16:$sel),
+                         (ins CPU64RegsOpnd:$rt),
+                         "dmtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 5>;
+def DMFC2_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rt),
+                         (ins CPU64RegsOpnd:$rd, uimm16:$sel),
+                         "dmfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 1>;
+def DMTC2_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rd, uimm16:$sel),
+                         (ins CPU64RegsOpnd:$rt),
+                         "dmtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 5>;
+}
+
+// Two operand (implicit 0 selector) versions:
+def : InstAlias<"dmfc0 $rt, $rd",
+                (DMFC0_3OP64 CPU64RegsOpnd:$rt, CPU64RegsOpnd:$rd, 0), 0>;
+def : InstAlias<"dmtc0 $rt, $rd",
+                (DMTC0_3OP64 CPU64RegsOpnd:$rd, 0, CPU64RegsOpnd:$rt), 0>;
+def : InstAlias<"dmfc2 $rt, $rd",
+                (DMFC2_3OP64 CPU64RegsOpnd:$rt, CPU64RegsOpnd:$rd, 0), 0>;
+def : InstAlias<"dmtc2 $rt, $rd",
+                (DMTC2_3OP64 CPU64RegsOpnd:$rd, 0, CPU64RegsOpnd:$rt), 0>;
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
new file mode 100644
index 000000000000..99b163ec33ac
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -0,0 +1,153 @@
+//===-- MipsAnalyzeImmediate.cpp - Analyze Immediates ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsAnalyzeImmediate.h"
+#include "Mips.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+MipsAnalyzeImmediate::Inst::Inst(unsigned O, unsigned I) : Opc(O), ImmOpnd(I) {}
+
+// Add I to the instruction sequences.
+void MipsAnalyzeImmediate::AddInstr(InstSeqLs &SeqLs, const Inst &I) {
+  // Add an instruction seqeunce consisting of just I.
+  if (SeqLs.empty()) {
+    SeqLs.push_back(InstSeq(1, I));
+    return;
+  }
+
+  for (InstSeqLs::iterator Iter = SeqLs.begin(); Iter != SeqLs.end(); ++Iter)
+    Iter->push_back(I);
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize,
+                                             InstSeqLs &SeqLs) {
+  GetInstSeqLs((Imm + 0x8000ULL) & 0xffffffffffff0000ULL, RemSize, SeqLs);
+  AddInstr(SeqLs, Inst(ADDiu, Imm & 0xffffULL));
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLsORi(uint64_t Imm, unsigned RemSize,
+                                           InstSeqLs &SeqLs) {
+  GetInstSeqLs(Imm & 0xffffffffffff0000ULL, RemSize, SeqLs);
+  AddInstr(SeqLs, Inst(ORi, Imm & 0xffffULL));
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize,
+                                           InstSeqLs &SeqLs) {
+  unsigned Shamt = CountTrailingZeros_64(Imm);
+  GetInstSeqLs(Imm >> Shamt, RemSize - Shamt, SeqLs);
+  AddInstr(SeqLs, Inst(SLL, Shamt));
+}
+
+void MipsAnalyzeImmediate::GetInstSeqLs(uint64_t Imm, unsigned RemSize,
+                                        InstSeqLs &SeqLs) {
+  uint64_t MaskedImm = Imm & (0xffffffffffffffffULL >> (64 - Size));
+
+  // Do nothing if Imm is 0.
+  if (!MaskedImm)
+    return;
+
+  // A single ADDiu will do if RemSize <= 16.
+  if (RemSize <= 16) {
+    AddInstr(SeqLs, Inst(ADDiu, MaskedImm));
+    return;
+  }
+
+  // Shift if the lower 16-bit is cleared.
+  if (!(Imm & 0xffff)) {
+    GetInstSeqLsSLL(Imm, RemSize, SeqLs);
+    return;
+  }
+
+  GetInstSeqLsADDiu(Imm, RemSize, SeqLs);
+
+  // If bit 15 is cleared, it doesn't make a difference whether the last
+  // instruction is an ADDiu or ORi. In that case, do not call GetInstSeqLsORi.
+  if (Imm & 0x8000) {
+    InstSeqLs SeqLsORi;
+    GetInstSeqLsORi(Imm, RemSize, SeqLsORi);
+    SeqLs.insert(SeqLs.end(), SeqLsORi.begin(), SeqLsORi.end());
+  }
+}
+
+// Replace a ADDiu & SLL pair with a LUi.
+// e.g. the following two instructions
+//  ADDiu 0x0111
+//  SLL 18
+// are replaced with
+//  LUi 0x444
+void MipsAnalyzeImmediate::ReplaceADDiuSLLWithLUi(InstSeq &Seq) {
+  // Check if the first two instructions are ADDiu and SLL and the shift amount
+  // is at least 16.
+  if ((Seq.size() < 2) || (Seq[0].Opc != ADDiu) ||
+      (Seq[1].Opc != SLL) || (Seq[1].ImmOpnd < 16))
+    return;
+
+  // Sign-extend and shift operand of ADDiu and see if it still fits in 16-bit.
+  int64_t Imm = SignExtend64<16>(Seq[0].ImmOpnd);
+  int64_t ShiftedImm = (uint64_t)Imm << (Seq[1].ImmOpnd - 16);
+
+  if (!isInt<16>(ShiftedImm))
+    return;
+
+  // Replace the first instruction and erase the second.
+  Seq[0].Opc = LUi;
+  Seq[0].ImmOpnd = (unsigned)(ShiftedImm & 0xffff);
+  Seq.erase(Seq.begin() + 1);
+}
+
+void MipsAnalyzeImmediate::GetShortestSeq(InstSeqLs &SeqLs, InstSeq &Insts) {
+  InstSeqLs::iterator ShortestSeq = SeqLs.end();
+  // The length of an instruction sequence is at most 7.
+  unsigned ShortestLength = 8;
+
+  for (InstSeqLs::iterator S = SeqLs.begin(); S != SeqLs.end(); ++S) {
+    ReplaceADDiuSLLWithLUi(*S);
+    assert(S->size() <= 7);
+
+    if (S->size() < ShortestLength) {
+      ShortestSeq = S;
+      ShortestLength = S->size();
+    }
+  }
+
+  Insts.clear();
+  Insts.append(ShortestSeq->begin(), ShortestSeq->end());
+}
+
+const MipsAnalyzeImmediate::InstSeq
+&MipsAnalyzeImmediate::Analyze(uint64_t Imm, unsigned Size,
+                               bool LastInstrIsADDiu) {
+  this->Size = Size;
+
+  if (Size == 32) {
+    ADDiu = Mips::ADDiu;
+    ORi = Mips::ORi;
+    SLL = Mips::SLL;
+    LUi = Mips::LUi;
+  } else {
+    ADDiu = Mips::DADDiu;
+    ORi = Mips::ORi64;
+    SLL = Mips::DSLL;
+    LUi = Mips::LUi64;
+  }
+
+  InstSeqLs SeqLs;
+
+  // Get the list of instruction sequences.
+  if (LastInstrIsADDiu | !Imm)
+    GetInstSeqLsADDiu(Imm, Size, SeqLs);
+  else
+    GetInstSeqLs(Imm, Size, SeqLs);
+
+  // Set Insts to the shortest instruction sequence.
+  GetShortestSeq(SeqLs, Insts);
+
+  return Insts;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h
new file mode 100644
index 000000000000..a094ddae45de
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAnalyzeImmediate.h
@@ -0,0 +1,63 @@
+//===-- MipsAnalyzeImmediate.h - Analyze Immediates ------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MIPS_ANALYZE_IMMEDIATE_H
+#define MIPS_ANALYZE_IMMEDIATE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+  class MipsAnalyzeImmediate {
+  public:
+    struct Inst {
+      unsigned Opc, ImmOpnd;
+      Inst(unsigned Opc, unsigned ImmOpnd);
+    };
+    typedef SmallVector<Inst, 7 > InstSeq;
+
+    /// Analyze - Get an instrucion sequence to load immediate Imm. The last
+    /// instruction in the sequence must be an ADDiu if LastInstrIsADDiu is
+    /// true;
+    const InstSeq &Analyze(uint64_t Imm, unsigned Size, bool LastInstrIsADDiu);
+  private:
+    typedef SmallVector<InstSeq, 5> InstSeqLs;
+
+    /// AddInstr - Add I to all instruction sequences in SeqLs.
+    void AddInstr(InstSeqLs &SeqLs, const Inst &I);
+
+    /// GetInstSeqLsADDiu - Get instrucion sequences which end with an ADDiu to
+    /// load immediate Imm
+    void GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// GetInstSeqLsORi - Get instrucion sequences which end with an ORi to
+    /// load immediate Imm
+    void GetInstSeqLsORi(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// GetInstSeqLsSLL - Get instrucion sequences which end with a SLL to
+    /// load immediate Imm
+    void GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// GetInstSeqLs - Get instrucion sequences to load immediate Imm.
+    void GetInstSeqLs(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
+
+    /// ReplaceADDiuSLLWithLUi - Replace an ADDiu & SLL pair with a LUi.
+    void ReplaceADDiuSLLWithLUi(InstSeq &Seq);
+
+    /// GetShortestSeq - Find the shortest instruction sequence in SeqLs and
+    /// return it in Insts.
+    void GetShortestSeq(InstSeqLs &SeqLs, InstSeq &Insts);
+
+    unsigned Size;
+    unsigned ADDiu, ORi, SLL, LUi;
+    InstSeq Insts;
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
new file mode 100644
index 000000000000..1876cb6ffae4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -0,0 +1,597 @@
+//===-- MipsAsmPrinter.cpp - Mips LLVM Assembly Printer -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format MIPS assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-asm-printer"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsELFStreamer.h"
+#include "Mips.h"
+#include "MipsAsmPrinter.h"
+#include "MipsInstrInfo.h"
+#include "MipsMCInstLower.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  MipsFI = MF.getInfo<MipsFunctionInfo>();
+  AsmPrinter::runOnMachineFunction(MF);
+  return true;
+}
+
+bool MipsAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) {
+  MCOp = MCInstLowering.LowerOperand(MO);
+  return MCOp.isValid();
+}
+
+#include "MipsGenMCPseudoLowering.inc"
+
+void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  if (MI->isDebugValue()) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+
+    PrintDebugValueComment(MI, OS);
+    return;
+  }
+
+  MachineBasicBlock::const_instr_iterator I = MI;
+  MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
+
+  do {
+    // Do any auto-generated pseudo lowerings.
+    if (emitPseudoExpansionLowering(OutStreamer, &*I))
+      continue;
+
+    // The inMips16Mode() test is not permanent.
+    // Some instructions are marked as pseudo right now which
+    // would make the test fail for the wrong reason but
+    // that will be fixed soon. We need this here because we are
+    // removing another test for this situation downstream in the
+    // callchain.
+    //
+    if (I->isPseudo() && !Subtarget->inMips16Mode())
+      llvm_unreachable("Pseudo opcode found in EmitInstruction()");
+
+    MCInst TmpInst0;
+    MCInstLowering.Lower(I, TmpInst0);
+    OutStreamer.EmitInstruction(TmpInst0);
+  } while ((++I != E) && I->isInsideBundle()); // Delay slot check
+}
+
+//===----------------------------------------------------------------------===//
+//
+//  Mips Asm Directives
+//
+//  -- Frame directive "frame Stackpointer, Stacksize, RARegister"
+//  Describe the stack frame.
+//
+//  -- Mask directives "(f)mask  bitmask, offset"
+//  Tells the assembler which registers are saved and where.
+//  bitmask - contain a little endian bitset indicating which registers are
+//            saved on function prologue (e.g. with a 0x80000000 mask, the
+//            assembler knows the register 31 (RA) is saved at prologue.
+//  offset  - the position before stack pointer subtraction indicating where
+//            the first saved register on prologue is located. (e.g. with a
+//
+//  Consider the following function prologue:
+//
+//    .frame  $fp,48,$ra
+//    .mask   0xc0000000,-8
+//       addiu $sp, $sp, -48
+//       sw $ra, 40($sp)
+//       sw $fp, 36($sp)
+//
+//    With a 0xc0000000 mask, the assembler knows the register 31 (RA) and
+//    30 (FP) are saved at prologue. As the save order on prologue is from
+//    left to right, RA is saved first. A -8 offset means that after the
+//    stack pointer subtration, the first register in the mask (RA) will be
+//    saved at address 48-8=40.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mask directives
+//===----------------------------------------------------------------------===//
+
+// Create a bitmask with all callee saved registers for CPU or Floating Point
+// registers. For CPU registers consider RA, GP and FP for saving if necessary.
+void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
+  // CPU and FPU Saved Registers Bitmasks
+  unsigned CPUBitmask = 0, FPUBitmask = 0;
+  int CPUTopSavedRegOff, FPUTopSavedRegOff;
+
+  // Set the CPU and FPU Bitmasks
+  const MachineFrameInfo *MFI = MF->getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  // size of stack area to which FP callee-saved regs are saved.
+  unsigned CPURegSize = Mips::CPURegsRegClass.getSize();
+  unsigned FGR32RegSize = Mips::FGR32RegClass.getSize();
+  unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize();
+  bool HasAFGR64Reg = false;
+  unsigned CSFPRegsSize = 0;
+  unsigned i, e = CSI.size();
+
+  // Set FPU Bitmask.
+  for (i = 0; i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (Mips::CPURegsRegClass.contains(Reg))
+      break;
+
+    unsigned RegNum = TM.getRegisterInfo()->getEncodingValue(Reg);
+    if (Mips::AFGR64RegClass.contains(Reg)) {
+      FPUBitmask |= (3 << RegNum);
+      CSFPRegsSize += AFGR64RegSize;
+      HasAFGR64Reg = true;
+      continue;
+    }
+
+    FPUBitmask |= (1 << RegNum);
+    CSFPRegsSize += FGR32RegSize;
+  }
+
+  // Set CPU Bitmask.
+  for (; i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    unsigned RegNum = TM.getRegisterInfo()->getEncodingValue(Reg);
+    CPUBitmask |= (1 << RegNum);
+  }
+
+  // FP Regs are saved right below where the virtual frame pointer points to.
+  FPUTopSavedRegOff = FPUBitmask ?
+    (HasAFGR64Reg ? -AFGR64RegSize : -FGR32RegSize) : 0;
+
+  // CPU Regs are saved below FP Regs.
+  CPUTopSavedRegOff = CPUBitmask ? -CSFPRegsSize - CPURegSize : 0;
+
+  // Print CPUBitmask
+  O << "\t.mask \t"; printHex32(CPUBitmask, O);
+  O << ',' << CPUTopSavedRegOff << '\n';
+
+  // Print FPUBitmask
+  O << "\t.fmask\t"; printHex32(FPUBitmask, O);
+  O << "," << FPUTopSavedRegOff << '\n';
+}
+
+// Print a 32 bit hex number with all numbers.
+void MipsAsmPrinter::printHex32(unsigned Value, raw_ostream &O) {
+  O << "0x";
+  for (int i = 7; i >= 0; i--)
+    O.write_hex((Value & (0xF << (i*4))) >> (i*4));
+}
+
+//===----------------------------------------------------------------------===//
+// Frame and Set directives
+//===----------------------------------------------------------------------===//
+
+/// Frame Directive
+void MipsAsmPrinter::emitFrameDirective() {
+  const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+
+  unsigned stackReg  = RI.getFrameRegister(*MF);
+  unsigned returnReg = RI.getRARegister();
+  unsigned stackSize = MF->getFrameInfo()->getStackSize();
+
+  if (OutStreamer.hasRawTextSupport())
+    OutStreamer.EmitRawText("\t.frame\t$" +
+           StringRef(MipsInstPrinter::getRegisterName(stackReg)).lower() +
+           "," + Twine(stackSize) + ",$" +
+           StringRef(MipsInstPrinter::getRegisterName(returnReg)).lower());
+}
+
+/// Emit Set directives.
+const char *MipsAsmPrinter::getCurrentABIString() const {
+  switch (Subtarget->getTargetABI()) {
+  case MipsSubtarget::O32:  return "abi32";
+  case MipsSubtarget::N32:  return "abiN32";
+  case MipsSubtarget::N64:  return "abi64";
+  case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
+  default: llvm_unreachable("Unknown Mips ABI");
+  }
+}
+
+void MipsAsmPrinter::EmitFunctionEntryLabel() {
+  if (OutStreamer.hasRawTextSupport()) {
+    if (Subtarget->inMips16Mode())
+      OutStreamer.EmitRawText(StringRef("\t.set\tmips16"));
+    else
+      OutStreamer.EmitRawText(StringRef("\t.set\tnomips16"));
+    // leave out until FSF available gas has micromips changes
+    // OutStreamer.EmitRawText(StringRef("\t.set\tnomicromips"));
+    OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName()));
+  }
+
+  if (Subtarget->inMicroMipsMode())
+    if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer))
+      MES->emitMipsSTOCG(*Subtarget, CurrentFnSym,
+      (unsigned)ELF::STO_MIPS_MICROMIPS);
+  OutStreamer.EmitLabel(CurrentFnSym);
+}
+
+/// EmitFunctionBodyStart - Targets can override this to emit stuff before
+/// the first basic block in the function.
+void MipsAsmPrinter::EmitFunctionBodyStart() {
+  MCInstLowering.Initialize(Mang, &MF->getContext());
+
+  emitFrameDirective();
+
+  if (OutStreamer.hasRawTextSupport()) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+    printSavedRegsBitmask(OS);
+    OutStreamer.EmitRawText(OS.str());
+    if (!Subtarget->inMips16Mode()) {
+      OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder"));
+      OutStreamer.EmitRawText(StringRef("\t.set\tnomacro"));
+      OutStreamer.EmitRawText(StringRef("\t.set\tnoat"));
+    }
+  }
+}
+
+/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
+/// the last basic block in the function.
+void MipsAsmPrinter::EmitFunctionBodyEnd() {
+  // There are instruction for this macros, but they must
+  // always be at the function end, and we can't emit and
+  // break with BB logic.
+  if (OutStreamer.hasRawTextSupport()) {
+    if (!Subtarget->inMips16Mode()) {
+      OutStreamer.EmitRawText(StringRef("\t.set\tat"));
+      OutStreamer.EmitRawText(StringRef("\t.set\tmacro"));
+      OutStreamer.EmitRawText(StringRef("\t.set\treorder"));
+    }
+    OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName()));
+  }
+}
+
+/// isBlockOnlyReachableByFallthough - Return true if the basic block has
+/// exactly one predecessor and the control transfer mechanism between
+/// the predecessor and this block is a fall-through.
+bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
+                                                       MBB) const {
+  // The predecessor has to be immediately before this block.
+  const MachineBasicBlock *Pred = *MBB->pred_begin();
+
+  // If the predecessor is a switch statement, assume a jump table
+  // implementation, so it is not a fall through.
+  if (const BasicBlock *bb = Pred->getBasicBlock())
+    if (isa<SwitchInst>(bb->getTerminator()))
+      return false;
+
+  // If this is a landing pad, it isn't a fall through.  If it has no preds,
+  // then nothing falls through to it.
+  if (MBB->isLandingPad() || MBB->pred_empty())
+    return false;
+
+  // If there isn't exactly one predecessor, it can't be a fall through.
+  MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
+  ++PI2;
+
+  if (PI2 != MBB->pred_end())
+    return false;
+
+  // The predecessor has to be immediately before this block.
+  if (!Pred->isLayoutSuccessor(MBB))
+    return false;
+
+  // If the block is completely empty, then it definitely does fall through.
+  if (Pred->empty())
+    return true;
+
+  // Otherwise, check the last instruction.
+  // Check if the last terminator is an unconditional branch.
+  MachineBasicBlock::const_iterator I = Pred->end();
+  while (I != Pred->begin() && !(--I)->isTerminator()) ;
+
+  return !I->isBarrier();
+}
+
+// Print out an operand for an inline asm expression.
+bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                     unsigned AsmVariant,const char *ExtraCode,
+                                     raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    const MachineOperand &MO = MI->getOperand(OpNum);
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI,OpNum,AsmVariant,ExtraCode,O);
+    case 'X': // hex const int
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << "0x" << StringRef(utohexstr(MO.getImm())).lower();
+      return false;
+    case 'x': // hex const int (low 16 bits)
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << "0x" << StringRef(utohexstr(MO.getImm() & 0xffff)).lower();
+      return false;
+    case 'd': // decimal const int
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << MO.getImm();
+      return false;
+    case 'm': // decimal const int minus 1
+      if ((MO.getType()) != MachineOperand::MO_Immediate)
+        return true;
+      O << MO.getImm() - 1;
+      return false;
+    case 'z': {
+      // $0 if zero, regular printing otherwise
+      if (MO.getType() != MachineOperand::MO_Immediate)
+        return true;
+      int64_t Val = MO.getImm();
+      if (Val)
+        O << Val;
+      else
+        O << "$0";
+      return false;
+    }
+    case 'D': // Second part of a double word register operand
+    case 'L': // Low order register of a double word register operand
+    case 'M': // High order register of a double word register operand
+    {
+      if (OpNum == 0)
+        return true;
+      const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+      if (!FlagsOP.isImm())
+        return true;
+      unsigned Flags = FlagsOP.getImm();
+      unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      // Number of registers represented by this operand. We are looking
+      // for 2 for 32 bit mode and 1 for 64 bit mode.
+      if (NumVals != 2) {
+        if (Subtarget->isGP64bit() && NumVals == 1 && MO.isReg()) {
+          unsigned Reg = MO.getReg();
+          O << '$' << MipsInstPrinter::getRegisterName(Reg);
+          return false;
+        }
+        return true;
+      }
+
+      unsigned RegOp = OpNum;
+      if (!Subtarget->isGP64bit()){
+        // Endianess reverses which register holds the high or low value
+        // between M and L.
+        switch(ExtraCode[0]) {
+        case 'M':
+          RegOp = (Subtarget->isLittle()) ? OpNum + 1 : OpNum;
+          break;
+        case 'L':
+          RegOp = (Subtarget->isLittle()) ? OpNum : OpNum + 1;
+          break;
+        case 'D': // Always the second part
+          RegOp = OpNum + 1;
+        }
+        if (RegOp >= MI->getNumOperands())
+          return true;
+        const MachineOperand &MO = MI->getOperand(RegOp);
+        if (!MO.isReg())
+          return true;
+        unsigned Reg = MO.getReg();
+        O << '$' << MipsInstPrinter::getRegisterName(Reg);
+        return false;
+      }
+    }
+    }
+  }
+
+  printOperand(MI, OpNum, O);
+  return false;
+}
+
+bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                           unsigned OpNum, unsigned AsmVariant,
+                                           const char *ExtraCode,
+                                           raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "0($" << MipsInstPrinter::getRegisterName(MO.getReg()) << ")";
+
+  return false;
+}
+
+void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                  raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  bool closeP = false;
+
+  if (MO.getTargetFlags())
+    closeP = true;
+
+  switch(MO.getTargetFlags()) {
+  case MipsII::MO_GPREL:    O << "%gp_rel("; break;
+  case MipsII::MO_GOT_CALL: O << "%call16("; break;
+  case MipsII::MO_GOT:      O << "%got(";    break;
+  case MipsII::MO_ABS_HI:   O << "%hi(";     break;
+  case MipsII::MO_ABS_LO:   O << "%lo(";     break;
+  case MipsII::MO_TLSGD:    O << "%tlsgd(";  break;
+  case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
+  case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
+  case MipsII::MO_TPREL_LO: O << "%tprel_lo("; break;
+  case MipsII::MO_GPOFF_HI: O << "%hi(%neg(%gp_rel("; break;
+  case MipsII::MO_GPOFF_LO: O << "%lo(%neg(%gp_rel("; break;
+  case MipsII::MO_GOT_DISP: O << "%got_disp("; break;
+  case MipsII::MO_GOT_PAGE: O << "%got_page("; break;
+  case MipsII::MO_GOT_OFST: O << "%got_ofst("; break;
+  }
+
+  switch (MO.getType()) {
+    case MachineOperand::MO_Register:
+      O << '$'
+        << StringRef(MipsInstPrinter::getRegisterName(MO.getReg())).lower();
+      break;
+
+    case MachineOperand::MO_Immediate:
+      O << MO.getImm();
+      break;
+
+    case MachineOperand::MO_MachineBasicBlock:
+      O << *MO.getMBB()->getSymbol();
+      return;
+
+    case MachineOperand::MO_GlobalAddress:
+      O << *Mang->getSymbol(MO.getGlobal());
+      break;
+
+    case MachineOperand::MO_BlockAddress: {
+      MCSymbol *BA = GetBlockAddressSymbol(MO.getBlockAddress());
+      O << BA->getName();
+      break;
+    }
+
+    case MachineOperand::MO_ExternalSymbol:
+      O << *GetExternalSymbolSymbol(MO.getSymbolName());
+      break;
+
+    case MachineOperand::MO_JumpTableIndex:
+      O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+        << '_' << MO.getIndex();
+      break;
+
+    case MachineOperand::MO_ConstantPoolIndex:
+      O << MAI->getPrivateGlobalPrefix() << "CPI"
+        << getFunctionNumber() << "_" << MO.getIndex();
+      if (MO.getOffset())
+        O << "+" << MO.getOffset();
+      break;
+
+    default:
+      llvm_unreachable("<unknown operand type>");
+  }
+
+  if (closeP) O << ")";
+}
+
+void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
+void MipsAsmPrinter::
+printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O) {
+  // Load/Store memory operands -- imm($reg)
+  // If PIC target the target is loaded as the
+  // pattern lw $25,%call16($28)
+  printOperand(MI, opNum+1, O);
+  O << "(";
+  printOperand(MI, opNum, O);
+  O << ")";
+}
+
+void MipsAsmPrinter::
+printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O) {
+  // when using stack locations for not load/store instructions
+  // print the same way as all normal 3 operand instructions.
+  printOperand(MI, opNum, O);
+  O << ", ";
+  printOperand(MI, opNum+1, O);
+  return;
+}
+
+void MipsAsmPrinter::
+printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm());
+}
+
+void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  // FIXME: Use SwitchSection.
+
+  // Tell the assembler which ABI we are using
+  if (OutStreamer.hasRawTextSupport())
+    OutStreamer.EmitRawText("\t.section .mdebug." +
+                            Twine(getCurrentABIString()));
+
+  // TODO: handle O64 ABI
+  if (OutStreamer.hasRawTextSupport()) {
+    if (Subtarget->isABI_EABI()) {
+      if (Subtarget->isGP32bit())
+        OutStreamer.EmitRawText(StringRef("\t.section .gcc_compiled_long32"));
+      else
+        OutStreamer.EmitRawText(StringRef("\t.section .gcc_compiled_long64"));
+    }
+  }
+
+  // return to previous section
+  if (OutStreamer.hasRawTextSupport())
+    OutStreamer.EmitRawText(StringRef("\t.previous"));
+
+}
+
+void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
+
+  if (OutStreamer.hasRawTextSupport()) return;
+
+  // Emit Mips ELF register info
+  Subtarget->getMReginfo().emitMipsReginfoSectionCG(
+             OutStreamer, getObjFileLowering(), *Subtarget);
+  if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer))
+    MES->emitELFHeaderFlagsCG(*Subtarget);
+}
+
+MachineLocation
+MipsAsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+  // Handles frame addresses emitted in MipsInstrInfo::emitFrameIndexDebugValue.
+  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
+         "Unexpected MachineOperand types");
+  return MachineLocation(MI->getOperand(0).getReg(),
+                         MI->getOperand(1).getImm());
+}
+
+void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                           raw_ostream &OS) {
+  // TODO: implement
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMipsAsmPrinter() {
+  RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget);
+  RegisterAsmPrinter<MipsAsmPrinter> Y(TheMipselTarget);
+  RegisterAsmPrinter<MipsAsmPrinter> A(TheMips64Target);
+  RegisterAsmPrinter<MipsAsmPrinter> B(TheMips64elTarget);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
new file mode 100644
index 000000000000..dbdaf266b75f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -0,0 +1,90 @@
+//===-- MipsAsmPrinter.h - Mips LLVM Assembly Printer ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Mips Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSASMPRINTER_H
+#define MIPSASMPRINTER_H
+
+#include "MipsMCInstLower.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class MCStreamer;
+class MachineInstr;
+class MachineBasicBlock;
+class Module;
+class raw_ostream;
+
+class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
+
+  void EmitInstrWithMacroNoAT(const MachineInstr *MI);
+
+private:
+  // tblgen'erated function.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+
+public:
+
+  const MipsSubtarget *Subtarget;
+  const MipsFunctionInfo *MipsFI;
+  MipsMCInstLower MCInstLowering;
+
+  explicit MipsAsmPrinter(TargetMachine &TM,  MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer), MCInstLowering(*this) {
+    Subtarget = &TM.getSubtarget<MipsSubtarget>();
+  }
+
+  virtual const char *getPassName() const {
+    return "Mips Assembly Printer";
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  void EmitInstruction(const MachineInstr *MI);
+  void printSavedRegsBitmask(raw_ostream &O);
+  void printHex32(unsigned int Value, raw_ostream &O);
+  void emitFrameDirective();
+  const char *getCurrentABIString() const;
+  virtual void EmitFunctionEntryLabel();
+  virtual void EmitFunctionBodyStart();
+  virtual void EmitFunctionBodyEnd();
+  virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
+                                                 MBB) const;
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O);
+  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                       const char *Modifier = 0);
+  void EmitStartOfAsmFile(Module &M);
+  void EmitEndOfAsmFile(Module &M);
+  virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+};
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
new file mode 100644
index 000000000000..462def76cc80
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -0,0 +1,225 @@
+//===-- MipsCallingConv.td - Calling Conventions for Mips --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for Mips architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>:
+  CCIf<!strconcat("State.getTarget().getSubtarget<MipsSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Mips O32 Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Only the return rules are defined here for O32. The rules for argument
+// passing are defined in MipsISelLowering.cpp.
+def RetCC_MipsO32 : CallingConv<[
+  // i32 are returned in registers V0, V1, A0, A1
+  CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>,
+
+  // f32 are returned in registers F0, F2
+  CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+  // f64 are returned in register D0, D1
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0, D1]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips N32/64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsN : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToRegWithShadow<[A0, A1, A2, A3,
+                                           T0, T1, T2, T3],
+                                          [F12, F13, F14, F15,
+                                           F16, F17, F18, F19]>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64],
+                                          [D12_64, D13_64, D14_64, D15_64,
+                                           D16_64, D17_64, D18_64, D19_64]>>,
+
+  // f32 arguments are passed in single precision FP registers.
+  CCIfType<[f32], CCAssignToRegWithShadow<[F12, F13, F14, F15,
+                                           F16, F17, F18, F19],
+                                          [A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64]>>,
+
+  // f64 arguments are passed in double precision FP registers.
+  CCIfType<[f64], CCAssignToRegWithShadow<[D12_64, D13_64, D14_64, D15_64,
+                                           D16_64, D17_64, D18_64, D19_64],
+                                          [A0_64, A1_64, A2_64, A3_64,
+                                           T0_64, T1_64, T2_64, T3_64]>>,
+
+  // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 8>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+// N32/64 variable arguments.
+// All arguments are passed in integer registers.
+def CC_MipsN_VarArg : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  CCIfType<[i32, f32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
+
+  CCIfType<[i64, f64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64,
+                                      T0_64, T1_64, T2_64, T3_64]>>,
+
+  // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 8>>,
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def RetCC_MipsN : CallingConv<[
+  // i32 are returned in registers V0, V1
+  CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+  // i64 are returned in registers V0_64, V1_64
+  CCIfType<[i64], CCAssignToReg<[V0_64, V1_64]>>,
+
+  // f32 are returned in registers F0, F2
+  CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+  // f64 are returned in registers D0, D2
+  CCIfType<[f64], CCAssignToReg<[D0_64, D2_64]>>
+]>;
+
+// In soft-mode, register A0_64, instead of V1_64, is used to return a long
+// double value.
+def RetCC_F128Soft : CallingConv<[
+  CCIfType<[i64], CCAssignToReg<[V0_64, A0_64]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips EABI Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsEABI : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
+
+  // Single fp arguments are passed in pairs within 32-bit mode
+  CCIfType<[f32], CCIfSubtarget<"isSingleFloat()",
+                  CCAssignToReg<[F12, F13, F14, F15, F16, F17, F18, F19]>>>,
+
+  CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()",
+                  CCAssignToReg<[F12, F14, F16, F18]>>>,
+
+  // The first 4 double fp arguments are passed in single fp registers.
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()",
+                  CCAssignToReg<[D6, D7, D8, D9]>>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // Integer values get stored in stack slots that are 8 bytes in
+  // size and 8-byte aligned.
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToStack<8, 8>>>
+]>;
+
+def RetCC_MipsEABI : CallingConv<[
+  // i32 are returned in registers V0, V1
+  CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+  // f32 are returned in registers F0, F1
+  CCIfType<[f32], CCAssignToReg<[F0, F1]>>,
+
+  // f64 are returned in register D0
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips FastCC Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_MipsO32_FastCC : CallingConv<[
+  // f64 arguments are passed in double-precision floating pointer registers.
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7, D8, D9]>>,
+
+  // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_MipsN_FastCC : CallingConv<[
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i64], CCAssignToReg<[A0_64, A1_64, A2_64, A3_64, T0_64, T1_64,
+                                 T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
+                                 T8_64, V1_64]>>,
+
+  // f64 arguments are passed in double-precision floating pointer registers.
+  CCIfType<[f64], CCAssignToReg<[D0_64, D1_64, D2_64, D3_64, D4_64, D5_64,
+                                 D6_64, D7_64, D8_64, D9_64, D10_64, D11_64,
+                                 D12_64, D13_64, D14_64, D15_64, D16_64, D17_64,
+                                 D18_64, D19_64]>>,
+
+  // Stack parameter slots for i64 and f64 are 64-bit doublewords and
+  // 8-byte aligned.
+  CCIfType<[i64, f64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_Mips_FastCC : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer arguments are passed in integer registers. All scratch registers,
+  // except for AT, V0 and T9, are available to be used as argument registers.
+  CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6,
+                                 T7, T8, V1]>>,
+
+  // f32 arguments are passed in single-precision floating pointer registers.
+  CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10,
+                                 F11, F12, F13, F14, F15, F16, F17, F18, F19]>>,
+
+  // Stack parameter slots for i32 and f32 are 32-bit words and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>,
+  CCIfSubtarget<"isABI_O32()", CCDelegateTo<CC_MipsO32_FastCC>>,
+  CCDelegateTo<CC_MipsN_FastCC>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Calling Convention Dispatch
+//===----------------------------------------------------------------------===//
+
+def RetCC_Mips : CallingConv<[
+  CCIfSubtarget<"isABI_EABI()", CCDelegateTo<RetCC_MipsEABI>>,
+  CCIfSubtarget<"isABI_N32()", CCDelegateTo<RetCC_MipsN>>,
+  CCIfSubtarget<"isABI_N64()", CCDelegateTo<RetCC_MipsN>>,
+  CCDelegateTo<RetCC_MipsO32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved register lists.
+//===----------------------------------------------------------------------===//
+
+def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP,
+                                               (sequence "S%u", 7, 0))>;
+
+def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
+                                   (sequence "S%u", 7, 0))>;
+
+def CSR_N32 : CalleeSavedRegs<(add D31_64, D29_64, D27_64, D25_64, D24_64,
+                                   D23_64, D22_64, D21_64, RA_64, FP_64, GP_64,
+                                   (sequence "S%u_64", 7, 0))>;
+
+def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64,
+                                   GP_64, (sequence "S%u_64", 7, 0))>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MipsCodeEmitter.cpp
new file mode 100644
index 000000000000..1d86d903c12e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -0,0 +1,363 @@
+//===-- Mips/MipsCodeEmitter.cpp - Convert Mips Code to Machine Code ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Mips machine instructions
+// into relocatable machine code.
+//
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsRelocations.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+class MipsCodeEmitter : public MachineFunctionPass {
+  MipsJITInfo *JTI;
+  const MipsInstrInfo *II;
+  const DataLayout *TD;
+  const MipsSubtarget *Subtarget;
+  TargetMachine &TM;
+  JITCodeEmitter &MCE;
+  const std::vector<MachineConstantPoolEntry> *MCPEs;
+  const std::vector<MachineJumpTableEntry> *MJTEs;
+  bool IsPIC;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<MachineModuleInfo> ();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  static char ID;
+
+public:
+  MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+    : MachineFunctionPass(ID), JTI(0),
+      II((const MipsInstrInfo *) tm.getInstrInfo()), TD(tm.getDataLayout()),
+      TM(tm), MCE(mce), MCPEs(0), MJTEs(0),
+      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+  bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual const char *getPassName() const {
+    return "Mips Machine Code Emitter";
+  }
+
+  /// getBinaryCodeForInstr - This function, generated by the
+  /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+  /// machine instructions.
+  uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+  void emitInstruction(MachineBasicBlock::instr_iterator MI,
+                       MachineBasicBlock &MBB);
+
+private:
+
+  void emitWord(unsigned Word);
+
+  /// Routines that handle operands which add machine relocations which are
+  /// fixed up by the relocation stage.
+  void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                         bool MayNeedFarStub) const;
+  void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+  void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+  void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
+  void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MachineInstr &MI,
+                             const MachineOperand &MO) const;
+
+  unsigned getRelocation(const MachineInstr &MI,
+                         const MachineOperand &MO) const;
+
+  unsigned getJumpTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
+
+  unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const;
+
+  void emitGlobalAddressUnaligned(const GlobalValue *GV, unsigned Reloc,
+                                  int Offset) const;
+
+  /// Expand pseudo instructions with accumulator register operands.
+  void expandACCInstr(MachineBasicBlock::instr_iterator &MI,
+                      MachineBasicBlock &MBB, unsigned Opc) const;
+
+  /// \brief Expand pseudo instruction. Return true if MI was expanded.
+  bool expandPseudos(MachineBasicBlock::instr_iterator &MI,
+                     MachineBasicBlock &MBB) const;
+};
+}
+
+char MipsCodeEmitter::ID = 0;
+
+bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  MipsTargetMachine &Target = static_cast<MipsTargetMachine &>(
+                                const_cast<TargetMachine &>(MF.getTarget()));
+
+  JTI = Target.getJITInfo();
+  II = Target.getInstrInfo();
+  TD = Target.getDataLayout();
+  Subtarget = &TM.getSubtarget<MipsSubtarget> ();
+  MCPEs = &MF.getConstantPool()->getConstants();
+  MJTEs = 0;
+  if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
+  JTI->Initialize(MF, IsPIC, Subtarget->isLittle());
+  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
+
+  do {
+    DEBUG(errs() << "JITTing function '"
+        << MF.getName() << "'\n");
+    MCE.startFunction(MF);
+
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+        MBB != E; ++MBB){
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+           E = MBB->instr_end(); I != E;)
+        emitInstruction(*I++, *MBB);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+unsigned MipsCodeEmitter::getRelocation(const MachineInstr &MI,
+                                        const MachineOperand &MO) const {
+  // NOTE: This relocations are for static.
+  uint64_t TSFlags = MI.getDesc().TSFlags;
+  uint64_t Form = TSFlags & MipsII::FormMask;
+  if (Form == MipsII::FrmJ)
+    return Mips::reloc_mips_26;
+  if ((Form == MipsII::FrmI || Form == MipsII::FrmFI)
+       && MI.isBranch())
+    return Mips::reloc_mips_pc16;
+  if (Form == MipsII::FrmI && MI.getOpcode() == Mips::LUi)
+    return Mips::reloc_mips_hi;
+  return Mips::reloc_mips_lo;
+}
+
+unsigned MipsCodeEmitter::getJumpTargetOpValue(const MachineInstr &MI,
+                                               unsigned OpNo) const {
+  MachineOperand MO = MI.getOperand(OpNo);
+  if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  else
+    llvm_unreachable("Unexpected jump target operand kind.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
+                                                 unsigned OpNo) const {
+  MachineOperand MO = MI.getOperand(OpNo);
+  emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getMemEncoding(const MachineInstr &MI,
+                                         unsigned OpNo) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 15-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo)) << 16;
+  return (getMachineOpValue(MI, MI.getOperand(OpNo+1)) & 0xFFFF) | RegBits;
+}
+
+unsigned MipsCodeEmitter::getSizeExtEncoding(const MachineInstr &MI,
+                                             unsigned OpNo) const {
+  // size is encoded as size-1.
+  return getMachineOpValue(MI, MI.getOperand(OpNo)) - 1;
+}
+
+unsigned MipsCodeEmitter::getSizeInsEncoding(const MachineInstr &MI,
+                                             unsigned OpNo) const {
+  // size is encoded as pos+size-1.
+  return getMachineOpValue(MI, MI.getOperand(OpNo-1)) +
+         getMachineOpValue(MI, MI.getOperand(OpNo)) - 1;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                            const MachineOperand &MO) const {
+  if (MO.isReg())
+    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+  else if (MO.isCPI())
+    emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
+  else if (MO.isJTI())
+    emitJumpTableAddress(MO.getIndex(), getRelocation(MI, MO));
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  else
+    llvm_unreachable("Unable to encode MachineOperand!");
+  return 0;
+}
+
+void MipsCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                                        bool MayNeedFarStub) const {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                                             const_cast<GlobalValue *>(GV), 0,
+                                             MayNeedFarStub));
+}
+
+void MipsCodeEmitter::emitGlobalAddressUnaligned(const GlobalValue *GV,
+                                           unsigned Reloc, int Offset) const {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                             const_cast<GlobalValue *>(GV), 0, false));
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset() + Offset,
+                      Reloc, const_cast<GlobalValue *>(GV), 0, false));
+}
+
+void MipsCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES, 0, 0));
+}
+
+void MipsCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, 0, false));
+}
+
+void MipsCodeEmitter::
+emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTIndex, 0, false));
+}
+
+void MipsCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+                                            unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             Reloc, BB));
+}
+
+void MipsCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI,
+                                      MachineBasicBlock &MBB) {
+  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << *MI);
+
+  // Expand pseudo instruction. Skip if MI was not expanded.
+  if (((MI->getDesc().TSFlags & MipsII::FormMask) == MipsII::Pseudo) &&
+      !expandPseudos(MI, MBB))
+    return;
+
+  MCE.processDebugLoc(MI->getDebugLoc(), true);
+
+  emitWord(getBinaryCodeForInstr(*MI));
+  ++NumEmitted;  // Keep track of the # of mi's emitted
+
+  MCE.processDebugLoc(MI->getDebugLoc(), false);
+}
+
+void MipsCodeEmitter::emitWord(unsigned Word) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Word) << "\n");
+  if (Subtarget->isLittle())
+    MCE.emitWordLE(Word);
+  else
+    MCE.emitWordBE(Word);
+}
+
+void MipsCodeEmitter::expandACCInstr(MachineBasicBlock::instr_iterator &MI,
+                                     MachineBasicBlock &MBB,
+                                     unsigned Opc) const {
+  // Expand "pseudomult $ac0, $t0, $t1" to "mult $t0, $t1".
+  BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Opc))
+    .addReg(MI->getOperand(1).getReg()).addReg(MI->getOperand(2).getReg());
+}
+
+bool MipsCodeEmitter::expandPseudos(MachineBasicBlock::instr_iterator &MI,
+                                    MachineBasicBlock &MBB) const {
+  switch (MI->getOpcode()) {
+  case Mips::NOP:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::SLL), Mips::ZERO)
+      .addReg(Mips::ZERO).addImm(0);
+    break;
+  case Mips::JALRPseudo:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::JALR), Mips::RA)
+      .addReg(MI->getOperand(0).getReg());
+    break;
+  case Mips::PseudoMULT:
+    expandACCInstr(MI, MBB, Mips::MULT);
+    break;
+  case Mips::PseudoMULTu:
+    expandACCInstr(MI, MBB, Mips::MULTu);
+    break;
+  case Mips::PseudoSDIV:
+    expandACCInstr(MI, MBB, Mips::SDIV);
+    break;
+  case Mips::PseudoUDIV:
+    expandACCInstr(MI, MBB, Mips::UDIV);
+    break;
+  case Mips::PseudoMADD:
+    expandACCInstr(MI, MBB, Mips::MADD);
+    break;
+  case Mips::PseudoMADDU:
+    expandACCInstr(MI, MBB, Mips::MADDU);
+    break;
+  case Mips::PseudoMSUB:
+    expandACCInstr(MI, MBB, Mips::MSUB);
+    break;
+  case Mips::PseudoMSUBU:
+    expandACCInstr(MI, MBB, Mips::MSUBU);
+    break;
+  default:
+    return false;
+  }
+
+  (MI--)->eraseFromBundle();
+  return true;
+}
+
+/// createMipsJITCodeEmitterPass - Return a pass that emits the collected Mips
+/// code to the specified MCE object.
+FunctionPass *llvm::createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
+                                                 JITCodeEmitter &JCE) {
+  return new MipsCodeEmitter(TM, JCE);
+}
+
+#include "MipsGenCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/Mips/MipsCondMov.td b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
new file mode 100644
index 000000000000..42e4c99f05d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsCondMov.td
@@ -0,0 +1,253 @@
+//===-- MipsCondMov.td - Describe Mips Conditional Moves --*- tablegen -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the Conditional Moves implementation.
+//
+//===----------------------------------------------------------------------===//
+
+// Conditional moves:
+// These instructions are expanded in
+// MipsISelLowering::EmitInstrWithCustomInserter if target does not have
+// conditional move instructions.
+// cond:int, data:int
+class CMov_I_I_FT<string opstr, RegisterClass CRC, RegisterClass DRC,
+                  InstrItinClass Itin> :
+  InstSE<(outs DRC:$rd), (ins DRC:$rs, CRC:$rt, DRC:$F),
+         !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR> {
+  let Constraints = "$F = $rd";
+}
+
+// cond:int, data:float
+class CMov_I_F_FT<string opstr, RegisterClass CRC, RegisterClass DRC,
+                  InstrItinClass Itin> :
+  InstSE<(outs DRC:$fd), (ins DRC:$fs, CRC:$rt, DRC:$F),
+         !strconcat(opstr, "\t$fd, $fs, $rt"), [], Itin, FrmFR> {
+  let Constraints = "$F = $fd";
+}
+
+// cond:float, data:int
+class CMov_F_I_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+                  SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$rd), (ins RC:$rs, RC:$F),
+         !strconcat(opstr, "\t$rd, $rs, $$fcc0"),
+         [(set RC:$rd, (OpNode RC:$rs, RC:$F))], Itin, FrmFR> {
+  let Uses = [FCR31];
+  let Constraints = "$F = $rd";
+}
+
+// cond:float, data:float
+class CMov_F_F_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+                  SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fs, RC:$F),
+         !strconcat(opstr, "\t$fd, $fs, $$fcc0"),
+         [(set RC:$fd, (OpNode RC:$fs, RC:$F))], Itin, FrmFR> {
+  let Uses = [FCR31];
+  let Constraints = "$F = $fd";
+}
+
+// select patterns
+multiclass MovzPats0<RegisterClass CRC, RegisterClass DRC,
+                     Instruction MOVZInst, Instruction SLTOp,
+                     Instruction SLTuOp, Instruction SLTiOp,
+                     Instruction SLTiuOp> {
+  def : MipsPat<(select (i32 (setge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setuge CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTuOp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setge CRC:$lhs, immSExt16:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, immSExt16:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setuge CRC:$lh, immSExt16:$rh)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiuOp CRC:$lh, immSExt16:$rh), DRC:$F)>;
+  def : MipsPat<(select (i32 (setle CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setule CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTuOp CRC:$rhs, CRC:$lhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (setgt CRC:$lhs, immSExt16Plus1:$rhs)),
+                        DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiOp CRC:$lhs, (Plus1 imm:$rhs)), DRC:$F)>;
+  def : MipsPat<(select (i32 (setugt CRC:$lhs, immSExt16Plus1:$rhs)),
+                        DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (SLTiuOp CRC:$lhs, (Plus1 imm:$rhs)),
+                          DRC:$F)>;
+}
+
+multiclass MovzPats1<RegisterClass CRC, RegisterClass DRC,
+                     Instruction MOVZInst, Instruction XOROp> {
+  def : MipsPat<(select (i32 (seteq CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select (i32 (seteq CRC:$lhs, 0)), DRC:$T, DRC:$F),
+                (MOVZInst DRC:$T, CRC:$lhs, DRC:$F)>;
+}
+
+multiclass MovzPats2<RegisterClass CRC, RegisterClass DRC,
+                     Instruction MOVZInst, Instruction XORiOp> {
+  def : MipsPat<
+            (select (i32 (seteq CRC:$lhs, immZExt16:$uimm16)), DRC:$T, DRC:$F),
+            (MOVZInst DRC:$T, (XORiOp CRC:$lhs, immZExt16:$uimm16), DRC:$F)>;
+}
+
+multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
+                    Instruction XOROp> {
+  def : MipsPat<(select (i32 (setne CRC:$lhs, CRC:$rhs)), DRC:$T, DRC:$F),
+                (MOVNInst DRC:$T, (XOROp CRC:$lhs, CRC:$rhs), DRC:$F)>;
+  def : MipsPat<(select CRC:$cond, DRC:$T, DRC:$F),
+                (MOVNInst DRC:$T, CRC:$cond, DRC:$F)>;
+  def : MipsPat<(select (i32 (setne CRC:$lhs, 0)),DRC:$T, DRC:$F),
+                (MOVNInst DRC:$T, CRC:$lhs, DRC:$F)>;
+}
+
+// Instantiation of instructions.
+def MOVZ_I_I : CMov_I_I_FT<"movz", CPURegs, CPURegs, NoItinerary>,
+               ADD_FM<0, 0xa>;
+let Predicates = [HasStdEnc],
+                  DecoderNamespace = "Mips64" in {
+  def MOVZ_I_I64   : CMov_I_I_FT<"movz", CPURegs, CPU64Regs, NoItinerary>,
+                     ADD_FM<0, 0xa>;
+  def MOVZ_I64_I   : CMov_I_I_FT<"movz", CPU64Regs, CPURegs, NoItinerary>,
+                     ADD_FM<0, 0xa> {
+    let isCodeGenOnly = 1;
+  }
+  def MOVZ_I64_I64 : CMov_I_I_FT<"movz", CPU64Regs, CPU64Regs, NoItinerary>,
+                     ADD_FM<0, 0xa> {
+    let isCodeGenOnly = 1;
+  }
+}
+
+def MOVN_I_I       : CMov_I_I_FT<"movn", CPURegs, CPURegs, NoItinerary>,
+                     ADD_FM<0, 0xb>;
+let Predicates = [HasStdEnc],
+                  DecoderNamespace = "Mips64" in {
+  def MOVN_I_I64   : CMov_I_I_FT<"movn", CPURegs, CPU64Regs, NoItinerary>,
+                     ADD_FM<0, 0xb>;
+  def MOVN_I64_I   : CMov_I_I_FT<"movn", CPU64Regs, CPURegs, NoItinerary>,
+                     ADD_FM<0, 0xb> {
+    let isCodeGenOnly = 1;
+  }
+  def MOVN_I64_I64 : CMov_I_I_FT<"movn", CPU64Regs, CPU64Regs, NoItinerary>,
+                     ADD_FM<0, 0xb> {
+    let isCodeGenOnly = 1;
+  }
+}
+
+def MOVZ_I_S : CMov_I_F_FT<"movz.s", CPURegs, FGR32, IIFmove>,
+               CMov_I_F_FM<18, 16>;
+def MOVZ_I64_S : CMov_I_F_FT<"movz.s", CPU64Regs, FGR32, IIFmove>,
+                 CMov_I_F_FM<18, 16>, Requires<[HasMips64, HasStdEnc]> {
+  let DecoderNamespace = "Mips64";
+}
+
+def MOVN_I_S : CMov_I_F_FT<"movn.s", CPURegs, FGR32, IIFmove>,
+               CMov_I_F_FM<19, 16>;
+def MOVN_I64_S : CMov_I_F_FT<"movn.s", CPU64Regs, FGR32, IIFmove>,
+                 CMov_I_F_FM<19, 16>, Requires<[HasMips64, HasStdEnc]> {
+  let DecoderNamespace = "Mips64";
+}
+
+let Predicates = [NotFP64bit, HasStdEnc] in {
+  def MOVZ_I_D32 : CMov_I_F_FT<"movz.d", CPURegs, AFGR64, IIFmove>,
+                   CMov_I_F_FM<18, 17>;
+  def MOVN_I_D32 : CMov_I_F_FT<"movn.d", CPURegs, AFGR64, IIFmove>,
+                   CMov_I_F_FM<19, 17>;
+}
+let Predicates = [IsFP64bit, HasStdEnc],
+                  DecoderNamespace = "Mips64" in {
+  def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", CPURegs, FGR64, IIFmove>,
+                   CMov_I_F_FM<18, 17>;
+  def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", CPU64Regs, FGR64, IIFmove>,
+                     CMov_I_F_FM<18, 17> {
+    let isCodeGenOnly = 1;
+  }
+  def MOVN_I_D64 : CMov_I_F_FT<"movn.d", CPURegs, FGR64, IIFmove>,
+                   CMov_I_F_FM<19, 17>;
+  def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", CPU64Regs, FGR64, IIFmove>,
+                     CMov_I_F_FM<19, 17> {
+    let isCodeGenOnly = 1;
+  }
+}
+
+def MOVT_I : CMov_F_I_FT<"movt", CPURegs, IIAlu, MipsCMovFP_T>, CMov_F_I_FM<1>;
+def MOVT_I64 : CMov_F_I_FT<"movt", CPU64Regs, IIAlu, MipsCMovFP_T>,
+               CMov_F_I_FM<1>, Requires<[HasMips64, HasStdEnc]> {
+  let DecoderNamespace = "Mips64";
+}
+
+def MOVF_I : CMov_F_I_FT<"movf", CPURegs, IIAlu, MipsCMovFP_F>, CMov_F_I_FM<0>;
+def MOVF_I64 : CMov_F_I_FT<"movf", CPU64Regs, IIAlu, MipsCMovFP_F>,
+               CMov_F_I_FM<0>, Requires<[HasMips64, HasStdEnc]> {
+  let DecoderNamespace = "Mips64";
+}
+
+def MOVT_S : CMov_F_F_FT<"movt.s", FGR32, IIFmove, MipsCMovFP_T>,
+             CMov_F_F_FM<16, 1>;
+def MOVF_S : CMov_F_F_FT<"movf.s", FGR32, IIFmove, MipsCMovFP_F>,
+             CMov_F_F_FM<16, 0>;
+
+let Predicates = [NotFP64bit, HasStdEnc] in {
+  def MOVT_D32 : CMov_F_F_FT<"movt.d", AFGR64, IIFmove, MipsCMovFP_T>,
+                 CMov_F_F_FM<17, 1>;
+  def MOVF_D32 : CMov_F_F_FT<"movf.d", AFGR64, IIFmove, MipsCMovFP_F>,
+                 CMov_F_F_FM<17, 0>;
+}
+let Predicates = [IsFP64bit, HasStdEnc],
+    DecoderNamespace = "Mips64" in {
+  def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64, IIFmove, MipsCMovFP_T>,
+                 CMov_F_F_FM<17, 1>;
+  def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64, IIFmove, MipsCMovFP_F>,
+                 CMov_F_F_FM<17, 0>;
+}
+
+// Instantiation of conditional move patterns.
+defm : MovzPats0<CPURegs, CPURegs, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>;
+defm : MovzPats1<CPURegs, CPURegs, MOVZ_I_I, XOR>;
+defm : MovzPats2<CPURegs, CPURegs, MOVZ_I_I, XORi>;
+let Predicates = [HasMips64, HasStdEnc] in {
+  defm : MovzPats0<CPURegs, CPU64Regs, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>;
+  defm : MovzPats0<CPU64Regs, CPURegs, MOVZ_I_I, SLT64, SLTu64, SLTi64,
+                   SLTiu64>;
+  defm : MovzPats0<CPU64Regs, CPU64Regs, MOVZ_I_I64, SLT64, SLTu64, SLTi64,
+                   SLTiu64>;
+  defm : MovzPats1<CPURegs, CPU64Regs, MOVZ_I_I64, XOR>;
+  defm : MovzPats1<CPU64Regs, CPURegs, MOVZ_I64_I, XOR64>;
+  defm : MovzPats1<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XOR64>;
+  defm : MovzPats2<CPURegs, CPU64Regs, MOVZ_I_I64, XORi>;
+  defm : MovzPats2<CPU64Regs, CPURegs, MOVZ_I64_I, XORi64>;
+  defm : MovzPats2<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XORi64>;
+}
+
+defm : MovnPats<CPURegs, CPURegs, MOVN_I_I, XOR>;
+let Predicates = [HasMips64, HasStdEnc] in {
+  defm : MovnPats<CPURegs, CPU64Regs, MOVN_I_I64, XOR>;
+  defm : MovnPats<CPU64Regs, CPURegs, MOVN_I64_I, XOR64>;
+  defm : MovnPats<CPU64Regs, CPU64Regs, MOVN_I64_I64, XOR64>;
+}
+
+defm : MovzPats0<CPURegs, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>;
+defm : MovzPats1<CPURegs, FGR32, MOVZ_I_S, XOR>;
+defm : MovnPats<CPURegs, FGR32, MOVN_I_S, XOR>;
+let Predicates = [HasMips64, HasStdEnc] in {
+  defm : MovzPats0<CPU64Regs, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64,
+                   SLTiu64>;
+  defm : MovzPats1<CPU64Regs, FGR32, MOVZ_I64_S, XOR64>;
+  defm : MovnPats<CPU64Regs, FGR32, MOVN_I64_S, XOR64>;
+}
+
+let Predicates = [NotFP64bit, HasStdEnc] in {
+  defm : MovzPats0<CPURegs, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>;
+  defm : MovzPats1<CPURegs, AFGR64, MOVZ_I_D32, XOR>;
+  defm : MovnPats<CPURegs, AFGR64, MOVN_I_D32, XOR>;
+}
+let Predicates = [IsFP64bit, HasStdEnc] in {
+  defm : MovzPats0<CPURegs, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>;
+  defm : MovzPats0<CPU64Regs, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64,
+                   SLTiu64>;
+  defm : MovzPats1<CPURegs, FGR64, MOVZ_I_D64, XOR>;
+  defm : MovzPats1<CPU64Regs, FGR64, MOVZ_I64_D64, XOR64>;
+  defm : MovnPats<CPURegs, FGR64, MOVN_I_D64, XOR>;
+  defm : MovnPats<CPU64Regs, FGR64, MOVN_I64_D64, XOR64>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
new file mode 100644
index 000000000000..b5de1ebad22b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -0,0 +1,85 @@
+//===-- MipsConstantIslandPass.cpp - Emit Pc Relative loads----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// This pass is used to make Pc relative loads of constants.
+// For now, only Mips16 will use this. While it has the same name and
+// uses many ideas from the LLVM ARM Constant Island Pass, it's not intended
+// to reuse any of the code from the ARM version.
+//
+// Loading constants inline is expensive on Mips16 and it's in general better
+// to place the constant nearby in code space and then it can be loaded with a
+// simple 16 bit load instruction.
+//
+// The constants can be not just numbers but addresses of functions and labels.
+// This can be particularly helpful in static relocation mode for embedded
+// non linux targets.
+//
+//
+
+#define DEBUG_TYPE "mips-constant-islands"
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+  class MipsConstantIslands : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    MipsConstantIslands(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm),
+        TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()) {}
+
+    virtual const char *getPassName() const {
+      return "Mips Constant Islands";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+
+  private:
+
+
+    const TargetMachine &TM;
+    const MipsInstrInfo *TII;
+    bool IsPIC;
+    unsigned ABI;
+
+  };
+
+  char MipsConstantIslands::ID = 0;
+} // end of anonymous namespace
+
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsConstantIslandPass(MipsTargetMachine &tm) {
+  return new MipsConstantIslands(tm);
+}
+
+bool MipsConstantIslands::runOnMachineFunction(MachineFunction &F) {
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
new file mode 100644
index 000000000000..a72a763fde06
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -0,0 +1,310 @@
+//===- MipsDSPInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HasDSP : Predicate<"Subtarget.hasDSP()">,
+             AssemblerPredicate<"FeatureDSP">;
+def HasDSPR2 : Predicate<"Subtarget.hasDSPR2()">,
+               AssemblerPredicate<"FeatureDSPR2">;
+
+// Fields.
+class Field6<bits<6> val> {
+  bits<6> V = val;
+}
+
+def SPECIAL3_OPCODE : Field6<0b011111>;
+def REGIMM_OPCODE : Field6<0b000001>;
+
+class DSPInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+  let Predicates = [HasDSP];
+}
+
+class PseudoDSP<dag outs, dag ins, list<dag> pattern,
+                InstrItinClass itin = IIPseudo>:
+  MipsPseudo<outs, ins, pattern, itin> {
+  let Predicates = [HasDSP];
+}
+
+// ADDU.QB sub-class format.
+class ADDU_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010000;
+}
+
+class RADDU_W_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010000;
+}
+
+// CMPU.EQ.QB sub-class format.
+class CMP_EQ_QB_R2_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = 0;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010001;
+}
+
+class CMP_EQ_QB_R3_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010001;
+}
+
+class PRECR_SRA_PH_W_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> sa;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = sa;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010001;
+}
+
+// ABSQ_S.PH sub-class format.
+class ABSQ_S_PH_R2_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010010;
+}
+
+
+class REPL_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<10> imm;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-16} = imm;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010010;
+}
+
+// SHLL.QB sub-class format.
+class SHLL_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> rs_sa;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs_sa;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0}   = 0b010011;
+}
+
+// LX sub-class format.
+class LX_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> base;
+  bits<5> index;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = base;
+  let Inst{20-16} = index;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = op;
+  let Inst{5-0} = 0b001010;
+}
+
+// ADDUH.QB sub-class format.
+class ADDUH_QB_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b011000;
+}
+
+// APPEND sub-class format.
+class APPEND_FMT<bits<5> op> : DSPInst {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> sa;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = sa;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b110001;
+}
+
+// DPA.W.PH sub-class format.
+class DPA_W_PH_FMT<bits<5> op> : DSPInst {
+  bits<2> ac;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6}  = op;
+  let Inst{5-0} = 0b110000;
+}
+
+// MULT sub-class format.
+class MULT_FMT<bits<6> opcode, bits<6> funct> : DSPInst {
+  bits<2> ac;
+  bits<5> rs;
+  bits<5> rt;
+
+  let Opcode = opcode;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6}  = 0;
+  let Inst{5-0} = funct;
+}
+
+// EXTR.W sub-class format (type 1).
+class EXTR_W_TY1_FMT<bits<5> op> : DSPInst {
+  bits<5> rt;
+  bits<2> ac;
+  bits<5> shift_rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = shift_rs;
+  let Inst{20-16} = rt;
+  let Inst{15-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+// SHILO sub-class format.
+class SHILO_R1_FMT<bits<5> op> : DSPInst {
+  bits<2> ac;
+  bits<6> shift;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-20} = shift;
+  let Inst{19-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class SHILO_R2_FMT<bits<5> op> : DSPInst {
+  bits<2> ac;
+  bits<5> rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-13} = 0;
+  let Inst{12-11} = ac;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class RDDSP_FMT<bits<5> op> : DSPInst {
+  bits<5> rd;
+  bits<10> mask;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-16} = mask;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class WRDSP_FMT<bits<5> op> : DSPInst {
+  bits<5> rs;
+  bits<10> mask;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-11} = mask;
+  let Inst{10-6} = op;
+  let Inst{5-0} = 0b111000;
+}
+
+class BPOSGE32_FMT<bits<5> op> : DSPInst {
+  bits<16> offset;
+
+  let Opcode = REGIMM_OPCODE.V;
+
+  let Inst{25-21} = 0;
+  let Inst{20-16} = op;
+  let Inst{15-0} = offset;
+}
+
+// INSV sub-class format.
+class INSV_FMT<bits<6> op> : DSPInst {
+  bits<5> rt;
+  bits<5> rs;
+
+  let Opcode = SPECIAL3_OPCODE.V;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6} = 0;
+  let Inst{5-0} = op;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
new file mode 100644
index 000000000000..3c116e1264b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -0,0 +1,1271 @@
+//===- MipsDSPInstrInfo.td - DSP ASE instructions -*- tablegen ------------*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips DSP ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// ImmLeaf
+def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
+def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
+def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
+
+// Mips-specific dsp nodes
+def SDT_MipsExtr : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
+                                        SDTCisVT<2, untyped>]>;
+def SDT_MipsShilo : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<1, i32>]>;
+def SDT_MipsDPA : SDTypeProfile<1, 3, [SDTCisVT<0, untyped>, SDTCisSameAs<0, 3>,
+                                       SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+
+class MipsDSPBase<string Opc, SDTypeProfile Prof> :
+  SDNode<!strconcat("MipsISD::", Opc), Prof>;
+
+class MipsDSPSideEffectBase<string Opc, SDTypeProfile Prof> :
+  SDNode<!strconcat("MipsISD::", Opc), Prof, [SDNPHasChain, SDNPSideEffect]>;
+
+def MipsEXTP : MipsDSPSideEffectBase<"EXTP", SDT_MipsExtr>;
+def MipsEXTPDP : MipsDSPSideEffectBase<"EXTPDP", SDT_MipsExtr>;
+def MipsEXTR_S_H : MipsDSPSideEffectBase<"EXTR_S_H", SDT_MipsExtr>;
+def MipsEXTR_W : MipsDSPSideEffectBase<"EXTR_W", SDT_MipsExtr>;
+def MipsEXTR_R_W : MipsDSPSideEffectBase<"EXTR_R_W", SDT_MipsExtr>;
+def MipsEXTR_RS_W : MipsDSPSideEffectBase<"EXTR_RS_W", SDT_MipsExtr>;
+
+def MipsSHILO : MipsDSPBase<"SHILO", SDT_MipsShilo>;
+def MipsMTHLIP : MipsDSPSideEffectBase<"MTHLIP", SDT_MipsShilo>;
+
+def MipsMULSAQ_S_W_PH : MipsDSPSideEffectBase<"MULSAQ_S_W_PH", SDT_MipsDPA>;
+def MipsMAQ_S_W_PHL : MipsDSPSideEffectBase<"MAQ_S_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_S_W_PHR : MipsDSPSideEffectBase<"MAQ_S_W_PHR", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHL : MipsDSPSideEffectBase<"MAQ_SA_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHR : MipsDSPSideEffectBase<"MAQ_SA_W_PHR", SDT_MipsDPA>;
+
+def MipsDPAU_H_QBL : MipsDSPBase<"DPAU_H_QBL", SDT_MipsDPA>;
+def MipsDPAU_H_QBR : MipsDSPBase<"DPAU_H_QBR", SDT_MipsDPA>;
+def MipsDPSU_H_QBL : MipsDSPBase<"DPSU_H_QBL", SDT_MipsDPA>;
+def MipsDPSU_H_QBR : MipsDSPBase<"DPSU_H_QBR", SDT_MipsDPA>;
+def MipsDPAQ_S_W_PH : MipsDSPSideEffectBase<"DPAQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQ_S_W_PH : MipsDSPSideEffectBase<"DPSQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQ_SA_L_W : MipsDSPSideEffectBase<"DPAQ_SA_L_W", SDT_MipsDPA>;
+def MipsDPSQ_SA_L_W : MipsDSPSideEffectBase<"DPSQ_SA_L_W", SDT_MipsDPA>;
+
+def MipsDPA_W_PH : MipsDSPBase<"DPA_W_PH", SDT_MipsDPA>;
+def MipsDPS_W_PH : MipsDSPBase<"DPS_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_S_W_PH : MipsDSPSideEffectBase<"DPAQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_SA_W_PH : MipsDSPSideEffectBase<"DPAQX_SA_W_PH", SDT_MipsDPA>;
+def MipsDPAX_W_PH : MipsDSPBase<"DPAX_W_PH", SDT_MipsDPA>;
+def MipsDPSX_W_PH : MipsDSPBase<"DPSX_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_S_W_PH : MipsDSPSideEffectBase<"DPSQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_SA_W_PH : MipsDSPSideEffectBase<"DPSQX_SA_W_PH", SDT_MipsDPA>;
+def MipsMULSA_W_PH : MipsDSPBase<"MULSA_W_PH", SDT_MipsDPA>;
+
+def MipsMULT : MipsDSPBase<"MULT", SDT_MipsDPA>;
+def MipsMULTU : MipsDSPBase<"MULTU", SDT_MipsDPA>;
+def MipsMADD_DSP : MipsDSPBase<"MADD_DSP", SDT_MipsDPA>;
+def MipsMADDU_DSP : MipsDSPBase<"MADDU_DSP", SDT_MipsDPA>;
+def MipsMSUB_DSP : MipsDSPBase<"MSUB_DSP", SDT_MipsDPA>;
+def MipsMSUBU_DSP : MipsDSPBase<"MSUBU_DSP", SDT_MipsDPA>;
+
+// Flags.
+class UseAC {
+  list<Register> Uses = [AC0];
+}
+
+class UseDSPCtrl {
+  list<Register> Uses = [DSPCtrl];
+}
+
+class ClearDefs {
+  list<Register> Defs = [];
+}
+
+// Instruction encoding.
+class ADDU_QB_ENC : ADDU_QB_FMT<0b00000>;
+class ADDU_S_QB_ENC : ADDU_QB_FMT<0b00100>;
+class SUBU_QB_ENC : ADDU_QB_FMT<0b00001>;
+class SUBU_S_QB_ENC : ADDU_QB_FMT<0b00101>;
+class ADDQ_PH_ENC : ADDU_QB_FMT<0b01010>;
+class ADDQ_S_PH_ENC : ADDU_QB_FMT<0b01110>;
+class SUBQ_PH_ENC : ADDU_QB_FMT<0b01011>;
+class SUBQ_S_PH_ENC : ADDU_QB_FMT<0b01111>;
+class ADDQ_S_W_ENC : ADDU_QB_FMT<0b10110>;
+class SUBQ_S_W_ENC : ADDU_QB_FMT<0b10111>;
+class ADDSC_ENC : ADDU_QB_FMT<0b10000>;
+class ADDWC_ENC : ADDU_QB_FMT<0b10001>;
+class MODSUB_ENC : ADDU_QB_FMT<0b10010>;
+class RADDU_W_QB_ENC : RADDU_W_QB_FMT<0b10100>;
+class ABSQ_S_PH_ENC : ABSQ_S_PH_R2_FMT<0b01001>;
+class ABSQ_S_W_ENC : ABSQ_S_PH_R2_FMT<0b10001>;
+class PRECRQ_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01100>;
+class PRECRQ_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10100>;
+class PRECRQ_RS_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10101>;
+class PRECRQU_S_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01111>;
+class PRECEQ_W_PHL_ENC : ABSQ_S_PH_R2_FMT<0b01100>;
+class PRECEQ_W_PHR_ENC : ABSQ_S_PH_R2_FMT<0b01101>;
+class PRECEQU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b00100>;
+class PRECEQU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b00101>;
+class PRECEQU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b00110>;
+class PRECEQU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b00111>;
+class PRECEU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b11100>;
+class PRECEU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b11101>;
+class PRECEU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b11110>;
+class PRECEU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b11111>;
+class SHLL_QB_ENC : SHLL_QB_FMT<0b00000>;
+class SHLLV_QB_ENC : SHLL_QB_FMT<0b00010>;
+class SHRL_QB_ENC : SHLL_QB_FMT<0b00001>;
+class SHRLV_QB_ENC : SHLL_QB_FMT<0b00011>;
+class SHLL_PH_ENC : SHLL_QB_FMT<0b01000>;
+class SHLLV_PH_ENC : SHLL_QB_FMT<0b01010>;
+class SHLL_S_PH_ENC : SHLL_QB_FMT<0b01100>;
+class SHLLV_S_PH_ENC : SHLL_QB_FMT<0b01110>;
+class SHRA_PH_ENC : SHLL_QB_FMT<0b01001>;
+class SHRAV_PH_ENC : SHLL_QB_FMT<0b01011>;
+class SHRA_R_PH_ENC : SHLL_QB_FMT<0b01101>;
+class SHRAV_R_PH_ENC : SHLL_QB_FMT<0b01111>;
+class SHLL_S_W_ENC : SHLL_QB_FMT<0b10100>;
+class SHLLV_S_W_ENC : SHLL_QB_FMT<0b10110>;
+class SHRA_R_W_ENC : SHLL_QB_FMT<0b10101>;
+class SHRAV_R_W_ENC : SHLL_QB_FMT<0b10111>;
+class MULEU_S_PH_QBL_ENC : ADDU_QB_FMT<0b00110>;
+class MULEU_S_PH_QBR_ENC : ADDU_QB_FMT<0b00111>;
+class MULEQ_S_W_PHL_ENC : ADDU_QB_FMT<0b11100>;
+class MULEQ_S_W_PHR_ENC : ADDU_QB_FMT<0b11101>;
+class MULQ_RS_PH_ENC : ADDU_QB_FMT<0b11111>;
+class MULSAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00110>;
+class MAQ_S_W_PHL_ENC : DPA_W_PH_FMT<0b10100>;
+class MAQ_S_W_PHR_ENC : DPA_W_PH_FMT<0b10110>;
+class MAQ_SA_W_PHL_ENC : DPA_W_PH_FMT<0b10000>;
+class MAQ_SA_W_PHR_ENC : DPA_W_PH_FMT<0b10010>;
+class DPAU_H_QBL_ENC : DPA_W_PH_FMT<0b00011>;
+class DPAU_H_QBR_ENC : DPA_W_PH_FMT<0b00111>;
+class DPSU_H_QBL_ENC : DPA_W_PH_FMT<0b01011>;
+class DPSU_H_QBR_ENC : DPA_W_PH_FMT<0b01111>;
+class DPAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00100>;
+class DPSQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00101>;
+class DPAQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01100>;
+class DPSQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01101>;
+class MULT_DSP_ENC : MULT_FMT<0b000000, 0b011000>;
+class MULTU_DSP_ENC : MULT_FMT<0b000000, 0b011001>;
+class MADD_DSP_ENC : MULT_FMT<0b011100, 0b000000>;
+class MADDU_DSP_ENC : MULT_FMT<0b011100, 0b000001>;
+class MSUB_DSP_ENC : MULT_FMT<0b011100, 0b000100>;
+class MSUBU_DSP_ENC : MULT_FMT<0b011100, 0b000101>;
+class CMPU_EQ_QB_ENC : CMP_EQ_QB_R2_FMT<0b00000>;
+class CMPU_LT_QB_ENC : CMP_EQ_QB_R2_FMT<0b00001>;
+class CMPU_LE_QB_ENC : CMP_EQ_QB_R2_FMT<0b00010>;
+class CMPGU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b00100>;
+class CMPGU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b00101>;
+class CMPGU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b00110>;
+class CMP_EQ_PH_ENC : CMP_EQ_QB_R2_FMT<0b01000>;
+class CMP_LT_PH_ENC : CMP_EQ_QB_R2_FMT<0b01001>;
+class CMP_LE_PH_ENC : CMP_EQ_QB_R2_FMT<0b01010>;
+class BITREV_ENC : ABSQ_S_PH_R2_FMT<0b11011>;
+class PACKRL_PH_ENC : CMP_EQ_QB_R3_FMT<0b01110>;
+class REPL_QB_ENC : REPL_FMT<0b00010>;
+class REPL_PH_ENC : REPL_FMT<0b01010>;
+class REPLV_QB_ENC : ABSQ_S_PH_R2_FMT<0b00011>;
+class REPLV_PH_ENC : ABSQ_S_PH_R2_FMT<0b01011>;
+class PICK_QB_ENC : CMP_EQ_QB_R3_FMT<0b00011>;
+class PICK_PH_ENC : CMP_EQ_QB_R3_FMT<0b01011>;
+class LWX_ENC : LX_FMT<0b00000>;
+class LHX_ENC : LX_FMT<0b00100>;
+class LBUX_ENC : LX_FMT<0b00110>;
+class BPOSGE32_ENC : BPOSGE32_FMT<0b11100>;
+class INSV_ENC : INSV_FMT<0b001100>;
+
+class EXTP_ENC : EXTR_W_TY1_FMT<0b00010>;
+class EXTPV_ENC : EXTR_W_TY1_FMT<0b00011>;
+class EXTPDP_ENC : EXTR_W_TY1_FMT<0b01010>;
+class EXTPDPV_ENC : EXTR_W_TY1_FMT<0b01011>;
+class EXTR_W_ENC : EXTR_W_TY1_FMT<0b00000>;
+class EXTRV_W_ENC : EXTR_W_TY1_FMT<0b00001>;
+class EXTR_R_W_ENC : EXTR_W_TY1_FMT<0b00100>;
+class EXTRV_R_W_ENC : EXTR_W_TY1_FMT<0b00101>;
+class EXTR_RS_W_ENC : EXTR_W_TY1_FMT<0b00110>;
+class EXTRV_RS_W_ENC : EXTR_W_TY1_FMT<0b00111>;
+class EXTR_S_H_ENC : EXTR_W_TY1_FMT<0b01110>;
+class EXTRV_S_H_ENC : EXTR_W_TY1_FMT<0b01111>;
+class SHILO_ENC : SHILO_R1_FMT<0b11010>;
+class SHILOV_ENC : SHILO_R2_FMT<0b11011>;
+class MTHLIP_ENC : SHILO_R2_FMT<0b11111>;
+
+class RDDSP_ENC : RDDSP_FMT<0b10010>;
+class WRDSP_ENC : WRDSP_FMT<0b10011>;
+class ADDU_PH_ENC : ADDU_QB_FMT<0b01000>;
+class ADDU_S_PH_ENC : ADDU_QB_FMT<0b01100>;
+class SUBU_PH_ENC : ADDU_QB_FMT<0b01001>;
+class SUBU_S_PH_ENC : ADDU_QB_FMT<0b01101>;
+class CMPGDU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b11000>;
+class CMPGDU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b11001>;
+class CMPGDU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b11010>;
+class ABSQ_S_QB_ENC : ABSQ_S_PH_R2_FMT<0b00001>;
+class ADDUH_QB_ENC : ADDUH_QB_FMT<0b00000>;
+class ADDUH_R_QB_ENC : ADDUH_QB_FMT<0b00010>;
+class SUBUH_QB_ENC : ADDUH_QB_FMT<0b00001>;
+class SUBUH_R_QB_ENC : ADDUH_QB_FMT<0b00011>;
+class ADDQH_PH_ENC : ADDUH_QB_FMT<0b01000>;
+class ADDQH_R_PH_ENC : ADDUH_QB_FMT<0b01010>;
+class SUBQH_PH_ENC : ADDUH_QB_FMT<0b01001>;
+class SUBQH_R_PH_ENC : ADDUH_QB_FMT<0b01011>;
+class ADDQH_W_ENC : ADDUH_QB_FMT<0b10000>;
+class ADDQH_R_W_ENC : ADDUH_QB_FMT<0b10010>;
+class SUBQH_W_ENC : ADDUH_QB_FMT<0b10001>;
+class SUBQH_R_W_ENC : ADDUH_QB_FMT<0b10011>;
+class MUL_PH_ENC : ADDUH_QB_FMT<0b01100>;
+class MUL_S_PH_ENC : ADDUH_QB_FMT<0b01110>;
+class MULQ_S_W_ENC : ADDUH_QB_FMT<0b10110>;
+class MULQ_RS_W_ENC : ADDUH_QB_FMT<0b10111>;
+class MULQ_S_PH_ENC : ADDU_QB_FMT<0b11110>;
+class DPA_W_PH_ENC : DPA_W_PH_FMT<0b00000>;
+class DPS_W_PH_ENC : DPA_W_PH_FMT<0b00001>;
+class DPAQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11000>;
+class DPAQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11010>;
+class DPAX_W_PH_ENC : DPA_W_PH_FMT<0b01000>;
+class DPSX_W_PH_ENC : DPA_W_PH_FMT<0b01001>;
+class DPSQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11001>;
+class DPSQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11011>;
+class MULSA_W_PH_ENC : DPA_W_PH_FMT<0b00010>;
+class PRECR_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01101>;
+class PRECR_SRA_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11110>;
+class PRECR_SRA_R_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11111>;
+class SHRA_QB_ENC : SHLL_QB_FMT<0b00100>;
+class SHRAV_QB_ENC : SHLL_QB_FMT<0b00110>;
+class SHRA_R_QB_ENC : SHLL_QB_FMT<0b00101>;
+class SHRAV_R_QB_ENC : SHLL_QB_FMT<0b00111>;
+class SHRL_PH_ENC : SHLL_QB_FMT<0b11001>;
+class SHRLV_PH_ENC : SHLL_QB_FMT<0b11011>;
+class APPEND_ENC : APPEND_FMT<0b00000>;
+class BALIGN_ENC : APPEND_FMT<0b10000>;
+class PREPEND_ENC : APPEND_FMT<0b00001>;
+
+// Instruction desc.
+class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        InstrItinClass itin, RegisterClass RCD,
+                        RegisterClass RCS,  RegisterClass RCT = RCS> {
+  dag OutOperandList = (outs RCD:$rd);
+  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           InstrItinClass itin, RegisterClass RCD,
+                           RegisterClass RCS = RCD> {
+  dag OutOperandList = (outs RCD:$rd);
+  dag InOperandList = (ins RCS:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             InstrItinClass itin, RegisterClass RCS,
+                             RegisterClass RCT = RCS> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
+  list<dag> Pattern = [(OpNode RCS:$rs, RCT:$rt)];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             InstrItinClass itin, RegisterClass RCD,
+                             RegisterClass RCS,  RegisterClass RCT = RCS> {
+  dag OutOperandList = (outs RCD:$rd);
+  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                               InstrItinClass itin, RegisterClass RCT,
+                               RegisterClass RCS = RCT> {
+  dag OutOperandList = (outs RCT:$rt);
+  dag InOperandList = (ins RCS:$rs, shamt:$sa, RCS:$src);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+  list<dag> Pattern = [(set RCT:$rt, (OpNode RCS:$src, RCS:$rs, immZExt5:$sa))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+  string Constraints = "$src = $rt";
+}
+
+class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             InstrItinClass itin, RegisterClass RCD,
+                             RegisterClass RCT = RCD> {
+  dag OutOperandList = (outs RCD:$rd);
+  dag InOperandList = (ins RCT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+  list<dag> Pattern = [(set RCD:$rd, (OpNode RCT:$rt))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     ImmLeaf immPat, InstrItinClass itin, RegisterClass RC> {
+  dag OutOperandList = (outs RC:$rd);
+  dag InOperandList = (ins uimm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
+  list<dag> Pattern = [(set RC:$rd, (OpNode immPat:$imm))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           InstrItinClass itin, RegisterClass RC> {
+  dag OutOperandList = (outs RC:$rd);
+  dag InOperandList =  (ins RC:$rt, CPURegs:$rs_sa);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+  list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, CPURegs:$rs_sa))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           SDPatternOperator ImmPat, InstrItinClass itin,
+                           RegisterClass RC> {
+  dag OutOperandList = (outs RC:$rd);
+  dag InOperandList = (ins RC:$rt, uimm16:$rs_sa);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+  list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, ImmPat:$rs_sa))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   InstrItinClass itin> {
+  dag OutOperandList = (outs CPURegs:$rd);
+  dag InOperandList = (ins CPURegs:$base, CPURegs:$index);
+  string AsmString = !strconcat(instr_asm, "\t$rd, ${index}(${base})");
+  list<dag> Pattern = [(set CPURegs:$rd,
+                       (OpNode CPURegs:$base, CPURegs:$index))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+  bit mayLoad = 1;
+}
+
+class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                         InstrItinClass itin, RegisterClass RCD,
+                         RegisterClass RCS = RCD,  RegisterClass RCT = RCD> {
+  dag OutOperandList = (outs RCD:$rd);
+  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SDPatternOperator ImmOp, InstrItinClass itin> {
+  dag OutOperandList = (outs CPURegs:$rt);
+  dag InOperandList = (ins CPURegs:$rs, shamt:$sa, CPURegs:$src);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+  list<dag> Pattern =  [(set CPURegs:$rt,
+                        (OpNode CPURegs:$src, CPURegs:$rs, ImmOp:$sa))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+  string Constraints = "$src = $rt";
+}
+
+class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              InstrItinClass itin> {
+  dag OutOperandList = (outs CPURegs:$rt);
+  dag InOperandList = (ins ACRegsDSP:$ac, CPURegs:$shift_rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              InstrItinClass itin> {
+  dag OutOperandList = (outs CPURegs:$rt);
+  dag InOperandList = (ins ACRegsDSP:$ac, uimm16:$shift_rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACRegsDSP:$ac);
+  dag InOperandList = (ins simm16:$shift, ACRegsDSP:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
+  list<dag> Pattern = [(set ACRegsDSP:$ac,
+                        (OpNode immSExt6:$shift, ACRegsDSP:$acin))];
+  list<Register> Defs = [DSPCtrl];
+  string Constraints = "$acin = $ac";
+}
+
+class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACRegsDSP:$ac);
+  dag InOperandList = (ins CPURegs:$rs, ACRegsDSP:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs");
+  list<dag> Pattern = [(set ACRegsDSP:$ac,
+                        (OpNode CPURegs:$rs, ACRegsDSP:$acin))];
+  list<Register> Defs = [DSPCtrl];
+  string Constraints = "$acin = $ac";
+}
+
+class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACRegsDSP:$ac);
+  dag InOperandList = (ins CPURegs:$rs, ACRegsDSP:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+  list<dag> Pattern = [(set ACRegsDSP:$ac,
+                        (OpNode CPURegs:$rs, ACRegsDSP:$acin))];
+  list<Register> Uses = [DSPCtrl];
+  string Constraints = "$acin = $ac";
+}
+
+class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                      InstrItinClass itin> {
+  dag OutOperandList = (outs CPURegs:$rd);
+  dag InOperandList = (ins uimm16:$mask);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
+  list<dag> Pattern = [(set CPURegs:$rd, (OpNode immZExt10:$mask))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Uses = [DSPCtrl];
+}
+
+class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                      InstrItinClass itin> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins CPURegs:$rs, uimm16:$mask);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
+  list<dag> Pattern = [(OpNode CPURegs:$rs, immZExt10:$mask)];
+  InstrItinClass Itinerary = itin;
+  list<Register> Defs = [DSPCtrl];
+}
+
+class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
+  dag OutOperandList = (outs ACRegsDSP:$ac);
+  dag InOperandList = (ins CPURegs:$rs, CPURegs:$rt, ACRegsDSP:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+  list<dag> Pattern = [(set ACRegsDSP:$ac,
+                        (OpNode CPURegs:$rs, CPURegs:$rt, ACRegsDSP:$acin))];
+  list<Register> Defs = [DSPCtrl];
+  string Constraints = "$acin = $ac";
+}
+
+class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs ACRegsDSP:$ac);
+  dag InOperandList = (ins CPURegs:$rs, CPURegs:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+  list<dag> Pattern = [(set ACRegsDSP:$ac, (OpNode CPURegs:$rs, CPURegs:$rt))];
+  InstrItinClass Itinerary = itin;
+  int AddedComplexity = 20;
+  bit isCommutable = 1;
+}
+
+class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs ACRegsDSP:$ac);
+  dag InOperandList = (ins CPURegs:$rs, CPURegs:$rt, ACRegsDSP:$acin);
+  string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+  list<dag> Pattern = [(set ACRegsDSP:$ac,
+                        (OpNode CPURegs:$rs, CPURegs:$rt, ACRegsDSP:$acin))];
+  InstrItinClass Itinerary = itin;
+  int AddedComplexity = 20;
+  string Constraints = "$acin = $ac";
+}
+
+class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
+  MipsPseudo<(outs CPURegs:$dst), (ins), [(set CPURegs:$dst, (OpNode))]> {
+  list<Register> Uses = [DSPCtrl];
+  bit usesCustomInserter = 1;
+}
+
+class BPOSGE32_DESC_BASE<string instr_asm, InstrItinClass itin> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins brtarget:$offset);
+  string AsmString = !strconcat(instr_asm, "\t$offset");
+  InstrItinClass Itinerary = itin;
+  list<Register> Uses = [DSPCtrl];
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 1;
+}
+
+class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs CPURegs:$rt);
+  dag InOperandList = (ins CPURegs:$src, CPURegs:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+  list<dag> Pattern = [(set CPURegs:$rt, (OpNode CPURegs:$src, CPURegs:$rs))];
+  InstrItinClass Itinerary = itin;
+  list<Register> Uses = [DSPCtrl];
+  string Constraints = "$src = $rt";
+}
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 1
+//===----------------------------------------------------------------------===//
+
+// Addition/subtraction
+class ADDU_QB_DESC : ADDU_QB_DESC_BASE<"addu.qb", int_mips_addu_qb, NoItinerary,
+                                       DSPRegs, DSPRegs>, IsCommutable;
+
+class ADDU_S_QB_DESC : ADDU_QB_DESC_BASE<"addu_s.qb", int_mips_addu_s_qb,
+                                         NoItinerary, DSPRegs, DSPRegs>,
+                       IsCommutable;
+
+class SUBU_QB_DESC : ADDU_QB_DESC_BASE<"subu.qb", int_mips_subu_qb, NoItinerary,
+                                       DSPRegs, DSPRegs>;
+
+class SUBU_S_QB_DESC : ADDU_QB_DESC_BASE<"subu_s.qb", int_mips_subu_s_qb,
+                                         NoItinerary, DSPRegs, DSPRegs>;
+
+class ADDQ_PH_DESC : ADDU_QB_DESC_BASE<"addq.ph", int_mips_addq_ph, NoItinerary,
+                                       DSPRegs, DSPRegs>, IsCommutable;
+
+class ADDQ_S_PH_DESC : ADDU_QB_DESC_BASE<"addq_s.ph", int_mips_addq_s_ph,
+                                         NoItinerary, DSPRegs, DSPRegs>,
+                       IsCommutable;
+
+class SUBQ_PH_DESC : ADDU_QB_DESC_BASE<"subq.ph", int_mips_subq_ph, NoItinerary,
+                                       DSPRegs, DSPRegs>;
+
+class SUBQ_S_PH_DESC : ADDU_QB_DESC_BASE<"subq_s.ph", int_mips_subq_s_ph,
+                                         NoItinerary, DSPRegs, DSPRegs>;
+
+class ADDQ_S_W_DESC : ADDU_QB_DESC_BASE<"addq_s.w", int_mips_addq_s_w,
+                                        NoItinerary, CPURegs, CPURegs>,
+                      IsCommutable;
+
+class SUBQ_S_W_DESC : ADDU_QB_DESC_BASE<"subq_s.w", int_mips_subq_s_w,
+                                        NoItinerary, CPURegs, CPURegs>;
+
+class ADDSC_DESC : ADDU_QB_DESC_BASE<"addsc", int_mips_addsc, NoItinerary,
+                                     CPURegs, CPURegs>, IsCommutable;
+
+class ADDWC_DESC : ADDU_QB_DESC_BASE<"addwc", int_mips_addwc, NoItinerary,
+                                     CPURegs, CPURegs>,
+                   IsCommutable, UseDSPCtrl;
+
+class MODSUB_DESC : ADDU_QB_DESC_BASE<"modsub", int_mips_modsub, NoItinerary,
+                                      CPURegs, CPURegs>, ClearDefs;
+
+class RADDU_W_QB_DESC : RADDU_W_QB_DESC_BASE<"raddu.w.qb", int_mips_raddu_w_qb,
+                                             NoItinerary, CPURegs, DSPRegs>,
+                        ClearDefs;
+
+// Absolute value
+class ABSQ_S_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.ph", int_mips_absq_s_ph,
+                                              NoItinerary, DSPRegs>;
+
+class ABSQ_S_W_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.w", int_mips_absq_s_w,
+                                             NoItinerary, CPURegs>;
+
+// Precision reduce/expand
+class PRECRQ_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.qb.ph",
+                                                 int_mips_precrq_qb_ph,
+                                                 NoItinerary, DSPRegs, DSPRegs>,
+                          ClearDefs;
+
+class PRECRQ_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.ph.w",
+                                                int_mips_precrq_ph_w,
+                                                NoItinerary, DSPRegs, CPURegs>,
+                         ClearDefs;
+
+class PRECRQ_RS_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq_rs.ph.w",
+                                                   int_mips_precrq_rs_ph_w,
+                                                   NoItinerary, DSPRegs,
+                                                   CPURegs>;
+
+class PRECRQU_S_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrqu_s.qb.ph",
+                                                    int_mips_precrqu_s_qb_ph,
+                                                    NoItinerary, DSPRegs,
+                                                    DSPRegs>;
+
+class PRECEQ_W_PHL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phl",
+                                                 int_mips_preceq_w_phl,
+                                                 NoItinerary, CPURegs, DSPRegs>,
+                          ClearDefs;
+
+class PRECEQ_W_PHR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phr",
+                                                 int_mips_preceq_w_phr,
+                                                 NoItinerary, CPURegs, DSPRegs>,
+                          ClearDefs;
+
+class PRECEQU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbl",
+                                                   int_mips_precequ_ph_qbl,
+                                                   NoItinerary, DSPRegs>,
+                            ClearDefs;
+
+class PRECEQU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbr",
+                                                   int_mips_precequ_ph_qbr,
+                                                   NoItinerary, DSPRegs>,
+                            ClearDefs;
+
+class PRECEQU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbla",
+                                                    int_mips_precequ_ph_qbla,
+                                                    NoItinerary, DSPRegs>,
+                             ClearDefs;
+
+class PRECEQU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbra",
+                                                    int_mips_precequ_ph_qbra,
+                                                    NoItinerary, DSPRegs>,
+                             ClearDefs;
+
+class PRECEU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbl",
+                                                  int_mips_preceu_ph_qbl,
+                                                  NoItinerary, DSPRegs>,
+                           ClearDefs;
+
+class PRECEU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbr",
+                                                  int_mips_preceu_ph_qbr,
+                                                  NoItinerary, DSPRegs>,
+                           ClearDefs;
+
+class PRECEU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbla",
+                                                   int_mips_preceu_ph_qbla,
+                                                   NoItinerary, DSPRegs>,
+                            ClearDefs;
+
+class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
+                                                   int_mips_preceu_ph_qbra,
+                                                   NoItinerary, DSPRegs>,
+                            ClearDefs;
+
+// Shift
+class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", int_mips_shll_qb, immZExt3,
+                                          NoItinerary, DSPRegs>;
+
+class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
+                                           NoItinerary, DSPRegs>;
+
+class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", int_mips_shrl_qb, immZExt3,
+                                          NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
+                                           NoItinerary, DSPRegs>, ClearDefs;
+
+class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", int_mips_shll_ph, immZExt4,
+                                          NoItinerary, DSPRegs>;
+
+class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
+                                           NoItinerary, DSPRegs>;
+
+class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
+                                            immZExt4, NoItinerary, DSPRegs>;
+
+class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
+                                             NoItinerary, DSPRegs>;
+
+class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", int_mips_shra_ph, immZExt4,
+                                          NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
+                                           NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
+                                            immZExt4, NoItinerary, DSPRegs>,
+                       ClearDefs;
+
+class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
+                                             NoItinerary, DSPRegs>, ClearDefs;
+
+class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
+                                           immZExt5, NoItinerary, CPURegs>;
+
+class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
+                                            NoItinerary, CPURegs>;
+
+class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
+                                           immZExt5, NoItinerary, CPURegs>,
+                      ClearDefs;
+
+class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
+                                            NoItinerary, CPURegs>;
+
+// Multiplication
+class MULEU_S_PH_QBL_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbl",
+                                              int_mips_muleu_s_ph_qbl,
+                                              NoItinerary, DSPRegs, DSPRegs>;
+
+class MULEU_S_PH_QBR_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbr",
+                                              int_mips_muleu_s_ph_qbr,
+                                              NoItinerary, DSPRegs, DSPRegs>;
+
+class MULEQ_S_W_PHL_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phl",
+                                             int_mips_muleq_s_w_phl,
+                                             NoItinerary, CPURegs, DSPRegs>,
+                           IsCommutable;
+
+class MULEQ_S_W_PHR_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phr",
+                                             int_mips_muleq_s_w_phr,
+                                             NoItinerary, CPURegs, DSPRegs>,
+                           IsCommutable;
+
+class MULQ_RS_PH_DESC : ADDU_QB_DESC_BASE<"mulq_rs.ph", int_mips_mulq_rs_ph,
+                                          NoItinerary, DSPRegs, DSPRegs>,
+                        IsCommutable;
+
+class MULSAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsaq_s.w.ph",
+                                              MipsMULSAQ_S_W_PH>;
+
+class MAQ_S_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phl", MipsMAQ_S_W_PHL>;
+
+class MAQ_S_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phr", MipsMAQ_S_W_PHR>;
+
+class MAQ_SA_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phl", MipsMAQ_SA_W_PHL>;
+
+class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr", MipsMAQ_SA_W_PHR>;
+
+// Dot product with accumulate/subtract
+class DPAU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbl", MipsDPAU_H_QBL>;
+
+class DPAU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbr", MipsDPAU_H_QBR>;
+
+class DPSU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbl", MipsDPSU_H_QBL>;
+
+class DPSU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbr", MipsDPSU_H_QBR>;
+
+class DPAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaq_s.w.ph", MipsDPAQ_S_W_PH>;
+
+class DPSQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsq_s.w.ph", MipsDPSQ_S_W_PH>;
+
+class DPAQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpaq_sa.l.w", MipsDPAQ_SA_L_W>;
+
+class DPSQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpsq_sa.l.w", MipsDPSQ_SA_L_W>;
+
+class MULT_DSP_DESC  : MULT_DESC_BASE<"mult", MipsMult, NoItinerary>;
+class MULTU_DSP_DESC : MULT_DESC_BASE<"multu", MipsMultu, NoItinerary>;
+class MADD_DSP_DESC  : MADD_DESC_BASE<"madd", MipsMAdd, NoItinerary>;
+class MADDU_DSP_DESC : MADD_DESC_BASE<"maddu", MipsMAddu, NoItinerary>;
+class MSUB_DSP_DESC  : MADD_DESC_BASE<"msub", MipsMSub, NoItinerary>;
+class MSUBU_DSP_DESC : MADD_DESC_BASE<"msubu", MipsMSubu, NoItinerary>;
+
+// Comparison
+class CMPU_EQ_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.eq.qb",
+                                               int_mips_cmpu_eq_qb, NoItinerary,
+                                               DSPRegs>, IsCommutable;
+
+class CMPU_LT_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.lt.qb",
+                                               int_mips_cmpu_lt_qb, NoItinerary,
+                                               DSPRegs>, IsCommutable;
+
+class CMPU_LE_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.le.qb",
+                                               int_mips_cmpu_le_qb, NoItinerary,
+                                               DSPRegs>, IsCommutable;
+
+class CMPGU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.eq.qb",
+                                                int_mips_cmpgu_eq_qb,
+                                                NoItinerary, CPURegs, DSPRegs>,
+                         IsCommutable;
+
+class CMPGU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.lt.qb",
+                                                int_mips_cmpgu_lt_qb,
+                                                NoItinerary, CPURegs, DSPRegs>,
+                         IsCommutable;
+
+class CMPGU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.le.qb",
+                                                int_mips_cmpgu_le_qb,
+                                                NoItinerary, CPURegs, DSPRegs>,
+                         IsCommutable;
+
+class CMP_EQ_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.eq.ph", int_mips_cmp_eq_ph,
+                                              NoItinerary, DSPRegs>,
+                       IsCommutable;
+
+class CMP_LT_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.lt.ph", int_mips_cmp_lt_ph,
+                                              NoItinerary, DSPRegs>,
+                       IsCommutable;
+
+class CMP_LE_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.le.ph", int_mips_cmp_le_ph,
+                                              NoItinerary, DSPRegs>,
+                       IsCommutable;
+
+// Misc
+class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
+                                           NoItinerary, CPURegs>, ClearDefs;
+
+class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
+                                              NoItinerary, DSPRegs, DSPRegs>,
+                       ClearDefs;
+
+class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, immZExt8,
+                                    NoItinerary, DSPRegs>, ClearDefs;
+
+class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, immZExt10,
+                                    NoItinerary, DSPRegs>, ClearDefs;
+
+class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
+                                             NoItinerary, DSPRegs, CPURegs>,
+                      ClearDefs;
+
+class REPLV_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
+                                             NoItinerary, DSPRegs, CPURegs>,
+                      ClearDefs;
+
+class PICK_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.qb", int_mips_pick_qb,
+                                            NoItinerary, DSPRegs, DSPRegs>,
+                     ClearDefs, UseDSPCtrl;
+
+class PICK_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.ph", int_mips_pick_ph,
+                                            NoItinerary, DSPRegs, DSPRegs>,
+                     ClearDefs, UseDSPCtrl;
+
+class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>, ClearDefs;
+
+class LHX_DESC : LX_DESC_BASE<"lhx", int_mips_lhx, NoItinerary>, ClearDefs;
+
+class LBUX_DESC : LX_DESC_BASE<"lbux", int_mips_lbux, NoItinerary>, ClearDefs;
+
+class BPOSGE32_DESC : BPOSGE32_DESC_BASE<"bposge32", NoItinerary>;
+
+// Extr
+class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", MipsEXTP, NoItinerary>;
+
+class EXTPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpv", MipsEXTP, NoItinerary>;
+
+class EXTPDP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>;
+
+class EXTPDPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpdpv", MipsEXTPDP,
+                                             NoItinerary>;
+
+class EXTR_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>;
+
+class EXTRV_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv.w", MipsEXTR_W,
+                                             NoItinerary>;
+
+class EXTR_R_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_r.w", MipsEXTR_R_W,
+                                              NoItinerary>;
+
+class EXTRV_R_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_r.w", MipsEXTR_R_W,
+                                               NoItinerary>;
+
+class EXTR_RS_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W,
+                                               NoItinerary>;
+
+class EXTRV_RS_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W,
+                                                NoItinerary>;
+
+class EXTR_S_H_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_s.h", MipsEXTR_S_H,
+                                              NoItinerary>;
+
+class EXTRV_S_H_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_s.h", MipsEXTR_S_H,
+                                               NoItinerary>;
+
+class SHILO_DESC : SHILO_R1_DESC_BASE<"shilo", MipsSHILO>;
+
+class SHILOV_DESC : SHILO_R2_DESC_BASE<"shilov", MipsSHILO>;
+
+class MTHLIP_DESC : MTHLIP_DESC_BASE<"mthlip", MipsMTHLIP>;
+
+class RDDSP_DESC : RDDSP_DESC_BASE<"rddsp", int_mips_rddsp, NoItinerary>;
+
+class WRDSP_DESC : WRDSP_DESC_BASE<"wrdsp", int_mips_wrdsp, NoItinerary>;
+
+class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>;
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 2
+// Addition/subtraction
+class ADDU_PH_DESC : ADDU_QB_DESC_BASE<"addu.ph", int_mips_addu_ph, NoItinerary,
+                                       DSPRegs, DSPRegs>, IsCommutable;
+
+class ADDU_S_PH_DESC : ADDU_QB_DESC_BASE<"addu_s.ph", int_mips_addu_s_ph,
+                                         NoItinerary, DSPRegs, DSPRegs>,
+                       IsCommutable;
+
+class SUBU_PH_DESC : ADDU_QB_DESC_BASE<"subu.ph", int_mips_subu_ph, NoItinerary,
+                                       DSPRegs, DSPRegs>;
+
+class SUBU_S_PH_DESC : ADDU_QB_DESC_BASE<"subu_s.ph", int_mips_subu_s_ph,
+                                         NoItinerary, DSPRegs, DSPRegs>;
+
+class ADDUH_QB_DESC : ADDUH_QB_DESC_BASE<"adduh.qb", int_mips_adduh_qb,
+                                         NoItinerary, DSPRegs>,
+                      ClearDefs, IsCommutable;
+
+class ADDUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"adduh_r.qb", int_mips_adduh_r_qb,
+                                           NoItinerary, DSPRegs>,
+                        ClearDefs, IsCommutable;
+
+class SUBUH_QB_DESC : ADDUH_QB_DESC_BASE<"subuh.qb", int_mips_subuh_qb,
+                                         NoItinerary, DSPRegs>, ClearDefs;
+
+class SUBUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"subuh_r.qb", int_mips_subuh_r_qb,
+                                           NoItinerary, DSPRegs>, ClearDefs;
+
+class ADDQH_PH_DESC : ADDUH_QB_DESC_BASE<"addqh.ph", int_mips_addqh_ph,
+                                         NoItinerary, DSPRegs>,
+                      ClearDefs, IsCommutable;
+
+class ADDQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"addqh_r.ph", int_mips_addqh_r_ph,
+                                           NoItinerary, DSPRegs>,
+                        ClearDefs, IsCommutable;
+
+class SUBQH_PH_DESC : ADDUH_QB_DESC_BASE<"subqh.ph", int_mips_subqh_ph,
+                                         NoItinerary, DSPRegs>, ClearDefs;
+
+class SUBQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"subqh_r.ph", int_mips_subqh_r_ph,
+                                           NoItinerary, DSPRegs>, ClearDefs;
+
+class ADDQH_W_DESC : ADDUH_QB_DESC_BASE<"addqh.w", int_mips_addqh_w,
+                                        NoItinerary, CPURegs>,
+                     ClearDefs, IsCommutable;
+
+class ADDQH_R_W_DESC : ADDUH_QB_DESC_BASE<"addqh_r.w", int_mips_addqh_r_w,
+                                          NoItinerary, CPURegs>,
+                       ClearDefs, IsCommutable;
+
+class SUBQH_W_DESC : ADDUH_QB_DESC_BASE<"subqh.w", int_mips_subqh_w,
+                                        NoItinerary, CPURegs>, ClearDefs;
+
+class SUBQH_R_W_DESC : ADDUH_QB_DESC_BASE<"subqh_r.w", int_mips_subqh_r_w,
+                                          NoItinerary, CPURegs>, ClearDefs;
+
+// Comparison
+class CMPGDU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.eq.qb",
+                                                 int_mips_cmpgdu_eq_qb,
+                                                 NoItinerary, CPURegs, DSPRegs>,
+                          IsCommutable;
+
+class CMPGDU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.lt.qb",
+                                                 int_mips_cmpgdu_lt_qb,
+                                                 NoItinerary, CPURegs, DSPRegs>,
+                          IsCommutable;
+
+class CMPGDU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.le.qb",
+                                                 int_mips_cmpgdu_le_qb,
+                                                 NoItinerary, CPURegs, DSPRegs>,
+                          IsCommutable;
+
+// Absolute
+class ABSQ_S_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.qb", int_mips_absq_s_qb,
+                                              NoItinerary, DSPRegs>;
+
+// Multiplication
+class MUL_PH_DESC : ADDUH_QB_DESC_BASE<"mul.ph", int_mips_mul_ph, NoItinerary,
+                                       DSPRegs>, IsCommutable;
+
+class MUL_S_PH_DESC : ADDUH_QB_DESC_BASE<"mul_s.ph", int_mips_mul_s_ph,
+                                         NoItinerary, DSPRegs>, IsCommutable;
+
+class MULQ_S_W_DESC : ADDUH_QB_DESC_BASE<"mulq_s.w", int_mips_mulq_s_w,
+                                         NoItinerary, CPURegs>, IsCommutable;
+
+class MULQ_RS_W_DESC : ADDUH_QB_DESC_BASE<"mulq_rs.w", int_mips_mulq_rs_w,
+                                          NoItinerary, CPURegs>, IsCommutable;
+
+class MULQ_S_PH_DESC : ADDU_QB_DESC_BASE<"mulq_s.ph", int_mips_mulq_s_ph,
+                                         NoItinerary, DSPRegs, DSPRegs>,
+                       IsCommutable;
+
+// Dot product with accumulate/subtract
+class DPA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpa.w.ph", MipsDPA_W_PH>;
+
+class DPS_W_PH_DESC : DPA_W_PH_DESC_BASE<"dps.w.ph", MipsDPS_W_PH>;
+
+class DPAQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_s.w.ph", MipsDPAQX_S_W_PH>;
+
+class DPAQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_sa.w.ph",
+                                              MipsDPAQX_SA_W_PH>;
+
+class DPAX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpax.w.ph", MipsDPAX_W_PH>;
+
+class DPSX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsx.w.ph", MipsDPSX_W_PH>;
+
+class DPSQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_s.w.ph", MipsDPSQX_S_W_PH>;
+
+class DPSQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_sa.w.ph",
+                                              MipsDPSQX_SA_W_PH>;
+
+class MULSA_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsa.w.ph", MipsMULSA_W_PH>;
+
+// Precision reduce/expand
+class PRECR_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precr.qb.ph",
+                                                int_mips_precr_qb_ph,
+                                                NoItinerary, DSPRegs, DSPRegs>;
+
+class PRECR_SRA_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra.ph.w",
+                                                     int_mips_precr_sra_ph_w,
+                                                     NoItinerary, DSPRegs,
+                                                     CPURegs>, ClearDefs;
+
+class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
+                                                      int_mips_precr_sra_r_ph_w,
+                                                       NoItinerary, DSPRegs,
+                                                       CPURegs>, ClearDefs;
+
+// Shift
+class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", int_mips_shra_qb, immZExt3,
+                                          NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
+                                           NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
+                                            immZExt3, NoItinerary, DSPRegs>,
+                       ClearDefs;
+
+class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
+                                             NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", int_mips_shrl_ph, immZExt4,
+                                          NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
+                                           NoItinerary, DSPRegs>, ClearDefs;
+
+// Misc
+class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5,
+                                     NoItinerary>, ClearDefs;
+
+class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, immZExt2,
+                                     NoItinerary>, ClearDefs;
+
+class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, immZExt5,
+                                      NoItinerary>, ClearDefs;
+
+// Pseudos.
+def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, NoItinerary>;
+
+// Instruction defs.
+// MIPS DSP Rev 1
+def ADDU_QB : ADDU_QB_ENC, ADDU_QB_DESC;
+def ADDU_S_QB : ADDU_S_QB_ENC, ADDU_S_QB_DESC;
+def SUBU_QB : SUBU_QB_ENC, SUBU_QB_DESC;
+def SUBU_S_QB : SUBU_S_QB_ENC, SUBU_S_QB_DESC;
+def ADDQ_PH : ADDQ_PH_ENC, ADDQ_PH_DESC;
+def ADDQ_S_PH : ADDQ_S_PH_ENC, ADDQ_S_PH_DESC;
+def SUBQ_PH : SUBQ_PH_ENC, SUBQ_PH_DESC;
+def SUBQ_S_PH : SUBQ_S_PH_ENC, SUBQ_S_PH_DESC;
+def ADDQ_S_W : ADDQ_S_W_ENC, ADDQ_S_W_DESC;
+def SUBQ_S_W : SUBQ_S_W_ENC, SUBQ_S_W_DESC;
+def ADDSC : ADDSC_ENC, ADDSC_DESC;
+def ADDWC : ADDWC_ENC, ADDWC_DESC;
+def MODSUB : MODSUB_ENC, MODSUB_DESC;
+def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC;
+def ABSQ_S_PH : ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
+def ABSQ_S_W : ABSQ_S_W_ENC, ABSQ_S_W_DESC;
+def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC;
+def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECEQ_W_PHL : PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC;
+def PRECEQ_W_PHR : PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC;
+def PRECEQU_PH_QBL : PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC;
+def PRECEQU_PH_QBR : PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC;
+def PRECEQU_PH_QBLA : PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC;
+def PRECEQU_PH_QBRA : PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC;
+def PRECEU_PH_QBL : PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC;
+def PRECEU_PH_QBR : PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC;
+def PRECEU_PH_QBLA : PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC;
+def PRECEU_PH_QBRA : PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC;
+def SHLL_QB : SHLL_QB_ENC, SHLL_QB_DESC;
+def SHLLV_QB : SHLLV_QB_ENC, SHLLV_QB_DESC;
+def SHRL_QB : SHRL_QB_ENC, SHRL_QB_DESC;
+def SHRLV_QB : SHRLV_QB_ENC, SHRLV_QB_DESC;
+def SHLL_PH : SHLL_PH_ENC, SHLL_PH_DESC;
+def SHLLV_PH : SHLLV_PH_ENC, SHLLV_PH_DESC;
+def SHLL_S_PH : SHLL_S_PH_ENC, SHLL_S_PH_DESC;
+def SHLLV_S_PH : SHLLV_S_PH_ENC, SHLLV_S_PH_DESC;
+def SHRA_PH : SHRA_PH_ENC, SHRA_PH_DESC;
+def SHRAV_PH : SHRAV_PH_ENC, SHRAV_PH_DESC;
+def SHRA_R_PH : SHRA_R_PH_ENC, SHRA_R_PH_DESC;
+def SHRAV_R_PH : SHRAV_R_PH_ENC, SHRAV_R_PH_DESC;
+def SHLL_S_W : SHLL_S_W_ENC, SHLL_S_W_DESC;
+def SHLLV_S_W : SHLLV_S_W_ENC, SHLLV_S_W_DESC;
+def SHRA_R_W : SHRA_R_W_ENC, SHRA_R_W_DESC;
+def SHRAV_R_W : SHRAV_R_W_ENC, SHRAV_R_W_DESC;
+def MULEU_S_PH_QBL : MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC;
+def MULEU_S_PH_QBR : MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC;
+def MULEQ_S_W_PHL : MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC;
+def MULEQ_S_W_PHR : MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC;
+def MULQ_RS_PH : MULQ_RS_PH_ENC, MULQ_RS_PH_DESC;
+def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC;
+def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC;
+def DPAU_H_QBL : DPAU_H_QBL_ENC, DPAU_H_QBL_DESC;
+def DPAU_H_QBR : DPAU_H_QBR_ENC, DPAU_H_QBR_DESC;
+def DPSU_H_QBL : DPSU_H_QBL_ENC, DPSU_H_QBL_DESC;
+def DPSU_H_QBR : DPSU_H_QBR_ENC, DPSU_H_QBR_DESC;
+def DPAQ_S_W_PH : DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC;
+def DPSQ_S_W_PH : DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC;
+def DPAQ_SA_L_W : DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC;
+def DPSQ_SA_L_W : DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC;
+def MULT_DSP : MULT_DSP_ENC, MULT_DSP_DESC;
+def MULTU_DSP : MULTU_DSP_ENC, MULTU_DSP_DESC;
+def MADD_DSP : MADD_DSP_ENC, MADD_DSP_DESC;
+def MADDU_DSP : MADDU_DSP_ENC, MADDU_DSP_DESC;
+def MSUB_DSP : MSUB_DSP_ENC, MSUB_DSP_DESC;
+def MSUBU_DSP : MSUBU_DSP_ENC, MSUBU_DSP_DESC;
+def CMPU_EQ_QB : CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC;
+def CMPU_LT_QB : CMPU_LT_QB_ENC, CMPU_LT_QB_DESC;
+def CMPU_LE_QB : CMPU_LE_QB_ENC, CMPU_LE_QB_DESC;
+def CMPGU_EQ_QB : CMPGU_EQ_QB_ENC, CMPGU_EQ_QB_DESC;
+def CMPGU_LT_QB : CMPGU_LT_QB_ENC, CMPGU_LT_QB_DESC;
+def CMPGU_LE_QB : CMPGU_LE_QB_ENC, CMPGU_LE_QB_DESC;
+def CMP_EQ_PH : CMP_EQ_PH_ENC, CMP_EQ_PH_DESC;
+def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC;
+def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC;
+def BITREV : BITREV_ENC, BITREV_DESC;
+def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC;
+def REPL_QB : REPL_QB_ENC, REPL_QB_DESC;
+def REPL_PH : REPL_PH_ENC, REPL_PH_DESC;
+def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC;
+def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC;
+def PICK_QB : PICK_QB_ENC, PICK_QB_DESC;
+def PICK_PH : PICK_PH_ENC, PICK_PH_DESC;
+def LWX : LWX_ENC, LWX_DESC;
+def LHX : LHX_ENC, LHX_DESC;
+def LBUX : LBUX_ENC, LBUX_DESC;
+def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC;
+def INSV : INSV_ENC, INSV_DESC;
+def EXTP : EXTP_ENC, EXTP_DESC;
+def EXTPV : EXTPV_ENC, EXTPV_DESC;
+def EXTPDP : EXTPDP_ENC, EXTPDP_DESC;
+def EXTPDPV : EXTPDPV_ENC, EXTPDPV_DESC;
+def EXTR_W : EXTR_W_ENC, EXTR_W_DESC;
+def EXTRV_W : EXTRV_W_ENC, EXTRV_W_DESC;
+def EXTR_R_W : EXTR_R_W_ENC, EXTR_R_W_DESC;
+def EXTRV_R_W : EXTRV_R_W_ENC, EXTRV_R_W_DESC;
+def EXTR_RS_W : EXTR_RS_W_ENC, EXTR_RS_W_DESC;
+def EXTRV_RS_W : EXTRV_RS_W_ENC, EXTRV_RS_W_DESC;
+def EXTR_S_H : EXTR_S_H_ENC, EXTR_S_H_DESC;
+def EXTRV_S_H : EXTRV_S_H_ENC, EXTRV_S_H_DESC;
+def SHILO : SHILO_ENC, SHILO_DESC;
+def SHILOV : SHILOV_ENC, SHILOV_DESC;
+def MTHLIP : MTHLIP_ENC, MTHLIP_DESC;
+def RDDSP : RDDSP_ENC, RDDSP_DESC;
+def WRDSP : WRDSP_ENC, WRDSP_DESC;
+
+// MIPS DSP Rev 2
+let Predicates = [HasDSPR2] in {
+
+def ADDU_PH : ADDU_PH_ENC, ADDU_PH_DESC;
+def ADDU_S_PH : ADDU_S_PH_ENC, ADDU_S_PH_DESC;
+def SUBU_PH : SUBU_PH_ENC, SUBU_PH_DESC;
+def SUBU_S_PH : SUBU_S_PH_ENC, SUBU_S_PH_DESC;
+def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC;
+def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC;
+def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC;
+def ABSQ_S_QB : ABSQ_S_QB_ENC, ABSQ_S_QB_DESC;
+def ADDUH_QB : ADDUH_QB_ENC, ADDUH_QB_DESC;
+def ADDUH_R_QB : ADDUH_R_QB_ENC, ADDUH_R_QB_DESC;
+def SUBUH_QB : SUBUH_QB_ENC, SUBUH_QB_DESC;
+def SUBUH_R_QB : SUBUH_R_QB_ENC, SUBUH_R_QB_DESC;
+def ADDQH_PH : ADDQH_PH_ENC, ADDQH_PH_DESC;
+def ADDQH_R_PH : ADDQH_R_PH_ENC, ADDQH_R_PH_DESC;
+def SUBQH_PH : SUBQH_PH_ENC, SUBQH_PH_DESC;
+def SUBQH_R_PH : SUBQH_R_PH_ENC, SUBQH_R_PH_DESC;
+def ADDQH_W : ADDQH_W_ENC, ADDQH_W_DESC;
+def ADDQH_R_W : ADDQH_R_W_ENC, ADDQH_R_W_DESC;
+def SUBQH_W : SUBQH_W_ENC, SUBQH_W_DESC;
+def SUBQH_R_W : SUBQH_R_W_ENC, SUBQH_R_W_DESC;
+def MUL_PH : MUL_PH_ENC, MUL_PH_DESC;
+def MUL_S_PH : MUL_S_PH_ENC, MUL_S_PH_DESC;
+def MULQ_S_W : MULQ_S_W_ENC, MULQ_S_W_DESC;
+def MULQ_RS_W : MULQ_RS_W_ENC, MULQ_RS_W_DESC;
+def MULQ_S_PH : MULQ_S_PH_ENC, MULQ_S_PH_DESC;
+def DPA_W_PH : DPA_W_PH_ENC, DPA_W_PH_DESC;
+def DPS_W_PH : DPS_W_PH_ENC, DPS_W_PH_DESC;
+def DPAQX_S_W_PH : DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC;
+def DPAQX_SA_W_PH : DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC;
+def DPAX_W_PH : DPAX_W_PH_ENC, DPAX_W_PH_DESC;
+def DPSX_W_PH : DPSX_W_PH_ENC, DPSX_W_PH_DESC;
+def DPSQX_S_W_PH : DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC;
+def DPSQX_SA_W_PH : DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC;
+def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC;
+def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC;
+def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC;
+def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC;
+def SHRA_QB : SHRA_QB_ENC, SHRA_QB_DESC;
+def SHRAV_QB : SHRAV_QB_ENC, SHRAV_QB_DESC;
+def SHRA_R_QB : SHRA_R_QB_ENC, SHRA_R_QB_DESC;
+def SHRAV_R_QB : SHRAV_R_QB_ENC, SHRAV_R_QB_DESC;
+def SHRL_PH : SHRL_PH_ENC, SHRL_PH_DESC;
+def SHRLV_PH : SHRLV_PH_ENC, SHRLV_PH_DESC;
+def APPEND : APPEND_ENC, APPEND_DESC;
+def BALIGN : BALIGN_ENC, BALIGN_DESC;
+def PREPEND : PREPEND_ENC, PREPEND_DESC;
+
+}
+
+// Pseudos.
+/// Pseudo instructions for loading, storing and copying accumulator registers.
+let isPseudo = 1 in {
+  defm LOAD_AC_DSP  : LoadM<"load_ac_dsp", ACRegsDSP>;
+  defm STORE_AC_DSP : StoreM<"store_ac_dsp", ACRegsDSP>;
+}
+
+def COPY_AC_DSP : PseudoSE<(outs ACRegsDSP:$dst), (ins ACRegsDSP:$src), []>;
+
+// Patterns.
+class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
+  Pat<pattern, result>, Requires<[pred]>;
+
+class BitconvertPat<ValueType DstVT, ValueType SrcVT, RegisterClass DstRC,
+                    RegisterClass SrcRC> :
+   DSPPat<(DstVT (bitconvert (SrcVT SrcRC:$src))),
+          (COPY_TO_REGCLASS SrcRC:$src, DstRC)>;
+
+def : BitconvertPat<i32, v2i16, CPURegs, DSPRegs>;
+def : BitconvertPat<i32, v4i8, CPURegs, DSPRegs>;
+def : BitconvertPat<v2i16, i32, DSPRegs, CPURegs>;
+def : BitconvertPat<v4i8, i32, DSPRegs, CPURegs>;
+
+def : DSPPat<(v2i16 (load addr:$a)),
+             (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
+def : DSPPat<(v4i8 (load addr:$a)),
+             (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
+def : DSPPat<(store (v2i16 DSPRegs:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPRegs:$val, CPURegs), addr:$a)>;
+def : DSPPat<(store (v4i8 DSPRegs:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPRegs:$val, CPURegs), addr:$a)>;
+
+// Extr patterns.
+class EXTR_W_TY1_R2_Pat<SDPatternOperator OpNode, Instruction Instr> :
+  DSPPat<(i32 (OpNode CPURegs:$rs, ACRegsDSP:$ac)),
+         (Instr ACRegsDSP:$ac, CPURegs:$rs)>;
+
+class EXTR_W_TY1_R1_Pat<SDPatternOperator OpNode, Instruction Instr> :
+  DSPPat<(i32 (OpNode immZExt5:$shift, ACRegsDSP:$ac)),
+         (Instr ACRegsDSP:$ac, immZExt5:$shift)>;
+
+def : EXTR_W_TY1_R1_Pat<MipsEXTP, EXTP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTP, EXTPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTPDP, EXTPDP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTPDP, EXTPDPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_W, EXTR_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_W, EXTRV_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_R_W, EXTR_R_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_R_W, EXTRV_R_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_RS_W, EXTR_RS_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_RS_W, EXTRV_RS_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_S_H, EXTR_S_H>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_S_H, EXTRV_S_H>;
+
+// mflo/hi patterns.
+let AddedComplexity = 20 in
+def : DSPPat<(i32 (ExtractLOHI ACRegsDSP:$ac, imm:$lohi_idx)),
+             (EXTRACT_SUBREG ACRegsDSP:$ac, imm:$lohi_idx)>;
+
+// Indexed load patterns.
+class IndexedLoadPat<SDPatternOperator LoadNode, Instruction Instr> :
+  DSPPat<(i32 (LoadNode (add i32:$base, i32:$index))),
+         (Instr i32:$base, i32:$index)>;
+
+let AddedComplexity = 20 in {
+  def : IndexedLoadPat<zextloadi8, LBUX>;
+  def : IndexedLoadPat<sextloadi16, LHX>;
+  def : IndexedLoadPat<load, LWX>;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
new file mode 100644
index 000000000000..d07a595af38a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -0,0 +1,721 @@
+//===-- MipsDelaySlotFiller.cpp - Mips Delay Slot Filler ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple pass to fill delay slots with useful instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
+                       " are not NOP.");
+
+static cl::opt<bool> DisableDelaySlotFiller(
+  "disable-mips-delay-filler",
+  cl::init(false),
+  cl::desc("Fill all delay slots with NOPs."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableForwardSearch(
+  "disable-mips-df-forward-search",
+  cl::init(true),
+  cl::desc("Disallow MIPS delay filler to search forward."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableSuccBBSearch(
+  "disable-mips-df-succbb-search",
+  cl::init(true),
+  cl::desc("Disallow MIPS delay filler to search successor basic blocks."),
+  cl::Hidden);
+
+static cl::opt<bool> DisableBackwardSearch(
+  "disable-mips-df-backward-search",
+  cl::init(false),
+  cl::desc("Disallow MIPS delay filler to search backward."),
+  cl::Hidden);
+
+namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+  typedef SmallDenseMap<MachineBasicBlock*, MachineInstr*, 2> BB2BrMap;
+
+  /// \brief A functor comparing edge weight of two blocks.
+  struct CmpWeight {
+    CmpWeight(const MachineBasicBlock &S,
+              const MachineBranchProbabilityInfo &P) : Src(S), Prob(P) {}
+
+    bool operator()(const MachineBasicBlock *Dst0,
+                    const MachineBasicBlock *Dst1) const {
+      return Prob.getEdgeWeight(&Src, Dst0) < Prob.getEdgeWeight(&Src, Dst1);
+    }
+
+    const MachineBasicBlock &Src;
+    const MachineBranchProbabilityInfo &Prob;
+  };
+
+  class RegDefsUses {
+  public:
+    RegDefsUses(TargetMachine &TM);
+    void init(const MachineInstr &MI);
+
+    /// This function sets all caller-saved registers in Defs.
+    void setCallerSaved(const MachineInstr &MI);
+
+    /// This function sets all unallocatable registers in Defs.
+    void setUnallocatableRegs(const MachineFunction &MF);
+
+    /// Set bits in Uses corresponding to MBB's live-out registers except for
+    /// the registers that are live-in to SuccBB.
+    void addLiveOut(const MachineBasicBlock &MBB,
+                    const MachineBasicBlock &SuccBB);
+
+    bool update(const MachineInstr &MI, unsigned Begin, unsigned End);
+
+  private:
+    bool checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses, unsigned Reg,
+                          bool IsDef) const;
+
+    /// Returns true if Reg or its alias is in RegSet.
+    bool isRegInSet(const BitVector &RegSet, unsigned Reg) const;
+
+    const TargetRegisterInfo &TRI;
+    BitVector Defs, Uses;
+  };
+
+  /// Base class for inspecting loads and stores.
+  class InspectMemInstr {
+  public:
+    InspectMemInstr(bool ForbidMemInstr_)
+      : OrigSeenLoad(false), OrigSeenStore(false), SeenLoad(false),
+        SeenStore(false), ForbidMemInstr(ForbidMemInstr_) {}
+
+    /// Return true if MI cannot be moved to delay slot.
+    bool hasHazard(const MachineInstr &MI);
+
+    virtual ~InspectMemInstr() {}
+
+  protected:
+    /// Flags indicating whether loads or stores have been seen.
+    bool OrigSeenLoad, OrigSeenStore, SeenLoad, SeenStore;
+
+    /// Memory instructions are not allowed to move to delay slot if this flag
+    /// is true.
+    bool ForbidMemInstr;
+
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI) = 0;
+  };
+
+  /// This subclass rejects any memory instructions.
+  class NoMemInstr : public InspectMemInstr {
+  public:
+    NoMemInstr() : InspectMemInstr(true) {}
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI) { return true; }
+  };
+
+  /// This subclass accepts loads from stacks and constant loads.
+  class LoadFromStackOrConst : public InspectMemInstr {
+  public:
+    LoadFromStackOrConst() : InspectMemInstr(false) {}
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI);
+  };
+
+  /// This subclass uses memory dependence information to determine whether a
+  /// memory instruction can be moved to a delay slot.
+  class MemDefsUses : public InspectMemInstr {
+  public:
+    MemDefsUses(const MachineFrameInfo *MFI);
+
+  private:
+    virtual bool hasHazard_(const MachineInstr &MI);
+
+    /// Update Defs and Uses. Return true if there exist dependences that
+    /// disqualify the delay slot candidate between V and values in Uses and
+    /// Defs.
+    bool updateDefsUses(const Value *V, bool MayStore);
+
+    /// Get the list of underlying objects of MI's memory operand.
+    bool getUnderlyingObjects(const MachineInstr &MI,
+                              SmallVectorImpl<const Value *> &Objects) const;
+
+    const MachineFrameInfo *MFI;
+    SmallPtrSet<const Value*, 4> Uses, Defs;
+
+    /// Flags indicating whether loads or stores with no underlying objects have
+    /// been seen.
+    bool SeenNoObjLoad, SeenNoObjStore;
+  };
+
+  class Filler : public MachineFunctionPass {
+  public:
+    Filler(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "Mips Delay Slot Filler";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F) {
+      bool Changed = false;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+      return Changed;
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineBranchProbabilityInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+
+    /// This function checks if it is valid to move Candidate to the delay slot
+    /// and returns true if it isn't. It also updates memory and register
+    /// dependence information.
+    bool delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                        InspectMemInstr &IM) const;
+
+    /// This function searches range [Begin, End) for an instruction that can be
+    /// moved to the delay slot. Returns true on success.
+    template<typename IterTy>
+    bool searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+                     RegDefsUses &RegDU, InspectMemInstr &IM,
+                     IterTy &Filler) const;
+
+    /// This function searches in the backward direction for an instruction that
+    /// can be moved to the delay slot. Returns true on success.
+    bool searchBackward(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// This function searches MBB in the forward direction for an instruction
+    /// that can be moved to the delay slot. Returns true on success.
+    bool searchForward(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// This function searches one of MBB's successor blocks for an instruction
+    /// that can be moved to the delay slot and inserts clones of the
+    /// instruction into the successor's predecessor blocks.
+    bool searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const;
+
+    /// Pick a successor block of MBB. Return NULL if MBB doesn't have a
+    /// successor block that is not a landing pad.
+    MachineBasicBlock *selectSuccBB(MachineBasicBlock &B) const;
+
+    /// This function analyzes MBB and returns an instruction with an unoccupied
+    /// slot that branches to Dst.
+    std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+    getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const;
+
+    /// Examine Pred and see if it is possible to insert an instruction into
+    /// one of its branches delay slot or its end.
+    bool examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                     RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                     BB2BrMap &BrMap) const;
+
+    bool terminateSearch(const MachineInstr &Candidate) const;
+
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+static bool hasUnoccupiedSlot(const MachineInstr *MI) {
+  return MI->hasDelaySlot() && !MI->isBundledWithSucc();
+}
+
+/// This function inserts clones of Filler into predecessor blocks.
+static void insertDelayFiller(Iter Filler, const BB2BrMap &BrMap) {
+  MachineFunction *MF = Filler->getParent()->getParent();
+
+  for (BB2BrMap::const_iterator I = BrMap.begin(); I != BrMap.end(); ++I) {
+    if (I->second) {
+      MIBundleBuilder(I->second).append(MF->CloneMachineInstr(&*Filler));
+      ++UsefulSlots;
+    } else {
+      I->first->insert(I->first->end(), MF->CloneMachineInstr(&*Filler));
+    }
+  }
+}
+
+/// This function adds registers Filler defines to MBB's live-in register list.
+static void addLiveInRegs(Iter Filler, MachineBasicBlock &MBB) {
+  for (unsigned I = 0, E = Filler->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = Filler->getOperand(I);
+    unsigned R;
+
+    if (!MO.isReg() || !MO.isDef() || !(R = MO.getReg()))
+      continue;
+
+#ifndef NDEBUG
+    const MachineFunction &MF = *MBB.getParent();
+    assert(MF.getTarget().getRegisterInfo()->getAllocatableSet(MF).test(R) &&
+           "Shouldn't move an instruction with unallocatable registers across "
+           "basic block boundaries.");
+#endif
+
+    if (!MBB.isLiveIn(R))
+      MBB.addLiveIn(R);
+  }
+}
+
+RegDefsUses::RegDefsUses(TargetMachine &TM)
+  : TRI(*TM.getRegisterInfo()), Defs(TRI.getNumRegs(), false),
+    Uses(TRI.getNumRegs(), false) {}
+
+void RegDefsUses::init(const MachineInstr &MI) {
+  // Add all register operands which are explicit and non-variadic.
+  update(MI, 0, MI.getDesc().getNumOperands());
+
+  // If MI is a call, add RA to Defs to prevent users of RA from going into
+  // delay slot.
+  if (MI.isCall())
+    Defs.set(Mips::RA);
+
+  // Add all implicit register operands of branch instructions except
+  // register AT.
+  if (MI.isBranch()) {
+    update(MI, MI.getDesc().getNumOperands(), MI.getNumOperands());
+    Defs.reset(Mips::AT);
+  }
+}
+
+void RegDefsUses::setCallerSaved(const MachineInstr &MI) {
+  assert(MI.isCall());
+
+  // If MI is a call, add all caller-saved registers to Defs.
+  BitVector CallerSavedRegs(TRI.getNumRegs(), true);
+
+  CallerSavedRegs.reset(Mips::ZERO);
+  CallerSavedRegs.reset(Mips::ZERO_64);
+
+  for (const MCPhysReg *R = TRI.getCalleeSavedRegs(); *R; ++R)
+    for (MCRegAliasIterator AI(*R, &TRI, true); AI.isValid(); ++AI)
+      CallerSavedRegs.reset(*AI);
+
+  Defs |= CallerSavedRegs;
+}
+
+void RegDefsUses::setUnallocatableRegs(const MachineFunction &MF) {
+  BitVector AllocSet = TRI.getAllocatableSet(MF);
+
+  for (int R = AllocSet.find_first(); R != -1; R = AllocSet.find_next(R))
+    for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI)
+      AllocSet.set(*AI);
+
+  AllocSet.set(Mips::ZERO);
+  AllocSet.set(Mips::ZERO_64);
+
+  Defs |= AllocSet.flip();
+}
+
+void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB,
+                             const MachineBasicBlock &SuccBB) {
+  for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+       SE = MBB.succ_end(); SI != SE; ++SI)
+    if (*SI != &SuccBB)
+      for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(),
+           LE = (*SI)->livein_end(); LI != LE; ++LI)
+        Uses.set(*LI);
+}
+
+bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) {
+  BitVector NewDefs(TRI.getNumRegs()), NewUses(TRI.getNumRegs());
+  bool HasHazard = false;
+
+  for (unsigned I = Begin; I != End; ++I) {
+    const MachineOperand &MO = MI.getOperand(I);
+
+    if (MO.isReg() && MO.getReg())
+      HasHazard |= checkRegDefsUses(NewDefs, NewUses, MO.getReg(), MO.isDef());
+  }
+
+  Defs |= NewDefs;
+  Uses |= NewUses;
+
+  return HasHazard;
+}
+
+bool RegDefsUses::checkRegDefsUses(BitVector &NewDefs, BitVector &NewUses,
+                                   unsigned Reg, bool IsDef) const {
+  if (IsDef) {
+    NewDefs.set(Reg);
+    // check whether Reg has already been defined or used.
+    return (isRegInSet(Defs, Reg) || isRegInSet(Uses, Reg));
+  }
+
+  NewUses.set(Reg);
+  // check whether Reg has already been defined.
+  return isRegInSet(Defs, Reg);
+}
+
+bool RegDefsUses::isRegInSet(const BitVector &RegSet, unsigned Reg) const {
+  // Check Reg and all aliased Registers.
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    if (RegSet.test(*AI))
+      return true;
+  return false;
+}
+
+bool InspectMemInstr::hasHazard(const MachineInstr &MI) {
+  if (!MI.mayStore() && !MI.mayLoad())
+    return false;
+
+  if (ForbidMemInstr)
+    return true;
+
+  OrigSeenLoad = SeenLoad;
+  OrigSeenStore = SeenStore;
+  SeenLoad |= MI.mayLoad();
+  SeenStore |= MI.mayStore();
+
+  // If MI is an ordered or volatile memory reference, disallow moving
+  // subsequent loads and stores to delay slot.
+  if (MI.hasOrderedMemoryRef() && (OrigSeenLoad || OrigSeenStore)) {
+    ForbidMemInstr = true;
+    return true;
+  }
+
+  return hasHazard_(MI);
+}
+
+bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
+  if (MI.mayStore())
+    return true;
+
+  if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getValue())
+    return true;
+
+  const Value *V = (*MI.memoperands_begin())->getValue();
+
+  if (isa<FixedStackPseudoSourceValue>(V))
+    return false;
+
+  if (const PseudoSourceValue *PSV = dyn_cast<const PseudoSourceValue>(V))
+    return !PSV->PseudoSourceValue::isConstant(0) &&
+      (V != PseudoSourceValue::getStack());
+
+  return true;
+}
+
+MemDefsUses::MemDefsUses(const MachineFrameInfo *MFI_)
+  : InspectMemInstr(false), MFI(MFI_), SeenNoObjLoad(false),
+    SeenNoObjStore(false) {}
+
+bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
+  bool HasHazard = false;
+  SmallVector<const Value *, 4> Objs;
+
+  // Check underlying object list.
+  if (getUnderlyingObjects(MI, Objs)) {
+    for (SmallVector<const Value *, 4>::const_iterator I = Objs.begin();
+         I != Objs.end(); ++I)
+      HasHazard |= updateDefsUses(*I, MI.mayStore());
+
+    return HasHazard;
+  }
+
+  // No underlying objects found.
+  HasHazard = MI.mayStore() && (OrigSeenLoad || OrigSeenStore);
+  HasHazard |= MI.mayLoad() || OrigSeenStore;
+
+  SeenNoObjLoad |= MI.mayLoad();
+  SeenNoObjStore |= MI.mayStore();
+
+  return HasHazard;
+}
+
+bool MemDefsUses::updateDefsUses(const Value *V, bool MayStore) {
+  if (MayStore)
+    return !Defs.insert(V) || Uses.count(V) || SeenNoObjStore || SeenNoObjLoad;
+
+  Uses.insert(V);
+  return Defs.count(V) || SeenNoObjStore;
+}
+
+bool MemDefsUses::
+getUnderlyingObjects(const MachineInstr &MI,
+                     SmallVectorImpl<const Value *> &Objects) const {
+  if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getValue())
+    return false;
+
+  const Value *V = (*MI.memoperands_begin())->getValue();
+
+  SmallVector<Value *, 4> Objs;
+  GetUnderlyingObjects(const_cast<Value *>(V), Objs);
+
+  for (SmallVector<Value*, 4>::iterator I = Objs.begin(), E = Objs.end();
+       I != E; ++I) {
+    if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(*I)) {
+      if (PSV->isAliased(MFI))
+        return false;
+    } else if (!isIdentifiedObject(V))
+      return false;
+
+    Objects.push_back(*I);
+  }
+
+  return true;
+}
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// We assume there is only one delay slot per delayed instruction.
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  for (Iter I = MBB.begin(); I != MBB.end(); ++I) {
+    if (!hasUnoccupiedSlot(&*I))
+      continue;
+
+    ++FilledSlots;
+    Changed = true;
+
+    // Delay slot filling is disabled at -O0.
+    if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
+      if (searchBackward(MBB, I))
+        continue;
+
+      if (I->isTerminator()) {
+        if (searchSuccBBs(MBB, I))
+          continue;
+      } else if (searchForward(MBB, I)) {
+        continue;
+      }
+    }
+
+    // Bundle the NOP to the instruction with the delay slot.
+    BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
+    MIBundleBuilder(MBB, I, llvm::next(llvm::next(I)));
+  }
+
+  return Changed;
+}
+
+/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Mips MachineFunctions
+FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
+  return new Filler(tm);
+}
+
+template<typename IterTy>
+bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
+                         RegDefsUses &RegDU, InspectMemInstr& IM,
+                         IterTy &Filler) const {
+  for (IterTy I = Begin; I != End; ++I) {
+    // skip debug value
+    if (I->isDebugValue())
+      continue;
+
+    if (terminateSearch(*I))
+      break;
+
+    assert((!I->isCall() && !I->isReturn() && !I->isBranch()) &&
+           "Cannot put calls, returns or branches in delay slot.");
+
+    if (delayHasHazard(*I, RegDU, IM))
+      continue;
+
+    Filler = I;
+    return true;
+  }
+
+  return false;
+}
+
+bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const {
+  if (DisableBackwardSearch)
+    return false;
+
+  RegDefsUses RegDU(TM);
+  MemDefsUses MemDU(MBB.getParent()->getFrameInfo());
+  ReverseIter Filler;
+
+  RegDU.init(*Slot);
+
+  if (searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler)) {
+    MBB.splice(llvm::next(Slot), &MBB, llvm::next(Filler).base());
+    MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+    ++UsefulSlots;
+    return true;
+  }
+
+  return false;
+}
+
+bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
+  // Can handle only calls.
+  if (DisableForwardSearch || !Slot->isCall())
+    return false;
+
+  RegDefsUses RegDU(TM);
+  NoMemInstr NM;
+  Iter Filler;
+
+  RegDU.setCallerSaved(*Slot);
+
+  if (searchRange(MBB, llvm::next(Slot), MBB.end(), RegDU, NM, Filler)) {
+    MBB.splice(llvm::next(Slot), &MBB, Filler);
+    MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+    ++UsefulSlots;
+    return true;
+  }
+
+  return false;
+}
+
+bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
+  if (DisableSuccBBSearch)
+    return false;
+
+  MachineBasicBlock *SuccBB = selectSuccBB(MBB);
+
+  if (!SuccBB)
+    return false;
+
+  RegDefsUses RegDU(TM);
+  bool HasMultipleSuccs = false;
+  BB2BrMap BrMap;
+  OwningPtr<InspectMemInstr> IM;
+  Iter Filler;
+
+  // Iterate over SuccBB's predecessor list.
+  for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(),
+       PE = SuccBB->pred_end(); PI != PE; ++PI)
+    if (!examinePred(**PI, *SuccBB, RegDU, HasMultipleSuccs, BrMap))
+      return false;
+
+  // Do not allow moving instructions which have unallocatable register operands
+  // across basic block boundaries.
+  RegDU.setUnallocatableRegs(*MBB.getParent());
+
+  // Only allow moving loads from stack or constants if any of the SuccBB's
+  // predecessors have multiple successors.
+  if (HasMultipleSuccs) {
+    IM.reset(new LoadFromStackOrConst());
+  } else {
+    const MachineFrameInfo *MFI = MBB.getParent()->getFrameInfo();
+    IM.reset(new MemDefsUses(MFI));
+  }
+
+  if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Filler))
+    return false;
+
+  insertDelayFiller(Filler, BrMap);
+  addLiveInRegs(Filler, *SuccBB);
+  Filler->eraseFromParent();
+
+  return true;
+}
+
+MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
+  if (B.succ_empty())
+    return NULL;
+
+  // Select the successor with the larget edge weight.
+  CmpWeight Cmp(B, getAnalysis<MachineBranchProbabilityInfo>());
+  MachineBasicBlock *S = *std::max_element(B.succ_begin(), B.succ_end(), Cmp);
+  return S->isLandingPad() ? NULL : S;
+}
+
+std::pair<MipsInstrInfo::BranchType, MachineInstr *>
+Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+  MachineBasicBlock *TrueBB = 0, *FalseBB = 0;
+  SmallVector<MachineInstr*, 2> BranchInstrs;
+  SmallVector<MachineOperand, 2> Cond;
+
+  MipsInstrInfo::BranchType R =
+    TII->AnalyzeBranch(MBB, TrueBB, FalseBB, Cond, false, BranchInstrs);
+
+  if ((R == MipsInstrInfo::BT_None) || (R == MipsInstrInfo::BT_NoBranch))
+    return std::make_pair(R, (MachineInstr*)NULL);
+
+  if (R != MipsInstrInfo::BT_CondUncond) {
+    if (!hasUnoccupiedSlot(BranchInstrs[0]))
+      return std::make_pair(MipsInstrInfo::BT_None, (MachineInstr*)NULL);
+
+    assert(((R != MipsInstrInfo::BT_Uncond) || (TrueBB == &Dst)));
+
+    return std::make_pair(R, BranchInstrs[0]);
+  }
+
+  assert((TrueBB == &Dst) || (FalseBB == &Dst));
+
+  // Examine the conditional branch. See if its slot is occupied.
+  if (hasUnoccupiedSlot(BranchInstrs[0]))
+    return std::make_pair(MipsInstrInfo::BT_Cond, BranchInstrs[0]);
+
+  // If that fails, try the unconditional branch.
+  if (hasUnoccupiedSlot(BranchInstrs[1]) && (FalseBB == &Dst))
+    return std::make_pair(MipsInstrInfo::BT_Uncond, BranchInstrs[1]);
+
+  return std::make_pair(MipsInstrInfo::BT_None, (MachineInstr*)NULL);
+}
+
+bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
+                         RegDefsUses &RegDU, bool &HasMultipleSuccs,
+                         BB2BrMap &BrMap) const {
+  std::pair<MipsInstrInfo::BranchType, MachineInstr *> P =
+    getBranch(Pred, Succ);
+
+  // Return if either getBranch wasn't able to analyze the branches or there
+  // were no branches with unoccupied slots.
+  if (P.first == MipsInstrInfo::BT_None)
+    return false;
+
+  if ((P.first != MipsInstrInfo::BT_Uncond) &&
+      (P.first != MipsInstrInfo::BT_NoBranch)) {
+    HasMultipleSuccs = true;
+    RegDU.addLiveOut(Pred, Succ);
+  }
+
+  BrMap[&Pred] = P.second;
+  return true;
+}
+
+bool Filler::delayHasHazard(const MachineInstr &Candidate, RegDefsUses &RegDU,
+                            InspectMemInstr &IM) const {
+  bool HasHazard = (Candidate.isImplicitDef() || Candidate.isKill());
+
+  HasHazard |= IM.hasHazard(Candidate);
+  HasHazard |= RegDU.update(Candidate, 0, Candidate.getNumOperands());
+
+  return HasHazard;
+}
+
+bool Filler::terminateSearch(const MachineInstr &Candidate) const {
+  return (Candidate.isTerminator() || Candidate.isCall() ||
+          Candidate.isLabel() || Candidate.isInlineAsm() ||
+          Candidate.hasUnmodeledSideEffects());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
new file mode 100644
index 000000000000..eb9d49fefb2f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
@@ -0,0 +1,134 @@
+//===-- MipsFrameLowering.cpp - Mips Frame Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsFrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+//
+// Stack Frame Processing methods
+// +----------------------------+
+//
+// The stack is allocated decrementing the stack pointer on
+// the first instruction of a function prologue. Once decremented,
+// all stack references are done thought a positive offset
+// from the stack/frame pointer, so the stack is considering
+// to grow up! Otherwise terrible hacks would have to be made
+// to get this stack ABI compliant :)
+//
+//  The stack frame required by the ABI (after call):
+//  Offset
+//
+//  0                 ----------
+//  4                 Args to pass
+//  .                 saved $GP  (used in PIC)
+//  .                 Alloca allocations
+//  .                 Local Area
+//  .                 CPU "Callee Saved" Registers
+//  .                 saved FP
+//  .                 saved RA
+//  .                 FPU "Callee Saved" Registers
+//  StackSize         -----------
+//
+// Offset - offset from sp after stack allocation on function prologue
+//
+// The sp is the stack pointer subtracted/added from the stack size
+// at the Prologue/Epilogue
+//
+// References to the previous stack (to obtain arguments) are done
+// with offsets that exceeds the stack size: (stacksize+(4*(num_arg-1))
+//
+// Examples:
+// - reference to the actual stack frame
+//   for any local area var there is smt like : FI >= 0, StackOffset: 4
+//     sw REGX, 4(SP)
+//
+// - reference to previous stack frame
+//   suppose there's a load to the 5th arguments : FI < 0, StackOffset: 16.
+//   The emitted instruction will be something like:
+//     lw REGX, 16+StackSize(SP)
+//
+// Since the total stack size is unknown on LowerFormalArguments, all
+// stack references (ObjectOffset) created to reference the function
+// arguments, are negative numbers. This way, on eliminateFrameIndex it's
+// possible to detect those references and the offsets are adjusted to
+// their real location.
+//
+//===----------------------------------------------------------------------===//
+
+const MipsFrameLowering *MipsFrameLowering::create(MipsTargetMachine &TM,
+                                                   const MipsSubtarget &ST) {
+  if (TM.getSubtargetImpl()->inMips16Mode())
+    return llvm::createMips16FrameLowering(ST);
+
+  return llvm::createMipsSEFrameLowering(ST);
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+      MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetRegisterInfo &TRI = *MF.getTarget().getRegisterInfo();
+
+  int64_t Offset = 0;
+
+  // Iterate over fixed sized objects.
+  for (int I = MFI->getObjectIndexBegin(); I != 0; ++I)
+    Offset = std::max(Offset, -MFI->getObjectOffset(I));
+
+  // Conservatively assume all callee-saved registers will be saved.
+  for (const uint16_t *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
+    unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize();
+    Offset = RoundUpToAlignment(Offset + Size, Size);
+  }
+
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // Check that MaxAlign is not zero if there is a stack object that is not a
+  // callee-saved spill.
+  assert(!MFI->getObjectIndexEnd() || MaxAlign);
+
+  // Iterate over other objects.
+  for (unsigned I = 0, E = MFI->getObjectIndexEnd(); I != E; ++I)
+    Offset = RoundUpToAlignment(Offset + MFI->getObjectSize(I), MaxAlign);
+
+  // Call frame.
+  if (MFI->adjustsStack() && hasReservedCallFrame(MF))
+    Offset = RoundUpToAlignment(Offset + MFI->getMaxCallFrameSize(),
+                                std::max(MaxAlign, getStackAlignment()));
+
+  return RoundUpToAlignment(Offset, getStackAlignment());
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
new file mode 100644
index 000000000000..6a5f79d0dfc4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.h
@@ -0,0 +1,47 @@
+//===-- MipsFrameLowering.h - Define frame lowering for Mips ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS_FRAMEINFO_H
+#define MIPS_FRAMEINFO_H
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class MipsSubtarget;
+
+class MipsFrameLowering : public TargetFrameLowering {
+protected:
+  const MipsSubtarget &STI;
+
+public:
+  explicit MipsFrameLowering(const MipsSubtarget &sti, unsigned Alignment)
+    : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) {}
+
+  static const MipsFrameLowering *create(MipsTargetMachine &TM,
+                                         const MipsSubtarget &ST);
+
+  bool hasFP(const MachineFunction &MF) const;
+
+protected:
+  uint64_t estimateStackSize(const MachineFunction &MF) const;
+};
+
+/// Create MipsFrameLowering objects.
+const MipsFrameLowering *createMips16FrameLowering(const MipsSubtarget &ST);
+const MipsFrameLowering *createMipsSEFrameLowering(const MipsSubtarget &ST);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
new file mode 100644
index 000000000000..77b08cb11e0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -0,0 +1,154 @@
+//===-- MipsISelDAGToDAG.cpp - A Dag to Dag Inst Selector for Mips --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-isel"
+#include "MipsISelDAGToDAG.h"
+#include "Mips16ISelDAGToDAG.h"
+#include "MipsSEISelDAGToDAG.h"
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+
+bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
+  bool Ret = SelectionDAGISel::runOnMachineFunction(MF);
+
+  processFunctionAfterISel(MF);
+
+  return Ret;
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// GOT address into a register.
+SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
+  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                        SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
+                                     SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                                    SDValue &Offset, SDValue &Alias) {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+/// Select instructions not customized! Used for
+/// expanded, promoted and normal instructions
+SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    return NULL;
+  }
+
+  // See if subclasses can handle this node.
+  std::pair<bool, SDNode*> Ret = selectNode(Node);
+
+  if (Ret.first)
+    return Ret.second;
+
+  switch(Opcode) {
+  default: break;
+
+  // Get target GOT address.
+  case ISD::GLOBAL_OFFSET_TABLE:
+    return getGlobalBaseReg();
+
+#ifndef NDEBUG
+  case ISD::LOAD:
+  case ISD::STORE:
+    assert(cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <=
+           cast<MemSDNode>(Node)->getAlignment() &&
+           "Unexpected unaligned loads/stores.");
+    break;
+#endif
+  }
+
+  // Select the default instruction
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(errs() << "=> ");
+  if (ResNode == NULL || ResNode == Node)
+    DEBUG(Node->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DEBUG(errs() << "\n");
+  return ResNode;
+}
+
+bool MipsDAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+  OutOps.push_back(Op);
+  return false;
+}
+
+/// createMipsISelDag - This pass converts a legalized DAG into a
+/// MIPS-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) {
+  if (TM.getSubtargetImpl()->inMips16Mode())
+    return llvm::createMips16ISelDag(TM);
+
+  return llvm::createMipsSEISelDag(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
new file mode 100644
index 000000000000..cf0f9c58aa9c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -0,0 +1,93 @@
+//===---- MipsISelDAGToDAG.h - A Dag to Dag Inst Selector for Mips --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSISELDAGTODAG_H
+#define MIPSISELDAGTODAG_H
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace llvm {
+
+class MipsDAGToDAGISel : public SelectionDAGISel {
+public:
+  explicit MipsDAGToDAGISel(MipsTargetMachine &TM)
+    : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<MipsSubtarget>()) {}
+
+  // Pass Name
+  virtual const char *getPassName() const {
+    return "MIPS DAG->DAG Pattern Instruction Selection";
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+protected:
+  SDNode *getGlobalBaseReg();
+
+  /// Keep a pointer to the MipsSubtarget around so that we can make the right
+  /// decision when generating code for different targets.
+  const MipsSubtarget &Subtarget;
+
+private:
+  // Include the pieces autogenerated from the target description.
+  #include "MipsGenDAGISel.inc"
+
+  // Complex Pattern.
+  /// (reg + imm).
+  virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
+  /// Fall back on this function if all else fails.
+  virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
+                                 SDValue &Offset) const;
+
+  /// Match integer address pattern.
+  virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
+                             SDValue &Offset) const;
+
+  virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                            SDValue &Offset, SDValue &Alias);
+
+  virtual SDNode *Select(SDNode *N);
+
+  virtual std::pair<bool, SDNode*> selectNode(SDNode *Node) = 0;
+
+  // getImm - Return a target constant with the specified value.
+  inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
+    return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
+  }
+
+  virtual void processFunctionAfterISel(MachineFunction &MF) = 0;
+
+  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                            char ConstraintCode,
+                                            std::vector<SDValue> &OutOps);
+};
+
+/// createMipsISelDag - This pass converts a legalized DAG into a
+/// MIPS-specific DAG, ready for instruction scheduling.
+FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
new file mode 100644
index 000000000000..e2219f257ecd
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -0,0 +1,3596 @@
+//===-- MipsISelLowering.cpp - Mips DAG Lowering Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "mips-lower"
+#include "MipsISelLowering.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "MipsTargetObjectFile.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<bool>
+LargeGOT("mxgot", cl::Hidden,
+         cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false));
+
+static const uint16_t O32IntRegs[4] = {
+  Mips::A0, Mips::A1, Mips::A2, Mips::A3
+};
+
+static const uint16_t Mips64IntRegs[8] = {
+  Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
+  Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64
+};
+
+static const uint16_t Mips64DPRegs[8] = {
+  Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
+  Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
+};
+
+// If I is a shifted mask, set the size (Size) and the first bit of the
+// mask (Pos), and return true.
+// For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
+static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
+  if (!isShiftedMask_64(I))
+     return false;
+
+  Size = CountPopulation_64(I);
+  Pos = CountTrailingZeros_64(I);
+  return true;
+}
+
+SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
+  MipsFunctionInfo *FI = DAG.getMachineFunction().getInfo<MipsFunctionInfo>();
+  return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
+}
+
+static SDValue getTargetNode(SDValue Op, SelectionDAG &DAG, unsigned Flag) {
+  EVT Ty = Op.getValueType();
+
+  if (GlobalAddressSDNode *N = dyn_cast<GlobalAddressSDNode>(Op))
+    return DAG.getTargetGlobalAddress(N->getGlobal(), Op.getDebugLoc(), Ty, 0,
+                                      Flag);
+  if (ExternalSymbolSDNode *N = dyn_cast<ExternalSymbolSDNode>(Op))
+    return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
+  if (BlockAddressSDNode *N = dyn_cast<BlockAddressSDNode>(Op))
+    return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
+  if (JumpTableSDNode *N = dyn_cast<JumpTableSDNode>(Op))
+    return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
+  if (ConstantPoolSDNode *N = dyn_cast<ConstantPoolSDNode>(Op))
+    return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+                                     N->getOffset(), Flag);
+
+  llvm_unreachable("Unexpected node type.");
+  return SDValue();
+}
+
+static SDValue getAddrNonPIC(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT Ty = Op.getValueType();
+  SDValue Hi = getTargetNode(Op, DAG, MipsII::MO_ABS_HI);
+  SDValue Lo = getTargetNode(Op, DAG, MipsII::MO_ABS_LO);
+  return DAG.getNode(ISD::ADD, DL, Ty,
+                     DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
+                     DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+}
+
+SDValue MipsTargetLowering::getAddrLocal(SDValue Op, SelectionDAG &DAG,
+                                         bool HasMips64) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT Ty = Op.getValueType();
+  unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
+  SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                            getTargetNode(Op, DAG, GOTFlag));
+  SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
+                             MachinePointerInfo::getGOT(), false, false, false,
+                             0);
+  unsigned LoFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
+  SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty, getTargetNode(Op, DAG, LoFlag));
+  return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
+}
+
+SDValue MipsTargetLowering::getAddrGlobal(SDValue Op, SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT Ty = Op.getValueType();
+  SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                            getTargetNode(Op, DAG, Flag));
+  return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Tgt,
+                     MachinePointerInfo::getGOT(), false, false, false, 0);
+}
+
+SDValue MipsTargetLowering::getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
+                                                  unsigned HiFlag,
+                                                  unsigned LoFlag) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT Ty = Op.getValueType();
+  SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(Op, DAG, HiFlag));
+  Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
+  SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
+                                getTargetNode(Op, DAG, LoFlag));
+  return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Wrapper,
+                     MachinePointerInfo::getGOT(), false, false, false, 0);
+}
+
+const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  case MipsISD::JmpLink:           return "MipsISD::JmpLink";
+  case MipsISD::TailCall:          return "MipsISD::TailCall";
+  case MipsISD::Hi:                return "MipsISD::Hi";
+  case MipsISD::Lo:                return "MipsISD::Lo";
+  case MipsISD::GPRel:             return "MipsISD::GPRel";
+  case MipsISD::ThreadPointer:     return "MipsISD::ThreadPointer";
+  case MipsISD::Ret:               return "MipsISD::Ret";
+  case MipsISD::EH_RETURN:         return "MipsISD::EH_RETURN";
+  case MipsISD::FPBrcond:          return "MipsISD::FPBrcond";
+  case MipsISD::FPCmp:             return "MipsISD::FPCmp";
+  case MipsISD::CMovFP_T:          return "MipsISD::CMovFP_T";
+  case MipsISD::CMovFP_F:          return "MipsISD::CMovFP_F";
+  case MipsISD::FPRound:           return "MipsISD::FPRound";
+  case MipsISD::ExtractLOHI:       return "MipsISD::ExtractLOHI";
+  case MipsISD::InsertLOHI:        return "MipsISD::InsertLOHI";
+  case MipsISD::Mult:              return "MipsISD::Mult";
+  case MipsISD::Multu:             return "MipsISD::Multu";
+  case MipsISD::MAdd:              return "MipsISD::MAdd";
+  case MipsISD::MAddu:             return "MipsISD::MAddu";
+  case MipsISD::MSub:              return "MipsISD::MSub";
+  case MipsISD::MSubu:             return "MipsISD::MSubu";
+  case MipsISD::DivRem:            return "MipsISD::DivRem";
+  case MipsISD::DivRemU:           return "MipsISD::DivRemU";
+  case MipsISD::DivRem16:          return "MipsISD::DivRem16";
+  case MipsISD::DivRemU16:         return "MipsISD::DivRemU16";
+  case MipsISD::BuildPairF64:      return "MipsISD::BuildPairF64";
+  case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64";
+  case MipsISD::Wrapper:           return "MipsISD::Wrapper";
+  case MipsISD::Sync:              return "MipsISD::Sync";
+  case MipsISD::Ext:               return "MipsISD::Ext";
+  case MipsISD::Ins:               return "MipsISD::Ins";
+  case MipsISD::LWL:               return "MipsISD::LWL";
+  case MipsISD::LWR:               return "MipsISD::LWR";
+  case MipsISD::SWL:               return "MipsISD::SWL";
+  case MipsISD::SWR:               return "MipsISD::SWR";
+  case MipsISD::LDL:               return "MipsISD::LDL";
+  case MipsISD::LDR:               return "MipsISD::LDR";
+  case MipsISD::SDL:               return "MipsISD::SDL";
+  case MipsISD::SDR:               return "MipsISD::SDR";
+  case MipsISD::EXTP:              return "MipsISD::EXTP";
+  case MipsISD::EXTPDP:            return "MipsISD::EXTPDP";
+  case MipsISD::EXTR_S_H:          return "MipsISD::EXTR_S_H";
+  case MipsISD::EXTR_W:            return "MipsISD::EXTR_W";
+  case MipsISD::EXTR_R_W:          return "MipsISD::EXTR_R_W";
+  case MipsISD::EXTR_RS_W:         return "MipsISD::EXTR_RS_W";
+  case MipsISD::SHILO:             return "MipsISD::SHILO";
+  case MipsISD::MTHLIP:            return "MipsISD::MTHLIP";
+  case MipsISD::MULT:              return "MipsISD::MULT";
+  case MipsISD::MULTU:             return "MipsISD::MULTU";
+  case MipsISD::MADD_DSP:          return "MipsISD::MADD_DSP";
+  case MipsISD::MADDU_DSP:         return "MipsISD::MADDU_DSP";
+  case MipsISD::MSUB_DSP:          return "MipsISD::MSUB_DSP";
+  case MipsISD::MSUBU_DSP:         return "MipsISD::MSUBU_DSP";
+  default:                         return NULL;
+  }
+}
+
+MipsTargetLowering::
+MipsTargetLowering(MipsTargetMachine &TM)
+  : TargetLowering(TM, new MipsTargetObjectFile()),
+    Subtarget(&TM.getSubtarget<MipsSubtarget>()),
+    HasMips64(Subtarget->hasMips64()), IsN64(Subtarget->isABI_N64()),
+    IsO32(Subtarget->isABI_O32()) {
+  // Mips does not have i1 type, so use i32 for
+  // setcc operations results (slt, sgt, ...).
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // Load extented operations for i1 types must be promoted
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+
+  // MIPS doesn't have extending float->double load/store
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // Used by legalize types to correctly generate the setcc result.
+  // Without this, every float setcc comes with a AND/OR with the result,
+  // we don't want this, since the fpcmp result goes to a flag register,
+  // which is used implicitly by brcond and select operations.
+  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
+
+  // Mips Custom Operations
+  setOperationAction(ISD::BR_JT,              MVT::Other, Custom);
+  setOperationAction(ISD::GlobalAddress,      MVT::i32,   Custom);
+  setOperationAction(ISD::BlockAddress,       MVT::i32,   Custom);
+  setOperationAction(ISD::GlobalTLSAddress,   MVT::i32,   Custom);
+  setOperationAction(ISD::JumpTable,          MVT::i32,   Custom);
+  setOperationAction(ISD::ConstantPool,       MVT::i32,   Custom);
+  setOperationAction(ISD::SELECT,             MVT::f32,   Custom);
+  setOperationAction(ISD::SELECT,             MVT::f64,   Custom);
+  setOperationAction(ISD::SELECT,             MVT::i32,   Custom);
+  setOperationAction(ISD::SELECT_CC,          MVT::f32,   Custom);
+  setOperationAction(ISD::SELECT_CC,          MVT::f64,   Custom);
+  setOperationAction(ISD::SETCC,              MVT::f32,   Custom);
+  setOperationAction(ISD::SETCC,              MVT::f64,   Custom);
+  setOperationAction(ISD::BRCOND,             MVT::Other, Custom);
+  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
+  setOperationAction(ISD::FCOPYSIGN,          MVT::f32,   Custom);
+  setOperationAction(ISD::FCOPYSIGN,          MVT::f64,   Custom);
+
+  if (!TM.Options.NoNaNsFPMath) {
+    setOperationAction(ISD::FABS,             MVT::f32,   Custom);
+    setOperationAction(ISD::FABS,             MVT::f64,   Custom);
+  }
+
+  if (HasMips64) {
+    setOperationAction(ISD::GlobalAddress,      MVT::i64,   Custom);
+    setOperationAction(ISD::BlockAddress,       MVT::i64,   Custom);
+    setOperationAction(ISD::GlobalTLSAddress,   MVT::i64,   Custom);
+    setOperationAction(ISD::JumpTable,          MVT::i64,   Custom);
+    setOperationAction(ISD::ConstantPool,       MVT::i64,   Custom);
+    setOperationAction(ISD::SELECT,             MVT::i64,   Custom);
+    setOperationAction(ISD::LOAD,               MVT::i64,   Custom);
+    setOperationAction(ISD::STORE,              MVT::i64,   Custom);
+  }
+
+  if (!HasMips64) {
+    setOperationAction(ISD::SHL_PARTS,          MVT::i32,   Custom);
+    setOperationAction(ISD::SRA_PARTS,          MVT::i32,   Custom);
+    setOperationAction(ISD::SRL_PARTS,          MVT::i32,   Custom);
+  }
+
+  setOperationAction(ISD::ADD,                MVT::i32,   Custom);
+  if (HasMips64)
+    setOperationAction(ISD::ADD,                MVT::i64,   Custom);
+
+  setOperationAction(ISD::SDIV, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIV, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIV, MVT::i64, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIV, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // Operations not directly supported by Mips.
+  setOperationAction(ISD::BR_CC,             MVT::f32,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::f64,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i32,   Expand);
+  setOperationAction(ISD::BR_CC,             MVT::i64,   Expand);
+  setOperationAction(ISD::SELECT_CC,         MVT::Other, Expand);
+  setOperationAction(ISD::UINT_TO_FP,        MVT::i32,   Expand);
+  setOperationAction(ISD::UINT_TO_FP,        MVT::i64,   Expand);
+  setOperationAction(ISD::FP_TO_UINT,        MVT::i32,   Expand);
+  setOperationAction(ISD::FP_TO_UINT,        MVT::i64,   Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,    Expand);
+  setOperationAction(ISD::CTPOP,             MVT::i32,   Expand);
+  setOperationAction(ISD::CTPOP,             MVT::i64,   Expand);
+  setOperationAction(ISD::CTTZ,              MVT::i32,   Expand);
+  setOperationAction(ISD::CTTZ,              MVT::i64,   Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::i32,   Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF,   MVT::i64,   Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF,   MVT::i32,   Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF,   MVT::i64,   Expand);
+  setOperationAction(ISD::ROTL,              MVT::i32,   Expand);
+  setOperationAction(ISD::ROTL,              MVT::i64,   Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,  Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64,  Expand);
+
+  if (!Subtarget->hasMips32r2())
+    setOperationAction(ISD::ROTR, MVT::i32,   Expand);
+
+  if (!Subtarget->hasMips64r2())
+    setOperationAction(ISD::ROTR, MVT::i64,   Expand);
+
+  setOperationAction(ISD::FSIN,              MVT::f32,   Expand);
+  setOperationAction(ISD::FSIN,              MVT::f64,   Expand);
+  setOperationAction(ISD::FCOS,              MVT::f32,   Expand);
+  setOperationAction(ISD::FCOS,              MVT::f64,   Expand);
+  setOperationAction(ISD::FSINCOS,           MVT::f32,   Expand);
+  setOperationAction(ISD::FSINCOS,           MVT::f64,   Expand);
+  setOperationAction(ISD::FPOWI,             MVT::f32,   Expand);
+  setOperationAction(ISD::FPOW,              MVT::f32,   Expand);
+  setOperationAction(ISD::FPOW,              MVT::f64,   Expand);
+  setOperationAction(ISD::FLOG,              MVT::f32,   Expand);
+  setOperationAction(ISD::FLOG2,             MVT::f32,   Expand);
+  setOperationAction(ISD::FLOG10,            MVT::f32,   Expand);
+  setOperationAction(ISD::FEXP,              MVT::f32,   Expand);
+  setOperationAction(ISD::FMA,               MVT::f32,   Expand);
+  setOperationAction(ISD::FMA,               MVT::f64,   Expand);
+  setOperationAction(ISD::FREM,              MVT::f32,   Expand);
+  setOperationAction(ISD::FREM,              MVT::f64,   Expand);
+
+  if (!TM.Options.NoNaNsFPMath) {
+    setOperationAction(ISD::FNEG,             MVT::f32,   Expand);
+    setOperationAction(ISD::FNEG,             MVT::f64,   Expand);
+  }
+
+  setOperationAction(ISD::EXCEPTIONADDR,     MVT::i32, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR,     MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION,       MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,       MVT::i64, Expand);
+
+  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
+
+  setOperationAction(ISD::VAARG,             MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,            MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,             MVT::Other, Expand);
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+
+  // Use the default for now
+  setOperationAction(ISD::STACKSAVE,         MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,      MVT::Other, Expand);
+
+  setOperationAction(ISD::ATOMIC_LOAD,       MVT::i32,    Expand);
+  setOperationAction(ISD::ATOMIC_LOAD,       MVT::i64,    Expand);
+  setOperationAction(ISD::ATOMIC_STORE,      MVT::i32,    Expand);
+  setOperationAction(ISD::ATOMIC_STORE,      MVT::i64,    Expand);
+
+  setInsertFencesForAtomic(true);
+
+  if (!Subtarget->hasSEInReg()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  }
+
+  if (!Subtarget->hasBitCount()) {
+    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+    setOperationAction(ISD::CTLZ, MVT::i64, Expand);
+  }
+
+  if (!Subtarget->hasSwap()) {
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+    setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+  }
+
+  if (HasMips64) {
+    setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom);
+    setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom);
+    setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom);
+    setTruncStoreAction(MVT::i64, MVT::i32, Custom);
+  }
+
+  setTargetDAGCombine(ISD::SDIVREM);
+  setTargetDAGCombine(ISD::UDIVREM);
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::ADD);
+
+  setMinFunctionAlignment(HasMips64 ? 3 : 2);
+
+  setStackPointerRegisterToSaveRestore(IsN64 ? Mips::SP_64 : Mips::SP);
+
+  setExceptionPointerRegister(IsN64 ? Mips::A0_64 : Mips::A0);
+  setExceptionSelectorRegister(IsN64 ? Mips::A1_64 : Mips::A1);
+
+  MaxStoresPerMemcpy = 16;
+}
+
+const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM) {
+  if (TM.getSubtargetImpl()->inMips16Mode())
+    return llvm::createMips16TargetLowering(TM);
+
+  return llvm::createMipsSETargetLowering(TM);
+}
+
+EVT MipsTargetLowering::getSetCCResultType(EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const MipsSubtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  EVT Ty = N->getValueType(0);
+  unsigned LO = (Ty == MVT::i32) ? Mips::LO : Mips::LO64;
+  unsigned HI = (Ty == MVT::i32) ? Mips::HI : Mips::HI64;
+  unsigned Opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem16 :
+                                                  MipsISD::DivRemU16;
+  DebugLoc DL = N->getDebugLoc();
+
+  SDValue DivRem = DAG.getNode(Opc, DL, MVT::Glue,
+                               N->getOperand(0), N->getOperand(1));
+  SDValue InChain = DAG.getEntryNode();
+  SDValue InGlue = DivRem;
+
+  // insert MFLO
+  if (N->hasAnyUseOfValue(0)) {
+    SDValue CopyFromLo = DAG.getCopyFromReg(InChain, DL, LO, Ty,
+                                            InGlue);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyFromLo);
+    InChain = CopyFromLo.getValue(1);
+    InGlue = CopyFromLo.getValue(2);
+  }
+
+  // insert MFHI
+  if (N->hasAnyUseOfValue(1)) {
+    SDValue CopyFromHi = DAG.getCopyFromReg(InChain, DL,
+                                            HI, Ty, InGlue);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi);
+  }
+
+  return SDValue();
+}
+
+static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown fp condition code!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: return Mips::FCOND_OEQ;
+  case ISD::SETUNE: return Mips::FCOND_UNE;
+  case ISD::SETLT:
+  case ISD::SETOLT: return Mips::FCOND_OLT;
+  case ISD::SETGT:
+  case ISD::SETOGT: return Mips::FCOND_OGT;
+  case ISD::SETLE:
+  case ISD::SETOLE: return Mips::FCOND_OLE;
+  case ISD::SETGE:
+  case ISD::SETOGE: return Mips::FCOND_OGE;
+  case ISD::SETULT: return Mips::FCOND_ULT;
+  case ISD::SETULE: return Mips::FCOND_ULE;
+  case ISD::SETUGT: return Mips::FCOND_UGT;
+  case ISD::SETUGE: return Mips::FCOND_UGE;
+  case ISD::SETUO:  return Mips::FCOND_UN;
+  case ISD::SETO:   return Mips::FCOND_OR;
+  case ISD::SETNE:
+  case ISD::SETONE: return Mips::FCOND_ONE;
+  case ISD::SETUEQ: return Mips::FCOND_UEQ;
+  }
+}
+
+
+/// This function returns true if the floating point conditional branches and
+/// conditional moves which use condition code CC should be inverted.
+static bool invertFPCondCodeUser(Mips::CondCode CC) {
+  if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT)
+    return false;
+
+  assert((CC >= Mips::FCOND_T && CC <= Mips::FCOND_GT) &&
+         "Illegal Condition Code");
+
+  return true;
+}
+
+// Creates and returns an FPCmp node from a setcc node.
+// Returns Op if setcc is not a floating point comparison.
+static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
+  // must be a SETCC node
+  if (Op.getOpcode() != ISD::SETCC)
+    return Op;
+
+  SDValue LHS = Op.getOperand(0);
+
+  if (!LHS.getValueType().isFloatingPoint())
+    return Op;
+
+  SDValue RHS = Op.getOperand(1);
+  DebugLoc DL = Op.getDebugLoc();
+
+  // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of
+  // node if necessary.
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  return DAG.getNode(MipsISD::FPCmp, DL, MVT::Glue, LHS, RHS,
+                     DAG.getConstant(FPCondCCodeToFCC(CC), MVT::i32));
+}
+
+// Creates and returns a CMovFPT/F node.
+static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
+                            SDValue False, DebugLoc DL) {
+  ConstantSDNode *CC = cast<ConstantSDNode>(Cond.getOperand(2));
+  bool invert = invertFPCondCodeUser((Mips::CondCode)CC->getSExtValue());
+
+  return DAG.getNode((invert ? MipsISD::CMovFP_F : MipsISD::CMovFP_T), DL,
+                     True.getValueType(), True, False, Cond);
+}
+
+static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const MipsSubtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue SetCC = N->getOperand(0);
+
+  if ((SetCC.getOpcode() != ISD::SETCC) ||
+      !SetCC.getOperand(0).getValueType().isInteger())
+    return SDValue();
+
+  SDValue False = N->getOperand(2);
+  EVT FalseTy = False.getValueType();
+
+  if (!FalseTy.isInteger())
+    return SDValue();
+
+  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(False);
+
+  if (!CN || CN->getZExtValue())
+    return SDValue();
+
+  const DebugLoc DL = N->getDebugLoc();
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+  SDValue True = N->getOperand(1);
+
+  SetCC = DAG.getSetCC(DL, SetCC.getValueType(), SetCC.getOperand(0),
+                       SetCC.getOperand(1), ISD::getSetCCInverse(CC, true));
+
+  return DAG.getNode(ISD::SELECT, DL, FalseTy, SetCC, False, True);
+}
+
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget *Subtarget) {
+  // Pattern match EXT.
+  //  $dst = and ((sra or srl) $src , pos), (2**size - 1)
+  //  => ext $dst, $src, size, pos
+  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+    return SDValue();
+
+  SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
+  unsigned ShiftRightOpc = ShiftRight.getOpcode();
+
+  // Op's first operand must be a shift right.
+  if (ShiftRightOpc != ISD::SRA && ShiftRightOpc != ISD::SRL)
+    return SDValue();
+
+  // The second operand of the shift must be an immediate.
+  ConstantSDNode *CN;
+  if (!(CN = dyn_cast<ConstantSDNode>(ShiftRight.getOperand(1))))
+    return SDValue();
+
+  uint64_t Pos = CN->getZExtValue();
+  uint64_t SMPos, SMSize;
+
+  // Op's second operand must be a shifted mask.
+  if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
+      !isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
+    return SDValue();
+
+  // Return if the shifted mask does not start at bit 0 or the sum of its size
+  // and Pos exceeds the word's size.
+  EVT ValTy = N->getValueType(0);
+  if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
+    return SDValue();
+
+  return DAG.getNode(MipsISD::Ext, N->getDebugLoc(), ValTy,
+                     ShiftRight.getOperand(0), DAG.getConstant(Pos, MVT::i32),
+                     DAG.getConstant(SMSize, MVT::i32));
+}
+
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget *Subtarget) {
+  // Pattern match INS.
+  //  $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
+  //  where mask1 = (2**size - 1) << pos, mask0 = ~mask1
+  //  => ins $dst, $src, size, pos, $src1
+  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+    return SDValue();
+
+  SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
+  uint64_t SMPos0, SMSize0, SMPos1, SMSize1;
+  ConstantSDNode *CN;
+
+  // See if Op's first operand matches (and $src1 , mask0).
+  if (And0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(And0.getOperand(1))) ||
+      !isShiftedMask(~CN->getSExtValue(), SMPos0, SMSize0))
+    return SDValue();
+
+  // See if Op's second operand matches (and (shl $src, pos), mask1).
+  if (And1.getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
+      !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
+    return SDValue();
+
+  // The shift masks must have the same position and size.
+  if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
+    return SDValue();
+
+  SDValue Shl = And1.getOperand(0);
+  if (Shl.getOpcode() != ISD::SHL)
+    return SDValue();
+
+  if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
+    return SDValue();
+
+  unsigned Shamt = CN->getZExtValue();
+
+  // Return if the shift amount and the first bit position of mask are not the
+  // same.
+  EVT ValTy = N->getValueType(0);
+  if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
+    return SDValue();
+
+  return DAG.getNode(MipsISD::Ins, N->getDebugLoc(), ValTy, Shl.getOperand(0),
+                     DAG.getConstant(SMPos0, MVT::i32),
+                     DAG.getConstant(SMSize0, MVT::i32), And0.getOperand(0));
+}
+
+static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget *Subtarget) {
+  // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
+
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue Add = N->getOperand(1);
+
+  if (Add.getOpcode() != ISD::ADD)
+    return SDValue();
+
+  SDValue Lo = Add.getOperand(1);
+
+  if ((Lo.getOpcode() != MipsISD::Lo) ||
+      (Lo.getOperand(0).getOpcode() != ISD::TargetJumpTable))
+    return SDValue();
+
+  EVT ValTy = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0),
+                             Add.getOperand(0));
+  return DAG.getNode(ISD::ADD, DL, ValTy, Add1, Lo);
+}
+
+SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
+  const {
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned Opc = N->getOpcode();
+
+  switch (Opc) {
+  default: break;
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:
+    return performDivRemCombine(N, DAG, DCI, Subtarget);
+  case ISD::SELECT:
+    return performSELECTCombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:
+    return performANDCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:
+    return performORCombine(N, DAG, DCI, Subtarget);
+  case ISD::ADD:
+    return performADDCombine(N, DAG, DCI, Subtarget);
+  }
+
+  return SDValue();
+}
+
+void
+MipsTargetLowering::LowerOperationWrapper(SDNode *N,
+                                          SmallVectorImpl<SDValue> &Results,
+                                          SelectionDAG &DAG) const {
+  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+  for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+    Results.push_back(Res.getValue(I));
+}
+
+void
+MipsTargetLowering::ReplaceNodeResults(SDNode *N,
+                                       SmallVectorImpl<SDValue> &Results,
+                                       SelectionDAG &DAG) const {
+  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+  for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+    Results.push_back(Res.getValue(I));
+}
+
+SDValue MipsTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const
+{
+  switch (Op.getOpcode())
+  {
+  case ISD::BR_JT:              return lowerBR_JT(Op, DAG);
+  case ISD::BRCOND:             return lowerBRCOND(Op, DAG);
+  case ISD::ConstantPool:       return lowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return lowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:       return lowerBlockAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return lowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable:          return lowerJumpTable(Op, DAG);
+  case ISD::SELECT:             return lowerSELECT(Op, DAG);
+  case ISD::SELECT_CC:          return lowerSELECT_CC(Op, DAG);
+  case ISD::SETCC:              return lowerSETCC(Op, DAG);
+  case ISD::VASTART:            return lowerVASTART(Op, DAG);
+  case ISD::FCOPYSIGN:          return lowerFCOPYSIGN(Op, DAG);
+  case ISD::FABS:               return lowerFABS(Op, DAG);
+  case ISD::FRAMEADDR:          return lowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:         return lowerRETURNADDR(Op, DAG);
+  case ISD::EH_RETURN:          return lowerEH_RETURN(Op, DAG);
+  case ISD::MEMBARRIER:         return lowerMEMBARRIER(Op, DAG);
+  case ISD::ATOMIC_FENCE:       return lowerATOMIC_FENCE(Op, DAG);
+  case ISD::SHL_PARTS:          return lowerShiftLeftParts(Op, DAG);
+  case ISD::SRA_PARTS:          return lowerShiftRightParts(Op, DAG, true);
+  case ISD::SRL_PARTS:          return lowerShiftRightParts(Op, DAG, false);
+  case ISD::LOAD:               return lowerLOAD(Op, DAG);
+  case ISD::STORE:              return lowerSTORE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:  return lowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::ADD:                return lowerADD(Op, DAG);
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Lower helper functions
+//===----------------------------------------------------------------------===//
+
+// addLiveIn - This helper function adds the specified physical register to the
+// MachineFunction as a live in value.  It also creates a corresponding
+// virtual register for it.
+static unsigned
+addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
+{
+  unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
+  MF.getRegInfo().addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instr type to insert");
+  case Mips::ATOMIC_LOAD_ADD_I8:
+  case Mips::ATOMIC_LOAD_ADD_I8_P8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I16:
+  case Mips::ATOMIC_LOAD_ADD_I16_P8:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I32:
+  case Mips::ATOMIC_LOAD_ADD_I32_P8:
+    return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
+  case Mips::ATOMIC_LOAD_ADD_I64:
+  case Mips::ATOMIC_LOAD_ADD_I64_P8:
+    return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
+
+  case Mips::ATOMIC_LOAD_AND_I8:
+  case Mips::ATOMIC_LOAD_AND_I8_P8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I16:
+  case Mips::ATOMIC_LOAD_AND_I16_P8:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I32:
+  case Mips::ATOMIC_LOAD_AND_I32_P8:
+    return emitAtomicBinary(MI, BB, 4, Mips::AND);
+  case Mips::ATOMIC_LOAD_AND_I64:
+  case Mips::ATOMIC_LOAD_AND_I64_P8:
+    return emitAtomicBinary(MI, BB, 8, Mips::AND64);
+
+  case Mips::ATOMIC_LOAD_OR_I8:
+  case Mips::ATOMIC_LOAD_OR_I8_P8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I16:
+  case Mips::ATOMIC_LOAD_OR_I16_P8:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I32:
+  case Mips::ATOMIC_LOAD_OR_I32_P8:
+    return emitAtomicBinary(MI, BB, 4, Mips::OR);
+  case Mips::ATOMIC_LOAD_OR_I64:
+  case Mips::ATOMIC_LOAD_OR_I64_P8:
+    return emitAtomicBinary(MI, BB, 8, Mips::OR64);
+
+  case Mips::ATOMIC_LOAD_XOR_I8:
+  case Mips::ATOMIC_LOAD_XOR_I8_P8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I16:
+  case Mips::ATOMIC_LOAD_XOR_I16_P8:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I32:
+  case Mips::ATOMIC_LOAD_XOR_I32_P8:
+    return emitAtomicBinary(MI, BB, 4, Mips::XOR);
+  case Mips::ATOMIC_LOAD_XOR_I64:
+  case Mips::ATOMIC_LOAD_XOR_I64_P8:
+    return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
+
+  case Mips::ATOMIC_LOAD_NAND_I8:
+  case Mips::ATOMIC_LOAD_NAND_I8_P8:
+    return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I16:
+  case Mips::ATOMIC_LOAD_NAND_I16_P8:
+    return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I32:
+  case Mips::ATOMIC_LOAD_NAND_I32_P8:
+    return emitAtomicBinary(MI, BB, 4, 0, true);
+  case Mips::ATOMIC_LOAD_NAND_I64:
+  case Mips::ATOMIC_LOAD_NAND_I64_P8:
+    return emitAtomicBinary(MI, BB, 8, 0, true);
+
+  case Mips::ATOMIC_LOAD_SUB_I8:
+  case Mips::ATOMIC_LOAD_SUB_I8_P8:
+    return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I16:
+  case Mips::ATOMIC_LOAD_SUB_I16_P8:
+    return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I32:
+  case Mips::ATOMIC_LOAD_SUB_I32_P8:
+    return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
+  case Mips::ATOMIC_LOAD_SUB_I64:
+  case Mips::ATOMIC_LOAD_SUB_I64_P8:
+    return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
+
+  case Mips::ATOMIC_SWAP_I8:
+  case Mips::ATOMIC_SWAP_I8_P8:
+    return emitAtomicBinaryPartword(MI, BB, 1, 0);
+  case Mips::ATOMIC_SWAP_I16:
+  case Mips::ATOMIC_SWAP_I16_P8:
+    return emitAtomicBinaryPartword(MI, BB, 2, 0);
+  case Mips::ATOMIC_SWAP_I32:
+  case Mips::ATOMIC_SWAP_I32_P8:
+    return emitAtomicBinary(MI, BB, 4, 0);
+  case Mips::ATOMIC_SWAP_I64:
+  case Mips::ATOMIC_SWAP_I64_P8:
+    return emitAtomicBinary(MI, BB, 8, 0);
+
+  case Mips::ATOMIC_CMP_SWAP_I8:
+  case Mips::ATOMIC_CMP_SWAP_I8_P8:
+    return emitAtomicCmpSwapPartword(MI, BB, 1);
+  case Mips::ATOMIC_CMP_SWAP_I16:
+  case Mips::ATOMIC_CMP_SWAP_I16_P8:
+    return emitAtomicCmpSwapPartword(MI, BB, 2);
+  case Mips::ATOMIC_CMP_SWAP_I32:
+  case Mips::ATOMIC_CMP_SWAP_I32_P8:
+    return emitAtomicCmpSwap(MI, BB, 4);
+  case Mips::ATOMIC_CMP_SWAP_I64:
+  case Mips::ATOMIC_CMP_SWAP_I64_P8:
+    return emitAtomicCmpSwap(MI, BB, 8);
+  }
+}
+
+// This function also handles Mips::ATOMIC_SWAP_I32 (when BinOpcode == 0), and
+// Mips::ATOMIC_LOAD_NAND_I32 (when Nand == true)
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                     unsigned Size, unsigned BinOpcode,
+                                     bool Nand) const {
+  assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicBinary.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned LL, SC, AND, NOR, ZERO, BEQ;
+
+  if (Size == 4) {
+    LL = IsN64 ? Mips::LL_P8 : Mips::LL;
+    SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+    AND = Mips::AND;
+    NOR = Mips::NOR;
+    ZERO = Mips::ZERO;
+    BEQ = Mips::BEQ;
+  }
+  else {
+    LL = IsN64 ? Mips::LLD_P8 : Mips::LLD;
+    SC = IsN64 ? Mips::SCD_P8 : Mips::SCD;
+    AND = Mips::AND64;
+    NOR = Mips::NOR64;
+    ZERO = Mips::ZERO_64;
+    BEQ = Mips::BEQ64;
+  }
+
+  unsigned OldVal = MI->getOperand(0).getReg();
+  unsigned Ptr = MI->getOperand(1).getReg();
+  unsigned Incr = MI->getOperand(2).getReg();
+
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned AndRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    ...
+  //    fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(exitMBB);
+
+  //  loopMBB:
+  //    ll oldval, 0(ptr)
+  //    <binop> storeval, oldval, incr
+  //    sc success, storeval, 0(ptr)
+  //    beq success, $0, loopMBB
+  BB = loopMBB;
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0);
+  if (Nand) {
+    //  and andres, oldval, incr
+    //  nor storeval, $0, andres
+    BuildMI(BB, DL, TII->get(AND), AndRes).addReg(OldVal).addReg(Incr);
+    BuildMI(BB, DL, TII->get(NOR), StoreVal).addReg(ZERO).addReg(AndRes);
+  } else if (BinOpcode) {
+    //  <binop> storeval, oldval, incr
+    BuildMI(BB, DL, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
+  } else {
+    StoreVal = Incr;
+  }
+  BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
+                                             MachineBasicBlock *BB,
+                                             unsigned Size, unsigned BinOpcode,
+                                             bool Nand) const {
+  assert((Size == 1 || Size == 2) &&
+      "Unsupported size for EmitAtomicBinaryPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
+  unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Ptr = MI->getOperand(1).getReg();
+  unsigned Incr = MI->getOperand(2).getReg();
+
+  unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned NewVal = RegInfo.createVirtualRegister(RC);
+  unsigned OldVal = RegInfo.createVirtualRegister(RC);
+  unsigned Incr2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+  unsigned AndRes = RegInfo.createVirtualRegister(RC);
+  unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+  unsigned SllRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loopMBB);
+  MF->insert(It, sinkMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  BB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(loopMBB);
+  loopMBB->addSuccessor(sinkMBB);
+  sinkMBB->addSuccessor(exitMBB);
+
+  //  thisMBB:
+  //    addiu   masklsb2,$0,-4                # 0xfffffffc
+  //    and     alignedaddr,ptr,masklsb2
+  //    andi    ptrlsb2,ptr,3
+  //    sll     shiftamt,ptrlsb2,3
+  //    ori     maskupper,$0,255               # 0xff
+  //    sll     mask,maskupper,shiftamt
+  //    nor     mask2,$0,mask
+  //    sll     incr2,incr,shiftamt
+
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, DL, TII->get(Mips::ADDiu), MaskLSB2)
+    .addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
+    .addReg(Ptr).addReg(MaskLSB2);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+  BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
+    .addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
+    .addReg(ShiftAmt).addReg(MaskUpper);
+  BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(ShiftAmt).addReg(Incr);
+
+  // atomic.load.binop
+  // loopMBB:
+  //   ll      oldval,0(alignedaddr)
+  //   binop   binopres,oldval,incr2
+  //   and     newval,binopres,mask
+  //   and     maskedoldval0,oldval,mask2
+  //   or      storeval,maskedoldval0,newval
+  //   sc      success,storeval,0(alignedaddr)
+  //   beq     success,$0,loopMBB
+
+  // atomic.swap
+  // loopMBB:
+  //   ll      oldval,0(alignedaddr)
+  //   and     newval,incr2,mask
+  //   and     maskedoldval0,oldval,mask2
+  //   or      storeval,maskedoldval0,newval
+  //   sc      success,storeval,0(alignedaddr)
+  //   beq     success,$0,loopMBB
+
+  BB = loopMBB;
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  if (Nand) {
+    //  and andres, oldval, incr2
+    //  nor binopres, $0, andres
+    //  and newval, binopres, mask
+    BuildMI(BB, DL, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, DL, TII->get(Mips::NOR), BinOpRes)
+      .addReg(Mips::ZERO).addReg(AndRes);
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+  } else if (BinOpcode) {
+    //  <binop> binopres, oldval, incr2
+    //  and newval, binopres, mask
+    BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+  } else {// atomic.swap
+    //  and newval, incr2, mask
+    BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
+  }
+
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
+    .addReg(OldVal).addReg(Mask2);
+  BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
+    .addReg(MaskedOldVal0).addReg(NewVal);
+  BuildMI(BB, DL, TII->get(SC), Success)
+    .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::BEQ))
+    .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
+
+  //  sinkMBB:
+  //    and     maskedoldval1,oldval,mask
+  //    srl     srlres,maskedoldval1,shiftamt
+  //    sll     sllres,srlres,24
+  //    sra     dest,sllres,24
+  BB = sinkMBB;
+  int64_t ShiftImm = (Size == 1) ? 24 : 16;
+
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
+    .addReg(OldVal).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
+      .addReg(ShiftAmt).addReg(MaskedOldVal1);
+  BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
+      .addReg(SrlRes).addImm(ShiftImm);
+  BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
+      .addReg(SllRes).addImm(ShiftImm);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
+                                      MachineBasicBlock *BB,
+                                      unsigned Size) const {
+  assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::getIntegerVT(Size * 8));
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned LL, SC, ZERO, BNE, BEQ;
+
+  if (Size == 4) {
+    LL = IsN64 ? Mips::LL_P8 : Mips::LL;
+    SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+    ZERO = Mips::ZERO;
+    BNE = Mips::BNE;
+    BEQ = Mips::BEQ;
+  }
+  else {
+    LL = IsN64 ? Mips::LLD_P8 : Mips::LLD;
+    SC = IsN64 ? Mips::SCD_P8 : Mips::SCD;
+    ZERO = Mips::ZERO_64;
+    BNE = Mips::BNE64;
+    BEQ = Mips::BEQ64;
+  }
+
+  unsigned Dest    = MI->getOperand(0).getReg();
+  unsigned Ptr     = MI->getOperand(1).getReg();
+  unsigned OldVal  = MI->getOperand(2).getReg();
+  unsigned NewVal  = MI->getOperand(3).getReg();
+
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  //  thisMBB:
+  //    ...
+  //    fallthrough --> loop1MBB
+  BB->addSuccessor(loop1MBB);
+  loop1MBB->addSuccessor(exitMBB);
+  loop1MBB->addSuccessor(loop2MBB);
+  loop2MBB->addSuccessor(loop1MBB);
+  loop2MBB->addSuccessor(exitMBB);
+
+  // loop1MBB:
+  //   ll dest, 0(ptr)
+  //   bne dest, oldval, exitMBB
+  BB = loop1MBB;
+  BuildMI(BB, DL, TII->get(LL), Dest).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BNE))
+    .addReg(Dest).addReg(OldVal).addMBB(exitMBB);
+
+  // loop2MBB:
+  //   sc success, newval, 0(ptr)
+  //   beq success, $0, loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, DL, TII->get(SC), Success)
+    .addReg(NewVal).addReg(Ptr).addImm(0);
+  BuildMI(BB, DL, TII->get(BEQ))
+    .addReg(Success).addReg(ZERO).addMBB(loop1MBB);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return exitMBB;
+}
+
+MachineBasicBlock *
+MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
+                                              MachineBasicBlock *BB,
+                                              unsigned Size) const {
+  assert((Size == 1 || Size == 2) &&
+      "Unsupported size for EmitAtomicCmpSwapPartial.");
+
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
+  unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+
+  unsigned Dest    = MI->getOperand(0).getReg();
+  unsigned Ptr     = MI->getOperand(1).getReg();
+  unsigned CmpVal  = MI->getOperand(2).getReg();
+  unsigned NewVal  = MI->getOperand(3).getReg();
+
+  unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
+  unsigned Mask = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2 = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
+  unsigned OldVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+  unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
+  unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+  unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+  unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+  unsigned SllRes = RegInfo.createVirtualRegister(RC);
+  unsigned Success = RegInfo.createVirtualRegister(RC);
+
+  // insert new blocks after the current block
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineFunction::iterator It = BB;
+  ++It;
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, sinkMBB);
+  MF->insert(It, exitMBB);
+
+  // Transfer the remainder of BB and its successor edges to exitMBB.
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  BB->addSuccessor(loop1MBB);
+  loop1MBB->addSuccessor(sinkMBB);
+  loop1MBB->addSuccessor(loop2MBB);
+  loop2MBB->addSuccessor(loop1MBB);
+  loop2MBB->addSuccessor(sinkMBB);
+  sinkMBB->addSuccessor(exitMBB);
+
+  // FIXME: computation of newval2 can be moved to loop2MBB.
+  //  thisMBB:
+  //    addiu   masklsb2,$0,-4                # 0xfffffffc
+  //    and     alignedaddr,ptr,masklsb2
+  //    andi    ptrlsb2,ptr,3
+  //    sll     shiftamt,ptrlsb2,3
+  //    ori     maskupper,$0,255               # 0xff
+  //    sll     mask,maskupper,shiftamt
+  //    nor     mask2,$0,mask
+  //    andi    maskedcmpval,cmpval,255
+  //    sll     shiftedcmpval,maskedcmpval,shiftamt
+  //    andi    maskednewval,newval,255
+  //    sll     shiftednewval,maskednewval,shiftamt
+  int64_t MaskImm = (Size == 1) ? 255 : 65535;
+  BuildMI(BB, DL, TII->get(Mips::ADDiu), MaskLSB2)
+    .addReg(Mips::ZERO).addImm(-4);
+  BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
+    .addReg(Ptr).addReg(MaskLSB2);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+  BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
+    .addReg(Mips::ZERO).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
+    .addReg(ShiftAmt).addReg(MaskUpper);
+  BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedCmpVal)
+    .addReg(CmpVal).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedCmpVal)
+    .addReg(ShiftAmt).addReg(MaskedCmpVal);
+  BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedNewVal)
+    .addReg(NewVal).addImm(MaskImm);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
+    .addReg(ShiftAmt).addReg(MaskedNewVal);
+
+  //  loop1MBB:
+  //    ll      oldval,0(alginedaddr)
+  //    and     maskedoldval0,oldval,mask
+  //    bne     maskedoldval0,shiftedcmpval,sinkMBB
+  BB = loop1MBB;
+  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
+    .addReg(OldVal).addReg(Mask);
+  BuildMI(BB, DL, TII->get(Mips::BNE))
+    .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
+
+  //  loop2MBB:
+  //    and     maskedoldval1,oldval,mask2
+  //    or      storeval,maskedoldval1,shiftednewval
+  //    sc      success,storeval,0(alignedaddr)
+  //    beq     success,$0,loop1MBB
+  BB = loop2MBB;
+  BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
+    .addReg(OldVal).addReg(Mask2);
+  BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
+    .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
+  BuildMI(BB, DL, TII->get(SC), Success)
+      .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::BEQ))
+      .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
+
+  //  sinkMBB:
+  //    srl     srlres,maskedoldval0,shiftamt
+  //    sll     sllres,srlres,24
+  //    sra     dest,sllres,24
+  BB = sinkMBB;
+  int64_t ShiftImm = (Size == 1) ? 24 : 16;
+
+  BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
+      .addReg(ShiftAmt).addReg(MaskedOldVal0);
+  BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
+      .addReg(SrlRes).addImm(ShiftImm);
+  BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
+      .addReg(SllRes).addImm(ShiftImm);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return exitMBB;
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  DebugLoc DL = Op.getDebugLoc();
+  EVT PTy = getPointerTy();
+  unsigned EntrySize =
+    DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(*getDataLayout());
+
+  Index = DAG.getNode(ISD::MUL, DL, PTy, Index,
+                      DAG.getConstant(EntrySize, PTy));
+  SDValue Addr = DAG.getNode(ISD::ADD, DL, PTy, Index, Table);
+
+  EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
+  Addr = DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr,
+                        MachinePointerInfo::getJumpTable(), MemVT, false, false,
+                        0);
+  Chain = Addr.getValue(1);
+
+  if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || IsN64) {
+    // For PIC, the sequence is:
+    // BRIND(load(Jumptable + index) + RelocBase)
+    // RelocBase can be JumpTable, GOT or some sort of global base.
+    Addr = DAG.getNode(ISD::ADD, DL, PTy, Addr,
+                       getPICJumpTableRelocBase(Table, DAG));
+  }
+
+  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Chain, Addr);
+}
+
+SDValue MipsTargetLowering::
+lowerBRCOND(SDValue Op, SelectionDAG &DAG) const
+{
+  // The first operand is the chain, the second is the condition, the third is
+  // the block to branch to if the condition is true.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Dest = Op.getOperand(2);
+  DebugLoc DL = Op.getDebugLoc();
+
+  SDValue CondRes = createFPCmp(DAG, Op.getOperand(1));
+
+  // Return if flag is not set by a floating point comparison.
+  if (CondRes.getOpcode() != MipsISD::FPCmp)
+    return Op;
+
+  SDValue CCNode  = CondRes.getOperand(2);
+  Mips::CondCode CC =
+    (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue();
+  unsigned Opc = invertFPCondCodeUser(CC) ? Mips::BRANCH_F : Mips::BRANCH_T;
+  SDValue BrCode = DAG.getConstant(Opc, MVT::i32);
+  return DAG.getNode(MipsISD::FPBrcond, DL, Op.getValueType(), Chain, BrCode,
+                     Dest, CondRes);
+}
+
+SDValue MipsTargetLowering::
+lowerSELECT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Cond = createFPCmp(DAG, Op.getOperand(0));
+
+  // Return if flag is not set by a floating point comparison.
+  if (Cond.getOpcode() != MipsISD::FPCmp)
+    return Op;
+
+  return createCMovFP(DAG, Cond, Op.getOperand(1), Op.getOperand(2),
+                      Op.getDebugLoc());
+}
+
+SDValue MipsTargetLowering::
+lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT Ty = Op.getOperand(0).getValueType();
+  SDValue Cond = DAG.getNode(ISD::SETCC, DL, getSetCCResultType(Ty),
+                             Op.getOperand(0), Op.getOperand(1),
+                             Op.getOperand(4));
+
+  return DAG.getNode(ISD::SELECT, DL, Op.getValueType(), Cond, Op.getOperand(2),
+                     Op.getOperand(3));
+}
+
+SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = createFPCmp(DAG, Op);
+
+  assert(Cond.getOpcode() == MipsISD::FPCmp &&
+         "Floating point operand expected.");
+
+  SDValue True  = DAG.getConstant(1, MVT::i32);
+  SDValue False = DAG.getConstant(0, MVT::i32);
+
+  return createCMovFP(DAG, Cond, True, False, Op.getDebugLoc());
+}
+
+SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // FIXME there isn't actually debug info here
+  DebugLoc DL = Op.getDebugLoc();
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
+    const MipsTargetObjectFile &TLOF =
+      (const MipsTargetObjectFile&)getObjFileLowering();
+
+    // %gp_rel relocation
+    if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
+      SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
+                                              MipsII::MO_GPREL);
+      SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, DL,
+                                      DAG.getVTList(MVT::i32), &GA, 1);
+      SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32);
+      return DAG.getNode(ISD::ADD, DL, MVT::i32, GPReg, GPRelNode);
+    }
+
+    // %hi/%lo relocation
+    return getAddrNonPIC(Op, DAG);
+  }
+
+  if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
+    return getAddrLocal(Op, DAG, HasMips64);
+
+  if (LargeGOT)
+    return getAddrGlobalLargeGOT(Op, DAG, MipsII::MO_GOT_HI16,
+                                 MipsII::MO_GOT_LO16);
+
+  return getAddrGlobal(Op, DAG,
+                       HasMips64 ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16);
+}
+
+SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
+    return getAddrNonPIC(Op, DAG);
+
+  return getAddrLocal(Op, DAG, HasMips64);
+}
+
+SDValue MipsTargetLowering::
+lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  // If the relocation model is PIC, use the General Dynamic TLS Model or
+  // Local Dynamic TLS model, otherwise use the Initial Exec or
+  // Local Exec TLS Model.
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  DebugLoc DL = GA->getDebugLoc();
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+    // General Dynamic and Local Dynamic TLS Model.
+    unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM
+                                                      : MipsII::MO_TLSGD;
+
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, Flag);
+    SDValue Argument = DAG.getNode(MipsISD::Wrapper, DL, PtrVT,
+                                   getGlobalReg(DAG, PtrVT), TGA);
+    unsigned PtrSize = PtrVT.getSizeInBits();
+    IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize);
+
+    SDValue TlsGetAddr = DAG.getExternalSymbol("__tls_get_addr", PtrVT);
+
+    ArgListTy Args;
+    ArgListEntry Entry;
+    Entry.Node = Argument;
+    Entry.Ty = PtrTy;
+    Args.push_back(Entry);
+
+    TargetLowering::CallLoweringInfo CLI(DAG.getEntryNode(), PtrTy,
+                  false, false, false, false, 0, CallingConv::C,
+                  /*IsTailCall=*/false, /*doesNotRet=*/false,
+                  /*isReturnValueUsed=*/true,
+                  TlsGetAddr, Args, DAG, DL);
+    std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+    SDValue Ret = CallResult.first;
+
+    if (model != TLSModel::LocalDynamic)
+      return Ret;
+
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_DTPREL_HI);
+    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_DTPREL_LO);
+    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+    SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Ret);
+    return DAG.getNode(ISD::ADD, DL, PtrVT, Add, Lo);
+  }
+
+  SDValue Offset;
+  if (model == TLSModel::InitialExec) {
+    // Initial Exec TLS Model
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                             MipsII::MO_GOTTPREL);
+    TGA = DAG.getNode(MipsISD::Wrapper, DL, PtrVT, getGlobalReg(DAG, PtrVT),
+                      TGA);
+    Offset = DAG.getLoad(PtrVT, DL,
+                         DAG.getEntryNode(), TGA, MachinePointerInfo(),
+                         false, false, false, 0);
+  } else {
+    // Local Exec TLS Model
+    assert(model == TLSModel::LocalExec);
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_TPREL_HI);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                               MipsII::MO_TPREL_LO);
+    SDValue Hi = DAG.getNode(MipsISD::Hi, DL, PtrVT, TGAHi);
+    SDValue Lo = DAG.getNode(MipsISD::Lo, DL, PtrVT, TGALo);
+    Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+  }
+
+  SDValue ThreadPointer = DAG.getNode(MipsISD::ThreadPointer, DL, PtrVT);
+  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue MipsTargetLowering::
+lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
+{
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
+    return getAddrNonPIC(Op, DAG);
+
+  return getAddrLocal(Op, DAG, HasMips64);
+}
+
+SDValue MipsTargetLowering::
+lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+  // gp_rel relocation
+  // FIXME: we should reference the constant pool using small data sections,
+  // but the asm printer currently doesn't support this feature without
+  // hacking it. This feature should come soon so we can uncomment the
+  // stuff below.
+  //if (IsInSmallSection(C->getType())) {
+  //  SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, MVT::i32, CP);
+  //  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
+  //  ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode);
+
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
+    return getAddrNonPIC(Op, DAG);
+
+  return getAddrLocal(Op, DAG, HasMips64);
+}
+
+SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
+
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                 getPointerTy());
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
+}
+
+static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+  EVT TyX = Op.getOperand(0).getValueType();
+  EVT TyY = Op.getOperand(1).getValueType();
+  SDValue Const1 = DAG.getConstant(1, MVT::i32);
+  SDValue Const31 = DAG.getConstant(31, MVT::i32);
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue Res;
+
+  // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
+  // to i32.
+  SDValue X = (TyX == MVT::f32) ?
+    DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) :
+    DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0),
+                Const1);
+  SDValue Y = (TyY == MVT::f32) ?
+    DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(1)) :
+    DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(1),
+                Const1);
+
+  if (HasR2) {
+    // ext  E, Y, 31, 1  ; extract bit31 of Y
+    // ins  X, E, 31, 1  ; insert extracted bit at bit31 of X
+    SDValue E = DAG.getNode(MipsISD::Ext, DL, MVT::i32, Y, Const31, Const1);
+    Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32, E, Const31, Const1, X);
+  } else {
+    // sll SllX, X, 1
+    // srl SrlX, SllX, 1
+    // srl SrlY, Y, 31
+    // sll SllY, SrlX, 31
+    // or  Or, SrlX, SllY
+    SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1);
+    SDValue SrlX = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1);
+    SDValue SrlY = DAG.getNode(ISD::SRL, DL, MVT::i32, Y, Const31);
+    SDValue SllY = DAG.getNode(ISD::SHL, DL, MVT::i32, SrlY, Const31);
+    Res = DAG.getNode(ISD::OR, DL, MVT::i32, SrlX, SllY);
+  }
+
+  if (TyX == MVT::f32)
+    return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Res);
+
+  SDValue LowX = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                             Op.getOperand(0), DAG.getConstant(0, MVT::i32));
+  return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
+}
+
+static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+  unsigned WidthX = Op.getOperand(0).getValueSizeInBits();
+  unsigned WidthY = Op.getOperand(1).getValueSizeInBits();
+  EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY);
+  SDValue Const1 = DAG.getConstant(1, MVT::i32);
+  DebugLoc DL = Op.getDebugLoc();
+
+  // Bitcast to integer nodes.
+  SDValue X = DAG.getNode(ISD::BITCAST, DL, TyX, Op.getOperand(0));
+  SDValue Y = DAG.getNode(ISD::BITCAST, DL, TyY, Op.getOperand(1));
+
+  if (HasR2) {
+    // ext  E, Y, width(Y) - 1, 1  ; extract bit width(Y)-1 of Y
+    // ins  X, E, width(X) - 1, 1  ; insert extracted bit at bit width(X)-1 of X
+    SDValue E = DAG.getNode(MipsISD::Ext, DL, TyY, Y,
+                            DAG.getConstant(WidthY - 1, MVT::i32), Const1);
+
+    if (WidthX > WidthY)
+      E = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, E);
+    else if (WidthY > WidthX)
+      E = DAG.getNode(ISD::TRUNCATE, DL, TyX, E);
+
+    SDValue I = DAG.getNode(MipsISD::Ins, DL, TyX, E,
+                            DAG.getConstant(WidthX - 1, MVT::i32), Const1, X);
+    return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), I);
+  }
+
+  // (d)sll SllX, X, 1
+  // (d)srl SrlX, SllX, 1
+  // (d)srl SrlY, Y, width(Y)-1
+  // (d)sll SllY, SrlX, width(Y)-1
+  // or     Or, SrlX, SllY
+  SDValue SllX = DAG.getNode(ISD::SHL, DL, TyX, X, Const1);
+  SDValue SrlX = DAG.getNode(ISD::SRL, DL, TyX, SllX, Const1);
+  SDValue SrlY = DAG.getNode(ISD::SRL, DL, TyY, Y,
+                             DAG.getConstant(WidthY - 1, MVT::i32));
+
+  if (WidthX > WidthY)
+    SrlY = DAG.getNode(ISD::ZERO_EXTEND, DL, TyX, SrlY);
+  else if (WidthY > WidthX)
+    SrlY = DAG.getNode(ISD::TRUNCATE, DL, TyX, SrlY);
+
+  SDValue SllY = DAG.getNode(ISD::SHL, DL, TyX, SrlY,
+                             DAG.getConstant(WidthX - 1, MVT::i32));
+  SDValue Or = DAG.getNode(ISD::OR, DL, TyX, SrlX, SllY);
+  return DAG.getNode(ISD::BITCAST, DL, Op.getOperand(0).getValueType(), Or);
+}
+
+SDValue
+MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  if (Subtarget->hasMips64())
+    return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasMips32r2());
+
+  return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasMips32r2());
+}
+
+static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+  SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
+  DebugLoc DL = Op.getDebugLoc();
+
+  // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
+  // to i32.
+  SDValue X = (Op.getValueType() == MVT::f32) ?
+    DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) :
+    DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0),
+                Const1);
+
+  // Clear MSB.
+  if (HasR2)
+    Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32,
+                      DAG.getRegister(Mips::ZERO, MVT::i32),
+                      DAG.getConstant(31, MVT::i32), Const1, X);
+  else {
+    SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1);
+    Res = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1);
+  }
+
+  if (Op.getValueType() == MVT::f32)
+    return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Res);
+
+  SDValue LowX = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                             Op.getOperand(0), DAG.getConstant(0, MVT::i32));
+  return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
+}
+
+static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+  SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
+  DebugLoc DL = Op.getDebugLoc();
+
+  // Bitcast to integer node.
+  SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0));
+
+  // Clear MSB.
+  if (HasR2)
+    Res = DAG.getNode(MipsISD::Ins, DL, MVT::i64,
+                      DAG.getRegister(Mips::ZERO_64, MVT::i64),
+                      DAG.getConstant(63, MVT::i32), Const1, X);
+  else {
+    SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i64, X, Const1);
+    Res = DAG.getNode(ISD::SRL, DL, MVT::i64, SllX, Const1);
+  }
+
+  return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Res);
+}
+
+SDValue
+MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const {
+  if (Subtarget->hasMips64() && (Op.getValueType() == MVT::f64))
+    return lowerFABS64(Op, DAG, Subtarget->hasMips32r2());
+
+  return lowerFABS32(Op, DAG, Subtarget->hasMips32r2());
+}
+
+SDValue MipsTargetLowering::
+lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  // check the depth
+  assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+         "Frame address can only be determined for current frame.");
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+  EVT VT = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
+                                         IsN64 ? Mips::FP_64 : Mips::FP, VT);
+  return FrameAddr;
+}
+
+SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // check the depth
+  assert((cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() == 0) &&
+         "Return address can be determined only for current frame.");
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MVT VT = Op.getSimpleValueType();
+  unsigned RA = IsN64 ? Mips::RA_64 : Mips::RA;
+  MFI->setReturnAddressIsTaken(true);
+
+  // Return RA, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), Op.getDebugLoc(), Reg, VT);
+}
+
+// An EH_RETURN is the result of lowering llvm.eh.return which in turn is
+// generated from __builtin_eh_return (offset, handler)
+// The effect of this is to adjust the stack pointer by "offset"
+// and then branch to "handler".
+SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
+                                                                     const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  MipsFI->setCallsEhReturn();
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  DebugLoc DL       = Op.getDebugLoc();
+  EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
+
+  // Store stack offset in V1, store jump target in V0. Glue CopyToReg and
+  // EH_RETURN nodes, so that instructions are emitted back-to-back.
+  unsigned OffsetReg = IsN64 ? Mips::V1_64 : Mips::V1;
+  unsigned AddrReg = IsN64 ? Mips::V0_64 : Mips::V0;
+  Chain = DAG.getCopyToReg(Chain, DL, OffsetReg, Offset, SDValue());
+  Chain = DAG.getCopyToReg(Chain, DL, AddrReg, Handler, Chain.getValue(1));
+  return DAG.getNode(MipsISD::EH_RETURN, DL, MVT::Other, Chain,
+                     DAG.getRegister(OffsetReg, Ty),
+                     DAG.getRegister(AddrReg, getPointerTy()),
+                     Chain.getValue(1));
+}
+
+// TODO: set SType according to the desired memory barrier behavior.
+SDValue
+MipsTargetLowering::lowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const {
+  unsigned SType = 0;
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(MipsISD::Sync, DL, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(SType, MVT::i32));
+}
+
+SDValue MipsTargetLowering::lowerATOMIC_FENCE(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  // FIXME: Need pseudo-fence for 'singlethread' fences
+  // FIXME: Set SType for weaker fences where supported/appropriate.
+  unsigned SType = 0;
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(MipsISD::Sync, DL, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(SType, MVT::i32));
+}
+
+SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+  SDValue Shamt = Op.getOperand(2);
+
+  // if shamt < 32:
+  //  lo = (shl lo, shamt)
+  //  hi = (or (shl hi, shamt) (srl (srl lo, 1), ~shamt))
+  // else:
+  //  lo = 0
+  //  hi = (shl lo, shamt[4:0])
+  SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+                            DAG.getConstant(-1, MVT::i32));
+  SDValue ShiftRight1Lo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo,
+                                      DAG.getConstant(1, MVT::i32));
+  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, ShiftRight1Lo,
+                                     Not);
+  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, Shamt);
+  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo);
+  SDValue ShiftLeftLo = DAG.getNode(ISD::SHL, DL, MVT::i32, Lo, Shamt);
+  SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
+                             DAG.getConstant(0x20, MVT::i32));
+  Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond,
+                   DAG.getConstant(0, MVT::i32), ShiftLeftLo);
+  Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftLeftLo, Or);
+
+  SDValue Ops[2] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
+SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
+                                                 bool IsSRA) const {
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
+  SDValue Shamt = Op.getOperand(2);
+
+  // if shamt < 32:
+  //  lo = (or (shl (shl hi, 1), ~shamt) (srl lo, shamt))
+  //  if isSRA:
+  //    hi = (sra hi, shamt)
+  //  else:
+  //    hi = (srl hi, shamt)
+  // else:
+  //  if isSRA:
+  //   lo = (sra hi, shamt[4:0])
+  //   hi = (sra hi, 31)
+  //  else:
+  //   lo = (srl hi, shamt[4:0])
+  //   hi = 0
+  SDValue Not = DAG.getNode(ISD::XOR, DL, MVT::i32, Shamt,
+                            DAG.getConstant(-1, MVT::i32));
+  SDValue ShiftLeft1Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
+                                     DAG.getConstant(1, MVT::i32));
+  SDValue ShiftLeftHi = DAG.getNode(ISD::SHL, DL, MVT::i32, ShiftLeft1Hi, Not);
+  SDValue ShiftRightLo = DAG.getNode(ISD::SRL, DL, MVT::i32, Lo, Shamt);
+  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, ShiftLeftHi, ShiftRightLo);
+  SDValue ShiftRightHi = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, DL, MVT::i32,
+                                     Hi, Shamt);
+  SDValue Cond = DAG.getNode(ISD::AND, DL, MVT::i32, Shamt,
+                             DAG.getConstant(0x20, MVT::i32));
+  SDValue Shift31 = DAG.getNode(ISD::SRA, DL, MVT::i32, Hi,
+                                DAG.getConstant(31, MVT::i32));
+  Lo = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftRightHi, Or);
+  Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond,
+                   IsSRA ? Shift31 : DAG.getConstant(0, MVT::i32),
+                   ShiftRightHi);
+
+  SDValue Ops[2] = {Lo, Hi};
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
+static SDValue CreateLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
+                            SDValue Chain, SDValue Src, unsigned Offset) {
+  SDValue Ptr = LD->getBasePtr();
+  EVT VT = LD->getValueType(0), MemVT = LD->getMemoryVT();
+  EVT BasePtrVT = Ptr.getValueType();
+  DebugLoc DL = LD->getDebugLoc();
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+
+  if (Offset)
+    Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+                      DAG.getConstant(Offset, BasePtrVT));
+
+  SDValue Ops[] = { Chain, Ptr, Src };
+  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT,
+                                 LD->getMemOperand());
+}
+
+// Expand an unaligned 32 or 64-bit integer load node.
+SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+  EVT MemVT = LD->getMemoryVT();
+
+  // Return if load is aligned or if MemVT is neither i32 nor i64.
+  if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
+      ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
+    return SDValue();
+
+  bool IsLittle = Subtarget->isLittle();
+  EVT VT = Op.getValueType();
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  SDValue Chain = LD->getChain(), Undef = DAG.getUNDEF(VT);
+
+  assert((VT == MVT::i32) || (VT == MVT::i64));
+
+  // Expand
+  //  (set dst, (i64 (load baseptr)))
+  // to
+  //  (set tmp, (ldl (add baseptr, 7), undef))
+  //  (set dst, (ldr baseptr, tmp))
+  if ((VT == MVT::i64) && (ExtType == ISD::NON_EXTLOAD)) {
+    SDValue LDL = CreateLoadLR(MipsISD::LDL, DAG, LD, Chain, Undef,
+                               IsLittle ? 7 : 0);
+    return CreateLoadLR(MipsISD::LDR, DAG, LD, LDL.getValue(1), LDL,
+                        IsLittle ? 0 : 7);
+  }
+
+  SDValue LWL = CreateLoadLR(MipsISD::LWL, DAG, LD, Chain, Undef,
+                             IsLittle ? 3 : 0);
+  SDValue LWR = CreateLoadLR(MipsISD::LWR, DAG, LD, LWL.getValue(1), LWL,
+                             IsLittle ? 0 : 3);
+
+  // Expand
+  //  (set dst, (i32 (load baseptr))) or
+  //  (set dst, (i64 (sextload baseptr))) or
+  //  (set dst, (i64 (extload baseptr)))
+  // to
+  //  (set tmp, (lwl (add baseptr, 3), undef))
+  //  (set dst, (lwr baseptr, tmp))
+  if ((VT == MVT::i32) || (ExtType == ISD::SEXTLOAD) ||
+      (ExtType == ISD::EXTLOAD))
+    return LWR;
+
+  assert((VT == MVT::i64) && (ExtType == ISD::ZEXTLOAD));
+
+  // Expand
+  //  (set dst, (i64 (zextload baseptr)))
+  // to
+  //  (set tmp0, (lwl (add baseptr, 3), undef))
+  //  (set tmp1, (lwr baseptr, tmp0))
+  //  (set tmp2, (shl tmp1, 32))
+  //  (set dst, (srl tmp2, 32))
+  DebugLoc DL = LD->getDebugLoc();
+  SDValue Const32 = DAG.getConstant(32, MVT::i32);
+  SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32);
+  SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32);
+  SDValue Ops[] = { SRL, LWR.getValue(1) };
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
+static SDValue CreateStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
+                             SDValue Chain, unsigned Offset) {
+  SDValue Ptr = SD->getBasePtr(), Value = SD->getValue();
+  EVT MemVT = SD->getMemoryVT(), BasePtrVT = Ptr.getValueType();
+  DebugLoc DL = SD->getDebugLoc();
+  SDVTList VTList = DAG.getVTList(MVT::Other);
+
+  if (Offset)
+    Ptr = DAG.getNode(ISD::ADD, DL, BasePtrVT, Ptr,
+                      DAG.getConstant(Offset, BasePtrVT));
+
+  SDValue Ops[] = { Chain, Value, Ptr };
+  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT,
+                                 SD->getMemOperand());
+}
+
+// Expand an unaligned 32 or 64-bit integer store node.
+SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode *SD = cast<StoreSDNode>(Op);
+  EVT MemVT = SD->getMemoryVT();
+
+  // Return if store is aligned or if MemVT is neither i32 nor i64.
+  if ((SD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
+      ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
+    return SDValue();
+
+  bool IsLittle = Subtarget->isLittle();
+  SDValue Value = SD->getValue(), Chain = SD->getChain();
+  EVT VT = Value.getValueType();
+
+  // Expand
+  //  (store val, baseptr) or
+  //  (truncstore val, baseptr)
+  // to
+  //  (swl val, (add baseptr, 3))
+  //  (swr val, baseptr)
+  if ((VT == MVT::i32) || SD->isTruncatingStore()) {
+    SDValue SWL = CreateStoreLR(MipsISD::SWL, DAG, SD, Chain,
+                                IsLittle ? 3 : 0);
+    return CreateStoreLR(MipsISD::SWR, DAG, SD, SWL, IsLittle ? 0 : 3);
+  }
+
+  assert(VT == MVT::i64);
+
+  // Expand
+  //  (store val, baseptr)
+  // to
+  //  (sdl val, (add baseptr, 7))
+  //  (sdr val, baseptr)
+  SDValue SDL = CreateStoreLR(MipsISD::SDL, DAG, SD, Chain, IsLittle ? 7 : 0);
+  return CreateStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7);
+}
+
+static SDValue initAccumulator(SDValue In, DebugLoc DL, SelectionDAG &DAG) {
+  SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
+                             DAG.getConstant(0, MVT::i32));
+  SDValue InHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
+                             DAG.getConstant(1, MVT::i32));
+  return DAG.getNode(MipsISD::InsertLOHI, DL, MVT::Untyped, InLo, InHi);
+}
+
+static SDValue extractLOHI(SDValue Op, DebugLoc DL, SelectionDAG &DAG) {
+  SDValue Lo = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
+                           DAG.getConstant(Mips::sub_lo, MVT::i32));
+  SDValue Hi = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
+                           DAG.getConstant(Mips::sub_hi, MVT::i32));
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
+}
+
+// This function expands mips intrinsic nodes which have 64-bit input operands
+// or output values.
+//
+// out64 = intrinsic-node in64
+// =>
+// lo = copy (extract-element (in64, 0))
+// hi = copy (extract-element (in64, 1))
+// mips-specific-node
+// v0 = copy lo
+// v1 = copy hi
+// out64 = merge-values (v0, v1)
+//
+static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+  DebugLoc DL = Op.getDebugLoc();
+  bool HasChainIn = Op->getOperand(0).getValueType() == MVT::Other;
+  SmallVector<SDValue, 3> Ops;
+  unsigned OpNo = 0;
+
+  // See if Op has a chain input.
+  if (HasChainIn)
+    Ops.push_back(Op->getOperand(OpNo++));
+
+  // The next operand is the intrinsic opcode.
+  assert(Op->getOperand(OpNo).getOpcode() == ISD::TargetConstant);
+
+  // See if the next operand has type i64.
+  SDValue Opnd = Op->getOperand(++OpNo), In64;
+
+  if (Opnd.getValueType() == MVT::i64)
+    In64 = initAccumulator(Opnd, DL, DAG);
+  else
+    Ops.push_back(Opnd);
+
+  // Push the remaining operands.
+  for (++OpNo ; OpNo < Op->getNumOperands(); ++OpNo)
+    Ops.push_back(Op->getOperand(OpNo));
+
+  // Add In64 to the end of the list.
+  if (In64.getNode())
+    Ops.push_back(In64);
+
+  // Scan output.
+  SmallVector<EVT, 2> ResTys;
+
+  for (SDNode::value_iterator I = Op->value_begin(), E = Op->value_end();
+       I != E; ++I)
+    ResTys.push_back((*I == MVT::i64) ? MVT::Untyped : *I);
+
+  // Create node.
+  SDValue Val = DAG.getNode(Opc, DL, ResTys, &Ops[0], Ops.size());
+  SDValue Out = (ResTys[0] == MVT::Untyped) ? extractLOHI(Val, DL, DAG) : Val;
+
+  if (!HasChainIn)
+    return Out;
+
+  assert(Val->getValueType(1) == MVT::Other);
+  SDValue Vals[] = { Out, SDValue(Val.getNode(), 1) };
+  return DAG.getMergeValues(Vals, 2, DL);
+}
+
+SDValue MipsTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_shilo:
+    return lowerDSPIntr(Op, DAG, MipsISD::SHILO);
+  case Intrinsic::mips_dpau_h_qbl:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBL);
+  case Intrinsic::mips_dpau_h_qbr:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBR);
+  case Intrinsic::mips_dpsu_h_qbl:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBL);
+  case Intrinsic::mips_dpsu_h_qbr:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBR);
+  case Intrinsic::mips_dpa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPA_W_PH);
+  case Intrinsic::mips_dps_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPS_W_PH);
+  case Intrinsic::mips_dpax_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAX_W_PH);
+  case Intrinsic::mips_dpsx_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSX_W_PH);
+  case Intrinsic::mips_mulsa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::MULSA_W_PH);
+  case Intrinsic::mips_mult:
+    return lowerDSPIntr(Op, DAG, MipsISD::Mult);
+  case Intrinsic::mips_multu:
+    return lowerDSPIntr(Op, DAG, MipsISD::Multu);
+  case Intrinsic::mips_madd:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAdd);
+  case Intrinsic::mips_maddu:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAddu);
+  case Intrinsic::mips_msub:
+    return lowerDSPIntr(Op, DAG, MipsISD::MSub);
+  case Intrinsic::mips_msubu:
+    return lowerDSPIntr(Op, DAG, MipsISD::MSubu);
+  }
+}
+
+SDValue MipsTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  switch (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue()) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_extp:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTP);
+  case Intrinsic::mips_extpdp:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTPDP);
+  case Intrinsic::mips_extr_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_W);
+  case Intrinsic::mips_extr_r_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_R_W);
+  case Intrinsic::mips_extr_rs_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_RS_W);
+  case Intrinsic::mips_extr_s_h:
+    return lowerDSPIntr(Op, DAG, MipsISD::EXTR_S_H);
+  case Intrinsic::mips_mthlip:
+    return lowerDSPIntr(Op, DAG, MipsISD::MTHLIP);
+  case Intrinsic::mips_mulsaq_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::MULSAQ_S_W_PH);
+  case Intrinsic::mips_maq_s_w_phl:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHL);
+  case Intrinsic::mips_maq_s_w_phr:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHR);
+  case Intrinsic::mips_maq_sa_w_phl:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHL);
+  case Intrinsic::mips_maq_sa_w_phr:
+    return lowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHR);
+  case Intrinsic::mips_dpaq_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQ_S_W_PH);
+  case Intrinsic::mips_dpsq_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQ_S_W_PH);
+  case Intrinsic::mips_dpaq_sa_l_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQ_SA_L_W);
+  case Intrinsic::mips_dpsq_sa_l_w:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQ_SA_L_W);
+  case Intrinsic::mips_dpaqx_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQX_S_W_PH);
+  case Intrinsic::mips_dpaqx_sa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPAQX_SA_W_PH);
+  case Intrinsic::mips_dpsqx_s_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_S_W_PH);
+  case Intrinsic::mips_dpsqx_sa_w_ph:
+    return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_SA_W_PH);
+  }
+}
+
+SDValue MipsTargetLowering::lowerADD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op->getOperand(0).getOpcode() != ISD::FRAMEADDR
+      || cast<ConstantSDNode>
+        (Op->getOperand(0).getOperand(0))->getZExtValue() != 0
+      || Op->getOperand(1).getOpcode() != ISD::FRAME_TO_ARGS_OFFSET)
+    return SDValue();
+
+  // The pattern
+  //   (add (frameaddr 0), (frame_to_args_offset))
+  // results from lowering llvm.eh.dwarf.cfa intrinsic. Transform it to
+  //   (add FrameObject, 0)
+  // where FrameObject is a fixed StackObject with offset 0 which points to
+  // the old stack pointer.
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  EVT ValTy = Op->getValueType(0);
+  int FI = MFI->CreateFixedObject(Op.getValueSizeInBits() / 8, 0, false);
+  SDValue InArgsAddr = DAG.getFrameIndex(FI, ValTy);
+  return DAG.getNode(ISD::ADD, Op->getDebugLoc(), ValTy, InArgsAddr,
+                     DAG.getConstant(0, ValTy));
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO: Implement a generic logic using tblgen that can support this.
+// Mips O32 ABI rules:
+// ---
+// i32 - Passed in A0, A1, A2, A3 and stack
+// f32 - Only passed in f32 registers if no int reg has been used yet to hold
+//       an argument. Otherwise, passed in A1, A2, A3 and stack.
+// f64 - Only passed in two aliased f32 registers if no int reg has been used
+//       yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
+//       not used, it must be shadowed. If only A3 is avaiable, shadow it and
+//       go to stack.
+//
+//  For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
+//===----------------------------------------------------------------------===//
+
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
+                       MVT LocVT, CCValAssign::LocInfo LocInfo,
+                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  static const unsigned IntRegsSize=4, FloatRegsSize=2;
+
+  static const uint16_t IntRegs[] = {
+      Mips::A0, Mips::A1, Mips::A2, Mips::A3
+  };
+  static const uint16_t F32Regs[] = {
+      Mips::F12, Mips::F14
+  };
+  static const uint16_t F64Regs[] = {
+      Mips::D6, Mips::D7
+  };
+
+  // Do not process byval args here.
+  if (ArgFlags.isByVal())
+    return true;
+
+  // Promote i8 and i16
+  if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  unsigned Reg;
+
+  // f32 and f64 are allocated in A0, A1, A2, A3 when either of the following
+  // is true: function is vararg, argument is 3rd or higher, there is previous
+  // argument which is not f32 or f64.
+  bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1
+      || State.getFirstUnallocated(F32Regs, FloatRegsSize) != ValNo;
+  unsigned OrigAlign = ArgFlags.getOrigAlign();
+  bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
+
+  if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
+    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    // If this is the first part of an i64 arg,
+    // the allocated register must be either A0 or A2.
+    if (isI64 && (Reg == Mips::A1 || Reg == Mips::A3))
+      Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    LocVT = MVT::i32;
+  } else if (ValVT == MVT::f64 && AllocateFloatsInIntReg) {
+    // Allocate int register and shadow next int register. If first
+    // available register is Mips::A1 or Mips::A3, shadow it too.
+    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    if (Reg == Mips::A1 || Reg == Mips::A3)
+      Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    State.AllocateReg(IntRegs, IntRegsSize);
+    LocVT = MVT::i32;
+  } else if (ValVT.isFloatingPoint() && !AllocateFloatsInIntReg) {
+    // we are guaranteed to find an available float register
+    if (ValVT == MVT::f32) {
+      Reg = State.AllocateReg(F32Regs, FloatRegsSize);
+      // Shadow int register
+      State.AllocateReg(IntRegs, IntRegsSize);
+    } else {
+      Reg = State.AllocateReg(F64Regs, FloatRegsSize);
+      // Shadow int registers
+      unsigned Reg2 = State.AllocateReg(IntRegs, IntRegsSize);
+      if (Reg2 == Mips::A1 || Reg2 == Mips::A3)
+        State.AllocateReg(IntRegs, IntRegsSize);
+      State.AllocateReg(IntRegs, IntRegsSize);
+    }
+  } else
+    llvm_unreachable("Cannot handle this ValVT.");
+
+  if (!Reg) {
+    unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() >> 3,
+                                          OrigAlign);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  } else
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+
+  return false;
+}
+
+#include "MipsGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+static const unsigned O32IntRegsSize = 4;
+
+// Return next O32 integer argument register.
+static unsigned getNextIntArgReg(unsigned Reg) {
+  assert((Reg == Mips::A0) || (Reg == Mips::A2));
+  return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
+}
+
+SDValue
+MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
+                                   SDValue Chain, SDValue Arg, DebugLoc DL,
+                                   bool IsTailCall, SelectionDAG &DAG) const {
+  if (!IsTailCall) {
+    SDValue PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
+                                 DAG.getIntPtrConstant(Offset));
+    return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo(), false,
+                        false, 0);
+  }
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  int FI = MFI->CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false);
+  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+  return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(),
+                      /*isVolatile=*/ true, false, 0);
+}
+
+void MipsTargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  // Insert node "GP copy globalreg" before call to function.
+  //
+  // R_MIPS_CALL* operators (emitted when non-internal functions are called
+  // in PIC mode) allow symbols to be resolved via lazy binding.
+  // The lazy binding stub requires GP to point to the GOT.
+  if (IsPICCall && !InternalLinkage) {
+    unsigned GPReg = IsN64 ? Mips::GP_64 : Mips::GP;
+    EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
+    RegsToPass.push_back(std::make_pair(GPReg, getGlobalReg(CLI.DAG, Ty)));
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = CLI.DAG.getCopyToReg(Chain, CLI.DL, RegsToPass[i].first,
+                                 RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(CLI.DAG.getRegister(RegsToPass[i].first,
+                                      RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CLI.CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(CLI.DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+}
+
+/// LowerCall - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+SDValue
+MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                              SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &DL                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &IsTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool IsVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
+  bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+
+  MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
+                                 getTargetMachine().Options.UseSoftFloat,
+                                 Callee.getNode(), CLI.Args);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NextStackOffset = CCInfo.getNextStackOffset();
+
+  // Check if it's really possible to do a tail call.
+  if (IsTailCall)
+    IsTailCall =
+      isEligibleForTailCallOptimization(MipsCCInfo, NextStackOffset,
+                                        *MF.getInfo<MipsFunctionInfo>());
+
+  if (IsTailCall)
+    ++NumTailCalls;
+
+  // Chain is the output chain of the last Load/Store or CopyToReg node.
+  // ByValChain is the output chain of the last Memcpy node created for copying
+  // byval arguments to the stack.
+  unsigned StackAlignment = TFL->getStackAlignment();
+  NextStackOffset = RoundUpToAlignment(NextStackOffset, StackAlignment);
+  SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
+
+  if (!IsTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal);
+
+  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL,
+                                        IsN64 ? Mips::SP_64 : Mips::SP,
+                                        getPointerTy());
+
+  // With EABI is it possible to have 16 args on registers.
+  std::deque< std::pair<unsigned, SDValue> > RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin();
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    SDValue Arg = OutVals[i];
+    CCValAssign &VA = ArgLocs[i];
+    MVT ValVT = VA.getValVT(), LocVT = VA.getLocVT();
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // ByVal Arg.
+    if (Flags.isByVal()) {
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      assert(ByValArg != MipsCCInfo.byval_end());
+      assert(!IsTailCall &&
+             "Do not tail-call optimize if there is a byval argument.");
+      passByValArg(Chain, DL, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
+                   MipsCCInfo, *ByValArg, Flags, Subtarget->isLittle());
+      ++ByValArg;
+      continue;
+    }
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      if (VA.isRegLoc()) {
+        if ((ValVT == MVT::f32 && LocVT == MVT::i32) ||
+            (ValVT == MVT::f64 && LocVT == MVT::i64) ||
+            (ValVT == MVT::i64 && LocVT == MVT::f64))
+          Arg = DAG.getNode(ISD::BITCAST, DL, LocVT, Arg);
+        else if (ValVT == MVT::f64 && LocVT == MVT::i32) {
+          SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                                   Arg, DAG.getConstant(0, MVT::i32));
+          SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                                   Arg, DAG.getConstant(1, MVT::i32));
+          if (!Subtarget->isLittle())
+            std::swap(Lo, Hi);
+          unsigned LocRegLo = VA.getLocReg();
+          unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
+          RegsToPass.push_back(std::make_pair(LocRegLo, Lo));
+          RegsToPass.push_back(std::make_pair(LocRegHigh, Hi));
+          continue;
+        }
+      }
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, LocVT, Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, LocVT, Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, LocVT, Arg);
+      break;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+
+    // Register can't get to this point...
+    assert(VA.isMemLoc());
+
+    // emit ISD::STORE whichs stores the
+    // parameter value to a stack Location
+    MemOpChains.push_back(passArgOnStack(StackPtr, VA.getLocMemOffset(),
+                                         Chain, Arg, DL, IsTailCall, DAG));
+  }
+
+  // Transform all store nodes into one single node because all store
+  // nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  bool IsPICCall = (IsN64 || IsPIC); // true if calls are translated to jalr $25
+  bool GlobalOrExternal = false, InternalLinkage = false;
+  SDValue CalleeLo;
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    if (IsPICCall) {
+      InternalLinkage = G->getGlobal()->hasInternalLinkage();
+
+      if (InternalLinkage)
+        Callee = getAddrLocal(Callee, DAG, HasMips64);
+      else if (LargeGOT)
+        Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16,
+                                       MipsII::MO_CALL_LO16);
+      else
+        Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL);
+    } else
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
+                                          MipsII::MO_NO_FLAG);
+    GlobalOrExternal = true;
+  }
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    if (!IsN64 && !IsPIC) // !N64 && static
+      Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+                                            MipsII::MO_NO_FLAG);
+    else if (LargeGOT)
+      Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16,
+                                     MipsII::MO_CALL_LO16);
+    else // N64 || PIC
+      Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL);
+
+    GlobalOrExternal = true;
+  }
+
+  SmallVector<SDValue, 8> Ops(1, Chain);
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal, InternalLinkage,
+              CLI, Callee, Chain);
+
+  if (IsTailCall)
+    return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, &Ops[0], Ops.size());
+
+  Chain  = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, &Ops[0], Ops.size());
+  SDValue InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg,
+                         Ins, DL, DAG, InVals, CLI.Callee.getNode(), CLI.RetTy);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                    CallingConv::ID CallConv, bool IsVarArg,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc DL, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals,
+                                    const SDNode *CallNode,
+                                    const Type *RetTy) const {
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+
+  MipsCCInfo.analyzeCallResult(Ins, getTargetMachine().Options.UseSoftFloat,
+                               CallNode, RetTy);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, RVLocs[i].getLocReg(),
+                                     RVLocs[i].getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
+      Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getValVT(), Val);
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+/// LowerFormalArguments - transform physical registers into virtual registers
+/// and generate load operations for arguments places on the stack.
+SDValue
+MipsTargetLowering::LowerFormalArguments(SDValue Chain,
+                                         CallingConv::ID CallConv,
+                                         bool IsVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                         DebugLoc DL, SelectionDAG &DAG,
+                                         SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  MipsFI->setVarArgsFrameIndex(0);
+
+  // Used with vargs to acumulate store chains.
+  std::vector<SDValue> OutChains;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+  Function::const_arg_iterator FuncArg =
+    DAG.getMachineFunction().getFunction()->arg_begin();
+  bool UseSoftFloat = getTargetMachine().Options.UseSoftFloat;
+
+  MipsCCInfo.analyzeFormalArguments(Ins, UseSoftFloat, FuncArg);
+  MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
+                           MipsCCInfo.hasByValArg());
+
+  unsigned CurArgIdx = 0;
+  MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin();
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    std::advance(FuncArg, Ins[i].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[i].OrigArgIndex;
+    EVT ValVT = VA.getValVT();
+    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+    bool IsRegLoc = VA.isRegLoc();
+
+    if (Flags.isByVal()) {
+      assert(Flags.getByValSize() &&
+             "ByVal args of size 0 should have been ignored by front-end.");
+      assert(ByValArg != MipsCCInfo.byval_end());
+      copyByValRegs(Chain, DL, OutChains, DAG, Flags, InVals, &*FuncArg,
+                    MipsCCInfo, *ByValArg);
+      ++ByValArg;
+      continue;
+    }
+
+    // Arguments stored on registers
+    if (IsRegLoc) {
+      EVT RegVT = VA.getLocVT();
+      unsigned ArgReg = VA.getLocReg();
+      const TargetRegisterClass *RC;
+
+      if (RegVT == MVT::i32)
+        RC = Subtarget->inMips16Mode()? &Mips::CPU16RegsRegClass :
+                                        &Mips::CPURegsRegClass;
+      else if (RegVT == MVT::i64)
+        RC = &Mips::CPU64RegsRegClass;
+      else if (RegVT == MVT::f32)
+        RC = &Mips::FGR32RegClass;
+      else if (RegVT == MVT::f64)
+        RC = HasMips64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
+      else
+        llvm_unreachable("RegVT not supported by FormalArguments Lowering");
+
+      // Transform the arguments stored on
+      // physical registers into virtual ones
+      unsigned Reg = addLiveIn(DAG.getMachineFunction(), ArgReg, RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
+
+      // If this is an 8 or 16-bit value, it has been passed promoted
+      // to 32 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      if (VA.getLocInfo() != CCValAssign::Full) {
+        unsigned Opcode = 0;
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          Opcode = ISD::AssertSext;
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          Opcode = ISD::AssertZext;
+        if (Opcode)
+          ArgValue = DAG.getNode(Opcode, DL, RegVT, ArgValue,
+                                 DAG.getValueType(ValVT));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, DL, ValVT, ArgValue);
+      }
+
+      // Handle floating point arguments passed in integer registers and
+      // long double arguments passed in floating point registers.
+      if ((RegVT == MVT::i32 && ValVT == MVT::f32) ||
+          (RegVT == MVT::i64 && ValVT == MVT::f64) ||
+          (RegVT == MVT::f64 && ValVT == MVT::i64))
+        ArgValue = DAG.getNode(ISD::BITCAST, DL, ValVT, ArgValue);
+      else if (IsO32 && RegVT == MVT::i32 && ValVT == MVT::f64) {
+        unsigned Reg2 = addLiveIn(DAG.getMachineFunction(),
+                                  getNextIntArgReg(ArgReg), RC);
+        SDValue ArgValue2 = DAG.getCopyFromReg(Chain, DL, Reg2, RegVT);
+        if (!Subtarget->isLittle())
+          std::swap(ArgValue, ArgValue2);
+        ArgValue = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64,
+                               ArgValue, ArgValue2);
+      }
+
+      InVals.push_back(ArgValue);
+    } else { // VA.isRegLoc()
+
+      // sanity check
+      assert(VA.isMemLoc());
+
+      // The stack pointer offset is relative to the caller stack frame.
+      int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+                                      VA.getLocMemOffset(), true);
+
+      // Create load nodes to retrieve arguments from the stack
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      InVals.push_back(DAG.getLoad(ValVT, DL, Chain, FIN,
+                                   MachinePointerInfo::getFixedStack(FI),
+                                   false, false, false, 0));
+    }
+  }
+
+  // The mips ABIs for returning structs by value requires that we copy
+  // the sret argument into $v0 for the return. Save the argument into
+  // a virtual register so that we can access it from the return points.
+  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+    unsigned Reg = MipsFI->getSRetReturnReg();
+    if (!Reg) {
+      Reg = MF.getRegInfo().
+        createVirtualRegister(getRegClassFor(IsN64 ? MVT::i64 : MVT::i32));
+      MipsFI->setSRetReturnReg(Reg);
+    }
+    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[0]);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+  }
+
+  if (IsVarArg)
+    writeVarArgRegs(OutChains, MipsCCInfo, Chain, DL, DAG);
+
+  // All stores are grouped in one node to allow the matching between
+  // the size of Ins and InVals. This only happens when on varg functions
+  if (!OutChains.empty()) {
+    OutChains.push_back(Chain);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                        &OutChains[0], OutChains.size());
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool
+MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                   MachineFunction &MF, bool IsVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(),
+                 RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_Mips);
+}
+
+SDValue
+MipsTargetLowering::LowerReturn(SDValue Chain,
+                                CallingConv::ID CallConv, bool IsVarArg,
+                                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                const SmallVectorImpl<SDValue> &OutVals,
+                                DebugLoc DL, SelectionDAG &DAG) const {
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs,
+                 *DAG.getContext());
+  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+
+  // Analyze return values.
+  MipsCCInfo.analyzeReturn(Outs, getTargetMachine().Options.UseSoftFloat,
+                           MF.getFunction()->getReturnType());
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    SDValue Val = OutVals[i];
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    if (RVLocs[i].getValVT() != RVLocs[i].getLocVT())
+      Val = DAG.getNode(ISD::BITCAST, DL, RVLocs[i].getLocVT(), Val);
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // The mips ABIs for returning structs by value requires that we copy
+  // the sret argument into $v0 for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into $v0.
+  if (MF.getFunction()->hasStructRetAttr()) {
+    MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+    unsigned Reg = MipsFI->getSRetReturnReg();
+
+    if (!Reg)
+      llvm_unreachable("sret virtual register not created in the entry block");
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+    unsigned V0 = IsN64 ? Mips::V0_64 : Mips::V0;
+
+    Chain = DAG.getCopyToReg(Chain, DL, V0, Val, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(V0, getPointerTy()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  // Return on Mips is always a "jr $ra"
+  return DAG.getNode(MipsISD::Ret, DL, MVT::Other, &RetOps[0], RetOps.size());
+}
+
+//===----------------------------------------------------------------------===//
+//                           Mips Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+MipsTargetLowering::ConstraintType MipsTargetLowering::
+getConstraintType(const std::string &Constraint) const
+{
+  // Mips specific constrainy
+  // GCC config/mips/constraints.md
+  //
+  // 'd' : An address register. Equivalent to r
+  //       unless generating MIPS16 code.
+  // 'y' : Equivalent to r; retained for
+  //       backwards compatibility.
+  // 'c' : A register suitable for use in an indirect
+  //       jump. This will always be $25 for -mabicalls.
+  // 'l' : The lo register. 1 word storage.
+  // 'x' : The hilo register pair. Double word storage.
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+      default : break;
+      case 'd':
+      case 'y':
+      case 'f':
+      case 'c':
+      case 'l':
+      case 'x':
+        return C_RegisterClass;
+      case 'R':
+        return C_Memory;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+MipsTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'd':
+  case 'y':
+    if (type->isIntegerTy())
+      weight = CW_Register;
+    break;
+  case 'f':
+    if (type->isFloatTy())
+      weight = CW_Register;
+    break;
+  case 'c': // $25 for indirect jumps
+  case 'l': // lo register
+  case 'x': // hilo register pair
+      if (type->isIntegerTy())
+      weight = CW_SpecificReg;
+      break;
+  case 'I': // signed 16 bit immediate
+  case 'J': // integer zero
+  case 'K': // unsigned 16 bit immediate
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+  case 'O': // signed 15 bit immediate (+- 16383)
+  case 'P': // immediate in the range of 65535 to 1 (inclusive)
+    if (isa<ConstantInt>(CallOperandVal))
+      weight = CW_Constant;
+    break;
+  case 'R':
+    weight = CW_Memory;
+    break;
+  }
+  return weight;
+}
+
+/// Given a register class constraint, like 'r', if this corresponds directly
+/// to an LLVM register class, return a register of 0 and the register class
+/// pointer.
+std::pair<unsigned, const TargetRegisterClass*> MipsTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
+{
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
+    case 'y': // Same as 'r'. Exists for compatibility.
+    case 'r':
+      if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+        if (Subtarget->inMips16Mode())
+          return std::make_pair(0U, &Mips::CPU16RegsRegClass);
+        return std::make_pair(0U, &Mips::CPURegsRegClass);
+      }
+      if (VT == MVT::i64 && !HasMips64)
+        return std::make_pair(0U, &Mips::CPURegsRegClass);
+      if (VT == MVT::i64 && HasMips64)
+        return std::make_pair(0U, &Mips::CPU64RegsRegClass);
+      // This will generate an error message
+      return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
+    case 'f':
+      if (VT == MVT::f32)
+        return std::make_pair(0U, &Mips::FGR32RegClass);
+      if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
+        if (Subtarget->isFP64bit())
+          return std::make_pair(0U, &Mips::FGR64RegClass);
+        return std::make_pair(0U, &Mips::AFGR64RegClass);
+      }
+      break;
+    case 'c': // register suitable for indirect jump
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::T9, &Mips::CPURegsRegClass);
+      assert(VT == MVT::i64 && "Unexpected type.");
+      return std::make_pair((unsigned)Mips::T9_64, &Mips::CPU64RegsRegClass);
+    case 'l': // register suitable for indirect jump
+      if (VT == MVT::i32)
+        return std::make_pair((unsigned)Mips::LO, &Mips::HILORegClass);
+      return std::make_pair((unsigned)Mips::LO64, &Mips::HILO64RegClass);
+    case 'x': // register suitable for indirect jump
+      // Fixme: Not triggering the use of both hi and low
+      // This will generate an error message
+      return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break; // This will fall through to the generic implementation
+  case 'I': // Signed 16 bit constant
+    // If this fails, the parent routine will give an error
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if (isInt<16>(Val)) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'J': // integer zero
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getZExtValue();
+      if (Val == 0) {
+        Result = DAG.getTargetConstant(0, Type);
+        break;
+      }
+    }
+    return;
+  case 'K': // unsigned 16 bit immediate
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      uint64_t Val = (uint64_t)C->getZExtValue();
+      if (isUInt<16>(Val)) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'L': // signed 32 bit immediate where lower 16 bits are 0
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((isInt<32>(Val)) && ((Val & 0xffff) == 0)){
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'N': // immediate in the range of -65535 to -1 (inclusive)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((Val >= -65535) && (Val <= -1)) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'O': // signed 15 bit immediate
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((isInt<15>(Val))) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  case 'P': // immediate in the range of 1 to 65535 (inclusive)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      EVT Type = Op.getValueType();
+      int64_t Val = C->getSExtValue();
+      if ((Val <= 65535) && (Val >= 1)) {
+        Result = DAG.getTargetConstant(Val, Type);
+        break;
+      }
+    }
+    return;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+bool
+MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM, Type *Ty) const {
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  switch (AM.Scale) {
+  case 0: // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (!AM.HasBaseReg) // allow "r+i".
+      break;
+    return false; // disallow "r+r" or "r+r+i".
+  default:
+    return false;
+  }
+
+  return true;
+}
+
+bool
+MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Mips target isn't yet aware of offsets.
+  return false;
+}
+
+EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                            unsigned SrcAlign,
+                                            bool IsMemset, bool ZeroMemset,
+                                            bool MemcpyStrSrc,
+                                            MachineFunction &MF) const {
+  if (Subtarget->hasMips64())
+    return MVT::i64;
+
+  return MVT::i32;
+}
+
+bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return false;
+  if (Imm.isNegZero())
+    return false;
+  return Imm.isZero();
+}
+
+unsigned MipsTargetLowering::getJumpTableEncoding() const {
+  if (IsN64)
+    return MachineJumpTableInfo::EK_GPRel64BlockAddress;
+
+  return TargetLowering::getJumpTableEncoding();
+}
+
+/// This function returns true if CallSym is a long double emulation routine.
+static bool isF128SoftLibCall(const char *CallSym) {
+  const char *const LibCalls[] =
+    {"__addtf3", "__divtf3", "__eqtf2", "__extenddftf2", "__extendsftf2",
+     "__fixtfdi", "__fixtfsi", "__fixtfti", "__fixunstfdi", "__fixunstfsi",
+     "__fixunstfti", "__floatditf", "__floatsitf", "__floattitf",
+     "__floatunditf", "__floatunsitf", "__floatuntitf", "__getf2", "__gttf2",
+     "__letf2", "__lttf2", "__multf3", "__netf2", "__powitf2", "__subtf3",
+     "__trunctfdf2", "__trunctfsf2", "__unordtf2",
+     "ceill", "copysignl", "cosl", "exp2l", "expl", "floorl", "fmal", "fmodl",
+     "log10l", "log2l", "logl", "nearbyintl", "powl", "rintl", "sinl", "sqrtl",
+     "truncl"};
+
+  const char * const *End = LibCalls + array_lengthof(LibCalls);
+
+  // Check that LibCalls is sorted alphabetically.
+  MipsTargetLowering::LTStr Comp;
+
+#ifndef NDEBUG
+  for (const char * const *I = LibCalls; I < End - 1; ++I)
+    assert(Comp(*I, *(I + 1)));
+#endif
+
+  return std::binary_search(LibCalls, End, CallSym, Comp);
+}
+
+/// This function returns true if Ty is fp128 or i128 which was originally a
+/// fp128.
+static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) {
+  if (Ty->isFP128Ty())
+    return true;
+
+  const ExternalSymbolSDNode *ES =
+    dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode);
+
+  // If the Ty is i128 and the function being called is a long double emulation
+  // routine, then the original type is f128.
+  return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
+}
+
+MipsTargetLowering::MipsCC::MipsCC(CallingConv::ID CC, bool IsO32_,
+                                   CCState &Info)
+  : CCInfo(Info), CallConv(CC), IsO32(IsO32_) {
+  // Pre-allocate reserved argument area.
+  CCInfo.AllocateStack(reservedArgArea(), 1);
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args,
+                    bool IsVarArg, bool IsSoftFloat, const SDNode *CallNode,
+                    std::vector<ArgListEntry> &FuncArgs) {
+  assert((CallConv != CallingConv::Fast || !IsVarArg) &&
+         "CallingConv::Fast shouldn't be used for vararg functions.");
+
+  unsigned NumOpnds = Args.size();
+  llvm::CCAssignFn *FixedFn = fixedArgFn(), *VarFn = varArgFn();
+
+  for (unsigned I = 0; I != NumOpnds; ++I) {
+    MVT ArgVT = Args[I].VT;
+    ISD::ArgFlagsTy ArgFlags = Args[I].Flags;
+    bool R;
+
+    if (ArgFlags.isByVal()) {
+      handleByValArg(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags);
+      continue;
+    }
+
+    if (IsVarArg && !Args[I].IsFixed)
+      R = VarFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+    else {
+      MVT RegVT = getRegVT(ArgVT, FuncArgs[Args[I].OrigArgIndex].Ty, CallNode,
+                           IsSoftFloat);
+      R = FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo);
+    }
+
+    if (R) {
+#ifndef NDEBUG
+      dbgs() << "Call operand #" << I << " has unhandled type "
+             << EVT(ArgVT).getEVTString();
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args,
+                       bool IsSoftFloat, Function::const_arg_iterator FuncArg) {
+  unsigned NumArgs = Args.size();
+  llvm::CCAssignFn *FixedFn = fixedArgFn();
+  unsigned CurArgIdx = 0;
+
+  for (unsigned I = 0; I != NumArgs; ++I) {
+    MVT ArgVT = Args[I].VT;
+    ISD::ArgFlagsTy ArgFlags = Args[I].Flags;
+    std::advance(FuncArg, Args[I].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Args[I].OrigArgIndex;
+
+    if (ArgFlags.isByVal()) {
+      handleByValArg(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags);
+      continue;
+    }
+
+    MVT RegVT = getRegVT(ArgVT, FuncArg->getType(), 0, IsSoftFloat);
+
+    if (!FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo))
+      continue;
+
+#ifndef NDEBUG
+    dbgs() << "Formal Arg #" << I << " has unhandled type "
+           << EVT(ArgVT).getEVTString();
+#endif
+    llvm_unreachable(0);
+  }
+}
+
+template<typename Ty>
+void MipsTargetLowering::MipsCC::
+analyzeReturn(const SmallVectorImpl<Ty> &RetVals, bool IsSoftFloat,
+              const SDNode *CallNode, const Type *RetTy) const {
+  CCAssignFn *Fn;
+
+  if (IsSoftFloat && originalTypeIsF128(RetTy, CallNode))
+    Fn = RetCC_F128Soft;
+  else
+    Fn = RetCC_Mips;
+
+  for (unsigned I = 0, E = RetVals.size(); I < E; ++I) {
+    MVT VT = RetVals[I].VT;
+    ISD::ArgFlagsTy Flags = RetVals[I].Flags;
+    MVT RegVT = this->getRegVT(VT, RetTy, CallNode, IsSoftFloat);
+
+    if (Fn(I, VT, RegVT, CCValAssign::Full, Flags, this->CCInfo)) {
+#ifndef NDEBUG
+      dbgs() << "Call result #" << I << " has unhandled type "
+             << EVT(VT).getEVTString() << '\n';
+#endif
+      llvm_unreachable(0);
+    }
+  }
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsSoftFloat,
+                  const SDNode *CallNode, const Type *RetTy) const {
+  analyzeReturn(Ins, IsSoftFloat, CallNode, RetTy);
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsSoftFloat,
+              const Type *RetTy) const {
+  analyzeReturn(Outs, IsSoftFloat, 0, RetTy);
+}
+
+void
+MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
+                                           MVT LocVT,
+                                           CCValAssign::LocInfo LocInfo,
+                                           ISD::ArgFlagsTy ArgFlags) {
+  assert(ArgFlags.getByValSize() && "Byval argument's size shouldn't be 0.");
+
+  struct ByValArgInfo ByVal;
+  unsigned RegSize = regSize();
+  unsigned ByValSize = RoundUpToAlignment(ArgFlags.getByValSize(), RegSize);
+  unsigned Align = std::min(std::max(ArgFlags.getByValAlign(), RegSize),
+                            RegSize * 2);
+
+  if (useRegsForByval())
+    allocateRegs(ByVal, ByValSize, Align);
+
+  // Allocate space on caller's stack.
+  ByVal.Address = CCInfo.AllocateStack(ByValSize - RegSize * ByVal.NumRegs,
+                                       Align);
+  CCInfo.addLoc(CCValAssign::getMem(ValNo, ValVT, ByVal.Address, LocVT,
+                                    LocInfo));
+  ByValArgs.push_back(ByVal);
+}
+
+unsigned MipsTargetLowering::MipsCC::numIntArgRegs() const {
+  return IsO32 ? array_lengthof(O32IntRegs) : array_lengthof(Mips64IntRegs);
+}
+
+unsigned MipsTargetLowering::MipsCC::reservedArgArea() const {
+  return (IsO32 && (CallConv != CallingConv::Fast)) ? 16 : 0;
+}
+
+const uint16_t *MipsTargetLowering::MipsCC::intArgRegs() const {
+  return IsO32 ? O32IntRegs : Mips64IntRegs;
+}
+
+llvm::CCAssignFn *MipsTargetLowering::MipsCC::fixedArgFn() const {
+  if (CallConv == CallingConv::Fast)
+    return CC_Mips_FastCC;
+
+  return IsO32 ? CC_MipsO32 : CC_MipsN;
+}
+
+llvm::CCAssignFn *MipsTargetLowering::MipsCC::varArgFn() const {
+  return IsO32 ? CC_MipsO32 : CC_MipsN_VarArg;
+}
+
+const uint16_t *MipsTargetLowering::MipsCC::shadowRegs() const {
+  return IsO32 ? O32IntRegs : Mips64DPRegs;
+}
+
+void MipsTargetLowering::MipsCC::allocateRegs(ByValArgInfo &ByVal,
+                                              unsigned ByValSize,
+                                              unsigned Align) {
+  unsigned RegSize = regSize(), NumIntArgRegs = numIntArgRegs();
+  const uint16_t *IntArgRegs = intArgRegs(), *ShadowRegs = shadowRegs();
+  assert(!(ByValSize % RegSize) && !(Align % RegSize) &&
+         "Byval argument's size and alignment should be a multiple of"
+         "RegSize.");
+
+  ByVal.FirstIdx = CCInfo.getFirstUnallocated(IntArgRegs, NumIntArgRegs);
+
+  // If Align > RegSize, the first arg register must be even.
+  if ((Align > RegSize) && (ByVal.FirstIdx % 2)) {
+    CCInfo.AllocateReg(IntArgRegs[ByVal.FirstIdx], ShadowRegs[ByVal.FirstIdx]);
+    ++ByVal.FirstIdx;
+  }
+
+  // Mark the registers allocated.
+  for (unsigned I = ByVal.FirstIdx; ByValSize && (I < NumIntArgRegs);
+       ByValSize -= RegSize, ++I, ++ByVal.NumRegs)
+    CCInfo.AllocateReg(IntArgRegs[I], ShadowRegs[I]);
+}
+
+MVT MipsTargetLowering::MipsCC::getRegVT(MVT VT, const Type *OrigTy,
+                                         const SDNode *CallNode,
+                                         bool IsSoftFloat) const {
+  if (IsSoftFloat || IsO32)
+    return VT;
+
+  // Check if the original type was fp128.
+  if (originalTypeIsF128(OrigTy, CallNode)) {
+    assert(VT == MVT::i64);
+    return MVT::f64;
+  }
+
+  return VT;
+}
+
+void MipsTargetLowering::
+copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains,
+              SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
+              SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
+              const MipsCC &CC, const ByValArgInfo &ByVal) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned RegAreaSize = ByVal.NumRegs * CC.regSize();
+  unsigned FrameObjSize = std::max(Flags.getByValSize(), RegAreaSize);
+  int FrameObjOffset;
+
+  if (RegAreaSize)
+    FrameObjOffset = (int)CC.reservedArgArea() -
+      (int)((CC.numIntArgRegs() - ByVal.FirstIdx) * CC.regSize());
+  else
+    FrameObjOffset = ByVal.Address;
+
+  // Create frame object.
+  EVT PtrTy = getPointerTy();
+  int FI = MFI->CreateFixedObject(FrameObjSize, FrameObjOffset, true);
+  SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+  InVals.push_back(FIN);
+
+  if (!ByVal.NumRegs)
+    return;
+
+  // Copy arg registers.
+  MVT RegTy = MVT::getIntegerVT(CC.regSize() * 8);
+  const TargetRegisterClass *RC = getRegClassFor(RegTy);
+
+  for (unsigned I = 0; I < ByVal.NumRegs; ++I) {
+    unsigned ArgReg = CC.intArgRegs()[ByVal.FirstIdx + I];
+    unsigned VReg = addLiveIn(MF, ArgReg, RC);
+    unsigned Offset = I * CC.regSize();
+    SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
+                                   DAG.getConstant(Offset, PtrTy));
+    SDValue Store = DAG.getStore(Chain, DL, DAG.getRegister(VReg, RegTy),
+                                 StorePtr, MachinePointerInfo(FuncArg, Offset),
+                                 false, false, 0);
+    OutChains.push_back(Store);
+  }
+}
+
+// Copy byVal arg to registers and stack.
+void MipsTargetLowering::
+passByValArg(SDValue Chain, DebugLoc DL,
+             std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+             SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
+             MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
+             const MipsCC &CC, const ByValArgInfo &ByVal,
+             const ISD::ArgFlagsTy &Flags, bool isLittle) const {
+  unsigned ByValSize = Flags.getByValSize();
+  unsigned Offset = 0; // Offset in # of bytes from the beginning of struct.
+  unsigned RegSize = CC.regSize();
+  unsigned Alignment = std::min(Flags.getByValAlign(), RegSize);
+  EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSize * 8);
+
+  if (ByVal.NumRegs) {
+    const uint16_t *ArgRegs = CC.intArgRegs();
+    bool LeftoverBytes = (ByVal.NumRegs * RegSize > ByValSize);
+    unsigned I = 0;
+
+    // Copy words to registers.
+    for (; I < ByVal.NumRegs - LeftoverBytes; ++I, Offset += RegSize) {
+      SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                                    DAG.getConstant(Offset, PtrTy));
+      SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
+                                    MachinePointerInfo(), false, false, false,
+                                    Alignment);
+      MemOpChains.push_back(LoadVal.getValue(1));
+      unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I];
+      RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
+    }
+
+    // Return if the struct has been fully copied.
+    if (ByValSize == Offset)
+      return;
+
+    // Copy the remainder of the byval argument with sub-word loads and shifts.
+    if (LeftoverBytes) {
+      assert((ByValSize > Offset) && (ByValSize < Offset + RegSize) &&
+             "Size of the remainder should be smaller than RegSize.");
+      SDValue Val;
+
+      for (unsigned LoadSize = RegSize / 2, TotalSizeLoaded = 0;
+           Offset < ByValSize; LoadSize /= 2) {
+        unsigned RemSize = ByValSize - Offset;
+
+        if (RemSize < LoadSize)
+          continue;
+
+        // Load subword.
+        SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                                      DAG.getConstant(Offset, PtrTy));
+        SDValue LoadVal =
+          DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr,
+                         MachinePointerInfo(), MVT::getIntegerVT(LoadSize * 8),
+                         false, false, Alignment);
+        MemOpChains.push_back(LoadVal.getValue(1));
+
+        // Shift the loaded value.
+        unsigned Shamt;
+
+        if (isLittle)
+          Shamt = TotalSizeLoaded;
+        else
+          Shamt = (RegSize - (TotalSizeLoaded + LoadSize)) * 8;
+
+        SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal,
+                                    DAG.getConstant(Shamt, MVT::i32));
+
+        if (Val.getNode())
+          Val = DAG.getNode(ISD::OR, DL, RegTy, Val, Shift);
+        else
+          Val = Shift;
+
+        Offset += LoadSize;
+        TotalSizeLoaded += LoadSize;
+        Alignment = std::min(Alignment, LoadSize);
+      }
+
+      unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I];
+      RegsToPass.push_back(std::make_pair(ArgReg, Val));
+      return;
+    }
+  }
+
+  // Copy remainder of byval arg to it with memcpy.
+  unsigned MemCpySize = ByValSize - Offset;
+  SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+                            DAG.getConstant(Offset, PtrTy));
+  SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
+                            DAG.getIntPtrConstant(ByVal.Address));
+  Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
+                        DAG.getConstant(MemCpySize, PtrTy), Alignment,
+                        /*isVolatile=*/false, /*AlwaysInline=*/false,
+                        MachinePointerInfo(0), MachinePointerInfo(0));
+  MemOpChains.push_back(Chain);
+}
+
+void
+MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
+                                    const MipsCC &CC, SDValue Chain,
+                                    DebugLoc DL, SelectionDAG &DAG) const {
+  unsigned NumRegs = CC.numIntArgRegs();
+  const uint16_t *ArgRegs = CC.intArgRegs();
+  const CCState &CCInfo = CC.getCCInfo();
+  unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumRegs);
+  unsigned RegSize = CC.regSize();
+  MVT RegTy = MVT::getIntegerVT(RegSize * 8);
+  const TargetRegisterClass *RC = getRegClassFor(RegTy);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  // Offset of the first variable argument from stack pointer.
+  int VaArgOffset;
+
+  if (NumRegs == Idx)
+    VaArgOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), RegSize);
+  else
+    VaArgOffset =
+      (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
+
+  // Record the frame index of the first variable argument
+  // which is a value necessary to VASTART.
+  int FI = MFI->CreateFixedObject(RegSize, VaArgOffset, true);
+  MipsFI->setVarArgsFrameIndex(FI);
+
+  // Copy the integer registers that have not been used for argument passing
+  // to the argument register save area. For O32, the save area is allocated
+  // in the caller's stack frame, while for N32/64, it is allocated in the
+  // callee's stack frame.
+  for (unsigned I = Idx; I < NumRegs; ++I, VaArgOffset += RegSize) {
+    unsigned Reg = addLiveIn(MF, ArgRegs[I], RC);
+    SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
+    FI = MFI->CreateFixedObject(RegSize, VaArgOffset, true);
+    SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
+    SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
+                                 MachinePointerInfo(), false, false, 0);
+    cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(0);
+    OutChains.push_back(Store);
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
new file mode 100644
index 000000000000..cab71a61e07a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -0,0 +1,465 @@
+//===-- MipsISelLowering.h - Mips DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MipsISELLOWERING_H
+#define MipsISELLOWERING_H
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetLowering.h"
+#include <deque>
+#include <string>
+
+namespace llvm {
+  namespace MipsISD {
+    enum NodeType {
+      // Start the numbering from where ISD NodeType finishes.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      // Jump and link (call)
+      JmpLink,
+
+      // Tail call
+      TailCall,
+
+      // Get the Higher 16 bits from a 32-bit immediate
+      // No relation with Mips Hi register
+      Hi,
+
+      // Get the Lower 16 bits from a 32-bit immediate
+      // No relation with Mips Lo register
+      Lo,
+
+      // Handle gp_rel (small data/bss sections) relocation.
+      GPRel,
+
+      // Thread Pointer
+      ThreadPointer,
+
+      // Floating Point Branch Conditional
+      FPBrcond,
+
+      // Floating Point Compare
+      FPCmp,
+
+      // Floating Point Conditional Moves
+      CMovFP_T,
+      CMovFP_F,
+
+      // Floating Point Rounding
+      FPRound,
+
+      // Return
+      Ret,
+
+      EH_RETURN,
+
+      // Node used to extract integer from accumulator.
+      ExtractLOHI,
+
+      // Node used to insert integers to accumulator.
+      InsertLOHI,
+
+      // Mult nodes.
+      Mult,
+      Multu,
+
+      // MAdd/Sub nodes
+      MAdd,
+      MAddu,
+      MSub,
+      MSubu,
+
+      // DivRem(u)
+      DivRem,
+      DivRemU,
+      DivRem16,
+      DivRemU16,
+
+      BuildPairF64,
+      ExtractElementF64,
+
+      Wrapper,
+
+      DynAlloc,
+
+      Sync,
+
+      Ext,
+      Ins,
+
+      // EXTR.W instrinsic nodes.
+      EXTP,
+      EXTPDP,
+      EXTR_S_H,
+      EXTR_W,
+      EXTR_R_W,
+      EXTR_RS_W,
+      SHILO,
+      MTHLIP,
+
+      // DPA.W intrinsic nodes.
+      MULSAQ_S_W_PH,
+      MAQ_S_W_PHL,
+      MAQ_S_W_PHR,
+      MAQ_SA_W_PHL,
+      MAQ_SA_W_PHR,
+      DPAU_H_QBL,
+      DPAU_H_QBR,
+      DPSU_H_QBL,
+      DPSU_H_QBR,
+      DPAQ_S_W_PH,
+      DPSQ_S_W_PH,
+      DPAQ_SA_L_W,
+      DPSQ_SA_L_W,
+      DPA_W_PH,
+      DPS_W_PH,
+      DPAQX_S_W_PH,
+      DPAQX_SA_W_PH,
+      DPAX_W_PH,
+      DPSX_W_PH,
+      DPSQX_S_W_PH,
+      DPSQX_SA_W_PH,
+      MULSA_W_PH,
+
+      MULT,
+      MULTU,
+      MADD_DSP,
+      MADDU_DSP,
+      MSUB_DSP,
+      MSUBU_DSP,
+
+      // Load/Store Left/Right nodes.
+      LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      LWR,
+      SWL,
+      SWR,
+      LDL,
+      LDR,
+      SDL,
+      SDR
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  class MipsFunctionInfo;
+
+  class MipsTargetLowering : public TargetLowering  {
+  public:
+    explicit MipsTargetLowering(MipsTargetMachine &TM);
+
+    static const MipsTargetLowering *create(MipsTargetMachine &TM);
+
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+
+    virtual void LowerOperationWrapper(SDNode *N,
+                                       SmallVectorImpl<SDValue> &Results,
+                                       SelectionDAG &DAG) const;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG) const;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    //  DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getSetCCResultType - get the ISD::SETCC result ValueType
+    EVT getSetCCResultType(EVT VT) const;
+
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    virtual MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+    struct LTStr {
+      bool operator()(const char *S1, const char *S2) const {
+        return strcmp(S1, S2) < 0;
+      }
+    };
+
+  protected:
+    SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
+
+    SDValue getAddrLocal(SDValue Op, SelectionDAG &DAG, bool HasMips64) const;
+
+    SDValue getAddrGlobal(SDValue Op, SelectionDAG &DAG, unsigned Flag) const;
+
+    SDValue getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
+                                  unsigned HiFlag, unsigned LoFlag) const;
+
+    /// This function fills Ops, which is the list of operands that will later
+    /// be used when a function call node is created. It also generates
+    /// copyToReg nodes to set up argument registers.
+    virtual void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+
+    /// ByValArgInfo - Byval argument information.
+    struct ByValArgInfo {
+      unsigned FirstIdx; // Index of the first register used.
+      unsigned NumRegs;  // Number of registers used for this argument.
+      unsigned Address;  // Offset of the stack area used to pass this argument.
+
+      ByValArgInfo() : FirstIdx(0), NumRegs(0), Address(0) {}
+    };
+
+    /// MipsCC - This class provides methods used to analyze formal and call
+    /// arguments and inquire about calling convention information.
+    class MipsCC {
+    public:
+      MipsCC(CallingConv::ID CallConv, bool IsO32, CCState &Info);
+
+      void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               bool IsVarArg, bool IsSoftFloat,
+                               const SDNode *CallNode,
+                               std::vector<ArgListEntry> &FuncArgs);
+      void analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  bool IsSoftFloat,
+                                  Function::const_arg_iterator FuncArg);
+
+      void analyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+                             bool IsSoftFloat, const SDNode *CallNode,
+                             const Type *RetTy) const;
+
+      void analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                         bool IsSoftFloat, const Type *RetTy) const;
+
+      const CCState &getCCInfo() const { return CCInfo; }
+
+      /// hasByValArg - Returns true if function has byval arguments.
+      bool hasByValArg() const { return !ByValArgs.empty(); }
+
+      /// regSize - Size (in number of bits) of integer registers.
+      unsigned regSize() const { return IsO32 ? 4 : 8; }
+
+      /// numIntArgRegs - Number of integer registers available for calls.
+      unsigned numIntArgRegs() const;
+
+      /// reservedArgArea - The size of the area the caller reserves for
+      /// register arguments. This is 16-byte if ABI is O32.
+      unsigned reservedArgArea() const;
+
+      /// Return pointer to array of integer argument registers.
+      const uint16_t *intArgRegs() const;
+
+      typedef SmallVector<ByValArgInfo, 2>::const_iterator byval_iterator;
+      byval_iterator byval_begin() const { return ByValArgs.begin(); }
+      byval_iterator byval_end() const { return ByValArgs.end(); }
+
+    private:
+      void handleByValArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                          CCValAssign::LocInfo LocInfo,
+                          ISD::ArgFlagsTy ArgFlags);
+
+      /// useRegsForByval - Returns true if the calling convention allows the
+      /// use of registers to pass byval arguments.
+      bool useRegsForByval() const { return CallConv != CallingConv::Fast; }
+
+      /// Return the function that analyzes fixed argument list functions.
+      llvm::CCAssignFn *fixedArgFn() const;
+
+      /// Return the function that analyzes variable argument list functions.
+      llvm::CCAssignFn *varArgFn() const;
+
+      const uint16_t *shadowRegs() const;
+
+      void allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize,
+                        unsigned Align);
+
+      /// Return the type of the register which is used to pass an argument or
+      /// return a value. This function returns f64 if the argument is an i64
+      /// value which has been generated as a result of softening an f128 value.
+      /// Otherwise, it just returns VT.
+      MVT getRegVT(MVT VT, const Type *OrigTy, const SDNode *CallNode,
+                   bool IsSoftFloat) const;
+
+      template<typename Ty>
+      void analyzeReturn(const SmallVectorImpl<Ty> &RetVals, bool IsSoftFloat,
+                         const SDNode *CallNode, const Type *RetTy) const;
+
+      CCState &CCInfo;
+      CallingConv::ID CallConv;
+      bool IsO32;
+      SmallVector<ByValArgInfo, 2> ByValArgs;
+    };
+
+    // Subtarget Info
+    const MipsSubtarget *Subtarget;
+
+    bool HasMips64, IsN64, IsO32;
+
+  private:
+    // Lower Operand helpers
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals,
+                            const SDNode *CallNode, const Type *RetTy) const;
+
+    // Lower Operand specifics
+    SDValue lowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFABS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
+    SDValue lowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
+                                 bool IsSRA) const;
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerADD(SDValue Op, SelectionDAG &DAG) const;
+
+    /// isEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization.
+    virtual bool
+    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                      unsigned NextStackOffset,
+                                      const MipsFunctionInfo& FI) const = 0;
+
+    /// copyByValArg - Copy argument registers which were used to pass a byval
+    /// argument to the stack. Create a stack frame object for the byval
+    /// argument.
+    void copyByValRegs(SDValue Chain, DebugLoc DL,
+                       std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+                       const ISD::ArgFlagsTy &Flags,
+                       SmallVectorImpl<SDValue> &InVals,
+                       const Argument *FuncArg,
+                       const MipsCC &CC, const ByValArgInfo &ByVal) const;
+
+    /// passByValArg - Pass a byval argument in registers or on stack.
+    void passByValArg(SDValue Chain, DebugLoc DL,
+                      std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                      SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
+                      MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
+                      const MipsCC &CC, const ByValArgInfo &ByVal,
+                      const ISD::ArgFlagsTy &Flags, bool isLittle) const;
+
+    /// writeVarArgRegs - Write variable function arguments passed in registers
+    /// to the stack. Also create a stack frame object for the first variable
+    /// argument.
+    void writeVarArgRegs(std::vector<SDValue> &OutChains, const MipsCC &CC,
+                         SDValue Chain, DebugLoc DL, SelectionDAG &DAG) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain,
+                           SDValue Arg, DebugLoc DL, bool IsTailCall,
+                           SelectionDAG &DAG) const;
+
+    virtual SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual bool
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                     bool isVarArg,
+                     const SmallVectorImpl<ISD::OutputArg> &Outs,
+                     LLVMContext &Context) const;
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl, SelectionDAG &DAG) const;
+
+    // Inline asm support
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const;
+
+    std::pair<unsigned, const TargetRegisterClass*>
+              getRegForInlineAsmConstraint(const std::string &Constraint,
+              EVT VT) const;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    virtual void LowerAsmOperandForConstraint(SDValue Op,
+                                              std::string &Constraint,
+                                              std::vector<SDValue> &Ops,
+                                              SelectionDAG &DAG) const;
+
+    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
+
+    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                    unsigned SrcAlign,
+                                    bool IsMemset, bool ZeroMemset,
+                                    bool MemcpyStrSrc,
+                                    MachineFunction &MF) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+    virtual unsigned getJumpTableEncoding() const;
+
+    MachineBasicBlock *emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                    unsigned Size, unsigned BinOpcode, bool Nand = false) const;
+    MachineBasicBlock *emitAtomicBinaryPartword(MachineInstr *MI,
+                    MachineBasicBlock *BB, unsigned Size, unsigned BinOpcode,
+                    bool Nand = false) const;
+    MachineBasicBlock *emitAtomicCmpSwap(MachineInstr *MI,
+                                  MachineBasicBlock *BB, unsigned Size) const;
+    MachineBasicBlock *emitAtomicCmpSwapPartword(MachineInstr *MI,
+                                  MachineBasicBlock *BB, unsigned Size) const;
+  };
+
+  /// Create MipsTargetLowering objects.
+  const MipsTargetLowering *createMips16TargetLowering(MipsTargetMachine &TM);
+  const MipsTargetLowering *createMipsSETargetLowering(MipsTargetMachine &TM);
+}
+
+#endif // MipsISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
new file mode 100644
index 000000000000..6b23057c9cdb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -0,0 +1,529 @@
+//===-- MipsInstrFPU.td - Mips FPU Instruction Information -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Mips FPU instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+// ------------------------
+// * 64bit fp:
+//    - 32 64-bit registers (default mode)
+//    - 16 even 32-bit registers (32-bit compatible mode) for
+//      single and double access.
+// * 32bit fp:
+//    - 16 even 32-bit registers - single and double (aliased)
+//    - 32 32-bit registers (within single-only mode)
+//===----------------------------------------------------------------------===//
+
+// Floating Point Compare and Branch
+def SDT_MipsFPBrcond : SDTypeProfile<0, 2, [SDTCisInt<0>,
+                                            SDTCisVT<1, OtherVT>]>;
+def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>,
+                                         SDTCisVT<2, i32>]>;
+def SDT_MipsCMovFP : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                          SDTCisSameAs<1, 2>]>;
+def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
+                                                SDTCisVT<1, i32>,
+                                                SDTCisSameAs<1, 2>]>;
+def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                                     SDTCisVT<1, f64>,
+                                                     SDTCisVT<2, i32>]>;
+
+def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>;
+def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>;
+def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>;
+def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
+                          [SDNPHasChain, SDNPOptInGlue]>;
+def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
+def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64",
+                                   SDT_MipsExtractElementF64>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in
+  def condcode : Operand<i32>;
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+def IsFP64bit        : Predicate<"Subtarget.isFP64bit()">,
+                       AssemblerPredicate<"FeatureFP64Bit">;
+def NotFP64bit       : Predicate<"!Subtarget.isFP64bit()">,
+                       AssemblerPredicate<"!FeatureFP64Bit">;
+def IsSingleFloat    : Predicate<"Subtarget.isSingleFloat()">,
+                       AssemblerPredicate<"FeatureSingleFloat">;
+def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">,
+                       AssemblerPredicate<"!FeatureSingleFloat">;
+
+// FP immediate patterns.
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimm0neg : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//
+// A set of multiclasses is used to address the register usage.
+//
+// S32 - single precision in 16 32bit even fp registers
+//       single precision in 32 32bit fp registers in SingleOnly mode
+// S64 - single precision in 32 64bit fp registers (In64BitMode)
+// D32 - double precision in 16 32bit even fp registers
+// D64 - double precision in 32 64bit fp registers (In64BitMode)
+//
+// Only S32 and D32 are supported right now.
+//===----------------------------------------------------------------------===//
+
+class ADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin, bit IsComm,
+              SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fs, RC:$ft),
+         !strconcat(opstr, "\t$fd, $fs, $ft"),
+         [(set RC:$fd, (OpNode RC:$fs, RC:$ft))], Itin, FrmFR> {
+  let isCommutable = IsComm;
+}
+
+multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
+                  SDPatternOperator OpNode = null_frag> {
+  def _D32 : ADDS_FT<opstr, AFGR64, Itin, IsComm, OpNode>,
+             Requires<[NotFP64bit, HasStdEnc]>;
+  def _D64 : ADDS_FT<opstr, FGR64, Itin, IsComm, OpNode>,
+             Requires<[IsFP64bit, HasStdEnc]> {
+    string DecoderNamespace = "Mips64";
+  }
+}
+
+class ABSS_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$fd), (ins SrcRC:$fs), !strconcat(opstr, "\t$fd, $fs"),
+         [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR>,
+  NeverHasSideEffects;
+
+multiclass ABSS_M<string opstr, InstrItinClass Itin,
+                  SDPatternOperator OpNode= null_frag> {
+  def _D32 : ABSS_FT<opstr, AFGR64, AFGR64, Itin, OpNode>,
+             Requires<[NotFP64bit, HasStdEnc]>;
+  def _D64 : ABSS_FT<opstr, FGR64, FGR64, Itin, OpNode>,
+             Requires<[IsFP64bit, HasStdEnc]> {
+    string DecoderNamespace = "Mips64";
+  }
+}
+
+multiclass ROUND_M<string opstr, InstrItinClass Itin> {
+  def _D32 : ABSS_FT<opstr, FGR32, AFGR64, Itin>,
+             Requires<[NotFP64bit, HasStdEnc]>;
+  def _D64 : ABSS_FT<opstr, FGR32, FGR64, Itin>,
+             Requires<[IsFP64bit, HasStdEnc]> {
+    let DecoderNamespace = "Mips64";
+  }
+}
+
+class MFC1_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
+         [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR>;
+
+class MTC1_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
+         [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>;
+
+class MFC1_FT_CCR<string opstr, RegisterClass DstRC, RegisterOperand SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
+         [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR>;
+
+class MTC1_FT_CCR<string opstr, RegisterOperand DstRC, RegisterClass SrcRC,
+              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
+         [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>;
+
+class LW_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+            Operand MemOpnd, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI> {
+  let DecoderMethod = "DecodeFMem";
+}
+
+class SW_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+            Operand MemOpnd, SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI> {
+  let DecoderMethod = "DecodeFMem";
+}
+
+class MADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+               SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
+         !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
+         [(set RC:$fd, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr))], Itin, FrmFR>;
+
+class NMADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+                SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
+         !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
+         [(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))],
+         Itin, FrmFR>;
+
+class LWXC1_FT<string opstr, RegisterClass DRC, RegisterClass PRC,
+               InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs DRC:$fd), (ins PRC:$base, PRC:$index),
+         !strconcat(opstr, "\t$fd, ${index}(${base})"),
+         [(set DRC:$fd, (OpNode (add PRC:$base, PRC:$index)))], Itin, FrmFI> {
+  let AddedComplexity = 20;
+}
+
+class SWXC1_FT<string opstr, RegisterClass DRC, RegisterClass PRC,
+               InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs), (ins DRC:$fs, PRC:$base, PRC:$index),
+         !strconcat(opstr, "\t$fs, ${index}(${base})"),
+         [(OpNode DRC:$fs, (add PRC:$base, PRC:$index))], Itin, FrmFI> {
+  let AddedComplexity = 20;
+}
+
+class BC1F_FT<string opstr, InstrItinClass Itin,
+              SDPatternOperator Op = null_frag>  :
+  InstSE<(outs), (ins brtarget:$offset), !strconcat(opstr, "\t$offset"),
+         [(MipsFPBrcond Op, bb:$offset)], Itin, FrmFI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+  let Uses = [FCR31];
+}
+
+class CEQS_FT<string typestr, RegisterClass RC, InstrItinClass Itin,
+              SDPatternOperator OpNode = null_frag>  :
+  InstSE<(outs), (ins RC:$fs, RC:$ft, condcode:$cond),
+         !strconcat("c.$cond.", typestr, "\t$fs, $ft"),
+         [(OpNode RC:$fs, RC:$ft, imm:$cond)], Itin, FrmFR> {
+  let Defs = [FCR31];
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+//===----------------------------------------------------------------------===//
+def ROUND_W_S  : ABSS_FT<"round.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xc, 16>;
+def TRUNC_W_S  : ABSS_FT<"trunc.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xd, 16>;
+def CEIL_W_S   : ABSS_FT<"ceil.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xe, 16>;
+def FLOOR_W_S  : ABSS_FT<"floor.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xf, 16>;
+def CVT_W_S    : ABSS_FT<"cvt.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0x24, 16>;
+
+defm ROUND_W : ROUND_M<"round.w.d", IIFcvt>, ABSS_FM<0xc, 17>;
+defm TRUNC_W : ROUND_M<"trunc.w.d", IIFcvt>, ABSS_FM<0xd, 17>;
+defm CEIL_W  : ROUND_M<"ceil.w.d", IIFcvt>, ABSS_FM<0xe, 17>;
+defm FLOOR_W : ROUND_M<"floor.w.d", IIFcvt>, ABSS_FM<0xf, 17>;
+defm CVT_W   : ROUND_M<"cvt.w.d", IIFcvt>, ABSS_FM<0x24, 17>;
+
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+  def ROUND_L_S : ABSS_FT<"round.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x8, 16>;
+  def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64, FGR64, IIFcvt>,
+                    ABSS_FM<0x8, 17>;
+  def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x9, 16>;
+  def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64, FGR64, IIFcvt>,
+                    ABSS_FM<0x9, 17>;
+  def CEIL_L_S  : ABSS_FT<"ceil.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0xa, 16>;
+  def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64, FGR64, IIFcvt>, ABSS_FM<0xa, 17>;
+  def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0xb, 16>;
+  def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64, FGR64, IIFcvt>,
+                    ABSS_FM<0xb, 17>;
+}
+
+def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32, FGR32, IIFcvt>, ABSS_FM<0x20, 20>;
+def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x25, 16>;
+def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64, FGR64, IIFcvt>, ABSS_FM<0x25, 17>;
+
+let Predicates = [NotFP64bit, HasStdEnc] in {
+  def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32, AFGR64, IIFcvt>, ABSS_FM<0x20, 17>;
+  def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>;
+  def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 16>;
+}
+
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+ def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 17>;
+ def CVT_S_L   : ABSS_FT<"cvt.s.l", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 21>;
+ def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>;
+ def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 16>;
+ def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64, FGR64, IIFcvt>, ABSS_FM<0x21, 21>;
+}
+
+let Predicates = [NoNaNsFPMath, HasStdEnc] in {
+  def FABS_S : ABSS_FT<"abs.s", FGR32, FGR32, IIFcvt, fabs>, ABSS_FM<0x5, 16>;
+  def FNEG_S : ABSS_FT<"neg.s", FGR32, FGR32, IIFcvt, fneg>, ABSS_FM<0x7, 16>;
+  defm FABS : ABSS_M<"abs.d", IIFcvt, fabs>, ABSS_FM<0x5, 17>;
+  defm FNEG : ABSS_M<"neg.d", IIFcvt, fneg>, ABSS_FM<0x7, 17>;
+}
+
+def  FSQRT_S : ABSS_FT<"sqrt.s", FGR32, FGR32, IIFsqrtSingle, fsqrt>,
+               ABSS_FM<0x4, 16>;
+defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>;
+
+// The odd-numbered registers are only referenced when doing loads,
+// stores, and moves between floating-point and integer registers.
+// When defining instructions, we reference all 32-bit registers,
+// regardless of register aliasing.
+
+/// Move Control Registers From/To CPU Registers
+def CFC1 : MFC1_FT_CCR<"cfc1", CPURegs, CCROpnd, IIFmove>, MFC1_FM<2>;
+def CTC1 : MTC1_FT_CCR<"ctc1", CCROpnd, CPURegs, IIFmove>, MFC1_FM<6>;
+def MFC1 : MFC1_FT<"mfc1", CPURegs, FGR32, IIFmove, bitconvert>, MFC1_FM<0>;
+def MTC1 : MTC1_FT<"mtc1", FGR32, CPURegs, IIFmove, bitconvert>, MFC1_FM<4>;
+def DMFC1 : MFC1_FT<"dmfc1", CPU64Regs, FGR64, IIFmove, bitconvert>, MFC1_FM<1>;
+def DMTC1 : MTC1_FT<"dmtc1", FGR64, CPU64Regs, IIFmove, bitconvert>, MFC1_FM<5>;
+
+def FMOV_S   : ABSS_FT<"mov.s", FGR32, FGR32, IIFmove>, ABSS_FM<0x6, 16>;
+def FMOV_D32 : ABSS_FT<"mov.d", AFGR64, AFGR64, IIFmove>, ABSS_FM<0x6, 17>,
+               Requires<[NotFP64bit, HasStdEnc]>;
+def FMOV_D64 : ABSS_FT<"mov.d", FGR64, FGR64, IIFmove>, ABSS_FM<0x6, 17>,
+               Requires<[IsFP64bit, HasStdEnc]> {
+  let DecoderNamespace = "Mips64";
+}
+
+/// Floating Point Memory Instructions
+let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in {
+  def LWC1_P8 : LW_FT<"lwc1", FGR32, IILoad, mem64, load>, LW_FM<0x31>;
+  def SWC1_P8 : SW_FT<"swc1", FGR32, IIStore, mem64, store>, LW_FM<0x39>;
+  def LDC164_P8 : LW_FT<"ldc1", FGR64, IILoad, mem64, load>, LW_FM<0x35> {
+    let isCodeGenOnly =1;
+  }
+  def SDC164_P8 : SW_FT<"sdc1", FGR64, IIStore, mem64, store>, LW_FM<0x3d> {
+    let isCodeGenOnly =1;
+  }
+}
+
+let Predicates = [NotN64, HasStdEnc] in {
+  def LWC1 : LW_FT<"lwc1", FGR32, IILoad, mem, load>, LW_FM<0x31>;
+  def SWC1 : SW_FT<"swc1", FGR32, IIStore, mem, store>, LW_FM<0x39>;
+}
+
+let Predicates = [NotN64, HasMips64, HasStdEnc],
+  DecoderNamespace = "Mips64" in {
+  def LDC164 : LW_FT<"ldc1", FGR64, IILoad, mem, load>, LW_FM<0x35>;
+  def SDC164 : SW_FT<"sdc1", FGR64, IIStore, mem, store>, LW_FM<0x3d>;
+}
+
+let Predicates = [NotN64, NotMips64, HasStdEnc] in {
+  def LDC1 : LW_FT<"ldc1", AFGR64, IILoad, mem, load>, LW_FM<0x35>;
+  def SDC1 : SW_FT<"sdc1", AFGR64, IIStore, mem, store>, LW_FM<0x3d>;
+}
+
+// Indexed loads and stores.
+let Predicates = [HasFPIdx, HasStdEnc] in {
+  def LWXC1 : LWXC1_FT<"lwxc1", FGR32, CPURegs, IILoad, load>, LWXC1_FM<0>;
+  def SWXC1 : SWXC1_FT<"swxc1", FGR32, CPURegs, IIStore, store>, SWXC1_FM<8>;
+}
+
+let Predicates = [HasMips32r2, NotMips64, HasStdEnc] in {
+  def LDXC1 : LWXC1_FT<"ldxc1", AFGR64, CPURegs, IILoad, load>, LWXC1_FM<1>;
+  def SDXC1 : SWXC1_FT<"sdxc1", AFGR64, CPURegs, IIStore, store>, SWXC1_FM<9>;
+}
+
+let Predicates = [HasMips64, NotN64, HasStdEnc], DecoderNamespace="Mips64" in {
+  def LDXC164 : LWXC1_FT<"ldxc1", FGR64, CPURegs, IILoad, load>, LWXC1_FM<1>;
+  def SDXC164 : SWXC1_FT<"sdxc1", FGR64, CPURegs, IIStore, store>, SWXC1_FM<9>;
+}
+
+// n64
+let Predicates = [IsN64, HasStdEnc], isCodeGenOnly=1 in {
+  def LWXC1_P8 : LWXC1_FT<"lwxc1", FGR32, CPU64Regs, IILoad, load>, LWXC1_FM<0>;
+  def LDXC164_P8 : LWXC1_FT<"ldxc1", FGR64, CPU64Regs, IILoad, load>,
+                   LWXC1_FM<1>;
+  def SWXC1_P8 : SWXC1_FT<"swxc1", FGR32, CPU64Regs, IIStore, store>,
+                 SWXC1_FM<8>;
+  def SDXC164_P8 : SWXC1_FT<"sdxc1", FGR64, CPU64Regs, IIStore, store>,
+                   SWXC1_FM<9>;
+}
+
+// Load/store doubleword indexed unaligned.
+let Predicates = [NotMips64, HasStdEnc] in {
+  def LUXC1 : LWXC1_FT<"luxc1", AFGR64, CPURegs, IILoad>, LWXC1_FM<0x5>;
+  def SUXC1 : SWXC1_FT<"suxc1", AFGR64, CPURegs, IIStore>, SWXC1_FM<0xd>;
+}
+
+let Predicates = [HasMips64, HasStdEnc],
+  DecoderNamespace="Mips64" in {
+  def LUXC164 : LWXC1_FT<"luxc1", FGR64, CPURegs, IILoad>, LWXC1_FM<0x5>;
+  def SUXC164 : SWXC1_FT<"suxc1", FGR64, CPURegs, IIStore>, SWXC1_FM<0xd>;
+}
+
+/// Floating-point Aritmetic
+def FADD_S : ADDS_FT<"add.s", FGR32, IIFadd, 1, fadd>, ADDS_FM<0x00, 16>;
+defm FADD : ADDS_M<"add.d", IIFadd, 1, fadd>, ADDS_FM<0x00, 17>;
+def FDIV_S : ADDS_FT<"div.s", FGR32, IIFdivSingle, 0, fdiv>, ADDS_FM<0x03, 16>;
+defm FDIV : ADDS_M<"div.d", IIFdivDouble, 0, fdiv>, ADDS_FM<0x03, 17>;
+def FMUL_S : ADDS_FT<"mul.s", FGR32, IIFmulSingle, 1, fmul>, ADDS_FM<0x02, 16>;
+defm FMUL : ADDS_M<"mul.d", IIFmulDouble, 1, fmul>, ADDS_FM<0x02, 17>;
+def FSUB_S : ADDS_FT<"sub.s", FGR32, IIFadd, 0, fsub>, ADDS_FM<0x01, 16>;
+defm FSUB : ADDS_M<"sub.d", IIFadd, 0, fsub>, ADDS_FM<0x01, 17>;
+
+let Predicates = [HasMips32r2, HasStdEnc] in {
+  def MADD_S : MADDS_FT<"madd.s", FGR32, IIFmulSingle, fadd>, MADDS_FM<4, 0>;
+  def MSUB_S : MADDS_FT<"msub.s", FGR32, IIFmulSingle, fsub>, MADDS_FM<5, 0>;
+}
+
+let Predicates = [HasMips32r2, NoNaNsFPMath, HasStdEnc] in {
+  def NMADD_S : NMADDS_FT<"nmadd.s", FGR32, IIFmulSingle, fadd>, MADDS_FM<6, 0>;
+  def NMSUB_S : NMADDS_FT<"nmsub.s", FGR32, IIFmulSingle, fsub>, MADDS_FM<7, 0>;
+}
+
+let Predicates = [HasMips32r2, NotFP64bit, HasStdEnc] in {
+  def MADD_D32 : MADDS_FT<"madd.d", AFGR64, IIFmulDouble, fadd>, MADDS_FM<4, 1>;
+  def MSUB_D32 : MADDS_FT<"msub.d", AFGR64, IIFmulDouble, fsub>, MADDS_FM<5, 1>;
+}
+
+let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStdEnc] in {
+  def NMADD_D32 : NMADDS_FT<"nmadd.d", AFGR64, IIFmulDouble, fadd>,
+                  MADDS_FM<6, 1>;
+  def NMSUB_D32 : NMADDS_FT<"nmsub.d", AFGR64, IIFmulDouble, fsub>,
+                  MADDS_FM<7, 1>;
+}
+
+let Predicates = [HasMips32r2, IsFP64bit, HasStdEnc], isCodeGenOnly=1 in {
+  def MADD_D64 : MADDS_FT<"madd.d", FGR64, IIFmulDouble, fadd>, MADDS_FM<4, 1>;
+  def MSUB_D64 : MADDS_FT<"msub.d", FGR64, IIFmulDouble, fsub>, MADDS_FM<5, 1>;
+}
+
+let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc],
+    isCodeGenOnly=1 in {
+  def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64, IIFmulDouble, fadd>,
+                  MADDS_FM<6, 1>;
+  def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64, IIFmulDouble, fsub>,
+                  MADDS_FM<7, 1>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Branch Codes
+//===----------------------------------------------------------------------===//
+// Mips branch codes. These correspond to condcode in MipsInstrInfo.h.
+// They must be kept in synch.
+def MIPS_BRANCH_F  : PatLeaf<(i32 0)>;
+def MIPS_BRANCH_T  : PatLeaf<(i32 1)>;
+
+let DecoderMethod = "DecodeBC1" in {
+def BC1F : BC1F_FT<"bc1f", IIBranch, MIPS_BRANCH_F>, BC1F_FM<0, 0>;
+def BC1T : BC1F_FT<"bc1t", IIBranch, MIPS_BRANCH_T>, BC1F_FM<0, 1>;
+}
+//===----------------------------------------------------------------------===//
+// Floating Point Flag Conditions
+//===----------------------------------------------------------------------===//
+// Mips condition codes. They must correspond to condcode in MipsInstrInfo.h.
+// They must be kept in synch.
+def MIPS_FCOND_F    : PatLeaf<(i32 0)>;
+def MIPS_FCOND_UN   : PatLeaf<(i32 1)>;
+def MIPS_FCOND_OEQ  : PatLeaf<(i32 2)>;
+def MIPS_FCOND_UEQ  : PatLeaf<(i32 3)>;
+def MIPS_FCOND_OLT  : PatLeaf<(i32 4)>;
+def MIPS_FCOND_ULT  : PatLeaf<(i32 5)>;
+def MIPS_FCOND_OLE  : PatLeaf<(i32 6)>;
+def MIPS_FCOND_ULE  : PatLeaf<(i32 7)>;
+def MIPS_FCOND_SF   : PatLeaf<(i32 8)>;
+def MIPS_FCOND_NGLE : PatLeaf<(i32 9)>;
+def MIPS_FCOND_SEQ  : PatLeaf<(i32 10)>;
+def MIPS_FCOND_NGL  : PatLeaf<(i32 11)>;
+def MIPS_FCOND_LT   : PatLeaf<(i32 12)>;
+def MIPS_FCOND_NGE  : PatLeaf<(i32 13)>;
+def MIPS_FCOND_LE   : PatLeaf<(i32 14)>;
+def MIPS_FCOND_NGT  : PatLeaf<(i32 15)>;
+
+/// Floating Point Compare
+def FCMP_S32 : CEQS_FT<"s", FGR32, IIFcmp, MipsFPCmp>, CEQS_FM<16>;
+def FCMP_D32 : CEQS_FT<"d", AFGR64, IIFcmp, MipsFPCmp>, CEQS_FM<17>,
+               Requires<[NotFP64bit, HasStdEnc]>;
+let DecoderNamespace = "Mips64" in
+def FCMP_D64 : CEQS_FT<"d", FGR64, IIFcmp, MipsFPCmp>, CEQS_FM<17>,
+               Requires<[IsFP64bit, HasStdEnc]>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Pseudo-Instructions
+//===----------------------------------------------------------------------===//
+def MOVCCRToCCR : PseudoSE<(outs CCR:$dst), (ins CCROpnd:$src), []>;
+
+// This pseudo instr gets expanded into 2 mtc1 instrs after register
+// allocation.
+def BuildPairF64 :
+  PseudoSE<(outs AFGR64:$dst),
+           (ins CPURegs:$lo, CPURegs:$hi),
+           [(set AFGR64:$dst, (MipsBuildPairF64 CPURegs:$lo, CPURegs:$hi))]>;
+
+// This pseudo instr gets expanded into 2 mfc1 instrs after register
+// allocation.
+// if n is 0, lower part of src is extracted.
+// if n is 1, higher part of src is extracted.
+def ExtractElementF64 :
+  PseudoSE<(outs CPURegs:$dst), (ins AFGR64:$src, i32imm:$n),
+           [(set CPURegs:$dst, (MipsExtractElementF64 AFGR64:$src, imm:$n))]>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Patterns
+//===----------------------------------------------------------------------===//
+def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>;
+def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
+
+def : MipsPat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>;
+def : MipsPat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>;
+
+let Predicates = [NotFP64bit, HasStdEnc] in {
+  def : MipsPat<(f64 (sint_to_fp CPURegs:$src)),
+                (CVT_D32_W (MTC1 CPURegs:$src))>;
+  def : MipsPat<(i32 (fp_to_sint AFGR64:$src)),
+                (MFC1 (TRUNC_W_D32 AFGR64:$src))>;
+  def : MipsPat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>;
+  def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>;
+}
+
+let Predicates = [IsFP64bit, HasStdEnc] in {
+  def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>;
+  def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>;
+
+  def : MipsPat<(f64 (sint_to_fp CPURegs:$src)),
+                (CVT_D64_W (MTC1 CPURegs:$src))>;
+  def : MipsPat<(f32 (sint_to_fp CPU64Regs:$src)),
+                (CVT_S_L (DMTC1 CPU64Regs:$src))>;
+  def : MipsPat<(f64 (sint_to_fp CPU64Regs:$src)),
+                (CVT_D64_L (DMTC1 CPU64Regs:$src))>;
+
+  def : MipsPat<(i32 (fp_to_sint FGR64:$src)),
+                (MFC1 (TRUNC_W_D64 FGR64:$src))>;
+  def : MipsPat<(i64 (fp_to_sint FGR32:$src)), (DMFC1 (TRUNC_L_S FGR32:$src))>;
+  def : MipsPat<(i64 (fp_to_sint FGR64:$src)),
+                (DMFC1 (TRUNC_L_D64 FGR64:$src))>;
+
+  def : MipsPat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>;
+  def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>;
+}
+
+// Patterns for loads/stores with a reg+imm operand.
+let AddedComplexity = 40 in {
+  let Predicates = [IsN64, HasStdEnc] in {
+    def : LoadRegImmPat<LWC1_P8, f32, load>;
+    def : StoreRegImmPat<SWC1_P8, f32>;
+    def : LoadRegImmPat<LDC164_P8, f64, load>;
+    def : StoreRegImmPat<SDC164_P8, f64>;
+  }
+
+  let Predicates = [NotN64, HasStdEnc] in {
+    def : LoadRegImmPat<LWC1, f32, load>;
+    def : StoreRegImmPat<SWC1, f32>;
+  }
+
+  let Predicates = [NotN64, HasMips64, HasStdEnc] in {
+    def : LoadRegImmPat<LDC164, f64, load>;
+    def : StoreRegImmPat<SDC164, f64>;
+  }
+
+  let Predicates = [NotN64, NotMips64, HasStdEnc] in {
+    def : LoadRegImmPat<LDC1, f64, load>;
+    def : StoreRegImmPat<SDC1, f64>;
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
new file mode 100644
index 000000000000..ee432c875355
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td
@@ -0,0 +1,665 @@
+//===-- MipsInstrFormats.td - Mips Instruction Formats -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MIPS instructions format
+//
+//  CPU INSTRUCTION FORMATS
+//
+//  opcode  - operation code.
+//  rs      - src reg.
+//  rt      - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+//  rd      - dst reg, only used on 3 regs instr.
+//  shamt   - only used on shift instructions, contains the shift amount.
+//  funct   - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<4> val> {
+  bits<4> Value = val;
+}
+
+def Pseudo    : Format<0>;
+def FrmR      : Format<1>;
+def FrmI      : Format<2>;
+def FrmJ      : Format<3>;
+def FrmFR     : Format<4>;
+def FrmFI     : Format<5>;
+def FrmOther  : Format<6>; // Instruction w/ a custom format
+
+// Generic Mips Format
+class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+               InstrItinClass itin, Format f>: Instruction
+{
+  field bits<32> Inst;
+  Format Form = f;
+
+  let Namespace = "Mips";
+
+  let Size = 4;
+
+  bits<6> Opcode = 0;
+
+  // Top 6 bits are the 'opcode' field
+  let Inst{31-26} = Opcode;
+
+  let OutOperandList = outs;
+  let InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+
+  //
+  // Attributes specific to Mips instructions...
+  //
+  bits<4> FormBits = Form.Value;
+
+  // TSFlags layout should be kept in sync with MipsInstrInfo.h.
+  let TSFlags{3-0}   = FormBits;
+
+  let DecoderNamespace = "Mips";
+
+  field bits<32> SoftFail = 0;
+}
+
+// Mips32/64 Instruction Format
+class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern,
+             InstrItinClass itin, Format f>:
+  MipsInst<outs, ins, asmstr, pattern, itin, f> {
+  let Predicates = [HasStdEnc];
+}
+
+// Mips Pseudo Instructions Format
+class MipsPseudo<dag outs, dag ins, list<dag> pattern,
+                 InstrItinClass itin = IIPseudo> :
+  MipsInst<outs, ins, "", pattern, itin, Pseudo> {
+  let isCodeGenOnly = 1;
+  let isPseudo = 1;
+}
+
+// Mips32/64 Pseudo Instruction Format
+class PseudoSE<dag outs, dag ins, list<dag> pattern,
+               InstrItinClass itin = IIPseudo>:
+  MipsPseudo<outs, ins, pattern, itin> {
+  let Predicates = [HasStdEnc];
+}
+
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
+  MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> {
+  let isPseudo = 1;
+  let Pattern = [];
+}
+//===----------------------------------------------------------------------===//
+// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
+//===----------------------------------------------------------------------===//
+
+class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr,
+         list<dag> pattern, InstrItinClass itin>:
+  InstSE<outs, ins, asmstr, pattern, itin, FrmR>
+{
+  bits<5>  rd;
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<5>  shamt;
+  bits<6>  funct;
+
+  let Opcode = op;
+  let funct  = _funct;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = shamt;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|rs|rt|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+         InstrItinClass itin>: InstSE<outs, ins, asmstr, pattern, itin, FrmI>
+{
+  bits<5>  rt;
+  bits<5>  rs;
+  bits<16> imm16;
+
+  let Opcode = op;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
+                  list<dag> pattern, InstrItinClass itin>:
+  InstSE<outs, ins, asmstr, pattern, itin, FrmI>
+{
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> imm16;
+
+  let Opcode = op;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Format J instruction class in Mips : <|opcode|address|>
+//===----------------------------------------------------------------------===//
+
+class FJ<bits<6> op>
+{
+  bits<26> target;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-0}  = target;
+}
+
+//===----------------------------------------------------------------------===//
+// MFC instruction class in Mips : <|op|mf|rt|rd|0000000|sel|>
+//===----------------------------------------------------------------------===//
+class MFC3OP_FM<bits<6> op, bits<5> mfmt>
+{
+  bits<5> rt;
+  bits<5> rd;
+  bits<3> sel;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = mfmt;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-3}  = 0;
+  let Inst{2-0}   = sel;
+}
+
+class ADD_FM<bits<6> op, bits<6> funct> {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class ADDI_FM<bits<6> op> {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class SRA_FM<bits<6> funct, bit rotate> {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> shamt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-22} = 0;
+  let Inst{21}    = rotate;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = shamt;
+  let Inst{5-0}   = funct;
+}
+
+class SRLV_FM<bits<6> funct, bit rotate> {
+  bits<5> rd;
+  bits<5> rt;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-7}  = 0;
+  let Inst{6}     = rotate;
+  let Inst{5-0}   = funct;
+}
+
+class BEQ_FM<bits<6> op> {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZ_FM<bits<6> op, bits<5> funct> {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = funct;
+  let Inst{15-0}  = offset;
+}
+
+class B_FM {
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 4;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = 0;
+  let Inst{15-0}  = offset;
+}
+
+class SLTI_FM<bits<6> op> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class MFLO_FM<bits<6> funct> {
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class MTLO_FM<bits<6> funct> {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class SEB_FM<bits<5> funct, bits<6> funct2> {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1f;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = funct;
+  let Inst{5-0}   = funct2;
+}
+
+class CLO_FM<bits<6> funct> {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+  let rt = rd;
+}
+
+class LUI_FM {
+  bits<5> rt;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0xf;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = imm16;
+}
+
+class JALR_FM {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = 9;
+}
+
+class BAL_FM {
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 1;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = 0x11;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZAL_FM<bits<5> funct> {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 1;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = funct;
+  let Inst{15-0}  = offset;
+}
+
+class SYNC_FM {
+  bits<5> stype;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{10-6}  = stype;
+  let Inst{5-0}   = 0xf;
+}
+
+class MULT_FM<bits<6> op, bits<6> funct> {
+  bits<5>  rs;
+  bits<5>  rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class EXT_FM<bits<6> funct> {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> pos;
+  bits<5> size;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1f;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = size;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class RDHWR_FM {
+  bits<5> rt;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1f;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = 0x3b;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//  FLOATING POINT INSTRUCTION FORMATS
+//
+//  opcode  - operation code.
+//  fs      - src reg.
+//  ft      - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+//  fd      - dst reg, only used on 3 regs instr.
+//  fmt     - double or single precision.
+//  funct   - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Format FI instruction class in Mips : <|opcode|base|ft|immediate|>
+//===----------------------------------------------------------------------===//
+
+class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>:
+  InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmFI>
+{
+  bits<5>  ft;
+  bits<5>  base;
+  bits<16> imm16;
+
+  let Opcode = op;
+
+  let Inst{25-21} = base;
+  let Inst{20-16} = ft;
+  let Inst{15-0}  = imm16;
+}
+
+class ADDS_FM<bits<6> funct, bits<5> fmt> {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> ft;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class ABSS_FM<bits<6> funct, bits<5> fmt> {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = 0;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class MFC1_FM<bits<5> funct> {
+  bits<5> rt;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = fs;
+  let Inst{10-0}  = 0;
+}
+
+class LW_FM<bits<6> op> {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = addr{20-16};
+  let Inst{20-16} = rt;
+  let Inst{15-0}  = addr{15-0};
+}
+
+class MADDS_FM<bits<3> funct, bits<3> fmt> {
+  bits<5> fd;
+  bits<5> fr;
+  bits<5> fs;
+  bits<5> ft;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x13;
+  let Inst{25-21} = fr;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-3}   = funct;
+  let Inst{2-0}   = fmt;
+}
+
+class LWXC1_FM<bits<6> funct> {
+  bits<5> fd;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x13;
+  let Inst{25-21} = base;
+  let Inst{20-16} = index;
+  let Inst{15-11} = 0;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class SWXC1_FM<bits<6> funct> {
+  bits<5> fs;
+  bits<5> base;
+  bits<5> index;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x13;
+  let Inst{25-21} = base;
+  let Inst{20-16} = index;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+class BC1F_FM<bit nd, bit tf> {
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = 0x8;
+  let Inst{20-18} = 0; // cc
+  let Inst{17} = nd;
+  let Inst{16} = tf;
+  let Inst{15-0} = offset;
+}
+
+class CEQS_FM<bits<5> fmt> {
+  bits<5> fs;
+  bits<5> ft;
+  bits<4> cond;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-8} = 0; // cc
+  let Inst{7-4} = 0x3;
+  let Inst{3-0} = cond;
+}
+
+class CMov_I_F_FM<bits<6> funct, bits<5> fmt> {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = fs;
+  let Inst{10-6} = fd;
+  let Inst{5-0} = funct;
+}
+
+class CMov_F_I_FM<bit tf> {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-18} = 0; // cc
+  let Inst{17} = 0;
+  let Inst{16} = tf;
+  let Inst{15-11} = rd;
+  let Inst{10-6} = 0;
+  let Inst{5-0} = 1;
+}
+
+class CMov_F_F_FM<bits<5> fmt, bit tf> {
+  bits<5> fd;
+  bits<5> fs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x11;
+  let Inst{25-21} = fmt;
+  let Inst{20-18} = 0; // cc
+  let Inst{17} = 0;
+  let Inst{16} = tf;
+  let Inst{15-11} = fs;
+  let Inst{10-6} = fd;
+  let Inst{5-0} = 0x11;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
new file mode 100644
index 000000000000..ad92d41209e9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -0,0 +1,284 @@
+//===-- MipsInstrInfo.cpp - Mips Instruction Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_CTOR
+#include "MipsGenInstrInfo.inc"
+
+using namespace llvm;
+
+MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm, unsigned UncondBr)
+  : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+    TM(tm), UncondBrOpc(UncondBr) {}
+
+const MipsInstrInfo *MipsInstrInfo::create(MipsTargetMachine &TM) {
+  if (TM.getSubtargetImpl()->inMips16Mode())
+    return llvm::createMips16InstrInfo(TM);
+
+  return llvm::createMipsSEInstrInfo(TM);
+}
+
+bool MipsInstrInfo::isZeroImm(const MachineOperand &op) const {
+  return op.isImm() && op.getImm() == 0;
+}
+
+/// insertNoop - If data hazard condition is found insert the target nop
+/// instruction.
+void MipsInstrInfo::
+insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const
+{
+  DebugLoc DL;
+  BuildMI(MBB, MI, DL, get(Mips::NOP));
+}
+
+MachineMemOperand *MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
+                                                unsigned Flag) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), Flag,
+                                 MFI.getObjectSize(FI), Align);
+}
+
+MachineInstr*
+MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+                                        uint64_t Offset, const MDNode *MDPtr,
+                                        DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Mips::DBG_VALUE))
+    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+void MipsInstrInfo::AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
+                                  MachineBasicBlock *&BB,
+                                  SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(GetAnalyzableBrOpc(Opc) && "Not an analyzable branch");
+  int NumOp = Inst->getNumExplicitOperands();
+
+  // for both int and fp branches, the last explicit operand is the
+  // MBB.
+  BB = Inst->getOperand(NumOp-1).getMBB();
+  Cond.push_back(MachineOperand::CreateImm(Opc));
+
+  for (int i=0; i<NumOp-1; i++)
+    Cond.push_back(Inst->getOperand(i));
+}
+
+bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                  MachineBasicBlock *&TBB,
+                                  MachineBasicBlock *&FBB,
+                                  SmallVectorImpl<MachineOperand> &Cond,
+                                  bool AllowModify) const {
+  SmallVector<MachineInstr*, 2> BranchInstrs;
+  BranchType BT = AnalyzeBranch(MBB, TBB, FBB, Cond, AllowModify, BranchInstrs);
+
+  return (BT == BT_None) || (BT == BT_Indirect);
+}
+
+void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB,
+                                MachineBasicBlock *TBB, DebugLoc DL,
+                                const SmallVectorImpl<MachineOperand>& Cond)
+  const {
+  unsigned Opc = Cond[0].getImm();
+  const MCInstrDesc &MCID = get(Opc);
+  MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
+
+  for (unsigned i = 1; i < Cond.size(); ++i) {
+    if (Cond[i].isReg())
+      MIB.addReg(Cond[i].getReg());
+    else if (Cond[i].isImm())
+      MIB.addImm(Cond[i].getImm());
+    else
+       assert(true && "Cannot copy operand");
+  }
+  MIB.addMBB(TBB);
+}
+
+unsigned MipsInstrInfo::
+InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+             MachineBasicBlock *FBB,
+             const SmallVectorImpl<MachineOperand> &Cond,
+             DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+  // # of condition operands:
+  //  Unconditional branches: 0
+  //  Floating point branches: 1 (opc)
+  //  Int BranchZero: 2 (opc, reg)
+  //  Int Branch: 3 (opc, reg0, reg1)
+  assert((Cond.size() <= 3) &&
+         "# of Mips branch conditions must be <= 3!");
+
+  // Two-way Conditional branch.
+  if (FBB) {
+    BuildCondBr(MBB, TBB, DL, Cond);
+    BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(FBB);
+    return 2;
+  }
+
+  // One way branch.
+  // Unconditional branch.
+  if (Cond.empty())
+    BuildMI(&MBB, DL, get(UncondBrOpc)).addMBB(TBB);
+  else // Conditional branch.
+    BuildCondBr(MBB, TBB, DL, Cond);
+  return 1;
+}
+
+unsigned MipsInstrInfo::
+RemoveBranch(MachineBasicBlock &MBB) const
+{
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
+  MachineBasicBlock::reverse_iterator FirstBr;
+  unsigned removed;
+
+  // Skip all the debug instructions.
+  while (I != REnd && I->isDebugValue())
+    ++I;
+
+  FirstBr = I;
+
+  // Up to 2 branches are removed.
+  // Note that indirect branches are not removed.
+  for(removed = 0; I != REnd && removed < 2; ++I, ++removed)
+    if (!GetAnalyzableBrOpc(I->getOpcode()))
+      break;
+
+  MBB.erase(I.base(), FirstBr.base());
+
+  return removed;
+}
+
+/// ReverseBranchCondition - Return the inverse opcode of the
+/// specified Branch instruction.
+bool MipsInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
+{
+  assert( (Cond.size() && Cond.size() <= 3) &&
+          "Invalid Mips branch condition!");
+  Cond[0].setImm(GetOppositeBranchOpc(Cond[0].getImm()));
+  return false;
+}
+
+MipsInstrInfo::BranchType MipsInstrInfo::
+AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+              MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond,
+              bool AllowModify,
+              SmallVectorImpl<MachineInstr*> &BranchInstrs) const {
+
+  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
+
+  // Skip all the debug instructions.
+  while (I != REnd && I->isDebugValue())
+    ++I;
+
+  if (I == REnd || !isUnpredicatedTerminator(&*I)) {
+    // This block ends with no branches (it just falls through to its succ).
+    // Leave TBB/FBB null.
+    TBB = FBB = NULL;
+    return BT_NoBranch;
+  }
+
+  MachineInstr *LastInst = &*I;
+  unsigned LastOpc = LastInst->getOpcode();
+  BranchInstrs.push_back(LastInst);
+
+  // Not an analyzable branch (e.g., indirect jump).
+  if (!GetAnalyzableBrOpc(LastOpc))
+    return LastInst->isIndirectBranch() ? BT_Indirect : BT_None;
+
+  // Get the second to last instruction in the block.
+  unsigned SecondLastOpc = 0;
+  MachineInstr *SecondLastInst = NULL;
+
+  if (++I != REnd) {
+    SecondLastInst = &*I;
+    SecondLastOpc = GetAnalyzableBrOpc(SecondLastInst->getOpcode());
+
+    // Not an analyzable branch (must be an indirect jump).
+    if (isUnpredicatedTerminator(SecondLastInst) && !SecondLastOpc)
+      return BT_None;
+  }
+
+  // If there is only one terminator instruction, process it.
+  if (!SecondLastOpc) {
+    // Unconditional branch
+    if (LastOpc == UncondBrOpc) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return BT_Uncond;
+    }
+
+    // Conditional branch
+    AnalyzeCondBr(LastInst, LastOpc, TBB, Cond);
+    return BT_Cond;
+  }
+
+  // If we reached here, there are two branches.
+  // If there are three terminators, we don't know what sort of block this is.
+  if (++I != REnd && isUnpredicatedTerminator(&*I))
+    return BT_None;
+
+  BranchInstrs.insert(BranchInstrs.begin(), SecondLastInst);
+
+  // If second to last instruction is an unconditional branch,
+  // analyze it and remove the last instruction.
+  if (SecondLastOpc == UncondBrOpc) {
+    // Return if the last instruction cannot be removed.
+    if (!AllowModify)
+      return BT_None;
+
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    LastInst->eraseFromParent();
+    BranchInstrs.pop_back();
+    return BT_Uncond;
+  }
+
+  // Conditional branch followed by an unconditional branch.
+  // The last one must be unconditional.
+  if (LastOpc != UncondBrOpc)
+    return BT_None;
+
+  AnalyzeCondBr(SecondLastInst, SecondLastOpc, TBB, Cond);
+  FBB = LastInst->getOperand(0).getMBB();
+
+  return BT_CondUncond;
+}
+
+/// Return the number of bytes of code the specified instruction may be.
+unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    return MI->getDesc().getSize();
+  case  TargetOpcode::INLINEASM: {       // Inline Asm: Variable size.
+    const MachineFunction *MF = MI->getParent()->getParent();
+    const char *AsmStr = MI->getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
new file mode 100644
index 000000000000..8c05d97beac2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -0,0 +1,142 @@
+//===-- MipsInstrInfo.h - Mips Instruction Information ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSINSTRUCTIONINFO_H
+#define MIPSINSTRUCTIONINFO_H
+
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "MipsGenInstrInfo.inc"
+
+namespace llvm {
+
+class MipsInstrInfo : public MipsGenInstrInfo {
+protected:
+  MipsTargetMachine &TM;
+  unsigned UncondBrOpc;
+
+public:
+  enum BranchType {
+    BT_None,       // Couldn't analyze branch.
+    BT_NoBranch,   // No branches found.
+    BT_Uncond,     // One unconditional branch.
+    BT_Cond,       // One conditional branch.
+    BT_CondUncond, // A conditional branch followed by an unconditional branch.
+    BT_Indirect    // One indirct branch.
+  };
+
+  explicit MipsInstrInfo(MipsTargetMachine &TM, unsigned UncondBrOpc);
+
+  static const MipsInstrInfo *create(MipsTargetMachine &TM);
+
+  /// Branch Analysis
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+
+  virtual
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  BranchType AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                           MachineBasicBlock *&FBB,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           bool AllowModify,
+                           SmallVectorImpl<MachineInstr*> &BranchInstrs) const;
+
+  virtual MachineInstr* emitFrameIndexDebugValue(MachineFunction &MF,
+                                                 int FrameIx, uint64_t Offset,
+                                                 const MDNode *MDPtr,
+                                                 DebugLoc DL) const;
+
+  /// Insert nop instruction when hazard condition is found
+  virtual void insertNoop(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI) const;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MipsRegisterInfo &getRegisterInfo() const = 0;
+
+  virtual unsigned GetOppositeBranchOpc(unsigned Opc) const = 0;
+
+  /// Return the number of bytes of code the specified instruction may be.
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const {
+    storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0);
+  }
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const {
+    loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0);
+  }
+
+  virtual void storeRegToStack(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI,
+                               unsigned SrcReg, bool isKill, int FrameIndex,
+                               const TargetRegisterClass *RC,
+                               const TargetRegisterInfo *TRI,
+                               int64_t Offset) const = 0;
+
+  virtual void loadRegFromStack(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                unsigned DestReg, int FrameIndex,
+                                const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI,
+                                int64_t Offset) const = 0;
+
+protected:
+  bool isZeroImm(const MachineOperand &op) const;
+
+  MachineMemOperand *GetMemOperand(MachineBasicBlock &MBB, int FI,
+                                   unsigned Flag) const;
+
+private:
+  virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const = 0;
+
+  void AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
+                     MachineBasicBlock *&BB,
+                     SmallVectorImpl<MachineOperand> &Cond) const;
+
+  void BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB, DebugLoc DL,
+                   const SmallVectorImpl<MachineOperand>& Cond) const;
+};
+
+/// Create MipsInstrInfo objects.
+const MipsInstrInfo *createMips16InstrInfo(MipsTargetMachine &TM);
+const MipsInstrInfo *createMipsSEInstrInfo(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
new file mode 100644
index 000000000000..3a82e8171301
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -0,0 +1,1328 @@
+//===- MipsInstrInfo.td - Target Description for Mips Target -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// Mips profiles and nodes
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsJmpLink      : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+def SDT_MipsCMov         : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
+                                                SDTCisSameAs<1, 2>,
+                                                SDTCisSameAs<3, 4>,
+                                                SDTCisInt<4>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MipsCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+def SDT_ExtractLOHI : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVT<1, untyped>,
+                                           SDTCisVT<2, i32>]>;
+def SDT_InsertLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+                                          SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsMultDiv : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisInt<1>,
+                                    SDTCisSameAs<1, 2>]>;
+def SDT_MipsMAddMSub : SDTypeProfile<1, 3,
+                                     [SDTCisVT<0, untyped>, SDTCisSameAs<0, 3>,
+                                      SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+def SDT_MipsDivRem16 : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
+
+def SDT_MipsThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
+def SDT_Sync             : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def SDT_Ext : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                   SDTCisVT<2, i32>, SDTCisSameAs<2, 3>]>;
+def SDT_Ins : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                   SDTCisVT<2, i32>, SDTCisSameAs<2, 3>,
+                                   SDTCisSameAs<0, 4>]>;
+
+def SDTMipsLoadLR  : SDTypeProfile<1, 2,
+                                   [SDTCisInt<0>, SDTCisPtrTy<1>,
+                                    SDTCisSameAs<0, 2>]>;
+
+// Call
+def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
+                         [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                          SDNPVariadic]>;
+
+// Tail call
+def MipsTailCall : SDNode<"MipsISD::TailCall", SDT_MipsJmpLink,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// Hi and Lo nodes are used to handle global addresses. Used on
+// MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol
+// static model. (nothing to do with Mips Registers Hi and Lo)
+def MipsHi    : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
+def MipsLo    : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
+def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
+
+// TlsGd node is used to handle General Dynamic TLS
+def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
+
+// TprelHi and TprelLo nodes are used to handle Local Exec TLS
+def MipsTprelHi    : SDNode<"MipsISD::TprelHi", SDTIntUnaryOp>;
+def MipsTprelLo    : SDNode<"MipsISD::TprelLo", SDTIntUnaryOp>;
+
+// Thread pointer
+def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>;
+
+// Return
+def MipsRet : SDNode<"MipsISD::Ret", SDTNone,
+                     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
+                           [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
+                           [SDNPHasChain, SDNPSideEffect,
+                            SDNPOptInGlue, SDNPOutGlue]>;
+
+// Node used to extract integer from LO/HI register.
+def ExtractLOHI : SDNode<"MipsISD::ExtractLOHI", SDT_ExtractLOHI>;
+
+// Node used to insert 32-bit integers to LOHI register pair.
+def InsertLOHI : SDNode<"MipsISD::InsertLOHI", SDT_InsertLOHI>;
+
+// Mult nodes.
+def MipsMult  : SDNode<"MipsISD::Mult", SDT_MipsMultDiv>;
+def MipsMultu : SDNode<"MipsISD::Multu", SDT_MipsMultDiv>;
+
+// MAdd*/MSub* nodes
+def MipsMAdd  : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub>;
+def MipsMAddu : SDNode<"MipsISD::MAddu", SDT_MipsMAddMSub>;
+def MipsMSub  : SDNode<"MipsISD::MSub", SDT_MipsMAddMSub>;
+def MipsMSubu : SDNode<"MipsISD::MSubu", SDT_MipsMAddMSub>;
+
+// DivRem(u) nodes
+def MipsDivRem    : SDNode<"MipsISD::DivRem", SDT_MipsMultDiv>;
+def MipsDivRemU   : SDNode<"MipsISD::DivRemU", SDT_MipsMultDiv>;
+def MipsDivRem16  : SDNode<"MipsISD::DivRem16", SDT_MipsDivRem16, [SDNPOutGlue]>;
+def MipsDivRemU16 : SDNode<"MipsISD::DivRemU16", SDT_MipsDivRem16,
+                           [SDNPOutGlue]>;
+
+// Target constant nodes that are not part of any isel patterns and remain
+// unchanged can cause instructions with illegal operands to be emitted.
+// Wrapper node patterns give the instruction selector a chance to replace
+// target constant nodes that would otherwise remain unchanged with ADDiu
+// nodes. Without these wrapper node patterns, the following conditional move
+// instrucion is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
+// compiled:
+//  movn  %got(d)($gp), %got(c)($gp), $4
+// This instruction is illegal since movn can take only register operands.
+
+def MipsWrapper    : SDNode<"MipsISD::Wrapper", SDTIntBinOp>;
+
+def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
+
+def MipsExt :  SDNode<"MipsISD::Ext", SDT_Ext>;
+def MipsIns :  SDNode<"MipsISD::Ins", SDT_Ins>;
+
+def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLWR : SDNode<"MipsISD::LWR", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSWL : SDNode<"MipsISD::SWL", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSWR : SDNode<"MipsISD::SWR", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsLDL : SDNode<"MipsISD::LDL", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsLDR : SDNode<"MipsISD::LDR", SDTMipsLoadLR,
+                     [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def MipsSDL : SDNode<"MipsISD::SDL", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def MipsSDR : SDNode<"MipsISD::SDR", SDTStore,
+                     [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+def HasSEInReg  :     Predicate<"Subtarget.hasSEInReg()">,
+                      AssemblerPredicate<"FeatureSEInReg">;
+def HasBitCount :     Predicate<"Subtarget.hasBitCount()">,
+                      AssemblerPredicate<"FeatureBitCount">;
+def HasSwap     :     Predicate<"Subtarget.hasSwap()">,
+                      AssemblerPredicate<"FeatureSwap">;
+def HasCondMov  :     Predicate<"Subtarget.hasCondMov()">,
+                      AssemblerPredicate<"FeatureCondMov">;
+def HasFPIdx    :     Predicate<"Subtarget.hasFPIdx()">,
+                      AssemblerPredicate<"FeatureFPIdx">;
+def HasMips32    :    Predicate<"Subtarget.hasMips32()">,
+                      AssemblerPredicate<"FeatureMips32">;
+def HasMips32r2  :    Predicate<"Subtarget.hasMips32r2()">,
+                      AssemblerPredicate<"FeatureMips32r2">;
+def HasMips64    :    Predicate<"Subtarget.hasMips64()">,
+                      AssemblerPredicate<"FeatureMips64">;
+def NotMips64    :    Predicate<"!Subtarget.hasMips64()">,
+                      AssemblerPredicate<"!FeatureMips64">;
+def HasMips64r2  :    Predicate<"Subtarget.hasMips64r2()">,
+                      AssemblerPredicate<"FeatureMips64r2">;
+def IsN64       :     Predicate<"Subtarget.isABI_N64()">,
+                      AssemblerPredicate<"FeatureN64">;
+def NotN64      :     Predicate<"!Subtarget.isABI_N64()">,
+                      AssemblerPredicate<"!FeatureN64">;
+def InMips16Mode :    Predicate<"Subtarget.inMips16Mode()">,
+                      AssemblerPredicate<"FeatureMips16">;
+def RelocStatic :     Predicate<"TM.getRelocationModel() == Reloc::Static">,
+                      AssemblerPredicate<"FeatureMips32">;
+def RelocPIC    :     Predicate<"TM.getRelocationModel() == Reloc::PIC_">,
+                      AssemblerPredicate<"FeatureMips32">;
+def NoNaNsFPMath :    Predicate<"TM.Options.NoNaNsFPMath">,
+                      AssemblerPredicate<"FeatureMips32">;
+def HasStdEnc :       Predicate<"Subtarget.hasStandardEncoding()">,
+                      AssemblerPredicate<"!FeatureMips16">;
+
+class MipsPat<dag pattern, dag result> : Pat<pattern, result> {
+  let Predicates = [HasStdEnc];
+}
+
+class IsCommutable {
+  bit isCommutable = 1;
+}
+
+class IsBranch {
+  bit isBranch = 1;
+}
+
+class IsReturn {
+  bit isReturn = 1;
+}
+
+class IsCall {
+  bit isCall = 1;
+}
+
+class IsTailCall {
+  bit isCall = 1;
+  bit isTerminator = 1;
+  bit isReturn = 1;
+  bit isBarrier = 1;
+  bit hasExtraSrcRegAllocReq = 1;
+  bit isCodeGenOnly = 1;
+}
+
+class IsAsCheapAsAMove {
+  bit isAsCheapAsAMove = 1;
+}
+
+class NeverHasSideEffects {
+  bit neverHasSideEffects = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// Instruction operand types
+def jmptarget   : Operand<OtherVT> {
+  let EncoderMethod = "getJumpTargetOpValue";
+}
+def brtarget    : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget";
+}
+def calltarget  : Operand<iPTR> {
+  let EncoderMethod = "getJumpTargetOpValue";
+}
+def calltarget64: Operand<i64>;
+def simm16      : Operand<i32> {
+  let DecoderMethod= "DecodeSimm16";
+}
+
+def simm20      : Operand<i32> {
+}
+
+def simm16_64   : Operand<i64>;
+def shamt       : Operand<i32>;
+
+// Unsigned Operand
+def uimm16      : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+def MipsMemAsmOperand : AsmOperandClass {
+  let Name = "Mem";
+  let ParserMethod = "parseMemOperand";
+}
+
+// Address operand
+def mem : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops CPURegs, simm16);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def mem64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops CPU64Regs, simm16_64);
+  let EncoderMethod = "getMemEncoding";
+  let ParserMatchClass = MipsMemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def mem_ea : Operand<i32> {
+  let PrintMethod = "printMemOperandEA";
+  let MIOperandInfo = (ops CPURegs, simm16);
+  let EncoderMethod = "getMemEncoding";
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def mem_ea_64 : Operand<i64> {
+  let PrintMethod = "printMemOperandEA";
+  let MIOperandInfo = (ops CPU64Regs, simm16_64);
+  let EncoderMethod = "getMemEncoding";
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// size operand of ext instruction
+def size_ext : Operand<i32> {
+  let EncoderMethod = "getSizeExtEncoding";
+  let DecoderMethod = "DecodeExtSize";
+}
+
+// size operand of ins instruction
+def size_ins : Operand<i32> {
+  let EncoderMethod = "getSizeInsEncoding";
+  let DecoderMethod = "DecodeInsSize";
+}
+
+// Transformation Function - get the lower 16 bits.
+def LO16 : SDNodeXForm<imm, [{
+  return getImm(N, N->getZExtValue() & 0xFFFF);
+}]>;
+
+// Transformation Function - get the higher 16 bits.
+def HI16 : SDNodeXForm<imm, [{
+  return getImm(N, (N->getZExtValue() >> 16) & 0xFFFF);
+}]>;
+
+// Plus 1.
+def Plus1 : SDNodeXForm<imm, [{ return getImm(N, N->getSExtValue() + 1); }]>;
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt8  : PatLeaf<(imm), [{ return isInt<8>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt16  : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 15-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt15  : PatLeaf<(imm), [{ return isInt<15>(N->getSExtValue()); }]>;
+
+// Node immediate fits as 16-bit zero extended on target immediate.
+// The LO16 param means that only the lower 16 bits of the node
+// immediate are caught.
+// e.g. addiu, sltiu
+def immZExt16  : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i32)
+    return (uint32_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+  else
+    return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// Immediate can be loaded with LUi (32-bit int with lower 16-bit cleared).
+def immLow16Zero : PatLeaf<(imm), [{
+  int64_t Val = N->getSExtValue();
+  return isInt<32>(Val) && !(Val & 0xffff);
+}]>;
+
+// shamt field must fit in 5 bits.
+def immZExt5 : ImmLeaf<i32, [{return Imm == (Imm & 0x1f);}]>;
+
+// True if (N + 1) fits in 16-bit field.
+def immSExt16Plus1 : PatLeaf<(imm), [{
+  return isInt<17>(N->getSExtValue()) && isInt<16>(N->getSExtValue() + 1);
+}]>;
+
+// Mips Address Mode! SDNode frameindex could possibily be a match
+// since load and store instructions from stack used it.
+def addr :
+  ComplexPattern<iPTR, 2, "selectIntAddr", [frameindex]>;
+
+def addrRegImm :
+  ComplexPattern<iPTR, 2, "selectAddrRegImm", [frameindex]>;
+
+def addrDefault :
+  ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+// Arithmetic and logical instructions with 3 register operands.
+class ArithLogicR<string opstr, RegisterOperand RO, bit isComm = 0,
+                  InstrItinClass Itin = NoItinerary,
+                  SDPatternOperator OpNode = null_frag>:
+  InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set RO:$rd, (OpNode RO:$rs, RO:$rt))], Itin, FrmR> {
+  let isCommutable = isComm;
+  let isReMaterializable = 1;
+  string BaseOpcode;
+  string Arch;
+}
+
+// Arithmetic and logical instructions with 2 register operands.
+class ArithLogicI<string opstr, Operand Od, RegisterOperand RO,
+                  SDPatternOperator imm_type = null_frag,
+                  SDPatternOperator OpNode = null_frag> :
+  InstSE<(outs RO:$rt), (ins RO:$rs, Od:$imm16),
+         !strconcat(opstr, "\t$rt, $rs, $imm16"),
+         [(set RO:$rt, (OpNode RO:$rs, imm_type:$imm16))], IIAlu, FrmI> {
+  let isReMaterializable = 1;
+}
+
+// Arithmetic Multiply ADD/SUB
+class MArithR<string opstr, bit isComm = 0> :
+  InstSE<(outs), (ins CPURegsOpnd:$rs, CPURegsOpnd:$rt),
+         !strconcat(opstr, "\t$rs, $rt"), [], IIImul, FrmR> {
+  let Defs = [HI, LO];
+  let Uses = [HI, LO];
+  let isCommutable = isComm;
+}
+
+//  Logical
+class LogicNOR<string opstr, RegisterOperand RC>:
+  InstSE<(outs RC:$rd), (ins RC:$rs, RC:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set RC:$rd, (not (or RC:$rs, RC:$rt)))], IIAlu, FrmR> {
+  let isCommutable = 1;
+}
+
+// Shifts
+class shift_rotate_imm<string opstr, Operand ImmOpnd,
+                       RegisterOperand RC, SDPatternOperator OpNode = null_frag,
+                       SDPatternOperator PF = null_frag> :
+  InstSE<(outs RC:$rd), (ins RC:$rt, ImmOpnd:$shamt),
+         !strconcat(opstr, "\t$rd, $rt, $shamt"),
+         [(set RC:$rd, (OpNode RC:$rt, PF:$shamt))], IIAlu, FrmR>;
+
+class shift_rotate_reg<string opstr, RegisterOperand RC,
+                       SDPatternOperator OpNode = null_frag>:
+  InstSE<(outs RC:$rd), (ins CPURegsOpnd:$rs, RC:$rt),
+         !strconcat(opstr, "\t$rd, $rt, $rs"),
+         [(set RC:$rd, (OpNode RC:$rt, CPURegsOpnd:$rs))], IIAlu, FrmR>;
+
+// Load Upper Imediate
+class LoadUpper<string opstr, RegisterClass RC, Operand Imm>:
+  InstSE<(outs RC:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
+         [], IIAlu, FrmI>, IsAsCheapAsAMove {
+  let neverHasSideEffects = 1;
+  let isReMaterializable = 1;
+}
+
+class FMem<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+          InstrItinClass itin>: FFI<op, outs, ins, asmstr, pattern> {
+  bits<21> addr;
+  let Inst{25-21} = addr{20-16};
+  let Inst{15-0}  = addr{15-0};
+  let DecoderMethod = "DecodeMem";
+}
+
+// Memory Load/Store
+class Load<string opstr, SDPatternOperator OpNode, RegisterClass RC,
+           Operand MemOpnd, ComplexPattern Addr> :
+  InstSE<(outs RC:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, (OpNode Addr:$addr))], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  let canFoldAsLoad = 1;
+  let mayLoad = 1;
+}
+
+class Store<string opstr, SDPatternOperator OpNode, RegisterClass RC,
+            Operand MemOpnd, ComplexPattern Addr> :
+  InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RC:$rt, Addr:$addr)], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  let mayStore = 1;
+}
+
+multiclass LoadM<string opstr, RegisterClass RC,
+                 SDPatternOperator OpNode = null_frag,
+                 ComplexPattern Addr = addr> {
+  def NAME : Load<opstr, OpNode, RC, mem, Addr>, Requires<[NotN64, HasStdEnc]>;
+  def _P8  : Load<opstr, OpNode, RC, mem64, Addr>,
+             Requires<[IsN64, HasStdEnc]> {
+    let DecoderNamespace = "Mips64";
+    let isCodeGenOnly = 1;
+  }
+}
+
+multiclass StoreM<string opstr, RegisterClass RC,
+                  SDPatternOperator OpNode = null_frag,
+                  ComplexPattern Addr = addr> {
+  def NAME : Store<opstr, OpNode, RC, mem, Addr>, Requires<[NotN64, HasStdEnc]>;
+  def _P8  : Store<opstr, OpNode, RC, mem64, Addr>,
+             Requires<[IsN64, HasStdEnc]> {
+    let DecoderNamespace = "Mips64";
+    let isCodeGenOnly = 1;
+  }
+}
+
+// Load/Store Left/Right
+let canFoldAsLoad = 1 in
+class LoadLeftRight<string opstr, SDNode OpNode, RegisterClass RC,
+                    Operand MemOpnd> :
+  InstSE<(outs RC:$rt), (ins MemOpnd:$addr, RC:$src),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, (OpNode addr:$addr, RC:$src))], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  string Constraints = "$src = $rt";
+}
+
+class StoreLeftRight<string opstr, SDNode OpNode, RegisterClass RC,
+                     Operand MemOpnd>:
+  InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RC:$rt, addr:$addr)], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMem";
+}
+
+multiclass LoadLeftRightM<string opstr, SDNode OpNode, RegisterClass RC> {
+  def NAME : LoadLeftRight<opstr, OpNode, RC, mem>,
+             Requires<[NotN64, HasStdEnc]>;
+  def _P8  : LoadLeftRight<opstr, OpNode, RC, mem64>,
+             Requires<[IsN64, HasStdEnc]> {
+    let DecoderNamespace = "Mips64";
+    let isCodeGenOnly = 1;
+  }
+}
+
+multiclass StoreLeftRightM<string opstr, SDNode OpNode, RegisterClass RC> {
+  def NAME : StoreLeftRight<opstr, OpNode, RC, mem>,
+             Requires<[NotN64, HasStdEnc]>;
+  def _P8  : StoreLeftRight<opstr, OpNode, RC, mem64>,
+             Requires<[IsN64, HasStdEnc]> {
+    let DecoderNamespace = "Mips64";
+    let isCodeGenOnly = 1;
+  }
+}
+
+// Conditional Branch
+class CBranch<string opstr, PatFrag cond_op, RegisterClass RC> :
+  InstSE<(outs), (ins RC:$rs, RC:$rt, brtarget:$offset),
+         !strconcat(opstr, "\t$rs, $rt, $offset"),
+         [(brcond (i32 (cond_op RC:$rs, RC:$rt)), bb:$offset)], IIBranch,
+         FrmI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+}
+
+class CBranchZero<string opstr, PatFrag cond_op, RegisterClass RC> :
+  InstSE<(outs), (ins RC:$rs, brtarget:$offset),
+         !strconcat(opstr, "\t$rs, $offset"),
+         [(brcond (i32 (cond_op RC:$rs, 0)), bb:$offset)], IIBranch, FrmI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let Defs = [AT];
+}
+
+// SetCC
+class SetCC_R<string opstr, PatFrag cond_op, RegisterClass RC> :
+  InstSE<(outs CPURegsOpnd:$rd), (ins RC:$rs, RC:$rt),
+         !strconcat(opstr, "\t$rd, $rs, $rt"),
+         [(set CPURegsOpnd:$rd, (cond_op RC:$rs, RC:$rt))], IIAlu, FrmR>;
+
+class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type,
+              RegisterClass RC>:
+  InstSE<(outs CPURegsOpnd:$rt), (ins RC:$rs, Od:$imm16),
+         !strconcat(opstr, "\t$rt, $rs, $imm16"),
+         [(set CPURegsOpnd:$rt, (cond_op RC:$rs, imm_type:$imm16))],
+         IIAlu, FrmI>;
+
+// Jump
+class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
+             SDPatternOperator targetoperator> :
+  InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+         [(operator targetoperator:$target)], IIBranch, FrmJ> {
+  let isTerminator=1;
+  let isBarrier=1;
+  let hasDelaySlot = 1;
+  let DecoderMethod = "DecodeJumpTarget";
+  let Defs = [AT];
+}
+
+// Unconditional branch
+class UncondBranch<string opstr> :
+  InstSE<(outs), (ins brtarget:$offset), !strconcat(opstr, "\t$offset"),
+         [(br bb:$offset)], IIBranch, FrmI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let Predicates = [RelocPIC, HasStdEnc];
+  let Defs = [AT];
+}
+
+// Base class for indirect branch and return instruction classes.
+let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
+class JumpFR<RegisterClass RC, SDPatternOperator operator = null_frag>:
+  InstSE<(outs), (ins RC:$rs), "jr\t$rs", [(operator RC:$rs)], IIBranch, FrmR>;
+
+// Indirect branch
+class IndirectBranch<RegisterClass RC>: JumpFR<RC, brind> {
+  let isBranch = 1;
+  let isIndirectBranch = 1;
+}
+
+// Return instruction
+class RetBase<RegisterClass RC>: JumpFR<RC> {
+  let isReturn = 1;
+  let isCodeGenOnly = 1;
+  let hasCtrlDep = 1;
+  let hasExtraSrcRegAllocReq = 1;
+}
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=1, Defs = [RA] in {
+  class JumpLink<string opstr> :
+    InstSE<(outs), (ins calltarget:$target), !strconcat(opstr, "\t$target"),
+           [(MipsJmpLink imm:$target)], IIBranch, FrmJ> {
+    let DecoderMethod = "DecodeJumpTarget";
+  }
+
+  class JumpLinkRegPseudo<RegisterClass RC, Instruction JALRInst,
+                          Register RetReg>:
+    PseudoSE<(outs), (ins RC:$rs), [(MipsJmpLink RC:$rs)], IIBranch>,
+    PseudoInstExpansion<(JALRInst RetReg, RC:$rs)>;
+
+  class JumpLinkReg<string opstr, RegisterClass RC>:
+    InstSE<(outs RC:$rd), (ins RC:$rs), !strconcat(opstr, "\t$rd, $rs"),
+           [], IIBranch, FrmR>;
+
+  class BGEZAL_FT<string opstr, RegisterOperand RO> :
+    InstSE<(outs), (ins RO:$rs, brtarget:$offset),
+           !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI>;
+
+}
+
+class BAL_FT :
+  InstSE<(outs), (ins brtarget:$offset), "bal\t$offset", [], IIBranch, FrmI> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let hasDelaySlot = 1;
+  let Defs = [RA];
+}
+
+// Sync
+let hasSideEffects = 1 in
+class SYNC_FT :
+  InstSE<(outs), (ins i32imm:$stype), "sync $stype", [(MipsSync imm:$stype)],
+         NoItinerary, FrmOther>;
+
+// Mul, Div
+class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
+           list<Register> DefRegs> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$rs, $rt"), [],
+         itin, FrmR> {
+  let isCommutable = 1;
+  let Defs = DefRegs;
+  let neverHasSideEffects = 1;
+}
+
+// Pseudo multiply/divide instruction with explicit accumulator register
+// operands.
+class MultDivPseudo<Instruction RealInst, RegisterClass R0, RegisterOperand R1,
+                    SDPatternOperator OpNode, InstrItinClass Itin,
+                    bit IsComm = 1, bit HasSideEffects = 0> :
+  PseudoSE<(outs R0:$ac), (ins R1:$rs, R1:$rt),
+           [(set R0:$ac, (OpNode R1:$rs, R1:$rt))], Itin>,
+  PseudoInstExpansion<(RealInst R1:$rs, R1:$rt)> {
+  let isCommutable = IsComm;
+  let hasSideEffects = HasSideEffects;
+}
+
+// Pseudo multiply add/sub instruction with explicit accumulator register
+// operands.
+class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode>
+  : PseudoSE<(outs ACRegs:$ac),
+             (ins CPURegsOpnd:$rs, CPURegsOpnd:$rt, ACRegs:$acin),
+             [(set ACRegs:$ac,
+              (OpNode CPURegsOpnd:$rs, CPURegsOpnd:$rt, ACRegs:$acin))],
+             IIImul>,
+    PseudoInstExpansion<(RealInst CPURegsOpnd:$rs, CPURegsOpnd:$rt)> {
+  string Constraints = "$acin = $ac";
+}
+
+class Div<string opstr, InstrItinClass itin, RegisterOperand RO,
+          list<Register> DefRegs> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$$zero, $rs, $rt"),
+         [], itin, FrmR> {
+  let Defs = DefRegs;
+}
+
+// Move from Hi/Lo
+class MoveFromLOHI<string opstr, RegisterClass RC, list<Register> UseRegs>:
+  InstSE<(outs RC:$rd), (ins), !strconcat(opstr, "\t$rd"), [], IIHiLo, FrmR> {
+  let Uses = UseRegs;
+  let neverHasSideEffects = 1;
+}
+
+class MoveToLOHI<string opstr, RegisterClass RC, list<Register> DefRegs>:
+  InstSE<(outs), (ins RC:$rs), !strconcat(opstr, "\t$rs"), [], IIHiLo, FrmR> {
+  let Defs = DefRegs;
+  let neverHasSideEffects = 1;
+}
+
+class EffectiveAddress<string opstr, RegisterClass RC, Operand Mem> :
+  InstSE<(outs RC:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RC:$rt, addr:$addr)], NoItinerary, FrmI> {
+  let isCodeGenOnly = 1;
+  let DecoderMethod = "DecodeMem";
+}
+
+// Count Leading Ones/Zeros in Word
+class CountLeading0<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+         [(set RO:$rd, (ctlz RO:$rs))], IIAlu, FrmR>,
+  Requires<[HasBitCount, HasStdEnc]>;
+
+class CountLeading1<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+         [(set RO:$rd, (ctlz (not RO:$rs)))], IIAlu, FrmR>,
+  Requires<[HasBitCount, HasStdEnc]>;
+
+
+// Sign Extend in Register.
+class SignExtInReg<string opstr, ValueType vt, RegisterClass RC> :
+  InstSE<(outs RC:$rd), (ins RC:$rt), !strconcat(opstr, "\t$rd, $rt"),
+         [(set RC:$rd, (sext_inreg RC:$rt, vt))], NoItinerary, FrmR> {
+  let Predicates = [HasSEInReg, HasStdEnc];
+}
+
+// Subword Swap
+class SubwordSwap<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [],
+         NoItinerary, FrmR> {
+  let Predicates = [HasSwap, HasStdEnc];
+  let neverHasSideEffects = 1;
+}
+
+// Read Hardware
+class ReadHardware<RegisterClass CPURegClass, RegisterOperand RO> :
+  InstSE<(outs CPURegClass:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
+         IIAlu, FrmR>;
+
+// Ext and Ins
+class ExtBase<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ext:$size),
+         !strconcat(opstr, " $rt, $rs, $pos, $size"),
+         [(set RO:$rt, (MipsExt RO:$rs, imm:$pos, imm:$size))], NoItinerary,
+         FrmR> {
+  let Predicates = [HasMips32r2, HasStdEnc];
+}
+
+class InsBase<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ins:$size, RO:$src),
+         !strconcat(opstr, " $rt, $rs, $pos, $size"),
+         [(set RO:$rt, (MipsIns RO:$rs, imm:$pos, imm:$size, RO:$src))],
+         NoItinerary, FrmR> {
+  let Predicates = [HasMips32r2, HasStdEnc];
+  let Constraints = "$src = $rt";
+}
+
+// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
+class Atomic2Ops<PatFrag Op, RegisterClass DRC, RegisterClass PRC> :
+  PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$incr),
+           [(set DRC:$dst, (Op PRC:$ptr, DRC:$incr))]>;
+
+multiclass Atomic2Ops32<PatFrag Op> {
+  def NAME : Atomic2Ops<Op, CPURegs, CPURegs>, Requires<[NotN64, HasStdEnc]>;
+  def _P8  : Atomic2Ops<Op, CPURegs, CPU64Regs>,
+             Requires<[IsN64, HasStdEnc]> {
+    let DecoderNamespace = "Mips64";
+  }
+}
+
+// Atomic Compare & Swap.
+class AtomicCmpSwap<PatFrag Op, RegisterClass DRC, RegisterClass PRC> :
+  PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$cmp, DRC:$swap),
+           [(set DRC:$dst, (Op PRC:$ptr, DRC:$cmp, DRC:$swap))]>;
+
+multiclass AtomicCmpSwap32<PatFrag Op>  {
+  def NAME : AtomicCmpSwap<Op, CPURegs, CPURegs>,
+             Requires<[NotN64, HasStdEnc]>;
+  def _P8  : AtomicCmpSwap<Op, CPURegs, CPU64Regs>,
+             Requires<[IsN64, HasStdEnc]> {
+    let DecoderNamespace = "Mips64";
+  }
+}
+
+class LLBase<string opstr, RegisterOperand RO, Operand Mem> :
+  InstSE<(outs RO:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  let mayLoad = 1;
+}
+
+class SCBase<string opstr, RegisterOperand RO, Operand Mem> :
+  InstSE<(outs RO:$dst), (ins RO:$rt, Mem:$addr),
+         !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMem";
+  let mayStore = 1;
+  let Constraints = "$rt = $dst";
+}
+
+class MFC3OP<dag outs, dag ins, string asmstr> :
+  InstSE<outs, ins, asmstr, [], NoItinerary, FrmFR>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+// Return RA.
+let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in
+def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>;
+
+let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
+                                  [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                                  [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+  defm ATOMIC_LOAD_ADD_I8   : Atomic2Ops32<atomic_load_add_8>;
+  defm ATOMIC_LOAD_ADD_I16  : Atomic2Ops32<atomic_load_add_16>;
+  defm ATOMIC_LOAD_ADD_I32  : Atomic2Ops32<atomic_load_add_32>;
+  defm ATOMIC_LOAD_SUB_I8   : Atomic2Ops32<atomic_load_sub_8>;
+  defm ATOMIC_LOAD_SUB_I16  : Atomic2Ops32<atomic_load_sub_16>;
+  defm ATOMIC_LOAD_SUB_I32  : Atomic2Ops32<atomic_load_sub_32>;
+  defm ATOMIC_LOAD_AND_I8   : Atomic2Ops32<atomic_load_and_8>;
+  defm ATOMIC_LOAD_AND_I16  : Atomic2Ops32<atomic_load_and_16>;
+  defm ATOMIC_LOAD_AND_I32  : Atomic2Ops32<atomic_load_and_32>;
+  defm ATOMIC_LOAD_OR_I8    : Atomic2Ops32<atomic_load_or_8>;
+  defm ATOMIC_LOAD_OR_I16   : Atomic2Ops32<atomic_load_or_16>;
+  defm ATOMIC_LOAD_OR_I32   : Atomic2Ops32<atomic_load_or_32>;
+  defm ATOMIC_LOAD_XOR_I8   : Atomic2Ops32<atomic_load_xor_8>;
+  defm ATOMIC_LOAD_XOR_I16  : Atomic2Ops32<atomic_load_xor_16>;
+  defm ATOMIC_LOAD_XOR_I32  : Atomic2Ops32<atomic_load_xor_32>;
+  defm ATOMIC_LOAD_NAND_I8  : Atomic2Ops32<atomic_load_nand_8>;
+  defm ATOMIC_LOAD_NAND_I16 : Atomic2Ops32<atomic_load_nand_16>;
+  defm ATOMIC_LOAD_NAND_I32 : Atomic2Ops32<atomic_load_nand_32>;
+
+  defm ATOMIC_SWAP_I8       : Atomic2Ops32<atomic_swap_8>;
+  defm ATOMIC_SWAP_I16      : Atomic2Ops32<atomic_swap_16>;
+  defm ATOMIC_SWAP_I32      : Atomic2Ops32<atomic_swap_32>;
+
+  defm ATOMIC_CMP_SWAP_I8   : AtomicCmpSwap32<atomic_cmp_swap_8>;
+  defm ATOMIC_CMP_SWAP_I16  : AtomicCmpSwap32<atomic_cmp_swap_16>;
+  defm ATOMIC_CMP_SWAP_I32  : AtomicCmpSwap32<atomic_cmp_swap_32>;
+}
+
+/// Pseudo instructions for loading, storing and copying accumulator registers.
+let isPseudo = 1 in {
+  defm LOAD_AC64  : LoadM<"load_ac64", ACRegs>;
+  defm STORE_AC64 : StoreM<"store_ac64", ACRegs>;
+}
+
+def COPY_AC64 : PseudoSE<(outs ACRegs:$dst), (ins ACRegs:$src), []>;
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// MipsI Instructions
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+def ADDiu : ArithLogicI<"addiu", simm16, CPURegsOpnd, immSExt16, add>,
+            ADDI_FM<0x9>, IsAsCheapAsAMove;
+def ADDi  : ArithLogicI<"addi", simm16, CPURegsOpnd>, ADDI_FM<0x8>;
+def SLTi  : SetCC_I<"slti", setlt, simm16, immSExt16, CPURegs>, SLTI_FM<0xa>;
+def SLTiu : SetCC_I<"sltiu", setult, simm16, immSExt16, CPURegs>, SLTI_FM<0xb>;
+def ANDi  : ArithLogicI<"andi", uimm16, CPURegsOpnd, immZExt16, and>,
+            ADDI_FM<0xc>;
+def ORi   : ArithLogicI<"ori", uimm16, CPURegsOpnd, immZExt16, or>,
+            ADDI_FM<0xd>;
+def XORi  : ArithLogicI<"xori", uimm16, CPURegsOpnd, immZExt16, xor>,
+            ADDI_FM<0xe>;
+def LUi   : LoadUpper<"lui", CPURegs, uimm16>, LUI_FM;
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+def ADDu : ArithLogicR<"addu", CPURegsOpnd, 1, IIAlu, add>, ADD_FM<0, 0x21>;
+def SUBu : ArithLogicR<"subu", CPURegsOpnd, 0, IIAlu, sub>, ADD_FM<0, 0x23>;
+def MUL  : ArithLogicR<"mul", CPURegsOpnd, 1, IIImul, mul>, ADD_FM<0x1c, 2>;
+def ADD  : ArithLogicR<"add", CPURegsOpnd>, ADD_FM<0, 0x20>;
+def SUB  : ArithLogicR<"sub", CPURegsOpnd>, ADD_FM<0, 0x22>;
+def SLT  : SetCC_R<"slt", setlt, CPURegs>, ADD_FM<0, 0x2a>;
+def SLTu : SetCC_R<"sltu", setult, CPURegs>, ADD_FM<0, 0x2b>;
+def AND  : ArithLogicR<"and", CPURegsOpnd, 1, IIAlu, and>, ADD_FM<0, 0x24>;
+def OR   : ArithLogicR<"or", CPURegsOpnd, 1, IIAlu, or>, ADD_FM<0, 0x25>;
+def XOR  : ArithLogicR<"xor", CPURegsOpnd, 1, IIAlu, xor>, ADD_FM<0, 0x26>;
+def NOR  : LogicNOR<"nor", CPURegsOpnd>, ADD_FM<0, 0x27>;
+
+/// Shift Instructions
+def SLL  : shift_rotate_imm<"sll", shamt, CPURegsOpnd, shl, immZExt5>,
+           SRA_FM<0, 0>;
+def SRL  : shift_rotate_imm<"srl", shamt, CPURegsOpnd, srl, immZExt5>,
+           SRA_FM<2, 0>;
+def SRA  : shift_rotate_imm<"sra", shamt, CPURegsOpnd, sra, immZExt5>,
+           SRA_FM<3, 0>;
+def SLLV : shift_rotate_reg<"sllv", CPURegsOpnd, shl>, SRLV_FM<4, 0>;
+def SRLV : shift_rotate_reg<"srlv", CPURegsOpnd, srl>, SRLV_FM<6, 0>;
+def SRAV : shift_rotate_reg<"srav", CPURegsOpnd, sra>, SRLV_FM<7, 0>;
+
+// Rotate Instructions
+let Predicates = [HasMips32r2, HasStdEnc] in {
+  def ROTR  : shift_rotate_imm<"rotr", shamt, CPURegsOpnd, rotr, immZExt5>,
+              SRA_FM<2, 1>;
+  def ROTRV : shift_rotate_reg<"rotrv", CPURegsOpnd, rotr>, SRLV_FM<6, 1>;
+}
+
+/// Load and Store Instructions
+///  aligned
+defm LB  : LoadM<"lb", CPURegs, sextloadi8>, LW_FM<0x20>;
+defm LBu : LoadM<"lbu", CPURegs, zextloadi8, addrDefault>, LW_FM<0x24>;
+defm LH  : LoadM<"lh", CPURegs, sextloadi16, addrDefault>, LW_FM<0x21>;
+defm LHu : LoadM<"lhu", CPURegs, zextloadi16>, LW_FM<0x25>;
+defm LW  : LoadM<"lw", CPURegs, load, addrDefault>, LW_FM<0x23>;
+defm SB  : StoreM<"sb", CPURegs, truncstorei8>, LW_FM<0x28>;
+defm SH  : StoreM<"sh", CPURegs, truncstorei16>, LW_FM<0x29>;
+defm SW  : StoreM<"sw", CPURegs, store>, LW_FM<0x2b>;
+
+/// load/store left/right
+defm LWL : LoadLeftRightM<"lwl", MipsLWL, CPURegs>, LW_FM<0x22>;
+defm LWR : LoadLeftRightM<"lwr", MipsLWR, CPURegs>, LW_FM<0x26>;
+defm SWL : StoreLeftRightM<"swl", MipsSWL, CPURegs>, LW_FM<0x2a>;
+defm SWR : StoreLeftRightM<"swr", MipsSWR, CPURegs>, LW_FM<0x2e>;
+
+def SYNC : SYNC_FT, SYNC_FM;
+
+/// Load-linked, Store-conditional
+let Predicates = [NotN64, HasStdEnc] in {
+  def LL : LLBase<"ll", CPURegsOpnd, mem>, LW_FM<0x30>;
+  def SC : SCBase<"sc", CPURegsOpnd, mem>, LW_FM<0x38>;
+}
+
+let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in {
+  def LL_P8 : LLBase<"ll", CPURegsOpnd, mem64>, LW_FM<0x30>;
+  def SC_P8 : SCBase<"sc", CPURegsOpnd, mem64>, LW_FM<0x38>;
+}
+
+/// Jump and Branch Instructions
+def J       : JumpFJ<jmptarget, "j", br, bb>, FJ<2>,
+              Requires<[RelocStatic, HasStdEnc]>, IsBranch;
+def JR      : IndirectBranch<CPURegs>, MTLO_FM<8>;
+def B       : UncondBranch<"b">, B_FM;
+def BEQ     : CBranch<"beq", seteq, CPURegs>, BEQ_FM<4>;
+def BNE     : CBranch<"bne", setne, CPURegs>, BEQ_FM<5>;
+def BGEZ    : CBranchZero<"bgez", setge, CPURegs>, BGEZ_FM<1, 1>;
+def BGTZ    : CBranchZero<"bgtz", setgt, CPURegs>, BGEZ_FM<7, 0>;
+def BLEZ    : CBranchZero<"blez", setle, CPURegs>, BGEZ_FM<6, 0>;
+def BLTZ    : CBranchZero<"bltz", setlt, CPURegs>, BGEZ_FM<1, 0>;
+
+def BAL_BR: BAL_FT, BAL_FM;
+
+def JAL  : JumpLink<"jal">, FJ<3>;
+def JALR : JumpLinkReg<"jalr", CPURegs>, JALR_FM;
+def JALRPseudo : JumpLinkRegPseudo<CPURegs, JALR, RA>;
+def BGEZAL : BGEZAL_FT<"bgezal", CPURegsOpnd>, BGEZAL_FM<0x11>;
+def BLTZAL : BGEZAL_FT<"bltzal", CPURegsOpnd>, BGEZAL_FM<0x10>;
+def TAILCALL : JumpFJ<calltarget, "j", MipsTailCall, imm>, FJ<2>, IsTailCall;
+def TAILCALL_R : JumpFR<CPURegs, MipsTailCall>, MTLO_FM<8>, IsTailCall;
+
+def RET : RetBase<CPURegs>, MTLO_FM<8>;
+
+// Exception handling related node and instructions.
+// The conversion sequence is:
+// ISD::EH_RETURN -> MipsISD::EH_RETURN ->
+// MIPSeh_return -> (stack change + indirect branch)
+//
+// MIPSeh_return takes the place of regular return instruction
+// but takes two arguments (V1, V0) which are used for storing
+// the offset and return address respectively.
+def SDT_MipsEHRET : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>;
+
+def MIPSehret : SDNode<"MipsISD::EH_RETURN", SDT_MipsEHRET,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  def MIPSeh_return32 : MipsPseudo<(outs), (ins CPURegs:$spoff, CPURegs:$dst),
+                                [(MIPSehret CPURegs:$spoff, CPURegs:$dst)]>;
+  def MIPSeh_return64 : MipsPseudo<(outs), (ins CPU64Regs:$spoff,
+                                                CPU64Regs:$dst),
+                                [(MIPSehret CPU64Regs:$spoff, CPU64Regs:$dst)]>;
+}
+
+/// Multiply and Divide Instructions.
+def MULT  : Mult<"mult", IIImul, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x18>;
+def MULTu : Mult<"multu", IIImul, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x19>;
+def PseudoMULT  : MultDivPseudo<MULT, ACRegs, CPURegsOpnd, MipsMult, IIImul>;
+def PseudoMULTu : MultDivPseudo<MULTu, ACRegs, CPURegsOpnd, MipsMultu, IIImul>;
+def SDIV  : Div<"div", IIIdiv, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x1a>;
+def UDIV  : Div<"divu", IIIdiv, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x1b>;
+def PseudoSDIV : MultDivPseudo<SDIV, ACRegs, CPURegsOpnd, MipsDivRem, IIIdiv, 0>;
+def PseudoUDIV : MultDivPseudo<UDIV, ACRegs, CPURegsOpnd, MipsDivRemU, IIIdiv,
+                               0>;
+
+def MTHI : MoveToLOHI<"mthi", CPURegs, [HI]>, MTLO_FM<0x11>;
+def MTLO : MoveToLOHI<"mtlo", CPURegs, [LO]>, MTLO_FM<0x13>;
+def MFHI : MoveFromLOHI<"mfhi", CPURegs, [HI]>, MFLO_FM<0x10>;
+def MFLO : MoveFromLOHI<"mflo", CPURegs, [LO]>, MFLO_FM<0x12>;
+
+/// Sign Ext In Register Instructions.
+def SEB : SignExtInReg<"seb", i8, CPURegs>, SEB_FM<0x10, 0x20>;
+def SEH : SignExtInReg<"seh", i16, CPURegs>, SEB_FM<0x18, 0x20>;
+
+/// Count Leading
+def CLZ : CountLeading0<"clz", CPURegsOpnd>, CLO_FM<0x20>;
+def CLO : CountLeading1<"clo", CPURegsOpnd>, CLO_FM<0x21>;
+
+/// Word Swap Bytes Within Halfwords
+def WSBH : SubwordSwap<"wsbh", CPURegsOpnd>, SEB_FM<2, 0x20>;
+
+/// No operation.
+def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
+
+// FrameIndexes are legalized when they are operands from load/store
+// instructions. The same not happens for stack address copies, so an
+// add op with mem ComplexPattern is used and the stack address copy
+// can be matched. It's similar to Sparc LEA_ADDRi
+def LEA_ADDiu : EffectiveAddress<"addiu", CPURegs, mem_ea>, LW_FM<9>;
+
+// MADD*/MSUB*
+def MADD  : MArithR<"madd", 1>, MULT_FM<0x1c, 0>;
+def MADDU : MArithR<"maddu", 1>, MULT_FM<0x1c, 1>;
+def MSUB  : MArithR<"msub">, MULT_FM<0x1c, 4>;
+def MSUBU : MArithR<"msubu">, MULT_FM<0x1c, 5>;
+def PseudoMADD  : MAddSubPseudo<MADD, MipsMAdd>;
+def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu>;
+def PseudoMSUB  : MAddSubPseudo<MSUB, MipsMSub>;
+def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu>;
+
+def RDHWR : ReadHardware<CPURegs, HWRegsOpnd>, RDHWR_FM;
+
+def EXT : ExtBase<"ext", CPURegsOpnd>, EXT_FM<0>;
+def INS : InsBase<"ins", CPURegsOpnd>, EXT_FM<4>;
+
+/// Move Control Registers From/To CPU Registers
+def MFC0_3OP : MFC3OP<(outs CPURegsOpnd:$rt),
+                      (ins CPURegsOpnd:$rd, uimm16:$sel),
+                      "mfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 0>;
+
+def MTC0_3OP : MFC3OP<(outs CPURegsOpnd:$rd, uimm16:$sel),
+                      (ins CPURegsOpnd:$rt),
+                      "mtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 4>;
+
+def MFC2_3OP : MFC3OP<(outs CPURegsOpnd:$rt),
+                      (ins CPURegsOpnd:$rd, uimm16:$sel),
+                      "mfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 0>;
+
+def MTC2_3OP : MFC3OP<(outs CPURegsOpnd:$rd, uimm16:$sel),
+                      (ins CPURegsOpnd:$rt),
+                      "mtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 4>;
+
+//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : InstAlias<"move $dst, $src",
+                (ADDu CPURegsOpnd:$dst, CPURegsOpnd:$src,ZERO), 1>,
+      Requires<[NotMips64]>;
+def : InstAlias<"move $dst, $src",
+                (OR CPURegsOpnd:$dst, CPURegsOpnd:$src,ZERO), 1>,
+      Requires<[NotMips64]>;
+def : InstAlias<"bal $offset", (BGEZAL RA, brtarget:$offset), 1>;
+def : InstAlias<"addu $rs, $rt, $imm",
+                (ADDiu CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>;
+def : InstAlias<"add $rs, $rt, $imm",
+                (ADDi CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>;
+def : InstAlias<"and $rs, $rt, $imm",
+                (ANDi CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>;
+def : InstAlias<"j $rs", (JR CPURegs:$rs), 0>,
+      Requires<[NotMips64]>;
+def : InstAlias<"jalr $rs", (JALR RA, CPURegs:$rs)>, Requires<[NotMips64]>;
+def : InstAlias<"jal $rs", (JALR RA, CPURegs:$rs), 0>, Requires<[NotMips64]>;
+def : InstAlias<"jal $rd,$rs", (JALR CPURegs:$rd, CPURegs:$rs), 0>,
+                 Requires<[NotMips64]>;
+def : InstAlias<"not $rt, $rs",
+                (NOR CPURegsOpnd:$rt, CPURegsOpnd:$rs, ZERO), 1>;
+def : InstAlias<"neg $rt, $rs",
+                (SUB CPURegsOpnd:$rt, ZERO, CPURegsOpnd:$rs), 1>;
+def : InstAlias<"negu $rt, $rs",
+                (SUBu CPURegsOpnd:$rt, ZERO, CPURegsOpnd:$rs), 1>;
+def : InstAlias<"slt $rs, $rt, $imm",
+                (SLTi CPURegsOpnd:$rs, CPURegs:$rt, simm16:$imm), 0>;
+def : InstAlias<"xor $rs, $rt, $imm",
+                (XORi CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>,
+      Requires<[NotMips64]>;
+def : InstAlias<"or $rs, $rt, $imm",
+                (ORi CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>,
+                 Requires<[NotMips64]>;
+def : InstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
+def : InstAlias<"mfc0 $rt, $rd",
+                (MFC0_3OP CPURegsOpnd:$rt, CPURegsOpnd:$rd, 0), 0>;
+def : InstAlias<"mtc0 $rt, $rd",
+                (MTC0_3OP CPURegsOpnd:$rd, 0, CPURegsOpnd:$rt), 0>;
+def : InstAlias<"mfc2 $rt, $rd",
+                (MFC2_3OP CPURegsOpnd:$rt, CPURegsOpnd:$rd, 0), 0>;
+def : InstAlias<"mtc2 $rt, $rd",
+                (MTC2_3OP CPURegsOpnd:$rd, 0, CPURegsOpnd:$rt), 0>;
+
+//===----------------------------------------------------------------------===//
+// Assembler Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+class LoadImm32< string instr_asm, Operand Od, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
+                     !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadImm32Reg : LoadImm32<"li", shamt,CPURegsOpnd>;
+
+class LoadAddress<string instr_asm, Operand MemOpnd, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins MemOpnd:$addr),
+                     !strconcat(instr_asm, "\t$rt, $addr")> ;
+def LoadAddr32Reg : LoadAddress<"la", mem, CPURegsOpnd>;
+
+class LoadAddressImm<string instr_asm, Operand Od, RegisterOperand RO> :
+  MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
+                     !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadAddr32Imm : LoadAddressImm<"la", shamt,CPURegsOpnd>;
+
+
+
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Load/store pattern templates.
+class LoadRegImmPat<Instruction LoadInst, ValueType ValTy, PatFrag Node> :
+  MipsPat<(ValTy (Node addrRegImm:$a)), (LoadInst addrRegImm:$a)>;
+
+class StoreRegImmPat<Instruction StoreInst, ValueType ValTy> :
+  MipsPat<(store ValTy:$v, addrRegImm:$a), (StoreInst ValTy:$v, addrRegImm:$a)>;
+
+// Small immediates
+def : MipsPat<(i32 immSExt16:$in),
+              (ADDiu ZERO, imm:$in)>;
+def : MipsPat<(i32 immZExt16:$in),
+              (ORi ZERO, imm:$in)>;
+def : MipsPat<(i32 immLow16Zero:$in),
+              (LUi (HI16 imm:$in))>;
+
+// Arbitrary immediates
+def : MipsPat<(i32 imm:$imm),
+          (ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Carry MipsPatterns
+def : MipsPat<(subc CPURegs:$lhs, CPURegs:$rhs),
+              (SUBu CPURegs:$lhs, CPURegs:$rhs)>;
+def : MipsPat<(addc CPURegs:$lhs, CPURegs:$rhs),
+              (ADDu CPURegs:$lhs, CPURegs:$rhs)>;
+def : MipsPat<(addc  CPURegs:$src, immSExt16:$imm),
+              (ADDiu CPURegs:$src, imm:$imm)>;
+
+// Call
+def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+              (JAL tglobaladdr:$dst)>;
+def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
+              (JAL texternalsym:$dst)>;
+//def : MipsPat<(MipsJmpLink CPURegs:$dst),
+//              (JALR CPURegs:$dst)>;
+
+// Tail call
+def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+              (TAILCALL tglobaladdr:$dst)>;
+def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+              (TAILCALL texternalsym:$dst)>;
+// hi/lo relocs
+def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
+def : MipsPat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
+def : MipsPat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
+def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsHi texternalsym:$in), (LUi texternalsym:$in)>;
+
+def : MipsPat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
+def : MipsPat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
+def : MipsPat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
+def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
+def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
+def : MipsPat<(MipsLo texternalsym:$in), (ADDiu ZERO, texternalsym:$in)>;
+
+def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)),
+              (ADDiu CPURegs:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)),
+              (ADDiu CPURegs:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)),
+              (ADDiu CPURegs:$hi, tjumptable:$lo)>;
+def : MipsPat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)),
+              (ADDiu CPURegs:$hi, tconstpool:$lo)>;
+def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaltlsaddr:$lo)),
+              (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>;
+
+// gp_rel relocs
+def : MipsPat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)),
+              (ADDiu CPURegs:$gp, tglobaladdr:$in)>;
+def : MipsPat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)),
+              (ADDiu CPURegs:$gp, tconstpool:$in)>;
+
+// wrapper_pic
+class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+      MipsPat<(MipsWrapper RC:$gp, node:$in),
+              (ADDiuOp RC:$gp, node:$in)>;
+
+def : WrapperPat<tglobaladdr, ADDiu, CPURegs>;
+def : WrapperPat<tconstpool, ADDiu, CPURegs>;
+def : WrapperPat<texternalsym, ADDiu, CPURegs>;
+def : WrapperPat<tblockaddress, ADDiu, CPURegs>;
+def : WrapperPat<tjumptable, ADDiu, CPURegs>;
+def : WrapperPat<tglobaltlsaddr, ADDiu, CPURegs>;
+
+// Mips does not have "not", so we expand our way
+def : MipsPat<(not CPURegs:$in),
+              (NOR CPURegsOpnd:$in, ZERO)>;
+
+// extended loads
+let Predicates = [NotN64, HasStdEnc] in {
+  def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu addr:$src)>;
+  def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu addr:$src)>;
+  def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
+}
+let Predicates = [IsN64, HasStdEnc] in {
+  def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu_P8 addr:$src)>;
+  def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu_P8 addr:$src)>;
+  def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu_P8 addr:$src)>;
+}
+
+// peepholes
+let Predicates = [NotN64, HasStdEnc] in {
+  def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
+}
+let Predicates = [IsN64, HasStdEnc] in {
+  def : MipsPat<(store (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
+}
+
+// brcond patterns
+multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp,
+                      Instruction SLTOp, Instruction SLTuOp, Instruction SLTiOp,
+                      Instruction SLTiuOp, Register ZEROReg> {
+def : MipsPat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst),
+              (BNEOp RC:$lhs, ZEROReg, bb:$dst)>;
+def : MipsPat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst),
+              (BEQOp RC:$lhs, ZEROReg, bb:$dst)>;
+
+def : MipsPat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQ (SLTOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQ (SLTuOp RC:$lhs, RC:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+              (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+              (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+
+def : MipsPat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst),
+              (BEQ (SLTuOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
+
+def : MipsPat<(brcond RC:$cond, bb:$dst),
+              (BNEOp RC:$cond, ZEROReg, bb:$dst)>;
+}
+
+defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
+
+// setcc patterns
+multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
+                     Instruction SLTuOp, Register ZEROReg> {
+  def : MipsPat<(seteq RC:$lhs, RC:$rhs),
+                (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
+  def : MipsPat<(setne RC:$lhs, RC:$rhs),
+                (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>;
+}
+
+multiclass SetlePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : MipsPat<(setle RC:$lhs, RC:$rhs),
+                (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>;
+  def : MipsPat<(setule RC:$lhs, RC:$rhs),
+                (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>;
+}
+
+multiclass SetgtPats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : MipsPat<(setgt RC:$lhs, RC:$rhs),
+                (SLTOp RC:$rhs, RC:$lhs)>;
+  def : MipsPat<(setugt RC:$lhs, RC:$rhs),
+                (SLTuOp RC:$rhs, RC:$lhs)>;
+}
+
+multiclass SetgePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+  def : MipsPat<(setge RC:$lhs, RC:$rhs),
+                (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>;
+  def : MipsPat<(setuge RC:$lhs, RC:$rhs),
+                (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>;
+}
+
+multiclass SetgeImmPats<RegisterClass RC, Instruction SLTiOp,
+                        Instruction SLTiuOp> {
+  def : MipsPat<(setge RC:$lhs, immSExt16:$rhs),
+                (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>;
+  def : MipsPat<(setuge RC:$lhs, immSExt16:$rhs),
+                (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>;
+}
+
+defm : SeteqPats<CPURegs, SLTiu, XOR, SLTu, ZERO>;
+defm : SetlePats<CPURegs, SLT, SLTu>;
+defm : SetgtPats<CPURegs, SLT, SLTu>;
+defm : SetgePats<CPURegs, SLT, SLTu>;
+defm : SetgeImmPats<CPURegs, SLTi, SLTiu>;
+
+// bswap pattern
+def : MipsPat<(bswap CPURegs:$rt), (ROTR (WSBH CPURegs:$rt), 16)>;
+
+// mflo/hi patterns.
+def : MipsPat<(i32 (ExtractLOHI ACRegs:$ac, imm:$lohi_idx)),
+              (EXTRACT_SUBREG ACRegs:$ac, imm:$lohi_idx)>;
+
+// Load halfword/word patterns.
+let AddedComplexity = 40 in {
+  let Predicates = [NotN64, HasStdEnc] in {
+    def : LoadRegImmPat<LBu, i32, zextloadi8>;
+    def : LoadRegImmPat<LH, i32, sextloadi16>;
+    def : LoadRegImmPat<LW, i32, load>;
+  }
+  let Predicates = [IsN64, HasStdEnc] in {
+    def : LoadRegImmPat<LBu_P8, i32, zextloadi8>;
+    def : LoadRegImmPat<LH_P8, i32, sextloadi16>;
+    def : LoadRegImmPat<LW_P8, i32, load>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFPU.td"
+include "Mips64InstrInfo.td"
+include "MipsCondMov.td"
+
+//
+// Mips16
+
+include "Mips16InstrFormats.td"
+include "Mips16InstrInfo.td"
+
+// DSP
+include "MipsDSPInstrFormats.td"
+include "MipsDSPInstrInfo.td"
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsJITInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsJITInfo.cpp
new file mode 100644
index 000000000000..1b2a325d3ce6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsJITInfo.cpp
@@ -0,0 +1,285 @@
+//===-- MipsJITInfo.cpp - Implement the Mips JIT Interface ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Mips target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "MipsJITInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsRelocations.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+
+void MipsJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  unsigned NewAddr = (intptr_t)New;
+  unsigned OldAddr = (intptr_t)Old;
+  const unsigned NopInstr = 0x0;
+
+  // If the functions are in the same memory segment, insert PC-region branch.
+  if ((NewAddr & 0xF0000000) == ((OldAddr + 4) & 0xF0000000)) {
+    unsigned *OldInstruction = (unsigned *)Old;
+    *OldInstruction = 0x08000000;
+    unsigned JTargetAddr = NewAddr & 0x0FFFFFFC;
+
+    JTargetAddr >>= 2;
+    *OldInstruction |= JTargetAddr;
+
+    // Insert a NOP.
+    OldInstruction++;
+    *OldInstruction = NopInstr;
+
+    sys::Memory::InvalidateInstructionCache(Old, 2 * 4);
+  } else {
+    // We need to clear hint bits from the instruction, in case it is 'jr ra'.
+    const unsigned HintMask = 0xFFFFF83F, ReturnSequence = 0x03e00008;
+    unsigned* CurrentInstr = (unsigned*)Old;
+    unsigned CurrInstrHintClear = (*CurrentInstr) & HintMask;
+    unsigned* NextInstr = CurrentInstr + 1;
+    unsigned NextInstrHintClear = (*NextInstr) & HintMask;
+
+    // Do absolute jump if there are 2 or more instructions before return from
+    // the old function.
+    if ((CurrInstrHintClear != ReturnSequence) &&
+        (NextInstrHintClear != ReturnSequence)) {
+      const unsigned LuiT0Instr = 0x3c080000, AddiuT0Instr = 0x25080000;
+      const unsigned JrT0Instr = 0x01000008;
+      // lui  t0,  high 16 bit of the NewAddr
+      (*(CurrentInstr++)) = LuiT0Instr | ((NewAddr & 0xffff0000) >> 16);
+      // addiu  t0, t0, low 16 bit of the NewAddr
+      (*(CurrentInstr++)) = AddiuT0Instr | (NewAddr & 0x0000ffff);
+      // jr t0
+      (*(CurrentInstr++)) = JrT0Instr;
+      (*CurrentInstr) = NopInstr;
+
+      sys::Memory::InvalidateInstructionCache(Old, 4 * 4);
+    } else {
+      // Unsupported case
+      report_fatal_error("MipsJITInfo::replaceMachineCodeForFunction");
+    }
+  }
+}
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring. This code saves registers, calls
+// MipsCompilationCallbackC and restores registers.
+extern "C" {
+#if defined (__mips__)
+void MipsCompilationCallback();
+
+  asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl " ASMPREFIX "MipsCompilationCallback\n"
+    ASMPREFIX "MipsCompilationCallback:\n"
+    ".ent " ASMPREFIX "MipsCompilationCallback\n"
+    ".frame  $sp, 32, $ra\n"
+    ".set  noreorder\n"
+    ".cpload $t9\n"
+
+    "addiu $sp, $sp, -64\n"
+    ".cprestore 16\n"
+
+    // Save argument registers a0, a1, a2, a3, f12, f14 since they may contain
+    // stuff for the real target function right now. We have to act as if this
+    // whole compilation callback doesn't exist as far as the caller is
+    // concerned. We also need to save the ra register since it contains the
+    // original return address, and t8 register since it contains the address
+    // of the end of function stub.
+    "sw $a0, 20($sp)\n"
+    "sw $a1, 24($sp)\n"
+    "sw $a2, 28($sp)\n"
+    "sw $a3, 32($sp)\n"
+    "sw $ra, 36($sp)\n"
+    "sw $t8, 40($sp)\n"
+    "sdc1 $f12, 48($sp)\n"
+    "sdc1 $f14, 56($sp)\n"
+
+    // t8 points at the end of function stub. Pass the beginning of the stub
+    // to the MipsCompilationCallbackC.
+    "addiu $a0, $t8, -16\n"
+    "jal " ASMPREFIX "MipsCompilationCallbackC\n"
+    "nop\n"
+
+    // Restore registers.
+    "lw $a0, 20($sp)\n"
+    "lw $a1, 24($sp)\n"
+    "lw $a2, 28($sp)\n"
+    "lw $a3, 32($sp)\n"
+    "lw $ra, 36($sp)\n"
+    "lw $t8, 40($sp)\n"
+    "ldc1 $f12, 48($sp)\n"
+    "ldc1 $f14, 56($sp)\n"
+    "addiu $sp, $sp, 64\n"
+
+    // Jump to the (newly modified) stub to invoke the real function.
+    "addiu $t8, $t8, -16\n"
+    "jr $t8\n"
+    "nop\n"
+
+    ".set  reorder\n"
+    ".end " ASMPREFIX "MipsCompilationCallback\n"
+      );
+#else  // host != Mips
+  void MipsCompilationCallback() {
+    llvm_unreachable(
+      "Cannot call MipsCompilationCallback() on a non-Mips arch!");
+  }
+#endif
+}
+
+/// MipsCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
+/// it into the JIT compiler function.
+extern "C" void MipsCompilationCallbackC(intptr_t StubAddr) {
+  // Get the address of the compiled code for this function.
+  intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
+
+  // Rewrite the function stub so that we don't end up here every time we
+  // execute the call. We're replacing the first four instructions of the
+  // stub with code that jumps to the compiled function:
+  //   lui $t9, %hi(NewVal)
+  //   addiu $t9, $t9, %lo(NewVal)
+  //   jr $t9
+  //   nop
+
+  int Hi = ((unsigned)NewVal & 0xffff0000) >> 16;
+  if ((NewVal & 0x8000) != 0)
+    Hi++;
+  int Lo = (int)(NewVal & 0xffff);
+
+  *(intptr_t *)(StubAddr) = 0xf << 26 | 25 << 16 | Hi;
+  *(intptr_t *)(StubAddr + 4) = 9 << 26 | 25 << 21 | 25 << 16 | Lo;
+  *(intptr_t *)(StubAddr + 8) = 25 << 21 | 8;
+  *(intptr_t *)(StubAddr + 12) = 0;
+
+  sys::Memory::InvalidateInstructionCache((void*) StubAddr, 16);
+}
+
+TargetJITInfo::LazyResolverFn MipsJITInfo::getLazyResolverFunction(
+    JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return MipsCompilationCallback;
+}
+
+TargetJITInfo::StubLayout MipsJITInfo::getStubLayout() {
+  // The stub contains 4 4-byte instructions, aligned at 4 bytes. See
+  // emitFunctionStub for details.
+  StubLayout Result = { 4*4, 4 };
+  return Result;
+}
+
+void *MipsJITInfo::emitFunctionStub(const Function *F, void *Fn,
+                                    JITCodeEmitter &JCE) {
+  JCE.emitAlignment(4);
+  void *Addr = (void*) (JCE.getCurrentPCValue());
+  if (!sys::Memory::setRangeWritable(Addr, 16))
+    llvm_unreachable("ERROR: Unable to mark stub writable.");
+
+  intptr_t EmittedAddr;
+  if (Fn != (void*)(intptr_t)MipsCompilationCallback)
+    EmittedAddr = (intptr_t)Fn;
+  else
+    EmittedAddr = (intptr_t)MipsCompilationCallback;
+
+
+  int Hi = ((unsigned)EmittedAddr & 0xffff0000) >> 16;
+  if ((EmittedAddr & 0x8000) != 0)
+    Hi++;
+  int Lo = (int)(EmittedAddr & 0xffff);
+
+  // lui t9, %hi(EmittedAddr)
+  // addiu t9, t9, %lo(EmittedAddr)
+  // jalr t8, t9
+  // nop
+  if (IsLittleEndian) {
+    JCE.emitWordLE(0xf << 26 | 25 << 16 | Hi);
+    JCE.emitWordLE(9 << 26 | 25 << 21 | 25 << 16 | Lo);
+    JCE.emitWordLE(25 << 21 | 24 << 11 | 9);
+    JCE.emitWordLE(0);
+  } else {
+    JCE.emitWordBE(0xf << 26 | 25 << 16 | Hi);
+    JCE.emitWordBE(9 << 26 | 25 << 21 | 25 << 16 | Lo);
+    JCE.emitWordBE(25 << 21 | 24 << 11 | 9);
+    JCE.emitWordBE(0);
+  }
+
+  sys::Memory::InvalidateInstructionCache(Addr, 16);
+  if (!sys::Memory::setRangeExecutable(Addr, 16))
+    llvm_unreachable("ERROR: Unable to mark stub executable.");
+
+  return Addr;
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void MipsJITInfo::relocate(void *Function, MachineRelocation *MR,
+                           unsigned NumRelocs, unsigned char *GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+
+    void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
+
+    switch ((Mips::RelocationType) MR->getRelocationType()) {
+    case Mips::reloc_mips_pc16:
+      ResultPtr = (((ResultPtr - (intptr_t) RelocPos) - 4) >> 2) & 0xffff;
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    case Mips::reloc_mips_26:
+      ResultPtr = (ResultPtr & 0x0fffffff) >> 2;
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    case Mips::reloc_mips_hi:
+      ResultPtr = ResultPtr >> 16;
+      if ((((intptr_t) (MR->getResultPointer()) & 0xffff) >> 15) == 1) {
+        ResultPtr += 1;
+      }
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+
+    case Mips::reloc_mips_lo: {
+      // Addend is needed for unaligned load/store instructions, where offset
+      // for the second load/store in the expanded instruction sequence must
+      // be modified by +1 or +3. Otherwise, Addend is 0.
+      int Addend = *((unsigned*) RelocPos) & 0xffff;
+      ResultPtr = (ResultPtr + Addend) & 0xffff;
+      *((unsigned*) RelocPos) &= 0xffff0000;
+      *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+      break;
+    }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsJITInfo.h b/contrib/llvm/lib/Target/Mips/MipsJITInfo.h
new file mode 100644
index 000000000000..ecda3101a003
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsJITInfo.h
@@ -0,0 +1,71 @@
+//===- MipsJITInfo.h - Mips Implementation of the JIT Interface -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSJITINFO_H
+#define MIPSJITINFO_H
+
+#include "MipsMachineFunction.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+class MipsTargetMachine;
+
+class MipsJITInfo : public TargetJITInfo {
+
+  bool IsPIC;
+  bool IsLittleEndian;
+
+  public:
+    explicit MipsJITInfo() :
+      IsPIC(false), IsLittleEndian(true) {}
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+    // getStubLayout - Returns the size and alignment of the largest call stub
+    // on Mips.
+    virtual StubLayout getStubLayout();
+
+    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    virtual void *emitFunctionStub(const Function *F, void *Fn,
+                                   JITCodeEmitter &JCE);
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char *GOTBase);
+
+    /// Initialize - Initialize internal stage for the function being JITted.
+    void Initialize(const MachineFunction &MF, bool isPIC,
+                    bool isLittleEndian) {
+      IsPIC = isPIC;
+      IsLittleEndian = isLittleEndian;
+    }
+
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
new file mode 100644
index 000000000000..2efe534053a2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
@@ -0,0 +1,455 @@
+//===-- MipsLongBranch.cpp - Emit long branches ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands a branch or jump instruction into a long branch if its
+// offset is too large to fit into its immediate field.
+//
+// FIXME:
+// 1. Fix pc-region jump instructions which cross 256MB segment boundaries.
+// 2. If program has inline assembly statements whose size cannot be
+//    determined accurately, load branch target addresses from the GOT.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-long-branch"
+
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+STATISTIC(LongBranches, "Number of long branches.");
+
+static cl::opt<bool> SkipLongBranch(
+  "skip-mips-long-branch",
+  cl::init(false),
+  cl::desc("MIPS: Skip long branch pass."),
+  cl::Hidden);
+
+static cl::opt<bool> ForceLongBranch(
+  "force-mips-long-branch",
+  cl::init(false),
+  cl::desc("MIPS: Expand all branches to long format."),
+  cl::Hidden);
+
+namespace {
+  typedef MachineBasicBlock::iterator Iter;
+  typedef MachineBasicBlock::reverse_iterator ReverseIter;
+
+  struct MBBInfo {
+    uint64_t Size, Address;
+    bool HasLongBranch;
+    MachineInstr *Br;
+
+    MBBInfo() : Size(0), HasLongBranch(false), Br(0) {}
+  };
+
+  class MipsLongBranch : public MachineFunctionPass {
+
+  public:
+    static char ID;
+    MipsLongBranch(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm),
+        TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
+        LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 13 : 9)) {}
+
+    virtual const char *getPassName() const {
+      return "Mips Long Branch";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+
+  private:
+    void splitMBB(MachineBasicBlock *MBB);
+    void initMBBInfo();
+    int64_t computeOffset(const MachineInstr *Br);
+    void replaceBranch(MachineBasicBlock &MBB, Iter Br, DebugLoc DL,
+                       MachineBasicBlock *MBBOpnd);
+    void expandToLongBranch(MBBInfo &Info);
+
+    const TargetMachine &TM;
+    const MipsInstrInfo *TII;
+    MachineFunction *MF;
+    SmallVector<MBBInfo, 16> MBBInfos;
+    bool IsPIC;
+    unsigned ABI;
+    unsigned LongBranchSeqSize;
+  };
+
+  char MipsLongBranch::ID = 0;
+} // end of anonymous namespace
+
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
+  return new MipsLongBranch(tm);
+}
+
+/// Iterate over list of Br's operands and search for a MachineBasicBlock
+/// operand.
+static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
+  for (unsigned I = 0, E = Br.getDesc().getNumOperands(); I < E; ++I) {
+    const MachineOperand &MO = Br.getOperand(I);
+
+    if (MO.isMBB())
+      return MO.getMBB();
+  }
+
+  assert(false && "This instruction does not have an MBB operand.");
+  return 0;
+}
+
+// Traverse the list of instructions backwards until a non-debug instruction is
+// found or it reaches E.
+static ReverseIter getNonDebugInstr(ReverseIter B, ReverseIter E) {
+  for (; B != E; ++B)
+    if (!B->isDebugValue())
+      return B;
+
+  return E;
+}
+
+// Split MBB if it has two direct jumps/branches.
+void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) {
+  ReverseIter End = MBB->rend();
+  ReverseIter LastBr = getNonDebugInstr(MBB->rbegin(), End);
+
+  // Return if MBB has no branch instructions.
+  if ((LastBr == End) ||
+      (!LastBr->isConditionalBranch() && !LastBr->isUnconditionalBranch()))
+    return;
+
+  ReverseIter FirstBr = getNonDebugInstr(llvm::next(LastBr), End);
+
+  // MBB has only one branch instruction if FirstBr is not a branch
+  // instruction.
+  if ((FirstBr == End) ||
+      (!FirstBr->isConditionalBranch() && !FirstBr->isUnconditionalBranch()))
+    return;
+
+  assert(!FirstBr->isIndirectBranch() && "Unexpected indirect branch found.");
+
+  // Create a new MBB. Move instructions in MBB to the newly created MBB.
+  MachineBasicBlock *NewMBB =
+    MF->CreateMachineBasicBlock(MBB->getBasicBlock());
+
+  // Insert NewMBB and fix control flow.
+  MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
+  NewMBB->transferSuccessors(MBB);
+  NewMBB->removeSuccessor(Tgt);
+  MBB->addSuccessor(NewMBB);
+  MBB->addSuccessor(Tgt);
+  MF->insert(llvm::next(MachineFunction::iterator(MBB)), NewMBB);
+
+  NewMBB->splice(NewMBB->end(), MBB, (++LastBr).base(), MBB->end());
+}
+
+// Fill MBBInfos.
+void MipsLongBranch::initMBBInfo() {
+  // Split the MBBs if they have two branches. Each basic block should have at
+  // most one branch after this loop is executed.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;)
+    splitMBB(I++);
+
+  MF->RenumberBlocks();
+  MBBInfos.clear();
+  MBBInfos.resize(MF->size());
+
+  for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
+    MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+
+    // Compute size of MBB.
+    for (MachineBasicBlock::instr_iterator MI = MBB->instr_begin();
+         MI != MBB->instr_end(); ++MI)
+      MBBInfos[I].Size += TII->GetInstSizeInBytes(&*MI);
+
+    // Search for MBB's branch instruction.
+    ReverseIter End = MBB->rend();
+    ReverseIter Br = getNonDebugInstr(MBB->rbegin(), End);
+
+    if ((Br != End) && !Br->isIndirectBranch() &&
+        (Br->isConditionalBranch() ||
+         (Br->isUnconditionalBranch() &&
+          TM.getRelocationModel() == Reloc::PIC_)))
+      MBBInfos[I].Br = (++Br).base();
+  }
+}
+
+// Compute offset of branch in number of bytes.
+int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
+  int64_t Offset = 0;
+  int ThisMBB = Br->getParent()->getNumber();
+  int TargetMBB = getTargetMBB(*Br)->getNumber();
+
+  // Compute offset of a forward branch.
+  if (ThisMBB < TargetMBB) {
+    for (int N = ThisMBB + 1; N < TargetMBB; ++N)
+      Offset += MBBInfos[N].Size;
+
+    return Offset + 4;
+  }
+
+  // Compute offset of a backward branch.
+  for (int N = ThisMBB; N >= TargetMBB; --N)
+    Offset += MBBInfos[N].Size;
+
+  return -Offset + 4;
+}
+
+// Replace Br with a branch which has the opposite condition code and a
+// MachineBasicBlock operand MBBOpnd.
+void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
+                                   DebugLoc DL, MachineBasicBlock *MBBOpnd) {
+  unsigned NewOpc = TII->GetOppositeBranchOpc(Br->getOpcode());
+  const MCInstrDesc &NewDesc = TII->get(NewOpc);
+
+  MachineInstrBuilder MIB = BuildMI(MBB, Br, DL, NewDesc);
+
+  for (unsigned I = 0, E = Br->getDesc().getNumOperands(); I < E; ++I) {
+    MachineOperand &MO = Br->getOperand(I);
+
+    if (!MO.isReg()) {
+      assert(MO.isMBB() && "MBB operand expected.");
+      break;
+    }
+
+    MIB.addReg(MO.getReg());
+  }
+
+  MIB.addMBB(MBBOpnd);
+
+  Br->eraseFromParent();
+}
+
+// Expand branch instructions to long branches.
+void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
+  MachineBasicBlock::iterator Pos;
+  MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
+  DebugLoc DL = I.Br->getDebugLoc();
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB);
+  MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
+
+  MF->insert(FallThroughMBB, LongBrMBB);
+  MBB->removeSuccessor(TgtMBB);
+  MBB->addSuccessor(LongBrMBB);
+
+  if (IsPIC) {
+    MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
+    MF->insert(FallThroughMBB, BalTgtMBB);
+    LongBrMBB->addSuccessor(BalTgtMBB);
+    BalTgtMBB->addSuccessor(TgtMBB);
+
+    int64_t TgtAddress = MBBInfos[TgtMBB->getNumber()].Address;
+    unsigned BalTgtMBBSize = 5;
+    int64_t Offset = TgtAddress - (I.Address + I.Size - BalTgtMBBSize * 4);
+    int64_t Lo = SignExtend64<16>(Offset & 0xffff);
+    int64_t Hi = SignExtend64<16>(((Offset + 0x8000) >> 16) & 0xffff);
+
+    if (ABI != MipsSubtarget::N64) {
+      // $longbr:
+      //  addiu $sp, $sp, -8
+      //  sw $ra, 0($sp)
+      //  bal $baltgt
+      //  lui $at, %hi($tgt - $baltgt)
+      // $baltgt:
+      //  addiu $at, $at, %lo($tgt - $baltgt)
+      //  addu $at, $ra, $at
+      //  lw $ra, 0($sp)
+      //  jr $at
+      //  addiu $sp, $sp, 8
+      // $fallthrough:
+      //
+
+      Pos = LongBrMBB->begin();
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+        .addReg(Mips::SP).addImm(-8);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
+        .addReg(Mips::SP).addImm(0);
+
+      MIBundleBuilder(*LongBrMBB, Pos)
+        .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB))
+        .append(BuildMI(*MF, DL, TII->get(Mips::LUi), Mips::AT).addImm(Hi));
+
+      Pos = BalTgtMBB->begin();
+
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::AT)
+        .addReg(Mips::AT).addImm(Lo);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
+        .addReg(Mips::RA).addReg(Mips::AT);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
+        .addReg(Mips::SP).addImm(0);
+
+      MIBundleBuilder(*BalTgtMBB, Pos)
+        .append(BuildMI(*MF, DL, TII->get(Mips::JR)).addReg(Mips::AT))
+        .append(BuildMI(*MF, DL, TII->get(Mips::ADDiu), Mips::SP)
+                .addReg(Mips::SP).addImm(8));
+    } else {
+      // $longbr:
+      //  daddiu $sp, $sp, -16
+      //  sd $ra, 0($sp)
+      //  lui64 $at, %highest($tgt - $baltgt)
+      //  daddiu $at, $at, %higher($tgt - $baltgt)
+      //  dsll $at, $at, 16
+      //  daddiu $at, $at, %hi($tgt - $baltgt)
+      //  bal $baltgt
+      //  dsll $at, $at, 16
+      // $baltgt:
+      //  daddiu $at, $at, %lo($tgt - $baltgt)
+      //  daddu $at, $ra, $at
+      //  ld $ra, 0($sp)
+      //  jr64 $at
+      //  daddiu $sp, $sp, 16
+      // $fallthrough:
+      //
+
+      int64_t Higher = SignExtend64<16>(((Offset + 0x80008000) >> 32) & 0xffff);
+      int64_t Highest =
+        SignExtend64<16>(((Offset + 0x800080008000LL) >> 48) & 0xffff);
+
+      Pos = LongBrMBB->begin();
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+        .addReg(Mips::SP_64).addImm(-16);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
+        .addReg(Mips::SP_64).addImm(0);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi64), Mips::AT_64)
+        .addImm(Highest);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(Higher);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(16);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(Hi);
+
+      MIBundleBuilder(*LongBrMBB, Pos)
+        .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB))
+        .append(BuildMI(*MF, DL, TII->get(Mips::DSLL), Mips::AT_64)
+                .addReg(Mips::AT_64).addImm(16));
+
+      Pos = BalTgtMBB->begin();
+
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+        .addReg(Mips::AT_64).addImm(Lo);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
+        .addReg(Mips::RA_64).addReg(Mips::AT_64);
+      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
+        .addReg(Mips::SP_64).addImm(0);
+
+      MIBundleBuilder(*BalTgtMBB, Pos)
+        .append(BuildMI(*MF, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64))
+        .append(BuildMI(*MF, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+                .addReg(Mips::SP_64).addImm(16));
+    }
+
+    assert(BalTgtMBBSize == BalTgtMBB->size());
+    assert(LongBrMBB->size() + BalTgtMBBSize == LongBranchSeqSize);
+  } else {
+    // $longbr:
+    //  j $tgt
+    //  nop
+    // $fallthrough:
+    //
+    Pos = LongBrMBB->begin();
+    LongBrMBB->addSuccessor(TgtMBB);
+    MIBundleBuilder(*LongBrMBB, Pos)
+      .append(BuildMI(*MF, DL, TII->get(Mips::J)).addMBB(TgtMBB))
+      .append(BuildMI(*MF, DL, TII->get(Mips::NOP)));
+
+    assert(LongBrMBB->size() == LongBranchSeqSize);
+  }
+
+  if (I.Br->isUnconditionalBranch()) {
+    // Change branch destination.
+    assert(I.Br->getDesc().getNumOperands() == 1);
+    I.Br->RemoveOperand(0);
+    I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB));
+  } else
+    // Change branch destination and reverse condition.
+    replaceBranch(*MBB, I.Br, DL, FallThroughMBB);
+}
+
+static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
+  MachineBasicBlock &MBB = F.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  DebugLoc DL = MBB.findDebugLoc(MBB.begin());
+  BuildMI(MBB, I, DL, TII->get(Mips::LUi), Mips::V0)
+    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
+  BuildMI(MBB, I, DL, TII->get(Mips::ADDiu), Mips::V0)
+    .addReg(Mips::V0).addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+  MBB.removeLiveIn(Mips::V0);
+}
+
+bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
+  if ((TM.getRelocationModel() == Reloc::PIC_) &&
+      TM.getSubtarget<MipsSubtarget>().isABI_O32() &&
+      F.getInfo<MipsFunctionInfo>()->globalBaseRegSet())
+    emitGPDisp(F, TII);
+
+  if (SkipLongBranch)
+    return true;
+
+  MF = &F;
+  initMBBInfo();
+
+  SmallVector<MBBInfo, 16>::iterator I, E = MBBInfos.end();
+  bool EverMadeChange = false, MadeChange = true;
+
+  while (MadeChange) {
+    MadeChange = false;
+
+    for (I = MBBInfos.begin(); I != E; ++I) {
+      // Skip if this MBB doesn't have a branch or the branch has already been
+      // converted to a long branch.
+      if (!I->Br || I->HasLongBranch)
+        continue;
+
+      // Check if offset fits into 16-bit immediate field of branches.
+      if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / 4))
+        continue;
+
+      I->HasLongBranch = true;
+      I->Size += LongBranchSeqSize * 4;
+      ++LongBranches;
+      EverMadeChange = MadeChange = true;
+    }
+  }
+
+  if (!EverMadeChange)
+    return true;
+
+  // Compute basic block addresses.
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    uint64_t Address = 0;
+
+    for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
+      I->Address = Address;
+  }
+
+  // Do the expansion.
+  for (I = MBBInfos.begin(); I != E; ++I)
+    if (I->HasLongBranch)
+      expandToLongBranch(*I);
+
+  MF->RenumberBlocks();
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
new file mode 100644
index 000000000000..d836975eb7d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -0,0 +1,166 @@
+//===-- MipsMCInstLower.cpp - Convert Mips MachineInstr to MCInst ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Mips MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsMCInstLower.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAsmPrinter.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+
+MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter)
+  : AsmPrinter(asmprinter) {}
+
+void MipsMCInstLower::Initialize(Mangler *M, MCContext *C) {
+  Mang = M;
+  Ctx = C;
+}
+
+MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                              MachineOperandType MOTy,
+                                              unsigned Offset) const {
+  MCSymbolRefExpr::VariantKind Kind;
+  const MCSymbol *Symbol;
+
+  switch(MO.getTargetFlags()) {
+  default:                   llvm_unreachable("Invalid target flag!");
+  case MipsII::MO_NO_FLAG:   Kind = MCSymbolRefExpr::VK_None; break;
+  case MipsII::MO_GPREL:     Kind = MCSymbolRefExpr::VK_Mips_GPREL; break;
+  case MipsII::MO_GOT_CALL:  Kind = MCSymbolRefExpr::VK_Mips_GOT_CALL; break;
+  case MipsII::MO_GOT16:     Kind = MCSymbolRefExpr::VK_Mips_GOT16; break;
+  case MipsII::MO_GOT:       Kind = MCSymbolRefExpr::VK_Mips_GOT; break;
+  case MipsII::MO_ABS_HI:    Kind = MCSymbolRefExpr::VK_Mips_ABS_HI; break;
+  case MipsII::MO_ABS_LO:    Kind = MCSymbolRefExpr::VK_Mips_ABS_LO; break;
+  case MipsII::MO_TLSGD:     Kind = MCSymbolRefExpr::VK_Mips_TLSGD; break;
+  case MipsII::MO_TLSLDM:    Kind = MCSymbolRefExpr::VK_Mips_TLSLDM; break;
+  case MipsII::MO_DTPREL_HI: Kind = MCSymbolRefExpr::VK_Mips_DTPREL_HI; break;
+  case MipsII::MO_DTPREL_LO: Kind = MCSymbolRefExpr::VK_Mips_DTPREL_LO; break;
+  case MipsII::MO_GOTTPREL:  Kind = MCSymbolRefExpr::VK_Mips_GOTTPREL; break;
+  case MipsII::MO_TPREL_HI:  Kind = MCSymbolRefExpr::VK_Mips_TPREL_HI; break;
+  case MipsII::MO_TPREL_LO:  Kind = MCSymbolRefExpr::VK_Mips_TPREL_LO; break;
+  case MipsII::MO_GPOFF_HI:  Kind = MCSymbolRefExpr::VK_Mips_GPOFF_HI; break;
+  case MipsII::MO_GPOFF_LO:  Kind = MCSymbolRefExpr::VK_Mips_GPOFF_LO; break;
+  case MipsII::MO_GOT_DISP:  Kind = MCSymbolRefExpr::VK_Mips_GOT_DISP; break;
+  case MipsII::MO_GOT_PAGE:  Kind = MCSymbolRefExpr::VK_Mips_GOT_PAGE; break;
+  case MipsII::MO_GOT_OFST:  Kind = MCSymbolRefExpr::VK_Mips_GOT_OFST; break;
+  case MipsII::MO_HIGHER:    Kind = MCSymbolRefExpr::VK_Mips_HIGHER; break;
+  case MipsII::MO_HIGHEST:   Kind = MCSymbolRefExpr::VK_Mips_HIGHEST; break;
+  case MipsII::MO_GOT_HI16:  Kind = MCSymbolRefExpr::VK_Mips_GOT_HI16; break;
+  case MipsII::MO_GOT_LO16:  Kind = MCSymbolRefExpr::VK_Mips_GOT_LO16; break;
+  case MipsII::MO_CALL_HI16: Kind = MCSymbolRefExpr::VK_Mips_CALL_HI16; break;
+  case MipsII::MO_CALL_LO16: Kind = MCSymbolRefExpr::VK_Mips_CALL_LO16; break;
+  }
+
+  switch (MOTy) {
+  case MachineOperand::MO_MachineBasicBlock:
+    Symbol = MO.getMBB()->getSymbol();
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    Symbol = Mang->getSymbol(MO.getGlobal());
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+    Offset += MO.getOffset();
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+    Offset += MO.getOffset();
+    break;
+
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, *Ctx);
+
+  if (!Offset)
+    return MCOperand::CreateExpr(MCSym);
+
+  // Assume offset is never negative.
+  assert(Offset > 0);
+
+  const MCConstantExpr *OffsetExpr =  MCConstantExpr::Create(Offset, *Ctx);
+  const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx);
+  return MCOperand::CreateExpr(Add);
+}
+
+/*
+static void CreateMCInst(MCInst& Inst, unsigned Opc, const MCOperand &Opnd0,
+                         const MCOperand &Opnd1,
+                         const MCOperand &Opnd2 = MCOperand()) {
+  Inst.setOpcode(Opc);
+  Inst.addOperand(Opnd0);
+  Inst.addOperand(Opnd1);
+  if (Opnd2.isValid())
+    Inst.addOperand(Opnd2);
+}
+*/
+
+MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
+                                        unsigned offset) const {
+  MachineOperandType MOTy = MO.getType();
+
+  switch (MOTy) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
+    if (MO.isImplicit()) break;
+    return MCOperand::CreateReg(MO.getReg());
+  case MachineOperand::MO_Immediate:
+    return MCOperand::CreateImm(MO.getImm() + offset);
+  case MachineOperand::MO_MachineBasicBlock:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_BlockAddress:
+    return LowerSymbolOperand(MO, MOTy, offset);
+  case MachineOperand::MO_RegisterMask:
+    break;
+ }
+
+  return MCOperand();
+}
+
+void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCOp = LowerOperand(MO);
+
+    if (MCOp.isValid())
+      OutMI.addOperand(MCOp);
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
new file mode 100644
index 000000000000..c4a6016105b2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMCInstLower.h
@@ -0,0 +1,44 @@
+//===-- MipsMCInstLower.h - Lower MachineInstr to MCInst -------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSMCINSTLOWER_H
+#define MIPSMCINSTLOWER_H
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MachineInstr;
+  class MachineFunction;
+  class Mangler;
+  class MipsAsmPrinter;
+
+/// MipsMCInstLower - This class is used to lower an MachineInstr into an
+//                    MCInst.
+class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
+  typedef MachineOperand::MachineOperandType MachineOperandType;
+  MCContext *Ctx;
+  Mangler *Mang;
+  MipsAsmPrinter &AsmPrinter;
+public:
+  MipsMCInstLower(MipsAsmPrinter &asmprinter);
+  void Initialize(Mangler *mang, MCContext *C);
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+  MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
+
+private:
+  MCOperand LowerSymbolOperand(const MachineOperand &MO,
+                               MachineOperandType MOTy, unsigned Offset) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
new file mode 100644
index 000000000000..59b23f7ad7c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
@@ -0,0 +1,75 @@
+//===-- MipsMachineFunctionInfo.cpp - Private data used for Mips ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMachineFunction.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
+                 cl::desc("Always use $gp as the global base register."));
+
+bool MipsFunctionInfo::globalBaseRegSet() const {
+  return GlobalBaseReg;
+}
+
+unsigned MipsFunctionInfo::getGlobalBaseReg() {
+  // Return if it has already been initialized.
+  if (GlobalBaseReg)
+    return GlobalBaseReg;
+
+  const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();
+
+  const TargetRegisterClass *RC;
+  if (ST.inMips16Mode())
+    RC=(const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+  else
+    RC = ST.isABI_N64() ?
+      (const TargetRegisterClass*)&Mips::CPU64RegsRegClass :
+      (const TargetRegisterClass*)&Mips::CPURegsRegClass;
+  return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
+}
+
+bool MipsFunctionInfo::mips16SPAliasRegSet() const {
+  return Mips16SPAliasReg;
+}
+unsigned MipsFunctionInfo::getMips16SPAliasReg() {
+  // Return if it has already been initialized.
+  if (Mips16SPAliasReg)
+    return Mips16SPAliasReg;
+
+  const TargetRegisterClass *RC;
+  RC=(const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
+  return Mips16SPAliasReg = MF.getRegInfo().createVirtualRegister(RC);
+}
+
+void MipsFunctionInfo::createEhDataRegsFI() {
+  for (int I = 0; I < 4; ++I) {
+    const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();
+    const TargetRegisterClass *RC = ST.isABI_N64() ?
+        &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+
+    EhDataRegFI[I] = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
+        RC->getAlignment(), false);
+  }
+}
+
+bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
+  return CallsEhReturn && (FI == EhDataRegFI[0] || FI == EhDataRegFI[1]
+                        || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]);
+}
+
+void MipsFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
new file mode 100644
index 000000000000..b05b348037d9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
@@ -0,0 +1,99 @@
+//===-- MipsMachineFunctionInfo.h - Private data used for Mips ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS_MACHINE_FUNCTION_INFO_H
+#define MIPS_MACHINE_FUNCTION_INFO_H
+
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include <utility>
+
+namespace llvm {
+
+/// MipsFunctionInfo - This class is derived from MachineFunction private
+/// Mips target-specific information for each MachineFunction.
+class MipsFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  MachineFunction& MF;
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// Mips16SPAliasReg - keeps track of the virtual register initialized for
+  /// use as an alias for SP for use in load/store of halfword/byte from/to
+  /// the stack
+  unsigned Mips16SPAliasReg;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// True if function has a byval argument.
+  bool HasByvalArg;
+
+  /// Size of incoming argument area.
+  unsigned IncomingArgSize;
+
+  /// CallsEhReturn - Whether the function calls llvm.eh.return.
+  bool CallsEhReturn;
+
+  /// Frame objects for spilling eh data registers.
+  int EhDataRegFI[4];
+
+public:
+  MipsFunctionInfo(MachineFunction& MF)
+   : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
+     VarArgsFrameIndex(0), CallsEhReturn(false)
+  {}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  bool globalBaseRegSet() const;
+  unsigned getGlobalBaseReg();
+
+  bool mips16SPAliasRegSet() const;
+  unsigned getMips16SPAliasReg();
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  bool hasByvalArg() const { return HasByvalArg; }
+  void setFormalArgInfo(unsigned Size, bool HasByval) {
+    IncomingArgSize = Size;
+    HasByvalArg = HasByval;
+  }
+
+  unsigned getIncomingArgSize() const { return IncomingArgSize; }
+
+  bool callsEhReturn() const { return CallsEhReturn; }
+  void setCallsEhReturn() { CallsEhReturn = true; }
+
+  void createEhDataRegsFI();
+  int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; }
+  bool isEhDataRegFI(int FI) const;
+
+};
+
+} // end of namespace llvm
+
+#endif // MIPS_MACHINE_FUNCTION_INFO_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
new file mode 100644
index 000000000000..32507334e9c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -0,0 +1,219 @@
+//===-- MipsRegisterInfo.cpp - MIPS Register Information -== --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-reg-info"
+
+#include "MipsRegisterInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsInstrInfo.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "MipsGenRegisterInfo.inc"
+
+using namespace llvm;
+
+MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST)
+  : MipsGenRegisterInfo(Mips::RA), Subtarget(ST) {}
+
+unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
+
+
+unsigned
+MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                      MachineFunction &MF) const {
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case Mips::CPURegsRegClassID:
+  case Mips::CPU64RegsRegClassID:
+  case Mips::DSPRegsRegClassID: {
+    const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+    return 28 - TFI->hasFP(MF);
+  }
+  case Mips::FGR32RegClassID:
+    return 32;
+  case Mips::AFGR64RegClassID:
+    return 16;
+  case Mips::FGR64RegClassID:
+    return 32;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Callee Saved Registers methods
+//===----------------------------------------------------------------------===//
+
+/// Mips Callee Saved Registers
+const uint16_t* MipsRegisterInfo::
+getCalleeSavedRegs(const MachineFunction *MF) const {
+  if (Subtarget.isSingleFloat())
+    return CSR_SingleFloatOnly_SaveList;
+  else if (!Subtarget.hasMips64())
+    return CSR_O32_SaveList;
+  else if (Subtarget.isABI_N32())
+    return CSR_N32_SaveList;
+
+  assert(Subtarget.isABI_N64());
+  return CSR_N64_SaveList;
+}
+
+const uint32_t*
+MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
+  if (Subtarget.isSingleFloat())
+    return CSR_SingleFloatOnly_RegMask;
+  else if (!Subtarget.hasMips64())
+    return CSR_O32_RegMask;
+  else if (Subtarget.isABI_N32())
+    return CSR_N32_RegMask;
+
+  assert(Subtarget.isABI_N64());
+  return CSR_N64_RegMask;
+}
+
+BitVector MipsRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  static const uint16_t ReservedCPURegs[] = {
+    Mips::ZERO, Mips::K0, Mips::K1, Mips::SP
+  };
+
+  static const uint16_t ReservedCPU64Regs[] = {
+    Mips::ZERO_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
+  };
+
+  BitVector Reserved(getNumRegs());
+  typedef TargetRegisterClass::const_iterator RegIter;
+
+  for (unsigned I = 0; I < array_lengthof(ReservedCPURegs); ++I)
+    Reserved.set(ReservedCPURegs[I]);
+
+  for (unsigned I = 0; I < array_lengthof(ReservedCPU64Regs); ++I)
+    Reserved.set(ReservedCPU64Regs[I]);
+
+  if (Subtarget.hasMips64()) {
+    // Reserve all registers in AFGR64.
+    for (RegIter Reg = Mips::AFGR64RegClass.begin(),
+         EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
+      Reserved.set(*Reg);
+  } else {
+    // Reserve all registers in FGR64.
+    for (RegIter Reg = Mips::FGR64RegClass.begin(),
+         EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg)
+      Reserved.set(*Reg);
+  }
+  // Reserve FP if this function should have a dedicated frame pointer register.
+  if (MF.getTarget().getFrameLowering()->hasFP(MF)) {
+    if (Subtarget.inMips16Mode())
+      Reserved.set(Mips::S0);
+    else {
+      Reserved.set(Mips::FP);
+      Reserved.set(Mips::FP_64);
+    }
+  }
+
+  // Reserve hardware registers.
+  Reserved.set(Mips::HWR29);
+  Reserved.set(Mips::HWR29_64);
+
+  // Reserve DSP control register.
+  Reserved.set(Mips::DSPCtrl);
+
+  // Reserve RA if in mips16 mode.
+  if (Subtarget.inMips16Mode()) {
+    Reserved.set(Mips::RA);
+    Reserved.set(Mips::RA_64);
+  }
+
+  // Reserve GP if small section is used.
+  if (Subtarget.useSmallSection()) {
+    Reserved.set(Mips::GP);
+    Reserved.set(Mips::GP_64);
+  }
+
+  return Reserved;
+}
+
+bool
+MipsRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+MipsRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return true;
+}
+
+// FrameIndex represent objects inside a abstract stack.
+// We must replace FrameIndex with an stack/frame pointer
+// direct reference.
+void MipsRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                    unsigned FIOperandNum, RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+
+  DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
+        errs() << "<--------->\n" << MI);
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  uint64_t stackSize = MF.getFrameInfo()->getStackSize();
+  int64_t spOffset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+  DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
+               << "spOffset   : " << spOffset << "\n"
+               << "stackSize  : " << stackSize << "\n");
+
+  eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
+}
+
+unsigned MipsRegisterInfo::
+getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  bool IsN64 = Subtarget.isABI_N64();
+
+  if (Subtarget.inMips16Mode())
+    return TFI->hasFP(MF) ? Mips::S0 : Mips::SP;
+  else
+    return TFI->hasFP(MF) ? (IsN64 ? Mips::FP_64 : Mips::FP) :
+                            (IsN64 ? Mips::SP_64 : Mips::SP);
+
+}
+
+unsigned MipsRegisterInfo::
+getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+}
+
+unsigned MipsRegisterInfo::
+getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
new file mode 100644
index 000000000000..5ed51241391f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h
@@ -0,0 +1,82 @@
+//===-- MipsRegisterInfo.h - Mips Register Information Impl -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSREGISTERINFO_H
+#define MIPSREGISTERINFO_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "MipsGenRegisterInfo.inc"
+
+namespace llvm {
+class MipsSubtarget;
+class Type;
+
+class MipsRegisterInfo : public MipsGenRegisterInfo {
+protected:
+  const MipsSubtarget &Subtarget;
+
+public:
+  MipsRegisterInfo(const MipsSubtarget &Subtarget);
+
+  /// getRegisterNumbering - Given the enum value for some register, e.g.
+  /// Mips::RA, return the number that it corresponds to (e.g. 31).
+  static unsigned getRegisterNumbering(unsigned RegEnum);
+
+  /// Get PIC indirect call register
+  static unsigned getPICCallReg();
+
+  /// Adjust the Mips stack frame.
+  void adjustMipsStackFrame(MachineFunction &MF) const;
+
+  /// Code Generation virtual methods...
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const;
+  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+  virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+
+  /// Stack Frame Processing Methods
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = NULL) const;
+
+  /// Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  /// Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+
+  /// \brief Return GPR register class.
+  virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
+
+private:
+  virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                           int FrameIndex, uint64_t StackSize,
+                           int64_t SPOffset) const = 0;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
new file mode 100644
index 000000000000..64458bcef7ef
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
@@ -0,0 +1,406 @@
+//===-- MipsRegisterInfo.td - Mips Register defs -----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MIPS register file
+//===----------------------------------------------------------------------===//
+let Namespace = "Mips" in {
+def sub_fpeven : SubRegIndex;
+def sub_fpodd  : SubRegIndex;
+def sub_32     : SubRegIndex;
+def sub_lo     : SubRegIndex;
+def sub_hi     : SubRegIndex;
+}
+
+class Unallocatable {
+  bit isAllocatable = 0;
+}
+
+// We have banks of 32 registers each.
+class MipsReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
+  let Namespace = "Mips";
+}
+
+class MipsRegWithSubRegs<bits<16> Enc, string n, list<Register> subregs>
+  : RegisterWithSubRegs<n, subregs> {
+  let HWEncoding = Enc;
+  let Namespace = "Mips";
+}
+
+// Mips CPU Registers
+class MipsGPRReg<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+// Mips 64-bit CPU Registers
+class Mips64GPRReg<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_32];
+}
+
+// Mips 32-bit FPU Registers
+class FPR<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+// Mips 64-bit (aliased) FPU Registers
+class AFPR<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_fpeven, sub_fpodd];
+  let CoveredBySubRegs = 1;
+}
+
+class AFPR64<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_32];
+}
+
+// Accumulator Registers
+class ACC<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_lo, sub_hi];
+  let CoveredBySubRegs = 1;
+}
+
+// Mips Hardware Registers
+class HWR<bits<16> Enc, string n> : MipsReg<Enc, n>;
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Mips" in {
+  // General Purpose Registers
+  def ZERO : MipsGPRReg< 0, "zero">, DwarfRegNum<[0]>;
+  def AT   : MipsGPRReg< 1, "1">,    DwarfRegNum<[1]>;
+  def V0   : MipsGPRReg< 2, "2">,    DwarfRegNum<[2]>;
+  def V1   : MipsGPRReg< 3, "3">,    DwarfRegNum<[3]>;
+  def A0   : MipsGPRReg< 4, "4">,    DwarfRegNum<[4]>;
+  def A1   : MipsGPRReg< 5, "5">,    DwarfRegNum<[5]>;
+  def A2   : MipsGPRReg< 6, "6">,    DwarfRegNum<[6]>;
+  def A3   : MipsGPRReg< 7, "7">,    DwarfRegNum<[7]>;
+  def T0   : MipsGPRReg< 8, "8">,    DwarfRegNum<[8]>;
+  def T1   : MipsGPRReg< 9, "9">,    DwarfRegNum<[9]>;
+  def T2   : MipsGPRReg< 10, "10">,  DwarfRegNum<[10]>;
+  def T3   : MipsGPRReg< 11, "11">,  DwarfRegNum<[11]>;
+  def T4   : MipsGPRReg< 12, "12">,  DwarfRegNum<[12]>;
+  def T5   : MipsGPRReg< 13, "13">,  DwarfRegNum<[13]>;
+  def T6   : MipsGPRReg< 14, "14">,  DwarfRegNum<[14]>;
+  def T7   : MipsGPRReg< 15, "15">,  DwarfRegNum<[15]>;
+  def S0   : MipsGPRReg< 16, "16">,  DwarfRegNum<[16]>;
+  def S1   : MipsGPRReg< 17, "17">,  DwarfRegNum<[17]>;
+  def S2   : MipsGPRReg< 18, "18">,  DwarfRegNum<[18]>;
+  def S3   : MipsGPRReg< 19, "19">,  DwarfRegNum<[19]>;
+  def S4   : MipsGPRReg< 20, "20">,  DwarfRegNum<[20]>;
+  def S5   : MipsGPRReg< 21, "21">,  DwarfRegNum<[21]>;
+  def S6   : MipsGPRReg< 22, "22">,  DwarfRegNum<[22]>;
+  def S7   : MipsGPRReg< 23, "23">,  DwarfRegNum<[23]>;
+  def T8   : MipsGPRReg< 24, "24">,  DwarfRegNum<[24]>;
+  def T9   : MipsGPRReg< 25, "25">,  DwarfRegNum<[25]>;
+  def K0   : MipsGPRReg< 26, "26">,  DwarfRegNum<[26]>;
+  def K1   : MipsGPRReg< 27, "27">,  DwarfRegNum<[27]>;
+  def GP   : MipsGPRReg< 28, "gp">,  DwarfRegNum<[28]>;
+  def SP   : MipsGPRReg< 29, "sp">,  DwarfRegNum<[29]>;
+  def FP   : MipsGPRReg< 30, "fp">,  DwarfRegNum<[30]>;
+  def RA   : MipsGPRReg< 31, "ra">,  DwarfRegNum<[31]>;
+
+  // General Purpose 64-bit Registers
+  def ZERO_64 : Mips64GPRReg< 0, "zero", [ZERO]>, DwarfRegNum<[0]>;
+  def AT_64   : Mips64GPRReg< 1, "1",    [AT]>, DwarfRegNum<[1]>;
+  def V0_64   : Mips64GPRReg< 2, "2",    [V0]>, DwarfRegNum<[2]>;
+  def V1_64   : Mips64GPRReg< 3, "3",    [V1]>, DwarfRegNum<[3]>;
+  def A0_64   : Mips64GPRReg< 4, "4",    [A0]>, DwarfRegNum<[4]>;
+  def A1_64   : Mips64GPRReg< 5, "5",    [A1]>, DwarfRegNum<[5]>;
+  def A2_64   : Mips64GPRReg< 6, "6",    [A2]>, DwarfRegNum<[6]>;
+  def A3_64   : Mips64GPRReg< 7, "7",    [A3]>, DwarfRegNum<[7]>;
+  def T0_64   : Mips64GPRReg< 8, "8",    [T0]>, DwarfRegNum<[8]>;
+  def T1_64   : Mips64GPRReg< 9, "9",    [T1]>, DwarfRegNum<[9]>;
+  def T2_64   : Mips64GPRReg< 10, "10",  [T2]>, DwarfRegNum<[10]>;
+  def T3_64   : Mips64GPRReg< 11, "11",  [T3]>, DwarfRegNum<[11]>;
+  def T4_64   : Mips64GPRReg< 12, "12",  [T4]>, DwarfRegNum<[12]>;
+  def T5_64   : Mips64GPRReg< 13, "13",  [T5]>, DwarfRegNum<[13]>;
+  def T6_64   : Mips64GPRReg< 14, "14",  [T6]>, DwarfRegNum<[14]>;
+  def T7_64   : Mips64GPRReg< 15, "15",  [T7]>, DwarfRegNum<[15]>;
+  def S0_64   : Mips64GPRReg< 16, "16",  [S0]>, DwarfRegNum<[16]>;
+  def S1_64   : Mips64GPRReg< 17, "17",  [S1]>, DwarfRegNum<[17]>;
+  def S2_64   : Mips64GPRReg< 18, "18",  [S2]>, DwarfRegNum<[18]>;
+  def S3_64   : Mips64GPRReg< 19, "19",  [S3]>, DwarfRegNum<[19]>;
+  def S4_64   : Mips64GPRReg< 20, "20",  [S4]>, DwarfRegNum<[20]>;
+  def S5_64   : Mips64GPRReg< 21, "21",  [S5]>, DwarfRegNum<[21]>;
+  def S6_64   : Mips64GPRReg< 22, "22",  [S6]>, DwarfRegNum<[22]>;
+  def S7_64   : Mips64GPRReg< 23, "23",  [S7]>, DwarfRegNum<[23]>;
+  def T8_64   : Mips64GPRReg< 24, "24",  [T8]>, DwarfRegNum<[24]>;
+  def T9_64   : Mips64GPRReg< 25, "25",  [T9]>, DwarfRegNum<[25]>;
+  def K0_64   : Mips64GPRReg< 26, "26",  [K0]>, DwarfRegNum<[26]>;
+  def K1_64   : Mips64GPRReg< 27, "27",  [K1]>, DwarfRegNum<[27]>;
+  def GP_64   : Mips64GPRReg< 28, "gp",  [GP]>, DwarfRegNum<[28]>;
+  def SP_64   : Mips64GPRReg< 29, "sp",  [SP]>, DwarfRegNum<[29]>;
+  def FP_64   : Mips64GPRReg< 30, "fp",  [FP]>, DwarfRegNum<[30]>;
+  def RA_64   : Mips64GPRReg< 31, "ra",  [RA]>, DwarfRegNum<[31]>;
+
+  /// Mips Single point precision FPU Registers
+  def F0  : FPR< 0,  "f0">, DwarfRegNum<[32]>;
+  def F1  : FPR< 1,  "f1">, DwarfRegNum<[33]>;
+  def F2  : FPR< 2,  "f2">, DwarfRegNum<[34]>;
+  def F3  : FPR< 3,  "f3">, DwarfRegNum<[35]>;
+  def F4  : FPR< 4,  "f4">, DwarfRegNum<[36]>;
+  def F5  : FPR< 5,  "f5">, DwarfRegNum<[37]>;
+  def F6  : FPR< 6,  "f6">, DwarfRegNum<[38]>;
+  def F7  : FPR< 7,  "f7">, DwarfRegNum<[39]>;
+  def F8  : FPR< 8,  "f8">, DwarfRegNum<[40]>;
+  def F9  : FPR< 9,  "f9">, DwarfRegNum<[41]>;
+  def F10 : FPR<10, "f10">, DwarfRegNum<[42]>;
+  def F11 : FPR<11, "f11">, DwarfRegNum<[43]>;
+  def F12 : FPR<12, "f12">, DwarfRegNum<[44]>;
+  def F13 : FPR<13, "f13">, DwarfRegNum<[45]>;
+  def F14 : FPR<14, "f14">, DwarfRegNum<[46]>;
+  def F15 : FPR<15, "f15">, DwarfRegNum<[47]>;
+  def F16 : FPR<16, "f16">, DwarfRegNum<[48]>;
+  def F17 : FPR<17, "f17">, DwarfRegNum<[49]>;
+  def F18 : FPR<18, "f18">, DwarfRegNum<[50]>;
+  def F19 : FPR<19, "f19">, DwarfRegNum<[51]>;
+  def F20 : FPR<20, "f20">, DwarfRegNum<[52]>;
+  def F21 : FPR<21, "f21">, DwarfRegNum<[53]>;
+  def F22 : FPR<22, "f22">, DwarfRegNum<[54]>;
+  def F23 : FPR<23, "f23">, DwarfRegNum<[55]>;
+  def F24 : FPR<24, "f24">, DwarfRegNum<[56]>;
+  def F25 : FPR<25, "f25">, DwarfRegNum<[57]>;
+  def F26 : FPR<26, "f26">, DwarfRegNum<[58]>;
+  def F27 : FPR<27, "f27">, DwarfRegNum<[59]>;
+  def F28 : FPR<28, "f28">, DwarfRegNum<[60]>;
+  def F29 : FPR<29, "f29">, DwarfRegNum<[61]>;
+  def F30 : FPR<30, "f30">, DwarfRegNum<[62]>;
+  def F31 : FPR<31, "f31">, DwarfRegNum<[63]>;
+
+  /// Mips Double point precision FPU Registers (aliased
+  /// with the single precision to hold 64 bit values)
+  def D0  : AFPR< 0,  "f0", [F0,   F1]>;
+  def D1  : AFPR< 2,  "f2", [F2,   F3]>;
+  def D2  : AFPR< 4,  "f4", [F4,   F5]>;
+  def D3  : AFPR< 6,  "f6", [F6,   F7]>;
+  def D4  : AFPR< 8,  "f8", [F8,   F9]>;
+  def D5  : AFPR<10, "f10", [F10, F11]>;
+  def D6  : AFPR<12, "f12", [F12, F13]>;
+  def D7  : AFPR<14, "f14", [F14, F15]>;
+  def D8  : AFPR<16, "f16", [F16, F17]>;
+  def D9  : AFPR<18, "f18", [F18, F19]>;
+  def D10 : AFPR<20, "f20", [F20, F21]>;
+  def D11 : AFPR<22, "f22", [F22, F23]>;
+  def D12 : AFPR<24, "f24", [F24, F25]>;
+  def D13 : AFPR<26, "f26", [F26, F27]>;
+  def D14 : AFPR<28, "f28", [F28, F29]>;
+  def D15 : AFPR<30, "f30", [F30, F31]>;
+
+  /// Mips Double point precision FPU Registers in MFP64 mode.
+  def D0_64  : AFPR64<0, "f0", [F0]>, DwarfRegNum<[32]>;
+  def D1_64  : AFPR64<1, "f1", [F1]>, DwarfRegNum<[33]>;
+  def D2_64  : AFPR64<2, "f2", [F2]>, DwarfRegNum<[34]>;
+  def D3_64  : AFPR64<3, "f3", [F3]>, DwarfRegNum<[35]>;
+  def D4_64  : AFPR64<4, "f4", [F4]>, DwarfRegNum<[36]>;
+  def D5_64  : AFPR64<5, "f5", [F5]>, DwarfRegNum<[37]>;
+  def D6_64  : AFPR64<6, "f6", [F6]>, DwarfRegNum<[38]>;
+  def D7_64  : AFPR64<7, "f7", [F7]>, DwarfRegNum<[39]>;
+  def D8_64  : AFPR64<8, "f8", [F8]>, DwarfRegNum<[40]>;
+  def D9_64  : AFPR64<9, "f9", [F9]>, DwarfRegNum<[41]>;
+  def D10_64  : AFPR64<10, "f10", [F10]>, DwarfRegNum<[42]>;
+  def D11_64  : AFPR64<11, "f11", [F11]>, DwarfRegNum<[43]>;
+  def D12_64  : AFPR64<12, "f12", [F12]>, DwarfRegNum<[44]>;
+  def D13_64  : AFPR64<13, "f13", [F13]>, DwarfRegNum<[45]>;
+  def D14_64  : AFPR64<14, "f14", [F14]>, DwarfRegNum<[46]>;
+  def D15_64  : AFPR64<15, "f15", [F15]>, DwarfRegNum<[47]>;
+  def D16_64  : AFPR64<16, "f16", [F16]>, DwarfRegNum<[48]>;
+  def D17_64  : AFPR64<17, "f17", [F17]>, DwarfRegNum<[49]>;
+  def D18_64  : AFPR64<18, "f18", [F18]>, DwarfRegNum<[50]>;
+  def D19_64  : AFPR64<19, "f19", [F19]>, DwarfRegNum<[51]>;
+  def D20_64  : AFPR64<20, "f20", [F20]>, DwarfRegNum<[52]>;
+  def D21_64  : AFPR64<21, "f21", [F21]>, DwarfRegNum<[53]>;
+  def D22_64  : AFPR64<22, "f22", [F22]>, DwarfRegNum<[54]>;
+  def D23_64  : AFPR64<23, "f23", [F23]>, DwarfRegNum<[55]>;
+  def D24_64  : AFPR64<24, "f24", [F24]>, DwarfRegNum<[56]>;
+  def D25_64  : AFPR64<25, "f25", [F25]>, DwarfRegNum<[57]>;
+  def D26_64  : AFPR64<26, "f26", [F26]>, DwarfRegNum<[58]>;
+  def D27_64  : AFPR64<27, "f27", [F27]>, DwarfRegNum<[59]>;
+  def D28_64  : AFPR64<28, "f28", [F28]>, DwarfRegNum<[60]>;
+  def D29_64  : AFPR64<29, "f29", [F29]>, DwarfRegNum<[61]>;
+  def D30_64  : AFPR64<30, "f30", [F30]>, DwarfRegNum<[62]>;
+  def D31_64  : AFPR64<31, "f31", [F31]>, DwarfRegNum<[63]>;
+
+  // Hi/Lo registers
+  def HI  : Register<"hi">, DwarfRegNum<[64]>;
+  def HI1 : Register<"hi1">, DwarfRegNum<[176]>;
+  def HI2 : Register<"hi2">, DwarfRegNum<[178]>;
+  def HI3 : Register<"hi3">, DwarfRegNum<[180]>;
+  def LO  : Register<"lo">, DwarfRegNum<[65]>;
+  def LO1 : Register<"lo1">, DwarfRegNum<[177]>;
+  def LO2 : Register<"lo2">, DwarfRegNum<[179]>;
+  def LO3 : Register<"lo3">, DwarfRegNum<[181]>;
+
+  let SubRegIndices = [sub_32] in {
+  def HI64  : RegisterWithSubRegs<"hi", [HI]>;
+  def LO64  : RegisterWithSubRegs<"lo", [LO]>;
+  }
+
+  // Status flags register
+  def FCR31 : Register<"31">;
+
+  // fcc0 register
+  def FCC0 : MipsReg<0, "fcc0">;
+
+  // PC register
+  def PC : Register<"pc">;
+
+  // Hardware register $29
+  def HWR29 : MipsReg<29, "29">;
+  def HWR29_64 : MipsReg<29, "29">;
+
+  // Accum registers
+  def AC0 : ACC<0, "ac0", [LO, HI]>;
+  def AC1 : ACC<1, "ac1", [LO1, HI1]>;
+  def AC2 : ACC<2, "ac2", [LO2, HI2]>;
+  def AC3 : ACC<3, "ac3", [LO3, HI3]>;
+
+  def AC0_64 : ACC<0, "ac0", [LO64, HI64]>;
+
+  def DSPCtrl : Register<"dspctrl">;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+class CPURegsClass<list<ValueType> regTypes> :
+  RegisterClass<"Mips", regTypes, 32, (add
+  // Reserved
+  ZERO, AT,
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Not preserved across procedure calls
+  T0, T1, T2, T3, T4, T5, T6, T7,
+  // Callee save
+  S0, S1, S2, S3, S4, S5, S6, S7,
+  // Not preserved across procedure calls
+  T8, T9,
+  // Reserved
+  K0, K1, GP, SP, FP, RA)>;
+
+def CPURegs : CPURegsClass<[i32]>;
+def DSPRegs : CPURegsClass<[v4i8, v2i16]>;
+
+def CPU64Regs : RegisterClass<"Mips", [i64], 64, (add
+// Reserved
+  ZERO_64, AT_64,
+  // Return Values and Arguments
+  V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
+  // Not preserved across procedure calls
+  T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64,
+  // Callee save
+  S0_64, S1_64, S2_64, S3_64, S4_64, S5_64, S6_64, S7_64,
+  // Not preserved across procedure calls
+  T8_64, T9_64,
+  // Reserved
+  K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)>;
+
+def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Callee save
+  S0, S1)>;
+
+def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>, Unallocatable;
+
+def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
+
+// 64bit fp:
+// * FGR64  - 32 64-bit registers
+// * AFGR64 - 16 32-bit even registers (32-bit FP Mode)
+//
+// 32bit fp:
+// * FGR32 - 16 32-bit even registers
+// * FGR32 - 32 32-bit registers (single float only mode)
+def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
+
+def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
+  // Return Values and Arguments
+  D0, D1,
+  // Not preserved across procedure calls
+  D2, D3, D4, D5,
+  // Return Values and Arguments
+  D6, D7,
+  // Not preserved across procedure calls
+  D8, D9,
+  // Callee save
+  D10, D11, D12, D13, D14, D15)>;
+
+def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
+
+// Condition Register for floating point operations
+def CCR  : RegisterClass<"Mips", [i32], 32, (add FCR31,FCC0)>, Unallocatable;
+
+// Hi/Lo Registers
+def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>, Unallocatable;
+def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)>, Unallocatable;
+
+// Hardware registers
+def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>, Unallocatable;
+def HWRegs64 : RegisterClass<"Mips", [i64], 64, (add HWR29_64)>, Unallocatable;
+
+// Accumulator Registers
+def ACRegs : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
+  let Size = 64;
+}
+
+def ACRegs128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
+  let Size = 128;
+}
+
+def ACRegsDSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
+  let Size = 64;
+}
+
+def CPURegsAsmOperand : AsmOperandClass {
+  let Name = "CPURegsAsm";
+  let ParserMethod = "parseCPURegs";
+}
+
+def CPU64RegsAsmOperand : AsmOperandClass {
+  let Name = "CPU64RegsAsm";
+  let ParserMethod = "parseCPU64Regs";
+}
+
+def CCRAsmOperand : AsmOperandClass {
+  let Name = "CCRAsm";
+  let ParserMethod = "parseCCRRegs";
+}
+
+def CPURegsOpnd : RegisterOperand<CPURegs, "printCPURegs"> {
+  let ParserMatchClass = CPURegsAsmOperand;
+}
+
+def CPU64RegsOpnd : RegisterOperand<CPU64Regs, "printCPURegs"> {
+  let ParserMatchClass = CPU64RegsAsmOperand;
+}
+
+def CCROpnd : RegisterOperand<CCR, "printCPURegs"> {
+  let ParserMatchClass = CCRAsmOperand;
+}
+
+def HWRegsAsmOperand : AsmOperandClass {
+  let Name = "HWRegsAsm";
+  let ParserMethod = "parseHWRegs";
+}
+
+def HW64RegsAsmOperand : AsmOperandClass {
+  let Name = "HW64RegsAsm";
+  let ParserMethod = "parseHW64Regs";
+}
+
+def HWRegsOpnd : RegisterOperand<HWRegs, "printCPURegs"> {
+  let ParserMatchClass = HWRegsAsmOperand;
+}
+
+def HW64RegsOpnd : RegisterOperand<HWRegs64, "printCPURegs"> {
+  let ParserMatchClass = HW64RegsAsmOperand;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRelocations.h b/contrib/llvm/lib/Target/Mips/MipsRelocations.h
new file mode 100644
index 000000000000..0787ed399d5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsRelocations.h
@@ -0,0 +1,41 @@
+//===-- MipsRelocations.h - Mips Code Relocations ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Mips target-specific relocation types
+// (for relocation-model=static).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSRELOCATIONS_H_
+#define MIPSRELOCATIONS_H_
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace Mips{
+    enum RelocationType {
+      // reloc_mips_pc16 - pc relative relocation for branches. The lower 18
+      // bits of the difference between the branch target and the branch
+      // instruction, shifted right by 2.
+      reloc_mips_pc16 = 1,
+
+      // reloc_mips_hi - upper 16 bits of the address (modified by +1 if the
+      // lower 16 bits of the address is negative).
+      reloc_mips_hi = 2,
+
+      // reloc_mips_lo - lower 16 bits of the address.
+      reloc_mips_lo = 3,
+
+      // reloc_mips_26 - lower 28 bits of the address, shifted right by 2.
+      reloc_mips_26 = 4
+    };
+  }
+}
+
+#endif /* MIPSRELOCATIONS_H_ */
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
new file mode 100644
index 000000000000..68ec92188802
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -0,0 +1,468 @@
+//===-- MipsSEFrameLowering.cpp - Mips32/64 Frame Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEFrameLowering.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsSEInstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+namespace {
+typedef MachineBasicBlock::iterator Iter;
+
+/// Helper class to expand accumulator pseudos.
+class ExpandACCPseudo {
+public:
+  ExpandACCPseudo(MachineFunction &MF);
+  bool expand();
+
+private:
+  bool expandInstr(MachineBasicBlock &MBB, Iter I);
+  void expandLoad(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
+  void expandStore(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
+  void expandCopy(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
+
+  MachineFunction &MF;
+  const MipsSEInstrInfo &TII;
+  const MipsRegisterInfo &RegInfo;
+  MachineRegisterInfo &MRI;
+};
+}
+
+ExpandACCPseudo::ExpandACCPseudo(MachineFunction &MF_)
+  : MF(MF_),
+    TII(*static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo())),
+    RegInfo(TII.getRegisterInfo()), MRI(MF.getRegInfo()) {}
+
+bool ExpandACCPseudo::expand() {
+  bool Expanded = false;
+
+  for (MachineFunction::iterator BB = MF.begin(), BBEnd = MF.end();
+       BB != BBEnd; ++BB)
+    for (Iter I = BB->begin(), End = BB->end(); I != End;)
+      Expanded |= expandInstr(*BB, I++);
+
+  return Expanded;
+}
+
+bool ExpandACCPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) {
+  switch(I->getOpcode()) {
+  case Mips::LOAD_AC64:
+  case Mips::LOAD_AC64_P8:
+  case Mips::LOAD_AC_DSP:
+  case Mips::LOAD_AC_DSP_P8:
+    expandLoad(MBB, I, 4);
+    break;
+  case Mips::LOAD_AC128:
+  case Mips::LOAD_AC128_P8:
+    expandLoad(MBB, I, 8);
+    break;
+  case Mips::STORE_AC64:
+  case Mips::STORE_AC64_P8:
+  case Mips::STORE_AC_DSP:
+  case Mips::STORE_AC_DSP_P8:
+    expandStore(MBB, I, 4);
+    break;
+  case Mips::STORE_AC128:
+  case Mips::STORE_AC128_P8:
+    expandStore(MBB, I, 8);
+    break;
+  case Mips::COPY_AC64:
+  case Mips::COPY_AC_DSP:
+    expandCopy(MBB, I, 4);
+    break;
+  case Mips::COPY_AC128:
+    expandCopy(MBB, I, 8);
+    break;
+  default:
+    return false;
+  }
+
+  MBB.erase(I);
+  return true;
+}
+
+void ExpandACCPseudo::expandLoad(MachineBasicBlock &MBB, Iter I,
+                                 unsigned RegSize) {
+  //  load $vr0, FI
+  //  copy lo, $vr0
+  //  load $vr1, FI + 4
+  //  copy hi, $vr1
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+  unsigned VR0 = MRI.createVirtualRegister(RC);
+  unsigned VR1 = MRI.createVirtualRegister(RC);
+  unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+  unsigned Lo = RegInfo.getSubReg(Dst, Mips::sub_lo);
+  unsigned Hi = RegInfo.getSubReg(Dst, Mips::sub_hi);
+  DebugLoc DL = I->getDebugLoc();
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+
+  TII.loadRegFromStack(MBB, I, VR0, FI, RC, &RegInfo, 0);
+  BuildMI(MBB, I, DL, Desc, Lo).addReg(VR0, RegState::Kill);
+  TII.loadRegFromStack(MBB, I, VR1, FI, RC, &RegInfo, RegSize);
+  BuildMI(MBB, I, DL, Desc, Hi).addReg(VR1, RegState::Kill);
+}
+
+void ExpandACCPseudo::expandStore(MachineBasicBlock &MBB, Iter I,
+                                  unsigned RegSize) {
+  //  copy $vr0, lo
+  //  store $vr0, FI
+  //  copy $vr1, hi
+  //  store $vr1, FI + 4
+
+  assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+  unsigned VR0 = MRI.createVirtualRegister(RC);
+  unsigned VR1 = MRI.createVirtualRegister(RC);
+  unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
+  unsigned SrcKill = getKillRegState(I->getOperand(0).isKill());
+  unsigned Lo = RegInfo.getSubReg(Src, Mips::sub_lo);
+  unsigned Hi = RegInfo.getSubReg(Src, Mips::sub_hi);
+  DebugLoc DL = I->getDebugLoc();
+
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR0).addReg(Lo, SrcKill);
+  TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0);
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR1).addReg(Hi, SrcKill);
+  TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize);
+}
+
+void ExpandACCPseudo::expandCopy(MachineBasicBlock &MBB, Iter I,
+                                 unsigned RegSize) {
+  //  copy $vr0, src_lo
+  //  copy dst_lo, $vr0
+  //  copy $vr1, src_hi
+  //  copy dst_hi, $vr1
+
+  const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+  unsigned VR0 = MRI.createVirtualRegister(RC);
+  unsigned VR1 = MRI.createVirtualRegister(RC);
+  unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
+  unsigned SrcKill = getKillRegState(I->getOperand(1).isKill());
+  unsigned DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo);
+  unsigned DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi);
+  unsigned SrcLo = RegInfo.getSubReg(Src, Mips::sub_lo);
+  unsigned SrcHi = RegInfo.getSubReg(Src, Mips::sub_hi);
+  DebugLoc DL = I->getDebugLoc();
+
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR0).addReg(SrcLo, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstLo)
+    .addReg(VR0, RegState::Kill);
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR1).addReg(SrcHi, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstHi)
+    .addReg(VR1, RegState::Kill);
+}
+
+unsigned MipsSEFrameLowering::ehDataReg(unsigned I) const {
+  static const unsigned EhDataReg[] = {
+    Mips::A0, Mips::A1, Mips::A2, Mips::A3
+  };
+  static const unsigned EhDataReg64[] = {
+    Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64
+  };
+
+  return STI.isABI_N64() ? EhDataReg64[I] : EhDataReg[I];
+}
+
+void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB   = MF.front();
+  MachineFrameInfo *MFI    = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  const MipsRegisterInfo *RegInfo =
+    static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+  unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
+  unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
+  unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+  unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+
+  // First, compute final stack size.
+  uint64_t StackSize = MFI->getStackSize();
+
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI->adjustsStack()) return;
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  MachineLocation DstML, SrcML;
+
+  // Adjust stack.
+  TII.adjustStackPtr(SP, -StackSize, MBB, MBBI);
+
+  // emit ".cfi_def_cfa_offset StackSize"
+  MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
+  BuildMI(MBB, MBBI, dl,
+          TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
+  DstML = MachineLocation(MachineLocation::VirtualFP);
+  SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
+  Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+  if (CSI.size()) {
+    // Find the instruction past the last instruction that saves a callee-saved
+    // register to the stack.
+    for (unsigned i = 0; i < CSI.size(); ++i)
+      ++MBBI;
+
+    // Iterate over list of callee-saved registers and emit .cfi_offset
+    // directives.
+    MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, dl,
+            TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
+
+    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
+           E = CSI.end(); I != E; ++I) {
+      int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+      unsigned Reg = I->getReg();
+
+      // If Reg is a double precision register, emit two cfa_offsets,
+      // one for each of the paired single precision registers.
+      if (Mips::AFGR64RegClass.contains(Reg)) {
+        MachineLocation DstML0(MachineLocation::VirtualFP, Offset);
+        MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4);
+        MachineLocation SrcML0(RegInfo->getSubReg(Reg, Mips::sub_fpeven));
+        MachineLocation SrcML1(RegInfo->getSubReg(Reg, Mips::sub_fpodd));
+
+        if (!STI.isLittle())
+          std::swap(SrcML0, SrcML1);
+
+        Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0));
+        Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1));
+      } else {
+        // Reg is either in CPURegs or FGR32.
+        DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
+        SrcML = MachineLocation(Reg);
+        Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+      }
+    }
+  }
+
+  if (MipsFI->callsEhReturn()) {
+    const TargetRegisterClass *RC = STI.isABI_N64() ?
+        &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+
+    // Insert instructions that spill eh data registers.
+    for (int I = 0; I < 4; ++I) {
+      if (!MBB.isLiveIn(ehDataReg(I)))
+        MBB.addLiveIn(ehDataReg(I));
+      TII.storeRegToStackSlot(MBB, MBBI, ehDataReg(I), false,
+                              MipsFI->getEhDataRegFI(I), RC, RegInfo);
+    }
+
+    // Emit .cfi_offset directives for eh data registers.
+    MCSymbol *CSLabel2 = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, dl,
+            TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel2);
+    for (int I = 0; I < 4; ++I) {
+      int64_t Offset = MFI->getObjectOffset(MipsFI->getEhDataRegFI(I));
+      DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
+      SrcML = MachineLocation(ehDataReg(I));
+      Moves.push_back(MachineMove(CSLabel2, DstML, SrcML));
+    }
+  }
+
+  // if framepointer enabled, set it to point to the stack pointer.
+  if (hasFP(MF)) {
+    // Insert instruction "move $fp, $sp" at this location.
+    BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO);
+
+    // emit ".cfi_def_cfa_register $fp"
+    MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, dl,
+            TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel);
+    DstML = MachineLocation(FP);
+    SrcML = MachineLocation(MachineLocation::VirtualFP);
+    Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML));
+  }
+}
+
+void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
+                                       MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  MachineFrameInfo *MFI            = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  const MipsRegisterInfo *RegInfo =
+    static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  DebugLoc dl = MBBI->getDebugLoc();
+  unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
+  unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
+  unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+  unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+
+  // if framepointer enabled, restore the stack pointer.
+  if (hasFP(MF)) {
+    // Find the first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+
+    for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instruction "move $sp, $fp" at this location.
+    BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO);
+  }
+
+  if (MipsFI->callsEhReturn()) {
+    const TargetRegisterClass *RC = STI.isABI_N64() ?
+        &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+
+    // Find first instruction that restores a callee-saved register.
+    MachineBasicBlock::iterator I = MBBI;
+    for (unsigned i = 0; i < MFI->getCalleeSavedInfo().size(); ++i)
+      --I;
+
+    // Insert instructions that restore eh data registers.
+    for (int J = 0; J < 4; ++J) {
+      TII.loadRegFromStackSlot(MBB, I, ehDataReg(J), MipsFI->getEhDataRegFI(J),
+                               RC, RegInfo);
+    }
+  }
+
+  // Get the number of bytes from FrameInfo
+  uint64_t StackSize = MFI->getStackSize();
+
+  if (!StackSize)
+    return;
+
+  // Adjust stack.
+  TII.adjustStackPtr(SP, StackSize, MBB, MBBI);
+}
+
+bool MipsSEFrameLowering::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI,
+                          const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock *EntryBlock = MF->begin();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    // Add the callee-saved register as live-in. Do not add if the register is
+    // RA and return address is taken, because it has already been added in
+    // method MipsTargetLowering::LowerRETURNADDR.
+    // It's killed at the spill, unless the register is RA and return address
+    // is taken.
+    unsigned Reg = CSI[i].getReg();
+    bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64)
+        && MF->getFrameInfo()->isReturnAddressTaken();
+    if (!IsRAAndRetAddrIsTaken)
+      EntryBlock->addLiveIn(Reg);
+
+    // Insert the spill to the stack frame.
+    bool IsKill = !IsRAAndRetAddrIsTaken;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.storeRegToStackSlot(*EntryBlock, MI, Reg, IsKill,
+                            CSI[i].getFrameIdx(), RC, TRI);
+  }
+
+  return true;
+}
+
+bool
+MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Reserve call frame if the size of the maximum call frame fits into 16-bit
+  // immediate field and there are no variable sized objects on the stack.
+  // Make sure the second register scavenger spill slot can be accessed with one
+  // instruction.
+  return isInt<16>(MFI->getMaxCallFrameSize() + getStackAlignment()) &&
+    !MFI->hasVarSizedObjects();
+}
+
+// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions
+void MipsSEFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  if (!hasReservedCallFrame(MF)) {
+    int64_t Amount = I->getOperand(0).getImm();
+
+    if (I->getOpcode() == Mips::ADJCALLSTACKDOWN)
+      Amount = -Amount;
+
+    unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
+    TII.adjustStackPtr(SP, Amount, MBB, I);
+  }
+
+  MBB.erase(I);
+}
+
+void MipsSEFrameLowering::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
+
+  // Mark $fp as used if function has dedicated frame pointer.
+  if (hasFP(MF))
+    MRI.setPhysRegUsed(FP);
+
+  // Create spill slots for eh data registers if function calls eh_return.
+  if (MipsFI->callsEhReturn())
+    MipsFI->createEhDataRegsFI();
+
+  // Expand pseudo instructions which load, store or copy accumulators.
+  // Add an emergency spill slot if a pseudo was expanded.
+  if (ExpandACCPseudo(MF).expand()) {
+    // The spill slot should be half the size of the accumulator. If target is
+    // mips64, it should be 64-bit, otherwise it should be 32-bt.
+    const TargetRegisterClass *RC = STI.hasMips64() ?
+      &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+    int FI = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
+                                                  RC->getAlignment(), false);
+    RS->addScavengingFrameIndex(FI);
+  }
+
+  // Set scavenging frame index if necessary.
+  uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() +
+    estimateStackSize(MF);
+
+  if (isInt<16>(MaxSPOffset))
+    return;
+
+  const TargetRegisterClass *RC = STI.isABI_N64() ?
+    &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+  int FI = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
+                                                RC->getAlignment(), false);
+  RS->addScavengingFrameIndex(FI);
+}
+
+const MipsFrameLowering *
+llvm::createMipsSEFrameLowering(const MipsSubtarget &ST) {
+  return new MipsSEFrameLowering(ST);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
new file mode 100644
index 000000000000..193a66cc65a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
@@ -0,0 +1,49 @@
+//===-- MipsSEFrameLowering.h - Mips32/64 frame lowering --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSE_FRAMEINFO_H
+#define MIPSSE_FRAMEINFO_H
+
+#include "MipsFrameLowering.h"
+
+namespace llvm {
+
+class MipsSEFrameLowering : public MipsFrameLowering {
+public:
+  explicit MipsSEFrameLowering(const MipsSubtarget &STI)
+    : MipsFrameLowering(STI, STI.hasMips64() ? 16 : 8) {}
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const;
+
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const;
+  unsigned ehDataReg(unsigned I) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
new file mode 100644
index 000000000000..d6d220750c61
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -0,0 +1,473 @@
+//===-- MipsSEISelDAGToDAG.cpp - A Dag to Dag Inst Selector for MipsSE ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-isel"
+#include "MipsSEISelDAGToDAG.h"
+#include "Mips.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+
+bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
+                                                const MachineInstr& MI) {
+  unsigned DstReg = 0, ZeroReg = 0;
+
+  // Check if MI is "addiu $dst, $zero, 0" or "daddiu $dst, $zero, 0".
+  if ((MI.getOpcode() == Mips::ADDiu) &&
+      (MI.getOperand(1).getReg() == Mips::ZERO) &&
+      (MI.getOperand(2).getImm() == 0)) {
+    DstReg = MI.getOperand(0).getReg();
+    ZeroReg = Mips::ZERO;
+  } else if ((MI.getOpcode() == Mips::DADDiu) &&
+             (MI.getOperand(1).getReg() == Mips::ZERO_64) &&
+             (MI.getOperand(2).getImm() == 0)) {
+    DstReg = MI.getOperand(0).getReg();
+    ZeroReg = Mips::ZERO_64;
+  }
+
+  if (!DstReg)
+    return false;
+
+  // Replace uses with ZeroReg.
+  for (MachineRegisterInfo::use_iterator U = MRI->use_begin(DstReg),
+       E = MRI->use_end(); U != E;) {
+    MachineOperand &MO = U.getOperand();
+    unsigned OpNo = U.getOperandNo();
+    MachineInstr *MI = MO.getParent();
+    ++U;
+
+    // Do not replace if it is a phi's operand or is tied to def operand.
+    if (MI->isPHI() || MI->isRegTiedToDefOperand(OpNo) || MI->isPseudo())
+      continue;
+
+    MO.setReg(ZeroReg);
+  }
+
+  return true;
+}
+
+void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  if (!MipsFI->globalBaseRegSet())
+    return;
+
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator I = MBB.begin();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+  const TargetRegisterClass *RC;
+
+  if (Subtarget.isABI_N64())
+    RC = (const TargetRegisterClass*)&Mips::CPU64RegsRegClass;
+  else
+    RC = (const TargetRegisterClass*)&Mips::CPURegsRegClass;
+
+  V0 = RegInfo.createVirtualRegister(RC);
+  V1 = RegInfo.createVirtualRegister(RC);
+
+  if (Subtarget.isABI_N64()) {
+    MF.getRegInfo().addLiveIn(Mips::T9_64);
+    MBB.addLiveIn(Mips::T9_64);
+
+    // lui $v0, %hi(%neg(%gp_rel(fname)))
+    // daddu $v1, $v0, $t9
+    // daddiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+    const GlobalValue *FName = MF.getFunction();
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi64), V0)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::DADDu), V1).addReg(V0)
+      .addReg(Mips::T9_64);
+    BuildMI(MBB, I, DL, TII.get(Mips::DADDiu), GlobalBaseReg).addReg(V1)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+    return;
+  }
+
+  if (MF.getTarget().getRelocationModel() == Reloc::Static) {
+    // Set global register to __gnu_local_gp.
+    //
+    // lui   $v0, %hi(__gnu_local_gp)
+    // addiu $globalbasereg, $v0, %lo(__gnu_local_gp)
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V0)
+      .addExternalSymbol("__gnu_local_gp", MipsII::MO_ABS_LO);
+    return;
+  }
+
+  MF.getRegInfo().addLiveIn(Mips::T9);
+  MBB.addLiveIn(Mips::T9);
+
+  if (Subtarget.isABI_N32()) {
+    // lui $v0, %hi(%neg(%gp_rel(fname)))
+    // addu $v1, $v0, $t9
+    // addiu $globalbasereg, $v1, %lo(%neg(%gp_rel(fname)))
+    const GlobalValue *FName = MF.getFunction();
+    BuildMI(MBB, I, DL, TII.get(Mips::LUi), V0)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_HI);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDu), V1).addReg(V0).addReg(Mips::T9);
+    BuildMI(MBB, I, DL, TII.get(Mips::ADDiu), GlobalBaseReg).addReg(V1)
+      .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO);
+    return;
+  }
+
+  assert(Subtarget.isABI_O32());
+
+  // For O32 ABI, the following instruction sequence is emitted to initialize
+  // the global base register:
+  //
+  //  0. lui   $2, %hi(_gp_disp)
+  //  1. addiu $2, $2, %lo(_gp_disp)
+  //  2. addu  $globalbasereg, $2, $t9
+  //
+  // We emit only the last instruction here.
+  //
+  // GNU linker requires that the first two instructions appear at the beginning
+  // of a function and no instructions be inserted before or between them.
+  // The two instructions are emitted during lowering to MC layer in order to
+  // avoid any reordering.
+  //
+  // Register $2 (Mips::V0) is added to the list of live-in registers to ensure
+  // the value instruction 1 (addiu) defines is valid when instruction 2 (addu)
+  // reads it.
+  MF.getRegInfo().addLiveIn(Mips::V0);
+  MBB.addLiveIn(Mips::V0);
+  BuildMI(MBB, I, DL, TII.get(Mips::ADDu), GlobalBaseReg)
+    .addReg(Mips::V0).addReg(Mips::T9);
+}
+
+void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
+  initGlobalBaseReg(MF);
+
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+
+  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); MFI != MFE;
+       ++MFI)
+    for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I)
+      replaceUsesWithZeroReg(MRI, *I);
+}
+
+/// Select multiply instructions.
+std::pair<SDNode*, SDNode*>
+MipsSEDAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, DebugLoc DL, EVT Ty,
+                               bool HasLo, bool HasHi) {
+  SDNode *Lo = 0, *Hi = 0;
+  SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
+                                       N->getOperand(1));
+  SDValue InFlag = SDValue(Mul, 0);
+
+  if (HasLo) {
+    unsigned Opcode = (Ty == MVT::i32 ? Mips::MFLO : Mips::MFLO64);
+    Lo = CurDAG->getMachineNode(Opcode, DL, Ty, MVT::Glue, InFlag);
+    InFlag = SDValue(Lo, 1);
+  }
+  if (HasHi) {
+    unsigned Opcode = (Ty == MVT::i32 ? Mips::MFHI : Mips::MFHI64);
+    Hi = CurDAG->getMachineNode(Opcode, DL, Ty, InFlag);
+  }
+  return std::make_pair(Lo, Hi);
+}
+
+SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
+                                           SDValue CmpLHS, DebugLoc DL,
+                                           SDNode *Node) const {
+  unsigned Opc = InFlag.getOpcode(); (void)Opc;
+
+  assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
+          (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
+         "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
+
+  SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+  SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
+  EVT VT = LHS.getValueType();
+
+  SDNode *Carry = CurDAG->getMachineNode(Mips::SLTu, DL, VT, Ops, 2);
+  SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, DL, VT,
+                                            SDValue(Carry, 0), RHS);
+  return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS,
+                              SDValue(AddCarry, 0));
+}
+
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) const {
+  EVT ValTy = Addr.getValueType();
+
+  // if Address is FI, get the TargetFrameIndex.
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+    Offset = CurDAG->getTargetConstant(0, ValTy);
+    return true;
+  }
+
+  // on PIC code Load GA
+  if (Addr.getOpcode() == MipsISD::Wrapper) {
+    Base   = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+
+  if (TM.getRelocationModel() != Reloc::PIC_) {
+    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+        Addr.getOpcode() == ISD::TargetGlobalAddress))
+      return false;
+  }
+
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<16>(CN->getSExtValue())) {
+
+      // If the first operand is a FI, get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+      return true;
+    }
+  }
+
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    // When loading from constant pools, load the lower address part in
+    // the instruction itself. Example, instead of:
+    //  lui $2, %hi($CPI1_0)
+    //  addiu $2, $2, %lo($CPI1_0)
+    //  lwc1 $f0, 0($2)
+    // Generate:
+    //  lui $2, %hi($CPI1_0)
+    //  lwc1 $f0, %lo($CPI1_0)($2)
+    if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+        Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+      SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+      if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+          isa<JumpTableSDNode>(Opnd0)) {
+        Base = Addr.getOperand(0);
+        Offset = Opnd0;
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) const {
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, Addr.getValueType());
+  return true;
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  return selectAddrRegImm(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
+  unsigned Opcode = Node->getOpcode();
+  DebugLoc DL = Node->getDebugLoc();
+
+  ///
+  // Instruction Selection not handled by the auto-generated
+  // tablegen selection should be handled here.
+  ///
+  EVT NodeTy = Node->getValueType(0);
+  SDNode *Result;
+  unsigned MultOpc;
+
+  switch(Opcode) {
+  default: break;
+
+  case ISD::SUBE: {
+    SDValue InFlag = Node->getOperand(2);
+    Result = selectAddESubE(Mips::SUBu, InFlag, InFlag.getOperand(0), DL, Node);
+    return std::make_pair(true, Result);
+  }
+
+  case ISD::ADDE: {
+    SDValue InFlag = Node->getOperand(2);
+    Result = selectAddESubE(Mips::ADDu, InFlag, InFlag.getValue(0), DL, Node);
+    return std::make_pair(true, Result);
+  }
+
+  /// Mul with two results
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    if (NodeTy == MVT::i32)
+      MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT);
+    else
+      MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::DMULTu : Mips::DMULT);
+
+    std::pair<SDNode*, SDNode*> LoHi = selectMULT(Node, MultOpc, DL, NodeTy,
+                                                  true, true);
+
+    if (!SDValue(Node, 0).use_empty())
+      ReplaceUses(SDValue(Node, 0), SDValue(LoHi.first, 0));
+
+    if (!SDValue(Node, 1).use_empty())
+      ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
+
+    return std::make_pair(true, (SDNode*)NULL);
+  }
+
+  /// Special Muls
+  case ISD::MUL: {
+    // Mips32 has a 32-bit three operand mul instruction.
+    if (Subtarget.hasMips32() && NodeTy == MVT::i32)
+      break;
+    MultOpc = NodeTy == MVT::i32 ? Mips::MULT : Mips::DMULT;
+    Result = selectMULT(Node, MultOpc, DL, NodeTy, true, false).first;
+    return std::make_pair(true, Result);
+  }
+  case ISD::MULHS:
+  case ISD::MULHU: {
+    if (NodeTy == MVT::i32)
+      MultOpc = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
+    else
+      MultOpc = (Opcode == ISD::MULHU ? Mips::DMULTu : Mips::DMULT);
+
+    Result = selectMULT(Node, MultOpc, DL, NodeTy, false, true).second;
+    return std::make_pair(true, Result);
+  }
+
+  case ISD::ConstantFP: {
+    ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
+    if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
+      if (Subtarget.hasMips64()) {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO_64, MVT::i64);
+        Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
+      } else {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO, MVT::i32);
+        Result = CurDAG->getMachineNode(Mips::BuildPairF64, DL, MVT::f64, Zero,
+                                        Zero);
+      }
+
+      return std::make_pair(true, Result);
+    }
+    break;
+  }
+
+  case ISD::Constant: {
+    const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Node);
+    unsigned Size = CN->getValueSizeInBits(0);
+
+    if (Size == 32)
+      break;
+
+    MipsAnalyzeImmediate AnalyzeImm;
+    int64_t Imm = CN->getSExtValue();
+
+    const MipsAnalyzeImmediate::InstSeq &Seq =
+      AnalyzeImm.Analyze(Imm, Size, false);
+
+    MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+    DebugLoc DL = CN->getDebugLoc();
+    SDNode *RegOpnd;
+    SDValue ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
+                                                MVT::i64);
+
+    // The first instruction can be a LUi which is different from other
+    // instructions (ADDiu, ORI and SLL) in that it does not have a register
+    // operand.
+    if (Inst->Opc == Mips::LUi64)
+      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64, ImmOpnd);
+    else
+      RegOpnd =
+        CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+                               CurDAG->getRegister(Mips::ZERO_64, MVT::i64),
+                               ImmOpnd);
+
+    // The remaining instructions in the sequence are handled here.
+    for (++Inst; Inst != Seq.end(); ++Inst) {
+      ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
+                                          MVT::i64);
+      RegOpnd = CurDAG->getMachineNode(Inst->Opc, DL, MVT::i64,
+                                       SDValue(RegOpnd, 0), ImmOpnd);
+    }
+
+    return std::make_pair(true, RegOpnd);
+  }
+
+  case MipsISD::ThreadPointer: {
+    EVT PtrVT = TLI.getPointerTy();
+    unsigned RdhwrOpc, SrcReg, DestReg;
+
+    if (PtrVT == MVT::i32) {
+      RdhwrOpc = Mips::RDHWR;
+      SrcReg = Mips::HWR29;
+      DestReg = Mips::V1;
+    } else {
+      RdhwrOpc = Mips::RDHWR64;
+      SrcReg = Mips::HWR29_64;
+      DestReg = Mips::V1_64;
+    }
+
+    SDNode *Rdhwr =
+      CurDAG->getMachineNode(RdhwrOpc, Node->getDebugLoc(),
+                             Node->getValueType(0),
+                             CurDAG->getRegister(SrcReg, PtrVT));
+    SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
+                                         SDValue(Rdhwr, 0));
+    SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
+    ReplaceUses(SDValue(Node, 0), ResNode);
+    return std::make_pair(true, ResNode.getNode());
+  }
+
+  case MipsISD::InsertLOHI: {
+    unsigned RCID = Subtarget.hasDSP() ? Mips::ACRegsDSPRegClassID :
+                                         Mips::ACRegsRegClassID;
+    SDValue RegClass = CurDAG->getTargetConstant(RCID, MVT::i32);
+    SDValue LoIdx = CurDAG->getTargetConstant(Mips::sub_lo, MVT::i32);
+    SDValue HiIdx = CurDAG->getTargetConstant(Mips::sub_hi, MVT::i32);
+    const SDValue Ops[] = { RegClass, Node->getOperand(0), LoIdx,
+                            Node->getOperand(1), HiIdx };
+    SDNode *Res = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
+                                         MVT::Untyped, Ops, 5);
+    return std::make_pair(true, Res);
+  }
+  }
+
+  return std::make_pair(false, (SDNode*)NULL);
+}
+
+FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM) {
+  return new MipsSEDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
new file mode 100644
index 000000000000..6137ab040bbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -0,0 +1,57 @@
+//===-- MipsSEISelDAGToDAG.h - A Dag to Dag Inst Selector for MipsSE -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsDAGToDAGISel specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEISELDAGTODAG_H
+#define MIPSSEISELDAGTODAG_H
+
+#include "MipsISelDAGToDAG.h"
+
+namespace llvm {
+
+class MipsSEDAGToDAGISel : public MipsDAGToDAGISel {
+
+public:
+  explicit MipsSEDAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
+
+private:
+  bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
+
+  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, DebugLoc dl,
+                                         EVT Ty, bool HasLo, bool HasHi);
+
+  SDNode *selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
+                         DebugLoc DL, SDNode *Node) const;
+
+  virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
+  virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
+                                 SDValue &Offset) const;
+
+  virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
+                             SDValue &Offset) const;
+
+  virtual std::pair<bool, SDNode*> selectNode(SDNode *Node);
+
+  virtual void processFunctionAfterISel(MachineFunction &MF);
+
+  // Insert instructions to initialize the global base register in the
+  // first MBB of the function.
+  void initGlobalBaseReg(MachineFunction &MF);
+};
+
+FunctionPass *createMipsSEISelDag(MipsTargetMachine &TM);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
new file mode 100644
index 000000000000..4f219218d31f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -0,0 +1,442 @@
+//===-- MipsSEISelLowering.cpp - MipsSE DAG Lowering Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsSEISelLowering.h"
+#include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden,
+                    cl::desc("MIPS: Enable tail calls."), cl::init(false));
+
+MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
+  : MipsTargetLowering(TM) {
+  // Set up the register classes
+  addRegisterClass(MVT::i32, &Mips::CPURegsRegClass);
+
+  if (HasMips64)
+    addRegisterClass(MVT::i64, &Mips::CPU64RegsRegClass);
+
+  if (Subtarget->hasDSP()) {
+    MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
+
+    for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
+      addRegisterClass(VecTys[i], &Mips::DSPRegsRegClass);
+
+      // Expand all builtin opcodes.
+      for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+        setOperationAction(Opc, VecTys[i], Expand);
+
+      setOperationAction(ISD::LOAD, VecTys[i], Legal);
+      setOperationAction(ISD::STORE, VecTys[i], Legal);
+      setOperationAction(ISD::BITCAST, VecTys[i], Legal);
+    }
+  }
+
+  if (!TM.Options.UseSoftFloat) {
+    addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
+
+    // When dealing with single precision only, use libcalls
+    if (!Subtarget->isSingleFloat()) {
+      if (HasMips64)
+        addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
+      else
+        addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
+    }
+  }
+
+  setOperationAction(ISD::SMUL_LOHI,          MVT::i32, Custom);
+  setOperationAction(ISD::UMUL_LOHI,          MVT::i32, Custom);
+  setOperationAction(ISD::MULHS,              MVT::i32, Custom);
+  setOperationAction(ISD::MULHU,              MVT::i32, Custom);
+
+  if (HasMips64)
+    setOperationAction(ISD::MUL,              MVT::i64, Custom);
+
+  setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
+  setOperationAction(ISD::MEMBARRIER,         MVT::Other, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Custom);
+  setOperationAction(ISD::LOAD,               MVT::i32, Custom);
+  setOperationAction(ISD::STORE,              MVT::i32, Custom);
+
+  setTargetDAGCombine(ISD::ADDE);
+  setTargetDAGCombine(ISD::SUBE);
+
+  computeRegisterProperties();
+}
+
+const MipsTargetLowering *
+llvm::createMipsSETargetLowering(MipsTargetMachine &TM) {
+  return new MipsSETargetLowering(TM);
+}
+
+
+bool
+MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+  MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
+
+  switch (SVT) {
+  case MVT::i64:
+  case MVT::i32:
+    if (Fast)
+      *Fast = true;
+    return true;
+  default:
+    return false;
+  }
+}
+
+SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  switch(Op.getOpcode()) {
+  case ISD::SMUL_LOHI: return lowerMulDiv(Op, MipsISD::Mult, true, true, DAG);
+  case ISD::UMUL_LOHI: return lowerMulDiv(Op, MipsISD::Multu, true, true, DAG);
+  case ISD::MULHS:     return lowerMulDiv(Op, MipsISD::Mult, false, true, DAG);
+  case ISD::MULHU:     return lowerMulDiv(Op, MipsISD::Multu, false, true, DAG);
+  case ISD::MUL:       return lowerMulDiv(Op, MipsISD::Mult, true, false, DAG);
+  case ISD::SDIVREM:   return lowerMulDiv(Op, MipsISD::DivRem, true, true, DAG);
+  case ISD::UDIVREM:   return lowerMulDiv(Op, MipsISD::DivRemU, true, true, DAG);
+  }
+
+  return MipsTargetLowering::LowerOperation(Op, DAG);
+}
+
+// selectMADD -
+// Transforms a subgraph in CurDAG if the following pattern is found:
+//  (addc multLo, Lo0), (adde multHi, Hi0),
+// where,
+//  multHi/Lo: product of multiplication
+//  Lo0: initial value of Lo register
+//  Hi0: initial value of Hi register
+// Return true if pattern matching was successful.
+static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
+  // ADDENode's second operand must be a flag output of an ADDC node in order
+  // for the matching to be successful.
+  SDNode *ADDCNode = ADDENode->getOperand(2).getNode();
+
+  if (ADDCNode->getOpcode() != ISD::ADDC)
+    return false;
+
+  SDValue MultHi = ADDENode->getOperand(0);
+  SDValue MultLo = ADDCNode->getOperand(0);
+  SDNode *MultNode = MultHi.getNode();
+  unsigned MultOpc = MultHi.getOpcode();
+
+  // MultHi and MultLo must be generated by the same node,
+  if (MultLo.getNode() != MultNode)
+    return false;
+
+  // and it must be a multiplication.
+  if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
+    return false;
+
+  // MultLo amd MultHi must be the first and second output of MultNode
+  // respectively.
+  if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
+    return false;
+
+  // Transform this to a MADD only if ADDENode and ADDCNode are the only users
+  // of the values of MultNode, in which case MultNode will be removed in later
+  // phases.
+  // If there exist users other than ADDENode or ADDCNode, this function returns
+  // here, which will result in MultNode being mapped to a single MULT
+  // instruction node rather than a pair of MULT and MADD instructions being
+  // produced.
+  if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
+    return false;
+
+  DebugLoc DL = ADDENode->getDebugLoc();
+
+  // Initialize accumulator.
+  SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
+                                  ADDCNode->getOperand(1),
+                                  ADDENode->getOperand(1));
+
+  // create MipsMAdd(u) node
+  MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MAddu : MipsISD::MAdd;
+
+  SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped,
+                                 MultNode->getOperand(0),// Factor 0
+                                 MultNode->getOperand(1),// Factor 1
+                                 ACCIn);
+
+  // replace uses of adde and addc here
+  if (!SDValue(ADDCNode, 0).use_empty()) {
+    SDValue LoIdx = CurDAG->getConstant(Mips::sub_lo, MVT::i32);
+    SDValue LoOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MAdd,
+                                    LoIdx);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut);
+  }
+  if (!SDValue(ADDENode, 0).use_empty()) {
+    SDValue HiIdx = CurDAG->getConstant(Mips::sub_hi, MVT::i32);
+    SDValue HiOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MAdd,
+                                    HiIdx);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut);
+  }
+
+  return true;
+}
+
+// selectMSUB -
+// Transforms a subgraph in CurDAG if the following pattern is found:
+//  (addc Lo0, multLo), (sube Hi0, multHi),
+// where,
+//  multHi/Lo: product of multiplication
+//  Lo0: initial value of Lo register
+//  Hi0: initial value of Hi register
+// Return true if pattern matching was successful.
+static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
+  // SUBENode's second operand must be a flag output of an SUBC node in order
+  // for the matching to be successful.
+  SDNode *SUBCNode = SUBENode->getOperand(2).getNode();
+
+  if (SUBCNode->getOpcode() != ISD::SUBC)
+    return false;
+
+  SDValue MultHi = SUBENode->getOperand(1);
+  SDValue MultLo = SUBCNode->getOperand(1);
+  SDNode *MultNode = MultHi.getNode();
+  unsigned MultOpc = MultHi.getOpcode();
+
+  // MultHi and MultLo must be generated by the same node,
+  if (MultLo.getNode() != MultNode)
+    return false;
+
+  // and it must be a multiplication.
+  if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
+    return false;
+
+  // MultLo amd MultHi must be the first and second output of MultNode
+  // respectively.
+  if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
+    return false;
+
+  // Transform this to a MSUB only if SUBENode and SUBCNode are the only users
+  // of the values of MultNode, in which case MultNode will be removed in later
+  // phases.
+  // If there exist users other than SUBENode or SUBCNode, this function returns
+  // here, which will result in MultNode being mapped to a single MULT
+  // instruction node rather than a pair of MULT and MSUB instructions being
+  // produced.
+  if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
+    return false;
+
+  DebugLoc DL = SUBENode->getDebugLoc();
+
+  // Initialize accumulator.
+  SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
+                                  SUBCNode->getOperand(0),
+                                  SUBENode->getOperand(0));
+
+  // create MipsSub(u) node
+  MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub;
+
+  SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue,
+                                 MultNode->getOperand(0),// Factor 0
+                                 MultNode->getOperand(1),// Factor 1
+                                 ACCIn);
+
+  // replace uses of sube and subc here
+  if (!SDValue(SUBCNode, 0).use_empty()) {
+    SDValue LoIdx = CurDAG->getConstant(Mips::sub_lo, MVT::i32);
+    SDValue LoOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MSub,
+                                    LoIdx);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
+  }
+  if (!SDValue(SUBENode, 0).use_empty()) {
+    SDValue HiIdx = CurDAG->getConstant(Mips::sub_hi, MVT::i32);
+    SDValue HiOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MSub,
+                                    HiIdx);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
+  }
+
+  return true;
+}
+
+static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const MipsSubtarget *Subtarget) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  if (Subtarget->hasMips32() && N->getValueType(0) == MVT::i32 &&
+      selectMADD(N, &DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const MipsSubtarget *Subtarget) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  if (Subtarget->hasMips32() && N->getValueType(0) == MVT::i32 &&
+      selectMSUB(N, &DAG))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+SDValue
+MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  switch (N->getOpcode()) {
+  case ISD::ADDE:
+    return performADDECombine(N, DAG, DCI, Subtarget);
+  case ISD::SUBE:
+    return performSUBECombine(N, DAG, DCI, Subtarget);
+  default:
+    return MipsTargetLowering::PerformDAGCombine(N, DCI);
+  }
+}
+
+MachineBasicBlock *
+MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default:
+    return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case Mips::BPOSGE32_PSEUDO:
+    return emitBPOSGE32(MI, BB);
+  }
+}
+
+bool MipsSETargetLowering::
+isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                  unsigned NextStackOffset,
+                                  const MipsFunctionInfo& FI) const {
+  if (!EnableMipsTailCalls)
+    return false;
+
+  // Return false if either the callee or caller has a byval argument.
+  if (MipsCCInfo.hasByValArg() || FI.hasByvalArg())
+    return false;
+
+  // Return true if the callee's argument area is no larger than the
+  // caller's.
+  return NextStackOffset <= FI.getIncomingArgSize();
+}
+
+void MipsSETargetLowering::
+getOpndList(SmallVectorImpl<SDValue> &Ops,
+            std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+            bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+            CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
+  // T9 should contain the address of the callee function if
+  // -reloction-model=pic or it is an indirect call.
+  if (IsPICCall || !GlobalOrExternal) {
+    unsigned T9Reg = IsN64 ? Mips::T9_64 : Mips::T9;
+    RegsToPass.push_front(std::make_pair(T9Reg, Callee));
+  } else
+    Ops.push_back(Callee);
+
+  MipsTargetLowering::getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal,
+                                  InternalLinkage, CLI, Callee, Chain);
+}
+
+SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
+                                          bool HasLo, bool HasHi,
+                                          SelectionDAG &DAG) const {
+  EVT Ty = Op.getOperand(0).getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
+                             Op.getOperand(0), Op.getOperand(1));
+  SDValue Lo, Hi;
+
+  if (HasLo)
+    Lo = DAG.getNode(MipsISD::ExtractLOHI, DL, Ty, Mult,
+                     DAG.getConstant(Mips::sub_lo, MVT::i32));
+  if (HasHi)
+    Hi = DAG.getNode(MipsISD::ExtractLOHI, DL, Ty, Mult,
+                     DAG.getConstant(Mips::sub_hi, MVT::i32));
+
+  if (!HasLo || !HasHi)
+    return HasLo ? Lo : Hi;
+
+  SDValue Vals[] = { Lo, Hi };
+  return DAG.getMergeValues(Vals, 2, DL);
+}
+
+MachineBasicBlock * MipsSETargetLowering::
+emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
+  // $bb:
+  //  bposge32_pseudo $vr0
+  //  =>
+  // $bb:
+  //  bposge32 $tbb
+  // $fbb:
+  //  li $vr2, 0
+  //  b $sink
+  // $tbb:
+  //  li $vr1, 1
+  // $sink:
+  //  $vr0 = phi($vr2, $fbb, $vr1, $tbb)
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetRegisterClass *RC = &Mips::CPURegsRegClass;
+  DebugLoc DL = MI->getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Sink  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, FBB);
+  F->insert(It, TBB);
+  F->insert(It, Sink);
+
+  // Transfer the remainder of BB and its successor edges to Sink.
+  Sink->splice(Sink->begin(), BB, llvm::next(MachineBasicBlock::iterator(MI)),
+               BB->end());
+  Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add successors.
+  BB->addSuccessor(FBB);
+  BB->addSuccessor(TBB);
+  FBB->addSuccessor(Sink);
+  TBB->addSuccessor(Sink);
+
+  // Insert the real bposge32 instruction to $BB.
+  BuildMI(BB, DL, TII->get(Mips::BPOSGE32)).addMBB(TBB);
+
+  // Fill $FBB.
+  unsigned VR2 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2)
+    .addReg(Mips::ZERO).addImm(0);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+  // Fill $TBB.
+  unsigned VR1 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1)
+    .addReg(Mips::ZERO).addImm(1);
+
+  // Insert phi function to $Sink.
+  BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(VR2).addMBB(FBB).addReg(VR1).addMBB(TBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return Sink;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
new file mode 100644
index 000000000000..186f6a343dee
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.h
@@ -0,0 +1,62 @@
+//===-- MipsSEISelLowering.h - MipsSE DAG Lowering Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Subclass of MipsTargetLowering specialized for mips32/64.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MipsSEISELLOWERING_H
+#define MipsSEISELLOWERING_H
+
+#include "MipsISelLowering.h"
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+  class MipsSETargetLowering : public MipsTargetLowering  {
+  public:
+    explicit MipsSETargetLowering(MipsTargetMachine &TM);
+
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    virtual MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+
+    virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const {
+      if (VT == MVT::Untyped)
+        return Subtarget->hasDSP() ? &Mips::ACRegsDSPRegClass :
+                                     &Mips::ACRegsRegClass;
+
+      return TargetLowering::getRepRegClassFor(VT);
+    }
+
+  private:
+    virtual bool
+    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                      unsigned NextStackOffset,
+                                      const MipsFunctionInfo& FI) const;
+
+    virtual void
+    getOpndList(SmallVectorImpl<SDValue> &Ops,
+                std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
+                bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
+                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+
+    SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
+                        SelectionDAG &DAG) const;
+
+    MachineBasicBlock *emitBPOSGE32(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const;
+  };
+}
+
+#endif // MipsSEISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
new file mode 100644
index 000000000000..ca0315ed9f6e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -0,0 +1,410 @@
+//===-- MipsSEInstrInfo.cpp - Mips32/64 Instruction Information -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSEInstrInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+MipsSEInstrInfo::MipsSEInstrInfo(MipsTargetMachine &tm)
+  : MipsInstrInfo(tm,
+                  tm.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J),
+    RI(*tm.getSubtargetImpl(), *this),
+    IsN64(tm.getSubtarget<MipsSubtarget>().isABI_N64()) {}
+
+const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned MipsSEInstrInfo::
+isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+  unsigned Opc = MI->getOpcode();
+
+  if ((Opc == Mips::LW)    || (Opc == Mips::LW_P8)  || (Opc == Mips::LD) ||
+      (Opc == Mips::LD_P8) || (Opc == Mips::LWC1)   || (Opc == Mips::LWC1_P8) ||
+      (Opc == Mips::LDC1)  || (Opc == Mips::LDC164) ||
+      (Opc == Mips::LDC164_P8)) {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2)))) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned MipsSEInstrInfo::
+isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
+{
+  unsigned Opc = MI->getOpcode();
+
+  if ((Opc == Mips::SW)    || (Opc == Mips::SW_P8)  || (Opc == Mips::SD) ||
+      (Opc == Mips::SD_P8) || (Opc == Mips::SWC1)   || (Opc == Mips::SWC1_P8) ||
+      (Opc == Mips::SDC1)  || (Opc == Mips::SDC164) ||
+      (Opc == Mips::SDC164_P8)) {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2)))) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I, DebugLoc DL,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  bool KillSrc) const {
+  unsigned Opc = 0, ZeroReg = 0;
+
+  if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg.
+    if (Mips::CPURegsRegClass.contains(SrcReg))
+      Opc = Mips::OR, ZeroReg = Mips::ZERO;
+    else if (Mips::CCRRegClass.contains(SrcReg))
+      Opc = Mips::CFC1;
+    else if (Mips::FGR32RegClass.contains(SrcReg))
+      Opc = Mips::MFC1;
+    else if (SrcReg == Mips::HI)
+      Opc = Mips::MFHI, SrcReg = 0;
+    else if (SrcReg == Mips::LO)
+      Opc = Mips::MFLO, SrcReg = 0;
+  }
+  else if (Mips::CPURegsRegClass.contains(SrcReg)) { // Copy from CPU Reg.
+    if (Mips::CCRRegClass.contains(DestReg))
+      Opc = Mips::CTC1;
+    else if (Mips::FGR32RegClass.contains(DestReg))
+      Opc = Mips::MTC1;
+    else if (DestReg == Mips::HI)
+      Opc = Mips::MTHI, DestReg = 0;
+    else if (DestReg == Mips::LO)
+      Opc = Mips::MTLO, DestReg = 0;
+  }
+  else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_S;
+  else if (Mips::AFGR64RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_D32;
+  else if (Mips::FGR64RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::FMOV_D64;
+  else if (Mips::CCRRegClass.contains(DestReg, SrcReg))
+    Opc = Mips::MOVCCRToCCR;
+  else if (Mips::CPU64RegsRegClass.contains(DestReg)) { // Copy to CPU64 Reg.
+    if (Mips::CPU64RegsRegClass.contains(SrcReg))
+      Opc = Mips::OR64, ZeroReg = Mips::ZERO_64;
+    else if (SrcReg == Mips::HI64)
+      Opc = Mips::MFHI64, SrcReg = 0;
+    else if (SrcReg == Mips::LO64)
+      Opc = Mips::MFLO64, SrcReg = 0;
+    else if (Mips::FGR64RegClass.contains(SrcReg))
+      Opc = Mips::DMFC1;
+  }
+  else if (Mips::CPU64RegsRegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
+    if (DestReg == Mips::HI64)
+      Opc = Mips::MTHI64, DestReg = 0;
+    else if (DestReg == Mips::LO64)
+      Opc = Mips::MTLO64, DestReg = 0;
+    else if (Mips::FGR64RegClass.contains(DestReg))
+      Opc = Mips::DMTC1;
+  }
+  else if (Mips::ACRegsRegClass.contains(DestReg, SrcReg))
+    Opc = Mips::COPY_AC64;
+  else if (Mips::ACRegsDSPRegClass.contains(DestReg, SrcReg))
+    Opc = Mips::COPY_AC_DSP;
+  else if (Mips::ACRegs128RegClass.contains(DestReg, SrcReg))
+    Opc = Mips::COPY_AC128;
+
+  assert(Opc && "Cannot copy registers");
+
+  MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+  if (DestReg)
+    MIB.addReg(DestReg, RegState::Define);
+
+  if (SrcReg)
+    MIB.addReg(SrcReg, getKillRegState(KillSrc));
+
+  if (ZeroReg)
+    MIB.addReg(ZeroReg);
+}
+
+void MipsSEInstrInfo::
+storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                unsigned SrcReg, bool isKill, int FI,
+                const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
+                int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+
+  unsigned Opc = 0;
+
+  if (Mips::CPURegsRegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::SW_P8 : Mips::SW;
+  else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::SD_P8 : Mips::SD;
+  else if (Mips::ACRegsRegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::STORE_AC64_P8 : Mips::STORE_AC64;
+  else if (Mips::ACRegsDSPRegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::STORE_AC_DSP_P8 : Mips::STORE_AC_DSP;
+  else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::STORE_AC128_P8 : Mips::STORE_AC128;
+  else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1;
+  else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SDC1;
+  else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::SDC164_P8 : Mips::SDC164;
+
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+}
+
+void MipsSEInstrInfo::
+loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                 unsigned DestReg, int FI, const TargetRegisterClass *RC,
+                 const TargetRegisterInfo *TRI, int64_t Offset) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+  unsigned Opc = 0;
+
+  if (Mips::CPURegsRegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::LW_P8 : Mips::LW;
+  else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::LD_P8 : Mips::LD;
+  else if (Mips::ACRegsRegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::LOAD_AC64_P8 : Mips::LOAD_AC64;
+  else if (Mips::ACRegsDSPRegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::LOAD_AC_DSP_P8 : Mips::LOAD_AC_DSP;
+  else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::LOAD_AC128_P8 : Mips::LOAD_AC128;
+  else if (Mips::FGR32RegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1;
+  else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LDC1;
+  else if (Mips::FGR64RegClass.hasSubClassEq(RC))
+    Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164;
+
+  assert(Opc && "Register class not handled!");
+  BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset)
+    .addMemOperand(MMO);
+}
+
+bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  MachineBasicBlock &MBB = *MI->getParent();
+
+  switch(MI->getDesc().getOpcode()) {
+  default:
+    return false;
+  case Mips::RetRA:
+    ExpandRetRA(MBB, MI, Mips::RET);
+    break;
+  case Mips::BuildPairF64:
+    ExpandBuildPairF64(MBB, MI);
+    break;
+  case Mips::ExtractElementF64:
+    ExpandExtractElementF64(MBB, MI);
+    break;
+  case Mips::MIPSeh_return32:
+  case Mips::MIPSeh_return64:
+    ExpandEhReturn(MBB, MI);
+    break;
+  }
+
+  MBB.erase(MI);
+  return true;
+}
+
+/// GetOppositeBranchOpc - Return the inverse of the specified
+/// opcode, e.g. turning BEQ to BNE.
+unsigned MipsSEInstrInfo::GetOppositeBranchOpc(unsigned Opc) const {
+  switch (Opc) {
+  default:           llvm_unreachable("Illegal opcode!");
+  case Mips::BEQ:    return Mips::BNE;
+  case Mips::BNE:    return Mips::BEQ;
+  case Mips::BGTZ:   return Mips::BLEZ;
+  case Mips::BGEZ:   return Mips::BLTZ;
+  case Mips::BLTZ:   return Mips::BGEZ;
+  case Mips::BLEZ:   return Mips::BGTZ;
+  case Mips::BEQ64:  return Mips::BNE64;
+  case Mips::BNE64:  return Mips::BEQ64;
+  case Mips::BGTZ64: return Mips::BLEZ64;
+  case Mips::BGEZ64: return Mips::BLTZ64;
+  case Mips::BLTZ64: return Mips::BGEZ64;
+  case Mips::BLEZ64: return Mips::BGTZ64;
+  case Mips::BC1T:   return Mips::BC1F;
+  case Mips::BC1F:   return Mips::BC1T;
+  }
+}
+
+/// Adjust SP by Amount bytes.
+void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+  DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+  unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+  unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
+
+  if (isInt<16>(Amount))// addi sp, sp, amount
+    BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
+  else { // Expand immediate that doesn't fit in 16-bit.
+    unsigned Reg = loadImmediate(Amount, MBB, I, DL, 0);
+    BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(Reg, RegState::Kill);
+  }
+}
+
+/// This function generates the sequence of instructions needed to get the
+/// result of adding register REG and immediate IMM.
+unsigned
+MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator II, DebugLoc DL,
+                               unsigned *NewImm) const {
+  MipsAnalyzeImmediate AnalyzeImm;
+  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+  unsigned Size = STI.isABI_N64() ? 64 : 32;
+  unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi;
+  unsigned ZEROReg = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+  const TargetRegisterClass *RC = STI.isABI_N64() ?
+    &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+  bool LastInstrIsADDiu = NewImm;
+
+  const MipsAnalyzeImmediate::InstSeq &Seq =
+    AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
+  MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+
+  assert(Seq.size() && (!LastInstrIsADDiu || (Seq.size() > 1)));
+
+  // The first instruction can be a LUi, which is different from other
+  // instructions (ADDiu, ORI and SLL) in that it does not have a register
+  // operand.
+  unsigned Reg = RegInfo.createVirtualRegister(RC);
+
+  if (Inst->Opc == LUi)
+    BuildMI(MBB, II, DL, get(LUi), Reg).addImm(SignExtend64<16>(Inst->ImmOpnd));
+  else
+    BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(ZEROReg)
+      .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+  // Build the remaining instructions in Seq.
+  for (++Inst; Inst != Seq.end() - LastInstrIsADDiu; ++Inst)
+    BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(Reg, RegState::Kill)
+      .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+  if (LastInstrIsADDiu)
+    *NewImm = Inst->ImmOpnd;
+
+  return Reg;
+}
+
+unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
+  return (Opc == Mips::BEQ    || Opc == Mips::BNE    || Opc == Mips::BGTZ   ||
+          Opc == Mips::BGEZ   || Opc == Mips::BLTZ   || Opc == Mips::BLEZ   ||
+          Opc == Mips::BEQ64  || Opc == Mips::BNE64  || Opc == Mips::BGTZ64 ||
+          Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 ||
+          Opc == Mips::BC1T   || Opc == Mips::BC1F   || Opc == Mips::B      ||
+          Opc == Mips::J) ?
+         Opc : 0;
+}
+
+void MipsSEInstrInfo::ExpandRetRA(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I,
+                                unsigned Opc) const {
+  BuildMI(MBB, I, I->getDebugLoc(), get(Opc)).addReg(Mips::RA);
+}
+
+void MipsSEInstrInfo::ExpandExtractElementF64(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I) const {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned SrcReg = I->getOperand(1).getReg();
+  unsigned N = I->getOperand(2).getImm();
+  const MCInstrDesc& Mfc1Tdd = get(Mips::MFC1);
+  DebugLoc dl = I->getDebugLoc();
+
+  assert(N < 2 && "Invalid immediate");
+  unsigned SubIdx = N ? Mips::sub_fpodd : Mips::sub_fpeven;
+  unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
+
+  BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(SubReg);
+}
+
+void MipsSEInstrInfo::ExpandBuildPairF64(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I) const {
+  unsigned DstReg = I->getOperand(0).getReg();
+  unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
+  const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
+  DebugLoc dl = I->getDebugLoc();
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+
+  // mtc1 Lo, $fp
+  // mtc1 Hi, $fp + 1
+  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpeven))
+    .addReg(LoReg);
+  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpodd))
+    .addReg(HiReg);
+}
+
+void MipsSEInstrInfo::ExpandEhReturn(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const {
+  // This pseudo instruction is generated as part of the lowering of
+  // ISD::EH_RETURN. We convert it to a stack increment by OffsetReg, and
+  // indirect jump to TargetReg
+  const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+  unsigned ADDU = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+  unsigned OR = STI.isABI_N64() ? Mips::OR64 : Mips::OR;
+  unsigned JR = STI.isABI_N64() ? Mips::JR64 : Mips::JR;
+  unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
+  unsigned RA = STI.isABI_N64() ? Mips::RA_64 : Mips::RA;
+  unsigned T9 = STI.isABI_N64() ? Mips::T9_64 : Mips::T9;
+  unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+  unsigned OffsetReg = I->getOperand(0).getReg();
+  unsigned TargetReg = I->getOperand(1).getReg();
+
+  // or   $ra, $v0, $zero
+  // addu $sp, $sp, $v1
+  // jr   $ra
+  if (TM.getRelocationModel() == Reloc::PIC_)
+    BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(OR), T9)
+        .addReg(TargetReg).addReg(ZERO);
+  BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(OR), RA)
+      .addReg(TargetReg).addReg(ZERO);
+  BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), SP)
+      .addReg(SP).addReg(OffsetReg);
+  BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(JR)).addReg(RA);
+}
+
+const MipsInstrInfo *llvm::createMipsSEInstrInfo(MipsTargetMachine &TM) {
+  return new MipsSEInstrInfo(TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
new file mode 100644
index 000000000000..0bf7876f0fe0
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h
@@ -0,0 +1,96 @@
+//===-- MipsSEInstrInfo.h - Mips32/64 Instruction Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEINSTRUCTIONINFO_H
+#define MIPSSEINSTRUCTIONINFO_H
+
+#include "MipsInstrInfo.h"
+#include "MipsSERegisterInfo.h"
+
+namespace llvm {
+
+class MipsSEInstrInfo : public MipsInstrInfo {
+  const MipsSERegisterInfo RI;
+  bool IsN64;
+
+public:
+  explicit MipsSEInstrInfo(MipsTargetMachine &TM);
+
+  virtual const MipsRegisterInfo &getRegisterInfo() const;
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  virtual void storeRegToStack(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI,
+                               unsigned SrcReg, bool isKill, int FrameIndex,
+                               const TargetRegisterClass *RC,
+                               const TargetRegisterInfo *TRI,
+                               int64_t Offset) const;
+
+  virtual void loadRegFromStack(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                unsigned DestReg, int FrameIndex,
+                                const TargetRegisterClass *RC,
+                                const TargetRegisterInfo *TRI,
+                                int64_t Offset) const;
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+  virtual unsigned GetOppositeBranchOpc(unsigned Opc) const;
+
+  /// Adjust SP by Amount bytes.
+  void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+
+  /// Emit a series of instructions to load an immediate. If NewImm is a
+  /// non-NULL parameter, the last instruction is not emitted, but instead
+  /// its immediate operand is returned in NewImm.
+  unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator II, DebugLoc DL,
+                         unsigned *NewImm) const;
+
+private:
+  virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
+
+  void ExpandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   unsigned Opc) const;
+  void ExpandExtractElementF64(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I) const;
+  void ExpandBuildPairF64(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I) const;
+  void ExpandEhReturn(MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
new file mode 100644
index 000000000000..96967380b29d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -0,0 +1,134 @@
+//===-- MipsSERegisterInfo.cpp - MIPS32/64 Register Information -== -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSERegisterInfo.h"
+#include "Mips.h"
+#include "MipsAnalyzeImmediate.h"
+#include "MipsMachineFunction.h"
+#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST,
+                                       const MipsSEInstrInfo &I)
+  : MipsRegisterInfo(ST), TII(I) {}
+
+bool MipsSERegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+bool MipsSERegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+const TargetRegisterClass *
+MipsSERegisterInfo::intRegClass(unsigned Size) const {
+  if (Size == 4)
+    return &Mips::CPURegsRegClass;
+
+  assert(Size == 8);
+  return &Mips::CPU64RegsRegClass;
+}
+
+void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
+                                     unsigned OpNo, int FrameIndex,
+                                     uint64_t StackSize,
+                                     int64_t SPOffset) const {
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  int MinCSFI = 0;
+  int MaxCSFI = -1;
+
+  if (CSI.size()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  }
+
+  bool EhDataRegFI = MipsFI->isEhDataRegFI(FrameIndex);
+
+  // The following stack frame objects are always referenced relative to $sp:
+  //  1. Outgoing arguments.
+  //  2. Pointer to dynamically allocated stack space.
+  //  3. Locations for callee-saved registers.
+  //  4. Locations for eh data registers.
+  // Everything else is referenced relative to whatever register
+  // getFrameRegister() returns.
+  unsigned FrameReg;
+
+  if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI)
+    FrameReg = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP;
+  else
+    FrameReg = getFrameRegister(MF);
+
+  // Calculate final offset.
+  // - There is no need to change the offset if the frame object is one of the
+  //   following: an outgoing argument, pointer to a dynamically allocated
+  //   stack space or a $gp restore location,
+  // - If the frame object is any of the following, its offset must be adjusted
+  //   by adding the size of the stack:
+  //   incoming argument, callee-saved register location or local variable.
+  bool IsKill = false;
+  int64_t Offset;
+
+  Offset = SPOffset + (int64_t)StackSize;
+  Offset += MI.getOperand(OpNo + 1).getImm();
+
+  DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
+
+  // If MI is not a debug value, make sure Offset fits in the 16-bit immediate
+  // field.
+  if (!MI.isDebugValue() && !isInt<16>(Offset)) {
+    MachineBasicBlock &MBB = *MI.getParent();
+    DebugLoc DL = II->getDebugLoc();
+    unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+    unsigned NewImm;
+
+    unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, &NewImm);
+    BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
+      .addReg(Reg, RegState::Kill);
+
+    FrameReg = Reg;
+    Offset = SignExtend64<16>(NewImm);
+    IsKill = true;
+  }
+
+  MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
+  MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h
new file mode 100644
index 000000000000..2f7c37bb460d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.h
@@ -0,0 +1,44 @@
+//===-- MipsSERegisterInfo.h - Mips32/64 Register Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips32/64 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSEREGISTERINFO_H
+#define MIPSSEREGISTERINFO_H
+
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+class MipsSEInstrInfo;
+
+class MipsSERegisterInfo : public MipsRegisterInfo {
+  const MipsSEInstrInfo &TII;
+
+public:
+  MipsSERegisterInfo(const MipsSubtarget &Subtarget,
+                     const MipsSEInstrInfo &TII);
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
+
+  virtual const TargetRegisterClass *intRegClass(unsigned Size) const;
+
+private:
+  virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                           int FrameIndex, uint64_t StackSize,
+                           int64_t SPOffset) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
new file mode 100644
index 000000000000..1add02ff83e9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
@@ -0,0 +1,63 @@
+//===-- MipsSchedule.td - Mips Scheduling Definitions ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across Mips chips sets. Based on GCC/Mips backend files.
+//===----------------------------------------------------------------------===//
+def ALU     : FuncUnit;
+def IMULDIV : FuncUnit;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for Mips
+//===----------------------------------------------------------------------===//
+def IIAlu              : InstrItinClass;
+def IILoad             : InstrItinClass;
+def IIStore            : InstrItinClass;
+def IIXfer             : InstrItinClass;
+def IIBranch           : InstrItinClass;
+def IIHiLo             : InstrItinClass;
+def IIImul             : InstrItinClass;
+def IIIdiv             : InstrItinClass;
+def IIFcvt             : InstrItinClass;
+def IIFmove            : InstrItinClass;
+def IIFcmp             : InstrItinClass;
+def IIFadd             : InstrItinClass;
+def IIFmulSingle       : InstrItinClass;
+def IIFmulDouble       : InstrItinClass;
+def IIFdivSingle       : InstrItinClass;
+def IIFdivDouble       : InstrItinClass;
+def IIFsqrtSingle      : InstrItinClass;
+def IIFsqrtDouble      : InstrItinClass;
+def IIFrecipFsqrtStep  : InstrItinClass;
+def IIPseudo           : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Mips Generic instruction itineraries.
+//===----------------------------------------------------------------------===//
+def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
+  InstrItinData<IIAlu              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IILoad             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<IIStore            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIXfer             , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<IIBranch           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIHiLo             , [InstrStage<1,  [IMULDIV]>]>,
+  InstrItinData<IIImul             , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<IIIdiv             , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<IIFcvt             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIFmove            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<IIFcmp             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<IIFadd             , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<IIFmulSingle       , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<IIFmulDouble       , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<IIFdivSingle       , [InstrStage<23, [ALU]>]>,
+  InstrItinData<IIFdivDouble       , [InstrStage<36, [ALU]>]>,
+  InstrItinData<IIFsqrtSingle      , [InstrStage<54, [ALU]>]>,
+  InstrItinData<IIFsqrtDouble      , [InstrStage<12, [ALU]>]>,
+  InstrItinData<IIFrecipFsqrtStep  , [InstrStage<5,  [ALU]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..e4d70fcec591
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- MipsSelectionDAGInfo.cpp - Mips SelectionDAG Info -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-selectiondag-info"
+#include "MipsTargetMachine.h"
+using namespace llvm;
+
+MipsSelectionDAGInfo::MipsSelectionDAGInfo(const MipsTargetMachine &TM)
+  : TargetSelectionDAGInfo(TM) {
+}
+
+MipsSelectionDAGInfo::~MipsSelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h
new file mode 100644
index 000000000000..6cafb558b35a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- MipsSelectionDAGInfo.h - Mips SelectionDAG Info ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Mips subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSELECTIONDAGINFO_H
+#define MIPSSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class MipsTargetMachine;
+
+class MipsSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit MipsSelectionDAGInfo(const MipsTargetMachine &TM);
+  ~MipsSelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
new file mode 100644
index 000000000000..e11e5d142b74
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -0,0 +1,74 @@
+//===-- MipsSubtarget.cpp - Mips Subtarget Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Mips specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSubtarget.h"
+#include "Mips.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "MipsGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+void MipsSubtarget::anchor() { }
+
+MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
+                             const std::string &FS, bool little,
+                             Reloc::Model _RM) :
+  MipsGenSubtargetInfo(TT, CPU, FS),
+  MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little),
+  IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
+  IsLinux(true), HasSEInReg(false), HasCondMov(false), HasSwap(false),
+  HasBitCount(false), HasFPIdx(false),
+  InMips16Mode(false), InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
+  RM(_RM)
+{
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "mips32";
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUName);
+
+  // Set MipsABI if it hasn't been set yet.
+  if (MipsABI == UnknownABI)
+    MipsABI = hasMips64() ? N64 : O32;
+
+  // Check if Architecture and ABI are compatible.
+  assert(((!hasMips64() && (isABI_O32() || isABI_EABI())) ||
+          (hasMips64() && (isABI_N32() || isABI_N64()))) &&
+         "Invalid  Arch & ABI pair.");
+
+  // Is the target system Linux ?
+  if (TT.find("linux") == std::string::npos)
+    IsLinux = false;
+
+  // Set UseSmallSection.
+  UseSmallSection = !IsLinux && (RM == Reloc::Static);
+}
+
+bool
+MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                                    TargetSubtargetInfo::AntiDepBreakMode &Mode,
+                                     RegClassVector &CriticalPathRCs) const {
+  Mode = TargetSubtargetInfo::ANTIDEP_NONE;
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(hasMips64() ?
+                            &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass);
+  return OptLevel >= CodeGenOpt::Aggressive;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
new file mode 100644
index 000000000000..7a2e47ce5a9d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -0,0 +1,164 @@
+//===-- MipsSubtarget.h - Define Subtarget for the Mips ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSUBTARGET_H
+#define MIPSSUBTARGET_H
+
+#include "MCTargetDesc/MipsReginfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "MipsGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class MipsSubtarget : public MipsGenSubtargetInfo {
+  virtual void anchor();
+
+public:
+  // NOTE: O64 will not be supported.
+  enum MipsABIEnum {
+    UnknownABI, O32, N32, N64, EABI
+  };
+
+protected:
+
+  enum MipsArchEnum {
+    Mips32, Mips32r2, Mips64, Mips64r2
+  };
+
+  // Mips architecture version
+  MipsArchEnum MipsArchVersion;
+
+  // Mips supported ABIs
+  MipsABIEnum MipsABI;
+
+  // IsLittle - The target is Little Endian
+  bool IsLittle;
+
+  // IsSingleFloat - The target only supports single precision float
+  // point operations. This enable the target to use all 32 32-bit
+  // floating point registers instead of only using even ones.
+  bool IsSingleFloat;
+
+  // IsFP64bit - The target processor has 64-bit floating point registers.
+  bool IsFP64bit;
+
+  // IsFP64bit - General-purpose registers are 64 bits wide
+  bool IsGP64bit;
+
+  // HasVFPU - Processor has a vector floating point unit.
+  bool HasVFPU;
+
+  // isLinux - Target system is Linux. Is false we consider ELFOS for now.
+  bool IsLinux;
+
+  // UseSmallSection - Small section is used.
+  bool UseSmallSection;
+
+  /// Features related to the presence of specific instructions.
+
+  // HasSEInReg - SEB and SEH (signext in register) instructions.
+  bool HasSEInReg;
+
+  // HasCondMov - Conditional mov (MOVZ, MOVN) instructions.
+  bool HasCondMov;
+
+  // HasSwap - Byte and half swap instructions.
+  bool HasSwap;
+
+  // HasBitCount - Count leading '1' and '0' bits.
+  bool HasBitCount;
+
+  // HasFPIdx -- Floating point indexed load/store instructions.
+  bool HasFPIdx;
+
+  // InMips16 -- can process Mips16 instructions
+  bool InMips16Mode;
+
+  // InMicroMips -- can process MicroMips instructions
+  bool InMicroMipsMode;
+
+  // HasDSP, HasDSPR2 -- supports DSP ASE.
+  bool HasDSP, HasDSPR2;
+
+  InstrItineraryData InstrItins;
+
+  // The instance to the register info section object
+  MipsReginfo MRI;
+
+  // Relocation Model
+  Reloc::Model RM;
+
+public:
+  virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                                     AntiDepBreakMode& Mode,
+                                     RegClassVector& CriticalPathRCs) const;
+
+  /// Only O32 and EABI supported right now.
+  bool isABI_EABI() const { return MipsABI == EABI; }
+  bool isABI_N64() const { return MipsABI == N64; }
+  bool isABI_N32() const { return MipsABI == N32; }
+  bool isABI_O32() const { return MipsABI == O32; }
+  unsigned getTargetABI() const { return MipsABI; }
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  MipsSubtarget(const std::string &TT, const std::string &CPU,
+                const std::string &FS, bool little, Reloc::Model RM);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool hasMips32() const { return MipsArchVersion >= Mips32; }
+  bool hasMips32r2() const { return MipsArchVersion == Mips32r2 ||
+                                   MipsArchVersion == Mips64r2; }
+  bool hasMips64() const { return MipsArchVersion >= Mips64; }
+  bool hasMips64r2() const { return MipsArchVersion == Mips64r2; }
+
+  bool isLittle() const { return IsLittle; }
+  bool isFP64bit() const { return IsFP64bit; }
+  bool isGP64bit() const { return IsGP64bit; }
+  bool isGP32bit() const { return !IsGP64bit; }
+  bool isSingleFloat() const { return IsSingleFloat; }
+  bool isNotSingleFloat() const { return !IsSingleFloat; }
+  bool hasVFPU() const { return HasVFPU; }
+  bool inMips16Mode() const { return InMips16Mode; }
+  bool inMicroMipsMode() const { return InMicroMipsMode; }
+  bool hasDSP() const { return HasDSP; }
+  bool hasDSPR2() const { return HasDSPR2; }
+  bool isLinux() const { return IsLinux; }
+  bool useSmallSection() const { return UseSmallSection; }
+
+  bool hasStandardEncoding() const { return !inMips16Mode(); }
+
+  /// Features related to the presence of specific instructions.
+  bool hasSEInReg()   const { return HasSEInReg; }
+  bool hasCondMov()   const { return HasCondMov; }
+  bool hasSwap()      const { return HasSwap; }
+  bool hasBitCount()  const { return HasBitCount; }
+  bool hasFPIdx()     const { return HasFPIdx; }
+
+  // Grab MipsRegInfo object
+  const MipsReginfo &getMReginfo() const { return MRI; }
+
+  // Grab relocation model
+  Reloc::Model getRelocationModel() const {return RM;}
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
new file mode 100644
index 000000000000..33363580aba7
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -0,0 +1,130 @@
+//===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Mips target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetMachine.h"
+#include "Mips.h"
+#include "MipsFrameLowering.h"
+#include "MipsInstrInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeMipsTarget() {
+  // Register the target.
+  RegisterTargetMachine<MipsebTargetMachine> X(TheMipsTarget);
+  RegisterTargetMachine<MipselTargetMachine> Y(TheMipselTarget);
+  RegisterTargetMachine<MipsebTargetMachine> A(TheMips64Target);
+  RegisterTargetMachine<MipselTargetMachine> B(TheMips64elTarget);
+}
+
+// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
+// The stack is always 8 byte aligned
+// On function prologue, the stack is created by decrementing
+// its pointer. Once decremented, all references are done with positive
+// offset from the stack/frame pointer, using StackGrowsUp enables
+// an easier handling.
+// Using CodeModel::Large enables different CALL behavior.
+MipsTargetMachine::
+MipsTargetMachine(const Target &T, StringRef TT,
+                  StringRef CPU, StringRef FS, const TargetOptions &Options,
+                  Reloc::Model RM, CodeModel::Model CM,
+                  CodeGenOpt::Level OL,
+                  bool isLittle)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS, isLittle, RM),
+    DL(isLittle ?
+               (Subtarget.isABI_N64() ?
+                "e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-"
+                "n32:64-S128" :
+                "e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32-S64") :
+               (Subtarget.isABI_N64() ?
+                "E-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-"
+                "n32:64-S128" :
+                "E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32-S64")),
+    InstrInfo(MipsInstrInfo::create(*this)),
+    FrameLowering(MipsFrameLowering::create(*this, Subtarget)),
+    TLInfo(MipsTargetLowering::create(*this)), TSInfo(*this), JITInfo() {
+}
+
+void MipsebTargetMachine::anchor() { }
+
+MipsebTargetMachine::
+MipsebTargetMachine(const Target &T, StringRef TT,
+                    StringRef CPU, StringRef FS, const TargetOptions &Options,
+                    Reloc::Model RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL)
+  : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void MipselTargetMachine::anchor() { }
+
+MipselTargetMachine::
+MipselTargetMachine(const Target &T, StringRef TT,
+                    StringRef CPU, StringRef FS, const TargetOptions &Options,
+                    Reloc::Model RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL)
+  : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+namespace {
+/// Mips Code Generator Pass Configuration Options.
+class MipsPassConfig : public TargetPassConfig {
+public:
+  MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  MipsTargetMachine &getMipsTargetMachine() const {
+    return getTM<MipsTargetMachine>();
+  }
+
+  const MipsSubtarget &getMipsSubtarget() const {
+    return *getMipsTargetMachine().getSubtargetImpl();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new MipsPassConfig(this, PM);
+}
+
+// Install an instruction selector pass using
+// the ISelDag to gen Mips code.
+bool MipsPassConfig::addInstSelector() {
+  addPass(createMipsISelDag(getMipsTargetMachine()));
+  return false;
+}
+
+// Implemented by targets that want to run passes immediately before
+// machine code is emitted. return true if -print-machineinstrs should
+// print out the code after the passes.
+bool MipsPassConfig::addPreEmitPass() {
+  MipsTargetMachine &TM = getMipsTargetMachine();
+  addPass(createMipsDelaySlotFillerPass(TM));
+
+  // NOTE: long branch has not been implemented for mips16.
+  if (TM.getSubtarget<MipsSubtarget>().hasStandardEncoding())
+    addPass(createMipsLongBranchPass(TM));
+  if (TM.getSubtarget<MipsSubtarget>().inMips16Mode())
+    addPass(createMipsConstantIslandPass(TM));
+
+  return true;
+}
+
+bool MipsTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                       JITCodeEmitter &JCE) {
+  // Machine code emitter pass for Mips.
+  PM.add(createMipsJITCodeEmitterPass(*this, JCE));
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
new file mode 100644
index 000000000000..7e5f19226433
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
@@ -0,0 +1,102 @@
+//===-- MipsTargetMachine.h - Define TargetMachine for Mips -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETMACHINE_H
+#define MIPSTARGETMACHINE_H
+
+#include "MipsFrameLowering.h"
+#include "MipsISelLowering.h"
+#include "MipsInstrInfo.h"
+#include "MipsJITInfo.h"
+#include "MipsSelectionDAGInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class formatted_raw_ostream;
+class MipsRegisterInfo;
+
+class MipsTargetMachine : public LLVMTargetMachine {
+  MipsSubtarget       Subtarget;
+  const DataLayout    DL; // Calculates type size & alignment
+  OwningPtr<const MipsInstrInfo> InstrInfo;
+  OwningPtr<const MipsFrameLowering> FrameLowering;
+  OwningPtr<const MipsTargetLowering> TLInfo;
+  MipsSelectionDAGInfo TSInfo;
+  MipsJITInfo JITInfo;
+
+public:
+  MipsTargetMachine(const Target &T, StringRef TT,
+                    StringRef CPU, StringRef FS, const TargetOptions &Options,
+                    Reloc::Model RM, CodeModel::Model CM,
+                    CodeGenOpt::Level OL,
+                    bool isLittle);
+
+  virtual ~MipsTargetMachine() {}
+
+  virtual const MipsInstrInfo *getInstrInfo() const
+  { return InstrInfo.get(); }
+  virtual const TargetFrameLowering *getFrameLowering() const
+  { return FrameLowering.get(); }
+  virtual const MipsSubtarget *getSubtargetImpl() const
+  { return &Subtarget; }
+  virtual const DataLayout *getDataLayout()    const
+  { return &DL;}
+  virtual MipsJITInfo *getJITInfo()
+  { return &JITInfo; }
+
+  virtual const MipsRegisterInfo *getRegisterInfo()  const {
+    return &InstrInfo->getRegisterInfo();
+  }
+
+  virtual const MipsTargetLowering *getTargetLowering() const {
+    return TLInfo.get();
+  }
+
+  virtual const MipsSelectionDAGInfo* getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+
+  // Pass Pipeline Configuration
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE);
+};
+
+/// MipsebTargetMachine - Mips32/64 big endian target machine.
+///
+class MipsebTargetMachine : public MipsTargetMachine {
+  virtual void anchor();
+public:
+  MipsebTargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
+                      Reloc::Model RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+};
+
+/// MipselTargetMachine - Mips32/64 little endian target machine.
+///
+class MipselTargetMachine : public MipsTargetMachine {
+  virtual void anchor();
+public:
+  MipselTargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
+                      Reloc::Model RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
new file mode 100644
index 000000000000..4c748c5b57cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -0,0 +1,118 @@
+//===-- MipsTargetObjectFile.cpp - Mips Object Files ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetObjectFile.h"
+#include "MipsSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+static cl::opt<unsigned>
+SSThreshold("mips-ssection-threshold", cl::Hidden,
+            cl::desc("Small data and bss section threshold size (default=8)"),
+            cl::init(8));
+
+void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+
+  SmallDataSection =
+    getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
+                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
+                               SectionKind::getDataRel());
+
+  SmallBSSSection =
+    getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
+                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
+                               SectionKind::getBSS());
+
+  // Register info information
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+  if (Subtarget.isABI_N64() || Subtarget.isABI_N32())
+    ReginfoSection =
+      getContext().getELFSection(".MIPS.options",
+                                 ELF::SHT_MIPS_OPTIONS,
+                                 ELF::SHF_ALLOC |ELF::SHF_MIPS_NOSTRIP,
+                                 SectionKind::getMetadata());
+  else
+    ReginfoSection =
+      getContext().getELFSection(".reginfo",
+                                 ELF::SHT_MIPS_REGINFO,
+                                 ELF::SHF_ALLOC,
+                                 SectionKind::getMetadata());
+}
+
+// A address must be loaded from a small section if its size is less than the
+// small section size threshold. Data in this section must be addressed using
+// gp_rel operator.
+static bool IsInSmallSection(uint64_t Size) {
+  return Size > 0 && Size <= SSThreshold;
+}
+
+bool MipsTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
+                                                const TargetMachine &TM) const {
+  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+    return false;
+
+  return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
+}
+
+/// IsGlobalInSmallSection - Return true if this global address should be
+/// placed into small data/bss section.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
+                       SectionKind Kind) const {
+
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+
+  // Return if small section is not available.
+  if (!Subtarget.useSmallSection())
+    return false;
+
+  // Only global variables, not functions.
+  const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+  if (!GVA)
+    return false;
+
+  // We can only do this for datarel or BSS objects for now.
+  if (!Kind.isBSS() && !Kind.isDataRel())
+    return false;
+
+  // If this is a internal constant string, there is a special
+  // section for it, but not in small data/bss.
+  if (Kind.isMergeable1ByteCString())
+    return false;
+
+  Type *Ty = GV->getType()->getElementType();
+  return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
+}
+
+
+
+const MCSection *MipsTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+  // TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*"
+  // sections?
+
+  // Handle Small Section classification here.
+  if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallBSSSection;
+  if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallDataSection;
+
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
+}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
new file mode 100644
index 000000000000..c0e9140c829c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h
@@ -0,0 +1,43 @@
+//===-- llvm/Target/MipsTargetObjectFile.h - Mips Object Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
+#define LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+  class MipsTargetObjectFile : public TargetLoweringObjectFileELF {
+    const MCSection *SmallDataSection;
+    const MCSection *SmallBSSSection;
+    const MCSection *ReginfoSection;
+  public:
+
+    void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+
+    /// IsGlobalInSmallSection - Return true if this global address should be
+    /// placed into small data/bss section.
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM, SectionKind Kind)const;
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM) const;
+
+    const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+                                            SectionKind Kind,
+                                            Mangler *Mang,
+                                            const TargetMachine &TM) const;
+
+    // TODO: Classify globals as mips wishes.
+    const MCSection *getReginfoSection() const { return ReginfoSection; }
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
new file mode 100644
index 000000000000..3615c146a527
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -0,0 +1,31 @@
+//===-- MipsTargetInfo.cpp - Mips Target Implementation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMipsTarget, llvm::TheMipselTarget;
+Target llvm::TheMips64Target, llvm::TheMips64elTarget;
+
+extern "C" void LLVMInitializeMipsTargetInfo() {
+  RegisterTarget<Triple::mips,
+        /*HasJIT=*/true> X(TheMipsTarget, "mips", "Mips");
+
+  RegisterTarget<Triple::mipsel,
+        /*HasJIT=*/true> Y(TheMipselTarget, "mipsel", "Mipsel");
+
+  RegisterTarget<Triple::mips64,
+        /*HasJIT=*/false> A(TheMips64Target, "mips64", "Mips64 [experimental]");
+
+  RegisterTarget<Triple::mips64el,
+        /*HasJIT=*/false> B(TheMips64elTarget,
+                            "mips64el", "Mips64el [experimental]");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
new file mode 100644
index 000000000000..10051c768058
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -0,0 +1 @@
+// Placeholder
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
new file mode 100644
index 000000000000..b3e8b5d2622d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -0,0 +1,86 @@
+//===-- NVPTXBaseInfo.h - Top-level definitions for NVPTX -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the NVPTX target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXBASEINFO_H
+#define NVPTXBASEINFO_H
+
+namespace llvm {
+
+enum AddressSpace {
+  ADDRESS_SPACE_GENERIC = 0,
+  ADDRESS_SPACE_GLOBAL = 1,
+  ADDRESS_SPACE_CONST_NOT_GEN = 2, // Not part of generic space
+  ADDRESS_SPACE_SHARED = 3,
+  ADDRESS_SPACE_CONST = 4,
+  ADDRESS_SPACE_LOCAL = 5,
+
+  // NVVM Internal
+  ADDRESS_SPACE_PARAM = 101
+};
+
+enum PropertyAnnotation {
+  PROPERTY_MAXNTID_X = 0,
+  PROPERTY_MAXNTID_Y,
+  PROPERTY_MAXNTID_Z,
+  PROPERTY_REQNTID_X,
+  PROPERTY_REQNTID_Y,
+  PROPERTY_REQNTID_Z,
+  PROPERTY_MINNCTAPERSM,
+  PROPERTY_ISTEXTURE,
+  PROPERTY_ISSURFACE,
+  PROPERTY_ISSAMPLER,
+  PROPERTY_ISREADONLY_IMAGE_PARAM,
+  PROPERTY_ISWRITEONLY_IMAGE_PARAM,
+  PROPERTY_ISKERNEL_FUNCTION,
+  PROPERTY_ALIGN,
+
+  // last property
+  PROPERTY_LAST
+};
+
+const unsigned AnnotationNameLen = 8; // length of each annotation name
+const char PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = {
+  "maxntidx",                         // PROPERTY_MAXNTID_X
+  "maxntidy",                         // PROPERTY_MAXNTID_Y
+  "maxntidz",                         // PROPERTY_MAXNTID_Z
+  "reqntidx",                         // PROPERTY_REQNTID_X
+  "reqntidy",                         // PROPERTY_REQNTID_Y
+  "reqntidz",                         // PROPERTY_REQNTID_Z
+  "minctasm",                         // PROPERTY_MINNCTAPERSM
+  "texture",                          // PROPERTY_ISTEXTURE
+  "surface",                          // PROPERTY_ISSURFACE
+  "sampler",                          // PROPERTY_ISSAMPLER
+  "rdoimage",                         // PROPERTY_ISREADONLY_IMAGE_PARAM
+  "wroimage",                         // PROPERTY_ISWRITEONLY_IMAGE_PARAM
+  "kernel",                           // PROPERTY_ISKERNEL_FUNCTION
+  "align",                            // PROPERTY_ALIGN
+
+              // last property
+  "proplast", // PROPERTY_LAST
+};
+
+// name of named metadata used for global annotations
+#if defined(__GNUC__)
+// As this is declared to be static but some of the .cpp files that
+// include NVVM.h do not use this array, gcc gives a warning when
+// compiling those .cpp files, hence __attribute__((unused)).
+__attribute__((unused))
+#endif
+    static const char *NamedMDForAnnotations = "nvvm.annotations";
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
new file mode 100644
index 000000000000..459cd96cb0cd
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -0,0 +1,63 @@
+//===-- NVPTXMCAsmInfo.cpp - NVPTX asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the NVPTXMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+bool CompileForDebugging;
+
+// -debug-compile - Command line option to inform opt and llc passes to
+// compile for debugging
+static cl::opt<bool, true>
+Debug("debug-compile", cl::desc("Compile for debugging"), cl::Hidden,
+      cl::location(CompileForDebugging), cl::init(false));
+
+void NVPTXMCAsmInfo::anchor() {}
+
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) {
+  Triple TheTriple(TT);
+  if (TheTriple.getArch() == Triple::nvptx64) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+
+  CommentString = "//";
+
+  PrivateGlobalPrefix = "$L__";
+
+  AllowPeriodsInName = false;
+
+  HasSetDirective = false;
+
+  HasSingleParameterDotFile = false;
+
+  InlineAsmStart = " inline asm";
+  InlineAsmEnd = " inline asm";
+
+  SupportsDebugInformation = CompileForDebugging;
+  HasDotTypeDotSizeDirective = false;
+
+  Data8bitsDirective = " .b8 ";
+  Data16bitsDirective = " .b16 ";
+  Data32bitsDirective = " .b32 ";
+  Data64bitsDirective = " .b64 ";
+  PrivateGlobalPrefix = "";
+  ZeroDirective = " .b8";
+  AsciiDirective = " .b8";
+  AscizDirective = " .b8";
+
+  // @TODO: Can we just disable this?
+  GlobalDirective = "\t// .globl\t";
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
new file mode 100644
index 000000000000..82097daa4ae5
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -0,0 +1,30 @@
+//===-- NVPTXMCAsmInfo.h - NVPTX asm properties ----------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVPTXMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_MCASM_INFO_H
+#define NVPTX_MCASM_INFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class Target;
+class StringRef;
+
+class NVPTXMCAsmInfo : public MCAsmInfo {
+  virtual void anchor();
+public:
+  explicit NVPTXMCAsmInfo(const Target &T, const StringRef &TT);
+};
+} // namespace llvm
+
+#endif // NVPTX_MCASM_INFO_H
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
new file mode 100644
index 000000000000..ccd29705df72
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -0,0 +1,88 @@
+//===-- NVPTXMCTargetDesc.cpp - NVPTX Target Descriptions -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXMCTargetDesc.h"
+#include "NVPTXMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "NVPTXGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createNVPTXMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitNVPTXMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createNVPTXMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  // PTX does not have a return address register.
+  InitNVPTXMCRegisterInfo(X, 0);
+  return X;
+}
+
+static MCSubtargetInfo *
+createNVPTXMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitNVPTXMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createNVPTXMCCodeGenInfo(
+    StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<NVPTXMCAsmInfo> X(TheNVPTXTarget32);
+  RegisterMCAsmInfo<NVPTXMCAsmInfo> Y(TheNVPTXTarget64);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget32,
+                                        createNVPTXMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheNVPTXTarget64,
+                                        createNVPTXMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget32, createNVPTXMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheNVPTXTarget64, createNVPTXMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget32,
+                                    createNVPTXMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheNVPTXTarget64,
+                                    createNVPTXMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget32,
+                                          createNVPTXMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget64,
+                                          createNVPTXMCSubtargetInfo);
+
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
new file mode 100644
index 000000000000..af95c76f92b2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -0,0 +1,36 @@
+//===-- NVPTXMCTargetDesc.h - NVPTX Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides NVPTX specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXMCTARGETDESC_H
+#define NVPTXMCTARGETDESC_H
+
+namespace llvm {
+class Target;
+
+extern Target TheNVPTXTarget32;
+extern Target TheNVPTXTarget64;
+
+} // End llvm namespace
+
+// Defines symbolic names for PTX registers.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the PTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "NVPTXGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
new file mode 100644
index 000000000000..d6c79b5110cc
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/ManagedStringPool.h
@@ -0,0 +1,48 @@
+//===-- ManagedStringPool.h - Managed String Pool ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The strings allocated from a managed string pool are owned by the string
+// pool and will be deleted together with the managed string pool.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MANAGED_STRING_H
+#define LLVM_SUPPORT_MANAGED_STRING_H
+
+#include "llvm/ADT/SmallVector.h"
+#include <string>
+
+namespace llvm {
+
+/// ManagedStringPool - The strings allocated from a managed string pool are
+/// owned by the string pool and will be deleted together with the managed
+/// string pool.
+class ManagedStringPool {
+  SmallVector<std::string *, 8> Pool;
+
+public:
+  ManagedStringPool() {}
+  ~ManagedStringPool() {
+    SmallVector<std::string *, 8>::iterator Current = Pool.begin();
+    while (Current != Pool.end()) {
+      delete *Current;
+      Current++;
+    }
+  }
+
+  std::string *getManagedString(const char *S) {
+    std::string *Str = new std::string(S);
+    Pool.push_back(Str);
+    return Str;
+  }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
new file mode 100644
index 000000000000..6a53a443bfb6
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
@@ -0,0 +1,141 @@
+//===-- NVPTX.h - Top-level interface for NVPTX representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM NVPTX back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_H
+#define LLVM_TARGET_NVPTX_H
+
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <iosfwd>
+
+namespace llvm {
+class NVPTXTargetMachine;
+class FunctionPass;
+class formatted_raw_ostream;
+
+namespace NVPTXCC {
+enum CondCodes {
+  EQ,
+  NE,
+  LT,
+  LE,
+  GT,
+  GE
+};
+}
+
+inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) {
+  switch (CC) {
+  case NVPTXCC::NE:
+    return "ne";
+  case NVPTXCC::EQ:
+    return "eq";
+  case NVPTXCC::LT:
+    return "lt";
+  case NVPTXCC::LE:
+    return "le";
+  case NVPTXCC::GT:
+    return "gt";
+  case NVPTXCC::GE:
+    return "ge";
+  }
+  llvm_unreachable("Unknown condition code");
+}
+
+FunctionPass *
+createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
+FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &);
+FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &);
+FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &);
+
+bool isImageOrSamplerVal(const Value *, const Module *);
+
+extern Target TheNVPTXTarget32;
+extern Target TheNVPTXTarget64;
+
+namespace NVPTX {
+enum DrvInterface {
+  NVCL,
+  CUDA,
+  TEST
+};
+
+// A field inside TSFlags needs a shift and a mask. The usage is
+// always as follows :
+// ((TSFlags & fieldMask) >> fieldShift)
+// The enum keeps the mask, the shift, and all valid values of the
+// field in one place.
+enum VecInstType {
+  VecInstTypeShift = 0,
+  VecInstTypeMask = 0xF,
+
+  VecNOP = 0,
+  VecLoad = 1,
+  VecStore = 2,
+  VecBuild = 3,
+  VecShuffle = 4,
+  VecExtract = 5,
+  VecInsert = 6,
+  VecDest = 7,
+  VecOther = 15
+};
+
+enum SimpleMove {
+  SimpleMoveMask = 0x10,
+  SimpleMoveShift = 4
+};
+enum LoadStore {
+  isLoadMask = 0x20,
+  isLoadShift = 5,
+  isStoreMask = 0x40,
+  isStoreShift = 6
+};
+
+namespace PTXLdStInstCode {
+enum AddressSpace {
+  GENERIC = 0,
+  GLOBAL = 1,
+  CONSTANT = 2,
+  SHARED = 3,
+  PARAM = 4,
+  LOCAL = 5
+};
+enum FromType {
+  Unsigned = 0,
+  Signed,
+  Float
+};
+enum VecType {
+  Scalar = 1,
+  V2 = 2,
+  V4 = 4
+};
+}
+}
+} // end namespace llvm;
+
+// Defines symbolic names for NVPTX registers.  This defines a mapping from
+// register name to register number.
+#define GET_REGINFO_ENUM
+#include "NVPTXGenRegisterInfo.inc"
+
+// Defines symbolic names for the NVPTX instructions.
+#define GET_INSTRINFO_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.td b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
new file mode 100644
index 000000000000..d78b4e81a3e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.td
@@ -0,0 +1,62 @@
+//===- NVPTX.td - Describe the NVPTX Target Machine -----------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the NVPTX target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "NVPTXRegisterInfo.td"
+include "NVPTXInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features.
+// - We use the SM version number instead of explicit feature table.
+// - Need at least one feature to avoid generating zero sized array by
+//   TableGen in NVPTXGenSubtarget.inc.
+//===----------------------------------------------------------------------===//
+
+// SM Versions
+def SM20 : SubtargetFeature<"sm_20", "SmVersion", "20",
+                            "Target SM 2.0">;
+def SM21 : SubtargetFeature<"sm_21", "SmVersion", "21",
+                            "Target SM 2.1">;
+def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
+                            "Target SM 3.0">;
+def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
+                            "Target SM 3.5">;
+
+// PTX Versions
+def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30",
+                             "Use PTX version 3.0">;
+def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31",
+                             "Use PTX version 3.1">;
+
+//===----------------------------------------------------------------------===//
+// NVPTX supported processors.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"sm_20", [SM20]>;
+def : Proc<"sm_21", [SM21]>;
+def : Proc<"sm_30", [SM30]>;
+def : Proc<"sm_35", [SM35]>;
+
+
+def NVPTXInstrInfo : InstrInfo {
+}
+
+def NVPTX : Target {
+  let InstructionSet = NVPTXInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
new file mode 100644
index 000000000000..0f792ec6826e
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -0,0 +1,46 @@
+//===-- AllocaHoisting.cpp - Hoist allocas to the entry block --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAllocaHoisting.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+
+namespace llvm {
+
+bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
+  bool functionModified = false;
+  Function::iterator I = function.begin();
+  TerminatorInst *firstTerminatorInst = (I++)->getTerminator();
+
+  for (Function::iterator E = function.end(); I != E; ++I) {
+    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+      AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
+      if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
+        allocaInst->moveBefore(firstTerminatorInst);
+        functionModified = true;
+      }
+    }
+  }
+
+  return functionModified;
+}
+
+char NVPTXAllocaHoisting::ID = 1;
+RegisterPass<NVPTXAllocaHoisting>
+X("alloca-hoisting", "Hoisting alloca instructions in non-entry "
+                     "blocks to the entry block");
+
+FunctionPass *createAllocaHoisting() { return new NVPTXAllocaHoisting(); }
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
new file mode 100644
index 000000000000..19d73c5783cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -0,0 +1,49 @@
+//===-- AllocaHoisting.h - Hosist allocas to the entry block ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Hoist the alloca instructions in the non-entry blocks to the entry blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_ALLOCA_HOISTING_H_
+#define NVPTX_ALLOCA_HOISTING_H_
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class FunctionPass;
+class Function;
+
+// Hoisting the alloca instructions in the non-entry blocks to the entry
+// block.
+class NVPTXAllocaHoisting : public FunctionPass {
+public:
+  static char ID; // Pass ID
+  NVPTXAllocaHoisting() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<DataLayout>();
+    AU.addPreserved<MachineFunctionAnalysis>();
+  }
+
+  virtual const char *getPassName() const {
+    return "NVPTX specific alloca hoisting";
+  }
+
+  virtual bool runOnFunction(Function &function);
+};
+
+extern FunctionPass *createAllocaHoisting();
+
+} // end namespace llvm
+
+#endif // NVPTX_ALLOCA_HOISTING_H_
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
new file mode 100644
index 000000000000..ce5d78afa332
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -0,0 +1,2117 @@
+//===-- NVPTXAsmPrinter.cpp - NVPTX LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXAsmPrinter.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXNumRegisters.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
+#include "cl_common_defines.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TimeValue.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <sstream>
+using namespace llvm;
+
+#include "NVPTXGenAsmWriter.inc"
+
+bool RegAllocNilUsed = true;
+
+#define DEPOTNAME "__local_depot"
+
+static cl::opt<bool>
+EmitLineNumbers("nvptx-emit-line-numbers",
+                cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
+                cl::init(true));
+
+namespace llvm { bool InterleaveSrcInPtx = false; }
+
+static cl::opt<bool, true>
+InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore,
+              cl::desc("NVPTX Specific: Emit source line in ptx file"),
+              cl::location(llvm::InterleaveSrcInPtx));
+
+namespace {
+/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
+/// depends.
+void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    Globals.insert(GV);
+  else {
+    if (User *U = dyn_cast<User>(V)) {
+      for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) {
+        DiscoverDependentGlobals(U->getOperand(i), Globals);
+      }
+    }
+  }
+}
+
+/// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable
+/// instances to be emitted, but only after any dependents have been added
+/// first.
+void VisitGlobalVariableForEmission(
+    GlobalVariable *GV, SmallVectorImpl<GlobalVariable *> &Order,
+    DenseSet<GlobalVariable *> &Visited, DenseSet<GlobalVariable *> &Visiting) {
+  // Have we already visited this one?
+  if (Visited.count(GV))
+    return;
+
+  // Do we have a circular dependency?
+  if (Visiting.count(GV))
+    report_fatal_error("Circular dependency found in global variable set");
+
+  // Start visiting this global
+  Visiting.insert(GV);
+
+  // Make sure we visit all dependents first
+  DenseSet<GlobalVariable *> Others;
+  for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
+    DiscoverDependentGlobals(GV->getOperand(i), Others);
+
+  for (DenseSet<GlobalVariable *>::iterator I = Others.begin(),
+                                            E = Others.end();
+       I != E; ++I)
+    VisitGlobalVariableForEmission(*I, Order, Visited, Visiting);
+
+  // Now we can visit ourself
+  Order.push_back(GV);
+  Visited.insert(GV);
+  Visiting.erase(GV);
+}
+}
+
+// @TODO: This is a copy from AsmPrinter.cpp.  The function is static, so we
+// cannot just link to the existing version.
+/// LowerConstant - Lower the specified LLVM Constant to an MCExpr.
+///
+using namespace nvptx;
+const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
+  MCContext &Ctx = AP.OutContext;
+
+  if (CV->isNullValue() || isa<UndefValue>(CV))
+    return MCConstantExpr::Create(0, Ctx);
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
+    return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
+
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
+    return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx);
+
+  if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
+    return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
+
+  const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
+  if (CE == 0)
+    llvm_unreachable("Unknown constant value to lower!");
+
+  switch (CE->getOpcode()) {
+  default:
+    // If the code isn't optimized, there may be outstanding folding
+    // opportunities. Attempt to fold the expression using DataLayout as a
+    // last resort before giving up.
+    if (Constant *C = ConstantFoldConstantExpression(CE, AP.TM.getDataLayout()))
+      if (C != CE)
+        return LowerConstant(C, AP);
+
+    // Otherwise report the problem to the user.
+    {
+      std::string S;
+      raw_string_ostream OS(S);
+      OS << "Unsupported expression in static initializer: ";
+      WriteAsOperand(OS, CE, /*PrintType=*/ false,
+                     !AP.MF ? 0 : AP.MF->getFunction()->getParent());
+      report_fatal_error(OS.str());
+    }
+  case Instruction::GetElementPtr: {
+    const DataLayout &TD = *AP.TM.getDataLayout();
+    // Generate a symbolic expression for the byte address
+    APInt OffsetAI(TD.getPointerSizeInBits(), 0);
+    cast<GEPOperator>(CE)->accumulateConstantOffset(TD, OffsetAI);
+
+    const MCExpr *Base = LowerConstant(CE->getOperand(0), AP);
+    if (!OffsetAI)
+      return Base;
+
+    int64_t Offset = OffsetAI.getSExtValue();
+    return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
+                                   Ctx);
+  }
+
+  case Instruction::Trunc:
+    // We emit the value and depend on the assembler to truncate the generated
+    // expression properly.  This is important for differences between
+    // blockaddress labels.  Since the two labels are in the same function, it
+    // is reasonable to treat their delta as a 32-bit value.
+  // FALL THROUGH.
+  case Instruction::BitCast:
+    return LowerConstant(CE->getOperand(0), AP);
+
+  case Instruction::IntToPtr: {
+    const DataLayout &TD = *AP.TM.getDataLayout();
+    // Handle casts to pointers by changing them into casts to the appropriate
+    // integer type.  This promotes constant folding and simplifies this code.
+    Constant *Op = CE->getOperand(0);
+    Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
+                                      false /*ZExt*/);
+    return LowerConstant(Op, AP);
+  }
+
+  case Instruction::PtrToInt: {
+    const DataLayout &TD = *AP.TM.getDataLayout();
+    // Support only foldable casts to/from pointers that can be eliminated by
+    // changing the pointer to the appropriately sized integer type.
+    Constant *Op = CE->getOperand(0);
+    Type *Ty = CE->getType();
+
+    const MCExpr *OpExpr = LowerConstant(Op, AP);
+
+    // We can emit the pointer value into this slot if the slot is an
+    // integer slot equal to the size of the pointer.
+    if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType()))
+      return OpExpr;
+
+    // Otherwise the pointer is smaller than the resultant integer, mask off
+    // the high bits so we are sure to get a proper truncation if the input is
+    // a constant expr.
+    unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType());
+    const MCExpr *MaskExpr =
+        MCConstantExpr::Create(~0ULL >> (64 - InBits), Ctx);
+    return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
+  }
+
+    // The MC library also has a right-shift operator, but it isn't consistently
+  // signed or unsigned between different targets.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::SDiv:
+  case Instruction::SRem:
+  case Instruction::Shl:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    const MCExpr *LHS = LowerConstant(CE->getOperand(0), AP);
+    const MCExpr *RHS = LowerConstant(CE->getOperand(1), AP);
+    switch (CE->getOpcode()) {
+    default:
+      llvm_unreachable("Unknown binary operator constant cast expr");
+    case Instruction::Add:
+      return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
+    case Instruction::Sub:
+      return MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+    case Instruction::Mul:
+      return MCBinaryExpr::CreateMul(LHS, RHS, Ctx);
+    case Instruction::SDiv:
+      return MCBinaryExpr::CreateDiv(LHS, RHS, Ctx);
+    case Instruction::SRem:
+      return MCBinaryExpr::CreateMod(LHS, RHS, Ctx);
+    case Instruction::Shl:
+      return MCBinaryExpr::CreateShl(LHS, RHS, Ctx);
+    case Instruction::And:
+      return MCBinaryExpr::CreateAnd(LHS, RHS, Ctx);
+    case Instruction::Or:
+      return MCBinaryExpr::CreateOr(LHS, RHS, Ctx);
+    case Instruction::Xor:
+      return MCBinaryExpr::CreateXor(LHS, RHS, Ctx);
+    }
+  }
+  }
+}
+
+void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
+  if (!EmitLineNumbers)
+    return;
+  if (ignoreLoc(MI))
+    return;
+
+  DebugLoc curLoc = MI.getDebugLoc();
+
+  if (prevDebugLoc.isUnknown() && curLoc.isUnknown())
+    return;
+
+  if (prevDebugLoc == curLoc)
+    return;
+
+  prevDebugLoc = curLoc;
+
+  if (curLoc.isUnknown())
+    return;
+
+  const MachineFunction *MF = MI.getParent()->getParent();
+  //const TargetMachine &TM = MF->getTarget();
+
+  const LLVMContext &ctx = MF->getFunction()->getContext();
+  DIScope Scope(curLoc.getScope(ctx));
+
+  if (!Scope.Verify())
+    return;
+
+  StringRef fileName(Scope.getFilename());
+  StringRef dirName(Scope.getDirectory());
+  SmallString<128> FullPathName = dirName;
+  if (!dirName.empty() && !sys::path::is_absolute(fileName)) {
+    sys::path::append(FullPathName, fileName);
+    fileName = FullPathName.str();
+  }
+
+  if (filenameMap.find(fileName.str()) == filenameMap.end())
+    return;
+
+  // Emit the line from the source file.
+  if (llvm::InterleaveSrcInPtx)
+    this->emitSrcInText(fileName.str(), curLoc.getLine());
+
+  std::stringstream temp;
+  temp << "\t.loc " << filenameMap[fileName.str()] << " " << curLoc.getLine()
+       << " " << curLoc.getCol();
+  OutStreamer.EmitRawText(Twine(temp.str().c_str()));
+}
+
+void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  SmallString<128> Str;
+  raw_svector_ostream OS(Str);
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+    emitLineNumberAsDotLoc(*MI);
+  printInstruction(MI, OS);
+  OutStreamer.EmitRawText(OS.str());
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
+  const DataLayout *TD = TM.getDataLayout();
+  const TargetLowering *TLI = TM.getTargetLowering();
+
+  Type *Ty = F->getReturnType();
+
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+  if (Ty->getTypeID() == Type::VoidTyID)
+    return;
+
+  O << " (";
+
+  if (isABI) {
+    if (Ty->isPrimitiveType() || Ty->isIntegerTy()) {
+      unsigned size = 0;
+      if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) {
+        size = ITy->getBitWidth();
+        if (size < 32)
+          size = 32;
+      } else {
+        assert(Ty->isFloatingPointTy() && "Floating point type expected here");
+        size = Ty->getPrimitiveSizeInBits();
+      }
+
+      O << ".param .b" << size << " func_retval0";
+    } else if (isa<PointerType>(Ty)) {
+      O << ".param .b" << TLI->getPointerTy().getSizeInBits()
+        << " func_retval0";
+    } else {
+      if ((Ty->getTypeID() == Type::StructTyID) || isa<VectorType>(Ty)) {
+        SmallVector<EVT, 16> vtparts;
+        ComputeValueVTs(*TLI, Ty, vtparts);
+        unsigned totalsz = 0;
+        for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+          unsigned elems = 1;
+          EVT elemtype = vtparts[i];
+          if (vtparts[i].isVector()) {
+            elems = vtparts[i].getVectorNumElements();
+            elemtype = vtparts[i].getVectorElementType();
+          }
+          for (unsigned j = 0, je = elems; j != je; ++j) {
+            unsigned sz = elemtype.getSizeInBits();
+            if (elemtype.isInteger() && (sz < 8))
+              sz = 8;
+            totalsz += sz / 8;
+          }
+        }
+        unsigned retAlignment = 0;
+        if (!llvm::getAlign(*F, 0, retAlignment))
+          retAlignment = TD->getABITypeAlignment(Ty);
+        O << ".param .align " << retAlignment << " .b8 func_retval0[" << totalsz
+          << "]";
+      } else
+        assert(false && "Unknown return type");
+    }
+  } else {
+    SmallVector<EVT, 16> vtparts;
+    ComputeValueVTs(*TLI, Ty, vtparts);
+    unsigned idx = 0;
+    for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+      unsigned elems = 1;
+      EVT elemtype = vtparts[i];
+      if (vtparts[i].isVector()) {
+        elems = vtparts[i].getVectorNumElements();
+        elemtype = vtparts[i].getVectorElementType();
+      }
+
+      for (unsigned j = 0, je = elems; j != je; ++j) {
+        unsigned sz = elemtype.getSizeInBits();
+        if (elemtype.isInteger() && (sz < 32))
+          sz = 32;
+        O << ".reg .b" << sz << " func_retval" << idx;
+        if (j < je - 1)
+          O << ", ";
+        ++idx;
+      }
+      if (i < e - 1)
+        O << ", ";
+    }
+  }
+  O << ") ";
+  return;
+}
+
+void NVPTXAsmPrinter::printReturnValStr(const MachineFunction &MF,
+                                        raw_ostream &O) {
+  const Function *F = MF.getFunction();
+  printReturnValStr(F, O);
+}
+
+void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+
+  // Set up
+  MRI = &MF->getRegInfo();
+  F = MF->getFunction();
+  emitLinkageDirective(F, O);
+  if (llvm::isKernelFunction(*F))
+    O << ".entry ";
+  else {
+    O << ".func ";
+    printReturnValStr(*MF, O);
+  }
+
+  O << *CurrentFnSym;
+
+  emitFunctionParamList(*MF, O);
+
+  if (llvm::isKernelFunction(*F))
+    emitKernelFunctionDirectives(*F, O);
+
+  OutStreamer.EmitRawText(O.str());
+
+  prevDebugLoc = DebugLoc();
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyStart() {
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  unsigned numRegClasses = TRI.getNumRegClasses();
+  VRidGlobal2LocalMap = new std::map<unsigned, unsigned>[numRegClasses + 1];
+  OutStreamer.EmitRawText(StringRef("{\n"));
+  setAndEmitFunctionVirtualRegisters(*MF);
+
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+  emitDemotedVars(MF->getFunction(), O);
+  OutStreamer.EmitRawText(O.str());
+}
+
+void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
+  OutStreamer.EmitRawText(StringRef("}\n"));
+  delete[] VRidGlobal2LocalMap;
+}
+
+void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
+                                                   raw_ostream &O) const {
+  // If the NVVM IR has some of reqntid* specified, then output
+  // the reqntid directive, and set the unspecified ones to 1.
+  // If none of reqntid* is specified, don't output reqntid directive.
+  unsigned reqntidx, reqntidy, reqntidz;
+  bool specified = false;
+  if (llvm::getReqNTIDx(F, reqntidx) == false)
+    reqntidx = 1;
+  else
+    specified = true;
+  if (llvm::getReqNTIDy(F, reqntidy) == false)
+    reqntidy = 1;
+  else
+    specified = true;
+  if (llvm::getReqNTIDz(F, reqntidz) == false)
+    reqntidz = 1;
+  else
+    specified = true;
+
+  if (specified)
+    O << ".reqntid " << reqntidx << ", " << reqntidy << ", " << reqntidz
+      << "\n";
+
+  // If the NVVM IR has some of maxntid* specified, then output
+  // the maxntid directive, and set the unspecified ones to 1.
+  // If none of maxntid* is specified, don't output maxntid directive.
+  unsigned maxntidx, maxntidy, maxntidz;
+  specified = false;
+  if (llvm::getMaxNTIDx(F, maxntidx) == false)
+    maxntidx = 1;
+  else
+    specified = true;
+  if (llvm::getMaxNTIDy(F, maxntidy) == false)
+    maxntidy = 1;
+  else
+    specified = true;
+  if (llvm::getMaxNTIDz(F, maxntidz) == false)
+    maxntidz = 1;
+  else
+    specified = true;
+
+  if (specified)
+    O << ".maxntid " << maxntidx << ", " << maxntidy << ", " << maxntidz
+      << "\n";
+
+  unsigned mincta;
+  if (llvm::getMinCTASm(F, mincta))
+    O << ".minnctapersm " << mincta << "\n";
+}
+
+void NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec,
+                                             raw_ostream &O) {
+  const TargetRegisterClass *RC = MRI->getRegClass(vr);
+  unsigned id = RC->getID();
+
+  std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[id];
+  unsigned mapped_vr = regmap[vr];
+
+  if (!isVec) {
+    O << getNVPTXRegClassStr(RC) << mapped_vr;
+    return;
+  }
+  report_fatal_error("Bad register!");
+}
+
+void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, bool isVec,
+                                          raw_ostream &O) {
+  getVirtualRegisterName(vr, isVec, O);
+}
+
+void NVPTXAsmPrinter::printVecModifiedImmediate(
+    const MachineOperand &MO, const char *Modifier, raw_ostream &O) {
+  static const char vecelem[] = { '0', '1', '2', '3', '0', '1', '2', '3' };
+  int Imm = (int) MO.getImm();
+  if (0 == strcmp(Modifier, "vecelem"))
+    O << "_" << vecelem[Imm];
+  else if (0 == strcmp(Modifier, "vecv4comm1")) {
+    if ((Imm < 0) || (Imm > 3))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv4comm2")) {
+    if ((Imm < 4) || (Imm > 7))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv4pos")) {
+    if (Imm < 0)
+      Imm = 0;
+    O << "_" << vecelem[Imm % 4];
+  } else if (0 == strcmp(Modifier, "vecv2comm1")) {
+    if ((Imm < 0) || (Imm > 1))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv2comm2")) {
+    if ((Imm < 2) || (Imm > 3))
+      O << "//";
+  } else if (0 == strcmp(Modifier, "vecv2pos")) {
+    if (Imm < 0)
+      Imm = 0;
+    O << "_" << vecelem[Imm % 2];
+  } else
+    llvm_unreachable("Unknown Modifier on immediate operand");
+}
+
+void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      if (MO.getReg() == NVPTX::VRDepot)
+        O << DEPOTNAME << getFunctionNumber();
+      else
+        O << getRegisterName(MO.getReg());
+    } else {
+      if (!Modifier)
+        emitVirtualRegister(MO.getReg(), false, O);
+      else {
+        if (strcmp(Modifier, "vecfull") == 0)
+          emitVirtualRegister(MO.getReg(), true, O);
+        else
+          llvm_unreachable(
+              "Don't know how to handle the modifier on virtual register.");
+      }
+    }
+    return;
+
+  case MachineOperand::MO_Immediate:
+    if (!Modifier)
+      O << MO.getImm();
+    else if (strstr(Modifier, "vec") == Modifier)
+      printVecModifiedImmediate(MO, Modifier, O);
+    else
+      llvm_unreachable(
+          "Don't know how to handle modifier on immediate operand");
+    return;
+
+  case MachineOperand::MO_FPImmediate:
+    printFPConstant(MO.getFPImm(), O);
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *Mang->getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol: {
+    const char *symbname = MO.getSymbolName();
+    if (strstr(symbname, ".PARAM") == symbname) {
+      unsigned index;
+      sscanf(symbname + 6, "%u[];", &index);
+      printParamName(index, O);
+    } else if (strstr(symbname, ".HLPPARAM") == symbname) {
+      unsigned index;
+      sscanf(symbname + 9, "%u[];", &index);
+      O << *CurrentFnSym << "_param_" << index << "_offset";
+    } else
+      O << symbname;
+    break;
+  }
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+
+  default:
+    llvm_unreachable("Operand type not supported.");
+  }
+}
+
+void NVPTXAsmPrinter::printImplicitDef(const MachineInstr *MI,
+                                       raw_ostream &O) const {
+#ifndef __OPTIMIZE__
+  O << "\t// Implicit def :";
+  //printOperand(MI, 0);
+  O << "\n";
+#endif
+}
+
+void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, O);
+
+  if (Modifier && !strcmp(Modifier, "add")) {
+    O << ", ";
+    printOperand(MI, opNum + 1, O);
+  } else {
+    if (MI->getOperand(opNum + 1).isImm() &&
+        MI->getOperand(opNum + 1).getImm() == 0)
+      return; // don't print ',0' or '+0'
+    O << "+";
+    printOperand(MI, opNum + 1, O);
+  }
+}
+
+void NVPTXAsmPrinter::printLdStCode(const MachineInstr *MI, int opNum,
+                                    raw_ostream &O, const char *Modifier) {
+  if (Modifier) {
+    const MachineOperand &MO = MI->getOperand(opNum);
+    int Imm = (int) MO.getImm();
+    if (!strcmp(Modifier, "volatile")) {
+      if (Imm)
+        O << ".volatile";
+    } else if (!strcmp(Modifier, "addsp")) {
+      switch (Imm) {
+      case NVPTX::PTXLdStInstCode::GLOBAL:
+        O << ".global";
+        break;
+      case NVPTX::PTXLdStInstCode::SHARED:
+        O << ".shared";
+        break;
+      case NVPTX::PTXLdStInstCode::LOCAL:
+        O << ".local";
+        break;
+      case NVPTX::PTXLdStInstCode::PARAM:
+        O << ".param";
+        break;
+      case NVPTX::PTXLdStInstCode::CONSTANT:
+        O << ".const";
+        break;
+      case NVPTX::PTXLdStInstCode::GENERIC:
+        if (!nvptxSubtarget.hasGenericLdSt())
+          O << ".global";
+        break;
+      default:
+        llvm_unreachable("Wrong Address Space");
+      }
+    } else if (!strcmp(Modifier, "sign")) {
+      if (Imm == NVPTX::PTXLdStInstCode::Signed)
+        O << "s";
+      else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
+        O << "u";
+      else
+        O << "f";
+    } else if (!strcmp(Modifier, "vec")) {
+      if (Imm == NVPTX::PTXLdStInstCode::V2)
+        O << ".v2";
+      else if (Imm == NVPTX::PTXLdStInstCode::V4)
+        O << ".v4";
+    } else
+      llvm_unreachable("Unknown Modifier");
+  } else
+    llvm_unreachable("Empty Modifier");
+}
+
+void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
+
+  emitLinkageDirective(F, O);
+  if (llvm::isKernelFunction(*F))
+    O << ".entry ";
+  else
+    O << ".func ";
+  printReturnValStr(F, O);
+  O << *CurrentFnSym << "\n";
+  emitFunctionParamList(F, O);
+  O << ";\n";
+}
+
+static bool usedInGlobalVarDef(const Constant *C) {
+  if (!C)
+    return false;
+
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+    if (GV->getName().str() == "llvm.used")
+      return false;
+    return true;
+  }
+
+  for (Value::const_use_iterator ui = C->use_begin(), ue = C->use_end();
+       ui != ue; ++ui) {
+    const Constant *C = dyn_cast<Constant>(*ui);
+    if (usedInGlobalVarDef(C))
+      return true;
+  }
+  return false;
+}
+
+static bool usedInOneFunc(const User *U, Function const *&oneFunc) {
+  if (const GlobalVariable *othergv = dyn_cast<GlobalVariable>(U)) {
+    if (othergv->getName().str() == "llvm.used")
+      return true;
+  }
+
+  if (const Instruction *instr = dyn_cast<Instruction>(U)) {
+    if (instr->getParent() && instr->getParent()->getParent()) {
+      const Function *curFunc = instr->getParent()->getParent();
+      if (oneFunc && (curFunc != oneFunc))
+        return false;
+      oneFunc = curFunc;
+      return true;
+    } else
+      return false;
+  }
+
+  if (const MDNode *md = dyn_cast<MDNode>(U))
+    if (md->hasName() && ((md->getName().str() == "llvm.dbg.gv") ||
+                          (md->getName().str() == "llvm.dbg.sp")))
+      return true;
+
+  for (User::const_use_iterator ui = U->use_begin(), ue = U->use_end();
+       ui != ue; ++ui) {
+    if (usedInOneFunc(*ui, oneFunc) == false)
+      return false;
+  }
+  return true;
+}
+
+/* Find out if a global variable can be demoted to local scope.
+ * Currently, this is valid for CUDA shared variables, which have local
+ * scope and global lifetime. So the conditions to check are :
+ * 1. Is the global variable in shared address space?
+ * 2. Does it have internal linkage?
+ * 3. Is the global variable referenced only in one function?
+ */
+static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
+  if (gv->hasInternalLinkage() == false)
+    return false;
+  const PointerType *Pty = gv->getType();
+  if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED)
+    return false;
+
+  const Function *oneFunc = 0;
+
+  bool flag = usedInOneFunc(gv, oneFunc);
+  if (flag == false)
+    return false;
+  if (!oneFunc)
+    return false;
+  f = oneFunc;
+  return true;
+}
+
+static bool useFuncSeen(const Constant *C,
+                        llvm::DenseMap<const Function *, bool> &seenMap) {
+  for (Value::const_use_iterator ui = C->use_begin(), ue = C->use_end();
+       ui != ue; ++ui) {
+    if (const Constant *cu = dyn_cast<Constant>(*ui)) {
+      if (useFuncSeen(cu, seenMap))
+        return true;
+    } else if (const Instruction *I = dyn_cast<Instruction>(*ui)) {
+      const BasicBlock *bb = I->getParent();
+      if (!bb)
+        continue;
+      const Function *caller = bb->getParent();
+      if (!caller)
+        continue;
+      if (seenMap.find(caller) != seenMap.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) {
+  llvm::DenseMap<const Function *, bool> seenMap;
+  for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
+    const Function *F = FI;
+
+    if (F->isDeclaration()) {
+      if (F->use_empty())
+        continue;
+      if (F->getIntrinsicID())
+        continue;
+      CurrentFnSym = Mang->getSymbol(F);
+      emitDeclaration(F, O);
+      continue;
+    }
+    for (Value::const_use_iterator iter = F->use_begin(),
+                                   iterEnd = F->use_end();
+         iter != iterEnd; ++iter) {
+      if (const Constant *C = dyn_cast<Constant>(*iter)) {
+        if (usedInGlobalVarDef(C)) {
+          // The use is in the initialization of a global variable
+          // that is a function pointer, so print a declaration
+          // for the original function
+          CurrentFnSym = Mang->getSymbol(F);
+          emitDeclaration(F, O);
+          break;
+        }
+        // Emit a declaration of this function if the function that
+        // uses this constant expr has already been seen.
+        if (useFuncSeen(C, seenMap)) {
+          CurrentFnSym = Mang->getSymbol(F);
+          emitDeclaration(F, O);
+          break;
+        }
+      }
+
+      if (!isa<Instruction>(*iter))
+        continue;
+      const Instruction *instr = cast<Instruction>(*iter);
+      const BasicBlock *bb = instr->getParent();
+      if (!bb)
+        continue;
+      const Function *caller = bb->getParent();
+      if (!caller)
+        continue;
+
+      // If a caller has already been seen, then the caller is
+      // appearing in the module before the callee. so print out
+      // a declaration for the callee.
+      if (seenMap.find(caller) != seenMap.end()) {
+        CurrentFnSym = Mang->getSymbol(F);
+        emitDeclaration(F, O);
+        break;
+      }
+    }
+    seenMap[F] = true;
+  }
+}
+
+void NVPTXAsmPrinter::recordAndEmitFilenames(Module &M) {
+  DebugInfoFinder DbgFinder;
+  DbgFinder.processModule(M);
+
+  unsigned i = 1;
+  for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+                                 E = DbgFinder.compile_unit_end();
+       I != E; ++I) {
+    DICompileUnit DIUnit(*I);
+    StringRef Filename(DIUnit.getFilename());
+    StringRef Dirname(DIUnit.getDirectory());
+    SmallString<128> FullPathName = Dirname;
+    if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+      sys::path::append(FullPathName, Filename);
+      Filename = FullPathName.str();
+    }
+    if (filenameMap.find(Filename.str()) != filenameMap.end())
+      continue;
+    filenameMap[Filename.str()] = i;
+    OutStreamer.EmitDwarfFileDirective(i, "", Filename.str());
+    ++i;
+  }
+
+  for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
+                                 E = DbgFinder.subprogram_end();
+       I != E; ++I) {
+    DISubprogram SP(*I);
+    StringRef Filename(SP.getFilename());
+    StringRef Dirname(SP.getDirectory());
+    SmallString<128> FullPathName = Dirname;
+    if (!Dirname.empty() && !sys::path::is_absolute(Filename)) {
+      sys::path::append(FullPathName, Filename);
+      Filename = FullPathName.str();
+    }
+    if (filenameMap.find(Filename.str()) != filenameMap.end())
+      continue;
+    filenameMap[Filename.str()] = i;
+    ++i;
+  }
+}
+
+bool NVPTXAsmPrinter::doInitialization(Module &M) {
+
+  SmallString<128> Str1;
+  raw_svector_ostream OS1(Str1);
+
+  MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+  MMI->AnalyzeModule(M);
+
+  // We need to call the parent's one explicitly.
+  //bool Result = AsmPrinter::doInitialization(M);
+
+  // Initialize TargetLoweringObjectFile.
+  const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
+      .Initialize(OutContext, TM);
+
+  Mang = new Mangler(OutContext, *TM.getDataLayout());
+
+  // Emit header before any dwarf directives are emitted below.
+  emitHeader(M, OS1);
+  OutStreamer.EmitRawText(OS1.str());
+
+  // Already commented out
+  //bool Result = AsmPrinter::doInitialization(M);
+
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
+    recordAndEmitFilenames(M);
+
+  SmallString<128> Str2;
+  raw_svector_ostream OS2(Str2);
+
+  emitDeclarations(M, OS2);
+
+  // As ptxas does not support forward references of globals, we need to first
+  // sort the list of module-level globals in def-use order. We visit each
+  // global variable in order, and ensure that we emit it *after* its dependent
+  // globals. We use a little extra memory maintaining both a set and a list to
+  // have fast searches while maintaining a strict ordering.
+  SmallVector<GlobalVariable *, 8> Globals;
+  DenseSet<GlobalVariable *> GVVisited;
+  DenseSet<GlobalVariable *> GVVisiting;
+
+  // Visit each global variable, in order
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;
+       ++I)
+    VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting);
+
+  assert(GVVisited.size() == M.getGlobalList().size() &&
+         "Missed a global variable");
+  assert(GVVisiting.size() == 0 && "Did not fully process a global variable");
+
+  // Print out module-level global variables in proper order
+  for (unsigned i = 0, e = Globals.size(); i != e; ++i)
+    printModuleLevelGV(Globals[i], OS2);
+
+  OS2 << '\n';
+
+  OutStreamer.EmitRawText(OS2.str());
+  return false; // success
+}
+
+void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
+  O << "//\n";
+  O << "// Generated by LLVM NVPTX Back-End\n";
+  O << "//\n";
+  O << "\n";
+
+  unsigned PTXVersion = nvptxSubtarget.getPTXVersion();
+  O << ".version " << (PTXVersion / 10) << "." << (PTXVersion % 10) << "\n";
+
+  O << ".target ";
+  O << nvptxSubtarget.getTargetName();
+
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::NVCL)
+    O << ", texmode_independent";
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
+    if (!nvptxSubtarget.hasDouble())
+      O << ", map_f64_to_f32";
+  }
+
+  if (MAI->doesSupportDebugInformation())
+    O << ", debug";
+
+  O << "\n";
+
+  O << ".address_size ";
+  if (nvptxSubtarget.is64Bit())
+    O << "64";
+  else
+    O << "32";
+  O << "\n";
+
+  O << "\n";
+}
+
+bool NVPTXAsmPrinter::doFinalization(Module &M) {
+  // XXX Temproarily remove global variables so that doFinalization() will not
+  // emit them again (global variables are emitted at beginning).
+
+  Module::GlobalListType &global_list = M.getGlobalList();
+  int i, n = global_list.size();
+  GlobalVariable **gv_array = new GlobalVariable *[n];
+
+  // first, back-up GlobalVariable in gv_array
+  i = 0;
+  for (Module::global_iterator I = global_list.begin(), E = global_list.end();
+       I != E; ++I)
+    gv_array[i++] = &*I;
+
+  // second, empty global_list
+  while (!global_list.empty())
+    global_list.remove(global_list.begin());
+
+  // call doFinalization
+  bool ret = AsmPrinter::doFinalization(M);
+
+  // now we restore global variables
+  for (i = 0; i < n; i++)
+    global_list.insert(global_list.end(), gv_array[i]);
+
+  delete[] gv_array;
+  return ret;
+
+  //bool Result = AsmPrinter::doFinalization(M);
+  // Instead of calling the parents doFinalization, we may
+  // clone parents doFinalization and customize here.
+  // Currently, we if NVISA out the EmitGlobals() in
+  // parent's doFinalization, which is too intrusive.
+  //
+  // Same for the doInitialization.
+  //return Result;
+}
+
+// This function emits appropriate linkage directives for
+// functions and global variables.
+//
+// extern function declaration            -> .extern
+// extern function definition             -> .visible
+// external global variable with init     -> .visible
+// external without init                  -> .extern
+// appending                              -> not allowed, assert.
+
+void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
+                                           raw_ostream &O) {
+  if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) {
+    if (V->hasExternalLinkage()) {
+      if (isa<GlobalVariable>(V)) {
+        const GlobalVariable *GVar = cast<GlobalVariable>(V);
+        if (GVar) {
+          if (GVar->hasInitializer())
+            O << ".visible ";
+          else
+            O << ".extern ";
+        }
+      } else if (V->isDeclaration())
+        O << ".extern ";
+      else
+        O << ".visible ";
+    } else if (V->hasAppendingLinkage()) {
+      std::string msg;
+      msg.append("Error: ");
+      msg.append("Symbol ");
+      if (V->hasName())
+        msg.append(V->getName().str());
+      msg.append("has unsupported appending linkage type");
+      llvm_unreachable(msg.c_str());
+    }
+  }
+}
+
+void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
+                                         bool processDemoted) {
+
+  // Skip meta data
+  if (GVar->hasSection()) {
+    if (GVar->getSection() == "llvm.metadata")
+      return;
+  }
+
+  const DataLayout *TD = TM.getDataLayout();
+
+  // GlobalVariables are always constant pointers themselves.
+  const PointerType *PTy = GVar->getType();
+  Type *ETy = PTy->getElementType();
+
+  if (GVar->hasExternalLinkage()) {
+    if (GVar->hasInitializer())
+      O << ".visible ";
+    else
+      O << ".extern ";
+  }
+
+  if (llvm::isTexture(*GVar)) {
+    O << ".global .texref " << llvm::getTextureName(*GVar) << ";\n";
+    return;
+  }
+
+  if (llvm::isSurface(*GVar)) {
+    O << ".global .surfref " << llvm::getSurfaceName(*GVar) << ";\n";
+    return;
+  }
+
+  if (GVar->isDeclaration()) {
+    // (extern) declarations, no definition or initializer
+    // Currently the only known declaration is for an automatic __local
+    // (.shared) promoted to global.
+    emitPTXGlobalVariable(GVar, O);
+    O << ";\n";
+    return;
+  }
+
+  if (llvm::isSampler(*GVar)) {
+    O << ".global .samplerref " << llvm::getSamplerName(*GVar);
+
+    Constant *Initializer = NULL;
+    if (GVar->hasInitializer())
+      Initializer = GVar->getInitializer();
+    ConstantInt *CI = NULL;
+    if (Initializer)
+      CI = dyn_cast<ConstantInt>(Initializer);
+    if (CI) {
+      unsigned sample = CI->getZExtValue();
+
+      O << " = { ";
+
+      for (int i = 0,
+               addr = ((sample & __CLK_ADDRESS_MASK) >> __CLK_ADDRESS_BASE);
+           i < 3; i++) {
+        O << "addr_mode_" << i << " = ";
+        switch (addr) {
+        case 0:
+          O << "wrap";
+          break;
+        case 1:
+          O << "clamp_to_border";
+          break;
+        case 2:
+          O << "clamp_to_edge";
+          break;
+        case 3:
+          O << "wrap";
+          break;
+        case 4:
+          O << "mirror";
+          break;
+        }
+        O << ", ";
+      }
+      O << "filter_mode = ";
+      switch ((sample & __CLK_FILTER_MASK) >> __CLK_FILTER_BASE) {
+      case 0:
+        O << "nearest";
+        break;
+      case 1:
+        O << "linear";
+        break;
+      case 2:
+        assert(0 && "Anisotropic filtering is not supported");
+      default:
+        O << "nearest";
+        break;
+      }
+      if (!((sample & __CLK_NORMALIZED_MASK) >> __CLK_NORMALIZED_BASE)) {
+        O << ", force_unnormalized_coords = 1";
+      }
+      O << " }";
+    }
+
+    O << ";\n";
+    return;
+  }
+
+  if (GVar->hasPrivateLinkage()) {
+
+    if (!strncmp(GVar->getName().data(), "unrollpragma", 12))
+      return;
+
+    // FIXME - need better way (e.g. Metadata) to avoid generating this global
+    if (!strncmp(GVar->getName().data(), "filename", 8))
+      return;
+    if (GVar->use_empty())
+      return;
+  }
+
+  const Function *demotedFunc = 0;
+  if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) {
+    O << "// " << GVar->getName().str() << " has been demoted\n";
+    if (localDecls.find(demotedFunc) != localDecls.end())
+      localDecls[demotedFunc].push_back(GVar);
+    else {
+      std::vector<GlobalVariable *> temp;
+      temp.push_back(GVar);
+      localDecls[demotedFunc] = temp;
+    }
+    return;
+  }
+
+  O << ".";
+  emitPTXAddressSpace(PTy->getAddressSpace(), O);
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+  if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+    O << " .";
+    O << getPTXFundamentalTypeStr(ETy, false);
+    O << " ";
+    O << *Mang->getSymbol(GVar);
+
+    // Ptx allows variable initilization only for constant and global state
+    // spaces.
+    if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+         (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
+         (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
+        GVar->hasInitializer()) {
+      Constant *Initializer = GVar->getInitializer();
+      if (!Initializer->isNullValue()) {
+        O << " = ";
+        printScalarConstant(Initializer, O);
+      }
+    }
+  } else {
+    unsigned int ElementSize = 0;
+
+    // Although PTX has direct support for struct type and array type and
+    // LLVM IR is very similar to PTX, the LLVM CodeGen does not support for
+    // targets that support these high level field accesses. Structs, arrays
+    // and vectors are lowered into arrays of bytes.
+    switch (ETy->getTypeID()) {
+    case Type::StructTyID:
+    case Type::ArrayTyID:
+    case Type::VectorTyID:
+      ElementSize = TD->getTypeStoreSize(ETy);
+      // Ptx allows variable initilization only for constant and
+      // global state spaces.
+      if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
+           (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
+           (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
+          GVar->hasInitializer()) {
+        Constant *Initializer = GVar->getInitializer();
+        if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
+          AggBuffer aggBuffer(ElementSize, O, *this);
+          bufferAggregateConstant(Initializer, &aggBuffer);
+          if (aggBuffer.numSymbols) {
+            if (nvptxSubtarget.is64Bit()) {
+              O << " .u64 " << *Mang->getSymbol(GVar) << "[";
+              O << ElementSize / 8;
+            } else {
+              O << " .u32 " << *Mang->getSymbol(GVar) << "[";
+              O << ElementSize / 4;
+            }
+            O << "]";
+          } else {
+            O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+            O << ElementSize;
+            O << "]";
+          }
+          O << " = {";
+          aggBuffer.print();
+          O << "}";
+        } else {
+          O << " .b8 " << *Mang->getSymbol(GVar);
+          if (ElementSize) {
+            O << "[";
+            O << ElementSize;
+            O << "]";
+          }
+        }
+      } else {
+        O << " .b8 " << *Mang->getSymbol(GVar);
+        if (ElementSize) {
+          O << "[";
+          O << ElementSize;
+          O << "]";
+        }
+      }
+      break;
+    default:
+      assert(0 && "type not supported yet");
+    }
+
+  }
+  O << ";\n";
+}
+
+void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
+  if (localDecls.find(f) == localDecls.end())
+    return;
+
+  std::vector<GlobalVariable *> &gvars = localDecls[f];
+
+  for (unsigned i = 0, e = gvars.size(); i != e; ++i) {
+    O << "\t// demoted variable\n\t";
+    printModuleLevelGV(gvars[i], O, true);
+  }
+}
+
+void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
+                                          raw_ostream &O) const {
+  switch (AddressSpace) {
+  case llvm::ADDRESS_SPACE_LOCAL:
+    O << "local";
+    break;
+  case llvm::ADDRESS_SPACE_GLOBAL:
+    O << "global";
+    break;
+  case llvm::ADDRESS_SPACE_CONST:
+    // This logic should be consistent with that in
+    // getCodeAddrSpace() (NVPTXISelDATToDAT.cpp)
+    if (nvptxSubtarget.hasGenericLdSt())
+      O << "global";
+    else
+      O << "const";
+    break;
+  case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
+    O << "const";
+    break;
+  case llvm::ADDRESS_SPACE_SHARED:
+    O << "shared";
+    break;
+  default:
+    report_fatal_error("Bad address space found while emitting PTX");
+    break;
+  }
+}
+
+std::string
+NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const {
+  switch (Ty->getTypeID()) {
+  default:
+    llvm_unreachable("unexpected type");
+    break;
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1)
+      return "pred";
+    else if (NumBits <= 64) {
+      std::string name = "u";
+      return name + utostr(NumBits);
+    } else {
+      llvm_unreachable("Integer too large");
+      break;
+    }
+    break;
+  }
+  case Type::FloatTyID:
+    return "f32";
+  case Type::DoubleTyID:
+    return "f64";
+  case Type::PointerTyID:
+    if (nvptxSubtarget.is64Bit())
+      if (useB4PTR)
+        return "b64";
+      else
+        return "u64";
+    else if (useB4PTR)
+      return "b32";
+    else
+      return "u32";
+  }
+  llvm_unreachable("unexpected type");
+  return NULL;
+}
+
+void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
+                                            raw_ostream &O) {
+
+  const DataLayout *TD = TM.getDataLayout();
+
+  // GlobalVariables are always constant pointers themselves.
+  const PointerType *PTy = GVar->getType();
+  Type *ETy = PTy->getElementType();
+
+  O << ".";
+  emitPTXAddressSpace(PTy->getAddressSpace(), O);
+  if (GVar->getAlignment() == 0)
+    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
+  else
+    O << " .align " << GVar->getAlignment();
+
+  if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
+    O << " .";
+    O << getPTXFundamentalTypeStr(ETy);
+    O << " ";
+    O << *Mang->getSymbol(GVar);
+    return;
+  }
+
+  int64_t ElementSize = 0;
+
+  // Although PTX has direct support for struct type and array type and LLVM IR
+  // is very similar to PTX, the LLVM CodeGen does not support for targets that
+  // support these high level field accesses. Structs and arrays are lowered
+  // into arrays of bytes.
+  switch (ETy->getTypeID()) {
+  case Type::StructTyID:
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+    ElementSize = TD->getTypeStoreSize(ETy);
+    O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+    if (ElementSize) {
+      O << itostr(ElementSize);
+    }
+    O << "]";
+    break;
+  default:
+    assert(0 && "type not supported yet");
+  }
+  return;
+}
+
+static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) {
+  if (Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<PointerType>(Ty))
+    return TD->getPrefTypeAlignment(Ty);
+
+  const ArrayType *ATy = dyn_cast<ArrayType>(Ty);
+  if (ATy)
+    return getOpenCLAlignment(TD, ATy->getElementType());
+
+  const VectorType *VTy = dyn_cast<VectorType>(Ty);
+  if (VTy) {
+    Type *ETy = VTy->getElementType();
+    unsigned int numE = VTy->getNumElements();
+    unsigned int alignE = TD->getPrefTypeAlignment(ETy);
+    if (numE == 3)
+      return 4 * alignE;
+    else
+      return numE * alignE;
+  }
+
+  const StructType *STy = dyn_cast<StructType>(Ty);
+  if (STy) {
+    unsigned int alignStruct = 1;
+    // Go through each element of the struct and find the
+    // largest alignment.
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; i++) {
+      Type *ETy = STy->getElementType(i);
+      unsigned int align = getOpenCLAlignment(TD, ETy);
+      if (align > alignStruct)
+        alignStruct = align;
+    }
+    return alignStruct;
+  }
+
+  const FunctionType *FTy = dyn_cast<FunctionType>(Ty);
+  if (FTy)
+    return TD->getPointerPrefAlignment();
+  return TD->getPrefTypeAlignment(Ty);
+}
+
+void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
+                                     int paramIndex, raw_ostream &O) {
+  if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
+      (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
+    O << *CurrentFnSym << "_param_" << paramIndex;
+  else {
+    std::string argName = I->getName();
+    const char *p = argName.c_str();
+    while (*p) {
+      if (*p == '.')
+        O << "_";
+      else
+        O << *p;
+      p++;
+    }
+  }
+}
+
+void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) {
+  Function::const_arg_iterator I, E;
+  int i = 0;
+
+  if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
+      (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) {
+    O << *CurrentFnSym << "_param_" << paramIndex;
+    return;
+  }
+
+  for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, i++) {
+    if (i == paramIndex) {
+      printParamName(I, paramIndex, O);
+      return;
+    }
+  }
+  llvm_unreachable("paramIndex out of bound");
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
+  const DataLayout *TD = TM.getDataLayout();
+  const AttributeSet &PAL = F->getAttributes();
+  const TargetLowering *TLI = TM.getTargetLowering();
+  Function::const_arg_iterator I, E;
+  unsigned paramIndex = 0;
+  bool first = true;
+  bool isKernelFunc = llvm::isKernelFunction(*F);
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  MVT thePointerTy = TLI->getPointerTy();
+
+  O << "(\n";
+
+  for (I = F->arg_begin(), E = F->arg_end(); I != E; ++I, paramIndex++) {
+    Type *Ty = I->getType();
+
+    if (!first)
+      O << ",\n";
+
+    first = false;
+
+    // Handle image/sampler parameters
+    if (llvm::isSampler(*I) || llvm::isImage(*I)) {
+      if (llvm::isImage(*I)) {
+        std::string sname = I->getName();
+        if (llvm::isImageWriteOnly(*I))
+          O << "\t.param .surfref " << *CurrentFnSym << "_param_" << paramIndex;
+        else // Default image is read_only
+          O << "\t.param .texref " << *CurrentFnSym << "_param_" << paramIndex;
+      } else // Should be llvm::isSampler(*I)
+        O << "\t.param .samplerref " << *CurrentFnSym << "_param_"
+          << paramIndex;
+      continue;
+    }
+
+    if (PAL.hasAttribute(paramIndex + 1, Attribute::ByVal) == false) {
+      if (Ty->isVectorTy()) {
+        // Just print .param .b8 .align <a> .param[size];
+        // <a> = PAL.getparamalignment
+        // size = typeallocsize of element type
+        unsigned align = PAL.getParamAlignment(paramIndex + 1);
+        if (align == 0)
+          align = TD->getABITypeAlignment(Ty);
+
+        unsigned sz = TD->getTypeAllocSize(Ty);
+        O << "\t.param .align " << align << " .b8 ";
+        printParamName(I, paramIndex, O);
+        O << "[" << sz << "]";
+
+        continue;
+      }
+      // Just a scalar
+      const PointerType *PTy = dyn_cast<PointerType>(Ty);
+      if (isKernelFunc) {
+        if (PTy) {
+          // Special handling for pointer arguments to kernel
+          O << "\t.param .u" << thePointerTy.getSizeInBits() << " ";
+
+          if (nvptxSubtarget.getDrvInterface() != NVPTX::CUDA) {
+            Type *ETy = PTy->getElementType();
+            int addrSpace = PTy->getAddressSpace();
+            switch (addrSpace) {
+            default:
+              O << ".ptr ";
+              break;
+            case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
+              O << ".ptr .const ";
+              break;
+            case llvm::ADDRESS_SPACE_SHARED:
+              O << ".ptr .shared ";
+              break;
+            case llvm::ADDRESS_SPACE_GLOBAL:
+            case llvm::ADDRESS_SPACE_CONST:
+              O << ".ptr .global ";
+              break;
+            }
+            O << ".align " << (int) getOpenCLAlignment(TD, ETy) << " ";
+          }
+          printParamName(I, paramIndex, O);
+          continue;
+        }
+
+        // non-pointer scalar to kernel func
+        O << "\t.param ." << getPTXFundamentalTypeStr(Ty) << " ";
+        printParamName(I, paramIndex, O);
+        continue;
+      }
+      // Non-kernel function, just print .param .b<size> for ABI
+      // and .reg .b<size> for non ABY
+      unsigned sz = 0;
+      if (isa<IntegerType>(Ty)) {
+        sz = cast<IntegerType>(Ty)->getBitWidth();
+        if (sz < 32)
+          sz = 32;
+      } else if (isa<PointerType>(Ty))
+        sz = thePointerTy.getSizeInBits();
+      else
+        sz = Ty->getPrimitiveSizeInBits();
+      if (isABI)
+        O << "\t.param .b" << sz << " ";
+      else
+        O << "\t.reg .b" << sz << " ";
+      printParamName(I, paramIndex, O);
+      continue;
+    }
+
+    // param has byVal attribute. So should be a pointer
+    const PointerType *PTy = dyn_cast<PointerType>(Ty);
+    assert(PTy && "Param with byval attribute should be a pointer type");
+    Type *ETy = PTy->getElementType();
+
+    if (isABI || isKernelFunc) {
+      // Just print .param .b8 .align <a> .param[size];
+      // <a> = PAL.getparamalignment
+      // size = typeallocsize of element type
+      unsigned align = PAL.getParamAlignment(paramIndex + 1);
+      if (align == 0)
+        align = TD->getABITypeAlignment(ETy);
+
+      unsigned sz = TD->getTypeAllocSize(ETy);
+      O << "\t.param .align " << align << " .b8 ";
+      printParamName(I, paramIndex, O);
+      O << "[" << sz << "]";
+      continue;
+    } else {
+      // Split the ETy into constituent parts and
+      // print .param .b<size> <name> for each part.
+      // Further, if a part is vector, print the above for
+      // each vector element.
+      SmallVector<EVT, 16> vtparts;
+      ComputeValueVTs(*TLI, ETy, vtparts);
+      for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+        unsigned elems = 1;
+        EVT elemtype = vtparts[i];
+        if (vtparts[i].isVector()) {
+          elems = vtparts[i].getVectorNumElements();
+          elemtype = vtparts[i].getVectorElementType();
+        }
+
+        for (unsigned j = 0, je = elems; j != je; ++j) {
+          unsigned sz = elemtype.getSizeInBits();
+          if (elemtype.isInteger() && (sz < 32))
+            sz = 32;
+          O << "\t.reg .b" << sz << " ";
+          printParamName(I, paramIndex, O);
+          if (j < je - 1)
+            O << ",\n";
+          ++paramIndex;
+        }
+        if (i < e - 1)
+          O << ",\n";
+      }
+      --paramIndex;
+      continue;
+    }
+  }
+
+  O << "\n)\n";
+}
+
+void NVPTXAsmPrinter::emitFunctionParamList(const MachineFunction &MF,
+                                            raw_ostream &O) {
+  const Function *F = MF.getFunction();
+  emitFunctionParamList(F, O);
+}
+
+void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
+    const MachineFunction &MF) {
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+
+  // Map the global virtual register number to a register class specific
+  // virtual register number starting from 1 with that class.
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  //unsigned numRegClasses = TRI->getNumRegClasses();
+
+  // Emit the Fake Stack Object
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int NumBytes = (int) MFI->getStackSize();
+  if (NumBytes) {
+    O << "\t.local .align " << MFI->getMaxAlignment() << " .b8 \t" << DEPOTNAME
+      << getFunctionNumber() << "[" << NumBytes << "];\n";
+    if (nvptxSubtarget.is64Bit()) {
+      O << "\t.reg .b64 \t%SP;\n";
+      O << "\t.reg .b64 \t%SPL;\n";
+    } else {
+      O << "\t.reg .b32 \t%SP;\n";
+      O << "\t.reg .b32 \t%SPL;\n";
+    }
+  }
+
+  // Go through all virtual registers to establish the mapping between the
+  // global virtual
+  // register number and the per class virtual register number.
+  // We use the per class virtual register number in the ptx output.
+  unsigned int numVRs = MRI->getNumVirtRegs();
+  for (unsigned i = 0; i < numVRs; i++) {
+    unsigned int vr = TRI->index2VirtReg(i);
+    const TargetRegisterClass *RC = MRI->getRegClass(vr);
+    std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[RC->getID()];
+    int n = regmap.size();
+    regmap.insert(std::make_pair(vr, n + 1));
+  }
+
+  // Emit register declarations
+  // @TODO: Extract out the real register usage
+  O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
+  O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
+  O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
+  O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
+  O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
+  O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
+  O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
+
+  // Emit declaration of the virtual registers or 'physical' registers for
+  // each register class
+  //for (unsigned i=0; i< numRegClasses; i++) {
+  //    std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[i];
+  //    const TargetRegisterClass *RC = TRI->getRegClass(i);
+  //    std::string rcname = getNVPTXRegClassName(RC);
+  //    std::string rcStr = getNVPTXRegClassStr(RC);
+  //    //int n = regmap.size();
+  //    if (!isNVPTXVectorRegClass(RC)) {
+  //      O << "\t.reg " << rcname << " \t" << rcStr << "<"
+  //        << NVPTXNumRegisters << ">;\n";
+  //    }
+
+  // Only declare those registers that may be used. And do not emit vector
+  // registers as
+  // they are all elementized to scalar registers.
+  //if (n && !isNVPTXVectorRegClass(RC)) {
+  //    if (RegAllocNilUsed) {
+  //        O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
+  //          << ">;\n";
+  //    }
+  //    else {
+  //        O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr)
+  //          << "<" << 32 << ">;\n";
+  //    }
+  //}
+  //}
+
+  OutStreamer.EmitRawText(O.str());
+}
+
+void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) {
+  APFloat APF = APFloat(Fp->getValueAPF()); // make a copy
+  bool ignored;
+  unsigned int numHex;
+  const char *lead;
+
+  if (Fp->getType()->getTypeID() == Type::FloatTyID) {
+    numHex = 8;
+    lead = "0f";
+    APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &ignored);
+  } else if (Fp->getType()->getTypeID() == Type::DoubleTyID) {
+    numHex = 16;
+    lead = "0d";
+    APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
+  } else
+    llvm_unreachable("unsupported fp type");
+
+  APInt API = APF.bitcastToAPInt();
+  std::string hexstr(utohexstr(API.getZExtValue()));
+  O << lead;
+  if (hexstr.length() < numHex)
+    O << std::string(numHex - hexstr.length(), '0');
+  O << utohexstr(API.getZExtValue());
+}
+
+void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) {
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+    O << CI->getValue();
+    return;
+  }
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) {
+    printFPConstant(CFP, O);
+    return;
+  }
+  if (isa<ConstantPointerNull>(CPV)) {
+    O << "0";
+    return;
+  }
+  if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+    O << *Mang->getSymbol(GVar);
+    return;
+  }
+  if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+    Value *v = Cexpr->stripPointerCasts();
+    if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+      O << *Mang->getSymbol(GVar);
+      return;
+    } else {
+      O << *LowerConstant(CPV, *this);
+      return;
+    }
+  }
+  llvm_unreachable("Not scalar type found in printScalarConstant()");
+}
+
+void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
+                                   AggBuffer *aggBuffer) {
+
+  const DataLayout *TD = TM.getDataLayout();
+
+  if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
+    int s = TD->getTypeAllocSize(CPV->getType());
+    if (s < Bytes)
+      s = Bytes;
+    aggBuffer->addZeros(s);
+    return;
+  }
+
+  unsigned char *ptr;
+  switch (CPV->getType()->getTypeID()) {
+
+  case Type::IntegerTyID: {
+    const Type *ETy = CPV->getType();
+    if (ETy == Type::getInt8Ty(CPV->getContext())) {
+      unsigned char c =
+          (unsigned char)(dyn_cast<ConstantInt>(CPV))->getZExtValue();
+      ptr = &c;
+      aggBuffer->addBytes(ptr, 1, Bytes);
+    } else if (ETy == Type::getInt16Ty(CPV->getContext())) {
+      short int16 = (short)(dyn_cast<ConstantInt>(CPV))->getZExtValue();
+      ptr = (unsigned char *)&int16;
+      aggBuffer->addBytes(ptr, 2, Bytes);
+    } else if (ETy == Type::getInt32Ty(CPV->getContext())) {
+      if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+        int int32 = (int)(constInt->getZExtValue());
+        ptr = (unsigned char *)&int32;
+        aggBuffer->addBytes(ptr, 4, Bytes);
+        break;
+      } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+        if (ConstantInt *constInt = dyn_cast<ConstantInt>(
+                ConstantFoldConstantExpression(Cexpr, TD))) {
+          int int32 = (int)(constInt->getZExtValue());
+          ptr = (unsigned char *)&int32;
+          aggBuffer->addBytes(ptr, 4, Bytes);
+          break;
+        }
+        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+          aggBuffer->addSymbol(v);
+          aggBuffer->addZeros(4);
+          break;
+        }
+      }
+      llvm_unreachable("unsupported integer const type");
+    } else if (ETy == Type::getInt64Ty(CPV->getContext())) {
+      if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+        long long int64 = (long long)(constInt->getZExtValue());
+        ptr = (unsigned char *)&int64;
+        aggBuffer->addBytes(ptr, 8, Bytes);
+        break;
+      } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+        if (ConstantInt *constInt = dyn_cast<ConstantInt>(
+                ConstantFoldConstantExpression(Cexpr, TD))) {
+          long long int64 = (long long)(constInt->getZExtValue());
+          ptr = (unsigned char *)&int64;
+          aggBuffer->addBytes(ptr, 8, Bytes);
+          break;
+        }
+        if (Cexpr->getOpcode() == Instruction::PtrToInt) {
+          Value *v = Cexpr->getOperand(0)->stripPointerCasts();
+          aggBuffer->addSymbol(v);
+          aggBuffer->addZeros(8);
+          break;
+        }
+      }
+      llvm_unreachable("unsupported integer const type");
+    } else
+      llvm_unreachable("unsupported integer const type");
+    break;
+  }
+  case Type::FloatTyID:
+  case Type::DoubleTyID: {
+    ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
+    const Type *Ty = CFP->getType();
+    if (Ty == Type::getFloatTy(CPV->getContext())) {
+      float float32 = (float) CFP->getValueAPF().convertToFloat();
+      ptr = (unsigned char *)&float32;
+      aggBuffer->addBytes(ptr, 4, Bytes);
+    } else if (Ty == Type::getDoubleTy(CPV->getContext())) {
+      double float64 = CFP->getValueAPF().convertToDouble();
+      ptr = (unsigned char *)&float64;
+      aggBuffer->addBytes(ptr, 8, Bytes);
+    } else {
+      llvm_unreachable("unsupported fp const type");
+    }
+    break;
+  }
+  case Type::PointerTyID: {
+    if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+      aggBuffer->addSymbol(GVar);
+    } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+      Value *v = Cexpr->stripPointerCasts();
+      aggBuffer->addSymbol(v);
+    }
+    unsigned int s = TD->getTypeAllocSize(CPV->getType());
+    aggBuffer->addZeros(s);
+    break;
+  }
+
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+  case Type::StructTyID: {
+    if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
+        isa<ConstantStruct>(CPV)) {
+      int ElementSize = TD->getTypeAllocSize(CPV->getType());
+      bufferAggregateConstant(CPV, aggBuffer);
+      if (Bytes > ElementSize)
+        aggBuffer->addZeros(Bytes - ElementSize);
+    } else if (isa<ConstantAggregateZero>(CPV))
+      aggBuffer->addZeros(Bytes);
+    else
+      llvm_unreachable("Unexpected Constant type");
+    break;
+  }
+
+  default:
+    llvm_unreachable("unsupported type");
+  }
+}
+
+void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV,
+                                              AggBuffer *aggBuffer) {
+  const DataLayout *TD = TM.getDataLayout();
+  int Bytes;
+
+  // Old constants
+  if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
+    if (CPV->getNumOperands())
+      for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i)
+        bufferLEByte(cast<Constant>(CPV->getOperand(i)), 0, aggBuffer);
+    return;
+  }
+
+  if (const ConstantDataSequential *CDS =
+          dyn_cast<ConstantDataSequential>(CPV)) {
+    if (CDS->getNumElements())
+      for (unsigned i = 0; i < CDS->getNumElements(); ++i)
+        bufferLEByte(cast<Constant>(CDS->getElementAsConstant(i)), 0,
+                     aggBuffer);
+    return;
+  }
+
+  if (isa<ConstantStruct>(CPV)) {
+    if (CPV->getNumOperands()) {
+      StructType *ST = cast<StructType>(CPV->getType());
+      for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
+        if (i == (e - 1))
+          Bytes = TD->getStructLayout(ST)->getElementOffset(0) +
+                  TD->getTypeAllocSize(ST) -
+                  TD->getStructLayout(ST)->getElementOffset(i);
+        else
+          Bytes = TD->getStructLayout(ST)->getElementOffset(i + 1) -
+                  TD->getStructLayout(ST)->getElementOffset(i);
+        bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, aggBuffer);
+      }
+    }
+    return;
+  }
+  llvm_unreachable("unsupported constant type in printAggregateConstant()");
+}
+
+// buildTypeNameMap - Run through symbol table looking for type names.
+//
+
+bool NVPTXAsmPrinter::isImageType(const Type *Ty) {
+
+  std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty);
+
+  if (PI != TypeNameMap.end() && (!PI->second.compare("struct._image1d_t") ||
+                                  !PI->second.compare("struct._image2d_t") ||
+                                  !PI->second.compare("struct._image3d_t")))
+    return true;
+
+  return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+      break;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+
+  return false;
+}
+
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
+    const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
+    const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier
+
+  O << '[';
+  printMemOperand(MI, OpNo, O);
+  O << ']';
+
+  return false;
+}
+
+bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case NVPTX::CallArgBeginInst:
+  case NVPTX::CallArgEndInst0:
+  case NVPTX::CallArgEndInst1:
+  case NVPTX::CallArgF32:
+  case NVPTX::CallArgF64:
+  case NVPTX::CallArgI16:
+  case NVPTX::CallArgI32:
+  case NVPTX::CallArgI32imm:
+  case NVPTX::CallArgI64:
+  case NVPTX::CallArgI8:
+  case NVPTX::CallArgParam:
+  case NVPTX::CallVoidInst:
+  case NVPTX::CallVoidInstReg:
+  case NVPTX::Callseq_End:
+  case NVPTX::CallVoidInstReg64:
+  case NVPTX::DeclareParamInst:
+  case NVPTX::DeclareRetMemInst:
+  case NVPTX::DeclareRetRegInst:
+  case NVPTX::DeclareRetScalarInst:
+  case NVPTX::DeclareScalarParamInst:
+  case NVPTX::DeclareScalarRegInst:
+  case NVPTX::StoreParamF32:
+  case NVPTX::StoreParamF64:
+  case NVPTX::StoreParamI16:
+  case NVPTX::StoreParamI32:
+  case NVPTX::StoreParamI64:
+  case NVPTX::StoreParamI8:
+  case NVPTX::StoreParamS32I8:
+  case NVPTX::StoreParamU32I8:
+  case NVPTX::StoreParamS32I16:
+  case NVPTX::StoreParamU32I16:
+  case NVPTX::StoreRetvalF32:
+  case NVPTX::StoreRetvalF64:
+  case NVPTX::StoreRetvalI16:
+  case NVPTX::StoreRetvalI32:
+  case NVPTX::StoreRetvalI64:
+  case NVPTX::StoreRetvalI8:
+  case NVPTX::LastCallArgF32:
+  case NVPTX::LastCallArgF64:
+  case NVPTX::LastCallArgI16:
+  case NVPTX::LastCallArgI32:
+  case NVPTX::LastCallArgI32imm:
+  case NVPTX::LastCallArgI64:
+  case NVPTX::LastCallArgI8:
+  case NVPTX::LastCallArgParam:
+  case NVPTX::LoadParamMemF32:
+  case NVPTX::LoadParamMemF64:
+  case NVPTX::LoadParamMemI16:
+  case NVPTX::LoadParamMemI32:
+  case NVPTX::LoadParamMemI64:
+  case NVPTX::LoadParamMemI8:
+  case NVPTX::LoadParamRegF32:
+  case NVPTX::LoadParamRegF64:
+  case NVPTX::LoadParamRegI16:
+  case NVPTX::LoadParamRegI32:
+  case NVPTX::LoadParamRegI64:
+  case NVPTX::LoadParamRegI8:
+  case NVPTX::PrototypeInst:
+  case NVPTX::DBG_VALUE:
+    return true;
+  }
+  return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() {
+  RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
+  RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64);
+}
+
+void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
+  std::stringstream temp;
+  LineReader *reader = this->getReader(filename.str());
+  temp << "\n//";
+  temp << filename.str();
+  temp << ":";
+  temp << line;
+  temp << " ";
+  temp << reader->readLine(line);
+  temp << "\n";
+  this->OutStreamer.EmitRawText(Twine(temp.str()));
+}
+
+LineReader *NVPTXAsmPrinter::getReader(std::string filename) {
+  if (reader == NULL) {
+    reader = new LineReader(filename);
+  }
+
+  if (reader->fileName() != filename) {
+    delete reader;
+    reader = new LineReader(filename);
+  }
+
+  return reader;
+}
+
+std::string LineReader::readLine(unsigned lineNum) {
+  if (lineNum < theCurLine) {
+    theCurLine = 0;
+    fstr.seekg(0, std::ios::beg);
+  }
+  while (theCurLine < lineNum) {
+    fstr.getline(buff, 500);
+    theCurLine++;
+  }
+  return buff;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeNVPTXAsmPrinter() {
+  RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
+  RegisterAsmPrinter<NVPTXAsmPrinter> Y(TheNVPTXTarget64);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
new file mode 100644
index 000000000000..6dc9fc0ffeff
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -0,0 +1,298 @@
+//===-- NVPTXAsmPrinter.h - NVPTX LLVM assembly writer --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to NVPTX assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXASMPRINTER_H
+#define NVPTXASMPRINTER_H
+
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetMachine.h"
+#include <fstream>
+
+// The ptx syntax and format is very different from that usually seem in a .s
+// file,
+// therefore we are not able to use the MCAsmStreamer interface here.
+//
+// We are handcrafting the output method here.
+//
+// A better approach is to clone the MCAsmStreamer to a MCPTXAsmStreamer
+// (subclass of MCStreamer).
+
+// This is defined in AsmPrinter.cpp.
+// Used to process the constant expressions in initializers.
+namespace nvptx {
+const llvm::MCExpr *
+LowerConstant(const llvm::Constant *CV, llvm::AsmPrinter &AP);
+}
+
+namespace llvm {
+
+class LineReader {
+private:
+  unsigned theCurLine;
+  std::ifstream fstr;
+  char buff[512];
+  std::string theFileName;
+  SmallVector<unsigned, 32> lineOffset;
+public:
+  LineReader(std::string filename) {
+    theCurLine = 0;
+    fstr.open(filename.c_str());
+    theFileName = filename;
+  }
+  std::string fileName() { return theFileName; }
+  ~LineReader() { fstr.close(); }
+  std::string readLine(unsigned line);
+};
+
+class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
+
+  class AggBuffer {
+    // Used to buffer the emitted string for initializing global
+    // aggregates.
+    //
+    // Normally an aggregate (array, vector or structure) is emitted
+    // as a u8[]. However, if one element/field of the aggregate
+    // is a non-NULL address, then the aggregate is emitted as u32[]
+    // or u64[].
+    //
+    // We first layout the aggregate in 'buffer' in bytes, except for
+    // those symbol addresses. For the i-th symbol address in the
+    //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
+    // are filled with 0s. symbolPosInBuffer[i-1] records its position
+    // in 'buffer', and Symbols[i-1] records the Value*.
+    //
+    // Once we have this AggBuffer setup, we can choose how to print
+    // it out.
+  public:
+    unsigned size;         // size of the buffer in bytes
+    unsigned char *buffer; // the buffer
+    unsigned numSymbols;   // number of symbol addresses
+    SmallVector<unsigned, 4> symbolPosInBuffer;
+    SmallVector<Value *, 4> Symbols;
+
+  private:
+    unsigned curpos;
+    raw_ostream &O;
+    NVPTXAsmPrinter &AP;
+
+  public:
+    AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP)
+        : O(_O), AP(_AP) {
+      buffer = new unsigned char[_size];
+      size = _size;
+      curpos = 0;
+      numSymbols = 0;
+    }
+    ~AggBuffer() { delete[] buffer; }
+    unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
+      assert((curpos + Num) <= size);
+      assert((curpos + Bytes) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = Ptr[i];
+        curpos++;
+      }
+      for (int i = Num; i < Bytes; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+    unsigned addZeros(int Num) {
+      assert((curpos + Num) <= size);
+      for (int i = 0; i < Num; ++i) {
+        buffer[curpos] = 0;
+        curpos++;
+      }
+      return curpos;
+    }
+    void addSymbol(Value *GVar) {
+      symbolPosInBuffer.push_back(curpos);
+      Symbols.push_back(GVar);
+      numSymbols++;
+    }
+    void print() {
+      if (numSymbols == 0) {
+        // print out in bytes
+        for (unsigned i = 0; i < size; i++) {
+          if (i)
+            O << ", ";
+          O << (unsigned int) buffer[i];
+        }
+      } else {
+        // print out in 4-bytes or 8-bytes
+        unsigned int pos = 0;
+        unsigned int nSym = 0;
+        unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+        unsigned int nBytes = 4;
+        if (AP.nvptxSubtarget.is64Bit())
+          nBytes = 8;
+        for (pos = 0; pos < size; pos += nBytes) {
+          if (pos)
+            O << ", ";
+          if (pos == nextSymbolPos) {
+            Value *v = Symbols[nSym];
+            if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+              MCSymbol *Name = AP.Mang->getSymbol(GVar);
+              O << *Name;
+            } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
+              O << *nvptx::LowerConstant(Cexpr, AP);
+            } else
+              llvm_unreachable("symbol type unknown");
+            nSym++;
+            if (nSym >= numSymbols)
+              nextSymbolPos = size + 1;
+            else
+              nextSymbolPos = symbolPosInBuffer[nSym];
+          } else if (nBytes == 4)
+            O << *(unsigned int *)(buffer + pos);
+          else
+            O << *(unsigned long long *)(buffer + pos);
+        }
+      }
+    }
+  };
+
+  friend class AggBuffer;
+
+  virtual void emitSrcInText(StringRef filename, unsigned line);
+
+private:
+  virtual const char *getPassName() const { return "NVPTX Assembly Printer"; }
+
+  const Function *F;
+  std::string CurrentFnName;
+
+  void EmitFunctionEntryLabel();
+  void EmitFunctionBodyStart();
+  void EmitFunctionBodyEnd();
+
+  void EmitInstruction(const MachineInstr *);
+
+  void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {}
+
+  void printGlobalVariable(const GlobalVariable *GVar);
+  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                    const char *Modifier = 0);
+  void printLdStCode(const MachineInstr *MI, int opNum, raw_ostream &O,
+                     const char *Modifier = 0);
+  void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
+                                 raw_ostream &O);
+  void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                       const char *Modifier = 0);
+  void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
+  // definition autogenerated.
+  void printInstruction(const MachineInstr *MI, raw_ostream &O);
+  void printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, bool = false);
+  void printParamName(int paramIndex, raw_ostream &O);
+  void printParamName(Function::const_arg_iterator I, int paramIndex,
+                      raw_ostream &O);
+  void emitHeader(Module &M, raw_ostream &O);
+  void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
+  void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O);
+  void emitFunctionExternParamList(const MachineFunction &MF);
+  void emitFunctionParamList(const Function *, raw_ostream &O);
+  void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
+  void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF);
+  void emitFunctionTempData(const MachineFunction &MF, unsigned &FrameSize);
+  bool isImageType(const Type *Ty);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &);
+  void printReturnValStr(const Function *, raw_ostream &O);
+  void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
+
+protected:
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+
+private:
+  std::string CurrentBankselLabelInBasicBlock;
+
+  // This is specific per MachineFunction.
+  const MachineRegisterInfo *MRI;
+  // The contents are specific for each
+  // MachineFunction. But the size of the
+  // array is not.
+  std::map<unsigned, unsigned> *VRidGlobal2LocalMap;
+  // cache the subtarget here.
+  const NVPTXSubtarget &nvptxSubtarget;
+  // Build the map between type name and ID based on module's type
+  // symbol table.
+  std::map<const Type *, std::string> TypeNameMap;
+
+  // List of variables demoted to a function scope.
+  std::map<const Function *, std::vector<GlobalVariable *> > localDecls;
+
+  // To record filename to ID mapping
+  std::map<std::string, unsigned> filenameMap;
+  void recordAndEmitFilenames(Module &);
+
+  void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
+  void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
+  std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const;
+  void printScalarConstant(Constant *CPV, raw_ostream &O);
+  void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
+  void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+  void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer);
+
+  void printOperandProper(const MachineOperand &MO);
+
+  void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
+  void emitDeclarations(Module &, raw_ostream &O);
+  void emitDeclaration(const Function *, raw_ostream &O);
+
+  static const char *getRegisterName(unsigned RegNo);
+  void emitDemotedVars(const Function *, raw_ostream &);
+
+  LineReader *reader;
+  LineReader *getReader(std::string);
+public:
+  NVPTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer),
+        nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+    CurrentBankselLabelInBasicBlock = "";
+    VRidGlobal2LocalMap = NULL;
+    reader = NULL;
+  }
+
+  ~NVPTXAsmPrinter() {
+    if (!reader)
+      delete reader;
+  }
+
+  bool ignoreLoc(const MachineInstr &);
+
+  virtual void getVirtualRegisterName(unsigned, bool, raw_ostream &);
+
+  DebugLoc prevDebugLoc;
+  void emitLineNumberAsDotLoc(const MachineInstr &);
+};
+} // end of namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
new file mode 100644
index 000000000000..6533da5102b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -0,0 +1,78 @@
+//=======- NVPTXFrameLowering.cpp - NVPTX Frame Information ---*- C++ -*-=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXFrameLowering.h"
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
+
+void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
+  if (MF.getFrameInfo()->hasStackObjects()) {
+    MachineBasicBlock &MBB = MF.front();
+    // Insert "mov.u32 %SP, %Depot"
+    MachineBasicBlock::iterator MBBI = MBB.begin();
+    // This instruction really occurs before first instruction
+    // in the BB, so giving it no debug location.
+    DebugLoc dl = DebugLoc();
+
+    if (tm.getSubtargetImpl()->hasGenericLdSt()) {
+      // mov %SPL, %depot;
+      // cvta.local %SP, %SPL;
+      if (is64bit) {
+        MachineInstr *MI = BuildMI(
+            MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64),
+            NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal);
+        BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::IMOV64rr),
+                NVPTX::VRFrameLocal).addReg(NVPTX::VRDepot);
+      } else {
+        MachineInstr *MI = BuildMI(
+            MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes),
+            NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal);
+        BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::IMOV32rr),
+                NVPTX::VRFrameLocal).addReg(NVPTX::VRDepot);
+      }
+    } else {
+      // mov %SP, %depot;
+      if (is64bit)
+        BuildMI(MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::IMOV64rr),
+                NVPTX::VRFrame).addReg(NVPTX::VRDepot);
+      else
+        BuildMI(MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::IMOV32rr),
+                NVPTX::VRFrame).addReg(NVPTX::VRDepot);
+    }
+  }
+}
+
+void NVPTXFrameLowering::emitEpilogue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void NVPTXFrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  // Simply discard ADJCALLSTACKDOWN,
+  // ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
new file mode 100644
index 000000000000..819f1dd3f4be
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -0,0 +1,42 @@
+//===--- NVPTXFrameLowering.h - Define frame lowering for NVPTX -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_FRAMELOWERING_H
+#define NVPTX_FRAMELOWERING_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+class NVPTXTargetMachine;
+
+class NVPTXFrameLowering : public TargetFrameLowering {
+  NVPTXTargetMachine &tm;
+  bool is64bit;
+
+public:
+  explicit NVPTXFrameLowering(NVPTXTargetMachine &_tm, bool _is64bit)
+      : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), tm(_tm),
+        is64bit(_is64bit) {}
+
+  virtual bool hasFP(const MachineFunction &MF) const;
+  virtual void emitPrologue(MachineFunction &MF) const;
+  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
new file mode 100644
index 000000000000..e862988c85d1
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -0,0 +1,1733 @@
+//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelDAGToDAG.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-isel"
+
+using namespace llvm;
+
+static cl::opt<bool> UseFMADInstruction(
+    "nvptx-mad-enable", cl::ZeroOrMore,
+    cl::desc("NVPTX Specific: Enable generating FMAD instructions"),
+    cl::init(false));
+
+static cl::opt<int>
+FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore,
+                 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+                          " 1: do it  2: do it aggressively"),
+                 cl::init(2));
+
+static cl::opt<int> UsePrecDivF32(
+    "nvptx-prec-divf32", cl::ZeroOrMore,
+    cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
+             " IEEE Compliant F32 div.rnd if avaiable."),
+    cl::init(2));
+
+/// createNVPTXISelDag - This pass converts a legalized DAG into a
+/// NVPTX-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
+                                       llvm::CodeGenOpt::Level OptLevel) {
+  return new NVPTXDAGToDAGISel(TM, OptLevel);
+}
+
+NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+                                     CodeGenOpt::Level OptLevel)
+    : SelectionDAGISel(tm, OptLevel),
+      Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
+  // Always do fma.f32 fpcontract if the target supports the instruction.
+  // Always do fma.f64 fpcontract if the target supports the instruction.
+  // Do mad.f32 is nvptx-mad-enable is specified and the target does not
+  // support fma.f32.
+
+  doFMADF32 = (OptLevel > 0) && UseFMADInstruction && !Subtarget.hasFMAF32();
+  doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
+  doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
+  doFMAF32AGG =
+      (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel == 2);
+  doFMAF64AGG =
+      (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
+
+  allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction;
+
+  UseF32FTZ = false;
+
+  doMulWide = (OptLevel > 0);
+
+  // Decide how to translate f32 div
+  do_DIVF32_PREC = UsePrecDivF32;
+  // sm less than sm_20 does not support div.rnd. Use div.full.
+  if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
+    do_DIVF32_PREC = 1;
+
+}
+
+/// Select - Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
+
+  if (N->isMachineOpcode())
+    return NULL; // Already selected.
+
+  SDNode *ResNode = NULL;
+  switch (N->getOpcode()) {
+  case ISD::LOAD:
+    ResNode = SelectLoad(N);
+    break;
+  case ISD::STORE:
+    ResNode = SelectStore(N);
+    break;
+  case NVPTXISD::LoadV2:
+  case NVPTXISD::LoadV4:
+    ResNode = SelectLoadVector(N);
+    break;
+  case NVPTXISD::LDGV2:
+  case NVPTXISD::LDGV4:
+  case NVPTXISD::LDUV2:
+  case NVPTXISD::LDUV4:
+    ResNode = SelectLDGLDUVector(N);
+    break;
+  case NVPTXISD::StoreV2:
+  case NVPTXISD::StoreV4:
+    ResNode = SelectStoreVector(N);
+    break;
+  default:
+    break;
+  }
+  if (ResNode)
+    return ResNode;
+  return SelectCode(N);
+}
+
+static unsigned int getCodeAddrSpace(MemSDNode *N,
+                                     const NVPTXSubtarget &Subtarget) {
+  const Value *Src = N->getSrcValue();
+  if (!Src)
+    return NVPTX::PTXLdStInstCode::LOCAL;
+
+  if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) {
+    switch (PT->getAddressSpace()) {
+    case llvm::ADDRESS_SPACE_LOCAL:
+      return NVPTX::PTXLdStInstCode::LOCAL;
+    case llvm::ADDRESS_SPACE_GLOBAL:
+      return NVPTX::PTXLdStInstCode::GLOBAL;
+    case llvm::ADDRESS_SPACE_SHARED:
+      return NVPTX::PTXLdStInstCode::SHARED;
+    case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
+      return NVPTX::PTXLdStInstCode::CONSTANT;
+    case llvm::ADDRESS_SPACE_GENERIC:
+      return NVPTX::PTXLdStInstCode::GENERIC;
+    case llvm::ADDRESS_SPACE_PARAM:
+      return NVPTX::PTXLdStInstCode::PARAM;
+    case llvm::ADDRESS_SPACE_CONST:
+      // If the arch supports generic address space, translate it to GLOBAL
+      // for correctness.
+      // If the arch does not support generic address space, then the arch
+      // does not really support ADDRESS_SPACE_CONST, translate it to
+      // to CONSTANT for better performance.
+      if (Subtarget.hasGenericLdSt())
+        return NVPTX::PTXLdStInstCode::GLOBAL;
+      else
+        return NVPTX::PTXLdStInstCode::CONSTANT;
+    default:
+      break;
+    }
+  }
+  return NVPTX::PTXLdStInstCode::LOCAL;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  EVT LoadedVT = LD->getMemoryVT();
+  SDNode *NVPTXLD = NULL;
+
+  // do not support pre/post inc/dec
+  if (LD->isIndexed())
+    return NULL;
+
+  if (!LoadedVT.isSimple())
+    return NULL;
+
+  // Address Space Setting
+  unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget);
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool isVolatile = LD->isVolatile();
+  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    isVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = LoadedVT.getSimpleVT();
+  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+  if (SimpleVT.isVector()) {
+    unsigned num = SimpleVT.getVectorNumElements();
+    if (num == 2)
+      vecType = NVPTX::PTXLdStInstCode::V2;
+    else if (num == 4)
+      vecType = NVPTX::PTXLdStInstCode::V4;
+    else
+      return NULL;
+  }
+
+  // Type Setting: fromType + fromTypeWidth
+  //
+  // Sign   : ISD::SEXTLOAD
+  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //          type is integer
+  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+  MVT ScalarVT = SimpleVT.getScalarType();
+  unsigned fromTypeWidth = ScalarVT.getSizeInBits();
+  unsigned int fromType;
+  if ((LD->getExtensionType() == ISD::SEXTLOAD))
+    fromType = NVPTX::PTXLdStInstCode::Signed;
+  else if (ScalarVT.isFloatingPoint())
+    fromType = NVPTX::PTXLdStInstCode::Float;
+  else
+    fromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  // Create the machine instruction DAG
+  SDValue Chain = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue Addr;
+  SDValue Offset, Base;
+  unsigned Opcode;
+  MVT::SimpleValueType TargetVT = LD->getValueType(0).getSimpleVT().SimpleTy;
+
+  if (SelectDirectAddr(N1, Addr)) {
+    switch (TargetVT) {
+    case MVT::i8:
+      Opcode = NVPTX::LD_i8_avar;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::LD_i16_avar;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::LD_i32_avar;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::LD_i64_avar;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::LD_f32_avar;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::LD_f64_avar;
+      break;
+    default:
+      return NULL;
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(fromType),
+                      getI32Imm(fromTypeWidth), Addr, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops, 7);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
+                 : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
+    switch (TargetVT) {
+    case MVT::i8:
+      Opcode = NVPTX::LD_i8_asi;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::LD_i16_asi;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::LD_i32_asi;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::LD_i64_asi;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::LD_f32_asi;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::LD_f64_asi;
+      break;
+    default:
+      return NULL;
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(fromType),
+                      getI32Imm(fromTypeWidth), Base, Offset, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops, 8);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
+                 : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_ari_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_ari_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_ari_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_ari_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_ari_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_ari_64;
+        break;
+      default:
+        return NULL;
+      }
+    } else {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_ari;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_ari;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_ari;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_ari;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_ari;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_ari;
+        break;
+      default:
+        return NULL;
+      }
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(fromType),
+                      getI32Imm(fromTypeWidth), Base, Offset, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops, 8);
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_areg_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_areg_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_areg_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_areg_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_areg_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_areg_64;
+        break;
+      default:
+        return NULL;
+      }
+    } else {
+      switch (TargetVT) {
+      case MVT::i8:
+        Opcode = NVPTX::LD_i8_areg;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LD_i16_areg;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LD_i32_areg;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LD_i64_areg;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LD_f32_areg;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LD_f64_areg;
+        break;
+      default:
+        return NULL;
+      }
+    }
+    SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(fromType),
+                      getI32Imm(fromTypeWidth), N1, Chain };
+    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops, 7);
+  }
+
+  if (NVPTXLD != NULL) {
+    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+    MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+    cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+  }
+
+  return NVPTXLD;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Addr, Offset, Base;
+  unsigned Opcode;
+  DebugLoc DL = N->getDebugLoc();
+  SDNode *LD;
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  EVT LoadedVT = MemSD->getMemoryVT();
+
+  if (!LoadedVT.isSimple())
+    return NULL;
+
+  // Address Space Setting
+  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget);
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool IsVolatile = MemSD->isVolatile();
+  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    IsVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = LoadedVT.getSimpleVT();
+
+  // Type Setting: fromType + fromTypeWidth
+  //
+  // Sign   : ISD::SEXTLOAD
+  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
+  //          type is integer
+  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+  MVT ScalarVT = SimpleVT.getScalarType();
+  unsigned FromTypeWidth = ScalarVT.getSizeInBits();
+  unsigned int FromType;
+  // The last operand holds the original LoadSDNode::getExtensionType() value
+  unsigned ExtensionType = cast<ConstantSDNode>(
+      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
+  if (ExtensionType == ISD::SEXTLOAD)
+    FromType = NVPTX::PTXLdStInstCode::Signed;
+  else if (ScalarVT.isFloatingPoint())
+    FromType = NVPTX::PTXLdStInstCode::Float;
+  else
+    FromType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  unsigned VecType;
+
+  switch (N->getOpcode()) {
+  case NVPTXISD::LoadV2:
+    VecType = NVPTX::PTXLdStInstCode::V2;
+    break;
+  case NVPTXISD::LoadV4:
+    VecType = NVPTX::PTXLdStInstCode::V4;
+    break;
+  default:
+    return NULL;
+  }
+
+  EVT EltVT = N->getValueType(0);
+
+  if (SelectDirectAddr(Op1, Addr)) {
+    switch (N->getOpcode()) {
+    default:
+      return NULL;
+    case NVPTXISD::LoadV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v2_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v2_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v2_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LDV_i64_v2_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v2_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LDV_f64_v2_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::LoadV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v4_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v4_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v4_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v4_avar;
+        break;
+      }
+      break;
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+                      getI32Imm(VecType), getI32Imm(FromType),
+                      getI32Imm(FromTypeWidth), Addr, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 7);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
+                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
+    switch (N->getOpcode()) {
+    default:
+      return NULL;
+    case NVPTXISD::LoadV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v2_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v2_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v2_asi;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::LDV_i64_v2_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v2_asi;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::LDV_f64_v2_asi;
+        break;
+      }
+      break;
+    case NVPTXISD::LoadV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::LDV_i8_v4_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::LDV_i16_v4_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::LDV_i32_v4_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::LDV_f32_v4_asi;
+        break;
+      }
+      break;
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+                      getI32Imm(VecType), getI32Imm(FromType),
+                      getI32Imm(FromTypeWidth), Base, Offset, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 8);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return NULL;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_ari_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_ari_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_ari_64;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_ari_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return NULL;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_ari;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_ari;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_ari;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_ari;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+                      getI32Imm(VecType), getI32Imm(FromType),
+                      getI32Imm(FromTypeWidth), Base, Offset, Chain };
+
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 8);
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return NULL;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_areg_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_areg_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_areg_64;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_areg_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return NULL;
+      case NVPTXISD::LoadV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v2_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v2_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v2_areg;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::LDV_i64_v2_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v2_areg;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::LDV_f64_v2_areg;
+          break;
+        }
+        break;
+      case NVPTXISD::LoadV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::LDV_i8_v4_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::LDV_i16_v4_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::LDV_i32_v4_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::LDV_f32_v4_areg;
+          break;
+        }
+        break;
+      }
+    }
+
+    SDValue Ops[] = { getI32Imm(IsVolatile), getI32Imm(CodeAddrSpace),
+                      getI32Imm(VecType), getI32Imm(FromType),
+                      getI32Imm(FromTypeWidth), Op1, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops, 7);
+  }
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return LD;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  unsigned Opcode;
+  DebugLoc DL = N->getDebugLoc();
+  SDNode *LD;
+
+  EVT RetVT = N->getValueType(0);
+
+  // Select opcode
+  if (Subtarget.is64Bit()) {
+    switch (N->getOpcode()) {
+    default:
+      return NULL;
+    case NVPTXISD::LDGV2:
+      switch (RetVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_64;
+        break;
+      }
+      break;
+    case NVPTXISD::LDGV4:
+      switch (RetVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_64;
+        break;
+      }
+      break;
+    case NVPTXISD::LDUV2:
+      switch (RetVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_64;
+        break;
+      }
+      break;
+    case NVPTXISD::LDUV4:
+      switch (RetVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_64;
+        break;
+      }
+      break;
+    }
+  } else {
+    switch (N->getOpcode()) {
+    default:
+      return NULL;
+    case NVPTXISD::LDGV2:
+      switch (RetVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_32;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_32;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_32;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_32;
+        break;
+      }
+      break;
+    case NVPTXISD::LDGV4:
+      switch (RetVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_32;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_32;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_32;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_32;
+        break;
+      }
+      break;
+    case NVPTXISD::LDUV2:
+      switch (RetVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_32;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_32;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_32;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_32;
+        break;
+      }
+      break;
+    case NVPTXISD::LDUV4:
+      switch (RetVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_32;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_32;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_32;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_32;
+        break;
+      }
+      break;
+    }
+  }
+
+  SDValue Ops[] = { Op1, Chain };
+  LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), &Ops[0], 2);
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(LD)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return LD;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+  EVT StoreVT = ST->getMemoryVT();
+  SDNode *NVPTXST = NULL;
+
+  // do not support pre/post inc/dec
+  if (ST->isIndexed())
+    return NULL;
+
+  if (!StoreVT.isSimple())
+    return NULL;
+
+  // Address Space Setting
+  unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget);
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool isVolatile = ST->isVolatile();
+  if (codeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    isVolatile = false;
+
+  // Vector Setting
+  MVT SimpleVT = StoreVT.getSimpleVT();
+  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+  if (SimpleVT.isVector()) {
+    unsigned num = SimpleVT.getVectorNumElements();
+    if (num == 2)
+      vecType = NVPTX::PTXLdStInstCode::V2;
+    else if (num == 4)
+      vecType = NVPTX::PTXLdStInstCode::V4;
+    else
+      return NULL;
+  }
+
+  // Type Setting: toType + toTypeWidth
+  // - for integer type, always use 'u'
+  //
+  MVT ScalarVT = SimpleVT.getScalarType();
+  unsigned toTypeWidth = ScalarVT.getSizeInBits();
+  unsigned int toType;
+  if (ScalarVT.isFloatingPoint())
+    toType = NVPTX::PTXLdStInstCode::Float;
+  else
+    toType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  // Create the machine instruction DAG
+  SDValue Chain = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  SDValue Addr;
+  SDValue Offset, Base;
+  unsigned Opcode;
+  MVT::SimpleValueType SourceVT =
+      N1.getNode()->getValueType(0).getSimpleVT().SimpleTy;
+
+  if (SelectDirectAddr(N2, Addr)) {
+    switch (SourceVT) {
+    case MVT::i8:
+      Opcode = NVPTX::ST_i8_avar;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::ST_i16_avar;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::ST_i32_avar;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::ST_i64_avar;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::ST_f32_avar;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::ST_f64_avar;
+      break;
+    default:
+      return NULL;
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(toType),
+                      getI32Imm(toTypeWidth), Addr, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops, 8);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+                 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+    switch (SourceVT) {
+    case MVT::i8:
+      Opcode = NVPTX::ST_i8_asi;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::ST_i16_asi;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::ST_i32_asi;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::ST_i64_asi;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::ST_f32_asi;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::ST_f64_asi;
+      break;
+    default:
+      return NULL;
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(toType),
+                      getI32Imm(toTypeWidth), Base, Offset, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops, 9);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+                 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_ari_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_ari_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_ari_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_ari_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_ari_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_ari_64;
+        break;
+      default:
+        return NULL;
+      }
+    } else {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_ari;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_ari;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_ari;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_ari;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_ari;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_ari;
+        break;
+      default:
+        return NULL;
+      }
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(toType),
+                      getI32Imm(toTypeWidth), Base, Offset, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops, 9);
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_areg_64;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_areg_64;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_areg_64;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_areg_64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_areg_64;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_areg_64;
+        break;
+      default:
+        return NULL;
+      }
+    } else {
+      switch (SourceVT) {
+      case MVT::i8:
+        Opcode = NVPTX::ST_i8_areg;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::ST_i16_areg;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::ST_i32_areg;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::ST_i64_areg;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::ST_f32_areg;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::ST_f64_areg;
+        break;
+      default:
+        return NULL;
+      }
+    }
+    SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
+                      getI32Imm(vecType), getI32Imm(toType),
+                      getI32Imm(toTypeWidth), N2, Chain };
+    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops, 8);
+  }
+
+  if (NVPTXST != NULL) {
+    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+    MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+    cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+  }
+
+  return NVPTXST;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Addr, Offset, Base;
+  unsigned Opcode;
+  DebugLoc DL = N->getDebugLoc();
+  SDNode *ST;
+  EVT EltVT = Op1.getValueType();
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  EVT StoreVT = MemSD->getMemoryVT();
+
+  // Address Space Setting
+  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget);
+
+  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
+    report_fatal_error("Cannot store to pointer that points to constant "
+                       "memory space");
+  }
+
+  // Volatile Setting
+  // - .volatile is only availalble for .global and .shared
+  bool IsVolatile = MemSD->isVolatile();
+  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+    IsVolatile = false;
+
+  // Type Setting: toType + toTypeWidth
+  // - for integer type, always use 'u'
+  assert(StoreVT.isSimple() && "Store value is not simple");
+  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
+  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
+  unsigned ToType;
+  if (ScalarVT.isFloatingPoint())
+    ToType = NVPTX::PTXLdStInstCode::Float;
+  else
+    ToType = NVPTX::PTXLdStInstCode::Unsigned;
+
+  SmallVector<SDValue, 12> StOps;
+  SDValue N2;
+  unsigned VecType;
+
+  switch (N->getOpcode()) {
+  case NVPTXISD::StoreV2:
+    VecType = NVPTX::PTXLdStInstCode::V2;
+    StOps.push_back(N->getOperand(1));
+    StOps.push_back(N->getOperand(2));
+    N2 = N->getOperand(3);
+    break;
+  case NVPTXISD::StoreV4:
+    VecType = NVPTX::PTXLdStInstCode::V4;
+    StOps.push_back(N->getOperand(1));
+    StOps.push_back(N->getOperand(2));
+    StOps.push_back(N->getOperand(3));
+    StOps.push_back(N->getOperand(4));
+    N2 = N->getOperand(5);
+    break;
+  default:
+    return NULL;
+  }
+
+  StOps.push_back(getI32Imm(IsVolatile));
+  StOps.push_back(getI32Imm(CodeAddrSpace));
+  StOps.push_back(getI32Imm(VecType));
+  StOps.push_back(getI32Imm(ToType));
+  StOps.push_back(getI32Imm(ToTypeWidth));
+
+  if (SelectDirectAddr(N2, Addr)) {
+    switch (N->getOpcode()) {
+    default:
+      return NULL;
+    case NVPTXISD::StoreV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v2_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v2_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v2_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::STV_i64_v2_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v2_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::STV_f64_v2_avar;
+        break;
+      }
+      break;
+    case NVPTXISD::StoreV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v4_avar;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v4_avar;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v4_avar;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v4_avar;
+        break;
+      }
+      break;
+    }
+    StOps.push_back(Addr);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
+                 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
+    switch (N->getOpcode()) {
+    default:
+      return NULL;
+    case NVPTXISD::StoreV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v2_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v2_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v2_asi;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::STV_i64_v2_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v2_asi;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::STV_f64_v2_asi;
+        break;
+      }
+      break;
+    case NVPTXISD::StoreV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i8:
+        Opcode = NVPTX::STV_i8_v4_asi;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::STV_i16_v4_asi;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::STV_i32_v4_asi;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::STV_f32_v4_asi;
+        break;
+      }
+      break;
+    }
+    StOps.push_back(Base);
+    StOps.push_back(Offset);
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
+                 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return NULL;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_ari_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_ari_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_ari_64;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_ari_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_ari_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_ari_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_ari_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return NULL;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_ari;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_ari;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_ari;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_ari;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_ari;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_ari;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_ari;
+          break;
+        }
+        break;
+      }
+    }
+    StOps.push_back(Base);
+    StOps.push_back(Offset);
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
+      default:
+        return NULL;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_areg_64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_areg_64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_areg_64;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_areg_64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_areg_64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_areg_64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_areg_64;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch (N->getOpcode()) {
+      default:
+        return NULL;
+      case NVPTXISD::StoreV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v2_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v2_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v2_areg;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::STV_i64_v2_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v2_areg;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::STV_f64_v2_areg;
+          break;
+        }
+        break;
+      case NVPTXISD::StoreV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::STV_i8_v4_areg;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::STV_i16_v4_areg;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::STV_i32_v4_areg;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::STV_f32_v4_areg;
+          break;
+        }
+        break;
+      }
+    }
+    StOps.push_back(N2);
+  }
+
+  StOps.push_back(Chain);
+
+  ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, &StOps[0], StOps.size());
+
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(ST)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return ST;
+}
+
+// SelectDirectAddr - Match a direct address for DAG.
+// A direct address could be a globaladdress or externalsymbol.
+bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
+  // Return true if TGA or ES.
+  if (N.getOpcode() == ISD::TargetGlobalAddress ||
+      N.getOpcode() == ISD::TargetExternalSymbol) {
+    Address = N;
+    return true;
+  }
+  if (N.getOpcode() == NVPTXISD::Wrapper) {
+    Address = N.getOperand(0);
+    return true;
+  }
+  if (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+    unsigned IID = cast<ConstantSDNode>(N.getOperand(0))->getZExtValue();
+    if (IID == Intrinsic::nvvm_ptr_gen_to_param)
+      if (N.getOperand(1).getOpcode() == NVPTXISD::MoveParam)
+        return (SelectDirectAddr(N.getOperand(1).getOperand(0), Address));
+  }
+  return false;
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
+    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      SDValue base = Addr.getOperand(0);
+      if (SelectDirectAddr(base, Base)) {
+        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// symbol+offset
+bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
+                                       SDValue &Base, SDValue &Offset) {
+  return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri_imp(
+    SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+    Offset = CurDAG->getTargetConstant(0, mvt);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false; // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
+      return false;
+    }
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      if (FrameIndexSDNode *FIN =
+              dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+        // Constant offset from frame ref.
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
+      else
+        Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), mvt);
+      return true;
+    }
+  }
+  return false;
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
+}
+
+// register+offset
+bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
+                                       SDValue &Base, SDValue &Offset) {
+  return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
+}
+
+bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
+                                                 unsigned int spN) const {
+  const Value *Src = NULL;
+  // Even though MemIntrinsicSDNode is a subclas of MemSDNode,
+  // the classof() for MemSDNode does not include MemIntrinsicSDNode
+  // (See SelectionDAGNodes.h). So we need to check for both.
+  if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
+    Src = mN->getSrcValue();
+  } else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) {
+    Src = mN->getSrcValue();
+  }
+  if (!Src)
+    return false;
+  if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+    return (PT->getAddressSpace() == spN);
+  return false;
+}
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.
+bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default:
+    return true;
+  case 'm': // memory
+    if (SelectDirectAddr(Op, Op0)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(CurDAG->getTargetConstant(0, MVT::i32));
+      return false;
+    }
+    if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
+      OutOps.push_back(Op0);
+      OutOps.push_back(Op1);
+      return false;
+    }
+    break;
+  }
+  return true;
+}
+
+// Return true if N is a undef or a constant.
+// If N was undef, return a (i8imm 0) in Retval
+// If N was imm, convert it to i8imm and return in Retval
+// Note: The convert to i8imm is required, otherwise the
+// pattern matcher inserts a bunch of IMOVi8rr to convert
+// the imm to i8imm, and this causes instruction selection
+// to fail.
+bool NVPTXDAGToDAGISel::UndefOrImm(SDValue Op, SDValue N, SDValue &Retval) {
+  if (!(N.getOpcode() == ISD::UNDEF) && !(N.getOpcode() == ISD::Constant))
+    return false;
+
+  if (N.getOpcode() == ISD::UNDEF)
+    Retval = CurDAG->getTargetConstant(0, MVT::i8);
+  else {
+    ConstantSDNode *cn = cast<ConstantSDNode>(N.getNode());
+    unsigned retval = cn->getZExtValue();
+    Retval = CurDAG->getTargetConstant(retval, MVT::i8);
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
new file mode 100644
index 000000000000..70e8e464297d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -0,0 +1,106 @@
+//===-- NVPTXISelDAGToDAG.h - A dag to dag inst selector for NVPTX --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-isel"
+
+#include "NVPTX.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+namespace {
+
+class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
+
+  // If true, generate corresponding FPCONTRACT. This is
+  // language dependent (i.e. CUDA and OpenCL works differently).
+  bool doFMADF32;
+  bool doFMAF64;
+  bool doFMAF32;
+  bool doFMAF64AGG;
+  bool doFMAF32AGG;
+  bool allowFMA;
+
+  // 0: use div.approx
+  // 1: use div.full
+  // 2: For sm_20 and later, ieee-compliant div.rnd.f32 can be generated;
+  //    Otherwise, use div.full
+  int do_DIVF32_PREC;
+
+  // If true, add .ftz to f32 instructions.
+  // This is only meaningful for sm_20 and later, as the default
+  // is not ftz.
+  // For sm earlier than sm_20, f32 denorms are always ftz by the
+  // hardware.
+  // We always add the .ftz modifier regardless of the sm value
+  // when Use32FTZ is true.
+  bool UseF32FTZ;
+
+  // If true, generate mul.wide from sext and mul
+  bool doMulWide;
+
+public:
+  explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
+                             CodeGenOpt::Level OptLevel);
+
+  // Pass Name
+  virtual const char *getPassName() const {
+    return "NVPTX DAG->DAG Pattern Instruction Selection";
+  }
+
+  const NVPTXSubtarget &Subtarget;
+
+  virtual bool SelectInlineAsmMemoryOperand(
+      const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps);
+private:
+// Include the pieces autogenerated from the target description.
+#include "NVPTXGenDAGISel.inc"
+
+  SDNode *Select(SDNode *N);
+  SDNode *SelectLoad(SDNode *N);
+  SDNode *SelectLoadVector(SDNode *N);
+  SDNode *SelectLDGLDUVector(SDNode *N);
+  SDNode *SelectStore(SDNode *N);
+  SDNode *SelectStoreVector(SDNode *N);
+
+  inline SDValue getI32Imm(unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  }
+
+  // Match direct address complex pattern.
+  bool SelectDirectAddr(SDValue N, SDValue &Address);
+
+  bool SelectADDRri_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                        SDValue &Offset, MVT mvt);
+  bool SelectADDRri(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                    SDValue &Offset);
+  bool SelectADDRri64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                      SDValue &Offset);
+
+  bool SelectADDRsi_imp(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                        SDValue &Offset, MVT mvt);
+  bool SelectADDRsi(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                    SDValue &Offset);
+  bool SelectADDRsi64(SDNode *OpNode, SDValue Addr, SDValue &Base,
+                      SDValue &Offset);
+
+  bool ChkMemSDNodeAddressSpace(SDNode *N, unsigned int spN) const;
+
+  bool UndefOrImm(SDValue Op, SDValue N, SDValue &Retval);
+
+};
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
new file mode 100644
index 000000000000..6e01a5a82071
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -0,0 +1,1693 @@
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelLowering.h"
+#include "NVPTX.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXUtilities.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "nvptx-lower"
+
+using namespace llvm;
+
+static unsigned int uniqueCallSite = 0;
+
+static cl::opt<bool> sched4reg(
+    "nvptx-sched4reg",
+    cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
+
+static bool IsPTXVectorType(MVT VT) {
+  switch (VT.SimpleTy) {
+  default:
+    return false;
+  case MVT::v2i8:
+  case MVT::v4i8:
+  case MVT::v2i16:
+  case MVT::v4i16:
+  case MVT::v2i32:
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v2f32:
+  case MVT::v4f32:
+  case MVT::v2f64:
+    return true;
+  }
+}
+
+// NVPTXTargetLowering Constructor.
+NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
+    : TargetLowering(TM, new NVPTXTargetObjectFile()), nvTM(&TM),
+      nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
+
+  // always lower memset, memcpy, and memmove intrinsics to load/store
+  // instructions, rather
+  // then generating calls to memset, mempcy or memmove.
+  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
+  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
+  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+
+  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+  // condition branches.
+  setJumpIsExpensive(true);
+
+  // By default, use the Source scheduling
+  if (sched4reg)
+    setSchedulingPreference(Sched::RegPressure);
+  else
+    setSchedulingPreference(Sched::Source);
+
+  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+  addRegisterClass(MVT::i8, &NVPTX::Int8RegsRegClass);
+  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+
+  // Operations not directly supported by NVPTX.
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  if (nvptxSubtarget.hasROT64()) {
+    setOperationAction(ISD::ROTL, MVT::i64, Legal);
+    setOperationAction(ISD::ROTR, MVT::i64, Legal);
+  } else {
+    setOperationAction(ISD::ROTL, MVT::i64, Expand);
+    setOperationAction(ISD::ROTR, MVT::i64, Expand);
+  }
+  if (nvptxSubtarget.hasROT32()) {
+    setOperationAction(ISD::ROTL, MVT::i32, Legal);
+    setOperationAction(ISD::ROTR, MVT::i32, Legal);
+  } else {
+    setOperationAction(ISD::ROTL, MVT::i32, Expand);
+    setOperationAction(ISD::ROTR, MVT::i32, Expand);
+  }
+
+  setOperationAction(ISD::ROTL, MVT::i16, Expand);
+  setOperationAction(ISD::ROTR, MVT::i16, Expand);
+  setOperationAction(ISD::ROTL, MVT::i8, Expand);
+  setOperationAction(ISD::ROTR, MVT::i8, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+  // Indirect branch is not supported.
+  // This also disables Jump Table creation.
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+  // We want to legalize constant related memmove and memcopy
+  // intrinsics.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+  // Turn FP extload into load/fextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  // Turn FP truncstore into trunc + store.
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PTX does not support load / store predicate registers
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
+  setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+  setTruncStoreAction(MVT::i32, MVT::i1, Expand);
+  setTruncStoreAction(MVT::i16, MVT::i1, Expand);
+  setTruncStoreAction(MVT::i8, MVT::i1, Expand);
+
+  // This is legal in NVPTX
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+
+  // TRAP can be lowered to PTX trap
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // Register custom handling for vector loads/stores
+  for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE;
+       ++i) {
+    MVT VT = (MVT::SimpleValueType) i;
+    if (IsPTXVectorType(VT)) {
+      setOperationAction(ISD::LOAD, VT, Custom);
+      setOperationAction(ISD::STORE, VT, Custom);
+      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+    }
+  }
+
+  // Now deduce the information based on the above mentioned
+  // actions
+  computeRegisterProperties();
+}
+
+const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default:
+    return 0;
+  case NVPTXISD::CALL:
+    return "NVPTXISD::CALL";
+  case NVPTXISD::RET_FLAG:
+    return "NVPTXISD::RET_FLAG";
+  case NVPTXISD::Wrapper:
+    return "NVPTXISD::Wrapper";
+  case NVPTXISD::NVBuiltin:
+    return "NVPTXISD::NVBuiltin";
+  case NVPTXISD::DeclareParam:
+    return "NVPTXISD::DeclareParam";
+  case NVPTXISD::DeclareScalarParam:
+    return "NVPTXISD::DeclareScalarParam";
+  case NVPTXISD::DeclareRet:
+    return "NVPTXISD::DeclareRet";
+  case NVPTXISD::DeclareRetParam:
+    return "NVPTXISD::DeclareRetParam";
+  case NVPTXISD::PrintCall:
+    return "NVPTXISD::PrintCall";
+  case NVPTXISD::LoadParam:
+    return "NVPTXISD::LoadParam";
+  case NVPTXISD::StoreParam:
+    return "NVPTXISD::StoreParam";
+  case NVPTXISD::StoreParamS32:
+    return "NVPTXISD::StoreParamS32";
+  case NVPTXISD::StoreParamU32:
+    return "NVPTXISD::StoreParamU32";
+  case NVPTXISD::MoveToParam:
+    return "NVPTXISD::MoveToParam";
+  case NVPTXISD::CallArgBegin:
+    return "NVPTXISD::CallArgBegin";
+  case NVPTXISD::CallArg:
+    return "NVPTXISD::CallArg";
+  case NVPTXISD::LastCallArg:
+    return "NVPTXISD::LastCallArg";
+  case NVPTXISD::CallArgEnd:
+    return "NVPTXISD::CallArgEnd";
+  case NVPTXISD::CallVoid:
+    return "NVPTXISD::CallVoid";
+  case NVPTXISD::CallVal:
+    return "NVPTXISD::CallVal";
+  case NVPTXISD::CallSymbol:
+    return "NVPTXISD::CallSymbol";
+  case NVPTXISD::Prototype:
+    return "NVPTXISD::Prototype";
+  case NVPTXISD::MoveParam:
+    return "NVPTXISD::MoveParam";
+  case NVPTXISD::MoveRetval:
+    return "NVPTXISD::MoveRetval";
+  case NVPTXISD::MoveToRetval:
+    return "NVPTXISD::MoveToRetval";
+  case NVPTXISD::StoreRetval:
+    return "NVPTXISD::StoreRetval";
+  case NVPTXISD::PseudoUseParam:
+    return "NVPTXISD::PseudoUseParam";
+  case NVPTXISD::RETURN:
+    return "NVPTXISD::RETURN";
+  case NVPTXISD::CallSeqBegin:
+    return "NVPTXISD::CallSeqBegin";
+  case NVPTXISD::CallSeqEnd:
+    return "NVPTXISD::CallSeqEnd";
+  case NVPTXISD::LoadV2:
+    return "NVPTXISD::LoadV2";
+  case NVPTXISD::LoadV4:
+    return "NVPTXISD::LoadV4";
+  case NVPTXISD::LDGV2:
+    return "NVPTXISD::LDGV2";
+  case NVPTXISD::LDGV4:
+    return "NVPTXISD::LDGV4";
+  case NVPTXISD::LDUV2:
+    return "NVPTXISD::LDUV2";
+  case NVPTXISD::LDUV4:
+    return "NVPTXISD::LDUV4";
+  case NVPTXISD::StoreV2:
+    return "NVPTXISD::StoreV2";
+  case NVPTXISD::StoreV4:
+    return "NVPTXISD::StoreV4";
+  }
+}
+
+bool NVPTXTargetLowering::shouldSplitVectorElementType(EVT VT) const {
+  return VT == MVT::i1;
+}
+
+SDValue
+NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
+  return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op);
+}
+
+std::string NVPTXTargetLowering::getPrototype(
+    Type *retTy, const ArgListTy &Args,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment) const {
+
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+  std::stringstream O;
+  O << "prototype_" << uniqueCallSite << " : .callprototype ";
+
+  if (retTy->getTypeID() == Type::VoidTyID)
+    O << "()";
+  else {
+    O << "(";
+    if (isABI) {
+      if (retTy->isPrimitiveType() || retTy->isIntegerTy()) {
+        unsigned size = 0;
+        if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
+          size = ITy->getBitWidth();
+          if (size < 32)
+            size = 32;
+        } else {
+          assert(retTy->isFloatingPointTy() &&
+                 "Floating point type expected here");
+          size = retTy->getPrimitiveSizeInBits();
+        }
+
+        O << ".param .b" << size << " _";
+      } else if (isa<PointerType>(retTy))
+        O << ".param .b" << getPointerTy().getSizeInBits() << " _";
+      else {
+        if ((retTy->getTypeID() == Type::StructTyID) ||
+            isa<VectorType>(retTy)) {
+          SmallVector<EVT, 16> vtparts;
+          ComputeValueVTs(*this, retTy, vtparts);
+          unsigned totalsz = 0;
+          for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+            unsigned elems = 1;
+            EVT elemtype = vtparts[i];
+            if (vtparts[i].isVector()) {
+              elems = vtparts[i].getVectorNumElements();
+              elemtype = vtparts[i].getVectorElementType();
+            }
+            for (unsigned j = 0, je = elems; j != je; ++j) {
+              unsigned sz = elemtype.getSizeInBits();
+              if (elemtype.isInteger() && (sz < 8))
+                sz = 8;
+              totalsz += sz / 8;
+            }
+          }
+          O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]";
+        } else {
+          assert(false && "Unknown return type");
+        }
+      }
+    } else {
+      SmallVector<EVT, 16> vtparts;
+      ComputeValueVTs(*this, retTy, vtparts);
+      unsigned idx = 0;
+      for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+        unsigned elems = 1;
+        EVT elemtype = vtparts[i];
+        if (vtparts[i].isVector()) {
+          elems = vtparts[i].getVectorNumElements();
+          elemtype = vtparts[i].getVectorElementType();
+        }
+
+        for (unsigned j = 0, je = elems; j != je; ++j) {
+          unsigned sz = elemtype.getSizeInBits();
+          if (elemtype.isInteger() && (sz < 32))
+            sz = 32;
+          O << ".reg .b" << sz << " _";
+          if (j < je - 1)
+            O << ", ";
+          ++idx;
+        }
+        if (i < e - 1)
+          O << ", ";
+      }
+    }
+    O << ") ";
+  }
+  O << "_ (";
+
+  bool first = true;
+  MVT thePointerTy = getPointerTy();
+
+  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+    const Type *Ty = Args[i].Ty;
+    if (!first) {
+      O << ", ";
+    }
+    first = false;
+
+    if (Outs[i].Flags.isByVal() == false) {
+      unsigned sz = 0;
+      if (isa<IntegerType>(Ty)) {
+        sz = cast<IntegerType>(Ty)->getBitWidth();
+        if (sz < 32)
+          sz = 32;
+      } else if (isa<PointerType>(Ty))
+        sz = thePointerTy.getSizeInBits();
+      else
+        sz = Ty->getPrimitiveSizeInBits();
+      if (isABI)
+        O << ".param .b" << sz << " ";
+      else
+        O << ".reg .b" << sz << " ";
+      O << "_";
+      continue;
+    }
+    const PointerType *PTy = dyn_cast<PointerType>(Ty);
+    assert(PTy && "Param with byval attribute should be a pointer type");
+    Type *ETy = PTy->getElementType();
+
+    if (isABI) {
+      unsigned align = Outs[i].Flags.getByValAlign();
+      unsigned sz = getDataLayout()->getTypeAllocSize(ETy);
+      O << ".param .align " << align << " .b8 ";
+      O << "_";
+      O << "[" << sz << "]";
+      continue;
+    } else {
+      SmallVector<EVT, 16> vtparts;
+      ComputeValueVTs(*this, ETy, vtparts);
+      for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+        unsigned elems = 1;
+        EVT elemtype = vtparts[i];
+        if (vtparts[i].isVector()) {
+          elems = vtparts[i].getVectorNumElements();
+          elemtype = vtparts[i].getVectorElementType();
+        }
+
+        for (unsigned j = 0, je = elems; j != je; ++j) {
+          unsigned sz = elemtype.getSizeInBits();
+          if (elemtype.isInteger() && (sz < 32))
+            sz = 32;
+          O << ".reg .b" << sz << " ";
+          O << "_";
+          if (j < je - 1)
+            O << ", ";
+        }
+        if (i < e - 1)
+          O << ", ";
+      }
+      continue;
+    }
+  }
+  O << ");";
+  return O.str();
+}
+
+SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                                       SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  DebugLoc &dl = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &isTailCall = CLI.IsTailCall;
+  ArgListTy &Args = CLI.Args;
+  Type *retTy = CLI.RetTy;
+  ImmutableCallSite *CS = CLI.CS;
+
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+  SDValue tempChain = Chain;
+  Chain =
+      DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true));
+  SDValue InFlag = Chain.getValue(1);
+
+  assert((Outs.size() == Args.size()) &&
+         "Unexpected number of arguments to function call");
+  unsigned paramCount = 0;
+  // Declare the .params or .reg need to pass values
+  // to the function
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    EVT VT = Outs[i].VT;
+
+    if (Outs[i].Flags.isByVal() == false) {
+      // Plain scalar
+      // for ABI,    declare .param .b<size> .param<n>;
+      // for nonABI, declare .reg .b<size> .param<n>;
+      unsigned isReg = 1;
+      if (isABI)
+        isReg = 0;
+      unsigned sz = VT.getSizeInBits();
+      if (VT.isInteger() && (sz < 32))
+        sz = 32;
+      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareParamOps[] = { Chain,
+                                    DAG.getConstant(paramCount, MVT::i32),
+                                    DAG.getConstant(sz, MVT::i32),
+                                    DAG.getConstant(isReg, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+                          DeclareParamOps, 5);
+      InFlag = Chain.getValue(1);
+      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+                                 DAG.getConstant(0, MVT::i32), OutVals[i],
+                                 InFlag };
+
+      unsigned opcode = NVPTXISD::StoreParam;
+      if (isReg)
+        opcode = NVPTXISD::MoveToParam;
+      else {
+        if (Outs[i].Flags.isZExt())
+          opcode = NVPTXISD::StoreParamU32;
+        else if (Outs[i].Flags.isSExt())
+          opcode = NVPTXISD::StoreParamS32;
+      }
+      Chain = DAG.getNode(opcode, dl, CopyParamVTs, CopyParamOps, 5);
+
+      InFlag = Chain.getValue(1);
+      ++paramCount;
+      continue;
+    }
+    // struct or vector
+    SmallVector<EVT, 16> vtparts;
+    const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty);
+    assert(PTy && "Type of a byval parameter should be pointer");
+    ComputeValueVTs(*this, PTy->getElementType(), vtparts);
+
+    if (isABI) {
+      // declare .param .align 16 .b8 .param<n>[<size>];
+      unsigned sz = Outs[i].Flags.getByValSize();
+      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      // The ByValAlign in the Outs[i].Flags is alway set at this point, so we
+      // don't need to
+      // worry about natural alignment or not. See TargetLowering::LowerCallTo()
+      SDValue DeclareParamOps[] = {
+        Chain, DAG.getConstant(Outs[i].Flags.getByValAlign(), MVT::i32),
+        DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32),
+        InFlag
+      };
+      Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                          DeclareParamOps, 5);
+      InFlag = Chain.getValue(1);
+      unsigned curOffset = 0;
+      for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+        unsigned elems = 1;
+        EVT elemtype = vtparts[j];
+        if (vtparts[j].isVector()) {
+          elems = vtparts[j].getVectorNumElements();
+          elemtype = vtparts[j].getVectorElementType();
+        }
+        for (unsigned k = 0, ke = elems; k != ke; ++k) {
+          unsigned sz = elemtype.getSizeInBits();
+          if (elemtype.isInteger() && (sz < 8))
+            sz = 8;
+          SDValue srcAddr =
+              DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[i],
+                          DAG.getConstant(curOffset, getPointerTy()));
+          SDValue theVal =
+              DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+                          MachinePointerInfo(), false, false, false, 0);
+          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue CopyParamOps[] = { Chain,
+                                     DAG.getConstant(paramCount, MVT::i32),
+                                     DAG.getConstant(curOffset, MVT::i32),
+                                     theVal, InFlag };
+          Chain = DAG.getNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+                              CopyParamOps, 5);
+          InFlag = Chain.getValue(1);
+          curOffset += sz / 8;
+        }
+      }
+      ++paramCount;
+      continue;
+    }
+    // Non-abi, struct or vector
+    // Declare a bunch or .reg .b<size> .param<n>
+    unsigned curOffset = 0;
+    for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+      unsigned elems = 1;
+      EVT elemtype = vtparts[j];
+      if (vtparts[j].isVector()) {
+        elems = vtparts[j].getVectorNumElements();
+        elemtype = vtparts[j].getVectorElementType();
+      }
+      for (unsigned k = 0, ke = elems; k != ke; ++k) {
+        unsigned sz = elemtype.getSizeInBits();
+        if (elemtype.isInteger() && (sz < 32))
+          sz = 32;
+        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue DeclareParamOps[] = { Chain,
+                                      DAG.getConstant(paramCount, MVT::i32),
+                                      DAG.getConstant(sz, MVT::i32),
+                                      DAG.getConstant(1, MVT::i32), InFlag };
+        Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+                            DeclareParamOps, 5);
+        InFlag = Chain.getValue(1);
+        SDValue srcAddr =
+            DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[i],
+                        DAG.getConstant(curOffset, getPointerTy()));
+        SDValue theVal =
+            DAG.getLoad(elemtype, dl, tempChain, srcAddr, MachinePointerInfo(),
+                        false, false, false, 0);
+        SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
+                                   DAG.getConstant(0, MVT::i32), theVal,
+                                   InFlag };
+        Chain = DAG.getNode(NVPTXISD::MoveToParam, dl, CopyParamVTs,
+                            CopyParamOps, 5);
+        InFlag = Chain.getValue(1);
+        ++paramCount;
+      }
+    }
+  }
+
+  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+  unsigned retAlignment = 0;
+
+  // Handle Result
+  unsigned retCount = 0;
+  if (Ins.size() > 0) {
+    SmallVector<EVT, 16> resvtparts;
+    ComputeValueVTs(*this, retTy, resvtparts);
+
+    // Declare one .param .align 16 .b8 func_retval0[<size>] for ABI or
+    // individual .reg .b<size> func_retval<0..> for non ABI
+    unsigned resultsz = 0;
+    for (unsigned i = 0, e = resvtparts.size(); i != e; ++i) {
+      unsigned elems = 1;
+      EVT elemtype = resvtparts[i];
+      if (resvtparts[i].isVector()) {
+        elems = resvtparts[i].getVectorNumElements();
+        elemtype = resvtparts[i].getVectorElementType();
+      }
+      for (unsigned j = 0, je = elems; j != je; ++j) {
+        unsigned sz = elemtype.getSizeInBits();
+        if (isABI == false) {
+          if (elemtype.isInteger() && (sz < 32))
+            sz = 32;
+        } else {
+          if (elemtype.isInteger() && (sz < 8))
+            sz = 8;
+        }
+        if (isABI == false) {
+          SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue DeclareRetOps[] = { Chain, DAG.getConstant(2, MVT::i32),
+                                      DAG.getConstant(sz, MVT::i32),
+                                      DAG.getConstant(retCount, MVT::i32),
+                                      InFlag };
+          Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+                              DeclareRetOps, 5);
+          InFlag = Chain.getValue(1);
+          ++retCount;
+        }
+        resultsz += sz;
+      }
+    }
+    if (isABI) {
+      if (retTy->isPrimitiveType() || retTy->isIntegerTy() ||
+          retTy->isPointerTy()) {
+        // Scalar needs to be at least 32bit wide
+        if (resultsz < 32)
+          resultsz = 32;
+        SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+                                    DAG.getConstant(resultsz, MVT::i32),
+                                    DAG.getConstant(0, MVT::i32), InFlag };
+        Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+                            DeclareRetOps, 5);
+        InFlag = Chain.getValue(1);
+      } else {
+        if (Func) { // direct call
+          if (!llvm::getAlign(*(CS->getCalledFunction()), 0, retAlignment))
+            retAlignment = getDataLayout()->getABITypeAlignment(retTy);
+        } else { // indirect call
+          const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction());
+          if (!llvm::getAlign(*CallI, 0, retAlignment))
+            retAlignment = getDataLayout()->getABITypeAlignment(retTy);
+        }
+        SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue DeclareRetOps[] = { Chain,
+                                    DAG.getConstant(retAlignment, MVT::i32),
+                                    DAG.getConstant(resultsz / 8, MVT::i32),
+                                    DAG.getConstant(0, MVT::i32), InFlag };
+        Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+                            DeclareRetOps, 5);
+        InFlag = Chain.getValue(1);
+      }
+    }
+  }
+
+  if (!Func) {
+    // This is indirect function call case : PTX requires a prototype of the
+    // form
+    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
+    // to be emitted, and the label has to used as the last arg of call
+    // instruction.
+    // The prototype is embedded in a string and put as the operand for an
+    // INLINEASM SDNode.
+    SDVTList InlineAsmVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    std::string proto_string = getPrototype(retTy, Args, Outs, retAlignment);
+    const char *asmstr = nvTM->getManagedStrPool()
+        ->getManagedString(proto_string.c_str())->c_str();
+    SDValue InlineAsmOps[] = {
+      Chain, DAG.getTargetExternalSymbol(asmstr, getPointerTy()),
+      DAG.getMDNode(0), DAG.getTargetConstant(0, MVT::i32), InFlag
+    };
+    Chain = DAG.getNode(ISD::INLINEASM, dl, InlineAsmVTs, InlineAsmOps, 5);
+    InFlag = Chain.getValue(1);
+  }
+  // Op to just print "call"
+  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue PrintCallOps[] = {
+    Chain,
+    DAG.getConstant(isABI ? ((Ins.size() == 0) ? 0 : 1) : retCount, MVT::i32),
+    InFlag
+  };
+  Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
+                      dl, PrintCallVTs, PrintCallOps, 3);
+  InFlag = Chain.getValue(1);
+
+  // Ops to print out the function name
+  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps, 3);
+  InFlag = Chain.getValue(1);
+
+  // Ops to print out the param list
+  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallArgBeginOps[] = { Chain, InFlag };
+  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
+                      CallArgBeginOps, 2);
+  InFlag = Chain.getValue(1);
+
+  for (unsigned i = 0, e = paramCount; i != e; ++i) {
+    unsigned opcode;
+    if (i == (e - 1))
+      opcode = NVPTXISD::LastCallArg;
+    else
+      opcode = NVPTXISD::CallArg;
+    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+                             DAG.getConstant(i, MVT::i32), InFlag };
+    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps, 4);
+    InFlag = Chain.getValue(1);
+  }
+  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue CallArgEndOps[] = { Chain, DAG.getConstant(Func ? 1 : 0, MVT::i32),
+                              InFlag };
+  Chain =
+      DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps, 3);
+  InFlag = Chain.getValue(1);
+
+  if (!Func) {
+    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, MVT::i32),
+                               InFlag };
+    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps, 3);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Generate loads from param memory/moves from registers for result
+  if (Ins.size() > 0) {
+    if (isABI) {
+      unsigned resoffset = 0;
+      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+        unsigned sz = Ins[i].VT.getSizeInBits();
+        if (Ins[i].VT.isInteger() && (sz < 8))
+          sz = 8;
+        EVT LoadRetVTs[] = { Ins[i].VT, MVT::Other, MVT::Glue };
+        SDValue LoadRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+                                 DAG.getConstant(resoffset, MVT::i32), InFlag };
+        SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, LoadRetVTs,
+                                     LoadRetOps, array_lengthof(LoadRetOps));
+        Chain = retval.getValue(1);
+        InFlag = retval.getValue(2);
+        InVals.push_back(retval);
+        resoffset += sz / 8;
+      }
+    } else {
+      SmallVector<EVT, 16> resvtparts;
+      ComputeValueVTs(*this, retTy, resvtparts);
+
+      assert(Ins.size() == resvtparts.size() &&
+             "Unexpected number of return values in non-ABI case");
+      unsigned paramNum = 0;
+      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+        assert(EVT(Ins[i].VT) == resvtparts[i] &&
+               "Unexpected EVT type in non-ABI case");
+        unsigned numelems = 1;
+        EVT elemtype = Ins[i].VT;
+        if (Ins[i].VT.isVector()) {
+          numelems = Ins[i].VT.getVectorNumElements();
+          elemtype = Ins[i].VT.getVectorElementType();
+        }
+        std::vector<SDValue> tempRetVals;
+        for (unsigned j = 0; j < numelems; ++j) {
+          EVT MoveRetVTs[] = { elemtype, MVT::Other, MVT::Glue };
+          SDValue MoveRetOps[] = { Chain, DAG.getConstant(0, MVT::i32),
+                                   DAG.getConstant(paramNum, MVT::i32),
+                                   InFlag };
+          SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, MoveRetVTs,
+                                       MoveRetOps, array_lengthof(MoveRetOps));
+          Chain = retval.getValue(1);
+          InFlag = retval.getValue(2);
+          tempRetVals.push_back(retval);
+          ++paramNum;
+        }
+        if (Ins[i].VT.isVector())
+          InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, Ins[i].VT,
+                                       &tempRetVals[0], tempRetVals.size()));
+        else
+          InVals.push_back(tempRetVals[0]);
+      }
+    }
+  }
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
+                             DAG.getIntPtrConstant(uniqueCallSite + 1, true),
+                             InFlag);
+  uniqueCallSite++;
+
+  // set isTailCall to false for now, until we figure out how to express
+  // tail call optimization in PTX
+  isTailCall = false;
+  return Chain;
+}
+
+// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
+// (see LegalizeDAG.cpp). This is slow and uses local memory.
+// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
+SDValue
+NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  SmallVector<SDValue, 8> Ops;
+  unsigned NumOperands = Node->getNumOperands();
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    SDValue SubOp = Node->getOperand(i);
+    EVT VVT = SubOp.getNode()->getValueType(0);
+    EVT EltVT = VVT.getVectorElementType();
+    unsigned NumSubElem = VVT.getVectorNumElements();
+    for (unsigned j = 0; j < NumSubElem; ++j) {
+      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
+                                DAG.getIntPtrConstant(j)));
+    }
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), &Ops[0],
+                     Ops.size());
+}
+
+SDValue
+NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  case ISD::RETURNADDR:
+    return SDValue();
+  case ISD::FRAMEADDR:
+    return SDValue();
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:
+    return Op;
+  case ISD::BUILD_VECTOR:
+  case ISD::EXTRACT_SUBVECTOR:
+    return Op;
+  case ISD::CONCAT_VECTORS:
+    return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
+  case ISD::LOAD:
+    return LowerLOAD(Op, DAG);
+  default:
+    llvm_unreachable("Custom lowering not defined for operation");
+  }
+}
+
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType() == MVT::i1)
+    return LowerLOADi1(Op, DAG);
+  else
+    return SDValue();
+}
+
+// v = ld i1* addr
+//   =>
+// v1 = ld i8* addr
+// v = trunc v1 to i1
+SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  LoadSDNode *LD = cast<LoadSDNode>(Node);
+  DebugLoc dl = Node->getDebugLoc();
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
+  assert(Node->getValueType(0) == MVT::i1 &&
+         "Custom lowering for i1 load only");
+  SDValue newLD =
+      DAG.getLoad(MVT::i8, dl, LD->getChain(), LD->getBasePtr(),
+                  LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
+                  LD->isInvariant(), LD->getAlignment());
+  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
+  // The legalizer (the caller) is expecting two values from the legalized
+  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
+  // in LegalizeDAG.cpp which also uses MergeValues.
+  SDValue Ops[] = { result, LD->getChain() };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  EVT ValVT = Op.getOperand(1).getValueType();
+  if (ValVT == MVT::i1)
+    return LowerSTOREi1(Op, DAG);
+  else if (ValVT.isVector())
+    return LowerSTOREVector(Op, DAG);
+  else
+    return SDValue();
+}
+
+SDValue
+NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *N = Op.getNode();
+  SDValue Val = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+  EVT ValVT = Val.getValueType();
+
+  if (ValVT.isVector()) {
+    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+    // legal.  We can (and should) split that into 2 stores of <2 x double> here
+    // but I'm leaving that as a TODO for now.
+    if (!ValVT.isSimple())
+      return SDValue();
+    switch (ValVT.getSimpleVT().SimpleTy) {
+    default:
+      return SDValue();
+    case MVT::v2i8:
+    case MVT::v2i16:
+    case MVT::v2i32:
+    case MVT::v2i64:
+    case MVT::v2f32:
+    case MVT::v2f64:
+    case MVT::v4i8:
+    case MVT::v4i16:
+    case MVT::v4i32:
+    case MVT::v4f32:
+      // This is a "native" vector type
+      break;
+    }
+
+    unsigned Opcode = 0;
+    EVT EltVT = ValVT.getVectorElementType();
+    unsigned NumElts = ValVT.getVectorNumElements();
+
+    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
+    // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+    // stored type to i16 and propogate the "real" type as the memory type.
+    bool NeedExt = false;
+    if (EltVT.getSizeInBits() < 16)
+      NeedExt = true;
+
+    switch (NumElts) {
+    default:
+      return SDValue();
+    case 2:
+      Opcode = NVPTXISD::StoreV2;
+      break;
+    case 4: {
+      Opcode = NVPTXISD::StoreV4;
+      break;
+    }
+    }
+
+    SmallVector<SDValue, 8> Ops;
+
+    // First is the chain
+    Ops.push_back(N->getOperand(0));
+
+    // Then the split values
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+                                   DAG.getIntPtrConstant(i));
+      if (NeedExt)
+        // ANY_EXTEND is correct here since the store will only look at the
+        // lower-order bits anyway.
+        ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+      Ops.push_back(ExtVal);
+    }
+
+    // Then any remaining arguments
+    for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) {
+      Ops.push_back(N->getOperand(i));
+    }
+
+    MemSDNode *MemSD = cast<MemSDNode>(N);
+
+    SDValue NewSt = DAG.getMemIntrinsicNode(
+        Opcode, DL, DAG.getVTList(MVT::Other), &Ops[0], Ops.size(),
+        MemSD->getMemoryVT(), MemSD->getMemOperand());
+
+    //return DCI.CombineTo(N, NewSt, true);
+    return NewSt;
+  }
+
+  return SDValue();
+}
+
+// st i1 v, addr
+//    =>
+// v1 = zxt v to i8
+// st i8, addr
+SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  StoreSDNode *ST = cast<StoreSDNode>(Node);
+  SDValue Tmp1 = ST->getChain();
+  SDValue Tmp2 = ST->getBasePtr();
+  SDValue Tmp3 = ST->getValue();
+  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
+  unsigned Alignment = ST->getAlignment();
+  bool isVolatile = ST->isVolatile();
+  bool isNonTemporal = ST->isNonTemporal();
+  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Tmp3);
+  SDValue Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(),
+                                isVolatile, isNonTemporal, Alignment);
+  return Result;
+}
+
+SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname,
+                                        int idx, EVT v) const {
+  std::string *name = nvTM->getManagedStrPool()->getManagedString(inname);
+  std::stringstream suffix;
+  suffix << idx;
+  *name += suffix.str();
+  return DAG.getTargetExternalSymbol(name->c_str(), v);
+}
+
+SDValue
+NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
+  return getExtSymb(DAG, ".PARAM", idx, v);
+}
+
+SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) {
+  return getExtSymb(DAG, ".HLPPARAM", idx);
+}
+
+// Check to see if the kernel argument is image*_t or sampler_t
+
+bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
+  static const char *const specialTypes[] = { "struct._image2d_t",
+                                              "struct._image3d_t",
+                                              "struct._sampler_t" };
+
+  const Type *Ty = arg->getType();
+  const PointerType *PTy = dyn_cast<PointerType>(Ty);
+
+  if (!PTy)
+    return false;
+
+  if (!context)
+    return false;
+
+  const StructType *STy = dyn_cast<StructType>(PTy->getElementType());
+  const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : "";
+
+  for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i)
+    if (TypeName == specialTypes[i])
+      return true;
+
+  return false;
+}
+
+SDValue NVPTXTargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const DataLayout *TD = getDataLayout();
+
+  const Function *F = MF.getFunction();
+  const AttributeSet &PAL = F->getAttributes();
+
+  SDValue Root = DAG.getRoot();
+  std::vector<SDValue> OutChains;
+
+  bool isKernel = llvm::isKernelFunction(*F);
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+  std::vector<Type *> argTypes;
+  std::vector<const Argument *> theArgs;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I) {
+    theArgs.push_back(I);
+    argTypes.push_back(I->getType());
+  }
+  //assert(argTypes.size() == Ins.size() &&
+  //       "Ins types and function types did not match");
+
+  int idx = 0;
+  for (unsigned i = 0, e = argTypes.size(); i != e; ++i, ++idx) {
+    Type *Ty = argTypes[i];
+    EVT ObjectVT = getValueType(Ty);
+    //assert(ObjectVT == Ins[i].VT &&
+    //       "Ins type did not match function type");
+
+    // If the kernel argument is image*_t or sampler_t, convert it to
+    // a i32 constant holding the parameter position. This can later
+    // matched in the AsmPrinter to output the correct mangled name.
+    if (isImageOrSamplerVal(
+            theArgs[i],
+            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
+                                     : 0))) {
+      assert(isKernel && "Only kernels can have image/sampler params");
+      InVals.push_back(DAG.getConstant(i + 1, MVT::i32));
+      continue;
+    }
+
+    if (theArgs[i]->use_empty()) {
+      // argument is dead
+      if (ObjectVT.isVector()) {
+        EVT EltVT = ObjectVT.getVectorElementType();
+        unsigned NumElts = ObjectVT.getVectorNumElements();
+        for (unsigned vi = 0; vi < NumElts; ++vi) {
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, EltVT));
+        }
+      } else {
+        InVals.push_back(DAG.getNode(ISD::UNDEF, dl, ObjectVT));
+      }
+      continue;
+    }
+
+    // In the following cases, assign a node order of "idx+1"
+    // to newly created nodes. The SDNOdes for params have to
+    // appear in the same order as their order of appearance
+    // in the original function. "idx+1" holds that order.
+    if (PAL.hasAttribute(i + 1, Attribute::ByVal) == false) {
+      if (ObjectVT.isVector()) {
+        unsigned NumElts = ObjectVT.getVectorNumElements();
+        EVT EltVT = ObjectVT.getVectorElementType();
+        unsigned Offset = 0;
+        for (unsigned vi = 0; vi < NumElts; ++vi) {
+          SDValue A = getParamSymbol(DAG, idx, getPointerTy());
+          SDValue B = DAG.getIntPtrConstant(Offset);
+          SDValue Addr = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                     //getParamSymbol(DAG, idx, EltVT),
+                                     //DAG.getConstant(Offset, getPointerTy()));
+                                     A, B);
+          Value *SrcValue = Constant::getNullValue(PointerType::get(
+              EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+          SDValue Ld = DAG.getLoad(
+              EltVT, dl, Root, Addr, MachinePointerInfo(SrcValue), false, false,
+              false,
+              TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
+          Offset += EltVT.getStoreSizeInBits() / 8;
+          InVals.push_back(Ld);
+        }
+        continue;
+      }
+
+      // A plain scalar.
+      if (isABI || isKernel) {
+        // If ABI, load from the param symbol
+        SDValue Arg = getParamSymbol(DAG, idx);
+        // Conjure up a value that we can get the address space from.
+        // FIXME: Using a constant here is a hack.
+        Value *srcValue = Constant::getNullValue(
+            PointerType::get(ObjectVT.getTypeForEVT(F->getContext()),
+                             llvm::ADDRESS_SPACE_PARAM));
+        SDValue p = DAG.getLoad(
+            ObjectVT, dl, Root, Arg, MachinePointerInfo(srcValue), false, false,
+            false,
+            TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+        if (p.getNode())
+          DAG.AssignOrdering(p.getNode(), idx + 1);
+        InVals.push_back(p);
+      } else {
+        // If no ABI, just move the param symbol
+        SDValue Arg = getParamSymbol(DAG, idx, ObjectVT);
+        SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+        if (p.getNode())
+          DAG.AssignOrdering(p.getNode(), idx + 1);
+        InVals.push_back(p);
+      }
+      continue;
+    }
+
+    // Param has ByVal attribute
+    if (isABI || isKernel) {
+      // Return MoveParam(param symbol).
+      // Ideally, the param symbol can be returned directly,
+      // but when SDNode builder decides to use it in a CopyToReg(),
+      // machine instruction fails because TargetExternalSymbol
+      // (not lowered) is target dependent, and CopyToReg assumes
+      // the source is lowered.
+      SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+      SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+      if (p.getNode())
+        DAG.AssignOrdering(p.getNode(), idx + 1);
+      if (isKernel)
+        InVals.push_back(p);
+      else {
+        SDValue p2 = DAG.getNode(
+            ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
+            DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), p);
+        InVals.push_back(p2);
+      }
+    } else {
+      // Have to move a set of param symbols to registers and
+      // store them locally and return the local pointer in InVals
+      const PointerType *elemPtrType = dyn_cast<PointerType>(argTypes[i]);
+      assert(elemPtrType && "Byval parameter should be a pointer type");
+      Type *elemType = elemPtrType->getElementType();
+      // Compute the constituent parts
+      SmallVector<EVT, 16> vtparts;
+      SmallVector<uint64_t, 16> offsets;
+      ComputeValueVTs(*this, elemType, vtparts, &offsets, 0);
+      unsigned totalsize = 0;
+      for (unsigned j = 0, je = vtparts.size(); j != je; ++j)
+        totalsize += vtparts[j].getStoreSizeInBits();
+      SDValue localcopy = DAG.getFrameIndex(
+          MF.getFrameInfo()->CreateStackObject(totalsize / 8, 16, false),
+          getPointerTy());
+      unsigned sizesofar = 0;
+      std::vector<SDValue> theChains;
+      for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+        unsigned numElems = 1;
+        if (vtparts[j].isVector())
+          numElems = vtparts[j].getVectorNumElements();
+        for (unsigned k = 0, ke = numElems; k != ke; ++k) {
+          EVT tmpvt = vtparts[j];
+          if (tmpvt.isVector())
+            tmpvt = tmpvt.getVectorElementType();
+          SDValue arg = DAG.getNode(NVPTXISD::MoveParam, dl, tmpvt,
+                                    getParamSymbol(DAG, idx, tmpvt));
+          SDValue addr =
+              DAG.getNode(ISD::ADD, dl, getPointerTy(), localcopy,
+                          DAG.getConstant(sizesofar, getPointerTy()));
+          theChains.push_back(DAG.getStore(
+              Chain, dl, arg, addr, MachinePointerInfo(), false, false, 0));
+          sizesofar += tmpvt.getStoreSizeInBits() / 8;
+          ++idx;
+        }
+      }
+      --idx;
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &theChains[0],
+                          theChains.size());
+      InVals.push_back(localcopy);
+    }
+  }
+
+  // Clang will check explicit VarArg and issue error if any. However, Clang
+  // will let code with
+  // implicit var arg like f() pass.
+  // We treat this case as if the arg list is empty.
+  //if (F.isVarArg()) {
+  // assert(0 && "VarArg not supported yet!");
+  //}
+
+  if (!OutChains.empty())
+    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &OutChains[0],
+                            OutChains.size()));
+
+  return Chain;
+}
+
+SDValue NVPTXTargetLowering::LowerReturn(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl,
+    SelectionDAG &DAG) const {
+
+  bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+
+  unsigned sizesofar = 0;
+  unsigned idx = 0;
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    SDValue theVal = OutVals[i];
+    EVT theValType = theVal.getValueType();
+    unsigned numElems = 1;
+    if (theValType.isVector())
+      numElems = theValType.getVectorNumElements();
+    for (unsigned j = 0, je = numElems; j != je; ++j) {
+      SDValue tmpval = theVal;
+      if (theValType.isVector())
+        tmpval = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                             theValType.getVectorElementType(), tmpval,
+                             DAG.getIntPtrConstant(j));
+      Chain = DAG.getNode(
+          isABI ? NVPTXISD::StoreRetval : NVPTXISD::MoveToRetval, dl,
+          MVT::Other, Chain, DAG.getConstant(isABI ? sizesofar : idx, MVT::i32),
+          tmpval);
+      if (theValType.isVector())
+        sizesofar += theValType.getVectorElementType().getStoreSizeInBits() / 8;
+      else
+        sizesofar += theValType.getStoreSizeInBits() / 8;
+      ++idx;
+    }
+  }
+
+  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+void NVPTXTargetLowering::LowerAsmOperandForConstraint(
+    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+    SelectionDAG &DAG) const {
+  if (Constraint.length() > 1)
+    return;
+  else
+    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// NVPTX suuport vector of legal types of any length in Intrinsics because the
+// NVPTX specific type legalizer
+// will legalize them to the PTX supported length.
+bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const {
+  if (isTypeLegal(VT))
+    return true;
+  if (VT.isVector()) {
+    MVT eVT = VT.getVectorElementType();
+    if (isTypeLegal(eVT))
+      return true;
+  }
+  return false;
+}
+
+// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
+// TgtMemIntrinsic
+// because we need the information that is only available in the "Value" type
+// of destination
+// pointer. In particular, the address space information.
+bool NVPTXTargetLowering::getTgtMemIntrinsic(
+    IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  default:
+    return false;
+
+  case Intrinsic::nvvm_atomic_load_add_f32:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::f32;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = true;
+    Info.align = 0;
+    return true;
+
+  case Intrinsic::nvvm_atomic_load_inc_32:
+  case Intrinsic::nvvm_atomic_load_dec_32:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i32;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = true;
+    Info.align = 0;
+    return true;
+
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_p:
+
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
+      Info.memVT = MVT::i32;
+    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
+      Info.memVT = getPointerTy();
+    else
+      Info.memVT = MVT::f32;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 0;
+    return true;
+
+  }
+  return false;
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+/// Used to guide target specific optimizations, like loop strength reduction
+/// (LoopStrengthReduce.cpp) and memory optimization for address mode
+/// (CodeGenPrepare.cpp)
+bool NVPTXTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                                Type *Ty) const {
+
+  // AddrMode - This represents an addressing mode of:
+  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
+  //
+  // The legal address modes are
+  // - [avar]
+  // - [areg]
+  // - [areg+immoff]
+  // - [immAddr]
+
+  if (AM.BaseGV) {
+    if (AM.BaseOffs || AM.HasBaseReg || AM.Scale)
+      return false;
+    return true;
+  }
+
+  switch (AM.Scale) {
+  case 0: // "r", "r+i" or "i" is allowed
+    break;
+  case 1:
+    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
+      return false;
+    // Otherwise we have r+i.
+    break;
+  default:
+    // No scale > 1 is allowed
+    return false;
+  }
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//                         NVPTX Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+NVPTXTargetLowering::ConstraintType
+NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:
+      break;
+    case 'r':
+    case 'h':
+    case 'c':
+    case 'l':
+    case 'f':
+    case 'd':
+    case '0':
+    case 'N':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                  EVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'c':
+      return std::make_pair(0U, &NVPTX::Int8RegsRegClass);
+    case 'h':
+      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+    case 'r':
+      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
+    case 'l':
+    case 'N':
+      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
+    case 'f':
+      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
+    case 'd':
+      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
+  return 4;
+}
+
+/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
+static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &Results) {
+  EVT ResVT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+
+  assert(ResVT.isVector() && "Vector load must have vector type");
+
+  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+  // legal.  We can (and should) split that into 2 loads of <2 x double> here
+  // but I'm leaving that as a TODO for now.
+  assert(ResVT.isSimple() && "Can only handle simple types");
+  switch (ResVT.getSimpleVT().SimpleTy) {
+  default:
+    return;
+  case MVT::v2i8:
+  case MVT::v2i16:
+  case MVT::v2i32:
+  case MVT::v2i64:
+  case MVT::v2f32:
+  case MVT::v2f64:
+  case MVT::v4i8:
+  case MVT::v4i16:
+  case MVT::v4i32:
+  case MVT::v4f32:
+    // This is a "native" vector type
+    break;
+  }
+
+  EVT EltVT = ResVT.getVectorElementType();
+  unsigned NumElts = ResVT.getVectorNumElements();
+
+  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+  // loaded type to i16 and propogate the "real" type as the memory type.
+  bool NeedTrunc = false;
+  if (EltVT.getSizeInBits() < 16) {
+    EltVT = MVT::i16;
+    NeedTrunc = true;
+  }
+
+  unsigned Opcode = 0;
+  SDVTList LdResVTs;
+
+  switch (NumElts) {
+  default:
+    return;
+  case 2:
+    Opcode = NVPTXISD::LoadV2;
+    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+    break;
+  case 4: {
+    Opcode = NVPTXISD::LoadV4;
+    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+    LdResVTs = DAG.getVTList(ListVTs, 5);
+    break;
+  }
+  }
+
+  SmallVector<SDValue, 8> OtherOps;
+
+  // Copy regular operands
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    OtherOps.push_back(N->getOperand(i));
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+
+  // The select routine does not have access to the LoadSDNode instance, so
+  // pass along the extension information
+  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
+
+  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, &OtherOps[0],
+                                          OtherOps.size(), LD->getMemoryVT(),
+                                          LD->getMemOperand());
+
+  SmallVector<SDValue, 4> ScalarRes;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Res = NewLD.getValue(i);
+    if (NeedTrunc)
+      Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+    ScalarRes.push_back(Res);
+  }
+
+  SDValue LoadChain = NewLD.getValue(NumElts);
+
+  SDValue BuildVec =
+      DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts);
+
+  Results.push_back(BuildVec);
+  Results.push_back(LoadChain);
+}
+
+static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &Results) {
+  SDValue Chain = N->getOperand(0);
+  SDValue Intrin = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Get the intrinsic ID
+  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
+  switch (IntrinNo) {
+  default:
+    return;
+  case Intrinsic::nvvm_ldg_global_i:
+  case Intrinsic::nvvm_ldg_global_f:
+  case Intrinsic::nvvm_ldg_global_p:
+  case Intrinsic::nvvm_ldu_global_i:
+  case Intrinsic::nvvm_ldu_global_f:
+  case Intrinsic::nvvm_ldu_global_p: {
+    EVT ResVT = N->getValueType(0);
+
+    if (ResVT.isVector()) {
+      // Vector LDG/LDU
+
+      unsigned NumElts = ResVT.getVectorNumElements();
+      EVT EltVT = ResVT.getVectorElementType();
+
+      // Since LDU/LDG are target nodes, we cannot rely on DAG type legalization.
+      // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+      // loaded type to i16 and propogate the "real" type as the memory type.
+      bool NeedTrunc = false;
+      if (EltVT.getSizeInBits() < 16) {
+        EltVT = MVT::i16;
+        NeedTrunc = true;
+      }
+
+      unsigned Opcode = 0;
+      SDVTList LdResVTs;
+
+      switch (NumElts) {
+      default:
+        return;
+      case 2:
+        switch (IntrinNo) {
+        default:
+          return;
+        case Intrinsic::nvvm_ldg_global_i:
+        case Intrinsic::nvvm_ldg_global_f:
+        case Intrinsic::nvvm_ldg_global_p:
+          Opcode = NVPTXISD::LDGV2;
+          break;
+        case Intrinsic::nvvm_ldu_global_i:
+        case Intrinsic::nvvm_ldu_global_f:
+        case Intrinsic::nvvm_ldu_global_p:
+          Opcode = NVPTXISD::LDUV2;
+          break;
+        }
+        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+        break;
+      case 4: {
+        switch (IntrinNo) {
+        default:
+          return;
+        case Intrinsic::nvvm_ldg_global_i:
+        case Intrinsic::nvvm_ldg_global_f:
+        case Intrinsic::nvvm_ldg_global_p:
+          Opcode = NVPTXISD::LDGV4;
+          break;
+        case Intrinsic::nvvm_ldu_global_i:
+        case Intrinsic::nvvm_ldu_global_f:
+        case Intrinsic::nvvm_ldu_global_p:
+          Opcode = NVPTXISD::LDUV4;
+          break;
+        }
+        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+        LdResVTs = DAG.getVTList(ListVTs, 5);
+        break;
+      }
+      }
+
+      SmallVector<SDValue, 8> OtherOps;
+
+      // Copy regular operands
+
+      OtherOps.push_back(Chain); // Chain
+                                 // Skip operand 1 (intrinsic ID)
+                                 // Others
+      for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i)
+        OtherOps.push_back(N->getOperand(i));
+
+      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+      SDValue NewLD = DAG.getMemIntrinsicNode(
+          Opcode, DL, LdResVTs, &OtherOps[0], OtherOps.size(),
+          MemSD->getMemoryVT(), MemSD->getMemOperand());
+
+      SmallVector<SDValue, 4> ScalarRes;
+
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue Res = NewLD.getValue(i);
+        if (NeedTrunc)
+          Res =
+              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+        ScalarRes.push_back(Res);
+      }
+
+      SDValue LoadChain = NewLD.getValue(NumElts);
+
+      SDValue BuildVec =
+          DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts);
+
+      Results.push_back(BuildVec);
+      Results.push_back(LoadChain);
+    } else {
+      // i8 LDG/LDU
+      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
+             "Custom handling of non-i8 ldu/ldg?");
+
+      // Just copy all operands as-is
+      SmallVector<SDValue, 4> Ops;
+      for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+        Ops.push_back(N->getOperand(i));
+
+      // Force output to i16
+      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
+
+      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+      // We make sure the memory type is i8, which will be used during isel
+      // to select the proper instruction.
+      SDValue NewLD =
+          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, &Ops[0],
+                                  Ops.size(), MVT::i8, MemSD->getMemOperand());
+
+      Results.push_back(NewLD.getValue(0));
+      Results.push_back(NewLD.getValue(1));
+    }
+  }
+  }
+}
+
+void NVPTXTargetLowering::ReplaceNodeResults(
+    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    report_fatal_error("Unhandled custom legalization");
+  case ISD::LOAD:
+    ReplaceLoadVector(N, DAG, Results);
+    return;
+  case ISD::INTRINSIC_W_CHAIN:
+    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
+    return;
+  }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
new file mode 100644
index 000000000000..3cd49d38af76
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -0,0 +1,164 @@
+//===-- NVPTXISelLowering.h - NVPTX DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXISELLOWERING_H
+#define NVPTXISELLOWERING_H
+
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+namespace NVPTXISD {
+enum NodeType {
+  // Start the numbering from where ISD NodeType finishes.
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  Wrapper,
+  CALL,
+  RET_FLAG,
+  LOAD_PARAM,
+  NVBuiltin,
+  DeclareParam,
+  DeclareScalarParam,
+  DeclareRetParam,
+  DeclareRet,
+  DeclareScalarRet,
+  LoadParam,
+  StoreParam,
+  StoreParamS32, // to sext and store a <32bit value, not used currently
+  StoreParamU32, // to zext and store a <32bit value, not used currently
+  MoveToParam,
+  PrintCall,
+  PrintCallUni,
+  CallArgBegin,
+  CallArg,
+  LastCallArg,
+  CallArgEnd,
+  CallVoid,
+  CallVal,
+  CallSymbol,
+  Prototype,
+  MoveParam,
+  MoveRetval,
+  MoveToRetval,
+  StoreRetval,
+  PseudoUseParam,
+  RETURN,
+  CallSeqBegin,
+  CallSeqEnd,
+  Dummy,
+
+  LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  LoadV4,
+  LDGV2, // LDG.v2
+  LDGV4, // LDG.v4
+  LDUV2, // LDU.v2
+  LDUV4, // LDU.v4
+  StoreV2,
+  StoreV4
+};
+}
+
+//===--------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===--------------------------------------------------------------------===//
+class NVPTXTargetLowering : public TargetLowering {
+public:
+  explicit NVPTXTargetLowering(NVPTXTargetMachine &TM);
+  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset,
+                             SelectionDAG &DAG) const;
+
+  virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+  bool isTypeSupportedInIntrinsic(MVT VT) const;
+
+  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                          unsigned Intrinsic) const;
+
+  /// isLegalAddressingMode - Return true if the addressing mode represented
+  /// by AM is legal for this target, for a load/store of the specified type
+  /// Used to guide target specific optimizations, like loop strength
+  /// reduction (LoopStrengthReduce.cpp) and memory optimization for
+  /// address mode (CodeGenPrepare.cpp)
+  virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
+
+  /// getFunctionAlignment - Return the Log2 alignment of this function.
+  virtual unsigned getFunctionAlignment(const Function *F) const;
+
+  virtual EVT getSetCCResultType(EVT VT) const {
+    if (VT.isVector())
+      return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    return MVT::i1;
+  }
+
+  ConstraintType getConstraintType(const std::string &Constraint) const;
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+
+  virtual SDValue LowerFormalArguments(
+      SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+      const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG,
+      SmallVectorImpl<SDValue> &InVals) const;
+
+  virtual SDValue
+  LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const;
+
+  std::string getPrototype(Type *, const ArgListTy &,
+                           const SmallVectorImpl<ISD::OutputArg> &,
+                           unsigned retAlignment) const;
+
+  virtual SDValue
+  LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+              const SmallVectorImpl<ISD::OutputArg> &Outs,
+              const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl,
+              SelectionDAG &DAG) const;
+
+  virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                            std::vector<SDValue> &Ops,
+                                            SelectionDAG &DAG) const;
+
+  NVPTXTargetMachine *nvTM;
+
+  // PTX always uses 32-bit shift amounts
+  virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+
+  virtual bool shouldSplitVectorElementType(EVT VT) const;
+
+private:
+  const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
+
+  SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx,
+                     EVT = MVT::i32) const;
+  SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT = MVT::i32) const;
+  SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);
+
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
+
+  virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                  SelectionDAG &DAG) const;
+};
+} // namespace llvm
+
+#endif // NVPTXISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
new file mode 100644
index 000000000000..f11f1b8f96fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrFormats.td
@@ -0,0 +1,43 @@
+//===- NVPTXInstrFormats.td - NVPTX Instruction Formats-------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe NVPTX instructions format
+//
+//===----------------------------------------------------------------------===//
+
+// Vector instruction type enum
+class VecInstTypeEnum<bits<4> val> {
+  bits<4> Value=val;
+}
+def VecNOP : VecInstTypeEnum<0>;
+
+// Generic NVPTX Format
+
+class NVPTXInst<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Instruction {
+  field bits<14> Inst;
+
+  let Namespace = "NVPTX";
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+
+  // TSFlagFields
+  bits<4> VecInstType = VecNOP.Value;
+  bit IsSimpleMove = 0;
+  bit IsLoad = 0;
+  bit IsStore = 0;
+
+  let TSFlags{3-0} = VecInstType;
+  let TSFlags{4-4} = IsSimpleMove;
+  let TSFlags{5-5} = IsLoad;
+  let TSFlags{6-6} = IsStore;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
new file mode 100644
index 000000000000..33a63c26f4e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -0,0 +1,273 @@
+//===- NVPTXInstrInfo.cpp - NVPTX Instruction Information -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXTargetMachine.h"
+#define GET_INSTRINFO_CTOR
+#include "NVPTXGenInstrInfo.inc"
+#include "llvm/IR/Function.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include <cstdio>
+
+using namespace llvm;
+
+// FIXME: Add the subtarget support on this constructor.
+NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
+    : NVPTXGenInstrInfo(), TM(tm), RegInfo(*this, *TM.getSubtargetImpl()) {}
+
+void NVPTXInstrInfo::copyPhysReg(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+    unsigned DestReg, unsigned SrcReg, bool KillSrc) const {
+  if (NVPTX::Int32RegsRegClass.contains(DestReg) &&
+      NVPTX::Int32RegsRegClass.contains(SrcReg))
+    BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (NVPTX::Int8RegsRegClass.contains(DestReg) &&
+           NVPTX::Int8RegsRegClass.contains(SrcReg))
+    BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (NVPTX::Int1RegsRegClass.contains(DestReg) &&
+           NVPTX::Int1RegsRegClass.contains(SrcReg))
+    BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (NVPTX::Float32RegsRegClass.contains(DestReg) &&
+           NVPTX::Float32RegsRegClass.contains(SrcReg))
+    BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (NVPTX::Int16RegsRegClass.contains(DestReg) &&
+           NVPTX::Int16RegsRegClass.contains(SrcReg))
+    BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (NVPTX::Int64RegsRegClass.contains(DestReg) &&
+           NVPTX::Int64RegsRegClass.contains(SrcReg))
+    BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (NVPTX::Float64RegsRegClass.contains(DestReg) &&
+           NVPTX::Float64RegsRegClass.contains(SrcReg))
+    BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+  else {
+    llvm_unreachable("Don't know how to copy a register");
+  }
+}
+
+bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+                                 unsigned &DestReg) const {
+  // Look for the appropriate part of TSFlags
+  bool isMove = false;
+
+  unsigned TSFlags =
+      (MI.getDesc().TSFlags & NVPTX::SimpleMoveMask) >> NVPTX::SimpleMoveShift;
+  isMove = (TSFlags == 1);
+
+  if (isMove) {
+    MachineOperand dest = MI.getOperand(0);
+    MachineOperand src = MI.getOperand(1);
+    assert(dest.isReg() && "dest of a movrr is not a reg");
+    assert(src.isReg() && "src of a movrr is not a reg");
+
+    SrcReg = src.getReg();
+    DestReg = dest.getReg();
+    return true;
+  }
+
+  return false;
+}
+
+bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case NVPTX::INT_PTX_SREG_NTID_X:
+  case NVPTX::INT_PTX_SREG_NTID_Y:
+  case NVPTX::INT_PTX_SREG_NTID_Z:
+  case NVPTX::INT_PTX_SREG_TID_X:
+  case NVPTX::INT_PTX_SREG_TID_Y:
+  case NVPTX::INT_PTX_SREG_TID_Z:
+  case NVPTX::INT_PTX_SREG_CTAID_X:
+  case NVPTX::INT_PTX_SREG_CTAID_Y:
+  case NVPTX::INT_PTX_SREG_CTAID_Z:
+  case NVPTX::INT_PTX_SREG_NCTAID_X:
+  case NVPTX::INT_PTX_SREG_NCTAID_Y:
+  case NVPTX::INT_PTX_SREG_NCTAID_Z:
+  case NVPTX::INT_PTX_SREG_WARPSIZE:
+    return true;
+  }
+}
+
+bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI,
+                                 unsigned &AddrSpace) const {
+  bool isLoad = false;
+  unsigned TSFlags =
+      (MI.getDesc().TSFlags & NVPTX::isLoadMask) >> NVPTX::isLoadShift;
+  isLoad = (TSFlags == 1);
+  if (isLoad)
+    AddrSpace = getLdStCodeAddrSpace(MI);
+  return isLoad;
+}
+
+bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
+                                  unsigned &AddrSpace) const {
+  bool isStore = false;
+  unsigned TSFlags =
+      (MI.getDesc().TSFlags & NVPTX::isStoreMask) >> NVPTX::isStoreShift;
+  isStore = (TSFlags == 1);
+  if (isStore)
+    AddrSpace = getLdStCodeAddrSpace(MI);
+  return isStore;
+}
+
+bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
+  unsigned addrspace = 0;
+  if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS)
+    return false;
+  if (isLoadInstr(*MI, addrspace))
+    if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
+      return false;
+  if (isStoreInstr(*MI, addrspace))
+    if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
+      return false;
+  return true;
+}
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target).  Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+///    just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+///    the destination block.
+/// 3. If this block ends with an conditional branch and it falls through to
+///    an successor block, it sets TBB to be the branch destination block and a
+///    list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+/// 4. If this block ends with an conditional branch and an unconditional
+///    block, it returns the 'true' destination in TBB, the 'false' destination
+///    in FBB, and a list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+///
+/// Note that RemoveBranch and InsertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool NVPTXInstrInfo::AnalyzeBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+    SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastInst->getOpcode() == NVPTX::GOTO) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (LastInst->getOpcode() == NVPTX::CBranch) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with NVPTX::GOTO and NVPTX:CBranch, handle it.
+  if (SecondLastInst->getOpcode() == NVPTX::CBranch &&
+      LastInst->getOpcode() == NVPTX::GOTO) {
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two NVPTX:GOTOs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == NVPTX::GOTO &&
+      LastInst->getOpcode() == NVPTX::GOTO) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned NVPTXInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return 0;
+  --I;
+  if (I->getOpcode() != NVPTX::GOTO && I->getOpcode() != NVPTX::CBranch)
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin())
+    return 1;
+  --I;
+  if (I->getOpcode() != NVPTX::CBranch)
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned NVPTXInstrInfo::InsertBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+    const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "NVPTX branch conditions have two components!");
+
+  // One-way branch.
+  if (FBB == 0) {
+    if (Cond.empty()) // Unconditional branch
+      BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
+    else // Conditional branch
+      BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg())
+          .addMBB(TBB);
+    return 1;
+  }
+
+  // Two-way Conditional Branch.
+  BuildMI(&MBB, DL, get(NVPTX::CBranch)).addReg(Cond[0].getReg()).addMBB(TBB);
+  BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(FBB);
+  return 2;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
new file mode 100644
index 000000000000..b1972e9b7254
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -0,0 +1,78 @@
+//===- NVPTXInstrInfo.h - NVPTX Instruction Information----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the niversity of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXINSTRUCTIONINFO_H
+#define NVPTXINSTRUCTIONINFO_H
+
+#include "NVPTX.h"
+#include "NVPTXRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "NVPTXGenInstrInfo.inc"
+
+namespace llvm {
+
+class NVPTXInstrInfo : public NVPTXGenInstrInfo {
+  NVPTXTargetMachine &TM;
+  const NVPTXRegisterInfo RegInfo;
+public:
+  explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);
+
+  virtual const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
+
+  /* The following virtual functions are used in register allocation.
+   * They are not implemented because the existing interface and the logic
+   * at the caller side do not work for the elementized vector load and store.
+   *
+   * virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+   *                                  int &FrameIndex) const;
+   * virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+   *                                 int &FrameIndex) const;
+   * virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+   *                              MachineBasicBlock::iterator MBBI,
+   *                             unsigned SrcReg, bool isKill, int FrameIndex,
+   *                              const TargetRegisterClass *RC) const;
+   * virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+   *                               MachineBasicBlock::iterator MBBI,
+   *                               unsigned DestReg, int FrameIndex,
+   *                               const TargetRegisterClass *RC) const;
+   */
+
+  virtual void copyPhysReg(
+      MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+      unsigned DestReg, unsigned SrcReg, bool KillSrc) const;
+  virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
+                           unsigned &DestReg) const;
+  bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+  bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
+  bool isReadSpecialReg(MachineInstr &MI) const;
+
+  virtual bool CanTailMerge(const MachineInstr *MI) const;
+  // Branch analysis.
+  virtual bool AnalyzeBranch(
+      MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+      SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(
+      MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+      const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const;
+  unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
+    return MI.getOperand(2).getImm();
+  }
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
new file mode 100644
index 000000000000..f43abe283b58
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -0,0 +1,2877 @@
+//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "NVPTXInstrFormats.td"
+
+// A NOP instruction
+def NOP : NVPTXInst<(outs), (ins), "", []>;
+
+// List of vector specific properties
+def isVecLD      : VecInstTypeEnum<1>;
+def isVecST      : VecInstTypeEnum<2>;
+def isVecBuild   : VecInstTypeEnum<3>;
+def isVecShuffle : VecInstTypeEnum<4>;
+def isVecExtract : VecInstTypeEnum<5>;
+def isVecInsert  : VecInstTypeEnum<6>;
+def isVecDest    : VecInstTypeEnum<7>;
+def isVecOther   : VecInstTypeEnum<15>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+def brtarget    : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget.hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget.hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget.hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+  Predicate<"!Subtarget.hasAtomRedGen32() && Subtarget.hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget.hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget.hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget.hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget.hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+  Predicate<"!Subtarget.hasAtomRedGen64() && Subtarget.hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget.hasAtomAddF32()">;
+def hasVote : Predicate<"Subtarget.hasVote()">;
+def hasDouble : Predicate<"Subtarget.hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget.reqPTX20()">;
+def hasLDG : Predicate<"Subtarget.hasLDG()">;
+def hasLDU : Predicate<"Subtarget.hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"UseF32FTZ">;
+
+def doFMAF32      : Predicate<"doFMAF32">;
+def doFMAF32_ftz  : Predicate<"(doFMAF32 && UseF32FTZ)">;
+def doFMAF32AGG      : Predicate<"doFMAF32AGG">;
+def doFMAF32AGG_ftz  : Predicate<"(doFMAF32AGG && UseF32FTZ)">;
+def doFMAF64      : Predicate<"doFMAF64">;
+def doFMAF64AGG      : Predicate<"doFMAF64AGG">;
+def doFMADF32     : Predicate<"doFMADF32">;
+def doFMADF32_ftz : Predicate<"(doFMADF32 && UseF32FTZ)">;
+
+def doMulWide      : Predicate<"doMulWide">;
+
+def allowFMA : Predicate<"allowFMA">;
+def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
+
+def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
+def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
+
+def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
+
+def true : Predicate<"1">;
+
+//===----------------------------------------------------------------------===//
+// Special Handling for 8-bit Operands and Operations
+//
+// PTX supports 8-bit signed and unsigned types, but does not support 8-bit
+// operations (like add, shift, etc) except for ld/st/cvt. SASS does not have
+// 8-bit registers.
+//
+// PTX ld, st and cvt instructions permit source and destination data operands
+// to be wider than the instruction-type size, so that narrow values may be
+// loaded, stored, and converted using regular-width registers.
+//
+// So in PTX generation, we
+// - always use 16-bit registers in place in 8-bit registers.
+//   (8-bit variables should stay as 8-bit as they represent memory layout.)
+// - for the following 8-bit operations, we sign-ext/zero-ext the 8-bit values
+//   before operation
+//   . div
+//   . rem
+//   . neg (sign)
+//   . set, setp
+//   . shr
+//
+// We are patching the operations by inserting the cvt instructions in the
+// asm strings of the affected instructions.
+//
+// Since vector operations, except for ld/st, are eventually elementized. We
+// do not need to special-hand the vector 8-bit operations.
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Generate string block like
+// {
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   cvt.s16.s8 %temp1, %a;
+//   cvt.s16.s8 %temp2, %b;
+//   opc.s16    %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8rr<string OpcStr, string TypeStr, string CVTStr> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp1;\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp2;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
+             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}"))))))))))));
+}
+
+// Generate string block like
+// {
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   cvt.s16.s8 %temp1, %a;
+//   mov.b16    %temp2, %b;
+//   cvt.s16.s8 %temp2, %temp2;
+//   opc.s16    %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8ri<string OpcStr, string TypeStr, string CVTStr> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp1;\n\t",
+             !strconcat(".reg .",
+             !strconcat(TypeStr, !strconcat(" \t%temp2;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
+             !strconcat("mov.b16 \t%temp2, $b;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp2, %temp2;\n\t",
+             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
+}
+
+// Generate string block like
+// {
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   mov.b16    %temp1, %b;
+//   cvt.s16.s8 %temp1, %temp1;
+//   cvt.s16.s8 %temp2, %a;
+//   opc.s16    %dst, %temp1, %temp2;
+// }
+// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
+class Handle_i8ir<string OpcStr, string TypeStr, string CVTStr> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp1;\n\t",
+             !strconcat(".reg .", !strconcat(TypeStr,
+             !strconcat(" \t%temp2;\n\t",
+             !strconcat("mov.b16 \t%temp1, $a;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp1, %temp1;\n\t",
+             !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
+             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+multiclass I3<string OpcStr, SDNode OpNode> {
+  def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                       Int64Regs:$b))]>;
+  def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                       Int32Regs:$b))]>;
+  def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                       Int16Regs:$b))]>;
+  def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+  def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+  def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>;
+}
+
+multiclass I3_i8<string OpcStr, SDNode OpNode, string TypeStr, string CVTStr> {
+  def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                       Int64Regs:$b))]>;
+  def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                       Int32Regs:$b))]>;
+  def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                       Int16Regs:$b))]>;
+  def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+  def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+                     Handle_i8rr<OpcStr, TypeStr, CVTStr>.s,
+                     [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+  def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+                     Handle_i8ri<OpcStr, TypeStr, CVTStr>.s,
+                     [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>;
+}
+
+multiclass I3_noi8<string OpcStr, SDNode OpNode> {
+  def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                       Int64Regs:$b))]>;
+  def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                       Int32Regs:$b))]>;
+  def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                       Int16Regs:$b))]>;
+  def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
+}
+
+multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
+   def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+       Int32Regs:$b),
+                      !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        Int32Regs:$b))]>;
+   def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+}
+
+multiclass F3<string OpcStr, SDNode OpNode> {
+   def f64rr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b),
+                      !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+                      [(set Float64Regs:$dst,
+                        (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+                      Requires<[allowFMA]>;
+   def f64ri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b),
+                      !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+                      [(set Float64Regs:$dst,
+                        (OpNode Float64Regs:$a, fpimm:$b))]>,
+                      Requires<[allowFMA]>;
+   def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[allowFMA_ftz]>;
+   def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[allowFMA_ftz]>;
+   def f32rr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[allowFMA]>;
+   def f32ri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[allowFMA]>;
+}
+
+multiclass F3_rn<string OpcStr, SDNode OpNode> {
+   def f64rr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b),
+                      !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+                      [(set Float64Regs:$dst,
+                        (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+   def f64ri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b),
+                      !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+                      [(set Float64Regs:$dst,
+                        (OpNode Float64Regs:$a, fpimm:$b))]>;
+   def f32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[doF32FTZ]>;
+   def f32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[doF32FTZ]>;
+   def f32rr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+   def f32ri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+                      [(set Float32Regs:$dst,
+                        (OpNode Float32Regs:$a, fpimm:$b))]>;
+}
+
+multiclass F2<string OpcStr, SDNode OpNode> {
+   def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+                      !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+                      [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+   def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                      !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+                      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+                      Requires<[doF32FTZ]>;
+   def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+                      !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+                      [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instructions.
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Integer Arithmetic
+//-----------------------------------
+
+multiclass ADD_SUB_i1<SDNode OpNode> {
+   def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+          "xor.pred \t$dst, $a, $b;",
+      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+   def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+          "xor.pred \t$dst, $a, $b;",
+      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+}
+
+defm ADD_i1 : ADD_SUB_i1<add>;
+defm SUB_i1 : ADD_SUB_i1<sub>;
+
+
+defm ADD : I3<"add.s", add>;
+defm SUB : I3<"sub.s", sub>;
+
+defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
+defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+
+defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
+defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
+
+//mul.wide PTX instruction
+def SInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  if (v.isSignedIntN(32))
+    return true;
+  return false;
+}]>;
+
+def UInt32Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  if (v.isIntN(32))
+    return true;
+  return false;
+}]>;
+
+def SInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  if (v.isSignedIntN(16))
+    return true;
+  return false;
+}]>;
+
+def UInt16Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  if (v.isIntN(16))
+    return true;
+  return false;
+}]>;
+
+def Int5Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  // Check if 0 <= v < 32
+  // Only then the result from (x << v) will be i32
+  if (v.sge(0) && v.slt(32))
+    return true;
+  return false;
+}]>;
+
+def Int4Const : PatLeaf<(imm), [{
+  const APInt &v = N->getAPIntValue();
+  // Check if 0 <= v < 16
+  // Only then the result from (x << v) will be i16
+  if (v.sge(0) && v.slt(16))
+    return true;
+  return false;
+}]>;
+
+def SHL2MUL32 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(32, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), MVT::i32);
+}]>;
+
+def SHL2MUL16 : SDNodeXForm<imm, [{
+  const APInt &v = N->getAPIntValue();
+  APInt temp(16, 1);
+  return CurDAG->getTargetConstant(temp.shl(v), MVT::i16);
+}]>;
+
+def MULWIDES64 : NVPTXInst<(outs Int64Regs:$dst),
+                           (ins Int32Regs:$a, Int32Regs:$b),
+                           "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm : NVPTXInst<(outs Int64Regs:$dst),
+                            (ins Int32Regs:$a, i64imm:$b),
+                           "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64 : NVPTXInst<(outs Int64Regs:$dst),
+                           (ins Int32Regs:$a, Int32Regs:$b),
+                           "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm : NVPTXInst<(outs Int64Regs:$dst),
+                            (ins Int32Regs:$a, i64imm:$b),
+                           "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32 : NVPTXInst<(outs Int32Regs:$dst),
+                            (ins Int16Regs:$a, Int16Regs:$b),
+                           "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm : NVPTXInst<(outs Int32Regs:$dst),
+                            (ins Int16Regs:$a, i32imm:$b),
+                           "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32 : NVPTXInst<(outs Int32Regs:$dst),
+                            (ins Int16Regs:$a, Int16Regs:$b),
+                           "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm : NVPTXInst<(outs Int32Regs:$dst),
+                            (ins Int16Regs:$a, i32imm:$b),
+                           "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+          Requires<[doMulWide]>;
+def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+          Requires<[doMulWide]>;
+def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
+          (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+          Requires<[doMulWide]>;
+def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
+          (MULWIDES64Imm Int32Regs:$a, (i64 SInt32Const:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
+          (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>, Requires<[doMulWide]>;
+def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
+          (MULWIDEU64Imm Int32Regs:$a, (i64 UInt32Const:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
+          (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
+          (MULWIDES32Imm Int16Regs:$a, (i32 SInt16Const:$b))>,
+          Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
+          (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>, Requires<[doMulWide]>;
+def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
+          (MULWIDEU32Imm Int16Regs:$a, (i32 UInt16Const:$b))>,
+          Requires<[doMulWide]>;
+
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3_noi8<"mul.hi.s", mulhs>;
+defm MULTHU : I3_noi8<"mul.hi.u", mulhu>;
+def MULTHSi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+            !strconcat("{{ \n\t",
+            !strconcat(".reg \t.s16 temp1; \n\t",
+            !strconcat(".reg \t.s16 temp2; \n\t",
+            !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t",
+            !strconcat("cvt.s16.s8 \ttemp2, $b; \n\t",
+            !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t",
+            !strconcat("shr.s16 \t$dst, $dst, 8; \n\t",
+            !strconcat("}}", "")))))))),
+      [(set Int8Regs:$dst, (mulhs Int8Regs:$a, Int8Regs:$b))]>;
+def MULTHSi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+            !strconcat("{{ \n\t",
+            !strconcat(".reg \t.s16 temp1; \n\t",
+            !strconcat(".reg \t.s16 temp2; \n\t",
+            !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t",
+            !strconcat("mov.b16 \ttemp2, $b; \n\t",
+            !strconcat("cvt.s16.s8 \ttemp2, temp2; \n\t",
+            !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t",
+            !strconcat("shr.s16 \t$dst, $dst, 8; \n\t",
+            !strconcat("}}", ""))))))))),
+      [(set Int8Regs:$dst, (mulhs Int8Regs:$a, imm:$b))]>;
+def MULTHUi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+            !strconcat("{{ \n\t",
+            !strconcat(".reg \t.u16 temp1; \n\t",
+            !strconcat(".reg \t.u16 temp2; \n\t",
+            !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t",
+            !strconcat("cvt.u16.u8 \ttemp2, $b; \n\t",
+            !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t",
+            !strconcat("shr.u16 \t$dst, $dst, 8; \n\t",
+            !strconcat("}}", "")))))))),
+      [(set Int8Regs:$dst, (mulhu Int8Regs:$a, Int8Regs:$b))]>;
+def MULTHUi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+            !strconcat("{{ \n\t",
+            !strconcat(".reg \t.u16 temp1; \n\t",
+            !strconcat(".reg \t.u16 temp2; \n\t",
+            !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t",
+            !strconcat("mov.b16 \ttemp2, $b; \n\t",
+            !strconcat("cvt.u16.u8 \ttemp2, temp2; \n\t",
+            !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t",
+            !strconcat("shr.u16 \t$dst, $dst, 8; \n\t",
+            !strconcat("}}", ""))))))))),
+      [(set Int8Regs:$dst, (mulhu Int8Regs:$a, imm:$b))]>;
+
+
+defm SDIV : I3_i8<"div.s", sdiv, "s16", "cvt.s16.s8">;
+defm UDIV : I3_i8<"div.u", udiv, "u16", "cvt.u16.u8">;
+
+defm SREM : I3_i8<"rem.s", srem, "s16", "cvt.s16.s8">;
+// The ri version will not be selected as DAGCombiner::visitSREM will lower it.
+defm UREM : I3_i8<"rem.u", urem, "u16", "cvt.u16.u8">;
+// The ri version will not be selected as DAGCombiner::visitUREM will lower it.
+
+def MAD8rrr : NVPTXInst<(outs Int8Regs:$dst),
+                      (ins Int8Regs:$a, Int8Regs:$b, Int8Regs:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b),
+                        Int8Regs:$c))]>;
+def MAD8rri : NVPTXInst<(outs Int8Regs:$dst),
+                      (ins Int8Regs:$a, Int8Regs:$b, i8imm:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b),
+                        imm:$c))]>;
+def MAD8rir : NVPTXInst<(outs Int8Regs:$dst),
+                      (ins Int8Regs:$a, i8imm:$b, Int8Regs:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b),
+                        Int8Regs:$c))]>;
+def MAD8rii : NVPTXInst<(outs Int8Regs:$dst),
+                      (ins Int8Regs:$a, i8imm:$b, i8imm:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b),
+                        imm:$c))]>;
+
+def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
+                      (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int16Regs:$dst, (add
+                        (mul Int16Regs:$a, Int16Regs:$b), Int16Regs:$c))]>;
+def MAD16rri : NVPTXInst<(outs Int16Regs:$dst),
+                      (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int16Regs:$dst, (add
+                        (mul Int16Regs:$a, Int16Regs:$b), imm:$c))]>;
+def MAD16rir : NVPTXInst<(outs Int16Regs:$dst),
+                      (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int16Regs:$dst, (add
+                        (mul Int16Regs:$a, imm:$b), Int16Regs:$c))]>;
+def MAD16rii : NVPTXInst<(outs Int16Regs:$dst),
+    (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+                      "mad.lo.s16 \t$dst, $a, $b, $c;",
+                      [(set Int16Regs:$dst, (add (mul Int16Regs:$a, imm:$b),
+                        imm:$c))]>;
+
+def MAD32rrr : NVPTXInst<(outs Int32Regs:$dst),
+                      (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+                      "mad.lo.s32 \t$dst, $a, $b, $c;",
+                      [(set Int32Regs:$dst, (add
+                        (mul Int32Regs:$a, Int32Regs:$b), Int32Regs:$c))]>;
+def MAD32rri : NVPTXInst<(outs Int32Regs:$dst),
+                      (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+                      "mad.lo.s32 \t$dst, $a, $b, $c;",
+                      [(set Int32Regs:$dst, (add
+                        (mul Int32Regs:$a, Int32Regs:$b), imm:$c))]>;
+def MAD32rir : NVPTXInst<(outs Int32Regs:$dst),
+                      (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+                      "mad.lo.s32 \t$dst, $a, $b, $c;",
+                      [(set Int32Regs:$dst, (add
+                        (mul Int32Regs:$a, imm:$b), Int32Regs:$c))]>;
+def MAD32rii : NVPTXInst<(outs Int32Regs:$dst),
+                      (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+                      "mad.lo.s32 \t$dst, $a, $b, $c;",
+                      [(set Int32Regs:$dst, (add
+                        (mul Int32Regs:$a, imm:$b), imm:$c))]>;
+
+def MAD64rrr : NVPTXInst<(outs Int64Regs:$dst),
+                      (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+                      "mad.lo.s64 \t$dst, $a, $b, $c;",
+                      [(set Int64Regs:$dst, (add
+                        (mul Int64Regs:$a, Int64Regs:$b), Int64Regs:$c))]>;
+def MAD64rri : NVPTXInst<(outs Int64Regs:$dst),
+                      (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+                      "mad.lo.s64 \t$dst, $a, $b, $c;",
+                      [(set Int64Regs:$dst, (add
+                        (mul Int64Regs:$a, Int64Regs:$b), imm:$c))]>;
+def MAD64rir : NVPTXInst<(outs Int64Regs:$dst),
+                      (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+                      "mad.lo.s64 \t$dst, $a, $b, $c;",
+                      [(set Int64Regs:$dst, (add
+                        (mul Int64Regs:$a, imm:$b), Int64Regs:$c))]>;
+def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
+                      (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+                      "mad.lo.s64 \t$dst, $a, $b, $c;",
+                      [(set Int64Regs:$dst, (add
+                        (mul Int64Regs:$a, imm:$b), imm:$c))]>;
+
+
+def INEG8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
+                     !strconcat("cvt.s16.s8 \t$dst, $src;\n\t",
+                                 "neg.s16 \t$dst, $dst;"),
+         [(set Int8Regs:$dst, (ineg Int8Regs:$src))]>;
+def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                     "neg.s16 \t$dst, $src;",
+         [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                     "neg.s32 \t$dst, $src;",
+         [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                     "neg.s64 \t$dst, $src;",
+         [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+    if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEsingle)
+      return false;
+    float f = (float)N->getValueAPF().convertToFloat();
+    return (f==1.0f);
+}]>;
+// Constand (double)1.0
+def DoubleConst1 : PatLeaf<(fpimm), [{
+    if (&(N->getValueAPF().getSemantics()) != &llvm::APFloat::IEEEdouble)
+      return false;
+    double d = (double)N->getValueAPF().convertToDouble();
+    return (d==1.0);
+}]>;
+
+defm FADD : F3<"add", fadd>;
+defm FSUB : F3<"sub", fsub>;
+defm FMUL : F3<"mul", fmul>;
+
+defm FADD_rn : F3_rn<"add", fadd>;
+defm FSUB_rn : F3_rn<"sub", fsub>;
+defm FMUL_rn : F3_rn<"mul", fmul>;
+
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins f64imm:$a, Float64Regs:$b),
+                      "rcp.rn.f64 \t$dst, $b;",
+                      [(set Float64Regs:$dst,
+                        (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b),
+                      "div.rn.f64 \t$dst, $a, $b;",
+                      [(set Float64Regs:$dst,
+                        (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b),
+                      "div.rn.f64 \t$dst, $a, $b;",
+                      [(set Float64Regs:$dst,
+                        (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins f32imm:$a, Float32Regs:$b),
+                      "rcp.approx.ftz.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r : NVPTXInst<(outs Float32Regs:$dst),
+                        (ins f32imm:$a, Float32Regs:$b),
+                       "rcp.approx.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.approx.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr     : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.approx.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Semi-accurate reciprocal
+//
+// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
+//
+def FDIV321r_approx_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins f32imm:$a, Float32Regs:$b),
+                      "rcp.approx.ftz.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins f32imm:$a, Float32Regs:$b),
+                      "rcp.approx.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL]>;
+//
+// F32 Semi-accurate division
+//
+def FDIV32rr_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.full.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.full.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.full.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[do_DIVF32_FULL]>;
+def FDIV32ri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.full.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[do_DIVF32_FULL]>;
+//
+// F32 Accurate reciprocal
+//
+def FDIV321r_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                        (ins f32imm:$a, Float32Regs:$b),
+                       "rcp.rn.ftz.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins f32imm:$a, Float32Regs:$b),
+                       "rcp.rn.f32 \t$dst, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+                      Requires<[reqPTX20]>;
+//
+// F32 Accurate division
+//
+def FDIV32rr_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.rn.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.rn.ftz.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b),
+                      "div.rn.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+                      Requires<[reqPTX20]>;
+def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b),
+                      "div.rn.f32 \t$dst, $a, $b;",
+                      [(set Float32Regs:$dst,
+                        (fdiv Float32Regs:$a, fpimm:$b))]>,
+                      Requires<[reqPTX20]>;
+
+
+multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
+   def rrr : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, Float32Regs:$b),
+                        Float32Regs:$c))]>, Requires<[Pred]>;
+   // This is to WAR a weird bug in Tablegen that does not automatically
+   // generate the following permutated rule rrr2 from the above rrr.
+   // So we explicitly add it here. This happens to FMA32 only.
+   // See the comments at FMAD32 and FMA32 for more information.
+   def rrr2 : NVPTXInst<(outs Float32Regs:$dst),
+                        (ins Float32Regs:$a, Float32Regs:$b, Float32Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd Float32Regs:$c,
+                        (fmul Float32Regs:$a, Float32Regs:$b)))]>,
+                      Requires<[Pred]>;
+   def rri : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, Float32Regs:$b, f32imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, Float32Regs:$b), fpimm:$c))]>,
+                      Requires<[Pred]>;
+   def rir : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b, Float32Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, fpimm:$b), Float32Regs:$c))]>,
+                      Requires<[Pred]>;
+   def rii : NVPTXInst<(outs Float32Regs:$dst),
+                      (ins Float32Regs:$a, f32imm:$b, f32imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float32Regs:$dst, (fadd
+                        (fmul Float32Regs:$a, fpimm:$b), fpimm:$c))]>,
+                      Requires<[Pred]>;
+}
+
+multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
+   def rrr : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b, Float64Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd
+                        (fmul Float64Regs:$a, Float64Regs:$b),
+                        Float64Regs:$c))]>, Requires<[Pred]>;
+   def rri : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, Float64Regs:$b, f64imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd (fmul Float64Regs:$a,
+                        Float64Regs:$b), fpimm:$c))]>, Requires<[Pred]>;
+   def rir : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b, Float64Regs:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd
+                        (fmul Float64Regs:$a, fpimm:$b), Float64Regs:$c))]>,
+                      Requires<[Pred]>;
+   def rii : NVPTXInst<(outs Float64Regs:$dst),
+                      (ins Float64Regs:$a, f64imm:$b, f64imm:$c),
+                      !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+                      [(set Float64Regs:$dst, (fadd
+                        (fmul Float64Regs:$a, fpimm:$b), fpimm:$c))]>,
+                      Requires<[Pred]>;
+}
+
+// Due to a unknown reason (most likely a bug in tablegen), tablegen does not
+// automatically generate the rrr2 rule from
+// the rrr rule (see FPCONTRACT32) for FMA32, though it does for FMAD32.
+// If we reverse the order of the following two lines, then rrr2 rule will be
+// generated for FMA32, but not for rrr.
+// Therefore, we manually write the rrr2 rule in FPCONTRACT32.
+defm FMAD32_ftz : FPCONTRACT32<"mad.ftz.f32", doFMADF32_ftz>;
+defm FMAD32 : FPCONTRACT32<"mad.f32", doFMADF32>;
+defm FMA32_ftz  : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
+defm FMA32  : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
+defm FMA64  : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
+
+// b*c-a => fmad(b, c, -a)
+multiclass FPCONTRACT32_SUB_PAT_MAD<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
+          (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
+          Requires<[Pred]>;
+}
+
+// a-b*c => fmad(-b,c, a)
+// - legal because a-b*c <=> a+(-b*c) <=> a+(-b)*c
+// b*c-a => fmad(b, c, -a)
+// - legal because b*c-a <=> b*c+(-a)
+multiclass FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub Float32Regs:$a, (fmul Float32Regs:$b, Float32Regs:$c)),
+          (Inst (FNEGf32 Float32Regs:$b), Float32Regs:$c, Float32Regs:$a)>,
+          Requires<[Pred]>;
+  def : Pat<(fsub (fmul Float32Regs:$b, Float32Regs:$c), Float32Regs:$a),
+          (Inst Float32Regs:$b, Float32Regs:$c, (FNEGf32 Float32Regs:$a))>,
+          Requires<[Pred]>;
+}
+
+// a-b*c => fmad(-b,c, a)
+// b*c-a => fmad(b, c, -a)
+multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub Float64Regs:$a, (fmul Float64Regs:$b, Float64Regs:$c)),
+          (Inst (FNEGf64 Float64Regs:$b), Float64Regs:$c, Float64Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul Float64Regs:$b, Float64Regs:$c), Float64Regs:$a),
+          (Inst Float64Regs:$b, Float64Regs:$c, (FNEGf64 Float64Regs:$a))>,
+          Requires<[Pred]>;
+}
+
+defm FMAF32ext_ftz  : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
+defm FMAF32ext  : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
+defm FMADF32ext_ftz : FPCONTRACT32_SUB_PAT_MAD<FMAD32_ftzrrr, doFMADF32_ftz>;
+defm FMADF32ext : FPCONTRACT32_SUB_PAT_MAD<FMAD32rrr, doFMADF32>;
+defm FMAF64ext  : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
+
+def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                      "sin.approx.f32 \t$dst, $src;",
+                      [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
+def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                      "cos.approx.f32 \t$dst, $src;",
+                      [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
+
+//-----------------------------------
+// Logical Arithmetic
+//-----------------------------------
+
+multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> {
+  def b1rr:  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+                      !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
+                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+  def b1ri:  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+                      !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
+                      [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+  def b8rr:  NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+                      !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+  def b8ri:  NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+                      !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>;
+  def b16rr:  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                      !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        Int16Regs:$b))]>;
+  def b16ri:  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                      !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+  def b32rr:  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                      !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        Int32Regs:$b))]>;
+  def b32ri:  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, ".b32  \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def b64rr:  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+                      !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        Int64Regs:$b))]>;
+  def b64ri:  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                      !strconcat(OpcStr, ".b64  \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+}
+
+defm OR  : LOG_FORMAT<"or", or>;
+defm AND : LOG_FORMAT<"and", and>;
+defm XOR : LOG_FORMAT<"xor", xor>;
+
+def NOT1:  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+                      "not.pred \t$dst, $src;",
+                      [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
+def NOT8:  NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
+                      "not.b16 \t$dst, $src;",
+                      [(set Int8Regs:$dst, (not Int8Regs:$src))]>;
+def NOT16:  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                      "not.b16 \t$dst, $src;",
+                      [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
+def NOT32:  NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                      "not.b32 \t$dst, $src;",
+                      [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
+def NOT64:  NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                      "not.b64 \t$dst, $src;",
+                      [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
+
+// For shifts, the second src operand must be 32-bit value
+multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
+   def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        Int32Regs:$b))]>;
+   def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        Int32Regs:$b))]>;
+   def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
+                        (i32 imm:$b)))]>;
+   def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        Int32Regs:$b))]>;
+   def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
+                        Int32Regs:$b))]>;
+   def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
+                        (i32 imm:$b)))]>;
+}
+
+defm SHL : LSHIFT_FORMAT<"shl.b", shl>;
+
+// For shifts, the second src operand must be 32-bit value
+// Need to add cvt for the 8-bits.
+multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode, string CVTStr> {
+   def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        Int32Regs:$b))]>;
+   def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+                      [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        Int32Regs:$b))]>;
+   def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i32ii : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+                      !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+                      [(set Int32Regs:$dst, (OpNode (i32 imm:$a),
+                        (i32 imm:$b)))]>;
+   def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+                      Int32Regs:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        Int32Regs:$b))]>;
+   def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
+                        (i32 imm:$b)))]>;
+   def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b),
+                      !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t",
+                      !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))),
+                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
+                        Int32Regs:$b))]>;
+   def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b),
+                      !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t",
+                      !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))),
+                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
+                        (i32 imm:$b)))]>;
+}
+
+defm SRA : RSHIFT_FORMAT<"shr.s", sra, "cvt.s16.s8">;
+defm SRL : RSHIFT_FORMAT<"shr.u", srl, "cvt.u16.u8">;
+
+// 32bit
+def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
+  (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b32 %lhs;\n\t",
+    !strconcat(".reg .b32 %rhs;\n\t",
+    !strconcat("shl.b32 \t%lhs, $src, $amt1;\n\t",
+    !strconcat("shr.b32 \t%rhs, $src, $amt2;\n\t",
+    !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))),
+    []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+    return CurDAG->getTargetConstant(32-N->getZExtValue(), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>;
+def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+          (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>;
+
+def ROTL32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
+    Int32Regs:$amt),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b32 %lhs;\n\t",
+    !strconcat(".reg .b32 %rhs;\n\t",
+    !strconcat(".reg .b32 %amt2;\n\t",
+    !strconcat("shl.b32 \t%lhs, $src, $amt;\n\t",
+    !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
+    !strconcat("shr.b32 \t%rhs, $src, %amt2;\n\t",
+    !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))))),
+    [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR32reg_sw : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src,
+    Int32Regs:$amt),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b32 %lhs;\n\t",
+    !strconcat(".reg .b32 %rhs;\n\t",
+    !strconcat(".reg .b32 %amt2;\n\t",
+    !strconcat("shr.b32 \t%lhs, $src, $amt;\n\t",
+    !strconcat("sub.s32 \t%amt2, 32, $amt;\n\t",
+    !strconcat("shl.b32 \t%rhs, $src, %amt2;\n\t",
+    !strconcat("add.u32 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))))),
+    [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>;
+
+// 64bit
+def ROT64imm_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+    i32imm:$amt1, i32imm:$amt2),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b64 %lhs;\n\t",
+    !strconcat(".reg .b64 %rhs;\n\t",
+    !strconcat("shl.b64 \t%lhs, $src, $amt1;\n\t",
+    !strconcat("shr.b64 \t%rhs, $src, $amt2;\n\t",
+    !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))),
+    []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+    return CurDAG->getTargetConstant(64-N->getZExtValue(), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+          (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+def ROTL64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+    Int32Regs:$amt),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b64 %lhs;\n\t",
+    !strconcat(".reg .b64 %rhs;\n\t",
+    !strconcat(".reg .u32 %amt2;\n\t",
+    !strconcat("shl.b64 \t%lhs, $src, $amt;\n\t",
+    !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
+    !strconcat("shr.b64 \t%rhs, $src, %amt2;\n\t",
+    !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))))),
+    [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
+    Int32Regs:$amt),
+    !strconcat("{{\n\t",
+    !strconcat(".reg .b64 %lhs;\n\t",
+    !strconcat(".reg .b64 %rhs;\n\t",
+    !strconcat(".reg .u32 %amt2;\n\t",
+    !strconcat("shr.b64 \t%lhs, $src, $amt;\n\t",
+    !strconcat("sub.u32 \t%amt2, 64, $amt;\n\t",
+    !strconcat("shl.b64 \t%rhs, $src, %amt2;\n\t",
+    !strconcat("add.u64 \t$dst, %lhs, %rhs;\n\t",
+    !strconcat("}}", ""))))))))),
+    [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+
+//-----------------------------------
+// Data Movement (Load / Store, Move)
+//-----------------------------------
+
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
+  [SDNPWantRoot]>;
+def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
+  [SDNPWantRoot]>;
+
+def MEMri : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops Int32Regs, i32imm);
+}
+def MEMri64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops Int64Regs, i64imm);
+}
+
+def imem : Operand<iPTR> {
+    let PrintMethod = "printOperand";
+}
+
+def imemAny : Operand<iPTRAny> {
+    let PrintMethod = "printOperand";
+}
+
+def LdStCode : Operand<i32> {
+    let PrintMethod = "printLdStCode";
+}
+
+def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def Wrapper    : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+
+def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
+                     "mov.u32 \t$dst, $a;",
+                     [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
+                     "mov.u64 \t$dst, $a;",
+                     [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+// copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
+let IsSimpleMove=1 in {
+def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+                   "mov.pred \t$dst, $sss;", []>;
+def IMOV8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$sss),
+                    "mov.u16 \t$dst, $sss;", []>;
+def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+                    "mov.u16 \t$dst, $sss;", []>;
+def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+                    "mov.u32 \t$dst, $sss;", []>;
+def IMOV64rr: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+                    "mov.u64 \t$dst, $sss;", []>;
+
+def FMOV32rr: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+                    "mov.f32 \t$dst, $src;", []>;
+def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+                    "mov.f64 \t$dst, $src;", []>;
+}
+def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+                    "mov.pred \t$dst, $src;",
+          [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV8ri: NVPTXInst<(outs Int8Regs:$dst), (ins i8imm:$src),
+                    "mov.u16 \t$dst, $src;",
+          [(set Int8Regs:$dst, imm:$src)]>;
+def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+                    "mov.u16 \t$dst, $src;",
+          [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+                    "mov.u32 \t$dst, $src;",
+          [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i: NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+                    "mov.u64 \t$dst, $src;",
+          [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri: NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+                    "mov.f32 \t$dst, $src;",
+          [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri: NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+                    "mov.f64 \t$dst, $src;",
+          [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
+
+//---- Copy Frame Index ----
+def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+                        "add.u32 \t$dst, ${addr:add};",
+                        [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+                        "add.u64 \t$dst, ${addr:add};",
+                        [(set Int64Regs:$dst, ADDRri64:$addr)]>;
+
+//-----------------------------------
+// Comparison and Selection
+//-----------------------------------
+
+// Generate string block like
+// {
+//   .reg .pred p;
+//   setp.gt.s16 p, %a, %b;
+//   selp.s16 %dst, -1, 0, p;
+// }
+// when OpcStr=setp.gt.s sz1=16 sz2=16 d=%dst a=%a b=%b
+class Set_Str<string OpcStr, string sz1, string sz2, string d, string a,
+  string b> {
+  string t1  = "{{\n\t.reg .pred p;\n\t";
+  string t2  = !strconcat(t1 , OpcStr);
+  string t3  = !strconcat(t2 , sz1);
+  string t4  = !strconcat(t3 , " \tp, ");
+  string t5  = !strconcat(t4 , a);
+  string t6  = !strconcat(t5 , ", ");
+  string t7  = !strconcat(t6 , b);
+  string t8  = !strconcat(t7 , ";\n\tselp.s");
+  string t9  = !strconcat(t8 , sz2);
+  string t10 = !strconcat(t9, " \t");
+  string t11 = !strconcat(t10, d);
+  string s   = !strconcat(t11, ", -1, 0, p;\n\t}}");
+}
+
+// Generate string block like
+// {
+//   .reg .pred p;
+//   .reg .s16 %temp1;
+//   .reg .s16 %temp2;
+//   cvt.s16.s8 %temp1, %a;
+//   cvt s16.s8 %temp1, %b;
+//   setp.gt.s16 p, %temp1, %temp2;
+//   selp.s16 %dst, -1, 0, p;
+// }
+// when OpcStr=setp.gt.s d=%dst a=%a b=%b type=s16 cvt=cvt.s16.s8
+class Set_Stri8<string OpcStr, string d, string a, string b, string type,
+  string cvt> {
+  string t1  = "{{\n\t.reg .pred p;\n\t";
+  string t2  = !strconcat(t1, ".reg .");
+  string t3  = !strconcat(t2, type);
+  string t4  = !strconcat(t3, " %temp1;\n\t");
+  string t5  = !strconcat(t4, ".reg .");
+  string t6  = !strconcat(t5, type);
+  string t7  = !strconcat(t6, " %temp2;\n\t");
+  string t8  = !strconcat(t7, cvt);
+  string t9  = !strconcat(t8, " \t%temp1, ");
+  string t10 = !strconcat(t9, a);
+  string t11 = !strconcat(t10, ";\n\t");
+  string t12 = !strconcat(t11, cvt);
+  string t13 = !strconcat(t12, " \t%temp2, ");
+  string t14 = !strconcat(t13, b);
+  string t15 = !strconcat(t14, ";\n\t");
+  string t16 = !strconcat(t15, OpcStr);
+  string t17 = !strconcat(t16, "16");
+  string t18 = !strconcat(t17, " \tp, %temp1, %temp2;\n\t");
+  string t19 = !strconcat(t18, "selp.s16 \t");
+  string t20 = !strconcat(t19, d);
+  string s   = !strconcat(t20, ", -1, 0, p;\n\t}}");
+}
+
+multiclass ISET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode,
+  string TypeStr, string CVTStr> {
+  def i8rr_toi8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+                     Set_Stri8<OpcStr, "$dst", "$a", "$b", TypeStr, CVTStr>.s,
+               []>;
+  def i16rr_toi16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
+      Int16Regs:$b),
+                     Set_Str<OpcStr, "16", "16", "$dst", "$a", "$b">.s,
+               []>;
+  def i32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+      Int32Regs:$b),
+                     Set_Str<OpcStr, "32", "32", "$dst", "$a", "$b">.s,
+               []>;
+  def i64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
+      Int64Regs:$b),
+                     Set_Str<OpcStr, "64", "64", "$dst", "$a", "$b">.s,
+               []>;
+
+  def i8rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+                     Handle_i8rr<OpcStr, TypeStr, CVTStr>.s,
+               [(set Int1Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+  def i8ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+                     Handle_i8ri<OpcStr, TypeStr, CVTStr>.s,
+               [(set Int1Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>;
+  def i8ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i8imm:$a, Int8Regs:$b),
+                     Handle_i8ir<OpcStr, TypeStr, CVTStr>.s,
+               [(set Int1Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>;
+  def i16rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+                 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+  def i16ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+  def i16ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i16imm:$a, Int16Regs:$b),
+                 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>;
+  def i32rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+                 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+  def i32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i32imm:$a, Int32Regs:$b),
+                 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>;
+  def i64rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+                 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+  def i64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i64imm:$a, Int64Regs:$b),
+                 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>;
+
+  def i8rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
+                     Handle_i8rr<OpcStr_u32, TypeStr, CVTStr>.s,
+               [(set Int32Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
+  def i8ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
+                     Handle_i8ri<OpcStr_u32, TypeStr, CVTStr>.s,
+               [(set Int32Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>;
+  def i8ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i8imm:$a, Int8Regs:$b),
+                     Handle_i8ir<OpcStr_u32, TypeStr, CVTStr>.s,
+               [(set Int32Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>;
+  def i16rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a,
+      Int16Regs:$b),
+                 !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+  def i16ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+                 !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+  def i16ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i16imm:$a, Int16Regs:$b),
+                 !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>;
+  def i32rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
+      Int32Regs:$b),
+                 !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+  def i32ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+                 !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+  def i32ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, Int32Regs:$b),
+                 !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>;
+  def i64rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a,
+      Int64Regs:$b),
+                 !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+  def i64ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+                 !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+  def i64ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i64imm:$a, Int64Regs:$b),
+                 !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>;
+}
+
+multiclass FSET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode> {
+  def f32rr_toi32_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a,
+      Float32Regs:$b),
+                     Set_Str<OpcStr, "ftz.f32", "32", "$dst", "$a", "$b">.s,
+               []>, Requires<[doF32FTZ]>;
+  def f32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a,
+      Float32Regs:$b),
+                     Set_Str<OpcStr, "f32", "32", "$dst", "$a", "$b">.s,
+               []>;
+  def f64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Float64Regs:$a,
+      Float64Regs:$b),
+                     Set_Str<OpcStr, "f64", "64", "$dst", "$a", "$b">.s,
+               []>;
+  def f64rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float64Regs:$a,
+      Float64Regs:$b),
+                     Set_Str<OpcStr, "f64", "32", "$dst", "$a", "$b">.s,
+               []>;
+
+  def f32rr_p_ftz: NVPTXInst<(outs Int1Regs:$dst), (ins Float32Regs:$a
+      , Float32Regs:$b),
+                 !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>
+  , Requires<[doF32FTZ]>;
+  def f32rr_p: NVPTXInst<(outs Int1Regs:$dst),
+    (ins Float32Regs:$a, Float32Regs:$b),
+                 !strconcat(OpcStr, "f32 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+  def f32ri_p_ftz: NVPTXInst<(outs Int1Regs:$dst),
+    (ins Float32Regs:$a, f32imm:$b),
+                 !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+  Requires<[doF32FTZ]>;
+  def f32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
+                 !strconcat(OpcStr, "f32 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+  def f32ir_p_ftz: NVPTXInst<(outs Int1Regs:$dst),
+    (ins f32imm:$a, Float32Regs:$b),
+                 !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>,
+  Requires<[doF32FTZ]>;
+  def f32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
+                 !strconcat(OpcStr, "f32 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>;
+  def f64rr_p: NVPTXInst<(outs Int1Regs:$dst),
+    (ins Float64Regs:$a, Float64Regs:$b),
+                 !strconcat(OpcStr, "f64 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+  def f64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
+                 !strconcat(OpcStr, "f64 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+  def f64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f64imm:$a, Float64Regs:$b),
+                 !strconcat(OpcStr, "f64 \t$dst, $a, $b;"),
+               [(set Int1Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>;
+
+  def f32rr_u32_ftz: NVPTXInst<(outs Int32Regs:$dst),
+    (ins Float32Regs:$a, Float32Regs:$b),
+                 !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+  def f32rr_u32: NVPTXInst<(outs Int32Regs:$dst),
+    (ins Float32Regs:$a, Float32Regs:$b),
+                 !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+  def f32ri_u32_ftz: NVPTXInst<(outs Int32Regs:$dst),
+    (ins Float32Regs:$a, f32imm:$b),
+                 !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+  def f32ri_u32: NVPTXInst<(outs Int32Regs:$dst),
+    (ins Float32Regs:$a, f32imm:$b),
+                 !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+  def f32ir_u32_ftz: NVPTXInst<(outs Int32Regs:$dst),
+    (ins f32imm:$a, Float32Regs:$b),
+                 !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>;
+  def f32ir_u32: NVPTXInst<(outs Int32Regs:$dst),
+    (ins f32imm:$a, Float32Regs:$b),
+                 !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>;
+  def f64rr_u32: NVPTXInst<(outs Int32Regs:$dst),
+    (ins Float64Regs:$a, Float64Regs:$b),
+                 !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+  def f64ri_u32: NVPTXInst<(outs Int32Regs:$dst),
+    (ins Float64Regs:$a, f64imm:$b),
+                 !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+  def f64ir_u32: NVPTXInst<(outs Int32Regs:$dst),
+    (ins f64imm:$a, Float64Regs:$b),
+                 !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"),
+               [(set Int32Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>;
+}
+
+defm ISetSGT
+: ISET_FORMAT<"setp.gt.s", "set.gt.u32.s", setgt, "s16", "cvt.s16.s8">;
+defm ISetUGT
+: ISET_FORMAT<"setp.gt.u", "set.gt.u32.u", setugt, "u16", "cvt.u16.u8">;
+defm ISetSLT
+: ISET_FORMAT<"setp.lt.s", "set.lt.u32.s", setlt, "s16", "cvt.s16.s8">;
+defm ISetULT
+: ISET_FORMAT<"setp.lt.u", "set.lt.u32.u", setult, "u16", "cvt.u16.u8">;
+defm ISetSGE
+: ISET_FORMAT<"setp.ge.s", "set.ge.u32.s", setge, "s16", "cvt.s16.s8">;
+defm ISetUGE
+: ISET_FORMAT<"setp.ge.u", "set.ge.u32.u", setuge, "u16", "cvt.u16.u8">;
+defm ISetSLE
+: ISET_FORMAT<"setp.le.s", "set.le.u32.s", setle, "s16", "cvt.s16.s8">;
+defm ISetULE
+: ISET_FORMAT<"setp.le.u", "set.le.u32.u", setule, "u16", "cvt.u16.u8">;
+defm ISetSEQ
+: ISET_FORMAT<"setp.eq.s", "set.eq.u32.s", seteq, "s16", "cvt.s16.s8">;
+defm ISetUEQ
+: ISET_FORMAT<"setp.eq.u", "set.eq.u32.u", setueq, "u16", "cvt.u16.u8">;
+defm ISetSNE
+: ISET_FORMAT<"setp.ne.s", "set.ne.u32.s", setne, "s16", "cvt.s16.s8">;
+defm ISetUNE
+: ISET_FORMAT<"setp.ne.u", "set.ne.u32.u", setune, "u16", "cvt.u16.u8">;
+
+def ISetSNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
+  (ins Int1Regs:$a, Int1Regs:$b),
+                      "xor.pred \t$dst, $a, $b;",
+            [(set Int1Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>;
+def ISetUNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
+  (ins Int1Regs:$a, Int1Regs:$b),
+                      "xor.pred \t$dst, $a, $b;",
+            [(set Int1Regs:$dst, (setune Int1Regs:$a, Int1Regs:$b))]>;
+def ISetSEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
+  (ins Int1Regs:$a, Int1Regs:$b),
+            !strconcat("{{\n\t",
+            !strconcat(".reg .pred temp;\n\t",
+            !strconcat("xor.pred \ttemp, $a, $b;\n\t",
+            !strconcat("not.pred \t$dst, temp;\n\t}}","")))),
+            [(set Int1Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>;
+def ISetUEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
+  (ins Int1Regs:$a, Int1Regs:$b),
+            !strconcat("{{\n\t",
+            !strconcat(".reg .pred temp;\n\t",
+            !strconcat("xor.pred \ttemp, $a, $b;\n\t",
+            !strconcat("not.pred \t$dst, temp;\n\t}}","")))),
+            [(set Int1Regs:$dst, (setueq Int1Regs:$a, Int1Regs:$b))]>;
+
+// Compare 2 i1's and produce a u32
+def ISETSNEi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst),
+  (ins Int1Regs:$a, Int1Regs:$b),
+                  !strconcat("{{\n\t",
+                  !strconcat(".reg .pred temp;\n\t",
+                  !strconcat("xor.pred \ttemp, $a, $b;\n\t",
+                  !strconcat("selp.u32 \t$dst, -1, 0, temp;", "\n\t}}")))),
+                  [(set Int32Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>;
+def ISETSEQi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst),
+  (ins Int1Regs:$a, Int1Regs:$b),
+                  !strconcat("{{\n\t",
+                  !strconcat(".reg .pred temp;\n\t",
+                  !strconcat("xor.pred \ttemp, $a, $b;\n\t",
+                  !strconcat("selp.u32 \t$dst, 0, -1, temp;", "\n\t}}")))),
+                  [(set Int32Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>;
+
+defm FSetGT : FSET_FORMAT<"setp.gt.", "set.gt.u32.", setogt>;
+defm FSetLT : FSET_FORMAT<"setp.lt.", "set.lt.u32.", setolt>;
+defm FSetGE : FSET_FORMAT<"setp.ge.", "set.ge.u32.", setoge>;
+defm FSetLE : FSET_FORMAT<"setp.le.", "set.le.u32.", setole>;
+defm FSetEQ : FSET_FORMAT<"setp.eq.", "set.eq.u32.", setoeq>;
+defm FSetNE : FSET_FORMAT<"setp.ne.", "set.ne.u32.", setone>;
+
+defm FSetUGT : FSET_FORMAT<"setp.gtu.", "set.gtu.u32.", setugt>;
+defm FSetULT : FSET_FORMAT<"setp.ltu.", "set.ltu.u32.",setult>;
+defm FSetUGE : FSET_FORMAT<"setp.geu.", "set.geu.u32.",setuge>;
+defm FSetULE : FSET_FORMAT<"setp.leu.", "set.leu.u32.",setule>;
+defm FSetUEQ : FSET_FORMAT<"setp.equ.", "set.equ.u32.",setueq>;
+defm FSetUNE : FSET_FORMAT<"setp.neu.", "set.neu.u32.",setune>;
+
+defm FSetNUM : FSET_FORMAT<"setp.num.", "set.num.u32.",seto>;
+defm FSetNAN : FSET_FORMAT<"setp.nan.", "set.nan.u32.",setuo>;
+
+def SELECTi1rr : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
+                     (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
+                             (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
+def SELECTi8rr : NVPTXInst<(outs Int8Regs:$dst),
+  (ins Int8Regs:$a, Int8Regs:$b, Int1Regs:$p),
+                      "selp.b16 \t$dst, $a, $b, $p;",
+      [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, Int8Regs:$b))]>;
+def SELECTi8ri : NVPTXInst<(outs Int8Regs:$dst),
+  (ins Int8Regs:$a, i8imm:$b, Int1Regs:$p),
+                      "selp.b16 \t$dst, $a, $b, $p;",
+      [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, imm:$b))]>;
+def SELECTi8ir : NVPTXInst<(outs Int8Regs:$dst),
+  (ins i8imm:$a, Int8Regs:$b, Int1Regs:$p),
+                      "selp.b16 \t$dst, $a, $b, $p;",
+      [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, Int8Regs:$b))]>;
+def SELECTi8ii : NVPTXInst<(outs Int8Regs:$dst),
+  (ins i8imm:$a, i8imm:$b, Int1Regs:$p),
+                      "selp.b16 \t$dst, $a, $b, $p;",
+      [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
+
+def SELECTi16rr : NVPTXInst<(outs Int16Regs:$dst),
+  (ins Int16Regs:$a, Int16Regs:$b, Int1Regs:$p),
+                      "selp.b16 \t$dst, $a, $b, $p;",
+      [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, Int16Regs:$b))]>;
+def SELECTi16ri : NVPTXInst<(outs Int16Regs:$dst),
+  (ins Int16Regs:$a, i16imm:$b, Int1Regs:$p),
+                      "selp.b16 \t$dst, $a, $b, $p;",
+      [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, imm:$b))]>;
+def SELECTi16ir : NVPTXInst<(outs Int16Regs:$dst),
+  (ins i16imm:$a, Int16Regs:$b, Int1Regs:$p),
+                      "selp.b16 \t$dst, $a, $b, $p;",
+      [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, Int16Regs:$b))]>;
+def SELECTi16ii : NVPTXInst<(outs Int16Regs:$dst),
+  (ins i16imm:$a, i16imm:$b, Int1Regs:$p),
+                      "selp.b16 \t$dst, $a, $b, $p;",
+      [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
+
+def SELECTi32rr : NVPTXInst<(outs Int32Regs:$dst),
+  (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p),
+                      "selp.b32 \t$dst, $a, $b, $p;",
+      [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, Int32Regs:$b))]>;
+def SELECTi32ri : NVPTXInst<(outs Int32Regs:$dst),
+  (ins Int32Regs:$a, i32imm:$b, Int1Regs:$p),
+                      "selp.b32 \t$dst, $a, $b, $p;",
+      [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, imm:$b))]>;
+def SELECTi32ir : NVPTXInst<(outs Int32Regs:$dst),
+  (ins i32imm:$a, Int32Regs:$b, Int1Regs:$p),
+                      "selp.b32 \t$dst, $a, $b, $p;",
+      [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, Int32Regs:$b))]>;
+def SELECTi32ii : NVPTXInst<(outs Int32Regs:$dst),
+  (ins i32imm:$a, i32imm:$b, Int1Regs:$p),
+                      "selp.b32 \t$dst, $a, $b, $p;",
+      [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
+
+def SELECTi64rr : NVPTXInst<(outs Int64Regs:$dst),
+  (ins Int64Regs:$a, Int64Regs:$b, Int1Regs:$p),
+                      "selp.b64 \t$dst, $a, $b, $p;",
+      [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, Int64Regs:$b))]>;
+def SELECTi64ri : NVPTXInst<(outs Int64Regs:$dst),
+  (ins Int64Regs:$a, i64imm:$b, Int1Regs:$p),
+                      "selp.b64 \t$dst, $a, $b, $p;",
+      [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, imm:$b))]>;
+def SELECTi64ir : NVPTXInst<(outs Int64Regs:$dst),
+  (ins i64imm:$a, Int64Regs:$b, Int1Regs:$p),
+                      "selp.b64 \t$dst, $a, $b, $p;",
+      [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, Int64Regs:$b))]>;
+def SELECTi64ii : NVPTXInst<(outs Int64Regs:$dst),
+  (ins i64imm:$a, i64imm:$b, Int1Regs:$p),
+                      "selp.b64 \t$dst, $a, $b, $p;",
+      [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
+
+def SELECTf32rr : NVPTXInst<(outs Float32Regs:$dst),
+  (ins Float32Regs:$a, Float32Regs:$b, Int1Regs:$p),
+                      "selp.f32 \t$dst, $a, $b, $p;",
+      [(set Float32Regs:$dst,
+        (select Int1Regs:$p, Float32Regs:$a, Float32Regs:$b))]>;
+def SELECTf32ri : NVPTXInst<(outs Float32Regs:$dst),
+  (ins Float32Regs:$a, f32imm:$b, Int1Regs:$p),
+                      "selp.f32 \t$dst, $a, $b, $p;",
+      [(set Float32Regs:$dst, (select Int1Regs:$p, Float32Regs:$a, fpimm:$b))]>;
+def SELECTf32ir : NVPTXInst<(outs Float32Regs:$dst),
+  (ins f32imm:$a, Float32Regs:$b, Int1Regs:$p),
+                      "selp.f32 \t$dst, $a, $b, $p;",
+      [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float32Regs:$b))]>;
+def SELECTf32ii : NVPTXInst<(outs Float32Regs:$dst),
+  (ins f32imm:$a, f32imm:$b, Int1Regs:$p),
+                      "selp.f32 \t$dst, $a, $b, $p;",
+      [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>;
+
+def SELECTf64rr : NVPTXInst<(outs Float64Regs:$dst),
+  (ins Float64Regs:$a, Float64Regs:$b, Int1Regs:$p),
+                      "selp.f64 \t$dst, $a, $b, $p;",
+      [(set Float64Regs:$dst,
+        (select Int1Regs:$p, Float64Regs:$a, Float64Regs:$b))]>;
+def SELECTf64ri : NVPTXInst<(outs Float64Regs:$dst),
+  (ins Float64Regs:$a, f64imm:$b, Int1Regs:$p),
+                      "selp.f64 \t$dst, $a, $b, $p;",
+      [(set Float64Regs:$dst, (select Int1Regs:$p, Float64Regs:$a, fpimm:$b))]>;
+def SELECTf64ir : NVPTXInst<(outs Float64Regs:$dst),
+  (ins f64imm:$a, Float64Regs:$b, Int1Regs:$p),
+                      "selp.f64 \t$dst, $a, $b, $p;",
+      [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float64Regs:$b))]>;
+def SELECTf64ii : NVPTXInst<(outs Float64Regs:$dst),
+  (ins f64imm:$a, f64imm:$b, Int1Regs:$p),
+                      "selp.f64 \t $dst, $a, $b, $p;",
+      [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>;
+
+//def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+//                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
+  SDTCisInt<2>]>;
+def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>,
+  SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
+def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
+def SDTCallValProfile : SDTypeProfile<1, 0, []>;
+def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
+def SDTMoveRetvalProfile : SDTypeProfile<0, 1, []>;
+def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
+
+def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareScalarParam : SDNode<"NVPTXISD::DeclareScalarParam",
+  SDTDeclareScalarParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRetParam : SDNode<"NVPTXISD::DeclareRetParam",
+  SDTDeclareParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRet   : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LoadParam    : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+                         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def PrintCall    : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParam   : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveToParam  : SDNode<"NVPTXISD::MoveToParam", SDTStoreParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArg      : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LastCallArg  : SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgEnd   : SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVoid     : SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def Prototype    : SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVal      : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveParam    : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile,
+                         []>;
+def MoveRetval   : SDNode<"NVPTXISD::MoveRetval", SDTMoveRetvalProfile,
+                         [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetval  : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+                         [SDNPHasChain, SDNPSideEffect]>;
+def MoveToRetval : SDNode<"NVPTXISD::MoveToRetval", SDTStoreRetvalProfile,
+                         [SDNPHasChain, SDNPSideEffect]>;
+def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam",
+  SDTPseudoUseParamProfile,
+                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def RETURNNode   : SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+                         [SDNPHasChain, SDNPSideEffect]>;
+
+class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+                !strconcat(!strconcat("ld.param", opstr),
+                "\t$dst, [retval0+$b];"),
+                [(set regclass:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>;
+
+class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+                !strconcat(!strconcat("mov", opstr),
+                "\t$dst, retval$b;"),
+                [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
+
+class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param", opstr),
+                "\t[param$a+$b], $val;"),
+                [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>;
+
+class MoveToParamInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("mov", opstr),
+                "\tparam$a, $val;"),
+                [(MoveToParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>;
+
+class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
+                !strconcat(!strconcat("st.param", opstr),
+                "\t[func_retval0+$a], $val;"),
+                [(StoreRetval (i32 imm:$a), regclass:$val)]>;
+
+class MoveToRetvalInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins i32imm:$num, regclass:$val),
+                !strconcat(!strconcat("mov", opstr),
+                "\tfunc_retval$num, $val;"),
+                [(MoveToRetval (i32 imm:$num), regclass:$val)]>;
+
+class MoveRetvalInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val),
+                !strconcat(!strconcat("mov", opstr),
+                "\tfunc_retval0, $val;"),
+                [(MoveRetval regclass:$val)]>;
+
+def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
+"call (retval0), ",
+                                [(PrintCall (i32 1))]>;
+def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1), ",
+                                [(PrintCall (i32 2))]>;
+def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2), ",
+                                [(PrintCall (i32 3))]>;
+def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3), ",
+                                [(PrintCall (i32 4))]>;
+def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4), ",
+                                [(PrintCall (i32 5))]>;
+def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4, retval5), ",
+                                [(PrintCall (i32 6))]>;
+def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
+"call (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
+                                [(PrintCall (i32 7))]>;
+def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
+!strconcat("call (retval0, retval1, retval2, retval3, retval4",
+           ", retval5, retval6, retval7), "),
+                                [(PrintCall (i32 8))]>;
+
+def PrintCallNoRetInst : NVPTXInst<(outs), (ins), "call ",
+                                [(PrintCall (i32 0))]>;
+
+def PrintCallUniRetInst1 : NVPTXInst<(outs), (ins),
+"call.uni (retval0), ",
+                                [(PrintCallUni (i32 1))]>;
+def PrintCallUniRetInst2 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1), ",
+                                [(PrintCallUni (i32 2))]>;
+def PrintCallUniRetInst3 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2), ",
+                                [(PrintCallUni (i32 3))]>;
+def PrintCallUniRetInst4 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3), ",
+                                [(PrintCallUni (i32 4))]>;
+def PrintCallUniRetInst5 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4), ",
+                                [(PrintCallUni (i32 5))]>;
+def PrintCallUniRetInst6 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4, retval5), ",
+                                [(PrintCallUni (i32 6))]>;
+def PrintCallUniRetInst7 : NVPTXInst<(outs), (ins),
+"call.uni (retval0, retval1, retval2, retval3, retval4, retval5, retval6), ",
+                                [(PrintCallUni (i32 7))]>;
+def PrintCallUniRetInst8 : NVPTXInst<(outs), (ins),
+!strconcat("call.uni (retval0, retval1, retval2, retval3, retval4",
+           ", retval5, retval6, retval7), "),
+                                [(PrintCallUni (i32 8))]>;
+
+def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ",
+                                [(PrintCallUni (i32 0))]>;
+
+def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
+def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
+def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
+def LoadParamMemI8     : LoadParamMemInst<Int8Regs, ".b8">;
+
+//def LoadParamMemI16    : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b),
+//                !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t",
+//                "cvt.u16.u32\t$dst, temp_param_reg;"),
+//                [(set Int16Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>;
+//def LoadParamMemI8     : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b),
+//                !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t",
+//                "cvt.u16.u32\t$dst, temp_param_reg;"),
+//                [(set Int8Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>;
+
+def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
+def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
+
+def LoadParamRegI64    : LoadParamRegInst<Int64Regs, ".b64">;
+def LoadParamRegI32    : LoadParamRegInst<Int32Regs, ".b32">;
+def LoadParamRegI16    : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b),
+                         "cvt.u16.u32\t$dst, retval$b;",
+                         [(set Int16Regs:$dst,
+                           (LoadParam (i32 0), (i32 imm:$b)))]>;
+def LoadParamRegI8     : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b),
+                         "cvt.u16.u32\t$dst, retval$b;",
+                         [(set Int8Regs:$dst,
+                           (LoadParam (i32 0), (i32 imm:$b)))]>;
+
+def LoadParamRegF32    : LoadParamRegInst<Float32Regs, ".f32">;
+def LoadParamRegF64    : LoadParamRegInst<Float64Regs, ".f64">;
+
+def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
+def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
+
+def StoreParamI16    : NVPTXInst<(outs),
+  (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
+                       "st.param.b16\t[param$a+$b], $val;",
+           [(StoreParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
+
+def StoreParamI8     : NVPTXInst<(outs),
+  (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
+                       "st.param.b8\t[param$a+$b], $val;",
+                       [(StoreParam
+                         (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
+
+def StoreParamS32I16 : NVPTXInst<(outs),
+  (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
+                 !strconcat("cvt.s32.s16\ttemp_param_reg, $val;\n\t",
+                            "st.param.b32\t[param$a+$b], temp_param_reg;"),
+                 [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
+def StoreParamU32I16 : NVPTXInst<(outs),
+  (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
+                 !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t",
+                            "st.param.b32\t[param$a+$b], temp_param_reg;"),
+                 [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
+
+def StoreParamU32I8   : NVPTXInst<(outs),
+  (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
+                 !strconcat("cvt.u32.u8\ttemp_param_reg, $val;\n\t",
+                            "st.param.b32\t[param$a+$b], temp_param_reg;"),
+                 [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
+def StoreParamS32I8   : NVPTXInst<(outs),
+  (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
+                 !strconcat("cvt.s32.s8\ttemp_param_reg, $val;\n\t",
+                            "st.param.b32\t[param$a+$b], temp_param_reg;"),
+                 [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
+
+def StoreParamF32    : StoreParamInst<Float32Regs, ".f32">;
+def StoreParamF64    : StoreParamInst<Float64Regs, ".f64">;
+
+def MoveToParamI64   : MoveToParamInst<Int64Regs, ".b64">;
+def MoveToParamI32   : MoveToParamInst<Int32Regs, ".b32">;
+def MoveToParamF64   : MoveToParamInst<Float64Regs, ".f64">;
+def MoveToParamF32   : MoveToParamInst<Float32Regs, ".f32">;
+def MoveToParamI16   : NVPTXInst<(outs),
+  (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
+                   !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t",
+                              "mov.b32\tparam$a, temp_param_reg;"),
+                   [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
+def MoveToParamI8    : NVPTXInst<(outs),
+  (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
+                   !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t",
+                              "mov.b32\tparam$a, temp_param_reg;"),
+                   [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
+
+def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
+def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
+def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
+def StoreRetvalI8     : StoreRetvalInst<Int8Regs, ".b8">;
+
+//def StoreRetvalI16    : NVPTXInst<(outs), (ins Int16Regs:$val, i32imm:$a),
+//     !strconcat("\{\n\t",
+//     !strconcat(".reg .b32 temp_retval_reg;\n\t",
+//     !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t",
+//                "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))),
+//     [(StoreRetval (i32 imm:$a), Int16Regs:$val)]>;
+//def StoreRetvalI8     : NVPTXInst<(outs), (ins Int8Regs:$val, i32imm:$a),
+//     !strconcat("\{\n\t",
+//     !strconcat(".reg .b32 temp_retval_reg;\n\t",
+//     !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t",
+//                "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))),
+//     [(StoreRetval (i32 imm:$a), Int8Regs:$val)]>;
+
+def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
+def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
+
+def MoveRetvalI64    : MoveRetvalInst<Int64Regs, ".b64">;
+def MoveRetvalI32    : MoveRetvalInst<Int32Regs, ".b32">;
+def MoveRetvalI16    : MoveRetvalInst<Int16Regs, ".b16">;
+def MoveRetvalI8     : MoveRetvalInst<Int8Regs, ".b8">;
+def MoveRetvalF64    : MoveRetvalInst<Float64Regs, ".f64">;
+def MoveRetvalF32    : MoveRetvalInst<Float32Regs, ".f32">;
+
+def MoveToRetvalI64    : MoveToRetvalInst<Int64Regs, ".b64">;
+def MoveToRetvalI32    : MoveToRetvalInst<Int32Regs, ".b32">;
+def MoveToRetvalF64    : MoveToRetvalInst<Float64Regs, ".f64">;
+def MoveToRetvalF32    : MoveToRetvalInst<Float32Regs, ".f32">;
+def MoveToRetvalI16    : NVPTXInst<(outs), (ins i32imm:$num, Int16Regs:$val),
+                         "cvt.u32.u16\tfunc_retval$num, $val;",
+                         [(MoveToRetval (i32 imm:$num), Int16Regs:$val)]>;
+def MoveToRetvalI8     : NVPTXInst<(outs), (ins i32imm:$num, Int8Regs:$val),
+                         "cvt.u32.u16\tfunc_retval$num, $val;",
+                         [(MoveToRetval (i32 imm:$num), Int8Regs:$val)]>;
+
+def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
+def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
+def CallArgEndInst0  : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
+def RETURNInst       : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
+
+class CallArgInst<NVPTXRegClass regclass> :
+      NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+                [(CallArg (i32 0), regclass:$a)]>;
+
+class LastCallArgInst<NVPTXRegClass regclass> :
+      NVPTXInst<(outs), (ins regclass:$a), "$a",
+                [(LastCallArg (i32 0), regclass:$a)]>;
+
+def CallArgI64     : CallArgInst<Int64Regs>;
+def CallArgI32     : CallArgInst<Int32Regs>;
+def CallArgI16     : CallArgInst<Int16Regs>;
+def CallArgI8      : CallArgInst<Int8Regs>;
+
+def CallArgF64     : CallArgInst<Float64Regs>;
+def CallArgF32     : CallArgInst<Float32Regs>;
+
+def LastCallArgI64 : LastCallArgInst<Int64Regs>;
+def LastCallArgI32 : LastCallArgInst<Int32Regs>;
+def LastCallArgI16 : LastCallArgInst<Int16Regs>;
+def LastCallArgI8  : LastCallArgInst<Int8Regs>;
+
+def LastCallArgF64 : LastCallArgInst<Float64Regs>;
+def LastCallArgF32 : LastCallArgInst<Float32Regs>;
+
+def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
+                              [(CallArg (i32 0), (i32 imm:$a))]>;
+def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
+                              [(LastCallArg (i32 0), (i32 imm:$a))]>;
+
+def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
+                             [(CallArg (i32 1), (i32 imm:$a))]>;
+def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
+                             [(LastCallArg (i32 1), (i32 imm:$a))]>;
+
+def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr),
+                             "$addr, ",
+                             [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr),
+                             "$addr, ",
+                             [(CallVoid Int32Regs:$addr)]>;
+def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+                             "$addr, ",
+                             [(CallVoid Int64Regs:$addr)]>;
+def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val),
+                             ", prototype_$val;",
+                             [(Prototype (i32 imm:$val))]>;
+
+def DeclareRetMemInst : NVPTXInst<(outs),
+  (ins i32imm:$align, i32imm:$size, i32imm:$num),
+         ".param .align $align .b8 retval$num[$size];",
+         [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetScalarInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+         ".param .b$size retval$num;",
+         [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetRegInst : NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+         ".reg .b$size retval$num;",
+         [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+
+def DeclareParamInst : NVPTXInst<(outs),
+  (ins i32imm:$align, i32imm:$a, i32imm:$size),
+         ".param .align $align .b8 param$a[$size];",
+         [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
+def DeclareScalarParamInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+         ".param .b$size param$a;",
+         [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+def DeclareScalarRegInst : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+         ".reg .b$size param$a;",
+         [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+
+class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
+      NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+                !strconcat(!strconcat("mov", asmstr), "\t$dst, $src;"),
+                [(set regclass:$dst, (MoveParam regclass:$src))]>;
+
+def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
+def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
+def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                   "cvt.u16.u32\t$dst, $src;",
+                   [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+def MoveParamI8  : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
+                   "cvt.u16.u32\t$dst, $src;",
+                   [(set Int8Regs:$dst, (MoveParam Int8Regs:$src))]>;
+def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
+def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
+
+class PseudoUseParamInst<NVPTXRegClass regclass> :
+      NVPTXInst<(outs), (ins regclass:$src),
+      "// Pseudo use of $src",
+      [(PseudoUseParam regclass:$src)]>;
+
+def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
+def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
+def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
+def PseudoUseParamI8  : PseudoUseParamInst<Int8Regs>;
+def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
+def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
+
+
+//
+// Load / Store Handling
+//
+multiclass LD<NVPTXRegClass regclass> {
+  def _avar : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+           "$fromWidth \t$dst, [$addr];"), []>;
+  def _areg : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+           "$fromWidth \t$dst, [$addr];"), []>;
+  def _areg_64 : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr),
+     !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
+                " \t$dst, [$addr];"), []>;
+  def _ari : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+           "$fromWidth \t$dst, [$addr+$offset];"), []>;
+  def _ari_64 : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth",
+               " \t$dst, [$addr+$offset];"), []>;
+  def _asi : NVPTXInst<(outs regclass:$dst),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+!strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+           "$fromWidth \t$dst, [$addr+$offset];"), []>;
+}
+
+let mayLoad=1, neverHasSideEffects=1 in {
+defm LD_i8  : LD<Int8Regs>;
+defm LD_i16 : LD<Int16Regs>;
+defm LD_i32 : LD<Int32Regs>;
+defm LD_i64 : LD<Int64Regs>;
+defm LD_f32 : LD<Float32Regs>;
+defm LD_f64 : LD<Float64Regs>;
+}
+
+multiclass ST<NVPTXRegClass regclass> {
+  def _avar : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+      LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+           " \t[$addr], $src;"), []>;
+  def _areg : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+      LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+           " \t[$addr], $src;"), []>;
+  def _areg_64 : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+     LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+  !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
+               "\t[$addr], $src;"), []>;
+  def _ari : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+      LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+           " \t[$addr+$offset], $src;"), []>;
+  def _ari_64 : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+     LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
+  !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth ",
+               "\t[$addr+$offset], $src;"), []>;
+  def _asi : NVPTXInst<(outs),
+    (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+      LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
+!strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth",
+           " \t[$addr+$offset], $src;"), []>;
+}
+
+let mayStore=1, neverHasSideEffects=1 in {
+defm ST_i8  : ST<Int8Regs>;
+defm ST_i16 : ST<Int16Regs>;
+defm ST_i32 : ST<Int32Regs>;
+defm ST_i64 : ST<Int64Regs>;
+defm ST_f32 : ST<Float32Regs>;
+defm ST_f64 : ST<Float64Regs>;
+}
+
+// The following is used only in and after vector elementizations.
+// Vector elementization happens at the machine instruction level, so the
+// following instruction
+// never appears in the DAG.
+multiclass LD_VEC<NVPTXRegClass regclass> {
+  def _v2_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+  def _v2_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+  def _v2_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr];"), []>;
+  def _v2_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+  def _v2_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+  def _v2_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2}}, [$addr+$offset];"), []>;
+  def _v4_avar : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+      regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+  def _v4_areg : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+      regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+  def _v4_areg_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                               regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];"), []>;
+  def _v4_ari : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+      regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+                []>;
+  def _v4_ari_64 : NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                              regclass:$dst3, regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+    []>;
+  def _v4_asi : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+      regclass:$dst4),
+    (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    !strconcat("ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];"),
+                []>;
+}
+let mayLoad=1, neverHasSideEffects=1 in {
+defm LDV_i8  : LD_VEC<Int8Regs>;
+defm LDV_i16 : LD_VEC<Int16Regs>;
+defm LDV_i32 : LD_VEC<Int32Regs>;
+defm LDV_i64 : LD_VEC<Int64Regs>;
+defm LDV_f32 : LD_VEC<Float32Regs>;
+defm LDV_f64 : LD_VEC<Float64Regs>;
+}
+
+multiclass ST_VEC<NVPTXRegClass regclass> {
+  def _v2_avar : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+      LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+  def _v2_areg : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+      LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+  def _v2_areg_64 : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+     LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2}};"), []>;
+  def _v2_ari : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+      LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+      i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+  def _v2_ari_64 : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+     LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+     i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+  def _v2_asi : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+      LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+      i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2}};"), []>;
+  def _v4_avar : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+      LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+  def _v4_areg : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+      LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+  def _v4_areg_64 : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+     LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr], {{$src1, $src2, $src3, $src4}};"), []>;
+  def _v4_ari : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+      LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+    []>;
+  def _v4_ari_64 : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+     LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+     i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+     []>;
+  def _v4_asi : NVPTXInst<(outs),
+    (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+      LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+      i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+    !strconcat("st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}",
+               "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};"),
+    []>;
+}
+let mayStore=1, neverHasSideEffects=1 in {
+defm STV_i8  : ST_VEC<Int8Regs>;
+defm STV_i16 : ST_VEC<Int16Regs>;
+defm STV_i32 : ST_VEC<Int32Regs>;
+defm STV_i64 : ST_VEC<Int64Regs>;
+defm STV_f32 : ST_VEC<Float32Regs>;
+defm STV_f64 : ST_VEC<Float64Regs>;
+}
+
+
+//---- Conversion ----
+
+multiclass CVT_INT_TO_FP <string OpStr, SDNode OpNode> {
+// FIXME: need to add f16 support
+//  def CVTf16i8 :
+//    NVPTXInst<(outs Float16Regs:$d), (ins Int8Regs:$a),
+//              !strconcat(!strconcat("cvt.rn.f16.", OpStr), "8 \t$d, $a;"),
+//        [(set Float16Regs:$d, (OpNode Int8Regs:$a))]>;
+//  def CVTf16i16 :
+//    NVPTXInst<(outs Float16Regs:$d), (ins Int16Regs:$a),
+//              !strconcat(!strconcat("cvt.rn.f16.", OpStr), "16 \t$d, $a;"),
+//        [(set Float16Regs:$d, (OpNode Int16Regs:$a))]>;
+//  def CVTf16i32 :
+//    NVPTXInst<(outs Float16Regs:$d), (ins Int32Regs:$a),
+//              !strconcat(!strconcat("cvt.rn.f16.", OpStr), "32 \t$d, $a;"),
+//        [(set Float16Regs:$d, (OpNode Int32Regs:$a))]>;
+//  def CVTf16i64:
+//    NVPTXInst<(outs Float16Regs:$d), (ins Int64Regs:$a),
+//          !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"),
+//            [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>;
+
+  def CVTf32i1 :
+    NVPTXInst<(outs Float32Regs:$d), (ins Int1Regs:$a),
+              "selp.f32 \t$d, 1.0, 0.0, $a;",
+        [(set Float32Regs:$d, (OpNode Int1Regs:$a))]>;
+  def CVTf32i8 :
+    NVPTXInst<(outs Float32Regs:$d), (ins Int8Regs:$a),
+              !strconcat(!strconcat("cvt.rn.f32.", OpStr), "8 \t$d, $a;"),
+        [(set Float32Regs:$d, (OpNode Int8Regs:$a))]>;
+  def CVTf32i16 :
+    NVPTXInst<(outs Float32Regs:$d), (ins Int16Regs:$a),
+              !strconcat(!strconcat("cvt.rn.f32.", OpStr), "16 \t$d, $a;"),
+        [(set Float32Regs:$d, (OpNode Int16Regs:$a))]>;
+  def CVTf32i32 :
+    NVPTXInst<(outs Float32Regs:$d), (ins Int32Regs:$a),
+              !strconcat(!strconcat("cvt.rn.f32.", OpStr), "32 \t$d, $a;"),
+        [(set Float32Regs:$d, (OpNode Int32Regs:$a))]>;
+  def CVTf32i64:
+    NVPTXInst<(outs Float32Regs:$d), (ins Int64Regs:$a),
+          !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"),
+            [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>;
+
+  def CVTf64i1 :
+    NVPTXInst<(outs Float64Regs:$d), (ins Int1Regs:$a),
+              "selp.f64 \t$d, 1.0, 0.0, $a;",
+        [(set Float64Regs:$d, (OpNode Int1Regs:$a))]>;
+  def CVTf64i8 :
+    NVPTXInst<(outs Float64Regs:$d), (ins Int8Regs:$a),
+              !strconcat(!strconcat("cvt.rn.f64.", OpStr), "8 \t$d, $a;"),
+        [(set Float64Regs:$d, (OpNode Int8Regs:$a))]>;
+  def CVTf64i16 :
+    NVPTXInst<(outs Float64Regs:$d), (ins Int16Regs:$a),
+              !strconcat(!strconcat("cvt.rn.f64.", OpStr), "16 \t$d, $a;"),
+        [(set Float64Regs:$d, (OpNode Int16Regs:$a))]>;
+  def CVTf64i32 :
+    NVPTXInst<(outs Float64Regs:$d), (ins Int32Regs:$a),
+              !strconcat(!strconcat("cvt.rn.f64.", OpStr), "32 \t$d, $a;"),
+        [(set Float64Regs:$d, (OpNode Int32Regs:$a))]>;
+  def CVTf64i64:
+    NVPTXInst<(outs Float64Regs:$d), (ins Int64Regs:$a),
+          !strconcat(!strconcat("cvt.rn.f64.", OpStr), "64 \t$d, $a;"),
+            [(set Float64Regs:$d, (OpNode Int64Regs:$a))]>;
+}
+
+defm Sint_to_fp : CVT_INT_TO_FP <"s", sint_to_fp>;
+defm Uint_to_fp : CVT_INT_TO_FP <"u", uint_to_fp>;
+
+multiclass CVT_FP_TO_INT <string OpStr, SDNode OpNode> {
+// FIXME: need to add f16 support
+//  def CVTi8f16:
+//    NVPTXInst<(outs Int8Regs:$d), (ins Float16Regs:$a),
+//              !strconcat(!strconcat("cvt.rzi.", OpStr), "8.f16 $d, $a;"),
+//        [(set Int8Regs:$d, (OpNode Float16Regs:$a))]>;
+  def CVTi8f32_ftz:
+    NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"),
+        [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+  def CVTi8f32:
+    NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"),
+        [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>;
+  def CVTi8f64:
+    NVPTXInst<(outs Int8Regs:$d), (ins Float64Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"),
+        [(set Int8Regs:$d, (OpNode Float64Regs:$a))]>;
+
+// FIXME: need to add f16 support
+//  def CVTi16f16:
+//    NVPTXInst<(outs Int16Regs:$d), (ins Float16Regs:$a),
+//              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f16 \t$d, $a;"),
+//        [(set Int16Regs:$d, (OpNode Float16Regs:$a))]>;
+  def CVTi16f32_ftz:
+    NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"),
+        [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+  def CVTi16f32:
+    NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"),
+        [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>;
+  def CVTi16f64:
+    NVPTXInst<(outs Int16Regs:$d), (ins Float64Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"),
+        [(set Int16Regs:$d, (OpNode Float64Regs:$a))]>;
+
+// FIXME: need to add f16 support
+//  def CVTi32f16:  def CVTi32f16:
+//    NVPTXInst<(outs Int32Regs:$d), (ins Float16Regs:$a),
+//              !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f16 \t$d, $a;"),
+//        [(set Int32Regs:$d, (OpNode Float16Regs:$a))]>;
+  def CVTi32f32_ftz:
+    NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "32.f32 \t$d, $a;"),
+        [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+  def CVTi32f32:
+    NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f32 \t$d, $a;"),
+        [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>;
+  def CVTi32f64:
+    NVPTXInst<(outs Int32Regs:$d), (ins Float64Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f64 \t$d, $a;"),
+        [(set Int32Regs:$d, (OpNode Float64Regs:$a))]>;
+
+// FIXME: need to add f16 support
+//  def CVTi64f16:
+//    NVPTXInst<(outs Int64Regs:$d), (ins Float16Regs:$a),
+//              !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f16 \t$d, $a;"),
+//        [(set Int64Regs:$d, (OpNode Float16Regs:$a))]>;
+  def CVTi64f32_ftz:
+    NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "64.f32 \t$d, $a;"),
+        [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+  def CVTi64f32:
+    NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f32 \t$d, $a;"),
+        [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>;
+  def CVTi64f64:
+    NVPTXInst<(outs Int64Regs:$d), (ins Float64Regs:$a),
+              !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f64 \t$d, $a;"),
+        [(set Int64Regs:$d, (OpNode Float64Regs:$a))]>;
+}
+
+defm Fp_to_sint : CVT_FP_TO_INT <"s", fp_to_sint>;
+defm Fp_to_uint : CVT_FP_TO_INT <"u", fp_to_uint>;
+
+multiclass INT_EXTEND_UNSIGNED_1 <SDNode OpNode> {
+  def ext1to8:
+       NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a),
+           "selp.u16 \t$d, 1, 0, $a;",
+     [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to16:
+       NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a),
+           "selp.u16 \t$d, 1, 0, $a;",
+     [(set Int16Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to32:
+       NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
+           "selp.u32 \t$d, 1, 0, $a;",
+     [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to64:
+       NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
+           "selp.u64 \t$d, 1, 0, $a;",
+     [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
+}
+
+multiclass INT_EXTEND_SIGNED_1 <SDNode OpNode> {
+  def ext1to8:
+       NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a),
+           "selp.s16 \t$d, -1, 0, $a;",
+     [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to16:
+       NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a),
+           "selp.s16 \t$d, -1, 0, $a;",
+     [(set Int16Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to32:
+       NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
+           "selp.s32 \t$d, -1, 0, $a;",
+     [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
+  def ext1to64:
+       NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
+           "selp.s64 \t$d, -1, 0, $a;",
+     [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
+}
+
+multiclass INT_EXTEND <string OpStr, SDNode OpNode> {
+  // All Int8Regs are emiited as 16bit registers in ptx.
+  // And there is no selp.u8 in ptx.
+  def ext8to16:
+       NVPTXInst<(outs Int16Regs:$d), (ins Int8Regs:$a),
+           !strconcat("cvt.", !strconcat(OpStr, !strconcat("16.",
+             !strconcat(OpStr, "8 \t$d, $a;")))),
+     [(set Int16Regs:$d, (OpNode Int8Regs:$a))]>;
+  def ext8to32:
+       NVPTXInst<(outs Int32Regs:$d), (ins Int8Regs:$a),
+           !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
+             !strconcat(OpStr, "8 \t$d, $a;")))),
+     [(set Int32Regs:$d, (OpNode Int8Regs:$a))]>;
+  def ext8to64:
+       NVPTXInst<(outs Int64Regs:$d), (ins Int8Regs:$a),
+           !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+             !strconcat(OpStr, "8 \t$d, $a;")))),
+     [(set Int64Regs:$d, (OpNode Int8Regs:$a))]>;
+  def ext16to32:
+       NVPTXInst<(outs Int32Regs:$d), (ins Int16Regs:$a),
+           !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
+             !strconcat(OpStr, "16 \t$d, $a;")))),
+     [(set Int32Regs:$d, (OpNode Int16Regs:$a))]>;
+  def ext16to64:
+       NVPTXInst<(outs Int64Regs:$d), (ins Int16Regs:$a),
+           !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+             !strconcat(OpStr, "16 \t$d, $a;")))),
+     [(set Int64Regs:$d, (OpNode Int16Regs:$a))]>;
+  def ext32to64:
+       NVPTXInst<(outs Int64Regs:$d), (ins Int32Regs:$a),
+           !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
+             !strconcat(OpStr, "32 \t$d, $a;")))),
+     [(set Int64Regs:$d, (OpNode Int32Regs:$a))]>;
+}
+
+defm Sint_extend_1 : INT_EXTEND_SIGNED_1<sext>;
+defm Zint_extend_1 : INT_EXTEND_UNSIGNED_1<zext>;
+defm Aint_extend_1 : INT_EXTEND_UNSIGNED_1<anyext>;
+
+defm Sint_extend : INT_EXTEND <"s", sext>;
+defm Zint_extend : INT_EXTEND <"u", zext>;
+defm Aint_extend : INT_EXTEND <"u", anyext>;
+
+class TRUNC_to1_asm<string sz> {
+  string s = !strconcat("{{\n\t",
+             !strconcat(".reg ",
+             !strconcat(sz,
+             !strconcat(" temp;\n\t",
+             !strconcat("and",
+             !strconcat(sz,
+             !strconcat("\t temp, $a, 1;\n\t",
+             !strconcat("setp",
+             !strconcat(sz, ".eq \t $d, temp, 1;\n\t}}")))))))));
+}
+
+def TRUNC_64to32 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+             "cvt.u32.u64 \t$d, $a;",
+       [(set Int32Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_64to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int64Regs:$a),
+             "cvt.u16.u64 \t$d, $a;",
+       [(set Int16Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_64to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int64Regs:$a),
+             "cvt.u8.u64 \t$d, $a;",
+       [(set Int8Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_32to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int32Regs:$a),
+             "cvt.u16.u32 \t$d, $a;",
+       [(set Int16Regs:$d, (trunc Int32Regs:$a))]>;
+def TRUNC_32to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int32Regs:$a),
+             "cvt.u8.u32 \t$d, $a;",
+       [(set Int8Regs:$d, (trunc Int32Regs:$a))]>;
+def TRUNC_16to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int16Regs:$a),
+             "cvt.u8.u16 \t$d, $a;",
+       [(set Int8Regs:$d, (trunc Int16Regs:$a))]>;
+def TRUNC_64to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+             TRUNC_to1_asm<".b64">.s,
+             [(set Int1Regs:$d, (trunc Int64Regs:$a))]>;
+def TRUNC_32to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
+             TRUNC_to1_asm<".b32">.s,
+             [(set Int1Regs:$d, (trunc Int32Regs:$a))]>;
+def TRUNC_16to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int16Regs:$a),
+             TRUNC_to1_asm<".b16">.s,
+             [(set Int1Regs:$d, (trunc Int16Regs:$a))]>;
+def TRUNC_8to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int8Regs:$a),
+             TRUNC_to1_asm<".b16">.s,
+             [(set Int1Regs:$d, (trunc Int8Regs:$a))]>;
+
+// Select instructions
+def : Pat<(select Int32Regs:$pred, Int8Regs:$a, Int8Regs:$b),
+          (SELECTi8rr Int8Regs:$a, Int8Regs:$b, (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+          (SELECTi16rr Int16Regs:$a, Int16Regs:$b,
+            (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+          (SELECTi32rr Int32Regs:$a, Int32Regs:$b,
+            (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
+          (SELECTi64rr Int64Regs:$a, Int64Regs:$b,
+            (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
+          (SELECTf32rr Float32Regs:$a, Float32Regs:$b,
+            (TRUNC_32to1 Int32Regs:$pred))>;
+def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
+          (SELECTf64rr Float64Regs:$a, Float64Regs:$b,
+            (TRUNC_32to1 Int32Regs:$pred))>;
+
+class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
+  NVPTXRegClass regclassOut> :
+           NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
+           !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
+     [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
+
+def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
+def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
+def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
+def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+
+// pack a set of smaller int registers to a larger int register
+def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d),
+                          (ins Int8Regs:$s1, Int8Regs:$s2,
+                               Int8Regs:$s3, Int8Regs:$s4),
+                          !strconcat("{{\n\t.reg .b8\t%t<4>;",
+                          !strconcat("\n\tcvt.u8.u8\t%t0, $s1;",
+                          !strconcat("\n\tcvt.u8.u8\t%t1, $s2;",
+                          !strconcat("\n\tcvt.u8.u8\t%t2, $s3;",
+                          !strconcat("\n\tcvt.u8.u8\t%t3, $s4;",
+                           "\n\tmov.b32\t$d, {%t0, %t1, %t2, %t3};\n\t}}"))))),
+                          []>;
+def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+                          (ins Int16Regs:$s1, Int16Regs:$s2,
+                               Int16Regs:$s3, Int16Regs:$s4),
+                          "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};",
+                          []>;
+def V2I8toI16 : NVPTXInst<(outs Int16Regs:$d),
+                          (ins Int8Regs:$s1, Int8Regs:$s2),
+                          !strconcat("{{\n\t.reg .b8\t%t<2>;",
+                          !strconcat("\n\tcvt.u8.u8\t%t0, $s1;",
+                          !strconcat("\n\tcvt.u8.u8\t%t1, $s2;",
+                                     "\n\tmov.b16\t$d, {%t0, %t1};\n\t}}"))),
+                          []>;
+def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+                          (ins Int16Regs:$s1, Int16Regs:$s2),
+                          "mov.b32\t$d, {{$s1, $s2}};",
+                          []>;
+def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+                          (ins Int32Regs:$s1, Int32Regs:$s2),
+                          "mov.b64\t$d, {{$s1, $s2}};",
+                          []>;
+def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+                          (ins Float32Regs:$s1, Float32Regs:$s2),
+                          "mov.b64\t$d, {{$s1, $s2}};",
+                          []>;
+
+// unpack a larger int register to a set of smaller int registers
+def I32toV4I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2,
+                                Int8Regs:$d3, Int8Regs:$d4),
+                          (ins Int32Regs:$s),
+                          !strconcat("{{\n\t.reg .b8\t%t<4>;",
+                          !strconcat("\n\tmov.b32\t{%t0, %t1, %t2, %t3}, $s;",
+                          !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
+                          !strconcat("\n\tcvt.u8.u8\t$d2, %t1;",
+                          !strconcat("\n\tcvt.u8.u8\t$d3, %t2;",
+                                     "\n\tcvt.u8.u8\t$d4, %t3;\n\t}}"))))),
+                          []>;
+def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+                                 Int16Regs:$d3, Int16Regs:$d4),
+                           (ins Int64Regs:$s),
+                           "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;",
+                          []>;
+def I16toV2I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2),
+                          (ins Int16Regs:$s),
+                          !strconcat("{{\n\t.reg .b8\t%t<2>;",
+                          !strconcat("\n\tmov.b16\t{%t0, %t1}, $s;",
+                          !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
+                                     "\n\tcvt.u8.u8\t$d2, %t1;\n\t}}"))),
+                          []>;
+def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+                           (ins Int32Regs:$s),
+                           "mov.b32\t{{$d1, $d2}}, $s;",
+                          []>;
+def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+                           (ins Int64Regs:$s),
+                           "mov.b64\t{{$d1, $d2}}, $s;",
+                          []>;
+def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+                           (ins Float64Regs:$s),
+                           "mov.b64\t{{$d1, $d2}}, $s;",
+                          []>;
+
+def FPRound_ftz : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
+            "cvt.rn.ftz.f32.f64 \t$d, $a;",
+      [(set Float32Regs:$d, (fround Float64Regs:$a))]>, Requires<[doF32FTZ]>;
+
+def FPRound : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
+            "cvt.rn.f32.f64 \t$d, $a;",
+      [(set Float32Regs:$d, (fround Float64Regs:$a))]>;
+
+def FPExtend_ftz : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
+            "cvt.ftz.f64.f32 \t$d, $a;",
+      [(set Float64Regs:$d, (fextend Float32Regs:$a))]>, Requires<[doF32FTZ]>;
+
+def FPExtend : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
+            "cvt.f64.f32 \t$d, $a;",
+      [(set Float64Regs:$d, (fextend Float32Regs:$a))]>;
+
+def retflag       : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+                           [SDNPHasChain, SDNPOptInGlue]>;
+
+//-----------------------------------
+// Control-flow
+//-----------------------------------
+
+let isTerminator=1 in {
+   let isReturn=1, isBarrier=1 in
+      def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+
+   let isBranch=1 in
+      def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                          "@$a bra \t$target;",
+                           [(brcond Int1Regs:$a, bb:$target)]>;
+   let isBranch=1 in
+      def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+                          "@!$a bra \t$target;",
+                           []>;
+
+   let isBranch=1, isBarrier=1 in
+      def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+                        "bra.uni \t$target;",
+                  [(br bb:$target)]>;
+}
+
+def : Pat<(brcond Int32Regs:$a, bb:$target), (CBranch
+    (ISetUNEi32ri_p Int32Regs:$a, 0), bb:$target)>;
+
+// SelectionDAGBuilder::visitSWitchCase() will invert the condition of a
+// conditional branch if
+// the target block is the next block so that the code can fall through to the
+// target block.
+// The invertion is done by 'xor condition, 1', which will be translated to
+// (setne condition, -1).
+// Since ptx supports '@!pred bra target', we should use it.
+def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+  (CBranchOther Int1Regs:$a, bb:$target)>;
+
+// Call
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_NVPTXCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_NVPTXCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                           SDNPSideEffect]>;
+
+def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call          : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def calltarget : Operand<i32>;
+let isCall=1 in {
+   def CALL : NVPTXInst<(outs), (ins calltarget:$dst),
+                  "call \t$dst, (1);", []>;
+}
+
+def : Pat<(call tglobaladdr:$dst),
+          (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+          (CALL texternalsym:$dst)>;
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : NVPTXInst<outs, ins, asmstr, pattern>;
+
+// @TODO: We use some tricks here to emit curly braces.  Can we clean this up
+// a bit without TableGen modifications?
+def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt),
+  "// Callseq Start $amt\n\t{{\n\t.reg .b32 temp_param_reg;\n\t// <end>}}",
+                               [(callseq_start timm:$amt)]>;
+def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+  "\n\t//{{\n\t}}// Callseq End $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+// trap instruction
+
+def trapinst : NVPTXInst<(outs), (ins),
+                         "trap;",
+                         [(trap)]>;
+
+include "NVPTXIntrinsics.td"
+
+
+//-----------------------------------
+// Notes
+//-----------------------------------
+// BSWAP is currently expanded. The following is a more efficient
+// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register
+// - for sm_20, use pmpt (use vector scalar mov to get the pack and
+//   unpack). sm_20 supports native 32-bit register, but not native 16-bit
+// register.
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
new file mode 100644
index 000000000000..49e2568dfa2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -0,0 +1,1736 @@
+//===- NVPTXIntrinsics.td - PTX Intrinsics Instructions -------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def immFloat0 : PatLeaf<(fpimm), [{
+    float f = (float)N->getValueAPF().convertToFloat();
+    return (f==0.0f);
+}]>;
+
+def immFloat1 : PatLeaf<(fpimm), [{
+    float f = (float)N->getValueAPF().convertToFloat();
+    return (f==1.0f);
+}]>;
+
+def immDouble0 : PatLeaf<(fpimm), [{
+    double d = (double)N->getValueAPF().convertToDouble();
+    return (d==0.0);
+}]>;
+
+def immDouble1 : PatLeaf<(fpimm), [{
+    double d = (double)N->getValueAPF().convertToDouble();
+    return (d==1.0);
+}]>;
+
+
+
+//-----------------------------------
+// Synchronization Functions
+//-----------------------------------
+def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins),
+                  "bar.sync \t0;",
+      [(int_cuda_syncthreads)]>;
+def INT_BARRIER0 : NVPTXInst<(outs), (ins),
+                  "bar.sync \t0;",
+      [(int_nvvm_barrier0)]>;
+def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+  !strconcat("{{ \n\t",
+      !strconcat(".reg .pred \t%p1; \n\t",
+      !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+      !strconcat("bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
+        !strconcat("}}", ""))))),
+      [(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>;
+def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+  !strconcat("{{ \n\t",
+      !strconcat(".reg .pred \t%p1; \n\t",
+      !strconcat(".reg .pred \t%p2; \n\t",
+      !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+      !strconcat("bar.red.and.pred \t%p2, 0, %p1; \n\t",
+      !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+        !strconcat("}}", ""))))))),
+      [(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>;
+def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
+  !strconcat("{{ \n\t",
+      !strconcat(".reg .pred \t%p1; \n\t",
+      !strconcat(".reg .pred \t%p2; \n\t",
+      !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
+      !strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t",
+      !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
+        !strconcat("}}", ""))))))),
+      [(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
+
+
+//-----------------------------------
+// Explicit Memory Fence Functions
+//-----------------------------------
+class MEMBAR<string StrOp, Intrinsic IntOP> :
+              NVPTXInst<(outs), (ins),
+            StrOp, [(IntOP)]>;
+
+def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
+def INT_MEMBAR_GL  : MEMBAR<"membar.gl;",  int_nvvm_membar_gl>;
+def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
+
+
+//-----------------------------------
+// Math Functions
+//-----------------------------------
+
+// Map min(1.0, max(0.0, x)) to sat(x)
+multiclass SAT<NVPTXRegClass regclass, Operand fimm, Intrinsic IntMinOp,
+  Intrinsic IntMaxOp, PatLeaf f0, PatLeaf f1, string OpStr> {
+
+   // fmin(1.0, fmax(0.0, x)) => sat(x)
+   def SAT11 : NVPTXInst<(outs regclass:$dst),
+     (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+           OpStr,
+     [(set regclass:$dst, (IntMinOp f1:$srcf0 ,
+       (IntMaxOp f0:$srcf1, regclass:$src)))]>;
+
+   // fmin(1.0, fmax(x, 0.0)) => sat(x)
+   def SAT12 : NVPTXInst<(outs regclass:$dst),
+     (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+           OpStr,
+     [(set regclass:$dst, (IntMinOp f1:$srcf0 ,
+       (IntMaxOp regclass:$src, f0:$srcf1)))]>;
+
+   // fmin(fmax(0.0, x), 1.0) => sat(x)
+   def SAT13 : NVPTXInst<(outs regclass:$dst),
+     (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+           OpStr,
+     [(set regclass:$dst, (IntMinOp
+       (IntMaxOp f0:$srcf0, regclass:$src), f1:$srcf1))]>;
+
+   // fmin(fmax(x, 0.0), 1.0) => sat(x)
+   def SAT14 : NVPTXInst<(outs regclass:$dst),
+     (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
+         OpStr,
+     [(set regclass:$dst, (IntMinOp
+       (IntMaxOp regclass:$src, f0:$srcf0), f1:$srcf1))]>;
+
+}
+// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x) because when x
+// is NaN
+// max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.
+// Same story for fmax, fmin.
+
+defm SAT_fmin_fmax_f : SAT<Float32Regs, f32imm, int_nvvm_fmin_f,
+  int_nvvm_fmax_f, immFloat0, immFloat1,
+           "cvt.sat.f32.f32 \t$dst, $src; \n">;
+defm SAT_fmin_fmax_d : SAT<Float64Regs, f64imm, int_nvvm_fmin_d,
+  int_nvvm_fmax_d, immDouble0, immDouble1,
+           "cvt.sat.f64.f64 \t$dst, $src; \n">;
+
+
+// We need a full string for OpcStr here because we need to deal with case like
+// INT_PTX_RECIP.
+class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass,
+  NVPTXRegClass src_regclass, Intrinsic IntOP>
+            : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0),
+            OpcStr,
+        [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>;
+
+// We need a full string for OpcStr here because we need to deal with the case
+// like INT_PTX_NATIVE_POWR_F.
+class F_MATH_2<string OpcStr, NVPTXRegClass t_regclass,
+  NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass, Intrinsic IntOP>
+            : NVPTXInst<(outs t_regclass:$dst),
+              (ins s0_regclass:$src0, s1_regclass:$src1),
+            OpcStr,
+        [(set t_regclass:$dst, (IntOP s0_regclass:$src0, s1_regclass:$src1))]>;
+
+class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
+  NVPTXRegClass s0_regclass, NVPTXRegClass s1_regclass,
+  NVPTXRegClass s2_regclass, Intrinsic IntOP>
+            : NVPTXInst<(outs t_regclass:$dst),
+              (ins s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2),
+            OpcStr,
+        [(set t_regclass:$dst,
+          (IntOP s0_regclass:$src0, s1_regclass:$src1, s2_regclass:$src2))]>;
+
+//
+// MISC
+//
+
+def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_clz_i>;
+def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+  int_nvvm_clz_ll>;
+
+def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_popc_i>;
+def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
+  int_nvvm_popc_ll>;
+
+def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
+  Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
+
+//
+// Min Max
+//
+
+def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_min_i>;
+def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_min_ui>;
+
+def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_min_ll>;
+def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_min_ull>;
+
+def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_max_i>;
+def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_max_ui>;
+
+def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_max_ll>;
+def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_max_ull>;
+
+def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
+  Float32Regs, Float32Regs, int_nvvm_fmin_f>;
+def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmin_ftz_f>;
+
+def INT_NVVM_FMAX_F : F_MATH_2<"max.f32 \t$dst, $src0, $src1;", Float32Regs,
+  Float32Regs, Float32Regs, int_nvvm_fmax_f>;
+def INT_NVVM_FMAX_FTZ_F : F_MATH_2<"max.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_fmax_ftz_f>;
+
+def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
+  Float64Regs, Float64Regs, int_nvvm_fmin_d>;
+def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
+  Float64Regs, Float64Regs, int_nvvm_fmax_d>;
+
+//
+// Multiplication
+//
+
+def INT_NVVM_MULHI_I : F_MATH_2<"mul.hi.s32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_mulhi_i>;
+def INT_NVVM_MULHI_UI : F_MATH_2<"mul.hi.u32 \t$dst, $src0, $src1;", Int32Regs,
+  Int32Regs, Int32Regs, int_nvvm_mulhi_ui>;
+
+def INT_NVVM_MULHI_LL : F_MATH_2<"mul.hi.s64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_mulhi_ll>;
+def INT_NVVM_MULHI_ULL : F_MATH_2<"mul.hi.u64 \t$dst, $src0, $src1;", Int64Regs,
+  Int64Regs, Int64Regs, int_nvvm_mulhi_ull>;
+
+def INT_NVVM_MUL_RN_FTZ_F : F_MATH_2<"mul.rn.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_ftz_f>;
+def INT_NVVM_MUL_RN_F : F_MATH_2<"mul.rn.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rn_f>;
+def INT_NVVM_MUL_RZ_FTZ_F : F_MATH_2<"mul.rz.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_ftz_f>;
+def INT_NVVM_MUL_RZ_F : F_MATH_2<"mul.rz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rz_f>;
+def INT_NVVM_MUL_RM_FTZ_F : F_MATH_2<"mul.rm.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_ftz_f>;
+def INT_NVVM_MUL_RM_F : F_MATH_2<"mul.rm.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rm_f>;
+def INT_NVVM_MUL_RP_FTZ_F : F_MATH_2<"mul.rp.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_ftz_f>;
+def INT_NVVM_MUL_RP_F : F_MATH_2<"mul.rp.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_mul_rp_f>;
+
+def INT_NVVM_MUL_RN_D : F_MATH_2<"mul.rn.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rn_d>;
+def INT_NVVM_MUL_RZ_D : F_MATH_2<"mul.rz.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rz_d>;
+def INT_NVVM_MUL_RM_D : F_MATH_2<"mul.rm.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rm_d>;
+def INT_NVVM_MUL_RP_D : F_MATH_2<"mul.rp.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_mul_rp_d>;
+
+def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32 \t$dst, $src0, $src1;",
+  Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_i>;
+def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32 \t$dst, $src0, $src1;",
+  Int32Regs, Int32Regs, Int32Regs, int_nvvm_mul24_ui>;
+
+//
+// Div
+//
+
+def INT_NVVM_DIV_APPROX_FTZ_F
+  : F_MATH_2<"div.approx.ftz.f32 \t$dst, $src0, $src1;", Float32Regs,
+    Float32Regs, Float32Regs, int_nvvm_div_approx_ftz_f>;
+def INT_NVVM_DIV_APPROX_F : F_MATH_2<"div.approx.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_approx_f>;
+
+def INT_NVVM_DIV_RN_FTZ_F : F_MATH_2<"div.rn.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_ftz_f>;
+def INT_NVVM_DIV_RN_F     : F_MATH_2<"div.rn.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rn_f>;
+def INT_NVVM_DIV_RZ_FTZ_F : F_MATH_2<"div.rz.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_ftz_f>;
+def INT_NVVM_DIV_RZ_F     : F_MATH_2<"div.rz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rz_f>;
+def INT_NVVM_DIV_RM_FTZ_F : F_MATH_2<"div.rm.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_ftz_f>;
+def INT_NVVM_DIV_RM_F     : F_MATH_2<"div.rm.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rm_f>;
+def INT_NVVM_DIV_RP_FTZ_F : F_MATH_2<"div.rp.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_ftz_f>;
+def INT_NVVM_DIV_RP_F     : F_MATH_2<"div.rp.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_div_rp_f>;
+
+def INT_NVVM_DIV_RN_D : F_MATH_2<"div.rn.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rn_d>;
+def INT_NVVM_DIV_RZ_D : F_MATH_2<"div.rz.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rz_d>;
+def INT_NVVM_DIV_RM_D : F_MATH_2<"div.rm.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rm_d>;
+def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
+
+//
+// Brev
+//
+
+def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_brev32>;
+def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs,
+  int_nvvm_brev64>;
+
+//
+// Sad
+//
+
+def INT_NVVM_SAD_I : F_MATH_3<"sad.s32 \t$dst, $src0, $src1, $src2;",
+  Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_i>;
+def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
+  Int32Regs, Int32Regs, Int32Regs, Int32Regs, int_nvvm_sad_ui>;
+
+//
+// Floor  Ceil
+//
+
+def INT_NVVM_FLOOR_FTZ_F : F_MATH_1<"cvt.rmi.ftz.f32.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_floor_ftz_f>;
+def INT_NVVM_FLOOR_F : F_MATH_1<"cvt.rmi.f32.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_floor_f>;
+def INT_NVVM_FLOOR_D : F_MATH_1<"cvt.rmi.f64.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_floor_d>;
+
+def INT_NVVM_CEIL_FTZ_F : F_MATH_1<"cvt.rpi.ftz.f32.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_ceil_ftz_f>;
+def INT_NVVM_CEIL_F : F_MATH_1<"cvt.rpi.f32.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_ceil_f>;
+def INT_NVVM_CEIL_D : F_MATH_1<"cvt.rpi.f64.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_ceil_d>;
+
+//
+// Abs
+//
+
+def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs,
+  int_nvvm_abs_i>;
+def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs,
+  int_nvvm_abs_ll>;
+
+def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_fabs_ftz_f>;
+def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_fabs_f>;
+
+def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_fabs_d>;
+
+//
+// Round
+//
+
+def INT_NVVM_ROUND_FTZ_F : F_MATH_1<"cvt.rni.ftz.f32.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_round_ftz_f>;
+def INT_NVVM_ROUND_F : F_MATH_1<"cvt.rni.f32.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_round_f>;
+
+def INT_NVVM_ROUND_D : F_MATH_1<"cvt.rni.f64.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_round_d>;
+
+//
+// Trunc
+//
+
+def INT_NVVM_TRUNC_FTZ_F : F_MATH_1<"cvt.rzi.ftz.f32.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_trunc_ftz_f>;
+def INT_NVVM_TRUNC_F : F_MATH_1<"cvt.rzi.f32.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_trunc_f>;
+
+def INT_NVVM_TRUNC_D : F_MATH_1<"cvt.rzi.f64.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_trunc_d>;
+
+//
+// Saturate
+//
+
+def INT_NVVM_SATURATE_FTZ_F : F_MATH_1<"cvt.sat.ftz.f32.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_saturate_ftz_f>;
+def INT_NVVM_SATURATE_F : F_MATH_1<"cvt.sat.f32.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_saturate_f>;
+
+def INT_NVVM_SATURATE_D : F_MATH_1<"cvt.sat.f64.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_saturate_d>;
+
+//
+// Exp2  Log2
+//
+
+def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>;
+def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>;
+def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>;
+
+def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>;
+def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>;
+def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>;
+
+//
+// Sin  Cos
+//
+
+def INT_NVVM_SIN_APPROX_FTZ_F : F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>;
+def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sin_approx_f>;
+
+def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>;
+def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;
+
+//
+// Fma
+//
+
+def INT_NVVM_FMA_RN_FTZ_F
+  : F_MATH_3<"fma.rn.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_ftz_f>;
+def INT_NVVM_FMA_RN_F : F_MATH_3<"fma.rn.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rn_f>;
+def INT_NVVM_FMA_RZ_FTZ_F
+  : F_MATH_3<"fma.rz.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_ftz_f>;
+def INT_NVVM_FMA_RZ_F : F_MATH_3<"fma.rz.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rz_f>;
+def INT_NVVM_FMA_RM_FTZ_F
+  : F_MATH_3<"fma.rm.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_ftz_f>;
+def INT_NVVM_FMA_RM_F : F_MATH_3<"fma.rm.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rm_f>;
+def INT_NVVM_FMA_RP_FTZ_F
+  : F_MATH_3<"fma.rp.ftz.f32 \t$dst, $src0, $src1, $src2;", Float32Regs,
+    Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_ftz_f>;
+def INT_NVVM_FMA_RP_F : F_MATH_3<"fma.rp.f32 \t$dst, $src0, $src1, $src2;",
+  Float32Regs, Float32Regs, Float32Regs, Float32Regs, int_nvvm_fma_rp_f>;
+
+def INT_NVVM_FMA_RN_D : F_MATH_3<"fma.rn.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rn_d>;
+def INT_NVVM_FMA_RZ_D : F_MATH_3<"fma.rz.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rz_d>;
+def INT_NVVM_FMA_RM_D : F_MATH_3<"fma.rm.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rm_d>;
+def INT_NVVM_FMA_RP_D : F_MATH_3<"fma.rp.f64 \t$dst, $src0, $src1, $src2;",
+  Float64Regs, Float64Regs, Float64Regs, Float64Regs, int_nvvm_fma_rp_d>;
+
+//
+// Rcp
+//
+
+def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>;
+def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>;
+def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>;
+def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>;
+def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>;
+def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>;
+def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>;
+def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>;
+
+def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rn_d>;
+def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rz_d>;
+def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rm_d>;
+def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_rcp_rp_d>;
+
+def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>;
+
+//
+// Sqrt
+//
+
+def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>;
+def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rn_f>;
+def INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>;
+def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rz_f>;
+def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>;
+def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rm_f>;
+def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>;
+def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs,
+  Float32Regs, int_nvvm_sqrt_rp_f>;
+def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>;
+def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>;
+
+def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rn_d>;
+def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rz_d>;
+def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rm_d>;
+def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
+  Float64Regs, int_nvvm_sqrt_rp_d>;
+
+//
+// Rsqrt
+//
+
+def INT_NVVM_RSQRT_APPROX_FTZ_F
+  : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs,
+    int_nvvm_rsqrt_approx_ftz_f>;
+def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;",
+  Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>;
+def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;",
+  Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>;
+
+//
+// Add
+//
+
+def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_ftz_f>;
+def INT_NVVM_ADD_RN_F : F_MATH_2<"add.rn.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rn_f>;
+def INT_NVVM_ADD_RZ_FTZ_F : F_MATH_2<"add.rz.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_ftz_f>;
+def INT_NVVM_ADD_RZ_F : F_MATH_2<"add.rz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rz_f>;
+def INT_NVVM_ADD_RM_FTZ_F : F_MATH_2<"add.rm.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_ftz_f>;
+def INT_NVVM_ADD_RM_F : F_MATH_2<"add.rm.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rm_f>;
+def INT_NVVM_ADD_RP_FTZ_F : F_MATH_2<"add.rp.ftz.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_ftz_f>;
+def INT_NVVM_ADD_RP_F : F_MATH_2<"add.rp.f32 \t$dst, $src0, $src1;",
+  Float32Regs, Float32Regs, Float32Regs, int_nvvm_add_rp_f>;
+
+def INT_NVVM_ADD_RN_D : F_MATH_2<"add.rn.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rn_d>;
+def INT_NVVM_ADD_RZ_D : F_MATH_2<"add.rz.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rz_d>;
+def INT_NVVM_ADD_RM_D : F_MATH_2<"add.rm.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rm_d>;
+def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
+  Float64Regs, Float64Regs, Float64Regs, int_nvvm_add_rp_d>;
+
+//
+// Convert
+//
+
+def INT_NVVM_D2F_RN_FTZ : F_MATH_1<"cvt.rn.ftz.f32.f64 \t$dst, $src0;",
+  Float32Regs, Float64Regs, int_nvvm_d2f_rn_ftz>;
+def INT_NVVM_D2F_RN : F_MATH_1<"cvt.rn.f32.f64 \t$dst, $src0;",
+  Float32Regs, Float64Regs, int_nvvm_d2f_rn>;
+def INT_NVVM_D2F_RZ_FTZ : F_MATH_1<"cvt.rz.ftz.f32.f64 \t$dst, $src0;",
+  Float32Regs, Float64Regs, int_nvvm_d2f_rz_ftz>;
+def INT_NVVM_D2F_RZ : F_MATH_1<"cvt.rz.f32.f64 \t$dst, $src0;",
+  Float32Regs, Float64Regs, int_nvvm_d2f_rz>;
+def INT_NVVM_D2F_RM_FTZ : F_MATH_1<"cvt.rm.ftz.f32.f64 \t$dst, $src0;",
+  Float32Regs, Float64Regs, int_nvvm_d2f_rm_ftz>;
+def INT_NVVM_D2F_RM : F_MATH_1<"cvt.rm.f32.f64 \t$dst, $src0;",
+  Float32Regs, Float64Regs, int_nvvm_d2f_rm>;
+def INT_NVVM_D2F_RP_FTZ : F_MATH_1<"cvt.rp.ftz.f32.f64 \t$dst, $src0;",
+  Float32Regs, Float64Regs, int_nvvm_d2f_rp_ftz>;
+def INT_NVVM_D2F_RP : F_MATH_1<"cvt.rp.f32.f64 \t$dst, $src0;",
+  Float32Regs, Float64Regs, int_nvvm_d2f_rp>;
+
+def INT_NVVM_D2I_RN : F_MATH_1<"cvt.rni.s32.f64 \t$dst, $src0;",
+  Int32Regs, Float64Regs, int_nvvm_d2i_rn>;
+def INT_NVVM_D2I_RZ : F_MATH_1<"cvt.rzi.s32.f64 \t$dst, $src0;",
+  Int32Regs, Float64Regs, int_nvvm_d2i_rz>;
+def INT_NVVM_D2I_RM : F_MATH_1<"cvt.rmi.s32.f64 \t$dst, $src0;",
+  Int32Regs, Float64Regs, int_nvvm_d2i_rm>;
+def INT_NVVM_D2I_RP : F_MATH_1<"cvt.rpi.s32.f64 \t$dst, $src0;",
+  Int32Regs, Float64Regs, int_nvvm_d2i_rp>;
+
+def INT_NVVM_D2UI_RN : F_MATH_1<"cvt.rni.u32.f64 \t$dst, $src0;",
+  Int32Regs, Float64Regs, int_nvvm_d2ui_rn>;
+def INT_NVVM_D2UI_RZ : F_MATH_1<"cvt.rzi.u32.f64 \t$dst, $src0;",
+  Int32Regs, Float64Regs, int_nvvm_d2ui_rz>;
+def INT_NVVM_D2UI_RM : F_MATH_1<"cvt.rmi.u32.f64 \t$dst, $src0;",
+  Int32Regs, Float64Regs, int_nvvm_d2ui_rm>;
+def INT_NVVM_D2UI_RP : F_MATH_1<"cvt.rpi.u32.f64 \t$dst, $src0;",
+  Int32Regs, Float64Regs, int_nvvm_d2ui_rp>;
+
+def INT_NVVM_I2D_RN : F_MATH_1<"cvt.rn.f64.s32 \t$dst, $src0;",
+  Float64Regs, Int32Regs, int_nvvm_i2d_rn>;
+def INT_NVVM_I2D_RZ : F_MATH_1<"cvt.rz.f64.s32 \t$dst, $src0;",
+  Float64Regs, Int32Regs, int_nvvm_i2d_rz>;
+def INT_NVVM_I2D_RM : F_MATH_1<"cvt.rm.f64.s32 \t$dst, $src0;",
+  Float64Regs, Int32Regs, int_nvvm_i2d_rm>;
+def INT_NVVM_I2D_RP : F_MATH_1<"cvt.rp.f64.s32 \t$dst, $src0;",
+  Float64Regs, Int32Regs, int_nvvm_i2d_rp>;
+
+def INT_NVVM_UI2D_RN : F_MATH_1<"cvt.rn.f64.u32 \t$dst, $src0;",
+  Float64Regs, Int32Regs, int_nvvm_ui2d_rn>;
+def INT_NVVM_UI2D_RZ : F_MATH_1<"cvt.rz.f64.u32 \t$dst, $src0;",
+  Float64Regs, Int32Regs, int_nvvm_ui2d_rz>;
+def INT_NVVM_UI2D_RM : F_MATH_1<"cvt.rm.f64.u32 \t$dst, $src0;",
+  Float64Regs, Int32Regs, int_nvvm_ui2d_rm>;
+def INT_NVVM_UI2D_RP : F_MATH_1<"cvt.rp.f64.u32 \t$dst, $src0;",
+  Float64Regs, Int32Regs, int_nvvm_ui2d_rp>;
+
+def INT_NVVM_F2I_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s32.f32 \t$dst, $src0;",
+  Int32Regs, Float32Regs, int_nvvm_f2i_rn_ftz>;
+def INT_NVVM_F2I_RN : F_MATH_1<"cvt.rni.s32.f32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_f2i_rn>;
+def INT_NVVM_F2I_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s32.f32 \t$dst, $src0;",
+  Int32Regs, Float32Regs, int_nvvm_f2i_rz_ftz>;
+def INT_NVVM_F2I_RZ : F_MATH_1<"cvt.rzi.s32.f32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_f2i_rz>;
+def INT_NVVM_F2I_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s32.f32 \t$dst, $src0;",
+  Int32Regs, Float32Regs, int_nvvm_f2i_rm_ftz>;
+def INT_NVVM_F2I_RM : F_MATH_1<"cvt.rmi.s32.f32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_f2i_rm>;
+def INT_NVVM_F2I_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s32.f32 \t$dst, $src0;",
+  Int32Regs, Float32Regs, int_nvvm_f2i_rp_ftz>;
+def INT_NVVM_F2I_RP : F_MATH_1<"cvt.rpi.s32.f32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_f2i_rp>;
+
+def INT_NVVM_F2UI_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u32.f32 \t$dst, $src0;",
+  Int32Regs, Float32Regs, int_nvvm_f2ui_rn_ftz>;
+def INT_NVVM_F2UI_RN : F_MATH_1<"cvt.rni.u32.f32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_f2ui_rn>;
+def INT_NVVM_F2UI_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u32.f32 \t$dst, $src0;",
+  Int32Regs, Float32Regs, int_nvvm_f2ui_rz_ftz>;
+def INT_NVVM_F2UI_RZ : F_MATH_1<"cvt.rzi.u32.f32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_f2ui_rz>;
+def INT_NVVM_F2UI_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u32.f32 \t$dst, $src0;",
+  Int32Regs, Float32Regs, int_nvvm_f2ui_rm_ftz>;
+def INT_NVVM_F2UI_RM : F_MATH_1<"cvt.rmi.u32.f32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_f2ui_rm>;
+def INT_NVVM_F2UI_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u32.f32 \t$dst, $src0;",
+  Int32Regs, Float32Regs, int_nvvm_f2ui_rp_ftz>;
+def INT_NVVM_F2UI_RP : F_MATH_1<"cvt.rpi.u32.f32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_f2ui_rp>;
+
+def INT_NVVM_I2F_RN : F_MATH_1<"cvt.rn.f32.s32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_i2f_rn>;
+def INT_NVVM_I2F_RZ : F_MATH_1<"cvt.rz.f32.s32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_i2f_rz>;
+def INT_NVVM_I2F_RM : F_MATH_1<"cvt.rm.f32.s32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_i2f_rm>;
+def INT_NVVM_I2F_RP : F_MATH_1<"cvt.rp.f32.s32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_i2f_rp>;
+
+def INT_NVVM_UI2F_RN : F_MATH_1<"cvt.rn.f32.u32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_ui2f_rn>;
+def INT_NVVM_UI2F_RZ : F_MATH_1<"cvt.rz.f32.u32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_ui2f_rz>;
+def INT_NVVM_UI2F_RM : F_MATH_1<"cvt.rm.f32.u32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_ui2f_rm>;
+def INT_NVVM_UI2F_RP : F_MATH_1<"cvt.rp.f32.u32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_ui2f_rp>;
+
+def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
+  Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
+
+def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t",
+                       !strconcat(".reg .b32 %temp; \n\t",
+             !strconcat("mov.b64 \t{$dst, %temp}, $src0;\n\t",
+               "}}"))),
+             Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
+def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t",
+                       !strconcat(".reg .b32 %temp; \n\t",
+                         !strconcat("mov.b64 \t{%temp, $dst}, $src0;\n\t",
+                           "}}"))),
+             Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
+
+def INT_NVVM_F2LL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s64.f32 \t$dst, $src0;",
+  Int64Regs, Float32Regs, int_nvvm_f2ll_rn_ftz>;
+def INT_NVVM_F2LL_RN : F_MATH_1<"cvt.rni.s64.f32 \t$dst, $src0;", Int64Regs,
+  Float32Regs, int_nvvm_f2ll_rn>;
+def INT_NVVM_F2LL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s64.f32 \t$dst, $src0;",
+  Int64Regs, Float32Regs, int_nvvm_f2ll_rz_ftz>;
+def INT_NVVM_F2LL_RZ : F_MATH_1<"cvt.rzi.s64.f32 \t$dst, $src0;", Int64Regs,
+  Float32Regs, int_nvvm_f2ll_rz>;
+def INT_NVVM_F2LL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s64.f32 \t$dst, $src0;",
+  Int64Regs, Float32Regs, int_nvvm_f2ll_rm_ftz>;
+def INT_NVVM_F2LL_RM : F_MATH_1<"cvt.rmi.s64.f32 \t$dst, $src0;", Int64Regs,
+  Float32Regs, int_nvvm_f2ll_rm>;
+def INT_NVVM_F2LL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s64.f32 \t$dst, $src0;",
+  Int64Regs, Float32Regs, int_nvvm_f2ll_rp_ftz>;
+def INT_NVVM_F2LL_RP : F_MATH_1<"cvt.rpi.s64.f32 \t$dst, $src0;", Int64Regs,
+  Float32Regs, int_nvvm_f2ll_rp>;
+
+def INT_NVVM_F2ULL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u64.f32 \t$dst, $src0;",
+  Int64Regs, Float32Regs, int_nvvm_f2ull_rn_ftz>;
+def INT_NVVM_F2ULL_RN : F_MATH_1<"cvt.rni.u64.f32 \t$dst, $src0;", Int64Regs,
+  Float32Regs, int_nvvm_f2ull_rn>;
+def INT_NVVM_F2ULL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u64.f32 \t$dst, $src0;",
+  Int64Regs, Float32Regs, int_nvvm_f2ull_rz_ftz>;
+def INT_NVVM_F2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f32 \t$dst, $src0;", Int64Regs,
+  Float32Regs, int_nvvm_f2ull_rz>;
+def INT_NVVM_F2ULL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u64.f32 \t$dst, $src0;",
+  Int64Regs, Float32Regs, int_nvvm_f2ull_rm_ftz>;
+def INT_NVVM_F2ULL_RM : F_MATH_1<"cvt.rmi.u64.f32 \t$dst, $src0;", Int64Regs,
+  Float32Regs, int_nvvm_f2ull_rm>;
+def INT_NVVM_F2ULL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u64.f32 \t$dst, $src0;",
+  Int64Regs, Float32Regs, int_nvvm_f2ull_rp_ftz>;
+def INT_NVVM_F2ULL_RP : F_MATH_1<"cvt.rpi.u64.f32 \t$dst, $src0;", Int64Regs,
+  Float32Regs, int_nvvm_f2ull_rp>;
+
+def INT_NVVM_D2LL_RN : F_MATH_1<"cvt.rni.s64.f64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_d2ll_rn>;
+def INT_NVVM_D2LL_RZ : F_MATH_1<"cvt.rzi.s64.f64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_d2ll_rz>;
+def INT_NVVM_D2LL_RM : F_MATH_1<"cvt.rmi.s64.f64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_d2ll_rm>;
+def INT_NVVM_D2LL_RP : F_MATH_1<"cvt.rpi.s64.f64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_d2ll_rp>;
+
+def INT_NVVM_D2ULL_RN : F_MATH_1<"cvt.rni.u64.f64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_d2ull_rn>;
+def INT_NVVM_D2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_d2ull_rz>;
+def INT_NVVM_D2ULL_RM : F_MATH_1<"cvt.rmi.u64.f64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_d2ull_rm>;
+def INT_NVVM_D2ULL_RP : F_MATH_1<"cvt.rpi.u64.f64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_d2ull_rp>;
+
+def INT_NVVM_LL2F_RN : F_MATH_1<"cvt.rn.f32.s64 \t$dst, $src0;", Float32Regs,
+  Int64Regs, int_nvvm_ll2f_rn>;
+def INT_NVVM_LL2F_RZ : F_MATH_1<"cvt.rz.f32.s64 \t$dst, $src0;", Float32Regs,
+  Int64Regs, int_nvvm_ll2f_rz>;
+def INT_NVVM_LL2F_RM : F_MATH_1<"cvt.rm.f32.s64 \t$dst, $src0;", Float32Regs,
+  Int64Regs, int_nvvm_ll2f_rm>;
+def INT_NVVM_LL2F_RP : F_MATH_1<"cvt.rp.f32.s64 \t$dst, $src0;", Float32Regs,
+  Int64Regs, int_nvvm_ll2f_rp>;
+def INT_NVVM_ULL2F_RN : F_MATH_1<"cvt.rn.f32.u64 \t$dst, $src0;", Float32Regs,
+  Int64Regs, int_nvvm_ull2f_rn>;
+def INT_NVVM_ULL2F_RZ : F_MATH_1<"cvt.rz.f32.u64 \t$dst, $src0;", Float32Regs,
+  Int64Regs, int_nvvm_ull2f_rz>;
+def INT_NVVM_ULL2F_RM : F_MATH_1<"cvt.rm.f32.u64 \t$dst, $src0;", Float32Regs,
+  Int64Regs, int_nvvm_ull2f_rm>;
+def INT_NVVM_ULL2F_RP : F_MATH_1<"cvt.rp.f32.u64 \t$dst, $src0;", Float32Regs,
+  Int64Regs, int_nvvm_ull2f_rp>;
+
+def INT_NVVM_LL2D_RN : F_MATH_1<"cvt.rn.f64.s64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_ll2d_rn>;
+def INT_NVVM_LL2D_RZ : F_MATH_1<"cvt.rz.f64.s64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_ll2d_rz>;
+def INT_NVVM_LL2D_RM : F_MATH_1<"cvt.rm.f64.s64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_ll2d_rm>;
+def INT_NVVM_LL2D_RP : F_MATH_1<"cvt.rp.f64.s64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_ll2d_rp>;
+def INT_NVVM_ULL2D_RN : F_MATH_1<"cvt.rn.f64.u64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_ull2d_rn>;
+def INT_NVVM_ULL2D_RZ : F_MATH_1<"cvt.rz.f64.u64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_ull2d_rz>;
+def INT_NVVM_ULL2D_RM : F_MATH_1<"cvt.rm.f64.u64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_ull2d_rm>;
+def INT_NVVM_ULL2D_RP : F_MATH_1<"cvt.rp.f64.u64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_ull2d_rp>;
+
+def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t",
+                                   !strconcat(".reg .b16 %temp;\n\t",
+           !strconcat("cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t",
+           !strconcat("mov.b16 \t$dst, %temp;\n",
+             "}}")))),
+                                   Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>;
+def INT_NVVM_F2H_RN : F_MATH_1<!strconcat("{{\n\t",
+                                   !strconcat(".reg .b16 %temp;\n\t",
+           !strconcat("cvt.rn.f16.f32 \t%temp, $src0;\n\t",
+           !strconcat("mov.b16 \t$dst, %temp;\n",
+             "}}")))),
+           Int16Regs, Float32Regs, int_nvvm_f2h_rn>;
+
+def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t",
+                            !strconcat(".reg .b16 %temp;\n\t",
+          !strconcat("mov.b16 \t%temp, $src0;\n\t",
+          !strconcat("cvt.f32.f16 \t$dst, %temp;\n\t",
+            "}}")))),
+          Float32Regs, Int16Regs, int_nvvm_h2f>;
+
+//
+// Bitcast
+//
+
+def INT_NVVM_BITCAST_F2I : F_MATH_1<"mov.b32 \t$dst, $src0;", Int32Regs,
+  Float32Regs, int_nvvm_bitcast_f2i>;
+def INT_NVVM_BITCAST_I2F : F_MATH_1<"mov.b32 \t$dst, $src0;", Float32Regs,
+  Int32Regs, int_nvvm_bitcast_i2f>;
+
+def INT_NVVM_BITCAST_LL2D : F_MATH_1<"mov.b64 \t$dst, $src0;", Float64Regs,
+  Int64Regs, int_nvvm_bitcast_ll2d>;
+def INT_NVVM_BITCAST_D2LL : F_MATH_1<"mov.b64 \t$dst, $src0;", Int64Regs,
+  Float64Regs, int_nvvm_bitcast_d2ll>;
+
+//-----------------------------------
+// Atomic Functions
+//-----------------------------------
+
+class ATOMIC_GLOBAL_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
+}]>;
+class ATOMIC_SHARED_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
+}]>;
+class ATOMIC_GENERIC_CHK <dag ops, dag frag>
+ : PatFrag<ops, frag, [{
+   return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GENERIC);
+}]>;
+
+multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+  string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+  Operand IMMType, SDNode IMM, Predicate Pred> {
+  def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+  Requires<[Pred]>;
+  def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
+  Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+  string OpcStr, PatFrag IntOp, Operand IMMType, SDNode IMM, Predicate Pred> {
+  defm p32 : F_ATOMIC_2_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, IMM, Pred>;
+  defm p64 : F_ATOMIC_2_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, IMM, Pred>;
+}
+
+// has 2 operands, neg the second one
+multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+  string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+  Operand IMMType, Predicate Pred> {
+  def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
+    !strconcat("{{ \n\t",
+         !strconcat(".reg \t.s",
+         !strconcat(TypeStr,
+         !strconcat(" temp; \n\t",
+         !strconcat("neg.s",
+         !strconcat(TypeStr,
+         !strconcat(" \ttemp, $b; \n\t",
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(".u",
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], temp; \n\t",
+           !strconcat("}}", "")))))))))))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+  Requires<[Pred]>;
+}
+multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
+  string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType,
+  Predicate Pred> {
+ defm p32: F_ATOMIC_2_NEG_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+   IntOp, IMMType, Pred> ;
+ defm p64: F_ATOMIC_2_NEG_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+   IntOp, IMMType, Pred> ;
+}
+
+// has 3 operands
+multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
+  string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
+  Operand IMMType, Predicate Pred> {
+  def reg : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, regclass:$b, regclass:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst,
+           (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
+         Requires<[Pred]>;
+  def imm1 : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, IMMType:$b, regclass:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
+  Requires<[Pred]>;
+  def imm2 : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, regclass:$b, IMMType:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
+  Requires<[Pred]>;
+  def imm3 : NVPTXInst<(outs regclass:$dst),
+    (ins ptrclass:$addr, IMMType:$b, IMMType:$c),
+               !strconcat("atom",
+         !strconcat(SpaceStr,
+         !strconcat(OpcStr,
+         !strconcat(TypeStr,
+         !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
+         [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
+  Requires<[Pred]>;
+}
+multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
+  string OpcStr, PatFrag IntOp, Operand IMMType, Predicate Pred> {
+  defm p32 : F_ATOMIC_3_imp<Int32Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, Pred>;
+  defm p64 : F_ATOMIC_3_imp<Int64Regs, regclass, SpaceStr, TypeStr, OpcStr,
+    IntOp, IMMType, Pred>;
+}
+
+// atom_add
+
+def atomic_load_add_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_32 node:$a, node:$b)>;
+def atomic_load_add_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_add_64 node:$a, node:$b)>;
+def atomic_load_add_f32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+def atomic_load_add_f32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_add_f32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".add",
+  atomic_load_add_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".add",
+  atomic_load_add_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".add",
+  atomic_load_add_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".add", atomic_load_add_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".u64", ".add",
+  atomic_load_add_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".u64", ".add",
+  atomic_load_add_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".u64", ".add",
+  atomic_load_add_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".u64",
+  ".add", atomic_load_add_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2<Float32Regs, ".global", ".f32", ".add",
+  atomic_load_add_f32_g, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2<Float32Regs, ".shared", ".f32", ".add",
+  atomic_load_add_f32_s, f32imm, fpimm, hasAtomAddF32>;
+defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2<Float32Regs, "", ".f32", ".add",
+  atomic_load_add_f32_gen, f32imm, fpimm, hasAtomAddF32>;
+
+// atom_sub
+
+def atomic_load_sub_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_32 node:$a, node:$b)>;
+def atomic_load_sub_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+def atomic_load_sub_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_sub_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG<Int32Regs, ".global", "32", ".add",
+  atomic_load_sub_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG<Int64Regs, ".global", "64", ".add",
+  atomic_load_sub_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG<Int32Regs, "", "32", ".add",
+  atomic_load_sub_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG<Int32Regs, ".global", "32",
+  ".add", atomic_load_sub_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG<Int32Regs, ".shared", "32", ".add",
+  atomic_load_sub_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG<Int64Regs, ".shared", "64", ".add",
+  atomic_load_sub_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG<Int64Regs, "", "64", ".add",
+  atomic_load_sub_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG<Int64Regs, ".global", "64",
+  ".add", atomic_load_sub_64_gen, i64imm, useAtomRedG64forGen64>;
+
+// atom_swap
+
+def atomic_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_swap_32 node:$a, node:$b)>;
+def atomic_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+def atomic_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_swap_64 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".exch",
+  atomic_swap_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".exch",
+  atomic_swap_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".exch",
+  atomic_swap_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".exch", atomic_swap_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2<Int64Regs, ".global", ".b64", ".exch",
+  atomic_swap_64_g, i64imm, imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2<Int64Regs, ".shared", ".b64", ".exch",
+  atomic_swap_64_s, i64imm, imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2<Int64Regs, "", ".b64", ".exch",
+  atomic_swap_64_gen, i64imm, imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2<Int64Regs, ".global", ".b64",
+  ".exch", atomic_swap_64_gen, i64imm, imm, useAtomRedG64forGen64>;
+
+// atom_max
+
+def atomic_load_max_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b)
+  , (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_max_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_max_32 node:$a, node:$b)>;
+def atomic_load_umax_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+def atomic_load_umax_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umax_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+  ".max", atomic_load_max_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+  ".max", atomic_load_max_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".max",
+  atomic_load_max_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".s32", ".max", atomic_load_max_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".max", atomic_load_umax_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+  ".max", atomic_load_umax_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".max",
+  atomic_load_umax_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".u32", ".max", atomic_load_umax_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_min
+
+def atomic_load_min_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_min_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_min_32 node:$a, node:$b)>;
+def atomic_load_umin_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+def atomic_load_umin_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_umin_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".s32",
+  ".min", atomic_load_min_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".s32",
+  ".min", atomic_load_min_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".s32", ".min",
+  atomic_load_min_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".s32", ".min", atomic_load_min_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".min", atomic_load_umin_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32",
+  ".min", atomic_load_umin_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".min",
+  atomic_load_umin_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global",
+  ".u32", ".min", atomic_load_umin_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_inc  atom_dec
+
+def atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>;
+def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".inc",
+  atomic_load_inc_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".inc",
+  atomic_load_inc_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".inc",
+  atomic_load_inc_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".inc", atomic_load_inc_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".u32", ".dec",
+  atomic_load_dec_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".u32", ".dec",
+  atomic_load_dec_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".u32", ".dec",
+  atomic_load_dec_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".u32",
+  ".dec", atomic_load_dec_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_and
+
+def atomic_load_and_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+def atomic_load_and_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_and_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".and",
+  atomic_load_and_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".and",
+  atomic_load_and_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".and",
+  atomic_load_and_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".and", atomic_load_and_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_or
+
+def atomic_load_or_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+def atomic_load_or_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_or_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".or",
+  atomic_load_or_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".or",
+  atomic_load_or_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".or", atomic_load_or_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".or",
+  atomic_load_or_32_s, i32imm, imm, hasAtomRedS32>;
+
+// atom_xor
+
+def atomic_load_xor_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+def atomic_load_xor_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b),
+  (atomic_load_xor_32 node:$a, node:$b)>;
+
+defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2<Int32Regs, ".global", ".b32", ".xor",
+  atomic_load_xor_32_g, i32imm, imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2<Int32Regs, ".shared", ".b32", ".xor",
+  atomic_load_xor_32_s, i32imm, imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2<Int32Regs, "", ".b32", ".xor",
+  atomic_load_xor_32_gen, i32imm, imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2<Int32Regs, ".global", ".b32",
+  ".xor", atomic_load_xor_32_gen, i32imm, imm, useAtomRedG32forGen32>;
+
+// atom_cas
+
+def atomic_cmp_swap_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_32 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+def atomic_cmp_swap_64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
+  (atomic_cmp_swap_64 node:$a, node:$b, node:$c)>;
+
+defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<Int32Regs, ".global", ".b32", ".cas",
+  atomic_cmp_swap_32_g, i32imm, hasAtomRedG32>;
+defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<Int32Regs, ".shared", ".b32", ".cas",
+  atomic_cmp_swap_32_s, i32imm, hasAtomRedS32>;
+defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<Int32Regs, "", ".b32", ".cas",
+  atomic_cmp_swap_32_gen, i32imm, hasAtomRedGen32>;
+defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<Int32Regs, ".global", ".b32",
+  ".cas", atomic_cmp_swap_32_gen, i32imm, useAtomRedG32forGen32>;
+defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<Int64Regs, ".global", ".b64", ".cas",
+  atomic_cmp_swap_64_g, i64imm, hasAtomRedG64>;
+defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<Int64Regs, ".shared", ".b64", ".cas",
+  atomic_cmp_swap_64_s, i64imm, hasAtomRedS64>;
+defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<Int64Regs, "", ".b64", ".cas",
+  atomic_cmp_swap_64_gen, i64imm, hasAtomRedGen64>;
+defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<Int64Regs, ".global", ".b64",
+  ".cas", atomic_cmp_swap_64_gen, i64imm, useAtomRedG64forGen64>;
+
+
+//-----------------------------------
+// Read Special Registers
+//-----------------------------------
+class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> :
+      NVPTXInst<(outs regclassOut:$dst), (ins),
+               OpStr,
+         [(set regclassOut:$dst, (IntOp))]>;
+
+def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs,
+  int_nvvm_read_ptx_sreg_tid_x>;
+def INT_PTX_SREG_TID_Y : F_SREG<"mov.u32 \t$dst, %tid.y;", Int32Regs,
+  int_nvvm_read_ptx_sreg_tid_y>;
+def INT_PTX_SREG_TID_Z : F_SREG<"mov.u32 \t$dst, %tid.z;", Int32Regs,
+  int_nvvm_read_ptx_sreg_tid_z>;
+
+def INT_PTX_SREG_NTID_X : F_SREG<"mov.u32 \t$dst, %ntid.x;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ntid_x>;
+def INT_PTX_SREG_NTID_Y : F_SREG<"mov.u32 \t$dst, %ntid.y;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ntid_y>;
+def INT_PTX_SREG_NTID_Z : F_SREG<"mov.u32 \t$dst, %ntid.z;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ntid_z>;
+
+def INT_PTX_SREG_CTAID_X : F_SREG<"mov.u32 \t$dst, %ctaid.x;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ctaid_x>;
+def INT_PTX_SREG_CTAID_Y : F_SREG<"mov.u32 \t$dst, %ctaid.y;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ctaid_y>;
+def INT_PTX_SREG_CTAID_Z : F_SREG<"mov.u32 \t$dst, %ctaid.z;", Int32Regs,
+  int_nvvm_read_ptx_sreg_ctaid_z>;
+
+def INT_PTX_SREG_NCTAID_X : F_SREG<"mov.u32 \t$dst, %nctaid.x;", Int32Regs,
+  int_nvvm_read_ptx_sreg_nctaid_x>;
+def INT_PTX_SREG_NCTAID_Y : F_SREG<"mov.u32 \t$dst, %nctaid.y;", Int32Regs,
+  int_nvvm_read_ptx_sreg_nctaid_y>;
+def INT_PTX_SREG_NCTAID_Z : F_SREG<"mov.u32 \t$dst, %nctaid.z;", Int32Regs,
+  int_nvvm_read_ptx_sreg_nctaid_z>;
+
+def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs,
+  int_nvvm_read_ptx_sreg_warpsize>;
+
+
+//-----------------------------------
+// Support for ldu on sm_20 or later
+//-----------------------------------
+
+// Scalar
+// @TODO: Revisit this, Changed imemAny to imem
+multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> {
+  def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+               !strconcat("ldu.global.", TyStr),
+         [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>;
+  def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+               !strconcat("ldu.global.", TyStr),
+         [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>;
+ def avar:  NVPTXInst<(outs regclass:$result), (ins imem:$src),
+               !strconcat("ldu.global.", TyStr),
+                [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
+                Requires<[hasLDU]>;
+ def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+               !strconcat("ldu.global.", TyStr),
+         [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>;
+ def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+               !strconcat("ldu.global.", TyStr),
+         [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>;
+}
+
+defm INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8 \t$result, [$src];",  Int8Regs,
+int_nvvm_ldu_global_i>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs,
+int_nvvm_ldu_global_i>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs,
+int_nvvm_ldu_global_i>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs,
+int_nvvm_ldu_global_i>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs,
+int_nvvm_ldu_global_f>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs,
+int_nvvm_ldu_global_f>;
+defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs,
+int_nvvm_ldu_global_p>;
+defm INT_PTX_LDU_GLOBAL_p64 : LDU_G<"u64 \t$result, [$src];", Int64Regs,
+int_nvvm_ldu_global_p>;
+
+// vector
+
+// Elementized vector ldu
+multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+ def _32:     NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+   (ins Int32Regs:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _64:     NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+   (ins Int64Regs:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+}
+
+multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
+ def _32:    NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+     regclass:$dst4), (ins Int32Regs:$src),
+               !strconcat("ldu.global.", TyStr), []>;
+ def _64:    NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+     regclass:$dst4), (ins Int64Regs:$src),
+               !strconcat("ldu.global.", TyStr), []>;
+}
+
+defm INT_PTX_LDU_G_v2i8_ELE
+  : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int8Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE
+  : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE
+  : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE
+  : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE
+  : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE
+  : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDU_G_v4i8_ELE
+  : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int8Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE
+  : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE
+  : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE
+  : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Float32Regs>;
+
+
+//-----------------------------------
+// Support for ldg on sm_35 or later 
+//-----------------------------------
+
+def ldg_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldg_global_i node:$ptr), [{
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  return M->getMemoryVT() == MVT::i8;
+}]>;
+
+multiclass LDG_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> {
+  def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>;
+  def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>;
+ def avar:  NVPTXInst<(outs regclass:$result), (ins imem:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
+         Requires<[hasLDG]>;
+ def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>;
+ def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>;
+}
+
+multiclass LDG_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> {
+  def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDG]>;
+  def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDG]>;
+ def avar:  NVPTXInst<(outs regclass:$result), (ins imem:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
+        Requires<[hasLDG]>;
+ def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDG]>;
+ def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+               !strconcat("ld.global.nc.", TyStr),
+         [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDG]>;
+}
+
+defm INT_PTX_LDG_GLOBAL_i8
+  : LDG_G_NOINTRIN<"u8 \t$result, [$src];",  Int16Regs, ldg_i8>;
+defm INT_PTX_LDG_GLOBAL_i16
+  : LDG_G<"u16 \t$result, [$src];", Int16Regs,   int_nvvm_ldg_global_i>;
+defm INT_PTX_LDG_GLOBAL_i32
+  : LDG_G<"u32 \t$result, [$src];", Int32Regs,   int_nvvm_ldg_global_i>;
+defm INT_PTX_LDG_GLOBAL_i64
+  : LDG_G<"u64 \t$result, [$src];", Int64Regs,   int_nvvm_ldg_global_i>;
+defm INT_PTX_LDG_GLOBAL_f32
+  : LDG_G<"f32 \t$result, [$src];", Float32Regs, int_nvvm_ldg_global_f>;
+defm INT_PTX_LDG_GLOBAL_f64
+  : LDG_G<"f64 \t$result, [$src];", Float64Regs, int_nvvm_ldg_global_f>;
+defm INT_PTX_LDG_GLOBAL_p32
+  : LDG_G<"u32 \t$result, [$src];", Int32Regs,   int_nvvm_ldg_global_p>;
+defm INT_PTX_LDG_GLOBAL_p64
+  : LDG_G<"u64 \t$result, [$src];", Int64Regs,   int_nvvm_ldg_global_p>;
+
+// vector
+
+// Elementized vector ldg 
+multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
+ def _32:     NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int32Regs:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _64:     NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int64Regs:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+}
+
+multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { 
+ def _32:    NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                        regclass:$dst3, regclass:$dst4), (ins Int32Regs:$src),
+               !strconcat("ld.global.nc.", TyStr), []>;
+ def _64:    NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
+                        regclass:$dst3, regclass:$dst4), (ins Int64Regs:$src),
+               !strconcat("ld.global.nc.", TyStr), []>;
+}
+
+// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
+defm INT_PTX_LDG_G_v2i8_ELE
+  : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
+defm INT_PTX_LDG_G_v2i16_ELE
+  : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v2i32_ELE
+  : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v2f32_ELE
+  : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
+defm INT_PTX_LDG_G_v2i64_ELE
+  : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
+defm INT_PTX_LDG_G_v2f64_ELE
+  : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
+defm INT_PTX_LDG_G_v4i8_ELE
+  : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i16_ELE
+  : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
+defm INT_PTX_LDG_G_v4i32_ELE
+  : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v4f32_ELE
+  : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
+
+
+multiclass NG_TO_G<string Str, Intrinsic Intrin> {
+   def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+   def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+
+// @TODO: Are these actually needed?  I believe global addresses will be copied
+// to register values anyway.
+   /*def __addr_yes : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+      [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+      Requires<[hasGenericLdSt]>;
+   def __addr_yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
+          !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+      [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>,
+      Requires<[hasGenericLdSt]>;*/
+
+   def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          "mov.u32 \t$result, $src;",
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+   def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          "mov.u64 \t$result, $src;",
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+
+// @TODO: Are these actually needed?  I believe global addresses will be copied
+// to register values anyway.
+   /*def _addr_no : NVPTXInst<(outs Int32Regs:$result), (ins imem:$src),
+          "mov.u32 \t$result, $src;",
+      [(set Int32Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;
+   def _addr_no_64 : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+          "mov.u64 \t$result, $src;",
+      [(set Int64Regs:$result, (Intrin (Wrapper tglobaladdr:$src)))]>;*/
+}
+
+multiclass G_TO_NG<string Str, Intrinsic Intrin> {
+   def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          !strconcat("cvta.to.", !strconcat(Str, ".u32 \t$result, $src;")),
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+   def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          !strconcat("cvta.to.", !strconcat(Str, ".u64 \t$result, $src;")),
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
+   Requires<[hasGenericLdSt]>;
+   def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+          "mov.u32 \t$result, $src;",
+      [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>;
+   def _no_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+          "mov.u64 \t$result, $src;",
+      [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>;
+}
+
+defm cvta_local  : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
+defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>;
+defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>;
+
+defm cvta_to_local   : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>;
+defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>;
+defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>;
+
+def cvta_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+               "mov.u32 \t$result, $src;",
+     [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen Int32Regs:$src))]>;
+def cvta_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+               "mov.u64 \t$result, $src;",
+     [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen Int64Regs:$src))]>;
+
+
+
+// @TODO: Revisit this.  There is a type
+// contradiction between iPTRAny and iPTR for the def.
+/*def cvta_const_addr : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
+               "mov.u32 \t$result, $src;",
+     [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen
+     (Wrapper tglobaladdr:$src)))]>;
+def cvta_const_addr_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
+               "mov.u64 \t$result, $src;",
+     [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen
+     (Wrapper tglobaladdr:$src)))]>;*/
+
+
+def cvta_to_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
+            "mov.u32 \t$result, $src;",
+     [(set Int32Regs:$result, (int_nvvm_ptr_gen_to_constant Int32Regs:$src))]>;
+def cvta_to_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
+            "mov.u64 \t$result, $src;",
+     [(set Int64Regs:$result, (int_nvvm_ptr_gen_to_constant Int64Regs:$src))]>;
+
+
+// nvvm.ptr.gen.to.param
+def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result),
+  (ins Int32Regs:$src),
+                        "mov.u32 \t$result, $src;",
+                              [(set Int32Regs:$result,
+                                (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>;
+def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result),
+  (ins Int64Regs:$src),
+                        "mov.u64 \t$result, $src;",
+                              [(set Int64Regs:$result,
+                                (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>;
+
+
+// nvvm.move intrinsicc
+def nvvm_move_i8 : NVPTXInst<(outs Int8Regs:$r), (ins Int8Regs:$s),
+                             "mov.b16 \t$r, $s;",
+                             [(set Int8Regs:$r,
+                               (int_nvvm_move_i8 Int8Regs:$s))]>;
+def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
+                             "mov.b16 \t$r, $s;",
+                             [(set Int16Regs:$r,
+                               (int_nvvm_move_i16 Int16Regs:$s))]>;
+def nvvm_move_i32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+                             "mov.b32 \t$r, $s;",
+                             [(set Int32Regs:$r,
+                               (int_nvvm_move_i32 Int32Regs:$s))]>;
+def nvvm_move_i64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+                             "mov.b64 \t$r, $s;",
+                             [(set Int64Regs:$r,
+                               (int_nvvm_move_i64 Int64Regs:$s))]>;
+def nvvm_move_float : NVPTXInst<(outs Float32Regs:$r), (ins Float32Regs:$s),
+                             "mov.f32 \t$r, $s;",
+                             [(set Float32Regs:$r,
+                               (int_nvvm_move_float Float32Regs:$s))]>;
+def nvvm_move_double : NVPTXInst<(outs Float64Regs:$r), (ins Float64Regs:$s),
+                             "mov.f64 \t$r, $s;",
+                             [(set Float64Regs:$r,
+                               (int_nvvm_move_double Float64Regs:$s))]>;
+def nvvm_move_ptr32 : NVPTXInst<(outs Int32Regs:$r), (ins Int32Regs:$s),
+                             "mov.u32 \t$r, $s;",
+                             [(set Int32Regs:$r,
+                               (int_nvvm_move_ptr Int32Regs:$s))]>;
+def nvvm_move_ptr64 : NVPTXInst<(outs Int64Regs:$r), (ins Int64Regs:$s),
+                             "mov.u64 \t$r, $s;",
+                             [(set Int64Regs:$r,
+                               (int_nvvm_move_ptr Int64Regs:$s))]>;
+
+// @TODO: Are these actually needed, or will we always just see symbols
+// copied to registers first?
+/*def nvvm_move_sym32 : NVPTXInst<(outs Int32Regs:$r), (ins imem:$s),
+                             "mov.u32 \t$r, $s;",
+                             [(set Int32Regs:$r,
+                             (int_nvvm_move_ptr texternalsym:$s))]>;
+def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s),
+                             "mov.u64 \t$r, $s;",
+                             [(set Int64Regs:$r,
+                             (int_nvvm_move_ptr texternalsym:$s))]>;*/
+
+
+// MoveParam        %r1, param
+// ptr_local_to_gen %r2, %r1
+// ptr_gen_to_local %r3, %r2
+// ->
+// mov %r1, param
+
+// @TODO: Revisit this.  There is a type
+// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym
+// instructions are not currently defined. However, we can use the ptr
+// variants and the asm printer will do the right thing.
+def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+                (MoveParam texternalsym:$src)))),
+               (nvvm_move_ptr64  texternalsym:$src)>;
+def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
+                (MoveParam texternalsym:$src)))),
+               (nvvm_move_ptr32  texternalsym:$src)>;
+
+
+//-----------------------------------
+// Compiler Error Warn
+// - Just ignore them in codegen
+//-----------------------------------
+
+def INT_NVVM_COMPILER_WARN_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+                "// llvm.nvvm.compiler.warn()",
+                [(int_nvvm_compiler_warn Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_WARN_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+                "// llvm.nvvm.compiler.warn()",
+                [(int_nvvm_compiler_warn Int64Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_32 : NVPTXInst<(outs), (ins Int32Regs:$a),
+                "// llvm.nvvm.compiler.error()",
+                [(int_nvvm_compiler_error Int32Regs:$a)]>;
+def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
+                "// llvm.nvvm.compiler.error()",
+                [(int_nvvm_compiler_error Int64Regs:$a)]>;
+
+
+
+//===-- Old PTX Back-end Intrinsics ---------------------------------------===//
+
+// These intrinsics are handled to retain compatibility with the old backend.
+
+// PTX Special Purpose Register Accessor Intrinsics
+
+class PTX_READ_SPECIAL_REGISTER_R64<string regname, Intrinsic intop>
+  : NVPTXInst<(outs Int64Regs:$d), (ins),
+              !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"),
+              [(set Int64Regs:$d, (intop))]>;
+
+class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
+  : NVPTXInst<(outs Int32Regs:$d), (ins),
+              !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"),
+              [(set Int32Regs:$d, (intop))]>;
+
+// TODO Add read vector-version of special registers
+
+def PTX_READ_TID_X   : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
+                                                     int_ptx_read_tid_x>;
+def PTX_READ_TID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
+                                                     int_ptx_read_tid_y>;
+def PTX_READ_TID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
+                                                     int_ptx_read_tid_z>;
+def PTX_READ_TID_W   : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
+                                                     int_ptx_read_tid_w>;
+
+def PTX_READ_NTID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
+                                                      int_ptx_read_ntid_x>;
+def PTX_READ_NTID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
+                                                      int_ptx_read_ntid_y>;
+def PTX_READ_NTID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
+                                                      int_ptx_read_ntid_z>;
+def PTX_READ_NTID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
+                                                      int_ptx_read_ntid_w>;
+
+def PTX_READ_LANEID  : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
+                                                     int_ptx_read_laneid>;
+def PTX_READ_WARPID  : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
+                                                     int_ptx_read_warpid>;
+def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
+                                                     int_ptx_read_nwarpid>;
+
+def PTX_READ_CTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
+                                                       int_ptx_read_ctaid_x>;
+def PTX_READ_CTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
+                                                       int_ptx_read_ctaid_y>;
+def PTX_READ_CTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
+                                                       int_ptx_read_ctaid_z>;
+def PTX_READ_CTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
+                                                       int_ptx_read_ctaid_w>;
+
+def PTX_READ_NCTAID_X   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
+                                                        int_ptx_read_nctaid_x>;
+def PTX_READ_NCTAID_Y   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
+                                                        int_ptx_read_nctaid_y>;
+def PTX_READ_NCTAID_Z   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
+                                                        int_ptx_read_nctaid_z>;
+def PTX_READ_NCTAID_W   : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
+                                                        int_ptx_read_nctaid_w>;
+
+def PTX_READ_SMID  : PTX_READ_SPECIAL_REGISTER_R32<"smid",
+                                                   int_ptx_read_smid>;
+def PTX_READ_NSMID  : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
+                                                    int_ptx_read_nsmid>;
+def PTX_READ_GRIDID  : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
+                                                     int_ptx_read_gridid>;
+
+def PTX_READ_LANEMASK_EQ
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
+def PTX_READ_LANEMASK_LE
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_le", int_ptx_read_lanemask_le>;
+def PTX_READ_LANEMASK_LT
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_lt", int_ptx_read_lanemask_lt>;
+def PTX_READ_LANEMASK_GE
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_ge", int_ptx_read_lanemask_ge>;
+def PTX_READ_LANEMASK_GT
+  : PTX_READ_SPECIAL_REGISTER_R32<"lanemask_gt", int_ptx_read_lanemask_gt>;
+
+def PTX_READ_CLOCK
+  : PTX_READ_SPECIAL_REGISTER_R32<"clock", int_ptx_read_clock>;
+def PTX_READ_CLOCK64
+  : PTX_READ_SPECIAL_REGISTER_R64<"clock64", int_ptx_read_clock64>;
+
+def PTX_READ_PM0 : PTX_READ_SPECIAL_REGISTER_R32<"pm0", int_ptx_read_pm0>;
+def PTX_READ_PM1 : PTX_READ_SPECIAL_REGISTER_R32<"pm1", int_ptx_read_pm1>;
+def PTX_READ_PM2 : PTX_READ_SPECIAL_REGISTER_R32<"pm2", int_ptx_read_pm2>;
+def PTX_READ_PM3 : PTX_READ_SPECIAL_REGISTER_R32<"pm3", int_ptx_read_pm3>;
+
+// PTX Parallel Synchronization and Communication Intrinsics
+
+def PTX_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+                             [(int_ptx_bar_sync imm:$i)]>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
new file mode 100644
index 000000000000..7c257b4c6a89
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -0,0 +1,205 @@
+//===- NVPTXLowerAggrCopies.cpp - ------------------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when
+// the size is large or is not a compile-time constant.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXLowerAggrCopies.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/InstIterator.h"
+
+using namespace llvm;
+
+namespace llvm { FunctionPass *createLowerAggrCopies(); }
+
+char NVPTXLowerAggrCopies::ID = 0;
+
+// Lower MemTransferInst or load-store pair to loop
+static void convertTransferToLoop(
+    Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len,
+    //unsigned numLoads,
+    bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) {
+  Type *indType = len->getType();
+
+  BasicBlock *origBB = splitAt->getParent();
+  BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+  BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+  origBB->getTerminator()->setSuccessor(0, loopBB);
+  IRBuilder<> builder(origBB, origBB->getTerminator());
+
+  // srcAddr and dstAddr are expected to be pointer types,
+  // so no check is made here.
+  unsigned srcAS = dyn_cast<PointerType>(srcAddr->getType())->getAddressSpace();
+  unsigned dstAS = dyn_cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+  // Cast pointers to (char *)
+  srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS));
+  dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS));
+
+  IRBuilder<> loop(loopBB);
+  // The loop index (ind) is a phi node.
+  PHINode *ind = loop.CreatePHI(indType, 0);
+  // Incoming value for ind is 0
+  ind->addIncoming(ConstantInt::get(indType, 0), origBB);
+
+  // load from srcAddr+ind
+  Value *val = loop.CreateLoad(loop.CreateGEP(srcAddr, ind), srcVolatile);
+  // store at dstAddr+ind
+  loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), dstVolatile);
+
+  // The value for ind coming from backedge is (ind + 1)
+  Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1));
+  ind->addIncoming(newind, loopBB);
+
+  loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+// Lower MemSetInst to loop
+static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr,
+                                Value *len, Value *val, LLVMContext &Context,
+                                Function &F) {
+  BasicBlock *origBB = splitAt->getParent();
+  BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
+  BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
+
+  origBB->getTerminator()->setSuccessor(0, loopBB);
+  IRBuilder<> builder(origBB, origBB->getTerminator());
+
+  unsigned dstAS = dyn_cast<PointerType>(dstAddr->getType())->getAddressSpace();
+
+  // Cast pointer to the type of value getting stored
+  dstAddr =
+      builder.CreateBitCast(dstAddr, PointerType::get(val->getType(), dstAS));
+
+  IRBuilder<> loop(loopBB);
+  PHINode *ind = loop.CreatePHI(len->getType(), 0);
+  ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB);
+
+  loop.CreateStore(val, loop.CreateGEP(dstAddr, ind), false);
+
+  Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1));
+  ind->addIncoming(newind, loopBB);
+
+  loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
+}
+
+bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
+  SmallVector<LoadInst *, 4> aggrLoads;
+  SmallVector<MemTransferInst *, 4> aggrMemcpys;
+  SmallVector<MemSetInst *, 4> aggrMemsets;
+
+  DataLayout *TD = &getAnalysis<DataLayout>();
+  LLVMContext &Context = F.getParent()->getContext();
+
+  //
+  // Collect all the aggrLoads, aggrMemcpys and addrMemsets.
+  //
+  //const BasicBlock *firstBB = &F.front();  // first BB in F
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    //BasicBlock *bb = BI;
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
+         ++II) {
+      if (LoadInst *load = dyn_cast<LoadInst>(II)) {
+
+        if (load->hasOneUse() == false)
+          continue;
+
+        if (TD->getTypeStoreSize(load->getType()) < MaxAggrCopySize)
+          continue;
+
+        User *use = *(load->use_begin());
+        if (StoreInst *store = dyn_cast<StoreInst>(use)) {
+          if (store->getOperand(0) != load) //getValueOperand
+            continue;
+          aggrLoads.push_back(load);
+        }
+      } else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) {
+        Value *len = intr->getLength();
+        // If the number of elements being copied is greater
+        // than MaxAggrCopySize, lower it to a loop
+        if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+          if (len_int->getZExtValue() >= MaxAggrCopySize) {
+            aggrMemcpys.push_back(intr);
+          }
+        } else {
+          // turn variable length memcpy/memmov into loop
+          aggrMemcpys.push_back(intr);
+        }
+      } else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) {
+        Value *len = memsetintr->getLength();
+        if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
+          if (len_int->getZExtValue() >= MaxAggrCopySize) {
+            aggrMemsets.push_back(memsetintr);
+          }
+        } else {
+          // turn variable length memset into loop
+          aggrMemsets.push_back(memsetintr);
+        }
+      }
+    }
+  }
+  if ((aggrLoads.size() == 0) && (aggrMemcpys.size() == 0) &&
+      (aggrMemsets.size() == 0))
+    return false;
+
+  //
+  // Do the transformation of an aggr load/copy/set to a loop
+  //
+  for (unsigned i = 0, e = aggrLoads.size(); i != e; ++i) {
+    LoadInst *load = aggrLoads[i];
+    StoreInst *store = dyn_cast<StoreInst>(*load->use_begin());
+    Value *srcAddr = load->getOperand(0);
+    Value *dstAddr = store->getOperand(1);
+    unsigned numLoads = TD->getTypeStoreSize(load->getType());
+    Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);
+
+    convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
+                          store->isVolatile(), Context, F);
+
+    store->eraseFromParent();
+    load->eraseFromParent();
+  }
+
+  for (unsigned i = 0, e = aggrMemcpys.size(); i != e; ++i) {
+    MemTransferInst *cpy = aggrMemcpys[i];
+    Value *len = cpy->getLength();
+    // llvm 2.7 version of memcpy does not have volatile
+    // operand yet. So always making it non-volatile
+    // optimistically, so that we don't see unnecessary
+    // st.volatile in ptx
+    convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false,
+                          false, Context, F);
+    cpy->eraseFromParent();
+  }
+
+  for (unsigned i = 0, e = aggrMemsets.size(); i != e; ++i) {
+    MemSetInst *memsetinst = aggrMemsets[i];
+    Value *len = memsetinst->getLength();
+    Value *val = memsetinst->getValue();
+    convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,
+                        F);
+    memsetinst->eraseFromParent();
+  }
+
+  return true;
+}
+
+FunctionPass *llvm::createLowerAggrCopies() {
+  return new NVPTXLowerAggrCopies();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
new file mode 100644
index 000000000000..286e753fa92b
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -0,0 +1,47 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.h ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVIDIA specific lowering of
+// aggregate copies
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_LOWER_AGGR_COPIES_H
+#define NVPTX_LOWER_AGGR_COPIES_H
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+// actual analysis class, which is a functionpass
+struct NVPTXLowerAggrCopies : public FunctionPass {
+  static char ID;
+
+  NVPTXLowerAggrCopies() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<DataLayout>();
+    AU.addPreserved<MachineFunctionAnalysis>();
+  }
+
+  virtual bool runOnFunction(Function &F);
+
+  static const unsigned MaxAggrCopySize = 128;
+
+  virtual const char *getPassName() const {
+    return "Lower aggregate copies/intrinsics into loops";
+  }
+};
+
+extern FunctionPass *createLowerAggrCopies();
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXNumRegisters.h b/contrib/llvm/lib/Target/NVPTX/NVPTXNumRegisters.h
new file mode 100644
index 000000000000..a95c16b1e67e
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXNumRegisters.h
@@ -0,0 +1,16 @@
+
+//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_NUM_REGISTERS_H
+#define NVPTX_NUM_REGISTERS_H
+
+namespace llvm { const unsigned NVPTXNumRegisters = 396; }
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
new file mode 100644
index 000000000000..282465359b07
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -0,0 +1,131 @@
+//===- NVPTXRegisterInfo.cpp - NVPTX Register Information -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-reg-info"
+
+#include "NVPTXRegisterInfo.h"
+#include "NVPTX.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
+  if (RC == &NVPTX::Float32RegsRegClass) {
+    return ".f32";
+  }
+  if (RC == &NVPTX::Float64RegsRegClass) {
+    return ".f64";
+  } else if (RC == &NVPTX::Int64RegsRegClass) {
+    return ".s64";
+  } else if (RC == &NVPTX::Int32RegsRegClass) {
+    return ".s32";
+  } else if (RC == &NVPTX::Int16RegsRegClass) {
+    return ".s16";
+  }
+      // Int8Regs become 16-bit registers in PTX
+      else if (RC == &NVPTX::Int8RegsRegClass) {
+    return ".s16";
+  } else if (RC == &NVPTX::Int1RegsRegClass) {
+    return ".pred";
+  } else if (RC == &NVPTX::SpecialRegsRegClass) {
+    return "!Special!";
+  } else {
+    return "INTERNAL";
+  }
+  return "";
+}
+
+std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
+  if (RC == &NVPTX::Float32RegsRegClass) {
+    return "%f";
+  }
+  if (RC == &NVPTX::Float64RegsRegClass) {
+    return "%fd";
+  } else if (RC == &NVPTX::Int64RegsRegClass) {
+    return "%rd";
+  } else if (RC == &NVPTX::Int32RegsRegClass) {
+    return "%r";
+  } else if (RC == &NVPTX::Int16RegsRegClass) {
+    return "%rs";
+  } else if (RC == &NVPTX::Int8RegsRegClass) {
+    return "%rc";
+  } else if (RC == &NVPTX::Int1RegsRegClass) {
+    return "%p";
+  } else if (RC == &NVPTX::SpecialRegsRegClass) {
+    return "!Special!";
+  } else {
+    return "INTERNAL";
+  }
+  return "";
+}
+}
+
+NVPTXRegisterInfo::NVPTXRegisterInfo(const TargetInstrInfo &tii,
+                                     const NVPTXSubtarget &st)
+    : NVPTXGenRegisterInfo(0), Is64Bit(st.is64Bit()) {}
+
+#define GET_REGINFO_TARGET_DESC
+#include "NVPTXGenRegisterInfo.inc"
+
+/// NVPTX Callee Saved Registers
+const uint16_t *
+NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const uint16_t CalleeSavedRegs[] = { 0 };
+  return CalleeSavedRegs;
+}
+
+// NVPTX Callee Saved Reg Classes
+const TargetRegisterClass *const *
+NVPTXRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass *const CalleeSavedRegClasses[] = { 0 };
+  return CalleeSavedRegClasses;
+}
+
+BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  return Reserved;
+}
+
+void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                            int SPAdj, unsigned FIOperandNum,
+                                            RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MI.getOperand(FIOperandNum + 1).getImm();
+
+  // Using I0 as the frame pointer
+  MI.getOperand(FIOperandNum).ChangeToRegister(NVPTX::VRFrame, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+}
+
+int NVPTXRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  return 0;
+}
+
+unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return NVPTX::VRFrame;
+}
+
+unsigned NVPTXRegisterInfo::getRARegister() const { return 0; }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
new file mode 100644
index 000000000000..d40682066142
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -0,0 +1,79 @@
+//===- NVPTXRegisterInfo.h - NVPTX Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the NVPTX implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXREGISTERINFO_H
+#define NVPTXREGISTERINFO_H
+
+#include "ManagedStringPool.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "NVPTXGenRegisterInfo.inc"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <sstream>
+
+namespace llvm {
+
+// Forward Declarations.
+class TargetInstrInfo;
+class NVPTXSubtarget;
+
+class NVPTXRegisterInfo : public NVPTXGenRegisterInfo {
+private:
+  bool Is64Bit;
+  // Hold Strings that can be free'd all together with NVPTXRegisterInfo
+  ManagedStringPool ManagedStrPool;
+
+public:
+  NVPTXRegisterInfo(const TargetInstrInfo &tii, const NVPTXSubtarget &st);
+
+  //------------------------------------------------------
+  // Pure virtual functions from TargetRegisterInfo
+  //------------------------------------------------------
+
+  // NVPTX callee saved registers
+  virtual const uint16_t *
+  getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  // NVPTX callee saved register classes
+  virtual const TargetRegisterClass *const *
+  getCalleeSavedRegClasses(const MachineFunction *MF) const;
+
+  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                                   unsigned FIOperandNum,
+                                   RegScavenger *RS = NULL) const;
+
+  virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  virtual unsigned getFrameRegister(const MachineFunction &MF) const;
+  virtual unsigned getRARegister() const;
+
+  ManagedStringPool *getStrPool() const {
+    return const_cast<ManagedStringPool *>(&ManagedStrPool);
+  }
+
+  const char *getName(unsigned RegNo) const {
+    std::stringstream O;
+    O << "reg" << RegNo;
+    return getStrPool()->getManagedString(O.str().c_str())->c_str();
+  }
+
+};
+
+std::string getNVPTXRegClassName(const TargetRegisterClass *RC);
+std::string getNVPTXRegClassStr(const TargetRegisterClass *RC);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
new file mode 100644
index 000000000000..8d100d631683
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -0,0 +1,64 @@
+//===-- NVPTXRegisterInfo.td - NVPTX Register defs ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the PTX register file
+//===----------------------------------------------------------------------===//
+
+class NVPTXReg<string n> : Register<n> {
+  let Namespace = "NVPTX";
+}
+
+class NVPTXRegClass<list<ValueType> regTypes, int alignment, dag regList>
+     : RegisterClass <"NVPTX", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+// Special Registers used as stack pointer
+def VRFrame         : NVPTXReg<"%SP">;
+def VRFrameLocal    : NVPTXReg<"%SPL">;
+
+// Special Registers used as the stack
+def VRDepot  : NVPTXReg<"%Depot">;
+
+foreach i = 0-395 in {
+  def P#i  : NVPTXReg<"%p"#i>;  // Predicate
+  def RC#i : NVPTXReg<"%rc"#i>; // 8-bit
+  def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
+  def R#i  : NVPTXReg<"%r"#i>;  // 32-bit
+  def RL#i : NVPTXReg<"%rl"#i>; // 64-bit
+  def F#i  : NVPTXReg<"%f"#i>;  // 32-bit float
+  def FL#i : NVPTXReg<"%fl"#i>; // 64-bit float
+
+  // Arguments
+  def ia#i : NVPTXReg<"%ia"#i>;
+  def la#i : NVPTXReg<"%la"#i>;
+  def fa#i : NVPTXReg<"%fa"#i>;
+  def da#i : NVPTXReg<"%da"#i>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Register classes
+//===----------------------------------------------------------------------===//
+def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 395))>;
+def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%u", 0, 395))>;
+def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 395))>;
+def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 395))>;
+def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 395))>;
+def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 395))>;
+def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 395))>;
+def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 395))>;
+def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 395))>;
+def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 395))>;
+def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 395))>;
+
+// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
new file mode 100644
index 000000000000..e166be5a68e4
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
@@ -0,0 +1,47 @@
+//===- NVPTXSection.h - NVPTX-specific section representation -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTXSection class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_NVPTXSECTION_H
+#define LLVM_NVPTXSECTION_H
+
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCSection.h"
+#include <vector>
+
+namespace llvm {
+/// NVPTXSection - Represents a section in PTX
+/// PTX does not have sections. We create this class in order to use
+/// the ASMPrint interface.
+///
+class NVPTXSection : public MCSection {
+
+public:
+  NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {}
+  ~NVPTXSection() {}
+
+  /// Override this as NVPTX has its own way of printing switching
+  /// to a section.
+  virtual void PrintSwitchToSection(const MCAsmInfo &MAI,
+                                    raw_ostream &OS) const {}
+
+  /// Base address of PTX sections is zero.
+  virtual bool isBaseAddressKnownZero() const { return true; }
+  virtual bool UseCodeAlign() const { return false; }
+  virtual bool isVirtualSection() const { return false; }
+  virtual std::string getLabelBeginName() const { return ""; }
+  virtual std::string getLabelEndName() const { return ""; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
new file mode 100644
index 000000000000..83dfe120899a
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
@@ -0,0 +1,73 @@
+//===- NVPTXSplitBBatBar.cpp - Split BB at Barrier  --*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Split basic blocks so that a basic block that contains a barrier instruction
+// only contains the barrier instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXSplitBBatBar.h"
+#include "NVPTXUtilities.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/InstIterator.h"
+
+using namespace llvm;
+
+namespace llvm { FunctionPass *createSplitBBatBarPass(); }
+
+char NVPTXSplitBBatBar::ID = 0;
+
+bool NVPTXSplitBBatBar::runOnFunction(Function &F) {
+
+  SmallVector<Instruction *, 4> SplitPoints;
+  bool changed = false;
+
+  // Collect all the split points in SplitPoints
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    BasicBlock::iterator IB = BI->begin();
+    BasicBlock::iterator II = IB;
+    BasicBlock::iterator IE = BI->end();
+
+    // Skit the first intruction. No splitting is needed at this
+    // point even if this is a bar.
+    while (II != IE) {
+      if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
+        Intrinsic::ID id = inst->getIntrinsicID();
+        // If this is a barrier, split at this instruction
+        // and the next instruction.
+        if (llvm::isBarrierIntrinsic(id)) {
+          if (II != IB)
+            SplitPoints.push_back(II);
+          II++;
+          if ((II != IE) && (!II->isTerminator())) {
+            SplitPoints.push_back(II);
+            II++;
+          }
+          continue;
+        }
+      }
+      II++;
+    }
+  }
+
+  for (unsigned i = 0; i != SplitPoints.size(); i++) {
+    changed = true;
+    Instruction *inst = SplitPoints[i];
+    inst->getParent()->splitBasicBlock(inst, "bar_split");
+  }
+
+  return changed;
+}
+
+// This interface will most likely not be necessary, because this pass will
+// not be invoked by the driver, but will be used as a prerequisite to
+// another pass.
+FunctionPass *llvm::createSplitBBatBarPass() { return new NVPTXSplitBBatBar(); }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h
new file mode 100644
index 000000000000..bdafba9075a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h
@@ -0,0 +1,41 @@
+//===-- llvm/lib/Target/NVPTX/NVPTXSplitBBatBar.h ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVIDIA specific declarations
+// for splitting basic blocks at barrier instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_SPLIT_BB_AT_BAR_H
+#define NVPTX_SPLIT_BB_AT_BAR_H
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+// actual analysis class, which is a functionpass
+struct NVPTXSplitBBatBar : public FunctionPass {
+  static char ID;
+
+  NVPTXSplitBBatBar() : FunctionPass(ID) {}
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addPreserved<MachineFunctionAnalysis>();
+  }
+  virtual bool runOnFunction(Function &F);
+
+  virtual const char *getPassName() const {
+    return "Split basic blocks at barrier";
+  }
+};
+
+extern FunctionPass *createSplitBBatBarPass();
+}
+
+#endif //NVPTX_SPLIT_BB_AT_BAR_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
new file mode 100644
index 000000000000..2dcd73dcff9c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -0,0 +1,61 @@
+//===- NVPTXSubtarget.cpp - NVPTX Subtarget Information -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXSubtarget.h"
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "NVPTXGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+// Select Driver Interface
+#include "llvm/Support/CommandLine.h"
+namespace {
+cl::opt<NVPTX::DrvInterface> DriverInterface(
+    cl::desc("Choose driver interface:"),
+    cl::values(clEnumValN(NVPTX::NVCL, "drvnvcl", "Nvidia OpenCL driver"),
+               clEnumValN(NVPTX::CUDA, "drvcuda", "Nvidia CUDA driver"),
+               clEnumValN(NVPTX::TEST, "drvtest", "Plain Test"), clEnumValEnd),
+    cl::init(NVPTX::NVCL));
+}
+
+NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
+                               const std::string &FS, bool is64Bit)
+    : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
+      SmVersion(20) {
+
+  drvInterface = DriverInterface;
+
+  // Provide the default CPU if none
+  std::string defCPU = "sm_20";
+
+  ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS);
+
+  // Get the TargetName from the FS if available
+  if (FS.empty() && CPU.empty())
+    TargetName = defCPU;
+  else if (!CPU.empty())
+    TargetName = CPU;
+  else
+    llvm_unreachable("we are not using FeatureStr");
+
+  // We default to PTX 3.1, but we cannot just default to it in the initializer
+  // since the attribute parser checks if the given option is >= the default.
+  // So if we set ptx31 as the default, the ptx30 attribute would never match.
+  // Instead, we use 0 as the default and manually set 31 if the default is
+  // used.
+  if (PTXVersion == 0) {
+    PTXVersion = 31;
+  }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
new file mode 100644
index 000000000000..670077daaa69
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -0,0 +1,96 @@
+//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXSUBTARGET_H
+#define NVPTXSUBTARGET_H
+
+#include "NVPTX.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "NVPTXGenSubtargetInfo.inc"
+
+#include <string>
+
+namespace llvm {
+
+class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
+
+  std::string TargetName;
+  NVPTX::DrvInterface drvInterface;
+  bool Is64Bit;
+
+  // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
+  unsigned PTXVersion;
+
+  // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+  unsigned int SmVersion;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified module.
+  ///
+  NVPTXSubtarget(const std::string &TT, const std::string &CPU,
+                 const std::string &FS, bool is64Bit);
+
+  bool hasBrkPt() const { return SmVersion >= 11; }
+  bool hasAtomRedG32() const { return SmVersion >= 11; }
+  bool hasAtomRedS32() const { return SmVersion >= 12; }
+  bool hasAtomRedG64() const { return SmVersion >= 12; }
+  bool hasAtomRedS64() const { return SmVersion >= 20; }
+  bool hasAtomRedGen32() const { return SmVersion >= 20; }
+  bool hasAtomRedGen64() const { return SmVersion >= 20; }
+  bool hasAtomAddF32() const { return SmVersion >= 20; }
+  bool hasVote() const { return SmVersion >= 12; }
+  bool hasDouble() const { return SmVersion >= 13; }
+  bool reqPTX20() const { return SmVersion >= 20; }
+  bool hasF32FTZ() const { return SmVersion >= 20; }
+  bool hasFMAF32() const { return SmVersion >= 20; }
+  bool hasFMAF64() const { return SmVersion >= 13; }
+  bool hasLDG() const { return SmVersion >= 32; }
+  bool hasLDU() const { return SmVersion >= 20; }
+  bool hasGenericLdSt() const { return SmVersion >= 20; }
+  inline bool hasHWROT32() const { return false; }
+  inline bool hasSWROT32() const { return true; }
+  inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
+  inline bool hasROT64() const { return SmVersion >= 20; }
+
+  bool is64Bit() const { return Is64Bit; }
+
+  unsigned int getSmVersion() const { return SmVersion; }
+  NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
+  std::string getTargetName() const { return TargetName; }
+
+  unsigned getPTXVersion() const { return PTXVersion; }
+
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  std::string getDataLayout() const {
+    const char *p;
+    if (is64Bit())
+      p = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-"
+          "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-"
+          "n16:32:64";
+    else
+      p = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-"
+          "f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-"
+          "n16:32:64";
+
+    return std::string(p);
+  }
+
+};
+
+} // End llvm namespace
+
+#endif // NVPTXSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
new file mode 100644
index 000000000000..67ca6b58e5a6
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -0,0 +1,121 @@
+//===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the NVPTX target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXTargetMachine.h"
+#include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "NVPTX.h"
+#include "NVPTXAllocaHoisting.h"
+#include "NVPTXLowerAggrCopies.h"
+#include "NVPTXSplitBBatBar.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVVMReflectPass(PassRegistry&);
+}
+
+extern "C" void LLVMInitializeNVPTXTarget() {
+  // Register the target.
+  RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
+  RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
+
+  RegisterMCAsmInfo<NVPTXMCAsmInfo> A(TheNVPTXTarget32);
+  RegisterMCAsmInfo<NVPTXMCAsmInfo> B(TheNVPTXTarget64);
+
+  // FIXME: This pass is really intended to be invoked during IR optimization,
+  // but it's very NVPTX-specific.
+  initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+}
+
+NVPTXTargetMachine::NVPTXTargetMachine(
+    const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
+    CodeGenOpt::Level OL, bool is64bit)
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, is64bit), DL(Subtarget.getDataLayout()),
+      InstrInfo(*this), TLInfo(*this), TSInfo(*this),
+      FrameLowering(
+          *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {}
+
+void NVPTXTargetMachine32::anchor() {}
+
+NVPTXTargetMachine32::NVPTXTargetMachine32(
+    const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
+    CodeGenOpt::Level OL)
+    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+
+void NVPTXTargetMachine64::anchor() {}
+
+NVPTXTargetMachine64::NVPTXTargetMachine64(
+    const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+    const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM,
+    CodeGenOpt::Level OL)
+    : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
+
+namespace llvm {
+class NVPTXPassConfig : public TargetPassConfig {
+public:
+  NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
+      : TargetPassConfig(TM, PM) {}
+
+  NVPTXTargetMachine &getNVPTXTargetMachine() const {
+    return getTM<NVPTXTargetMachine>();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addPreRegAlloc();
+};
+}
+
+TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
+  NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
+  return PassConfig;
+}
+
+bool NVPTXPassConfig::addInstSelector() {
+  addPass(createLowerAggrCopies());
+  addPass(createSplitBBatBarPass());
+  addPass(createAllocaHoisting());
+  addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
+  return false;
+}
+
+bool NVPTXPassConfig::addPreRegAlloc() { return false; }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
new file mode 100644
index 000000000000..5fbcf735b48f
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -0,0 +1,118 @@
+//===-- NVPTXTargetMachine.h - Define TargetMachine for NVPTX ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the NVPTX specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_TARGETMACHINE_H
+#define NVPTX_TARGETMACHINE_H
+
+#include "ManagedStringPool.h"
+#include "NVPTXFrameLowering.h"
+#include "NVPTXISelLowering.h"
+#include "NVPTXInstrInfo.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+/// NVPTXTargetMachine
+///
+class NVPTXTargetMachine : public LLVMTargetMachine {
+  NVPTXSubtarget Subtarget;
+  const DataLayout DL; // Calculates type size & alignment
+  NVPTXInstrInfo InstrInfo;
+  NVPTXTargetLowering TLInfo;
+  TargetSelectionDAGInfo TSInfo;
+
+  // NVPTX does not have any call stack frame, but need a NVPTX specific
+  // FrameLowering class because TargetFrameLowering is abstract.
+  NVPTXFrameLowering FrameLowering;
+
+  // Hold Strings that can be free'd all together with NVPTXTargetMachine
+  ManagedStringPool ManagedStrPool;
+
+  //bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level,
+  //                            bool DisableVerify, MCContext *&OutCtx);
+
+public:
+  NVPTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
+                     const TargetOptions &Options, Reloc::Model RM,
+                     CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit);
+
+  virtual const TargetFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  virtual const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  virtual const DataLayout *getDataLayout() const { return &DL; }
+  virtual const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
+
+  virtual const NVPTXRegisterInfo *getRegisterInfo() const {
+    return &(InstrInfo.getRegisterInfo());
+  }
+
+  virtual NVPTXTargetLowering *getTargetLowering() const {
+    return const_cast<NVPTXTargetLowering *>(&TLInfo);
+  }
+
+  virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+
+  //virtual bool addInstSelector(PassManagerBase &PM,
+  //                             CodeGenOpt::Level OptLevel);
+
+  //virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level);
+
+  ManagedStringPool *getManagedStrPool() const {
+    return const_cast<ManagedStringPool *>(&ManagedStrPool);
+  }
+
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+  // Emission of machine code through JITCodeEmitter is not supported.
+  virtual bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &,
+                                          bool = true) {
+    return true;
+  }
+
+  // Emission of machine code through MCJIT is not supported.
+  virtual bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &,
+                                 bool = true) {
+    return true;
+  }
+
+}; // NVPTXTargetMachine.
+
+class NVPTXTargetMachine32 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine32(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+class NVPTXTargetMachine64 : public NVPTXTargetMachine {
+  virtual void anchor();
+public:
+  NVPTXTargetMachine64(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
new file mode 100644
index 000000000000..6ab0e08ad091
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -0,0 +1,102 @@
+//===-- NVPTXTargetObjectFile.h - NVPTX Object Info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+#define LLVM_TARGET_NVPTX_TARGETOBJECTFILE_H
+
+#include "NVPTXSection.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <string>
+
+namespace llvm {
+class GlobalVariable;
+class Module;
+
+class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
+
+public:
+  NVPTXTargetObjectFile() {}
+  ~NVPTXTargetObjectFile() {
+    delete TextSection;
+    delete DataSection;
+    delete BSSSection;
+    delete ReadOnlySection;
+
+    delete StaticCtorSection;
+    delete StaticDtorSection;
+    delete LSDASection;
+    delete EHFrameSection;
+    delete DwarfAbbrevSection;
+    delete DwarfInfoSection;
+    delete DwarfLineSection;
+    delete DwarfFrameSection;
+    delete DwarfPubTypesSection;
+    delete DwarfDebugInlineSection;
+    delete DwarfStrSection;
+    delete DwarfLocSection;
+    delete DwarfARangesSection;
+    delete DwarfRangesSection;
+    delete DwarfMacroInfoSection;
+  }
+
+  virtual void Initialize(MCContext &ctx, const TargetMachine &TM) {
+    TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
+    DataSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel());
+    BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS());
+    ReadOnlySection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly());
+
+    StaticCtorSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    StaticDtorSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    LSDASection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    EHFrameSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfAbbrevSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfInfoSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfLineSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfFrameSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfPubTypesSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfDebugInlineSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfStrSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfLocSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfARangesSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfRangesSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+    DwarfMacroInfoSection =
+        new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata());
+  }
+
+  virtual const MCSection *getSectionForConstant(SectionKind Kind) const {
+    return ReadOnlySection;
+  }
+
+  virtual const MCSection *
+  getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+                           Mangler *Mang, const TargetMachine &TM) const {
+    return DataSection;
+  }
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
new file mode 100644
index 000000000000..6786eb02240c
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -0,0 +1,504 @@
+//===- NVPTXUtilities.cpp - Utility Functions -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains miscellaneous utility functions
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "NVPTX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include <algorithm>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+//#include <iostream>
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/InstIterator.h"
+
+using namespace llvm;
+
+typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
+typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
+typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
+
+ManagedStatic<per_module_annot_t> annotationCache;
+
+static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+  assert(md && "Invalid mdnode for annotation");
+  assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+  // start index = 1, to skip the global variable key
+  // increment = 2, to skip the value for each property-value pairs
+  for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+    // property
+    const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+    assert(prop && "Annotation property not a string");
+
+    // value
+    ConstantInt *Val = dyn_cast<ConstantInt>(md->getOperand(i + 1));
+    assert(Val && "Value operand not a constant int");
+
+    std::string keyname = prop->getString().str();
+    if (retval.find(keyname) != retval.end())
+      retval[keyname].push_back(Val->getZExtValue());
+    else {
+      std::vector<unsigned> tmp;
+      tmp.push_back(Val->getZExtValue());
+      retval[keyname] = tmp;
+    }
+  }
+}
+
+static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+  NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations);
+  if (!NMD)
+    return;
+  key_val_pair_t tmp;
+  for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+    const MDNode *elem = NMD->getOperand(i);
+
+    Value *entity = elem->getOperand(0);
+    // entity may be null due to DCE
+    if (!entity)
+      continue;
+    if (entity != gv)
+      continue;
+
+    // accumulate annotations for entity in tmp
+    cacheAnnotationFromMD(elem, tmp);
+  }
+
+  if (tmp.empty()) // no annotations for this gv
+    return;
+
+  if ((*annotationCache).find(m) != (*annotationCache).end())
+    (*annotationCache)[m][gv] = tmp;
+  else {
+    global_val_annot_t tmp1;
+    tmp1[gv] = tmp;
+    (*annotationCache)[m] = tmp1;
+  }
+}
+
+bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
+                                 unsigned &retval) {
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop][0];
+  return true;
+}
+
+bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
+                                 std::vector<unsigned> &retval) {
+  const Module *m = gv->getParent();
+  if ((*annotationCache).find(m) == (*annotationCache).end())
+    cacheAnnotationFromMD(m, gv);
+  else if ((*annotationCache)[m].find(gv) == (*annotationCache)[m].end())
+    cacheAnnotationFromMD(m, gv);
+  if ((*annotationCache)[m][gv].find(prop) == (*annotationCache)[m][gv].end())
+    return false;
+  retval = (*annotationCache)[m][gv][prop];
+  return true;
+}
+
+bool llvm::isTexture(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(
+            gv, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISTEXTURE],
+            annot)) {
+      assert((annot == 1) && "Unexpected annotation on a texture symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isSurface(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(
+            gv, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSURFACE],
+            annot)) {
+      assert((annot == 1) && "Unexpected annotation on a surface symbol");
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isSampler(const llvm::Value &val) {
+  if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if (llvm::findOneNVVMAnnotation(
+            gv, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+            annot)) {
+      assert((annot == 1) && "Unexpected annotation on a sampler symbol");
+      return true;
+    }
+  }
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(
+            func, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISSAMPLER],
+            annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImageReadOnly(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+                                    llvm::PropertyAnnotationNames[
+                                        llvm::PROPERTY_ISREADONLY_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImageWriteOnly(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+                                    llvm::PropertyAnnotationNames[
+                                        llvm::PROPERTY_ISWRITEONLY_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::isImage(const llvm::Value &val) {
+  return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val);
+}
+
+std::string llvm::getTextureName(const llvm::Value &val) {
+  assert(val.hasName() && "Found texture variable with no name");
+  return val.getName();
+}
+
+std::string llvm::getSurfaceName(const llvm::Value &val) {
+  assert(val.hasName() && "Found surface variable with no name");
+  return val.getName();
+}
+
+std::string llvm::getSamplerName(const llvm::Value &val) {
+  assert(val.hasName() && "Found sampler variable with no name");
+  return val.getName();
+}
+
+bool llvm::getMaxNTIDx(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_X], x));
+}
+
+bool llvm::getMaxNTIDy(const Function &F, unsigned &y) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Y], y));
+}
+
+bool llvm::getMaxNTIDz(const Function &F, unsigned &z) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MAXNTID_Z], z));
+}
+
+bool llvm::getReqNTIDx(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_X], x));
+}
+
+bool llvm::getReqNTIDy(const Function &F, unsigned &y) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Y], y));
+}
+
+bool llvm::getReqNTIDz(const Function &F, unsigned &z) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_REQNTID_Z], z));
+}
+
+bool llvm::getMinCTASm(const Function &F, unsigned &x) {
+  return (llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_MINNCTAPERSM], x));
+}
+
+bool llvm::isKernelFunction(const Function &F) {
+  unsigned x = 0;
+  bool retval = llvm::findOneNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_ISKERNEL_FUNCTION], x);
+  if (retval == false) {
+    // There is no NVVM metadata, check the calling convention
+    if (F.getCallingConv() == llvm::CallingConv::PTX_Kernel)
+      return true;
+    else
+      return false;
+  }
+  return (x == 1);
+}
+
+bool llvm::getAlign(const Function &F, unsigned index, unsigned &align) {
+  std::vector<unsigned> Vs;
+  bool retval = llvm::findAllNVVMAnnotation(
+      &F, llvm::PropertyAnnotationNames[llvm::PROPERTY_ALIGN], Vs);
+  if (retval == false)
+    return false;
+  for (int i = 0, e = Vs.size(); i < e; i++) {
+    unsigned v = Vs[i];
+    if ((v >> 16) == index) {
+      align = v & 0xFFFF;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) {
+  if (MDNode *alignNode = I.getMetadata("callalign")) {
+    for (int i = 0, n = alignNode->getNumOperands(); i < n; i++) {
+      if (const ConstantInt *CI =
+              dyn_cast<ConstantInt>(alignNode->getOperand(i))) {
+        unsigned v = CI->getZExtValue();
+        if ((v >> 16) == index) {
+          align = v & 0xFFFF;
+          return true;
+        }
+        if ((v >> 16) > index) {
+          return false;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool llvm::isBarrierIntrinsic(Intrinsic::ID id) {
+  if ((id == Intrinsic::nvvm_barrier0) ||
+      (id == Intrinsic::nvvm_barrier0_popc) ||
+      (id == Intrinsic::nvvm_barrier0_and) ||
+      (id == Intrinsic::nvvm_barrier0_or) ||
+      (id == Intrinsic::cuda_syncthreads))
+    return true;
+  return false;
+}
+
+// Interface for checking all memory space transfer related intrinsics
+bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) {
+  if (id == Intrinsic::nvvm_ptr_local_to_gen ||
+      id == Intrinsic::nvvm_ptr_shared_to_gen ||
+      id == Intrinsic::nvvm_ptr_global_to_gen ||
+      id == Intrinsic::nvvm_ptr_constant_to_gen ||
+      id == Intrinsic::nvvm_ptr_gen_to_global ||
+      id == Intrinsic::nvvm_ptr_gen_to_shared ||
+      id == Intrinsic::nvvm_ptr_gen_to_local ||
+      id == Intrinsic::nvvm_ptr_gen_to_constant ||
+      id == Intrinsic::nvvm_ptr_gen_to_param) {
+    return true;
+  }
+
+  return false;
+}
+
+// consider several special intrinsics in striping pointer casts, and
+// provide an option to ignore GEP indicies for find out the base address only
+// which could be used in simple alias disambigurate.
+const Value *
+llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) {
+  V = V->stripPointerCasts();
+  while (true) {
+    if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+      if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+        V = IS->getArgOperand(0)->stripPointerCasts();
+        continue;
+      }
+    } else if (ignore_GEP_indices)
+      if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+        V = GEP->getPointerOperand()->stripPointerCasts();
+        continue;
+      }
+    break;
+  }
+  return V;
+}
+
+// consider several special intrinsics in striping pointer casts, and
+// - ignore GEP indicies for find out the base address only, and
+// - tracking PHINode
+// which could be used in simple alias disambigurate.
+const Value *
+llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) {
+  if (processed.find(V) != processed.end())
+    return NULL;
+  processed.insert(V);
+
+  const Value *V2 = V->stripPointerCasts();
+  if (V2 != V && processed.find(V2) != processed.end())
+    return NULL;
+  processed.insert(V2);
+
+  V = V2;
+
+  while (true) {
+    if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) {
+      if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) {
+        V = IS->getArgOperand(0)->stripPointerCasts();
+        continue;
+      }
+    } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      V = GEP->getPointerOperand()->stripPointerCasts();
+      continue;
+    } else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
+      if (V != V2 && processed.find(V) != processed.end())
+        return NULL;
+      processed.insert(PN);
+      const Value *common = 0;
+      for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+        const Value *pv = PN->getIncomingValue(i);
+        const Value *base = skipPointerTransfer(pv, processed);
+        if (base) {
+          if (common == 0)
+            common = base;
+          else if (common != base)
+            return PN;
+        }
+      }
+      if (common == 0)
+        return PN;
+      V = common;
+    }
+    break;
+  }
+  return V;
+}
+
+// The following are some useful utilities for debuggung
+
+BasicBlock *llvm::getParentBlock(Value *v) {
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+    return B;
+
+  if (Instruction *I = dyn_cast<Instruction>(v))
+    return I->getParent();
+
+  return 0;
+}
+
+Function *llvm::getParentFunction(Value *v) {
+  if (Function *F = dyn_cast<Function>(v))
+    return F;
+
+  if (Instruction *I = dyn_cast<Instruction>(v))
+    return I->getParent()->getParent();
+
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v))
+    return B->getParent();
+
+  return 0;
+}
+
+// Dump a block by name
+void llvm::dumpBlock(Value *v, char *blockName) {
+  Function *F = getParentFunction(v);
+  if (F == 0)
+    return;
+
+  for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
+    BasicBlock *B = it;
+    if (strcmp(B->getName().data(), blockName) == 0) {
+      B->dump();
+      return;
+    }
+  }
+}
+
+// Find an instruction by name
+Instruction *llvm::getInst(Value *base, char *instName) {
+  Function *F = getParentFunction(base);
+  if (F == 0)
+    return 0;
+
+  for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
+    Instruction *I = &*it;
+    if (strcmp(I->getName().data(), instName) == 0) {
+      return I;
+    }
+  }
+
+  return 0;
+}
+
+// Dump an instruction by nane
+void llvm::dumpInst(Value *base, char *instName) {
+  Instruction *I = getInst(base, instName);
+  if (I)
+    I->dump();
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v, std::set<Instruction *> *visited) {
+  if (Instruction *I = dyn_cast<Instruction>(v)) {
+
+    if (visited->find(I) != visited->end())
+      return;
+
+    visited->insert(I);
+
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+      dumpInstRec(I->getOperand(i), visited);
+
+    I->dump();
+  }
+}
+
+// Dump an instruction and all dependent instructions
+void llvm::dumpInstRec(Value *v) {
+  std::set<Instruction *> visited;
+
+  //BasicBlock *B = getParentBlock(v);
+
+  dumpInstRec(v, &visited);
+}
+
+// Dump the parent for Instruction, block or function
+void llvm::dumpParent(Value *v) {
+  if (Instruction *I = dyn_cast<Instruction>(v)) {
+    I->getParent()->dump();
+    return;
+  }
+
+  if (BasicBlock *B = dyn_cast<BasicBlock>(v)) {
+    B->getParent()->dump();
+    return;
+  }
+
+  if (Function *F = dyn_cast<Function>(v)) {
+    F->getParent()->dump();
+    return;
+  }
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
new file mode 100644
index 000000000000..a208004297d0
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -0,0 +1,92 @@
+//===-- NVPTXUtilities - Utilities -----------------------------*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the NVVM specific utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTXUTILITIES_H
+#define NVPTXUTILITIES_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include <cstdarg>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+#define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly"
+#define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly"
+
+bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &);
+bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string,
+                           std::vector<unsigned> &);
+
+bool isTexture(const llvm::Value &);
+bool isSurface(const llvm::Value &);
+bool isSampler(const llvm::Value &);
+bool isImage(const llvm::Value &);
+bool isImageReadOnly(const llvm::Value &);
+bool isImageWriteOnly(const llvm::Value &);
+
+std::string getTextureName(const llvm::Value &);
+std::string getSurfaceName(const llvm::Value &);
+std::string getSamplerName(const llvm::Value &);
+
+bool getMaxNTIDx(const llvm::Function &, unsigned &);
+bool getMaxNTIDy(const llvm::Function &, unsigned &);
+bool getMaxNTIDz(const llvm::Function &, unsigned &);
+
+bool getReqNTIDx(const llvm::Function &, unsigned &);
+bool getReqNTIDy(const llvm::Function &, unsigned &);
+bool getReqNTIDz(const llvm::Function &, unsigned &);
+
+bool getMinCTASm(const llvm::Function &, unsigned &);
+bool isKernelFunction(const llvm::Function &);
+
+bool getAlign(const llvm::Function &, unsigned index, unsigned &);
+bool getAlign(const llvm::CallInst &, unsigned index, unsigned &);
+
+bool isBarrierIntrinsic(llvm::Intrinsic::ID);
+
+/// make_vector - Helper function which is useful for building temporary vectors
+/// to pass into type construction of CallInst ctors.  This turns a null
+/// terminated list of pointers (or other value types) into a real live vector.
+///
+template <typename T> inline std::vector<T> make_vector(T A, ...) {
+  va_list Args;
+  va_start(Args, A);
+  std::vector<T> Result;
+  Result.push_back(A);
+  while (T Val = va_arg(Args, T))
+    Result.push_back(Val);
+  va_end(Args);
+  return Result;
+}
+
+bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id);
+const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices);
+const Value *
+skipPointerTransfer(const Value *V, std::set<const Value *> &processed);
+BasicBlock *getParentBlock(Value *v);
+Function *getParentFunction(Value *v);
+void dumpBlock(Value *v, char *blockName);
+Instruction *getInst(Value *base, char *instName);
+void dumpInst(Value *base, char *instName);
+void dumpInstRec(Value *v, std::set<Instruction *> *visited);
+void dumpInstRec(Value *v);
+void dumpParent(Value *v);
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
new file mode 100644
index 000000000000..775df19be162
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td
@@ -0,0 +1,1481 @@
+//===- NVPTXVector.td - NVPTX Vector Specific Instruction defs -*- tblgen-*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Vector Specific
+//-----------------------------------
+
+//
+// All vector instructions derive from NVPTXVecInst
+//
+
+class NVPTXVecInst<dag outs, dag ins, string asmstr, list<dag> pattern,
+  NVPTXInst sInst=NOP>
+  : NVPTXInst<outs, ins, asmstr, pattern> {
+  NVPTXInst scalarInst=sInst;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in {
+// Extract v2i16
+def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+  (ins V2I16Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int16Regs:$dst, (vector_extract
+                           (v2i16 V2I16Regs:$src), imm:$c))],
+                         IMOV16rr>;
+
+// Extract v4i16
+def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst),
+  (ins V4I16Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int16Regs:$dst, (vector_extract
+                           (v4i16 V4I16Regs:$src), imm:$c))],
+                         IMOV16rr>;
+
+// Extract v2i8
+def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+  (ins V2I8Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int8Regs:$dst, (vector_extract
+                           (v2i8 V2I8Regs:$src), imm:$c))],
+                         IMOV8rr>;
+
+// Extract v4i8
+def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst),
+  (ins V4I8Regs:$src, i8imm:$c),
+                         "mov.u16 \t$dst, $src${c:vecelem};",
+                         [(set Int8Regs:$dst, (vector_extract
+                           (v4i8 V4I8Regs:$src), imm:$c))],
+                         IMOV8rr>;
+
+// Extract v2i32
+def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+  (ins V2I32Regs:$src, i8imm:$c),
+                         "mov.u32 \t$dst, $src${c:vecelem};",
+                         [(set Int32Regs:$dst, (vector_extract
+                           (v2i32 V2I32Regs:$src), imm:$c))],
+                         IMOV32rr>;
+
+// Extract v2f32
+def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+  (ins V2F32Regs:$src, i8imm:$c),
+                         "mov.f32 \t$dst, $src${c:vecelem};",
+                         [(set Float32Regs:$dst, (vector_extract
+                           (v2f32 V2F32Regs:$src), imm:$c))],
+                         FMOV32rr>;
+
+// Extract v2i64
+def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst),
+  (ins V2I64Regs:$src, i8imm:$c),
+                         "mov.u64 \t$dst, $src${c:vecelem};",
+                         [(set Int64Regs:$dst, (vector_extract
+                           (v2i64 V2I64Regs:$src), imm:$c))],
+                         IMOV64rr>;
+
+// Extract v2f64
+def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst),
+  (ins V2F64Regs:$src, i8imm:$c),
+                         "mov.f64 \t$dst, $src${c:vecelem};",
+                         [(set Float64Regs:$dst, (vector_extract
+                           (v2f64 V2F64Regs:$src), imm:$c))],
+                         FMOV64rr>;
+
+// Extract v4i32
+def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst),
+  (ins V4I32Regs:$src, i8imm:$c),
+                         "mov.u32 \t$dst, $src${c:vecelem};",
+                         [(set Int32Regs:$dst, (vector_extract
+                           (v4i32 V4I32Regs:$src), imm:$c))],
+                         IMOV32rr>;
+
+// Extract v4f32
+def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst),
+  (ins V4F32Regs:$src, i8imm:$c),
+                         "mov.f32 \t$dst, $src${c:vecelem};",
+                         [(set Float32Regs:$dst, (vector_extract
+                           (v4f32 V4F32Regs:$src), imm:$c))],
+                         FMOV32rr>;
+}
+
+let isAsCheapAsAMove=1, VecInstType=isVecInsert.Value in {
+// Insert v2i8
+def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst),
+  (ins V2I8Regs:$src, Int8Regs:$val, i8imm:$c),
+        "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+        "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V2I8Regs:$dst,
+         (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))],
+                         IMOV8rr>;
+
+// Insert v4i8
+def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst),
+  (ins V4I8Regs:$src, Int8Regs:$val, i8imm:$c),
+                       "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V4I8Regs:$dst,
+         (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))],
+                         IMOV8rr>;
+
+// Insert v2i16
+def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst),
+  (ins V2I16Regs:$src, Int16Regs:$val, i8imm:$c),
+                       "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V2I16Regs:$dst,
+         (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))],
+                         IMOV16rr>;
+
+// Insert v4i16
+def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst),
+  (ins V4I16Regs:$src, Int16Regs:$val, i8imm:$c),
+                       "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u16 \t$dst${c:vecelem}, $val;",
+       [(set V4I16Regs:$dst,
+         (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))],
+                         IMOV16rr>;
+
+// Insert v2i32
+def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst),
+  (ins V2I32Regs:$src, Int32Regs:$val, i8imm:$c),
+                       "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+       [(set V2I32Regs:$dst,
+         (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))],
+                         IMOV32rr>;
+
+// Insert v2f32
+def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst),
+  (ins V2F32Regs:$src, Float32Regs:$val, i8imm:$c),
+                       "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+       [(set V2F32Regs:$dst,
+         (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))],
+                         FMOV32rr>;
+
+// Insert v2i64
+def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst),
+  (ins V2I64Regs:$src, Int64Regs:$val, i8imm:$c),
+                       "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u64 \t$dst${c:vecelem}, $val;",
+       [(set V2I64Regs:$dst,
+         (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))],
+                         IMOV64rr>;
+
+// Insert v2f64
+def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst),
+  (ins V2F64Regs:$src, Float64Regs:$val, i8imm:$c),
+                       "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.f64 \t$dst${c:vecelem}, $val;",
+       [(set V2F64Regs:$dst,
+         (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))],
+                         FMOV64rr>;
+
+// Insert v4i32
+def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst),
+  (ins V4I32Regs:$src, Int32Regs:$val, i8imm:$c),
+                       "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.u32 \t$dst${c:vecelem}, $val;",
+       [(set V4I32Regs:$dst,
+         (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))],
+                         IMOV32rr>;
+
+// Insert v4f32
+def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst),
+  (ins V4F32Regs:$src, Float32Regs:$val, i8imm:$c),
+                       "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};"
+                       "\n\tmov.f32 \t$dst${c:vecelem}, $val;",
+       [(set V4F32Regs:$dst,
+         (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))],
+                         FMOV32rr>;
+}
+
+class BinOpAsmString<string c> {
+  string s = c;
+}
+
+class V4AsmStr<string opcode> : BinOpAsmString<
+                          !strconcat(!strconcat(!strconcat(!strconcat(
+                            !strconcat(!strconcat(!strconcat(
+                          opcode,  " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+                          opcode), " \t${dst}_1, ${a}_1, ${b}_1;\n\t"),
+                          opcode), " \t${dst}_2, ${a}_2, ${b}_2;\n\t"),
+                          opcode), " \t${dst}_3, ${a}_3, ${b}_3;")>;
+
+class V2AsmStr<string opcode> : BinOpAsmString<
+                           !strconcat(!strconcat(!strconcat(
+                           opcode,  " \t${dst}_0, ${a}_0, ${b}_0;\n\t"),
+                           opcode), " \t${dst}_1, ${a}_1, ${b}_1;")>;
+
+class V4MADStr<string opcode> : BinOpAsmString<
+                          !strconcat(!strconcat(!strconcat(!strconcat(
+                            !strconcat(!strconcat(!strconcat(
+                          opcode,  " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+                          opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;\n\t"),
+                          opcode), " \t${dst}_2, ${a}_2, ${b}_2, ${c}_2;\n\t"),
+                          opcode), " \t${dst}_3, ${a}_3, ${b}_3, ${c}_3;")>;
+
+class V2MADStr<string opcode> : BinOpAsmString<
+                           !strconcat(!strconcat(!strconcat(
+                           opcode,  " \t${dst}_0, ${a}_0, ${b}_0, ${c}_0;\n\t"),
+                           opcode), " \t${dst}_1, ${a}_1, ${b}_1, ${c}_1;")>;
+
+class V4UnaryStr<string opcode> : BinOpAsmString<
+                          !strconcat(!strconcat(!strconcat(!strconcat(
+                            !strconcat(!strconcat(!strconcat(
+                          opcode,  " \t${dst}_0, ${a}_0;\n\t"),
+                          opcode), " \t${dst}_1, ${a}_1;\n\t"),
+                          opcode), " \t${dst}_2, ${a}_2;\n\t"),
+                          opcode), " \t${dst}_3, ${a}_3;")>;
+
+class V2UnaryStr<string opcode> : BinOpAsmString<
+                           !strconcat(!strconcat(!strconcat(
+                           opcode,  " \t${dst}_0, ${a}_0;\n\t"),
+                           opcode), " \t${dst}_1, ${a}_1;")>;
+
+class VecBinaryOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass,
+  NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a, regclass:$b),
+                 asmstr.s,
+                 [(set regclass:$dst, (OpNode regclass:$a, regclass:$b))],
+                 sInst>;
+
+class VecShiftOp<BinOpAsmString asmstr, SDNode OpNode, NVPTXRegClass regclass1,
+                 NVPTXRegClass regclass2, NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs regclass1:$dst), (ins regclass1:$a, regclass2:$b),
+                 asmstr.s,
+                 [(set regclass1:$dst, (OpNode regclass1:$a, regclass2:$b))],
+                 sInst>;
+
+class VecUnaryOp<BinOpAsmString asmstr, PatFrag OpNode, NVPTXRegClass regclass,
+  NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs regclass:$dst), (ins regclass:$a),
+                 asmstr.s,
+                 [(set regclass:$dst, (OpNode regclass:$a))], sInst>;
+
+multiclass IntBinVOp<string asmstr, SDNode OpNode,
+                     NVPTXInst i64op=NOP, NVPTXInst i32op=NOP, NVPTXInst
+                     i16op=NOP, NVPTXInst i8op=NOP> {
+  def V2I64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "64")>, OpNode, V2I64Regs,
+    i64op>;
+  def V4I32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "32")>, OpNode, V4I32Regs,
+    i32op>;
+  def V2I32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "32")>, OpNode, V2I32Regs,
+    i32op>;
+  def V4I16 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I16Regs,
+    i16op>;
+  def V2I16 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I16Regs,
+    i16op>;
+  def V4I8 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "16")>, OpNode, V4I8Regs,
+    i8op>;
+  def V2I8 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "16")>, OpNode, V2I8Regs,
+    i8op>;
+}
+
+multiclass FloatBinVOp<string asmstr, SDNode OpNode,
+                       NVPTXInst f64=NOP, NVPTXInst f32=NOP,
+                       NVPTXInst f32_ftz=NOP> {
+  def V2F64 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f64")>, OpNode,
+    V2F64Regs, f64>;
+  def V4F32_ftz : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+    V4F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+  def V2F32_ftz : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "ftz.f32")>, OpNode,
+    V2F32Regs, f32_ftz>, Requires<[doF32FTZ]>;
+  def V4F32 : VecBinaryOp<V4AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+    V4F32Regs, f32>;
+  def V2F32 : VecBinaryOp<V2AsmStr<!strconcat(asmstr, "f32")>, OpNode,
+    V2F32Regs, f32>;
+}
+
+multiclass IntUnaryVOp<string asmstr, PatFrag OpNode,
+                       NVPTXInst i64op=NOP, NVPTXInst i32op=NOP,
+                       NVPTXInst i16op=NOP, NVPTXInst i8op=NOP> {
+  def V2I64 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "64")>, OpNode,
+    V2I64Regs, i64op>;
+  def V4I32 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+    V4I32Regs, i32op>;
+  def V2I32 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "32")>, OpNode,
+    V2I32Regs, i32op>;
+  def V4I16 : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V4I16Regs, i16op>;
+  def V2I16 : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V2I16Regs, i16op>;
+  def V4I8  : VecUnaryOp<V4UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V4I8Regs,   i8op>;
+  def V2I8  : VecUnaryOp<V2UnaryStr<!strconcat(asmstr, "16")>, OpNode,
+    V2I8Regs,   i8op>;
+}
+
+
+// Integer Arithmetic
+let VecInstType=isVecOther.Value in {
+defm VAdd : IntBinVOp<"add.s", add, ADDi64rr, ADDi32rr, ADDi16rr, ADDi8rr>;
+defm VSub : IntBinVOp<"sub.s", sub, SUBi64rr, SUBi32rr, SUBi16rr, SUBi8rr>;
+
+def AddCCV4I32 : VecBinaryOp<V4AsmStr<"add.cc.s32">, addc, V4I32Regs,
+  ADDCCi32rr>;
+def AddCCV2I32 : VecBinaryOp<V2AsmStr<"add.cc.s32">, addc, V2I32Regs,
+  ADDCCi32rr>;
+def SubCCV4I32 : VecBinaryOp<V4AsmStr<"sub.cc.s32">, subc, V4I32Regs,
+  SUBCCi32rr>;
+def SubCCV2I32 : VecBinaryOp<V2AsmStr<"sub.cc.s32">, subc, V2I32Regs,
+  SUBCCi32rr>;
+def AddCCCV4I32 : VecBinaryOp<V4AsmStr<"addc.cc.s32">, adde, V4I32Regs,
+  ADDCCCi32rr>;
+def AddCCCV2I32 : VecBinaryOp<V2AsmStr<"addc.cc.s32">, adde, V2I32Regs,
+  ADDCCCi32rr>;
+def SubCCCV4I32 : VecBinaryOp<V4AsmStr<"subc.cc.s32">, sube, V4I32Regs,
+  SUBCCCi32rr>;
+def SubCCCV2I32 : VecBinaryOp<V2AsmStr<"subc.cc.s32">, sube, V2I32Regs,
+  SUBCCCi32rr>;
+
+def ShiftLV2I64 : VecShiftOp<V2AsmStr<"shl.b64">, shl, V2I64Regs, V2I32Regs,
+  SHLi64rr>;
+def ShiftLV2I32 : VecShiftOp<V2AsmStr<"shl.b32">, shl, V2I32Regs, V2I32Regs,
+  SHLi32rr>;
+def ShiftLV4I32 : VecShiftOp<V4AsmStr<"shl.b32">, shl, V4I32Regs, V4I32Regs,
+  SHLi32rr>;
+def ShiftLV2I16 : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I16Regs, V2I32Regs,
+  SHLi16rr>;
+def ShiftLV4I16 : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I16Regs, V4I32Regs,
+  SHLi16rr>;
+def ShiftLV2I8  : VecShiftOp<V2AsmStr<"shl.b16">, shl, V2I8Regs,  V2I32Regs,
+  SHLi8rr>;
+def ShiftLV4I8  : VecShiftOp<V4AsmStr<"shl.b16">, shl, V4I8Regs,  V4I32Regs,
+  SHLi8rr>;
+}
+
+// cvt to v*i32, helpers for shift
+class CVTtoVeci32<NVPTXRegClass inclass, NVPTXRegClass outclass, string asmstr,
+  NVPTXInst sInst=NOP> :
+      NVPTXVecInst<(outs outclass:$d), (ins inclass:$s), asmstr, [], sInst>;
+
+class VecCVTStrHelper<string op, string dest, string src> {
+  string s=!strconcat(op, !strconcat("\t",
+           !strconcat(dest, !strconcat(", ", !strconcat(src, ";")))));
+}
+
+class Vec2CVTStr<string op> {
+  string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+           !strconcat("\n\t", VecCVTStrHelper<op, "${d}_1", "${s}_1">.s));
+}
+
+class Vec4CVTStr<string op> {
+  string s=!strconcat(VecCVTStrHelper<op, "${d}_0", "${s}_0">.s,
+           !strconcat("\n\t",
+           !strconcat(VecCVTStrHelper<op, "${d}_1", "${s}_1">.s,
+           !strconcat("\n\t",
+           !strconcat(VecCVTStrHelper<op, "${d}_2", "${s}_2">.s,
+           !strconcat("\n\t", VecCVTStrHelper<op, "${d}_3", "${s}_3">.s))))));
+}
+
+let VecInstType=isVecOther.Value in {
+def CVTv2i8tov2i32 : CVTtoVeci32<V2I8Regs, V2I32Regs,
+  Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv2i16tov2i32 : CVTtoVeci32<V2I16Regs, V2I32Regs,
+  Vec2CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv4i8tov4i32 : CVTtoVeci32<V4I8Regs, V4I32Regs,
+  Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext8to32>;
+def CVTv4i16tov4i32 : CVTtoVeci32<V4I16Regs, V4I32Regs,
+  Vec4CVTStr<"cvt.u32.u16">.s, Zint_extendext16to32>;
+def CVTv2i64tov2i32 : CVTtoVeci32<V2I64Regs, V2I32Regs,
+  Vec2CVTStr<"cvt.u32.u64">.s, TRUNC_64to32>;
+}
+
+def : Pat<(shl V2I16Regs:$src1, V2I16Regs:$src2),
+          (ShiftLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(shl V2I8Regs:$src1, V2I8Regs:$src2),
+          (ShiftLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(shl V2I64Regs:$src1, V2I64Regs:$src2),
+          (ShiftLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(shl V4I16Regs:$src1, V4I16Regs:$src2),
+          (ShiftLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(shl V4I8Regs:$src1, V4I8Regs:$src2),
+          (ShiftLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+let VecInstType=isVecOther.Value in {
+def ShiftRAV2I64 : VecShiftOp<V2AsmStr<"shr.s64">, sra, V2I64Regs, V2I32Regs,
+  SRAi64rr>;
+def ShiftRAV2I32 : VecShiftOp<V2AsmStr<"shr.s32">, sra, V2I32Regs, V2I32Regs,
+  SRAi32rr>;
+def ShiftRAV4I32 : VecShiftOp<V4AsmStr<"shr.s32">, sra, V4I32Regs, V4I32Regs,
+  SRAi32rr>;
+def ShiftRAV2I16 : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I16Regs, V2I32Regs,
+  SRAi16rr>;
+def ShiftRAV4I16 : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I16Regs, V4I32Regs,
+  SRAi16rr>;
+def ShiftRAV2I8  : VecShiftOp<V2AsmStr<"shr.s16">, sra, V2I8Regs,  V2I32Regs,
+  SRAi8rr>;
+def ShiftRAV4I8  : VecShiftOp<V4AsmStr<"shr.s16">, sra, V4I8Regs,  V4I32Regs,
+  SRAi8rr>;
+
+def ShiftRLV2I64 : VecShiftOp<V2AsmStr<"shr.u64">, srl, V2I64Regs, V2I32Regs,
+  SRLi64rr>;
+def ShiftRLV2I32 : VecShiftOp<V2AsmStr<"shr.u32">, srl, V2I32Regs, V2I32Regs,
+  SRLi32rr>;
+def ShiftRLV4I32 : VecShiftOp<V4AsmStr<"shr.u32">, srl, V4I32Regs, V4I32Regs,
+  SRLi32rr>;
+def ShiftRLV2I16 : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I16Regs, V2I32Regs,
+  SRLi16rr>;
+def ShiftRLV4I16 : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I16Regs, V4I32Regs,
+  SRLi16rr>;
+def ShiftRLV2I8  : VecShiftOp<V2AsmStr<"shr.u16">, srl, V2I8Regs,  V2I32Regs,
+  SRLi8rr>;
+def ShiftRLV4I8  : VecShiftOp<V4AsmStr<"shr.u16">, srl, V4I8Regs,  V4I32Regs,
+  SRLi8rr>;
+
+defm VMult   : IntBinVOp<"mul.lo.s", mul, MULTi64rr, MULTi32rr, MULTi16rr,
+  MULTi8rr>;
+defm VMultHS : IntBinVOp<"mul.hi.s", mulhs, MULTHSi64rr, MULTHSi32rr,
+  MULTHSi16rr,
+  MULTHSi8rr>;
+defm VMultHU : IntBinVOp<"mul.hi.u", mulhu, MULTHUi64rr, MULTHUi32rr,
+  MULTHUi16rr,
+  MULTHUi8rr>;
+defm VSDiv   : IntBinVOp<"div.s", sdiv, SDIVi64rr, SDIVi32rr, SDIVi16rr,
+  SDIVi8rr>;
+defm VUDiv   : IntBinVOp<"div.u", udiv, UDIVi64rr, UDIVi32rr, UDIVi16rr,
+  UDIVi8rr>;
+defm VSRem   : IntBinVOp<"rem.s", srem, SREMi64rr, SREMi32rr, SREMi16rr,
+  SREMi8rr>;
+defm VURem   : IntBinVOp<"rem.u", urem, UREMi64rr, UREMi32rr, UREMi16rr,
+  UREMi8rr>;
+}
+
+def : Pat<(sra V2I16Regs:$src1, V2I16Regs:$src2),
+          (ShiftRAV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(sra V2I8Regs:$src1, V2I8Regs:$src2),
+          (ShiftRAV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(sra V2I64Regs:$src1, V2I64Regs:$src2),
+          (ShiftRAV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(sra V4I16Regs:$src1, V4I16Regs:$src2),
+          (ShiftRAV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(sra V4I8Regs:$src1, V4I8Regs:$src2),
+          (ShiftRAV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+def : Pat<(srl V2I16Regs:$src1, V2I16Regs:$src2),
+          (ShiftRLV2I16 V2I16Regs:$src1, (CVTv2i16tov2i32 V2I16Regs:$src2))>;
+def : Pat<(srl V2I8Regs:$src1, V2I8Regs:$src2),
+          (ShiftRLV2I8 V2I8Regs:$src1, (CVTv2i8tov2i32 V2I8Regs:$src2))>;
+def : Pat<(srl V2I64Regs:$src1, V2I64Regs:$src2),
+          (ShiftRLV2I64 V2I64Regs:$src1, (CVTv2i64tov2i32 V2I64Regs:$src2))>;
+
+def : Pat<(srl V4I16Regs:$src1, V4I16Regs:$src2),
+          (ShiftRLV4I16 V4I16Regs:$src1, (CVTv4i16tov4i32 V4I16Regs:$src2))>;
+def : Pat<(srl V4I8Regs:$src1, V4I8Regs:$src2),
+          (ShiftRLV4I8 V4I8Regs:$src1, (CVTv4i8tov4i32 V4I8Regs:$src2))>;
+
+multiclass VMAD<string asmstr, NVPTXRegClass regclassv4,
+  NVPTXRegClass regclassv2,
+                SDNode an=add, SDNode mn=mul, NVPTXInst sop=NOP,
+                Predicate Pred> {
+  def V4 : NVPTXVecInst<(outs regclassv4:$dst),
+    (ins regclassv4:$a, regclassv4:$b, regclassv4:$c),
+                      V4MADStr<asmstr>.s,
+                      [(set regclassv4:$dst,
+                        (an (mn regclassv4:$a, regclassv4:$b), regclassv4:$c))],
+                      sop>,
+           Requires<[Pred]>;
+  def V2 : NVPTXVecInst<(outs regclassv2:$dst),
+    (ins regclassv2:$a, regclassv2:$b, regclassv2:$c),
+                      V2MADStr<asmstr>.s,
+                      [(set regclassv2:$dst,
+                        (an (mn regclassv2:$a, regclassv2:$b), regclassv2:$c))],
+                      sop>,
+           Requires<[Pred]>;
+}
+
+multiclass VMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+  Predicate Pred> {
+  def V2 : NVPTXVecInst<(outs regclass:$dst),
+    (ins regclass:$a, regclass:$b, regclass:$c),
+                      V2MADStr<asmstr>.s,
+                      [(set regclass:$dst, (add
+                        (mul regclass:$a, regclass:$b), regclass:$c))], sop>,
+           Requires<[Pred]>;
+}
+multiclass VFMADV2Only<string asmstr, NVPTXRegClass regclass, NVPTXInst sop=NOP,
+  Predicate Pred> {
+  def V2 : NVPTXVecInst<(outs regclass:$dst),
+    (ins regclass:$a, regclass:$b, regclass:$c),
+                      V2MADStr<asmstr>.s,
+                      [(set regclass:$dst, (fadd
+                        (fmul regclass:$a, regclass:$b), regclass:$c))], sop>,
+           Requires<[Pred]>;
+}
+
+let VecInstType=isVecOther.Value in {
+defm I8MAD  : VMAD<"mad.lo.s16", V4I8Regs, V2I8Regs, add, mul, MAD8rrr, true>;
+defm I16MAD : VMAD<"mad.lo.s16", V4I16Regs, V2I16Regs, add, mul, MAD16rrr,
+  true>;
+defm I32MAD : VMAD<"mad.lo.s32", V4I32Regs, V2I32Regs, add, mul, MAD32rrr,
+  true>;
+defm I64MAD : VMADV2Only<"mad.lo.s64", V2I64Regs, MAD64rrr, true>;
+
+defm VNeg : IntUnaryVOp<"neg.s", ineg, INEG64, INEG32, INEG16, INEG8>;
+
+defm VAddf : FloatBinVOp<"add.", fadd, FADDf64rr, FADDf32rr, FADDf32rr_ftz>;
+defm VSubf : FloatBinVOp<"sub.", fsub, FSUBf64rr, FSUBf32rr, FSUBf32rr_ftz>;
+defm VMulf : FloatBinVOp<"mul.", fmul, FMULf64rr, FMULf32rr, FMULf32rr_ftz>;
+
+defm F32MAD_ftz : VMAD<"mad.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+  FMAD32_ftzrrr, doFMADF32_ftz>;
+defm F32FMA_ftz : VMAD<"fma.rn.ftz.f32", V4F32Regs, V2F32Regs, fadd, fmul,
+  FMA32_ftzrrr, doFMAF32_ftz>;
+defm F32MAD : VMAD<"mad.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMAD32rrr,
+  doFMADF32>;
+defm F32FMA : VMAD<"fma.rn.f32", V4F32Regs, V2F32Regs, fadd, fmul, FMA32rrr,
+  doFMAF32>;
+defm F64FMA : VFMADV2Only<"fma.rn.f64", V2F64Regs, FMA64rrr, doFMAF64>;
+}
+
+let VecInstType=isVecOther.Value in {
+def V4F32Div_prec_ftz : VecBinaryOp<V4AsmStr<"div.rn.ftz.f32">, fdiv, V4F32Regs,
+  FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V2F32Div_prec_ftz : VecBinaryOp<V2AsmStr<"div.rn.ftz.f32">, fdiv, V2F32Regs,
+  FDIV32rr_prec_ftz>, Requires<[doF32FTZ, reqPTX20]>;
+def V4F32Div_prec : VecBinaryOp<V4AsmStr<"div.rn.f32">, fdiv, V4F32Regs,
+  FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_prec : VecBinaryOp<V2AsmStr<"div.rn.f32">, fdiv, V2F32Regs,
+  FDIV32rr_prec>, Requires<[reqPTX20]>;
+def V2F32Div_ftz : VecBinaryOp<V2AsmStr<"div.full.ftz.f32">, fdiv, V2F32Regs,
+  FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V4F32Div_ftz : VecBinaryOp<V4AsmStr<"div.full.ftz.f32">, fdiv, V4F32Regs,
+  FDIV32rr_ftz>, Requires<[doF32FTZ]>;
+def V2F32Div : VecBinaryOp<V2AsmStr<"div.full.f32">, fdiv, V2F32Regs, FDIV32rr>;
+def V4F32Div : VecBinaryOp<V4AsmStr<"div.full.f32">, fdiv, V4F32Regs, FDIV32rr>;
+def V2F64Div : VecBinaryOp<V2AsmStr<"div.rn.f64">, fdiv, V2F64Regs, FDIV64rr>;
+}
+
+def fnegpat : PatFrag<(ops node:$in), (fneg node:$in)>;
+
+let VecInstType=isVecOther.Value in {
+def VNegv2f32_ftz : VecUnaryOp<V2UnaryStr<"neg.ftz.f32">, fnegpat, V2F32Regs,
+  FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv4f32_ftz : VecUnaryOp<V4UnaryStr<"neg.ftz.f32">, fnegpat, V4F32Regs,
+  FNEGf32_ftz>, Requires<[doF32FTZ]>;
+def VNegv2f32 : VecUnaryOp<V2UnaryStr<"neg.f32">, fnegpat, V2F32Regs, FNEGf32>;
+def VNegv4f32 : VecUnaryOp<V4UnaryStr<"neg.f32">, fnegpat, V4F32Regs, FNEGf32>;
+def VNegv2f64 : VecUnaryOp<V2UnaryStr<"neg.f64">, fnegpat, V2F64Regs, FNEGf64>;
+
+// Logical Arithmetic
+defm VAnd : IntBinVOp<"and.b", and, ANDb64rr, ANDb32rr, ANDb16rr, ANDb8rr>;
+defm VOr  : IntBinVOp<"or.b", or, ORb64rr, ORb32rr, ORb16rr, ORb8rr>;
+defm VXor : IntBinVOp<"xor.b", xor, XORb64rr, XORb32rr, XORb16rr, XORb8rr>;
+
+defm VNot : IntUnaryVOp<"not.b", not, NOT64, NOT32, NOT16, NOT8>;
+}
+
+
+multiclass V2FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub V2F32Regs:$a, (fmul V2F32Regs:$b, V2F32Regs:$c)),
+          (Inst (VNegv2f32 V2F32Regs:$b), V2F32Regs:$c,  V2F32Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul V2F32Regs:$a, V2F32Regs:$b), V2F32Regs:$c),
+          (Inst V2F32Regs:$a, V2F32Regs:$b, (VNegv2f32 V2F32Regs:$c))>,
+          Requires<[Pred]>;
+}
+
+defm V2FMAF32ext_ftz  : V2FPCONTRACT32_SUB_PAT<F32FMA_ftzV2, doFMAF32AGG_ftz>;
+defm V2FMADF32ext_ftz : V2FPCONTRACT32_SUB_PAT<F32MAD_ftzV2, doFMADF32_ftz>;
+defm V2FMAF32ext  : V2FPCONTRACT32_SUB_PAT<F32FMAV2, doFMAF32AGG>;
+defm V2FMADF32ext : V2FPCONTRACT32_SUB_PAT<F32MADV2, doFMADF32>;
+
+multiclass V4FPCONTRACT32_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub V4F32Regs:$a, (fmul V4F32Regs:$b, V4F32Regs:$c)),
+          (Inst (VNegv4f32 V4F32Regs:$b), V4F32Regs:$c,  V4F32Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul V4F32Regs:$a, V4F32Regs:$b), V4F32Regs:$c),
+          (Inst V4F32Regs:$a, V4F32Regs:$b, (VNegv4f32 V4F32Regs:$c))>,
+          Requires<[Pred]>;
+}
+
+defm V4FMAF32ext_ftz  : V4FPCONTRACT32_SUB_PAT<F32FMA_ftzV4, doFMAF32AGG_ftz>;
+defm V4FMADF32ext_ftz : V4FPCONTRACT32_SUB_PAT<F32MAD_ftzV4, doFMADF32_ftz>;
+defm V4FMAF32ext  : V4FPCONTRACT32_SUB_PAT<F32FMAV4, doFMAF32AGG>;
+defm V4FMADF32ext : V4FPCONTRACT32_SUB_PAT<F32MADV4, doFMADF32>;
+
+multiclass V2FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
+  def : Pat<(fsub V2F64Regs:$a, (fmul V2F64Regs:$b, V2F64Regs:$c)),
+          (Inst (VNegv2f64 V2F64Regs:$b), V2F64Regs:$c, V2F64Regs:$a)>,
+          Requires<[Pred]>;
+
+  def : Pat<(fsub (fmul V2F64Regs:$a, V2F64Regs:$b), V2F64Regs:$c),
+          (Inst V2F64Regs:$a, V2F64Regs:$b, (VNegv2f64 V2F64Regs:$c))>,
+          Requires<[Pred]>;
+}
+
+defm V2FMAF64ext : V2FPCONTRACT64_SUB_PAT<F64FMAV2, doFMAF64AGG>;
+
+class VecModStr<string vecsize, string elem, string extra, string l="">
+{
+  string t1 = !strconcat("${c", elem);
+  string t2 = !strconcat(t1, ":vecv");
+  string t3 = !strconcat(t2, vecsize);
+  string t4 = !strconcat(t3, extra);
+  string t5 = !strconcat(t4, l);
+  string s =  !strconcat(t5, "}");
+}
+class ShuffleOneLine<string vecsize, string elem, string type>
+{
+  string t1 = VecModStr<vecsize, elem, "comm", "1">.s;
+  string t2 = !strconcat(t1, "mov.");
+  string t3 = !strconcat(t2, type);
+  string t4 = !strconcat(t3, " \t${dst}_");
+  string t5 = !strconcat(t4, elem);
+  string t6 = !strconcat(t5, ", $src1");
+  string t7 = !strconcat(t6, VecModStr<vecsize, elem, "pos">.s);
+  string t8 = !strconcat(t7, ";\n\t");
+  string t9 = !strconcat(t8, VecModStr<vecsize, elem, "comm", "2">.s);
+  string t10 = !strconcat(t9, "mov.");
+  string t11 = !strconcat(t10, type);
+  string t12 = !strconcat(t11, " \t${dst}_");
+  string t13 = !strconcat(t12, elem);
+  string t14 = !strconcat(t13, ", $src2");
+  string t15 = !strconcat(t14, VecModStr<vecsize, elem, "pos">.s);
+  string s =   !strconcat(t15, ";");
+}
+class ShuffleAsmStr2<string type>
+{
+  string t1 = ShuffleOneLine<"2", "0", type>.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string s  = !strconcat(t2, ShuffleOneLine<"2", "1", type>.s);
+}
+class ShuffleAsmStr4<string type>
+{
+  string t1 = ShuffleOneLine<"4", "0", type>.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string t3 = !strconcat(t2, ShuffleOneLine<"4", "1", type>.s);
+  string t4 = !strconcat(t3, "\n\t");
+  string t5 = !strconcat(t4, ShuffleOneLine<"4", "2", type>.s);
+  string t6 = !strconcat(t5, "\n\t");
+  string s  = !strconcat(t6, ShuffleOneLine<"4", "3", type>.s);
+}
+
+let neverHasSideEffects=1, VecInstType=isVecShuffle.Value in {
+def VecShuffle_v4f32 : NVPTXVecInst<(outs V4F32Regs:$dst),
+                       (ins  V4F32Regs:$src1, V4F32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"f32">.s),
+                       [], FMOV32rr>;
+
+def VecShuffle_v4i32 : NVPTXVecInst<(outs V4I32Regs:$dst),
+                       (ins  V4I32Regs:$src1, V4I32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"u32">.s),
+                       [], IMOV32rr>;
+
+def VecShuffle_v4i16 : NVPTXVecInst<(outs V4I16Regs:$dst),
+                       (ins  V4I16Regs:$src1, V4I16Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"u16">.s),
+                       [], IMOV16rr>;
+
+def VecShuffle_v4i8 : NVPTXVecInst<(outs V4I8Regs:$dst),
+                       (ins  V4I8Regs:$src1, V4I8Regs:$src2,
+                             i8imm:$c0, i8imm:$c1, i8imm:$c2, i8imm:$c3),
+                 !strconcat("//Mov $dst, $src1, $src2, $c0, $c1, $c2, $c3;\n\t",
+                                 ShuffleAsmStr4<"u16">.s),
+                       [], IMOV8rr>;
+
+def VecShuffle_v2f32 : NVPTXVecInst<(outs V2F32Regs:$dst),
+                       (ins  V2F32Regs:$src1, V2F32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"f32">.s),
+                       [], FMOV32rr>;
+
+def VecShuffle_v2i32 : NVPTXVecInst<(outs V2I32Regs:$dst),
+                       (ins  V2I32Regs:$src1, V2I32Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u32">.s),
+                       [], IMOV32rr>;
+
+def VecShuffle_v2i8 : NVPTXVecInst<(outs V2I8Regs:$dst),
+                       (ins  V2I8Regs:$src1, V2I8Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u16">.s),
+                       [], IMOV8rr>;
+
+def VecShuffle_v2i16 : NVPTXVecInst<(outs V2I16Regs:$dst),
+                       (ins  V2I16Regs:$src1, V2I16Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u16">.s),
+                       [], IMOV16rr>;
+
+def VecShuffle_v2f64 : NVPTXVecInst<(outs V2F64Regs:$dst),
+                       (ins  V2F64Regs:$src1, V2F64Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"f64">.s),
+                       [], FMOV64rr>;
+
+def VecShuffle_v2i64 : NVPTXVecInst<(outs V2I64Regs:$dst),
+                       (ins  V2I64Regs:$src1, V2I64Regs:$src2,
+                             i8imm:$c0, i8imm:$c1),
+                       !strconcat("//Mov $dst, $src1, $src2, $c0, $c1;\n\t",
+                                 ShuffleAsmStr2<"u64">.s),
+                       [], IMOV64rr>;
+}
+
+def ShuffleMask0 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(0), MVT::i32);
+}]>;
+def ShuffleMask1 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(1), MVT::i32);
+}]>;
+def ShuffleMask2 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(2), MVT::i32);
+}]>;
+def ShuffleMask3 : SDNodeXForm<vector_shuffle, [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return CurDAG->getTargetConstant(SVOp->getMaskElt(3), MVT::i32);
+}]>;
+
+// The spurious call is here to silence a compiler warning about N being
+// unused.
+def vec_shuf : PatFrag<(ops node:$lhs, node:$rhs),
+                       (vector_shuffle node:$lhs, node:$rhs),
+                       [{ N->getGluedNode(); return true; }]>;
+
+def : Pat<(v2f64 (vec_shuf:$op V2F64Regs:$src1, V2F64Regs:$src2)),
+          (VecShuffle_v2f64 V2F64Regs:$src1, V2F64Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4f32 (vec_shuf:$op V4F32Regs:$src1, V4F32Regs:$src2)),
+          (VecShuffle_v4f32 V4F32Regs:$src1, V4F32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2f32 (vec_shuf:$op V2F32Regs:$src1, V2F32Regs:$src2)),
+          (VecShuffle_v2f32 V2F32Regs:$src1, V2F32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v2i64 (vec_shuf:$op V2I64Regs:$src1, V2I64Regs:$src2)),
+          (VecShuffle_v2i64 V2I64Regs:$src1, V2I64Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i32 (vec_shuf:$op V4I32Regs:$src1, V4I32Regs:$src2)),
+          (VecShuffle_v4i32 V4I32Regs:$src1, V4I32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i32 (vec_shuf:$op V2I32Regs:$src1, V2I32Regs:$src2)),
+          (VecShuffle_v2i32 V2I32Regs:$src1, V2I32Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i16 (vec_shuf:$op V4I16Regs:$src1, V4I16Regs:$src2)),
+          (VecShuffle_v4i16 V4I16Regs:$src1, V4I16Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i16 (vec_shuf:$op V2I16Regs:$src1, V2I16Regs:$src2)),
+          (VecShuffle_v2i16 V2I16Regs:$src1, V2I16Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+def : Pat<(v4i8 (vec_shuf:$op V4I8Regs:$src1, V4I8Regs:$src2)),
+          (VecShuffle_v4i8 V4I8Regs:$src1, V4I8Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op),
+                            (ShuffleMask2 node:$op), (ShuffleMask3 node:$op))>;
+
+def : Pat<(v2i8 (vec_shuf:$op V2I8Regs:$src1, V2I8Regs:$src2)),
+          (VecShuffle_v2i8 V2I8Regs:$src1, V2I8Regs:$src2,
+                            (ShuffleMask0 node:$op), (ShuffleMask1 node:$op))>;
+
+class Build_Vector2<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+  NVPTXInst si>
+                   : NVPTXVecInst<(outs vclass:$dst),
+                   (ins  sclass:$a1, sclass:$a2),
+                   !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2}};"),
+                   [(set vclass:$dst, (build_vector sclass:$a1, sclass:$a2))],
+                   si>;
+class Build_Vector4<string asmstr, NVPTXRegClass vclass, NVPTXRegClass sclass,
+  NVPTXInst si>
+                   : NVPTXVecInst<(outs vclass:$dst),
+                   (ins  sclass:$a1, sclass:$a2, sclass:$a3, sclass:$a4),
+               !strconcat(asmstr, "\t${dst:vecfull}, {{$a1, $a2, $a3, $a4}};"),
+                   [(set vclass:$dst,
+                     (build_vector sclass:$a1, sclass:$a2,
+                       sclass:$a3, sclass:$a4))], si>;
+
+let isAsCheapAsAMove=1, VecInstType=isVecBuild.Value in {
+def Build_Vector2_f32 : Build_Vector2<"mov.v2.f32", V2F32Regs, Float32Regs,
+  FMOV32rr>;
+def Build_Vector2_f64 : Build_Vector2<"mov.v2.f64", V2F64Regs, Float64Regs,
+  FMOV64rr>;
+
+def Build_Vector2_i32 : Build_Vector2<"mov.v2.u32", V2I32Regs, Int32Regs,
+  IMOV32rr>;
+def Build_Vector2_i64 : Build_Vector2<"mov.v2.u64", V2I64Regs, Int64Regs,
+  IMOV64rr>;
+def Build_Vector2_i16 : Build_Vector2<"mov.v2.u16", V2I16Regs, Int16Regs,
+  IMOV16rr>;
+def Build_Vector2_i8  : Build_Vector2<"mov.v2.u16",  V2I8Regs,  Int8Regs,
+  IMOV8rr>;
+
+def Build_Vector4_f32 : Build_Vector4<"mov.v4.f32", V4F32Regs, Float32Regs,
+  FMOV32rr>;
+
+def Build_Vector4_i32 : Build_Vector4<"mov.v4.u32", V4I32Regs, Int32Regs,
+  IMOV32rr>;
+def Build_Vector4_i16 : Build_Vector4<"mov.v4.u16", V4I16Regs, Int16Regs,
+  IMOV16rr>;
+def Build_Vector4_i8  : Build_Vector4<"mov.v4.u16", V4I8Regs, Int8Regs,
+  IMOV8rr>;
+}
+
+class Vec_Move<string asmstr, NVPTXRegClass vclass, NVPTXInst sop=NOP>
+                 : NVPTXVecInst<(outs vclass:$dst), (ins vclass:$src),
+                   !strconcat(asmstr, "\t${dst:vecfull}, ${src:vecfull};"),
+                   [], sop>;
+
+let isAsCheapAsAMove=1, neverHasSideEffects=1, IsSimpleMove=1,
+  VecInstType=isVecOther.Value in {
+def V4f32Mov : Vec_Move<"mov.v4.f32", V4F32Regs, FMOV32rr>;
+def V2f32Mov : Vec_Move<"mov.v2.f32", V2F32Regs, FMOV32rr>;
+
+def V4i32Mov : Vec_Move<"mov.v4.u32", V4I32Regs, IMOV32rr>;
+def V2i32Mov : Vec_Move<"mov.v2.u32", V2I32Regs, IMOV32rr>;
+
+def V4i16Mov : Vec_Move<"mov.v4.u16", V4I16Regs, IMOV16rr>;
+def V2i16Mov : Vec_Move<"mov.v2.u16", V2I16Regs, IMOV16rr>;
+
+def V4i8Mov : Vec_Move<"mov.v4.u16", V4I8Regs, IMOV8rr>;
+def V2i8Mov : Vec_Move<"mov.v2.u16", V2I8Regs, IMOV8rr>;
+
+def V2f64Mov : Vec_Move<"mov.v2.f64", V2F64Regs, FMOV64rr>;
+def V2i64Mov : Vec_Move<"mov.v2.u64", V2I64Regs, IMOV64rr>;
+}
+
+// extract subvector patterns
+def extract_subvec : SDNode<"ISD::EXTRACT_SUBVECTOR",
+                        SDTypeProfile<1, 2, [SDTCisPtrTy<2>]>>;
+
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 0)),
+                 (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 0),
+                                    (V4f32Extract V4F32Regs:$src, 1))>;
+def : Pat<(v2f32 (extract_subvec V4F32Regs:$src, 2)),
+                 (Build_Vector2_f32 (V4f32Extract V4F32Regs:$src, 2),
+                                    (V4f32Extract V4F32Regs:$src, 3))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 0)),
+                 (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 0),
+                                    (V4i32Extract V4I32Regs:$src, 1))>;
+def : Pat<(v2i32 (extract_subvec V4I32Regs:$src, 2)),
+                 (Build_Vector2_i32 (V4i32Extract V4I32Regs:$src, 2),
+                                    (V4i32Extract V4I32Regs:$src, 3))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 0)),
+                 (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 0),
+                                    (V4i16Extract V4I16Regs:$src, 1))>;
+def : Pat<(v2i16 (extract_subvec V4I16Regs:$src, 2)),
+                 (Build_Vector2_i16 (V4i16Extract V4I16Regs:$src, 2),
+                                    (V4i16Extract V4I16Regs:$src, 3))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 0)),
+                 (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 0),
+                                    (V4i8Extract V4I8Regs:$src, 1))>;
+def : Pat<(v2i8 (extract_subvec V4I8Regs:$src, 2)),
+                 (Build_Vector2_i8 (V4i8Extract V4I8Regs:$src, 2),
+                                    (V4i8Extract V4I8Regs:$src, 3))>;
+
+// Select instructions
+class Select_OneLine<string type, string pos> {
+  string t1 = !strconcat("selp.", type);
+  string t2 = !strconcat(t1, " \t${dst}_");
+  string t3 = !strconcat(t2, pos);
+  string t4 = !strconcat(t3, ", ${src1}_");
+  string t5 = !strconcat(t4, pos);
+  string t6 = !strconcat(t5, ", ${src2}_");
+  string t7 = !strconcat(t6, pos);
+  string s  = !strconcat(t7, ", $p;");
+}
+
+class Select_Str2<string type> {
+  string t1 = Select_OneLine<type, "0">.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string s  = !strconcat(t2, Select_OneLine<type, "1">.s);
+}
+
+class Select_Str4<string type> {
+  string t1 = Select_OneLine<type, "0">.s;
+  string t2 = !strconcat(t1, "\n\t");
+  string t3 = !strconcat(t2, Select_OneLine<type, "1">.s);
+  string t4 = !strconcat(t3, "\n\t");
+  string t5 = !strconcat(t4, Select_OneLine<type, "2">.s);
+  string t6 = !strconcat(t5, "\n\t");
+  string s  = !strconcat(t6, Select_OneLine<type, "3">.s);
+
+}
+
+class Vec_Select<NVPTXRegClass vclass, string asmstr, NVPTXInst sop>
+      : NVPTXVecInst<(outs vclass:$dst),
+                     (ins  vclass:$src1, vclass:$src2, Int1Regs:$p),
+                     asmstr,
+                     [(set vclass:$dst, (select Int1Regs:$p, vclass:$src1,
+                       vclass:$src2))],
+                     sop>;
+
+let VecInstType=isVecOther.Value in {
+def V2I64_Select : Vec_Select<V2I64Regs, Select_Str2<"b64">.s, SELECTi64rr>;
+def V4I32_Select : Vec_Select<V4I32Regs, Select_Str4<"b32">.s, SELECTi32rr>;
+def V2I32_Select : Vec_Select<V2I32Regs, Select_Str2<"b32">.s, SELECTi32rr>;
+def V4I16_Select : Vec_Select<V4I16Regs, Select_Str4<"b16">.s, SELECTi16rr>;
+def V2I16_Select : Vec_Select<V2I16Regs, Select_Str2<"b16">.s, SELECTi16rr>;
+def V4I8_Select  : Vec_Select<V4I8Regs,  Select_Str4<"b16">.s, SELECTi8rr>;
+def V2I8_Select  : Vec_Select<V2I8Regs,  Select_Str2<"b16">.s, SELECTi8rr>;
+
+def V2F64_Select : Vec_Select<V2F64Regs, Select_Str2<"f64">.s, SELECTf64rr>;
+def V4F32_Select : Vec_Select<V4F32Regs, Select_Str4<"f32">.s, SELECTf32rr>;
+def V2F32_Select : Vec_Select<V2F32Regs, Select_Str2<"f32">.s, SELECTf32rr>;
+}
+
+// Comparison instructions
+
+// setcc convenience fragments.
+def vsetoeq : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOEQ)>;
+def vsetogt : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOGT)>;
+def vsetoge : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOGE)>;
+def vsetolt : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOLT)>;
+def vsetole : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETOLE)>;
+def vsetone : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETONE)>;
+def vseto   : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETO)>;
+def vsetuo  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUO)>;
+def vsetueq : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUEQ)>;
+def vsetugt : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUGT)>;
+def vsetuge : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUGE)>;
+def vsetult : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETULT)>;
+def vsetule : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETULE)>;
+def vsetune : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETUNE)>;
+def vseteq  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETEQ)>;
+def vsetgt  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETGT)>;
+def vsetge  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETGE)>;
+def vsetlt  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETLT)>;
+def vsetle  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETLE)>;
+def vsetne  : PatFrag<(ops node:$lhs, node:$rhs),
+                      (setcc node:$lhs, node:$rhs, SETNE)>;
+
+class Vec_Compare<PatFrag op, NVPTXRegClass outrclass, NVPTXRegClass inrclass,
+  NVPTXInst sop>
+    : NVPTXVecInst<(outs outrclass:$dst),
+                   (ins  inrclass:$a, inrclass:$b),
+                   "Unsupported",
+                   [(set outrclass:$dst, (op inrclass:$a, inrclass:$b))],
+                   sop>;
+
+multiclass Vec_Compare_All<PatFrag op,
+                           NVPTXInst inst8,
+                           NVPTXInst inst16,
+                           NVPTXInst inst32,
+                           NVPTXInst inst64>
+{
+  def  V2I8 : Vec_Compare<op, V2I8Regs,  V2I8Regs,  inst8>;
+  def  V4I8 : Vec_Compare<op, V4I8Regs,  V4I8Regs,  inst8>;
+  def V2I16 : Vec_Compare<op, V2I16Regs, V2I16Regs, inst16>;
+  def V4I16 : Vec_Compare<op, V4I16Regs, V4I16Regs, inst16>;
+  def V2I32 : Vec_Compare<op, V2I32Regs, V2I32Regs, inst32>;
+  def V4I32 : Vec_Compare<op, V4I32Regs, V4I32Regs, inst32>;
+  def V2I64 : Vec_Compare<op, V2I64Regs, V2I64Regs, inst64>;
+}
+
+let VecInstType=isVecOther.Value in {
+  defm VecSGT : Vec_Compare_All<vsetgt,  ISetSGTi8rr_toi8, ISetSGTi16rr_toi16,
+    ISetSGTi32rr_toi32, ISetSGTi64rr_toi64>;
+  defm VecUGT : Vec_Compare_All<vsetugt, ISetUGTi8rr_toi8, ISetUGTi16rr_toi16,
+    ISetUGTi32rr_toi32, ISetUGTi64rr_toi64>;
+  defm VecSLT : Vec_Compare_All<vsetlt,  ISetSLTi8rr_toi8, ISetSLTi16rr_toi16,
+    ISetSLTi32rr_toi32, ISetSLTi64rr_toi64>;
+  defm VecULT : Vec_Compare_All<vsetult, ISetULTi8rr_toi8, ISetULTi16rr_toi16,
+    ISetULTi32rr_toi32, ISetULTi64rr_toi64>;
+  defm VecSGE : Vec_Compare_All<vsetge,  ISetSGEi8rr_toi8, ISetSGEi16rr_toi16,
+    ISetSGEi32rr_toi32, ISetSGEi64rr_toi64>;
+  defm VecUGE : Vec_Compare_All<vsetuge, ISetUGEi8rr_toi8, ISetUGEi16rr_toi16,
+    ISetUGEi32rr_toi32, ISetUGEi64rr_toi64>;
+  defm VecSLE : Vec_Compare_All<vsetle,  ISetSLEi8rr_toi8, ISetSLEi16rr_toi16,
+    ISetSLEi32rr_toi32, ISetSLEi64rr_toi64>;
+  defm VecULE : Vec_Compare_All<vsetule, ISetULEi8rr_toi8, ISetULEi16rr_toi16,
+    ISetULEi32rr_toi32, ISetULEi64rr_toi64>;
+  defm VecSEQ : Vec_Compare_All<vseteq,  ISetSEQi8rr_toi8, ISetSEQi16rr_toi16,
+    ISetSEQi32rr_toi32, ISetSEQi64rr_toi64>;
+  defm VecUEQ : Vec_Compare_All<vsetueq, ISetUEQi8rr_toi8, ISetUEQi16rr_toi16,
+    ISetUEQi32rr_toi32, ISetUEQi64rr_toi64>;
+  defm VecSNE : Vec_Compare_All<vsetne,  ISetSNEi8rr_toi8, ISetSNEi16rr_toi16,
+    ISetSNEi32rr_toi32, ISetSNEi64rr_toi64>;
+  defm VecUNE : Vec_Compare_All<vsetune, ISetUNEi8rr_toi8, ISetUNEi16rr_toi16,
+    ISetUNEi32rr_toi32, ISetUNEi64rr_toi64>;
+}
+
+multiclass FVec_Compare_All<PatFrag op,
+                            NVPTXInst instf32,
+                            NVPTXInst instf64>
+{
+  def V2F32 : Vec_Compare<op, V2I32Regs, V2F32Regs, instf32>;
+  def V4F32 : Vec_Compare<op, V4I32Regs, V4F32Regs, instf32>;
+  def V2F64 : Vec_Compare<op, V2I64Regs, V2F64Regs, instf64>;
+}
+
+let VecInstType=isVecOther.Value in {
+  defm FVecGT :  FVec_Compare_All<vsetogt, FSetGTf32rr_toi32,
+    FSetGTf64rr_toi64>;
+  defm FVecLT :  FVec_Compare_All<vsetolt, FSetLTf32rr_toi32,
+    FSetLTf64rr_toi64>;
+  defm FVecGE :  FVec_Compare_All<vsetoge, FSetGEf32rr_toi32,
+    FSetGEf64rr_toi64>;
+  defm FVecLE :  FVec_Compare_All<vsetole, FSetLEf32rr_toi32,
+    FSetLEf64rr_toi64>;
+  defm FVecEQ :  FVec_Compare_All<vsetoeq, FSetEQf32rr_toi32,
+    FSetEQf64rr_toi64>;
+  defm FVecNE :  FVec_Compare_All<vsetone, FSetNEf32rr_toi32,
+    FSetNEf64rr_toi64>;
+
+  defm FVecUGT :  FVec_Compare_All<vsetugt, FSetUGTf32rr_toi32,
+    FSetUGTf64rr_toi64>;
+  defm FVecULT :  FVec_Compare_All<vsetult, FSetULTf32rr_toi32,
+    FSetULTf64rr_toi64>;
+  defm FVecUGE :  FVec_Compare_All<vsetuge, FSetUGEf32rr_toi32,
+    FSetUGEf64rr_toi64>;
+  defm FVecULE :  FVec_Compare_All<vsetule, FSetULEf32rr_toi32,
+    FSetULEf64rr_toi64>;
+  defm FVecUEQ :  FVec_Compare_All<vsetueq, FSetUEQf32rr_toi32,
+    FSetUEQf64rr_toi64>;
+  defm FVecUNE :  FVec_Compare_All<vsetune, FSetUNEf32rr_toi32,
+    FSetUNEf64rr_toi64>;
+
+  defm FVecNUM :  FVec_Compare_All<vseto,  FSetNUMf32rr_toi32,
+    FSetNUMf64rr_toi64>;
+  defm FVecNAN :  FVec_Compare_All<vsetuo, FSetNANf32rr_toi32,
+    FSetNANf64rr_toi64>;
+}
+
+class LoadParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$d1, regclass:$d2, regclass:$d3, regclass:$d4),
+                (ins i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("ld.param", opstr),
+                  "\t{{$d1, $d2, $d3, $d4}}, [retval0+$b];"), []>;
+
+class LoadParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$d1, regclass:$d2),
+                (ins i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("ld.param", opstr),
+                  "\t{{$d1, $d2}}, [retval0+$b];"), []>;
+
+
+class StoreParamScalar4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+                  i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[param$a+$b], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreParamScalar2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[param$a+$b], {{$s1, $s2}};"), []>;
+
+class StoreRetvalScalar4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, regclass:$s3, regclass:$s4,
+                  i32imm:$a),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[func_retval+$a], {{$s1, $s2, $s3, $s4}};"), []>;
+
+class StoreRetvalScalar2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$s1, regclass:$s2, i32imm:$a),
+                !strconcat(!strconcat("st.param", opstr),
+                  "\t[func_retval+$a], {{$s1, $s2}};"), []>;
+
+def LoadParamScalar4I32 : LoadParamScalar4Inst<Int32Regs, ".v4.b32">;
+def LoadParamScalar4I16 : LoadParamScalar4Inst<Int16Regs, ".v4.b16">;
+def LoadParamScalar4I8  : LoadParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def LoadParamScalar2I64 : LoadParamScalar2Inst<Int32Regs, ".v2.b64">;
+def LoadParamScalar2I32 : LoadParamScalar2Inst<Int32Regs, ".v2.b32">;
+def LoadParamScalar2I16 : LoadParamScalar2Inst<Int32Regs, ".v2.b16">;
+def LoadParamScalar2I8  : LoadParamScalar2Inst<Int32Regs, ".v2.b8">;
+
+def LoadParamScalar4F32 : LoadParamScalar4Inst<Float32Regs, ".v4.f32">;
+def LoadParamScalar2F32 : LoadParamScalar2Inst<Float32Regs, ".v2.f32">;
+def LoadParamScalar2F64 : LoadParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreParamScalar4I32 : StoreParamScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreParamScalar4I16 : StoreParamScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreParamScalar4I8  : StoreParamScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreParamScalar2I64 : StoreParamScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreParamScalar2I32 : StoreParamScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreParamScalar2I16 : StoreParamScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreParamScalar2I8  : StoreParamScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreParamScalar4F32 : StoreParamScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreParamScalar2F32 : StoreParamScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreParamScalar2F64 : StoreParamScalar2Inst<Float64Regs, ".v2.f64">;
+
+def StoreRetvalScalar4I32 : StoreRetvalScalar4Inst<Int32Regs, ".v4.b32">;
+def StoreRetvalScalar4I16 : StoreRetvalScalar4Inst<Int16Regs, ".v4.b16">;
+def StoreRetvalScalar4I8  : StoreRetvalScalar4Inst<Int8Regs, ".v4.b8">;
+
+def StoreRetvalScalar2I64 : StoreRetvalScalar2Inst<Int64Regs, ".v2.b64">;
+def StoreRetvalScalar2I32 : StoreRetvalScalar2Inst<Int32Regs, ".v2.b32">;
+def StoreRetvalScalar2I16 : StoreRetvalScalar2Inst<Int16Regs, ".v2.b16">;
+def StoreRetvalScalar2I8  : StoreRetvalScalar2Inst<Int8Regs, ".v2.b8">;
+
+def StoreRetvalScalar4F32 : StoreRetvalScalar4Inst<Float32Regs, ".v4.f32">;
+def StoreRetvalScalar2F32 : StoreRetvalScalar2Inst<Float32Regs, ".v2.f32">;
+def StoreRetvalScalar2F64 : StoreRetvalScalar2Inst<Float64Regs, ".v2.f64">;
+
+class LoadParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>:
+      NVPTXVecInst<(outs regclass:$dst), (ins i32imm:$a, i32imm:$b),
+                "loadparam : $dst <- [$a, $b]",
+                [(set regclass:$dst, (LoadParam (i32 imm:$a), (i32 imm:$b)))],
+                sop>;
+
+class StoreParamVecInst<NVPTXRegClass regclass, string opstr, NVPTXInst sop=NOP>
+      : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+                "storeparam : [$a, $b] <- $val",
+                [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)], sop>;
+
+class StoreRetvalVecInst<NVPTXRegClass regclass, string opstr,
+  NVPTXInst sop=NOP>
+      : NVPTXVecInst<(outs), (ins regclass:$val, i32imm:$a),
+                "storeretval : retval[$a] <- $val",
+                [(StoreRetval (i32 imm:$a), regclass:$val)], sop>;
+
+let VecInstType=isVecLD.Value in {
+def LoadParamV4I32  : LoadParamVecInst<V4I32Regs, ".v4.b32",
+  LoadParamScalar4I32>;
+def LoadParamV4I16  : LoadParamVecInst<V4I16Regs, ".v4.b16",
+  LoadParamScalar4I16>;
+def LoadParamV4I8   : LoadParamVecInst<V4I8Regs, ".v4.b8",
+  LoadParamScalar4I8>;
+
+def LoadParamV2I64  : LoadParamVecInst<V2I64Regs, ".v2.b64",
+  LoadParamScalar2I64>;
+def LoadParamV2I32  : LoadParamVecInst<V2I32Regs, ".v2.b32",
+  LoadParamScalar2I32>;
+def LoadParamV2I16  : LoadParamVecInst<V2I16Regs, ".v2.b16",
+  LoadParamScalar2I16>;
+def LoadParamV2I8   : LoadParamVecInst<V2I8Regs, ".v2.b8",
+  LoadParamScalar2I8>;
+
+def LoadParamV4F32  : LoadParamVecInst<V4F32Regs, ".v4.f32",
+  LoadParamScalar4F32>;
+def LoadParamV2F32  : LoadParamVecInst<V2F32Regs, ".v2.f32",
+  LoadParamScalar2F32>;
+def LoadParamV2F64  : LoadParamVecInst<V2F64Regs, ".v2.f64",
+  LoadParamScalar2F64>;
+}
+
+let VecInstType=isVecST.Value in {
+def StoreParamV4I32  : StoreParamVecInst<V4I32Regs, ".v4.b32",
+  StoreParamScalar4I32>;
+def StoreParamV4I16  : StoreParamVecInst<V4I16Regs, ".v4.b16",
+  StoreParamScalar4I16>;
+def StoreParamV4I8   : StoreParamVecInst<V4I8Regs, ".v4.b8",
+  StoreParamScalar4I8>;
+
+def StoreParamV2I64  : StoreParamVecInst<V2I64Regs, ".v2.b64",
+  StoreParamScalar2I64>;
+def StoreParamV2I32  : StoreParamVecInst<V2I32Regs, ".v2.b32",
+  StoreParamScalar2I32>;
+def StoreParamV2I16  : StoreParamVecInst<V2I16Regs, ".v2.b16",
+  StoreParamScalar2I16>;
+def StoreParamV2I8   : StoreParamVecInst<V2I8Regs, ".v2.b8",
+  StoreParamScalar2I8>;
+
+def StoreParamV4F32  : StoreParamVecInst<V4F32Regs, ".v4.f32",
+  StoreParamScalar4F32>;
+def StoreParamV2F32  : StoreParamVecInst<V2F32Regs, ".v2.f32",
+  StoreParamScalar2F32>;
+def StoreParamV2F64  : StoreParamVecInst<V2F64Regs, ".v2.f64",
+  StoreParamScalar2F64>;
+
+def StoreRetvalV4I32  : StoreRetvalVecInst<V4I32Regs, ".v4.b32",
+  StoreRetvalScalar4I32>;
+def StoreRetvalV4I16  : StoreRetvalVecInst<V4I16Regs, ".v4.b16",
+  StoreRetvalScalar4I16>;
+def StoreRetvalV4I8   : StoreRetvalVecInst<V4I8Regs,  ".v4.b8",
+  StoreRetvalScalar4I8>;
+
+def StoreRetvalV2I64  : StoreRetvalVecInst<V2I64Regs, ".v2.b64",
+  StoreRetvalScalar2I64>;
+def StoreRetvalV2I32  : StoreRetvalVecInst<V2I32Regs, ".v2.b32",
+  StoreRetvalScalar2I32>;
+def StoreRetvalV2I16  : StoreRetvalVecInst<V2I16Regs, ".v2.b16",
+  StoreRetvalScalar2I16>;
+def StoreRetvalV2I8   : StoreRetvalVecInst<V2I8Regs,  ".v2.b8",
+  StoreRetvalScalar2I8>;
+
+def StoreRetvalV4F32  : StoreRetvalVecInst<V4F32Regs, ".v4.f32",
+  StoreRetvalScalar4F32>;
+def StoreRetvalV2F32  : StoreRetvalVecInst<V2F32Regs, ".v2.f32",
+  StoreRetvalScalar2F32>;
+def StoreRetvalV2F64  : StoreRetvalVecInst<V2F64Regs, ".v2.f64",
+  StoreRetvalScalar2F64>;
+
+}
+
+
+// Int vector to int scalar bit convert
+// v4i8 -> i32
+def : Pat<(i32 (bitconvert V4I8Regs:$s)),
+          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+                     (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3))>;
+// v4i16 -> i64
+def : Pat<(i64 (bitconvert V4I16Regs:$s)),
+          (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+            (V4i16Extract V4I16Regs:$s,1),
+                     (V4i16Extract V4I16Regs:$s,2),
+                     (V4i16Extract V4I16Regs:$s,3))>;
+// v2i8 -> i16
+def : Pat<(i16 (bitconvert V2I8Regs:$s)),
+          (V2I8toI16 (V2i8Extract V2I8Regs:$s,0), (V2i8Extract V2I8Regs:$s,1))>;
+// v2i16 -> i32
+def : Pat<(i32 (bitconvert V2I16Regs:$s)),
+          (V2I16toI32 (V2i16Extract V2I16Regs:$s,0),
+            (V2i16Extract V2I16Regs:$s,1))>;
+// v2i32 -> i64
+def : Pat<(i64 (bitconvert V2I32Regs:$s)),
+          (V2I32toI64 (V2i32Extract V2I32Regs:$s,0),
+            (V2i32Extract V2I32Regs:$s,1))>;
+
+// Int scalar to int vector bit convert
+let VecInstType=isVecDest.Value in {
+// i32 -> v4i8
+def VecI32toV4I8 : NVPTXVecInst<(outs V4I8Regs:$d), (ins Int32Regs:$s),
+                                "Error!",
+                                [(set V4I8Regs:$d, (bitconvert Int32Regs:$s))],
+                                I32toV4I8>;
+// i64 -> v4i16
+def VecI64toV4I16 : NVPTXVecInst<(outs V4I16Regs:$d), (ins Int64Regs:$s),
+                                 "Error!",
+                                [(set V4I16Regs:$d, (bitconvert Int64Regs:$s))],
+                                 I64toV4I16>;
+// i16 -> v2i8
+def VecI16toV2I8 : NVPTXVecInst<(outs V2I8Regs:$d), (ins Int16Regs:$s),
+                                "Error!",
+                               [(set V2I8Regs:$d, (bitconvert Int16Regs:$s))],
+                                I16toV2I8>;
+// i32 -> v2i16
+def VecI32toV2I16 : NVPTXVecInst<(outs V2I16Regs:$d), (ins Int32Regs:$s),
+                                 "Error!",
+                                [(set V2I16Regs:$d, (bitconvert Int32Regs:$s))],
+                                 I32toV2I16>;
+// i64 -> v2i32
+def VecI64toV2I32 : NVPTXVecInst<(outs V2I32Regs:$d), (ins Int64Regs:$s),
+                                  "Error!",
+                                [(set V2I32Regs:$d, (bitconvert Int64Regs:$s))],
+                                  I64toV2I32>;
+}
+
+// Int vector to int vector bit convert
+// v4i8 -> v2i16
+def : Pat<(v2i16 (bitconvert V4I8Regs:$s)),
+          (VecI32toV2I16
+          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+                    (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> v2i32
+def : Pat<(v2i32 (bitconvert V4I16Regs:$s)),
+          (VecI64toV2I32
+       (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+                (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> v4i8
+def : Pat<(v4i8 (bitconvert V2I16Regs:$s)),
+          (VecI32toV4I8
+    (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> v4i16
+def : Pat<(v4i16 (bitconvert V2I32Regs:$s)),
+          (VecI64toV4I16
+    (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+// v2i64 -> v4i32
+def : Pat<(v4i32 (bitconvert V2I64Regs:$s)),
+          (Build_Vector4_i32
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 0),
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 0)), 1),
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 0),
+            (V2i32Extract (VecI64toV2I32 (V2i64Extract V2I64Regs:$s, 1)), 1))>;
+// v4i32 -> v2i64
+def : Pat<(v2i64 (bitconvert V4I32Regs:$s)),
+          (Build_Vector2_i64
+      (V2I32toI64 (V4i32Extract V4I32Regs:$s,0), (V4i32Extract V4I32Regs:$s,1)),
+    (V2I32toI64 (V4i32Extract V4I32Regs:$s,2), (V4i32Extract V4I32Regs:$s,3)))>;
+
+// Fp scalar to fp vector convert
+// f64 -> v2f32
+let VecInstType=isVecDest.Value in {
+def VecF64toV2F32 : NVPTXVecInst<(outs V2F32Regs:$d), (ins Float64Regs:$s),
+                                  "Error!",
+                              [(set V2F32Regs:$d, (bitconvert Float64Regs:$s))],
+                                  F64toV2F32>;
+}
+
+// Fp vector to fp scalar convert
+// v2f32 -> f64
+def : Pat<(f64 (bitconvert V2F32Regs:$s)),
+     (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1))>;
+
+// Fp scalar to int vector convert
+// f32 -> v4i8
+def : Pat<(v4i8 (bitconvert Float32Regs:$s)),
+          (VecI32toV4I8 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f32 -> v2i16
+def : Pat<(v2i16 (bitconvert Float32Regs:$s)),
+          (VecI32toV2I16 (BITCONVERT_32_F2I Float32Regs:$s))>;
+// f64 -> v4i16
+def : Pat<(v4i16 (bitconvert Float64Regs:$s)),
+          (VecI64toV4I16 (BITCONVERT_64_F2I Float64Regs:$s))>;
+// f64 -> v2i32
+def : Pat<(v2i32 (bitconvert Float64Regs:$s)),
+          (VecI64toV2I32 (BITCONVERT_64_F2I Float64Regs:$s))>;
+
+// Int vector to fp scalar convert
+// v4i8 -> f32
+def : Pat<(f32 (bitconvert V4I8Regs:$s)),
+          (BITCONVERT_32_I2F
+          (V4I8toI32 (V4i8Extract V4I8Regs:$s,0), (V4i8Extract V4I8Regs:$s,1),
+                    (V4i8Extract V4I8Regs:$s,2), (V4i8Extract V4I8Regs:$s,3)))>;
+// v4i16 -> f64
+def : Pat<(f64 (bitconvert V4I16Regs:$s)),
+          (BITCONVERT_64_I2F
+       (V4I16toI64 (V4i16Extract V4I16Regs:$s,0), (V4i16Extract V4I16Regs:$s,1),
+                (V4i16Extract V4I16Regs:$s,2), (V4i16Extract V4I16Regs:$s,3)))>;
+// v2i16 -> f32
+def : Pat<(f32 (bitconvert V2I16Regs:$s)),
+          (BITCONVERT_32_I2F
+    (V2I16toI32 (V2i16Extract V2I16Regs:$s,0), (V2i16Extract V2I16Regs:$s,1)))>;
+// v2i32 -> f64
+def : Pat<(f64 (bitconvert V2I32Regs:$s)),
+          (BITCONVERT_64_I2F
+    (V2I32toI64 (V2i32Extract V2I32Regs:$s,0), (V2i32Extract V2I32Regs:$s,1)))>;
+
+// Int scalar to fp vector convert
+// i64 -> v2f32
+def : Pat<(v2f32 (bitconvert Int64Regs:$s)),
+          (VecF64toV2F32 (BITCONVERT_64_I2F Int64Regs:$s))>;
+
+// Fp vector to int scalar convert
+// v2f32 -> i64
+def : Pat<(i64 (bitconvert V2F32Regs:$s)),
+          (BITCONVERT_64_F2I
+    (V2F32toF64 (V2f32Extract V2F32Regs:$s,0), (V2f32Extract V2F32Regs:$s,1)))>;
+
+// Int vector to fp vector convert
+// v2i64 -> v4f32
+def : Pat<(v4f32 (bitconvert V2I64Regs:$s)),
+          (Build_Vector4_f32
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 0)), 0)),
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 0)), 1)),
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 1)), 0)),
+            (BITCONVERT_32_I2F (V2i32Extract (VecI64toV2I32
+              (V2i64Extract V2I64Regs:$s, 1)), 1)))>;
+// v2i64 -> v2f64
+def : Pat<(v2f64 (bitconvert V2I64Regs:$s)),
+    (Build_Vector2_f64
+            (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,0)),
+            (BITCONVERT_64_I2F (V2i64Extract V2I64Regs:$s,1)))>;
+// v2i32 -> v2f32
+def : Pat<(v2f32 (bitconvert V2I32Regs:$s)),
+    (Build_Vector2_f32
+            (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,0)),
+            (BITCONVERT_32_I2F (V2i32Extract V2I32Regs:$s,1)))>;
+// v4i32 -> v2f64
+def : Pat<(v2f64 (bitconvert V4I32Regs:$s)),
+          (Build_Vector2_f64
+           (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,0),
+             (V4i32Extract V4I32Regs:$s,1))),
+           (BITCONVERT_64_I2F (V2I32toI64 (V4i32Extract V4I32Regs:$s,2),
+             (V4i32Extract V4I32Regs:$s,3))))>;
+// v4i32 -> v4f32
+def : Pat<(v4f32 (bitconvert V4I32Regs:$s)),
+    (Build_Vector4_f32
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,0)),
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,1)),
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,2)),
+            (BITCONVERT_32_I2F (V4i32Extract V4I32Regs:$s,3)))>;
+// v4i16 -> v2f32
+def : Pat<(v2f32 (bitconvert V4I16Regs:$s)),
+          (VecF64toV2F32 (BITCONVERT_64_I2F
+          (V4I16toI64 (V4i16Extract V4I16Regs:$s,0),
+            (V4i16Extract V4I16Regs:$s,1),
+                      (V4i16Extract V4I16Regs:$s,2),
+                      (V4i16Extract V4I16Regs:$s,3))))>;
+
+// Fp vector to int vector convert
+// v2i64 <- v4f32
+def : Pat<(v2i64 (bitconvert V4F32Regs:$s)),
+          (Build_Vector2_i64
+           (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,0),
+             (V4f32Extract V4F32Regs:$s,1))),
+           (BITCONVERT_64_F2I (V2F32toF64 (V4f32Extract V4F32Regs:$s,2),
+             (V4f32Extract V4F32Regs:$s,3))))>;
+// v2i64 <- v2f64
+def : Pat<(v2i64 (bitconvert V2F64Regs:$s)),
+    (Build_Vector2_i64
+            (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,0)),
+            (BITCONVERT_64_F2I (V2f64Extract V2F64Regs:$s,1)))>;
+// v2i32 <- v2f32
+def : Pat<(v2i32 (bitconvert V2F32Regs:$s)),
+    (Build_Vector2_i32
+            (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,0)),
+            (BITCONVERT_32_F2I (V2f32Extract V2F32Regs:$s,1)))>;
+// v4i32 <- v2f64
+def : Pat<(v4i32 (bitconvert V2F64Regs:$s)),
+          (Build_Vector4_i32
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 0)), 0)),
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 0)), 1)),
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 1)), 0)),
+            (BITCONVERT_32_F2I (V2f32Extract (VecF64toV2F32
+              (V2f64Extract V2F64Regs:$s, 1)), 1)))>;
+// v4i32 <- v4f32
+def : Pat<(v4i32 (bitconvert V4F32Regs:$s)),
+          (Build_Vector4_i32
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,0)),
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,1)),
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,2)),
+            (BITCONVERT_32_F2I (V4f32Extract V4F32Regs:$s,3)))>;
+// v4i16 <- v2f32
+def : Pat<(v4i16 (bitconvert V2F32Regs:$s)),
+          (VecI64toV4I16 (BITCONVERT_64_F2I
+          (V2F32toF64 (V2f32Extract V2F32Regs:$s,0),
+            (V2f32Extract V2F32Regs:$s,1))))>;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp
new file mode 100644
index 000000000000..5f074b33a2d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.cpp
@@ -0,0 +1,90 @@
+//===-- NVPTXutil.cpp - Functions exported to CodeGen --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions that can be used in CodeGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXutil.h"
+#include "NVPTX.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+bool isParamLoad(const MachineInstr *MI) {
+  if ((MI->getOpcode() != NVPTX::LD_i32_avar) &&
+      (MI->getOpcode() != NVPTX::LD_i64_avar))
+    return false;
+  if (MI->getOperand(2).isImm() == false)
+    return false;
+  if (MI->getOperand(2).getImm() != NVPTX::PTXLdStInstCode::PARAM)
+    return false;
+  return true;
+}
+
+#define DATA_MASK 0x7f
+#define DIGIT_WIDTH 7
+#define MORE_BYTES 0x80
+
+static int encode_leb128(uint64_t val, int *nbytes, char *space, int splen) {
+  char *a;
+  char *end = space + splen;
+
+  a = space;
+  do {
+    unsigned char uc;
+
+    if (a >= end)
+      return 1;
+    uc = val & DATA_MASK;
+    val >>= DIGIT_WIDTH;
+    if (val != 0)
+      uc |= MORE_BYTES;
+    *a = uc;
+    a++;
+  } while (val);
+  *nbytes = a - space;
+  return 0;
+}
+
+#undef DATA_MASK
+#undef DIGIT_WIDTH
+#undef MORE_BYTES
+
+uint64_t encode_leb128(const char *str) {
+  union {
+    uint64_t x;
+    char a[8];
+  } temp64;
+
+  temp64.x = 0;
+
+  for (unsigned i = 0, e = strlen(str); i != e; ++i)
+    temp64.a[i] = str[e - 1 - i];
+
+  char encoded[16];
+  int nbytes;
+
+  int retval = encode_leb128(temp64.x, &nbytes, encoded, 16);
+
+  (void) retval;
+  assert(retval == 0 && "Encoding to leb128 failed");
+
+  assert(nbytes <= 8 &&
+         "Cannot support register names with leb128 encoding > 8 bytes");
+
+  temp64.x = 0;
+  for (int i = 0; i < nbytes; ++i)
+    temp64.a[i] = encoded[i];
+
+  return temp64.x;
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXutil.h b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.h
new file mode 100644
index 000000000000..d1d117159486
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXutil.h
@@ -0,0 +1,25 @@
+//===-- NVPTXutil.h - Functions exported to CodeGen --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions that can be used in CodeGen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_NVPTX_UTIL_H
+#define LLVM_TARGET_NVPTX_UTIL_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+
+namespace llvm {
+bool isParamLoad(const MachineInstr *);
+uint64_t encode_leb128(const char *str);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
new file mode 100644
index 000000000000..0ad62ce39b0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -0,0 +1,177 @@
+//===- NVVMReflect.cpp - NVVM Emulate conditional compilation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces occurences of __nvvm_reflect("string") with an
+// integer based on -nvvm-reflect-list string=<int> option given to this pass.
+// If an undefined string value is seen in a call to __nvvm_reflect("string"),
+// a default value of 0 will be used.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#define NVVM_REFLECT_FUNCTION "__nvvm_reflect"
+
+using namespace llvm;
+
+namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
+
+namespace {
+class LLVM_LIBRARY_VISIBILITY NVVMReflect : public ModulePass {
+private:
+  StringMap<int> VarMap;
+  typedef DenseMap<std::string, int>::iterator VarMapIter;
+  Function *ReflectFunction;
+
+public:
+  static char ID;
+  NVVMReflect() : ModulePass(ID) {
+    VarMap.clear();
+    ReflectFunction = 0;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); }
+  virtual bool runOnModule(Module &);
+
+  void setVarMap();
+};
+}
+
+static cl::opt<bool>
+NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true),
+                   cl::desc("NVVM reflection, enabled by default"));
+
+char NVVMReflect::ID = 0;
+INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
+                "Replace occurences of __nvvm_reflect() calls with 0/1", false,
+                false)
+
+static cl::list<std::string>
+ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"),
+            cl::desc("A list of string=num assignments"),
+            cl::ValueRequired);
+
+/// The command line can look as follows :
+/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -R e=2
+/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
+/// ReflectList vector. First, each of ReflectList[i] is 'split'
+/// using "," as the delimiter. Then each of this part is split
+/// using "=" as the delimiter.
+void NVVMReflect::setVarMap() {
+  for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) {
+    DEBUG(dbgs() << "Option : "  << ReflectList[i] << "\n");
+    SmallVector<StringRef, 4> NameValList;
+    StringRef(ReflectList[i]).split(NameValList, ",");
+    for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) {
+      SmallVector<StringRef, 2> NameValPair;
+      NameValList[j].split(NameValPair, "=");
+      assert(NameValPair.size() == 2 && "name=val expected");
+      std::stringstream ValStream(NameValPair[1]);
+      int Val;
+      ValStream >> Val;
+      assert((!(ValStream.fail())) && "integer value expected");
+      VarMap[NameValPair[0]] = Val;
+    }
+  }
+}
+
+bool NVVMReflect::runOnModule(Module &M) {
+  if (!NVVMReflectEnabled)
+    return false;
+
+  setVarMap();
+
+  ReflectFunction = M.getFunction(NVVM_REFLECT_FUNCTION);
+
+  // If reflect function is not used, then there will be
+  // no entry in the module.
+  if (ReflectFunction == 0)
+    return false;
+
+  // Validate _reflect function
+  assert(ReflectFunction->isDeclaration() &&
+         "_reflect function should not have a body");
+  assert(ReflectFunction->getReturnType()->isIntegerTy() &&
+         "_reflect's return type should be integer");
+
+  std::vector<Instruction *> ToRemove;
+
+  // Go through the uses of ReflectFunction in this Function.
+  // Each of them should a CallInst with a ConstantArray argument.
+  // First validate that. If the c-string corresponding to the
+  // ConstantArray can be found successfully, see if it can be
+  // found in VarMap. If so, replace the uses of CallInst with the
+  // value found in VarMap. If not, replace the use  with value 0.
+  for (Value::use_iterator I = ReflectFunction->use_begin(),
+                           E = ReflectFunction->use_end();
+       I != E; ++I) {
+    assert(isa<CallInst>(*I) && "Only a call instruction can use _reflect");
+    CallInst *Reflect = cast<CallInst>(*I);
+
+    assert((Reflect->getNumOperands() == 2) &&
+           "Only one operand expect for _reflect function");
+    // In cuda, we will have an extra constant-to-generic conversion of
+    // the string.
+    const Value *conv = Reflect->getArgOperand(0);
+    assert(isa<CallInst>(conv) && "Expected a const-to-gen conversion");
+    const CallInst *ConvCall = cast<CallInst>(conv);
+    const Value *str = ConvCall->getArgOperand(0);
+    assert(isa<ConstantExpr>(str) &&
+           "Format of _reflect function not recognized");
+    const ConstantExpr *GEP = cast<ConstantExpr>(str);
+
+    const Value *Sym = GEP->getOperand(0);
+    assert(isa<Constant>(Sym) && "Format of _reflect function not recognized");
+
+    const Constant *SymStr = cast<Constant>(Sym);
+
+    assert(isa<ConstantDataSequential>(SymStr->getOperand(0)) &&
+           "Format of _reflect function not recognized");
+
+    assert(cast<ConstantDataSequential>(SymStr->getOperand(0))->isCString() &&
+           "Format of _reflect function not recognized");
+
+    std::string ReflectArg =
+        cast<ConstantDataSequential>(SymStr->getOperand(0))->getAsString();
+
+    ReflectArg = ReflectArg.substr(0, ReflectArg.size() - 1);
+    DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
+
+    int ReflectVal = 0; // The default value is 0
+    if (VarMap.find(ReflectArg) != VarMap.end()) {
+      ReflectVal = VarMap[ReflectArg];
+    }
+    Reflect->replaceAllUsesWith(
+        ConstantInt::get(Reflect->getType(), ReflectVal));
+    ToRemove.push_back(Reflect);
+  }
+  if (ToRemove.size() == 0)
+    return false;
+
+  for (unsigned i = 0, e = ToRemove.size(); i != e; ++i)
+    ToRemove[i]->eraseFromParent();
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
new file mode 100644
index 000000000000..cc7d4dc5ece7
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- NVPTXTargetInfo.cpp - NVPTX Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheNVPTXTarget32;
+Target llvm::TheNVPTXTarget64;
+
+extern "C" void LLVMInitializeNVPTXTargetInfo() {
+  RegisterTarget<Triple::nvptx> X(TheNVPTXTarget32, "nvptx",
+                                  "NVIDIA PTX 32-bit");
+  RegisterTarget<Triple::nvptx64> Y(TheNVPTXTarget64, "nvptx64",
+                                    "NVIDIA PTX 64-bit");
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
new file mode 100644
index 000000000000..45cc0b8b67f2
--- /dev/null
+++ b/contrib/llvm/lib/Target/NVPTX/cl_common_defines.h
@@ -0,0 +1,122 @@
+#ifndef __CL_COMMON_DEFINES_H__
+#define __CL_COMMON_DEFINES_H__
+// This file includes defines that are common to both kernel code and
+// the NVPTX back-end.
+
+//
+// Common defines for Image intrinsics
+// Channel order
+enum {
+  CLK_R = 0x10B0,
+  CLK_A = 0x10B1,
+  CLK_RG = 0x10B2,
+  CLK_RA = 0x10B3,
+  CLK_RGB = 0x10B4,
+  CLK_RGBA = 0x10B5,
+  CLK_BGRA = 0x10B6,
+  CLK_ARGB = 0x10B7,
+
+#if (__NV_CL_C_VERSION == __NV_CL_C_VERSION_1_0)
+  CLK_xRGB = 0x10B7,
+#endif
+
+  CLK_INTENSITY = 0x10B8,
+  CLK_LUMINANCE = 0x10B9
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+      ,
+  CLK_Rx = 0x10BA,
+  CLK_RGx = 0x10BB,
+  CLK_RGBx = 0x10BC
+#endif
+};
+
+typedef enum clk_channel_type {
+  // valid formats for float return types
+  CLK_SNORM_INT8 = 0x10D0,  // four channel RGBA unorm8
+  CLK_SNORM_INT16 = 0x10D1, // four channel RGBA unorm16
+  CLK_UNORM_INT8 = 0x10D2,  // four channel RGBA unorm8
+  CLK_UNORM_INT16 = 0x10D3, // four channel RGBA unorm16
+  CLK_HALF_FLOAT = 0x10DD,  // four channel RGBA half
+  CLK_FLOAT = 0x10DE,       // four channel RGBA float
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+  CLK_UNORM_SHORT_565 = 0x10D4,
+  CLK_UNORM_SHORT_555 = 0x10D5,
+  CLK_UNORM_INT_101010 = 0x10D6,
+#endif
+
+  // valid only for integer return types
+  CLK_SIGNED_INT8 = 0x10D7,
+  CLK_SIGNED_INT16 = 0x10D8,
+  CLK_SIGNED_INT32 = 0x10D9,
+  CLK_UNSIGNED_INT8 = 0x10DA,
+  CLK_UNSIGNED_INT16 = 0x10DB,
+  CLK_UNSIGNED_INT32 = 0x10DC,
+
+  // CI SPI for CPU
+  __CLK_UNORM_INT8888,  // four channel ARGB unorm8
+  __CLK_UNORM_INT8888R, // four channel BGRA unorm8
+
+  __CLK_VALID_IMAGE_TYPE_COUNT,
+  __CLK_INVALID_IMAGE_TYPE = __CLK_VALID_IMAGE_TYPE_COUNT,
+  __CLK_VALID_IMAGE_TYPE_MASK_BITS = 4, // number of bits required to
+                                        // represent any image type
+  __CLK_VALID_IMAGE_TYPE_MASK = (1 << __CLK_VALID_IMAGE_TYPE_MASK_BITS) - 1
+} clk_channel_type;
+
+typedef enum clk_sampler_type {
+  __CLK_ADDRESS_BASE = 0,
+  CLK_ADDRESS_NONE = 0 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_CLAMP = 1 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_CLAMP_TO_EDGE = 2 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_REPEAT = 3 << __CLK_ADDRESS_BASE,
+  CLK_ADDRESS_MIRROR = 4 << __CLK_ADDRESS_BASE,
+
+#if (__NV_CL_C_VERSION >= __NV_CL_C_VERSION_1_1)
+  CLK_ADDRESS_MIRRORED_REPEAT = CLK_ADDRESS_MIRROR,
+#endif
+  __CLK_ADDRESS_MASK =
+      CLK_ADDRESS_NONE | CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP_TO_EDGE |
+      CLK_ADDRESS_REPEAT | CLK_ADDRESS_MIRROR,
+  __CLK_ADDRESS_BITS = 3, // number of bits required to
+                          // represent address info
+
+  __CLK_NORMALIZED_BASE = __CLK_ADDRESS_BITS,
+  CLK_NORMALIZED_COORDS_FALSE = 0,
+  CLK_NORMALIZED_COORDS_TRUE = 1 << __CLK_NORMALIZED_BASE,
+  __CLK_NORMALIZED_MASK =
+      CLK_NORMALIZED_COORDS_FALSE | CLK_NORMALIZED_COORDS_TRUE,
+  __CLK_NORMALIZED_BITS = 1, // number of bits required to
+                             // represent normalization
+
+  __CLK_FILTER_BASE = __CLK_NORMALIZED_BASE + __CLK_NORMALIZED_BITS,
+  CLK_FILTER_NEAREST = 0 << __CLK_FILTER_BASE,
+  CLK_FILTER_LINEAR = 1 << __CLK_FILTER_BASE,
+  CLK_FILTER_ANISOTROPIC = 2 << __CLK_FILTER_BASE,
+  __CLK_FILTER_MASK =
+      CLK_FILTER_NEAREST | CLK_FILTER_LINEAR | CLK_FILTER_ANISOTROPIC,
+  __CLK_FILTER_BITS = 2, // number of bits required to
+                         // represent address info
+
+  __CLK_MIP_BASE = __CLK_FILTER_BASE + __CLK_FILTER_BITS,
+  CLK_MIP_NEAREST = 0 << __CLK_MIP_BASE,
+  CLK_MIP_LINEAR = 1 << __CLK_MIP_BASE,
+  CLK_MIP_ANISOTROPIC = 2 << __CLK_MIP_BASE,
+  __CLK_MIP_MASK = CLK_MIP_NEAREST | CLK_MIP_LINEAR | CLK_MIP_ANISOTROPIC,
+  __CLK_MIP_BITS = 2,
+
+  __CLK_SAMPLER_BITS = __CLK_MIP_BASE + __CLK_MIP_BITS,
+  __CLK_SAMPLER_MASK = __CLK_MIP_MASK | __CLK_FILTER_MASK |
+                       __CLK_NORMALIZED_MASK | __CLK_ADDRESS_MASK,
+
+  __CLK_ANISOTROPIC_RATIO_BITS = 5,
+  __CLK_ANISOTROPIC_RATIO_MASK =
+      (int) 0x80000000 >> (__CLK_ANISOTROPIC_RATIO_BITS - 1)
+} clk_sampler_type;
+
+// Memory synchronization
+#define CLK_LOCAL_MEM_FENCE (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+
+#endif // __CL_COMMON_DEFINES_H__
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
new file mode 100644
index 000000000000..bacc108c62b4
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -0,0 +1,294 @@
+//===-- PPCInstPrinter.cpp - Convert PPC MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an PPC MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "PPCInstPrinter.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#include "PPCGenAsmWriter.inc"
+
+void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
+void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                               StringRef Annot) {
+  // Check for slwi/srwi mnemonics.
+  if (MI->getOpcode() == PPC::RLWINM) {
+    unsigned char SH = MI->getOperand(2).getImm();
+    unsigned char MB = MI->getOperand(3).getImm();
+    unsigned char ME = MI->getOperand(4).getImm();
+    bool useSubstituteMnemonic = false;
+    if (SH <= 31 && MB == 0 && ME == (31-SH)) {
+      O << "\tslwi "; useSubstituteMnemonic = true;
+    }
+    if (SH <= 31 && MB == (32-SH) && ME == 31) {
+      O << "\tsrwi "; useSubstituteMnemonic = true;
+      SH = 32-SH;
+    }
+    if (useSubstituteMnemonic) {
+      printOperand(MI, 0, O);
+      O << ", ";
+      printOperand(MI, 1, O);
+      O << ", " << (unsigned int)SH;
+
+      printAnnotation(O, Annot);
+      return;
+    }
+  }
+  
+  if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) &&
+      MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+    O << "\tmr ";
+    printOperand(MI, 0, O);
+    O << ", ";
+    printOperand(MI, 1, O);
+    printAnnotation(O, Annot);
+    return;
+  }
+  
+  if (MI->getOpcode() == PPC::RLDICR) {
+    unsigned char SH = MI->getOperand(2).getImm();
+    unsigned char ME = MI->getOperand(3).getImm();
+    // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
+    if (63-SH == ME) {
+      O << "\tsldi ";
+      printOperand(MI, 0, O);
+      O << ", ";
+      printOperand(MI, 1, O);
+      O << ", " << (unsigned int)SH;
+      printAnnotation(O, Annot);
+      return;
+    }
+  }
+  
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+
+void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O, 
+                                           const char *Modifier) {
+  unsigned Code = MI->getOperand(OpNo).getImm();
+
+  if (StringRef(Modifier) == "cc") {
+    switch ((PPC::Predicate)Code) {
+    case PPC::PRED_LT: O << "lt"; return;
+    case PPC::PRED_LE: O << "le"; return;
+    case PPC::PRED_EQ: O << "eq"; return;
+    case PPC::PRED_GE: O << "ge"; return;
+    case PPC::PRED_GT: O << "gt"; return;
+    case PPC::PRED_NE: O << "ne"; return;
+    case PPC::PRED_UN: O << "un"; return;
+    case PPC::PRED_NU: O << "nu"; return;
+    }
+  }
+  
+  assert(StringRef(Modifier) == "reg" &&
+         "Need to specify 'cc' or 'reg' as predicate op modifier!");
+  printOperand(MI, OpNo+1, O);
+}
+
+void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  int Value = MI->getOperand(OpNo).getImm();
+  Value = SignExtend32<5>(Value);
+  O << (int)Value;
+}
+
+void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 31 && "Invalid u5imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  unsigned int Value = MI->getOperand(OpNo).getImm();
+  assert(Value <= 63 && "Invalid u6imm argument!");
+  O << (unsigned int)Value;
+}
+
+void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  O << (short)MI->getOperand(OpNo).getImm();
+}
+
+void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  O << (unsigned short)MI->getOperand(OpNo).getImm();
+}
+
+void PPCInstPrinter::printS16X4ImmOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  if (MI->getOperand(OpNo).isImm())
+    O << (short)(MI->getOperand(OpNo).getImm()*4);
+  else
+    printOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  if (!MI->getOperand(OpNo).isImm())
+    return printOperand(MI, OpNo, O);
+
+  // Branches can take an immediate operand.  This is used by the branch
+  // selection pass to print $+8, an eight byte displacement from the PC.
+  O << "$+";
+  printAbsAddrOperand(MI, OpNo, O);
+}
+
+void PPCInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo,
+                                         raw_ostream &O) {
+  O << (int)MI->getOperand(OpNo).getImm()*4;
+}
+
+
+void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  unsigned CCReg = MI->getOperand(OpNo).getReg();
+  unsigned RegNo;
+  switch (CCReg) {
+  default: llvm_unreachable("Unknown CR register");
+  case PPC::CR0: RegNo = 0; break;
+  case PPC::CR1: RegNo = 1; break;
+  case PPC::CR2: RegNo = 2; break;
+  case PPC::CR3: RegNo = 3; break;
+  case PPC::CR4: RegNo = 4; break;
+  case PPC::CR5: RegNo = 5; break;
+  case PPC::CR6: RegNo = 6; break;
+  case PPC::CR7: RegNo = 7; break;
+  }
+  O << (0x80 >> RegNo);
+}
+
+void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  printSymbolLo(MI, OpNo, O);
+  O << '(';
+  if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo+1, O);
+  O << ')';
+}
+
+void PPCInstPrinter::printMemRegImmShifted(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  if (MI->getOperand(OpNo).isImm())
+    printS16X4ImmOperand(MI, OpNo, O);
+  else
+    printSymbolLo(MI, OpNo, O);
+  O << '(';
+  
+  if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo+1, O);
+  O << ')';
+}
+
+
+void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  // When used as the base register, r0 reads constant zero rather than
+  // the value contained in the register.  For this reason, the darwin
+  // assembler requires that we print r0 as 0 (no r) when used as the base.
+  if (MI->getOperand(OpNo).getReg() == PPC::R0)
+    O << "0";
+  else
+    printOperand(MI, OpNo, O);
+  O << ", ";
+  printOperand(MI, OpNo+1, O);
+}
+
+
+
+/// stripRegisterPrefix - This method strips the character prefix from a
+/// register name so that only the number is left.  Used by for linux asm.
+static const char *stripRegisterPrefix(const char *RegName) {
+  switch (RegName[0]) {
+  case 'r':
+  case 'f':
+  case 'v': return RegName + 1;
+  case 'c': if (RegName[1] == 'r') return RegName + 2;
+  }
+  
+  return RegName;
+}
+
+void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    const char *RegName = getRegisterName(Op.getReg());
+    // The linux and AIX assembler does not take register prefixes.
+    if (!isDarwinSyntax())
+      RegName = stripRegisterPrefix(RegName);
+    
+    O << RegName;
+    return;
+  }
+  
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+  
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  O << *Op.getExpr();
+}
+  
+void PPCInstPrinter::printSymbolLo(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  if (MI->getOperand(OpNo).isImm())
+    return printS16ImmOperand(MI, OpNo, O);
+  
+  // FIXME: This is a terrible hack because we can't encode lo16() as an operand
+  // flag of a subtraction.  See the FIXME in GetSymbolRef in PPCMCInstLower.
+  if (MI->getOperand(OpNo).isExpr() &&
+      isa<MCBinaryExpr>(MI->getOperand(OpNo).getExpr())) {
+    O << "lo16(";
+    printOperand(MI, OpNo, O);
+    O << ')';
+  } else {
+    printOperand(MI, OpNo, O);
+  }
+}
+
+void PPCInstPrinter::printSymbolHi(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  if (MI->getOperand(OpNo).isImm())
+    return printS16ImmOperand(MI, OpNo, O);
+
+  // FIXME: This is a terrible hack because we can't encode lo16() as an operand
+  // flag of a subtraction.  See the FIXME in GetSymbolRef in PPCMCInstLower.
+  if (MI->getOperand(OpNo).isExpr() &&
+      isa<MCBinaryExpr>(MI->getOperand(OpNo).getExpr())) {
+    O << "ha16(";
+    printOperand(MI, OpNo, O);
+    O << ')';
+  } else {
+    printOperand(MI, OpNo, O);
+  }
+}
+
+
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
new file mode 100644
index 000000000000..8f1e211c3e96
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -0,0 +1,69 @@
+//===- PPCInstPrinter.h - Convert PPC MCInst to assembly syntax -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an PPC MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCINSTPRINTER_H
+#define PPCINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class PPCInstPrinter : public MCInstPrinter {
+  // 0 -> AIX, 1 -> Darwin.
+  unsigned SyntaxVariant;
+public:
+  PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                 const MCRegisterInfo &MRI, unsigned syntaxVariant)
+    : MCInstPrinter(MAI, MII, MRI), SyntaxVariant(syntaxVariant) {}
+  
+  bool isDarwinSyntax() const {
+    return SyntaxVariant == 1;
+  }
+  
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printPredicateOperand(const MCInst *MI, unsigned OpNo,
+                             raw_ostream &O, const char *Modifier = 0);
+
+
+  void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printS16X4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
+  void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemRegImmShifted(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  
+  // FIXME: Remove
+  void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
new file mode 100644
index 000000000000..ec2657403e0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -0,0 +1,195 @@
+//===-- PPCAsmBackend.cpp - PPC Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
+  case FK_Data_8:
+  case PPC::fixup_ppc_tlsreg:
+  case PPC::fixup_ppc_nofixup:
+    return Value;
+  case PPC::fixup_ppc_brcond14:
+    return Value & 0xfffc;
+  case PPC::fixup_ppc_br24:
+    return Value & 0x3fffffc;
+#if 0
+  case PPC::fixup_ppc_hi16:
+    return (Value >> 16) & 0xffff;
+#endif
+  case PPC::fixup_ppc_ha16:
+    return ((Value >> 16) + ((Value & 0x8000) ? 1 : 0)) & 0xffff;
+  case PPC::fixup_ppc_lo16:
+    return Value & 0xffff;
+  case PPC::fixup_ppc_lo16_ds:
+    return Value & 0xfffc;
+  }
+}
+
+namespace {
+class PPCMachObjectWriter : public MCMachObjectTargetWriter {
+public:
+  PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
+                      uint32_t CPUSubtype)
+    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
+
+  void RecordRelocation(MachObjectWriter *Writer,
+                        const MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, uint64_t &FixedValue) {
+    llvm_unreachable("Relocation emission for MachO/PPC unimplemented!");
+  }
+};
+
+class PPCAsmBackend : public MCAsmBackend {
+const Target &TheTarget;
+public:
+  PPCAsmBackend(const Target &T) : MCAsmBackend(), TheTarget(T) {}
+
+  unsigned getNumFixupKinds() const { return PPC::NumTargetFixupKinds; }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[PPC::NumTargetFixupKinds] = {
+      // name                    offset  bits  flags
+      { "fixup_ppc_br24",        6,      24,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_brcond14",    16,     14,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_ppc_lo16",        16,     16,   0 },
+      { "fixup_ppc_ha16",        16,     16,   0 },
+      { "fixup_ppc_lo16_ds",     16,     14,   0 },
+      { "fixup_ppc_tlsreg",       0,      0,   0 },
+      { "fixup_ppc_nofixup",      0,      0,   0 }
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const {
+    Value = adjustFixupValue(Fixup.getKind(), Value);
+    if (!Value) return;           // Doesn't change encoding.
+
+    unsigned Offset = Fixup.getOffset();
+
+    // For each byte of the fragment that the fixup touches, mask in the bits
+    // from the fixup value. The Value has been "split up" into the appropriate
+    // bitfields above.
+    for (unsigned i = 0; i != 4; ++i)
+      Data[Offset + i] |= uint8_t((Value >> ((4 - i - 1)*8)) & 0xff);
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const {
+    // FIXME.
+    return false;
+  }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const {
+    // FIXME.
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+    // FIXME.
+    llvm_unreachable("relaxInstruction() unimplemented");
+  }
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+    // FIXME: Zero fill for now. That's not right, but at least will get the
+    // section size right.
+    for (uint64_t i = 0; i != Count; ++i)
+      OW->Write8(0);
+    return true;
+  }
+
+  unsigned getPointerSize() const {
+    StringRef Name = TheTarget.getName();
+    if (Name == "ppc64") return 8;
+    assert(Name == "ppc32" && "Unknown target name!");
+    return 4;
+  }
+};
+} // end anonymous namespace
+
+
+// FIXME: This should be in a separate file.
+namespace {
+  class DarwinPPCAsmBackend : public PPCAsmBackend {
+  public:
+    DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T) { }
+
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+      bool is64 = getPointerSize() == 8;
+      return createMachObjectWriter(new PPCMachObjectWriter(
+                                      /*Is64Bit=*/is64,
+                                      (is64 ? object::mach::CTM_PowerPC64 :
+                                       object::mach::CTM_PowerPC),
+                                      object::mach::CSPPC_ALL),
+                                    OS, /*IsLittleEndian=*/false);
+    }
+
+    virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+      return false;
+    }
+  };
+
+  class ELFPPCAsmBackend : public PPCAsmBackend {
+    uint8_t OSABI;
+  public:
+    ELFPPCAsmBackend(const Target &T, uint8_t OSABI) :
+      PPCAsmBackend(T), OSABI(OSABI) { }
+
+
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+      bool is64 = getPointerSize() == 8;
+      return createPPCELFObjectWriter(OS, is64, OSABI);
+    }
+
+    virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+      return false;
+    }
+  };
+
+} // end anonymous namespace
+
+
+
+
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+  if (Triple(TT).isOSDarwin())
+    return new DarwinPPCAsmBackend(T);
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
+  return new ELFPPCAsmBackend(T, OSABI);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
new file mode 100644
index 000000000000..84e4175e635b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -0,0 +1,277 @@
+//===-- PPCELFObjectWriter.cpp - PPC ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+  class PPCELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI);
+
+    virtual ~PPCELFObjectWriter();
+  protected:
+    virtual unsigned getRelocTypeInner(const MCValue &Target,
+                                       const MCFixup &Fixup,
+                                       bool IsPCRel) const;
+    virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                                  bool IsPCRel, bool IsRelocWithSymbol,
+                                  int64_t Addend) const;
+    virtual const MCSymbol *undefinedExplicitRelSym(const MCValue &Target,
+                                                    const MCFixup &Fixup,
+                                                    bool IsPCRel) const;
+    virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset);
+
+    virtual void sortRelocs(const MCAssembler &Asm,
+                            std::vector<ELFRelocationEntry> &Relocs);
+  };
+
+  class PPCELFRelocationEntry : public ELFRelocationEntry {
+  public:
+    PPCELFRelocationEntry(const ELFRelocationEntry &RE);
+    bool operator<(const PPCELFRelocationEntry &RE) const {
+      return (RE.r_offset < r_offset ||
+              (RE.r_offset == r_offset && RE.Type > Type));
+    }
+  };
+}
+
+PPCELFRelocationEntry::PPCELFRelocationEntry(const ELFRelocationEntry &RE)
+  : ELFRelocationEntry(RE.r_offset, RE.Index, RE.Type, RE.Symbol,
+                       RE.r_addend, *RE.Fixup) {}
+
+PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
+  : MCELFObjectTargetWriter(Is64Bit, OSABI,
+                            Is64Bit ?  ELF::EM_PPC64 : ELF::EM_PPC,
+                            /*HasRelocationAddend*/ true) {}
+
+PPCELFObjectWriter::~PPCELFObjectWriter() {
+}
+
+unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
+                                               const MCFixup &Fixup,
+                                               bool IsPCRel) const
+{
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+  // determine the type of the relocation
+  unsigned Type;
+  if (IsPCRel) {
+    switch ((unsigned)Fixup.getKind()) {
+    default:
+      llvm_unreachable("Unimplemented");
+    case PPC::fixup_ppc_br24:
+      Type = ELF::R_PPC_REL24;
+      break;
+    case FK_Data_4:
+    case FK_PCRel_4:
+      Type = ELF::R_PPC_REL32;
+      break;
+    case FK_Data_8:
+    case FK_PCRel_8:
+      Type = ELF::R_PPC64_REL64;
+      break;
+    }
+  } else {
+    switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+    case PPC::fixup_ppc_br24:
+      Type = ELF::R_PPC_ADDR24;
+      break;
+    case PPC::fixup_ppc_brcond14:
+      Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_
+      break;
+    case PPC::fixup_ppc_ha16:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_PPC_TPREL16_HA:
+        Type = ELF::R_PPC_TPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL16_HA:
+        Type = ELF::R_PPC64_DTPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_ADDR16_HA;
+	break;
+      case MCSymbolRefExpr::VK_PPC_TOC16_HA:
+        Type = ELF::R_PPC64_TOC16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_HA:
+        Type = ELF::R_PPC64_GOT_TPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_HA:
+        Type = ELF::R_PPC64_GOT_TLSGD16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA:
+        Type = ELF::R_PPC64_GOT_TLSLD16_HA;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_lo16:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_PPC_TPREL16_LO:
+        Type = ELF::R_PPC_TPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL16_LO:
+        Type = ELF::R_PPC64_DTPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_ADDR16_LO;
+	break;
+      case MCSymbolRefExpr::VK_PPC_TOC_ENTRY:
+        Type = ELF::R_PPC64_TOC16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC16_LO:
+        Type = ELF::R_PPC64_TOC16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_LO:
+        Type = ELF::R_PPC64_GOT_TLSGD16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO:
+        Type = ELF::R_PPC64_GOT_TLSLD16_LO;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_lo16_ds:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC64_ADDR16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC_ENTRY:
+        Type = ELF::R_PPC64_TOC16_DS;
+	break;
+      case MCSymbolRefExpr::VK_PPC_TOC16_LO:
+        Type = ELF::R_PPC64_TOC16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_LO:
+        Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
+        break;
+      }
+      break;
+    case PPC::fixup_ppc_tlsreg:
+      Type = ELF::R_PPC64_TLS;
+      break;
+    case PPC::fixup_ppc_nofixup:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_PPC_TLSGD:
+        Type = ELF::R_PPC64_TLSGD;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TLSLD:
+        Type = ELF::R_PPC64_TLSLD;
+        break;
+      }
+      break;
+    case FK_Data_8:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_PPC_TOC:
+        Type = ELF::R_PPC64_TOC;
+        break;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC64_ADDR64;
+	break;
+      }
+      break;
+    case FK_Data_4:
+      Type = ELF::R_PPC_ADDR32;
+      break;
+    case FK_Data_2:
+      Type = ELF::R_PPC_ADDR16;
+      break;
+    }
+  }
+  return Type;
+}
+
+unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel,
+                                          bool IsRelocWithSymbol,
+                                          int64_t Addend) const {
+  return getRelocTypeInner(Target, Fixup, IsPCRel);
+}
+
+const MCSymbol *PPCELFObjectWriter::undefinedExplicitRelSym(const MCValue &Target,
+                                                            const MCFixup &Fixup,
+                                                            bool IsPCRel) const {
+  assert(Target.getSymA() && "SymA cannot be 0");
+  const MCSymbol &Symbol = Target.getSymA()->getSymbol().AliasedSymbol();
+
+  unsigned RelocType = getRelocTypeInner(Target, Fixup, IsPCRel);
+
+  // The .odp creation emits a relocation against the symbol ".TOC." which
+  // create a R_PPC64_TOC relocation. However the relocation symbol name
+  // in final object creation should be NULL, since the symbol does not
+  // really exist, it is just the reference to TOC base for the current
+  // object file.
+  bool EmitThisSym = RelocType != ELF::R_PPC64_TOC;
+
+  if (EmitThisSym && !Symbol.isTemporary())
+    return &Symbol;
+  return NULL;
+}
+
+void PPCELFObjectWriter::
+adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) {
+  switch ((unsigned)Fixup.getKind()) {
+    case PPC::fixup_ppc_ha16:
+    case PPC::fixup_ppc_lo16:
+    case PPC::fixup_ppc_lo16_ds:
+      RelocOffset += 2;
+      break;
+    default:
+      break;
+  }
+}
+
+// The standard sorter only sorts on the r_offset field, but PowerPC can
+// have multiple relocations at the same offset.  Sort secondarily on the
+// relocation type to avoid nondeterminism.
+void PPCELFObjectWriter::sortRelocs(const MCAssembler &Asm,
+                                    std::vector<ELFRelocationEntry> &Relocs) {
+
+  // Copy to a temporary vector of relocation entries having a different
+  // sort function.
+  std::vector<PPCELFRelocationEntry> TmpRelocs;
+  
+  for (std::vector<ELFRelocationEntry>::iterator R = Relocs.begin();
+       R != Relocs.end(); ++R) {
+    TmpRelocs.push_back(PPCELFRelocationEntry(*R));
+  }
+
+  // Sort in place by ascending r_offset and descending r_type.
+  array_pod_sort(TmpRelocs.begin(), TmpRelocs.end());
+
+  // Copy back to the original vector.
+  unsigned I = 0;
+  for (std::vector<PPCELFRelocationEntry>::iterator R = TmpRelocs.begin();
+       R != TmpRelocs.end(); ++R, ++I) {
+    Relocs[I] = ELFRelocationEntry(R->r_offset, R->Index, R->Type,
+                                   R->Symbol, R->r_addend, *R->Fixup);
+  }
+}
+
+
+MCObjectWriter *llvm::createPPCELFObjectWriter(raw_ostream &OS,
+                                               bool Is64Bit,
+                                               uint8_t OSABI) {
+  MCELFObjectTargetWriter *MOTW = new PPCELFObjectWriter(Is64Bit, OSABI);
+  return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/false);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
new file mode 100644
index 000000000000..86c44f57a5e2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -0,0 +1,54 @@
+//===-- PPCFixupKinds.h - PPC Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PPC_PPCFIXUPKINDS_H
+#define LLVM_PPC_PPCFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+#undef PPC
+
+namespace llvm {
+namespace PPC {
+enum Fixups {
+  // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b'
+  // and 'bl'.
+  fixup_ppc_br24 = FirstTargetFixupKind,
+  
+  /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional
+  /// branches.
+  fixup_ppc_brcond14,
+  
+  /// fixup_ppc_lo16 - A 16-bit fixup corresponding to lo16(_foo) for instrs
+  /// like 'li'.
+  fixup_ppc_lo16,
+  
+  /// fixup_ppc_ha16 - A 16-bit fixup corresponding to ha16(_foo) for instrs
+  /// like 'lis'.
+  fixup_ppc_ha16,
+  
+  /// fixup_ppc_lo16_ds - A 14-bit fixup corresponding to lo16(_foo) with
+  /// implied 2 zero bits for instrs like 'std'.
+  fixup_ppc_lo16_ds,
+
+  /// fixup_ppc_tlsreg - Insert thread-pointer register number.
+  fixup_ppc_tlsreg,
+
+  /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call
+  /// to __tls_get_addr for the TLS general and local dynamic models.
+  fixup_ppc_nofixup,
+  
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
new file mode 100644
index 000000000000..a25d7fe64f3a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -0,0 +1,70 @@
+//===-- PPCMCAsmInfo.cpp - PPC asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MCAsmInfoDarwin properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCAsmInfo.h"
+using namespace llvm;
+
+void PPCMCAsmInfoDarwin::anchor() { }
+
+PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+  IsLittleEndian = false;
+
+  PCSymbol = ".";
+  CommentString = ";";
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  if (!is64Bit)
+    Data64bitsDirective = 0;      // We can't emit a 64-bit unit in PPC32 mode.
+
+  AssemblerDialect = 1;           // New-Style mnemonics.
+  SupportsDebugInformation= true; // Debug information.
+}
+
+void PPCLinuxMCAsmInfo::anchor() { }
+
+PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
+  if (is64Bit) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+  IsLittleEndian = false;
+
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  CommentString = "#";
+  GlobalPrefix = "";
+  PrivateGlobalPrefix = ".L";
+  WeakRefDirective = "\t.weak\t";
+  
+  // Uses '.section' before '.bss' directive
+  UsesELFSectionDirectiveForBSS = true;  
+
+  // Debug Information
+  SupportsDebugInformation = true;
+
+  PCSymbol = ".";
+
+  // Set up DWARF directives
+  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+    
+  ZeroDirective = "\t.space\t";
+  Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
+  AssemblerDialect = 0;           // Old-Style mnemonics.
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
new file mode 100644
index 000000000000..7b4ed9f14eb6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -0,0 +1,35 @@
+//===-- PPCMCAsmInfo.h - PPC asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MCAsmInfoDarwin class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETASMINFO_H
+#define PPCTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+
+  class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
+    virtual void anchor();
+  public:
+    explicit PPCMCAsmInfoDarwin(bool is64Bit);
+  };
+
+  class PPCLinuxMCAsmInfo : public MCAsmInfo {
+    virtual void anchor();
+  public:
+    explicit PPCLinuxMCAsmInfo(bool is64Bit);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
new file mode 100644
index 000000000000..2223cd623cb5
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -0,0 +1,239 @@
+//===-- PPCMCCodeEmitter.cpp - Convert PPC code to machine code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+class PPCMCCodeEmitter : public MCCodeEmitter {
+  PPCMCCodeEmitter(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+
+  const MCSubtargetInfo &STI;
+  const MCContext &CTX;
+  Triple TT;
+
+public:
+  PPCMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                   MCContext &ctx)
+    : STI(sti), CTX(ctx), TT(STI.getTargetTriple()) {
+  }
+  
+  ~PPCMCCodeEmitter() {}
+
+  unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getHA16Encoding(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getLO16Encoding(const MCInst &MI, unsigned OpNo,
+                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+                            SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+    uint64_t Bits = getBinaryCodeForInstr(MI, Fixups);
+
+    // BL8_NOP etc. all have a size of 8 because of the following 'nop'.
+    unsigned Size = 4; // FIXME: Have Desc.getSize() return the correct value!
+    unsigned Opcode = MI.getOpcode();
+    if (Opcode == PPC::BL8_NOP || Opcode == PPC::BLA8_NOP ||
+        Opcode == PPC::BL8_NOP_TLSGD || Opcode == PPC::BL8_NOP_TLSLD)
+      Size = 8;
+    
+    // Output the constant in big endian byte order.
+    int ShiftValue = (Size * 8) - 8;
+    for (unsigned i = 0; i != Size; ++i) {
+      OS << (char)(Bits >> ShiftValue);
+      Bits <<= 8;
+    }
+    
+    ++MCNumEmitted;  // Keep track of the # of mi's emitted.
+  }
+  
+};
+  
+} // end anonymous namespace
+  
+MCCodeEmitter *llvm::createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  return new PPCMCCodeEmitter(MCII, STI, Ctx);
+}
+
+unsigned PPCMCCodeEmitter::
+getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+  
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_br24));
+
+  // For special TLS calls, add another fixup for the symbol.  Apparently
+  // BL8_NOP, BL8_NOP_TLSGD, and BL8_NOP_TLSLD are sufficiently
+  // similar that TblGen will not generate a separate case for the latter
+  // two, so this is the only way to get the extra fixup generated.
+  unsigned Opcode = MI.getOpcode();
+  if (Opcode == PPC::BL8_NOP_TLSGD || Opcode == PPC::BL8_NOP_TLSLD) {
+    const MCOperand &MO2 = MI.getOperand(OpNo+1);
+    Fixups.push_back(MCFixup::Create(0, MO2.getExpr(),
+                                     (MCFixupKind)PPC::fixup_ppc_nofixup));
+  }
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_brcond14));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getHA16Encoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+  
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_ha16));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getLO16Encoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+  
+  // Add a fixup for the branch target.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_lo16));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
+                                            SmallVectorImpl<MCFixup> &Fixups) const {
+  // Encode (imm, reg) as a memri, which has the low 16-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups) << 16;
+  
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return (getMachineOpValue(MI, MO, Fixups) & 0xFFFF) | RegBits;
+  
+  // Add a fixup for the displacement field.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_lo16));
+  return RegBits;
+}
+
+
+unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  // Encode (imm, reg) as a memrix, which has the low 14-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups) << 14;
+  
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return (getMachineOpValue(MI, MO, Fixups) & 0x3FFF) | RegBits;
+  
+  // Add a fixup for the displacement field.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_lo16_ds));
+  return RegBits;
+}
+
+
+unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg()) return getMachineOpValue(MI, MO, Fixups);
+  
+  // Add a fixup for the TLS register, which simply provides a relocation
+  // hint to the linker that this statement is part of a relocation sequence.
+  // Return the thread-pointer register's encoding.
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_tlsreg));
+  return CTX.getRegisterInfo().getEncodingValue(PPC::X13);
+}
+
+unsigned PPCMCCodeEmitter::
+get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  assert((MI.getOpcode() == PPC::MTCRF || 
+          MI.getOpcode() == PPC::MFOCRF ||
+          MI.getOpcode() == PPC::MTCRF8) &&
+         (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
+  return 0x80 >> CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+}
+
+
+unsigned PPCMCCodeEmitter::
+getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  if (MO.isReg()) {
+    // MTCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
+    // The GPR operand should come through here though.
+    assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MFOCRF) ||
+           MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
+    return CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  }
+  
+  assert(MO.isImm() &&
+         "Relocation required in an instruction that we cannot encode!");
+  return MO.getImm();
+}
+
+
+#include "PPCGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
new file mode 100644
index 000000000000..2209f936ec33
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -0,0 +1,160 @@
+//===-- PPCMCTargetDesc.cpp - PowerPC Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PowerPC specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCTargetDesc.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "PPCMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "PPCGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "PPCGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "PPCGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createPPCMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitPPCMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) {
+  Triple TheTriple(TT);
+  bool isPPC64 = (TheTriple.getArch() == Triple::ppc64);
+  unsigned Flavour = isPPC64 ? 0 : 1;
+  unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR;
+
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitPPCMCRegisterInfo(X, RA, Flavour, Flavour);
+  return X;
+}
+
+static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                 StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitPPCMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCAsmInfo *createPPCMCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+  bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
+
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSDarwin())
+    MAI = new PPCMCAsmInfoDarwin(isPPC64);
+  else
+    MAI = new PPCLinuxMCAsmInfo(isPPC64);
+
+  // Initial state of the frame pointer is R1.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(isPPC64? PPC::X1 : PPC::R1, 0);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+
+  if (RM == Reloc::Default) {
+    Triple T(TT);
+    if (T.isOSDarwin())
+      RM = Reloc::DynamicNoPIC;
+    else
+      RM = Reloc::Static;
+  }
+  if (CM == CodeModel::Default) {
+    Triple T(TT);
+    if (!T.isOSDarwin() && T.getArch() == Triple::ppc64)
+      CM = CodeModel::Medium;
+  }
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+// This is duplicated code. Refactor this.
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &OS,
+                                    MCCodeEmitter *Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  if (Triple(TT).isOSDarwin())
+    return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+
+  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI,
+                                             const MCSubtargetInfo &STI) {
+  return new PPCInstPrinter(MAI, MII, MRI, SyntaxVariant);
+}
+
+extern "C" void LLVMInitializePowerPCTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn C(ThePPC32Target, createPPCMCAsmInfo);
+  RegisterMCAsmInfoFn D(ThePPC64Target, createPPCMCAsmInfo);  
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(ThePPC32Target, createPPCMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(ThePPC64Target, createPPCMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(ThePPC32Target, createPPCMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(ThePPC64Target, createPPCMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(ThePPC32Target, createPPCMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(ThePPC64Target, createPPCMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(ThePPC32Target,
+                                          createPPCMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(ThePPC64Target,
+                                          createPPCMCSubtargetInfo);
+
+  // Register the MC Code Emitter
+  TargetRegistry::RegisterMCCodeEmitter(ThePPC32Target, createPPCMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(ThePPC64Target, createPPCMCCodeEmitter);
+  
+    // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(ThePPC32Target, createPPCAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(ThePPC64Target, createPPCAsmBackend);
+  
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(ThePPC32Target, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(ThePPC64Target, createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
new file mode 100644
index 000000000000..38a7420d972d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -0,0 +1,68 @@
+//===-- PPCMCTargetDesc.h - PowerPC Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides PowerPC specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCMCTARGETDESC_H
+#define PPCMCTARGETDESC_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class StringRef;
+class raw_ostream;
+
+extern Target ThePPC32Target;
+extern Target ThePPC64Target;
+  
+MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      const MCSubtargetInfo &STI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU);
+
+/// createPPCELFObjectWriter - Construct an PPC ELF object writer.
+MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS,
+                                         bool Is64Bit,
+                                         uint8_t OSABI);
+} // End llvm namespace
+
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
+// Defines symbolic names for PowerPC registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "PPCGenRegisterInfo.inc"
+
+// Defines symbolic names for the PowerPC instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "PPCGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "PPCGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
new file mode 100644
index 000000000000..d84eb9c6aa03
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
@@ -0,0 +1,31 @@
+//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCPredicates.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+using namespace llvm;
+
+PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
+  switch (Opcode) {
+  case PPC::PRED_EQ: return PPC::PRED_NE;
+  case PPC::PRED_NE: return PPC::PRED_EQ;
+  case PPC::PRED_LT: return PPC::PRED_GE;
+  case PPC::PRED_GE: return PPC::PRED_LT;
+  case PPC::PRED_GT: return PPC::PRED_LE;
+  case PPC::PRED_LE: return PPC::PRED_GT;
+  case PPC::PRED_NU: return PPC::PRED_UN;
+  case PPC::PRED_UN: return PPC::PRED_NU;
+  }
+  llvm_unreachable("Unknown PPC branch opcode!");
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
new file mode 100644
index 000000000000..ad2b01812816
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -0,0 +1,43 @@
+//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H
+#define LLVM_TARGET_POWERPC_PPCPREDICATES_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+// Generated files will use "namespace PPC". To avoid symbol clash,
+// undefine PPC here. PPC may be predefined on some hosts.
+#undef PPC
+
+namespace llvm {
+namespace PPC {
+  /// Predicate - These are "(BI << 5) | BO"  for various predicates.
+  enum Predicate {
+    PRED_LT     = (0 << 5) | 12,
+    PRED_LE     = (1 << 5) |  4,
+    PRED_EQ     = (2 << 5) | 12,
+    PRED_GE     = (0 << 5) |  4,
+    PRED_GT     = (1 << 5) | 12,
+    PRED_NE     = (2 << 5) |  4,
+    PRED_UN     = (3 << 5) | 12,
+    PRED_NU     = (3 << 5) |  4
+  };
+  
+  /// Invert the specified predicate.  != -> ==, < -> >=.
+  Predicate InvertPredicate(Predicate Opcode);
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h
new file mode 100644
index 000000000000..446b6854fb5b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.h
@@ -0,0 +1,90 @@
+//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// PowerPC back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_H
+#define LLVM_TARGET_POWERPC_H
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include <string>
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+  class PPCTargetMachine;
+  class FunctionPass;
+  class ImmutablePass;
+  class JITCodeEmitter;
+  class MachineInstr;
+  class AsmPrinter;
+  class MCInst;
+
+  FunctionPass *createPPCCTRLoops();
+  FunctionPass *createPPCBranchSelectionPass();
+  FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+  FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
+                                            JITCodeEmitter &MCE);
+  void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                    AsmPrinter &AP, bool isDarwin);
+
+  /// \brief Creates an PPC-specific Target Transformation Info pass.
+  ImmutablePass *createPPCTargetTransformInfoPass(const PPCTargetMachine *TM);
+  
+  namespace PPCII {
+    
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // PPC Specific MachineOperand flags.
+    MO_NO_FLAG,
+    
+    /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$stub" symbol.  This is used for calls
+    /// and jumps to external functions on Tiger and earlier.
+    MO_DARWIN_STUB = 1,
+    
+    /// MO_PIC_FLAG - If this bit is set, the symbol reference is relative to
+    /// the function's picbase, e.g. lo16(symbol-picbase).
+    MO_PIC_FLAG = 2,
+
+    /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
+    /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
+    MO_NLP_FLAG = 4,
+    
+    /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
+    /// symbol with hidden visibility.  This causes a different kind of
+    /// non-lazy-pointer to be generated.
+    MO_NLP_HIDDEN_FLAG = 8,
+
+    /// The next are not flags but distinct values.
+    MO_ACCESS_MASK = 0xf0,
+
+    /// MO_LO16, MO_HA16 - lo16(symbol) and ha16(symbol)
+    MO_LO16 = 1 << 4,
+    MO_HA16 = 2 << 4,
+
+    MO_TPREL16_HA = 3 << 4,
+    MO_TPREL16_LO = 4 << 4,
+
+    /// These values identify relocations on immediates folded
+    /// into memory operations.
+    MO_DTPREL16_LO = 5 << 4,
+    MO_TLSLD16_LO  = 6 << 4,
+    MO_TOC16_LO    = 7 << 4
+  };
+  } // end namespace PPCII
+  
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td
new file mode 100644
index 000000000000..389216278ee4
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.td
@@ -0,0 +1,240 @@
+//===-- PPC.td - Describe the PowerPC Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC Subtarget features.
+//
+ 
+//===----------------------------------------------------------------------===//
+// CPU Directives                                                             //
+//===----------------------------------------------------------------------===//
+
+def Directive440 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_440", "">;
+def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">;
+def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">;
+def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">;
+def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">;
+def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
+def Directive32  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
+def Directive64  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
+def DirectiveA2  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
+                                       "PPC::DIR_E500mc", "">;
+def DirectiveE5500  : SubtargetFeature<"", "DarwinDirective", 
+                                       "PPC::DIR_E5500", "">;
+def DirectivePwr3: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR3", "">;
+def DirectivePwr4: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR4", "">;
+def DirectivePwr5: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5", "">;
+def DirectivePwr5x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR5X", "">;
+def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
+def DirectivePwr6x: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6X", "">;
+def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
+
+def Feature64Bit     : SubtargetFeature<"64bit","Has64BitSupport", "true",
+                                        "Enable 64-bit instructions">;
+def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
+                              "Enable 64-bit registers usage for ppc32 [beta]">;
+def FeatureAltivec   : SubtargetFeature<"altivec","HasAltivec", "true",
+                                        "Enable Altivec instructions">;
+def FeatureMFOCRF    : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
+                                        "Enable the MFOCRF instruction">;
+def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
+                                        "Enable the fsqrt instruction">;
+def FeatureFRE       : SubtargetFeature<"fre", "HasFRE", "true",
+                                        "Enable the fre instruction">;
+def FeatureFRES      : SubtargetFeature<"fres", "HasFRES", "true",
+                                        "Enable the fres instruction">;
+def FeatureFRSQRTE   : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
+                                        "Enable the frsqrte instruction">;
+def FeatureFRSQRTES  : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
+                                        "Enable the frsqrtes instruction">;
+def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
+                              "Assume higher precision reciprocal estimates">;
+def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
+                                        "Enable the stfiwx instruction">;
+def FeatureLFIWAX    : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
+                                        "Enable the lfiwax instruction">;
+def FeatureFPRND     : SubtargetFeature<"fprnd", "HasFPRND", "true",
+                                        "Enable the fri[mnpz] instructions">;
+def FeatureFPCVT     : SubtargetFeature<"fpcvt", "HasFPCVT", "true",
+  "Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions">;
+def FeatureISEL      : SubtargetFeature<"isel","HasISEL", "true",
+                                        "Enable the isel instruction">;
+def FeaturePOPCNTD   : SubtargetFeature<"popcntd","HasPOPCNTD", "true",
+                                        "Enable the popcnt[dw] instructions">;
+def FeatureLDBRX     : SubtargetFeature<"ldbrx","HasLDBRX", "true",
+                                        "Enable the ldbrx instruction">;
+def FeatureBookE     : SubtargetFeature<"booke", "IsBookE", "true",
+                                        "Enable Book E instructions">;
+def FeatureQPX       : SubtargetFeature<"qpx","HasQPX", "true",
+                                        "Enable QPX instructions">;
+
+// Note: Future features to add when support is extended to more
+// recent ISA levels:
+//
+// CMPB         p6, p6x, p7        cmpb
+// DFP          p6, p6x, p7        decimal floating-point instructions
+// POPCNTB      p5 through p7      popcntb and related instructions
+// VSX          p7                 vector-scalar instruction set
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "PPCRegisterInfo.td"
+include "PPCSchedule.td"
+include "PPCInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC processors supported.
+//
+
+def : Processor<"generic", G3Itineraries, [Directive32]>;
+def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL,
+                                           FeatureFRES, FeatureFRSQRTE,
+                                           FeatureBookE]>;
+def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL,
+                                           FeatureFRES, FeatureFRSQRTE,
+                                           FeatureBookE]>;
+def : Processor<"601", G3Itineraries, [Directive601]>;
+def : Processor<"602", G3Itineraries, [Directive602]>;
+def : Processor<"603", G3Itineraries, [Directive603,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"603e", G3Itineraries, [Directive603,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"603ev", G3Itineraries, [Directive603,
+                                         FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"604", G3Itineraries, [Directive604,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"604e", G3Itineraries, [Directive604,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"620", G3Itineraries, [Directive620,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"750", G4Itineraries, [Directive750,
+                                       FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g3", G3Itineraries, [Directive750,
+                                      FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec,
+                                        FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec,
+                                      FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                            FeatureFRES, FeatureFRSQRTE]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive7400, FeatureAltivec,
+                                           FeatureFRES, FeatureFRSQRTE]>;
+def : ProcessorModel<"970", G5Model,
+                  [Directive970, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt,
+                   FeatureFRES, FeatureFRSQRTE, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"g5", G5Model,
+                  [Directive970, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
+                   FeatureFRES, FeatureFRSQRTE,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"e500mc", PPCE500mcModel,
+                  [DirectiveE500mc, FeatureMFOCRF,
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+def : ProcessorModel<"e5500", PPCE5500Model,
+                  [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+def : ProcessorModel<"a2", PPCA2Model,
+                  [DirectiveA2, FeatureBookE, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
+               /*, Feature64BitRegs */]>;
+def : ProcessorModel<"a2q", PPCA2Model,
+                  [DirectiveA2, FeatureBookE, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
+               /*, Feature64BitRegs */, FeatureQPX]>;
+def : ProcessorModel<"pwr3", G5Model,
+                  [DirectivePwr3, FeatureAltivec,
+                   FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
+                   FeatureSTFIWX, Feature64Bit]>;
+def : ProcessorModel<"pwr4", G5Model,
+                  [DirectivePwr4, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRES, FeatureFRSQRTE,
+                   FeatureSTFIWX, Feature64Bit]>;
+def : ProcessorModel<"pwr5", G5Model,
+                  [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, Feature64Bit]>;
+def : ProcessorModel<"pwr5x", G5Model,
+                  [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureSTFIWX, FeatureFPRND, Feature64Bit]>;
+def : ProcessorModel<"pwr6", G5Model,
+                  [DirectivePwr6, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"pwr6x", G5Model,
+                  [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
+                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
+                   FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, Feature64Bit]>;
+def : ProcessorModel<"pwr7", G5Model,
+                  [DirectivePwr7, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
+                   FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
+                   FeatureFPRND, FeatureFPCVT, FeatureISEL,
+                   FeaturePOPCNTD, FeatureLDBRX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"ppc", G3Itineraries, [Directive32]>;
+def : ProcessorModel<"ppc64", G5Model,
+                  [Directive64, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
+                   FeatureFRSQRTE, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "PPCCallingConv.td"
+
+def PPCInstrInfo : InstrInfo {
+  let isLittleEndianEncoding = 1;
+}
+
+def PPCAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+def PPC : Target {
+  // Information about the instructions.
+  let InstructionSet = PPCInstrInfo;
+  
+  let AssemblyWriters = [PPCAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
new file mode 100644
index 000000000000..96a9f0a39006
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -0,0 +1,1126 @@
+//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PowerPC assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+// Documentation at http://developer.apple.com/documentation/DeveloperTools/
+// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asmprinter"
+#include "PPC.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+namespace {
+  class PPCAsmPrinter : public AsmPrinter {
+  protected:
+    MapVector<MCSymbol*, MCSymbol*> TOC;
+    const PPCSubtarget &Subtarget;
+    uint64_t TOCLabelID;
+  public:
+    explicit PPCAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer),
+        Subtarget(TM.getSubtarget<PPCSubtarget>()), TOCLabelID(0) {}
+
+    virtual const char *getPassName() const {
+      return "PowerPC Assembly Printer";
+    }
+
+    MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
+
+    virtual void EmitInstruction(const MachineInstr *MI);
+
+    void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O);
+
+    MachineLocation getDebugValueLocation(const MachineInstr *MI) const {
+      MachineLocation Location;
+      assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+      // Frame address.  Currently handles register +- offset only.
+      if (MI->getOperand(0).isReg() && MI->getOperand(2).isImm())
+        Location.set(MI->getOperand(0).getReg(), MI->getOperand(2).getImm());
+      else {
+        DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+      }
+      return Location;
+    }
+  };
+
+  /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
+  class PPCLinuxAsmPrinter : public PPCAsmPrinter {
+  public:
+    explicit PPCLinuxAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : PPCAsmPrinter(TM, Streamer) {}
+
+    virtual const char *getPassName() const {
+      return "Linux PPC Assembly Printer";
+    }
+
+    bool doFinalization(Module &M);
+
+    virtual void EmitFunctionEntryLabel();
+
+    void EmitFunctionBodyEnd();
+  };
+
+  /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
+  /// OS X
+  class PPCDarwinAsmPrinter : public PPCAsmPrinter {
+  public:
+    explicit PPCDarwinAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : PPCAsmPrinter(TM, Streamer) {}
+
+    virtual const char *getPassName() const {
+      return "Darwin PPC Assembly Printer";
+    }
+
+    bool doFinalization(Module &M);
+    void EmitStartOfAsmFile(Module &M);
+
+    void EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs);
+  };
+} // end of anonymous namespace
+
+/// stripRegisterPrefix - This method strips the character prefix from a
+/// register name so that only the number is left.  Used by for linux asm.
+static const char *stripRegisterPrefix(const char *RegName) {
+  switch (RegName[0]) {
+    case 'r':
+    case 'f':
+    case 'v': return RegName + 1;
+    case 'c': if (RegName[1] == 'r') return RegName + 2;
+  }
+  
+  return RegName;
+}
+
+void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register: {
+    const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg());
+    // Linux assembler (Others?) does not take register mnemonics.
+    // FIXME - What about special registers used in mfspr/mtspr?
+    if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+    O << RegName;
+    return;
+  }
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_JumpTableIndex:
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    // FIXME: PIC relocation model
+    return;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    return;
+  case MachineOperand::MO_BlockAddress:
+    O << *GetBlockAddressSymbol(MO.getBlockAddress());
+    return;
+  case MachineOperand::MO_ExternalSymbol: {
+    // Computing the address of an external symbol, not calling it.
+    if (TM.getRelocationModel() == Reloc::Static) {
+      O << *GetExternalSymbolSymbol(MO.getSymbolName());
+      return;
+    }
+
+    MCSymbol *NLPSym = 
+      OutContext.GetOrCreateSymbol(StringRef(MAI->getGlobalPrefix())+
+                                   MO.getSymbolName()+"$non_lazy_ptr");
+    MachineModuleInfoImpl::StubValueTy &StubSym = 
+      MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(NLPSym);
+    if (StubSym.getPointer() == 0)
+      StubSym = MachineModuleInfoImpl::
+        StubValueTy(GetExternalSymbolSymbol(MO.getSymbolName()), true);
+    
+    O << *NLPSym;
+    return;
+  }
+  case MachineOperand::MO_GlobalAddress: {
+    // Computing the address of a global symbol, not calling it.
+    const GlobalValue *GV = MO.getGlobal();
+    MCSymbol *SymToPrint;
+
+    // External or weakly linked global variables need non-lazily-resolved stubs
+    if (TM.getRelocationModel() != Reloc::Static &&
+        (GV->isDeclaration() || GV->isWeakForLinker())) {
+      if (!GV->hasHiddenVisibility()) {
+        SymToPrint = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+        MachineModuleInfoImpl::StubValueTy &StubSym = 
+          MMI->getObjFileInfo<MachineModuleInfoMachO>()
+            .getGVStubEntry(SymToPrint);
+        if (StubSym.getPointer() == 0)
+          StubSym = MachineModuleInfoImpl::
+            StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+      } else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
+                 GV->hasAvailableExternallyLinkage()) {
+        SymToPrint = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+        
+        MachineModuleInfoImpl::StubValueTy &StubSym = 
+          MMI->getObjFileInfo<MachineModuleInfoMachO>().
+                    getHiddenGVStubEntry(SymToPrint);
+        if (StubSym.getPointer() == 0)
+          StubSym = MachineModuleInfoImpl::
+            StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+      } else {
+        SymToPrint = Mang->getSymbol(GV);
+      }
+    } else {
+      SymToPrint = Mang->getSymbol(GV);
+    }
+    
+    O << *SymToPrint;
+
+    printOffset(MO.getOffset(), O);
+    return;
+  }
+
+  default:
+    O << "<unknown operand type: " << MO.getType() << ">";
+    return;
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                    unsigned AsmVariant,
+                                    const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'c': // Don't print "$" before a global var name or constant.
+      break; // PPC never has a prefix.
+    case 'L': // Write second word of DImode reference.
+      // Verify that this operand has two consecutive registers.
+      if (!MI->getOperand(OpNo).isReg() ||
+          OpNo+1 == MI->getNumOperands() ||
+          !MI->getOperand(OpNo+1).isReg())
+        return true;
+      ++OpNo;   // Return the high-part.
+      break;
+    case 'I':
+      // Write 'i' if an integer constant, otherwise nothing.  Used to print
+      // addi vs add, etc.
+      if (MI->getOperand(OpNo).isImm())
+        O << "i";
+      return false;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+// At the moment, all inline asm memory operands are a single register.
+// In any case, the output of this routine should always be just one
+// assembler operand.
+
+bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                          unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'y': // A memory reference for an X-form instruction
+      {
+        const char *RegName = "r0";
+        if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+        O << RegName << ", ";
+        printOperand(MI, OpNo, O);
+        return false;
+      }
+    }
+  }
+
+  assert(MI->getOperand(OpNo).isReg());
+  O << "0(";
+  printOperand(MI, OpNo, O);
+  O << ")";
+  return false;
+}
+
+
+/// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry
+/// exists for it.  If not, create one.  Then return a symbol that references
+/// the TOC entry.
+MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
+
+  MCSymbol *&TOCEntry = TOC[Sym];
+
+  // To avoid name clash check if the name already exists.
+  while (TOCEntry == 0) {
+    if (OutContext.LookupSymbol(Twine(MAI->getPrivateGlobalPrefix()) +
+                                "C" + Twine(TOCLabelID++)) == 0) {
+      TOCEntry = GetTempSymbol("C", TOCLabelID);
+    }
+  }
+
+  return TOCEntry;
+}
+
+
+/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
+/// the current output stream.
+///
+void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MCInst TmpInst;
+  
+  // Lower multi-instruction pseudo operations.
+  switch (MI->getOpcode()) {
+  default: break;
+  case TargetOpcode::DBG_VALUE: {
+    if (!isVerbose() || !OutStreamer.hasRawTextSupport()) return;
+      
+    SmallString<32> Str;
+    raw_svector_ostream O(Str);
+    unsigned NOps = MI->getNumOperands();
+    assert(NOps==4);
+    O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+    // cast away const; DIetc do not take const operands for some reason.
+    DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+    O << V.getName();
+    O << " <- ";
+    // Frame address.  Currently handles register +- offset only.
+    assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+    O << '['; printOperand(MI, 0, O); O << '+'; printOperand(MI, 1, O);
+    O << ']';
+    O << "+";
+    printOperand(MI, NOps-2, O);
+    OutStreamer.EmitRawText(O.str());
+    return;
+  }
+      
+  case PPC::MovePCtoLR:
+  case PPC::MovePCtoLR8: {
+    // Transform %LR = MovePCtoLR
+    // Into this, where the label is the PIC base: 
+    //     bl L1$pb
+    // L1$pb:
+    MCSymbol *PICBase = MF->getPICBaseSymbol();
+    
+    // Emit the 'bl'.
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL)
+      // FIXME: We would like an efficient form for this, so we don't have to do
+      // a lot of extra uniquing.
+      .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
+    
+    // Emit the label.
+    OutStreamer.EmitLabel(PICBase);
+    return;
+  }
+  case PPC::LDtocJTI:
+  case PPC::LDtocCPT:
+  case PPC::LDtoc: {
+    // Transform %X3 = LDtoc <ga:@min1>, %X2
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to LD, and the global address operand to be a
+    // reference to the TOC entry we will synthesize later.
+    TmpInst.setOpcode(PPC::LD);
+    const MachineOperand &MO = MI->getOperand(1);
+
+    // Map symbol -> label of TOC entry
+    assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+    MCSymbol *MOSymbol = 0;
+    if (MO.isGlobal())
+      MOSymbol = Mang->getSymbol(MO.getGlobal());
+    else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = GetJTISymbol(MO.getIndex());
+
+    MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC_ENTRY,
+                              OutContext);
+    TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+      
+  case PPC::ADDIStocHA: {
+    // Transform %Xd = ADDIStocHA %X2, <ga:@sym>
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to ADDIS8.  If the global address is external,
+    // has common linkage, is a function address, or is a jump table
+    // address, then generate a TOC entry and reference that.  Otherwise
+    // reference the symbol directly.
+    TmpInst.setOpcode(PPC::ADDIS8);
+    const MachineOperand &MO = MI->getOperand(2);
+    assert((MO.isGlobal() || MO.isCPI() || MO.isJTI()) &&
+           "Invalid operand for ADDIStocHA!");
+    MCSymbol *MOSymbol = 0;
+    bool IsExternal = false;
+    bool IsFunction = false;
+    bool IsCommon = false;
+    bool IsAvailExt = false;
+
+    if (MO.isGlobal()) {
+      const GlobalValue *GValue = MO.getGlobal();
+      const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
+      const GlobalValue *RealGValue = GAlias ?
+        GAlias->resolveAliasedGlobal(false) : GValue;
+      MOSymbol = Mang->getSymbol(RealGValue);
+      const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
+      IsExternal = GVar && !GVar->hasInitializer();
+      IsCommon = GVar && RealGValue->hasCommonLinkage();
+      IsFunction = !GVar;
+      IsAvailExt = GVar && RealGValue->hasAvailableExternallyLinkage();
+    } else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isJTI())
+      MOSymbol = GetJTISymbol(MO.getIndex());
+
+    if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI())
+      MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC16_HA,
+                              OutContext);
+    TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case PPC::LDtocL: {
+    // Transform %Xd = LDtocL <ga:@sym>, %Xs
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to LD.  If the global address is external, has
+    // common linkage, or is a jump table address, then reference the
+    // associated TOC entry.  Otherwise reference the symbol directly.
+    TmpInst.setOpcode(PPC::LD);
+    const MachineOperand &MO = MI->getOperand(1);
+    assert((MO.isGlobal() || MO.isJTI() || MO.isCPI()) &&
+           "Invalid operand for LDtocL!");
+    MCSymbol *MOSymbol = 0;
+
+    if (MO.isJTI())
+      MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
+    else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+    else if (MO.isGlobal()) {
+      const GlobalValue *GValue = MO.getGlobal();
+      const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
+      const GlobalValue *RealGValue = GAlias ?
+        GAlias->resolveAliasedGlobal(false) : GValue;
+      MOSymbol = Mang->getSymbol(RealGValue);
+      const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
+    
+      if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() ||
+          RealGValue->hasAvailableExternallyLinkage())
+        MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+    }
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC16_LO,
+                              OutContext);
+    TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case PPC::ADDItocL: {
+    // Transform %Xd = ADDItocL %Xs, <ga:@sym>
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to ADDI8.  If the global address is external, then
+    // generate a TOC entry and reference that.  Otherwise reference the
+    // symbol directly.
+    TmpInst.setOpcode(PPC::ADDI8);
+    const MachineOperand &MO = MI->getOperand(2);
+    assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
+    MCSymbol *MOSymbol = 0;
+    bool IsExternal = false;
+    bool IsFunction = false;
+
+    if (MO.isGlobal()) {
+      const GlobalValue *GValue = MO.getGlobal();
+      const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
+      const GlobalValue *RealGValue = GAlias ?
+        GAlias->resolveAliasedGlobal(false) : GValue;
+      MOSymbol = Mang->getSymbol(RealGValue);
+      const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
+      IsExternal = GVar && !GVar->hasInitializer();
+      IsFunction = !GVar;
+    } else if (MO.isCPI())
+      MOSymbol = GetCPISymbol(MO.getIndex());
+
+    if (IsFunction || IsExternal)
+      MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC16_LO,
+                              OutContext);
+    TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case PPC::ADDISgotTprelHA: {
+    // Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *SymGotTprel =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL16_HA,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(PPC::X2)
+                                .addExpr(SymGotTprel));
+    return;
+  }
+  case PPC::LDgotTprelL: {
+    // Transform %Xd = LDgotTprelL <ga:@sym>, %Xs
+    LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+
+    // Change the opcode to LD.
+    TmpInst.setOpcode(PPC::LD);
+    const MachineOperand &MO = MI->getOperand(1);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *Exp =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL16_LO,
+                              OutContext);
+    TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  case PPC::ADDIStlsgdHA: {
+    // Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X2, sym@got@tlsgd@ha
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *SymGotTlsGD =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_HA,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(PPC::X2)
+                                .addExpr(SymGotTlsGD));
+    return;
+  }
+  case PPC::ADDItlsgdL: {
+    // Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym>
+    // Into:      %Xd = ADDI8 %Xs, sym@got@tlsgd@l
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *SymGotTlsGD =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_LO,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(MI->getOperand(1).getReg())
+                                .addExpr(SymGotTlsGD));
+    return;
+  }
+  case PPC::GETtlsADDR: {
+    // Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
+    // Into:      BL8_NOP_TLSGD __tls_get_addr(sym@tlsgd)
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+
+    StringRef Name = "__tls_get_addr";
+    MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name);
+    const MCSymbolRefExpr *TlsRef = 
+      MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *SymVar =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_TLSGD)
+                                .addExpr(TlsRef)
+                                .addExpr(SymVar));
+    return;
+  }
+  case PPC::ADDIStlsldHA: {
+    // Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X2, sym@got@tlsld@ha
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *SymGotTlsLD =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(PPC::X2)
+                                .addExpr(SymGotTlsLD));
+    return;
+  }
+  case PPC::ADDItlsldL: {
+    // Transform: %Xd = ADDItlsldL %Xs, <ga:@sym>
+    // Into:      %Xd = ADDI8 %Xs, sym@got@tlsld@l
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *SymGotTlsLD =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(MI->getOperand(1).getReg())
+                                .addExpr(SymGotTlsLD));
+    return;
+  }
+  case PPC::GETtlsldADDR: {
+    // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
+    // Into:      BL8_NOP_TLSLD __tls_get_addr(sym@tlsld)
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+
+    StringRef Name = "__tls_get_addr";
+    MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name);
+    const MCSymbolRefExpr *TlsRef = 
+      MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *SymVar =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSLD,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_TLSLD)
+                                .addExpr(TlsRef)
+                                .addExpr(SymVar));
+    return;
+  }
+  case PPC::ADDISdtprelHA: {
+    // Transform: %Xd = ADDISdtprelHA %X3, <ga:@sym>
+    // Into:      %Xd = ADDIS8 %X3, sym@dtprel@ha
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *SymDtprel =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL16_HA,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(PPC::X3)
+                                .addExpr(SymDtprel));
+    return;
+  }
+  case PPC::ADDIdtprelL: {
+    // Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym>
+    // Into:      %Xd = ADDI8 %Xs, sym@dtprel@l
+    assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
+    const MachineOperand &MO = MI->getOperand(2);
+    const GlobalValue *GValue = MO.getGlobal();
+    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    const MCExpr *SymDtprel =
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL16_LO,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8)
+                                .addReg(MI->getOperand(0).getReg())
+                                .addReg(MI->getOperand(1).getReg())
+                                .addExpr(SymDtprel));
+    return;
+  }
+  case PPC::MFCRpseud:
+  case PPC::MFCR8pseud:
+    // Transform: %R3 = MFCRpseud %CR7
+    // Into:      %R3 = MFCR      ;; cr7
+    OutStreamer.AddComment(PPCInstPrinter::
+                           getRegisterName(MI->getOperand(1).getReg()));
+    OutStreamer.EmitInstruction(MCInstBuilder(Subtarget.isPPC64() ? PPC::MFCR8 : PPC::MFCR)
+      .addReg(MI->getOperand(0).getReg()));
+    return;
+  case PPC::SYNC:
+    // In Book E sync is called msync, handle this special case here...
+    if (Subtarget.isBookE()) {
+      OutStreamer.EmitRawText(StringRef("\tmsync"));
+      return;
+    }
+  }
+
+  LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
+void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
+  if (!Subtarget.isPPC64())  // linux/ppc32 - Normal entry label.
+    return AsmPrinter::EmitFunctionEntryLabel();
+    
+  // Emit an official procedure descriptor.
+  const MCSection *Current = OutStreamer.getCurrentSection();
+  const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".opd",
+      ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
+      SectionKind::getReadOnly());
+  OutStreamer.SwitchSection(Section);
+  OutStreamer.EmitLabel(CurrentFnSym);
+  OutStreamer.EmitValueToAlignment(8);
+  MCSymbol *Symbol1 = 
+    OutContext.GetOrCreateSymbol(".L." + Twine(CurrentFnSym->getName()));
+  // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
+  // entry point.
+  OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext),
+			8 /*size*/);
+  MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC."));
+  // Generates a R_PPC64_TOC relocation for TOC base insertion.
+  OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2,
+                        MCSymbolRefExpr::VK_PPC_TOC, OutContext),
+                        8/*size*/);
+  // Emit a null environment pointer.
+  OutStreamer.EmitIntValue(0, 8 /* size */);
+  OutStreamer.SwitchSection(Current);
+
+  MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol(
+                          ".L." + Twine(CurrentFnSym->getName()));
+  OutStreamer.EmitLabel(RealFnSym);
+  CurrentFnSymForSize = RealFnSym;
+}
+
+
+bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
+  const DataLayout *TD = TM.getDataLayout();
+
+  bool isPPC64 = TD->getPointerSizeInBits() == 64;
+
+  if (isPPC64 && !TOC.empty()) {
+    const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".toc",
+        ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
+        SectionKind::getReadOnly());
+    OutStreamer.SwitchSection(Section);
+
+    for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
+         E = TOC.end(); I != E; ++I) {
+      OutStreamer.EmitLabel(I->second);
+      MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName());
+      OutStreamer.EmitTCEntry(*S);
+    }
+  }
+
+  MachineModuleInfoELF &MMIELF =
+    MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+  MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+  if (!Stubs.empty()) {
+    OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      // L_foo$stub:
+      OutStreamer.EmitLabel(Stubs[i].first);
+      //   .long _foo
+      OutStreamer.EmitValue(MCSymbolRefExpr::Create(Stubs[i].second.getPointer(),
+                                                    OutContext),
+                            isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/);
+    }
+
+    Stubs.clear();
+    OutStreamer.AddBlankLine();
+  }
+
+  return AsmPrinter::doFinalization(M);
+}
+
+/// EmitFunctionBodyEnd - Print the traceback table before the .size
+/// directive.
+///
+void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
+  // Only the 64-bit target requires a traceback table.  For now,
+  // we only emit the word of zeroes that GDB requires to find
+  // the end of the function, and zeroes for the eight-byte
+  // mandatory fields.
+  // FIXME: We should fill in the eight-byte mandatory fields as described in
+  // the PPC64 ELF ABI (this is a low-priority item because GDB does not
+  // currently make use of these fields).
+  if (Subtarget.isPPC64()) {
+    OutStreamer.EmitIntValue(0, 4/*size*/);
+    OutStreamer.EmitIntValue(0, 8/*size*/);
+  }
+}
+
+void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  static const char *const CPUDirectives[] = {
+    "",
+    "ppc",
+    "ppc440",
+    "ppc601",
+    "ppc602",
+    "ppc603",
+    "ppc7400",
+    "ppc750",
+    "ppc970",
+    "ppcA2",
+    "ppce500mc",
+    "ppce5500",
+    "power3",
+    "power4",
+    "power5",
+    "power5x",
+    "power6",
+    "power6x",
+    "power7",
+    "ppc64"
+  };
+
+  unsigned Directive = Subtarget.getDarwinDirective();
+  if (Subtarget.hasMFOCRF() && Directive < PPC::DIR_970)
+    Directive = PPC::DIR_970;
+  if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400)
+    Directive = PPC::DIR_7400;
+  if (Subtarget.isPPC64() && Directive < PPC::DIR_64)
+    Directive = PPC::DIR_64;
+  assert(Directive <= PPC::DIR_64 && "Directive out of range.");
+  
+  // FIXME: This is a total hack, finish mc'izing the PPC backend.
+  if (OutStreamer.hasRawTextSupport()) {
+    assert(Directive < sizeof(CPUDirectives) / sizeof(*CPUDirectives) &&
+           "CPUDirectives[] might not be up-to-date!");
+    OutStreamer.EmitRawText("\t.machine " + Twine(CPUDirectives[Directive]));
+  }
+
+  // Prime text sections so they are adjacent.  This reduces the likelihood a
+  // large data or debug section causes a branch to exceed 16M limit.
+  const TargetLoweringObjectFileMachO &TLOFMacho = 
+    static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+  OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    OutStreamer.SwitchSection(
+           OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
+                                      MCSectionMachO::S_SYMBOL_STUBS |
+                                      MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                                      32, SectionKind::getText()));
+  } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
+    OutStreamer.SwitchSection(
+           OutContext.getMachOSection("__TEXT","__symbol_stub1",
+                                      MCSectionMachO::S_SYMBOL_STUBS |
+                                      MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                                      16, SectionKind::getText()));
+  }
+  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+}
+
+static MCSymbol *GetLazyPtr(MCSymbol *Sym, MCContext &Ctx) {
+  // Remove $stub suffix, add $lazy_ptr.
+  StringRef NoStub = Sym->getName().substr(0, Sym->getName().size()-5);
+  return Ctx.GetOrCreateSymbol(NoStub + "$lazy_ptr");
+}
+
+static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) {
+  // Add $tmp suffix to $stub, yielding $stub$tmp.
+  return Ctx.GetOrCreateSymbol(Sym->getName() + "$tmp");
+}
+
+void PPCDarwinAsmPrinter::
+EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
+  bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+  
+  const TargetLoweringObjectFileMachO &TLOFMacho = 
+    static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+
+  // .lazy_symbol_pointer
+  const MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection();
+  
+  // Output stubs for dynamically-linked functions
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    const MCSection *StubSection = 
+    OutContext.getMachOSection("__TEXT", "__picsymbolstub1",
+                               MCSectionMachO::S_SYMBOL_STUBS |
+                               MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                               32, SectionKind::getText());
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      OutStreamer.SwitchSection(StubSection);
+      EmitAlignment(4);
+      
+      MCSymbol *Stub = Stubs[i].first;
+      MCSymbol *RawSym = Stubs[i].second.getPointer();
+      MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
+      MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext);
+                                           
+      OutStreamer.EmitLabel(Stub);
+      OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+
+      const MCExpr *Anon = MCSymbolRefExpr::Create(AnonSymbol, OutContext);
+
+      // mflr r0
+      OutStreamer.EmitInstruction(MCInstBuilder(PPC::MFLR).addReg(PPC::R0));
+      // bcl 20, 31, AnonSymbol
+      OutStreamer.EmitInstruction(MCInstBuilder(PPC::BCLalways).addExpr(Anon));
+      OutStreamer.EmitLabel(AnonSymbol);
+      // mflr r11
+      OutStreamer.EmitInstruction(MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
+      // addis r11, r11, ha16(LazyPtr - AnonSymbol)
+      const MCExpr *Sub =
+        MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(LazyPtr, OutContext),
+                                Anon, OutContext);
+      OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS)
+        .addReg(PPC::R11)
+        .addReg(PPC::R11)
+        .addExpr(Sub));
+      // mtlr r0
+      OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTLR).addReg(PPC::R0));
+
+      // ldu r12, lo16(LazyPtr - AnonSymbol)(r11)
+      // lwzu r12, lo16(LazyPtr - AnonSymbol)(r11)
+      OutStreamer.EmitInstruction(MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
+        .addReg(PPC::R12)
+        .addExpr(Sub).addExpr(Sub)
+        .addReg(PPC::R11));
+      // mtctr r12
+      OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
+      // bctr
+      OutStreamer.EmitInstruction(MCInstBuilder(PPC::BCTR));
+
+      OutStreamer.SwitchSection(LSPSection);
+      OutStreamer.EmitLabel(LazyPtr);
+      OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+
+      MCSymbol *DyldStubBindingHelper =
+        OutContext.GetOrCreateSymbol(StringRef("dyld_stub_binding_helper"));
+      if (isPPC64) {
+        // .quad dyld_stub_binding_helper
+        OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 8);
+      } else {
+        // .long dyld_stub_binding_helper
+        OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 4);
+      }
+    }
+    OutStreamer.AddBlankLine();
+    return;
+  }
+  
+  const MCSection *StubSection =
+    OutContext.getMachOSection("__TEXT","__symbol_stub1",
+                               MCSectionMachO::S_SYMBOL_STUBS |
+                               MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                               16, SectionKind::getText());
+  for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+    MCSymbol *Stub = Stubs[i].first;
+    MCSymbol *RawSym = Stubs[i].second.getPointer();
+    MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
+
+    OutStreamer.SwitchSection(StubSection);
+    EmitAlignment(4);
+    OutStreamer.EmitLabel(Stub);
+    OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+    // lis r11, ha16(LazyPtr)
+    const MCExpr *LazyPtrHa16 =
+      MCSymbolRefExpr::Create(LazyPtr, MCSymbolRefExpr::VK_PPC_DARWIN_HA16,
+                              OutContext);
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::LIS)
+      .addReg(PPC::R11)
+      .addExpr(LazyPtrHa16));
+
+    const MCExpr *LazyPtrLo16 =
+      MCSymbolRefExpr::Create(LazyPtr, MCSymbolRefExpr::VK_PPC_DARWIN_LO16,
+                              OutContext);
+    // ldu r12, lo16(LazyPtr)(r11)
+    // lwzu r12, lo16(LazyPtr)(r11)
+    OutStreamer.EmitInstruction(MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
+      .addReg(PPC::R12)
+      .addExpr(LazyPtrLo16).addExpr(LazyPtrLo16)
+      .addReg(PPC::R11));
+
+    // mtctr r12
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
+    // bctr
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::BCTR));
+
+    OutStreamer.SwitchSection(LSPSection);
+    OutStreamer.EmitLabel(LazyPtr);
+    OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+
+    MCSymbol *DyldStubBindingHelper =
+      OutContext.GetOrCreateSymbol(StringRef("dyld_stub_binding_helper"));
+    if (isPPC64) {
+      // .quad dyld_stub_binding_helper
+      OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 8);
+    } else {
+      // .long dyld_stub_binding_helper
+      OutStreamer.EmitSymbolValue(DyldStubBindingHelper, 4);
+    }
+  }
+  
+  OutStreamer.AddBlankLine();
+}
+
+
+bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
+  bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+
+  // Darwin/PPC always uses mach-o.
+  const TargetLoweringObjectFileMachO &TLOFMacho = 
+    static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
+  MachineModuleInfoMachO &MMIMacho =
+    MMI->getObjFileInfo<MachineModuleInfoMachO>();
+  
+  MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList();
+  if (!Stubs.empty())
+    EmitFunctionStubs(Stubs);
+
+  if (MAI->doesSupportExceptionHandling() && MMI) {
+    // Add the (possibly multiple) personalities to the set of global values.
+    // Only referenced functions get into the Personalities list.
+    const std::vector<const Function*> &Personalities = MMI->getPersonalities();
+    for (std::vector<const Function*>::const_iterator I = Personalities.begin(),
+         E = Personalities.end(); I != E; ++I) {
+      if (*I) {
+        MCSymbol *NLPSym = GetSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
+        MachineModuleInfoImpl::StubValueTy &StubSym =
+          MMIMacho.getGVStubEntry(NLPSym);
+        StubSym = MachineModuleInfoImpl::StubValueTy(Mang->getSymbol(*I), true);
+      }
+    }
+  }
+
+  // Output stubs for dynamically-linked functions.
+  Stubs = MMIMacho.GetGVStubList();
+  
+  // Output macho stubs for external and common global variables.
+  if (!Stubs.empty()) {
+    // Switch with ".non_lazy_symbol_pointer" directive.
+    OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+    EmitAlignment(isPPC64 ? 3 : 2);
+    
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      // L_foo$stub:
+      OutStreamer.EmitLabel(Stubs[i].first);
+      //   .indirect_symbol _foo
+      MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
+      OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+      if (MCSym.getInt())
+        // External to current translation unit.
+        OutStreamer.EmitIntValue(0, isPPC64 ? 8 : 4/*size*/);
+      else
+        // Internal to current translation unit.
+        //
+        // When we place the LSDA into the TEXT section, the type info pointers
+        // need to be indirect and pc-rel. We accomplish this by using NLPs.
+        // However, sometimes the types are local to the file. So we need to
+        // fill in the value for the NLP in those cases.
+        OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
+                                                      OutContext),
+                              isPPC64 ? 8 : 4/*size*/);
+    }
+
+    Stubs.clear();
+    OutStreamer.AddBlankLine();
+  }
+
+  Stubs = MMIMacho.GetHiddenGVStubList();
+  if (!Stubs.empty()) {
+    OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+    EmitAlignment(isPPC64 ? 3 : 2);
+    
+    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      // L_foo$stub:
+      OutStreamer.EmitLabel(Stubs[i].first);
+      //   .long _foo
+      OutStreamer.EmitValue(MCSymbolRefExpr::
+                            Create(Stubs[i].second.getPointer(),
+                                   OutContext),
+                            isPPC64 ? 8 : 4/*size*/);
+    }
+
+    Stubs.clear();
+    OutStreamer.AddBlankLine();
+  }
+
+  // Funny Darwin hack: This flag tells the linker that no global symbols
+  // contain code that falls through to other global symbols (e.g. the obvious
+  // implementation of multiple entry points).  If this doesn't occur, the
+  // linker can safely perform dead code stripping.  Since LLVM never generates
+  // code that does this, it is always safe to set.
+  OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+
+  return AsmPrinter::doFinalization(M);
+}
+
+/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
+/// for a MachineFunction to the given output stream, in a format that the
+/// Darwin assembler can deal with.
+///
+static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm,
+                                           MCStreamer &Streamer) {
+  const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>();
+
+  if (Subtarget->isDarwin())
+    return new PPCDarwinAsmPrinter(tm, Streamer);
+  return new PPCLinuxAsmPrinter(tm, Streamer);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializePowerPCAsmPrinter() { 
+  TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
+  TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
new file mode 100644
index 000000000000..bd1c37868110
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -0,0 +1,195 @@
+//===-- PPCBranchSelector.cpp - Emit long conditional branches ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 16 bits of displacement to reach their
+// target basic block.  It does this in two passes; a calculation of basic block
+// positions pass, and a branch pseudo op to machine branch opcode pass.  This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-branch-select"
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace llvm {
+  void initializePPCBSelPass(PassRegistry&);
+}
+
+namespace {
+  struct PPCBSel : public MachineFunctionPass {
+    static char ID;
+    PPCBSel() : MachineFunctionPass(ID) {
+      initializePPCBSelPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// BlockSizes - The sizes of the basic blocks in the function.
+    std::vector<unsigned> BlockSizes;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "PowerPC Branch Selector";
+    }
+  };
+  char PPCBSel::ID = 0;
+}
+
+INITIALIZE_PASS(PPCBSel, "ppc-branch-select", "PowerPC Branch Selector",
+                false, false)
+
+/// createPPCBranchSelectionPass - returns an instance of the Branch Selection
+/// Pass
+///
+FunctionPass *llvm::createPPCBranchSelectionPass() {
+  return new PPCBSel();
+}
+
+bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
+  const PPCInstrInfo *TII =
+                static_cast<const PPCInstrInfo*>(Fn.getTarget().getInstrInfo());
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = 0;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+
+    unsigned BlockSize = 0;
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+         MBBI != EE; ++MBBI)
+      BlockSize += TII->GetInstSizeInBytes(MBBI);
+    
+    BlockSizes[MBB->getNumber()] = BlockSize;
+    FuncSize += BlockSize;
+  }
+  
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to shrink any branches in this function.  This is a
+  // common case.
+  if (FuncSize < (1 << 15)) {
+    BlockSizes.clear();
+    return false;
+  }
+  
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+8
+  //     b MBB
+  //
+  bool MadeChange = true;
+  bool EverMadeChange = false;
+  while (MadeChange) {
+    // Iteratively expand branches until we reach a fixed point.
+    MadeChange = false;
+  
+    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+         ++MFI) {
+      MachineBasicBlock &MBB = *MFI;
+      unsigned MBBStartOffset = 0;
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+           I != E; ++I) {
+        if (I->getOpcode() != PPC::BCC || I->getOperand(2).isImm()) {
+          MBBStartOffset += TII->GetInstSizeInBytes(I);
+          continue;
+        }
+        
+        // Determine the offset from the current branch to the destination
+        // block.
+        MachineBasicBlock *Dest = I->getOperand(2).getMBB();
+        
+        int BranchSize;
+        if (Dest->getNumber() <= MBB.getNumber()) {
+          // If this is a backwards branch, the delta is the offset from the
+          // start of this block to this branch, plus the sizes of all blocks
+          // from this block to the dest.
+          BranchSize = MBBStartOffset;
+          
+          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        } else {
+          // Otherwise, add the size of the blocks between this block and the
+          // dest to the number of bytes left in this block.
+          BranchSize = -MBBStartOffset;
+
+          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        }
+
+        // If this branch is in range, ignore it.
+        if (isInt<16>(BranchSize)) {
+          MBBStartOffset += 4;
+          continue;
+        }
+
+        // Otherwise, we have to expand it to a long branch.
+        MachineInstr *OldBranch = I;
+        DebugLoc dl = OldBranch->getDebugLoc();
+ 
+        if (I->getOpcode() == PPC::BCC) {
+          // The BCC operands are:
+          // 0. PPC branch predicate
+          // 1. CR register
+          // 2. Target MBB
+          PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+          unsigned CRReg = I->getOperand(1).getReg();
+       
+          // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+          BuildMI(MBB, I, dl, TII->get(PPC::BCC))
+            .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+        } else if (I->getOpcode() == PPC::BDNZ) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDZ)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDNZ8) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDZ8)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDZ) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ)).addImm(2);
+        } else if (I->getOpcode() == PPC::BDZ8) {
+          BuildMI(MBB, I, dl, TII->get(PPC::BDNZ8)).addImm(2);
+        } else {
+           llvm_unreachable("Unhandled branch type!");
+        }
+        
+        // Uncond branch to the real destination.
+        I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
+
+        // Remove the old branch from the function.
+        OldBranch->eraseFromParent();
+        
+        // Remember that this instruction is 8-bytes, increase the size of the
+        // block by 4, remember to iterate.
+        BlockSizes[MBB.getNumber()] += 4;
+        MBBStartOffset += 8;
+        ++NumExpanded;
+        MadeChange = true;
+      }
+    }
+    EverMadeChange |= MadeChange;
+  }
+  
+  BlockSizes.clear();
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
new file mode 100644
index 000000000000..81a54d7015b0
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -0,0 +1,775 @@
+//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies loops where we can generate the PPC branch instructions
+// that decrement and test the count register (CTR) (bdnz and friends).
+// This pass is based on the HexagonHardwareLoops pass.
+//
+// The pattern that defines the induction variable can changed depending on
+// prior optimizations.  For example, the IndVarSimplify phase run by 'opt'
+// normalizes induction variables, and the Loop Strength Reduction pass
+// run by 'llc' may also make changes to the induction variable.
+// The pattern detected by this phase is due to running Strength Reduction.
+//
+// Criteria for CTR loops:
+//  - Countable loops (w/ ind. var for a trip count)
+//  - Assumes loops are normalized by IndVarSimplify
+//  - Try inner-most loops first
+//  - No nested CTR loops.
+//  - No function calls in loops.
+//
+//  Note: As with unconverted loops, PPCBranchSelector must be run after this
+//  pass in order to convert long-displacement jumps into jump pairs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ctrloops"
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
+
+namespace llvm {
+  void initializePPCCTRLoopsPass(PassRegistry&);
+}
+
+namespace {
+  class CountValue;
+  struct PPCCTRLoops : public MachineFunctionPass {
+    MachineLoopInfo       *MLI;
+    MachineRegisterInfo   *MRI;
+    const TargetInstrInfo *TII;
+
+  public:
+    static char ID;   // Pass identification, replacement for typeid
+
+    PPCCTRLoops() : MachineFunctionPass(ID) {
+      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "PPC CTR Loops"; }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    /// getCanonicalInductionVariable - Check to see if the loop has a canonical
+    /// induction variable.
+    /// Should be defined in MachineLoop. Based upon version in class Loop.
+    void getCanonicalInductionVariable(MachineLoop *L,
+                              SmallVector<MachineInstr *, 4> &IVars,
+                              SmallVector<MachineInstr *, 4> &IOps) const;
+
+    /// getTripCount - Return a loop-invariant LLVM register indicating the
+    /// number of times the loop will be executed.  If the trip-count cannot
+    /// be determined, this return null.
+    CountValue *getTripCount(MachineLoop *L,
+                             SmallVector<MachineInstr *, 2> &OldInsts) const;
+
+    /// isInductionOperation - Return true if the instruction matches the
+    /// pattern for an opertion that defines an induction variable.
+    bool isInductionOperation(const MachineInstr *MI, unsigned IVReg) const;
+
+    /// isInvalidOperation - Return true if the instruction is not valid within
+    /// a CTR loop.
+    bool isInvalidLoopOperation(const MachineInstr *MI) const;
+
+    /// containsInavlidInstruction - Return true if the loop contains an
+    /// instruction that inhibits using the CTR loop.
+    bool containsInvalidInstruction(MachineLoop *L) const;
+
+    /// converToCTRLoop - Given a loop, check if we can convert it to a
+    /// CTR loop.  If so, then perform the conversion and return true.
+    bool convertToCTRLoop(MachineLoop *L);
+
+    /// isDead - Return true if the instruction is now dead.
+    bool isDead(const MachineInstr *MI,
+                SmallVector<MachineInstr *, 1> &DeadPhis) const;
+
+    /// removeIfDead - Remove the instruction if it is now dead.
+    void removeIfDead(MachineInstr *MI);
+  };
+
+  char PPCCTRLoops::ID = 0;
+
+
+  // CountValue class - Abstraction for a trip count of a loop. A
+  // smaller vesrsion of the MachineOperand class without the concerns
+  // of changing the operand representation.
+  class CountValue {
+  public:
+    enum CountValueType {
+      CV_Register,
+      CV_Immediate
+    };
+  private:
+    CountValueType Kind;
+    union Values {
+      unsigned RegNum;
+      int64_t ImmVal;
+      Values(unsigned r) : RegNum(r) {}
+      Values(int64_t i) : ImmVal(i) {}
+    } Contents;
+    bool isNegative;
+
+  public:
+    CountValue(unsigned r, bool neg) : Kind(CV_Register), Contents(r),
+                                       isNegative(neg) {}
+    explicit CountValue(int64_t i) : Kind(CV_Immediate), Contents(i),
+                                     isNegative(i < 0) {}
+    CountValueType getType() const { return Kind; }
+    bool isReg() const { return Kind == CV_Register; }
+    bool isImm() const { return Kind == CV_Immediate; }
+    bool isNeg() const { return isNegative; }
+
+    unsigned getReg() const {
+      assert(isReg() && "Wrong CountValue accessor");
+      return Contents.RegNum;
+    }
+    void setReg(unsigned Val) {
+      Contents.RegNum = Val;
+    }
+    int64_t getImm() const {
+      assert(isImm() && "Wrong CountValue accessor");
+      if (isNegative) {
+        return -Contents.ImmVal;
+      }
+      return Contents.ImmVal;
+    }
+    void setImm(int64_t Val) {
+      Contents.ImmVal = Val;
+    }
+
+    void print(raw_ostream &OS, const TargetMachine *TM = 0) const {
+      if (isReg()) { OS << PrintReg(getReg()); }
+      if (isImm()) { OS << getImm(); }
+    }
+  };
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
+                    false, false)
+
+/// isCompareEquals - Returns true if the instruction is a compare equals
+/// instruction with an immediate operand.
+static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp,
+                               bool &Int64Cmp) {
+  if (MI->getOpcode() == PPC::CMPWI) {
+    SignedCmp = true;
+    Int64Cmp = false;
+    return true;
+  } else if (MI->getOpcode() == PPC::CMPDI) {
+    SignedCmp = true;
+    Int64Cmp = true;
+    return true;
+  } else if (MI->getOpcode() == PPC::CMPLWI) {
+    SignedCmp = false;
+    Int64Cmp = false;
+    return true;
+  } else if (MI->getOpcode() == PPC::CMPLDI) {
+    SignedCmp = false;
+    Int64Cmp = true;
+    return true;
+  }
+
+  return false;
+}
+
+
+/// createPPCCTRLoops - Factory for creating
+/// the CTR loop phase.
+FunctionPass *llvm::createPPCCTRLoops() {
+  return new PPCCTRLoops();
+}
+
+
+bool PPCCTRLoops::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********* PPC CTR Loops *********\n");
+
+  bool Changed = false;
+
+  // get the loop information
+  MLI = &getAnalysis<MachineLoopInfo>();
+  // get the register information
+  MRI = &MF.getRegInfo();
+  // the target specific instructio info.
+  TII = MF.getTarget().getInstrInfo();
+
+  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+       I != E; ++I) {
+    MachineLoop *L = *I;
+    if (!L->getParentLoop()) {
+      Changed |= convertToCTRLoop(L);
+    }
+  }
+
+  return Changed;
+}
+
+/// getCanonicalInductionVariable - Check to see if the loop has a canonical
+/// induction variable. We check for a simple recurrence pattern - an
+/// integer recurrence that decrements by one each time through the loop and
+/// ends at zero.  If so, return the phi node that corresponds to it.
+///
+/// Based upon the similar code in LoopInfo except this code is specific to
+/// the machine.
+/// This method assumes that the IndVarSimplify pass has been run by 'opt'.
+///
+void
+PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
+                                  SmallVector<MachineInstr *, 4> &IVars,
+                                  SmallVector<MachineInstr *, 4> &IOps) const {
+  MachineBasicBlock *TopMBB = L->getTopBlock();
+  MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
+  assert(PI != TopMBB->pred_end() &&
+         "Loop must have more than one incoming edge!");
+  MachineBasicBlock *Backedge = *PI++;
+  if (PI == TopMBB->pred_end()) return;  // dead loop
+  MachineBasicBlock *Incoming = *PI++;
+  if (PI != TopMBB->pred_end()) return;  // multiple backedges?
+
+  // make sure there is one incoming and one backedge and determine which
+  // is which.
+  if (L->contains(Incoming)) {
+    if (L->contains(Backedge))
+      return;
+    std::swap(Incoming, Backedge);
+  } else if (!L->contains(Backedge))
+    return;
+
+  // Loop over all of the PHI nodes, looking for a canonical induction variable:
+  //   - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2".
+  //   - The recurrence comes from the backedge.
+  //   - the definition is an induction operatio.n
+  for (MachineBasicBlock::iterator I = TopMBB->begin(), E = TopMBB->end();
+       I != E && I->isPHI(); ++I) {
+    MachineInstr *MPhi = &*I;
+    unsigned DefReg = MPhi->getOperand(0).getReg();
+    for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
+      // Check each operand for the value from the backedge.
+      MachineBasicBlock *MBB = MPhi->getOperand(i+1).getMBB();
+      if (L->contains(MBB)) { // operands comes from the backedge
+        // Check if the definition is an induction operation.
+        MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg());
+        if (isInductionOperation(DI, DefReg)) {
+          IOps.push_back(DI);
+          IVars.push_back(MPhi);
+        }
+      }
+    }
+  }
+  return;
+}
+
+/// getTripCount - Return a loop-invariant LLVM value indicating the
+/// number of times the loop will be executed.  The trip count can
+/// be either a register or a constant value.  If the trip-count
+/// cannot be determined, this returns null.
+///
+/// We find the trip count from the phi instruction that defines the
+/// induction variable.  We follow the links to the CMP instruction
+/// to get the trip count.
+///
+/// Based upon getTripCount in LoopInfo.
+///
+CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
+                           SmallVector<MachineInstr *, 2> &OldInsts) const {
+  MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate a CTR loop if the loop has more than one exit.
+  if (LastMBB == 0)
+    return 0;
+
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+  if (LastI->getOpcode() != PPC::BCC)
+    return 0;
+
+  // We need to make sure that this compare is defining the condition
+  // register actually used by the terminating branch.
+
+  unsigned PredReg = LastI->getOperand(1).getReg();
+  DEBUG(dbgs() << "Examining loop with first terminator: " << *LastI);
+
+  unsigned PredCond = LastI->getOperand(0).getImm();
+  if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
+    return 0;
+
+  // Check that the loop has a induction variable.
+  SmallVector<MachineInstr *, 4> IVars, IOps;
+  getCanonicalInductionVariable(L, IVars, IOps);
+  for (unsigned i = 0; i < IVars.size(); ++i) {
+    MachineInstr *IOp = IOps[i];
+    MachineInstr *IV_Inst = IVars[i];
+
+    // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm',
+    //  if Imm is 0, get the count from the PHI opnd
+    //  if Imm is -M, than M is the count
+    //  Otherwise, Imm is the count
+    MachineOperand *IV_Opnd;
+    const MachineOperand *InitialValue;
+    if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
+      InitialValue = &IV_Inst->getOperand(1);
+      IV_Opnd = &IV_Inst->getOperand(3);
+    } else {
+      InitialValue = &IV_Inst->getOperand(3);
+      IV_Opnd = &IV_Inst->getOperand(1);
+    }
+
+    DEBUG(dbgs() << "Considering:\n");
+    DEBUG(dbgs() << "  induction operation: " << *IOp);
+    DEBUG(dbgs() << "  induction variable: " << *IV_Inst);
+    DEBUG(dbgs() << "  initial value: " << *InitialValue << "\n");
+  
+    // Look for the cmp instruction to determine if we
+    // can get a useful trip count.  The trip count can
+    // be either a register or an immediate.  The location
+    // of the value depends upon the type (reg or imm).
+    for (MachineRegisterInfo::reg_iterator
+         RI = MRI->reg_begin(IV_Opnd->getReg()), RE = MRI->reg_end();
+         RI != RE; ++RI) {
+      IV_Opnd = &RI.getOperand();
+      bool SignedCmp, Int64Cmp;
+      MachineInstr *MI = IV_Opnd->getParent();
+      if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp, Int64Cmp) &&
+          MI->getOperand(0).getReg() == PredReg) {
+
+        OldInsts.push_back(MI);
+        OldInsts.push_back(IOp);
+ 
+        DEBUG(dbgs() << "  compare: " << *MI);
+ 
+        const MachineOperand &MO = MI->getOperand(2);
+        assert(MO.isImm() && "IV Cmp Operand should be an immediate");
+
+        int64_t ImmVal;
+        if (SignedCmp)
+          ImmVal = (short) MO.getImm();
+        else
+          ImmVal = MO.getImm();
+  
+        const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
+        assert(L->contains(IV_DefInstr->getParent()) &&
+               "IV definition should occurs in loop");
+        int64_t iv_value = (short) IV_DefInstr->getOperand(2).getImm();
+  
+        assert(InitialValue->isReg() && "Expecting register for init value");
+        unsigned InitialValueReg = InitialValue->getReg();
+  
+        MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
+  
+        // Here we need to look for an immediate load (an li or lis/ori pair).
+        if (DefInstr && (DefInstr->getOpcode() == PPC::ORI8 ||
+                         DefInstr->getOpcode() == PPC::ORI)) {
+          int64_t start = DefInstr->getOperand(2).getImm();
+          MachineInstr *DefInstr2 =
+            MRI->getVRegDef(DefInstr->getOperand(1).getReg());
+          if (DefInstr2 && (DefInstr2->getOpcode() == PPC::LIS8 ||
+                            DefInstr2->getOpcode() == PPC::LIS)) {
+            DEBUG(dbgs() << "  initial constant: " << *DefInstr);
+            DEBUG(dbgs() << "  initial constant: " << *DefInstr2);
+
+            start |= int64_t(short(DefInstr2->getOperand(1).getImm())) << 16;
+  
+            int64_t count = ImmVal - start;
+            if ((count % iv_value) != 0) {
+              return 0;
+            }
+
+            OldInsts.push_back(DefInstr);
+            OldInsts.push_back(DefInstr2);
+
+            // count/iv_value, the trip count, should be positive here. If it
+            // is negative, that indicates that the counter will wrap.
+            if (Int64Cmp)
+              return new CountValue(count/iv_value);
+            else
+              return new CountValue(uint32_t(count/iv_value));
+          }
+        } else if (DefInstr && (DefInstr->getOpcode() == PPC::LI8 ||
+                                DefInstr->getOpcode() == PPC::LI)) {
+          DEBUG(dbgs() << "  initial constant: " << *DefInstr);
+
+          int64_t count = ImmVal -
+            int64_t(short(DefInstr->getOperand(1).getImm()));
+          if ((count % iv_value) != 0) {
+            return 0;
+          }
+
+          OldInsts.push_back(DefInstr);
+
+          if (Int64Cmp)
+            return new CountValue(count/iv_value);
+          else
+            return new CountValue(uint32_t(count/iv_value));
+        } else if (iv_value == 1 || iv_value == -1) {
+          // We can't determine a constant starting value.
+          if (ImmVal == 0) {
+            return new CountValue(InitialValueReg, iv_value > 0);
+          }
+          // FIXME: handle non-zero end value.
+        }
+        // FIXME: handle non-unit increments (we might not want to introduce
+        // division but we can handle some 2^n cases with shifts).
+  
+      }
+    }
+  }
+  return 0;
+}
+
+/// isInductionOperation - return true if the operation is matches the
+/// pattern that defines an induction variable:
+///    addi iv, c
+///
+bool
+PPCCTRLoops::isInductionOperation(const MachineInstr *MI,
+                                           unsigned IVReg) const {
+  return ((MI->getOpcode() == PPC::ADDI || MI->getOpcode() == PPC::ADDI8) &&
+          MI->getOperand(1).isReg() && // could be a frame index instead
+          MI->getOperand(1).getReg() == IVReg);
+}
+
+/// isInvalidOperation - Return true if the operation is invalid within
+/// CTR loop.
+bool
+PPCCTRLoops::isInvalidLoopOperation(const MachineInstr *MI) const {
+
+  // call is not allowed because the callee may use a CTR loop
+  if (MI->getDesc().isCall()) {
+    return true;
+  }
+  // check if the instruction defines a CTR loop register
+  // (this will also catch nested CTR loops)
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef() &&
+        (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// containsInvalidInstruction - Return true if the loop contains
+/// an instruction that inhibits the use of the CTR loop function.
+///
+bool PPCCTRLoops::containsInvalidInstruction(MachineLoop *L) const {
+  const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = Blocks[i];
+    for (MachineBasicBlock::iterator
+           MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) {
+      const MachineInstr *MI = &*MII;
+      if (isInvalidLoopOperation(MI)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// isDead returns true if the instruction is dead
+/// (this was essentially copied from DeadMachineInstructionElim::isDead, but
+/// with special cases for inline asm, physical registers and instructions with
+/// side effects removed)
+bool PPCCTRLoops::isDead(const MachineInstr *MI,
+                         SmallVector<MachineInstr *, 1> &DeadPhis) const {
+  // Examine each operand.
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef()) {
+      unsigned Reg = MO.getReg();
+      if (!MRI->use_nodbg_empty(Reg)) {
+        // This instruction has users, but if the only user is the phi node for
+        // the parent block, and the only use of that phi node is this
+        // instruction, then this instruction is dead: both it (and the phi
+        // node) can be removed.
+        MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg);
+        if (llvm::next(I) == MRI->use_end() &&
+            I.getOperand().getParent()->isPHI()) {
+          MachineInstr *OnePhi = I.getOperand().getParent();
+
+          for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) {
+            const MachineOperand &OPO = OnePhi->getOperand(j);
+            if (OPO.isReg() && OPO.isDef()) {
+              unsigned OPReg = OPO.getReg();
+
+              MachineRegisterInfo::use_iterator nextJ;
+              for (MachineRegisterInfo::use_iterator J = MRI->use_begin(OPReg),
+                   E = MRI->use_end(); J!=E; J=nextJ) {
+                nextJ = llvm::next(J);
+                MachineOperand& Use = J.getOperand();
+                MachineInstr *UseMI = Use.getParent();
+
+                if (MI != UseMI) {
+                  // The phi node has a user that is not MI, bail...
+                  return false;
+                }
+              }
+            }
+          }
+
+          DeadPhis.push_back(OnePhi);
+        } else {
+          // This def has a non-debug use. Don't delete the instruction!
+          return false;
+        }
+      }
+    }
+  }
+
+  // If there are no defs with uses, the instruction is dead.
+  return true;
+}
+
+void PPCCTRLoops::removeIfDead(MachineInstr *MI) {
+  // This procedure was essentially copied from DeadMachineInstructionElim
+
+  SmallVector<MachineInstr *, 1> DeadPhis;
+  if (isDead(MI, DeadPhis)) {
+    DEBUG(dbgs() << "CTR looping will remove: " << *MI);
+
+    // It is possible that some DBG_VALUE instructions refer to this
+    // instruction.  Examine each def operand for such references;
+    // if found, mark the DBG_VALUE as undef (but don't delete it).
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+      MachineRegisterInfo::use_iterator nextI;
+      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
+           E = MRI->use_end(); I!=E; I=nextI) {
+        nextI = llvm::next(I);  // I is invalidated by the setReg
+        MachineOperand& Use = I.getOperand();
+        MachineInstr *UseMI = Use.getParent();
+        if (UseMI==MI)
+          continue;
+        if (Use.isDebug()) // this might also be a instr -> phi -> instr case
+                           // which can also be removed.
+          UseMI->getOperand(0).setReg(0U);
+      }
+    }
+
+    MI->eraseFromParent();
+    for (unsigned i = 0; i < DeadPhis.size(); ++i) {
+      DeadPhis[i]->eraseFromParent();
+    }
+  }
+}
+
+/// converToCTRLoop - check if the loop is a candidate for
+/// converting to a CTR loop.  If so, then perform the
+/// transformation.
+///
+/// This function works on innermost loops first.  A loop can
+/// be converted if it is a counting loop; either a register
+/// value or an immediate.
+///
+/// The code makes several assumptions about the representation
+/// of the loop in llvm.
+bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
+  bool Changed = false;
+  // Process nested loops first.
+  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+    Changed |= convertToCTRLoop(*I);
+  }
+  // If a nested loop has been converted, then we can't convert this loop.
+  if (Changed) {
+    return Changed;
+  }
+
+  SmallVector<MachineInstr *, 2> OldInsts;
+  // Are we able to determine the trip count for the loop?
+  CountValue *TripCount = getTripCount(L, OldInsts);
+  if (TripCount == 0) {
+    DEBUG(dbgs() << "failed to get trip count!\n");
+    return false;
+  }
+
+  if (TripCount->isImm()) {
+    DEBUG(dbgs() << "constant trip count: " << TripCount->getImm() << "\n");
+
+    // FIXME: We currently can't form 64-bit constants
+    // (including 32-bit unsigned constants)
+    if (!isInt<32>(TripCount->getImm()))
+      return false;
+  }
+
+  // Does the loop contain any invalid instructions?
+  if (containsInvalidInstruction(L)) {
+    return false;
+  }
+  MachineBasicBlock *Preheader = L->getLoopPreheader();
+  // No preheader means there's not place for the loop instr.
+  if (Preheader == 0) {
+    return false;
+  }
+  MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
+
+  DebugLoc dl;
+  if (InsertPos != Preheader->end())
+    dl = InsertPos->getDebugLoc();
+
+  MachineBasicBlock *LastMBB = L->getExitingBlock();
+  // Don't generate CTR loop if the loop has more than one exit.
+  if (LastMBB == 0) {
+    return false;
+  }
+  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
+
+  // Determine the loop start.
+  MachineBasicBlock *LoopStart = L->getTopBlock();
+  if (L->getLoopLatch() != LastMBB) {
+    // When the exit and latch are not the same, use the latch block as the
+    // start.
+    // The loop start address is used only after the 1st iteration, and the loop
+    // latch may contains instrs. that need to be executed after the 1st iter.
+    LoopStart = L->getLoopLatch();
+    // Make sure the latch is a successor of the exit, otherwise it won't work.
+    if (!LastMBB->isSuccessor(LoopStart)) {
+      return false;
+    }
+  }
+
+  // Convert the loop to a CTR loop
+  DEBUG(dbgs() << "Change to CTR loop at "; L->dump());
+
+  MachineFunction *MF = LastMBB->getParent();
+  const PPCSubtarget &Subtarget = MF->getTarget().getSubtarget<PPCSubtarget>();
+  bool isPPC64 = Subtarget.isPPC64();
+
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
+
+  unsigned CountReg;
+  if (TripCount->isReg()) {
+    // Create a copy of the loop count register.
+    const TargetRegisterClass *SrcRC =
+      MF->getRegInfo().getRegClass(TripCount->getReg());
+    CountReg = MF->getRegInfo().createVirtualRegister(RC);
+    unsigned CopyOp = (isPPC64 && GPRC->hasSubClassEq(SrcRC)) ?
+                        (unsigned) PPC::EXTSW_32_64 :
+                        (unsigned) TargetOpcode::COPY;
+    BuildMI(*Preheader, InsertPos, dl,
+            TII->get(CopyOp), CountReg).addReg(TripCount->getReg());
+    if (TripCount->isNeg()) {
+      unsigned CountReg1 = CountReg;
+      CountReg = MF->getRegInfo().createVirtualRegister(RC);
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::NEG8 : PPC::NEG),
+                       CountReg).addReg(CountReg1);
+    }
+  } else {
+    assert(TripCount->isImm() && "Expecting immedate vaule for trip count");
+    // Put the trip count in a register for transfer into the count register.
+
+    int64_t CountImm = TripCount->getImm();
+    if (TripCount->isNeg())
+      CountImm = -CountImm;
+
+    CountReg = MF->getRegInfo().createVirtualRegister(RC);
+    if (abs64(CountImm) > 0x7FFF) {
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS),
+              CountReg).addImm((CountImm >> 16) & 0xFFFF);
+      unsigned CountReg1 = CountReg;
+      CountReg = MF->getRegInfo().createVirtualRegister(RC);
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
+              CountReg).addReg(CountReg1).addImm(CountImm & 0xFFFF);
+    } else {
+      BuildMI(*Preheader, InsertPos, dl,
+              TII->get(isPPC64 ? PPC::LI8 : PPC::LI),
+              CountReg).addImm(CountImm);
+    }
+  }
+
+  // Add the mtctr instruction to the beginning of the loop.
+  BuildMI(*Preheader, InsertPos, dl,
+          TII->get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(CountReg,
+            TripCount->isImm() ? RegState::Kill : 0);
+
+  // Make sure the loop start always has a reference in the CFG.  We need to
+  // create a BlockAddress operand to get this mechanism to work both the
+  // MachineBasicBlock and BasicBlock objects need the flag set.
+  LoopStart->setHasAddressTaken();
+  // This line is needed to set the hasAddressTaken flag on the BasicBlock
+  // object
+  BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
+
+  // Replace the loop branch with a bdnz instruction.
+  dl = LastI->getDebugLoc();
+  const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
+  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = Blocks[i];
+    if (MBB != Preheader)
+      MBB->addLiveIn(isPPC64 ? PPC::CTR8 : PPC::CTR);
+  }
+
+  // The loop ends with either:
+  //  - a conditional branch followed by an unconditional branch, or
+  //  - a conditional branch to the loop start.
+  assert(LastI->getOpcode() == PPC::BCC &&
+         "loop end must start with a BCC instruction");
+  // Either the BCC branches to the beginning of the loop, or it
+  // branches out of the loop and there is an unconditional branch
+  // to the start of the loop.
+  MachineBasicBlock *BranchTarget = LastI->getOperand(2).getMBB();
+  BuildMI(*LastMBB, LastI, dl,
+        TII->get((BranchTarget == LoopStart) ?
+                 (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                 (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(BranchTarget);
+
+  // Conditional branch; just delete it.
+  DEBUG(dbgs() << "Removing old branch: " << *LastI);
+  LastMBB->erase(LastI);
+
+  delete TripCount;
+
+  // The induction operation (add) and the comparison (cmpwi) may now be
+  // unneeded. If these are unneeded, then remove them.
+  for (unsigned i = 0; i < OldInsts.size(); ++i)
+    removeIfDead(OldInsts[i]);
+
+  ++NumCTRLoops;
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
new file mode 100644
index 000000000000..c8a29a3d2cfe
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -0,0 +1,144 @@
+//===- PPCCallingConv.td - Calling Conventions for PowerPC -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the PowerPC 32- and 64-bit
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Return-value convention for PowerPC
+def RetCC_PPC : CallingConv<[
+  // On PPC64, integer return values are always promoted to i64
+  CCIfType<[i32], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+  
+  CCIfType<[f32], CCAssignToReg<[F1, F2]>>,
+  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4]>>,
+  
+  // Vector types are always returned in V2.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC System V Release 4 32-bit ABI
+//===----------------------------------------------------------------------===//
+
+def CC_PPC32_SVR4_Common : CallingConv<[
+  // The ABI requires i64 to be passed in two adjacent registers with the first
+  // register having an odd register number.
+  CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignArgRegs">>>,
+
+  // The first 8 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+
+  // Make sure the i64 words from a long double are either both passed in
+  // registers or both passed on the stack.
+  CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>,
+  
+  // FP values are passed in F1 - F8.
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+
+  // Split arguments have an alignment of 8 bytes on the stack.
+  CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>,
+  
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+  
+  // Floats are stored in double precision format, thus they have the same
+  // alignment and size as doubles.
+  CCIfType<[f32,f64], CCAssignToStack<8, 8>>,  
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToStack<16, 16>>
+]>;
+
+// This calling convention puts vector arguments always on the stack. It is used
+// to assign vector arguments which belong to the variable portion of the
+// parameter list of a variable argument function.
+def CC_PPC32_SVR4_VarArg : CallingConv<[
+  CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;
+
+// In contrast to CC_PPC32_SVR4_VarArg, this calling convention first tries to
+// put vector arguments in vector registers before putting them on the stack.
+def CC_PPC32_SVR4 : CallingConv<[
+  // The first 12 Vector arguments are passed in AltiVec registers.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13]>>,
+           
+  CCDelegateTo<CC_PPC32_SVR4_Common>
+]>;  
+
+// Helper "calling convention" to handle aggregate by value arguments.
+// Aggregate by value arguments are always placed in the local variable space
+// of the caller. This calling convention is only used to assign those stack
+// offsets in the callers stack frame.
+//
+// Still, the address of the aggregate copy in the callers stack frame is passed
+// in a GPR (or in the parameter list area if all GPRs are allocated) from the
+// caller to the callee. The location for the address argument is assigned by
+// the CC_PPC32_SVR4 calling convention.
+//
+// The only purpose of CC_PPC32_SVR4_Custom_Dummy is to skip arguments which are
+// not passed by value.
+ 
+def CC_PPC32_SVR4_ByVal : CallingConv<[
+  CCIfByVal<CCPassByVal<4, 4>>,
+  
+  CCCustom<"CC_PPC32_SVR4_Custom_Dummy">
+]>;
+
+def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
+                                        R21, R22, R23, R24, R25, R26, R27, R28,
+                                        R29, R30, R31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4,
+                                        V20, V21, V22, V23, V24, V25, V26, V27,
+                                        V28, V29, V30, V31)>;
+
+def CSR_SVR432   : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20, VRSAVE,
+                                        R21, R22, R23, R24, R25, R26, R27, R28,
+                                        R29, R30, R31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4,
+                                        V20, V21, V22, V23, V24, V25, V26, V27,
+                                        V28, V29, V30, V31)>;
+
+def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
+                                        X21, X22, X23, X24, X25, X26, X27, X28,
+                                        X29, X30, X31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4,
+                                        V20, V21, V22, V23, V24, V25, V26, V27,
+                                        V28, V29, V30, V31)>;
+
+def CSR_SVR464   : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20, VRSAVE,
+                                        X21, X22, X23, X24, X25, X26, X27, X28,
+                                        X29, X30, X31, F14, F15, F16, F17, F18,
+                                        F19, F20, F21, F22, F23, F24, F25, F26,
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4,
+                                        V20, V21, V22, V23, V24, V25, V26, V27,
+                                        V28, V29, V30, V31)>;
+
+def CSR_NoRegs : CalleeSavedRegs<(add VRSAVE)>;
+def CSR_NoRegs_Darwin : CalleeSavedRegs<(add)>;
+
+def CSR_NoRegs_Altivec : CalleeSavedRegs<(add (sequence "V%u", 0, 31), VRSAVE)>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCodeEmitter.cpp
new file mode 100644
index 000000000000..64787185138b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -0,0 +1,271 @@
+//===-- PPCCodeEmitter.cpp - JIT Code Emitter for PowerPC -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit CodeEmitter and associated machinery to
+// JIT-compile bitcode to native PowerPC.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCRelocations.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+  class PPCCodeEmitter : public MachineFunctionPass {
+    TargetMachine &TM;
+    JITCodeEmitter &MCE;
+    MachineModuleInfo *MMI;
+    
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineModuleInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+    
+    static char ID;
+    
+    /// MovePCtoLROffset - When/if we see a MovePCtoLR instruction, we record
+    /// its address in the function into this pointer.
+    void *MovePCtoLROffset;
+  public:
+    
+    PPCCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+      : MachineFunctionPass(ID), TM(tm), MCE(mce) {}
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+    uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+    
+    MachineRelocation GetRelocation(const MachineOperand &MO,
+                                    unsigned RelocID) const;
+    
+    /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr
+    unsigned getMachineOpValue(const MachineInstr &MI,
+                               const MachineOperand &MO) const;
+
+    unsigned get_crbitm_encoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getDirectBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
+
+    unsigned getHA16Encoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getLO16Encoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getMemRIEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getMemRIXEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const;
+
+    const char *getPassName() const { return "PowerPC Machine Code Emitter"; }
+
+    /// runOnMachineFunction - emits the given MachineFunction to memory
+    ///
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    /// emitBasicBlock - emits the given MachineBasicBlock to memory
+    ///
+    void emitBasicBlock(MachineBasicBlock &MBB);
+  };
+}
+
+char PPCCodeEmitter::ID = 0;
+
+/// createPPCCodeEmitterPass - Return a pass that emits the collected PPC code
+/// to the specified MCE object.
+FunctionPass *llvm::createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  return new PPCCodeEmitter(TM, JCE);
+}
+
+bool PPCCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  assert((MF.getTarget().getRelocationModel() != Reloc::Default ||
+          MF.getTarget().getRelocationModel() != Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+
+  MMI = &getAnalysis<MachineModuleInfo>();
+  MCE.setModuleInfo(MMI);
+  do {
+    MovePCtoLROffset = 0;
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+      emitBasicBlock(*BB);
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
+  MCE.StartMachineBasicBlock(&MBB);
+
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I){
+    const MachineInstr &MI = *I;
+    MCE.processDebugLoc(MI.getDebugLoc(), true);
+    switch (MI.getOpcode()) {
+    default:
+      MCE.emitWordBE(getBinaryCodeForInstr(MI));
+      break;
+    case TargetOpcode::PROLOG_LABEL:
+    case TargetOpcode::EH_LABEL:
+      MCE.emitLabel(MI.getOperand(0).getMCSymbol());
+      break;
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+      break; // pseudo opcode, no side effects
+    case PPC::MovePCtoLR:
+    case PPC::MovePCtoLR8:
+      assert(TM.getRelocationModel() == Reloc::PIC_);
+      MovePCtoLROffset = (void*)MCE.getCurrentPCValue();
+      MCE.emitWordBE(0x48000005);   // bl 1
+      break;
+    }
+    MCE.processDebugLoc(MI.getDebugLoc(), false);
+  }
+}
+
+unsigned PPCCodeEmitter::get_crbitm_encoding(const MachineInstr &MI,
+                                             unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MTCRF8 ||
+            MI.getOpcode() == PPC::MFOCRF) &&
+         (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
+  return 0x80 >> TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+}
+
+MachineRelocation PPCCodeEmitter::GetRelocation(const MachineOperand &MO, 
+                                                unsigned RelocID) const {
+  // If in PIC mode, we need to encode the negated address of the
+  // 'movepctolr' into the unrelocated field.  After relocation, we'll have
+  // &gv-&movepctolr-4 in the imm field.  Once &movepctolr is added to the imm
+  // field, we get &gv.  This doesn't happen for branch relocations, which are
+  // always implicitly pc relative.
+  intptr_t Cst = 0;
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
+    Cst = -(intptr_t)MovePCtoLROffset - 4;
+  }
+  
+  if (MO.isGlobal())
+    return MachineRelocation::getGV(MCE.getCurrentPCOffset(), RelocID,
+                                    const_cast<GlobalValue *>(MO.getGlobal()),
+                                    Cst, isa<Function>(MO.getGlobal()));
+  if (MO.isSymbol())
+    return MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                        RelocID, MO.getSymbolName(), Cst);
+  if (MO.isCPI())
+    return MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                           RelocID, MO.getIndex(), Cst);
+
+  if (MO.isMBB())
+    return MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                    RelocID, MO.getMBB());
+  
+  assert(MO.isJTI());
+  return MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                         RelocID, MO.getIndex(), Cst);
+}
+
+unsigned PPCCodeEmitter::getDirectBrEncoding(const MachineInstr &MI,
+                                             unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
+  
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_pcrel_bx));
+  return 0;
+}
+
+unsigned PPCCodeEmitter::getCondBrEncoding(const MachineInstr &MI,
+                                           unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_pcrel_bcx));
+  return 0;
+}
+
+unsigned PPCCodeEmitter::getHA16Encoding(const MachineInstr &MI,
+                                         unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
+
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_high));
+  return 0;
+}
+
+unsigned PPCCodeEmitter::getLO16Encoding(const MachineInstr &MI,
+                                         unsigned OpNo) const {
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
+  
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low));
+  return 0;
+}
+
+unsigned PPCCodeEmitter::getMemRIEncoding(const MachineInstr &MI,
+                                          unsigned OpNo) const {
+  // Encode (imm, reg) as a memri, which has the low 16-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1)) << 16;
+  
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return (getMachineOpValue(MI, MO) & 0xFFFF) | RegBits;
+  
+  // Add a fixup for the displacement field.
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low));
+  return RegBits;
+}
+
+unsigned PPCCodeEmitter::getMemRIXEncoding(const MachineInstr &MI,
+                                           unsigned OpNo) const {
+  // Encode (imm, reg) as a memrix, which has the low 14-bits as the
+  // displacement and the next 5 bits as the register #.
+  assert(MI.getOperand(OpNo+1).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1)) << 14;
+  
+  const MachineOperand &MO = MI.getOperand(OpNo);
+  if (MO.isImm())
+    return (getMachineOpValue(MI, MO) & 0x3FFF) | RegBits;
+  
+  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low_ix));
+  return RegBits;
+}
+
+
+unsigned PPCCodeEmitter::getTLSRegEncoding(const MachineInstr &MI,
+                                           unsigned OpNo) const {
+  llvm_unreachable("TLS not supported on the old JIT.");
+  return 0;
+}
+
+
+unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                           const MachineOperand &MO) const {
+
+  if (MO.isReg()) {
+    // MTCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
+    // The GPR operand should come through here though.
+    assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MTCRF8 &&
+             MI.getOpcode() != PPC::MFOCRF) ||
+           MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
+    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+  }
+  
+  assert(MO.isImm() &&
+         "Relocation required in an instruction that we cannot encode!");
+  return MO.getImm();
+}
+
+#include "PPCGenCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
new file mode 100644
index 000000000000..3244b904ee64
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -0,0 +1,1329 @@
+//===-- PPCFrameLowering.cpp - PPC Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PPC implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCFrameLowering.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+// FIXME This disables some code that aligns the stack to a boundary bigger than
+// the default (16 bytes on Darwin) when there is a stack local of greater
+// alignment.  This does not currently work, because the delta between old and
+// new stack pointers is added to offsets that reference incoming parameters
+// after the prolog is generated, and the code that does that doesn't handle a
+// variable delta.  You don't want to do that anyway; a better approach is to
+// reserve another register that retains to the incoming stack pointer, and
+// reference parameters relative to that.
+#define ALIGN_STACK 0
+
+
+/// VRRegNo - Map from a numbered VR register to its enum value.
+///
+static const uint16_t VRRegNo[] = {
+ PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 ,
+ PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15,
+ PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+ PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
+};
+
+/// RemoveVRSaveCode - We have found that this function does not need any code
+/// to manipulate the VRSAVE register, even though it uses vector registers.
+/// This can happen when the only registers used are known to be live in or out
+/// of the function.  Remove all of the VRSAVE related code from the function.
+/// FIXME: The removal of the code results in a compile failure at -O0 when the
+/// function contains a function call, as the GPR containing original VRSAVE
+/// contents is spilled and reloaded around the call.  Without the prolog code,
+/// the spill instruction refers to an undefined register.  This code needs
+/// to account for all uses of that GPR.
+static void RemoveVRSaveCode(MachineInstr *MI) {
+  MachineBasicBlock *Entry = MI->getParent();
+  MachineFunction *MF = Entry->getParent();
+
+  // We know that the MTVRSAVE instruction immediately follows MI.  Remove it.
+  MachineBasicBlock::iterator MBBI = MI;
+  ++MBBI;
+  assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE);
+  MBBI->eraseFromParent();
+
+  bool RemovedAllMTVRSAVEs = true;
+  // See if we can find and remove the MTVRSAVE instruction from all of the
+  // epilog blocks.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
+    // If last instruction is a return instruction, add an epilogue
+    if (!I->empty() && I->back().isReturn()) {
+      bool FoundIt = false;
+      for (MBBI = I->end(); MBBI != I->begin(); ) {
+        --MBBI;
+        if (MBBI->getOpcode() == PPC::MTVRSAVE) {
+          MBBI->eraseFromParent();  // remove it.
+          FoundIt = true;
+          break;
+        }
+      }
+      RemovedAllMTVRSAVEs &= FoundIt;
+    }
+  }
+
+  // If we found and removed all MTVRSAVE instructions, remove the read of
+  // VRSAVE as well.
+  if (RemovedAllMTVRSAVEs) {
+    MBBI = MI;
+    assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?");
+    --MBBI;
+    assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?");
+    MBBI->eraseFromParent();
+  }
+
+  // Finally, nuke the UPDATE_VRSAVE.
+  MI->eraseFromParent();
+}
+
+// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the
+// instruction selector.  Based on the vector registers that have been used,
+// transform this into the appropriate ORI instruction.
+static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
+  MachineFunction *MF = MI->getParent()->getParent();
+  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned UsedRegMask = 0;
+  for (unsigned i = 0; i != 32; ++i)
+    if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i]))
+      UsedRegMask |= 1 << (31-i);
+
+  // Live in and live out values already must be in the mask, so don't bother
+  // marking them.
+  for (MachineRegisterInfo::livein_iterator
+       I = MF->getRegInfo().livein_begin(),
+       E = MF->getRegInfo().livein_end(); I != E; ++I) {
+    unsigned RegNo = TRI->getEncodingValue(I->first);
+    if (VRRegNo[RegNo] == I->first)        // If this really is a vector reg.
+      UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
+  }
+
+  // Live out registers appear as use operands on return instructions.
+  for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end();
+       UsedRegMask != 0 && BI != BE; ++BI) {
+    const MachineBasicBlock &MBB = *BI;
+    if (MBB.empty() || !MBB.back().isReturn())
+      continue;
+    const MachineInstr &Ret = MBB.back();
+    for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) {
+      const MachineOperand &MO = Ret.getOperand(I);
+      if (!MO.isReg() || !PPC::VRRCRegClass.contains(MO.getReg()))
+        continue;
+      unsigned RegNo = TRI->getEncodingValue(MO.getReg());
+      UsedRegMask &= ~(1 << (31-RegNo));
+    }
+  }
+
+  // If no registers are used, turn this into a copy.
+  if (UsedRegMask == 0) {
+    // Remove all VRSAVE code.
+    RemoveVRSaveCode(MI);
+    return;
+  }
+
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  unsigned DstReg = MI->getOperand(0).getReg();
+
+  if ((UsedRegMask & 0xFFFF) == UsedRegMask) {
+    if (DstReg != SrcReg)
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask);
+    else
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask);
+  } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) {
+    if (DstReg != SrcReg)
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask >> 16);
+  } else {
+    if (DstReg != SrcReg)
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask >> 16);
+
+    BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+      .addReg(DstReg, RegState::Kill)
+      .addImm(UsedRegMask & 0xFFFF);
+  }
+
+  // Remove the old UPDATE_VRSAVE instruction.
+  MI->eraseFromParent();
+}
+
+static bool spillsCR(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->isCRSpilled();
+}
+
+static bool spillsVRSAVE(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->isVRSAVESpilled();
+}
+
+static bool hasSpills(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->hasSpills();
+}
+
+static bool hasNonRISpills(const MachineFunction &MF) {
+  const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  return FuncInfo->hasNonRISpills();
+}
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
+                                                bool UpdateMF,
+                                                bool UseEstimate) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  unsigned FrameSize =
+    UseEstimate ? MFI->estimateStackSize(MF) : MFI->getStackSize();
+
+  // Get the alignments provided by the target, and the maximum alignment
+  // (if any) of the fixed frame objects.
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  unsigned TargetAlign = getStackAlignment();
+  unsigned AlignMask = TargetAlign - 1;  //
+
+  // If we are a leaf function, and use up to 224 bytes of stack space,
+  // don't have a frame pointer, calls, or dynamic alloca then we do not need
+  // to adjust the stack pointer (we fit in the Red Zone).  For 64-bit
+  // SVR4, we also require a stack frame if we need to spill the CR,
+  // since this spill area is addressed relative to the stack pointer.
+  // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate
+  // stackless code if all local vars are reg-allocated.
+  bool DisableRedZone = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoRedZone);
+  if (!DisableRedZone &&
+      (Subtarget.isPPC64() ||                      // 32-bit SVR4, no stack-
+       !Subtarget.isSVR4ABI() ||                   //   allocated locals.
+	FrameSize == 0) &&
+      FrameSize <= 224 &&                          // Fits in red zone.
+      !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
+      !MFI->adjustsStack() &&                      // No calls.
+      !(Subtarget.isPPC64() &&                     // No 64-bit SVR4 CRsave.
+	Subtarget.isSVR4ABI()
+	&& spillsCR(MF)) &&
+      (!ALIGN_STACK || MaxAlign <= TargetAlign)) { // No special alignment.
+    // No need for frame
+    if (UpdateMF)
+      MFI->setStackSize(0);
+    return 0;
+  }
+
+  // Get the maximum call frame size of all the calls.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+
+  // Maximum call frame needs to be at least big enough for linkage and 8 args.
+  unsigned minCallFrameSize = getMinCallFrameSize(Subtarget.isPPC64(),
+                                                  Subtarget.isDarwinABI());
+  maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
+
+  // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI->hasVarSizedObjects())
+    maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+
+  // Update maximum call frame size.
+  if (UpdateMF)
+    MFI->setMaxCallFrameSize(maxCallFrameSize);
+
+  // Include call frame size in total.
+  FrameSize += maxCallFrameSize;
+
+  // Make sure the frame is aligned.
+  FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+
+  // Update frame info.
+  if (UpdateMF)
+    MFI->setStackSize(FrameSize);
+
+  return FrameSize;
+}
+
+// hasFP - Return true if the specified function actually has a dedicated frame
+// pointer register.
+bool PPCFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // FIXME: This is pretty much broken by design: hasFP() might be called really
+  // early, before the stack layout was calculated and thus hasFP() might return
+  // true or false here depending on the time of call.
+  return (MFI->getStackSize()) && needsFP(MF);
+}
+
+// needsFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Naked functions have no stack frame pushed, so we don't have a frame
+  // pointer.
+  if (MF.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                                     Attribute::Naked))
+    return false;
+
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+    MFI->hasVarSizedObjects() ||
+    (MF.getTarget().Options.GuaranteedTailCallOpt &&
+     MF.getInfo<PPCFunctionInfo>()->hasFastCall());
+}
+
+void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
+  bool is31 = needsFP(MF);
+  unsigned FPReg  = is31 ? PPC::R31 : PPC::R1;
+  unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI)
+    for (MachineBasicBlock::iterator MBBI = BI->end(); MBBI != BI->begin(); ) {
+      --MBBI;
+      for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
+        MachineOperand &MO = MBBI->getOperand(I);
+        if (!MO.isReg())
+          continue;
+
+        switch (MO.getReg()) {
+        case PPC::FP:
+          MO.setReg(FPReg);
+          break;
+        case PPC::FP8:
+          MO.setReg(FP8Reg);
+          break;
+        }
+      }
+    }
+}
+
+void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  MachineModuleInfo &MMI = MF.getMMI();
+  DebugLoc dl;
+  bool needsFrameMoves = MMI.hasDebugInfo() ||
+    MF.getFunction()->needsUnwindTableEntry();
+
+  // Prepare for frame info.
+  MCSymbol *FrameLabel = 0;
+
+  // Scan the prolog, looking for an UPDATE_VRSAVE instruction.  If we find it,
+  // process it.
+  if (!Subtarget.isSVR4ABI())
+    for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
+      if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
+        HandleVRSaveUpdate(MBBI, TII);
+        break;
+      }
+    }
+
+  // Move MBBI back to the beginning of the function.
+  MBBI = MBB.begin();
+
+  // Work out frame sizes.
+  unsigned FrameSize = determineFrameLayout(MF);
+  int NegFrameSize = -FrameSize;
+
+  if (MFI->isFrameAddressTaken())
+    replaceFPWithRealFP(MF);
+
+  // Get processor type.
+  bool isPPC64 = Subtarget.isPPC64();
+  // Get operating system
+  bool isDarwinABI = Subtarget.isDarwinABI();
+  // Check if the link register (LR) must be saved.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  bool MustSaveLR = FI->mustSaveLR();
+  // Do we have a frame pointer for this function?
+  bool HasFP = hasFP(MF);
+
+  int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+
+  int FPOffset = 0;
+  if (HasFP) {
+    if (Subtarget.isSVR4ABI()) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      int FPIndex = FI->getFramePointerSaveIndex();
+      assert(FPIndex && "No Frame Pointer Save Slot!");
+      FPOffset = FFI->getObjectOffset(FPIndex);
+    } else {
+      FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+    }
+  }
+
+  if (isPPC64) {
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR8), PPC::X0);
+
+    if (HasFP)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
+        .addReg(PPC::X31)
+        .addImm(FPOffset/4)
+        .addReg(PPC::X1);
+
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
+        .addReg(PPC::X0)
+        .addImm(LROffset / 4)
+        .addReg(PPC::X1);
+  } else {
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR), PPC::R0);
+
+    if (HasFP)
+      // FIXME: On PPC32 SVR4, FPOffset is negative and access to negative
+      // offsets of R1 is not allowed.
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
+        .addReg(PPC::R31)
+        .addImm(FPOffset)
+        .addReg(PPC::R1);
+
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
+        .addReg(PPC::R0)
+        .addImm(LROffset)
+        .addReg(PPC::R1);
+  }
+
+  // Skip if a leaf routine.
+  if (!FrameSize) return;
+
+  // Get stack alignments.
+  unsigned TargetAlign = getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // Adjust stack pointer: r1 += NegFrameSize.
+  // If there is a preferred stack alignment, align R1 now
+  if (!isPPC64) {
+    // PPC32.
+    if (ALIGN_STACK && MaxAlign > TargetAlign) {
+      assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
+             "Invalid alignment!");
+      assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!");
+
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), PPC::R0)
+        .addReg(PPC::R1)
+        .addImm(0)
+        .addImm(32 - Log2_32(MaxAlign))
+        .addImm(31);
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC) ,PPC::R0)
+        .addReg(PPC::R0, RegState::Kill)
+        .addImm(NegFrameSize);
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
+        .addReg(PPC::R1, RegState::Kill)
+        .addReg(PPC::R1)
+        .addReg(PPC::R0);
+    } else if (isInt<16>(NegFrameSize)) {
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1)
+        .addReg(PPC::R1)
+        .addImm(NegFrameSize)
+        .addReg(PPC::R1);
+    } else {
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
+        .addImm(NegFrameSize >> 16);
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
+        .addReg(PPC::R0, RegState::Kill)
+        .addImm(NegFrameSize & 0xFFFF);
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
+        .addReg(PPC::R1, RegState::Kill)
+        .addReg(PPC::R1)
+        .addReg(PPC::R0);
+    }
+  } else {    // PPC64.
+    if (ALIGN_STACK && MaxAlign > TargetAlign) {
+      assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
+             "Invalid alignment!");
+      assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!");
+
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), PPC::X0)
+        .addReg(PPC::X1)
+        .addImm(0)
+        .addImm(64 - Log2_32(MaxAlign));
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC8), PPC::X0)
+        .addReg(PPC::X0)
+        .addImm(NegFrameSize);
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
+        .addReg(PPC::X1, RegState::Kill)
+        .addReg(PPC::X1)
+        .addReg(PPC::X0);
+    } else if (isInt<16>(NegFrameSize)) {
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1)
+        .addReg(PPC::X1)
+        .addImm(NegFrameSize / 4)
+        .addReg(PPC::X1);
+    } else {
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
+        .addImm(NegFrameSize >> 16);
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
+        .addReg(PPC::X0, RegState::Kill)
+        .addImm(NegFrameSize & 0xFFFF);
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
+        .addReg(PPC::X1, RegState::Kill)
+        .addReg(PPC::X1)
+        .addReg(PPC::X0);
+    }
+  }
+
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+
+  // Add the "machine moves" for the instructions we generated above, but in
+  // reverse order.
+  if (needsFrameMoves) {
+    // Mark effective beginning of when frame pointer becomes valid.
+    FrameLabel = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(FrameLabel);
+
+    // Show update of SP.
+    if (NegFrameSize) {
+      MachineLocation SPDst(MachineLocation::VirtualFP);
+      MachineLocation SPSrc(MachineLocation::VirtualFP, NegFrameSize);
+      Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+    } else {
+      MachineLocation SP(isPPC64 ? PPC::X31 : PPC::R31);
+      Moves.push_back(MachineMove(FrameLabel, SP, SP));
+    }
+
+    if (HasFP) {
+      MachineLocation FPDst(MachineLocation::VirtualFP, FPOffset);
+      MachineLocation FPSrc(isPPC64 ? PPC::X31 : PPC::R31);
+      Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+    }
+
+    if (MustSaveLR) {
+      MachineLocation LRDst(MachineLocation::VirtualFP, LROffset);
+      MachineLocation LRSrc(isPPC64 ? PPC::LR8 : PPC::LR);
+      Moves.push_back(MachineMove(FrameLabel, LRDst, LRSrc));
+    }
+  }
+
+  MCSymbol *ReadyLabel = 0;
+
+  // If there is a frame pointer, copy R1 into R31
+  if (HasFP) {
+    if (!isPPC64) {
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR), PPC::R31)
+        .addReg(PPC::R1)
+        .addReg(PPC::R1);
+    } else {
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR8), PPC::X31)
+        .addReg(PPC::X1)
+        .addReg(PPC::X1);
+    }
+
+    if (needsFrameMoves) {
+      ReadyLabel = MMI.getContext().CreateTempSymbol();
+
+      // Mark effective beginning of when frame pointer is ready.
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(ReadyLabel);
+
+      MachineLocation FPDst(HasFP ? (isPPC64 ? PPC::X31 : PPC::R31) :
+                                    (isPPC64 ? PPC::X1 : PPC::R1));
+      MachineLocation FPSrc(MachineLocation::VirtualFP);
+      Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc));
+    }
+  }
+
+  if (needsFrameMoves) {
+    MCSymbol *Label = HasFP ? ReadyLabel : FrameLabel;
+
+    // Add callee saved registers to move list.
+    const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      unsigned Reg = CSI[I].getReg();
+      if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
+
+      // This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just
+      // subregisters of CR2. We just need to emit a move of CR2.
+      if (PPC::CRBITRCRegClass.contains(Reg))
+        continue;
+
+      // For SVR4, don't emit a move for the CR spill slot if we haven't
+      // spilled CRs.
+      if (Subtarget.isSVR4ABI()
+	  && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
+	  && !spillsCR(MF))
+	continue;
+
+      // For 64-bit SVR4 when we have spilled CRs, the spill location
+      // is SP+8, not a frame-relative slot.
+      if (Subtarget.isSVR4ABI()
+	  && Subtarget.isPPC64()
+	  && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+	MachineLocation CSDst(PPC::X1, 8);
+	MachineLocation CSSrc(PPC::CR2);
+	Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+	continue;
+      }
+
+      int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+      MachineLocation CSSrc(Reg);
+      Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+    }
+  }
+}
+
+void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
+                                MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI != MBB.end() && "Returning block has no terminator");
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc dl;
+
+  assert((RetOpcode == PPC::BLR ||
+          RetOpcode == PPC::TCRETURNri ||
+          RetOpcode == PPC::TCRETURNdi ||
+          RetOpcode == PPC::TCRETURNai ||
+          RetOpcode == PPC::TCRETURNri8 ||
+          RetOpcode == PPC::TCRETURNdi8 ||
+          RetOpcode == PPC::TCRETURNai8) &&
+         "Can only insert epilog into returning blocks");
+
+  // Get alignment info so we know how to restore r1
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned TargetAlign = getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // Get the number of bytes allocated from the FrameInfo.
+  int FrameSize = MFI->getStackSize();
+
+  // Get processor type.
+  bool isPPC64 = Subtarget.isPPC64();
+  // Get operating system
+  bool isDarwinABI = Subtarget.isDarwinABI();
+  // Check if the link register (LR) has been saved.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  bool MustSaveLR = FI->mustSaveLR();
+  // Do we have a frame pointer for this function?
+  bool HasFP = hasFP(MF);
+
+  int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+
+  int FPOffset = 0;
+  if (HasFP) {
+    if (Subtarget.isSVR4ABI()) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      int FPIndex = FI->getFramePointerSaveIndex();
+      assert(FPIndex && "No Frame Pointer Save Slot!");
+      FPOffset = FFI->getObjectOffset(FPIndex);
+    } else {
+      FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+    }
+  }
+
+  bool UsesTCRet =  RetOpcode == PPC::TCRETURNri ||
+    RetOpcode == PPC::TCRETURNdi ||
+    RetOpcode == PPC::TCRETURNai ||
+    RetOpcode == PPC::TCRETURNri8 ||
+    RetOpcode == PPC::TCRETURNdi8 ||
+    RetOpcode == PPC::TCRETURNai8;
+
+  if (UsesTCRet) {
+    int MaxTCRetDelta = FI->getTailCallSPDelta();
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+    assert(StackAdjust.isImm() && "Expecting immediate value.");
+    // Adjust stack pointer.
+    int StackAdj = StackAdjust.getImm();
+    int Delta = StackAdj - MaxTCRetDelta;
+    assert((Delta >= 0) && "Delta must be positive");
+    if (MaxTCRetDelta>0)
+      FrameSize += (StackAdj +Delta);
+    else
+      FrameSize += StackAdj;
+  }
+
+  if (FrameSize) {
+    // The loaded (or persistent) stack pointer value is offset by the 'stwu'
+    // on entry to the function.  Add this offset back now.
+    if (!isPPC64) {
+      // If this function contained a fastcc call and GuaranteedTailCallOpt is
+      // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
+      // call which invalidates the stack pointer value in SP(0). So we use the
+      // value of R31 in this case.
+      if (FI->hasFastCall() && isInt<16>(FrameSize)) {
+        assert(hasFP(MF) && "Expecting a valid the frame pointer.");
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
+          .addReg(PPC::R31).addImm(FrameSize);
+      } else if(FI->hasFastCall()) {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
+          .addImm(FrameSize >> 16);
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
+          .addReg(PPC::R0, RegState::Kill)
+          .addImm(FrameSize & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD4))
+          .addReg(PPC::R1)
+          .addReg(PPC::R31)
+          .addReg(PPC::R0);
+      } else if (isInt<16>(FrameSize) &&
+                 (!ALIGN_STACK || TargetAlign >= MaxAlign) &&
+                 !MFI->hasVarSizedObjects()) {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
+          .addReg(PPC::R1).addImm(FrameSize);
+      } else {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ),PPC::R1)
+          .addImm(0).addReg(PPC::R1);
+      }
+    } else {
+      if (FI->hasFastCall() && isInt<16>(FrameSize)) {
+        assert(hasFP(MF) && "Expecting a valid the frame pointer.");
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
+          .addReg(PPC::X31).addImm(FrameSize);
+      } else if(FI->hasFastCall()) {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
+          .addImm(FrameSize >> 16);
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
+          .addReg(PPC::X0, RegState::Kill)
+          .addImm(FrameSize & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD8))
+          .addReg(PPC::X1)
+          .addReg(PPC::X31)
+          .addReg(PPC::X0);
+      } else if (isInt<16>(FrameSize) && TargetAlign >= MaxAlign &&
+            !MFI->hasVarSizedObjects()) {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
+           .addReg(PPC::X1).addImm(FrameSize);
+      } else {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X1)
+           .addImm(0).addReg(PPC::X1);
+      }
+    }
+  }
+
+  if (isPPC64) {
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X0)
+        .addImm(LROffset/4).addReg(PPC::X1);
+
+    if (HasFP)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X31)
+        .addImm(FPOffset/4).addReg(PPC::X1);
+
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR8)).addReg(PPC::X0);
+  } else {
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R0)
+          .addImm(LROffset).addReg(PPC::R1);
+
+    if (HasFP)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R31)
+          .addImm(FPOffset).addReg(PPC::R1);
+
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR)).addReg(PPC::R0);
+  }
+
+  // Callee pop calling convention. Pop parameter/linkage area. Used for tail
+  // call optimization
+  if (MF.getTarget().Options.GuaranteedTailCallOpt && RetOpcode == PPC::BLR &&
+      MF.getFunction()->getCallingConv() == CallingConv::Fast) {
+     PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+     unsigned CallerAllocatedAmt = FI->getMinReservedArea();
+     unsigned StackReg = isPPC64 ? PPC::X1 : PPC::R1;
+     unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
+     unsigned TmpReg = isPPC64 ? PPC::X0 : PPC::R0;
+     unsigned ADDIInstr = isPPC64 ? PPC::ADDI8 : PPC::ADDI;
+     unsigned ADDInstr = isPPC64 ? PPC::ADD8 : PPC::ADD4;
+     unsigned LISInstr = isPPC64 ? PPC::LIS8 : PPC::LIS;
+     unsigned ORIInstr = isPPC64 ? PPC::ORI8 : PPC::ORI;
+
+     if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) {
+       BuildMI(MBB, MBBI, dl, TII.get(ADDIInstr), StackReg)
+         .addReg(StackReg).addImm(CallerAllocatedAmt);
+     } else {
+       BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+          .addImm(CallerAllocatedAmt >> 16);
+       BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
+          .addReg(TmpReg, RegState::Kill)
+          .addImm(CallerAllocatedAmt & 0xFFFF);
+       BuildMI(MBB, MBBI, dl, TII.get(ADDInstr))
+          .addReg(StackReg)
+          .addReg(FPReg)
+          .addReg(TmpReg);
+     }
+  } else if (RetOpcode == PPC::TCRETURNdi) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
+      addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+  } else if (RetOpcode == PPC::TCRETURNri) {
+    MBBI = MBB.getLastNonDebugInstr();
+    assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR));
+  } else if (RetOpcode == PPC::TCRETURNai) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm());
+  } else if (RetOpcode == PPC::TCRETURNdi8) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
+      addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+  } else if (RetOpcode == PPC::TCRETURNri8) {
+    MBBI = MBB.getLastNonDebugInstr();
+    assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8));
+  } else if (RetOpcode == PPC::TCRETURNai8) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm());
+  }
+}
+
+/// MustSaveLR - Return true if this function requires that we save the LR
+/// register onto the stack in the prolog and restore it in the epilog of the
+/// function.
+static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
+  const PPCFunctionInfo *MFI = MF.getInfo<PPCFunctionInfo>();
+
+  // We need a save/restore of LR if there is any def of LR (which is
+  // defined by calls, including the PIC setup sequence), or if there is
+  // some use of the LR stack slot (e.g. for builtin_return_address).
+  // (LR comes in 32 and 64 bit versions.)
+  MachineRegisterInfo::def_iterator RI = MF.getRegInfo().def_begin(LR);
+  return RI !=MF.getRegInfo().def_end() || MFI->isLRStoreRequired();
+}
+
+void
+PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                   RegScavenger *) const {
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+
+  //  Save and clear the LR state.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  unsigned LR = RegInfo->getRARegister();
+  FI->setMustSaveLR(MustSaveLR(MF, LR));
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MRI.setPhysRegUnused(LR);
+
+  //  Save R31 if necessary
+  int FPSI = FI->getFramePointerSaveIndex();
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwinABI  = Subtarget.isDarwinABI();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!FPSI && needsFP(MF)) {
+    // Find out what the fix offset of the frame pointer save area.
+    int FPOffset = getFramePointerSaveOffset(isPPC64, isDarwinABI);
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);
+  }
+
+  // Reserve stack space to move the linkage area to in case of a tail call.
+  int TCSPDelta = 0;
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      (TCSPDelta = FI->getTailCallSPDelta()) < 0) {
+    MFI->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true);
+  }
+
+  // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the 
+  // function uses CR 2, 3, or 4.
+  if (!isPPC64 && !isDarwinABI && 
+      (MRI.isPhysRegUsed(PPC::CR2) ||
+       MRI.isPhysRegUsed(PPC::CR3) ||
+       MRI.isPhysRegUsed(PPC::CR4))) {
+    int FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true);
+    FI->setCRSpillFrameIndex(FrameIdx);
+  }
+}
+
+void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  // Early exit if not using the SVR4 ABI.
+  if (!Subtarget.isSVR4ABI()) {
+    addScavengingSpillSlot(MF, RS);
+    return;
+  }
+
+  // Get callee saved register information.
+  MachineFrameInfo *FFI = MF.getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = FFI->getCalleeSavedInfo();
+
+  // Early exit if no callee saved registers are modified!
+  if (CSI.empty() && !needsFP(MF)) {
+    addScavengingSpillSlot(MF, RS);
+    return;
+  }
+
+  unsigned MinGPR = PPC::R31;
+  unsigned MinG8R = PPC::X31;
+  unsigned MinFPR = PPC::F31;
+  unsigned MinVR = PPC::V31;
+
+  bool HasGPSaveArea = false;
+  bool HasG8SaveArea = false;
+  bool HasFPSaveArea = false;
+  bool HasVRSAVESaveArea = false;
+  bool HasVRSaveArea = false;
+
+  SmallVector<CalleeSavedInfo, 18> GPRegs;
+  SmallVector<CalleeSavedInfo, 18> G8Regs;
+  SmallVector<CalleeSavedInfo, 18> FPRegs;
+  SmallVector<CalleeSavedInfo, 18> VRegs;
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (PPC::GPRCRegClass.contains(Reg)) {
+      HasGPSaveArea = true;
+
+      GPRegs.push_back(CSI[i]);
+
+      if (Reg < MinGPR) {
+        MinGPR = Reg;
+      }
+    } else if (PPC::G8RCRegClass.contains(Reg)) {
+      HasG8SaveArea = true;
+
+      G8Regs.push_back(CSI[i]);
+
+      if (Reg < MinG8R) {
+        MinG8R = Reg;
+      }
+    } else if (PPC::F8RCRegClass.contains(Reg)) {
+      HasFPSaveArea = true;
+
+      FPRegs.push_back(CSI[i]);
+
+      if (Reg < MinFPR) {
+        MinFPR = Reg;
+      }
+    } else if (PPC::CRBITRCRegClass.contains(Reg) ||
+               PPC::CRRCRegClass.contains(Reg)) {
+      ; // do nothing, as we already know whether CRs are spilled
+    } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
+      HasVRSAVESaveArea = true;
+    } else if (PPC::VRRCRegClass.contains(Reg)) {
+      HasVRSaveArea = true;
+
+      VRegs.push_back(CSI[i]);
+
+      if (Reg < MinVR) {
+        MinVR = Reg;
+      }
+    } else {
+      llvm_unreachable("Unknown RegisterClass!");
+    }
+  }
+
+  PPCFunctionInfo *PFI = MF.getInfo<PPCFunctionInfo>();
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+
+  int64_t LowerBound = 0;
+
+  // Take into account stack space reserved for tail calls.
+  int TCSPDelta = 0;
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      (TCSPDelta = PFI->getTailCallSPDelta()) < 0) {
+    LowerBound = TCSPDelta;
+  }
+
+  // The Floating-point register save area is right below the back chain word
+  // of the previous stack frame.
+  if (HasFPSaveArea) {
+    for (unsigned i = 0, e = FPRegs.size(); i != e; ++i) {
+      int FI = FPRegs[i].getFrameIdx();
+
+      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+    }
+
+    LowerBound -= (31 - TRI->getEncodingValue(MinFPR) + 1) * 8;
+  }
+
+  // Check whether the frame pointer register is allocated. If so, make sure it
+  // is spilled to the correct offset.
+  if (needsFP(MF)) {
+    HasGPSaveArea = true;
+
+    int FI = PFI->getFramePointerSaveIndex();
+    assert(FI && "No Frame Pointer Save Slot!");
+
+    FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+  }
+
+  // General register save area starts right below the Floating-point
+  // register save area.
+  if (HasGPSaveArea || HasG8SaveArea) {
+    // Move general register save area spill slots down, taking into account
+    // the size of the Floating-point register save area.
+    for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) {
+      int FI = GPRegs[i].getFrameIdx();
+
+      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+    }
+
+    // Move general register save area spill slots down, taking into account
+    // the size of the Floating-point register save area.
+    for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) {
+      int FI = G8Regs[i].getFrameIdx();
+
+      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+    }
+
+    unsigned MinReg =
+      std::min<unsigned>(TRI->getEncodingValue(MinGPR),
+                         TRI->getEncodingValue(MinG8R));
+
+    if (Subtarget.isPPC64()) {
+      LowerBound -= (31 - MinReg + 1) * 8;
+    } else {
+      LowerBound -= (31 - MinReg + 1) * 4;
+    }
+  }
+
+  // For 32-bit only, the CR save area is below the general register
+  // save area.  For 64-bit SVR4, the CR save area is addressed relative
+  // to the stack pointer and hence does not need an adjustment here.
+  // Only CR2 (the first nonvolatile spilled) has an associated frame
+  // index so that we have a single uniform save area.
+  if (spillsCR(MF) && !(Subtarget.isPPC64() && Subtarget.isSVR4ABI())) {
+    // Adjust the frame index of the CR spill slot.
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+      unsigned Reg = CSI[i].getReg();
+
+      if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2)
+	  // Leave Darwin logic as-is.
+	  || (!Subtarget.isSVR4ABI() &&
+	      (PPC::CRBITRCRegClass.contains(Reg) ||
+	       PPC::CRRCRegClass.contains(Reg)))) {
+        int FI = CSI[i].getFrameIdx();
+
+        FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+      }
+    }
+
+    LowerBound -= 4; // The CR save area is always 4 bytes long.
+  }
+
+  if (HasVRSAVESaveArea) {
+    // FIXME SVR4: Is it actually possible to have multiple elements in CSI
+    //             which have the VRSAVE register class?
+    // Adjust the frame index of the VRSAVE spill slot.
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+      unsigned Reg = CSI[i].getReg();
+
+      if (PPC::VRSAVERCRegClass.contains(Reg)) {
+        int FI = CSI[i].getFrameIdx();
+
+        FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+      }
+    }
+
+    LowerBound -= 4; // The VRSAVE save area is always 4 bytes long.
+  }
+
+  if (HasVRSaveArea) {
+    // Insert alignment padding, we need 16-byte alignment.
+    LowerBound = (LowerBound - 15) & ~(15);
+
+    for (unsigned i = 0, e = VRegs.size(); i != e; ++i) {
+      int FI = VRegs[i].getFrameIdx();
+
+      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+    }
+  }
+
+  addScavengingSpillSlot(MF, RS);
+}
+
+void
+PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
+                                         RegScavenger *RS) const {
+  // Reserve a slot closest to SP or frame pointer if we have a dynalloc or
+  // a large stack, which will require scavenging a register to materialize a
+  // large offset.
+
+  // We need to have a scavenger spill slot for spills if the frame size is
+  // large. In case there is no free register for large-offset addressing,
+  // this slot is used for the necessary emergency spill. Also, we need the
+  // slot for dynamic stack allocations.
+
+  // The scavenger might be invoked if the frame offset does not fit into
+  // the 16-bit immediate. We don't know the complete frame size here
+  // because we've not yet computed callee-saved register spills or the
+  // needed alignment padding.
+  unsigned StackSize = determineFrameLayout(MF, false, true);
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  if (MFI->hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) ||
+      hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) {
+    const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+    const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+    const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC;
+    RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                       RC->getAlignment(),
+                                                       false));
+
+    // These kinds of spills might need two registers.
+    if (spillsCR(MF) || spillsVRSAVE(MF))
+      RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                         RC->getAlignment(),
+                                                         false));
+
+  }
+}
+
+bool 
+PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+				     MachineBasicBlock::iterator MI,
+				     const std::vector<CalleeSavedInfo> &CSI,
+				     const TargetRegisterInfo *TRI) const {
+
+  // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+  // Return false otherwise to maintain pre-existing behavior.
+  if (!Subtarget.isSVR4ABI())
+    return false;
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+  DebugLoc DL;
+  bool CRSpilled = false;
+  
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    // CR2 through CR4 are the nonvolatile CR fields.
+    bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
+
+    if (CRSpilled && IsCRField)
+      continue;
+
+    // Add the callee-saved register as live-in; it's killed at the spill.
+    MBB.addLiveIn(Reg);
+
+    // Insert the spill to the stack frame.
+    if (IsCRField) {
+      CRSpilled = true;
+      // The first time we see a CR field, store the whole CR into the
+      // save slot via GPR12 (available in the prolog for 32- and 64-bit).
+      if (Subtarget.isPPC64()) {
+	// 64-bit:  SP+8
+	MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::MFCR8), PPC::X12));
+	MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::STW8))
+			       .addReg(PPC::X12,
+				       getKillRegState(true))
+			       .addImm(8)
+			       .addReg(PPC::X1));
+      } else {
+	// 32-bit:  FP-relative.  Note that we made sure CR2-CR4 all have
+	// the same frame index in PPCRegisterInfo::hasReservedSpillSlot.
+	MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12));
+	MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW))
+					 .addReg(PPC::R12,
+						 getKillRegState(true)),
+					 CSI[i].getFrameIdx()));
+      }
+      
+      // Record that we spill the CR in this function.
+      PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+      FuncInfo->setSpillsCR();
+    } else {
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.storeRegToStackSlot(MBB, MI, Reg, true,
+			      CSI[i].getFrameIdx(), RC, TRI);
+    }
+  }
+  return true;
+}
+
+static void
+restoreCRs(bool isPPC64, bool CR2Spilled, bool CR3Spilled, bool CR4Spilled,
+	   MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+	   const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+  DebugLoc DL;
+  unsigned RestoreOp, MoveReg;
+
+  if (isPPC64) {
+    // 64-bit:  SP+8
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::LWZ8), PPC::X12)
+	       .addImm(8)
+	       .addReg(PPC::X1));
+    RestoreOp = PPC::MTCRF8;
+    MoveReg = PPC::X12;
+  } else {
+    // 32-bit:  FP-relative
+    MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ),
+					     PPC::R12),
+				     CSI[CSIIndex].getFrameIdx()));
+    RestoreOp = PPC::MTCRF;
+    MoveReg = PPC::R12;
+  }
+  
+  if (CR2Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2)
+               .addReg(MoveReg, getKillRegState(!CR3Spilled && !CR4Spilled)));
+
+  if (CR3Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR3)
+               .addReg(MoveReg, getKillRegState(!CR4Spilled)));
+
+  if (CR4Spilled)
+    MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR4)
+               .addReg(MoveReg, getKillRegState(true)));
+}
+
+void PPCFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (MF.getTarget().Options.GuaranteedTailCallOpt &&
+      I->getOpcode() == PPC::ADJCALLSTACKUP) {
+    // Add (actually subtract) back the amount the callee popped on return.
+    if (int CalleeAmt =  I->getOperand(1).getImm()) {
+      bool is64Bit = Subtarget.isPPC64();
+      CalleeAmt *= -1;
+      unsigned StackReg = is64Bit ? PPC::X1 : PPC::R1;
+      unsigned TmpReg = is64Bit ? PPC::X0 : PPC::R0;
+      unsigned ADDIInstr = is64Bit ? PPC::ADDI8 : PPC::ADDI;
+      unsigned ADDInstr = is64Bit ? PPC::ADD8 : PPC::ADD4;
+      unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS;
+      unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI;
+      MachineInstr *MI = I;
+      DebugLoc dl = MI->getDebugLoc();
+
+      if (isInt<16>(CalleeAmt)) {
+        BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg)
+          .addReg(StackReg, RegState::Kill)
+          .addImm(CalleeAmt);
+      } else {
+        MachineBasicBlock::iterator MBBI = I;
+        BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+          .addImm(CalleeAmt >> 16);
+        BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
+          .addReg(TmpReg, RegState::Kill)
+          .addImm(CalleeAmt & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, TII.get(ADDInstr), StackReg)
+          .addReg(StackReg, RegState::Kill)
+          .addReg(TmpReg);
+      }
+    }
+  }
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
+
+bool 
+PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+					MachineBasicBlock::iterator MI,
+				        const std::vector<CalleeSavedInfo> &CSI,
+					const TargetRegisterInfo *TRI) const {
+
+  // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+  // Return false otherwise to maintain pre-existing behavior.
+  if (!Subtarget.isSVR4ABI())
+    return false;
+
+  MachineFunction *MF = MBB.getParent();
+  const PPCInstrInfo &TII =
+    *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+  bool CR2Spilled = false;
+  bool CR3Spilled = false;
+  bool CR4Spilled = false;
+  unsigned CSIIndex = 0;
+
+  // Initialize insertion-point logic; we will be restoring in reverse
+  // order of spill.
+  MachineBasicBlock::iterator I = MI, BeforeI = I;
+  bool AtStart = I == MBB.begin();
+
+  if (!AtStart)
+    --BeforeI;
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+
+    if (Reg == PPC::CR2) {
+      CR2Spilled = true;
+      // The spill slot is associated only with CR2, which is the
+      // first nonvolatile spilled.  Save it here.
+      CSIIndex = i;
+      continue;
+    } else if (Reg == PPC::CR3) {
+      CR3Spilled = true;
+      continue;
+    } else if (Reg == PPC::CR4) {
+      CR4Spilled = true;
+      continue;
+    } else {
+      // When we first encounter a non-CR register after seeing at
+      // least one CR register, restore all spilled CRs together.
+      if ((CR2Spilled || CR3Spilled || CR4Spilled)
+	  && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+	restoreCRs(Subtarget.isPPC64(), CR2Spilled, CR3Spilled, CR4Spilled,
+		   MBB, I, CSI, CSIIndex);
+	CR2Spilled = CR3Spilled = CR4Spilled = false;
+      }
+
+      // Default behavior for non-CR saves.
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
+			       RC, TRI);
+      assert(I != MBB.begin() &&
+	     "loadRegFromStackSlot didn't insert any code!");
+      }
+
+    // Insert in reverse order.
+    if (AtStart)
+      I = MBB.begin();
+    else {
+      I = BeforeI;
+      ++I;
+    }	    
+  }
+
+  // If we haven't yet spilled the CRs, do so now.
+  if (CR2Spilled || CR3Spilled || CR4Spilled)
+    restoreCRs(Subtarget.isPPC64(), CR2Spilled, CR3Spilled, CR4Spilled,
+	       MBB, I, CSI, CSIIndex);
+
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
new file mode 100644
index 000000000000..6f5f9368c6c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -0,0 +1,293 @@
+//===-- PPCFrameLowering.h - Define frame lowering for PowerPC --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_FRAMEINFO_H
+#define POWERPC_FRAMEINFO_H
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class PPCSubtarget;
+
+class PPCFrameLowering: public TargetFrameLowering {
+  const PPCSubtarget &Subtarget;
+
+public:
+  PPCFrameLowering(const PPCSubtarget &sti)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown,
+        (sti.hasQPX() || sti.isBGQ()) ? 32 : 16, 0),
+      Subtarget(sti) {
+  }
+
+  unsigned determineFrameLayout(MachineFunction &MF,
+                                bool UpdateMF = true,
+                                bool UseEstimate = false) const;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+  bool needsFP(const MachineFunction &MF) const;
+  void replaceFPWithRealFP(MachineFunction &MF) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS = NULL) const;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = NULL) const;
+  void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const;
+
+  /// targetHandlesStackFrameRounding - Returns true if the target is
+  /// responsible for rounding up the stack frame (probably at emitPrologue
+  /// time).
+  bool targetHandlesStackFrameRounding() const { return true; }
+
+  /// getReturnSaveOffset - Return the previous frame offset to save the
+  /// return address.
+  static unsigned getReturnSaveOffset(bool isPPC64, bool isDarwinABI) {
+    if (isDarwinABI)
+      return isPPC64 ? 16 : 8;
+    // SVR4 ABI:
+    return isPPC64 ? 16 : 4;
+  }
+
+  /// getFramePointerSaveOffset - Return the previous frame offset to save the
+  /// frame pointer.
+  static unsigned getFramePointerSaveOffset(bool isPPC64, bool isDarwinABI) {
+    // For the Darwin ABI:
+    // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area
+    // for saving the frame pointer (if needed.)  While the published ABI has
+    // not used this slot since at least MacOSX 10.2, there is older code
+    // around that does use it, and that needs to continue to work.
+    if (isDarwinABI)
+      return isPPC64 ? -8U : -4U;
+
+    // SVR4 ABI: First slot in the general register save area.
+    return isPPC64 ? -8U : -4U;
+  }
+
+  /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
+  ///
+  static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI) {
+    if (isDarwinABI || isPPC64)
+      return 6 * (isPPC64 ? 8 : 4);
+
+    // SVR4 ABI:
+    return 8;
+  }
+
+  /// getMinCallArgumentsSize - Return the size of the minium PowerPC ABI
+  /// argument area.
+  static unsigned getMinCallArgumentsSize(bool isPPC64, bool isDarwinABI) {
+    // For the Darwin ABI / 64-bit SVR4 ABI:
+    // The prolog code of the callee may store up to 8 GPR argument registers to
+    // the stack, allowing va_start to index over them in memory if its varargs.
+    // Because we cannot tell if this is needed on the caller side, we have to
+    // conservatively assume that it is needed.  As such, make sure we have at
+    // least enough stack space for the caller to store the 8 GPRs.
+    if (isDarwinABI || isPPC64)
+      return 8 * (isPPC64 ? 8 : 4);
+
+    // 32-bit SVR4 ABI:
+    // There is no default stack allocated for the 8 first GPR arguments.
+    return 0;
+  }
+
+  /// getMinCallFrameSize - Return the minimum size a call frame can be using
+  /// the PowerPC ABI.
+  static unsigned getMinCallFrameSize(bool isPPC64, bool isDarwinABI) {
+    // The call frame needs to be at least big enough for linkage and 8 args.
+    return getLinkageSize(isPPC64, isDarwinABI) +
+           getMinCallArgumentsSize(isPPC64, isDarwinABI);
+  }
+
+  // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
+  const SpillSlot *
+  getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+    if (Subtarget.isDarwinABI()) {
+      NumEntries = 1;
+      if (Subtarget.isPPC64()) {
+        static const SpillSlot darwin64Offsets = {PPC::X31, -8};
+        return &darwin64Offsets;
+      } else {
+        static const SpillSlot darwinOffsets = {PPC::R31, -4};
+        return &darwinOffsets;
+      }
+    }
+
+    // Early exit if not using the SVR4 ABI.
+    if (!Subtarget.isSVR4ABI()) {
+      NumEntries = 0;
+      return 0;
+    }
+
+    // Note that the offsets here overlap, but this is fixed up in
+    // processFunctionBeforeFrameFinalized.
+
+    static const SpillSlot Offsets[] = {
+      // Floating-point register save area offsets.
+      {PPC::F31, -8},
+      {PPC::F30, -16},
+      {PPC::F29, -24},
+      {PPC::F28, -32},
+      {PPC::F27, -40},
+      {PPC::F26, -48},
+      {PPC::F25, -56},
+      {PPC::F24, -64},
+      {PPC::F23, -72},
+      {PPC::F22, -80},
+      {PPC::F21, -88},
+      {PPC::F20, -96},
+      {PPC::F19, -104},
+      {PPC::F18, -112},
+      {PPC::F17, -120},
+      {PPC::F16, -128},
+      {PPC::F15, -136},
+      {PPC::F14, -144},
+
+      // General register save area offsets.
+      {PPC::R31, -4},
+      {PPC::R30, -8},
+      {PPC::R29, -12},
+      {PPC::R28, -16},
+      {PPC::R27, -20},
+      {PPC::R26, -24},
+      {PPC::R25, -28},
+      {PPC::R24, -32},
+      {PPC::R23, -36},
+      {PPC::R22, -40},
+      {PPC::R21, -44},
+      {PPC::R20, -48},
+      {PPC::R19, -52},
+      {PPC::R18, -56},
+      {PPC::R17, -60},
+      {PPC::R16, -64},
+      {PPC::R15, -68},
+      {PPC::R14, -72},
+
+      // CR save area offset.  We map each of the nonvolatile CR fields
+      // to the slot for CR2, which is the first of the nonvolatile CR
+      // fields to be assigned, so that we only allocate one save slot.
+      // See PPCRegisterInfo::hasReservedSpillSlot() for more information.
+      {PPC::CR2, -4},
+
+      // VRSAVE save area offset.
+      {PPC::VRSAVE, -4},
+
+      // Vector register save area
+      {PPC::V31, -16},
+      {PPC::V30, -32},
+      {PPC::V29, -48},
+      {PPC::V28, -64},
+      {PPC::V27, -80},
+      {PPC::V26, -96},
+      {PPC::V25, -112},
+      {PPC::V24, -128},
+      {PPC::V23, -144},
+      {PPC::V22, -160},
+      {PPC::V21, -176},
+      {PPC::V20, -192}
+    };
+
+    static const SpillSlot Offsets64[] = {
+      // Floating-point register save area offsets.
+      {PPC::F31, -8},
+      {PPC::F30, -16},
+      {PPC::F29, -24},
+      {PPC::F28, -32},
+      {PPC::F27, -40},
+      {PPC::F26, -48},
+      {PPC::F25, -56},
+      {PPC::F24, -64},
+      {PPC::F23, -72},
+      {PPC::F22, -80},
+      {PPC::F21, -88},
+      {PPC::F20, -96},
+      {PPC::F19, -104},
+      {PPC::F18, -112},
+      {PPC::F17, -120},
+      {PPC::F16, -128},
+      {PPC::F15, -136},
+      {PPC::F14, -144},
+
+      // General register save area offsets.
+      {PPC::X31, -8},
+      {PPC::X30, -16},
+      {PPC::X29, -24},
+      {PPC::X28, -32},
+      {PPC::X27, -40},
+      {PPC::X26, -48},
+      {PPC::X25, -56},
+      {PPC::X24, -64},
+      {PPC::X23, -72},
+      {PPC::X22, -80},
+      {PPC::X21, -88},
+      {PPC::X20, -96},
+      {PPC::X19, -104},
+      {PPC::X18, -112},
+      {PPC::X17, -120},
+      {PPC::X16, -128},
+      {PPC::X15, -136},
+      {PPC::X14, -144},
+
+      // VRSAVE save area offset.
+      {PPC::VRSAVE, -4},
+
+      // Vector register save area
+      {PPC::V31, -16},
+      {PPC::V30, -32},
+      {PPC::V29, -48},
+      {PPC::V28, -64},
+      {PPC::V27, -80},
+      {PPC::V26, -96},
+      {PPC::V25, -112},
+      {PPC::V24, -128},
+      {PPC::V23, -144},
+      {PPC::V22, -160},
+      {PPC::V21, -176},
+      {PPC::V20, -192}
+    };
+
+    if (Subtarget.isPPC64()) {
+      NumEntries = array_lengthof(Offsets64);
+
+      return Offsets64;
+    } else {
+      NumEntries = array_lengthof(Offsets);
+
+      return Offsets;
+    }
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
new file mode 100644
index 000000000000..4bf1e3396429
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -0,0 +1,245 @@
+//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "PPCHazardRecognizers.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Scoreboard Hazard Recognizer
+void PPCScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) {
+  const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+  if (!MCID)
+    // This is a PPC pseudo-instruction.
+    return;
+
+  ScoreboardHazardRecognizer::EmitInstruction(SU);
+}
+
+ScheduleHazardRecognizer::HazardType
+PPCScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+  return ScoreboardHazardRecognizer::getHazardType(SU, Stalls);
+}
+
+void PPCScoreboardHazardRecognizer::AdvanceCycle() {
+  ScoreboardHazardRecognizer::AdvanceCycle();
+}
+
+void PPCScoreboardHazardRecognizer::Reset() {
+  ScoreboardHazardRecognizer::Reset();
+}
+
+//===----------------------------------------------------------------------===//
+// PowerPC 970 Hazard Recognizer
+//
+// This models the dispatch group formation of the PPC970 processor.  Dispatch
+// groups are bundles of up to five instructions that can contain various mixes
+// of instructions.  The PPC970 can dispatch a peak of 4 non-branch and one
+// branch instruction per-cycle.
+//
+// There are a number of restrictions to dispatch group formation: some
+// instructions can only be issued in the first slot of a dispatch group, & some
+// instructions fill an entire dispatch group.  Additionally, only branches can
+// issue in the 5th (last) slot.
+//
+// Finally, there are a number of "structural" hazards on the PPC970.  These
+// conditions cause large performance penalties due to misprediction, recovery,
+// and replay logic that has to happen.  These cases include setting a CTR and
+// branching through it in the same dispatch group, and storing to an address,
+// then loading from the same address within a dispatch group.  To avoid these
+// conditions, we insert no-op instructions when appropriate.
+//
+// FIXME: This is missing some significant cases:
+//   1. Modeling of microcoded instructions.
+//   2. Handling of serialized operations.
+//   3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
+//
+
+PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii)
+  : TII(tii) {
+  EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::EndDispatchGroup() {
+  DEBUG(errs() << "=== Start of dispatch group\n");
+  NumIssued = 0;
+
+  // Structural hazard info.
+  HasCTRSet = false;
+  NumStores = 0;
+}
+
+
+PPCII::PPC970_Unit
+PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
+                                     bool &isFirst, bool &isSingle,
+                                     bool &isCracked,
+                                     bool &isLoad, bool &isStore) {
+  const MCInstrDesc &MCID = TII.get(Opcode);
+
+  isLoad  = MCID.mayLoad();
+  isStore = MCID.mayStore();
+
+  uint64_t TSFlags = MCID.TSFlags;
+
+  isFirst   = TSFlags & PPCII::PPC970_First;
+  isSingle  = TSFlags & PPCII::PPC970_Single;
+  isCracked = TSFlags & PPCII::PPC970_Cracked;
+  return (PPCII::PPC970_Unit)(TSFlags & PPCII::PPC970_Mask);
+}
+
+/// isLoadOfStoredAddress - If we have a load from the previously stored pointer
+/// as indicated by StorePtr1/StorePtr2/StoreSize, return true.
+bool PPCHazardRecognizer970::
+isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+  const Value *LoadValue) const {
+  for (unsigned i = 0, e = NumStores; i != e; ++i) {
+    // Handle exact and commuted addresses.
+    if (LoadValue == StoreValue[i] && LoadOffset == StoreOffset[i])
+      return true;
+
+    // Okay, we don't have an exact match, if this is an indexed offset, see if
+    // we have overlap (which happens during fp->int conversion for example).
+    if (StoreValue[i] == LoadValue) {
+      // Okay the base pointers match, so we have [c1+r] vs [c2+r].  Check
+      // to see if the load and store actually overlap.
+      if (StoreOffset[i] < LoadOffset) {
+        if (int64_t(StoreOffset[i]+StoreSize[i]) > LoadOffset) return true;
+      } else {
+        if (int64_t(LoadOffset+LoadSize) > StoreOffset[i]) return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getHazardType - We return hazard for any non-branch instruction that would
+/// terminate the dispatch group.  We turn NoopHazard for any
+/// instructions that wouldn't terminate the dispatch group that would cause a
+/// pipeline flush.
+ScheduleHazardRecognizer::HazardType PPCHazardRecognizer970::
+getHazardType(SUnit *SU, int Stalls) {
+  assert(Stalls == 0 && "PPC hazards don't support scoreboard lookahead");
+
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return NoHazard;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return NoHazard;
+
+  // We can only issue a PPC970_First/PPC970_Single instruction (such as
+  // crand/mtspr/etc) if this is the first cycle of the dispatch group.
+  if (NumIssued != 0 && (isFirst || isSingle))
+    return Hazard;
+
+  // If this instruction is cracked into two ops by the decoder, we know that
+  // it is not a branch and that it cannot issue if 3 other instructions are
+  // already in the dispatch group.
+  if (isCracked && NumIssued > 2)
+    return Hazard;
+
+  switch (InstrType) {
+  default: llvm_unreachable("Unknown instruction type!");
+  case PPCII::PPC970_FXU:
+  case PPCII::PPC970_LSU:
+  case PPCII::PPC970_FPU:
+  case PPCII::PPC970_VALU:
+  case PPCII::PPC970_VPERM:
+    // We can only issue a branch as the last instruction in a group.
+    if (NumIssued == 4) return Hazard;
+    break;
+  case PPCII::PPC970_CRU:
+    // We can only issue a CR instruction in the first two slots.
+    if (NumIssued >= 2) return Hazard;
+    break;
+  case PPCII::PPC970_BRU:
+    break;
+  }
+
+  // Do not allow MTCTR and BCTRL to be in the same dispatch group.
+  if (HasCTRSet && Opcode == PPC::BCTRL)
+    return NoopHazard;
+
+  // If this is a load following a store, make sure it's not to the same or
+  // overlapping address.
+  if (isLoad && NumStores && !MI->memoperands_empty()) {
+    MachineMemOperand *MO = *MI->memoperands_begin();
+    if (isLoadOfStoredAddress(MO->getSize(),
+                              MO->getOffset(), MO->getValue()))
+      return NoopHazard;
+  }
+
+  return NoHazard;
+}
+
+void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
+  MachineInstr *MI = SU->getInstr();
+
+  if (MI->isDebugValue())
+    return;
+
+  unsigned Opcode = MI->getOpcode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType =
+    GetInstrType(Opcode, isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return;
+
+  // Update structural hazard information.
+  if (Opcode == PPC::MTCTR || Opcode == PPC::MTCTR8) HasCTRSet = true;
+
+  // Track the address stored to.
+  if (isStore && NumStores < 4 && !MI->memoperands_empty()) {
+    MachineMemOperand *MO = *MI->memoperands_begin();
+    StoreSize[NumStores] = MO->getSize();
+    StoreOffset[NumStores] = MO->getOffset();
+    StoreValue[NumStores] = MO->getValue();
+    ++NumStores;
+  }
+
+  if (InstrType == PPCII::PPC970_BRU || isSingle)
+    NumIssued = 4;  // Terminate a d-group.
+  ++NumIssued;
+
+  // If this instruction is cracked into two ops by the decoder, remember that
+  // we issued two pieces.
+  if (isCracked)
+    ++NumIssued;
+
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::AdvanceCycle() {
+  assert(NumIssued < 5 && "Illegal dispatch group!");
+  ++NumIssued;
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::Reset() {
+  EndDispatchGroup();
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
new file mode 100644
index 000000000000..55b45d01b20e
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -0,0 +1,91 @@
+//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCHAZRECS_H
+#define PPCHAZRECS_H
+
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+
+/// PPCScoreboardHazardRecognizer - This class implements a scoreboard-based
+/// hazard recognizer for generic PPC processors.
+class PPCScoreboardHazardRecognizer : public ScoreboardHazardRecognizer {
+  const ScheduleDAG *DAG;
+public:
+  PPCScoreboardHazardRecognizer(const InstrItineraryData *ItinData,
+                         const ScheduleDAG *DAG_) :
+    ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_) {}
+
+  virtual HazardType getHazardType(SUnit *SU, int Stalls);
+  virtual void EmitInstruction(SUnit *SU);
+  virtual void AdvanceCycle();
+  virtual void Reset();
+};
+
+/// PPCHazardRecognizer970 - This class defines a finite state automata that
+/// models the dispatch logic on the PowerPC 970 (aka G5) processor.  This
+/// promotes good dispatch group formation and implements noop insertion to
+/// avoid structural hazards that cause significant performance penalties (e.g.
+/// setting the CTR register then branching through it within a dispatch group),
+/// or storing then loading from the same address within a dispatch group.
+class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
+  const TargetInstrInfo &TII;
+
+  unsigned NumIssued;  // Number of insts issued, including advanced cycles.
+
+  // Various things that can cause a structural hazard.
+
+  // HasCTRSet - If the CTR register is set in this group, disallow BCTRL.
+  bool HasCTRSet;
+
+  // StoredPtr - Keep track of the address of any store.  If we see a load from
+  // the same address (or one that aliases it), disallow the store.  We can have
+  // up to four stores in one dispatch group, hence we track up to 4.
+  //
+  // This is null if we haven't seen a store yet.  We keep track of both
+  // operands of the store here, since we support [r+r] and [r+i] addressing.
+  const Value *StoreValue[4];
+  int64_t StoreOffset[4];
+  uint64_t StoreSize[4];
+  unsigned NumStores;
+
+public:
+  PPCHazardRecognizer970(const TargetInstrInfo &TII);
+  virtual HazardType getHazardType(SUnit *SU, int Stalls);
+  virtual void EmitInstruction(SUnit *SU);
+  virtual void AdvanceCycle();
+  virtual void Reset();
+
+private:
+  /// EndDispatchGroup - Called when we are finishing a new dispatch group.
+  ///
+  void EndDispatchGroup();
+
+  /// GetInstrType - Classify the specified powerpc opcode according to its
+  /// pipeline.
+  PPCII::PPC970_Unit GetInstrType(unsigned Opcode,
+                                  bool &isFirst, bool &isSingle,bool &isCracked,
+                                  bool &isLoad, bool &isStore);
+
+  bool isLoadOfStoredAddress(uint64_t LoadSize, int64_t LoadOffset,
+                             const Value *LoadValue) const;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
new file mode 100644
index 000000000000..95efc11b53c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -0,0 +1,1565 @@
+//===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for PowerPC,
+// converting from a legalized dag to a PPC dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-codegen"
+#include "PPC.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace llvm {
+  void initializePPCDAGToDAGISelPass(PassRegistry&);
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// PPCDAGToDAGISel - PPC specific code to select PPC machine
+  /// instructions for SelectionDAG operations.
+  ///
+  class PPCDAGToDAGISel : public SelectionDAGISel {
+    const PPCTargetMachine &TM;
+    const PPCTargetLowering &PPCLowering;
+    const PPCSubtarget &PPCSubTarget;
+    unsigned GlobalBaseReg;
+  public:
+    explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
+      : SelectionDAGISel(tm), TM(tm),
+        PPCLowering(*TM.getTargetLowering()),
+        PPCSubTarget(*TM.getSubtargetImpl()) {
+      initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      SelectionDAGISel::runOnMachineFunction(MF);
+
+      if (!PPCSubTarget.isSVR4ABI())
+        InsertVRSaveCode(MF);
+
+      return true;
+    }
+
+    virtual void PostprocessISelDAG();
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDValue getI64Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }
+
+    /// getSmallIPtrImm - Return a target constant of pointer type.
+    inline SDValue getSmallIPtrImm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, PPCLowering.getPointerTy());
+    }
+
+    /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s
+    /// with any number of 0s on either side.  The 1s are allowed to wrap from
+    /// LSB to MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.
+    /// 0x0F0F0000 is not, since all 1s are not contiguous.
+    static bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME);
+
+
+    /// isRotateAndMask - Returns true if Mask and Shift can be folded into a
+    /// rotate and mask opcode and mask operation.
+    static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask,
+                                unsigned &SH, unsigned &MB, unsigned &ME);
+
+    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+    /// base register.  Return the virtual register that holds this value.
+    SDNode *getGlobalBaseReg();
+
+    // Select - Convert the specified operand from a target-independent to a
+    // target-specific node if it hasn't already been changed.
+    SDNode *Select(SDNode *N);
+
+    SDNode *SelectBitfieldInsert(SDNode *N);
+
+    /// SelectCC - Select a comparison of the specified values with the
+    /// specified condition code, returning the CR# of the expression.
+    SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, DebugLoc dl);
+
+    /// SelectAddrImm - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement [r+imm].
+    bool SelectAddrImm(SDValue N, SDValue &Disp,
+                       SDValue &Base) {
+      return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG);
+    }
+
+    /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
+    /// immediate field.  Note that the operand at this point is already the
+    /// result of a prior SelectAddressRegImm call.
+    bool SelectAddrImmOffs(SDValue N, SDValue &Out) const {
+      if (N.getOpcode() == ISD::TargetConstant ||
+          N.getOpcode() == ISD::TargetGlobalAddress) {
+        Out = N;
+        return true;
+      }
+
+      return false;
+    }
+
+    /// SelectAddrIdx - Given the specified addressed, check to see if it can be
+    /// represented as an indexed [r+r] operation.  Returns false if it can
+    /// be represented by [r+imm], which are preferred.
+    bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
+      return PPCLowering.SelectAddressRegReg(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrIdxOnly - Given the specified addressed, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
+      return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrImmShift - Returns true if the address N can be represented by
+    /// a base register plus a signed 14-bit displacement [r+imm*4].  Suitable
+    /// for use by STD and friends.
+    bool SelectAddrImmShift(SDValue N, SDValue &Disp, SDValue &Base) {
+      return PPCLowering.SelectAddressRegImmShift(N, Disp, Base, *CurDAG);
+    }
+
+    // Select an address into a single register.
+    bool SelectAddr(SDValue N, SDValue &Base) {
+      Base = N;
+      return true;
+    }
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.  It is always correct to compute the value into
+    /// a register.  The case of adding a (possibly relocatable) constant to a
+    /// register can be improved, but it is wrong to substitute Reg+Reg for
+    /// Reg in an asm, because the load or store opcode would have to change.
+   virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDValue> &OutOps) {
+      OutOps.push_back(Op);
+      return false;
+    }
+
+    void InsertVRSaveCode(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "PowerPC DAG->DAG Pattern Instruction Selection";
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "PPCGenDAGISel.inc"
+
+private:
+    SDNode *SelectSETCC(SDNode *N);
+  };
+}
+
+/// InsertVRSaveCode - Once the entire function has been instruction selected,
+/// all virtual registers are created and all machine instructions are built,
+/// check to see if we need to save/restore VRSAVE.  If so, do it.
+void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
+  // Check to see if this function uses vector registers, which means we have to
+  // save and restore the VRSAVE register and update it with the regs we use.
+  //
+  // In this case, there will be virtual registers of vector type created
+  // by the scheduler.  Detect them now.
+  bool HasVectorVReg = false;
+  for (unsigned i = 0, e = RegInfo->getNumVirtRegs(); i != e; ++i) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+    if (RegInfo->getRegClass(Reg) == &PPC::VRRCRegClass) {
+      HasVectorVReg = true;
+      break;
+    }
+  }
+  if (!HasVectorVReg) return;  // nothing to do.
+
+  // If we have a vector register, we want to emit code into the entry and exit
+  // blocks to save and restore the VRSAVE register.  We do this here (instead
+  // of marking all vector instructions as clobbering VRSAVE) for two reasons:
+  //
+  // 1. This (trivially) reduces the load on the register allocator, by not
+  //    having to represent the live range of the VRSAVE register.
+  // 2. This (more significantly) allows us to create a temporary virtual
+  //    register to hold the saved VRSAVE value, allowing this temporary to be
+  //    register allocated, instead of forcing it to be spilled to the stack.
+
+  // Create two vregs - one to hold the VRSAVE register that is live-in to the
+  // function and one for the value after having bits or'd into it.
+  unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+  unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  MachineBasicBlock &EntryBB = *Fn.begin();
+  DebugLoc dl;
+  // Emit the following code into the entry block:
+  // InVRSAVE = MFVRSAVE
+  // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
+  // MTVRSAVE UpdatedVRSAVE
+  MachineBasicBlock::iterator IP = EntryBB.begin();  // Insert Point
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE);
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE),
+          UpdatedVRSAVE).addReg(InVRSAVE);
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);
+
+  // Find all return blocks, outputting a restore in each epilog.
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    if (!BB->empty() && BB->back().isReturn()) {
+      IP = BB->end(); --IP;
+
+      // Skip over all terminator instructions, which are part of the return
+      // sequence.
+      MachineBasicBlock::iterator I2 = IP;
+      while (I2 != BB->begin() && (--I2)->isTerminator())
+        IP = I2;
+
+      // Emit: MTVRSAVE InVRSave
+      BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
+    }
+  }
+}
+
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
+  if (!GlobalBaseReg) {
+    const TargetInstrInfo &TII = *TM.getInstrInfo();
+    // Insert the set of GlobalBaseReg into the first MBB of the function
+    MachineBasicBlock &FirstMBB = MF->front();
+    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    DebugLoc dl;
+
+    if (PPCLowering.getPointerTy() == MVT::i32) {
+      GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+    } else {
+      GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::G8RCRegClass);
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR8));
+      BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
+    }
+  }
+  return CurDAG->getRegister(GlobalBaseReg,
+                             PPCLowering.getPointerTy()).getNode();
+}
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (N->getOpcode() != ISD::Constant)
+    return false;
+
+  Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+  return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+/// isInt64Immediate - This method tests to see if the node is a 64-bit constant
+/// operand.  If so Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
+  if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) {
+    Imm = cast<ConstantSDNode>(N)->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isInt32Immediate - This method tests to see if a constant operand.
+// If so Imm will receive the 32 bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc
+         && isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+  if (isShiftedMask_32(Val)) {
+    // look for the first non-zero bit
+    MB = CountLeadingZeros_32(Val);
+    // look for the first zero bit after the run of ones
+    ME = CountLeadingZeros_32((Val - 1) ^ Val);
+    return true;
+  } else {
+    Val = ~Val; // invert mask
+    if (isShiftedMask_32(Val)) {
+      // effectively look for the first zero bit
+      ME = CountLeadingZeros_32(Val) - 1;
+      // effectively look for the first one bit after the run of zeros
+      MB = CountLeadingZeros_32((Val - 1) ^ Val) + 1;
+      return true;
+    }
+  }
+  // no run present
+  return false;
+}
+
+bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
+                                      bool isShiftMask, unsigned &SH,
+                                      unsigned &MB, unsigned &ME) {
+  // Don't even go down this path for i64, since different logic will be
+  // necessary for rldicl/rldicr/rldimi.
+  if (N->getValueType(0) != MVT::i32)
+    return false;
+
+  unsigned Shift  = 32;
+  unsigned Indeterminant = ~0;  // bit mask marking indeterminant results
+  unsigned Opcode = N->getOpcode();
+  if (N->getNumOperands() != 2 ||
+      !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31))
+    return false;
+
+  if (Opcode == ISD::SHL) {
+    // apply shift left to mask if it comes first
+    if (isShiftMask) Mask = Mask << Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu << Shift);
+  } else if (Opcode == ISD::SRL) {
+    // apply shift right to mask if it comes first
+    if (isShiftMask) Mask = Mask >> Shift;
+    // determine which bits are made indeterminant by shift
+    Indeterminant = ~(0xFFFFFFFFu >> Shift);
+    // adjust for the left rotate
+    Shift = 32 - Shift;
+  } else if (Opcode == ISD::ROTL) {
+    Indeterminant = 0;
+  } else {
+    return false;
+  }
+
+  // if the mask doesn't intersect any Indeterminant bits
+  if (Mask && !(Mask & Indeterminant)) {
+    SH = Shift & 31;
+    // make sure the mask is still a mask (wrap arounds may not be)
+    return isRunOfOnes(Mask, MB, ME);
+  }
+  return false;
+}
+
+/// SelectBitfieldInsert - turn an or of two masked values into
+/// the rotate left word immediate then mask insert (rlwimi) instruction.
+SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+
+  APInt LKZ, LKO, RKZ, RKO;
+  CurDAG->ComputeMaskedBits(Op0, LKZ, LKO);
+  CurDAG->ComputeMaskedBits(Op1, RKZ, RKO);
+
+  unsigned TargetMask = LKZ.getZExtValue();
+  unsigned InsertMask = RKZ.getZExtValue();
+
+  if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
+    unsigned Op0Opc = Op0.getOpcode();
+    unsigned Op1Opc = Op1.getOpcode();
+    unsigned Value, SH = 0;
+    TargetMask = ~TargetMask;
+    InsertMask = ~InsertMask;
+
+    // If the LHS has a foldable shift and the RHS does not, then swap it to the
+    // RHS so that we can fold the shift into the insert.
+    if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
+      if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
+          Op0.getOperand(0).getOpcode() == ISD::SRL) {
+        if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
+            Op1.getOperand(0).getOpcode() != ISD::SRL) {
+          std::swap(Op0, Op1);
+          std::swap(Op0Opc, Op1Opc);
+          std::swap(TargetMask, InsertMask);
+        }
+      }
+    } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
+      if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
+          Op1.getOperand(0).getOpcode() != ISD::SRL) {
+        std::swap(Op0, Op1);
+        std::swap(Op0Opc, Op1Opc);
+        std::swap(TargetMask, InsertMask);
+      }
+    }
+
+    unsigned MB, ME;
+    if (InsertMask && isRunOfOnes(InsertMask, MB, ME)) {
+      SDValue Tmp1, Tmp2;
+
+      if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
+          isInt32Immediate(Op1.getOperand(1), Value)) {
+        Op1 = Op1.getOperand(0);
+        SH  = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
+      }
+      if (Op1Opc == ISD::AND) {
+        unsigned SHOpc = Op1.getOperand(0).getOpcode();
+        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) &&
+            isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+          Op1 = Op1.getOperand(0).getOperand(0);
+          SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
+        } else {
+          Op1 = Op1.getOperand(0);
+        }
+      }
+
+      SH &= 31;
+      SDValue Ops[] = { Op0, Op1, getI32Imm(SH), getI32Imm(MB),
+                          getI32Imm(ME) };
+      return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
+    }
+  }
+  return 0;
+}
+
+/// SelectCC - Select a comparison of the specified values with the specified
+/// condition code, returning the CR# of the expression.
+SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
+                                    ISD::CondCode CC, DebugLoc dl) {
+  // Always select the LHS.
+  unsigned Opc;
+
+  if (LHS.getValueType() == MVT::i32) {
+    unsigned Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt32Immediate(RHS, Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt<16>((int)Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136
+        //   cmpw cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmplwi cr0,r0,0x5678
+        //   beq cr0,L6
+        SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS,
+                                           getI32Imm(Imm >> 16)), 0);
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor,
+                                              getI32Imm(Imm & 0xFFFF)), 0);
+      }
+      Opc = PPC::CMPLW;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (isInt32Immediate(RHS, Imm) && isUInt<16>(Imm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+                                              getI32Imm(Imm & 0xFFFF)), 0);
+      Opc = PPC::CMPLW;
+    } else {
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+                                              getI32Imm((int)SImm & 0xFFFF)),
+                         0);
+      Opc = PPC::CMPW;
+    }
+  } else if (LHS.getValueType() == MVT::i64) {
+    uint64_t Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt64Immediate(RHS.getNode(), Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt<16>(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136
+        //   cmpd cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmpldi cr0,r0,0x5678
+        //   beq cr0,L6
+        if (isUInt<32>(Imm)) {
+          SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS,
+                                             getI64Imm(Imm >> 16)), 0);
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor,
+                                                getI64Imm(Imm & 0xFFFF)), 0);
+        }
+      }
+      Opc = PPC::CMPLD;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (isInt64Immediate(RHS.getNode(), Imm) && isUInt<16>(Imm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+                                              getI64Imm(Imm & 0xFFFF)), 0);
+      Opc = PPC::CMPLD;
+    } else {
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+                                              getI64Imm(SImm & 0xFFFF)),
+                         0);
+      Opc = PPC::CMPD;
+    }
+  } else if (LHS.getValueType() == MVT::f32) {
+    Opc = PPC::FCMPUS;
+  } else {
+    assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
+    Opc = PPC::FCMPUD;
+  }
+  return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
+}
+
+static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETUEQ:
+  case ISD::SETONE:
+  case ISD::SETOLE:
+  case ISD::SETOGE:
+    llvm_unreachable("Should be lowered by legalize!");
+  default: llvm_unreachable("Unknown condition!");
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  return PPC::PRED_EQ;
+  case ISD::SETUNE:
+  case ISD::SETNE:  return PPC::PRED_NE;
+  case ISD::SETOLT:
+  case ISD::SETLT:  return PPC::PRED_LT;
+  case ISD::SETULE:
+  case ISD::SETLE:  return PPC::PRED_LE;
+  case ISD::SETOGT:
+  case ISD::SETGT:  return PPC::PRED_GT;
+  case ISD::SETUGE:
+  case ISD::SETGE:  return PPC::PRED_GE;
+  case ISD::SETO:   return PPC::PRED_NU;
+  case ISD::SETUO:  return PPC::PRED_UN;
+    // These two are invalid for floating point.  Assume we have int.
+  case ISD::SETULT: return PPC::PRED_LT;
+  case ISD::SETUGT: return PPC::PRED_GT;
+  }
+}
+
+/// getCRIdxForSetCC - Return the index of the condition register field
+/// associated with the SetCC condition, and whether or not the field is
+/// treated as inverted.  That is, lt = 0; ge = 0 inverted.
+///
+/// If this returns with Other != -1, then the returned comparison is an or of
+/// two simpler comparisons.  In this case, Invert is guaranteed to be false.
+static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert, int &Other) {
+  Invert = false;
+  Other = -1;
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition!");
+  case ISD::SETOLT:
+  case ISD::SETLT:  return 0;                  // Bit #0 = SETOLT
+  case ISD::SETOGT:
+  case ISD::SETGT:  return 1;                  // Bit #1 = SETOGT
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  return 2;                  // Bit #2 = SETOEQ
+  case ISD::SETUO:  return 3;                  // Bit #3 = SETUO
+  case ISD::SETUGE:
+  case ISD::SETGE:  Invert = true; return 0;   // !Bit #0 = SETUGE
+  case ISD::SETULE:
+  case ISD::SETLE:  Invert = true; return 1;   // !Bit #1 = SETULE
+  case ISD::SETUNE:
+  case ISD::SETNE:  Invert = true; return 2;   // !Bit #2 = SETUNE
+  case ISD::SETO:   Invert = true; return 3;   // !Bit #3 = SETO
+  case ISD::SETUEQ:
+  case ISD::SETOGE:
+  case ISD::SETOLE:
+  case ISD::SETONE:
+    llvm_unreachable("Invalid branch code: should be expanded by legalize");
+  // These are invalid for floating point.  Assume integer.
+  case ISD::SETULT: return 0;
+  case ISD::SETUGT: return 1;
+  }
+}
+
+// getVCmpInst: return the vector compare instruction for the specified
+// vector type and condition code. Since this is for altivec specific code,
+// only support the altivec types (v16i8, v8i16, v4i32, and v4f32).
+static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC) {
+  switch (CC) {
+    case ISD::SETEQ:
+    case ISD::SETUEQ:
+    case ISD::SETNE:
+    case ISD::SETUNE:
+      if (VecVT == MVT::v16i8)
+        return PPC::VCMPEQUB;
+      else if (VecVT == MVT::v8i16)
+        return PPC::VCMPEQUH;
+      else if (VecVT == MVT::v4i32)
+        return PPC::VCMPEQUW;
+      // v4f32 != v4f32 could be translate to unordered not equal
+      else if (VecVT == MVT::v4f32)
+        return PPC::VCMPEQFP;
+      break;
+    case ISD::SETLT:
+    case ISD::SETGT:
+    case ISD::SETLE:
+    case ISD::SETGE:
+      if (VecVT == MVT::v16i8)
+        return PPC::VCMPGTSB;
+      else if (VecVT == MVT::v8i16)
+        return PPC::VCMPGTSH;
+      else if (VecVT == MVT::v4i32)
+        return PPC::VCMPGTSW;
+      else if (VecVT == MVT::v4f32)
+        return PPC::VCMPGTFP;
+      break;
+    case ISD::SETULT:
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+    case ISD::SETULE:
+      if (VecVT == MVT::v16i8)
+        return PPC::VCMPGTUB;
+      else if (VecVT == MVT::v8i16)
+        return PPC::VCMPGTUH;
+      else if (VecVT == MVT::v4i32)
+        return PPC::VCMPGTUW;
+      break;
+    case ISD::SETOEQ:
+      if (VecVT == MVT::v4f32)
+        return PPC::VCMPEQFP;
+      break;
+    case ISD::SETOLT:
+    case ISD::SETOGT:
+    case ISD::SETOLE:
+      if (VecVT == MVT::v4f32)
+        return PPC::VCMPGTFP;
+      break;
+    case ISD::SETOGE:
+      if (VecVT == MVT::v4f32)
+        return PPC::VCMPGEFP;
+      break;
+    default:
+      break;
+  }
+  llvm_unreachable("Invalid integer vector compare condition");
+}
+
+// getVCmpEQInst: return the equal compare instruction for the specified vector
+// type. Since this is for altivec specific code, only support the altivec
+// types (v16i8, v8i16, v4i32, and v4f32).
+static unsigned int getVCmpEQInst(MVT::SimpleValueType VecVT) {
+  switch (VecVT) {
+    case MVT::v16i8:
+      return PPC::VCMPEQUB;
+    case MVT::v8i16:
+      return PPC::VCMPEQUH;
+    case MVT::v4i32:
+      return PPC::VCMPEQUW;
+    case MVT::v4f32:
+      return PPC::VCMPEQFP;
+    default:
+      llvm_unreachable("Invalid integer vector compare condition");
+  }
+}
+
+
+SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  unsigned Imm;
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = (PtrVT == MVT::i64);
+
+  if (isInt32Immediate(N->getOperand(1), Imm)) {
+    // We can codegen setcc op, imm very efficiently compared to a brcond.
+    // Check for those cases here.
+    // setcc op, 0
+    if (Imm == 0) {
+      SDValue Op = N->getOperand(0);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ: {
+        Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
+        SDValue Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      case ISD::SETNE: {
+        if (isPPC64) break;
+        SDValue AD =
+          SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                         Op, getI32Imm(~0U)), 0);
+        return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op,
+                                    AD.getValue(1));
+      }
+      case ISD::SETLT: {
+        SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      case ISD::SETGT: {
+        SDValue T =
+          SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0);
+        T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
+        SDValue Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      }
+    } else if (Imm == ~0U) {        // setcc op, -1
+      SDValue Op = N->getOperand(0);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ:
+        if (isPPC64) break;
+        Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                            Op, getI32Imm(1)), 0);
+        return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+                              SDValue(CurDAG->getMachineNode(PPC::LI, dl,
+                                                             MVT::i32,
+                                                             getI32Imm(0)), 0),
+                                      Op.getValue(1));
+      case ISD::SETNE: {
+        if (isPPC64) break;
+        Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
+        SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                            Op, getI32Imm(~0U));
+        return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0),
+                                    Op, SDValue(AD, 1));
+      }
+      case ISD::SETLT: {
+        SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op,
+                                                    getI32Imm(1)), 0);
+        SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD,
+                                                    Op), 0);
+        SDValue Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      case ISD::SETGT: {
+        SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops, 4),
+                     0);
+        return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op,
+                                    getI32Imm(1));
+      }
+      }
+    }
+  }
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Altivec Vector compare instructions do not set any CR register by default and
+  // vector compare operations return the same type as the operands.
+  if (LHS.getValueType().isVector()) {
+    EVT VecVT = LHS.getValueType();
+    MVT::SimpleValueType VT = VecVT.getSimpleVT().SimpleTy;
+    unsigned int VCmpInst = getVCmpInst(VT, CC);
+
+    switch (CC) {
+      case ISD::SETEQ:
+      case ISD::SETOEQ:
+      case ISD::SETUEQ:
+        return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
+      case ISD::SETNE:
+      case ISD::SETONE:
+      case ISD::SETUNE: {
+        SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
+        return CurDAG->SelectNodeTo(N, PPC::VNOR, VecVT, VCmp, VCmp);
+      } 
+      case ISD::SETLT:
+      case ISD::SETOLT:
+      case ISD::SETULT:
+        return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, RHS, LHS);
+      case ISD::SETGT:
+      case ISD::SETOGT:
+      case ISD::SETUGT:
+        return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
+      case ISD::SETGE:
+      case ISD::SETOGE:
+      case ISD::SETUGE: {
+        // Small optimization: Altivec provides a 'Vector Compare Greater Than
+        // or Equal To' instruction (vcmpgefp), so in this case there is no
+        // need for extra logic for the equal compare.
+        if (VecVT.getSimpleVT().isFloatingPoint()) {
+          return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
+        } else {
+          SDValue VCmpGT(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
+          unsigned int VCmpEQInst = getVCmpEQInst(VT);
+          SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
+          return CurDAG->SelectNodeTo(N, PPC::VOR, VecVT, VCmpGT, VCmpEQ);
+        }
+      }
+      case ISD::SETLE:
+      case ISD::SETOLE:
+      case ISD::SETULE: {
+        SDValue VCmpLE(CurDAG->getMachineNode(VCmpInst, dl, VecVT, RHS, LHS), 0);
+        unsigned int VCmpEQInst = getVCmpEQInst(VT);
+        SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
+        return CurDAG->SelectNodeTo(N, PPC::VOR, VecVT, VCmpLE, VCmpEQ);
+      }
+      default:
+        llvm_unreachable("Invalid vector compare type: should be expanded by legalize");
+    }
+  }
+
+  bool Inv;
+  int OtherCondIdx;
+  unsigned Idx = getCRIdxForSetCC(CC, Inv, OtherCondIdx);
+  SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
+  SDValue IntCR;
+
+  // Force the ccreg into CR7.
+  SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
+
+  SDValue InFlag(0, 0);  // Null incoming flag value.
+  CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
+                               InFlag).getValue(1);
+
+  if (PPCSubTarget.hasMFOCRF() && OtherCondIdx == -1)
+    IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
+                                           CCReg), 0);
+  else
+    IntCR = SDValue(CurDAG->getMachineNode(PPC::MFCRpseud, dl, MVT::i32,
+                                           CR7Reg, CCReg), 0);
+
+  SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31),
+                      getI32Imm(31), getI32Imm(31) };
+  if (OtherCondIdx == -1 && !Inv)
+    return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+
+  // Get the specified bit.
+  SDValue Tmp =
+    SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
+  if (Inv) {
+    assert(OtherCondIdx == -1 && "Can't have split plus negation");
+    return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
+  }
+
+  // Otherwise, we have to turn an operation like SETONE -> SETOLT | SETOGT.
+  // We already got the bit for the first part of the comparison (e.g. SETULE).
+
+  // Get the other bit of the comparison.
+  Ops[1] = getI32Imm((32-(3-OtherCondIdx)) & 31);
+  SDValue OtherCond =
+    SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
+
+  return CurDAG->SelectNodeTo(N, PPC::OR, MVT::i32, Tmp, OtherCond);
+}
+
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+
+  case ISD::Constant: {
+    if (N->getValueType(0) == MVT::i64) {
+      // Get 64 bit value.
+      int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
+      // Assume no remaining bits.
+      unsigned Remainder = 0;
+      // Assume no shift required.
+      unsigned Shift = 0;
+
+      // If it can't be represented as a 32 bit value.
+      if (!isInt<32>(Imm)) {
+        Shift = CountTrailingZeros_64(Imm);
+        int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+        // If the shifted value fits 32 bits.
+        if (isInt<32>(ImmSh)) {
+          // Go with the shifted value.
+          Imm = ImmSh;
+        } else {
+          // Still stuck with a 64 bit value.
+          Remainder = Imm;
+          Shift = 32;
+          Imm >>= 32;
+        }
+      }
+
+      // Intermediate operand.
+      SDNode *Result;
+
+      // Handle first 32 bits.
+      unsigned Lo = Imm & 0xFFFF;
+      unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+      // Simple value.
+      if (isInt<16>(Imm)) {
+       // Just the Lo bits.
+        Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+      } else if (Lo) {
+        // Handle the Hi bits.
+        unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
+        Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
+        // And Lo bits.
+        Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                        SDValue(Result, 0), getI32Imm(Lo));
+      } else {
+       // Just the Hi bits.
+        Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
+      }
+
+      // If no shift, we're done.
+      if (!Shift) return Result;
+
+      // Shift for next step if the upper 32-bits were not zero.
+      if (Imm) {
+        Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+                                        SDValue(Result, 0),
+                                        getI32Imm(Shift),
+                                        getI32Imm(63 - Shift));
+      }
+
+      // Add in the last bits as required.
+      if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+        Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
+                                        SDValue(Result, 0), getI32Imm(Hi));
+      }
+      if ((Lo = Remainder & 0xFFFF)) {
+        Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                        SDValue(Result, 0), getI32Imm(Lo));
+      }
+
+      return Result;
+    }
+    break;
+  }
+
+  case ISD::SETCC:
+    return SelectSETCC(N);
+  case PPCISD::GlobalBaseReg:
+    return getGlobalBaseReg();
+
+  case ISD::FrameIndex: {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
+    unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
+    if (N->hasOneUse())
+      return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), TFI,
+                                  getSmallIPtrImm(0));
+    return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
+                                  getSmallIPtrImm(0));
+  }
+
+  case PPCISD::MFCR: {
+    SDValue InFlag = N->getOperand(1);
+    // Use MFOCRF if supported.
+    if (PPCSubTarget.hasMFOCRF())
+      return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
+                                    N->getOperand(0), InFlag);
+    else
+      return CurDAG->getMachineNode(PPC::MFCRpseud, dl, MVT::i32,
+                                    N->getOperand(0), InFlag);
+  }
+
+  case ISD::SDIV: {
+    // FIXME: since this depends on the setting of the carry flag from the srawi
+    //        we should really be making notes about that for the scheduler.
+    // FIXME: It sure would be nice if we could cheaply recognize the
+    //        srl/add/sra pattern the dag combiner will generate for this as
+    //        sra/addze rather than having to handle sdiv ourselves.  oh well.
+    unsigned Imm;
+    if (isInt32Immediate(N->getOperand(1), Imm)) {
+      SDValue N0 = N->getOperand(0);
+      if ((signed)Imm > 0 && isPowerOf2_32(Imm)) {
+        SDNode *Op =
+          CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
+                                 N0, getI32Imm(Log2_32(Imm)));
+        return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32,
+                                    SDValue(Op, 0), SDValue(Op, 1));
+      } else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) {
+        SDNode *Op =
+          CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Glue,
+                                 N0, getI32Imm(Log2_32(-Imm)));
+        SDValue PT =
+          SDValue(CurDAG->getMachineNode(PPC::ADDZE, dl, MVT::i32,
+                                         SDValue(Op, 0), SDValue(Op, 1)),
+                    0);
+        return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT);
+      }
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+
+  case ISD::LOAD: {
+    // Handle preincrement loads.
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT LoadedVT = LD->getMemoryVT();
+
+    // Normal loads are handled by code generated from the .td file.
+    if (LD->getAddressingMode() != ISD::PRE_INC)
+      break;
+
+    SDValue Offset = LD->getOffset();
+    if (Offset.getOpcode() == ISD::TargetConstant ||
+        Offset.getOpcode() == ISD::TargetGlobalAddress) {
+
+      unsigned Opcode;
+      bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+      if (LD->getValueType(0) != MVT::i64) {
+        // Handle PPC32 integer and normal FP loads.
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::f64: Opcode = PPC::LFDU; break;
+          case MVT::f32: Opcode = PPC::LFSU; break;
+          case MVT::i32: Opcode = PPC::LWZU; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU; break;
+        }
+      } else {
+        assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::i64: Opcode = PPC::LDU; break;
+          case MVT::i32: Opcode = PPC::LWZU8; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU8; break;
+        }
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[] = { Offset, Base, Chain };
+      return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
+                                    PPCLowering.getPointerTy(),
+                                    MVT::Other, Ops, 3);
+    } else {
+      unsigned Opcode;
+      bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+      if (LD->getValueType(0) != MVT::i64) {
+        // Handle PPC32 integer and normal FP loads.
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::f64: Opcode = PPC::LFDUX; break;
+          case MVT::f32: Opcode = PPC::LFSUX; break;
+          case MVT::i32: Opcode = PPC::LWZUX; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAUX : PPC::LHZUX; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZUX; break;
+        }
+      } else {
+        assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+        assert((!isSExt || LoadedVT == MVT::i16 || LoadedVT == MVT::i32) &&
+               "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::i64: Opcode = PPC::LDUX; break;
+          case MVT::i32: Opcode = isSExt ? PPC::LWAUX  : PPC::LWZUX8; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAUX8 : PPC::LHZUX8; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZUX8; break;
+        }
+      }
+
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[] = { Base, Offset, Chain };
+      return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
+                                    PPCLowering.getPointerTy(),
+                                    MVT::Other, Ops, 3);
+    }
+  }
+
+  case ISD::AND: {
+    unsigned Imm, Imm2, SH, MB, ME;
+    uint64_t Imm64;
+
+    // If this is an and of a value rotated between 0 and 31 bits and then and'd
+    // with a mask, emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
+      SDValue Val = N->getOperand(0).getOperand(0);
+      SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    // If this is just a masked value where the input is not handled above, and
+    // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRunOfOnes(Imm, MB, ME) &&
+        N->getOperand(0).getOpcode() != ISD::ROTL) {
+      SDValue Val = N->getOperand(0);
+      SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    // If this is a 64-bit zero-extension mask, emit rldicl.
+    if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+        isMask_64(Imm64)) {
+      SDValue Val = N->getOperand(0);
+      MB = 64 - CountTrailingOnes_64(Imm64);
+      SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB) };
+      return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops, 3);
+    }
+    // AND X, 0 -> 0, not "rlwinm 32".
+    if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
+      ReplaceUses(SDValue(N, 0), N->getOperand(1));
+      return NULL;
+    }
+    // ISD::OR doesn't get all the bitfield insertion fun.
+    // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        N->getOperand(0).getOpcode() == ISD::OR &&
+        isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
+      unsigned MB, ME;
+      Imm = ~(Imm^Imm2);
+      if (isRunOfOnes(Imm, MB, ME)) {
+        SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                            N->getOperand(0).getOperand(1),
+                            getI32Imm(0), getI32Imm(MB),getI32Imm(ME) };
+        return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
+      }
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::OR:
+    if (N->getValueType(0) == MVT::i32)
+      if (SDNode *I = SelectBitfieldInsert(N))
+        return I;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::SHL: {
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::SRL: {
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::SELECT_CC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+    EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
+    bool isPPC64 = (PtrVT == MVT::i64);
+
+    // Handle the setcc cases here.  select_cc lhs, 0, 1, 0, cc
+    if (!isPPC64)
+      if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+        if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+          if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+            if (N1C->isNullValue() && N3C->isNullValue() &&
+                N2C->getZExtValue() == 1ULL && CC == ISD::SETNE &&
+                // FIXME: Implement this optzn for PPC64.
+                N->getValueType(0) == MVT::i32) {
+              SDNode *Tmp =
+                CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Glue,
+                                       N->getOperand(0), getI32Imm(~0U));
+              return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32,
+                                          SDValue(Tmp, 0), N->getOperand(0),
+                                          SDValue(Tmp, 1));
+            }
+
+    SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+    unsigned BROpc = getPredicateForSetCC(CC);
+
+    unsigned SelectCCOp;
+    if (N->getValueType(0) == MVT::i32)
+      SelectCCOp = PPC::SELECT_CC_I4;
+    else if (N->getValueType(0) == MVT::i64)
+      SelectCCOp = PPC::SELECT_CC_I8;
+    else if (N->getValueType(0) == MVT::f32)
+      SelectCCOp = PPC::SELECT_CC_F4;
+    else if (N->getValueType(0) == MVT::f64)
+      SelectCCOp = PPC::SELECT_CC_F8;
+    else
+      SelectCCOp = PPC::SELECT_CC_VRRC;
+
+    SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
+                        getI32Imm(BROpc) };
+    return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4);
+  }
+  case PPCISD::COND_BRANCH: {
+    // Op #0 is the Chain.
+    // Op #1 is the PPC::PRED_* number.
+    // Op #2 is the CR#
+    // Op #3 is the Dest MBB
+    // Op #4 is the Flag.
+    // Prevent PPC::PRED_* from being selected into LI.
+    SDValue Pred =
+      getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+    SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
+      N->getOperand(0), N->getOperand(4) };
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 5);
+  }
+  case ISD::BR_CC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+    SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
+    SDValue Ops[] = { getI32Imm(getPredicateForSetCC(CC)), CondCode,
+                        N->getOperand(4), N->getOperand(0) };
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 4);
+  }
+  case ISD::BRIND: {
+    // FIXME: Should custom lower this.
+    SDValue Chain = N->getOperand(0);
+    SDValue Target = N->getOperand(1);
+    unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
+    unsigned Reg = Target.getValueType() == MVT::i32 ? PPC::BCTR : PPC::BCTR8;
+    Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Target,
+                                           Chain), 0);
+    return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
+  }
+  case PPCISD::TOC_ENTRY: {
+    assert (PPCSubTarget.isPPC64() && "Only supported for 64-bit ABI");
+
+    // For medium and large code model, we generate two instructions as
+    // described below.  Otherwise we allow SelectCodeCommon to handle this,
+    // selecting one of LDtoc, LDtocJTI, and LDtocCPT.
+    CodeModel::Model CModel = TM.getCodeModel();
+    if (CModel != CodeModel::Medium && CModel != CodeModel::Large)
+      break;
+
+    // The first source operand is a TargetGlobalAddress or a
+    // TargetJumpTable.  If it is an externally defined symbol, a symbol
+    // with common linkage, a function address, or a jump table address,
+    // or if we are generating code for large code model, we generate:
+    //   LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>))
+    // Otherwise we generate:
+    //   ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>)
+    SDValue GA = N->getOperand(0);
+    SDValue TOCbase = N->getOperand(1);
+    SDNode *Tmp = CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64,
+                                        TOCbase, GA);
+
+    if (isa<JumpTableSDNode>(GA) || CModel == CodeModel::Large)
+      return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+                                    SDValue(Tmp, 0));
+
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
+      const GlobalValue *GValue = G->getGlobal();
+      const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
+      const GlobalValue *RealGValue = GAlias ?
+        GAlias->resolveAliasedGlobal(false) : GValue;
+      const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
+      assert((GVar || isa<Function>(RealGValue)) &&
+             "Unexpected global value subclass!");
+
+      // An external variable is one without an initializer.  For these,
+      // for variables with common linkage, and for Functions, generate
+      // the LDtocL form.
+      if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() ||
+          RealGValue->hasAvailableExternallyLinkage())
+        return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
+                                      SDValue(Tmp, 0));
+    }
+
+    return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
+                                  SDValue(Tmp, 0), GA);
+  }
+  case PPCISD::VADD_SPLAT: {
+    // This expands into one of three sequences, depending on whether
+    // the first operand is odd or even, positive or negative.
+    assert(isa<ConstantSDNode>(N->getOperand(0)) &&
+           isa<ConstantSDNode>(N->getOperand(1)) &&
+           "Invalid operand on VADD_SPLAT!");
+
+    int Elt     = N->getConstantOperandVal(0);
+    int EltSize = N->getConstantOperandVal(1);
+    unsigned Opc1, Opc2, Opc3;
+    EVT VT;
+
+    if (EltSize == 1) {
+      Opc1 = PPC::VSPLTISB;
+      Opc2 = PPC::VADDUBM;
+      Opc3 = PPC::VSUBUBM;
+      VT = MVT::v16i8;
+    } else if (EltSize == 2) {
+      Opc1 = PPC::VSPLTISH;
+      Opc2 = PPC::VADDUHM;
+      Opc3 = PPC::VSUBUHM;
+      VT = MVT::v8i16;
+    } else {
+      assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!");
+      Opc1 = PPC::VSPLTISW;
+      Opc2 = PPC::VADDUWM;
+      Opc3 = PPC::VSUBUWM;
+      VT = MVT::v4i32;
+    }
+
+    if ((Elt & 1) == 0) {
+      // Elt is even, in the range [-32,-18] + [16,30].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp = VSPLTIS[BHW] elt
+      //          VADDU[BHW]M tmp, tmp
+      // Where:   [BHW] = B for size = 1, H for size = 2, W for size = 4
+      SDValue EltVal = getI32Imm(Elt >> 1);
+      SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      SDValue TmpVal = SDValue(Tmp, 0);
+      return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal);
+
+    } else if (Elt > 0) {
+      // Elt is odd and positive, in the range [17,31].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt-16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VSUBU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt - 16);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      return CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
+                                    SDValue(Tmp2, 0));
+
+    } else {
+      // Elt is odd and negative, in the range [-31,-17].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt+16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VADDU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt + 16);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      return CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
+                                    SDValue(Tmp2, 0));
+    }
+  }
+  }
+
+  return SelectCode(N);
+}
+
+/// PostProcessISelDAG - Perform some late peephole optimizations
+/// on the DAG representation.
+void PPCDAGToDAGISel::PostprocessISelDAG() {
+
+  // Skip peepholes at -O0.
+  if (TM.getOptLevel() == CodeGenOpt::None)
+    return;
+
+  // These optimizations are currently supported only for 64-bit SVR4.
+  if (PPCSubTarget.isDarwin() || !PPCSubTarget.isPPC64())
+    return;
+
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position;
+
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = --Position;
+    // Skip dead nodes and any non-machine opcodes.
+    if (N->use_empty() || !N->isMachineOpcode())
+      continue;
+
+    unsigned FirstOp;
+    unsigned StorageOpcode = N->getMachineOpcode();
+
+    switch (StorageOpcode) {
+    default: continue;
+
+    case PPC::LBZ:
+    case PPC::LBZ8:
+    case PPC::LD:
+    case PPC::LFD:
+    case PPC::LFS:
+    case PPC::LHA:
+    case PPC::LHA8:
+    case PPC::LHZ:
+    case PPC::LHZ8:
+    case PPC::LWA:
+    case PPC::LWZ:
+    case PPC::LWZ8:
+      FirstOp = 0;
+      break;
+
+    case PPC::STB:
+    case PPC::STB8:
+    case PPC::STD:
+    case PPC::STFD:
+    case PPC::STFS:
+    case PPC::STH:
+    case PPC::STH8:
+    case PPC::STW:
+    case PPC::STW8:
+      FirstOp = 1;
+      break;
+    }
+
+    // If this is a load or store with a zero offset, we may be able to
+    // fold an add-immediate into the memory operation.
+    if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) ||
+        N->getConstantOperandVal(FirstOp) != 0)
+      continue;
+
+    SDValue Base = N->getOperand(FirstOp + 1);
+    if (!Base.isMachineOpcode())
+      continue;
+
+    unsigned Flags = 0;
+    bool ReplaceFlags = true;
+
+    // When the feeding operation is an add-immediate of some sort,
+    // determine whether we need to add relocation information to the
+    // target flags on the immediate operand when we fold it into the
+    // load instruction.
+    //
+    // For something like ADDItocL, the relocation information is
+    // inferred from the opcode; when we process it in the AsmPrinter,
+    // we add the necessary relocation there.  A load, though, can receive
+    // relocation from various flavors of ADDIxxx, so we need to carry
+    // the relocation information in the target flags.
+    switch (Base.getMachineOpcode()) {
+    default: continue;
+
+    case PPC::ADDI8:
+    case PPC::ADDI:
+      // In some cases (such as TLS) the relocation information
+      // is already in place on the operand, so copying the operand
+      // is sufficient.
+      ReplaceFlags = false;
+      // For these cases, the immediate may not be divisible by 4, in
+      // which case the fold is illegal for DS-form instructions.  (The
+      // other cases provide aligned addresses and are always safe.)
+      if ((StorageOpcode == PPC::LWA ||
+           StorageOpcode == PPC::LD  ||
+           StorageOpcode == PPC::STD) &&
+          (!isa<ConstantSDNode>(Base.getOperand(1)) ||
+           Base.getConstantOperandVal(1) % 4 != 0))
+        continue;
+      break;
+    case PPC::ADDIdtprelL:
+      Flags = PPCII::MO_DTPREL16_LO;
+      break;
+    case PPC::ADDItlsldL:
+      Flags = PPCII::MO_TLSLD16_LO;
+      break;
+    case PPC::ADDItocL:
+      Flags = PPCII::MO_TOC16_LO;
+      break;
+    }
+
+    // We found an opportunity.  Reverse the operands from the add
+    // immediate and substitute them into the load or store.  If
+    // needed, update the target flags for the immediate operand to
+    // reflect the necessary relocation information.
+    DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase:    ");
+    DEBUG(Base->dump(CurDAG));
+    DEBUG(dbgs() << "\nN: ");
+    DEBUG(N->dump(CurDAG));
+    DEBUG(dbgs() << "\n");
+
+    SDValue ImmOpnd = Base.getOperand(1);
+
+    // If the relocation information isn't already present on the
+    // immediate operand, add it now.
+    if (ReplaceFlags) {
+      if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+        DebugLoc dl = GA->getDebugLoc();
+        const GlobalValue *GV = GA->getGlobal();
+        ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
+      } else if (ConstantPoolSDNode *CP =
+                 dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
+        const Constant *C = CP->getConstVal();
+        ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
+                                                CP->getAlignment(),
+                                                0, Flags);
+      }
+    }
+
+    if (FirstOp == 1) // Store
+      (void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd,
+                                       Base.getOperand(0), N->getOperand(3));
+    else // Load
+      (void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0),
+                                       N->getOperand(2));
+
+    // The add-immediate may now be dead, in which case remove it.
+    if (Base.getNode()->use_empty())
+      CurDAG->RemoveDeadNode(Base.getNode());
+  }
+}
+
+
+/// createPPCISelDag - This pass converts a legalized DAG into a
+/// PowerPC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
+  return new PPCDAGToDAGISel(TM);
+}
+
+static void initializePassOnce(PassRegistry &Registry) {
+  const char *Name = "PowerPC DAG->DAG Pattern Instruction Selection";
+  PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID, 0,
+                              false, false);
+  Registry.registerPass(*PI, true);
+}
+
+void llvm::initializePPCDAGToDAGISelPass(PassRegistry &Registry) {
+  CALL_ONCE_INITIALIZATION(initializePassOnce);
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
new file mode 100644
index 000000000000..16fc8a0e3726
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -0,0 +1,7554 @@
+//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCISelLowering.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPerfectShuffle.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                       CCValAssign::LocInfo &LocInfo,
+                                       ISD::ArgFlagsTy &ArgFlags,
+                                       CCState &State);
+static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                              MVT &LocVT,
+                                              CCValAssign::LocInfo &LocInfo,
+                                              ISD::ArgFlagsTy &ArgFlags,
+                                              CCState &State);
+static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                                MVT &LocVT,
+                                                CCValAssign::LocInfo &LocInfo,
+                                                ISD::ArgFlagsTy &ArgFlags,
+                                                CCState &State);
+
+static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
+cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
+cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);
+
+static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
+cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
+
+static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
+  if (TM.getSubtargetImpl()->isDarwin())
+    return new TargetLoweringObjectFileMachO();
+
+  return new TargetLoweringObjectFileELF();
+}
+
+PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
+  : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) {
+  const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
+  PPCRegInfo = TM.getRegisterInfo();
+
+  setPow2DivIsCheap();
+
+  // Use _setjmp/_longjmp instead of setjmp/longjmp.
+  setUseUnderscoreSetJmp(true);
+  setUseUnderscoreLongJmp(true);
+
+  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
+  // arguments are at least 4/8 bytes aligned.
+  bool isPPC64 = Subtarget->isPPC64();
+  setMinStackArgumentAlignment(isPPC64 ? 8:4);
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
+  addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
+  addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
+
+  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PowerPC has pre-inc load and store's.
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
+
+  // This is used in the ppcf128->int sequence.  Note it has different semantics
+  // from FP_ROUND:  that rounds to nearest, this rounds to zero.
+  setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
+
+  // We do not currently implement these libm ops for PowerPC.
+  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FCEIL,  MVT::ppcf128, Expand);
+  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FRINT,  MVT::ppcf128, Expand);
+  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
+  setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
+
+  // PowerPC has no SREM/UREM instructions
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+
+  // We don't support sin/cos/sqrt/fmod/pow
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+  setOperationAction(ISD::FMA  , MVT::f64, Legal);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+  setOperationAction(ISD::FMA  , MVT::f32, Legal);
+
+  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+
+  // If we're enabling GP optimizations, use hardware square root
+  if (!Subtarget->hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath &&
+        Subtarget->hasFRSQRTE() && Subtarget->hasFRE()))
+    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+
+  if (!Subtarget->hasFSQRT() &&
+      !(TM.Options.UnsafeFPMath &&
+        Subtarget->hasFRSQRTES() && Subtarget->hasFRES()))
+    setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+  if (Subtarget->hasFPRND()) {
+    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::f64, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+
+    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+    setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+
+    // frin does not implement "ties to even." Thus, this is safe only in
+    // fast-math mode.
+    if (TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+      setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
+
+      // These need to set FE_INEXACT, and use a custom inserter.
+      setOperationAction(ISD::FRINT, MVT::f64, Legal);
+      setOperationAction(ISD::FRINT, MVT::f32, Legal);
+    }
+  }
+
+  // PowerPC does not have BSWAP, CTPOP or CTTZ
+  setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
+  setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+
+  if (Subtarget->hasPOPCNTD()) {
+    setOperationAction(ISD::CTPOP, MVT::i32  , Legal);
+    setOperationAction(ISD::CTPOP, MVT::i64  , Legal);
+  } else {
+    setOperationAction(ISD::CTPOP, MVT::i32  , Expand);
+    setOperationAction(ISD::CTPOP, MVT::i64  , Expand);
+  }
+
+  // PowerPC does not have ROTR
+  setOperationAction(ISD::ROTR, MVT::i32   , Expand);
+  setOperationAction(ISD::ROTR, MVT::i64   , Expand);
+
+  // PowerPC does not have Select
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f64, Expand);
+
+  // PowerPC wants to turn select_cc of FP into fsel when possible.
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  // PowerPC wants to optimize integer setcc a bit
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+
+  // PowerPC does not have BRCOND which requires SetCC
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  setOperationAction(ISD::BR_JT,  MVT::Other, Expand);
+
+  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+  // PowerPC does not have [U|S]INT_TO_FP
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+  setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+  setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+  setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+  setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+
+  // We cannot sextinreg(i1).  Expand to shifts.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+  // support continuation, user-level threading, and etc.. As a result, no
+  // other SjLj exception interfaces are implemented and please don't build
+  // your own exception handling based on them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+  // We want to legalize GlobalAddress and ConstantPool nodes into the
+  // appropriate instructions to materialize the address.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress,  MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::BlockAddress,  MVT::i64, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i64, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i64, Custom);
+
+  // TRAP is legal.
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // TRAMPOLINE is custom lowered.
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+
+  if (Subtarget->isSVR4ABI()) {
+    if (isPPC64) {
+      // VAARG always uses double-word chunks, so promote anything smaller.
+      setOperationAction(ISD::VAARG, MVT::i1, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i8, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i16, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::i32, Promote);
+      AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64);
+      setOperationAction(ISD::VAARG, MVT::Other, Expand);
+    } else {
+      // VAARG is custom lowered with the 32-bit SVR4 ABI.
+      setOperationAction(ISD::VAARG, MVT::Other, Custom);
+      setOperationAction(ISD::VAARG, MVT::i64, Custom);
+    }
+  } else
+    setOperationAction(ISD::VAARG, MVT::Other, Expand);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Custom);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // Comparisons that require checking two conditions.
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+
+  if (Subtarget->has64BitSupport()) {
+    // They also have instructions for converting between i64 and fp.
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+    // This is just the low 32 bits of a (signed) fp->i64 conversion.
+    // We cannot do this with Promote because i64 is not a legal type.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+
+    if (PPCSubTarget.hasLFIWAX() || Subtarget->isPPC64())
+      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  } else {
+    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+  }
+
+  // With the instructions enabled under FPCVT, we can do everything.
+  if (PPCSubTarget.hasFPCVT()) {
+    if (Subtarget->has64BitSupport()) {
+      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+    }
+
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  }
+
+  if (Subtarget->use64BitRegs()) {
+    // 64-bit PowerPC implementations can support i64 types directly
+    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
+    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
+    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+    // 64-bit PowerPC wants to expand i128 shifts itself.
+    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+  } else {
+    // 32-bit PowerPC wants to expand i64 shifts itself.
+    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  }
+
+  if (Subtarget->hasAltivec()) {
+    // First set operation action for all vector types to expand. Then we
+    // will selectively turn on ones that can be effectively codegen'd.
+    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
+
+      // add/sub are legal for all supported vector VT's.
+      setOperationAction(ISD::ADD , VT, Legal);
+      setOperationAction(ISD::SUB , VT, Legal);
+
+      // We promote all shuffles to v16i8.
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
+      AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
+
+      // We promote all non-typed operations to v4i32.
+      setOperationAction(ISD::AND   , VT, Promote);
+      AddPromotedToType (ISD::AND   , VT, MVT::v4i32);
+      setOperationAction(ISD::OR    , VT, Promote);
+      AddPromotedToType (ISD::OR    , VT, MVT::v4i32);
+      setOperationAction(ISD::XOR   , VT, Promote);
+      AddPromotedToType (ISD::XOR   , VT, MVT::v4i32);
+      setOperationAction(ISD::LOAD  , VT, Promote);
+      AddPromotedToType (ISD::LOAD  , VT, MVT::v4i32);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
+      setOperationAction(ISD::STORE, VT, Promote);
+      AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
+
+      // No other operations are legal.
+      setOperationAction(ISD::MUL , VT, Expand);
+      setOperationAction(ISD::SDIV, VT, Expand);
+      setOperationAction(ISD::SREM, VT, Expand);
+      setOperationAction(ISD::UDIV, VT, Expand);
+      setOperationAction(ISD::UREM, VT, Expand);
+      setOperationAction(ISD::FDIV, VT, Expand);
+      setOperationAction(ISD::FNEG, VT, Expand);
+      setOperationAction(ISD::FSQRT, VT, Expand);
+      setOperationAction(ISD::FLOG, VT, Expand);
+      setOperationAction(ISD::FLOG10, VT, Expand);
+      setOperationAction(ISD::FLOG2, VT, Expand);
+      setOperationAction(ISD::FEXP, VT, Expand);
+      setOperationAction(ISD::FEXP2, VT, Expand);
+      setOperationAction(ISD::FSIN, VT, Expand);
+      setOperationAction(ISD::FCOS, VT, Expand);
+      setOperationAction(ISD::FABS, VT, Expand);
+      setOperationAction(ISD::FPOWI, VT, Expand);
+      setOperationAction(ISD::FFLOOR, VT, Expand);
+      setOperationAction(ISD::FCEIL,  VT, Expand);
+      setOperationAction(ISD::FTRUNC, VT, Expand);
+      setOperationAction(ISD::FRINT,  VT, Expand);
+      setOperationAction(ISD::FNEARBYINT, VT, Expand);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::UDIVREM, VT, Expand);
+      setOperationAction(ISD::SDIVREM, VT, Expand);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+      setOperationAction(ISD::FPOW, VT, Expand);
+      setOperationAction(ISD::CTPOP, VT, Expand);
+      setOperationAction(ISD::CTLZ, VT, Expand);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+      setOperationAction(ISD::CTTZ, VT, Expand);
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+      setOperationAction(ISD::VSELECT, VT, Expand);
+      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+      for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+           j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
+        MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j;
+        setTruncStoreAction(VT, InnerVT, Expand);
+      }
+      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+    }
+
+    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
+    // with merges, splats, etc.
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::AND   , MVT::v4i32, Legal);
+    setOperationAction(ISD::OR    , MVT::v4i32, Legal);
+    setOperationAction(ISD::XOR   , MVT::v4i32, Legal);
+    setOperationAction(ISD::LOAD  , MVT::v4i32, Legal);
+    setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+    setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+
+    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
+    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);
+
+    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+
+    if (TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+    }
+
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+    // Altivec does not contain unordered floating-point compare instructions
+    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand);
+  }
+
+  if (Subtarget->has64BitSupport()) {
+    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
+    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+  }
+
+  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_LOAD,  MVT::i64, Expand);
+  setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  if (isPPC64) {
+    setStackPointerRegisterToSaveRestore(PPC::X1);
+    setExceptionPointerRegister(PPC::X3);
+    setExceptionSelectorRegister(PPC::X4);
+  } else {
+    setStackPointerRegisterToSaveRestore(PPC::R1);
+    setExceptionPointerRegister(PPC::R3);
+    setExceptionSelectorRegister(PPC::R4);
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::BR_CC);
+  setTargetDAGCombine(ISD::BSWAP);
+
+  // Use reciprocal estimates.
+  if (TM.Options.UnsafeFPMath) {
+    setTargetDAGCombine(ISD::FDIV);
+    setTargetDAGCombine(ISD::FSQRT);
+  }
+
+  // Darwin long double math library functions have $LDBL128 appended.
+  if (Subtarget->isDarwin()) {
+    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
+    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
+    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
+    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
+    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
+    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
+    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
+    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
+    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
+    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
+  }
+
+  setMinFunctionAlignment(2);
+  if (PPCSubTarget.isDarwin())
+    setPrefFunctionAlignment(4);
+
+  if (isPPC64 && Subtarget->isJITCodeModel())
+    // Temporary workaround for the inability of PPC64 JIT to handle jump
+    // tables.
+    setSupportJumpTables(false);
+
+  setInsertFencesForAtomic(true);
+
+  setSchedulingPreference(Sched::Hybrid);
+
+  computeRegisterProperties();
+
+  // The Freescale cores does better with aggressive inlining of memcpy and
+  // friends. Gcc uses same threshold of 128 bytes (= 32 word stores).
+  if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc ||
+      Subtarget->getDarwinDirective() == PPC::DIR_E5500) {
+    MaxStoresPerMemset = 32;
+    MaxStoresPerMemsetOptSize = 16;
+    MaxStoresPerMemcpy = 32;
+    MaxStoresPerMemcpyOptSize = 8;
+    MaxStoresPerMemmove = 32;
+    MaxStoresPerMemmoveOptSize = 8;
+
+    setPrefFunctionAlignment(4);
+  }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area.
+unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
+  const TargetMachine &TM = getTargetMachine();
+  // Darwin passes everything on 4 byte boundary.
+  if (TM.getSubtarget<PPCSubtarget>().isDarwin())
+    return 4;
+
+  // 16byte and wider vectors are passed on 16byte boundary.
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+    if (VTy->getBitWidth() >= 128)
+      return 16;
+
+  // The rest is 8 on PPC64 and 4 on PPC32 boundary.
+   if (PPCSubTarget.isPPC64())
+     return 8;
+
+  return 4;
+}
+
+const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case PPCISD::FSEL:            return "PPCISD::FSEL";
+  case PPCISD::FCFID:           return "PPCISD::FCFID";
+  case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
+  case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
+  case PPCISD::FRE:             return "PPCISD::FRE";
+  case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
+  case PPCISD::STFIWX:          return "PPCISD::STFIWX";
+  case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
+  case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
+  case PPCISD::VPERM:           return "PPCISD::VPERM";
+  case PPCISD::Hi:              return "PPCISD::Hi";
+  case PPCISD::Lo:              return "PPCISD::Lo";
+  case PPCISD::TOC_ENTRY:       return "PPCISD::TOC_ENTRY";
+  case PPCISD::TOC_RESTORE:     return "PPCISD::TOC_RESTORE";
+  case PPCISD::LOAD:            return "PPCISD::LOAD";
+  case PPCISD::LOAD_TOC:        return "PPCISD::LOAD_TOC";
+  case PPCISD::DYNALLOC:        return "PPCISD::DYNALLOC";
+  case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
+  case PPCISD::SRL:             return "PPCISD::SRL";
+  case PPCISD::SRA:             return "PPCISD::SRA";
+  case PPCISD::SHL:             return "PPCISD::SHL";
+  case PPCISD::CALL:            return "PPCISD::CALL";
+  case PPCISD::CALL_NOP:        return "PPCISD::CALL_NOP";
+  case PPCISD::MTCTR:           return "PPCISD::MTCTR";
+  case PPCISD::BCTRL:           return "PPCISD::BCTRL";
+  case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
+  case PPCISD::EH_SJLJ_SETJMP:  return "PPCISD::EH_SJLJ_SETJMP";
+  case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
+  case PPCISD::MFCR:            return "PPCISD::MFCR";
+  case PPCISD::VCMP:            return "PPCISD::VCMP";
+  case PPCISD::VCMPo:           return "PPCISD::VCMPo";
+  case PPCISD::LBRX:            return "PPCISD::LBRX";
+  case PPCISD::STBRX:           return "PPCISD::STBRX";
+  case PPCISD::LARX:            return "PPCISD::LARX";
+  case PPCISD::STCX:            return "PPCISD::STCX";
+  case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
+  case PPCISD::MFFS:            return "PPCISD::MFFS";
+  case PPCISD::FADDRTZ:         return "PPCISD::FADDRTZ";
+  case PPCISD::TC_RETURN:       return "PPCISD::TC_RETURN";
+  case PPCISD::CR6SET:          return "PPCISD::CR6SET";
+  case PPCISD::CR6UNSET:        return "PPCISD::CR6UNSET";
+  case PPCISD::ADDIS_TOC_HA:    return "PPCISD::ADDIS_TOC_HA";
+  case PPCISD::LD_TOC_L:        return "PPCISD::LD_TOC_L";
+  case PPCISD::ADDI_TOC_L:      return "PPCISD::ADDI_TOC_L";
+  case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
+  case PPCISD::LD_GOT_TPREL_L:  return "PPCISD::LD_GOT_TPREL_L";
+  case PPCISD::ADD_TLS:         return "PPCISD::ADD_TLS";
+  case PPCISD::ADDIS_TLSGD_HA:  return "PPCISD::ADDIS_TLSGD_HA";
+  case PPCISD::ADDI_TLSGD_L:    return "PPCISD::ADDI_TLSGD_L";
+  case PPCISD::GET_TLS_ADDR:    return "PPCISD::GET_TLS_ADDR";
+  case PPCISD::ADDIS_TLSLD_HA:  return "PPCISD::ADDIS_TLSLD_HA";
+  case PPCISD::ADDI_TLSLD_L:    return "PPCISD::ADDI_TLSLD_L";
+  case PPCISD::GET_TLSLD_ADDR:  return "PPCISD::GET_TLSLD_ADDR";
+  case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
+  case PPCISD::ADDI_DTPREL_L:   return "PPCISD::ADDI_DTPREL_L";
+  case PPCISD::VADD_SPLAT:      return "PPCISD::VADD_SPLAT";
+  }
+}
+
+EVT PPCTargetLowering::getSetCCResultType(EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+//===----------------------------------------------------------------------===//
+// Node matching predicates, for use by the tblgen matching code.
+//===----------------------------------------------------------------------===//
+
+/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isZero();
+  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
+      if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+        return CFP->getValueAPF().isZero();
+  }
+  return false;
+}
+
+/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if it matches the specified value.
+static bool isConstantOrUndef(int Op, int Val) {
+  return Op < 0 || Op == Val;
+}
+
+/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUHUM instruction.
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
+  if (!isUnary) {
+    for (unsigned i = 0; i != 16; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i),  i*2+1))
+        return false;
+  } else {
+    for (unsigned i = 0; i != 8; ++i)
+      if (!isConstantOrUndef(N->getMaskElt(i),    i*2+1) ||
+          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+1))
+        return false;
+  }
+  return true;
+}
+
+/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUWUM instruction.
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
+  if (!isUnary) {
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+3))
+        return false;
+  } else {
+    for (unsigned i = 0; i != 8; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+3) ||
+          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+9),  i*2+3))
+        return false;
+  }
+  return true;
+}
+
+/// isVMerge - Common function, used to match vmrg* shuffles.
+///
+static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
+                     unsigned LHSStart, unsigned RHSStart) {
+  assert(N->getValueType(0) == MVT::v16i8 &&
+         "PPC only supports shuffles by bytes!");
+  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
+         "Unsupported merge size!");
+
+  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
+    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
+      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
+                             LHSStart+j+i*UnitSize) ||
+          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
+                             RHSStart+j+i*UnitSize))
+        return false;
+    }
+  return true;
+}
+
+/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
+bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             bool isUnary) {
+  if (!isUnary)
+    return isVMerge(N, UnitSize, 8, 24);
+  return isVMerge(N, UnitSize, 8, 8);
+}
+
+/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
+bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             bool isUnary) {
+  if (!isUnary)
+    return isVMerge(N, UnitSize, 0, 16);
+  return isVMerge(N, UnitSize, 0, 0);
+}
+
+
+/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+/// amount, otherwise return -1.
+int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) {
+  assert(N->getValueType(0) == MVT::v16i8 &&
+         "PPC only supports shuffles by bytes!");
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+
+  // Find the first non-undef value in the shuffle mask.
+  unsigned i;
+  for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
+    /*search*/;
+
+  if (i == 16) return -1;  // all undef.
+
+  // Otherwise, check to see if the rest of the elements are consecutively
+  // numbered from this value.
+  unsigned ShiftAmt = SVOp->getMaskElt(i);
+  if (ShiftAmt < i) return -1;
+  ShiftAmt -= i;
+
+  if (!isUnary) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+        return -1;
+  } else {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
+        return -1;
+  }
+  return ShiftAmt;
+}
+
+/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a splat of a single element that is suitable for input to
+/// VSPLTB/VSPLTH/VSPLTW.
+bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
+  assert(N->getValueType(0) == MVT::v16i8 &&
+         (EltSize == 1 || EltSize == 2 || EltSize == 4));
+
+  // This is a splat operation if each element of the permute is the same, and
+  // if the value doesn't reference the second vector.
+  unsigned ElementBase = N->getMaskElt(0);
+
+  // FIXME: Handle UNDEF elements too!
+  if (ElementBase >= 16)
+    return false;
+
+  // Check that the indices are consecutive, in the case of a multi-byte element
+  // splatted with a v16i8 mask.
+  for (unsigned i = 1; i != EltSize; ++i)
+    if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
+      return false;
+
+  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
+    if (N->getMaskElt(i) < 0) continue;
+    for (unsigned j = 0; j != EltSize; ++j)
+      if (N->getMaskElt(i+j) != N->getMaskElt(j))
+        return false;
+  }
+  return true;
+}
+
+/// isAllNegativeZeroVector - Returns true if all elements of build_vector
+/// are -0.0.
+bool PPC::isAllNegativeZeroVector(SDNode *N) {
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
+
+  APInt APVal, APUndef;
+  unsigned BitSize;
+  bool HasAnyUndefs;
+
+  if (BV->isConstantSplat(APVal, APUndef, BitSize, HasAnyUndefs, 32, true))
+    if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+      return CFP->getValueAPF().isNegZero();
+
+  return false;
+}
+
+/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  assert(isSplatShuffleMask(SVOp, EltSize));
+  return SVOp->getMaskElt(0) / EltSize;
+}
+
+/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
+/// by using a vspltis[bhw] instruction of the specified element size, return
+/// the constant being splatted.  The ByteSize field indicates the number of
+/// bytes of each element [124] -> [bhw].
+SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+  SDValue OpVal(0, 0);
+
+  // If ByteSize of the splat is bigger than the element size of the
+  // build_vector, then we have a case where we are checking for a splat where
+  // multiple elements of the buildvector are folded together into a single
+  // logical element of the splat (e.g. "vsplish 1" to splat {0,1}*8).
+  unsigned EltSize = 16/N->getNumOperands();
+  if (EltSize < ByteSize) {
+    unsigned Multiple = ByteSize/EltSize;   // Number of BV entries per spltval.
+    SDValue UniquedVals[4];
+    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
+
+    // See if all of the elements in the buildvector agree across.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+      if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+      // If the element isn't a constant, bail fully out.
+      if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
+
+
+      if (UniquedVals[i&(Multiple-1)].getNode() == 0)
+        UniquedVals[i&(Multiple-1)] = N->getOperand(i);
+      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
+        return SDValue();  // no match.
+    }
+
+    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
+    // either constant or undef values that are identical for each chunk.  See
+    // if these chunks can form into a larger vspltis*.
+
+    // Check to see if all of the leading entries are either 0 or -1.  If
+    // neither, then this won't fit into the immediate field.
+    bool LeadingZero = true;
+    bool LeadingOnes = true;
+    for (unsigned i = 0; i != Multiple-1; ++i) {
+      if (UniquedVals[i].getNode() == 0) continue;  // Must have been undefs.
+
+      LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue();
+      LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue();
+    }
+    // Finally, check the least significant entry.
+    if (LeadingZero) {
+      if (UniquedVals[Multiple-1].getNode() == 0)
+        return DAG.getTargetConstant(0, MVT::i32);  // 0,0,0,undef
+      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
+      if (Val < 16)
+        return DAG.getTargetConstant(Val, MVT::i32);  // 0,0,0,4 -> vspltisw(4)
+    }
+    if (LeadingOnes) {
+      if (UniquedVals[Multiple-1].getNode() == 0)
+        return DAG.getTargetConstant(~0U, MVT::i32);  // -1,-1,-1,undef
+      int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
+      if (Val >= -16)                            // -1,-1,-1,-2 -> vspltisw(-2)
+        return DAG.getTargetConstant(Val, MVT::i32);
+    }
+
+    return SDValue();
+  }
+
+  // Check to see if this buildvec has a single non-undef value in its elements.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    if (OpVal.getNode() == 0)
+      OpVal = N->getOperand(i);
+    else if (OpVal != N->getOperand(i))
+      return SDValue();
+  }
+
+  if (OpVal.getNode() == 0) return SDValue();  // All UNDEF: use implicit def.
+
+  unsigned ValSizeInBytes = EltSize;
+  uint64_t Value = 0;
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+    Value = CN->getZExtValue();
+  } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
+    assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
+    Value = FloatToBits(CN->getValueAPF().convertToFloat());
+  }
+
+  // If the splat value is larger than the element value, then we can never do
+  // this splat.  The only case that we could fit the replicated bits into our
+  // immediate field for would be zero, and we prefer to use vxor for it.
+  if (ValSizeInBytes < ByteSize) return SDValue();
+
+  // If the element value is larger than the splat value, cut it in half and
+  // check to see if the two halves are equal.  Continue doing this until we
+  // get to ByteSize.  This allows us to handle 0x01010101 as 0x01.
+  while (ValSizeInBytes > ByteSize) {
+    ValSizeInBytes >>= 1;
+
+    // If the top half equals the bottom half, we're still ok.
+    if (((Value >> (ValSizeInBytes*8)) & ((1 << (8*ValSizeInBytes))-1)) !=
+         (Value                        & ((1 << (8*ValSizeInBytes))-1)))
+      return SDValue();
+  }
+
+  // Properly sign extend the value.
+  int MaskVal = SignExtend32(Value, ByteSize * 8);
+
+  // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
+  if (MaskVal == 0) return SDValue();
+
+  // Finally, if this value fits in a 5 bit sext field, return it
+  if (SignExtend32<5>(MaskVal) == MaskVal)
+    return DAG.getTargetConstant(MaskVal, MVT::i32);
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Addressing Mode Selection
+//===----------------------------------------------------------------------===//
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (N->getOpcode() != ISD::Constant)
+    return false;
+
+  Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+  return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+
+/// SelectAddressRegReg - Given the specified addressed, check to see if it
+/// can be represented as an indexed [r+r] operation.  Returns false if it
+/// can be more efficiently represented with [r+imm].
+bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
+                                            SDValue &Index,
+                                            SelectionDAG &DAG) const {
+  short imm = 0;
+  if (N.getOpcode() == ISD::ADD) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // r+i
+    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
+      return false;    // r+i
+
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  } else if (N.getOpcode() == ISD::OR) {
+    if (isIntS16Immediate(N.getOperand(1), imm))
+      return false;    // r+i can fold it if we can.
+
+    // If this is an or of disjoint bitfields, we can codegen this as an add
+    // (for better address arithmetic) if the LHS and RHS of the OR are provably
+    // disjoint.
+    APInt LHSKnownZero, LHSKnownOne;
+    APInt RHSKnownZero, RHSKnownOne;
+    DAG.ComputeMaskedBits(N.getOperand(0),
+                          LHSKnownZero, LHSKnownOne);
+
+    if (LHSKnownZero.getBoolValue()) {
+      DAG.ComputeMaskedBits(N.getOperand(1),
+                            RHSKnownZero, RHSKnownOne);
+      // If all of the bits are known zero on the LHS or RHS, the add won't
+      // carry.
+      if (~(LHSKnownZero | RHSKnownZero) == 0) {
+        Base = N.getOperand(0);
+        Index = N.getOperand(1);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/// Returns true if the address N can be represented by a base register plus
+/// a signed 16-bit displacement [r+imm], and if it is not better
+/// represented as reg+reg.
+bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
+                                            SDValue &Base,
+                                            SelectionDAG &DAG) const {
+  // FIXME dl should come from parent load or store, not from address
+  DebugLoc dl = N.getDebugLoc();
+  // If this can be more profitably realized as r+r, fail.
+  if (SelectAddressRegReg(N, Disp, Base, DAG))
+    return false;
+
+  if (N.getOpcode() == ISD::ADD) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm)) {
+      Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+      } else {
+        Base = N.getOperand(0);
+      }
+      return true; // [r+i]
+    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
+      // Match LOAD (ADD (X, Lo(G))).
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+             && "Cannot handle constant offsets yet!");
+      Disp = N.getOperand(1).getOperand(0);  // The global address.
+      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
+             Disp.getOpcode() == ISD::TargetConstantPool ||
+             Disp.getOpcode() == ISD::TargetJumpTable);
+      Base = N.getOperand(0);
+      return true;  // [&g+r]
+    }
+  } else if (N.getOpcode() == ISD::OR) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm)) {
+      // If this is an or of disjoint bitfields, we can codegen this as an add
+      // (for better address arithmetic) if the LHS and RHS of the OR are
+      // provably disjoint.
+      APInt LHSKnownZero, LHSKnownOne;
+      DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
+
+      if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
+        // If all of the bits are known zero on the LHS or RHS, the add won't
+        // carry.
+        Base = N.getOperand(0);
+        Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+        return true;
+      }
+    }
+  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    // Loading from a constant address.
+
+    // If this address fits entirely in a 16-bit sext immediate field, codegen
+    // this as "d, 0"
+    short Imm;
+    if (isIntS16Immediate(CN, Imm)) {
+      Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
+      Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+                             CN->getValueType(0));
+      return true;
+    }
+
+    // Handle 32-bit sext immediates with LIS + addr mode.
+    if (CN->getValueType(0) == MVT::i32 ||
+        (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
+      int Addr = (int)CN->getZExtValue();
+
+      // Otherwise, break this down into an LIS + disp.
+      Disp = DAG.getTargetConstant((short)Addr, MVT::i32);
+
+      Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, MVT::i32);
+      unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+      Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
+      return true;
+    }
+  }
+
+  Disp = DAG.getTargetConstant(0, getPointerTy());
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
+    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+  else
+    Base = N;
+  return true;      // [r+0]
+}
+
+/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
+/// represented as an indexed [r+r] operation.
+bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
+                                                SDValue &Index,
+                                                SelectionDAG &DAG) const {
+  // Check to see if we can easily represent this as an [r+r] address.  This
+  // will fail if it thinks that the address is more profitably represented as
+  // reg+imm, e.g. where imm = 0.
+  if (SelectAddressRegReg(N, Base, Index, DAG))
+    return true;
+
+  // If the operand is an addition, always emit this as [r+r], since this is
+  // better (for code size, and execution, as the memop does the add for free)
+  // than emitting an explicit add.
+  if (N.getOpcode() == ISD::ADD) {
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  }
+
+  // Otherwise, do it the hard way, using R0 as the base register.
+  Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+                         N.getValueType());
+  Index = N;
+  return true;
+}
+
+/// SelectAddressRegImmShift - Returns true if the address N can be
+/// represented by a base register plus a signed 14-bit displacement
+/// [r+imm*4].  Suitable for use by STD and friends.
+bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
+                                                 SDValue &Base,
+                                                 SelectionDAG &DAG) const {
+  // FIXME dl should come from the parent load or store, not the address
+  DebugLoc dl = N.getDebugLoc();
+  // If this can be more profitably realized as r+r, fail.
+  if (SelectAddressRegReg(N, Disp, Base, DAG))
+    return false;
+
+  if (N.getOpcode() == ISD::ADD) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
+      Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
+        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+      } else {
+        Base = N.getOperand(0);
+      }
+      return true; // [r+i]
+    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
+      // Match LOAD (ADD (X, Lo(G))).
+      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
+             && "Cannot handle constant offsets yet!");
+      Disp = N.getOperand(1).getOperand(0);  // The global address.
+      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
+             Disp.getOpcode() == ISD::TargetConstantPool ||
+             Disp.getOpcode() == ISD::TargetJumpTable);
+      Base = N.getOperand(0);
+      return true;  // [&g+r]
+    }
+  } else if (N.getOpcode() == ISD::OR) {
+    short imm = 0;
+    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
+      // If this is an or of disjoint bitfields, we can codegen this as an add
+      // (for better address arithmetic) if the LHS and RHS of the OR are
+      // provably disjoint.
+      APInt LHSKnownZero, LHSKnownOne;
+      DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
+      if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
+        // If all of the bits are known zero on the LHS or RHS, the add won't
+        // carry.
+        Base = N.getOperand(0);
+        Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
+        return true;
+      }
+    }
+  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    // Loading from a constant address.  Verify low two bits are clear.
+    if ((CN->getZExtValue() & 3) == 0) {
+      // If this address fits entirely in a 14-bit sext immediate field, codegen
+      // this as "d, 0"
+      short Imm;
+      if (isIntS16Immediate(CN, Imm)) {
+        Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy());
+        Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+                               CN->getValueType(0));
+        return true;
+      }
+
+      // Fold the low-part of 32-bit absolute addresses into addr mode.
+      if (CN->getValueType(0) == MVT::i32 ||
+          (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
+        int Addr = (int)CN->getZExtValue();
+
+        // Otherwise, break this down into an LIS + disp.
+        Disp = DAG.getTargetConstant((short)Addr >> 2, MVT::i32);
+        Base = DAG.getTargetConstant((Addr-(signed short)Addr) >> 16, MVT::i32);
+        unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
+        Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base),0);
+        return true;
+      }
+    }
+  }
+
+  Disp = DAG.getTargetConstant(0, getPointerTy());
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
+    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+  else
+    Base = N;
+  return true;      // [r+0]
+}
+
+
+/// getPreIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address.
+bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                  SDValue &Offset,
+                                                  ISD::MemIndexedMode &AM,
+                                                  SelectionDAG &DAG) const {
+  if (DisablePPCPreinc) return false;
+
+  bool isLoad = true;
+  SDValue Ptr;
+  EVT VT;
+  unsigned Alignment;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT = LD->getMemoryVT();
+    Alignment = LD->getAlignment();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getMemoryVT();
+    Alignment = ST->getAlignment();
+    isLoad = false;
+  } else
+    return false;
+
+  // PowerPC doesn't have preinc load/store instructions for vectors.
+  if (VT.isVector())
+    return false;
+
+  if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
+
+    // Common code will reject creating a pre-inc form if the base pointer
+    // is a frame index, or if N is a store and the base pointer is either
+    // the same as or a predecessor of the value being stored.  Check for
+    // those situations here, and try with swapped Base/Offset instead.
+    bool Swap = false;
+
+    if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
+      Swap = true;
+    else if (!isLoad) {
+      SDValue Val = cast<StoreSDNode>(N)->getValue();
+      if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
+        Swap = true;
+    }
+
+    if (Swap)
+      std::swap(Base, Offset);
+
+    AM = ISD::PRE_INC;
+    return true;
+  }
+
+  // LDU/STU use reg+imm*4, others use reg+imm.
+  if (VT != MVT::i64) {
+    // reg + imm
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG))
+      return false;
+  } else {
+    // LDU/STU need an address with at least 4-byte alignment.
+    if (Alignment < 4)
+      return false;
+
+    // reg + imm * 4.
+    if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG))
+      return false;
+  }
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    // PPC64 doesn't have lwau, but it does have lwaux.  Reject preinc load of
+    // sext i32 to i64 when addr mode is r+i.
+    if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
+        LD->getExtensionType() == ISD::SEXTLOAD &&
+        isa<ConstantSDNode>(Offset))
+      return false;
+  }
+
+  AM = ISD::PRE_INC;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//  LowerOperation implementation
+//===----------------------------------------------------------------------===//
+
+/// GetLabelAccessInfo - Return true if we should reference labels using a
+/// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags.
+static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags,
+                               unsigned &LoOpFlags, const GlobalValue *GV = 0) {
+  HiOpFlags = PPCII::MO_HA16;
+  LoOpFlags = PPCII::MO_LO16;
+
+  // Don't use the pic base if not in PIC relocation model.  Or if we are on a
+  // non-darwin platform.  We don't support PIC on other platforms yet.
+  bool isPIC = TM.getRelocationModel() == Reloc::PIC_ &&
+               TM.getSubtarget<PPCSubtarget>().isDarwin();
+  if (isPIC) {
+    HiOpFlags |= PPCII::MO_PIC_FLAG;
+    LoOpFlags |= PPCII::MO_PIC_FLAG;
+  }
+
+  // If this is a reference to a global value that requires a non-lazy-ptr, make
+  // sure that instruction lowering adds it.
+  if (GV && TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV, TM)) {
+    HiOpFlags |= PPCII::MO_NLP_FLAG;
+    LoOpFlags |= PPCII::MO_NLP_FLAG;
+
+    if (GV->hasHiddenVisibility()) {
+      HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
+      LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
+    }
+  }
+
+  return isPIC;
+}
+
+static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
+                             SelectionDAG &DAG) {
+  EVT PtrVT = HiPart.getValueType();
+  SDValue Zero = DAG.getConstant(0, PtrVT);
+  DebugLoc DL = HiPart.getDebugLoc();
+
+  SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
+  SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);
+
+  // With PIC, the first instruction is actually "GR+hi(&G)".
+  if (isPIC)
+    Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
+                     DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);
+
+  // Generate non-pic code that has direct accesses to the constant pool.
+  // The address of the global is just (hi(&g)+lo(&g)).
+  return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+}
+
+SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  const Constant *C = CP->getConstVal();
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+    return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+  SDValue CPIHi =
+    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
+  SDValue CPILo =
+    DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
+  return LowerLabelRef(CPIHi, CPILo, isPIC, DAG);
+}
+
+SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+    return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+  SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
+  SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
+  return LowerLabelRef(JTIHi, JTILo, isPIC, DAG);
+}
+
+SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
+  SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
+  SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
+  return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
+}
+
+SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  DebugLoc dl = GA->getDebugLoc();
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+  bool is64bit = PPCSubTarget.isPPC64();
+
+  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+
+  if (Model == TLSModel::LocalExec) {
+    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                               PPCII::MO_TPREL16_HA);
+    SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                               PPCII::MO_TPREL16_LO);
+    SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
+                                     is64bit ? MVT::i64 : MVT::i32);
+    SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
+    return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
+  }
+
+  if (!is64bit)
+    llvm_unreachable("only local-exec is currently supported for ppc32");
+
+  if (Model == TLSModel::InitialExec) {
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+    SDValue TPOffsetHi = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
+                                     PtrVT, GOTReg, TGA);
+    SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
+                                   PtrVT, TGA, TPOffsetHi);
+    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGA);
+  }
+
+  if (Model == TLSModel::GeneralDynamic) {
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+    SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
+                                     GOTReg, TGA);
+    SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT,
+                                   GOTEntryHi, TGA);
+
+    // We need a chain node, and don't have one handy.  The underlying
+    // call has no side effects, so using the function entry node
+    // suffices.
+    SDValue Chain = DAG.getEntryNode();
+    Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry);
+    SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64);
+    SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl,
+                                  PtrVT, ParmReg, TGA);
+    // The return value from GET_TLS_ADDR really is in X3 already, but
+    // some hacks are needed here to tie everything together.  The extra
+    // copies dissolve during subsequent transforms.
+    Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr);
+    return DAG.getCopyFromReg(Chain, dl, PPC::X3, PtrVT);
+  }
+
+  if (Model == TLSModel::LocalDynamic) {
+    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
+    SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
+                                     GOTReg, TGA);
+    SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT,
+                                   GOTEntryHi, TGA);
+
+    // We need a chain node, and don't have one handy.  The underlying
+    // call has no side effects, so using the function entry node
+    // suffices.
+    SDValue Chain = DAG.getEntryNode();
+    Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry);
+    SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64);
+    SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl,
+                                  PtrVT, ParmReg, TGA);
+    // The return value from GET_TLSLD_ADDR really is in X3 already, but
+    // some hacks are needed here to tie everything together.  The extra
+    // copies dissolve during subsequent transforms.
+    Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr);
+    SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT,
+                                      Chain, ParmReg, TGA);
+    return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
+  }
+
+  llvm_unreachable("Unknown TLS model!");
+}
+
+SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT PtrVT = Op.getValueType();
+  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+  DebugLoc DL = GSDN->getDebugLoc();
+  const GlobalValue *GV = GSDN->getGlobal();
+
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the GlobalValue is stored in the TOC.
+  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
+    return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA,
+                       DAG.getRegister(PPC::X2, MVT::i64));
+  }
+
+  unsigned MOHiFlag, MOLoFlag;
+  bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV);
+
+  SDValue GAHi =
+    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
+  SDValue GALo =
+    DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
+
+  SDValue Ptr = LowerLabelRef(GAHi, GALo, isPIC, DAG);
+
+  // If the global reference is actually to a non-lazy-pointer, we have to do an
+  // extra load to get the address of the global.
+  if (MOHiFlag & PPCII::MO_NLP_FLAG)
+    Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo(),
+                      false, false, false, 0);
+  return Ptr;
+}
+
+SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  DebugLoc dl = Op.getDebugLoc();
+
+  // If we're comparing for equality to zero, expose the fact that this is
+  // implented as a ctlz/srl pair on ppc, so that the dag combiner can
+  // fold the new nodes.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+    if (C->isNullValue() && CC == ISD::SETEQ) {
+      EVT VT = Op.getOperand(0).getValueType();
+      SDValue Zext = Op.getOperand(0);
+      if (VT.bitsLT(MVT::i32)) {
+        VT = MVT::i32;
+        Zext = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0));
+      }
+      unsigned Log2b = Log2_32(VT.getSizeInBits());
+      SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);
+      SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,
+                                DAG.getConstant(Log2b, MVT::i32));
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
+    }
+    // Leave comparisons against 0 and -1 alone for now, since they're usually
+    // optimized.  FIXME: revisit this when we can custom lower all setcc
+    // optimizations.
+    if (C->isAllOnesValue() || C->isNullValue())
+      return SDValue();
+  }
+
+  // If we have an integer seteq/setne, turn it into a compare against zero
+  // by xor'ing the rhs with the lhs, which is faster than setting a
+  // condition register, reading it back out, and masking the correct bit.  The
+  // normal approach here uses sub to do this instead of xor.  Using xor exposes
+  // the result to other bit-twiddling opportunities.
+  EVT LHSVT = Op.getOperand(0).getValueType();
+  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    EVT VT = Op.getValueType();
+    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
+                                Op.getOperand(1));
+    return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, LHSVT), CC);
+  }
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
+                                      const PPCSubtarget &Subtarget) const {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  DebugLoc dl = Node->getDebugLoc();
+
+  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
+
+  // gpr_index
+  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+                                    VAListPtr, MachinePointerInfo(SV), MVT::i8,
+                                    false, false, 0);
+  InChain = GprIndex.getValue(1);
+
+  if (VT == MVT::i64) {
+    // Check if GprIndex is even
+    SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
+                                 DAG.getConstant(1, MVT::i32));
+    SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
+                                DAG.getConstant(0, MVT::i32), ISD::SETNE);
+    SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
+                                          DAG.getConstant(1, MVT::i32));
+    // Align GprIndex to be even if it isn't
+    GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
+                           GprIndex);
+  }
+
+  // fpr index is 1 byte after gpr
+  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                               DAG.getConstant(1, MVT::i32));
+
+  // fpr
+  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
+                                    FprPtr, MachinePointerInfo(SV), MVT::i8,
+                                    false, false, 0);
+  InChain = FprIndex.getValue(1);
+
+  SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                                       DAG.getConstant(8, MVT::i32));
+
+  SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
+                                        DAG.getConstant(4, MVT::i32));
+
+  // areas
+  SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr,
+                                     MachinePointerInfo(), false, false,
+                                     false, 0);
+  InChain = OverflowArea.getValue(1);
+
+  SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr,
+                                    MachinePointerInfo(), false, false,
+                                    false, 0);
+  InChain = RegSaveArea.getValue(1);
+
+  // select overflow_area if index > 8
+  SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
+                            DAG.getConstant(8, MVT::i32), ISD::SETLT);
+
+  // adjustment constant gpr_index * 4/8
+  SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
+                                    VT.isInteger() ? GprIndex : FprIndex,
+                                    DAG.getConstant(VT.isInteger() ? 4 : 8,
+                                                    MVT::i32));
+
+  // OurReg = RegSaveArea + RegConstant
+  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
+                               RegConstant);
+
+  // Floating types are 32 bytes into RegSaveArea
+  if (VT.isFloatingPoint())
+    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
+                         DAG.getConstant(32, MVT::i32));
+
+  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
+  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                   VT.isInteger() ? GprIndex : FprIndex,
+                                   DAG.getConstant(VT == MVT::i64 ? 2 : 1,
+                                                   MVT::i32));
+
+  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
+                              VT.isInteger() ? VAListPtr : FprPtr,
+                              MachinePointerInfo(SV),
+                              MVT::i8, false, false, 0);
+
+  // determine if we should load from reg_save_area or overflow_area
+  SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);
+
+  // increase overflow_area by 4/8 if gpr/fpr > 8
+  SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
+                                          DAG.getConstant(VT.isInteger() ? 4 : 8,
+                                          MVT::i32));
+
+  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
+                             OverflowAreaPlusN);
+
+  InChain = DAG.getTruncStore(InChain, dl, OverflowArea,
+                              OverflowAreaPtr,
+                              MachinePointerInfo(),
+                              MVT::i32, false, false, 0);
+
+  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  DebugLoc dl = Op.getDebugLoc();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = (PtrVT == MVT::i64);
+  Type *IntPtrTy =
+    DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType(
+                                                             *DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = Trmp; Args.push_back(Entry);
+
+  // TrampSize == (isPPC64 ? 48 : 40);
+  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40,
+                               isPPC64 ? MVT::i64 : MVT::i32);
+  Args.push_back(Entry);
+
+  Entry.Node = FPtr; Args.push_back(Entry);
+  Entry.Node = Nest; Args.push_back(Entry);
+
+  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
+  TargetLowering::CallLoweringInfo CLI(Chain,
+                                       Type::getVoidTy(*DAG.getContext()),
+                                       false, false, false, false, 0,
+                                       CallingConv::C,
+                /*isTailCall=*/false,
+                                       /*doesNotRet=*/false,
+                                       /*isReturnValueUsed=*/true,
+                DAG.getExternalSymbol("__trampoline_setup", PtrVT),
+                Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  return CallResult.second;
+}
+
+SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                                        const PPCSubtarget &Subtarget) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
+                        MachinePointerInfo(SV),
+                        false, false, 0);
+  }
+
+  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
+  // We suppose the given va_list is already allocated.
+  //
+  // typedef struct {
+  //  char gpr;     /* index into the array of 8 GPRs
+  //                 * stored in the register save area
+  //                 * gpr=0 corresponds to r3,
+  //                 * gpr=1 to r4, etc.
+  //                 */
+  //  char fpr;     /* index into the array of 8 FPRs
+  //                 * stored in the register save area
+  //                 * fpr=0 corresponds to f1,
+  //                 * fpr=1 to f2, etc.
+  //                 */
+  //  char *overflow_arg_area;
+  //                /* location on stack that holds
+  //                 * the next overflow argument
+  //                 */
+  //  char *reg_save_area;
+  //               /* where r3:r10 and f1:f8 (if saved)
+  //                * are stored
+  //                */
+  // } va_list[1];
+
+
+  SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), MVT::i32);
+  SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), MVT::i32);
+
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
+                                            PtrVT);
+  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                 PtrVT);
+
+  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
+  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, PtrVT);
+
+  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
+  SDValue ConstStackOffset = DAG.getConstant(StackOffset, PtrVT);
+
+  uint64_t FPROffset = 1;
+  SDValue ConstFPROffset = DAG.getConstant(FPROffset, PtrVT);
+
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  // Store first byte : number of int regs
+  SDValue firstStore = DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR,
+                                         Op.getOperand(1),
+                                         MachinePointerInfo(SV),
+                                         MVT::i8, false, false, 0);
+  uint64_t nextOffset = FPROffset;
+  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
+                                  ConstFPROffset);
+
+  // Store second byte : number of float regs
+  SDValue secondStore =
+    DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
+                      MachinePointerInfo(SV, nextOffset), MVT::i8,
+                      false, false, 0);
+  nextOffset += StackOffset;
+  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);
+
+  // Store second word : arguments given on stack
+  SDValue thirdStore =
+    DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
+                 MachinePointerInfo(SV, nextOffset),
+                 false, false, 0);
+  nextOffset += FrameOffset;
+  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);
+
+  // Store third word : arguments given in registers
+  return DAG.getStore(thirdStore, dl, FR, nextPtr,
+                      MachinePointerInfo(SV, nextOffset),
+                      false, false, 0);
+
+}
+
+#include "PPCGenCallingConv.inc"
+
+static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                       CCValAssign::LocInfo &LocInfo,
+                                       ISD::ArgFlagsTy &ArgFlags,
+                                       CCState &State) {
+  return true;
+}
+
+static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                              MVT &LocVT,
+                                              CCValAssign::LocInfo &LocInfo,
+                                              ISD::ArgFlagsTy &ArgFlags,
+                                              CCState &State) {
+  static const uint16_t ArgRegs[] = {
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+
+  // Skip one register if the first unallocated register has an even register
+  // number and there are still argument registers available which have not been
+  // allocated yet. RegNum is actually an index into ArgRegs, which means we
+  // need to skip a register if RegNum is odd.
+  if (RegNum != NumArgRegs && RegNum % 2 == 1) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+
+  // Always return false here, as this function only makes sure that the first
+  // unallocated register has an odd register number and does not actually
+  // allocate a register for the current argument.
+  return false;
+}
+
+static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                                MVT &LocVT,
+                                                CCValAssign::LocInfo &LocInfo,
+                                                ISD::ArgFlagsTy &ArgFlags,
+                                                CCState &State) {
+  static const uint16_t ArgRegs[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8
+  };
+
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+
+  // If there is only one Floating-point register left we need to put both f64
+  // values of a split ppc_fp128 value on the stack.
+  if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+
+  // Always return false here, as this function only makes sure that the two f64
+  // values a ppc_fp128 value is split into are both passed in registers or both
+  // passed on the stack and does not actually allocate a register for the
+  // current argument.
+  return false;
+}
+
+/// GetFPR - Get the set of FP registers that should be allocated for arguments,
+/// on Darwin.
+static const uint16_t *GetFPR() {
+  static const uint16_t FPR[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
+  };
+
+  return FPR;
+}
+
+/// CalculateStackSlotSize - Calculates the size reserved for this argument on
+/// the stack.
+static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
+                                       unsigned PtrByteSize) {
+  unsigned ArgSize = ArgVT.getSizeInBits()/8;
+  if (Flags.isByVal())
+    ArgSize = Flags.getByValSize();
+  ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+
+  return ArgSize;
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  if (PPCSubTarget.isSVR4ABI()) {
+    if (PPCSubTarget.isPPC64())
+      return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
+    else
+      return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
+                                         dl, DAG, InVals);
+  } else {
+    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
+                                       dl, DAG, InVals);
+  }
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments_32SVR4(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+
+  // 32-bit SVR4 ABI Stack Frame Layout:
+  //              +-----------------------------------+
+  //        +-->  |            Back chain             |
+  //        |     +-----------------------------------+
+  //        |     | Floating-point register save area |
+  //        |     +-----------------------------------+
+  //        |     |    General register save area     |
+  //        |     +-----------------------------------+
+  //        |     |          CR save word             |
+  //        |     +-----------------------------------+
+  //        |     |         VRSAVE save word          |
+  //        |     +-----------------------------------+
+  //        |     |         Alignment padding         |
+  //        |     +-----------------------------------+
+  //        |     |     Vector register save area     |
+  //        |     +-----------------------------------+
+  //        |     |       Local variable space        |
+  //        |     +-----------------------------------+
+  //        |     |        Parameter list area        |
+  //        |     +-----------------------------------+
+  //        |     |           LR save word            |
+  //        |     +-----------------------------------+
+  // SP-->  +---  |            Back chain             |
+  //              +-----------------------------------+
+  //
+  // Specifications:
+  //   System V Application Binary Interface PowerPC Processor Supplement
+  //   AltiVec Technology Programming Interface Manual
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = 4;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // Reserve space for the linkage area on the stack.
+  CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    // Arguments stored in registers.
+    if (VA.isRegLoc()) {
+      const TargetRegisterClass *RC;
+      EVT ValVT = VA.getValVT();
+
+      switch (ValVT.getSimpleVT().SimpleTy) {
+        default:
+          llvm_unreachable("ValVT not supported by formal arguments Lowering");
+        case MVT::i32:
+          RC = &PPC::GPRCRegClass;
+          break;
+        case MVT::f32:
+          RC = &PPC::F4RCRegClass;
+          break;
+        case MVT::f64:
+          RC = &PPC::F8RCRegClass;
+          break;
+        case MVT::v16i8:
+        case MVT::v8i16:
+        case MVT::v4i32:
+        case MVT::v4f32:
+          RC = &PPC::VRRCRegClass;
+          break;
+      }
+
+      // Transform the arguments stored in physical registers into virtual ones.
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, ValVT);
+
+      InVals.push_back(ArgValue);
+    } else {
+      // Argument stored in memory.
+      assert(VA.isMemLoc());
+
+      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+      int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(),
+                                      isImmutable);
+
+      // Create load nodes to retrieve arguments from the stack.
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0));
+    }
+  }
+
+  // Assign locations to all of the incoming aggregate by value arguments.
+  // Aggregates passed by value are stored in the local variable space of the
+  // caller's stack frame, right above the parameter list area.
+  SmallVector<CCValAssign, 16> ByValArgLocs;
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+
+  // Reserve stack space for the allocations in CCInfo.
+  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
+
+  // Area that is at least reserved in the caller of this function.
+  unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized function's reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+
+  MinReservedArea =
+    std::max(MinReservedArea,
+             PPCFrameLowering::getMinCallFrameSize(false, false));
+
+  unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()->
+    getStackAlignment();
+  unsigned AlignMask = TargetAlign-1;
+  MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
+
+  FI->setMinReservedArea(MinReservedArea);
+
+  SmallVector<SDValue, 8> MemOps;
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    static const uint16_t GPArgRegs[] = {
+      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+    };
+    const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
+
+    static const uint16_t FPArgRegs[] = {
+      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+      PPC::F8
+    };
+    const unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+
+    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs,
+                                                          NumGPArgRegs));
+    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs,
+                                                          NumFPArgRegs));
+
+    // Make room for NumGPArgRegs and NumFPArgRegs.
+    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
+                NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8;
+
+    FuncInfo->setVarArgsStackOffset(
+      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
+                             CCInfo.getNextStackOffset(), true));
+
+    FuncInfo->setVarArgsFrameIndex(MFI->CreateStackObject(Depth, 8, false));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // The fixed integer arguments of a variadic function are stored to the
+    // VarArgsFrameIndex on the stack so that they may be loaded by deferencing
+    // the result of va_next.
+    for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
+      // Get an existing live-in vreg, or add a new one.
+      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
+      if (!VReg)
+        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+
+    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
+    // is set.
+    // The double arguments are stored to the VarArgsFrameIndex
+    // on the stack.
+    for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
+      // Get an existing live-in vreg, or add a new one.
+      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
+      if (!VReg)
+        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by eight for the next argument to store
+      SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
+                                         PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl,
+                        MVT::Other, &MemOps[0], MemOps.size());
+
+  return Chain;
+}
+
+// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+// value to MVT::i64 and then truncate to the correct register size.
+SDValue
+PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
+                                     SelectionDAG &DAG, SDValue ArgVal,
+                                     DebugLoc dl) const {
+  if (Flags.isSExt())
+    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+  else if (Flags.isZExt())
+    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
+                         DAG.getValueType(ObjectVT));
+  
+  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+}
+
+// Set the size that is at least reserved in caller of this function.  Tail
+// call optimized functions' reserved stack space needs to be aligned so that
+// taking the difference between two stack areas will result in an aligned
+// stack.
+void
+PPCTargetLowering::setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
+                                      unsigned nAltivecParamsAtEnd,
+                                      unsigned MinReservedArea,
+                                      bool isPPC64) const {
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  // Add the Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    MinReservedArea = ((MinReservedArea+15)/16)*16;
+    MinReservedArea += 16*nAltivecParamsAtEnd;
+  }
+  MinReservedArea =
+    std::max(MinReservedArea,
+             PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
+  unsigned TargetAlign
+    = DAG.getMachineFunction().getTarget().getFrameLowering()->
+        getStackAlignment();
+  unsigned AlignMask = TargetAlign-1;
+  MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
+  FI->setMinReservedArea(MinReservedArea);
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments_64SVR4(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = 8;
+
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
+  // Area that is at least reserved in caller of this function.
+  unsigned MinReservedArea = ArgOffset;
+
+  static const uint16_t GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+
+  static const uint16_t *FPR = GetFPR();
+
+  static const uint16_t VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned Num_GPR_Regs = array_lengthof(GPR);
+  const unsigned Num_FPR_Regs = 13;
+  const unsigned Num_VR_Regs  = array_lengthof(VR);
+
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  // Add DAG nodes to load the arguments or copy them out of registers.  On
+  // entry to a function on PPC, the arguments start after the linkage area,
+  // although the first ones are often in registers.
+
+  SmallVector<SDValue, 8> MemOps;
+  unsigned nAltivecParamsAtEnd = 0;
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+    SDValue ArgVal;
+    bool needsLoad = false;
+    EVT ObjectVT = Ins[ArgNo].VT;
+    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+    unsigned ArgSize = ObjSize;
+    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+    std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[ArgNo].OrigArgIndex;
+
+    unsigned CurArgOffset = ArgOffset;
+
+    // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
+    if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
+        ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+      if (isVarArg) {
+        MinReservedArea = ((MinReservedArea+15)/16)*16;
+        MinReservedArea += CalculateStackSlotSize(ObjectVT,
+                                                  Flags,
+                                                  PtrByteSize);
+      } else
+        nAltivecParamsAtEnd++;
+    } else
+      // Calculate min reserved area.
+      MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
+                                                Flags,
+                                                PtrByteSize);
+
+    // FIXME the codegen can be much improved in some cases.
+    // We do not have to keep everything in memory.
+    if (Flags.isByVal()) {
+      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+      ObjSize = Flags.getByValSize();
+      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      // Empty aggregate parameters do not take up registers.  Examples:
+      //   struct { } a;
+      //   union  { } b;
+      //   int c[0];
+      // etc.  However, we have to provide a place-holder in InVals, so
+      // pretend we have an 8-byte item at the current address for that
+      // purpose.
+      if (!ObjSize) {
+        int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+        InVals.push_back(FIN);
+        continue;
+      }
+      // All aggregates smaller than 8 bytes must be passed right-justified.
+      if (ObjSize < PtrByteSize)
+        CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
+      // The value of the object is its address.
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(FIN);
+
+      if (ObjSize < 8) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store;
+
+          if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
+            EVT ObjType = (ObjSize == 1 ? MVT::i8 :
+                           (ObjSize == 2 ? MVT::i16 : MVT::i32));
+            Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+                                      MachinePointerInfo(FuncArg, CurArgOffset),
+                                      ObjType, false, false, 0);
+          } else {
+            // For sizes that don't fit a truncating store (3, 5, 6, 7),
+            // store the whole register as-is to the parameter save area
+            // slot.  The address of the parameter was already calculated
+            // above (InVals.push_back(FIN)) to be the right-justified
+            // offset within the slot.  For this store, we need a new
+            // frame index that points at the beginning of the slot.
+            int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+            SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+            Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                 MachinePointerInfo(FuncArg, ArgOffset),
+                                 false, false, 0);
+          }
+
+          MemOps.push_back(Store);
+          ++GPR_idx;
+        }
+        // Whether we copied from a register or not, advance the offset
+        // into the parameter save area by a full doubleword.
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+
+      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+        // Store whatever pieces of the object are in registers
+        // to memory.  ArgOffset will be the address of the beginning
+        // of the object.
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                       MachinePointerInfo(FuncArg, ArgOffset),
+                                       false, false, 0);
+          MemOps.push_back(Store);
+          ++GPR_idx;
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ArgSize - j;
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i32:
+    case MVT::i64:
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::i32)
+          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+          // value to MVT::i64 and then truncate to the correct register size.
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+        ++GPR_idx;
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+      ArgOffset += 8;
+      break;
+
+    case MVT::f32:
+    case MVT::f64:
+      // Every 8 bytes of argument space consumes one of the GPRs available for
+      // argument passing.
+      if (GPR_idx != Num_GPR_Regs) {
+        ++GPR_idx;
+      }
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+
+        if (ObjectVT == MVT::f32)
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+        else
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++FPR_idx;
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+
+      ArgOffset += 8;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      // Note that vector arguments in registers don't reserve stack space,
+      // except in varargs functions.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        if (isVarArg) {
+          while ((ArgOffset % 16) != 0) {
+            ArgOffset += PtrByteSize;
+            if (GPR_idx != Num_GPR_Regs)
+              GPR_idx++;
+          }
+          ArgOffset += 16;
+          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
+        }
+        ++VR_idx;
+      } else {
+        // Vectors are aligned.
+        ArgOffset = ((ArgOffset+15)/16)*16;
+        CurArgOffset = ArgOffset;
+        ArgOffset += 16;
+        needsLoad = true;
+      }
+      break;
+    }
+
+    // We need to load the argument to a virtual register if we determined
+    // above that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      int FI = MFI->CreateFixedObject(ObjSize,
+                                      CurArgOffset + (ArgSize - ObjSize),
+                                      isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, false, 0);
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, true);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI->CreateFixedObject(PtrByteSize, Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by deferencing the
+    // result of va_next.
+    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrByteSize, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl,
+                        MVT::Other, &MemOps[0], MemOps.size());
+
+  return Chain;
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments_Darwin(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  // TODO: add description of PPC stack frame format, or at least some docs.
+  //
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+  // Potential tail calls could cause overwriting of argument stack slots.
+  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+                       (CallConv == CallingConv::Fast));
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
+  // Area that is at least reserved in caller of this function.
+  unsigned MinReservedArea = ArgOffset;
+
+  static const uint16_t GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const uint16_t GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+
+  static const uint16_t *FPR = GetFPR();
+
+  static const uint16_t VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
+  const unsigned Num_FPR_Regs = 13;
+  const unsigned Num_VR_Regs  = array_lengthof( VR);
+
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  // In 32-bit non-varargs functions, the stack space for vectors is after the
+  // stack space for non-vectors.  We do not use this space unless we have
+  // too many vectors to fit in registers, something that only occurs in
+  // constructed examples:), but we have to walk the arglist to figure
+  // that out...for the pathological case, compute VecArgOffset as the
+  // start of the vector parameter area.  Computing VecArgOffset is the
+  // entire point of the following loop.
+  unsigned VecArgOffset = ArgOffset;
+  if (!isVarArg && !isPPC64) {
+    for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
+         ++ArgNo) {
+      EVT ObjectVT = Ins[ArgNo].VT;
+      ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+
+      if (Flags.isByVal()) {
+        // ObjSize is the true size, ArgSize rounded up to multiple of regs.
+        unsigned ObjSize = Flags.getByValSize();
+        unsigned ArgSize =
+                ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+        VecArgOffset += ArgSize;
+        continue;
+      }
+
+      switch(ObjectVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unhandled argument type!");
+      case MVT::i32:
+      case MVT::f32:
+        VecArgOffset += 4;
+        break;
+      case MVT::i64:  // PPC64
+      case MVT::f64:
+        // FIXME: We are guaranteed to be !isPPC64 at this point.
+        // Does MVT::i64 apply?
+        VecArgOffset += 8;
+        break;
+      case MVT::v4f32:
+      case MVT::v4i32:
+      case MVT::v8i16:
+      case MVT::v16i8:
+        // Nothing to do, we're only looking at Nonvector args here.
+        break;
+      }
+    }
+  }
+  // We've found where the vector parameter area in memory is.  Skip the
+  // first 12 parameters; these don't use that memory.
+  VecArgOffset = ((VecArgOffset+15)/16)*16;
+  VecArgOffset += 12*16;
+
+  // Add DAG nodes to load the arguments or copy them out of registers.  On
+  // entry to a function on PPC, the arguments start after the linkage area,
+  // although the first ones are often in registers.
+
+  SmallVector<SDValue, 8> MemOps;
+  unsigned nAltivecParamsAtEnd = 0;
+  // FIXME: FuncArg and Ins[ArgNo] must reference the same argument.
+  // When passing anonymous aggregates, this is currently not true.
+  // See LowerFormalArguments_64SVR4 for a fix.
+  Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) {
+    SDValue ArgVal;
+    bool needsLoad = false;
+    EVT ObjectVT = Ins[ArgNo].VT;
+    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+    unsigned ArgSize = ObjSize;
+    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+
+    unsigned CurArgOffset = ArgOffset;
+
+    // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
+    if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
+        ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+      if (isVarArg || isPPC64) {
+        MinReservedArea = ((MinReservedArea+15)/16)*16;
+        MinReservedArea += CalculateStackSlotSize(ObjectVT,
+                                                  Flags,
+                                                  PtrByteSize);
+      } else  nAltivecParamsAtEnd++;
+    } else
+      // Calculate min reserved area.
+      MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
+                                                Flags,
+                                                PtrByteSize);
+
+    // FIXME the codegen can be much improved in some cases.
+    // We do not have to keep everything in memory.
+    if (Flags.isByVal()) {
+      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+      ObjSize = Flags.getByValSize();
+      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+      // Objects of size 1 and 2 are right justified, everything else is
+      // left justified.  This means the memory address is adjusted forwards.
+      if (ObjSize==1 || ObjSize==2) {
+        CurArgOffset = CurArgOffset + (4 - ObjSize);
+      }
+      // The value of the object is its address.
+      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      InVals.push_back(FIN);
+      if (ObjSize==1 || ObjSize==2) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          if (isPPC64)
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          else
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
+          SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+                                            MachinePointerInfo(FuncArg,
+                                              CurArgOffset),
+                                            ObjType, false, false, 0);
+          MemOps.push_back(Store);
+          ++GPR_idx;
+        }
+
+        ArgOffset += PtrByteSize;
+
+        continue;
+      }
+      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+        // Store whatever pieces of the object are in registers
+        // to memory.  ArgOffset will be the address of the beginning
+        // of the object.
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg;
+          if (isPPC64)
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+          else
+            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                       MachinePointerInfo(FuncArg, ArgOffset),
+                                       false, false, 0);
+          MemOps.push_back(Store);
+          ++GPR_idx;
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i32:
+      if (!isPPC64) {
+        if (GPR_idx != Num_GPR_Regs) {
+          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+          ++GPR_idx;
+        } else {
+          needsLoad = true;
+          ArgSize = PtrByteSize;
+        }
+        // All int arguments reserve stack space in the Darwin ABI.
+        ArgOffset += PtrByteSize;
+        break;
+      }
+      // FALLTHROUGH
+    case MVT::i64:  // PPC64
+      if (GPR_idx != Num_GPR_Regs) {
+        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+        if (ObjectVT == MVT::i32)
+          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+          // value to MVT::i64 and then truncate to the correct register size.
+          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+        ++GPR_idx;
+      } else {
+        needsLoad = true;
+        ArgSize = PtrByteSize;
+      }
+      // All int arguments reserve stack space in the Darwin ABI.
+      ArgOffset += 8;
+      break;
+
+    case MVT::f32:
+    case MVT::f64:
+      // Every 4 bytes of argument space consumes one of the GPRs available for
+      // argument passing.
+      if (GPR_idx != Num_GPR_Regs) {
+        ++GPR_idx;
+        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
+          ++GPR_idx;
+      }
+      if (FPR_idx != Num_FPR_Regs) {
+        unsigned VReg;
+
+        if (ObjectVT == MVT::f32)
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+        else
+          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        ++FPR_idx;
+      } else {
+        needsLoad = true;
+      }
+
+      // All FP arguments reserve stack space in the Darwin ABI.
+      ArgOffset += isPPC64 ? 8 : ObjSize;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      // Note that vector arguments in registers don't reserve stack space,
+      // except in varargs functions.
+      if (VR_idx != Num_VR_Regs) {
+        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+        if (isVarArg) {
+          while ((ArgOffset % 16) != 0) {
+            ArgOffset += PtrByteSize;
+            if (GPR_idx != Num_GPR_Regs)
+              GPR_idx++;
+          }
+          ArgOffset += 16;
+          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
+        }
+        ++VR_idx;
+      } else {
+        if (!isVarArg && !isPPC64) {
+          // Vectors go after all the nonvectors.
+          CurArgOffset = VecArgOffset;
+          VecArgOffset += 16;
+        } else {
+          // Vectors are aligned.
+          ArgOffset = ((ArgOffset+15)/16)*16;
+          CurArgOffset = ArgOffset;
+          ArgOffset += 16;
+        }
+        needsLoad = true;
+      }
+      break;
+    }
+
+    // We need to load the argument to a virtual register if we determined above
+    // that we ran out of physical registers of the appropriate type.
+    if (needsLoad) {
+      int FI = MFI->CreateFixedObject(ObjSize,
+                                      CurArgOffset + (ArgSize - ObjSize),
+                                      isImmutable);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+                           false, false, false, 0);
+    }
+
+    InVals.push_back(ArgVal);
+  }
+
+  // Set the size that is at least reserved in caller of this function.  Tail
+  // call optimized functions' reserved stack space needs to be aligned so that
+  // taking the difference between two stack areas will result in an aligned
+  // stack.
+  setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, isPPC64);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    int Depth = ArgOffset;
+
+    FuncInfo->setVarArgsFrameIndex(
+      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
+                             Depth, true));
+    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by deferencing the
+    // result of va_next.
+    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+      unsigned VReg;
+
+      if (isPPC64)
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+      else
+        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
+
+      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                   MachinePointerInfo(), false, false, 0);
+      MemOps.push_back(Store);
+      // Increment the address by four for the next argument to store
+      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
+      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+    }
+  }
+
+  if (!MemOps.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl,
+                        MVT::Other, &MemOps[0], MemOps.size());
+
+  return Chain;
+}
+
+/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus
+/// linkage area for the Darwin ABI, or the 64-bit SVR4 ABI.
+static unsigned
+CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
+                                     bool isPPC64,
+                                     bool isVarArg,
+                                     unsigned CC,
+                                     const SmallVectorImpl<ISD::OutputArg>
+                                       &Outs,
+                                     const SmallVectorImpl<SDValue> &OutVals,
+                                     unsigned &nAltivecParamsAtEnd) {
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with 24/48 bytes, which is
+  // prereserved space for [SP][CR][LR][3 x unused].
+  unsigned NumBytes = PPCFrameLowering::getLinkageSize(isPPC64, true);
+  unsigned NumOps = Outs.size();
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  // Add up all the space actually used.
+  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
+  // they all go in registers, but we must reserve stack space for them for
+  // possible use by the caller.  In varargs or 64-bit calls, parameters are
+  // assigned stack space in order, with padding so Altivec parameters are
+  // 16-byte aligned.
+  nAltivecParamsAtEnd = 0;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    EVT ArgVT = Outs[i].VT;
+    // Varargs Altivec parameters are padded to a 16 byte boundary.
+    if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 ||
+        ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8) {
+      if (!isVarArg && !isPPC64) {
+        // Non-varargs Altivec parameters go after all the non-Altivec
+        // parameters; handle those later so we know how much padding we need.
+        nAltivecParamsAtEnd++;
+        continue;
+      }
+      // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
+      NumBytes = ((NumBytes+15)/16)*16;
+    }
+    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
+  }
+
+   // Allow for Altivec parameters at the end, if needed.
+  if (nAltivecParamsAtEnd) {
+    NumBytes = ((NumBytes+15)/16)*16;
+    NumBytes += 16*nAltivecParamsAtEnd;
+  }
+
+  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // the stack, allowing va_start to index over them in memory if its varargs.
+  // Because we cannot tell if this is needed on the caller side, we have to
+  // conservatively assume that it is needed.  As such, make sure we have at
+  // least enough stack space for the caller to store the 8 GPRs.
+  NumBytes = std::max(NumBytes,
+                      PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
+
+  // Tail call needs the stack to be aligned.
+  if (CC == CallingConv::Fast && DAG.getTarget().Options.GuaranteedTailCallOpt){
+    unsigned TargetAlign = DAG.getMachineFunction().getTarget().
+      getFrameLowering()->getStackAlignment();
+    unsigned AlignMask = TargetAlign-1;
+    NumBytes = (NumBytes + AlignMask) & ~AlignMask;
+  }
+
+  return NumBytes;
+}
+
+/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
+/// adjusted to accommodate the arguments for the tailcall.
+static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
+                                   unsigned ParamSize) {
+
+  if (!isTailCall) return 0;
+
+  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
+  unsigned CallerMinReservedArea = FI->getMinReservedArea();
+  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
+  // Remember only if the new adjustement is bigger.
+  if (SPDiff < FI->getTailCallSPDelta())
+    FI->setTailCallSPDelta(SPDiff);
+
+  return SPDiff;
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Variable argument functions are not supported.
+  if (isVarArg)
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+    // Functions containing by val parameters are not supported.
+    for (unsigned i = 0; i != Ins.size(); i++) {
+       ISD::ArgFlagsTy Flags = Ins[i].Flags;
+       if (Flags.isByVal()) return false;
+    }
+
+    // Non PIC/GOT  tail calls are supported.
+    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+      return true;
+
+    // At the moment we can only do local tail calls (in same module, hidden
+    // or protected) if we are generating PIC.
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+      return G->getGlobal()->hasHiddenVisibility()
+          || G->getGlobal()->hasProtectedVisibility();
+  }
+
+  return false;
+}
+
+/// isCallCompatibleAddress - Return the immediate to use if the specified
+/// 32-bit value is representable in the immediate field of a BxA instruction.
+static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+  if (!C) return 0;
+
+  int Addr = C->getZExtValue();
+  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
+      SignExtend32<26>(Addr) != Addr)
+    return 0;  // Top 6 bits have to be sext of immediate.
+
+  return DAG.getConstant((int)C->getZExtValue() >> 2,
+                         DAG.getTargetLoweringInfo().getPointerTy()).getNode();
+}
+
+namespace {
+
+struct TailCallArgumentInfo {
+  SDValue Arg;
+  SDValue FrameIdxOp;
+  int       FrameIdx;
+
+  TailCallArgumentInfo() : FrameIdx(0) {}
+};
+
+}
+
+/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
+static void
+StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
+                                           SDValue Chain,
+                   const SmallVector<TailCallArgumentInfo, 8> &TailCallArgs,
+                   SmallVector<SDValue, 8> &MemOpChains,
+                   DebugLoc dl) {
+  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
+    SDValue Arg = TailCallArgs[i].Arg;
+    SDValue FIN = TailCallArgs[i].FrameIdxOp;
+    int FI = TailCallArgs[i].FrameIdx;
+    // Store relative to framepointer.
+    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN,
+                                       MachinePointerInfo::getFixedStack(FI),
+                                       false, false, 0));
+  }
+}
+
+/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
+/// the appropriate stack slot for the tail call optimized function call.
+static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
+                                               MachineFunction &MF,
+                                               SDValue Chain,
+                                               SDValue OldRetAddr,
+                                               SDValue OldFP,
+                                               int SPDiff,
+                                               bool isPPC64,
+                                               bool isDarwinABI,
+                                               DebugLoc dl) {
+  if (SPDiff) {
+    // Calculate the new stack slot for the return address.
+    int SlotSize = isPPC64 ? 8 : 4;
+    int NewRetAddrLoc = SPDiff + PPCFrameLowering::getReturnSaveOffset(isPPC64,
+                                                                   isDarwinABI);
+    int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+                                                          NewRetAddrLoc, true);
+    EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
+    Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
+                         MachinePointerInfo::getFixedStack(NewRetAddr),
+                         false, false, 0);
+
+    // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
+    // slot as the FP is never overwritten.
+    if (isDarwinABI) {
+      int NewFPLoc =
+        SPDiff + PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+      int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc,
+                                                          true);
+      SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
+      Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
+                           MachinePointerInfo::getFixedStack(NewFPIdx),
+                           false, false, 0);
+    }
+  }
+  return Chain;
+}
+
+/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
+/// the position of the argument.
+static void
+CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
+                         SDValue Arg, int SPDiff, unsigned ArgOffset,
+                      SmallVector<TailCallArgumentInfo, 8>& TailCallArguments) {
+  int Offset = ArgOffset + SPDiff;
+  uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8;
+  int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+  EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
+  SDValue FIN = DAG.getFrameIndex(FI, VT);
+  TailCallArgumentInfo Info;
+  Info.Arg = Arg;
+  Info.FrameIdxOp = FIN;
+  Info.FrameIdx = FI;
+  TailCallArguments.push_back(Info);
+}
+
+/// EmitTCFPAndRetAddrLoad - Emit load from frame pointer and return address
+/// stack slot. Returns the chain as result and the loaded frame pointers in
+/// LROpOut/FPOpout. Used when tail calling.
+SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
+                                                        int SPDiff,
+                                                        SDValue Chain,
+                                                        SDValue &LROpOut,
+                                                        SDValue &FPOpOut,
+                                                        bool isDarwinABI,
+                                                        DebugLoc dl) const {
+  if (SPDiff) {
+    // Load the LR and FP stack slot for later adjusting.
+    EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
+    LROpOut = getReturnAddrFrameIndex(DAG);
+    LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(),
+                          false, false, false, 0);
+    Chain = SDValue(LROpOut.getNode(), 1);
+
+    // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
+    // slot as the FP is never overwritten.
+    if (isDarwinABI) {
+      FPOpOut = getFramePointerFrameIndex(DAG);
+      FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo(),
+                            false, false, false, 0);
+      Chain = SDValue(FPOpOut.getNode(), 1);
+    }
+  }
+  return Chain;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" of size "Size".  Alignment information is
+/// specified by the specific parameter attribute. The copy will be passed as
+/// a byval function parameter.
+/// Sometimes what we are copying is the end of a larger object, the part that
+/// does not fit in registers.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                          DebugLoc dl) {
+  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       false, false, MachinePointerInfo(0),
+                       MachinePointerInfo(0));
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
+/// tail calls.
+static void
+LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
+                 SDValue Arg, SDValue PtrOff, int SPDiff,
+                 unsigned ArgOffset, bool isPPC64, bool isTailCall,
+                 bool isVector, SmallVector<SDValue, 8> &MemOpChains,
+                 SmallVector<TailCallArgumentInfo, 8> &TailCallArguments,
+                 DebugLoc dl) {
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  if (!isTailCall) {
+    if (isVector) {
+      SDValue StackPtr;
+      if (isPPC64)
+        StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+      else
+        StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                           DAG.getConstant(ArgOffset, PtrVT));
+    }
+    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                       MachinePointerInfo(), false, false, 0));
+  // Calculate and remember argument location.
+  } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
+                                  TailCallArguments);
+}
+
+static
+void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
+                     DebugLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes,
+                     SDValue LROp, SDValue FPOp, bool isDarwinABI,
+                     SmallVector<TailCallArgumentInfo, 8> &TailCallArguments) {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
+  // might overwrite each other in case of tail call optimization.
+  SmallVector<SDValue, 8> MemOpChains2;
+  // Do not flag preceding copytoreg stuff together with the following stuff.
+  InFlag = SDValue();
+  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
+                                    MemOpChains2, dl);
+  if (!MemOpChains2.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains2[0], MemOpChains2.size());
+
+  // Store the return address to the appropriate stack slot.
+  Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff,
+                                        isPPC64, isDarwinABI, dl);
+
+  // Emit callseq_end just before tailcall node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  InFlag = Chain.getValue(1);
+}
+
+static
+unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
+                     SDValue &Chain, DebugLoc dl, int SPDiff, bool isTailCall,
+                     SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
+                     SmallVector<SDValue, 8> &Ops, std::vector<EVT> &NodeTys,
+                     const PPCSubtarget &PPCSubTarget) {
+
+  bool isPPC64 = PPCSubTarget.isPPC64();
+  bool isSVR4ABI = PPCSubTarget.isSVR4ABI();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Glue);    // Returns a flag for retval copy to use.
+
+  unsigned CallOpc = PPCISD::CALL;
+
+  bool needIndirectCall = true;
+  if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
+    // If this is an absolute destination address, use the munged value.
+    Callee = SDValue(Dest, 0);
+    needIndirectCall = false;
+  }
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201
+    // Use indirect calls for ALL functions calls in JIT mode, since the
+    // far-call stubs may be outside relocation limits for a BL instruction.
+    if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
+      unsigned OpFlags = 0;
+      if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
+          (PPCSubTarget.getTargetTriple().isMacOSX() &&
+           PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
+          (G->getGlobal()->isDeclaration() ||
+           G->getGlobal()->isWeakForLinker())) {
+        // PC-relative references to external symbols should go through $stub,
+        // unless we're building with the leopard linker or later, which
+        // automatically synthesizes these stubs.
+        OpFlags = PPCII::MO_DARWIN_STUB;
+      }
+
+      // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+      // every direct call is) turn it into a TargetGlobalAddress /
+      // TargetExternalSymbol node so that legalize doesn't hack it.
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
+                                          Callee.getValueType(),
+                                          0, OpFlags);
+      needIndirectCall = false;
+    }
+  }
+
+  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    unsigned char OpFlags = 0;
+
+    if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
+        (PPCSubTarget.getTargetTriple().isMacOSX() &&
+         PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = PPCII::MO_DARWIN_STUB;
+    }
+
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
+                                         OpFlags);
+    needIndirectCall = false;
+  }
+
+  if (needIndirectCall) {
+    // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
+    // to do the call, we can't use PPCISD::CALL.
+    SDValue MTCTROps[] = {Chain, Callee, InFlag};
+
+    if (isSVR4ABI && isPPC64) {
+      // Function pointers in the 64-bit SVR4 ABI do not point to the function
+      // entry point, but to the function descriptor (the function entry point
+      // address is part of the function descriptor though).
+      // The function descriptor is a three doubleword structure with the
+      // following fields: function entry point, TOC base address and
+      // environment pointer.
+      // Thus for a call through a function pointer, the following actions need
+      // to be performed:
+      //   1. Save the TOC of the caller in the TOC save area of its stack
+      //      frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
+      //   2. Load the address of the function entry point from the function
+      //      descriptor.
+      //   3. Load the TOC of the callee from the function descriptor into r2.
+      //   4. Load the environment pointer from the function descriptor into
+      //      r11.
+      //   5. Branch to the function entry point address.
+      //   6. On return of the callee, the TOC of the caller needs to be
+      //      restored (this is done in FinishCall()).
+      //
+      // All those operations are flagged together to ensure that no other
+      // operations can be scheduled in between. E.g. without flagging the
+      // operations together, a TOC access in the caller could be scheduled
+      // between the load of the callee TOC and the branch to the callee, which
+      // results in the TOC access going through the TOC of the callee instead
+      // of going through the TOC of the caller, which leads to incorrect code.
+
+      // Load the address of the function entry point from the function
+      // descriptor.
+      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other, MVT::Glue);
+      SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, MTCTROps,
+                                        InFlag.getNode() ? 3 : 2);
+      Chain = LoadFuncPtr.getValue(1);
+      InFlag = LoadFuncPtr.getValue(2);
+
+      // Load environment pointer into r11.
+      // Offset of the environment pointer within the function descriptor.
+      SDValue PtrOff = DAG.getIntPtrConstant(16);
+
+      SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
+      SDValue LoadEnvPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, Chain, AddPtr,
+                                       InFlag);
+      Chain = LoadEnvPtr.getValue(1);
+      InFlag = LoadEnvPtr.getValue(2);
+
+      SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
+                                        InFlag);
+      Chain = EnvVal.getValue(0);
+      InFlag = EnvVal.getValue(1);
+
+      // Load TOC of the callee into r2. We are using a target-specific load
+      // with r2 hard coded, because the result of a target-independent load
+      // would never go directly into r2, since r2 is a reserved register (which
+      // prevents the register allocator from allocating it), resulting in an
+      // additional register being allocated and an unnecessary move instruction
+      // being generated.
+      VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain,
+                                       Callee, InFlag);
+      Chain = LoadTOCPtr.getValue(0);
+      InFlag = LoadTOCPtr.getValue(1);
+
+      MTCTROps[0] = Chain;
+      MTCTROps[1] = LoadFuncPtr;
+      MTCTROps[2] = InFlag;
+    }
+
+    Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, MTCTROps,
+                        2 + (InFlag.getNode() != 0));
+    InFlag = Chain.getValue(1);
+
+    NodeTys.clear();
+    NodeTys.push_back(MVT::Other);
+    NodeTys.push_back(MVT::Glue);
+    Ops.push_back(Chain);
+    CallOpc = PPCISD::BCTRL;
+    Callee.setNode(0);
+    // Add use of X11 (holding environment pointer)
+    if (isSVR4ABI && isPPC64)
+      Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
+    // Add CTR register as callee so a bctr can be emitted later.
+    if (isTailCall)
+      Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
+  }
+
+  // If this is a direct call, pass the chain and the callee.
+  if (Callee.getNode()) {
+    Ops.push_back(Chain);
+    Ops.push_back(Callee);
+  }
+  // If this is a tail call add stack pointer delta.
+  if (isTailCall)
+    Ops.push_back(DAG.getConstant(SPDiff, MVT::i32));
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  return CallOpc;
+}
+
+static
+bool isLocalCall(const SDValue &Callee)
+{
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    return !G->getGlobal()->isDeclaration() &&
+           !G->getGlobal()->isWeakForLinker();
+  return false;
+}
+
+SDValue
+PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   DebugLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals) const {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs, *DAG.getContext());
+  CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Val = DAG.getCopyFromReg(Chain, dl,
+                                     VA.getLocReg(), VA.getLocVT(), InFlag);
+    Chain = Val.getValue(1);
+    InFlag = Val.getValue(2);
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::ZExt:
+      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    case CCValAssign::SExt:
+      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
+                        DAG.getValueType(VA.getValVT()));
+      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+SDValue
+PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
+                              bool isTailCall, bool isVarArg,
+                              SelectionDAG &DAG,
+                              SmallVector<std::pair<unsigned, SDValue>, 8>
+                                &RegsToPass,
+                              SDValue InFlag, SDValue Chain,
+                              SDValue &Callee,
+                              int SPDiff, unsigned NumBytes,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              SmallVectorImpl<SDValue> &InVals) const {
+  std::vector<EVT> NodeTys;
+  SmallVector<SDValue, 8> Ops;
+  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
+                                 isTailCall, RegsToPass, Ops, NodeTys,
+                                 PPCSubTarget);
+
+  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+  if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
+    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
+  // When performing tail call optimization the callee pops its arguments off
+  // the stack. Account for this here so these bytes can be pushed back on in
+  // PPCFrameLowering::eliminateCallFramePseudoInstr.
+  int BytesCalleePops =
+    (CallConv == CallingConv::Fast &&
+     getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  // Emit tail call.
+  if (isTailCall) {
+    assert(((Callee.getOpcode() == ISD::Register &&
+             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
+            Callee.getOpcode() == ISD::TargetExternalSymbol ||
+            Callee.getOpcode() == ISD::TargetGlobalAddress ||
+            isa<ConstantSDNode>(Callee)) &&
+    "Expecting an global address, external symbol, absolute value or register");
+
+    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, &Ops[0], Ops.size());
+  }
+
+  // Add a NOP immediately after the branch instruction when using the 64-bit
+  // SVR4 ABI. At link time, if caller and callee are in a different module and
+  // thus have a different TOC, the call will be replaced with a call to a stub
+  // function which saves the current TOC, loads the TOC of the callee and
+  // branches to the callee. The NOP will be replaced with a load instruction
+  // which restores the TOC of the caller from the TOC save slot of the current
+  // stack frame. If caller and callee belong to the same module (and have the
+  // same TOC), the NOP will remain unchanged.
+
+  bool needsTOCRestore = false;
+  if (!isTailCall && PPCSubTarget.isSVR4ABI()&& PPCSubTarget.isPPC64()) {
+    if (CallOpc == PPCISD::BCTRL) {
+      // This is a call through a function pointer.
+      // Restore the caller TOC from the save area into R2.
+      // See PrepareCall() for more information about calls through function
+      // pointers in the 64-bit SVR4 ABI.
+      // We are using a target-specific load with r2 hard coded, because the
+      // result of a target-independent load would never go directly into r2,
+      // since r2 is a reserved register (which prevents the register allocator
+      // from allocating it), resulting in an additional register being
+      // allocated and an unnecessary move instruction being generated.
+      needsTOCRestore = true;
+    } else if ((CallOpc == PPCISD::CALL) && !isLocalCall(Callee)) {
+      // Otherwise insert NOP for non-local calls.
+      CallOpc = PPCISD::CALL_NOP;
+    }
+  }
+
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  if (needsTOCRestore) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(BytesCalleePops, true),
+                             InFlag);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  if (isTailCall)
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+                                                   Ins, DAG);
+
+  if (PPCSubTarget.isSVR4ABI()) {
+    if (PPCSubTarget.isPPC64())
+      return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, Outs, OutVals, Ins,
+                              dl, DAG, InVals);
+    else
+      return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+                              isTailCall, Outs, OutVals, Ins,
+                              dl, DAG, InVals);
+  }
+
+  return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
+                          isTailCall, Outs, OutVals, Ins,
+                          dl, DAG, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
+  // of the 32-bit SVR4 ABI stack frame layout.
+
+  assert((CallConv == CallingConv::C ||
+          CallConv == CallingConv::Fast) && "Unknown calling convention!");
+
+  unsigned PtrByteSize = 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, parameter list area and the part of the local variable space which
+  // contains copies of aggregates which are passed by value.
+
+  // Assign locations to all of the outgoing arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // Reserve space for the linkage area on the stack.
+  CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false), PtrByteSize);
+
+  if (isVarArg) {
+    // Handle fixed and variable vector arguments differently.
+    // Fixed vector arguments go into registers as long as registers are
+    // available. Variable vector arguments always go into memory.
+    unsigned NumArgs = Outs.size();
+
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ArgVT = Outs[i].VT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      bool Result;
+
+      if (Outs[i].IsFixed) {
+        Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
+                               CCInfo);
+      } else {
+        Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
+                                      ArgFlags, CCInfo);
+      }
+
+      if (Result) {
+#ifndef NDEBUG
+        errs() << "Call operand #" << i << " has unhandled type "
+             << EVT(ArgVT).getEVTString() << "\n";
+#endif
+        llvm_unreachable(0);
+      }
+    }
+  } else {
+    // All arguments are treated the same.
+    CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
+  }
+
+  // Assign locations to all of the outgoing aggregate by value arguments.
+  SmallVector<CCValAssign, 16> ByValArgLocs;
+  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                      getTargetMachine(), ByValArgLocs, *DAG.getContext());
+
+  // Reserve stack space for the allocations in CCInfo.
+  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+  CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
+
+  // Size of the linkage area, parameter list area and the part of the local
+  // space variable where copies of aggregates which are passed by value are
+  // stored.
+  unsigned NumBytes = CCByValInfo.getNextStackOffset();
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be moved somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, false,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  bool seenFloatArg = false;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, j = 0, e = ArgLocs.size();
+       i != e;
+       ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    if (Flags.isByVal()) {
+      // Argument is an aggregate which is passed by value, thus we need to
+      // create a copy of it in the local variable space of the current stack
+      // frame (which is the stack frame of the caller) and pass the address of
+      // this copy to the callee.
+      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
+      CCValAssign &ByValVA = ByValArgLocs[j++];
+      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+
+      // Memory reserved in the local variable space of the callers stack frame.
+      unsigned LocMemOffset = ByValVA.getLocMemOffset();
+
+      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+      // Create a copy of the argument in the local area of the current
+      // stack frame.
+      SDValue MemcpyCall =
+        CreateCopyOfByValArgument(Arg, PtrOff,
+                                  CallSeqStart.getNode()->getOperand(0),
+                                  Flags, DAG, dl);
+
+      // This must go outside the CALLSEQ_START..END.
+      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                           CallSeqStart.getNode()->getOperand(1));
+      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                             NewCallSeqStart.getNode());
+      Chain = CallSeqStart = NewCallSeqStart;
+
+      // Pass the address of the aggregate copy on the stack either in a
+      // physical register or in the parameter list area of the current stack
+      // frame to the callee.
+      Arg = PtrOff;
+    }
+
+    if (VA.isRegLoc()) {
+      seenFloatArg |= VA.getLocVT().isFloatingPoint();
+      // Put argument in a physical register.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      // Put argument in the parameter list area of the current stack frame.
+      assert(VA.isMemLoc());
+      unsigned LocMemOffset = VA.getLocMemOffset();
+
+      if (!isTailCall) {
+        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
+      } else {
+        // Calculate and remember argument location.
+        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
+                                 TailCallArguments);
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Set CR bit 6 to true if this is a vararg call with floating args passed in
+  // registers.
+  if (isVarArg) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, InFlag };
+
+    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+                        dl, VTs, Ops, InFlag.getNode() ? 2 : 1);
+
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
+                    false, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
+
+// Copy an argument into memory, being careful to do this outside the
+// call sequence for the call to which the argument belongs.
+SDValue
+PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+                                              SDValue CallSeqStart,
+                                              ISD::ArgFlagsTy Flags,
+                                              SelectionDAG &DAG,
+                                              DebugLoc dl) const {
+  SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+                        CallSeqStart.getNode()->getOperand(0),
+                        Flags, DAG, dl);
+  // The MEMCPY must go outside the CALLSEQ_START..END.
+  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                             CallSeqStart.getNode()->getOperand(1));
+  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                         NewCallSeqStart.getNode());
+  return NewCallSeqStart;
+}
+
+SDValue
+PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+
+  unsigned NumOps = Outs.size();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  unsigned PtrByteSize = 8;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  unsigned nAltivecParamsAtEnd = 0;
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with at least 48 bytes, which
+  // is reserved space for [SP][CR][LR][3 x unused].
+  // NOTE: For PPC64, nAltivecParamsAtEnd always remains zero as a result
+  // of this call.
+  unsigned NumBytes =
+    CalculateParameterAndLinkageAreaSize(DAG, true, isVarArg, CallConv,
+                                         Outs, OutVals, nAltivecParamsAtEnd);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be move somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  static const uint16_t GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const uint16_t *FPR = GetFPR();
+
+  static const uint16_t VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // Promote integers to 64-bit values.
+    if (Arg.getValueType() == MVT::i32) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
+    if (Flags.isByVal()) {
+      // Note: Size includes alignment padding, so
+      //   struct x { short a; char b; }
+      // will have Size = 4.  With #pragma pack(1), it will have Size = 3.
+      // These are the proper values we need for right-justifying the
+      // aggregate in a parameter register.
+      unsigned Size = Flags.getByValSize();
+
+      // An empty aggregate parameter takes up no storage and no
+      // registers.
+      if (Size == 0)
+        continue;
+
+      // All aggregates smaller than 8 bytes must be passed right-justified.
+      if (Size==1 || Size==2 || Size==4) {
+        EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+                                        MachinePointerInfo(), VT,
+                                        false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+          ArgOffset += PtrByteSize;
+          continue;
+        }
+      }
+
+      if (GPR_idx == NumGPRs && Size < 8) {
+        SDValue Const = DAG.getConstant(PtrByteSize - Size,
+                                        PtrOff.getValueType());
+        SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+      // Copy entire object into memory.  There are cases where gcc-generated
+      // code assumes it is there, even if it could be put entirely into
+      // registers.  (This is not what the doc says.)
+
+      // FIXME: The above statement is likely due to a misunderstanding of the
+      // documents.  All arguments must be copied into the parameter area BY
+      // THE CALLEE in the event that the callee takes the address of any
+      // formal argument.  That has not yet been implemented.  However, it is
+      // reasonable to use the stack area as a staging area for the register
+      // load.
+
+      // Skip this for small aggregates, as we will use the same slot for a
+      // right-justified copy, below.
+      if (Size >= 8)
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+      // When a register is available, pass a small aggregate right-justified.
+      if (Size < 8 && GPR_idx != NumGPRs) {
+        // The easiest way to get this right-justified in a register
+        // is to copy the structure into the rightmost portion of a
+        // local variable slot, then load the whole slot into the
+        // register.
+        // FIXME: The memcpy seems to produce pretty awful code for
+        // small aggregates, particularly for packed ones.
+        // FIXME: It would be preferable to use the slot in the 
+        // parameter save area instead of a new local variable.
+        SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
+        SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                          CallSeqStart,
+                                                          Flags, DAG, dl);
+
+        // Load the slot into the register.
+        SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0);
+        MemOpChains.push_back(Load.getValue(1));
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+        // Done with this argument.
+        ArgOffset += PtrByteSize;
+        continue;
+      }
+
+      // For aggregates larger than PtrByteSize, copy the pieces of the
+      // object that fit into registers from the parameter save area.
+      for (unsigned j=0; j<Size; j+=PtrByteSize) {
+        SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i32:
+    case MVT::i64:
+      if (GPR_idx != NumGPRs) {
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += PtrByteSize;
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (FPR_idx != NumFPRs) {
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+        if (isVarArg) {
+          // A single float or an aggregate containing only a single float
+          // must be passed right-justified in the stack doubleword, and
+          // in the GPR, if one is available.
+          SDValue StoreOff;
+          if (Arg.getValueType().getSimpleVT().SimpleTy == MVT::f32) {
+            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+            StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+          } else
+            StoreOff = PtrOff;
+
+          SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
+                                       MachinePointerInfo(), false, false, 0);
+          MemOpChains.push_back(Store);
+
+          // Float varargs are always shadowed in available integer registers
+          if (GPR_idx != NumGPRs) {
+            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
+                                       MachinePointerInfo(), false, false,
+                                       false, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+        } else if (GPR_idx != NumGPRs)
+          // If we have any FPRs remaining, we may also have GPRs remaining.
+          ++GPR_idx;
+      } else {
+        // Single-precision floating-point values are mapped to the
+        // second (rightmost) word of the stack doubleword.
+        if (Arg.getValueType() == MVT::f32) {
+          SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+          PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+        }
+
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += 8;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      if (isVarArg) {
+        // These go aligned on the stack, or in the corresponding R registers
+        // when within range.  The Darwin PPC ABI doc claims they also go in
+        // V registers; in fact gcc does this only for arguments that are
+        // prototyped, not for those that match the ...  We do it for all
+        // arguments, seems to work.
+        while (ArgOffset % 16 !=0) {
+          ArgOffset += PtrByteSize;
+          if (GPR_idx != NumGPRs)
+            GPR_idx++;
+        }
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                            DAG.getConstant(ArgOffset, PtrVT));
+        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+        }
+        ArgOffset += 16;
+        for (unsigned i=0; i<16; i+=PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                  DAG.getConstant(i, PtrVT));
+          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs Altivec params generally go in registers, but have
+      // stack space allocated at the end.
+      if (VR_idx != NumVRs) {
+        // Doesn't have GPR space allocated.
+        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         true, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        ArgOffset += 16;
+      }
+      break;
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Check if this is an indirect call (MTCTR/BCTRL).
+  // See PrepareCall() for more information about calls through function
+  // pointers in the 64-bit SVR4 ABI.
+  if (!isTailCall &&
+      !dyn_cast<GlobalAddressSDNode>(Callee) &&
+      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+      !isBLACompatibleAddress(Callee, DAG)) {
+    // Load r2 into a virtual register and store it to the TOC save area.
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+    // TOC save area offset.
+    SDValue PtrOff = DAG.getIntPtrConstant(40);
+    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
+                         false, false, 0);
+    // R12 must contain the address of an indirect callee.  This does not
+    // mean the MTCTR instruction must use R12; it's easier to model this
+    // as an extra parameter, so do that.
+    RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
+                    FPOp, true, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+
+  unsigned NumOps = Outs.size();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+      CallConv == CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  unsigned nAltivecParamsAtEnd = 0;
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with 24/48 bytes, which is
+  // prereserved space for [SP][CR][LR][3 x unused].
+  unsigned NumBytes =
+    CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv,
+                                         Outs, OutVals,
+                                         nAltivecParamsAtEnd);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be move somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr;
+  if (isPPC64)
+    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+  else
+    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  static const uint16_t GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const uint16_t GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const uint16_t *FPR = GetFPR();
+
+  static const uint16_t VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = array_lengthof(GPR_32);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+
+  const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // On PPC64, promote integers to 64-bit values.
+    if (isPPC64 && Arg.getValueType() == MVT::i32) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    // Note: "by value" is code for passing a structure by value, not
+    // basic types.
+    if (Flags.isByVal()) {
+      unsigned Size = Flags.getByValSize();
+      // Very small objects are passed right-justified.  Everything else is
+      // passed left-justified.
+      if (Size==1 || Size==2) {
+        EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+                                        MachinePointerInfo(), VT,
+                                        false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+          ArgOffset += PtrByteSize;
+        } else {
+          SDValue Const = DAG.getConstant(PtrByteSize - Size,
+                                          PtrOff.getValueType());
+          SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+          Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+                                                            CallSeqStart,
+                                                            Flags, DAG, dl);
+          ArgOffset += PtrByteSize;
+        }
+        continue;
+      }
+      // Copy entire object into memory.  There are cases where gcc-generated
+      // code assumes it is there, even if it could be put entirely into
+      // registers.  (This is not what the doc says.)
+      Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+                                                        CallSeqStart,
+                                                        Flags, DAG, dl);
+
+      // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
+      // copy the pieces of the object that fit into registers from the
+      // parameter save area.
+      for (unsigned j=0; j<Size; j+=PtrByteSize) {
+        SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i32:
+    case MVT::i64:
+      if (GPR_idx != NumGPRs) {
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += PtrByteSize;
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (FPR_idx != NumFPRs) {
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+        if (isVarArg) {
+          SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                       MachinePointerInfo(), false, false, 0);
+          MemOpChains.push_back(Store);
+
+          // Float varargs are always shadowed in available integer registers
+          if (GPR_idx != NumGPRs) {
+            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
+                                       MachinePointerInfo(), false, false,
+                                       false, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
+            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+            PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
+                                       MachinePointerInfo(),
+                                       false, false, false, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+        } else {
+          // If we have any FPRs remaining, we may also have GPRs remaining.
+          // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
+          // GPRs.
+          if (GPR_idx != NumGPRs)
+            ++GPR_idx;
+          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
+              !isPPC64)  // PPC64 has 64-bit GPR's obviously :)
+            ++GPR_idx;
+        }
+      } else
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      if (isPPC64)
+        ArgOffset += 8;
+      else
+        ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      if (isVarArg) {
+        // These go aligned on the stack, or in the corresponding R registers
+        // when within range.  The Darwin PPC ABI doc claims they also go in
+        // V registers; in fact gcc does this only for arguments that are
+        // prototyped, not for those that match the ...  We do it for all
+        // arguments, seems to work.
+        while (ArgOffset % 16 !=0) {
+          ArgOffset += PtrByteSize;
+          if (GPR_idx != NumGPRs)
+            GPR_idx++;
+        }
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                            DAG.getConstant(ArgOffset, PtrVT));
+        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
+                                     MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+        }
+        ArgOffset += 16;
+        for (unsigned i=0; i<16; i+=PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                  DAG.getConstant(i, PtrVT));
+          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
+                                     false, false, false, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs Altivec params generally go in registers, but have
+      // stack space allocated at the end.
+      if (VR_idx != NumVRs) {
+        // Doesn't have GPR space allocated.
+        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+      } else if (nAltivecParamsAtEnd==0) {
+        // We are emitting Altivec params in order.
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        ArgOffset += 16;
+      }
+      break;
+    }
+  }
+  // If all Altivec parameters fit in registers, as they usually do,
+  // they get stack space following the non-Altivec parameters.  We
+  // don't track this here because nobody below needs it.
+  // If there are more Altivec parameters than fit in registers emit
+  // the stores here.
+  if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
+    unsigned j = 0;
+    // Offset is aligned; skip 1st 12 params which go in V registers.
+    ArgOffset = ((ArgOffset+15)/16)*16;
+    ArgOffset += 12*16;
+    for (unsigned i = 0; i != NumOps; ++i) {
+      SDValue Arg = OutVals[i];
+      EVT ArgType = Outs[i].VT;
+      if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
+          ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
+        if (++j > NumVRs) {
+          SDValue PtrOff;
+          // We are emitting Altivec params in order.
+          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                           isPPC64, isTailCall, true, MemOpChains,
+                           TailCallArguments, dl);
+          ArgOffset += 16;
+        }
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // On Darwin, R12 must contain the address of an indirect callee.  This does
+  // not mean the MTCTR instruction must use R12; it's easier to model this as
+  // an extra parameter, so do that.
+  if (!isTailCall &&
+      !dyn_cast<GlobalAddressSDNode>(Callee) &&
+      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+      !isBLACompatibleAddress(Callee, DAG))
+    RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
+                                                   PPC::R12), Callee));
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall)
+    PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
+                    FPOp, true, TailCallArguments);
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
+
+bool
+PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+                 RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_PPC);
+}
+
+SDValue
+PPCTargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               DebugLoc dl, SelectionDAG &DAG) const {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = OutVals[i];
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other,
+                     &RetOps[0], RetOps.size());
+}
+
+SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+                                   const PPCSubtarget &Subtarget) const {
+  // When we pop the dynamic allocation we need to restore the SP link.
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Get the corect type for pointers.
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Construct the stack pointer operand.
+  bool isPPC64 = Subtarget.isPPC64();
+  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
+  SDValue StackPtr = DAG.getRegister(SP, PtrVT);
+
+  // Get the operands for the STACKRESTORE.
+  SDValue Chain = Op.getOperand(0);
+  SDValue SaveSP = Op.getOperand(1);
+
+  // Load the old link SP.
+  SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr,
+                                   MachinePointerInfo(),
+                                   false, false, false, 0);
+
+  // Restore the stack pointer.
+  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
+
+  // Store the old link SP.
+  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo(),
+                      false, false, 0);
+}
+
+
+
+SDValue
+PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isPPC64 = PPCSubTarget.isPPC64();
+  bool isDarwinABI = PPCSubTarget.isDarwinABI();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Get current frame pointer save index.  The users of this index will be
+  // primarily DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int RASI = FI->getReturnAddrSaveIndex();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!RASI) {
+    // Find out what the fix offset of the frame pointer save area.
+    int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
+    // Allocate the frame index for frame pointer save area.
+    RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset, true);
+    // Save the result.
+    FI->setReturnAddrSaveIndex(RASI);
+  }
+  return DAG.getFrameIndex(RASI, PtrVT);
+}
+
+SDValue
+PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isPPC64 = PPCSubTarget.isPPC64();
+  bool isDarwinABI = PPCSubTarget.isDarwinABI();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Get current frame pointer save index.  The users of this index will be
+  // primarily DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!FPSI) {
+    // Find out what the fix offset of the frame pointer save area.
+    int FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64,
+                                                           isDarwinABI);
+
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset, true);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);
+  }
+  return DAG.getFrameIndex(FPSI, PtrVT);
+}
+
+SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                         SelectionDAG &DAG,
+                                         const PPCSubtarget &Subtarget) const {
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Get the corect type for pointers.
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Negate the size.
+  SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
+                                  DAG.getConstant(0, PtrVT), Size);
+  // Construct a node for the frame pointer save index.
+  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+  // Build a DYNALLOC node.
+  SDValue Ops[3] = { Chain, NegSize, FPSIdx };
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
+  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops, 3);
+}
+
+SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
+                     DAG.getVTList(MVT::i32, MVT::Other),
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
+/// possible.
+SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  // Not FP? Not a fsel.
+  if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
+      !Op.getOperand(2).getValueType().isFloatingPoint())
+    return Op;
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  // Cannot handle SETEQ/SETNE.
+  if (CC == ISD::SETEQ || CC == ISD::SETNE) return Op;
+
+  EVT ResVT = Op.getValueType();
+  EVT CmpVT = Op.getOperand(0).getValueType();
+  SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+  SDValue TV  = Op.getOperand(2), FV  = Op.getOperand(3);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // If the RHS of the comparison is a 0.0, we don't need to do the
+  // subtraction at all.
+  if (isFloatingPointZero(RHS))
+    switch (CC) {
+    default: break;       // SETUO etc aren't handled by fsel.
+    case ISD::SETULT:
+    case ISD::SETLT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
+    case ISD::SETOGE:
+    case ISD::SETGE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
+    case ISD::SETUGT:
+    case ISD::SETGT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
+    case ISD::SETOLE:
+    case ISD::SETLE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
+    }
+
+  SDValue Cmp;
+  switch (CC) {
+  default: break;       // SETUO etc aren't handled by fsel.
+  case ISD::SETULT:
+  case ISD::SETLT:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+  case ISD::SETUGT:
+  case ISD::SETGT:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+  case ISD::SETOLE:
+  case ISD::SETLE:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+  }
+  return Op;
+}
+
+// FIXME: Split this code up when LegalizeDAGTypes lands.
+SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                                           DebugLoc dl) const {
+  assert(Op.getOperand(0).getValueType().isFloatingPoint());
+  SDValue Src = Op.getOperand(0);
+  if (Src.getValueType() == MVT::f32)
+    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
+
+  SDValue Tmp;
+  switch (Op.getValueType().getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
+  case MVT::i32:
+    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
+                        (PPCSubTarget.hasFPCVT() ? PPCISD::FCTIWUZ :
+                                                   PPCISD::FCTIDZ),
+                      dl, MVT::f64, Src);
+    break;
+  case MVT::i64:
+    assert((Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT()) &&
+           "i64 FP_TO_UINT is supported only with FPCVT");
+    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
+                                                        PPCISD::FCTIDUZ,
+                      dl, MVT::f64, Src);
+    break;
+  }
+
+  // Convert the FP value to an int value through memory.
+  bool i32Stack = Op.getValueType() == MVT::i32 && PPCSubTarget.hasSTFIWX() &&
+    (Op.getOpcode() == ISD::FP_TO_SINT || PPCSubTarget.hasFPCVT());
+  SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
+  int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
+  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI);
+
+  // Emit a store to the stack slot.
+  SDValue Chain;
+  if (i32Stack) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
+    SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
+    Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+              DAG.getVTList(MVT::Other), Ops, array_lengthof(Ops),
+              MVT::i32, MMO);
+  } else
+    Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr,
+                         MPI, false, false, 0);
+
+  // Result is a load from the stack slot.  If loading 4 bytes, make sure to
+  // add in a bias.
+  if (Op.getValueType() == MVT::i32 && !i32Stack) {
+    FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
+                        DAG.getConstant(4, FIPtr.getValueType()));
+    MPI = MachinePointerInfo();
+  }
+
+  return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MPI,
+                     false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  // Don't handle ppc_fp128 here; let it be lowered to a libcall.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+
+  assert((Op.getOpcode() == ISD::SINT_TO_FP || PPCSubTarget.hasFPCVT()) &&
+         "UINT_TO_FP is supported only with FPCVT");
+
+  // If we have FCFIDS, then use it when converting to single-precision.
+  // Otherwise, convert to double-precision and then round.
+  unsigned FCFOp = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+                   (Op.getOpcode() == ISD::UINT_TO_FP ?
+                    PPCISD::FCFIDUS : PPCISD::FCFIDS) :
+                   (Op.getOpcode() == ISD::UINT_TO_FP ?
+                    PPCISD::FCFIDU : PPCISD::FCFID);
+  MVT      FCFTy = (PPCSubTarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
+                   MVT::f32 : MVT::f64;
+
+  if (Op.getOperand(0).getValueType() == MVT::i64) {
+    SDValue SINT = Op.getOperand(0);
+    // When converting to single-precision, we actually need to convert
+    // to double-precision first and then round to single-precision.
+    // To avoid double-rounding effects during that operation, we have
+    // to prepare the input operand.  Bits that might be truncated when
+    // converting to double-precision are replaced by a bit that won't
+    // be lost at this stage, but is below the single-precision rounding
+    // position.
+    //
+    // However, if -enable-unsafe-fp-math is in effect, accept double
+    // rounding to avoid the extra overhead.
+    if (Op.getValueType() == MVT::f32 &&
+        !PPCSubTarget.hasFPCVT() &&
+        !DAG.getTarget().Options.UnsafeFPMath) {
+
+      // Twiddle input to make sure the low 11 bits are zero.  (If this
+      // is the case, we are guaranteed the value will fit into the 53 bit
+      // mantissa of an IEEE double-precision value without rounding.)
+      // If any of those low 11 bits were not zero originally, make sure
+      // bit 12 (value 2048) is set instead, so that the final rounding
+      // to single-precision gets the correct result.
+      SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                                  SINT, DAG.getConstant(2047, MVT::i64));
+      Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                          Round, DAG.getConstant(2047, MVT::i64));
+      Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
+      Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+                          Round, DAG.getConstant(-2048, MVT::i64));
+
+      // However, we cannot use that value unconditionally: if the magnitude
+      // of the input value is small, the bit-twiddling we did above might
+      // end up visibly changing the output.  Fortunately, in that case, we
+      // don't need to twiddle bits since the original input will convert
+      // exactly to double-precision floating-point already.  Therefore,
+      // construct a conditional to use the original value if the top 11
+      // bits are all sign-bit copies, and use the rounded value computed
+      // above otherwise.
+      SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
+                                 SINT, DAG.getConstant(53, MVT::i32));
+      Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
+                         Cond, DAG.getConstant(1, MVT::i64));
+      Cond = DAG.getSetCC(dl, MVT::i32,
+                          Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT);
+
+      SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
+    }
+
+    SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
+    SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
+
+    if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT())
+      FP = DAG.getNode(ISD::FP_ROUND, dl,
+                       MVT::f32, FP, DAG.getIntPtrConstant(0));
+    return FP;
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+         "Unhandled INT_TO_FP type in custom expander!");
+  // Since we only generate this in 64-bit mode, we can take advantage of
+  // 64-bit registers.  In particular, sign extend the input value into the
+  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
+  // then lfd it and fcfid it.
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  SDValue Ld;
+  if (PPCSubTarget.hasLFIWAX() || PPCSubTarget.hasFPCVT()) {
+    int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
+    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+    SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
+                                 MachinePointerInfo::getFixedStack(FrameIdx),
+                                 false, false, 0);
+
+    assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
+           "Expected an i32 store");
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                              MachineMemOperand::MOLoad, 4, 4);
+    SDValue Ops[] = { Store, FIdx };
+    Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
+                                   PPCISD::LFIWZX : PPCISD::LFIWAX,
+                                 dl, DAG.getVTList(MVT::f64, MVT::Other),
+                                 Ops, 2, MVT::i32, MMO);
+  } else {
+    assert(PPCSubTarget.isPPC64() &&
+           "i32->FP without LFIWAX supported only on PPC64");
+
+    int FrameIdx = FrameInfo->CreateStackObject(8, 8, false);
+    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+    SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
+                                Op.getOperand(0));
+
+    // STD the extended value into the stack slot.
+    SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Ext64, FIdx,
+                                 MachinePointerInfo::getFixedStack(FrameIdx),
+                                 false, false, 0);
+
+    // Load the value as a double.
+    Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx,
+                     MachinePointerInfo::getFixedStack(FrameIdx),
+                     false, false, false, 0);
+  }
+
+  // FCFID it and return it.
+  SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
+  if (Op.getValueType() == MVT::f32 && !PPCSubTarget.hasFPCVT())
+    FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0));
+  return FP;
+}
+
+SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  /*
+   The rounding mode is in bits 30:31 of FPSR, and has the following
+   settings:
+     00 Round to nearest
+     01 Round to 0
+     10 Round to +inf
+     11 Round to -inf
+
+  FLT_ROUNDS, on the other hand, expects the following:
+    -1 Undefined
+     0 Round to 0
+     1 Round to nearest
+     2 Round to +inf
+     3 Round to -inf
+
+  To perform the conversion, we do:
+    ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
+  */
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT VT = Op.getValueType();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue MFFSreg, InFlag;
+
+  // Save FP Control Word to register
+  EVT NodeTys[] = {
+    MVT::f64,    // return register
+    MVT::Glue    // unused in this context
+  };
+  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0);
+
+  // Save FP register to stack slot
+  int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain,
+                               StackSlot, MachinePointerInfo(), false, false,0);
+
+  // Load FP Control Word from low 32 bits of stack slot.
+  SDValue Four = DAG.getConstant(4, PtrVT);
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
+  SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo(),
+                            false, false, false, 0);
+
+  // Transform as necessary
+  SDValue CWD1 =
+    DAG.getNode(ISD::AND, dl, MVT::i32,
+                CWD, DAG.getConstant(3, MVT::i32));
+  SDValue CWD2 =
+    DAG.getNode(ISD::SRL, dl, MVT::i32,
+                DAG.getNode(ISD::AND, dl, MVT::i32,
+                            DAG.getNode(ISD::XOR, dl, MVT::i32,
+                                        CWD, DAG.getConstant(3, MVT::i32)),
+                            DAG.getConstant(3, MVT::i32)),
+                DAG.getConstant(1, MVT::i32));
+
+  SDValue RetVal =
+    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
+
+  return DAG.getNode((VT.getSizeInBits() < 16 ?
+                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SHL!");
+
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
+  SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+  SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, 2, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRL!");
+
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
+  SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+  SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, 2, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRA!");
+
+  // Expand into a bunch of logical ops, followed by a select_cc.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
+  SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
+  SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, AmtVT),
+                                  Tmp4, Tmp6, ISD::SETLE);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, 2, dl);
+}
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering.
+//
+
+/// BuildSplatI - Build a canonical splati of Val with an element size of
+/// SplatSize.  Cast the result to VT.
+static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
+                             SelectionDAG &DAG, DebugLoc dl) {
+  assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
+
+  static const EVT VTys[] = { // canonical VT to use for each size.
+    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
+  };
+
+  EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
+
+  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
+  if (Val == -1)
+    SplatSize = 1;
+
+  EVT CanonicalVT = VTys[SplatSize-1];
+
+  // Build a canonical splat for this value.
+  SDValue Elt = DAG.getConstant(Val, MVT::i32);
+  SmallVector<SDValue, 8> Ops;
+  Ops.assign(CanonicalVT.getVectorNumElements(), Elt);
+  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT,
+                              &Ops[0], Ops.size());
+  return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res);
+}
+
+/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
+                                SelectionDAG &DAG, DebugLoc dl,
+                                EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = LHS.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, MVT::i32), LHS, RHS);
+}
+
+/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
+                                SDValue Op2, SelectionDAG &DAG,
+                                DebugLoc dl, EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = Op0.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2);
+}
+
+
+/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
+/// amount.  The result has the specified value type.
+static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
+                             EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+  // Force LHS/RHS to be the right type.
+  LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
+  RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
+
+  int Ops[16];
+  for (unsigned i = 0; i != 16; ++i)
+    Ops[i] = i + Amt;
+  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
+  return DAG.getNode(ISD::BITCAST, dl, VT, T);
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.  If we CAN select this case, and if it
+// selects to a single instruction, return Op.  Otherwise, if we can codegen
+// this case more efficiently than a constant pool load, lower it to the
+// sequence of ops that should be used.
+SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+
+  // Check if this is a splat of a constant value.
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                             HasAnyUndefs, 0, true) || SplatBitSize > 32)
+    return SDValue();
+
+  unsigned SplatBits = APSplatBits.getZExtValue();
+  unsigned SplatUndef = APSplatUndef.getZExtValue();
+  unsigned SplatSize = SplatBitSize / 8;
+
+  // First, handle single instruction cases.
+
+  // All zeros?
+  if (SplatBits == 0) {
+    // Canonicalize all zero vectors to be v4i32.
+    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
+      SDValue Z = DAG.getConstant(0, MVT::i32);
+      Z = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Z, Z, Z, Z);
+      Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
+    }
+    return Op;
+  }
+
+  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
+  int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
+                    (32-SplatBitSize));
+  if (SextVal >= -16 && SextVal <= 15)
+    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
+
+
+  // Two instruction sequences.
+
+  // If this value is in the range [-32,30] and is even, use:
+  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
+  // If this value is in the range [17,31] and is odd, use:
+  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
+  // If this value is in the range [-31,-17] and is odd, use:
+  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
+  // Note the last two are three-instruction sequences.
+  if (SextVal >= -32 && SextVal <= 31) {
+    // To avoid having these optimizations undone by constant folding,
+    // we convert to a pseudo that will be expanded later into one of
+    // the above forms.
+    SDValue Elt = DAG.getConstant(SextVal, MVT::i32);
+    EVT VT = Op.getValueType();
+    int Size = VT == MVT::v16i8 ? 1 : (VT == MVT::v8i16 ? 2 : 4);
+    SDValue EltSize = DAG.getConstant(Size, MVT::i32);
+    return DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
+  }
+
+  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
+  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
+  // for fneg/fabs.
+  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
+    // Make -1 and vspltisw -1:
+    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
+
+    // Make the VSLW intrinsic, computing 0x8000_0000.
+    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
+                                   OnesV, DAG, dl);
+
+    // xor by OnesV to invert it.
+    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
+    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+  }
+
+  // Check to see if this is a wide variety of vsplti*, binop self cases.
+  static const signed char SplatCsts[] = {
+    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
+  };
+
+  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
+    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
+    // cases which are ambiguous (e.g. formation of 0x8000_0000).  'vsplti -1'
+    int i = SplatCsts[idx];
+
+    // Figure out what shift amount will be used by altivec if shifted by i in
+    // this splat size.
+    unsigned TypeShiftAmt = i & (SplatBitSize-1);
+
+    // vsplti + shl self.
+    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
+        Intrinsic::ppc_altivec_vslw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + srl self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
+        Intrinsic::ppc_altivec_vsrw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + sra self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
+        Intrinsic::ppc_altivec_vsraw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + rol self.
+    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
+                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
+        Intrinsic::ppc_altivec_vrlw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
+    }
+
+    // t = vsplti c, result = vsldoi t, t, 1
+    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 2
+    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 3
+    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
+    }
+  }
+
+  return SDValue();
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      DebugLoc dl) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+
+  enum {
+    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VMRGHW,
+    OP_VMRGLW,
+    OP_VSPLTISW0,
+    OP_VSPLTISW1,
+    OP_VSPLTISW2,
+    OP_VSPLTISW3,
+    OP_VSLDOI4,
+    OP_VSLDOI8,
+    OP_VSLDOI12
+  };
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+
+  int ShufIdxs[16];
+  switch (OpNum) {
+  default: llvm_unreachable("Unknown i32 permute!");
+  case OP_VMRGHW:
+    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
+    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
+    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
+    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
+    break;
+  case OP_VMRGLW:
+    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
+    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
+    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
+    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
+    break;
+  case OP_VSPLTISW0:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+0;
+    break;
+  case OP_VSPLTISW1:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+4;
+    break;
+  case OP_VSPLTISW2:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+8;
+    break;
+  case OP_VSPLTISW3:
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+12;
+    break;
+  case OP_VSLDOI4:
+    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
+  case OP_VSLDOI8:
+    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
+  case OP_VSLDOI12:
+    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
+  }
+  EVT VT = OpLHS.getValueType();
+  OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
+  OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
+  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
+  return DAG.getNode(ISD::BITCAST, dl, VT, T);
+}
+
+/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
+/// is a shuffle we can handle in a single instruction, return it.  Otherwise,
+/// return the code it can be lowered into.  Worst case, it can always be
+/// lowered into a vperm.
+SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  EVT VT = Op.getValueType();
+
+  // Cases that are handled by instructions that take permute immediates
+  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
+  // selected by the instruction selector.
+  if (V2.getOpcode() == ISD::UNDEF) {
+    if (PPC::isSplatShuffleMask(SVOp, 1) ||
+        PPC::isSplatShuffleMask(SVOp, 2) ||
+        PPC::isSplatShuffleMask(SVOp, 4) ||
+        PPC::isVPKUWUMShuffleMask(SVOp, true) ||
+        PPC::isVPKUHUMShuffleMask(SVOp, true) ||
+        PPC::isVSLDOIShuffleMask(SVOp, true) != -1 ||
+        PPC::isVMRGLShuffleMask(SVOp, 1, true) ||
+        PPC::isVMRGLShuffleMask(SVOp, 2, true) ||
+        PPC::isVMRGLShuffleMask(SVOp, 4, true) ||
+        PPC::isVMRGHShuffleMask(SVOp, 1, true) ||
+        PPC::isVMRGHShuffleMask(SVOp, 2, true) ||
+        PPC::isVMRGHShuffleMask(SVOp, 4, true)) {
+      return Op;
+    }
+  }
+
+  // Altivec has a variety of "shuffle immediates" that take two vector inputs
+  // and produce a fixed permutation.  If any of these match, do not lower to
+  // VPERM.
+  if (PPC::isVPKUWUMShuffleMask(SVOp, false) ||
+      PPC::isVPKUHUMShuffleMask(SVOp, false) ||
+      PPC::isVSLDOIShuffleMask(SVOp, false) != -1 ||
+      PPC::isVMRGLShuffleMask(SVOp, 1, false) ||
+      PPC::isVMRGLShuffleMask(SVOp, 2, false) ||
+      PPC::isVMRGLShuffleMask(SVOp, 4, false) ||
+      PPC::isVMRGHShuffleMask(SVOp, 1, false) ||
+      PPC::isVMRGHShuffleMask(SVOp, 2, false) ||
+      PPC::isVMRGHShuffleMask(SVOp, 4, false))
+    return Op;
+
+  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
+  // perfect shuffle table to emit an optimal matching sequence.
+  ArrayRef<int> PermMask = SVOp->getMask();
+
+  unsigned PFIndexes[4];
+  bool isFourElementShuffle = true;
+  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
+    unsigned EltNo = 8;   // Start out undef.
+    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
+      if (PermMask[i*4+j] < 0)
+        continue;   // Undef, ignore it.
+
+      unsigned ByteSource = PermMask[i*4+j];
+      if ((ByteSource & 3) != j) {
+        isFourElementShuffle = false;
+        break;
+      }
+
+      if (EltNo == 8) {
+        EltNo = ByteSource/4;
+      } else if (EltNo != ByteSource/4) {
+        isFourElementShuffle = false;
+        break;
+      }
+    }
+    PFIndexes[i] = EltNo;
+  }
+
+  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
+  // perfect shuffle vector to determine if it is cost effective to do this as
+  // discrete instructions, or whether we should use a vperm.
+  if (isFourElementShuffle) {
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost  = (PFEntry >> 30);
+
+    // Determining when to avoid vperm is tricky.  Many things affect the cost
+    // of vperm, particularly how many times the perm mask needs to be computed.
+    // For example, if the perm mask can be hoisted out of a loop or is already
+    // used (perhaps because there are multiple permutes with the same shuffle
+    // mask?) the vperm has a cost of 1.  OTOH, hoisting the permute mask out of
+    // the loop requires an extra register.
+    //
+    // As a compromise, we only emit discrete instructions if the shuffle can be
+    // generated in 3 or fewer operations.  When we have loop information
+    // available, if this block is within a loop, we should avoid using vperm
+    // for 3-operation perms and use a constant pool load instead.
+    if (Cost < 3)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
+  // vector that will get spilled to the constant pool.
+  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+
+  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
+  // that it is in input element units, not in bytes.  Convert now.
+  EVT EltVT = V1.getValueType().getVectorElementType();
+  unsigned BytesPerElement = EltVT.getSizeInBits()/8;
+
+  SmallVector<SDValue, 16> ResultMask;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+    unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
+
+    for (unsigned j = 0; j != BytesPerElement; ++j)
+      ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
+                                           MVT::i32));
+  }
+
+  SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
+                                    &ResultMask[0], ResultMask.size());
+  return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask);
+}
+
+/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an
+/// altivec comparison.  If it is, return true and fill in Opc/isDot with
+/// information about the intrinsic.
+static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
+                                  bool &isDot) {
+  unsigned IntrinsicID =
+    cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
+  CompareOpc = -1;
+  isDot = false;
+  switch (IntrinsicID) {
+  default: return false;
+    // Comparison predicates.
+  case Intrinsic::ppc_altivec_vcmpbfp_p:  CompareOpc = 966; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc =   6; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc =  70; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
+
+    // Normal Comparisons.
+  case Intrinsic::ppc_altivec_vcmpbfp:    CompareOpc = 966; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp:   CompareOpc = 198; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequb:   CompareOpc =   6; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequh:   CompareOpc =  70; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpequw:   CompareOpc = 134; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgefp:   CompareOpc = 454; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp:   CompareOpc = 710; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb:   CompareOpc = 774; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh:   CompareOpc = 838; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw:   CompareOpc = 902; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtub:   CompareOpc = 518; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh:   CompareOpc = 582; isDot = 0; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw:   CompareOpc = 646; isDot = 0; break;
+  }
+  return true;
+}
+
+/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
+/// lower, do it, otherwise return null.
+SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  // If this is a lowered altivec predicate compare, CompareOpc is set to the
+  // opcode number of the comparison.
+  DebugLoc dl = Op.getDebugLoc();
+  int CompareOpc;
+  bool isDot;
+  if (!getAltivecCompareInfo(Op, CompareOpc, isDot))
+    return SDValue();    // Don't custom lower most intrinsics.
+
+  // If this is a non-dot comparison, make the VCMP node and we are done.
+  if (!isDot) {
+    SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
+                              Op.getOperand(1), Op.getOperand(2),
+                              DAG.getConstant(CompareOpc, MVT::i32));
+    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
+  }
+
+  // Create the PPCISD altivec 'dot' comparison node.
+  SDValue Ops[] = {
+    Op.getOperand(2),  // LHS
+    Op.getOperand(3),  // RHS
+    DAG.getConstant(CompareOpc, MVT::i32)
+  };
+  EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
+  SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
+
+  // Now that we have the comparison, emit a copy from the CR to a GPR.
+  // This is flagged to the above dot comparison.
+  SDValue Flags = DAG.getNode(PPCISD::MFCR, dl, MVT::i32,
+                                DAG.getRegister(PPC::CR6, MVT::i32),
+                                CompNode.getValue(1));
+
+  // Unpack the result based on how the target uses it.
+  unsigned BitNo;   // Bit # of CR6.
+  bool InvertBit;   // Invert result?
+  switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
+  default:  // Can't happen, don't crash on invalid number though.
+  case 0:   // Return the value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = false;
+    break;
+  case 1:   // Return the inverted value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = true;
+    break;
+  case 2:   // Return the value of the LT bit of CR6.
+    BitNo = 2; InvertBit = false;
+    break;
+  case 3:   // Return the inverted value of the LT bit of CR6.
+    BitNo = 2; InvertBit = true;
+    break;
+  }
+
+  // Shift the bit into the low position.
+  Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
+                      DAG.getConstant(8-(3-BitNo), MVT::i32));
+  // Isolate the bit.
+  Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
+                      DAG.getConstant(1, MVT::i32));
+
+  // If we are supposed to, toggle the bit.
+  if (InvertBit)
+    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
+                        DAG.getConstant(1, MVT::i32));
+  return Flags;
+}
+
+SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  // Create a stack slot that is 16-byte aligned.
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
+  EVT PtrVT = getPointerTy();
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  // Store the input value into Value#0 of the stack slot.
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
+                               Op.getOperand(0), FIdx, MachinePointerInfo(),
+                               false, false, 0);
+  // Load it out.
+  return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  if (Op.getValueType() == MVT::v4i32) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG, dl);
+    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.
+
+    SDValue RHSSwap =   // = vrlw RHS, 16
+      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
+
+    // Shrinkify inputs to v8i16.
+    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
+    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
+
+    // Low parts multiplied together, generating 32-bit results (we ignore the
+    // top parts).
+    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+                                        LHS, RHS, DAG, dl, MVT::v4i32);
+
+    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
+    // Shift the high parts up 16 bits.
+    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
+                              Neg16, DAG, dl);
+    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
+  } else if (Op.getValueType() == MVT::v8i16) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
+
+    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
+                            LHS, RHS, Zero, DAG, dl);
+  } else if (Op.getValueType() == MVT::v16i8) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    // Multiply the even 8-bit parts, producing 16-bit sums.
+    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
+                                           LHS, RHS, DAG, dl, MVT::v8i16);
+    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
+
+    // Multiply the odd 8-bit parts, producing 16-bit sums.
+    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
+                                          LHS, RHS, DAG, dl, MVT::v8i16);
+    OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
+
+    // Merge the results together.
+    int Ops[16];
+    for (unsigned i = 0; i != 8; ++i) {
+      Ops[i*2  ] = 2*i+1;
+      Ops[i*2+1] = 2*i+1+16;
+    }
+    return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
+  } else {
+    llvm_unreachable("Unknown mul to lower!");
+  }
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Wasn't expecting to be able to lower this!");
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG, PPCSubTarget);
+
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG, PPCSubTarget);
+
+  case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG, PPCSubTarget);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget);
+
+  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
+
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG,
+                                                       Op.getDebugLoc());
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
+  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
+
+  // Lower 64-bit shifts.
+  case ISD::SHL_PARTS:          return LowerSHL_PARTS(Op, DAG);
+  case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
+  case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);
+
+  // Vector-related lowering.
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, DAG);
+
+  // Frame & Return address.
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  }
+}
+
+void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) const {
+  const TargetMachine &TM = getTargetMachine();
+  DebugLoc dl = N->getDebugLoc();
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::VAARG: {
+    if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
+        || TM.getSubtarget<PPCSubtarget>().isPPC64())
+      return;
+
+    EVT VT = N->getValueType(0);
+
+    if (VT == MVT::i64) {
+      SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, PPCSubTarget);
+
+      Results.push_back(NewNode);
+      Results.push_back(NewNode.getValue(1));
+    }
+    return;
+  }
+  case ISD::FP_ROUND_INREG: {
+    assert(N->getValueType(0) == MVT::ppcf128);
+    assert(N->getOperand(0).getValueType() == MVT::ppcf128);
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                             MVT::f64, N->getOperand(0),
+                             DAG.getIntPtrConstant(0));
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                             MVT::f64, N->getOperand(0),
+                             DAG.getIntPtrConstant(1));
+
+    // Add the two halves of the long double in round-to-zero mode.
+    SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
+
+    // We know the low half is about to be thrown away, so just use something
+    // convenient.
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
+                                FPreg, FPreg));
+    return;
+  }
+  case ISD::FP_TO_SINT:
+    Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
+    return;
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                    bool is64bit, unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *F = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptrA = MI->getOperand(1).getReg();
+  unsigned ptrB = MI->getOperand(2).getReg();
+  unsigned incr = MI->getOperand(3).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+
+  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loopMBB);
+  F->insert(It, exitMBB);
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  unsigned TmpReg = (!BinOpcode) ? incr :
+    RegInfo.createVirtualRegister(
+       is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+                 (const TargetRegisterClass *) &PPC::GPRCRegClass);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   l[wd]arx dest, ptr
+  //   add r0, dest, incr
+  //   st[wd]cx. r0, ptr
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+    .addReg(ptrA).addReg(ptrB);
+  if (BinOpcode)
+    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+    .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
+  BuildMI(BB, dl, TII->get(PPC::BCC))
+    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+  return BB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
+                                            MachineBasicBlock *BB,
+                                            bool is8bit,    // operation
+                                            unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  // In 64 bit mode we have to use 64 bits for addresses, even though the
+  // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
+  // registers without caring whether they're 32 or 64, but here we're
+  // doing actual arithmetic on the addresses.
+  bool is64bit = PPCSubTarget.isPPC64();
+  unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *F = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptrA = MI->getOperand(1).getReg();
+  unsigned ptrB = MI->getOperand(2).getReg();
+  unsigned incr = MI->getOperand(3).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+
+  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loopMBB);
+  F->insert(It, exitMBB);
+  exitMBB->splice(exitMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  const TargetRegisterClass *RC =
+    is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+              (const TargetRegisterClass *) &PPC::GPRCRegClass;
+  unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+  unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+  unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
+  unsigned Incr2Reg = RegInfo.createVirtualRegister(RC);
+  unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+  unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC);
+  unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+  unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+  unsigned Ptr1Reg;
+  unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  // The 4-byte load must be aligned, while a char or short may be
+  // anywhere in the word.  Hence all this nasty bookkeeping code.
+  //   add ptr1, ptrA, ptrB [copy if ptrA==0]
+  //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+  //   xori shift, shift1, 24 [16]
+  //   rlwinm ptr, ptr1, 0, 0, 29
+  //   slw incr2, incr, shift
+  //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+  //   slw mask, mask2, shift
+  //  loopMBB:
+  //   lwarx tmpDest, ptr
+  //   add tmp, tmpDest, incr2
+  //   andc tmp2, tmpDest, mask
+  //   and tmp3, tmp, mask
+  //   or tmp4, tmp3, tmp2
+  //   stwcx. tmp4, ptr
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  //   srw dest, tmpDest, shift
+  if (ptrA != ZeroReg) {
+    Ptr1Reg = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+      .addReg(ptrA).addReg(ptrB);
+  } else {
+    Ptr1Reg = ptrB;
+  }
+  BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+      .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+      .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+  if (is64bit)
+    BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+      .addReg(Ptr1Reg).addImm(0).addImm(61);
+  else
+    BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+      .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+  BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg)
+      .addReg(incr).addReg(ShiftReg);
+  if (is8bit)
+    BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+  else {
+    BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+    BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535);
+  }
+  BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+      .addReg(Mask2Reg).addReg(ShiftReg);
+
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+    .addReg(ZeroReg).addReg(PtrReg);
+  if (BinOpcode)
+    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
+      .addReg(Incr2Reg).addReg(TmpDestReg);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg)
+    .addReg(TmpDestReg).addReg(MaskReg);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg)
+    .addReg(TmpReg).addReg(MaskReg);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg)
+    .addReg(Tmp3Reg).addReg(Tmp2Reg);
+  BuildMI(BB, dl, TII->get(PPC::STWCX))
+    .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg);
+  BuildMI(BB, dl, TII->get(PPC::BCC))
+    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+  BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg)
+    .addReg(ShiftReg);
+  return BB;
+}
+
+llvm::MachineBasicBlock*
+PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+                                    MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstReg = MI->getOperand(0).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  SjLjSetup mainMBB
+  //  bl mainMBB
+  //  v_restore = 1
+  //  b sinkMBB
+  //
+  // mainMBB:
+  //  buf[LabelOffset] = LR
+  //  v_main = 0
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+  //
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // Note that the structure of the jmp_buf used here is not compatible
+  // with that used by libc, and is not designed to be. Specifically, it
+  // stores only those 'reserved' registers that LLVM does not otherwise
+  // understand how to spill. Also, by convention, by the time this
+  // intrinsic is called, Clang has already stored the frame address in the
+  // first slot of the buffer and stack address in the third. Following the
+  // X86 target code, we'll store the jump address in the second slot. We also
+  // need to save the TOC pointer (R2) to handle jumps between shared
+  // libraries, and that will be stored in the fourth slot. The thread
+  // identifier (R13) is not affected.
+
+  // thisMBB:
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
+
+  // Prepare IP either in reg.
+  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+  unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
+  unsigned BufReg = MI->getOperand(1).getReg();
+
+  if (PPCSubTarget.isPPC64() && PPCSubTarget.isSVR4ABI()) {
+    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
+            .addReg(PPC::X2)
+            .addImm(TOCOffset / 4)
+            .addReg(BufReg);
+
+    MIB.setMemRefs(MMOBegin, MMOEnd);
+  }
+
+  // Setup
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
+  MIB.addRegMask(PPCRegInfo->getNoPreservedMask());
+
+  BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
+
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
+          .addMBB(mainMBB);
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);
+
+  thisMBB->addSuccessor(mainMBB, /* weight */ 0);
+  thisMBB->addSuccessor(sinkMBB, /* weight */ 1);
+
+  // mainMBB:
+  //  mainDstReg = 0
+  MIB = BuildMI(mainMBB, DL,
+    TII->get(PPCSubTarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);
+
+  // Store IP
+  if (PPCSubTarget.isPPC64()) {
+    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
+            .addReg(LabelReg)
+            .addImm(LabelOffset / 4)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
+            .addReg(LabelReg)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  }
+
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(PPC::PHI), DstReg)
+    .addReg(mainDstReg).addMBB(mainMBB)
+    .addReg(restoreDstReg).addMBB(thisMBB);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+                                     MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  const TargetRegisterClass *RC =
+    (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+  unsigned Tmp = MRI.createVirtualRegister(RC);
+  // Since FP is only updated here but NOT referenced, it's treated as GPR.
+  unsigned FP  = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
+  unsigned SP  = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
+
+  MachineInstrBuilder MIB;
+
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t SPOffset    = 2 * PVT.getStoreSize();
+  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
+
+  unsigned BufReg = MI->getOperand(0).getReg();
+
+  // Reload FP (the jumped-to function may not have had a
+  // frame pointer, and if so, then its r31 will be restored
+  // as necessary).
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
+            .addImm(0)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
+            .addImm(0)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload IP
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
+            .addImm(LabelOffset / 4)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
+            .addImm(LabelOffset)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Reload SP
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
+            .addImm(SPOffset / 4)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
+            .addImm(SPOffset)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // FIXME: When we also support base pointers, that register must also be
+  // restored here.
+
+  // Reload TOC
+  if (PVT == MVT::i64 && PPCSubTarget.isSVR4ABI()) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
+            .addImm(TOCOffset / 4)
+            .addReg(BufReg);
+
+    MIB.setMemRefs(MMOBegin, MMOEnd);
+  }
+
+  // Jump
+  BuildMI(*MBB, MI, DL,
+          TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
+  BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));
+
+  MI->eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB) const {
+  if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 ||
+      MI->getOpcode() == PPC::EH_SjLj_SetJmp64) {
+    return emitEHSjLjSetJmp(MI, BB);
+  } else if (MI->getOpcode() == PPC::EH_SjLj_LongJmp32 ||
+             MI->getOpcode() == PPC::EH_SjLj_LongJmp64) {
+    return emitEHSjLjLongJmp(MI, BB);
+  }
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  // To "insert" these instructions we actually have to insert their
+  // control-flow patterns.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  MachineFunction *F = BB->getParent();
+
+  if (PPCSubTarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+                                 MI->getOpcode() == PPC::SELECT_CC_I8)) {
+    unsigned OpCode = MI->getOpcode() == PPC::SELECT_CC_I8 ?
+                                         PPC::ISEL8 : PPC::ISEL;
+    unsigned SelectPred = MI->getOperand(4).getImm();
+    DebugLoc dl = MI->getDebugLoc();
+
+    unsigned SubIdx;
+    bool SwapOps;
+    switch (SelectPred) {
+    default: llvm_unreachable("invalid predicate for isel");
+    case PPC::PRED_EQ: SubIdx = PPC::sub_eq; SwapOps = false; break;
+    case PPC::PRED_NE: SubIdx = PPC::sub_eq; SwapOps = true; break;
+    case PPC::PRED_LT: SubIdx = PPC::sub_lt; SwapOps = false; break;
+    case PPC::PRED_GE: SubIdx = PPC::sub_lt; SwapOps = true; break;
+    case PPC::PRED_GT: SubIdx = PPC::sub_gt; SwapOps = false; break;
+    case PPC::PRED_LE: SubIdx = PPC::sub_gt; SwapOps = true; break;
+    case PPC::PRED_UN: SubIdx = PPC::sub_un; SwapOps = false; break;
+    case PPC::PRED_NU: SubIdx = PPC::sub_un; SwapOps = true; break;
+    }
+
+    BuildMI(*BB, MI, dl, TII->get(OpCode), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(SwapOps? 3 : 2).getReg())
+      .addReg(MI->getOperand(SwapOps? 2 : 3).getReg())
+      .addReg(MI->getOperand(1).getReg(), 0, SubIdx);
+  } else if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+             MI->getOpcode() == PPC::SELECT_CC_I8 ||
+             MI->getOpcode() == PPC::SELECT_CC_F4 ||
+             MI->getOpcode() == PPC::SELECT_CC_F8 ||
+             MI->getOpcode() == PPC::SELECT_CC_VRRC) {
+
+
+    // The incoming instruction knows the destination vreg to set, the
+    // condition code register to branch on, the true/false values to
+    // select between, and a branch opcode to use.
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB = BB;
+    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    unsigned SelectPred = MI->getOperand(4).getImm();
+    DebugLoc dl = MI->getDebugLoc();
+    F->insert(It, copy0MBB);
+    F->insert(It, sinkMBB);
+
+    // Transfer the remainder of BB and its successor edges to sinkMBB.
+    sinkMBB->splice(sinkMBB->begin(), BB,
+                    llvm::next(MachineBasicBlock::iterator(MI)),
+                    BB->end());
+    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    // Next, add the true and fallthrough blocks as its successors.
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(*BB, BB->begin(), dl,
+            TII->get(PPC::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
+      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+  }
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::ADD4);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::ADD8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::AND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::AND8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::OR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::OR8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ANDC);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ANDC);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::ANDC);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::ANDC8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::SUBF8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I32)
+    BB = EmitAtomicBinary(MI, BB, false, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I64)
+    BB = EmitAtomicBinary(MI, BB, true, 0);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
+           MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64) {
+    bool is64bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
+
+    unsigned dest   = MI->getOperand(0).getReg();
+    unsigned ptrA   = MI->getOperand(1).getReg();
+    unsigned ptrB   = MI->getOperand(2).getReg();
+    unsigned oldval = MI->getOperand(3).getReg();
+    unsigned newval = MI->getOperand(4).getReg();
+    DebugLoc dl     = MI->getDebugLoc();
+
+    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, loop1MBB);
+    F->insert(It, loop2MBB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->splice(exitMBB->begin(), BB,
+                    llvm::next(MachineBasicBlock::iterator(MI)),
+                    BB->end());
+    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    //  thisMBB:
+    //   ...
+    //   fallthrough --> loopMBB
+    BB->addSuccessor(loop1MBB);
+
+    // loop1MBB:
+    //   l[wd]arx dest, ptr
+    //   cmp[wd] dest, oldval
+    //   bne- midMBB
+    // loop2MBB:
+    //   st[wd]cx. newval, ptr
+    //   bne- loopMBB
+    //   b exitBB
+    // midMBB:
+    //   st[wd]cx. dest, ptr
+    // exitBB:
+    BB = loop1MBB;
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+      .addReg(ptrA).addReg(ptrB);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
+      .addReg(oldval).addReg(dest);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(midMBB);
+
+    BB = loop2MBB;
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+      .addReg(newval).addReg(ptrA).addReg(ptrB);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+    BB->addSuccessor(loop1MBB);
+    BB->addSuccessor(exitMBB);
+
+    BB = midMBB;
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+      .addReg(dest).addReg(ptrA).addReg(ptrB);
+    BB->addSuccessor(exitMBB);
+
+    //  exitMBB:
+    //   ...
+    BB = exitMBB;
+  } else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
+             MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
+    // We must use 64-bit registers for addresses when targeting 64-bit,
+    // since we're actually doing arithmetic on them.  Other registers
+    // can be 32-bit.
+    bool is64bit = PPCSubTarget.isPPC64();
+    bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
+
+    unsigned dest   = MI->getOperand(0).getReg();
+    unsigned ptrA   = MI->getOperand(1).getReg();
+    unsigned ptrB   = MI->getOperand(2).getReg();
+    unsigned oldval = MI->getOperand(3).getReg();
+    unsigned newval = MI->getOperand(4).getReg();
+    DebugLoc dl     = MI->getDebugLoc();
+
+    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, loop1MBB);
+    F->insert(It, loop2MBB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->splice(exitMBB->begin(), BB,
+                    llvm::next(MachineBasicBlock::iterator(MI)),
+                    BB->end());
+    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    const TargetRegisterClass *RC =
+      is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+                (const TargetRegisterClass *) &PPC::GPRCRegClass;
+    unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+    unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+    unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
+    unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+    unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+    unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+    unsigned Ptr1Reg;
+    unsigned TmpReg = RegInfo.createVirtualRegister(RC);
+    unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
+    //  thisMBB:
+    //   ...
+    //   fallthrough --> loopMBB
+    BB->addSuccessor(loop1MBB);
+
+    // The 4-byte load must be aligned, while a char or short may be
+    // anywhere in the word.  Hence all this nasty bookkeeping code.
+    //   add ptr1, ptrA, ptrB [copy if ptrA==0]
+    //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+    //   xori shift, shift1, 24 [16]
+    //   rlwinm ptr, ptr1, 0, 0, 29
+    //   slw newval2, newval, shift
+    //   slw oldval2, oldval,shift
+    //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+    //   slw mask, mask2, shift
+    //   and newval3, newval2, mask
+    //   and oldval3, oldval2, mask
+    // loop1MBB:
+    //   lwarx tmpDest, ptr
+    //   and tmp, tmpDest, mask
+    //   cmpw tmp, oldval3
+    //   bne- midMBB
+    // loop2MBB:
+    //   andc tmp2, tmpDest, mask
+    //   or tmp4, tmp2, newval3
+    //   stwcx. tmp4, ptr
+    //   bne- loop1MBB
+    //   b exitBB
+    // midMBB:
+    //   stwcx. tmpDest, ptr
+    // exitBB:
+    //   srw dest, tmpDest, shift
+    if (ptrA != ZeroReg) {
+      Ptr1Reg = RegInfo.createVirtualRegister(RC);
+      BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+        .addReg(ptrA).addReg(ptrB);
+    } else {
+      Ptr1Reg = ptrB;
+    }
+    BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+        .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+        .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+    if (is64bit)
+      BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+        .addReg(Ptr1Reg).addImm(0).addImm(61);
+    else
+      BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+        .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+    BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
+        .addReg(newval).addReg(ShiftReg);
+    BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
+        .addReg(oldval).addReg(ShiftReg);
+    if (is8bit)
+      BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+    else {
+      BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+      BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
+        .addReg(Mask3Reg).addImm(65535);
+    }
+    BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+        .addReg(Mask2Reg).addReg(ShiftReg);
+    BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
+        .addReg(NewVal2Reg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
+        .addReg(OldVal2Reg).addReg(MaskReg);
+
+    BB = loop1MBB;
+    BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+        .addReg(ZeroReg).addReg(PtrReg);
+    BuildMI(BB, dl, TII->get(PPC::AND),TmpReg)
+        .addReg(TmpDestReg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
+        .addReg(TmpReg).addReg(OldVal3Reg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+        .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(midMBB);
+
+    BB = loop2MBB;
+    BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg)
+        .addReg(TmpDestReg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg)
+        .addReg(Tmp2Reg).addReg(NewVal3Reg);
+    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg)
+        .addReg(ZeroReg).addReg(PtrReg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+    BB->addSuccessor(loop1MBB);
+    BB->addSuccessor(exitMBB);
+
+    BB = midMBB;
+    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg)
+      .addReg(ZeroReg).addReg(PtrReg);
+    BB->addSuccessor(exitMBB);
+
+    //  exitMBB:
+    //   ...
+    BB = exitMBB;
+    BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW),dest).addReg(TmpReg)
+      .addReg(ShiftReg);
+  } else if (MI->getOpcode() == PPC::FADDrtz) {
+    // This pseudo performs an FADD with rounding mode temporarily forced
+    // to round-to-zero.  We emit this via custom inserter since the FPSCR
+    // is not modeled at the SelectionDAG level.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src1 = MI->getOperand(1).getReg();
+    unsigned Src2 = MI->getOperand(2).getReg();
+    DebugLoc dl   = MI->getDebugLoc();
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
+
+    // Save FPSCR value.
+    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);
+
+    // Set rounding mode to round-to-zero.
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
+
+    // Perform addition.
+    BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);
+
+    // Restore FPSCR value.
+    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg);
+  } else if (MI->getOpcode() == PPC::FRINDrint ||
+             MI->getOpcode() == PPC::FRINSrint) {
+    bool isf32 = MI->getOpcode() == PPC::FRINSrint;
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    DebugLoc dl   = MI->getDebugLoc();
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
+
+    // Perform the rounding.
+    BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FRINS : PPC::FRIND), Dest)
+      .addReg(Src);
+
+    // Compare the results.
+    BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FCMPUS : PPC::FCMPUD), CRReg)
+      .addReg(Dest).addReg(Src);
+
+    // If the results were not equal, then set the FPSCR XX bit.
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->splice(exitMBB->begin(), BB,
+                    llvm::next(MachineBasicBlock::iterator(MI)),
+                    BB->end());
+    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+    BuildMI(*BB, MI, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_EQ).addReg(CRReg).addMBB(exitMBB);
+
+    BB->addSuccessor(midMBB);
+    BB->addSuccessor(exitMBB);
+
+    BB = midMBB;
+
+    // Set the FPSCR XX bit (FE_INEXACT). Note that we cannot just set
+    // the FI bit here because that will not automatically set XX also,
+    // and XX is what libm interprets as the FE_INEXACT flag.
+    BuildMI(BB, dl, TII->get(PPC::MTFSB1)).addImm(/* 38 - 32 = */ 6);
+    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+
+    BB->addSuccessor(exitMBB);
+
+    BB = exitMBB;
+  } else {
+    llvm_unreachable("Unexpected instr type to insert");
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
+                                               DAGCombinerInfo &DCI) const {
+  if (DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  if ((VT == MVT::f32 && PPCSubTarget.hasFRES()) ||
+      (VT == MVT::f64 && PPCSubTarget.hasFRE())  ||
+      (VT == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
+
+    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+    // For the reciprocal, we need to find the zero of the function:
+    //   F(X) = A X - 1 [which has a zero at X = 1/A]
+    //     =>
+    //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
+    //     does not require additional intermediate precision]
+
+    // Convergence is quadratic, so we essentially double the number of digits
+    // correct after every iteration. The minimum architected relative
+    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
+    // 23 digits and double has 52 digits.
+    int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
+    if (VT.getScalarType() == MVT::f64)
+      ++Iterations;
+
+    SelectionDAG &DAG = DCI.DAG;
+    DebugLoc dl = Op.getDebugLoc();
+
+    SDValue FPOne =
+      DAG.getConstantFP(1.0, VT.getScalarType());
+    if (VT.isVector()) {
+      assert(VT.getVectorNumElements() == 4 &&
+             "Unknown vector type");
+      FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+                          FPOne, FPOne, FPOne, FPOne);
+    }
+
+    SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op);
+    DCI.AddToWorklist(Est.getNode());
+
+    // Newton iterations: Est = Est + Est (1 - Arg * Est)
+    for (int i = 0; i < Iterations; ++i) {
+      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst);
+      DCI.AddToWorklist(Est.getNode());
+    }
+
+    return Est;
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
+                                             DAGCombinerInfo &DCI) const {
+  if (DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  if ((VT == MVT::f32 && PPCSubTarget.hasFRSQRTES()) ||
+      (VT == MVT::f64 && PPCSubTarget.hasFRSQRTE())  ||
+      (VT == MVT::v4f32 && PPCSubTarget.hasAltivec())) {
+
+    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
+    // For the reciprocal sqrt, we need to find the zero of the function:
+    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+    //     =>
+    //   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
+    // As a result, we precompute A/2 prior to the iteration loop.
+
+    // Convergence is quadratic, so we essentially double the number of digits
+    // correct after every iteration. The minimum architected relative
+    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
+    // 23 digits and double has 52 digits.
+    int Iterations = PPCSubTarget.hasRecipPrec() ? 1 : 3;
+    if (VT.getScalarType() == MVT::f64)
+      ++Iterations;
+
+    SelectionDAG &DAG = DCI.DAG;
+    DebugLoc dl = Op.getDebugLoc();
+
+    SDValue FPThreeHalves =
+      DAG.getConstantFP(1.5, VT.getScalarType());
+    if (VT.isVector()) {
+      assert(VT.getVectorNumElements() == 4 &&
+             "Unknown vector type");
+      FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+                                  FPThreeHalves, FPThreeHalves,
+                                  FPThreeHalves, FPThreeHalves);
+    }
+
+    SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op);
+    DCI.AddToWorklist(Est.getNode());
+
+    // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
+    // this entire sequence requires only one FP constant.
+    SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op);
+    DCI.AddToWorklist(HalfArg.getNode());
+
+    HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op);
+    DCI.AddToWorklist(HalfArg.getNode());
+
+    // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
+    for (int i = 0; i < Iterations; ++i) {
+      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst);
+      DCI.AddToWorklist(NewEst.getNode());
+
+      Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
+      DCI.AddToWorklist(Est.getNode());
+    }
+
+    return Est;
+  }
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  const TargetMachine &TM = getTargetMachine();
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc dl = N->getDebugLoc();
+  switch (N->getOpcode()) {
+  default: break;
+  case PPCISD::SHL:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->isNullValue())   // 0 << V -> 0.
+        return N->getOperand(0);
+    }
+    break;
+  case PPCISD::SRL:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->isNullValue())   // 0 >>u V -> 0.
+        return N->getOperand(0);
+    }
+    break;
+  case PPCISD::SRA:
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (C->isNullValue() ||   //  0 >>s V -> 0.
+          C->isAllOnesValue())    // -1 >>s V -> -1.
+        return N->getOperand(0);
+    }
+    break;
+  case ISD::FDIV: {
+    assert(TM.Options.UnsafeFPMath &&
+           "Reciprocal estimates require UnsafeFPMath");
+
+    if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
+      SDValue RV =
+        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI);
+      if (RV.getNode() != 0) {
+        DCI.AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                           N->getOperand(0), RV);
+      }
+    } else if (N->getOperand(1).getOpcode() == ISD::FP_EXTEND &&
+               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
+      SDValue RV =
+        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
+                                 DCI);
+      if (RV.getNode() != 0) {
+        DCI.AddToWorklist(RV.getNode());
+        RV = DAG.getNode(ISD::FP_EXTEND, N->getOperand(1).getDebugLoc(),
+                         N->getValueType(0), RV);
+        DCI.AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                           N->getOperand(0), RV);
+      }
+    } else if (N->getOperand(1).getOpcode() == ISD::FP_ROUND &&
+               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
+      SDValue RV =
+        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
+                                 DCI);
+      if (RV.getNode() != 0) {
+        DCI.AddToWorklist(RV.getNode());
+        RV = DAG.getNode(ISD::FP_ROUND, N->getOperand(1).getDebugLoc(),
+                         N->getValueType(0), RV,
+                         N->getOperand(1).getOperand(1));
+        DCI.AddToWorklist(RV.getNode());
+        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                           N->getOperand(0), RV);
+      }
+    }
+
+    SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI);
+    if (RV.getNode() != 0) {
+      DCI.AddToWorklist(RV.getNode());
+      return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
+                         N->getOperand(0), RV);
+    }
+
+    }
+    break;
+  case ISD::FSQRT: {
+    assert(TM.Options.UnsafeFPMath &&
+           "Reciprocal estimates require UnsafeFPMath");
+
+    // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
+    // reciprocal sqrt.
+    SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI);
+    if (RV.getNode() != 0) {
+      DCI.AddToWorklist(RV.getNode());
+      RV = DAGCombineFastRecip(RV, DCI);
+      if (RV.getNode() != 0)
+        return RV;
+    }
+
+    }
+    break;
+  case ISD::SINT_TO_FP:
+    if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
+      if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
+        // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
+        // We allow the src/dst to be either f32/f64, but the intermediate
+        // type must be i64.
+        if (N->getOperand(0).getValueType() == MVT::i64 &&
+            N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) {
+          SDValue Val = N->getOperand(0).getOperand(0);
+          if (Val.getValueType() == MVT::f32) {
+            Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+            DCI.AddToWorklist(Val.getNode());
+          }
+
+          Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val);
+          DCI.AddToWorklist(Val.getNode());
+          Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val);
+          DCI.AddToWorklist(Val.getNode());
+          if (N->getValueType(0) == MVT::f32) {
+            Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val,
+                              DAG.getIntPtrConstant(0));
+            DCI.AddToWorklist(Val.getNode());
+          }
+          return Val;
+        } else if (N->getOperand(0).getValueType() == MVT::i32) {
+          // If the intermediate type is i32, we can avoid the load/store here
+          // too.
+        }
+      }
+    }
+    break;
+  case ISD::STORE:
+    // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
+    if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
+        !cast<StoreSDNode>(N)->isTruncatingStore() &&
+        N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
+        N->getOperand(1).getValueType() == MVT::i32 &&
+        N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
+      SDValue Val = N->getOperand(1).getOperand(0);
+      if (Val.getValueType() == MVT::f32) {
+        Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
+        DCI.AddToWorklist(Val.getNode());
+      }
+      Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
+      DCI.AddToWorklist(Val.getNode());
+
+      SDValue Ops[] = {
+        N->getOperand(0), Val, N->getOperand(2),
+        DAG.getValueType(N->getOperand(1).getValueType())
+      };
+
+      Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
+              DAG.getVTList(MVT::Other), Ops, array_lengthof(Ops),
+              cast<StoreSDNode>(N)->getMemoryVT(),
+              cast<StoreSDNode>(N)->getMemOperand());
+      DCI.AddToWorklist(Val.getNode());
+      return Val;
+    }
+
+    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
+    if (cast<StoreSDNode>(N)->isUnindexed() &&
+        N->getOperand(1).getOpcode() == ISD::BSWAP &&
+        N->getOperand(1).getNode()->hasOneUse() &&
+        (N->getOperand(1).getValueType() == MVT::i32 ||
+         N->getOperand(1).getValueType() == MVT::i16 ||
+         (TM.getSubtarget<PPCSubtarget>().hasLDBRX() &&
+          TM.getSubtarget<PPCSubtarget>().isPPC64() &&
+          N->getOperand(1).getValueType() == MVT::i64))) {
+      SDValue BSwapOp = N->getOperand(1).getOperand(0);
+      // Do an any-extend to 32-bits if this is a half-word input.
+      if (BSwapOp.getValueType() == MVT::i16)
+        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
+
+      SDValue Ops[] = {
+        N->getOperand(0), BSwapOp, N->getOperand(2),
+        DAG.getValueType(N->getOperand(1).getValueType())
+      };
+      return
+        DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
+                                Ops, array_lengthof(Ops),
+                                cast<StoreSDNode>(N)->getMemoryVT(),
+                                cast<StoreSDNode>(N)->getMemOperand());
+    }
+    break;
+  case ISD::BSWAP:
+    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
+    if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
+        N->getOperand(0).hasOneUse() &&
+        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
+         (TM.getSubtarget<PPCSubtarget>().hasLDBRX() &&
+          TM.getSubtarget<PPCSubtarget>().isPPC64() &&
+          N->getValueType(0) == MVT::i64))) {
+      SDValue Load = N->getOperand(0);
+      LoadSDNode *LD = cast<LoadSDNode>(Load);
+      // Create the byte-swapping load.
+      SDValue Ops[] = {
+        LD->getChain(),    // Chain
+        LD->getBasePtr(),  // Ptr
+        DAG.getValueType(N->getValueType(0)) // VT
+      };
+      SDValue BSLoad =
+        DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
+                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
+                                              MVT::i64 : MVT::i32, MVT::Other),
+                                Ops, 3, LD->getMemoryVT(), LD->getMemOperand());
+
+      // If this is an i16 load, insert the truncate.
+      SDValue ResVal = BSLoad;
+      if (N->getValueType(0) == MVT::i16)
+        ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);
+
+      // First, combine the bswap away.  This makes the value produced by the
+      // load dead.
+      DCI.CombineTo(N, ResVal);
+
+      // Next, combine the load away, we give it a bogus result value but a real
+      // chain result.  The result value is dead because the bswap is dead.
+      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
+
+      // Return N so it doesn't get rechecked!
+      return SDValue(N, 0);
+    }
+
+    break;
+  case PPCISD::VCMP: {
+    // If a VCMPo node already exists with exactly the same operands as this
+    // node, use its result instead of this node (VCMPo computes both a CR6 and
+    // a normal output).
+    //
+    if (!N->getOperand(0).hasOneUse() &&
+        !N->getOperand(1).hasOneUse() &&
+        !N->getOperand(2).hasOneUse()) {
+
+      // Scan all of the users of the LHS, looking for VCMPo's that match.
+      SDNode *VCMPoNode = 0;
+
+      SDNode *LHSN = N->getOperand(0).getNode();
+      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
+           UI != E; ++UI)
+        if (UI->getOpcode() == PPCISD::VCMPo &&
+            UI->getOperand(1) == N->getOperand(1) &&
+            UI->getOperand(2) == N->getOperand(2) &&
+            UI->getOperand(0) == N->getOperand(0)) {
+          VCMPoNode = *UI;
+          break;
+        }
+
+      // If there is no VCMPo node, or if the flag value has a single use, don't
+      // transform this.
+      if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
+        break;
+
+      // Look at the (necessarily single) use of the flag value.  If it has a
+      // chain, this transformation is more complex.  Note that multiple things
+      // could use the value result, which we should ignore.
+      SDNode *FlagUser = 0;
+      for (SDNode::use_iterator UI = VCMPoNode->use_begin();
+           FlagUser == 0; ++UI) {
+        assert(UI != VCMPoNode->use_end() && "Didn't find user!");
+        SDNode *User = *UI;
+        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+          if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
+            FlagUser = User;
+            break;
+          }
+        }
+      }
+
+      // If the user is a MFCR instruction, we know this is safe.  Otherwise we
+      // give up for right now.
+      if (FlagUser->getOpcode() == PPCISD::MFCR)
+        return SDValue(VCMPoNode, 0);
+    }
+    break;
+  }
+  case ISD::BR_CC: {
+    // If this is a branch on an altivec predicate comparison, lower this so
+    // that we don't have to do a MFCR: instead, branch directly on CR6.  This
+    // lowering is done pre-legalize, because the legalizer lowers the predicate
+    // compare down to code that is difficult to reassemble.
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+    SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
+    int CompareOpc;
+    bool isDot;
+
+    if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+        isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+        getAltivecCompareInfo(LHS, CompareOpc, isDot)) {
+      assert(isDot && "Can't compare against a vector result!");
+
+      // If this is a comparison against something other than 0/1, then we know
+      // that the condition is never/always true.
+      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+      if (Val != 0 && Val != 1) {
+        if (CC == ISD::SETEQ)      // Cond never true, remove branch.
+          return N->getOperand(0);
+        // Always !=, turn it into an unconditional branch.
+        return DAG.getNode(ISD::BR, dl, MVT::Other,
+                           N->getOperand(0), N->getOperand(4));
+      }
+
+      bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
+
+      // Create the PPCISD altivec 'dot' comparison node.
+      SDValue Ops[] = {
+        LHS.getOperand(2),  // LHS of compare
+        LHS.getOperand(3),  // RHS of compare
+        DAG.getConstant(CompareOpc, MVT::i32)
+      };
+      EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
+      SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
+
+      // Unpack the result based on how the target uses it.
+      PPC::Predicate CompOpc;
+      switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
+      default:  // Can't happen, don't crash on invalid number though.
+      case 0:   // Branch on the value of the EQ bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
+        break;
+      case 1:   // Branch on the inverted value of the EQ bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
+        break;
+      case 2:   // Branch on the value of the LT bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
+        break;
+      case 3:   // Branch on the inverted value of the LT bit of CR6.
+        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
+        break;
+      }
+
+      return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
+                         DAG.getConstant(CompOpc, MVT::i32),
+                         DAG.getRegister(PPC::CR6, MVT::i32),
+                         N->getOperand(4), CompNode.getValue(1));
+    }
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                       APInt &KnownZero,
+                                                       APInt &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case PPCISD::LBRX: {
+    // lhbrx is known to have the top bits cleared out.
+    if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
+      KnownZero = 0xFFFF0000;
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+    default: break;
+    case Intrinsic::ppc_altivec_vcmpbfp_p:
+    case Intrinsic::ppc_altivec_vcmpeqfp_p:
+    case Intrinsic::ppc_altivec_vcmpequb_p:
+    case Intrinsic::ppc_altivec_vcmpequh_p:
+    case Intrinsic::ppc_altivec_vcmpequw_p:
+    case Intrinsic::ppc_altivec_vcmpgefp_p:
+    case Intrinsic::ppc_altivec_vcmpgtfp_p:
+    case Intrinsic::ppc_altivec_vcmpgtsb_p:
+    case Intrinsic::ppc_altivec_vcmpgtsh_p:
+    case Intrinsic::ppc_altivec_vcmpgtsw_p:
+    case Intrinsic::ppc_altivec_vcmpgtub_p:
+    case Intrinsic::ppc_altivec_vcmpgtuh_p:
+    case Intrinsic::ppc_altivec_vcmpgtuw_p:
+      KnownZero = ~1U;  // All bits but the low one are known to be zero.
+      break;
+    }
+  }
+  }
+}
+
+
+/// getConstraintType - Given a constraint, return the type of
+/// constraint it is for this target.
+PPCTargetLowering::ConstraintType
+PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'b':
+    case 'r':
+    case 'f':
+    case 'v':
+    case 'y':
+      return C_RegisterClass;
+    case 'Z':
+      // FIXME: While Z does indicate a memory constraint, it specifically
+      // indicates an r+r address (used in conjunction with the 'y' modifier
+      // in the replacement string). Currently, we're forcing the base
+      // register to be r0 in the asm printer (which is interpreted as zero)
+      // and forming the complete address in the second register. This is
+      // suboptimal.
+      return C_Memory;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+PPCTargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'b':
+    if (type->isIntegerTy())
+      weight = CW_Register;
+    break;
+  case 'f':
+    if (type->isFloatTy())
+      weight = CW_Register;
+    break;
+  case 'd':
+    if (type->isDoubleTy())
+      weight = CW_Register;
+    break;
+  case 'v':
+    if (type->isVectorTy())
+      weight = CW_Register;
+    break;
+  case 'y':
+    weight = CW_Register;
+    break;
+  case 'Z':
+    weight = CW_Memory;
+    break;
+  }
+  return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                EVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC RS6000 Constraint Letters
+    switch (Constraint[0]) {
+    case 'b':   // R1-R31
+      if (VT == MVT::i64 && PPCSubTarget.isPPC64())
+        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
+      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
+    case 'r':   // R0-R31
+      if (VT == MVT::i64 && PPCSubTarget.isPPC64())
+        return std::make_pair(0U, &PPC::G8RCRegClass);
+      return std::make_pair(0U, &PPC::GPRCRegClass);
+    case 'f':
+      if (VT == MVT::f32 || VT == MVT::i32)
+        return std::make_pair(0U, &PPC::F4RCRegClass);
+      if (VT == MVT::f64 || VT == MVT::i64)
+        return std::make_pair(0U, &PPC::F8RCRegClass);
+      break;
+    case 'v':
+      return std::make_pair(0U, &PPC::VRRCRegClass);
+    case 'y':   // crrc
+      return std::make_pair(0U, &PPC::CRRCRegClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result(0,0);
+
+  // Only support length 1 constraints.
+  if (Constraint.length() > 1) return;
+
+  char Letter = Constraint[0];
+  switch (Letter) {
+  default: break;
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+  case 'O':
+  case 'P': {
+    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
+    if (!CST) return; // Must be an immediate to match.
+    unsigned Value = CST->getZExtValue();
+    switch (Letter) {
+    default: llvm_unreachable("Unknown constraint letter!");
+    case 'I':  // "I" is a signed 16-bit constant.
+      if ((short)Value == (int)Value)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'J':  // "J" is a constant with only the high-order 16 bits nonzero.
+    case 'L':  // "L" is a signed 16-bit constant shifted left 16 bits.
+      if ((short)Value == 0)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'K':  // "K" is a constant with only the low-order 16 bits nonzero.
+      if ((Value >> 16) == 0)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'M':  // "M" is a constant that is greater than 31.
+      if (Value > 31)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'N':  // "N" is a positive constant that is an exact power of two.
+      if ((int)Value > 0 && isPowerOf2_32(Value))
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'O':  // "O" is the constant zero.
+      if (Value == 0)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'P':  // "P" is a constant whose negation is a signed 16-bit constant.
+      if ((short)-Value == (int)-Value)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    }
+    break;
+  }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  // Handle standard constraint letters.
+  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              Type *Ty) const {
+  // FIXME: PPC does not allow r+i addressing modes for vectors!
+
+  // PPC allows a sign-extended 16-bit immediate field.
+  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+    return false;
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // PPC only support r+r,
+  switch (AM.Scale) {
+  case 0:  // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  default:
+    // No other scales are supported.
+    return false;
+  }
+
+  return true;
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,Type *Ty) const{
+  // PPC allows a sign-extended 16-bit immediate field.
+  return (V > -(1 << 16) && V < (1 << 16)-1);
+}
+
+bool PPCTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const {
+  return false;
+}
+
+SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  // Make sure the function does not optimize away the store of the RA to
+  // the stack.
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setLRStoreRequired();
+  bool isPPC64 = PPCSubTarget.isPPC64();
+  bool isDarwinABI = PPCSubTarget.isDarwinABI();
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset =
+
+      DAG.getConstant(PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI),
+                      isPPC64? MVT::i64 : MVT::i32);
+    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Just load the return address off the stack.
+  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  // Naked functions never have a frame pointer, and so we use r1. For all
+  // other functions, this decision must be delayed until during PEI.
+  unsigned FrameReg;
+  if (MF.getFunction()->getAttributes().hasAttribute(
+        AttributeSet::FunctionIndex, Attribute::Naked))
+    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
+  else
+    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;
+
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
+                                         PtrVT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
+                            FrameAddr, MachinePointerInfo(), false, false,
+                            false, 0);
+  return FrameAddr;
+}
+
+bool
+PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The PowerPC target isn't yet aware of offsets.
+  return false;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero that means it's safe to destination
+/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+/// means there isn't a need to check it against alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
+                                           unsigned DstAlign, unsigned SrcAlign,
+                                           bool IsMemset, bool ZeroMemset,
+                                           bool MemcpyStrSrc,
+                                           MachineFunction &MF) const {
+  if (this->PPCSubTarget.isPPC64()) {
+    return MVT::i64;
+  } else {
+    return MVT::i32;
+  }
+}
+
+bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                      bool *Fast) const {
+  if (DisablePPCUnaligned)
+    return false;
+
+  // PowerPC supports unaligned memory access for simple non-vector types.
+  // Although accessing unaligned addresses is not as efficient as accessing
+  // aligned addresses, it is generally more efficient than manual expansion,
+  // and generally only traps for software emulation when crossing page
+  // boundaries.
+
+  if (!VT.isSimple())
+    return false;
+
+  if (VT.getSimpleVT().isVector())
+    return false;
+
+  if (VT == MVT::ppcf128)
+    return false;
+
+  if (Fast)
+    *Fast = true;
+
+  return true;
+}
+
+/// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+/// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+/// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+/// is expanded to mul + add.
+bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+  case MVT::v4f32:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
+  if (DisableILPPref)
+    return TargetLowering::getSchedulingPreference(N);
+
+  return Sched::ILP;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
new file mode 100644
index 000000000000..7157b70d8622
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -0,0 +1,632 @@
+//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PPC uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+#define LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace PPCISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// FSEL - Traditional three-operand fsel node.
+      ///
+      FSEL,
+
+      /// FCFID - The FCFID instruction, taking an f64 operand and producing
+      /// and f64 value containing the FP representation of the integer that
+      /// was temporarily in the f64 operand.
+      FCFID,
+
+      /// Newer FCFID[US] integer-to-floating-point conversion instructions for
+      /// unsigned integers and single-precision outputs.
+      FCFIDU, FCFIDS, FCFIDUS,
+
+      /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
+      /// operand, producing an f64 value containing the integer representation
+      /// of that FP value.
+      FCTIDZ, FCTIWZ,
+
+      /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for
+      /// unsigned integers.
+      FCTIDUZ, FCTIWUZ,
+
+      /// Reciprocal estimate instructions (unary FP ops).
+      FRE, FRSQRTE,
+
+      // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
+      // three v4f32 operands and producing a v4f32 result.
+      VMADDFP, VNMSUBFP,
+
+      /// VPERM - The PPC VPERM Instruction.
+      ///
+      VPERM,
+
+      /// Hi/Lo - These represent the high and low 16-bit parts of a global
+      /// address respectively.  These nodes have two operands, the first of
+      /// which must be a TargetGlobalAddress, and the second of which must be a
+      /// Constant.  Selected naively, these turn into 'lis G+C' and 'li G+C',
+      /// though these are usually folded into other nodes.
+      Hi, Lo,
+
+      TOC_ENTRY,
+
+      /// The following three target-specific nodes are used for calls through
+      /// function pointers in the 64-bit SVR4 ABI.
+
+      /// Restore the TOC from the TOC save area of the current stack frame.
+      /// This is basically a hard coded load instruction which additionally
+      /// takes/produces a flag.
+      TOC_RESTORE,
+
+      /// Like a regular LOAD but additionally taking/producing a flag.
+      LOAD,
+
+      /// LOAD into r2 (also taking/producing a flag). Like TOC_RESTORE, this is
+      /// a hard coded load instruction.
+      LOAD_TOC,
+
+      /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
+      /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+      /// compute an allocation on the stack.
+      DYNALLOC,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
+      /// shift amounts.  These nodes are generated by the multi-precision shift
+      /// code.
+      SRL, SRA, SHL,
+
+      /// CALL - A direct function call.
+      /// CALL_NOP is a call with the special NOP which follows 64-bit
+      /// SVR4 calls.
+      CALL, CALL_NOP,
+
+      /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+      /// MTCTR instruction.
+      MTCTR,
+
+      /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+      /// BCTRL instruction.
+      BCTRL,
+
+      /// Return with a flag operand, matched by 'blr'
+      RET_FLAG,
+
+      /// R32 = MFCR(CRREG, INFLAG) - Represents the MFCRpseud/MFOCRF
+      /// instructions.  This copies the bits corresponding to the specified
+      /// CRREG into the resultant GPR.  Bits corresponding to other CR regs
+      /// are undefined.
+      MFCR,
+
+      // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+      EH_SJLJ_SETJMP,
+
+      // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+      EH_SJLJ_LONGJMP,
+
+      /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
+      /// instructions.  For lack of better number, we use the opcode number
+      /// encoding for the OPC field to identify the compare.  For example, 838
+      /// is VCMPGTSH.
+      VCMP,
+
+      /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
+      /// altivec VCMP*o instructions.  For lack of better number, we use the
+      /// opcode number encoding for the OPC field to identify the compare.  For
+      /// example, 838 is VCMPGTSH.
+      VCMPo,
+
+      /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
+      /// corresponds to the COND_BRANCH pseudo instruction.  CRRC is the
+      /// condition register to branch on, OPC is the branch opcode to use (e.g.
+      /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
+      /// an optional input flag argument.
+      COND_BRANCH,
+
+      /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
+      /// towards zero.  Used only as part of the long double-to-int
+      /// conversion sequence.
+      FADDRTZ,
+
+      /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register.
+      MFFS,
+
+      /// LARX = This corresponds to PPC l{w|d}arx instrcution: load and
+      /// reserve indexed. This is used to implement atomic operations.
+      LARX,
+
+      /// STCX = This corresponds to PPC stcx. instrcution: store conditional
+      /// indexed. This is used to implement atomic operations.
+      STCX,
+
+      /// TC_RETURN - A tail call return.
+      ///   operand #0 chain
+      ///   operand #1 callee (register or absolute)
+      ///   operand #2 stack adjustment
+      ///   operand #3 optional in flag
+      TC_RETURN,
+
+      /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+      CR6SET,
+      CR6UNSET,
+
+      /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
+      /// TLS model, produces an ADDIS8 instruction that adds the GOT
+      /// base to sym@got@tprel@ha.
+      ADDIS_GOT_TPREL_HA,
+
+      /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
+      /// TLS model, produces a LD instruction with base register G8RReg
+      /// and offset sym@got@tprel@l.  This completes the addition that
+      /// finds the offset of "sym" relative to the thread pointer.
+      LD_GOT_TPREL_L,
+
+      /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
+      /// model, produces an ADD instruction that adds the contents of
+      /// G8RReg to the thread pointer.  Symbol contains a relocation
+      /// sym@tls which is to be replaced by the thread pointer and
+      /// identifies to the linker that the instruction is part of a
+      /// TLS sequence.
+      ADD_TLS,
+
+      /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds the GOT base
+      /// register to sym@got@tlsgd@ha.
+      ADDIS_TLSGD_HA,
+
+      /// G8RC = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym@got@tlsgd@l.
+      ADDI_TLSGD_L,
+
+      /// G8RC = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym@tlsgd).
+      GET_TLS_ADDR,
+
+      /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDIS8 instruction that adds the GOT base
+      /// register to sym@got@tlsld@ha.
+      ADDIS_TLSLD_HA,
+
+      /// G8RC = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym@got@tlsld@l.
+      ADDI_TLSLD_L,
+
+      /// G8RC = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS
+      /// model, produces a call to __tls_get_addr(sym@tlsld).
+      GET_TLSLD_ADDR,
+
+      /// G8RC = ADDIS_DTPREL_HA %X3, Symbol, Chain - For the
+      /// local-dynamic TLS model, produces an ADDIS8 instruction
+      /// that adds X3 to sym@dtprel@ha.  The Chain operand is needed 
+      /// to tie this in place following a copy to %X3 from the result
+      /// of a GET_TLSLD_ADDR.
+      ADDIS_DTPREL_HA,
+
+      /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
+      /// model, produces an ADDI8 instruction that adds G8RReg to
+      /// sym@got@dtprel@l.
+      ADDI_DTPREL_L,
+
+      /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
+      /// during instruction selection to optimize a BUILD_VECTOR into
+      /// operations on splats.  This is necessary to avoid losing these
+      /// optimizations due to constant folding.
+      VADD_SPLAT,
+
+      /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
+      /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
+      /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
+      /// i32.
+      STBRX = ISD::FIRST_TARGET_MEMORY_OPCODE,
+
+      /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
+      /// byte-swapping load instruction.  It loads "Type" bits, byte swaps it,
+      /// then puts it in the bottom bits of the GPRC.  TYPE can be either i16
+      /// or i32.
+      LBRX,
+
+      /// STFIWX - The STFIWX instruction.  The first operand is an input token
+      /// chain, then an f64 value to store, then an address to store it to.
+      STFIWX,
+
+      /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point
+      /// load which sign-extends from a 32-bit integer value into the
+      /// destination 64-bit register.
+      LFIWAX,
+
+      /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point
+      /// load which zero-extends from a 32-bit integer value into the
+      /// destination 64-bit register.
+      LFIWZX,
+
+      /// G8RC = ADDIS_TOC_HA %X2, Symbol - For medium and large code model,
+      /// produces an ADDIS8 instruction that adds the TOC base register to
+      /// sym@toc@ha.
+      ADDIS_TOC_HA,
+
+      /// G8RC = LD_TOC_L Symbol, G8RReg - For medium and large code model,
+      /// produces a LD instruction with base register G8RReg and offset
+      /// sym@toc@l.  Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
+      LD_TOC_L,
+
+      /// G8RC = ADDI_TOC_L G8RReg, Symbol - For medium code model, produces
+      /// an ADDI8 instruction that adds G8RReg to sym@toc@l.
+      /// Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
+      ADDI_TOC_L
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace PPC {
+    /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUHUM instruction.
+    bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
+
+    /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUWUM instruction.
+    bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
+
+    /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
+    bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                            bool isUnary);
+
+    /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
+    bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                            bool isUnary);
+
+    /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+    /// amount, otherwise return -1.
+    int isVSLDOIShuffleMask(SDNode *N, bool isUnary);
+
+    /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a splat of a single element that is suitable for input to
+    /// VSPLTB/VSPLTH/VSPLTW.
+    bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
+
+    /// isAllNegativeZeroVector - Returns true if all elements of build_vector
+    /// are -0.0.
+    bool isAllNegativeZeroVector(SDNode *N);
+
+    /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+    /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+    unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize);
+
+    /// get_VSPLTI_elt - If this is a build_vector of constants which can be
+    /// formed by using a vspltis[bhw] instruction of the specified element
+    /// size, return the constant being splatted.  The ByteSize field indicates
+    /// the number of bytes of each element [124] -> [bhw].
+    SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+  }
+
+  class PPCTargetLowering : public TargetLowering {
+    const PPCSubtarget &PPCSubTarget;
+    const PPCRegisterInfo *PPCRegInfo;
+
+  public:
+    explicit PPCTargetLowering(PPCTargetMachine &TM);
+
+    /// getTargetNodeName() - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    virtual EVT getSetCCResultType(EVT VT) const;
+
+    /// getPreIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if the node's address
+    /// can be legally represented as pre-indexed load / store address.
+    virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                           SDValue &Offset,
+                                           ISD::MemIndexedMode &AM,
+                                           SelectionDAG &DAG) const;
+
+    /// SelectAddressRegReg - Given the specified addressed, check to see if it
+    /// can be represented as an indexed [r+r] operation.  Returns false if it
+    /// can be more efficiently represented with [r+imm].
+    bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
+                             SelectionDAG &DAG) const;
+
+    /// SelectAddressRegImm - Returns true if the address N can be represented
+    /// by a base register plus a signed 16-bit displacement [r+imm], and if it
+    /// is not better represented as reg+reg.
+    bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
+                             SelectionDAG &DAG) const;
+
+    /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
+                                 SelectionDAG &DAG) const;
+
+    /// SelectAddressRegImmShift - Returns true if the address N can be
+    /// represented by a base register plus a signed 14-bit displacement
+    /// [r+imm*4].  Suitable for use by STD and friends.
+    bool SelectAddressRegImmShift(SDValue N, SDValue &Disp, SDValue &Base,
+                                  SelectionDAG &DAG) const;
+
+    Sched::Preference getSchedulingPreference(SDNode *N) const;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG) const;
+
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                APInt &KnownZero,
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+
+    virtual MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const;
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+                                        MachineBasicBlock *MBB, bool is64Bit,
+                                        unsigned BinOpcode) const;
+    MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI,
+                                                MachineBasicBlock *MBB,
+                                            bool is8bit, unsigned Opcode) const;
+
+    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+                                        MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+                                         MachineBasicBlock *MBB) const;
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const;
+
+    std::pair<unsigned, const TargetRegisterClass*>
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   EVT VT) const;
+
+    /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area.  This is the actual
+    /// alignment, not its logarithm.
+    unsigned getByValTypeAlignment(Type *Ty) const;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops.
+    virtual void LowerAsmOperandForConstraint(SDValue Op,
+                                              std::string &Constraint,
+                                              std::vector<SDValue> &Ops,
+                                              SelectionDAG &DAG) const;
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
+
+    /// isLegalAddressImmediate - Return true if the integer value can be used
+    /// as the offset of the target addressing mode for load / store of the
+    /// given type.
+    virtual bool isLegalAddressImmediate(int64_t V, Type *Ty) const;
+
+    /// isLegalAddressImmediate - Return true if the GlobalValue can be used as
+    /// the offset of the target addressing mode.
+    virtual bool isLegalAddressImmediate(GlobalValue *GV) const;
+
+    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    /// getOptimalMemOpType - Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero that means it's safe to destination
+    /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+    /// means there isn't a need to check it against alignment requirement,
+    /// probably because the source does not need to be loaded. If 'IsMemset' is
+    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    virtual EVT
+    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, 
+                        bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                        MachineFunction &MF) const;
+
+    /// Is unaligned memory access allowed for the given type, and is it fast
+    /// relative to software emulation.
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast = 0) const;
+
+    /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+    /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+    /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+    /// is expanded to mul + add.
+    virtual bool isFMAFasterThanMulAndAdd(EVT VT) const;
+
+  private:
+    SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
+    SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
+
+    bool
+    IsEligibleForTailCallOptimization(SDValue Callee,
+                                      CallingConv::ID CalleeCC,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      SelectionDAG& DAG) const;
+
+    SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
+                                         int SPDiff,
+                                         SDValue Chain,
+                                         SDValue &LROpOut,
+                                         SDValue &FPOpOut,
+                                         bool isDarwinABI,
+                                         DebugLoc dl) const;
+
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                         const PPCSubtarget &Subtarget) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG,
+                       const PPCSubtarget &Subtarget) const;
+    SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+                                const PPCSubtarget &Subtarget) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+                                      const PPCSubtarget &Subtarget) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, DebugLoc dl) const;
+    SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+    SDValue FinishCall(CallingConv::ID CallConv, DebugLoc dl, bool isTailCall,
+                       bool isVarArg,
+                       SelectionDAG &DAG,
+                       SmallVector<std::pair<unsigned, SDValue>, 8>
+                         &RegsToPass,
+                       SDValue InFlag, SDValue Chain,
+                       SDValue &Callee,
+                       int SPDiff, unsigned NumBytes,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual bool
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                   bool isVarArg,
+                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                   LLVMContext &Context) const;
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl, SelectionDAG &DAG) const;
+
+    SDValue
+      extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG,
+                        SDValue ArgVal, DebugLoc dl) const;
+
+    void
+      setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
+                         unsigned nAltivecParamsAtEnd,
+                         unsigned MinReservedArea, bool isPPC64) const;
+
+    SDValue
+      LowerFormalArguments_Darwin(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+      LowerFormalArguments_64SVR4(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+      LowerFormalArguments_32SVR4(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue
+      createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+                                 SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
+                                 SelectionDAG &DAG, DebugLoc dl) const;
+
+    SDValue
+      LowerCall_Darwin(SDValue Chain, SDValue Callee,
+                       CallingConv::ID CallConv,
+                       bool isVarArg, bool isTailCall,
+                       const SmallVectorImpl<ISD::OutputArg> &Outs,
+                       const SmallVectorImpl<SDValue> &OutVals,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       DebugLoc dl, SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+      LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+                       CallingConv::ID CallConv,
+                       bool isVarArg, bool isTailCall,
+                       const SmallVectorImpl<ISD::OutputArg> &Outs,
+                       const SmallVectorImpl<SDValue> &OutVals,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       DebugLoc dl, SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals) const;
+    SDValue
+    LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+                     bool isVarArg, bool isTailCall,
+                     const SmallVectorImpl<ISD::OutputArg> &Outs,
+                     const SmallVectorImpl<SDValue> &OutVals,
+                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                     DebugLoc dl, SelectionDAG &DAG,
+                     SmallVectorImpl<SDValue> &InVals) const;
+
+    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const;
+    SDValue DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const;
+  };
+}
+
+#endif   // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
new file mode 100644
index 000000000000..fa5b65f0ba2d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -0,0 +1,968 @@
+//===-- PPCInstr64Bit.td - The PowerPC 64-bit Support ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC 64-bit instructions.  These patterns are used
+// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands.
+//
+def s16imm64 : Operand<i64> {
+  let PrintMethod = "printS16ImmOperand";
+}
+def u16imm64 : Operand<i64> {
+  let PrintMethod = "printU16ImmOperand";
+}
+def symbolHi64 : Operand<i64> {
+  let PrintMethod = "printSymbolHi";
+  let EncoderMethod = "getHA16Encoding";
+}
+def symbolLo64 : Operand<i64> {
+  let PrintMethod = "printSymbolLo";
+  let EncoderMethod = "getLO16Encoding";
+}
+def tocentry : Operand<iPTR> {
+  let MIOperandInfo = (ops i64imm:$imm);
+}
+def tlsreg : Operand<i64> {
+  let EncoderMethod = "getTLSRegEncoding";
+}
+def tlsgd : Operand<i64> {}
+
+//===----------------------------------------------------------------------===//
+// 64-bit transformation functions.
+//
+
+def SHL64 : SDNodeXForm<imm, [{
+  // Transformation function: 63 - imm
+  return getI32Imm(63 - N->getZExtValue());
+}]>;
+
+def SRL64 : SDNodeXForm<imm, [{
+  // Transformation function: 64 - imm
+  return N->getZExtValue() ? getI32Imm(64 - N->getZExtValue()) : getI32Imm(0);
+}]>;
+
+def HI32_48 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getZExtValue() >> 32));
+}]>;
+
+def HI48_64 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getZExtValue() >> 48));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calls.
+//
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+  let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in
+    def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+        Requires<[In64BitMode]>;
+}
+
+let Defs = [LR8] in
+  def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>,
+                    PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+  let Defs = [CTR8], Uses = [CTR8] in {
+    def BDZ8  : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+                        "bdz $dst">;
+    def BDNZ8 : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+                        "bdnz $dst">;
+  }
+}
+
+let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
+  // Convenient aliases for call instructions
+  let Uses = [RM] in {
+    def BL8  : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+                     "bl $func", BrB, []>;  // See Pat patterns below.
+
+    def BLA8 : IForm<18, 1, 1, (outs), (ins aaddr:$func),
+                     "bla $func", BrB, [(PPCcall (i64 imm:$func))]>;
+  }
+  let Uses = [RM], isCodeGenOnly = 1 in {
+    def BL8_NOP  : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                             (outs), (ins calltarget:$func),
+                             "bl $func\n\tnop", BrB, []>;
+
+    def BL8_NOP_TLSGD : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                                  (outs), (ins calltarget:$func, tlsgd:$sym),
+                                  "bl $func($sym)\n\tnop", BrB, []>;
+
+    def BL8_NOP_TLSLD : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                                  (outs), (ins calltarget:$func, tlsgd:$sym),
+                                  "bl $func($sym)\n\tnop", BrB, []>;
+
+    def BLA8_NOP : IForm_and_DForm_4_zero<18, 1, 1, 24,
+                             (outs), (ins aaddr:$func),
+                             "bla $func\n\tnop", BrB,
+                             [(PPCcall_nop (i64 imm:$func))]>;
+  }
+  let Uses = [CTR8, RM] in {
+    def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+                              "bctrl", BrB, [(PPCbctrl)]>,
+                 Requires<[In64BitMode]>;
+  }
+}
+
+
+// Calls
+def : Pat<(PPCcall (i64 tglobaladdr:$dst)),
+          (BL8 tglobaladdr:$dst)>;
+def : Pat<(PPCcall_nop (i64 tglobaladdr:$dst)),
+          (BL8_NOP tglobaladdr:$dst)>;
+
+def : Pat<(PPCcall (i64 texternalsym:$dst)),
+          (BL8 texternalsym:$dst)>;
+def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
+          (BL8_NOP texternalsym:$dst)>;
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+  let Defs = [CR0] in {
+    def ATOMIC_LOAD_ADD_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_ADD_I64",
+      [(set i64:$dst, (atomic_load_add_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_SUB_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_SUB_I64",
+      [(set i64:$dst, (atomic_load_sub_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_OR_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_OR_I64",
+      [(set i64:$dst, (atomic_load_or_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_XOR_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_XOR_I64",
+      [(set i64:$dst, (atomic_load_xor_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_AND_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_AND_i64",
+      [(set i64:$dst, (atomic_load_and_64 xoaddr:$ptr, i64:$incr))]>;
+    def ATOMIC_LOAD_NAND_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_NAND_I64",
+      [(set i64:$dst, (atomic_load_nand_64 xoaddr:$ptr, i64:$incr))]>;
+
+    def ATOMIC_CMP_SWAP_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$old, G8RC:$new), "#ATOMIC_CMP_SWAP_I64",
+      [(set i64:$dst, (atomic_cmp_swap_64 xoaddr:$ptr, i64:$old, i64:$new))]>;
+
+    def ATOMIC_SWAP_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$new), "#ATOMIC_SWAP_I64",
+      [(set i64:$dst, (atomic_swap_64 xoaddr:$ptr, i64:$new))]>;
+  }
+}
+
+// Instructions to support atomic operations
+def LDARX : XForm_1<31,  84, (outs G8RC:$rD), (ins memrr:$ptr),
+                   "ldarx $rD, $ptr", LdStLDARX,
+                   [(set i64:$rD, (PPClarx xoaddr:$ptr))]>;
+
+let Defs = [CR0] in
+def STDCX : XForm_1<31, 214, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stdcx. $rS, $dst", LdStSTDCX,
+                   [(PPCstcx i64:$rS, xoaddr:$dst)]>,
+                   isDOT;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi8 :Pseudo< (outs),
+                        (ins calltarget:$dst, i32imm:$offset),
+                 "#TC_RETURNd8 $dst $offset",
+                 []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset),
+                 "#TC_RETURNa8 $func $offset",
+                 [(PPCtc_return (i64 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset),
+                 "#TC_RETURNr8 $dst $offset",
+                 []>;
+
+let isCodeGenOnly = 1 in {
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in
+def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+    Requires<[In64BitMode]>;
+
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB8   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+                  "b $dst", BrB,
+                  []>;
+
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA8   : IForm<18, 0, 0, (outs), (ins aaddr:$dst),
+                  "ba $dst", BrB,
+                  []>;
+
+}
+
+def : Pat<(PPCtc_return (i64 tglobaladdr:$dst),  imm:$imm),
+          (TCRETURNdi8 tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
+          (TCRETURNdi8 texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
+          (TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
+
+
+// 64-bit CR instructions
+def MTCRF8 : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins G8RC:$rS),
+                      "mtcrf $FXM, $rS", BrMCRX>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+
+let isCodeGenOnly = 1 in
+def MFCR8pseud: XFXForm_3<31, 19, (outs G8RC:$rT), (ins crbitm:$FXM),
+                       "#MFCR8pseud", SprMFCR>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+            
+def MFCR8 : XFXForm_3<31, 19, (outs G8RC:$rT), (ins),
+                     "mfcr $rT", SprMFCR>,
+                     PPC970_MicroCode, PPC970_Unit_CRU;
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def EH_SjLj_SetJmp64  : Pseudo<(outs GPRC:$dst), (ins memr:$buf),
+                            "#EH_SJLJ_SETJMP64",
+                            [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In64BitMode]>;
+  let isTerminator = 1 in
+  def EH_SjLj_LongJmp64 : Pseudo<(outs), (ins memr:$buf),
+                            "#EH_SJLJ_LONGJMP64",
+                            [(PPCeh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In64BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit SPR manipulation instrs.
+
+let Uses = [CTR8] in {
+def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs G8RC:$rT), (ins),
+                           "mfctr $rT", SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Pattern = [(PPCmtctr i64:$rS)], Defs = [CTR8] in {
+def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins G8RC:$rS),
+                           "mtctr $rS", SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Pattern = [(set i64:$rT, readcyclecounter)] in
+def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs G8RC:$rT), (ins),
+                          "mfspr $rT, 268", SprMFTB>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+// Note that encoding mftb using mfspr is now the preferred form,
+// and has been since at least ISA v2.03. The mftb instruction has
+// now been phased out. Using mfspr, however, is known not to work on
+// the POWER3.
+
+let Defs = [X1], Uses = [X1] in
+def DYNALLOC8 : Pseudo<(outs G8RC:$result), (ins G8RC:$negsize, memri:$fpsi),"#DYNALLOC8",
+                       [(set i64:$result,
+                             (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>;
+
+let Defs = [LR8] in {
+def MTLR8  : XFXForm_7_ext<31, 467, 8, (outs), (ins G8RC:$rS),
+                           "mtlr $rS", SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR8] in {
+def MFLR8  : XFXForm_1_ext<31, 339, 8, (outs G8RC:$rT), (ins),
+                           "mflr $rT", SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+//===----------------------------------------------------------------------===//
+// Fixed point instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+def LI8  : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm),
+                      "li $rD, $imm", IntSimple,
+                      [(set i64:$rD, immSExt16:$imm)]>;
+def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm),
+                      "lis $rD, $imm", IntSimple,
+                      [(set i64:$rD, imm16ShiftedSExt:$imm)]>;
+}
+
+// Logical ops.
+def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "nand $rA, $rS, $rB", IntSimple,
+                   [(set i64:$rA, (not (and i64:$rS, i64:$rB)))]>;
+def AND8 : XForm_6<31,  28, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "and $rA, $rS, $rB", IntSimple,
+                   [(set i64:$rA, (and i64:$rS, i64:$rB))]>;
+def ANDC8: XForm_6<31,  60, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "andc $rA, $rS, $rB", IntSimple,
+                   [(set i64:$rA, (and i64:$rS, (not i64:$rB)))]>;
+def OR8  : XForm_6<31, 444, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "or $rA, $rS, $rB", IntSimple,
+                   [(set i64:$rA, (or i64:$rS, i64:$rB))]>;
+def NOR8 : XForm_6<31, 124, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "nor $rA, $rS, $rB", IntSimple,
+                   [(set i64:$rA, (not (or i64:$rS, i64:$rB)))]>;
+def ORC8 : XForm_6<31, 412, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "orc $rA, $rS, $rB", IntSimple,
+                   [(set i64:$rA, (or i64:$rS, (not i64:$rB)))]>;
+def EQV8 : XForm_6<31, 284, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "eqv $rA, $rS, $rB", IntSimple,
+                   [(set i64:$rA, (not (xor i64:$rS, i64:$rB)))]>;
+def XOR8 : XForm_6<31, 316, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "xor $rA, $rS, $rB", IntSimple,
+                   [(set i64:$rA, (xor i64:$rS, i64:$rB))]>;
+
+// Logical ops with immediate.
+def ANDIo8  : DForm_4<28, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "andi. $dst, $src1, $src2", IntGeneral,
+                      [(set i64:$dst, (and i64:$src1, immZExt16:$src2))]>,
+                      isDOT;
+def ANDISo8 : DForm_4<29, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                     "andis. $dst, $src1, $src2", IntGeneral,
+                    [(set i64:$dst, (and i64:$src1, imm16ShiftedZExt:$src2))]>,
+                     isDOT;
+def ORI8    : DForm_4<24, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "ori $dst, $src1, $src2", IntSimple,
+                      [(set i64:$dst, (or i64:$src1, immZExt16:$src2))]>;
+def ORIS8   : DForm_4<25, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "oris $dst, $src1, $src2", IntSimple,
+                    [(set i64:$dst, (or i64:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI8   : DForm_4<26, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "xori $dst, $src1, $src2", IntSimple,
+                      [(set i64:$dst, (xor i64:$src1, immZExt16:$src2))]>;
+def XORIS8  : DForm_4<27, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "xoris $dst, $src1, $src2", IntSimple,
+                   [(set i64:$dst, (xor i64:$src1, imm16ShiftedZExt:$src2))]>;
+
+def ADD8  : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "add $rT, $rA, $rB", IntSimple,
+                     [(set i64:$rT, (add i64:$rA, i64:$rB))]>;
+// ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
+// initial-exec thread-local storage model.
+let isCodeGenOnly = 1 in
+def ADD8TLS  : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, tlsreg:$rB),
+                        "add $rT, $rA, $rB@tls", IntSimple,
+                        [(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
+                     
+let Defs = [CARRY] in {
+def ADDC8 : XOForm_1<31, 10, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "addc $rT, $rA, $rB", IntGeneral,
+                     [(set i64:$rT, (addc i64:$rA, i64:$rB))]>,
+                     PPC970_DGroup_Cracked;
+def ADDIC8 : DForm_2<12, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
+                     "addic $rD, $rA, $imm", IntGeneral,
+                     [(set i64:$rD, (addc i64:$rA, immSExt16:$imm))]>;
+}
+def ADDI8  : DForm_2<14, (outs G8RC:$rD), (ins G8RC_NOX0:$rA, symbolLo64:$imm),
+                     "addi $rD, $rA, $imm", IntSimple,
+                     [(set i64:$rD, (add i64:$rA, immSExt16:$imm))]>;
+def ADDIS8 : DForm_2<15, (outs G8RC:$rD), (ins G8RC_NOX0:$rA, symbolHi64:$imm),
+                     "addis $rD, $rA, $imm", IntSimple,
+                     [(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>;
+
+let Defs = [CARRY] in {
+def SUBFIC8: DForm_2< 8, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
+                     "subfic $rD, $rA, $imm", IntGeneral,
+                     [(set i64:$rD, (subc immSExt16:$imm, i64:$rA))]>;
+def SUBFC8 : XOForm_1<31, 8, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                      "subfc $rT, $rA, $rB", IntGeneral,
+                      [(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
+                      PPC970_DGroup_Cracked;
+}
+def SUBF8 : XOForm_1<31, 40, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "subf $rT, $rA, $rB", IntGeneral,
+                     [(set i64:$rT, (sub i64:$rB, i64:$rA))]>;
+def NEG8    : XOForm_3<31, 104, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "neg $rT, $rA", IntSimple,
+                       [(set i64:$rT, (ineg i64:$rA))]>;
+let Uses = [CARRY], Defs = [CARRY] in {
+def ADDE8   : XOForm_1<31, 138, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                       "adde $rT, $rA, $rB", IntGeneral,
+                       [(set i64:$rT, (adde i64:$rA, i64:$rB))]>;
+def ADDME8  : XOForm_3<31, 234, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "addme $rT, $rA", IntGeneral,
+                       [(set i64:$rT, (adde i64:$rA, -1))]>;
+def ADDZE8  : XOForm_3<31, 202, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "addze $rT, $rA", IntGeneral,
+                       [(set i64:$rT, (adde i64:$rA, 0))]>;
+def SUBFE8  : XOForm_1<31, 136, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                       "subfe $rT, $rA, $rB", IntGeneral,
+                       [(set i64:$rT, (sube i64:$rB, i64:$rA))]>;
+def SUBFME8 : XOForm_3<31, 232, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "subfme $rT, $rA", IntGeneral,
+                       [(set i64:$rT, (sube -1, i64:$rA))]>;
+def SUBFZE8 : XOForm_3<31, 200, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "subfze $rT, $rA", IntGeneral,
+                       [(set i64:$rT, (sube 0, i64:$rA))]>;
+}
+
+
+def MULHD : XOForm_1<31, 73, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "mulhd $rT, $rA, $rB", IntMulHW,
+                     [(set i64:$rT, (mulhs i64:$rA, i64:$rB))]>;
+def MULHDU : XOForm_1<31, 9, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "mulhdu $rT, $rA, $rB", IntMulHWU,
+                     [(set i64:$rT, (mulhu i64:$rA, i64:$rB))]>;
+
+def CMPD   : XForm_16_ext<31, 0, (outs CRRC:$crD), (ins G8RC:$rA, G8RC:$rB),
+                          "cmpd $crD, $rA, $rB", IntCompare>, isPPC64;
+def CMPLD  : XForm_16_ext<31, 32, (outs CRRC:$crD), (ins G8RC:$rA, G8RC:$rB),
+                          "cmpld $crD, $rA, $rB", IntCompare>, isPPC64;
+def CMPDI  : DForm_5_ext<11, (outs CRRC:$crD), (ins G8RC:$rA, s16imm:$imm),
+                         "cmpdi $crD, $rA, $imm", IntCompare>, isPPC64;
+def CMPLDI : DForm_6_ext<10, (outs CRRC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                         "cmpldi $dst, $src1, $src2", IntCompare>, isPPC64;
+
+def SLD  : XForm_6<31,  27, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
+                   "sld $rA, $rS, $rB", IntRotateD,
+                   [(set i64:$rA, (PPCshl i64:$rS, i32:$rB))]>, isPPC64;
+def SRD  : XForm_6<31, 539, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
+                   "srd $rA, $rS, $rB", IntRotateD,
+                   [(set i64:$rA, (PPCsrl i64:$rS, i32:$rB))]>, isPPC64;
+let Defs = [CARRY] in {
+def SRAD : XForm_6<31, 794, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
+                   "srad $rA, $rS, $rB", IntRotateD,
+                   [(set i64:$rA, (PPCsra i64:$rS, i32:$rB))]>, isPPC64;
+}
+                   
+def EXTSB8 : XForm_11<31, 954, (outs G8RC:$rA), (ins G8RC:$rS),
+                      "extsb $rA, $rS", IntSimple,
+                      [(set i64:$rA, (sext_inreg i64:$rS, i8))]>;
+def EXTSH8 : XForm_11<31, 922, (outs G8RC:$rA), (ins G8RC:$rS),
+                      "extsh $rA, $rS", IntSimple,
+                      [(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
+
+def EXTSW  : XForm_11<31, 986, (outs G8RC:$rA), (ins G8RC:$rS),
+                      "extsw $rA, $rS", IntSimple,
+                      [(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64;
+def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS),
+                      "extsw $rA, $rS", IntSimple,
+                      [(set i64:$rA, (sext i32:$rS))]>, isPPC64;
+
+let Defs = [CARRY] in {
+def SRADI  : XSForm_1<31, 413, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH),
+                      "sradi $rA, $rS, $SH", IntRotateDI,
+                      [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
+}
+def CNTLZD : XForm_11<31, 58, (outs G8RC:$rA), (ins G8RC:$rS),
+                      "cntlzd $rA, $rS", IntGeneral,
+                      [(set i64:$rA, (ctlz i64:$rS))]>;
+def POPCNTD : XForm_11<31, 506, (outs G8RC:$rA), (ins G8RC:$rS),
+                      "popcntd $rA, $rS", IntGeneral,
+                      [(set i64:$rA, (ctpop i64:$rS))]>;
+
+// popcntw also does a population count on the high 32 bits (storing the
+// results in the high 32-bits of the output). We'll ignore that here (which is
+// safe because we never separately use the high part of the 64-bit registers).
+def POPCNTW : XForm_11<31, 378, (outs GPRC:$rA), (ins GPRC:$rS),
+                      "popcntw $rA, $rS", IntGeneral,
+                      [(set i32:$rA, (ctpop i32:$rS))]>;
+
+def DIVD  : XOForm_1<31, 489, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "divd $rT, $rA, $rB", IntDivD,
+                     [(set i64:$rT, (sdiv i64:$rA, i64:$rB))]>, isPPC64,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def DIVDU : XOForm_1<31, 457, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "divdu $rT, $rA, $rB", IntDivD,
+                     [(set i64:$rT, (udiv i64:$rA, i64:$rB))]>, isPPC64,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def MULLD : XOForm_1<31, 233, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "mulld $rT, $rA, $rB", IntMulHD,
+                     [(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
+
+
+let isCommutable = 1 in {
+def RLDIMI : MDForm_1<30, 3,
+                      (outs G8RC:$rA), (ins G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB),
+                      "rldimi $rA, $rS, $SH, $MB", IntRotateDI,
+                      []>, isPPC64, RegConstraint<"$rSi = $rA">,
+                      NoEncode<"$rSi">;
+}
+
+// Rotate instructions.
+def RLDCL  : MDForm_1<30, 0,
+                      (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB, u6imm:$MBE),
+                      "rldcl $rA, $rS, $rB, $MBE", IntRotateD,
+                      []>, isPPC64;
+def RLDICL : MDForm_1<30, 0,
+                      (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MBE),
+                      "rldicl $rA, $rS, $SH, $MBE", IntRotateDI,
+                      []>, isPPC64;
+def RLDICR : MDForm_1<30, 1,
+                      (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MBE),
+                      "rldicr $rA, $rS, $SH, $MBE", IntRotateDI,
+                      []>, isPPC64;
+
+def RLWINM8 : MForm_2<21,
+                     (outs G8RC:$rA), (ins G8RC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                     "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral,
+                     []>;
+
+def ISEL8   : AForm_4<31, 15,
+                     (outs G8RC:$rT), (ins G8RC_NOX0:$rA, G8RC:$rB, CRBITRC:$cond),
+                     "isel $rT, $rA, $rB, $cond", IntGeneral,
+                     []>;
+}  // End FXU Operations.
+
+
+//===----------------------------------------------------------------------===//
+// Load/Store instructions.
+//
+
+
+// Sign extending loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LHA8: DForm_1<42, (outs G8RC:$rD), (ins memri:$src),
+                  "lha $rD, $src", LdStLHA,
+                  [(set i64:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LWA  : DSForm_1<58, 2, (outs G8RC:$rD), (ins memrix:$src),
+                    "lwa $rD, $src", LdStLWA,
+                    [(set i64:$rD,
+                          (aligned4sextloadi32 ixaddr:$src))]>, isPPC64,
+                    PPC970_DGroup_Cracked;
+def LHAX8: XForm_1<31, 343, (outs G8RC:$rD), (ins memrr:$src),
+                   "lhax $rD, $src", LdStLHA,
+                   [(set i64:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LWAX : XForm_1<31, 341, (outs G8RC:$rD), (ins memrr:$src),
+                   "lwax $rD, $src", LdStLHA,
+                   [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+
+// Update forms.
+let mayLoad = 1 in {
+def LHAU8 : DForm_1<43, (outs G8RC:$rD, ptr_rc_nor0:$ea_result),
+                    (ins memri:$addr),
+                    "lhau $rD, $addr", LdStLHAU,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+// NO LWAU!
+
+def LHAUX8 : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc_nor0:$ea_result),
+                    (ins memrr:$addr),
+                    "lhaux $rD, $addr", LdStLHAU,
+                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LWAUX : XForm_1<31, 373, (outs G8RC:$rD, ptr_rc_nor0:$ea_result),
+                    (ins memrr:$addr),
+                    "lwaux $rD, $addr", LdStLHAU,
+                    []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                    NoEncode<"$ea_result">, isPPC64;
+}
+}
+
+// Zero extending loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZ8 : DForm_1<34, (outs G8RC:$rD), (ins memri:$src),
+                  "lbz $rD, $src", LdStLoad,
+                  [(set i64:$rD, (zextloadi8 iaddr:$src))]>;
+def LHZ8 : DForm_1<40, (outs G8RC:$rD), (ins memri:$src),
+                  "lhz $rD, $src", LdStLoad,
+                  [(set i64:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ8 : DForm_1<32, (outs G8RC:$rD), (ins memri:$src),
+                  "lwz $rD, $src", LdStLoad,
+                  [(set i64:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
+
+def LBZX8 : XForm_1<31,  87, (outs G8RC:$rD), (ins memrr:$src),
+                   "lbzx $rD, $src", LdStLoad,
+                   [(set i64:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1<31, 279, (outs G8RC:$rD), (ins memrr:$src),
+                   "lhzx $rD, $src", LdStLoad,
+                   [(set i64:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1<31,  23, (outs G8RC:$rD), (ins memrr:$src),
+                   "lwzx $rD, $src", LdStLoad,
+                   [(set i64:$rD, (zextloadi32 xaddr:$src))]>;
+                   
+                   
+// Update forms.
+let mayLoad = 1 in {
+def LBZU8 : DForm_1<35, (outs G8RC:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                    "lbzu $rD, $addr", LdStLoadUpd,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LHZU8 : DForm_1<41, (outs G8RC:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                    "lhzu $rD, $addr", LdStLoadUpd,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                    "lwzu $rD, $addr", LdStLoadUpd,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+
+def LBZUX8 : XForm_1<31, 119, (outs G8RC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lbzux $rD, $addr", LdStLoadUpd,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+def LHZUX8 : XForm_1<31, 311, (outs G8RC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lhzux $rD, $addr", LdStLoadUpd,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lwzux $rD, $addr", LdStLoadUpd,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+}
+}
+
+
+// Full 8-byte loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LD   : DSForm_1<58, 0, (outs G8RC:$rD), (ins memrix:$src),
+                    "ld $rD, $src", LdStLD,
+                    [(set i64:$rD, (aligned4load ixaddr:$src))]>, isPPC64;
+// The following three definitions are selected for small code model only.
+// Otherwise, we need to create two instructions to form a 32-bit offset,
+// so we have a custom matcher for TOC_ENTRY in PPCDAGToDAGIsel::Select().
+def LDtoc: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+                  "#LDtoc",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64;
+def LDtocJTI: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+                  "#LDtocJTI",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tjumptable:$disp, i64:$reg))]>, isPPC64;
+def LDtocCPT: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+                  "#LDtocCPT",
+                  [(set i64:$rD,
+                     (PPCtoc_entry tconstpool:$disp, i64:$reg))]>, isPPC64;
+
+let hasSideEffects = 1, isCodeGenOnly = 1 in {
+let RST = 2, DS = 2 in
+def LDinto_toc: DSForm_1a<58, 0, (outs), (ins G8RC:$reg),
+                    "ld 2, 8($reg)", LdStLD,
+                    [(PPCload_toc i64:$reg)]>, isPPC64;
+                    
+let RST = 2, DS = 10, RA = 1 in
+def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins),
+                    "ld 2, 40(1)", LdStLD,
+                    [(PPCtoc_restore)]>, isPPC64;
+}
+def LDX  : XForm_1<31,  21, (outs G8RC:$rD), (ins memrr:$src),
+                   "ldx $rD, $src", LdStLD,
+                   [(set i64:$rD, (load xaddr:$src))]>, isPPC64;
+def LDBRX : XForm_1<31,  532, (outs G8RC:$rD), (ins memrr:$src),
+                   "ldbrx $rD, $src", LdStLoad,
+                   [(set i64:$rD, (PPClbrx xoaddr:$src, i64))]>, isPPC64;
+
+let mayLoad = 1 in
+def LDU  : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc_nor0:$ea_result), (ins memrix:$addr),
+                    "ldu $rD, $addr", LdStLDU,
+                    []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
+                    NoEncode<"$ea_result">;
+
+def LDUX : XForm_1<31, 53, (outs G8RC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "ldux $rD, $addr", LdStLDU,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">, isPPC64;
+}
+
+def : Pat<(PPCload ixaddr:$src),
+          (LD ixaddr:$src)>;
+def : Pat<(PPCload xaddr:$src),
+          (LDX xaddr:$src)>;
+
+// Support for medium and large code model.
+def ADDIStocHA: Pseudo<(outs G8RC:$rD), (ins G8RC_NOX0:$reg, tocentry:$disp),
+                       "#ADDIStocHA",
+                       [(set i64:$rD,
+                         (PPCaddisTocHA i64:$reg, tglobaladdr:$disp))]>,
+                       isPPC64;
+def LDtocL: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC_NOX0:$reg),
+                   "#LDtocL",
+                   [(set i64:$rD,
+                     (PPCldTocL tglobaladdr:$disp, i64:$reg))]>, isPPC64;
+def ADDItocL: Pseudo<(outs G8RC:$rD), (ins G8RC_NOX0:$reg, tocentry:$disp),
+                     "#ADDItocL",
+                     [(set i64:$rD,
+                       (PPCaddiTocL i64:$reg, tglobaladdr:$disp))]>, isPPC64;
+
+// Support for thread-local storage.
+def ADDISgotTprelHA: Pseudo<(outs G8RC:$rD), (ins G8RC_NOX0:$reg, symbolHi64:$disp),
+                         "#ADDISgotTprelHA",
+                         [(set i64:$rD,
+                           (PPCaddisGotTprelHA i64:$reg,
+                                               tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def LDgotTprelL: Pseudo<(outs G8RC:$rD), (ins symbolLo64:$disp, G8RC_NOX0:$reg),
+                        "#LDgotTprelL",
+                        [(set i64:$rD,
+                          (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
+                 isPPC64;
+def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
+          (ADD8TLS $in, tglobaltlsaddr:$g)>;
+def ADDIStlsgdHA: Pseudo<(outs G8RC:$rD), (ins G8RC_NOX0:$reg, symbolHi64:$disp),
+                         "#ADDIStlsgdHA",
+                         [(set i64:$rD,
+                           (PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def ADDItlsgdL : Pseudo<(outs G8RC:$rD), (ins G8RC_NOX0:$reg, symbolLo64:$disp),
+                       "#ADDItlsgdL",
+                       [(set i64:$rD,
+                         (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
+                 isPPC64;
+def GETtlsADDR : Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, tlsgd:$sym),
+                        "#GETtlsADDR",
+                        [(set i64:$rD,
+                          (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                 isPPC64;
+def ADDIStlsldHA: Pseudo<(outs G8RC:$rD), (ins G8RC_NOX0:$reg, symbolHi64:$disp),
+                         "#ADDIStlsldHA",
+                         [(set i64:$rD,
+                           (PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+def ADDItlsldL : Pseudo<(outs G8RC:$rD), (ins G8RC_NOX0:$reg, symbolLo64:$disp),
+                       "#ADDItlsldL",
+                       [(set i64:$rD,
+                         (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
+                 isPPC64;
+def GETtlsldADDR : Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, tlsgd:$sym),
+                          "#GETtlsldADDR",
+                          [(set i64:$rD,
+                            (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
+                   isPPC64;
+def ADDISdtprelHA: Pseudo<(outs G8RC:$rD), (ins G8RC_NOX0:$reg, symbolHi64:$disp),
+                          "#ADDISdtprelHA",
+                          [(set i64:$rD,
+                            (PPCaddisDtprelHA i64:$reg,
+                                              tglobaltlsaddr:$disp))]>,
+                   isPPC64;
+def ADDIdtprelL : Pseudo<(outs G8RC:$rD), (ins G8RC_NOX0:$reg, symbolLo64:$disp),
+                         "#ADDIdtprelL",
+                         [(set i64:$rD,
+                           (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
+                  isPPC64;
+
+let PPC970_Unit = 2 in {
+// Truncating stores.                       
+def STB8 : DForm_1<38, (outs), (ins G8RC:$rS, memri:$src),
+                   "stb $rS, $src", LdStStore,
+                   [(truncstorei8 i64:$rS, iaddr:$src)]>;
+def STH8 : DForm_1<44, (outs), (ins G8RC:$rS, memri:$src),
+                   "sth $rS, $src", LdStStore,
+                   [(truncstorei16 i64:$rS, iaddr:$src)]>;
+def STW8 : DForm_1<36, (outs), (ins G8RC:$rS, memri:$src),
+                   "stw $rS, $src", LdStStore,
+                   [(truncstorei32 i64:$rS, iaddr:$src)]>;
+def STBX8 : XForm_8<31, 215, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stbx $rS, $dst", LdStStore,
+                   [(truncstorei8 i64:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STHX8 : XForm_8<31, 407, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "sthx $rS, $dst", LdStStore,
+                   [(truncstorei16 i64:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWX8 : XForm_8<31, 151, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stwx $rS, $dst", LdStStore,
+                   [(truncstorei32 i64:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+// Normal 8-byte stores.
+def STD  : DSForm_1<62, 0, (outs), (ins G8RC:$rS, memrix:$dst),
+                    "std $rS, $dst", LdStSTD,
+                    [(aligned4store i64:$rS, ixaddr:$dst)]>, isPPC64;
+def STDX  : XForm_8<31, 149, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stdx $rS, $dst", LdStSTD,
+                   [(store i64:$rS, xaddr:$dst)]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+def STDBRX: XForm_8<31, 660, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stdbrx $rS, $dst", LdStStore,
+                   [(PPCstbrx i64:$rS, xoaddr:$dst, i64)]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+}
+
+// Stores with Update (pre-inc).
+let PPC970_Unit = 2, mayStore = 1 in {
+def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins G8RC:$rS, memri:$dst),
+                   "stbu $rS, $dst", LdStStoreUpd, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STHU8 : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins G8RC:$rS, memri:$dst),
+                   "sthu $rS, $dst", LdStStoreUpd, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STWU8 : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins G8RC:$rS, memri:$dst),
+                   "stwu $rS, $dst", LdStStoreUpd, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STDU : DSForm_1<62, 1, (outs ptr_rc_nor0:$ea_res), (ins G8RC:$rS, memrix:$dst),
+                   "stdu $rS, $dst", LdStSTDU, []>,
+                   RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">,
+                   isPPC64;
+
+def STBUX8: XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins G8RC:$rS, memrr:$dst),
+                    "stbux $rS, $dst", LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STHUX8: XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins G8RC:$rS, memrr:$dst),
+                    "sthux $rS, $dst", LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STWUX8: XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins G8RC:$rS, memrr:$dst),
+                    "stwux $rS, $dst", LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STDUX : XForm_8<31, 181, (outs ptr_rc_nor0:$ea_res), (ins G8RC:$rS, memrr:$dst),
+                    "stdux $rS, $dst", LdStSTDU, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked, isPPC64;
+}
+
+// Patterns to match the pre-inc stores.  We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STBU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STHU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STWU8 $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(aligned4pre_store i64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STDU $rS, iaddroff:$ptroff, $ptrreg)>;
+
+def : Pat<(pre_truncsti8 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STBUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti16 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STHUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti32 i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STWUX8 $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store i64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STDUX $rS, $ptrreg, $ptroff)>;
+
+
+//===----------------------------------------------------------------------===//
+// Floating point instructions.
+//
+
+
+let PPC970_Unit = 3, Uses = [RM] in {  // FPU Operations.
+def FCFID  : XForm_26<63, 846, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fcfid $frD, $frB", FPGeneral,
+                      [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
+def FCTIDZ : XForm_26<63, 815, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fctidz $frD, $frB", FPGeneral,
+                      [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
+
+def FCFIDU  : XForm_26<63, 974, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fcfidu $frD, $frB", FPGeneral,
+                      [(set f64:$frD, (PPCfcfidu f64:$frB))]>, isPPC64;
+def FCFIDS  : XForm_26<59, 846, (outs F4RC:$frD), (ins F8RC:$frB),
+                      "fcfids $frD, $frB", FPGeneral,
+                      [(set f32:$frD, (PPCfcfids f64:$frB))]>, isPPC64;
+def FCFIDUS : XForm_26<59, 974, (outs F4RC:$frD), (ins F8RC:$frB),
+                      "fcfidus $frD, $frB", FPGeneral,
+                      [(set f32:$frD, (PPCfcfidus f64:$frB))]>, isPPC64;
+def FCTIDUZ : XForm_26<63, 943, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fctiduz $frD, $frB", FPGeneral,
+                      [(set f64:$frD, (PPCfctiduz f64:$frB))]>, isPPC64;
+def FCTIWUZ : XForm_26<63, 143, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fctiwuz $frD, $frB", FPGeneral,
+                      [(set f64:$frD, (PPCfctiwuz f64:$frB))]>, isPPC64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Patterns
+//
+
+// Extensions and truncates to/from 32-bit regs.
+def : Pat<(i64 (zext i32:$in)),
+          (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
+                  0, 32)>;
+def : Pat<(i64 (anyext i32:$in)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32)>;
+def : Pat<(i32 (trunc i64:$in)),
+          (EXTRACT_SUBREG $in, sub_32)>;
+
+// Extending loads with i64 targets.
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ8 iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX8 xaddr:$src)>;
+def : Pat<(extloadi32 iaddr:$src),
+          (LWZ8 iaddr:$src)>;
+def : Pat<(extloadi32 xaddr:$src),
+          (LWZX8 xaddr:$src)>;
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 6-bit and 7-bit shift
+// amounts.
+def : Pat<(sra i64:$rS, i32:$rB),
+          (SRAD $rS, $rB)>;
+def : Pat<(srl i64:$rS, i32:$rB),
+          (SRD $rS, $rB)>;
+def : Pat<(shl i64:$rS, i32:$rB),
+          (SLD $rS, $rB)>;
+
+// SHL/SRL
+def : Pat<(shl i64:$in, (i32 imm:$imm)),
+          (RLDICR $in, imm:$imm, (SHL64 imm:$imm))>;
+def : Pat<(srl i64:$in, (i32 imm:$imm)),
+          (RLDICL $in, (SRL64 imm:$imm), imm:$imm)>;
+
+// ROTL
+def : Pat<(rotl i64:$in, i32:$sh),
+          (RLDCL $in, $sh, 0)>;
+def : Pat<(rotl i64:$in, (i32 imm:$imm)),
+          (RLDICL $in, imm:$imm, 0)>;
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI8  tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in , 0), (LI8  tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in , 0), (LI8  tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI8  tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, i64:$in),
+          (ADDIS8 $in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, i64:$in),
+          (ADDI8 $in, tglobaltlsaddr:$g)>;
+def : Pat<(add i64:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS8 $in, tglobaladdr:$g)>;
+def : Pat<(add i64:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS8 $in, tconstpool:$g)>;
+def : Pat<(add i64:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS8 $in, tjumptable:$g)>;
+def : Pat<(add i64:$in, (PPChi tblockaddress:$g, 0)),
+          (ADDIS8 $in, tblockaddress:$g)>;
+
+// Patterns to match r+r indexed loads and stores for
+// addresses without at least 4-byte alignment.
+def : Pat<(i64 (unaligned4sextloadi32 xoaddr:$src)),
+          (LWAX xoaddr:$src)>;
+def : Pat<(i64 (unaligned4load xoaddr:$src)),
+          (LDX xoaddr:$src)>;
+def : Pat<(unaligned4store i64:$rS, xoaddr:$dst),
+          (STDX $rS, xoaddr:$dst)>;
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
new file mode 100644
index 000000000000..a5ba4c8aebef
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -0,0 +1,828 @@
+//===-- PPCInstrAltivec.td - The PowerPC Altivec Extension -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Altivec extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Altivec transformation functions and pattern fragments.
+//
+
+// Since we canonicalize buildvectors to v16i8, all vnots "-1" operands will be
+// of that type.
+def vnot_ppc : PatFrag<(ops node:$in),
+                       (xor node:$in, (bitconvert (v16i8 immAllOnesV)))>;
+
+def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                              (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), false);
+}]>;
+def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                              (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), false);
+}]>;
+def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), true);
+}]>;
+def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), true);
+}]>;
+
+
+def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false);
+}]>;
+def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false);
+}]>;
+def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false);
+}]>;
+def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false);
+}]>;
+def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false);
+}]>;
+def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false);
+}]>;
+
+
+def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                               (vector_shuffle (v16i8 node:$lhs), node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true);
+}]>;
+def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true);
+}]>;
+def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true);
+}]>;
+def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true);
+}]>;
+def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true);
+}]>;
+def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true);
+}]>;
+
+
+def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, false));
+}]>;
+def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, false) != -1;
+}], VSLDOI_get_imm>;
+
+
+/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
+/// vector_shuffle(X,undef,mask) by the dag combiner.
+def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, true));
+}]>;
+def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, true) != -1;
+}], VSLDOI_unary_get_imm>;
+
+
+// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
+def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 1));
+}]>;
+def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
+}], VSPLTB_get_imm>;
+def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 2));
+}]>;
+def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
+}], VSPLTH_get_imm>;
+def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 4));
+}]>;
+def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 4);
+}], VSPLTW_get_imm>;
+
+
+// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm.
+def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG);
+}]>;
+def vecspltisb : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG).getNode() != 0;
+}], VSPLTISB_get_imm>;
+
+// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm.
+def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG);
+}]>;
+def vecspltish : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG).getNode() != 0;
+}], VSPLTISH_get_imm>;
+
+// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm.
+def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG);
+}]>;
+def vecspltisw : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != 0;
+}], VSPLTISW_get_imm>;
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// VA1a_Int_Ty - A VAForm_1a intrinsic definition of specific type.
+class VA1a_Int_Ty<bits<6> xo, string opc, Intrinsic IntID, ValueType Ty>
+  : VAForm_1a<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB, VRRC:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP,
+                       [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB, Ty:$vC))]>;
+
+// VA1a_Int_Ty2 - A VAForm_1a intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VA1a_Int_Ty2<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                   ValueType InTy>
+  : VAForm_1a<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB, VRRC:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP,
+                       [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB, InTy:$vC))]>;
+
+// VA1a_Int_Ty3 - A VAForm_1a intrinsic definition where there are two
+// input types and an output type.
+class VA1a_Int_Ty3<bits<6> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                   ValueType In1Ty, ValueType In2Ty>
+  : VAForm_1a<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB, VRRC:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP,
+                       [(set OutTy:$vD,
+                         (IntID In1Ty:$vA, In1Ty:$vB, In2Ty:$vC))]>;
+
+// VX1_Int_Ty - A VXForm_1 intrinsic definition of specific type.
+class VX1_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
+  : VXForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), VecFP,
+             [(set Ty:$vD, (IntID Ty:$vA, Ty:$vB))]>;
+
+// VX1_Int_Ty2 - A VXForm_1 intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VX1_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                  ValueType InTy>
+  : VXForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), VecFP,
+             [(set OutTy:$vD, (IntID InTy:$vA, InTy:$vB))]>;
+
+// VX1_Int_Ty3 - A VXForm_1 intrinsic definition where there are two
+// input types and an output type.
+class VX1_Int_Ty3<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                  ValueType In1Ty, ValueType In2Ty>
+  : VXForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), VecFP,
+             [(set OutTy:$vD, (IntID In1Ty:$vA, In2Ty:$vB))]>;
+
+// VX2_Int_SP - A VXForm_2 intrinsic definition of vector single-precision type.
+class VX2_Int_SP<bits<11> xo, string opc, Intrinsic IntID>
+  : VXForm_2<xo, (outs VRRC:$vD), (ins VRRC:$vB),
+             !strconcat(opc, " $vD, $vB"), VecFP,
+             [(set v4f32:$vD, (IntID v4f32:$vB))]>;
+
+// VX2_Int_Ty2 - A VXForm_2 intrinsic definition where the type of the
+// inputs doesn't match the type of the output.
+class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
+                  ValueType InTy>
+  : VXForm_2<xo, (outs VRRC:$vD), (ins VRRC:$vB),
+             !strconcat(opc, " $vD, $vB"), VecFP,
+             [(set OutTy:$vD, (IntID InTy:$vB))]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def HasAltivec : Predicate<"PPCSubTarget.hasAltivec()">;
+let Predicates = [HasAltivec] in {
+
+let isCodeGenOnly = 1 in {
+def DSS      : DSS_Form<822, (outs),
+                        (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
+                        "dss $STRM", LdStLoad /*FIXME*/, []>;
+def DSSALL   : DSS_Form<822, (outs),
+                        (ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2),
+                        "dssall", LdStLoad /*FIXME*/, []>;
+def DST      : DSS_Form<342, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+def DSTT     : DSS_Form<342, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+def DSTST    : DSS_Form<374, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+def DSTSTT   : DSS_Form<374, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+
+def DST64    : DSS_Form<342, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+def DSTT64   : DSS_Form<342, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+def DSTST64  : DSS_Form<374, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+def DSTSTT64 : DSS_Form<374, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+}
+
+def MFVSCR : VXForm_4<1540, (outs VRRC:$vD), (ins),
+                      "mfvscr $vD", LdStStore,
+                      [(set v8i16:$vD, (int_ppc_altivec_mfvscr))]>; 
+def MTVSCR : VXForm_5<1604, (outs), (ins VRRC:$vB),
+                      "mtvscr $vB", LdStLoad,
+                      [(int_ppc_altivec_mtvscr v4i32:$vB)]>; 
+
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {  // Loads.
+def LVEBX: XForm_1<31,   7, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvebx $vD, $src", LdStLoad,
+                   [(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
+def LVEHX: XForm_1<31,  39, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvehx $vD, $src", LdStLoad,
+                   [(set v8i16:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
+def LVEWX: XForm_1<31,  71, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvewx $vD, $src", LdStLoad,
+                   [(set v4i32:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
+def LVX  : XForm_1<31, 103, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvx $vD, $src", LdStLoad,
+                   [(set v4i32:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
+def LVXL : XForm_1<31, 359, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvxl $vD, $src", LdStLoad,
+                   [(set v4i32:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
+}
+
+def LVSL : XForm_1<31,   6, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvsl $vD, $src", LdStLoad,
+                   [(set v16i8:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+def LVSR : XForm_1<31,  38, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvsr $vD, $src", LdStLoad,
+                   [(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+
+let PPC970_Unit = 2 in {   // Stores.
+def STVEBX: XForm_8<31, 135, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvebx $rS, $dst", LdStStore,
+                   [(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
+def STVEHX: XForm_8<31, 167, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvehx $rS, $dst", LdStStore,
+                   [(int_ppc_altivec_stvehx v8i16:$rS, xoaddr:$dst)]>;
+def STVEWX: XForm_8<31, 199, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvewx $rS, $dst", LdStStore,
+                   [(int_ppc_altivec_stvewx v4i32:$rS, xoaddr:$dst)]>;
+def STVX  : XForm_8<31, 231, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvx $rS, $dst", LdStStore,
+                   [(int_ppc_altivec_stvx v4i32:$rS, xoaddr:$dst)]>;
+def STVXL : XForm_8<31, 487, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvxl $rS, $dst", LdStStore,
+                   [(int_ppc_altivec_stvxl v4i32:$rS, xoaddr:$dst)]>;
+}
+
+let PPC970_Unit = 5 in {  // VALU Operations.
+// VA-Form instructions.  3-input AltiVec ops.
+def VMADDFP : VAForm_1<46, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB),
+                       "vmaddfp $vD, $vA, $vC, $vB", VecFP,
+                       [(set v4f32:$vD,
+                        (fma v4f32:$vA, v4f32:$vC, v4f32:$vB))]>;
+
+// FIXME: The fma+fneg pattern won't match because fneg is not legal.
+def VNMSUBFP: VAForm_1<47, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB),
+                       "vnmsubfp $vD, $vA, $vC, $vB", VecFP,
+                       [(set v4f32:$vD, (fneg (fma v4f32:$vA, v4f32:$vC,
+                                                  (fneg v4f32:$vB))))]>; 
+
+def VMHADDSHS  : VA1a_Int_Ty<32, "vmhaddshs", int_ppc_altivec_vmhaddshs, v8i16>;
+def VMHRADDSHS : VA1a_Int_Ty<33, "vmhraddshs", int_ppc_altivec_vmhraddshs,
+                             v8i16>;
+def VMLADDUHM  : VA1a_Int_Ty<34, "vmladduhm", int_ppc_altivec_vmladduhm, v8i16>;
+
+def VPERM      : VA1a_Int_Ty3<43, "vperm", int_ppc_altivec_vperm,
+                              v4i32, v4i32, v16i8>;
+def VSEL       : VA1a_Int_Ty<42, "vsel",  int_ppc_altivec_vsel, v4i32>;
+
+// Shuffles.
+def VSLDOI  : VAForm_2<44, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB, u5imm:$SH),
+                       "vsldoi $vD, $vA, $vB, $SH", VecFP,
+                       [(set v16i8:$vD, 
+                         (vsldoi_shuffle:$SH v16i8:$vA, v16i8:$vB))]>;
+
+// VX-Form instructions.  AltiVec arithmetic ops.
+def VADDFP : VXForm_1<10, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vaddfp $vD, $vA, $vB", VecFP,
+                      [(set v4f32:$vD, (fadd v4f32:$vA, v4f32:$vB))]>;
+                      
+def VADDUBM : VXForm_1<0, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vaddubm $vD, $vA, $vB", VecGeneral,
+                      [(set v16i8:$vD, (add v16i8:$vA, v16i8:$vB))]>;
+def VADDUHM : VXForm_1<64, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vadduhm $vD, $vA, $vB", VecGeneral,
+                      [(set v8i16:$vD, (add v8i16:$vA, v8i16:$vB))]>;
+def VADDUWM : VXForm_1<128, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vadduwm $vD, $vA, $vB", VecGeneral,
+                      [(set v4i32:$vD, (add v4i32:$vA, v4i32:$vB))]>;
+                      
+def VADDCUW : VX1_Int_Ty<384, "vaddcuw", int_ppc_altivec_vaddcuw, v4i32>;
+def VADDSBS : VX1_Int_Ty<768, "vaddsbs", int_ppc_altivec_vaddsbs, v16i8>;
+def VADDSHS : VX1_Int_Ty<832, "vaddshs", int_ppc_altivec_vaddshs, v8i16>;
+def VADDSWS : VX1_Int_Ty<896, "vaddsws", int_ppc_altivec_vaddsws, v4i32>;
+def VADDUBS : VX1_Int_Ty<512, "vaddubs", int_ppc_altivec_vaddubs, v16i8>;
+def VADDUHS : VX1_Int_Ty<576, "vadduhs", int_ppc_altivec_vadduhs, v8i16>;
+def VADDUWS : VX1_Int_Ty<640, "vadduws", int_ppc_altivec_vadduws, v4i32>;
+                             
+                             
+def VAND : VXForm_1<1028, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                    "vand $vD, $vA, $vB", VecFP,
+                    [(set v4i32:$vD, (and v4i32:$vA, v4i32:$vB))]>;
+def VANDC : VXForm_1<1092, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                     "vandc $vD, $vA, $vB", VecFP,
+                     [(set v4i32:$vD, (and v4i32:$vA,
+                                           (vnot_ppc v4i32:$vB)))]>;
+
+def VCFSX  : VXForm_1<842, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vcfsx $vD, $vB, $UIMM", VecFP,
+                      [(set v4f32:$vD,
+                             (int_ppc_altivec_vcfsx v4i32:$vB, imm:$UIMM))]>;
+def VCFUX  : VXForm_1<778, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vcfux $vD, $vB, $UIMM", VecFP,
+                      [(set v4f32:$vD,
+                             (int_ppc_altivec_vcfux v4i32:$vB, imm:$UIMM))]>;
+def VCTSXS : VXForm_1<970, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vctsxs $vD, $vB, $UIMM", VecFP,
+                      [(set v4i32:$vD,
+                             (int_ppc_altivec_vctsxs v4f32:$vB, imm:$UIMM))]>;
+def VCTUXS : VXForm_1<906, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vctuxs $vD, $vB, $UIMM", VecFP,
+                      [(set v4i32:$vD,
+                             (int_ppc_altivec_vctuxs v4f32:$vB, imm:$UIMM))]>;
+
+// Defines with the UIM field set to 0 for floating-point
+// to integer (fp_to_sint/fp_to_uint) conversions and integer
+// to floating-point (sint_to_fp/uint_to_fp) conversions.
+let VA = 0 in {
+def VCFSX_0 : VXForm_1<842, (outs VRRC:$vD), (ins VRRC:$vB),
+                       "vcfsx $vD, $vB, 0", VecFP,
+                       [(set v4f32:$vD,
+                             (int_ppc_altivec_vcfsx v4i32:$vB, 0))]>;
+def VCTUXS_0 : VXForm_1<906, (outs VRRC:$vD), (ins VRRC:$vB),
+                        "vctuxs $vD, $vB, 0", VecFP,
+                        [(set v4i32:$vD,
+                               (int_ppc_altivec_vctuxs v4f32:$vB, 0))]>;
+def VCFUX_0 : VXForm_1<778, (outs VRRC:$vD), (ins VRRC:$vB),
+                       "vcfux $vD, $vB, 0", VecFP,
+                       [(set v4f32:$vD,
+                               (int_ppc_altivec_vcfux v4i32:$vB, 0))]>;
+def VCTSXS_0 : VXForm_1<970, (outs VRRC:$vD), (ins VRRC:$vB),
+                      "vctsxs $vD, $vB, 0", VecFP,
+                      [(set v4i32:$vD,
+                             (int_ppc_altivec_vctsxs v4f32:$vB, 0))]>;
+}
+def VEXPTEFP : VX2_Int_SP<394, "vexptefp", int_ppc_altivec_vexptefp>;
+def VLOGEFP  : VX2_Int_SP<458, "vlogefp",  int_ppc_altivec_vlogefp>;
+
+def VAVGSB : VX1_Int_Ty<1282, "vavgsb", int_ppc_altivec_vavgsb, v16i8>;
+def VAVGSH : VX1_Int_Ty<1346, "vavgsh", int_ppc_altivec_vavgsh, v8i16>;
+def VAVGSW : VX1_Int_Ty<1410, "vavgsw", int_ppc_altivec_vavgsw, v4i32>;
+def VAVGUB : VX1_Int_Ty<1026, "vavgub", int_ppc_altivec_vavgub, v16i8>;
+def VAVGUH : VX1_Int_Ty<1090, "vavguh", int_ppc_altivec_vavguh, v8i16>;
+def VAVGUW : VX1_Int_Ty<1154, "vavguw", int_ppc_altivec_vavguw, v4i32>;
+
+def VMAXFP : VX1_Int_Ty<1034, "vmaxfp", int_ppc_altivec_vmaxfp, v4f32>;
+def VMAXSB : VX1_Int_Ty< 258, "vmaxsb", int_ppc_altivec_vmaxsb, v16i8>;
+def VMAXSH : VX1_Int_Ty< 322, "vmaxsh", int_ppc_altivec_vmaxsh, v8i16>;
+def VMAXSW : VX1_Int_Ty< 386, "vmaxsw", int_ppc_altivec_vmaxsw, v4i32>;
+def VMAXUB : VX1_Int_Ty<   2, "vmaxub", int_ppc_altivec_vmaxub, v16i8>;
+def VMAXUH : VX1_Int_Ty<  66, "vmaxuh", int_ppc_altivec_vmaxuh, v8i16>;
+def VMAXUW : VX1_Int_Ty< 130, "vmaxuw", int_ppc_altivec_vmaxuw, v4i32>;
+def VMINFP : VX1_Int_Ty<1098, "vminfp", int_ppc_altivec_vminfp, v4f32>;
+def VMINSB : VX1_Int_Ty< 770, "vminsb", int_ppc_altivec_vminsb, v16i8>;
+def VMINSH : VX1_Int_Ty< 834, "vminsh", int_ppc_altivec_vminsh, v8i16>;
+def VMINSW : VX1_Int_Ty< 898, "vminsw", int_ppc_altivec_vminsw, v4i32>;
+def VMINUB : VX1_Int_Ty< 514, "vminub", int_ppc_altivec_vminub, v16i8>;
+def VMINUH : VX1_Int_Ty< 578, "vminuh", int_ppc_altivec_vminuh, v8i16>;
+def VMINUW : VX1_Int_Ty< 642, "vminuw", int_ppc_altivec_vminuw, v4i32>;
+
+def VMRGHB : VXForm_1< 12, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrghb $vD, $vA, $vB", VecFP,
+                      [(set v16i8:$vD, (vmrghb_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGHH : VXForm_1< 76, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrghh $vD, $vA, $vB", VecFP,
+                      [(set v16i8:$vD, (vmrghh_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGHW : VXForm_1<140, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrghw $vD, $vA, $vB", VecFP,
+                      [(set v16i8:$vD, (vmrghw_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLB : VXForm_1<268, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrglb $vD, $vA, $vB", VecFP,
+                      [(set v16i8:$vD, (vmrglb_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLH : VXForm_1<332, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrglh $vD, $vA, $vB", VecFP,
+                      [(set v16i8:$vD, (vmrglh_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VMRGLW : VXForm_1<396, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrglw $vD, $vA, $vB", VecFP,
+                      [(set v16i8:$vD, (vmrglw_shuffle v16i8:$vA, v16i8:$vB))]>;
+
+def VMSUMMBM : VA1a_Int_Ty3<37, "vmsummbm", int_ppc_altivec_vmsummbm,
+                            v4i32, v16i8, v4i32>;
+def VMSUMSHM : VA1a_Int_Ty3<40, "vmsumshm", int_ppc_altivec_vmsumshm,
+                            v4i32, v8i16, v4i32>;
+def VMSUMSHS : VA1a_Int_Ty3<41, "vmsumshs", int_ppc_altivec_vmsumshs,
+                            v4i32, v8i16, v4i32>;
+def VMSUMUBM : VA1a_Int_Ty3<36, "vmsumubm", int_ppc_altivec_vmsumubm,
+                            v4i32, v16i8, v4i32>;
+def VMSUMUHM : VA1a_Int_Ty3<38, "vmsumuhm", int_ppc_altivec_vmsumuhm,
+                            v4i32, v8i16, v4i32>;
+def VMSUMUHS : VA1a_Int_Ty3<39, "vmsumuhs", int_ppc_altivec_vmsumuhs,
+                            v4i32, v8i16, v4i32>;
+
+def VMULESB : VX1_Int_Ty2<776, "vmulesb", int_ppc_altivec_vmulesb,
+                          v8i16, v16i8>;
+def VMULESH : VX1_Int_Ty2<840, "vmulesh", int_ppc_altivec_vmulesh,
+                          v4i32, v8i16>;
+def VMULEUB : VX1_Int_Ty2<520, "vmuleub", int_ppc_altivec_vmuleub,
+                          v8i16, v16i8>;
+def VMULEUH : VX1_Int_Ty2<584, "vmuleuh", int_ppc_altivec_vmuleuh,
+                          v4i32, v8i16>;
+def VMULOSB : VX1_Int_Ty2<264, "vmulosb", int_ppc_altivec_vmulosb,
+                          v8i16, v16i8>;
+def VMULOSH : VX1_Int_Ty2<328, "vmulosh", int_ppc_altivec_vmulosh,
+                          v4i32, v8i16>;
+def VMULOUB : VX1_Int_Ty2<  8, "vmuloub", int_ppc_altivec_vmuloub,
+                          v8i16, v16i8>;
+def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh,
+                          v4i32, v8i16>;
+                       
+def VREFP     : VX2_Int_SP<266, "vrefp",     int_ppc_altivec_vrefp>;
+def VRFIM     : VX2_Int_SP<714, "vrfim",     int_ppc_altivec_vrfim>;
+def VRFIN     : VX2_Int_SP<522, "vrfin",     int_ppc_altivec_vrfin>;
+def VRFIP     : VX2_Int_SP<650, "vrfip",     int_ppc_altivec_vrfip>;
+def VRFIZ     : VX2_Int_SP<586, "vrfiz",     int_ppc_altivec_vrfiz>;
+def VRSQRTEFP : VX2_Int_SP<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>;
+
+def VSUBCUW : VX1_Int_Ty<74, "vsubcuw", int_ppc_altivec_vsubcuw, v4i32>;
+
+def VSUBFP  : VXForm_1<74, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vsubfp $vD, $vA, $vB", VecGeneral,
+                      [(set v4f32:$vD, (fsub v4f32:$vA, v4f32:$vB))]>;
+def VSUBUBM : VXForm_1<1024, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vsububm $vD, $vA, $vB", VecGeneral,
+                      [(set v16i8:$vD, (sub v16i8:$vA, v16i8:$vB))]>;
+def VSUBUHM : VXForm_1<1088, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vsubuhm $vD, $vA, $vB", VecGeneral,
+                      [(set v8i16:$vD, (sub v8i16:$vA, v8i16:$vB))]>;
+def VSUBUWM : VXForm_1<1152, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vsubuwm $vD, $vA, $vB", VecGeneral,
+                      [(set v4i32:$vD, (sub v4i32:$vA, v4i32:$vB))]>;
+                      
+def VSUBSBS : VX1_Int_Ty<1792, "vsubsbs" , int_ppc_altivec_vsubsbs, v16i8>;
+def VSUBSHS : VX1_Int_Ty<1856, "vsubshs" , int_ppc_altivec_vsubshs, v8i16>;
+def VSUBSWS : VX1_Int_Ty<1920, "vsubsws" , int_ppc_altivec_vsubsws, v4i32>;
+def VSUBUBS : VX1_Int_Ty<1536, "vsububs" , int_ppc_altivec_vsububs, v16i8>;
+def VSUBUHS : VX1_Int_Ty<1600, "vsubuhs" , int_ppc_altivec_vsubuhs, v8i16>;
+def VSUBUWS : VX1_Int_Ty<1664, "vsubuws" , int_ppc_altivec_vsubuws, v4i32>;
+
+def VSUMSWS : VX1_Int_Ty<1928, "vsumsws" , int_ppc_altivec_vsumsws, v4i32>;
+def VSUM2SWS: VX1_Int_Ty<1672, "vsum2sws", int_ppc_altivec_vsum2sws, v4i32>;
+
+def VSUM4SBS: VX1_Int_Ty3<1672, "vsum4sbs", int_ppc_altivec_vsum4sbs,
+                          v4i32, v16i8, v4i32>;
+def VSUM4SHS: VX1_Int_Ty3<1608, "vsum4shs", int_ppc_altivec_vsum4shs,
+                          v4i32, v8i16, v4i32>;
+def VSUM4UBS: VX1_Int_Ty3<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs,
+                          v4i32, v16i8, v4i32>;
+
+def VNOR : VXForm_1<1284, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                    "vnor $vD, $vA, $vB", VecFP,
+                    [(set v4i32:$vD, (vnot_ppc (or v4i32:$vA,
+                                                   v4i32:$vB)))]>;
+def VOR : VXForm_1<1156, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vor $vD, $vA, $vB", VecFP,
+                      [(set v4i32:$vD, (or v4i32:$vA, v4i32:$vB))]>;
+def VXOR : VXForm_1<1220, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vxor $vD, $vA, $vB", VecFP,
+                      [(set v4i32:$vD, (xor v4i32:$vA, v4i32:$vB))]>;
+
+def VRLB   : VX1_Int_Ty<   4, "vrlb", int_ppc_altivec_vrlb, v16i8>;
+def VRLH   : VX1_Int_Ty<  68, "vrlh", int_ppc_altivec_vrlh, v8i16>;
+def VRLW   : VX1_Int_Ty< 132, "vrlw", int_ppc_altivec_vrlw, v4i32>;
+
+def VSL    : VX1_Int_Ty< 452, "vsl" , int_ppc_altivec_vsl,  v4i32 >;
+def VSLO   : VX1_Int_Ty<1036, "vslo", int_ppc_altivec_vslo, v4i32>;
+
+def VSLB   : VX1_Int_Ty< 260, "vslb", int_ppc_altivec_vslb, v16i8>;
+def VSLH   : VX1_Int_Ty< 324, "vslh", int_ppc_altivec_vslh, v8i16>;
+def VSLW   : VX1_Int_Ty< 388, "vslw", int_ppc_altivec_vslw, v4i32>;
+
+def VSPLTB : VXForm_1<524, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vspltb $vD, $vB, $UIMM", VecPerm,
+                      [(set v16i8:$vD,
+                        (vspltb_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+def VSPLTH : VXForm_1<588, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vsplth $vD, $vB, $UIMM", VecPerm,
+                      [(set v16i8:$vD,
+                        (vsplth_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+def VSPLTW : VXForm_1<652, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vspltw $vD, $vB, $UIMM", VecPerm,
+                      [(set v16i8:$vD, 
+                        (vspltw_shuffle:$UIMM v16i8:$vB, (undef)))]>;
+
+def VSR    : VX1_Int_Ty< 708, "vsr"  , int_ppc_altivec_vsr,  v4i32>;
+def VSRO   : VX1_Int_Ty<1100, "vsro" , int_ppc_altivec_vsro, v4i32>;
+
+def VSRAB  : VX1_Int_Ty< 772, "vsrab", int_ppc_altivec_vsrab, v16i8>;
+def VSRAH  : VX1_Int_Ty< 836, "vsrah", int_ppc_altivec_vsrah, v8i16>;
+def VSRAW  : VX1_Int_Ty< 900, "vsraw", int_ppc_altivec_vsraw, v4i32>;
+def VSRB   : VX1_Int_Ty< 516, "vsrb" , int_ppc_altivec_vsrb , v16i8>;
+def VSRH   : VX1_Int_Ty< 580, "vsrh" , int_ppc_altivec_vsrh , v8i16>;
+def VSRW   : VX1_Int_Ty< 644, "vsrw" , int_ppc_altivec_vsrw , v4i32>;
+
+
+def VSPLTISB : VXForm_3<780, (outs VRRC:$vD), (ins s5imm:$SIMM),
+                       "vspltisb $vD, $SIMM", VecPerm,
+                       [(set v16i8:$vD, (v16i8 vecspltisb:$SIMM))]>;
+def VSPLTISH : VXForm_3<844, (outs VRRC:$vD), (ins s5imm:$SIMM),
+                       "vspltish $vD, $SIMM", VecPerm,
+                       [(set v8i16:$vD, (v8i16 vecspltish:$SIMM))]>;
+def VSPLTISW : VXForm_3<908, (outs VRRC:$vD), (ins s5imm:$SIMM),
+                       "vspltisw $vD, $SIMM", VecPerm,
+                       [(set v4i32:$vD, (v4i32 vecspltisw:$SIMM))]>;
+
+// Vector Pack.
+def VPKPX   : VX1_Int_Ty2<782, "vpkpx", int_ppc_altivec_vpkpx,
+                          v8i16, v4i32>;
+def VPKSHSS : VX1_Int_Ty2<398, "vpkshss", int_ppc_altivec_vpkshss,
+                          v16i8, v8i16>;
+def VPKSHUS : VX1_Int_Ty2<270, "vpkshus", int_ppc_altivec_vpkshus,
+                          v16i8, v8i16>;
+def VPKSWSS : VX1_Int_Ty2<462, "vpkswss", int_ppc_altivec_vpkswss,
+                          v16i8, v4i32>;
+def VPKSWUS : VX1_Int_Ty2<334, "vpkswus", int_ppc_altivec_vpkswus,
+                          v8i16, v4i32>;
+def VPKUHUM : VXForm_1<14, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                       "vpkuhum $vD, $vA, $vB", VecFP,
+                       [(set v16i8:$vD,
+                         (vpkuhum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUHUS : VX1_Int_Ty2<142, "vpkuhus", int_ppc_altivec_vpkuhus,
+                          v16i8, v8i16>;
+def VPKUWUM : VXForm_1<78, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                       "vpkuwum $vD, $vA, $vB", VecFP,
+                       [(set v16i8:$vD,
+                         (vpkuwum_shuffle v16i8:$vA, v16i8:$vB))]>;
+def VPKUWUS : VX1_Int_Ty2<206, "vpkuwus", int_ppc_altivec_vpkuwus,
+                          v8i16, v4i32>;
+
+// Vector Unpack.
+def VUPKHPX : VX2_Int_Ty2<846, "vupkhpx", int_ppc_altivec_vupkhpx,
+                          v4i32, v8i16>;
+def VUPKHSB : VX2_Int_Ty2<526, "vupkhsb", int_ppc_altivec_vupkhsb,
+                          v8i16, v16i8>;
+def VUPKHSH : VX2_Int_Ty2<590, "vupkhsh", int_ppc_altivec_vupkhsh,
+                          v4i32, v8i16>;
+def VUPKLPX : VX2_Int_Ty2<974, "vupklpx", int_ppc_altivec_vupklpx,
+                          v4i32, v8i16>;
+def VUPKLSB : VX2_Int_Ty2<654, "vupklsb", int_ppc_altivec_vupklsb,
+                          v8i16, v16i8>;
+def VUPKLSH : VX2_Int_Ty2<718, "vupklsh", int_ppc_altivec_vupklsh,
+                          v4i32, v8i16>;
+
+
+// Altivec Comparisons.
+
+class VCMP<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),asmstr,VecFPCompare,
+              [(set Ty:$vD, (Ty (PPCvcmp Ty:$vA, Ty:$vB, xo)))]>;
+class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),asmstr,VecFPCompare,
+              [(set Ty:$vD, (Ty (PPCvcmp_o Ty:$vA, Ty:$vB, xo)))]> {
+  let Defs = [CR6];
+  let RC = 1;
+}
+
+// f32 element comparisons.0
+def VCMPBFP   : VCMP <966, "vcmpbfp $vD, $vA, $vB"  , v4f32>;
+def VCMPBFPo  : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPEQFP  : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
+def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP  : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
+def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP  : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
+def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+
+// i8 element comparisons.
+def VCMPEQUB  : VCMP <  6, "vcmpequb $vD, $vA, $vB" , v16i8>;
+def VCMPEQUBo : VCMPo<  6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB  : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
+def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB  : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
+def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPEQUH  : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
+def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH  : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
+def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH  : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
+def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPEQUW  : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
+def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW  : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
+def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW  : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
+def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
+                      
+let isCodeGenOnly = 1 in
+def V_SET0 : VXForm_setzero<1220, (outs VRRC:$vD), (ins),
+                      "vxor $vD, $vD, $vD", VecFP,
+                      [(set v4i32:$vD, (v4i32 immAllZerosV))]>;
+let IMM=-1 in {
+def V_SETALLONES : VXForm_3<908, (outs VRRC:$vD), (ins),
+                      "vspltisw $vD, -1", VecFP,
+                      [(set v4i32:$vD, (v4i32 immAllOnesV))]>;
+}
+} // VALU Operations.
+
+//===----------------------------------------------------------------------===//
+// Additional Altivec Patterns
+//
+
+// DS* intrinsics
+def : Pat<(int_ppc_altivec_dssall), (DSSALL 1, 0, 0, 0)>;
+def : Pat<(int_ppc_altivec_dss imm:$STRM), (DSS 0, imm:$STRM, 0, 0)>;
+
+//  * 32-bit
+def : Pat<(int_ppc_altivec_dst i32:$rA, i32:$rB, imm:$STRM),
+          (DST 0, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dstt i32:$rA, i32:$rB, imm:$STRM),
+          (DSTT 1, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dstst i32:$rA, i32:$rB, imm:$STRM),
+          (DSTST 0, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dststt i32:$rA, i32:$rB, imm:$STRM),
+          (DSTSTT 1, imm:$STRM, $rA, $rB)>;
+
+//  * 64-bit
+def : Pat<(int_ppc_altivec_dst i64:$rA, i32:$rB, imm:$STRM),
+          (DST64 0, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dstt i64:$rA, i32:$rB, imm:$STRM),
+          (DSTT64 1, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dstst i64:$rA, i32:$rB, imm:$STRM),
+          (DSTST64 0, imm:$STRM, $rA, $rB)>;
+def : Pat<(int_ppc_altivec_dststt i64:$rA, i32:$rB, imm:$STRM),
+          (DSTSTT64 1, imm:$STRM, $rA, $rB)>;
+
+// Loads.
+def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
+
+// Stores.
+def : Pat<(store v4i32:$rS, xoaddr:$dst),
+          (STVX $rS, xoaddr:$dst)>;
+
+// Bit conversions.
+def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
+
+// Shuffles.
+
+// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x)
+def:Pat<(vsldoi_unary_shuffle:$in v16i8:$vA, undef),
+        (VSLDOI $vA, $vA, (VSLDOI_unary_get_imm $in))>;
+def:Pat<(vpkuwum_unary_shuffle v16i8:$vA, undef),
+        (VPKUWUM $vA, $vA)>;
+def:Pat<(vpkuhum_unary_shuffle v16i8:$vA, undef),
+        (VPKUHUM $vA, $vA)>;
+
+// Match vmrg*(x,x)
+def:Pat<(vmrglb_unary_shuffle v16i8:$vA, undef),
+        (VMRGLB $vA, $vA)>;
+def:Pat<(vmrglh_unary_shuffle v16i8:$vA, undef),
+        (VMRGLH $vA, $vA)>;
+def:Pat<(vmrglw_unary_shuffle v16i8:$vA, undef),
+        (VMRGLW $vA, $vA)>;
+def:Pat<(vmrghb_unary_shuffle v16i8:$vA, undef),
+        (VMRGHB $vA, $vA)>;
+def:Pat<(vmrghh_unary_shuffle v16i8:$vA, undef),
+        (VMRGHH $vA, $vA)>;
+def:Pat<(vmrghw_unary_shuffle v16i8:$vA, undef),
+        (VMRGHW $vA, $vA)>;
+
+// Logical Operations
+def : Pat<(vnot_ppc v4i32:$vA), (VNOR $vA, $vA)>;
+
+def : Pat<(vnot_ppc (or v4i32:$A, v4i32:$B)),
+          (VNOR $A, $B)>;
+def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)),
+          (VANDC $A, $B)>;
+
+def : Pat<(fmul v4f32:$vA, v4f32:$vB),
+          (VMADDFP $vA, $vB,
+             (v4i32 (VSLW (V_SETALLONES), (V_SETALLONES))))>; 
+
+// Fused multiply add and multiply sub for packed float.  These are represented
+// separately from the real instructions above, for operations that must have
+// the additional precision, such as Newton-Rhapson (used by divide, sqrt)
+def : Pat<(PPCvmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VMADDFP $A, $B, $C)>;
+def : Pat<(PPCvnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VNMSUBFP $A, $B, $C)>;
+
+def : Pat<(int_ppc_altivec_vmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VMADDFP $A, $B, $C)>;
+def : Pat<(int_ppc_altivec_vnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+          (VNMSUBFP $A, $B, $C)>;
+
+def : Pat<(PPCvperm v16i8:$vA, v16i8:$vB, v16i8:$vC),
+          (VPERM $vA, $vB, $vC)>;
+
+def : Pat<(PPCfre v4f32:$A), (VREFP $A)>;
+def : Pat<(PPCfrsqrte v4f32:$A), (VRSQRTEFP $A)>;
+
+// Vector shifts
+def : Pat<(v16i8 (shl v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSLB $vA, $vB))>;
+def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSLH $vA, $vB))>;
+def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSLW $vA, $vB))>;
+
+def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSRB $vA, $vB))>;
+def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSRH $vA, $vB))>;
+def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSRW $vA, $vB))>;
+
+def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
+          (v16i8 (VSRAB $vA, $vB))>;
+def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
+          (v8i16 (VSRAH $vA, $vB))>;
+def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
+          (v4i32 (VSRAW $vA, $vB))>;
+
+// Float to integer and integer to float conversions
+def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
+           (VCTSXS_0 $vA)>;
+def : Pat<(v4i32 (fp_to_uint v4f32:$vA)),
+           (VCTUXS_0 $vA)>;
+def : Pat<(v4f32 (sint_to_fp v4i32:$vA)),
+           (VCFSX_0 $vA)>;
+def : Pat<(v4f32 (uint_to_fp v4i32:$vA)),
+           (VCFUX_0 $vA)>;
+
+// Floating-point rounding
+def : Pat<(v4f32 (ffloor v4f32:$vA)),
+          (VRFIM $vA)>;
+def : Pat<(v4f32 (fceil v4f32:$vA)),
+          (VRFIP $vA)>;
+def : Pat<(v4f32 (ftrunc v4f32:$vA)),
+          (VRFIZ $vA)>;
+def : Pat<(v4f32 (fnearbyint v4f32:$vA)),
+          (VRFIN $vA)>;
+
+} // end HasAltivec
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
new file mode 100644
index 000000000000..b424d1101416
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -0,0 +1,43 @@
+//===-- PPCInstrBuilder.h - Aides for building PPC insts --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_INSTRBUILDER_H
+#define POWERPC_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+                  bool mem = true) {
+  if (mem)
+    return MIB.addImm(Offset).addFrameIndex(FI);
+  else
+    return MIB.addFrameIndex(FI).addImm(Offset);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
new file mode 100644
index 000000000000..400b7e367bfe
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -0,0 +1,1003 @@
+//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// PowerPC instruction formats
+
+class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+        : Instruction {
+  field bits<32> Inst;
+
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let Namespace = "PPC";
+  let Inst{0-5} = opcode;
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let AsmString = asmstr;
+  let Itinerary = itin;
+
+  bits<1> PPC970_First = 0;
+  bits<1> PPC970_Single = 0;
+  bits<1> PPC970_Cracked = 0;
+  bits<3> PPC970_Unit = 0;
+
+  /// These fields correspond to the fields in PPCInstrInfo.h.  Any changes to
+  /// these must be reflected there!  See comments there for what these are.
+  let TSFlags{0}   = PPC970_First;
+  let TSFlags{1}   = PPC970_Single;
+  let TSFlags{2}   = PPC970_Cracked;
+  let TSFlags{5-3} = PPC970_Unit;
+}
+
+class PPC970_DGroup_First   { bits<1> PPC970_First = 1;  }
+class PPC970_DGroup_Single  { bits<1> PPC970_Single = 1; }
+class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; }
+class PPC970_MicroCode;
+
+class PPC970_Unit_Pseudo   { bits<3> PPC970_Unit = 0;   }
+class PPC970_Unit_FXU      { bits<3> PPC970_Unit = 1;   }
+class PPC970_Unit_LSU      { bits<3> PPC970_Unit = 2;   }
+class PPC970_Unit_FPU      { bits<3> PPC970_Unit = 3;   }
+class PPC970_Unit_CRU      { bits<3> PPC970_Unit = 4;   }
+class PPC970_Unit_VALU     { bits<3> PPC970_Unit = 5;   }
+class PPC970_Unit_VPERM    { bits<3> PPC970_Unit = 6;   }
+class PPC970_Unit_BRU      { bits<3> PPC970_Unit = 7;   }
+
+// Two joined instructions; used to emit two adjacent instructions as one.
+// The itinerary from the first instruction is used for scheduling and
+// classification.
+class I2<bits<6> opcode1, bits<6> opcode2, dag OOL, dag IOL, string asmstr,
+         InstrItinClass itin>
+        : Instruction {
+  field bits<64> Inst;
+
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let Namespace = "PPC";
+  let Inst{0-5} = opcode1;
+  let Inst{32-37} = opcode2;
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let AsmString = asmstr;
+  let Itinerary = itin;
+
+  bits<1> PPC970_First = 0;
+  bits<1> PPC970_Single = 0;
+  bits<1> PPC970_Cracked = 0;
+  bits<3> PPC970_Unit = 0;
+
+  /// These fields correspond to the fields in PPCInstrInfo.h.  Any changes to
+  /// these must be reflected there!  See comments there for what these are.
+  let TSFlags{0}   = PPC970_First;
+  let TSFlags{1}   = PPC970_Single;
+  let TSFlags{2}   = PPC970_Cracked;
+  let TSFlags{5-3} = PPC970_Unit;
+}
+
+// 1.7.1 I-Form
+class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  bits<24> LI;
+
+  let Inst{6-29}  = LI;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+// 1.7.2 B-Form
+class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, BrB> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  bits<14> BD;
+
+  bits<5> BI;
+  let BI{0-1} = BIBO{5-6};
+  let BI{2-4} = CR{0-2};
+
+  let Inst{6-10}  = BIBO{4-0};
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
+             string asmstr>
+  : BForm<opcode, aa, lk, OOL, IOL, asmstr> {
+  let BIBO{4-0} = bo;
+  let BIBO{6-5} = 0;
+  let CR = 0;
+}
+
+class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
+              dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, BrB> {
+  bits<14> BD;
+
+  let Inst{6-10}  = bo;
+  let Inst{11-15} = bi;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+// 1.7.4 D-Form
+class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<5>  B;
+  bits<16> C;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<21> Addr;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = Addr{20-16}; // Base Reg
+  let Inst{16-31} = Addr{15-0};  // Displacement
+}
+
+class DForm_1a<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> C;
+  bits<5>  B;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+
+class DForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : DForm_base<opcode, OOL, IOL, asmstr, itin, pattern>;
+
+class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> B;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = 0;
+  let Inst{16-31} = B;
+}
+
+class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  B;
+  bits<5>  A;
+  bits<16> C;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+              
+class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin, list<dag> pattern>
+  : DForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let Addr = 0;
+}
+
+class IForm_and_DForm_1<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
+            dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         : I2<opcode1, opcode2, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<21> Addr;
+
+  let Pattern = pattern;
+  bits<24> LI;
+
+  let Inst{6-29}  = LI;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+
+  let Inst{38-42}  = A;
+  let Inst{43-47} = Addr{20-16}; // Base Reg
+  let Inst{48-63} = Addr{15-0};  // Displacement
+}
+
+// This is used to emit BL8+NOP.
+class IForm_and_DForm_4_zero<bits<6> opcode1, bit aa, bit lk, bits<6> opcode2,
+            dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         :  IForm_and_DForm_1<opcode1, aa, lk, opcode2,
+                              OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let Addr = 0;
+}
+
+class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3>  BF;
+  bits<1>  L;
+  bits<5>  RA;
+  bits<16> I;
+
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-31} = I;
+}
+
+class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                  InstrItinClass itin>
+  : DForm_5<opcode, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class DForm_6<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin> 
+  : DForm_5<opcode, OOL, IOL, asmstr, itin>;
+
+class DForm_6_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                  InstrItinClass itin>
+  : DForm_6<opcode, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+
+// 1.7.5 DS-Form
+class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RST;
+  bits<19> DS_RA;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = DS_RA{18-14};  // Register #
+  let Inst{16-29} = DS_RA{13-0};   // Displacement.
+  let Inst{30-31} = xo;
+}
+
+class DSForm_1a<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+   bits<5>  RST;
+   bits<14> DS;
+   bits<5>  RA;
+ 
+   let Pattern = pattern;
+   
+   let Inst{6-10}  = RST;
+   let Inst{11-15} = RA;
+   let Inst{16-29} = DS;
+   let Inst{30-31} = xo;
+}
+
+// 1.7.6 X-Form
+class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RST;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// This is the same as XForm_base_r3xo, but the first two operands are swapped
+// when code is emitted.
+class XForm_base_r3xo_swapped
+        <bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+        InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RST;
+  bits<5> B;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+
+class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+}
+
+class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+    let Pattern = pattern;
+}
+
+class XForm_11<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+  let B = 0;
+  let Pattern = pattern;
+}
+
+class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<1> L; 
+  bits<5> RA;
+  bits<5> RB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin>
+  : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<5> FRA;
+  bits<5> FRB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  let Inst{6-10}  = 31;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+               string asmstr, InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  let Inst{6-10}  = 0;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+}
+
+class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// This is used for MFFS, MTFSB0, MTFSB1.  42 is arbitrary; this series of
+// numbers presumably relates to some document, but I haven't found it.
+class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let Pattern = pattern;
+  bits<5> FM;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FM;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// DCB_Form - Form X instruction, used for dcb* instructions.
+class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = immfield;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+
+// DSS_Form - Form X instruction, used for altivec dss* instructions.
+class DSS_Form<bits<10> xo, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OOL, IOL, asmstr, itin> {
+  bits<1> T;
+  bits<2> STRM;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6}     = T;
+  let Inst{7-8}   = 0;
+  let Inst{9-10}  = STRM;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.7 XL-Form
+class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> CRD;
+  bits<5> CRA;
+  bits<5> CRB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRA;
+  let Inst{16-20} = CRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> CRD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRD;
+  let Inst{16-20} = CRD;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asmstr, 
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<2> BH;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-18} = 0;
+  let Inst{19-20} = BH;
+  let Inst{21-30} = xo;
+  let Inst{31}    = lk;
+}
+
+class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
+                  dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  
+  let BO = BIBO{2-6};
+  let BI{0-1} = BIBO{0-1};
+  let BI{2-4} = CR;
+  let BH = 0;
+}
+
+
+class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo,  bits<5> bi, bit lk,
+                  dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  let BO = bo;
+  let BI = bi;
+  let BH = 0;
+}
+
+class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<3> BFA;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-13} = BFA;
+  let Inst{14-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.8 XFX-Form
+class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RT;
+  bits<10> SPR;
+
+  let Inst{6-10}  = RT;
+  let Inst{11}    = SPR{4};
+  let Inst{12}    = SPR{3};
+  let Inst{13}    = SPR{2};
+  let Inst{14}    = SPR{1};
+  let Inst{15}    = SPR{0};
+  let Inst{16}    = SPR{9};
+  let Inst{17}    = SPR{8};
+  let Inst{18}    = SPR{7};
+  let Inst{19}    = SPR{6};
+  let Inst{20}    = SPR{5};
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                   dag OOL, dag IOL, string asmstr, InstrItinClass itin> 
+  : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin> {
+  let SPR = spr;
+}
+
+class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RT;
+   
+  let Inst{6-10}  = RT;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<8>  FXM;
+  bits<5>  rS;
+   
+  let Inst{6-10}  = rS;
+  let Inst{11}    = 0;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  ST;
+  bits<8>  FXM;
+   
+  let Inst{6-10}  = ST;
+  let Inst{11}    = 1;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_7<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+  : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin>;
+
+class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                    dag OOL, dag IOL, string asmstr, InstrItinClass itin> 
+  : XFXForm_7<opcode, xo, OOL, IOL, asmstr, itin> {
+  let SPR = spr;
+}
+
+// XFL-Form - MTFSF
+// This is probably 1.7.9, but I don't have the reference that uses this
+// numbering scheme...
+class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag>pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<8> FM;
+  bits<5> rT;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6} = 0;
+  let Inst{7-14}  = FM;
+  let Inst{15} = 0;
+  let Inst{16-20} = rT;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// 1.7.10 XS-Form - SRADI.
+class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RS;
+  bits<6> SH;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = A;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+// 1.7.11 XO-Form
+class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21}    = oe;
+  let Inst{22-30} = xo;
+  let Inst{31}    = RC;  
+}
+
+class XOForm_3<bits<6> opcode, bits<9> xo, bit oe, 
+               dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XOForm_1<opcode, xo, oe, OOL, IOL, asmstr, itin, pattern> {
+  let RB = 0;
+}
+
+// 1.7.12 A-Form
+class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRC;
+  bits<5> FRB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-25} = FRC;
+  let Inst{26-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class AForm_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRC = 0;
+}
+
+class AForm_3<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRB = 0;
+}
+
+class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+  bits<5> COND;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-25} = COND;
+  let Inst{26-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.13 M-Form
+class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<5> RB;
+  bits<5> MB;
+  bits<5> ME;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-25} = MB;
+  let Inst{26-30} = ME;
+  let Inst{31}    = RC;
+}
+
+class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : MForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// 1.7.14 MD-Form
+class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<6> SH;
+  bits<6> MBE;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-26} = MBE{4,3,2,1,0,5};
+  let Inst{27-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+
+
+// E-1 VA-Form
+
+// VAForm_1 - DACB ordering.
+class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VC;
+  bits<5> VB;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+// VAForm_1a - DABC ordering.
+class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<5> VC;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<4> SH;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = 0;
+  let Inst{22-25} = SH;
+  let Inst{26-31} = xo;
+}
+
+// E-2 VX-Form
+class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : VXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+  let VA = VD;
+  let VB = VD;
+}
+
+
+class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> IMM;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = IMM;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr.
+class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr.
+class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = 0;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+// E-4 VXR-Form
+class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bit RC = 0;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = RC;
+  let Inst{22-31} = xo;
+}
+
+//===----------------------------------------------------------------------===//
+class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+    : I<0, OOL, IOL, asmstr, NoItinerary> {
+  let isCodeGenOnly = 1;
+  let PPC64 = 0;
+  let Pattern = pattern;
+  let Inst{31-0} = 0;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
new file mode 100644
index 000000000000..69c54ed084be
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -0,0 +1,731 @@
+//===-- PPCInstrInfo.cpp - PowerPC Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCHazardRecognizers.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define GET_INSTRINFO_CTOR
+#include "PPCGenInstrInfo.inc"
+
+using namespace llvm;
+
+static cl::
+opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
+            cl::desc("Disable analysis for CTR loops"));
+
+PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
+  : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+    TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
+
+/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+/// this target when scheduling the DAG.
+ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer(
+  const TargetMachine *TM,
+  const ScheduleDAG *DAG) const {
+  unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective();
+  if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
+      Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
+    const InstrItineraryData *II = TM->getInstrItineraryData();
+    return new PPCScoreboardHazardRecognizer(II, DAG);
+  }
+
+  return TargetInstrInfo::CreateTargetHazardRecognizer(TM, DAG);
+}
+
+/// CreateTargetPostRAHazardRecognizer - Return the postRA hazard recognizer
+/// to use for this target when scheduling the DAG.
+ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
+  const InstrItineraryData *II,
+  const ScheduleDAG *DAG) const {
+  unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
+
+  // Most subtargets use a PPC970 recognizer.
+  if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
+      Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
+    const TargetInstrInfo *TII = TM.getInstrInfo();
+    assert(TII && "No InstrInfo?");
+
+    return new PPCHazardRecognizer970(*TII);
+  }
+
+  return new PPCScoreboardHazardRecognizer(II, DAG);
+}
+
+// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
+bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                         unsigned &SrcReg, unsigned &DstReg,
+                                         unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default: return false;
+  case PPC::EXTSW:
+  case PPC::EXTSW_32_64:
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    SubIdx = PPC::sub_32;
+    return true;
+  }
+}
+
+unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                           int &FrameIndex) const {
+  // Note: This list must be kept consistent with LoadRegFromStackSlot.
+  switch (MI->getOpcode()) {
+  default: break;
+  case PPC::LD:
+  case PPC::LWZ:
+  case PPC::LFS:
+  case PPC::LFD:
+  case PPC::RESTORE_CR:
+  case PPC::LVX:
+  case PPC::RESTORE_VRSAVE:
+    // Check for the operands added by addFrameReference (the immediate is the
+    // offset which defaults to 0).
+    if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
+        MI->getOperand(2).isFI()) {
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                          int &FrameIndex) const {
+  // Note: This list must be kept consistent with StoreRegToStackSlot.
+  switch (MI->getOpcode()) {
+  default: break;
+  case PPC::STD:
+  case PPC::STW:
+  case PPC::STFS:
+  case PPC::STFD:
+  case PPC::SPILL_CR:
+  case PPC::STVX:
+  case PPC::SPILL_VRSAVE:
+    // Check for the operands added by addFrameReference (the immediate is the
+    // offset which defaults to 0).
+    if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
+        MI->getOperand(2).isFI()) {
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+// commuteInstruction - We can commute rlwimi instructions, but only if the
+// rotate amt is zero.  We also have to munge the immediates a bit.
+MachineInstr *
+PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+  MachineFunction &MF = *MI->getParent()->getParent();
+
+  // Normal instructions can be commuted the obvious way.
+  if (MI->getOpcode() != PPC::RLWIMI)
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+
+  // Cannot commute if it has a non-zero rotate count.
+  if (MI->getOperand(3).getImm() != 0)
+    return 0;
+
+  // If we have a zero rotate count, we have:
+  //   M = mask(MB,ME)
+  //   Op0 = (Op1 & ~M) | (Op2 & M)
+  // Change this to:
+  //   M = mask((ME+1)&31, (MB-1)&31)
+  //   Op0 = (Op2 & ~M) | (Op1 & M)
+
+  // Swap op1/op2
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  unsigned Reg2 = MI->getOperand(2).getReg();
+  bool Reg1IsKill = MI->getOperand(1).isKill();
+  bool Reg2IsKill = MI->getOperand(2).isKill();
+  bool ChangeReg0 = false;
+  // If machine instrs are no longer in two-address forms, update
+  // destination register as well.
+  if (Reg0 == Reg1) {
+    // Must be two address instruction!
+    assert(MI->getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
+           "Expecting a two-address instruction!");
+    Reg2IsKill = false;
+    ChangeReg0 = true;
+  }
+
+  // Masks.
+  unsigned MB = MI->getOperand(4).getImm();
+  unsigned ME = MI->getOperand(5).getImm();
+
+  if (NewMI) {
+    // Create a new instruction.
+    unsigned Reg0 = ChangeReg0 ? Reg2 : MI->getOperand(0).getReg();
+    bool Reg0IsDead = MI->getOperand(0).isDead();
+    return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
+      .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
+      .addReg(Reg2, getKillRegState(Reg2IsKill))
+      .addReg(Reg1, getKillRegState(Reg1IsKill))
+      .addImm((ME+1) & 31)
+      .addImm((MB-1) & 31);
+  }
+
+  if (ChangeReg0)
+    MI->getOperand(0).setReg(Reg2);
+  MI->getOperand(2).setReg(Reg1);
+  MI->getOperand(1).setReg(Reg2);
+  MI->getOperand(2).setIsKill(Reg1IsKill);
+  MI->getOperand(1).setIsKill(Reg2IsKill);
+
+  // Swap the mask around.
+  MI->getOperand(4).setImm((ME+1) & 31);
+  MI->getOperand(5).setImm((MB-1) & 31);
+  return MI;
+}
+
+void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI) const {
+  DebugLoc DL;
+  BuildMI(MBB, MI, DL, get(PPC::NOP));
+}
+
+
+// Branch analysis.
+// Note: If the condition register is set to CTR or CTR8 then this is a
+// BDNZ (imm == 1) or BDZ (imm == 0) branch.
+bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastInst->getOpcode() == PPC::B) {
+      if (!LastInst->getOperand(0).isMBB())
+        return true;
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BCC) {
+      if (!LastInst->getOperand(2).isMBB())
+        return true;
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(2).getMBB();
+      Cond.push_back(LastInst->getOperand(0));
+      Cond.push_back(LastInst->getOperand(1));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BDNZ8 ||
+               LastInst->getOpcode() == PPC::BDNZ) {
+      if (!LastInst->getOperand(0).isMBB())
+        return true;
+      if (DisableCTRLoopAnal)
+        return true;
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(1));
+      Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                               true));
+      return false;
+    } else if (LastInst->getOpcode() == PPC::BDZ8 ||
+               LastInst->getOpcode() == PPC::BDZ) {
+      if (!LastInst->getOperand(0).isMBB())
+        return true;
+      if (DisableCTRLoopAnal)
+        return true;
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(0));
+      Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                               true));
+      return false;
+    }
+
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with PPC::B and PPC:BCC, handle it.
+  if (SecondLastInst->getOpcode() == PPC::BCC &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(2).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    TBB =  SecondLastInst->getOperand(2).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    Cond.push_back(SecondLastInst->getOperand(1));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst->getOpcode() == PPC::BDNZ8 ||
+              SecondLastInst->getOpcode() == PPC::BDNZ) &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(1));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  } else if ((SecondLastInst->getOpcode() == PPC::BDZ8 ||
+              SecondLastInst->getOpcode() == PPC::BDZ) &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB() ||
+        !LastInst->getOperand(0).isMBB())
+      return true;
+    if (DisableCTRLoopAnal)
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(0));
+    Cond.push_back(MachineOperand::CreateReg(isPPC64 ? PPC::CTR8 : PPC::CTR,
+                                             true));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two PPC:Bs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == PPC::B &&
+      LastInst->getOpcode() == PPC::B) {
+    if (!SecondLastInst->getOperand(0).isMBB())
+      return true;
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+  if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC &&
+      I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+      I->getOpcode() != PPC::BDZ8  && I->getOpcode() != PPC::BDZ)
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (I->getOpcode() != PPC::BCC &&
+      I->getOpcode() != PPC::BDNZ8 && I->getOpcode() != PPC::BDNZ &&
+      I->getOpcode() != PPC::BDZ8  && I->getOpcode() != PPC::BDZ)
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned
+PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           MachineBasicBlock *FBB,
+                           const SmallVectorImpl<MachineOperand> &Cond,
+                           DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "PPC branch conditions have two components!");
+
+  bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
+
+  // One-way branch.
+  if (FBB == 0) {
+    if (Cond.empty())   // Unconditional branch
+      BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB);
+    else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+      BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+                              (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                              (isPPC64 ? PPC::BDZ8  : PPC::BDZ))).addMBB(TBB);
+    else                // Conditional branch
+      BuildMI(&MBB, DL, get(PPC::BCC))
+        .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+    return 1;
+  }
+
+  // Two-way Conditional Branch.
+  if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
+    BuildMI(&MBB, DL, get(Cond[0].getImm() ?
+                            (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                            (isPPC64 ? PPC::BDZ8  : PPC::BDZ))).addMBB(TBB);
+  else
+    BuildMI(&MBB, DL, get(PPC::BCC))
+      .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+  BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB);
+  return 2;
+}
+
+void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, DebugLoc DL,
+                               unsigned DestReg, unsigned SrcReg,
+                               bool KillSrc) const {
+  unsigned Opc;
+  if (PPC::GPRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR;
+  else if (PPC::G8RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::OR8;
+  else if (PPC::F4RCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::FMR;
+  else if (PPC::CRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::MCRF;
+  else if (PPC::VRRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::VOR;
+  else if (PPC::CRBITRCRegClass.contains(DestReg, SrcReg))
+    Opc = PPC::CROR;
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+
+  const MCInstrDesc &MCID = get(Opc);
+  if (MCID.getNumOperands() == 3)
+    BuildMI(MBB, I, DL, MCID, DestReg)
+      .addReg(SrcReg).addReg(SrcReg, getKillRegState(KillSrc));
+  else
+    BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+// This function returns true if a CR spill is necessary and false otherwise.
+bool
+PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
+                                  unsigned SrcReg, bool isKill,
+                                  int FrameIdx,
+                                  const TargetRegisterClass *RC,
+                                  SmallVectorImpl<MachineInstr*> &NewMIs,
+                                  bool &NonRI, bool &SpillsVRS) const{
+  // Note: If additional store instructions are added here,
+  // update isStoreToStackSlot.
+
+  DebugLoc DL;
+  if (PPC::GPRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+  } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+    // FIXME: We use CRi here because there is no mtcrf on a bit. Since the
+    // backend currently only uses CR1EQ as an individual bit, this should
+    // not cause any bug. If we need other uses of CR bits, the following
+    // code may be invalid.
+    unsigned Reg = 0;
+    if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT ||
+        SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN)
+      Reg = PPC::CR0;
+    else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT ||
+             SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN)
+      Reg = PPC::CR1;
+    else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT ||
+             SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN)
+      Reg = PPC::CR2;
+    else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT ||
+             SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN)
+      Reg = PPC::CR3;
+    else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT ||
+             SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN)
+      Reg = PPC::CR4;
+    else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT ||
+             SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN)
+      Reg = PPC::CR5;
+    else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT ||
+             SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN)
+      Reg = PPC::CR6;
+    else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT ||
+             SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN)
+      Reg = PPC::CR7;
+
+    return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx,
+                               &PPC::CRRCRegClass, NewMIs, NonRI, SpillsVRS);
+
+  } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STVX))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+    assert(TM.getSubtargetImpl()->isDarwin() &&
+           "VRSAVE only needs spill/restore on Darwin");
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_VRSAVE))
+                                       .addReg(SrcReg,
+                                               getKillRegState(isKill)),
+                                       FrameIdx));
+    SpillsVRS = true;
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+
+  return false;
+}
+
+void
+PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  unsigned SrcReg, bool isKill, int FrameIdx,
+                                  const TargetRegisterClass *RC,
+                                  const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  SmallVector<MachineInstr*, 4> NewMIs;
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  bool NonRI = false, SpillsVRS = false;
+  if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs,
+                          NonRI, SpillsVRS))
+    FuncInfo->setSpillsCR();
+
+  if (SpillsVRS)
+    FuncInfo->setSpillsVRSAVE();
+
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
+
+  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+    MBB.insert(MI, NewMIs[i]);
+
+  const MachineFrameInfo &MFI = *MF.getFrameInfo();
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOStore,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+  NewMIs.back()->addMemOperand(MF, MMO);
+}
+
+bool
+PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
+                                   unsigned DestReg, int FrameIdx,
+                                   const TargetRegisterClass *RC,
+                                   SmallVectorImpl<MachineInstr*> &NewMIs,
+                                   bool &NonRI, bool &SpillsVRS) const{
+  // Note: If additional load instructions are added here,
+  // update isLoadFromStackSlot.
+
+  if (PPC::GPRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
+                                               DestReg), FrameIdx));
+  } else if (PPC::G8RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
+                                       FrameIdx));
+  } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
+                                       FrameIdx));
+  } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
+                                       FrameIdx));
+  } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+                                               get(PPC::RESTORE_CR), DestReg),
+                                       FrameIdx));
+    return true;
+  } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+
+    unsigned Reg = 0;
+    if (DestReg == PPC::CR0LT || DestReg == PPC::CR0GT ||
+        DestReg == PPC::CR0EQ || DestReg == PPC::CR0UN)
+      Reg = PPC::CR0;
+    else if (DestReg == PPC::CR1LT || DestReg == PPC::CR1GT ||
+             DestReg == PPC::CR1EQ || DestReg == PPC::CR1UN)
+      Reg = PPC::CR1;
+    else if (DestReg == PPC::CR2LT || DestReg == PPC::CR2GT ||
+             DestReg == PPC::CR2EQ || DestReg == PPC::CR2UN)
+      Reg = PPC::CR2;
+    else if (DestReg == PPC::CR3LT || DestReg == PPC::CR3GT ||
+             DestReg == PPC::CR3EQ || DestReg == PPC::CR3UN)
+      Reg = PPC::CR3;
+    else if (DestReg == PPC::CR4LT || DestReg == PPC::CR4GT ||
+             DestReg == PPC::CR4EQ || DestReg == PPC::CR4UN)
+      Reg = PPC::CR4;
+    else if (DestReg == PPC::CR5LT || DestReg == PPC::CR5GT ||
+             DestReg == PPC::CR5EQ || DestReg == PPC::CR5UN)
+      Reg = PPC::CR5;
+    else if (DestReg == PPC::CR6LT || DestReg == PPC::CR6GT ||
+             DestReg == PPC::CR6EQ || DestReg == PPC::CR6UN)
+      Reg = PPC::CR6;
+    else if (DestReg == PPC::CR7LT || DestReg == PPC::CR7GT ||
+             DestReg == PPC::CR7EQ || DestReg == PPC::CR7UN)
+      Reg = PPC::CR7;
+
+    return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx,
+                                &PPC::CRRCRegClass, NewMIs, NonRI, SpillsVRS);
+
+  } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LVX), DestReg),
+                                       FrameIdx));
+    NonRI = true;
+  } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+    assert(TM.getSubtargetImpl()->isDarwin() &&
+           "VRSAVE only needs spill/restore on Darwin");
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL,
+                                               get(PPC::RESTORE_VRSAVE),
+                                               DestReg),
+                                       FrameIdx));
+    SpillsVRS = true;
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+
+  return false;
+}
+
+void
+PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, int FrameIdx,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  SmallVector<MachineInstr*, 4> NewMIs;
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasSpills();
+
+  bool NonRI = false, SpillsVRS = false;
+  if (LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs,
+                           NonRI, SpillsVRS))
+    FuncInfo->setSpillsCR();
+
+  if (SpillsVRS)
+    FuncInfo->setSpillsVRSAVE();
+
+  if (NonRI)
+    FuncInfo->setHasNonRISpills();
+
+  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+    MBB.insert(MI, NewMIs[i]);
+
+  const MachineFrameInfo &MFI = *MF.getFrameInfo();
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOLoad,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+  NewMIs.back()->addMemOperand(MF, MMO);
+}
+
+MachineInstr*
+PPCInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+                                       int FrameIx, uint64_t Offset,
+                                       const MDNode *MDPtr,
+                                       DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(PPC::DBG_VALUE));
+  addFrameReference(MIB, FrameIx, 0, false).addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
+bool PPCInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
+  if (Cond[1].getReg() == PPC::CTR8 || Cond[1].getReg() == PPC::CTR)
+    Cond[0].setImm(Cond[0].getImm() == 0 ? 1 : 0);
+  else
+    // Leave the CR# the same, but invert the condition.
+    Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm()));
+  return false;
+}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+///
+unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case PPC::INLINEASM: {       // Inline Asm: Variable size.
+    const MachineFunction *MF = MI->getParent()->getParent();
+    const char *AsmStr = MI->getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  case PPC::PROLOG_LABEL:
+  case PPC::EH_LABEL:
+  case PPC::GC_LABEL:
+  case PPC::DBG_VALUE:
+    return 0;
+  case PPC::BL8_NOP:
+  case PPC::BLA8_NOP:
+    return 8;
+  default:
+    return 4; // PowerPC instructions are all 4 bytes
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
new file mode 100644
index 000000000000..635e3480b06d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -0,0 +1,157 @@
+//===-- PPCInstrInfo.h - PowerPC Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_INSTRUCTIONINFO_H
+#define POWERPC_INSTRUCTIONINFO_H
+
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "PPCGenInstrInfo.inc"
+
+namespace llvm {
+
+/// PPCII - This namespace holds all of the PowerPC target-specific
+/// per-instruction flags.  These must match the corresponding definitions in
+/// PPC.td and PPCInstrFormats.td.
+namespace PPCII {
+enum {
+  // PPC970 Instruction Flags.  These flags describe the characteristics of the
+  // PowerPC 970 (aka G5) dispatch groups and how they are formed out of
+  // raw machine instructions.
+
+  /// PPC970_First - This instruction starts a new dispatch group, so it will
+  /// always be the first one in the group.
+  PPC970_First = 0x1,
+
+  /// PPC970_Single - This instruction starts a new dispatch group and
+  /// terminates it, so it will be the sole instruction in the group.
+  PPC970_Single = 0x2,
+
+  /// PPC970_Cracked - This instruction is cracked into two pieces, requiring
+  /// two dispatch pipes to be available to issue.
+  PPC970_Cracked = 0x4,
+
+  /// PPC970_Mask/Shift - This is a bitmask that selects the pipeline type that
+  /// an instruction is issued to.
+  PPC970_Shift = 3,
+  PPC970_Mask = 0x07 << PPC970_Shift
+};
+enum PPC970_Unit {
+  /// These are the various PPC970 execution unit pipelines.  Each instruction
+  /// is one of these.
+  PPC970_Pseudo = 0 << PPC970_Shift,   // Pseudo instruction
+  PPC970_FXU    = 1 << PPC970_Shift,   // Fixed Point (aka Integer/ALU) Unit
+  PPC970_LSU    = 2 << PPC970_Shift,   // Load Store Unit
+  PPC970_FPU    = 3 << PPC970_Shift,   // Floating Point Unit
+  PPC970_CRU    = 4 << PPC970_Shift,   // Control Register Unit
+  PPC970_VALU   = 5 << PPC970_Shift,   // Vector ALU
+  PPC970_VPERM  = 6 << PPC970_Shift,   // Vector Permute Unit
+  PPC970_BRU    = 7 << PPC970_Shift    // Branch Unit
+};
+} // end namespace PPCII
+
+
+class PPCInstrInfo : public PPCGenInstrInfo {
+  PPCTargetMachine &TM;
+  const PPCRegisterInfo RI;
+
+  bool StoreRegToStackSlot(MachineFunction &MF,
+                           unsigned SrcReg, bool isKill, int FrameIdx,
+                           const TargetRegisterClass *RC,
+                           SmallVectorImpl<MachineInstr*> &NewMIs,
+                           bool &NonRI, bool &SpillsVRS) const;
+  bool LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            SmallVectorImpl<MachineInstr*> &NewMIs,
+                            bool &NonRI, bool &SpillsVRS) const;
+public:
+  explicit PPCInstrInfo(PPCTargetMachine &TM);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const PPCRegisterInfo &getRegisterInfo() const { return RI; }
+
+  ScheduleHazardRecognizer *
+  CreateTargetHazardRecognizer(const TargetMachine *TM,
+                               const ScheduleDAG *DAG) const;
+  ScheduleHazardRecognizer *
+  CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+                                     const ScheduleDAG *DAG) const;
+
+  bool isCoalescableExtInstr(const MachineInstr &MI,
+                             unsigned &SrcReg, unsigned &DstReg,
+                             unsigned &SubIdx) const;
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const;
+
+  // commuteInstruction - We can commute rlwimi instructions, but only if the
+  // rotate amt is zero.  We also have to munge the immediates a bit.
+  virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+
+  virtual void insertNoop(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI) const;
+
+
+  // Branch analysis.
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+
+  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+                                                 int FrameIx,
+                                                 uint64_t Offset,
+                                                 const MDNode *MDPtr,
+                                                 DebugLoc DL) const;
+
+  virtual
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  /// GetInstSize - Return the number of bytes of code the specified
+  /// instruction may be.  This returns the maximum number of bytes.
+  ///
+  virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
new file mode 100644
index 000000000000..ab907622beeb
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -0,0 +1,1717 @@
+//===-- PPCInstrInfo.td - The PowerPC Instruction Set ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the subset of the 32-bit PowerPC instruction set, as used
+// by the PowerPC instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+include "PPCInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific type constraints.
+//
+def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+def SDT_PPClfiwx : SDTypeProfile<1, 1, [ // lfiw[az]x
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                         SDTCisVT<1, i32> ]>;
+def SDT_PPCvperm   : SDTypeProfile<1, 3, [
+  SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
+]>;
+
+def SDT_PPCvcmp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
+]>;
+
+def SDT_PPCcondbr : SDTypeProfile<0, 3, [
+  SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPClbrx : SDTypeProfile<1, 2, [
+  SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+def SDT_PPCstbrx : SDTypeProfile<0, 3, [
+  SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+
+def SDT_PPClarx : SDTypeProfile<1, 1, [
+  SDTCisInt<0>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCstcx : SDTypeProfile<0, 2, [
+  SDTCisInt<0>, SDTCisPtrTy<1>
+]>;
+
+def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
+  SDTCisPtrTy<0>, SDTCisVT<1, i32>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific DAG Nodes.
+//
+
+def PPCfre    : SDNode<"PPCISD::FRE",     SDTFPUnaryOp, []>;
+def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
+
+def PPCfcfid  : SDNode<"PPCISD::FCFID",   SDTFPUnaryOp, []>;
+def PPCfcfidu : SDNode<"PPCISD::FCFIDU",  SDTFPUnaryOp, []>;
+def PPCfcfids : SDNode<"PPCISD::FCFIDS",  SDTFPRoundOp, []>;
+def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>;
+def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
+def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
+def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>;
+def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
+def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
+                       [SDNPHasChain, SDNPMayStore]>;
+def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
+                       [SDNPHasChain, SDNPMayLoad]>;
+def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx,
+                       [SDNPHasChain, SDNPMayLoad]>;
+
+// Extract FPSCR (not modeled at the DAG level).
+def PPCmffs   : SDNode<"PPCISD::MFFS",
+                       SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, []>;
+
+// Perform FADD in round-to-zero mode.
+def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>;
+
+
+def PPCfsel   : SDNode<"PPCISD::FSEL",  
+   // Type constraint for fsel.
+   SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, 
+                        SDTCisFP<0>, SDTCisVT<1, f64>]>, []>;
+
+def PPChi       : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
+def PPClo       : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
+def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad]>;
+def PPCvmaddfp  : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
+def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
+
+def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>;
+def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp,
+                            [SDNPMayLoad]>;
+def PPCaddTls     : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>;
+def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>;
+def PPCaddiTlsgdL   : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>;
+def PPCgetTlsAddr   : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>;
+def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>;
+def PPCaddiTlsldL   : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>;
+def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>;
+def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp,
+                              [SDNPHasChain]>;
+def PPCaddiDtprelL   : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
+
+def PPCvperm    : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
+
+// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
+// amounts.  These nodes are generated by the multi-precision shift code.
+def PPCsrl        : SDNode<"PPCISD::SRL"       , SDTIntShiftOp>;
+def PPCsra        : SDNode<"PPCISD::SRA"       , SDTIntShiftOp>;
+def PPCshl        : SDNode<"PPCISD::SHL"       , SDTIntShiftOp>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_PPCCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_PPCCall   : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def PPCcall  : SDNode<"PPCISD::CALL", SDT_PPCCall,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                       SDNPVariadic]>;
+def PPCcall_nop  : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
+                          [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                           SDNPVariadic]>;
+def PPCload   : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
+                       [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
+                          [SDNPHasChain, SDNPSideEffect,
+                           SDNPInGlue, SDNPOutGlue]>;
+def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>,
+                            [SDNPHasChain, SDNPSideEffect,
+                             SDNPInGlue, SDNPOutGlue]>;
+def PPCmtctr      : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                       SDNPVariadic]>;
+
+def retflag       : SDNode<"PPCISD::RET_FLAG", SDTNone,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def PPCeh_sjlj_setjmp  : SDNode<"PPCISD::EH_SJLJ_SETJMP",
+                                SDTypeProfile<1, 1, [SDTCisInt<0>,
+                                                     SDTCisPtrTy<1>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP",
+                                SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+
+def PPCvcmp       : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
+def PPCvcmp_o     : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
+
+def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
+                           [SDNPHasChain, SDNPOptInGlue]>;
+
+def PPClbrx       : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
+                           [SDNPHasChain, SDNPMayLoad]>;
+def PPCstbrx      : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
+                           [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to set/unset CR bit 6 for SVR4 vararg calls
+def PPCcr6set   : SDNode<"PPCISD::CR6SET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
+                         [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+// Instructions to support atomic operations
+def PPClarx      : SDNode<"PPCISD::LARX", SDT_PPClarx,
+                          [SDNPHasChain, SDNPMayLoad]>;
+def PPCstcx      : SDNode<"PPCISD::STCX", SDT_PPCstcx,
+                          [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to support medium and large code model
+def PPCaddisTocHA : SDNode<"PPCISD::ADDIS_TOC_HA", SDTIntBinOp, []>;
+def PPCldTocL     : SDNode<"PPCISD::LD_TOC_L", SDTIntBinOp, [SDNPMayLoad]>;
+def PPCaddiTocL   : SDNode<"PPCISD::ADDI_TOC_L", SDTIntBinOp, []>;
+
+
+// Instructions to support dynamic alloca.
+def SDTDynOp  : SDTypeProfile<1, 2, []>;
+def PPCdynalloc   : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific transformation functions and pattern fragments.
+//
+
+def SHL32 : SDNodeXForm<imm, [{
+  // Transformation function: 31 - imm
+  return getI32Imm(31 - N->getZExtValue());
+}]>;
+
+def SRL32 : SDNodeXForm<imm, [{
+  // Transformation function: 32 - imm
+  return N->getZExtValue() ? getI32Imm(32 - N->getZExtValue()) : getI32Imm(0);
+}]>;
+
+def LO16 : SDNodeXForm<imm, [{
+  // Transformation function: get the low 16 bits.
+  return getI32Imm((unsigned short)N->getZExtValue());
+}]>;
+
+def HI16 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned)N->getZExtValue() >> 16);
+}]>;
+
+def HA16 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  signed int Val = N->getZExtValue();
+  return getI32Imm((Val - (signed short)Val) >> 16);
+}]>;
+def MB : SDNodeXForm<imm, [{
+  // Transformation function: get the start bit of a mask
+  unsigned mb = 0, me;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(mb);
+}]>;
+
+def ME : SDNodeXForm<imm, [{
+  // Transformation function: get the end bit of a mask
+  unsigned mb, me = 0;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(me);
+}]>;
+def maskimm32 : PatLeaf<(imm), [{
+  // maskImm predicate - True if immediate is a run of ones.
+  unsigned mb, me;
+  if (N->getValueType(0) == MVT::i32)
+    return isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  else
+    return false;
+}]>;
+
+def immSExt16  : PatLeaf<(imm), [{
+  // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.  Used by instructions like 'addi'.
+  if (N->getValueType(0) == MVT::i32)
+    return (int32_t)N->getZExtValue() == (short)N->getZExtValue();
+  else
+    return (int64_t)N->getZExtValue() == (short)N->getZExtValue();
+}]>;
+def immZExt16  : PatLeaf<(imm), [{
+  // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+  // field.  Used by instructions like 'ori'.
+  return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// imm16Shifted* - These match immediates where the low 16-bits are zero.  There
+// are two forms: imm16ShiftedSExt and imm16ShiftedZExt.  These two forms are
+// identical in 32-bit mode, but in 64-bit mode, they return true if the
+// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
+// clear).
+def imm16ShiftedZExt : PatLeaf<(imm), [{
+  // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'xoris'.
+  return (N->getZExtValue() & ~uint64_t(0xFFFF0000)) == 0;
+}], HI16>;
+
+def imm16ShiftedSExt : PatLeaf<(imm), [{
+  // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'addis'.  Identical to 
+  // imm16ShiftedZExt in 32-bit mode.
+  if (N->getZExtValue() & 0xFFFF) return false;
+  if (N->getValueType(0) == MVT::i32)
+    return true;
+  // For 64-bit, make sure it is sext right.
+  return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
+}], HI16>;
+
+// Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
+// restricted memrix (offset/4) constants are alignment sensitive. If these
+// offsets are hidden behind TOC entries than the values of the lower-order
+// bits cannot be checked directly. As a result, we need to also incorporate
+// an alignment check into the relevant patterns.
+
+def aligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4store : PatFrag<(ops node:$val, node:$ptr),
+                            (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned4pre_store : PatFrag<
+                          (ops node:$val, node:$base, node:$offset),
+                          (pre_store node:$val, node:$base, node:$offset), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def unaligned4load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+def unaligned4store : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() < 4;
+}]>;
+def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() < 4;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Flag Definitions.
+
+class isPPC64 { bit PPC64 = 1; }
+class isDOT   {
+  list<Register> Defs = [CR0];
+  bit RC  = 1;
+}
+
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+class NoEncode<string E> {
+  string DisableEncoding = E;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Operand Definitions.
+
+def s5imm   : Operand<i32> {
+  let PrintMethod = "printS5ImmOperand";
+}
+def u5imm   : Operand<i32> {
+  let PrintMethod = "printU5ImmOperand";
+}
+def u6imm   : Operand<i32> {
+  let PrintMethod = "printU6ImmOperand";
+}
+def s16imm  : Operand<i32> {
+  let PrintMethod = "printS16ImmOperand";
+}
+def u16imm  : Operand<i32> {
+  let PrintMethod = "printU16ImmOperand";
+}
+def directbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getDirectBrEncoding";
+}
+def condbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+  let EncoderMethod = "getCondBrEncoding";
+}
+def calltarget : Operand<iPTR> {
+  let EncoderMethod = "getDirectBrEncoding";
+}
+def aaddr : Operand<iPTR> {
+  let PrintMethod = "printAbsAddrOperand";
+}
+def symbolHi: Operand<i32> {
+  let PrintMethod = "printSymbolHi";
+  let EncoderMethod = "getHA16Encoding";
+}
+def symbolLo: Operand<i32> {
+  let PrintMethod = "printSymbolLo";
+  let EncoderMethod = "getLO16Encoding";
+}
+def crbitm: Operand<i8> {
+  let PrintMethod = "printcrbitm";
+  let EncoderMethod = "get_crbitm_encoding";
+}
+// Address operands
+// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
+def ptr_rc_nor0 : PointerLikeRegClass<1>;
+
+def dispRI : Operand<iPTR>;
+def dispRIX : Operand<iPTR>;
+
+def memri : Operand<iPTR> {
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops dispRI:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIEncoding";
+}
+def memrr : Operand<iPTR> {
+  let PrintMethod = "printMemRegReg";
+  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc:$offreg);
+}
+def memrix : Operand<iPTR> {   // memri where the imm is shifted 2 bits.
+  let PrintMethod = "printMemRegImmShifted";
+  let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
+  let EncoderMethod = "getMemRIXEncoding";
+}
+
+// A single-register address. This is used with the SjLj
+// pseudo-instructions.
+def memr : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc:$ptrreg);
+}
+
+// PowerPC Predicate operand.
+def pred : Operand<OtherVT> {
+  let PrintMethod = "printPredicateOperand";
+  let MIOperandInfo = (ops i32imm:$bibo, CRRC:$reg);
+}
+
+// Define PowerPC specific addressing mode.
+def iaddr  : ComplexPattern<iPTR, 2, "SelectAddrImm",    [], []>;
+def xaddr  : ComplexPattern<iPTR, 2, "SelectAddrIdx",    [], []>;
+def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std"
+
+// The address in a single register. This is used with the SjLj
+// pseudo-instructions.
+def addr   : ComplexPattern<iPTR, 1, "SelectAddr",[], []>;
+
+/// This is just the offset part of iaddr, used for preinc.
+def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Predicate Definitions.
+def In32BitMode  : Predicate<"!PPCSubTarget.isPPC64()">;
+def In64BitMode  : Predicate<"PPCSubTarget.isPPC64()">;
+def IsBookE  : Predicate<"PPCSubTarget.isBookE()">;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Definitions.
+
+// Pseudo-instructions:
+
+let hasCtrlDep = 1 in {
+let Defs = [R1], Uses = [R1] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
+                              [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
+                              [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+def UPDATE_VRSAVE    : Pseudo<(outs GPRC:$rD), (ins GPRC:$rS),
+                              "UPDATE_VRSAVE $rD, $rS", []>;
+}
+
+let Defs = [R1], Uses = [R1] in
+def DYNALLOC : Pseudo<(outs GPRC:$result), (ins GPRC:$negsize, memri:$fpsi), "#DYNALLOC",
+                       [(set i32:$result,
+                             (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
+                         
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1,    // Expanded after instruction selection.
+    PPC970_Single = 1 in {
+  // Note that SELECT_CC_I4 and SELECT_CC_I8 use the no-r0 register classes
+  // because either operand might become the first operand in an isel, and
+  // that operand cannot be r0.
+  def SELECT_CC_I4 : Pseudo<(outs GPRC:$dst), (ins CRRC:$cond,
+                              GPRC_NOR0:$T, GPRC_NOR0:$F,
+                              i32imm:$BROPC), "#SELECT_CC_I4",
+                              []>;
+  def SELECT_CC_I8 : Pseudo<(outs G8RC:$dst), (ins CRRC:$cond,
+                              G8RC_NOX0:$T, G8RC_NOX0:$F,
+                              i32imm:$BROPC), "#SELECT_CC_I8",
+                              []>;
+  def SELECT_CC_F4  : Pseudo<(outs F4RC:$dst), (ins CRRC:$cond, F4RC:$T, F4RC:$F,
+                              i32imm:$BROPC), "#SELECT_CC_F4",
+                              []>;
+  def SELECT_CC_F8  : Pseudo<(outs F8RC:$dst), (ins CRRC:$cond, F8RC:$T, F8RC:$F,
+                              i32imm:$BROPC), "#SELECT_CC_F8",
+                              []>;
+  def SELECT_CC_VRRC: Pseudo<(outs VRRC:$dst), (ins CRRC:$cond, VRRC:$T, VRRC:$F,
+                              i32imm:$BROPC), "#SELECT_CC_VRRC",
+                              []>;
+}
+
+// SPILL_CR - Indicate that we're dumping the CR register, so we'll need to
+// scavenge a register for it.
+let mayStore = 1 in
+def SPILL_CR : Pseudo<(outs), (ins CRRC:$cond, memri:$F),
+                     "#SPILL_CR", []>;
+
+// RESTORE_CR - Indicate that we're restoring the CR register (previously
+// spilled), so we'll need to scavenge a register for it.
+let mayLoad = 1 in
+def RESTORE_CR : Pseudo<(outs CRRC:$cond), (ins memri:$F),
+                     "#RESTORE_CR", []>;
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+  let isReturn = 1, Uses = [LR, RM] in
+    def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", BrB,
+                           [(retflag)]>;
+  let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in
+    def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>;
+}
+
+let Defs = [LR] in
+  def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>,
+                   PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+  let isBarrier = 1 in {
+  def B   : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst),
+                  "b $dst", BrB,
+                  [(br bb:$dst)]>;
+  }
+
+  // BCC represents an arbitrary conditional branch on a predicate.
+  // FIXME: should be able to write a pattern for PPCcondbranch, but can't use
+  // a two-value operand where a dag node expects two operands. :(
+  let isCodeGenOnly = 1 in
+    def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
+                    "b${cond:cc} ${cond:reg}, $dst"
+                    /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>;
+
+  let Defs = [CTR], Uses = [CTR] in {
+    def BDZ  : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz $dst">;
+    def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz $dst">;
+  }
+}
+
+// The unconditional BCL used by the SjLj setjmp code.
+let isCall = 1, hasCtrlDep = 1, isCodeGenOnly = 1, PPC970_Unit = 7 in {
+  let Defs = [LR], Uses = [RM] in {
+    def BCLalways  : BForm_2<16, 20, 31, 0, 1, (outs), (ins condbrtarget:$dst),
+                            "bcl 20, 31, $dst">;
+  }
+}
+
+let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
+  // Convenient aliases for call instructions
+  let Uses = [RM] in {
+    def BL  : IForm<18, 0, 1, (outs), (ins calltarget:$func),
+                    "bl $func", BrB, []>;  // See Pat patterns below.
+    def BLA : IForm<18, 1, 1, (outs), (ins aaddr:$func),
+                    "bla $func", BrB, [(PPCcall (i32 imm:$func))]>;
+  }
+  let Uses = [CTR, RM] in {
+    def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
+                             "bctrl", BrB, [(PPCbctrl)]>,
+                Requires<[In32BitMode]>;
+  }
+}
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi :Pseudo< (outs),
+                        (ins calltarget:$dst, i32imm:$offset),
+                 "#TC_RETURNd $dst $offset",
+                 []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset),
+                 "#TC_RETURNa $func $offset",
+                 [(PPCtc_return (i32 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset),
+                 "#TC_RETURNr $dst $offset",
+                 []>;
+
+
+let isCodeGenOnly = 1 in {
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM]  in
+def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+     Requires<[In32BitMode]>;
+
+
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+                  "b $dst", BrB,
+                  []>;
+
+}
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA   : IForm<18, 0, 0, (outs), (ins aaddr:$dst),
+                  "ba $dst", BrB,
+                  []>;
+
+let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  def EH_SjLj_SetJmp32  : Pseudo<(outs GPRC:$dst), (ins memr:$buf),
+                            "#EH_SJLJ_SETJMP32",
+                            [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In32BitMode]>;
+  let isTerminator = 1 in
+  def EH_SjLj_LongJmp32 : Pseudo<(outs), (ins memr:$buf),
+                            "#EH_SJLJ_LONGJMP32",
+                            [(PPCeh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In32BitMode]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+  def EH_SjLj_Setup : Pseudo<(outs), (ins directbrtarget:$dst),
+                        "#EH_SjLj_Setup\t$dst", []>;
+}
+
+// DCB* instructions.
+def DCBA   : DCB_Form<758, 0, (outs), (ins memrr:$dst),
+                      "dcba $dst", LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBF   : DCB_Form<86, 0, (outs), (ins memrr:$dst),
+                      "dcbf $dst", LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBI   : DCB_Form<470, 0, (outs), (ins memrr:$dst),
+                      "dcbi $dst", LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBST  : DCB_Form<54, 0, (outs), (ins memrr:$dst),
+                      "dcbst $dst", LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBT   : DCB_Form<278, 0, (outs), (ins memrr:$dst),
+                      "dcbt $dst", LdStDCBF, [(int_ppc_dcbt xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBTST : DCB_Form<246, 0, (outs), (ins memrr:$dst),
+                      "dcbtst $dst", LdStDCBF, [(int_ppc_dcbtst xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBZ   : DCB_Form<1014, 0, (outs), (ins memrr:$dst),
+                      "dcbz $dst", LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBZL  : DCB_Form<1014, 1, (outs), (ins memrr:$dst),
+                      "dcbzl $dst", LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+
+def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
+          (DCBT xoaddr:$dst)>;
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+  let Defs = [CR0] in {
+    def ATOMIC_LOAD_ADD_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_ADD_I8",
+      [(set i32:$dst, (atomic_load_add_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_SUB_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_SUB_I8",
+      [(set i32:$dst, (atomic_load_sub_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_AND_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_AND_I8",
+      [(set i32:$dst, (atomic_load_and_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_OR_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_OR_I8",
+      [(set i32:$dst, (atomic_load_or_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_XOR_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "ATOMIC_LOAD_XOR_I8",
+      [(set i32:$dst, (atomic_load_xor_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_NAND_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_NAND_I8",
+      [(set i32:$dst, (atomic_load_nand_8 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_ADD_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_ADD_I16",
+      [(set i32:$dst, (atomic_load_add_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_SUB_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_SUB_I16",
+      [(set i32:$dst, (atomic_load_sub_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_AND_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_AND_I16",
+      [(set i32:$dst, (atomic_load_and_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_OR_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_OR_I16",
+      [(set i32:$dst, (atomic_load_or_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_XOR_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_XOR_I16",
+      [(set i32:$dst, (atomic_load_xor_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_NAND_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_NAND_I16",
+      [(set i32:$dst, (atomic_load_nand_16 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_ADD_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_ADD_I32",
+      [(set i32:$dst, (atomic_load_add_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_SUB_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_SUB_I32",
+      [(set i32:$dst, (atomic_load_sub_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_AND_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_AND_I32",
+      [(set i32:$dst, (atomic_load_and_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_OR_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_OR_I32",
+      [(set i32:$dst, (atomic_load_or_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_XOR_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_XOR_I32",
+      [(set i32:$dst, (atomic_load_xor_32 xoaddr:$ptr, i32:$incr))]>;
+    def ATOMIC_LOAD_NAND_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_NAND_I32",
+      [(set i32:$dst, (atomic_load_nand_32 xoaddr:$ptr, i32:$incr))]>;
+
+    def ATOMIC_CMP_SWAP_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "#ATOMIC_CMP_SWAP_I8",
+      [(set i32:$dst, (atomic_cmp_swap_8 xoaddr:$ptr, i32:$old, i32:$new))]>;
+    def ATOMIC_CMP_SWAP_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
+      [(set i32:$dst, (atomic_cmp_swap_16 xoaddr:$ptr, i32:$old, i32:$new))]>;
+    def ATOMIC_CMP_SWAP_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
+      [(set i32:$dst, (atomic_cmp_swap_32 xoaddr:$ptr, i32:$old, i32:$new))]>;
+
+    def ATOMIC_SWAP_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "#ATOMIC_SWAP_i8",
+      [(set i32:$dst, (atomic_swap_8 xoaddr:$ptr, i32:$new))]>;
+    def ATOMIC_SWAP_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "#ATOMIC_SWAP_I16",
+      [(set i32:$dst, (atomic_swap_16 xoaddr:$ptr, i32:$new))]>;
+    def ATOMIC_SWAP_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "#ATOMIC_SWAP_I32",
+      [(set i32:$dst, (atomic_swap_32 xoaddr:$ptr, i32:$new))]>;
+  }
+}
+
+// Instructions to support atomic operations
+def LWARX : XForm_1<31,  20, (outs GPRC:$rD), (ins memrr:$src),
+                   "lwarx $rD, $src", LdStLWARX,
+                   [(set i32:$rD, (PPClarx xoaddr:$src))]>;
+
+let Defs = [CR0] in
+def STWCX : XForm_1<31, 150, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "stwcx. $rS, $dst", LdStSTWCX,
+                   [(PPCstcx i32:$rS, xoaddr:$dst)]>,
+                   isDOT;
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+def TRAP  : XForm_24<31, 4, (outs), (ins), "trap", LdStLoad, [(trap)]>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Load Instructions.
+//
+
+// Unindexed (r+i) Loads. 
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZ : DForm_1<34, (outs GPRC:$rD), (ins memri:$src),
+                  "lbz $rD, $src", LdStLoad,
+                  [(set i32:$rD, (zextloadi8 iaddr:$src))]>;
+def LHA : DForm_1<42, (outs GPRC:$rD), (ins memri:$src),
+                  "lha $rD, $src", LdStLHA,
+                  [(set i32:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LHZ : DForm_1<40, (outs GPRC:$rD), (ins memri:$src),
+                  "lhz $rD, $src", LdStLoad,
+                  [(set i32:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ : DForm_1<32, (outs GPRC:$rD), (ins memri:$src),
+                  "lwz $rD, $src", LdStLoad,
+                  [(set i32:$rD, (load iaddr:$src))]>;
+
+def LFS : DForm_1<48, (outs F4RC:$rD), (ins memri:$src),
+                  "lfs $rD, $src", LdStLFD,
+                  [(set f32:$rD, (load iaddr:$src))]>;
+def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src),
+                  "lfd $rD, $src", LdStLFD,
+                  [(set f64:$rD, (load iaddr:$src))]>;
+
+
+// Unindexed (r+i) Loads with Update (preinc).
+let mayLoad = 1 in {
+def LBZU : DForm_1<35, (outs GPRC:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lbzu $rD, $addr", LdStLoadUpd,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHAU : DForm_1<43, (outs GPRC:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lhau $rD, $addr", LdStLHAU,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHZU : DForm_1<41, (outs GPRC:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lhzu $rD, $addr", LdStLoadUpd,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LWZU : DForm_1<33, (outs GPRC:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                   "lwzu $rD, $addr", LdStLoadUpd,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFSU : DForm_1<49, (outs F4RC:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                  "lfsu $rD, $addr", LdStLFDU,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
+                  "lfdu $rD, $addr", LdStLFDU,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+
+// Indexed (r+r) Loads with Update (preinc).
+def LBZUX : XForm_1<31, 119, (outs GPRC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lbzux $rD, $addr", LdStLoadUpd,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHAUX : XForm_1<31, 375, (outs GPRC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lhaux $rD, $addr", LdStLHAU,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHZUX : XForm_1<31, 311, (outs GPRC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lhzux $rD, $addr", LdStLoadUpd,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LWZUX : XForm_1<31, 55, (outs GPRC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lwzux $rD, $addr", LdStLoadUpd,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFSUX : XForm_1<31, 567, (outs F4RC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lfsux $rD, $addr", LdStLFDU,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFDUX : XForm_1<31, 631, (outs F8RC:$rD, ptr_rc_nor0:$ea_result),
+                   (ins memrr:$addr),
+                   "lfdux $rD, $addr", LdStLFDU,
+                   []>, RegConstraint<"$addr.ptrreg = $ea_result">,
+                   NoEncode<"$ea_result">;
+}
+}
+
+// Indexed (r+r) Loads.
+//
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZX : XForm_1<31,  87, (outs GPRC:$rD), (ins memrr:$src),
+                   "lbzx $rD, $src", LdStLoad,
+                   [(set i32:$rD, (zextloadi8 xaddr:$src))]>;
+def LHAX : XForm_1<31, 343, (outs GPRC:$rD), (ins memrr:$src),
+                   "lhax $rD, $src", LdStLHA,
+                   [(set i32:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LHZX : XForm_1<31, 279, (outs GPRC:$rD), (ins memrr:$src),
+                   "lhzx $rD, $src", LdStLoad,
+                   [(set i32:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX : XForm_1<31,  23, (outs GPRC:$rD), (ins memrr:$src),
+                   "lwzx $rD, $src", LdStLoad,
+                   [(set i32:$rD, (load xaddr:$src))]>;
+                   
+                   
+def LHBRX : XForm_1<31, 790, (outs GPRC:$rD), (ins memrr:$src),
+                   "lhbrx $rD, $src", LdStLoad,
+                   [(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
+def LWBRX : XForm_1<31,  534, (outs GPRC:$rD), (ins memrr:$src),
+                   "lwbrx $rD, $src", LdStLoad,
+                   [(set i32:$rD, (PPClbrx xoaddr:$src, i32))]>;
+
+def LFSX   : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src),
+                      "lfsx $frD, $src", LdStLFD,
+                      [(set f32:$frD, (load xaddr:$src))]>;
+def LFDX   : XForm_25<31, 599, (outs F8RC:$frD), (ins memrr:$src),
+                      "lfdx $frD, $src", LdStLFD,
+                      [(set f64:$frD, (load xaddr:$src))]>;
+
+def LFIWAX : XForm_25<31, 855, (outs F8RC:$frD), (ins memrr:$src),
+                      "lfiwax $frD, $src", LdStLFD,
+                      [(set f64:$frD, (PPClfiwax xoaddr:$src))]>;
+def LFIWZX : XForm_25<31, 887, (outs F8RC:$frD), (ins memrr:$src),
+                      "lfiwzx $frD, $src", LdStLFD,
+                      [(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// PPC32 Store Instructions.
+//
+
+// Unindexed (r+i) Stores.
+let PPC970_Unit = 2 in {
+def STB  : DForm_1<38, (outs), (ins GPRC:$rS, memri:$src),
+                   "stb $rS, $src", LdStStore,
+                   [(truncstorei8 i32:$rS, iaddr:$src)]>;
+def STH  : DForm_1<44, (outs), (ins GPRC:$rS, memri:$src),
+                   "sth $rS, $src", LdStStore,
+                   [(truncstorei16 i32:$rS, iaddr:$src)]>;
+def STW  : DForm_1<36, (outs), (ins GPRC:$rS, memri:$src),
+                   "stw $rS, $src", LdStStore,
+                   [(store i32:$rS, iaddr:$src)]>;
+def STFS : DForm_1<52, (outs), (ins F4RC:$rS, memri:$dst),
+                   "stfs $rS, $dst", LdStSTFD,
+                   [(store f32:$rS, iaddr:$dst)]>;
+def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst),
+                   "stfd $rS, $dst", LdStSTFD,
+                   [(store f64:$rS, iaddr:$dst)]>;
+}
+
+// Unindexed (r+i) Stores with Update (preinc).
+let PPC970_Unit = 2, mayStore = 1 in {
+def STBU  : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins GPRC:$rS, memri:$dst),
+                    "stbu $rS, $dst", LdStStoreUpd, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STHU  : DForm_1<45, (outs ptr_rc_nor0:$ea_res), (ins GPRC:$rS, memri:$dst),
+                    "sthu $rS, $dst", LdStStoreUpd, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STWU  : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins GPRC:$rS, memri:$dst),
+                    "stwu $rS, $dst", LdStStoreUpd, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STFSU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins F4RC:$rS, memri:$dst),
+                    "stfsu $rS, $dst", LdStSTFDU, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+def STFDU : DForm_1<37, (outs ptr_rc_nor0:$ea_res), (ins F8RC:$rS, memri:$dst),
+                    "stfdu $rS, $dst", LdStSTFDU, []>,
+                    RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
+}
+
+// Patterns to match the pre-inc stores.  We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STBU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STHU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STWU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STFSU $rS, iaddroff:$ptroff, $ptrreg)>;
+def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iaddroff:$ptroff),
+          (STFDU $rS, iaddroff:$ptroff, $ptrreg)>;
+
+// Indexed (r+r) Stores.
+let PPC970_Unit = 2 in {
+def STBX  : XForm_8<31, 215, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "stbx $rS, $dst", LdStStore,
+                   [(truncstorei8 i32:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STHX  : XForm_8<31, 407, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "sthx $rS, $dst", LdStStore,
+                   [(truncstorei16 i32:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+def STWX  : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "stwx $rS, $dst", LdStStore,
+                   [(store i32:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+ 
+def STHBRX: XForm_8<31, 918, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "sthbrx $rS, $dst", LdStStore,
+                   [(PPCstbrx i32:$rS, xoaddr:$dst, i16)]>,
+                   PPC970_DGroup_Cracked;
+def STWBRX: XForm_8<31, 662, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "stwbrx $rS, $dst", LdStStore,
+                   [(PPCstbrx i32:$rS, xoaddr:$dst, i32)]>,
+                   PPC970_DGroup_Cracked;
+
+def STFIWX: XForm_28<31, 983, (outs), (ins F8RC:$frS, memrr:$dst),
+                     "stfiwx $frS, $dst", LdStSTFD,
+                     [(PPCstfiwx f64:$frS, xoaddr:$dst)]>;
+                     
+def STFSX : XForm_28<31, 663, (outs), (ins F4RC:$frS, memrr:$dst),
+                     "stfsx $frS, $dst", LdStSTFD,
+                     [(store f32:$frS, xaddr:$dst)]>;
+def STFDX : XForm_28<31, 727, (outs), (ins F8RC:$frS, memrr:$dst),
+                     "stfdx $frS, $dst", LdStSTFD,
+                     [(store f64:$frS, xaddr:$dst)]>;
+}
+
+// Indexed (r+r) Stores with Update (preinc).
+let PPC970_Unit = 2, mayStore = 1 in {
+def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins GPRC:$rS, memrr:$dst),
+                    "stbux $rS, $dst", LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STHUX : XForm_8<31, 439, (outs ptr_rc_nor0:$ea_res), (ins GPRC:$rS, memrr:$dst),
+                    "sthux $rS, $dst", LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STWUX : XForm_8<31, 183, (outs ptr_rc_nor0:$ea_res), (ins GPRC:$rS, memrr:$dst),
+                    "stwux $rS, $dst", LdStStoreUpd, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STFSUX: XForm_8<31, 695, (outs ptr_rc_nor0:$ea_res), (ins F4RC:$rS, memrr:$dst),
+                    "stfsux $rS, $dst", LdStSTFDU, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+def STFDUX: XForm_8<31, 759, (outs ptr_rc_nor0:$ea_res), (ins F8RC:$rS, memrr:$dst),
+                    "stfdux $rS, $dst", LdStSTFDU, []>,
+                    RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    PPC970_DGroup_Cracked;
+}
+
+// Patterns to match the pre-inc stores.  We can't put the patterns on
+// the instruction definitions directly as ISel wants the address base
+// and offset to be separate operands, not a single complex operand.
+def : Pat<(pre_truncsti8 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STBUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_truncsti16 i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STHUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store i32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STWUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STFSUX $rS, $ptrreg, $ptroff)>;
+def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
+          (STFDUX $rS, $ptrreg, $ptroff)>;
+
+def SYNC : XForm_24_sync<31, 598, (outs), (ins),
+                        "sync", LdStSync,
+                        [(int_ppc_sync)]>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Arithmetic Instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def ADDI   : DForm_2<14, (outs GPRC:$rD), (ins GPRC_NOR0:$rA, symbolLo:$imm),
+                     "addi $rD, $rA, $imm", IntSimple,
+                     [(set i32:$rD, (add i32:$rA, immSExt16:$imm))]>;
+let Defs = [CARRY] in {
+def ADDIC  : DForm_2<12, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+                     "addic $rD, $rA, $imm", IntGeneral,
+                     [(set i32:$rD, (addc i32:$rA, immSExt16:$imm))]>,
+                     PPC970_DGroup_Cracked;
+def ADDICo : DForm_2<13, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+                     "addic. $rD, $rA, $imm", IntGeneral,
+                     []>;
+}
+def ADDIS  : DForm_2<15, (outs GPRC:$rD), (ins GPRC_NOR0:$rA, symbolHi:$imm),
+                     "addis $rD, $rA, $imm", IntSimple,
+                     [(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>;
+let isCodeGenOnly = 1 in
+def LA     : DForm_2<14, (outs GPRC:$rD), (ins GPRC_NOR0:$rA, symbolLo:$sym),
+                     "la $rD, $sym($rA)", IntGeneral,
+                     [(set i32:$rD, (add i32:$rA,
+                                          (PPClo tglobaladdr:$sym, 0)))]>;
+def MULLI  : DForm_2< 7, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+                     "mulli $rD, $rA, $imm", IntMulLI,
+                     [(set i32:$rD, (mul i32:$rA, immSExt16:$imm))]>;
+let Defs = [CARRY] in {
+def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+                     "subfic $rD, $rA, $imm", IntGeneral,
+                     [(set i32:$rD, (subc immSExt16:$imm, i32:$rA))]>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+  def LI  : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm),
+                       "li $rD, $imm", IntSimple,
+                       [(set i32:$rD, immSExt16:$imm)]>;
+  def LIS : DForm_2_r0<15, (outs GPRC:$rD), (ins symbolHi:$imm),
+                       "lis $rD, $imm", IntSimple,
+                       [(set i32:$rD, imm16ShiftedSExt:$imm)]>;
+}
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def ANDIo : DForm_4<28, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "andi. $dst, $src1, $src2", IntGeneral,
+                    [(set i32:$dst, (and i32:$src1, immZExt16:$src2))]>,
+                    isDOT;
+def ANDISo : DForm_4<29, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "andis. $dst, $src1, $src2", IntGeneral,
+                    [(set i32:$dst, (and i32:$src1, imm16ShiftedZExt:$src2))]>,
+                    isDOT;
+def ORI   : DForm_4<24, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "ori $dst, $src1, $src2", IntSimple,
+                    [(set i32:$dst, (or i32:$src1, immZExt16:$src2))]>;
+def ORIS  : DForm_4<25, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "oris $dst, $src1, $src2", IntSimple,
+                    [(set i32:$dst, (or i32:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI  : DForm_4<26, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "xori $dst, $src1, $src2", IntSimple,
+                    [(set i32:$dst, (xor i32:$src1, immZExt16:$src2))]>;
+def XORIS : DForm_4<27, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "xoris $dst, $src1, $src2", IntSimple,
+                    [(set i32:$dst, (xor i32:$src1, imm16ShiftedZExt:$src2))]>;
+def NOP   : DForm_4_zero<24, (outs), (ins), "nop", IntSimple,
+                         []>;
+def CMPWI : DForm_5_ext<11, (outs CRRC:$crD), (ins GPRC:$rA, s16imm:$imm),
+                        "cmpwi $crD, $rA, $imm", IntCompare>;
+def CMPLWI : DForm_6_ext<10, (outs CRRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                         "cmplwi $dst, $src1, $src2", IntCompare>;
+}
+
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def NAND : XForm_6<31, 476, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "nand $rA, $rS, $rB", IntSimple,
+                   [(set i32:$rA, (not (and i32:$rS, i32:$rB)))]>;
+def AND  : XForm_6<31,  28, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "and $rA, $rS, $rB", IntSimple,
+                   [(set i32:$rA, (and i32:$rS, i32:$rB))]>;
+def ANDC : XForm_6<31,  60, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "andc $rA, $rS, $rB", IntSimple,
+                   [(set i32:$rA, (and i32:$rS, (not i32:$rB)))]>;
+def OR   : XForm_6<31, 444, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "or $rA, $rS, $rB", IntSimple,
+                   [(set i32:$rA, (or i32:$rS, i32:$rB))]>;
+def NOR  : XForm_6<31, 124, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "nor $rA, $rS, $rB", IntSimple,
+                   [(set i32:$rA, (not (or i32:$rS, i32:$rB)))]>;
+def ORC  : XForm_6<31, 412, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "orc $rA, $rS, $rB", IntSimple,
+                   [(set i32:$rA, (or i32:$rS, (not i32:$rB)))]>;
+def EQV  : XForm_6<31, 284, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "eqv $rA, $rS, $rB", IntSimple,
+                   [(set i32:$rA, (not (xor i32:$rS, i32:$rB)))]>;
+def XOR  : XForm_6<31, 316, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "xor $rA, $rS, $rB", IntSimple,
+                   [(set i32:$rA, (xor i32:$rS, i32:$rB))]>;
+def SLW  : XForm_6<31,  24, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "slw $rA, $rS, $rB", IntGeneral,
+                   [(set i32:$rA, (PPCshl i32:$rS, i32:$rB))]>;
+def SRW  : XForm_6<31, 536, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "srw $rA, $rS, $rB", IntGeneral,
+                   [(set i32:$rA, (PPCsrl i32:$rS, i32:$rB))]>;
+let Defs = [CARRY] in {
+def SRAW : XForm_6<31, 792, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "sraw $rA, $rS, $rB", IntShift,
+                   [(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>;
+}
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+let Defs = [CARRY] in {
+def SRAWI : XForm_10<31, 824, (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH), 
+                     "srawi $rA, $rS, $SH", IntShift,
+                     [(set i32:$rA, (sra i32:$rS, (i32 imm:$SH)))]>;
+}
+def CNTLZW : XForm_11<31,  26, (outs GPRC:$rA), (ins GPRC:$rS),
+                      "cntlzw $rA, $rS", IntGeneral,
+                      [(set i32:$rA, (ctlz i32:$rS))]>;
+def EXTSB  : XForm_11<31, 954, (outs GPRC:$rA), (ins GPRC:$rS),
+                      "extsb $rA, $rS", IntSimple,
+                      [(set i32:$rA, (sext_inreg i32:$rS, i8))]>;
+def EXTSH  : XForm_11<31, 922, (outs GPRC:$rA), (ins GPRC:$rS),
+                      "extsh $rA, $rS", IntSimple,
+                      [(set i32:$rA, (sext_inreg i32:$rS, i16))]>;
+
+def CMPW   : XForm_16_ext<31, 0, (outs CRRC:$crD), (ins GPRC:$rA, GPRC:$rB),
+                          "cmpw $crD, $rA, $rB", IntCompare>;
+def CMPLW  : XForm_16_ext<31, 32, (outs CRRC:$crD), (ins GPRC:$rA, GPRC:$rB),
+                          "cmplw $crD, $rA, $rB", IntCompare>;
+}
+let PPC970_Unit = 3 in {  // FPU Operations.
+//def FCMPO  : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
+//                      "fcmpo $crD, $fA, $fB", FPCompare>;
+def FCMPUS : XForm_17<63, 0, (outs CRRC:$crD), (ins F4RC:$fA, F4RC:$fB),
+                      "fcmpu $crD, $fA, $fB", FPCompare>;
+def FCMPUD : XForm_17<63, 0, (outs CRRC:$crD), (ins F8RC:$fA, F8RC:$fB),
+                      "fcmpu $crD, $fA, $fB", FPCompare>;
+
+let Uses = [RM] in {
+  def FCTIWZ : XForm_26<63, 15, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "fctiwz $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (PPCfctiwz f64:$frB))]>;
+
+  def FRSP   : XForm_26<63, 12, (outs F4RC:$frD), (ins F8RC:$frB),
+                        "frsp $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (fround f64:$frB))]>;
+
+  // The frin -> nearbyint mapping is valid only in fast-math mode.
+  def FRIND  : XForm_26<63, 392, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "frin $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (fnearbyint f64:$frB))]>;
+  def FRINS  : XForm_26<63, 392, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "frin $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (fnearbyint f32:$frB))]>;
+
+  // These pseudos expand to rint but also set FE_INEXACT when the result does
+  // not equal the argument.
+  let usesCustomInserter = 1, Defs = [RM] in { // FIXME: Model FPSCR!
+    def FRINDrint : Pseudo<(outs F8RC:$frD), (ins F8RC:$frB),
+                            "#FRINDrint", [(set f64:$frD, (frint f64:$frB))]>;
+    def FRINSrint : Pseudo<(outs F4RC:$frD), (ins F4RC:$frB),
+                            "#FRINSrint", [(set f32:$frD, (frint f32:$frB))]>;
+  }
+
+  def FRIPD  : XForm_26<63, 456, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "frip $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (fceil f64:$frB))]>;
+  def FRIPS  : XForm_26<63, 456, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "frip $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (fceil f32:$frB))]>;
+  def FRIZD  : XForm_26<63, 424, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "friz $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (ftrunc f64:$frB))]>;
+  def FRIZS  : XForm_26<63, 424, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "friz $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (ftrunc f32:$frB))]>;
+  def FRIMD  : XForm_26<63, 488, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "frim $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (ffloor f64:$frB))]>;
+  def FRIMS  : XForm_26<63, 488, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "frim $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (ffloor f32:$frB))]>;
+
+  def FSQRT  : XForm_26<63, 22, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "fsqrt $frD, $frB", FPSqrt,
+                        [(set f64:$frD, (fsqrt f64:$frB))]>;
+  def FSQRTS : XForm_26<59, 22, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "fsqrts $frD, $frB", FPSqrt,
+                        [(set f32:$frD, (fsqrt f32:$frB))]>;
+  }
+}
+
+/// Note that FMR is defined as pseudo-ops on the PPC970 because they are
+/// often coalesced away and we don't want the dispatch group builder to think
+/// that they will fill slots (which could cause the load of a LSU reject to
+/// sneak into a d-group with a store).
+def FMR   : XForm_26<63, 72, (outs F4RC:$frD), (ins F4RC:$frB),
+                     "fmr $frD, $frB", FPGeneral,
+                     []>,  // (set f32:$frD, f32:$frB)
+                     PPC970_Unit_Pseudo;
+
+let PPC970_Unit = 3 in {  // FPU Operations.
+// These are artificially split into two different forms, for 4/8 byte FP.
+def FABSS  : XForm_26<63, 264, (outs F4RC:$frD), (ins F4RC:$frB),
+                      "fabs $frD, $frB", FPGeneral,
+                      [(set f32:$frD, (fabs f32:$frB))]>;
+def FABSD  : XForm_26<63, 264, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fabs $frD, $frB", FPGeneral,
+                      [(set f64:$frD, (fabs f64:$frB))]>;
+def FNABSS : XForm_26<63, 136, (outs F4RC:$frD), (ins F4RC:$frB),
+                      "fnabs $frD, $frB", FPGeneral,
+                      [(set f32:$frD, (fneg (fabs f32:$frB)))]>;
+def FNABSD : XForm_26<63, 136, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fnabs $frD, $frB", FPGeneral,
+                      [(set f64:$frD, (fneg (fabs f64:$frB)))]>;
+def FNEGS  : XForm_26<63, 40, (outs F4RC:$frD), (ins F4RC:$frB),
+                      "fneg $frD, $frB", FPGeneral,
+                      [(set f32:$frD, (fneg f32:$frB))]>;
+def FNEGD  : XForm_26<63, 40, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fneg $frD, $frB", FPGeneral,
+                      [(set f64:$frD, (fneg f64:$frB))]>;
+
+// Reciprocal estimates.
+def FRE      : XForm_26<63, 24, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "fre $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (PPCfre f64:$frB))]>;
+def FRES     : XForm_26<59, 24, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "fres $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (PPCfre f32:$frB))]>;
+def FRSQRTE  : XForm_26<63, 26, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "frsqrte $frD, $frB", FPGeneral,
+                        [(set f64:$frD, (PPCfrsqrte f64:$frB))]>;
+def FRSQRTES : XForm_26<59, 26, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "frsqrtes $frD, $frB", FPGeneral,
+                        [(set f32:$frD, (PPCfrsqrte f32:$frB))]>;
+}
+
+// XL-Form instructions.  condition register logical ops.
+//
+def MCRF   : XLForm_3<19, 0, (outs CRRC:$BF), (ins CRRC:$BFA),
+                      "mcrf $BF, $BFA", BrMCR>,
+             PPC970_DGroup_First, PPC970_Unit_CRU;
+
+def CREQV  : XLForm_1<19, 289, (outs CRBITRC:$CRD),
+                               (ins CRBITRC:$CRA, CRBITRC:$CRB),
+                      "creqv $CRD, $CRA, $CRB", BrCR,
+                      []>;
+
+def CROR  : XLForm_1<19, 449, (outs CRBITRC:$CRD),
+                               (ins CRBITRC:$CRA, CRBITRC:$CRB),
+                      "cror $CRD, $CRA, $CRB", BrCR,
+                      []>;
+
+let isCodeGenOnly = 1 in {
+def CRSET  : XLForm_1_ext<19, 289, (outs CRBITRC:$dst), (ins),
+              "creqv $dst, $dst, $dst", BrCR,
+              []>;
+
+def CRUNSET: XLForm_1_ext<19, 193, (outs CRBITRC:$dst), (ins),
+              "crxor $dst, $dst, $dst", BrCR,
+              []>;
+
+let Defs = [CR1EQ], CRD = 6 in {
+def CR6SET  : XLForm_1_ext<19, 289, (outs), (ins),
+              "creqv 6, 6, 6", BrCR,
+              [(PPCcr6set)]>;
+
+def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
+              "crxor 6, 6, 6", BrCR,
+              [(PPCcr6unset)]>;
+}
+}
+
+// XFX-Form instructions.  Instructions that deal with SPRs.
+//
+let Uses = [CTR] in {
+def MFCTR : XFXForm_1_ext<31, 339, 9, (outs GPRC:$rT), (ins),
+                          "mfctr $rT", SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Defs = [CTR], Pattern = [(PPCmtctr i32:$rS)] in {
+def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins GPRC:$rS),
+                          "mtctr $rS", SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Defs = [LR] in {
+def MTLR  : XFXForm_7_ext<31, 467, 8, (outs), (ins GPRC:$rS),
+                          "mtlr $rS", SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR] in {
+def MFLR  : XFXForm_1_ext<31, 339, 8, (outs GPRC:$rT), (ins),
+                          "mflr $rT", SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+// Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed like
+// a GPR on the PPC970.  As such, copies in and out have the same performance
+// characteristics as an OR instruction.
+def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins GPRC:$rS),
+                             "mtspr 256, $rS", IntGeneral>,
+               PPC970_DGroup_Single, PPC970_Unit_FXU;
+def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs GPRC:$rT), (ins),
+                             "mfspr $rT, 256", IntGeneral>,
+               PPC970_DGroup_First, PPC970_Unit_FXU;
+
+let isCodeGenOnly = 1 in {
+  def MTVRSAVEv : XFXForm_7_ext<31, 467, 256,
+                                (outs VRSAVERC:$reg), (ins GPRC:$rS),
+                                "mtspr 256, $rS", IntGeneral>,
+                  PPC970_DGroup_Single, PPC970_Unit_FXU;
+  def MFVRSAVEv : XFXForm_1_ext<31, 339, 256, (outs GPRC:$rT),
+                                (ins VRSAVERC:$reg),
+                                "mfspr $rT, 256", IntGeneral>,
+                  PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+// SPILL_VRSAVE - Indicate that we're dumping the VRSAVE register,
+// so we'll need to scavenge a register for it.
+let mayStore = 1 in
+def SPILL_VRSAVE : Pseudo<(outs), (ins VRSAVERC:$vrsave, memri:$F),
+                     "#SPILL_VRSAVE", []>;
+
+// RESTORE_VRSAVE - Indicate that we're restoring the VRSAVE register (previously
+// spilled), so we'll need to scavenge a register for it.
+let mayLoad = 1 in
+def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
+                     "#RESTORE_VRSAVE", []>;
+
+def MTCRF : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins GPRC:$rS),
+                      "mtcrf $FXM, $rS", BrMCRX>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+
+// This is a pseudo for MFCR, which implicitly uses all 8 of its subregisters;
+// declaring that here gives the local register allocator problems with this:
+//  vreg = MCRF  CR0
+//  MFCR  <kill of whatever preg got assigned to vreg>
+// while not declaring it breaks DeadMachineInstructionElimination.
+// As it turns out, in all cases where we currently use this,
+// we're only interested in one subregister of it.  Represent this in the
+// instruction to keep the register allocator from becoming confused.
+//
+// FIXME: Make this a real Pseudo instruction when the JIT switches to MC.
+let isCodeGenOnly = 1 in
+def MFCRpseud: XFXForm_3<31, 19, (outs GPRC:$rT), (ins crbitm:$FXM),
+                       "#MFCRpseud", SprMFCR>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+            
+def MFCR : XFXForm_3<31, 19, (outs GPRC:$rT), (ins),
+                     "mfcr $rT", SprMFCR>,
+                     PPC970_MicroCode, PPC970_Unit_CRU;
+
+def MFOCRF: XFXForm_5a<31, 19, (outs GPRC:$rT), (ins crbitm:$FXM),
+                       "mfocrf $rT, $FXM", SprMFCR>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Pseudo instruction to perform FADD in round-to-zero mode.
+let usesCustomInserter = 1, Uses = [RM] in {
+  def FADDrtz: Pseudo<(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB), "",
+                      [(set f64:$FRT, (PPCfaddrtz f64:$FRA, f64:$FRB))]>;
+}
+
+// The above pseudo gets expanded to make use of the following instructions
+// to manipulate FPSCR.  Note that FPSCR is not modeled at the DAG level.
+let Uses = [RM], Defs = [RM] in { 
+  def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
+                        "mtfsb0 $FM", IntMTFSB0, []>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
+                        "mtfsb1 $FM", IntMTFSB0, []>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  def MTFSF  : XFLForm<63, 711, (outs), (ins i32imm:$FM, F8RC:$rT),
+                       "mtfsf $FM, $rT", IntMTFSB0, []>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+let Uses = [RM] in {
+  def MFFS   : XForm_42<63, 583, (outs F8RC:$rT), (ins), 
+                         "mffs $rT", IntMFFS,
+                         [(set f64:$rT, (PPCmffs))]>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+
+// XO-Form instructions.  Arithmetic instructions that can set overflow bit
+//
+def ADD4  : XOForm_1<31, 266, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "add $rT, $rA, $rB", IntSimple,
+                     [(set i32:$rT, (add i32:$rA, i32:$rB))]>;
+let Defs = [CARRY] in {
+def ADDC  : XOForm_1<31, 10, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "addc $rT, $rA, $rB", IntGeneral,
+                     [(set i32:$rT, (addc i32:$rA, i32:$rB))]>,
+                     PPC970_DGroup_Cracked;
+}
+def DIVW  : XOForm_1<31, 491, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "divw $rT, $rA, $rB", IntDivW,
+                     [(set i32:$rT, (sdiv i32:$rA, i32:$rB))]>,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def DIVWU : XOForm_1<31, 459, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "divwu $rT, $rA, $rB", IntDivW,
+                     [(set i32:$rT, (udiv i32:$rA, i32:$rB))]>,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def MULHW : XOForm_1<31, 75, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "mulhw $rT, $rA, $rB", IntMulHW,
+                     [(set i32:$rT, (mulhs i32:$rA, i32:$rB))]>;
+def MULHWU : XOForm_1<31, 11, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "mulhwu $rT, $rA, $rB", IntMulHWU,
+                     [(set i32:$rT, (mulhu i32:$rA, i32:$rB))]>;
+def MULLW : XOForm_1<31, 235, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "mullw $rT, $rA, $rB", IntMulHW,
+                     [(set i32:$rT, (mul i32:$rA, i32:$rB))]>;
+def SUBF  : XOForm_1<31, 40, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "subf $rT, $rA, $rB", IntGeneral,
+                     [(set i32:$rT, (sub i32:$rB, i32:$rA))]>;
+let Defs = [CARRY] in {
+def SUBFC : XOForm_1<31, 8, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "subfc $rT, $rA, $rB", IntGeneral,
+                     [(set i32:$rT, (subc i32:$rB, i32:$rA))]>,
+                     PPC970_DGroup_Cracked;
+}
+def NEG    : XOForm_3<31, 104, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "neg $rT, $rA", IntSimple,
+                      [(set i32:$rT, (ineg i32:$rA))]>;
+let Uses = [CARRY], Defs = [CARRY] in {
+def ADDE  : XOForm_1<31, 138, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                      "adde $rT, $rA, $rB", IntGeneral,
+                      [(set i32:$rT, (adde i32:$rA, i32:$rB))]>;
+def ADDME  : XOForm_3<31, 234, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "addme $rT, $rA", IntGeneral,
+                      [(set i32:$rT, (adde i32:$rA, -1))]>;
+def ADDZE  : XOForm_3<31, 202, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "addze $rT, $rA", IntGeneral,
+                      [(set i32:$rT, (adde i32:$rA, 0))]>;
+def SUBFE : XOForm_1<31, 136, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                      "subfe $rT, $rA, $rB", IntGeneral,
+                      [(set i32:$rT, (sube i32:$rB, i32:$rA))]>;
+def SUBFME : XOForm_3<31, 232, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "subfme $rT, $rA", IntGeneral,
+                      [(set i32:$rT, (sube -1, i32:$rA))]>;
+def SUBFZE : XOForm_3<31, 200, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "subfze $rT, $rA", IntGeneral,
+                      [(set i32:$rT, (sube 0, i32:$rA))]>;
+}
+}
+
+// A-Form instructions.  Most of the instructions executed in the FPU are of
+// this type.
+//
+let PPC970_Unit = 3 in {  // FPU Operations.
+let Uses = [RM] in {
+  def FMADD : AForm_1<63, 29, 
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                      "fmadd $FRT, $FRA, $FRC, $FRB", FPFused,
+                      [(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
+  def FMADDS : AForm_1<59, 29,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                      "fmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                      [(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
+  def FMSUB : AForm_1<63, 28,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                      "fmsub $FRT, $FRA, $FRC, $FRB", FPFused,
+                      [(set f64:$FRT,
+                            (fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
+  def FMSUBS : AForm_1<59, 28,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                      "fmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                      [(set f32:$FRT,
+                            (fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
+  def FNMADD : AForm_1<63, 31,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                      "fnmadd $FRT, $FRA, $FRC, $FRB", FPFused,
+                      [(set f64:$FRT,
+                            (fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
+  def FNMADDS : AForm_1<59, 31,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                      "fnmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                      [(set f32:$FRT,
+                            (fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
+  def FNMSUB : AForm_1<63, 30,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                      "fnmsub $FRT, $FRA, $FRC, $FRB", FPFused,
+                      [(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
+                                                 (fneg f64:$FRB))))]>;
+  def FNMSUBS : AForm_1<59, 30,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                      "fnmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                      [(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
+                                                 (fneg f32:$FRB))))]>;
+}
+// FSEL is artificially split into 4 and 8-byte forms for the result.  To avoid
+// having 4 of these, force the comparison to always be an 8-byte double (code
+// should use an FMRSD if the input comparison value really wants to be a float)
+// and 4/8 byte forms for the result and operand type..
+def FSELD : AForm_1<63, 23,
+                    (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                    "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set f64:$FRT, (PPCfsel f64:$FRA, f64:$FRC, f64:$FRB))]>;
+def FSELS : AForm_1<63, 23,
+                     (outs F4RC:$FRT), (ins F8RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                     "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>;
+let Uses = [RM] in {
+  def FADD  : AForm_2<63, 21,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+                      "fadd $FRT, $FRA, $FRB", FPAddSub,
+                      [(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>;
+  def FADDS : AForm_2<59, 21,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+                      "fadds $FRT, $FRA, $FRB", FPGeneral,
+                      [(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>;
+  def FDIV  : AForm_2<63, 18,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+                      "fdiv $FRT, $FRA, $FRB", FPDivD,
+                      [(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>;
+  def FDIVS : AForm_2<59, 18,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+                      "fdivs $FRT, $FRA, $FRB", FPDivS,
+                      [(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>;
+  def FMUL  : AForm_3<63, 25,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC),
+                      "fmul $FRT, $FRA, $FRC", FPFused,
+                      [(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>;
+  def FMULS : AForm_3<59, 25,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC),
+                      "fmuls $FRT, $FRA, $FRC", FPGeneral,
+                      [(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>;
+  def FSUB  : AForm_2<63, 20,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+                      "fsub $FRT, $FRA, $FRB", FPAddSub,
+                      [(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>;
+  def FSUBS : AForm_2<59, 20,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+                      "fsubs $FRT, $FRA, $FRB", FPGeneral,
+                      [(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>;
+  }
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+  def ISEL  : AForm_4<31, 15,
+                     (outs GPRC:$rT), (ins GPRC_NOR0:$rA, GPRC:$rB, CRBITRC:$cond),
+                     "isel $rT, $rA, $rB, $cond", IntGeneral,
+                     []>;
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+// M-Form instructions.  rotate and mask instructions.
+//
+let isCommutable = 1 in {
+// RLWIMI can be commuted if the rotate amount is zero.
+def RLWIMI : MForm_2<20,
+                     (outs GPRC:$rA), (ins GPRC:$rSi, GPRC:$rS, u5imm:$SH, u5imm:$MB, 
+                      u5imm:$ME), "rlwimi $rA, $rS, $SH, $MB, $ME", IntRotate,
+                      []>, PPC970_DGroup_Cracked, RegConstraint<"$rSi = $rA">,
+                      NoEncode<"$rSi">;
+}
+def RLWINM : MForm_2<21,
+                     (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                     "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral,
+                     []>;
+def RLWINMo : MForm_2<21,
+                     (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                     "rlwinm. $rA, $rS, $SH, $MB, $ME", IntGeneral,
+                     []>, isDOT, PPC970_DGroup_Cracked;
+def RLWNM  : MForm_2<23,
+                     (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB, u5imm:$MB, u5imm:$ME),
+                     "rlwnm $rA, $rS, $rB, $MB, $ME", IntGeneral,
+                     []>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Patterns
+//
+
+// Arbitrary immediate support.  Implement in terms of LIS/ORI.
+def : Pat<(i32 imm:$imm),
+          (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Implement the 'not' operation with the NOR instruction.
+def NOT : Pat<(not i32:$in),
+              (NOR $in, $in)>;
+
+// ADD an arbitrary immediate.
+def : Pat<(add i32:$in, imm:$imm),
+          (ADDIS (ADDI $in, (LO16 imm:$imm)), (HA16 imm:$imm))>;
+// OR an arbitrary immediate.
+def : Pat<(or i32:$in, imm:$imm),
+          (ORIS (ORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// XOR an arbitrary immediate.
+def : Pat<(xor i32:$in, imm:$imm),
+          (XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// SUBFIC
+def : Pat<(sub immSExt16:$imm, i32:$in),
+          (SUBFIC $in, imm:$imm)>;
+
+// SHL/SRL
+def : Pat<(shl i32:$in, (i32 imm:$imm)),
+          (RLWINM $in, imm:$imm, 0, (SHL32 imm:$imm))>;
+def : Pat<(srl i32:$in, (i32 imm:$imm)),
+          (RLWINM $in, (SRL32 imm:$imm), imm:$imm, 31)>;
+
+// ROTL
+def : Pat<(rotl i32:$in, i32:$sh),
+          (RLWNM $in, $sh, 0, 31)>;
+def : Pat<(rotl i32:$in, (i32 imm:$imm)),
+          (RLWINM $in, imm:$imm, 0, 31)>;
+
+// RLWNM
+def : Pat<(and (rotl i32:$in, i32:$sh), maskimm32:$imm),
+          (RLWNM $in, $sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>;
+
+// Calls
+def : Pat<(PPCcall (i32 tglobaladdr:$dst)),
+          (BL tglobaladdr:$dst)>;
+def : Pat<(PPCcall (i32 texternalsym:$dst)),
+          (BL texternalsym:$dst)>;
+
+
+def : Pat<(PPCtc_return (i32 tglobaladdr:$dst),  imm:$imm),
+          (TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm),
+          (TCRETURNdi texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
+          (TCRETURNri CTRRC:$dst, imm:$imm)>;
+
+
+
+// Hi and Lo for Darwin Global Addresses.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>;
+def : Pat<(PPChi tglobaltlsaddr:$g, i32:$in),
+          (ADDIS $in, tglobaltlsaddr:$g)>;
+def : Pat<(PPClo tglobaltlsaddr:$g, i32:$in),
+          (ADDI $in, tglobaltlsaddr:$g)>;
+def : Pat<(add i32:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS $in, tglobaladdr:$g)>;
+def : Pat<(add i32:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS $in, tconstpool:$g)>;
+def : Pat<(add i32:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS $in, tjumptable:$g)>;
+def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)),
+          (ADDIS $in, tblockaddress:$g)>;
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
+// amounts.
+def : Pat<(sra i32:$rS, i32:$rB),
+          (SRAW $rS, $rB)>;
+def : Pat<(srl i32:$rS, i32:$rB),
+          (SRW $rS, $rB)>;
+def : Pat<(shl i32:$rS, i32:$rB),
+          (SLW $rS, $rB)>;
+
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX xaddr:$src)>;
+def : Pat<(f64 (extloadf32 iaddr:$src)),
+          (COPY_TO_REGCLASS (LFS iaddr:$src), F8RC)>;
+def : Pat<(f64 (extloadf32 xaddr:$src)),
+          (COPY_TO_REGCLASS (LFSX xaddr:$src), F8RC)>;
+
+def : Pat<(f64 (fextend f32:$src)),
+          (COPY_TO_REGCLASS $src, F8RC)>;
+
+// Memory barriers
+def : Pat<(membarrier (i32 imm /*ll*/),
+                      (i32 imm /*ls*/),
+                      (i32 imm /*sl*/),
+                      (i32 imm /*ss*/),
+                      (i32 imm /*device*/)),
+           (SYNC)>;
+
+def : Pat<(atomic_fence (imm), (imm)), (SYNC)>;
+
+// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
+def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
+          (FNMSUB $A, $C, $B)>;
+def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
+          (FNMSUBS $A, $C, $B)>;
+
+include "PPCInstrAltivec.td"
+include "PPCInstr64Bit.td"
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.cpp
new file mode 100644
index 000000000000..cfcd7490ed0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -0,0 +1,471 @@
+//===-- PPCJITInfo.cpp - Implement the JIT interfaces for the PowerPC -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the 32-bit PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "PPCJITInfo.h"
+#include "PPCRelocations.h"
+#include "PPCTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+#define BUILD_ADDIS(RD,RS,IMM16) \
+  ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+#define BUILD_ORI(RD,RS,UIMM16) \
+  ((24 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_ORIS(RD,RS,UIMM16) \
+  ((25 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_RLDICR(RD,RS,SH,ME) \
+  ((30 << 26) | ((RS) << 21) | ((RD) << 16) | (((SH) & 31) << 11) | \
+   (((ME) & 63) << 6) | (1 << 2) | ((((SH) >> 5) & 1) << 1))
+#define BUILD_MTSPR(RS,SPR)      \
+  ((31 << 26) | ((RS) << 21) | ((SPR) << 16) | (467 << 1))
+#define BUILD_BCCTRx(BO,BI,LINK) \
+  ((19 << 26) | ((BO) << 21) | ((BI) << 16) | (528 << 1) | ((LINK) & 1))
+#define BUILD_B(TARGET, LINK) \
+  ((18 << 26) | (((TARGET) & 0x00FFFFFF) << 2) | ((LINK) & 1))
+
+// Pseudo-ops
+#define BUILD_LIS(RD,IMM16)    BUILD_ADDIS(RD,0,IMM16)
+#define BUILD_SLDI(RD,RS,IMM6) BUILD_RLDICR(RD,RS,IMM6,63-IMM6)
+#define BUILD_MTCTR(RS)        BUILD_MTSPR(RS,9)
+#define BUILD_BCTR(LINK)       BUILD_BCCTRx(20,0,LINK)
+
+static void EmitBranchToAt(uint64_t At, uint64_t To, bool isCall, bool is64Bit){
+  intptr_t Offset = ((intptr_t)To - (intptr_t)At) >> 2;
+  unsigned *AtI = (unsigned*)(intptr_t)At;
+
+  if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
+    AtI[0] = BUILD_B(Offset, isCall);     // b/bl target
+  } else if (!is64Bit) {
+    AtI[0] = BUILD_LIS(12, To >> 16);     // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To);       // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_MTCTR(12);             // mtctr r12
+    AtI[3] = BUILD_BCTR(isCall);          // bctr/bctrl
+  } else {
+    AtI[0] = BUILD_LIS(12, To >> 48);      // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To >> 32);  // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_SLDI(12, 12, 32);       // sldi r12, r12, 32
+    AtI[3] = BUILD_ORIS(12, 12, To >> 16); // oris r12, r12, hi16(address)
+    AtI[4] = BUILD_ORI(12, 12, To);        // ori r12, r12, lo16(address)
+    AtI[5] = BUILD_MTCTR(12);              // mtctr r12
+    AtI[6] = BUILD_BCTR(isCall);           // bctr/bctrl
+  }
+}
+
+extern "C" void PPC32CompilationCallback();
+extern "C" void PPC64CompilationCallback();
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+    !(defined(__ppc64__) || defined(__FreeBSD__))
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because we the prolog/epilog inserted by GCC won't work for us.  Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl _PPC32CompilationCallback\n"
+"_PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
+    // FIXME: need to save v[0-19] for altivec?
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   PowerPC32 ABI linkage    -  24 bytes
+    //                 parameters -  32 bytes
+    //   13 double registers      - 104 bytes
+    //   8 int registers          -  32 bytes
+    "mflr r0\n"
+    "stw r0,  8(r1)\n"
+    "stwu r1, -208(r1)\n"
+    // Save all int arg registers
+    "stw r10, 204(r1)\n"    "stw r9,  200(r1)\n"
+    "stw r8,  196(r1)\n"    "stw r7,  192(r1)\n"
+    "stw r6,  188(r1)\n"    "stw r5,  184(r1)\n"
+    "stw r4,  180(r1)\n"    "stw r3,  176(r1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd f13, 168(r1)\n"   "stfd f12, 160(r1)\n"
+    "stfd f11, 152(r1)\n"   "stfd f10, 144(r1)\n"
+    "stfd f9,  136(r1)\n"   "stfd f8,  128(r1)\n"
+    "stfd f7,  120(r1)\n"   "stfd f6,  112(r1)\n"
+    "stfd f5,  104(r1)\n"   "stfd f4,   96(r1)\n"
+    "stfd f3,   88(r1)\n"   "stfd f2,   80(r1)\n"
+    "stfd f1,   72(r1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
+    "mr   r3, r0\n"
+    "lwz  r2, 208(r1)\n" // stub's frame
+    "lwz  r4, 8(r2)\n" // stub's lr
+    "li   r5, 0\n"       // 0 == 32 bit
+    "bl _LLVMPPCCompilationCallback\n"
+    "mtctr r3\n"
+    // Restore all int arg registers
+    "lwz r10, 204(r1)\n"    "lwz r9,  200(r1)\n"
+    "lwz r8,  196(r1)\n"    "lwz r7,  192(r1)\n"
+    "lwz r6,  188(r1)\n"    "lwz r5,  184(r1)\n"
+    "lwz r4,  180(r1)\n"    "lwz r3,  176(r1)\n"
+    // Restore all FP arg registers
+    "lfd f13, 168(r1)\n"    "lfd f12, 160(r1)\n"
+    "lfd f11, 152(r1)\n"    "lfd f10, 144(r1)\n"
+    "lfd f9,  136(r1)\n"    "lfd f8,  128(r1)\n"
+    "lfd f7,  120(r1)\n"    "lfd f6,  112(r1)\n"
+    "lfd f5,  104(r1)\n"    "lfd f4,   96(r1)\n"
+    "lfd f3,   88(r1)\n"    "lfd f2,   80(r1)\n"
+    "lfd f1,   72(r1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "lwz  r1, 208(r1)\n"
+    "lwz  r2, 8(r1)\n"
+    "mtlr r2\n"
+    "bctr\n"
+    );
+
+#elif defined(__PPC__) && !defined(__ppc64__)
+// Linux & FreeBSD / PPC 32 support
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because we the prolog/epilog inserted by GCC won't work for us.  Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl PPC32CompilationCallback\n"
+"PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 8 doubles f[1-8] and the 
+    // FIXME: need to save v[0-19] for altivec?
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   8 double registers       -  64 bytes
+    //   8 int registers          -  32 bytes
+    "mflr 0\n"
+    "stw 0,  4(1)\n"
+    "stwu 1, -104(1)\n"
+    // Save all int arg registers
+    "stw 10, 100(1)\n"   "stw 9,  96(1)\n"
+    "stw 8,  92(1)\n"    "stw 7,  88(1)\n"
+    "stw 6,  84(1)\n"    "stw 5,  80(1)\n"
+    "stw 4,  76(1)\n"    "stw 3,  72(1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd 8,  64(1)\n"
+    "stfd 7,  56(1)\n"   "stfd 6,  48(1)\n"
+    "stfd 5,  40(1)\n"   "stfd 4,  32(1)\n"
+    "stfd 3,  24(1)\n"   "stfd 2,  16(1)\n"
+    "stfd 1,  8(1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
+    "mr   3, 0\n"
+    "lwz  5, 104(1)\n" // stub's frame
+    "lwz  4, 4(5)\n" // stub's lr
+    "li   5, 0\n"       // 0 == 32 bit
+    "bl LLVMPPCCompilationCallback\n"
+    "mtctr 3\n"
+    // Restore all int arg registers
+    "lwz 10, 100(1)\n"   "lwz 9,  96(1)\n"
+    "lwz 8,  92(1)\n"    "lwz 7,  88(1)\n"
+    "lwz 6,  84(1)\n"    "lwz 5,  80(1)\n"
+    "lwz 4,  76(1)\n"    "lwz 3,  72(1)\n"
+    // Restore all FP arg registers
+    "lfd 8,  64(1)\n"
+    "lfd 7,  56(1)\n"    "lfd 6,  48(1)\n"
+    "lfd 5,  40(1)\n"    "lfd 4,  32(1)\n"
+    "lfd 3,  24(1)\n"    "lfd 2,  16(1)\n"
+    "lfd 1,  8(1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "lwz  1, 104(1)\n"
+    "lwz  0, 4(1)\n"
+    "mtlr 0\n"
+    "bctr\n"
+    );
+#else
+void PPC32CompilationCallback() {
+  llvm_unreachable("This is not a power pc, you can't execute this!");
+}
+#endif
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+    defined(__ppc64__)
+#ifdef __ELF__
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl PPC64CompilationCallback\n"
+    ".section \".opd\",\"aw\",@progbits\n"
+    ".align 3\n"
+"PPC64CompilationCallback:\n"
+    ".quad .L.PPC64CompilationCallback,.TOC.@tocbase,0\n"
+    ".size PPC64CompilationCallback,24\n"
+    ".previous\n"
+    ".align 4\n"
+    ".type PPC64CompilationCallback,@function\n"
+".L.PPC64CompilationCallback:\n"
+#else
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl _PPC64CompilationCallback\n"
+"_PPC64CompilationCallback:\n"
+#endif
+    // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
+    // FIXME: need to save v[0-19] for altivec?
+    // Set up a proper stack frame
+    // Layout
+    //   PowerPC64 ABI linkage    -  48 bytes
+    //                 parameters -  64 bytes
+    //   13 double registers      - 104 bytes
+    //   8 int registers          -  64 bytes
+    "mflr 0\n"
+    "std  0,  16(1)\n"
+    "stdu 1, -280(1)\n"
+    // Save all int arg registers
+    "std 10, 272(1)\n"    "std 9,  264(1)\n"
+    "std 8,  256(1)\n"    "std 7,  248(1)\n"
+    "std 6,  240(1)\n"    "std 5,  232(1)\n"
+    "std 4,  224(1)\n"    "std 3,  216(1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd 13, 208(1)\n"    "stfd 12, 200(1)\n"
+    "stfd 11, 192(1)\n"    "stfd 10, 184(1)\n"
+    "stfd 9,  176(1)\n"    "stfd 8,  168(1)\n"
+    "stfd 7,  160(1)\n"    "stfd 6,  152(1)\n"
+    "stfd 5,  144(1)\n"    "stfd 4,  136(1)\n"
+    "stfd 3,  128(1)\n"    "stfd 2,  120(1)\n"
+    "stfd 1,  112(1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 1.
+    "mr   3, 0\n"      // return address (still in r0)
+    "ld   5, 280(1)\n" // stub's frame
+    "ld   4, 16(5)\n"  // stub's lr
+    "li   5, 1\n"      // 1 == 64 bit
+#ifdef __ELF__
+    "bl LLVMPPCCompilationCallback\n"
+    "nop\n"
+#else
+    "bl _LLVMPPCCompilationCallback\n"
+#endif
+    "mtctr 3\n"
+    // Restore all int arg registers
+    "ld 10, 272(1)\n"    "ld 9,  264(1)\n"
+    "ld 8,  256(1)\n"    "ld 7,  248(1)\n"
+    "ld 6,  240(1)\n"    "ld 5,  232(1)\n"
+    "ld 4,  224(1)\n"    "ld 3,  216(1)\n"
+    // Restore all FP arg registers
+    "lfd 13, 208(1)\n"    "lfd 12, 200(1)\n"
+    "lfd 11, 192(1)\n"    "lfd 10, 184(1)\n"
+    "lfd 9,  176(1)\n"    "lfd 8,  168(1)\n"
+    "lfd 7,  160(1)\n"    "lfd 6,  152(1)\n"
+    "lfd 5,  144(1)\n"    "lfd 4,  136(1)\n"
+    "lfd 3,  128(1)\n"    "lfd 2,  120(1)\n"
+    "lfd 1,  112(1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "ld  1, 280(1)\n"
+    "ld  0, 16(1)\n"
+    "mtlr 0\n"
+    // XXX: any special TOC handling in the ELF case for JIT?
+    "bctr\n"
+    );
+#else
+void PPC64CompilationCallback() {
+  llvm_unreachable("This is not a power pc, you can't execute this!");
+}
+#endif
+
+extern "C" {
+LLVM_LIBRARY_VISIBILITY void *
+LLVMPPCCompilationCallback(unsigned *StubCallAddrPlus4,
+                           unsigned *OrigCallAddrPlus4,
+                           bool is64Bit) {
+  // Adjust the pointer to the address of the call instruction in the stub
+  // emitted by emitFunctionStub, rather than the instruction after it.
+  unsigned *StubCallAddr = StubCallAddrPlus4 - 1;
+  unsigned *OrigCallAddr = OrigCallAddrPlus4 - 1;
+
+  void *Target = JITCompilerFunction(StubCallAddr);
+
+  // Check to see if *OrigCallAddr is a 'bl' instruction, and if we can rewrite
+  // it to branch directly to the destination.  If so, rewrite it so it does not
+  // need to go through the stub anymore.
+  unsigned OrigCallInst = *OrigCallAddr;
+  if ((OrigCallInst >> 26) == 18) {     // Direct call.
+    intptr_t Offset = ((intptr_t)Target - (intptr_t)OrigCallAddr) >> 2;
+    
+    if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
+      // Clear the original target out.
+      OrigCallInst &= (63 << 26) | 3;
+      // Fill in the new target.
+      OrigCallInst |= (Offset & ((1 << 24)-1)) << 2;
+      // Replace the call.
+      *OrigCallAddr = OrigCallInst;
+    }
+  }
+
+  // Assert that we are coming from a stub that was created with our
+  // emitFunctionStub.
+  if ((*StubCallAddr >> 26) == 18)
+    StubCallAddr -= 3;
+  else {
+  assert((*StubCallAddr >> 26) == 19 && "Call in stub is not indirect!");
+    StubCallAddr -= is64Bit ? 9 : 6;
+  }
+
+  // Rewrite the stub with an unconditional branch to the target, for any users
+  // who took the address of the stub.
+  EmitBranchToAt((intptr_t)StubCallAddr, (intptr_t)Target, false, is64Bit);
+  sys::Memory::InvalidateInstructionCache(StubCallAddr, 7*4);
+
+  // Put the address of the target function to call and the address to return to
+  // after calling the target function in a place that is easy to get on the
+  // stack after we restore all regs.
+  return Target;
+}
+}
+
+
+
+TargetJITInfo::LazyResolverFn
+PPCJITInfo::getLazyResolverFunction(JITCompilerFn Fn) {
+  JITCompilerFunction = Fn;
+  return is64Bit ? PPC64CompilationCallback : PPC32CompilationCallback;
+}
+
+TargetJITInfo::StubLayout PPCJITInfo::getStubLayout() {
+  // The stub contains up to 10 4-byte instructions, aligned at 4 bytes: 3
+  // instructions to save the caller's address if this is a lazy-compilation
+  // stub, plus a 1-, 4-, or 7-instruction sequence to load an arbitrary address
+  // into a register and jump through it.
+  StubLayout Result = {10*4, 4};
+  return Result;
+}
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+defined(__APPLE__)
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+#endif
+
+void *PPCJITInfo::emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE) {
+  // If this is just a call to an external function, emit a branch instead of a
+  // call.  The code is the same except for one bit of the last instruction.
+  if (Fn != (void*)(intptr_t)PPC32CompilationCallback && 
+      Fn != (void*)(intptr_t)PPC64CompilationCallback) {
+    void *Addr = (void*)JCE.getCurrentPCValue();
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    EmitBranchToAt((intptr_t)Addr, (intptr_t)Fn, false, is64Bit);
+    sys::Memory::InvalidateInstructionCache(Addr, 7*4);
+    return Addr;
+  }
+
+  void *Addr = (void*)JCE.getCurrentPCValue();
+  if (is64Bit) {
+    JCE.emitWordBE(0xf821ffb1);     // stdu r1,-80(r1)
+    JCE.emitWordBE(0x7d6802a6);     // mflr r11
+    JCE.emitWordBE(0xf9610060);     // std r11, 96(r1)
+  } else if (TM.getSubtargetImpl()->isDarwinABI()){
+    JCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
+    JCE.emitWordBE(0x7d6802a6);     // mflr r11
+    JCE.emitWordBE(0x91610028);     // stw r11, 40(r1)
+  } else {
+    JCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
+    JCE.emitWordBE(0x7d6802a6);     // mflr r11
+    JCE.emitWordBE(0x91610024);     // stw r11, 36(r1)
+  }
+  intptr_t BranchAddr = (intptr_t)JCE.getCurrentPCValue();
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  EmitBranchToAt(BranchAddr, (intptr_t)Fn, true, is64Bit);
+  sys::Memory::InvalidateInstructionCache(Addr, 10*4);
+  return Addr;
+}
+
+
+void PPCJITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
+    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+    switch ((PPC::RelocationType)MR->getRelocationType()) {
+    default: llvm_unreachable("Unknown relocation type!");
+    case PPC::reloc_pcrel_bx:
+      // PC-relative relocation for b and bl instructions.
+      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+      assert(ResultPtr >= -(1 << 23) && ResultPtr < (1 << 23) &&
+             "Relocation out of range!");
+      *RelocPos |= (ResultPtr & ((1 << 24)-1))  << 2;
+      break;
+    case PPC::reloc_pcrel_bcx:
+      // PC-relative relocation for BLT,BLE,BEQ,BGE,BGT,BNE, or other
+      // bcx instructions.
+      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+      assert(ResultPtr >= -(1 << 13) && ResultPtr < (1 << 13) &&
+             "Relocation out of range!");
+      *RelocPos |= (ResultPtr & ((1 << 14)-1))  << 2;
+      break;
+    case PPC::reloc_absolute_high:     // high bits of ref -> low 16 of instr
+    case PPC::reloc_absolute_low: {    // low bits of ref  -> low 16 of instr
+      ResultPtr += MR->getConstantVal();
+
+      // If this is a high-part access, get the high-part.
+      if (MR->getRelocationType() == PPC::reloc_absolute_high) {
+        // If the low part will have a carry (really a borrow) from the low
+        // 16-bits into the high 16, add a bit to borrow from.
+        if (((int)ResultPtr << 16) < 0)
+          ResultPtr += 1 << 16;
+        ResultPtr >>= 16;
+      }
+
+      // Do the addition then mask, so the addition does not overflow the 16-bit
+      // immediate section of the instruction.
+      unsigned LowBits  = (*RelocPos + ResultPtr) & 65535;
+      unsigned HighBits = *RelocPos & ~65535;
+      *RelocPos = LowBits | HighBits;  // Slam into low 16-bits
+      break;
+    }
+    case PPC::reloc_absolute_low_ix: {  // low bits of ref  -> low 14 of instr
+      ResultPtr += MR->getConstantVal();
+      // Do the addition then mask, so the addition does not overflow the 16-bit
+      // immediate section of the instruction.
+      unsigned LowBits  = (*RelocPos + ResultPtr) & 0xFFFC;
+      unsigned HighBits = *RelocPos & 0xFFFF0003;
+      *RelocPos = LowBits | HighBits;  // Slam into low 14-bits.
+      break;
+    }
+    }
+  }
+}
+
+void PPCJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  EmitBranchToAt((intptr_t)Old, (intptr_t)New, false, is64Bit);
+  sys::Memory::InvalidateInstructionCache(Old, 7*4);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.h
new file mode 100644
index 000000000000..46d4a08eb687
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCJITInfo.h
@@ -0,0 +1,49 @@
+//===-- PPCJITInfo.h - PowerPC impl. of the JIT interface -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_JITINFO_H
+#define POWERPC_JITINFO_H
+
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class PPCTargetMachine;
+
+  class PPCJITInfo : public TargetJITInfo {
+  protected:
+    PPCTargetMachine &TM;
+    bool is64Bit;
+  public:
+    PPCJITInfo(PPCTargetMachine &tm, bool tmIs64Bit) : TM(tm) {
+      useGOT = 0;
+      is64Bit = tmIs64Bit;
+    }
+
+    virtual StubLayout getStubLayout();
+    virtual void *emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE);
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+    
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
new file mode 100644
index 000000000000..9b0df3e86a75
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -0,0 +1,194 @@
+//===-- PPCMCInstLower.cpp - Convert PPC MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower PPC MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/Mangler.h"
+using namespace llvm;
+
+static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
+  return AP.MMI->getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
+  MCContext &Ctx = AP.OutContext;
+
+  SmallString<128> Name;
+  if (!MO.isGlobal()) {
+    assert(MO.isSymbol() && "Isn't a symbol reference");
+    Name += AP.MAI->getGlobalPrefix();
+    Name += MO.getSymbolName();
+  } else {    
+    const GlobalValue *GV = MO.getGlobal();
+    bool isImplicitlyPrivate = false;
+    if (MO.getTargetFlags() == PPCII::MO_DARWIN_STUB ||
+        (MO.getTargetFlags() & PPCII::MO_NLP_FLAG))
+      isImplicitlyPrivate = true;
+    
+    AP.Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+  }
+  
+  // If the target flags on the operand changes the name of the symbol, do that
+  // before we return the symbol.
+  if (MO.getTargetFlags() == PPCII::MO_DARWIN_STUB) {
+    Name += "$stub";
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI(AP).getFnStubEntry(Sym);
+    if (StubSym.getPointer())
+      return Sym;
+    
+    if (MO.isGlobal()) {
+      StubSym =
+      MachineModuleInfoImpl::
+      StubValueTy(AP.Mang->getSymbol(MO.getGlobal()),
+                  !MO.getGlobal()->hasInternalLinkage());
+    } else {
+      Name.erase(Name.end()-5, Name.end());
+      StubSym =
+      MachineModuleInfoImpl::
+      StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false);
+    }
+    return Sym;
+  }
+
+  // If the symbol reference is actually to a non_lazy_ptr, not to the symbol,
+  // then add the suffix.
+  if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) {
+    Name += "$non_lazy_ptr";
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+  
+    MachineModuleInfoMachO &MachO = getMachOMMI(AP);
+    
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      (MO.getTargetFlags() & PPCII::MO_NLP_HIDDEN_FLAG) ? 
+         MachO.getHiddenGVStubEntry(Sym) : MachO.getGVStubEntry(Sym);
+    
+    if (StubSym.getPointer() == 0) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym = MachineModuleInfoImpl::
+                   StubValueTy(AP.Mang->getSymbol(MO.getGlobal()),
+                               !MO.getGlobal()->hasInternalLinkage());
+    }
+    return Sym;
+  }
+  
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
+                              AsmPrinter &Printer, bool isDarwin) {
+  MCContext &Ctx = Printer.OutContext;
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+  unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK;
+
+  switch (access) {
+    case PPCII::MO_HA16: RefKind = isDarwin ? 
+                           MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : 
+                           MCSymbolRefExpr::VK_PPC_GAS_HA16; 
+                         break;
+    case PPCII::MO_LO16: RefKind = isDarwin ? 
+                           MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : 
+                           MCSymbolRefExpr::VK_PPC_GAS_LO16; 
+                         break;
+    case PPCII::MO_TPREL16_HA: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_HA;
+                               break;
+    case PPCII::MO_TPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_LO;
+                               break;
+    case PPCII::MO_DTPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_DTPREL16_LO;
+                                break;
+    case PPCII::MO_TLSLD16_LO: RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO;
+                               break;
+    case PPCII::MO_TOC16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TOC16_LO;
+                             break;
+   }
+
+  // FIXME: This isn't right, but we don't have a good way to express this in
+  // the MC Level, see below.
+  if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG)
+    RefKind = MCSymbolRefExpr::VK_None;
+  
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, RefKind, Ctx);
+
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+
+  // Subtract off the PIC base if required.
+  if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG) {
+    const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+    
+    const MCExpr *PB = MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+    Expr = MCBinaryExpr::CreateSub(Expr, PB, Ctx);
+    // FIXME: We have no way to make the result be VK_PPC_LO16/VK_PPC_HA16,
+    // since it is not a symbol!
+  }
+  
+  return MCOperand::CreateExpr(Expr);
+}
+
+void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+                                        AsmPrinter &AP, bool isDarwin) {
+  OutMI.setOpcode(MI->getOpcode());
+  
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      assert(!MO.getSubReg() && "Subregs should be eliminated!");
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                                      MO.getMBB()->getSymbol(), AP.OutContext));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, isDarwin);
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, isDarwin);
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, isDarwin);
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = GetSymbolRef(MO,AP.GetBlockAddressSymbol(MO.getBlockAddress()),AP,
+                          isDarwin);
+      break;
+    case MachineOperand::MO_RegisterMask:
+      continue;
+    }
+    
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..6a0aec842be7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -0,0 +1,15 @@
+//===-- PPCMachineFunctionInfo.cpp - Private data used for PowerPC --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void PPCFunctionInfo::anchor() { }
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
new file mode 100644
index 000000000000..ee18eadf6e5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -0,0 +1,162 @@
+//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_MACHINE_FUNCTION_INFO_H
+#define PPC_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// PPCFunctionInfo - This class is derived from MachineFunction private
+/// PowerPC target-specific information for each MachineFunction.
+class PPCFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// FramePointerSaveIndex - Frame index of where the old frame pointer is
+  /// stored.  Also used as an anchor for instructions that need to be altered
+  /// when using frame pointers (dyna_add, dyna_sub.)
+  int FramePointerSaveIndex;
+  
+  /// ReturnAddrSaveIndex - Frame index of where the return address is stored.
+  ///
+  int ReturnAddrSaveIndex;
+
+  /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
+  /// function.  This is only valid after the initial scan of the function by
+  /// PEI.
+  bool MustSaveLR;
+
+  /// Does this function have any stack spills.
+  bool HasSpills;
+
+  /// Does this function spill using instructions with only r+r (not r+i)
+  /// forms.
+  bool HasNonRISpills;
+
+  /// SpillsCR - Indicates whether CR is spilled in the current function.
+  bool SpillsCR;
+
+  /// Indicates whether VRSAVE is spilled in the current function.
+  bool SpillsVRSAVE;
+
+  /// LRStoreRequired - The bool indicates whether there is some explicit use of
+  /// the LR/LR8 stack slot that is not obvious from scanning the code.  This
+  /// requires that the code generator produce a store of LR to the stack on
+  /// entry, even though LR may otherwise apparently not be used.
+  bool LRStoreRequired;
+
+  /// MinReservedArea - This is the frame size that is at least reserved in a
+  /// potential caller (parameter+linkage area).
+  unsigned MinReservedArea;
+
+  /// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum
+  /// amount the stack pointer is adjusted to make the frame bigger for tail
+  /// calls. Used for creating an area before the register spill area.
+  int TailCallSPDelta;
+
+  /// HasFastCall - Does this function contain a fast call. Used to determine
+  /// how the caller's stack pointer should be calculated (epilog/dynamicalloc).
+  bool HasFastCall;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+  /// VarArgsStackOffset - StackOffset for start of stack
+  /// arguments.
+  int VarArgsStackOffset;
+  /// VarArgsNumGPR - Index of the first unused integer
+  /// register for parameter passing.
+  unsigned VarArgsNumGPR;
+  /// VarArgsNumFPR - Index of the first unused double
+  /// register for parameter passing.
+  unsigned VarArgsNumFPR;
+
+  /// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4.
+  int CRSpillFrameIndex;
+
+public:
+  explicit PPCFunctionInfo(MachineFunction &MF) 
+    : FramePointerSaveIndex(0),
+      ReturnAddrSaveIndex(0),
+      HasSpills(false),
+      HasNonRISpills(false),
+      SpillsCR(false),
+      SpillsVRSAVE(false),
+      LRStoreRequired(false),
+      MinReservedArea(0),
+      TailCallSPDelta(0),
+      HasFastCall(false),
+      VarArgsFrameIndex(0),
+      VarArgsStackOffset(0),
+      VarArgsNumGPR(0),
+      VarArgsNumFPR(0),
+      CRSpillFrameIndex(0) {}
+
+  int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+  void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+  
+  int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; }
+  void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; }
+
+  unsigned getMinReservedArea() const { return MinReservedArea; }
+  void setMinReservedArea(unsigned size) { MinReservedArea = size; }
+
+  int getTailCallSPDelta() const { return TailCallSPDelta; }
+  void setTailCallSPDelta(int size) { TailCallSPDelta = size; }
+
+  /// MustSaveLR - This is set when the prolog/epilog inserter does its initial
+  /// scan of the function. It is true if the LR/LR8 register is ever explicitly
+  /// defined/clobbered in the machine function (e.g. by calls and movpctolr,
+  /// which is used in PIC generation), or if the LR stack slot is explicitly
+  /// referenced by builtin_return_address.
+  void setMustSaveLR(bool U) { MustSaveLR = U; }
+  bool mustSaveLR() const    { return MustSaveLR; }
+
+  void setHasSpills()      { HasSpills = true; }
+  bool hasSpills() const   { return HasSpills; }
+
+  void setHasNonRISpills()    { HasNonRISpills = true; }
+  bool hasNonRISpills() const { return HasNonRISpills; }
+
+  void setSpillsCR()       { SpillsCR = true; }
+  bool isCRSpilled() const { return SpillsCR; }
+
+  void setSpillsVRSAVE()       { SpillsVRSAVE = true; }
+  bool isVRSAVESpilled() const { return SpillsVRSAVE; }
+
+  void setLRStoreRequired() { LRStoreRequired = true; }
+  bool isLRStoreRequired() const { return LRStoreRequired; }
+
+  void setHasFastCall() { HasFastCall = true; }
+  bool hasFastCall() const { return HasFastCall;}
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
+
+  int getVarArgsStackOffset() const { return VarArgsStackOffset; }
+  void setVarArgsStackOffset(int Offset) { VarArgsStackOffset = Offset; }
+
+  unsigned getVarArgsNumGPR() const { return VarArgsNumGPR; }
+  void setVarArgsNumGPR(unsigned Num) { VarArgsNumGPR = Num; }
+
+  unsigned getVarArgsNumFPR() const { return VarArgsNumFPR; }
+  void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; }
+
+  int getCRSpillFrameIndex() const { return CRSpillFrameIndex; }
+  void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; }
+};
+
+} // end of namespace llvm
+
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h
new file mode 100644
index 000000000000..17b836d1ed97
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle without using vperm.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 292 entries have cost 1
+// 1384 entries have cost 2
+// 3061 entries have cost 3
+// 1733 entries have cost 4
+// 60 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  202162278U,	// <0,0,0,0>: Cost 1 vspltisw0 LHS
+  1140850790U,	// <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS
+  2617247181U,	// <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0>
+  2635163787U,	// <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0>
+  1543507254U,	// <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS
+  2281701705U,	// <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5>
+  2617250133U,	// <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0>
+  2659054575U,	// <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,0,u>: Cost 1 vspltisw0 LHS
+  1141686282U,	// <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1>
+  67944550U,	// <0,0,1,1>: Cost 1 vmrghw LHS, LHS
+  1685241958U,	// <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2215870716U,	// <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1141727570U,	// <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  2215428562U,	// <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7>
+  2215428589U,	// <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7>
+  2659062768U,	// <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1>
+  67945117U,	// <0,0,1,u>: Cost 1 vmrghw LHS, LHS
+  2684356045U,	// <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0>
+  2216009830U,	// <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2216009901U,	// <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2>
+  2698290853U,	// <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0>
+  3289751890U,	// <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5>
+  3758098275U,	// <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1>
+  2684356538U,	// <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7>
+  3758098410U,	// <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1>
+  2216010397U,	// <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2702272651U,	// <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0>
+  2216656998U,	// <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  3844669704U,	// <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3>
+  2216657148U,	// <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0>
+  2684357122U,	// <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6>
+  3732820066U,	// <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0>
+  3778005624U,	// <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7>
+  3374713464U,	// <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7>
+  2216657565U,	// <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217361408U,	// <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0>
+  1143619686U,	// <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  3291103405U,	// <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2>
+  3827269988U,	// <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5>
+  1143619922U,	// <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1610616118U,	// <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  3758099833U,	// <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2>
+  3854107016U,	// <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5>
+  1143620253U,	// <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2284396544U,	// <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0>
+  2218025062U,	// <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS
+  3758100203U,	// <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3>
+  3395966100U,	// <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3>
+  3804549052U,	// <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5>
+  2302314964U,	// <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5>
+  2785821138U,	// <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  3395966428U,	// <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7>
+  2787148260U,	// <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7>
+  2684358997U,	// <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0>
+  2218631270U,	// <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2684359162U,	// <0,0,6,2>: Cost 3 vsldoi8 <0,0,0,0>, <6,2,7,3>
+  3758101042U,	// <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5>
+  3732843830U,	// <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS
+  3758101227U,	// <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1>
+  2684359480U,	// <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6>
+  2724836173U,	// <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0>
+  2725499806U,	// <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0>
+  2726163439U,	// <0,0,7,0>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  2219311206U,	// <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS
+  3868557900U,	// <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3>
+  3377400112U,	// <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3>
+  2684360038U,	// <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6>
+  3732852834U,	// <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0>
+  3871507060U,	// <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7>
+  2303658616U,	// <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7>
+  2726163439U,	// <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,u,0>: Cost 1 vspltisw0 LHS
+  72589414U,	// <0,0,u,1>: Cost 1 vmrghw LHS, LHS
+  1685242525U,	// <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2220073212U,	// <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1146331474U,	// <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  1610619034U,	// <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  2785821138U,	// <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  2659120119U,	// <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u>
+  72589981U,	// <0,0,u,u>: Cost 1 vmrghw LHS, LHS
+  2698297344U,	// <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0>
+  1624555622U,	// <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  2758984428U,	// <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1>
+  2635237524U,	// <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0>
+  2693652818U,	// <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5>
+  2281701714U,	// <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5>
+  2698297846U,	// <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7>
+  2659128312U,	// <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0>
+  1624556189U,	// <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  1543585802U,	// <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1>
+  1141728052U,	// <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1141728150U,	// <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2295644334U,	// <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3>
+  1543589174U,	// <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS
+  2290999634U,	// <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5>
+  2617332135U,	// <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1>
+  2617332720U,	// <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1>
+  1142171004U,	// <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0>
+  1561509990U,	// <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS
+  2623308516U,	// <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2>
+  2698298984U,	// <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2>
+  835584U,	// <0,1,2,3>: Cost 0 copy LHS
+  1561513270U,	// <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS
+  2647199304U,	// <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2>
+  2698299322U,	// <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7>
+  1585402874U,	// <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2>
+  835584U,	// <0,1,2,u>: Cost 0 copy LHS
+  2698299540U,	// <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0>
+  3290399540U,	// <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1>
+  2698299720U,	// <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0>
+  2698299804U,	// <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3>
+  2698299906U,	// <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6>
+  3832726521U,	// <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0>
+  2724842160U,	// <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0>
+  2706926275U,	// <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1>
+  2698300190U,	// <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2>
+  2635268198U,	// <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS
+  2217362228U,	// <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1>
+  2217362326U,	// <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0>
+  2635270296U,	// <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4>
+  2635271478U,	// <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS
+  1624558902U,	// <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2659160910U,	// <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1>
+  2659161084U,	// <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4>
+  1624559145U,	// <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  3832726639U,	// <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1>
+  2714889871U,	// <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1>
+  2302314646U,	// <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  3834717321U,	// <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0>
+  3832726679U,	// <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5>
+  2717544403U,	// <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1>
+  2718208036U,	// <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1>
+  3792613493U,	// <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1>
+  2719535302U,	// <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1>
+  2659172454U,	// <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS
+  3832726735U,	// <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7>
+  2724844026U,	// <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3>
+  3775361608U,	// <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0>
+  2659175734U,	// <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS
+  3832726771U,	// <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7>
+  2724844344U,	// <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6>
+  1651102542U,	// <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1>
+  1651766175U,	// <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1>
+  2724844536U,	// <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0>
+  3377397770U,	// <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1>
+  2698302636U,	// <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0>
+  2728162531U,	// <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1>
+  2724844902U,	// <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6>
+  3377398098U,	// <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5>
+  2724845076U,	// <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0>
+  2724845164U,	// <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7>
+  2724845186U,	// <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2>
+  1561559142U,	// <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS
+  1146331956U,	// <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1146332054U,	// <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  835584U,	// <0,1,u,3>: Cost 0 copy LHS
+  1561562422U,	// <0,1,u,4>: Cost 2 vsldoi4 <3,0,1,u>, RHS
+  1624561818U,	// <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2220074191U,	// <0,1,u,6>: Cost 3 vmrghw LHS, <1,6,1,7>
+  1585452032U,	// <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u>
+  835584U,	// <0,1,u,u>: Cost 0 copy LHS
+  2214593997U,	// <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0>
+  2214675999U,	// <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1>
+  2214594152U,	// <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2>
+  1207959654U,	// <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  3709054262U,	// <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS
+  3375350836U,	// <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5>
+  2214594490U,	// <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7>
+  3288336362U,	// <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1>
+  1207959659U,	// <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS
+  2215871994U,	// <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2215470623U,	// <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1141728872U,	// <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1141728934U,	// <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2215872323U,	// <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2215872405U,	// <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1141729210U,	// <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2215430122U,	// <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1141729368U,	// <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3>
+  3289736698U,	// <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0>
+  3289744927U,	// <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1>
+  2216011368U,	// <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2>
+  2216019622U,	// <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1>
+  3289769795U,	// <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5>
+  3289778069U,	// <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6>
+  2216044474U,	// <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7>
+  3732960259U,	// <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2>
+  2216061016U,	// <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3>
+  2758985382U,	// <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1>
+  2758985392U,	// <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2>
+  3290400360U,	// <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2>
+  2758985408U,	// <0,2,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0>
+  2758985422U,	// <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5>
+  2785822424U,	// <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6>
+  3290400698U,	// <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7>
+  2765915876U,	// <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0>
+  2758985453U,	// <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0>
+  3291104762U,	// <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0>
+  2217362979U,	// <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2217363048U,	// <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2>
+  2217363110U,	// <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1>
+  3291105087U,	// <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1>
+  3291105173U,	// <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6>
+  2217363386U,	// <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7>
+  3788639688U,	// <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0>
+  2217363515U,	// <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1>
+  3376054371U,	// <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0>
+  3788639888U,	// <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2>
+  3376055912U,	// <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2>
+  2302312550U,	// <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3376054375U,	// <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4>
+  3374728244U,	// <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5>
+  3805229154U,	// <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0>
+  3376055512U,	// <0,2,5,7>: Cost 4 vmrglw <3,4,0,5>, <1,6,2,7>
+  2302312555U,	// <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3709100134U,	// <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS
+  3709100950U,	// <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0>
+  3709102010U,	// <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7>
+  2758985658U,	// <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7>
+  3709103414U,	// <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS
+  3732992098U,	// <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0>
+  3292374970U,	// <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7>
+  3798594383U,	// <0,2,6,7>: Cost 4 vsldoi8 <6,7,0,2>, <6,7,0,2>
+  2758985703U,	// <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7>
+  3788641274U,	// <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2>
+  3377398508U,	// <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1>
+  3377398590U,	// <0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2>
+  2303656038U,	// <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  3709111606U,	// <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS
+  3377398836U,	// <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5>
+  3803903447U,	// <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2>
+  3293054954U,	// <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1>
+  2303656043U,	// <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2220074490U,	// <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2220074527U,	// <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1146332776U,	// <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1146332838U,	// <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2220074819U,	// <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2220074901U,	// <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1146333114U,	// <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2220074986U,	// <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1146333243U,	// <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1>
+  2629410816U,	// <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0>
+  2753530006U,	// <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2>
+  2629412301U,	// <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0>
+  2214594972U,	// <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3>
+  2758985908U,	// <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5>
+  3733016674U,	// <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0>
+  3777364488U,	// <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7>
+  2281703354U,	// <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7>
+  2758985941U,	// <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2>
+  1141729430U,	// <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2215471334U,	// <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2215471425U,	// <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1141729692U,	// <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1141729794U,	// <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2215430738U,	// <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5>
+  2215430776U,	// <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7>
+  2295646138U,	// <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7>
+  1141730078U,	// <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2758986032U,	// <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3>
+  3709141910U,	// <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0>
+  3289753921U,	// <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2>
+  2770929992U,	// <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0>
+  3289754114U,	// <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6>
+  3362095460U,	// <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5>
+  3832727910U,	// <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3>
+  3365414842U,	// <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7>
+  2771298677U,	// <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0>
+  2216659094U,	// <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2>
+  3290409190U,	// <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, <3,1,1,1>
+  2703624496U,	// <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3>
+  2216683932U,	// <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3>
+  2216692226U,	// <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6>
+  3733041250U,	// <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0>
+  3832727988U,	// <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0>
+  3374712762U,	// <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7>
+  2216725278U,	// <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2>
+  2217363606U,	// <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2>
+  3291105510U,	// <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1>
+  3291105601U,	// <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2>
+  2217363868U,	// <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3>
+  2217363970U,	// <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6>
+  2758986242U,	// <0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6>
+  3727077685U,	// <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4>
+  3364767674U,	// <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7>
+  2217364254U,	// <0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2>
+  3832728102U,	// <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6>
+  3405916003U,	// <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1>
+  3376055840U,	// <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2>
+  3376055679U,	// <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3>
+  3376055194U,	// <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4>
+  3859565138U,	// <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5>
+  2727514210U,	// <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  3376056250U,	// <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7>
+  2727514210U,	// <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  2758986360U,	// <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  3709174678U,	// <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0>
+  3795284411U,	// <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3>
+  3709175980U,	// <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6>
+  3833096860U,	// <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7>
+  3376728235U,	// <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5>
+  3859565229U,	// <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6>
+  2773879472U,	// <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0>
+  2758986360U,	// <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  2303656854U,	// <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0>
+  3807229018U,	// <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u>
+  2727515284U,	// <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3>
+  3377399410U,	// <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3>
+  3377398682U,	// <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4>
+  3801257409U,	// <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7>
+  3377399980U,	// <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6>
+  3375409082U,	// <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7>
+  2731497082U,	// <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3>
+  1146333334U,	// <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2220075238U,	// <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2220075329U,	// <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1146333596U,	// <0,3,u,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1146333698U,	// <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2758986566U,	// <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6>
+  2803739472U,	// <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7>
+  2295703482U,	// <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7>
+  1146333982U,	// <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2214595473U,	// <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0>
+  2693677158U,	// <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS
+  3839437689U,	// <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3>
+  3709200559U,	// <0,4,0,3>: Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0>
+  2693677394U,	// <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5>
+  1140854070U,	// <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  3767419409U,	// <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7>
+  3854109604U,	// <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1>
+  1140854313U,	// <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS
+  1141689234U,	// <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2215431114U,	// <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3>
+  2215431221U,	// <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635466928U,	// <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1>
+  1141689552U,	// <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  67947830U,	// <0,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2215431545U,	// <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2659357716U,	// <0,4,1,7>: Cost 3 vsldoi4 <7,0,4,1>, <7,0,4,1>
+  67948073U,	// <0,4,1,u>: Cost 1 vmrghw LHS, RHS
+  3767420369U,	// <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4>
+  3767420451U,	// <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5>
+  3767420520U,	// <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2>
+  2698323625U,	// <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4>
+  3709218102U,	// <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS
+  2216013110U,	// <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767420858U,	// <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7>
+  3774719981U,	// <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4>
+  2216013353U,	// <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767421078U,	// <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2>
+  3776710880U,	// <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4>
+  3833097325U,	// <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4>
+  3767421340U,	// <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3>
+  3767421442U,	// <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6>
+  2216660278U,	// <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  3833097361U,	// <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4>
+  3780692678U,	// <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4>
+  2216660521U,	// <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2617573416U,	// <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4>
+  2217364450U,	// <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0>
+  3691316771U,	// <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5>
+  3709233331U,	// <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4>
+  2785823952U,	// <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4>
+  1143622966U,	// <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  3691319723U,	// <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5>
+  3854109932U,	// <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5>
+  1143623209U,	// <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2635497574U,	// <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS
+  2635498390U,	// <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0>
+  3709240936U,	// <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2>
+  2635499700U,	// <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5>
+  2635500854U,	// <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS
+  2785824044U,	// <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6>
+  1685245238U,	// <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659390488U,	// <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5>
+  1685245256U,	// <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  3839438161U,	// <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7>
+  3798610347U,	// <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5>
+  3798610426U,	// <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3>
+  3795956237U,	// <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4>
+  3733138742U,	// <0,4,6,4>: Cost 4 vsldoi4 <7,0,4,6>, RHS
+  2218634550U,	// <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS
+  3798610744U,	// <0,4,6,6>: Cost 4 vsldoi8 <6,7,0,4>, <6,6,6,6>
+  2724868945U,	// <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4>
+  2725532578U,	// <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4>
+  3383371465U,	// <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0>
+  3800601668U,	// <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4>
+  3775386826U,	// <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3>
+  3801928934U,	// <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4>
+  3721202998U,	// <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS
+  2780368328U,	// <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  3383372686U,	// <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6>
+  3854110170U,	// <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0>
+  2780368328U,	// <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  1146334098U,	// <0,4,u,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2220076002U,	// <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0>
+  2220076085U,	// <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635524279U,	// <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u>
+  1146334416U,	// <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  72592694U,	// <0,4,u,5>: Cost 1 vmrghw LHS, RHS
+  1685245481U,	// <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659415067U,	// <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u>
+  72592937U,	// <0,4,u,u>: Cost 1 vmrghw LHS, RHS
+  2281704337U,	// <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0>
+  2704965734U,	// <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  3778707666U,	// <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3>
+  3778707708U,	// <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0>
+  2687050057U,	// <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5>
+  2214596612U,	// <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5>
+  2785824372U,	// <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1>
+  3854110332U,	// <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0>
+  2704966301U,	// <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  1567768678U,	// <0,5,1,0>: Cost 2 vsldoi4 <4,0,5,1>, LHS
+  2312236570U,	// <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1>
+  2215431915U,	// <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641512598U,	// <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2>
+  1567771538U,	// <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1>
+  1141690372U,	// <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1141690466U,	// <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2641515514U,	// <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2>
+  1141690615U,	// <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5>
+  3772736973U,	// <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0>
+  3778709024U,	// <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2>
+  3778709096U,	// <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2>
+  3778709158U,	// <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1>
+  3772737275U,	// <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5>
+  3859566351U,	// <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3>
+  3778709434U,	// <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7>
+  3805251562U,	// <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1>
+  3775391807U,	// <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5>
+  2704967830U,	// <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2>
+  3776719073U,	// <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5>
+  3777382706U,	// <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5>
+  3778709887U,	// <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1>
+  2704968148U,	// <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5>
+  3857428317U,	// <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0>
+  3364096514U,	// <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6>
+  3780700871U,	// <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5>
+  2707622680U,	// <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, <3,u,0,5>
+  2728856466U,	// <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1>
+  3697361674U,	// <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4>
+  3697362601U,	// <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4>
+  3364766635U,	// <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3>
+  2217365428U,	// <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, <5,4,5,6>
+  2704969014U,	// <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  2785824700U,	// <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5>
+  3364766963U,	// <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7>
+  2704969257U,	// <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  3846148050U,	// <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0>
+  2326203282U,	// <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1>
+  3291746027U,	// <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3>
+  3376054482U,	// <0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3>
+  3790655366U,	// <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5>
+  2785824772U,	// <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5>
+  2724876386U,	// <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0>
+  3858903057U,	// <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0>
+  2736820484U,	// <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0>
+  2659467366U,	// <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS
+  3859566643U,	// <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7>
+  3798618618U,	// <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3>
+  3852857410U,	// <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4>
+  2659470646U,	// <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS
+  2659471458U,	// <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  3832729696U,	// <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7>
+  1712083042U,	// <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0>
+  1712156779U,	// <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0>
+  2731512826U,	// <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2>
+  3859566717U,	// <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0>
+  3798619284U,	// <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3>
+  3778712803U,	// <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1>
+  2728858936U,	// <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5>
+  3859566753U,	// <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0>
+  3377398135U,	// <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6>
+  3798619686U,	// <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0>
+  2731513468U,	// <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, <7,u,0,5>
+  1567826022U,	// <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS
+  2704971566U,	// <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  2220076779U,	// <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641569942U,	// <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2>
+  1567828889U,	// <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u>
+  1146335236U,	// <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1146335330U,	// <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  1713410308U,	// <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0>
+  1713484045U,	// <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0>
+  2214596949U,	// <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0>
+  2214678951U,	// <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1>
+  2214597114U,	// <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3>
+  3852857653U,	// <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4>
+  3832729919U,	// <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5>
+  3721293427U,	// <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0>
+  2214597432U,	// <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6>
+  1207962934U,	// <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  1207962935U,	// <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS
+  2215432481U,	// <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2>
+  2215432615U,	// <0,6,1,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1141690874U,	// <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2215432754U,	// <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2215432817U,	// <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5>
+  2215432939U,	// <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1>
+  1141691192U,	// <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221905718U,	// <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  1221905719U,	// <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS
+  3852857787U,	// <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3>
+  3289764265U,	// <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3>
+  3289690618U,	// <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3>
+  3862589907U,	// <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0>
+  3733253430U,	// <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS
+  3733254242U,	// <0,6,2,5>: Cost 4 vsldoi4 <7,0,6,2>, <5,6,7,0>
+  3777390522U,	// <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7>
+  2785825274U,	// <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3>
+  2785825283U,	// <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3>
+  3777390742U,	// <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2>
+  3863106066U,	// <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0>
+  3777390899U,	// <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6>
+  3290436146U,	// <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5>
+  3779381762U,	// <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6>
+  3779381798U,	// <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6>
+  3733262920U,	// <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0>
+  2300972342U,	// <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2300972343U,	// <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS
+  3802606482U,	// <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1>
+  2217365931U,	// <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2217366010U,	// <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3>
+  3291107890U,	// <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5>
+  3291099805U,	// <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4>
+  3777391926U,	// <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS
+  2217366328U,	// <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6>
+  2291027254U,	// <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  2291027255U,	// <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS
+  3852858033U,	// <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6>
+  3395964532U,	// <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1>
+  3864507069U,	// <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0>
+  3376056678U,	// <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3>
+  3721334070U,	// <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS
+  3395964860U,	// <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5>
+  3864802017U,	// <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0>
+  2302315830U,	// <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  2302315831U,	// <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS
+  3852858108U,	// <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,6,0,0>
+  3398624745U,	// <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1>
+  2218668538U,	// <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3>
+  3292418610U,	// <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5>
+  3733286198U,	// <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS
+  3797299889U,	// <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6>
+  2785825592U,	// <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6>
+  2785825602U,	// <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7>
+  2785825611U,	// <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7>
+  2785825614U,	// <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1>
+  2758988632U,	// <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2>
+  3377400084U,	// <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2>
+  2792166248U,	// <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0>
+  2785825654U,	// <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5>
+  2785825664U,	// <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  3859567493U,	// <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2>
+  2303659318U,	// <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303659319U,	// <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2785825695U,	// <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1>
+  2220077479U,	// <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1146335738U,	// <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2792829881U,	// <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0>
+  2785825735U,	// <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5>
+  2785825664U,	// <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  1146336056U,	// <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221963062U,	// <0,6,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  1221963063U,	// <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS
+  2653593600U,	// <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0>
+  2706309222U,	// <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  3709421498U,	// <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7>
+  2281705978U,	// <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3>
+  2785825816U,	// <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5>
+  2785825826U,	// <0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6>
+  2653598037U,	// <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0>
+  2214598252U,	// <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7>
+  2706309789U,	// <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  1141691386U,	// <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2215433290U,	// <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1>
+  2706310038U,	// <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0>
+  2322190842U,	// <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3>
+  1141691750U,	// <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2215433654U,	// <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5>
+  2653606230U,	// <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1>
+  1141692012U,	// <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1141692034U,	// <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  2785825940U,	// <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3>
+  3768108576U,	// <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2>
+  3780052584U,	// <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2>
+  2794820780U,	// <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0>
+  3859641528U,	// <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3>
+  3733327970U,	// <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0>
+  3778062266U,	// <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7>
+  3733328944U,	// <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2>
+  2795189465U,	// <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0>
+  2324861026U,	// <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0>
+  3780053233U,	// <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3>
+  3780053296U,	// <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3>
+  3778062725U,	// <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7>
+  3780053506U,	// <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6>
+  3803941469U,	// <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7>
+  2706311800U,	// <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7>
+  3398603586U,	// <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7>
+  2707639066U,	// <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7>
+  2217366522U,	// <0,7,4,0>: Cost 3 vmrghw <0,4,1,5>, <7,0,1,2>
+  3727369110U,	// <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0>
+  3291108500U,	// <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3>
+  3727370872U,	// <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7>
+  2217366886U,	// <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6>
+  2706312502U,	// <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  3786026321U,	// <0,7,4,6>: Cost 4 vsldoi8 <4,6,0,7>, <4,6,0,7>
+  2217367148U,	// <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7>
+  2706312745U,	// <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2322223202U,	// <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0>
+  3399946987U,	// <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1>
+  3291780244U,	// <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3>
+  3727378582U,	// <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2>
+  3727379766U,	// <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS
+  3859568054U,	// <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5>
+  2785826241U,	// <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7>
+  3395965762U,	// <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7>
+  2787153363U,	// <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7>
+  2785826268U,	// <0,7,6,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,6,0,7>
+  3780055420U,	// <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3>
+  3859568110U,	// <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7>
+  3874534903U,	// <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7>
+  3859641856U,	// <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7>
+  3733360738U,	// <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0>
+  3859568145U,	// <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6>
+  2797770260U,	// <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0>
+  2797843997U,	// <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0>
+  2785826342U,	// <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0>
+  3727393686U,	// <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0>
+  3868563003U,	// <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3>
+  3377397988U,	// <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3>
+  2219349350U,	// <0,7,7,4>: Cost 3 vmrghw <0,7,1,4>, <7,4,5,6>
+  3859568217U,	// <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6>
+  2730202588U,	// <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7>
+  2785826412U,	// <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7>
+  2731529854U,	// <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7>
+  1146336250U,	// <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2706315054U,	// <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  2653660845U,	// <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u>
+  2322248186U,	// <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3>
+  1146336614U,	// <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2706315418U,	// <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2653663581U,	// <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u>
+  1146336876U,	// <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1146336898U,	// <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  202162278U,	// <0,u,0,0>: Cost 1 vspltisw0 LHS
+  1624612966U,	// <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS
+  2629780986U,	// <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0>
+  1207959708U,	// <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  1544097078U,	// <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS
+  1140856986U,	// <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  2698355253U,	// <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7>
+  1207962952U,	// <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  202162278U,	// <0,u,0,u>: Cost 1 vspltisw0 LHS
+  1142134483U,	// <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2>
+  67950382U,	// <0,u,1,1>: Cost 1 vmrghw LHS, LHS
+  1142175624U,	// <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  1142175676U,	// <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1>
+  1142134847U,	// <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  67950746U,	// <0,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1142175952U,	// <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221905736U,	// <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  67950949U,	// <0,u,1,u>: Cost 1 vmrghw LHS, LHS
+  1562026086U,	// <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS
+  2216015662U,	// <0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2698356328U,	// <0,u,2,2>: Cost 3 vsldoi8 <2,3,0,u>, <2,2,2,2>
+  835584U,	// <0,u,2,3>: Cost 0 copy LHS
+  1562029366U,	// <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS
+  2216016026U,	// <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  2698356666U,	// <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7>
+  1585919033U,	// <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2>
+  835584U,	// <0,u,2,u>: Cost 0 copy LHS
+  2758989756U,	// <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1>
+  2216662830U,	// <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2703665461U,	// <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u>
+  2758989782U,	// <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0>
+  2758989796U,	// <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5>
+  2216663194U,	// <0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2706319993U,	// <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u>
+  2300972360U,	// <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2216663397U,	// <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217367251U,	// <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2>
+  1143625518U,	// <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2217367432U,	// <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3>
+  2217367484U,	// <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1>
+  1143619922U,	// <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1143625882U,	// <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2217367760U,	// <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7>
+  2291027272U,	// <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  1143626085U,	// <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2635792486U,	// <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS
+  2635793302U,	// <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0>
+  2302314646U,	// <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  2635794648U,	// <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5>
+  2635795766U,	// <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS
+  2717601754U,	// <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u>
+  1685248154U,	// <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2302315848U,	// <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  1685248172U,	// <0,u,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2759358645U,	// <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7>
+  2218637102U,	// <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2724901370U,	// <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3>
+  2758990032U,	// <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7>
+  2659691830U,	// <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS
+  2659471458U,	// <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  2724901688U,	// <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6>
+  1651159893U,	// <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u>
+  1651823526U,	// <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u>
+  2785827072U,	// <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1>
+  2803964168U,	// <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0>
+  2727556249U,	// <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u>
+  2303656092U,	// <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2785827112U,	// <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5>
+  2785827122U,	// <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6>
+  2730210781U,	// <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u>
+  2303659336U,	// <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303656097U,	// <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  202162278U,	// <0,u,u,0>: Cost 1 vspltisw0 LHS
+  72595246U,	// <0,u,u,1>: Cost 1 vmrghw LHS, LHS
+  1146337160U,	// <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  835584U,	// <0,u,u,3>: Cost 0 copy LHS
+  1146337343U,	// <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  72595610U,	// <0,u,u,5>: Cost 1 vmrghw LHS, RHS
+  1146337488U,	// <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221963080U,	// <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  835584U,	// <0,u,u,u>: Cost 0 copy LHS
+  2756853760U,	// <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0>
+  1677803530U,	// <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1>
+  3759497387U,	// <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0>
+  2686419196U,	// <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0>
+  2751766565U,	// <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1>
+  2687746462U,	// <1,0,0,5>: Cost 3 vsldoi8 <0,5,1,0>, <0,5,1,0>
+  3776086518U,	// <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7>
+  2689073728U,	// <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0>
+  1678319689U,	// <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1>
+  2287091712U,	// <1,0,1,0>: Cost 3 vmrglw <0,u,1,1>, <0,0,0,0>
+  1147568230U,	// <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS
+  1683112038U,	// <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  3294970108U,	// <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0>
+  2623892790U,	// <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS
+  2647781007U,	// <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1>
+  2791948430U,	// <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  3721524218U,	// <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2>
+  1683112092U,	// <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2222112768U,	// <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0>
+  1148371046U,	// <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  3356862524U,	// <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2>
+  2702345894U,	// <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1>
+  2222113106U,	// <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5>
+  2299709908U,	// <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  3760162746U,	// <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7>
+  3369470584U,	// <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7>
+  1148371613U,	// <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  2686421142U,	// <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2>
+  2283128486U,	// <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1>
+  3296305326U,	// <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3>
+  3760163199U,	// <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1>
+  3760163330U,	// <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6>
+  3779406377U,	// <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0>
+  3865690416U,	// <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7>
+  3366824568U,	// <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7>
+  2707655452U,	// <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0>
+  2734861202U,	// <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1>
+  2756854098U,	// <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5>
+  3830595931U,	// <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5>
+  3296968960U,	// <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4>
+  3830595949U,	// <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5>
+  2686422326U,	// <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  3297378806U,	// <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7>
+  3810594248U,	// <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0>
+  2686422569U,	// <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2284470272U,	// <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0>
+  2284471974U,	// <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1>
+  3809267435U,	// <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3>
+  3297968384U,	// <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4>
+  2284471977U,	// <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4>
+  3721555603U,	// <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5>
+  3792679010U,	// <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0>
+  3792679037U,	// <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0>
+  2284471981U,	// <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u>
+  3356893184U,	// <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0>
+  2224676966U,	// <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS
+  3298295985U,	// <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6>
+  3298345212U,	// <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0>
+  2224972114U,	// <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  3808604907U,	// <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1>
+  3799978808U,	// <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6>
+  2726237006U,	// <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1>
+  2224677522U,	// <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1>
+  2726237176U,	// <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0>
+  2285815462U,	// <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1>
+  3805951193U,	// <1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0>
+  3807941859U,	// <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1>
+  3799979366U,	// <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6>
+  3803297165U,	// <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0>
+  3799979540U,	// <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0>
+  3799979628U,	// <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7>
+  2731546240U,	// <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0>
+  2284494848U,	// <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0>
+  1683112594U,	// <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1>
+  1683112605U,	// <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2734200772U,	// <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0>
+  2757075629U,	// <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1>
+  2686425242U,	// <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2791948430U,	// <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  2736855304U,	// <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0>
+  1683112659U,	// <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1610694666U,	// <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1>
+  1616003174U,	// <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS
+  2283767958U,	// <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2>
+  3357507596U,	// <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3>
+  2689745234U,	// <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5>
+  3357507922U,	// <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5>
+  3294397647U,	// <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7>
+  3373433334U,	// <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7>
+  1616003730U,	// <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1>
+  1550221414U,	// <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,1,1>: Cost 1 vspltisw1 LHS
+  2287093910U,	// <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2>
+  2287092615U,	// <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3>
+  1550224694U,	// <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  2287092050U,	// <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5>
+  2689746127U,	// <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7>
+  2659800138U,	// <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1>
+  269271142U,	// <1,1,1,u>: Cost 1 vspltisw1 LHS
+  2222113516U,	// <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, <1,0,2,1>
+  2756854663U,	// <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3>
+  1148371862U,	// <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689746598U,	// <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1>
+  2618002742U,	// <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS
+  2299707730U,	// <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5>
+  2689746874U,	// <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7>
+  3361506511U,	// <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7>
+  1148371862U,	// <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689747094U,	// <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2>
+  2691074278U,	// <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1>
+  3356870806U,	// <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2>
+  2283126958U,	// <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3>
+  2689747458U,	// <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6>
+  3356868946U,	// <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5>
+  3811265144U,	// <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7>
+  3362841807U,	// <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7>
+  2689747742U,	// <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2>
+  2623987814U,	// <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS
+  2758181931U,	// <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5>
+  2223408022U,	// <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  3697731734U,	// <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2>
+  2283798784U,	// <1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4>
+  1616006454U,	// <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  3297379535U,	// <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7>
+  3373466102U,	// <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7>
+  1616006697U,	// <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2760762479U,	// <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1>
+  2284470282U,	// <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1>
+  2284472470U,	// <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2>
+  3358212270U,	// <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3>
+  2284470285U,	// <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4>
+  1210728786U,	// <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2737524834U,	// <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0>
+  3360867535U,	// <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7>
+  1210728786U,	// <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  3697746022U,	// <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS
+  2756854991U,	// <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7>
+  2737525242U,	// <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3>
+  3839149281U,	// <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7>
+  3697749302U,	// <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS
+  3356893522U,	// <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5>
+  2283151537U,	// <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6>
+  2791949566U,	// <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0>
+  2792613127U,	// <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0>
+  2737525754U,	// <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2>
+  2291786386U,	// <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1>
+  3365528292U,	// <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2>
+  3365528455U,	// <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3>
+  2737526118U,	// <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6>
+  3365527890U,	// <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5>
+  3365528377U,	// <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6>
+  2291786959U,	// <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7>
+  2737526402U,	// <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2>
+  1550221414U,	// <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,u,1>: Cost 1 vspltisw1 LHS
+  1148371862U,	// <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689750972U,	// <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1>
+  1550224694U,	// <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1616009370U,	// <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2689751248U,	// <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7>
+  2736863497U,	// <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1>
+  269271142U,	// <1,1,u,u>: Cost 1 vspltisw1 LHS
+  2702360576U,	// <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, <0,0,0,0>
+  1628618854U,	// <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2685771949U,	// <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2>
+  2283765862U,	// <1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  2702360914U,	// <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5>
+  3788046813U,	// <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0>
+  2688426481U,	// <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2>
+  2726249024U,	// <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1628619421U,	// <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2690417380U,	// <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2>
+  2702361396U,	// <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1>
+  2287093352U,	// <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2>
+  1213349990U,	// <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  3764159522U,	// <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5>
+  3295053672U,	// <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6>
+  2221311930U,	// <1,2,1,6>: Cost 3 vmrghw <1,1,1,1>, <2,6,3,7>
+  3799991593U,	// <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7>
+  1213349995U,	// <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS
+  2624045158U,	// <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS
+  2702362144U,	// <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2>
+  2283120232U,	// <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2>
+  1225965670U,	// <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2624048438U,	// <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS
+  3356860763U,	// <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5>
+  2222114746U,	// <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7>
+  2299708632U,	// <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7>
+  1225965675U,	// <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS
+  470597734U,	// <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544340276U,	// <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544341096U,	// <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544341916U,	// <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3>
+  470601014U,	// <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592119300U,	// <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592119802U,	// <1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592120314U,	// <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470603566U,	// <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708335471U,	// <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2>
+  3838043908U,	// <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5>
+  3357541992U,	// <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2>
+  2283798630U,	// <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2726251728U,	// <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4>
+  1628622134U,	// <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  3297077178U,	// <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7>
+  2726251976U,	// <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1628622377U,	// <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  2714308168U,	// <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2>
+  3297633827U,	// <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5>
+  2284471912U,	// <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2>
+  1210728550U,	// <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  3776106420U,	// <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6>
+  2726252548U,	// <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5>
+  2726252642U,	// <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0>
+  3799994538U,	// <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0>
+  1210728555U,	// <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720280865U,	// <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2>
+  2702365096U,	// <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2>
+  2726253050U,	// <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3>
+  2283151462U,	// <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  3697823030U,	// <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS
+  3298715497U,	// <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7>
+  2726253368U,	// <1,2,6,6>: Cost 3 vsldoi8 <7,0,1,2>, <6,6,6,6>
+  2724926296U,	// <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2>
+  2283151467U,	// <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652511738U,	// <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2>
+  3371500916U,	// <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1>
+  3365529192U,	// <1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2>
+  2291785830U,	// <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2726253926U,	// <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6>
+  3788051845U,	// <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1>
+  3794023894U,	// <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1>
+  2726254119U,	// <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1>
+  1657820802U,	// <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2>
+  470638699U,	// <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1544381236U,	// <1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544382056U,	// <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544382614U,	// <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  470641974U,	// <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1628625050U,	// <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  1592160762U,	// <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592161274U,	// <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470644526U,	// <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2769389708U,	// <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1>
+  2685780070U,	// <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2685780142U,	// <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3>
+  2686443775U,	// <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3>
+  2769684656U,	// <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1>
+  3357507940U,	// <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5>
+  3759522294U,	// <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7>
+  3357509562U,	// <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7>
+  2685780637U,	// <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2287092630U,	// <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0>
+  2221312230U,	// <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1>
+  2691752839U,	// <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3>
+  2287093362U,	// <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3>
+  2287092634U,	// <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4>
+  3360835107U,	// <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5>
+  3759523041U,	// <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7>
+  2287093690U,	// <1,3,1,7>: Cost 3 vmrglw <0,u,1,1>, <2,6,3,7>
+  2287092638U,	// <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u>
+  2222114966U,	// <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2>
+  2222115057U,	// <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3>
+  2630092320U,	// <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2>
+  2685781670U,	// <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1>
+  2222115330U,	// <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6>
+  3373449572U,	// <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5>
+  2222115448U,	// <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2299709370U,	// <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7>
+  2222115614U,	// <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2>
+  2771380607U,	// <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1>
+  3356874468U,	// <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1>
+  3759524168U,	// <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0>
+  2283792796U,	// <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3>
+  3356869530U,	// <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4>
+  3721760428U,	// <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3>
+  3296496248U,	// <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7>
+  3356870586U,	// <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7>
+  2771970503U,	// <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1>
+  2772044240U,	// <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1>
+  3362186135U,	// <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1>
+  3297151280U,	// <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3>
+  3357542002U,	// <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3>
+  3357540626U,	// <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4>
+  2685783350U,	// <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  3357546622U,	// <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6>
+  3357542330U,	// <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7>
+  2685783593U,	// <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2284471190U,	// <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0>
+  3358213015U,	// <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1>
+  2630116899U,	// <1,3,5,2>: Cost 3 vsldoi4 <2,1,3,5>, <2,1,3,5>
+  2284471922U,	// <1,3,5,3>: Cost 3 vmrglw <0,4,1,5>, <2,2,3,3>
+  2284471194U,	// <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4>
+  2284471843U,	// <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5>
+  3358218366U,	// <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6>
+  2284472250U,	// <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7>
+  2284471198U,	// <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u>
+  2224752790U,	// <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2>
+  3832736385U,	// <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7>
+  3703866916U,	// <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6>
+  3356894834U,	// <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3>
+  3356894106U,	// <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4>
+  3356894755U,	// <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5>
+  3356899130U,	// <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6>
+  2283153338U,	// <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2283153338U,	// <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2774035139U,	// <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1>
+  3703874767U,	// <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7>
+  3703875109U,	// <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7>
+  3365529202U,	// <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3>
+  3365528474U,	// <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4>
+  3789387159U,	// <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1>
+  3865692927U,	// <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7>
+  3363538874U,	// <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7>
+  2774625035U,	// <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1>
+  2284495766U,	// <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0>
+  2685785902U,	// <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2630141478U,	// <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u>
+  2283169880U,	// <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3>
+  2284495770U,	// <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4>
+  2685786266U,	// <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2222115448U,	// <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2284496826U,	// <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7>
+  2685786469U,	// <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2684461069U,	// <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4>
+  2686451814U,	// <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  3759530159U,	// <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4>
+  2686451968U,	// <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2684461394U,	// <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5>
+  1701989266U,	// <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1>
+  3776119286U,	// <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7>
+  2689106500U,	// <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4>
+  1702210477U,	// <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1>
+  2221312914U,	// <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1>
+  2691097399U,	// <1,4,1,1>: Cost 3 vsldoi8 <1,1,1,4>, <1,1,1,4>
+  3760194454U,	// <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0>
+  3766166489U,	// <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4>
+  2334870736U,	// <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4>
+  1147571510U,	// <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  3760194794U,	// <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7>
+  3867315188U,	// <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0>
+  1147571753U,	// <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2222115730U,	// <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1>
+  2222115812U,	// <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  3760195176U,	// <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2>
+  2702378662U,	// <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1>
+  2323598544U,	// <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4>
+  1148374326U,	// <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  3760195514U,	// <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7>
+  3373451932U,	// <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7>
+  1148374569U,	// <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2702379160U,	// <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4>
+  3760195840U,	// <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0>
+  3776121160U,	// <1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0>
+  3760195996U,	// <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3>
+  2686454274U,	// <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6>
+  3356870350U,	// <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5>
+  3800009392U,	// <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0>
+  3366824604U,	// <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7>
+  2707688224U,	// <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4>
+  2775731368U,	// <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0>
+  3830820018U,	// <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1>
+  3691980454U,	// <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1>
+  3357541282U,	// <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3>
+  2781039824U,	// <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4>
+  2686455094U,	// <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  3357541528U,	// <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6>
+  3810627020U,	// <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4>
+  2686455337U,	// <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  2624217190U,	// <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS
+  2284470309U,	// <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1>
+  2618246822U,	// <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1>
+  3358212297U,	// <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3>
+  2284470312U,	// <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4>
+  2284470637U,	// <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5>
+  1683115318U,	// <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3721851898U,	// <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2>
+  1683115336U,	// <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3794039075U,	// <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4>
+  3830820186U,	// <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7>
+  3800011258U,	// <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3>
+  3807973938U,	// <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5>
+  3298716880U,	// <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4>
+  2224680246U,	// <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  3800011576U,	// <1,4,6,6>: Cost 4 vsldoi8 <7,0,1,4>, <6,6,6,6>
+  2726269774U,	// <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1>
+  2224680489U,	// <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726269948U,	// <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4>
+  3383444141U,	// <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1>
+  3805983961U,	// <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0>
+  3807974667U,	// <1,4,7,3>: Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5>
+  2736887142U,	// <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6>
+  3365528403U,	// <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5>
+  3800012308U,	// <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0>
+  3800012396U,	// <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7>
+  2731579012U,	// <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4>
+  2624241766U,	// <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS
+  2686457646U,	// <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  2618271398U,	// <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1>
+  2734233544U,	// <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4>
+  2689775679U,	// <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6>
+  1152355638U,	// <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS
+  1683115561U,	// <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2736888076U,	// <1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4>
+  1683115579U,	// <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2687123456U,	// <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0>
+  1613381734U,	// <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759538352U,	// <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5>
+  3760865532U,	// <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0>
+  1613381970U,	// <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2687787427U,	// <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5>
+  2781777524U,	// <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1>
+  3733828717U,	// <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0>
+  1613382301U,	// <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2781040271U,	// <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1>
+  2687124276U,	// <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1>
+  2687124374U,	// <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0>
+  3760866297U,	// <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0>
+  2693096491U,	// <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5>
+  2687124591U,	// <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1>
+  2687124723U,	// <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7>
+  3360834803U,	// <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7>
+  2687124860U,	// <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0>
+  2323598792U,	// <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0>
+  2687125027U,	// <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5>
+  2687125096U,	// <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2>
+  2687125158U,	// <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1>
+  2642185188U,	// <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2>
+  2323598554U,	// <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5>
+  2687125434U,	// <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7>
+  3373450483U,	// <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7>
+  2687125563U,	// <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1>
+  2687125654U,	// <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2>
+  2312990234U,	// <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1>
+  3760867649U,	// <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2>
+  2687125916U,	// <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3>
+  2687126018U,	// <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6>
+  3386731738U,	// <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5>
+  3356871170U,	// <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6>
+  3808643779U,	// <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1>
+  2687126302U,	// <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2>
+  2642198630U,	// <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS
+  2687126498U,	// <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0>
+  3715941923U,	// <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5>
+  3709970701U,	// <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4>
+  2687126736U,	// <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, <4,4,4,4>
+  1613385014U,	// <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2283801090U,	// <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  3733861489U,	// <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4>
+  1613385257U,	// <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2624290918U,	// <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS
+  2624291676U,	// <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5>
+  3698034211U,	// <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5>
+  2284471211U,	// <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3>
+  2624294198U,	// <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS
+  2284471132U,	// <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5>
+  2284472834U,	// <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6>
+  2284471539U,	// <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7>
+  2284471216U,	// <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,u>
+  2785316900U,	// <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1>
+  2781040691U,	// <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7>
+  2734903802U,	// <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3>
+  3848736834U,	// <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4>
+  3298717620U,	// <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6>
+  3298717700U,	// <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5>
+  2734904120U,	// <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6>
+  2781040738U,	// <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0>
+  2781040747U,	// <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0>
+  2734904314U,	// <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2>
+  2315677210U,	// <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1>
+  3808646292U,	// <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3>
+  3808646371U,	// <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1>
+  2734904678U,	// <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6>
+  3389418714U,	// <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5>
+  3365528656U,	// <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6>
+  2734904940U,	// <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7>
+  2734904962U,	// <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2>
+  2687129299U,	// <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2>
+  1613387566U,	// <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2687129480U,	// <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3>
+  2687129532U,	// <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1>
+  1661163546U,	// <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5>
+  1613387930U,	// <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2687129808U,	// <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7>
+  2781040900U,	// <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0>
+  1613388133U,	// <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759546368U,	// <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0>
+  2685804646U,	// <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2685804721U,	// <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6>
+  3861270834U,	// <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1>
+  3759546706U,	// <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5>
+  2687795620U,	// <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6>
+  2688459253U,	// <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6>
+  2283769142U,	// <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  2685805213U,	// <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  3698073702U,	// <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS
+  3759547188U,	// <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1>
+  2221314554U,	// <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3>
+  3759547401U,	// <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7>
+  3698076982U,	// <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS
+  3767510141U,	// <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6>
+  2334872376U,	// <1,6,1,6>: Cost 3 vmrglw <u,u,1,1>, <6,6,6,6>
+  1213353270U,	// <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  1213353271U,	// <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS
+  3704053862U,	// <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS
+  3759547961U,	// <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0>
+  2222117370U,	// <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3>
+  3759548070U,	// <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1>
+  3704057142U,	// <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS
+  3373451057U,	// <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, <2,4,6,5>
+  2685806522U,	// <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7>
+  1225968950U,	// <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1225968951U,	// <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS
+  3759548566U,	// <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2>
+  3842912793U,	// <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, <6,3,1,7>
+  3759548774U,	// <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3>
+  3759548828U,	// <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3>
+  3759548930U,	// <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6>
+  3809315421U,	// <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7>
+  3386733368U,	// <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6>
+  2283130166U,	// <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS
+  2283130167U,	// <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS
+  3704070246U,	// <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS
+  3862229608U,	// <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5>
+  3704071741U,	// <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4>
+  3721988610U,	// <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6>
+  3704073526U,	// <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS
+  2685807926U,	// <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3865621141U,	// <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5>
+  2283801910U,	// <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  2685808169U,	// <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3710050406U,	// <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS
+  3710051571U,	// <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7>
+  3405989597U,	// <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2>
+  3358214502U,	// <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3>
+  3710053686U,	// <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS
+  3721998025U,	// <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5>
+  2332250936U,	// <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6>
+  1210731830U,	// <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210731831U,	// <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS
+  2791289597U,	// <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1>
+  3698115430U,	// <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, <1,1,6,6>
+  3698116538U,	// <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7>
+  3356894132U,	// <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3>
+  3698117942U,	// <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS
+  3722006218U,	// <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6>
+  2781041464U,	// <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6>
+  2283154742U,	// <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283154743U,	// <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS
+  1718211406U,	// <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1>
+  2792026967U,	// <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1>
+  2765411170U,	// <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3>
+  3854783336U,	// <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0>
+  2781041526U,	// <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5>
+  3365528664U,	// <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5>
+  2791953290U,	// <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7>
+  2291789110U,	// <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1718801302U,	// <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1>
+  1718875039U,	// <1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1>
+  2685810478U,	// <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2792764337U,	// <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1>
+  3759552444U,	// <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1>
+  2781041607U,	// <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5>
+  2685810842U,	// <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  2689792208U,	// <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7>
+  1210756406U,	// <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  1210756407U,	// <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS
+  2793280496U,	// <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1>
+  2694439014U,	// <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  3393343912U,	// <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2>
+  3397325306U,	// <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3>
+  2793575444U,	// <1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1>
+  3722030797U,	// <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0>
+  2688467446U,	// <1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7>
+  2689131079U,	// <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7>
+  2694439570U,	// <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1>
+  2654265354U,	// <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1>
+  2794017866U,	// <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1>
+  3768181639U,	// <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3>
+  2334872058U,	// <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3>
+  2654268726U,	// <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS
+  3792069797U,	// <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1>
+  2694440143U,	// <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7>
+  2334872386U,	// <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7>
+  2695767409U,	// <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7>
+  2654273638U,	// <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2222117973U,	// <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3>
+  2299711912U,	// <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2654275734U,	// <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2>
+  2654276918U,	// <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS
+  3385397675U,	// <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5>
+  2654278056U,	// <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2>
+  2323599627U,	// <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7>
+  2654279470U,	// <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2795271395U,	// <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1>
+  3768183059U,	// <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1>
+  3728025254U,	// <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1>
+  3768183196U,	// <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3>
+  3768183298U,	// <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6>
+  3792071255U,	// <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1>
+  3780127361U,	// <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7>
+  3847779617U,	// <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0>
+  2795861291U,	// <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1>
+  2795935028U,	// <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1>
+  3728032975U,	// <1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7>
+  3839153480U,	// <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3>
+  3397358074U,	// <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3>
+  3854783835U,	// <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4>
+  2694442294U,	// <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  3786100058U,	// <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7>
+  3722065254U,	// <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6>
+  2694442537U,	// <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654298214U,	// <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS
+  3854783893U,	// <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u>
+  3710126010U,	// <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7>
+  2332250618U,	// <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3>
+  2654301494U,	// <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS
+  2284474795U,	// <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5>
+  2718330931U,	// <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7>
+  2332250946U,	// <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7>
+  2719658197U,	// <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7>
+  2332921954U,	// <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0>
+  3768185254U,	// <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0>
+  3710134202U,	// <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7>
+  3710134561U,	// <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6>
+  3710135606U,	// <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS
+  3864884745U,	// <1,7,6,5>: Cost 4 vsldoi12 <6,5,7,1>, <7,6,5,7>
+  3854784017U,	// <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6>
+  2791953940U,	// <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0>
+  2792617501U,	// <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0>
+  2797925927U,	// <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1>
+  3365528426U,	// <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1>
+  3728058022U,	// <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1>
+  3365528509U,	// <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3>
+  3854784079U,	// <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5>
+  3722088148U,	// <1,7,7,5>: Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7>
+  3728060845U,	// <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7>
+  2781042284U,	// <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7>
+  2798515823U,	// <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1>
+  2654322705U,	// <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u>
+  2694444846U,	// <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  2299711912U,	// <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2323649018U,	// <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3>
+  2654326070U,	// <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS
+  2694445210U,	// <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654327214U,	// <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u>
+  2323649346U,	// <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7>
+  2694445413U,	// <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  1610752017U,	// <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u>
+  1613406310U,	// <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  2685821107U,	// <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u>
+  2283765916U,	// <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  1613406549U,	// <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u>
+  1725880054U,	// <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1>
+  2688475639U,	// <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u>
+  2283769160U,	// <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  1613406877U,	// <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  1550221414U,	// <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,u,1,1>: Cost 1 vspltisw1 LHS
+  1683117870U,	// <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1213350044U,	// <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  1550224694U,	// <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1147574426U,	// <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2687149326U,	// <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7>
+  1213353288U,	// <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  269271142U,	// <1,u,1,u>: Cost 1 vspltisw1 LHS
+  2222118611U,	// <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2>
+  1148376878U,	// <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  1148371862U,	// <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  1225965724U,	// <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2222118975U,	// <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6>
+  1148377242U,	// <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2687150010U,	// <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7>
+  1225968968U,	// <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1148377445U,	// <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  471040156U,	// <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544782644U,	// <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544783464U,	// <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544784022U,	// <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471043382U,	// <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592561668U,	// <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592562170U,	// <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592562682U,	// <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  471045934U,	// <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708384629U,	// <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, <4,0,1,u>
+  2687151101U,	// <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0>
+  2223408022U,	// <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  2283798684U,	// <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2642422785U,	// <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4>
+  1613409590U,	// <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2283801090U,	// <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  2283801928U,	// <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  1613409833U,	// <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2284471235U,	// <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0>
+  2284472046U,	// <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1>
+  2284472533U,	// <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2>
+  1210728604U,	// <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2284471239U,	// <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4>
+  1210728786U,	// <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  1683118234U,	// <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210731848U,	// <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210728609U,	// <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720330023U,	// <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u>
+  2757376190U,	// <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7>
+  2726302202U,	// <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3>
+  2283151516U,	// <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  2224972114U,	// <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  2224683162U,	// <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726302520U,	// <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6>
+  2283154760U,	// <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283151521U,	// <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652560896U,	// <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u>
+  2333590225U,	// <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1>
+  2765412628U,	// <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3>
+  2291785884U,	// <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2781042984U,	// <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5>
+  3365527953U,	// <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5>
+  2791954748U,	// <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7>
+  2291789128U,	// <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1657869960U,	// <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u>
+  471081121U,	// <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  269271142U,	// <1,u,u,1>: Cost 1 vspltisw1 LHS
+  1544824424U,	// <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544824982U,	// <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471084342U,	// <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1613412506U,	// <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  1683118477U,	// <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210756424U,	// <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  471086894U,	// <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2226757632U,	// <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0>
+  2226757734U,	// <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS
+  3826622483U,	// <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1>
+  3843211292U,	// <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1>
+  3300499794U,	// <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, <0,4,1,5>
+  3356256724U,	// <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5>
+  3825664056U,	// <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2>
+  3762889289U,	// <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0>
+  2226758301U,	// <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS
+  2227429386U,	// <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1>
+  2227429478U,	// <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS
+  1691156582U,	// <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2666358997U,	// <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2>
+  2227462482U,	// <2,0,1,4>: Cost 3 vmrghw <2,1,3,5>, <0,4,1,5>
+  3722186464U,	// <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1>
+  3867099278U,	// <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7>
+  3366881912U,	// <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7>
+  1691156636U,	// <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2228027392U,	// <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0>
+  1154285670U,	// <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  2228027565U,	// <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2>
+  3301769468U,	// <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0>
+  2228027730U,	// <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5>
+  3301769635U,	// <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5>
+  3780806586U,	// <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7>
+  3368880760U,	// <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7>
+  1154286237U,	// <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS
+  1213440000U,	// <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1213441702U,	// <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2228535470U,	// <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3>
+  2636515632U,	// <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3>
+  2287182962U,	// <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4>
+  2660405346U,	// <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0>
+  2228535798U,	// <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660406420U,	// <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3>
+  1213441709U,	// <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3368894464U,	// <2,0,4,0>: Cost 4 vmrglw <2,2,2,4>, <0,0,0,0>
+  2764898642U,	// <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5>
+  3826622811U,	// <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5>
+  3843211620U,	// <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5>
+  3838640493U,	// <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5>
+  2732944694U,	// <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS
+  3797396857U,	// <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2>
+  3867099528U,	// <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5>
+  2764898705U,	// <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5>
+  3364257792U,	// <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0>
+  2230124646U,	// <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  3304235184U,	// <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5>
+  3364260144U,	// <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3>
+  3303817554U,	// <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5>
+  3364260146U,	// <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5>
+  3867099602U,	// <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7>
+  3364260472U,	// <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7>
+  2230125213U,	// <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2230796288U,	// <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0>
+  1157054566U,	// <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  2230796465U,	// <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6>
+  3304538364U,	// <2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0>
+  2230796626U,	// <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5>
+  3797398205U,	// <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0>
+  3304538614U,	// <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7>
+  3798725471U,	// <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0>
+  1157055133U,	// <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  3371573248U,	// <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0>
+  2231189606U,	// <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS
+  3801380003U,	// <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0>
+  3802043636U,	// <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0>
+  3806688614U,	// <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6>
+  3356317308U,	// <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5>
+  3804034535U,	// <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, <7,6,2,0>
+  3806688876U,	// <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7>
+  2231190173U,	// <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS
+  1208836096U,	// <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1208837798U,	// <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  1691157149U,	// <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2636556597U,	// <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u>
+  2282579625U,	// <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2660446306U,	// <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0>
+  2228535798U,	// <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660447385U,	// <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u>
+  1208837805U,	// <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3692388523U,	// <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0>
+  2757526244U,	// <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2>
+  2330290974U,	// <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2>
+  3843212020U,	// <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0>
+  3692391734U,	// <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS
+  3300533362U,	// <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4>
+  3794084337U,	// <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2>
+  3374170614U,	// <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7>
+  2758042403U,	// <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2>
+  2690482924U,	// <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1>
+  2764899124U,	// <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1>
+  2695791510U,	// <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0>
+  3362235271U,	// <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3>
+  3692399926U,	// <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS
+  3832226649U,	// <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2>
+  3301205235U,	// <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7>
+  3768870179U,	// <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1>
+  2695791988U,	// <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1>
+  2618663085U,	// <2,1,2,0>: Cost 3 vsldoi4 <0,2,1,2>, <0,2,1,2>
+  2228028212U,	// <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1>
+  2618664552U,	// <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2>
+  2759000984U,	// <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2>
+  2618666294U,	// <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS
+  2295136594U,	// <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5>
+  3769534376U,	// <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7>
+  2793358266U,	// <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0>
+  2618668846U,	// <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS
+  2282536969U,	// <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208795146U,	// <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1213442198U,	// <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2287181998U,	// <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2618674486U,	// <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS
+  1208795474U,	// <2,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2287182001U,	// <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287183055U,	// <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208795153U,	// <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  3692421295U,	// <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4>
+  3838641195U,	// <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5>
+  2330323742U,	// <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2>
+  3692423318U,	// <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2>
+  3692424502U,	// <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS
+  2695793974U,	// <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3799395705U,	// <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2>
+  3368895695U,	// <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7>
+  2695794217U,	// <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3692429488U,	// <2,1,5,0>: Cost 4 vsldoi4 <0,2,1,5>, <0,2,1,5>
+  3364257802U,	// <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1>
+  3692431253U,	// <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6>
+  3692431874U,	// <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6>
+  3692432694U,	// <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS
+  3364258130U,	// <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, <0,4,1,5>
+  3303875827U,	// <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7>
+  3867100333U,	// <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0>
+  3692435246U,	// <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS
+  2618695857U,	// <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6>
+  2230797108U,	// <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1>
+  2618697658U,	// <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7>
+  3692439702U,	// <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2>
+  2618699062U,	// <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS
+  3364929874U,	// <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5>
+  3692442424U,	// <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6>
+  3798733664U,	// <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1>
+  2618701614U,	// <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS
+  3799397370U,	// <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2>
+  3371573258U,	// <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1>
+  2330351234U,	// <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  3799397658U,	// <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2>
+  3799397734U,	// <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6>
+  3371573586U,	// <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5>
+  3799397870U,	// <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7>
+  3799397956U,	// <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3>
+  2330351234U,	// <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  2282577929U,	// <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208836106U,	// <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1208838294U,	// <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282578094U,	// <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282577933U,	// <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1208836434U,	// <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282578097U,	// <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287224015U,	// <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208836113U,	// <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  2226759117U,	// <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0>
+  1624047718U,	// <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  2697789613U,	// <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2>
+  2226767526U,	// <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1>
+  2697789778U,	// <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5>
+  3300657000U,	// <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6>
+  2226988986U,	// <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7>
+  3734271139U,	// <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0>
+  1624048285U,	// <2,2,0,u>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  3831268868U,	// <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1>
+  2293138804U,	// <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1>
+  2697790358U,	// <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0>
+  2293137510U,	// <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  3771532331U,	// <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5>
+  3767551106U,	// <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2>
+  3301173178U,	// <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7>
+  3372853169U,	// <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7>
+  2293137515U,	// <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS
+  1556938854U,	// <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2295137733U,	// <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1>
+  336380006U,	// <2,2,2,2>: Cost 1 vspltisw2 LHS
+  1221394534U,	// <2,2,2,3>: Cost 2 vmrglw <2,2,2,2>, LHS
+  1556942134U,	// <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2228029370U,	// <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7>
+  2660545701U,	// <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2>
+  336380006U,	// <2,2,2,u>: Cost 1 vspltisw2 LHS
+  2697791638U,	// <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2>
+  2765489840U,	// <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2>
+  1213441640U,	// <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135053414U,	// <2,2,3,3>: Cost 1 vmrglw LHS, LHS
+  2697792002U,	// <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6>
+  2330313780U,	// <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5>
+  2287183549U,	// <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6>
+  2660553894U,	// <2,2,3,7>: Cost 3 vsldoi4 <7,2,2,3>, <7,2,2,3>
+  135053419U,	// <2,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2630697062U,	// <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS
+  3771534282U,	// <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3>
+  2764900109U,	// <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5>
+  2295152742U,	// <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS
+  2295154282U,	// <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4>
+  1624050998U,	// <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  2229675962U,	// <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7>
+  3368896433U,	// <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7>
+  1624051241U,	// <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  3771534920U,	// <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2>
+  3364258540U,	// <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1>
+  2296489576U,	// <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2>
+  2290516070U,	// <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  3771535284U,	// <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6>
+  2290517044U,	// <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5>
+  2697793634U,	// <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0>
+  3370231729U,	// <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7>
+  2290516075U,	// <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2230797801U,	// <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1>
+  3304539679U,	// <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1>
+  2764900273U,	// <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7>
+  2764900282U,	// <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7>
+  2230798129U,	// <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5>
+  3304540008U,	// <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6>
+  1157056442U,	// <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725000033U,	// <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2>
+  1157056442U,	// <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2793359338U,	// <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1>
+  3371574725U,	// <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1>
+  2297833064U,	// <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2>
+  2297831526U,	// <2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  2697794918U,	// <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6>
+  3371575053U,	// <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5>
+  3304933297U,	// <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7>
+  2297833393U,	// <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7>
+  2297831531U,	// <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1556938854U,	// <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1624053550U,	// <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  336380006U,	// <2,2,u,2>: Cost 1 vspltisw2 LHS
+  135094374U,	// <2,2,u,3>: Cost 1 vmrglw LHS, LHS
+  1556942134U,	// <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1624053914U,	// <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  1157056442U,	// <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2660594859U,	// <2,2,u,7>: Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u>
+  135094379U,	// <2,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611448320U,	// <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537706598U,	// <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2689835181U,	// <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2689835260U,	// <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611448658U,	// <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2732966354U,	// <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2732966390U,	// <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660603052U,	// <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0>
+  537707165U,	// <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689835748U,	// <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611449140U,	// <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611449238U,	// <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  3763577805U,	// <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1>
+  2689836112U,	// <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689836143U,	// <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689836239U,	// <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  3366881210U,	// <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7>
+  1616094588U,	// <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2689836493U,	// <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0>
+  2685191711U,	// <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611449960U,	// <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611450022U,	// <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2689836822U,	// <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5>
+  2689836904U,	// <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6>
+  1611450298U,	// <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2295138234U,	// <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7>
+  1611450456U,	// <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3>
+  1213440918U,	// <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282538527U,	// <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1>
+  1557022322U,	// <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3>
+  1208796786U,	// <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1213440922U,	// <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282538531U,	// <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2287188094U,	// <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1213441978U,	// <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  1208796791U,	// <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u>
+  1551056998U,	// <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS
+  1551057818U,	// <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4>
+  2624800360U,	// <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2>
+  2624800918U,	// <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2>
+  1551060278U,	// <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS
+  537709878U,	// <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2732969337U,	// <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2660635824U,	// <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4>
+  537710121U,	// <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689838664U,	// <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2732969615U,	// <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2732969707U,	// <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3>
+  3763580721U,	// <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1>
+  2689839028U,	// <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659228164U,	// <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659228258U,	// <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  3364259770U,	// <2,3,5,7>: Cost 4 vmrglw <1,4,2,5>, <2,6,3,7>
+  1659228420U,	// <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2230798486U,	// <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2>
+  2732970407U,	// <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1659228666U,	// <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2230798748U,	// <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3>
+  2230798850U,	// <2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6>
+  2732970731U,	// <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659228984U,	// <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659229006U,	// <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1659229087U,	// <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1>
+  1659229178U,	// <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2726999125U,	// <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3>
+  2727662758U,	// <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3>
+  2732971235U,	// <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1>
+  1659229542U,	// <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2732971446U,	// <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2732971484U,	// <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7>
+  1659229804U,	// <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659229826U,	// <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1208837014U,	// <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  537712430U,	// <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1616099205U,	// <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0>
+  1208837746U,	// <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1208837018U,	// <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  537712794U,	// <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1616099536U,	// <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1208838074U,	// <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  537712997U,	// <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  3771547648U,	// <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0>
+  2697805926U,	// <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS
+  3770884269U,	// <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2>
+  3806716164U,	// <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u>
+  3771547986U,	// <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, <0,4,1,5>
+  2226761014U,	// <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3853462427U,	// <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1>
+  3867102116U,	// <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1>
+  2226761257U,	// <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3849186231U,	// <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2>
+  3301207010U,	// <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0>
+  3766240150U,	// <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0>
+  3766240226U,	// <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4>
+  3301207248U,	// <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4>
+  2227432758U,	// <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS
+  3758941400U,	// <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7>
+  3768894758U,	// <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4>
+  2227433001U,	// <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS
+  2228030354U,	// <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1>
+  3770885657U,	// <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4>
+  2697807466U,	// <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4>
+  3368880468U,	// <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3>
+  2228030672U,	// <2,4,2,4>: Cost 3 vmrghw <2,2,2,2>, <4,4,4,4>
+  1154288950U,	// <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  3771549617U,	// <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7>
+  3368880796U,	// <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7>
+  1154289193U,	// <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS
+  2636808294U,	// <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS
+  2287181861U,	// <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1>
+  2228866102U,	// <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3>
+  2636810580U,	// <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3>
+  1256574160U,	// <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1213441742U,	// <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2228866430U,	// <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7>
+  2660701368U,	// <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3>
+  1213441745U,	// <2,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3704586342U,	// <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, LHS
+  3782831051U,	// <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4>
+  3704587900U,	// <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4>
+  3368896123U,	// <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3>
+  2793360592U,	// <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4>
+  2697809206U,	// <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  3303198078U,	// <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7>
+  3867102444U,	// <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5>
+  2697809449U,	// <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  2630852710U,	// <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS
+  2624881572U,	// <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5>
+  2630854269U,	// <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5>
+  2666686677U,	// <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2>
+  2630855990U,	// <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS
+  2230127926U,	// <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS
+  1691159862U,	// <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  3867102520U,	// <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0>
+  1691159880U,	// <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230799250U,	// <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1>
+  3304541130U,	// <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3>
+  2230799417U,	// <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6>
+  3304541323U,	// <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7>
+  2230799568U,	// <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4>
+  1157057846U,	// <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3304541566U,	// <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7>
+  3798758243U,	// <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4>
+  1157058089U,	// <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3806721018U,	// <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2>
+  3853831590U,	// <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2>
+  3801412775U,	// <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4>
+  3802076408U,	// <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4>
+  3401436368U,	// <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4>
+  2793360840U,	// <2,4,7,5>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,5,0>
+  3804067307U,	// <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4>
+  3867102682U,	// <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0>
+  2793360867U,	// <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0>
+  2630877286U,	// <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS
+  2282580144U,	// <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2630878848U,	// <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u>
+  2636851545U,	// <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u>
+  1256615120U,	// <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1208837838U,	// <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  1691160105U,	// <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2660742333U,	// <2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u>
+  1208837841U,	// <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3766910976U,	// <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0>
+  2693169254U,	// <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3760939181U,	// <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2>
+  3843214936U,	// <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0>
+  3760939355U,	// <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5>
+  3867102827U,	// <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1>
+  3867102836U,	// <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1>
+  3867102844U,	// <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0>
+  2693169821U,	// <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3766911724U,	// <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1>
+  3766911796U,	// <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, <1,1,1,1>
+  2693170070U,	// <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0>
+  3384798262U,	// <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3>
+  2693170228U,	// <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5>
+  3301208068U,	// <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5>
+  3366879607U,	// <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6>
+  3867102925U,	// <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0>
+  2695824760U,	// <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5>
+  2642845798U,	// <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS
+  2295139218U,	// <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1>
+  2699142760U,	// <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2>
+  3766912678U,	// <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1>
+  2699142925U,	// <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5>
+  2228031492U,	// <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5>
+  2295138818U,	// <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6>
+  3368879347U,	// <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7>
+  2295138820U,	// <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u>
+  2287184866U,	// <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256573842U,	// <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642855630U,	// <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5>
+  2287182763U,	// <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287184870U,	// <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256574170U,	// <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1213442562U,	// <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287183091U,	// <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1213442564U,	// <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3716604006U,	// <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS
+  3716604822U,	// <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0>
+  3766914099U,	// <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0>
+  3368895403U,	// <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3>
+  3716607031U,	// <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4>
+  2693172534U,	// <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3363588610U,	// <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6>
+  3368895731U,	// <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7>
+  2693172777U,	// <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3704668262U,	// <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS
+  3704669078U,	// <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0>
+  3704669830U,	// <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5>
+  3364259460U,	// <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3>
+  3704671542U,	// <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS
+  2793361412U,	// <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  3364258167U,	// <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6>
+  3867103249U,	// <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0>
+  2793361412U,	// <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  2642878566U,	// <2,5,6,0>: Cost 3 vsldoi4 <4,2,5,6>, LHS
+  3386166810U,	// <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1>
+  2723033594U,	// <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3>
+  3848523842U,	// <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4>
+  2723033713U,	// <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5>
+  2230800388U,	// <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5>
+  2230800482U,	// <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0>
+  2785841252U,	// <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2>
+  2785914989U,	// <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2>
+  3796775930U,	// <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2>
+  3800757335U,	// <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5>
+  3853463689U,	// <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3>
+  3796776218U,	// <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2>
+  3796776294U,	// <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6>
+  3803411867U,	// <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5>
+  3371575081U,	// <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6>
+  3796776516U,	// <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3>
+  3371575083U,	// <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u>
+  2287225826U,	// <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256614802U,	// <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642896590U,	// <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5>
+  2287223723U,	// <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287225830U,	// <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256615130U,	// <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1208838658U,	// <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287224051U,	// <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1208838660U,	// <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3772227584U,	// <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0>
+  2698485862U,	// <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3759620282U,	// <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6>
+  3710675299U,	// <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0>
+  3767583058U,	// <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5>
+  3378153265U,	// <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5>
+  3865186637U,	// <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1>
+  2330291510U,	// <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS
+  2698486429U,	// <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3734569062U,	// <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS
+  3764929346U,	// <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6>
+  3772228502U,	// <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0>
+  3734571158U,	// <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2>
+  3734572342U,	// <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS
+  3767583878U,	// <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6>
+  3768247511U,	// <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6>
+  2293140790U,	// <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  2293140791U,	// <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS
+  3704717414U,	// <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS
+  3395424589U,	// <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1>
+  2228031993U,	// <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2>
+  2698487485U,	// <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6>
+  3704720694U,	// <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS
+  3773556575U,	// <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6>
+  2698487738U,	// <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7>
+  1221397814U,	// <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  1221397815U,	// <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS
+  2636955750U,	// <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS
+  2330314217U,	// <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2636957626U,	// <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7>
+  2287184230U,	// <2,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636959030U,	// <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS
+  2648903448U,	// <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3>
+  1256575800U,	// <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135056694U,	// <2,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135056695U,	// <2,6,3,u>: Cost 1 vmrglw LHS, RHS
+  3710705766U,	// <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS
+  3698762677U,	// <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4>
+  3710707389U,	// <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6>
+  3710708071U,	// <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4>
+  3710709046U,	// <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS
+  2698489142U,	// <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  3796782457U,	// <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2>
+  2295156022U,	// <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  2295156023U,	// <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS
+  3303870753U,	// <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2>
+  3788820134U,	// <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6>
+  3779530520U,	// <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3>
+  3303871026U,	// <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5>
+  3303871117U,	// <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6>
+  3791474666U,	// <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6>
+  3792138299U,	// <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6>
+  2290519350U,	// <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2290519351U,	// <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2631008358U,	// <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS
+  3372893673U,	// <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1>
+  2791445264U,	// <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2>
+  2230800968U,	// <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0>
+  2631011638U,	// <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS
+  3372894001U,	// <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5>
+  2793362232U,	// <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6>
+  2295835958U,	// <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2295835959U,	// <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2793362254U,	// <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1>
+  2792035160U,	// <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2>
+  2792108897U,	// <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2>
+  2769474408U,	// <2,6,7,3>: Cost 3 vsldoi12 <3,0,1,2>, <6,7,3,0>
+  2793362294U,	// <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5>
+  3371575089U,	// <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5>
+  2792403845U,	// <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2>
+  2297834806U,	// <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2297834807U,	// <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2636996710U,	// <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS
+  2698491694U,	// <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  2636998631U,	// <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7>
+  2282580326U,	// <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636999990U,	// <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS
+  2698492058U,	// <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  1256616760U,	// <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135097654U,	// <2,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135097655U,	// <2,6,u,u>: Cost 1 vmrglw LHS, RHS
+  2666864742U,	// <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS
+  1719620602U,	// <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2>
+  3768254637U,	// <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2>
+  3393417722U,	// <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3>
+  2666868022U,	// <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS
+  3867104290U,	// <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6>
+  3728667127U,	// <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0>
+  2666869817U,	// <2,7,0,7>: Cost 3 vsldoi4 <u,2,7,0>, <7,0,u,2>
+  1720136761U,	// <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2>
+  3728670822U,	// <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS
+  3774227252U,	// <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1>
+  3774227350U,	// <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0>
+  2323001850U,	// <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  3728674102U,	// <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS
+  3774227567U,	// <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1>
+  2694513880U,	// <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7>
+  3396744002U,	// <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7>
+  2323001850U,	// <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  2654937190U,	// <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS
+  3728679732U,	// <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1>
+  2700486248U,	// <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2>
+  2321682938U,	// <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3>
+  2654940470U,	// <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS
+  3859584196U,	// <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6>
+  2700486577U,	// <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7>
+  2228033132U,	// <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7>
+  2701813843U,	// <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7>
+  1581203558U,	// <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  2654946100U,	// <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1>
+  2637031354U,	// <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7>
+  1256575482U,	// <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581206838U,	// <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS
+  2654949380U,	// <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5>
+  1581208058U,	// <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3>
+  1256575810U,	// <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581209390U,	// <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  3728695398U,	// <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS
+  3869758782U,	// <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2>
+  3728696936U,	// <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2>
+  3393450490U,	// <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3>
+  3728698678U,	// <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS
+  2700487990U,	// <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3728699899U,	// <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4>
+  3867104626U,	// <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0>
+  2700488233U,	// <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3855160709U,	// <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1>
+  3728704406U,	// <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0>
+  3370233956U,	// <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2>
+  2320380410U,	// <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  3728706870U,	// <2,7,5,4>: Cost 4 vsldoi4 <6,2,7,5>, RHS
+  3867104694U,	// <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5>
+  3792146492U,	// <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7>
+  3394122562U,	// <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7>
+  2320380410U,	// <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  2230801402U,	// <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2>
+  3768258984U,	// <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2>
+  2730349050U,	// <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  3372894575U,	// <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3>
+  2230801766U,	// <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6>
+  3304543670U,	// <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5>
+  3728716285U,	// <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6>
+  2230802028U,	// <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7>
+  2730349050U,	// <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  2793362983U,	// <2,7,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,0,1>
+  3728721112U,	// <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7>
+  3371574933U,	// <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2>
+  2327695866U,	// <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3>
+  3728723254U,	// <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS
+  3371574855U,	// <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5>
+  2730350062U,	// <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7>
+  2793363052U,	// <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7>
+  2798671471U,	// <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1>
+  1581244518U,	// <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1724929666U,	// <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2>
+  2637072314U,	// <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7>
+  1256616442U,	// <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581247798U,	// <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS
+  2700490906U,	// <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  1581249023U,	// <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u>
+  1256616770U,	// <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581250350U,	// <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1611489280U,	// <2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537747563U,	// <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685231277U,	// <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685231356U,	// <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611489618U,	// <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2226763930U,	// <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  2733007350U,	// <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660971737U,	// <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0>
+  537748125U,	// <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689876708U,	// <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611490100U,	// <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611490198U,	// <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  2293137564U,	// <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  2689877072U,	// <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689877103U,	// <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689877199U,	// <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2293140808U,	// <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  1616135548U,	// <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  1556938854U,	// <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1154291502U,	// <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  336380006U,	// <2,u,2,2>: Cost 1 vspltisw2 LHS
+  1611490982U,	// <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  1556942134U,	// <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1154291866U,	// <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  1611491258U,	// <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1221397832U,	// <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  336380006U,	// <2,u,2,u>: Cost 1 vspltisw2 LHS
+  1611491478U,	// <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2>
+  1213440073U,	// <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1>
+  1213442261U,	// <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135053468U,	// <2,u,3,3>: Cost 1 vmrglw LHS, LHS
+  1611491842U,	// <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6>
+  1213440401U,	// <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5>
+  1213442589U,	// <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135056712U,	// <2,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135053473U,	// <2,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1551425638U,	// <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS
+  1551426503U,	// <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4>
+  2625169000U,	// <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2>
+  2625169558U,	// <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2>
+  1551428918U,	// <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS
+  537750838U,	// <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2733010297U,	// <2,u,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2295156040U,	// <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  537751081U,	// <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689879624U,	// <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2230130478U,	// <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2631149217U,	// <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5>
+  2290516124U,	// <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2689879988U,	// <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659269124U,	// <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1691162778U,	// <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2290519368U,	// <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  1691162796U,	// <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230802131U,	// <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2>
+  1157060398U,	// <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659269626U,	// <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2764904656U,	// <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7>
+  2230802495U,	// <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6>
+  1157060762U,	// <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  1659269944U,	// <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659269966U,	// <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1157060965U,	// <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659270138U,	// <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2727040090U,	// <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u>
+  2727703723U,	// <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u>
+  2297831580U,	// <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1659270502U,	// <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2733012406U,	// <2,u,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2730358255U,	// <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u>
+  1659270764U,	// <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659270786U,	// <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1213481923U,	// <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0>
+  537753390U,	// <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  336380006U,	// <2,u,u,2>: Cost 1 vspltisw2 LHS
+  135094428U,	// <2,u,u,3>: Cost 1 vmrglw LHS, LHS
+  1213481927U,	// <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4>
+  537753754U,	// <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1208838685U,	// <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135097672U,	// <2,u,u,7>: Cost 1 vmrglw LHS, RHS
+  135094433U,	// <2,u,u,u>: Cost 1 vmrglw LHS, LHS
+  1678557184U,	// <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1678557194U,	// <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2631181989U,	// <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0>
+  2289223984U,	// <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3>
+  2756943909U,	// <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1>
+  3362965729U,	// <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5>
+  3362966054U,	// <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6>
+  2289224312U,	// <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7>
+  1683202121U,	// <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1>
+  1557446758U,	// <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS
+  2752741467U,	// <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1>
+  604815462U,	// <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2631190676U,	// <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0>
+  1557450038U,	// <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS
+  2667024388U,	// <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5>
+  2800074894U,	// <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7>
+  2661053667U,	// <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1>
+  604815516U,	// <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696521165U,	// <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0>
+  2752741549U,	// <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2>
+  2691876456U,	// <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2>
+  2691876518U,	// <3,0,2,3>: Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1>
+  3830685895U,	// <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1>
+  3765618536U,	// <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6>
+  2691876794U,	// <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7>
+  2701166596U,	// <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0>
+  2756944108U,	// <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2>
+  2691877014U,	// <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2>
+  1161003110U,	// <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2691877168U,	// <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3>
+  2691877246U,	// <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0>
+  2691877378U,	// <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6>
+  3765619238U,	// <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6>
+  2691877496U,	// <3,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  3368962680U,	// <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7>
+  1161003677U,	// <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2289254400U,	// <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0>
+  1678557522U,	// <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2631214761U,	// <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4>
+  2235580672U,	// <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  2756944237U,	// <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5>
+  1618136374U,	// <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  3309322742U,	// <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7>
+  3362998904U,	// <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7>
+  1683202449U,	// <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  3765620296U,	// <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2>
+  2752299427U,	// <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5>
+  3789508346U,	// <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0>
+  3403486842U,	// <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3>
+  3765620660U,	// <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6>
+  2733682692U,	// <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5>
+  2800075218U,	// <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7>
+  3873817044U,	// <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0>
+  2800075234U,	// <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5>
+  2752299501U,	// <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7>
+  2236547174U,	// <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2733683194U,	// <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3>
+  3844473352U,	// <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7>
+  3310289234U,	// <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5>
+  3873817114U,	// <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7>
+  2733683512U,	// <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6>
+  2725057384U,	// <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0>
+  2236547741U,	// <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2297905152U,	// <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0>
+  2297906854U,	// <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1>
+  2727711916U,	// <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0>
+  3371649328U,	// <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3>
+  2733684070U,	// <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6>
+  3734843490U,	// <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0>
+  3798799895U,	// <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3>
+  2733684332U,	// <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7>
+  2297906861U,	// <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u>
+  1557504102U,	// <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS
+  1678557842U,	// <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1>
+  604816029U,	// <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2691880892U,	// <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1>
+  1557507382U,	// <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS
+  1618139290U,	// <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  2691881168U,	// <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7>
+  2661111018U,	// <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u>
+  604816083U,	// <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2619310332U,	// <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0>
+  2756944612U,	// <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2>
+  2289221724U,	// <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2>
+  2619312278U,	// <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2>
+  2619313462U,	// <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS
+  2289221970U,	// <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5>
+  2232599768U,	// <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7>
+  3362964687U,	// <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7>
+  2619316014U,	// <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS
+  2756944683U,	// <3,1,1,0>: Cost 3 vsldoi12 LHS, <1,1,0,1>
+  1678558004U,	// <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2691883927U,	// <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1>
+  3826631496U,	// <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3>
+  2756944723U,	// <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5>
+  2756944732U,	// <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  3830686561U,	// <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1>
+  3734869228U,	// <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1>
+  1678558004U,	// <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2696529358U,	// <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1>
+  2756944775U,	// <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  2294548630U,	// <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2>
+  1678558102U,	// <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0>
+  2631273782U,	// <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS
+  2756944811U,	// <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  3830686644U,	// <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3>
+  2800075706U,	// <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0>
+  1679000515U,	// <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0>
+  2619334911U,	// <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3>
+  2295218186U,	// <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1>
+  2293229718U,	// <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2>
+  2619337116U,	// <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3>
+  2619338038U,	// <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS
+  2295218514U,	// <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5>
+  3830686729U,	// <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7>
+  3368961231U,	// <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7>
+  2619340590U,	// <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS
+  2619343104U,	// <3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4>
+  2289254410U,	// <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1>
+  2289256598U,	// <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2>
+  2619345410U,	// <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6>
+  2619346230U,	// <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS
+  2756944976U,	// <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6>
+  3362996401U,	// <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6>
+  3362997455U,	// <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7>
+  2619348782U,	// <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS
+  2756945007U,	// <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1>
+  3830686840U,	// <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1>
+  3358361750U,	// <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2>
+  3830686857U,	// <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0>
+  2756945047U,	// <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5>
+  2294571346U,	// <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5>
+  3806105698U,	// <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0>
+  3873817774U,	// <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1>
+  2756945079U,	// <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1>
+  3830686912U,	// <3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1>
+  2756945103U,	// <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2236547990U,	// <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0>
+  3826631905U,	// <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7>
+  3830686952U,	// <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5>
+  2756945139U,	// <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  3830686972U,	// <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7>
+  2800076030U,	// <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0>
+  2756945166U,	// <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7>
+  3699081318U,	// <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS
+  2297905162U,	// <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1>
+  2297907350U,	// <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2>
+  3365675182U,	// <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3>
+  3699084598U,	// <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS
+  2297905490U,	// <3,1,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,1,5>
+  2297905329U,	// <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  3368330447U,	// <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7>
+  2297905169U,	// <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u>
+  2619375876U,	// <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u>
+  1678558004U,	// <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2289289366U,	// <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2>
+  1679000956U,	// <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0>
+  2619378998U,	// <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS
+  2756945297U,	// <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3>
+  2297905329U,	// <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  2800076192U,	// <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0>
+  1683203497U,	// <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0>
+  3362964203U,	// <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0>
+  2289222380U,	// <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1>
+  2289222462U,	// <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2>
+  1215479910U,	// <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3362964207U,	// <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4>
+  2289222708U,	// <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2232600506U,	// <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7>
+  3396142296U,	// <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7>
+  1215479915U,	// <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3699105894U,	// <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS
+  3765633844U,	// <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1>
+  2691892120U,	// <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2>
+  2752300575U,	// <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1>
+  3699109174U,	// <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS
+  3830687280U,	// <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0>
+  3830687289U,	// <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0>
+  3874260548U,	// <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2>
+  2752742988U,	// <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1>
+  2631344230U,	// <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS
+  2697201184U,	// <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, <2,1,3,2>
+  1678558824U,	// <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678558834U,	// <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  2631347510U,	// <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS
+  3368953613U,	// <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5>
+  2234304442U,	// <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7>
+  3368953777U,	// <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7>
+  1679001247U,	// <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3>
+  1678558886U,	// <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1>
+  2752300719U,	// <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1>
+  2752300729U,	// <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2>
+  1221476454U,	// <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS
+  1678558926U,	// <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5>
+  2800076503U,	// <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5>
+  2234746810U,	// <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7>
+  2800076516U,	// <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0>
+  1678558958U,	// <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1>
+  3699130470U,	// <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS
+  3362996972U,	// <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1>
+  2289256040U,	// <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2>
+  1215512678U,	// <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3362998676U,	// <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4>
+  2691894582U,	// <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2235582394U,	// <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7>
+  3734967544U,	// <3,2,4,7>: Cost 4 vsldoi4 <7,3,2,4>, <7,3,2,4>
+  1215512683U,	// <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3705110630U,	// <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS
+  3368313985U,	// <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1>
+  3368314472U,	// <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2>
+  2756945768U,	// <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6>
+  3705113910U,	// <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS
+  3310061416U,	// <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6>
+  3310135226U,	// <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7>
+  3370305457U,	// <3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7>
+  2752743317U,	// <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6>
+  2631376998U,	// <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS
+  3705119540U,	// <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1>
+  2631378621U,	// <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6>
+  1678559162U,	// <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2631380278U,	// <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS
+  3370976956U,	// <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5>
+  2237065146U,	// <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7>
+  3798815594U,	// <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2>
+  1679001575U,	// <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  2800076778U,	// <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1>
+  3371647724U,	// <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1>
+  2297906792U,	// <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2>
+  1224163430U,	// <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  3705130294U,	// <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS
+  3371648052U,	// <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5>
+  2297906877U,	// <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6>
+  3371648702U,	// <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7>
+  1224163435U,	// <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1679001659U,	// <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1>
+  2752743492U,	// <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1>
+  1678558824U,	// <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678559320U,	// <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3>
+  1679001699U,	// <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5>
+  2691897498U,	// <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2237908922U,	// <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7>
+  2800519289U,	// <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0>
+  1679001731U,	// <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1>
+  1215480726U,	// <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1678559382U,	// <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2>
+  2631403200U,	// <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0>
+  2289223282U,	// <3,3,0,3>: Cost 3 vmrglw <1,2,3,0>, <2,2,3,3>
+  2752301232U,	// <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1>
+  3362965027U,	// <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5>
+  3362965352U,	// <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6>
+  2289223610U,	// <3,3,0,7>: Cost 3 vmrglw <1,2,3,0>, <2,6,3,7>
+  1678559445U,	// <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2>
+  3830687964U,	// <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0>
+  2752301286U,	// <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1>
+  2752301297U,	// <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3>
+  2305157532U,	// <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3>
+  3830688000U,	// <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0>
+  3830688009U,	// <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0>
+  3830688019U,	// <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1>
+  3362973626U,	// <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7>
+  2752743719U,	// <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3>
+  2631417958U,	// <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS
+  3826043193U,	// <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3>
+  1624131186U,	// <3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3>
+  2752301384U,	// <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0>
+  2631421238U,	// <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS
+  3826485602U,	// <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u>
+  2752301414U,	// <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3>
+  2771249519U,	// <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3>
+  1628112984U,	// <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3>
+  1563656294U,	// <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS
+  2301855911U,	// <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1>
+  2697873730U,	// <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3>
+  403488870U,	// <3,3,3,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  2301856239U,	// <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5>
+  2697874067U,	// <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7>
+  2295220154U,	// <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7>
+  403488870U,	// <3,3,3,u>: Cost 1 vspltisw3 LHS
+  2289255318U,	// <3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0>
+  2631435162U,	// <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4>
+  2631435972U,	// <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4>
+  2289256050U,	// <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3>
+  1215513498U,	// <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679002114U,	// <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6>
+  3362998120U,	// <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6>
+  2289256378U,	// <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7>
+  1679002141U,	// <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6>
+  3831130657U,	// <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1>
+  3376277671U,	// <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1>
+  3771617012U,	// <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3>
+  2302536092U,	// <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3>
+  3831130697U,	// <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5>
+  2294572579U,	// <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5>
+  2800519773U,	// <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7>
+  3368314810U,	// <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7>
+  2800519791U,	// <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7>
+  2800077432U,	// <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7>
+  3310291185U,	// <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3>
+  2789165706U,	// <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7>
+  2764982931U,	// <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7>
+  2800077468U,	// <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7>
+  3873819301U,	// <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7>
+  2297235304U,	// <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6>
+  2725081963U,	// <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3>
+  2725745596U,	// <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3>
+  2631458918U,	// <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS
+  3705201460U,	// <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1>
+  2631460551U,	// <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7>
+  2297906802U,	// <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3>
+  2631462198U,	// <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS
+  3371648547U,	// <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5>
+  3371648548U,	// <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6>
+  1224165306U,	// <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1224165306U,	// <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1215480726U,	// <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1679002398U,	// <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2>
+  1659967368U,	// <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3>
+  403488870U,	// <3,3,u,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  1679002438U,	// <3,3,u,5>: Cost 2 vsldoi12 LHS, <3,u,5,6>
+  2756946764U,	// <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3>
+  1224165306U,	// <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  403488870U,	// <3,3,u,u>: Cost 1 vspltisw3 LHS
+  2691907584U,	// <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0>
+  1618165862U,	// <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631476937U,	// <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0>
+  2232601732U,	// <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0>
+  2691907922U,	// <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5>
+  1158860086U,	// <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  3306343806U,	// <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7>
+  3366947484U,	// <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7>
+  1618166429U,	// <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631483494U,	// <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS
+  2691908404U,	// <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1>
+  1618166682U,	// <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4>
+  3765650393U,	// <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4>
+  2631486774U,	// <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS
+  2756946914U,	// <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0>
+  3765650639U,	// <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7>
+  3735090439U,	// <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1>
+  1622148480U,	// <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4>
+  3765650893U,	// <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0>
+  3831131154U,	// <3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3>
+  2691909224U,	// <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2>
+  2691909286U,	// <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1>
+  2699208469U,	// <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4>
+  2233863478U,	// <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS
+  2691909562U,	// <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7>
+  2701199368U,	// <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4>
+  2691909691U,	// <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1>
+  2691909782U,	// <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2>
+  3765651686U,	// <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1>
+  2691909972U,	// <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3>
+  2691910044U,	// <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3>
+  2691910096U,	// <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1>
+  1161006390U,	// <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691910300U,	// <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  3368962716U,	// <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7>
+  1161006633U,	// <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2631508070U,	// <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS
+  2631508890U,	// <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4>
+  2631509709U,	// <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4>
+  2289256788U,	// <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3>
+  1726336208U,	// <3,4,4,4>: Cost 2 vsldoi12 LHS, <4,4,4,4>
+  1618169142U,	// <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  3362998858U,	// <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6>
+  2289257116U,	// <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7>
+  1618169385U,	// <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  1557774438U,	// <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS
+  2631516980U,	// <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1>
+  1557776078U,	// <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5>
+  2631518358U,	// <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2>
+  1557777718U,	// <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS
+  2296563406U,	// <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5>
+  604818742U,	// <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2661381387U,	// <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5>
+  604818760U,	// <3,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  3705266278U,	// <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS
+  3831131482U,	// <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7>
+  2733715962U,	// <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3>
+  3844771180U,	// <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7>
+  2800078197U,	// <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7>
+  2236550454U,	// <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716280U,	// <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6>
+  2725090156U,	// <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4>
+  2236550697U,	// <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716474U,	// <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2>
+  3371647013U,	// <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1>
+  2727744688U,	// <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4>
+  3371649364U,	// <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3>
+  2733716838U,	// <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6>
+  2297906894U,	// <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5>
+  3371647180U,	// <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6>
+  2733717100U,	// <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7>
+  2297906897U,	// <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u>
+  1557799014U,	// <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS
+  1618171694U,	// <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  1557800657U,	// <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u>
+  2691913660U,	// <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1>
+  1557802294U,	// <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS
+  1618172058U,	// <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  604818985U,	// <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2661405966U,	// <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u>
+  604819003U,	// <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2643492966U,	// <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS
+  2756947528U,	// <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2>
+  2331029019U,	// <3,5,0,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2643495062U,	// <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2>
+  2756947554U,	// <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1>
+  2800078443U,	// <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1>
+  2289224194U,	// <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6>
+  3362964723U,	// <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7>
+  2756947590U,	// <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1>
+  2800078479U,	// <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1>
+  2333027218U,	// <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1>
+  2691916699U,	// <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5>
+  3832901294U,	// <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5>
+  2800078519U,	// <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5>
+  3830689467U,	// <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0>
+  3830689481U,	// <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5>
+  3873820365U,	// <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0>
+  2800078551U,	// <3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1>
+  3770967487U,	// <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4>
+  2697225763U,	// <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5>
+  3830689523U,	// <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2>
+  2699216590U,	// <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5>
+  2699216662U,	// <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5>
+  2783047439U,	// <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3>
+  2783121176U,	// <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3>
+  3856936737U,	// <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3>
+  2701871194U,	// <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5>
+  2643517542U,	// <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS
+  2331052946U,	// <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1>
+  3699345010U,	// <3,5,3,2>: Cost 4 vsldoi4 <1,3,5,3>, <2,2,3,3>
+  2705189276U,	// <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3>
+  2705189359U,	// <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5>
+  2331053274U,	// <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5>
+  2295220738U,	// <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6>
+  3368961267U,	// <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7>
+  2295220740U,	// <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u>
+  2643525734U,	// <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS
+  2331061138U,	// <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1>
+  2235584280U,	// <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3>
+  2643528194U,	// <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6>
+  2735713498U,	// <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5>
+  2756947892U,	// <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6>
+  2289256962U,	// <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6>
+  3362997491U,	// <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7>
+  2756947919U,	// <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6>
+  2800078803U,	// <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1>
+  2800078812U,	// <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1>
+  2631591639U,	// <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5>
+  3832901616U,	// <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3>
+  2800078843U,	// <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5>
+  1726337028U,	// <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078862U,	// <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6>
+  3368314099U,	// <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7>
+  1726337028U,	// <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078884U,	// <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1>
+  2800078899U,	// <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7>
+  2631599832U,	// <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6>
+  2800078914U,	// <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4>
+  2800078924U,	// <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5>
+  2800078935U,	// <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7>
+  2297235970U,	// <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6>
+  1726337122U,	// <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0>
+  1726337131U,	// <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0>
+  3699376230U,	// <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS
+  2333739922U,	// <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1>
+  3699378106U,	// <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7>
+  3371647915U,	// <3,5,7,3>: Cost 4 vmrglw <2,6,3,7>, <1,2,5,3>
+  3699379510U,	// <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS
+  2333740250U,	// <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5>
+  2297907714U,	// <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6>
+  3370984691U,	// <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7>
+  2297907716U,	// <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u>
+  2800079046U,	// <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1>
+  2756948176U,	// <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2>
+  2331029019U,	// <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2800079076U,	// <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4>
+  2800079085U,	// <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4>
+  1726337028U,	// <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2289289730U,	// <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6>
+  1726337284U,	// <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0>
+  1726337293U,	// <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0>
+  3773628416U,	// <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0>
+  2699886694U,	// <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789167401U,	// <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1>
+  3362965862U,	// <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3>
+  3773628754U,	// <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5>
+  3723284326U,	// <3,6,0,5>: Cost 4 vsldoi4 <5,3,6,0>, <5,3,6,0>
+  2800079181U,	// <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1>
+  1215483190U,	// <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1215483191U,	// <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS
+  3873821032U,	// <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1>
+  3773629236U,	// <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1>
+  2691924892U,	// <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6>
+  3830690184U,	// <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6>
+  3873821072U,	// <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5>
+  3873821082U,	// <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6>
+  3403453240U,	// <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6>
+  2289233206U,	// <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2289233207U,	// <3,6,1,u>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2661498982U,	// <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS
+  3770975780U,	// <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6>
+  2631640797U,	// <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2>
+  3771639485U,	// <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6>
+  2661502262U,	// <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS
+  2699888488U,	// <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6>
+  2661503482U,	// <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3>
+  1715425786U,	// <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3>
+  1715499523U,	// <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3>
+  3773630614U,	// <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2>
+  3372942825U,	// <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1>
+  2234749434U,	// <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3>
+  3368962406U,	// <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3>
+  2699889154U,	// <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6>
+  3773631068U,	// <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6>
+  2331054904U,	// <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6>
+  1221479734U,	// <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  1221479735U,	// <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS
+  2235584801U,	// <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2>
+  3717342106U,	// <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4>
+  2789167729U,	// <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5>
+  2235585074U,	// <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5>
+  2235585165U,	// <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6>
+  2699889974U,	// <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  2800079509U,	// <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5>
+  1215515958U,	// <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1215515959U,	// <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS
+  3873821356U,	// <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1>
+  3372959209U,	// <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1>
+  3862909629U,	// <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0>
+  3773632358U,	// <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0>
+  3873821396U,	// <3,6,5,4>: Cost 4 vsldoi12 LHS, <6,5,4,5>
+  3873821405U,	// <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5>
+  3862909672U,	// <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7>
+  2294574390U,	// <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2294574391U,	// <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2800079613U,	// <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1>
+  3873821446U,	// <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1>
+  2789167888U,	// <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2>
+  3844920090U,	// <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3>
+  2800079653U,	// <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5>
+  3723333484U,	// <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6>
+  1726337848U,	// <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726337858U,	// <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7>
+  1726337867U,	// <3,6,6,u>: Cost 2 vsldoi12 LHS, <6,6,u,7>
+  1726337870U,	// <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1>
+  2297906665U,	// <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1>
+  2792117090U,	// <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3>
+  2297907558U,	// <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3>
+  1726337910U,	// <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5>
+  2297906993U,	// <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5>
+  2297906832U,	// <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6>
+  1224166710U,	// <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224166711U,	// <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1726337951U,	// <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1>
+  2699892526U,	// <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789168049U,	// <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1>
+  2792854460U,	// <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3>
+  1726337991U,	// <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5>
+  2699892890U,	// <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  1726337848U,	// <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1215548726U,	// <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  1215548727U,	// <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS
+  2700558336U,	// <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0>
+  1626816614U,	// <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700558513U,	// <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6>
+  2331030010U,	// <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3>
+  2700558674U,	// <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5>
+  2800079906U,	// <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6>
+  2655588936U,	// <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0>
+  2800079919U,	// <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1>
+  1626817181U,	// <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  3774300899U,	// <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1>
+  2700559156U,	// <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1>
+  2700559254U,	// <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0>
+  3774301148U,	// <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7>
+  3774301227U,	// <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5>
+  3774301295U,	// <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1>
+  3768329441U,	// <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7>
+  3403453250U,	// <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7>
+  2700559740U,	// <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0>
+  2700559849U,	// <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1>
+  3770983973U,	// <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7>
+  2700559976U,	// <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2>
+  2698569415U,	// <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7>
+  2700560177U,	// <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5>
+  3773638505U,	// <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7>
+  1626818490U,	// <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7>
+  2795140307U,	// <3,7,2,7>: Cost 3 vsldoi12 <7,2,7,3>, <7,2,7,3>
+  1628145756U,	// <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7>
+  2700560534U,	// <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2>
+  3774302438U,	// <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1>
+  2700560742U,	// <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3>
+  2700560796U,	// <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3>
+  2700560898U,	// <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6>
+  3774302821U,	// <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6>
+  2700561079U,	// <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7>
+  2700561091U,	// <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1>
+  2700561182U,	// <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2>
+  2655617126U,	// <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS
+  3774303178U,	// <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, <4,1,2,3>
+  2655619002U,	// <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7>
+  2331062778U,	// <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3>
+  2655620406U,	// <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS
+  1626819894U,	// <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  2655621708U,	// <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4>
+  2800080247U,	// <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5>
+  1626820137U,	// <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  3774303816U,	// <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2>
+  3873822093U,	// <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0>
+  3774303998U,	// <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4>
+  3862910368U,	// <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1>
+  3774304180U,	// <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6>
+  2800080310U,	// <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5>
+  2800080321U,	// <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7>
+  3873822147U,	// <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0>
+  2800080339U,	// <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7>
+  2800080348U,	// <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7>
+  3873822181U,	// <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7>
+  2789168622U,	// <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7>
+  2700563016U,	// <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0>
+  2800080384U,	// <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7>
+  3862910472U,	// <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6>
+  2700563256U,	// <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6>
+  2800080404U,	// <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0>
+  2793149988U,	// <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7>
+  2637725798U,	// <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, LHS
+  3371649227U,	// <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1>
+  2637727674U,	// <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7>
+  2297907567U,	// <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3>
+  2637729078U,	// <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS
+  3371649312U,	// <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5>
+  2655646287U,	// <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7>
+  1726338668U,	// <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1726338668U,	// <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  2700564179U,	// <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2>
+  1626822446U,	// <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700564357U,	// <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0>
+  2700564412U,	// <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1>
+  2700564543U,	// <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6>
+  1626822810U,	// <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  1662654672U,	// <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7>
+  1726338668U,	// <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1626823013U,	// <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  1678557184U,	// <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1679005395U,	// <3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2>
+  2289221787U,	// <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2>
+  1215479964U,	// <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  2752747245U,	// <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1>
+  1158863002U,	// <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  2289224221U,	// <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6>
+  1215483208U,	// <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1679005458U,	// <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2>
+  1558036582U,	// <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS
+  1678558004U,	// <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  604821294U,	// <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2752747317U,	// <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1>
+  1558039862U,	// <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS
+  2756949830U,	// <3,u,1,5>: Cost 3 vsldoi12 LHS, <u,1,5,0>
+  2800080726U,	// <3,u,1,6>: Cost 3 vsldoi12 LHS, <u,1,6,7>
+  2289233224U,	// <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  604821348U,	// <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696586709U,	// <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u>
+  2757392246U,	// <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3>
+  1624172151U,	// <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u>
+  1679005576U,	// <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3>
+  2631789878U,	// <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS
+  2699904874U,	// <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u>
+  1626826683U,	// <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u>
+  1726338988U,	// <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3>
+  1683208117U,	// <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3>
+  1679005628U,	// <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1>
+  1161008942U,	// <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2752747471U,	// <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2>
+  403488870U,	// <3,u,3,3>: Cost 1 vspltisw3 LHS
+  1679005668U,	// <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5>
+  1161009306U,	// <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691943104U,	// <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7>
+  1221479752U,	// <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  403488870U,	// <3,u,3,u>: Cost 1 vspltisw3 LHS
+  2289255363U,	// <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0>
+  1161844526U,	// <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS
+  2289256661U,	// <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2>
+  1215512732U,	// <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  1215513498U,	// <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679005759U,	// <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6>
+  2289256989U,	// <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6>
+  1215515976U,	// <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1679005786U,	// <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6>
+  1558069350U,	// <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS
+  2631811892U,	// <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1>
+  1558071026U,	// <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5>
+  2752747646U,	// <3,u,5,3>: Cost 3 vsldoi12 LHS, <u,5,3,6>
+  1558072630U,	// <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS
+  1726337028U,	// <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  604821658U,	// <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2294574408U,	// <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  604821676U,	// <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2631819366U,	// <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS
+  2757392574U,	// <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7>
+  2631821043U,	// <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6>
+  1679005904U,	// <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  2631822646U,	// <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS
+  2236553370U,	// <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  1726337848U,	// <3,u,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726339309U,	// <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0>
+  1683208445U,	// <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7>
+  1726339328U,	// <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1>
+  2297905225U,	// <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1>
+  2631829236U,	// <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7>
+  1224163484U,	// <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1726339368U,	// <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5>
+  2297905553U,	// <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5>
+  2297905392U,	// <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6>
+  1224166728U,	// <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224163489U,	// <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1683208529U,	// <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1>
+  1679006043U,	// <3,u,u,1>: Cost 2 vsldoi12 LHS, <u,u,1,2>
+  604821861U,	// <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  403488870U,	// <3,u,u,3>: Cost 1 vspltisw3 LHS
+  1683208569U,	// <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5>
+  1679006083U,	// <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6>
+  604821901U,	// <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  1215548744U,	// <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  604821915U,	// <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2759016448U,	// <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0>
+  1165115494U,	// <4,0,0,1>: Cost 2 vmrghw <4,0,5,1>, LHS
+  3717531337U,	// <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0>
+  3369675785U,	// <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3>
+  2751791144U,	// <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4>
+  2238857630U,	// <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0>
+  3312591341U,	// <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7>
+  3369676113U,	// <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7>
+  1165116061U,	// <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS
+  2637824102U,	// <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS
+  2637824922U,	// <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4>
+  1685274726U,	// <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637826512U,	// <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1>
+  2637827382U,	// <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS
+  2661716070U,	// <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4>
+  3729486427U,	// <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1>
+  2661717300U,	// <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1>
+  1685274780U,	// <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  3711574118U,	// <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS
+  2240200806U,	// <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  3771663992U,	// <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0>
+  2698585801U,	// <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0>
+  3373672105U,	// <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4>
+  3810813795U,	// <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1>
+  3772327866U,	// <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7>
+  3386280568U,	// <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7>
+  2701903966U,	// <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0>
+  3699638374U,	// <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS
+  2753560832U,	// <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  3772328276U,	// <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3>
+  3827302674U,	// <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4>
+  3699641654U,	// <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS
+  3779627588U,	// <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0>
+  3772328604U,	// <4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7>
+  3780954854U,	// <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0>
+  2753560832U,	// <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  2725129106U,	// <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1>
+  1167720550U,	// <4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  3839172953U,	// <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3>
+  3772329051U,	// <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4>
+  2241462610U,	// <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5>
+  2698587446U,	// <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  3772329297U,	// <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7>
+  3735483703U,	// <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4>
+  1167721117U,	// <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS
+  1168556032U,	// <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  94814310U,	// <4,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2242298029U,	// <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2637859284U,	// <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5>
+  1168556370U,	// <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2242306530U,	// <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5>
+  2242298358U,	// <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661750072U,	// <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5>
+  94814877U,	// <4,0,5,u>: Cost 1 vmrghw RHS, LHS
+  3316580362U,	// <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1>
+  2242846822U,	// <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3798872570U,	// <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3>
+  3796218413U,	// <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0>
+  3834528273U,	// <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7>
+  3798872811U,	// <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1>
+  3316621876U,	// <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6>
+  2725131121U,	// <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0>
+  2242847389U,	// <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3377692672U,	// <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0>
+  2243493990U,	// <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  3775648970U,	// <4,0,7,2>: Cost 5 vsldoi8 <2,u,4,0>, <7,2,6,3>
+  3802191110U,	// <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0>
+  3317236050U,	// <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5>
+  3803518376U,	// <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0>
+  3317236214U,	// <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7>
+  3798873708U,	// <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7>
+  2243494557U,	// <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS
+  1170546688U,	// <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  96804966U,	// <4,0,u,1>: Cost 1 vmrghw RHS, LHS
+  1685275293U,	// <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637883863U,	// <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u>
+  1170547026U,	// <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2698590362U,	// <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  2244289014U,	// <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661774651U,	// <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u>
+  96805533U,	// <4,0,u,u>: Cost 1 vmrghw RHS, LHS
+  2667749478U,	// <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS
+  2689966182U,	// <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS
+  2238571418U,	// <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4>
+  3711633880U,	// <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0>
+  2689966418U,	// <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5>
+  3361046866U,	// <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5>
+  3741495802U,	// <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3>
+  3741496314U,	// <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2>
+  2689966765U,	// <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1>
+  3764372222U,	// <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1>
+  2758206263U,	// <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4>
+  2698593178U,	// <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4>
+  3361057810U,	// <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3>
+  3827303250U,	// <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,1,4,4>
+  2287313234U,	// <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5>
+  3763709171U,	// <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7>
+  3361058138U,	// <4,1,1,7>: Cost 4 vmrglw <0,u,4,1>, <4,6,1,7>
+  2239759744U,	// <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4>
+  2637906022U,	// <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS
+  2637906842U,	// <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4>
+  3763709544U,	// <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2>
+  1685275546U,	// <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4>
+  2637909302U,	// <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS
+  3361063250U,	// <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5>
+  3763709882U,	// <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7>
+  3735541054U,	// <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2>
+  1685644231U,	// <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4>
+  2702575792U,	// <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1>
+  3832759257U,	// <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4>
+  3833349090U,	// <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4>
+  3763710364U,	// <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3>
+  2707884546U,	// <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6>
+  3361071442U,	// <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5>
+  3772336796U,	// <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7>
+  3775654595U,	// <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1>
+  2707884856U,	// <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1>
+  2667782246U,	// <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS
+  2241463092U,	// <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1>
+  2241553306U,	// <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4>
+  3827303484U,	// <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4>
+  2667785424U,	// <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4>
+  2689969462U,	// <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  3763711322U,	// <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7>
+  3867116636U,	// <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0>
+  2689969705U,	// <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  1546273106U,	// <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5>
+  1168556852U,	// <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1168556950U,	// <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2620016790U,	// <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2>
+  1546276150U,	// <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS
+  2620018692U,	// <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5>
+  2242299087U,	// <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667795450U,	// <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2>
+  1546278702U,	// <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS
+  3781628193U,	// <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2>
+  3832759503U,	// <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7>
+  3316261786U,	// <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4>
+  3781628466U,	// <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5>
+  3827303658U,	// <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7>
+  3361096018U,	// <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5>
+  3788264248U,	// <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6>
+  3788264270U,	// <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1>
+  3832759566U,	// <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7>
+  2726466580U,	// <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1>
+  3377692682U,	// <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1>
+  3377694870U,	// <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2>
+  3802199303U,	// <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1>
+  2731775334U,	// <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6>
+  3377693010U,	// <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5>
+  3365749804U,	// <4,1,7,6>: Cost 5 vmrglw <1,6,4,7>, <1,4,1,6>
+  3788265068U,	// <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7>
+  2731775644U,	// <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1>
+  1546297685U,	// <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u>
+  1170547508U,	// <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1170547606U,	// <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  1689257344U,	// <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4>
+  1546300726U,	// <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS
+  2284716370U,	// <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5>
+  2244289743U,	// <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667820026U,	// <4,1,u,7>: Cost 3 vsldoi4 <u,4,1,u>, <7,0,1,2>
+  1546303278U,	// <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS
+  3729621094U,	// <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS
+  3763716198U,	// <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS
+  2238858856U,	// <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2>
+  2295930982U,	// <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3763716434U,	// <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5>
+  2238859107U,	// <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1>
+  2238859194U,	// <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7>
+  3312601066U,	// <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1>
+  2295930987U,	// <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3699769446U,	// <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS
+  3313255971U,	// <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5>
+  3361056360U,	// <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2>
+  2287312998U,	// <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3788932148U,	// <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5>
+  3313256290U,	// <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0>
+  3838289469U,	// <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4>
+  3369682865U,	// <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7>
+  2287313003U,	// <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3838658133U,	// <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1>
+  3711722394U,	// <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4>
+  2759018088U,	// <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2>
+  2759018098U,	// <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3>
+  3838658168U,	// <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0>
+  3369027341U,	// <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5>
+  2240227258U,	// <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7>
+  3735614791U,	// <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2>
+  2759018143U,	// <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3>
+  2759018150U,	// <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1>
+  3831948975U,	// <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1>
+  3832759993U,	// <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2>
+  2759018180U,	// <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4>
+  2759018185U,	// <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0>
+  3839542998U,	// <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4>
+  3314640826U,	// <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7>
+  2765948648U,	// <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4>
+  2759018222U,	// <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1>
+  3838658295U,	// <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1>
+  3315205667U,	// <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5>
+  2241463912U,	// <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2>
+  1234829414U,	// <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2241464085U,	// <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4>
+  2241546087U,	// <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5>
+  2241464250U,	// <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7>
+  3741602873U,	// <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2>
+  1234829419U,	// <4,2,4,u>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2626060390U,	// <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS
+  2626061364U,	// <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5>
+  1168557672U,	// <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222230118U,	// <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  2626063670U,	// <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS
+  2242299752U,	// <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1168558010U,	// <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2242299882U,	// <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1222230123U,	// <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS
+  3711754342U,	// <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS
+  3711755162U,	// <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4>
+  3838658481U,	// <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7>
+  2759018426U,	// <4,2,6,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7>
+  3838658499U,	// <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7>
+  3735646310U,	// <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4>
+  3316590522U,	// <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7>
+  3798889331U,	// <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, <6,7,4,2>
+  2759018471U,	// <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7>
+  3874564074U,	// <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1>
+  3800880230U,	// <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2>
+  3371722344U,	// <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2>
+  2303950950U,	// <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  3371722346U,	// <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4>
+  3371722509U,	// <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5>
+  3317237690U,	// <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7>
+  3317237738U,	// <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1>
+  2303950955U,	// <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2759018555U,	// <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1>
+  2626085943U,	// <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u>
+  1170548328U,	// <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222254694U,	// <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2759018595U,	// <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5>
+  2244290408U,	// <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1170548666U,	// <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2769266813U,	// <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4>
+  1222254699U,	// <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2238859414U,	// <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2>
+  2759018646U,	// <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2>
+  3312314708U,	// <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3>
+  2238859676U,	// <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3>
+  2295931802U,	// <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4>
+  3735670886U,	// <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4>
+  3312315036U,	// <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7>
+  3369674682U,	// <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7>
+  2759018709U,	// <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2>
+  3361055638U,	// <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0>
+  3831949542U,	// <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1>
+  2703917978U,	// <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3361056370U,	// <4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3>
+  2295939994U,	// <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4>
+  3361056291U,	// <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5>
+  3378972520U,	// <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6>
+  3361056698U,	// <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7>
+  2703917978U,	// <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3832760624U,	// <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3>
+  3711796122U,	// <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4>
+  3832760641U,	// <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2>
+  2770962764U,	// <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4>
+  2759018836U,	// <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3>
+  3827304802U,	// <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u>
+  3832760678U,	// <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3>
+  3859597679U,	// <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3>
+  2771331449U,	// <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4>
+  2240841878U,	// <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2>
+  3776997635U,	// <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3>
+  2703919444U,	// <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3>
+  2759018908U,	// <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3>
+  2759018918U,	// <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,4,4>
+  3386951446U,	// <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5>
+  3777661596U,	// <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7>
+  3375007674U,	// <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7>
+  2707901242U,	// <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3>
+  2759018960U,	// <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1>
+  2759018970U,	// <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2>
+  2632099605U,	// <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4>
+  2241464732U,	// <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3>
+  2759019000U,	// <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5>
+  2753563138U,	// <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6>
+  3777662316U,	// <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, <4,6,3,7>
+  2308573114U,	// <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7>
+  2759019032U,	// <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1>
+  1168558230U,	// <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2242300134U,	// <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1>
+  2632107798U,	// <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5>
+  1168558492U,	// <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1168558594U,	// <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2295973654U,	// <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5>
+  2242300536U,	// <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295973818U,	// <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7>
+  1168558878U,	// <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  3832760952U,	// <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7>
+  3711828890U,	// <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4>
+  3316484436U,	// <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3>
+  3711830512U,	// <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6>
+  2759019164U,	// <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3361097251U,	// <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5>
+  3316624045U,	// <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6>
+  2773912244U,	// <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4>
+  2759019164U,	// <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3377693590U,	// <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0>
+  3365751680U,	// <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1>
+  2727810232U,	// <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3>
+  3377694322U,	// <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3>
+  2303951770U,	// <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4>
+  3741700198U,	// <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4>
+  3377695216U,	// <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6>
+  3375703994U,	// <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7>
+  2731792030U,	// <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3>
+  1170548886U,	// <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2759019294U,	// <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2>
+  2632132377U,	// <4,3,u,2>: Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u>
+  1170549148U,	// <4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1170549250U,	// <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2759019334U,	// <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6>
+  2244291192U,	// <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295998394U,	// <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7>
+  1170549534U,	// <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  1165118354U,	// <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1637482598U,	// <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  3711854285U,	// <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4>
+  3827305344U,	// <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1>
+  2711224658U,	// <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5>
+  1165118774U,	// <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3312602489U,	// <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2>
+  3369675420U,	// <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7>
+  1165119017U,	// <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3369682633U,	// <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0>
+  2287313581U,	// <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1>
+  2759019466U,	// <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3>
+  3369683284U,	// <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3>
+  2311204048U,	// <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4>
+  2239319350U,	// <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS
+  3784967411U,	// <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7>
+  3369683612U,	// <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7>
+  2763000832U,	// <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3>
+  3711869030U,	// <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS
+  3711869850U,	// <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4>
+  2240203830U,	// <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3>
+  2698618573U,	// <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4>
+  2711226133U,	// <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4>
+  2240204086U,	// <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2711226298U,	// <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7>
+  3832761416U,	// <4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3>
+  2701936738U,	// <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4>
+  2711226518U,	// <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2>
+  3777005828U,	// <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4>
+  3832761453U,	// <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4>
+  2301266260U,	// <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  2705254903U,	// <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4>
+  2240843062U,	// <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  3832761489U,	// <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4>
+  3375008412U,	// <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7>
+  2301266260U,	// <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  1570373734U,	// <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS
+  2308574089U,	// <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1>
+  2644117096U,	// <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2>
+  2638146039U,	// <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4>
+  229035318U,	// <4,4,4,4>: Cost 1 vspltisw0 RHS
+  1167723830U,	// <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS
+  2644120058U,	// <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3>
+  2662036827U,	// <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4>
+  229035318U,	// <4,4,4,u>: Cost 1 vspltisw0 RHS
+  1168558994U,	// <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  2638152602U,	// <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4>
+  2242300981U,	// <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638154232U,	// <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5>
+  1168559322U,	// <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5>
+  94817590U,	// <4,4,5,5>: Cost 1 vmrghw RHS, RHS
+  1685278006U,	// <4,4,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2242309576U,	// <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  94817833U,	// <4,4,5,u>: Cost 1 vmrghw RHS, RHS
+  3316591506U,	// <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1>
+  3758428587U,	// <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5>
+  2711228922U,	// <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3>
+  3796251185U,	// <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, <6,3,4,4>
+  2711229085U,	// <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4>
+  2242850102U,	// <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2242850169U,	// <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2>
+  2725163893U,	// <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4>
+  2242850345U,	// <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2711229434U,	// <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2>
+  3377694410U,	// <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1>
+  3868593584U,	// <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3>
+  3377695060U,	// <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3>
+  2729145691U,	// <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4>
+  2243497270U,	// <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  3871542744U,	// <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7>
+  2303953564U,	// <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7>
+  2243497513U,	// <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS
+  1170549650U,	// <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  1637488430U,	// <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  2244291637U,	// <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638178811U,	// <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u>
+  229035318U,	// <4,4,u,4>: Cost 1 vspltisw0 RHS
+  96808246U,	// <4,4,u,5>: Cost 1 vmrghw RHS, RHS
+  1685278249U,	// <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2244292040U,	// <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  96808489U,	// <4,4,u,u>: Cost 1 vmrghw RHS, RHS
+  2698625024U,	// <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0>
+  1624883302U,	// <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2638186190U,	// <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5>
+  2638187004U,	// <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0>
+  2687345005U,	// <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5>
+  2238861316U,	// <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5>
+  2662077302U,	// <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5>
+  2662077792U,	// <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0>
+  1624883869U,	// <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  3361057762U,	// <4,5,1,0>: Cost 4 vmrglw <0,u,4,1>, <4,1,5,0>
+  2691326803U,	// <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5>
+  2698625942U,	// <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0>
+  3361055659U,	// <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3>
+  3761087567U,	// <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5>
+  2693981335U,	// <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5>
+  2305231362U,	// <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  3361055987U,	// <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7>
+  2695972234U,	// <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5>
+  2638200934U,	// <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS
+  3761088035U,	// <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5>
+  2697963133U,	// <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5>
+  1624884942U,	// <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5>
+  2698626838U,	// <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5>
+  3772368744U,	// <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6>
+  2698627002U,	// <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7>
+  3775023122U,	// <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5>
+  1628203107U,	// <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5>
+  2698627222U,	// <4,5,3,0>: Cost 3 vsldoi8 <2,3,4,5>, <3,0,1,2>
+  3765070057U,	// <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4>
+  2698627404U,	// <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4>
+  2698627484U,	// <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3>
+  2698627580U,	// <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0>
+  3779668553U,	// <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5>
+  2725169844U,	// <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4>
+  2707253995U,	// <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5>
+  2698627870U,	// <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2>
+  2638217318U,	// <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS
+  2308574098U,	// <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1>
+  2698628150U,	// <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3>
+  2638219776U,	// <4,5,4,3>: Cost 3 vsldoi4 <3,4,5,4>, <3,4,5,4>
+  2698628314U,	// <4,5,4,4>: Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5>
+  1624886582U,	// <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  2698628478U,	// <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7>
+  2662110564U,	// <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4>
+  1624886825U,	// <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1570455654U,	// <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS
+  2312564250U,	// <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1>
+  2644199118U,	// <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5>
+  2295974966U,	// <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3>
+  1570458842U,	// <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5>
+  1168568324U,	// <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5>
+  1168568418U,	// <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2295975294U,	// <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7>
+  1168716036U,	// <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0>
+  1564491878U,	// <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS
+  2626290768U,	// <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6>
+  2632263465U,	// <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6>
+  1564494338U,	// <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6>
+  1564495158U,	// <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS
+  2638237464U,	// <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3>
+  2656154253U,	// <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6>
+  27705344U,	// <4,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,6,u>: Cost 0 copy RHS
+  2725172218U,	// <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2>
+  3859599489U,	// <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4>
+  2698630320U,	// <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4>
+  2728490251U,	// <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5>
+  2725172576U,	// <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0>
+  3317239812U,	// <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5>
+  2725172760U,	// <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4>
+  2725172844U,	// <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7>
+  2725172866U,	// <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2>
+  1564508262U,	// <4,5,u,0>: Cost 2 vsldoi4 <3,4,5,u>, LHS
+  1624889134U,	// <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2698631045U,	// <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0>
+  1564510724U,	// <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u>
+  1564511542U,	// <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS
+  1624889498U,	// <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1170550882U,	// <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  27705344U,	// <4,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,u,u>: Cost 0 copy RHS
+  3312595285U,	// <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0>
+  3763748966U,	// <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS
+  2238861818U,	// <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3>
+  3767730432U,	// <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4>
+  3763749202U,	// <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5>
+  2238862059U,	// <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1>
+  2238862136U,	// <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6>
+  2295934262U,	// <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  2295934263U,	// <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS
+  3378973999U,	// <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0>
+  3378974648U,	// <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1>
+  3779675034U,	// <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4>
+  3378974002U,	// <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3>
+  3378974003U,	// <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4>
+  3767731352U,	// <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6>
+  3378974734U,	// <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6>
+  2287316278U,	// <4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  2287316279U,	// <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS
+  3735904358U,	// <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS
+  3763750435U,	// <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5>
+  3313938937U,	// <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2>
+  3772376782U,	// <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5>
+  3852890591U,	// <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3>
+  3735908454U,	// <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4>
+  3801573306U,	// <4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7>
+  2785858042U,	// <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3>
+  2785858051U,	// <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3>
+  3863065101U,	// <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4>
+  3314586024U,	// <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2>
+  3863212575U,	// <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4>
+  3863286312U,	// <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4>
+  3767732738U,	// <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6>
+  3779676746U,	// <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6>
+  3398898488U,	// <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6>
+  2301267254U,	// <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2301267255U,	// <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS
+  3852890715U,	// <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1>
+  3315208615U,	// <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1>
+  2241466874U,	// <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3>
+  3852890745U,	// <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4>
+  2241467037U,	// <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4>
+  2241549039U,	// <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5>
+  2241467192U,	// <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6>
+  1234832694U,	// <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  1234832695U,	// <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS
+  2242302241U,	// <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2242310567U,	// <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1168568826U,	// <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2242302514U,	// <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2242302605U,	// <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2242310891U,	// <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1168569144U,	// <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222233398U,	// <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  1222233399U,	// <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS
+  3316576545U,	// <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2>
+  3316584871U,	// <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1>
+  2242851322U,	// <4,6,6,2>: Cost 3 vmrghw <4,6,5,2>, <6,2,7,3>
+  3316601394U,	// <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5>
+  3852890916U,	// <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4>
+  3316617963U,	// <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1>
+  2242884408U,	// <4,6,6,6>: Cost 3 vmrghw <4,6,5,6>, <6,6,6,6>
+  2785858370U,	// <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7>
+  2785858379U,	// <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7>
+  2785858382U,	// <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1>
+  3859600215U,	// <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1>
+  3317240314U,	// <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3>
+  2792199020U,	// <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4>
+  2785858422U,	// <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5>
+  3856651132U,	// <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2>
+  3317240632U,	// <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6>
+  2303954230U,	// <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303954231U,	// <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2244292897U,	// <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2244293031U,	// <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1170551290U,	// <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2244293170U,	// <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2244293261U,	// <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2244293355U,	// <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1170551608U,	// <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222257974U,	// <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS
+  1222257975U,	// <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS
+  2238862330U,	// <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2>
+  2706604134U,	// <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3312604308U,	// <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3>
+  3768402176U,	// <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4>
+  2238862648U,	// <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5>
+  3859600418U,	// <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6>
+  3729994393U,	// <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0>
+  2238862956U,	// <4,7,0,7>: Cost 3 vmrghw <4,0,5,1>, <7,7,7,7>
+  2706604701U,	// <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3385610338U,	// <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0>
+  3780346676U,	// <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1>
+  2706604954U,	// <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3385610746U,	// <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3>
+  3385610342U,	// <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4>
+  3385610667U,	// <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5>
+  3768403178U,	// <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7>
+  3385611074U,	// <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7>
+  2706604954U,	// <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3859600532U,	// <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3>
+  3712091034U,	// <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4>
+  3774375528U,	// <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2>
+  2794853552U,	// <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4>
+  2785858744U,	// <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3>
+  3735982182U,	// <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4>
+  3774375875U,	// <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7>
+  3735983476U,	// <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2>
+  2795222237U,	// <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4>
+  3780348054U,	// <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2>
+  3730015130U,	// <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4>
+  3780348244U,	// <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3>
+  3778357673U,	// <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7>
+  2325155942U,	// <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4>
+  3779684939U,	// <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7>
+  2706606748U,	// <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7>
+  3398898498U,	// <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7>
+  2707934014U,	// <4,7,3,u>: Cost 3 vsldoi8 <3,u,4,7>, <3,u,4,7>
+  2785858868U,	// <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1>
+  3780348874U,	// <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3>
+  3780349000U,	// <4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3>
+  2308575738U,	// <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3>
+  2656283856U,	// <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4>
+  2706607414U,	// <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2656285341U,	// <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4>
+  2241468012U,	// <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7>
+  2706607657U,	// <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  1168569338U,	// <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2242311242U,	// <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1>
+  2242303178U,	// <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3>
+  2242311395U,	// <4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1168569702U,	// <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2242311606U,	// <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5>
+  2242311662U,	// <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1168569964U,	// <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1168569986U,	// <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  3316593658U,	// <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2>
+  3316593738U,	// <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1>
+  3316634800U,	// <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4>
+  3386978810U,	// <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3>
+  2785859072U,	// <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7>
+  3736014950U,	// <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4>
+  3316594158U,	// <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7>
+  2797803032U,	// <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4>
+  2797876769U,	// <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4>
+  2243499002U,	// <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2>
+  3718103962U,	// <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4>
+  3317257418U,	// <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3>
+  3377695816U,	// <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3>
+  2243532134U,	// <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6>
+  3317282230U,	// <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5>
+  2730497536U,	// <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7>
+  2243556972U,	// <4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7>
+  2243565186U,	// <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2>
+  1170551802U,	// <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2706609966U,	// <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  2244293797U,	// <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2>
+  2244293859U,	// <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1170552166U,	// <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2706610330U,	// <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2244294126U,	// <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1170552428U,	// <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1170552450U,	// <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  1165118354U,	// <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1624907878U,	// <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638407377U,	// <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u>
+  2295931036U,	// <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  2687369584U,	// <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u>
+  1165121690U,	// <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  2662298489U,	// <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u>
+  2295934280U,	// <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  1624908445U,	// <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638413926U,	// <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS
+  2691351382U,	// <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u>
+  1685280558U,	// <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2287313052U,	// <4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  2299257799U,	// <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4>
+  2694005914U,	// <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u>
+  2305231362U,	// <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  2287316296U,	// <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  1685280612U,	// <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2638422118U,	// <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS
+  2240206638U,	// <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  2697987712U,	// <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u>
+  1624909521U,	// <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u>
+  2759391121U,	// <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3>
+  2240207002U,	// <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2698651578U,	// <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7>
+  2785859500U,	// <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3>
+  1628227686U,	// <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u>
+  2759022524U,	// <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1>
+  2801342408U,	// <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4>
+  2703960409U,	// <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u>
+  2759022554U,	// <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4>
+  2759022564U,	// <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5>
+  2240845978U,	// <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  2706614941U,	// <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u>
+  2301267272U,	// <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2759022596U,	// <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1>
+  1570668646U,	// <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS
+  1167726382U,	// <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  2698652753U,	// <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3>
+  1234829468U,	// <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  229035318U,	// <4,u,4,4>: Cost 1 vspltisw0 RHS
+  1624911158U,	// <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS
+  2698653081U,	// <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7>
+  1234832712U,	// <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  229035318U,	// <4,u,4,u>: Cost 1 vspltisw0 RHS
+  1168561875U,	// <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2>
+  94820142U,	// <4,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1168562053U,	// <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0>
+  1222230172U,	// <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  1168562239U,	// <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6>
+  94820506U,	// <4,u,5,5>: Cost 1 vmrghw RHS, RHS
+  1685280922U,	// <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  1222233416U,	// <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  94820709U,	// <4,u,5,u>: Cost 1 vmrghw RHS, LHS
+  1564713062U,	// <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS
+  2626511979U,	// <4,u,6,1>: Cost 3 vsldoi4 <1,4,u,6>, <1,4,u,6>
+  2632484676U,	// <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6>
+  1564715549U,	// <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6>
+  1564716342U,	// <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS
+  2242853018U,	// <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2656375464U,	// <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6>
+  27705344U,	// <4,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,6,u>: Cost 0 copy RHS
+  2785859840U,	// <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1>
+  2243499822U,	// <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  2727851197U,	// <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u>
+  2303951004U,	// <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2785859880U,	// <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5>
+  2243500186U,	// <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  2730505729U,	// <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u>
+  2303954248U,	// <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303951009U,	// <4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  1564729446U,	// <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS
+  96810798U,	// <4,u,u,1>: Cost 1 vmrghw RHS, LHS
+  1685281125U,	// <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  1222254748U,	// <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  229035318U,	// <4,u,u,4>: Cost 1 vspltisw0 RHS
+  96811162U,	// <4,u,u,5>: Cost 1 vmrghw RHS, RHS
+  1685281165U,	// <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  27705344U,	// <4,u,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,u,u>: Cost 0 copy RHS
+  2754232320U,	// <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0>
+  2754232330U,	// <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1>
+  3718194894U,	// <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5>
+  3376385762U,	// <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3>
+  2754232357U,	// <5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1>
+  3845816370U,	// <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5>
+  3782353389U,	// <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7>
+  3376386090U,	// <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7>
+  2757402697U,	// <5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1>
+  2626543718U,	// <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS
+  2626544751U,	// <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1>
+  1680490598U,	// <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3766428665U,	// <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0>
+  2626546998U,	// <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS
+  2650435539U,	// <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1>
+  3783017715U,	// <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7>
+  3385019000U,	// <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7>
+  1680490652U,	// <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3376398336U,	// <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0>
+  2245877862U,	// <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  3773064808U,	// <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2>
+  2705295054U,	// <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  3827974343U,	// <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1>
+  3845816530U,	// <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3>
+  3779037114U,	// <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7>
+  3810887658U,	// <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1>
+  2245878429U,	// <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2710603926U,	// <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2>
+  3827974396U,	// <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0>
+  3779037516U,	// <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4>
+  3779037596U,	// <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3>
+  2705295868U,	// <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0>
+  3379726804U,	// <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5>
+  3802925748U,	// <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4>
+  3363138168U,	// <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7>
+  2707950400U,	// <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0>
+  2626568294U,	// <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS
+  1680490834U,	// <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  3828048219U,	// <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5>
+  2710604932U,	// <5,0,4,3>: Cost 3 vsldoi8 <4,3,5,0>, <4,3,5,0>
+  2754232685U,	// <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5>
+  2705296694U,	// <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779038590U,	// <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7>
+  2713259464U,	// <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1680490834U,	// <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  2311307264U,	// <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0>
+  1174437990U,	// <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  3779038946U,	// <5,0,5,2>: Cost 4 vsldoi8 <3,4,5,0>, <5,2,0,3>
+  3845816752U,	// <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0>
+  2248180050U,	// <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5>
+  2248180194U,	// <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5>
+  3779039274U,	// <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7>
+  3385051768U,	// <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7>
+  1174438557U,	// <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2302689280U,	// <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0>
+  1175208038U,	// <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  3787002362U,	// <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3>
+  3376432160U,	// <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3>
+  2248950098U,	// <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5>
+  2248950180U,	// <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  3376433702U,	// <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6>
+  2729186166U,	// <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5>
+  1175208605U,	// <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2713261050U,	// <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2>
+  3365823599U,	// <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1>
+  3808900317U,	// <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4>
+  3784348899U,	// <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1>
+  2729186656U,	// <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0>
+  3787003268U,	// <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0>
+  3802928664U,	// <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4>
+  3787003431U,	// <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, <7,7,0,1>
+  2731841188U,	// <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0>
+  2626601062U,	// <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS
+  1683145366U,	// <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5>
+  1680491165U,	// <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2705295054U,	// <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  2754233005U,	// <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1>
+  2705299610U,	// <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779041488U,	// <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7>
+  2737150252U,	// <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0>
+  1680491219U,	// <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2713927680U,	// <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0>
+  1640185958U,	// <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2310607866U,	// <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  3787669756U,	// <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0>
+  2713928018U,	// <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5>
+  2306621778U,	// <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5>
+  3787670006U,	// <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7>
+  3736188301U,	// <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0>
+  1640186525U,	// <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2650505318U,	// <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS
+  2754233140U,	// <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1>
+  2311276694U,	// <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2>
+  2311278315U,	// <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3>
+  2758435667U,	// <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5>
+  2754233180U,	// <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5>
+  3385016497U,	// <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6>
+  2311278643U,	// <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7>
+  2758730615U,	// <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5>
+  3700367462U,	// <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS
+  3830629255U,	// <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3>
+  2713929320U,	// <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2>
+  2754233238U,	// <5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0>
+  2759099300U,	// <5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5>
+  2754233259U,	// <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3>
+  2713929658U,	// <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7>
+  3872359354U,	// <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0>
+  2754233283U,	// <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0>
+  2713929878U,	// <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2>
+  3363135498U,	// <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1>
+  3363137686U,	// <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2>
+  2713930140U,	// <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3>
+  2713930242U,	// <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6>
+  2289394002U,	// <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5>
+  3787672184U,	// <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7>
+  3787672259U,	// <5,1,3,7>: Cost 4 vsldoi8 <4,u,5,1>, <3,7,0,1>
+  2713930526U,	// <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2>
+  1634880402U,	// <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1>
+  2760205355U,	// <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5>
+  2760279092U,	// <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5>
+  3787672708U,	// <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0>
+  2713930960U,	// <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4>
+  1640189238U,	// <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  3786345848U,	// <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1>
+  3787009481U,	// <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1>
+  1640189466U,	// <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1>
+  2754233455U,	// <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1>
+  2713931407U,	// <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1>
+  2713931499U,	// <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3>
+  3827975305U,	// <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0>
+  2754233495U,	// <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5>
+  2288746834U,	// <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5>
+  2713931827U,	// <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7>
+  3787673725U,	// <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0>
+  2754233527U,	// <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1>
+  2668462182U,	// <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS
+  2290746002U,	// <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1>
+  2302691478U,	// <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2>
+  3364488071U,	// <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3>
+  2302689536U,	// <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4>
+  2754233587U,	// <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7>
+  2713932600U,	// <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6>
+  2713932622U,	// <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1>
+  2302689297U,	// <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u>
+  2713932794U,	// <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2>
+  3365822474U,	// <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1>
+  3365824662U,	// <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2>
+  3787674851U,	// <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1>
+  2713933158U,	// <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6>
+  2292080978U,	// <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5>
+  3365823613U,	// <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6>
+  2713933420U,	// <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7>
+  2713933442U,	// <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2>
+  1658771190U,	// <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1>
+  1640191790U,	// <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2762933624U,	// <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5>
+  2754233724U,	// <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0>
+  2763081098U,	// <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5>
+  1640192154U,	// <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  2713934032U,	// <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7>
+  2713934080U,	// <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1>
+  1640192357U,	// <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  3779051520U,	// <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0>
+  2705309798U,	// <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  3838813637U,	// <5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1>
+  2302640230U,	// <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3765117266U,	// <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5>
+  3381027892U,	// <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5>
+  3842794985U,	// <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1>
+  3408232554U,	// <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7>
+  2302640235U,	// <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3700432998U,	// <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS
+  3765117785U,	// <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2>
+  2311276136U,	// <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2>
+  1237532774U,	// <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700436278U,	// <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS
+  3381036084U,	// <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5>
+  3385018045U,	// <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6>
+  3385017560U,	// <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7>
+  1237532779U,	// <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700441190U,	// <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS
+  3700442242U,	// <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2>
+  2754233960U,	// <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2>
+  2754233970U,	// <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3>
+  2765071997U,	// <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5>
+  3834021508U,	// <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3>
+  3842795152U,	// <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6>
+  3376402492U,	// <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7>
+  2754234015U,	// <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3>
+  2754234022U,	// <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1>
+  3827975855U,	// <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1>
+  2644625102U,	// <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393766U,	// <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1691993806U,	// <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5>
+  2785052375U,	// <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5>
+  3854812897U,	// <5,2,3,6>: Cost 4 vsldoi12 <4,u,5,5>, <2,3,6,6>
+  3802942187U,	// <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5>
+  1692288754U,	// <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5>
+  3839846139U,	// <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5>
+  2709294052U,	// <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2>
+  2766251789U,	// <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5>
+  2765735702U,	// <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5>
+  3840141087U,	// <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5>
+  2705313078U,	// <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2712612217U,	// <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2>
+  3787017674U,	// <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2>
+  2765735747U,	// <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5>
+  3834021704U,	// <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1>
+  3834021714U,	// <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2>
+  2311308904U,	// <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2>
+  1237565542U,	// <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3834021744U,	// <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5>
+  3369124916U,	// <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5>
+  2248181690U,	// <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7>
+  3786354825U,	// <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3>
+  1237565547U,	// <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3700473958U,	// <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS
+  3700475014U,	// <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6>
+  2296718952U,	// <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2>
+  1228947558U,	// <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3700477238U,	// <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS
+  3834021836U,	// <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7>
+  2248951738U,	// <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7>
+  3370461105U,	// <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7>
+  1228947563U,	// <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3786355706U,	// <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2>
+  3783038037U,	// <5,2,7,1>: Cost 4 vsldoi8 <4,1,5,2>, <7,1,2,3>
+  3365824104U,	// <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2>
+  2292080742U,	// <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS
+  3842131986U,	// <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5>
+  3371795508U,	// <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5>
+  3786356206U,	// <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7>
+  3786356332U,	// <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7>
+  2292080747U,	// <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS
+  2754234427U,	// <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1>
+  2705315630U,	// <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  2296735336U,	// <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2>
+  1228963942U,	// <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  1695311971U,	// <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5>
+  2705315994U,	// <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2769201269U,	// <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5>
+  3370477489U,	// <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7>
+  1695606919U,	// <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5>
+  3827976331U,	// <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0>
+  2754234518U,	// <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2>
+  3706472290U,	// <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0>
+  3700500630U,	// <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2>
+  2754234544U,	// <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1>
+  3376383766U,	// <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5>
+  3769770513U,	// <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7>
+  3376383930U,	// <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7>
+  2754234581U,	// <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2>
+  2311275414U,	// <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0>
+  2305967971U,	// <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1>
+  2692047787U,	// <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3>
+  2311276146U,	// <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3>
+  2311275418U,	// <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4>
+  3765789807U,	// <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1>
+  3765789939U,	// <5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7>
+  2311276474U,	// <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7>
+  2696029585U,	// <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3>
+  2311288709U,	// <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  3765790243U,	// <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5>
+  3827976513U,	// <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2>
+  2765736268U,	// <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4>
+  2246248962U,	// <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6>
+  3765790563U,	// <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1>
+  3827976550U,	// <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3>
+  3842795887U,	// <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3>
+  2769054073U,	// <5,3,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <3,2,u,4>
+  3827976575U,	// <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1>
+  3765790963U,	// <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5>
+  3839478162U,	// <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2>
+  2754234780U,	// <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3>
+  2771708327U,	// <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5>
+  3363137059U,	// <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5>
+  3375081320U,	// <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6>
+  3363137466U,	// <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7>
+  2772003275U,	// <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5>
+  2772077012U,	// <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5>
+  3765791714U,	// <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0>
+  2709965878U,	// <5,3,4,2>: Cost 3 vsldoi8 <4,2,5,3>, <4,2,5,3>
+  2772298223U,	// <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5>
+  2772371960U,	// <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5>
+  2754234882U,	// <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6>
+  3839478282U,	// <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5>
+  3376416698U,	// <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7>
+  2754234909U,	// <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6>
+  2311308182U,	// <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,0>
+  3765792421U,	// <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5>
+  2715938575U,	// <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3>
+  2311308914U,	// <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3>
+  2311308186U,	// <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4>
+  2248182354U,	// <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5>
+  3765792837U,	// <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7>
+  2311309242U,	// <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7>
+  2311308190U,	// <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u>
+  2632777830U,	// <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3706520372U,	// <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1>
+  2632779624U,	// <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6>
+  2632780290U,	// <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6>
+  2632781110U,	// <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS
+  2248952413U,	// <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7>
+  2302691176U,	// <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302691258U,	// <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7>
+  2632783662U,	// <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3365823382U,	// <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0>
+  3706529011U,	// <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7>
+  3706529641U,	// <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7>
+  3365824114U,	// <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3>
+  2774362859U,	// <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5>
+  3365824035U,	// <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5>
+  3383740183U,	// <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6>
+  3363833786U,	// <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7>
+  2774657807U,	// <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5>
+  2632794214U,	// <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS
+  2754235166U,	// <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2>
+  2632796010U,	// <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u>
+  2632796676U,	// <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u>
+  2632797494U,	// <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS
+  2754235206U,	// <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6>
+  2302691176U,	// <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302707642U,	// <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7>
+  2754235229U,	// <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2>
+  3765133325U,	// <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, <0,0,1,4>
+  2705326182U,	// <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  3718489806U,	// <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5>
+  3718490624U,	// <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4>
+  2709307730U,	// <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5>
+  2302641870U,	// <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5>
+  3376383695U,	// <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6>
+  3384351018U,	// <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7>
+  2705326749U,	// <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  2305971057U,	// <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0>
+  3765134171U,	// <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4>
+  3766461338U,	// <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4>
+  3766461437U,	// <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4>
+  2311277776U,	// <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4>
+  2754235362U,	// <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0>
+  3783050483U,	// <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7>
+  3385019036U,	// <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7>
+  2311276241U,	// <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u>
+  3718504550U,	// <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS
+  3783050787U,	// <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5>
+  3773097576U,	// <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2>
+  2705327822U,	// <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  3773097767U,	// <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4>
+  2765737014U,	// <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3>
+  3779069882U,	// <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7>
+  3376401052U,	// <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7>
+  2245881370U,	// <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1>
+  3779070102U,	// <5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2>
+  3363135525U,	// <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1>
+  3779070284U,	// <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4>
+  3779070364U,	// <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3>
+  2705328640U,	// <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4>
+  2307311310U,	// <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5>
+  3866021012U,	// <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7>
+  3363138204U,	// <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7>
+  2707983172U,	// <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4>
+  2708646805U,	// <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4>
+  2709310438U,	// <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4>
+  3779071030U,	// <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3>
+  2710637704U,	// <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4>
+  2754235600U,	// <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4>
+  1704676570U,	// <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5>
+  3779071358U,	// <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7>
+  2713292236U,	// <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4>
+  1704897781U,	// <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5>
+  2626871398U,	// <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS
+  2626872471U,	// <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5>
+  2765737230U,	// <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3>
+  3700615318U,	// <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2>
+  2626874678U,	// <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS
+  1174441270U,	// <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS
+  1680493878U,	// <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  3385051804U,	// <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7>
+  1680493896U,	// <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2248952722U,	// <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1>
+  2302692152U,	// <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  3382406107U,	// <5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2>
+  3700623874U,	// <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6>
+  2248953040U,	// <5,4,6,4>: Cost 3 vmrghw <5,6,7,0>, <4,4,4,4>
+  1175211318U,	// <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3376432280U,	// <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6>
+  2729218934U,	// <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5>
+  1175211561U,	// <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3787035642U,	// <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2>
+  3365822501U,	// <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1>
+  3808933085U,	// <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4>
+  3784381707U,	// <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5>
+  2713294182U,	// <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6>
+  2309998286U,	// <5,4,7,5>: Cost 3 vmrglw <4,6,5,7>, <2,3,4,5>
+  3383740111U,	// <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6>
+  3787036239U,	// <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5>
+  2731873960U,	// <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4>
+  2626895974U,	// <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS
+  2626897050U,	// <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u>
+  2644813518U,	// <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5>
+  2705327822U,	// <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  2626899254U,	// <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS
+  1707331102U,	// <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5>
+  1680494121U,	// <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2737183024U,	// <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4>
+  1680494139U,	// <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2302642684U,	// <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0>
+  1640218726U,	// <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  3376384510U,	// <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2>
+  3376385078U,	// <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3>
+  2754236002U,	// <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1>
+  2717942242U,	// <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5>
+  2244907106U,	// <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  3376385406U,	// <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7>
+  1640219293U,	// <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2305969365U,	// <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0>
+  1237536282U,	// <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2713961366U,	// <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0>
+  3766469630U,	// <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5>
+  2782326455U,	// <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5>
+  2311277786U,	// <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5>
+  2311277058U,	// <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6>
+  3385017587U,	// <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7>
+  1237536282U,	// <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  3376400892U,	// <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0>
+  3827977963U,	// <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3>
+  2302659070U,	// <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2>
+  2765737726U,	// <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4>
+  3839479558U,	// <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3>
+  2781073167U,	// <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3>
+  2713962426U,	// <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7>
+  3376401790U,	// <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7>
+  2769055531U,	// <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4>
+  2713962646U,	// <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2>
+  3765143786U,	// <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5>
+  3839479621U,	// <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3>
+  2289394603U,	// <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, <1,2,5,3>
+  2713963010U,	// <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6>
+  2313285150U,	// <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5>
+  3363138050U,	// <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6>
+  3363136755U,	// <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7>
+  2713963294U,	// <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2>
+  2713963410U,	// <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1>
+  3827978127U,	// <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5>
+  3839479704U,	// <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5>
+  3376417846U,	// <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, <4,2,5,3>
+  1637567706U,	// <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5>
+  1640222006U,	// <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS
+  2310640998U,	// <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6>
+  3376418174U,	// <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7>
+  1640222238U,	// <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5>
+  1577091174U,	// <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  2311310226U,	// <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1>
+  2713964303U,	// <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3>
+  2311311119U,	// <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3>
+  1577094454U,	// <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,5,5>: Cost 1 vspltisw1 RHS
+  2311309826U,	// <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6>
+  2311311447U,	// <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7>
+  296144182U,	// <5,5,5,u>: Cost 1 vspltisw1 RHS
+  2248953460U,	// <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1>
+  2326580114U,	// <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1>
+  2713965050U,	// <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3>
+  3700697602U,	// <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6>
+  2785644620U,	// <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5>
+  2781073495U,	// <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7>
+  1228950018U,	// <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965390U,	// <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1>
+  1228950018U,	// <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965562U,	// <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2>
+  3383741330U,	// <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1>
+  3718620878U,	// <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5>
+  3365823403U,	// <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3>
+  2713965926U,	// <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6>
+  2717947318U,	// <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5>
+  3365825026U,	// <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6>
+  2292081907U,	// <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7>
+  2713966210U,	// <5,5,7,u>: Cost 3 vsldoi8 <4,u,5,5>, <7,u,1,2>
+  1577091174U,	// <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1640224558U,	// <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2713966469U,	// <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0>
+  2713966524U,	// <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1>
+  1577094454U,	// <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,u,5>: Cost 1 vspltisw1 RHS
+  1228950018U,	// <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713966848U,	// <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1>
+  296144182U,	// <5,5,u,u>: Cost 1 vspltisw1 RHS
+  2705342464U,	// <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0>
+  1631600742U,	// <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3773112493U,	// <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2>
+  2705342720U,	// <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4>
+  2705342802U,	// <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5>
+  3779084708U,	// <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6>
+  3779084790U,	// <5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7>
+  2302643510U,	// <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631601309U,	// <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3767141092U,	// <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2>
+  2705343284U,	// <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1>
+  2705343382U,	// <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0>
+  3779085282U,	// <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4>
+  2693399632U,	// <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6>
+  3767805089U,	// <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6>
+  2311279416U,	// <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6>
+  1237536054U,	// <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1237536055U,	// <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS
+  3773113789U,	// <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, <2,0,1,2>
+  3779085855U,	// <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1>
+  2699372136U,	// <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2>
+  2705344166U,	// <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1>
+  2699372329U,	// <5,6,2,4>: Cost 3 vsldoi8 <2,4,5,6>, <2,4,5,6>
+  2705344360U,	// <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6>
+  2705344442U,	// <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7>
+  2302659894U,	// <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2702026861U,	// <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6>
+  2705344662U,	// <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2>
+  3767142661U,	// <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5>
+  3773114689U,	// <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2>
+  2705344924U,	// <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3>
+  1631603202U,	// <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6>
+  3842945597U,	// <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7>
+  3779086962U,	// <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1>
+  2289397046U,	// <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634257734U,	// <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6>
+  2644926566U,	// <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS
+  3779087306U,	// <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3>
+  2790142577U,	// <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5>
+  2644929026U,	// <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6>
+  2711317723U,	// <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6>
+  1631604022U,	// <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  2712644989U,	// <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6>
+  2302676278U,	// <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631604265U,	// <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  3842945708U,	// <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1>
+  3767144133U,	// <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1>
+  2705346328U,	// <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3>
+  3779088207U,	// <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4>
+  2717290420U,	// <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6>
+  2705346574U,	// <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6>
+  2705346596U,	// <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1>
+  1237568822U,	// <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  1237568823U,	// <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS
+  2650914918U,	// <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS
+  3364490949U,	// <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1>
+  2248954362U,	// <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3>
+  2302693144U,	// <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3>
+  2650918198U,	// <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS
+  2650918926U,	// <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6>
+  2302693390U,	// <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6>
+  1228950838U,	// <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228950839U,	// <5,6,6,u>: Cost 2 vmrglw <3,4,5,6>, RHS
+  497467494U,	// <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571210036U,	// <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571210856U,	// <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571211414U,	// <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497470774U,	// <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571213316U,	// <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571213818U,	// <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571214956U,	// <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7>
+  497473326U,	// <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497475686U,	// <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631606574U,	// <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  1571219048U,	// <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571219606U,	// <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497478967U,	// <5,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631606938U,	// <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  1571222010U,	// <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1228967222U,	// <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497481518U,	// <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS
+  3768475648U,	// <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0>
+  2694733926U,	// <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  3718711395U,	// <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5>
+  3384349178U,	// <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3>
+  2694734162U,	// <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5>
+  3384347884U,	// <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5>
+  3730658026U,	// <5,7,0,6>: Cost 4 vsldoi4 <6,5,7,0>, <6,5,7,0>
+  3718714362U,	// <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2>
+  2694734493U,	// <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2311278690U,	// <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0>
+  2305970923U,	// <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1>
+  3768476566U,	// <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0>
+  2311279098U,	// <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3>
+  2311278694U,	// <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4>
+  3768476783U,	// <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1>
+  2694735091U,	// <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7>
+  2311279426U,	// <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7>
+  2696062357U,	// <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7>
+  3383701602U,	// <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0>
+  3768477219U,	// <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5>
+  3768477288U,	// <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2>
+  2309960186U,	// <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3383701606U,	// <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4>
+  3768477545U,	// <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7>
+  3766486970U,	// <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7>
+  3383702338U,	// <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7>
+  2309960186U,	// <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3768477846U,	// <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2>
+  3768477975U,	// <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5>
+  3786393932U,	// <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4>
+  3768478108U,	// <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3>
+  2795599115U,	// <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5>
+  3385037470U,	// <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5>
+  3780422309U,	// <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7>
+  3848107301U,	// <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4>
+  2795894063U,	// <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5>
+  2795967800U,	// <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5>
+  3768478690U,	// <5,7,4,1>: Cost 4 vsldoi8 <1,6,5,7>, <4,1,5,0>
+  3718744163U,	// <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, <2,u,4,5>
+  3784404107U,	// <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7>
+  2796262748U,	// <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5>
+  2694737206U,	// <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2712653182U,	// <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7>
+  2713316815U,	// <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7>
+  2694737449U,	// <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2311311458U,	// <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0>
+  3768479433U,	// <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5>
+  3768479521U,	// <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3>
+  2311311866U,	// <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3>
+  2311311462U,	// <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4>
+  2248185270U,	// <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5>
+  2718625879U,	// <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7>
+  2311312194U,	// <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7>
+  2311311466U,	// <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u>
+  2248954874U,	// <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2>
+  3322696778U,	// <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1>
+  2248955028U,	// <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2656963074U,	// <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6>
+  2248955238U,	// <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6>
+  2248955329U,	// <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7>
+  2656965360U,	// <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6>
+  2248955500U,	// <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7>
+  2248955522U,	// <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2>
+  3718766694U,	// <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS
+  3724739827U,	// <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7>
+  3718768739U,	// <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5>
+  3365826337U,	// <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3>
+  2798253647U,	// <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5>
+  3365826258U,	// <5,7,7,5>: Cost 4 vmrglw <1,6,5,7>, <5,1,7,5>
+  3730715377U,	// <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7>
+  2310665836U,	// <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7>
+  2798548595U,	// <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5>
+  2311336034U,	// <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0>
+  2694739758U,	// <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2248955028U,	// <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2311336442U,	// <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3>
+  2311336038U,	// <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4>
+  2694740122U,	// <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2656981746U,	// <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u>
+  2311336770U,	// <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7>
+  2694740325U,	// <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2705358848U,	// <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0>
+  1631617126U,	// <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2310607866U,	// <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  2302640284U,	// <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  2754238189U,	// <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1>
+  2305296114U,	// <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5>
+  2244907106U,	// <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  2302643528U,	// <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631617693U,	// <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2627133542U,	// <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS
+  1237536282U,	// <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  1680496430U,	// <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1237532828U,	// <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  2693416018U,	// <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u>
+  2756892486U,	// <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0>
+  2694743284U,	// <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u>
+  1237536072U,	// <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1680496484U,	// <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2311288709U,	// <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  2245883694U,	// <5,u,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2699388520U,	// <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2>
+  2754238344U,	// <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3>
+  2699388715U,	// <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u>
+  2757408666U,	// <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3>
+  2705360826U,	// <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7>
+  2302659912U,	// <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2754238389U,	// <5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3>
+  2754238396U,	// <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1>
+  3827980229U,	// <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1>
+  2644625102U,	// <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393820U,	// <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1631619588U,	// <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u>
+  2785056749U,	// <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5>
+  3363138077U,	// <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6>
+  2289397064U,	// <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634274120U,	// <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u>
+  1634937753U,	// <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u>
+  1728272410U,	// <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5>
+  2710006843U,	// <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u>
+  2765740076U,	// <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5>
+  1637592285U,	// <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u>
+  1631620406U,	// <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  2712661375U,	// <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u>
+  2302676296U,	// <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631620649U,	// <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  1577091174U,	// <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1174443822U,	// <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2766035058U,	// <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3>
+  1237565596U,	// <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  1577094454U,	// <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,u,5,5>: Cost 1 vspltisw1 RHS
+  1680496794U,	// <5,u,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1237568840U,	// <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  296144182U,	// <5,u,5,u>: Cost 1 vspltisw1 RHS
+  2633146470U,	// <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS
+  1175213870U,	// <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2633148309U,	// <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6>
+  1228947612U,	// <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  2633149750U,	// <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS
+  1175214234U,	// <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  1228950018U,	// <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  1228950856U,	// <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228947617U,	// <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  497614950U,	// <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571357492U,	// <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571358312U,	// <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571358870U,	// <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497618248U,	// <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571360772U,	// <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571361274U,	// <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571361786U,	// <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2>
+  497620782U,	// <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497623142U,	// <5,u,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631622958U,	// <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  1680496997U,	// <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1228963996U,	// <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  497626441U,	// <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS
+  296144182U,	// <5,u,u,5>: Cost 1 vspltisw1 RHS
+  1680497037U,	// <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1228967240U,	// <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497628974U,	// <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS
+  2772451328U,	// <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0>
+  2772451338U,	// <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1>
+  3771146417U,	// <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6>
+  3383095739U,	// <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3>
+  3846193189U,	// <6,0,0,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,0,4,1>
+  3724832803U,	// <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0>
+  3383095985U,	// <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6>
+  3383096067U,	// <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7>
+  2772451401U,	// <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1>
+  2651095142U,	// <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS
+  2251612262U,	// <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS
+  1698709606U,	// <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2651097602U,	// <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6>
+  2651098422U,	// <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS
+  2651099172U,	// <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1>
+  2657071869U,	// <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1>
+  3724841978U,	// <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2>
+  1698709660U,	// <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2252292096U,	// <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0>
+  1178550374U,	// <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3826655418U,	// <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6>
+  3777783485U,	// <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6>
+  2252292434U,	// <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5>
+  3785746280U,	// <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6>
+  2252292593U,	// <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2>
+  3736794583U,	// <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2>
+  1178550941U,	// <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3375153152U,	// <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0>
+  2772451584U,	// <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4>
+  3777784163U,	// <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0>
+  3846193426U,	// <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4>
+  2712005122U,	// <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6>
+  3724857382U,	// <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3>
+  3802335864U,	// <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7>
+  3801672410U,	// <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6>
+  2772451647U,	// <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4>
+  3383123968U,	// <6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0>
+  2772451666U,	// <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5>
+  3773803577U,	// <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6>
+  3724864002U,	// <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6>
+  3846193517U,	// <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5>
+  2712005935U,	// <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0>
+  3327009265U,	// <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2>
+  3383126648U,	// <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7>
+  2772451729U,	// <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5>
+  3373178880U,	// <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0>
+  2254266470U,	// <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS
+  3785748248U,	// <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3>
+  3790393190U,	// <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0>
+  3328000338U,	// <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5>
+  3785748494U,	// <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6>
+  3785748516U,	// <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1>
+  3379153528U,	// <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7>
+  2254267037U,	// <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS
+  2254897152U,	// <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0>
+  1181155430U,	// <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  3785748923U,	// <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3>
+  3785749042U,	// <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5>
+  2254897490U,	// <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5>
+  3785749169U,	// <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6>
+  2724614962U,	// <6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0>
+  3787739982U,	// <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1>
+  1181155997U,	// <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1235664896U,	// <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235666598U,	// <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  3712943720U,	// <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2>
+  2639202936U,	// <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7>
+  2639203638U,	// <6,0,7,4>: Cost 3 vsldoi4 <3,6,0,7>, RHS
+  2309409236U,	// <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  3712946517U,	// <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0>
+  2309409400U,	// <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235666605U,	// <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  1235673088U,	// <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235674790U,	// <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  1698710173U,	// <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2639211129U,	// <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u>
+  2639211830U,	// <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS
+  2712008858U,	// <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS
+  2657129220U,	// <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u>
+  2309417592U,	// <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1698710227U,	// <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  3775799296U,	// <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0>
+  2702057574U,	// <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3373143763U,	// <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2>
+  3695045122U,	// <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6>
+  3775799634U,	// <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5>
+  3383091538U,	// <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5>
+  3368493233U,	// <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6>
+  3362522319U,	// <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7>
+  2702058141U,	// <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3834250027U,	// <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1>
+  2772452148U,	// <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  3832038210U,	// <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6>
+  3373150660U,	// <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3>
+  3834250067U,	// <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5>
+  3373146450U,	// <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5>
+  3826656102U,	// <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6>
+  3362530511U,	// <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7>
+  2772452148U,	// <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  2669092966U,	// <6,1,2,0>: Cost 3 vsldoi4 <u,6,1,2>, LHS
+  2252292916U,	// <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1>
+  2252293014U,	// <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0>
+  2772452246U,	// <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0>
+  2669096246U,	// <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS
+  3846194091U,	// <6,1,2,5>: Cost 4 vsldoi12 <3,4,5,6>, <1,2,5,3>
+  2702059450U,	// <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7>
+  3870081978U,	// <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0>
+  2702059633U,	// <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1>
+  3775801494U,	// <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2>
+  3777128723U,	// <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1>
+  3775801702U,	// <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3>
+  3775801756U,	// <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3>
+  3775801858U,	// <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6>
+  3375153490U,	// <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5>
+  3826656265U,	// <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7>
+  3775802051U,	// <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1>
+  3775802142U,	// <6,1,3,u>: Cost 4 vsldoi8 <2,u,6,1>, <3,u,1,2>
+  3846194206U,	// <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1>
+  3846194219U,	// <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5>
+  3846194228U,	// <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5>
+  3846194236U,	// <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4>
+  3846194246U,	// <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5>
+  2760508496U,	// <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6>
+  3368526001U,	// <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6>
+  3870082144U,	// <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4>
+  2760729707U,	// <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6>
+  2714668660U,	// <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1>
+  3834619005U,	// <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6>
+  3834692742U,	// <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6>
+  3846194317U,	// <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4>
+  3834840216U,	// <6,1,5,4>: Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6>
+  3834913953U,	// <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6>
+  2719977570U,	// <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0>
+  3367208143U,	// <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7>
+  2719977724U,	// <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1>
+  2669125734U,	// <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS
+  2254897972U,	// <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1>
+  2254898070U,	// <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0>
+  3775803929U,	// <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7>
+  2669129014U,	// <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS
+  2322006354U,	// <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5>
+  2725950264U,	// <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6>
+  3793720142U,	// <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1>
+  2254898556U,	// <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0>
+  2627330150U,	// <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS
+  1235664906U,	// <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235667094U,	// <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309406894U,	// <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2627333430U,	// <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS
+  1235665234U,	// <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309406897U,	// <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309407222U,	// <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235664913U,	// <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  2627338342U,	// <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS
+  1235673098U,	// <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235675286U,	// <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2772452732U,	// <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0>
+  2627341622U,	// <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS
+  1235673426U,	// <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309415089U,	// <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309415414U,	// <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235673105U,	// <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  3324683725U,	// <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0>
+  2725290086U,	// <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS
+  3771162801U,	// <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6>
+  2309349478U,	// <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3730951478U,	// <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS
+  3840738784U,	// <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1>
+  3842655721U,	// <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1>
+  3736925671U,	// <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0>
+  2309349483U,	// <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3367840468U,	// <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0>
+  3325355551U,	// <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1>
+  3373147752U,	// <6,2,1,2>: Cost 4 vmrglw <2,u,6,1>, <2,2,2,2>
+  2299404390U,	// <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  3701099830U,	// <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS
+  3767846054U,	// <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2>
+  3826656825U,	// <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0>
+  3373147838U,	// <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7>
+  2299404395U,	// <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2657222758U,	// <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS
+  3771164219U,	// <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2>
+  2766481000U,	// <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2>
+  2772452978U,	// <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3>
+  2657226038U,	// <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS
+  3790407528U,	// <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6>
+  2252294074U,	// <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7>
+  2252294148U,	// <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0>
+  2772453023U,	// <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3>
+  2772453030U,	// <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1>
+  3834250930U,	// <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4>
+  2765596349U,	// <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6>
+  2301411430U,	// <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS
+  2772453070U,	// <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5>
+  2765817560U,	// <6,2,3,5>: Cost 3 vsldoi12 <2,3,5,6>, <2,3,5,6>
+  2252933050U,	// <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7>
+  2796340968U,	// <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4>
+  2766038771U,	// <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6>
+  3725008998U,	// <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS
+  3368530217U,	// <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1>
+  3840222989U,	// <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5>
+  2309382246U,	// <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  3725012278U,	// <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS
+  2766481193U,	// <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6>
+  3842656049U,	// <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5>
+  3327010820U,	// <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0>
+  2766702404U,	// <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6>
+  3713073254U,	// <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS
+  3789082310U,	// <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2>
+  3840665439U,	// <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6>
+  2766997352U,	// <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6>
+  3713076534U,	// <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS
+  3791736842U,	// <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2>
+  3373180605U,	// <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6>
+  3793064108U,	// <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2>
+  2767366037U,	// <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6>
+  3701137510U,	// <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS
+  3701138647U,	// <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6>
+  2254898792U,	// <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2>
+  1248264294U,	// <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  3701140790U,	// <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS
+  3725029435U,	// <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6>
+  2254899130U,	// <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7>
+  2725294981U,	// <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2>
+  1248264299U,	// <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS
+  2633375846U,	// <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS
+  2309407468U,	// <6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235666536U,	// <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161923174U,	// <6,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2633379126U,	// <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS
+  2309407796U,	// <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309408445U,	// <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309407960U,	// <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161923179U,	// <6,2,7,u>: Cost 1 vmrglw RHS, LHS
+  2633384038U,	// <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS
+  2309415660U,	// <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235674728U,	// <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161931366U,	// <6,2,u,3>: Cost 1 vmrglw RHS, LHS
+  2633387318U,	// <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS
+  2769135725U,	// <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6>
+  2309416637U,	// <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309416152U,	// <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161931371U,	// <6,2,u,u>: Cost 1 vmrglw RHS, LHS
+  3777806336U,	// <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0>
+  2704064614U,	// <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  3765862577U,	// <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6>
+  3843393708U,	// <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6>
+  2250516994U,	// <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6>
+  3725054014U,	// <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0>
+  3383093096U,	// <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6>
+  3368495034U,	// <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7>
+  2704065181U,	// <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  2251622550U,	// <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  3777807156U,	// <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1>
+  3765863348U,	// <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3>
+  3373147762U,	// <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3>
+  3834251525U,	// <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5>
+  3373147683U,	// <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5>
+  3391727545U,	// <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6>
+  2299406266U,	// <6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7>
+  2251622550U,	// <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  2252294294U,	// <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2>
+  3326036198U,	// <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1>
+  3771836045U,	// <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3>
+  2252294556U,	// <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3>
+  2252294658U,	// <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6>
+  3840739677U,	// <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3>
+  2704066490U,	// <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7>
+  3368511418U,	// <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7>
+  2252294942U,	// <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2>
+  3707158630U,	// <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS
+  3765864692U,	// <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6>
+  2704066918U,	// <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3>
+  2772453788U,	// <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3>
+  2772453799U,	// <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,4,5>
+  3789752888U,	// <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6>
+  3840739770U,	// <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6>
+  2301413306U,	// <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7>
+  2775108043U,	// <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5>
+  2651340902U,	// <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS
+  3846195674U,	// <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2>
+  3845974503U,	// <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6>
+  2651343362U,	// <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6>
+  2651344182U,	// <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS
+  1698712066U,	// <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6>
+  3383125864U,	// <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6>
+  3368527802U,	// <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, <2,6,3,7>
+  1698933277U,	// <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6>
+  3373179798U,	// <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0>
+  3707176179U,	// <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7>
+  2716012312U,	// <6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3>
+  3373180530U,	// <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3>
+  2254309890U,	// <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6>
+  3785773070U,	// <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6>
+  3840739932U,	// <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6>
+  2299439034U,	// <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7>
+  2719994110U,	// <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3>
+  2254899350U,	// <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2>
+  3328641254U,	// <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1>
+  2633443257U,	// <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6>
+  2254899612U,	// <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3>
+  2254899714U,	// <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6>
+  3785773772U,	// <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6>
+  2725966648U,	// <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6>
+  2322007994U,	// <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7>
+  2254899998U,	// <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2>
+  1559707750U,	// <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  2633450292U,	// <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1>
+  1559709626U,	// <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7>
+  1235666546U,	// <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559711030U,	// <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS
+  2309408291U,	// <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2633454152U,	// <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0>
+  1235666874U,	// <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559713582U,	// <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  1559715942U,	// <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  2633458484U,	// <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1>
+  1559717819U,	// <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u>
+  1235674738U,	// <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559719222U,	// <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS
+  1701366598U,	// <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6>
+  2633462353U,	// <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0>
+  1235675066U,	// <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559721774U,	// <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  3785777152U,	// <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0>
+  2712035430U,	// <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3771179185U,	// <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6>
+  3846196096U,	// <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1>
+  3785777490U,	// <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5>
+  2250517814U,	// <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS
+  3324259703U,	// <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0>
+  3383092458U,	// <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7>
+  2712035997U,	// <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3325356946U,	// <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1>
+  3785777972U,	// <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1>
+  3846196170U,	// <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3>
+  3325365380U,	// <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0>
+  3852168155U,	// <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2>
+  2251615542U,	// <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS
+  3325357432U,	// <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1>
+  3870084088U,	// <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4>
+  2251615785U,	// <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS
+  2252295058U,	// <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1>
+  3771180605U,	// <6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4>
+  3785778792U,	// <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2>
+  3777816253U,	// <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6>
+  2252295376U,	// <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4>
+  1178553654U,	// <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  2252295545U,	// <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2>
+  3326037448U,	// <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0>
+  1178553897U,	// <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS
+  3785779350U,	// <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2>
+  3383118648U,	// <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1>
+  3777816935U,	// <6,4,3,2>: Cost 4 vsldoi8 <3,2,6,4>, <3,2,6,4>
+  3785779612U,	// <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3>
+  2712037890U,	// <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6>
+  2252754230U,	// <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3784452764U,	// <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7>
+  3801705178U,	// <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6>
+  2252754473U,	// <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3787770770U,	// <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1>
+  3383126840U,	// <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1>
+  3327380534U,	// <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3>
+  3784453265U,	// <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4>
+  2253630672U,	// <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4>
+  2778426587U,	// <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6>
+  3383128789U,	// <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6>
+  3381799580U,	// <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7>
+  2778647798U,	// <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6>
+  2651422822U,	// <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS
+  3701277928U,	// <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5>
+  3701278650U,	// <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7>
+  2651425282U,	// <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6>
+  2651426102U,	// <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS
+  2651426892U,	// <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5>
+  1698712886U,	// <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3725169658U,	// <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2>
+  1698712904U,	// <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2254900114U,	// <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1>
+  3389115192U,	// <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1>
+  3785781727U,	// <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3>
+  3785781810U,	// <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5>
+  2254900432U,	// <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4>
+  1181158710U,	// <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2254900605U,	// <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6>
+  3787772750U,	// <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1>
+  1181158953U,	// <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2639495270U,	// <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS
+  2639496090U,	// <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4>
+  3707267011U,	// <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7>
+  2639497884U,	// <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7>
+  1237658832U,	// <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235666638U,	// <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  3713241753U,	// <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0>
+  2309409436U,	// <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235666641U,	// <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  2639503462U,	// <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS
+  2639504282U,	// <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4>
+  3701303226U,	// <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7>
+  2639506077U,	// <6,4,u,3>: Cost 3 vsldoi4 <3,6,4,u>, <3,6,4,u>
+  1235676368U,	// <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235674830U,	// <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  1698713129U,	// <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2309417628U,	// <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1698713147U,	// <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3775832064U,	// <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0>
+  2702090342U,	// <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3775832241U,	// <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6>
+  3719227906U,	// <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6>
+  3775832402U,	// <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5>
+  3385085146U,	// <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5>
+  2309351938U,	// <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6>
+  3376459134U,	// <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7>
+  2702090909U,	// <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3719233546U,	// <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1>
+  3775832884U,	// <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1>
+  3775832982U,	// <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0>
+  3846196909U,	// <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4>
+  3719236984U,	// <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1>
+  3856150209U,	// <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6>
+  3834252997U,	// <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1>
+  3870084817U,	// <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4>
+  3769861532U,	// <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5>
+  2645500006U,	// <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS
+  3719242548U,	// <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1>
+  3775833704U,	// <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2>
+  3775833766U,	// <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1>
+  2645503353U,	// <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2>
+  2252296196U,	// <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5>
+  2702092218U,	// <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7>
+  3719246842U,	// <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2>
+  2702092405U,	// <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5>
+  3775834262U,	// <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2>
+  3777161495U,	// <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5>
+  3775834470U,	// <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3>
+  3775834524U,	// <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3>
+  3775834626U,	// <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6>
+  3385109722U,	// <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5>
+  2309376514U,	// <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3775834819U,	// <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1>
+  2309376514U,	// <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3719258214U,	// <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS
+  3385117586U,	// <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1>
+  3327242008U,	// <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3>
+  3719260674U,	// <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6>
+  3719261563U,	// <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4>
+  2702093622U,	// <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  2309384706U,	// <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6>
+  3870085060U,	// <6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4>
+  2702093865U,	// <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  3719266406U,	// <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS
+  3789106889U,	// <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5>
+  3785789208U,	// <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3>
+  3373183950U,	// <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3>
+  2717355964U,	// <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5>
+  2791772164U,	// <6,5,5,5>: Cost 3 vsldoi12 <6,6,6,6>, <5,5,5,5>
+  2772455438U,	// <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6>
+  3373183549U,	// <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7>
+  2720010496U,	// <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5>
+  2772455460U,	// <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1>
+  2322008978U,	// <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1>
+  3840225335U,	// <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2>
+  2772455490U,	// <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4>
+  2772455500U,	// <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5>
+  2254901252U,	// <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5>
+  2772455520U,	// <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7>
+  2785874024U,	// <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6>
+  2772455532U,	// <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1>
+  2627625062U,	// <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS
+  1235667858U,	// <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309409278U,	// <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309407659U,	// <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627628342U,	// <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS
+  1235668186U,	// <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235667458U,	// <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309407987U,	// <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235667460U,	// <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2627633254U,	// <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS
+  1235676050U,	// <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309417470U,	// <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309415851U,	// <6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627636534U,	// <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS
+  1235676378U,	// <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235675650U,	// <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309416179U,	// <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235675652U,	// <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2309352751U,	// <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0>
+  1650917478U,	// <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  2250584570U,	// <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3>
+  3846197554U,	// <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1>
+  2724659538U,	// <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5>
+  3725275225U,	// <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0>
+  2791772493U,	// <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1>
+  2309352758U,	// <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  1650918045U,	// <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  3325358368U,	// <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1>
+  2299406449U,	// <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1>
+  2724660118U,	// <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0>
+  3373148518U,	// <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3>
+  3834253712U,	// <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5>
+  3373147953U,	// <6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5>
+  2323297080U,	// <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6>
+  2299407670U,	// <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2299407671U,	// <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2252296489U,	// <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1>
+  3326038394U,	// <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1>
+  1178554874U,	// <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724660902U,	// <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1>
+  2252296817U,	// <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5>
+  3840741864U,	// <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3>
+  2252296976U,	// <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2>
+  2785874426U,	// <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3>
+  1178554874U,	// <6,6,2,u>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724661398U,	// <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2>
+  3375154665U,	// <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1>
+  3375154909U,	// <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2>
+  2301413734U,	// <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3>
+  2772455986U,	// <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5>
+  3375154993U,	// <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5>
+  2323313464U,	// <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6>
+  2301414710U,	// <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2301414711U,	// <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2724662162U,	// <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1>
+  3326939559U,	// <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1>
+  2253271546U,	// <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3>
+  3383127346U,	// <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3>
+  2309385523U,	// <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4>
+  1650920758U,	// <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  2724662653U,	// <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6>
+  2309385526U,	// <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  1650921001U,	// <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  3725312102U,	// <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS
+  3373180393U,	// <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1>
+  3791769368U,	// <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3>
+  3373181286U,	// <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3>
+  3725315382U,	// <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS
+  2299439221U,	// <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5>
+  2724663394U,	// <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0>
+  2299440438U,	// <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  2299440439U,	// <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1583808614U,	// <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2254574074U,	// <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3>
+  2322010609U,	// <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3>
+  1583811894U,	// <6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2322010773U,	// <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5>
+  363253046U,	// <6,6,6,6>: Cost 1 vspltisw2 RHS
+  1248267574U,	// <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS
+  363253046U,	// <6,6,6,u>: Cost 1 vspltisw2 RHS
+  2309410095U,	// <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0>
+  2309408233U,	// <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1>
+  2311402373U,	// <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2>
+  2309409126U,	// <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  2309410099U,	// <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4>
+  2309408561U,	// <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5>
+  1237660472U,	// <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  161926454U,	// <6,6,7,7>: Cost 1 vmrglw RHS, RHS
+  161926455U,	// <6,6,7,u>: Cost 1 vmrglw RHS, RHS
+  1583808614U,	// <6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1650923310U,	// <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  1178554874U,	// <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2309417318U,	// <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  1583811894U,	// <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1650923674U,	// <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  363253046U,	// <6,6,u,6>: Cost 1 vspltisw2 RHS
+  161934646U,	// <6,6,u,7>: Cost 1 vmrglw RHS, RHS
+  161934647U,	// <6,6,u,u>: Cost 1 vmrglw RHS, RHS
+  1638318080U,	// <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564576358U,	// <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712060077U,	// <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712060156U,	// <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638318418U,	// <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577865314U,	// <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0>
+  2712060406U,	// <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2651608058U,	// <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2>
+  564576925U,	// <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712060643U,	// <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638318900U,	// <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638318998U,	// <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  3766559753U,	// <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7>
+  2712060971U,	// <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712061039U,	// <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712061135U,	// <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  3373148612U,	// <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7>
+  1638319484U,	// <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712061373U,	// <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712061471U,	// <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638319720U,	// <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638319782U,	// <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712061709U,	// <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712061800U,	// <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1638320058U,	// <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252297836U,	// <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7>
+  1638320187U,	// <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638320278U,	// <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712062182U,	// <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2712062256U,	// <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3>
+  1638320540U,	// <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638320642U,	// <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712062546U,	// <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712062584U,	// <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2712062659U,	// <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1>
+  1638320926U,	// <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638321042U,	// <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712062922U,	// <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712063029U,	// <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712063108U,	// <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638321360U,	// <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564579638U,	// <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712063357U,	// <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6>
+  2712063439U,	// <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7>
+  564579881U,	// <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712063560U,	// <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714054287U,	// <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712063742U,	// <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  3373181295U,	// <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3>
+  2712063924U,	// <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638322180U,	// <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638322274U,	// <6,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  3373181380U,	// <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7>
+  1640313092U,	// <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712064289U,	// <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712064423U,	// <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638322682U,	// <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712064562U,	// <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712064653U,	// <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712064747U,	// <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638323000U,	// <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638323022U,	// <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638323168U,	// <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3>
+  1237659746U,	// <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309411158U,	// <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1>
+  2639718330U,	// <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7>
+  1235669498U,	// <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1237659750U,	// <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309411243U,	// <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5>
+  1583895362U,	// <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7>
+  1235669826U,	// <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  1235669503U,	// <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u>
+  1638323923U,	// <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564582190U,	// <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638324101U,	// <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638324156U,	// <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638324287U,	// <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564582554U,	// <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638324432U,	// <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  1235678018U,	// <6,7,u,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  564582757U,	// <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  1638326272U,	// <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564584550U,	// <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712068269U,	// <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2309349532U,	// <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  1638326610U,	// <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577939051U,	// <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0>
+  2712068598U,	// <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2309352776U,	// <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  564585117U,	// <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712068835U,	// <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638327092U,	// <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1698715438U,	// <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2299404444U,	// <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2712069163U,	// <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712069231U,	// <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712069327U,	// <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  2299407688U,	// <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  1698715492U,	// <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2712069565U,	// <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  1178556206U,	// <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  1638327912U,	// <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638327974U,	// <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712069901U,	// <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  1178556570U,	// <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  1638328250U,	// <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252298496U,	// <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1>
+  1638328379U,	// <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638328470U,	// <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712070374U,	// <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2704107883U,	// <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u>
+  1638328732U,	// <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638328834U,	// <6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712070738U,	// <6,u,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712070776U,	// <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2301414728U,	// <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  1638329118U,	// <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638329234U,	// <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712071114U,	// <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712071221U,	// <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2309382300U,	// <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  1638329552U,	// <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564587831U,	// <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712071545U,	// <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2309385544U,	// <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  564588073U,	// <6,u,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712071752U,	// <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714062479U,	// <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712071934U,	// <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2299437212U,	// <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS
+  2712072116U,	// <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638330372U,	// <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1698715802U,	// <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2299440456U,	// <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1698715820U,	// <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  1583808614U,	// <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1181161262U,	// <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1638330874U,	// <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1248264348U,	// <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  1583811894U,	// <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1181161626U,	// <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  363253046U,	// <6,u,6,6>: Cost 1 vspltisw2 RHS
+  1638331214U,	// <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  363253046U,	// <6,u,6,u>: Cost 1 vspltisw2 RHS
+  1560076390U,	// <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS
+  1235664969U,	// <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1560078311U,	// <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7>
+  161923228U,	// <6,u,7,3>: Cost 1 vmrglw RHS, LHS
+  1560079670U,	// <6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS
+  1235665297U,	// <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235667485U,	// <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  161926472U,	// <6,u,7,7>: Cost 1 vmrglw RHS, RHS
+  161923233U,	// <6,u,7,u>: Cost 1 vmrglw RHS, LHS
+  1560084582U,	// <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS
+  564590382U,	// <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1560086504U,	// <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u>
+  161931420U,	// <6,u,u,3>: Cost 1 vmrglw RHS, LHS
+  1560087862U,	// <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS
+  564590746U,	// <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS
+  363253046U,	// <6,u,u,6>: Cost 1 vspltisw2 RHS
+  161934664U,	// <6,u,u,7>: Cost 1 vmrglw RHS, RHS
+  161931425U,	// <6,u,u,u>: Cost 1 vmrglw RHS, LHS
+  1705426944U,	// <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705426954U,	// <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1>
+  3713550266U,	// <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7>
+  2316063892U,	// <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3>
+  2779168805U,	// <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1>
+  2663698530U,	// <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2657727309U,	// <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0>
+  2316064220U,	// <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7>
+  1705427017U,	// <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1>
+  1583988838U,	// <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS
+  2779168859U,	// <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1>
+  631685222U,	// <7,0,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2639817411U,	// <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1>
+  1583992118U,	// <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS
+  2657734660U,	// <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5>
+  1583993678U,	// <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1>
+  2657735672U,	// <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0>
+  631685276U,	// <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779168933U,	// <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3>
+  2767667377U,	// <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6>
+  2718713448U,	// <7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2>
+  2718713510U,	// <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1>
+  3841409228U,	// <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6>
+  3852910802U,	// <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3>
+  2718713786U,	// <7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7>
+  3847160036U,	// <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3>
+  2767667440U,	// <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6>
+  2718714006U,	// <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2>
+  2779169020U,	// <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0>
+  3852910853U,	// <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0>
+  2718714268U,	// <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3>
+  2718714370U,	// <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6>
+  2718714461U,	// <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7>
+  2706770608U,	// <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0>
+  3847160114U,	// <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0>
+  2779169083U,	// <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0>
+  2718714770U,	// <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1>
+  1705427282U,	// <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5>
+  3713583034U,	// <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7>
+  3713583814U,	// <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4>
+  2779169133U,	// <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5>
+  1644973366U,	// <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  2657760081U,	// <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4>
+  2259468868U,	// <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4>
+  1705427345U,	// <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5>
+  2718715508U,	// <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1>
+  2260123750U,	// <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS
+  3792457451U,	// <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3>
+  3852911024U,	// <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0>
+  2718715836U,	// <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5>
+  2718715908U,	// <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5>
+  1644974178U,	// <7,0,5,6>: Cost 2 vsldoi8 <5,6,7,0>, <5,6,7,0>
+  3792457853U,	// <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0>
+  1646301444U,	// <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0>
+  2720706901U,	// <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0>
+  2779169270U,	// <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7>
+  2718716410U,	// <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3>
+  2722697800U,	// <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0>
+  3852911121U,	// <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7>
+  3852911130U,	// <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7>
+  2718716728U,	// <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6>
+  2718716750U,	// <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1>
+  2779169333U,	// <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7>
+  2718716922U,	// <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2>
+  1187872870U,	// <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2718717076U,	// <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3>
+  3847160408U,	// <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6>
+  2718717286U,	// <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6>
+  2718717377U,	// <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7>
+  2718717404U,	// <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7>
+  2718717478U,	// <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0>
+  1187873437U,	// <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS
+  1584046182U,	// <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS
+  1705427602U,	// <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1>
+  631685789U,	// <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS
+  2639874762U,	// <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u>
+  1584049462U,	// <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS
+  1644976282U,	// <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  1584051029U,	// <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u>
+  2718718208U,	// <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1>
+  631685843U,	// <7,0,u,u>: Cost 1 vsldoi12 RHS, LHS
+  2721374218U,	// <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1>
+  2779169507U,	// <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1>
+  2779169516U,	// <7,1,0,2>: Cost 3 vsldoi12 RHS, <1,0,2,1>
+  3852911348U,	// <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0>
+  2669743414U,	// <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS
+  2316058962U,	// <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5>
+  2316059044U,	// <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6>
+  2669745146U,	// <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2>
+  2779169570U,	// <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1>
+  2779169579U,	// <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1>
+  1705427764U,	// <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169598U,	// <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2>
+  3713632972U,	// <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1>
+  2779169619U,	// <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5>
+  2779169628U,	// <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5>
+  2657809239U,	// <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1>
+  3835290474U,	// <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1>
+  1705427764U,	// <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169660U,	// <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1>
+  2779169671U,	// <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3>
+  2779169680U,	// <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3>
+  1705427862U,	// <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0>
+  2779169700U,	// <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5>
+  2779169707U,	// <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3>
+  2657817432U,	// <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2>
+  2803057594U,	// <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0>
+  1705427907U,	// <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0>
+  3776538827U,	// <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1>
+  2319400970U,	// <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1>
+  2316085398U,	// <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2>
+  3852911591U,	// <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0>
+  3852911600U,	// <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0>
+  2319401298U,	// <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5>
+  3833668617U,	// <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7>
+  3367265487U,	// <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7>
+  2319400977U,	// <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u>
+  2724031378U,	// <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1>
+  2779169835U,	// <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5>
+  2779169844U,	// <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5>
+  3852911672U,	// <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0>
+  2669776182U,	// <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS
+  2779169872U,	// <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6>
+  3835290712U,	// <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5>
+  2669778278U,	// <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6>
+  2779169898U,	// <7,1,4,u>: Cost 3 vsldoi12 RHS, <1,4,u,5>
+  2779169903U,	// <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1>
+  3835585661U,	// <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6>
+  3841410182U,	// <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6>
+  3852911753U,	// <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0>
+  2779169943U,	// <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5>
+  2318754130U,	// <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5>
+  2718724195U,	// <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1>
+  3859178670U,	// <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1>
+  2779169975U,	// <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1>
+  2720715094U,	// <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1>
+  2761549007U,	// <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7>
+  2779170008U,	// <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7>
+  3835438305U,	// <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7>
+  3835512042U,	// <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7>
+  2761843955U,	// <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7>
+  3835659516U,	// <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7>
+  2803057918U,	// <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0>
+  2762065166U,	// <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7>
+  2669797478U,	// <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS
+  2322087946U,	// <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1>
+  2317448186U,	// <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2>
+  3395829934U,	// <7,1,7,3>: Cost 4 vmrglw <6,6,7,7>, <0,2,1,3>
+  2669800758U,	// <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS
+  2322088274U,	// <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5>
+  3375923377U,	// <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6>
+  2731996780U,	// <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7>
+  2322087953U,	// <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u>
+  2779170146U,	// <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1>
+  1705427764U,	// <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779170164U,	// <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1>
+  1705428348U,	// <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0>
+  2779170186U,	// <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5>
+  2763171221U,	// <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7>
+  2657866590U,	// <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u>
+  2803058080U,	// <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0>
+  1705428393U,	// <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0>
+  3713695846U,	// <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS
+  2779170237U,	// <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2>
+  2779170245U,	// <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1>
+  1242316902U,	// <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3713699126U,	// <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS
+  3852912096U,	// <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1>
+  2767668713U,	// <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1>
+  2256488426U,	// <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1>
+  1242316907U,	// <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3852912132U,	// <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1>
+  3852912141U,	// <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1>
+  3852912149U,	// <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0>
+  2779170335U,	// <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1>
+  3852912172U,	// <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5>
+  3840747062U,	// <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6>
+  3841410617U,	// <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0>
+  3795125538U,	// <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0>
+  2779170380U,	// <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1>
+  2779170389U,	// <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1>
+  3852912222U,	// <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1>
+  1705428584U,	// <7,2,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705428594U,	// <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3>
+  2779170429U,	// <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5>
+  3852912259U,	// <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2>
+  2767668880U,	// <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6>
+  3841336981U,	// <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2>
+  1705428639U,	// <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3>
+  1705428646U,	// <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1>
+  2779170479U,	// <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1>
+  2767668925U,	// <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6>
+  1245659238U,	// <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705428686U,	// <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5>
+  2779170519U,	// <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5>
+  2657899362U,	// <7,2,3,6>: Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3>
+  2319406574U,	// <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7>
+  1705428718U,	// <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1>
+  3713728614U,	// <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS
+  3852912388U,	// <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5>
+  2779170573U,	// <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5>
+  1242349670U,	// <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3713731894U,	// <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS
+  2779170601U,	// <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6>
+  2767669041U,	// <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5>
+  3389834456U,	// <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7>
+  1242349675U,	// <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3852912456U,	// <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1>
+  3852912466U,	// <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2>
+  3852912475U,	// <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2>
+  2779170664U,	// <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6>
+  3852912496U,	// <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5>
+  3792474116U,	// <7,2,5,5>: Cost 4 vsldoi8 <5,6,7,2>, <5,5,5,5>
+  2718732388U,	// <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2>
+  3841337228U,	// <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6>
+  2779170709U,	// <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6>
+  2640003174U,	// <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS
+  2721386920U,	// <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2>
+  2767595441U,	// <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7>
+  1693927354U,	// <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7>
+  2640006454U,	// <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS
+  3841558476U,	// <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7>
+  2657923941U,	// <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6>
+  3841337310U,	// <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7>
+  1694296039U,	// <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7>
+  2803058666U,	// <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1>
+  3852912632U,	// <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6>
+  2322089576U,	// <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2>
+  1248346214U,	// <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  3841337362U,	// <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5>
+  3395830836U,	// <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5>
+  2261616570U,	// <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7>
+  3371943857U,	// <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7>
+  1248346219U,	// <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705429051U,	// <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1>
+  2779170884U,	// <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1>
+  1705428584U,	// <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1695254620U,	// <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7>
+  1705429091U,	// <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5>
+  2779170924U,	// <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5>
+  2767669361U,	// <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1>
+  2803058809U,	// <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0>
+  1695623305U,	// <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7>
+  2779170955U,	// <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0>
+  1705429142U,	// <7,3,0,1>: Cost 2 vsldoi12 RHS, <3,0,1,2>
+  2634057732U,	// <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0>
+  2779170983U,	// <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1>
+  2779170992U,	// <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1>
+  3852912829U,	// <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5>
+  2657948520U,	// <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0>
+  2316060602U,	// <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7>
+  1705429205U,	// <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2>
+  3852912860U,	// <7,3,1,0>: Cost 4 vsldoi12 RHS, <3,1,0,0>
+  2779171046U,	// <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1>
+  2779171057U,	// <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3>
+  3852912887U,	// <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0>
+  3852912896U,	// <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0>
+  3852912905U,	// <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0>
+  3835291923U,	// <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1>
+  3841411356U,	// <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1>
+  2779171111U,	// <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3>
+  2779171120U,	// <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3>
+  3852912952U,	// <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2>
+  2779171137U,	// <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2>
+  2779171144U,	// <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0>
+  2779171156U,	// <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3>
+  3852912989U,	// <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3>
+  2767669606U,	// <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3>
+  2767669615U,	// <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3>
+  2779171189U,	// <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0>
+  2779171198U,	// <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0>
+  3852913032U,	// <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1>
+  2704140655U,	// <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3>
+  1705429404U,	// <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171238U,	// <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4>
+  3852913070U,	// <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3>
+  2657973099U,	// <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3>
+  2767669700U,	// <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7>
+  1705429404U,	// <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171280U,	// <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1>
+  2779171290U,	// <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2>
+  2634090504U,	// <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4>
+  2779171311U,	// <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5>
+  2779171319U,	// <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4>
+  1705429506U,	// <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6>
+  2722057593U,	// <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2>
+  2316093370U,	// <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7>
+  1705429533U,	// <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6>
+  3852913185U,	// <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1>
+  3795799695U,	// <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1>
+  3852913203U,	// <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1>
+  3852913214U,	// <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3>
+  3852913225U,	// <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5>
+  2779171410U,	// <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5>
+  2718740581U,	// <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3>
+  3841411685U,	// <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6>
+  2720067847U,	// <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3>
+  2773420664U,	// <7,3,6,0>: Cost 3 vsldoi12 <3,6,0,7>, <3,6,0,7>
+  3847236225U,	// <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7>
+  1648316922U,	// <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3>
+  2773641875U,	// <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7>
+  2773715612U,	// <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7>
+  3847531173U,	// <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7>
+  2722059024U,	// <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2>
+  2767669943U,	// <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7>
+  1652298720U,	// <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3>
+  2767669955U,	// <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1>
+  3841411788U,	// <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1>
+  2767669978U,	// <7,3,7,2>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,2,6>
+  2722059546U,	// <7,3,7,3>: Cost 3 vsldoi8 <6,2,7,3>, <7,3,6,2>
+  2767669995U,	// <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5>
+  3852913396U,	// <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5>
+  2722059758U,	// <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7>
+  2302183354U,	// <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7>
+  2767670027U,	// <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1>
+  2774747930U,	// <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7>
+  1705429790U,	// <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2>
+  1660262316U,	// <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3>
+  1705429404U,	// <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2775042878U,	// <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7>
+  1705429830U,	// <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6>
+  2779171660U,	// <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3>
+  2767670101U,	// <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3>
+  1705429853U,	// <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2>
+  2718744576U,	// <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0>
+  1645002854U,	// <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  3852913527U,	// <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1>
+  3852913536U,	// <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1>
+  2316061904U,	// <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4>
+  1705429906U,	// <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658022257U,	// <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0>
+  2256489928U,	// <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1707420589U,	// <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1>
+  3852913590U,	// <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1>
+  2718745396U,	// <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1>
+  2779171786U,	// <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3>
+  3852913616U,	// <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0>
+  3852913627U,	// <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2>
+  2779171810U,	// <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0>
+  3792487631U,	// <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7>
+  3394456220U,	// <7,4,1,7>: Cost 4 vmrglw <6,4,7,1>, <3,6,4,7>
+  2779171837U,	// <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0>
+  3852913673U,	// <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3>
+  3852913682U,	// <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3>
+  2718746216U,	// <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2>
+  2718746278U,	// <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1>
+  2779171885U,	// <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3>
+  2779171893U,	// <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2>
+  2718746554U,	// <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7>
+  3847457864U,	// <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3>
+  2779171921U,	// <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3>
+  2718746774U,	// <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2>
+  3852913762U,	// <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2>
+  3852913772U,	// <7,4,3,2>: Cost 4 vsldoi12 RHS, <4,3,2,3>
+  2718747036U,	// <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3>
+  2718747138U,	// <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6>
+  2779171972U,	// <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0>
+  2706803380U,	// <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4>
+  3847457946U,	// <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4>
+  2781162655U,	// <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0>
+  2718747538U,	// <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1>
+  3852913842U,	// <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1>
+  3852913852U,	// <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2>
+  2316096696U,	// <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3>
+  1705430224U,	// <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705430234U,	// <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5>
+  2658055029U,	// <7,4,4,6>: Cost 3 vsldoi4 <6,7,4,4>, <6,7,4,4>
+  2316097024U,	// <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7>
+  1707420917U,	// <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5>
+  1584316518U,	// <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS
+  2658059060U,	// <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1>
+  2640144314U,	// <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7>
+  2640145131U,	// <7,4,5,3>: Cost 3 vsldoi4 <3,7,4,5>, <3,7,4,5>
+  1584319798U,	// <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS
+  2779172134U,	// <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0>
+  631688502U,	// <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2658063354U,	// <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2>
+  631688520U,	// <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS
+  3852914001U,	// <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7>
+  3852914010U,	// <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7>
+  2718749178U,	// <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3>
+  2722730572U,	// <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4>
+  2723394205U,	// <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4>
+  2779172221U,	// <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6>
+  2718749496U,	// <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6>
+  2718749518U,	// <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1>
+  2779172249U,	// <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7>
+  2718749690U,	// <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2>
+  3847458214U,	// <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2>
+  2718749880U,	// <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3>
+  3847458236U,	// <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6>
+  2718750004U,	// <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1>
+  1187876150U,	// <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2718750208U,	// <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7>
+  2718750286U,	// <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4>
+  1187876393U,	// <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS
+  1584341094U,	// <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS
+  1645008686U,	// <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  2640168890U,	// <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7>
+  2640169710U,	// <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u>
+  1584344374U,	// <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS
+  1705430554U,	// <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1>
+  631688745U,	// <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS
+  2718750976U,	// <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1>
+  631688763U,	// <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS
+  2646147174U,	// <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS
+  2779172424U,	// <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2>
+  3852914258U,	// <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3>
+  3852914268U,	// <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4>
+  2779172450U,	// <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1>
+  2316061914U,	// <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5>
+  2316061186U,	// <7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6>
+  2646152186U,	// <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2>
+  2779172486U,	// <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1>
+  2781163151U,	// <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1>
+  2321378194U,	// <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1>
+  3852914339U,	// <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3>
+  3852914350U,	// <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5>
+  2781163191U,	// <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5>
+  3852914363U,	// <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0>
+  3835588297U,	// <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5>
+  3835588306U,	// <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5>
+  2781163223U,	// <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1>
+  3852914400U,	// <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1>
+  2781163243U,	// <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3>
+  3852914419U,	// <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2>
+  2779172606U,	// <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4>
+  3780552497U,	// <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5>
+  2781163279U,	// <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2779172632U,	// <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3>
+  3835588385U,	// <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3>
+  2779172650U,	// <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3>
+  3852914481U,	// <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1>
+  2319403922U,	// <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1>
+  2319404409U,	// <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  3852914510U,	// <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3>
+  3779226131U,	// <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5>
+  2319404250U,	// <7,5,3,5>: Cost 3 vmrglw <6,2,7,3>, <4,4,5,5>
+  2319403522U,	// <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6>
+  3852914547U,	// <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4>
+  2319403524U,	// <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u>
+  2646179942U,	// <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS
+  2316094354U,	// <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1>
+  3852914582U,	// <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3>
+  3852914592U,	// <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4>
+  2646183372U,	// <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4>
+  2779172788U,	// <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6>
+  2316093954U,	// <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6>
+  2646185318U,	// <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6>
+  2779172815U,	// <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6>
+  2781163475U,	// <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1>
+  2781163484U,	// <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1>
+  3852914662U,	// <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2>
+  3852914672U,	// <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3>
+  2781163515U,	// <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5>
+  1705431044U,	// <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172878U,	// <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6>
+  3835588632U,	// <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7>
+  1705431044U,	// <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172900U,	// <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1>
+  2781163571U,	// <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7>
+  3852914743U,	// <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2>
+  2779172930U,	// <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4>
+  2779172940U,	// <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5>
+  2781163607U,	// <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  2779172960U,	// <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7>
+  1705431138U,	// <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0>
+  1705578603U,	// <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0>
+  2646204518U,	// <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2322090898U,	// <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1>
+  3719947880U,	// <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2>
+  3719948438U,	// <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2>
+  2646207951U,	// <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7>
+  2322091226U,	// <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5>
+  2322090498U,	// <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6>
+  2646210156U,	// <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7>
+  2646210350U,	// <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2779173062U,	// <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1>
+  2779173072U,	// <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2>
+  2319404409U,	// <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  2779173092U,	// <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4>
+  2779173101U,	// <7,5,u,4>: Cost 3 vsldoi12 RHS, <5,u,4,4>
+  1705431044U,	// <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779173118U,	// <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3>
+  1705578756U,	// <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0>
+  1707421965U,	// <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0>
+  3852914966U,	// <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0>
+  2779173153U,	// <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2>
+  2256491002U,	// <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3>
+  3852914994U,	// <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1>
+  3852915003U,	// <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1>
+  2316062652U,	// <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316063544U,	// <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6>
+  1242320182U,	// <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1242320183U,	// <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS
+  3852915048U,	// <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1>
+  3377866217U,	// <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1>
+  3852915068U,	// <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3>
+  3833672072U,	// <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6>
+  3852915088U,	// <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5>
+  3395122056U,	// <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5>
+  3389813560U,	// <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6>
+  2779173287U,	// <7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1>
+  2779320752U,	// <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1>
+  2658181222U,	// <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS
+  3852915140U,	// <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3>
+  2257973754U,	// <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3>
+  3841413589U,	// <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2>
+  2658184502U,	// <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS
+  3852915176U,	// <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3>
+  2658186117U,	// <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2>
+  1705431546U,	// <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3>
+  1705579011U,	// <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3>
+  3714015334U,	// <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS
+  3777243425U,	// <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6>
+  2319405957U,	// <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2>
+  3375229286U,	// <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3>
+  2779173426U,	// <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5>
+  3375228721U,	// <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5>
+  2319405880U,	// <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6>
+  1245662518U,	// <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1245662519U,	// <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS
+  3852915291U,	// <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1>
+  3389834729U,	// <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1>
+  2259472890U,	// <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3>
+  3852915321U,	// <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4>
+  3852915330U,	// <7,6,4,4>: Cost 4 vsldoi12 RHS, <6,4,4,4>
+  2779173517U,	// <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6>
+  2316096312U,	// <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6>
+  1242352950U,	// <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1242352951U,	// <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS
+  3852915372U,	// <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1>
+  3835294392U,	// <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4>
+  3852915395U,	// <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6>
+  3852915404U,	// <7,6,5,3>: Cost 4 vsldoi12 RHS, <6,5,3,6>
+  3852915412U,	// <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5>
+  3377899313U,	// <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5>
+  2718765160U,	// <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6>
+  2779173611U,	// <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1>
+  2779321076U,	// <7,6,5,u>: Cost 3 vsldoi12 RHS, <6,5,u,1>
+  2658213990U,	// <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS
+  3852915462U,	// <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1>
+  2718765562U,	// <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3>
+  3714042622U,	// <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6>
+  2658217270U,	// <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS
+  2724074224U,	// <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6>
+  1705431864U,	// <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705431874U,	// <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7>
+  1705579339U,	// <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7>
+  1705431886U,	// <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1>
+  2779173719U,	// <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1>
+  2779173729U,	// <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2>
+  2779173736U,	// <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0>
+  1705431926U,	// <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5>
+  2779173759U,	// <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5>
+  2779173765U,	// <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2>
+  1248349494U,	// <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS
+  1705431958U,	// <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1>
+  1705579423U,	// <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1>
+  2779173801U,	// <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2>
+  2779321266U,	// <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2>
+  2779321273U,	// <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0>
+  1705579463U,	// <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5>
+  2779173841U,	// <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6>
+  1705431864U,	// <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705432032U,	// <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3>
+  1705579495U,	// <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1>
+  1242320994U,	// <7,7,0,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705432058U,	// <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2>
+  3841414146U,	// <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1>
+  2316063226U,	// <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3>
+  2779173908U,	// <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1>
+  2658242658U,	// <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0>
+  2658243468U,	// <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0>
+  2316063554U,	// <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7>
+  1705432121U,	// <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2>
+  3852915777U,	// <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1>
+  2779173962U,	// <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1>
+  2779173973U,	// <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3>
+  3389813242U,	// <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3>
+  3852915813U,	// <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1>
+  3852915821U,	// <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0>
+  3835294839U,	// <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1>
+  2329343596U,	// <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7>
+  2779174027U,	// <7,7,1,u>: Cost 3 vsldoi12 RHS, <7,1,u,3>
+  2803061908U,	// <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3>
+  3852915869U,	// <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3>
+  2779174053U,	// <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2>
+  2779174060U,	// <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0>
+  2803061944U,	// <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3>
+  3852915905U,	// <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3>
+  2767672522U,	// <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3>
+  2791855315U,	// <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3>
+  2768999644U,	// <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3>
+  2779174115U,	// <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1>
+  3852915948U,	// <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1>
+  3841414394U,	// <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6>
+  1245663738U,	// <7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174155U,	// <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5>
+  3852915988U,	// <7,7,3,5>: Cost 4 vsldoi12 RHS, <7,3,5,5>
+  2706827959U,	// <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7>
+  2319405890U,	// <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7>
+  1245663738U,	// <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174200U,	// <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5>
+  3852916030U,	// <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2>
+  3714099130U,	// <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7>
+  2316095994U,	// <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3>
+  1242353766U,	// <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705432422U,	// <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6>
+  2658276240U,	// <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4>
+  2316096322U,	// <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7>
+  1705432449U,	// <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6>
+  3852916101U,	// <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1>
+  3854906765U,	// <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0>
+  3852916121U,	// <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3>
+  3389846010U,	// <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3>
+  3852916141U,	// <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5>
+  2779174326U,	// <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5>
+  2779174337U,	// <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7>
+  2329376364U,	// <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7>
+  2779321811U,	// <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7>
+  2658287718U,	// <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS
+  3852916197U,	// <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7>
+  2779174382U,	// <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7>
+  2316112378U,	// <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3>
+  2658290998U,	// <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS
+  3852916233U,	// <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7>
+  1651004226U,	// <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7>
+  2779174420U,	// <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0>
+  1652331492U,	// <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7>
+  1590526054U,	// <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS
+  2328728623U,	// <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1>
+  2724746451U,	// <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3>
+  2322092538U,	// <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3>
+  1590529334U,	// <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS
+  2328728951U,	// <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5>
+  2724746770U,	// <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7>
+  430361910U,	// <7,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,7,u>: Cost 1 vspltisw3 RHS
+  1242320994U,	// <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705580162U,	// <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2>
+  2779321996U,	// <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3>
+  1245663738U,	// <7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  1242353766U,	// <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705580202U,	// <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6>
+  1662949620U,	// <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7>
+  430361910U,	// <7,7,u,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,u,u>: Cost 1 vspltisw3 RHS
+  1705426944U,	// <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705432787U,	// <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2>
+  2316060885U,	// <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2>
+  1242316956U,	// <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  2779174637U,	// <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1>
+  1182750874U,	// <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS
+  2316061213U,	// <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6>
+  1242320200U,	// <7,u,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1705432850U,	// <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2>
+  1584578662U,	// <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS
+  1705427764U,	// <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  631691054U,	// <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2640407307U,	// <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1>
+  1584581942U,	// <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS
+  2779174726U,	// <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0>
+  1584583574U,	// <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1>
+  2779322201U,	// <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1>
+  631691108U,	// <7,u,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779174763U,	// <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1>
+  2779174774U,	// <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3>
+  1705428584U,	// <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705432965U,	// <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0>
+  2779174801U,	// <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3>
+  2779174810U,	// <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3>
+  2767673251U,	// <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3>
+  1705580460U,	// <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3>
+  1705433010U,	// <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0>
+  1705433020U,	// <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1>
+  2779174853U,	// <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1>
+  2767673299U,	// <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6>
+  1245659292U,	// <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705433060U,	// <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5>
+  2779174893U,	// <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5>
+  2706836152U,	// <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u>
+  1245662536U,	// <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1705433092U,	// <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1>
+  2779174925U,	// <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1>
+  1185732398U,	// <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS
+  2316093653U,	// <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2>
+  1242349724U,	// <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  1705430224U,	// <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705433151U,	// <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6>
+  2316093981U,	// <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6>
+  1242352968U,	// <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1705433178U,	// <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6>
+  1584611430U,	// <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS
+  2781165670U,	// <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0>
+  2640439226U,	// <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7>
+  2640440079U,	// <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5>
+  1584614710U,	// <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS
+  1705431044U,	// <7,u,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  631691418U,	// <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2779322525U,	// <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1>
+  631691436U,	// <7,u,5,u>: Cost 1 vsldoi12 RHS, RHS
+  2779175087U,	// <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1>
+  2779175102U,	// <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7>
+  1648357887U,	// <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u>
+  1705433296U,	// <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7>
+  2779175127U,	// <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5>
+  2779175138U,	// <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7>
+  1651012419U,	// <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u>
+  1705580788U,	// <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7>
+  1705433341U,	// <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7>
+  1705580800U,	// <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1>
+  1187878702U,	// <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2768042263U,	// <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6>
+  1248346268U,	// <7,u,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705580840U,	// <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5>
+  1187879066U,	// <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2779322679U,	// <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2>
+  430361910U,	// <7,u,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,u,7,u>: Cost 1 vspltisw3 RHS
+  1705433425U,	// <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1>
+  1705433435U,	// <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2>
+  631691621U,	// <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS
+  1705433451U,	// <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0>
+  1705433465U,	// <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5>
+  1705433475U,	// <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6>
+  631691661U,	// <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS
+  430361910U,	// <7,u,u,7>: Cost 1 vspltisw3 RHS
+  631691675U,	// <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS
+  202162278U,	// <u,0,0,0>: Cost 1 vspltisw0 LHS
+  1678598154U,	// <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2634500154U,	// <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0>
+  2289596269U,	// <u,0,0,3>: Cost 3 vmrglw <1,2,u,0>, <u,2,0,3>
+  1548815670U,	// <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS
+  2663698530U,	// <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2658390942U,	// <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0>
+  2289596597U,	// <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7>
+  202162278U,	// <u,0,0,u>: Cost 1 vspltisw0 LHS
+  1560764518U,	// <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS
+  115720294U,	// <u,0,1,1>: Cost 1 vmrghw LHS, LHS
+  604856427U,	// <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2634508438U,	// <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2>
+  1560767798U,	// <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS
+  2652426438U,	// <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1>
+  1584657311U,	// <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1>
+  2658399226U,	// <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2>
+  604856476U,	// <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696889850U,	// <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0>
+  1190174822U,	// <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  2692245096U,	// <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2>
+  2692245158U,	// <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1>
+  2263916882U,	// <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5>
+  2299709908U,	// <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  2692245434U,	// <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7>
+  2701535281U,	// <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0>
+  1190175389U,	// <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS
+  1209237504U,	// <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1209239206U,	// <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2704189813U,	// <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0>
+  2692245916U,	// <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3>
+  2282981033U,	// <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2664386658U,	// <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0>
+  2691877496U,	// <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  2664388218U,	// <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3>
+  1209239213U,	// <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  2289623040U,	// <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0>
+  1678598482U,	// <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2634532926U,	// <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4>
+  2235580672U,	// <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  1143619922U,	// <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1618505014U,	// <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  2658423714U,	// <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4>
+  2713259464U,	// <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1683243409U,	// <u,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  1192443904U,	// <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  118702182U,	// <u,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2266185901U,	// <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2640513816U,	// <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5>
+  1192444242U,	// <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2718789636U,	// <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5>
+  1645047915U,	// <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0>
+  2664404604U,	// <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5>
+  118702749U,	// <u,0,5,u>: Cost 1 vmrghw RHS, LHS
+  2302910464U,	// <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0>
+  1192886374U,	// <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  2718790138U,	// <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3>
+  2722771537U,	// <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0>
+  2266628434U,	// <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5>
+  2248950180U,	// <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  2718790456U,	// <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6>
+  2718790478U,	// <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1>
+  1192886941U,	// <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1235812352U,	// <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235814054U,	// <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  2728080601U,	// <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0>
+  2640530202U,	// <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7>
+  2640530742U,	// <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS
+  2309556692U,	// <u,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  2730735133U,	// <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0>
+  2309556856U,	// <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235814061U,	// <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  202162278U,	// <u,0,u,0>: Cost 1 vspltisw0 LHS
+  120365158U,	// <u,0,u,1>: Cost 1 vmrghw LHS, LHS
+  604856989U,	// <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2692249532U,	// <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1>
+  1560825142U,	// <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS
+  1618507930U,	// <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  1584714662U,	// <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u>
+  2309565048U,	// <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  604857043U,	// <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  1611210825U,	// <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1>
+  1616519270U,	// <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS
+  2287605459U,	// <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2>
+  2640546588U,	// <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0>
+  2622631222U,	// <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS
+  2289590610U,	// <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5>
+  2664436630U,	// <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1>
+  2664437376U,	// <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0>
+  1616519889U,	// <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1>
+  1548894866U,	// <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1>
+  269271142U,	// <u,1,1,1>: Cost 1 vspltisw1 LHS
+  1189462934U,	// <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2622638230U,	// <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2>
+  1548897590U,	// <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS
+  2756985692U,	// <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  2658472872U,	// <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1>
+  2287614142U,	// <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7>
+  269271142U,	// <u,1,1,u>: Cost 1 vspltisw1 LHS
+  1566818406U,	// <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS
+  2756985735U,	// <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  1148371862U,	// <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  835584U,	// <u,1,2,3>: Cost 0 copy LHS
+  1566821686U,	// <u,1,2,4>: Cost 2 vsldoi4 <3,u,1,2>, RHS
+  2756985771U,	// <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  2690262970U,	// <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7>
+  1590711938U,	// <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2>
+  835584U,	// <u,1,2,u>: Cost 0 copy LHS
+  2282979337U,	// <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1209237514U,	// <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1209239702U,	// <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282979502U,	// <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282979341U,	// <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1209237842U,	// <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282979505U,	// <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287625423U,	// <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1209237521U,	// <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  1635101613U,	// <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1>
+  2289623050U,	// <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1>
+  2289625238U,	// <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2>
+  2640579360U,	// <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4>
+  2622663990U,	// <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS
+  1616522550U,	// <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  2664469398U,	// <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1>
+  2664470148U,	// <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4>
+  1616522793U,	// <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  1548927638U,	// <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5>
+  1192444724U,	// <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1192444822U,	// <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2622670998U,	// <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2>
+  1548930358U,	// <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS
+  1210728786U,	// <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2714153058U,	// <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0>
+  2670449658U,	// <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2>
+  1548932910U,	// <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS
+  2622677655U,	// <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6>
+  2756986063U,	// <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2302912662U,	// <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2>
+  3696421014U,	// <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2>
+  2622680374U,	// <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS
+  2756986099U,	// <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  2714153784U,	// <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6>
+  1651692438U,	// <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1>
+  1652356071U,	// <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1>
+  2628657254U,	// <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS
+  1235812362U,	// <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235814550U,	// <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309554350U,	// <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2628660534U,	// <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS
+  1235812690U,	// <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309554353U,	// <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309554678U,	// <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235812369U,	// <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  1548952217U,	// <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u>
+  269271142U,	// <u,1,u,1>: Cost 1 vspltisw1 LHS
+  1209280662U,	// <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  835584U,	// <u,1,u,3>: Cost 0 copy LHS
+  1548954934U,	// <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS
+  1209278802U,	// <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2283020465U,	// <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  1590761096U,	// <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u>
+  835584U,	// <u,1,u,u>: Cost 0 copy LHS
+  2702876672U,	// <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0>
+  1629134950U,	// <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS
+  2289591912U,	// <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2>
+  1215848550U,	// <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2702877010U,	// <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5>
+  2289222708U,	// <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2779178473U,	// <u,2,0,6>: Cost 3 vsldoi12 RHS, <2,0,6,1>
+  2726249024U,	// <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1215848555U,	// <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2690933539U,	// <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2>
+  2628683124U,	// <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1>
+  1189463656U,	// <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1213866086U,	// <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  2628685110U,	// <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS
+  2263205736U,	// <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6>
+  1189463994U,	// <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2263205866U,	// <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1213866091U,	// <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1556938854U,	// <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2697569869U,	// <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2>
+  336380006U,	// <u,2,2,2>: Cost 1 vspltisw2 LHS
+  1678599794U,	// <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  1556942134U,	// <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2702878650U,	// <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7>
+  2300229831U,	// <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7>
+  336380006U,	// <u,2,2,u>: Cost 1 vspltisw2 LHS
+  475243165U,	// <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1548985140U,	// <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1209239144U,	// <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135495782U,	// <u,2,3,3>: Cost 1 vmrglw LHS, LHS
+  475245878U,	// <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1596764164U,	// <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1596764666U,	// <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1596765178U,	// <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135495787U,	// <u,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2708851630U,	// <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2>
+  2217362979U,	// <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2289624680U,	// <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2>
+  1215881318U,	// <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2726767824U,	// <u,2,4,4>: Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4>
+  1629138230U,	// <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  2779178801U,	// <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5>
+  2726251976U,	// <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1215881323U,	// <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2628714598U,	// <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS
+  2628715896U,	// <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5>
+  1192445544U,	// <u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1213898854U,	// <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2628717878U,	// <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS
+  2726768644U,	// <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5>
+  1192445882U,	// <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2266187754U,	// <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1213898859U,	// <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2634694758U,	// <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS
+  2721460657U,	// <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2>
+  2296940136U,	// <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2>
+  1678600122U,	// <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2634698038U,	// <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS
+  3370682125U,	// <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5>
+  1157056442U,	// <u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725442455U,	// <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2>
+  1678600167U,	// <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  1653027897U,	// <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2>
+  2309554924U,	// <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235813992U,	// <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  162070630U,	// <u,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2634706230U,	// <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS
+  2309555252U,	// <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309555901U,	// <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309555416U,	// <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  162070635U,	// <u,2,7,u>: Cost 1 vmrglw RHS, LHS
+  475284130U,	// <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1549026100U,	// <u,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  336380006U,	// <u,2,u,2>: Cost 1 vspltisw2 LHS
+  135536742U,	// <u,2,u,3>: Cost 1 vmrglw LHS, LHS
+  475286838U,	// <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1629141146U,	// <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  1194108858U,	// <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  1596806138U,	// <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135536747U,	// <u,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611890688U,	// <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  538149020U,	// <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685632685U,	// <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685632764U,	// <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611891026U,	// <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2733408722U,	// <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2658612153U,	// <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0>
+  2289592250U,	// <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7>
+  538149533U,	// <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1189464214U,	// <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  1611891508U,	// <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611891606U,	// <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  1189464476U,	// <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1189464578U,	// <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2690278511U,	// <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2690278607U,	// <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2287609786U,	// <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7>
+  1611892092U,	// <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2685634042U,	// <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0>
+  2685634079U,	// <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611892328U,	// <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611892390U,	// <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2685634371U,	// <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5>
+  2685634453U,	// <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6>
+  1611892666U,	// <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2300225466U,	// <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7>
+  1611892795U,	// <u,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,0,1>
+  1209238422U,	// <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282980247U,	// <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1>
+  1561004120U,	// <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3>
+  403488870U,	// <u,3,3,3>: Cost 1 vspltisw3 LHS
+  1209238426U,	// <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282980899U,	// <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2282985598U,	// <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1209239482U,	// <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  403488870U,	// <u,3,3,u>: Cost 1 vspltisw3 LHS
+  1555038310U,	// <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS
+  1555039616U,	// <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4>
+  2628781672U,	// <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2>
+  2289624690U,	// <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3>
+  1555041590U,	// <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS
+  538152246U,	// <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2658644925U,	// <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4>
+  2289625018U,	// <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7>
+  538152489U,	// <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1192446102U,	// <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2733411983U,	// <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2634762330U,	// <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5>
+  1192446364U,	// <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1192446466U,	// <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  1659670532U,	// <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659670626U,	// <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  2287642554U,	// <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7>
+  1659670788U,	// <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2634768486U,	// <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS
+  2733412775U,	// <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1648390659U,	// <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3>
+  2634770973U,	// <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6>
+  2634771766U,	// <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS
+  2733413099U,	// <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659671352U,	// <u,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659671374U,	// <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1652372457U,	// <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3>
+  1561034854U,	// <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  2634777396U,	// <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1>
+  1561036892U,	// <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7>
+  1235814002U,	// <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1561038134U,	// <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS
+  2309555747U,	// <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2309556072U,	// <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6>
+  1235814330U,	// <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1561040686U,	// <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  1611896531U,	// <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2>
+  538154798U,	// <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1611896712U,	// <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3>
+  403488870U,	// <u,3,u,3>: Cost 1 vspltisw3 LHS
+  1611896895U,	// <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6>
+  538155162U,	// <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1611897040U,	// <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1209280442U,	// <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  538155365U,	// <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  1165118354U,	// <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1618534502U,	// <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  2634795102U,	// <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0>
+  2686451968U,	// <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2692276562U,	// <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5>
+  1705438098U,	// <u,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658685890U,	// <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0>
+  2256489928U,	// <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1618535069U,	// <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1189464978U,	// <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2692277044U,	// <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1>
+  1618535367U,	// <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4>
+  2640775992U,	// <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, <3,u,4,1>
+  1189465296U,	// <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  115723574U,	// <u,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2263207289U,	// <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2664666780U,	// <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1>
+  115723817U,	// <u,4,1,u>: Cost 1 vmrghw LHS, RHS
+  2263919506U,	// <u,4,2,0>: Cost 3 vmrghw <u,2,3,0>, <4,0,5,1>
+  2222115812U,	// <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  2692277864U,	// <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2>
+  2692277926U,	// <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1>
+  2324114640U,	// <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4>
+  1190178102U,	// <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278202U,	// <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7>
+  2701568053U,	// <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4>
+  1190178345U,	// <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278422U,	// <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2>
+  2282981552U,	// <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2704222585U,	// <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4>
+  2692278684U,	// <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3>
+  1257016528U,	// <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1209239246U,	// <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2691910300U,	// <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  2664683166U,	// <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3>
+  1209239249U,	// <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  1573027942U,	// <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS
+  2634826695U,	// <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4>
+  2634827874U,	// <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4>
+  2289629073U,	// <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3>
+  229035318U,	// <u,4,4,4>: Cost 1 vspltisw0 RHS
+  1618537782U,	// <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS
+  2658718662U,	// <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4>
+  2289629401U,	// <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7>
+  229035318U,	// <u,4,4,u>: Cost 1 vspltisw0 RHS
+  1561092198U,	// <u,4,5,0>: Cost 2 vsldoi4 <2,u,4,5>, LHS
+  2628863370U,	// <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5>
+  1561094243U,	// <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5>
+  2634836118U,	// <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2>
+  1561095478U,	// <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS
+  118705462U,	// <u,4,5,5>: Cost 1 vmrghw RHS, RHS
+  604859702U,	// <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2658726906U,	// <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2>
+  604859720U,	// <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2266631058U,	// <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1>
+  2302692152U,	// <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  2718822906U,	// <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3>
+  2722804309U,	// <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4>
+  2723467942U,	// <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4>
+  1192889654U,	// <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2718823224U,	// <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6>
+  2718823246U,	// <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1>
+  1192889897U,	// <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2640822374U,	// <u,4,7,0>: Cost 3 vsldoi4 <3,u,4,7>, LHS
+  2640823194U,	// <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4>
+  2728113373U,	// <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4>
+  2640825150U,	// <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7>
+  1235815632U,	// <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235814094U,	// <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  2730767905U,	// <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4>
+  2309556892U,	// <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235814097U,	// <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  1561116774U,	// <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS
+  1618540334U,	// <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1561118822U,	// <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u>
+  2692282300U,	// <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1>
+  229035318U,	// <u,4,u,4>: Cost 1 vspltisw0 RHS
+  120368438U,	// <u,4,u,5>: Cost 1 vmrghw LHS, RHS
+  604859945U,	// <u,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2309565084U,	// <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  604859963U,	// <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2690293760U,	// <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0>
+  1616552038U,	// <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2640840434U,	// <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5>
+  2640841536U,	// <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0>
+  1613381970U,	// <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2316135642U,	// <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5>
+  2289592834U,	// <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6>
+  2664732324U,	// <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0>
+  1616552661U,	// <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5>
+  1573077094U,	// <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  1237536282U,	// <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2690294678U,	// <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0>
+  2646821014U,	// <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2>
+  1573080602U,	// <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1>
+  1189466116U,	// <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1189466210U,	// <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2646823930U,	// <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2>
+  1573082926U,	// <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  2640855142U,	// <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS
+  2697594448U,	// <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5>
+  2690295400U,	// <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2>
+  1625179890U,	// <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5>
+  2699585347U,	// <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5>
+  2781171471U,	// <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2690295738U,	// <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7>
+  3775318070U,	// <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5>
+  1628498055U,	// <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5>
+  2287627234U,	// <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1257016210U,	// <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2646836942U,	// <u,5,3,2>: Cost 3 vsldoi4 <4,u,5,3>, <2,3,4,5>
+  2287625131U,	// <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287627238U,	// <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1257016538U,	// <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1209240066U,	// <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287625459U,	// <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1209240068U,	// <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  2640871526U,	// <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS
+  2316168082U,	// <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1>
+  2640873202U,	// <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5>
+  2640874308U,	// <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4>
+  1637788917U,	// <u,5,4,4>: Cost 2 vsldoi8 <4,4,u,5>, <4,4,u,5>
+  1616555318U,	// <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  2287638591U,	// <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6>
+  2664765096U,	// <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4>
+  1616555561U,	// <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  1573109862U,	// <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS
+  2646852404U,	// <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1>
+  2646853224U,	// <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2>
+  2287646618U,	// <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3>
+  1573113374U,	// <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5>
+  296144182U,	// <u,5,5,5>: Cost 1 vspltisw1 RHS
+  1192448098U,	// <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2287646946U,	// <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7>
+  296144182U,	// <u,5,5,u>: Cost 1 vspltisw1 RHS
+  1567146086U,	// <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS
+  2628945300U,	// <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6>
+  2634917997U,	// <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6>
+  1567148870U,	// <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6>
+  1567149366U,	// <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS
+  2781171799U,	// <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  1228950018U,	// <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  27705344U,	// <u,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,6,u>: Cost 0 copy RHS
+  2628952166U,	// <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS
+  1235815314U,	// <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309556734U,	// <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309555115U,	// <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2628955446U,	// <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS
+  1235815642U,	// <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235814914U,	// <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309555443U,	// <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235814916U,	// <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  1567162470U,	// <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS
+  1616557870U,	// <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2690299781U,	// <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0>
+  1567165256U,	// <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u>
+  1567165750U,	// <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS
+  296144182U,	// <u,5,u,5>: Cost 1 vspltisw1 RHS
+  1209281026U,	// <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  27705344U,	// <u,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,u,u>: Cost 0 copy RHS
+  2705563648U,	// <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0>
+  1631821926U,	// <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  2262462970U,	// <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3>
+  2646886941U,	// <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6>
+  2705563986U,	// <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5>
+  2316062652U,	// <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316137272U,	// <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6>
+  1215851830U,	// <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  1215851831U,	// <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS
+  2634948710U,	// <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS
+  2705564468U,	// <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1>
+  1189466618U,	// <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2263208498U,	// <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2693620843U,	// <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6>
+  2652868860U,	// <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1>
+  1189466936U,	// <u,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1213869366U,	// <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  1213869367U,	// <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS
+  2658844774U,	// <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS
+  3771344465U,	// <u,6,2,1>: Cost 4 vsldoi8 <2,1,u,6>, <2,1,u,6>
+  1178554874U,	// <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2698929907U,	// <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6>
+  2699593540U,	// <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6>
+  2700257173U,	// <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6>
+  2705565626U,	// <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7>
+  1226485046U,	// <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  1226485047U,	// <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS
+  2705565846U,	// <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2>
+  2330756585U,	// <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2330756829U,	// <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2>
+  2282981734U,	// <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  1631824413U,	// <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6>
+  2652885246U,	// <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3>
+  1257018168U,	// <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135499062U,	// <u,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135499063U,	// <u,6,3,u>: Cost 1 vmrglw LHS, RHS
+  2646917222U,	// <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS
+  2217365931U,	// <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2790167156U,	// <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u>
+  2646919709U,	// <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6>
+  2711538934U,	// <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6>
+  1631825206U,	// <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  2316170040U,	// <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6>
+  1215884598U,	// <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  1215884599U,	// <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS
+  2634981478U,	// <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS
+  2266190247U,	// <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1192448506U,	// <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2266190386U,	// <u,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2634984758U,	// <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS
+  2652901632U,	// <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5>
+  1192448824U,	// <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1213902134U,	// <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1213902135U,	// <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1583808614U,	// <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2718839290U,	// <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3>
+  2670823965U,	// <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6>
+  1583811894U,	// <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2724147961U,	// <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6>
+  363253046U,	// <u,6,6,6>: Cost 1 vspltisw2 RHS
+  1229172022U,	// <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS
+  363253046U,	// <u,6,6,u>: Cost 1 vspltisw2 RHS
+  499458150U,	// <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1573200692U,	// <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1573201512U,	// <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573202070U,	// <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499461673U,	// <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1573203972U,	// <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1235817272U,	// <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  162073910U,	// <u,6,7,7>: Cost 1 vmrglw RHS, RHS
+  162073911U,	// <u,6,7,u>: Cost 1 vmrglw RHS, RHS
+  499466342U,	// <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631827758U,	// <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  1573209704U,	// <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573210262U,	// <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499469866U,	// <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631828122U,	// <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  363253046U,	// <u,6,u,6>: Cost 1 vspltisw2 RHS
+  135540022U,	// <u,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135540023U,	// <u,6,u,u>: Cost 1 vmrglw LHS, RHS
+  1638465536U,	// <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564723814U,	// <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712207533U,	// <u,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712207612U,	// <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638465874U,	// <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1579192580U,	// <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0>
+  2712207862U,	// <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2316137282U,	// <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7>
+  564724381U,	// <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  1189467130U,	// <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  1638466356U,	// <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638466454U,	// <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  2311500282U,	// <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3>
+  1189467494U,	// <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2712208495U,	// <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2694956302U,	// <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7>
+  1189467756U,	// <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1638466940U,	// <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712208829U,	// <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712208927U,	// <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638467176U,	// <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638467238U,	// <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712209165U,	// <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712209256U,	// <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1627187175U,	// <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7>
+  2324116290U,	// <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7>
+  1628514441U,	// <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7>
+  1638467734U,	// <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712209638U,	// <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2700929387U,	// <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u>
+  1638467996U,	// <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638468098U,	// <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712210002U,	// <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  1585189856U,	// <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3>
+  1257018178U,	// <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1638468382U,	// <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638468498U,	// <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712210378U,	// <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712210485U,	// <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712210564U,	// <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638468816U,	// <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564727112U,	// <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712210809U,	// <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2712210888U,	// <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0>
+  564727337U,	// <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  1192449018U,	// <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2714201743U,	// <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712211198U,	// <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2311533050U,	// <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3>
+  1192449382U,	// <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  1638469636U,	// <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638469730U,	// <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  1192449644U,	// <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1638469892U,	// <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712211745U,	// <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712211879U,	// <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638470138U,	// <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712212018U,	// <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712212109U,	// <u,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712212203U,	// <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638470456U,	// <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638470478U,	// <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638470559U,	// <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1>
+  1235816546U,	// <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309558371U,	// <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1>
+  2641045434U,	// <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7>
+  1235816954U,	// <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1235816550U,	// <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309558375U,	// <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5>
+  1585222628U,	// <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7>
+  430361910U,	// <u,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <u,7,7,u>: Cost 1 vspltisw3 RHS
+  1638471379U,	// <u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564729646U,	// <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638471557U,	// <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638471612U,	// <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638471743U,	// <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564730010U,	// <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638471888U,	// <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  430361910U,	// <u,7,u,7>: Cost 1 vspltisw3 RHS
+  564730213U,	// <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  202162278U,	// <u,u,0,0>: Cost 1 vspltisw0 LHS
+  538189985U,	// <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685673645U,	// <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  1215848604U,	// <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  1611931986U,	// <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  1579266317U,	// <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0>
+  2289592861U,	// <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6>
+  1215851848U,	// <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  538190493U,	// <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1549411025U,	// <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1>
+  115726126U,	// <u,u,1,1>: Cost 1 vmrghw LHS, LHS
+  604862254U,	// <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  1213866140U,	// <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1549413686U,	// <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS
+  115726490U,	// <u,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1585247207U,	// <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1>
+  1213869384U,	// <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  604862308U,	// <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  1567334502U,	// <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS
+  1190180654U,	// <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  336380006U,	// <u,u,2,2>: Cost 1 vspltisw2 LHS
+  835584U,	// <u,u,2,3>: Cost 0 copy LHS
+  1567337782U,	// <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS
+  1190181018U,	// <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  1611933626U,	// <u,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1226485064U,	// <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  835584U,	// <u,u,2,u>: Cost 0 copy LHS
+  475685587U,	// <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1209239278U,	// <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1>
+  1209239765U,	// <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135495836U,	// <u,u,3,3>: Cost 1 vmrglw LHS, LHS
+  475688246U,	// <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1209239282U,	// <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5>
+  1209240093U,	// <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135499080U,	// <u,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135495841U,	// <u,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1555406950U,	// <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS
+  1555408301U,	// <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4>
+  2289625301U,	// <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2>
+  1215881372U,	// <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  229035318U,	// <u,u,4,4>: Cost 1 vspltisw0 RHS
+  538193206U,	// <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2289625629U,	// <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6>
+  1215884616U,	// <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  538193449U,	// <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1549443797U,	// <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5>
+  118708014U,	// <u,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1561389191U,	// <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5>
+  1213898908U,	// <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  1549446454U,	// <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS
+  118708378U,	// <u,u,5,5>: Cost 1 vmrghw RHS, RHS
+  604862618U,	// <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  1213902152U,	// <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  604862636U,	// <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  1567367270U,	// <u,u,6,0>: Cost 2 vsldoi4 <3,u,u,6>, LHS
+  1192892206U,	// <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1638478330U,	// <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1679046864U,	// <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  1567370550U,	// <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS
+  1192892570U,	// <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  363253046U,	// <u,u,6,6>: Cost 1 vspltisw2 RHS
+  27705344U,	// <u,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,u,6,u>: Cost 0 copy RHS
+  499605606U,	// <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1235812425U,	// <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1561405577U,	// <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7>
+  162070684U,	// <u,u,7,3>: Cost 1 vmrglw RHS, LHS
+  499609147U,	// <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1235812753U,	// <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235814941U,	// <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  162073928U,	// <u,u,7,7>: Cost 1 vmrglw RHS, RHS
+  162070689U,	// <u,u,7,u>: Cost 1 vmrglw RHS, LHS
+  475726552U,	// <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  538195758U,	// <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  604862821U,	// <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  835584U,	// <u,u,u,3>: Cost 0 copy LHS
+  475729206U,	// <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  538196122U,	// <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  604862861U,	// <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  27705344U,	// <u,u,u,7>: Cost 0 copy RHS
+  835584U,	// <u,u,u,u>: Cost 0 copy LHS
+  0
+};
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
new file mode 100644
index 000000000000..1d61a3a8eac2
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -0,0 +1,618 @@
+//===-- PPCRegisterInfo.cpp - PowerPC Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reginfo"
+#include "PPCRegisterInfo.h"
+#include "PPC.h"
+#include "PPCFrameLowering.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cstdlib>
+
+#define GET_REGINFO_TARGET_DESC
+#include "PPCGenRegisterInfo.inc"
+
+using namespace llvm;
+
+PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
+                                 const TargetInstrInfo &tii)
+  : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
+                       ST.isPPC64() ? 0 : 1,
+                       ST.isPPC64() ? 0 : 1),
+    Subtarget(ST), TII(tii) {
+  ImmToIdxMap[PPC::LD]   = PPC::LDX;    ImmToIdxMap[PPC::STD]  = PPC::STDX;
+  ImmToIdxMap[PPC::LBZ]  = PPC::LBZX;   ImmToIdxMap[PPC::STB]  = PPC::STBX;
+  ImmToIdxMap[PPC::LHZ]  = PPC::LHZX;   ImmToIdxMap[PPC::LHA]  = PPC::LHAX;
+  ImmToIdxMap[PPC::LWZ]  = PPC::LWZX;   ImmToIdxMap[PPC::LWA]  = PPC::LWAX;
+  ImmToIdxMap[PPC::LFS]  = PPC::LFSX;   ImmToIdxMap[PPC::LFD]  = PPC::LFDX;
+  ImmToIdxMap[PPC::STH]  = PPC::STHX;   ImmToIdxMap[PPC::STW]  = PPC::STWX;
+  ImmToIdxMap[PPC::STFS] = PPC::STFSX;  ImmToIdxMap[PPC::STFD] = PPC::STFDX;
+  ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+
+  // 64-bit
+  ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
+  ImmToIdxMap[PPC::LHZ8] = PPC::LHZX8; ImmToIdxMap[PPC::LWZ8] = PPC::LWZX8;
+  ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8;
+  ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX;
+  ImmToIdxMap[PPC::ADDI8] = PPC::ADD8;
+}
+
+/// getPointerRegClass - Return the register class to use to hold pointers.
+/// This is used for addressing modes.
+const TargetRegisterClass *
+PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                       const {
+  if (Kind == 1) {
+    if (Subtarget.isPPC64())
+      return &PPC::G8RC_NOX0RegClass;
+    return &PPC::GPRC_NOR0RegClass;
+  }
+
+  if (Subtarget.isPPC64())
+    return &PPC::G8RCRegClass;
+  return &PPC::GPRCRegClass;
+}
+
+const uint16_t*
+PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  if (Subtarget.isDarwinABI())
+    return Subtarget.isPPC64() ? CSR_Darwin64_SaveList :
+                                 CSR_Darwin32_SaveList;
+
+  return Subtarget.isPPC64() ? CSR_SVR464_SaveList : CSR_SVR432_SaveList;
+}
+
+const uint32_t*
+PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (Subtarget.isDarwinABI())
+    return Subtarget.isPPC64() ? CSR_Darwin64_RegMask :
+                                 CSR_Darwin32_RegMask;
+
+  return Subtarget.isPPC64() ? CSR_SVR464_RegMask : CSR_SVR432_RegMask;
+}
+
+const uint32_t*
+PPCRegisterInfo::getNoPreservedMask() const {
+  // The naming here is inverted: The CSR_NoRegs_Altivec has the
+  // Altivec registers masked so that they're not saved and restored around
+  // instructions with this preserved mask.
+
+  if (!Subtarget.hasAltivec())
+    return CSR_NoRegs_Altivec_RegMask;
+
+  if (Subtarget.isDarwin())
+    return CSR_NoRegs_Darwin_RegMask;
+  return CSR_NoRegs_RegMask;
+}
+
+BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const PPCFrameLowering *PPCFI =
+    static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
+
+  // The ZERO register is not really a register, but the representation of r0
+  // when used in instructions that treat r0 as the constant 0.
+  Reserved.set(PPC::ZERO);
+  Reserved.set(PPC::ZERO8);
+
+  // The FP register is also not really a register, but is the representation
+  // of the frame pointer register used by ISD::FRAMEADDR.
+  Reserved.set(PPC::FP);
+  Reserved.set(PPC::FP8);
+
+  Reserved.set(PPC::R1);
+  Reserved.set(PPC::LR);
+  Reserved.set(PPC::LR8);
+  Reserved.set(PPC::RM);
+
+  // The SVR4 ABI reserves r2 and r13
+  if (Subtarget.isSVR4ABI()) {
+    Reserved.set(PPC::R2);  // System-reserved register
+    Reserved.set(PPC::R13); // Small Data Area pointer register
+  }
+  
+  // On PPC64, r13 is the thread pointer. Never allocate this register.
+  if (Subtarget.isPPC64()) {
+    Reserved.set(PPC::R13);
+
+    Reserved.set(PPC::X1);
+    Reserved.set(PPC::X13);
+
+    if (PPCFI->needsFP(MF))
+      Reserved.set(PPC::X31);
+
+    // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
+    if (Subtarget.isSVR4ABI()) {
+      Reserved.set(PPC::X2);
+    }
+  }
+
+  if (PPCFI->needsFP(MF))
+    Reserved.set(PPC::R31);
+
+  return Reserved;
+}
+
+unsigned
+PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                         MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  const unsigned DefaultSafety = 1;
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case PPC::G8RC_NOX0RegClassID:
+  case PPC::GPRC_NOR0RegClassID: 
+  case PPC::G8RCRegClassID:
+  case PPC::GPRCRegClassID: {
+    unsigned FP = TFI->hasFP(MF) ? 1 : 0;
+    return 32 - FP - DefaultSafety;
+  }
+  case PPC::F8RCRegClassID:
+  case PPC::F4RCRegClassID:
+  case PPC::VRRCRegClassID:
+    return 32 - DefaultSafety;
+  case PPC::CRRCRegClassID:
+    return 8 - DefaultSafety;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+/// lowerDynamicAlloc - Generate the code for allocating an object in the
+/// current frame.  The sequence of code with be in the general form
+///
+///   addi   R0, SP, \#frameSize ; get the address of the previous frame
+///   stwxu  R0, SP, Rnegsize   ; add and update the SP with the negated size
+///   addi   Rnew, SP, \#maxCalFrameSize ; get the top of the allocation
+///
+void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Determine whether 64-bit pointers are used.
+  bool LP64 = Subtarget.isPPC64();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Get the maximum call stack size.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+  // Get the total frame size.
+  unsigned FrameSize = MFI->getStackSize();
+  
+  // Get stack alignments.
+  unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  if (MaxAlign > TargetAlign)
+    report_fatal_error("Dynamic alloca with large aligns not supported");
+
+  // Determine the previous frame's address.  If FrameSize can't be
+  // represented as 16 bits or we need special alignment, then we load the
+  // previous frame's address from 0(SP).  Why not do an addis of the hi? 
+  // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. 
+  // Constructing the constant and adding would take 3 instructions. 
+  // Fortunately, a frame greater than 32K is rare.
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  
+  if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
+      .addReg(PPC::R31)
+      .addImm(FrameSize);
+  } else if (LP64) {
+    BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
+      .addImm(0)
+      .addReg(PPC::X1);
+  } else {
+    BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
+      .addImm(0)
+      .addReg(PPC::R1);
+  }
+  
+  // Grow the stack and update the stack pointer link, then determine the
+  // address of new allocated space.
+  if (LP64) {
+    BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::X1)
+      .addReg(MI.getOperand(1).getReg());
+    if (!MI.getOperand(1).isKill())
+      BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+        .addReg(PPC::X1)
+        .addImm(maxCallFrameSize);
+    else
+      // Implicitly kill the register.
+      BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+        .addReg(PPC::X1)
+        .addImm(maxCallFrameSize)
+        .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill);
+  } else {
+    BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::R1)
+      .addReg(MI.getOperand(1).getReg());
+
+    if (!MI.getOperand(1).isKill())
+      BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+        .addReg(PPC::R1)
+        .addImm(maxCallFrameSize);
+    else
+      // Implicitly kill the register.
+      BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+        .addReg(PPC::R1)
+        .addImm(maxCallFrameSize)
+        .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill);
+  }
+  
+  // Discard the DYNALLOC instruction.
+  MBB.erase(II);
+}
+
+/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of
+/// reserving a whole register (R0), we scrounge for one here. This generates
+/// code like this:
+///
+///   mfcr rA                  ; Move the conditional register into GPR rA.
+///   rlwinm rA, rA, SB, 0, 31 ; Shift the bits left so they are in CR0's slot.
+///   stw rA, FI               ; Store rA to the frame.
+///
+void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; SPILL_CR <SrcReg>, <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = Subtarget.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned SrcReg = MI.getOperand(0).getReg();
+
+  // We need to store the CR in the low 4-bits of the saved value. First, issue
+  // an MFCRpsued to save all of the CRBits and, if needed, kill the SrcReg.
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFCR8pseud : PPC::MFCRpseud), Reg)
+          .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+    
+  // If the saved register wasn't CR0, shift the bits left so that they are in
+  // CR0's slot.
+  if (SrcReg != PPC::CR0) {
+    unsigned Reg1 = Reg;
+    Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+    // rlwinm rA, rA, ShiftBits, 0, 31.
+    BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+      .addReg(Reg1, RegState::Kill)
+      .addImm(getEncodingValue(SrcReg) * 4)
+      .addImm(0)
+      .addImm(31);
+  }
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
+                    .addReg(Reg, RegState::Kill),
+                    FrameIndex);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; <DestReg> = RESTORE_CR <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+
+  bool LP64 = Subtarget.isPPC64();
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+    "RESTORE_CR does not define its destination");
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
+                              Reg), FrameIndex);
+
+  // If the reloaded register isn't CR0, shift the bits right so that they are
+  // in the right CR's slot.
+  if (DestReg != PPC::CR0) {
+    unsigned Reg1 = Reg;
+    Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+
+    unsigned ShiftBits = getEncodingValue(DestReg)*4;
+    // rlwinm r11, r11, 32-ShiftBits, 0, 31.
+    BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
+             .addReg(Reg1, RegState::Kill).addImm(32-ShiftBits).addImm(0)
+             .addImm(31);
+  }
+
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTCRF8 : PPC::MTCRF), DestReg)
+             .addReg(Reg, RegState::Kill);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
+                                          unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; SPILL_VRSAVE <SrcReg>, <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+  unsigned SrcReg = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg)
+          .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
+    
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW))
+                    .addReg(Reg, RegState::Kill),
+                    FrameIndex);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
+                                         unsigned FrameIndex) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; <DestReg> = RESTORE_VRSAVE <offset>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
+  unsigned DestReg = MI.getOperand(0).getReg();
+  assert(MI.definesRegister(DestReg) &&
+    "RESTORE_VRSAVE does not define its destination");
+
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::LWZ),
+                              Reg), FrameIndex);
+
+  BuildMI(MBB, II, dl, TII.get(PPC::MTVRSAVEv), DestReg)
+             .addReg(Reg, RegState::Kill);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+bool
+PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+				      unsigned Reg, int &FrameIdx) const {
+
+  // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
+  // ABI, return true to prevent allocating an additional frame slot.
+  // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0
+  // is arbitrary and will be subsequently ignored.  For 32-bit, we have
+  // previously created the stack slot if needed, so return its FrameIdx.
+  if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) {
+    if (Subtarget.isPPC64())
+      FrameIdx = 0;
+    else {
+      const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+      FrameIdx = FI->getCRSpillFrameIndex();
+    }
+    return true;
+  }
+  return false;
+}
+
+void
+PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                     int SPAdj, unsigned FIOperandNum,
+                                     RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Take into account whether it's an add or mem instruction
+  unsigned OffsetOperandNo = (FIOperandNum == 2) ? 1 : 2;
+  if (MI.isInlineAsm())
+    OffsetOperandNo = FIOperandNum-1;
+
+  // Get the frame index.
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  // Get the frame pointer save index.  Users of this index are primarily
+  // DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+  // Get the instruction opcode.
+  unsigned OpC = MI.getOpcode();
+  
+  // Special case for dynamic alloca.
+  if (FPSI && FrameIndex == FPSI &&
+      (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
+    lowerDynamicAlloc(II);
+    return;
+  }
+
+  // Special case for pseudo-ops SPILL_CR and RESTORE_CR, etc.
+  if (OpC == PPC::SPILL_CR) {
+    lowerCRSpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_CR) {
+    lowerCRRestore(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::SPILL_VRSAVE) {
+    lowerVRSAVESpilling(II, FrameIndex);
+    return;
+  } else if (OpC == PPC::RESTORE_VRSAVE) {
+    lowerVRSAVERestore(II, FrameIndex);
+    return;
+  }
+
+  // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
+
+  bool is64Bit = Subtarget.isPPC64();
+  MI.getOperand(FIOperandNum).ChangeToRegister(TFI->hasFP(MF) ?
+                                              (is64Bit ? PPC::X31 : PPC::R31) :
+                                                (is64Bit ? PPC::X1 : PPC::R1),
+                                              false);
+
+  // Figure out if the offset in the instruction is shifted right two bits. This
+  // is true for instructions like "STD", which the machine implicitly adds two
+  // low zeros to.
+  bool isIXAddr = false;
+  switch (OpC) {
+  case PPC::LWA:
+  case PPC::LD:
+  case PPC::STD:
+    isIXAddr = true;
+    break;
+  }
+
+  // If the instruction is not present in ImmToIdxMap, then it has no immediate
+  // form (and must be r+r).
+  bool noImmForm = !MI.isInlineAsm() && !ImmToIdxMap.count(OpC);
+
+  // Now add the frame object offset to the offset from r1.
+  int Offset = MFI->getObjectOffset(FrameIndex);
+  if (!isIXAddr)
+    Offset += MI.getOperand(OffsetOperandNo).getImm();
+  else
+    Offset += MI.getOperand(OffsetOperandNo).getImm() << 2;
+
+  // If we're not using a Frame Pointer that has been set to the value of the
+  // SP before having the stack size subtracted from it, then add the stack size
+  // to Offset to get the correct offset.
+  // Naked functions have stack size 0, although getStackSize may not reflect that
+  // because we didn't call all the pieces that compute it for naked functions.
+  if (!MF.getFunction()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::Naked))
+    Offset += MFI->getStackSize();
+
+  // If we can, encode the offset directly into the instruction.  If this is a
+  // normal PPC "ri" instruction, any 16-bit value can be safely encoded.  If
+  // this is a PPC64 "ix" instruction, only a 16-bit value with the low two bits
+  // clear can be encoded.  This is extremely uncommon, because normally you
+  // only "std" to a stack slot that is at least 4-byte aligned, but it can
+  // happen in invalid code.
+  if (OpC == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
+      (!noImmForm &&
+       isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0))) {
+    if (isIXAddr)
+      Offset >>= 2;    // The actual encoded value has the low two bits zero.
+    MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // The offset doesn't fit into a single register, scavenge one to build the
+  // offset in.
+
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC;
+  unsigned SRegHi = MF.getRegInfo().createVirtualRegister(RC),
+           SReg = MF.getRegInfo().createVirtualRegister(RC);
+
+  // Insert a set of rA with the full offset value before the ld, st, or add
+  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
+    .addImm(Offset >> 16);
+  BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
+    .addReg(SRegHi, RegState::Kill)
+    .addImm(Offset);
+
+  // Convert into indexed form of the instruction:
+  // 
+  //   sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
+  //   addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0
+  unsigned OperandBase;
+
+  if (noImmForm)
+    OperandBase = 1;
+  else if (OpC != TargetOpcode::INLINEASM) {
+    assert(ImmToIdxMap.count(OpC) &&
+           "No indexed form of load or store available!");
+    unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
+    MI.setDesc(TII.get(NewOpcode));
+    OperandBase = 1;
+  } else {
+    OperandBase = OffsetOperandNo;
+  }
+
+  unsigned StackReg = MI.getOperand(FIOperandNum).getReg();
+  MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
+  MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true);
+}
+
+unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (!Subtarget.isPPC64())
+    return TFI->hasFP(MF) ? PPC::R31 : PPC::R1;
+  else
+    return TFI->hasFP(MF) ? PPC::X31 : PPC::X1;
+}
+
+unsigned PPCRegisterInfo::getEHExceptionRegister() const {
+  return !Subtarget.isPPC64() ? PPC::R3 : PPC::X3;
+}
+
+unsigned PPCRegisterInfo::getEHHandlerRegister() const {
+  return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
new file mode 100644
index 000000000000..7e6683eeb2ef
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -0,0 +1,90 @@
+//===-- PPCRegisterInfo.h - PowerPC Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC32_REGISTERINFO_H
+#define POWERPC32_REGISTERINFO_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "PPC.h"
+
+#define GET_REGINFO_HEADER
+#include "PPCGenRegisterInfo.inc"
+
+namespace llvm {
+class PPCSubtarget;
+class TargetInstrInfo;
+class Type;
+
+class PPCRegisterInfo : public PPCGenRegisterInfo {
+  DenseMap<unsigned, unsigned> ImmToIdxMap;
+  const PPCSubtarget &Subtarget;
+  const TargetInstrInfo &TII;
+public:
+  PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
+  
+  /// getPointerRegClass - Return the register class to use to hold pointers.
+  /// This is used for addressing modes.
+  virtual const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const;
+
+  /// Code Generation virtual methods...
+  const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+  const uint32_t *getCallPreservedMask(CallingConv::ID CC) const;
+  const uint32_t *getNoPreservedMask() const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  /// We require the register scavenger.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const {
+    return true;
+  }
+
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const {
+    return true;
+  }
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+    return true;
+  }
+
+  void lowerDynamicAlloc(MachineBasicBlock::iterator II) const;
+  void lowerCRSpilling(MachineBasicBlock::iterator II,
+                       unsigned FrameIndex) const;
+  void lowerCRRestore(MachineBasicBlock::iterator II,
+                      unsigned FrameIndex) const;
+  void lowerVRSAVESpilling(MachineBasicBlock::iterator II,
+                           unsigned FrameIndex) const;
+  void lowerVRSAVERestore(MachineBasicBlock::iterator II,
+                          unsigned FrameIndex) const;
+
+  bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+			    int &FrameIdx) const;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
new file mode 100644
index 000000000000..57a25f5143fa
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -0,0 +1,232 @@
+//===-- PPCRegisterInfo.td - The PowerPC Register File -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "PPC" in {
+def sub_lt : SubRegIndex;
+def sub_gt : SubRegIndex;
+def sub_eq : SubRegIndex;
+def sub_un : SubRegIndex;
+def sub_32 : SubRegIndex;
+}
+
+
+class PPCReg<string n> : Register<n> {
+  let Namespace = "PPC";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 32-bit general-purpose registers
+class GPR<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// GP8 - One of the 32 64-bit general-purpose registers
+class GP8<GPR SubReg, string n> : PPCReg<n> {
+  let HWEncoding = SubReg.HWEncoding;
+  let SubRegs = [SubReg];
+  let SubRegIndices = [sub_32];
+}
+
+// SPR - One of the 32-bit special-purpose registers
+class SPR<bits<10> num, string n> : PPCReg<n> {
+  let HWEncoding{9-0} = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// VR - One of the 32 128-bit vector registers
+class VR<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// CR - One of the 8 4-bit condition registers
+class CR<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+  let HWEncoding{2-0} = num;
+  let SubRegs = subregs;
+}
+
+// CRBIT - One of the 32 1-bit condition register fields
+class CRBIT<bits<5> num, string n> : PPCReg<n> {
+  let HWEncoding{4-0} = num;
+}
+
+// General-purpose registers
+foreach Index = 0-31 in {
+  def R#Index : GPR<Index, "r"#Index>, DwarfRegNum<[-2, Index]>;
+}
+
+// 64-bit General-purpose registers
+foreach Index = 0-31 in {
+  def X#Index : GP8<!cast<GPR>("R"#Index), "r"#Index>,
+                    DwarfRegNum<[Index, -2]>;
+}
+
+// Floating-point registers
+foreach Index = 0-31 in {
+  def F#Index : FPR<Index, "f"#Index>,
+                DwarfRegNum<[!add(Index, 32), !add(Index, 32)]>;
+}
+
+// Vector registers
+foreach Index = 0-31 in {
+  def V#Index : VR<Index, "v"#Index>,
+                DwarfRegNum<[!add(Index, 77), !add(Index, 77)]>;
+}
+
+// The reprsentation of r0 when treated as the constant 0.
+def ZERO  : GPR<0, "0">;
+def ZERO8 : GP8<ZERO, "0">;
+
+// Representations of the frame pointer used by ISD::FRAMEADDR.
+def FP   : GPR<0 /* arbitrary */, "**FRAME POINTER**">;
+def FP8  : GP8<FP, "**FRAME POINTER**">;
+
+// Condition register bits
+def CR0LT : CRBIT< 0, "0">;
+def CR0GT : CRBIT< 1, "1">;
+def CR0EQ : CRBIT< 2, "2">;
+def CR0UN : CRBIT< 3, "3">;
+def CR1LT : CRBIT< 4, "4">;
+def CR1GT : CRBIT< 5, "5">;
+def CR1EQ : CRBIT< 6, "6">;
+def CR1UN : CRBIT< 7, "7">;
+def CR2LT : CRBIT< 8, "8">;
+def CR2GT : CRBIT< 9, "9">;
+def CR2EQ : CRBIT<10, "10">;
+def CR2UN : CRBIT<11, "11">;
+def CR3LT : CRBIT<12, "12">;
+def CR3GT : CRBIT<13, "13">;
+def CR3EQ : CRBIT<14, "14">;
+def CR3UN : CRBIT<15, "15">;
+def CR4LT : CRBIT<16, "16">;
+def CR4GT : CRBIT<17, "17">;
+def CR4EQ : CRBIT<18, "18">;
+def CR4UN : CRBIT<19, "19">;
+def CR5LT : CRBIT<20, "20">;
+def CR5GT : CRBIT<21, "21">;
+def CR5EQ : CRBIT<22, "22">;
+def CR5UN : CRBIT<23, "23">;
+def CR6LT : CRBIT<24, "24">;
+def CR6GT : CRBIT<25, "25">;
+def CR6EQ : CRBIT<26, "26">;
+def CR6UN : CRBIT<27, "27">;
+def CR7LT : CRBIT<28, "28">;
+def CR7GT : CRBIT<29, "29">;
+def CR7EQ : CRBIT<30, "30">;
+def CR7UN : CRBIT<31, "31">;
+
+// Condition registers
+let SubRegIndices = [sub_lt, sub_gt, sub_eq, sub_un] in {
+def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68, 68]>;
+def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69, 69]>;
+def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70, 70]>;
+def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71, 71]>;
+def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72, 72]>;
+def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73, 73]>;
+def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
+def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
+}
+
+// Link register
+def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
+//let Aliases = [LR] in
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>;
+
+// Count register
+def CTR  : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
+
+// VRsave register
+def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[109]>;
+
+// Carry bit.  In the architecture this is really bit 0 of the XER register
+// (which really is SPR register 1);  this is the only bit interesting to a
+// compiler.
+def CARRY: SPR<1, "ca">;
+
+// FP rounding mode:  bits 30 and 31 of the FP status and control register
+// This is not allocated as a normal register; it appears only in
+// Uses and Defs.  The ABI says it needs to be preserved by a function,
+// but this is not achieved by saving and restoring it as with
+// most registers, it has to be done in code; to make this work all the
+// return and call instructions are described as Uses of RM, so instructions
+// that do nothing but change RM will not get deleted.
+// Also, in the architecture it is not really a SPR; 512 is arbitrary.
+def RM: SPR<512, "**ROUNDING MODE**">;
+
+/// Register classes
+// Allocate volatiles first
+// then nonvolatiles in reverse order since stmw/lmw save from rN to r31
+def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
+                                                (sequence "R%u", 30, 13),
+                                                R31, R0, R1, FP)>;
+
+def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
+                                                (sequence "X%u", 30, 14),
+                                                X31, X13, X0, X1, FP8)>;
+
+// For some instructions r0 is special (representing the value 0 instead of
+// the value in the r0 register), and we use these register subclasses to
+// prevent r0 from being allocated for use by those instructions.
+def GPRC_NOR0 : RegisterClass<"PPC", [i32], 32, (add (sub GPRC, R0), ZERO)>;
+def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)>;
+
+// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
+// ABI the size of the Floating-point register save area is determined by the
+// allocated non-volatile register with the lowest register number, as FP
+// register N is spilled to offset 8 * (32 - N) below the back chain word of the
+// previous stack frame. By allocating non-volatiles in reverse order we make
+// sure that the Floating-point register save area is always as small as
+// possible because there aren't any unused spill slots.
+def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
+                                                (sequence "F%u", 31, 14))>;
+def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
+
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128,
+                         (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
+                             V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
+                             V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
+
+def CRBITRC : RegisterClass<"PPC", [i32], 32,
+  (add CR0LT, CR0GT, CR0EQ, CR0UN,
+       CR1LT, CR1GT, CR1EQ, CR1UN,
+       CR2LT, CR2GT, CR2EQ, CR2UN,
+       CR3LT, CR3GT, CR3EQ, CR3UN,
+       CR4LT, CR4GT, CR4EQ, CR4UN,
+       CR5LT, CR5GT, CR5EQ, CR5UN,
+       CR6LT, CR6GT, CR6EQ, CR6UN,
+       CR7LT, CR7GT, CR7EQ, CR7UN)>
+{
+  let CopyCost = -1;
+}
+
+def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
+                                                CR7, CR2, CR3, CR4)>;
+
+// The CTR registers are not allocatable because they're used by the
+// decrement-and-branch instructions, and thus need to stay live across
+// multiple basic blocks.
+def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> {
+  let isAllocatable = 0;
+}
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
+  let isAllocatable = 0;
+}
+
+def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
+def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
+  let CopyCost = -1;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRelocations.h b/contrib/llvm/lib/Target/PowerPC/PPCRelocations.h
new file mode 100644
index 000000000000..0b392f99b6d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRelocations.h
@@ -0,0 +1,56 @@
+//===-- PPCRelocations.h - PPC Code Relocations -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCRELOCATIONS_H
+#define PPCRELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+// Hack to rid us of a PPC pre-processor symbol which is erroneously
+// defined in a PowerPC header file (bug in Linux/PPC)
+#ifdef PPC
+#undef PPC
+#endif
+
+namespace llvm {
+  namespace PPC {
+    enum RelocationType {
+      // reloc_vanilla - A standard relocation, where the address of the
+      // relocated object completely overwrites the address of the relocation.
+      reloc_vanilla,
+    
+      // reloc_pcrel_bx - PC relative relocation, for the b or bl instructions.
+      reloc_pcrel_bx,
+
+      // reloc_pcrel_bcx - PC relative relocation, for BLT,BLE,BEQ,BGE,BGT,BNE,
+      // and other bcx instructions.
+      reloc_pcrel_bcx,
+
+      // reloc_absolute_high - Absolute relocation, for the loadhi instruction
+      // (which is really addis).  Add the high 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_high,
+
+      // reloc_absolute_low - Absolute relocation, for the la instruction (which
+      // is really an addi).  Add the low 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_low,
+      
+      // reloc_absolute_low_ix - Absolute relocation for the 64-bit load/store
+      // instruction which have two implicit zero bits.
+      reloc_absolute_low_ix
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
new file mode 100644
index 000000000000..660c0c3b6359
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule.td
@@ -0,0 +1,519 @@
+//===-- PPCSchedule.td - PowerPC Scheduling Definitions ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across PowerPC chips sets
+//
+def BPU    : FuncUnit; // Branch unit
+def SLU    : FuncUnit; // Store/load unit
+def SRU    : FuncUnit; // special register unit
+def IU1    : FuncUnit; // integer unit 1 (simple)
+def IU2    : FuncUnit; // integer unit 2 (complex)
+def FPU1   : FuncUnit; // floating point unit 1
+def FPU2   : FuncUnit; // floating point unit 2
+def VPU    : FuncUnit; // vector permutation unit
+def VIU1   : FuncUnit; // vector integer unit 1 (simple)
+def VIU2   : FuncUnit; // vector integer unit 2 (complex)
+def VFPU   : FuncUnit; // vector floating point unit
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for PowerPC
+//
+def IntSimple    : InstrItinClass;
+def IntGeneral   : InstrItinClass;
+def IntCompare   : InstrItinClass;
+def IntDivD      : InstrItinClass;
+def IntDivW      : InstrItinClass;
+def IntMFFS      : InstrItinClass;
+def IntMFVSCR    : InstrItinClass;
+def IntMTFSB0    : InstrItinClass;
+def IntMTSRD     : InstrItinClass;
+def IntMulHD     : InstrItinClass;
+def IntMulHW     : InstrItinClass;
+def IntMulHWU    : InstrItinClass;
+def IntMulLI     : InstrItinClass;
+def IntRFID      : InstrItinClass;
+def IntRotateD   : InstrItinClass;
+def IntRotateDI  : InstrItinClass;
+def IntRotate    : InstrItinClass;
+def IntShift     : InstrItinClass;
+def IntTrapD     : InstrItinClass;
+def IntTrapW     : InstrItinClass;
+def BrB          : InstrItinClass;
+def BrCR         : InstrItinClass;
+def BrMCR        : InstrItinClass;
+def BrMCRX       : InstrItinClass;
+def LdStDCBA     : InstrItinClass;
+def LdStDCBF     : InstrItinClass;
+def LdStDCBI     : InstrItinClass;
+def LdStLoad     : InstrItinClass;
+def LdStLoadUpd  : InstrItinClass;
+def LdStStore    : InstrItinClass;
+def LdStStoreUpd : InstrItinClass;
+def LdStDSS      : InstrItinClass;
+def LdStICBI     : InstrItinClass;
+def LdStLD       : InstrItinClass;
+def LdStLDU      : InstrItinClass;
+def LdStLDARX    : InstrItinClass;
+def LdStLFD      : InstrItinClass;
+def LdStLFDU     : InstrItinClass;
+def LdStLHA      : InstrItinClass;
+def LdStLHAU     : InstrItinClass;
+def LdStLMW      : InstrItinClass;
+def LdStLVecX    : InstrItinClass;
+def LdStLWA      : InstrItinClass;
+def LdStLWARX    : InstrItinClass;
+def LdStSLBIA    : InstrItinClass;
+def LdStSLBIE    : InstrItinClass;
+def LdStSTD      : InstrItinClass;
+def LdStSTDCX    : InstrItinClass;
+def LdStSTDU     : InstrItinClass;
+def LdStSTFD     : InstrItinClass;
+def LdStSTFDU    : InstrItinClass;
+def LdStSTVEBX   : InstrItinClass;
+def LdStSTWCX    : InstrItinClass;
+def LdStSync     : InstrItinClass;
+def SprISYNC     : InstrItinClass;
+def SprMFSR      : InstrItinClass;
+def SprMTMSR     : InstrItinClass;
+def SprMTSR      : InstrItinClass;
+def SprTLBSYNC   : InstrItinClass;
+def SprMFCR      : InstrItinClass;
+def SprMFMSR     : InstrItinClass;
+def SprMFSPR     : InstrItinClass;
+def SprMFTB      : InstrItinClass;
+def SprMTSPR     : InstrItinClass;
+def SprMTSRIN    : InstrItinClass;
+def SprRFI       : InstrItinClass;
+def SprSC        : InstrItinClass;
+def FPGeneral    : InstrItinClass;
+def FPAddSub     : InstrItinClass;
+def FPCompare    : InstrItinClass;
+def FPDivD       : InstrItinClass;
+def FPDivS       : InstrItinClass;
+def FPFused      : InstrItinClass;
+def FPRes        : InstrItinClass;
+def FPSqrt       : InstrItinClass;
+def VecGeneral   : InstrItinClass;
+def VecFP        : InstrItinClass;
+def VecFPCompare : InstrItinClass;
+def VecComplex   : InstrItinClass;
+def VecPerm      : InstrItinClass;
+def VecFPRound   : InstrItinClass;
+def VecVSL       : InstrItinClass;
+def VecVSR       : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "PPCScheduleG3.td"
+include "PPCSchedule440.td"
+include "PPCScheduleG4.td"
+include "PPCScheduleG4Plus.td"
+include "PPCScheduleG5.td"
+include "PPCScheduleA2.td"
+include "PPCScheduleE500mc.td"
+include "PPCScheduleE5500.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction to itinerary class map - When add new opcodes to the supported
+// set, refer to the following table to determine which itinerary class the
+// opcode belongs.
+//
+//    opcode     itinerary class
+//    ======     ===============
+//    add        IntSimple
+//    addc       IntGeneral
+//    adde       IntGeneral
+//    addi       IntSimple
+//    addic      IntGeneral
+//    addic.     IntGeneral
+//    addis      IntSimple
+//    addme      IntGeneral
+//    addze      IntGeneral
+//    and        IntSimple
+//    andc       IntSimple
+//    andi.      IntGeneral
+//    andis.     IntGeneral
+//    b          BrB
+//    bc         BrB
+//    bcctr      BrB
+//    bclr       BrB
+//    cmp        IntCompare
+//    cmpi       IntCompare
+//    cmpl       IntCompare
+//    cmpli      IntCompare
+//    cntlzd     IntRotateD
+//    cntlzw     IntGeneral
+//    crand      BrCR
+//    crandc     BrCR
+//    creqv      BrCR
+//    crnand     BrCR
+//    crnor      BrCR
+//    cror       BrCR
+//    crorc      BrCR
+//    crxor      BrCR
+//    dcba       LdStDCBA
+//    dcbf       LdStDCBF
+//    dcbi       LdStDCBI
+//    dcbst      LdStDCBF
+//    dcbt       LdStLoad
+//    dcbtst     LdStLoad
+//    dcbz       LdStDCBF
+//    divd       IntDivD
+//    divdu      IntDivD
+//    divw       IntDivW
+//    divwu      IntDivW
+//    dss        LdStDSS
+//    dst        LdStDSS
+//    dstst      LdStDSS
+//    eciwx      LdStLoad
+//    ecowx      LdStLoad
+//    eieio      LdStLoad
+//    eqv        IntSimple
+//    extsb      IntSimple
+//    extsh      IntSimple
+//    extsw      IntSimple
+//    fabs       FPGeneral
+//    fadd       FPAddSub
+//    fadds      FPGeneral
+//    fcfid      FPGeneral
+//    fcmpo      FPCompare
+//    fcmpu      FPCompare
+//    fctid      FPGeneral
+//    fctidz     FPGeneral
+//    fctiw      FPGeneral
+//    fctiwz     FPGeneral
+//    fdiv       FPDivD
+//    fdivs      FPDivS
+//    fmadd      FPFused
+//    fmadds     FPGeneral
+//    fmr        FPGeneral
+//    fmsub      FPFused
+//    fmsubs     FPGeneral
+//    fmul       FPFused
+//    fmuls      FPGeneral
+//    fnabs      FPGeneral
+//    fneg       FPGeneral
+//    fnmadd     FPFused
+//    fnmadds    FPGeneral
+//    fnmsub     FPFused
+//    fnmsubs    FPGeneral
+//    fres       FPRes
+//    frsp       FPGeneral
+//    frsqrte    FPGeneral
+//    fsel       FPGeneral
+//    fsqrt      FPSqrt
+//    fsqrts     FPSqrt
+//    fsub       FPAddSub
+//    fsubs      FPGeneral
+//    icbi       LdStICBI
+//    isync      SprISYNC
+//    lbz        LdStLoad
+//    lbzu       LdStLoadUpd
+//    lbzux      LdStLoadUpd
+//    lbzx       LdStLoad
+//    ld         LdStLD
+//    ldarx      LdStLDARX
+//    ldu        LdStLDU
+//    ldux       LdStLDU
+//    ldx        LdStLD
+//    lfd        LdStLFD
+//    lfdu       LdStLFDU
+//    lfdux      LdStLFDU
+//    lfdx       LdStLFD
+//    lfs        LdStLFD
+//    lfsu       LdStLFDU
+//    lfsux      LdStLFDU
+//    lfsx       LdStLFD
+//    lha        LdStLHA
+//    lhau       LdStLHAU
+//    lhaux      LdStLHAU
+//    lhax       LdStLHA
+//    lhbrx      LdStLoad
+//    lhz        LdStLoad
+//    lhzu       LdStLoadUpd
+//    lhzux      LdStLoadUpd
+//    lhzx       LdStLoad
+//    lmw        LdStLMW
+//    lswi       LdStLMW
+//    lswx       LdStLMW
+//    lvebx      LdStLVecX
+//    lvehx      LdStLVecX
+//    lvewx      LdStLVecX
+//    lvsl       LdStLVecX
+//    lvsr       LdStLVecX
+//    lvx        LdStLVecX
+//    lvxl       LdStLVecX
+//    lwa        LdStLWA
+//    lwarx      LdStLWARX
+//    lwaux      LdStLHAU
+//    lwax       LdStLHA
+//    lwbrx      LdStLoad
+//    lwz        LdStLoad
+//    lwzu       LdStLoadUpd
+//    lwzux      LdStLoadUpd
+//    lwzx       LdStLoad
+//    mcrf       BrMCR
+//    mcrfs      FPGeneral
+//    mcrxr      BrMCRX
+//    mfcr       SprMFCR
+//    mffs       IntMFFS
+//    mfmsr      SprMFMSR
+//    mfspr      SprMFSPR
+//    mfsr       SprMFSR
+//    mfsrin     SprMFSR
+//    mftb       SprMFTB
+//    mfvscr     IntMFVSCR
+//    mtcrf      BrMCRX
+//    mtfsb0     IntMTFSB0
+//    mtfsb1     IntMTFSB0
+//    mtfsf      IntMTFSB0
+//    mtfsfi     IntMTFSB0
+//    mtmsr      SprMTMSR
+//    mtmsrd     LdStLD
+//    mtspr      SprMTSPR
+//    mtsr       SprMTSR
+//    mtsrd      IntMTSRD
+//    mtsrdin    IntMTSRD
+//    mtsrin     SprMTSRIN
+//    mtvscr     IntMFVSCR
+//    mulhd      IntMulHD
+//    mulhdu     IntMulHD
+//    mulhw      IntMulHW
+//    mulhwu     IntMulHWU
+//    mulld      IntMulHD
+//    mulli      IntMulLI
+//    mullw      IntMulHW
+//    nand       IntSimple
+//    neg        IntSimple
+//    nor        IntSimple
+//    or         IntSimple
+//    orc        IntSimple
+//    ori        IntSimple
+//    oris       IntSimple
+//    rfi        SprRFI
+//    rfid       IntRFID
+//    rldcl      IntRotateD
+//    rldcr      IntRotateD
+//    rldic      IntRotateDI
+//    rldicl     IntRotateDI
+//    rldicr     IntRotateDI
+//    rldimi     IntRotateDI
+//    rlwimi     IntRotate
+//    rlwinm     IntGeneral
+//    rlwnm      IntGeneral
+//    sc         SprSC
+//    slbia      LdStSLBIA
+//    slbie      LdStSLBIE
+//    sld        IntRotateD
+//    slw        IntGeneral
+//    srad       IntRotateD
+//    sradi      IntRotateDI
+//    sraw       IntShift
+//    srawi      IntShift
+//    srd        IntRotateD
+//    srw        IntGeneral
+//    stb        LdStStore
+//    stbu       LdStStoreUpd
+//    stbux      LdStStoreUpd
+//    stbx       LdStStore
+//    std        LdStSTD
+//    stdcx.     LdStSTDCX
+//    stdu       LdStSTDU
+//    stdux      LdStSTDU
+//    stdx       LdStSTD
+//    stfd       LdStSTFD
+//    stfdu      LdStSTFDU
+//    stfdux     LdStSTFDU
+//    stfdx      LdStSTFD
+//    stfiwx     LdStSTFD
+//    stfs       LdStSTFD
+//    stfsu      LdStSTFDU
+//    stfsux     LdStSTFDU
+//    stfsx      LdStSTFD
+//    sth        LdStStore
+//    sthbrx     LdStStore
+//    sthu       LdStStoreUpd
+//    sthux      LdStStoreUpd
+//    sthx       LdStStore
+//    stmw       LdStLMW
+//    stswi      LdStLMW
+//    stswx      LdStLMW
+//    stvebx     LdStSTVEBX
+//    stvehx     LdStSTVEBX
+//    stvewx     LdStSTVEBX
+//    stvx       LdStSTVEBX
+//    stvxl      LdStSTVEBX
+//    stw        LdStStore
+//    stwbrx     LdStStore
+//    stwcx.     LdStSTWCX
+//    stwu       LdStStoreUpd
+//    stwux      LdStStoreUpd
+//    stwx       LdStStore
+//    subf       IntGeneral
+//    subfc      IntGeneral
+//    subfe      IntGeneral
+//    subfic     IntGeneral
+//    subfme     IntGeneral
+//    subfze     IntGeneral
+//    sync       LdStSync
+//    td         IntTrapD
+//    tdi        IntTrapD
+//    tlbia      LdStSLBIA
+//    tlbie      LdStDCBF
+//    tlbsync    SprTLBSYNC
+//    tw         IntTrapW
+//    twi        IntTrapW
+//    vaddcuw    VecGeneral
+//    vaddfp     VecFP
+//    vaddsbs    VecGeneral
+//    vaddshs    VecGeneral
+//    vaddsws    VecGeneral
+//    vaddubm    VecGeneral
+//    vaddubs    VecGeneral
+//    vadduhm    VecGeneral
+//    vadduhs    VecGeneral
+//    vadduwm    VecGeneral
+//    vadduws    VecGeneral
+//    vand       VecGeneral
+//    vandc      VecGeneral
+//    vavgsb     VecGeneral
+//    vavgsh     VecGeneral
+//    vavgsw     VecGeneral
+//    vavgub     VecGeneral
+//    vavguh     VecGeneral
+//    vavguw     VecGeneral
+//    vcfsx      VecFP
+//    vcfux      VecFP
+//    vcmpbfp    VecFPCompare
+//    vcmpeqfp   VecFPCompare
+//    vcmpequb   VecGeneral
+//    vcmpequh   VecGeneral
+//    vcmpequw   VecGeneral
+//    vcmpgefp   VecFPCompare
+//    vcmpgtfp   VecFPCompare
+//    vcmpgtsb   VecGeneral
+//    vcmpgtsh   VecGeneral
+//    vcmpgtsw   VecGeneral
+//    vcmpgtub   VecGeneral
+//    vcmpgtuh   VecGeneral
+//    vcmpgtuw   VecGeneral
+//    vctsxs     VecFP
+//    vctuxs     VecFP
+//    vexptefp   VecFP
+//    vlogefp    VecFP
+//    vmaddfp    VecFP
+//    vmaxfp     VecFPCompare
+//    vmaxsb     VecGeneral
+//    vmaxsh     VecGeneral
+//    vmaxsw     VecGeneral
+//    vmaxub     VecGeneral
+//    vmaxuh     VecGeneral
+//    vmaxuw     VecGeneral
+//    vmhaddshs  VecComplex
+//    vmhraddshs VecComplex
+//    vminfp     VecFPCompare
+//    vminsb     VecGeneral
+//    vminsh     VecGeneral
+//    vminsw     VecGeneral
+//    vminub     VecGeneral
+//    vminuh     VecGeneral
+//    vminuw     VecGeneral
+//    vmladduhm  VecComplex
+//    vmrghb     VecPerm
+//    vmrghh     VecPerm
+//    vmrghw     VecPerm
+//    vmrglb     VecPerm
+//    vmrglh     VecPerm
+//    vmrglw     VecPerm
+//    vmsubfp    VecFP
+//    vmsummbm   VecComplex
+//    vmsumshm   VecComplex
+//    vmsumshs   VecComplex
+//    vmsumubm   VecComplex
+//    vmsumuhm   VecComplex
+//    vmsumuhs   VecComplex
+//    vmulesb    VecComplex
+//    vmulesh    VecComplex
+//    vmuleub    VecComplex
+//    vmuleuh    VecComplex
+//    vmulosb    VecComplex
+//    vmulosh    VecComplex
+//    vmuloub    VecComplex
+//    vmulouh    VecComplex
+//    vnor       VecGeneral
+//    vor        VecGeneral
+//    vperm      VecPerm
+//    vpkpx      VecPerm
+//    vpkshss    VecPerm
+//    vpkshus    VecPerm
+//    vpkswss    VecPerm
+//    vpkswus    VecPerm
+//    vpkuhum    VecPerm
+//    vpkuhus    VecPerm
+//    vpkuwum    VecPerm
+//    vpkuwus    VecPerm
+//    vrefp      VecFPRound
+//    vrfim      VecFPRound
+//    vrfin      VecFPRound
+//    vrfip      VecFPRound
+//    vrfiz      VecFPRound
+//    vrlb       VecGeneral
+//    vrlh       VecGeneral
+//    vrlw       VecGeneral
+//    vrsqrtefp  VecFP
+//    vsel       VecGeneral
+//    vsl        VecVSL
+//    vslb       VecGeneral
+//    vsldoi     VecPerm
+//    vslh       VecGeneral
+//    vslo       VecPerm
+//    vslw       VecGeneral
+//    vspltb     VecPerm
+//    vsplth     VecPerm
+//    vspltisb   VecPerm
+//    vspltish   VecPerm
+//    vspltisw   VecPerm
+//    vspltw     VecPerm
+//    vsr        VecVSR
+//    vsrab      VecGeneral
+//    vsrah      VecGeneral
+//    vsraw      VecGeneral
+//    vsrb       VecGeneral
+//    vsrh       VecGeneral
+//    vsro       VecPerm
+//    vsrw       VecGeneral
+//    vsubcuw    VecGeneral
+//    vsubfp     VecFP
+//    vsubsbs    VecGeneral
+//    vsubshs    VecGeneral
+//    vsubsws    VecGeneral
+//    vsububm    VecGeneral
+//    vsububs    VecGeneral
+//    vsubuhm    VecGeneral
+//    vsubuhs    VecGeneral
+//    vsubuwm    VecGeneral
+//    vsubuws    VecGeneral
+//    vsum2sws   VecComplex
+//    vsum4sbs   VecComplex
+//    vsum4shs   VecComplex
+//    vsum4ubs   VecComplex
+//    vsumsws    VecComplex
+//    vupkhpx    VecPerm
+//    vupkhsb    VecPerm
+//    vupkhsh    VecPerm
+//    vupklpx    VecPerm
+//    vupklsb    VecPerm
+//    vupklsh    VecPerm
+//    vxor       VecGeneral
+//    xor        IntSimple
+//    xori       IntSimple
+//    xoris      IntSimple
+//
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td
new file mode 100644
index 000000000000..37b6eac10cfe
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSchedule440.td
@@ -0,0 +1,663 @@
+//===-- PPCSchedule440.td - PPC 440 Scheduling Definitions -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// PowerPC 440x6 Embedded Processor Core User's Manual.
+// IBM (as updated in) 2010.
+
+// The basic PPC 440 does not include a floating-point unit; the pipeline
+// timings here are constructed to match the FP2 unit shipped with the
+// PPC-440- and PPC-450-based Blue Gene (L and P) supercomputers.
+// References:
+// S. Chatterjee, et al. Design and exploitation of a high-performance
+// SIMD floating-point unit for Blue Gene/L.
+// IBM J. Res. & Dev. 49 (2/3) March/May 2005.
+// also:
+// Carlos Sosa and Brant Knudson. IBM System Blue Gene Solution:
+// Blue Gene/P Application Development.
+// IBM (as updated in) 2009.
+
+//===----------------------------------------------------------------------===//
+// Functional units on the PowerPC 440/450 chip sets
+//
+def IFTH1  : FuncUnit; // Fetch unit 1
+def IFTH2  : FuncUnit; // Fetch unit 2
+def PDCD1  : FuncUnit; // Decode unit 1
+def PDCD2  : FuncUnit; // Decode unit 2
+def DISS1  : FuncUnit; // Issue unit 1
+def DISS2  : FuncUnit; // Issue unit 2
+def LRACC  : FuncUnit; // Register access and dispatch for
+                       // the simple integer (J-pipe) and
+                       // load/store (L-pipe) pipelines
+def IRACC  : FuncUnit; // Register access and dispatch for
+                       // the complex integer (I-pipe) pipeline
+def FRACC  : FuncUnit; // Register access and dispatch for
+                       // the floating-point execution (F-pipe) pipeline
+def IEXE1  : FuncUnit; // Execution stage 1 for the I pipeline
+def IEXE2  : FuncUnit; // Execution stage 2 for the I pipeline
+def IWB    : FuncUnit; // Write-back unit for the I pipeline
+def JEXE1  : FuncUnit; // Execution stage 1 for the J pipeline
+def JEXE2  : FuncUnit; // Execution stage 2 for the J pipeline
+def JWB    : FuncUnit; // Write-back unit for the J pipeline
+def AGEN   : FuncUnit; // Address generation for the L pipeline
+def CRD    : FuncUnit; // D-cache access for the L pipeline
+def LWB    : FuncUnit; // Write-back unit for the L pipeline
+def FEXE1  : FuncUnit; // Execution stage 1 for the F pipeline
+def FEXE2  : FuncUnit; // Execution stage 2 for the F pipeline
+def FEXE3  : FuncUnit; // Execution stage 3 for the F pipeline
+def FEXE4  : FuncUnit; // Execution stage 4 for the F pipeline
+def FEXE5  : FuncUnit; // Execution stage 5 for the F pipeline
+def FEXE6  : FuncUnit; // Execution stage 6 for the F pipeline
+def FWB    : FuncUnit; // Write-back unit for the F pipeline
+
+def LWARX_Hold : FuncUnit; // This is a pseudo-unit which is used
+                           // to make sure that no lwarx/stwcx.
+                           // instructions are issued while another
+                           // lwarx/stwcx. is in the L pipe.
+
+def GPR_Bypass : Bypass; // The bypass for general-purpose regs.
+def FPR_Bypass : Bypass; // The bypass for floating-point regs.
+
+// Notes:
+// Instructions are held in the FRACC, LRACC and IRACC pipeline
+// stages until their source operands become ready. Exceptions:
+//  - Store instructions will hold in the AGEN stage
+//  - The integer multiply-accumulate instruction will hold in
+//    the IEXE1 stage
+//
+// For most I-pipe operations, the result is available at the end of
+// the IEXE1 stage. Operations such as multiply and divide must
+// continue to execute in IEXE2 and IWB. Divide resides in IWB for
+// 33 cycles (multiply also calculates its result in IWB). For all
+// J-pipe instructions, the result is available
+// at the end of the JEXE1 stage. Loads have a 3-cycle latency
+// (data is not available until after the LWB stage).
+//
+// The L1 cache hit latency is four cycles for floating point loads
+// and three cycles for integer loads.
+//
+// The stwcx. instruction requires both the LRACC and the IRACC
+// dispatch stages. It must be issued from DISS0.
+//
+// All lwarx/stwcx. instructions hold in LRACC if another
+// uncommitted lwarx/stwcx. is in AGEN, CRD, or LWB.
+//
+// msync (a.k.a. sync) and mbar will hold in LWB until all load/store
+// resources are empty. AGEN and CRD are held empty until the msync/mbar
+// commits.
+//
+// Most floating-point instructions, computational and move,
+// have a 5-cycle latency. Divide takes longer (30 cycles). Instructions that
+// update the CR take 2 cycles. Stores take 3 cycles and, as mentioned above,
+// loads take 4 cycles (for L1 hit).
+
+//
+// This file defines the itinerary class data for the PPC 440 processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def PPC440Itineraries : ProcessorItineraries<
+  [IFTH1, IFTH2, PDCD1, PDCD2, DISS1, DISS2, FRACC,
+   IRACC, IEXE1, IEXE2, IWB, LRACC, JEXE1, JEXE2, JWB, AGEN, CRD, LWB,
+   FEXE1, FEXE2, FEXE3, FEXE4, FEXE5, FEXE6, FWB, LWARX_Hold],
+  [GPR_Bypass, FPR_Bypass], [
+  InstrItinData<IntSimple  , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC, LRACC]>,
+                               InstrStage<1, [IEXE1, JEXE1]>,
+                               InstrStage<1, [IEXE2, JEXE2]>,
+                               InstrStage<1, [IWB, JWB]>],
+                              [6, 4, 4],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC, LRACC]>,
+                               InstrStage<1, [IEXE1, JEXE1]>,
+                               InstrStage<1, [IEXE2, JEXE2]>,
+                               InstrStage<1, [IWB, JWB]>],
+                              [6, 4, 4],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC, LRACC]>,
+                               InstrStage<1, [IEXE1, JEXE1]>,
+                               InstrStage<1, [IEXE2, JEXE2]>,
+                               InstrStage<1, [IWB, JWB]>],
+                              [6, 4, 4],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntDivW     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<33, [IWB]>],
+                              [40, 4, 4],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMFFS     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [7, 4, 4],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [7, 4, 4],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulHW    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4, 4],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulHWU   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4, 4],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulLI    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4, 4],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC, LRACC]>,
+                               InstrStage<1, [IEXE1, JEXE1]>,
+                               InstrStage<1, [IEXE2, JEXE2]>,
+                               InstrStage<1, [IWB, JWB]>],
+                              [6, 4, 4],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntShift    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC, LRACC]>,
+                               InstrStage<1, [IEXE1, JEXE1]>,
+                               InstrStage<1, [IEXE2, JEXE2]>,
+                               InstrStage<1, [IWB, JWB]>],
+                              [6, 4, 4],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [6, 4],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<BrB         , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<BrCR        , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4, 4],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4, 4],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4, 4],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBA    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBF    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBI    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLoad    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [9, 5],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [9, 5],
+                              [GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStStore   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,                              
+  InstrItinData<LdStICBI    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFD    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5, 5],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5, 5],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStLFD     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [9, 5, 5],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLFDU    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [9, 5, 5],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLHA     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,                              
+  InstrItinData<LdStLMW     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLWARX   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1]>,
+                               InstrStage<1, [IRACC], 0>,
+                               InstrStage<4, [LWARX_Hold], 0>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTD     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTDU    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<2, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,                              
+  InstrItinData<LdStSTDCX   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1]>,
+                               InstrStage<1, [IRACC], 0>,
+                               InstrStage<4, [LWARX_Hold], 0>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1]>,
+                               InstrStage<1, [IRACC], 0>,
+                               InstrStage<4, [LWARX_Hold], 0>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<1, [AGEN]>,
+                               InstrStage<1, [CRD]>,
+                               InstrStage<1, [LWB]>],
+                              [8, 5],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSync    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [LRACC]>,
+                               InstrStage<3, [AGEN], 1>,
+                               InstrStage<2, [CRD],  1>,
+                               InstrStage<1, [LWB]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC], 0>,
+                               InstrStage<1, [LRACC], 0>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [FEXE1], 0>,
+                               InstrStage<1, [AGEN],  0>,
+                               InstrStage<1, [JEXE1], 0>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [FEXE2], 0>,
+                               InstrStage<1, [CRD],   0>,
+                               InstrStage<1, [JEXE2], 0>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<6, [FEXE3], 0>,
+                               InstrStage<6, [LWB],   0>,
+                               InstrStage<6, [JWB],   0>,
+                               InstrStage<6, [IWB]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [6, 4],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [6, 4],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMTSR     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<3, [IWB]>],
+                              [9, 4],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [7, 4],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMFSPR    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<3, [IWB]>],
+                              [10, 4],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<3, [IWB]>],
+                              [10, 4],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMTSPR    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<3, [IWB]>],
+                              [10, 4],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<3, [IWB]>],
+                              [10, 4],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprRFI      , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprSC       , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [IRACC]>,
+                               InstrStage<1, [IEXE1]>,
+                               InstrStage<1, [IEXE2]>,
+                               InstrStage<1, [IWB]>],
+                              [8, 4],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC]>,
+                               InstrStage<1, [FEXE1]>,
+                               InstrStage<1, [FEXE2]>,
+                               InstrStage<1, [FEXE3]>,
+                               InstrStage<1, [FEXE4]>,
+                               InstrStage<1, [FEXE5]>,
+                               InstrStage<1, [FEXE6]>,
+                               InstrStage<1, [FWB]>],
+                              [10, 4, 4],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC]>,
+                               InstrStage<1, [FEXE1]>,
+                               InstrStage<1, [FEXE2]>,
+                               InstrStage<1, [FEXE3]>,
+                               InstrStage<1, [FEXE4]>,
+                               InstrStage<1, [FEXE5]>,
+                               InstrStage<1, [FEXE6]>,
+                               InstrStage<1, [FWB]>],
+                              [10, 4, 4],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPCompare   , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC]>,
+                               InstrStage<1, [FEXE1]>,
+                               InstrStage<1, [FEXE2]>,
+                               InstrStage<1, [FEXE3]>,
+                               InstrStage<1, [FEXE4]>,
+                               InstrStage<1, [FEXE5]>,
+                               InstrStage<1, [FEXE6]>,
+                               InstrStage<1, [FWB]>],
+                              [10, 4, 4],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivD      , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC]>,
+                               InstrStage<1, [FEXE1]>,
+                               InstrStage<1, [FEXE2]>,
+                               InstrStage<1, [FEXE3]>,
+                               InstrStage<1, [FEXE4]>,
+                               InstrStage<1, [FEXE5]>,
+                               InstrStage<1, [FEXE6]>,
+                               InstrStage<25, [FWB]>],
+                              [35, 4, 4],
+                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivS      , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC]>,
+                               InstrStage<1, [FEXE1]>,
+                               InstrStage<1, [FEXE2]>,
+                               InstrStage<1, [FEXE3]>,
+                               InstrStage<1, [FEXE4]>,
+                               InstrStage<1, [FEXE5]>,
+                               InstrStage<1, [FEXE6]>,
+                               InstrStage<13, [FWB]>],
+                              [23, 4, 4],
+                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPFused     , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC]>,
+                               InstrStage<1, [FEXE1]>,
+                               InstrStage<1, [FEXE2]>,
+                               InstrStage<1, [FEXE3]>,
+                               InstrStage<1, [FEXE4]>,
+                               InstrStage<1, [FEXE5]>,
+                               InstrStage<1, [FEXE6]>,
+                               InstrStage<1, [FWB]>],
+                              [10, 4, 4, 4],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPRes       , [InstrStage<1, [IFTH1, IFTH2]>,
+                               InstrStage<1, [PDCD1, PDCD2]>,
+                               InstrStage<1, [DISS1, DISS2]>,
+                               InstrStage<1, [FRACC]>,
+                               InstrStage<1, [FEXE1]>,
+                               InstrStage<1, [FEXE2]>,
+                               InstrStage<1, [FEXE3]>,
+                               InstrStage<1, [FEXE4]>,
+                               InstrStage<1, [FEXE5]>,
+                               InstrStage<1, [FEXE6]>,
+                               InstrStage<1, [FWB]>],
+                              [10, 4],
+                              [FPR_Bypass, FPR_Bypass]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
new file mode 100644
index 000000000000..ae084aa0e8c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleA2.td
@@ -0,0 +1,766 @@
+//===- PPCScheduleA2.td - PPC A2 Scheduling Definitions --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+// Primary reference:
+// A2 Processor User's Manual.
+// IBM (as updated in) 2010.
+
+//===----------------------------------------------------------------------===//
+// Functional units on the PowerPC A2 chip sets
+//
+def IU0to3_0  : FuncUnit; // Fetch unit 1 to 4 slot 1
+def IU0to3_1  : FuncUnit; // Fetch unit 1 to 4 slot 2
+def IU0to3_2  : FuncUnit; // Fetch unit 1 to 4 slot 3
+def IU0to3_3  : FuncUnit; // Fetch unit 1 to 4 slot 4
+def IU4_0  : FuncUnit; // Instruction buffer slot 1
+def IU4_1  : FuncUnit; // Instruction buffer slot 2
+def IU4_2  : FuncUnit; // Instruction buffer slot 3
+def IU4_3  : FuncUnit; // Instruction buffer slot 4
+def IU4_4  : FuncUnit; // Instruction buffer slot 5
+def IU4_5  : FuncUnit; // Instruction buffer slot 6
+def IU4_6  : FuncUnit; // Instruction buffer slot 7
+def IU4_7  : FuncUnit; // Instruction buffer slot 8
+def IU5    : FuncUnit; // Dependency resolution
+def IU6    : FuncUnit; // Instruction issue
+def RF0    : FuncUnit;
+def XRF1   : FuncUnit;
+def XEX1   : FuncUnit; // Execution stage 1 for the XU pipeline
+def XEX2   : FuncUnit; // Execution stage 2 for the XU pipeline
+def XEX3   : FuncUnit; // Execution stage 3 for the XU pipeline
+def XEX4   : FuncUnit; // Execution stage 4 for the XU pipeline
+def XEX5   : FuncUnit; // Execution stage 5 for the XU pipeline
+def XEX6   : FuncUnit; // Execution stage 6 for the XU pipeline
+def FRF1   : FuncUnit;
+def FEX1   : FuncUnit; // Execution stage 1 for the FU pipeline
+def FEX2   : FuncUnit; // Execution stage 2 for the FU pipeline
+def FEX3   : FuncUnit; // Execution stage 3 for the FU pipeline
+def FEX4   : FuncUnit; // Execution stage 4 for the FU pipeline
+def FEX5   : FuncUnit; // Execution stage 5 for the FU pipeline
+def FEX6   : FuncUnit; // Execution stage 6 for the FU pipeline
+
+def CR_Bypass  : Bypass; // The bypass for condition regs.
+//def GPR_Bypass : Bypass; // The bypass for general-purpose regs.
+//def FPR_Bypass : Bypass; // The bypass for floating-point regs.
+
+//
+// This file defines the itinerary class data for the PPC A2 processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def PPCA2Itineraries : ProcessorItineraries<
+  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3,
+   IU4_0, IU4_1, IU4_2, IU4_3, IU4_4, IU4_5, IU4_6, IU4_7,
+   IU5, IU6, RF0, XRF1, XEX1, XEX2, XEX3, XEX4, XEX5, XEX6,
+   FRF1, FEX1, FEX2, FEX3, FEX4, FEX5, FEX6],
+  [CR_Bypass, GPR_Bypass, FPR_Bypass], [
+  InstrItinData<IntSimple   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntGeneral  , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntCompare  , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntDivW     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<38, [XEX6]>],
+                              [53, 7, 7],
+                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMFFS     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7], 
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulHW    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulHWU   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulLI    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [15, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotate   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotateD  , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotateDI , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<IntShift    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntTrapW    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7], 
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntTrapD    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7], 
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<BrB         , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [15, 7, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<BrCR        , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCR       , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCRX      , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7, 7],
+                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBA    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 11],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBF    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 11],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBI    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 11],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLoad    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStLDU     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStStore   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 7],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 7],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStICBI    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFD    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7, 7],
+                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7, 7],
+                              [NoBypass, FPR_Bypass, FPR_Bypass]>,                              
+  InstrItinData<LdStLFD     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7, 7],
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLFDU    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7, 7],
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLHA     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLMW     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [14, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLWARX   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [26, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTD     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 7],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTDU    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [13, 7],
+                              [GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStSTDCX   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [26, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [26, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSync    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<12, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [15, 7],
+                              [GPR_Bypass, NoBypass]>,
+  InstrItinData<SprMTMSR    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [15, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMTSR     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [15, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [10, 7], 
+                              [GPR_Bypass, CR_Bypass]>,
+  InstrItinData<SprMFMSR    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [15, 7],
+                              [GPR_Bypass, NoBypass]>,
+  InstrItinData<SprMFSPR    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [15, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMFTB     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
+                              [29, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMTSPR    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+                              [15, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
+                              [29, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprRFI      , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
+                              [29, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprSC       , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
+                              [29, 7],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<FPGeneral   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
+                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
+                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
+                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
+                              [15, 7, 7],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
+                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
+                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
+                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
+                              [15, 7, 7],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPCompare   , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
+                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
+                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
+                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
+                              [13, 7, 7],
+                              [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivD      , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<71, [FRF1], 0>,
+                               InstrStage<71, [FEX1], 0>,
+                                  InstrStage<71, [FEX2], 0>,
+                               InstrStage<71, [FEX3], 0>,
+                                  InstrStage<71, [FEX4], 0>,
+                               InstrStage<71, [FEX5], 0>,
+                                  InstrStage<71, [FEX6]>],
+                              [86, 7, 7],
+                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivS      , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<58, [FRF1], 0>,
+                               InstrStage<58, [FEX1], 0>,
+                                  InstrStage<58, [FEX2], 0>,
+                               InstrStage<58, [FEX3], 0>,
+                                  InstrStage<58, [FEX4], 0>,
+                               InstrStage<58, [FEX5], 0>,
+                                  InstrStage<58, [FEX6]>],
+                              [73, 7, 7],
+                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPSqrt      , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<68, [FRF1], 0>,
+                               InstrStage<68, [FEX1], 0>,
+                                  InstrStage<68, [FEX2], 0>,
+                               InstrStage<68, [FEX3], 0>,
+                                  InstrStage<68, [FEX4], 0>,
+                               InstrStage<68, [FEX5], 0>,
+                                  InstrStage<68, [FEX6]>],
+                              [86, 7], // FIXME: should be [86, 7] for double
+                                       // and [82, 7] for single. Likewise,
+                                       // the FEX? cycle count should be 68
+                                       // for double and 64 for single.
+                              [NoBypass, FPR_Bypass]>,
+  InstrItinData<FPFused     , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
+                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
+                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
+                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
+                              [15, 7, 7, 7],
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPRes       , [InstrStage<4,
+                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
+                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
+                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
+                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
+                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
+                              [15, 7],
+                              [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// A2 machine model for scheduling and other instruction cost heuristics.
+
+def PPCA2Model : SchedMachineModel {
+  let IssueWidth = 1;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 6;
+
+  let Itineraries = PPCA2Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
new file mode 100644
index 000000000000..9bb779a0e62b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -0,0 +1,265 @@
+//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500mc 32-bit 
+// Power processor.
+// 
+// All information is derived from the "e500mc Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e500mc core:
+//
+//  * Decode & Dispatch
+//    Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+//    queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def DIS0 : FuncUnit; // Dispatch stage - insn 1
+def DIS1 : FuncUnit; // Dispatch stage - insn 2
+
+//  * Execute
+//    6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+//    Some instructions can only execute in SFX0 but not SFX1.
+//    The CFX has a bypass path, allowing non-divide instructions to execute 
+//    while a divide instruction is executed.
+def SFX0  : FuncUnit; // Simple unit 0
+def SFX1  : FuncUnit; // Simple unit 1
+def BU    : FuncUnit; // Branch unit
+def CFX_DivBypass 
+          : FuncUnit; // CFX divide bypass path
+def CFX_0 : FuncUnit; // CFX pipeline
+def LSU_0 : FuncUnit; // LSU pipeline
+def FPU_0 : FuncUnit; // FPU pipeline
+
+def PPCE500mcItineraries : ProcessorItineraries<
+  [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, LSU_0, FPU_0],
+  [CR_Bypass, GPR_Bypass, FPR_Bypass], [
+  InstrItinData<IntSimple   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 1, 1], // Latency = 1 or 2
+                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntDivW     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<14, [CFX_DivBypass]>],
+                              [17, 1, 1], // Latency=4..35, Repeat= 4..35
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMFFS     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<8, [FPU_0]>],
+                              [11], // Latency = 8
+                              [FPR_Bypass]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<8, [FPU_0]>],
+                              [11, 1, 1], // Latency = 8
+                              [NoBypass, NoBypass, NoBypass]>,
+  InstrItinData<IntMulHW    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [7, 1, 1], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulHWU   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [7, 1, 1], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulLI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [7, 1, 1], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntShift    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0]>],
+                              [5, 1], // Latency = 2, Repeat rate = 2
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<BrB         , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [4, 1], // Latency = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<BrCR        , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [4, 1, 1], // Latency = 1
+                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [4, 1], // Latency = 1
+                              [CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1, 1], // Latency = 1
+                              [CR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBA    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBF    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoad    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStStore   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [NoBypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStICBI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFD    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStLFD     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 1, 1], // Latency = 4
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLFDU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 1, 1], // Latency = 4
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStLHA     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStLMW     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 1], // Latency = r+3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLWARX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<3, [LSU_0]>],
+                              [6, 1, 1], // Latency = 3, Repeat rate = 3
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [6, 1], // Latency = 3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSync    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [SFX0]>],
+                              [7, 1],
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0, SFX1]>],
+                              [5, 1], // Latency = 2, Repeat rate = 4
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMTSR     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0]>],
+                              [5, 1],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0], 0>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<5, [SFX0]>],
+                              [8, 1],
+                              [GPR_Bypass, CR_Bypass]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [SFX0]>],
+                              [7, 1], // Latency = 4, Repeat rate = 4
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMFSPR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1], // Latency = 1, Repeat rate = 1
+                              [GPR_Bypass, CR_Bypass]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [SFX0]>],
+                              [7, 1], // Latency = 4, Repeat rate = 4
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMTSPR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [4, 1], // Latency = 1, Repeat rate = 1
+                              [CR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0]>],
+                              [4, 1],
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [FPU_0]>],
+                              [11, 1, 1], // Latency = 8, Repeat rate = 2 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [FPU_0]>],
+                              [13, 1, 1], // Latency = 10, Repeat rate = 4 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,                              
+  InstrItinData<FPCompare   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [FPU_0]>],
+                              [11, 1, 1], // Latency = 8, Repeat rate = 2
+                              [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivD      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<68, [FPU_0]>],
+                              [71, 1, 1], // Latency = 68, Repeat rate = 68
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivS      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<38, [FPU_0]>],
+                              [41, 1, 1], // Latency = 38, Repeat rate = 38
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPFused     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [FPU_0]>],
+                              [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPRes       , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<38, [FPU_0]>],
+                              [41, 1], // Latency = 38, Repeat rate = 38
+                              [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500mc machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500mcModel : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 5; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let Itineraries = PPCE500mcItineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
new file mode 100644
index 000000000000..d7e11acd9fc7
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -0,0 +1,309 @@
+//===-- PPCScheduleE500mc.td - e5500 Scheduling Defs -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e5500 64-bit 
+// Power processor.
+// 
+// All information is derived from the "e5500 Core Reference Manual",
+// Freescale Document Number e5500RM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e5500 core
+// (These are the same as for the e500mc)
+//
+//  * Decode & Dispatch
+//    Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+//    queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+// def DIS0 : FuncUnit;
+// def DIS1 : FuncUnit;
+
+//  * Execute
+//    6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+//    The CFX has a bypass path, allowing non-divide instructions to execute 
+//    while a divide instruction is being executed.
+// def SFX0  : FuncUnit; // Simple unit 0
+// def SFX1  : FuncUnit; // Simple unit 1
+// def BU    : FuncUnit; // Branch unit
+// def CFX_DivBypass 
+//           : FuncUnit; // CFX divide bypass path
+// def CFX_0 : FuncUnit; // CFX pipeline stage 0
+
+def CFX_1 : FuncUnit; // CFX pipeline stage 1 
+
+// def LSU_0 : FuncUnit; // LSU pipeline
+// def FPU_0 : FuncUnit; // FPU pipeline
+
+
+def PPCE5500Itineraries : ProcessorItineraries<
+  [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, CFX_1,
+   LSU_0, FPU_0],
+  [CR_Bypass, GPR_Bypass, FPR_Bypass], [
+  InstrItinData<IntSimple   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 2, 2], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 2, 2], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [6, 2, 2], // Latency = 1 or 2
+                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntDivD     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<26, [CFX_DivBypass]>],
+                              [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<IntDivW     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<16, [CFX_DivBypass]>],
+                              [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMFFS     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11], // Latency = 7, Repeat rate = 1
+                              [FPR_Bypass]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<7, [FPU_0]>],
+                              [11, 2, 2], // Latency = 7, Repeat rate = 7
+                              [NoBypass, NoBypass, NoBypass]>,
+  InstrItinData<IntMulHD    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<2, [CFX_1]>],
+                              [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<IntMulHW    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<1, [CFX_1]>],
+                              [8, 2, 2], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulHWU   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<1, [CFX_1]>],
+                              [8, 2, 2], // Latency = 4, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntMulLI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0], 0>,
+                               InstrStage<2, [CFX_1]>],
+                              [8, 2, 2], // Latency = 4 or 5, Repeat = 2
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 2, 2], // Latency = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotateD  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0, SFX1]>],
+                              [6, 2, 2], // Latency = 2, Repeat rate = 2
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntRotateDI , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5, 2, 2], // Latency = 1, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                                                            
+  InstrItinData<IntShift    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0, SFX1]>],
+                              [6, 2, 2], // Latency = 2, Repeat rate = 2
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [SFX0]>],
+                              [6, 2], // Latency = 2, Repeat rate = 2
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<BrB         , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [5, 2], // Latency = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<BrCR        , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [5, 2, 2], // Latency = 1
+                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [BU]>],
+                              [5, 2], // Latency = 1
+                              [CR_Bypass, CR_Bypass]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [5, 2, 2], // Latency = 1
+                              [CR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBA    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBF    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStDCBI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoad    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStLD      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLDARX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<3, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 3
+                              [GPR_Bypass, GPR_Bypass]>,                              
+  InstrItinData<LdStLDU     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStStore   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStICBI    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFD    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStLFD     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [8, 2, 2], // Latency = 4, Repeat rate = 1
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStLFDU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [8, 2, 2], // Latency = 4, Repeat rate = 1
+                              [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops
+  InstrItinData<LdStLHA     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStLHAU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [GPR_Bypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStLMW     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [LSU_0]>],
+                              [8, 2], // Latency = r+3, Repeat rate = r+3
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStLWARX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<3, [LSU_0]>],
+                              [7, 2, 2], // Latency = 3, Repeat rate = 3
+                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<LdStSTD     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1                              
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1                              
+                              [NoBypass, GPR_Bypass]>,                              
+  InstrItinData<LdStSTDU    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass],
+                              2>, // 2 micro-ops                              
+  InstrItinData<LdStSTWCX   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>],
+                              [7, 2], // Latency = 3, Repeat rate = 1
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<LdStSync    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [CFX_0]>],
+                              [6, 2], // Latency = 2, Repeat rate = 4
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [LSU_0], 0>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<5, [CFX_0]>],
+                              [9, 2], // Latency = 5, Repeat rate = 5
+                              [GPR_Bypass, CR_Bypass]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [SFX0]>],
+                              [8, 2], // Latency = 4, Repeat rate = 4
+                              [GPR_Bypass, GPR_Bypass]>,
+  InstrItinData<SprMFSPR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [CFX_0]>],
+                              [5], // Latency = 1, Repeat rate = 1
+                              [GPR_Bypass]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<4, [CFX_0]>],
+                              [8, 2], // Latency = 4, Repeat rate = 4
+                              [NoBypass, GPR_Bypass]>,
+  InstrItinData<SprMTSPR    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [SFX0, SFX1]>],
+                              [5], // Latency = 1, Repeat rate = 1
+                              [GPR_Bypass]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11, 2, 2], // Latency = 7, Repeat rate = 1 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11, 2, 2], // Latency = 7, Repeat rate = 1 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,                              
+  InstrItinData<FPCompare   , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11, 2, 2], // Latency = 7, Repeat rate = 1
+                              [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivD      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<31, [FPU_0]>],
+                              [39, 2, 2], // Latency = 35, Repeat rate = 31
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPDivS      , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<16, [FPU_0]>],
+                              [24, 2, 2], // Latency = 20, Repeat rate = 16 
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPFused     , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<1, [FPU_0]>],
+                              [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
+                              [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+  InstrItinData<FPRes       , [InstrStage<1, [DIS0, DIS1], 0>,
+                               InstrStage<2, [FPU_0]>],
+                              [12, 2], // Latency = 8, Repeat rate = 2
+                              [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE5500Model : SchedMachineModel {
+  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
+  let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+
+  let Itineraries = PPCE5500Itineraries;
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
new file mode 100644
index 000000000000..72a0a392631a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG3.td
@@ -0,0 +1,71 @@
+//===-- PPCScheduleG3.td - PPC G3 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G3 (750) processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+def G3Itineraries : ProcessorItineraries<
+  [IU1, IU2, FPU1, BPU, SRU, SLU], [], [
+  InstrItinData<IntSimple   , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntDivW     , [InstrStage<19, [IU1]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<3, [FPU1]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<5, [IU1]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<6, [IU1]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<3, [IU1]>]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntShift    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [SRU]>]>,
+  InstrItinData<LdStDCBA    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLoad    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,  
+  InstrItinData<LdStStore   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,  
+  InstrItinData<LdStICBI    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<2, [SLU]>]>,  
+  InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<8, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
+  InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,
+  InstrItinData<FPFused     , [InstrStage<2, [FPU1]>]>,
+  InstrItinData<FPRes       , [InstrStage<10, [FPU1]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td
new file mode 100644
index 000000000000..fc9120dfa290
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4.td
@@ -0,0 +1,81 @@
+//===-- PPCScheduleG4.td - PPC G4 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4 (7400) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G4Itineraries : ProcessorItineraries<
+  [IU1, IU2, SLU, SRU, BPU, FPU1, VIU1, VIU2, VPU, VFPU], [], [
+  InstrItinData<IntSimple   , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntDivW     , [InstrStage<19, [IU1]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<3, [FPU1]>]>,
+  InstrItinData<IntMFVSCR   , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<3, [FPU1]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<5, [IU1]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<6, [IU1]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<3, [IU1]>]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntShift    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [SRU]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStDCBI    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLoad    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStStore   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStDSS     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<2, [SLU]>]>, 
+  InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
+  InstrItinData<LdStLVecX   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTVEBX  , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<8, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<8, [SRU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
+  InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,
+  InstrItinData<FPFused     , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPRes       , [InstrStage<10, [FPU1]>]>,
+  InstrItinData<VecGeneral  , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecFP       , [InstrStage<4, [VFPU]>]>,
+  InstrItinData<VecFPCompare, [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecComplex  , [InstrStage<3, [VIU2]>]>,
+  InstrItinData<VecPerm     , [InstrStage<1, [VPU]>]>,
+  InstrItinData<VecFPRound  , [InstrStage<4, [VFPU]>]>,
+  InstrItinData<VecVSL      , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecVSR      , [InstrStage<1, [VIU1]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td
new file mode 100644
index 000000000000..a4e82ce23e6f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -0,0 +1,88 @@
+//===-- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4+ (7450) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def IU3    : FuncUnit; // integer unit 3 (7450 simple)
+def IU4    : FuncUnit; // integer unit 4 (7450 simple)
+
+def G4PlusItineraries : ProcessorItineraries<
+  [IU1, IU2, IU3, IU4, BPU, SLU, FPU1, VFPU, VIU1, VIU2, VPU], [], [
+  InstrItinData<IntSimple   , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntDivW     , [InstrStage<23, [IU2]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<IntMFVSCR   , [InstrStage<2, [VFPU]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<4, [IU2]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<4, [IU2]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntShift    , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<2, [IU2]>]>,
+  InstrItinData<BrMCR       , [InstrStage<2, [IU2]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<2, [IU2]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLoad    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStStore   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDSS     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<3, [SLU]>]>,  
+  InstrItinData<LdStLMW     , [InstrStage<37, [SLU]>]>,
+  InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLWA     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDU    , [InstrStage<3, [SLU]>]>,  
+  InstrItinData<LdStSTVEBX  , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<35, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<4, [IU2]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<4, [IU2]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<5, [IU2]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprRFI      , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<SprSC       , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<5, [FPU1]>]>,  
+  InstrItinData<FPCompare   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPDivD      , [InstrStage<35, [FPU1]>]>,
+  InstrItinData<FPDivS      , [InstrStage<21, [FPU1]>]>,
+  InstrItinData<FPFused     , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPRes       , [InstrStage<14, [FPU1]>]>,
+  InstrItinData<VecGeneral  , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecFP       , [InstrStage<4, [VFPU]>]>,
+  InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
+  InstrItinData<VecComplex  , [InstrStage<4, [VIU2]>]>,
+  InstrItinData<VecPerm     , [InstrStage<2, [VPU]>]>,
+  InstrItinData<VecFPRound  , [InstrStage<4, [VIU1]>]>,
+  InstrItinData<VecVSL      , [InstrStage<2, [VPU]>]>,
+  InstrItinData<VecVSR      , [InstrStage<2, [VPU]>]>
+]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td
new file mode 100644
index 000000000000..c64998d52a0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleG5.td
@@ -0,0 +1,109 @@
+//===-- PPCScheduleG5.td - PPC G5 Scheduling Definitions ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G5 (970) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def G5Itineraries : ProcessorItineraries<
+  [IU1, IU2, SLU, BPU, FPU1, FPU2, VFPU, VIU1, VIU2, VPU], [], [
+  InstrItinData<IntSimple   , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntGeneral  , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntCompare  , [InstrStage<3, [IU1, IU2]>]>,
+  InstrItinData<IntDivD     , [InstrStage<68, [IU1]>]>,
+  InstrItinData<IntDivW     , [InstrStage<36, [IU1]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<6, [IU2]>]>,
+  InstrItinData<IntMFVSCR   , [InstrStage<1, [VFPU]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<IntMulHD    , [InstrStage<7, [IU1, IU2]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<5, [IU1, IU2]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<5, [IU1, IU2]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<4, [IU1, IU2]>]>,
+  InstrItinData<IntRFID     , [InstrStage<1, [IU2]>]>,
+  InstrItinData<IntRotateD  , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntRotateDI , [InstrStage<2, [IU1, IU2]>]>,  
+  InstrItinData<IntRotate   , [InstrStage<4, [IU1, IU2]>]>,
+  InstrItinData<IntShift    , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntTrapD    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<4, [BPU]>]>,
+  InstrItinData<BrMCR       , [InstrStage<2, [BPU]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<3, [BPU]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLoad    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,  
+  InstrItinData<LdStStore   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,  
+  InstrItinData<LdStDSS     , [InstrStage<10, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<40, [SLU]>]>,
+  InstrItinData<LdStSTFD    , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<4, [SLU]>]>,  
+  InstrItinData<LdStLD      , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLDU     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLDARX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLHAU    , [InstrStage<5, [SLU]>]>,  
+  InstrItinData<LdStLMW     , [InstrStage<64, [SLU]>]>,
+  InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLWA     , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStSLBIA   , [InstrStage<40, [SLU]>]>, // needs work
+  InstrItinData<LdStSLBIE   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDU    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStSTVEBX  , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<35, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<40, [SLU]>]>, // needs work
+  InstrItinData<SprMFSR     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<10, [IU2]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<8, [IU2]>]>,
+  InstrItinData<SprSC       , [InstrStage<1, [IU2]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPAddSub    , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPCompare   , [InstrStage<8, [FPU1, FPU2]>]>,
+  InstrItinData<FPDivD      , [InstrStage<33, [FPU1, FPU2]>]>,
+  InstrItinData<FPDivS      , [InstrStage<33, [FPU1, FPU2]>]>,
+  InstrItinData<FPFused     , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPRes       , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPSqrt      , [InstrStage<40, [FPU1, FPU2]>]>,
+  InstrItinData<VecGeneral  , [InstrStage<2, [VIU1]>]>,
+  InstrItinData<VecFP       , [InstrStage<8, [VFPU]>]>,
+  InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
+  InstrItinData<VecComplex  , [InstrStage<5, [VIU2]>]>,
+  InstrItinData<VecPerm     , [InstrStage<3, [VPU]>]>,
+  InstrItinData<VecFPRound  , [InstrStage<8, [VFPU]>]>,
+  InstrItinData<VecVSL      , [InstrStage<2, [VIU1]>]>,
+  InstrItinData<VecVSR      , [InstrStage<3, [VPU]>]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def G5Model : SchedMachineModel {
+  let IssueWidth = 4;  // 4 (non-branch) instructions are dispatched per cycle.
+  let MinLatency = 0;  // Out-of-order dispatch.
+  let LoadLatency = 3; // Optimistic load latency assuming bypass.
+                       // This is overriden by OperandCycles if the
+                       // Itineraries are queried instead.
+  let MispredictPenalty = 16;
+
+  let Itineraries = G5Itineraries;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..d4258b4a0eb1
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- PPCSelectionDAGInfo.cpp - PowerPC SelectionDAG Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "powerpc-selectiondag-info"
+#include "PPCTargetMachine.h"
+using namespace llvm;
+
+PPCSelectionDAGInfo::PPCSelectionDAGInfo(const PPCTargetMachine &TM)
+  : TargetSelectionDAGInfo(TM) {
+}
+
+PPCSelectionDAGInfo::~PPCSelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h
new file mode 100644
index 000000000000..341b69cdfb5f
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- PPCSelectionDAGInfo.h - PowerPC SelectionDAG Info -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPCCSELECTIONDAGINFO_H
+#define POWERPCCSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class PPCTargetMachine;
+
+class PPCSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit PPCSelectionDAGInfo(const PPCTargetMachine &TM);
+  ~PPCSelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
new file mode 100644
index 000000000000..a8f2b3f47d1b
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -0,0 +1,159 @@
+//===-- PowerPCSubtarget.cpp - PPC Subtarget Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCSubtarget.h"
+#include "PPC.h"
+#include "PPCRegisterInfo.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cstdlib>
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "PPCGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
+                           const std::string &FS, bool is64Bit)
+  : PPCGenSubtargetInfo(TT, CPU, FS)
+  , StackAlignment(16)
+  , DarwinDirective(PPC::DIR_NONE)
+  , HasMFOCRF(false)
+  , Has64BitSupport(false)
+  , Use64BitRegs(false)
+  , IsPPC64(is64Bit)
+  , HasAltivec(false)
+  , HasQPX(false)
+  , HasFSQRT(false)
+  , HasFRE(false)
+  , HasFRES(false)
+  , HasFRSQRTE(false)
+  , HasFRSQRTES(false)
+  , HasRecipPrec(false)
+  , HasSTFIWX(false)
+  , HasLFIWAX(false)
+  , HasFPRND(false)
+  , HasFPCVT(false)
+  , HasISEL(false)
+  , HasPOPCNTD(false)
+  , HasLDBRX(false)
+  , IsBookE(false)
+  , HasLazyResolverStubs(false)
+  , IsJITCodeModel(false)
+  , TargetTriple(TT) {
+
+  // Determine default and user specified characteristics
+  std::string CPUName = CPU;
+  if (CPUName.empty())
+    CPUName = "generic";
+#if (defined(__APPLE__) || defined(__linux__)) && \
+    (defined(__ppc__) || defined(__powerpc__))
+  if (CPUName == "generic")
+    CPUName = sys::getHostCPUName();
+#endif
+
+  // Initialize scheduling itinerary for the specified CPU.
+  InstrItins = getInstrItineraryForCPU(CPUName);
+
+  // Make sure 64-bit features are available when CPUname is generic
+  std::string FullFS = FS;
+
+  // If we are generating code for ppc64, verify that options make sense.
+  if (is64Bit) {
+    Has64BitSupport = true;
+    // Silently force 64-bit register use on ppc64.
+    Use64BitRegs = true;
+    if (!FullFS.empty())
+      FullFS = "+64bit," + FullFS;
+    else
+      FullFS = "+64bit";
+  }
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FullFS);
+
+  // If the user requested use of 64-bit regs, but the cpu selected doesn't
+  // support it, ignore.
+  if (use64BitRegs() && !has64BitSupport())
+    Use64BitRegs = false;
+
+  // Set up darwin-specific properties.
+  if (isDarwin())
+    HasLazyResolverStubs = true;
+
+  // QPX requires a 32-byte aligned stack. Note that we need to do this if
+  // we're compiling for a BG/Q system regardless of whether or not QPX
+  // is enabled because external functions will assume this alignment.
+  if (hasQPX() || isBGQ())
+    StackAlignment = 32;
+}
+
+/// SetJITMode - This is called to inform the subtarget info that we are
+/// producing code for the JIT.
+void PPCSubtarget::SetJITMode() {
+  // JIT mode doesn't want lazy resolver stubs, it knows exactly where
+  // everything is.  This matters for PPC64, which codegens in PIC mode without
+  // stubs.
+  HasLazyResolverStubs = false;
+
+  // Calls to external functions need to use indirect calls
+  IsJITCodeModel = true;
+}
+
+
+/// hasLazyResolverStub - Return true if accesses to the specified global have
+/// to go through a dyld lazy resolution stub.  This means that an extra load
+/// is required to get the address of the global.
+bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV,
+                                       const TargetMachine &TM) const {
+  // We never have stubs if HasLazyResolverStubs=false or if in static mode.
+  if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static)
+    return false;
+  // If symbol visibility is hidden, the extra load is not needed if
+  // the symbol is definitely defined in the current translation unit.
+  bool isDecl = GV->isDeclaration() && !GV->isMaterializable();
+  if (GV->hasHiddenVisibility() && !isDecl && !GV->hasCommonLinkage())
+    return false;
+  return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+         GV->hasCommonLinkage() || isDecl;
+}
+
+bool PPCSubtarget::enablePostRAScheduler(
+           CodeGenOpt::Level OptLevel,
+           TargetSubtargetInfo::AntiDepBreakMode& Mode,
+           RegClassVector& CriticalPathRCs) const {
+  // FIXME: It would be best to use TargetSubtargetInfo::ANTIDEP_ALL here,
+  // but we can't because we can't reassign the cr registers. There is a
+  // dependence between the cr register and the RLWINM instruction used
+  // to extract its value which the anti-dependency breaker can't currently
+  // see. Maybe we should make a late-expanded pseudo to encode this dependency.
+  // (the relevant code is in PPCDAGToDAGISel::SelectSETCC)
+
+  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+
+  CriticalPathRCs.clear();
+
+  if (isPPC64())
+    CriticalPathRCs.push_back(&PPC::G8RCRegClass);
+  else
+    CriticalPathRCs.push_back(&PPC::GPRCRegClass);
+    
+  CriticalPathRCs.push_back(&PPC::F8RCRegClass);
+  CriticalPathRCs.push_back(&PPC::VRRCRegClass);
+
+  return OptLevel >= CodeGenOpt::Default;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
new file mode 100644
index 000000000000..65b4d211fc6a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -0,0 +1,200 @@
+//===-- PPCSubtarget.h - Define Subtarget for the PPC ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPCSUBTARGET_H
+#define POWERPCSUBTARGET_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "PPCGenSubtargetInfo.inc"
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+class StringRef;
+
+namespace PPC {
+  // -m directive values.
+  enum {
+    DIR_NONE,
+    DIR_32,
+    DIR_440,
+    DIR_601,
+    DIR_602,
+    DIR_603,
+    DIR_7400,
+    DIR_750,
+    DIR_970,
+    DIR_A2,
+    DIR_E500mc,
+    DIR_E5500,
+    DIR_PWR3,
+    DIR_PWR4,
+    DIR_PWR5,
+    DIR_PWR5X,
+    DIR_PWR6,
+    DIR_PWR6X,
+    DIR_PWR7,
+    DIR_64
+  };
+}
+
+class GlobalValue;
+class TargetMachine;
+
+class PPCSubtarget : public PPCGenSubtargetInfo {
+protected:
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned StackAlignment;
+
+  /// Selected instruction itineraries (one entry per itinerary class.)
+  InstrItineraryData InstrItins;
+
+  /// Which cpu directive was used.
+  unsigned DarwinDirective;
+
+  /// Used by the ISel to turn in optimizations for POWER4-derived architectures
+  bool HasMFOCRF;
+  bool Has64BitSupport;
+  bool Use64BitRegs;
+  bool IsPPC64;
+  bool HasAltivec;
+  bool HasQPX;
+  bool HasFSQRT;
+  bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
+  bool HasRecipPrec;
+  bool HasSTFIWX;
+  bool HasLFIWAX;
+  bool HasFPRND;
+  bool HasFPCVT;
+  bool HasISEL;
+  bool HasPOPCNTD;
+  bool HasLDBRX;
+  bool IsBookE;
+  bool HasLazyResolverStubs;
+  bool IsJITCodeModel;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  PPCSubtarget(const std::string &TT, const std::string &CPU,
+               const std::string &FS, bool is64Bit);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// SetJITMode - This is called to inform the subtarget info that we are
+  /// producing code for the JIT.
+  void SetJITMode();
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return StackAlignment; }
+
+  /// getDarwinDirective - Returns the -m directive specified for the cpu.
+  ///
+  unsigned getDarwinDirective() const { return DarwinDirective; }
+
+  /// getInstrItins - Return the instruction itineraies based on subtarget
+  /// selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  /// getDataLayoutString - Return the pointer size and type alignment
+  /// properties of this subtarget.
+  const char *getDataLayoutString() const {
+    // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+    // documentation are wrong; these are correct (i.e. "what gcc does").
+    if (isPPC64() && isSVR4ABI()) {
+      if (TargetTriple.getOS() == llvm::Triple::FreeBSD)
+        return "E-p:64:64-f64:64:64-i64:64:64-f128:64:64-v128:128:128-n32:64";
+      else
+        return "E-p:64:64-f64:64:64-i64:64:64-f128:128:128-v128:128:128-n32:64";
+    }
+
+    return isPPC64() ? "E-p:64:64-f64:64:64-i64:64:64-f128:64:128-n32:64"
+                     : "E-p:32:32-f64:64:64-i64:64:64-f128:64:128-n32";
+  }
+
+  /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
+  ///
+  bool isPPC64() const { return IsPPC64; }
+
+  /// has64BitSupport - Return true if the selected CPU supports 64-bit
+  /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
+  bool has64BitSupport() const { return Has64BitSupport; }
+
+  /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
+  /// registers in 32-bit mode when possible.  This can only true if
+  /// has64BitSupport() returns true.
+  bool use64BitRegs() const { return Use64BitRegs; }
+
+  /// hasLazyResolverStub - Return true if accesses to the specified global have
+  /// to go through a dyld lazy resolution stub.  This means that an extra load
+  /// is required to get the address of the global.
+  bool hasLazyResolverStub(const GlobalValue *GV,
+                           const TargetMachine &TM) const;
+
+  // isJITCodeModel - True if we're generating code for the JIT
+  bool isJITCodeModel() const { return IsJITCodeModel; }
+
+  // Specific obvious features.
+  bool hasFSQRT() const { return HasFSQRT; }
+  bool hasFRE() const { return HasFRE; }
+  bool hasFRES() const { return HasFRES; }
+  bool hasFRSQRTE() const { return HasFRSQRTE; }
+  bool hasFRSQRTES() const { return HasFRSQRTES; }
+  bool hasRecipPrec() const { return HasRecipPrec; }
+  bool hasSTFIWX() const { return HasSTFIWX; }
+  bool hasLFIWAX() const { return HasLFIWAX; }
+  bool hasFPRND() const { return HasFPRND; }
+  bool hasFPCVT() const { return HasFPCVT; }
+  bool hasAltivec() const { return HasAltivec; }
+  bool hasQPX() const { return HasQPX; }
+  bool hasMFOCRF() const { return HasMFOCRF; }
+  bool hasISEL() const { return HasISEL; }
+  bool hasPOPCNTD() const { return HasPOPCNTD; }
+  bool hasLDBRX() const { return HasLDBRX; }
+  bool isBookE() const { return IsBookE; }
+
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  /// isDarwin - True if this is any darwin platform.
+  bool isDarwin() const { return TargetTriple.isMacOSX(); }
+  /// isBGP - True if this is a BG/P platform.
+  bool isBGP() const { return TargetTriple.getVendor() == Triple::BGP; }
+  /// isBGQ - True if this is a BG/Q platform.
+  bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
+
+  bool isDarwinABI() const { return isDarwin(); }
+  bool isSVR4ABI() const { return !isDarwin(); }
+
+  /// enablePostRAScheduler - True at 'More' optimization.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
new file mode 100644
index 000000000000..fe851c1b6fb8
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -0,0 +1,137 @@
+//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetMachine.h"
+#include "PPC.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static cl::
+opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
+                        cl::desc("Disable CTR loops for PPC"));
+
+extern "C" void LLVMInitializePowerPCTarget() {
+  // Register the targets
+  RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
+  RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
+}
+
+PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL,
+                                   bool is64Bit)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS, is64Bit),
+    DL(Subtarget.getDataLayoutString()), InstrInfo(*this),
+    FrameLowering(Subtarget), JITInfo(*this, is64Bit),
+    TLInfo(*this), TSInfo(*this),
+    InstrItins(Subtarget.getInstrItineraryData()) {
+
+  // The binutils for the BG/P are too old for CFI.
+  if (Subtarget.isBGP())
+    setMCUseCFI(false);
+}
+
+void PPC32TargetMachine::anchor() { }
+
+PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+  : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {
+}
+
+void PPC64TargetMachine::anchor() { }
+
+PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU,  StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+  : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// PPC Code Generator Pass Configuration Options.
+class PPCPassConfig : public TargetPassConfig {
+public:
+  PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  PPCTargetMachine &getPPCTargetMachine() const {
+    return getTM<PPCTargetMachine>();
+  }
+
+  virtual bool addPreRegAlloc();
+  virtual bool addInstSelector();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new PPCPassConfig(this, PM);
+}
+
+bool PPCPassConfig::addPreRegAlloc() {
+  if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCCTRLoops());
+
+  return false;
+}
+
+bool PPCPassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createPPCISelDag(getPPCTargetMachine()));
+  return false;
+}
+
+bool PPCPassConfig::addPreEmitPass() {
+  // Must run branch selection immediately preceding the asm printer.
+  addPass(createPPCBranchSelectionPass());
+  return false;
+}
+
+bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                      JITCodeEmitter &JCE) {
+  // Inform the subtarget that we are in JIT mode.  FIXME: does this break macho
+  // writing?
+  Subtarget.SetJITMode();
+
+  // Machine code emitter pass for PowerPC.
+  PM.add(createPPCJITCodeEmitterPass(*this, JCE));
+
+  return false;
+}
+
+void PPCTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our PPC pass. This
+  // allows the PPC pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+  PM.add(createPPCTargetTransformInfoPass(this));
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
new file mode 100644
index 000000000000..606ccb314126
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
@@ -0,0 +1,100 @@
+//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_TARGETMACHINE_H
+#define PPC_TARGETMACHINE_H
+
+#include "PPCFrameLowering.h"
+#include "PPCISelLowering.h"
+#include "PPCInstrInfo.h"
+#include "PPCJITInfo.h"
+#include "PPCSelectionDAGInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets.
+///
+class PPCTargetMachine : public LLVMTargetMachine {
+  PPCSubtarget        Subtarget;
+  const DataLayout    DL;       // Calculates type size & alignment
+  PPCInstrInfo        InstrInfo;
+  PPCFrameLowering    FrameLowering;
+  PPCJITInfo          JITInfo;
+  PPCTargetLowering   TLInfo;
+  PPCSelectionDAGInfo TSInfo;
+  InstrItineraryData  InstrItins;
+
+public:
+  PPCTargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS, const TargetOptions &Options,
+                   Reloc::Model RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL, bool is64Bit);
+
+  virtual const PPCInstrInfo      *getInstrInfo() const { return &InstrInfo; }
+  virtual const PPCFrameLowering  *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  virtual       PPCJITInfo        *getJITInfo()         { return &JITInfo; }
+  virtual const PPCTargetLowering *getTargetLowering() const {
+   return &TLInfo;
+  }
+  virtual const PPCSelectionDAGInfo* getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+  virtual const PPCRegisterInfo   *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  virtual const DataLayout    *getDataLayout() const    { return &DL; }
+  virtual const PPCSubtarget  *getSubtargetImpl() const { return &Subtarget; }
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return &InstrItins;
+  }
+
+  // Pass Pipeline Configuration
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  virtual bool addCodeEmitter(PassManagerBase &PM,
+                              JITCodeEmitter &JCE);
+
+  /// \brief Register PPC analysis passes with a pass manager.
+  virtual void addAnalysisPasses(PassManagerBase &PM);
+};
+
+/// PPC32TargetMachine - PowerPC 32-bit target machine.
+///
+class PPC32TargetMachine : public PPCTargetMachine {
+  virtual void anchor();
+public:
+  PPC32TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+};
+
+/// PPC64TargetMachine - PowerPC 64-bit target machine.
+///
+class PPC64TargetMachine : public PPCTargetMachine {
+  virtual void anchor();
+public:
+  PPC64TargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
new file mode 100644
index 000000000000..2504ba70c25a
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -0,0 +1,240 @@
+//===-- PPCTargetTransformInfo.cpp - PPC specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// PPC target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppctti"
+#include "PPC.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't havve a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializePPCTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class PPCTTI : public ImmutablePass, public TargetTransformInfo {
+  const PPCTargetMachine *TM;
+  const PPCSubtarget *ST;
+  const PPCTargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  PPCTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  PPCTTI(const PPCTargetMachine *TM)
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
+    initializePPCTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() {
+    pushTTIStack(this);
+  }
+
+  virtual void finalizePass() {
+    popTTIStack();
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  virtual unsigned getNumberOfRegisters(bool Vector) const;
+  virtual unsigned getRegisterBitWidth(bool Vector) const;
+  virtual unsigned getMaximumUnrollFactor() const;
+  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                          OperandValueKind,
+                                          OperandValueKind) const;
+  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                                  int Index, Type *SubTp) const;
+  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const;
+  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) const;
+  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) const;
+  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Alignment,
+                                   unsigned AddressSpace) const;
+
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(PPCTTI, TargetTransformInfo, "ppctti",
+                   "PPC Target Transform Info", true, true, false)
+char PPCTTI::ID = 0;
+
+ImmutablePass *
+llvm::createPPCTargetTransformInfoPass(const PPCTargetMachine *TM) {
+  return new PPCTTI(TM);
+}
+
+
+//===----------------------------------------------------------------------===//
+//
+// PPC cost model.
+//
+//===----------------------------------------------------------------------===//
+
+PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  if (ST->hasPOPCNTD() && TyWidth <= 64)
+    return PSK_FastHardware;
+  return PSK_Software;
+}
+
+unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
+  if (Vector && !ST->hasAltivec())
+    return 0;
+  return 32;
+}
+
+unsigned PPCTTI::getRegisterBitWidth(bool Vector) const {
+  if (Vector) {
+    if (ST->hasAltivec()) return 128;
+    return 0;
+  }
+
+  if (ST->isPPC64())
+    return 64;
+  return 32;
+
+}
+
+unsigned PPCTTI::getMaximumUnrollFactor() const {
+  unsigned Directive = ST->getDarwinDirective();
+  // The 440 has no SIMD support, but floating-point instructions
+  // have a 5-cycle latency, so unroll by 5x for latency hiding.
+  if (Directive == PPC::DIR_440)
+    return 5;
+
+  // The A2 has no SIMD support, but floating-point instructions
+  // have a 6-cycle latency, so unroll by 6x for latency hiding.
+  if (Directive == PPC::DIR_A2)
+    return 6;
+
+  // FIXME: For lack of any better information, do no harm...
+  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
+    return 1;
+
+  // For most things, modern systems have two execution units (and
+  // out-of-order execution).
+  return 2;
+}
+
+unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                        OperandValueKind Op1Info,
+                                        OperandValueKind Op2Info) const {
+  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+  // Fallback to the default implementation.
+  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
+                                                     Op2Info);
+}
+
+unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                Type *SubTp) const {
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+unsigned PPCTTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
+  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned PPCTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                    Type *CondTy) const {
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                    unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // Estimated cost of a load-hit-store delay.  This was obtained
+  // experimentally as a minimum needed to prevent unprofitable
+  // vectorization for the paq8p benchmark.  It may need to be
+  // raised further if other unprofitable cases remain.
+  unsigned LHSPenalty = 12;
+
+  // Vector element insert/extract with Altivec is very expensive,
+  // because they require store and reload with the attendant
+  // processor stall for load-hit-store.  Until VSX is available,
+  // these need to be estimated as very costly.
+  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
+      ISD == ISD::INSERT_VECTOR_ELT)
+    return LHSPenalty +
+      TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+
+  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  // Each load/store unit costs 1.
+  unsigned Cost = LT.first * 1;
+
+  // PPC in general does not support unaligned loads and stores. They'll need
+  // to be decomposed based on the alignment factor.
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (SrcBytes && Alignment && Alignment < SrcBytes)
+    Cost *= (SrcBytes/Alignment);
+
+  return Cost;
+}
+
diff --git a/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
new file mode 100644
index 000000000000..fa44331b8af6
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::ThePPC32Target, llvm::ThePPC64Target;
+
+extern "C" void LLVMInitializePowerPCTargetInfo() { 
+  RegisterTarget<Triple::ppc, /*HasJIT=*/true>
+    X(ThePPC32Target, "ppc32", "PowerPC 32");
+
+  RegisterTarget<Triple::ppc64, /*HasJIT=*/true>
+    Y(ThePPC64Target, "ppc64", "PowerPC 64");
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPU.h b/contrib/llvm/lib/Target/R600/AMDGPU.h
new file mode 100644
index 000000000000..0b01433cc926
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPU.h
@@ -0,0 +1,51 @@
+//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_H
+#define AMDGPU_H
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class FunctionPass;
+class AMDGPUTargetMachine;
+
+// R600 Passes
+FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
+FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
+FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+
+// SI Passes
+FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
+FunctionPass *createSIInsertWaits(TargetMachine &tm);
+
+// Passes common to R600 and SI
+Pass *createAMDGPUStructurizeCFGPass();
+FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
+FunctionPass* createAMDGPUIndirectAddressingPass(TargetMachine &tm);
+
+} // End namespace llvm
+
+namespace ShaderType {
+  enum Type {
+    PIXEL = 0,
+    VERTEX = 1,
+    GEOMETRY = 2,
+    COMPUTE = 3
+  };
+}
+
+#endif // AMDGPU_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPU.td b/contrib/llvm/lib/Target/R600/AMDGPU.td
new file mode 100644
index 000000000000..1a26c77d6bb2
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPU.td
@@ -0,0 +1,41 @@
+//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+// Include AMDIL TD files
+include "AMDILBase.td"
+
+
+def AMDGPUInstrInfo : InstrInfo {
+  let guessInstructionProperties = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+def AMDGPUAsmWriter : AsmWriter {
+    string AsmWriterClassName = "InstPrinter";
+    int Variant = 0;
+    bit isMCAsmWriter = 1;
+}
+
+def AMDGPU : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = AMDGPUInstrInfo;
+  let AssemblyWriters = [AMDGPUAsmWriter];
+}
+
+// Include AMDGPU TD files
+include "R600Schedule.td"
+include "SISchedule.td"
+include "Processors.td"
+include "AMDGPUInstrInfo.td"
+include "AMDGPUIntrinsics.td"
+include "AMDGPURegisterInfo.td"
+include "AMDGPUInstructions.td"
+include "AMDGPUCallingConv.td"
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
new file mode 100644
index 000000000000..f6001445f4b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -0,0 +1,145 @@
+//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer  --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
+/// code.  When passed an MCAsmStreamer it prints assembly and when passed
+/// an MCObjectStreamer it outputs binary code.
+//
+//===----------------------------------------------------------------------===//
+//
+
+
+#include "AMDGPUAsmPrinter.h"
+#include "AMDGPU.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+
+static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm,
+                                              MCStreamer &Streamer) {
+  return new AMDGPUAsmPrinter(tm, Streamer);
+}
+
+extern "C" void LLVMInitializeR600AsmPrinter() {
+  TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
+}
+
+/// We need to override this function so we can avoid
+/// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle.
+bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+  if (STM.dumpCode()) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    MF.dump();
+#endif
+  }
+  SetupMachineFunction(MF);
+  if (OutStreamer.hasRawTextSupport()) {
+    OutStreamer.EmitRawText("@" + MF.getName() + ":");
+  }
+  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+  if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+    EmitProgramInfo(MF);
+  }
+  EmitFunctionBody();
+  return false;
+}
+
+void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) {
+  unsigned MaxSGPR = 0;
+  unsigned MaxVGPR = 0;
+  bool VCCUsed = false;
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                    I != E; ++I) {
+      MachineInstr &MI = *I;
+
+      unsigned numOperands = MI.getNumOperands();
+      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+        MachineOperand & MO = MI.getOperand(op_idx);
+        unsigned maxUsed;
+        unsigned width = 0;
+        bool isSGPR = false;
+        unsigned reg;
+        unsigned hwReg;
+        if (!MO.isReg()) {
+          continue;
+        }
+        reg = MO.getReg();
+        if (reg == AMDGPU::VCC) {
+          VCCUsed = true;
+          continue;
+        }
+        switch (reg) {
+        default: break;
+        case AMDGPU::EXEC:
+        case AMDGPU::M0:
+          continue;
+        }
+
+        if (AMDGPU::SReg_32RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 1;
+        } else if (AMDGPU::VReg_32RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 1;
+        } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 2;
+        } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 2;
+        } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 4;
+        } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 4;
+        } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 8;
+        } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 8;
+        } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 16;
+        } else {
+          assert(!"Unknown register class");
+        }
+        hwReg = RI->getEncodingValue(reg) & 0xff;
+        maxUsed = hwReg + width - 1;
+        if (isSGPR) {
+          MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
+        } else {
+          MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
+        }
+      }
+    }
+  }
+  if (VCCUsed) {
+    MaxSGPR += 2;
+  }
+  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+  OutStreamer.EmitIntValue(MaxSGPR + 1, 4);
+  OutStreamer.EmitIntValue(MaxVGPR + 1, 4);
+  OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h
new file mode 100644
index 000000000000..3812282b1798
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -0,0 +1,44 @@
+//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_ASMPRINTER_H
+#define AMDGPU_ASMPRINTER_H
+
+#include "llvm/CodeGen/AsmPrinter.h"
+
+namespace llvm {
+
+class AMDGPUAsmPrinter : public AsmPrinter {
+
+public:
+  explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual const char *getPassName() const {
+    return "AMDGPU Assembly Printer";
+  }
+
+  /// \brief Emit register usage information so that the GPU driver
+  /// can correctly setup the GPU state.
+  void EmitProgramInfo(MachineFunction &MF);
+
+  /// Implemented in AMDGPUMCInstLower.cpp
+  virtual void EmitInstruction(const MachineInstr *MI);
+};
+
+} // End anonymous llvm
+
+#endif //AMDGPU_ASMPRINTER_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/R600/AMDGPUCallingConv.td
new file mode 100644
index 000000000000..45ae37ef0c7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUCallingConv.td
@@ -0,0 +1,42 @@
+//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMD Radeon GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+// Inversion of CCIfInReg
+class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+
+// Calling convention for SI
+def CC_SI : CallingConv<[
+
+  CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15
+  ]>>>,
+
+  CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
+    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR12, SGPR15 ]
+  >>>,
+
+  CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+  ]>>>
+
+]>;
+
+def CC_AMDGPU : CallingConv<[
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().device()"#
+       "->getGeneration() == AMDGPUDeviceInfo::HD7XXX", CCDelegateTo<CC_SI>>
+]>;
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUConvertToISA.cpp b/contrib/llvm/lib/Target/R600/AMDGPUConvertToISA.cpp
new file mode 100644
index 000000000000..50297d1f60c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUConvertToISA.cpp
@@ -0,0 +1,62 @@
+//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers AMDIL machine instructions to the appropriate
+/// hardware instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUConvertToISAPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  TargetMachine &TM;
+
+public:
+  AMDGPUConvertToISAPass(TargetMachine &tm) :
+    MachineFunctionPass(ID), TM(tm) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual const char *getPassName() const {return "AMDGPU Convert to ISA";}
+
+};
+
+} // End anonymous namespace
+
+char AMDGPUConvertToISAPass::ID = 0;
+
+FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) {
+  return new AMDGPUConvertToISAPass(tm);
+}
+
+bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) {
+  const AMDGPUInstrInfo * TII =
+                      static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      TII->convertToISA(MI, MF, MBB.findDebugLoc(I));
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp
new file mode 100644
index 000000000000..815d6f71c3be
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -0,0 +1,122 @@
+//===----------------------- AMDGPUFrameLowering.cpp ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on a AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPURegisterInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
+    int LAO, unsigned TransAl)
+  : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
+
+AMDGPUFrameLowering::~AMDGPUFrameLowering() { }
+
+unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
+
+  // XXX: Hardcoding to 1 for now.
+  //
+  // I think the StackWidth should stored as metadata associated with the
+  // MachineFunction.  This metadata can either be added by a frontend, or
+  // calculated by a R600 specific LLVM IR pass.
+  //
+  // The StackWidth determines how stack objects are laid out in memory.
+  // For a vector stack variable, like: int4 stack[2], the data will be stored
+  // in the following ways depending on the StackWidth.
+  //
+  // StackWidth = 1:
+  //
+  // T0.X = stack[0].x
+  // T1.X = stack[0].y
+  // T2.X = stack[0].z
+  // T3.X = stack[0].w
+  // T4.X = stack[1].x
+  // T5.X = stack[1].y
+  // T6.X = stack[1].z
+  // T7.X = stack[1].w
+  //
+  // StackWidth = 2:
+  //
+  // T0.X = stack[0].x
+  // T0.Y = stack[0].y
+  // T1.X = stack[0].z
+  // T1.Y = stack[0].w
+  // T2.X = stack[1].x
+  // T2.Y = stack[1].y
+  // T3.X = stack[1].z
+  // T3.Y = stack[1].w
+  // 
+  // StackWidth = 4:
+  // T0.X = stack[0].x
+  // T0.Y = stack[0].y
+  // T0.Z = stack[0].z
+  // T0.W = stack[0].w
+  // T1.X = stack[1].x
+  // T1.Y = stack[1].y
+  // T1.Z = stack[1].z
+  // T1.W = stack[1].w
+  return 1;
+}
+
+/// \returns The number of registers allocated for \p FI.
+int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                         int FI) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Offset = 0;
+  int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
+
+  for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
+    const AllocaInst *Alloca = MFI->getObjectAllocation(i);
+    unsigned ArrayElements;
+    const Type *AllocaType = Alloca->getAllocatedType();
+    const Type *ElementType;
+
+    if (AllocaType->isArrayTy()) {
+      ArrayElements = AllocaType->getArrayNumElements();
+      ElementType = AllocaType->getArrayElementType();
+    } else {
+      ArrayElements = 1;
+      ElementType = AllocaType;
+    }
+
+    unsigned VectorElements;
+    if (ElementType->isVectorTy()) {
+      VectorElements = ElementType->getVectorNumElements();
+    } else {
+      VectorElements = 1;
+    }
+
+    Offset += (VectorElements / getStackWidth(MF)) * ArrayElements;
+  }
+  return Offset;
+}
+
+const TargetFrameLowering::SpillSlot *
+AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+  NumEntries = 0;
+  return 0;
+}
+void
+AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const {
+}
+void
+AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF,
+                                  MachineBasicBlock &MBB) const {
+}
+
+bool
+AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h
new file mode 100644
index 000000000000..cf5742ee0952
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUFrameLowering.h
@@ -0,0 +1,44 @@
+//===--------------------- AMDGPUFrameLowering.h ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface to describe a layout of a stack frame on a AMDIL target
+/// machine.
+//
+//===----------------------------------------------------------------------===//
+#ifndef AMDILFRAME_LOWERING_H
+#define AMDILFRAME_LOWERING_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// \brief Information about the stack frame layout on the AMDGPU targets.
+///
+/// It holds the direction of the stack growth, the known stack alignment on
+/// entry to each function, and the offset to the locals area.
+/// See TargetFrameInfo for more comments.
+class AMDGPUFrameLowering : public TargetFrameLowering {
+public:
+  AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
+                      unsigned TransAl = 1);
+  virtual ~AMDGPUFrameLowering();
+
+  /// \returns The number of 32-bit sub-registers that are used when storing
+  /// values to the stack.
+  virtual unsigned getStackWidth(const MachineFunction &MF) const;
+  virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+  virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const;
+  virtual void emitPrologue(MachineFunction &MF) const;
+  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  virtual bool hasFP(const MachineFunction &MF) const;
+};
+} // namespace llvm
+#endif // AMDILFRAME_LOWERING_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
new file mode 100644
index 000000000000..a266df535d56
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -0,0 +1,414 @@
+//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This is the parent TargetLowering class for hardware code gen
+/// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUISelLowering.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDILIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+using namespace llvm;
+
+#include "AMDGPUGenCallingConv.inc"
+
+AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
+  TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+
+  // Initialize target lowering borrowed from AMDIL
+  InitAMDILLowering();
+
+  // We need to custom lower some of the intrinsics
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // Library functions.  These default to Expand, but we have instructions
+  // for them.
+  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
+  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
+  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
+  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
+  setOperationAction(ISD::FABS,   MVT::f32, Legal);
+  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
+
+  // Lower floating point store/load to integer store/load to reduce the number
+  // of patterns in tablegen.
+  setOperationAction(ISD::STORE, MVT::f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
+
+  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
+
+  setOperationAction(ISD::LOAD, MVT::f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
+
+  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+
+  setOperationAction(ISD::MUL, MVT::i64, Expand);
+
+  setOperationAction(ISD::UDIV, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+}
+
+//===---------------------------------------------------------------------===//
+// TargetLowering Callbacks
+//===---------------------------------------------------------------------===//
+
+void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
+
+  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
+}
+
+SDValue AMDGPUTargetLowering::LowerReturn(
+                                     SDValue Chain,
+                                     CallingConv::ID CallConv,
+                                     bool isVarArg,
+                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                     const SmallVectorImpl<SDValue> &OutVals,
+                                     DebugLoc DL, SelectionDAG &DAG) const {
+  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
+}
+
+//===---------------------------------------------------------------------===//
+// Target specific lowering
+//===---------------------------------------------------------------------===//
+
+SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
+    const {
+  switch (Op.getOpcode()) {
+  default:
+    Op.getNode()->dump();
+    assert(0 && "Custom lowering code for this"
+        "instruction is not implemented yet!");
+    break;
+  // AMDIL DAG lowering
+  case ISD::SDIV: return LowerSDIV(Op, DAG);
+  case ISD::SREM: return LowerSREM(Op, DAG);
+  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  // AMDGPU DAG lowering
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+  }
+  return Op;
+}
+
+SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+    SelectionDAG &DAG) const {
+  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  switch (IntrinsicID) {
+    default: return Op;
+    case AMDGPUIntrinsic::AMDIL_abs:
+      return LowerIntrinsicIABS(Op, DAG);
+    case AMDGPUIntrinsic::AMDIL_exp:
+      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
+    case AMDGPUIntrinsic::AMDGPU_lrp:
+      return LowerIntrinsicLRP(Op, DAG);
+    case AMDGPUIntrinsic::AMDIL_fraction:
+      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+    case AMDGPUIntrinsic::AMDIL_max:
+      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_imax:
+      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_umax:
+      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDIL_min:
+      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_imin:
+      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDGPU_umin:
+      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
+                                                  Op.getOperand(2));
+    case AMDGPUIntrinsic::AMDIL_round_nearest:
+      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
+  }
+}
+
+///IABS(a) = SMAX(sub(0, a), a)
+SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
+    SelectionDAG &DAG) const {
+
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                              Op.getOperand(1));
+
+  return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
+}
+
+/// Linear Interpolation
+/// LRP(a, b, c) = muladd(a,  b, (1 - a) * c)
+SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
+    SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
+                                DAG.getConstantFP(1.0f, MVT::f32),
+                                Op.getOperand(1));
+  SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
+                                                    Op.getOperand(3));
+  return DAG.getNode(ISD::FADD, DL, VT,
+      DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
+      OneSubAC);
+}
+
+/// \brief Generate Min/Max node
+SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
+    SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue True = Op.getOperand(2);
+  SDValue False = Op.getOperand(3);
+  SDValue CC = Op.getOperand(4);
+
+  if (VT != MVT::f32 ||
+      !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
+    return SDValue();
+  }
+
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+  switch (CCOpcode) {
+  case ISD::SETOEQ:
+  case ISD::SETONE:
+  case ISD::SETUNE:
+  case ISD::SETNE:
+  case ISD::SETUEQ:
+  case ISD::SETEQ:
+  case ISD::SETFALSE:
+  case ISD::SETFALSE2:
+  case ISD::SETTRUE:
+  case ISD::SETTRUE2:
+  case ISD::SETUO:
+  case ISD::SETO:
+    assert(0 && "Operation should already be optimised !");
+  case ISD::SETULE:
+  case ISD::SETULT:
+  case ISD::SETOLE:
+  case ISD::SETOLT:
+  case ISD::SETLE:
+  case ISD::SETLT: {
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
+    else
+      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
+  }
+  case ISD::SETGT:
+  case ISD::SETGE:
+  case ISD::SETUGE:
+  case ISD::SETOGE:
+  case ISD::SETUGT:
+  case ISD::SETOGT: {
+    if (LHS == True)
+      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
+    else
+      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
+  }
+  case ISD::SETCC_INVALID:
+    assert(0 && "Invalid setcc condcode !");
+  }
+  return Op;
+}
+
+
+
+SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
+    SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  SDValue Num = Op.getOperand(0);
+  SDValue Den = Op.getOperand(1);
+
+  SmallVector<SDValue, 8> Results;
+
+  // RCP =  URECIP(Den) = 2^32 / Den + e
+  // e is rounding error.
+  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
+
+  // RCP_LO = umulo(RCP, Den) */
+  SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
+
+  // RCP_HI = mulhu (RCP, Den) */
+  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
+
+  // NEG_RCP_LO = -RCP_LO
+  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                                     RCP_LO);
+
+  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
+  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
+                                           NEG_RCP_LO, RCP_LO,
+                                           ISD::SETEQ);
+  // Calculate the rounding error from the URECIP instruction
+  // E = mulhu(ABS_RCP_LO, RCP)
+  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
+
+  // RCP_A_E = RCP + E
+  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
+
+  // RCP_S_E = RCP - E
+  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
+
+  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
+  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
+                                     RCP_A_E, RCP_S_E,
+                                     ISD::SETEQ);
+  // Quotient = mulhu(Tmp0, Num)
+  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
+
+  // Num_S_Remainder = Quotient * Den
+  SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
+
+  // Remainder = Num - Num_S_Remainder
+  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
+
+  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
+  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
+                                                 DAG.getConstant(-1, VT),
+                                                 DAG.getConstant(0, VT),
+                                                 ISD::SETGE);
+  // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0)
+  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder,
+                                                  DAG.getConstant(0, VT),
+                                                  DAG.getConstant(-1, VT),
+                                                  DAG.getConstant(0, VT),
+                                                  ISD::SETGE);
+  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
+  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
+                                               Remainder_GE_Zero);
+
+  // Calculate Division result:
+
+  // Quotient_A_One = Quotient + 1
+  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
+                                                         DAG.getConstant(1, VT));
+
+  // Quotient_S_One = Quotient - 1
+  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
+                                                         DAG.getConstant(1, VT));
+
+  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
+  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
+                                     Quotient, Quotient_A_One, ISD::SETEQ);
+
+  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
+  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
+                            Quotient_S_One, Div, ISD::SETEQ);
+
+  // Calculate Rem result:
+
+  // Remainder_S_Den = Remainder - Den
+  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
+
+  // Remainder_A_Den = Remainder + Den
+  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
+
+  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
+  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
+                                    Remainder, Remainder_S_Den, ISD::SETEQ);
+
+  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
+  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
+                            Remainder_A_Den, Rem, ISD::SETEQ);
+  SDValue Ops[2];
+  Ops[0] = Div;
+  Ops[1] = Rem;
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
+  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    return CFP->isExactlyValue(1.0);
+  }
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+    return C->isAllOnesValue();
+  }
+  return false;
+}
+
+bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
+  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    return CFP->getValueAPF().isZero();
+  }
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+    return C->isNullValue();
+  }
+  return false;
+}
+
+SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+                                                  const TargetRegisterClass *RC,
+                                                   unsigned Reg, EVT VT) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned VirtualRegister;
+  if (!MRI.isLiveIn(Reg)) {
+    VirtualRegister = MRI.createVirtualRegister(RC);
+    MRI.addLiveIn(Reg, VirtualRegister);
+  } else {
+    VirtualRegister = MRI.getLiveInVirtReg(Reg);
+  }
+  return DAG.getRegister(VirtualRegister, VT);
+}
+
+#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
+
+const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  // AMDIL DAG nodes
+  NODE_NAME_CASE(CALL);
+  NODE_NAME_CASE(UMUL);
+  NODE_NAME_CASE(DIV_INF);
+  NODE_NAME_CASE(RET_FLAG);
+  NODE_NAME_CASE(BRANCH_COND);
+
+  // AMDGPU DAG nodes
+  NODE_NAME_CASE(DWORDADDR)
+  NODE_NAME_CASE(FRACT)
+  NODE_NAME_CASE(FMAX)
+  NODE_NAME_CASE(SMAX)
+  NODE_NAME_CASE(UMAX)
+  NODE_NAME_CASE(FMIN)
+  NODE_NAME_CASE(SMIN)
+  NODE_NAME_CASE(UMIN)
+  NODE_NAME_CASE(URECIP)
+  NODE_NAME_CASE(EXPORT)
+  NODE_NAME_CASE(CONST_ADDRESS)
+  NODE_NAME_CASE(REGISTER_LOAD)
+  NODE_NAME_CASE(REGISTER_STORE)
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h
new file mode 100644
index 000000000000..f31b6466bd46
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUISelLowering.h
@@ -0,0 +1,140 @@
+//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition of the TargetLowering class that is common
+/// to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUISELLOWERING_H
+#define AMDGPUISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+class MachineRegisterInfo;
+
+class AMDGPUTargetLowering : public TargetLowering {
+private:
+  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+
+protected:
+
+  /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
+  /// MachineFunction.
+  ///
+  /// \returns a RegisterSDNode representing Reg.
+  SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
+                                                  unsigned Reg, EVT VT) const;
+
+  bool isHWTrueValue(SDValue Op) const;
+  bool isHWFalseValue(SDValue Op) const;
+
+  void AnalyzeFormalArguments(CCState &State,
+                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
+public:
+  AMDGPUTargetLowering(TargetMachine &TM);
+
+  virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::OutputArg> &Outs,
+                              const SmallVectorImpl<SDValue> &OutVals,
+                              DebugLoc DL, SelectionDAG &DAG) const;
+  virtual SDValue LowerCall(CallLoweringInfo &CLI,
+                            SmallVectorImpl<SDValue> &InVals) const {
+    CLI.Callee.dump();
+    llvm_unreachable("Undefined function");
+  }
+
+  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
+  virtual const char* getTargetNodeName(unsigned Opcode) const;
+
+  virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const {
+    return N;
+  }
+
+// Functions defined in AMDILISelLowering.cpp
+public:
+
+  /// \brief Determine which of the bits specified in \p Mask are known to be
+  /// either zero or one and return them in the \p KnownZero and \p KnownOne
+  /// bitsets.
+  virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                              APInt &KnownZero,
+                                              APInt &KnownOne,
+                                              const SelectionDAG &DAG,
+                                              unsigned Depth = 0) const;
+
+  virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                  const CallInst &I, unsigned Intrinsic) const;
+
+  /// We want to mark f32/f64 floating point values as legal.
+  bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+  /// We don't want to shrink f64/f32 constants.
+  bool ShouldShrinkFPConstant(EVT VT) const;
+
+private:
+  void InitAMDILLowering();
+  SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+  EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+};
+
+namespace AMDGPUISD {
+
+enum {
+  // AMDIL ISD Opcodes
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  CALL,        // Function call based on a single integer
+  UMUL,        // 32bit unsigned multiplication
+  DIV_INF,      // Divide with infinity returned on zero divisor
+  RET_FLAG,
+  BRANCH_COND,
+  // End AMDIL ISD Opcodes
+  BITALIGN,
+  DWORDADDR,
+  FRACT,
+  FMAX,
+  SMAX,
+  UMAX,
+  FMIN,
+  SMIN,
+  UMIN,
+  URECIP,
+  EXPORT,
+  CONST_ADDRESS,
+  REGISTER_LOAD,
+  REGISTER_STORE,
+  LAST_AMDGPU_ISD_NUMBER
+};
+
+
+} // End namespace AMDGPUISD
+
+} // End namespace llvm
+
+#endif // AMDGPUISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/contrib/llvm/lib/Target/R600/AMDGPUIndirectAddressing.cpp
new file mode 100644
index 000000000000..ed6c8ec55dd2
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUIndirectAddressing.cpp
@@ -0,0 +1,343 @@
+//===-- AMDGPUIndirectAddressing.cpp - Indirect Adressing Support ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// Instructions can use indirect addressing to index the register file as if it
+/// were memory.  This pass lowers RegisterLoad and RegisterStore instructions
+/// to either a COPY or a MOV that uses indirect addressing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUIndirectAddressingPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const AMDGPUInstrInfo *TII;
+
+  bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const;
+
+public:
+  AMDGPUIndirectAddressingPass(TargetMachine &tm) :
+    MachineFunctionPass(ID),
+    TII(static_cast<const AMDGPUInstrInfo*>(tm.getInstrInfo()))
+    { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const { return "R600 Handle indirect addressing"; }
+
+};
+
+} // End anonymous namespace
+
+char AMDGPUIndirectAddressingPass::ID = 0;
+
+FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) {
+  return new AMDGPUIndirectAddressingPass(tm);
+}
+
+bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  int IndirectBegin = TII->getIndirectIndexBegin(MF);
+  int IndirectEnd = TII->getIndirectIndexEnd(MF);
+
+  if (IndirectBegin == -1) {
+    // No indirect addressing, we can skip this pass
+    assert(IndirectEnd == -1);
+    return false;
+  }
+
+  // The map keeps track of the indirect address that is represented by
+  // each virtual register. The key is the register and the value is the
+  // indirect address it uses.
+  std::map<unsigned, unsigned> RegisterAddressMap;
+
+  // First pass - Lower all of the RegisterStore instructions and track which
+  // registers are live.
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                      BB != BB_E; ++BB) {
+    // This map keeps track of the current live indirect registers.
+    // The key is the address and the value is the register
+    std::map<unsigned, unsigned> LiveAddressRegisterMap;
+    MachineBasicBlock &MBB = *BB;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+                               I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+
+      if (!TII->isRegisterStore(MI)) {
+        continue;
+      }
+
+      // Lower RegisterStore
+
+      unsigned RegIndex = MI.getOperand(2).getImm();
+      unsigned Channel = MI.getOperand(3).getImm();
+      unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
+      const TargetRegisterClass *IndirectStoreRegClass =
+                   TII->getIndirectAddrStoreRegClass(MI.getOperand(0).getReg());
+
+      if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
+        // Direct register access.
+        unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
+
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg)
+                .addOperand(MI.getOperand(0));
+
+        RegisterAddressMap[DstReg] = Address;
+        LiveAddressRegisterMap[Address] = DstReg;
+      } else {
+        // Indirect register access.
+        MachineInstrBuilder MOV = TII->buildIndirectWrite(BB, I,
+                                           MI.getOperand(0).getReg(), // Value
+                                           Address,
+                                           MI.getOperand(1).getReg()); // Offset
+        for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
+          unsigned Addr = TII->calculateIndirectAddress(i, Channel);
+          unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
+          MOV.addReg(DstReg, RegState::Define | RegState::Implicit);
+          RegisterAddressMap[DstReg] = Addr;
+          LiveAddressRegisterMap[Addr] = DstReg;
+        }
+      }
+      MI.eraseFromParent();
+    }
+
+    // Update the live-ins of the succesor blocks
+    for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(),
+                                          SuccEnd = MBB.succ_end();
+                                          SuccEnd != Succ; ++Succ) {
+      std::map<unsigned, unsigned>::const_iterator Key, KeyEnd;
+      for (Key = LiveAddressRegisterMap.begin(),
+           KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) {
+        (*Succ)->addLiveIn(Key->second);
+      }
+    }
+  }
+
+  // Second pass - Lower the RegisterLoad instructions
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                      BB != BB_E; ++BB) {
+    // Key is the address and the value is the register
+    std::map<unsigned, unsigned> LiveAddressRegisterMap;
+    MachineBasicBlock &MBB = *BB;
+
+    MachineBasicBlock::livein_iterator LI = MBB.livein_begin();
+    while (LI != MBB.livein_end()) {
+      std::vector<unsigned> PhiRegisters;
+
+      // Make sure this live in is used for indirect addressing
+      if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) {
+        ++LI;
+        continue;
+      }
+
+      unsigned Address = RegisterAddressMap[*LI];
+      LiveAddressRegisterMap[Address] = *LI;
+      PhiRegisters.push_back(*LI);
+
+      // Check if there are other live in registers which map to the same
+      // indirect address.
+      for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI),
+                                              LE = MBB.livein_end();
+                                              LJ != LE; ++LJ) {
+        unsigned Reg = *LJ;
+        if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) {
+          continue;
+        }
+
+        if (RegisterAddressMap[Reg] == Address) {
+          PhiRegisters.push_back(Reg);
+        }
+      }
+
+      if (PhiRegisters.size() == 1) {
+        // We don't need to insert a Phi instruction, so we can just add the
+        // registers to the live list for the block.
+        LiveAddressRegisterMap[Address] = *LI;
+        MBB.removeLiveIn(*LI);
+      } else {
+        // We need to insert a PHI, because we have the same address being
+        // written in multiple predecessor blocks.
+        const TargetRegisterClass *PhiDstClass =
+                   TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin()));
+        unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
+        MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
+                                          MBB.findDebugLoc(MBB.begin()),
+                                          TII->get(AMDGPU::PHI), PhiDstReg);
+
+        for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(),
+                                                   RE = PhiRegisters.end();
+                                                   RI != RE; ++RI) {
+          unsigned Reg = *RI;
+          MachineInstr *DefInst = MRI.getVRegDef(Reg);
+          assert(DefInst);
+          MachineBasicBlock *RegBlock = DefInst->getParent();
+          Phi.addReg(Reg);
+          Phi.addMBB(RegBlock);
+          MBB.removeLiveIn(Reg);
+        }
+        RegisterAddressMap[PhiDstReg] = Address;
+        LiveAddressRegisterMap[Address] = PhiDstReg;
+      }
+      LI = MBB.livein_begin();
+    }
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+                               I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+
+      if (!TII->isRegisterLoad(MI)) {
+        if (MI.getOpcode() == AMDGPU::PHI) {
+          continue;
+        }
+        // Check for indirect register defs
+        for (unsigned OpIdx = 0, NumOperands = MI.getNumOperands();
+                                 OpIdx < NumOperands; ++OpIdx) {
+          MachineOperand &MO = MI.getOperand(OpIdx);
+          if (MO.isReg() && MO.isDef() &&
+              RegisterAddressMap.find(MO.getReg()) != RegisterAddressMap.end()) {
+            unsigned Reg = MO.getReg();
+            unsigned LiveAddress = RegisterAddressMap[Reg];
+            // Chain the live-ins
+            if (LiveAddressRegisterMap.find(LiveAddress) !=
+                                                     RegisterAddressMap.end()) {
+              MI.addOperand(MachineOperand::CreateReg(
+                                  LiveAddressRegisterMap[LiveAddress],
+                                  false, // isDef
+                                  true,  // isImp
+                                  true));  // isKill
+            }
+            LiveAddressRegisterMap[LiveAddress] = Reg;
+          }
+        }
+        continue;
+      }
+
+      const TargetRegisterClass *SuperIndirectRegClass =
+                                                TII->getSuperIndirectRegClass();
+      const TargetRegisterClass *IndirectLoadRegClass =
+                                             TII->getIndirectAddrLoadRegClass();
+      unsigned IndirectReg = MRI.createVirtualRegister(SuperIndirectRegClass);
+
+      unsigned RegIndex = MI.getOperand(2).getImm();
+      unsigned Channel = MI.getOperand(3).getImm();
+      unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
+
+      if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
+        // Direct register access
+        unsigned Reg = LiveAddressRegisterMap[Address];
+        unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
+
+        if (regHasExplicitDef(MRI, Reg)) {
+          // If the register we are reading from has an explicit def, then that
+          // means it was written via a direct register access (i.e. COPY
+          // or other instruction that doesn't use indirect addressing).  In
+          // this case we know where the value has been stored, so we can just
+          // issue a copy.
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
+                  MI.getOperand(0).getReg())
+                  .addReg(Reg);
+        } else {
+          // If the register we are reading has an implicit def, then that
+          // means it was written by an indirect register access (i.e. An
+          // instruction that uses indirect addressing. 
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
+                   MI.getOperand(0).getReg())
+                   .addReg(AddrReg)
+                   .addReg(Reg, RegState::Implicit);
+        }
+      } else {
+        // Indirect register access
+
+        // Note on REQ_SEQUENCE instructons: You can't actually use the register
+        // it defines unless  you have an instruction that takes the defined
+        // register class as an operand.
+
+        MachineInstrBuilder Sequence = BuildMI(MBB, I, MBB.findDebugLoc(I),
+                                               TII->get(AMDGPU::REG_SEQUENCE),
+                                               IndirectReg);
+        for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
+          unsigned Addr = TII->calculateIndirectAddress(i, Channel);
+          if (LiveAddressRegisterMap.find(Addr) == LiveAddressRegisterMap.end()) {
+            continue;
+          }
+          unsigned Reg = LiveAddressRegisterMap[Addr];
+
+          // We only need to use REG_SEQUENCE for explicit defs, since the
+          // register coalescer won't do anything with the implicit defs.
+          if (!regHasExplicitDef(MRI, Reg)) {
+            continue;
+          }
+
+          // Insert a REQ_SEQUENCE instruction to force the register allocator
+          // to allocate the virtual register to the correct physical register.
+          Sequence.addReg(LiveAddressRegisterMap[Addr]);
+          Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(Addr));
+        }
+        MachineInstrBuilder Mov = TII->buildIndirectRead(BB, I,
+                                           MI.getOperand(0).getReg(), // Value
+                                           Address,
+                                           MI.getOperand(1).getReg()); // Offset
+
+
+
+        Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill);
+        Mov.addReg(LiveAddressRegisterMap[Address], RegState::Implicit);
+
+      }
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
+
+bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI,
+                                                  unsigned Reg) const {
+  MachineInstr *DefInstr = MRI.getVRegDef(Reg);
+
+  if (!DefInstr) {
+    return false;
+  }
+
+  if (DefInstr->getOpcode() == AMDGPU::PHI) {
+    bool Explicit = false;
+    for (MachineInstr::const_mop_iterator I = DefInstr->operands_begin(),
+                                          E = DefInstr->operands_end();
+                                          I != E; ++I) {
+      const MachineOperand &MO = *I;
+      if (!MO.isReg() || MO.isDef()) {
+        continue;
+      }
+
+      Explicit = Explicit || regHasExplicitDef(MRI, MO.getReg());
+    }
+    return Explicit;
+  }
+
+  return DefInstr->getOperand(0).isReg() &&
+         DefInstr->getOperand(0).getReg() == Reg;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp
new file mode 100644
index 000000000000..30f736c84c25
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -0,0 +1,267 @@
+//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Implementation of the TargetInstrInfo class that is common to all
+/// AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define GET_INSTRINFO_CTOR
+#define GET_INSTRMAP_INFO
+#include "AMDGPUGenInstrInfo.inc"
+
+using namespace llvm;
+
+AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
+  : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { }
+
+const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                           unsigned &SrcReg, unsigned &DstReg,
+                                           unsigned &SubIdx) const {
+// TODO: Implement this function
+  return false;
+}
+
+unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+
+unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                                   int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+
+bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
+                                          const MachineMemOperand *&MMO,
+                                          int &FrameIndex) const {
+// TODO: Implement this function
+  return false;
+}
+unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI,
+                                                    int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
+                                           const MachineMemOperand *&MMO,
+                                           int &FrameIndex) const {
+// TODO: Implement this function
+  return false;
+}
+
+MachineInstr *
+AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      LiveVariables *LV) const {
+// TODO: Implement this function
+  return NULL;
+}
+bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
+                                        MachineBasicBlock &MBB) const {
+  while (iter != MBB.end()) {
+    switch (iter->getOpcode()) {
+    default:
+      break;
+    case AMDGPU::BRANCH_COND_i32:
+    case AMDGPU::BRANCH_COND_f32:
+    case AMDGPU::BRANCH:
+      return true;
+    };
+    ++iter;
+  }
+  return false;
+}
+
+MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator tmp = MBB->end();
+  if (!MBB->size()) {
+    return MBB->end();
+  }
+  while (--tmp) {
+    if (tmp->getOpcode() == AMDGPU::ENDLOOP
+        || tmp->getOpcode() == AMDGPU::ENDIF
+        || tmp->getOpcode() == AMDGPU::ELSE) {
+      if (tmp == MBB->begin()) {
+        return tmp;
+      } else {
+        continue;
+      }
+    }  else {
+      return ++tmp;
+    }
+  }
+  return MBB->end();
+}
+
+void
+AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned SrcReg, bool isKill,
+                                    int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const {
+  assert(!"Not Implemented");
+}
+
+void
+AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     unsigned DestReg, int FrameIndex,
+                                     const TargetRegisterClass *RC,
+                                     const TargetRegisterInfo *TRI) const {
+  assert(!"Not Implemented");
+}
+
+MachineInstr *
+AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+MachineInstr*
+AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr *LoadMI) const {
+  // TODO: Implement this function
+  return 0;
+}
+bool
+AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+                                     const SmallVectorImpl<unsigned> &Ops) const {
+  // TODO: Implement this function
+  return false;
+}
+bool
+AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                                 unsigned Reg, bool UnfoldLoad,
+                                 bool UnfoldStore,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  // TODO: Implement this function
+  return false;
+}
+
+bool
+AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                                    SmallVectorImpl<SDNode*> &NewNodes) const {
+  // TODO: Implement this function
+  return false;
+}
+
+unsigned
+AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                           bool UnfoldLoad, bool UnfoldStore,
+                                           unsigned *LoadRegIndex) const {
+  // TODO: Implement this function
+  return 0;
+}
+
+bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                             int64_t Offset1, int64_t Offset2,
+                                             unsigned NumLoads) const {
+  assert(Offset2 > Offset1
+         && "Second offset should be larger than first offset!");
+  // If we have less than 16 loads in a row, and the offsets are within 16,
+  // then schedule together.
+  // TODO: Make the loads schedule near if it fits in a cacheline
+  return (NumLoads < 16 && (Offset2 - Offset1) < 16);
+}
+
+bool
+AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
+  const {
+  // TODO: Implement this function
+  return true;
+}
+void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const {
+  // TODO: Implement this function
+}
+
+bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const {
+  // TODO: Implement this function
+  return false;
+}
+bool
+AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                                  const SmallVectorImpl<MachineOperand> &Pred2)
+  const {
+  // TODO: Implement this function
+  return false;
+}
+
+bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                      std::vector<MachineOperand> &Pred) const {
+  // TODO: Implement this function
+  return false;
+}
+
+bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const {
+  // TODO: Implement this function
+  return MI->getDesc().isPredicable();
+}
+
+bool
+AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  // TODO: Implement this function
+  return true;
+}
+
+bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE;
+}
+
+bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
+}
+
+
+void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
+    DebugLoc DL) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const AMDGPURegisterInfo & RI = getRegisterInfo();
+
+  for (unsigned i = 0; i < MI.getNumOperands(); i++) {
+    MachineOperand &MO = MI.getOperand(i);
+    // Convert dst regclass to one that is supported by the ISA
+    if (MO.isReg() && MO.isDef()) {
+      if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg());
+        const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass);
+
+        assert(newRegClass);
+
+        MRI.setRegClass(MO.getReg(), newRegClass);
+      }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h
new file mode 100644
index 000000000000..3909e4e105ee
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.h
@@ -0,0 +1,206 @@
+//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Contains the definition of a TargetInstrInfo class that is common
+/// to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUINSTRUCTIONINFO_H
+#define AMDGPUINSTRUCTIONINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <map>
+
+#define GET_INSTRINFO_HEADER
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT
+#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT
+#define OPCODE_IS_ZERO AMDGPU::PRED_SETE
+#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE
+
+namespace llvm {
+
+class AMDGPUTargetMachine;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
+
+class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
+private:
+  const AMDGPURegisterInfo RI;
+  bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
+                          MachineBasicBlock &MBB) const;
+protected:
+  TargetMachine &TM;
+public:
+  explicit AMDGPUInstrInfo(TargetMachine &tm);
+
+  virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
+
+  bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+                             unsigned &DstReg, unsigned &SubIdx) const;
+
+  unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                     int &FrameIndex) const;
+  bool hasLoadFromStackSlot(const MachineInstr *MI,
+                            const MachineMemOperand *&MMO,
+                            int &FrameIndex) const;
+  unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+  bool hasStoreFromStackSlot(const MachineInstr *MI,
+                             const MachineMemOperand *&MMO,
+                             int &FrameIndex) const;
+
+  MachineInstr *
+  convertToThreeAddress(MachineFunction::iterator &MFI,
+                        MachineBasicBlock::iterator &MBBI,
+                        LiveVariables *LV) const;
+
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const = 0;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const;
+
+protected:
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr *LoadMI) const;
+public:
+  bool canFoldMemoryOperand(const MachineInstr *MI,
+                            const SmallVectorImpl<unsigned> &Ops) const;
+  bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                           unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+                           SmallVectorImpl<MachineInstr *> &NewMIs) const;
+  bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                           SmallVectorImpl<SDNode *> &NewNodes) const;
+  unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                      bool UnfoldLoad, bool UnfoldStore,
+                                      unsigned *LoadRegIndex = 0) const;
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                               int64_t Offset1, int64_t Offset2,
+                               unsigned NumLoads) const;
+
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const;
+  bool isPredicated(const MachineInstr *MI) const;
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector<MachineOperand> &Pred) const;
+  bool isPredicable(MachineInstr *MI) const;
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+
+  // Helper functions that check the opcode for status information
+  bool isLoadInst(llvm::MachineInstr *MI) const;
+  bool isExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isSWSExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isSExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isZExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isAExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isStoreInst(llvm::MachineInstr *MI) const;
+  bool isTruncStoreInst(llvm::MachineInstr *MI) const;
+  bool isRegisterStore(const MachineInstr &MI) const;
+  bool isRegisterLoad(const MachineInstr &MI) const;
+
+//===---------------------------------------------------------------------===//
+// Pure virtual funtions to be implemented by sub-classes.
+//===---------------------------------------------------------------------===//
+
+  virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg,
+                                       int64_t Imm) const = 0;
+  virtual unsigned getIEQOpcode() const = 0;
+  virtual bool isMov(unsigned opcode) const = 0;
+
+  /// \returns the smallest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  virtual int getIndirectIndexBegin(const MachineFunction &MF) const = 0;
+
+  /// \returns the largest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  virtual int getIndirectIndexEnd(const MachineFunction &MF) const = 0;
+
+  /// \brief Calculate the "Indirect Address" for the given \p RegIndex and
+  ///        \p Channel
+  ///
+  /// We model indirect addressing using a virtual address space that can be
+  /// accesed with loads and stores.  The "Indirect Address" is the memory
+  /// address in this virtual address space that maps to the given \p RegIndex
+  /// and \p Channel.
+  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
+                                            unsigned Channel) const = 0;
+
+  /// \returns The register class to be used for storing values to an
+  /// "Indirect Address" .
+  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
+                                                  unsigned SourceReg) const = 0;
+
+  /// \returns The register class to be used for loading values from
+  /// an "Indirect Address" .
+  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const = 0;
+
+  /// \brief Build instruction(s) for an indirect register write.
+  ///
+  /// \returns The instruction that performs the indirect register write
+  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned ValueReg, unsigned Address,
+                                    unsigned OffsetReg) const = 0;
+
+  /// \brief Build instruction(s) for an indirect register read.
+  ///
+  /// \returns The instruction that performs the indirect register read
+  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned ValueReg, unsigned Address,
+                                    unsigned OffsetReg) const = 0;
+
+  /// \returns the register class whose sub registers are the set of all
+  /// possible registers that can be used for indirect addressing.
+  virtual const TargetRegisterClass *getSuperIndirectRegClass() const = 0;
+
+
+  /// \brief Convert the AMDIL MachineInstr to a supported ISA
+  /// MachineInstr
+  virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
+    DebugLoc DL) const;
+
+};
+
+} // End llvm namespace
+
+#define AMDGPU_FLAG_REGISTER_LOAD  (UINT64_C(1) << 63)
+#define AMDGPU_FLAG_REGISTER_STORE (UINT64_C(1) << 62)
+
+#endif // AMDGPUINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td
new file mode 100644
index 000000000000..b66ae879dc20
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstrInfo.td
@@ -0,0 +1,82 @@
+//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains DAG node defintions for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Profiles
+//===----------------------------------------------------------------------===//
+
+def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
+]>;
+
+//===----------------------------------------------------------------------===//
+// AMDGPU DAG Nodes
+//
+
+// out = ((a << 32) | b) >> c)
+//
+// Can be used to optimize rtol:
+// rotl(a, b) = bitalign(a, a, 32 - b)
+def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>;
+
+// This argument to this node is a dword address.
+def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
+
+// out = a - floor(a)
+def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+
+// out = max(a, b) a and b are floats
+def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = max(a, b) a and b are signed ints
+def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = max(a, b) a and b are unsigned ints
+def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a and b are floats
+def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a snd b are signed ints
+def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// out = min(a, b) a and b are unsigned ints
+def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
+  [SDNPCommutative, SDNPAssociative]
+>;
+
+// urecip - This operation is a helper for integer division, it returns the
+// result of 1 / a as a fractional unsigned integer.
+// out = (2^32 / a) + e
+// e is rounding error
+def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
+
+def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>;
+
+def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
+                          SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
+                          [SDNPHasChain, SDNPMayLoad]>;
+
+def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
+                           SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
+                           [SDNPHasChain, SDNPMayStore]>;
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td b/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td
new file mode 100644
index 000000000000..e740348717c7
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUInstructions.td
@@ -0,0 +1,266 @@
+//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction defs that are common to all hw codegen
+// targets.
+//
+//===----------------------------------------------------------------------===//
+
+class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
+  field bit isRegisterLoad = 0;
+  field bit isRegisterStore = 0;
+
+  let Namespace = "AMDGPU";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = NullALU;
+
+  let TSFlags{63} = isRegisterLoad;
+  let TSFlags{62} = isRegisterStore;
+}
+
+class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
+    : AMDGPUInst<outs, ins, asm, pattern> {
+
+  field bits<32> Inst = 0xffffffff;
+
+}
+
+def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
+
+def COND_EQ : PatLeaf <
+  (cond),
+  [{switch(N->get()){{default: return false;
+                     case ISD::SETOEQ: case ISD::SETUEQ:
+                     case ISD::SETEQ: return true;}}}]
+>;
+
+def COND_NE : PatLeaf <
+  (cond),
+  [{switch(N->get()){{default: return false;
+                     case ISD::SETONE: case ISD::SETUNE:
+                     case ISD::SETNE: return true;}}}]
+>;
+def COND_GT : PatLeaf <
+  (cond),
+  [{switch(N->get()){{default: return false;
+                     case ISD::SETOGT: case ISD::SETUGT:
+                     case ISD::SETGT: return true;}}}]
+>;
+
+def COND_GE : PatLeaf <
+  (cond),
+  [{switch(N->get()){{default: return false;
+                     case ISD::SETOGE: case ISD::SETUGE:
+                     case ISD::SETGE: return true;}}}]
+>;
+
+def COND_LT : PatLeaf <
+  (cond),
+  [{switch(N->get()){{default: return false;
+                     case ISD::SETOLT: case ISD::SETULT:
+                     case ISD::SETLT: return true;}}}]
+>;
+
+def COND_LE : PatLeaf <
+  (cond),
+  [{switch(N->get()){{default: return false;
+                     case ISD::SETOLE: case ISD::SETULE:
+                     case ISD::SETLE: return true;}}}]
+>;
+
+def COND_NULL : PatLeaf <
+  (cond),
+  [{return false;}]
+>;
+
+//===----------------------------------------------------------------------===//
+// Load/Store Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+class Constants {
+int TWO_PI = 0x40c90fdb;
+int PI = 0x40490fdb;
+int TWO_PI_INV = 0x3e22f983;
+}
+def CONST : Constants;
+
+def FP_ZERO : PatLeaf <
+  (fpimm),
+  [{return N->getValueAPF().isZero();}]
+>;
+
+def FP_ONE : PatLeaf <
+  (fpimm),
+  [{return N->isExactlyValue(1.0);}]
+>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+let usesCustomInserter = 1  in {
+
+class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "CLAMP $dst, $src0",
+  [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+>;
+
+class FABS <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "FABS $dst, $src0",
+  [(set rc:$dst, (fabs rc:$src0))]
+>;
+
+class FNEG <RegisterClass rc> : AMDGPUShaderInst <
+  (outs rc:$dst),
+  (ins rc:$src0),
+  "FNEG $dst, $src0",
+  [(set rc:$dst, (fneg rc:$src0))]
+>;
+
+} // usesCustomInserter = 1
+
+multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
+                    ComplexPattern addrPat> {
+  def RegisterLoad : AMDGPUShaderInst <
+    (outs dstClass:$dst),
+    (ins addrClass:$addr, i32imm:$chan),
+    "RegisterLoad $dst, $addr",
+    [(set (i32 dstClass:$dst), (AMDGPUregister_load addrPat:$addr,
+                                                    (i32 timm:$chan)))]
+  > {
+    let isRegisterLoad = 1;
+  }
+
+  def RegisterStore : AMDGPUShaderInst <
+    (outs),
+    (ins dstClass:$val, addrClass:$addr, i32imm:$chan),
+    "RegisterStore $val, $addr",
+    [(AMDGPUregister_store (i32 dstClass:$val), addrPat:$addr, (i32 timm:$chan))]
+  > {
+    let isRegisterStore = 1;
+  }
+}
+
+} // End isCodeGenOnly = 1, isPseudo = 1
+
+/* Generic helper patterns for intrinsics */
+/* -------------------------------------- */
+
+class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul,
+                  RegisterClass rc> : Pat <
+  (fpow rc:$src0, rc:$src1),
+  (exp_ieee (mul rc:$src1, (log_ieee rc:$src0)))
+>;
+
+/* Other helper patterns */
+/* --------------------- */
+
+/* Extract element pattern */
+class Extract_Element <ValueType sub_type, ValueType vec_type,
+                     RegisterClass vec_class, int sub_idx, 
+                     SubRegIndex sub_reg>: Pat<
+  (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)),
+  (EXTRACT_SUBREG vec_class:$src, sub_reg)
+>;
+
+/* Insert element pattern */
+class Insert_Element <ValueType elem_type, ValueType vec_type,
+                      RegisterClass elem_class, RegisterClass vec_class,
+                      int sub_idx, SubRegIndex sub_reg> : Pat <
+
+  (vec_type (vector_insert (vec_type vec_class:$vec),
+                           (elem_type elem_class:$elem), sub_idx)),
+  (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg)
+>;
+
+// Vector Build pattern
+class Vector1_Build <ValueType vecType, RegisterClass vectorClass,
+                     ValueType elemType, RegisterClass elemClass> : Pat <
+  (vecType (build_vector (elemType elemClass:$src))),
+  (vecType elemClass:$src)
+>;
+
+class Vector2_Build <ValueType vecType, RegisterClass vectorClass,
+                     ValueType elemType, RegisterClass elemClass> : Pat <
+  (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1))),
+  (INSERT_SUBREG (INSERT_SUBREG
+  (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1)
+>;
+
+class Vector4_Build <ValueType vecType, RegisterClass vectorClass,
+                     ValueType elemType, RegisterClass elemClass> : Pat <
+  (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y),
+                         (elemType elemClass:$z), (elemType elemClass:$w))),
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+  (vecType (IMPLICIT_DEF)), elemClass:$x, sub0), elemClass:$y, sub1),
+                            elemClass:$z, sub2), elemClass:$w, sub3)
+>;
+
+class Vector8_Build <ValueType vecType, RegisterClass vectorClass,
+                     ValueType elemType, RegisterClass elemClass> : Pat <
+  (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1),
+                         (elemType elemClass:$sub2), (elemType elemClass:$sub3),
+                         (elemType elemClass:$sub4), (elemType elemClass:$sub5),
+                         (elemType elemClass:$sub6), (elemType elemClass:$sub7))),
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+  (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1),
+                            elemClass:$sub2, sub2), elemClass:$sub3, sub3),
+                            elemClass:$sub4, sub4), elemClass:$sub5, sub5),
+                            elemClass:$sub6, sub6), elemClass:$sub7, sub7)
+>;
+
+class Vector16_Build <ValueType vecType, RegisterClass vectorClass,
+                      ValueType elemType, RegisterClass elemClass> : Pat <
+  (vecType (build_vector (elemType elemClass:$sub0), (elemType elemClass:$sub1),
+                         (elemType elemClass:$sub2), (elemType elemClass:$sub3),
+                         (elemType elemClass:$sub4), (elemType elemClass:$sub5),
+                         (elemType elemClass:$sub6), (elemType elemClass:$sub7),
+                         (elemType elemClass:$sub8), (elemType elemClass:$sub9),
+                         (elemType elemClass:$sub10), (elemType elemClass:$sub11),
+                         (elemType elemClass:$sub12), (elemType elemClass:$sub13),
+                         (elemType elemClass:$sub14), (elemType elemClass:$sub15))),
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
+  (vecType (IMPLICIT_DEF)), elemClass:$sub0, sub0), elemClass:$sub1, sub1),
+                            elemClass:$sub2, sub2), elemClass:$sub3, sub3),
+                            elemClass:$sub4, sub4), elemClass:$sub5, sub5),
+                            elemClass:$sub6, sub6), elemClass:$sub7, sub7),
+                            elemClass:$sub8, sub8), elemClass:$sub9, sub9),
+                            elemClass:$sub10, sub10), elemClass:$sub11, sub11),
+                            elemClass:$sub12, sub12), elemClass:$sub13, sub13),
+                            elemClass:$sub14, sub14), elemClass:$sub15, sub15)
+>;
+
+// bitconvert pattern
+class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat <
+  (dt (bitconvert (st rc:$src0))),
+  (dt rc:$src0)
+>;
+
+class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat <
+  (vt (AMDGPUdwordaddr (vt rc:$addr))),
+  (vt rc:$addr)
+>;
+
+include "R600Instructions.td"
+
+include "SIInstrInfo.td"
+
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td
new file mode 100644
index 000000000000..eecb25b04f79
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUIntrinsics.td
@@ -0,0 +1,60 @@
+//===-- AMDGPUIntrinsics.td - Common intrinsics  -*- tablegen -*-----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines intrinsics that are used by all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "AMDGPU", isTarget = 1 in {
+
+  def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
+  def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
+  def int_AMDGPU_kilp : Intrinsic<[], [], []>;
+  def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+  def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+}
+
+let TargetPrefix = "TGSI", isTarget = 1 in {
+
+  def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>;
+}
+
+include "SIIntrinsics.td"
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp
new file mode 100644
index 000000000000..1dc1c657dfe5
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -0,0 +1,83 @@
+//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPUMCInstLower.h"
+#include "AMDGPUAsmPrinter.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx):
+  Ctx(ctx)
+{ }
+
+void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_FPImmediate: {
+      const APFloat &FloatValue = MO.getFPImm()->getValueAPF();
+      assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle &&
+             "Only floating point immediates are supported at the moment.");
+      MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat());
+      break;
+    }
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_Register:
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                                   MO.getMBB()->getSymbol(), Ctx));
+    }
+    OutMI.addOperand(MCOp);
+  }
+}
+
+void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  AMDGPUMCInstLower MCInstLowering(OutContext);
+
+  if (MI->isBundle()) {
+    const MachineBasicBlock *MBB = MI->getParent();
+    MachineBasicBlock::const_instr_iterator I = MI;
+    ++I;
+    while (I != MBB->end() && I->isInsideBundle()) {
+      MCInst MCBundleInst;
+      const MachineInstr *BundledInst = I;
+      MCInstLowering.lower(BundledInst, MCBundleInst);
+      OutStreamer.EmitInstruction(MCBundleInst);
+      ++I;
+    }
+  } else {
+    MCInst TmpInst;
+    MCInstLowering.lower(MI, TmpInst);
+    OutStreamer.EmitInstruction(TmpInst);
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.h b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.h
new file mode 100644
index 000000000000..d7d538e92599
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUMCInstLower.h
@@ -0,0 +1,34 @@
+//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_MCINSTLOWER_H
+#define AMDGPU_MCINSTLOWER_H
+
+namespace llvm {
+
+class MCInst;
+class MCContext;
+class MachineInstr;
+
+class AMDGPUMCInstLower {
+
+  MCContext &Ctx;
+
+public:
+  AMDGPUMCInstLower(MCContext &ctx);
+
+  /// \brief Lower a MachineInstr to an MCInst
+  void lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+};
+
+} // End namespace llvm
+
+#endif //AMDGPU_MCINSTLOWER_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp
new file mode 100644
index 000000000000..0223ec8e4f3f
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -0,0 +1,22 @@
+#include "AMDGPUMachineFunction.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+
+namespace llvm {
+
+const char *AMDGPUMachineFunction::ShaderTypeAttribute = "ShaderType";
+
+AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
+    MachineFunctionInfo() {
+  AttributeSet Set = MF.getFunction()->getAttributes();
+  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
+                                 ShaderTypeAttribute);
+
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    if (Str.getAsInteger(0, ShaderType))
+      llvm_unreachable("Can't parse shader type!");
+  }
+}
+
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.h
new file mode 100644
index 000000000000..21c8c51dae45
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUMachineFunction.h
@@ -0,0 +1,29 @@
+//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUMACHINEFUNCTION_H
+#define AMDGPUMACHINEFUNCTION_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class AMDGPUMachineFunction : public MachineFunctionInfo {
+private:
+  static const char *ShaderTypeAttribute;
+public:
+  AMDGPUMachineFunction(const MachineFunction &MF);
+  unsigned ShaderType;
+};
+
+}
+#endif // AMDGPUMACHINEFUNCTION_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp
new file mode 100644
index 000000000000..fe994d2d05a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Parent TargetRegisterInfo class common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm,
+    const TargetInstrInfo &tii)
+: AMDGPUGenRegisterInfo(0),
+  TM(tm),
+  TII(tii)
+  { }
+
+//===----------------------------------------------------------------------===//
+// Function handling callbacks - Functions are a seldom used feature of GPUS, so
+// they are not supported at this time.
+//===----------------------------------------------------------------------===//
+
+const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
+
+const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  return &CalleeSavedReg;
+}
+
+void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                             int SPAdj,
+                                             unsigned FIOperandNum,
+                                             RegScavenger *RS) const {
+  assert(!"Subroutines not supported yet");
+}
+
+unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  assert(!"Subroutines not supported yet");
+  return 0;
+}
+
+unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
+
+  switch(IndirectIndex) {
+  case 0: return AMDGPU::sub0;
+  case 1: return AMDGPU::sub1;
+  case 2: return AMDGPU::sub2;
+  case 3: return AMDGPU::sub3;
+  case 4: return AMDGPU::sub4;
+  case 5: return AMDGPU::sub5;
+  case 6: return AMDGPU::sub6;
+  case 7: return AMDGPU::sub7;
+  case 8: return AMDGPU::sub8;
+  case 9: return AMDGPU::sub9;
+  case 10: return AMDGPU::sub10;
+  case 11: return AMDGPU::sub11;
+  case 12: return AMDGPU::sub12;
+  case 13: return AMDGPU::sub13;
+  case 14: return AMDGPU::sub14;
+  case 15: return AMDGPU::sub15;
+  default: llvm_unreachable("indirect index out of range");
+  }
+}
+
+#define GET_REGINFO_TARGET_DESC
+#include "AMDGPUGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h
new file mode 100644
index 000000000000..1fc88e7455b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.h
@@ -0,0 +1,66 @@
+//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief TargetRegisterInfo interface that is implemented by all hw codegen
+/// targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUREGISTERINFO_H
+#define AMDGPUREGISTERINFO_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+
+namespace llvm {
+
+class AMDGPUTargetMachine;
+class TargetInstrInfo;
+
+struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
+  TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  static const uint16_t CalleeSavedReg;
+
+  AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii);
+
+  virtual BitVector getReservedRegs(const MachineFunction &MF) const {
+    assert(!"Unimplemented");  return BitVector();
+  }
+
+  /// \param RC is an AMDIL reg class.
+  ///
+  /// \returns The ISA reg class that is equivalent to \p RC.
+  virtual const TargetRegisterClass * getISARegClass(
+                                         const TargetRegisterClass * RC) const {
+    assert(!"Unimplemented"); return NULL;
+  }
+
+  virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
+    assert(!"Unimplemented"); return NULL;
+  }
+
+  const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  unsigned getIndirectSubReg(unsigned IndirectIndex) const;
+
+};
+
+} // End namespace llvm
+
+#endif // AMDIDSAREGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.td b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.td
new file mode 100644
index 000000000000..b5aca0347fb0
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPURegisterInfo.td
@@ -0,0 +1,25 @@
+//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tablegen register definitions common to all hw codegen targets.
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "AMDGPU" in {
+
+foreach Index = 0-15 in {
+  def sub#Index : SubRegIndex;
+}
+
+def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
+
+}
+
+include "R600RegisterInfo.td"
+include "SIRegisterInfo.td"
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/contrib/llvm/lib/Target/R600/AMDGPUStructurizeCFG.cpp
new file mode 100644
index 000000000000..dea43b874c6f
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUStructurizeCFG.cpp
@@ -0,0 +1,896 @@
+//===-- AMDGPUStructurizeCFG.cpp -  ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// The pass implemented in this file transforms the programs control flow
+/// graph into a form that's suitable for code generation on hardware that
+/// implements control flow by execution masking. This currently includes all
+/// AMD GPUs but may as well be useful for other types of hardware.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Support/PatternMatch.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace {
+
+// Definition of the complex types used in this pass.
+
+typedef std::pair<BasicBlock *, Value *> BBValuePair;
+
+typedef SmallVector<RegionNode*, 8> RNVector;
+typedef SmallVector<BasicBlock*, 8> BBVector;
+typedef SmallVector<BranchInst*, 8> BranchVector;
+typedef SmallVector<BBValuePair, 2> BBValueVector;
+
+typedef SmallPtrSet<BasicBlock *, 8> BBSet;
+
+typedef MapVector<PHINode *, BBValueVector> PhiMap;
+typedef MapVector<BasicBlock *, BBVector> BB2BBVecMap;
+
+typedef DenseMap<DomTreeNode *, unsigned> DTN2UnsignedMap;
+typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap;
+typedef DenseMap<BasicBlock *, Value *> BBPredicates;
+typedef DenseMap<BasicBlock *, BBPredicates> PredMap;
+typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap;
+
+// The name for newly created blocks.
+
+static const char *FlowBlockName = "Flow";
+
+/// @brief Find the nearest common dominator for multiple BasicBlocks
+///
+/// Helper class for AMDGPUStructurizeCFG
+/// TODO: Maybe move into common code
+class NearestCommonDominator {
+
+  DominatorTree *DT;
+
+  DTN2UnsignedMap IndexMap;
+
+  BasicBlock *Result;
+  unsigned ResultIndex;
+  bool ExplicitMentioned;
+
+public:
+  /// \brief Start a new query
+  NearestCommonDominator(DominatorTree *DomTree) {
+    DT = DomTree;
+    Result = 0;
+  }
+
+  /// \brief Add BB to the resulting dominator
+  void addBlock(BasicBlock *BB, bool Remember = true) {
+
+    DomTreeNode *Node = DT->getNode(BB);
+
+    if (Result == 0) {
+      unsigned Numbering = 0;
+      for (;Node;Node = Node->getIDom())
+        IndexMap[Node] = ++Numbering;
+      Result = BB;
+      ResultIndex = 1;
+      ExplicitMentioned = Remember;
+      return;
+    }
+
+    for (;Node;Node = Node->getIDom())
+      if (IndexMap.count(Node))
+        break;
+      else
+        IndexMap[Node] = 0;
+
+    assert(Node && "Dominator tree invalid!");
+
+    unsigned Numbering = IndexMap[Node];
+    if (Numbering > ResultIndex) {
+      Result = Node->getBlock();
+      ResultIndex = Numbering;
+      ExplicitMentioned = Remember && (Result == BB);
+    } else if (Numbering == ResultIndex) {
+      ExplicitMentioned |= Remember;
+    }
+  }
+
+  /// \brief Is "Result" one of the BBs added with "Remember" = True?
+  bool wasResultExplicitMentioned() {
+    return ExplicitMentioned;
+  }
+
+  /// \brief Get the query result
+  BasicBlock *getResult() {
+    return Result;
+  }
+};
+
+/// @brief Transforms the control flow graph on one single entry/exit region
+/// at a time.
+///
+/// After the transform all "If"/"Then"/"Else" style control flow looks like
+/// this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 |
+/// | /
+/// |/   
+/// 3
+/// ||   Where:
+/// | |  1 = "If" block, calculates the condition
+/// 4 |  2 = "Then" subregion, runs if the condition is true
+/// | /  3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
+/// |/   4 = "Else" optional subregion, runs if the condition is false
+/// 5    5 = "End" block, also rejoins the control flow
+/// \endverbatim
+///
+/// Control flow is expressed as a branch where the true exit goes into the
+/// "Then"/"Else" region, while the false exit skips the region
+/// The condition for the optional "Else" region is expressed as a PHI node.
+/// The incomming values of the PHI node are true for the "If" edge and false
+/// for the "Then" edge.
+///
+/// Additionally to that even complicated loops look like this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 ^  Where:
+/// | /  1 = "Entry" block
+/// |/   2 = "Loop" optional subregion, with all exits at "Flow" block
+/// 3    3 = "Flow" block, with back edge to entry block
+/// |
+/// \endverbatim
+///
+/// The back edge of the "Flow" block is always on the false side of the branch
+/// while the true side continues the general flow. So the loop condition
+/// consist of a network of PHI nodes where the true incoming values expresses
+/// breaks and the false values expresses continue states.
+class AMDGPUStructurizeCFG : public RegionPass {
+
+  static char ID;
+
+  Type *Boolean;
+  ConstantInt *BoolTrue;
+  ConstantInt *BoolFalse;
+  UndefValue *BoolUndef;
+
+  Function *Func;
+  Region *ParentRegion;
+
+  DominatorTree *DT;
+
+  RNVector Order;
+  BBSet Visited;
+
+  BBPhiMap DeletedPhis;
+  BB2BBVecMap AddedPhis;
+
+  PredMap Predicates;
+  BranchVector Conditions;
+
+  BB2BBMap Loops;
+  PredMap LoopPreds;
+  BranchVector LoopConds;
+
+  RegionNode *PrevNode;
+
+  void orderNodes();
+
+  void analyzeLoops(RegionNode *N);
+
+  Value *invert(Value *Condition);
+
+  Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+
+  void gatherPredicates(RegionNode *N);
+
+  void collectInfos();
+
+  void insertConditions(bool Loops);
+
+  void delPhiValues(BasicBlock *From, BasicBlock *To);
+
+  void addPhiValues(BasicBlock *From, BasicBlock *To);
+
+  void setPhiValues();
+
+  void killTerminator(BasicBlock *BB);
+
+  void changeExit(RegionNode *Node, BasicBlock *NewExit,
+                  bool IncludeDominator);
+
+  BasicBlock *getNextFlow(BasicBlock *Dominator);
+
+  BasicBlock *needPrefix(bool NeedEmpty);
+
+  BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
+
+  void setPrevNode(BasicBlock *BB);
+
+  bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
+
+  bool isPredictableTrue(RegionNode *Node);
+
+  void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+  void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+  void createFlow();
+
+  void rebuildSSA();
+
+public:
+  AMDGPUStructurizeCFG():
+    RegionPass(ID) {
+
+    initializeRegionInfoPass(*PassRegistry::getPassRegistry());
+  }
+
+  using Pass::doInitialization;
+  virtual bool doInitialization(Region *R, RGPassManager &RGM);
+
+  virtual bool runOnRegion(Region *R, RGPassManager &RGM);
+
+  virtual const char *getPassName() const {
+    return "AMDGPU simplify control flow";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+
+    AU.addRequired<DominatorTree>();
+    AU.addPreserved<DominatorTree>();
+    RegionPass::getAnalysisUsage(AU);
+  }
+
+};
+
+} // end anonymous namespace
+
+char AMDGPUStructurizeCFG::ID = 0;
+
+/// \brief Initialize the types and constants used in the pass
+bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
+  LLVMContext &Context = R->getEntry()->getContext();
+
+  Boolean = Type::getInt1Ty(Context);
+  BoolTrue = ConstantInt::getTrue(Context);
+  BoolFalse = ConstantInt::getFalse(Context);
+  BoolUndef = UndefValue::get(Boolean);
+
+  return false;
+}
+
+/// \brief Build up the general order of nodes
+void AMDGPUStructurizeCFG::orderNodes() {
+  scc_iterator<Region *> I = scc_begin(ParentRegion),
+                         E = scc_end(ParentRegion);
+  for (Order.clear(); I != E; ++I) {
+    std::vector<RegionNode *> &Nodes = *I;
+    Order.append(Nodes.begin(), Nodes.end());
+  }
+}
+
+/// \brief Determine the end of the loops
+void AMDGPUStructurizeCFG::analyzeLoops(RegionNode *N) {
+
+  if (N->isSubRegion()) {
+    // Test for exit as back edge
+    BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
+    if (Visited.count(Exit))
+      Loops[Exit] = N->getEntry();
+
+  } else {
+    // Test for sucessors as back edge
+    BasicBlock *BB = N->getNodeAs<BasicBlock>();
+    BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+    for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+      BasicBlock *Succ = Term->getSuccessor(i);
+
+      if (Visited.count(Succ))
+        Loops[Succ] = BB;
+    }
+  }
+}
+
+/// \brief Invert the given condition
+Value *AMDGPUStructurizeCFG::invert(Value *Condition) {
+
+  // First: Check if it's a constant
+  if (Condition == BoolTrue)
+    return BoolFalse;
+
+  if (Condition == BoolFalse)
+    return BoolTrue;
+
+  if (Condition == BoolUndef)
+    return BoolUndef;
+
+  // Second: If the condition is already inverted, return the original value
+  if (match(Condition, m_Not(m_Value(Condition))))
+    return Condition;
+
+  // Third: Check all the users for an invert
+  BasicBlock *Parent = cast<Instruction>(Condition)->getParent();
+  for (Value::use_iterator I = Condition->use_begin(),
+       E = Condition->use_end(); I != E; ++I) {
+
+    Instruction *User = dyn_cast<Instruction>(*I);
+    if (!User || User->getParent() != Parent)
+      continue;
+
+    if (match(*I, m_Not(m_Specific(Condition))))
+      return *I;
+  }
+
+  // Last option: Create a new instruction
+  return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
+}
+
+/// \brief Build the condition for one edge
+Value *AMDGPUStructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+                                            bool Invert) {
+  Value *Cond = Invert ? BoolFalse : BoolTrue;
+  if (Term->isConditional()) {
+    Cond = Term->getCondition();
+
+    if (Idx != Invert)
+      Cond = invert(Cond);
+  }
+  return Cond;
+}
+
+/// \brief Analyze the predecessors of each block and build up predicates
+void AMDGPUStructurizeCFG::gatherPredicates(RegionNode *N) {
+
+  RegionInfo *RI = ParentRegion->getRegionInfo();
+  BasicBlock *BB = N->getEntry();
+  BBPredicates &Pred = Predicates[BB];
+  BBPredicates &LPred = LoopPreds[BB];
+
+  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+       PI != PE; ++PI) {
+
+    // Ignore it if it's a branch from outside into our region entry
+    if (!ParentRegion->contains(*PI))
+      continue;
+
+    Region *R = RI->getRegionFor(*PI);
+    if (R == ParentRegion) {
+
+      // It's a top level block in our region
+      BranchInst *Term = cast<BranchInst>((*PI)->getTerminator());
+      for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+        BasicBlock *Succ = Term->getSuccessor(i);
+        if (Succ != BB)
+          continue;
+
+        if (Visited.count(*PI)) {
+          // Normal forward edge
+          if (Term->isConditional()) {
+            // Try to treat it like an ELSE block
+            BasicBlock *Other = Term->getSuccessor(!i);
+            if (Visited.count(Other) && !Loops.count(Other) &&
+                !Pred.count(Other) && !Pred.count(*PI)) {
+
+              Pred[Other] = BoolFalse;
+              Pred[*PI] = BoolTrue;
+              continue;
+            }
+          }
+          Pred[*PI] = buildCondition(Term, i, false);
+ 
+        } else {
+          // Back edge
+          LPred[*PI] = buildCondition(Term, i, true);
+        }
+      }
+
+    } else {
+
+      // It's an exit from a sub region
+      while(R->getParent() != ParentRegion)
+        R = R->getParent();
+
+      // Edge from inside a subregion to its entry, ignore it
+      if (R == N)
+        continue;
+
+      BasicBlock *Entry = R->getEntry();
+      if (Visited.count(Entry))
+        Pred[Entry] = BoolTrue;
+      else
+        LPred[Entry] = BoolFalse;
+    }
+  }
+}
+
+/// \brief Collect various loop and predicate infos
+void AMDGPUStructurizeCFG::collectInfos() {
+
+  // Reset predicate
+  Predicates.clear();
+
+  // and loop infos
+  Loops.clear();
+  LoopPreds.clear();
+
+  // Reset the visited nodes
+  Visited.clear();
+
+  for (RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend();
+       OI != OE; ++OI) {
+
+    // Analyze all the conditions leading to a node
+    gatherPredicates(*OI);
+
+    // Remember that we've seen this node
+    Visited.insert((*OI)->getEntry());
+
+    // Find the last back edges
+    analyzeLoops(*OI);
+  }
+}
+
+/// \brief Insert the missing branch conditions
+void AMDGPUStructurizeCFG::insertConditions(bool Loops) {
+  BranchVector &Conds = Loops ? LoopConds : Conditions;
+  Value *Default = Loops ? BoolTrue : BoolFalse;
+  SSAUpdater PhiInserter;
+
+  for (BranchVector::iterator I = Conds.begin(),
+       E = Conds.end(); I != E; ++I) {
+
+    BranchInst *Term = *I;
+    assert(Term->isConditional());
+
+    BasicBlock *Parent = Term->getParent();
+    BasicBlock *SuccTrue = Term->getSuccessor(0);
+    BasicBlock *SuccFalse = Term->getSuccessor(1);
+
+    PhiInserter.Initialize(Boolean, "");
+    PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
+    PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
+
+    BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
+
+    NearestCommonDominator Dominator(DT);
+    Dominator.addBlock(Parent, false);
+
+    Value *ParentValue = 0;
+    for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
+         PI != PE; ++PI) {
+
+      if (PI->first == Parent) {
+        ParentValue = PI->second;
+        break;
+      }
+      PhiInserter.AddAvailableValue(PI->first, PI->second);
+      Dominator.addBlock(PI->first);
+    }
+
+    if (ParentValue) {
+      Term->setCondition(ParentValue);
+    } else {
+      if (!Dominator.wasResultExplicitMentioned())
+        PhiInserter.AddAvailableValue(Dominator.getResult(), Default);
+
+      Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
+    }
+  }
+}
+
+/// \brief Remove all PHI values coming from "From" into "To" and remember
+/// them in DeletedPhis
+void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
+  PhiMap &Map = DeletedPhis[To];
+  for (BasicBlock::iterator I = To->begin(), E = To->end();
+       I != E && isa<PHINode>(*I);) {
+
+    PHINode &Phi = cast<PHINode>(*I++);
+    while (Phi.getBasicBlockIndex(From) != -1) {
+      Value *Deleted = Phi.removeIncomingValue(From, false);
+      Map[&Phi].push_back(std::make_pair(From, Deleted));
+    }
+  }
+}
+
+/// \brief Add a dummy PHI value as soon as we knew the new predecessor
+void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
+  for (BasicBlock::iterator I = To->begin(), E = To->end();
+       I != E && isa<PHINode>(*I);) {
+
+    PHINode &Phi = cast<PHINode>(*I++);
+    Value *Undef = UndefValue::get(Phi.getType());
+    Phi.addIncoming(Undef, From);
+  }
+  AddedPhis[To].push_back(From);
+}
+
+/// \brief Add the real PHI value as soon as everything is set up
+void AMDGPUStructurizeCFG::setPhiValues() {
+
+  SSAUpdater Updater;
+  for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end();
+       AI != AE; ++AI) {
+
+    BasicBlock *To = AI->first;
+    BBVector &From = AI->second;
+
+    if (!DeletedPhis.count(To))
+      continue;
+
+    PhiMap &Map = DeletedPhis[To];
+    for (PhiMap::iterator PI = Map.begin(), PE = Map.end();
+         PI != PE; ++PI) {
+
+      PHINode *Phi = PI->first;
+      Value *Undef = UndefValue::get(Phi->getType());
+      Updater.Initialize(Phi->getType(), "");
+      Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+      Updater.AddAvailableValue(To, Undef);
+
+      NearestCommonDominator Dominator(DT);
+      Dominator.addBlock(To, false);
+      for (BBValueVector::iterator VI = PI->second.begin(),
+           VE = PI->second.end(); VI != VE; ++VI) {
+
+        Updater.AddAvailableValue(VI->first, VI->second);
+        Dominator.addBlock(VI->first);
+      }
+
+      if (!Dominator.wasResultExplicitMentioned())
+        Updater.AddAvailableValue(Dominator.getResult(), Undef);
+
+      for (BBVector::iterator FI = From.begin(), FE = From.end();
+           FI != FE; ++FI) {
+
+        int Idx = Phi->getBasicBlockIndex(*FI);
+        assert(Idx != -1);
+        Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(*FI));
+      }
+    }
+
+    DeletedPhis.erase(To);
+  }
+  assert(DeletedPhis.empty());
+}
+
+/// \brief Remove phi values from all successors and then remove the terminator.
+void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) {
+  TerminatorInst *Term = BB->getTerminator();
+  if (!Term)
+    return;
+
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+       SI != SE; ++SI) {
+
+    delPhiValues(BB, *SI);
+  }
+
+  Term->eraseFromParent();
+}
+
+/// \brief Let node exit(s) point to NewExit
+void AMDGPUStructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
+                                      bool IncludeDominator) {
+
+  if (Node->isSubRegion()) {
+    Region *SubRegion = Node->getNodeAs<Region>();
+    BasicBlock *OldExit = SubRegion->getExit();
+    BasicBlock *Dominator = 0;
+
+    // Find all the edges from the sub region to the exit
+    for (pred_iterator I = pred_begin(OldExit), E = pred_end(OldExit);
+         I != E;) {
+
+      BasicBlock *BB = *I++;
+      if (!SubRegion->contains(BB))
+        continue;
+
+      // Modify the edges to point to the new exit
+      delPhiValues(BB, OldExit);
+      BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
+      addPhiValues(BB, NewExit);
+
+      // Find the new dominator (if requested)
+      if (IncludeDominator) {
+        if (!Dominator)
+          Dominator = BB;
+        else
+          Dominator = DT->findNearestCommonDominator(Dominator, BB);
+      }
+    }
+
+    // Change the dominator (if requested)
+    if (Dominator)
+      DT->changeImmediateDominator(NewExit, Dominator);
+
+    // Update the region info
+    SubRegion->replaceExit(NewExit);
+
+  } else {
+    BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+    killTerminator(BB);
+    BranchInst::Create(NewExit, BB);
+    addPhiValues(BB, NewExit);
+    if (IncludeDominator)
+      DT->changeImmediateDominator(NewExit, BB);
+  }
+}
+
+/// \brief Create a new flow node and update dominator tree and region info
+BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Dominator) {
+  LLVMContext &Context = Func->getContext();
+  BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
+                       Order.back()->getEntry();
+  BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
+                                        Func, Insert);
+  DT->addNewBlock(Flow, Dominator);
+  ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
+  return Flow;
+}
+
+/// \brief Create a new or reuse the previous node as flow node
+BasicBlock *AMDGPUStructurizeCFG::needPrefix(bool NeedEmpty) {
+
+  BasicBlock *Entry = PrevNode->getEntry();
+
+  if (!PrevNode->isSubRegion()) {
+    killTerminator(Entry);
+    if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
+      return Entry;
+
+  } 
+
+  // create a new flow node
+  BasicBlock *Flow = getNextFlow(Entry);
+
+  // and wire it up
+  changeExit(PrevNode, Flow, true);
+  PrevNode = ParentRegion->getBBNode(Flow);
+  return Flow;
+}
+
+/// \brief Returns the region exit if possible, otherwise just a new flow node
+BasicBlock *AMDGPUStructurizeCFG::needPostfix(BasicBlock *Flow,
+                                              bool ExitUseAllowed) {
+
+  if (Order.empty() && ExitUseAllowed) {
+    BasicBlock *Exit = ParentRegion->getExit();
+    DT->changeImmediateDominator(Exit, Flow);
+    addPhiValues(Flow, Exit);
+    return Exit;
+  }
+  return getNextFlow(Flow);
+}
+
+/// \brief Set the previous node
+void AMDGPUStructurizeCFG::setPrevNode(BasicBlock *BB) {
+  PrevNode =  ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0;
+}
+
+/// \brief Does BB dominate all the predicates of Node ?
+bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
+  BBPredicates &Preds = Predicates[Node->getEntry()];
+  for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
+       PI != PE; ++PI) {
+
+    if (!DT->dominates(BB, PI->first))
+      return false;
+  }
+  return true;
+}
+
+/// \brief Can we predict that this node will always be called?
+bool AMDGPUStructurizeCFG::isPredictableTrue(RegionNode *Node) {
+
+  BBPredicates &Preds = Predicates[Node->getEntry()];
+  bool Dominated = false;
+
+  // Regionentry is always true
+  if (PrevNode == 0)
+    return true;
+
+  for (BBPredicates::iterator I = Preds.begin(), E = Preds.end();
+       I != E; ++I) {
+
+    if (I->second != BoolTrue)
+      return false;
+
+    if (!Dominated && DT->dominates(I->first, PrevNode->getEntry()))
+      Dominated = true;
+  }
+
+  // TODO: The dominator check is too strict
+  return Dominated;
+}
+
+/// Take one node from the order vector and wire it up
+void AMDGPUStructurizeCFG::wireFlow(bool ExitUseAllowed,
+                                    BasicBlock *LoopEnd) {
+
+  RegionNode *Node = Order.pop_back_val();
+  Visited.insert(Node->getEntry());
+
+  if (isPredictableTrue(Node)) {
+    // Just a linear flow
+    if (PrevNode) {
+      changeExit(PrevNode, Node->getEntry(), true);
+    }
+    PrevNode = Node;
+
+  } else {
+    // Insert extra prefix node (or reuse last one)
+    BasicBlock *Flow = needPrefix(false);
+
+    // Insert extra postfix node (or use exit instead)
+    BasicBlock *Entry = Node->getEntry();
+    BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
+
+    // let it point to entry and next block
+    Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
+    addPhiValues(Flow, Entry);
+    DT->changeImmediateDominator(Entry, Flow);
+
+    PrevNode = Node;
+    while (!Order.empty() && !Visited.count(LoopEnd) &&
+           dominatesPredicates(Entry, Order.back())) {
+      handleLoops(false, LoopEnd);
+    }
+
+    changeExit(PrevNode, Next, false);
+    setPrevNode(Next);
+  }
+}
+
+void AMDGPUStructurizeCFG::handleLoops(bool ExitUseAllowed,
+                                       BasicBlock *LoopEnd) {
+  RegionNode *Node = Order.back();
+  BasicBlock *LoopStart = Node->getEntry();
+
+  if (!Loops.count(LoopStart)) {
+    wireFlow(ExitUseAllowed, LoopEnd);
+    return;
+  }
+
+  if (!isPredictableTrue(Node))
+    LoopStart = needPrefix(true);
+
+  LoopEnd = Loops[Node->getEntry()];
+  wireFlow(false, LoopEnd);
+  while (!Visited.count(LoopEnd)) {
+    handleLoops(false, LoopEnd);
+  }
+
+  // Create an extra loop end node
+  LoopEnd = needPrefix(false);
+  BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
+  LoopConds.push_back(BranchInst::Create(Next, LoopStart,
+                                         BoolUndef, LoopEnd));
+  addPhiValues(LoopEnd, LoopStart);
+  setPrevNode(Next);
+}
+
+/// After this function control flow looks like it should be, but
+/// branches and PHI nodes only have undefined conditions.
+void AMDGPUStructurizeCFG::createFlow() {
+
+  BasicBlock *Exit = ParentRegion->getExit();
+  bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
+
+  DeletedPhis.clear();
+  AddedPhis.clear();
+  Conditions.clear();
+  LoopConds.clear();
+
+  PrevNode = 0;
+  Visited.clear();
+
+  while (!Order.empty()) {
+    handleLoops(EntryDominatesExit, 0);
+  }
+
+  if (PrevNode)
+    changeExit(PrevNode, Exit, EntryDominatesExit);
+  else
+    assert(EntryDominatesExit);
+}
+
+/// Handle a rare case where the disintegrated nodes instructions
+/// no longer dominate all their uses. Not sure if this is really nessasary
+void AMDGPUStructurizeCFG::rebuildSSA() {
+  SSAUpdater Updater;
+  for (Region::block_iterator I = ParentRegion->block_begin(),
+                              E = ParentRegion->block_end();
+       I != E; ++I) {
+
+    BasicBlock *BB = *I;
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
+         II != IE; ++II) {
+
+      bool Initialized = false;
+      for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) {
+
+        Next = I->getNext();
+
+        Instruction *User = cast<Instruction>(I->getUser());
+        if (User->getParent() == BB) {
+          continue;
+
+        } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+          if (UserPN->getIncomingBlock(*I) == BB)
+            continue;
+        }
+
+        if (DT->dominates(II, User))
+          continue;
+
+        if (!Initialized) {
+          Value *Undef = UndefValue::get(II->getType());
+          Updater.Initialize(II->getType(), "");
+          Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+          Updater.AddAvailableValue(BB, II);
+          Initialized = true;
+        }
+        Updater.RewriteUseAfterInsertions(*I);
+      }
+    }
+  }
+}
+
+/// \brief Run the transformation for each region found
+bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
+  if (R->isTopLevelRegion())
+    return false;
+
+  Func = R->getEntry()->getParent();
+  ParentRegion = R;
+
+  DT = &getAnalysis<DominatorTree>();
+
+  orderNodes();
+  collectInfos();
+  createFlow();
+  insertConditions(false);
+  insertConditions(true);
+  setPhiValues();
+  rebuildSSA();
+
+  // Cleanup
+  Order.clear();
+  Visited.clear();
+  DeletedPhis.clear();
+  AddedPhis.clear();
+  Predicates.clear();
+  Conditions.clear();
+  Loops.clear();
+  LoopPreds.clear();
+  LoopConds.clear();
+
+  return true;
+}
+
+/// \brief Create the pass
+Pass *llvm::createAMDGPUStructurizeCFGPass() {
+  return new AMDGPUStructurizeCFG();
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp
new file mode 100644
index 000000000000..0f356a1c3f11
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -0,0 +1,87 @@
+//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSubtarget.h"
+
+using namespace llvm;
+
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "AMDGPUGenSubtargetInfo.inc"
+
+AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
+  AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) {
+    InstrItins = getInstrItineraryForCPU(CPU);
+
+  memset(CapsOverride, 0, sizeof(*CapsOverride)
+      * AMDGPUDeviceInfo::MaxNumberCapabilities);
+  // Default card
+  StringRef GPU = CPU;
+  Is64bit = false;
+  DefaultSize[0] = 64;
+  DefaultSize[1] = 1;
+  DefaultSize[2] = 1;
+  ParseSubtargetFeatures(GPU, FS);
+  DevName = GPU;
+  Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit);
+}
+
+AMDGPUSubtarget::~AMDGPUSubtarget() {
+  delete Device;
+}
+
+bool
+AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const {
+  assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities &&
+      "Caps index is out of bounds!");
+  return CapsOverride[caps];
+}
+bool
+AMDGPUSubtarget::is64bit() const  {
+  return Is64bit;
+}
+bool
+AMDGPUSubtarget::isTargetELF() const {
+  return false;
+}
+size_t
+AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
+  if (dim > 3) {
+    return 1;
+  } else {
+    return DefaultSize[dim];
+  }
+}
+
+std::string
+AMDGPUSubtarget::getDataLayout() const {
+    if (!Device) {
+        return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
+                "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
+                "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
+                "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
+                "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64");
+    }
+    return Device->getDataLayout();
+}
+
+std::string
+AMDGPUSubtarget::getDeviceName() const {
+  return DevName;
+}
+const AMDGPUDevice *
+AMDGPUSubtarget::device() const {
+  return Device;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h
new file mode 100644
index 000000000000..1973fc6d544c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUSubtarget.h
@@ -0,0 +1,65 @@
+//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUSUBTARGET_H
+#define AMDGPUSUBTARGET_H
+#include "AMDILDevice.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#define MAX_CB_SIZE (1 << 16)
+
+namespace llvm {
+
+class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+private:
+  bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities];
+  const AMDGPUDevice *Device;
+  size_t DefaultSize[3];
+  std::string DevName;
+  bool Is64bit;
+  bool Is32on64bit;
+  bool DumpCode;
+  bool R600ALUInst;
+
+  InstrItineraryData InstrItins;
+
+public:
+  AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
+  virtual ~AMDGPUSubtarget();
+
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+  virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  bool isOverride(AMDGPUDeviceInfo::Caps) const;
+  bool is64bit() const;
+
+  // Helper functions to simplify if statements
+  bool isTargetELF() const;
+  const AMDGPUDevice* device() const;
+  std::string getDataLayout() const;
+  std::string getDeviceName() const;
+  virtual size_t getDefaultSize(uint32_t dim) const;
+  bool dumpCode() const { return DumpCode; }
+  bool r600ALUEncoding() const { return R600ALUInst; }
+
+};
+
+} // End namespace llvm
+
+#endif // AMDGPUSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp
new file mode 100644
index 000000000000..e7ea876e2abb
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -0,0 +1,164 @@
+//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The AMDGPU target machine contains all of the hardware specific
+/// information  needed to emit code for R600 and SI GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
+#include "R600ISelLowering.h"
+#include "R600InstrInfo.h"
+#include "R600MachineScheduler.h"
+#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include <llvm/CodeGen/Passes.h>
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeR600Target() {
+  // Register the target
+  RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget);
+}
+
+static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
+  return new ScheduleDAGMI(C, new R600SchedStrategy());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("r600", "Run R600's custom scheduler",
+                    createR600MachineScheduler);
+
+AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
+    StringRef CPU, StringRef FS,
+  TargetOptions Options,
+  Reloc::Model RM, CodeModel::Model CM,
+  CodeGenOpt::Level OptLevel
+)
+:
+  LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
+  Subtarget(TT, CPU, FS),
+  Layout(Subtarget.getDataLayout()),
+  FrameLowering(TargetFrameLowering::StackGrowsUp,
+      Subtarget.device()->getStackAlignment(), 0),
+  IntrinsicInfo(this),
+  InstrItins(&Subtarget.getInstrItineraryData()) {
+  // TLInfo uses InstrInfo so it must be initialized after.
+  if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+    InstrInfo = new R600InstrInfo(*this);
+    TLInfo = new R600TargetLowering(*this);
+  } else {
+    InstrInfo = new SIInstrInfo(*this);
+    TLInfo = new SITargetLowering(*this);
+  }
+}
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine() {
+}
+
+namespace {
+class AMDGPUPassConfig : public TargetPassConfig {
+public:
+  AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {
+    const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+    if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+      enablePass(&MachineSchedulerID);
+      MachineSchedRegistry::setDefault(createR600MachineScheduler);
+    }
+  }
+
+  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
+    return getTM<AMDGPUTargetMachine>();
+  }
+
+  virtual bool addPreISel();
+  virtual bool addInstSelector();
+  virtual bool addPreRegAlloc();
+  virtual bool addPostRegAlloc();
+  virtual bool addPreSched2();
+  virtual bool addPreEmitPass();
+};
+} // End of anonymous namespace
+
+TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new AMDGPUPassConfig(this, PM);
+}
+
+bool
+AMDGPUPassConfig::addPreISel() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+    addPass(createAMDGPUStructurizeCFGPass());
+    addPass(createSIAnnotateControlFlowPass());
+  }
+  return false;
+}
+
+bool AMDGPUPassConfig::addInstSelector() {
+  addPass(createAMDGPUPeepholeOpt(*TM));
+  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
+
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+    // This callbacks this pass uses are not implemented yet on SI.
+    addPass(createAMDGPUIndirectAddressingPass(*TM));
+  }
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreRegAlloc() {
+  addPass(createAMDGPUConvertToISAPass(*TM));
+  return false;
+}
+
+bool AMDGPUPassConfig::addPostRegAlloc() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
+  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+    addPass(createSIInsertWaits(*TM));
+  }
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreSched2() {
+
+  addPass(&IfConverterID);
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreEmitPass() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+    addPass(createAMDGPUCFGPreparationPass(*TM));
+    addPass(createAMDGPUCFGStructurizerPass(*TM));
+    addPass(createR600EmitClauseMarkers(*TM));
+    addPass(createR600ExpandSpecialInstrsPass(*TM));
+    addPass(createR600ControlFlowFinalizer(*TM));
+    addPass(&FinalizeMachineBundlesID);
+  } else {
+    addPass(createSILowerControlFlowPass(*TM));
+  }
+
+  return false;
+}
+
diff --git a/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h
new file mode 100644
index 000000000000..2afe7873a90c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDGPUTargetMachine.h
@@ -0,0 +1,70 @@
+//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_TARGET_MACHINE_H
+#define AMDGPU_TARGET_MACHINE_H
+
+#include "AMDGPUFrameLowering.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDILIntrinsicInfo.h"
+#include "R600ISelLowering.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/IR/DataLayout.h"
+
+namespace llvm {
+
+MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT);
+
+class AMDGPUTargetMachine : public LLVMTargetMachine {
+
+  AMDGPUSubtarget Subtarget;
+  const DataLayout Layout;
+  AMDGPUFrameLowering FrameLowering;
+  AMDGPUIntrinsicInfo IntrinsicInfo;
+  const AMDGPUInstrInfo * InstrInfo;
+  AMDGPUTargetLowering * TLInfo;
+  const InstrItineraryData* InstrItins;
+
+public:
+   AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
+                       StringRef CPU,
+                       TargetOptions Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+   ~AMDGPUTargetMachine();
+   virtual const AMDGPUFrameLowering* getFrameLowering() const {
+     return &FrameLowering;
+   }
+   virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const {
+     return &IntrinsicInfo;
+   }
+   virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;}
+   virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; }
+   virtual const AMDGPURegisterInfo *getRegisterInfo() const {
+      return &InstrInfo->getRegisterInfo();
+   }
+   virtual AMDGPUTargetLowering * getTargetLowering() const {
+      return TLInfo;
+   }
+   virtual const InstrItineraryData* getInstrItineraryData() const {
+      return InstrItins;
+   }
+   virtual const DataLayout* getDataLayout() const { return &Layout; }
+   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+};
+
+} // End namespace llvm
+
+#endif // AMDGPU_TARGET_MACHINE_H
diff --git a/contrib/llvm/lib/Target/R600/AMDIL.h b/contrib/llvm/lib/Target/R600/AMDIL.h
new file mode 100644
index 000000000000..39ab664d1018
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDIL.h
@@ -0,0 +1,121 @@
+//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// This file contains the entry points for global functions defined in the LLVM
+/// AMDGPU back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDIL_H
+#define AMDIL_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define ARENA_SEGMENT_RESERVED_UAVS 12
+#define DEFAULT_ARENA_UAV_ID 8
+#define DEFAULT_RAW_UAV_ID 7
+#define GLOBAL_RETURN_RAW_UAV_ID 11
+#define HW_MAX_NUM_CB 8
+#define MAX_NUM_UNIQUE_UAVS 8
+#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8
+#define OPENCL_MAX_READ_IMAGES 128
+#define OPENCL_MAX_WRITE_IMAGES 8
+#define OPENCL_MAX_SAMPLERS 16
+
+// The next two values can never be zero, as zero is the ID that is
+// used to assert against.
+#define DEFAULT_LDS_ID     1
+#define DEFAULT_GDS_ID     1
+#define DEFAULT_SCRATCH_ID 1
+#define DEFAULT_VEC_SLOTS  8
+
+#define OCL_DEVICE_RV710        0x0001
+#define OCL_DEVICE_RV730        0x0002
+#define OCL_DEVICE_RV770        0x0004
+#define OCL_DEVICE_CEDAR        0x0008
+#define OCL_DEVICE_REDWOOD      0x0010
+#define OCL_DEVICE_JUNIPER      0x0020
+#define OCL_DEVICE_CYPRESS      0x0040
+#define OCL_DEVICE_CAICOS       0x0080
+#define OCL_DEVICE_TURKS        0x0100
+#define OCL_DEVICE_BARTS        0x0200
+#define OCL_DEVICE_CAYMAN       0x0400
+#define OCL_DEVICE_ALL          0x3FFF
+
+/// The number of function ID's that are reserved for 
+/// internal compiler usage.
+const unsigned int RESERVED_FUNCS = 1024;
+
+namespace llvm {
+class AMDGPUInstrPrinter;
+class FunctionPass;
+class MCAsmInfo;
+class raw_ostream;
+class Target;
+class TargetMachine;
+
+// Instruction selection passes.
+FunctionPass*
+  createAMDGPUISelDag(TargetMachine &TM);
+FunctionPass*
+  createAMDGPUPeepholeOpt(TargetMachine &TM);
+
+// Pre emit passes.
+FunctionPass*
+  createAMDGPUCFGPreparationPass(TargetMachine &TM);
+FunctionPass*
+  createAMDGPUCFGStructurizerPass(TargetMachine &TM);
+
+extern Target TheAMDGPUTarget;
+} // end namespace llvm;
+
+// Include device information enumerations
+#include "AMDILDeviceInfo.h"
+
+namespace llvm {
+/// OpenCL uses address spaces to differentiate between
+/// various memory regions on the hardware. On the CPU
+/// all of the address spaces point to the same memory,
+/// however on the GPU, each address space points to
+/// a seperate piece of memory that is unique from other
+/// memory locations.
+namespace AMDGPUAS {
+enum AddressSpaces {
+  PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
+  GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
+  CONSTANT_ADDRESS = 2, ///< Address space for constant memory
+  LOCAL_ADDRESS    = 3, ///< Address space for local memory.
+  REGION_ADDRESS   = 4, ///< Address space for region memory.
+  ADDRESS_NONE     = 5, ///< Address space for unknown memory.
+  PARAM_D_ADDRESS  = 6, ///< Address space for direct addressible parameter memory (CONST0)
+  PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressible parameter memory (VTX1)
+  CONSTANT_BUFFER_0 = 8,
+  CONSTANT_BUFFER_1 = 9,
+  CONSTANT_BUFFER_2 = 10,
+  CONSTANT_BUFFER_3 = 11,
+  CONSTANT_BUFFER_4 = 12,
+  CONSTANT_BUFFER_5 = 13,
+  CONSTANT_BUFFER_6 = 14,
+  CONSTANT_BUFFER_7 = 15,
+  CONSTANT_BUFFER_8 = 16,
+  CONSTANT_BUFFER_9 = 17,
+  CONSTANT_BUFFER_10 = 18,
+  CONSTANT_BUFFER_11 = 19,
+  CONSTANT_BUFFER_12 = 20,
+  CONSTANT_BUFFER_13 = 21,
+  CONSTANT_BUFFER_14 = 22,
+  CONSTANT_BUFFER_15 = 23,
+  LAST_ADDRESS     = 24
+};
+
+} // namespace AMDGPUAS
+
+} // end namespace llvm
+#endif // AMDIL_H
diff --git a/contrib/llvm/lib/Target/R600/AMDIL7XXDevice.cpp b/contrib/llvm/lib/Target/R600/AMDIL7XXDevice.cpp
new file mode 100644
index 000000000000..ea6ac34f570c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDIL7XXDevice.cpp
@@ -0,0 +1,115 @@
+//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// \file
+//==-----------------------------------------------------------------------===//
+#include "AMDIL7XXDevice.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDILDevice.h"
+
+using namespace llvm;
+
+AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) {
+  setCaps();
+  std::string name = mSTM->getDeviceName();
+  if (name == "rv710") {
+    DeviceFlag = OCL_DEVICE_RV710;
+  } else if (name == "rv730") {
+    DeviceFlag = OCL_DEVICE_RV730;
+  } else {
+    DeviceFlag = OCL_DEVICE_RV770;
+  }
+}
+
+AMDGPU7XXDevice::~AMDGPU7XXDevice() {
+}
+
+void AMDGPU7XXDevice::setCaps() {
+  mSWBits.set(AMDGPUDeviceInfo::LocalMem);
+}
+
+size_t AMDGPU7XXDevice::getMaxLDSSize() const {
+  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_700;
+  }
+  return 0;
+}
+
+size_t AMDGPU7XXDevice::getWavefrontSize() const {
+  return AMDGPUDevice::HalfWavefrontSize;
+}
+
+uint32_t AMDGPU7XXDevice::getGeneration() const {
+  return AMDGPUDeviceInfo::HD4XXX;
+}
+
+uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const {
+  switch (DeviceID) {
+  default:
+    assert(0 && "ID type passed in is unknown!");
+    break;
+  case GLOBAL_ID:
+  case CONSTANT_ID:
+  case RAW_UAV_ID:
+  case ARENA_UAV_ID:
+    break;
+  case LDS_ID:
+    if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
+      return DEFAULT_LDS_ID;
+    }
+    break;
+  case SCRATCH_ID:
+    if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
+      return DEFAULT_SCRATCH_ID;
+    }
+    break;
+  case GDS_ID:
+    assert(0 && "GDS UAV ID is not supported on this chip");
+    if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
+      return DEFAULT_GDS_ID;
+    }
+    break;
+  };
+
+  return 0;
+}
+
+uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const {
+  return 1;
+}
+
+AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) {
+  setCaps();
+}
+
+AMDGPU770Device::~AMDGPU770Device() {
+}
+
+void AMDGPU770Device::setCaps() {
+  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
+    mSWBits.set(AMDGPUDeviceInfo::FMA);
+    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
+  }
+  mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
+  mHWBits.reset(AMDGPUDeviceInfo::LongOps);
+  mSWBits.set(AMDGPUDeviceInfo::LongOps);
+  mSWBits.set(AMDGPUDeviceInfo::LocalMem);
+}
+
+size_t AMDGPU770Device::getWavefrontSize() const {
+  return AMDGPUDevice::WavefrontSize;
+}
+
+AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) {
+}
+
+AMDGPU710Device::~AMDGPU710Device() {
+}
+
+size_t AMDGPU710Device::getWavefrontSize() const {
+  return AMDGPUDevice::QuarterWavefrontSize;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDIL7XXDevice.h b/contrib/llvm/lib/Target/R600/AMDIL7XXDevice.h
new file mode 100644
index 000000000000..1cf4ca415a4c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDIL7XXDevice.h
@@ -0,0 +1,72 @@
+//==-- AMDIL7XXDevice.h - Define 7XX Device Device for AMDIL ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+/// \file
+/// \brief Interface for the subtarget data classes.
+///
+/// This file will define the interface that each generation needs to
+/// implement in order to correctly answer queries on the capabilities of the
+/// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef AMDIL7XXDEVICEIMPL_H
+#define AMDIL7XXDEVICEIMPL_H
+#include "AMDILDevice.h"
+
+namespace llvm {
+class AMDGPUSubtarget;
+
+//===----------------------------------------------------------------------===//
+// 7XX generation of devices and their respective sub classes
+//===----------------------------------------------------------------------===//
+
+/// \brief The AMDGPU7XXDevice class represents the generic 7XX device.
+///
+/// All 7XX devices are derived from this class. The AMDGPU7XX device will only
+/// support the minimal features that are required to be considered OpenCL 1.0
+/// compliant and nothing more.
+class AMDGPU7XXDevice : public AMDGPUDevice {
+public:
+  AMDGPU7XXDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPU7XXDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual size_t getWavefrontSize() const;
+  virtual uint32_t getGeneration() const;
+  virtual uint32_t getResourceID(uint32_t DeviceID) const;
+  virtual uint32_t getMaxNumUAVs() const;
+
+protected:
+  virtual void setCaps();
+};
+
+/// \brief The AMDGPU770Device class represents the RV770 chip and it's
+/// derivative cards.
+///
+/// The difference between this device and the base class is this device device
+/// adds support for double precision and has a larger wavefront size.
+class AMDGPU770Device : public AMDGPU7XXDevice {
+public:
+  AMDGPU770Device(AMDGPUSubtarget *ST);
+  virtual ~AMDGPU770Device();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+};
+
+/// \brief The AMDGPU710Device class derives from the 7XX base class.
+///
+/// This class is a smaller derivative, so we need to overload some of the
+/// functions in order to correctly specify this information.
+class AMDGPU710Device : public AMDGPU7XXDevice {
+public:
+  AMDGPU710Device(AMDGPUSubtarget *ST);
+  virtual ~AMDGPU710Device();
+  virtual size_t getWavefrontSize() const;
+};
+
+} // namespace llvm
+#endif // AMDILDEVICEIMPL_H
diff --git a/contrib/llvm/lib/Target/R600/AMDILBase.td b/contrib/llvm/lib/Target/R600/AMDILBase.td
new file mode 100644
index 000000000000..c12cedcf7fd5
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILBase.td
@@ -0,0 +1,85 @@
+//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+// Dummy Instruction itineraries for pseudo instructions
+def ALU_NULL : FuncUnit;
+def NullALU : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// AMDIL Subtarget features.
+//===----------------------------------------------------------------------===//
+def FeatureFP64     : SubtargetFeature<"fp64",
+        "CapsOverride[AMDGPUDeviceInfo::DoubleOps]",
+        "true",
+        "Enable 64bit double precision operations">;
+def FeatureByteAddress    : SubtargetFeature<"byte_addressable_store",
+        "CapsOverride[AMDGPUDeviceInfo::ByteStores]",
+        "true",
+        "Enable byte addressable stores">;
+def FeatureBarrierDetect : SubtargetFeature<"barrier_detect",
+        "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]",
+        "true",
+        "Enable duplicate barrier detection(HD5XXX or later).">;
+def FeatureImages : SubtargetFeature<"images",
+        "CapsOverride[AMDGPUDeviceInfo::Images]",
+        "true",
+        "Enable image functions">;
+def FeatureMultiUAV : SubtargetFeature<"multi_uav",
+        "CapsOverride[AMDGPUDeviceInfo::MultiUAV]",
+        "true",
+        "Generate multiple UAV code(HD5XXX family or later)">;
+def FeatureMacroDB : SubtargetFeature<"macrodb",
+        "CapsOverride[AMDGPUDeviceInfo::MacroDB]",
+        "true",
+        "Use internal macrodb, instead of macrodb in driver">;
+def FeatureNoAlias : SubtargetFeature<"noalias",
+        "CapsOverride[AMDGPUDeviceInfo::NoAlias]",
+        "true",
+        "assert that all kernel argument pointers are not aliased">;
+def FeatureNoInline : SubtargetFeature<"no-inline",
+        "CapsOverride[AMDGPUDeviceInfo::NoInline]",
+        "true",
+        "specify whether to not inline functions">;
+
+def Feature64BitPtr : SubtargetFeature<"64BitPtr",
+        "Is64bit",
+        "false",
+        "Specify if 64bit addressing should be used.">;
+
+def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
+        "Is32on64bit",
+        "false",
+        "Specify if 64bit sized pointers with 32bit addressing should be used.">;
+def FeatureDebug : SubtargetFeature<"debug",
+        "CapsOverride[AMDGPUDeviceInfo::Debug]",
+        "true",
+        "Debug mode is enabled, so disable hardware accelerated address spaces.">;
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+        "DumpCode",
+        "true",
+        "Dump MachineInstrs in the CodeEmitter">;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+        "R600ALUInst",
+        "false",
+        "Older version of ALU instructions encoding.">;
+
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+
+include "AMDILRegisterInfo.td"
+include "AMDILInstrInfo.td"
+
diff --git a/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp
new file mode 100644
index 000000000000..b0cd0f9756a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -0,0 +1,3051 @@
+//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+
+#define DEBUGME 0
+#define DEBUG_TYPE "structcfg"
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDIL.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+// TODO: move-begin.
+
+//===----------------------------------------------------------------------===//
+//
+// Statistics for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+
+STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
+    "matched");
+STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
+    "matched");
+STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
+    "pattern matched");
+STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
+    "pattern matched");
+STATISTIC(numLoopPatternMatch,      "CFGStructurizer number of loop pattern "
+    "matched");
+STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
+STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
+
+//===----------------------------------------------------------------------===//
+//
+// Miscellaneous utility for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+namespace llvmCFGStruct {
+#define SHOWNEWINSTR(i) \
+  if (DEBUGME) errs() << "New instr: " << *i << "\n"
+
+#define SHOWNEWBLK(b, msg) \
+if (DEBUGME) { \
+  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  errs() << "\n"; \
+}
+
+#define SHOWBLK_DETAIL(b, msg) \
+if (DEBUGME) { \
+  if (b) { \
+  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  b->print(errs()); \
+  errs() << "\n"; \
+  } \
+}
+
+#define INVALIDSCCNUM -1
+#define INVALIDREGNUM 0
+
+template<class LoopinfoT>
+void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
+  for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
+       iterEnd = LoopInfo.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->print(OS, 0);
+  }
+}
+
+template<class NodeT>
+void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
+  size_t sz = Src.size();
+  for (size_t i = 0; i < sz/2; ++i) {
+    NodeT *t = Src[i];
+    Src[i] = Src[sz - i - 1];
+    Src[sz - i - 1] = t;
+  }
+}
+
+} //end namespace llvmCFGStruct
+
+//===----------------------------------------------------------------------===//
+//
+// supporting data structure for CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct {
+template<class PassT>
+struct CFGStructTraits {
+};
+
+template <class InstrT>
+class BlockInformation {
+public:
+  bool isRetired;
+  int  sccNum;
+  //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
+  //Instructions defining the corresponding successor.
+  BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
+};
+
+template <class BlockT, class InstrT, class RegiT>
+class LandInformation {
+public:
+  BlockT *landBlk;
+  std::set<RegiT> breakInitRegs;  //Registers that need to "reg = 0", before
+                                  //WHILELOOP(thisloop) init before entering
+                                  //thisloop.
+  std::set<RegiT> contInitRegs;   //Registers that need to "reg = 0", after
+                                  //WHILELOOP(thisloop) init after entering
+                                  //thisloop.
+  std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop
+                                     //land block, branch cond on this reg.
+  std::set<RegiT> breakOnRegs;       //registers that need to "if (reg) break
+                                     //endif" after ENDLOOP(thisloop) break
+                                     //outerLoopOf(thisLoop).
+  std::set<RegiT> contOnRegs;       //registers that need to "if (reg) continue
+                                    //endif" after ENDLOOP(thisloop) continue on
+                                    //outerLoopOf(thisLoop).
+  LandInformation() : landBlk(NULL) {}
+};
+
+} //end of namespace llvmCFGStruct
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct {
+// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
+template<class PassT>
+class  CFGStructurizer {
+public:
+  typedef enum {
+    Not_SinglePath = 0,
+    SinglePath_InPath = 1,
+    SinglePath_NotInPath = 2
+  } PathToKind;
+
+public:
+  typedef typename PassT::InstructionType         InstrT;
+  typedef typename PassT::FunctionType            FuncT;
+  typedef typename PassT::DominatortreeType       DomTreeT;
+  typedef typename PassT::PostDominatortreeType   PostDomTreeT;
+  typedef typename PassT::DomTreeNodeType         DomTreeNodeT;
+  typedef typename PassT::LoopinfoType            LoopInfoT;
+
+  typedef GraphTraits<FuncT *>                    FuncGTraits;
+  //typedef FuncGTraits::nodes_iterator BlockIterator;
+  typedef typename FuncT::iterator                BlockIterator;
+
+  typedef typename FuncGTraits::NodeType          BlockT;
+  typedef GraphTraits<BlockT *>                   BlockGTraits;
+  typedef GraphTraits<Inverse<BlockT *> >         InvBlockGTraits;
+  //typedef BlockGTraits::succ_iterator InstructionIterator;
+  typedef typename BlockT::iterator               InstrIterator;
+
+  typedef CFGStructTraits<PassT>                  CFGTraits;
+  typedef BlockInformation<InstrT>                BlockInfo;
+  typedef std::map<BlockT *, BlockInfo *>         BlockInfoMap;
+
+  typedef int                                     RegiT;
+  typedef typename PassT::LoopType                LoopT;
+  typedef LandInformation<BlockT, InstrT, RegiT>  LoopLandInfo;
+        typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap;
+        //landing info for loop break
+  typedef SmallVector<BlockT *, 32>               BlockTSmallerVector;
+
+public:
+  CFGStructurizer();
+  ~CFGStructurizer();
+
+  /// Perform the CFG structurization
+  bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
+
+  /// Perform the CFG preparation
+  bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
+
+private:
+  void reversePredicateSetter(typename BlockT::iterator);
+  void   orderBlocks();
+  void   printOrderedBlocks(llvm::raw_ostream &OS);
+  int patternMatch(BlockT *CurBlock);
+  int patternMatchGroup(BlockT *CurBlock);
+
+  int serialPatternMatch(BlockT *CurBlock);
+  int ifPatternMatch(BlockT *CurBlock);
+  int switchPatternMatch(BlockT *CurBlock);
+  int loopendPatternMatch(BlockT *CurBlock);
+  int loopPatternMatch(BlockT *CurBlock);
+
+  int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
+  int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
+  //int loopWithoutBreak(BlockT *);
+
+  void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
+                        BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
+  void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
+                           BlockT *ContBlock, LoopT *contLoop);
+  bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
+  int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                       BlockT *FalseBlock);
+  int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
+                          BlockT *FalseBlock);
+  int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                              BlockT *FalseBlock, BlockT **LandBlockPtr);
+  void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                                   BlockT *FalseBlock, BlockT *LandBlock,
+                                   bool Detail = false);
+  PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
+                          bool AllowSideEntry = true);
+  BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
+                        bool AllowSideEntry = true);
+  int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
+  void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
+
+  void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
+                            BlockT *TrueBlock, BlockT *FalseBlock,
+                            BlockT *LandBlock);
+  void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
+  void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
+                           BlockT *ExitLandBlock, RegiT SetReg);
+  void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
+                           RegiT SetReg);
+  BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
+                                std::set<BlockT*> &ExitBlockSet,
+                                BlockT *ExitLandBlk);
+  BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
+                                BlockTSmallerVector &ExitingBlocks,
+                                BlockTSmallerVector &ExitBlocks);
+  BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
+  void removeUnconditionalBranch(BlockT *SrcBlock);
+  void removeRedundantConditionalBranch(BlockT *SrcBlock);
+  void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks);
+
+  void removeSuccessor(BlockT *SrcBlock);
+  BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
+  BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
+
+  void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
+                          InstrIterator InsertPos);
+
+  void recordSccnum(BlockT *SrcBlock, int SCCNum);
+  int getSCCNum(BlockT *srcBlk);
+
+  void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
+  bool isRetiredBlock(BlockT *SrcBlock);
+  bool isActiveLoophead(BlockT *CurBlock);
+  bool needMigrateBlock(BlockT *Block);
+
+  BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
+                              BlockTSmallerVector &exitBlocks,
+                              std::set<BlockT*> &ExitBlockSet);
+  void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
+  BlockT *getLoopLandBlock(LoopT *LoopRep);
+  LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
+
+  void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
+
+  bool hasBackEdge(BlockT *curBlock);
+  unsigned getLoopDepth  (LoopT *LoopRep);
+  int countActiveBlock(
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart,
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd);
+    BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
+  BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
+
+private:
+  DomTreeT *domTree;
+  PostDomTreeT *postDomTree;
+  LoopInfoT *loopInfo;
+  PassT *passRep;
+  FuncT *funcRep;
+
+  BlockInfoMap blockInfoMap;
+  LoopLandInfoMap loopLandInfoMap;
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
+  const AMDGPURegisterInfo *TRI;
+
+};  //template class CFGStructurizer
+
+template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
+  : domTree(NULL), postDomTree(NULL), loopInfo(NULL) {
+}
+
+template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
+  for (typename BlockInfoMap::iterator I = blockInfoMap.begin(),
+       E = blockInfoMap.end(); I != E; ++I) {
+    delete I->second;
+  }
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass,
+                                     const AMDGPURegisterInfo * tri) {
+  passRep = &pass;
+  funcRep = &func;
+  TRI = tri;
+
+  bool changed = false;
+
+  //FIXME: if not reducible flow graph, make it so ???
+
+  if (DEBUGME) {
+        errs() << "AMDGPUCFGStructurizer::prepare\n";
+  }
+
+  loopInfo = CFGTraits::getLoopInfo(pass);
+  if (DEBUGME) {
+    errs() << "LoopInfo:\n";
+    PrintLoopinfo(*loopInfo, errs());
+  }
+
+  orderBlocks();
+  if (DEBUGME) {
+    errs() << "Ordered blocks:\n";
+    printOrderedBlocks(errs());
+  }
+
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
+
+  for (typename LoopInfoT::iterator iter = loopInfo->begin(),
+       iterEnd = loopInfo->end();
+       iter != iterEnd; ++iter) {
+    LoopT* loopRep = (*iter);
+    BlockTSmallerVector exitingBlks;
+    loopRep->getExitingBlocks(exitingBlks);
+    
+    if (exitingBlks.size() == 0) {
+      BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
+      if (dummyExitBlk != NULL)
+        retBlks.push_back(dummyExitBlk);
+    }
+  }
+
+  // Remove unconditional branch instr.
+  // Add dummy exit block iff there are multiple returns.
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+       iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
+       iterBlk != iterEndBlk;
+       ++iterBlk) {
+    BlockT *curBlk = *iterBlk;
+    removeUnconditionalBranch(curBlk);
+    removeRedundantConditionalBranch(curBlk);
+    if (CFGTraits::isReturnBlock(curBlk)) {
+      retBlks.push_back(curBlk);
+    }
+    assert(curBlk->succ_size() <= 2);
+  } //for
+
+  if (retBlks.size() >= 2) {
+    addDummyExitBlock(retBlks);
+    changed = true;
+  }
+
+  return changed;
+} //CFGStructurizer::prepare
+
+template<class PassT>
+bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass,
+    const AMDGPURegisterInfo * tri) {
+  passRep = &pass;
+  funcRep = &func;
+  TRI = tri;
+
+  //Assume reducible CFG...
+  if (DEBUGME) {
+    errs() << "AMDGPUCFGStructurizer::run\n";
+    func.viewCFG();
+  }
+
+  domTree = CFGTraits::getDominatorTree(pass);
+  if (DEBUGME) {
+    domTree->print(errs(), (const llvm::Module*)0);
+  }
+
+  postDomTree = CFGTraits::getPostDominatorTree(pass);
+  if (DEBUGME) {
+    postDomTree->print(errs());
+  }
+
+  loopInfo = CFGTraits::getLoopInfo(pass);
+  if (DEBUGME) {
+    errs() << "LoopInfo:\n";
+    PrintLoopinfo(*loopInfo, errs());
+  }
+
+  orderBlocks();
+#ifdef STRESSTEST
+  //Use the worse block ordering to test the algorithm.
+  ReverseVector(orderedBlks);
+#endif
+
+  if (DEBUGME) {
+    errs() << "Ordered blocks:\n";
+    printOrderedBlocks(errs());
+  }
+  int numIter = 0;
+  bool finish = false;
+  BlockT *curBlk;
+  bool makeProgress = false;
+  int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
+                                        orderedBlks.end());
+
+  do {
+    ++numIter;
+    if (DEBUGME) {
+      errs() << "numIter = " << numIter
+             << ", numRemaintedBlk = " << numRemainedBlk << "\n";
+    }
+
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlk = orderedBlks.begin();
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlkEnd = orderedBlks.end();
+
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      sccBeginIter = iterBlk;
+    BlockT *sccBeginBlk = NULL;
+    int sccNumBlk = 0;  // The number of active blocks, init to a
+                        // maximum possible number.
+    int sccNumIter;     // Number of iteration in this SCC.
+
+    while (iterBlk != iterBlkEnd) {
+      curBlk = *iterBlk;
+
+      if (sccBeginBlk == NULL) {
+        sccBeginIter = iterBlk;
+        sccBeginBlk = curBlk;
+        sccNumIter = 0;
+        sccNumBlk = numRemainedBlk; // Init to maximum possible number.
+        if (DEBUGME) {
+              errs() << "start processing SCC" << getSCCNum(sccBeginBlk);
+              errs() << "\n";
+        }
+      }
+
+      if (!isRetiredBlock(curBlk)) {
+        patternMatch(curBlk);
+      }
+
+      ++iterBlk;
+
+      bool contNextScc = true;
+      if (iterBlk == iterBlkEnd
+          || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
+        // Just finish one scc.
+        ++sccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
+          if (DEBUGME) {
+            errs() << "Can't reduce SCC " << getSCCNum(curBlk)
+                   << ", sccNumIter = " << sccNumIter;
+            errs() << "doesn't make any progress\n";
+          }
+          contNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
+          sccNumBlk = sccRemainedNumBlk;
+          iterBlk = sccBeginIter;
+          contNextScc = false;
+          if (DEBUGME) {
+            errs() << "repeat processing SCC" << getSCCNum(curBlk)
+                   << "sccNumIter = " << sccNumIter << "\n";
+            func.viewCFG();
+          }
+        } else {
+          // Finish the current scc.
+          contNextScc = true;
+        }
+      } else {
+        // Continue on next component in the current scc.
+        contNextScc = false;
+      }
+
+      if (contNextScc) {
+        sccBeginBlk = NULL;
+      }
+    } //while, "one iteration" over the function.
+
+    BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
+    if (entryBlk->succ_size() == 0) {
+      finish = true;
+      if (DEBUGME) {
+        errs() << "Reduce to one block\n";
+      }
+    } else {
+      int newnumRemainedBlk
+        = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
+      // consider cloned blocks ??
+      if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
+        makeProgress = true;
+        numRemainedBlk = newnumRemainedBlk;
+      } else {
+        makeProgress = false;
+        if (DEBUGME) {
+          errs() << "No progress\n";
+        }
+      }
+    }
+  } while (!finish && makeProgress);
+
+  // Misc wrap up to maintain the consistency of the Function representation.
+  CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
+
+  // Detach retired Block, release memory.
+  for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
+       iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
+    if ((*iterMap).second && (*iterMap).second->isRetired) {
+      assert(((*iterMap).first)->getNumber() != -1);
+      if (DEBUGME) {
+        errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
+      }
+      (*iterMap).first->eraseFromParent();  //Remove from the parent Function.
+    }
+    delete (*iterMap).second;
+  }
+  blockInfoMap.clear();
+
+  // clear loopLandInfoMap
+  for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
+       iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
+    delete (*iterMap).second;
+  }
+  loopLandInfoMap.clear();
+
+  if (DEBUGME) {
+    func.viewCFG();
+  }
+
+  if (!finish) {
+    assert(!"IRREDUCIBL_CF");
+  }
+
+  return true;
+} //CFGStructurizer::run
+
+/// Print the ordered Blocks.
+///
+template<class PassT>
+void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
+  size_t i = 0;
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
+       iterBlk != iterBlkEnd;
+       ++iterBlk, ++i) {
+    os << "BB" << (*iterBlk)->getNumber();
+    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+    if (i != 0 && i % 10 == 0) {
+      os << "\n";
+    } else {
+      os << " ";
+    }
+  }
+} //printOrderedBlocks
+
+/// Compute the reversed DFS post order of Blocks
+///
+template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
+  int sccNum = 0;
+  BlockT *bb;
+  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
+       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
+    std::vector<BlockT *> &sccNext = *sccIter;
+    for (typename std::vector<BlockT *>::const_iterator
+         blockIter = sccNext.begin(), blockEnd = sccNext.end();
+         blockIter != blockEnd; ++blockIter) {
+      bb = *blockIter;
+      orderedBlks.push_back(bb);
+      recordSccnum(bb, sccNum);
+    }
+  }
+
+  //walk through all the block in func to check for unreachable
+  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
+       blockEnd1 = FuncGTraits::nodes_end(funcRep);
+       blockIter1 != blockEnd1; ++blockIter1) {
+    BlockT *bb = &(*blockIter1);
+    sccNum = getSCCNum(bb);
+    if (sccNum == INVALIDSCCNUM) {
+      errs() << "unreachable block BB" << bb->getNumber() << "\n";
+    }
+  }
+} //orderBlocks
+
+template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
+  int numMatch = 0;
+  int curMatch;
+
+  if (DEBUGME) {
+        errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
+  }
+
+  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
+    numMatch += curMatch;
+  }
+
+  if (DEBUGME) {
+        errs() << "End patternMatch BB" << curBlk->getNumber()
+      << ", numMatch = " << numMatch << "\n";
+  }
+
+  return numMatch;
+} //patternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
+  int numMatch = 0;
+  numMatch += serialPatternMatch(curBlk);
+  numMatch += ifPatternMatch(curBlk);
+  numMatch += loopendPatternMatch(curBlk);
+  numMatch += loopPatternMatch(curBlk);
+  return numMatch;
+}//patternMatchGroup
+
+template<class PassT>
+int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 1) {
+    return 0;
+  }
+
+  BlockT *childBlk = *curBlk->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
+    return 0;
+  }
+
+  mergeSerialBlock(curBlk, childBlk);
+  ++numSerialPatternMatch;
+  return 1;
+} //serialPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
+  //two edges
+  if (curBlk->succ_size() != 2) {
+    return 0;
+  }
+
+  if (hasBackEdge(curBlk)) {
+    return 0;
+  }
+
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
+  if (branchInstr == NULL) {
+    return 0;
+  }
+
+  assert(CFGTraits::isCondBranch(branchInstr));
+
+  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
+  BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
+  BlockT *landBlk;
+  int cloned = 0;
+
+  // TODO: Simplify
+  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
+    && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
+    landBlk = *trueBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
+    landBlk = NULL;
+  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
+    landBlk = falseBlk;
+    falseBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && *falseBlk->succ_begin() == trueBlk) {
+    landBlk = trueBlk;
+    trueBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
+    landBlk = *falseBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 1
+    && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
+    landBlk = *trueBlk->succ_begin();
+  } else {
+    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
+  }
+
+  // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
+  // new BB created for landBlk==NULL may introduce new challenge to the
+  // reduction process.
+  if (landBlk != NULL &&
+      ((trueBlk && trueBlk->pred_size() > 1)
+      || (falseBlk && falseBlk->pred_size() > 1))) {
+     cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
+  }
+
+  if (trueBlk && trueBlk->pred_size() > 1) {
+    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
+    ++cloned;
+  }
+
+  if (falseBlk && falseBlk->pred_size() > 1) {
+    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
+    ++cloned;
+  }
+
+  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
+
+  ++numIfPatternMatch;
+
+  numClonedBlock += cloned;
+
+  return 1 + cloned;
+} //ifPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
+  return 0;
+} //switchPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  typename std::vector<LoopT *> nestedLoops;
+  while (loopRep) {
+    nestedLoops.push_back(loopRep);
+    loopRep = loopRep->getParentLoop();
+  }
+
+  if (nestedLoops.size() == 0) {
+    return 0;
+  }
+
+  // Process nested loop outside->inside, so "continue" to a outside loop won't
+  // be mistaken as "break" of the current loop.
+  int num = 0;
+  for (typename std::vector<LoopT *>::reverse_iterator
+       iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
+       iter != iterEnd; ++iter) {
+    loopRep = *iter;
+
+    if (getLoopLandBlock(loopRep) != NULL) {
+      continue;
+    }
+
+    BlockT *loopHeader = loopRep->getHeader();
+
+    int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
+
+    if (numBreak == -1) {
+      break;
+    }
+
+    int numCont = loopcontPatternMatch(loopRep, loopHeader);
+    num += numBreak + numCont;
+  }
+
+  return num;
+} //loopendPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 0) {
+    return 0;
+  }
+
+  int numLoop = 0;
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  while (loopRep && loopRep->getHeader() == curBlk) {
+    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
+    if (loopLand) {
+      BlockT *landBlk = loopLand->landBlk;
+      assert(landBlk);
+      if (!isRetiredBlock(landBlk)) {
+        mergeLooplandBlock(curBlk, loopLand);
+        ++numLoop;
+      }
+    }
+    loopRep = loopRep->getParentLoop();
+  }
+
+  numLoopPatternMatch += numLoop;
+
+  return numLoop;
+} //loopPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
+                                                  BlockT *loopHeader) {
+  BlockTSmallerVector exitingBlks;
+  loopRep->getExitingBlocks(exitingBlks);
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
+  }
+
+  if (exitingBlks.size() == 0) {
+    setLoopLandBlock(loopRep);
+    return 0;
+  }
+
+  // Compute the corresponding exitBlks and exit block set.
+  BlockTSmallerVector exitBlks;
+  std::set<BlockT *> exitBlkSet;
+  for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
+       iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *exitingBlk = *iter;
+    BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+    exitBlks.push_back(exitBlk);
+    exitBlkSet.insert(exitBlk);  //non-duplicate insert
+  }
+
+  assert(exitBlkSet.size() > 0);
+  assert(exitBlks.size() == exitingBlks.size());
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
+  }
+
+  // Find exitLandBlk.
+  BlockT *exitLandBlk = NULL;
+  int numCloned = 0;
+  int numSerial = 0;
+
+  if (exitBlkSet.size() == 1) {
+    exitLandBlk = *exitBlkSet.begin();
+  } else {
+    exitLandBlk = findNearestCommonPostDom(exitBlkSet);
+
+    if (exitLandBlk == NULL) {
+      return -1;
+    }
+
+    bool allInPath = true;
+    bool allNotInPath = true;
+    for (typename std::set<BlockT*>::const_iterator
+         iter = exitBlkSet.begin(),
+         iterEnd = exitBlkSet.end();
+         iter != iterEnd; ++iter) {
+      BlockT *exitBlk = *iter;
+
+      PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
+      if (DEBUGME) {
+        errs() << "BB" << exitBlk->getNumber()
+               << " to BB" << exitLandBlk->getNumber() << " PathToKind="
+               << pathKind << "\n";
+      }
+
+      allInPath = allInPath && (pathKind == SinglePath_InPath);
+      allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
+
+      if (!allInPath && !allNotInPath) {
+        if (DEBUGME) {
+              errs() << "singlePath check fail\n";
+        }
+        return -1;
+      }
+    } // check all exit blocks
+
+    if (allNotInPath) {
+
+      // TODO: Simplify, maybe separate function?
+      LoopT *parentLoopRep = loopRep->getParentLoop();
+      BlockT *parentLoopHeader = NULL;
+      if (parentLoopRep)
+        parentLoopHeader = parentLoopRep->getHeader();
+
+      if (exitLandBlk == parentLoopHeader &&
+          (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
+                                               loopRep,
+                                               exitBlkSet,
+                                               exitLandBlk)) != NULL) {
+        if (DEBUGME) {
+          errs() << "relocateLoopcontBlock success\n";
+        }
+      } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
+                                                      exitingBlks,
+                                                      exitBlks)) != NULL) {
+        if (DEBUGME) {
+          errs() << "insertEndbranchBlock success\n";
+        }
+      } else {
+        if (DEBUGME) {
+          errs() << "loop exit fail\n";
+        }
+        return -1;
+      }
+    }
+
+    // Handle side entry to exit path.
+    exitBlks.clear();
+    exitBlkSet.clear();
+    for (typename BlockTSmallerVector::iterator iterExiting =
+           exitingBlks.begin(),
+         iterExitingEnd = exitingBlks.end();
+         iterExiting != iterExitingEnd; ++iterExiting) {
+      BlockT *exitingBlk = *iterExiting;
+      BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+      BlockT *newExitBlk = exitBlk;
+
+      if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
+        newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
+        ++numCloned;
+      }
+
+      numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
+
+      exitBlks.push_back(newExitBlk);
+      exitBlkSet.insert(newExitBlk);
+    }
+
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      numSerial += serialPatternMatch(exitBlk);
+    }
+
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      if (exitBlk->pred_size() > 1) {
+        if (exitBlk != exitLandBlk) {
+          return -1;
+        }
+      } else {
+        if (exitBlk != exitLandBlk &&
+            (exitBlk->succ_size() != 1 ||
+            *exitBlk->succ_begin() != exitLandBlk)) {
+          return -1;
+        }
+      }
+    }
+  } // else
+
+  exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
+
+  // Fold break into the breaking block. Leverage across level breaks.
+  assert(exitingBlks.size() == exitBlks.size());
+  for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
+       iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
+       iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
+    BlockT *exitBlk = *iterExit;
+    BlockT *exitingBlk = *iterExiting;
+    assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
+    LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
+    handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
+  }
+
+  int numBreak = static_cast<int>(exitingBlks.size());
+  numLoopbreakPatternMatch += numBreak;
+  numClonedBlock += numCloned;
+  return numBreak + numSerial + numCloned;
+} //loopbreakPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
+                                                 BlockT *loopHeader) {
+  int numCont = 0;
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
+  for (typename InvBlockGTraits::ChildIteratorType iter =
+       InvBlockGTraits::child_begin(loopHeader),
+       iterEnd = InvBlockGTraits::child_end(loopHeader);
+       iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    if (loopRep->contains(curBlk)) {
+      handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
+                          loopHeader, loopRep);
+      contBlk.push_back(curBlk);
+      ++numCont;
+    }
+  }
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
+       iter = contBlk.begin(), iterEnd = contBlk.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->removeSuccessor(loopHeader);
+  }
+
+  numLoopcontPatternMatch += numCont;
+
+  return numCont;
+} //loopcontPatternMatch
+
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
+                                                         BlockT *src2Blk) {
+  // return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in the
+  // same loop with LoopLandInfo without explicitly keeping track of
+  // loopContBlks and loopBreakBlks, this is a method to get the information.
+  //
+  if (src1Blk->succ_size() == 0) {
+    LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
+    if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
+      LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+      if (theEntry != NULL) {
+        if (DEBUGME) {
+          errs() << "isLoopContBreakBlock yes src1 = BB"
+                 << src1Blk->getNumber()
+                 << " src2 = BB" << src2Blk->getNumber() << "\n";
+        }
+        return true;
+      }
+    }
+  }
+  return false;
+}  //isSameloopDetachedContbreak
+
+template<class PassT>
+int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
+                                             BlockT *trueBlk,
+                                             BlockT *falseBlk) {
+  int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
+  if (num == 0) {
+    if (DEBUGME) {
+      errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
+    }
+    num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
+  }
+  return num;
+}
+
+template<class PassT>
+int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
+                                                BlockT *trueBlk,
+                                                BlockT *falseBlk) {
+  int num = 0;
+  BlockT *downBlk;
+
+  //trueBlk could be the common post dominator
+  downBlk = trueBlk;
+
+  if (DEBUGME) {
+    errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
+           << " true = BB" << trueBlk->getNumber()
+           << ", numSucc=" << trueBlk->succ_size()
+           << " false = BB" << falseBlk->getNumber() << "\n";
+  }
+
+  while (downBlk) {
+    if (DEBUGME) {
+      errs() << "check down = BB" << downBlk->getNumber();
+    }
+
+    if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
+      if (DEBUGME) {
+        errs() << " working\n";
+      }
+
+      num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
+      num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
+
+      numClonedBlock += num;
+      num += serialPatternMatch(*headBlk->succ_begin());
+      num += serialPatternMatch(*(++headBlk->succ_begin()));
+      num += ifPatternMatch(headBlk);
+      assert(num > 0);
+
+      break;
+    }
+    if (DEBUGME) {
+      errs() << " not working\n";
+    }
+    downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
+  } // walk down the postDomTree
+
+  return num;
+} //handleJumpintoIf
+
+template<class PassT>
+void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
+                                                         BlockT *trueBlk,
+                                                         BlockT *falseBlk,
+                                                         BlockT *landBlk,
+                                                         bool detail) {
+  errs() << "head = BB" << headBlk->getNumber()
+         << " size = " << headBlk->size();
+  if (detail) {
+    errs() << "\n";
+    headBlk->print(errs());
+    errs() << "\n";
+  }
+
+  if (trueBlk) {
+    errs() << ", true = BB" << trueBlk->getNumber() << " size = "
+           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      trueBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (falseBlk) {
+    errs() << ", false = BB" << falseBlk->getNumber() << " size = "
+           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      falseBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (landBlk) {
+    errs() << ", land = BB" << landBlk->getNumber() << " size = "
+           << landBlk->size() << " numPred = " << landBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      landBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+
+    errs() << "\n";
+} //showImproveSimpleJumpintoIf
+
+template<class PassT>
+int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
+                                                    BlockT *trueBlk,
+                                                    BlockT *falseBlk,
+                                                    BlockT **plandBlk) {
+  bool migrateTrue = false;
+  bool migrateFalse = false;
+
+  BlockT *landBlk = *plandBlk;
+
+  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
+         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
+
+  if (trueBlk == falseBlk) {
+    return 0;
+  }
+
+  migrateTrue = needMigrateBlock(trueBlk);
+  migrateFalse = needMigrateBlock(falseBlk);
+
+  if (!migrateTrue && !migrateFalse) {
+    return 0;
+  }
+
+  // If we need to migrate either trueBlk and falseBlk, migrate the rest that
+  // have more than one predecessors.  without doing this, its predecessor
+  // rather than headBlk will have undefined value in initReg.
+  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
+    migrateTrue = true;
+  }
+  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
+    migrateFalse = true;
+  }
+
+  if (DEBUGME) {
+    errs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+  }
+
+  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+  //
+  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+  //      {initReg = 0; org falseBlk branch }
+  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+  //      => org landBlk
+  //      if landBlk->pred_size() > 2, put the about if-else inside
+  //      if (initReg !=2) {...}
+  //
+  // add initReg = initVal to headBlk
+
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+  unsigned initReg =
+    funcRep->getRegInfo().createVirtualRegister(I32RC);
+  if (!migrateTrue || !migrateFalse) {
+    int initVal = migrateTrue ? 0 : 1;
+    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
+  }
+
+  int numNewBlk = 0;
+
+  if (landBlk == NULL) {
+    landBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(landBlk);  //insert to function
+
+    if (trueBlk) {
+      trueBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    if (falseBlk) {
+      falseBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    numNewBlk ++;
+  }
+
+  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
+
+  //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
+  typename BlockT::iterator insertPos =
+    CFGTraits::getInstrPos
+    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep));
+
+  if (landBlkHasOtherPred) {
+    unsigned immReg =
+      funcRep->getRegInfo().createVirtualRegister(I32RC);
+    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
+    unsigned cmpResReg =
+      funcRep->getRegInfo().createVirtualRegister(I32RC);
+
+    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
+                                        initReg, immReg);
+    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
+                                      AMDGPU::IF_PREDICATE_SET, passRep,
+                                      cmpResReg, DebugLoc());
+  }
+
+  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET,
+                                    passRep, initReg, DebugLoc());
+
+  if (migrateTrue) {
+    migrateInstruction(trueBlk, landBlk, insertPos);
+    // need to uncondionally insert the assignment to ensure a path from its
+    // predecessor rather than headBlk has valid value in initReg if
+    // (initVal != 1).
+    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
+  }
+  CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep);
+
+  if (migrateFalse) {
+    migrateInstruction(falseBlk, landBlk, insertPos);
+    // need to uncondionally insert the assignment to ensure a path from its
+    // predecessor rather than headBlk has valid value in initReg if
+    // (initVal != 0)
+    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
+  }
+
+  if (landBlkHasOtherPred) {
+    // add endif
+    CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
+
+    // put initReg = 2 to other predecessors of landBlk
+    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
+         ++predIter) {
+      BlockT *curBlk = *predIter;
+      if (curBlk != trueBlk && curBlk != falseBlk) {
+        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
+      }
+    } //for
+  }
+  if (DEBUGME) {
+    errs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+  }
+
+  // update landBlk
+  *plandBlk = landBlk;
+
+  return numNewBlk;
+} //improveSimpleJumpintoIf
+
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
+                                              LoopT *exitingLoop,
+                                             BlockT *exitBlk,
+                                              LoopT *exitLoop,
+                                             BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
+           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
+  }
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+
+  RegiT initReg = INVALIDREGNUM;
+  if (exitingLoop != exitLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+    assert(initReg != INVALIDREGNUM);
+    addLoopBreakInitReg(exitLoop, initReg);
+    while (exitingLoop != exitLoop && exitingLoop) {
+      addLoopBreakOnReg(exitingLoop, initReg);
+      exitingLoop = exitingLoop->getParentLoop();
+    }
+    assert(exitingLoop == exitLoop);
+  }
+
+  mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
+
+} //handleLoopbreak
+
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
+                                                  LoopT *contingLoop,
+                                                 BlockT *contBlk,
+                                                  LoopT *contLoop) {
+  if (DEBUGME) {
+    errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
+           << " header = BB" << contBlk->getNumber() << "\n";
+
+    errs() << "Trying to continue loop-depth = "
+           << getLoopDepth(contLoop)
+           << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
+  }
+
+  RegiT initReg = INVALIDREGNUM;
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+  if (contingLoop != contLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+    assert(initReg != INVALIDREGNUM);
+    addLoopContInitReg(contLoop, initReg);
+    while (contingLoop && contingLoop->getParentLoop() != contLoop) {
+      addLoopBreakOnReg(contingLoop, initReg);  //not addLoopContOnReg
+      contingLoop = contingLoop->getParentLoop();
+    }
+    assert(contingLoop && contingLoop->getParentLoop() == contLoop);
+    addLoopContOnReg(contingLoop, initReg);
+  }
+
+  settleLoopcontBlock(contingBlk, contBlk, initReg);
+} //handleLoopcontBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
+  if (DEBUGME) {
+    errs() << "serialPattern BB" << dstBlk->getNumber()
+           << " <= BB" << srcBlk->getNumber() << "\n";
+  }
+  dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end());
+
+  dstBlk->removeSuccessor(srcBlk);
+  CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
+
+  removeSuccessor(srcBlk);
+  retireBlock(dstBlk, srcBlk);
+} //mergeSerialBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
+                                                  BlockT *curBlk,
+                                                  BlockT *trueBlk,
+                                                  BlockT *falseBlk,
+                                                  BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "ifPattern BB" << curBlk->getNumber();
+    errs() << "{  ";
+    if (trueBlk) {
+      errs() << "BB" << trueBlk->getNumber();
+    }
+    errs() << "  } else ";
+    errs() << "{  ";
+    if (falseBlk) {
+      errs() << "BB" << falseBlk->getNumber();
+    }
+    errs() << "  }\n ";
+    errs() << "landBlock: ";
+    if (landBlk == NULL) {
+      errs() << "NULL";
+    } else {
+      errs() << "BB" << landBlk->getNumber();
+    }
+    errs() << "\n";
+  }
+
+  int oldOpcode = branchInstr->getOpcode();
+  DebugLoc branchDL = branchInstr->getDebugLoc();
+
+//    transform to
+//    if cond
+//       trueBlk
+//    else
+//       falseBlk
+//    endif
+//    landBlk
+
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(curBlk, branchInstr);
+  CFGTraits::insertCondBranchBefore(branchInstrPos,
+                                    CFGTraits::getBranchNzeroOpcode(oldOpcode),
+                                    passRep,
+                                    branchDL);
+
+  if (trueBlk) {
+    curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end());
+    curBlk->removeSuccessor(trueBlk);
+    if (landBlk && trueBlk->succ_size()!=0) {
+      trueBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, trueBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep);
+
+  if (falseBlk) {
+    curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(),
+                   falseBlk->end());
+    curBlk->removeSuccessor(falseBlk);
+    if (landBlk && falseBlk->succ_size() != 0) {
+      falseBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, falseBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep);
+
+  branchInstr->eraseFromParent();
+
+  if (landBlk && trueBlk && falseBlk) {
+    curBlk->addSuccessor(landBlk);
+  }
+
+} //mergeIfthenelseBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
+                                                LoopLandInfo *loopLand) {
+  BlockT *landBlk = loopLand->landBlk;
+
+  if (DEBUGME) {
+    errs() << "loopPattern header = BB" << dstBlk->getNumber()
+           << " land = BB" << landBlk->getNumber() << "\n";
+  }
+
+  // Loop contInitRegs are init at the beginning of the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->contInitRegs.begin(),
+       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* we last inserterd the DebugLoc in the
+   * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk.
+   * search for the DebugLoc in the that statement.
+   * if not found, we have to insert the empty/default DebugLoc */
+  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
+  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak);
+  // Loop breakInitRegs are init before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakInitRegs.begin(),
+       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+  // Loop endbranchInitRegs are init before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->endbranchInitRegs.begin(),
+       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* we last inserterd the DebugLoc in the continue statement in the current dstBlk
+   * search for the DebugLoc in the continue statement.
+   * if not found, we have to insert the empty/default DebugLoc */
+  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
+  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue);
+  // Loop breakOnRegs are check after the ENDLOOP: break the loop outside this
+  // loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakOnRegs.begin(),
+       iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep,
+                                   *iter);
+  }
+
+  // Loop contOnRegs are check after the ENDLOOP: cont the loop outside this
+  // loop.
+  for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
+       iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32,
+                                   passRep, *iter);
+  }
+
+  dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
+
+  for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
+       iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
+    dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of.
+  }
+
+  removeSuccessor(landBlk);
+  retireBlock(dstBlk, landBlk);
+} //mergeLooplandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::reversePredicateSetter(typename BlockT::iterator I) {
+  while (I--) {
+    if (I->getOpcode() == AMDGPU::PRED_X) {
+      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
+      case OPCODE_IS_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT);
+        return;
+      case OPCODE_IS_NOT_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT);
+        return;
+      case OPCODE_IS_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO);
+        return;
+      case OPCODE_IS_NOT_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO);
+        return;
+      default:
+        assert(0 && "PRED_X Opcode invalid!");
+      }
+    }
+  }
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
+                                                 BlockT *exitBlk,
+                                                 BlockT *exitLandBlk,
+                                                 RegiT  setReg) {
+  if (DEBUGME) {
+    errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
+           << " exit = BB" << exitBlk->getNumber()
+           << " land = BB" << exitLandBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
+  assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+
+  DebugLoc DL = branchInstr->getDebugLoc();
+
+  BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+
+  //    transform exitingBlk to
+  //    if ( ) {
+  //       exitBlk (if exitBlk != exitLandBlk)
+  //       setReg = 1
+  //       break
+  //    }endif
+  //    successor = {orgSuccessor(exitingBlk) - exitBlk}
+
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(exitingBlk, branchInstr);
+
+  if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
+    //break_logical
+
+    if (trueBranch != exitBlk) {
+      reversePredicateSetter(branchInstrPos);
+    }
+    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
+  } else {
+    if (trueBranch != exitBlk) {
+      reversePredicateSetter(branchInstr);
+    }
+    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
+    if (exitBlk != exitLandBlk) {
+      //splice is insert-before ...
+      exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
+                         exitBlk->end());
+    }
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+    }
+    CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep);
+  } //if_logical
+
+  //now branchInst can be erase safely
+  branchInstr->eraseFromParent();
+
+  //now take care of successors, retire blocks
+  exitingBlk->removeSuccessor(exitBlk);
+  if (exitBlk != exitLandBlk) {
+    //splice is insert-before ...
+    exitBlk->removeSuccessor(exitLandBlk);
+    retireBlock(exitingBlk, exitBlk);
+  }
+
+} //mergeLoopbreakBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
+                                                 BlockT *contBlk,
+                                                 RegiT   setReg) {
+  if (DEBUGME) {
+    errs() << "settleLoopcontBlock conting = BB"
+           << contingBlk->getNumber()
+           << ", cont = BB" << contBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
+  if (branchInstr) {
+    assert(CFGTraits::isCondBranch(branchInstr));
+    typename BlockT::iterator branchInstrPos =
+      CFGTraits::getInstrPos(contingBlk, branchInstr);
+    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+    int oldOpcode = branchInstr->getOpcode();
+    DebugLoc DL = branchInstr->getDebugLoc();
+
+    //    transform contingBlk to
+    //     if () {
+    //          move instr after branchInstr
+    //          continue
+    //        or
+    //          setReg = 1
+    //          break
+    //     }endif
+    //     successor = {orgSuccessor(contingBlk) - loopHeader}
+
+    bool useContinueLogical = 
+      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
+
+    if (useContinueLogical == false) {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
+                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+
+      if (setReg != INVALIDREGNUM) {
+        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL);
+      } else {
+        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL);
+      }
+
+      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL);
+    } else {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
+                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+    }
+
+    branchInstr->eraseFromParent();
+  } else {
+    // if we've arrived here then we've already erased the branch instruction
+    // travel back up the basic block to see the last reference of our debug location
+    // we've just inserted that reference here so it should be representative
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
+      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
+    } else {
+      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
+    }
+  } //else
+
+} //settleLoopcontBlock
+
+// BBs in exitBlkSet are determined as in break-path for loopRep,
+// before we can put code for BBs as inside loop-body for loopRep
+// check whether those BBs are determined as cont-BB for parentLoopRep
+// earlier.
+// If so, generate a new BB newBlk
+//    (1) set newBlk common successor of BBs in exitBlkSet
+//    (2) change the continue-instr in BBs in exitBlkSet to break-instr
+//    (3) generate continue-instr in newBlk
+//
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
+                                              LoopT *loopRep,
+                                              std::set<BlockT *> &exitBlkSet,
+                                              BlockT *exitLandBlk) {
+  std::set<BlockT *> endBlkSet;
+
+
+
+  for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
+       iterEnd = exitBlkSet.end();
+       iter != iterEnd; ++iter) {
+    BlockT *exitBlk = *iter;
+    BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
+
+    if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
+      return NULL;
+
+    endBlkSet.insert(endBlk);
+  }
+
+  BlockT *newBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(newBlk);  //insert to function
+  CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep);
+  SHOWNEWBLK(newBlk, "New continue block: ");
+
+  for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
+       iterEnd = endBlkSet.end();
+       iter != iterEnd; ++iter) {
+      BlockT *endBlk = *iter;
+      InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
+      if (contInstr) {
+        contInstr->eraseFromParent();
+      }
+      endBlk->addSuccessor(newBlk);
+      if (DEBUGME) {
+        errs() << "Add new continue Block to BB"
+               << endBlk->getNumber() << " successors\n";
+      }
+  }
+
+  return newBlk;
+} //relocateLoopcontBlock
+
+
+// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as
+// LoopLandBlock. This BB branch on the loop endBranchInit register to the
+// pathes corresponding to the loop exiting branches.
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
+                                              BlockTSmallerVector &exitingBlks,
+                                              BlockTSmallerVector &exitBlks) {
+  const AMDGPUInstrInfo *tii =
+             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+
+  RegiT endBranchReg = static_cast<int>
+    (funcRep->getRegInfo().createVirtualRegister(I32RC));
+  assert(endBranchReg >= 0);
+
+  // reg = 0 before entering the loop
+  addLoopEndbranchInitReg(loopRep, endBranchReg);
+
+  uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
+  assert(numBlks >=2 && numBlks == exitBlks.size());
+
+  BlockT *preExitingBlk = exitingBlks[0];
+  BlockT *preExitBlk = exitBlks[0];
+  BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(preBranchBlk);  //insert to function
+  SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
+
+  BlockT *newLandBlk = preBranchBlk;
+
+      CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
+        newLandBlk);
+  preExitingBlk->removeSuccessor(preExitBlk);
+  preExitingBlk->addSuccessor(newLandBlk);
+
+  //it is redundant to add reg = 0 to exitingBlks[0]
+
+  // For 1..n th exiting path (the last iteration handles two pathes) create the
+  // branch to the previous path and the current path.
+  for (uint32_t i = 1; i < numBlks; ++i) {
+    BlockT *curExitingBlk = exitingBlks[i];
+    BlockT *curExitBlk = exitBlks[i];
+    BlockT *curBranchBlk;
+
+    if (i == numBlks - 1) {
+      curBranchBlk = curExitBlk;
+    } else {
+      curBranchBlk = funcRep->CreateMachineBasicBlock();
+      funcRep->push_back(curBranchBlk);  //insert to function
+      SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
+    }
+
+    // Add reg = i to exitingBlks[i].
+    CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
+                                       endBranchReg, i);
+
+    // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge
+    // (exitingBlks[i], newLandBlk).
+    CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
+                                          newLandBlk);
+    curExitingBlk->removeSuccessor(curExitBlk);
+    curExitingBlk->addSuccessor(newLandBlk);
+
+    // add to preBranchBlk the branch instruction:
+    // if (endBranchReg == preVal)
+    //    preExitBlk
+    // else
+    //    curBranchBlk
+    //
+    // preValReg = i - 1
+
+  DebugLoc DL;
+  RegiT preValReg = static_cast<int>
+    (funcRep->getRegInfo().createVirtualRegister(I32RC));
+
+  preBranchBlk->insert(preBranchBlk->begin(),
+                       tii->getMovImmInstr(preBranchBlk->getParent(), preValReg,
+                       i - 1));
+
+  // condResReg = (endBranchReg == preValReg)
+    RegiT condResReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+    BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg)
+      .addReg(endBranchReg).addReg(preValReg);
+
+    BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32))
+      .addMBB(preExitBlk).addReg(condResReg);
+
+    preBranchBlk->addSuccessor(preExitBlk);
+    preBranchBlk->addSuccessor(curBranchBlk);
+
+    // Update preExitingBlk, preExitBlk, preBranchBlk.
+    preExitingBlk = curExitingBlk;
+    preExitBlk = curExitBlk;
+    preBranchBlk = curBranchBlk;
+
+  }  //end for 1 .. n blocks
+
+  return newLandBlk;
+} //addLoopEndbranchBlock
+
+template<class PassT>
+typename CFGStructurizer<PassT>::PathToKind
+CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
+                                     bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return SinglePath_InPath;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == dstBlk) {
+      return SinglePath_InPath;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return Not_SinglePath;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size()==0) {
+    return SinglePath_NotInPath;
+  }
+
+  return Not_SinglePath;
+} //singlePathTo
+
+// If there is a single path from srcBlk to dstBlk, return the last block before
+// dstBlk If there is a single path from srcBlk->end without dstBlk, return the
+// last block in the path Otherwise, return NULL
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
+                                      bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return srcBlk;
+  }
+
+  if (srcBlk->succ_size() == 0) {
+    return srcBlk;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    BlockT *preBlk = srcBlk;
+
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == NULL) {
+      return preBlk;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return NULL;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size()==0) {
+    return srcBlk;
+  }
+
+  return NULL;
+
+} //singlePathEnd
+
+template<class PassT>
+int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
+                                               BlockT *dstBlk) {
+  int cloned = 0;
+  assert(preBlk->isSuccessor(srcBlk));
+  while (srcBlk && srcBlk != dstBlk) {
+    assert(srcBlk->succ_size() == 1);
+    if (srcBlk->pred_size() > 1) {
+      srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
+      ++cloned;
+    }
+
+    preBlk = srcBlk;
+    srcBlk = *srcBlk->succ_begin();
+  }
+
+  return cloned;
+} //cloneOnSideEntryTo
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
+                                                 BlockT *predBlk) {
+  assert(predBlk->isSuccessor(curBlk) &&
+         "succBlk is not a prececessor of curBlk");
+
+  BlockT *cloneBlk = CFGTraits::clone(curBlk);  //clone instructions
+  CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
+  //srcBlk, oldBlk, newBlk
+
+  predBlk->removeSuccessor(curBlk);
+  predBlk->addSuccessor(cloneBlk);
+
+  // add all successor to cloneBlk
+  CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
+
+  numClonedInstr += curBlk->size();
+
+  if (DEBUGME) {
+    errs() << "Cloned block: " << "BB"
+           << curBlk->getNumber() << "size " << curBlk->size() << "\n";
+  }
+
+  SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
+
+  return cloneBlk;
+} //cloneBlockForPredecessor
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
+                                               BlockT *exitingBlk) {
+  BlockT *exitBlk = NULL;
+
+  for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
+       iterSuccEnd = exitingBlk->succ_end();
+       iterSucc != iterSuccEnd; ++iterSucc) {
+    BlockT *curBlk = *iterSucc;
+    if (!loopRep->contains(curBlk)) {
+      assert(exitBlk == NULL);
+      exitBlk = curBlk;
+    }
+  }
+
+  assert(exitBlk != NULL);
+
+  return exitBlk;
+} //exitingBlock2ExitBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
+                                                BlockT *dstBlk,
+                                                InstrIterator insertPos) {
+  InstrIterator spliceEnd;
+  //look for the input branchinstr, not the AMDGPU branchinstr
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
+  if (branchInstr == NULL) {
+    if (DEBUGME) {
+      errs() << "migrateInstruction don't see branch instr\n" ;
+    }
+    spliceEnd = srcBlk->end();
+  } else {
+    if (DEBUGME) {
+      errs() << "migrateInstruction see branch instr\n" ;
+      branchInstr->dump();
+    }
+    spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);
+  }
+  if (DEBUGME) {
+    errs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
+      << "srcSize = " << srcBlk->size() << "\n";
+  }
+
+  //splice insert before insertPos
+  dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
+
+  if (DEBUGME) {
+    errs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
+      << "srcSize = " << srcBlk->size() << "\n";
+  }
+} //migrateInstruction
+
+// normalizeInfiniteLoopExit change
+//   B1:
+//        uncond_br LoopHeader
+//
+// to
+//   B1:
+//        cond_br 1 LoopHeader dummyExit
+// and return the newly added dummy exit block
+// 
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
+  BlockT *loopHeader;
+  BlockT *loopLatch;
+  loopHeader = LoopRep->getHeader();
+  loopLatch = LoopRep->getLoopLatch();
+  BlockT *dummyExitBlk = NULL;
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+  if (loopHeader!=NULL && loopLatch!=NULL) {
+    InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
+    if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) {
+      dummyExitBlk = funcRep->CreateMachineBasicBlock();
+      funcRep->push_back(dummyExitBlk);  //insert to function
+      SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+
+      if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n";
+
+      typename BlockT::iterator insertPos =
+        CFGTraits::getInstrPos(loopLatch, branchInstr);
+      unsigned immReg =
+        funcRep->getRegInfo().createVirtualRegister(I32RC);
+      CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
+      InstrT *newInstr = 
+        CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep);
+      MachineInstrBuilder MIB(*funcRep, newInstr);
+      MIB.addMBB(loopHeader);
+      MIB.addReg(immReg, false);
+
+      SHOWNEWINSTR(newInstr);
+
+      branchInstr->eraseFromParent();
+      loopLatch->addSuccessor(dummyExitBlk);
+    }
+  }
+
+  return dummyExitBlk;
+} //normalizeInfiniteLoopExit
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
+  InstrT *branchInstr;
+
+  // I saw two unconditional branch in one basic block in example
+  // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
+  while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
+          && CFGTraits::isUncondBranch(branchInstr)) {
+    if (DEBUGME) {
+          errs() << "Removing unconditional branch instruction" ;
+      branchInstr->dump();
+    }
+    branchInstr->eraseFromParent();
+  }
+} //removeUnconditionalBranch
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
+  if (srcBlk->succ_size() == 2) {
+    BlockT *blk1 = *srcBlk->succ_begin();
+    BlockT *blk2 = *(++srcBlk->succ_begin());
+
+    if (blk1 == blk2) {
+      InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
+      assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+      if (DEBUGME) {
+        errs() << "Removing unneeded conditional branch instruction" ;
+        branchInstr->dump();
+      }
+      branchInstr->eraseFromParent();
+      SHOWNEWBLK(blk1, "Removing redundant successor");
+      srcBlk->removeSuccessor(blk1);
+    }
+  }
+} //removeRedundantConditionalBranch
+
+template<class PassT>
+void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
+                                               DEFAULT_VEC_SLOTS> &retBlks) {
+  BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(dummyExitBlk);  //insert to function
+  CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep);
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
+         retBlks.begin(),
+       iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
+    if (curInstr) {
+      curInstr->eraseFromParent();
+    }
+    curBlk->addSuccessor(dummyExitBlk);
+    if (DEBUGME) {
+      errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
+             << " successors\n";
+    }
+  } //for
+
+  SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
+} //addDummyExitBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
+  while (srcBlk->succ_size()) {
+    srcBlk->removeSuccessor(*srcBlk->succ_begin());
+  }
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
+  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+
+  if (srcBlkInfo == NULL) {
+    srcBlkInfo = new BlockInfo();
+  }
+
+  srcBlkInfo->sccNum = sccNum;
+}
+
+template<class PassT>
+int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
+  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
+  return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
+  if (DEBUGME) {
+        errs() << "Retiring BB" << srcBlk->getNumber() << "\n";
+  }
+
+  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+
+  if (srcBlkInfo == NULL) {
+    srcBlkInfo = new BlockInfo();
+  }
+
+  srcBlkInfo->isRetired = true;
+  assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
+         && "can't retire block yet");
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
+  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
+  return (srcBlkInfo && srcBlkInfo->isRetired);
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  while (loopRep && loopRep->getHeader() == curBlk) {
+    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
+
+    if(loopLand == NULL)
+      return true;
+
+    BlockT *landBlk = loopLand->landBlk;
+    assert(landBlk);
+    if (!isRetiredBlock(landBlk)) {
+      return true;
+    }
+
+    loopRep = loopRep->getParentLoop();
+  }
+
+  return false;
+} //isActiveLoophead
+
+template<class PassT>
+bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
+  const unsigned blockSizeThreshold = 30;
+  const unsigned cloneInstrThreshold = 100;
+
+  bool multiplePreds = blk && (blk->pred_size() > 1);
+
+  if(!multiplePreds)
+    return false;
+
+  unsigned blkSize = blk->size();
+  return ((blkSize > blockSizeThreshold)
+          && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
+} //needMigrateBlock
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
+                                            BlockTSmallerVector &exitBlks,
+                                            std::set<BlockT *> &exitBlkSet) {
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks;  //in exit path blocks
+
+  for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+       predIterEnd = landBlk->pred_end();
+       predIter != predIterEnd; ++predIter) {
+    BlockT *curBlk = *predIter;
+    if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
+      inpathBlks.push_back(curBlk);
+    }
+  } //for
+
+  //if landBlk has predecessors that are not in the given loop,
+  //create a new block
+  BlockT *newLandBlk = landBlk;
+  if (inpathBlks.size() != landBlk->pred_size()) {
+    newLandBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(newLandBlk);  //insert to function
+    newLandBlk->addSuccessor(landBlk);
+    for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter =
+         inpathBlks.begin(),
+         iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
+      BlockT *curBlk = *iter;
+      CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
+      //srcBlk, oldBlk, newBlk
+      curBlk->removeSuccessor(landBlk);
+      curBlk->addSuccessor(newLandBlk);
+    }
+    for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
+      if (exitBlks[i] == landBlk) {
+        exitBlks[i] = newLandBlk;
+      }
+    }
+    SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
+  }
+
+  setLoopLandBlock(loopRep, newLandBlk);
+
+  return newLandBlk;
+} // recordLoopbreakLand
+
+template<class PassT>
+void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  assert(theEntry->landBlk == NULL);
+
+  if (blk == NULL) {
+    blk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(blk);  //insert to function
+    SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
+  }
+
+  theEntry->landBlk = blk;
+
+  if (DEBUGME) {
+    errs() << "setLoopLandBlock loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  landing-block = BB" << blk->getNumber() << "\n";
+  }
+} // setLoopLandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+
+  theEntry->breakOnRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopBreakOnReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopBreakOnReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->contOnRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopContOnReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopContOnReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->breakInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopBreakInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopBreakInitReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->contInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopContInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopContInitReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
+                                                     RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->endbranchInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+        errs() << "addLoopEndbranchInitReg loop-header = BB"
+      << loopRep->getHeader()->getNumber()
+      << "  regNum = " << regNum << "\n";
+  }
+} // addLoopEndbranchInitReg
+
+template<class PassT>
+typename CFGStructurizer<PassT>::LoopLandInfo *
+CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  return theEntry;
+} // getLoopLandInfo
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  return theEntry ? theEntry->landBlk : NULL;
+} // getLoopLandBlock
+
+
+template<class PassT>
+bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  if (loopRep == NULL)
+    return false;
+
+  BlockT *loopHeader = loopRep->getHeader();
+
+  return curBlk->isSuccessor(loopHeader);
+
+} //hasBackEdge
+
+template<class PassT>
+unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
+  return loopRep ? loopRep->getLoopDepth() : 0;
+} //getLoopDepth
+
+template<class PassT>
+int CFGStructurizer<PassT>::countActiveBlock
+(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
+ typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
+  int count = 0;
+  while (iterStart != iterEnd) {
+    if (!isRetiredBlock(*iterStart)) {
+      ++count;
+    }
+    ++iterStart;
+  }
+
+  return count;
+} //countActiveBlock
+
+// This is work around solution for findNearestCommonDominator not avaiable to
+// post dom a proper fix should go to Dominators.h.
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT*
+CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
+
+  if (postDomTree->dominates(blk1, blk2)) {
+    return blk1;
+  }
+  if (postDomTree->dominates(blk2, blk1)) {
+    return blk2;
+  }
+
+  DomTreeNodeT *node1 = postDomTree->getNode(blk1);
+  DomTreeNodeT *node2 = postDomTree->getNode(blk2);
+
+  // Handle newly cloned node.
+  if (node1 == NULL && blk1->succ_size() == 1) {
+    return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
+  }
+  if (node2 == NULL && blk2->succ_size() == 1) {
+    return findNearestCommonPostDom(blk1, *blk2->succ_begin());
+  }
+
+  if (node1 == NULL || node2 == NULL) {
+    return NULL;
+  }
+
+  node1 = node1->getIDom();
+  while (node1) {
+    if (postDomTree->dominates(node1, node2)) {
+      return node1->getBlock();
+    }
+    node1 = node1->getIDom();
+  }
+
+  return NULL;
+}
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::findNearestCommonPostDom
+(typename std::set<BlockT *> &blks) {
+  BlockT *commonDom;
+  typename std::set<BlockT *>::const_iterator iter = blks.begin();
+  typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
+  for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
+    BlockT *curBlk = *iter;
+    if (curBlk != commonDom) {
+      commonDom = findNearestCommonPostDom(curBlk, commonDom);
+    }
+  }
+
+  if (DEBUGME) {
+    errs() << "Common post dominator for exit blocks is ";
+    if (commonDom) {
+          errs() << "BB" << commonDom->getNumber() << "\n";
+    } else {
+      errs() << "NULL\n";
+    }
+  }
+
+  return commonDom;
+} //findNearestCommonPostDom
+
+} //end namespace llvm
+
+//todo: move-end
+
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer for AMDGPU
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm {
+class AMDGPUCFGStructurizer : public MachineFunctionPass {
+public:
+  typedef MachineInstr              InstructionType;
+  typedef MachineFunction           FunctionType;
+  typedef MachineBasicBlock         BlockType;
+  typedef MachineLoopInfo           LoopinfoType;
+  typedef MachineDominatorTree      DominatortreeType;
+  typedef MachinePostDominatorTree  PostDominatortreeType;
+  typedef MachineDomTreeNode        DomTreeNodeType;
+  typedef MachineLoop               LoopType;
+
+protected:
+  TargetMachine &TM;
+  const TargetInstrInfo *TII;
+  const AMDGPURegisterInfo *TRI;
+
+public:
+  AMDGPUCFGStructurizer(char &pid, TargetMachine &tm);
+  const TargetInstrInfo *getTargetInstrInfo() const;
+
+private:
+
+};
+
+} //end of namespace llvm
+AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm)
+: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()),
+  TRI(static_cast<const AMDGPURegisterInfo *>(tm.getRegisterInfo())) {
+}
+
+const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
+  return TII;
+}
+//===----------------------------------------------------------------------===//
+//
+// CFGPrepare
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm {
+class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer {
+public:
+  static char ID;
+
+public:
+  AMDGPUCFGPrepare(TargetMachine &tm);
+
+  virtual const char *getPassName() const;
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};
+
+char AMDGPUCFGPrepare::ID = 0;
+} //end of namespace llvm
+
+AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm)
+  : AMDGPUCFGStructurizer(ID, tm )  {
+}
+const char *AMDGPUCFGPrepare::getPassName() const {
+  return "AMD IL Control Flow Graph Preparation Pass";
+}
+
+void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineFunctionAnalysis>();
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// CFGPerform
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm {
+class AMDGPUCFGPerform : public AMDGPUCFGStructurizer {
+public:
+  static char ID;
+
+public:
+  AMDGPUCFGPerform(TargetMachine &tm);
+  virtual const char *getPassName() const;
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};
+
+char AMDGPUCFGPerform::ID = 0;
+} //end of namespace llvm
+
+  AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm)
+: AMDGPUCFGStructurizer(ID, tm) {
+}
+
+const char *AMDGPUCFGPerform::getPassName() const {
+  return "AMD IL Control Flow Graph structurizer Pass";
+}
+
+void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineFunctionAnalysis>();
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructTraits<AMDGPUCFGStructurizer>
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct {
+// this class is tailor to the AMDGPU backend
+template<>
+struct CFGStructTraits<AMDGPUCFGStructurizer> {
+  typedef int RegiT;
+
+  static int getBranchNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+    case AMDGPU::JUMP_COND:
+    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+    case AMDGPU::BRANCH_COND_i32:
+    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
+    default:
+      assert(0 && "internal error");
+    }
+    return -1;
+  }
+
+  static int getBranchZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+    case AMDGPU::JUMP_COND:
+    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+    case AMDGPU::BRANCH_COND_i32:
+    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
+    default:
+      assert(0 && "internal error");
+    }
+    return -1;
+  }
+
+  static int getContinueNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+    case AMDGPU::JUMP_COND:
+    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getContinueZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+    case AMDGPU::JUMP_COND:
+    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+    default:
+      assert(0 && "internal error");
+    }
+    return -1;
+  }
+
+  static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
+    return instr->getOperand(0).getMBB();
+  }
+
+  static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
+    instr->getOperand(0).setMBB(blk);
+  }
+
+  static MachineBasicBlock *
+  getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
+    assert(blk->succ_size() == 2);
+    MachineBasicBlock *trueBranch = getTrueBranch(instr);
+    MachineBasicBlock::succ_iterator iter = blk->succ_begin();
+    MachineBasicBlock::succ_iterator iterNext = iter;
+    ++iterNext;
+
+    return (*iter == trueBranch) ? *iterNext : *iter;
+  }
+
+  static bool isCondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+      case AMDGPU::JUMP_COND:
+      case AMDGPU::BRANCH_COND_i32:
+      case AMDGPU::BRANCH_COND_f32:
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static bool isUncondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+    case AMDGPU::JUMP:
+    case AMDGPU::BRANCH:
+      return true;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
+    //get DebugLoc from the first MachineBasicBlock instruction with debug info
+    DebugLoc DL;
+    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getDebugLoc().isUnknown() == false) {
+        DL = instr->getDebugLoc();
+      }
+    }
+    return DL;
+  }
+
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    MachineInstr *instr = &*iter;
+    if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
+      return instr;
+    }
+    return NULL;
+  }
+
+  // The correct naming for this is getPossibleLoopendBlockBranchInstr.
+  //
+  // BB with backward-edge could have move instructions after the branch
+  // instruction.  Such move instruction "belong to" the loop backward-edge.
+  //
+  static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
+    const AMDGPUInstrInfo * TII = static_cast<const AMDGPUInstrInfo *>(
+                                  blk->getParent()->getTarget().getInstrInfo());
+
+    for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
+         iterEnd = blk->rend(); iter != iterEnd; ++iter) {
+      // FIXME: Simplify
+      MachineInstr *instr = &*iter;
+      if (instr) {
+        if (isCondBranch(instr) || isUncondBranch(instr)) {
+          return instr;
+        } else if (!TII->isMov(instr->getOpcode())) {
+          break;
+        }
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    if (iter != blk->rend()) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDGPU::RETURN) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    if (iter != blk->rend()) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDGPU::CONTINUE) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
+    for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static bool isReturnBlock(MachineBasicBlock *blk) {
+    MachineInstr *instr = getReturnInstr(blk);
+    bool isReturn = (blk->succ_size() == 0);
+    if (instr) {
+      assert(isReturn);
+    } else if (isReturn) {
+      if (DEBUGME) {
+        errs() << "BB" << blk->getNumber()
+               <<" is return block without RETURN instr\n";
+      }
+    }
+
+    return  isReturn;
+  }
+
+  static MachineBasicBlock::iterator
+  getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
+    assert(instr->getParent() == blk && "instruction doesn't belong to block");
+    MachineBasicBlock::iterator iter = blk->begin();
+    MachineBasicBlock::iterator iterEnd = blk->end();
+    while (&(*iter) != instr && iter != iterEnd) {
+      ++iter;
+    }
+
+    assert(iter != iterEnd);
+    return iter;
+  }//getInstrPos
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
+                                         AMDGPUCFGStructurizer *passRep) {
+    return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
+  } //insertInstrBefore
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
+                                         AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    MachineBasicBlock::iterator res;
+    if (blk->begin() != blk->end()) {
+      blk->insert(blk->begin(), newInstr);
+    } else {
+      blk->push_back(newInstr);
+    }
+
+    SHOWNEWINSTR(newInstr);
+
+    return newInstr;
+  } //insertInstrBefore
+
+  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
+                             AMDGPUCFGStructurizer *passRep) {
+    insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
+  } //insertInstrEnd
+
+  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
+                             AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+   MachineInstr *newInstr = blk->getParent()
+      ->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    blk->push_back(newInstr);
+    //assume the instruction doesn't take any reg operand ...
+
+    SHOWNEWINSTR(newInstr);
+  } //insertInstrEnd
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
+                                         int newOpcode, 
+                                         AMDGPUCFGStructurizer *passRep) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
+                                           DebugLoc());
+
+    blk->insert(instrPos, newInstr);
+    //assume the instruction doesn't take any reg operand ...
+
+    SHOWNEWINSTR(newInstr);
+    return newInstr;
+  } //insertInstrBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
+                                     int newOpcode,
+                                     AMDGPUCFGStructurizer *passRep,
+                                     DebugLoc DL) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineFunction *MF = blk->getParent();
+    MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    blk->insert(instrPos, newInstr);
+    MachineInstrBuilder MIB(*MF, newInstr);
+    MIB.addReg(oldInstr->getOperand(1).getReg(), false);
+
+    SHOWNEWINSTR(newInstr);
+    //erase later oldInstr->eraseFromParent();
+  } //insertCondBranchBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock *blk,
+                                     MachineBasicBlock::iterator insertPos,
+                                     int newOpcode,
+                                     AMDGPUCFGStructurizer *passRep,
+                                     RegiT regNum,
+                                     DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineFunction *MF = blk->getParent();
+
+    MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    //insert before
+    blk->insert(insertPos, newInstr);
+    MachineInstrBuilder(*MF, newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchBefore
+
+  static void insertCondBranchEnd(MachineBasicBlock *blk,
+                                  int newOpcode,
+                                  AMDGPUCFGStructurizer *passRep,
+                                  RegiT regNum) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineFunction *MF = blk->getParent();
+    MachineInstr *newInstr =
+      MF->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
+
+    blk->push_back(newInstr);
+    MachineInstrBuilder(*MF, newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchEnd
+
+
+  static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
+                                      AMDGPUCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const AMDGPUInstrInfo *tii =
+             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
+                                                 regVal);
+    blk->insert(instrPos, newInstr);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertAssignInstrBefore
+
+  static void insertAssignInstrBefore(MachineBasicBlock *blk,
+                                      AMDGPUCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    const AMDGPUInstrInfo *tii =
+             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
+
+    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
+                                                 regVal);
+    if (blk->begin() != blk->end()) {
+      blk->insert(blk->begin(), newInstr);
+    } else {
+      blk->push_back(newInstr);
+    }
+
+    SHOWNEWINSTR(newInstr);
+
+  } //insertInstrBefore
+
+  static void insertCompareInstrBefore(MachineBasicBlock *blk,
+                                       MachineBasicBlock::iterator instrPos,
+                                       AMDGPUCFGStructurizer *passRep,
+                                       RegiT dstReg, RegiT src1Reg,
+                                       RegiT src2Reg) {
+    const AMDGPUInstrInfo *tii =
+             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
+    MachineFunction *MF = blk->getParent();
+    MachineInstr *newInstr =
+      MF->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc());
+
+    MachineInstrBuilder MIB(*MF, newInstr);
+    MIB.addReg(dstReg, RegState::Define); //set target
+    MIB.addReg(src1Reg); //set src value
+    MIB.addReg(src2Reg); //set src value
+
+    blk->insert(instrPos, newInstr);
+    SHOWNEWINSTR(newInstr);
+
+  } //insertCompareInstrBefore
+
+  static void cloneSuccessorList(MachineBasicBlock *dstBlk,
+                                 MachineBasicBlock *srcBlk) {
+    for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
+         iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
+      dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of
+    }
+  } //cloneSuccessorList
+
+  static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
+    MachineFunction *func = srcBlk->getParent();
+    MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
+    func->push_back(newBlk);  //insert to function
+    for (MachineBasicBlock::iterator iter = srcBlk->begin(),
+         iterEnd = srcBlk->end();
+         iter != iterEnd; ++iter) {
+      MachineInstr *instr = func->CloneMachineInstr(iter);
+      newBlk->push_back(instr);
+    }
+    return newBlk;
+  }
+
+  //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose because
+  //the AMDGPU instruction is not recognized as terminator fix this and retire
+  //this routine
+  static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
+                                         MachineBasicBlock *oldBlk,
+                                         MachineBasicBlock *newBlk) {
+    MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
+    if (branchInstr && isCondBranch(branchInstr) &&
+        getTrueBranch(branchInstr) == oldBlk) {
+      setTrueBranch(branchInstr, newBlk);
+    }
+  }
+
+  static void wrapup(MachineBasicBlock *entryBlk) {
+    assert((!entryBlk->getParent()->getJumpTableInfo()
+            || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
+           && "found a jump table");
+
+     //collect continue right before endloop
+     SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
+     MachineBasicBlock::iterator pre = entryBlk->begin();
+     MachineBasicBlock::iterator iterEnd = entryBlk->end();
+     MachineBasicBlock::iterator iter = pre;
+     while (iter != iterEnd) {
+       if (pre->getOpcode() == AMDGPU::CONTINUE
+           && iter->getOpcode() == AMDGPU::ENDLOOP) {
+         contInstr.push_back(pre);
+       }
+       pre = iter;
+       ++iter;
+     } //end while
+
+     //delete continue right before endloop
+     for (unsigned i = 0; i < contInstr.size(); ++i) {
+        contInstr[i]->eraseFromParent();
+     }
+
+     // TODO to fix up jump table so later phase won't be confused.  if
+     // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
+     // there isn't such an interface yet.  alternatively, replace all the other
+     // blocks in the jump table with the entryBlk //}
+
+  } //wrapup
+
+  static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachineDominatorTree>();
+  }
+
+  static MachinePostDominatorTree*
+  getPostDominatorTree(AMDGPUCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachinePostDominatorTree>();
+  }
+
+  static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachineLoopInfo>();
+  }
+}; // template class CFGStructTraits
+} //end of namespace llvm
+
+// createAMDGPUCFGPreparationPass- Returns a pass
+FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm
+                                                 ) {
+  return new AMDGPUCFGPrepare(tm );
+}
+
+bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) {
+  return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func,
+                                                                        *this,
+                                                                        TRI);
+}
+
+// createAMDGPUCFGStructurizerPass- Returns a pass
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm
+                                                  ) {
+  return new AMDGPUCFGPerform(tm );
+}
+
+bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) {
+  return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().run(func,
+                                                                    *this,
+                                                                    TRI);
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDILDevice.cpp b/contrib/llvm/lib/Target/R600/AMDILDevice.cpp
new file mode 100644
index 000000000000..db8e01ea4043
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILDevice.cpp
@@ -0,0 +1,132 @@
+//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+#include "AMDILDevice.h"
+#include "AMDGPUSubtarget.h"
+
+using namespace llvm;
+// Default implementation for all of the classes.
+AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) {
+  mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
+  mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
+  setCaps();
+  DeviceFlag = OCL_DEVICE_ALL;
+}
+
+AMDGPUDevice::~AMDGPUDevice() {
+    mHWBits.clear();
+    mSWBits.clear();
+}
+
+size_t AMDGPUDevice::getMaxGDSSize() const {
+  return 0;
+}
+
+uint32_t 
+AMDGPUDevice::getDeviceFlag() const {
+  return DeviceFlag;
+}
+
+size_t AMDGPUDevice::getMaxNumCBs() const {
+  if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
+    return HW_MAX_NUM_CB;
+  }
+
+  return 0;
+}
+
+size_t AMDGPUDevice::getMaxCBSize() const {
+  if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
+    return MAX_CB_SIZE;
+  }
+
+  return 0;
+}
+
+size_t AMDGPUDevice::getMaxScratchSize() const {
+  return 65536;
+}
+
+uint32_t AMDGPUDevice::getStackAlignment() const {
+  return 16;
+}
+
+void AMDGPUDevice::setCaps() {
+  mSWBits.set(AMDGPUDeviceInfo::HalfOps);
+  mSWBits.set(AMDGPUDeviceInfo::ByteOps);
+  mSWBits.set(AMDGPUDeviceInfo::ShortOps);
+  mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
+  if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) {
+    mSWBits.set(AMDGPUDeviceInfo::NoInline);
+  }
+  if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) {
+    mSWBits.set(AMDGPUDeviceInfo::MacroDB);
+  }
+  if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
+    mSWBits.set(AMDGPUDeviceInfo::ConstantMem);
+  } else {
+    mHWBits.set(AMDGPUDeviceInfo::ConstantMem);
+  }
+  if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
+    mSWBits.set(AMDGPUDeviceInfo::PrivateMem);
+  } else {
+    mHWBits.set(AMDGPUDeviceInfo::PrivateMem);
+  }
+  if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) {
+    mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
+  }
+  mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
+  mSWBits.set(AMDGPUDeviceInfo::LongOps);
+}
+
+AMDGPUDeviceInfo::ExecutionMode
+AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const {
+  if (mHWBits[Caps]) {
+    assert(!mSWBits[Caps] && "Cannot set both SW and HW caps");
+    return AMDGPUDeviceInfo::Hardware;
+  }
+
+  if (mSWBits[Caps]) {
+    assert(!mHWBits[Caps] && "Cannot set both SW and HW caps");
+    return AMDGPUDeviceInfo::Software;
+  }
+
+  return AMDGPUDeviceInfo::Unsupported;
+
+}
+
+bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const {
+  return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported;
+}
+
+bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const {
+  return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware;
+}
+
+bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const {
+  return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software;
+}
+
+std::string
+AMDGPUDevice::getDataLayout() const {
+  std::string DataLayout = std::string(
+   "e"
+   "-p:32:32:32"
+   "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
+   "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128"
+   "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+   "-n32:64"
+  );
+
+  if (usesHardware(AMDGPUDeviceInfo::DoubleOps)) {
+    DataLayout.append("-f64:64:64");
+  }
+
+  return DataLayout;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDILDevice.h b/contrib/llvm/lib/Target/R600/AMDILDevice.h
new file mode 100644
index 000000000000..97df98cafb2a
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILDevice.h
@@ -0,0 +1,117 @@
+//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the subtarget data classes.
+//
+/// This file will define the interface that each generation needs to
+/// implement in order to correctly answer queries on the capabilities of the
+/// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef AMDILDEVICEIMPL_H
+#define AMDILDEVICEIMPL_H
+#include "AMDIL.h"
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+  class AMDGPUSubtarget;
+  class MCStreamer;
+//===----------------------------------------------------------------------===//
+// Interface for data that is specific to a single device
+//===----------------------------------------------------------------------===//
+class AMDGPUDevice {
+public:
+  AMDGPUDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPUDevice();
+
+  // Enum values for the various memory types.
+  enum {
+    RAW_UAV_ID   = 0,
+    ARENA_UAV_ID = 1,
+    LDS_ID       = 2,
+    GDS_ID       = 3,
+    SCRATCH_ID   = 4,
+    CONSTANT_ID  = 5,
+    GLOBAL_ID    = 6,
+    MAX_IDS      = 7
+  } IO_TYPE_IDS;
+
+  /// \returns The max LDS size that the hardware supports.  Size is in
+  /// bytes.
+  virtual size_t getMaxLDSSize() const = 0;
+
+  /// \returns The max GDS size that the hardware supports if the GDS is
+  /// supported by the hardware.  Size is in bytes.
+  virtual size_t getMaxGDSSize() const;
+
+  /// \returns The max number of hardware constant address spaces that
+  /// are supported by this device.
+  virtual size_t getMaxNumCBs() const;
+
+  /// \returns The max number of bytes a single hardware constant buffer
+  /// can support.  Size is in bytes.
+  virtual size_t getMaxCBSize() const;
+
+  /// \returns The max number of bytes allowed by the hardware scratch
+  /// buffer.  Size is in bytes.
+  virtual size_t getMaxScratchSize() const;
+
+  /// \brief Get the flag that corresponds to the device.
+  virtual uint32_t getDeviceFlag() const;
+
+  /// \returns The number of work-items that exist in a single hardware
+  /// wavefront.
+  virtual size_t getWavefrontSize() const = 0;
+
+  /// \brief Get the generational name of this specific device.
+  virtual uint32_t getGeneration() const = 0;
+
+  /// \brief Get the stack alignment of this specific device.
+  virtual uint32_t getStackAlignment() const;
+
+  /// \brief Get the resource ID for this specific device.
+  virtual uint32_t getResourceID(uint32_t DeviceID) const = 0;
+
+  /// \brief Get the max number of UAV's for this device.
+  virtual uint32_t getMaxNumUAVs() const = 0;
+
+
+  // API utilizing more detailed capabilities of each family of
+  // cards. If a capability is supported, then either usesHardware or
+  // usesSoftware returned true.  If usesHardware returned true, then
+  // usesSoftware must return false for the same capability.  Hardware
+  // execution means that the feature is done natively by the hardware
+  // and is not emulated by the softare.  Software execution means
+  // that the feature could be done in the hardware, but there is
+  // software that emulates it with possibly using the hardware for
+  // support since the hardware does not fully comply with OpenCL
+  // specs.
+
+  bool isSupported(AMDGPUDeviceInfo::Caps Mode) const;
+  bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const;
+  bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const;
+  virtual std::string getDataLayout() const;
+  static const unsigned int MAX_LDS_SIZE_700 = 16384;
+  static const unsigned int MAX_LDS_SIZE_800 = 32768;
+  static const unsigned int WavefrontSize = 64;
+  static const unsigned int HalfWavefrontSize = 32;
+  static const unsigned int QuarterWavefrontSize = 16;
+protected:
+  virtual void setCaps();
+  BitVector mHWBits;
+  llvm::BitVector mSWBits;
+  AMDGPUSubtarget *mSTM;
+  uint32_t DeviceFlag;
+private:
+  AMDGPUDeviceInfo::ExecutionMode
+  getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const;
+};
+
+} // namespace llvm
+#endif // AMDILDEVICEIMPL_H
diff --git a/contrib/llvm/lib/Target/R600/AMDILDeviceInfo.cpp b/contrib/llvm/lib/Target/R600/AMDILDeviceInfo.cpp
new file mode 100644
index 000000000000..9605fbe63340
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILDeviceInfo.cpp
@@ -0,0 +1,94 @@
+//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Function that creates DeviceInfo from a device name and other information.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILDevices.h"
+#include "AMDGPUSubtarget.h"
+
+using namespace llvm;
+namespace llvm {
+namespace AMDGPUDeviceInfo {
+
+AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
+                                AMDGPUSubtarget *ptr,
+                                bool is64bit, bool is64on32bit) {
+  if (deviceName.c_str()[2] == '7') {
+    switch (deviceName.c_str()[3]) {
+    case '1':
+      return new AMDGPU710Device(ptr);
+    case '7':
+      return new AMDGPU770Device(ptr);
+    default:
+      return new AMDGPU7XXDevice(ptr);
+    }
+  } else if (deviceName == "cypress") {
+#if DEBUG
+    assert(!is64bit && "This device does not support 64bit pointers!");
+    assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+    return new AMDGPUCypressDevice(ptr);
+  } else if (deviceName == "juniper") {
+#if DEBUG
+    assert(!is64bit && "This device does not support 64bit pointers!");
+    assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+    return new AMDGPUEvergreenDevice(ptr);
+  } else if (deviceName == "redwood") {
+#if DEBUG
+    assert(!is64bit && "This device does not support 64bit pointers!");
+    assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+    return new AMDGPURedwoodDevice(ptr);
+  } else if (deviceName == "cedar") {
+#if DEBUG
+    assert(!is64bit && "This device does not support 64bit pointers!");
+    assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+    return new AMDGPUCedarDevice(ptr);
+  } else if (deviceName == "barts" || deviceName == "turks") {
+#if DEBUG
+    assert(!is64bit && "This device does not support 64bit pointers!");
+    assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+    return new AMDGPUNIDevice(ptr);
+  } else if (deviceName == "cayman") {
+#if DEBUG
+    assert(!is64bit && "This device does not support 64bit pointers!");
+    assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+    return new AMDGPUCaymanDevice(ptr);
+  } else if (deviceName == "caicos") {
+#if DEBUG
+    assert(!is64bit && "This device does not support 64bit pointers!");
+    assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+    return new AMDGPUNIDevice(ptr);
+  } else if (deviceName == "SI") {
+    return new AMDGPUSIDevice(ptr);
+  } else {
+#if DEBUG
+    assert(!is64bit && "This device does not support 64bit pointers!");
+    assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+    return new AMDGPU7XXDevice(ptr);
+  }
+}
+} // End namespace AMDGPUDeviceInfo
+} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/R600/AMDILDeviceInfo.h b/contrib/llvm/lib/Target/R600/AMDILDeviceInfo.h
new file mode 100644
index 000000000000..4b2c3a53c79f
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILDeviceInfo.h
@@ -0,0 +1,88 @@
+//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+#ifndef AMDILDEVICEINFO_H
+#define AMDILDEVICEINFO_H
+
+
+#include <string>
+
+namespace llvm {
+  class AMDGPUDevice;
+  class AMDGPUSubtarget;
+  namespace AMDGPUDeviceInfo {
+    /// Each Capabilities can be executed using a hardware instruction,
+    /// emulated with a sequence of software instructions, or not
+    /// supported at all.
+    enum ExecutionMode {
+      Unsupported = 0, ///< Unsupported feature on the card(Default value)
+       /// This is the execution mode that is set if the feature is emulated in
+       /// software.
+      Software,
+      /// This execution mode is set if the feature exists natively in hardware
+      Hardware
+    };
+
+    enum Caps {
+      HalfOps          = 0x1,  ///< Half float is supported or not.
+      DoubleOps        = 0x2,  ///< Double is supported or not.
+      ByteOps          = 0x3,  ///< Byte(char) is support or not.
+      ShortOps         = 0x4,  ///< Short is supported or not.
+      LongOps          = 0x5,  ///< Long is supported or not.
+      Images           = 0x6,  ///< Images are supported or not.
+      ByteStores       = 0x7,  ///< ByteStores available(!HD4XXX).
+      ConstantMem      = 0x8,  ///< Constant/CB memory.
+      LocalMem         = 0x9,  ///< Local/LDS memory.
+      PrivateMem       = 0xA,  ///< Scratch/Private/Stack memory.
+      RegionMem        = 0xB,  ///< OCL GDS Memory Extension.
+      FMA              = 0xC,  ///< Use HW FMA or SW FMA.
+      ArenaSegment     = 0xD,  ///< Use for Arena UAV per pointer 12-1023.
+      MultiUAV         = 0xE,  ///< Use for UAV per Pointer 0-7.
+      Reserved0        = 0xF,  ///< ReservedFlag
+      NoAlias          = 0x10, ///< Cached loads.
+      Signed24BitOps   = 0x11, ///< Peephole Optimization.
+      /// Debug mode implies that no hardware features or optimizations
+      /// are performned and that all memory access go through a single
+      /// uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX).
+      Debug            = 0x12,
+      CachedMem        = 0x13, ///< Cached mem is available or not.
+      BarrierDetect    = 0x14, ///< Detect duplicate barriers.
+      Reserved1        = 0x15, ///< Reserved flag
+      ByteLDSOps       = 0x16, ///< Flag to specify if byte LDS ops are available.
+      ArenaVectors     = 0x17, ///< Flag to specify if vector loads from arena work.
+      TmrReg           = 0x18, ///< Flag to specify if Tmr register is supported.
+      NoInline         = 0x19, ///< Flag to specify that no inlining should occur.
+      MacroDB          = 0x1A, ///< Flag to specify that backend handles macrodb.
+      HW64BitDivMod    = 0x1B, ///< Flag for backend to generate 64bit div/mod.
+      ArenaUAV         = 0x1C, ///< Flag to specify that arena uav is supported.
+      PrivateUAV       = 0x1D, ///< Flag to specify that private memory uses uav's.
+      /// If more capabilities are required, then
+      /// this number needs to be increased.
+      /// All capabilities must come before this
+      /// number.
+      MaxNumberCapabilities = 0x20
+    };
+    /// These have to be in order with the older generations
+    /// having the lower number enumerations.
+    enum Generation {
+      HD4XXX = 0, ///< 7XX based devices.
+      HD5XXX, ///< Evergreen based devices.
+      HD6XXX, ///< NI/Evergreen+ based devices.
+      HD7XXX, ///< Southern Islands based devices.
+      HDTEST, ///< Experimental feature testing device.
+      HDNUMGEN
+    };
+
+
+  AMDGPUDevice*
+    getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr,
+                      bool is64bit = false, bool is64on32bit = false);
+  } // namespace AMDILDeviceInfo
+} // namespace llvm
+#endif // AMDILDEVICEINFO_H
diff --git a/contrib/llvm/lib/Target/R600/AMDILDevices.h b/contrib/llvm/lib/Target/R600/AMDILDevices.h
new file mode 100644
index 000000000000..636fa6d35947
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILDevices.h
@@ -0,0 +1,19 @@
+//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+#ifndef AMDIL_DEVICES_H
+#define AMDIL_DEVICES_H
+// Include all of the device specific header files
+#include "AMDIL7XXDevice.h"
+#include "AMDILDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+#include "AMDILSIDevice.h"
+
+#endif // AMDIL_DEVICES_H
diff --git a/contrib/llvm/lib/Target/R600/AMDILEvergreenDevice.cpp b/contrib/llvm/lib/Target/R600/AMDILEvergreenDevice.cpp
new file mode 100644
index 000000000000..c5213a041005
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILEvergreenDevice.cpp
@@ -0,0 +1,169 @@
+//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+#include "AMDILEvergreenDevice.h"
+
+using namespace llvm;
+
+AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST)
+: AMDGPUDevice(ST) {
+  setCaps();
+  std::string name = ST->getDeviceName();
+  if (name == "cedar") {
+    DeviceFlag = OCL_DEVICE_CEDAR;
+  } else if (name == "redwood") {
+    DeviceFlag = OCL_DEVICE_REDWOOD;
+  } else if (name == "cypress") {
+    DeviceFlag = OCL_DEVICE_CYPRESS;
+  } else {
+    DeviceFlag = OCL_DEVICE_JUNIPER;
+  }
+}
+
+AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() {
+}
+
+size_t AMDGPUEvergreenDevice::getMaxLDSSize() const {
+  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_800;
+  } else {
+    return 0;
+  }
+}
+size_t AMDGPUEvergreenDevice::getMaxGDSSize() const {
+  if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
+    return MAX_LDS_SIZE_800;
+  } else {
+    return 0;
+  }
+}
+uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const {
+  return 12;
+}
+
+uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const {
+  switch(id) {
+  default:
+    assert(0 && "ID type passed in is unknown!");
+    break;
+  case CONSTANT_ID:
+  case RAW_UAV_ID:
+    return GLOBAL_RETURN_RAW_UAV_ID;
+  case GLOBAL_ID:
+  case ARENA_UAV_ID:
+    return DEFAULT_ARENA_UAV_ID;
+  case LDS_ID:
+    if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
+      return DEFAULT_LDS_ID;
+    } else {
+      return DEFAULT_ARENA_UAV_ID;
+    }
+  case GDS_ID:
+    if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
+      return DEFAULT_GDS_ID;
+    } else {
+      return DEFAULT_ARENA_UAV_ID;
+    }
+  case SCRATCH_ID:
+    if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
+      return DEFAULT_SCRATCH_ID;
+    } else {
+      return DEFAULT_ARENA_UAV_ID;
+    }
+  };
+  return 0;
+}
+
+size_t AMDGPUEvergreenDevice::getWavefrontSize() const {
+  return AMDGPUDevice::WavefrontSize;
+}
+
+uint32_t AMDGPUEvergreenDevice::getGeneration() const {
+  return AMDGPUDeviceInfo::HD5XXX;
+}
+
+void AMDGPUEvergreenDevice::setCaps() {
+  mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
+  mHWBits.set(AMDGPUDeviceInfo::ArenaUAV);
+  mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
+  mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod);
+  mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
+  if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) {
+    mHWBits.set(AMDGPUDeviceInfo::ByteStores);
+  }
+  if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
+    mSWBits.set(AMDGPUDeviceInfo::LocalMem);
+    mSWBits.set(AMDGPUDeviceInfo::RegionMem);
+  } else {
+    mHWBits.set(AMDGPUDeviceInfo::LocalMem);
+    mHWBits.set(AMDGPUDeviceInfo::RegionMem);
+  }
+  mHWBits.set(AMDGPUDeviceInfo::Images);
+  if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) {
+    mHWBits.set(AMDGPUDeviceInfo::NoAlias);
+  }
+  mHWBits.set(AMDGPUDeviceInfo::CachedMem);
+  if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) {
+    mHWBits.set(AMDGPUDeviceInfo::MultiUAV);
+  }
+  mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
+  mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps);
+  mHWBits.set(AMDGPUDeviceInfo::ArenaVectors);
+  mHWBits.set(AMDGPUDeviceInfo::LongOps);
+  mSWBits.reset(AMDGPUDeviceInfo::LongOps);
+  mHWBits.set(AMDGPUDeviceInfo::TmrReg);
+}
+
+AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST)
+  : AMDGPUEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDGPUCypressDevice::~AMDGPUCypressDevice() {
+}
+
+void AMDGPUCypressDevice::setCaps() {
+  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
+    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
+    mHWBits.set(AMDGPUDeviceInfo::FMA);
+  }
+}
+
+
+AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST)
+  : AMDGPUEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDGPUCedarDevice::~AMDGPUCedarDevice() {
+}
+
+void AMDGPUCedarDevice::setCaps() {
+  mSWBits.set(AMDGPUDeviceInfo::FMA);
+}
+
+size_t AMDGPUCedarDevice::getWavefrontSize() const {
+  return AMDGPUDevice::QuarterWavefrontSize;
+}
+
+AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST)
+  : AMDGPUEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDGPURedwoodDevice::~AMDGPURedwoodDevice() {
+}
+
+void AMDGPURedwoodDevice::setCaps() {
+  mSWBits.set(AMDGPUDeviceInfo::FMA);
+}
+
+size_t AMDGPURedwoodDevice::getWavefrontSize() const {
+  return AMDGPUDevice::HalfWavefrontSize;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDILEvergreenDevice.h b/contrib/llvm/lib/Target/R600/AMDILEvergreenDevice.h
new file mode 100644
index 000000000000..ea90f774a856
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILEvergreenDevice.h
@@ -0,0 +1,93 @@
+//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the subtarget data classes.
+///
+/// This file will define the interface that each generation needs to
+/// implement in order to correctly answer queries on the capabilities of the
+/// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef AMDILEVERGREENDEVICE_H
+#define AMDILEVERGREENDEVICE_H
+#include "AMDGPUSubtarget.h"
+#include "AMDILDevice.h"
+
+namespace llvm {
+  class AMDGPUSubtarget;
+//===----------------------------------------------------------------------===//
+// Evergreen generation of devices and their respective sub classes
+//===----------------------------------------------------------------------===//
+
+
+/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen
+/// series of cards.
+///
+/// This class contains information required to differentiate
+/// the Evergreen device from the generic AMDGPUDevice. This device represents
+/// that capabilities of the 'Juniper' cards, also known as the HD57XX.
+class AMDGPUEvergreenDevice : public AMDGPUDevice {
+public:
+  AMDGPUEvergreenDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPUEvergreenDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual size_t getMaxGDSSize() const;
+  virtual size_t getWavefrontSize() const;
+  virtual uint32_t getGeneration() const;
+  virtual uint32_t getMaxNumUAVs() const;
+  virtual uint32_t getResourceID(uint32_t) const;
+protected:
+  virtual void setCaps();
+};
+
+/// The AMDGPUCypressDevice is similiar to the AMDGPUEvergreenDevice, except it has
+/// support for double precision operations. This device is used to represent
+/// both the Cypress and Hemlock cards, which are commercially known as HD58XX
+/// and HD59XX cards.
+class AMDGPUCypressDevice : public AMDGPUEvergreenDevice {
+public:
+  AMDGPUCypressDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPUCypressDevice();
+private:
+  virtual void setCaps();
+};
+
+
+/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based
+/// devices.
+///
+/// This class differs from the base AMDGPUEvergreenDevice in that the
+/// device is a ~quarter of the 'Juniper'. These are commercially known as the
+/// HD54XX and HD53XX series of cards.
+class AMDGPUCedarDevice : public AMDGPUEvergreenDevice {
+public:
+  AMDGPUCedarDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPUCedarDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+};
+
+/// \brief The AMDGPURedwoodDevice is the class the represents all of the 'Redwood' based
+/// devices.
+///
+/// This class differs from the base class, in that these devices are
+/// considered about half of a 'Juniper' device. These are commercially known as
+/// the HD55XX and HD56XX series of cards.
+class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice {
+public:
+  AMDGPURedwoodDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPURedwoodDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+};
+  
+} // namespace llvm
+#endif // AMDILEVERGREENDEVICE_H
diff --git a/contrib/llvm/lib/Target/R600/AMDILISelDAGToDAG.cpp b/contrib/llvm/lib/Target/R600/AMDILISelDAGToDAG.cpp
new file mode 100644
index 000000000000..fa8f62de9c0a
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -0,0 +1,643 @@
+//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Defines an instruction selector for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUISelLowering.h" // For AMDGPUISD
+#include "AMDGPURegisterInfo.h"
+#include "AMDILDevices.h"
+#include "R600InstrInfo.h"
+#include "SIISelLowering.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include <list>
+#include <queue>
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// AMDGPU specific code to select AMDGPU machine instructions for
+/// SelectionDAG operations.
+class AMDGPUDAGToDAGISel : public SelectionDAGISel {
+  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
+  // make the right decision when generating code for different targets.
+  const AMDGPUSubtarget &Subtarget;
+public:
+  AMDGPUDAGToDAGISel(TargetMachine &TM);
+  virtual ~AMDGPUDAGToDAGISel();
+
+  SDNode *Select(SDNode *N);
+  virtual const char *getPassName() const;
+  virtual void PostprocessISelDAG();
+
+private:
+  inline SDValue getSmallIPtrImm(unsigned Imm);
+  bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+
+  // Complex pattern selectors
+  bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
+  bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
+
+  static bool checkType(const Value *ptr, unsigned int addrspace);
+  static const Value *getBasePointerValue(const Value *V);
+
+  static bool isGlobalStore(const StoreSDNode *N);
+  static bool isPrivateStore(const StoreSDNode *N);
+  static bool isLocalStore(const StoreSDNode *N);
+  static bool isRegionStore(const StoreSDNode *N);
+
+  static bool isCPLoad(const LoadSDNode *N);
+  static bool isConstantLoad(const LoadSDNode *N, int cbID);
+  static bool isGlobalLoad(const LoadSDNode *N);
+  static bool isParamLoad(const LoadSDNode *N);
+  static bool isPrivateLoad(const LoadSDNode *N);
+  static bool isLocalLoad(const LoadSDNode *N);
+  static bool isRegionLoad(const LoadSDNode *N);
+
+  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+  bool SelectGlobalValueVariableOffset(SDValue Addr,
+      SDValue &BaseReg, SDValue& Offset);
+  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+
+  // Include the pieces autogenerated from the target description.
+#include "AMDGPUGenDAGISel.inc"
+};
+}  // end anonymous namespace
+
+/// \brief This pass converts a legalized DAG into a AMDGPU-specific
+// DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM
+                                       ) {
+  return new AMDGPUDAGToDAGISel(TM);
+}
+
+AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM
+                                     )
+  : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
+}
+
+AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
+}
+
+SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
+  return CurDAG->getTargetConstant(Imm, MVT::i32);
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRParam(
+    SDValue Addr, SDValue& R1, SDValue& R2) {
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i32);
+  }
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+  return SelectADDRParam(Addr, R1, R2);
+}
+
+
+bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i64);
+  }
+  return true;
+}
+
+SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
+  unsigned int Opc = N->getOpcode();
+  if (N->isMachineOpcode()) {
+    return NULL;   // Already selected.
+  }
+  switch (Opc) {
+  default: break;
+  case ISD::BUILD_VECTOR: {
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+      break;
+    }
+    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+    // that adds a 128 bits reg copy when going through TwoAddressInstructions
+    // pass. We want to avoid 128 bits copies as much as possible because they
+    // can't be bundled by our scheduler.
+    SDValue RegSeqArgs[9] = {
+      CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
+      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
+      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
+    };
+    bool IsRegSeq = true;
+    for (unsigned i = 0; i < N->getNumOperands(); i++) {
+      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
+        IsRegSeq = false;
+        break;
+      }
+      RegSeqArgs[2 * i + 1] = N->getOperand(i);
+    }
+    if (!IsRegSeq)
+      break;
+    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
+        RegSeqArgs, 2 * N->getNumOperands() + 1);
+  }
+  case ISD::ConstantFP:
+  case ISD::Constant: {
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    // XXX: Custom immediate lowering not implemented yet.  Instead we use
+    // pseudo instructions defined in SIInstructions.td
+    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+      break;
+    }
+    const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+
+    uint64_t ImmValue = 0;
+    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+
+    if (N->getOpcode() == ISD::ConstantFP) {
+      // XXX: 64-bit Immediates not supported yet
+      assert(N->getValueType(0) != MVT::f64);
+
+      ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
+      APFloat Value = C->getValueAPF();
+      float FloatValue = Value.convertToFloat();
+      if (FloatValue == 0.0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (FloatValue == 0.5) {
+        ImmReg = AMDGPU::HALF;
+      } else if (FloatValue == 1.0) {
+        ImmReg = AMDGPU::ONE;
+      } else {
+        ImmValue = Value.bitcastToAPInt().getZExtValue();
+      }
+    } else {
+      // XXX: 64-bit Immediates not supported yet
+      assert(N->getValueType(0) != MVT::i64);
+
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
+      if (C->getZExtValue() == 0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (C->getZExtValue() == 1) {
+        ImmReg = AMDGPU::ONE_INT;
+      } else {
+        ImmValue = C->getZExtValue();
+      }
+    }
+
+    for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
+                              Use != SDNode::use_end(); Use = Next) {
+      Next = llvm::next(Use);
+      std::vector<SDValue> Ops;
+      for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
+        Ops.push_back(Use->getOperand(i));
+      }
+
+      if (!Use->isMachineOpcode()) {
+          if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+            // We can only use literal constants (e.g. AMDGPU::ZERO,
+            // AMDGPU::ONE, etc) in machine opcodes.
+            continue;
+          }
+      } else {
+        if (!TII->isALUInstr(Use->getMachineOpcode()) ||
+            (TII->get(Use->getMachineOpcode()).TSFlags &
+            R600_InstFlag::VECTOR)) {
+          continue;
+        }
+
+        int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM);
+        assert(ImmIdx != -1);
+
+        // subtract one from ImmIdx, because the DST operand is usually index
+        // 0 for MachineInstrs, but we have no DST in the Ops vector.
+        ImmIdx--;
+
+        // Check that we aren't already using an immediate.
+        // XXX: It's possible for an instruction to have more than one
+        // immediate operand, but this is not supported yet.
+        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
+          assert(C);
+
+          if (C->getZExtValue() != 0) {
+            // This instruction is already using an immediate.
+            continue;
+          }
+
+          // Set the immediate value
+          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
+        }
+      }
+      // Set the immediate register
+      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
+
+      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
+    }
+    break;
+  }
+  }
+  SDNode *Result = SelectCode(N);
+
+  // Fold operands of selected node
+
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
+    const R600InstrInfo *TII =
+        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+    if (Result && Result->isMachineOpcode() &&
+        !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
+        && TII->isALUInstr(Result->getMachineOpcode())) {
+      // Fold FNEG/FABS/CONST_ADDRESS
+      // TODO: Isel can generate multiple MachineInst, we need to recursively
+      // parse Result
+      bool IsModified = false;
+      do {
+        std::vector<SDValue> Ops;
+        for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
+            I != E; ++I)
+          Ops.push_back(*I);
+        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
+        if (IsModified) {
+          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
+        }
+      } while (IsModified);
+
+      // If node has a single use which is CLAMP_R600, folds it
+      if (Result->hasOneUse() && Result->isMachineOpcode()) {
+        SDNode *PotentialClamp = *Result->use_begin();
+        if (PotentialClamp->isMachineOpcode() &&
+            PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
+          unsigned ClampIdx =
+            TII->getOperandIdx(Result->getMachineOpcode(), R600Operands::CLAMP);
+          std::vector<SDValue> Ops;
+          unsigned NumOp = Result->getNumOperands();
+          for (unsigned i = 0; i < NumOp; ++i) {
+            Ops.push_back(Result->getOperand(i));
+          }
+          Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
+          Result = CurDAG->SelectNodeTo(PotentialClamp,
+              Result->getMachineOpcode(), PotentialClamp->getVTList(),
+              Ops.data(), NumOp);
+        }
+      }
+    }
+  }
+
+  return Result;
+}
+
+bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
+    const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
+  int OperandIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1),
+    TII->getOperandIdx(Opcode, R600Operands::SRC2)
+  };
+  int SelIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL),
+    TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL)
+  };
+  int NegIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG),
+    TII->getOperandIdx(Opcode, R600Operands::SRC2_NEG)
+  };
+  int AbsIdx[] = {
+    TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS),
+    TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS),
+    -1
+  };
+
+  for (unsigned i = 0; i < 3; i++) {
+    if (OperandIdx[i] < 0)
+      return false;
+    SDValue Operand = Ops[OperandIdx[i] - 1];
+    switch (Operand.getOpcode()) {
+    case AMDGPUISD::CONST_ADDRESS: {
+      SDValue CstOffset;
+      if (Operand.getValueType().isVector() ||
+          !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset))
+        break;
+
+      // Gather others constants values
+      std::vector<unsigned> Consts;
+      for (unsigned j = 0; j < 3; j++) {
+        int SrcIdx = OperandIdx[j];
+        if (SrcIdx < 0)
+          break;
+        if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
+          if (Reg->getReg() == AMDGPU::ALU_CONST) {
+            ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
+            Consts.push_back(Cst->getZExtValue());
+          }
+        }
+      }
+
+      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
+      Consts.push_back(Cst->getZExtValue());
+      if (!TII->fitsConstReadLimitations(Consts))
+        break;
+
+      Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
+      Ops[SelIdx[i] - 1] = CstOffset;
+      return true;
+      }
+    case ISD::FNEG:
+      if (NegIdx[i] < 0)
+        break;
+      Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
+      Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
+      return true;
+    case ISD::FABS:
+      if (AbsIdx[i] < 0)
+        break;
+      Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
+      Ops[AbsIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
+      return true;
+    case ISD::BITCAST:
+      Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
+      return true;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
+  if (!ptr) {
+    return false;
+  }
+  Type *ptrType = ptr->getType();
+  return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
+}
+
+const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) {
+  if (!V) {
+    return NULL;
+  }
+  const Value *ret = NULL;
+  ValueMap<const Value *, bool> ValueBitMap;
+  std::queue<const Value *, std::list<const Value *> > ValueQueue;
+  ValueQueue.push(V);
+  while (!ValueQueue.empty()) {
+    V = ValueQueue.front();
+    if (ValueBitMap.find(V) == ValueBitMap.end()) {
+      ValueBitMap[V] = true;
+      if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) {
+        ret = V;
+        break;
+      } else if (dyn_cast<GlobalVariable>(V)) {
+        ret = V;
+        break;
+      } else if (dyn_cast<Constant>(V)) {
+        const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
+        if (CE) {
+          ValueQueue.push(CE->getOperand(0));
+        }
+      } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+        ret = AI;
+        break;
+      } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
+        uint32_t numOps = I->getNumOperands();
+        for (uint32_t x = 0; x < numOps; ++x) {
+          ValueQueue.push(I->getOperand(x));
+        }
+      } else {
+        assert(!"Found a Value that we didn't know how to handle!");
+      }
+    }
+    ValueQueue.pop();
+  }
+  return ret;
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
+  return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS));
+}
+
+bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
+  if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) {
+    return true;
+  }
+  MachineMemOperand *MMO = N->getMemOperand();
+  const Value *V = MMO->getValue();
+  const Value *BV = getBasePointerValue(V);
+  if (MMO
+      && MMO->getValue()
+      && ((V && dyn_cast<GlobalValue>(V))
+          || (BV && dyn_cast<GlobalValue>(
+                        getBasePointerValue(MMO->getValue()))))) {
+    return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS);
+  } else {
+    return false;
+  }
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isLocalLoad(const  LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionLoad(const  LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
+  MachineMemOperand *MMO = N->getMemOperand();
+  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+    if (MMO) {
+      const Value *V = MMO->getValue();
+      const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
+      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
+  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+    // Check to make sure we are not a constant pool load or a constant load
+    // that is marked as a private load
+    if (isCPLoad(N) || isConstantLoad(N, -1)) {
+      return false;
+    }
+  }
+  if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) {
+    return true;
+  }
+  return false;
+}
+
+const char *AMDGPUDAGToDAGISel::getPassName() const {
+  return "AMDGPU DAG->DAG Pattern Instruction Selection";
+}
+
+#ifdef DEBUGTMP
+#undef INT64_C
+#endif
+#undef DEBUGTMP
+
+///==== AMDGPU Functions ====///
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+    SDValue& IntPtr) {
+  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+    SDValue& BaseReg, SDValue &Offset) {
+  if (!dyn_cast<ConstantSDNode>(Addr)) {
+    BaseReg = Addr;
+    Offset = CurDAG->getIntPtrConstant(0, true);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) {
+  ConstantSDNode * IMMOffset;
+
+  if (Addr.getOpcode() == ISD::ADD
+      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && isInt<16>(IMMOffset->getZExtValue())) {
+
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+      return true;
+  // If the pointer address is constant, we can move it to the offset field.
+  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
+             && isInt<16>(IMMOffset->getZExtValue())) {
+    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                  CurDAG->getEntryNode().getDebugLoc(),
+                                  AMDGPU::ZERO, MVT::i32);
+    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+    return true;
+  }
+
+  // Default case, no offset
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  ConstantSDNode *C;
+
+  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
+    Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+  } else {
+    Base = Addr;
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  }
+
+  return true;
+}
+
+void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
+
+  // Go over all selected nodes and try to fold them a bit more
+  const AMDGPUTargetLowering& Lowering = ((const AMDGPUTargetLowering&)TLI);
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ++I) {
+
+    MachineSDNode *Node = dyn_cast<MachineSDNode>(I);
+    if (!Node)
+      continue;
+
+    SDNode *ResNode = Lowering.PostISelFolding(Node, *CurDAG);
+    if (ResNode != Node)
+      ReplaceUses(Node, ResNode);
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/R600/AMDILISelLowering.cpp b/contrib/llvm/lib/Target/R600/AMDILISelLowering.cpp
new file mode 100644
index 000000000000..922cac12b98e
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILISelLowering.cpp
@@ -0,0 +1,647 @@
+//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief TargetLowering functions borrowed from AMDIL.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUISelLowering.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDILDevices.h"
+#include "AMDILIntrinsicInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation Help Functions End
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Class Implementation Begins
+//===----------------------------------------------------------------------===//
+void AMDGPUTargetLowering::InitAMDILLowering() {
+  int types[] = {
+    (int)MVT::i8,
+    (int)MVT::i16,
+    (int)MVT::i32,
+    (int)MVT::f32,
+    (int)MVT::f64,
+    (int)MVT::i64,
+    (int)MVT::v2i8,
+    (int)MVT::v4i8,
+    (int)MVT::v2i16,
+    (int)MVT::v4i16,
+    (int)MVT::v4f32,
+    (int)MVT::v4i32,
+    (int)MVT::v2f32,
+    (int)MVT::v2i32,
+    (int)MVT::v2f64,
+    (int)MVT::v2i64
+  };
+
+  int IntTypes[] = {
+    (int)MVT::i8,
+    (int)MVT::i16,
+    (int)MVT::i32,
+    (int)MVT::i64
+  };
+
+  int FloatTypes[] = {
+    (int)MVT::f32,
+    (int)MVT::f64
+  };
+
+  int VectorTypes[] = {
+    (int)MVT::v2i8,
+    (int)MVT::v4i8,
+    (int)MVT::v2i16,
+    (int)MVT::v4i16,
+    (int)MVT::v4f32,
+    (int)MVT::v4i32,
+    (int)MVT::v2f32,
+    (int)MVT::v2i32,
+    (int)MVT::v2f64,
+    (int)MVT::v2i64
+  };
+  size_t NumTypes = sizeof(types) / sizeof(*types);
+  size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
+  size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
+  size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
+
+  const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
+  // These are the current register classes that are
+  // supported
+
+  for (unsigned int x  = 0; x < NumTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
+
+    //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types
+    // We cannot sextinreg, expand to shifts
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
+    setOperationAction(ISD::SUBE, VT, Expand);
+    setOperationAction(ISD::SUBC, VT, Expand);
+    setOperationAction(ISD::ADDE, VT, Expand);
+    setOperationAction(ISD::ADDC, VT, Expand);
+    setOperationAction(ISD::BRCOND, VT, Custom);
+    setOperationAction(ISD::BR_JT, VT, Expand);
+    setOperationAction(ISD::BRIND, VT, Expand);
+    // TODO: Implement custom UREM/SREM routines
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    if (VT != MVT::i64 && VT != MVT::v2i64) {
+      setOperationAction(ISD::SDIV, VT, Custom);
+    }
+  }
+  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
+
+    // IL does not have these operations for floating point types
+    setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
+    setOperationAction(ISD::SETOLT, VT, Expand);
+    setOperationAction(ISD::SETOGE, VT, Expand);
+    setOperationAction(ISD::SETOGT, VT, Expand);
+    setOperationAction(ISD::SETOLE, VT, Expand);
+    setOperationAction(ISD::SETULT, VT, Expand);
+    setOperationAction(ISD::SETUGE, VT, Expand);
+    setOperationAction(ISD::SETUGT, VT, Expand);
+    setOperationAction(ISD::SETULE, VT, Expand);
+  }
+
+  for (unsigned int x = 0; x < NumIntTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
+
+    // GPU also does not have divrem function for signed or unsigned
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+
+    // GPU does not have [S|U]MUL_LOHI functions as a single instruction
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+    // GPU doesn't have a rotl, rotr, or byteswap instruction
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
+
+    // GPU doesn't have any counting operators
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+  }
+
+  for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
+
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    // setOperationAction(ISD::VSETCC, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+
+  }
+  if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
+    setOperationAction(ISD::MULHU, MVT::i64, Expand);
+    setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
+    setOperationAction(ISD::MULHS, MVT::i64, Expand);
+    setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
+    setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+    setOperationAction(ISD::SREM, MVT::v2i64, Expand);
+    setOperationAction(ISD::Constant          , MVT::i64  , Legal);
+    setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
+  }
+  if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
+    // we support loading/storing v2f64 but not operations on the type
+    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ConstantFP        , MVT::f64  , Legal);
+    // We want to expand vector conversions into their scalar
+    // counterparts.
+    setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::FABS, MVT::f64, Expand);
+    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+  }
+  // TODO: Fix the UDIV24 algorithm so it works for these
+  // types correctly. This needs vector comparisons
+  // for this to work correctly.
+  setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
+  setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
+  setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
+  setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+  setOperationAction(ISD::SUBC, MVT::Other, Expand);
+  setOperationAction(ISD::ADDE, MVT::Other, Expand);
+  setOperationAction(ISD::ADDC, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
+
+
+  // Use the default implementation.
+  setOperationAction(ISD::ConstantFP        , MVT::f32    , Legal);
+  setOperationAction(ISD::Constant          , MVT::i32    , Legal);
+
+  setSchedulingPreference(Sched::RegPressure);
+  setPow2DivIsCheap(false);
+  setSelectIsExpensive(true);
+  setJumpIsExpensive(true);
+
+  MaxStoresPerMemcpy  = 4096;
+  MaxStoresPerMemmove = 4096;
+  MaxStoresPerMemset  = 4096;
+
+}
+
+bool
+AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+    const CallInst &I, unsigned Intrinsic) const {
+  return false;
+}
+
+// The backend supports 32 and 64 bit floating point immediates
+bool
+AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
+      || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool
+AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+  if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
+      || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+
+// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
+// be zero. Op is expected to be a target specific node. Used by DAG
+// combiner.
+
+void
+AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
+    const SDValue Op,
+    APInt &KnownZero,
+    APInt &KnownOne,
+    const SelectionDAG &DAG,
+    unsigned Depth) const {
+  APInt KnownZero2;
+  APInt KnownOne2;
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything
+  switch (Op.getOpcode()) {
+    default: break;
+    case ISD::SELECT_CC:
+             DAG.ComputeMaskedBits(
+                 Op.getOperand(1),
+                 KnownZero,
+                 KnownOne,
+                 Depth + 1
+                 );
+             DAG.ComputeMaskedBits(
+                 Op.getOperand(0),
+                 KnownZero2,
+                 KnownOne2
+                 );
+             assert((KnownZero & KnownOne) == 0
+                 && "Bits known to be one AND zero?");
+             assert((KnownZero2 & KnownOne2) == 0
+                 && "Bits known to be one AND zero?");
+             // Only known if known in both the LHS and RHS
+             KnownOne &= KnownOne2;
+             KnownZero &= KnownZero2;
+             break;
+  };
+}
+
+//===----------------------------------------------------------------------===//
+//                           Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue
+AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerSDIV64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerSDIV32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16
+      || OVT.getScalarType() == MVT::i8) {
+    DST = LowerSDIV24(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerSREM64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerSREM32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16) {
+    DST = LowerSREM16(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i8) {
+    DST = LowerSREM8(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Data = Op.getOperand(0);
+  VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
+  DebugLoc DL = Op.getDebugLoc();
+  EVT DVT = Data.getValueType();
+  EVT BVT = BaseType->getVT();
+  unsigned baseBits = BVT.getScalarType().getSizeInBits();
+  unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
+  unsigned shiftBits = srcBits - baseBits;
+  if (srcBits < 32) {
+    // If the op is less than 32 bits, then it needs to extend to 32bits
+    // so it can properly keep the upper bits valid.
+    EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
+    Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
+    shiftBits = 32 - baseBits;
+    DVT = IVT;
+  }
+  SDValue Shift = DAG.getConstant(shiftBits, DVT);
+  // Shift left by 'Shift' bits.
+  Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
+  // Signed shift Right by 'Shift' bits.
+  Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
+  if (srcBits < 32) {
+    // Once the sign extension is done, the op needs to be converted to
+    // its original type.
+    Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
+  }
+  return Data;
+}
+EVT
+AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
+  int iSize = (size * numEle);
+  int vEle = (iSize >> ((size == 64) ? 6 : 5));
+  if (!vEle) {
+    vEle = 1;
+  }
+  if (size == 64) {
+    if (vEle == 1) {
+      return EVT(MVT::i64);
+    } else {
+      return EVT(MVT::getVectorVT(MVT::i64, vEle));
+    }
+  } else {
+    if (vEle == 1) {
+      return EVT(MVT::i32);
+    } else {
+      return EVT(MVT::getVectorVT(MVT::i32, vEle));
+    }
+  }
+}
+
+SDValue
+AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Jump  = Op.getOperand(2);
+  SDValue Result;
+  Result = DAG.getNode(
+      AMDGPUISD::BRANCH_COND,
+      Op.getDebugLoc(),
+      Op.getValueType(),
+      Chain, Jump, Cond);
+  return Result;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  MVT INTTY;
+  MVT FLTTY;
+  if (!OVT.isVector()) {
+    INTTY = MVT::i32;
+    FLTTY = MVT::f32;
+  } else if (OVT.getVectorNumElements() == 2) {
+    INTTY = MVT::v2i32;
+    FLTTY = MVT::v2f32;
+  } else if (OVT.getVectorNumElements() == 4) {
+    INTTY = MVT::v4i32;
+    FLTTY = MVT::v4f32;
+  }
+  unsigned bitsize = OVT.getScalarType().getSizeInBits();
+  // char|short jq = ia ^ ib;
+  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
+
+  // jq = jq >> (bitsize - 2)
+  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); 
+
+  // jq = jq | 0x1
+  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
+
+  // jq = (int)jq
+  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
+
+  // int ia = (int)LHS;
+  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+
+  // int ib, (int)RHS;
+  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+
+  // float fa = (float)ia;
+  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+
+  // float fb = (float)ib;
+  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+
+  // float fq = native_divide(fa, fb);
+  SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);
+
+  // fq = trunc(fq);
+  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+
+  // float fqneg = -fq;
+  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
+
+  // float fr = mad(fqneg, fb, fa);
+  SDValue fr = DAG.getNode(ISD::FADD, DL, FLTTY,
+      DAG.getNode(ISD::MUL, DL, FLTTY, fqneg, fb), fa);
+
+  // int iq = (int)fq;
+  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+
+  // fr = fabs(fr);
+  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
+
+  // fb = fabs(fb);
+  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
+
+  // int cv = fr >= fb;
+  SDValue cv;
+  if (INTTY == MVT::i32) {
+    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
+  } else {
+    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
+  }
+  // jq = (cv ? jq : 0);
+  jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, 
+      DAG.getConstant(0, OVT));
+  // dst = iq + jq;
+  iq = DAG.getSExtOrTrunc(iq, DL, OVT);
+  iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
+  return iq;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerSDIV32 function generates equivalent to the following IL.
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r0, r0, r1
+  // ixor r10, r10, r11
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getSelectCC(DL,
+      r0, DAG.getConstant(0, OVT),
+      DAG.getConstant(-1, MVT::i32),
+      DAG.getConstant(0, MVT::i32),
+      ISD::SETLT);
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getSelectCC(DL,
+      r1, DAG.getConstant(0, OVT),
+      DAG.getConstant(-1, MVT::i32),
+      DAG.getConstant(0, MVT::i32),
+      ISD::SETLT);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r0, r0, r1
+  r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+  // ixor r10, r10, r11
+  r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); 
+  return DST;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i8) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i8) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
+  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
+  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
+  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
+  return LHS;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i16) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i16) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
+  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
+  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
+  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
+  return LHS;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerSREM32 function generates equivalent to the following IL.
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r20, r0, r1
+  // umul r20, r20, r1
+  // sub r0, r0, r20
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r20, r0, r1
+  SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
+
+  // umul r20, r20, r1
+  r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
+
+  // sub r0, r0, r20
+  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); 
+  return DST;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue(Op.getNode(), 0);
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDILInstrInfo.td b/contrib/llvm/lib/Target/R600/AMDILInstrInfo.td
new file mode 100644
index 000000000000..110f1476513b
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILInstrInfo.td
@@ -0,0 +1,207 @@
+//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file describes the AMDIL instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+// AMDIL Instruction Predicate Definitions
+// Predicate that is set to true if the hardware supports double precision
+// divide
+def HasHWDDiv                 : Predicate<"Subtarget.device()"
+                           "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && "
+              "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
+
+// Predicate that is set to true if the hardware supports double, but not double
+// precision divide in hardware
+def HasSWDDiv             : Predicate<"Subtarget.device()"
+                           "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
+              "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
+
+// Predicate that is set to true if the hardware support 24bit signed
+// math ops. Otherwise a software expansion to 32bit math ops is used instead.
+def HasHWSign24Bit          : Predicate<"Subtarget.device()"
+                            "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">;
+
+// Predicate that is set to true if 64bit operations are supported or not
+def HasHW64Bit              : Predicate<"Subtarget.device()"
+                            "->usesHardware(AMDGPUDeviceInfo::LongOps)">;
+def HasSW64Bit              : Predicate<"Subtarget.device()"
+                            "->usesSoftware(AMDGPUDeviceInfo::LongOps)">;
+
+// Predicate that is set to true if the timer register is supported
+def HasTmrRegister          : Predicate<"Subtarget.device()"
+                            "->isSupported(AMDGPUDeviceInfo::TmrReg)">;
+// Predicate that is true if we are at least evergreen series
+def HasDeviceIDInst         : Predicate<"Subtarget.device()"
+                            "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">;
+
+// Predicate that is true if we have region address space.
+def hasRegionAS             : Predicate<"Subtarget.device()"
+                            "->usesHardware(AMDGPUDeviceInfo::RegionMem)">;
+
+// Predicate that is false if we don't have region address space.
+def noRegionAS             : Predicate<"!Subtarget.device()"
+                            "->isSupported(AMDGPUDeviceInfo::RegionMem)">;
+
+
+// Predicate that is set to true if 64bit Mul is supported in the IL or not
+def HasHW64Mul              : Predicate<"Subtarget.calVersion()" 
+                                          ">= CAL_VERSION_SC_139"
+                                          "&& Subtarget.device()"
+                                          "->getGeneration() >="
+                                          "AMDGPUDeviceInfo::HD5XXX">;
+def HasSW64Mul              : Predicate<"Subtarget.calVersion()" 
+                                          "< CAL_VERSION_SC_139">;
+// Predicate that is set to true if 64bit Div/Mod is supported in the IL or not
+def HasHW64DivMod           : Predicate<"Subtarget.device()"
+                            "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">;
+def HasSW64DivMod           : Predicate<"Subtarget.device()"
+                            "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">;
+
+// Predicate that is set to true if 64bit pointer are used.
+def Has64BitPtr             : Predicate<"Subtarget.is64bit()">;
+def Has32BitPtr             : Predicate<"!Subtarget.is64bit()">;
+//===--------------------------------------------------------------------===//
+// Custom Operands
+//===--------------------------------------------------------------------===//
+def brtarget   : Operand<OtherVT>;
+
+//===--------------------------------------------------------------------===//
+// Custom Selection DAG Type Profiles
+//===--------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Generic Profile Types
+//===----------------------------------------------------------------------===//
+
+def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [
+    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [
+    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>
+    ]>;
+def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [
+    SDTCisEltOfVec<1, 0>
+    ]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Branch instruction where second and third are basic blocks
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+    SDTCisVT<0, OtherVT>
+    ]>;
+
+//===--------------------------------------------------------------------===//
+// Custom Selection DAG Nodes
+//===--------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Flow Control DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_brcond      : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Call/Return DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_retflag       : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+    [SDNPHasChain, SDNPOptInGlue]>;
+
+//===--------------------------------------------------------------------===//
+// Instructions
+//===--------------------------------------------------------------------===//
+// Floating point math functions
+def IL_div_inf      : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>;
+
+//===----------------------------------------------------------------------===//
+// Integer functions
+//===----------------------------------------------------------------------===//
+def IL_umul        : SDNode<"AMDGPUISD::UMUL"    , SDTIntBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+
+//===--------------------------------------------------------------------===//
+// Custom Pattern DAG Nodes
+//===--------------------------------------------------------------------===//
+def global_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Load pattern fragments
+//===----------------------------------------------------------------------===//
+// Global address space loads
+def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+// Constant address space loads
+def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Complex addressing mode patterns
+//===----------------------------------------------------------------------===//
+def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>;
+def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>;
+def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>;
+def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>;
+
+//===----------------------------------------------------------------------===//
+// Instruction format classes
+//===----------------------------------------------------------------------===//
+class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+     let Namespace = "AMDGPU";
+     dag OutOperandList = outs;
+     dag InOperandList = ins;
+     let Pattern = pattern;
+     let AsmString = !strconcat(asmstr, "\n");
+     let isPseudo = 1;
+     let Itinerary = NullALU;
+     bit hasIEEEFlag = 0;
+     bit hasZeroOpFlag = 0;
+     let mayLoad = 0;
+     let mayStore = 0;
+     let hasSideEffects = 0;
+}
+
+//===--------------------------------------------------------------------===//
+// Multiclass Instruction formats
+//===--------------------------------------------------------------------===//
+// Multiclass that handles branch instructions
+multiclass BranchConditional<SDNode Op> {
+    def _i32 : ILFormat<(outs),
+  (ins brtarget:$target, GPRI32:$src0),
+        "; i32 Pseudo branch instruction",
+  [(Op bb:$target, GPRI32:$src0)]>;
+    def _f32 : ILFormat<(outs),
+  (ins brtarget:$target, GPRF32:$src0),
+        "; f32 Pseudo branch instruction",
+  [(Op bb:$target, GPRF32:$src0)]>;
+}
+
+// Only scalar types should generate flow control
+multiclass BranchInstr<string name> {
+  def _i32 : ILFormat<(outs), (ins GPRI32:$src),
+      !strconcat(name, " $src"), []>;
+  def _f32 : ILFormat<(outs), (ins GPRF32:$src),
+      !strconcat(name, " $src"), []>;
+}
+// Only scalar types should generate flow control
+multiclass BranchInstr2<string name> {
+  def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1),
+      !strconcat(name, " $src0, $src1"), []>;
+  def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1),
+      !strconcat(name, " $src0, $src1"), []>;
+}
+
+//===--------------------------------------------------------------------===//
+// Intrinsics support
+//===--------------------------------------------------------------------===//
+include "AMDILIntrinsics.td"
diff --git a/contrib/llvm/lib/Target/R600/AMDILIntrinsicInfo.cpp b/contrib/llvm/lib/Target/R600/AMDILIntrinsicInfo.cpp
new file mode 100644
index 000000000000..4ddb057d80a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILIntrinsicInfo.cpp
@@ -0,0 +1,79 @@
+//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Implementation of the IntrinsicInfo class.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "AMDILIntrinsicInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDIL.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm) 
+  : TargetIntrinsicInfo() {
+}
+
+std::string 
+AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
+    unsigned int numTys) const  {
+  static const char* const names[] = {
+#define GET_INTRINSIC_NAME_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_NAME_TABLE
+  };
+
+  if (IntrID < Intrinsic::num_intrinsics) {
+    return 0;
+  }
+  assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics
+      && "Invalid intrinsic ID");
+
+  std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
+  return Result;
+}
+
+unsigned int
+AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const  {
+#define GET_FUNCTION_RECOGNIZER
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_FUNCTION_RECOGNIZER
+  AMDGPUIntrinsic::ID IntrinsicID
+    = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
+  IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name);
+
+  if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
+    return IntrinsicID;
+  }
+  return 0;
+}
+
+bool 
+AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const  {
+  // Overload Table
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+}
+
+Function*
+AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+    Type **Tys,
+    unsigned numTys) const  {
+  llvm_unreachable("Not implemented");
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDILIntrinsicInfo.h b/contrib/llvm/lib/Target/R600/AMDILIntrinsicInfo.h
new file mode 100644
index 000000000000..35559e23fceb
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILIntrinsicInfo.h
@@ -0,0 +1,49 @@
+//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
+//
+//===-----------------------------------------------------------------------===//
+#ifndef AMDIL_INTRINSICS_H
+#define AMDIL_INTRINSICS_H
+
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+namespace llvm {
+class TargetMachine;
+
+namespace AMDGPUIntrinsic {
+enum ID {
+  last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
+#define GET_INTRINSIC_ENUM_VALUES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ENUM_VALUES
+      , num_AMDGPU_intrinsics
+};
+
+} // end namespace AMDGPUIntrinsic
+
+class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
+public:
+  AMDGPUIntrinsicInfo(TargetMachine *tm);
+  std::string getName(unsigned int IntrId, Type **Tys = 0,
+                      unsigned int numTys = 0) const;
+  unsigned int lookupName(const char *Name, unsigned int Len) const;
+  bool isOverloaded(unsigned int IID) const;
+  Function *getDeclaration(Module *M, unsigned int ID,
+                           Type **Tys = 0,
+                           unsigned int numTys = 0) const;
+};
+
+} // end namespace llvm
+
+#endif // AMDIL_INTRINSICS_H
+
diff --git a/contrib/llvm/lib/Target/R600/AMDILIntrinsics.td b/contrib/llvm/lib/Target/R600/AMDILIntrinsics.td
new file mode 100644
index 000000000000..6ec3559af24c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILIntrinsics.td
@@ -0,0 +1,232 @@
+//===- AMDILIntrinsics.td - Defines AMDIL Intrinscs -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file defines all of the amdil-specific intrinsics
+//
+//===---------------------------------------------------------------===//
+//===--------------------------------------------------------------------===//
+// Intrinsic classes
+// Generic versions of the above classes but for Target specific intrinsics
+// instead of SDNode patterns.
+//===--------------------------------------------------------------------===//
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+     class VoidIntLong :
+          Intrinsic<[llvm_i64_ty], [], []>;
+     class VoidIntInt :
+          Intrinsic<[llvm_i32_ty], [], []>;
+     class VoidIntBool :
+          Intrinsic<[llvm_i32_ty], [], []>;
+     class UnaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+     class UnaryIntFloat :
+          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+     class ConvertIntFTOI :
+          Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
+     class ConvertIntITOF :
+          Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>;
+     class UnaryIntNoRetInt :
+          Intrinsic<[], [llvm_anyint_ty], []>;
+     class UnaryIntNoRetFloat :
+          Intrinsic<[], [llvm_anyfloat_ty], []>;
+     class BinaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class BinaryIntFloat :
+          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class BinaryIntNoRetInt :
+          Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>;
+     class BinaryIntNoRetFloat :
+          Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>;
+     class TernaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class TernaryIntFloat :
+          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class QuaternaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+     class UnaryAtomicInt :
+          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class BinaryAtomicInt :
+          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class TernaryAtomicInt :
+          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+     class UnaryAtomicIntNoRet :
+          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class BinaryAtomicIntNoRet :
+          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class TernaryAtomicIntNoRet :
+          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+}
+
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+  def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
+
+  def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">,
+          TernaryIntInt;
+  def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">,
+          TernaryIntInt;
+  def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
+          UnaryIntInt;
+  def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
+          UnaryIntInt;
+  def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">,
+          UnaryIntInt;
+  def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">,
+          UnaryIntInt;
+  def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">,
+          UnaryIntInt;
+  def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">,
+                    TernaryIntInt;
+  def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">,
+                    TernaryIntInt;
+  def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">,
+                    QuaternaryIntInt;
+  def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">,
+      TernaryIntInt;
+  def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">,
+      BinaryIntInt;
+  def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">,
+          BinaryIntInt;
+  def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">,
+          BinaryIntInt;
+  def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">,
+          BinaryIntInt;
+  def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">,
+          BinaryIntInt;
+  def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">,
+          BinaryIntInt;
+  def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">,
+          BinaryIntInt;
+  def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">,
+          BinaryIntInt;
+  def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">,
+          BinaryIntInt;
+  def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">,
+          BinaryIntInt;
+  def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">,
+          BinaryIntInt;
+  def int_AMDIL_min     : GCCBuiltin<"__amdil_min">,
+          BinaryIntFloat;
+  def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">,
+          BinaryIntInt;
+  def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">,
+          BinaryIntInt;
+  def int_AMDIL_max     : GCCBuiltin<"__amdil_max">,
+          BinaryIntFloat;
+  def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">,
+          TernaryIntInt;
+  def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">,
+          TernaryIntInt;
+  def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">,
+          TernaryIntInt;
+  def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">,
+          UnaryIntFloat;
+  def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">,
+          TernaryIntFloat;
+  def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">,
+          UnaryIntFloat;
+  def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">,
+          UnaryIntFloat;
+  def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">,
+          UnaryIntFloat;
+  def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">,
+          UnaryIntFloat;
+  def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">,
+          UnaryIntFloat;
+  def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">,
+          UnaryIntFloat;
+  def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">,
+          UnaryIntFloat;
+  def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">,
+          UnaryIntFloat;
+  def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">,
+          UnaryIntFloat;
+  def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">,
+          UnaryIntFloat;
+  def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat;
+  def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat;
+  def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt;
+  def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">,
+          UnaryIntFloat;
+  def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">,
+          UnaryIntFloat;
+  def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">,
+          UnaryIntFloat;
+  def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">,
+          UnaryIntFloat;
+  def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">,
+          UnaryIntFloat;
+  def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">,
+          TernaryIntFloat;
+  def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">,
+          UnaryIntFloat;
+  def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_length : GCCBuiltin<"__amdil_length">,
+          UnaryIntFloat;
+  def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">,
+          TernaryIntFloat;
+  def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">,
+      Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty,
+           llvm_v4i32_ty, llvm_i32_ty], []>;
+
+  def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">,
+        Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>;
+ def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">,
+    Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>;
+  def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">,
+      Intrinsic<[llvm_double_ty], [llvm_double_ty], []>;
+  def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">,
+      ConvertIntITOF;
+  def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">,
+      ConvertIntFTOI;
+ def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">,
+      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>;
+  def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">,
+      ConvertIntITOF;
+  def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">,
+      ConvertIntITOF;
+  def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">,
+      ConvertIntITOF;
+  def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">,
+      ConvertIntITOF;
+  def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">,
+        Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
+          llvm_v2f32_ty, llvm_float_ty], []>;
+  def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">,
+        Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
+          llvm_v2f32_ty], []>;
+  def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">,
+        Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
+          llvm_v4f32_ty], []>;
+  def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">,
+        Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
+          llvm_v4f32_ty], []>;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDILNIDevice.cpp b/contrib/llvm/lib/Target/R600/AMDILNIDevice.cpp
new file mode 100644
index 000000000000..47c3f7f209d6
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILNIDevice.cpp
@@ -0,0 +1,65 @@
+//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+#include "AMDILNIDevice.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDILEvergreenDevice.h"
+
+using namespace llvm;
+
+AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST)
+  : AMDGPUEvergreenDevice(ST) {
+  std::string name = ST->getDeviceName();
+  if (name == "caicos") {
+    DeviceFlag = OCL_DEVICE_CAICOS;
+  } else if (name == "turks") {
+    DeviceFlag = OCL_DEVICE_TURKS;
+  } else if (name == "cayman") {
+    DeviceFlag = OCL_DEVICE_CAYMAN;
+  } else {
+    DeviceFlag = OCL_DEVICE_BARTS;
+  }
+}
+AMDGPUNIDevice::~AMDGPUNIDevice() {
+}
+
+size_t
+AMDGPUNIDevice::getMaxLDSSize() const {
+  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_900;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDGPUNIDevice::getGeneration() const {
+  return AMDGPUDeviceInfo::HD6XXX;
+}
+
+
+AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST)
+  : AMDGPUNIDevice(ST) {
+  setCaps();
+}
+
+AMDGPUCaymanDevice::~AMDGPUCaymanDevice() {
+}
+
+void
+AMDGPUCaymanDevice::setCaps() {
+  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
+    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
+    mHWBits.set(AMDGPUDeviceInfo::FMA);
+  }
+  mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
+  mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps);
+  mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
+}
+
diff --git a/contrib/llvm/lib/Target/R600/AMDILNIDevice.h b/contrib/llvm/lib/Target/R600/AMDILNIDevice.h
new file mode 100644
index 000000000000..24a640845eab
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILNIDevice.h
@@ -0,0 +1,57 @@
+//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+/// \file
+/// \brief Interface for the subtarget data classes.
+///
+/// This file will define the interface that each generation needs to
+/// implement in order to correctly answer queries on the capabilities of the
+/// specific hardware.
+//===---------------------------------------------------------------------===//
+#ifndef AMDILNIDEVICE_H
+#define AMDILNIDEVICE_H
+#include "AMDGPUSubtarget.h"
+#include "AMDILEvergreenDevice.h"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+//===---------------------------------------------------------------------===//
+// NI generation of devices and their respective sub classes
+//===---------------------------------------------------------------------===//
+
+/// \brief The AMDGPUNIDevice is the base class for all Northern Island series of
+/// cards.
+///
+/// It is very similiar to the AMDGPUEvergreenDevice, with the major
+/// exception being differences in wavefront size and hardware capabilities.  The
+/// NI devices are all 64 wide wavefronts and also add support for signed 24 bit
+/// integer operations
+class AMDGPUNIDevice : public AMDGPUEvergreenDevice {
+public:
+  AMDGPUNIDevice(AMDGPUSubtarget*);
+  virtual ~AMDGPUNIDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual uint32_t getGeneration() const;
+};
+
+/// Just as the AMDGPUCypressDevice is the double capable version of the
+/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double capable version
+/// of the AMDGPUNIDevice.  The other major difference is that the Cayman Device
+/// has 4 wide ALU's, whereas the rest of the NI family is a 5 wide.
+class AMDGPUCaymanDevice: public AMDGPUNIDevice {
+public:
+  AMDGPUCaymanDevice(AMDGPUSubtarget*);
+  virtual ~AMDGPUCaymanDevice();
+private:
+  virtual void setCaps();
+};
+
+static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800;
+} // namespace llvm
+#endif // AMDILNIDEVICE_H
diff --git a/contrib/llvm/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/contrib/llvm/lib/Target/R600/AMDILPeepholeOptimizer.cpp
new file mode 100644
index 000000000000..3a28038666f7
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILPeepholeOptimizer.cpp
@@ -0,0 +1,1215 @@
+//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "PeepholeOpt"
+#ifdef DEBUG
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME 0
+#endif
+
+#include "AMDILDevices.h"
+#include "AMDGPUInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+
+#include <sstream>
+
+#if 0
+STATISTIC(PointerAssignments, "Number of dynamic pointer "
+    "assigments discovered");
+STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
+#endif
+
+using namespace llvm;
+// The Peephole optimization pass is used to do simple last minute optimizations
+// that are required for correct code or to remove redundant functions
+namespace {
+
+class OpaqueType;
+
+class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
+public:
+  TargetMachine &TM;
+  static char ID;
+  AMDGPUPeepholeOpt(TargetMachine &tm);
+  ~AMDGPUPeepholeOpt();
+  const char *getPassName() const;
+  bool runOnFunction(Function &F);
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+protected:
+private:
+  // Function to initiate all of the instruction level optimizations.
+  bool instLevelOptimizations(BasicBlock::iterator *inst);
+  // Quick check to see if we need to dump all of the pointers into the
+  // arena. If this is correct, then we set all pointers to exist in arena. This
+  // is a workaround for aliasing of pointers in a struct/union.
+  bool dumpAllIntoArena(Function &F);
+  // Because I don't want to invalidate any pointers while in the
+  // safeNestedForEachFunction. I push atomic conversions to a vector and handle
+  // it later. This function does the conversions if required.
+  void doAtomicConversionIfNeeded(Function &F);
+  // Because __amdil_is_constant cannot be properly evaluated if
+  // optimizations are disabled, the call's are placed in a vector
+  // and evaluated after the __amdil_image* functions are evaluated
+  // which should allow the __amdil_is_constant function to be
+  // evaluated correctly.
+  void doIsConstCallConversionIfNeeded();
+  bool mChanged;
+  bool mDebug;
+  bool mConvertAtomics;
+  CodeGenOpt::Level optLevel;
+  // Run a series of tests to see if we can optimize a CALL instruction.
+  bool optimizeCallInst(BasicBlock::iterator *bbb);
+  // A peephole optimization to optimize bit extract sequences.
+  bool optimizeBitExtract(Instruction *inst);
+  // A peephole optimization to optimize bit insert sequences.
+  bool optimizeBitInsert(Instruction *inst);
+  bool setupBitInsert(Instruction *base, 
+                      Instruction *&src, 
+                      Constant *&mask, 
+                      Constant *&shift);
+  // Expand the bit field insert instruction on versions of OpenCL that
+  // don't support it.
+  bool expandBFI(CallInst *CI);
+  // Expand the bit field mask instruction on version of OpenCL that 
+  // don't support it.
+  bool expandBFM(CallInst *CI);
+  // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
+  // this case we need to expand them. These functions check for 24bit functions
+  // and then expand.
+  bool isSigned24BitOps(CallInst *CI);
+  void expandSigned24BitOps(CallInst *CI);
+  // One optimization that can occur is that if the required workgroup size is
+  // specified then the result of get_local_size is known at compile time and
+  // can be returned accordingly.
+  bool isRWGLocalOpt(CallInst *CI);
+  // On northern island cards, the division is slightly less accurate than on
+  // previous generations, so we need to utilize a more accurate division. So we
+  // can translate the accurate divide to a normal divide on all other cards.
+  bool convertAccurateDivide(CallInst *CI);
+  void expandAccurateDivide(CallInst *CI);
+  // If the alignment is set incorrectly, it can produce really inefficient
+  // code. This checks for this scenario and fixes it if possible.
+  bool correctMisalignedMemOp(Instruction *inst);
+
+  // If we are in no opt mode, then we need to make sure that
+  // local samplers are properly propagated as constant propagation 
+  // doesn't occur and we need to know the value of kernel defined
+  // samplers at compile time.
+  bool propagateSamplerInst(CallInst *CI);
+
+  // Helper functions
+
+  // Group of functions that recursively calculate the size of a structure based
+  // on it's sub-types.
+  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
+  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
+  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
+  size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
+  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
+  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
+  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
+  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
+
+  LLVMContext *mCTX;
+  Function *mF;
+  const AMDGPUSubtarget *mSTM;
+  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
+  SmallVector<CallInst *, 16> isConstVec;
+}; // class AMDGPUPeepholeOpt
+  char AMDGPUPeepholeOpt::ID = 0;
+
+// A template function that has two levels of looping before calling the
+// function with a pointer to the current iterator.
+template<class InputIterator, class SecondIterator, class Function>
+Function safeNestedForEach(InputIterator First, InputIterator Last,
+                              SecondIterator S, Function F) {
+  for ( ; First != Last; ++First) {
+    SecondIterator sf, sl;
+    for (sf = First->begin(), sl = First->end();
+         sf != sl; )  {
+      if (!F(&sf)) {
+        ++sf;
+      } 
+    }
+  }
+  return F;
+}
+
+} // anonymous namespace
+
+namespace llvm {
+  FunctionPass *
+  createAMDGPUPeepholeOpt(TargetMachine &tm) {
+    return new AMDGPUPeepholeOpt(tm);
+  }
+} // llvm namespace
+
+AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
+  : FunctionPass(ID), TM(tm)  {
+  mDebug = DEBUGME;
+  optLevel = TM.getOptLevel();
+
+}
+
+AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()  {
+}
+
+const char *
+AMDGPUPeepholeOpt::getPassName() const  {
+  return "AMDGPU PeepHole Optimization Pass";
+}
+
+bool 
+containsPointerType(Type *Ty)  {
+  if (!Ty) {
+    return false;
+  }
+  switch(Ty->getTypeID()) {
+  default:
+    return false;
+  case Type::StructTyID: {
+    const StructType *ST = dyn_cast<StructType>(Ty);
+    for (StructType::element_iterator stb = ST->element_begin(),
+           ste = ST->element_end(); stb != ste; ++stb) {
+      if (!containsPointerType(*stb)) {
+        continue;
+      }
+      return true;
+    }
+    break;
+  }
+  case Type::VectorTyID:
+  case Type::ArrayTyID:
+    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
+  case Type::PointerTyID:
+    return true;
+  };
+  return false;
+}
+
+bool 
+AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)  {
+  bool dumpAll = false;
+  for (Function::const_arg_iterator cab = F.arg_begin(),
+       cae = F.arg_end(); cab != cae; ++cab) {
+    const Argument *arg = cab;
+    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
+    if (!PT) {
+      continue;
+    }
+    Type *DereferencedType = PT->getElementType();
+    if (!dyn_cast<StructType>(DereferencedType) 
+        ) {
+      continue;
+    }
+    if (!containsPointerType(DereferencedType)) {
+      continue;
+    }
+    // FIXME: Because a pointer inside of a struct/union may be aliased to
+    // another pointer we need to take the conservative approach and place all
+    // pointers into the arena until more advanced detection is implemented.
+    dumpAll = true;
+  }
+  return dumpAll;
+}
+void
+AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
+  if (isConstVec.empty()) {
+    return;
+  }
+  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
+    CallInst *CI = isConstVec[x];
+    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
+    Type *aType = Type::getInt32Ty(*mCTX);
+    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
+      : ConstantInt::get(aType, 0);
+    CI->replaceAllUsesWith(Val);
+    CI->eraseFromParent();
+  }
+  isConstVec.clear();
+}
+void 
+AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)  {
+  // Don't do anything if we don't have any atomic operations.
+  if (atomicFuncs.empty()) {
+    return;
+  }
+  // Change the function name for the atomic if it is required
+  uint32_t size = atomicFuncs.size();
+  for (uint32_t x = 0; x < size; ++x) {
+    atomicFuncs[x].first->setOperand(
+        atomicFuncs[x].first->getNumOperands()-1, 
+        atomicFuncs[x].second);
+
+  }
+  mChanged = true;
+  if (mConvertAtomics) {
+    return;
+  }
+}
+
+bool 
+AMDGPUPeepholeOpt::runOnFunction(Function &MF)  {
+  mChanged = false;
+  mF = &MF;
+  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
+  if (mDebug) {
+    MF.dump();
+  }
+  mCTX = &MF.getType()->getContext();
+  mConvertAtomics = true;
+  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
+     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
+                  this));
+
+  doAtomicConversionIfNeeded(MF);
+  doIsConstCallConversionIfNeeded();
+
+  if (mDebug) {
+    MF.dump();
+  }
+  return mChanged;
+}
+
+bool 
+AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)  {
+  Instruction *inst = (*bbb);
+  CallInst *CI = dyn_cast<CallInst>(inst);
+  if (!CI) {
+    return false;
+  }
+  if (isSigned24BitOps(CI)) {
+    expandSigned24BitOps(CI);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  if (propagateSamplerInst(CI)) {
+    return false;
+  }
+  if (expandBFI(CI) || expandBFM(CI)) {
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  if (convertAccurateDivide(CI)) {
+    expandAccurateDivide(CI);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+
+  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
+  if (calleeName.startswith("__amdil_is_constant")) {
+    // If we do not have optimizations, then this
+    // cannot be properly evaluated, so we add the
+    // call instruction to a vector and process
+    // them at the end of processing after the
+    // samplers have been correctly handled.
+    if (optLevel == CodeGenOpt::None) {
+      isConstVec.push_back(CI);
+      return false;
+    } else {
+      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
+      Type *aType = Type::getInt32Ty(*mCTX);
+      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
+        : ConstantInt::get(aType, 0);
+      CI->replaceAllUsesWith(Val);
+      ++(*bbb);
+      CI->eraseFromParent();
+      return true;
+    }
+  }
+
+  if (calleeName.equals("__amdil_is_asic_id_i32")) {
+    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
+    Type *aType = Type::getInt32Ty(*mCTX);
+    Value *Val = CV;
+    if (Val) {
+      Val = ConstantInt::get(aType, 
+          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
+    } else {
+      Val = ConstantInt::get(aType, 0);
+    }
+    CI->replaceAllUsesWith(Val);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
+  if (!F) {
+    return false;
+  } 
+  if (F->getName().startswith("__atom") && !CI->getNumUses() 
+      && F->getName().find("_xchg") == StringRef::npos) {
+    std::string buffer(F->getName().str() + "_noret");
+    F = dyn_cast<Function>(
+          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
+    atomicFuncs.push_back(std::make_pair(CI, F));
+  }
+  
+  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
+      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
+    return false;
+  }
+  if (!mConvertAtomics) {
+    return false;
+  }
+  StringRef name = F->getName();
+  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
+    mConvertAtomics = false;
+  }
+  return false;
+}
+
+bool
+AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, 
+    Instruction *&src, 
+    Constant *&mask, 
+    Constant *&shift) {
+  if (!base) {
+    if (mDebug) {
+      dbgs() << "Null pointer passed into function.\n";
+    }
+    return false;
+  }
+  bool andOp = false;
+  if (base->getOpcode() == Instruction::Shl) {
+    shift = dyn_cast<Constant>(base->getOperand(1));
+  } else if (base->getOpcode() == Instruction::And) {
+    mask = dyn_cast<Constant>(base->getOperand(1));
+    andOp = true;
+  } else {
+    if (mDebug) {
+      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
+    }
+    // If the base is neither a Shl or a And, we don't fit any of the patterns above.
+    return false;
+  }
+  src = dyn_cast<Instruction>(base->getOperand(0));
+  if (!src) {
+    if (mDebug) {
+      dbgs() << "Failed setup since the base operand is not an instruction!\n";
+    }
+    return false;
+  }
+  // If we find an 'and' operation, then we don't need to
+  // find the next operation as we already know the
+  // bits that are valid at this point.
+  if (andOp) {
+    return true;
+  }
+  if (src->getOpcode() == Instruction::Shl && !shift) {
+    shift = dyn_cast<Constant>(src->getOperand(1));
+    src = dyn_cast<Instruction>(src->getOperand(0));
+  } else if (src->getOpcode() == Instruction::And && !mask) {
+    mask = dyn_cast<Constant>(src->getOperand(1));
+  }
+  if (!mask && !shift) {
+    if (mDebug) {
+      dbgs() << "Failed setup since both mask and shift are NULL!\n";
+    }
+    // Did not find a constant mask or a shift.
+    return false;
+  }
+  return true;
+}
+bool
+AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)  {
+  if (!inst) {
+    return false;
+  }
+  if (!inst->isBinaryOp()) {
+    return false;
+  }
+  if (inst->getOpcode() != Instruction::Or) {
+    return false;
+  }
+  if (optLevel == CodeGenOpt::None) {
+    return false;
+  }
+  // We want to do an optimization on a sequence of ops that in the end equals a
+  // single ISA instruction.
+  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
+  // Some simplified versions of this pattern are as follows:
+  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
+  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
+  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
+  // (A & B) | (D << F) when (1 << F) >= B
+  // (A << C) | (D & E) when (1 << C) >= E
+  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
+    // The HD4XXX hardware doesn't support the ubit_insert instruction.
+    return false;
+  }
+  Type *aType = inst->getType();
+  bool isVector = aType->isVectorTy();
+  int numEle = 1;
+  // This optimization only works on 32bit integers.
+  if (aType->getScalarType()
+      != Type::getInt32Ty(inst->getContext())) {
+    return false;
+  }
+  if (isVector) {
+    const VectorType *VT = dyn_cast<VectorType>(aType);
+    numEle = VT->getNumElements();
+    // We currently cannot support more than 4 elements in a intrinsic and we
+    // cannot support Vec3 types.
+    if (numEle > 4 || numEle == 3) {
+      return false;
+    }
+  }
+  // TODO: Handle vectors.
+  if (isVector) {
+    if (mDebug) {
+      dbgs() << "!!! Vectors are not supported yet!\n";
+    }
+    return false;
+  }
+  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
+  Constant *LHSMask = NULL, *RHSMask = NULL;
+  Constant *LHSShift = NULL, *RHSShift = NULL;
+  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
+  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
+  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
+    if (mDebug) {
+      dbgs() << "Found an OR Operation that failed setup!\n";
+      inst->dump();
+      if (LHS) { LHS->dump(); }
+      if (LHSSrc) { LHSSrc->dump(); }
+      if (LHSMask) { LHSMask->dump(); }
+      if (LHSShift) { LHSShift->dump(); }
+    }
+    // There was an issue with the setup for BitInsert.
+    return false;
+  }
+  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
+    if (mDebug) {
+      dbgs() << "Found an OR Operation that failed setup!\n";
+      inst->dump();
+      if (RHS) { RHS->dump(); }
+      if (RHSSrc) { RHSSrc->dump(); }
+      if (RHSMask) { RHSMask->dump(); }
+      if (RHSShift) { RHSShift->dump(); }
+    }
+    // There was an issue with the setup for BitInsert.
+    return false;
+  }
+  if (mDebug) {
+    dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
+    dbgs() << "Op:        "; inst->dump();
+    dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
+  }
+  Constant *offset = NULL;
+  Constant *width = NULL;
+  uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
+  uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
+  uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
+  uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
+  lhsMaskVal = (LHSMask 
+      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
+  rhsMaskVal = (RHSMask 
+      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
+  lhsShiftVal = (LHSShift 
+      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
+  rhsShiftVal = (RHSShift 
+      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
+  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
+  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
+  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
+  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
+  // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
+  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
+    return false;
+  }
+  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
+    offset = ConstantInt::get(aType, lhsMaskOffset, false);
+    width = ConstantInt::get(aType, lhsMaskWidth, false);
+    RHSSrc = RHS;
+    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
+      return false;
+    }
+    if (!LHSShift) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", LHS);
+    } else if (lhsShiftVal != lhsMaskOffset) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", LHS);
+    }
+    if (mDebug) {
+      dbgs() << "Optimizing LHS!\n";
+    }
+  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
+    offset = ConstantInt::get(aType, rhsMaskOffset, false);
+    width = ConstantInt::get(aType, rhsMaskWidth, false);
+    LHSSrc = RHSSrc;
+    RHSSrc = LHS;
+    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
+      return false;
+    }
+    if (!RHSShift) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", RHS);
+    } else if (rhsShiftVal != rhsMaskOffset) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", RHS);
+    }
+    if (mDebug) {
+      dbgs() << "Optimizing RHS!\n";
+    }
+  } else {
+    if (mDebug) {
+      dbgs() << "Failed constraint 3!\n";
+    }
+    return false;
+  }
+  if (mDebug) {
+    dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
+    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
+    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
+    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
+  }
+  if (!offset || !width) {
+    if (mDebug) {
+      dbgs() << "Either width or offset are NULL, failed detection!\n";
+    }
+    return false;
+  }
+  // Lets create the function signature.
+  std::vector<Type *> callTypes;
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
+  std::string name = "__amdil_ubit_insert";
+  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
+  Function *Func = 
+    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
+        getOrInsertFunction(StringRef(name), funcType));
+  Value *Operands[4] = {
+    width,
+    offset,
+    LHSSrc,
+    RHSSrc
+  };
+  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
+  if (mDebug) {
+    dbgs() << "Old Inst: ";
+    inst->dump();
+    dbgs() << "New Inst: ";
+    CI->dump();
+    dbgs() << "\n\n";
+  }
+  CI->insertBefore(inst);
+  inst->replaceAllUsesWith(CI);
+  return true;
+}
+
+bool 
+AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)  {
+  if (!inst) {
+    return false;
+  }
+  if (!inst->isBinaryOp()) {
+    return false;
+  }
+  if (inst->getOpcode() != Instruction::And) {
+    return false;
+  }
+  if (optLevel == CodeGenOpt::None) {
+    return false;
+  }
+  // We want to do some simple optimizations on Shift right/And patterns. The
+  // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
+  // value smaller than 32 and C is a mask. If C is a constant value, then the
+  // following transformation can occur. For signed integers, it turns into the
+  // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
+  // integers, it turns into the function call dst =
+  // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
+  // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
+  // Evergreen hardware.
+  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
+    // This does not work on HD4XXX hardware.
+    return false;
+  }
+  Type *aType = inst->getType();
+  bool isVector = aType->isVectorTy();
+
+  // XXX Support vector types
+  if (isVector) {
+    return false;
+  }
+  int numEle = 1;
+  // This only works on 32bit integers
+  if (aType->getScalarType()
+      != Type::getInt32Ty(inst->getContext())) {
+    return false;
+  }
+  if (isVector) {
+    const VectorType *VT = dyn_cast<VectorType>(aType);
+    numEle = VT->getNumElements();
+    // We currently cannot support more than 4 elements in a intrinsic and we
+    // cannot support Vec3 types.
+    if (numEle > 4 || numEle == 3) {
+      return false;
+    }
+  }
+  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
+  // If the first operand is not a shift instruction, then we can return as it
+  // doesn't match this pattern.
+  if (!ShiftInst || !ShiftInst->isShift()) {
+    return false;
+  }
+  // If we are a shift left, then we need don't match this pattern.
+  if (ShiftInst->getOpcode() == Instruction::Shl) {
+    return false;
+  }
+  bool isSigned = ShiftInst->isArithmeticShift();
+  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
+  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
+  // Lets make sure that the shift value and the and mask are constant integers.
+  if (!AndMask || !ShrVal) {
+    return false;
+  }
+  Constant *newMaskConst;
+  Constant *shiftValConst;
+  if (isVector) {
+    // Handle the vector case
+    std::vector<Constant *> maskVals;
+    std::vector<Constant *> shiftVals;
+    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
+    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
+    Type *scalarType = AndMaskVec->getType()->getScalarType();
+    assert(AndMaskVec->getNumOperands() ==
+           ShrValVec->getNumOperands() && "cannot have a "
+           "combination where the number of elements to a "
+           "shift and an and are different!");
+    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
+      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
+      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
+      if (!AndCI || !ShiftIC) {
+        return false;
+      }
+      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
+      if (!isMask_32(maskVal)) {
+        return false;
+      }
+      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
+      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
+      // If the mask or shiftval is greater than the bitcount, then break out.
+      if (maskVal >= 32 || shiftVal >= 32) {
+        return false;
+      }
+      // If the mask val is greater than the the number of original bits left
+      // then this optimization is invalid.
+      if (maskVal > (32 - shiftVal)) {
+        return false;
+      }
+      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
+      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
+    }
+    newMaskConst = ConstantVector::get(maskVals);
+    shiftValConst = ConstantVector::get(shiftVals);
+  } else {
+    // Handle the scalar case
+    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
+    // This must be a mask value where all lower bits are set to 1 and then any
+    // bit higher is set to 0.
+    if (!isMask_32(maskVal)) {
+      return false;
+    }
+    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
+    // Count the number of bits set in the mask, this is the width of the
+    // resulting bit set that is extracted from the source value.
+    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
+    // If the mask or shift val is greater than the bitcount, then break out.
+    if (maskVal >= 32 || shiftVal >= 32) {
+      return false;
+    }
+    // If the mask val is greater than the the number of original bits left then
+    // this optimization is invalid.
+    if (maskVal > (32 - shiftVal)) {
+      return false;
+    }
+    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
+    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
+  }
+  // Lets create the function signature.
+  std::vector<Type *> callTypes;
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
+  std::string name = "llvm.AMDGPU.bit.extract.u32";
+  if (isVector) {
+    name += ".v" + itostr(numEle) + "i32";
+  } else {
+    name += ".";
+  }
+  // Lets create the function.
+  Function *Func = 
+    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(StringRef(name), funcType));
+  Value *Operands[3] = {
+    ShiftInst->getOperand(0),
+    shiftValConst,
+    newMaskConst
+  };
+  // Lets create the Call with the operands
+  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
+  CI->setDoesNotAccessMemory();
+  CI->insertBefore(inst);
+  inst->replaceAllUsesWith(CI);
+  return true;
+}
+
+bool
+AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
+  if (!CI) {
+    return false;
+  }
+  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
+  if (!LHS->getName().startswith("__amdil_bfi")) {
+    return false;
+  }
+  Type* type = CI->getOperand(0)->getType();
+  Constant *negOneConst = NULL;
+  if (type->isVectorTy()) {
+    std::vector<Constant *> negOneVals;
+    negOneConst = ConstantInt::get(CI->getContext(), 
+        APInt(32, StringRef("-1"), 10));
+    for (size_t x = 0,
+        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
+      negOneVals.push_back(negOneConst);
+    }
+    negOneConst = ConstantVector::get(negOneVals);
+  } else {
+    negOneConst = ConstantInt::get(CI->getContext(), 
+        APInt(32, StringRef("-1"), 10));
+  }
+  // __amdil_bfi => (A & B) | (~A & C)
+  BinaryOperator *lhs = 
+    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
+        CI->getOperand(1), "bfi_and", CI);
+  BinaryOperator *rhs =
+    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
+        "bfi_not", CI);
+  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
+      "bfi_and", CI);
+  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
+  CI->replaceAllUsesWith(lhs);
+  return true;
+}
+
+bool
+AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
+  if (!CI) {
+    return false;
+  }
+  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
+  if (!LHS->getName().startswith("__amdil_bfm")) {
+    return false;
+  }
+  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
+  Constant *newMaskConst = NULL;
+  Constant *newShiftConst = NULL;
+  Type* type = CI->getOperand(0)->getType();
+  if (type->isVectorTy()) {
+    std::vector<Constant*> newMaskVals, newShiftVals;
+    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
+    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
+    for (size_t x = 0,
+        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
+      newMaskVals.push_back(newMaskConst);
+      newShiftVals.push_back(newShiftConst);
+    }
+    newMaskConst = ConstantVector::get(newMaskVals);
+    newShiftConst = ConstantVector::get(newShiftVals);
+  } else {
+    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
+    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
+  }
+  BinaryOperator *lhs =
+    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
+        newMaskConst, "bfm_mask", CI);
+  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
+      lhs, "bfm_shl", CI);
+  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
+      newShiftConst, "bfm_sub", CI);
+  BinaryOperator *rhs =
+    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
+        newMaskConst, "bfm_mask", CI);
+  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
+  CI->replaceAllUsesWith(lhs);
+  return true;
+}
+
+bool
+AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)  {
+  Instruction *inst = (*bbb);
+  if (optimizeCallInst(bbb)) {
+    return true;
+  }
+  if (optimizeBitExtract(inst)) {
+    return false;
+  }
+  if (optimizeBitInsert(inst)) {
+    return false;
+  }
+  if (correctMisalignedMemOp(inst)) {
+    return false;
+  }
+  return false;
+}
+bool
+AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
+  LoadInst *linst = dyn_cast<LoadInst>(inst);
+  StoreInst *sinst = dyn_cast<StoreInst>(inst);
+  unsigned alignment;
+  Type* Ty = inst->getType();
+  if (linst) {
+    alignment = linst->getAlignment();
+    Ty = inst->getType();
+  } else if (sinst) {
+    alignment = sinst->getAlignment();
+    Ty = sinst->getValueOperand()->getType();
+  } else {
+    return false;
+  }
+  unsigned size = getTypeSize(Ty);
+  if (size == alignment || size < alignment) {
+    return false;
+  }
+  if (!Ty->isStructTy()) {
+    return false;
+  }
+  if (alignment < 4) {
+    if (linst) {
+      linst->setAlignment(0);
+      return true;
+    } else if (sinst) {
+      sinst->setAlignment(0);
+      return true;
+    }
+  }
+  return false;
+}
+bool 
+AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)  {
+  if (!CI) {
+    return false;
+  }
+  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
+  std::string namePrefix = LHS->getName().substr(0, 14);
+  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
+      && namePrefix != "__amdil__imul24_high") {
+    return false;
+  }
+  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
+    return false;
+  }
+  return true;
+}
+
+void 
+AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)  {
+  assert(isSigned24BitOps(CI) && "Must be a "
+      "signed 24 bit operation to call this function!");
+  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
+  // On 7XX and 8XX we do not have signed 24bit, so we need to
+  // expand it to the following:
+  // imul24 turns into 32bit imul
+  // imad24 turns into 32bit imad
+  // imul24_high turns into 32bit imulhigh
+  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
+    Type *aType = CI->getOperand(0)->getType();
+    bool isVector = aType->isVectorTy();
+    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
+    std::vector<Type*> callTypes;
+    callTypes.push_back(CI->getOperand(0)->getType());
+    callTypes.push_back(CI->getOperand(1)->getType());
+    callTypes.push_back(CI->getOperand(2)->getType());
+    FunctionType *funcType =
+      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
+    std::string name = "__amdil_imad";
+    if (isVector) {
+      name += "_v" + itostr(numEle) + "i32";
+    } else {
+      name += "_i32";
+    }
+    Function *Func = dyn_cast<Function>(
+                       CI->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(StringRef(name), funcType));
+    Value *Operands[3] = {
+      CI->getOperand(0),
+      CI->getOperand(1),
+      CI->getOperand(2)
+    };
+    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
+    nCI->insertBefore(CI);
+    CI->replaceAllUsesWith(nCI);
+  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
+    BinaryOperator *mulOp =
+      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
+          CI->getOperand(1), "imul24", CI);
+    CI->replaceAllUsesWith(mulOp);
+  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
+    Type *aType = CI->getOperand(0)->getType();
+
+    bool isVector = aType->isVectorTy();
+    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
+    std::vector<Type*> callTypes;
+    callTypes.push_back(CI->getOperand(0)->getType());
+    callTypes.push_back(CI->getOperand(1)->getType());
+    FunctionType *funcType =
+      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
+    std::string name = "__amdil_imul_high";
+    if (isVector) {
+      name += "_v" + itostr(numEle) + "i32";
+    } else {
+      name += "_i32";
+    }
+    Function *Func = dyn_cast<Function>(
+                       CI->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(StringRef(name), funcType));
+    Value *Operands[2] = {
+      CI->getOperand(0),
+      CI->getOperand(1)
+    };
+    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
+    nCI->insertBefore(CI);
+    CI->replaceAllUsesWith(nCI);
+  }
+}
+
+bool 
+AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)  {
+  return (CI != NULL
+          && CI->getOperand(CI->getNumOperands() - 1)->getName() 
+          == "__amdil_get_local_size_int");
+}
+
+bool 
+AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)  {
+  if (!CI) {
+    return false;
+  }
+  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
+      && (mSTM->getDeviceName() == "cayman")) {
+    return false;
+  }
+  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) 
+      == "__amdil_improved_div";
+}
+
+void 
+AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)  {
+  assert(convertAccurateDivide(CI)
+         && "expanding accurate divide can only happen if it is expandable!");
+  BinaryOperator *divOp =
+    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
+                           CI->getOperand(1), "fdiv32", CI);
+  CI->replaceAllUsesWith(divOp);
+}
+
+bool
+AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
+  if (optLevel != CodeGenOpt::None) {
+    return false;
+  }
+
+  if (!CI) {
+    return false;
+  }
+
+  unsigned funcNameIdx = 0;
+  funcNameIdx = CI->getNumOperands() - 1;
+  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
+  if (calleeName != "__amdil_image2d_read_norm"
+   && calleeName != "__amdil_image2d_read_unnorm"
+   && calleeName != "__amdil_image3d_read_norm"
+   && calleeName != "__amdil_image3d_read_unnorm") {
+    return false;
+  }
+
+  unsigned samplerIdx = 2;
+  samplerIdx = 1;
+  Value *sampler = CI->getOperand(samplerIdx);
+  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
+  if (!lInst) {
+    return false;
+  }
+
+  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return false;
+  }
+
+  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
+  // If we are loading from what is not a global value, then we
+  // fail and return.
+  if (!gv) {
+    return false;
+  }
+
+  // If we don't have an initializer or we have an initializer and
+  // the initializer is not a 32bit integer, we fail.
+  if (!gv->hasInitializer() 
+      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
+      return false;
+  }
+
+  // Now that we have the global variable initializer, lets replace
+  // all uses of the load instruction with the samplerVal and
+  // reparse the __amdil_is_constant() function.
+  Constant *samplerVal = gv->getInitializer();
+  lInst->replaceAllUsesWith(samplerVal);
+  return true;
+}
+
+bool 
+AMDGPUPeepholeOpt::doInitialization(Module &M)  {
+  return false;
+}
+
+bool 
+AMDGPUPeepholeOpt::doFinalization(Module &M)  {
+  return false;
+}
+
+void 
+AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const  {
+  AU.addRequired<MachineFunctionAnalysis>();
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
+  size_t size = 0;
+  if (!T) {
+    return size;
+  }
+  switch (T->getTypeID()) {
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+    assert(0 && "These types are not supported by this backend");
+  default:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    size = T->getPrimitiveSizeInBits() >> 3;
+    break;
+  case Type::PointerTyID:
+    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
+    break;
+  case Type::IntegerTyID:
+    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
+    break;
+  case Type::StructTyID:
+    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
+    break;
+  case Type::ArrayTyID:
+    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
+    break;
+  case Type::FunctionTyID:
+    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
+    break;
+  case Type::VectorTyID:
+    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
+    break;
+  };
+  return size;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
+    bool dereferencePtr) {
+  size_t size = 0;
+  if (!ST) {
+    return size;
+  }
+  Type *curType;
+  StructType::element_iterator eib;
+  StructType::element_iterator eie;
+  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
+    curType = *eib;
+    size += getTypeSize(curType, dereferencePtr);
+  }
+  return size;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
+    bool dereferencePtr) {
+  return IT ? (IT->getBitWidth() >> 3) : 0;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
+    bool dereferencePtr) {
+    assert(0 && "Should not be able to calculate the size of an function type");
+    return 0;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
+    bool dereferencePtr) {
+  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
+                                    dereferencePtr) * AT->getNumElements())
+                     : 0);
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
+    bool dereferencePtr) {
+  return VT ? (VT->getBitWidth() >> 3) : 0;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
+    bool dereferencePtr) {
+  if (!PT) {
+    return 0;
+  }
+  Type *CT = PT->getElementType();
+  if (CT->getTypeID() == Type::StructTyID &&
+      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    return getTypeSize(dyn_cast<StructType>(CT));
+  } else if (dereferencePtr) {
+    size_t size = 0;
+    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
+      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
+    }
+    return size;
+  } else {
+    return 4;
+  }
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
+    bool dereferencePtr) {
+  //assert(0 && "Should not be able to calculate the size of an opaque type");
+  return 4;
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDILRegisterInfo.td b/contrib/llvm/lib/Target/R600/AMDILRegisterInfo.td
new file mode 100644
index 000000000000..b9d033432e8c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILRegisterInfo.td
@@ -0,0 +1,107 @@
+//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//  Declarations that describe the AMDIL register file
+//
+//===----------------------------------------------------------------------===//
+
+class AMDILReg<bits<16> num, string n> : Register<n> {
+  field bits<16> Value;
+  let Value = num;
+  let Namespace = "AMDGPU";
+}
+
+// We will start with 8 registers for each class before expanding to more
+// Since the swizzle is added based on the register class, we can leave it
+// off here and just specify different registers for different register classes
+def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>;
+def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>;
+def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>;
+def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>;
+def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>;
+def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>;
+def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>;
+def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>;
+def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>;
+def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>;
+def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>;
+def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>;
+def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>;
+def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>;
+def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>;
+def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>;
+def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>;
+def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>;
+
+// All registers between 1000 and 1024 are reserved and cannot be used
+// unless commented in this section
+// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's
+// r1020 is used to hold the frame index for local arrays
+// r1019 is used to hold the dynamic stack allocation pointer
+// r1018 is used as a temporary register for handwritten code
+// r1017 is used as a temporary register for handwritten code
+// r1016 is used as a temporary register for load/store code
+// r1015 is used as a temporary register for data segment offset
+// r1014 is used as a temporary register for store code
+// r1013 is used as the section data pointer register
+// r1012-r1010 and r1001-r1008 are used for temporary I/O registers
+// r1009 is used as the frame pointer register
+// r999 is used as the mem register.
+// r998 is used as the return address register.
+//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>;
+//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>;
+//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>;
+//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>;
+//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>;
+//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>;
+def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>;
+def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>;
+def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>;
+def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>;
+def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>;
+def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>;
+def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>;
+def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>;
+def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>;
+def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>;
+def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>;
+def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>;
+def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>;
+def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>;
+def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>;
+def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>;
+def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>;
+def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>;
+def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>;
+def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>;
+def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>;
+def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>;
+def GPRI16 : RegisterClass<"AMDGPU", [i16], 16,
+  (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
+        let AltOrders = [(add (sequence "R%u", 1, 20))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRI32 : RegisterClass<"AMDGPU", [i32], 32,
+  (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
+        let AltOrders = [(add (sequence "R%u", 1, 20))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRF32 : RegisterClass<"AMDGPU", [f32], 32,
+  (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> {
+        let AltOrders = [(add (sequence "R%u", 1, 20))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
diff --git a/contrib/llvm/lib/Target/R600/AMDILSIDevice.cpp b/contrib/llvm/lib/Target/R600/AMDILSIDevice.cpp
new file mode 100644
index 000000000000..0d1de3d11eb4
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILSIDevice.cpp
@@ -0,0 +1,48 @@
+//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+#include "AMDILSIDevice.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+
+using namespace llvm;
+
+AMDGPUSIDevice::AMDGPUSIDevice(AMDGPUSubtarget *ST)
+  : AMDGPUEvergreenDevice(ST) {
+}
+AMDGPUSIDevice::~AMDGPUSIDevice() {
+}
+
+size_t
+AMDGPUSIDevice::getMaxLDSSize() const {
+  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_900;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDGPUSIDevice::getGeneration() const {
+  return AMDGPUDeviceInfo::HD7XXX;
+}
+
+std::string
+AMDGPUSIDevice::getDataLayout() const {
+  return std::string(
+    "e"
+    "-p:64:64:64"
+    "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64"
+    "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128"
+    "-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+    "-v2048:2048:2048"
+    "-n32:64"
+  );
+}
diff --git a/contrib/llvm/lib/Target/R600/AMDILSIDevice.h b/contrib/llvm/lib/Target/R600/AMDILSIDevice.h
new file mode 100644
index 000000000000..5b2cb2502211
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/AMDILSIDevice.h
@@ -0,0 +1,39 @@
+//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the subtarget data classes.
+///
+/// This file will define the interface that each generation needs to
+/// implement in order to correctly answer queries on the capabilities of the
+/// specific hardware.
+//===---------------------------------------------------------------------===//
+#ifndef AMDILSIDEVICE_H
+#define AMDILSIDEVICE_H
+#include "AMDILEvergreenDevice.h"
+
+namespace llvm {
+class AMDGPUSubtarget;
+//===---------------------------------------------------------------------===//
+// SI generation of devices and their respective sub classes
+//===---------------------------------------------------------------------===//
+
+/// \brief The AMDGPUSIDevice is the base class for all Southern Island series
+/// of cards.
+class AMDGPUSIDevice : public AMDGPUEvergreenDevice {
+public:
+  AMDGPUSIDevice(AMDGPUSubtarget*);
+  virtual ~AMDGPUSIDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual uint32_t getGeneration() const;
+  virtual std::string getDataLayout() const;
+};
+
+} // namespace llvm
+#endif // AMDILSIDEVICE_H
diff --git a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
new file mode 100644
index 000000000000..10547a598805
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -0,0 +1,172 @@
+//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCExpr.h"
+
+using namespace llvm;
+
+void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                             StringRef Annot) {
+  printInstruction(MI, OS);
+
+  printAnnotation(OS, Annot);
+}
+
+void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    switch (Op.getReg()) {
+    // This is the default predicate state, so we don't need to print it.
+    case AMDGPU::PRED_SEL_OFF: break;
+    default: O << getRegisterName(Op.getReg()); break;
+    }
+  } else if (Op.isImm()) {
+    O << Op.getImm();
+  } else if (Op.isFPImm()) {
+    O << Op.getFPImm();
+  } else if (Op.isExpr()) {
+    const MCExpr *Exp = Op.getExpr();
+    Exp->print(O);
+  } else {
+    assert(!"unknown operand type in printOperand");
+  }
+}
+
+void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  unsigned Imm = MI->getOperand(OpNum).getImm();
+
+  if (Imm == 2) {
+    O << "P0";
+  } else if (Imm == 1) {
+    O << "P20";
+  } else if (Imm == 0) {
+    O << "P10";
+  } else {
+    assert(!"Invalid interpolation parameter slot");
+  }
+}
+
+void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  printOperand(MI, OpNo, O);
+  O  << ", ";
+  printOperand(MI, OpNo + 1, O);
+}
+
+void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O, StringRef Asm) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm());
+  if (Op.getImm() == 1) {
+    O << Asm;
+  }
+}
+
+void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "|");
+}
+
+void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo,
+                                   raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "_SAT");
+}
+
+void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  union Literal {
+    float f;
+    int32_t i;
+  } L;
+
+  L.i = MI->getOperand(OpNo).getImm();
+  O << L.i << "(" << L.f << ")";
+}
+
+void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  printIfSet(MI, OpNo, O, " *");
+}
+
+void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "-");
+}
+
+void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  switch (MI->getOperand(OpNo).getImm()) {
+  default: break;
+  case 1:
+    O << " * 2.0";
+    break;
+  case 2:
+    O << " * 4.0";
+    break;
+  case 3:
+    O << " / 2.0";
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo,
+                                 raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "+");
+}
+
+void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "ExecMask,");
+}
+
+void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  printIfSet(MI, OpNo, O, "Pred,");
+}
+
+void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.getImm() == 0) {
+    O << " (MASKED)";
+  }
+}
+
+void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  const char * chans = "XYZW";
+  int sel = MI->getOperand(OpNo).getImm();
+
+  int chan = sel & 3;
+  sel >>= 2;
+
+  if (sel >= 512) {
+    sel -= 512;
+    int cb = sel >> 12;
+    sel &= 4095;
+    O << cb << "[" << sel << "]";
+  } else if (sel >= 448) {
+    sel -= 448;
+    O << sel;
+  } else if (sel >= 0){
+    O << sel;
+  }
+
+  if (sel >= 0)
+    O << "." << chans[chan];
+}
+
+#include "AMDGPUGenAsmWriter.inc"
diff --git a/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
new file mode 100644
index 000000000000..767a7082cc2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -0,0 +1,54 @@
+//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUINSTPRINTER_H
+#define AMDGPUINSTPRINTER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class AMDGPUInstPrinter : public MCInstPrinter {
+public:
+  AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                     const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  //Autogenerated by tblgen
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+
+private:
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm);
+  void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+
+} // End namespace llvm
+
+#endif // AMDGPUINSTRPRINTER_H
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
new file mode 100644
index 000000000000..98fca432670d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -0,0 +1,90 @@
+//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUMCObjectWriter : public MCObjectWriter {
+public:
+  AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { }
+  virtual void ExecutePostLayoutBinding(MCAssembler &Asm,
+                                        const MCAsmLayout &Layout) {
+    //XXX: Implement if necessary.
+  }
+  virtual void RecordRelocation(const MCAssembler &Asm,
+                                const MCAsmLayout &Layout,
+                                const MCFragment *Fragment,
+                                const MCFixup &Fixup,
+                                MCValue Target, uint64_t &FixedValue) {
+    assert(!"Not implemented");
+  }
+
+  virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout);
+
+};
+
+class AMDGPUAsmBackend : public MCAsmBackend {
+public:
+  AMDGPUAsmBackend(const Target &T)
+    : MCAsmBackend() {}
+
+  virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const;
+  virtual unsigned getNumFixupKinds() const { return 0; };
+  virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                          uint64_t Value) const;
+  virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                                    const MCRelaxableFragment *DF,
+                                    const MCAsmLayout &Layout) const {
+    return false;
+  }
+  virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+    assert(!"Not implemented");
+  }
+  virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; }
+  virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+    return true;
+  }
+};
+
+} //End anonymous namespace
+
+void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm,
+                                       const MCAsmLayout &Layout) {
+  for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) {
+    Asm.writeSectionData(I, Layout);
+  }
+}
+
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
+                                           StringRef CPU) {
+  return new AMDGPUAsmBackend(T);
+}
+
+AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter(
+                                                        raw_ostream &OS) const {
+  return new AMDGPUMCObjectWriter(OS);
+}
+
+void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                  unsigned DataSize, uint64_t Value) const {
+
+  uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset());
+  assert(Fixup.getKind() == FK_PCRel_4);
+  *Dst = (Value - 4) / 4;
+}
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
new file mode 100644
index 000000000000..b7cdd7c8cde9
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -0,0 +1,83 @@
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCAsmInfo.h"
+
+using namespace llvm;
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
+  HasSingleParameterDotFile = false;
+  WeakDefDirective = 0;
+  //===------------------------------------------------------------------===//
+  HasSubsectionsViaSymbols = true;
+  HasMachoZeroFillDirective = false;
+  HasMachoTBSSDirective = false;
+  HasStaticCtorDtorReferenceInStaticMode = false;
+  LinkerRequiresNonEmptyDwarfLines = true;
+  MaxInstLength = 16;
+  PCSymbol = "$";
+  SeparatorString = "\n";
+  CommentColumn = 40;
+  CommentString = ";";
+  LabelSuffix = ":";
+  GlobalPrefix = "@";
+  PrivateGlobalPrefix = ";.";
+  LinkerPrivateGlobalPrefix = "!";
+  InlineAsmStart = ";#ASMSTART";
+  InlineAsmEnd = ";#ASMEND";
+  AssemblerDialect = 0;
+  AllowQuotesInName = false;
+  AllowNameToStartWithDigit = false;
+  AllowPeriodsInName = false;
+
+  //===--- Data Emission Directives -------------------------------------===//
+  ZeroDirective = ".zero";
+  AsciiDirective = ".ascii\t";
+  AscizDirective = ".asciz\t";
+  Data8bitsDirective = ".byte\t";
+  Data16bitsDirective = ".short\t";
+  Data32bitsDirective = ".long\t";
+  Data64bitsDirective = ".quad\t";
+  GPRel32Directive = 0;
+  SunStyleELFSectionSwitchSyntax = true;
+  UsesELFSectionDirectiveForBSS = true;
+  HasMicrosoftFastStdCallMangling = false;
+
+  //===--- Alignment Information ----------------------------------------===//
+  AlignDirective = ".align\t";
+  AlignmentIsInBytes = true;
+  TextAlignFillValue = 0;
+
+  //===--- Global Variable Emission Directives --------------------------===//
+  GlobalDirective = ".global";
+  ExternDirective = ".extern";
+  HasSetDirective = false;
+  HasAggressiveSymbolFolding = true;
+  COMMDirectiveAlignmentIsInBytes = false;
+  HasDotTypeDotSizeDirective = false;
+  HasNoDeadStrip = true;
+  HasSymbolResolver = false;
+  WeakRefDirective = ".weakref\t";
+  LinkOnceDirective = 0;
+  //===--- Dwarf Emission Directives -----------------------------------===//
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+  DwarfSectionOffsetDirective = ".offset";
+
+}
+
+const char*
+AMDGPUMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const {
+  return 0;
+}
+
+const MCSection*
+AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
+  return 0;
+}
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
new file mode 100644
index 000000000000..3ad0fa6824ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -0,0 +1,30 @@
+//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface  ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUMCASMINFO_H
+#define AMDGPUMCASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+namespace llvm {
+
+class Target;
+class StringRef;
+
+class AMDGPUMCAsmInfo : public MCAsmInfo {
+public:
+  explicit AMDGPUMCAsmInfo(const Target &T, StringRef &TT);
+  const char* getDataASDirective(unsigned int Size, unsigned int AS) const;
+  const MCSection* getNonexecutableStackSection(MCContext &CTX) const;
+};
+} // namespace llvm
+#endif // AMDGPUMCASMINFO_H
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
new file mode 100644
index 000000000000..cd3a7ce65aa5
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -0,0 +1,40 @@
+//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUCODEEMITTER_H
+#define AMDGPUCODEEMITTER_H
+
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCInst;
+class MCOperand;
+
+class AMDGPUMCCodeEmitter : public MCCodeEmitter {
+public:
+
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
+
+  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                     SmallVectorImpl<MCFixup> &Fixups) const {
+    return 0;
+  }
+};
+
+} // End namespace llvm
+
+#endif // AMDGPUCODEEMITTER_H
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
new file mode 100644
index 000000000000..072ee49b6311
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -0,0 +1,113 @@
+//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This file provides AMDGPU specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCTargetDesc.h"
+#include "AMDGPUMCAsmInfo.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "AMDGPUGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AMDGPUGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createAMDGPUMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitAMDGPUMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitAMDGPUMCRegisterInfo(X, 0);
+  return X;
+}
+
+static MCSubtargetInfo *createAMDGPUMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                   StringRef FS) {
+  MCSubtargetInfo * X = new MCSubtargetInfo();
+  InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM,
+                                               CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                const MCInstrInfo &MII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI) {
+  return new AMDGPUInstPrinter(MAI, MII, MRI);
+}
+
+static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) {
+    return createSIMCCodeEmitter(MCII, MRI, STI, Ctx);
+  } else {
+    return createR600MCCodeEmitter(MCII, MRI, STI, Ctx);
+  }
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &_OS,
+                                    MCCodeEmitter *_Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  return createPureStreamer(Ctx, MAB, _OS, _Emitter);
+}
+
+extern "C" void LLVMInitializeR600TargetMC() {
+
+  RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget);
+
+  TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo);
+
+  TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo);
+
+  TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo);
+
+  TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo);
+
+  TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter);
+
+  TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter);
+
+  TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend);
+
+  TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer);
+}
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
new file mode 100644
index 000000000000..363a4af3f3a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -0,0 +1,55 @@
+//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Provides AMDGPU specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef AMDGPUMCTARGETDESC_H
+#define AMDGPUMCTARGETDESC_H
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+
+extern Target TheAMDGPUTarget;
+
+MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
+                                       const MCRegisterInfo &MRI,
+                                       const MCSubtargetInfo &STI,
+                                       MCContext &Ctx);
+
+MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
+                                     const MCRegisterInfo &MRI,
+                                     const MCSubtargetInfo &STI,
+                                     MCContext &Ctx);
+
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT,
+                                     StringRef CPU);
+} // End llvm namespace
+
+#define GET_REGINFO_ENUM
+#include "AMDGPUGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "AMDGPUGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDGPUGenSubtargetInfo.inc"
+
+#endif // AMDGPUMCTARGETDESC_H
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
new file mode 100644
index 000000000000..927bcbd8305c
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -0,0 +1,585 @@
+//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This code emitter outputs bytecode that is understood by the r600g driver
+/// in the Mesa [1] project.  The bytecode is very similar to the hardware's ISA,
+/// but it still needs to be run through a finalizer in order to be executed
+/// by the GPU.
+///
+/// [1] http://www.mesa3d.org/
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600Defines.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include <stdio.h>
+
+#define SRC_BYTE_COUNT 11
+#define DST_BYTE_COUNT 5
+
+using namespace llvm;
+
+namespace {
+
+class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
+  R600MCCodeEmitter(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const R600MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  const MCRegisterInfo &MRI;
+  const MCSubtargetInfo &STI;
+  MCContext &Ctx;
+
+public:
+
+  R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
+                    const MCSubtargetInfo &sti, MCContext &ctx)
+    : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
+
+  /// \brief Encode the instruction and write it to the OS.
+  virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// \returns the encoding for an MCOperand.
+  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+private:
+
+  void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
+                    raw_ostream &OS) const;
+  void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const;
+  void EmitSrcISA(const MCInst &MI, unsigned RegOpIdx, unsigned SelOpIdx,
+                    raw_ostream &OS) const;
+  void EmitDst(const MCInst &MI, raw_ostream &OS) const;
+  void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const;
+
+  void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const;
+
+  void EmitByte(unsigned int byte, raw_ostream &OS) const;
+
+  void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const;
+
+  void Emit(uint32_t value, raw_ostream &OS) const;
+  void Emit(uint64_t value, raw_ostream &OS) const;
+
+  unsigned getHWRegChan(unsigned reg) const;
+  unsigned getHWReg(unsigned regNo) const;
+
+  bool isFCOp(unsigned opcode) const;
+  bool isTexOp(unsigned opcode) const;
+  bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const;
+
+};
+
+} // End anonymous namespace
+
+enum RegElement {
+  ELEMENT_X = 0,
+  ELEMENT_Y,
+  ELEMENT_Z,
+  ELEMENT_W
+};
+
+enum InstrTypes {
+  INSTR_ALU = 0,
+  INSTR_TEX,
+  INSTR_FC,
+  INSTR_NATIVE,
+  INSTR_VTX,
+  INSTR_EXPORT,
+  INSTR_CFALU
+};
+
+enum FCInstr {
+  FC_IF_PREDICATE = 0,
+  FC_ELSE,
+  FC_ENDIF,
+  FC_BGNLOOP,
+  FC_ENDLOOP,
+  FC_BREAK_PREDICATE,
+  FC_CONTINUE
+};
+
+enum TextureTypes {
+  TEXTURE_1D = 1,
+  TEXTURE_2D,
+  TEXTURE_3D,
+  TEXTURE_CUBE,
+  TEXTURE_RECT,
+  TEXTURE_SHADOW1D,
+  TEXTURE_SHADOW2D,
+  TEXTURE_SHADOWRECT,
+  TEXTURE_1D_ARRAY,
+  TEXTURE_2D_ARRAY,
+  TEXTURE_SHADOW1D_ARRAY,
+  TEXTURE_SHADOW2D_ARRAY
+};
+
+MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
+                                           const MCRegisterInfo &MRI,
+                                           const MCSubtargetInfo &STI,
+                                           MCContext &Ctx) {
+  return new R600MCCodeEmitter(MCII, MRI, STI, Ctx);
+}
+
+void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  if (isFCOp(MI.getOpcode())){
+    EmitFCInstr(MI, OS);
+  } else if (MI.getOpcode() == AMDGPU::RETURN ||
+    MI.getOpcode() == AMDGPU::BUNDLE ||
+    MI.getOpcode() == AMDGPU::KILL) {
+    return;
+  } else {
+    switch(MI.getOpcode()) {
+    case AMDGPU::STACK_SIZE: {
+      EmitByte(MI.getOperand(0).getImm(), OS);
+      break;
+    }
+    case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+    case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
+      uint64_t inst = getBinaryCodeForInstr(MI, Fixups);
+      EmitByte(INSTR_NATIVE, OS);
+      Emit(inst, OS);
+      break;
+    }
+    case AMDGPU::CONSTANT_LOAD_eg:
+    case AMDGPU::VTX_READ_PARAM_8_eg:
+    case AMDGPU::VTX_READ_PARAM_16_eg:
+    case AMDGPU::VTX_READ_PARAM_32_eg:
+    case AMDGPU::VTX_READ_PARAM_128_eg:
+    case AMDGPU::VTX_READ_GLOBAL_8_eg:
+    case AMDGPU::VTX_READ_GLOBAL_32_eg:
+    case AMDGPU::VTX_READ_GLOBAL_128_eg:
+    case AMDGPU::TEX_VTX_CONSTBUF:
+    case AMDGPU::TEX_VTX_TEXBUF : {
+      uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
+      uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
+
+      EmitByte(INSTR_VTX, OS);
+      Emit(InstWord01, OS);
+      Emit(InstWord2, OS);
+      break;
+    }
+    case AMDGPU::TEX_LD:
+    case AMDGPU::TEX_GET_TEXTURE_RESINFO:
+    case AMDGPU::TEX_SAMPLE:
+    case AMDGPU::TEX_SAMPLE_C:
+    case AMDGPU::TEX_SAMPLE_L:
+    case AMDGPU::TEX_SAMPLE_C_L:
+    case AMDGPU::TEX_SAMPLE_LB:
+    case AMDGPU::TEX_SAMPLE_C_LB:
+    case AMDGPU::TEX_SAMPLE_G:
+    case AMDGPU::TEX_SAMPLE_C_G:
+    case AMDGPU::TEX_GET_GRADIENTS_H:
+    case AMDGPU::TEX_GET_GRADIENTS_V:
+    case AMDGPU::TEX_SET_GRADIENTS_H:
+    case AMDGPU::TEX_SET_GRADIENTS_V: {
+      unsigned Opcode = MI.getOpcode();
+      bool HasOffsets = (Opcode == AMDGPU::TEX_LD);
+      unsigned OpOffset = HasOffsets ? 3 : 0;
+      int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
+      int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
+
+      uint32_t SrcSelect[4] = {0, 1, 2, 3};
+      uint32_t Offsets[3] = {0, 0, 0};
+      uint64_t CoordType[4] = {1, 1, 1, 1};
+
+      if (HasOffsets)
+        for (unsigned i = 0; i < 3; i++) {
+          int SignedOffset = MI.getOperand(i + 2).getImm();
+          Offsets[i] = (SignedOffset & 0x1F);
+        }
+          
+
+      if (TextureType == TEXTURE_RECT ||
+          TextureType == TEXTURE_SHADOWRECT) {
+        CoordType[ELEMENT_X] = 0;
+        CoordType[ELEMENT_Y] = 0;
+      }
+
+      if (TextureType == TEXTURE_1D_ARRAY ||
+          TextureType == TEXTURE_SHADOW1D_ARRAY) {
+        if (Opcode == AMDGPU::TEX_SAMPLE_C_L ||
+            Opcode == AMDGPU::TEX_SAMPLE_C_LB) {
+          CoordType[ELEMENT_Y] = 0;
+        } else {
+          CoordType[ELEMENT_Z] = 0;
+          SrcSelect[ELEMENT_Z] = ELEMENT_Y;
+        }
+      } else if (TextureType == TEXTURE_2D_ARRAY ||
+          TextureType == TEXTURE_SHADOW2D_ARRAY) {
+        CoordType[ELEMENT_Z] = 0;
+      }
+
+
+      if ((TextureType == TEXTURE_SHADOW1D ||
+          TextureType == TEXTURE_SHADOW2D ||
+          TextureType == TEXTURE_SHADOWRECT ||
+          TextureType == TEXTURE_SHADOW1D_ARRAY) &&
+          Opcode != AMDGPU::TEX_SAMPLE_C_L &&
+          Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
+        SrcSelect[ELEMENT_W] = ELEMENT_Z;
+      }
+
+      uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) |
+          CoordType[ELEMENT_X] << 60 | CoordType[ELEMENT_Y] << 61 |
+          CoordType[ELEMENT_Z] << 62 | CoordType[ELEMENT_W] << 63;
+      uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
+          SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
+          SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
+          Offsets[2] << 10;
+
+      EmitByte(INSTR_TEX, OS);
+      Emit(Word01, OS);
+      Emit(Word2, OS);
+      break;
+    }
+    case AMDGPU::EG_ExportSwz:
+    case AMDGPU::R600_ExportSwz:
+    case AMDGPU::EG_ExportBuf:
+    case AMDGPU::R600_ExportBuf: {
+      uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
+      EmitByte(INSTR_EXPORT, OS);
+      Emit(Inst, OS);
+      break;
+    }
+    case AMDGPU::CF_ALU:
+    case AMDGPU::CF_ALU_PUSH_BEFORE: {
+      uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
+      EmitByte(INSTR_CFALU, OS);
+      Emit(Inst, OS);
+      break;
+    }
+    case AMDGPU::CF_TC:
+    case AMDGPU::CF_VC:
+    case AMDGPU::CF_CALL_FS:
+      return;
+    case AMDGPU::WHILE_LOOP:
+    case AMDGPU::END_LOOP:
+    case AMDGPU::LOOP_BREAK:
+    case AMDGPU::CF_CONTINUE:
+    case AMDGPU::CF_JUMP:
+    case AMDGPU::CF_ELSE:
+    case AMDGPU::POP: {
+      uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
+      EmitByte(INSTR_NATIVE, OS);
+      Emit(Inst, OS);
+      break;
+    }
+    default:
+      EmitALUInstr(MI, Fixups, OS);
+      break;
+    }
+  }
+}
+
+void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     raw_ostream &OS) const {
+  const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
+
+  // Emit instruction type
+  EmitByte(INSTR_ALU, OS);
+
+  uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
+
+  //older alu have different encoding for instructions with one or two src
+  //parameters.
+  if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
+      !(MCDesc.TSFlags & R600_InstFlag::OP3)) {
+    uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39);
+    InstWord01 &= ~(0x3FFULL << 39);
+    InstWord01 |= ISAOpCode << 1;
+  }
+
+  unsigned SrcNum = MCDesc.TSFlags & R600_InstFlag::OP3 ? 3 :
+      MCDesc.TSFlags & R600_InstFlag::OP2 ? 2 : 1;
+
+  EmitByte(SrcNum, OS);
+
+  const unsigned SrcOps[3][2] = {
+      {R600Operands::SRC0, R600Operands::SRC0_SEL},
+      {R600Operands::SRC1, R600Operands::SRC1_SEL},
+      {R600Operands::SRC2, R600Operands::SRC2_SEL}
+  };
+
+  for (unsigned SrcIdx = 0; SrcIdx < SrcNum; ++SrcIdx) {
+    unsigned RegOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][0]];
+    unsigned SelOpIdx = R600Operands::ALUOpTable[SrcNum-1][SrcOps[SrcIdx][1]];
+    EmitSrcISA(MI, RegOpIdx, SelOpIdx, OS);
+  }
+
+  Emit(InstWord01, OS);
+  return;
+}
+
+void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx,
+                                raw_ostream &OS) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  union {
+    float f;
+    uint32_t i;
+  } Value;
+  Value.i = 0;
+  // Emit the source select (2 bytes).  For GPRs, this is the register index.
+  // For other potential instruction operands, (e.g. constant registers) the
+  // value of the source select is defined in the r600isa docs.
+  if (MO.isReg()) {
+    unsigned reg = MO.getReg();
+    EmitTwoBytes(getHWReg(reg), OS);
+    if (reg == AMDGPU::ALU_LITERAL_X) {
+      unsigned ImmOpIndex = MI.getNumOperands() - 1;
+      MCOperand ImmOp = MI.getOperand(ImmOpIndex);
+      if (ImmOp.isFPImm()) {
+        Value.f = ImmOp.getFPImm();
+      } else {
+        assert(ImmOp.isImm());
+        Value.i = ImmOp.getImm();
+      }
+    }
+  } else {
+    // XXX: Handle other operand types.
+    EmitTwoBytes(0, OS);
+  }
+
+  // Emit the source channel (1 byte)
+  if (MO.isReg()) {
+    EmitByte(getHWRegChan(MO.getReg()), OS);
+  } else {
+    EmitByte(0, OS);
+  }
+
+  // XXX: Emit isNegated (1 byte)
+  if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS)))
+      && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) ||
+     (MO.isReg() &&
+      (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){
+    EmitByte(1, OS);
+  } else {
+    EmitByte(0, OS);
+  }
+
+  // Emit isAbsolute (1 byte)
+  if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) {
+    EmitByte(1, OS);
+  } else {
+    EmitByte(0, OS);
+  }
+
+  // XXX: Emit relative addressing mode (1 byte)
+  EmitByte(0, OS);
+
+  // Emit kc_bank, This will be adjusted later by r600_asm
+  EmitByte(0, OS);
+
+  // Emit the literal value, if applicable (4 bytes).
+  Emit(Value.i, OS);
+
+}
+
+void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned RegOpIdx,
+                                   unsigned SelOpIdx, raw_ostream &OS) const {
+  const MCOperand &RegMO = MI.getOperand(RegOpIdx);
+  const MCOperand &SelMO = MI.getOperand(SelOpIdx);
+
+  union {
+    float f;
+    uint32_t i;
+  } InlineConstant;
+  InlineConstant.i = 0;
+  // Emit source type (1 byte) and source select (4 bytes). For GPRs type is 0
+  // and select is 0 (GPR index is encoded in the instr encoding. For constants
+  // type is 1 and select is the original const select passed from the driver.
+  unsigned Reg = RegMO.getReg();
+  if (Reg == AMDGPU::ALU_CONST) {
+    EmitByte(1, OS);
+    uint32_t Sel = SelMO.getImm();
+    Emit(Sel, OS);
+  } else {
+    EmitByte(0, OS);
+    Emit((uint32_t)0, OS);
+  }
+
+  if (Reg == AMDGPU::ALU_LITERAL_X) {
+    unsigned ImmOpIndex = MI.getNumOperands() - 1;
+    MCOperand ImmOp = MI.getOperand(ImmOpIndex);
+    if (ImmOp.isFPImm()) {
+      InlineConstant.f = ImmOp.getFPImm();
+    } else {
+      assert(ImmOp.isImm());
+      InlineConstant.i = ImmOp.getImm();
+    }
+  }
+
+  // Emit the literal value, if applicable (4 bytes).
+  Emit(InlineConstant.i, OS);
+}
+
+void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const {
+
+  // Emit instruction type
+  EmitByte(INSTR_FC, OS);
+
+  // Emit SRC
+  unsigned NumOperands = MI.getNumOperands();
+  if (NumOperands > 0) {
+    assert(NumOperands == 1);
+    EmitSrc(MI, 0, OS);
+  } else {
+    EmitNullBytes(SRC_BYTE_COUNT, OS);
+  }
+
+  // Emit FC Instruction
+  enum FCInstr instr;
+  switch (MI.getOpcode()) {
+  case AMDGPU::PREDICATED_BREAK:
+    instr = FC_BREAK_PREDICATE;
+    break;
+  case AMDGPU::CONTINUE:
+    instr = FC_CONTINUE;
+    break;
+  case AMDGPU::IF_PREDICATE_SET:
+    instr = FC_IF_PREDICATE;
+    break;
+  case AMDGPU::ELSE:
+    instr = FC_ELSE;
+    break;
+  case AMDGPU::ENDIF:
+    instr = FC_ENDIF;
+    break;
+  case AMDGPU::ENDLOOP:
+    instr = FC_ENDLOOP;
+    break;
+  case AMDGPU::WHILELOOP:
+    instr = FC_BGNLOOP;
+    break;
+  default:
+    abort();
+    break;
+  }
+  EmitByte(instr, OS);
+}
+
+void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount,
+                                      raw_ostream &OS) const {
+
+  for (unsigned int i = 0; i < ByteCount; i++) {
+    EmitByte(0, OS);
+  }
+}
+
+void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const {
+  OS.write((uint8_t) Byte & 0xff);
+}
+
+void R600MCCodeEmitter::EmitTwoBytes(unsigned int Bytes,
+                                     raw_ostream &OS) const {
+  OS.write((uint8_t) (Bytes & 0xff));
+  OS.write((uint8_t) ((Bytes >> 8) & 0xff));
+}
+
+void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
+  for (unsigned i = 0; i < 4; i++) {
+    OS.write((uint8_t) ((Value >> (8 * i)) & 0xff));
+  }
+}
+
+void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
+  for (unsigned i = 0; i < 8; i++) {
+    EmitByte((Value >> (8 * i)) & 0xff, OS);
+  }
+}
+
+unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const {
+  return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT;
+}
+
+unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
+  return MRI.getEncodingValue(RegNo) & HW_REG_MASK;
+}
+
+uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                              const MCOperand &MO,
+                                        SmallVectorImpl<MCFixup> &Fixup) const {
+  if (MO.isReg()) {
+    if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) {
+      return MRI.getEncodingValue(MO.getReg());
+    } else {
+      return getHWReg(MO.getReg());
+    }
+  } else if (MO.isImm()) {
+    return MO.getImm();
+  } else {
+    assert(0);
+    return 0;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Encoding helper functions
+//===----------------------------------------------------------------------===//
+
+bool R600MCCodeEmitter::isFCOp(unsigned opcode) const {
+  switch(opcode) {
+  default: return false;
+  case AMDGPU::PREDICATED_BREAK:
+  case AMDGPU::CONTINUE:
+  case AMDGPU::IF_PREDICATE_SET:
+  case AMDGPU::ELSE:
+  case AMDGPU::ENDIF:
+  case AMDGPU::ENDLOOP:
+  case AMDGPU::WHILELOOP:
+    return true;
+  }
+}
+
+bool R600MCCodeEmitter::isTexOp(unsigned opcode) const {
+  switch(opcode) {
+  default: return false;
+  case AMDGPU::TEX_LD:
+  case AMDGPU::TEX_GET_TEXTURE_RESINFO:
+  case AMDGPU::TEX_SAMPLE:
+  case AMDGPU::TEX_SAMPLE_C:
+  case AMDGPU::TEX_SAMPLE_L:
+  case AMDGPU::TEX_SAMPLE_C_L:
+  case AMDGPU::TEX_SAMPLE_LB:
+  case AMDGPU::TEX_SAMPLE_C_LB:
+  case AMDGPU::TEX_SAMPLE_G:
+  case AMDGPU::TEX_SAMPLE_C_G:
+  case AMDGPU::TEX_GET_GRADIENTS_H:
+  case AMDGPU::TEX_GET_GRADIENTS_V:
+  case AMDGPU::TEX_SET_GRADIENTS_H:
+  case AMDGPU::TEX_SET_GRADIENTS_V:
+    return true;
+  }
+}
+
+bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand,
+                                  unsigned Flag) const {
+  const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode());
+  unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags);
+  if (FlagIndex == 0) {
+    return false;
+  }
+  assert(MI.getOperand(FlagIndex).isImm());
+  return !!((MI.getOperand(FlagIndex).getImm() >>
+            (NUM_MO_FLAGS * Operand)) & Flag);
+}
+
+#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
new file mode 100644
index 000000000000..5af83209a0d5
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -0,0 +1,201 @@
+//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief The SI code emitter produces machine code that can be executed
+/// directly on the GPU device.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Helper type used in encoding
+typedef union {
+  int32_t I;
+  float F;
+} IntFloatUnion;
+
+class SIMCCodeEmitter : public  AMDGPUMCCodeEmitter {
+  SIMCCodeEmitter(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const SIMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  const MCRegisterInfo &MRI;
+
+  /// \brief Can this operand also contain immediate values?
+  bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const;
+
+  /// \brief Encode an fp or int literal
+  uint32_t getLitEncoding(const MCOperand &MO) const;
+
+public:
+  SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
+                  const MCSubtargetInfo &sti, MCContext &ctx)
+    : MCII(mcii), MRI(mri) { }
+
+  ~SIMCCodeEmitter() { }
+
+  /// \breif Encode the instruction and write it to the OS.
+  virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+
+  /// \returns the encoding for an MCOperand.
+  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                     SmallVectorImpl<MCFixup> &Fixups) const;
+};
+
+} // End anonymous namespace
+
+MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
+                                           const MCRegisterInfo &MRI,
+                                           const MCSubtargetInfo &STI,
+                                           MCContext &Ctx) {
+  return new SIMCCodeEmitter(MCII, MRI, STI, Ctx);
+}
+
+bool SIMCCodeEmitter::isSrcOperand(const MCInstrDesc &Desc,
+                                   unsigned OpNo) const {
+
+  unsigned RegClass = Desc.OpInfo[OpNo].RegClass;
+  return (AMDGPU::SSrc_32RegClassID == RegClass) ||
+         (AMDGPU::SSrc_64RegClassID == RegClass) ||
+         (AMDGPU::VSrc_32RegClassID == RegClass) ||
+         (AMDGPU::VSrc_64RegClassID == RegClass);
+}
+
+uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO) const {
+
+  IntFloatUnion Imm;
+  if (MO.isImm())
+    Imm.I = MO.getImm();
+  else if (MO.isFPImm())
+    Imm.F = MO.getFPImm();
+  else
+    return ~0;
+
+  if (Imm.I >= 0 && Imm.I <= 64)
+    return 128 + Imm.I;
+
+  if (Imm.I >= -16 && Imm.I <= -1)
+    return 192 + abs(Imm.I);
+
+  if (Imm.F == 0.5f)
+    return 240;
+
+  if (Imm.F == -0.5f)
+    return 241;
+
+  if (Imm.F == 1.0f)
+    return 242;
+
+  if (Imm.F == -1.0f)
+    return 243;
+
+  if (Imm.F == 2.0f)
+    return 244;
+
+  if (Imm.F == -2.0f)
+    return 245;
+
+  if (Imm.F == 4.0f)
+    return 246;
+
+  if (Imm.F == -4.0f)
+    return 247;
+
+  return 255;
+}
+
+void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+
+  uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups);
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  unsigned bytes = Desc.getSize();
+
+  for (unsigned i = 0; i < bytes; i++) {
+    OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
+  }
+
+  if (bytes > 4)
+    return;
+
+  // Check for additional literals in SRC0/1/2 (Op 1/2/3)
+  for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
+
+    // Check if this operand should be encoded as [SV]Src
+    if (!isSrcOperand(Desc, i))
+      continue;
+
+    // Is this operand a literal immediate?
+    const MCOperand &Op = MI.getOperand(i);
+    if (getLitEncoding(Op) != 255)
+      continue;
+
+    // Yes! Encode it
+    IntFloatUnion Imm;
+    if (Op.isImm())
+      Imm.I = Op.getImm();
+    else
+      Imm.F = Op.getFPImm();
+
+    for (unsigned j = 0; j < 4; j++) {
+      OS.write((uint8_t) ((Imm.I >> (8 * j)) & 0xff));
+    }
+
+    // Only one literal value allowed
+    break;
+  }
+}
+
+uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                            const MCOperand &MO,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  if (MO.isReg())
+    return MRI.getEncodingValue(MO.getReg());
+
+  if (MO.isExpr()) {
+    const MCExpr *Expr = MO.getExpr();
+    MCFixupKind Kind = MCFixupKind(FK_PCRel_4);
+    Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+    return 0;
+  }
+
+  // Figure out the operand number, needed for isSrcOperand check
+  unsigned OpNo = 0;
+  for (unsigned e = MI.getNumOperands(); OpNo < e; ++OpNo) {
+    if (&MO == &MI.getOperand(OpNo))
+      break;
+  }
+
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  if (isSrcOperand(Desc, OpNo)) {
+    uint32_t Enc = getLitEncoding(MO);
+    if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
+      return Enc;
+
+  } else if (MO.isImm())
+    return MO.getImm();
+
+  llvm_unreachable("Encoding of this operand type is not supported yet.");
+  return 0;
+}
+
diff --git a/contrib/llvm/lib/Target/R600/Processors.td b/contrib/llvm/lib/Target/R600/Processors.td
new file mode 100644
index 000000000000..868810c613b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/Processors.td
@@ -0,0 +1,30 @@
+//===-- Processors.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDIL processors supported.
+//
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
+: Processor<Name, itin, Features>;
+def : Proc<"",           R600_EG_Itin, [FeatureR600ALUInst]>;
+def : Proc<"r600",       R600_EG_Itin, [FeatureR600ALUInst]>;
+def : Proc<"rv710",      R600_EG_Itin, []>;
+def : Proc<"rv730",      R600_EG_Itin, []>;
+def : Proc<"rv770",      R600_EG_Itin, [FeatureFP64]>;
+def : Proc<"cedar",      R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"redwood",    R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"juniper",    R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"cypress",    R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
+def : Proc<"barts",      R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"turks",      R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"caicos",     R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"cayman",     R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
+def : Proc<"SI", SI_Itin, [Feature64BitPtr]>;
+
diff --git a/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp
new file mode 100644
index 000000000000..3a6c7eac730f
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -0,0 +1,268 @@
+//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass compute turns all control flow pseudo instructions into native one
+/// computing their address on the fly ; it also sets STACK_SIZE info.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "r600cf"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+class R600ControlFlowFinalizer : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+  unsigned MaxFetchInst;
+
+  bool isFetch(const MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::TEX_VTX_CONSTBUF:
+    case AMDGPU::TEX_VTX_TEXBUF:
+    case AMDGPU::TEX_LD:
+    case AMDGPU::TEX_GET_TEXTURE_RESINFO:
+    case AMDGPU::TEX_GET_GRADIENTS_H:
+    case AMDGPU::TEX_GET_GRADIENTS_V:
+    case AMDGPU::TEX_SET_GRADIENTS_H:
+    case AMDGPU::TEX_SET_GRADIENTS_V:
+    case AMDGPU::TEX_SAMPLE:
+    case AMDGPU::TEX_SAMPLE_C:
+    case AMDGPU::TEX_SAMPLE_L:
+    case AMDGPU::TEX_SAMPLE_C_L:
+    case AMDGPU::TEX_SAMPLE_LB:
+    case AMDGPU::TEX_SAMPLE_C_LB:
+    case AMDGPU::TEX_SAMPLE_G:
+    case AMDGPU::TEX_SAMPLE_C_G:
+    case AMDGPU::TXD:
+    case AMDGPU::TXD_SHADOW:
+     return true;
+    default:
+      return false;
+    }
+  }
+
+  bool IsTrivialInst(MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::KILL:
+    case AMDGPU::RETURN:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  MachineBasicBlock::iterator
+  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+      unsigned CfAddress) const {
+    MachineBasicBlock::iterator ClauseHead = I;
+    unsigned AluInstCount = 0;
+    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
+      if (IsTrivialInst(I))
+        continue;
+      if (!isFetch(I))
+        break;
+      AluInstCount ++;
+      if (AluInstCount > MaxFetchInst)
+        break;
+    }
+    BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
+        TII->get(AMDGPU::CF_TC))
+        .addImm(CfAddress) // ADDR
+        .addImm(AluInstCount); // COUNT
+    return I;
+  }
+  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
+    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
+  }
+  void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr)
+      const {
+    for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end();
+        It != E; ++It) {
+      MachineInstr *MI = *It;
+      CounterPropagateAddr(MI, Addr);
+    }
+  }
+
+public:
+  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
+    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) {
+      const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
+      if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
+        MaxFetchInst = 8;
+      else
+        MaxFetchInst = 16;
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF) {
+    unsigned MaxStack = 0;
+    unsigned CurrentStack = 0;
+    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
+        ++MB) {
+      MachineBasicBlock &MBB = *MB;
+      unsigned CfCount = 0;
+      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
+      std::vector<MachineInstr * > IfThenElseStack;
+      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+      if (MFI->ShaderType == 1) {
+        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
+            TII->get(AMDGPU::CF_CALL_FS));
+        CfCount++;
+      }
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+          I != E;) {
+        if (isFetch(I)) {
+          DEBUG(dbgs() << CfCount << ":"; I->dump(););
+          I = MakeFetchClause(MBB, I, 0);
+          CfCount++;
+          continue;
+        }
+
+        MachineBasicBlock::iterator MI = I;
+        I++;
+        switch (MI->getOpcode()) {
+        case AMDGPU::CF_ALU_PUSH_BEFORE:
+          CurrentStack++;
+          MaxStack = std::max(MaxStack, CurrentStack);
+        case AMDGPU::CF_ALU:
+        case AMDGPU::EG_ExportBuf:
+        case AMDGPU::EG_ExportSwz:
+        case AMDGPU::R600_ExportBuf:
+        case AMDGPU::R600_ExportSwz:
+          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+          CfCount++;
+          break;
+        case AMDGPU::WHILELOOP: {
+          CurrentStack++;
+          MaxStack = std::max(MaxStack, CurrentStack);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              TII->get(AMDGPU::WHILE_LOOP))
+              .addImm(2);
+          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
+              std::set<MachineInstr *>());
+          Pair.second.insert(MIb);
+          LoopStack.push_back(Pair);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ENDLOOP: {
+          CurrentStack--;
+          std::pair<unsigned, std::set<MachineInstr *> > Pair =
+              LoopStack.back();
+          LoopStack.pop_back();
+          CounterPropagateAddr(Pair.second, CfCount);
+          BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::END_LOOP))
+              .addImm(Pair.first + 1);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::IF_PREDICATE_SET: {
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              TII->get(AMDGPU::CF_JUMP))
+              .addImm(0)
+              .addImm(0);
+          IfThenElseStack.push_back(MIb);
+          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ELSE: {
+          MachineInstr * JumpInst = IfThenElseStack.back();
+          IfThenElseStack.pop_back();
+          CounterPropagateAddr(JumpInst, CfCount);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              TII->get(AMDGPU::CF_ELSE))
+              .addImm(0)
+              .addImm(1);
+          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          IfThenElseStack.push_back(MIb);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::ENDIF: {
+          CurrentStack--;
+          MachineInstr *IfOrElseInst = IfThenElseStack.back();
+          IfThenElseStack.pop_back();
+          CounterPropagateAddr(IfOrElseInst, CfCount + 1);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              TII->get(AMDGPU::POP))
+              .addImm(CfCount + 1)
+              .addImm(1);
+          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        case AMDGPU::PREDICATED_BREAK: {
+          CurrentStack--;
+          CfCount += 3;
+          BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_JUMP))
+              .addImm(CfCount)
+              .addImm(1);
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              TII->get(AMDGPU::LOOP_BREAK))
+              .addImm(0);
+          BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::POP))
+              .addImm(CfCount)
+              .addImm(1);
+          LoopStack.back().second.insert(MIb);
+          MI->eraseFromParent();
+          break;
+        }
+        case AMDGPU::CONTINUE: {
+          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+              TII->get(AMDGPU::CF_CONTINUE))
+              .addImm(0);
+          LoopStack.back().second.insert(MIb);
+          MI->eraseFromParent();
+          CfCount++;
+          break;
+        }
+        default:
+          break;
+        }
+      }
+      BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
+          TII->get(AMDGPU::STACK_SIZE))
+          .addImm(MaxStack);
+    }
+
+    return false;
+  }
+
+  const char *getPassName() const {
+    return "R600 Control Flow Finalizer Pass";
+  }
+};
+
+char R600ControlFlowFinalizer::ID = 0;
+
+}
+
+
+llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
+  return new R600ControlFlowFinalizer(TM);
+}
+
diff --git a/contrib/llvm/lib/Target/R600/R600Defines.h b/contrib/llvm/lib/Target/R600/R600Defines.h
new file mode 100644
index 000000000000..16cfcf59eb3d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600Defines.h
@@ -0,0 +1,97 @@
+//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef R600DEFINES_H_
+#define R600DEFINES_H_
+
+#include "llvm/MC/MCRegisterInfo.h"
+
+// Operand Flags
+#define MO_FLAG_CLAMP (1 << 0)
+#define MO_FLAG_NEG   (1 << 1)
+#define MO_FLAG_ABS   (1 << 2)
+#define MO_FLAG_MASK  (1 << 3)
+#define MO_FLAG_PUSH  (1 << 4)
+#define MO_FLAG_NOT_LAST  (1 << 5)
+#define MO_FLAG_LAST  (1 << 6)
+#define NUM_MO_FLAGS 7
+
+/// \brief Helper for getting the operand index for the instruction flags
+/// operand.
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
+
+namespace R600_InstFlag {
+  enum TIF {
+    TRANS_ONLY = (1 << 0),
+    TEX = (1 << 1),
+    REDUCTION = (1 << 2),
+    FC = (1 << 3),
+    TRIG = (1 << 4),
+    OP3 = (1 << 5),
+    VECTOR = (1 << 6),
+    //FlagOperand bits 7, 8
+    NATIVE_OPERANDS = (1 << 9),
+    OP1 = (1 << 10),
+    OP2 = (1 << 11)
+  };
+}
+
+#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
+
+/// \brief Defines for extracting register infomation from register encoding
+#define HW_REG_MASK 0x1ff
+#define HW_CHAN_SHIFT 9
+
+#define GET_REG_CHAN(reg) ((reg) >> HW_CHAN_SHIFT)
+#define GET_REG_INDEX(reg) ((reg) & HW_REG_MASK)
+
+namespace R600Operands {
+  enum Ops {
+    DST,
+    UPDATE_EXEC_MASK,
+    UPDATE_PREDICATE,
+    WRITE,
+    OMOD,
+    DST_REL,
+    CLAMP,
+    SRC0,
+    SRC0_NEG,
+    SRC0_REL,
+    SRC0_ABS,
+    SRC0_SEL,
+    SRC1,
+    SRC1_NEG,
+    SRC1_REL,
+    SRC1_ABS,
+    SRC1_SEL,
+    SRC2,
+    SRC2_NEG,
+    SRC2_REL,
+    SRC2_SEL,
+    LAST,
+    PRED_SEL,
+    IMM,
+    COUNT
+ };
+
+  const static int ALUOpTable[3][R600Operands::COUNT] = {
+//            W        C     S  S  S  S     S  S  S  S     S  S  S
+//            R  O  D  L  S  R  R  R  R  S  R  R  R  R  S  R  R  R  L  P
+//   D  U     I  M  R  A  R  C  C  C  C  R  C  C  C  C  R  C  C  C  A  R  I
+//   S  E  U  T  O  E  M  C  0  0  0  0  C  1  1  1  1  C  2  2  2  S  E  M
+//   T  M  P  E  D  L  P  0  N  R  A  S  1  N  R  A  S  2  N  R  S  T  D  M
+    {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12},
+    {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19},
+    {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17}
+  };
+
+}
+
+#endif // R600DEFINES_H_
diff --git a/contrib/llvm/lib/Target/R600/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/R600/R600EmitClauseMarkers.cpp
new file mode 100644
index 000000000000..3fdc678b9ef1
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -0,0 +1,255 @@
+//===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Add CF_ALU. R600 Alu instructions are grouped in clause which can hold
+/// 128 Alu instructions ; these instructions can access up to 4 prefetched
+/// 4 lines of 16 registers from constant buffers. Such ALU clauses are
+/// initiated by CF_ALU instructions.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+class R600EmitClauseMarkersPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  unsigned OccupiedDwords(MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::DOT4_eg_pseudo:
+    case AMDGPU::DOT4_r600_pseudo:
+      return 4;
+    case AMDGPU::KILL:
+      return 0;
+    default:
+      break;
+    }
+
+    if(TII->isVector(*MI) ||
+        TII->isCubeOp(MI->getOpcode()) ||
+        TII->isReductionOp(MI->getOpcode()))
+      return 4;
+
+    unsigned NumLiteral = 0;
+    for (MachineInstr::mop_iterator It = MI->operands_begin(),
+        E = MI->operands_end(); It != E; ++It) {
+      MachineOperand &MO = *It;
+      if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+        ++NumLiteral;
+    }
+    return 1 + NumLiteral;
+  }
+
+  bool isALU(const MachineInstr *MI) const {
+    if (TII->isALUInstr(MI->getOpcode()))
+      return true;
+    if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()))
+      return true;
+    switch (MI->getOpcode()) {
+    case AMDGPU::PRED_X:
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::COPY:
+    case AMDGPU::DOT4_eg_pseudo:
+    case AMDGPU::DOT4_r600_pseudo:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  bool IsTrivialInst(MachineInstr *MI) const {
+    switch (MI->getOpcode()) {
+    case AMDGPU::KILL:
+    case AMDGPU::RETURN:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  // Register Idx, then Const value
+  std::vector<std::pair<unsigned, unsigned> > ExtractConstRead(MachineInstr *MI)
+      const {
+    const R600Operands::Ops OpTable[3][2] = {
+      {R600Operands::SRC0, R600Operands::SRC0_SEL},
+      {R600Operands::SRC1, R600Operands::SRC1_SEL},
+      {R600Operands::SRC2, R600Operands::SRC2_SEL},
+    };
+    std::vector<std::pair<unsigned, unsigned> > Result;
+
+    if (!TII->isALUInstr(MI->getOpcode()))
+      return Result;
+    for (unsigned j = 0; j < 3; j++) {
+      int SrcIdx = TII->getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+      if (SrcIdx < 0)
+        break;
+      if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
+        unsigned Const = MI->getOperand(
+            TII->getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
+        Result.push_back(std::pair<unsigned, unsigned>(SrcIdx, Const));
+      }
+    }
+    return Result;
+  }
+
+  std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
+    // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
+    // (See also R600ISelLowering.cpp)
+    // ConstIndex value is in [0, 4095];
+    return std::pair<unsigned, unsigned>(
+        ((Sel >> 2) - 512) >> 12, // KC_BANK
+        // Line Number of ConstIndex
+        // A line contains 16 constant registers however KCX bank can lock
+        // two line at the same time ; thus we want to get an even line number.
+        // Line number can be retrieved with (>>4), using (>>5) <<1 generates
+        // an even number.
+        ((((Sel >> 2) - 512) & 4095) >> 5) << 1);
+  }
+
+  bool SubstituteKCacheBank(MachineInstr *MI,
+      std::vector<std::pair<unsigned, unsigned> > &CachedConsts) const {
+    std::vector<std::pair<unsigned, unsigned> > UsedKCache;
+    std::vector<std::pair<unsigned, unsigned> > Consts = ExtractConstRead(MI);
+    assert(TII->isALUInstr(MI->getOpcode()) && "Can't assign Const");
+    for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+      unsigned Sel = Consts[i].second;
+      unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
+      unsigned KCacheIndex = Index * 4 + Chan;
+      const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel);
+      if (CachedConsts.empty()) {
+        CachedConsts.push_back(BankLine);
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
+        continue;
+      }
+      if (CachedConsts[0] == BankLine) {
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
+        continue;
+      }
+      if (CachedConsts.size() == 1) {
+        CachedConsts.push_back(BankLine);
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
+        continue;
+      }
+      if (CachedConsts[1] == BankLine) {
+        UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
+        continue;
+      }
+      return false;
+    }
+
+    for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+      switch(UsedKCache[i].first) {
+      case 0:
+        MI->getOperand(Consts[i].first).setReg(
+            AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[i].second));
+        break;
+      case 1:
+        MI->getOperand(Consts[i].first).setReg(
+            AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[i].second));
+        break;
+      default:
+        llvm_unreachable("Wrong Cache Line");
+      }
+    }
+    return true;
+  }
+
+  MachineBasicBlock::iterator
+  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+    MachineBasicBlock::iterator ClauseHead = I;
+    std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
+    bool PushBeforeModifier = false;
+    unsigned AluInstCount = 0;
+    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
+      if (IsTrivialInst(I))
+        continue;
+      if (!isALU(I))
+        break;
+      if (AluInstCount > TII->getMaxAlusPerClause())
+        break;
+      if (I->getOpcode() == AMDGPU::PRED_X) {
+        if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
+          PushBeforeModifier = true;
+        AluInstCount ++;
+        continue;
+      }
+      if (I->getOpcode() == AMDGPU::KILLGT) {
+        I++;
+        break;
+      }
+      if (TII->isALUInstr(I->getOpcode()) &&
+          !SubstituteKCacheBank(I, KCacheBanks))
+        break;
+      AluInstCount += OccupiedDwords(I);
+    }
+    unsigned Opcode = PushBeforeModifier ?
+        AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
+    BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
+        .addImm(0) // ADDR
+        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
+        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
+        .addImm(KCacheBanks.empty()?0:2) // KM0
+        .addImm((KCacheBanks.size() < 2)?0:2) // KM1
+        .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
+        .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
+        .addImm(AluInstCount); // COUNT
+    return I;
+  }
+
+public:
+  R600EmitClauseMarkersPass(TargetMachine &tm) : MachineFunctionPass(ID),
+    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF) {
+    for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                    BB != BB_E; ++BB) {
+      MachineBasicBlock &MBB = *BB;
+      MachineBasicBlock::iterator I = MBB.begin();
+      if (I->getOpcode() == AMDGPU::CF_ALU)
+        continue; // BB was already parsed
+      for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
+        if (isALU(I))
+          I = MakeALUClause(MBB, I);
+        else
+          ++I;
+      }
+    }
+    return false;
+  }
+
+  const char *getPassName() const {
+    return "R600 Emit Clause Markers Pass";
+  }
+};
+
+char R600EmitClauseMarkersPass::ID = 0;
+
+}
+
+
+llvm::FunctionPass *llvm::createR600EmitClauseMarkers(TargetMachine &TM) {
+  return new R600EmitClauseMarkersPass(TM);
+}
+
diff --git a/contrib/llvm/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/R600/R600ExpandSpecialInstrs.cpp
new file mode 100644
index 000000000000..f8c900f72776
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -0,0 +1,297 @@
+//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Vector, Reduction, and Cube instructions need to fill the entire instruction
+/// group to work correctly.  This pass expands these individual instructions
+/// into several instructions that will completely fill the instruction group.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class R600ExpandSpecialInstrsPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  bool ExpandInputPerspective(MachineInstr& MI);
+  bool ExpandInputConstant(MachineInstr& MI);
+
+public:
+  R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
+    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "R600 Expand special instructions pass";
+  }
+};
+
+} // End anonymous namespace
+
+char R600ExpandSpecialInstrsPass::ID = 0;
+
+FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
+  return new R600ExpandSpecialInstrsPass(TM);
+}
+
+bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
+
+  const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock::iterator I = MBB.begin();
+    while (I != MBB.end()) {
+      MachineInstr &MI = *I;
+      I = llvm::next(I);
+
+      switch (MI.getOpcode()) {
+      default: break;
+      // Expand PRED_X to one of the PRED_SET instructions.
+      case AMDGPU::PRED_X: {
+        uint64_t Flags = MI.getOperand(3).getImm();
+        // The native opcode used by PRED_X is stored as an immediate in the
+        // third operand.
+        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
+                                            MI.getOperand(2).getImm(), // opcode
+                                            MI.getOperand(0).getReg(), // dst
+                                            MI.getOperand(1).getReg(), // src0
+                                            AMDGPU::ZERO);             // src1
+        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
+        if (Flags & MO_FLAG_PUSH) {
+          TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
+        } else {
+          TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1);
+        }
+        MI.eraseFromParent();
+        continue;
+        }
+      case AMDGPU::BREAK: {
+        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
+                                          AMDGPU::PRED_SETE_INT,
+                                          AMDGPU::PREDICATE_BIT,
+                                          AMDGPU::ZERO,
+                                          AMDGPU::ZERO);
+        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
+        TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
+
+        BuildMI(MBB, I, MBB.findDebugLoc(I),
+                TII->get(AMDGPU::PREDICATED_BREAK))
+                .addReg(AMDGPU::PREDICATE_BIT);
+        MI.eraseFromParent();
+        continue;
+        }
+
+      case AMDGPU::INTERP_PAIR_XY: {
+        MachineInstr *BMI;
+        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
+                MI.getOperand(2).getImm());
+
+        for (unsigned Chan = 0; Chan < 4; ++Chan) {
+          unsigned DstReg;
+
+          if (Chan < 2)
+            DstReg = MI.getOperand(Chan).getReg();
+          else
+            DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W;
+
+          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY,
+              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
+
+          if (Chan > 0) {
+            BMI->bundleWithPred();
+          }
+          if (Chan >= 2)
+            TII->addFlag(BMI, 0, MO_FLAG_MASK);
+          if (Chan != 3)
+            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+        }
+
+        MI.eraseFromParent();
+        continue;
+        }
+
+      case AMDGPU::INTERP_PAIR_ZW: {
+        MachineInstr *BMI;
+        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
+                MI.getOperand(2).getImm());
+
+        for (unsigned Chan = 0; Chan < 4; ++Chan) {
+          unsigned DstReg;
+
+          if (Chan < 2)
+            DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y;
+          else
+            DstReg = MI.getOperand(Chan-2).getReg();
+
+          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW,
+              DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg);
+
+          if (Chan > 0) {
+            BMI->bundleWithPred();
+          }
+          if (Chan < 2)
+            TII->addFlag(BMI, 0, MO_FLAG_MASK);
+          if (Chan != 3)
+            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+        }
+
+        MI.eraseFromParent();
+        continue;
+        }
+
+      case AMDGPU::INTERP_VEC_LOAD: {
+        const R600RegisterInfo &TRI = TII->getRegisterInfo();
+        MachineInstr *BMI;
+        unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
+                MI.getOperand(1).getImm());
+        unsigned DstReg = MI.getOperand(0).getReg();
+
+        for (unsigned Chan = 0; Chan < 4; ++Chan) {
+          BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0,
+              TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg);
+          if (Chan > 0) {
+            BMI->bundleWithPred();
+          }
+          if (Chan != 3)
+            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+        }
+
+        MI.eraseFromParent();
+        continue;
+        }
+      }
+
+      bool IsReduction = TII->isReductionOp(MI.getOpcode());
+      bool IsVector = TII->isVector(MI);
+      bool IsCube = TII->isCubeOp(MI.getOpcode());
+      if (!IsReduction && !IsVector && !IsCube) {
+        continue;
+      }
+
+      // Expand the instruction
+      //
+      // Reduction instructions:
+      // T0_X = DP4 T1_XYZW, T2_XYZW
+      // becomes:
+      // TO_X = DP4 T1_X, T2_X
+      // TO_Y (write masked) = DP4 T1_Y, T2_Y
+      // TO_Z (write masked) = DP4 T1_Z, T2_Z
+      // TO_W (write masked) = DP4 T1_W, T2_W
+      //
+      // Vector instructions:
+      // T0_X = MULLO_INT T1_X, T2_X
+      // becomes:
+      // T0_X = MULLO_INT T1_X, T2_X
+      // T0_Y (write masked) = MULLO_INT T1_X, T2_X
+      // T0_Z (write masked) = MULLO_INT T1_X, T2_X
+      // T0_W (write masked) = MULLO_INT T1_X, T2_X
+      //
+      // Cube instructions:
+      // T0_XYZW = CUBE T1_XYZW
+      // becomes:
+      // TO_X = CUBE T1_Z, T1_Y
+      // T0_Y = CUBE T1_Z, T1_X
+      // T0_Z = CUBE T1_X, T1_Z
+      // T0_W = CUBE T1_Y, T1_Z
+      for (unsigned Chan = 0; Chan < 4; Chan++) {
+        unsigned DstReg = MI.getOperand(
+                            TII->getOperandIdx(MI, R600Operands::DST)).getReg();
+        unsigned Src0 = MI.getOperand(
+                           TII->getOperandIdx(MI, R600Operands::SRC0)).getReg();
+        unsigned Src1 = 0;
+
+        // Determine the correct source registers
+        if (!IsCube) {
+          int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1);
+          if (Src1Idx != -1) {
+            Src1 = MI.getOperand(Src1Idx).getReg();
+          }
+        }
+        if (IsReduction) {
+          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+          Src0 = TRI.getSubReg(Src0, SubRegIndex);
+          Src1 = TRI.getSubReg(Src1, SubRegIndex);
+        } else if (IsCube) {
+          static const int CubeSrcSwz[] = {2, 2, 0, 1};
+          unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]);
+          unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
+          Src1 = TRI.getSubReg(Src0, SubRegIndex1);
+          Src0 = TRI.getSubReg(Src0, SubRegIndex0);
+        }
+
+        // Determine the correct destination registers;
+        bool Mask = false;
+        bool NotLast = true;
+        if (IsCube) {
+          unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan);
+          DstReg = TRI.getSubReg(DstReg, SubRegIndex);
+        } else {
+          // Mask the write if the original instruction does not write to
+          // the current Channel.
+          Mask = (Chan != TRI.getHWRegChan(DstReg));
+          unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+          DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+        }
+
+        // Set the IsLast bit
+        NotLast = (Chan != 3 );
+
+        // Add the new instruction
+        unsigned Opcode = MI.getOpcode();
+        switch (Opcode) {
+        case AMDGPU::CUBE_r600_pseudo:
+          Opcode = AMDGPU::CUBE_r600_real;
+          break;
+        case AMDGPU::CUBE_eg_pseudo:
+          Opcode = AMDGPU::CUBE_eg_real;
+          break;
+        case AMDGPU::DOT4_r600_pseudo:
+          Opcode = AMDGPU::DOT4_r600_real;
+          break;
+        case AMDGPU::DOT4_eg_pseudo:
+          Opcode = AMDGPU::DOT4_eg_real;
+          break;
+        default:
+          break;
+        }
+
+        MachineInstr *NewMI =
+          TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);
+
+        if (Chan != 0)
+          NewMI->bundleWithPred();
+        if (Mask) {
+          TII->addFlag(NewMI, 0, MO_FLAG_MASK);
+        }
+        if (NotLast) {
+          TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
+        }
+      }
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp b/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp
new file mode 100644
index 000000000000..53e6e51dd2b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ISelLowering.cpp
@@ -0,0 +1,1106 @@
+//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Custom DAG lowering for R600
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600ISelLowering.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
+    AMDGPUTargetLowering(TM),
+    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
+  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
+  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+  computeRegisterProperties();
+
+  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
+  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
+  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
+  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
+
+  setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
+  setOperationAction(ISD::AND,  MVT::v4i32, Expand);
+  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
+  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
+  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
+
+  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+
+  setOperationAction(ISD::FSUB, MVT::f32, Expand);
+
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
+
+  setOperationAction(ISD::ROTL, MVT::i32, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
+
+  setOperationAction(ISD::SELECT, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT, MVT::f32, Custom);
+
+  // Legalize loads and stores to the private address space.
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
+  setOperationAction(ISD::STORE, MVT::i8, Custom);
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+
+  setTargetDAGCombine(ISD::FP_ROUND);
+  setTargetDAGCombine(ISD::FP_TO_SINT);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::SELECT_CC);
+
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+  setSchedulingPreference(Sched::VLIW);
+}
+
+MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr * MI, MachineBasicBlock * BB) const {
+  MachineFunction * MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineBasicBlock::iterator I = *MI;
+
+  switch (MI->getOpcode()) {
+  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case AMDGPU::CLAMP_R600: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
+                                                   AMDGPU::MOV,
+                                                   MI->getOperand(0).getReg(),
+                                                   MI->getOperand(1).getReg());
+    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
+    break;
+  }
+
+  case AMDGPU::FABS_R600: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
+                                                    AMDGPU::MOV,
+                                                    MI->getOperand(0).getReg(),
+                                                    MI->getOperand(1).getReg());
+    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
+    break;
+  }
+
+  case AMDGPU::FNEG_R600: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
+                                                    AMDGPU::MOV,
+                                                    MI->getOperand(0).getReg(),
+                                                    MI->getOperand(1).getReg());
+    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
+    break;
+  }
+
+  case AMDGPU::MASK_WRITE: {
+    unsigned maskedRegister = MI->getOperand(0).getReg();
+    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
+    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
+    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
+    break;
+  }
+
+  case AMDGPU::MOV_IMM_F32:
+    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
+                     MI->getOperand(1).getFPImm()->getValueAPF()
+                         .bitcastToAPInt().getZExtValue());
+    break;
+  case AMDGPU::MOV_IMM_I32:
+    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
+                     MI->getOperand(1).getImm());
+    break;
+  case AMDGPU::CONST_COPY: {
+    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
+        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
+    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
+        MI->getOperand(1).getImm());
+    break;
+  }
+
+  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
+    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
+
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
+            .addOperand(MI->getOperand(0))
+            .addOperand(MI->getOperand(1))
+            .addImm(EOP); // Set End of program bit
+    break;
+  }
+
+  case AMDGPU::TXD: {
+    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
+            .addOperand(MI->getOperand(3))
+            .addOperand(MI->getOperand(4))
+            .addOperand(MI->getOperand(5))
+            .addOperand(MI->getOperand(6));
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
+            .addOperand(MI->getOperand(2))
+            .addOperand(MI->getOperand(4))
+            .addOperand(MI->getOperand(5))
+            .addOperand(MI->getOperand(6));
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
+            .addOperand(MI->getOperand(0))
+            .addOperand(MI->getOperand(1))
+            .addOperand(MI->getOperand(4))
+            .addOperand(MI->getOperand(5))
+            .addOperand(MI->getOperand(6))
+            .addReg(T0, RegState::Implicit)
+            .addReg(T1, RegState::Implicit);
+    break;
+  }
+
+  case AMDGPU::TXD_SHADOW: {
+    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
+            .addOperand(MI->getOperand(3))
+            .addOperand(MI->getOperand(4))
+            .addOperand(MI->getOperand(5))
+            .addOperand(MI->getOperand(6));
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
+            .addOperand(MI->getOperand(2))
+            .addOperand(MI->getOperand(4))
+            .addOperand(MI->getOperand(5))
+            .addOperand(MI->getOperand(6));
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
+            .addOperand(MI->getOperand(0))
+            .addOperand(MI->getOperand(1))
+            .addOperand(MI->getOperand(4))
+            .addOperand(MI->getOperand(5))
+            .addOperand(MI->getOperand(6))
+            .addReg(T0, RegState::Implicit)
+            .addReg(T1, RegState::Implicit);
+    break;
+  }
+
+  case AMDGPU::BRANCH:
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
+              .addOperand(MI->getOperand(0));
+      break;
+
+  case AMDGPU::BRANCH_COND_f32: {
+    MachineInstr *NewMI =
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+              AMDGPU::PREDICATE_BIT)
+              .addOperand(MI->getOperand(1))
+              .addImm(OPCODE_IS_NOT_ZERO)
+              .addImm(0); // Flags
+    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+            .addOperand(MI->getOperand(0))
+            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    break;
+  }
+
+  case AMDGPU::BRANCH_COND_i32: {
+    MachineInstr *NewMI =
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
+            AMDGPU::PREDICATE_BIT)
+            .addOperand(MI->getOperand(1))
+            .addImm(OPCODE_IS_NOT_ZERO_INT)
+            .addImm(0); // Flags
+    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
+           .addOperand(MI->getOperand(0))
+            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    break;
+  }
+
+  case AMDGPU::EG_ExportSwz:
+  case AMDGPU::R600_ExportSwz: {
+    // Instruction is left unmodified if its not the last one of its type
+    bool isLastInstructionOfItsType = true;
+    unsigned InstExportType = MI->getOperand(1).getImm();
+    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
+         EndBlock = BB->end(); NextExportInst != EndBlock;
+         NextExportInst = llvm::next(NextExportInst)) {
+      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
+          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
+        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
+            .getImm();
+        if (CurrentInstExportType == InstExportType) {
+          isLastInstructionOfItsType = false;
+          break;
+        }
+      }
+    }
+    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
+    if (!EOP && !isLastInstructionOfItsType)
+      return BB;
+    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
+            .addOperand(MI->getOperand(0))
+            .addOperand(MI->getOperand(1))
+            .addOperand(MI->getOperand(2))
+            .addOperand(MI->getOperand(3))
+            .addOperand(MI->getOperand(4))
+            .addOperand(MI->getOperand(5))
+            .addOperand(MI->getOperand(6))
+            .addImm(CfInst)
+            .addImm(EOP);
+    break;
+  }
+  case AMDGPU::RETURN: {
+    // RETURN instructions must have the live-out registers as implicit uses,
+    // otherwise they appear dead.
+    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
+    MachineInstrBuilder MIB(*MF, MI);
+    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
+      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
+    return BB;
+  }
+  }
+
+  MI->eraseFromParent();
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+
+using namespace llvm::Intrinsic;
+using namespace llvm::AMDGPUIntrinsic;
+
+SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::ROTL: return LowerROTL(Op, DAG);
+  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::LOAD: return LowerLOAD(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
+  case ISD::INTRINSIC_VOID: {
+    SDValue Chain = Op.getOperand(0);
+    unsigned IntrinsicID =
+                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    switch (IntrinsicID) {
+    case AMDGPUIntrinsic::AMDGPU_store_output: {
+      MachineFunction &MF = DAG.getMachineFunction();
+      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
+      MFI->LiveOuts.push_back(Reg);
+      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
+    }
+    case AMDGPUIntrinsic::R600_store_swizzle: {
+      const SDValue Args[8] = {
+        Chain,
+        Op.getOperand(2), // Export Value
+        Op.getOperand(3), // ArrayBase
+        Op.getOperand(4), // Type
+        DAG.getConstant(0, MVT::i32), // SWZ_X
+        DAG.getConstant(1, MVT::i32), // SWZ_Y
+        DAG.getConstant(2, MVT::i32), // SWZ_Z
+        DAG.getConstant(3, MVT::i32) // SWZ_W
+      };
+      return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
+          Args, 8);
+    }
+
+    // default for switch(IntrinsicID)
+    default: break;
+    }
+    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID =
+                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    EVT VT = Op.getValueType();
+    DebugLoc DL = Op.getDebugLoc();
+    switch(IntrinsicID) {
+    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+    case AMDGPUIntrinsic::R600_load_input: {
+      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
+    }
+
+    case AMDGPUIntrinsic::R600_interp_input: {
+      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
+      MachineSDNode *interp;
+      if (ijb < 0) {
+        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
+            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
+        return DAG.getTargetExtractSubreg(
+            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
+            DL, MVT::f32, SDValue(interp, 0));
+      }
+
+      if (slot % 4 < 2)
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
+            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
+            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
+      else
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
+            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
+            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
+
+      return SDValue(interp, slot % 2);
+    }
+
+    case r600_read_ngroups_x:
+      return LowerImplicitParameter(DAG, VT, DL, 0);
+    case r600_read_ngroups_y:
+      return LowerImplicitParameter(DAG, VT, DL, 1);
+    case r600_read_ngroups_z:
+      return LowerImplicitParameter(DAG, VT, DL, 2);
+    case r600_read_global_size_x:
+      return LowerImplicitParameter(DAG, VT, DL, 3);
+    case r600_read_global_size_y:
+      return LowerImplicitParameter(DAG, VT, DL, 4);
+    case r600_read_global_size_z:
+      return LowerImplicitParameter(DAG, VT, DL, 5);
+    case r600_read_local_size_x:
+      return LowerImplicitParameter(DAG, VT, DL, 6);
+    case r600_read_local_size_y:
+      return LowerImplicitParameter(DAG, VT, DL, 7);
+    case r600_read_local_size_z:
+      return LowerImplicitParameter(DAG, VT, DL, 8);
+
+    case r600_read_tgid_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T1_X, VT);
+    case r600_read_tgid_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T1_Y, VT);
+    case r600_read_tgid_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T1_Z, VT);
+    case r600_read_tidig_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T0_X, VT);
+    case r600_read_tidig_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T0_Y, VT);
+    case r600_read_tidig_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
+                                  AMDGPU::T0_Z, VT);
+    }
+    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
+    break;
+  }
+  } // end switch(Op.getOpcode())
+  return SDValue();
+}
+
+void R600TargetLowering::ReplaceNodeResults(SDNode *N,
+                                            SmallVectorImpl<SDValue> &Results,
+                                            SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default: return;
+  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
+    return;
+  case ISD::LOAD: {
+    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
+    Results.push_back(SDValue(Node, 0));
+    Results.push_back(SDValue(Node, 1));
+    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
+    // function
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
+    return;
+  }
+  case ISD::STORE:
+    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
+    Results.push_back(SDValue(Node, 0));
+    return;
+  }
+}
+
+SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
+  return DAG.getNode(
+      ISD::SETCC,
+      Op.getDebugLoc(),
+      MVT::i1,
+      Op, DAG.getConstantFP(0.0f, MVT::f32),
+      DAG.getCondCode(ISD::SETNE)
+      );
+}
+
+SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
+                                                   DebugLoc DL,
+                                                   unsigned DwordOffset) const {
+  unsigned ByteOffset = DwordOffset * 4;
+  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                      AMDGPUAS::PARAM_I_ADDRESS);
+
+  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
+  assert(isInt<16>(ByteOffset));
+
+  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
+                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
+                     false, false, false, 0);
+}
+
+SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL =
+   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+
+  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
+  assert(FIN);
+
+  unsigned FrameIndex = FIN->getIndex();
+  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
+}
+
+SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
+                     Op.getOperand(0),
+                     Op.getOperand(0),
+                     DAG.getNode(ISD::SUB, DL, VT,
+                                 DAG.getConstant(32, MVT::i32),
+                                 Op.getOperand(1)));
+}
+
+bool R600TargetLowering::isZero(SDValue Op) const {
+  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+    return Cst->isNullValue();
+  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
+    return CstFP->isZero();
+  } else {
+    return false;
+  }
+}
+
+SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue True = Op.getOperand(2);
+  SDValue False = Op.getOperand(3);
+  SDValue CC = Op.getOperand(4);
+  SDValue Temp;
+
+  // LHS and RHS are guaranteed to be the same value type
+  EVT CompareVT = LHS.getValueType();
+
+  // Check if we can lower this to a native operation.
+
+  // Try to lower to a SET* instruction:
+  //
+  // SET* can match the following patterns:
+  //
+  // select_cc f32, f32, -1,  0, cc_any
+  // select_cc f32, f32, 1.0f, 0.0f, cc_any
+  // select_cc i32, i32, -1,  0, cc_any
+  //
+
+  // Move hardware True/False values to the correct operand.
+  if (isHWTrueValue(False) && isHWFalseValue(True)) {
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    std::swap(False, True);
+    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
+  }
+
+  if (isHWTrueValue(True) && isHWFalseValue(False) &&
+      (CompareVT == VT || VT == MVT::i32)) {
+    // This can be matched by a SET* instruction.
+    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+  }
+
+  // Try to lower to a CND* instruction:
+  //
+  // CND* can match the following patterns:
+  //
+  // select_cc f32, 0.0, f32, f32, cc_any
+  // select_cc f32, 0.0, i32, i32, cc_any
+  // select_cc i32, 0,   f32, f32, cc_any
+  // select_cc i32, 0,   i32, i32, cc_any
+  //
+  if (isZero(LHS) || isZero(RHS)) {
+    SDValue Cond = (isZero(LHS) ? RHS : LHS);
+    SDValue Zero = (isZero(LHS) ? LHS : RHS);
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    if (CompareVT != VT) {
+      // Bitcast True / False to the correct types.  This will end up being
+      // a nop, but it allows us to define only a single pattern in the
+      // .TD files for each CND* instruction rather than having to have
+      // one pattern for integer True/False and one for fp True/False
+      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
+      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
+    }
+    if (isZero(LHS)) {
+      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
+    }
+
+    switch (CCOpcode) {
+    case ISD::SETONE:
+    case ISD::SETUNE:
+    case ISD::SETNE:
+    case ISD::SETULE:
+    case ISD::SETULT:
+    case ISD::SETOLE:
+    case ISD::SETOLT:
+    case ISD::SETLE:
+    case ISD::SETLT:
+      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+      Temp = True;
+      True = False;
+      False = Temp;
+      break;
+    default:
+      break;
+    }
+    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
+        Cond, Zero,
+        True, False,
+        DAG.getCondCode(CCOpcode));
+    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
+  }
+
+
+  // Possible Min/Max pattern
+  SDValue MinMax = LowerMinMax(Op, DAG);
+  if (MinMax.getNode()) {
+    return MinMax;
+  }
+
+  // If we make it this for it means we have no native instructions to handle
+  // this SELECT_CC, so we must lower it.
+  SDValue HWTrue, HWFalse;
+
+  if (CompareVT == MVT::f32) {
+    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
+    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
+  } else if (CompareVT == MVT::i32) {
+    HWTrue = DAG.getConstant(-1, CompareVT);
+    HWFalse = DAG.getConstant(0, CompareVT);
+  }
+  else {
+    assert(!"Unhandled value type in LowerSELECT_CC");
+  }
+
+  // Lower this unsupported SELECT_CC into a combination of two supported
+  // SELECT_CC operations.
+  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
+
+  return DAG.getNode(ISD::SELECT_CC, DL, VT,
+      Cond, HWFalse,
+      True, False,
+      DAG.getCondCode(ISD::SETNE));
+}
+
+SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  return DAG.getNode(ISD::SELECT_CC,
+      Op.getDebugLoc(),
+      Op.getValueType(),
+      Op.getOperand(0),
+      DAG.getConstant(0, MVT::i32),
+      Op.getOperand(1),
+      Op.getOperand(2),
+      DAG.getCondCode(ISD::SETNE));
+}
+
+/// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
+/// convert these pointers to a register index.  Each register holds
+/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
+/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
+/// for indirect addressing.
+SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
+                                               unsigned StackWidth,
+                                               SelectionDAG &DAG) const {
+  unsigned SRLPad;
+  switch(StackWidth) {
+  case 1:
+    SRLPad = 2;
+    break;
+  case 2:
+    SRLPad = 3;
+    break;
+  case 4:
+    SRLPad = 4;
+    break;
+  default: llvm_unreachable("Invalid stack width");
+  }
+
+  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
+                     DAG.getConstant(SRLPad, MVT::i32));
+}
+
+void R600TargetLowering::getStackAddress(unsigned StackWidth,
+                                         unsigned ElemIdx,
+                                         unsigned &Channel,
+                                         unsigned &PtrIncr) const {
+  switch (StackWidth) {
+  default:
+  case 1:
+    Channel = 0;
+    if (ElemIdx > 0) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 2:
+    Channel = ElemIdx % 2;
+    if (ElemIdx == 2) {
+      PtrIncr = 1;
+    } else {
+      PtrIncr = 0;
+    }
+    break;
+  case 4:
+    Channel = ElemIdx;
+    PtrIncr = 0;
+    break;
+  }
+}
+
+SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Value = Op.getOperand(1);
+  SDValue Ptr = Op.getOperand(2);
+
+  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
+    // Convert pointer from byte address to dword address.
+    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
+                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
+                                  Ptr, DAG.getConstant(2, MVT::i32)));
+
+    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
+      assert(!"Truncated and indexed stores not supported yet");
+    } else {
+      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+    }
+    return Chain;
+  }
+
+  EVT ValueVT = Value.getValueType();
+
+  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
+
+  // Lowering for indirect addressing
+
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+                                         getTargetMachine().getFrameLowering());
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (ValueVT.isVector()) {
+    unsigned NumElemVT = ValueVT.getVectorNumElements();
+    EVT ElemVT = ValueVT.getVectorElementType();
+    SDValue Stores[4];
+
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in load");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, MVT::i32));
+      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
+                                 Value, DAG.getConstant(i, MVT::i32));
+
+      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                              Chain, Elem, Ptr,
+                              DAG.getTargetConstant(Channel, MVT::i32));
+    }
+     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
+   } else {
+    if (ValueVT == MVT::i8) {
+      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
+    }
+    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
+    DAG.getTargetConstant(0, MVT::i32)); // Channel 
+  }
+
+  return Chain;
+}
+
+// return (512 + (kc_bank << 12)
+static int
+ConstantAddressBlock(unsigned AddressSpace) {
+  switch (AddressSpace) {
+  case AMDGPUAS::CONSTANT_BUFFER_0:
+    return 512;
+  case AMDGPUAS::CONSTANT_BUFFER_1:
+    return 512 + 4096;
+  case AMDGPUAS::CONSTANT_BUFFER_2:
+    return 512 + 4096 * 2;
+  case AMDGPUAS::CONSTANT_BUFFER_3:
+    return 512 + 4096 * 3;
+  case AMDGPUAS::CONSTANT_BUFFER_4:
+    return 512 + 4096 * 4;
+  case AMDGPUAS::CONSTANT_BUFFER_5:
+    return 512 + 4096 * 5;
+  case AMDGPUAS::CONSTANT_BUFFER_6:
+    return 512 + 4096 * 6;
+  case AMDGPUAS::CONSTANT_BUFFER_7:
+    return 512 + 4096 * 7;
+  case AMDGPUAS::CONSTANT_BUFFER_8:
+    return 512 + 4096 * 8;
+  case AMDGPUAS::CONSTANT_BUFFER_9:
+    return 512 + 4096 * 9;
+  case AMDGPUAS::CONSTANT_BUFFER_10:
+    return 512 + 4096 * 10;
+  case AMDGPUAS::CONSTANT_BUFFER_11:
+    return 512 + 4096 * 11;
+  case AMDGPUAS::CONSTANT_BUFFER_12:
+    return 512 + 4096 * 12;
+  case AMDGPUAS::CONSTANT_BUFFER_13:
+    return 512 + 4096 * 13;
+  case AMDGPUAS::CONSTANT_BUFFER_14:
+    return 512 + 4096 * 14;
+  case AMDGPUAS::CONSTANT_BUFFER_15:
+    return 512 + 4096 * 15;
+  default:
+    return -1;
+  }
+}
+
+SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT VT = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Ptr = Op.getOperand(1);
+  SDValue LoweredLoad;
+
+  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+  if (ConstantBlock > -1) {
+    SDValue Result;
+    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
+        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
+        dyn_cast<ConstantSDNode>(Ptr)) {
+      SDValue Slots[4];
+      for (unsigned i = 0; i < 4; i++) {
+        // We want Const position encoded with the following formula :
+        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+        // const_index is Ptr computed by llvm using an alignment of 16.
+        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
+        // then div by 4 at the ISel step
+        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
+        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+      }
+      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
+    } else {
+      // non constant ptr cant be folded, keeps it as a v4f32 load
+      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
+          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
+          DAG.getConstant(LoadNode->getAddressSpace() -
+	                  AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
+          );
+    }
+
+    if (!VT.isVector()) {
+      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+          DAG.getConstant(0, MVT::i32));
+    }
+
+    SDValue MergedValues[2] = {
+        Result,
+        Chain
+    };
+    return DAG.getMergeValues(MergedValues, 2, DL);
+  }
+
+  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+    return SDValue();
+  }
+
+  // Lowering for indirect addressing
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
+                                         getTargetMachine().getFrameLowering());
+  unsigned StackWidth = TFL->getStackWidth(MF);
+
+  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
+
+  if (VT.isVector()) {
+    unsigned NumElemVT = VT.getVectorNumElements();
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Loads[4];
+
+    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
+                                      "vector width in load");
+
+    for (unsigned i = 0; i < NumElemVT; ++i) {
+      unsigned Channel, PtrIncr;
+      getStackAddress(StackWidth, i, Channel, PtrIncr);
+      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
+                        DAG.getConstant(PtrIncr, MVT::i32));
+      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
+                             Chain, Ptr,
+                             DAG.getTargetConstant(Channel, MVT::i32),
+                             Op.getOperand(2));
+    }
+    for (unsigned i = NumElemVT; i < 4; ++i) {
+      Loads[i] = DAG.getUNDEF(ElemVT);
+    }
+    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
+    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
+  } else {
+    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
+                              Chain, Ptr,
+                              DAG.getTargetConstant(0, MVT::i32), // Channel
+                              Op.getOperand(2));
+  }
+
+  SDValue Ops[2];
+  Ops[0] = LoweredLoad;
+  Ops[1] = Chain;
+
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
+/// XXX Only kernel functions are supported, so we can assume for now that
+/// every function is a kernel function, but in the future we should use
+/// separate calling conventions for kernel and non-kernel functions.
+SDValue R600TargetLowering::LowerFormalArguments(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      DebugLoc DL, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+  unsigned ParamOffsetBytes = 36;
+  Function::const_arg_iterator FuncArg =
+                            DAG.getMachineFunction().getFunction()->arg_begin();
+  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
+    EVT VT = Ins[i].VT;
+    Type *ArgType = FuncArg->getType();
+    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
+                             32 : ArgType->getPrimitiveSizeInBits();
+    unsigned ArgBytes = ArgSizeInBits >> 3;
+    EVT ArgVT;
+    if (ArgSizeInBits < VT.getSizeInBits()) {
+      assert(!ArgType->isFloatTy() &&
+             "Extending floating point arguments not supported yet");
+      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
+    } else {
+      ArgVT = VT;
+    }
+    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                                    AMDGPUAS::PARAM_I_ADDRESS);
+    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
+                                DAG.getConstant(ParamOffsetBytes, MVT::i32),
+                                       MachinePointerInfo(UndefValue::get(PtrTy)),
+                                       ArgVT, false, false, ArgBytes);
+    InVals.push_back(Arg);
+    ParamOffsetBytes += ArgBytes;
+  }
+  return Chain;
+}
+
+EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
+   if (!VT.isVector()) return MVT::i32;
+   return VT.changeVectorElementTypeToInteger();
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  switch (N->getOpcode()) {
+  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
+  case ISD::FP_ROUND: {
+      SDValue Arg = N->getOperand(0);
+      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
+        return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
+                           Arg.getOperand(0));
+      }
+      break;
+    }
+
+  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
+  // (i32 select_cc f32, f32, -1, 0 cc)
+  //
+  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
+  // this to one of the SET*_DX10 instructions.
+  case ISD::FP_TO_SINT: {
+    SDValue FNeg = N->getOperand(0);
+    if (FNeg.getOpcode() != ISD::FNEG) {
+      return SDValue();
+    }
+    SDValue SelectCC = FNeg.getOperand(0);
+    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
+        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
+        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
+        !isHWTrueValue(SelectCC.getOperand(2)) ||
+        !isHWFalseValue(SelectCC.getOperand(3))) {
+      return SDValue();
+    }
+
+    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
+                           SelectCC.getOperand(0), // LHS
+                           SelectCC.getOperand(1), // RHS
+                           DAG.getConstant(-1, MVT::i32), // True
+                           DAG.getConstant(0, MVT::i32),  // Flase
+                           SelectCC.getOperand(4)); // CC
+
+    break;
+  }
+  // Extract_vec (Build_vector) generated by custom lowering
+  // also needs to be customly combined
+  case ISD::EXTRACT_VECTOR_ELT: {
+    SDValue Arg = N->getOperand(0);
+    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
+      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+        unsigned Element = Const->getZExtValue();
+        return Arg->getOperand(Element);
+      }
+    }
+    if (Arg.getOpcode() == ISD::BITCAST &&
+        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+        unsigned Element = Const->getZExtValue();
+        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
+            Arg->getOperand(0).getOperand(Element));
+      }
+    }
+  }
+
+  case ISD::SELECT_CC: {
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
+    //      selectcc x, y, a, b, inv(cc)
+    //
+    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
+    //      selectcc x, y, a, b, cc
+    SDValue LHS = N->getOperand(0);
+    if (LHS.getOpcode() != ISD::SELECT_CC) {
+      return SDValue();
+    }
+
+    SDValue RHS = N->getOperand(1);
+    SDValue True = N->getOperand(2);
+    SDValue False = N->getOperand(3);
+    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+    if (LHS.getOperand(2).getNode() != True.getNode() ||
+        LHS.getOperand(3).getNode() != False.getNode() ||
+        RHS.getNode() != False.getNode()) {
+      return SDValue();
+    }
+
+    switch (NCC) {
+    default: return SDValue();
+    case ISD::SETNE: return LHS;
+    case ISD::SETEQ: {
+      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
+      LHSCC = ISD::getSetCCInverse(LHSCC,
+                                  LHS.getOperand(0).getValueType().isInteger());
+      return DAG.getSelectCC(N->getDebugLoc(),
+                             LHS.getOperand(0),
+                             LHS.getOperand(1),
+                             LHS.getOperand(2),
+                             LHS.getOperand(3),
+                             LHSCC);
+    }
+    }
+  }
+  case AMDGPUISD::EXPORT: {
+    SDValue Arg = N->getOperand(1);
+    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+      break;
+    SDValue NewBldVec[4] = {
+        DAG.getUNDEF(MVT::f32),
+        DAG.getUNDEF(MVT::f32),
+        DAG.getUNDEF(MVT::f32),
+        DAG.getUNDEF(MVT::f32)
+      };
+    SDValue NewArgs[8] = {
+      N->getOperand(0), // Chain
+      SDValue(),
+      N->getOperand(2), // ArrayBase
+      N->getOperand(3), // Type
+      N->getOperand(4), // SWZ_X
+      N->getOperand(5), // SWZ_Y
+      N->getOperand(6), // SWZ_Z
+      N->getOperand(7) // SWZ_W
+    };
+    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
+      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
+        if (C->isZero()) {
+          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
+        } else if (C->isExactlyValue(1.0)) {
+          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
+        } else {
+          NewBldVec[i] = Arg.getOperand(i);
+        }
+      } else {
+        NewBldVec[i] = Arg.getOperand(i);
+      }
+    }
+    DebugLoc DL = N->getDebugLoc();
+    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
+    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
+  }
+  }
+  return SDValue();
+}
diff --git a/contrib/llvm/lib/Target/R600/R600ISelLowering.h b/contrib/llvm/lib/Target/R600/R600ISelLowering.h
new file mode 100644
index 000000000000..2c09acb9af30
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600ISelLowering.h
@@ -0,0 +1,74 @@
+//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600ISELLOWERING_H
+#define R600ISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+
+namespace llvm {
+
+class R600InstrInfo;
+
+class R600TargetLowering : public AMDGPUTargetLowering {
+public:
+  R600TargetLowering(TargetMachine &TM);
+  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
+      MachineBasicBlock * BB) const;
+  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  void ReplaceNodeResults(SDNode * N,
+      SmallVectorImpl<SDValue> &Results,
+      SelectionDAG &DAG) const;
+  virtual SDValue LowerFormalArguments(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      DebugLoc DL, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const;
+  virtual EVT getSetCCResultType(EVT VT) const;
+private:
+  const R600InstrInfo * TII;
+
+  /// Each OpenCL kernel has nine implicit parameters that are stored in the
+  /// first nine dwords of a Vertex Buffer.  These implicit parameters are
+  /// lowered to load instructions which retreive the values from the Vertex
+  /// Buffer.
+  SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
+                                 DebugLoc DL, unsigned DwordOffset) const;
+
+  void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
+      MachineRegisterInfo & MRI, unsigned dword_offset) const;
+
+  /// \brief Lower ROTL opcode to BITALIGN
+  SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
+                                          SelectionDAG &DAG) const;
+  void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
+                       unsigned &Channel, unsigned &PtrIncr) const;
+  bool isZero(SDValue Op) const;
+};
+
+} // End namespace llvm;
+
+#endif // R600ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp b/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp
new file mode 100644
index 000000000000..b232188a2641
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600InstrInfo.cpp
@@ -0,0 +1,841 @@
+//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Implementation of TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600InstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600Defines.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define GET_INSTRINFO_CTOR
+#include "AMDGPUGenDFAPacketizer.inc"
+
+using namespace llvm;
+
+R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
+  : AMDGPUInstrInfo(tm),
+    RI(tm, *this)
+  { }
+
+const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+bool R600InstrInfo::isTrig(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG;
+}
+
+bool R600InstrInfo::isVector(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
+}
+
+void
+R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const {
+  if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
+      && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+    for (unsigned I = 0; I < 4; I++) {
+      unsigned SubRegIndex = RI.getSubRegFromChannel(I);
+      buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+                              RI.getSubReg(DestReg, SubRegIndex),
+                              RI.getSubReg(SrcReg, SubRegIndex))
+                              .addReg(DestReg,
+                                      RegState::Define | RegState::Implicit);
+    }
+  } else {
+
+    // We can't copy vec4 registers
+    assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
+           && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
+
+    MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
+                                                  DestReg, SrcReg);
+    NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0))
+                                    .setIsKill(KillSrc);
+  }
+}
+
+MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF,
+                                             unsigned DstReg, int64_t Imm) const {
+  MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc());
+  MachineInstrBuilder MIB(*MF, MI);
+  MIB.addReg(DstReg, RegState::Define);
+  MIB.addReg(AMDGPU::ALU_LITERAL_X);
+  MIB.addImm(Imm);
+  MIB.addReg(0); // PREDICATE_BIT
+
+  return MI;
+}
+
+unsigned R600InstrInfo::getIEQOpcode() const {
+  return AMDGPU::SETE_INT;
+}
+
+bool R600InstrInfo::isMov(unsigned Opcode) const {
+
+
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::MOV:
+  case AMDGPU::MOV_IMM_F32:
+  case AMDGPU::MOV_IMM_I32:
+    return true;
+  }
+}
+
+// Some instructions act as place holders to emulate operations that the GPU
+// hardware does automatically. This function can be used to check if
+// an opcode falls into this category.
+bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return false;
+  case AMDGPU::RETURN:
+    return true;
+  }
+}
+
+bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
+  switch(Opcode) {
+    default: return false;
+    case AMDGPU::DOT4_r600_pseudo:
+    case AMDGPU::DOT4_eg_pseudo:
+      return true;
+  }
+}
+
+bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
+  switch(Opcode) {
+    default: return false;
+    case AMDGPU::CUBE_r600_pseudo:
+    case AMDGPU::CUBE_r600_real:
+    case AMDGPU::CUBE_eg_pseudo:
+    case AMDGPU::CUBE_eg_real:
+      return true;
+  }
+}
+
+bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return ((TargetFlags & R600_InstFlag::OP1) |
+          (TargetFlags & R600_InstFlag::OP2) |
+          (TargetFlags & R600_InstFlag::OP3));
+}
+
+bool
+R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
+    const {
+  assert (Consts.size() <= 12 && "Too many operands in instructions group");
+  unsigned Pair1 = 0, Pair2 = 0;
+  for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+    unsigned ReadConstHalf = Consts[i] & 2;
+    unsigned ReadConstIndex = Consts[i] & (~3);
+    unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
+    if (!Pair1) {
+      Pair1 = ReadHalfConst;
+      continue;
+    }
+    if (Pair1 == ReadHalfConst)
+      continue;
+    if (!Pair2) {
+      Pair2 = ReadHalfConst;
+      continue;
+    }
+    if (Pair2 != ReadHalfConst)
+      return false;
+  }
+  return true;
+}
+
+bool
+R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
+  std::vector<unsigned> Consts;
+  for (unsigned i = 0, n = MIs.size(); i < n; i++) {
+    const MachineInstr *MI = MIs[i];
+
+    const R600Operands::Ops OpTable[3][2] = {
+      {R600Operands::SRC0, R600Operands::SRC0_SEL},
+      {R600Operands::SRC1, R600Operands::SRC1_SEL},
+      {R600Operands::SRC2, R600Operands::SRC2_SEL},
+    };
+
+    if (!isALUInstr(MI->getOpcode()))
+      continue;
+
+    for (unsigned j = 0; j < 3; j++) {
+      int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+      if (SrcIdx < 0)
+        break;
+      if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
+        unsigned Const = MI->getOperand(
+            getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
+        Consts.push_back(Const);
+      }
+    }
+  }
+  return fitsConstReadLimitations(Consts);
+}
+
+DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
+    const ScheduleDAG *DAG) const {
+  const InstrItineraryData *II = TM->getInstrItineraryData();
+  return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II);
+}
+
+static bool
+isPredicateSetter(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::PRED_X:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static MachineInstr *
+findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I) {
+  while (I != MBB.begin()) {
+    --I;
+    MachineInstr *MI = I;
+    if (isPredicateSetter(MI->getOpcode()))
+      return MI;
+  }
+
+  return NULL;
+}
+
+static
+bool isJump(unsigned Opcode) {
+  return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
+}
+
+bool
+R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                             MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const {
+  // Most of the following comes from the ARM implementation of AnalyzeBranch
+
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) {
+    return false;
+  }
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() ||
+          !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) {
+    if (LastOpc == AMDGPU::JUMP) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (LastOpc == AMDGPU::JUMP_COND) {
+      MachineInstr *predSet = I;
+      while (!isPredicateSetter(predSet->getOpcode())) {
+        predSet = --I;
+      }
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(predSet->getOperand(1));
+      Cond.push_back(predSet->getOperand(2));
+      Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+      return false;
+    }
+    return true;  // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) {
+    MachineInstr *predSet = --I;
+    while (!isPredicateSetter(predSet->getOpcode())) {
+      predSet = --I;
+    }
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    FBB = LastInst->getOperand(0).getMBB();
+    Cond.push_back(predSet->getOperand(1));
+    Cond.push_back(predSet->getOperand(2));
+    Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+int R600InstrInfo::getBranchInstr(const MachineOperand &op) const {
+  const MachineInstr *MI = op.getParent();
+
+  switch (MI->getDesc().OpInfo->RegClass) {
+  default: // FIXME: fallthrough??
+  case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32;
+  case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32;
+  };
+}
+
+unsigned
+R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                            MachineBasicBlock *TBB,
+                            MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond,
+                            DebugLoc DL) const {
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+  if (FBB == 0) {
+    if (Cond.empty()) {
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
+      return 1;
+    } else {
+      MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
+      assert(PredSet && "No previous predicate !");
+      addFlag(PredSet, 0, MO_FLAG_PUSH);
+      PredSet->getOperand(2).setImm(Cond[1].getImm());
+
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+             .addMBB(TBB)
+             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+      return 1;
+    }
+  } else {
+    MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
+    assert(PredSet && "No previous predicate !");
+    addFlag(PredSet, 0, MO_FLAG_PUSH);
+    PredSet->getOperand(2).setImm(Cond[1].getImm());
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
+            .addMBB(TBB)
+            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
+    return 2;
+  }
+}
+
+unsigned
+R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+
+  // Note : we leave PRED* instructions there.
+  // They may be needed when predicating instructions.
+
+  MachineBasicBlock::iterator I = MBB.end();
+
+  if (I == MBB.begin()) {
+    return 0;
+  }
+  --I;
+  switch (I->getOpcode()) {
+  default:
+    return 0;
+  case AMDGPU::JUMP_COND: {
+    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+    clearFlag(predSet, 0, MO_FLAG_PUSH);
+    I->eraseFromParent();
+    break;
+  }
+  case AMDGPU::JUMP:
+    I->eraseFromParent();
+    break;
+  }
+  I = MBB.end();
+
+  if (I == MBB.begin()) {
+    return 1;
+  }
+  --I;
+  switch (I->getOpcode()) {
+    // FIXME: only one case??
+  default:
+    return 1;
+  case AMDGPU::JUMP_COND: {
+    MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+    clearFlag(predSet, 0, MO_FLAG_PUSH);
+    I->eraseFromParent();
+    break;
+  }
+  case AMDGPU::JUMP:
+    I->eraseFromParent();
+    break;
+  }
+  return 2;
+}
+
+bool
+R600InstrInfo::isPredicated(const MachineInstr *MI) const {
+  int idx = MI->findFirstPredOperandIdx();
+  if (idx < 0)
+    return false;
+
+  unsigned Reg = MI->getOperand(idx).getReg();
+  switch (Reg) {
+  default: return false;
+  case AMDGPU::PRED_SEL_ONE:
+  case AMDGPU::PRED_SEL_ZERO:
+  case AMDGPU::PREDICATE_BIT:
+    return true;
+  }
+}
+
+bool
+R600InstrInfo::isPredicable(MachineInstr *MI) const {
+  // XXX: KILL* instructions can be predicated, but they must be the last
+  // instruction in a clause, so this means any instructions after them cannot
+  // be predicated.  Until we have proper support for instruction clauses in the
+  // backend, we will mark KILL* instructions as unpredicable.
+
+  if (MI->getOpcode() == AMDGPU::KILLGT) {
+    return false;
+  } else if (isVector(*MI)) {
+    return false;
+  } else {
+    return AMDGPUInstrInfo::isPredicable(MI);
+  }
+}
+
+
+bool
+R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
+                                   unsigned NumCyles,
+                                   unsigned ExtraPredCycles,
+                                   const BranchProbability &Probability) const{
+  return true;
+}
+
+bool
+R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                   unsigned NumTCycles,
+                                   unsigned ExtraTCycles,
+                                   MachineBasicBlock &FMBB,
+                                   unsigned NumFCycles,
+                                   unsigned ExtraFCycles,
+                                   const BranchProbability &Probability) const {
+  return true;
+}
+
+bool
+R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+                                         unsigned NumCyles,
+                                         const BranchProbability &Probability)
+                                         const {
+  return true;
+}
+
+bool
+R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                         MachineBasicBlock &FMBB) const {
+  return false;
+}
+
+
+bool
+R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  MachineOperand &MO = Cond[1];
+  switch (MO.getImm()) {
+  case OPCODE_IS_ZERO_INT:
+    MO.setImm(OPCODE_IS_NOT_ZERO_INT);
+    break;
+  case OPCODE_IS_NOT_ZERO_INT:
+    MO.setImm(OPCODE_IS_ZERO_INT);
+    break;
+  case OPCODE_IS_ZERO:
+    MO.setImm(OPCODE_IS_NOT_ZERO);
+    break;
+  case OPCODE_IS_NOT_ZERO:
+    MO.setImm(OPCODE_IS_ZERO);
+    break;
+  default:
+    return true;
+  }
+
+  MachineOperand &MO2 = Cond[2];
+  switch (MO2.getReg()) {
+  case AMDGPU::PRED_SEL_ZERO:
+    MO2.setReg(AMDGPU::PRED_SEL_ONE);
+    break;
+  case AMDGPU::PRED_SEL_ONE:
+    MO2.setReg(AMDGPU::PRED_SEL_ZERO);
+    break;
+  default:
+    return true;
+  }
+  return false;
+}
+
+bool
+R600InstrInfo::DefinesPredicate(MachineInstr *MI,
+                                std::vector<MachineOperand> &Pred) const {
+  return isPredicateSetter(MI->getOpcode());
+}
+
+
+bool
+R600InstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                       const SmallVectorImpl<MachineOperand> &Pred2) const {
+  return false;
+}
+
+
+bool
+R600InstrInfo::PredicateInstruction(MachineInstr *MI,
+                      const SmallVectorImpl<MachineOperand> &Pred) const {
+  int PIdx = MI->findFirstPredOperandIdx();
+
+  if (PIdx != -1) {
+    MachineOperand &PMO = MI->getOperand(PIdx);
+    PMO.setReg(Pred[2].getReg());
+    MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+    return true;
+  }
+
+  return false;
+}
+
+unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+                                            const MachineInstr *MI,
+                                            unsigned *PredCost) const {
+  if (PredCost)
+    *PredCost = 2;
+  return 2;
+}
+
+int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int Offset = 0;
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  if (MRI.livein_empty()) {
+    return 0;
+  }
+
+  for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+                                            LE = MRI.livein_end();
+                                            LI != LE; ++LI) {
+    Offset = std::max(Offset,
+                      GET_REG_INDEX(RI.getEncodingValue(LI->first)));
+  }
+
+  return Offset + 1;
+}
+
+int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+  int Offset = 0;
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Variable sized objects are not supported
+  assert(!MFI->hasVarSizedObjects());
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
+
+  return getIndirectIndexBegin(MF) + Offset;
+}
+
+std::vector<unsigned> R600InstrInfo::getIndirectReservedRegs(
+                                             const MachineFunction &MF) const {
+  const AMDGPUFrameLowering *TFL =
+                 static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering());
+  std::vector<unsigned> Regs;
+
+  unsigned StackWidth = TFL->getStackWidth(MF);
+  int End = getIndirectIndexEnd(MF);
+
+  if (End == -1) {
+    return Regs;
+  }
+
+  for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
+    unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
+    Regs.push_back(SuperReg);
+    for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
+      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
+      Regs.push_back(Reg);
+    }
+  }
+  return Regs;
+}
+
+unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
+                                                 unsigned Channel) const {
+  // XXX: Remove when we support a stack width > 2
+  assert(Channel == 0);
+  return RegIndex;
+}
+
+const TargetRegisterClass * R600InstrInfo::getIndirectAddrStoreRegClass(
+                                                     unsigned SourceReg) const {
+  return &AMDGPU::R600_TReg32RegClass;
+}
+
+const TargetRegisterClass *R600InstrInfo::getIndirectAddrLoadRegClass() const {
+  return &AMDGPU::TRegMemRegClass;
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg) const {
+  unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
+  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
+                                               AMDGPU::AR_X, OffsetReg);
+  setImmOperand(MOVA, R600Operands::WRITE, 0);
+
+  MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+                                      AddrReg, ValueReg)
+                                      .addReg(AMDGPU::AR_X, RegState::Implicit);
+  setImmOperand(Mov, R600Operands::DST_REL, 1);
+  return Mov;
+}
+
+MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned ValueReg, unsigned Address,
+                                       unsigned OffsetReg) const {
+  unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
+  MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
+                                                       AMDGPU::AR_X,
+                                                       OffsetReg);
+  setImmOperand(MOVA, R600Operands::WRITE, 0);
+  MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
+                                      ValueReg,
+                                      AddrReg)
+                                      .addReg(AMDGPU::AR_X, RegState::Implicit);
+  setImmOperand(Mov, R600Operands::SRC0_REL, 1);
+
+  return Mov;
+}
+
+const TargetRegisterClass *R600InstrInfo::getSuperIndirectRegClass() const {
+  return &AMDGPU::IndirectRegRegClass;
+}
+
+unsigned R600InstrInfo::getMaxAlusPerClause() const {
+  return 115;
+}
+
+MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB,
+                                                  MachineBasicBlock::iterator I,
+                                                  unsigned Opcode,
+                                                  unsigned DstReg,
+                                                  unsigned Src0Reg,
+                                                  unsigned Src1Reg) const {
+  MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode),
+    DstReg);           // $dst
+
+  if (Src1Reg) {
+    MIB.addImm(0)     // $update_exec_mask
+       .addImm(0);    // $update_predicate
+  }
+  MIB.addImm(1)        // $write
+     .addImm(0)        // $omod
+     .addImm(0)        // $dst_rel
+     .addImm(0)        // $dst_clamp
+     .addReg(Src0Reg)  // $src0
+     .addImm(0)        // $src0_neg
+     .addImm(0)        // $src0_rel
+     .addImm(0)        // $src0_abs
+     .addImm(-1);       // $src0_sel
+
+  if (Src1Reg) {
+    MIB.addReg(Src1Reg) // $src1
+       .addImm(0)       // $src1_neg
+       .addImm(0)       // $src1_rel
+       .addImm(0)       // $src1_abs
+       .addImm(-1);      // $src1_sel
+  }
+
+  //XXX: The r600g finalizer expects this to be 1, once we've moved the
+  //scheduling to the backend, we can change the default to 0.
+  MIB.addImm(1)        // $last
+      .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
+      .addImm(0);        // $literal
+
+  return MIB;
+}
+
+MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned DstReg,
+                                         uint64_t Imm) const {
+  MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
+                                                  AMDGPU::ALU_LITERAL_X);
+  setImmOperand(MovImm, R600Operands::IMM, Imm);
+  return MovImm;
+}
+
+int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
+                                 R600Operands::Ops Op) const {
+  return getOperandIdx(MI.getOpcode(), Op);
+}
+
+int R600InstrInfo::getOperandIdx(unsigned Opcode,
+                                 R600Operands::Ops Op) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+  unsigned OpTableIdx;
+
+  if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
+    switch (Op) {
+    case R600Operands::DST: return 0;
+    case R600Operands::SRC0: return 1;
+    case R600Operands::SRC1: return 2;
+    case R600Operands::SRC2: return 3;
+    default:
+      assert(!"Unknown operand type for instruction");
+      return -1;
+    }
+  }
+
+  if (TargetFlags & R600_InstFlag::OP1) {
+    OpTableIdx = 0;
+  } else if (TargetFlags & R600_InstFlag::OP2) {
+    OpTableIdx = 1;
+  } else {
+    assert((TargetFlags & R600_InstFlag::OP3) && "OP1, OP2, or OP3 not defined "
+                                                 "for this instruction");
+    OpTableIdx = 2;
+  }
+
+  return R600Operands::ALUOpTable[OpTableIdx][Op];
+}
+
+void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
+                                  int64_t Imm) const {
+  int Idx = getOperandIdx(*MI, Op);
+  assert(Idx != -1 && "Operand not supported for this instruction.");
+  assert(MI->getOperand(Idx).isImm());
+  MI->getOperand(Idx).setImm(Imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction flag getters/setters
+//===----------------------------------------------------------------------===//
+
+bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const {
+  return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0;
+}
+
+MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
+                                         unsigned Flag) const {
+  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+  int FlagIndex = 0;
+  if (Flag != 0) {
+    // If we pass something other than the default value of Flag to this
+    // function, it means we are want to set a flag on an instruction
+    // that uses native encoding.
+    assert(HAS_NATIVE_OPERANDS(TargetFlags));
+    bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
+    switch (Flag) {
+    case MO_FLAG_CLAMP:
+      FlagIndex = getOperandIdx(*MI, R600Operands::CLAMP);
+      break;
+    case MO_FLAG_MASK:
+      FlagIndex = getOperandIdx(*MI, R600Operands::WRITE);
+      break;
+    case MO_FLAG_NOT_LAST:
+    case MO_FLAG_LAST:
+      FlagIndex = getOperandIdx(*MI, R600Operands::LAST);
+      break;
+    case MO_FLAG_NEG:
+      switch (SrcIdx) {
+      case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_NEG); break;
+      case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_NEG); break;
+      case 2: FlagIndex = getOperandIdx(*MI, R600Operands::SRC2_NEG); break;
+      }
+      break;
+
+    case MO_FLAG_ABS:
+      assert(!IsOP3 && "Cannot set absolute value modifier for OP3 "
+                       "instructions.");
+      (void)IsOP3;
+      switch (SrcIdx) {
+      case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_ABS); break;
+      case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_ABS); break;
+      }
+      break;
+
+    default:
+      FlagIndex = -1;
+      break;
+    }
+    assert(FlagIndex != -1 && "Flag not supported for this instruction");
+  } else {
+      FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags);
+      assert(FlagIndex != 0 &&
+         "Instruction flags not supported for this instruction");
+  }
+
+  MachineOperand &FlagOp = MI->getOperand(FlagIndex);
+  assert(FlagOp.isImm());
+  return FlagOp;
+}
+
+void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand,
+                            unsigned Flag) const {
+  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+  if (Flag == 0) {
+    return;
+  }
+  if (HAS_NATIVE_OPERANDS(TargetFlags)) {
+    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
+    if (Flag == MO_FLAG_NOT_LAST) {
+      clearFlag(MI, Operand, MO_FLAG_LAST);
+    } else if (Flag == MO_FLAG_MASK) {
+      clearFlag(MI, Operand, Flag);
+    } else {
+      FlagOp.setImm(1);
+    }
+  } else {
+      MachineOperand &FlagOp = getFlagOp(MI, Operand);
+      FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand)));
+  }
+}
+
+void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand,
+                              unsigned Flag) const {
+  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+  if (HAS_NATIVE_OPERANDS(TargetFlags)) {
+    MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag);
+    FlagOp.setImm(0);
+  } else {
+    MachineOperand &FlagOp = getFlagOp(MI);
+    unsigned InstFlags = FlagOp.getImm();
+    InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand));
+    FlagOp.setImm(InstFlags);
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/R600InstrInfo.h b/contrib/llvm/lib/Target/R600/R600InstrInfo.h
new file mode 100644
index 000000000000..dbae90013d22
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600InstrInfo.h
@@ -0,0 +1,204 @@
+//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600InstrInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600INSTRUCTIONINFO_H_
+#define R600INSTRUCTIONINFO_H_
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDIL.h"
+#include "R600Defines.h"
+#include "R600RegisterInfo.h"
+#include <map>
+
+namespace llvm {
+
+  class AMDGPUTargetMachine;
+  class DFAPacketizer;
+  class ScheduleDAG;
+  class MachineFunction;
+  class MachineInstr;
+  class MachineInstrBuilder;
+
+  class R600InstrInfo : public AMDGPUInstrInfo {
+  private:
+  const R600RegisterInfo RI;
+
+  int getBranchInstr(const MachineOperand &op) const;
+
+  public:
+  explicit R600InstrInfo(AMDGPUTargetMachine &tm);
+
+  const R600RegisterInfo &getRegisterInfo() const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  bool isTrig(const MachineInstr &MI) const;
+  bool isPlaceHolderOpcode(unsigned opcode) const;
+  bool isReductionOp(unsigned opcode) const;
+  bool isCubeOp(unsigned opcode) const;
+
+  /// \returns true if this \p Opcode represents an ALU instruction.
+  bool isALUInstr(unsigned Opcode) const;
+
+  bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
+  bool canBundle(const std::vector<MachineInstr *> &) const;
+
+  /// \breif Vector instructions are instructions that must fill all
+  /// instruction slots within an instruction group.
+  bool isVector(const MachineInstr &MI) const;
+
+  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
+                                        int64_t Imm) const;
+
+  virtual unsigned getIEQOpcode() const;
+  virtual bool isMov(unsigned Opcode) const;
+
+  DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
+                                           const ScheduleDAG *DAG) const;
+
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  bool isPredicated(const MachineInstr *MI) const;
+
+  bool isPredicable(MachineInstr *MI) const;
+
+  bool
+   isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+                             const BranchProbability &Probability) const;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+                           unsigned ExtraPredCycles,
+                           const BranchProbability &Probability) const ;
+
+  bool
+   isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                       unsigned NumTCycles, unsigned ExtraTCycles,
+                       MachineBasicBlock &FMBB,
+                       unsigned NumFCycles, unsigned ExtraFCycles,
+                       const BranchProbability &Probability) const;
+
+  bool DefinesPredicate(MachineInstr *MI,
+                                  std::vector<MachineOperand> &Pred) const;
+
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+
+  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                          MachineBasicBlock &FMBB) const;
+
+  bool PredicateInstruction(MachineInstr *MI,
+                        const SmallVectorImpl<MachineOperand> &Pred) const;
+
+  unsigned int getInstrLatency(const InstrItineraryData *ItinData,
+                               const MachineInstr *MI,
+                               unsigned *PredCost = 0) const;
+
+  virtual int getInstrLatency(const InstrItineraryData *ItinData,
+                              SDNode *Node) const { return 1;}
+
+  /// \returns a list of all the registers that may be accesed using indirect
+  /// addressing.
+  std::vector<unsigned> getIndirectReservedRegs(const MachineFunction &MF) const;
+
+  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
+
+  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+
+
+  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
+                                            unsigned Channel) const;
+
+  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
+                                                      unsigned SourceReg) const;
+
+  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
+
+  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned ValueReg, unsigned Address,
+                                  unsigned OffsetReg) const;
+
+  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned ValueReg, unsigned Address,
+                                  unsigned OffsetReg) const;
+
+  virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
+
+  unsigned getMaxAlusPerClause() const;
+
+  ///buildDefaultInstruction - This function returns a MachineInstr with
+  /// all the instruction modifiers initialized to their default values.
+  /// You can use this function to avoid manually specifying each instruction
+  /// modifier operand when building a new instruction.
+  ///
+  /// \returns a MachineInstr with all the instruction modifiers initialized
+  /// to their default values.
+  MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator I,
+                                              unsigned Opcode,
+                                              unsigned DstReg,
+                                              unsigned Src0Reg,
+                                              unsigned Src1Reg = 0) const;
+
+  MachineInstr *buildMovImm(MachineBasicBlock &BB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned DstReg,
+                                  uint64_t Imm) const;
+
+  /// \brief Get the index of Op in the MachineInstr.
+  ///
+  /// \returns -1 if the Instruction does not contain the specified \p Op.
+  int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
+
+  /// \brief Get the index of \p Op for the given Opcode.
+  ///
+  /// \returns -1 if the Instruction does not contain the specified \p Op.
+  int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
+
+  /// \brief Helper function for setting instruction flag values.
+  void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
+
+  /// \returns true if this instruction has an operand for storing target flags.
+  bool hasFlagOperand(const MachineInstr &MI) const;
+
+  ///\brief Add one of the MO_FLAG* flags to the specified \p Operand.
+  void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+
+  ///\brief Determine if the specified \p Flag is set on this \p Operand.
+  bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+  /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2)
+  /// \param Flag The flag being set.
+  ///
+  /// \returns the operand containing the flags for this instruction.
+  MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0,
+                            unsigned Flag = 0) const;
+
+  /// \brief Clear the specified flag on the instruction.
+  void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+};
+
+} // End llvm namespace
+
+#endif // R600INSTRINFO_H_
diff --git a/contrib/llvm/lib/Target/R600/R600Instructions.td b/contrib/llvm/lib/Target/R600/R600Instructions.td
new file mode 100644
index 000000000000..663b41a66d6f
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600Instructions.td
@@ -0,0 +1,2267 @@
+//===-- R600Instructions.td - R600 Instruction defs  -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Tablegen instruction definitions
+//
+//===----------------------------------------------------------------------===//
+
+include "R600Intrinsics.td"
+
+class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern,
+                InstrItinClass itin>
+    : AMDGPUInst <outs, ins, asm, pattern> {
+
+  field bits<64> Inst;
+  bit Trig = 0;
+  bit Op3 = 0;
+  bit isVector = 0;
+  bits<2> FlagOperandIdx = 0;
+  bit Op1 = 0;
+  bit Op2 = 0;
+  bit HasNativeOperands = 0;
+
+  bits<11> op_code = inst;
+  //let Inst = inst;
+  let Namespace = "AMDGPU";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = itin;
+
+  let TSFlags{4} = Trig;
+  let TSFlags{5} = Op3;
+
+  // Vector instructions are instructions that must fill all slots in an
+  // instruction group
+  let TSFlags{6} = isVector;
+  let TSFlags{8-7} = FlagOperandIdx;
+  let TSFlags{9} = HasNativeOperands;
+  let TSFlags{10} = Op1;
+  let TSFlags{11} = Op2;
+}
+
+class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
+    AMDGPUInst <outs, ins, asm, pattern> {
+  field bits<64> Inst;
+
+  let Namespace = "AMDGPU";
+}
+
+def MEMxi : Operand<iPTR> {
+  let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index);
+  let PrintMethod = "printMemOperand";
+}
+
+def MEMrr : Operand<iPTR> {
+  let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index);
+}
+
+// Operands for non-registers
+
+class InstFlag<string PM = "printOperand", int Default = 0>
+    : OperandWithDefaultOps <i32, (ops (i32 Default))> {
+  let PrintMethod = PM;
+}
+
+// src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers
+def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> {
+  let PrintMethod = "printSel";
+}
+
+def LITERAL : InstFlag<"printLiteral">;
+
+def WRITE : InstFlag <"printWrite", 1>;
+def OMOD : InstFlag <"printOMOD">;
+def REL : InstFlag <"printRel">;
+def CLAMP : InstFlag <"printClamp">;
+def NEG : InstFlag <"printNeg">;
+def ABS : InstFlag <"printAbs">;
+def UEM : InstFlag <"printUpdateExecMask">;
+def UP : InstFlag <"printUpdatePred">;
+
+// XXX: The r600g finalizer in Mesa expects last to be one in most cases.
+// Once we start using the packetizer in this backend we should have this
+// default to 0.
+def LAST : InstFlag<"printLast", 1>;
+
+def FRAMEri : Operand<iPTR> {
+  let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index);
+}
+
+def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
+def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
+def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
+def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
+def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+
+class R600ALU_Word0 {
+  field bits<32> Word0;
+
+  bits<11> src0;
+  bits<1>  src0_neg;
+  bits<1>  src0_rel;
+  bits<11> src1;
+  bits<1>  src1_rel;
+  bits<1>  src1_neg;
+  bits<3>  index_mode = 0;
+  bits<2>  pred_sel;
+  bits<1>  last;
+
+  bits<9>  src0_sel  = src0{8-0};
+  bits<2>  src0_chan = src0{10-9};
+  bits<9>  src1_sel  = src1{8-0};
+  bits<2>  src1_chan = src1{10-9};
+
+  let Word0{8-0}   = src0_sel;
+  let Word0{9}     = src0_rel;
+  let Word0{11-10} = src0_chan;
+  let Word0{12}    = src0_neg;
+  let Word0{21-13} = src1_sel;
+  let Word0{22}    = src1_rel;
+  let Word0{24-23} = src1_chan;
+  let Word0{25}    = src1_neg;
+  let Word0{28-26} = index_mode;
+  let Word0{30-29} = pred_sel;
+  let Word0{31}    = last;
+}
+
+class R600ALU_Word1 {
+  field bits<32> Word1;
+
+  bits<11> dst;
+  bits<3>  bank_swizzle = 0;
+  bits<1>  dst_rel;
+  bits<1>  clamp;
+
+  bits<7>  dst_sel  = dst{6-0};
+  bits<2>  dst_chan = dst{10-9};
+
+  let Word1{20-18} = bank_swizzle;
+  let Word1{27-21} = dst_sel;
+  let Word1{28}    = dst_rel;
+  let Word1{30-29} = dst_chan;
+  let Word1{31}    = clamp;
+}
+
+class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{
+
+  bits<1>  src0_abs;
+  bits<1>  src1_abs;
+  bits<1>  update_exec_mask;
+  bits<1>  update_pred;
+  bits<1>  write;
+  bits<2>  omod;
+
+  let Word1{0}     = src0_abs;
+  let Word1{1}     = src1_abs;
+  let Word1{2}     = update_exec_mask;
+  let Word1{3}     = update_pred;
+  let Word1{4}     = write;
+  let Word1{6-5}   = omod;
+  let Word1{17-7}  = alu_inst;
+}
+
+class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
+
+  bits<11> src2;
+  bits<1>  src2_rel;
+  bits<1>  src2_neg;
+
+  bits<9>  src2_sel = src2{8-0};
+  bits<2>  src2_chan = src2{10-9};
+
+  let Word1{8-0}   = src2_sel;
+  let Word1{9}     = src2_rel;
+  let Word1{11-10} = src2_chan;
+  let Word1{12}    = src2_neg;
+  let Word1{17-13} = alu_inst;
+}
+
+class VTX_WORD0 {
+  field bits<32> Word0;
+  bits<7> SRC_GPR;
+  bits<5> VC_INST;
+  bits<2> FETCH_TYPE;
+  bits<1> FETCH_WHOLE_QUAD;
+  bits<8> BUFFER_ID;
+  bits<1> SRC_REL;
+  bits<2> SRC_SEL_X;
+  bits<6> MEGA_FETCH_COUNT;
+
+  let Word0{4-0}   = VC_INST;
+  let Word0{6-5}   = FETCH_TYPE;
+  let Word0{7}     = FETCH_WHOLE_QUAD;
+  let Word0{15-8}  = BUFFER_ID;
+  let Word0{22-16} = SRC_GPR;
+  let Word0{23}    = SRC_REL;
+  let Word0{25-24} = SRC_SEL_X;
+  let Word0{31-26} = MEGA_FETCH_COUNT;
+}
+
+class VTX_WORD1_GPR {
+  field bits<32> Word1;
+  bits<7> DST_GPR;
+  bits<1> DST_REL;
+  bits<3> DST_SEL_X;
+  bits<3> DST_SEL_Y;
+  bits<3> DST_SEL_Z;
+  bits<3> DST_SEL_W;
+  bits<1> USE_CONST_FIELDS;
+  bits<6> DATA_FORMAT;
+  bits<2> NUM_FORMAT_ALL;
+  bits<1> FORMAT_COMP_ALL;
+  bits<1> SRF_MODE_ALL;
+
+  let Word1{6-0} = DST_GPR;
+  let Word1{7}    = DST_REL;
+  let Word1{8}    = 0; // Reserved
+  let Word1{11-9} = DST_SEL_X;
+  let Word1{14-12} = DST_SEL_Y;
+  let Word1{17-15} = DST_SEL_Z;
+  let Word1{20-18} = DST_SEL_W;
+  let Word1{21}    = USE_CONST_FIELDS;
+  let Word1{27-22} = DATA_FORMAT;
+  let Word1{29-28} = NUM_FORMAT_ALL;
+  let Word1{30}    = FORMAT_COMP_ALL;
+  let Word1{31}    = SRF_MODE_ALL;
+}
+
+class TEX_WORD0 {
+  field bits<32> Word0;
+
+  bits<5> TEX_INST;
+  bits<2> INST_MOD;
+  bits<1> FETCH_WHOLE_QUAD;
+  bits<8> RESOURCE_ID;
+  bits<7> SRC_GPR;
+  bits<1> SRC_REL;
+  bits<1> ALT_CONST;
+  bits<2> RESOURCE_INDEX_MODE;
+  bits<2> SAMPLER_INDEX_MODE;
+
+  let Word0{4-0} = TEX_INST;
+  let Word0{6-5} = INST_MOD;
+  let Word0{7} = FETCH_WHOLE_QUAD;
+  let Word0{15-8} = RESOURCE_ID;
+  let Word0{22-16} = SRC_GPR;
+  let Word0{23} = SRC_REL;
+  let Word0{24} = ALT_CONST;
+  let Word0{26-25} = RESOURCE_INDEX_MODE;
+  let Word0{28-27} = SAMPLER_INDEX_MODE;
+}
+
+class TEX_WORD1 {
+  field bits<32> Word1;
+
+  bits<7> DST_GPR;
+  bits<1> DST_REL;
+  bits<3> DST_SEL_X;
+  bits<3> DST_SEL_Y;
+  bits<3> DST_SEL_Z;
+  bits<3> DST_SEL_W;
+  bits<7> LOD_BIAS;
+  bits<1> COORD_TYPE_X;
+  bits<1> COORD_TYPE_Y;
+  bits<1> COORD_TYPE_Z;
+  bits<1> COORD_TYPE_W;
+
+  let Word1{6-0} = DST_GPR;
+  let Word1{7} = DST_REL;
+  let Word1{11-9} = DST_SEL_X;
+  let Word1{14-12} = DST_SEL_Y;
+  let Word1{17-15} = DST_SEL_Z;
+  let Word1{20-18} = DST_SEL_W;
+  let Word1{27-21} = LOD_BIAS;
+  let Word1{28} = COORD_TYPE_X;
+  let Word1{29} = COORD_TYPE_Y;
+  let Word1{30} = COORD_TYPE_Z;
+  let Word1{31} = COORD_TYPE_W;
+}
+
+class TEX_WORD2 {
+  field bits<32> Word2;
+
+  bits<5> OFFSET_X;
+  bits<5> OFFSET_Y;
+  bits<5> OFFSET_Z;
+  bits<5> SAMPLER_ID;
+  bits<3> SRC_SEL_X;
+  bits<3> SRC_SEL_Y;
+  bits<3> SRC_SEL_Z;
+  bits<3> SRC_SEL_W;
+
+  let Word2{4-0} = OFFSET_X;
+  let Word2{9-5} = OFFSET_Y;
+  let Word2{14-10} = OFFSET_Z;
+  let Word2{19-15} = SAMPLER_ID;
+  let Word2{22-20} = SRC_SEL_X;
+  let Word2{25-23} = SRC_SEL_Y;
+  let Word2{28-26} = SRC_SEL_Z;
+  let Word2{31-29} = SRC_SEL_W;
+}
+
+/*
+XXX: R600 subtarget uses a slightly different encoding than the other
+subtargets.  We currently handle this in R600MCCodeEmitter, but we may
+want to use these instruction classes in the future.
+
+class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 {
+
+  bits<1>  fog_merge;
+  bits<10> alu_inst;
+
+  let Inst{37}    = fog_merge;
+  let Inst{39-38} = omod;
+  let Inst{49-40} = alu_inst;
+}
+
+class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 {
+
+  bits<11> alu_inst;
+
+  let Inst{38-37} = omod;
+  let Inst{49-39} = alu_inst;
+}
+*/
+
+def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
+                                     (ops PRED_SEL_OFF)>;
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+
+// Class for instructions with only one source register.
+// If you add new ins to this instruction, make sure they are listed before
+// $literal, because the backend currently assumes that the last operand is
+// a literal.  Also be sure to update the enum R600Op1OperandIndex::ROI in
+// R600Defines.h, R600InstrInfo::buildDefaultInstruction(),
+// and R600InstrInfo::getOperandIdx().
+class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+    InstR600 <0,
+              (outs R600_Reg32:$dst),
+              (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+                   R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
+                   LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+              !strconcat("  ", opName,
+                   "$clamp $dst$write$dst_rel$omod, "
+                   "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+                   "$literal $pred_sel$last"),
+              pattern,
+              itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <inst> {
+
+  let src1 = 0;
+  let src1_rel = 0;
+  let src1_neg = 0;
+  let src1_abs = 0;
+  let update_exec_mask = 0;
+  let update_pred = 0;
+  let HasNativeOperands = 1;
+  let Op1 = 1;
+  let DisableEncoding = "$literal";
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+                    InstrItinClass itin = AnyALU> :
+    R600_1OP <inst, opName,
+              [(set R600_Reg32:$dst, (node R600_Reg32:$src0))]
+>;
+
+// If you add our change the operands for R600_2OP instructions, you must
+// also update the R600Op2OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
+class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
+               OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs, SEL:$src0_sel,
+               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs, SEL:$src1_sel,
+               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+          !strconcat("  ", opName,
+                "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
+                "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+                "$src1_neg$src1_abs$src1$src1_abs$src1_rel, "
+                "$literal $pred_sel$last"),
+          pattern,
+          itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <inst> {
+
+  let HasNativeOperands = 1;
+  let Op2 = 1;
+  let DisableEncoding = "$literal";
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+                       InstrItinClass itim = AnyALU> :
+    R600_2OP <inst, opName,
+              [(set R600_Reg32:$dst, (node R600_Reg32:$src0,
+                                           R600_Reg32:$src1))]
+>;
+
+// If you add our change the operands for R600_3OP instructions, you must
+// also update the R600Op3OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and
+// R600InstrInfo::getOperandIdx().
+class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <0,
+          (outs R600_Reg32:$dst),
+          (ins REL:$dst_rel, CLAMP:$clamp,
+               R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, SEL:$src0_sel,
+               R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, SEL:$src1_sel,
+               R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
+               LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+          !strconcat("  ", opName, "$clamp $dst$dst_rel, "
+                             "$src0_neg$src0$src0_rel, "
+                             "$src1_neg$src1$src1_rel, "
+                             "$src2_neg$src2$src2_rel, "
+                             "$literal $pred_sel$last"),
+          pattern,
+          itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP3<inst>{
+
+  let HasNativeOperands = 1;
+  let DisableEncoding = "$literal";
+  let Op3 = 1;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
+                      InstrItinClass itin = VecALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          ins,
+          asm,
+          pattern,
+          itin>;
+
+class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg128:$DST_GPR),
+          (ins R600_Reg128:$SRC_GPR, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, i32imm:$textureTarget),
+          !strconcat(opName, "$DST_GPR, $SRC_GPR, $RESOURCE_ID, $SAMPLER_ID, $textureTarget"),
+          pattern,
+          itin>, TEX_WORD0, TEX_WORD1, TEX_WORD2 {
+    let Inst{31-0} = Word0;
+    let Inst{63-32} = Word1;
+
+    let TEX_INST = inst{4-0};
+    let SRC_REL = 0;
+    let DST_REL = 0;
+    let DST_SEL_X = 0;
+    let DST_SEL_Y = 1;
+    let DST_SEL_Z = 2;
+    let DST_SEL_W = 3;
+    let LOD_BIAS = 0;
+
+    let INST_MOD = 0;
+    let FETCH_WHOLE_QUAD = 0;
+    let ALT_CONST = 0;
+    let SAMPLER_INDEX_MODE = 0;
+
+    let COORD_TYPE_X = 0;
+    let COORD_TYPE_Y = 0;
+    let COORD_TYPE_Z = 0;
+    let COORD_TYPE_W = 0;
+  }
+
+} // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
+
+def TEX_SHADOW : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return (TType >= 6 && TType <= 8) || (TType >= 11 && TType <= 13);
+  }]
+>;
+
+def TEX_RECT : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 5;
+  }]
+>;
+
+def TEX_ARRAY : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 9 || TType == 10 || TType == 15 || TType == 16;
+  }]
+>;
+
+def TEX_SHADOW_ARRAY : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 11 || TType == 12 || TType == 17;
+  }]
+>;
+
+class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, bits<4> rat_id, dag outs,
+                 dag ins, string asm, list<dag> pattern> :
+    InstR600ISA <outs, ins, asm, pattern> {
+  bits<7>  RW_GPR;
+  bits<7>  INDEX_GPR;
+
+  bits<2>  RIM;
+  bits<2>  TYPE;
+  bits<1>  RW_REL;
+  bits<2>  ELEM_SIZE;
+
+  bits<12> ARRAY_SIZE;
+  bits<4>  COMP_MASK;
+  bits<4>  BURST_COUNT;
+  bits<1>  VPM;
+  bits<1>  eop;
+  bits<1>  MARK;
+  bits<1>  BARRIER;
+
+  // CF_ALLOC_EXPORT_WORD0_RAT
+  let Inst{3-0}   = rat_id;
+  let Inst{9-4}   = rat_inst;
+  let Inst{10}    = 0; // Reserved
+  let Inst{12-11} = RIM;
+  let Inst{14-13} = TYPE;
+  let Inst{21-15} = RW_GPR;
+  let Inst{22}    = RW_REL;
+  let Inst{29-23} = INDEX_GPR;
+  let Inst{31-30} = ELEM_SIZE;
+
+  // CF_ALLOC_EXPORT_WORD1_BUF
+  let Inst{43-32} = ARRAY_SIZE;
+  let Inst{47-44} = COMP_MASK;
+  let Inst{51-48} = BURST_COUNT;
+  let Inst{52}    = VPM;
+  let Inst{53}    = eop;
+  let Inst{61-54} = cf_inst;
+  let Inst{62}    = MARK;
+  let Inst{63}    = BARRIER;
+}
+
+class LoadParamFrag <PatFrag load_type> : PatFrag <
+  (ops node:$ptr), (load_type node:$ptr),
+  [{ return isParamLoad(dyn_cast<LoadSDNode>(N)); }]
+>;
+
+def load_param : LoadParamFrag<load>;
+def load_param_zexti8 : LoadParamFrag<zextloadi8>;
+def load_param_zexti16 : LoadParamFrag<zextloadi16>;
+
+def isR600 : Predicate<"Subtarget.device()"
+                            "->getGeneration() == AMDGPUDeviceInfo::HD4XXX">;
+def isR700 : Predicate<"Subtarget.device()"
+                            "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
+                            "Subtarget.device()->getDeviceFlag()"
+                            ">= OCL_DEVICE_RV710">;
+def isEG : Predicate<
+  "Subtarget.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX && "
+  "Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX && "
+  "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">;
+
+def isCayman : Predicate<"Subtarget.device()"
+                            "->getDeviceFlag() == OCL_DEVICE_CAYMAN">;
+def isEGorCayman : Predicate<"Subtarget.device()"
+                            "->getGeneration() == AMDGPUDeviceInfo::HD5XXX"
+                            "|| Subtarget.device()->getGeneration() =="
+                            "AMDGPUDeviceInfo::HD6XXX">;
+
+def isR600toCayman : Predicate<
+                     "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">;
+
+//===----------------------------------------------------------------------===//
+// R600 SDNodes
+//===----------------------------------------------------------------------===//
+
+def INTERP_PAIR_XY :  AMDGPUShaderInst <
+  (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
+  (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
+  "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1",
+  []>;
+
+def INTERP_PAIR_ZW :  AMDGPUShaderInst <
+  (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1),
+  (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
+  "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1",
+  []>;
+
+def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
+  SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>,
+  [SDNPVariadic]
+>;
+
+//===----------------------------------------------------------------------===//
+// Interpolation Instructions
+//===----------------------------------------------------------------------===//
+
+def INTERP_VEC_LOAD :  AMDGPUShaderInst <
+  (outs R600_Reg128:$dst),
+  (ins i32imm:$src0),
+  "INTERP_LOAD $src0 : $dst",
+  []>;
+
+def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
+  let bank_swizzle = 5;
+}
+
+def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> {
+  let bank_swizzle = 5;
+}
+
+def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
+
+//===----------------------------------------------------------------------===//
+// Export Instructions
+//===----------------------------------------------------------------------===//
+
+def ExportType : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
+  [SDNPHasChain, SDNPSideEffect]>;
+
+class ExportWord0 {
+  field bits<32> Word0;
+
+  bits<13> arraybase;
+  bits<2> type;
+  bits<7> gpr;
+  bits<2> elem_size;
+
+  let Word0{12-0} = arraybase;
+  let Word0{14-13} = type;
+  let Word0{21-15} = gpr;
+  let Word0{22} = 0; // RW_REL
+  let Word0{29-23} = 0; // INDEX_GPR
+  let Word0{31-30} = elem_size;
+}
+
+class ExportSwzWord1 {
+  field bits<32> Word1;
+
+  bits<3> sw_x;
+  bits<3> sw_y;
+  bits<3> sw_z;
+  bits<3> sw_w;
+  bits<1> eop;
+  bits<8> inst;
+
+  let Word1{2-0} = sw_x;
+  let Word1{5-3} = sw_y;
+  let Word1{8-6} = sw_z;
+  let Word1{11-9} = sw_w;
+}
+
+class ExportBufWord1 {
+  field bits<32> Word1;
+
+  bits<12> arraySize;
+  bits<4> compMask;
+  bits<1> eop;
+  bits<8> inst;
+
+  let Word1{11-0} = arraySize;
+  let Word1{15-12} = compMask;
+}
+
+multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
+  def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
+    (ExportInst
+        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
+        0, 61, 0, 7, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
+    (ExportInst
+        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sub0),
+        0, 61, 7, 0, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(int_R600_store_dummy (i32 imm:$type)),
+    (ExportInst
+        (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(int_R600_store_dummy 1),
+    (ExportInst
+        (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type),
+    (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)),
+        (ExportInst R600_Reg128:$src, imm:$type, imm:$base,
+        imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0)
+  >;
+
+}
+
+multiclass SteamOutputExportPattern<Instruction ExportInst,
+    bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
+// Stream0
+  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      4095, imm:$mask, buf0inst, 0)>;
+// Stream1
+  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      4095, imm:$mask, buf1inst, 0)>;
+// Stream2
+  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      4095, imm:$mask, buf2inst, 0)>;
+// Stream3
+  def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src),
+      (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, 0, imm:$arraybase,
+      4095, imm:$mask, buf3inst, 0)>;
+}
+
+let usesCustomInserter = 1 in {
+
+class ExportSwzInst : InstR600ISA<(
+    outs),
+    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
+    i32imm:$sw_x, i32imm:$sw_y, i32imm:$sw_z, i32imm:$sw_w, i32imm:$inst,
+    i32imm:$eop),
+    !strconcat("EXPORT", " $gpr"),
+    []>, ExportWord0, ExportSwzWord1 {
+  let elem_size = 3;
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+} // End usesCustomInserter = 1
+
+class ExportBufInst : InstR600ISA<(
+    outs),
+    (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
+    i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop),
+    !strconcat("EXPORT", " $gpr"),
+    []>, ExportWord0, ExportBufWord1 {
+  let elem_size = 0;
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions
+//===----------------------------------------------------------------------===//
+
+class CF_ALU_WORD0 {
+  field bits<32> Word0;
+
+  bits<22> ADDR;
+  bits<4> KCACHE_BANK0;
+  bits<4> KCACHE_BANK1;
+  bits<2> KCACHE_MODE0;
+
+  let Word0{21-0} = ADDR;
+  let Word0{25-22} = KCACHE_BANK0;
+  let Word0{29-26} = KCACHE_BANK1;
+  let Word0{31-30} = KCACHE_MODE0;
+}
+
+class CF_ALU_WORD1 {
+  field bits<32> Word1;
+
+  bits<2> KCACHE_MODE1;
+  bits<8> KCACHE_ADDR0;
+  bits<8> KCACHE_ADDR1;
+  bits<7> COUNT;
+  bits<1> ALT_CONST;
+  bits<4> CF_INST;
+  bits<1> WHOLE_QUAD_MODE;
+  bits<1> BARRIER;
+
+  let Word1{1-0} = KCACHE_MODE1;
+  let Word1{9-2} = KCACHE_ADDR0;
+  let Word1{17-10} = KCACHE_ADDR1;
+  let Word1{24-18} = COUNT;
+  let Word1{25} = ALT_CONST;
+  let Word1{29-26} = CF_INST;
+  let Word1{30} = WHOLE_QUAD_MODE;
+  let Word1{31} = BARRIER;
+}
+
+class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs),
+(ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, i32imm:$KCACHE_MODE0, i32imm:$KCACHE_MODE1,
+i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, i32imm:$COUNT),
+!strconcat(OpName, " $COUNT, @$ADDR, "
+"KC0[CB$KCACHE_BANK0:$KCACHE_ADDR0-$KCACHE_ADDR0+32]"
+", KC1[CB$KCACHE_BANK1:$KCACHE_ADDR1-$KCACHE_ADDR1+32]"),
+[] >, CF_ALU_WORD0, CF_ALU_WORD1 {
+  field bits<64> Inst;
+
+  let CF_INST = inst;
+  let ALT_CONST = 0;
+  let WHOLE_QUAD_MODE = 0;
+  let BARRIER = 1;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class CF_WORD0 {
+  field bits<32> Word0;
+
+  bits<24> ADDR;
+  bits<3> JUMPTABLE_SEL;
+
+  let Word0{23-0} = ADDR;
+  let Word0{26-24} = JUMPTABLE_SEL;
+}
+
+class CF_WORD1 {
+  field bits<32> Word1;
+
+  bits<3> POP_COUNT;
+  bits<5> CF_CONST;
+  bits<2> COND;
+  bits<6> COUNT;
+  bits<1> VALID_PIXEL_MODE;
+  bits<8> CF_INST;
+  bits<1> BARRIER;
+
+  let Word1{2-0} = POP_COUNT;
+  let Word1{7-3} = CF_CONST;
+  let Word1{9-8} = COND;
+  let Word1{15-10} = COUNT;
+  let Word1{20} = VALID_PIXEL_MODE;
+  let Word1{29-22} = CF_INST;
+  let Word1{31} = BARRIER;
+}
+
+class CF_CLAUSE <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
+ins, AsmPrint, [] >, CF_WORD0, CF_WORD1 {
+  field bits<64> Inst;
+
+  let CF_INST = inst;
+  let BARRIER = 1;
+  let JUMPTABLE_SEL = 0;
+  let CF_CONST = 0;
+  let VALID_PIXEL_MODE = 0;
+  let COND = 0;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+}
+
+def CF_TC : CF_CLAUSE<1, (ins i32imm:$ADDR, i32imm:$COUNT),
+"TEX $COUNT @$ADDR"> {
+  let POP_COUNT = 0;
+}
+
+def CF_VC : CF_CLAUSE<2, (ins i32imm:$ADDR, i32imm:$COUNT),
+"VTX $COUNT @$ADDR"> {
+  let POP_COUNT = 0;
+}
+
+def WHILE_LOOP : CF_CLAUSE<6, (ins i32imm:$ADDR), "LOOP_START_DX10 @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+
+def END_LOOP : CF_CLAUSE<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+
+def LOOP_BREAK : CF_CLAUSE<9, (ins i32imm:$ADDR), "LOOP_BREAK @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+
+def CF_CONTINUE : CF_CLAUSE<8, (ins i32imm:$ADDR), "CONTINUE @$ADDR"> {
+  let POP_COUNT = 0;
+  let COUNT = 0;
+}
+
+def CF_JUMP : CF_CLAUSE<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "JUMP @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+
+def CF_ELSE : CF_CLAUSE<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "ELSE @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+
+def CF_CALL_FS : CF_CLAUSE<19, (ins), "CALL_FS"> {
+  let ADDR = 0;
+  let COUNT = 0;
+  let POP_COUNT = 0;
+}
+
+def POP : CF_CLAUSE<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "POP @$ADDR POP:$POP_COUNT"> {
+  let COUNT = 0;
+}
+
+def CF_ALU : ALU_CLAUSE<8, "ALU">;
+def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
+
+def STACK_SIZE : AMDGPUInst <(outs),
+(ins i32imm:$num), "nstack $num", [] > {
+  field bits<8> Inst;
+  bits<8> num;
+  let Inst = num;
+}
+
+let Predicates = [isR600toCayman] in {
+
+//===----------------------------------------------------------------------===//
+// Common Instructions R600, R700, Evergreen, Cayman
+//===----------------------------------------------------------------------===//
+
+def ADD : R600_2OP_Helper <0x0, "ADD", fadd>;
+// Non-IEEE MUL: 0 * anything = 0
+def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>;
+def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>;
+def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>;
+def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
+
+// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
+// so some of the instruction names don't match the asm string.
+// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
+def SETE : R600_2OP <
+  0x08, "SETE",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
+             COND_EQ))]
+>;
+
+def SGT : R600_2OP <
+  0x09, "SETGT",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
+              COND_GT))]
+>;
+
+def SGE : R600_2OP <
+  0xA, "SETGE",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
+              COND_GE))]
+>;
+
+def SNE : R600_2OP <
+  0xB, "SETNE",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
+    COND_NE))]
+>;
+
+def SETE_DX10 : R600_2OP <
+  0xC, "SETE_DX10",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
+    COND_EQ))]
+>;
+
+def SETGT_DX10 : R600_2OP <
+  0xD, "SETGT_DX10",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
+    COND_GT))]
+>;
+
+def SETGE_DX10 : R600_2OP <
+  0xE, "SETGE_DX10",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
+    COND_GE))]
+>;
+
+def SETNE_DX10 : R600_2OP <
+  0xF, "SETNE_DX10",
+  [(set R600_Reg32:$dst,
+    (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, (i32 -1), (i32 0),
+     COND_NE))]
+>;
+
+def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
+def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>;
+def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
+def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
+def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
+
+def MOV : R600_1OP <0x19, "MOV", []>;
+
+let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
+
+class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
+  (outs R600_Reg32:$dst),
+  (ins immType:$imm),
+  "",
+  []
+>;
+
+} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
+
+def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
+def : Pat <
+  (imm:$val),
+  (MOV_IMM_I32 imm:$val)
+>;
+
+def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
+def : Pat <
+  (fpimm:$val),
+  (MOV_IMM_F32  fpimm:$val)
+>;
+
+def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>;
+def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>;
+def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>;
+def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>;
+
+let hasSideEffects = 1 in {
+
+def KILLGT : R600_2OP <0x2D, "KILLGT", []>;
+
+} // end hasSideEffects
+
+def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>;
+def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>;
+def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
+def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
+def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
+def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
+def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>;
+def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>;
+def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>;
+def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>;
+
+def SETE_INT : R600_2OP <
+  0x3A, "SETE_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))]
+>;
+
+def SETGT_INT : R600_2OP <
+  0x3B, "SETGT_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))]
+>;
+
+def SETGE_INT : R600_2OP <
+  0x3C, "SETGE_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))]
+>;
+
+def SETNE_INT : R600_2OP <
+  0x3D, "SETNE_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))]
+>;
+
+def SETGT_UINT : R600_2OP <
+  0x3E, "SETGT_UINT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))]
+>;
+
+def SETGE_UINT : R600_2OP <
+  0x3F, "SETGE_UINT",
+  [(set (i32 R600_Reg32:$dst),
+    (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))]
+>;
+
+def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
+def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGE_INT", []>;
+def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>;
+def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
+
+def CNDE_INT : R600_3OP <
+  0x1C, "CNDE_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), 0,
+       (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
+       COND_EQ))]
+>;
+
+def CNDGE_INT : R600_3OP <
+  0x1E, "CNDGE_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), 0,
+       (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
+       COND_GE))]
+>;
+
+def CNDGT_INT : R600_3OP <
+  0x1D, "CNDGT_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), 0,
+       (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2),
+       COND_GT))]
+>;
+
+//===----------------------------------------------------------------------===//
+// Texture instructions
+//===----------------------------------------------------------------------===//
+
+def TEX_LD : R600_TEX <
+  0x03, "TEX_LD",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txf R600_Reg128:$SRC_GPR,
+      imm:$OFFSET_X, imm:$OFFSET_Y, imm:$OFFSET_Z, imm:$RESOURCE_ID,
+      imm:$SAMPLER_ID, imm:$textureTarget))]
+> {
+let AsmString = "TEX_LD $DST_GPR, $SRC_GPR, $OFFSET_X, $OFFSET_Y, $OFFSET_Z,"
+    "$RESOURCE_ID, $SAMPLER_ID, $textureTarget";
+let InOperandList = (ins R600_Reg128:$SRC_GPR, i32imm:$OFFSET_X,
+    i32imm:$OFFSET_Y, i32imm:$OFFSET_Z, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
+    i32imm:$textureTarget);
+}
+
+def TEX_GET_TEXTURE_RESINFO : R600_TEX <
+  0x04, "TEX_GET_TEXTURE_RESINFO",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txq R600_Reg128:$SRC_GPR,
+      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
+>;
+
+def TEX_GET_GRADIENTS_H : R600_TEX <
+  0x07, "TEX_GET_GRADIENTS_H",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_ddx R600_Reg128:$SRC_GPR,
+      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
+>;
+
+def TEX_GET_GRADIENTS_V : R600_TEX <
+  0x08, "TEX_GET_GRADIENTS_V",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_ddy R600_Reg128:$SRC_GPR,
+      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
+>;
+
+def TEX_SET_GRADIENTS_H : R600_TEX <
+  0x0B, "TEX_SET_GRADIENTS_H",
+  []
+>;
+
+def TEX_SET_GRADIENTS_V : R600_TEX <
+  0x0C, "TEX_SET_GRADIENTS_V",
+  []
+>;
+
+def TEX_SAMPLE : R600_TEX <
+  0x10, "TEX_SAMPLE",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_tex R600_Reg128:$SRC_GPR,
+      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
+>;
+
+def TEX_SAMPLE_C : R600_TEX <
+  0x18, "TEX_SAMPLE_C",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_tex R600_Reg128:$SRC_GPR,
+      imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
+>;
+
+def TEX_SAMPLE_L : R600_TEX <
+  0x11, "TEX_SAMPLE_L",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txl R600_Reg128:$SRC_GPR,
+      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
+>;
+
+def TEX_SAMPLE_C_L : R600_TEX <
+  0x19, "TEX_SAMPLE_C_L",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txl R600_Reg128:$SRC_GPR,
+      imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
+>;
+
+def TEX_SAMPLE_LB : R600_TEX <
+  0x12, "TEX_SAMPLE_LB",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txb R600_Reg128:$SRC_GPR,
+      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
+>;
+
+def TEX_SAMPLE_C_LB : R600_TEX <
+  0x1A, "TEX_SAMPLE_C_LB",
+  [(set R600_Reg128:$DST_GPR, (int_AMDGPU_txb R600_Reg128:$SRC_GPR,
+      imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
+>;
+
+def TEX_SAMPLE_G : R600_TEX <
+  0x14, "TEX_SAMPLE_G",
+  []
+>;
+
+def TEX_SAMPLE_C_G : R600_TEX <
+  0x1C, "TEX_SAMPLE_C_G",
+  []
+>;
+
+//===----------------------------------------------------------------------===//
+// Helper classes for common instructions
+//===----------------------------------------------------------------------===//
+
+class MUL_LIT_Common <bits<5> inst> : R600_3OP <
+  inst, "MUL_LIT",
+  []
+>;
+
+class MULADD_Common <bits<5> inst> : R600_3OP <
+  inst, "MULADD",
+  []
+>;
+
+class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
+  inst, "MULADD_IEEE",
+  [(set (f32 R600_Reg32:$dst),
+   (fadd (fmul R600_Reg32:$src0, R600_Reg32:$src1), R600_Reg32:$src2))]
+>;
+
+class CNDE_Common <bits<5> inst> : R600_3OP <
+  inst, "CNDE",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
+       (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
+       COND_EQ))]
+>;
+
+class CNDGT_Common <bits<5> inst> : R600_3OP <
+  inst, "CNDGT",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
+       (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
+       COND_GT))]
+>;
+
+class CNDGE_Common <bits<5> inst> : R600_3OP <
+  inst, "CNDGE",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), FP_ZERO,
+       (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2),
+       COND_GE))]
+>;
+
+multiclass DOT4_Common <bits<11> inst> {
+
+  def _pseudo : R600_REDUCTION <inst,
+    (ins R600_Reg128:$src0, R600_Reg128:$src1),
+    "DOT4 $dst $src0, $src1",
+    [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))]
+  >;
+
+  def _real : R600_2OP <inst, "DOT4", []>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+multiclass CUBE_Common <bits<11> inst> {
+
+  def _pseudo : InstR600 <
+    inst,
+    (outs R600_Reg128:$dst),
+    (ins R600_Reg128:$src),
+    "CUBE $dst $src",
+    [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))],
+    VecALU
+  > {
+    let isPseudo = 1;
+  }
+
+  def _real : R600_2OP <inst, "CUBE", []>;
+}
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
+
+class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "EXP_IEEE", fexp2
+>;
+
+class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "FLT_TO_INT", fp_to_sint
+>;
+
+class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "INT_TO_FLT", sint_to_fp
+>;
+
+class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "FLT_TO_UINT", fp_to_uint
+>;
+
+class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "UINT_TO_FLT", uint_to_fp
+>;
+
+class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
+  inst, "LOG_CLAMPED", []
+>;
+
+class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "LOG_IEEE", flog2
+>;
+
+class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>;
+class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>;
+class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
+class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULHI_INT", mulhs
+>;
+class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULHI", mulhu
+>;
+class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
+  inst, "MULLO_INT", mul
+>;
+class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []>;
+
+class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
+  inst, "RECIP_CLAMPED", []
+>;
+
+class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
+  inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))]
+>;
+
+class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "RECIP_UINT", AMDGPUurecip
+>;
+
+class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
+  inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
+>;
+
+class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
+  inst, "RECIPSQRT_IEEE", []
+>;
+
+class SIN_Common <bits<11> inst> : R600_1OP <
+  inst, "SIN", []>{
+  let Trig = 1;
+}
+
+class COS_Common <bits<11> inst> : R600_1OP <
+  inst, "COS", []> {
+  let Trig = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper patterns for complex intrinsics
+//===----------------------------------------------------------------------===//
+
+multiclass DIV_Common <InstR600 recip_ieee> {
+def : Pat<
+  (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
+  (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+>;
+
+def : Pat<
+  (fdiv R600_Reg32:$src0, R600_Reg32:$src1),
+  (MUL_IEEE R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+>;
+}
+
+class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat <
+  (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w),
+  (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x))
+>;
+
+//===----------------------------------------------------------------------===//
+// R600 / R700 Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isR600] in {
+
+  def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
+  def MULADD_r600 : MULADD_Common<0x10>;
+  def MULADD_IEEE_r600 : MULADD_IEEE_Common<0x14>;
+  def CNDE_r600 : CNDE_Common<0x18>;
+  def CNDGT_r600 : CNDGT_Common<0x19>;
+  def CNDGE_r600 : CNDGE_Common<0x1A>;
+  defm DOT4_r600 : DOT4_Common<0x50>;
+  defm CUBE_r600 : CUBE_Common<0x52>;
+  def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
+  def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
+  def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
+  def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
+  def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
+  def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
+  def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
+  def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
+  def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
+  def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>;
+  def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>;
+  def SIN_r600 : SIN_Common<0x6E>;
+  def COS_r600 : COS_Common<0x6F>;
+  def ASHR_r600 : ASHR_Common<0x70>;
+  def LSHR_r600 : LSHR_Common<0x71>;
+  def LSHL_r600 : LSHL_Common<0x72>;
+  def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
+  def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
+  def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
+  def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
+  def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
+
+  defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
+  def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL, R600_Reg32>;
+  def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
+
+  def : Pat<(fsqrt R600_Reg32:$src),
+    (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>;
+
+  def R600_ExportSwz : ExportSwzInst {
+    let Word1{20-17} = 1; // BURST_COUNT
+    let Word1{21} = eop;
+    let Word1{22} = 1; // VALID_PIXEL_MODE
+    let Word1{30-23} = inst;
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : ExportPattern<R600_ExportSwz, 39>;
+
+  def R600_ExportBuf : ExportBufInst {
+    let Word1{20-17} = 1; // BURST_COUNT
+    let Word1{21} = eop;
+    let Word1{22} = 1; // VALID_PIXEL_MODE
+    let Word1{30-23} = inst;
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
+}
+
+// Helper pattern for normalizing inputs to triginomic instructions for R700+
+// cards.
+class COS_PAT <InstR600 trig> : Pat<
+  (fcos R600_Reg32:$src),
+  (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
+>;
+
+class SIN_PAT <InstR600 trig> : Pat<
+  (fsin R600_Reg32:$src),
+  (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src))
+>;
+
+//===----------------------------------------------------------------------===//
+// R700 Only instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isR700] in {
+  def SIN_r700 : SIN_Common<0x6E>;
+  def COS_r700 : COS_Common<0x6F>;
+
+  // R700 normalizes inputs to SIN/COS the same as EG
+  def : SIN_PAT <SIN_r700>;
+  def : COS_PAT <COS_r700>;
+}
+
+//===----------------------------------------------------------------------===//
+// Evergreen Only instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEG] in {
+
+def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
+defm DIV_eg : DIV_Common<RECIP_IEEE_eg>;
+
+def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
+def MULHI_INT_eg : MULHI_INT_Common<0x90>;
+def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
+def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
+def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_eg : SIN_Common<0x8D>;
+def COS_eg : COS_Common<0x8E>;
+
+def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL, R600_Reg32>;
+def : SIN_PAT <SIN_eg>;
+def : COS_PAT <COS_eg>;
+def : Pat<(fsqrt R600_Reg32:$src),
+  (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>;
+} // End Predicates = [isEG]
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isEGorCayman] in {
+
+  // BFE_UINT - bit_extract, an optimization for mask and shift
+  // Src0 = Input
+  // Src1 = Offset
+  // Src2 = Width
+  //
+  // bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width)
+  //
+  // Example Usage:
+  // (Offset, Width)
+  //
+  // (0, 8)           = (Input << 24) >> 24  = (Input &  0xff)       >> 0
+  // (8, 8)           = (Input << 16) >> 24  = (Input &  0xffff)     >> 8
+  // (16,8)           = (Input <<  8) >> 24  = (Input &  0xffffff)   >> 16
+  // (24,8)           = (Input <<  0) >> 24  = (Input &  0xffffffff) >> 24
+  def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
+    [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0,
+                                                      R600_Reg32:$src1,
+                                                      R600_Reg32:$src2))],
+    VecALU
+  >;
+
+  def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
+    [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1,
+                                          R600_Reg32:$src2))],
+    VecALU
+  >;
+
+  def MULADD_eg : MULADD_Common<0x14>;
+  def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
+  def ASHR_eg : ASHR_Common<0x15>;
+  def LSHR_eg : LSHR_Common<0x16>;
+  def LSHL_eg : LSHL_Common<0x17>;
+  def CNDE_eg : CNDE_Common<0x19>;
+  def CNDGT_eg : CNDGT_Common<0x1A>;
+  def CNDGE_eg : CNDGE_Common<0x1B>;
+  def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
+  def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
+  defm DOT4_eg : DOT4_Common<0xBE>;
+  defm CUBE_eg : CUBE_Common<0xC0>;
+
+let hasSideEffects = 1 in {
+  def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", []>;
+}
+
+  def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
+
+  def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
+    let Pattern = [];
+  }
+
+  def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
+
+  def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
+    let Pattern = [];
+  }
+
+  def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
+
+  // TRUNC is used for the FLT_TO_INT instructions to work around a
+  // perceived problem where the rounding modes are applied differently
+  // depending on the instruction and the slot they are in.
+  // See:
+  // https://bugs.freedesktop.org/show_bug.cgi?id=50232
+  // Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c
+  //
+  // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes,
+  // which do not need to be truncated since the fp values are 0.0f or 1.0f.
+  // We should look into handling these cases separately.
+  def : Pat<(fp_to_sint R600_Reg32:$src0),
+    (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>;
+
+  def : Pat<(fp_to_uint R600_Reg32:$src0),
+    (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>;
+
+  def EG_ExportSwz : ExportSwzInst {
+    let Word1{19-16} = 1; // BURST_COUNT
+    let Word1{20} = 1; // VALID_PIXEL_MODE
+    let Word1{21} = eop;
+    let Word1{29-22} = inst;
+    let Word1{30} = 0; // MARK
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : ExportPattern<EG_ExportSwz, 83>;
+
+  def EG_ExportBuf : ExportBufInst {
+    let Word1{19-16} = 1; // BURST_COUNT
+    let Word1{20} = 1; // VALID_PIXEL_MODE
+    let Word1{21} = eop;
+    let Word1{29-22} = inst;
+    let Word1{30} = 0; // MARK
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>;
+
+//===----------------------------------------------------------------------===//
+// Memory read/write instructions
+//===----------------------------------------------------------------------===//
+let usesCustomInserter = 1 in {
+
+class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name,
+                              list<dag> pattern>
+    : EG_CF_RAT <0x57, 0x2, 0, (outs), ins,
+                 !strconcat(name, " $rw_gpr, $index_gpr, $eop"), pattern> {
+  let RIM         = 0;
+  // XXX: Have a separate instruction for non-indexed writes.
+  let TYPE        = 1;
+  let RW_REL      = 0;
+  let ELEM_SIZE   = 0;
+
+  let ARRAY_SIZE  = 0;
+  let COMP_MASK   = comp_mask;
+  let BURST_COUNT = 0;
+  let VPM         = 0;
+  let MARK        = 0;
+  let BARRIER     = 1;
+}
+
+} // End usesCustomInserter = 1
+
+// 32-bit store
+def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
+  (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  0x1, "RAT_WRITE_CACHELESS_32_eg",
+  [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)]
+>;
+
+//128-bit store
+def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
+  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  0xf, "RAT_WRITE_CACHELESS_128",
+  [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)]
+>;
+
+class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
+    : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern>,
+      VTX_WORD1_GPR, VTX_WORD0 {
+
+  // Static fields
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let BUFFER_ID = buffer_id;
+  let SRC_REL = 0;
+  // XXX: We can infer this field based on the SRC_GPR.  This would allow us
+  // to store vertex addresses in any channel, not just X.
+  let SRC_SEL_X = 0;
+  let DST_REL = 0;
+  // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL,
+  // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored,
+  // however, based on my testing if USE_CONST_FIELDS is set, then all
+  // these fields need to be set to 0.
+  let USE_CONST_FIELDS = 0;
+  let NUM_FORMAT_ALL = 1;
+  let FORMAT_COMP_ALL = 0;
+  let SRF_MODE_ALL = 0;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+  // LLVM can only encode 64-bit instructions, so these fields are manually
+  // encoded in R600CodeEmitter
+  //
+  // bits<16> OFFSET;
+  // bits<2>  ENDIAN_SWAP = 0;
+  // bits<1>  CONST_BUF_NO_STRIDE = 0;
+  // bits<1>  MEGA_FETCH = 0;
+  // bits<1>  ALT_CONST = 0;
+  // bits<2>  BUFFER_INDEX_MODE = 0;
+
+
+
+  // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+  // is done in R600CodeEmitter
+  //
+  // Inst{79-64} = OFFSET;
+  // Inst{81-80} = ENDIAN_SWAP;
+  // Inst{82}    = CONST_BUF_NO_STRIDE;
+  // Inst{83}    = MEGA_FETCH;
+  // Inst{84}    = ALT_CONST;
+  // Inst{86-85} = BUFFER_INDEX_MODE;
+  // Inst{95-86} = 0; Reserved
+
+  // VTX_WORD3 (Padding)
+  //
+  // Inst{127-96} = 0;
+}
+
+class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_8", buffer_id, (outs R600_TReg32_X:$dst),
+                   pattern> {
+
+  let MEGA_FETCH_COUNT = 1;
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 1; // FMT_8
+}
+
+class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_16", buffer_id, (outs R600_TReg32_X:$dst),
+                    pattern> {
+  let MEGA_FETCH_COUNT = 2;
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 5; // FMT_16
+
+}
+
+class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_32", buffer_id, (outs R600_TReg32_X:$dst),
+                   pattern> {
+
+  let MEGA_FETCH_COUNT = 4;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 7;   // Masked
+  let DST_SEL_Z        = 7;   // Masked
+  let DST_SEL_W        = 7;   // Masked
+  let DATA_FORMAT      = 0xD; // COLOR_32
+
+  // This is not really necessary, but there were some GPU hangs that appeared
+  // to be caused by ALU instructions in the next instruction group that wrote
+  // to the $ptr registers of the VTX_READ.
+  // e.g.
+  // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+  // %T2_X<def> = MOV %ZERO
+  //Adding this constraint prevents this from happening.
+  let Constraints = "$ptr.ptr = $dst";
+}
+
+class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
+                   pattern> {
+
+  let MEGA_FETCH_COUNT = 16;
+  let DST_SEL_X        =  0;
+  let DST_SEL_Y        =  1;
+  let DST_SEL_Z        =  2;
+  let DST_SEL_W        =  3;
+  let DATA_FORMAT      =  0x22; // COLOR_32_32_32_32
+
+  // XXX: Need to force VTX_READ_128 instructions to write to the same register
+  // that holds its buffer address to avoid potential hangs.  We can't use
+  // the same constraint as VTX_READ_32_eg, because the $ptr.ptr and $dst
+  // registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+
+def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
+  [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))]
+>;
+
+def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
+  [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))]
+>;
+
+def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
+  [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
+>;
+
+def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
+  [(set (v4i32 R600_Reg128:$dst), (load_param ADDRVTX_READ:$ptr))]
+>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+
+// 8-bit reads
+def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
+  [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))]
+>;
+
+// 32-bit reads
+def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
+  [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
+>;
+
+// 128-bit reads
+def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
+  [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
+>;
+
+//===----------------------------------------------------------------------===//
+// Constant Loads
+// XXX: We are currently storing all constants in the global address space.
+//===----------------------------------------------------------------------===//
+
+def CONSTANT_LOAD_eg : VTX_READ_32_eg <1,
+  [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))]
+>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Regist loads and stores - for indirect addressing
+//===----------------------------------------------------------------------===//
+
+defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+
+let Predicates = [isCayman] in {
+
+let isVector = 1 in {
+
+def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
+
+def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
+def MULHI_INT_cm : MULHI_INT_Common<0x90>;
+def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
+def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_cm : SIN_Common<0x8D>;
+def COS_cm : COS_Common<0x8E>;
+} // End isVector = 1
+
+def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL, R600_Reg32>;
+def : SIN_PAT <SIN_cm>;
+def : COS_PAT <COS_cm>;
+
+defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
+
+// RECIP_UINT emulation for Cayman
+def : Pat <
+  (AMDGPUurecip R600_Reg32:$src0),
+  (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)),
+                            (MOV_IMM_I32 0x4f800000)))
+>;
+
+
+def : Pat<(fsqrt R600_Reg32:$src),
+  (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>;
+
+} // End isCayman
+
+//===----------------------------------------------------------------------===//
+// Branch Instructions
+//===----------------------------------------------------------------------===//
+
+
+def IF_PREDICATE_SET  : ILFormat<(outs), (ins GPRI32:$src),
+  "IF_PREDICATE_SET $src", []>;
+
+def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src),
+  "PREDICATED_BREAK $src", []>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+def PRED_X : InstR600 <
+  0, (outs R600_Predicate_Bit:$dst),
+  (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
+  "", [], NullALU> {
+  let FlagOperandIdx = 3;
+}
+
+let isTerminator = 1, isBranch = 1 in {
+def JUMP_COND : InstR600 <0x10,
+          (outs),
+          (ins brtarget:$target, R600_Predicate_Bit:$p),
+          "JUMP $target ($p)",
+          [], AnyALU
+  >;
+
+def JUMP : InstR600 <0x10,
+          (outs),
+          (ins brtarget:$target),
+          "JUMP $target",
+          [], AnyALU
+  >
+{
+  let isPredicable = 1;
+  let isBarrier = 1;
+}
+
+}  // End isTerminator = 1, isBranch = 1
+
+let usesCustomInserter = 1 in {
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
+
+def MASK_WRITE : AMDGPUShaderInst <
+    (outs),
+    (ins R600_Reg32:$src),
+    "MASK_WRITE $src",
+    []
+>;
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
+
+
+def TXD: AMDGPUShaderInst <
+  (outs R600_Reg128:$dst),
+  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+  "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
+>;
+
+def TXD_SHADOW: AMDGPUShaderInst <
+  (outs R600_Reg128:$dst),
+  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+  "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
+>;
+
+} // End isPseudo = 1
+} // End usesCustomInserter = 1
+
+def CLAMP_R600 :  CLAMP <R600_Reg32>;
+def FABS_R600 : FABS<R600_Reg32>;
+def FNEG_R600 : FNEG<R600_Reg32>;
+
+//===---------------------------------------------------------------------===//
+// Return instruction
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
+    usesCustomInserter = 1 in {
+  def RETURN          : ILFormat<(outs), (ins variable_ops),
+      "RETURN", [(IL_retflag)]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Constant Buffer Addressing Support
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
+def CONST_COPY : Instruction {
+  let OutOperandList = (outs R600_Reg32:$dst);
+  let InOperandList = (ins i32imm:$src);
+  let Pattern =
+      [(set R600_Reg32:$dst, (CONST_ADDRESS ADDRGA_CONST_OFFSET:$src))];
+  let AsmString = "CONST_COPY";
+  let neverHasSideEffects = 1;
+  let isAsCheapAsAMove = 1;
+  let Itinerary = NullALU;
+}
+} // end usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"
+
+def TEX_VTX_CONSTBUF :
+  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr",
+      [(set R600_Reg128:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>,
+  VTX_WORD1_GPR, VTX_WORD0 {
+
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let SRC_REL = 0;
+  let SRC_SEL_X = 0;
+  let DST_REL = 0;
+  let USE_CONST_FIELDS = 0;
+  let NUM_FORMAT_ALL = 2;
+  let FORMAT_COMP_ALL = 1;
+  let SRF_MODE_ALL = 1;
+  let MEGA_FETCH_COUNT = 16;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 2;
+  let DST_SEL_W        = 3;
+  let DATA_FORMAT      = 35;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+
+// LLVM can only encode 64-bit instructions, so these fields are manually
+// encoded in R600CodeEmitter
+//
+// bits<16> OFFSET;
+// bits<2>  ENDIAN_SWAP = 0;
+// bits<1>  CONST_BUF_NO_STRIDE = 0;
+// bits<1>  MEGA_FETCH = 0;
+// bits<1>  ALT_CONST = 0;
+// bits<2>  BUFFER_INDEX_MODE = 0;
+
+
+
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+// is done in R600CodeEmitter
+//
+// Inst{79-64} = OFFSET;
+// Inst{81-80} = ENDIAN_SWAP;
+// Inst{82}    = CONST_BUF_NO_STRIDE;
+// Inst{83}    = MEGA_FETCH;
+// Inst{84}    = ALT_CONST;
+// Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-86} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+}
+
+def TEX_VTX_TEXBUF:
+  InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr",
+      [(set R600_Reg128:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>,
+VTX_WORD1_GPR, VTX_WORD0 {
+
+let VC_INST = 0;
+let FETCH_TYPE = 2;
+let FETCH_WHOLE_QUAD = 0;
+let SRC_REL = 0;
+let SRC_SEL_X = 0;
+let DST_REL = 0;
+let USE_CONST_FIELDS = 1;
+let NUM_FORMAT_ALL = 0;
+let FORMAT_COMP_ALL = 0;
+let SRF_MODE_ALL = 1;
+let MEGA_FETCH_COUNT = 16;
+let DST_SEL_X        = 0;
+let DST_SEL_Y        = 1;
+let DST_SEL_Z        = 2;
+let DST_SEL_W        = 3;
+let DATA_FORMAT      = 0;
+
+let Inst{31-0} = Word0;
+let Inst{63-32} = Word1;
+
+// LLVM can only encode 64-bit instructions, so these fields are manually
+// encoded in R600CodeEmitter
+//
+// bits<16> OFFSET;
+// bits<2>  ENDIAN_SWAP = 0;
+// bits<1>  CONST_BUF_NO_STRIDE = 0;
+// bits<1>  MEGA_FETCH = 0;
+// bits<1>  ALT_CONST = 0;
+// bits<2>  BUFFER_INDEX_MODE = 0;
+
+
+
+// VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+// is done in R600CodeEmitter
+//
+// Inst{79-64} = OFFSET;
+// Inst{81-80} = ENDIAN_SWAP;
+// Inst{82}    = CONST_BUF_NO_STRIDE;
+// Inst{83}    = MEGA_FETCH;
+// Inst{84}    = ALT_CONST;
+// Inst{86-85} = BUFFER_INDEX_MODE;
+// Inst{95-86} = 0; Reserved
+
+// VTX_WORD3 (Padding)
+//
+// Inst{127-96} = 0;
+}
+
+
+
+//===--------------------------------------------------------------------===//
+// Instructions support
+//===--------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
+// Custom Inserter for Branches and returns, this eventually will be a
+// seperate pass
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
+  def BRANCH : ILFormat<(outs), (ins brtarget:$target),
+      "; Pseudo unconditional branch instruction",
+      [(br bb:$target)]>;
+  defm BRANCH_COND : BranchConditional<IL_brcond>;
+}
+
+//===---------------------------------------------------------------------===//
+// Flow and Program control Instructions
+//===---------------------------------------------------------------------===//
+let isTerminator=1 in {
+  def SWITCH      : ILFormat< (outs), (ins GPRI32:$src),
+  !strconcat("SWITCH", " $src"), []>;
+  def CASE        : ILFormat< (outs), (ins GPRI32:$src),
+      !strconcat("CASE", " $src"), []>;
+  def BREAK       : ILFormat< (outs), (ins),
+      "BREAK", []>;
+  def CONTINUE    : ILFormat< (outs), (ins),
+      "CONTINUE", []>;
+  def DEFAULT     : ILFormat< (outs), (ins),
+      "DEFAULT", []>;
+  def ELSE        : ILFormat< (outs), (ins),
+      "ELSE", []>;
+  def ENDSWITCH   : ILFormat< (outs), (ins),
+      "ENDSWITCH", []>;
+  def ENDMAIN     : ILFormat< (outs), (ins),
+      "ENDMAIN", []>;
+  def END         : ILFormat< (outs), (ins),
+      "END", []>;
+  def ENDFUNC     : ILFormat< (outs), (ins),
+      "ENDFUNC", []>;
+  def ENDIF       : ILFormat< (outs), (ins),
+      "ENDIF", []>;
+  def WHILELOOP   : ILFormat< (outs), (ins),
+      "WHILE", []>;
+  def ENDLOOP     : ILFormat< (outs), (ins),
+      "ENDLOOP", []>;
+  def FUNC        : ILFormat< (outs), (ins),
+      "FUNC", []>;
+  def RETDYN      : ILFormat< (outs), (ins),
+      "RET_DYN", []>;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm IF_LOGICALNZ  : BranchInstr<"IF_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm IF_LOGICALZ   : BranchInstr<"IF_LOGICALZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
+  // This opcode has custom swizzle pattern encoded in Swizzle Encoder
+  defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
+  defm IFC         : BranchInstr2<"IFC">;
+  defm BREAKC      : BranchInstr2<"BREAKC">;
+  defm CONTINUEC   : BranchInstr2<"CONTINUEC">;
+}
+
+//===----------------------------------------------------------------------===//
+// ISel Patterns
+//===----------------------------------------------------------------------===//
+
+// CND*_INT Pattterns for f32 True / False values
+
+class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat <
+  (selectcc (i32 R600_Reg32:$src0), 0, (f32 R600_Reg32:$src1),
+                                            R600_Reg32:$src2, cc),
+  (cnd R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
+>;
+
+def : CND_INT_f32 <CNDE_INT,  SETEQ>;
+def : CND_INT_f32 <CNDGT_INT, SETGT>;
+def : CND_INT_f32 <CNDGE_INT, SETGE>;
+
+//CNDGE_INT extra pattern
+def : Pat <
+  (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1),
+                                        (i32 R600_Reg32:$src2), COND_GT),
+  (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
+>;
+
+// KIL Patterns
+def KILP : Pat <
+  (int_AMDGPU_kilp),
+  (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
+>;
+
+def KIL : Pat <
+  (int_AMDGPU_kill R600_Reg32:$src0),
+  (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0)))
+>;
+
+// SGT Reverse args
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT),
+  (SGT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SGE Reverse args
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE),
+  (SGE R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGT_DX10 reverse args
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LT),
+  (SETGT_DX10 R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGE_DX10 reverse args
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, COND_LE),
+  (SETGE_DX10 R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGT_INT reverse args
+def : Pat <
+  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT),
+  (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGE_INT reverse args
+def : Pat <
+  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE),
+  (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGT_UINT reverse args
+def : Pat <
+  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT),
+  (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGE_UINT reverse args
+def : Pat <
+  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE),
+  (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// The next two patterns are special cases for handling 'true if ordered' and
+// 'true if unordered' conditionals.  The assumption here is that the behavior of
+// SETE and SNE conforms to the Direct3D 10 rules for floating point values
+// described here:
+// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit
+// We assume that  SETE returns false when one of the operands is NAN and
+// SNE returns true when on of the operands is NAN
+
+//SETE - 'true if ordered'
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO),
+  (SETE R600_Reg32:$src0, R600_Reg32:$src1)
+>;
+
+//SETE_DX10 - 'true if ordered'
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETO),
+  (SETE_DX10 R600_Reg32:$src0, R600_Reg32:$src1)
+>;
+
+//SNE - 'true if unordered'
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO),
+  (SNE R600_Reg32:$src0, R600_Reg32:$src1)
+>;
+
+//SETNE_DX10 - 'true if ordered'
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUO),
+  (SETNE_DX10 R600_Reg32:$src0, R600_Reg32:$src1)
+>;
+
+def : Extract_Element <f32, v4f32, R600_Reg128, 0, sub0>;
+def : Extract_Element <f32, v4f32, R600_Reg128, 1, sub1>;
+def : Extract_Element <f32, v4f32, R600_Reg128, 2, sub2>;
+def : Extract_Element <f32, v4f32, R600_Reg128, 3, sub3>;
+
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sub0>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sub1>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sub2>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sub3>;
+
+def : Extract_Element <i32, v4i32, R600_Reg128, 0, sub0>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 1, sub1>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 2, sub2>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 3, sub3>;
+
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sub0>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sub1>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sub2>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sub3>;
+
+def : Vector4_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
+def : Vector4_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
+
+// bitconvert patterns
+
+def : BitConvert <i32, f32, R600_Reg32>;
+def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v4f32, v4i32, R600_Reg128>;
+def : BitConvert <v4i32, v4f32, R600_Reg128>;
+
+// DWORDADDR pattern
+def : DwordAddrPat  <i32, R600_Reg32>;
+
+} // End isR600toCayman Predicate
diff --git a/contrib/llvm/lib/Target/R600/R600Intrinsics.td b/contrib/llvm/lib/Target/R600/R600Intrinsics.td
new file mode 100644
index 000000000000..dc8980aef146
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600Intrinsics.td
@@ -0,0 +1,31 @@
+//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Intrinsic Definitions
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "R600", isTarget = 1 in {
+  def int_R600_load_input :
+    Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_interp_input :
+    Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_load_texbuf :
+    Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_store_swizzle :
+    Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_R600_store_stream_output :
+    Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_R600_store_pixel_depth :
+      Intrinsic<[], [llvm_float_ty], []>;
+  def int_R600_store_pixel_stencil :
+      Intrinsic<[], [llvm_float_ty], []>;
+  def int_R600_store_dummy :
+      Intrinsic<[], [llvm_i32_ty], []>;
+}
diff --git a/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..018b40363363
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.cpp
@@ -0,0 +1,18 @@
+//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
+  : AMDGPUMachineFunction(MF) { }
+
+
diff --git a/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.h b/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.h
new file mode 100644
index 000000000000..99c1f91b09b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600MachineFunctionInfo.h
@@ -0,0 +1,32 @@
+//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef R600MACHINEFUNCTIONINFO_H
+#define R600MACHINEFUNCTIONINFO_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "AMDGPUMachineFunction.h"
+#include <vector>
+
+namespace llvm {
+
+class R600MachineFunctionInfo : public AMDGPUMachineFunction {
+public:
+  R600MachineFunctionInfo(const MachineFunction &MF);
+  SmallVector<unsigned, 4> LiveOuts;
+  std::vector<unsigned> IndirectRegs;
+};
+
+} // End llvm namespace
+
+#endif //R600MACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp
new file mode 100644
index 000000000000..a777142a9e70
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600MachineScheduler.cpp
@@ -0,0 +1,427 @@
+//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "misched"
+
+#include "R600MachineScheduler.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+
+using namespace llvm;
+
+void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
+
+  DAG = dag;
+  TII = static_cast<const R600InstrInfo*>(DAG->TII);
+  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
+  MRI = &DAG->MRI;
+  Available[IDAlu]->clear();
+  Available[IDFetch]->clear();
+  Available[IDOther]->clear();
+  CurInstKind = IDOther;
+  CurEmitted = 0;
+  OccupedSlotsMask = 15;
+  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
+
+
+  const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
+  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD5XXX) {
+    InstKindLimit[IDFetch] = 7; // 8 minus 1 for security
+  } else {
+    InstKindLimit[IDFetch] = 15; // 16 minus 1 for security
+  }
+}
+
+void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst)
+{
+  if (QSrc->empty())
+    return;
+  for (ReadyQueue::iterator I = QSrc->begin(),
+      E = QSrc->end(); I != E; ++I) {
+    (*I)->NodeQueueId &= ~QSrc->getID();
+    QDst->push(*I);
+  }
+  QSrc->clear();
+}
+
+SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
+  SUnit *SU = 0;
+  IsTopNode = true;
+  NextInstKind = IDOther;
+
+  // check if we might want to switch current clause type
+  bool AllowSwitchToAlu = (CurInstKind == IDOther) ||
+      (CurEmitted > InstKindLimit[CurInstKind]) ||
+      (Available[CurInstKind]->empty());
+  bool AllowSwitchFromAlu = (CurEmitted > InstKindLimit[CurInstKind]) &&
+      (!Available[IDFetch]->empty() || !Available[IDOther]->empty());
+
+  if ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
+      (!AllowSwitchFromAlu && CurInstKind == IDAlu)) {
+    // try to pick ALU
+    SU = pickAlu();
+    if (SU) {
+      if (CurEmitted >  InstKindLimit[IDAlu])
+        CurEmitted = 0;
+      NextInstKind = IDAlu;
+    }
+  }
+
+  if (!SU) {
+    // try to pick FETCH
+    SU = pickOther(IDFetch);
+    if (SU)
+      NextInstKind = IDFetch;
+  }
+
+  // try to pick other
+  if (!SU) {
+    SU = pickOther(IDOther);
+    if (SU)
+      NextInstKind = IDOther;
+  }
+
+  DEBUG(
+      if (SU) {
+        dbgs() << "picked node: ";
+        SU->dump(DAG);
+      } else {
+        dbgs() << "NO NODE ";
+        for (int i = 0; i < IDLast; ++i) {
+          Available[i]->dump();
+          Pending[i]->dump();
+        }
+        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
+          const SUnit &S = DAG->SUnits[i];
+          if (!S.isScheduled)
+            S.dump(DAG);
+        }
+      }
+  );
+
+  return SU;
+}
+
+void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
+
+  DEBUG(dbgs() << "scheduled: ");
+  DEBUG(SU->dump(DAG));
+
+  if (NextInstKind != CurInstKind) {
+    DEBUG(dbgs() << "Instruction Type Switch\n");
+    if (NextInstKind != IDAlu)
+      OccupedSlotsMask = 15;
+    CurEmitted = 0;
+    CurInstKind = NextInstKind;
+  }
+
+  if (CurInstKind == IDAlu) {
+    switch (getAluKind(SU)) {
+    case AluT_XYZW:
+      CurEmitted += 4;
+      break;
+    case AluDiscarded:
+      break;
+    default: {
+      ++CurEmitted;
+      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
+          E = SU->getInstr()->operands_end(); It != E; ++It) {
+        MachineOperand &MO = *It;
+        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
+          ++CurEmitted;
+      }
+    }
+    }
+  } else {
+    ++CurEmitted;
+  }
+
+
+  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
+
+  if (CurInstKind != IDFetch) {
+    MoveUnits(Pending[IDFetch], Available[IDFetch]);
+  }
+  MoveUnits(Pending[IDOther], Available[IDOther]);
+}
+
+void R600SchedStrategy::releaseTopNode(SUnit *SU) {
+  int IK = getInstKind(SU);
+
+  DEBUG(dbgs() << IK << " <= ");
+  DEBUG(SU->dump(DAG));
+
+  Pending[IK]->push(SU);
+}
+
+void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
+}
+
+bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
+                                          const TargetRegisterClass *RC) const {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    return RC->contains(Reg);
+  } else {
+    return MRI->getRegClass(Reg) == RC;
+  }
+}
+
+R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
+  MachineInstr *MI = SU->getInstr();
+
+    switch (MI->getOpcode()) {
+    case AMDGPU::INTERP_PAIR_XY:
+    case AMDGPU::INTERP_PAIR_ZW:
+    case AMDGPU::INTERP_VEC_LOAD:
+      return AluT_XYZW;
+    case AMDGPU::COPY:
+      if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
+        // %vregX = COPY Tn_X is likely to be discarded in favor of an
+        // assignement of Tn_X to %vregX, don't considers it in scheduling
+        return AluDiscarded;
+      }
+      else if (MI->getOperand(1).isUndef()) {
+        // MI will become a KILL, don't considers it in scheduling
+        return AluDiscarded;
+      }
+    default:
+      break;
+    }
+
+    // Does the instruction take a whole IG ?
+    if(TII->isVector(*MI) ||
+        TII->isCubeOp(MI->getOpcode()) ||
+        TII->isReductionOp(MI->getOpcode()))
+      return AluT_XYZW;
+
+    // Is the result already assigned to a channel ?
+    unsigned DestSubReg = MI->getOperand(0).getSubReg();
+    switch (DestSubReg) {
+    case AMDGPU::sub0:
+      return AluT_X;
+    case AMDGPU::sub1:
+      return AluT_Y;
+    case AMDGPU::sub2:
+      return AluT_Z;
+    case AMDGPU::sub3:
+      return AluT_W;
+    default:
+      break;
+    }
+
+    // Is the result already member of a X/Y/Z/W class ?
+    unsigned DestReg = MI->getOperand(0).getReg();
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
+        regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
+      return AluT_X;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
+      return AluT_Y;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
+      return AluT_Z;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
+      return AluT_W;
+    if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
+      return AluT_XYZW;
+
+    return AluAny;
+
+}
+
+int R600SchedStrategy::getInstKind(SUnit* SU) {
+  int Opcode = SU->getInstr()->getOpcode();
+
+  if (TII->isALUInstr(Opcode)) {
+    return IDAlu;
+  }
+
+  switch (Opcode) {
+  case AMDGPU::COPY:
+  case AMDGPU::CONST_COPY:
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::DOT4_eg_pseudo:
+  case AMDGPU::DOT4_r600_pseudo:
+    return IDAlu;
+  case AMDGPU::TEX_VTX_CONSTBUF:
+  case AMDGPU::TEX_VTX_TEXBUF:
+  case AMDGPU::TEX_LD:
+  case AMDGPU::TEX_GET_TEXTURE_RESINFO:
+  case AMDGPU::TEX_GET_GRADIENTS_H:
+  case AMDGPU::TEX_GET_GRADIENTS_V:
+  case AMDGPU::TEX_SET_GRADIENTS_H:
+  case AMDGPU::TEX_SET_GRADIENTS_V:
+  case AMDGPU::TEX_SAMPLE:
+  case AMDGPU::TEX_SAMPLE_C:
+  case AMDGPU::TEX_SAMPLE_L:
+  case AMDGPU::TEX_SAMPLE_C_L:
+  case AMDGPU::TEX_SAMPLE_LB:
+  case AMDGPU::TEX_SAMPLE_C_LB:
+  case AMDGPU::TEX_SAMPLE_G:
+  case AMDGPU::TEX_SAMPLE_C_G:
+  case AMDGPU::TXD:
+  case AMDGPU::TXD_SHADOW:
+    return IDFetch;
+  default:
+    DEBUG(
+        dbgs() << "other inst: ";
+        SU->dump(DAG);
+    );
+    return IDOther;
+  }
+}
+
+SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
+  if (Q.empty())
+    return NULL;
+  for (std::set<SUnit *, CompareSUnit>::iterator It = Q.begin(), E = Q.end();
+      It != E; ++It) {
+    SUnit *SU = *It;
+    InstructionsGroupCandidate.push_back(SU->getInstr());
+    if (TII->canBundle(InstructionsGroupCandidate)) {
+      InstructionsGroupCandidate.pop_back();
+      Q.erase(It);
+      return SU;
+    } else {
+      InstructionsGroupCandidate.pop_back();
+    }
+  }
+  return NULL;
+}
+
+void R600SchedStrategy::LoadAlu() {
+  ReadyQueue *QSrc = Pending[IDAlu];
+  for (ReadyQueue::iterator I = QSrc->begin(),
+        E = QSrc->end(); I != E; ++I) {
+      (*I)->NodeQueueId &= ~QSrc->getID();
+      AluKind AK = getAluKind(*I);
+      AvailableAlus[AK].insert(*I);
+    }
+    QSrc->clear();
+}
+
+void R600SchedStrategy::PrepareNextSlot() {
+  DEBUG(dbgs() << "New Slot\n");
+  assert (OccupedSlotsMask && "Slot wasn't filled");
+  OccupedSlotsMask = 0;
+  InstructionsGroupCandidate.clear();
+  LoadAlu();
+}
+
+void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
+  unsigned DestReg = MI->getOperand(0).getReg();
+  // PressureRegister crashes if an operand is def and used in the same inst
+  // and we try to constraint its regclass
+  for (MachineInstr::mop_iterator It = MI->operands_begin(),
+      E = MI->operands_end(); It != E; ++It) {
+    MachineOperand &MO = *It;
+    if (MO.isReg() && !MO.isDef() &&
+        MO.getReg() == MI->getOperand(0).getReg())
+      return;
+  }
+  // Constrains the regclass of DestReg to assign it to Slot
+  switch (Slot) {
+  case 0:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
+    break;
+  case 1:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
+    break;
+  case 2:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
+    break;
+  case 3:
+    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
+    break;
+  }
+}
+
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
+  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
+  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
+  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
+  if (!UnslotedSU) {
+    return SlotedSU;
+  } else if (!SlotedSU) {
+    AssignSlot(UnslotedSU->getInstr(), Slot);
+    return UnslotedSU;
+  } else {
+    //Determine which one to pick (the lesser one)
+    if (CompareSUnit()(SlotedSU, UnslotedSU)) {
+      AvailableAlus[AluAny].insert(UnslotedSU);
+      return SlotedSU;
+    } else {
+      AvailableAlus[IndexToID[Slot]].insert(SlotedSU);
+      AssignSlot(UnslotedSU->getInstr(), Slot);
+      return UnslotedSU;
+    }
+  }
+}
+
+bool R600SchedStrategy::isAvailablesAluEmpty() const {
+  return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() &&
+      AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
+      AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
+      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();
+}
+
+SUnit* R600SchedStrategy::pickAlu() {
+  while (!isAvailablesAluEmpty()) {
+    if (!OccupedSlotsMask) {
+      // Flush physical reg copies (RA will discard them)
+      if (!AvailableAlus[AluDiscarded].empty()) {
+        OccupedSlotsMask = 15;
+        return PopInst(AvailableAlus[AluDiscarded]);
+      }
+      // If there is a T_XYZW alu available, use it
+      if (!AvailableAlus[AluT_XYZW].empty()) {
+        OccupedSlotsMask = 15;
+        return PopInst(AvailableAlus[AluT_XYZW]);
+      }
+    }
+    for (unsigned Chan = 0; Chan < 4; ++Chan) {
+      bool isOccupied = OccupedSlotsMask & (1 << Chan);
+      if (!isOccupied) {
+        SUnit *SU = AttemptFillSlot(Chan);
+        if (SU) {
+          OccupedSlotsMask |= (1 << Chan);
+          InstructionsGroupCandidate.push_back(SU->getInstr());
+          return SU;
+        }
+      }
+    }
+    PrepareNextSlot();
+  }
+  return NULL;
+}
+
+SUnit* R600SchedStrategy::pickOther(int QID) {
+  SUnit *SU = 0;
+  ReadyQueue *AQ = Available[QID];
+
+  if (AQ->empty()) {
+    MoveUnits(Pending[QID], AQ);
+  }
+  if (!AQ->empty()) {
+    SU = *AQ->begin();
+    AQ->remove(AQ->begin());
+  }
+  return SU;
+}
+
diff --git a/contrib/llvm/lib/Target/R600/R600MachineScheduler.h b/contrib/llvm/lib/Target/R600/R600MachineScheduler.h
new file mode 100644
index 000000000000..3d0367fd8ebf
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600MachineScheduler.h
@@ -0,0 +1,120 @@
+//===-- R600MachineScheduler.h - R600 Scheduler Interface -*- C++ -*-------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600MACHINESCHEDULER_H_
+#define R600MACHINESCHEDULER_H_
+
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/PriorityQueue.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+class CompareSUnit {
+public:
+  bool operator()(const SUnit *S1, const SUnit *S2) {
+    return S1->getDepth() > S2->getDepth();
+  }
+};
+
+class R600SchedStrategy : public MachineSchedStrategy {
+
+  const ScheduleDAGMI *DAG;
+  const R600InstrInfo *TII;
+  const R600RegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+  enum InstQueue {
+    QAlu = 1,
+    QFetch = 2,
+    QOther = 4
+  };
+
+  enum InstKind {
+    IDAlu,
+    IDFetch,
+    IDOther,
+    IDLast
+  };
+
+  enum AluKind {
+    AluAny,
+    AluT_X,
+    AluT_Y,
+    AluT_Z,
+    AluT_W,
+    AluT_XYZW,
+    AluDiscarded, // LLVM Instructions that are going to be eliminated
+    AluLast
+  };
+
+  ReadyQueue *Available[IDLast], *Pending[IDLast];
+  std::multiset<SUnit *, CompareSUnit> AvailableAlus[AluLast];
+
+  InstKind CurInstKind;
+  int CurEmitted;
+  InstKind NextInstKind;
+
+  int InstKindLimit[IDLast];
+
+  int OccupedSlotsMask;
+
+public:
+  R600SchedStrategy() :
+    DAG(0), TII(0), TRI(0), MRI(0) {
+    Available[IDAlu] = new ReadyQueue(QAlu, "AAlu");
+    Available[IDFetch] = new ReadyQueue(QFetch, "AFetch");
+    Available[IDOther] = new ReadyQueue(QOther, "AOther");
+    Pending[IDAlu] = new ReadyQueue(QAlu<<4, "PAlu");
+    Pending[IDFetch] = new ReadyQueue(QFetch<<4, "PFetch");
+    Pending[IDOther] = new ReadyQueue(QOther<<4, "POther");
+  }
+
+  virtual ~R600SchedStrategy() {
+    for (unsigned I = 0; I < IDLast; ++I) {
+      delete Available[I];
+      delete Pending[I];
+    }
+  }
+
+  virtual void initialize(ScheduleDAGMI *dag);
+  virtual SUnit *pickNode(bool &IsTopNode);
+  virtual void schedNode(SUnit *SU, bool IsTopNode);
+  virtual void releaseTopNode(SUnit *SU);
+  virtual void releaseBottomNode(SUnit *SU);
+
+private:
+  std::vector<MachineInstr *> InstructionsGroupCandidate;
+
+  int getInstKind(SUnit *SU);
+  bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
+  AluKind getAluKind(SUnit *SU) const;
+  void LoadAlu();
+  bool isAvailablesAluEmpty() const;
+  SUnit *AttemptFillSlot (unsigned Slot);
+  void PrepareNextSlot();
+  SUnit *PopInst(std::multiset<SUnit *, CompareSUnit> &Q);
+
+  void AssignSlot(MachineInstr *MI, unsigned Slot);
+  SUnit* pickAlu();
+  SUnit* pickOther(int QID);
+  void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst);
+};
+
+} // namespace llvm
+
+#endif /* R600MACHINESCHEDULER_H_ */
diff --git a/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp
new file mode 100644
index 000000000000..bbd7995d7d51
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600RegisterInfo.cpp
@@ -0,0 +1,99 @@
+//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600RegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
+    const TargetInstrInfo &tii)
+: AMDGPURegisterInfo(tm, tii),
+  TM(tm),
+  TII(tii)
+  { }
+
+BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+
+  Reserved.set(AMDGPU::ZERO);
+  Reserved.set(AMDGPU::HALF);
+  Reserved.set(AMDGPU::ONE);
+  Reserved.set(AMDGPU::ONE_INT);
+  Reserved.set(AMDGPU::NEG_HALF);
+  Reserved.set(AMDGPU::NEG_ONE);
+  Reserved.set(AMDGPU::PV_X);
+  Reserved.set(AMDGPU::ALU_LITERAL_X);
+  Reserved.set(AMDGPU::ALU_CONST);
+  Reserved.set(AMDGPU::PREDICATE_BIT);
+  Reserved.set(AMDGPU::PRED_SEL_OFF);
+  Reserved.set(AMDGPU::PRED_SEL_ZERO);
+  Reserved.set(AMDGPU::PRED_SEL_ONE);
+
+  for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
+                        E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
+    Reserved.set(*I);
+  }
+
+  for (TargetRegisterClass::iterator I = AMDGPU::TRegMemRegClass.begin(),
+                                     E = AMDGPU::TRegMemRegClass.end();
+                                     I !=  E; ++I) {
+    Reserved.set(*I);
+  }
+
+  const R600InstrInfo *RII = static_cast<const R600InstrInfo*>(&TII);
+  std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF);
+  for (std::vector<unsigned>::iterator I = IndirectRegs.begin(),
+                                       E = IndirectRegs.end();
+                                       I != E; ++I) {
+    Reserved.set(*I);
+  }
+  return Reserved;
+}
+
+const TargetRegisterClass *
+R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
+  switch (rc->getID()) {
+  case AMDGPU::GPRF32RegClassID:
+  case AMDGPU::GPRI32RegClassID:
+    return &AMDGPU::R600_Reg32RegClass;
+  default: return rc;
+  }
+}
+
+unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
+  return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
+}
+
+const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
+                                                                   MVT VT) const {
+  switch(VT.SimpleTy) {
+  default:
+  case MVT::i32: return &AMDGPU::R600_TReg32RegClass;
+  }
+}
+
+unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+  switch (Channel) {
+    default: assert(!"Invalid channel index"); return 0;
+    case 0: return AMDGPU::sub0;
+    case 1: return AMDGPU::sub1;
+    case 2: return AMDGPU::sub2;
+    case 3: return AMDGPU::sub3;
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/R600/R600RegisterInfo.h b/contrib/llvm/lib/Target/R600/R600RegisterInfo.h
new file mode 100644
index 000000000000..f9ca918f246b
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600RegisterInfo.h
@@ -0,0 +1,55 @@
+//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600RegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600REGISTERINFO_H_
+#define R600REGISTERINFO_H_
+
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+namespace llvm {
+
+class R600TargetMachine;
+class TargetInstrInfo;
+
+struct R600RegisterInfo : public AMDGPURegisterInfo {
+  AMDGPUTargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+  R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+
+  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  /// \param RC is an AMDIL reg class.
+  ///
+  /// \returns the R600 reg class that is equivalent to \p RC.
+  virtual const TargetRegisterClass *getISARegClass(
+    const TargetRegisterClass *RC) const;
+
+  /// \brief get the HW encoding for a register's channel.
+  unsigned getHWRegChan(unsigned reg) const;
+
+  /// \brief get the register class of the specified type to use in the
+  /// CFGStructurizer
+  virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+
+  /// \returns the sub reg enum value for the given \p Channel
+  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
+  unsigned getSubRegFromChannel(unsigned Channel) const;
+
+};
+
+} // End namespace llvm
+
+#endif // AMDIDSAREGISTERINFO_H_
diff --git a/contrib/llvm/lib/Target/R600/R600RegisterInfo.td b/contrib/llvm/lib/Target/R600/R600RegisterInfo.td
new file mode 100644
index 000000000000..03f49761ea40
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600RegisterInfo.td
@@ -0,0 +1,209 @@
+
+class R600Reg <string name, bits<16> encoding> : Register<name> {
+  let Namespace = "AMDGPU";
+  let HWEncoding = encoding;
+}
+
+class R600RegWithChan <string name, bits<9> sel, string chan> :
+    Register <name> {
+
+  field bits<2> chan_encoding = !if(!eq(chan, "X"), 0,
+                                !if(!eq(chan, "Y"), 1,
+                                !if(!eq(chan, "Z"), 2,
+                                !if(!eq(chan, "W"), 3, 0))));
+  let HWEncoding{8-0}  = sel;
+  let HWEncoding{10-9} = chan_encoding;
+  let Namespace = "AMDGPU";
+}
+
+class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1, sub2, sub3];
+  let HWEncoding = encoding;
+}
+
+foreach Index = 0-127 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
+
+    // Indirect addressing offset registers
+    def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
+                                              Index, Chan>;
+    def TRegMem#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index,
+                                                Chan>;
+  }
+  // 128-bit Temporary Registers
+  def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW",
+                                   [!cast<Register>("T"#Index#"_X"),
+                                    !cast<Register>("T"#Index#"_Y"),
+                                    !cast<Register>("T"#Index#"_Z"),
+                                    !cast<Register>("T"#Index#"_W")],
+                                   Index>;
+}
+
+// KCACHE_BANK0
+foreach Index = 159-128 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#Index#"-128]."#Chan, Index, Chan>;
+  }
+  // 128-bit Temporary Registers
+  def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#Index#"-128].XYZW",
+                                 [!cast<Register>("KC0_"#Index#"_X"),
+                                  !cast<Register>("KC0_"#Index#"_Y"),
+                                  !cast<Register>("KC0_"#Index#"_Z"),
+                                  !cast<Register>("KC0_"#Index#"_W")],
+                                 Index>;
+}
+
+// KCACHE_BANK1
+foreach Index = 191-160 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#Index#"-160]."#Chan, Index, Chan>;
+  }
+  // 128-bit Temporary Registers
+  def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#Index#"-160].XYZW",
+                                 [!cast<Register>("KC1_"#Index#"_X"),
+                                  !cast<Register>("KC1_"#Index#"_Y"),
+                                  !cast<Register>("KC1_"#Index#"_Z"),
+                                  !cast<Register>("KC1_"#Index#"_W")],
+                                 Index>;
+}
+
+
+// Array Base Register holding input in FS
+foreach Index = 448-480 in {
+  def ArrayBase#Index :  R600Reg<"ARRAY_BASE", Index>;
+}
+
+
+// Special Registers
+
+def ZERO : R600Reg<"0.0", 248>;
+def ONE : R600Reg<"1.0", 249>;
+def NEG_ONE : R600Reg<"-1.0", 249>;
+def ONE_INT : R600Reg<"1", 250>;
+def HALF : R600Reg<"0.5", 252>;
+def NEG_HALF : R600Reg<"-0.5", 252>;
+def ALU_LITERAL_X : R600Reg<"literal.x", 253>;
+def PV_X : R600Reg<"pv.x", 254>;
+def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
+def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
+def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
+def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>;
+def AR_X : R600Reg<"AR.x", 0>;
+
+def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
+                          (add (sequence "ArrayBase%u", 448, 480))>;
+// special registers for ALU src operands
+// const buffer reference, SRCx_SEL contains index
+def ALU_CONST : R600Reg<"CBuf", 0>;
+// interpolation param reference, SRCx_SEL contains index
+def ALU_PARAM : R600Reg<"Param", 0>;
+
+let isAllocatable = 0 in {
+
+// XXX: Only use the X channel, until we support wider stack widths
+def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
+
+} // End isAllocatable = 0
+
+def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_X", 128, 159))>;
+
+def R600_KC0_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_Y", 128, 159))>;
+
+def R600_KC0_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_Z", 128, 159))>;
+
+def R600_KC0_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC0_%u_W", 128, 159))>;
+
+def R600_KC0 : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (interleave R600_KC0_X, R600_KC0_Y,
+                                               R600_KC0_Z, R600_KC0_W)>;
+
+def R600_KC1_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_X", 160, 191))>;
+
+def R600_KC1_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_Y", 160, 191))>;
+
+def R600_KC1_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_Z", 160, 191))>;
+
+def R600_KC1_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+                              (add (sequence "KC1_%u_W", 160, 191))>;
+
+def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (interleave R600_KC1_X, R600_KC1_Y,
+                                               R600_KC1_Z, R600_KC1_W)>;
+
+def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_X", 0, 127), AR_X)>;
+
+def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_Y", 0, 127))>;
+
+def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_Z", 0, 127))>;
+
+def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (add (sequence "T%u_W", 0, 127))>;
+
+def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32,
+                                   (interleave R600_TReg32_X, R600_TReg32_Y,
+                                               R600_TReg32_Z, R600_TReg32_W)>;
+
+def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
+    R600_TReg32,
+    R600_ArrayBase,
+    R600_Addr,
+    ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
+    ALU_CONST, ALU_PARAM
+    )>;
+
+def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
+    PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>;
+
+def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add
+    PREDICATE_BIT)>;
+
+def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
+                                (add (sequence "T%u_XYZW", 0, 127))> {
+  let CopyCost = -1;
+}
+
+//===----------------------------------------------------------------------===//
+// Register classes for indirect addressing
+//===----------------------------------------------------------------------===//
+
+// Super register for all the Indirect Registers.  This register class is used
+// by the REG_SEQUENCE instruction to specify the registers to use for direct
+// reads / writes which may be written / read by an indirect address.
+class IndirectSuper<string n, list<Register> subregs> :
+    RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices =
+ [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+  sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15];
+}
+
+def IndirectSuperReg : IndirectSuper<"Indirect",
+  [TRegMem0_X, TRegMem1_X, TRegMem2_X, TRegMem3_X, TRegMem4_X, TRegMem5_X,
+   TRegMem6_X, TRegMem7_X, TRegMem8_X, TRegMem9_X, TRegMem10_X, TRegMem11_X,
+   TRegMem12_X, TRegMem13_X, TRegMem14_X, TRegMem15_X]
+>;
+
+def IndirectReg : RegisterClass<"AMDGPU", [f32, i32], 32, (add IndirectSuperReg)>;
+
+// This register class defines the registers that are the storage units for
+// the "Indirect Addressing" pseudo memory space.
+// XXX: Only use the X channel, until we support wider stack widths
+def TRegMem : RegisterClass<"AMDGPU", [f32, i32], 32,
+  (add (sequence "TRegMem%u_X", 0, 16))
+>;
diff --git a/contrib/llvm/lib/Target/R600/R600Schedule.td b/contrib/llvm/lib/Target/R600/R600Schedule.td
new file mode 100644
index 000000000000..7ede181c51dc
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/R600Schedule.td
@@ -0,0 +1,36 @@
+//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 has a VLIW architecture.  On pre-cayman cards there are 5 instruction
+// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS.  For cayman cards, the TRANS
+// slot has been removed. 
+//
+//===----------------------------------------------------------------------===//
+
+
+def ALU_X : FuncUnit;
+def ALU_Y : FuncUnit;
+def ALU_Z : FuncUnit;
+def ALU_W : FuncUnit;
+def TRANS : FuncUnit;
+
+def AnyALU : InstrItinClass;
+def VecALU : InstrItinClass;
+def TransALU : InstrItinClass;
+
+def R600_EG_Itin : ProcessorItineraries <
+  [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
+  [],
+  [
+    InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_X, ALU_W]>]>,
+    InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
+    InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
+  ]
+>;
diff --git a/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp
new file mode 100644
index 000000000000..2477e2a9dcc3
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -0,0 +1,329 @@
+//===-- SIAnnotateControlFlow.cpp -  ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Annotates the control flow with hardware specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+namespace {
+
+// Complex types used in this pass
+typedef std::pair<BasicBlock *, Value *> StackEntry;
+typedef SmallVector<StackEntry, 16> StackVector;
+
+// Intrinsic names the control flow is annotated with
+static const char *IfIntrinsic = "llvm.SI.if";
+static const char *ElseIntrinsic = "llvm.SI.else";
+static const char *BreakIntrinsic = "llvm.SI.break";
+static const char *IfBreakIntrinsic = "llvm.SI.if.break";
+static const char *ElseBreakIntrinsic = "llvm.SI.else.break";
+static const char *LoopIntrinsic = "llvm.SI.loop";
+static const char *EndCfIntrinsic = "llvm.SI.end.cf";
+
+class SIAnnotateControlFlow : public FunctionPass {
+
+  static char ID;
+
+  Type *Boolean;
+  Type *Void;
+  Type *Int64;
+  Type *ReturnStruct;
+
+  ConstantInt *BoolTrue;
+  ConstantInt *BoolFalse;
+  UndefValue *BoolUndef;
+  Constant *Int64Zero;
+
+  Constant *If;
+  Constant *Else;
+  Constant *Break;
+  Constant *IfBreak;
+  Constant *ElseBreak;
+  Constant *Loop;
+  Constant *EndCf;
+
+  DominatorTree *DT;
+  StackVector Stack;
+  SSAUpdater PhiInserter;
+
+  bool isTopOfStack(BasicBlock *BB);
+
+  Value *popSaved();
+
+  void push(BasicBlock *BB, Value *Saved);
+
+  bool isElse(PHINode *Phi);
+
+  void eraseIfUnused(PHINode *Phi);
+
+  void openIf(BranchInst *Term);
+
+  void insertElse(BranchInst *Term);
+
+  void handleLoopCondition(Value *Cond);
+
+  void handleLoop(BranchInst *Term);
+
+  void closeControlFlow(BasicBlock *BB);
+
+public:
+  SIAnnotateControlFlow():
+    FunctionPass(ID) { }
+
+  virtual bool doInitialization(Module &M);
+
+  virtual bool runOnFunction(Function &F);
+
+  virtual const char *getPassName() const {
+    return "SI annotate control flow";
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<DominatorTree>();
+    AU.addPreserved<DominatorTree>();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+};
+
+} // end anonymous namespace
+
+char SIAnnotateControlFlow::ID = 0;
+
+/// \brief Initialize all the types and constants used in the pass
+bool SIAnnotateControlFlow::doInitialization(Module &M) {
+  LLVMContext &Context = M.getContext();
+
+  Void = Type::getVoidTy(Context);
+  Boolean = Type::getInt1Ty(Context);
+  Int64 = Type::getInt64Ty(Context);
+  ReturnStruct = StructType::get(Boolean, Int64, (Type *)0);
+
+  BoolTrue = ConstantInt::getTrue(Context);
+  BoolFalse = ConstantInt::getFalse(Context);
+  BoolUndef = UndefValue::get(Boolean);
+  Int64Zero = ConstantInt::get(Int64, 0);
+
+  If = M.getOrInsertFunction(
+    IfIntrinsic, ReturnStruct, Boolean, (Type *)0);
+
+  Else = M.getOrInsertFunction(
+    ElseIntrinsic, ReturnStruct, Int64, (Type *)0);
+
+  Break = M.getOrInsertFunction(
+    BreakIntrinsic, Int64, Int64, (Type *)0);
+
+  IfBreak = M.getOrInsertFunction(
+    IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0);
+
+  ElseBreak = M.getOrInsertFunction(
+    ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0);
+
+  Loop = M.getOrInsertFunction(
+    LoopIntrinsic, Boolean, Int64, (Type *)0);
+
+  EndCf = M.getOrInsertFunction(
+    EndCfIntrinsic, Void, Int64, (Type *)0);
+
+  return false;
+}
+
+/// \brief Is BB the last block saved on the stack ?
+bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) {
+  return !Stack.empty() && Stack.back().first == BB;
+}
+
+/// \brief Pop the last saved value from the control flow stack
+Value *SIAnnotateControlFlow::popSaved() {
+  return Stack.pop_back_val().second;
+}
+
+/// \brief Push a BB and saved value to the control flow stack
+void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) {
+  Stack.push_back(std::make_pair(BB, Saved));
+}
+
+/// \brief Can the condition represented by this PHI node treated like
+/// an "Else" block?
+bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
+  BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock();
+  for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+    if (Phi->getIncomingBlock(i) == IDom) {
+
+      if (Phi->getIncomingValue(i) != BoolTrue)
+        return false;
+
+    } else {
+      if (Phi->getIncomingValue(i) != BoolFalse)
+        return false;
+ 
+    }
+  }
+  return true;
+}
+
+// \brief Erase "Phi" if it is not used any more
+void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
+  if (!Phi->hasNUsesOrMore(1))
+    Phi->eraseFromParent();
+}
+
+/// \brief Open a new "If" block
+void SIAnnotateControlFlow::openIf(BranchInst *Term) {
+  Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
+  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
+  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+}
+
+/// \brief Close the last "If" block and open a new "Else" block
+void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
+  Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
+  Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
+  push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+}
+
+/// \brief Recursively handle the condition leading to a loop
+void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) {
+  if (PHINode *Phi = dyn_cast<PHINode>(Cond)) {
+
+    // Handle all non constant incoming values first
+    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+      Value *Incoming = Phi->getIncomingValue(i);
+      if (isa<ConstantInt>(Incoming))
+        continue;
+
+      Phi->setIncomingValue(i, BoolFalse);
+      handleLoopCondition(Incoming);
+    }
+
+    BasicBlock *Parent = Phi->getParent();
+    BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
+
+    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
+
+      Value *Incoming = Phi->getIncomingValue(i);
+      if (Incoming != BoolTrue)
+        continue;
+
+      BasicBlock *From = Phi->getIncomingBlock(i);
+      if (From == IDom) {
+        CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
+        if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
+          Value *Args[] = {
+            OldEnd->getArgOperand(0),
+            PhiInserter.GetValueAtEndOfBlock(Parent)
+          };
+          Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
+          PhiInserter.AddAvailableValue(Parent, Ret);
+          continue;
+        }
+      }
+
+      TerminatorInst *Insert = From->getTerminator();
+      Value *Arg = PhiInserter.GetValueAtEndOfBlock(From);
+      Value *Ret = CallInst::Create(Break, Arg, "", Insert);
+      PhiInserter.AddAvailableValue(From, Ret);
+    }
+    eraseIfUnused(Phi);
+
+  } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
+    BasicBlock *Parent = Inst->getParent();
+    TerminatorInst *Insert = Parent->getTerminator();
+    Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) };
+    Value *Ret = CallInst::Create(IfBreak, Args, "", Insert);
+    PhiInserter.AddAvailableValue(Parent, Ret);
+
+  } else {
+    assert(0 && "Unhandled loop condition!");
+  }
+}
+
+/// \brief Handle a back edge (loop)
+void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
+  BasicBlock *Target = Term->getSuccessor(1);
+  PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
+
+  PhiInserter.Initialize(Int64, "");
+  PhiInserter.AddAvailableValue(Target, Broken);
+
+  Value *Cond = Term->getCondition();
+  Term->setCondition(BoolTrue);
+  handleLoopCondition(Cond);
+
+  BasicBlock *BB = Term->getParent();
+  Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB);
+  for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
+       PI != PE; ++PI) {
+
+    Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
+  }
+
+  Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
+  push(Term->getSuccessor(0), Arg);
+}
+
+/// \brief Close the last opened control flow
+void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
+  CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt());
+}
+
+/// \brief Annotate the control flow with intrinsics so the backend can
+/// recognize if/then/else and loops.
+bool SIAnnotateControlFlow::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTree>();
+
+  for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
+       E = df_end(&F.getEntryBlock()); I != E; ++I) {
+
+    BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
+
+    if (!Term || Term->isUnconditional()) {
+      if (isTopOfStack(*I))
+        closeControlFlow(*I);
+      continue;
+    }
+
+    if (I.nodeVisited(Term->getSuccessor(1))) {
+      if (isTopOfStack(*I))
+        closeControlFlow(*I);
+      handleLoop(Term);
+      continue;
+    }
+
+    if (isTopOfStack(*I)) {
+      PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
+      if (Phi && Phi->getParent() == *I && isElse(Phi)) {
+        insertElse(Term);
+        eraseIfUnused(Phi);
+        continue;
+      }
+      closeControlFlow(*I);
+    }
+    openIf(Term);
+  }
+
+  assert(Stack.empty());
+  return true;
+}
+
+/// \brief Create the annotation pass
+FunctionPass *llvm::createSIAnnotateControlFlowPass() {
+  return new SIAnnotateControlFlow();
+}
diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.cpp b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
new file mode 100644
index 000000000000..6f0c30761506
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIISelLowering.cpp
@@ -0,0 +1,670 @@
+//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Custom DAG lowering for SI
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIISelLowering.h"
+#include "AMDIL.h"
+#include "AMDGPU.h"
+#include "AMDILIntrinsicInfo.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+using namespace llvm;
+
+SITargetLowering::SITargetLowering(TargetMachine &TM) :
+    AMDGPUTargetLowering(TM),
+    TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())),
+    TRI(TM.getRegisterInfo()) {
+
+  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
+  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
+
+  addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
+  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
+
+  addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
+
+  addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass);
+
+  addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
+  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
+  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+
+  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
+  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+
+  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
+  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+
+  computeRegisterProperties();
+
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
+  setOperationAction(ISD::ADD, MVT::i64, Legal);
+  setOperationAction(ISD::ADD, MVT::i32, Legal);
+
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setTargetDAGCombine(ISD::SELECT_CC);
+
+  setTargetDAGCombine(ISD::SETCC);
+
+  setSchedulingPreference(Sched::RegPressure);
+}
+
+SDValue SITargetLowering::LowerFormalArguments(
+                                      SDValue Chain,
+                                      CallingConv::ID CallConv,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      DebugLoc DL, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) const {
+
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  FunctionType *FType = MF.getFunction()->getFunctionType();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+  assert(CallConv == CallingConv::C);
+
+  SmallVector<ISD::InputArg, 16> Splits;
+  uint32_t Skipped = 0;
+
+  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
+    const ISD::InputArg &Arg = Ins[i];
+   
+    // First check if it's a PS input addr 
+    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {
+
+      assert((PSInputNum <= 15) && "Too many PS inputs!");
+
+      if (!Arg.Used) {
+        // We can savely skip PS inputs
+        Skipped |= 1 << i;
+        ++PSInputNum;
+        continue;
+      }
+
+      Info->PSInputAddr |= 1 << PSInputNum++;
+    }
+
+    // Second split vertices into their elements
+    if (Arg.VT.isVector()) {
+      ISD::InputArg NewArg = Arg;
+      NewArg.Flags.setSplit();
+      NewArg.VT = Arg.VT.getVectorElementType();
+
+      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+      // three or five element vertex only needs three or five registers,
+      // NOT four or eigth.
+      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      for (unsigned j = 0; j != NumElements; ++j) {
+        Splits.push_back(NewArg);
+        NewArg.PartOffset += NewArg.VT.getStoreSize();
+      }
+
+    } else {
+      Splits.push_back(Arg);
+    }
+  }
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // At least one interpolation mode must be enabled or else the GPU will hang.
+  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
+    Info->PSInputAddr |= 1;
+    CCInfo.AllocateReg(AMDGPU::VGPR0);
+    CCInfo.AllocateReg(AMDGPU::VGPR1);
+  }
+
+  AnalyzeFormalArguments(CCInfo, Splits);
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+
+    if (Skipped & (1 << i)) {
+      InVals.push_back(SDValue());
+      continue;
+    }
+
+    CCValAssign &VA = ArgLocs[ArgIdx++];
+    assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+    unsigned Reg = VA.getLocReg();
+    MVT VT = VA.getLocVT();
+
+    if (VT == MVT::i64) {
+      // For now assume it is a pointer
+      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
+                                     &AMDGPU::SReg_64RegClass);
+      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
+      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      continue;
+    }
+
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+    Reg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+    const ISD::InputArg &Arg = Ins[i];
+    if (Arg.VT.isVector()) {
+
+      // Build a vector from the registers
+      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      SmallVector<SDValue, 4> Regs;
+      Regs.push_back(Val);
+      for (unsigned j = 1; j != NumElements; ++j) {
+        Reg = ArgLocs[ArgIdx++].getLocReg();
+        Reg = MF.addLiveIn(Reg, RC);
+        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
+      }
+
+      // Fill up the missing vector elements
+      NumElements = Arg.VT.getVectorNumElements() - NumElements;
+      for (unsigned j = 0; j != NumElements; ++j)
+        Regs.push_back(DAG.getUNDEF(VT));
+ 
+      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
+                                   Regs.data(), Regs.size()));
+      continue;
+    }
+
+    InVals.push_back(Val);
+  }
+  return Chain;
+}
+
+MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr * MI, MachineBasicBlock * BB) const {
+
+  switch (MI->getOpcode()) {
+  default:
+    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case AMDGPU::BRANCH: return BB;
+  }
+  return BB;
+}
+
+EVT SITargetLowering::getSetCCResultType(EVT VT) const {
+  return MVT::i1;
+}
+
+MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
+  return MVT::i32;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  }
+  return SDValue();
+}
+
+/// \brief Helper function for LowerBRCOND
+static SDNode *findUser(SDValue Value, unsigned Opcode) {
+
+  SDNode *Parent = Value.getNode();
+  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
+       I != E; ++I) {
+
+    if (I.getUse().get() != Value)
+      continue;
+
+    if (I->getOpcode() == Opcode)
+      return *I;
+  }
+  return 0;
+}
+
+/// This transforms the control flow intrinsics to get the branch destination as
+/// last parameter, also switches branch target with BR if the need arise
+SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
+                                      SelectionDAG &DAG) const {
+
+  DebugLoc DL = BRCOND.getDebugLoc();
+
+  SDNode *Intr = BRCOND.getOperand(1).getNode();
+  SDValue Target = BRCOND.getOperand(2);
+  SDNode *BR = 0;
+
+  if (Intr->getOpcode() == ISD::SETCC) {
+    // As long as we negate the condition everything is fine
+    SDNode *SetCC = Intr;
+    assert(SetCC->getConstantOperandVal(1) == 1);
+    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+           ISD::SETNE);
+    Intr = SetCC->getOperand(0).getNode();
+
+  } else {
+    // Get the target from BR if we don't negate the condition
+    BR = findUser(BRCOND, ISD::BR);
+    Target = BR->getOperand(1);
+  }
+
+  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+
+  // Build the result and
+  SmallVector<EVT, 4> Res;
+  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
+    Res.push_back(Intr->getValueType(i));
+
+  // operands of the new intrinsic call
+  SmallVector<SDValue, 4> Ops;
+  Ops.push_back(BRCOND.getOperand(0));
+  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
+    Ops.push_back(Intr->getOperand(i));
+  Ops.push_back(Target);
+
+  // build the new intrinsic call
+  SDNode *Result = DAG.getNode(
+    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
+    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();
+
+  if (BR) {
+    // Give the branch instruction our target
+    SDValue Ops[] = {
+      BR->getOperand(0),
+      BRCOND.getOperand(2)
+    };
+    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
+  }
+
+  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
+
+  // Copy the intrinsic results to registers
+  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
+    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
+    if (!CopyToReg)
+      continue;
+
+    Chain = DAG.getCopyToReg(
+      Chain, DL,
+      CopyToReg->getOperand(1),
+      SDValue(Result, i - 1),
+      SDValue());
+
+    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
+  }
+
+  // Remove the old intrinsic from the chain
+  DAG.ReplaceAllUsesOfValueWith(
+    SDValue(Intr, Intr->getNumValues() - 1),
+    Intr->getOperand(0));
+
+  return Chain;
+}
+
+SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue True = Op.getOperand(2);
+  SDValue False = Op.getOperand(3);
+  SDValue CC = Op.getOperand(4);
+  EVT VT = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+
+  // Possible Min/Max pattern
+  SDValue MinMax = LowerMinMax(Op, DAG);
+  if (MinMax.getNode()) {
+    return MinMax;
+  }
+
+  SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
+  return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc DL = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  switch (N->getOpcode()) {
+    default: break;
+    case ISD::SELECT_CC: {
+      N->dump();
+      ConstantSDNode *True, *False;
+      // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
+      if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+          && (False = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+          && True->isAllOnesValue()
+          && False->isNullValue()
+          && VT == MVT::i1) {
+        return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0),
+                           N->getOperand(1), N->getOperand(4));
+
+      }
+      break;
+    }
+    case ISD::SETCC: {
+      SDValue Arg0 = N->getOperand(0);
+      SDValue Arg1 = N->getOperand(1);
+      SDValue CC = N->getOperand(2);
+      ConstantSDNode * C = NULL;
+      ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
+
+      // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
+      if (VT == MVT::i1
+          && Arg0.getOpcode() == ISD::SIGN_EXTEND
+          && Arg0.getOperand(0).getValueType() == MVT::i1
+          && (C = dyn_cast<ConstantSDNode>(Arg1))
+          && C->isNullValue()
+          && CCOp == ISD::SETNE) {
+        return SimplifySetCC(VT, Arg0.getOperand(0),
+                             DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
+      }
+      break;
+    }
+  }
+  return SDValue();
+}
+
+/// \brief Test if RegClass is one of the VSrc classes 
+static bool isVSrc(unsigned RegClass) {
+  return AMDGPU::VSrc_32RegClassID == RegClass ||
+         AMDGPU::VSrc_64RegClassID == RegClass;
+}
+
+/// \brief Test if RegClass is one of the SSrc classes 
+static bool isSSrc(unsigned RegClass) {
+  return AMDGPU::SSrc_32RegClassID == RegClass ||
+         AMDGPU::SSrc_64RegClassID == RegClass;
+}
+
+/// \brief Analyze the possible immediate value Op
+///
+/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate
+/// and the immediate value if it's a literal immediate
+int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
+
+  union {
+    int32_t I;
+    float F;
+  } Imm;
+
+  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N))
+    Imm.I = Node->getSExtValue();
+  else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
+    Imm.F = Node->getValueAPF().convertToFloat();
+  else
+    return -1; // It isn't an immediate
+
+  if ((Imm.I >= -16 && Imm.I <= 64) ||
+      Imm.F == 0.5f || Imm.F == -0.5f ||
+      Imm.F == 1.0f || Imm.F == -1.0f ||
+      Imm.F == 2.0f || Imm.F == -2.0f ||
+      Imm.F == 4.0f || Imm.F == -4.0f)
+    return 0; // It's an inline immediate
+
+  return Imm.I; // It's a literal immediate
+}
+
+/// \brief Try to fold an immediate directly into an instruction
+bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
+                               bool &ScalarSlotUsed) const {
+
+  MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
+  if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
+    return false;
+
+  const SDValue &Op = Mov->getOperand(0);
+  int32_t Value = analyzeImmediate(Op.getNode());
+  if (Value == -1) {
+    // Not an immediate at all
+    return false;
+
+  } else if (Value == 0) {
+    // Inline immediates can always be fold
+    Operand = Op;
+    return true;
+
+  } else if (Value == Immediate) {
+    // Already fold literal immediate
+    Operand = Op;
+    return true;
+
+  } else if (!ScalarSlotUsed && !Immediate) {
+    // Fold this literal immediate
+    ScalarSlotUsed = true;
+    Immediate = Value;
+    Operand = Op;
+    return true;
+
+  }
+
+  return false;
+}
+
+/// \brief Does "Op" fit into register class "RegClass" ?
+bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op,
+                                    unsigned RegClass) const {
+
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 
+  SDNode *Node = Op.getNode();
+
+  const TargetRegisterClass *OpClass;
+  if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) {
+    const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode());
+    int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
+    if (OpClassID == -1)
+      OpClass = getRegClassFor(Op.getSimpleValueType());
+    else
+      OpClass = TRI->getRegClass(OpClassID);
+
+  } else if (Node->getOpcode() == ISD::CopyFromReg) {
+    RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode());
+    OpClass = MRI.getRegClass(Reg->getReg());
+
+  } else
+    return false;
+
+  return TRI->getRegClass(RegClass)->hasSubClassEq(OpClass);
+}
+
+/// \brief Make sure that we don't exeed the number of allowed scalars
+void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
+                                       unsigned RegClass,
+                                       bool &ScalarSlotUsed) const {
+
+  // First map the operands register class to a destination class
+  if (RegClass == AMDGPU::VSrc_32RegClassID)
+    RegClass = AMDGPU::VReg_32RegClassID;
+  else if (RegClass == AMDGPU::VSrc_64RegClassID)
+    RegClass = AMDGPU::VReg_64RegClassID;
+  else
+    return;
+
+  // Nothing todo if they fit naturaly
+  if (fitsRegClass(DAG, Operand, RegClass))
+    return;
+
+  // If the scalar slot isn't used yet use it now
+  if (!ScalarSlotUsed) {
+    ScalarSlotUsed = true;
+    return;
+  }
+
+  // This is a conservative aproach, it is possible that we can't determine
+  // the correct register class and copy too often, but better save than sorry.
+  SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
+  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DebugLoc(),
+                                    Operand.getValueType(), Operand, RC);
+  Operand = SDValue(Node, 0);
+}
+
+SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
+                                          SelectionDAG &DAG) const {
+
+  // Original encoding (either e32 or e64)
+  int Opcode = Node->getMachineOpcode();
+  const MCInstrDesc *Desc = &TII->get(Opcode);
+
+  unsigned NumDefs = Desc->getNumDefs();
+  unsigned NumOps = Desc->getNumOperands();
+
+  // Commuted opcode if available
+  int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
+  const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev);
+
+  assert(!DescRev || DescRev->getNumDefs() == NumDefs);
+  assert(!DescRev || DescRev->getNumOperands() == NumOps);
+
+  // e64 version if available, -1 otherwise
+  int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
+  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);
+
+  assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
+  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));
+
+  int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
+  bool HaveVSrc = false, HaveSSrc = false;
+
+  // First figure out what we alread have in this instruction
+  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
+       i != e && Op < NumOps; ++i, ++Op) {
+
+    unsigned RegClass = Desc->OpInfo[Op].RegClass;
+    if (isVSrc(RegClass))
+      HaveVSrc = true;
+    else if (isSSrc(RegClass))
+      HaveSSrc = true;
+    else
+      continue;
+
+    int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
+    if (Imm != -1 && Imm != 0) {
+      // Literal immediate
+      Immediate = Imm;
+    }
+  }
+
+  // If we neither have VSrc nor SSrc it makes no sense to continue
+  if (!HaveVSrc && !HaveSSrc)
+    return Node;
+
+  // No scalar allowed when we have both VSrc and SSrc
+  bool ScalarSlotUsed = HaveVSrc && HaveSSrc;
+
+  // Second go over the operands and try to fold them
+  std::vector<SDValue> Ops;
+  bool Promote2e64 = false;
+  for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
+       i != e && Op < NumOps; ++i, ++Op) {
+
+    const SDValue &Operand = Node->getOperand(i);
+    Ops.push_back(Operand);
+
+    // Already folded immediate ?
+    if (isa<ConstantSDNode>(Operand.getNode()) ||
+        isa<ConstantFPSDNode>(Operand.getNode()))
+      continue;
+
+    // Is this a VSrc or SSrc operand ?
+    unsigned RegClass = Desc->OpInfo[Op].RegClass;
+    if (isVSrc(RegClass) || isSSrc(RegClass)) {
+      // Try to fold the immediates
+      if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
+        // Folding didn't worked, make sure we don't hit the SReg limit
+        ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
+      }
+      continue;
+    }
+
+    if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
+
+      unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
+      assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));
+
+      // Test if it makes sense to swap operands
+      if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
+          (!fitsRegClass(DAG, Ops[1], RegClass) &&
+           fitsRegClass(DAG, Ops[1], OtherRegClass))) {
+
+        // Swap commutable operands
+        SDValue Tmp = Ops[1];
+        Ops[1] = Ops[0];
+        Ops[0] = Tmp;
+
+        Desc = DescRev;
+        DescRev = 0;
+        continue;
+      }
+    }
+
+    if (DescE64 && !Immediate) {
+
+      // Test if it makes sense to switch to e64 encoding
+      unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
+      if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass))
+        continue;
+
+      int32_t TmpImm = -1;
+      if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) ||
+          (!fitsRegClass(DAG, Ops[i], RegClass) &&
+           fitsRegClass(DAG, Ops[1], OtherRegClass))) {
+
+        // Switch to e64 encoding
+        Immediate = -1;
+        Promote2e64 = true;
+        Desc = DescE64;
+        DescE64 = 0;
+      }
+    }
+  }
+
+  if (Promote2e64) {
+    // Add the modifier flags while promoting
+    for (unsigned i = 0; i < 4; ++i)
+      Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
+  }
+
+  // Add optional chain and glue
+  for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
+    Ops.push_back(Node->getOperand(i));
+
+  // Create a complete new instruction
+  return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(),
+                            Node->getVTList(), Ops.data(), Ops.size());
+}
diff --git a/contrib/llvm/lib/Target/R600/SIISelLowering.h b/contrib/llvm/lib/Target/R600/SIISelLowering.h
new file mode 100644
index 000000000000..5ad2f40f0f3a
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIISelLowering.h
@@ -0,0 +1,58 @@
+//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SIISELLOWERING_H
+#define SIISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+#include "SIInstrInfo.h"
+
+namespace llvm {
+
+class SITargetLowering : public AMDGPUTargetLowering {
+  const SIInstrInfo * TII;
+  const TargetRegisterInfo * TRI;
+
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+
+  bool foldImm(SDValue &Operand, int32_t &Immediate,
+               bool &ScalarSlotUsed) const;
+  bool fitsRegClass(SelectionDAG &DAG, SDValue &Op, unsigned RegClass) const;
+  void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, 
+                       unsigned RegClass, bool &ScalarSlotUsed) const;
+
+public:
+  SITargetLowering(TargetMachine &tm);
+
+  SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
+                               bool isVarArg,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               DebugLoc DL, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) const;
+
+  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
+                                              MachineBasicBlock * BB) const;
+  virtual EVT getSetCCResultType(EVT VT) const;
+  virtual MVT getScalarShiftAmountTy(EVT VT) const;
+  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
+
+  int32_t analyzeImmediate(const SDNode *N) const;
+};
+
+} // End namespace llvm
+
+#endif //SIISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp b/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp
new file mode 100644
index 000000000000..98bd3dbb6646
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInsertWaits.cpp
@@ -0,0 +1,358 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief One variable for each of the hardware counters
+typedef union {
+  struct {
+    unsigned VM;
+    unsigned EXP;
+    unsigned LGKM;
+  } Named;
+  unsigned Array[3];
+
+} Counters;
+
+typedef Counters RegCounters[512];
+typedef std::pair<unsigned, unsigned> RegInterval;
+
+class SIInsertWaits : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const SIInstrInfo *TII;
+  const SIRegisterInfo &TRI;
+  const MachineRegisterInfo *MRI;
+
+  /// \brief Constant hardware limits
+  static const Counters WaitCounts;
+
+  /// \brief Constant zero value
+  static const Counters ZeroCounts;
+
+  /// \brief Counter values we have already waited on.
+  Counters WaitedOn;
+
+  /// \brief Counter values for last instruction issued.
+  Counters LastIssued;
+
+  /// \brief Registers used by async instructions.
+  RegCounters UsedRegs;
+
+  /// \brief Registers defined by async instructions.
+  RegCounters DefinedRegs;
+
+  /// \brief Different export instruction types seen since last wait.
+  unsigned ExpInstrTypesSeen;
+
+  /// \brief Get increment/decrement amount for this instruction.
+  Counters getHwCounts(MachineInstr &MI);
+
+  /// \brief Is operand relevant for async execution?
+  bool isOpRelevant(MachineOperand &Op);
+
+  /// \brief Get register interval an operand affects.
+  RegInterval getRegInterval(MachineOperand &Op);
+
+  /// \brief Handle instructions async components
+  void pushInstruction(MachineInstr &MI);
+
+  /// \brief Insert the actual wait instruction
+  bool insertWait(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  const Counters &Counts);
+
+  /// \brief Do we need def2def checks?
+  bool unorderedDefines(MachineInstr &MI);
+
+  /// \brief Resolve all operand dependencies to counter requirements
+  Counters handleOperands(MachineInstr &MI);
+
+public:
+  SIInsertWaits(TargetMachine &tm) :
+    MachineFunctionPass(ID),
+    TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
+    TRI(TII->getRegisterInfo()) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "SI insert wait  instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SIInsertWaits::ID = 0;
+
+const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
+const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
+
+FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
+  return new SIInsertWaits(tm);
+}
+
+Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
+
+  uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
+  Counters Result;
+
+  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
+
+  // Only consider stores or EXP for EXP_CNT
+  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
+      (MI.getOpcode() == AMDGPU::EXP || MI.getDesc().mayStore()));
+
+  // LGKM may uses larger values
+  if (TSFlags & SIInstrFlags::LGKM_CNT) {
+
+    MachineOperand &Op = MI.getOperand(0);
+    assert(Op.isReg() && "First LGKM operand must be a register!");
+
+    unsigned Reg = Op.getReg();
+    unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+    Result.Named.LGKM = Size > 4 ? 2 : 1;
+
+  } else {
+    Result.Named.LGKM = 0;
+  }
+
+  return Result;
+}
+
+bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
+
+  // Constants are always irrelevant
+  if (!Op.isReg())
+    return false;
+
+  // Defines are always relevant
+  if (Op.isDef())
+    return true;
+
+  // For exports all registers are relevant
+  MachineInstr &MI = *Op.getParent();
+  if (MI.getOpcode() == AMDGPU::EXP)
+    return true;
+
+  // For stores the stored value is also relevant
+  if (!MI.getDesc().mayStore())
+    return false;
+
+  for (MachineInstr::mop_iterator I = MI.operands_begin(),
+       E = MI.operands_end(); I != E; ++I) {
+
+    if (I->isReg() && I->isUse())
+      return Op.isIdenticalTo(*I);
+  }
+
+  return false;
+}
+
+RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
+
+  if (!Op.isReg())
+    return std::make_pair(0, 0);
+
+  unsigned Reg = Op.getReg();
+  unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+
+  assert(Size >= 4);
+
+  RegInterval Result;
+  Result.first = TRI.getEncodingValue(Reg);
+  Result.second = Result.first + Size / 4;
+
+  return Result;
+}
+
+void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+
+  // Get the hardware counter increments and sum them up
+  Counters Increment = getHwCounts(MI);
+  unsigned Sum = 0;
+
+  for (unsigned i = 0; i < 3; ++i) {
+    LastIssued.Array[i] += Increment.Array[i];
+    Sum += Increment.Array[i];
+  }
+
+  // If we don't increase anything then that's it
+  if (Sum == 0)
+    return;
+
+  // Remember which export instructions we have seen
+  if (Increment.Named.EXP) {
+    ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
+  }
+
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+
+    MachineOperand &Op = MI.getOperand(i);
+    if (!isOpRelevant(Op))
+      continue;
+
+    RegInterval Interval = getRegInterval(Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      // Remember which registers we define
+      if (Op.isDef())
+        DefinedRegs[j] = LastIssued;
+
+      // and which one we are using
+      if (Op.isUse())
+        UsedRegs[j] = LastIssued;
+    }
+  }
+}
+
+bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const Counters &Required) {
+
+  // End of program? No need to wait on anything
+  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
+    return false;
+
+  // Figure out if the async instructions execute in order
+  bool Ordered[3];
+
+  // VM_CNT is always ordered
+  Ordered[0] = true;
+
+  // EXP_CNT is unordered if we have both EXP & VM-writes
+  Ordered[1] = ExpInstrTypesSeen == 3;
+
+  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
+  Ordered[2] = false;
+
+  // The values we are going to put into the S_WAITCNT instruction
+  Counters Counts = WaitCounts;
+
+  // Do we really need to wait?
+  bool NeedWait = false;
+
+  for (unsigned i = 0; i < 3; ++i) {
+
+    if (Required.Array[i] <= WaitedOn.Array[i])
+      continue;
+
+    NeedWait = true;
+    
+    if (Ordered[i]) {
+      unsigned Value = LastIssued.Array[i] - Required.Array[i];
+
+      // adjust the value to the real hardware posibilities
+      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
+
+    } else
+      Counts.Array[i] = 0;
+
+    // Remember on what we have waited on
+    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+  }
+
+  if (!NeedWait)
+    return false;
+
+  // Reset EXP_CNT instruction types
+  if (Counts.Named.EXP == 0)
+    ExpInstrTypesSeen = 0;
+
+  // Build the wait instruction
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+          .addImm((Counts.Named.VM & 0xF) |
+                  ((Counts.Named.EXP & 0x7) << 4) |
+                  ((Counts.Named.LGKM & 0x7) << 8));
+
+  return true;
+}
+
+/// \brief helper function for handleOperands
+static void increaseCounters(Counters &Dst, const Counters &Src) {
+
+  for (unsigned i = 0; i < 3; ++i)
+    Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
+}
+
+Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
+
+  Counters Result = ZeroCounts;
+
+  // For each register affected by this
+  // instruction increase the result sequence
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+
+    MachineOperand &Op = MI.getOperand(i);
+    RegInterval Interval = getRegInterval(Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      if (Op.isDef()) {
+        increaseCounters(Result, UsedRegs[j]);
+        increaseCounters(Result, DefinedRegs[j]);
+      }
+
+      if (Op.isUse())
+        increaseCounters(Result, DefinedRegs[j]);
+    }
+  }
+
+  return Result;
+}
+
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
+
+  bool Changes = false;
+
+  MRI = &MF.getRegInfo();
+
+  WaitedOn = ZeroCounts;
+  LastIssued = ZeroCounts;
+
+  memset(&UsedRegs, 0, sizeof(UsedRegs));
+  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+
+      Changes |= insertWait(MBB, I, handleOperands(*I));
+      pushInstruction(*I);
+    }
+
+    // Wait for everything at the end of the MBB
+    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+  }
+
+  return Changes;
+}
diff --git a/contrib/llvm/lib/Target/R600/SIInstrFormats.td b/contrib/llvm/lib/Target/R600/SIInstrFormats.td
new file mode 100644
index 000000000000..3891ddb2dbe2
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstrFormats.td
@@ -0,0 +1,426 @@
+//===-- SIInstrFormats.td - SI Instruction Encodings ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SI Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
+    AMDGPUInst<outs, ins, asm, pattern> {
+
+  field bits<1> VM_CNT = 0;
+  field bits<1> EXP_CNT = 0;
+  field bits<1> LGKM_CNT = 0;
+
+  let TSFlags{0} = VM_CNT;
+  let TSFlags{1} = EXP_CNT;
+  let TSFlags{2} = LGKM_CNT;
+}
+
+class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+
+  field bits<32> Inst;
+  let Size = 4;
+}
+
+class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+
+  field bits<64> Inst;
+  let Size = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Scalar operations
+//===----------------------------------------------------------------------===//
+
+class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32<outs, ins, asm, pattern> {
+
+  bits<7> SDST;
+  bits<8> SSRC0;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = op;
+  let Inst{22-16} = SDST;
+  let Inst{31-23} = 0x17d; //encoding;
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+  
+  bits<7> SDST;
+  bits<8> SSRC0;
+  bits<8> SSRC1;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = SSRC1;
+  let Inst{22-16} = SDST;
+  let Inst{29-23} = op;
+  let Inst{31-30} = 0x2; // encoding
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+  Enc32<outs, ins, asm, pattern> {
+
+  bits<8> SSRC0;
+  bits<8> SSRC1;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = SSRC1;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17e;
+
+  let DisableEncoding = "$dst";
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
+   Enc32 <outs, ins , asm, pattern> {
+
+  bits <7> SDST;
+  bits <16> SIMM16;
+  
+  let Inst{15-0} = SIMM16;
+  let Inst{22-16} = SDST;
+  let Inst{27-23} = op;
+  let Inst{31-28} = 0xb; //encoding
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
+  (outs),
+  ins,
+  asm,
+  pattern > {
+
+  bits <16> SIMM16;
+
+  let Inst{15-0} = SIMM16;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17f; // encoding
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
+            list<dag> pattern> : Enc32<outs, ins, asm, pattern> {
+
+  bits<7> SDST;
+  bits<7> SBASE;
+  bits<8> OFFSET;
+  
+  let Inst{7-0} = OFFSET;
+  let Inst{8} = imm;
+  let Inst{14-9} = SBASE{6-1};
+  let Inst{21-15} = SDST;
+  let Inst{26-22} = op;
+  let Inst{31-27} = 0x18; //encoding
+
+  let LGKM_CNT = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector ALU operations
+//===----------------------------------------------------------------------===//
+    
+let Uses = [EXEC] in {
+
+class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = op;
+  let Inst{24-17} = VDST;
+  let Inst{31-25} = 0x3f; //encoding
+  
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  bits<8> VSRC1;
+  
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = VSRC1;
+  let Inst{24-17} = VDST;
+  let Inst{30-25} = op;
+  let Inst{31} = 0x0; //encoding
+  
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  bits<9> SRC1;
+  bits<9> SRC2;
+  bits<3> ABS; 
+  bits<1> CLAMP;
+  bits<2> OMOD;
+  bits<3> NEG;
+
+  let Inst{7-0} = VDST;
+  let Inst{10-8} = ABS;
+  let Inst{11} = CLAMP;
+  let Inst{25-17} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = SRC0;
+  let Inst{49-41} = SRC1;
+  let Inst{58-50} = SRC2;
+  let Inst{60-59} = OMOD;
+  let Inst{63-61} = NEG;
+  
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  bits<9> SRC1;
+  bits<9> SRC2;
+  bits<7> SDST;
+  bits<2> OMOD;
+  bits<3> NEG;
+
+  let Inst{7-0} = VDST;
+  let Inst{14-8} = SDST;
+  let Inst{25-17} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = SRC0;
+  let Inst{49-41} = SRC1;
+  let Inst{58-50} = SRC2;
+  let Inst{60-59} = OMOD;
+  let Inst{63-61} = NEG;
+
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
+    Enc32 <(outs VCCReg:$dst), ins, asm, pattern> {
+
+  bits<9> SRC0;
+  bits<8> VSRC1;
+
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = VSRC1;
+  let Inst{24-17} = op;
+  let Inst{31-25} = 0x3e;
+ 
+  let DisableEncoding = "$dst";
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
+}
+
+class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<8> VSRC;
+  bits<2> ATTRCHAN;
+  bits<6> ATTR;
+
+  let Inst{7-0} = VSRC;
+  let Inst{9-8} = ATTRCHAN;
+  let Inst{15-10} = ATTR;
+  let Inst{17-16} = op;
+  let Inst{25-18} = VDST;
+  let Inst{31-26} = 0x32; // encoding
+
+  let neverHasSideEffects = 1;
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+} // End Uses = [EXEC]
+
+//===----------------------------------------------------------------------===//
+// Vector I/O operations
+//===----------------------------------------------------------------------===//
+
+let Uses = [EXEC] in {
+
+class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64<outs, ins, asm, pattern> {
+
+  bits<8> VDATA;
+  bits<12> OFFSET;
+  bits<1> OFFEN;
+  bits<1> IDXEN;
+  bits<1> GLC;
+  bits<1> ADDR64;
+  bits<1> LDS;
+  bits<8> VADDR;
+  bits<7> SRSRC;
+  bits<1> SLC;
+  bits<1> TFE;
+  bits<8> SOFFSET;
+
+  let Inst{11-0} = OFFSET;
+  let Inst{12} = OFFEN;
+  let Inst{13} = IDXEN;
+  let Inst{14} = GLC;
+  let Inst{15} = ADDR64;
+  let Inst{16} = LDS;
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x38; //encoding
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC{6-2};
+  let Inst{54} = SLC;
+  let Inst{55} = TFE;
+  let Inst{63-56} = SOFFSET;
+
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+
+  let neverHasSideEffects = 1;
+}
+
+class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64<outs, ins, asm, pattern> {
+
+  bits<8> VDATA;
+  bits<12> OFFSET;
+  bits<1> OFFEN;
+  bits<1> IDXEN;
+  bits<1> GLC;
+  bits<1> ADDR64;
+  bits<4> DFMT;
+  bits<3> NFMT;
+  bits<8> VADDR;
+  bits<7> SRSRC;
+  bits<1> SLC;
+  bits<1> TFE;
+  bits<8> SOFFSET;
+
+  let Inst{11-0} = OFFSET;
+  let Inst{12} = OFFEN;
+  let Inst{13} = IDXEN;
+  let Inst{14} = GLC;
+  let Inst{15} = ADDR64;
+  let Inst{18-16} = op;
+  let Inst{22-19} = DFMT;
+  let Inst{25-23} = NFMT;
+  let Inst{31-26} = 0x3a; //encoding
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC{6-2};
+  let Inst{54} = SLC;
+  let Inst{55} = TFE;
+  let Inst{63-56} = SOFFSET;
+
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+
+  let neverHasSideEffects = 1;
+}
+
+class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  bits<8> VDATA;
+  bits<4> DMASK;
+  bits<1> UNORM;
+  bits<1> GLC;
+  bits<1> DA;
+  bits<1> R128;
+  bits<1> TFE;
+  bits<1> LWE;
+  bits<1> SLC;
+  bits<8> VADDR;
+  bits<7> SRSRC;
+  bits<7> SSAMP; 
+
+  let Inst{11-8} = DMASK;
+  let Inst{12} = UNORM;
+  let Inst{13} = GLC;
+  let Inst{14} = DA;
+  let Inst{15} = R128;
+  let Inst{16} = TFE;
+  let Inst{17} = LWE;
+  let Inst{24-18} = op;
+  let Inst{25} = SLC;
+  let Inst{31-26} = 0x3c;
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC{6-2};
+  let Inst{57-53} = SSAMP{6-2};
+
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+}
+
+def EXP : Enc64<
+  (outs),
+  (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
+       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+  "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
+  [] > {
+
+  bits<4> EN;
+  bits<6> TGT;
+  bits<1> COMPR;
+  bits<1> DONE;
+  bits<1> VM;
+  bits<8> VSRC0;
+  bits<8> VSRC1;
+  bits<8> VSRC2;
+  bits<8> VSRC3;
+
+  let Inst{3-0} = EN;
+  let Inst{9-4} = TGT;
+  let Inst{10} = COMPR;
+  let Inst{11} = DONE;
+  let Inst{12} = VM;
+  let Inst{31-26} = 0x3e;
+  let Inst{39-32} = VSRC0;
+  let Inst{47-40} = VSRC1;
+  let Inst{55-48} = VSRC2;
+  let Inst{63-56} = VSRC3;
+
+  let EXP_CNT = 1;
+}
+
+} // End Uses = [EXEC]
diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp b/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp
new file mode 100644
index 000000000000..0bfcef562f04
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.cpp
@@ -0,0 +1,264 @@
+//===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Implementation of TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "SIInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include <stdio.h>
+
+using namespace llvm;
+
+SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
+  : AMDGPUInstrInfo(tm),
+    RI(tm, *this)
+    { }
+
+const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+void
+SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator MI, DebugLoc DL,
+                         unsigned DestReg, unsigned SrcReg,
+                         bool KillSrc) const {
+
+  // If we are trying to copy to or from SCC, there is a bug somewhere else in
+  // the backend.  While it may be theoretically possible to do this, it should
+  // never be necessary.
+  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
+
+  const int16_t Sub0_15[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
+  };
+
+  const int16_t Sub0_7[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
+  };
+
+  const int16_t Sub0_3[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
+  };
+
+  const int16_t Sub0_1[] = {
+    AMDGPU::sub0, AMDGPU::sub1, 0
+  };
+
+  unsigned Opcode;
+  const int16_t *SubIndices;
+
+  if (AMDGPU::M0 == DestReg) {
+    // Check if M0 isn't already set to this value
+    for (MachineBasicBlock::reverse_iterator E = MBB.rend(),
+      I = MachineBasicBlock::reverse_iterator(MI); I != E; ++I) {
+
+      if (!I->definesRegister(AMDGPU::M0))
+        continue;
+
+      unsigned Opc = I->getOpcode();
+      if (Opc != TargetOpcode::COPY && Opc != AMDGPU::S_MOV_B32)
+        break;
+
+      if (!I->readsRegister(SrcReg))
+        break;
+
+      // The copy isn't necessary
+      return;
+    }
+  }
+
+  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+
+  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+
+  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
+    Opcode = AMDGPU::S_MOV_B32;
+    SubIndices = Sub0_3;
+
+  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
+    Opcode = AMDGPU::S_MOV_B32;
+    SubIndices = Sub0_7;
+
+  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
+    Opcode = AMDGPU::S_MOV_B32;
+    SubIndices = Sub0_15;
+
+  } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
+	   AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+
+  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
+	   AMDGPU::SReg_64RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_1;
+
+  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
+	   AMDGPU::SReg_128RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_3;
+
+  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
+	   AMDGPU::SReg_256RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_7;
+
+  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
+	   AMDGPU::SReg_512RegClass.contains(SrcReg));
+    Opcode = AMDGPU::V_MOV_B32_e32;
+    SubIndices = Sub0_15;
+
+  } else {
+    llvm_unreachable("Can't copy register!");
+  }
+
+  while (unsigned SubIdx = *SubIndices++) {
+    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+      get(Opcode), RI.getSubReg(DestReg, SubIdx));
+
+    Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc));
+
+    if (*SubIndices)
+      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
+  }
+}
+
+unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
+
+  int NewOpc;
+
+  // Try to map original to commuted opcode
+  if ((NewOpc = AMDGPU::getCommuteRev(Opcode)) != -1)
+    return NewOpc;
+
+  // Try to map commuted to original opcode
+  if ((NewOpc = AMDGPU::getCommuteOrig(Opcode)) != -1)
+    return NewOpc;
+
+  return Opcode;
+}
+
+MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
+                                              bool NewMI) const {
+
+  if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg() ||
+      !MI->getOperand(2).isReg())
+    return 0;
+
+  MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+
+  if (MI)
+    MI->setDesc(get(commuteOpcode(MI->getOpcode())));
+
+  return MI;
+}
+
+MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg,
+                                           int64_t Imm) const {
+  MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_B32_e32), DebugLoc());
+  MachineInstrBuilder MIB(*MF, MI);
+  MIB.addReg(DstReg, RegState::Define);
+  MIB.addImm(Imm);
+
+  return MI;
+
+}
+
+bool SIInstrInfo::isMov(unsigned Opcode) const {
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+    return true;
+  }
+}
+
+bool
+SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  return RC != &AMDGPU::EXECRegRegClass;
+}
+
+//===----------------------------------------------------------------------===//
+// Indirect addressing callbacks
+//===----------------------------------------------------------------------===//
+
+unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
+                                                 unsigned Channel) const {
+  assert(Channel == 0);
+  return RegIndex;
+}
+
+
+int SIInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+  llvm_unreachable("Unimplemented");
+}
+
+int SIInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+  llvm_unreachable("Unimplemented");
+}
+
+const TargetRegisterClass *SIInstrInfo::getIndirectAddrStoreRegClass(
+                                                     unsigned SourceReg) const {
+  llvm_unreachable("Unimplemented");
+}
+
+const TargetRegisterClass *SIInstrInfo::getIndirectAddrLoadRegClass() const {
+  llvm_unreachable("Unimplemented");
+}
+
+MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
+                                   MachineBasicBlock *MBB,
+                                   MachineBasicBlock::iterator I,
+                                   unsigned ValueReg,
+                                   unsigned Address, unsigned OffsetReg) const {
+  llvm_unreachable("Unimplemented");
+}
+
+MachineInstrBuilder SIInstrInfo::buildIndirectRead(
+                                   MachineBasicBlock *MBB,
+                                   MachineBasicBlock::iterator I,
+                                   unsigned ValueReg,
+                                   unsigned Address, unsigned OffsetReg) const {
+  llvm_unreachable("Unimplemented");
+}
+
+const TargetRegisterClass *SIInstrInfo::getSuperIndirectRegClass() const {
+  llvm_unreachable("Unimplemented");
+}
diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.h b/contrib/llvm/lib/Target/R600/SIInstrInfo.h
new file mode 100644
index 000000000000..d4e60e508634
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.h
@@ -0,0 +1,97 @@
+//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIINSTRINFO_H
+#define SIINSTRINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "SIRegisterInfo.h"
+
+namespace llvm {
+
+class SIInstrInfo : public AMDGPUInstrInfo {
+private:
+  const SIRegisterInfo RI;
+
+public:
+  explicit SIInstrInfo(AMDGPUTargetMachine &tm);
+
+  const SIRegisterInfo &getRegisterInfo() const;
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  unsigned commuteOpcode(unsigned Opcode) const;
+
+  virtual MachineInstr *commuteInstruction(MachineInstr *MI,
+                                           bool NewMI=false) const;
+
+  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
+                                        int64_t Imm) const;
+
+  virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;}
+  virtual bool isMov(unsigned Opcode) const;
+
+  virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+
+  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
+
+  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+
+  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
+                                            unsigned Channel) const;
+
+  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
+                                                      unsigned SourceReg) const;
+
+  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
+
+  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                                 MachineBasicBlock::iterator I,
+                                                 unsigned ValueReg,
+                                                 unsigned Address,
+                                                 unsigned OffsetReg) const;
+
+  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                                MachineBasicBlock::iterator I,
+                                                unsigned ValueReg,
+                                                unsigned Address,
+                                                unsigned OffsetReg) const;
+
+  virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
+  };
+
+namespace AMDGPU {
+
+  int getVOPe64(uint16_t Opcode);
+  int getCommuteRev(uint16_t Opcode);
+  int getCommuteOrig(uint16_t Opcode);
+
+} // End namespace AMDGPU
+
+} // End namespace llvm
+
+namespace SIInstrFlags {
+  enum Flags {
+    // First 4 bits are the instruction encoding
+    VM_CNT = 1 << 0,
+    EXP_CNT = 1 << 1,
+    LGKM_CNT = 1 << 2
+  };
+}
+
+#endif //SIINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/R600/SIInstrInfo.td b/contrib/llvm/lib/Target/R600/SIInstrInfo.td
new file mode 100644
index 000000000000..617f0b871c25
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstrInfo.td
@@ -0,0 +1,356 @@
+//===-- SIInstrInfo.td - SI Instruction Infos -------------*- tablegen -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SI DAG Nodes
+//===----------------------------------------------------------------------===//
+
+// SMRD takes a 64bit memory address and can only add an 32bit offset
+def SIadd64bit32bit : SDNode<"ISD::ADD",
+  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]>
+>;
+
+// Transformation function, extract the lower 32bit of a 64bit immediate
+def LO32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
+}]>;
+
+// Transformation function, extract the upper 32bit of a 64bit immediate
+def HI32 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32);
+}]>;
+
+def IMM8bitDWORD : ImmLeaf <
+  i32, [{
+    return (Imm & ~0x3FC) == 0;
+  }], SDNodeXForm<imm, [{
+    return CurDAG->getTargetConstant(
+      N->getZExtValue() >> 2, MVT::i32);
+  }]>
+>;
+
+def IMM12bit : ImmLeaf <
+  i16,
+  [{return isUInt<12>(Imm);}]
+>;
+
+class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
+  return ((const SITargetLowering &)TLI).analyzeImmediate(N) == 0;
+}]>;
+
+//===----------------------------------------------------------------------===//
+// SI assembler operands
+//===----------------------------------------------------------------------===//
+
+def SIOperand {
+  int ZERO = 0x80;
+  int VCC = 0x6A;
+}
+
+include "SIInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+//
+// SI Instruction multiclass helpers.
+//
+// Instructions with _32 take 32-bit operands.
+// Instructions with _64 take 64-bit operands.
+//
+// VOP_* instructions can use either a 32-bit or 64-bit encoding.  The 32-bit
+// encoding is the standard encoding, but instruction that make use of
+// any of the instruction modifiers must use the 64-bit encoding.
+//
+// Instructions with _e32 use the 32-bit encoding.
+// Instructions with _e64 use the 64-bit encoding.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Scalar classes
+//===----------------------------------------------------------------------===//
+
+class SOP1_32 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
+  op, (outs SReg_32:$dst), (ins SSrc_32:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
+class SOP1_64 <bits<8> op, string opName, list<dag> pattern> : SOP1 <
+  op, (outs SReg_64:$dst), (ins SSrc_64:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
+class SOP2_32 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
+  op, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
+class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
+  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
+class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC <
+  op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
+class SOPC_64 <bits<7> op, string opName, list<dag> pattern> : SOPC <
+  op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
+class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK <
+  op, (outs SReg_32:$dst), (ins i16imm:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
+class SOPK_64 <bits<5> op, string opName, list<dag> pattern> : SOPK <
+  op, (outs SReg_64:$dst), (ins i16imm:$src0),
+  opName#" $dst, $src0", pattern
+>;
+
+multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
+                        RegisterClass dstClass> {
+  def _IMM : SMRD <
+    op, 1, (outs dstClass:$dst),
+    (ins baseClass:$sbase, i32imm:$offset),
+    asm#" $dst, $sbase, $offset", []
+  >;
+
+  def _SGPR : SMRD <
+    op, 0, (outs dstClass:$dst),
+    (ins baseClass:$sbase, SReg_32:$soff),
+    asm#" $dst, $sbase, $soff", []
+  >;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector ALU classes
+//===----------------------------------------------------------------------===//
+
+class VOP <string opName> {
+  string OpName = opName;
+}
+
+class VOP2_REV <string revOp, bit isOrig> {
+  string RevOp = revOp;
+  bit IsOrig = isOrig;
+}
+
+multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
+                        string opName, list<dag> pattern> {
+
+  def _e32 : VOP1 <
+    op, (outs drc:$dst), (ins src:$src0),
+    opName#"_e32 $dst, $src0", pattern
+  >, VOP <opName>;
+
+  def _e64 : VOP3 <
+    {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    (outs drc:$dst),
+    (ins src:$src0,
+         i32imm:$abs, i32imm:$clamp,
+         i32imm:$omod, i32imm:$neg),
+    opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", []
+  >, VOP <opName> {
+    let SRC1 = SIOperand.ZERO;
+    let SRC2 = SIOperand.ZERO;
+  }
+}
+
+multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_32, VSrc_32, opName, pattern>;
+
+multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>;
+
+multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
+                        string opName, list<dag> pattern, string revOp> {
+  def _e32 : VOP2 <
+    op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1),
+    opName#"_e32 $dst, $src0, $src1", pattern
+  >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+  def _e64 : VOP3 <
+    {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    (outs vrc:$dst),
+    (ins arc:$src0, arc:$src1,
+         i32imm:$abs, i32imm:$clamp,
+         i32imm:$omod, i32imm:$neg),
+    opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
+  >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
+    let SRC2 = SIOperand.ZERO;
+  }
+}
+
+multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern,
+                    string revOp = opName>
+  : VOP2_Helper <op, VReg_32, VSrc_32, opName, pattern, revOp>;
+
+multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern,
+                    string revOp = opName>
+  : VOP2_Helper <op, VReg_64, VSrc_64, opName, pattern, revOp>;
+
+multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
+                     string revOp = opName> {
+
+  def _e32 : VOP2 <
+    op, (outs VReg_32:$dst), (ins VSrc_32:$src0, VReg_32:$src1),
+    opName#"_e32 $dst, $src0, $src1", pattern
+  >, VOP <opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>;
+
+  def _e64 : VOP3b <
+    {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    (outs VReg_32:$dst),
+    (ins VSrc_32:$src0, VSrc_32:$src1,
+         i32imm:$abs, i32imm:$clamp,
+         i32imm:$omod, i32imm:$neg),
+    opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
+  >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
+    let SRC2 = SIOperand.ZERO;
+    /* the VOP2 variant puts the carry out into VCC, the VOP3 variant
+       can write it into any SGPR. We currently don't use the carry out,
+       so for now hardcode it to VCC as well */
+    let SDST = SIOperand.VCC;
+  }
+}
+
+multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
+                        string opName, ValueType vt, PatLeaf cond> {
+
+  def _e32 : VOPC <
+    op, (ins arc:$src0, vrc:$src1),
+    opName#"_e32 $dst, $src0, $src1", []
+  >, VOP <opName>;
+
+  def _e64 : VOP3 <
+    {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    (outs SReg_64:$dst),
+    (ins arc:$src0, arc:$src1,
+         InstFlag:$abs, InstFlag:$clamp,
+         InstFlag:$omod, InstFlag:$neg),
+    opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg",
+    !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>,
+      [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
+    )
+  >, VOP <opName> {
+    let SRC2 = SIOperand.ZERO;
+  }
+}
+
+multiclass VOPC_32 <bits<8> op, string opName,
+  ValueType vt = untyped, PatLeaf cond = COND_NULL>
+  : VOPC_Helper <op, VReg_32, VSrc_32, opName, vt, cond>;
+
+multiclass VOPC_64 <bits<8> op, string opName,
+  ValueType vt = untyped, PatLeaf cond = COND_NULL>
+  : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>;
+
+class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
+  op, (outs VReg_32:$dst),
+  (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
+   i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg),
+  opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
+>, VOP <opName>;
+
+class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
+  op, (outs VReg_64:$dst),
+  (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
+   i32imm:$abs, i32imm:$clamp, i32imm:$omod, i32imm:$neg),
+  opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
+>, VOP <opName>;
+
+//===----------------------------------------------------------------------===//
+// Vector I/O classes
+//===----------------------------------------------------------------------===//
+
+class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
+  op,
+  (outs),
+  (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
+   i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
+   SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+  asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
+     #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 0;
+}
+
+class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
+  op,
+  (outs regClass:$dst),
+  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+       i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc,
+       i1imm:$tfe, SSrc_32:$soffset),
+  asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, "
+     #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
+  op,
+  (outs regClass:$dst),
+  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
+       i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
+  asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
+     #" $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
+  op,
+  (outs VReg_128:$vdata),
+  (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+       SReg_256:$srsrc, SReg_128:$ssamp),
+  asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
+     #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Vector instruction mappings
+//===----------------------------------------------------------------------===//
+
+// Maps an opcode in e32 form to its e64 equivalent
+def getVOPe64 : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["Size"];
+  let KeyCol = ["4"];
+  let ValueCols = [["8"]];
+}
+
+// Maps an original opcode to its commuted version
+def getCommuteRev : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["1"];
+  let ValueCols = [["0"]];
+}
+
+// Maps an commuted opcode to its original version
+def getCommuteOrig : InstrMapping {
+  let FilterClass = "VOP2_REV";
+  let RowFields = ["RevOp"];
+  let ColFields = ["IsOrig"];
+  let KeyCol = ["0"];
+  let ValueCols = [["1"]];
+}
+
+include "SIInstructions.td"
diff --git a/contrib/llvm/lib/Target/R600/SIInstructions.td b/contrib/llvm/lib/Target/R600/SIInstructions.td
new file mode 100644
index 000000000000..4f734f91245a
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIInstructions.td
@@ -0,0 +1,1607 @@
+//===-- SIInstructions.td - SI Instruction Defintions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file was originally auto-generated from a GPU register header file and
+// all the instruction definitions were originally commented out.  Instructions
+// that are not yet supported remain commented out.
+//===----------------------------------------------------------------------===//
+
+class InterpSlots {
+int P0 = 2;
+int P10 = 0;
+int P20 = 1;
+}
+def INTERP : InterpSlots;
+
+def InterpSlot : Operand<i32> {
+  let PrintMethod = "printInterpSlot";
+}
+
+def isSI : Predicate<"Subtarget.device()"
+                            "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">;
+
+let Predicates = [isSI] in {
+
+let neverHasSideEffects = 1 in {
+
+let isMoveImm = 1 in {
+def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>;
+def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>;
+def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
+def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
+} // End isMoveImm = 1
+
+def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>;
+def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
+def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
+def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
+def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>;
+def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
+} // End neverHasSideEffects = 1
+
+////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
+////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
+////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
+////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
+////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
+////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
+////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
+////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
+//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>;
+//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
+def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
+//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
+//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>;
+//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>;
+////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
+////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
+////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
+////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
+def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>;
+def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
+def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
+def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
+
+let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in {
+
+def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>;
+def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>;
+def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>;
+def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>;
+def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>;
+def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>;
+def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>;
+def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>;
+
+} // End hasSideEffects = 1
+
+def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>;
+def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>;
+def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>;
+def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>;
+def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>;
+def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
+//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>;
+def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
+def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
+def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
+def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
+def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
+
+/*
+This instruction is disabled for now until we can figure out how to teach
+the instruction selector to correctly use the  S_CMP* vs V_CMP*
+instructions.
+
+When this instruction is enabled the code generator sometimes produces this
+invalid sequence:
+
+SCC = S_CMPK_EQ_I32 SGPR0, imm
+VCC = COPY SCC
+VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1
+
+def S_CMPK_EQ_I32 : SOPK <
+  0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1),
+  "S_CMPK_EQ_I32",
+  [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))]
+>;
+*/
+
+let isCompare = 1 in {
+def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
+def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
+def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
+def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>;
+def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>;
+def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
+def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
+def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
+def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
+def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
+def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
+} // End isCompare = 1
+
+def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
+def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
+def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
+def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
+def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
+//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
+//def EXP : EXP_ <0x00000000, "EXP", []>;
+
+let isCompare = 1 in {
+
+defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">;
+defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_LT>;
+defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_EQ>;
+defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_LE>;
+defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_GT>;
+defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", f32, COND_NE>;
+defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_GE>;
+defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32">;
+defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32">;
+defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">;
+defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">;
+defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">;
+defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">;
+defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_NE>;
+defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
+defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+
+defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32">;
+defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32">;
+defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32">;
+defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32">;
+defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32">;
+defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32">;
+defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32">;
+defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32">;
+defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32">;
+defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32">;
+defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32">;
+defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32">;
+defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32">;
+defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32">;
+defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32">;
+defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">;
+
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
+defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64">;
+defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64">;
+defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64">;
+defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64">;
+defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">;
+defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64">;
+defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64">;
+defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64">;
+defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">;
+defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">;
+defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">;
+defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">;
+defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64">;
+defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">;
+defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+
+defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64">;
+defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64">;
+defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64">;
+defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64">;
+defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64">;
+defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64">;
+defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64">;
+defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64">;
+defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64">;
+defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64">;
+defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64">;
+defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64">;
+defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64">;
+defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64">;
+defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64">;
+defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64">;
+
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32">;
+defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32">;
+defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32">;
+defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32">;
+defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32">;
+defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32">;
+defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32">;
+defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32">;
+defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32">;
+defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32">;
+defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32">;
+defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32">;
+defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32">;
+defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32">;
+defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32">;
+defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+
+defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32">;
+defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32">;
+defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32">;
+defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32">;
+defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32">;
+defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32">;
+defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32">;
+defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32">;
+defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32">;
+defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32">;
+defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32">;
+defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32">;
+defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32">;
+defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32">;
+defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32">;
+defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32">;
+
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64">;
+defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64">;
+defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64">;
+defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64">;
+defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64">;
+defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64">;
+defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64">;
+defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64">;
+defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64">;
+defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64">;
+defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64">;
+defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64">;
+defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64">;
+defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64">;
+defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64">;
+defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+
+defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64">;
+defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64">;
+defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64">;
+defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64">;
+defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64">;
+defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64">;
+defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64">;
+defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64">;
+defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64">;
+defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64">;
+defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64">;
+defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64">;
+defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64">;
+defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64">;
+defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64">;
+defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">;
+
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">;
+defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_LT>;
+defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>;
+defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_LE>;
+defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_GT>;
+defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
+defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_GE>;
+defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+
+defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32">;
+defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32">;
+defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32">;
+defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32">;
+defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32">;
+defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32">;
+defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32">;
+defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">;
+
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
+defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64">;
+defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64">;
+defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64">;
+defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64">;
+defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64">;
+defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64">;
+defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+
+defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64">;
+defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64">;
+defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64">;
+defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64">;
+defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64">;
+defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64">;
+defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64">;
+defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">;
+
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
+defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32">;
+defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32">;
+defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32">;
+defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32">;
+defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32">;
+defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32">;
+defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+
+defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32">;
+defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32">;
+defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32">;
+defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32">;
+defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32">;
+defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32">;
+defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32">;
+defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">;
+
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
+defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64">;
+defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64">;
+defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64">;
+defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64">;
+defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64">;
+defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64">;
+defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+
+defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64">;
+defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64">;
+defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64">;
+defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64">;
+defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64">;
+defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64">;
+defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64">;
+defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64">;
+
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32">;
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64">;
+
+let hasSideEffects = 1, Defs = [EXEC] in {
+defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
+} // End hasSideEffects = 1, Defs = [EXEC]
+
+} // End isCompare = 1
+
+//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
+//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
+//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
+def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
+//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
+//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
+//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
+//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
+//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>;
+//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
+//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
+//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
+def BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
+def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
+def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
+//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
+//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
+//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
+//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>;
+//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
+//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
+//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
+//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
+//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>;
+//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>;
+//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>;
+//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>;
+//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>;
+//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>;
+//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>;
+//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>;
+//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>;
+//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>;
+//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>;
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>;
+//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>;
+//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>;
+//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>;
+//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>;
+//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>;
+//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>;
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>;
+//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>;
+//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>;
+//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>;
+//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>;
+//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>;
+//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>;
+//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>;
+//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>;
+//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>;
+//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>;
+//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>;
+//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
+//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
+//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
+//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
+//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
+//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
+def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
+//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>;
+//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>;
+//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>;
+//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>;
+
+let mayLoad = 1 in {
+
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SReg_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
+
+defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
+  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SReg_32
+>;
+
+defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
+  0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
+  0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
+  0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
+  0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
+>;
+
+} // mayLoad = 1
+
+//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
+//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
+//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
+//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>;
+//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
+//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
+//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
+//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>;
+//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>;
+//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
+//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
+//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>;
+//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
+//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
+//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
+//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>;
+//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>;
+//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>;
+//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>;
+//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>;
+//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>;
+//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>;
+//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>;
+//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>;
+//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
+def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">; 
+//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
+def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, "IMAGE_SAMPLE_D">;
+//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
+def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">;
+def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">;
+//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
+//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
+def IMAGE_SAMPLE_C : MIMG_Load_Helper <0x00000028, "IMAGE_SAMPLE_C">;
+//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
+//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
+//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
+def IMAGE_SAMPLE_C_L : MIMG_Load_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
+def IMAGE_SAMPLE_C_B : MIMG_Load_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
+//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
+//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
+//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
+//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>;
+//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>;
+//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>;
+//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>;
+//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>;
+//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>;
+//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>;
+//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>;
+//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>;
+//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>;
+//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>;
+//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>;
+//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
+//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
+//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
+//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>;
+//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>;
+//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>;
+//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>;
+//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>;
+//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>;
+//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>;
+//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>;
+//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>;
+//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>;
+//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>;
+//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>;
+//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>;
+//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>;
+//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>;
+//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>;
+//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>;
+//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>;
+//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>;
+//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>;
+//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>;
+//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>;
+//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>;
+//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>;
+//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>;
+//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
+//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
+//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>;
+//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>;
+//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>;
+//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>;
+//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>;
+//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>;
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
+//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
+
+
+let neverHasSideEffects = 1, isMoveImm = 1 in {
+defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
+} // End neverHasSideEffects = 1, isMoveImm = 1
+
+defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
+//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>;
+//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>;
+defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
+  [(set VReg_32:$dst, (sint_to_fp VSrc_32:$src0))]
+>;
+//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>;
+//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
+defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
+  [(set (i32 VReg_32:$dst), (fp_to_sint VSrc_32:$src0))]
+>;
+defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
+////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
+//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
+//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
+//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
+//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
+//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>;
+//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>;
+//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
+//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
+//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
+//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
+//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>;
+//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>;
+defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
+  [(set VReg_32:$dst, (AMDGPUfract VSrc_32:$src0))]
+>;
+defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>;
+defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32",
+  [(set VReg_32:$dst, (fceil VSrc_32:$src0))]
+>;
+defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32",
+  [(set VReg_32:$dst, (frint VSrc_32:$src0))]
+>;
+defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32",
+  [(set VReg_32:$dst, (ffloor VSrc_32:$src0))]
+>;
+defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32",
+  [(set VReg_32:$dst, (fexp2 VSrc_32:$src0))]
+>;
+defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
+defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
+  [(set VReg_32:$dst, (flog2 VSrc_32:$src0))]
+>;
+defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
+defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
+defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
+  [(set VReg_32:$dst, (fdiv FP_ONE, VSrc_32:$src0))]
+>;
+defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
+defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
+defm V_RSQ_LEGACY_F32 : VOP1_32 <
+  0x0000002d, "V_RSQ_LEGACY_F32",
+  [(set VReg_32:$dst, (int_AMDGPU_rsq VSrc_32:$src0))]
+>;
+defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;
+defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>;
+defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
+defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>;
+defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
+defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>;
+defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>;
+defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>;
+defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>;
+defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>;
+defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>;
+defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>;
+defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>;
+defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>;
+//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>;
+defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>;
+defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>;
+//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>;
+defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>;
+//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>;
+defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
+defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
+defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
+
+def V_INTERP_P1_F32 : VINTRP <
+  0x00000000,
+  (outs VReg_32:$dst),
+  (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+  "V_INTERP_P1_F32 $dst, $i, $attr_chan, $attr, [$m0]",
+  []> {
+  let DisableEncoding = "$m0";
+}
+
+def V_INTERP_P2_F32 : VINTRP <
+  0x00000001,
+  (outs VReg_32:$dst),
+  (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+  "V_INTERP_P2_F32 $dst, [$src0], $j, $attr_chan, $attr, [$m0]",
+  []> {
+
+  let Constraints = "$src0 = $dst";
+  let DisableEncoding = "$src0,$m0";
+
+}
+
+def V_INTERP_MOV_F32 : VINTRP <
+  0x00000002,
+  (outs VReg_32:$dst),
+  (ins InterpSlot:$src0, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0),
+  "V_INTERP_MOV_F32 $dst, $src0, $attr_chan, $attr, [$m0]",
+  []> {
+  let DisableEncoding = "$m0";
+}
+
+//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>;
+
+let isTerminator = 1 in {
+
+def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
+  [(IL_retflag)]> {
+  let SIMM16 = 0;
+  let isBarrier = 1;
+  let hasCtrlDep = 1;
+}
+
+let isBranch = 1 in {
+def S_BRANCH : SOPP <
+  0x00000002, (ins brtarget:$target), "S_BRANCH $target",
+  [(br bb:$target)]> {
+  let isBarrier = 1;
+}
+
+let DisableEncoding = "$scc" in {
+def S_CBRANCH_SCC0 : SOPP <
+  0x00000004, (ins brtarget:$target, SCCReg:$scc),
+  "S_CBRANCH_SCC0 $target", []
+>;
+def S_CBRANCH_SCC1 : SOPP <
+  0x00000005, (ins brtarget:$target, SCCReg:$scc),
+  "S_CBRANCH_SCC1 $target",
+  []
+>;
+} // End DisableEncoding = "$scc"
+
+def S_CBRANCH_VCCZ : SOPP <
+  0x00000006, (ins brtarget:$target, VCCReg:$vcc),
+  "S_CBRANCH_VCCZ $target",
+  []
+>;
+def S_CBRANCH_VCCNZ : SOPP <
+  0x00000007, (ins brtarget:$target, VCCReg:$vcc),
+  "S_CBRANCH_VCCNZ $target",
+  []
+>;
+
+let DisableEncoding = "$exec" in {
+def S_CBRANCH_EXECZ : SOPP <
+  0x00000008, (ins brtarget:$target, EXECReg:$exec),
+  "S_CBRANCH_EXECZ $target",
+  []
+>;
+def S_CBRANCH_EXECNZ : SOPP <
+  0x00000009, (ins brtarget:$target, EXECReg:$exec),
+  "S_CBRANCH_EXECNZ $target",
+  []
+>;
+} // End DisableEncoding = "$exec"
+
+
+} // End isBranch = 1
+} // End isTerminator = 1
+
+//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>;
+let hasSideEffects = 1 in {
+def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16",
+  []
+>;
+} // End hasSideEffects
+//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
+//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
+//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
+//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>;
+//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
+//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
+//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
+//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
+//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
+//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
+
+def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
+  (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
+  "V_CNDMASK_B32_e32 $dst, $src0, $src1, [$vcc]",
+  []
+>{
+  let DisableEncoding = "$vcc";
+}
+
+def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
+  (ins VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2,
+   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
+  "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg",
+  [(set (i32 VReg_32:$dst), (select (i1 SSrc_64:$src2),
+   VSrc_32:$src1, VSrc_32:$src0))]
+>;
+
+//f32 pattern for V_CNDMASK_B32_e64
+def : Pat <
+  (f32 (select (i1 SSrc_64:$src2), VSrc_32:$src1, VSrc_32:$src0)),
+  (V_CNDMASK_B32_e64 VSrc_32:$src0, VSrc_32:$src1, SSrc_64:$src2)
+>;
+
+defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>;
+defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>;
+
+let isCommutable = 1 in {
+defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32",
+  [(set VReg_32:$dst, (fadd VSrc_32:$src0, VReg_32:$src1))]
+>;
+
+defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32",
+  [(set VReg_32:$dst, (fsub VSrc_32:$src0, VReg_32:$src1))]
+>;
+defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", [], "V_SUB_F32">;
+} // End isCommutable = 1
+
+defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>;
+
+let isCommutable = 1 in {
+
+defm V_MUL_LEGACY_F32 : VOP2_32 <
+  0x00000007, "V_MUL_LEGACY_F32",
+  [(set VReg_32:$dst, (int_AMDGPU_mul VSrc_32:$src0, VReg_32:$src1))]
+>;
+
+defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
+  [(set VReg_32:$dst, (fmul VSrc_32:$src0, VReg_32:$src1))]
+>;
+
+} // End isCommutable = 1
+
+//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>;
+//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
+//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>;
+//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
+
+let isCommutable = 1 in {
+
+defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
+  [(set VReg_32:$dst, (AMDGPUfmin VSrc_32:$src0, VReg_32:$src1))]
+>;
+
+defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
+  [(set VReg_32:$dst, (AMDGPUfmax VSrc_32:$src0, VReg_32:$src1))]
+>;
+
+defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
+defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
+defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
+defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
+defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
+defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
+
+defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32",
+  [(set VReg_32:$dst, (srl VSrc_32:$src0, (i32 VReg_32:$src1)))]
+>;
+defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">;
+
+defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
+  [(set VReg_32:$dst, (sra VSrc_32:$src0, (i32 VReg_32:$src1)))]
+>;
+defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
+
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
+  [(set VReg_32:$dst, (shl VSrc_32:$src0, (i32 VReg_32:$src1)))]
+>;
+defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
+
+defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
+  [(set VReg_32:$dst, (and VSrc_32:$src0, VReg_32:$src1))]
+>;
+defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
+  [(set VReg_32:$dst, (or VSrc_32:$src0, VReg_32:$src1))]
+>;
+defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
+  [(set VReg_32:$dst, (xor VSrc_32:$src0, VReg_32:$src1))]
+>;
+
+} // End isCommutable = 1
+
+defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>;
+defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
+defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
+defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
+//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
+//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
+//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
+
+let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
+defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
+  [(set VReg_32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
+>;
+
+defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
+  [(set VReg_32:$dst, (sub (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
+>;
+defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], "V_SUB_I32">;
+
+let Uses = [VCC] in { // Carry-out comes from VCC
+defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", []>;
+defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", []>;
+defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], "V_SUBB_U32">;
+} // End Uses = [VCC]
+} // End isCommutable = 1, Defs = [VCC]
+
+defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
+////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>;
+////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
+////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
+ [(set VReg_32:$dst, (int_SI_packf16 VSrc_32:$src0, VReg_32:$src1))]
+>;
+////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
+////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
+def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>;
+def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>;
+def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>;
+def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>;
+def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>;
+def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>;
+def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>;
+def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>;
+def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>;
+def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>;
+def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>;
+def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>;
+////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
+////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
+////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
+////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
+//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
+
+let neverHasSideEffects = 1 in {
+
+def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
+def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
+//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>;
+//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>;
+
+} // End neverHasSideEffects
+def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
+def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
+def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
+def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
+def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
+def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
+def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
+def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
+def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
+//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
+def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
+def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
+def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
+////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
+////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
+////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
+////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
+////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
+////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
+////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
+////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
+////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
+//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
+//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
+//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
+def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
+////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
+def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
+def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
+def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>;
+def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>;
+def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>;
+def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
+def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
+def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
+def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
+def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
+
+let isCommutable = 1 in {
+
+def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
+def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
+def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
+def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
+
+} // isCommutable = 1
+
+def : Pat <
+  (mul VSrc_32:$src0, VReg_32:$src1),
+  (V_MUL_LO_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+>;
+
+def : Pat <
+  (mulhu VSrc_32:$src0, VReg_32:$src1),
+  (V_MUL_HI_U32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+>;
+
+def : Pat <
+  (mulhs VSrc_32:$src0, VReg_32:$src1),
+  (V_MUL_HI_I32 VSrc_32:$src0, VReg_32:$src1, (i32 0), 0, 0, 0, 0)
+>;
+
+def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
+def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
+def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
+//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
+//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
+//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
+def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
+def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
+def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
+def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>;
+def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>;
+def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>;
+def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>;
+def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>;
+def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>;
+def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>;
+def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>;
+
+def S_CSELECT_B32 : SOP2 <
+  0x0000000a, (outs SReg_32:$dst),
+  (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
+  [(set (i32 SReg_32:$dst), (select (i1 SCCReg:$scc),
+                                     SReg_32:$src0, SReg_32:$src1))]
+>;
+
+def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
+
+// f32 pattern for S_CSELECT_B32
+def : Pat <
+  (f32 (select (i1 SCCReg:$scc), SReg_32:$src0, SReg_32:$src1)),
+  (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc)
+>;
+
+def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>;
+
+def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
+  [(set SReg_64:$dst, (i64 (and SSrc_64:$src0, SSrc_64:$src1)))]
+>;
+
+def : Pat <
+  (i1 (and SSrc_64:$src0, SSrc_64:$src1)),
+  (S_AND_B64 SSrc_64:$src0, SSrc_64:$src1)
+>;
+
+def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>;
+def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>;
+def : Pat <
+  (i1 (or SSrc_64:$src0, SSrc_64:$src1)),
+  (S_OR_B64 SSrc_64:$src0, SSrc_64:$src1)
+>;
+def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
+def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>;
+def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
+def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
+def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
+def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
+def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
+def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
+def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
+def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
+def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
+def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>;
+def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>;
+def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>;
+def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>;
+def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>;
+def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>;
+def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
+def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
+def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
+def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
+def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
+def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
+def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
+//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
+def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+def LOAD_CONST : AMDGPUShaderInst <
+  (outs GPRF32:$dst),
+  (ins i32imm:$src),
+  "LOAD_CONST $dst, $src",
+  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
+>;
+
+// SI Psuedo instructions. These are used by the CFG structurizer pass
+// and should be lowered to ISA instructions prior to codegen.
+
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
+    Uses = [EXEC], Defs = [EXEC] in {
+
+let isBranch = 1, isTerminator = 1 in {
+
+def SI_IF : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$vcc, brtarget:$target),
+  "SI_IF $dst, $vcc, $target",
+  [(set SReg_64:$dst, (int_SI_if SReg_64:$vcc, bb:$target))]
+>;
+
+def SI_ELSE : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src, brtarget:$target),
+  "SI_ELSE $dst, $src, $target",
+  [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> {
+
+  let Constraints = "$src = $dst";
+}
+
+def SI_LOOP : InstSI <
+  (outs),
+  (ins SReg_64:$saved, brtarget:$target),
+  "SI_LOOP $saved, $target",
+  [(int_SI_loop SReg_64:$saved, bb:$target)]
+>;
+
+} // end isBranch = 1, isTerminator = 1
+
+def SI_BREAK : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src),
+  "SI_ELSE $dst, $src",
+  [(set SReg_64:$dst, (int_SI_break SReg_64:$src))]
+>;
+
+def SI_IF_BREAK : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$vcc, SReg_64:$src),
+  "SI_IF_BREAK $dst, $vcc, $src",
+  [(set SReg_64:$dst, (int_SI_if_break SReg_64:$vcc, SReg_64:$src))]
+>;
+
+def SI_ELSE_BREAK : InstSI <
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src0, SReg_64:$src1),
+  "SI_ELSE_BREAK $dst, $src0, $src1",
+  [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))]
+>;
+
+def SI_END_CF : InstSI <
+  (outs),
+  (ins SReg_64:$saved),
+  "SI_END_CF $saved",
+  [(int_SI_end_cf SReg_64:$saved)]
+>;
+
+def SI_KILL : InstSI <
+  (outs),
+  (ins VReg_32:$src),
+  "SI_KIL $src",
+  [(int_AMDGPU_kill VReg_32:$src)]
+>;
+
+} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1
+  // Uses = [EXEC], Defs = [EXEC]
+
+let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
+
+def SI_INDIRECT_SRC : InstSI <
+  (outs VReg_32:$dst, SReg_64:$temp),
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
+  "SI_INDIRECT_SRC $dst, $temp, $src, $idx, $off",
+  []
+>;
+
+class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
+  (outs rc:$dst, SReg_64:$temp),
+  (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VReg_32:$val),
+  "SI_INDIRECT_DST $dst, $temp, $src, $idx, $off, $val",
+  []
+> {
+  let Constraints = "$src = $dst";
+}
+
+def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
+def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
+def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
+def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+
+} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
+
+} // end IsCodeGenOnly, isPseudo
+
+def : Pat<
+  (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2),
+  (V_CNDMASK_B32_e64 VReg_32:$src2, VReg_32:$src1, (V_CMP_GT_F32_e64 0, VReg_32:$src0))
+>;
+
+def : Pat <
+  (int_AMDGPU_kilp),
+  (SI_KILL (V_MOV_B32_e32 0xbf800000))
+>;
+
+/* int_SI_vs_load_input */
+def : Pat<
+  (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset,
+                        VReg_32:$buf_idx_vgpr),
+  (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
+                           VReg_32:$buf_idx_vgpr, SReg_128:$tlst,
+                           0, 0, 0)
+>;
+
+/* int_SI_export */
+def : Pat <
+  (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
+                 VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+  (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
+       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3)
+>;
+
+
+/* int_SI_sample for simple 1D texture lookup */
+def : Pat <
+  (int_SI_sample imm:$writemask, VReg_32:$addr,
+                 SReg_256:$rsrc, SReg_128:$sampler, imm),
+  (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_32:$addr,
+                SReg_256:$rsrc, SReg_128:$sampler)
+>;
+
+class SamplePattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
+                    ValueType addr_type> : Pat <
+    (name imm:$writemask, (addr_type addr_class:$addr),
+          SReg_256:$rsrc, SReg_128:$sampler, imm),
+    (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, addr_class:$addr,
+          SReg_256:$rsrc, SReg_128:$sampler)
+>;
+
+class SampleRectPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
+                        ValueType addr_type> : Pat <
+    (name imm:$writemask, (addr_type addr_class:$addr),
+          SReg_256:$rsrc, SReg_128:$sampler, TEX_RECT),
+    (opcode imm:$writemask, 1, 0, 0, 0, 0, 0, 0, addr_class:$addr,
+          SReg_256:$rsrc, SReg_128:$sampler)
+>;
+
+class SampleArrayPattern<Intrinsic name, MIMG opcode, RegisterClass addr_class,
+                         ValueType addr_type> : Pat <
+    (name imm:$writemask, (addr_type addr_class:$addr),
+          SReg_256:$rsrc, SReg_128:$sampler, TEX_ARRAY),
+    (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, addr_class:$addr,
+          SReg_256:$rsrc, SReg_128:$sampler)
+>;
+
+class SampleShadowPattern<Intrinsic name, MIMG opcode,
+                          RegisterClass addr_class, ValueType addr_type> : Pat <
+    (name imm:$writemask, (addr_type addr_class:$addr),
+          SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW),
+    (opcode imm:$writemask, 0, 0, 0, 0, 0, 0, 0, addr_class:$addr,
+          SReg_256:$rsrc, SReg_128:$sampler)
+>;
+
+class SampleShadowArrayPattern<Intrinsic name, MIMG opcode,
+                               RegisterClass addr_class, ValueType addr_type> : Pat <
+    (name imm:$writemask, (addr_type addr_class:$addr),
+          SReg_256:$rsrc, SReg_128:$sampler, TEX_SHADOW_ARRAY),
+    (opcode imm:$writemask, 0, 0, 1, 0, 0, 0, 0, addr_class:$addr,
+          SReg_256:$rsrc, SReg_128:$sampler)
+>;
+
+/* int_SI_sample* for texture lookups consuming more address parameters */
+multiclass SamplePatterns<RegisterClass addr_class, ValueType addr_type> {
+  def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
+  def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
+  def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_class, addr_type>;
+  def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_class, addr_type>;
+
+  def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>;
+  def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_class, addr_type>;
+  def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_class, addr_type>;
+
+  def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>;
+  def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_class, addr_type>;
+  def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>;
+  def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_class, addr_type>;
+}
+
+defm : SamplePatterns<VReg_64, v2i32>;
+defm : SamplePatterns<VReg_128, v4i32>;
+defm : SamplePatterns<VReg_256, v8i32>;
+defm : SamplePatterns<VReg_512, v16i32>;
+
+/********** ============================================ **********/
+/********** Extraction, Insertion, Building and Casting  **********/
+/********** ============================================ **********/
+
+foreach Index = 0-2 in {
+  def Extract_Element_v2i32_#Index : Extract_Element <
+    i32, v2i32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2i32_#Index : Insert_Element <
+    i32, v2i32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v2f32_#Index : Extract_Element <
+    f32, v2f32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v2f32_#Index : Insert_Element <
+    f32, v2f32, VReg_32, VReg_64, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-3 in {
+  def Extract_Element_v4i32_#Index : Extract_Element <
+    i32, v4i32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v4i32_#Index : Insert_Element <
+    i32, v4i32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v4f32_#Index : Extract_Element <
+    f32, v4f32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v4f32_#Index : Insert_Element <
+    f32, v4f32, VReg_32, VReg_128, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-7 in {
+  def Extract_Element_v8i32_#Index : Extract_Element <
+    i32, v8i32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v8i32_#Index : Insert_Element <
+    i32, v8i32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v8f32_#Index : Extract_Element <
+    f32, v8f32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v8f32_#Index : Insert_Element <
+    f32, v8f32, VReg_32, VReg_256, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+foreach Index = 0-15 in {
+  def Extract_Element_v16i32_#Index : Extract_Element <
+    i32, v16i32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v16i32_#Index : Insert_Element <
+    i32, v16i32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+
+  def Extract_Element_v16f32_#Index : Extract_Element <
+    f32, v16f32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+  def Insert_Element_v16f32_#Index : Insert_Element <
+    f32, v16f32, VReg_32, VReg_512, Index, !cast<SubRegIndex>(sub#Index)
+  >;
+}
+
+def : Vector1_Build <v1i32, VReg_32, i32, VReg_32>;
+def : Vector2_Build <v2i32, VReg_64, i32, VReg_32>;
+def : Vector2_Build <v2f32, VReg_64, f32, VReg_32>;
+def : Vector4_Build <v4i32, VReg_128, i32, VReg_32>;
+def : Vector4_Build <v4f32, VReg_128, f32, VReg_32>;
+def : Vector8_Build <v8i32, VReg_256, i32, VReg_32>;
+def : Vector8_Build <v8f32, VReg_256, f32, VReg_32>;
+def : Vector16_Build <v16i32, VReg_512, i32, VReg_32>;
+def : Vector16_Build <v16f32, VReg_512, f32, VReg_32>;
+
+def : BitConvert <i32, f32, SReg_32>;
+def : BitConvert <i32, f32, VReg_32>;
+
+def : BitConvert <f32, i32, SReg_32>;
+def : BitConvert <f32, i32, VReg_32>;
+
+/********** =================== **********/
+/********** Src & Dst modifiers **********/
+/********** =================== **********/
+
+def : Pat <
+  (int_AMDIL_clamp VReg_32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
+  (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */),
+   0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
+>;
+
+def : Pat <
+  (fabs VReg_32:$src),
+  (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */),
+   1 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
+>;
+
+def : Pat <
+  (fneg VReg_32:$src),
+  (V_ADD_F32_e64 VReg_32:$src, (i32 0 /* SRC1 */),
+   0 /* ABS */, 0 /* CLAMP */, 0 /* OMOD */, 1 /* NEG */)
+>;
+
+/********** ================== **********/
+/********** Immediate Patterns **********/
+/********** ================== **********/
+
+def : Pat <
+  (i32 imm:$imm),
+  (V_MOV_B32_e32 imm:$imm)
+>;
+
+def : Pat <
+  (f32 fpimm:$imm),
+  (V_MOV_B32_e32 fpimm:$imm)
+>;
+
+def : Pat <
+  (i1 imm:$imm),
+  (S_MOV_B64 imm:$imm)
+>;
+
+def : Pat <
+  (i64 InlineImm<i64>:$imm),
+  (S_MOV_B64 InlineImm<i64>:$imm)
+>;
+
+// i64 immediates aren't supported in hardware, split it into two 32bit values
+def : Pat <
+  (i64 imm:$imm),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0),
+    (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1)
+>;
+
+/********** ===================== **********/
+/********** Interpolation Paterns **********/
+/********** ===================== **********/
+
+def : Pat <
+  (int_SI_fs_constant imm:$attr_chan, imm:$attr, M0Reg:$params),
+  (V_INTERP_MOV_F32 INTERP.P0, imm:$attr_chan, imm:$attr, M0Reg:$params)
+>;
+
+def : Pat <
+  (int_SI_fs_interp imm:$attr_chan, imm:$attr, M0Reg:$params, VReg_64:$ij),
+  (V_INTERP_P2_F32 (V_INTERP_P1_F32 (EXTRACT_SUBREG VReg_64:$ij, sub0),
+                                    imm:$attr_chan, imm:$attr, M0Reg:$params),
+                   (EXTRACT_SUBREG VReg_64:$ij, sub1),
+                   imm:$attr_chan, imm:$attr, M0Reg:$params)
+>;
+
+/********** ================== **********/
+/********** Intrinsic Patterns **********/
+/********** ================== **********/
+
+/* llvm.AMDGPU.pow */
+def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32, VReg_32>;
+
+def : Pat <
+  (int_AMDGPU_div VSrc_32:$src0, VSrc_32:$src1),
+  (V_MUL_LEGACY_F32_e32 VSrc_32:$src0, (V_RCP_LEGACY_F32_e32 VSrc_32:$src1))
+>;
+
+def : Pat<
+  (fdiv VSrc_32:$src0, VSrc_32:$src1),
+  (V_MUL_F32_e32 VSrc_32:$src0, (V_RCP_F32_e32 VSrc_32:$src1))
+>;
+
+def : Pat <
+  (fcos VSrc_32:$src0),
+  (V_COS_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
+>;
+
+def : Pat <
+  (fsin VSrc_32:$src0),
+  (V_SIN_F32_e32 (V_MUL_F32_e32 VSrc_32:$src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
+>;
+
+def : Pat <
+  (int_AMDGPU_cube VReg_128:$src),
+  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+    (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
+                  (EXTRACT_SUBREG VReg_128:$src, sub1),
+                  (EXTRACT_SUBREG VReg_128:$src, sub2),
+                  0, 0, 0, 0), sub0),
+    (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
+                  (EXTRACT_SUBREG VReg_128:$src, sub1),
+                  (EXTRACT_SUBREG VReg_128:$src, sub2),
+                  0, 0, 0, 0), sub1),
+    (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
+                  (EXTRACT_SUBREG VReg_128:$src, sub1),
+                  (EXTRACT_SUBREG VReg_128:$src, sub2),
+                  0, 0, 0, 0), sub2),
+    (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sub0),
+                  (EXTRACT_SUBREG VReg_128:$src, sub1),
+                  (EXTRACT_SUBREG VReg_128:$src, sub2),
+                  0, 0, 0, 0), sub3)
+>;
+
+def : Pat <
+  (i32 (sext (i1 SReg_64:$src0))),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 -1), SReg_64:$src0)
+>;
+
+// 1. Offset as 8bit DWORD immediate
+def : Pat <
+  (int_SI_load_const SReg_128:$sbase, IMM8bitDWORD:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM SReg_128:$sbase, IMM8bitDWORD:$offset)
+>;
+
+// 2. Offset loaded in an 32bit SGPR
+def : Pat <
+  (int_SI_load_const SReg_128:$sbase, imm:$offset),
+  (S_BUFFER_LOAD_DWORD_SGPR SReg_128:$sbase, (S_MOV_B32 imm:$offset))
+>;
+
+// 3. Offset in an 32Bit VGPR
+def : Pat <
+  (int_SI_load_const SReg_128:$sbase, VReg_32:$voff),
+  (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, VReg_32:$voff, SReg_128:$sbase, 0, 0, 0)
+>;
+
+/********** ================== **********/
+/**********   VOP3 Patterns    **********/
+/********** ================== **********/
+
+def : Pat <(f32 (fadd (fmul VSrc_32:$src0, VSrc_32:$src1), VSrc_32:$src2)),
+           (V_MAD_F32 VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
+            0, 0, 0, 0)>;
+
+/********** ================== **********/
+/**********   SMRD Patterns    **********/
+/********** ================== **********/
+
+multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+  // 1. Offset as 8bit DWORD immediate
+  def : Pat <
+    (constant_load (SIadd64bit32bit SReg_64:$sbase, IMM8bitDWORD:$offset)),
+    (vt (Instr_IMM SReg_64:$sbase, IMM8bitDWORD:$offset))
+  >;
+
+  // 2. Offset loaded in an 32bit SGPR
+  def : Pat <
+    (constant_load (SIadd64bit32bit SReg_64:$sbase, imm:$offset)),
+    (vt (Instr_SGPR SReg_64:$sbase, (S_MOV_B32 imm:$offset)))
+  >;
+
+  // 3. No offset at all
+  def : Pat <
+    (constant_load SReg_64:$sbase),
+    (vt (Instr_IMM SReg_64:$sbase, 0))
+  >;
+}
+
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+
+/********** ====================== **********/
+/**********   Indirect adressing   **********/
+/********** ====================== **********/
+
+multiclass SI_INDIRECT_Pattern <RegisterClass rc, ValueType vt,
+                                SI_INDIRECT_DST IndDst> {
+  // 1. Extract with offset
+  def : Pat<
+    (vector_extract (vt rc:$vec),
+      (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
+    ),
+    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off))
+  >;
+
+  // 2. Extract without offset
+  def : Pat<
+    (vector_extract (vt rc:$vec),
+      (i64 (zext (i32 VReg_32:$idx)))
+    ),
+    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0))
+  >;
+
+  // 3. Insert with offset
+  def : Pat<
+    (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
+      (i64 (zext (i32 (add VReg_32:$idx, imm:$off))))
+    ),
+    (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, imm:$off, VReg_32:$val))
+  >;
+
+  // 4. Insert without offset
+  def : Pat<
+    (vector_insert (vt rc:$vec), (f32 VReg_32:$val),
+      (i64 (zext (i32 VReg_32:$idx)))
+    ),
+    (vt (IndDst (IMPLICIT_DEF), rc:$vec, VReg_32:$idx, 0, VReg_32:$val))
+  >;
+}
+
+defm : SI_INDIRECT_Pattern <VReg_64, v2f32, SI_INDIRECT_DST_V2>;
+defm : SI_INDIRECT_Pattern <VReg_128, v4f32, SI_INDIRECT_DST_V4>;
+defm : SI_INDIRECT_Pattern <VReg_256, v8f32, SI_INDIRECT_DST_V8>;
+defm : SI_INDIRECT_Pattern <VReg_512, v16f32, SI_INDIRECT_DST_V16>;
+
+/********** =============== **********/
+/**********   Conditions    **********/
+/********** =============== **********/
+
+def : Pat<
+  (i1 (setcc f32:$src0, f32:$src1, SETO)),
+  (V_CMP_O_F32_e64 f32:$src0, f32:$src1)
+>;
+
+def : Pat<
+  (i1 (setcc f32:$src0, f32:$src1, SETUO)),
+  (V_CMP_U_F32_e64 f32:$src0, f32:$src1)
+>;
+
+} // End isSI predicate
diff --git a/contrib/llvm/lib/Target/R600/SIIntrinsics.td b/contrib/llvm/lib/Target/R600/SIIntrinsics.td
new file mode 100644
index 000000000000..0af378edfe2e
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIIntrinsics.td
@@ -0,0 +1,42 @@
+//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SI Intrinsic Definitions
+//
+//===----------------------------------------------------------------------===//
+
+
+let TargetPrefix = "SI", isTarget = 1 in {
+
+  def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
+
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
+  def int_SI_sample : Sample;
+  def int_SI_sampleb : Sample;
+  def int_SI_samplel : Sample;
+
+  /* Interpolation Intrinsics */
+
+  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
+
+  /* Control flow Intrinsics */
+
+  def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
+  def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
+  def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
+  def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
+  def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
+  def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+}
diff --git a/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp
new file mode 100644
index 000000000000..2b60eb9fb375
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp
@@ -0,0 +1,501 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers the pseudo control flow instructions to real
+/// machine instructions.
+///
+/// All control flow is handled using predicated instructions and
+/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
+/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
+/// by writting to the 64-bit EXEC register (each bit corresponds to a
+/// single vector ALU).  Typically, for predicates, a vector ALU will write
+/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each
+/// Vector ALU) and then the ScalarALU will AND the VCC register with the
+/// EXEC to update the predicates.
+///
+/// For example:
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
+/// %SGPR0 = SI_IF %VCC
+///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
+/// %SGPR0 = SI_ELSE %SGPR0
+///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
+/// SI_END_CF %SGPR0
+///
+/// becomes:
+///
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label0            // This instruction is an optional
+///                                   // optimization which allows us to
+///                                   // branch if all the bits of
+///                                   // EXEC are zero.
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+///
+/// label0:
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC   // Restore the exec mask for the Then block
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC    // Clear live bits from saved exec mask
+/// S_BRANCH_EXECZ label1              // Use our branch optimization
+///                                    // instruction again.
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR   // Do the THEN block
+/// label1:
+/// %EXEC = S_OR_B64 %EXEC, %SGPR0     // Re-enable saved exec mask bits
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerControlFlowPass : public MachineFunctionPass {
+
+private:
+  static const unsigned SkipThreshold = 12;
+
+  static char ID;
+  const TargetRegisterInfo *TRI;
+  const TargetInstrInfo *TII;
+
+  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
+
+  void Skip(MachineInstr &From, MachineOperand &To);
+  void SkipIfDead(MachineInstr &MI);
+
+  void If(MachineInstr &MI);
+  void Else(MachineInstr &MI);
+  void Break(MachineInstr &MI);
+  void IfBreak(MachineInstr &MI);
+  void ElseBreak(MachineInstr &MI);
+  void Loop(MachineInstr &MI);
+  void EndCf(MachineInstr &MI);
+
+  void Kill(MachineInstr &MI);
+  void Branch(MachineInstr &MI);
+
+  void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
+  void IndirectSrc(MachineInstr &MI);
+  void IndirectDst(MachineInstr &MI);
+
+public:
+  SILowerControlFlowPass(TargetMachine &tm) :
+    MachineFunctionPass(ID), TRI(tm.getRegisterInfo()),
+    TII(tm.getInstrInfo()) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "SI Lower control flow instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SILowerControlFlowPass::ID = 0;
+
+FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
+  return new SILowerControlFlowPass(tm);
+}
+
+bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
+                                        MachineBasicBlock *To) {
+
+  unsigned NumInstr = 0;
+
+  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
+       MBB = *MBB->succ_begin()) {
+
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+         NumInstr < SkipThreshold && I != E; ++I) {
+
+      if (I->isBundle() || !I->isBundled())
+        if (++NumInstr >= SkipThreshold)
+          return true;
+    }
+  }
+
+  return false;
+}
+
+void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
+
+  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
+    return;
+
+  DebugLoc DL = From.getDebugLoc();
+  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+          .addOperand(To)
+          .addReg(AMDGPU::EXEC);
+}
+
+void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  if (!shouldSkip(&MBB, &MBB.getParent()->back()))
+    return;
+
+  MachineBasicBlock::iterator Insert = &MI;
+  ++Insert;
+
+  // If the exec mask is non-zero, skip the next two instructions
+  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+          .addImm(3)
+          .addReg(AMDGPU::EXEC);
+
+  // Exec mask is zero: Export to NULL target...
+  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
+          .addImm(0)
+          .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+          .addImm(0)
+          .addImm(1)
+          .addImm(1)
+          .addReg(AMDGPU::VGPR0)
+          .addReg(AMDGPU::VGPR0)
+          .addReg(AMDGPU::VGPR0)
+          .addReg(AMDGPU::VGPR0);
+
+  // ... and terminate wavefront
+  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+}
+
+void SILowerControlFlowPass::If(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Reg = MI.getOperand(0).getReg();
+  unsigned Vcc = MI.getOperand(1).getReg();
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
+          .addReg(Vcc);
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Reg);
+
+  Skip(MI, MI.getOperand(2));
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Else(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+
+  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
+          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
+          .addReg(Src); // Saved EXEC
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Dst);
+
+  Skip(MI, MI.getOperand(2));
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Break(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Src = MI.getOperand(1).getReg();
+ 
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Src);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Vcc = MI.getOperand(1).getReg();
+  unsigned Src = MI.getOperand(2).getReg();
+ 
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+          .addReg(Vcc)
+          .addReg(Src);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Saved = MI.getOperand(1).getReg();
+  unsigned Src = MI.getOperand(2).getReg();
+ 
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+          .addReg(Saved)
+          .addReg(Src);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Loop(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Src = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Src);
+
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+          .addOperand(MI.getOperand(1))
+          .addReg(AMDGPU::EXEC);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Reg = MI.getOperand(0).getReg();
+
+  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
+          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(Reg);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::Branch(MachineInstr &MI) {
+  MachineBasicBlock *Next = MI.getParent()->getNextNode();
+  MachineBasicBlock *Target = MI.getOperand(0).getMBB();
+  if (Target == Next)
+    MI.eraseFromParent();
+  else
+    assert(0);
+}
+
+void SILowerControlFlowPass::Kill(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Kill is only allowed in pixel shaders
+  assert(MBB.getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType ==
+         ShaderType::PIXEL);
+
+  // Clear this pixel from the exec mask if the operand is negative
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
+          .addImm(0)
+          .addOperand(MI.getOperand(0));
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineBasicBlock::iterator I = MI;
+
+  unsigned Save = MI.getOperand(1).getReg();
+  unsigned Idx = MI.getOperand(3).getReg();
+
+  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+            .addReg(Idx);
+    MBB.insert(I, MovRel);
+    MI.eraseFromParent();
+    return;
+  }
+
+  assert(AMDGPU::SReg_64RegClass.contains(Save));
+  assert(AMDGPU::VReg_32RegClass.contains(Idx));
+
+  // Save the EXEC mask
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+          .addReg(AMDGPU::EXEC);
+
+  // Read the next variant into VCC (lower 32 bits) <- also loop target
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32_e32), AMDGPU::VCC)
+          .addReg(Idx);
+
+  // Move index from VCC into M0
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+          .addReg(AMDGPU::VCC);
+
+  // Compare the just read M0 value to all possible Idx values
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
+          .addReg(AMDGPU::M0)
+          .addReg(Idx);
+
+  // Update EXEC, save the original EXEC value to VCC
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+          .addReg(AMDGPU::VCC);
+
+  // Do the actual move
+  MBB.insert(I, MovRel);
+
+  // Update EXEC, switch all done bits to 0 and all todo bits to 1
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+          .addReg(AMDGPU::EXEC)
+          .addReg(AMDGPU::VCC);
+
+  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+          .addImm(-7)
+          .addReg(AMDGPU::EXEC);
+
+  // Restore EXEC
+  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+          .addReg(Save);
+
+  MI.eraseFromParent();
+}
+
+void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Vec = MI.getOperand(2).getReg();
+  unsigned Off = MI.getOperand(4).getImm();
+
+  MachineInstr *MovRel = 
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+            .addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
+            .addReg(AMDGPU::M0, RegState::Implicit)
+            .addReg(Vec, RegState::Implicit);
+
+  LoadM0(MI, MovRel);
+}
+
+void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned Off = MI.getOperand(4).getImm();
+  unsigned Val = MI.getOperand(5).getReg();
+
+  MachineInstr *MovRel = 
+    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
+            .addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
+            .addReg(Val)
+            .addReg(AMDGPU::M0, RegState::Implicit)
+            .addReg(Dst, RegState::Implicit);
+
+  LoadM0(MI, MovRel);
+}
+
+bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
+
+  bool HaveKill = false;
+  bool NeedWQM = false;
+  unsigned Depth = 0;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next) {
+
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+      switch (MI.getOpcode()) {
+        default: break;
+        case AMDGPU::SI_IF:
+          ++Depth;
+          If(MI);
+          break;
+
+        case AMDGPU::SI_ELSE:
+          Else(MI);
+          break;
+
+        case AMDGPU::SI_BREAK:
+          Break(MI);
+          break;
+
+        case AMDGPU::SI_IF_BREAK:
+          IfBreak(MI);
+          break;
+
+        case AMDGPU::SI_ELSE_BREAK:
+          ElseBreak(MI);
+          break;
+
+        case AMDGPU::SI_LOOP:
+          ++Depth;
+          Loop(MI);
+          break;
+
+        case AMDGPU::SI_END_CF:
+          if (--Depth == 0 && HaveKill) {
+            SkipIfDead(MI);
+            HaveKill = false;
+          }
+          EndCf(MI);
+          break;
+
+        case AMDGPU::SI_KILL:
+          if (Depth == 0)
+            SkipIfDead(MI);
+          else
+            HaveKill = true;
+          Kill(MI);
+          break;
+
+        case AMDGPU::S_BRANCH:
+          Branch(MI);
+          break;
+
+        case AMDGPU::SI_INDIRECT_SRC:
+          IndirectSrc(MI);
+          break;
+
+        case AMDGPU::SI_INDIRECT_DST_V2:
+        case AMDGPU::SI_INDIRECT_DST_V4:
+        case AMDGPU::SI_INDIRECT_DST_V8:
+        case AMDGPU::SI_INDIRECT_DST_V16:
+          IndirectDst(MI);
+          break;
+
+        case AMDGPU::V_INTERP_P1_F32:
+        case AMDGPU::V_INTERP_P2_F32:
+        case AMDGPU::V_INTERP_MOV_F32:
+          NeedWQM = true;
+          break;
+
+      }
+    }
+  }
+
+  if (NeedWQM) {
+    MachineBasicBlock &MBB = MF.front();
+    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..ee0e30755f01
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -0,0 +1,18 @@
+//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+
+#include "SIMachineFunctionInfo.h"
+
+using namespace llvm;
+
+SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
+  : AMDGPUMachineFunction(MF),
+    PSInputAddr(0) { }
diff --git a/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.h
new file mode 100644
index 000000000000..6da9f7f9a14d
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIMachineFunctionInfo.h
@@ -0,0 +1,33 @@
+//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIMACHINEFUNCTIONINFO_H_
+#define SIMACHINEFUNCTIONINFO_H_
+
+#include "AMDGPUMachineFunction.h"
+
+namespace llvm {
+
+/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
+/// tells the hardware which interpolation parameters to load.
+class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+public:
+  SIMachineFunctionInfo(const MachineFunction &MF);
+  unsigned PSInputAddr;
+};
+
+} // End namespace llvm
+
+
+#endif //_SIMACHINEFUNCTIONINFO_H_
diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp
new file mode 100644
index 000000000000..99278ae8dceb
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.cpp
@@ -0,0 +1,53 @@
+//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "SIRegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm,
+    const TargetInstrInfo &tii)
+: AMDGPURegisterInfo(tm, tii),
+  TM(tm),
+  TII(tii)
+  { }
+
+BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  return Reserved;
+}
+
+unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                             MachineFunction &MF) const {
+  return RC->getNumRegs();
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const {
+  switch (rc->getID()) {
+  case AMDGPU::GPRF32RegClassID:
+    return &AMDGPU::VReg_32RegClass;
+  default: return rc;
+  }
+}
+
+const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
+                                                                   MVT VT) const {
+  switch(VT.SimpleTy) {
+    default:
+    case MVT::i32: return &AMDGPU::VReg_32RegClass;
+  }
+}
diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.h b/contrib/llvm/lib/Target/R600/SIRegisterInfo.h
new file mode 100644
index 000000000000..caec22841345
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.h
@@ -0,0 +1,50 @@
+//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIRegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIREGISTERINFO_H_
+#define SIREGISTERINFO_H_
+
+#include "AMDGPURegisterInfo.h"
+
+namespace llvm {
+
+class AMDGPUTargetMachine;
+class TargetInstrInfo;
+
+struct SIRegisterInfo : public AMDGPURegisterInfo {
+  AMDGPUTargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+  SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+
+  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  virtual unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                                       MachineFunction &MF) const;
+
+  /// \param RC is an AMDIL reg class.
+  ///
+  /// \returns the SI register class that is equivalent to \p RC.
+  virtual const TargetRegisterClass *
+    getISARegClass(const TargetRegisterClass *RC) const;
+
+  /// \brief get the register class of the specified type to use in the
+  /// CFGStructurizer
+  virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+};
+
+} // End namespace llvm
+
+#endif // SIREGISTERINFO_H_
diff --git a/contrib/llvm/lib/Target/R600/SIRegisterInfo.td b/contrib/llvm/lib/Target/R600/SIRegisterInfo.td
new file mode 100644
index 000000000000..4f14931a9c48
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SIRegisterInfo.td
@@ -0,0 +1,182 @@
+//===-- SIRegisterInfo.td - SI Register defs ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the SI registers
+//===----------------------------------------------------------------------===//
+
+class SIReg <string n, bits<16> encoding = 0> : Register<n> {
+  let Namespace = "AMDGPU";
+  let HWEncoding = encoding;
+}
+
+// Special Registers
+def VCC : SIReg<"VCC", 106>;
+def EXEC : SIReg<"EXEC", 126>;
+def SCC : SIReg<"SCC", 253>;
+def M0 : SIReg <"M0", 124>;
+
+// SGPR registers
+foreach Index = 0-101 in {
+  def SGPR#Index : SIReg <"SGPR"#Index, Index>;
+}
+
+// VGPR registers
+foreach Index = 0-255 in {
+  def VGPR#Index : SIReg <"VGPR"#Index, Index> {
+    let HWEncoding{8} = 1;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Groupings using register classes and tuples
+//===----------------------------------------------------------------------===//
+
+// SGPR 32-bit registers
+def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
+                            (add (sequence "SGPR%u", 0, 101))>;
+
+// SGPR 64-bit registers
+def SGPR_64 : RegisterTuples<[sub0, sub1],
+                             [(add (decimate (trunc SGPR_32, 101), 2)),
+                              (add (decimate (shl SGPR_32, 1), 2))]>;
+
+// SGPR 128-bit registers
+def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+                              [(add (decimate (trunc SGPR_32, 99), 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4))]>;
+
+// SGPR 256-bit registers
+def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+                              [(add (decimate (trunc SGPR_32, 95), 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4)),
+                               (add (decimate (shl SGPR_32, 4), 4)),
+                               (add (decimate (shl SGPR_32, 5), 4)),
+                               (add (decimate (shl SGPR_32, 6), 4)),
+                               (add (decimate (shl SGPR_32, 7), 4))]>;
+
+// SGPR 512-bit registers
+def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+                              [(add (decimate (trunc SGPR_32, 87), 4)),
+                               (add (decimate (shl SGPR_32, 1), 4)),
+                               (add (decimate (shl SGPR_32, 2), 4)),
+                               (add (decimate (shl SGPR_32, 3), 4)),
+                               (add (decimate (shl SGPR_32, 4), 4)),
+                               (add (decimate (shl SGPR_32, 5), 4)),
+                               (add (decimate (shl SGPR_32, 6), 4)),
+                               (add (decimate (shl SGPR_32, 7), 4)),
+                               (add (decimate (shl SGPR_32, 8), 4)),
+                               (add (decimate (shl SGPR_32, 9), 4)),
+                               (add (decimate (shl SGPR_32, 10), 4)),
+                               (add (decimate (shl SGPR_32, 11), 4)),
+                               (add (decimate (shl SGPR_32, 12), 4)),
+                               (add (decimate (shl SGPR_32, 13), 4)),
+                               (add (decimate (shl SGPR_32, 14), 4)),
+                               (add (decimate (shl SGPR_32, 15), 4))]>;
+
+// VGPR 32-bit registers
+def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
+                            (add (sequence "VGPR%u", 0, 255))>;
+
+// VGPR 64-bit registers
+def VGPR_64 : RegisterTuples<[sub0, sub1],
+                             [(add (trunc VGPR_32, 255)),
+                              (add (shl VGPR_32, 1))]>;
+
+// VGPR 128-bit registers
+def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3],
+                              [(add (trunc VGPR_32, 253)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3))]>;
+
+// VGPR 256-bit registers
+def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7],
+                              [(add (trunc VGPR_32, 249)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3)),
+                               (add (shl VGPR_32, 4)),
+                               (add (shl VGPR_32, 5)),
+                               (add (shl VGPR_32, 6)),
+                               (add (shl VGPR_32, 7))]>;
+
+// VGPR 512-bit registers
+def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
+                               sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15],
+                              [(add (trunc VGPR_32, 241)),
+                               (add (shl VGPR_32, 1)),
+                               (add (shl VGPR_32, 2)),
+                               (add (shl VGPR_32, 3)),
+                               (add (shl VGPR_32, 4)),
+                               (add (shl VGPR_32, 5)),
+                               (add (shl VGPR_32, 6)),
+                               (add (shl VGPR_32, 7)),
+                               (add (shl VGPR_32, 8)),
+                               (add (shl VGPR_32, 9)),
+                               (add (shl VGPR_32, 10)),
+                               (add (shl VGPR_32, 11)),
+                               (add (shl VGPR_32, 12)),
+                               (add (shl VGPR_32, 13)),
+                               (add (shl VGPR_32, 14)),
+                               (add (shl VGPR_32, 15))]>;
+
+//===----------------------------------------------------------------------===//
+//  Register classes used as source and destination
+//===----------------------------------------------------------------------===//
+
+// Special register classes for predicates and the M0 register
+def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)>;
+def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>;
+def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>;
+def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
+
+// Register class for all scalar registers (SGPRs + Special Registers)
+def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
+  (add SGPR_32, M0Reg)
+>;
+
+def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64,
+  (add SGPR_64, VCCReg, EXECReg)
+>;
+
+def SReg_128 : RegisterClass<"AMDGPU", [v16i8], 128, (add SGPR_128)>;
+
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
+
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8], 512, (add SGPR_512)>;
+
+// Register class for all vector registers (VGPRs + Interploation Registers)
+def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
+
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>;
+
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
+
+def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;
+
+def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
+
+//===----------------------------------------------------------------------===//
+//  [SV]Src_* register classes, can have either an immediate or an register
+//===----------------------------------------------------------------------===//
+
+def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
+
+def SSrc_64 : RegisterClass<"AMDGPU", [i64, f64, i1], 64, (add SReg_64)>;
+
+def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
+
+def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
+
diff --git a/contrib/llvm/lib/Target/R600/SISchedule.td b/contrib/llvm/lib/Target/R600/SISchedule.td
new file mode 100644
index 000000000000..28b65b825855
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/SISchedule.td
@@ -0,0 +1,15 @@
+//===-- SISchedule.td - SI Scheduling definitons -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: This is just a place holder for now.
+//
+//===----------------------------------------------------------------------===//
+
+
+def SI_Itin : ProcessorItineraries <[], [], []>;
diff --git a/contrib/llvm/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
new file mode 100644
index 000000000000..46b1f18c6263
--- /dev/null
+++ b/contrib/llvm/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
@@ -0,0 +1,26 @@
+//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+/// \brief The target for the AMDGPU backend
+Target llvm::TheAMDGPUTarget;
+
+/// \brief Extern function to initialize the targets for the AMDGPU backend
+extern "C" void LLVMInitializeR600TargetInfo() {
+  RegisterTarget<Triple::r600, false>
+    R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
+}
diff --git a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
new file mode 100644
index 000000000000..6123773d5f4b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -0,0 +1,320 @@
+//===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a simple local pass that attempts to fill delay slots with useful
+// instructions. If no instructions can be moved into the delay slot, then a
+// NOP is placed.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "delay-slot-filler"
+#include "Sparc.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+static cl::opt<bool> DisableDelaySlotFiller(
+  "disable-sparc-delay-filler",
+  cl::init(false),
+  cl::desc("Disable the Sparc delay slot filler."),
+  cl::Hidden);
+
+namespace {
+  struct Filler : public MachineFunctionPass {
+    /// Target machine description which we query for reg. names, data
+    /// layout, etc.
+    ///
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    Filler(TargetMachine &tm) 
+      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "SPARC Delay Slot Filler";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F) {
+      bool Changed = false;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+      return Changed;
+    }
+
+    bool isDelayFiller(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator candidate);
+
+    void insertCallUses(MachineBasicBlock::iterator MI,
+                        SmallSet<unsigned, 32>& RegUses);
+
+    void insertDefsUses(MachineBasicBlock::iterator MI,
+                        SmallSet<unsigned, 32>& RegDefs,
+                        SmallSet<unsigned, 32>& RegUses);
+
+    bool IsRegInSet(SmallSet<unsigned, 32>& RegSet,
+                    unsigned Reg);
+
+    bool delayHasHazard(MachineBasicBlock::iterator candidate,
+                        bool &sawLoad, bool &sawStore,
+                        SmallSet<unsigned, 32> &RegDefs,
+                        SmallSet<unsigned, 32> &RegUses);
+
+    MachineBasicBlock::iterator
+    findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot);
+
+    bool needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize);
+
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Sparc MachineFunctions
+///
+FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
+  return new Filler(tm);
+}
+
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// We assume there is only one delay slot per delayed instruction.
+///
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+    if (I->hasDelaySlot()) {
+      MachineBasicBlock::iterator D = MBB.end();
+      MachineBasicBlock::iterator J = I;
+
+      if (!DisableDelaySlotFiller)
+        D = findDelayInstr(MBB, I);
+
+      ++FilledSlots;
+      Changed = true;
+
+      if (D == MBB.end())
+        BuildMI(MBB, ++J, I->getDebugLoc(), TII->get(SP::NOP));
+      else
+        MBB.splice(++J, &MBB, D);
+      unsigned structSize = 0;
+      if (needsUnimp(I, structSize)) {
+        MachineBasicBlock::iterator J = I;
+        ++J; //skip the delay filler.
+        BuildMI(MBB, ++J, I->getDebugLoc(),
+                TII->get(SP::UNIMP)).addImm(structSize);
+      }
+    }
+  return Changed;
+}
+
+MachineBasicBlock::iterator
+Filler::findDelayInstr(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator slot)
+{
+  SmallSet<unsigned, 32> RegDefs;
+  SmallSet<unsigned, 32> RegUses;
+  bool sawLoad = false;
+  bool sawStore = false;
+
+  MachineBasicBlock::iterator I = slot;
+
+  if (slot->getOpcode() == SP::RET)
+    return MBB.end();
+
+  if (slot->getOpcode() == SP::RETL) {
+    --I;
+    if (I->getOpcode() != SP::RESTORErr)
+      return MBB.end();
+    //change retl to ret
+    slot->setDesc(TII->get(SP::RET));
+    return I;
+  }
+
+  //Call's delay filler can def some of call's uses.
+  if (slot->isCall())
+    insertCallUses(slot, RegUses);
+  else
+    insertDefsUses(slot, RegDefs, RegUses);
+
+  bool done = false;
+
+  while (!done) {
+    done = (I == MBB.begin());
+
+    if (!done)
+      --I;
+
+    // skip debug value
+    if (I->isDebugValue())
+      continue;
+
+
+    if (I->hasUnmodeledSideEffects()
+        || I->isInlineAsm()
+        || I->isLabel()
+        || I->hasDelaySlot()
+        || isDelayFiller(MBB, I))
+      break;
+
+    if (delayHasHazard(I, sawLoad, sawStore, RegDefs, RegUses)) {
+      insertDefsUses(I, RegDefs, RegUses);
+      continue;
+    }
+
+    return I;
+  }
+  return MBB.end();
+}
+
+bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
+                            bool &sawLoad,
+                            bool &sawStore,
+                            SmallSet<unsigned, 32> &RegDefs,
+                            SmallSet<unsigned, 32> &RegUses)
+{
+
+  if (candidate->isImplicitDef() || candidate->isKill())
+    return true;
+
+  if (candidate->mayLoad()) {
+    sawLoad = true;
+    if (sawStore)
+      return true;
+  }
+
+  if (candidate->mayStore()) {
+    if (sawStore)
+      return true;
+    sawStore = true;
+    if (sawLoad)
+      return true;
+  }
+
+  for (unsigned i = 0, e = candidate->getNumOperands(); i!= e; ++i) {
+    const MachineOperand &MO = candidate->getOperand(i);
+    if (!MO.isReg())
+      continue; // skip
+
+    unsigned Reg = MO.getReg();
+
+    if (MO.isDef()) {
+      //check whether Reg is defined or used before delay slot.
+      if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
+        return true;
+    }
+    if (MO.isUse()) {
+      //check whether Reg is defined before delay slot.
+      if (IsRegInSet(RegDefs, Reg))
+        return true;
+    }
+  }
+  return false;
+}
+
+
+void Filler::insertCallUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegUses)
+{
+
+  switch(MI->getOpcode()) {
+  default: llvm_unreachable("Unknown opcode.");
+  case SP::CALL: break;
+  case SP::JMPLrr:
+  case SP::JMPLri:
+    assert(MI->getNumOperands() >= 2);
+    const MachineOperand &Reg = MI->getOperand(0);
+    assert(Reg.isReg() && "JMPL first operand is not a register.");
+    assert(Reg.isUse() && "JMPL first operand is not a use.");
+    RegUses.insert(Reg.getReg());
+
+    const MachineOperand &RegOrImm = MI->getOperand(1);
+    if (RegOrImm.isImm())
+        break;
+    assert(RegOrImm.isReg() && "JMPLrr second operand is not a register.");
+    assert(RegOrImm.isUse() && "JMPLrr second operand is not a use.");
+    RegUses.insert(RegOrImm.getReg());
+    break;
+  }
+}
+
+//Insert Defs and Uses of MI into the sets RegDefs and RegUses.
+void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegDefs,
+                            SmallSet<unsigned, 32>& RegUses)
+{
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (Reg == 0)
+      continue;
+    if (MO.isDef())
+      RegDefs.insert(Reg);
+    if (MO.isUse())
+      RegUses.insert(Reg);
+
+  }
+}
+
+//returns true if the Reg or its alias is in the RegSet.
+bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
+{
+  // Check Reg and all aliased Registers.
+  for (MCRegAliasIterator AI(Reg, TM.getRegisterInfo(), true);
+       AI.isValid(); ++AI)
+    if (RegSet.count(*AI))
+      return true;
+  return false;
+}
+
+// return true if the candidate is a delay filler.
+bool Filler::isDelayFiller(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator candidate)
+{
+  if (candidate == MBB.begin())
+    return false;
+  if (candidate->getOpcode() == SP::UNIMP)
+    return true;
+  --candidate;
+  return candidate->hasDelaySlot();
+}
+
+bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
+{
+  if (!I->isCall())
+    return false;
+
+  unsigned structSizeOpNum = 0;
+  switch (I->getOpcode()) {
+  default: llvm_unreachable("Unknown call opcode.");
+  case SP::CALL: structSizeOpNum = 1; break;
+  case SP::JMPLrr:
+  case SP::JMPLri: structSizeOpNum = 2; break;
+  }
+
+  const MachineOperand &MO = I->getOperand(structSizeOpNum);
+  if (!MO.isImm())
+    return false;
+  StructSize = MO.getImm();
+  return true;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/FPMover.cpp b/contrib/llvm/lib/Target/Sparc/FPMover.cpp
new file mode 100644
index 000000000000..1325b98cf0ee
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/FPMover.cpp
@@ -0,0 +1,141 @@
+//===-- FPMover.cpp - Sparc double-precision floating point move fixer ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand FpMOVD/FpABSD/FpNEGD instructions into their single-precision pieces.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "fpmover"
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+STATISTIC(NumFpDs , "Number of instructions translated");
+STATISTIC(NoopFpDs, "Number of noop instructions removed");
+
+namespace {
+  struct FPMover : public MachineFunctionPass {
+    /// Target machine description which we query for reg. names, data
+    /// layout, etc.
+    ///
+    TargetMachine &TM;
+    
+    static char ID;
+    explicit FPMover(TargetMachine &tm) 
+      : MachineFunctionPass(ID), TM(tm) { }
+
+    virtual const char *getPassName() const {
+      return "Sparc Double-FP Move Fixer";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F);
+  };
+  char FPMover::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcFPMoverPass - Returns a pass that turns FpMOVD
+/// instructions into FMOVS instructions
+///
+FunctionPass *llvm::createSparcFPMoverPass(TargetMachine &tm) {
+  return new FPMover(tm);
+}
+
+/// getDoubleRegPair - Given a DFP register, return the even and odd FP
+/// registers that correspond to it.
+static void getDoubleRegPair(unsigned DoubleReg, unsigned &EvenReg,
+                             unsigned &OddReg) {
+  static const uint16_t EvenHalvesOfPairs[] = {
+    SP::F0, SP::F2, SP::F4, SP::F6, SP::F8, SP::F10, SP::F12, SP::F14,
+    SP::F16, SP::F18, SP::F20, SP::F22, SP::F24, SP::F26, SP::F28, SP::F30
+  };
+  static const uint16_t OddHalvesOfPairs[] = {
+    SP::F1, SP::F3, SP::F5, SP::F7, SP::F9, SP::F11, SP::F13, SP::F15,
+    SP::F17, SP::F19, SP::F21, SP::F23, SP::F25, SP::F27, SP::F29, SP::F31
+  };
+  static const uint16_t DoubleRegsInOrder[] = {
+    SP::D0, SP::D1, SP::D2, SP::D3, SP::D4, SP::D5, SP::D6, SP::D7, SP::D8,
+    SP::D9, SP::D10, SP::D11, SP::D12, SP::D13, SP::D14, SP::D15
+  };
+  for (unsigned i = 0; i < array_lengthof(DoubleRegsInOrder); ++i)
+    if (DoubleRegsInOrder[i] == DoubleReg) {
+      EvenReg = EvenHalvesOfPairs[i];
+      OddReg = OddHalvesOfPairs[i];
+      return;
+    }
+  llvm_unreachable("Can't find reg");
+}
+
+/// runOnMachineBasicBlock - Fixup FpMOVD instructions in this MBB.
+///
+bool FPMover::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+    MachineInstr *MI = I++;
+    DebugLoc dl = MI->getDebugLoc();
+    if (MI->getOpcode() == SP::FpMOVD || MI->getOpcode() == SP::FpABSD ||
+        MI->getOpcode() == SP::FpNEGD) {
+      Changed = true;
+      unsigned DestDReg = MI->getOperand(0).getReg();
+      unsigned SrcDReg  = MI->getOperand(1).getReg();
+      if (DestDReg == SrcDReg && MI->getOpcode() == SP::FpMOVD) {
+        MBB.erase(MI);   // Eliminate the noop copy.
+        ++NoopFpDs;
+        continue;
+      }
+      
+      unsigned EvenSrcReg = 0, OddSrcReg = 0, EvenDestReg = 0, OddDestReg = 0;
+      getDoubleRegPair(DestDReg, EvenDestReg, OddDestReg);
+      getDoubleRegPair(SrcDReg, EvenSrcReg, OddSrcReg);
+
+      const TargetInstrInfo *TII = TM.getInstrInfo();
+      if (MI->getOpcode() == SP::FpMOVD)
+        MI->setDesc(TII->get(SP::FMOVS));
+      else if (MI->getOpcode() == SP::FpNEGD)
+        MI->setDesc(TII->get(SP::FNEGS));
+      else if (MI->getOpcode() == SP::FpABSD)
+        MI->setDesc(TII->get(SP::FABSS));
+      else
+        llvm_unreachable("Unknown opcode!");
+        
+      MI->getOperand(0).setReg(EvenDestReg);
+      MI->getOperand(1).setReg(EvenSrcReg);
+      DEBUG(errs() << "FPMover: the modified instr is: " << *MI);
+      // Insert copy for the other half of the double.
+      if (DestDReg != SrcDReg) {
+        MI = BuildMI(MBB, I, dl, TM.getInstrInfo()->get(SP::FMOVS), OddDestReg)
+          .addReg(OddSrcReg);
+        DEBUG(errs() << "FPMover: the inserted instr is: " << *MI);
+      }
+      ++NumFpDs;
+    }
+  }
+  return Changed;
+}
+
+bool FPMover::runOnMachineFunction(MachineFunction &F) {
+  // If the target has V9 instructions, the fp-mover pseudos will never be
+  // emitted.  Avoid a scan of the instructions to improve compile time.
+  if (TM.getSubtarget<SparcSubtarget>().isV9())
+    return false;
+  
+  bool Changed = false;
+  for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+       FI != FE; ++FI)
+    Changed |= runOnMachineBasicBlock(*FI);
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
new file mode 100644
index 000000000000..3d4bfdcd5e6d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -0,0 +1,44 @@
+//===-- SparcMCAsmInfo.cpp - Sparc asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SparcMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+
+void SparcELFMCAsmInfo::anchor() { }
+
+SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Target &T, StringRef TT) {
+  IsLittleEndian = false;
+  Triple TheTriple(TT);
+  if (TheTriple.getArch() == Triple::sparcv9) {
+    PointerSize = CalleeSaveStackSlotSize = 8;
+  }
+
+  Data16bitsDirective = "\t.half\t";
+  Data32bitsDirective = "\t.word\t";
+  Data64bitsDirective = 0;  // .xword is only supported by V9.
+  ZeroDirective = "\t.skip\t";
+  CommentString = "!";
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+  
+  SunStyleELFSectionSwitchSyntax = true;
+  UsesELFSectionDirectiveForBSS = true;
+
+  WeakRefDirective = "\t.weak\t";
+
+  PrivateGlobalPrefix = ".L";
+}
+
+
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
new file mode 100644
index 000000000000..f0e1354c212b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- SparcMCAsmInfo.h - Sparc asm properties ----------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETASMINFO_H
+#define SPARCTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class StringRef;
+  class Target;
+
+  class SparcELFMCAsmInfo : public MCAsmInfo {
+    virtual void anchor();
+  public:
+    explicit SparcELFMCAsmInfo(const Target &T, StringRef TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
new file mode 100644
index 000000000000..7fdb0c39285a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -0,0 +1,81 @@
+//===-- SparcMCTargetDesc.cpp - Sparc Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCTargetDesc.h"
+#include "SparcMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "SparcGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "SparcGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "SparcGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createSparcMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitSparcMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitSparcMCRegisterInfo(X, SP::I7);
+  return X;
+}
+
+static MCSubtargetInfo *createSparcMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                   StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitSparcMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM,
+                                               CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+extern "C" void LLVMInitializeSparcTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfo<SparcELFMCAsmInfo> X(TheSparcTarget);
+  RegisterMCAsmInfo<SparcELFMCAsmInfo> Y(TheSparcV9Target);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheSparcTarget,
+                                       createSparcMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheSparcV9Target,
+                                       createSparcMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheSparcTarget, createSparcMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheSparcTarget, createSparcMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheSparcTarget,
+                                          createSparcMCSubtargetInfo);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
new file mode 100644
index 000000000000..cba775adb1a8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -0,0 +1,39 @@
+//===-- SparcMCTargetDesc.h - Sparc Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Sparc specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCMCTARGETDESC_H
+#define SPARCMCTARGETDESC_H
+
+namespace llvm {
+class Target;
+
+extern Target TheSparcTarget;
+extern Target TheSparcV9Target;
+
+} // End llvm namespace
+
+// Defines symbolic names for Sparc registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "SparcGenRegisterInfo.inc"
+
+// Defines symbolic names for the Sparc instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "SparcGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "SparcGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.h b/contrib/llvm/lib/Target/Sparc/Sparc.h
new file mode 100644
index 000000000000..ce6ae17b6ca2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.h
@@ -0,0 +1,108 @@
+//===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Sparc back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_SPARC_H
+#define TARGET_SPARC_H
+
+#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class FunctionPass;
+  class SparcTargetMachine;
+  class formatted_raw_ostream;
+
+  FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
+  FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+  FunctionPass *createSparcFPMoverPass(TargetMachine &TM);
+
+} // end namespace llvm;
+
+namespace llvm {
+  // Enums corresponding to Sparc condition codes, both icc's and fcc's.  These
+  // values must be kept in sync with the ones in the .td file.
+  namespace SPCC {
+    enum CondCodes {
+      //ICC_A   =  8   ,  // Always
+      //ICC_N   =  0   ,  // Never
+      ICC_NE  =  9   ,  // Not Equal
+      ICC_E   =  1   ,  // Equal
+      ICC_G   = 10   ,  // Greater
+      ICC_LE  =  2   ,  // Less or Equal
+      ICC_GE  = 11   ,  // Greater or Equal
+      ICC_L   =  3   ,  // Less
+      ICC_GU  = 12   ,  // Greater Unsigned
+      ICC_LEU =  4   ,  // Less or Equal Unsigned
+      ICC_CC  = 13   ,  // Carry Clear/Great or Equal Unsigned
+      ICC_CS  =  5   ,  // Carry Set/Less Unsigned
+      ICC_POS = 14   ,  // Positive
+      ICC_NEG =  6   ,  // Negative
+      ICC_VC  = 15   ,  // Overflow Clear
+      ICC_VS  =  7   ,  // Overflow Set
+      
+      //FCC_A   =  8+16,  // Always
+      //FCC_N   =  0+16,  // Never
+      FCC_U   =  7+16,  // Unordered
+      FCC_G   =  6+16,  // Greater
+      FCC_UG  =  5+16,  // Unordered or Greater
+      FCC_L   =  4+16,  // Less
+      FCC_UL  =  3+16,  // Unordered or Less
+      FCC_LG  =  2+16,  // Less or Greater
+      FCC_NE  =  1+16,  // Not Equal
+      FCC_E   =  9+16,  // Equal
+      FCC_UE  = 10+16,  // Unordered or Equal
+      FCC_GE  = 11+16,  // Greater or Equal
+      FCC_UGE = 12+16,  // Unordered or Greater or Equal
+      FCC_LE  = 13+16,  // Less or Equal
+      FCC_ULE = 14+16,  // Unordered or Less or Equal
+      FCC_O   = 15+16   // Ordered
+    };
+  }
+  
+  inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
+    switch (CC) {
+    case SPCC::ICC_NE:  return "ne";
+    case SPCC::ICC_E:   return "e";
+    case SPCC::ICC_G:   return "g";
+    case SPCC::ICC_LE:  return "le";
+    case SPCC::ICC_GE:  return "ge";
+    case SPCC::ICC_L:   return "l";
+    case SPCC::ICC_GU:  return "gu";
+    case SPCC::ICC_LEU: return "leu";
+    case SPCC::ICC_CC:  return "cc";
+    case SPCC::ICC_CS:  return "cs";
+    case SPCC::ICC_POS: return "pos";
+    case SPCC::ICC_NEG: return "neg";
+    case SPCC::ICC_VC:  return "vc";
+    case SPCC::ICC_VS:  return "vs";
+    case SPCC::FCC_U:   return "u";
+    case SPCC::FCC_G:   return "g";
+    case SPCC::FCC_UG:  return "ug";
+    case SPCC::FCC_L:   return "l";
+    case SPCC::FCC_UL:  return "ul";
+    case SPCC::FCC_LG:  return "lg";
+    case SPCC::FCC_NE:  return "ne";
+    case SPCC::FCC_E:   return "e";
+    case SPCC::FCC_UE:  return "ue";
+    case SPCC::FCC_GE:  return "ge";
+    case SPCC::FCC_UGE: return "uge";
+    case SPCC::FCC_LE:  return "le";
+    case SPCC::FCC_ULE: return "ule";
+    case SPCC::FCC_O:   return "o";
+    }
+    llvm_unreachable("Invalid cond code");
+  }
+}  // end namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.td b/contrib/llvm/lib/Target/Sparc/Sparc.td
new file mode 100644
index 000000000000..611f8e8129f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.td
@@ -0,0 +1,72 @@
+//===-- Sparc.td - Describe the Sparc Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SPARC Subtarget features.
+//
+ 
+def FeatureV9
+  : SubtargetFeature<"v9", "IsV9", "true",
+                     "Enable SPARC-V9 instructions">;
+def FeatureV8Deprecated
+  : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true",
+                     "Enable deprecated V8 instructions in V9 mode">;
+def FeatureVIS
+  : SubtargetFeature<"vis", "IsVIS", "true",
+                     "Enable UltraSPARC Visual Instruction Set extensions">;
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "SparcRegisterInfo.td"
+include "SparcCallingConv.td"
+include "SparcInstrInfo.td"
+
+def SparcInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// SPARC processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+def : Proc<"v8",              []>;
+def : Proc<"supersparc",      []>;
+def : Proc<"sparclite",       []>;
+def : Proc<"f934",            []>;
+def : Proc<"hypersparc",      []>;
+def : Proc<"sparclite86x",    []>;
+def : Proc<"sparclet",        []>;
+def : Proc<"tsc701",          []>;
+def : Proc<"v9",              [FeatureV9]>;
+def : Proc<"ultrasparc",      [FeatureV9, FeatureV8Deprecated]>;
+def : Proc<"ultrasparc3",     [FeatureV9, FeatureV8Deprecated]>;
+def : Proc<"ultrasparc3-vis", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Sparc : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = SparcInstrInfo;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
new file mode 100644
index 000000000000..e14b3cbf161d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -0,0 +1,261 @@
+//===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format SPARC assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Sparc.h"
+#include "SparcInstrInfo.h"
+#include "SparcTargetMachine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+using namespace llvm;
+
+namespace {
+  class SparcAsmPrinter : public AsmPrinter {
+  public:
+    explicit SparcAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer) {}
+
+    virtual const char *getPassName() const {
+      return "Sparc Assembly Printer";
+    }
+
+    void printOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
+    void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
+                         const char *Modifier = 0);
+    void printCCOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
+
+    virtual void EmitInstruction(const MachineInstr *MI) {
+      SmallString<128> Str;
+      raw_svector_ostream OS(Str);
+      printInstruction(MI, OS);
+      OutStreamer.EmitRawText(OS.str());
+    }
+    void printInstruction(const MachineInstr *MI, raw_ostream &OS);// autogen'd.
+    static const char *getRegisterName(unsigned RegNo);
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &O);
+
+    bool printGetPCX(const MachineInstr *MI, unsigned OpNo, raw_ostream &OS);
+    
+    virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
+                       const;
+
+    virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+  };
+} // end of anonymous namespace
+
+#include "SparcGenAsmWriter.inc"
+
+void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand (opNum);
+  bool CloseParen = false;
+  if (MI->getOpcode() == SP::SETHIi && !MO.isReg() && !MO.isImm()) {
+    O << "%hi(";
+    CloseParen = true;
+  } else if ((MI->getOpcode() == SP::ORri || MI->getOpcode() == SP::ADDri) &&
+             !MO.isReg() && !MO.isImm()) {
+    O << "%lo(";
+    CloseParen = true;
+  }
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << "%" << StringRef(getRegisterName(MO.getReg())).lower();
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << (int)MO.getImm();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    O << *Mang->getSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getIndex();
+    break;
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+  if (CloseParen) O << ")";
+}
+
+void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, O);
+
+  // If this is an ADD operand, emit it like normal operands.
+  if (Modifier && !strcmp(Modifier, "arith")) {
+    O << ", ";
+    printOperand(MI, opNum+1, O);
+    return;
+  }
+
+  if (MI->getOperand(opNum+1).isReg() &&
+      MI->getOperand(opNum+1).getReg() == SP::G0)
+    return;   // don't print "+%g0"
+  if (MI->getOperand(opNum+1).isImm() &&
+      MI->getOperand(opNum+1).getImm() == 0)
+    return;   // don't print "+0"
+
+  O << "+";
+  if (MI->getOperand(opNum+1).isGlobal() ||
+      MI->getOperand(opNum+1).isCPI()) {
+    O << "%lo(";
+    printOperand(MI, opNum+1, O);
+    O << ")";
+  } else {
+    printOperand(MI, opNum+1, O);
+  }
+}
+
+bool SparcAsmPrinter::printGetPCX(const MachineInstr *MI, unsigned opNum,
+                                  raw_ostream &O) {
+  std::string operand = "";
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  default: llvm_unreachable("Operand is not a register");
+  case MachineOperand::MO_Register:
+    assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+           "Operand is not a physical register ");
+    assert(MO.getReg() != SP::O7 && 
+           "%o7 is assigned as destination for getpcx!");
+    operand = "%" + StringRef(getRegisterName(MO.getReg())).lower();
+    break;
+  }
+
+  unsigned mfNum = MI->getParent()->getParent()->getFunctionNumber();
+  unsigned bbNum = MI->getParent()->getNumber();
+
+  O << '\n' << ".LLGETPCH" << mfNum << '_' << bbNum << ":\n";
+  O << "\tcall\t.LLGETPC" << mfNum << '_' << bbNum << '\n' ;
+
+  O << "\t  sethi\t"
+    << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum 
+    << ")), "  << operand << '\n' ;
+
+  O << ".LLGETPC" << mfNum << '_' << bbNum << ":\n" ;
+  O << "\tor\t" << operand  
+    << ", %lo(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum
+    << ")), " << operand << '\n';
+  O << "\tadd\t" << operand << ", %o7, " << operand << '\n'; 
+  
+  return true;
+}
+
+void SparcAsmPrinter::printCCOperand(const MachineInstr *MI, int opNum,
+                                     raw_ostream &O) {
+  int CC = (int)MI->getOperand(opNum).getImm();
+  O << SPARCCondCodeToString((SPCC::CondCodes)CC);
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode,
+                                      raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+     break;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+
+  return false;
+}
+
+bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                            unsigned OpNo, unsigned AsmVariant,
+                                            const char *ExtraCode,
+                                            raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true;  // Unknown modifier
+
+  O << '[';
+  printMemOperand(MI, OpNo, O);
+  O << ']';
+
+  return false;
+}
+
+/// isBlockOnlyReachableByFallthough - Return true if the basic block has
+/// exactly one predecessor and the control transfer mechanism between
+/// the predecessor and this block is a fall-through.
+///
+/// This overrides AsmPrinter's implementation to handle delay slots.
+bool SparcAsmPrinter::
+isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+  // If this is a landing pad, it isn't a fall through.  If it has no preds,
+  // then nothing falls through to it.
+  if (MBB->isLandingPad() || MBB->pred_empty())
+    return false;
+  
+  // If there isn't exactly one predecessor, it can't be a fall through.
+  MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
+  ++PI2;
+  if (PI2 != MBB->pred_end())
+    return false;
+  
+  // The predecessor has to be immediately before this block.
+  const MachineBasicBlock *Pred = *PI;
+  
+  if (!Pred->isLayoutSuccessor(MBB))
+    return false;
+  
+  // Check if the last terminator is an unconditional branch.
+  MachineBasicBlock::const_iterator I = Pred->end();
+  while (I != Pred->begin() && !(--I)->isTerminator())
+    ; // Noop
+  return I == Pred->end() || !I->isBarrier();
+}
+
+MachineLocation SparcAsmPrinter::
+getDebugValueLocation(const MachineInstr *MI) const {
+  assert(MI->getNumOperands() == 4 && "Invalid number of operands!");
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
+         "Unexpected MachineOperand types");
+  return MachineLocation(MI->getOperand(0).getReg(),
+                         MI->getOperand(1).getImm());
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeSparcAsmPrinter() { 
+  RegisterAsmPrinter<SparcAsmPrinter> X(TheSparcTarget);
+  RegisterAsmPrinter<SparcAsmPrinter> Y(TheSparcV9Target);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td
new file mode 100644
index 000000000000..b38ac616dcf4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td
@@ -0,0 +1,56 @@
+//===-- SparcCallingConv.td - Calling Conventions Sparc ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Sparc architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Sparc 32-bit C return-value convention.
+def RetCC_Sparc32 : CallingConv<[
+  CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1]>>
+]>;
+
+// Sparc 64-bit C return-value convention.
+def RetCC_Sparc64 : CallingConv<[
+  CCIfType<[i32], CCPromoteToType<i64>>,
+  CCIfType<[i64], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1]>>
+]>;
+
+// Sparc 32-bit C Calling convention.
+def CC_Sparc32 : CallingConv<[
+  //Custom assign SRet to [sp+64].
+  CCIfSRet<CCCustom<"CC_Sparc_Assign_SRet">>,
+  // i32 f32 arguments get passed in integer registers if there is space.
+  CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  // f64 arguments are split and passed through registers or through stack.
+  CCIfType<[f64], CCCustom<"CC_Sparc_Assign_f64">>,
+
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
+
+// Sparc 64-bit C Calling convention.
+def CC_Sparc64 : CallingConv<[
+  // All integers are promoted to i64 by the caller.
+  CCIfType<[i32], CCPromoteToType<i64>>,
+  // Integer arguments get passed in integer registers if there is space.
+  CCIfType<[i64], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  // FIXME: Floating point arguments.
+
+  // Alternatively, they are assigned to the stack in 8-byte aligned units.
+  CCAssignToStack<8, 8>
+]>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
new file mode 100644
index 000000000000..a0dae6e9480c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -0,0 +1,96 @@
+//===-- SparcFrameLowering.cpp - Sparc Frame Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcFrameLowering.h"
+#include "SparcInstrInfo.h"
+#include "SparcMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const SparcInstrInfo &TII =
+    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  int NumBytes = (int) MFI->getStackSize();
+
+  // Emit the correct save instruction based on the number of bytes in
+  // the frame. Minimum stack frame size according to V8 ABI is:
+  //   16 words for register window spill
+  //    1 word for address of returned aggregate-value
+  // +  6 words for passing parameters on the stack
+  // ----------
+  //   23 words * 4 bytes per word = 92 bytes
+  NumBytes += 92;
+
+  // Round up to next doubleword boundary -- a double-word boundary
+  // is required by the ABI.
+  NumBytes = (NumBytes + 7) & ~7;
+  NumBytes = -NumBytes;
+
+  if (NumBytes >= -4096) {
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SAVEri), SP::O6)
+      .addReg(SP::O6).addImm(NumBytes);
+  } else {
+    // Emit this the hard way.  This clobbers G1 which we always know is
+    // available here.
+    unsigned OffHi = (unsigned)NumBytes >> 10U;
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
+    // Emit G1 = G1 + I6
+    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+      .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SAVErr), SP::O6)
+      .addReg(SP::O6).addReg(SP::G1);
+  }
+}
+
+void SparcFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  MachineInstr &MI = *I;
+  DebugLoc dl = MI.getDebugLoc();
+  int Size = MI.getOperand(0).getImm();
+  if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
+    Size = -Size;
+  const SparcInstrInfo &TII =
+    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (Size)
+    BuildMI(MBB, I, dl, TII.get(SP::ADDri), SP::O6).addReg(SP::O6).addImm(Size);
+  MBB.erase(I);
+}
+
+
+void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
+                                  MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  const SparcInstrInfo &TII =
+    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+  DebugLoc dl = MBBI->getDebugLoc();
+  assert(MBBI->getOpcode() == SP::RETL &&
+         "Can only put epilog before 'retl' instruction!");
+  BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
+    .addReg(SP::G0);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h
new file mode 100644
index 000000000000..464233e7da35
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h
@@ -0,0 +1,44 @@
+//===-- SparcFrameLowering.h - Define frame lowering for Sparc --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_FRAMEINFO_H
+#define SPARC_FRAMEINFO_H
+
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class SparcSubtarget;
+
+class SparcFrameLowering : public TargetFrameLowering {
+public:
+  explicit SparcFrameLowering(const SparcSubtarget &/*sti*/)
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, 0) {
+  }
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  bool hasFP(const MachineFunction &MF) const { return false; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
new file mode 100644
index 000000000000..5fa545d30160
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -0,0 +1,211 @@
+//===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SPARC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===--------------------------------------------------------------------===//
+/// SparcDAGToDAGISel - SPARC specific code to select SPARC machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class SparcDAGToDAGISel : public SelectionDAGISel {
+  /// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const SparcSubtarget &Subtarget;
+  SparcTargetMachine& TM;
+public:
+  explicit SparcDAGToDAGISel(SparcTargetMachine &tm)
+    : SelectionDAGISel(tm),
+      Subtarget(tm.getSubtarget<SparcSubtarget>()),
+      TM(tm) {
+  }
+
+  SDNode *Select(SDNode *N);
+
+  // Complex Pattern Selectors.
+  bool SelectADDRrr(SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDRri(SDValue N, SDValue &Base, SDValue &Offset);
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                            char ConstraintCode,
+                                            std::vector<SDValue> &OutOps);
+
+  virtual const char *getPassName() const {
+    return "SPARC DAG->DAG Pattern Instruction Selection";
+  }
+
+  // Include the pieces autogenerated from the target description.
+#include "SparcGenDAGISel.inc"
+
+private:
+  SDNode* getGlobalBaseReg();
+};
+}  // end anonymous namespace
+
+SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = TM.getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
+bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      if (isInt<13>(CN->getSExtValue())) {
+        if (FrameIndexSDNode *FIN =
+                dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+          // Constant offset from frame ref.
+          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+        } else {
+          Base = Addr.getOperand(0);
+        }
+        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+        return true;
+      }
+    }
+    if (Addr.getOperand(0).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(1);
+      Offset = Addr.getOperand(0).getOperand(0);
+      return true;
+    }
+    if (Addr.getOperand(1).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(0);
+      Offset = Addr.getOperand(1).getOperand(0);
+      return true;
+    }
+  }
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
+  if (Addr.getOpcode() == ISD::FrameIndex) return false;
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      if (isInt<13>(CN->getSExtValue()))
+        return false;  // Let the reg+imm pattern catch this!
+    if (Addr.getOperand(0).getOpcode() == SPISD::Lo ||
+        Addr.getOperand(1).getOpcode() == SPISD::Lo)
+      return false;  // Let the reg+imm pattern catch this!
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+    return true;
+  }
+
+  R1 = Addr;
+  R2 = CurDAG->getRegister(SP::G0, MVT::i32);
+  return true;
+}
+
+SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+  case SPISD::GLOBAL_BASE_REG:
+    return getGlobalBaseReg();
+
+  case ISD::SDIV:
+  case ISD::UDIV: {
+    // FIXME: should use a custom expander to expose the SRA to the dag.
+    SDValue DivLHS = N->getOperand(0);
+    SDValue DivRHS = N->getOperand(1);
+
+    // Set the Y register to the high-part.
+    SDValue TopPart;
+    if (N->getOpcode() == ISD::SDIV) {
+      TopPart = SDValue(CurDAG->getMachineNode(SP::SRAri, dl, MVT::i32, DivLHS,
+                                   CurDAG->getTargetConstant(31, MVT::i32)), 0);
+    } else {
+      TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
+    }
+    TopPart = SDValue(CurDAG->getMachineNode(SP::WRYrr, dl, MVT::Glue, TopPart,
+                                     CurDAG->getRegister(SP::G0, MVT::i32)), 0);
+
+    // FIXME: Handle div by immediate.
+    unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
+    return CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS,
+                                TopPart);
+  }
+  case ISD::MULHU:
+  case ISD::MULHS: {
+    // FIXME: Handle mul by immediate.
+    SDValue MulLHS = N->getOperand(0);
+    SDValue MulRHS = N->getOperand(1);
+    unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr;
+    SDNode *Mul = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Glue,
+                                         MulLHS, MulRHS);
+    // The high part is in the Y register.
+    return CurDAG->SelectNodeTo(N, SP::RDY, MVT::i32, SDValue(Mul, 1));
+  }
+  }
+
+  return SelectCode(N);
+}
+
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.
+bool
+SparcDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                char ConstraintCode,
+                                                std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default: return true;
+  case 'm':   // memory
+   if (!SelectADDRrr(Op, Op0, Op1))
+     SelectADDRri(Op, Op0, Op1);
+   break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+/// createSparcISelDag - This pass converts a legalized DAG into a
+/// SPARC-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createSparcISelDag(SparcTargetMachine &TM) {
+  return new SparcDAGToDAGISel(TM);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
new file mode 100644
index 000000000000..325f13424b42
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -0,0 +1,1360 @@
+//===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcISelLowering.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
+                                 MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                 ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  assert (ArgFlags.isSRet());
+
+  //Assign SRet argument
+  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                         0,
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
+                                MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+                                ISD::ArgFlagsTy &ArgFlags, CCState &State)
+{
+  static const uint16_t RegList[] = {
+    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+  };
+  //Try to get first reg
+  if (unsigned Reg = State.AllocateReg(RegList, 6)) {
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  } else {
+    //Assign whole thing in stack
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8,4),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  //Try to get second reg
+  if (unsigned Reg = State.AllocateReg(RegList, 6))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4,4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+#include "SparcGenCallingConv.inc"
+
+SDValue
+SparcTargetLowering::LowerReturn(SDValue Chain,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 DebugLoc dl, SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());
+
+  // Analize return values.
+  CCInfo.AnalyzeReturn(Outs, Subtarget->is64Bit() ?
+                             RetCC_Sparc64 : RetCC_Sparc32);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  // Make room for the return address offset.
+  RetOps.push_back(SDValue());
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             OutVals[i], Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  unsigned RetAddrOffset = 8; //Call Inst + Delay Slot
+  // If the function returns a struct, copy the SRetReturnReg to I0
+  if (MF.getFunction()->hasStructRetAttr()) {
+    SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
+    unsigned Reg = SFI->getSRetReturnReg();
+    if (!Reg)
+      llvm_unreachable("sret virtual register not created in the entry block");
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+    Chain = DAG.getCopyToReg(Chain, dl, SP::I0, Val, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(SP::I0, getPointerTy()));
+    RetAddrOffset = 12; // CallInst + Delay Slot + Unimp
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+  RetOps[1] = DAG.getConstant(RetAddrOffset, MVT::i32);
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other,
+                     &RetOps[0], RetOps.size());
+}
+
+SDValue SparcTargetLowering::
+LowerFormalArguments(SDValue Chain,
+                     CallingConv::ID CallConv,
+                     bool IsVarArg,
+                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                     DebugLoc DL,
+                     SelectionDAG &DAG,
+                     SmallVectorImpl<SDValue> &InVals) const {
+  if (Subtarget->is64Bit())
+    return LowerFormalArguments_64(Chain, CallConv, IsVarArg, Ins,
+                                   DL, DAG, InVals);
+  return LowerFormalArguments_32(Chain, CallConv, IsVarArg, Ins,
+                                 DL, DAG, InVals);
+}
+
+/// LowerFormalArguments32 - V8 uses a very simple ABI, where all values are
+/// passed in either one or two GPRs, including FP values.  TODO: we should
+/// pass FP values in FP registers for fastcc functions.
+SDValue SparcTargetLowering::
+LowerFormalArguments_32(SDValue Chain,
+                        CallingConv::ID CallConv,
+                        bool isVarArg,
+                        const SmallVectorImpl<ISD::InputArg> &Ins,
+                        DebugLoc dl,
+                        SelectionDAG &DAG,
+                        SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
+
+  const unsigned StackOffset = 92;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    if (i == 0  && Ins[i].Flags.isSRet()) {
+      //Get SRet from [%fp+64]
+      int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, 64, true);
+      SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+      SDValue Arg = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
+                                MachinePointerInfo(),
+                                false, false, false, 0);
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    if (VA.isRegLoc()) {
+      if (VA.needsCustom()) {
+        assert(VA.getLocVT() == MVT::f64);
+        unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+        MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi);
+        SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
+
+        assert(i+1 < e);
+        CCValAssign &NextVA = ArgLocs[++i];
+
+        SDValue LoVal;
+        if (NextVA.isMemLoc()) {
+          int FrameIdx = MF.getFrameInfo()->
+            CreateFixedObject(4, StackOffset+NextVA.getLocMemOffset(),true);
+          SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+          LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
+                              MachinePointerInfo(),
+                              false, false, false, 0);
+        } else {
+          unsigned loReg = MF.addLiveIn(NextVA.getLocReg(),
+                                        &SP::IntRegsRegClass);
+          LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32);
+        }
+        SDValue WholeValue =
+          DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+        WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue);
+        InVals.push_back(WholeValue);
+        continue;
+      }
+      unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+      MF.getRegInfo().addLiveIn(VA.getLocReg(), VReg);
+      SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+      if (VA.getLocVT() == MVT::f32)
+        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Arg);
+      else if (VA.getLocVT() != MVT::i32) {
+        Arg = DAG.getNode(ISD::AssertSext, dl, MVT::i32, Arg,
+                          DAG.getValueType(VA.getLocVT()));
+        Arg = DAG.getNode(ISD::TRUNCATE, dl, VA.getLocVT(), Arg);
+      }
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    unsigned Offset = VA.getLocMemOffset()+StackOffset;
+
+    if (VA.needsCustom()) {
+      assert(VA.getValVT() == MVT::f64);
+      //If it is double-word aligned, just load.
+      if (Offset % 8 == 0) {
+        int FI = MF.getFrameInfo()->CreateFixedObject(8,
+                                                      Offset,
+                                                      true);
+        SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+        SDValue Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr,
+                                   MachinePointerInfo(),
+                                   false,false, false, 0);
+        InVals.push_back(Load);
+        continue;
+      }
+
+      int FI = MF.getFrameInfo()->CreateFixedObject(4,
+                                                    Offset,
+                                                    true);
+      SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+      SDValue HiVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
+                                  MachinePointerInfo(),
+                                  false, false, false, 0);
+      int FI2 = MF.getFrameInfo()->CreateFixedObject(4,
+                                                     Offset+4,
+                                                     true);
+      SDValue FIPtr2 = DAG.getFrameIndex(FI2, getPointerTy());
+
+      SDValue LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr2,
+                                  MachinePointerInfo(),
+                                  false, false, false, 0);
+
+      SDValue WholeValue =
+        DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+      WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue);
+      InVals.push_back(WholeValue);
+      continue;
+    }
+
+    int FI = MF.getFrameInfo()->CreateFixedObject(4,
+                                                  Offset,
+                                                  true);
+    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+    SDValue Load ;
+    if (VA.getValVT() == MVT::i32 || VA.getValVT() == MVT::f32) {
+      Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr,
+                         MachinePointerInfo(),
+                         false, false, false, 0);
+    } else {
+      ISD::LoadExtType LoadOp = ISD::SEXTLOAD;
+      // Sparc is big endian, so add an offset based on the ObjectVT.
+      unsigned Offset = 4-std::max(1U, VA.getValVT().getSizeInBits()/8);
+      FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr,
+                          DAG.getConstant(Offset, MVT::i32));
+      Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr,
+                            MachinePointerInfo(),
+                            VA.getValVT(), false, false,0);
+      Load = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Load);
+    }
+    InVals.push_back(Load);
+  }
+
+  if (MF.getFunction()->hasStructRetAttr()) {
+    //Copy the SRet Argument to SRetReturnReg
+    SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
+    unsigned Reg = SFI->getSRetReturnReg();
+    if (!Reg) {
+      Reg = MF.getRegInfo().createVirtualRegister(&SP::IntRegsRegClass);
+      SFI->setSRetReturnReg(Reg);
+    }
+    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+  }
+
+  // Store remaining ArgRegs to the stack if this is a varargs function.
+  if (isVarArg) {
+    static const uint16_t ArgRegs[] = {
+      SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+    };
+    unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs, 6);
+    const uint16_t *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6;
+    unsigned ArgOffset = CCInfo.getNextStackOffset();
+    if (NumAllocated == 6)
+      ArgOffset += StackOffset;
+    else {
+      assert(!ArgOffset);
+      ArgOffset = 68+4*NumAllocated;
+    }
+
+    // Remember the vararg offset for the va_start implementation.
+    FuncInfo->setVarArgsFrameOffset(ArgOffset);
+
+    std::vector<SDValue> OutChains;
+
+    for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
+      unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+      MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
+      SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32);
+
+      int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset,
+                                                          true);
+      SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+
+      OutChains.push_back(DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr,
+                                       MachinePointerInfo(),
+                                       false, false, 0));
+      ArgOffset += 4;
+    }
+
+    if (!OutChains.empty()) {
+      OutChains.push_back(Chain);
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                          &OutChains[0], OutChains.size());
+    }
+  }
+
+  return Chain;
+}
+
+// Lower formal arguments for the 64 bit ABI.
+SDValue SparcTargetLowering::
+LowerFormalArguments_64(SDValue Chain,
+                        CallingConv::ID CallConv,
+                        bool IsVarArg,
+                        const SmallVectorImpl<ISD::InputArg> &Ins,
+                        DebugLoc DL,
+                        SelectionDAG &DAG,
+                        SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Analyze arguments according to CC_Sparc64.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc64);
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      // This argument is passed in a register.
+      // All integer register arguments are promoted by the caller to i64.
+
+      // Create a virtual register for the promoted live-in value.
+      unsigned VReg = MF.addLiveIn(VA.getLocReg(),
+                                   getRegClassFor(VA.getLocVT()));
+      SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
+
+      // The caller promoted the argument, so insert an Assert?ext SDNode so we
+      // won't promote the value again in this function.
+      switch (VA.getLocInfo()) {
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
+                          DAG.getValueType(VA.getValVT()));
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
+                          DAG.getValueType(VA.getValVT()));
+        break;
+      default:
+        break;
+      }
+
+      // Truncate the register down to the argument type.
+      if (VA.isExtInLoc())
+        Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
+
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    // The registers are exhausted. This argument was passed on the stack.
+    assert(VA.isMemLoc());
+  }
+  return Chain;
+}
+
+SDValue
+SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                               SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // Sparc target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
+
+  // Get the size of the outgoing arguments stack space requirement.
+  unsigned ArgsSize = CCInfo.getNextStackOffset();
+
+  // Keep stack frames 8-byte aligned.
+  ArgsSize = (ArgsSize+7) & ~7;
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
+  //Create local copies for byval args.
+  SmallVector<SDValue, 8> ByValArgs;
+  for (unsigned i = 0,  e = Outs.size(); i != e; ++i) {
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    if (!Flags.isByVal())
+      continue;
+
+    SDValue Arg = OutVals[i];
+    unsigned Size = Flags.getByValSize();
+    unsigned Align = Flags.getByValAlign();
+
+    int FI = MFI->CreateStackObject(Size, Align, false);
+    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+    SDValue SizeNode = DAG.getConstant(Size, MVT::i32);
+
+    Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
+                          false,        //isVolatile,
+                          (Size <= 32), //AlwaysInline if size <= 32
+                          MachinePointerInfo(), MachinePointerInfo());
+    ByValArgs.push_back(FIPtr);
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  const unsigned StackOffset = 92;
+  bool hasStructRetAttr = false;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, realArgIdx = 0, byvalArgIdx = 0, e = ArgLocs.size();
+       i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[realArgIdx];
+
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+    //Use local copy if it is a byval arg.
+    if (Flags.isByVal())
+      Arg = ByValArgs[byvalArgIdx++];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (Flags.isSRet()) {
+      assert(VA.needsCustom());
+      // store SRet argument in %sp+64
+      SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+      SDValue PtrOff = DAG.getIntPtrConstant(64);
+      PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                         MachinePointerInfo(),
+                                         false, false, 0));
+      hasStructRetAttr = true;
+      continue;
+    }
+
+    if (VA.needsCustom()) {
+      assert(VA.getLocVT() == MVT::f64);
+
+      if (VA.isMemLoc()) {
+        unsigned Offset = VA.getLocMemOffset() + StackOffset;
+        //if it is double-word aligned, just store.
+        if (Offset % 8 == 0) {
+          SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+          SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+          PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+          MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                             MachinePointerInfo(),
+                                             false, false, 0));
+          continue;
+        }
+      }
+
+      SDValue StackPtr = DAG.CreateStackTemporary(MVT::f64, MVT::i32);
+      SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
+                                   Arg, StackPtr, MachinePointerInfo(),
+                                   false, false, 0);
+      // Sparc is big-endian, so the high part comes first.
+      SDValue Hi = DAG.getLoad(MVT::i32, dl, Store, StackPtr,
+                               MachinePointerInfo(), false, false, false, 0);
+      // Increment the pointer to the other half.
+      StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                             DAG.getIntPtrConstant(4));
+      // Load the low part.
+      SDValue Lo = DAG.getLoad(MVT::i32, dl, Store, StackPtr,
+                               MachinePointerInfo(), false, false, false, 0);
+
+      if (VA.isRegLoc()) {
+        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Hi));
+        assert(i+1 != e);
+        CCValAssign &NextVA = ArgLocs[++i];
+        if (NextVA.isRegLoc()) {
+          RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Lo));
+        } else {
+          //Store the low part in stack.
+          unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
+          SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+          SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+          PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+          MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff,
+                                             MachinePointerInfo(),
+                                             false, false, 0));
+        }
+      } else {
+        unsigned Offset = VA.getLocMemOffset() + StackOffset;
+        // Store the high part.
+        SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+        PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Hi, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
+        // Store the low part.
+        PtrOff = DAG.getIntPtrConstant(Offset+4);
+        PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff,
+                                           MachinePointerInfo(),
+                                           false, false, 0));
+      }
+      continue;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      if (VA.getLocVT() != MVT::f32) {
+        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+        continue;
+      }
+      Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    // Create a store off the stack pointer for this argument.
+    SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+    SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset()+StackOffset);
+    PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                       MachinePointerInfo(),
+                                       false, false, 0));
+  }
+
+
+  // Emit all stores, make sure the occur before any copies into physregs.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    unsigned Reg = RegsToPass[i].first;
+    // Remap I0->I7 -> O0->O7.
+    if (Reg >= SP::I0 && Reg <= SP::I7)
+      Reg = Reg-SP::I0+SP::O0;
+
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  unsigned SRetArgSize = (hasStructRetAttr)? getSRetArgSize(DAG, Callee):0;
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+  // Returns a chain & a flag for retval copy to use
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+  if (hasStructRetAttr)
+    Ops.push_back(DAG.getTargetConstant(SRetArgSize, MVT::i32));
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    unsigned Reg = RegsToPass[i].first;
+    if (Reg >= SP::I0 && Reg <= SP::I7)
+      Reg = Reg-SP::I0+SP::O0;
+
+    Ops.push_back(DAG.getRegister(Reg, RegsToPass[i].second.getValueType()));
+  }
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RVInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 DAG.getTarget(), RVLocs, *DAG.getContext());
+
+  RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    unsigned Reg = RVLocs[i].getLocReg();
+
+    // Remap I0->I7 -> O0->O7.
+    if (Reg >= SP::I0 && Reg <= SP::I7)
+      Reg = Reg-SP::I0+SP::O0;
+
+    Chain = DAG.getCopyFromReg(Chain, dl, Reg,
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+unsigned
+SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
+{
+  const Function *CalleeFn = 0;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    CalleeFn = dyn_cast<Function>(G->getGlobal());
+  } else if (ExternalSymbolSDNode *E =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const Function *Fn = DAG.getMachineFunction().getFunction();
+    const Module *M = Fn->getParent();
+    CalleeFn = M->getFunction(E->getSymbol());
+  }
+
+  if (!CalleeFn)
+    return 0;
+
+  assert(CalleeFn->hasStructRetAttr() &&
+         "Callee does not have the StructRet attribute.");
+
+  PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
+  Type *ElementTy = Ty->getElementType();
+  return getDataLayout()->getTypeAllocSize(ElementTy);
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+/// IntCondCCodeToICC - Convert a DAG integer condition code to a SPARC ICC
+/// condition.
+static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown integer condition code!");
+  case ISD::SETEQ:  return SPCC::ICC_E;
+  case ISD::SETNE:  return SPCC::ICC_NE;
+  case ISD::SETLT:  return SPCC::ICC_L;
+  case ISD::SETGT:  return SPCC::ICC_G;
+  case ISD::SETLE:  return SPCC::ICC_LE;
+  case ISD::SETGE:  return SPCC::ICC_GE;
+  case ISD::SETULT: return SPCC::ICC_CS;
+  case ISD::SETULE: return SPCC::ICC_LEU;
+  case ISD::SETUGT: return SPCC::ICC_GU;
+  case ISD::SETUGE: return SPCC::ICC_CC;
+  }
+}
+
+/// FPCondCCodeToFCC - Convert a DAG floatingp oint condition code to a SPARC
+/// FCC condition.
+static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown fp condition code!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: return SPCC::FCC_E;
+  case ISD::SETNE:
+  case ISD::SETUNE: return SPCC::FCC_NE;
+  case ISD::SETLT:
+  case ISD::SETOLT: return SPCC::FCC_L;
+  case ISD::SETGT:
+  case ISD::SETOGT: return SPCC::FCC_G;
+  case ISD::SETLE:
+  case ISD::SETOLE: return SPCC::FCC_LE;
+  case ISD::SETGE:
+  case ISD::SETOGE: return SPCC::FCC_GE;
+  case ISD::SETULT: return SPCC::FCC_UL;
+  case ISD::SETULE: return SPCC::FCC_ULE;
+  case ISD::SETUGT: return SPCC::FCC_UG;
+  case ISD::SETUGE: return SPCC::FCC_UGE;
+  case ISD::SETUO:  return SPCC::FCC_U;
+  case ISD::SETO:   return SPCC::FCC_O;
+  case ISD::SETONE: return SPCC::FCC_LG;
+  case ISD::SETUEQ: return SPCC::FCC_UE;
+  }
+}
+
+SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
+  : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+  Subtarget = &TM.getSubtarget<SparcSubtarget>();
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
+  addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
+  addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
+  if (Subtarget->is64Bit())
+    addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
+
+  // Turn FP extload into load/fextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  // Sparc doesn't have i1 sign extending load
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  // Turn FP truncstore into trunc + store.
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // Custom legalize GlobalAddress nodes into LO/HI parts.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool , MVT::i32, Custom);
+
+  // Sparc doesn't have sext_inreg, replace them with shl/sra
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+  // Sparc has no REM or DIVREM operations.
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+  // Custom expand fp<->sint
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+
+  // Expand fp<->uint
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+  setOperationAction(ISD::BITCAST, MVT::f32, Expand);
+  setOperationAction(ISD::BITCAST, MVT::i32, Expand);
+
+  // Sparc has no select or setcc: expand to SELECT_CC.
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f64, Expand);
+
+  // Sparc doesn't have BRCOND either, it has BR_CC.
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::BR_CC, MVT::i64, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+  }
+
+  // FIXME: There are instructions available for ATOMIC_FENCE
+  // on SparcV8 and later.
+  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
+
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FMA  , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::FMA  , MVT::f32, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  // FIXME: Sparc provides these multiplies, but we don't have them yet.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+  setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  // VAARG needs to be lowered to not do unaligned accesses for doubles.
+  setOperationAction(ISD::VAARG             , MVT::Other, Custom);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+
+  // No debug info support yet.
+  setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+  setStackPointerRegisterToSaveRestore(SP::O6);
+
+  if (TM.getSubtarget<SparcSubtarget>().isV9())
+    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+
+  setMinFunctionAlignment(2);
+
+  computeRegisterProperties();
+}
+
+const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case SPISD::CMPICC:     return "SPISD::CMPICC";
+  case SPISD::CMPFCC:     return "SPISD::CMPFCC";
+  case SPISD::BRICC:      return "SPISD::BRICC";
+  case SPISD::BRXCC:      return "SPISD::BRXCC";
+  case SPISD::BRFCC:      return "SPISD::BRFCC";
+  case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
+  case SPISD::SELECT_XCC: return "SPISD::SELECT_XCC";
+  case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
+  case SPISD::Hi:         return "SPISD::Hi";
+  case SPISD::Lo:         return "SPISD::Lo";
+  case SPISD::FTOI:       return "SPISD::FTOI";
+  case SPISD::ITOF:       return "SPISD::ITOF";
+  case SPISD::CALL:       return "SPISD::CALL";
+  case SPISD::RET_FLAG:   return "SPISD::RET_FLAG";
+  case SPISD::GLOBAL_BASE_REG: return "SPISD::GLOBAL_BASE_REG";
+  case SPISD::FLUSHW:     return "SPISD::FLUSHW";
+  }
+}
+
+/// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
+/// be zero. Op is expected to be a target specific node. Used by DAG
+/// combiner.
+void SparcTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                         APInt &KnownZero,
+                                                         APInt &KnownOne,
+                                                         const SelectionDAG &DAG,
+                                                         unsigned Depth) const {
+  APInt KnownZero2, KnownOne2;
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+
+  switch (Op.getOpcode()) {
+  default: break;
+  case SPISD::SELECT_ICC:
+  case SPISD::SELECT_XCC:
+  case SPISD::SELECT_FCC:
+    DAG.ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+    DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  }
+}
+
+// Look at LHS/RHS/CC and see if they are a lowered setcc instruction.  If so
+// set LHS/RHS and SPCC to the LHS/RHS of the setcc and SPCC to the condition.
+static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
+                             ISD::CondCode CC, unsigned &SPCC) {
+  if (isa<ConstantSDNode>(RHS) &&
+      cast<ConstantSDNode>(RHS)->isNullValue() &&
+      CC == ISD::SETNE &&
+      (((LHS.getOpcode() == SPISD::SELECT_ICC ||
+         LHS.getOpcode() == SPISD::SELECT_XCC) &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPICC) ||
+       (LHS.getOpcode() == SPISD::SELECT_FCC &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) &&
+      isa<ConstantSDNode>(LHS.getOperand(0)) &&
+      isa<ConstantSDNode>(LHS.getOperand(1)) &&
+      cast<ConstantSDNode>(LHS.getOperand(0))->isOne() &&
+      cast<ConstantSDNode>(LHS.getOperand(1))->isNullValue()) {
+    SDValue CMPCC = LHS.getOperand(3);
+    SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
+    LHS = CMPCC.getOperand(0);
+    RHS = CMPCC.getOperand(1);
+  }
+}
+
+SDValue SparcTargetLowering::LowerGlobalAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  // FIXME there isn't really any debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
+  SDValue Hi = DAG.getNode(SPISD::Hi, dl, MVT::i32, GA);
+  SDValue Lo = DAG.getNode(SPISD::Lo, dl, MVT::i32, GA);
+
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+    return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+
+  SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, dl,
+                                   getPointerTy());
+  SDValue RelAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+  SDValue AbsAddr = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                GlobalBase, RelAddr);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                     AbsAddr, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+  // FIXME there isn't really any debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  const Constant *C = N->getConstVal();
+  SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment());
+  SDValue Hi = DAG.getNode(SPISD::Hi, dl, MVT::i32, CP);
+  SDValue Lo = DAG.getNode(SPISD::Lo, dl, MVT::i32, CP);
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+    return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+
+  SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, dl,
+                                   getPointerTy());
+  SDValue RelAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+  SDValue AbsAddr = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                GlobalBase, RelAddr);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                     AbsAddr, MachinePointerInfo(), false, false, false, 0);
+}
+
+static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  // Convert the fp value to integer in an FP register.
+  assert(Op.getValueType() == MVT::i32);
+  Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+}
+
+static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  assert(Op.getOperand(0).getValueType() == MVT::i32);
+  SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
+  // Convert the int value to FP in an FP register.
+  return DAG.getNode(SPISD::ITOF, dl, Op.getValueType(), Tmp);
+}
+
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Opc, SPCC = ~0U;
+
+  // If this is a br_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+  // Get the condition flag.
+  SDValue CompareFlag;
+  if (LHS.getValueType().isInteger()) {
+    EVT VTs[] = { LHS.getValueType(), MVT::Glue };
+    SDValue Ops[2] = { LHS, RHS };
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+    if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+    // 32-bit compares use the icc flags, 64-bit uses the xcc flags.
+    Opc = LHS.getValueType() == MVT::i32 ? SPISD::BRICC : SPISD::BRXCC;
+  } else {
+    CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+    if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+    Opc = SPISD::BRFCC;
+  }
+  return DAG.getNode(Opc, dl, MVT::Other, Chain, Dest,
+                     DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+}
+
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Opc, SPCC = ~0U;
+
+  // If this is a select_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+  SDValue CompareFlag;
+  if (LHS.getValueType().isInteger()) {
+    // subcc returns a value
+    EVT VTs[] = { LHS.getValueType(), MVT::Glue };
+    SDValue Ops[2] = { LHS, RHS };
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+    Opc = LHS.getValueType() == MVT::i32 ?
+          SPISD::SELECT_ICC : SPISD::SELECT_XCC;
+    if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+  } else {
+    CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+    Opc = SPISD::SELECT_FCC;
+    if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+  }
+  return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
+                     DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+}
+
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                            const SparcTargetLowering &TLI) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Offset =
+    DAG.getNode(ISD::ADD, dl, MVT::i32,
+                DAG.getRegister(SP::I6, MVT::i32),
+                DAG.getConstant(FuncInfo->getVarArgsFrameOffset(),
+                                MVT::i32));
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), dl, Offset, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
+}
+
+static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue VAList = DAG.getLoad(MVT::i32, dl, InChain, VAListPtr,
+                               MachinePointerInfo(SV), false, false, false, 0);
+  // Increment the pointer, VAList, to the next vaarg
+  SDValue NextPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, VAList,
+                                  DAG.getConstant(VT.getSizeInBits()/8,
+                                                  MVT::i32));
+  // Store the incremented VAList to the legalized pointer
+  InChain = DAG.getStore(VAList.getValue(1), dl, NextPtr,
+                         VAListPtr, MachinePointerInfo(SV), false, false, 0);
+  // Load the actual argument out of the pointer VAList, unless this is an
+  // f64 load.
+  if (VT != MVT::f64)
+    return DAG.getLoad(VT, dl, InChain, VAList, MachinePointerInfo(),
+                       false, false, false, 0);
+
+  // Otherwise, load it as i64, then do a bitconvert.
+  SDValue V = DAG.getLoad(MVT::i64, dl, InChain, VAList, MachinePointerInfo(),
+                          false, false, false, 0);
+
+  // Bit-Convert the value to f64.
+  SDValue Ops[2] = {
+    DAG.getNode(ISD::BITCAST, dl, MVT::f64, V),
+    V.getValue(1)
+  };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+  SDValue Chain = Op.getOperand(0);  // Legalize the chain.
+  SDValue Size  = Op.getOperand(1);  // Legalize the size.
+  DebugLoc dl = Op.getDebugLoc();
+
+  unsigned SPReg = SP::O6;
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32);
+  SDValue NewSP = DAG.getNode(ISD::SUB, dl, MVT::i32, SP, Size); // Value
+  Chain = DAG.getCopyToReg(SP.getValue(1), dl, SPReg, NewSP);    // Output chain
+
+  // The resultant pointer is actually 16 words from the bottom of the stack,
+  // to provide a register spill area.
+  SDValue NewVal = DAG.getNode(ISD::ADD, dl, MVT::i32, NewSP,
+                                 DAG.getConstant(96, MVT::i32));
+  SDValue Ops[2] = { NewVal, Chain };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+
+static SDValue getFLUSHW(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Chain = DAG.getNode(SPISD::FLUSHW,
+                              dl, MVT::Other, DAG.getEntryNode());
+  return Chain;
+}
+
+static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned FrameReg = SP::I6;
+
+  uint64_t depth = Op.getConstantOperandVal(0);
+
+  SDValue FrameAddr;
+  if (depth == 0)
+    FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  else {
+    // flush first to make sure the windowed registers' values are in stack
+    SDValue Chain = getFLUSHW(Op, DAG);
+    FrameAddr = DAG.getCopyFromReg(Chain, dl, FrameReg, VT);
+
+    for (uint64_t i = 0; i != depth; ++i) {
+      SDValue Ptr = DAG.getNode(ISD::ADD,
+                                dl, MVT::i32,
+                                FrameAddr, DAG.getIntPtrConstant(56));
+      FrameAddr = DAG.getLoad(MVT::i32, dl,
+                              Chain,
+                              Ptr,
+                              MachinePointerInfo(), false, false, false, 0);
+    }
+  }
+  return FrameAddr;
+}
+
+static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned RetReg = SP::I7;
+
+  uint64_t depth = Op.getConstantOperandVal(0);
+
+  SDValue RetAddr;
+  if (depth == 0)
+    RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT);
+  else {
+    // flush first to make sure the windowed registers' values are in stack
+    SDValue Chain = getFLUSHW(Op, DAG);
+    RetAddr = DAG.getCopyFromReg(Chain, dl, SP::I6, VT);
+
+    for (uint64_t i = 0; i != depth; ++i) {
+      SDValue Ptr = DAG.getNode(ISD::ADD,
+                                dl, MVT::i32,
+                                RetAddr,
+                                DAG.getIntPtrConstant((i == depth-1)?60:56));
+      RetAddr = DAG.getLoad(MVT::i32, dl,
+                            Chain,
+                            Ptr,
+                            MachinePointerInfo(), false, false, false, 0);
+    }
+  }
+  return RetAddr;
+}
+
+SDValue SparcTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Should not custom lower this!");
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  case ISD::GlobalTLSAddress:
+    llvm_unreachable("TLS not implemented for Sparc.");
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::BR_CC:              return LowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG, *this);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  }
+}
+
+MachineBasicBlock *
+SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB) const {
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  unsigned BROpcode;
+  unsigned CC;
+  DebugLoc dl = MI->getDebugLoc();
+  // Figure out the conditional branch opcode to use for this select_cc.
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown SELECT_CC!");
+  case SP::SELECT_CC_Int_ICC:
+  case SP::SELECT_CC_FP_ICC:
+  case SP::SELECT_CC_DFP_ICC:
+    BROpcode = SP::BCOND;
+    break;
+  case SP::SELECT_CC_Int_FCC:
+  case SP::SELECT_CC_FP_FCC:
+  case SP::SELECT_CC_DFP_FCC:
+    BROpcode = SP::FBCOND;
+    break;
+  }
+
+  CC = (SPCC::CondCodes)MI->getOperand(3).getImm();
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   [f]bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, dl, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(*BB, BB->begin(), dl, TII.get(SP::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Sparc Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+SparcTargetLowering::ConstraintType
+SparcTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:  break;
+    case 'r': return C_RegisterClass;
+    }
+  }
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                  EVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      return std::make_pair(0U, &SP::IntRegsRegClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+bool
+SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Sparc target isn't yet aware of offsets.
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
new file mode 100644
index 000000000000..aa2ef711a080
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -0,0 +1,113 @@
+//===-- SparcISelLowering.h - Sparc DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_ISELLOWERING_H
+#define SPARC_ISELLOWERING_H
+
+#include "Sparc.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  class SparcSubtarget;
+
+  namespace SPISD {
+    enum {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+      CMPICC,      // Compare two GPR operands, set icc+xcc.
+      CMPFCC,      // Compare two FP operands, set fcc.
+      BRICC,       // Branch to dest on icc condition
+      BRXCC,       // Branch to dest on xcc condition (64-bit only).
+      BRFCC,       // Branch to dest on fcc condition
+      SELECT_ICC,  // Select between two values using the current ICC flags.
+      SELECT_XCC,  // Select between two values using the current XCC flags.
+      SELECT_FCC,  // Select between two values using the current FCC flags.
+
+      Hi, Lo,      // Hi/Lo operations, typically on a global address.
+
+      FTOI,        // FP to Int within a FP register.
+      ITOF,        // Int to FP within a FP register.
+
+      CALL,        // A call instruction.
+      RET_FLAG,    // Return with a flag operand.
+      GLOBAL_BASE_REG, // Global base reg for PIC
+      FLUSHW       // FLUSH register windows to stack
+    };
+  }
+
+  class SparcTargetLowering : public TargetLowering {
+    const SparcSubtarget *Subtarget;
+  public:
+    SparcTargetLowering(TargetMachine &TM);
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+    /// in Mask are known to be either zero or one and return them in the
+    /// KnownZero/KnownOne bitsets.
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                APInt &KnownZero,
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+
+    virtual MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const;
+
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+
+    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv,
+                           bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerFormalArguments_32(SDValue Chain,
+                                    CallingConv::ID CallConv,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerFormalArguments_64(SDValue Chain,
+                                    CallingConv::ID CallConv,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl, SelectionDAG &DAG) const;
+
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+
+    unsigned getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const;
+  };
+} // end namespace llvm
+
+#endif    // SPARC_ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td b/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td
new file mode 100644
index 000000000000..ca1153b3fe8f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstr64Bit.td
@@ -0,0 +1,285 @@
+//===-- SparcInstr64Bit.td - 64-bit instructions for Sparc Target ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains instruction definitions and patterns needed for 64-bit
+// code generation on SPARC v9.
+//
+// Some SPARC v9 instructions are defined in SparcInstrInfo.td because they can
+// also be used in 32-bit code running on a SPARC v9 CPU.
+//
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+// The same integer registers are used for i32 and i64 values.
+// When registers hold i32 values, the high bits are don't care.
+// This give us free trunc and anyext.
+def : Pat<(i64 (anyext i32:$val)), (COPY_TO_REGCLASS $val, I64Regs)>;
+def : Pat<(i32 (trunc i64:$val)), (COPY_TO_REGCLASS $val, IntRegs)>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Shift Instructions.
+//===----------------------------------------------------------------------===//
+//
+// The 32-bit shift instructions are still available. The left shift srl
+// instructions shift all 64 bits, but it only accepts a 5-bit shift amount.
+//
+// The srl instructions only shift the low 32 bits and clear the high 32 bits.
+// Finally, sra shifts the low 32 bits and sign-extends to 64 bits.
+
+let Predicates = [Is64Bit] in {
+
+def : Pat<(i64 (zext i32:$val)), (SRLri $val, 0)>;
+def : Pat<(i64 (sext i32:$val)), (SRAri $val, 0)>;
+
+defm SLLX : F3_S<"sllx", 0b100101, 1, shl, i64, I64Regs>;
+defm SRLX : F3_S<"srlx", 0b100110, 1, srl, i64, I64Regs>;
+defm SRAX : F3_S<"srax", 0b100111, 1, sra, i64, I64Regs>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Immediates.
+//===----------------------------------------------------------------------===//
+//
+// All 32-bit immediates can be materialized with sethi+or, but 64-bit
+// immediates may require more code. There may be a point where it is
+// preferable to use a constant pool load instead, depending on the
+// microarchitecture.
+
+// The %g0 register is constant 0.
+// This is useful for stx %g0, [...], for example.
+def : Pat<(i64 0), (i64 G0)>, Requires<[Is64Bit]>;
+
+// Single-instruction patterns.
+
+// The ALU instructions want their simm13 operands as i32 immediates.
+def as_i32imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i32);
+}]>;
+def : Pat<(i64 simm13:$val), (ORri (i64 G0), (as_i32imm $val))>;
+def : Pat<(i64 SETHIimm:$val), (SETHIi (HI22 $val))>;
+
+// Double-instruction patterns.
+
+// All unsigned i32 immediates can be handled by sethi+or.
+def uimm32 : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>;
+def : Pat<(i64 uimm32:$val), (ORri (SETHIi (HI22 $val)), (LO10 $val))>,
+      Requires<[Is64Bit]>;
+
+// All negative i33 immediates can be handled by sethi+xor.
+def nimm33 : PatLeaf<(imm), [{
+  int64_t Imm = N->getSExtValue();
+  return Imm < 0 && isInt<33>(Imm);
+}]>;
+// Bits 10-31 inverted. Same as assembler's %hix.
+def HIX22 : SDNodeXForm<imm, [{
+  uint64_t Val = (~N->getZExtValue() >> 10) & ((1u << 22) - 1);
+  return CurDAG->getTargetConstant(Val, MVT::i32);
+}]>;
+// Bits 0-9 with ones in bits 10-31. Same as assembler's %lox.
+def LOX10 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~(~N->getZExtValue() & 0x3ff), MVT::i32);
+}]>;
+def : Pat<(i64 nimm33:$val), (XORri (SETHIi (HIX22 $val)), (LOX10 $val))>,
+      Requires<[Is64Bit]>;
+
+// More possible patterns:
+//
+//   (sllx sethi, n)
+//   (sllx simm13, n)
+//
+// 3 instrs:
+//
+//   (xor (sllx sethi), simm13)
+//   (sllx (xor sethi, simm13))
+//
+// 4 instrs:
+//
+//   (or sethi, (sllx sethi))
+//   (xnor sethi, (sllx sethi))
+//
+// 5 instrs:
+//
+//   (or (sllx sethi), (or sethi, simm13))
+//   (xnor (sllx sethi), (or sethi, simm13))
+//   (or (sllx sethi), (sllx sethi))
+//   (xnor (sllx sethi), (sllx sethi))
+//
+// Worst case is 6 instrs:
+//
+//   (or (sllx (or sethi, simmm13)), (or sethi, simm13))
+
+// Bits 42-63, same as assembler's %hh.
+def HH22 : SDNodeXForm<imm, [{
+  uint64_t Val = (N->getZExtValue() >> 42) & ((1u << 22) - 1);
+  return CurDAG->getTargetConstant(Val, MVT::i32);
+}]>;
+// Bits 32-41, same as assembler's %hm.
+def HM10 : SDNodeXForm<imm, [{
+  uint64_t Val = (N->getZExtValue() >> 32) & ((1u << 10) - 1);
+  return CurDAG->getTargetConstant(Val, MVT::i32);
+}]>;
+def : Pat<(i64 imm:$val),
+          (ORrr (SLLXri (ORri (SETHIi (HH22 $val)), (HM10 $val)), (i64 32)),
+                (ORri (SETHIi (HI22 $val)), (LO10 $val)))>,
+      Requires<[Is64Bit]>;
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Integer Arithmetic and Logic.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+// Register-register instructions.
+
+def : Pat<(and i64:$a, i64:$b), (ANDrr $a, $b)>;
+def : Pat<(or  i64:$a, i64:$b), (ORrr  $a, $b)>;
+def : Pat<(xor i64:$a, i64:$b), (XORrr $a, $b)>;
+
+def : Pat<(and i64:$a, (not i64:$b)), (ANDNrr $a, $b)>;
+def : Pat<(or  i64:$a, (not i64:$b)), (ORNrr  $a, $b)>;
+def : Pat<(xor i64:$a, (not i64:$b)), (XNORrr $a, $b)>;
+
+def : Pat<(add i64:$a, i64:$b), (ADDrr $a, $b)>;
+def : Pat<(sub i64:$a, i64:$b), (SUBrr $a, $b)>;
+
+// Add/sub with carry were renamed to addc/subc in SPARC v9.
+def : Pat<(adde i64:$a, i64:$b), (ADDXrr $a, $b)>;
+def : Pat<(sube i64:$a, i64:$b), (SUBXrr $a, $b)>;
+
+def : Pat<(addc i64:$a, i64:$b), (ADDCCrr $a, $b)>;
+def : Pat<(subc i64:$a, i64:$b), (SUBCCrr $a, $b)>;
+
+def : Pat<(SPcmpicc i64:$a, i64:$b), (SUBCCrr $a, $b)>;
+
+// Register-immediate instructions.
+
+def : Pat<(and i64:$a, (i64 simm13:$b)), (ANDri $a, (as_i32imm $b))>;
+def : Pat<(or  i64:$a, (i64 simm13:$b)), (ORri  $a, (as_i32imm $b))>;
+def : Pat<(xor i64:$a, (i64 simm13:$b)), (XORri $a, (as_i32imm $b))>;
+
+def : Pat<(add i64:$a, (i64 simm13:$b)), (ADDri $a, (as_i32imm $b))>;
+def : Pat<(sub i64:$a, (i64 simm13:$b)), (SUBri $a, (as_i32imm $b))>;
+
+def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (SUBCCri $a, (as_i32imm $b))>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Loads and Stores.
+//===----------------------------------------------------------------------===//
+//
+// All the 32-bit loads and stores are available. The extending loads are sign
+// or zero-extending to 64 bits. The LDrr and LDri instructions load 32 bits
+// zero-extended to i64. Their mnemonic is lduw in SPARC v9 (Load Unsigned
+// Word).
+//
+// SPARC v9 adds 64-bit loads as well as a sign-extending ldsw i32 loads.
+
+let Predicates = [Is64Bit] in {
+
+// 64-bit loads.
+def LDXrr  : F3_1<3, 0b001011,
+                  (outs I64Regs:$dst), (ins MEMrr:$addr),
+                  "ldx [$addr], $dst",
+                  [(set i64:$dst, (load ADDRrr:$addr))]>;
+def LDXri  : F3_2<3, 0b001011,
+                  (outs I64Regs:$dst), (ins MEMri:$addr),
+                  "ldx [$addr], $dst",
+                  [(set i64:$dst, (load ADDRri:$addr))]>;
+
+// Extending loads to i64.
+def : Pat<(i64 (zextloadi8 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi8 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (sextloadi8 ADDRrr:$addr)), (LDSBrr ADDRrr:$addr)>;
+def : Pat<(i64 (sextloadi8 ADDRri:$addr)), (LDSBri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi16 ADDRrr:$addr)), (LDUHrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi16 ADDRri:$addr)), (LDUHri ADDRri:$addr)>;
+def : Pat<(i64 (sextloadi16 ADDRrr:$addr)), (LDSHrr ADDRrr:$addr)>;
+def : Pat<(i64 (sextloadi16 ADDRri:$addr)), (LDSHri ADDRri:$addr)>;
+
+def : Pat<(i64 (zextloadi32 ADDRrr:$addr)), (LDrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi32 ADDRri:$addr)), (LDri ADDRri:$addr)>;
+
+// Sign-extending load of i32 into i64 is a new SPARC v9 instruction.
+def LDSWrr : F3_1<3, 0b001011,
+                 (outs I64Regs:$dst), (ins MEMrr:$addr),
+                 "ldsw [$addr], $dst",
+                 [(set i64:$dst, (sextloadi32 ADDRrr:$addr))]>;
+def LDSWri : F3_2<3, 0b001011,
+                 (outs I64Regs:$dst), (ins MEMri:$addr),
+                 "ldsw [$addr], $dst",
+                 [(set i64:$dst, (sextloadi32 ADDRri:$addr))]>;
+
+// 64-bit stores.
+def STXrr  : F3_1<3, 0b001110,
+                 (outs), (ins MEMrr:$addr, I64Regs:$src),
+                 "stx $src, [$addr]",
+                 [(store i64:$src, ADDRrr:$addr)]>;
+def STXri  : F3_2<3, 0b001110,
+                 (outs), (ins MEMri:$addr, I64Regs:$src),
+                 "stx $src, [$addr]",
+                 [(store i64:$src, ADDRri:$addr)]>;
+
+// Truncating stores from i64 are identical to the i32 stores.
+def : Pat<(truncstorei8  i64:$src, ADDRrr:$addr), (STBrr ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei8  i64:$src, ADDRri:$addr), (STBri ADDRri:$addr, $src)>;
+def : Pat<(truncstorei16 i64:$src, ADDRrr:$addr), (STHrr ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei16 i64:$src, ADDRri:$addr), (STHri ADDRri:$addr, $src)>;
+def : Pat<(truncstorei32 i64:$src, ADDRrr:$addr), (STrr  ADDRrr:$addr, $src)>;
+def : Pat<(truncstorei32 i64:$src, ADDRri:$addr), (STri  ADDRri:$addr, $src)>;
+
+} // Predicates = [Is64Bit]
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit Conditionals.
+//===----------------------------------------------------------------------===//
+//
+// Flag-setting instructions like subcc and addcc set both icc and xcc flags.
+// The icc flags correspond to the 32-bit result, and the xcc are for the
+// full 64-bit result.
+//
+// We reuse CMPICC SDNodes for compares, but use new BRXCC branch nodes for
+// 64-bit compares. See LowerBR_CC.
+
+let Predicates = [Is64Bit] in {
+
+let Uses = [ICC] in
+def BPXCC : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
+                     "bp$cc %xcc, $dst",
+                     [(SPbrxcc bb:$dst, imm:$cc)]>;
+
+// Conditional moves on %xcc.
+let Uses = [ICC], Constraints = "$f = $rd" in {
+def MOVXCCrr : Pseudo<(outs IntRegs:$rd),
+                      (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+                      "mov$cond %xcc, $rs2, $rd",
+                      [(set i32:$rd,
+                       (SPselectxcc i32:$rs2, i32:$f, imm:$cond))]>;
+def MOVXCCri : Pseudo<(outs IntRegs:$rd),
+                      (ins i32imm:$i, IntRegs:$f, CCOp:$cond),
+                      "mov$cond %xcc, $i, $rd",
+                      [(set i32:$rd,
+                       (SPselecticc simm11:$i, i32:$f, imm:$cond))]>;
+} // Uses, Constraints
+
+def : Pat<(SPselectxcc i64:$t, i64:$f, imm:$cond),
+          (MOVXCCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselectxcc (i64 simm11:$t), i64:$f, imm:$cond),
+          (MOVXCCri (as_i32imm $t), $f, imm:$cond)>;
+
+} // Predicates = [Is64Bit]
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td b/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td
new file mode 100644
index 000000000000..f1018569153c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrFormats.td
@@ -0,0 +1,151 @@
+//===-- SparcInstrFormats.td - Sparc Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction {
+  field bits<32> Inst;
+
+  let Namespace = "SP";
+
+  bits<2> op;
+  let Inst{31-30} = op;               // Top two bits are the 'op' field
+  
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #2 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+// Format 2 instructions
+class F2<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstSP<outs, ins, asmstr, pattern> {
+  bits<3>  op2;
+  bits<22> imm22;
+  let op          = 0;    // op = 0
+  let Inst{24-22} = op2;
+  let Inst{21-0}  = imm22;
+}
+
+// Specific F2 classes: SparcV8 manual, page 44
+//
+class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
+   : F2<outs, ins, asmstr, pattern> {
+  bits<5>  rd;
+
+  let op2         = op2Val;
+
+  let Inst{29-25} = rd;
+}
+
+class F2_2<bits<4> condVal, bits<3> op2Val, dag outs, dag ins, string asmstr, 
+           list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
+  bits<4>   cond;
+  bit       annul = 0;     // currently unused
+
+  let cond        = condVal;
+  let op2         = op2Val;
+
+  let Inst{29}    = annul;
+  let Inst{28-25} = cond;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #3 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+class F3<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstSP<outs, ins, asmstr, pattern> {
+  bits<5> rd;
+  bits<6> op3;
+  bits<5> rs1;
+  let op{1} = 1;   // Op = 2 or 3
+  let Inst{29-25} = rd;
+  let Inst{24-19} = op3;
+  let Inst{18-14} = rs1;
+}
+
+// Specific F3 classes: SparcV8 manual, page 44
+//
+class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<8> asi = 0; // asi not currently used
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 0;     // i field = 0
+  let Inst{12-5} = asi;   // address space identifier
+  let Inst{4-0}  = rs2;
+}
+
+class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins, 
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<13> simm13;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 1;     // i field = 1
+  let Inst{12-0} = simm13;
+}
+
+// floating-point
+class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+// Shift by register rs2.
+class F3_Sr<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
+            string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bit x = xVal;           // 1 for 64-bit shifts.
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 0;     // i field = 0
+  let Inst{12}   = x;     // extended registers.
+  let Inst{4-0}  = rs2;
+}
+
+// Shift by immediate.
+class F3_Si<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
+            string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bit x = xVal;           // 1 for 64-bit shifts.
+  bits<6> shcnt;          // shcnt32 / shcnt64.
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 1;     // i field = 1
+  let Inst{12}   = x;     // extended registers.
+  let Inst{5-0}  = shcnt;
+}
+
+// Define rr and ri shift instructions with patterns.
+multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
+                ValueType VT, RegisterClass RC> {
+  def rr : F3_Sr<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs, RC:$rs2),
+                 !strconcat(OpcStr, " $rs, $rs2, $rd"),
+                 [(set VT:$rd, (OpNode VT:$rs, VT:$rs2))]>;
+  def ri : F3_Si<2, Op3Val, XVal, (outs RC:$rd), (ins RC:$rs, unknown:$shcnt),
+                 !strconcat(OpcStr, " $rs, $shcnt, $rd"),
+                 [(set VT:$rd, (OpNode VT:$rs, (VT imm:$shcnt)))]>;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
new file mode 100644
index 000000000000..39d7329f2663
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -0,0 +1,357 @@
+//===-- SparcInstrInfo.cpp - Sparc Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstrInfo.h"
+#include "Sparc.h"
+#include "SparcMachineFunctionInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_CTOR
+#include "SparcGenInstrInfo.inc"
+
+using namespace llvm;
+
+SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
+  : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
+    RI(ST, *this), Subtarget(ST) {
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+  if (MI->getOpcode() == SP::LDri ||
+      MI->getOpcode() == SP::LDFri ||
+      MI->getOpcode() == SP::LDDFri) {
+    if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                            int &FrameIndex) const {
+  if (MI->getOpcode() == SP::STri ||
+      MI->getOpcode() == SP::STFri ||
+      MI->getOpcode() == SP::STDFri) {
+    if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
+        MI->getOperand(1).getImm() == 0) {
+      FrameIndex = MI->getOperand(0).getIndex();
+      return MI->getOperand(2).getReg();
+    }
+  }
+  return 0;
+}
+
+static bool IsIntegerCC(unsigned CC)
+{
+  return  (CC <= SPCC::ICC_VC);
+}
+
+
+static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
+{
+  switch(CC) {
+  case SPCC::ICC_NE:   return SPCC::ICC_E;
+  case SPCC::ICC_E:    return SPCC::ICC_NE;
+  case SPCC::ICC_G:    return SPCC::ICC_LE;
+  case SPCC::ICC_LE:   return SPCC::ICC_G;
+  case SPCC::ICC_GE:   return SPCC::ICC_L;
+  case SPCC::ICC_L:    return SPCC::ICC_GE;
+  case SPCC::ICC_GU:   return SPCC::ICC_LEU;
+  case SPCC::ICC_LEU:  return SPCC::ICC_GU;
+  case SPCC::ICC_CC:   return SPCC::ICC_CS;
+  case SPCC::ICC_CS:   return SPCC::ICC_CC;
+  case SPCC::ICC_POS:  return SPCC::ICC_NEG;
+  case SPCC::ICC_NEG:  return SPCC::ICC_POS;
+  case SPCC::ICC_VC:   return SPCC::ICC_VS;
+  case SPCC::ICC_VS:   return SPCC::ICC_VC;
+
+  case SPCC::FCC_U:    return SPCC::FCC_O;
+  case SPCC::FCC_O:    return SPCC::FCC_U;
+  case SPCC::FCC_G:    return SPCC::FCC_LE;
+  case SPCC::FCC_LE:   return SPCC::FCC_G;
+  case SPCC::FCC_UG:   return SPCC::FCC_ULE;
+  case SPCC::FCC_ULE:  return SPCC::FCC_UG;
+  case SPCC::FCC_L:    return SPCC::FCC_GE;
+  case SPCC::FCC_GE:   return SPCC::FCC_L;
+  case SPCC::FCC_UL:   return SPCC::FCC_UGE;
+  case SPCC::FCC_UGE:  return SPCC::FCC_UL;
+  case SPCC::FCC_LG:   return SPCC::FCC_UE;
+  case SPCC::FCC_UE:   return SPCC::FCC_LG;
+  case SPCC::FCC_NE:   return SPCC::FCC_E;
+  case SPCC::FCC_E:    return SPCC::FCC_NE;
+  }
+  llvm_unreachable("Invalid cond code");
+}
+
+MachineInstr *
+SparcInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+                                         int FrameIx,
+                                         uint64_t Offset,
+                                         const MDNode *MDPtr,
+                                         DebugLoc dl) const {
+  MachineInstrBuilder MIB = BuildMI(MF, dl, get(SP::DBG_VALUE))
+    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
+
+bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const
+{
+
+  MachineBasicBlock::iterator I = MBB.end();
+  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+
+    if (I->isDebugValue())
+      continue;
+
+    //When we see a non-terminator, we are done
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    //Terminator is not a branch
+    if (!I->isBranch())
+      return true;
+
+    //Handle Unconditional branches
+    if (I->getOpcode() == SP::BA) {
+      UnCondBrIter = I;
+
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      while (llvm::next(I) != MBB.end())
+        llvm::next(I)->eraseFromParent();
+
+      Cond.clear();
+      FBB = 0;
+
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        UnCondBrIter = MBB.end();
+        continue;
+      }
+
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    unsigned Opcode = I->getOpcode();
+    if (Opcode != SP::BCOND && Opcode != SP::FBCOND)
+      return true; //Unknown Opcode
+
+    SPCC::CondCodes BranchCode = (SPCC::CondCodes)I->getOperand(1).getImm();
+
+    if (Cond.empty()) {
+      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+      if (AllowModify && UnCondBrIter != MBB.end() &&
+          MBB.isLayoutSuccessor(TargetBB)) {
+
+        //Transform the code
+        //
+        //    brCC L1
+        //    ba L2
+        // L1:
+        //    ..
+        // L2:
+        //
+        // into
+        //
+        //   brnCC L2
+        // L1:
+        //   ...
+        // L2:
+        //
+        BranchCode = GetOppositeBranchCondition(BranchCode);
+        MachineBasicBlock::iterator OldInst = I;
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(Opcode))
+          .addMBB(UnCondBrIter->getOperand(0).getMBB()).addImm(BranchCode);
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(SP::BA))
+          .addMBB(TargetBB);
+
+        OldInst->eraseFromParent();
+        UnCondBrIter->eraseFromParent();
+
+        UnCondBrIter = MBB.end();
+        I = MBB.end();
+        continue;
+      }
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+    //FIXME: Handle subsequent conditional branches
+    //For now, we can't handle multiple conditional branches
+    return true;
+  }
+  return false;
+}
+
+unsigned
+SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond,
+                             DebugLoc DL) const {
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "Sparc branch conditions should have one component!");
+
+  if (Cond.empty()) {
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(SP::BA)).addMBB(TBB);
+    return 1;
+  }
+
+  //Conditional branch
+  unsigned CC = Cond[0].getImm();
+
+  if (IsIntegerCC(CC))
+    BuildMI(&MBB, DL, get(SP::BCOND)).addMBB(TBB).addImm(CC);
+  else
+    BuildMI(&MBB, DL, get(SP::FBCOND)).addMBB(TBB).addImm(CC);
+  if (!FBB)
+    return 1;
+
+  BuildMI(&MBB, DL, get(SP::BA)).addMBB(FBB);
+  return 2;
+}
+
+unsigned SparcInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const
+{
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+  while (I != MBB.begin()) {
+    --I;
+
+    if (I->isDebugValue())
+      continue;
+
+    if (I->getOpcode() != SP::BA
+        && I->getOpcode() != SP::BCOND
+        && I->getOpcode() != SP::FBCOND)
+      break; // Not a branch
+
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+  return Count;
+}
+
+void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I, DebugLoc DL,
+                                 unsigned DestReg, unsigned SrcReg,
+                                 bool KillSrc) const {
+  if (SP::IntRegsRegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg))
+    BuildMI(MBB, I, DL, get(Subtarget.isV9() ? SP::FMOVD : SP::FpMOVD), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else
+    llvm_unreachable("Impossible reg-to-reg copy");
+}
+
+void SparcInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC,
+                    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
+  if (RC == &SP::IntRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill));
+  else if (RC == &SP::FPRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill));
+  else if (RC == &SP::DFPRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill));
+  else
+    llvm_unreachable("Can't store this register to stack slot");
+}
+
+void SparcInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC,
+                     const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (RC == &SP::IntRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0);
+  else if (RC == &SP::FPRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0);
+  else if (RC == &SP::DFPRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0);
+  else
+    llvm_unreachable("Can't load this register from stack slot");
+}
+
+unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const
+{
+  SparcMachineFunctionInfo *SparcFI = MF->getInfo<SparcMachineFunctionInfo>();
+  unsigned GlobalBaseReg = SparcFI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Insert the set of GlobalBaseReg into the first MBB of the function
+  MachineBasicBlock &FirstMBB = MF->front();
+  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+
+  GlobalBaseReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+
+
+  DebugLoc dl;
+
+  BuildMI(FirstMBB, MBBI, dl, get(SP::GETPCX), GlobalBaseReg);
+  SparcFI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h
new file mode 100644
index 000000000000..204f69855c23
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.h
@@ -0,0 +1,107 @@
+//===-- SparcInstrInfo.h - Sparc Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCINSTRUCTIONINFO_H
+#define SPARCINSTRUCTIONINFO_H
+
+#include "SparcRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "SparcGenInstrInfo.inc"
+
+namespace llvm {
+
+/// SPII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace SPII {
+  enum {
+    Pseudo = (1<<0),
+    Load = (1<<1),
+    Store = (1<<2),
+    DelaySlot = (1<<3)
+  };
+}
+
+class SparcInstrInfo : public SparcGenInstrInfo {
+  const SparcRegisterInfo RI;
+  const SparcSubtarget& Subtarget;
+public:
+  explicit SparcInstrInfo(SparcSubtarget &ST);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const SparcRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+
+  /// emitFrameIndexDebugValue - Emit a target-dependent form of
+  /// DBG_VALUE encoding the address of a frame index.
+  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+                                                 int FrameIx,
+                                                 uint64_t Offset,
+                                                 const MDNode *MDPtr,
+                                                 DebugLoc dl) const;
+
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify = false) const ;
+
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+  
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+  
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
new file mode 100644
index 000000000000..5ff439583c5c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -0,0 +1,822 @@
+//===-- SparcInstrInfo.td - Target Description for Sparc Target -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Sparc instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "SparcInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+// True when generating 32-bit code.
+def Is32Bit : Predicate<"!Subtarget.is64Bit()">;
+
+// True when generating 64-bit code. This also implies HasV9.
+def Is64Bit : Predicate<"Subtarget.is64Bit()">;
+
+// HasV9 - This predicate is true when the target processor supports V9
+// instructions.  Note that the machine may be running in 32-bit mode.
+def HasV9   : Predicate<"Subtarget.isV9()">;
+
+// HasNoV9 - This predicate is true when the target doesn't have V9
+// instructions.  Use of this is just a hack for the isel not having proper
+// costs for V8 instructions that are more expensive than their V9 ones.
+def HasNoV9 : Predicate<"!Subtarget.isV9()">;
+
+// HasVIS - This is true when the target processor has VIS extensions.
+def HasVIS : Predicate<"Subtarget.isVIS()">;
+
+// UseDeprecatedInsts - This predicate is true when the target processor is a
+// V8, or when it is V9 but the V8 deprecated instructions are efficient enough
+// to use when appropriate.  In either of these cases, the instruction selector
+// will pick deprecated instructions.
+def UseDeprecatedInsts : Predicate<"Subtarget.useDeprecatedV8Instructions()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def simm11  : PatLeaf<(imm), [{ return isInt<11>(N->getSExtValue()); }]>;
+
+def simm13  : PatLeaf<(imm), [{ return isInt<13>(N->getSExtValue()); }]>;
+
+def LO10 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((unsigned)N->getZExtValue() & 1023,
+                                   MVT::i32);
+}]>;
+
+def HI22 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return CurDAG->getTargetConstant((unsigned)N->getZExtValue() >> 10, MVT::i32);
+}]>;
+
+def SETHIimm : PatLeaf<(imm), [{
+  return (((unsigned)N->getZExtValue() >> 10) << 10) ==
+         (unsigned)N->getZExtValue();
+}], HI22>;
+
+// Addressing modes.
+def ADDRrr : ComplexPattern<iPTR, 2, "SelectADDRrr", [], []>;
+def ADDRri : ComplexPattern<iPTR, 2, "SelectADDRri", [frameindex], []>;
+
+// Address operands
+def MEMrr : Operand<iPTR> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, ptr_rc);
+}
+def MEMri : Operand<iPTR> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops ptr_rc, i32imm);
+}
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+def calltarget : Operand<i32>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+  def CCOp : Operand<i32>;
+
+def SDTSPcmpfcc : 
+SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPbrcc : 
+SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+def SDTSPselectcc :
+SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>;
+def SDTSPFTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDTSPITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+
+def SPcmpicc : SDNode<"SPISD::CMPICC", SDTIntBinOp, [SDNPOutGlue]>;
+def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
+def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+def SPbrxcc : SDNode<"SPISD::BRXCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+def SPbrfcc : SDNode<"SPISD::BRFCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
+
+def SPhi    : SDNode<"SPISD::Hi", SDTIntUnaryOp>;
+def SPlo    : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
+
+def SPftoi  : SDNode<"SPISD::FTOI", SDTSPFTOI>;
+def SPitof  : SDNode<"SPISD::ITOF", SDTSPITOF>;
+
+def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInGlue]>;
+def SPselectxcc : SDNode<"SPISD::SELECT_XCC", SDTSPselectcc, [SDNPInGlue]>;
+def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInGlue]>;
+
+//  These are target-independent nodes, but have target-specific formats.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def SDT_SPCall    : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
+def call          : SDNode<"SPISD::CALL", SDT_SPCall,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                            SDNPVariadic]>;
+
+def SDT_SPRet     : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def retflag       : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def flushw        : SDNode<"SPISD::FLUSHW", SDTNone,
+                           [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+
+def getPCX        : Operand<i32> {
+  let PrintMethod = "printGetPCX";
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC Flag Conditions
+//===----------------------------------------------------------------------===//
+
+// Note that these values must be kept in sync with the CCOp::CondCode enum
+// values.
+class ICC_VAL<int N> : PatLeaf<(i32 N)>;
+def ICC_NE  : ICC_VAL< 9>;  // Not Equal
+def ICC_E   : ICC_VAL< 1>;  // Equal
+def ICC_G   : ICC_VAL<10>;  // Greater
+def ICC_LE  : ICC_VAL< 2>;  // Less or Equal
+def ICC_GE  : ICC_VAL<11>;  // Greater or Equal
+def ICC_L   : ICC_VAL< 3>;  // Less
+def ICC_GU  : ICC_VAL<12>;  // Greater Unsigned
+def ICC_LEU : ICC_VAL< 4>;  // Less or Equal Unsigned
+def ICC_CC  : ICC_VAL<13>;  // Carry Clear/Great or Equal Unsigned
+def ICC_CS  : ICC_VAL< 5>;  // Carry Set/Less Unsigned
+def ICC_POS : ICC_VAL<14>;  // Positive
+def ICC_NEG : ICC_VAL< 6>;  // Negative
+def ICC_VC  : ICC_VAL<15>;  // Overflow Clear
+def ICC_VS  : ICC_VAL< 7>;  // Overflow Set
+
+class FCC_VAL<int N> : PatLeaf<(i32 N)>;
+def FCC_U   : FCC_VAL<23>;  // Unordered
+def FCC_G   : FCC_VAL<22>;  // Greater
+def FCC_UG  : FCC_VAL<21>;  // Unordered or Greater
+def FCC_L   : FCC_VAL<20>;  // Less
+def FCC_UL  : FCC_VAL<19>;  // Unordered or Less
+def FCC_LG  : FCC_VAL<18>;  // Less or Greater
+def FCC_NE  : FCC_VAL<17>;  // Not Equal
+def FCC_E   : FCC_VAL<25>;  // Equal
+def FCC_UE  : FCC_VAL<24>;  // Unordered or Equal
+def FCC_GE  : FCC_VAL<25>;  // Greater or Equal
+def FCC_UGE : FCC_VAL<26>;  // Unordered or Greater or Equal
+def FCC_LE  : FCC_VAL<27>;  // Less or Equal
+def FCC_ULE : FCC_VAL<28>;  // Unordered or Less or Equal
+def FCC_O   : FCC_VAL<29>;  // Ordered
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
+multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> {
+  def rr  : F3_1<2, Op3Val, 
+                 (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"),
+                 [(set i32:$dst, (OpNode i32:$b, i32:$c))]>;
+  def ri  : F3_2<2, Op3Val,
+                 (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"),
+                 [(set i32:$dst, (OpNode i32:$b, (i32 simm13:$c)))]>;
+}
+
+/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
+/// pattern.
+multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
+  def rr  : F3_1<2, Op3Val, 
+                 (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"), []>;
+  def ri  : F3_2<2, Op3Val,
+                 (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstSP<outs, ins, asmstr, pattern>;
+
+// GETPCX for PIC
+let Defs = [O7] in {
+  def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >;
+}
+
+let Defs = [O6], Uses = [O6] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                               "!ADJCALLSTACKDOWN $amt",
+                               [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "!ADJCALLSTACKUP $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let hasSideEffects = 1, mayStore = 1 in {
+  let rd = 0, rs1 = 0, rs2 = 0 in
+    def FLUSHW : F3_1<0b10, 0b101011, (outs), (ins),
+                      "flushw",
+                      [(flushw)]>, Requires<[HasV9]>;
+  let rd = 0, rs1 = 1, simm13 = 3 in
+    def TA3 : F3_2<0b10, 0b111010, (outs), (ins),
+                   "ta 3",
+                   [(flushw)]>;
+}
+
+def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val),
+                "unimp $val", []>;
+
+// FpMOVD/FpNEGD/FpABSD - These are lowered to single-precision ops by the 
+// fpmover pass.
+let Predicates = [HasNoV9] in {  // Only emit these in V8 mode.
+  def FpMOVD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
+                      "!FpMOVD $src, $dst", []>;
+  def FpNEGD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
+                      "!FpNEGD $src, $dst",
+                      [(set f64:$dst, (fneg f64:$src))]>;
+  def FpABSD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
+                      "!FpABSD $src, $dst",
+                      [(set f64:$dst, (fabs f64:$src))]>;
+}
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.  This has to handle all
+// permutations of selection between i32/f32/f64 on ICC and FCC.
+  // Expanded after instruction selection.
+let Uses = [ICC], usesCustomInserter = 1 in { 
+  def SELECT_CC_Int_ICC
+   : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_ICC PSEUDO!",
+            [(set i32:$dst, (SPselecticc i32:$T, i32:$F, imm:$Cond))]>;
+  def SELECT_CC_FP_ICC
+   : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_ICC PSEUDO!",
+            [(set f32:$dst, (SPselecticc f32:$T, f32:$F, imm:$Cond))]>;
+
+  def SELECT_CC_DFP_ICC
+   : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_ICC PSEUDO!",
+            [(set f64:$dst, (SPselecticc f64:$T, f64:$F, imm:$Cond))]>;
+}
+
+let usesCustomInserter = 1, Uses = [FCC] in {
+
+  def SELECT_CC_Int_FCC
+   : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_FCC PSEUDO!",
+            [(set i32:$dst, (SPselectfcc i32:$T, i32:$F, imm:$Cond))]>;
+
+  def SELECT_CC_FP_FCC
+   : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_FCC PSEUDO!",
+            [(set f32:$dst, (SPselectfcc f32:$T, f32:$F, imm:$Cond))]>;
+  def SELECT_CC_DFP_FCC
+   : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_FCC PSEUDO!",
+            [(set f64:$dst, (SPselectfcc f64:$T, f64:$F, imm:$Cond))]>;
+}
+
+
+// Section A.3 - Synthetic Instructions, p. 85
+// special cases of JMPL:
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1 in {
+  let rd = O7.Num, rs1 = G0.Num in
+    def RETL: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
+                   "jmp %o7+$val", [(retflag simm13:$val)]>;
+
+  let rd = I7.Num, rs1 = G0.Num in
+    def RET: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
+                  "jmp %i7+$val", []>;
+}
+
+// Section B.1 - Load Integer Instructions, p. 90
+def LDSBrr : F3_1<3, 0b001001,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "ldsb [$addr], $dst",
+                  [(set i32:$dst, (sextloadi8 ADDRrr:$addr))]>;
+def LDSBri : F3_2<3, 0b001001,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "ldsb [$addr], $dst",
+                  [(set i32:$dst, (sextloadi8 ADDRri:$addr))]>;
+def LDSHrr : F3_1<3, 0b001010,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "ldsh [$addr], $dst",
+                  [(set i32:$dst, (sextloadi16 ADDRrr:$addr))]>;
+def LDSHri : F3_2<3, 0b001010,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "ldsh [$addr], $dst",
+                  [(set i32:$dst, (sextloadi16 ADDRri:$addr))]>;
+def LDUBrr : F3_1<3, 0b000001,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "ldub [$addr], $dst",
+                  [(set i32:$dst, (zextloadi8 ADDRrr:$addr))]>;
+def LDUBri : F3_2<3, 0b000001,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "ldub [$addr], $dst",
+                  [(set i32:$dst, (zextloadi8 ADDRri:$addr))]>;
+def LDUHrr : F3_1<3, 0b000010,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "lduh [$addr], $dst",
+                  [(set i32:$dst, (zextloadi16 ADDRrr:$addr))]>;
+def LDUHri : F3_2<3, 0b000010,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "lduh [$addr], $dst",
+                  [(set i32:$dst, (zextloadi16 ADDRri:$addr))]>;
+def LDrr   : F3_1<3, 0b000000,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "ld [$addr], $dst",
+                  [(set i32:$dst, (load ADDRrr:$addr))]>;
+def LDri   : F3_2<3, 0b000000,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "ld [$addr], $dst",
+                  [(set i32:$dst, (load ADDRri:$addr))]>;
+
+// Section B.2 - Load Floating-point Instructions, p. 92
+def LDFrr  : F3_1<3, 0b100000,
+                  (outs FPRegs:$dst), (ins MEMrr:$addr),
+                  "ld [$addr], $dst",
+                  [(set f32:$dst, (load ADDRrr:$addr))]>;
+def LDFri  : F3_2<3, 0b100000,
+                  (outs FPRegs:$dst), (ins MEMri:$addr),
+                  "ld [$addr], $dst",
+                  [(set f32:$dst, (load ADDRri:$addr))]>;
+def LDDFrr : F3_1<3, 0b100011,
+                  (outs DFPRegs:$dst), (ins MEMrr:$addr),
+                  "ldd [$addr], $dst",
+                  [(set f64:$dst, (load ADDRrr:$addr))]>;
+def LDDFri : F3_2<3, 0b100011,
+                  (outs DFPRegs:$dst), (ins MEMri:$addr),
+                  "ldd [$addr], $dst",
+                  [(set f64:$dst, (load ADDRri:$addr))]>;
+
+// Section B.4 - Store Integer Instructions, p. 95
+def STBrr : F3_1<3, 0b000101,
+                 (outs), (ins MEMrr:$addr, IntRegs:$src),
+                 "stb $src, [$addr]",
+                 [(truncstorei8 i32:$src, ADDRrr:$addr)]>;
+def STBri : F3_2<3, 0b000101,
+                 (outs), (ins MEMri:$addr, IntRegs:$src),
+                 "stb $src, [$addr]",
+                 [(truncstorei8 i32:$src, ADDRri:$addr)]>;
+def STHrr : F3_1<3, 0b000110,
+                 (outs), (ins MEMrr:$addr, IntRegs:$src),
+                 "sth $src, [$addr]",
+                 [(truncstorei16 i32:$src, ADDRrr:$addr)]>;
+def STHri : F3_2<3, 0b000110,
+                 (outs), (ins MEMri:$addr, IntRegs:$src),
+                 "sth $src, [$addr]",
+                 [(truncstorei16 i32:$src, ADDRri:$addr)]>;
+def STrr  : F3_1<3, 0b000100,
+                 (outs), (ins MEMrr:$addr, IntRegs:$src),
+                 "st $src, [$addr]",
+                 [(store i32:$src, ADDRrr:$addr)]>;
+def STri  : F3_2<3, 0b000100,
+                 (outs), (ins MEMri:$addr, IntRegs:$src),
+                 "st $src, [$addr]",
+                 [(store i32:$src, ADDRri:$addr)]>;
+
+// Section B.5 - Store Floating-point Instructions, p. 97
+def STFrr   : F3_1<3, 0b100100,
+                   (outs), (ins MEMrr:$addr, FPRegs:$src),
+                   "st $src, [$addr]",
+                   [(store f32:$src, ADDRrr:$addr)]>;
+def STFri   : F3_2<3, 0b100100,
+                   (outs), (ins MEMri:$addr, FPRegs:$src),
+                   "st $src, [$addr]",
+                   [(store f32:$src, ADDRri:$addr)]>;
+def STDFrr  : F3_1<3, 0b100111,
+                   (outs), (ins MEMrr:$addr, DFPRegs:$src),
+                   "std  $src, [$addr]",
+                   [(store f64:$src, ADDRrr:$addr)]>;
+def STDFri  : F3_2<3, 0b100111,
+                   (outs), (ins MEMri:$addr, DFPRegs:$src),
+                   "std $src, [$addr]",
+                   [(store f64:$src, ADDRri:$addr)]>;
+
+// Section B.9 - SETHI Instruction, p. 104
+def SETHIi: F2_1<0b100,
+                 (outs IntRegs:$dst), (ins i32imm:$src),
+                 "sethi $src, $dst",
+                 [(set i32:$dst, SETHIimm:$src)]>;
+
+// Section B.10 - NOP Instruction, p. 105
+// (It's a special case of SETHI)
+let rd = 0, imm22 = 0 in
+  def NOP : F2_1<0b100, (outs), (ins), "nop", []>;
+
+// Section B.11 - Logical Instructions, p. 106
+defm AND    : F3_12<"and", 0b000001, and>;
+
+def ANDNrr  : F3_1<2, 0b000101,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   "andn $b, $c, $dst",
+                   [(set i32:$dst, (and i32:$b, (not i32:$c)))]>;
+def ANDNri  : F3_2<2, 0b000101,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                   "andn $b, $c, $dst", []>;
+
+defm OR     : F3_12<"or", 0b000010, or>;
+
+def ORNrr   : F3_1<2, 0b000110,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   "orn $b, $c, $dst",
+                   [(set i32:$dst, (or i32:$b, (not i32:$c)))]>;
+def ORNri   : F3_2<2, 0b000110,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                   "orn $b, $c, $dst", []>;
+defm XOR    : F3_12<"xor", 0b000011, xor>;
+
+def XNORrr  : F3_1<2, 0b000111,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   "xnor $b, $c, $dst",
+                   [(set i32:$dst, (not (xor i32:$b, i32:$c)))]>;
+def XNORri  : F3_2<2, 0b000111,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                   "xnor $b, $c, $dst", []>;
+
+// Section B.12 - Shift Instructions, p. 107
+defm SLL : F3_12<"sll", 0b100101, shl>;
+defm SRL : F3_12<"srl", 0b100110, srl>;
+defm SRA : F3_12<"sra", 0b100111, sra>;
+
+// Section B.13 - Add Instructions, p. 108
+defm ADD   : F3_12<"add", 0b000000, add>;
+
+// "LEA" forms of add (patterns to make tblgen happy)
+def LEA_ADDri   : F3_2<2, 0b000000,
+                   (outs IntRegs:$dst), (ins MEMri:$addr),
+                   "add ${addr:arith}, $dst",
+                   [(set i32:$dst, ADDRri:$addr)]>;
+
+let Defs = [ICC] in                   
+  defm ADDCC  : F3_12<"addcc", 0b010000, addc>;
+
+let Uses = [ICC] in
+  defm ADDX  : F3_12<"addx", 0b001000, adde>;
+
+// Section B.15 - Subtract Instructions, p. 110
+defm SUB    : F3_12  <"sub"  , 0b000100, sub>;
+let Uses = [ICC] in 
+  defm SUBX   : F3_12  <"subx" , 0b001100, sube>;
+
+let Defs = [ICC] in 
+  defm SUBCC  : F3_12  <"subcc", 0b010100, SPcmpicc>;
+
+let Uses = [ICC], Defs = [ICC] in
+  def SUBXCCrr: F3_1<2, 0b011100, 
+                (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                "subxcc $b, $c, $dst", []>;
+
+
+// Section B.18 - Multiply Instructions, p. 113
+let Defs = [Y] in {
+  defm UMUL : F3_12np<"umul", 0b001010>;
+  defm SMUL : F3_12  <"smul", 0b001011, mul>;
+}
+
+// Section B.19 - Divide Instructions, p. 115
+let Defs = [Y] in {
+  defm UDIV : F3_12np<"udiv", 0b001110>;
+  defm SDIV : F3_12np<"sdiv", 0b001111>;
+}
+
+// Section B.20 - SAVE and RESTORE, p. 117
+defm SAVE    : F3_12np<"save"   , 0b111100>;
+defm RESTORE : F3_12np<"restore", 0b111101>;
+
+// Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
+
+// conditional branch class:
+class BranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
+ : F2_2<cc, 0b010, (outs), ins, asmstr, pattern> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+}
+
+let isBarrier = 1 in
+  def BA   : BranchSP<0b1000, (ins brtarget:$dst),
+                      "ba $dst",
+                      [(br bb:$dst)]>;
+
+// FIXME: the encoding for the JIT should look at the condition field.
+let Uses = [ICC] in
+  def BCOND : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
+                         "b$cc $dst",
+                        [(SPbricc bb:$dst, imm:$cc)]>;
+
+
+// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
+
+// floating-point conditional branch class:
+class FPBranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
+ : F2_2<cc, 0b110, (outs), ins, asmstr, pattern> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+}
+
+// FIXME: the encoding for the JIT should look at the condition field.
+let Uses = [FCC] in
+  def FBCOND  : FPBranchSP<0, (ins brtarget:$dst, CCOp:$cc),
+                              "fb$cc $dst",
+                              [(SPbrfcc bb:$dst, imm:$cc)]>;
+
+
+// Section B.24 - Call and Link Instruction, p. 125
+// This is the only Format 1 instruction
+let Uses = [O6],
+    hasDelaySlot = 1, isCall = 1,
+    Defs = [O0, O1, O2, O3, O4, O5, O7, G1, G2, G3, G4, G5, G6, G7,
+    D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+        ICC, FCC, Y] in {
+  def CALL : InstSP<(outs), (ins calltarget:$dst, variable_ops),
+                    "call $dst", []> {
+    bits<30> disp;
+    let op = 1;
+    let Inst{29-0} = disp;
+  }
+  
+  // indirect calls
+  def JMPLrr : F3_1<2, 0b111000,
+                    (outs), (ins MEMrr:$ptr, variable_ops),
+                    "call $ptr",
+                    [(call ADDRrr:$ptr)]>;
+  def JMPLri : F3_2<2, 0b111000,
+                    (outs), (ins MEMri:$ptr, variable_ops),
+                    "call $ptr",
+                    [(call ADDRri:$ptr)]>;
+}
+
+// Section B.28 - Read State Register Instructions
+let Uses = [Y] in 
+  def RDY : F3_1<2, 0b101000,
+                 (outs IntRegs:$dst), (ins),
+                 "rd %y, $dst", []>;
+
+// Section B.29 - Write State Register Instructions
+let Defs = [Y] in {
+  def WRYrr : F3_1<2, 0b110000,
+                   (outs), (ins IntRegs:$b, IntRegs:$c),
+                   "wr $b, $c, %y", []>;
+  def WRYri : F3_2<2, 0b110000,
+                   (outs), (ins IntRegs:$b, i32imm:$c),
+                   "wr $b, $c, %y", []>;
+}
+// Convert Integer to Floating-point Instructions, p. 141
+def FITOS : F3_3<2, 0b110100, 0b011000100,
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fitos $src, $dst",
+                 [(set FPRegs:$dst, (SPitof FPRegs:$src))]>;
+def FITOD : F3_3<2, 0b110100, 0b011001000, 
+                 (outs DFPRegs:$dst), (ins FPRegs:$src),
+                 "fitod $src, $dst",
+                 [(set DFPRegs:$dst, (SPitof FPRegs:$src))]>;
+
+// Convert Floating-point to Integer Instructions, p. 142
+def FSTOI : F3_3<2, 0b110100, 0b011010001,
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fstoi $src, $dst",
+                 [(set FPRegs:$dst, (SPftoi FPRegs:$src))]>;
+def FDTOI : F3_3<2, 0b110100, 0b011010010,
+                 (outs FPRegs:$dst), (ins DFPRegs:$src),
+                 "fdtoi $src, $dst",
+                 [(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>;
+
+// Convert between Floating-point Formats Instructions, p. 143
+def FSTOD : F3_3<2, 0b110100, 0b011001001, 
+                 (outs DFPRegs:$dst), (ins FPRegs:$src),
+                 "fstod $src, $dst",
+                 [(set f64:$dst, (fextend f32:$src))]>;
+def FDTOS : F3_3<2, 0b110100, 0b011000110,
+                 (outs FPRegs:$dst), (ins DFPRegs:$src),
+                 "fdtos $src, $dst",
+                 [(set f32:$dst, (fround f64:$src))]>;
+
+// Floating-point Move Instructions, p. 144
+def FMOVS : F3_3<2, 0b110100, 0b000000001,
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fmovs $src, $dst", []>;
+def FNEGS : F3_3<2, 0b110100, 0b000000101, 
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fnegs $src, $dst",
+                 [(set f32:$dst, (fneg f32:$src))]>;
+def FABSS : F3_3<2, 0b110100, 0b000001001, 
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fabss $src, $dst",
+                 [(set f32:$dst, (fabs f32:$src))]>;
+
+
+// Floating-point Square Root Instructions, p.145
+def FSQRTS : F3_3<2, 0b110100, 0b000101001, 
+                  (outs FPRegs:$dst), (ins FPRegs:$src),
+                  "fsqrts $src, $dst",
+                  [(set f32:$dst, (fsqrt f32:$src))]>;
+def FSQRTD : F3_3<2, 0b110100, 0b000101010, 
+                  (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                  "fsqrtd $src, $dst",
+                  [(set f64:$dst, (fsqrt f64:$src))]>;
+
+
+
+// Floating-point Add and Subtract Instructions, p. 146
+def FADDS  : F3_3<2, 0b110100, 0b001000001,
+                  (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                  "fadds $src1, $src2, $dst",
+                  [(set f32:$dst, (fadd f32:$src1, f32:$src2))]>;
+def FADDD  : F3_3<2, 0b110100, 0b001000010,
+                  (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                  "faddd $src1, $src2, $dst",
+                  [(set f64:$dst, (fadd f64:$src1, f64:$src2))]>;
+def FSUBS  : F3_3<2, 0b110100, 0b001000101,
+                  (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                  "fsubs $src1, $src2, $dst",
+                  [(set f32:$dst, (fsub f32:$src1, f32:$src2))]>;
+def FSUBD  : F3_3<2, 0b110100, 0b001000110,
+                  (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                  "fsubd $src1, $src2, $dst",
+                  [(set f64:$dst, (fsub f64:$src1, f64:$src2))]>;
+
+// Floating-point Multiply and Divide Instructions, p. 147
+def FMULS  : F3_3<2, 0b110100, 0b001001001,
+                  (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                  "fmuls $src1, $src2, $dst",
+                  [(set f32:$dst, (fmul f32:$src1, f32:$src2))]>;
+def FMULD  : F3_3<2, 0b110100, 0b001001010,
+                  (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                  "fmuld $src1, $src2, $dst",
+                  [(set f64:$dst, (fmul f64:$src1, f64:$src2))]>;
+def FSMULD : F3_3<2, 0b110100, 0b001101001,
+                  (outs DFPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                  "fsmuld $src1, $src2, $dst",
+                  [(set f64:$dst, (fmul (fextend f32:$src1),
+                                        (fextend f32:$src2)))]>;
+def FDIVS  : F3_3<2, 0b110100, 0b001001101,
+                 (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                 "fdivs $src1, $src2, $dst",
+                 [(set f32:$dst, (fdiv f32:$src1, f32:$src2))]>;
+def FDIVD  : F3_3<2, 0b110100, 0b001001110,
+                 (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                 "fdivd $src1, $src2, $dst",
+                 [(set f64:$dst, (fdiv f64:$src1, f64:$src2))]>;
+
+// Floating-point Compare Instructions, p. 148
+// Note: the 2nd template arg is different for these guys.
+// Note 2: the result of a FCMP is not available until the 2nd cycle
+// after the instr is retired, but there is no interlock. This behavior
+// is modelled with a forced noop after the instruction.
+let Defs = [FCC] in {
+  def FCMPS  : F3_3<2, 0b110101, 0b001010001,
+                   (outs), (ins FPRegs:$src1, FPRegs:$src2),
+                   "fcmps $src1, $src2\n\tnop",
+                   [(SPcmpfcc f32:$src1, f32:$src2)]>;
+  def FCMPD  : F3_3<2, 0b110101, 0b001010010,
+                   (outs), (ins DFPRegs:$src1, DFPRegs:$src2),
+                   "fcmpd $src1, $src2\n\tnop",
+                   [(SPcmpfcc f64:$src1, f64:$src2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// V9 Instructions
+//===----------------------------------------------------------------------===//
+
+// V9 Conditional Moves.
+let Predicates = [HasV9], Constraints = "$T = $dst" in {
+  // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
+  // FIXME: Add instruction encodings for the JIT some day.
+  let Uses = [ICC] in {
+    def MOVICCrr
+      : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
+               "mov$cc %icc, $F, $dst",
+               [(set i32:$dst, (SPselecticc i32:$F, i32:$T, imm:$cc))]>;
+    def MOVICCri
+      : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
+               "mov$cc %icc, $F, $dst",
+               [(set i32:$dst, (SPselecticc simm11:$F, i32:$T, imm:$cc))]>;
+  }
+
+  let Uses = [FCC] in {
+    def MOVFCCrr
+      : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
+               "mov$cc %fcc0, $F, $dst",
+               [(set i32:$dst, (SPselectfcc i32:$F, i32:$T, imm:$cc))]>;
+    def MOVFCCri
+      : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
+               "mov$cc %fcc0, $F, $dst",
+               [(set i32:$dst, (SPselectfcc simm11:$F, i32:$T, imm:$cc))]>;
+  }
+
+  let Uses = [ICC] in {
+    def FMOVS_ICC
+      : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
+               "fmovs$cc %icc, $F, $dst",
+               [(set f32:$dst,
+                           (SPselecticc f32:$F, f32:$T, imm:$cc))]>;
+    def FMOVD_ICC
+      : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
+               "fmovd$cc %icc, $F, $dst",
+               [(set f64:$dst, (SPselecticc f64:$F, f64:$T, imm:$cc))]>;
+  }
+
+  let Uses = [FCC] in {
+    def FMOVS_FCC
+      : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
+               "fmovs$cc %fcc0, $F, $dst",
+               [(set f32:$dst, (SPselectfcc f32:$F, f32:$T, imm:$cc))]>;
+    def FMOVD_FCC
+      : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
+               "fmovd$cc %fcc0, $F, $dst",
+               [(set f64:$dst, (SPselectfcc f64:$F, f64:$T, imm:$cc))]>;
+  }
+
+}
+
+// Floating-Point Move Instructions, p. 164 of the V9 manual.
+let Predicates = [HasV9] in {
+  def FMOVD : F3_3<2, 0b110100, 0b000000010,
+                   (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                   "fmovd $src, $dst", []>;
+  def FNEGD : F3_3<2, 0b110100, 0b000000110, 
+                   (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                   "fnegd $src, $dst",
+                   [(set f64:$dst, (fneg f64:$src))]>;
+  def FABSD : F3_3<2, 0b110100, 0b000001010, 
+                   (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                   "fabsd $src, $dst",
+                   [(set f64:$dst, (fabs f64:$src))]>;
+}
+
+// POPCrr - This does a ctpop of a 64-bit register.  As such, we have to clear
+// the top 32-bits before using it.  To do this clearing, we use a SLLri X,0.
+def POPCrr : F3_1<2, 0b101110, 
+                  (outs IntRegs:$dst), (ins IntRegs:$src),
+                  "popc $src, $dst", []>, Requires<[HasV9]>;
+def : Pat<(ctpop i32:$src),
+          (POPCrr (SLLri $src, 0))>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Small immediates.
+def : Pat<(i32 simm13:$val),
+          (ORri (i32 G0), imm:$val)>;
+// Arbitrary immediates.
+def : Pat<(i32 imm:$val),
+          (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
+
+// subc
+def : Pat<(subc i32:$b, i32:$c),
+          (SUBCCrr $b, $c)>;
+def : Pat<(subc i32:$b, simm13:$val),
+          (SUBCCri $b, imm:$val)>;
+
+// Global addresses, constant pool entries
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORri (i32 G0), tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORri (i32 G0), tconstpool:$in)>;
+
+// Add reg, lo.  This is used when taking the addr of a global/constpool entry.
+def : Pat<(add i32:$r, (SPlo tglobaladdr:$in)),
+          (ADDri $r, tglobaladdr:$in)>;
+def : Pat<(add i32:$r, (SPlo tconstpool:$in)),
+          (ADDri $r, tconstpool:$in)>;
+
+// Calls: 
+def : Pat<(call tglobaladdr:$dst),
+          (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+          (CALL texternalsym:$dst)>;
+
+// Map integer extload's to zextloads.
+def : Pat<(i32 (extloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+
+// zextload bool -> zextload byte
+def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+
+include "SparcInstr64Bit.td"
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..e7442826e78b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- SparcMachineFunctionInfo.cpp - Sparc Machine Function Info --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void SparcMachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
new file mode 100644
index 000000000000..90c27a4459a1
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -0,0 +1,48 @@
+//===- SparcMachineFunctionInfo.h - Sparc Machine Function Info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares  Sparc specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SPARCMACHINEFUNCTIONINFO_H
+#define SPARCMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+  class SparcMachineFunctionInfo : public MachineFunctionInfo {
+    virtual void anchor();
+  private:
+    unsigned GlobalBaseReg;
+
+    /// VarArgsFrameOffset - Frame offset to start of varargs area.
+    int VarArgsFrameOffset;
+
+    /// SRetReturnReg - Holds the virtual register into which the sret
+    /// argument is passed.
+    unsigned SRetReturnReg;
+  public:
+    SparcMachineFunctionInfo()
+      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0) {}
+    explicit SparcMachineFunctionInfo(MachineFunction &MF)
+      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0) {}
+
+    unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+    void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+    int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+    void setVarArgsFrameOffset(int Offset) { VarArgsFrameOffset = Offset; }
+
+    unsigned getSRetReturnReg() const { return SRetReturnReg; }
+    void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
new file mode 100644
index 000000000000..db9b30eb4330
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -0,0 +1,110 @@
+//===-- SparcRegisterInfo.cpp - SPARC Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPARC implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcRegisterInfo.h"
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "SparcGenRegisterInfo.inc"
+
+using namespace llvm;
+
+SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st,
+                                     const TargetInstrInfo &tii)
+  : SparcGenRegisterInfo(SP::I7), Subtarget(st), TII(tii) {
+}
+
+const uint16_t* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const uint16_t CalleeSavedRegs[] = { 0 };
+  return CalleeSavedRegs;
+}
+
+BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  // FIXME: G1 reserved for now for large imm generation by frame code.
+  Reserved.set(SP::G1);
+  Reserved.set(SP::G2);
+  Reserved.set(SP::G3);
+  Reserved.set(SP::G4);
+  Reserved.set(SP::O6);
+  Reserved.set(SP::I6);
+  Reserved.set(SP::I7);
+  Reserved.set(SP::G0);
+  Reserved.set(SP::G5);
+  Reserved.set(SP::G6);
+  Reserved.set(SP::G7);
+  return Reserved;
+}
+
+const TargetRegisterClass*
+SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                      unsigned Kind) const {
+  return Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
+}
+
+void
+SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                       int SPAdj, unsigned FIOperandNum,
+                                       RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  DebugLoc dl = MI.getDebugLoc();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+
+  // Addressable stack objects are accessed using neg. offsets from %fp
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MI.getOperand(FIOperandNum + 1).getImm();
+
+  // Replace frame index with a frame pointer reference.
+  if (Offset >= -4096 && Offset <= 4095) {
+    // If the offset is small enough to fit in the immediate field, directly
+    // encode it.
+    MI.getOperand(FIOperandNum).ChangeToRegister(SP::I6, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+  } else {
+    // Otherwise, emit a G1 = SETHI %hi(offset).  FIXME: it would be better to 
+    // scavenge a register here instead of reserving G1 all of the time.
+    unsigned OffHi = (unsigned)Offset >> 10U;
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
+    // Emit G1 = G1 + I6
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+      .addReg(SP::I6);
+    // Insert: G1+%lo(offset) into the user.
+    MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset & ((1 << 10)-1));
+  }
+}
+
+unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return SP::I6;
+}
+
+unsigned SparcRegisterInfo::getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+}
+
+unsigned SparcRegisterInfo::getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
new file mode 100644
index 000000000000..f91df5398953
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h
@@ -0,0 +1,59 @@
+//===-- SparcRegisterInfo.h - Sparc Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCREGISTERINFO_H
+#define SPARCREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "SparcGenRegisterInfo.inc"
+
+namespace llvm {
+
+class SparcSubtarget;
+class TargetInstrInfo;
+class Type;
+
+struct SparcRegisterInfo : public SparcGenRegisterInfo {
+  SparcSubtarget &Subtarget;
+  const TargetInstrInfo &TII;
+
+  SparcRegisterInfo(SparcSubtarget &st, const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+                                                unsigned Kind) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                       RegScavenger *RS = NULL) const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
new file mode 100644
index 000000000000..497e7c5d5612
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
@@ -0,0 +1,170 @@
+//===-- SparcRegisterInfo.td - Sparc Register defs ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Sparc register file 
+//===----------------------------------------------------------------------===//
+
+class SparcReg<string n> : Register<n> {
+  field bits<5> Num;
+  let Namespace = "SP";
+}
+
+class SparcCtrlReg<string n>: Register<n> {
+  let Namespace = "SP";
+}
+
+let Namespace = "SP" in {
+def sub_even : SubRegIndex;
+def sub_odd  : SubRegIndex;
+}
+
+// Registers are identified with 5-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<5> num, string n> : SparcReg<n> {
+  let Num = num;
+}
+// Rf - 32-bit floating-point registers
+class Rf<bits<5> num, string n> : SparcReg<n> {
+  let Num = num;
+}
+// Rd - Slots in the FP register file for 64-bit floating-point values.
+class Rd<bits<5> num, string n, list<Register> subregs> : SparcReg<n> {
+  let Num = num;
+  let SubRegs = subregs;
+  let SubRegIndices = [sub_even, sub_odd];
+  let CoveredBySubRegs = 1;
+}
+
+// Control Registers
+def ICC : SparcCtrlReg<"ICC">; // This represents icc and xcc in 64-bit code.
+def FCC : SparcCtrlReg<"FCC">;
+
+// Y register
+def Y : SparcCtrlReg<"Y">;
+
+// Integer registers
+def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
+def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
+def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>; 
+def G3 : Ri< 3, "G3">, DwarfRegNum<[3]>;
+def G4 : Ri< 4, "G4">, DwarfRegNum<[4]>;
+def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>; 
+def G6 : Ri< 6, "G6">, DwarfRegNum<[6]>;
+def G7 : Ri< 7, "G7">, DwarfRegNum<[7]>;
+def O0 : Ri< 8, "O0">, DwarfRegNum<[8]>;
+def O1 : Ri< 9, "O1">, DwarfRegNum<[9]>;
+def O2 : Ri<10, "O2">, DwarfRegNum<[10]>; 
+def O3 : Ri<11, "O3">, DwarfRegNum<[11]>;
+def O4 : Ri<12, "O4">, DwarfRegNum<[12]>;
+def O5 : Ri<13, "O5">, DwarfRegNum<[13]>; 
+def O6 : Ri<14, "SP">, DwarfRegNum<[14]>;
+def O7 : Ri<15, "O7">, DwarfRegNum<[15]>;
+def L0 : Ri<16, "L0">, DwarfRegNum<[16]>;
+def L1 : Ri<17, "L1">, DwarfRegNum<[17]>;
+def L2 : Ri<18, "L2">, DwarfRegNum<[18]>; 
+def L3 : Ri<19, "L3">, DwarfRegNum<[19]>;
+def L4 : Ri<20, "L4">, DwarfRegNum<[20]>;
+def L5 : Ri<21, "L5">, DwarfRegNum<[21]>; 
+def L6 : Ri<22, "L6">, DwarfRegNum<[22]>;
+def L7 : Ri<23, "L7">, DwarfRegNum<[23]>;
+def I0 : Ri<24, "I0">, DwarfRegNum<[24]>;
+def I1 : Ri<25, "I1">, DwarfRegNum<[25]>;
+def I2 : Ri<26, "I2">, DwarfRegNum<[26]>; 
+def I3 : Ri<27, "I3">, DwarfRegNum<[27]>;
+def I4 : Ri<28, "I4">, DwarfRegNum<[28]>;
+def I5 : Ri<29, "I5">, DwarfRegNum<[29]>; 
+def I6 : Ri<30, "FP">, DwarfRegNum<[30]>;
+def I7 : Ri<31, "I7">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+def F0  : Rf< 0,  "F0">, DwarfRegNum<[32]>;
+def F1  : Rf< 1,  "F1">, DwarfRegNum<[33]>;
+def F2  : Rf< 2,  "F2">, DwarfRegNum<[34]>; 
+def F3  : Rf< 3,  "F3">, DwarfRegNum<[35]>;
+def F4  : Rf< 4,  "F4">, DwarfRegNum<[36]>;
+def F5  : Rf< 5,  "F5">, DwarfRegNum<[37]>; 
+def F6  : Rf< 6,  "F6">, DwarfRegNum<[38]>;
+def F7  : Rf< 7,  "F7">, DwarfRegNum<[39]>;
+def F8  : Rf< 8,  "F8">, DwarfRegNum<[40]>; 
+def F9  : Rf< 9,  "F9">, DwarfRegNum<[41]>;
+def F10 : Rf<10, "F10">, DwarfRegNum<[42]>;
+def F11 : Rf<11, "F11">, DwarfRegNum<[43]>; 
+def F12 : Rf<12, "F12">, DwarfRegNum<[44]>;
+def F13 : Rf<13, "F13">, DwarfRegNum<[45]>;
+def F14 : Rf<14, "F14">, DwarfRegNum<[46]>; 
+def F15 : Rf<15, "F15">, DwarfRegNum<[47]>;
+def F16 : Rf<16, "F16">, DwarfRegNum<[48]>;
+def F17 : Rf<17, "F17">, DwarfRegNum<[49]>; 
+def F18 : Rf<18, "F18">, DwarfRegNum<[50]>;
+def F19 : Rf<19, "F19">, DwarfRegNum<[51]>;
+def F20 : Rf<20, "F20">, DwarfRegNum<[52]>; 
+def F21 : Rf<21, "F21">, DwarfRegNum<[53]>;
+def F22 : Rf<22, "F22">, DwarfRegNum<[54]>;
+def F23 : Rf<23, "F23">, DwarfRegNum<[55]>;
+def F24 : Rf<24, "F24">, DwarfRegNum<[56]>;
+def F25 : Rf<25, "F25">, DwarfRegNum<[57]>;
+def F26 : Rf<26, "F26">, DwarfRegNum<[58]>; 
+def F27 : Rf<27, "F27">, DwarfRegNum<[59]>;
+def F28 : Rf<28, "F28">, DwarfRegNum<[60]>;
+def F29 : Rf<29, "F29">, DwarfRegNum<[61]>; 
+def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
+def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+def D0  : Rd< 0,  "F0", [F0,   F1]>, DwarfRegNum<[72]>;
+def D1  : Rd< 2,  "F2", [F2,   F3]>, DwarfRegNum<[73]>;
+def D2  : Rd< 4,  "F4", [F4,   F5]>, DwarfRegNum<[74]>;
+def D3  : Rd< 6,  "F6", [F6,   F7]>, DwarfRegNum<[75]>;
+def D4  : Rd< 8,  "F8", [F8,   F9]>, DwarfRegNum<[76]>;
+def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[77]>;
+def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[78]>;
+def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[79]>;
+def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[80]>;
+def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[81]>;
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[82]>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[83]>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[84]>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+// This register class should not be used to hold i64 values, use the I64Regs
+// register class for that. The i64 type is included here to allow i64 patterns
+// using the integer instructions.
+def IntRegs : RegisterClass<"SP", [i32, i64], 32,
+                            (add L0, L1, L2, L3, L4, L5, L6,
+                                 L7, I0, I1, I2, I3, I4, I5,
+                                 O0, O1, O2, O3, O4, O5, O7,
+                                 G1,
+                                 // Non-allocatable regs:
+                                 G2, G3, G4, // FIXME: OK for use only in
+                                             // applications, not libraries.
+                                 O6, // stack ptr
+                                 I6, // frame ptr
+                                 I7, // return address
+                                 G0, // constant zero
+                                 G5, G6, G7 // reserved for kernel
+                                 )>;
+
+// Register class for 64-bit mode, with a 64-bit spill slot size.
+// These are the same as the 32-bit registers, so TableGen will consider this
+// to be a sub-class of IntRegs. That works out because requiring a 64-bit
+// spill slot is a stricter constraint than only requiring a 32-bit spill slot.
+def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
+
+// Floating point register classes.
+def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
+
+def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..190c575a52de
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- SparcSelectionDAGInfo.cpp - Sparc SelectionDAG Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SparcSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sparc-selectiondag-info"
+#include "SparcTargetMachine.h"
+using namespace llvm;
+
+SparcSelectionDAGInfo::SparcSelectionDAGInfo(const SparcTargetMachine &TM)
+  : TargetSelectionDAGInfo(TM) {
+}
+
+SparcSelectionDAGInfo::~SparcSelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h
new file mode 100644
index 000000000000..dcd42037253d
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- SparcSelectionDAGInfo.h - Sparc SelectionDAG Info -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Sparc subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCSELECTIONDAGINFO_H
+#define SPARCSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class SparcTargetMachine;
+
+class SparcSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit SparcSelectionDAGInfo(const SparcTargetMachine &TM);
+  ~SparcSelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
new file mode 100644
index 000000000000..e5b2aeb1bb85
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -0,0 +1,46 @@
+//===-- SparcSubtarget.cpp - SPARC Subtarget Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPARC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcSubtarget.h"
+#include "Sparc.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "SparcGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+void SparcSubtarget::anchor() { }
+
+SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
+                               const std::string &FS,  bool is64Bit) :
+  SparcGenSubtargetInfo(TT, CPU, FS),
+  IsV9(false),
+  V8DeprecatedInsts(false),
+  IsVIS(false),
+  Is64Bit(is64Bit) {
+  
+  // Determine default and user specified characteristics
+  std::string CPUName = CPU;
+  if (CPUName.empty()) {
+    if (is64Bit)
+      CPUName = "v9";
+    else
+      CPUName = "v8";
+  }
+  IsV9 = CPUName == "v9";
+
+  // Parse features string.
+  ParseSubtargetFeatures(CPUName, FS);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
new file mode 100644
index 000000000000..a81931b34aa2
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -0,0 +1,59 @@
+//===-- SparcSubtarget.h - Define Subtarget for the SPARC -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SPARC specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_SUBTARGET_H
+#define SPARC_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "SparcGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class SparcSubtarget : public SparcGenSubtargetInfo {
+  virtual void anchor();
+  bool IsV9;
+  bool V8DeprecatedInsts;
+  bool IsVIS;
+  bool Is64Bit;
+  
+public:
+  SparcSubtarget(const std::string &TT, const std::string &CPU,
+                 const std::string &FS, bool is64bit);
+
+  bool isV9() const { return IsV9; }
+  bool isVIS() const { return IsVIS; }
+  bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+  
+  bool is64Bit() const { return Is64Bit; }
+  std::string getDataLayout() const {
+    const char *p;
+    if (is64Bit()) {
+      p = "E-p:64:64:64-i64:64:64-f64:64:64-f128:128:128-n32:64";
+    } else {
+      p = "E-p:32:32:32-i64:64:64-f64:64:64-f128:64:64-n32";
+    }
+    return std::string(p);
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
new file mode 100644
index 000000000000..60bceb708fbc
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -0,0 +1,98 @@
+//===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcTargetMachine.h"
+#include "Sparc.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeSparcTarget() {
+  // Register the target.
+  RegisterTargetMachine<SparcV8TargetMachine> X(TheSparcTarget);
+  RegisterTargetMachine<SparcV9TargetMachine> Y(TheSparcV9Target);
+}
+
+/// SparcTargetMachine ctor - Create an ILP32 architecture model
+///
+SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL,
+                                       bool is64bit)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS, is64bit),
+    DL(Subtarget.getDataLayout()),
+    InstrInfo(Subtarget),
+    TLInfo(*this), TSInfo(*this),
+    FrameLowering(Subtarget) {
+}
+
+namespace {
+/// Sparc Code Generator Pass Configuration Options.
+class SparcPassConfig : public TargetPassConfig {
+public:
+  SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  SparcTargetMachine &getSparcTargetMachine() const {
+    return getTM<SparcTargetMachine>();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new SparcPassConfig(this, PM);
+}
+
+bool SparcPassConfig::addInstSelector() {
+  addPass(createSparcISelDag(getSparcTargetMachine()));
+  return false;
+}
+
+/// addPreEmitPass - This pass may be implemented by targets that want to run
+/// passes immediately before machine code is emitted.  This should return
+/// true if -print-machineinstrs should print out the code after the passes.
+bool SparcPassConfig::addPreEmitPass(){
+  addPass(createSparcFPMoverPass(getSparcTargetMachine()));
+  addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
+  return true;
+}
+
+void SparcV8TargetMachine::anchor() { }
+
+SparcV8TargetMachine::SparcV8TargetMachine(const Target &T,
+                                           StringRef TT, StringRef CPU,
+                                           StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+  : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {
+}
+
+void SparcV9TargetMachine::anchor() { }
+
+SparcV9TargetMachine::SparcV9TargetMachine(const Target &T,
+                                           StringRef TT,  StringRef CPU,
+                                           StringRef FS,
+                                           const TargetOptions &Options,
+                                           Reloc::Model RM,
+                                           CodeModel::Model CM,
+                                           CodeGenOpt::Level OL)
+  : SparcTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {
+}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
new file mode 100644
index 000000000000..081075de2dc8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
@@ -0,0 +1,87 @@
+//===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Sparc specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETMACHINE_H
+#define SPARCTARGETMACHINE_H
+
+#include "SparcFrameLowering.h"
+#include "SparcISelLowering.h"
+#include "SparcInstrInfo.h"
+#include "SparcSelectionDAGInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class SparcTargetMachine : public LLVMTargetMachine {
+  SparcSubtarget Subtarget;
+  const DataLayout DL;       // Calculates type size & alignment
+  SparcInstrInfo InstrInfo;
+  SparcTargetLowering TLInfo;
+  SparcSelectionDAGInfo TSInfo;
+  SparcFrameLowering FrameLowering;
+public:
+  SparcTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL, bool is64bit);
+
+  virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameLowering  *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  virtual const SparcSubtarget   *getSubtargetImpl() const{ return &Subtarget; }
+  virtual const SparcRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual const SparcTargetLowering* getTargetLowering() const {
+    return &TLInfo;
+  }
+  virtual const SparcSelectionDAGInfo* getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+  virtual const DataLayout       *getDataLayout() const { return &DL; }
+
+  // Pass Pipeline Configuration
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+};
+
+/// SparcV8TargetMachine - Sparc 32-bit target machine
+///
+class SparcV8TargetMachine : public SparcTargetMachine {
+  virtual void anchor();
+public:
+  SparcV8TargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+/// SparcV9TargetMachine - Sparc 64-bit target machine
+///
+class SparcV9TargetMachine : public SparcTargetMachine {
+  virtual void anchor();
+public:
+  SparcV9TargetMachine(const Target &T, StringRef TT,
+                       StringRef CPU, StringRef FS,
+                       const TargetOptions &Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
new file mode 100644
index 000000000000..bb714632349a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -0,0 +1,21 @@
+//===-- SparcTargetInfo.cpp - Sparc Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheSparcTarget;
+Target llvm::TheSparcV9Target;
+
+extern "C" void LLVMInitializeSparcTargetInfo() { 
+  RegisterTarget<Triple::sparc> X(TheSparcTarget, "sparc", "Sparc");
+  RegisterTarget<Triple::sparcv9> Y(TheSparcV9Target, "sparcv9", "Sparc V9");
+}
diff --git a/contrib/llvm/lib/Target/Target.cpp b/contrib/llvm/lib/Target/Target.cpp
new file mode 100644
index 000000000000..9a78ebc3facb
--- /dev/null
+++ b/contrib/llvm/lib/Target/Target.cpp
@@ -0,0 +1,116 @@
+//===-- Target.cpp --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common infrastructure (including C bindings) for 
+// libLLVMTarget.a, which implements target information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Target.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include <cstring>
+
+using namespace llvm;
+
+void llvm::initializeTarget(PassRegistry &Registry) {
+  initializeDataLayoutPass(Registry);
+  initializeTargetLibraryInfoPass(Registry);
+}
+
+void LLVMInitializeTarget(LLVMPassRegistryRef R) {
+  initializeTarget(*unwrap(R));
+}
+
+LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
+  return wrap(new DataLayout(StringRep));
+}
+
+void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
+  unwrap(PM)->add(new DataLayout(*unwrap(TD)));
+}
+
+void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
+                              LLVMPassManagerRef PM) {
+  unwrap(PM)->add(new TargetLibraryInfo(*unwrap(TLI)));
+}
+
+char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD) {
+  std::string StringRep = unwrap(TD)->getStringRepresentation();
+  return strdup(StringRep.c_str());
+}
+
+LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef TD) {
+  return unwrap(TD)->isLittleEndian() ? LLVMLittleEndian : LLVMBigEndian;
+}
+
+unsigned LLVMPointerSize(LLVMTargetDataRef TD) {
+  return unwrap(TD)->getPointerSize(0);
+}
+
+unsigned LLVMPointerSizeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  return unwrap(TD)->getPointerSize(AS);
+}
+
+LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
+  return wrap(unwrap(TD)->getIntPtrType(getGlobalContext()));
+}
+
+LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
+  return wrap(unwrap(TD)->getIntPtrType(getGlobalContext(), AS));
+}
+
+unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
+}
+
+unsigned long long LLVMStoreSizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeStoreSize(unwrap(Ty));
+}
+
+unsigned long long LLVMABISizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeAllocSize(unwrap(Ty));
+}
+
+unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getCallFrameTypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getPrefTypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
+                                        LLVMValueRef GlobalVar) {
+  return unwrap(TD)->getPreferredAlignment(unwrap<GlobalVariable>(GlobalVar));
+}
+
+unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+                             unsigned long long Offset) {
+  StructType *STy = unwrap<StructType>(StructTy);
+  return unwrap(TD)->getStructLayout(STy)->getElementContainingOffset(Offset);
+}
+
+unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+                                       unsigned Element) {
+  StructType *STy = unwrap<StructType>(StructTy);
+  return unwrap(TD)->getStructLayout(STy)->getElementOffset(Element);
+}
+
+void LLVMDisposeTargetData(LLVMTargetDataRef TD) {
+  delete unwrap(TD);
+}
diff --git a/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp b/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp
new file mode 100644
index 000000000000..64bd56f6e7df
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetIntrinsicInfo.cpp
@@ -0,0 +1,30 @@
+//===-- TargetIntrinsicInfo.cpp - Target Instruction Information ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetIntrinsicInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Function.h"
+using namespace llvm;
+
+TargetIntrinsicInfo::TargetIntrinsicInfo() {
+}
+
+TargetIntrinsicInfo::~TargetIntrinsicInfo() {
+}
+
+unsigned TargetIntrinsicInfo::getIntrinsicID(Function *F) const {
+  const ValueName *ValName = F->getValueName();
+  if (!ValName)
+    return 0;
+  return lookupName(ValName->getKeyData(), ValName->getKeyLength());
+}
diff --git a/contrib/llvm/lib/Target/TargetJITInfo.cpp b/contrib/llvm/lib/Target/TargetJITInfo.cpp
new file mode 100644
index 000000000000..aafedf8749b1
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetJITInfo.cpp
@@ -0,0 +1,14 @@
+//===- Target/TargetJITInfo.h - Target Information for JIT ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetJITInfo.h"
+
+using namespace llvm;
+
+void TargetJITInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/TargetLibraryInfo.cpp b/contrib/llvm/lib/Target/TargetLibraryInfo.cpp
new file mode 100644
index 000000000000..ee88ce77c09f
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetLibraryInfo.cpp
@@ -0,0 +1,645 @@
+//===-- TargetLibraryInfo.cpp - Runtime library information ----------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetLibraryInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/ADT/Triple.h"
+using namespace llvm;
+
+// Register the default implementation.
+INITIALIZE_PASS(TargetLibraryInfo, "targetlibinfo",
+                "Target Library Information", false, true)
+char TargetLibraryInfo::ID = 0;
+
+void TargetLibraryInfo::anchor() { }
+
+const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
+  {
+    "_IO_getc",
+    "_IO_putc",
+    "_ZdaPv",
+    "_ZdlPv",
+    "_Znaj",
+    "_ZnajRKSt9nothrow_t",
+    "_Znam",
+    "_ZnamRKSt9nothrow_t",
+    "_Znwj",
+    "_ZnwjRKSt9nothrow_t",
+    "_Znwm",
+    "_ZnwmRKSt9nothrow_t",
+    "__cxa_atexit",
+    "__cxa_guard_abort",
+    "__cxa_guard_acquire",
+    "__cxa_guard_release",
+    "__isoc99_scanf",
+    "__isoc99_sscanf",
+    "__memcpy_chk",
+    "__strdup",
+    "__strndup",
+    "__strtok_r",
+    "abs",
+    "access",
+    "acos",
+    "acosf",
+    "acosh",
+    "acoshf",
+    "acoshl",
+    "acosl",
+    "asin",
+    "asinf",
+    "asinh",
+    "asinhf",
+    "asinhl",
+    "asinl",
+    "atan",
+    "atan2",
+    "atan2f",
+    "atan2l",
+    "atanf",
+    "atanh",
+    "atanhf",
+    "atanhl",
+    "atanl",
+    "atof",
+    "atoi",
+    "atol",
+    "atoll",
+    "bcmp",
+    "bcopy",
+    "bzero",
+    "calloc",
+    "cbrt",
+    "cbrtf",
+    "cbrtl",
+    "ceil",
+    "ceilf",
+    "ceill",
+    "chmod",
+    "chown",
+    "clearerr",
+    "closedir",
+    "copysign",
+    "copysignf",
+    "copysignl",
+    "cos",
+    "cosf",
+    "cosh",
+    "coshf",
+    "coshl",
+    "cosl",
+    "ctermid",
+    "exp",
+    "exp10",
+    "exp10f",
+    "exp10l",
+    "exp2",
+    "exp2f",
+    "exp2l",
+    "expf",
+    "expl",
+    "expm1",
+    "expm1f",
+    "expm1l",
+    "fabs",
+    "fabsf",
+    "fabsl",
+    "fclose",
+    "fdopen",
+    "feof",
+    "ferror",
+    "fflush",
+    "ffs",
+    "ffsl",
+    "ffsll",
+    "fgetc",
+    "fgetpos",
+    "fgets",
+    "fileno",
+    "fiprintf",
+    "flockfile",
+    "floor",
+    "floorf",
+    "floorl",
+    "fmod",
+    "fmodf",
+    "fmodl",
+    "fopen",
+    "fopen64",
+    "fprintf",
+    "fputc",
+    "fputs",
+    "fread",
+    "free",
+    "frexp",
+    "frexpf",
+    "frexpl",
+    "fscanf",
+    "fseek",
+    "fseeko",
+    "fseeko64",
+    "fsetpos",
+    "fstat",
+    "fstat64",
+    "fstatvfs",
+    "fstatvfs64",
+    "ftell",
+    "ftello",
+    "ftello64",
+    "ftrylockfile",
+    "funlockfile",
+    "fwrite",
+    "getc",
+    "getc_unlocked",
+    "getchar",
+    "getenv",
+    "getitimer",
+    "getlogin_r",
+    "getpwnam",
+    "gets",
+    "htonl",
+    "htons",
+    "iprintf",
+    "isascii",
+    "isdigit",
+    "labs",
+    "lchown",
+    "llabs",
+    "log",
+    "log10",
+    "log10f",
+    "log10l",
+    "log1p",
+    "log1pf",
+    "log1pl",
+    "log2",
+    "log2f",
+    "log2l",
+    "logb",
+    "logbf",
+    "logbl",
+    "logf",
+    "logl",
+    "lstat",
+    "lstat64",
+    "malloc",
+    "memalign",
+    "memccpy",
+    "memchr",
+    "memcmp",
+    "memcpy",
+    "memmove",
+    "memrchr",
+    "memset",
+    "memset_pattern16",
+    "mkdir",
+    "mktime",
+    "modf",
+    "modff",
+    "modfl",
+    "nearbyint",
+    "nearbyintf",
+    "nearbyintl",
+    "ntohl",
+    "ntohs",
+    "open",
+    "open64",
+    "opendir",
+    "pclose",
+    "perror",
+    "popen",
+    "posix_memalign",
+    "pow",
+    "powf",
+    "powl",
+    "pread",
+    "printf",
+    "putc",
+    "putchar",
+    "puts",
+    "pwrite",
+    "qsort",
+    "read",
+    "readlink",
+    "realloc",
+    "reallocf",
+    "realpath",
+    "remove",
+    "rename",
+    "rewind",
+    "rint",
+    "rintf",
+    "rintl",
+    "rmdir",
+    "round",
+    "roundf",
+    "roundl",
+    "scanf",
+    "setbuf",
+    "setitimer",
+    "setvbuf",
+    "sin",
+    "sinf",
+    "sinh",
+    "sinhf",
+    "sinhl",
+    "sinl",
+    "siprintf",
+    "snprintf",
+    "sprintf",
+    "sqrt",
+    "sqrtf",
+    "sqrtl",
+    "sscanf",
+    "stat",
+    "stat64",
+    "statvfs",
+    "statvfs64",
+    "stpcpy",
+    "stpncpy",
+    "strcasecmp",
+    "strcat",
+    "strchr",
+    "strcmp",
+    "strcoll",
+    "strcpy",
+    "strcspn",
+    "strdup",
+    "strlen",
+    "strncasecmp",
+    "strncat",
+    "strncmp",
+    "strncpy",
+    "strndup",
+    "strnlen",
+    "strpbrk",
+    "strrchr",
+    "strspn",
+    "strstr",
+    "strtod",
+    "strtof",
+    "strtok",
+    "strtok_r",
+    "strtol",
+    "strtold",
+    "strtoll",
+    "strtoul",
+    "strtoull",
+    "strxfrm",
+    "system",
+    "tan",
+    "tanf",
+    "tanh",
+    "tanhf",
+    "tanhl",
+    "tanl",
+    "times",
+    "tmpfile",
+    "tmpfile64",
+    "toascii",
+    "trunc",
+    "truncf",
+    "truncl",
+    "uname",
+    "ungetc",
+    "unlink",
+    "unsetenv",
+    "utime",
+    "utimes",
+    "valloc",
+    "vfprintf",
+    "vfscanf",
+    "vprintf",
+    "vscanf",
+    "vsnprintf",
+    "vsprintf",
+    "vsscanf",
+    "write"
+  };
+
+/// initialize - Initialize the set of available library functions based on the
+/// specified target triple.  This should be carefully written so that a missing
+/// target triple gets a sane set of defaults.
+static void initialize(TargetLibraryInfo &TLI, const Triple &T,
+                       const char **StandardNames) {
+  initializeTargetLibraryInfoPass(*PassRegistry::getPassRegistry());
+
+#ifndef NDEBUG
+  // Verify that the StandardNames array is in alphabetical order.
+  for (unsigned F = 1; F < LibFunc::NumLibFuncs; ++F) {
+    if (strcmp(StandardNames[F-1], StandardNames[F]) >= 0)
+      llvm_unreachable("TargetLibraryInfo function names must be sorted");
+  }
+#endif // !NDEBUG
+  
+  // memset_pattern16 is only available on iOS 3.0 and Mac OS/X 10.5 and later.
+  if (T.isMacOSX()) {
+    if (T.isMacOSXVersionLT(10, 5))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else if (T.getOS() == Triple::IOS) {
+    if (T.isOSVersionLT(3, 0))
+      TLI.setUnavailable(LibFunc::memset_pattern16);
+  } else {
+    TLI.setUnavailable(LibFunc::memset_pattern16);
+  }
+
+  if (T.isMacOSX() && T.getArch() == Triple::x86 &&
+      !T.isMacOSXVersionLT(10, 7)) {
+    // x86-32 OSX has a scheme where fwrite and fputs (and some other functions
+    // we don't care about) have two versions; on recent OSX, the one we want
+    // has a $UNIX2003 suffix. The two implementations are identical except
+    // for the return value in some edge cases.  However, we don't want to
+    // generate code that depends on the old symbols.
+    TLI.setAvailableWithName(LibFunc::fwrite, "fwrite$UNIX2003");
+    TLI.setAvailableWithName(LibFunc::fputs, "fputs$UNIX2003");
+  }
+
+  // iprintf and friends are only available on XCore and TCE.
+  if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) {
+    TLI.setUnavailable(LibFunc::iprintf);
+    TLI.setUnavailable(LibFunc::siprintf);
+    TLI.setUnavailable(LibFunc::fiprintf);
+  }
+
+  if (T.getOS() == Triple::Win32) {
+    // Win32 does not support long double
+    TLI.setUnavailable(LibFunc::acosl);
+    TLI.setUnavailable(LibFunc::asinl);
+    TLI.setUnavailable(LibFunc::atanl);
+    TLI.setUnavailable(LibFunc::atan2l);
+    TLI.setUnavailable(LibFunc::ceill);
+    TLI.setUnavailable(LibFunc::copysignl);
+    TLI.setUnavailable(LibFunc::cosl);
+    TLI.setUnavailable(LibFunc::coshl);
+    TLI.setUnavailable(LibFunc::expl);
+    TLI.setUnavailable(LibFunc::fabsf); // Win32 and Win64 both lack fabsf
+    TLI.setUnavailable(LibFunc::fabsl);
+    TLI.setUnavailable(LibFunc::floorl);
+    TLI.setUnavailable(LibFunc::fmodl);
+    TLI.setUnavailable(LibFunc::frexpl);
+    TLI.setUnavailable(LibFunc::logl);
+    TLI.setUnavailable(LibFunc::modfl);
+    TLI.setUnavailable(LibFunc::powl);
+    TLI.setUnavailable(LibFunc::sinl);
+    TLI.setUnavailable(LibFunc::sinhl);
+    TLI.setUnavailable(LibFunc::sqrtl);
+    TLI.setUnavailable(LibFunc::tanl);
+    TLI.setUnavailable(LibFunc::tanhl);
+
+    // Win32 only has C89 math
+    TLI.setUnavailable(LibFunc::acosh);
+    TLI.setUnavailable(LibFunc::acoshf);
+    TLI.setUnavailable(LibFunc::acoshl);
+    TLI.setUnavailable(LibFunc::asinh);
+    TLI.setUnavailable(LibFunc::asinhf);
+    TLI.setUnavailable(LibFunc::asinhl);
+    TLI.setUnavailable(LibFunc::atanh);
+    TLI.setUnavailable(LibFunc::atanhf);
+    TLI.setUnavailable(LibFunc::atanhl);
+    TLI.setUnavailable(LibFunc::cbrt);
+    TLI.setUnavailable(LibFunc::cbrtf);
+    TLI.setUnavailable(LibFunc::cbrtl);
+    TLI.setUnavailable(LibFunc::exp10);
+    TLI.setUnavailable(LibFunc::exp10f);
+    TLI.setUnavailable(LibFunc::exp10l);
+    TLI.setUnavailable(LibFunc::exp2);
+    TLI.setUnavailable(LibFunc::exp2f);
+    TLI.setUnavailable(LibFunc::exp2l);
+    TLI.setUnavailable(LibFunc::expm1);
+    TLI.setUnavailable(LibFunc::expm1f);
+    TLI.setUnavailable(LibFunc::expm1l);
+    TLI.setUnavailable(LibFunc::log2);
+    TLI.setUnavailable(LibFunc::log2f);
+    TLI.setUnavailable(LibFunc::log2l);
+    TLI.setUnavailable(LibFunc::log1p);
+    TLI.setUnavailable(LibFunc::log1pf);
+    TLI.setUnavailable(LibFunc::log1pl);
+    TLI.setUnavailable(LibFunc::logb);
+    TLI.setUnavailable(LibFunc::logbf);
+    TLI.setUnavailable(LibFunc::logbl);
+    TLI.setUnavailable(LibFunc::nearbyint);
+    TLI.setUnavailable(LibFunc::nearbyintf);
+    TLI.setUnavailable(LibFunc::nearbyintl);
+    TLI.setUnavailable(LibFunc::rint);
+    TLI.setUnavailable(LibFunc::rintf);
+    TLI.setUnavailable(LibFunc::rintl);
+    TLI.setUnavailable(LibFunc::round);
+    TLI.setUnavailable(LibFunc::roundf);
+    TLI.setUnavailable(LibFunc::roundl);
+    TLI.setUnavailable(LibFunc::trunc);
+    TLI.setUnavailable(LibFunc::truncf);
+    TLI.setUnavailable(LibFunc::truncl);
+
+    // Win32 provides some C99 math with mangled names
+    TLI.setAvailableWithName(LibFunc::copysign, "_copysign");
+
+    if (T.getArch() == Triple::x86) {
+      // Win32 on x86 implements single-precision math functions as macros
+      TLI.setUnavailable(LibFunc::acosf);
+      TLI.setUnavailable(LibFunc::asinf);
+      TLI.setUnavailable(LibFunc::atanf);
+      TLI.setUnavailable(LibFunc::atan2f);
+      TLI.setUnavailable(LibFunc::ceilf);
+      TLI.setUnavailable(LibFunc::copysignf);
+      TLI.setUnavailable(LibFunc::cosf);
+      TLI.setUnavailable(LibFunc::coshf);
+      TLI.setUnavailable(LibFunc::expf);
+      TLI.setUnavailable(LibFunc::floorf);
+      TLI.setUnavailable(LibFunc::fmodf);
+      TLI.setUnavailable(LibFunc::logf);
+      TLI.setUnavailable(LibFunc::powf);
+      TLI.setUnavailable(LibFunc::sinf);
+      TLI.setUnavailable(LibFunc::sinhf);
+      TLI.setUnavailable(LibFunc::sqrtf);
+      TLI.setUnavailable(LibFunc::tanf);
+      TLI.setUnavailable(LibFunc::tanhf);
+    }
+
+    // Win32 does *not* provide provide these functions, but they are
+    // generally available on POSIX-compliant systems:
+    TLI.setUnavailable(LibFunc::access);
+    TLI.setUnavailable(LibFunc::bcmp);
+    TLI.setUnavailable(LibFunc::bcopy);
+    TLI.setUnavailable(LibFunc::bzero);
+    TLI.setUnavailable(LibFunc::chmod);
+    TLI.setUnavailable(LibFunc::chown);
+    TLI.setUnavailable(LibFunc::closedir);
+    TLI.setUnavailable(LibFunc::ctermid);
+    TLI.setUnavailable(LibFunc::fdopen);
+    TLI.setUnavailable(LibFunc::ffs);
+    TLI.setUnavailable(LibFunc::fileno);
+    TLI.setUnavailable(LibFunc::flockfile);
+    TLI.setUnavailable(LibFunc::fseeko);
+    TLI.setUnavailable(LibFunc::fstat);
+    TLI.setUnavailable(LibFunc::fstatvfs);
+    TLI.setUnavailable(LibFunc::ftello);
+    TLI.setUnavailable(LibFunc::ftrylockfile);
+    TLI.setUnavailable(LibFunc::funlockfile);
+    TLI.setUnavailable(LibFunc::getc_unlocked);
+    TLI.setUnavailable(LibFunc::getitimer);
+    TLI.setUnavailable(LibFunc::getlogin_r);
+    TLI.setUnavailable(LibFunc::getpwnam);
+    TLI.setUnavailable(LibFunc::htonl);
+    TLI.setUnavailable(LibFunc::htons);
+    TLI.setUnavailable(LibFunc::lchown);
+    TLI.setUnavailable(LibFunc::lstat);
+    TLI.setUnavailable(LibFunc::memccpy);
+    TLI.setUnavailable(LibFunc::mkdir);
+    TLI.setUnavailable(LibFunc::ntohl);
+    TLI.setUnavailable(LibFunc::ntohs);
+    TLI.setUnavailable(LibFunc::open);
+    TLI.setUnavailable(LibFunc::opendir);
+    TLI.setUnavailable(LibFunc::pclose);
+    TLI.setUnavailable(LibFunc::popen);
+    TLI.setUnavailable(LibFunc::pread);
+    TLI.setUnavailable(LibFunc::pwrite);
+    TLI.setUnavailable(LibFunc::read);
+    TLI.setUnavailable(LibFunc::readlink);
+    TLI.setUnavailable(LibFunc::realpath);
+    TLI.setUnavailable(LibFunc::rmdir);
+    TLI.setUnavailable(LibFunc::setitimer);
+    TLI.setUnavailable(LibFunc::stat);
+    TLI.setUnavailable(LibFunc::statvfs);
+    TLI.setUnavailable(LibFunc::stpcpy);
+    TLI.setUnavailable(LibFunc::stpncpy);
+    TLI.setUnavailable(LibFunc::strcasecmp);
+    TLI.setUnavailable(LibFunc::strncasecmp);
+    TLI.setUnavailable(LibFunc::times);
+    TLI.setUnavailable(LibFunc::uname);
+    TLI.setUnavailable(LibFunc::unlink);
+    TLI.setUnavailable(LibFunc::unsetenv);
+    TLI.setUnavailable(LibFunc::utime);
+    TLI.setUnavailable(LibFunc::utimes);
+    TLI.setUnavailable(LibFunc::write);
+
+    // Win32 does *not* provide provide these functions, but they are
+    // specified by C99:
+    TLI.setUnavailable(LibFunc::atoll);
+    TLI.setUnavailable(LibFunc::frexpf);
+    TLI.setUnavailable(LibFunc::llabs);
+  }
+
+  // ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and
+  // Linux (GLIBC):
+  // http://developer.apple.com/library/mac/#documentation/Darwin/Reference/ManPages/man3/ffsl.3.html
+  // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsl.c
+  // http://www.gnu.org/software/gnulib/manual/html_node/ffsl.html
+  switch (T.getOS()) {
+  case Triple::Darwin:
+  case Triple::MacOSX:
+  case Triple::IOS:
+  case Triple::FreeBSD:
+  case Triple::Linux:
+    break;
+  default:
+    TLI.setUnavailable(LibFunc::ffsl);
+  }
+
+  // ffsll is available on at least FreeBSD and Linux (GLIBC):
+  // http://svn.freebsd.org/base/user/eri/pf45/head/lib/libc/string/ffsll.c
+  // http://www.gnu.org/software/gnulib/manual/html_node/ffsll.html
+  switch (T.getOS()) {
+  case Triple::FreeBSD:
+  case Triple::Linux:
+    break;
+  default:
+    TLI.setUnavailable(LibFunc::ffsll);
+  }
+
+  // The following functions are available on at least Linux:
+  if (T.getOS() != Triple::Linux) {
+    TLI.setUnavailable(LibFunc::dunder_strdup);
+    TLI.setUnavailable(LibFunc::dunder_strtok_r);
+    TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
+    TLI.setUnavailable(LibFunc::dunder_isoc99_sscanf);
+    TLI.setUnavailable(LibFunc::under_IO_getc);
+    TLI.setUnavailable(LibFunc::under_IO_putc);
+    TLI.setUnavailable(LibFunc::memalign);
+    TLI.setUnavailable(LibFunc::fopen64);
+    TLI.setUnavailable(LibFunc::fseeko64);
+    TLI.setUnavailable(LibFunc::fstat64);
+    TLI.setUnavailable(LibFunc::fstatvfs64);
+    TLI.setUnavailable(LibFunc::ftello64);
+    TLI.setUnavailable(LibFunc::lstat64);
+    TLI.setUnavailable(LibFunc::open64);
+    TLI.setUnavailable(LibFunc::stat64);
+    TLI.setUnavailable(LibFunc::statvfs64);
+    TLI.setUnavailable(LibFunc::tmpfile64);
+  }
+}
+
+
+TargetLibraryInfo::TargetLibraryInfo() : ImmutablePass(ID) {
+  // Default to everything being available.
+  memset(AvailableArray, -1, sizeof(AvailableArray));
+
+  initialize(*this, Triple(), StandardNames);
+}
+
+TargetLibraryInfo::TargetLibraryInfo(const Triple &T) : ImmutablePass(ID) {
+  // Default to everything being available.
+  memset(AvailableArray, -1, sizeof(AvailableArray));
+  
+  initialize(*this, T, StandardNames);
+}
+
+TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI)
+  : ImmutablePass(ID) {
+  memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray));
+  CustomNames = TLI.CustomNames;
+}
+
+namespace {
+struct StringComparator {
+  /// Compare two strings and return true if LHS is lexicographically less than
+  /// RHS. Requires that RHS doesn't contain any zero bytes.
+  bool operator()(const char *LHS, StringRef RHS) const {
+    // Compare prefixes with strncmp. If prefixes match we know that LHS is
+    // greater or equal to RHS as RHS can't contain any '\0'.
+    return std::strncmp(LHS, RHS.data(), RHS.size()) < 0;
+  }
+
+  // Provided for compatibility with MSVC's debug mode.
+  bool operator()(StringRef LHS, const char *RHS) const { return LHS < RHS; }
+  bool operator()(StringRef LHS, StringRef RHS) const { return LHS < RHS; }
+  bool operator()(const char *LHS, const char *RHS) const {
+    return std::strcmp(LHS, RHS) < 0;
+  }
+};
+}
+
+bool TargetLibraryInfo::getLibFunc(StringRef funcName,
+                                   LibFunc::Func &F) const {
+  const char **Start = &StandardNames[0];
+  const char **End = &StandardNames[LibFunc::NumLibFuncs];
+
+  // Filter out empty names and names containing null bytes, those can't be in
+  // our table.
+  if (funcName.empty() || funcName.find('\0') != StringRef::npos)
+    return false;
+
+  // Check for \01 prefix that is used to mangle __asm declarations and
+  // strip it if present.
+  if (funcName.front() == '\01')
+    funcName = funcName.substr(1);
+  const char **I = std::lower_bound(Start, End, funcName, StringComparator());
+  if (I != End && *I == funcName) {
+    F = (LibFunc::Func)(I - Start);
+    return true;
+  }
+  return false;
+}
+
+/// disableAllFunctions - This disables all builtins, which is used for options
+/// like -fno-builtin.
+void TargetLibraryInfo::disableAllFunctions() {
+  memset(AvailableArray, 0, sizeof(AvailableArray));
+}
diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
new file mode 100644
index 000000000000..f5121e34f77f
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -0,0 +1,319 @@
+//===-- llvm/Target/TargetLoweringObjectFile.cpp - Object File Info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes used to handle lowerings specific to common
+// object file formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                              Generic Code
+//===----------------------------------------------------------------------===//
+
+/// Initialize - this method must be called before any actual lowering is
+/// done.  This specifies the current context for codegen, and gives the
+/// lowering implementations a chance to set up their default sections.
+void TargetLoweringObjectFile::Initialize(MCContext &ctx,
+                                          const TargetMachine &TM) {
+  Ctx = &ctx;
+  InitMCObjectFileInfo(TM.getTargetTriple(),
+                       TM.getRelocationModel(), TM.getCodeModel(), *Ctx);
+}
+  
+TargetLoweringObjectFile::~TargetLoweringObjectFile() {
+}
+
+static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) {
+  const Constant *C = GV->getInitializer();
+
+  // Must have zero initializer.
+  if (!C->isNullValue())
+    return false;
+
+  // Leave constant zeros in readonly constant sections, so they can be shared.
+  if (GV->isConstant())
+    return false;
+
+  // If the global has an explicit section specified, don't put it in BSS.
+  if (!GV->getSection().empty())
+    return false;
+
+  // If -nozero-initialized-in-bss is specified, don't ever use BSS.
+  if (NoZerosInBSS)
+    return false;
+
+  // Otherwise, put it in BSS!
+  return true;
+}
+
+/// IsNullTerminatedString - Return true if the specified constant (which is
+/// known to have a type that is an array of 1/2/4 byte elements) ends with a
+/// nul value and contains no other nuls in it.  Note that this is more general
+/// than ConstantDataSequential::isString because we allow 2 & 4 byte strings.
+static bool IsNullTerminatedString(const Constant *C) {
+  // First check: is we have constant array terminated with zero
+  if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(C)) {
+    unsigned NumElts = CDS->getNumElements();
+    assert(NumElts != 0 && "Can't have an empty CDS");
+    
+    if (CDS->getElementAsInteger(NumElts-1) != 0)
+      return false; // Not null terminated.
+    
+    // Verify that the null doesn't occur anywhere else in the string.
+    for (unsigned i = 0; i != NumElts-1; ++i)
+      if (CDS->getElementAsInteger(i) == 0)
+        return false;
+    return true;
+  }
+
+  // Another possibility: [1 x i8] zeroinitializer
+  if (isa<ConstantAggregateZero>(C))
+    return cast<ArrayType>(C->getType())->getNumElements() == 1;
+
+  return false;
+}
+
+MCSymbol *TargetLoweringObjectFile::
+getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI) const {
+  return Mang->getSymbol(GV);
+}
+
+void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
+                                                    const TargetMachine &TM,
+                                                    const MCSymbol *Sym) const {
+}
+
+
+/// getKindForGlobal - This is a top-level target-independent classifier for
+/// a global variable.  Given an global variable and information from TM, it
+/// classifies the global in a variety of ways that make various target
+/// implementations simpler.  The target implementation is free to ignore this
+/// extra info of course.
+SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
+                                                       const TargetMachine &TM){
+  assert(!GV->isDeclaration() && !GV->hasAvailableExternallyLinkage() &&
+         "Can only be used for global definitions");
+
+  Reloc::Model ReloModel = TM.getRelocationModel();
+
+  // Early exit - functions should be always in text sections.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (GVar == 0)
+    return SectionKind::getText();
+
+  // Handle thread-local data first.
+  if (GVar->isThreadLocal()) {
+    if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS))
+      return SectionKind::getThreadBSS();
+    return SectionKind::getThreadData();
+  }
+
+  // Variables with common linkage always get classified as common.
+  if (GVar->hasCommonLinkage())
+    return SectionKind::getCommon();
+
+  // Variable can be easily put to BSS section.
+  if (isSuitableForBSS(GVar, TM.Options.NoZerosInBSS)) {
+    if (GVar->hasLocalLinkage())
+      return SectionKind::getBSSLocal();
+    else if (GVar->hasExternalLinkage())
+      return SectionKind::getBSSExtern();
+    return SectionKind::getBSS();
+  }
+
+  const Constant *C = GVar->getInitializer();
+
+  // If the global is marked constant, we can put it into a mergable section,
+  // a mergable string section, or general .data if it contains relocations.
+  if (GVar->isConstant()) {
+    // If the initializer for the global contains something that requires a
+    // relocation, then we may have to drop this into a writable data section
+    // even though it is marked const.
+    switch (C->getRelocationInfo()) {
+    case Constant::NoRelocation:
+      // If the global is required to have a unique address, it can't be put
+      // into a mergable section: just drop it into the general read-only
+      // section instead.
+      if (!GVar->hasUnnamedAddr())
+        return SectionKind::getReadOnly();
+        
+      // If initializer is a null-terminated string, put it in a "cstring"
+      // section of the right width.
+      if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
+        if (IntegerType *ITy =
+              dyn_cast<IntegerType>(ATy->getElementType())) {
+          if ((ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16 ||
+               ITy->getBitWidth() == 32) &&
+              IsNullTerminatedString(C)) {
+            if (ITy->getBitWidth() == 8)
+              return SectionKind::getMergeable1ByteCString();
+            if (ITy->getBitWidth() == 16)
+              return SectionKind::getMergeable2ByteCString();
+
+            assert(ITy->getBitWidth() == 32 && "Unknown width");
+            return SectionKind::getMergeable4ByteCString();
+          }
+        }
+      }
+
+      // Otherwise, just drop it into a mergable constant section.  If we have
+      // a section for this size, use it, otherwise use the arbitrary sized
+      // mergable section.
+      switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) {
+      case 4:  return SectionKind::getMergeableConst4();
+      case 8:  return SectionKind::getMergeableConst8();
+      case 16: return SectionKind::getMergeableConst16();
+      default: return SectionKind::getMergeableConst();
+      }
+
+    case Constant::LocalRelocation:
+      // In static relocation model, the linker will resolve all addresses, so
+      // the relocation entries will actually be constants by the time the app
+      // starts up.  However, we can't put this into a mergable section, because
+      // the linker doesn't take relocations into consideration when it tries to
+      // merge entries in the section.
+      if (ReloModel == Reloc::Static)
+        return SectionKind::getReadOnly();
+
+      // Otherwise, the dynamic linker needs to fix it up, put it in the
+      // writable data.rel.local section.
+      return SectionKind::getReadOnlyWithRelLocal();
+
+    case Constant::GlobalRelocations:
+      // In static relocation model, the linker will resolve all addresses, so
+      // the relocation entries will actually be constants by the time the app
+      // starts up.  However, we can't put this into a mergable section, because
+      // the linker doesn't take relocations into consideration when it tries to
+      // merge entries in the section.
+      if (ReloModel == Reloc::Static)
+        return SectionKind::getReadOnly();
+
+      // Otherwise, the dynamic linker needs to fix it up, put it in the
+      // writable data.rel section.
+      return SectionKind::getReadOnlyWithRel();
+    }
+  }
+
+  // Okay, this isn't a constant.  If the initializer for the global is going
+  // to require a runtime relocation by the dynamic linker, put it into a more
+  // specific section to improve startup time of the app.  This coalesces these
+  // globals together onto fewer pages, improving the locality of the dynamic
+  // linker.
+  if (ReloModel == Reloc::Static)
+    return SectionKind::getDataNoRel();
+
+  switch (C->getRelocationInfo()) {
+  case Constant::NoRelocation:
+    return SectionKind::getDataNoRel();
+  case Constant::LocalRelocation:
+    return SectionKind::getDataRelLocal();
+  case Constant::GlobalRelocations:
+    return SectionKind::getDataRel();
+  }
+  llvm_unreachable("Invalid relocation");
+}
+
+/// SectionForGlobal - This method computes the appropriate section to emit
+/// the specified global variable or function definition.  This should not
+/// be passed external (or available externally) globals.
+const MCSection *TargetLoweringObjectFile::
+SectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler *Mang,
+                 const TargetMachine &TM) const {
+  // Select section name.
+  if (GV->hasSection())
+    return getExplicitSectionGlobal(GV, Kind, Mang, TM);
+
+
+  // Use default section depending on the 'type' of global
+  return SelectSectionForGlobal(GV, Kind, Mang, TM);
+}
+
+
+// Lame default implementation. Calculate the section name for global.
+const MCSection *
+TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
+                                                 SectionKind Kind,
+                                                 Mangler *Mang,
+                                                 const TargetMachine &TM) const{
+  assert(!Kind.isThreadLocal() && "Doesn't support TLS");
+
+  if (Kind.isText())
+    return getTextSection();
+
+  if (Kind.isBSS() && BSSSection != 0)
+    return BSSSection;
+
+  if (Kind.isReadOnly() && ReadOnlySection != 0)
+    return ReadOnlySection;
+
+  return getDataSection();
+}
+
+/// getSectionForConstant - Given a mergable constant with the
+/// specified size and relocation information, return a section that it
+/// should be placed in.
+const MCSection *
+TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind) const {
+  if (Kind.isReadOnly() && ReadOnlySection != 0)
+    return ReadOnlySection;
+
+  return DataSection;
+}
+
+/// getTTypeGlobalReference - Return an MCExpr to use for a
+/// reference to the specified global variable from exception
+/// handling information.
+const MCExpr *TargetLoweringObjectFile::
+getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI, unsigned Encoding,
+                        MCStreamer &Streamer) const {
+  const MCSymbolRefExpr *Ref =
+    MCSymbolRefExpr::Create(Mang->getSymbol(GV), getContext());
+
+  return getTTypeReference(Ref, Encoding, Streamer);
+}
+
+const MCExpr *TargetLoweringObjectFile::
+getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding,
+                  MCStreamer &Streamer) const {
+  switch (Encoding & 0x70) {
+  default:
+    report_fatal_error("We do not support this DWARF encoding yet!");
+  case dwarf::DW_EH_PE_absptr:
+    // Do nothing special
+    return Sym;
+  case dwarf::DW_EH_PE_pcrel: {
+    // Emit a label to the streamer for the current position.  This gives us
+    // .-foo addressing.
+    MCSymbol *PCSym = getContext().CreateTempSymbol();
+    Streamer.EmitLabel(PCSym);
+    const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
+    return MCBinaryExpr::CreateSub(Sym, PC, getContext());
+  }
+  }
+}
diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp
new file mode 100644
index 000000000000..e7282519d597
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetMachine.cpp
@@ -0,0 +1,190 @@
+//===-- TargetMachine.cpp - General Target Information ---------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// Command-line options that tend to be useful on more than one back-end.
+//
+
+namespace llvm {
+  bool HasDivModLibcall;
+  bool AsmVerbosityDefault(false);
+}
+
+static cl::opt<bool>
+DataSections("fdata-sections",
+  cl::desc("Emit data into separate sections"),
+  cl::init(false));
+static cl::opt<bool>
+FunctionSections("ffunction-sections",
+  cl::desc("Emit functions into separate sections"),
+  cl::init(false));
+
+//---------------------------------------------------------------------------
+// TargetMachine Class
+//
+
+TargetMachine::TargetMachine(const Target &T,
+                             StringRef TT, StringRef CPU, StringRef FS,
+                             const TargetOptions &Options)
+  : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS),
+    CodeGenInfo(0), AsmInfo(0),
+    MCRelaxAll(false),
+    MCNoExecStack(false),
+    MCSaveTempLabels(false),
+    MCUseLoc(true),
+    MCUseCFI(true),
+    MCUseDwarfDirectory(false),
+    Options(Options) {
+}
+
+TargetMachine::~TargetMachine() {
+  delete CodeGenInfo;
+  delete AsmInfo;
+}
+
+/// \brief Reset the target options based on the function's attributes.
+void TargetMachine::resetTargetOptions(const MachineFunction *MF) const {
+  const Function *F = MF->getFunction();
+  TargetOptions &TO = MF->getTarget().Options;
+  
+#define RESET_OPTION(X, Y)                                              \
+  do {                                                                  \
+    if (F->hasFnAttribute(Y))                                           \
+      TO.X =                                                            \
+        (F->getAttributes().                                            \
+           getAttribute(AttributeSet::FunctionIndex,                    \
+                        Y).getValueAsString() == "true");               \
+  } while (0)
+
+  RESET_OPTION(NoFramePointerElim, "no-frame-pointer-elim");
+  RESET_OPTION(NoFramePointerElimNonLeaf, "no-frame-pointer-elim-non-leaf");
+  RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
+  RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
+  RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
+  RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
+  RESET_OPTION(UseSoftFloat, "use-soft-float");
+  RESET_OPTION(DisableTailCalls, "disable-tail-calls");
+}
+
+/// getRelocationModel - Returns the code generation relocation model. The
+/// choices are static, PIC, and dynamic-no-pic, and target default.
+Reloc::Model TargetMachine::getRelocationModel() const {
+  if (!CodeGenInfo)
+    return Reloc::Default;
+  return CodeGenInfo->getRelocationModel();
+}
+
+/// getCodeModel - Returns the code model. The choices are small, kernel,
+/// medium, large, and target default.
+CodeModel::Model TargetMachine::getCodeModel() const {
+  if (!CodeGenInfo)
+    return CodeModel::Default;
+  return CodeGenInfo->getCodeModel();
+}
+
+/// Get the IR-specified TLS model for Var.
+static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) {
+  switch (Var->getThreadLocalMode()) {
+  case GlobalVariable::NotThreadLocal:
+    llvm_unreachable("getSelectedTLSModel for non-TLS variable");
+    break;
+  case GlobalVariable::GeneralDynamicTLSModel:
+    return TLSModel::GeneralDynamic;
+  case GlobalVariable::LocalDynamicTLSModel:
+    return TLSModel::LocalDynamic;
+  case GlobalVariable::InitialExecTLSModel:
+    return TLSModel::InitialExec;
+  case GlobalVariable::LocalExecTLSModel:
+    return TLSModel::LocalExec;
+  }
+  llvm_unreachable("invalid TLS model");
+}
+
+TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
+  // If GV is an alias then use the aliasee for determining
+  // thread-localness.
+  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+    GV = GA->resolveAliasedGlobal(false);
+  const GlobalVariable *Var = cast<GlobalVariable>(GV);
+
+  bool isLocal = Var->hasLocalLinkage();
+  bool isDeclaration = Var->isDeclaration();
+  bool isPIC = getRelocationModel() == Reloc::PIC_;
+  bool isPIE = Options.PositionIndependentExecutable;
+  // FIXME: what should we do for protected and internal visibility?
+  // For variables, is internal different from hidden?
+  bool isHidden = Var->hasHiddenVisibility();
+
+  TLSModel::Model Model;
+  if (isPIC && !isPIE) {
+    if (isLocal || isHidden)
+      Model = TLSModel::LocalDynamic;
+    else
+      Model = TLSModel::GeneralDynamic;
+  } else {
+    if (!isDeclaration || isHidden)
+      Model = TLSModel::LocalExec;
+    else
+      Model = TLSModel::InitialExec;
+  }
+
+  // If the user specified a more specific model, use that.
+  TLSModel::Model SelectedModel = getSelectedTLSModel(Var);
+  if (SelectedModel > Model)
+    return SelectedModel;
+
+  return Model;
+}
+
+/// getOptLevel - Returns the optimization level: None, Less,
+/// Default, or Aggressive.
+CodeGenOpt::Level TargetMachine::getOptLevel() const {
+  if (!CodeGenInfo)
+    return CodeGenOpt::Default;
+  return CodeGenInfo->getOptLevel();
+}
+
+bool TargetMachine::getAsmVerbosityDefault() {
+  return AsmVerbosityDefault;
+}
+
+void TargetMachine::setAsmVerbosityDefault(bool V) {
+  AsmVerbosityDefault = V;
+}
+
+bool TargetMachine::getFunctionSections() {
+  return FunctionSections;
+}
+
+bool TargetMachine::getDataSections() {
+  return DataSections;
+}
+
+void TargetMachine::setFunctionSections(bool V) {
+  FunctionSections = V;
+}
+
+void TargetMachine::setDataSections(bool V) {
+  DataSections = V;
+}
diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp
new file mode 100644
index 000000000000..79f74bd66127
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetMachineC.cpp
@@ -0,0 +1,197 @@
+//===-- TargetMachine.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVM-C part of TargetMachine.h
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/TargetMachine.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/Target.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+
+using namespace llvm;
+
+
+
+LLVMTargetRef LLVMGetFirstTarget() {
+   const Target* target = &*TargetRegistry::begin();
+   return wrap(target);
+}
+LLVMTargetRef LLVMGetNextTarget(LLVMTargetRef T) {
+  return wrap(unwrap(T)->getNext());
+}
+
+const char * LLVMGetTargetName(LLVMTargetRef T) {
+  return unwrap(T)->getName();
+}
+
+const char * LLVMGetTargetDescription(LLVMTargetRef T) {
+  return unwrap(T)->getShortDescription();
+}
+
+LLVMBool LLVMTargetHasJIT(LLVMTargetRef T) {
+  return unwrap(T)->hasJIT();
+}
+
+LLVMBool LLVMTargetHasTargetMachine(LLVMTargetRef T) {
+  return unwrap(T)->hasTargetMachine();
+}
+
+LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) {
+  return unwrap(T)->hasMCAsmBackend();
+}
+
+LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, char* Triple,
+  char* CPU, char* Features, LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
+  LLVMCodeModel CodeModel) {
+  Reloc::Model RM;
+  switch (Reloc){
+    case LLVMRelocStatic:
+      RM = Reloc::Static;
+      break;
+    case LLVMRelocPIC:
+      RM = Reloc::PIC_;
+      break;
+    case LLVMRelocDynamicNoPic:
+      RM = Reloc::DynamicNoPIC;
+      break;
+    default:
+      RM = Reloc::Default;
+      break;
+  }
+
+  CodeModel::Model CM;
+  switch (CodeModel) {
+    case LLVMCodeModelJITDefault:
+      CM = CodeModel::JITDefault;
+      break;
+    case LLVMCodeModelSmall:
+      CM = CodeModel::Small;
+      break;
+    case LLVMCodeModelKernel:
+      CM = CodeModel::Kernel;
+      break;
+    case LLVMCodeModelMedium:
+      CM = CodeModel::Medium;
+      break;
+    case LLVMCodeModelLarge:
+      CM = CodeModel::Large;
+      break;
+    default:
+      CM = CodeModel::Default;
+      break;
+  }
+  CodeGenOpt::Level OL;
+
+  switch (Level) {
+    case LLVMCodeGenLevelNone:
+      OL = CodeGenOpt::None;
+      break;
+    case LLVMCodeGenLevelLess:
+      OL = CodeGenOpt::Less;
+      break;
+    case LLVMCodeGenLevelAggressive:
+      OL = CodeGenOpt::Aggressive;
+      break;
+    default:
+      OL = CodeGenOpt::Default;
+      break;
+  }
+
+  TargetOptions opt;
+  return wrap(unwrap(T)->createTargetMachine(Triple, CPU, Features, opt, RM,
+    CM, OL));
+}
+
+
+void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) {
+  delete unwrap(T);
+}
+
+LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) {
+  const Target* target = &(unwrap(T)->getTarget());
+  return wrap(target);
+}
+
+char* LLVMGetTargetMachineTriple(LLVMTargetMachineRef T) {
+  std::string StringRep = unwrap(T)->getTargetTriple();
+  return strdup(StringRep.c_str());
+}
+
+char* LLVMGetTargetMachineCPU(LLVMTargetMachineRef T) {
+  std::string StringRep = unwrap(T)->getTargetCPU();
+  return strdup(StringRep.c_str());
+}
+
+char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) {
+  std::string StringRep = unwrap(T)->getTargetFeatureString();
+  return strdup(StringRep.c_str());
+}
+
+LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
+  return wrap(unwrap(T)->getDataLayout());
+}
+
+LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
+  char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
+  TargetMachine* TM = unwrap(T);
+  Module* Mod = unwrap(M);
+
+  PassManager pass;
+
+  std::string error;
+
+  const DataLayout* td = TM->getDataLayout();
+
+  if (!td) {
+    error = "No DataLayout in TargetMachine";
+    *ErrorMessage = strdup(error.c_str());
+    return true;
+  }
+  pass.add(new DataLayout(*td));
+
+  TargetMachine::CodeGenFileType ft;
+  switch (codegen) {
+    case LLVMAssemblyFile:
+      ft = TargetMachine::CGFT_AssemblyFile;
+      break;
+    default:
+      ft = TargetMachine::CGFT_ObjectFile;
+      break;
+  }
+  raw_fd_ostream dest(Filename, error, raw_fd_ostream::F_Binary);
+  formatted_raw_ostream destf(dest);
+  if (!error.empty()) {
+    *ErrorMessage = strdup(error.c_str());
+    return true;
+  }
+
+  if (TM->addPassesToEmitFile(pass, destf, ft)) {
+    error = "TargetMachine can't emit a file of this type";
+    *ErrorMessage = strdup(error.c_str());
+    return true;
+  }
+
+  pass.run(*Mod);
+
+  destf.flush();
+  dest.flush();
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp b/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp
new file mode 100644
index 000000000000..af0cef62d552
--- /dev/null
+++ b/contrib/llvm/lib/Target/TargetSubtargetInfo.cpp
@@ -0,0 +1,37 @@
+//===-- TargetSubtargetInfo.cpp - General Target Information ---------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// TargetSubtargetInfo Class
+//
+TargetSubtargetInfo::TargetSubtargetInfo() {}
+
+TargetSubtargetInfo::~TargetSubtargetInfo() {}
+
+bool TargetSubtargetInfo::enableMachineScheduler() const {
+  return false;
+}
+
+bool TargetSubtargetInfo::enablePostRAScheduler(
+          CodeGenOpt::Level OptLevel,
+          AntiDepBreakMode& Mode,
+          RegClassVector& CriticalPathRCs) const {
+  Mode = ANTIDEP_NONE;
+  CriticalPathRCs.clear();
+  return false;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 000000000000..e4623228b397
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,2221 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+struct X86Operand;
+
+class X86AsmParser : public MCTargetAsmParser {
+  MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  ParseInstructionInfo *InstInfo;
+private:
+  MCAsmParser &getParser() const { return Parser; }
+
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  bool Error(SMLoc L, const Twine &Msg,
+             ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(),
+             bool MatchingInlineAsm = false) {
+    if (MatchingInlineAsm) return true;
+    return Parser.Error(L, Msg, Ranges);
+  }
+
+  X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) {
+    Error(Loc, Msg);
+    return 0;
+  }
+
+  X86Operand *ParseOperand();
+  X86Operand *ParseATTOperand();
+  X86Operand *ParseIntelOperand();
+  X86Operand *ParseIntelOffsetOfOperator(SMLoc StartLoc);
+  X86Operand *ParseIntelOperator(SMLoc StartLoc, unsigned OpKind);
+  X86Operand *ParseIntelMemOperand(unsigned SegReg, uint64_t ImmDisp,
+                                   SMLoc StartLoc);
+  X86Operand *ParseIntelBracExpression(unsigned SegReg, uint64_t ImmDisp,
+                                       unsigned Size);
+  X86Operand *ParseIntelVarWithQualifier(const MCExpr *&Disp,
+                                         SMLoc &IdentStart);
+  X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+
+  X86Operand *CreateMemForInlineAsm(const MCExpr *Disp, SMLoc Start, SMLoc End,
+                                    SMLoc SizeDirLoc, unsigned Size);
+
+  bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr **NewDisp,
+                             SmallString<64> &Err);
+
+  bool ParseDirectiveWord(unsigned Size, SMLoc L);
+  bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+
+  bool processInstruction(MCInst &Inst,
+                          const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               MCStreamer &Out, unsigned &ErrorInfo,
+                               bool MatchingInlineAsm);
+
+  /// isSrcOp - Returns true if operand is either (%rsi) or %ds:%(rsi)
+  /// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode.
+  bool isSrcOp(X86Operand &Op);
+
+  /// isDstOp - Returns true if operand is either (%rdi) or %es:(%rdi)
+  /// in 64bit mode or (%edi) or %es:(%edi) in 32bit mode.
+  bool isDstOp(X86Operand &Op);
+
+  bool is64BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
+  }
+  void SwitchMode() {
+    unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(X86::Mode64Bit));
+    setAvailableFeatures(FB);
+  }
+
+  /// @name Auto-generated Matcher Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "X86GenAsmMatcher.inc"
+
+  /// }
+
+public:
+  X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
+    : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
+
+    // Initialize the set of available features.
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+  virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+
+  virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                                SMLoc NameLoc,
+                                SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  virtual bool ParseDirective(AsmToken DirectiveID);
+
+  bool isParsingIntelSyntax() {
+    return getParser().getAssemblerDialect();
+  }
+};
+} // end anonymous namespace
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static bool isImmSExti16i8Value(uint64_t Value) {
+  return ((                                  Value <= 0x000000000000007FULL)||
+          (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)||
+          (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+static bool isImmSExti32i8Value(uint64_t Value) {
+  return ((                                  Value <= 0x000000000000007FULL)||
+          (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
+          (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+static bool isImmZExtu32u8Value(uint64_t Value) {
+    return (Value <= 0x00000000000000FFULL);
+}
+
+static bool isImmSExti64i8Value(uint64_t Value) {
+  return ((                                  Value <= 0x000000000000007FULL)||
+          (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+
+static bool isImmSExti64i32Value(uint64_t Value) {
+  return ((                                  Value <= 0x000000007FFFFFFFULL)||
+          (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
+}
+namespace {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand : public MCParsedAsmOperand {
+  enum KindTy {
+    Token,
+    Register,
+    Immediate,
+    Memory
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  SMLoc OffsetOfLoc;
+  bool AddressOf;
+
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+  };
+
+  struct RegOp {
+    unsigned RegNo;
+  };
+
+  struct ImmOp {
+    const MCExpr *Val;
+  };
+
+  struct MemOp {
+    unsigned SegReg;
+    const MCExpr *Disp;
+    unsigned BaseReg;
+    unsigned IndexReg;
+    unsigned Scale;
+    unsigned Size;
+  };
+
+  union {
+    struct TokOp Tok;
+    struct RegOp Reg;
+    struct ImmOp Imm;
+    struct MemOp Mem;
+  };
+
+  X86Operand(KindTy K, SMLoc Start, SMLoc End)
+    : Kind(K), StartLoc(Start), EndLoc(End) {}
+
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const { return EndLoc; }
+  /// getLocRange - Get the range between the first and last token of this
+  /// operand.
+  SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+  /// getOffsetOfLoc - Get the location of the offset operator.
+  SMLoc getOffsetOfLoc() const { return OffsetOfLoc; }
+
+  virtual void print(raw_ostream &OS) const {}
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+  void setTokenValue(StringRef Value) {
+    assert(Kind == Token && "Invalid access!");
+    Tok.Data = Value.data();
+    Tok.Length = Value.size();
+  }
+
+  unsigned getReg() const {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNo;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getMemDisp() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Disp;
+  }
+  unsigned getMemSegReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.SegReg;
+  }
+  unsigned getMemBaseReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.BaseReg;
+  }
+  unsigned getMemIndexReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.IndexReg;
+  }
+  unsigned getMemScale() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Scale;
+  }
+
+  bool isToken() const {return Kind == Token; }
+
+  bool isImm() const { return Kind == Immediate; }
+
+  bool isImmSExti16i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti16i8Value(CE->getValue());
+  }
+  bool isImmSExti32i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti32i8Value(CE->getValue());
+  }
+  bool isImmZExtu32u8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmZExtu32u8Value(CE->getValue());
+  }
+  bool isImmSExti64i8() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti64i8Value(CE->getValue());
+  }
+  bool isImmSExti64i32() const {
+    if (!isImm())
+      return false;
+
+    // If this isn't a constant expr, just assume it fits and let relaxation
+    // handle it.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return true;
+
+    // Otherwise, check the value is in a range that makes sense for this
+    // extension.
+    return isImmSExti64i32Value(CE->getValue());
+  }
+
+  bool isOffsetOf() const {
+    return OffsetOfLoc.getPointer();
+  }
+
+  bool needAddressOf() const {
+    return AddressOf;
+  }
+
+  bool isMem() const { return Kind == Memory; }
+  bool isMem8() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMem16() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMem32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMem64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64);
+  }
+  bool isMem80() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 80);
+  }
+  bool isMem128() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 128);
+  }
+  bool isMem256() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 256);
+  }
+
+  bool isMemVX32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+      getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+  }
+  bool isMemVY32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+      getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+  }
+  bool isMemVX64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+      getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15;
+  }
+  bool isMemVY64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+      getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
+  }
+
+  bool isAbsMem() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1;
+  }
+
+  bool isReg() const { return Kind == Register; }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
+  }
+
+  void addMem8Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMem16Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMem32Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMem64Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMem80Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMem128Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMem256Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMemVX32Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMemVY32Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMemVX64Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+  void addMemVY64Operands(MCInst &Inst, unsigned N) const {
+    addMemOperands(Inst, N);
+  }
+
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 5) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+    Inst.addOperand(MCOperand::CreateImm(getMemScale()));
+    Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
+    addExpr(Inst, getMemDisp());
+    Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+  }
+
+  void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+  }
+
+  static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
+    SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
+    X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
+    Res->Tok.Data = Str.data();
+    Res->Tok.Length = Str.size();
+    return Res;
+  }
+
+  static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+                               bool AddressOf = false,
+                               SMLoc OffsetOfLoc = SMLoc()) {
+    X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
+    Res->Reg.RegNo = RegNo;
+    Res->AddressOf = AddressOf;
+    Res->OffsetOfLoc = OffsetOfLoc;
+    return Res;
+  }
+
+  static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
+    X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
+    Res->Imm.Val = Val;
+    return Res;
+  }
+
+  /// Create an absolute memory operand.
+  static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+                               unsigned Size = 0) {
+    X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg   = 0;
+    Res->Mem.Disp     = Disp;
+    Res->Mem.BaseReg  = 0;
+    Res->Mem.IndexReg = 0;
+    Res->Mem.Scale    = 1;
+    Res->Mem.Size     = Size;
+    Res->AddressOf = false;
+    return Res;
+  }
+
+  /// Create a generalized memory operand.
+  static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
+                               unsigned BaseReg, unsigned IndexReg,
+                               unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
+                               unsigned Size = 0) {
+    // We should never just have a displacement, that should be parsed as an
+    // absolute memory operand.
+    assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+    // The scale should always be one of {1,2,4,8}.
+    assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+           "Invalid scale!");
+    X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg   = SegReg;
+    Res->Mem.Disp     = Disp;
+    Res->Mem.BaseReg  = BaseReg;
+    Res->Mem.IndexReg = IndexReg;
+    Res->Mem.Scale    = Scale;
+    Res->Mem.Size     = Size;
+    Res->AddressOf = false;
+    return Res;
+  }
+};
+
+} // end anonymous namespace.
+
+bool X86AsmParser::isSrcOp(X86Operand &Op) {
+  unsigned basereg = is64BitMode() ? X86::RSI : X86::ESI;
+
+  return (Op.isMem() &&
+    (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::DS) &&
+    isa<MCConstantExpr>(Op.Mem.Disp) &&
+    cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+    Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0);
+}
+
+bool X86AsmParser::isDstOp(X86Operand &Op) {
+  unsigned basereg = is64BitMode() ? X86::RDI : X86::EDI;
+
+  return Op.isMem() &&
+    (Op.Mem.SegReg == 0 || Op.Mem.SegReg == X86::ES) &&
+    isa<MCConstantExpr>(Op.Mem.Disp) &&
+    cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+    Op.Mem.BaseReg == basereg && Op.Mem.IndexReg == 0;
+}
+
+bool X86AsmParser::ParseRegister(unsigned &RegNo,
+                                 SMLoc &StartLoc, SMLoc &EndLoc) {
+  RegNo = 0;
+  const AsmToken &PercentTok = Parser.getTok();
+  StartLoc = PercentTok.getLoc();
+
+  // If we encounter a %, ignore it. This code handles registers with and
+  // without the prefix, unprefixed registers can occur in cfi directives.
+  if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
+    Parser.Lex(); // Eat percent token.
+
+  const AsmToken &Tok = Parser.getTok();
+  EndLoc = Tok.getEndLoc();
+
+  if (Tok.isNot(AsmToken::Identifier)) {
+    if (isParsingIntelSyntax()) return true;
+    return Error(StartLoc, "invalid register name",
+                 SMRange(StartLoc, EndLoc));
+  }
+
+  RegNo = MatchRegisterName(Tok.getString());
+
+  // If the match failed, try the register name as lowercase.
+  if (RegNo == 0)
+    RegNo = MatchRegisterName(Tok.getString().lower());
+
+  if (!is64BitMode()) {
+    // FIXME: This should be done using Requires<In32BitMode> and
+    // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also
+    // checked.
+    // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
+    // REX prefix.
+    if (RegNo == X86::RIZ ||
+        X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+        X86II::isX86_64NonExtLowByteReg(RegNo) ||
+        X86II::isX86_64ExtendedReg(RegNo))
+      return Error(StartLoc, "register %"
+                   + Tok.getString() + " is only available in 64-bit mode",
+                   SMRange(StartLoc, EndLoc));
+  }
+
+  // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
+  if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
+    RegNo = X86::ST0;
+    Parser.Lex(); // Eat 'st'
+
+    // Check to see if we have '(4)' after %st.
+    if (getLexer().isNot(AsmToken::LParen))
+      return false;
+    // Lex the paren.
+    getParser().Lex();
+
+    const AsmToken &IntTok = Parser.getTok();
+    if (IntTok.isNot(AsmToken::Integer))
+      return Error(IntTok.getLoc(), "expected stack index");
+    switch (IntTok.getIntVal()) {
+    case 0: RegNo = X86::ST0; break;
+    case 1: RegNo = X86::ST1; break;
+    case 2: RegNo = X86::ST2; break;
+    case 3: RegNo = X86::ST3; break;
+    case 4: RegNo = X86::ST4; break;
+    case 5: RegNo = X86::ST5; break;
+    case 6: RegNo = X86::ST6; break;
+    case 7: RegNo = X86::ST7; break;
+    default: return Error(IntTok.getLoc(), "invalid stack index");
+    }
+
+    if (getParser().Lex().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "expected ')'");
+
+    EndLoc = Parser.getTok().getEndLoc();
+    Parser.Lex(); // Eat ')'
+    return false;
+  }
+
+  EndLoc = Parser.getTok().getEndLoc();
+
+  // If this is "db[0-7]", match it as an alias
+  // for dr[0-7].
+  if (RegNo == 0 && Tok.getString().size() == 3 &&
+      Tok.getString().startswith("db")) {
+    switch (Tok.getString()[2]) {
+    case '0': RegNo = X86::DR0; break;
+    case '1': RegNo = X86::DR1; break;
+    case '2': RegNo = X86::DR2; break;
+    case '3': RegNo = X86::DR3; break;
+    case '4': RegNo = X86::DR4; break;
+    case '5': RegNo = X86::DR5; break;
+    case '6': RegNo = X86::DR6; break;
+    case '7': RegNo = X86::DR7; break;
+    }
+
+    if (RegNo != 0) {
+      EndLoc = Parser.getTok().getEndLoc();
+      Parser.Lex(); // Eat it.
+      return false;
+    }
+  }
+
+  if (RegNo == 0) {
+    if (isParsingIntelSyntax()) return true;
+    return Error(StartLoc, "invalid register name",
+                 SMRange(StartLoc, EndLoc));
+  }
+
+  Parser.Lex(); // Eat identifier token.
+  return false;
+}
+
+X86Operand *X86AsmParser::ParseOperand() {
+  if (isParsingIntelSyntax())
+    return ParseIntelOperand();
+  return ParseATTOperand();
+}
+
+/// getIntelMemOperandSize - Return intel memory operand size.
+static unsigned getIntelMemOperandSize(StringRef OpStr) {
+  unsigned Size = StringSwitch<unsigned>(OpStr)
+    .Cases("BYTE", "byte", 8)
+    .Cases("WORD", "word", 16)
+    .Cases("DWORD", "dword", 32)
+    .Cases("QWORD", "qword", 64)
+    .Cases("XWORD", "xword", 80)
+    .Cases("XMMWORD", "xmmword", 128)
+    .Cases("YMMWORD", "ymmword", 256)
+    .Default(0);
+  return Size;
+}
+
+enum IntelBracExprState {
+  IBES_START,
+  IBES_LBRAC,
+  IBES_RBRAC,
+  IBES_REGISTER,
+  IBES_REGISTER_STAR,
+  IBES_REGISTER_STAR_INTEGER,
+  IBES_INTEGER,
+  IBES_INTEGER_STAR,
+  IBES_INDEX_REGISTER,
+  IBES_IDENTIFIER,
+  IBES_DISP_EXPR,
+  IBES_MINUS,
+  IBES_ERROR
+};
+
+class IntelBracExprStateMachine {
+  IntelBracExprState State;
+  unsigned BaseReg, IndexReg, Scale;
+  int64_t Disp;
+
+  unsigned TmpReg;
+  int64_t TmpInteger;
+
+  bool isPlus;
+
+public:
+  IntelBracExprStateMachine(MCAsmParser &parser, int64_t disp) :
+    State(IBES_START), BaseReg(0), IndexReg(0), Scale(1), Disp(disp),
+    TmpReg(0), TmpInteger(0), isPlus(true) {}
+
+  unsigned getBaseReg() { return BaseReg; }
+  unsigned getIndexReg() { return IndexReg; }
+  unsigned getScale() { return Scale; }
+  int64_t getDisp() { return Disp; }
+  bool isValidEndState() { return State == IBES_RBRAC; }
+
+  void onPlus() {
+    switch (State) {
+    default:
+      State = IBES_ERROR;
+      break;
+    case IBES_INTEGER:
+      State = IBES_START;
+      if (isPlus)
+        Disp += TmpInteger;
+      else
+        Disp -= TmpInteger;
+      break;
+    case IBES_REGISTER:
+      State = IBES_START;
+      // If we already have a BaseReg, then assume this is the IndexReg with a
+      // scale of 1.
+      if (!BaseReg) {
+        BaseReg = TmpReg;
+      } else {
+        assert (!IndexReg && "BaseReg/IndexReg already set!");
+        IndexReg = TmpReg;
+        Scale = 1;
+      }
+      break;
+    case IBES_INDEX_REGISTER:
+      State = IBES_START;
+      break;
+    }
+    isPlus = true;
+  }
+  void onMinus() {
+    switch (State) {
+    default:
+      State = IBES_ERROR;
+      break;
+    case IBES_START:
+      State = IBES_MINUS;
+      break;
+    case IBES_INTEGER:
+      State = IBES_START;
+      if (isPlus)
+        Disp += TmpInteger;
+      else
+        Disp -= TmpInteger;
+      break;
+    case IBES_REGISTER:
+      State = IBES_START;
+      // If we already have a BaseReg, then assume this is the IndexReg with a
+      // scale of 1.
+      if (!BaseReg) {
+        BaseReg = TmpReg;
+      } else {
+        assert (!IndexReg && "BaseReg/IndexReg already set!");
+        IndexReg = TmpReg;
+        Scale = 1;
+      }
+      break;
+    case IBES_INDEX_REGISTER:
+      State = IBES_START;
+      break;
+    }
+    isPlus = false;
+  }
+  void onRegister(unsigned Reg) {
+    switch (State) {
+    default:
+      State = IBES_ERROR;
+      break;
+    case IBES_START:
+      State = IBES_REGISTER;
+      TmpReg = Reg;
+      break;
+    case IBES_INTEGER_STAR:
+      assert (!IndexReg && "IndexReg already set!");
+      State = IBES_INDEX_REGISTER;
+      IndexReg = Reg;
+      Scale = TmpInteger;
+      break;
+    }
+  }
+  void onDispExpr() {
+    switch (State) {
+    default:
+      State = IBES_ERROR;
+      break;
+    case IBES_START:
+      State = IBES_DISP_EXPR;
+      break;
+    }
+  }
+  void onInteger(int64_t TmpInt) {
+    switch (State) {
+    default:
+      State = IBES_ERROR;
+      break;
+    case IBES_START:
+      State = IBES_INTEGER;
+      TmpInteger = TmpInt;
+      break;
+    case IBES_MINUS:
+      State = IBES_INTEGER;
+      TmpInteger = TmpInt;
+      break;
+    case IBES_REGISTER_STAR:
+      assert (!IndexReg && "IndexReg already set!");
+      State = IBES_INDEX_REGISTER;
+      IndexReg = TmpReg;
+      Scale = TmpInt;
+      break;
+    }
+  }
+  void onStar() {
+    switch (State) {
+    default:
+      State = IBES_ERROR;
+      break;
+    case IBES_INTEGER:
+      State = IBES_INTEGER_STAR;
+      break;
+    case IBES_REGISTER:
+      State = IBES_REGISTER_STAR;
+      break;
+    }
+  }
+  void onLBrac() {
+    switch (State) {
+    default:
+      State = IBES_ERROR;
+      break;
+    case IBES_RBRAC:
+      State = IBES_START;
+      isPlus = true;
+      break;
+    }
+  }
+  void onRBrac() {
+    switch (State) {
+    default:
+      State = IBES_ERROR;
+      break;
+    case IBES_DISP_EXPR:
+      State = IBES_RBRAC;
+      break;
+    case IBES_INTEGER:
+      State = IBES_RBRAC;
+      if (isPlus)
+        Disp += TmpInteger;
+      else
+        Disp -= TmpInteger;
+      break;
+    case IBES_REGISTER:
+      State = IBES_RBRAC;
+      // If we already have a BaseReg, then assume this is the IndexReg with a
+      // scale of 1.
+      if (!BaseReg) {
+        BaseReg = TmpReg;
+      } else {
+        assert (!IndexReg && "BaseReg/IndexReg already set!");
+        IndexReg = TmpReg;
+        Scale = 1;
+      }
+      break;
+    case IBES_INDEX_REGISTER:
+      State = IBES_RBRAC;
+      break;
+    }
+  }
+};
+
+X86Operand *X86AsmParser::CreateMemForInlineAsm(const MCExpr *Disp, SMLoc Start,
+                                                SMLoc End, SMLoc SizeDirLoc,
+                                                unsigned Size) {
+  bool NeedSizeDir = false;
+  bool IsVarDecl = false;
+  if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) {
+    const MCSymbol &Sym = SymRef->getSymbol();
+    // FIXME: The SemaLookup will fail if the name is anything other then an
+    // identifier.
+    // FIXME: Pass a valid SMLoc.
+    unsigned tLength, tSize, tType;
+    SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, tLength,
+                                            tSize, tType, IsVarDecl);
+    if (!Size) {
+      Size = tType * 8; // Size is in terms of bits in this context.
+      NeedSizeDir = Size > 0;
+    }
+  }
+
+  // If this is not a VarDecl then assume it is a FuncDecl or some other label
+  // reference.  We need an 'r' constraint here, so we need to create register
+  // operand to ensure proper matching.  Just pick a GPR based on the size of
+  // a pointer.
+  if (!IsVarDecl) {
+    unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
+    return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true);
+  }
+
+  if (NeedSizeDir)
+    InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, SizeDirLoc,
+                                                /*Len*/0, Size));  
+
+  // When parsing inline assembly we set the base register to a non-zero value
+  // as we don't know the actual value at this time.  This is necessary to
+  // get the matching correct in some cases.
+  return X86Operand::CreateMem(/*SegReg*/0, Disp, /*BaseReg*/1, /*IndexReg*/0,
+                               /*Scale*/1, Start, End, Size);
+}
+
+X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
+                                                   uint64_t ImmDisp,
+                                                   unsigned Size) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc Start = Tok.getLoc(), End = Tok.getEndLoc();
+
+  // Eat '['
+  if (getLexer().isNot(AsmToken::LBrac))
+    return ErrorOperand(Start, "Expected '[' token!");
+  Parser.Lex();
+
+  unsigned TmpReg = 0;
+
+  // Try to handle '[' 'Symbol' ']'
+  if (getLexer().is(AsmToken::Identifier)) {
+    if (ParseRegister(TmpReg, Start, End)) {
+      const MCExpr *Disp;
+      SMLoc IdentStart = Tok.getLoc();
+      if (getParser().parseExpression(Disp, End))
+        return 0;
+
+      if (X86Operand *Err = ParseIntelVarWithQualifier(Disp, IdentStart))
+        return Err;
+
+      if (getLexer().isNot(AsmToken::RBrac))
+        return ErrorOperand(Parser.getTok().getLoc(), "Expected ']' token!");
+
+      // FIXME: We don't handle 'ImmDisp' '[' 'Symbol' ']'.
+      if (ImmDisp)
+        return ErrorOperand(Start, "Unsupported immediate displacement!");
+
+      // Adjust the EndLoc due to the ']'.
+      End = SMLoc::getFromPointer(Parser.getTok().getEndLoc().getPointer()-1);
+      Parser.Lex();
+      if (!isParsingInlineAsm())
+        return X86Operand::CreateMem(Disp, Start, End, Size);
+
+      // We want the size directive before the '['.
+      SMLoc SizeDirLoc = SMLoc::getFromPointer(Start.getPointer()-1);
+      return CreateMemForInlineAsm(Disp, Start, End, SizeDirLoc, Size);
+    }
+  }
+
+  // Parse [ BaseReg + Scale*IndexReg + Disp ].  We may have already parsed an
+  // immediate displacement before the bracketed expression.
+  bool Done = false;
+  IntelBracExprStateMachine SM(Parser, ImmDisp);
+
+  // If we parsed a register, then the end loc has already been set and
+  // the identifier has already been lexed.  We also need to update the
+  // state.
+  if (TmpReg)
+    SM.onRegister(TmpReg);
+
+  const MCExpr *Disp = 0;
+  while (!Done) {
+    bool UpdateLocLex = true;
+
+    // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
+    // identifier.  Don't try an parse it as a register.
+    if (Tok.getString().startswith("."))
+      break;
+
+    switch (getLexer().getKind()) {
+    default: {
+      if (SM.isValidEndState()) {
+        Done = true;
+        break;
+      }
+      return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+    }
+    case AsmToken::Identifier: {
+      // This could be a register or a displacement expression.
+      if(!ParseRegister(TmpReg, Start, End)) {
+        SM.onRegister(TmpReg);
+        UpdateLocLex = false;
+        break;
+      } else if (!getParser().parseExpression(Disp, End)) {
+        SM.onDispExpr();
+        UpdateLocLex = false;
+        break;
+      }
+      return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+    }
+    case AsmToken::Integer: {
+      int64_t Val = Tok.getIntVal();
+      SM.onInteger(Val);
+      break;
+    }
+    case AsmToken::Plus:    SM.onPlus(); break;
+    case AsmToken::Minus:   SM.onMinus(); break;
+    case AsmToken::Star:    SM.onStar(); break;
+    case AsmToken::LBrac:   SM.onLBrac(); break;
+    case AsmToken::RBrac:   SM.onRBrac(); break;
+    }
+    if (!Done && UpdateLocLex) {
+      End = Tok.getLoc();
+      Parser.Lex(); // Consume the token.
+    }
+  }
+
+  if (!Disp)
+    Disp = MCConstantExpr::Create(SM.getDisp(), getContext());
+
+  // Parse the dot operator (e.g., [ebx].foo.bar).
+  if (Tok.getString().startswith(".")) {
+    SmallString<64> Err;
+    const MCExpr *NewDisp;
+    if (ParseIntelDotOperator(Disp, &NewDisp, Err))
+      return ErrorOperand(Tok.getLoc(), Err);
+    
+    End = Parser.getTok().getEndLoc();
+    Parser.Lex();  // Eat the field.
+    Disp = NewDisp;
+  }
+
+  int BaseReg = SM.getBaseReg();
+  int IndexReg = SM.getIndexReg();
+
+  // handle [-42]
+  if (!BaseReg && !IndexReg) {
+    if (!SegReg)
+      return X86Operand::CreateMem(Disp, Start, End);
+    else
+      return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size);
+  }
+
+  int Scale = SM.getScale();
+  return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
+                               Start, End, Size);
+}
+
+// Inline assembly may use variable names with namespace alias qualifiers.
+X86Operand *X86AsmParser::ParseIntelVarWithQualifier(const MCExpr *&Disp,
+                                                     SMLoc &IdentStart) {
+  // We should only see Foo::Bar if we're parsing inline assembly.
+  if (!isParsingInlineAsm())
+    return 0;
+
+  // If we don't see a ':' then there can't be a qualifier.
+  if (getLexer().isNot(AsmToken::Colon))
+    return 0;
+
+
+  bool Done = false;
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc IdentEnd = Tok.getEndLoc();
+  while (!Done) {
+    switch (getLexer().getKind()) {
+    default:
+      Done = true; 
+      break;
+    case AsmToken::Colon:
+      getLexer().Lex(); // Consume ':'.
+      if (getLexer().isNot(AsmToken::Colon))
+        return ErrorOperand(Tok.getLoc(), "Expected ':' token!");
+      getLexer().Lex(); // Consume second ':'.
+      if (getLexer().isNot(AsmToken::Identifier))
+        return ErrorOperand(Tok.getLoc(), "Expected an identifier token!");
+      break;
+    case AsmToken::Identifier:
+      IdentEnd = Tok.getEndLoc();
+      getLexer().Lex(); // Consume the identifier.
+      break;
+    }
+  }
+  size_t Len = IdentEnd.getPointer() - IdentStart.getPointer();
+  StringRef Identifier(IdentStart.getPointer(), Len);
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
+  MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+  Disp = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext());
+  return 0;
+}
+
+/// ParseIntelMemOperand - Parse intel style memory operand.
+X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg,
+                                               uint64_t ImmDisp,
+                                               SMLoc Start) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc End;
+
+  unsigned Size = getIntelMemOperandSize(Tok.getString());
+  if (Size) {
+    Parser.Lex();
+    assert ((Tok.getString() == "PTR" || Tok.getString() == "ptr") &&
+            "Unexpected token!");
+    Parser.Lex();
+  }
+
+  // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+  if (getLexer().is(AsmToken::Integer)) {
+    const AsmToken &IntTok = Parser.getTok();
+    if (isParsingInlineAsm())
+      InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
+                                                  IntTok.getLoc()));
+    uint64_t ImmDisp = IntTok.getIntVal();
+    Parser.Lex(); // Eat the integer.
+    if (getLexer().isNot(AsmToken::LBrac))
+      return ErrorOperand(Start, "Expected '[' token!");
+    return ParseIntelBracExpression(SegReg, ImmDisp, Size);
+  }
+
+  if (getLexer().is(AsmToken::LBrac))
+    return ParseIntelBracExpression(SegReg, ImmDisp, Size);
+
+  if (!ParseRegister(SegReg, Start, End)) {
+    // Handel SegReg : [ ... ]
+    if (getLexer().isNot(AsmToken::Colon))
+      return ErrorOperand(Start, "Expected ':' token!");
+    Parser.Lex(); // Eat :
+    if (getLexer().isNot(AsmToken::LBrac))
+      return ErrorOperand(Start, "Expected '[' token!");
+    return ParseIntelBracExpression(SegReg, ImmDisp, Size);
+  }
+
+  const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+  SMLoc IdentStart = Tok.getLoc();
+  if (getParser().parseExpression(Disp, End))
+    return 0;
+
+  if (!isParsingInlineAsm())
+    return X86Operand::CreateMem(Disp, Start, End, Size);
+
+  if (X86Operand *Err = ParseIntelVarWithQualifier(Disp, IdentStart))
+    return Err;
+
+  return CreateMemForInlineAsm(Disp, Start, End, Start, Size);
+}
+
+/// Parse the '.' operator.
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+                                         const MCExpr **NewDisp,
+                                         SmallString<64> &Err) {
+  AsmToken Tok = *&Parser.getTok();
+  uint64_t OrigDispVal, DotDispVal;
+
+  // FIXME: Handle non-constant expressions.
+  if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) {
+    OrigDispVal = OrigDisp->getValue();
+  } else {
+    Err = "Non-constant offsets are not supported!";
+    return true;
+  }
+
+  // Drop the '.'.
+  StringRef DotDispStr = Tok.getString().drop_front(1);
+
+  // .Imm gets lexed as a real.
+  if (Tok.is(AsmToken::Real)) {
+    APInt DotDisp;
+    DotDispStr.getAsInteger(10, DotDisp);
+    DotDispVal = DotDisp.getZExtValue();
+  } else if (Tok.is(AsmToken::Identifier)) {
+    // We should only see an identifier when parsing the original inline asm.
+    // The front-end should rewrite this in terms of immediates.
+    assert (isParsingInlineAsm() && "Unexpected field name!");
+
+    unsigned DotDisp;
+    std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+    if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
+                                           DotDisp)) {
+      Err = "Unable to lookup field reference!";
+      return true;
+    }
+    DotDispVal = DotDisp;
+  } else {
+    Err = "Unexpected token type!";
+    return true;
+  }
+
+  if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+    SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
+    unsigned Len = DotDispStr.size();
+    unsigned Val = OrigDispVal + DotDispVal;
+    InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len,
+                                                Val));
+  }
+
+  *NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
+  return false;
+}
+
+/// Parse the 'offset' operator.  This operator is used to specify the
+/// location rather then the content of a variable.
+X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) {
+  SMLoc OffsetOfLoc = Start;
+  Parser.Lex(); // Eat offset.
+  Start = Parser.getTok().getLoc();
+  assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier");
+
+  SMLoc End;
+  const MCExpr *Val;
+  if (getParser().parseExpression(Val, End))
+    return ErrorOperand(Start, "Unable to parse expression!");
+
+  // Don't emit the offset operator.
+  InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
+
+  // The offset operator will have an 'r' constraint, thus we need to create
+  // register operand to ensure proper matching.  Just pick a GPR based on
+  // the size of a pointer.
+  unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
+  return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
+                               OffsetOfLoc);
+}
+
+enum IntelOperatorKind {
+  IOK_LENGTH,
+  IOK_SIZE,
+  IOK_TYPE
+};
+
+/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators.  The LENGTH operator
+/// returns the number of elements in an array.  It returns the value 1 for
+/// non-array variables.  The SIZE operator returns the size of a C or C++
+/// variable.  A variable's size is the product of its LENGTH and TYPE.  The
+/// TYPE operator returns the size of a C or C++ type or variable. If the
+/// variable is an array, TYPE returns the size of a single element.
+X86Operand *X86AsmParser::ParseIntelOperator(SMLoc Start, unsigned OpKind) {
+  SMLoc TypeLoc = Start;
+  Parser.Lex(); // Eat offset.
+  Start = Parser.getTok().getLoc();
+  assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier");
+
+  SMLoc End;
+  const MCExpr *Val;
+  if (getParser().parseExpression(Val, End))
+    return 0;
+
+  unsigned Length = 0, Size = 0, Type = 0;
+  if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Val)) {
+    const MCSymbol &Sym = SymRef->getSymbol();
+    // FIXME: The SemaLookup will fail if the name is anything other then an
+    // identifier.
+    // FIXME: Pass a valid SMLoc.
+    bool IsVarDecl;
+    if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Length,
+                                                 Size, Type, IsVarDecl))
+      return ErrorOperand(Start, "Unable to lookup expr!");
+  }
+  unsigned CVal;
+  switch(OpKind) {
+  default: llvm_unreachable("Unexpected operand kind!");
+  case IOK_LENGTH: CVal = Length; break;
+  case IOK_SIZE: CVal = Size; break;
+  case IOK_TYPE: CVal = Type; break;
+  }
+
+  // Rewrite the type operator and the C or C++ type or variable in terms of an
+  // immediate.  E.g. TYPE foo -> $$4
+  unsigned Len = End.getPointer() - TypeLoc.getPointer();
+  InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal));
+
+  const MCExpr *Imm = MCConstantExpr::Create(CVal, getContext());
+  return X86Operand::CreateImm(Imm, Start, End);
+}
+
+X86Operand *X86AsmParser::ParseIntelOperand() {
+  SMLoc Start = Parser.getTok().getLoc(), End;
+  StringRef AsmTokStr = Parser.getTok().getString();
+
+  // Offset, length, type and size operators.
+  if (isParsingInlineAsm()) {
+    if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+      return ParseIntelOffsetOfOperator(Start);
+    if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
+      return ParseIntelOperator(Start, IOK_LENGTH);
+    if (AsmTokStr == "size" || AsmTokStr == "SIZE")
+      return ParseIntelOperator(Start, IOK_SIZE);
+    if (AsmTokStr == "type" || AsmTokStr == "TYPE")
+      return ParseIntelOperator(Start, IOK_TYPE);
+  }
+
+  // Immediate.
+  if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) ||
+      getLexer().is(AsmToken::Minus)) {
+    const MCExpr *Val;
+    bool isInteger = getLexer().is(AsmToken::Integer);
+    if (!getParser().parseExpression(Val, End)) {
+      if (isParsingInlineAsm())
+        InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start));
+      // Immediate.
+      if (getLexer().isNot(AsmToken::LBrac))
+        return X86Operand::CreateImm(Val, Start, End);
+
+      // Only positive immediates are valid.
+      if (!isInteger) {
+        Error(Parser.getTok().getLoc(), "expected a positive immediate "
+              "displacement before bracketed expr.");
+        return 0;
+      }
+
+      // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+      if (uint64_t ImmDisp = dyn_cast<MCConstantExpr>(Val)->getValue())
+        return ParseIntelMemOperand(/*SegReg=*/0, ImmDisp, Start);
+    }
+  }
+
+  // Register.
+  unsigned RegNo = 0;
+  if (!ParseRegister(RegNo, Start, End)) {
+    // If this is a segment register followed by a ':', then this is the start
+    // of a memory reference, otherwise this is a normal register reference.
+    if (getLexer().isNot(AsmToken::Colon))
+      return X86Operand::CreateReg(RegNo, Start, End);
+
+    getParser().Lex(); // Eat the colon.
+    return ParseIntelMemOperand(/*SegReg=*/RegNo, /*Disp=*/0, Start);
+  }
+
+  // Memory operand.
+  return ParseIntelMemOperand(/*SegReg=*/0, /*Disp=*/0, Start);
+}
+
+X86Operand *X86AsmParser::ParseATTOperand() {
+  switch (getLexer().getKind()) {
+  default:
+    // Parse a memory operand with no segment register.
+    return ParseMemOperand(0, Parser.getTok().getLoc());
+  case AsmToken::Percent: {
+    // Read the register.
+    unsigned RegNo;
+    SMLoc Start, End;
+    if (ParseRegister(RegNo, Start, End)) return 0;
+    if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
+      Error(Start, "%eiz and %riz can only be used as index registers",
+            SMRange(Start, End));
+      return 0;
+    }
+
+    // If this is a segment register followed by a ':', then this is the start
+    // of a memory reference, otherwise this is a normal register reference.
+    if (getLexer().isNot(AsmToken::Colon))
+      return X86Operand::CreateReg(RegNo, Start, End);
+
+    getParser().Lex(); // Eat the colon.
+    return ParseMemOperand(RegNo, Start);
+  }
+  case AsmToken::Dollar: {
+    // $42 -> immediate.
+    SMLoc Start = Parser.getTok().getLoc(), End;
+    Parser.Lex();
+    const MCExpr *Val;
+    if (getParser().parseExpression(Val, End))
+      return 0;
+    return X86Operand::CreateImm(Val, Start, End);
+  }
+  }
+}
+
+/// ParseMemOperand: segment: disp(basereg, indexreg, scale).  The '%ds:' prefix
+/// has already been parsed if present.
+X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
+
+  // We have to disambiguate a parenthesized expression "(4+5)" from the start
+  // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)".  The
+  // only way to do this without lookahead is to eat the '(' and see what is
+  // after it.
+  const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+  if (getLexer().isNot(AsmToken::LParen)) {
+    SMLoc ExprEnd;
+    if (getParser().parseExpression(Disp, ExprEnd)) return 0;
+
+    // After parsing the base expression we could either have a parenthesized
+    // memory address or not.  If not, return now.  If so, eat the (.
+    if (getLexer().isNot(AsmToken::LParen)) {
+      // Unless we have a segment register, treat this as an immediate.
+      if (SegReg == 0)
+        return X86Operand::CreateMem(Disp, MemStart, ExprEnd);
+      return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+    }
+
+    // Eat the '('.
+    Parser.Lex();
+  } else {
+    // Okay, we have a '('.  We don't know if this is an expression or not, but
+    // so we have to eat the ( to see beyond it.
+    SMLoc LParenLoc = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat the '('.
+
+    if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) {
+      // Nothing to do here, fall into the code below with the '(' part of the
+      // memory operand consumed.
+    } else {
+      SMLoc ExprEnd;
+
+      // It must be an parenthesized expression, parse it now.
+      if (getParser().parseParenExpression(Disp, ExprEnd))
+        return 0;
+
+      // After parsing the base expression we could either have a parenthesized
+      // memory address or not.  If not, return now.  If so, eat the (.
+      if (getLexer().isNot(AsmToken::LParen)) {
+        // Unless we have a segment register, treat this as an immediate.
+        if (SegReg == 0)
+          return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd);
+        return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
+      }
+
+      // Eat the '('.
+      Parser.Lex();
+    }
+  }
+
+  // If we reached here, then we just ate the ( of the memory operand.  Process
+  // the rest of the memory operand.
+  unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
+  SMLoc IndexLoc;
+
+  if (getLexer().is(AsmToken::Percent)) {
+    SMLoc StartLoc, EndLoc;
+    if (ParseRegister(BaseReg, StartLoc, EndLoc)) return 0;
+    if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
+      Error(StartLoc, "eiz and riz can only be used as index registers",
+            SMRange(StartLoc, EndLoc));
+      return 0;
+    }
+  }
+
+  if (getLexer().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat the comma.
+    IndexLoc = Parser.getTok().getLoc();
+
+    // Following the comma we should have either an index register, or a scale
+    // value. We don't support the later form, but we want to parse it
+    // correctly.
+    //
+    // Not that even though it would be completely consistent to support syntax
+    // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
+    if (getLexer().is(AsmToken::Percent)) {
+      SMLoc L;
+      if (ParseRegister(IndexReg, L, L)) return 0;
+
+      if (getLexer().isNot(AsmToken::RParen)) {
+        // Parse the scale amount:
+        //  ::= ',' [scale-expression]
+        if (getLexer().isNot(AsmToken::Comma)) {
+          Error(Parser.getTok().getLoc(),
+                "expected comma in scale expression");
+          return 0;
+        }
+        Parser.Lex(); // Eat the comma.
+
+        if (getLexer().isNot(AsmToken::RParen)) {
+          SMLoc Loc = Parser.getTok().getLoc();
+
+          int64_t ScaleVal;
+          if (getParser().parseAbsoluteExpression(ScaleVal)){
+            Error(Loc, "expected scale expression");
+            return 0;
+          }
+
+          // Validate the scale amount.
+          if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
+            Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
+            return 0;
+          }
+          Scale = (unsigned)ScaleVal;
+        }
+      }
+    } else if (getLexer().isNot(AsmToken::RParen)) {
+      // A scale amount without an index is ignored.
+      // index.
+      SMLoc Loc = Parser.getTok().getLoc();
+
+      int64_t Value;
+      if (getParser().parseAbsoluteExpression(Value))
+        return 0;
+
+      if (Value != 1)
+        Warning(Loc, "scale factor without index register is ignored");
+      Scale = 1;
+    }
+  }
+
+  // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
+  if (getLexer().isNot(AsmToken::RParen)) {
+    Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
+    return 0;
+  }
+  SMLoc MemEnd = Parser.getTok().getEndLoc();
+  Parser.Lex(); // Eat the ')'.
+
+  // If we have both a base register and an index register make sure they are
+  // both 64-bit or 32-bit registers.
+  // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
+  if (BaseReg != 0 && IndexReg != 0) {
+    if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
+        (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+         X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)) &&
+        IndexReg != X86::RIZ) {
+      Error(IndexLoc, "index register is 32-bit, but base register is 64-bit");
+      return 0;
+    }
+    if (X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) &&
+        (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
+         X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg)) &&
+        IndexReg != X86::EIZ){
+      Error(IndexLoc, "index register is 64-bit, but base register is 32-bit");
+      return 0;
+    }
+  }
+
+  return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
+                               MemStart, MemEnd);
+}
+
+bool X86AsmParser::
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+                 SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  InstInfo = &Info;
+  StringRef PatchedName = Name;
+
+  // FIXME: Hack to recognize setneb as setne.
+  if (PatchedName.startswith("set") && PatchedName.endswith("b") &&
+      PatchedName != "setb" && PatchedName != "setnb")
+    PatchedName = PatchedName.substr(0, Name.size()-1);
+
+  // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
+  const MCExpr *ExtraImmOp = 0;
+  if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
+      (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
+       PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
+    bool IsVCMP = PatchedName[0] == 'v';
+    unsigned SSECCIdx = IsVCMP ? 4 : 3;
+    unsigned SSEComparisonCode = StringSwitch<unsigned>(
+      PatchedName.slice(SSECCIdx, PatchedName.size() - 2))
+      .Case("eq",       0x00)
+      .Case("lt",       0x01)
+      .Case("le",       0x02)
+      .Case("unord",    0x03)
+      .Case("neq",      0x04)
+      .Case("nlt",      0x05)
+      .Case("nle",      0x06)
+      .Case("ord",      0x07)
+      /* AVX only from here */
+      .Case("eq_uq",    0x08)
+      .Case("nge",      0x09)
+      .Case("ngt",      0x0A)
+      .Case("false",    0x0B)
+      .Case("neq_oq",   0x0C)
+      .Case("ge",       0x0D)
+      .Case("gt",       0x0E)
+      .Case("true",     0x0F)
+      .Case("eq_os",    0x10)
+      .Case("lt_oq",    0x11)
+      .Case("le_oq",    0x12)
+      .Case("unord_s",  0x13)
+      .Case("neq_us",   0x14)
+      .Case("nlt_uq",   0x15)
+      .Case("nle_uq",   0x16)
+      .Case("ord_s",    0x17)
+      .Case("eq_us",    0x18)
+      .Case("nge_uq",   0x19)
+      .Case("ngt_uq",   0x1A)
+      .Case("false_os", 0x1B)
+      .Case("neq_os",   0x1C)
+      .Case("ge_oq",    0x1D)
+      .Case("gt_oq",    0x1E)
+      .Case("true_us",  0x1F)
+      .Default(~0U);
+    if (SSEComparisonCode != ~0U && (IsVCMP || SSEComparisonCode < 8)) {
+      ExtraImmOp = MCConstantExpr::Create(SSEComparisonCode,
+                                          getParser().getContext());
+      if (PatchedName.endswith("ss")) {
+        PatchedName = IsVCMP ? "vcmpss" : "cmpss";
+      } else if (PatchedName.endswith("sd")) {
+        PatchedName = IsVCMP ? "vcmpsd" : "cmpsd";
+      } else if (PatchedName.endswith("ps")) {
+        PatchedName = IsVCMP ? "vcmpps" : "cmpps";
+      } else {
+        assert(PatchedName.endswith("pd") && "Unexpected mnemonic!");
+        PatchedName = IsVCMP ? "vcmppd" : "cmppd";
+      }
+    }
+  }
+
+  Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+
+  if (ExtraImmOp && !isParsingIntelSyntax())
+    Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
+
+  // Determine whether this is an instruction prefix.
+  bool isPrefix =
+    Name == "lock" || Name == "rep" ||
+    Name == "repe" || Name == "repz" ||
+    Name == "repne" || Name == "repnz" ||
+    Name == "rex64" || Name == "data16";
+
+
+  // This does the actual operand parsing.  Don't parse any more if we have a
+  // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
+  // just want to parse the "lock" as the first instruction and the "incl" as
+  // the next one.
+  if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
+
+    // Parse '*' modifier.
+    if (getLexer().is(AsmToken::Star)) {
+      SMLoc Loc = Parser.getTok().getLoc();
+      Operands.push_back(X86Operand::CreateToken("*", Loc));
+      Parser.Lex(); // Eat the star.
+    }
+
+    // Read the first operand.
+    if (X86Operand *Op = ParseOperand())
+      Operands.push_back(Op);
+    else {
+      Parser.eatToEndOfStatement();
+      return true;
+    }
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();  // Eat the comma.
+
+      // Parse and remember the operand.
+      if (X86Operand *Op = ParseOperand())
+        Operands.push_back(Op);
+      else {
+        Parser.eatToEndOfStatement();
+        return true;
+      }
+    }
+
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      SMLoc Loc = getLexer().getLoc();
+      Parser.eatToEndOfStatement();
+      return Error(Loc, "unexpected token in argument list");
+    }
+  }
+
+  if (getLexer().is(AsmToken::EndOfStatement))
+    Parser.Lex(); // Consume the EndOfStatement
+  else if (isPrefix && getLexer().is(AsmToken::Slash))
+    Parser.Lex(); // Consume the prefix separator Slash
+
+  if (ExtraImmOp && isParsingIntelSyntax())
+    Operands.push_back(X86Operand::CreateImm(ExtraImmOp, NameLoc, NameLoc));
+
+  // This is a terrible hack to handle "out[bwl]? %al, (%dx)" ->
+  // "outb %al, %dx".  Out doesn't take a memory form, but this is a widely
+  // documented form in various unofficial manuals, so a lot of code uses it.
+  if ((Name == "outb" || Name == "outw" || Name == "outl" || Name == "out") &&
+      Operands.size() == 3) {
+    X86Operand &Op = *(X86Operand*)Operands.back();
+    if (Op.isMem() && Op.Mem.SegReg == 0 &&
+        isa<MCConstantExpr>(Op.Mem.Disp) &&
+        cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+        Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+      SMLoc Loc = Op.getEndLoc();
+      Operands.back() = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+      delete &Op;
+    }
+  }
+  // Same hack for "in[bwl]? (%dx), %al" -> "inb %dx, %al".
+  if ((Name == "inb" || Name == "inw" || Name == "inl" || Name == "in") &&
+      Operands.size() == 3) {
+    X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+    if (Op.isMem() && Op.Mem.SegReg == 0 &&
+        isa<MCConstantExpr>(Op.Mem.Disp) &&
+        cast<MCConstantExpr>(Op.Mem.Disp)->getValue() == 0 &&
+        Op.Mem.BaseReg == MatchRegisterName("dx") && Op.Mem.IndexReg == 0) {
+      SMLoc Loc = Op.getEndLoc();
+      Operands.begin()[1] = X86Operand::CreateReg(Op.Mem.BaseReg, Loc, Loc);
+      delete &Op;
+    }
+  }
+  // Transform "ins[bwl] %dx, %es:(%edi)" into "ins[bwl]"
+  if (Name.startswith("ins") && Operands.size() == 3 &&
+      (Name == "insb" || Name == "insw" || Name == "insl")) {
+    X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+    X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+    if (Op.isReg() && Op.getReg() == X86::DX && isDstOp(Op2)) {
+      Operands.pop_back();
+      Operands.pop_back();
+      delete &Op;
+      delete &Op2;
+    }
+  }
+
+  // Transform "outs[bwl] %ds:(%esi), %dx" into "out[bwl]"
+  if (Name.startswith("outs") && Operands.size() == 3 &&
+      (Name == "outsb" || Name == "outsw" || Name == "outsl")) {
+    X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+    X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+    if (isSrcOp(Op) && Op2.isReg() && Op2.getReg() == X86::DX) {
+      Operands.pop_back();
+      Operands.pop_back();
+      delete &Op;
+      delete &Op2;
+    }
+  }
+
+  // Transform "movs[bwl] %ds:(%esi), %es:(%edi)" into "movs[bwl]"
+  if (Name.startswith("movs") && Operands.size() == 3 &&
+      (Name == "movsb" || Name == "movsw" || Name == "movsl" ||
+       (is64BitMode() && Name == "movsq"))) {
+    X86Operand &Op = *(X86Operand*)Operands.begin()[1];
+    X86Operand &Op2 = *(X86Operand*)Operands.begin()[2];
+    if (isSrcOp(Op) && isDstOp(Op2)) {
+      Operands.pop_back();
+      Operands.pop_back();
+      delete &Op;
+      delete &Op2;
+    }
+  }
+  // Transform "lods[bwl] %ds:(%esi),{%al,%ax,%eax,%rax}" into "lods[bwl]"
+  if (Name.startswith("lods") && Operands.size() == 3 &&
+      (Name == "lods" || Name == "lodsb" || Name == "lodsw" ||
+       Name == "lodsl" || (is64BitMode() && Name == "lodsq"))) {
+    X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+    X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]);
+    if (isSrcOp(*Op1) && Op2->isReg()) {
+      const char *ins;
+      unsigned reg = Op2->getReg();
+      bool isLods = Name == "lods";
+      if (reg == X86::AL && (isLods || Name == "lodsb"))
+        ins = "lodsb";
+      else if (reg == X86::AX && (isLods || Name == "lodsw"))
+        ins = "lodsw";
+      else if (reg == X86::EAX && (isLods || Name == "lodsl"))
+        ins = "lodsl";
+      else if (reg == X86::RAX && (isLods || Name == "lodsq"))
+        ins = "lodsq";
+      else
+        ins = NULL;
+      if (ins != NULL) {
+        Operands.pop_back();
+        Operands.pop_back();
+        delete Op1;
+        delete Op2;
+        if (Name != ins)
+          static_cast<X86Operand*>(Operands[0])->setTokenValue(ins);
+      }
+    }
+  }
+  // Transform "stos[bwl] {%al,%ax,%eax,%rax},%es:(%edi)" into "stos[bwl]"
+  if (Name.startswith("stos") && Operands.size() == 3 &&
+      (Name == "stos" || Name == "stosb" || Name == "stosw" ||
+       Name == "stosl" || (is64BitMode() && Name == "stosq"))) {
+    X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+    X86Operand *Op2 = static_cast<X86Operand*>(Operands[2]);
+    if (isDstOp(*Op2) && Op1->isReg()) {
+      const char *ins;
+      unsigned reg = Op1->getReg();
+      bool isStos = Name == "stos";
+      if (reg == X86::AL && (isStos || Name == "stosb"))
+        ins = "stosb";
+      else if (reg == X86::AX && (isStos || Name == "stosw"))
+        ins = "stosw";
+      else if (reg == X86::EAX && (isStos || Name == "stosl"))
+        ins = "stosl";
+      else if (reg == X86::RAX && (isStos || Name == "stosq"))
+        ins = "stosq";
+      else
+        ins = NULL;
+      if (ins != NULL) {
+        Operands.pop_back();
+        Operands.pop_back();
+        delete Op1;
+        delete Op2;
+        if (Name != ins)
+          static_cast<X86Operand*>(Operands[0])->setTokenValue(ins);
+      }
+    }
+  }
+
+  // FIXME: Hack to handle recognize s{hr,ar,hl} $1, <op>.  Canonicalize to
+  // "shift <op>".
+  if ((Name.startswith("shr") || Name.startswith("sar") ||
+       Name.startswith("shl") || Name.startswith("sal") ||
+       Name.startswith("rcl") || Name.startswith("rcr") ||
+       Name.startswith("rol") || Name.startswith("ror")) &&
+      Operands.size() == 3) {
+    if (isParsingIntelSyntax()) {
+      // Intel syntax
+      X86Operand *Op1 = static_cast<X86Operand*>(Operands[2]);
+      if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+          cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+        delete Operands[2];
+        Operands.pop_back();
+      }
+    } else {
+      X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+      if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+          cast<MCConstantExpr>(Op1->getImm())->getValue() == 1) {
+        delete Operands[1];
+        Operands.erase(Operands.begin() + 1);
+      }
+    }
+  }
+
+  // Transforms "int $3" into "int3" as a size optimization.  We can't write an
+  // instalias with an immediate operand yet.
+  if (Name == "int" && Operands.size() == 2) {
+    X86Operand *Op1 = static_cast<X86Operand*>(Operands[1]);
+    if (Op1->isImm() && isa<MCConstantExpr>(Op1->getImm()) &&
+        cast<MCConstantExpr>(Op1->getImm())->getValue() == 3) {
+      delete Operands[1];
+      Operands.erase(Operands.begin() + 1);
+      static_cast<X86Operand*>(Operands[0])->setTokenValue("int3");
+    }
+  }
+
+  return false;
+}
+
+static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg,
+                            bool isCmp) {
+  MCInst TmpInst;
+  TmpInst.setOpcode(Opcode);
+  if (!isCmp)
+    TmpInst.addOperand(MCOperand::CreateReg(Reg));
+  TmpInst.addOperand(MCOperand::CreateReg(Reg));
+  TmpInst.addOperand(Inst.getOperand(0));
+  Inst = TmpInst;
+  return true;
+}
+
+static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode,
+                                bool isCmp = false) {
+  if (!Inst.getOperand(0).isImm() ||
+      !isImmSExti16i8Value(Inst.getOperand(0).getImm()))
+    return false;
+
+  return convertToSExti8(Inst, Opcode, X86::AX, isCmp);
+}
+
+static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode,
+                                bool isCmp = false) {
+  if (!Inst.getOperand(0).isImm() ||
+      !isImmSExti32i8Value(Inst.getOperand(0).getImm()))
+    return false;
+
+  return convertToSExti8(Inst, Opcode, X86::EAX, isCmp);
+}
+
+static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode,
+                                bool isCmp = false) {
+  if (!Inst.getOperand(0).isImm() ||
+      !isImmSExti64i8Value(Inst.getOperand(0).getImm()))
+    return false;
+
+  return convertToSExti8(Inst, Opcode, X86::RAX, isCmp);
+}
+
+bool X86AsmParser::
+processInstruction(MCInst &Inst,
+                   const SmallVectorImpl<MCParsedAsmOperand*> &Ops) {
+  switch (Inst.getOpcode()) {
+  default: return false;
+  case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8);
+  case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8);
+  case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8);
+  case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8);
+  case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8);
+  case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8);
+  case X86::OR16i16:  return convert16i16to16ri8(Inst, X86::OR16ri8);
+  case X86::OR32i32:  return convert32i32to32ri8(Inst, X86::OR32ri8);
+  case X86::OR64i32:  return convert64i32to64ri8(Inst, X86::OR64ri8);
+  case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true);
+  case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true);
+  case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true);
+  case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8);
+  case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8);
+  case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8);
+  case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8);
+  case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8);
+  case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8);
+  case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8);
+  case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8);
+  case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8);
+  case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8);
+  case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8);
+  case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8);
+  }
+}
+
+static const char *getSubtargetFeatureName(unsigned Val);
+bool X86AsmParser::
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                        MCStreamer &Out, unsigned &ErrorInfo,
+                        bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpect empty operand list!");
+  X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
+  assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+  ArrayRef<SMRange> EmptyRanges = ArrayRef<SMRange>();
+
+  // First, handle aliases that expand to multiple instructions.
+  // FIXME: This should be replaced with a real .td file alias mechanism.
+  // Also, MatchInstructionImpl should actually *do* the EmitInstruction
+  // call.
+  if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" ||
+      Op->getToken() == "fstsww" || Op->getToken() == "fstcww" ||
+      Op->getToken() == "finit" || Op->getToken() == "fsave" ||
+      Op->getToken() == "fstenv" || Op->getToken() == "fclex") {
+    MCInst Inst;
+    Inst.setOpcode(X86::WAIT);
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      Out.EmitInstruction(Inst);
+
+    const char *Repl =
+      StringSwitch<const char*>(Op->getToken())
+        .Case("finit",  "fninit")
+        .Case("fsave",  "fnsave")
+        .Case("fstcw",  "fnstcw")
+        .Case("fstcww",  "fnstcw")
+        .Case("fstenv", "fnstenv")
+        .Case("fstsw",  "fnstsw")
+        .Case("fstsww", "fnstsw")
+        .Case("fclex",  "fnclex")
+        .Default(0);
+    assert(Repl && "Unknown wait-prefixed instruction");
+    delete Operands[0];
+    Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
+  }
+
+  bool WasOriginallyInvalidOperand = false;
+  MCInst Inst;
+
+  // First, try a direct match.
+  switch (MatchInstructionImpl(Operands, Inst,
+                               ErrorInfo, MatchingInlineAsm,
+                               isParsingIntelSyntax())) {
+  default: break;
+  case Match_Success:
+    // Some instructions need post-processing to, for example, tweak which
+    // encoding is selected. Loop on it while changes happen so the
+    // individual transformations can chain off each other.
+    if (!MatchingInlineAsm)
+      while (processInstruction(Inst, Operands))
+        ;
+
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      Out.EmitInstruction(Inst);
+    Opcode = Inst.getOpcode();
+    return false;
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing.
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm);
+  }
+  case Match_InvalidOperand:
+    WasOriginallyInvalidOperand = true;
+    break;
+  case Match_MnemonicFail:
+    break;
+  }
+
+  // FIXME: Ideally, we would only attempt suffix matches for things which are
+  // valid prefixes, and we could just infer the right unambiguous
+  // type. However, that requires substantially more matcher support than the
+  // following hack.
+
+  // Change the operand to point to a temporary token.
+  StringRef Base = Op->getToken();
+  SmallString<16> Tmp;
+  Tmp += Base;
+  Tmp += ' ';
+  Op->setTokenValue(Tmp.str());
+
+  // If this instruction starts with an 'f', then it is a floating point stack
+  // instruction.  These come in up to three forms for 32-bit, 64-bit, and
+  // 80-bit floating point, which use the suffixes s,l,t respectively.
+  //
+  // Otherwise, we assume that this may be an integer instruction, which comes
+  // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
+  const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+
+  // Check for the various suffix matches.
+  Tmp[Base.size()] = Suffixes[0];
+  unsigned ErrorInfoIgnore;
+  unsigned ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
+  unsigned Match1, Match2, Match3, Match4;
+
+  Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                isParsingIntelSyntax());
+  // If this returned as a missing feature failure, remember that.
+  if (Match1 == Match_MissingFeature)
+    ErrorInfoMissingFeature = ErrorInfoIgnore;
+  Tmp[Base.size()] = Suffixes[1];
+  Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                isParsingIntelSyntax());
+  // If this returned as a missing feature failure, remember that.
+  if (Match2 == Match_MissingFeature)
+    ErrorInfoMissingFeature = ErrorInfoIgnore;
+  Tmp[Base.size()] = Suffixes[2];
+  Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                isParsingIntelSyntax());
+  // If this returned as a missing feature failure, remember that.
+  if (Match3 == Match_MissingFeature)
+    ErrorInfoMissingFeature = ErrorInfoIgnore;
+  Tmp[Base.size()] = Suffixes[3];
+  Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+                                isParsingIntelSyntax());
+  // If this returned as a missing feature failure, remember that.
+  if (Match4 == Match_MissingFeature)
+    ErrorInfoMissingFeature = ErrorInfoIgnore;
+
+  // Restore the old token.
+  Op->setTokenValue(Base);
+
+  // If exactly one matched, then we treat that as a successful match (and the
+  // instruction will already have been filled in correctly, since the failing
+  // matches won't have modified it).
+  unsigned NumSuccessfulMatches =
+    (Match1 == Match_Success) + (Match2 == Match_Success) +
+    (Match3 == Match_Success) + (Match4 == Match_Success);
+  if (NumSuccessfulMatches == 1) {
+    Inst.setLoc(IDLoc);
+    if (!MatchingInlineAsm)
+      Out.EmitInstruction(Inst);
+    Opcode = Inst.getOpcode();
+    return false;
+  }
+
+  // Otherwise, the match failed, try to produce a decent error message.
+
+  // If we had multiple suffix matches, then identify this as an ambiguous
+  // match.
+  if (NumSuccessfulMatches > 1) {
+    char MatchChars[4];
+    unsigned NumMatches = 0;
+    if (Match1 == Match_Success) MatchChars[NumMatches++] = Suffixes[0];
+    if (Match2 == Match_Success) MatchChars[NumMatches++] = Suffixes[1];
+    if (Match3 == Match_Success) MatchChars[NumMatches++] = Suffixes[2];
+    if (Match4 == Match_Success) MatchChars[NumMatches++] = Suffixes[3];
+
+    SmallString<126> Msg;
+    raw_svector_ostream OS(Msg);
+    OS << "ambiguous instructions require an explicit suffix (could be ";
+    for (unsigned i = 0; i != NumMatches; ++i) {
+      if (i != 0)
+        OS << ", ";
+      if (i + 1 == NumMatches)
+        OS << "or ";
+      OS << "'" << Base << MatchChars[i] << "'";
+    }
+    OS << ")";
+    Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
+    return true;
+  }
+
+  // Okay, we know that none of the variants matched successfully.
+
+  // If all of the instructions reported an invalid mnemonic, then the original
+  // mnemonic was invalid.
+  if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
+      (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
+    if (!WasOriginallyInvalidOperand) {
+      ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges :
+        Op->getLocRange();
+      return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
+                   Ranges, MatchingInlineAsm);
+    }
+
+    // Recover location info for the operand if we know which was the problem.
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction",
+                     EmptyRanges, MatchingInlineAsm);
+
+      X86Operand *Operand = (X86Operand*)Operands[ErrorInfo];
+      if (Operand->getStartLoc().isValid()) {
+        SMRange OperandRange = Operand->getLocRange();
+        return Error(Operand->getStartLoc(), "invalid operand for instruction",
+                     OperandRange, MatchingInlineAsm);
+      }
+    }
+
+    return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+                 MatchingInlineAsm);
+  }
+
+  // If one instruction matched with a missing feature, report this as a
+  // missing feature.
+  if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) +
+      (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfoMissingFeature)*8-1); ++i) {
+      if (ErrorInfoMissingFeature & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfoMissingFeature & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm);
+  }
+
+  // If one instruction matched with an invalid operand, report this as an
+  // operand failure.
+  if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) +
+      (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){
+    Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+          MatchingInlineAsm);
+    return true;
+  }
+
+  // If all of these were an outright failure, report it in a useless way.
+  Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
+        EmptyRanges, MatchingInlineAsm);
+  return true;
+}
+
+
+bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".word")
+    return ParseDirectiveWord(2, DirectiveID.getLoc());
+  else if (IDVal.startswith(".code"))
+    return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
+  else if (IDVal.startswith(".att_syntax")) {
+    getParser().setAssemblerDialect(0);
+    return false;
+  } else if (IDVal.startswith(".intel_syntax")) {
+    getParser().setAssemblerDialect(1);
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      if(Parser.getTok().getString() == "noprefix") {
+        // FIXME : Handle noprefix
+        Parser.Lex();
+      } else
+        return true;
+    }
+    return false;
+  }
+  return true;
+}
+
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().parseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+/// ParseDirectiveCode
+///  ::= .code32 | .code64
+bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+  if (IDVal == ".code32") {
+    Parser.Lex();
+    if (is64BitMode()) {
+      SwitchMode();
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+    }
+  } else if (IDVal == ".code64") {
+    Parser.Lex();
+    if (!is64BitMode()) {
+      SwitchMode();
+      getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
+    }
+  } else {
+    return Error(L, "unexpected directive " + IDVal);
+  }
+
+  return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmParser() {
+  RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target);
+  RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#define GET_SUBTARGET_FEATURE_NAME
+#include "X86GenAsmMatcher.inc"
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
new file mode 100644
index 000000000000..ca6f80ce3e58
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -0,0 +1,814 @@
+//===-- X86Disassembler.cpp - Disassembler for x86 and x86_64 -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains code to translate the data produced by the decoder into
+//  MCInsts.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Disassembler.h"
+#include "X86DisassemblerDecoder.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+void x86DisassemblerDebug(const char *file,
+                          unsigned line,
+                          const char *s) {
+  dbgs() << file << ":" << line << ": " << s;
+}
+
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) {
+  const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
+  return MII->getName(Opcode);
+}
+
+#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s));
+
+namespace llvm {  
+  
+// Fill-ins to make the compiler happy.  These constants are never actually
+//   assigned; they are just filler to make an automatically-generated switch
+//   statement work.
+namespace X86 {
+  enum {
+    BX_SI = 500,
+    BX_DI = 501,
+    BP_SI = 502,
+    BP_DI = 503,
+    sib   = 504,
+    sib64 = 505
+  };
+}
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+}
+
+static bool translateInstruction(MCInst &target,
+                                InternalInstruction &source,
+                                const MCDisassembler *Dis);
+
+X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
+                                               DisassemblerMode mode,
+                                               const MCInstrInfo *MII)
+  : MCDisassembler(STI), MII(MII), fMode(mode) {}
+
+X86GenericDisassembler::~X86GenericDisassembler() {
+  delete MII;
+}
+
+/// regionReader - a callback function that wraps the readByte method from
+///   MemoryObject.
+///
+/// @param arg      - The generic callback parameter.  In this case, this should
+///                   be a pointer to a MemoryObject.
+/// @param byte     - A pointer to the byte to be read.
+/// @param address  - The address to be read.
+static int regionReader(const void* arg, uint8_t* byte, uint64_t address) {
+  const MemoryObject* region = static_cast<const MemoryObject*>(arg);
+  return region->readByte(address, byte);
+}
+
+/// logger - a callback function that wraps the operator<< method from
+///   raw_ostream.
+///
+/// @param arg      - The generic callback parameter.  This should be a pointe
+///                   to a raw_ostream.
+/// @param log      - A string to be logged.  logger() adds a newline.
+static void logger(void* arg, const char* log) {
+  if (!arg)
+    return;
+  
+  raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
+  vStream << log << "\n";
+}  
+  
+//
+// Public interface for the disassembler
+//
+
+MCDisassembler::DecodeStatus
+X86GenericDisassembler::getInstruction(MCInst &instr,
+                                       uint64_t &size,
+                                       const MemoryObject &region,
+                                       uint64_t address,
+                                       raw_ostream &vStream,
+                                       raw_ostream &cStream) const {
+  CommentStream = &cStream;
+
+  InternalInstruction internalInstr;
+
+  dlog_t loggerFn = logger;
+  if (&vStream == &nulls())
+    loggerFn = 0; // Disable logging completely if it's going to nulls().
+  
+  int ret = decodeInstruction(&internalInstr,
+                              regionReader,
+                              (const void*)&region,
+                              loggerFn,
+                              (void*)&vStream,
+                              (const void*)MII,
+                              address,
+                              fMode);
+
+  if (ret) {
+    size = internalInstr.readerCursor - address;
+    return Fail;
+  }
+  else {
+    size = internalInstr.length;
+    return (!translateInstruction(instr, internalInstr, this)) ?
+            Success : Fail;
+  }
+}
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+/// translateRegister - Translates an internal register to the appropriate LLVM
+///   register, and appends it as an operand to an MCInst.
+///
+/// @param mcInst     - The MCInst to append to.
+/// @param reg        - The Reg to append.
+static void translateRegister(MCInst &mcInst, Reg reg) {
+#define ENTRY(x) X86::x,
+  uint8_t llvmRegnums[] = {
+    ALL_REGS
+    0
+  };
+#undef ENTRY
+
+  uint8_t llvmRegnum = llvmRegnums[reg];
+  mcInst.addOperand(MCOperand::CreateReg(llvmRegnum));
+}
+
+/// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the
+/// immediate Value in the MCInst. 
+///
+/// @param Value      - The immediate Value, has had any PC adjustment made by
+///                     the caller.
+/// @param isBranch   - If the instruction is a branch instruction
+/// @param Address    - The starting address of the instruction
+/// @param Offset     - The byte offset to this immediate in the instruction
+/// @param Width      - The byte width of this immediate in the instruction
+///
+/// If the getOpInfo() function was set when setupForSymbolicDisassembly() was
+/// called then that function is called to get any symbolic information for the
+/// immediate in the instruction using the Address, Offset and Width.  If that
+/// returns non-zero then the symbolic information it returns is used to create 
+/// an MCExpr and that is added as an operand to the MCInst.  If getOpInfo()
+/// returns zero and isBranch is true then a symbol look up for immediate Value
+/// is done and if a symbol is found an MCExpr is created with that, else
+/// an MCExpr with the immediate Value is created.  This function returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
+                                     uint64_t Address, uint64_t Offset,
+                                     uint64_t Width, MCInst &MI, 
+                                     const MCDisassembler *Dis) {  
+  LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
+  struct LLVMOpInfo1 SymbolicOp;
+  memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+  SymbolicOp.Value = Value;
+  void *DisInfo = Dis->getDisInfoBlock();
+
+  if (!getOpInfo ||
+      !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) {
+    // Clear SymbolicOp.Value from above and also all other fields.
+    memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+    LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+    if (!SymbolLookUp)
+      return false;
+    uint64_t ReferenceType;
+    if (isBranch)
+       ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+    else
+       ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    const char *ReferenceName;
+    const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
+                                    &ReferenceName);
+    if (Name) {
+      SymbolicOp.AddSymbol.Name = Name;
+      SymbolicOp.AddSymbol.Present = true;
+    }
+    // For branches always create an MCExpr so it gets printed as hex address.
+    else if (isBranch) {
+      SymbolicOp.Value = Value;
+    }
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+      (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
+    if (!Name && !isBranch)
+      return false;
+  }
+
+  MCContext *Ctx = Dis->getMCContext();
+  const MCExpr *Add = NULL;
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx);
+    }
+  }
+
+  const MCExpr *Sub = NULL;
+  if (SymbolicOp.SubtractSymbol.Present) {
+      if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+    } else {
+      Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx);
+    }
+  }
+
+  const MCExpr *Off = NULL;
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+  const MCExpr *Expr;
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
+    else
+      LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off != 0)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::Create(0, *Ctx);
+  }
+
+  MI.addOperand(MCOperand::CreateExpr(Expr));
+
+  return true;
+}
+
+/// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
+/// referenced by a load instruction with the base register that is the rip.
+/// These can often be addresses in a literal pool.  The Address of the
+/// instruction and its immediate Value are used to determine the address
+/// being referenced in the literal pool entry.  The SymbolLookUp call back will
+/// return a pointer to a literal 'C' string if the referenced address is an 
+/// address into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
+                                            const void *Decoder) {
+  const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+  LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+  if (SymbolLookUp) {
+    void *DisInfo = Dis->getDisInfoBlock();
+    uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
+    const char *ReferenceName;
+    (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+      (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
+  }
+}
+
+/// translateImmediate  - Appends an immediate operand to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param immediate    - The immediate value to append.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The internal instruction.
+static void translateImmediate(MCInst &mcInst, uint64_t immediate,
+                               const OperandSpecifier &operand,
+                               InternalInstruction &insn,
+                               const MCDisassembler *Dis) {  
+  // Sign-extend the immediate if necessary.
+
+  OperandType type = (OperandType)operand.type;
+
+  bool isBranch = false;
+  uint64_t pcrel = 0;
+  if (type == TYPE_RELv) {
+    isBranch = true;
+    pcrel = insn.startLocation +
+            insn.immediateOffset + insn.immediateSize;
+    switch (insn.displacementSize) {
+    default:
+      break;
+    case 1:
+      type = TYPE_MOFFS8;
+      break;
+    case 2:
+      type = TYPE_MOFFS16;
+      break;
+    case 4:
+      type = TYPE_MOFFS32;
+      break;
+    case 8:
+      type = TYPE_MOFFS64;
+      break;
+    }
+  }
+  // By default sign-extend all X86 immediates based on their encoding.
+  else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
+           type == TYPE_IMM64) {
+    uint32_t Opcode = mcInst.getOpcode();
+    switch (operand.encoding) {
+    default:
+      break;
+    case ENCODING_IB:
+      // Special case those X86 instructions that use the imm8 as a set of
+      // bits, bit count, etc. and are not sign-extend.
+      if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri &&
+          Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
+          Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
+          Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
+          Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
+          Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
+          Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
+          Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
+          Opcode != X86::VINSERTPSrr)
+        type = TYPE_MOFFS8;
+      break;
+    case ENCODING_IW:
+      type = TYPE_MOFFS16;
+      break;
+    case ENCODING_ID:
+      type = TYPE_MOFFS32;
+      break;
+    case ENCODING_IO:
+      type = TYPE_MOFFS64;
+      break;
+    }
+  }
+
+  switch (type) {
+  case TYPE_XMM32:
+  case TYPE_XMM64:
+  case TYPE_XMM128:
+    mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
+    return;
+  case TYPE_XMM256:
+    mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
+    return;
+  case TYPE_REL8:
+    isBranch = true;
+    pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    // fall through to sign extend the immediate if needed.
+  case TYPE_MOFFS8:
+    if(immediate & 0x80)
+      immediate |= ~(0xffull);
+    break;
+  case TYPE_MOFFS16:
+    if(immediate & 0x8000)
+      immediate |= ~(0xffffull);
+    break;
+  case TYPE_REL32:
+  case TYPE_REL64:
+    isBranch = true;
+    pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
+    // fall through to sign extend the immediate if needed.
+  case TYPE_MOFFS32:
+    if(immediate & 0x80000000)
+      immediate |= ~(0xffffffffull);
+    break;
+  case TYPE_MOFFS64:
+  default:
+    // operand is 64 bits wide.  Do nothing.
+    break;
+  }
+    
+  if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
+                               insn.immediateOffset, insn.immediateSize,
+                               mcInst, Dis))
+    mcInst.addOperand(MCOperand::CreateImm(immediate));
+}
+
+/// translateRMRegister - Translates a register stored in the R/M field of the
+///   ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The internal instruction to extract the R/M field
+///                       from.
+/// @return             - 0 on success; -1 otherwise
+static bool translateRMRegister(MCInst &mcInst,
+                                InternalInstruction &insn) {
+  if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+    debug("A R/M register operand may not have a SIB byte");
+    return true;
+  }
+  
+  switch (insn.eaBase) {
+  default:
+    debug("Unexpected EA base register");
+    return true;
+  case EA_BASE_NONE:
+    debug("EA_BASE_NONE for ModR/M base");
+    return true;
+#define ENTRY(x) case EA_BASE_##x:
+  ALL_EA_BASES
+#undef ENTRY
+    debug("A R/M register operand may not have a base; "
+          "the operand must be a register.");
+    return true;
+#define ENTRY(x)                                                      \
+  case EA_REG_##x:                                                    \
+    mcInst.addOperand(MCOperand::CreateReg(X86::x)); break;
+  ALL_REGS
+#undef ENTRY
+  }
+  
+  return false;
+}
+
+/// translateRMMemory - Translates a memory operand stored in the Mod and R/M
+///   fields of an internal instruction (and possibly its SIB byte) to a memory
+///   operand in LLVM's format, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The instruction to extract Mod, R/M, and SIB fields
+///                       from.
+/// @return             - 0 on success; nonzero otherwise
+static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
+                              const MCDisassembler *Dis) {  
+  // Addresses in an MCInst are represented as five operands:
+  //   1. basereg       (register)  The R/M base, or (if there is a SIB) the 
+  //                                SIB base
+  //   2. scaleamount   (immediate) 1, or (if there is a SIB) the specified 
+  //                                scale amount
+  //   3. indexreg      (register)  x86_registerNONE, or (if there is a SIB)
+  //                                the index (which is multiplied by the 
+  //                                scale amount)
+  //   4. displacement  (immediate) 0, or the displacement if there is one
+  //   5. segmentreg    (register)  x86_registerNONE for now, but could be set
+  //                                if we have segment overrides
+  
+  MCOperand baseReg;
+  MCOperand scaleAmount;
+  MCOperand indexReg;
+  MCOperand displacement;
+  MCOperand segmentReg;
+  uint64_t pcrel = 0;
+  
+  if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+    if (insn.sibBase != SIB_BASE_NONE) {
+      switch (insn.sibBase) {
+      default:
+        debug("Unexpected sibBase");
+        return true;
+#define ENTRY(x)                                          \
+      case SIB_BASE_##x:                                  \
+        baseReg = MCOperand::CreateReg(X86::x); break;
+      ALL_SIB_BASES
+#undef ENTRY
+      }
+    } else {
+      baseReg = MCOperand::CreateReg(0);
+    }
+
+    // Check whether we are handling VSIB addressing mode for GATHER.
+    // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
+    // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
+    // I don't see a way to get the correct IndexReg in readSIB:
+    //   We can tell whether it is VSIB or SIB after instruction ID is decoded,
+    //   but instruction ID may not be decoded yet when calling readSIB.
+    uint32_t Opcode = mcInst.getOpcode();
+    bool IndexIs128 = (Opcode == X86::VGATHERDPDrm ||
+                       Opcode == X86::VGATHERDPDYrm ||
+                       Opcode == X86::VGATHERQPDrm ||
+                       Opcode == X86::VGATHERDPSrm ||
+                       Opcode == X86::VGATHERQPSrm ||
+                       Opcode == X86::VPGATHERDQrm ||
+                       Opcode == X86::VPGATHERDQYrm ||
+                       Opcode == X86::VPGATHERQQrm ||
+                       Opcode == X86::VPGATHERDDrm ||
+                       Opcode == X86::VPGATHERQDrm);
+    bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
+                       Opcode == X86::VGATHERDPSYrm ||
+                       Opcode == X86::VGATHERQPSYrm ||
+                       Opcode == X86::VPGATHERQQYrm ||
+                       Opcode == X86::VPGATHERDDYrm ||
+                       Opcode == X86::VPGATHERQDYrm);
+    if (IndexIs128 || IndexIs256) {
+      unsigned IndexOffset = insn.sibIndex -
+                         (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
+      SIBIndex IndexBase = IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
+      insn.sibIndex = (SIBIndex)(IndexBase + 
+                           (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
+    }
+
+    if (insn.sibIndex != SIB_INDEX_NONE) {
+      switch (insn.sibIndex) {
+      default:
+        debug("Unexpected sibIndex");
+        return true;
+#define ENTRY(x)                                          \
+      case SIB_INDEX_##x:                                 \
+        indexReg = MCOperand::CreateReg(X86::x); break;
+      EA_BASES_32BIT
+      EA_BASES_64BIT
+      REGS_XMM
+      REGS_YMM
+#undef ENTRY
+      }
+    } else {
+      indexReg = MCOperand::CreateReg(0);
+    }
+    
+    scaleAmount = MCOperand::CreateImm(insn.sibScale);
+  } else {
+    switch (insn.eaBase) {
+    case EA_BASE_NONE:
+      if (insn.eaDisplacement == EA_DISP_NONE) {
+        debug("EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
+        return true;
+      }
+      if (insn.mode == MODE_64BIT){
+        pcrel = insn.startLocation +
+                insn.displacementOffset + insn.displacementSize;
+        tryAddingPcLoadReferenceComment(insn.startLocation +
+                                        insn.displacementOffset,
+                                        insn.displacement + pcrel, Dis);
+        baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6
+      }
+      else
+        baseReg = MCOperand::CreateReg(0);
+      
+      indexReg = MCOperand::CreateReg(0);
+      break;
+    case EA_BASE_BX_SI:
+      baseReg = MCOperand::CreateReg(X86::BX);
+      indexReg = MCOperand::CreateReg(X86::SI);
+      break;
+    case EA_BASE_BX_DI:
+      baseReg = MCOperand::CreateReg(X86::BX);
+      indexReg = MCOperand::CreateReg(X86::DI);
+      break;
+    case EA_BASE_BP_SI:
+      baseReg = MCOperand::CreateReg(X86::BP);
+      indexReg = MCOperand::CreateReg(X86::SI);
+      break;
+    case EA_BASE_BP_DI:
+      baseReg = MCOperand::CreateReg(X86::BP);
+      indexReg = MCOperand::CreateReg(X86::DI);
+      break;
+    default:
+      indexReg = MCOperand::CreateReg(0);
+      switch (insn.eaBase) {
+      default:
+        debug("Unexpected eaBase");
+        return true;
+        // Here, we will use the fill-ins defined above.  However,
+        //   BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
+        //   sib and sib64 were handled in the top-level if, so they're only
+        //   placeholders to keep the compiler happy.
+#define ENTRY(x)                                        \
+      case EA_BASE_##x:                                 \
+        baseReg = MCOperand::CreateReg(X86::x); break; 
+      ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) case EA_REG_##x:
+      ALL_REGS
+#undef ENTRY
+        debug("A R/M memory operand may not be a register; "
+              "the base field must be a base.");
+        return true;
+      }
+    }
+    
+    scaleAmount = MCOperand::CreateImm(1);
+  }
+  
+  displacement = MCOperand::CreateImm(insn.displacement);
+  
+  static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
+    0,        // SEG_OVERRIDE_NONE
+    X86::CS,
+    X86::SS,
+    X86::DS,
+    X86::ES,
+    X86::FS,
+    X86::GS
+  };
+  
+  segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
+  
+  mcInst.addOperand(baseReg);
+  mcInst.addOperand(scaleAmount);
+  mcInst.addOperand(indexReg);
+  if(!tryAddingSymbolicOperand(insn.displacement + pcrel, false,
+                               insn.startLocation, insn.displacementOffset,
+                               insn.displacementSize, mcInst, Dis))
+    mcInst.addOperand(displacement);
+  mcInst.addOperand(segmentReg);
+  return false;
+}
+
+/// translateRM - Translates an operand stored in the R/M (and possibly SIB)
+///   byte of an instruction to LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The instruction to extract Mod, R/M, and SIB fields
+///                       from.
+/// @return             - 0 on success; nonzero otherwise
+static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
+                        InternalInstruction &insn, const MCDisassembler *Dis) {  
+  switch (operand.type) {
+  default:
+    debug("Unexpected type for a R/M operand");
+    return true;
+  case TYPE_R8:
+  case TYPE_R16:
+  case TYPE_R32:
+  case TYPE_R64:
+  case TYPE_Rv:
+  case TYPE_MM:
+  case TYPE_MM32:
+  case TYPE_MM64:
+  case TYPE_XMM:
+  case TYPE_XMM32:
+  case TYPE_XMM64:
+  case TYPE_XMM128:
+  case TYPE_XMM256:
+  case TYPE_DEBUGREG:
+  case TYPE_CONTROLREG:
+    return translateRMRegister(mcInst, insn);
+  case TYPE_M:
+  case TYPE_M8:
+  case TYPE_M16:
+  case TYPE_M32:
+  case TYPE_M64:
+  case TYPE_M128:
+  case TYPE_M256:
+  case TYPE_M512:
+  case TYPE_Mv:
+  case TYPE_M32FP:
+  case TYPE_M64FP:
+  case TYPE_M80FP:
+  case TYPE_M16INT:
+  case TYPE_M32INT:
+  case TYPE_M64INT:
+  case TYPE_M1616:
+  case TYPE_M1632:
+  case TYPE_M1664:
+  case TYPE_LEA:
+    return translateRMMemory(mcInst, insn, Dis);
+  }
+}
+  
+/// translateFPRegister - Translates a stack position on the FPU stack to its
+///   LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param stackPos     - The stack position to translate.
+/// @return             - 0 on success; nonzero otherwise.
+static bool translateFPRegister(MCInst &mcInst,
+                               uint8_t stackPos) {
+  if (stackPos >= 8) {
+    debug("Invalid FP stack position");
+    return true;
+  }
+  
+  mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos));
+
+  return false;
+}
+
+/// translateOperand - Translates an operand stored in an internal instruction 
+///   to LLVM's format and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The internal instruction.
+/// @return             - false on success; true otherwise.
+static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
+                             InternalInstruction &insn,
+                             const MCDisassembler *Dis) {  
+  switch (operand.encoding) {
+  default:
+    debug("Unhandled operand encoding during translation");
+    return true;
+  case ENCODING_REG:
+    translateRegister(mcInst, insn.reg);
+    return false;
+  case ENCODING_RM:
+    return translateRM(mcInst, operand, insn, Dis);
+  case ENCODING_CB:
+  case ENCODING_CW:
+  case ENCODING_CD:
+  case ENCODING_CP:
+  case ENCODING_CO:
+  case ENCODING_CT:
+    debug("Translation of code offsets isn't supported.");
+    return true;
+  case ENCODING_IB:
+  case ENCODING_IW:
+  case ENCODING_ID:
+  case ENCODING_IO:
+  case ENCODING_Iv:
+  case ENCODING_Ia:
+    translateImmediate(mcInst,
+                       insn.immediates[insn.numImmediatesTranslated++],
+                       operand,
+                       insn,
+                       Dis);
+    return false;
+  case ENCODING_RB:
+  case ENCODING_RW:
+  case ENCODING_RD:
+  case ENCODING_RO:
+    translateRegister(mcInst, insn.opcodeRegister);
+    return false;
+  case ENCODING_I:
+    return translateFPRegister(mcInst, insn.opcodeModifier);
+  case ENCODING_Rv:
+    translateRegister(mcInst, insn.opcodeRegister);
+    return false;
+  case ENCODING_VVVV:
+    translateRegister(mcInst, insn.vvvv);
+    return false;
+  case ENCODING_DUP:
+    return translateOperand(mcInst, insn.operands[operand.type - TYPE_DUP0],
+                            insn, Dis);
+  }
+}
+  
+/// translateInstruction - Translates an internal instruction and all its
+///   operands to an MCInst.
+///
+/// @param mcInst       - The MCInst to populate with the instruction's data.
+/// @param insn         - The internal instruction.
+/// @return             - false on success; true otherwise.
+static bool translateInstruction(MCInst &mcInst,
+                                InternalInstruction &insn,
+                                const MCDisassembler *Dis) {  
+  if (!insn.spec) {
+    debug("Instruction has no specification");
+    return true;
+  }
+  
+  mcInst.setOpcode(insn.instructionID);
+  
+  int index;
+  
+  insn.numImmediatesTranslated = 0;
+  
+  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
+    if (insn.operands[index].encoding != ENCODING_NONE) {
+      if (translateOperand(mcInst, insn.operands[index], insn, Dis)) {
+        return true;
+      }
+    }
+  }
+  
+  return false;
+}
+
+static MCDisassembler *createX86_32Disassembler(const Target &T,
+                                                const MCSubtargetInfo &STI) {
+  return new X86Disassembler::X86GenericDisassembler(STI, MODE_32BIT,
+                                                     T.createMCInstrInfo());
+}
+
+static MCDisassembler *createX86_64Disassembler(const Target &T,
+                                                const MCSubtargetInfo &STI) {
+  return new X86Disassembler::X86GenericDisassembler(STI, MODE_64BIT,
+                                                     T.createMCInstrInfo());
+}
+
+extern "C" void LLVMInitializeX86Disassembler() { 
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheX86_32Target, 
+                                         createX86_32Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheX86_64Target,
+                                         createX86_64Disassembler);
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
new file mode 100644
index 000000000000..b92427a7e91a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -0,0 +1,131 @@
+//===-- X86Disassembler.h - Disassembler for x86 and x86_64 -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets.  The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+//    These attributes, recorded in enum attributeBits
+//    (X86DisassemblerDecoderCommon.h), form a bitmask.  The table CONTEXTS_SYM
+//    provides a mapping from bitmasks to contexts, which are represented by
+//    enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is.  The
+//    disassembler distinguishes four kinds of opcodes, which are enumerated in
+//    OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+//    (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a
+//    (0x0f 0x3a 0xnn).  Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+//    (X86DisassemblerDecoderCommon.h).  Use the opcode class to determine which
+//    OpcodeDecision (ibid.) to look the opcode in.  Look up the opcode, to get
+//    a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+//    instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+//    ModR/M byte to complete decode.  The ModRMDecision's type is an entry from
+//    ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+//    ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+//    of type InstrUID (X86DisassemblerDecoderCommon.h).  Looking this ID up in
+//    INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+//    meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+//    (X86DisassemblerDecoderCommon.h) and its type is an entry from
+//    OperandType (ibid.).  The encoding indicates how to read it from the
+//    instruction; the type indicates how to interpret the value once it has
+//    been read.  For example, a register operand could be stored in the R/M
+//    field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+//    the main opcode.  This is orthogonal from its meaning (an GPR or an XMM
+//    register, for instance).  Given this information, the operands can be
+//    extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+//    and operands into a format understandable by the client - in this case, an
+//    MCInst for use by the MC infrastructure.
+//
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself.  The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.h contains the public interface for the disassembler,
+//   adhering to the MCDisassembler interface.
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+//   invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+//   table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+//   factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+//   responsible for steps 1-6.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86DISASSEMBLER_H
+#define X86DISASSEMBLER_H
+
+#define INSTRUCTION_SPECIFIER_FIELDS \
+  uint16_t operands;
+
+#define INSTRUCTION_IDS               \
+  uint16_t instructionIDs;
+
+#include "X86DisassemblerDecoderCommon.h"
+
+#undef INSTRUCTION_SPECIFIER_FIELDS
+#undef INSTRUCTION_IDS
+
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class MCInstrInfo;
+class MCSubtargetInfo;
+class MemoryObject;
+class raw_ostream;
+
+namespace X86Disassembler {
+
+/// X86GenericDisassembler - Generic disassembler for all X86 platforms.
+///   All each platform class should have to do is subclass the constructor, and
+///   provide a different disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+  const MCInstrInfo *MII;
+public:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  /// @param mode     - The X86 architecture mode to decode for.
+  X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode,
+                         const MCInstrInfo *MII);
+private:
+  ~X86GenericDisassembler();
+public:
+
+  /// getInstruction - See MCDisassembler.
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const;
+
+private:
+  DisassemblerMode              fMode;
+};
+
+} // namespace X86Disassembler
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
new file mode 100644
index 000000000000..e40edba6d689
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -0,0 +1,1676 @@
+/*===-- X86DisassemblerDecoder.c - Disassembler decoder ------------*- C -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains the implementation of the instruction decoder.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include <stdarg.h>   /* for va_*()       */
+#include <stdio.h>    /* for vsnprintf()  */
+#include <stdlib.h>   /* for exit()       */
+#include <string.h>   /* for memset()     */
+
+#include "X86DisassemblerDecoder.h"
+
+#include "X86GenDisassemblerTables.inc"
+
+#define TRUE  1
+#define FALSE 0
+
+typedef int8_t bool;
+
+#ifndef NDEBUG
+#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
+#else
+#define debug(s) do { } while (0)
+#endif
+
+
+/*
+ * contextForAttrs - Client for the instruction context table.  Takes a set of
+ *   attributes and returns the appropriate decode context.
+ *
+ * @param attrMask  - Attributes, from the enumeration attributeBits.
+ * @return          - The InstructionContext to use when looking up an
+ *                    an instruction with these attributes.
+ */
+static InstructionContext contextForAttrs(uint8_t attrMask) {
+  return CONTEXTS_SYM[attrMask];
+}
+
+/*
+ * modRMRequired - Reads the appropriate instruction table to determine whether
+ *   the ModR/M byte is required to decode a particular instruction.
+ *
+ * @param type        - The opcode type (i.e., how many bytes it has).
+ * @param insnContext - The context for the instruction, as returned by
+ *                      contextForAttrs.
+ * @param opcode      - The last byte of the instruction's opcode, not counting
+ *                      ModR/M extensions and escapes.
+ * @return            - TRUE if the ModR/M byte is required, FALSE otherwise.
+ */
+static int modRMRequired(OpcodeType type,
+                         InstructionContext insnContext,
+                         uint8_t opcode) {
+  const struct ContextDecision* decision = 0;
+
+  switch (type) {
+  case ONEBYTE:
+    decision = &ONEBYTE_SYM;
+    break;
+  case TWOBYTE:
+    decision = &TWOBYTE_SYM;
+    break;
+  case THREEBYTE_38:
+    decision = &THREEBYTE38_SYM;
+    break;
+  case THREEBYTE_3A:
+    decision = &THREEBYTE3A_SYM;
+    break;
+  case THREEBYTE_A6:
+    decision = &THREEBYTEA6_SYM;
+    break;
+  case THREEBYTE_A7:
+    decision = &THREEBYTEA7_SYM;
+    break;
+  }
+
+  return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
+    modrm_type != MODRM_ONEENTRY;
+}
+
+/*
+ * decode - Reads the appropriate instruction table to obtain the unique ID of
+ *   an instruction.
+ *
+ * @param type        - See modRMRequired().
+ * @param insnContext - See modRMRequired().
+ * @param opcode      - See modRMRequired().
+ * @param modRM       - The ModR/M byte if required, or any value if not.
+ * @return            - The UID of the instruction, or 0 on failure.
+ */
+static InstrUID decode(OpcodeType type,
+                       InstructionContext insnContext,
+                       uint8_t opcode,
+                       uint8_t modRM) {
+  const struct ModRMDecision* dec = 0;
+
+  switch (type) {
+  case ONEBYTE:
+    dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case TWOBYTE:
+    dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_38:
+    dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_3A:
+    dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_A6:
+    dec = &THREEBYTEA6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_A7:
+    dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  }
+
+  switch (dec->modrm_type) {
+  default:
+    debug("Corrupt table!  Unknown modrm_type");
+    return 0;
+  case MODRM_ONEENTRY:
+    return modRMTable[dec->instructionIDs];
+  case MODRM_SPLITRM:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+1];
+    return modRMTable[dec->instructionIDs];
+  case MODRM_SPLITREG:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
+    return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+  case MODRM_SPLITMISC:
+    if (modFromModRM(modRM) == 0x3)
+      return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8];
+    return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+  case MODRM_FULL:
+    return modRMTable[dec->instructionIDs+modRM];
+  }
+}
+
+/*
+ * specifierForUID - Given a UID, returns the name and operand specification for
+ *   that instruction.
+ *
+ * @param uid - The unique ID for the instruction.  This should be returned by
+ *              decode(); specifierForUID will not check bounds.
+ * @return    - A pointer to the specification for that instruction.
+ */
+static const struct InstructionSpecifier *specifierForUID(InstrUID uid) {
+  return &INSTRUCTIONS_SYM[uid];
+}
+
+/*
+ * consumeByte - Uses the reader function provided by the user to consume one
+ *   byte from the instruction's memory and advance the cursor.
+ *
+ * @param insn  - The instruction with the reader function to use.  The cursor
+ *                for this instruction is advanced.
+ * @param byte  - A pointer to a pre-allocated memory buffer to be populated
+ *                with the data read.
+ * @return      - 0 if the read was successful; nonzero otherwise.
+ */
+static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
+  int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
+
+  if (!ret)
+    ++(insn->readerCursor);
+
+  return ret;
+}
+
+/*
+ * lookAtByte - Like consumeByte, but does not advance the cursor.
+ *
+ * @param insn  - See consumeByte().
+ * @param byte  - See consumeByte().
+ * @return      - See consumeByte().
+ */
+static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
+  return insn->reader(insn->readerArg, byte, insn->readerCursor);
+}
+
+static void unconsumeByte(struct InternalInstruction* insn) {
+  insn->readerCursor--;
+}
+
+#define CONSUME_FUNC(name, type)                                  \
+  static int name(struct InternalInstruction* insn, type* ptr) {  \
+    type combined = 0;                                            \
+    unsigned offset;                                              \
+    for (offset = 0; offset < sizeof(type); ++offset) {           \
+      uint8_t byte;                                               \
+      int ret = insn->reader(insn->readerArg,                     \
+                             &byte,                               \
+                             insn->readerCursor + offset);        \
+      if (ret)                                                    \
+        return ret;                                               \
+      combined = combined | ((uint64_t)byte << (offset * 8));     \
+    }                                                             \
+    *ptr = combined;                                              \
+    insn->readerCursor += sizeof(type);                           \
+    return 0;                                                     \
+  }
+
+/*
+ * consume* - Use the reader function provided by the user to consume data
+ *   values of various sizes from the instruction's memory and advance the
+ *   cursor appropriately.  These readers perform endian conversion.
+ *
+ * @param insn    - See consumeByte().
+ * @param ptr     - A pointer to a pre-allocated memory of appropriate size to
+ *                  be populated with the data read.
+ * @return        - See consumeByte().
+ */
+CONSUME_FUNC(consumeInt8, int8_t)
+CONSUME_FUNC(consumeInt16, int16_t)
+CONSUME_FUNC(consumeInt32, int32_t)
+CONSUME_FUNC(consumeUInt16, uint16_t)
+CONSUME_FUNC(consumeUInt32, uint32_t)
+CONSUME_FUNC(consumeUInt64, uint64_t)
+
+/*
+ * dbgprintf - Uses the logging function provided by the user to log a single
+ *   message, typically without a carriage-return.
+ *
+ * @param insn    - The instruction containing the logging function.
+ * @param format  - See printf().
+ * @param ...     - See printf().
+ */
+static void dbgprintf(struct InternalInstruction* insn,
+                      const char* format,
+                      ...) {
+  char buffer[256];
+  va_list ap;
+
+  if (!insn->dlog)
+    return;
+
+  va_start(ap, format);
+  (void)vsnprintf(buffer, sizeof(buffer), format, ap);
+  va_end(ap);
+
+  insn->dlog(insn->dlogArg, buffer);
+
+  return;
+}
+
+/*
+ * setPrefixPresent - Marks that a particular prefix is present at a particular
+ *   location.
+ *
+ * @param insn      - The instruction to be marked as having the prefix.
+ * @param prefix    - The prefix that is present.
+ * @param location  - The location where the prefix is located (in the address
+ *                    space of the instruction's reader).
+ */
+static void setPrefixPresent(struct InternalInstruction* insn,
+                                    uint8_t prefix,
+                                    uint64_t location)
+{
+  insn->prefixPresent[prefix] = 1;
+  insn->prefixLocations[prefix] = location;
+}
+
+/*
+ * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
+ *   present at a given location.
+ *
+ * @param insn      - The instruction to be queried.
+ * @param prefix    - The prefix.
+ * @param location  - The location to query.
+ * @return          - Whether the prefix is at that location.
+ */
+static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
+                               uint8_t prefix,
+                               uint64_t location)
+{
+  if (insn->prefixPresent[prefix] == 1 &&
+     insn->prefixLocations[prefix] == location)
+    return TRUE;
+  else
+    return FALSE;
+}
+
+/*
+ * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
+ *   instruction as having them.  Also sets the instruction's default operand,
+ *   address, and other relevant data sizes to report operands correctly.
+ *
+ * @param insn  - The instruction whose prefixes are to be read.
+ * @return      - 0 if the instruction could be read until the end of the prefix
+ *                bytes, and no prefixes conflicted; nonzero otherwise.
+ */
+static int readPrefixes(struct InternalInstruction* insn) {
+  BOOL isPrefix = TRUE;
+  BOOL prefixGroups[4] = { FALSE };
+  uint64_t prefixLocation;
+  uint8_t byte = 0;
+
+  BOOL hasAdSize = FALSE;
+  BOOL hasOpSize = FALSE;
+
+  dbgprintf(insn, "readPrefixes()");
+
+  while (isPrefix) {
+    prefixLocation = insn->readerCursor;
+
+    if (consumeByte(insn, &byte))
+      return -1;
+
+    /*
+     * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
+     * break and let it be disassembled as a normal "instruction".
+     */
+    if (insn->readerCursor - 1 == insn->startLocation
+        && (byte == 0xf0 || byte == 0xf2 || byte == 0xf3)) {
+      uint8_t nextByte;
+      if (byte == 0xf0)
+        break;
+      if (lookAtByte(insn, &nextByte))
+        return -1;
+      if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
+        if (consumeByte(insn, &nextByte))
+          return -1;
+        if (lookAtByte(insn, &nextByte))
+          return -1;
+        unconsumeByte(insn);
+      }
+      if (nextByte != 0x0f && nextByte != 0x90)
+        break;
+    }
+
+    switch (byte) {
+    case 0xf0:  /* LOCK */
+    case 0xf2:  /* REPNE/REPNZ */
+    case 0xf3:  /* REP or REPE/REPZ */
+      if (prefixGroups[0])
+        dbgprintf(insn, "Redundant Group 1 prefix");
+      prefixGroups[0] = TRUE;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x2e:  /* CS segment override -OR- Branch not taken */
+    case 0x36:  /* SS segment override -OR- Branch taken */
+    case 0x3e:  /* DS segment override */
+    case 0x26:  /* ES segment override */
+    case 0x64:  /* FS segment override */
+    case 0x65:  /* GS segment override */
+      switch (byte) {
+      case 0x2e:
+        insn->segmentOverride = SEG_OVERRIDE_CS;
+        break;
+      case 0x36:
+        insn->segmentOverride = SEG_OVERRIDE_SS;
+        break;
+      case 0x3e:
+        insn->segmentOverride = SEG_OVERRIDE_DS;
+        break;
+      case 0x26:
+        insn->segmentOverride = SEG_OVERRIDE_ES;
+        break;
+      case 0x64:
+        insn->segmentOverride = SEG_OVERRIDE_FS;
+        break;
+      case 0x65:
+        insn->segmentOverride = SEG_OVERRIDE_GS;
+        break;
+      default:
+        debug("Unhandled override");
+        return -1;
+      }
+      if (prefixGroups[1])
+        dbgprintf(insn, "Redundant Group 2 prefix");
+      prefixGroups[1] = TRUE;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x66:  /* Operand-size override */
+      if (prefixGroups[2])
+        dbgprintf(insn, "Redundant Group 3 prefix");
+      prefixGroups[2] = TRUE;
+      hasOpSize = TRUE;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x67:  /* Address-size override */
+      if (prefixGroups[3])
+        dbgprintf(insn, "Redundant Group 4 prefix");
+      prefixGroups[3] = TRUE;
+      hasAdSize = TRUE;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    default:    /* Not a prefix byte */
+      isPrefix = FALSE;
+      break;
+    }
+
+    if (isPrefix)
+      dbgprintf(insn, "Found prefix 0x%hhx", byte);
+  }
+
+  insn->vexSize = 0;
+
+  if (byte == 0xc4) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of VEX");
+      return -1;
+    }
+
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+      insn->vexSize = 3;
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+    else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+
+    if (insn->vexSize == 3) {
+      insn->vexPrefix[0] = byte;
+      consumeByte(insn, &insn->vexPrefix[1]);
+      consumeByte(insn, &insn->vexPrefix[2]);
+
+      /* We simulate the REX prefix for simplicity's sake */
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromVEX3of3(insn->vexPrefix[2]) << 3)
+                        | (rFromVEX2of3(insn->vexPrefix[1]) << 2)
+                        | (xFromVEX2of3(insn->vexPrefix[1]) << 1)
+                        | (bFromVEX2of3(insn->vexPrefix[1]) << 0);
+      }
+
+      switch (ppFromVEX3of3(insn->vexPrefix[2]))
+      {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = TRUE;
+        break;
+      }
+
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]);
+    }
+  }
+  else if (byte == 0xc5) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of VEX");
+      return -1;
+    }
+
+    if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+      insn->vexSize = 2;
+    }
+    else {
+      unconsumeByte(insn);
+    }
+
+    if (insn->vexSize == 2) {
+      insn->vexPrefix[0] = byte;
+      consumeByte(insn, &insn->vexPrefix[1]);
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (rFromVEX2of2(insn->vexPrefix[1]) << 2);
+      }
+
+      switch (ppFromVEX2of2(insn->vexPrefix[1]))
+      {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = TRUE;
+        break;
+      }
+
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]);
+    }
+  }
+  else {
+    if (insn->mode == MODE_64BIT) {
+      if ((byte & 0xf0) == 0x40) {
+        uint8_t opcodeByte;
+
+        if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
+          dbgprintf(insn, "Redundant REX prefix");
+          return -1;
+        }
+
+        insn->rexPrefix = byte;
+        insn->necessaryPrefixLocation = insn->readerCursor - 2;
+
+        dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
+      } else {
+        unconsumeByte(insn);
+        insn->necessaryPrefixLocation = insn->readerCursor - 1;
+      }
+    } else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+  }
+
+  if (insn->mode == MODE_16BIT) {
+    insn->registerSize       = (hasOpSize ? 4 : 2);
+    insn->addressSize        = (hasAdSize ? 4 : 2);
+    insn->displacementSize   = (hasAdSize ? 4 : 2);
+    insn->immediateSize      = (hasOpSize ? 4 : 2);
+  } else if (insn->mode == MODE_32BIT) {
+    insn->registerSize       = (hasOpSize ? 2 : 4);
+    insn->addressSize        = (hasAdSize ? 2 : 4);
+    insn->displacementSize   = (hasAdSize ? 2 : 4);
+    insn->immediateSize      = (hasOpSize ? 2 : 4);
+  } else if (insn->mode == MODE_64BIT) {
+    if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+      insn->registerSize       = 8;
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = 4;
+      insn->immediateSize      = 4;
+    } else if (insn->rexPrefix) {
+      insn->registerSize       = (hasOpSize ? 2 : 4);
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = (hasOpSize ? 2 : 4);
+      insn->immediateSize      = (hasOpSize ? 2 : 4);
+    } else {
+      insn->registerSize       = (hasOpSize ? 2 : 4);
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = (hasOpSize ? 2 : 4);
+      insn->immediateSize      = (hasOpSize ? 2 : 4);
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
+ *   extended or escape opcodes).
+ *
+ * @param insn  - The instruction whose opcode is to be read.
+ * @return      - 0 if the opcode could be read successfully; nonzero otherwise.
+ */
+static int readOpcode(struct InternalInstruction* insn) {
+  /* Determine the length of the primary opcode */
+
+  uint8_t current;
+
+  dbgprintf(insn, "readOpcode()");
+
+  insn->opcodeType = ONEBYTE;
+
+  if (insn->vexSize == 3)
+  {
+    switch (mmmmmFromVEX2of3(insn->vexPrefix[1]))
+    {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1]));
+      return -1;
+    case 0:
+      break;
+    case VEX_LOB_0F:
+      insn->twoByteEscape = 0x0f;
+      insn->opcodeType = TWOBYTE;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F38:
+      insn->twoByteEscape = 0x0f;
+      insn->threeByteEscape = 0x38;
+      insn->opcodeType = THREEBYTE_38;
+      return consumeByte(insn, &insn->opcode);
+    case VEX_LOB_0F3A:
+      insn->twoByteEscape = 0x0f;
+      insn->threeByteEscape = 0x3a;
+      insn->opcodeType = THREEBYTE_3A;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
+  else if (insn->vexSize == 2)
+  {
+    insn->twoByteEscape = 0x0f;
+    insn->opcodeType = TWOBYTE;
+    return consumeByte(insn, &insn->opcode);
+  }
+
+  if (consumeByte(insn, &current))
+    return -1;
+
+  if (current == 0x0f) {
+    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+
+    insn->twoByteEscape = current;
+
+    if (consumeByte(insn, &current))
+      return -1;
+
+    if (current == 0x38) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      insn->threeByteEscape = current;
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_38;
+    } else if (current == 0x3a) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      insn->threeByteEscape = current;
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_3A;
+    } else if (current == 0xa6) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      insn->threeByteEscape = current;
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_A6;
+    } else if (current == 0xa7) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+
+      insn->threeByteEscape = current;
+
+      if (consumeByte(insn, &current))
+        return -1;
+
+      insn->opcodeType = THREEBYTE_A7;
+    } else {
+      dbgprintf(insn, "Didn't find a three-byte escape prefix");
+
+      insn->opcodeType = TWOBYTE;
+    }
+  }
+
+  /*
+   * At this point we have consumed the full opcode.
+   * Anything we consume from here on must be unconsumed.
+   */
+
+  insn->opcode = current;
+
+  return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ *   the ModR/M byte as appropriate for extended and escape opcodes,
+ *   and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ *                        instruction.
+ * @param insn          - The instruction whose ID is to be determined.
+ * @param attrMask      - The attribute mask to search.
+ * @return              - 0 if the ModR/M could be read when needed or was not
+ *                        needed; nonzero otherwise.
+ */
+static int getIDWithAttrMask(uint16_t* instructionID,
+                             struct InternalInstruction* insn,
+                             uint8_t attrMask) {
+  BOOL hasModRMExtension;
+
+  uint8_t instructionClass;
+
+  instructionClass = contextForAttrs(attrMask);
+
+  hasModRMExtension = modRMRequired(insn->opcodeType,
+                                    instructionClass,
+                                    insn->opcode);
+
+  if (hasModRMExtension) {
+    if (readModRM(insn))
+      return -1;
+
+    *instructionID = decode(insn->opcodeType,
+                            instructionClass,
+                            insn->opcode,
+                            insn->modRM);
+  } else {
+    *instructionID = decode(insn->opcodeType,
+                            instructionClass,
+                            insn->opcode,
+                            0);
+  }
+
+  return 0;
+}
+
+/*
+ * is16BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 16-bit whereas the other is not.
+ *
+ * @param orig  - The instruction that is not 16-bit
+ * @param equiv - The instruction that is 16-bit
+ */
+static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
+  off_t i;
+
+  for (i = 0;; i++) {
+    if (orig[i] == '\0' && equiv[i] == '\0')
+      return TRUE;
+    if (orig[i] == '\0' || equiv[i] == '\0')
+      return FALSE;
+    if (orig[i] != equiv[i]) {
+      if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+        continue;
+      if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+        continue;
+      if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+        continue;
+      return FALSE;
+    }
+  }
+}
+
+/*
+ * getID - Determines the ID of an instruction, consuming the ModR/M byte as
+ *   appropriate for extended and escape opcodes.  Determines the attributes and
+ *   context for the instruction before doing so.
+ *
+ * @param insn  - The instruction whose ID is to be determined.
+ * @return      - 0 if the ModR/M could be read when needed or was not needed;
+ *                nonzero otherwise.
+ */
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
+  uint8_t attrMask;
+  uint16_t instructionID;
+
+  dbgprintf(insn, "getID()");
+
+  attrMask = ATTR_NONE;
+
+  if (insn->mode == MODE_64BIT)
+    attrMask |= ATTR_64BIT;
+
+  if (insn->vexSize) {
+    attrMask |= ATTR_VEX;
+
+    if (insn->vexSize == 3) {
+      switch (ppFromVEX3of3(insn->vexPrefix[2])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromVEX3of3(insn->vexPrefix[2]))
+        attrMask |= ATTR_VEXL;
+    }
+    else if (insn->vexSize == 2) {
+      switch (ppFromVEX2of2(insn->vexPrefix[1])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromVEX2of2(insn->vexPrefix[1]))
+        attrMask |= ATTR_VEXL;
+    }
+    else {
+      return -1;
+    }
+  }
+  else {
+    if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_OPSIZE;
+    else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_ADSIZE;
+    else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_XS;
+    else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
+      attrMask |= ATTR_XD;
+  }
+
+  if (insn->rexPrefix & 0x08)
+    attrMask |= ATTR_REXW;
+
+  if (getIDWithAttrMask(&instructionID, insn, attrMask))
+    return -1;
+
+  /* The following clauses compensate for limitations of the tables. */
+
+  if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW) &&
+      !(attrMask & ATTR_OPSIZE)) {
+    /*
+     * Some VEX instructions ignore the L-bit, but use the W-bit. Normally L-bit
+     * has precedence since there are no L-bit with W-bit entries in the tables.
+     * So if the L-bit isn't significant we should use the W-bit instead.
+     * We only need to do this if the instruction doesn't specify OpSize since
+     * there is a VEX_L_W_OPSIZE table.
+     */
+
+    const struct InstructionSpecifier *spec;
+    uint16_t instructionIDWithWBit;
+    const struct InstructionSpecifier *specWithWBit;
+
+    spec = specifierForUID(instructionID);
+
+    if (getIDWithAttrMask(&instructionIDWithWBit,
+                          insn,
+                          (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) {
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+
+    specWithWBit = specifierForUID(instructionIDWithWBit);
+
+    if (instructionID != instructionIDWithWBit) {
+      insn->instructionID = instructionIDWithWBit;
+      insn->spec = specWithWBit;
+    } else {
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+    }
+    return 0;
+  }
+
+  if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
+    /*
+     * The instruction tables make no distinction between instructions that
+     * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
+     * particular spot (i.e., many MMX operations).  In general we're
+     * conservative, but in the specific case where OpSize is present but not
+     * in the right place we check if there's a 16-bit operation.
+     */
+
+    const struct InstructionSpecifier *spec;
+    uint16_t instructionIDWithOpsize;
+    const char *specName, *specWithOpSizeName;
+
+    spec = specifierForUID(instructionID);
+
+    if (getIDWithAttrMask(&instructionIDWithOpsize,
+                          insn,
+                          attrMask | ATTR_OPSIZE)) {
+      /*
+       * ModRM required with OpSize but not present; give up and return version
+       * without OpSize set
+       */
+
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+
+    specName = x86DisassemblerGetInstrName(instructionID, miiArg);
+    specWithOpSizeName =
+      x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
+
+    if (is16BitEquivalent(specName, specWithOpSizeName)) {
+      insn->instructionID = instructionIDWithOpsize;
+      insn->spec = specifierForUID(instructionIDWithOpsize);
+    } else {
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+    }
+    return 0;
+  }
+
+  if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+      insn->rexPrefix & 0x01) {
+    /*
+     * NOOP shouldn't decode as NOOP if REX.b is set. Instead
+     * it should decode as XCHG %r8, %eax.
+     */
+
+    const struct InstructionSpecifier *spec;
+    uint16_t instructionIDWithNewOpcode;
+    const struct InstructionSpecifier *specWithNewOpcode;
+
+    spec = specifierForUID(instructionID);
+
+    /* Borrow opcode from one of the other XCHGar opcodes */
+    insn->opcode = 0x91;
+
+    if (getIDWithAttrMask(&instructionIDWithNewOpcode,
+                          insn,
+                          attrMask)) {
+      insn->opcode = 0x90;
+
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+
+    specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode);
+
+    /* Change back */
+    insn->opcode = 0x90;
+
+    insn->instructionID = instructionIDWithNewOpcode;
+    insn->spec = specWithNewOpcode;
+
+    return 0;
+  }
+
+  insn->instructionID = instructionID;
+  insn->spec = specifierForUID(insn->instructionID);
+
+  return 0;
+}
+
+/*
+ * readSIB - Consumes the SIB byte to determine addressing information for an
+ *   instruction.
+ *
+ * @param insn  - The instruction whose SIB byte is to be read.
+ * @return      - 0 if the SIB byte was successfully read; nonzero otherwise.
+ */
+static int readSIB(struct InternalInstruction* insn) {
+  SIBIndex sibIndexBase = 0;
+  SIBBase sibBaseBase = 0;
+  uint8_t index, base;
+
+  dbgprintf(insn, "readSIB()");
+
+  if (insn->consumedSIB)
+    return 0;
+
+  insn->consumedSIB = TRUE;
+
+  switch (insn->addressSize) {
+  case 2:
+    dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
+    return -1;
+    break;
+  case 4:
+    sibIndexBase = SIB_INDEX_EAX;
+    sibBaseBase = SIB_BASE_EAX;
+    break;
+  case 8:
+    sibIndexBase = SIB_INDEX_RAX;
+    sibBaseBase = SIB_BASE_RAX;
+    break;
+  }
+
+  if (consumeByte(insn, &insn->sib))
+    return -1;
+
+  index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+
+  switch (index) {
+  case 0x4:
+    insn->sibIndex = SIB_INDEX_NONE;
+    break;
+  default:
+    insn->sibIndex = (SIBIndex)(sibIndexBase + index);
+    if (insn->sibIndex == SIB_INDEX_sib ||
+        insn->sibIndex == SIB_INDEX_sib64)
+      insn->sibIndex = SIB_INDEX_NONE;
+    break;
+  }
+
+  switch (scaleFromSIB(insn->sib)) {
+  case 0:
+    insn->sibScale = 1;
+    break;
+  case 1:
+    insn->sibScale = 2;
+    break;
+  case 2:
+    insn->sibScale = 4;
+    break;
+  case 3:
+    insn->sibScale = 8;
+    break;
+  }
+
+  base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
+
+  switch (base) {
+  case 0x5:
+    switch (modFromModRM(insn->modRM)) {
+    case 0x0:
+      insn->eaDisplacement = EA_DISP_32;
+      insn->sibBase = SIB_BASE_NONE;
+      break;
+    case 0x1:
+      insn->eaDisplacement = EA_DISP_8;
+      insn->sibBase = (insn->addressSize == 4 ?
+                       SIB_BASE_EBP : SIB_BASE_RBP);
+      break;
+    case 0x2:
+      insn->eaDisplacement = EA_DISP_32;
+      insn->sibBase = (insn->addressSize == 4 ?
+                       SIB_BASE_EBP : SIB_BASE_RBP);
+      break;
+    case 0x3:
+      debug("Cannot have Mod = 0b11 and a SIB byte");
+      return -1;
+    }
+    break;
+  default:
+    insn->sibBase = (SIBBase)(sibBaseBase + base);
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readDisplacement - Consumes the displacement of an instruction.
+ *
+ * @param insn  - The instruction whose displacement is to be read.
+ * @return      - 0 if the displacement byte was successfully read; nonzero
+ *                otherwise.
+ */
+static int readDisplacement(struct InternalInstruction* insn) {
+  int8_t d8;
+  int16_t d16;
+  int32_t d32;
+
+  dbgprintf(insn, "readDisplacement()");
+
+  if (insn->consumedDisplacement)
+    return 0;
+
+  insn->consumedDisplacement = TRUE;
+  insn->displacementOffset = insn->readerCursor - insn->startLocation;
+
+  switch (insn->eaDisplacement) {
+  case EA_DISP_NONE:
+    insn->consumedDisplacement = FALSE;
+    break;
+  case EA_DISP_8:
+    if (consumeInt8(insn, &d8))
+      return -1;
+    insn->displacement = d8;
+    break;
+  case EA_DISP_16:
+    if (consumeInt16(insn, &d16))
+      return -1;
+    insn->displacement = d16;
+    break;
+  case EA_DISP_32:
+    if (consumeInt32(insn, &d32))
+      return -1;
+    insn->displacement = d32;
+    break;
+  }
+
+  insn->consumedDisplacement = TRUE;
+  return 0;
+}
+
+/*
+ * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
+ *   displacement) for an instruction and interprets it.
+ *
+ * @param insn  - The instruction whose addressing information is to be read.
+ * @return      - 0 if the information was successfully read; nonzero otherwise.
+ */
+static int readModRM(struct InternalInstruction* insn) {
+  uint8_t mod, rm, reg;
+
+  dbgprintf(insn, "readModRM()");
+
+  if (insn->consumedModRM)
+    return 0;
+
+  if (consumeByte(insn, &insn->modRM))
+    return -1;
+  insn->consumedModRM = TRUE;
+
+  mod     = modFromModRM(insn->modRM);
+  rm      = rmFromModRM(insn->modRM);
+  reg     = regFromModRM(insn->modRM);
+
+  /*
+   * This goes by insn->registerSize to pick the correct register, which messes
+   * up if we're using (say) XMM or 8-bit register operands.  That gets fixed in
+   * fixupReg().
+   */
+  switch (insn->registerSize) {
+  case 2:
+    insn->regBase = MODRM_REG_AX;
+    insn->eaRegBase = EA_REG_AX;
+    break;
+  case 4:
+    insn->regBase = MODRM_REG_EAX;
+    insn->eaRegBase = EA_REG_EAX;
+    break;
+  case 8:
+    insn->regBase = MODRM_REG_RAX;
+    insn->eaRegBase = EA_REG_RAX;
+    break;
+  }
+
+  reg |= rFromREX(insn->rexPrefix) << 3;
+  rm  |= bFromREX(insn->rexPrefix) << 3;
+
+  insn->reg = (Reg)(insn->regBase + reg);
+
+  switch (insn->addressSize) {
+  case 2:
+    insn->eaBaseBase = EA_BASE_BX_SI;
+
+    switch (mod) {
+    case 0x0:
+      if (rm == 0x6) {
+        insn->eaBase = EA_BASE_NONE;
+        insn->eaDisplacement = EA_DISP_16;
+        if (readDisplacement(insn))
+          return -1;
+      } else {
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        insn->eaDisplacement = EA_DISP_NONE;
+      }
+      break;
+    case 0x1:
+      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+      insn->eaDisplacement = EA_DISP_8;
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    case 0x2:
+      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+      insn->eaDisplacement = EA_DISP_16;
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    case 0x3:
+      insn->eaBase = (EABase)(insn->eaRegBase + rm);
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    }
+    break;
+  case 4:
+  case 8:
+    insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+
+    switch (mod) {
+    case 0x0:
+      insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
+      switch (rm) {
+      case 0x4:
+      case 0xc:   /* in case REXW.b is set */
+        insn->eaBase = (insn->addressSize == 4 ?
+                        EA_BASE_sib : EA_BASE_sib64);
+        readSIB(insn);
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      case 0x5:
+        insn->eaBase = EA_BASE_NONE;
+        insn->eaDisplacement = EA_DISP_32;
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      default:
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        break;
+      }
+      break;
+    case 0x1:
+    case 0x2:
+      insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+      switch (rm) {
+      case 0x4:
+      case 0xc:   /* in case REXW.b is set */
+        insn->eaBase = EA_BASE_sib;
+        readSIB(insn);
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      default:
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      }
+      break;
+    case 0x3:
+      insn->eaDisplacement = EA_DISP_NONE;
+      insn->eaBase = (EABase)(insn->eaRegBase + rm);
+      break;
+    }
+    break;
+  } /* switch (insn->addressSize) */
+
+  return 0;
+}
+
+#define GENERIC_FIXUP_FUNC(name, base, prefix)            \
+  static uint8_t name(struct InternalInstruction *insn,   \
+                      OperandType type,                   \
+                      uint8_t index,                      \
+                      uint8_t *valid) {                   \
+    *valid = 1;                                           \
+    switch (type) {                                       \
+    default:                                              \
+      debug("Unhandled register type");                   \
+      *valid = 0;                                         \
+      return 0;                                           \
+    case TYPE_Rv:                                         \
+      return base + index;                                \
+    case TYPE_R8:                                         \
+      if (insn->rexPrefix &&                              \
+         index >= 4 && index <= 7) {                      \
+        return prefix##_SPL + (index - 4);                \
+      } else {                                            \
+        return prefix##_AL + index;                       \
+      }                                                   \
+    case TYPE_R16:                                        \
+      return prefix##_AX + index;                         \
+    case TYPE_R32:                                        \
+      return prefix##_EAX + index;                        \
+    case TYPE_R64:                                        \
+      return prefix##_RAX + index;                        \
+    case TYPE_XMM256:                                     \
+      return prefix##_YMM0 + index;                       \
+    case TYPE_XMM128:                                     \
+    case TYPE_XMM64:                                      \
+    case TYPE_XMM32:                                      \
+    case TYPE_XMM:                                        \
+      return prefix##_XMM0 + index;                       \
+    case TYPE_MM64:                                       \
+    case TYPE_MM32:                                       \
+    case TYPE_MM:                                         \
+      if (index > 7)                                      \
+        *valid = 0;                                       \
+      return prefix##_MM0 + index;                        \
+    case TYPE_SEGMENTREG:                                 \
+      if (index > 5)                                      \
+        *valid = 0;                                       \
+      return prefix##_ES + index;                         \
+    case TYPE_DEBUGREG:                                   \
+      if (index > 7)                                      \
+        *valid = 0;                                       \
+      return prefix##_DR0 + index;                        \
+    case TYPE_CONTROLREG:                                 \
+      if (index > 8)                                      \
+        *valid = 0;                                       \
+      return prefix##_CR0 + index;                        \
+    }                                                     \
+  }
+
+/*
+ * fixup*Value - Consults an operand type to determine the meaning of the
+ *   reg or R/M field.  If the operand is an XMM operand, for example, an
+ *   operand would be XMM0 instead of AX, which readModRM() would otherwise
+ *   misinterpret it as.
+ *
+ * @param insn  - The instruction containing the operand.
+ * @param type  - The operand type.
+ * @param index - The existing value of the field as reported by readModRM().
+ * @param valid - The address of a uint8_t.  The target is set to 1 if the
+ *                field is valid for the register class; 0 if not.
+ * @return      - The proper value.
+ */
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase,    MODRM_REG)
+GENERIC_FIXUP_FUNC(fixupRMValue,  insn->eaRegBase,  EA_REG)
+
+/*
+ * fixupReg - Consults an operand specifier to determine which of the
+ *   fixup*Value functions to use in correcting readModRM()'ss interpretation.
+ *
+ * @param insn  - See fixup*Value().
+ * @param op    - The operand specifier.
+ * @return      - 0 if fixup was successful; -1 if the register returned was
+ *                invalid for its class.
+ */
+static int fixupReg(struct InternalInstruction *insn,
+                    const struct OperandSpecifier *op) {
+  uint8_t valid;
+
+  dbgprintf(insn, "fixupReg()");
+
+  switch ((OperandEncoding)op->encoding) {
+  default:
+    debug("Expected a REG or R/M encoding in fixupReg");
+    return -1;
+  case ENCODING_VVVV:
+    insn->vvvv = (Reg)fixupRegValue(insn,
+                                    (OperandType)op->type,
+                                    insn->vvvv,
+                                    &valid);
+    if (!valid)
+      return -1;
+    break;
+  case ENCODING_REG:
+    insn->reg = (Reg)fixupRegValue(insn,
+                                   (OperandType)op->type,
+                                   insn->reg - insn->regBase,
+                                   &valid);
+    if (!valid)
+      return -1;
+    break;
+  case ENCODING_RM:
+    if (insn->eaBase >= insn->eaRegBase) {
+      insn->eaBase = (EABase)fixupRMValue(insn,
+                                          (OperandType)op->type,
+                                          insn->eaBase - insn->eaRegBase,
+                                          &valid);
+      if (!valid)
+        return -1;
+    }
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readOpcodeModifier - Reads an operand from the opcode field of an
+ *   instruction.  Handles AddRegFrm instructions.
+ *
+ * @param insn    - The instruction whose opcode field is to be read.
+ * @param inModRM - Indicates that the opcode field is to be read from the
+ *                  ModR/M extension; useful for escape opcodes
+ * @return        - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeModifier(struct InternalInstruction* insn) {
+  dbgprintf(insn, "readOpcodeModifier()");
+
+  if (insn->consumedOpcodeModifier)
+    return 0;
+
+  insn->consumedOpcodeModifier = TRUE;
+
+  switch (insn->spec->modifierType) {
+  default:
+    debug("Unknown modifier type.");
+    return -1;
+  case MODIFIER_NONE:
+    debug("No modifier but an operand expects one.");
+    return -1;
+  case MODIFIER_OPCODE:
+    insn->opcodeModifier = insn->opcode - insn->spec->modifierBase;
+    return 0;
+  case MODIFIER_MODRM:
+    insn->opcodeModifier = insn->modRM - insn->spec->modifierBase;
+    return 0;
+  }
+}
+
+/*
+ * readOpcodeRegister - Reads an operand from the opcode field of an
+ *   instruction and interprets it appropriately given the operand width.
+ *   Handles AddRegFrm instructions.
+ *
+ * @param insn  - See readOpcodeModifier().
+ * @param size  - The width (in bytes) of the register being specified.
+ *                1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+ *                RAX.
+ * @return      - 0 on success; nonzero otherwise.
+ */
+static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
+  dbgprintf(insn, "readOpcodeRegister()");
+
+  if (readOpcodeModifier(insn))
+    return -1;
+
+  if (size == 0)
+    size = insn->registerSize;
+
+  switch (size) {
+  case 1:
+    insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
+                                                  | insn->opcodeModifier));
+    if (insn->rexPrefix &&
+        insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+        insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+      insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+                                   + (insn->opcodeRegister - MODRM_REG_AL - 4));
+    }
+
+    break;
+  case 2:
+    insn->opcodeRegister = (Reg)(MODRM_REG_AX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | insn->opcodeModifier));
+    break;
+  case 4:
+    insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | insn->opcodeModifier));
+    break;
+  case 8:
+    insn->opcodeRegister = (Reg)(MODRM_REG_RAX
+                                 + ((bFromREX(insn->rexPrefix) << 3)
+                                    | insn->opcodeModifier));
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * readImmediate - Consumes an immediate operand from an instruction, given the
+ *   desired operand size.
+ *
+ * @param insn  - The instruction whose operand is to be read.
+ * @param size  - The width (in bytes) of the operand.
+ * @return      - 0 if the immediate was successfully consumed; nonzero
+ *                otherwise.
+ */
+static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
+  uint8_t imm8;
+  uint16_t imm16;
+  uint32_t imm32;
+  uint64_t imm64;
+
+  dbgprintf(insn, "readImmediate()");
+
+  if (insn->numImmediatesConsumed == 2) {
+    debug("Already consumed two immediates");
+    return -1;
+  }
+
+  if (size == 0)
+    size = insn->immediateSize;
+  else
+    insn->immediateSize = size;
+  insn->immediateOffset = insn->readerCursor - insn->startLocation;
+
+  switch (size) {
+  case 1:
+    if (consumeByte(insn, &imm8))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm8;
+    break;
+  case 2:
+    if (consumeUInt16(insn, &imm16))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm16;
+    break;
+  case 4:
+    if (consumeUInt32(insn, &imm32))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm32;
+    break;
+  case 8:
+    if (consumeUInt64(insn, &imm64))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm64;
+    break;
+  }
+
+  insn->numImmediatesConsumed++;
+
+  return 0;
+}
+
+/*
+ * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix.
+ *
+ * @param insn  - The instruction whose operand is to be read.
+ * @return      - 0 if the vvvv was successfully consumed; nonzero
+ *                otherwise.
+ */
+static int readVVVV(struct InternalInstruction* insn) {
+  dbgprintf(insn, "readVVVV()");
+
+  if (insn->vexSize == 3)
+    insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]);
+  else if (insn->vexSize == 2)
+    insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]);
+  else
+    return -1;
+
+  if (insn->mode != MODE_64BIT)
+    insn->vvvv &= 0x7;
+
+  return 0;
+}
+
+/*
+ * readOperands - Consults the specifier for an instruction and consumes all
+ *   operands for that instruction, interpreting them as it goes.
+ *
+ * @param insn  - The instruction whose operands are to be read and interpreted.
+ * @return      - 0 if all operands could be read; nonzero otherwise.
+ */
+static int readOperands(struct InternalInstruction* insn) {
+  int index;
+  int hasVVVV, needVVVV;
+  int sawRegImm = 0;
+
+  dbgprintf(insn, "readOperands()");
+
+  /* If non-zero vvvv specified, need to make sure one of the operands
+     uses it. */
+  hasVVVV = !readVVVV(insn);
+  needVVVV = hasVVVV && (insn->vvvv != 0);
+
+  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
+    switch (x86OperandSets[insn->spec->operands][index].encoding) {
+    case ENCODING_NONE:
+      break;
+    case ENCODING_REG:
+    case ENCODING_RM:
+      if (readModRM(insn))
+        return -1;
+      if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+        return -1;
+      break;
+    case ENCODING_CB:
+    case ENCODING_CW:
+    case ENCODING_CD:
+    case ENCODING_CP:
+    case ENCODING_CO:
+    case ENCODING_CT:
+      dbgprintf(insn, "We currently don't hande code-offset encodings");
+      return -1;
+    case ENCODING_IB:
+      if (sawRegImm) {
+        /* Saw a register immediate so don't read again and instead split the
+           previous immediate.  FIXME: This is a hack. */
+        insn->immediates[insn->numImmediatesConsumed] =
+          insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
+        ++insn->numImmediatesConsumed;
+        break;
+      }
+      if (readImmediate(insn, 1))
+        return -1;
+      if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM3 &&
+          insn->immediates[insn->numImmediatesConsumed - 1] > 7)
+        return -1;
+      if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM5 &&
+          insn->immediates[insn->numImmediatesConsumed - 1] > 31)
+        return -1;
+      if (x86OperandSets[insn->spec->operands][index].type == TYPE_XMM128 ||
+          x86OperandSets[insn->spec->operands][index].type == TYPE_XMM256)
+        sawRegImm = 1;
+      break;
+    case ENCODING_IW:
+      if (readImmediate(insn, 2))
+        return -1;
+      break;
+    case ENCODING_ID:
+      if (readImmediate(insn, 4))
+        return -1;
+      break;
+    case ENCODING_IO:
+      if (readImmediate(insn, 8))
+        return -1;
+      break;
+    case ENCODING_Iv:
+      if (readImmediate(insn, insn->immediateSize))
+        return -1;
+      break;
+    case ENCODING_Ia:
+      if (readImmediate(insn, insn->addressSize))
+        return -1;
+      break;
+    case ENCODING_RB:
+      if (readOpcodeRegister(insn, 1))
+        return -1;
+      break;
+    case ENCODING_RW:
+      if (readOpcodeRegister(insn, 2))
+        return -1;
+      break;
+    case ENCODING_RD:
+      if (readOpcodeRegister(insn, 4))
+        return -1;
+      break;
+    case ENCODING_RO:
+      if (readOpcodeRegister(insn, 8))
+        return -1;
+      break;
+    case ENCODING_Rv:
+      if (readOpcodeRegister(insn, 0))
+        return -1;
+      break;
+    case ENCODING_I:
+      if (readOpcodeModifier(insn))
+        return -1;
+      break;
+    case ENCODING_VVVV:
+      needVVVV = 0; /* Mark that we have found a VVVV operand. */
+      if (!hasVVVV)
+        return -1;
+      if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+        return -1;
+      break;
+    case ENCODING_DUP:
+      break;
+    default:
+      dbgprintf(insn, "Encountered an operand with an unknown encoding.");
+      return -1;
+    }
+  }
+
+  /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */
+  if (needVVVV) return -1;
+
+  return 0;
+}
+
+/*
+ * decodeInstruction - Reads and interprets a full instruction provided by the
+ *   user.
+ *
+ * @param insn      - A pointer to the instruction to be populated.  Must be
+ *                    pre-allocated.
+ * @param reader    - The function to be used to read the instruction's bytes.
+ * @param readerArg - A generic argument to be passed to the reader to store
+ *                    any internal state.
+ * @param logger    - If non-NULL, the function to be used to write log messages
+ *                    and warnings.
+ * @param loggerArg - A generic argument to be passed to the logger to store
+ *                    any internal state.
+ * @param startLoc  - The address (in the reader's address space) of the first
+ *                    byte in the instruction.
+ * @param mode      - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
+ *                    decode the instruction in.
+ * @return          - 0 if the instruction's memory could be read; nonzero if
+ *                    not.
+ */
+int decodeInstruction(struct InternalInstruction* insn,
+                      byteReader_t reader,
+                      const void* readerArg,
+                      dlog_t logger,
+                      void* loggerArg,
+                      const void* miiArg,
+                      uint64_t startLoc,
+                      DisassemblerMode mode) {
+  memset(insn, 0, sizeof(struct InternalInstruction));
+
+  insn->reader = reader;
+  insn->readerArg = readerArg;
+  insn->dlog = logger;
+  insn->dlogArg = loggerArg;
+  insn->startLocation = startLoc;
+  insn->readerCursor = startLoc;
+  insn->mode = mode;
+  insn->numImmediatesConsumed = 0;
+
+  if (readPrefixes(insn)       ||
+      readOpcode(insn)         ||
+      getID(insn, miiArg)      ||
+      insn->instructionID == 0 ||
+      readOperands(insn))
+    return -1;
+
+  insn->operands = &x86OperandSets[insn->spec->operands][0];
+
+  insn->length = insn->readerCursor - insn->startLocation;
+
+  dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
+            startLoc, insn->readerCursor, insn->length);
+
+  if (insn->length > 15)
+    dbgprintf(insn, "Instruction exceeds 15-byte limit");
+
+  return 0;
+}
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
new file mode 100644
index 000000000000..407ead3cafa9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -0,0 +1,588 @@
+/*===-- X86DisassemblerDecoderInternal.h - Disassembler decoder ---*- C -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains the public interface of the instruction decoder.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#ifndef X86DISASSEMBLERDECODER_H
+#define X86DISASSEMBLERDECODER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INSTRUCTION_SPECIFIER_FIELDS \
+  uint16_t operands;
+
+#define INSTRUCTION_IDS     \
+  uint16_t instructionIDs;
+
+#include "X86DisassemblerDecoderCommon.h"
+
+#undef INSTRUCTION_SPECIFIER_FIELDS
+#undef INSTRUCTION_IDS
+
+/*
+ * Accessor functions for various fields of an Intel instruction
+ */
+#define modFromModRM(modRM)  (((modRM) & 0xc0) >> 6)
+#define regFromModRM(modRM)  (((modRM) & 0x38) >> 3)
+#define rmFromModRM(modRM)   ((modRM) & 0x7)
+#define scaleFromSIB(sib)    (((sib) & 0xc0) >> 6)
+#define indexFromSIB(sib)    (((sib) & 0x38) >> 3)
+#define baseFromSIB(sib)     ((sib) & 0x7)
+#define wFromREX(rex)        (((rex) & 0x8) >> 3)
+#define rFromREX(rex)        (((rex) & 0x4) >> 2)
+#define xFromREX(rex)        (((rex) & 0x2) >> 1)
+#define bFromREX(rex)        ((rex) & 0x1)
+
+#define rFromVEX2of3(vex)       (((~(vex)) & 0x80) >> 7)
+#define xFromVEX2of3(vex)       (((~(vex)) & 0x40) >> 6)
+#define bFromVEX2of3(vex)       (((~(vex)) & 0x20) >> 5)
+#define mmmmmFromVEX2of3(vex)   ((vex) & 0x1f)
+#define wFromVEX3of3(vex)       (((vex) & 0x80) >> 7)
+#define vvvvFromVEX3of3(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromVEX3of3(vex)       (((vex) & 0x4) >> 2)
+#define ppFromVEX3of3(vex)      ((vex) & 0x3)
+
+#define rFromVEX2of2(vex)       (((~(vex)) & 0x80) >> 7)
+#define vvvvFromVEX2of2(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromVEX2of2(vex)       (((vex) & 0x4) >> 2)
+#define ppFromVEX2of2(vex)      ((vex) & 0x3)
+
+/*
+ * These enums represent Intel registers for use by the decoder.
+ */
+
+#define REGS_8BIT     \
+  ENTRY(AL)           \
+  ENTRY(CL)           \
+  ENTRY(DL)           \
+  ENTRY(BL)           \
+  ENTRY(AH)           \
+  ENTRY(CH)           \
+  ENTRY(DH)           \
+  ENTRY(BH)           \
+  ENTRY(R8B)          \
+  ENTRY(R9B)          \
+  ENTRY(R10B)         \
+  ENTRY(R11B)         \
+  ENTRY(R12B)         \
+  ENTRY(R13B)         \
+  ENTRY(R14B)         \
+  ENTRY(R15B)         \
+  ENTRY(SPL)          \
+  ENTRY(BPL)          \
+  ENTRY(SIL)          \
+  ENTRY(DIL)
+
+#define EA_BASES_16BIT  \
+  ENTRY(BX_SI)          \
+  ENTRY(BX_DI)          \
+  ENTRY(BP_SI)          \
+  ENTRY(BP_DI)          \
+  ENTRY(SI)             \
+  ENTRY(DI)             \
+  ENTRY(BP)             \
+  ENTRY(BX)             \
+  ENTRY(R8W)            \
+  ENTRY(R9W)            \
+  ENTRY(R10W)           \
+  ENTRY(R11W)           \
+  ENTRY(R12W)           \
+  ENTRY(R13W)           \
+  ENTRY(R14W)           \
+  ENTRY(R15W)
+
+#define REGS_16BIT    \
+  ENTRY(AX)           \
+  ENTRY(CX)           \
+  ENTRY(DX)           \
+  ENTRY(BX)           \
+  ENTRY(SP)           \
+  ENTRY(BP)           \
+  ENTRY(SI)           \
+  ENTRY(DI)           \
+  ENTRY(R8W)          \
+  ENTRY(R9W)          \
+  ENTRY(R10W)         \
+  ENTRY(R11W)         \
+  ENTRY(R12W)         \
+  ENTRY(R13W)         \
+  ENTRY(R14W)         \
+  ENTRY(R15W)
+
+#define EA_BASES_32BIT  \
+  ENTRY(EAX)            \
+  ENTRY(ECX)            \
+  ENTRY(EDX)            \
+  ENTRY(EBX)            \
+  ENTRY(sib)            \
+  ENTRY(EBP)            \
+  ENTRY(ESI)            \
+  ENTRY(EDI)            \
+  ENTRY(R8D)            \
+  ENTRY(R9D)            \
+  ENTRY(R10D)           \
+  ENTRY(R11D)           \
+  ENTRY(R12D)           \
+  ENTRY(R13D)           \
+  ENTRY(R14D)           \
+  ENTRY(R15D)
+
+#define REGS_32BIT  \
+  ENTRY(EAX)        \
+  ENTRY(ECX)        \
+  ENTRY(EDX)        \
+  ENTRY(EBX)        \
+  ENTRY(ESP)        \
+  ENTRY(EBP)        \
+  ENTRY(ESI)        \
+  ENTRY(EDI)        \
+  ENTRY(R8D)        \
+  ENTRY(R9D)        \
+  ENTRY(R10D)       \
+  ENTRY(R11D)       \
+  ENTRY(R12D)       \
+  ENTRY(R13D)       \
+  ENTRY(R14D)       \
+  ENTRY(R15D)
+
+#define EA_BASES_64BIT  \
+  ENTRY(RAX)            \
+  ENTRY(RCX)            \
+  ENTRY(RDX)            \
+  ENTRY(RBX)            \
+  ENTRY(sib64)          \
+  ENTRY(RBP)            \
+  ENTRY(RSI)            \
+  ENTRY(RDI)            \
+  ENTRY(R8)             \
+  ENTRY(R9)             \
+  ENTRY(R10)            \
+  ENTRY(R11)            \
+  ENTRY(R12)            \
+  ENTRY(R13)            \
+  ENTRY(R14)            \
+  ENTRY(R15)
+
+#define REGS_64BIT  \
+  ENTRY(RAX)        \
+  ENTRY(RCX)        \
+  ENTRY(RDX)        \
+  ENTRY(RBX)        \
+  ENTRY(RSP)        \
+  ENTRY(RBP)        \
+  ENTRY(RSI)        \
+  ENTRY(RDI)        \
+  ENTRY(R8)         \
+  ENTRY(R9)         \
+  ENTRY(R10)        \
+  ENTRY(R11)        \
+  ENTRY(R12)        \
+  ENTRY(R13)        \
+  ENTRY(R14)        \
+  ENTRY(R15)
+
+#define REGS_MMX  \
+  ENTRY(MM0)      \
+  ENTRY(MM1)      \
+  ENTRY(MM2)      \
+  ENTRY(MM3)      \
+  ENTRY(MM4)      \
+  ENTRY(MM5)      \
+  ENTRY(MM6)      \
+  ENTRY(MM7)
+
+#define REGS_XMM  \
+  ENTRY(XMM0)     \
+  ENTRY(XMM1)     \
+  ENTRY(XMM2)     \
+  ENTRY(XMM3)     \
+  ENTRY(XMM4)     \
+  ENTRY(XMM5)     \
+  ENTRY(XMM6)     \
+  ENTRY(XMM7)     \
+  ENTRY(XMM8)     \
+  ENTRY(XMM9)     \
+  ENTRY(XMM10)    \
+  ENTRY(XMM11)    \
+  ENTRY(XMM12)    \
+  ENTRY(XMM13)    \
+  ENTRY(XMM14)    \
+  ENTRY(XMM15)
+
+#define REGS_YMM  \
+  ENTRY(YMM0)     \
+  ENTRY(YMM1)     \
+  ENTRY(YMM2)     \
+  ENTRY(YMM3)     \
+  ENTRY(YMM4)     \
+  ENTRY(YMM5)     \
+  ENTRY(YMM6)     \
+  ENTRY(YMM7)     \
+  ENTRY(YMM8)     \
+  ENTRY(YMM9)     \
+  ENTRY(YMM10)    \
+  ENTRY(YMM11)    \
+  ENTRY(YMM12)    \
+  ENTRY(YMM13)    \
+  ENTRY(YMM14)    \
+  ENTRY(YMM15)
+
+#define REGS_SEGMENT \
+  ENTRY(ES)          \
+  ENTRY(CS)          \
+  ENTRY(SS)          \
+  ENTRY(DS)          \
+  ENTRY(FS)          \
+  ENTRY(GS)
+
+#define REGS_DEBUG  \
+  ENTRY(DR0)        \
+  ENTRY(DR1)        \
+  ENTRY(DR2)        \
+  ENTRY(DR3)        \
+  ENTRY(DR4)        \
+  ENTRY(DR5)        \
+  ENTRY(DR6)        \
+  ENTRY(DR7)
+
+#define REGS_CONTROL  \
+  ENTRY(CR0)          \
+  ENTRY(CR1)          \
+  ENTRY(CR2)          \
+  ENTRY(CR3)          \
+  ENTRY(CR4)          \
+  ENTRY(CR5)          \
+  ENTRY(CR6)          \
+  ENTRY(CR7)          \
+  ENTRY(CR8)
+
+#define ALL_EA_BASES  \
+  EA_BASES_16BIT      \
+  EA_BASES_32BIT      \
+  EA_BASES_64BIT
+
+#define ALL_SIB_BASES \
+  REGS_32BIT          \
+  REGS_64BIT
+
+#define ALL_REGS      \
+  REGS_8BIT           \
+  REGS_16BIT          \
+  REGS_32BIT          \
+  REGS_64BIT          \
+  REGS_MMX            \
+  REGS_XMM            \
+  REGS_YMM            \
+  REGS_SEGMENT        \
+  REGS_DEBUG          \
+  REGS_CONTROL        \
+  ENTRY(RIP)
+
+/*
+ * EABase - All possible values of the base field for effective-address
+ *   computations, a.k.a. the Mod and R/M fields of the ModR/M byte.  We
+ *   distinguish between bases (EA_BASE_*) and registers that just happen to be
+ *   referred to when Mod == 0b11 (EA_REG_*).
+ */
+typedef enum {
+  EA_BASE_NONE,
+#define ENTRY(x) EA_BASE_##x,
+  ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) EA_REG_##x,
+  ALL_REGS
+#undef ENTRY
+  EA_max
+} EABase;
+
+/*
+ * SIBIndex - All possible values of the SIB index field.
+ *   Borrows entries from ALL_EA_BASES with the special case that
+ *   sib is synonymous with NONE.
+ * Vector SIB: index can be XMM or YMM.
+ */
+typedef enum {
+  SIB_INDEX_NONE,
+#define ENTRY(x) SIB_INDEX_##x,
+  ALL_EA_BASES
+  REGS_XMM
+  REGS_YMM
+#undef ENTRY
+  SIB_INDEX_max
+} SIBIndex;
+
+/*
+ * SIBBase - All possible values of the SIB base field.
+ */
+typedef enum {
+  SIB_BASE_NONE,
+#define ENTRY(x) SIB_BASE_##x,
+  ALL_SIB_BASES
+#undef ENTRY
+  SIB_BASE_max
+} SIBBase;
+
+/*
+ * EADisplacement - Possible displacement types for effective-address
+ *   computations.
+ */
+typedef enum {
+  EA_DISP_NONE,
+  EA_DISP_8,
+  EA_DISP_16,
+  EA_DISP_32
+} EADisplacement;
+
+/*
+ * Reg - All possible values of the reg field in the ModR/M byte.
+ */
+typedef enum {
+#define ENTRY(x) MODRM_REG_##x,
+  ALL_REGS
+#undef ENTRY
+  MODRM_REG_max
+} Reg;
+
+/*
+ * SegmentOverride - All possible segment overrides.
+ */
+typedef enum {
+  SEG_OVERRIDE_NONE,
+  SEG_OVERRIDE_CS,
+  SEG_OVERRIDE_SS,
+  SEG_OVERRIDE_DS,
+  SEG_OVERRIDE_ES,
+  SEG_OVERRIDE_FS,
+  SEG_OVERRIDE_GS,
+  SEG_OVERRIDE_max
+} SegmentOverride;
+
+/*
+ * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field
+ */
+
+typedef enum {
+  VEX_LOB_0F = 0x1,
+  VEX_LOB_0F38 = 0x2,
+  VEX_LOB_0F3A = 0x3
+} VEXLeadingOpcodeByte;
+
+/*
+ * VEXPrefixCode - Possible values for the VEX.pp field
+ */
+
+typedef enum {
+  VEX_PREFIX_NONE = 0x0,
+  VEX_PREFIX_66 = 0x1,
+  VEX_PREFIX_F3 = 0x2,
+  VEX_PREFIX_F2 = 0x3
+} VEXPrefixCode;
+
+typedef uint8_t BOOL;
+
+/*
+ * byteReader_t - Type for the byte reader that the consumer must provide to
+ *   the decoder.  Reads a single byte from the instruction's address space.
+ * @param arg     - A baton that the consumer can associate with any internal
+ *                  state that it needs.
+ * @param byte    - A pointer to a single byte in memory that should be set to
+ *                  contain the value at address.
+ * @param address - The address in the instruction's address space that should
+ *                  be read from.
+ * @return        - -1 if the byte cannot be read for any reason; 0 otherwise.
+ */
+typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address);
+
+/*
+ * dlog_t - Type for the logging function that the consumer can provide to
+ *   get debugging output from the decoder.
+ * @param arg     - A baton that the consumer can associate with any internal
+ *                  state that it needs.
+ * @param log     - A string that contains the message.  Will be reused after
+ *                  the logger returns.
+ */
+typedef void (*dlog_t)(void* arg, const char *log);
+
+/*
+ * The x86 internal instruction, which is produced by the decoder.
+ */
+struct InternalInstruction {
+  /* Reader interface (C) */
+  byteReader_t reader;
+  /* Opaque value passed to the reader */
+  const void* readerArg;
+  /* The address of the next byte to read via the reader */
+  uint64_t readerCursor;
+
+  /* Logger interface (C) */
+  dlog_t dlog;
+  /* Opaque value passed to the logger */
+  void* dlogArg;
+
+  /* General instruction information */
+
+  /* The mode to disassemble for (64-bit, protected, real) */
+  DisassemblerMode mode;
+  /* The start of the instruction, usable with the reader */
+  uint64_t startLocation;
+  /* The length of the instruction, in bytes */
+  size_t length;
+
+  /* Prefix state */
+
+  /* 1 if the prefix byte corresponding to the entry is present; 0 if not */
+  uint8_t prefixPresent[0x100];
+  /* contains the location (for use with the reader) of the prefix byte */
+  uint64_t prefixLocations[0x100];
+  /* The value of the VEX prefix, if present */
+  uint8_t vexPrefix[3];
+  /* The length of the VEX prefix (0 if not present) */
+  uint8_t vexSize;
+  /* The value of the REX prefix, if present */
+  uint8_t rexPrefix;
+  /* The location where a mandatory prefix would have to be (i.e., right before
+     the opcode, or right before the REX prefix if one is present) */
+  uint64_t necessaryPrefixLocation;
+  /* The segment override type */
+  SegmentOverride segmentOverride;
+
+  /* Sizes of various critical pieces of data, in bytes */
+  uint8_t registerSize;
+  uint8_t addressSize;
+  uint8_t displacementSize;
+  uint8_t immediateSize;
+
+  /* Offsets from the start of the instruction to the pieces of data, which is
+     needed to find relocation entries for adding symbolic operands */
+  uint8_t displacementOffset;
+  uint8_t immediateOffset;
+
+  /* opcode state */
+
+  /* The value of the two-byte escape prefix (usually 0x0f) */
+  uint8_t twoByteEscape;
+  /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */
+  uint8_t threeByteEscape;
+  /* The last byte of the opcode, not counting any ModR/M extension */
+  uint8_t opcode;
+  /* The ModR/M byte of the instruction, if it is an opcode extension */
+  uint8_t modRMExtension;
+
+  /* decode state */
+
+  /* The type of opcode, used for indexing into the array of decode tables */
+  OpcodeType opcodeType;
+  /* The instruction ID, extracted from the decode table */
+  uint16_t instructionID;
+  /* The specifier for the instruction, from the instruction info table */
+  const struct InstructionSpecifier *spec;
+
+  /* state for additional bytes, consumed during operand decode.  Pattern:
+     consumed___ indicates that the byte was already consumed and does not
+     need to be consumed again */
+
+  /* The VEX.vvvv field, which contains a third register operand for some AVX
+     instructions */
+  Reg                           vvvv;
+
+  /* The ModR/M byte, which contains most register operands and some portion of
+     all memory operands */
+  BOOL                          consumedModRM;
+  uint8_t                       modRM;
+
+  /* The SIB byte, used for more complex 32- or 64-bit memory operands */
+  BOOL                          consumedSIB;
+  uint8_t                       sib;
+
+  /* The displacement, used for memory operands */
+  BOOL                          consumedDisplacement;
+  int32_t                       displacement;
+
+  /* Immediates.  There can be two in some cases */
+  uint8_t                       numImmediatesConsumed;
+  uint8_t                       numImmediatesTranslated;
+  uint64_t                      immediates[2];
+
+  /* A register or immediate operand encoded into the opcode */
+  BOOL                          consumedOpcodeModifier;
+  uint8_t                       opcodeModifier;
+  Reg                           opcodeRegister;
+
+  /* Portions of the ModR/M byte */
+
+  /* These fields determine the allowable values for the ModR/M fields, which
+     depend on operand and address widths */
+  EABase                        eaBaseBase;
+  EABase                        eaRegBase;
+  Reg                           regBase;
+
+  /* The Mod and R/M fields can encode a base for an effective address, or a
+     register.  These are separated into two fields here */
+  EABase                        eaBase;
+  EADisplacement                eaDisplacement;
+  /* The reg field always encodes a register */
+  Reg                           reg;
+
+  /* SIB state */
+  SIBIndex                      sibIndex;
+  uint8_t                       sibScale;
+  SIBBase                       sibBase;
+
+  const struct OperandSpecifier *operands;
+};
+
+/* decodeInstruction - Decode one instruction and store the decoding results in
+ *   a buffer provided by the consumer.
+ * @param insn      - The buffer to store the instruction in.  Allocated by the
+ *                    consumer.
+ * @param reader    - The byteReader_t for the bytes to be read.
+ * @param readerArg - An argument to pass to the reader for storing context
+ *                    specific to the consumer.  May be NULL.
+ * @param logger    - The dlog_t to be used in printing status messages from the
+ *                    disassembler.  May be NULL.
+ * @param loggerArg - An argument to pass to the logger for storing context
+ *                    specific to the logger.  May be NULL.
+ * @param startLoc  - The address (in the reader's address space) of the first
+ *                    byte in the instruction.
+ * @param mode      - The mode (16-bit, 32-bit, 64-bit) to decode in.
+ * @return          - Nonzero if there was an error during decode, 0 otherwise.
+ */
+int decodeInstruction(struct InternalInstruction* insn,
+                      byteReader_t reader,
+                      const void* readerArg,
+                      dlog_t logger,
+                      void* loggerArg,
+                      const void* miiArg,
+                      uint64_t startLoc,
+                      DisassemblerMode mode);
+
+/* x86DisassemblerDebug - C-accessible function for printing a message to
+ *   debugs()
+ * @param file  - The name of the file printing the debug message.
+ * @param line  - The line number that printed the debug message.
+ * @param s     - The message to print.
+ */
+
+void x86DisassemblerDebug(const char *file,
+                          unsigned line,
+                          const char *s);
+
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
new file mode 100644
index 000000000000..23dfe4b5b5f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -0,0 +1,398 @@
+/*===-- X86DisassemblerDecoderCommon.h - Disassembler decoder -----*- C -*-===*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains common definitions used by both the disassembler and the table
+ *  generator.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+/*
+ * This header file provides those definitions that need to be shared between
+ * the decoder and the table generator in a C-friendly manner.
+ */
+
+#ifndef X86DISASSEMBLERDECODERCOMMON_H
+#define X86DISASSEMBLERDECODERCOMMON_H
+
+#include "llvm/Support/DataTypes.h"
+
+#define INSTRUCTIONS_SYM  x86DisassemblerInstrSpecifiers
+#define CONTEXTS_SYM      x86DisassemblerContexts
+#define ONEBYTE_SYM       x86DisassemblerOneByteOpcodes
+#define TWOBYTE_SYM       x86DisassemblerTwoByteOpcodes
+#define THREEBYTE38_SYM   x86DisassemblerThreeByte38Opcodes
+#define THREEBYTE3A_SYM   x86DisassemblerThreeByte3AOpcodes
+#define THREEBYTEA6_SYM   x86DisassemblerThreeByteA6Opcodes
+#define THREEBYTEA7_SYM   x86DisassemblerThreeByteA7Opcodes
+
+#define INSTRUCTIONS_STR  "x86DisassemblerInstrSpecifiers"
+#define CONTEXTS_STR      "x86DisassemblerContexts"
+#define ONEBYTE_STR       "x86DisassemblerOneByteOpcodes"
+#define TWOBYTE_STR       "x86DisassemblerTwoByteOpcodes"
+#define THREEBYTE38_STR   "x86DisassemblerThreeByte38Opcodes"
+#define THREEBYTE3A_STR   "x86DisassemblerThreeByte3AOpcodes"
+#define THREEBYTEA6_STR   "x86DisassemblerThreeByteA6Opcodes"
+#define THREEBYTEA7_STR   "x86DisassemblerThreeByteA7Opcodes"
+
+/*
+ * Attributes of an instruction that must be known before the opcode can be
+ * processed correctly.  Most of these indicate the presence of particular
+ * prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
+ */
+#define ATTRIBUTE_BITS          \
+  ENUM_ENTRY(ATTR_NONE,   0x00) \
+  ENUM_ENTRY(ATTR_64BIT,  0x01) \
+  ENUM_ENTRY(ATTR_XS,     0x02) \
+  ENUM_ENTRY(ATTR_XD,     0x04) \
+  ENUM_ENTRY(ATTR_REXW,   0x08) \
+  ENUM_ENTRY(ATTR_OPSIZE, 0x10) \
+  ENUM_ENTRY(ATTR_ADSIZE, 0x20) \
+  ENUM_ENTRY(ATTR_VEX,    0x40) \
+  ENUM_ENTRY(ATTR_VEXL,   0x80)
+
+#define ENUM_ENTRY(n, v) n = v,
+enum attributeBits {
+  ATTRIBUTE_BITS
+  ATTR_max
+};
+#undef ENUM_ENTRY
+
+/*
+ * Combinations of the above attributes that are relevant to instruction
+ * decode.  Although other combinations are possible, they can be reduced to
+ * these without affecting the ultimately decoded instruction.
+ */
+
+/*           Class name           Rank  Rationale for rank assignment         */
+#define INSTRUCTION_CONTEXTS                                                   \
+  ENUM_ENTRY(IC,                    0,  "says nothing about the instruction")  \
+  ENUM_ENTRY(IC_64BIT,              1,  "says the instruction applies in "     \
+                                        "64-bit mode but no more")             \
+  ENUM_ENTRY(IC_OPSIZE,             3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_ADSIZE,             3,  "requires an ADSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_XD,                 2,  "may say something about the opcode "  \
+                                        "but not the operands")                \
+  ENUM_ENTRY(IC_XS,                 2,  "may say something about the opcode "  \
+                                        "but not the operands")                \
+  ENUM_ENTRY(IC_XD_OPSIZE,          3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_XS_OPSIZE,          3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_64BIT_REXW,         4,  "requires a REX.W prefix, so operands "\
+                                        "change width; overrides IC_OPSIZE")   \
+  ENUM_ENTRY(IC_64BIT_OPSIZE,       3,  "Just as meaningful as IC_OPSIZE")     \
+  ENUM_ENTRY(IC_64BIT_ADSIZE,       3,  "Just as meaningful as IC_ADSIZE")     \
+  ENUM_ENTRY(IC_64BIT_XD,           5,  "XD instructions are SSE; REX.W is "   \
+                                        "secondary")                           \
+  ENUM_ENTRY(IC_64BIT_XS,           5,  "Just as meaningful as IC_64BIT_XD")   \
+  ENUM_ENTRY(IC_64BIT_XD_OPSIZE,    3,  "Just as meaningful as IC_XD_OPSIZE")  \
+  ENUM_ENTRY(IC_64BIT_XS_OPSIZE,    3,  "Just as meaningful as IC_XS_OPSIZE")  \
+  ENUM_ENTRY(IC_64BIT_REXW_XS,      6,  "OPSIZE could mean a different "       \
+                                        "opcode")                              \
+  ENUM_ENTRY(IC_64BIT_REXW_XD,      6,  "Just as meaningful as "               \
+                                        "IC_64BIT_REXW_XS")                    \
+  ENUM_ENTRY(IC_64BIT_REXW_OPSIZE,  7,  "The Dynamic Duo!  Prefer over all "   \
+                                        "else because this changes most "      \
+                                        "operands' meaning")                   \
+  ENUM_ENTRY(IC_VEX,                1,  "requires a VEX prefix")               \
+  ENUM_ENTRY(IC_VEX_XS,             2,  "requires VEX and the XS prefix")      \
+  ENUM_ENTRY(IC_VEX_XD,             2,  "requires VEX and the XD prefix")      \
+  ENUM_ENTRY(IC_VEX_OPSIZE,         2,  "requires VEX and the OpSize prefix")  \
+  ENUM_ENTRY(IC_VEX_W,              3,  "requires VEX and the W prefix")       \
+  ENUM_ENTRY(IC_VEX_W_XS,           4,  "requires VEX, W, and XS prefix")      \
+  ENUM_ENTRY(IC_VEX_W_XD,           4,  "requires VEX, W, and XD prefix")      \
+  ENUM_ENTRY(IC_VEX_W_OPSIZE,       4,  "requires VEX, W, and OpSize")         \
+  ENUM_ENTRY(IC_VEX_L,              3,  "requires VEX and the L prefix")       \
+  ENUM_ENTRY(IC_VEX_L_XS,           4,  "requires VEX and the L and XS prefix")\
+  ENUM_ENTRY(IC_VEX_L_XD,           4,  "requires VEX and the L and XD prefix")\
+  ENUM_ENTRY(IC_VEX_L_OPSIZE,       4,  "requires VEX, L, and OpSize")         \
+  ENUM_ENTRY(IC_VEX_L_W_OPSIZE,     5,  "requires VEX, L, W and OpSize")
+
+
+#define ENUM_ENTRY(n, r, d) n,
+typedef enum {
+  INSTRUCTION_CONTEXTS
+  IC_max
+} InstructionContext;
+#undef ENUM_ENTRY
+
+/*
+ * Opcode types, which determine which decode table to use, both in the Intel
+ * manual and also for the decoder.
+ */
+typedef enum {
+  ONEBYTE       = 0,
+  TWOBYTE       = 1,
+  THREEBYTE_38  = 2,
+  THREEBYTE_3A  = 3,
+  THREEBYTE_A6  = 4,
+  THREEBYTE_A7  = 5
+} OpcodeType;
+
+/*
+ * The following structs are used for the hierarchical decode table.  After
+ * determining the instruction's class (i.e., which IC_* constant applies to
+ * it), the decoder reads the opcode.  Some instructions require specific
+ * values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+ *
+ * If a ModR/M byte is not required, "required" is left unset, and the values
+ * for each instructionID are identical.
+ */
+
+typedef uint16_t InstrUID;
+
+/*
+ * ModRMDecisionType - describes the type of ModR/M decision, allowing the
+ * consumer to determine the number of entries in it.
+ *
+ * MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+ *                  instruction is the same.
+ * MODRM_SPLITRM  - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+ *                  corresponds to one instruction; otherwise, it corresponds to
+ *                  a different instruction.
+ * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
+ *                  divided by 8 is used to select instruction; otherwise, each
+ *                  value of the ModR/M byte could correspond to a different
+ *                  instruction.
+ * MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
+                    corresponds to instructions that use reg field as opcode
+ * MODRM_FULL     - Potentially, each value of the ModR/M byte could correspond
+ *                  to a different instruction.
+ */
+
+#define MODRMTYPES            \
+  ENUM_ENTRY(MODRM_ONEENTRY)  \
+  ENUM_ENTRY(MODRM_SPLITRM)   \
+  ENUM_ENTRY(MODRM_SPLITMISC)  \
+  ENUM_ENTRY(MODRM_SPLITREG)  \
+  ENUM_ENTRY(MODRM_FULL)
+
+#define ENUM_ENTRY(n) n,
+typedef enum {
+  MODRMTYPES
+  MODRM_max
+} ModRMDecisionType;
+#undef ENUM_ENTRY
+
+/*
+ * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which
+ *  instruction each possible value of the ModR/M byte corresponds to.  Once
+ *  this information is known, we have narrowed down to a single instruction.
+ */
+struct ModRMDecision {
+  uint8_t     modrm_type;
+
+  /* The macro below must be defined wherever this file is included. */
+  INSTRUCTION_IDS
+};
+
+/*
+ * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at
+ *   given a particular opcode.
+ */
+struct OpcodeDecision {
+  struct ModRMDecision modRMDecisions[256];
+};
+
+/*
+ * ContextDecision - Specifies which opcode->instruction tables to look at given
+ *   a particular context (set of attributes).  Since there are many possible
+ *   contexts, the decoder first uses CONTEXTS_SYM to determine which context
+ *   applies given a specific set of attributes.  Hence there are only IC_max
+ *   entries in this table, rather than 2^(ATTR_max).
+ */
+struct ContextDecision {
+  struct OpcodeDecision opcodeDecisions[IC_max];
+};
+
+/*
+ * Physical encodings of instruction operands.
+ */
+
+#define ENCODINGS                                                              \
+  ENUM_ENTRY(ENCODING_NONE,   "")                                              \
+  ENUM_ENTRY(ENCODING_REG,    "Register operand in ModR/M byte.")              \
+  ENUM_ENTRY(ENCODING_RM,     "R/M operand in ModR/M byte.")                   \
+  ENUM_ENTRY(ENCODING_VVVV,   "Register operand in VEX.vvvv byte.")            \
+  ENUM_ENTRY(ENCODING_CB,     "1-byte code offset (possible new CS value)")    \
+  ENUM_ENTRY(ENCODING_CW,     "2-byte")                                        \
+  ENUM_ENTRY(ENCODING_CD,     "4-byte")                                        \
+  ENUM_ENTRY(ENCODING_CP,     "6-byte")                                        \
+  ENUM_ENTRY(ENCODING_CO,     "8-byte")                                        \
+  ENUM_ENTRY(ENCODING_CT,     "10-byte")                                       \
+  ENUM_ENTRY(ENCODING_IB,     "1-byte immediate")                              \
+  ENUM_ENTRY(ENCODING_IW,     "2-byte")                                        \
+  ENUM_ENTRY(ENCODING_ID,     "4-byte")                                        \
+  ENUM_ENTRY(ENCODING_IO,     "8-byte")                                        \
+  ENUM_ENTRY(ENCODING_RB,     "(AL..DIL, R8L..R15L) Register code added to "   \
+                              "the opcode byte")                               \
+  ENUM_ENTRY(ENCODING_RW,     "(AX..DI, R8W..R15W)")                           \
+  ENUM_ENTRY(ENCODING_RD,     "(EAX..EDI, R8D..R15D)")                         \
+  ENUM_ENTRY(ENCODING_RO,     "(RAX..RDI, R8..R15)")                           \
+  ENUM_ENTRY(ENCODING_I,      "Position on floating-point stack added to the " \
+                              "opcode byte")                                   \
+                                                                               \
+  ENUM_ENTRY(ENCODING_Iv,     "Immediate of operand size")                     \
+  ENUM_ENTRY(ENCODING_Ia,     "Immediate of address size")                     \
+  ENUM_ENTRY(ENCODING_Rv,     "Register code of operand size added to the "    \
+                              "opcode byte")                                   \
+  ENUM_ENTRY(ENCODING_DUP,    "Duplicate of another operand; ID is encoded "   \
+                              "in type")
+
+#define ENUM_ENTRY(n, d) n,
+  typedef enum {
+    ENCODINGS
+    ENCODING_max
+  } OperandEncoding;
+#undef ENUM_ENTRY
+
+/*
+ * Semantic interpretations of instruction operands.
+ */
+
+#define TYPES                                                                  \
+  ENUM_ENTRY(TYPE_NONE,       "")                                              \
+  ENUM_ENTRY(TYPE_REL8,       "1-byte immediate address")                      \
+  ENUM_ENTRY(TYPE_REL16,      "2-byte")                                        \
+  ENUM_ENTRY(TYPE_REL32,      "4-byte")                                        \
+  ENUM_ENTRY(TYPE_REL64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_PTR1616,    "2+2-byte segment+offset address")               \
+  ENUM_ENTRY(TYPE_PTR1632,    "2+4-byte")                                      \
+  ENUM_ENTRY(TYPE_PTR1664,    "2+8-byte")                                      \
+  ENUM_ENTRY(TYPE_R8,         "1-byte register operand")                       \
+  ENUM_ENTRY(TYPE_R16,        "2-byte")                                        \
+  ENUM_ENTRY(TYPE_R32,        "4-byte")                                        \
+  ENUM_ENTRY(TYPE_R64,        "8-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM8,       "1-byte immediate operand")                      \
+  ENUM_ENTRY(TYPE_IMM16,      "2-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM32,      "4-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM3,       "1-byte immediate operand between 0 and 7")      \
+  ENUM_ENTRY(TYPE_IMM5,       "1-byte immediate operand between 0 and 31")     \
+  ENUM_ENTRY(TYPE_RM8,        "1-byte register or memory operand")             \
+  ENUM_ENTRY(TYPE_RM16,       "2-byte")                                        \
+  ENUM_ENTRY(TYPE_RM32,       "4-byte")                                        \
+  ENUM_ENTRY(TYPE_RM64,       "8-byte")                                        \
+  ENUM_ENTRY(TYPE_M,          "Memory operand")                                \
+  ENUM_ENTRY(TYPE_M8,         "1-byte")                                        \
+  ENUM_ENTRY(TYPE_M16,        "2-byte")                                        \
+  ENUM_ENTRY(TYPE_M32,        "4-byte")                                        \
+  ENUM_ENTRY(TYPE_M64,        "8-byte")                                        \
+  ENUM_ENTRY(TYPE_LEA,        "Effective address")                             \
+  ENUM_ENTRY(TYPE_M128,       "16-byte (SSE/SSE2)")                            \
+  ENUM_ENTRY(TYPE_M256,       "256-byte (AVX)")                                \
+  ENUM_ENTRY(TYPE_M1616,      "2+2-byte segment+offset address")               \
+  ENUM_ENTRY(TYPE_M1632,      "2+4-byte")                                      \
+  ENUM_ENTRY(TYPE_M1664,      "2+8-byte")                                      \
+  ENUM_ENTRY(TYPE_M16_32,     "2+4-byte two-part memory operand (LIDT, LGDT)") \
+  ENUM_ENTRY(TYPE_M16_16,     "2+2-byte (BOUND)")                              \
+  ENUM_ENTRY(TYPE_M32_32,     "4+4-byte (BOUND)")                              \
+  ENUM_ENTRY(TYPE_M16_64,     "2+8-byte (LIDT, LGDT)")                         \
+  ENUM_ENTRY(TYPE_MOFFS8,     "1-byte memory offset (relative to segment "     \
+                              "base)")                                         \
+  ENUM_ENTRY(TYPE_MOFFS16,    "2-byte")                                        \
+  ENUM_ENTRY(TYPE_MOFFS32,    "4-byte")                                        \
+  ENUM_ENTRY(TYPE_MOFFS64,    "8-byte")                                        \
+  ENUM_ENTRY(TYPE_SREG,       "Byte with single bit set: 0 = ES, 1 = CS, "     \
+                              "2 = SS, 3 = DS, 4 = FS, 5 = GS")                \
+  ENUM_ENTRY(TYPE_M32FP,      "32-bit IEE754 memory floating-point operand")   \
+  ENUM_ENTRY(TYPE_M64FP,      "64-bit")                                        \
+  ENUM_ENTRY(TYPE_M80FP,      "80-bit extended")                               \
+  ENUM_ENTRY(TYPE_M16INT,     "2-byte memory integer operand for use in "      \
+                              "floating-point instructions")                   \
+  ENUM_ENTRY(TYPE_M32INT,     "4-byte")                                        \
+  ENUM_ENTRY(TYPE_M64INT,     "8-byte")                                        \
+  ENUM_ENTRY(TYPE_ST,         "Position on the floating-point stack")          \
+  ENUM_ENTRY(TYPE_MM,         "MMX register operand")                          \
+  ENUM_ENTRY(TYPE_MM32,       "4-byte MMX register or memory operand")         \
+  ENUM_ENTRY(TYPE_MM64,       "8-byte")                                        \
+  ENUM_ENTRY(TYPE_XMM,        "XMM register operand")                          \
+  ENUM_ENTRY(TYPE_XMM32,      "4-byte XMM register or memory operand")         \
+  ENUM_ENTRY(TYPE_XMM64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_XMM128,     "16-byte")                                       \
+  ENUM_ENTRY(TYPE_XMM256,     "32-byte")                                       \
+  ENUM_ENTRY(TYPE_XMM0,       "Implicit use of XMM0")                          \
+  ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand")                      \
+  ENUM_ENTRY(TYPE_DEBUGREG,   "Debug register operand")                        \
+  ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand")                      \
+                                                                               \
+  ENUM_ENTRY(TYPE_Mv,         "Memory operand of operand size")                \
+  ENUM_ENTRY(TYPE_Rv,         "Register operand of operand size")              \
+  ENUM_ENTRY(TYPE_IMMv,       "Immediate operand of operand size")             \
+  ENUM_ENTRY(TYPE_RELv,       "Immediate address of operand size")             \
+  ENUM_ENTRY(TYPE_DUP0,       "Duplicate of operand 0")                        \
+  ENUM_ENTRY(TYPE_DUP1,       "operand 1")                                     \
+  ENUM_ENTRY(TYPE_DUP2,       "operand 2")                                     \
+  ENUM_ENTRY(TYPE_DUP3,       "operand 3")                                     \
+  ENUM_ENTRY(TYPE_DUP4,       "operand 4")                                     \
+  ENUM_ENTRY(TYPE_M512,       "512-bit FPU/MMX/XMM/MXCSR state")
+
+#define ENUM_ENTRY(n, d) n,
+typedef enum {
+  TYPES
+  TYPE_max
+} OperandType;
+#undef ENUM_ENTRY
+
+/*
+ * OperandSpecifier - The specification for how to extract and interpret one
+ *   operand.
+ */
+struct OperandSpecifier {
+  uint8_t encoding;
+  uint8_t type;
+};
+
+/*
+ * Indicates where the opcode modifier (if any) is to be found.  Extended
+ * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
+ */
+
+#define MODIFIER_TYPES        \
+  ENUM_ENTRY(MODIFIER_NONE)   \
+  ENUM_ENTRY(MODIFIER_OPCODE) \
+  ENUM_ENTRY(MODIFIER_MODRM)
+
+#define ENUM_ENTRY(n) n,
+typedef enum {
+  MODIFIER_TYPES
+  MODIFIER_max
+} ModifierType;
+#undef ENUM_ENTRY
+
+#define X86_MAX_OPERANDS 5
+
+/*
+ * The specification for how to extract and interpret a full instruction and
+ * its operands.
+ */
+struct InstructionSpecifier {
+  uint8_t modifierType;
+  uint8_t modifierBase;
+
+  /* The macro below must be defined wherever this file is included. */
+  INSTRUCTION_SPECIFIER_FIELDS
+};
+
+/*
+ * Decoding mode for the Intel disassembler.  16-bit, 32-bit, and 64-bit mode
+ * are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
+ * respectively.
+ */
+typedef enum {
+  MODE_16BIT,
+  MODE_32BIT,
+  MODE_64BIT
+} DisassemblerMode;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
new file mode 100644
index 000000000000..e357710b20eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -0,0 +1,218 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include <map>
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
+#include "X86GenAsmWriter.inc"
+
+void X86ATTInstPrinter::printRegName(raw_ostream &OS,
+                                     unsigned RegNo) const {
+  OS << markup("<reg:")
+     << '%' << getRegisterName(RegNo)
+     << markup(">");
+}
+
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                  StringRef Annot) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+
+  if (TSFlags & X86II::LOCK)
+    OS << "\tlock\n";
+
+  // Try to print any aliases first.
+  if (!printAliasInstr(MI, OS))
+    printInstruction(MI, OS);
+  
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+
+  // If verbose assembly is enabled, we can print some informative comments.
+  if (CommentStream)
+    EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+}
+
+void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid ssecc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  }
+}
+
+void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+                                   raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid avxcc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  case 0x10: O << "eq_os"; break;
+  case 0x11: O << "lt_oq"; break;
+  case 0x12: O << "le_oq"; break;
+  case 0x13: O << "unord_s"; break;
+  case 0x14: O << "neq_us"; break;
+  case 0x15: O << "nlt_uq"; break;
+  case 0x16: O << "nle_uq"; break;
+  case 0x17: O << "ord_s"; break;
+  case 0x18: O << "eq_us"; break;
+  case 0x19: O << "nge_uq"; break;
+  case 0x1a: O << "ngt_uq"; break;
+  case 0x1b: O << "false_os"; break;
+  case 0x1c: O << "neq_os"; break;
+  case 0x1d: O << "ge_oq"; break;
+  case 0x1e: O << "gt_oq"; break;
+  case 0x1f: O << "true_us"; break;
+  }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls).  These
+/// print slightly differently than normal immediates.  For example, a $ is not
+/// emitted.
+void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << formatImm(Op.getImm());
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    // If a symbolic branch target was added as a constant expression then print
+    // that address in hex.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+      O << "0x";
+      O.write_hex(Address);
+    }
+    else {
+      // Otherwise, just print the expression.
+      O << *Op.getExpr();
+    }
+  }
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+  } else if (Op.isImm()) {
+    // Print X86 immediates as signed values.
+    O << markup("<imm:")
+      << '$' << formatImm((int64_t)Op.getImm())
+      << markup(">");
+    
+    if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
+      *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
+    
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << markup("<imm:")
+      << '$' << *Op.getExpr()
+      << markup(">");
+  }
+}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+                                          raw_ostream &O) {
+  const MCOperand &BaseReg  = MI->getOperand(Op);
+  const MCOperand &IndexReg = MI->getOperand(Op+2);
+  const MCOperand &DispSpec = MI->getOperand(Op+3);
+  const MCOperand &SegReg = MI->getOperand(Op+4);
+  
+  O << markup("<mem:");
+
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+4, O);
+    O << ':';
+  }
+  
+  if (DispSpec.isImm()) {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+      O << formatImm(DispVal);
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+    O << *DispSpec.getExpr();
+  }
+  
+  if (IndexReg.getReg() || BaseReg.getReg()) {
+    O << '(';
+    if (BaseReg.getReg())
+      printOperand(MI, Op, O);
+    
+    if (IndexReg.getReg()) {
+      O << ',';
+      printOperand(MI, Op+2, O);
+      unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+      if (ScaleVal != 1) {
+        O << ','
+	  << markup("<imm:")
+          << ScaleVal // never printed in hex.
+	  << markup(">");
+      }
+    }
+    O << ')';
+  }
+
+  O << markup(">");
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
new file mode 100644
index 000000000000..8e09183dccc9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -0,0 +1,87 @@
+//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ATT_INST_PRINTER_H
+#define X86_ATT_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class MCOperand;
+  
+class X86ATTInstPrinter : public MCInstPrinter {
+public:
+  X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                    const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+  virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
+
+  // Autogenerated by tblgen, returns true if we successfully printed an
+  // alias.
+  bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &OS);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+  
+  void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  
+  void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+  void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+};
+  
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
new file mode 100644
index 000000000000..0f6eeb19bccd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -0,0 +1,511 @@
+//===-- X86InstComments.cpp - Generate verbose-asm comments for instrs ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstComments.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Top Level Entrypoint
+//===----------------------------------------------------------------------===//
+
+/// EmitAnyX86InstComments - This function decodes x86 instructions and prints
+/// newline terminated strings to the specified string if desired.  This
+/// information is shown in disassembly dumps when verbose assembly is enabled.
+void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                                  const char *(*getRegName)(unsigned)) {
+  // If this is a shuffle operation, the switch should fill in this state.
+  SmallVector<int, 8> ShuffleMask;
+  const char *DestName = 0, *Src1Name = 0, *Src2Name = 0;
+
+  switch (MI->getOpcode()) {
+  case X86::INSERTPSrr:
+  case X86::VINSERTPSrr:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask);
+    break;
+
+  case X86::MOVLHPSrr:
+  case X86::VMOVLHPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVLHPSMask(2, ShuffleMask);
+    break;
+
+  case X86::MOVHLPSrr:
+  case X86::VMOVHLPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeMOVHLPSMask(2, ShuffleMask);
+    break;
+
+  case X86::PALIGNR128rr:
+  case X86::VPALIGNR128rr:
+    Src1Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PALIGNR128rm:
+  case X86::VPALIGNR128rm:
+    Src2Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePALIGNRMask(MVT::v16i8,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+  case X86::VPALIGNR256rr:
+    Src1Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPALIGNR256rm:
+    Src2Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePALIGNRMask(MVT::v32i8,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+
+  case X86::PSHUFDri:
+  case X86::VPSHUFDri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PSHUFDmi:
+  case X86::VPSHUFDmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePSHUFMask(MVT::v4i32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                     ShuffleMask);
+    break;
+  case X86::VPSHUFDYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPSHUFDYmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePSHUFMask(MVT::v8i32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                    ShuffleMask);
+    break;
+
+
+  case X86::PSHUFHWri:
+  case X86::VPSHUFHWri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PSHUFHWmi:
+  case X86::VPSHUFHWmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePSHUFHWMask(MVT::v8i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+  case X86::VPSHUFHWYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPSHUFHWYmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePSHUFHWMask(MVT::v16i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+  case X86::PSHUFLWri:
+  case X86::VPSHUFLWri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::PSHUFLWmi:
+  case X86::VPSHUFLWmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePSHUFLWMask(MVT::v8i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+  case X86::VPSHUFLWYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPSHUFLWYmi:
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodePSHUFLWMask(MVT::v16i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    break;
+
+  case X86::PUNPCKHBWrr:
+  case X86::VPUNPCKHBWrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHBWrm:
+  case X86::VPUNPCKHBWrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v16i8, ShuffleMask);
+    break;
+  case X86::VPUNPCKHBWYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHBWYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v32i8, ShuffleMask);
+    break;
+  case X86::PUNPCKHWDrr:
+  case X86::VPUNPCKHWDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHWDrm:
+  case X86::VPUNPCKHWDrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v8i16, ShuffleMask);
+    break;
+  case X86::VPUNPCKHWDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHWDYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v16i16, ShuffleMask);
+    break;
+  case X86::PUNPCKHDQrr:
+  case X86::VPUNPCKHDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHDQrm:
+  case X86::VPUNPCKHDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v4i32, ShuffleMask);
+    break;
+  case X86::VPUNPCKHDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v8i32, ShuffleMask);
+    break;
+  case X86::PUNPCKHQDQrr:
+  case X86::VPUNPCKHQDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKHQDQrm:
+  case X86::VPUNPCKHQDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v2i64, ShuffleMask);
+    break;
+  case X86::VPUNPCKHQDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKHQDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKHMask(MVT::v4i64, ShuffleMask);
+    break;
+
+  case X86::PUNPCKLBWrr:
+  case X86::VPUNPCKLBWrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLBWrm:
+  case X86::VPUNPCKLBWrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v16i8, ShuffleMask);
+    break;
+  case X86::VPUNPCKLBWYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLBWYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v32i8, ShuffleMask);
+    break;
+  case X86::PUNPCKLWDrr:
+  case X86::VPUNPCKLWDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLWDrm:
+  case X86::VPUNPCKLWDrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v8i16, ShuffleMask);
+    break;
+  case X86::VPUNPCKLWDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLWDYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v16i16, ShuffleMask);
+    break;
+  case X86::PUNPCKLDQrr:
+  case X86::VPUNPCKLDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLDQrm:
+  case X86::VPUNPCKLDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v4i32, ShuffleMask);
+    break;
+  case X86::VPUNPCKLDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v8i32, ShuffleMask);
+    break;
+  case X86::PUNPCKLQDQrr:
+  case X86::VPUNPCKLQDQrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::PUNPCKLQDQrm:
+  case X86::VPUNPCKLQDQrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v2i64, ShuffleMask);
+    break;
+  case X86::VPUNPCKLQDQYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPUNPCKLQDQYrm:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    DecodeUNPCKLMask(MVT::v4i64, ShuffleMask);
+    break;
+
+  case X86::SHUFPDrri:
+  case X86::VSHUFPDrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::SHUFPDrmi:
+  case X86::VSHUFPDrmi:
+    DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                    ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VSHUFPDYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VSHUFPDYrmi:
+    DecodeSHUFPMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                    ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::SHUFPSrri:
+  case X86::VSHUFPSrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::SHUFPSrmi:
+  case X86::VSHUFPSrmi:
+    DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                    ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VSHUFPSYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VSHUFPSYrmi:
+    DecodeSHUFPMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                    ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+
+  case X86::UNPCKLPDrr:
+  case X86::VUNPCKLPDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKLPDrm:
+  case X86::VUNPCKLPDrm:
+    DecodeUNPCKLMask(MVT::v2f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKLPDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKLPDYrm:
+    DecodeUNPCKLMask(MVT::v4f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::UNPCKLPSrr:
+  case X86::VUNPCKLPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKLPSrm:
+  case X86::VUNPCKLPSrm:
+    DecodeUNPCKLMask(MVT::v4f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKLPSYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKLPSYrm:
+    DecodeUNPCKLMask(MVT::v8f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::UNPCKHPDrr:
+  case X86::VUNPCKHPDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKHPDrm:
+  case X86::VUNPCKHPDrm:
+    DecodeUNPCKHMask(MVT::v2f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPDYrm:
+    DecodeUNPCKHMask(MVT::v4f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::UNPCKHPSrr:
+  case X86::VUNPCKHPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::UNPCKHPSrm:
+  case X86::VUNPCKHPSrm:
+    DecodeUNPCKHMask(MVT::v4f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPSYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPSYrm:
+    DecodeUNPCKHMask(MVT::v8f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPSri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPSmi:
+    DecodePSHUFMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                    ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPSYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPSYmi:
+    DecodePSHUFMask(MVT::v8f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                    ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPDmi:
+    DecodePSHUFMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                    ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMILPDYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMILPDYmi:
+    DecodePSHUFMask(MVT::v4f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
+                       ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERM2F128rr:
+  case X86::VPERM2I128rr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPERM2F128rm:
+  case X86::VPERM2I128rm:
+    // For instruction comments purpose, assume the 256-bit vector is v4i64.
+    DecodeVPERM2X128Mask(MVT::v4i64,
+                         MI->getOperand(MI->getNumOperands()-1).getImm(),
+                         ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VPERMQYri:
+  case X86::VPERMPDYri:
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    // FALL THROUGH.
+  case X86::VPERMQYmi:
+  case X86::VPERMPDYmi:
+    DecodeVPERMMask(MI->getOperand(MI->getNumOperands()-1).getImm(),
+                    ShuffleMask);
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  }
+
+
+  // If this was a shuffle operation, print the shuffle mask.
+  if (!ShuffleMask.empty()) {
+    if (DestName == 0) DestName = Src1Name;
+    OS << (DestName ? DestName : "mem") << " = ";
+
+    // If the two sources are the same, canonicalize the input elements to be
+    // from the first src so that we get larger element spans.
+    if (Src1Name == Src2Name) {
+      for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+        if ((int)ShuffleMask[i] >= 0 && // Not sentinel.
+            ShuffleMask[i] >= (int)e)        // From second mask.
+          ShuffleMask[i] -= e;
+      }
+    }
+
+    // The shuffle mask specifies which elements of the src1/src2 fill in the
+    // destination, with a few sentinel values.  Loop through and print them
+    // out.
+    for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) {
+      if (i != 0)
+        OS << ',';
+      if (ShuffleMask[i] == SM_SentinelZero) {
+        OS << "zero";
+        continue;
+      }
+
+      // Otherwise, it must come from src1 or src2.  Print the span of elements
+      // that comes from this src.
+      bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size();
+      const char *SrcName = isSrc1 ? Src1Name : Src2Name;
+      OS << (SrcName ? SrcName : "mem") << '[';
+      bool IsFirst = true;
+      while (i != e &&
+             (int)ShuffleMask[i] >= 0 &&
+             (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) {
+        if (!IsFirst)
+          OS << ',';
+        else
+          IsFirst = false;
+        OS << ShuffleMask[i] % ShuffleMask.size();
+        ++i;
+      }
+      OS << ']';
+      --i;  // For loop increments element #.
+    }
+    //MI->print(OS, 0);
+    OS << "\n";
+  }
+
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
new file mode 100644
index 000000000000..13fdf9af8c98
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -0,0 +1,25 @@
+//=- X86InstComments.h - Generate verbose-asm comments for instrs -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines functionality used to emit comments about X86 instructions to
+// an output stream for -fverbose-asm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_INST_COMMENTS_H
+#define X86_INST_COMMENTS_H
+
+namespace llvm {
+  class MCInst;
+  class raw_ostream;
+  void EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
+                              const char *(*getRegName)(unsigned));
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
new file mode 100644
index 000000000000..141f4a4dd856
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -0,0 +1,208 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelInstPrinter.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86InstComments.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#include "X86GenAsmWriter1.inc"
+
+void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << getRegisterName(RegNo);
+}
+
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                    StringRef Annot) {
+  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+  uint64_t TSFlags = Desc.TSFlags;
+
+  if (TSFlags & X86II::LOCK)
+    OS << "\tlock\n";
+
+  printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+
+  // If verbose assembly is enabled, we can print some informative comments.
+  if (CommentStream)
+    EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+}
+
+void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid ssecc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  }
+}
+
+void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+                                     raw_ostream &O) {
+  int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+  switch (Imm) {
+  default: llvm_unreachable("Invalid avxcc argument!");
+  case    0: O << "eq"; break;
+  case    1: O << "lt"; break;
+  case    2: O << "le"; break;
+  case    3: O << "unord"; break;
+  case    4: O << "neq"; break;
+  case    5: O << "nlt"; break;
+  case    6: O << "nle"; break;
+  case    7: O << "ord"; break;
+  case    8: O << "eq_uq"; break;
+  case    9: O << "nge"; break;
+  case  0xa: O << "ngt"; break;
+  case  0xb: O << "false"; break;
+  case  0xc: O << "neq_oq"; break;
+  case  0xd: O << "ge"; break;
+  case  0xe: O << "gt"; break;
+  case  0xf: O << "true"; break;
+  case 0x10: O << "eq_os"; break;
+  case 0x11: O << "lt_oq"; break;
+  case 0x12: O << "le_oq"; break;
+  case 0x13: O << "unord_s"; break;
+  case 0x14: O << "neq_us"; break;
+  case 0x15: O << "nlt_uq"; break;
+  case 0x16: O << "nle_uq"; break;
+  case 0x17: O << "ord_s"; break;
+  case 0x18: O << "eq_us"; break;
+  case 0x19: O << "nge_uq"; break;
+  case 0x1a: O << "ngt_uq"; break;
+  case 0x1b: O << "false_os"; break;
+  case 0x1c: O << "neq_os"; break;
+  case 0x1d: O << "ge_oq"; break;
+  case 0x1e: O << "gt_oq"; break;
+  case 0x1f: O << "true_us"; break;
+  }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.
+void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << Op.getImm();
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    // If a symbolic branch target was added as a constant expression then print
+    // that address in hex.
+    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+    int64_t Address;
+    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+      O << "0x";
+      O.write_hex(Address);
+    }
+    else {
+      // Otherwise, just print the expression.
+      O << *Op.getExpr();
+    }
+  }
+}
+
+static void PrintRegName(raw_ostream &O, StringRef RegName) {
+  for (unsigned i = 0, e = RegName.size(); i != e; ++i)
+    O << (char)toupper(RegName[i]);
+}
+
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                       raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    PrintRegName(O, getRegisterName(Op.getReg()));
+  } else if (Op.isImm()) {
+    O << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
+                                            raw_ostream &O) {
+  const MCOperand &BaseReg  = MI->getOperand(Op);
+  unsigned ScaleVal         = MI->getOperand(Op+1).getImm();
+  const MCOperand &IndexReg = MI->getOperand(Op+2);
+  const MCOperand &DispSpec = MI->getOperand(Op+3);
+  const MCOperand &SegReg   = MI->getOperand(Op+4);
+  
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+4, O);
+    O << ':';
+  }
+  
+  O << '[';
+  
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOperand(MI, Op, O);
+    NeedPlus = true;
+  }
+  
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << '*';
+    printOperand(MI, Op+2, O);
+    NeedPlus = true;
+  }
+
+  if (!DispSpec.isImm()) {
+    if (NeedPlus) O << " + ";
+    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+    O << *DispSpec.getExpr();
+  } else {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+      if (NeedPlus) {
+        if (DispVal > 0)
+          O << " + ";
+        else {
+          O << " - ";
+          DispVal = -DispVal;
+        }
+      }
+      O << DispVal;
+    }
+  }
+  
+  O << ']';
+}
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
new file mode 100644
index 000000000000..bb769eb52e4f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -0,0 +1,96 @@
+//= X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to Intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_INTEL_INST_PRINTER_H
+#define X86_INTEL_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCOperand;
+  
+class X86IntelInstPrinter : public MCInstPrinter {
+public:
+  X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                      const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+  virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
+  
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+  void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  
+  void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "OPAQUE PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  
+  void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "BYTE PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "WORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "DWORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "XMMWORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "YMMWORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "DWORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "XWORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "XMMWORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "YMMWORD PTR ";
+    printMemReference(MI, OpNo, O);
+  }
+};
+  
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
new file mode 100644
index 000000000000..598ddee56d21
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -0,0 +1,476 @@
+//===-- X86AsmBackend.cpp - X86 Assembler Backend -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Option to allow disabling arithmetic relaxation to workaround PR9807, which
+// is useful when running bitwise comparison experiments on Darwin. We should be
+// able to remove this once PR9807 is resolved.
+static cl::opt<bool>
+MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
+         cl::desc("Disable relaxation of arithmetic instruction for X86"));
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default: llvm_unreachable("invalid fixup kind!");
+  case FK_PCRel_1:
+  case FK_SecRel_1:
+  case FK_Data_1: return 0;
+  case FK_PCRel_2:
+  case FK_SecRel_2:
+  case FK_Data_2: return 1;
+  case FK_PCRel_4:
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case X86::reloc_global_offset_table:
+  case FK_SecRel_4:
+  case FK_Data_4: return 2;
+  case FK_PCRel_8:
+  case FK_SecRel_8:
+  case FK_Data_8: return 3;
+  }
+}
+
+namespace {
+
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
+                     bool HasRelocationAddend, bool foobar)
+    : MCELFObjectTargetWriter(is64Bit, OSABI, EMachine, HasRelocationAddend) {}
+};
+
+class X86AsmBackend : public MCAsmBackend {
+  StringRef CPU;
+public:
+  X86AsmBackend(const Target &T, StringRef _CPU)
+    : MCAsmBackend(), CPU(_CPU) {}
+
+  unsigned getNumFixupKinds() const {
+    return X86::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+      { "reloc_riprel_4byte", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel },
+      { "reloc_riprel_4byte_movq_load", 0, 4 * 8, MCFixupKindInfo::FKF_IsPCRel},
+      { "reloc_signed_4byte", 0, 4 * 8, 0},
+      { "reloc_global_offset_table", 0, 4 * 8, 0}
+    };
+
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value) const {
+    unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
+
+    assert(Fixup.getOffset() + Size <= DataSize &&
+           "Invalid fixup offset!");
+
+    // Check that uppper bits are either all zeros or all ones.
+    // Specifically ignore overflow/underflow as long as the leakage is
+    // limited to the lower bits. This is to remain compatible with
+    // other assemblers.
+    assert(isIntN(Size * 8 + 1, Value) &&
+           "Value does not fit in the Fixup field");
+
+    for (unsigned i = 0; i != Size; ++i)
+      Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst) const;
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup,
+                            uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const;
+
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+};
+} // end anonymous namespace
+
+static unsigned getRelaxedOpcodeBranch(unsigned Op) {
+  switch (Op) {
+  default:
+    return Op;
+
+  case X86::JAE_1: return X86::JAE_4;
+  case X86::JA_1:  return X86::JA_4;
+  case X86::JBE_1: return X86::JBE_4;
+  case X86::JB_1:  return X86::JB_4;
+  case X86::JE_1:  return X86::JE_4;
+  case X86::JGE_1: return X86::JGE_4;
+  case X86::JG_1:  return X86::JG_4;
+  case X86::JLE_1: return X86::JLE_4;
+  case X86::JL_1:  return X86::JL_4;
+  case X86::JMP_1: return X86::JMP_4;
+  case X86::JNE_1: return X86::JNE_4;
+  case X86::JNO_1: return X86::JNO_4;
+  case X86::JNP_1: return X86::JNP_4;
+  case X86::JNS_1: return X86::JNS_4;
+  case X86::JO_1:  return X86::JO_4;
+  case X86::JP_1:  return X86::JP_4;
+  case X86::JS_1:  return X86::JS_4;
+  }
+}
+
+static unsigned getRelaxedOpcodeArith(unsigned Op) {
+  switch (Op) {
+  default:
+    return Op;
+
+    // IMUL
+  case X86::IMUL16rri8: return X86::IMUL16rri;
+  case X86::IMUL16rmi8: return X86::IMUL16rmi;
+  case X86::IMUL32rri8: return X86::IMUL32rri;
+  case X86::IMUL32rmi8: return X86::IMUL32rmi;
+  case X86::IMUL64rri8: return X86::IMUL64rri32;
+  case X86::IMUL64rmi8: return X86::IMUL64rmi32;
+
+    // AND
+  case X86::AND16ri8: return X86::AND16ri;
+  case X86::AND16mi8: return X86::AND16mi;
+  case X86::AND32ri8: return X86::AND32ri;
+  case X86::AND32mi8: return X86::AND32mi;
+  case X86::AND64ri8: return X86::AND64ri32;
+  case X86::AND64mi8: return X86::AND64mi32;
+
+    // OR
+  case X86::OR16ri8: return X86::OR16ri;
+  case X86::OR16mi8: return X86::OR16mi;
+  case X86::OR32ri8: return X86::OR32ri;
+  case X86::OR32mi8: return X86::OR32mi;
+  case X86::OR64ri8: return X86::OR64ri32;
+  case X86::OR64mi8: return X86::OR64mi32;
+
+    // XOR
+  case X86::XOR16ri8: return X86::XOR16ri;
+  case X86::XOR16mi8: return X86::XOR16mi;
+  case X86::XOR32ri8: return X86::XOR32ri;
+  case X86::XOR32mi8: return X86::XOR32mi;
+  case X86::XOR64ri8: return X86::XOR64ri32;
+  case X86::XOR64mi8: return X86::XOR64mi32;
+
+    // ADD
+  case X86::ADD16ri8: return X86::ADD16ri;
+  case X86::ADD16mi8: return X86::ADD16mi;
+  case X86::ADD32ri8: return X86::ADD32ri;
+  case X86::ADD32mi8: return X86::ADD32mi;
+  case X86::ADD64ri8: return X86::ADD64ri32;
+  case X86::ADD64mi8: return X86::ADD64mi32;
+
+    // SUB
+  case X86::SUB16ri8: return X86::SUB16ri;
+  case X86::SUB16mi8: return X86::SUB16mi;
+  case X86::SUB32ri8: return X86::SUB32ri;
+  case X86::SUB32mi8: return X86::SUB32mi;
+  case X86::SUB64ri8: return X86::SUB64ri32;
+  case X86::SUB64mi8: return X86::SUB64mi32;
+
+    // CMP
+  case X86::CMP16ri8: return X86::CMP16ri;
+  case X86::CMP16mi8: return X86::CMP16mi;
+  case X86::CMP32ri8: return X86::CMP32ri;
+  case X86::CMP32mi8: return X86::CMP32mi;
+  case X86::CMP64ri8: return X86::CMP64ri32;
+  case X86::CMP64mi8: return X86::CMP64mi32;
+
+    // PUSH
+  case X86::PUSHi8: return X86::PUSHi32;
+  case X86::PUSHi16: return X86::PUSHi32;
+  case X86::PUSH64i8: return X86::PUSH64i32;
+  case X86::PUSH64i16: return X86::PUSH64i32;
+  }
+}
+
+static unsigned getRelaxedOpcode(unsigned Op) {
+  unsigned R = getRelaxedOpcodeArith(Op);
+  if (R != Op)
+    return R;
+  return getRelaxedOpcodeBranch(Op);
+}
+
+bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  // Branches can always be relaxed.
+  if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode())
+    return true;
+
+  if (MCDisableArithRelaxation)
+    return false;
+
+  // Check if this instruction is ever relaxable.
+  if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode())
+    return false;
+
+
+  // Check if it has an expression and is not RIP relative.
+  bool hasExp = false;
+  bool hasRIP = false;
+  for (unsigned i = 0; i < Inst.getNumOperands(); ++i) {
+    const MCOperand &Op = Inst.getOperand(i);
+    if (Op.isExpr())
+      hasExp = true;
+
+    if (Op.isReg() && Op.getReg() == X86::RIP)
+      hasRIP = true;
+  }
+
+  // FIXME: Why exactly do we need the !hasRIP? Is it just a limitation on
+  // how we do relaxations?
+  return hasExp && !hasRIP;
+}
+
+bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                         uint64_t Value,
+                                         const MCRelaxableFragment *DF,
+                                         const MCAsmLayout &Layout) const {
+  // Relax if the value is too big for a (signed) i8.
+  return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+// FIXME: Can tblgen help at all here to verify there aren't other instructions
+// we can relax?
+void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
+  unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode());
+
+  if (RelaxedOp == Inst.getOpcode()) {
+    SmallString<256> Tmp;
+    raw_svector_ostream OS(Tmp);
+    Inst.dump_pretty(OS);
+    OS << "\n";
+    report_fatal_error("unexpected instruction to relax: " + OS.str());
+  }
+
+  Res = Inst;
+  Res.setOpcode(RelaxedOp);
+}
+
+/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// bytes.
+/// \return - true on success, false on failure
+bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  static const uint8_t Nops[10][10] = {
+    // nop
+    {0x90},
+    // xchg %ax,%ax
+    {0x66, 0x90},
+    // nopl (%[re]ax)
+    {0x0f, 0x1f, 0x00},
+    // nopl 0(%[re]ax)
+    {0x0f, 0x1f, 0x40, 0x00},
+    // nopl 0(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopw 0(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
+    // nopl 0L(%[re]ax)
+    {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
+    // nopl 0L(%[re]ax,%[re]ax,1)
+    {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw 0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    // nopw %cs:0L(%[re]ax,%[re]ax,1)
+    {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+  };
+
+  // This CPU doesnt support long nops. If needed add more.
+  // FIXME: Can we get this from the subtarget somehow?
+  if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" ||
+      CPU == "pentium" || CPU == "pentium-mmx" || CPU == "geode") {
+    for (uint64_t i = 0; i < Count; ++i)
+      OW->Write8(0x90);
+    return true;
+  }
+
+  // 15 is the longest single nop instruction.  Emit as many 15-byte nops as
+  // needed, then emit a nop of the remaining length.
+  do {
+    const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15);
+    const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
+    for (uint8_t i = 0; i < Prefixes; i++)
+      OW->Write8(0x66);
+    const uint8_t Rest = ThisNopLength - Prefixes;
+    for (uint8_t i = 0; i < Rest; i++)
+      OW->Write8(Nops[Rest - 1][i]);
+    Count -= ThisNopLength;
+  } while (Count != 0);
+
+  return true;
+}
+
+/* *** */
+
+namespace {
+class ELFX86AsmBackend : public X86AsmBackend {
+public:
+  uint8_t OSABI;
+  ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU)
+    : X86AsmBackend(T, CPU), OSABI(_OSABI) {
+    HasReliableSymbolDifference = true;
+  }
+
+  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+    const MCSectionELF &ES = static_cast<const MCSectionELF&>(Section);
+    return ES.getFlags() & ELF::SHF_MERGE;
+  }
+};
+
+class ELFX86_32AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+    : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
+  }
+};
+
+class ELFX86_64AsmBackend : public ELFX86AsmBackend {
+public:
+  ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+    : ELFX86AsmBackend(T, OSABI, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
+  }
+};
+
+class WindowsX86AsmBackend : public X86AsmBackend {
+  bool Is64Bit;
+
+public:
+  WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
+    : X86AsmBackend(T, CPU)
+    , Is64Bit(is64Bit) {
+  }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createX86WinCOFFObjectWriter(OS, Is64Bit);
+  }
+};
+
+class DarwinX86AsmBackend : public X86AsmBackend {
+public:
+  DarwinX86AsmBackend(const Target &T, StringRef CPU)
+    : X86AsmBackend(T, CPU) { }
+};
+
+class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+public:
+  DarwinX86_32AsmBackend(const Target &T, StringRef CPU)
+    : DarwinX86AsmBackend(T, CPU) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
+                                     object::mach::CTM_i386,
+                                     object::mach::CSX86_ALL);
+  }
+};
+
+class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+public:
+  DarwinX86_64AsmBackend(const Target &T, StringRef CPU)
+    : DarwinX86AsmBackend(T, CPU) {
+    HasReliableSymbolDifference = true;
+  }
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
+                                     object::mach::CTM_x86_64,
+                                     object::mach::CSX86_ALL);
+  }
+
+  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+    // Temporary labels in the string literals sections require symbols. The
+    // issue is that the x86_64 relocation format does not allow symbol +
+    // offset, and so the linker does not have enough information to resolve the
+    // access to the appropriate atom unless an external relocation is used. For
+    // non-cstring sections, we expect the compiler to use a non-temporary label
+    // for anything that could have an addend pointing outside the symbol.
+    //
+    // See <rdar://problem/4765733>.
+    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+    return SMO.getType() == MCSectionMachO::S_CSTRING_LITERALS;
+  }
+
+  virtual bool isSectionAtomizable(const MCSection &Section) const {
+    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
+    // Fixed sized data sections are uniqued, they cannot be diced into atoms.
+    switch (SMO.getType()) {
+    default:
+      return true;
+
+    case MCSectionMachO::S_4BYTE_LITERALS:
+    case MCSectionMachO::S_8BYTE_LITERALS:
+    case MCSectionMachO::S_16BYTE_LITERALS:
+    case MCSectionMachO::S_LITERAL_POINTERS:
+    case MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS:
+    case MCSectionMachO::S_LAZY_SYMBOL_POINTERS:
+    case MCSectionMachO::S_MOD_INIT_FUNC_POINTERS:
+    case MCSectionMachO::S_MOD_TERM_FUNC_POINTERS:
+    case MCSectionMachO::S_INTERPOSING:
+      return false;
+    }
+  }
+};
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+    return new DarwinX86_32AsmBackend(T, CPU);
+
+  if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+    return new WindowsX86AsmBackend(T, false, CPU);
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+  return new ELFX86_32AsmBackend(T, OSABI, CPU);
+}
+
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+    return new DarwinX86_64AsmBackend(T, CPU);
+
+  if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+    return new WindowsX86AsmBackend(T, true, CPU);
+
+  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+  return new ELFX86_64AsmBackend(T, OSABI, CPU);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
new file mode 100644
index 000000000000..36695600707e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -0,0 +1,621 @@
+//===-- X86BaseInfo.h - Top level definitions for X86 -------- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the X86 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86BASEINFO_H
+#define X86BASEINFO_H
+
+#include "X86MCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+namespace X86 {
+  // Enums for memory operand decoding.  Each memory operand is represented with
+  // a 5 operand sequence in the form:
+  //   [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+  // These enums help decode this.
+  enum {
+    AddrBaseReg = 0,
+    AddrScaleAmt = 1,
+    AddrIndexReg = 2,
+    AddrDisp = 3,
+
+    /// AddrSegmentReg - The operand # of the segment in the memory operand.
+    AddrSegmentReg = 4,
+
+    /// AddrNumOperands - Total number of operands in a memory reference.
+    AddrNumOperands = 5
+  };
+} // end namespace X86;
+ 
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // X86 Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+    /// relocation of:
+    ///    SYMBOL_LABEL + [. - PICBASELABEL]
+    MO_GOT_ABSOLUTE_ADDRESS,
+
+    /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+    /// immediate should get the value of the symbol minus the PIC base label:
+    ///    SYMBOL_LABEL - PICBASELABEL
+    MO_PIC_BASE_OFFSET,
+
+    /// MO_GOT - On a symbol operand this indicates that the immediate is the
+    /// offset to the GOT entry for the symbol name from the base of the GOT.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOT
+    MO_GOT,
+
+    /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+    /// the offset to the location of the symbol name from the base of the GOT.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOTOFF
+    MO_GOTOFF,
+
+    /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+    /// offset to the GOT entry for the symbol name from the current code
+    /// location.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @GOTPCREL
+    MO_GOTPCREL,
+
+    /// MO_PLT - On a symbol operand this indicates that the immediate is
+    /// offset to the PLT entry of symbol name from the current code location.
+    ///
+    /// See the X86-64 ELF ABI supplement for more details.
+    ///    SYMBOL_LABEL @PLT
+    MO_PLT,
+
+    /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS index structure that contains
+    /// the module number and variable offset for the symbol. Used in the
+    /// general dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSGD
+    MO_TLSGD,
+
+    /// MO_TLSLD - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS index for the module that
+    /// contains the symbol. When this index is passed to a call to
+    /// __tls_get_addr, the function will return the base address of the TLS
+    /// block for the symbol. Used in the x86-64 local dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSLD
+    MO_TLSLD,
+
+    /// MO_TLSLDM - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS index for the module that
+    /// contains the symbol. When this index is passed to a call to
+    /// ___tls_get_addr, the function will return the base address of the TLS
+    /// block for the symbol. Used in the IA32 local dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TLSLDM
+    MO_TLSLDM,
+
+    /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the thread-pointer offset for the
+    /// symbol. Used in the x86-64 initial exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @GOTTPOFF
+    MO_GOTTPOFF,
+
+    /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+    /// the absolute address of the GOT entry with the negative thread-pointer
+    /// offset for the symbol. Used in the non-PIC IA32 initial exec TLS access
+    /// model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @INDNTPOFF
+    MO_INDNTPOFF,
+
+    /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+    /// the thread-pointer offset for the symbol. Used in the x86-64 local
+    /// exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @TPOFF
+    MO_TPOFF,
+
+    /// MO_DTPOFF - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the TLS offset of the symbol. Used
+    /// in the local dynamic TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @DTPOFF
+    MO_DTPOFF,
+
+    /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+    /// the negative thread-pointer offset for the symbol. Used in the IA32
+    /// local exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @NTPOFF
+    MO_NTPOFF,
+
+    /// MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is
+    /// the offset of the GOT entry with the negative thread-pointer offset for
+    /// the symbol. Used in the PIC IA32 initial exec TLS access model.
+    ///
+    /// See 'ELF Handling for Thread-Local Storage' for more details.
+    ///    SYMBOL_LABEL @GOTNTPOFF
+    MO_GOTNTPOFF,
+
+    /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "__imp_FOO" symbol.  This is used for
+    /// dllimport linkage on windows.
+    MO_DLLIMPORT,
+
+    /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$stub" symbol.  This is used for calls
+    /// and jumps to external functions on Tiger and earlier.
+    MO_DARWIN_STUB,
+
+    /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+    /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+    /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+    MO_DARWIN_NONLAZY,
+
+    /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+    /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+    /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+    MO_DARWIN_NONLAZY_PIC_BASE,
+
+    /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
+    /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
+    /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
+    /// stub.
+    MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
+
+    /// MO_TLVP - On a symbol operand this indicates that the immediate is
+    /// some TLS offset.
+    ///
+    /// This is the TLS offset for the Darwin TLS mechanism.
+    MO_TLVP,
+
+    /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+    /// is some TLS offset from the picbase.
+    ///
+    /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+    MO_TLVP_PIC_BASE,
+
+    /// MO_SECREL - On a symbol operand this indicates that the immediate is
+    /// the offset from beginning of section.
+    ///
+    /// This is the TLS offset for the COFF/Windows TLS mechanism.
+    MO_SECREL
+  };
+
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction encodings.  These are the standard/most common forms for X86
+    // instructions.
+    //
+
+    // PseudoFrm - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo         = 0,
+
+    /// Raw - This form is for instructions that don't have any operands, so
+    /// they are just a fixed opcode value, like 'leave'.
+    RawFrm         = 1,
+
+    /// AddRegFrm - This form is used for instructions like 'push r32' that have
+    /// their one register operand added to their opcode.
+    AddRegFrm      = 2,
+
+    /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is a register.
+    ///
+    MRMDestReg     = 3,
+
+    /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is memory.
+    ///
+    MRMDestMem     = 4,
+
+    /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is a register.
+    ///
+    MRMSrcReg      = 5,
+
+    /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is memory.
+    ///
+    MRMSrcMem      = 6,
+
+    /// MRM[0-7][rm] - These forms are used to represent instructions that use
+    /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information.  In the intel manual these are represented as /0, /1, ...
+    ///
+
+    // First, instructions that operate on a register r/m operand...
+    MRM0r = 16,  MRM1r = 17,  MRM2r = 18,  MRM3r = 19, // Format /0 /1 /2 /3
+    MRM4r = 20,  MRM5r = 21,  MRM6r = 22,  MRM7r = 23, // Format /4 /5 /6 /7
+
+    // Next, instructions that operate on a memory r/m operand...
+    MRM0m = 24,  MRM1m = 25,  MRM2m = 26,  MRM3m = 27, // Format /0 /1 /2 /3
+    MRM4m = 28,  MRM5m = 29,  MRM6m = 30,  MRM7m = 31, // Format /4 /5 /6 /7
+
+    // MRMInitReg - This form is used for instructions whose source and
+    // destinations are the same register.
+    MRMInitReg = 32,
+
+    //// MRM_XX - A mod/rm byte of exactly 0xXX.
+    MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36,
+    MRM_C8 = 37, MRM_C9 = 38, MRM_E8 = 39, MRM_F0 = 40,
+    MRM_F8 = 41, MRM_F9 = 42, MRM_D0 = 45, MRM_D1 = 46,
+    MRM_D4 = 47, MRM_D5 = 48, MRM_D6 = 49, MRM_D8 = 50,
+    MRM_D9 = 51, MRM_DA = 52, MRM_DB = 53, MRM_DC = 54,
+    MRM_DD = 55, MRM_DE = 56, MRM_DF = 57,
+
+    /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+    /// immediates, the first of which is a 16-bit immediate (specified by
+    /// the imm encoding) and the second is a 8-bit fixed value.
+    RawFrmImm8 = 43,
+
+    /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+    /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+    /// the imm encoding) and the second is a 16-bit fixed value.  In the AMD
+    /// manual, this operand is described as pntr16:32 and pntr16:16
+    RawFrmImm16 = 44,
+
+    FormMask       = 63,
+
+    //===------------------------------------------------------------------===//
+    // Actual flags...
+
+    // OpSize - Set if this instruction requires an operand size prefix (0x66),
+    // which most often indicates that the instruction operates on 16 bit data
+    // instead of 32 bit data.
+    OpSize      = 1 << 6,
+
+    // AsSize - Set if this instruction requires an operand size prefix (0x67),
+    // which most often indicates that the instruction address 16 bit address
+    // instead of 32 bit address (or 32 bit address in 64 bit mode).
+    AdSize      = 1 << 7,
+
+    //===------------------------------------------------------------------===//
+    // Op0Mask - There are several prefix bytes that are used to form two byte
+    // opcodes.  These are currently 0x0F, 0xF3, and 0xD8-0xDF.  This mask is
+    // used to obtain the setting of this field.  If no bits in this field is
+    // set, there is no prefix byte for obtaining a multibyte opcode.
+    //
+    Op0Shift    = 8,
+    Op0Mask     = 0x1F << Op0Shift,
+
+    // TB - TwoByte - Set if this instruction has a two byte opcode, which
+    // starts with a 0x0F byte before the real opcode.
+    TB          = 1 << Op0Shift,
+
+    // REP - The 0xF3 prefix byte indicating repetition of the following
+    // instruction.
+    REP         = 2 << Op0Shift,
+
+    // D8-DF - These escape opcodes are used by the floating point unit.  These
+    // values must remain sequential.
+    D8 = 3 << Op0Shift,   D9 = 4 << Op0Shift,
+    DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
+    DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
+    DE = 9 << Op0Shift,   DF = 10 << Op0Shift,
+
+    // XS, XD - These prefix codes are for single and double precision scalar
+    // floating point operations performed in the SSE registers.
+    XD = 11 << Op0Shift,  XS = 12 << Op0Shift,
+
+    // T8, TA, A6, A7 - Prefix after the 0x0F prefix.
+    T8 = 13 << Op0Shift,  TA = 14 << Op0Shift,
+    A6 = 15 << Op0Shift,  A7 = 16 << Op0Shift,
+
+    // T8XD - Prefix before and after 0x0F. Combination of T8 and XD.
+    T8XD = 17 << Op0Shift,
+
+    // T8XS - Prefix before and after 0x0F. Combination of T8 and XS.
+    T8XS = 18 << Op0Shift,
+
+    // TAXD - Prefix before and after 0x0F. Combination of TA and XD.
+    TAXD = 19 << Op0Shift,
+
+    // XOP8 - Prefix to include use of imm byte.
+    XOP8 = 20 << Op0Shift,
+
+    // XOP9 - Prefix to exclude use of imm byte.
+    XOP9 = 21 << Op0Shift,
+
+    //===------------------------------------------------------------------===//
+    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+    // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only cares about REX.W and REX.R bits and only the former is
+    // statically determined.
+    //
+    REXShift    = Op0Shift + 5,
+    REX_W       = 1 << REXShift,
+
+    //===------------------------------------------------------------------===//
+    // This three-bit field describes the size of an immediate operand.  Zero is
+    // unused so that we can tell if we forgot to set a value.
+    ImmShift = REXShift + 1,
+    ImmMask    = 7 << ImmShift,
+    Imm8       = 1 << ImmShift,
+    Imm8PCRel  = 2 << ImmShift,
+    Imm16      = 3 << ImmShift,
+    Imm16PCRel = 4 << ImmShift,
+    Imm32      = 5 << ImmShift,
+    Imm32PCRel = 6 << ImmShift,
+    Imm64      = 7 << ImmShift,
+
+    //===------------------------------------------------------------------===//
+    // FP Instruction Classification...  Zero is non-fp instruction.
+
+    // FPTypeMask - Mask for all of the FP types...
+    FPTypeShift = ImmShift + 3,
+    FPTypeMask  = 7 << FPTypeShift,
+
+    // NotFP - The default, set for instructions that do not use FP registers.
+    NotFP      = 0 << FPTypeShift,
+
+    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+    ZeroArgFP  = 1 << FPTypeShift,
+
+    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+    OneArgFP   = 2 << FPTypeShift,
+
+    // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+    // result back to ST(0).  For example, fcos, fsqrt, etc.
+    //
+    OneArgFPRW = 3 << FPTypeShift,
+
+    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+    // explicit argument, storing the result to either ST(0) or the implicit
+    // argument.  For example: fadd, fsub, fmul, etc...
+    TwoArgFP   = 4 << FPTypeShift,
+
+    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
+    CompareFP  = 5 << FPTypeShift,
+
+    // CondMovFP - "2 operand" floating point conditional move instructions.
+    CondMovFP  = 6 << FPTypeShift,
+
+    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
+    SpecialFP  = 7 << FPTypeShift,
+
+    // Lock prefix
+    LOCKShift = FPTypeShift + 3,
+    LOCK = 1 << LOCKShift,
+
+    // Segment override prefixes. Currently we just need ability to address
+    // stuff in gs and fs segments.
+    SegOvrShift = LOCKShift + 1,
+    SegOvrMask  = 3 << SegOvrShift,
+    FS          = 1 << SegOvrShift,
+    GS          = 2 << SegOvrShift,
+
+    // Execution domain for SSE instructions in bits 23, 24.
+    // 0 in bits 23-24 means normal, non-SSE instruction.
+    SSEDomainShift = SegOvrShift + 2,
+
+    OpcodeShift   = SSEDomainShift + 2,
+
+    //===------------------------------------------------------------------===//
+    /// VEX - The opcode prefix used by AVX instructions
+    VEXShift = OpcodeShift + 8,
+    VEX         = 1U << 0,
+
+    /// VEX_W - Has a opcode specific functionality, but is used in the same
+    /// way as REX_W is for regular SSE instructions.
+    VEX_W       = 1U << 1,
+
+    /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2
+    /// address instructions in SSE are represented as 3 address ones in AVX
+    /// and the additional register is encoded in VEX_VVVV prefix.
+    VEX_4V      = 1U << 2,
+
+    /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode
+    /// operand 3 with VEX.vvvv.
+    VEX_4VOp3   = 1U << 3,
+
+    /// VEX_I8IMM - Specifies that the last register used in a AVX instruction,
+    /// must be encoded in the i8 immediate field. This usually happens in
+    /// instructions with 4 operands.
+    VEX_I8IMM   = 1U << 4,
+
+    /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+    /// instruction uses 256-bit wide registers. This is usually auto detected
+    /// if a VR256 register is used, but some AVX instructions also have this
+    /// field marked when using a f256 memory references.
+    VEX_L       = 1U << 5,
+
+    // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
+    // prefix. Usually used for scalar instructions. Needed by disassembler.
+    VEX_LIG     = 1U << 6,
+
+    /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
+    /// wacky 0x0F 0x0F prefix for 3DNow! instructions.  The manual documents
+    /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+    /// storing a classifier in the imm8 field.  To simplify our implementation,
+    /// we handle this by storeing the classifier in the opcode field and using
+    /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+    Has3DNow0F0FOpcode = 1U << 7,
+
+    /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
+    /// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
+    MemOp4 = 1U << 8,
+
+    /// XOP - Opcode prefix used by XOP instructions.
+    XOP = 1U << 9
+
+  };
+
+  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+  // specified machine instruction.
+  //
+  inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+    return TSFlags >> X86II::OpcodeShift;
+  }
+
+  inline bool hasImm(uint64_t TSFlags) {
+    return (TSFlags & X86II::ImmMask) != 0;
+  }
+
+  /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
+  /// of the specified instruction.
+  inline unsigned getSizeOfImm(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate size");
+    case X86II::Imm8:
+    case X86II::Imm8PCRel:  return 1;
+    case X86II::Imm16:
+    case X86II::Imm16PCRel: return 2;
+    case X86II::Imm32:
+    case X86II::Imm32PCRel: return 4;
+    case X86II::Imm64:      return 8;
+    }
+  }
+
+  /// isImmPCRel - Return true if the immediate of the specified instruction's
+  /// TSFlags indicates that it is pc relative.
+  inline unsigned isImmPCRel(uint64_t TSFlags) {
+    switch (TSFlags & X86II::ImmMask) {
+    default: llvm_unreachable("Unknown immediate size");
+    case X86II::Imm8PCRel:
+    case X86II::Imm16PCRel:
+    case X86II::Imm32PCRel:
+      return true;
+    case X86II::Imm8:
+    case X86II::Imm16:
+    case X86II::Imm32:
+    case X86II::Imm64:
+      return false;
+    }
+  }
+
+  /// getMemoryOperandNo - The function returns the MCInst operand # for the
+  /// first field of the memory operand.  If the instruction doesn't have a
+  /// memory operand, this returns -1.
+  ///
+  /// Note that this ignores tied operands.  If there is a tied register which
+  /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+  /// counted as one operand.
+  ///
+  inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+    switch (TSFlags & X86II::FormMask) {
+    case X86II::MRMInitReg:
+        // FIXME: Remove this form.
+        return -1;
+    default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
+    case X86II::Pseudo:
+    case X86II::RawFrm:
+    case X86II::AddRegFrm:
+    case X86II::MRMDestReg:
+    case X86II::MRMSrcReg:
+    case X86II::RawFrmImm8:
+    case X86II::RawFrmImm16:
+       return -1;
+    case X86II::MRMDestMem:
+      return 0;
+    case X86II::MRMSrcMem: {
+      bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+      bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+      unsigned FirstMemOp = 1;
+      if (HasVEX_4V)
+        ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
+      if (HasMemOp4)
+        ++FirstMemOp;// Skip the register source (which is encoded in I8IMM).
+
+      // FIXME: Maybe lea should have its own form?  This is a horrible hack.
+      //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+      //    Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+      return FirstMemOp;
+    }
+    case X86II::MRM0r: case X86II::MRM1r:
+    case X86II::MRM2r: case X86II::MRM3r:
+    case X86II::MRM4r: case X86II::MRM5r:
+    case X86II::MRM6r: case X86II::MRM7r:
+      return -1;
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m: {
+      bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+      unsigned FirstMemOp = 0;
+      if (HasVEX_4V)
+        ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
+      return FirstMemOp;
+    }
+    case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3:
+    case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9:
+    case X86II::MRM_E8: case X86II::MRM_F0: case X86II::MRM_F8:
+    case X86II::MRM_F9: case X86II::MRM_D0: case X86II::MRM_D1:
+    case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6:
+    case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
+    case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
+    case X86II::MRM_DE: case X86II::MRM_DF:
+      return -1;
+    }
+  }
+
+  /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
+  /// higher) register?  e.g. r8, xmm8, xmm13, etc.
+  inline bool isX86_64ExtendedReg(unsigned RegNo) {
+    switch (RegNo) {
+    default: break;
+    case X86::R8:    case X86::R9:    case X86::R10:   case X86::R11:
+    case X86::R12:   case X86::R13:   case X86::R14:   case X86::R15:
+    case X86::R8D:   case X86::R9D:   case X86::R10D:  case X86::R11D:
+    case X86::R12D:  case X86::R13D:  case X86::R14D:  case X86::R15D:
+    case X86::R8W:   case X86::R9W:   case X86::R10W:  case X86::R11W:
+    case X86::R12W:  case X86::R13W:  case X86::R14W:  case X86::R15W:
+    case X86::R8B:   case X86::R9B:   case X86::R10B:  case X86::R11B:
+    case X86::R12B:  case X86::R13B:  case X86::R14B:  case X86::R15B:
+    case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
+    case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+    case X86::YMM8:  case X86::YMM9:  case X86::YMM10: case X86::YMM11:
+    case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
+    case X86::CR8:   case X86::CR9:   case X86::CR10:  case X86::CR11:
+    case X86::CR12:  case X86::CR13:  case X86::CR14:  case X86::CR15:
+        return true;
+    }
+    return false;
+  }
+  
+  inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+    return (reg == X86::SPL || reg == X86::BPL ||
+            reg == X86::SIL || reg == X86::DIL);
+  }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
new file mode 100644
index 000000000000..de80dd835e99
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -0,0 +1,227 @@
+//===-- X86ELFObjectWriter.cpp - X86 ELF Writer ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+  class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+  public:
+    X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+
+    virtual ~X86ELFObjectWriter();
+  protected:
+    virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+                                  bool IsPCRel, bool IsRelocWithSymbol,
+                                  int64_t Addend) const;
+  };
+}
+
+X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
+                                       uint16_t EMachine)
+  : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
+                            // Only i386 uses Rel instead of RelA.
+                            /*HasRelocationAddend*/ EMachine != ELF::EM_386) {}
+
+X86ELFObjectWriter::~X86ELFObjectWriter()
+{}
+
+unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
+                                          const MCFixup &Fixup,
+                                          bool IsPCRel,
+                                          bool IsRelocWithSymbol,
+                                          int64_t Addend) const {
+  // determine the type of the relocation
+
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+  unsigned Type;
+  if (getEMachine() == ELF::EM_X86_64) {
+    if (IsPCRel) {
+      switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+
+      case FK_Data_8: Type = ELF::R_X86_64_PC64; break;
+      case FK_Data_4: Type = ELF::R_X86_64_PC32; break;
+      case FK_Data_2: Type = ELF::R_X86_64_PC16; break;
+
+      case FK_PCRel_8:
+        assert(Modifier == MCSymbolRefExpr::VK_None);
+        Type = ELF::R_X86_64_PC64;
+        break;
+      case X86::reloc_signed_4byte:
+      case X86::reloc_riprel_4byte_movq_load:
+      case X86::reloc_riprel_4byte:
+      case FK_PCRel_4:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_X86_64_PC32;
+          break;
+        case MCSymbolRefExpr::VK_PLT:
+          Type = ELF::R_X86_64_PLT32;
+          break;
+        case MCSymbolRefExpr::VK_GOTPCREL:
+          Type = ELF::R_X86_64_GOTPCREL;
+          break;
+        case MCSymbolRefExpr::VK_GOTTPOFF:
+          Type = ELF::R_X86_64_GOTTPOFF;
+        break;
+        case MCSymbolRefExpr::VK_TLSGD:
+          Type = ELF::R_X86_64_TLSGD;
+          break;
+        case MCSymbolRefExpr::VK_TLSLD:
+          Type = ELF::R_X86_64_TLSLD;
+          break;
+        }
+        break;
+      case FK_PCRel_2:
+        assert(Modifier == MCSymbolRefExpr::VK_None);
+        Type = ELF::R_X86_64_PC16;
+        break;
+      case FK_PCRel_1:
+        assert(Modifier == MCSymbolRefExpr::VK_None);
+        Type = ELF::R_X86_64_PC8;
+        break;
+      }
+    } else {
+      switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+      case FK_Data_8: Type = ELF::R_X86_64_64; break;
+      case X86::reloc_signed_4byte:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_X86_64_32S;
+          break;
+        case MCSymbolRefExpr::VK_GOT:
+          Type = ELF::R_X86_64_GOT32;
+          break;
+        case MCSymbolRefExpr::VK_GOTPCREL:
+          Type = ELF::R_X86_64_GOTPCREL;
+          break;
+        case MCSymbolRefExpr::VK_TPOFF:
+          Type = ELF::R_X86_64_TPOFF32;
+          break;
+        case MCSymbolRefExpr::VK_DTPOFF:
+          Type = ELF::R_X86_64_DTPOFF32;
+          break;
+        }
+        break;
+      case FK_Data_4:
+        Type = ELF::R_X86_64_32;
+        break;
+      case FK_Data_2: Type = ELF::R_X86_64_16; break;
+      case FK_PCRel_1:
+      case FK_Data_1: Type = ELF::R_X86_64_8; break;
+      }
+    }
+  } else if (getEMachine() == ELF::EM_386) {
+    if (IsPCRel) {
+      switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+
+      case X86::reloc_global_offset_table:
+        Type = ELF::R_386_GOTPC;
+        break;
+
+      case X86::reloc_signed_4byte:
+      case FK_PCRel_4:
+      case FK_Data_4:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_386_PC32;
+          break;
+        case MCSymbolRefExpr::VK_PLT:
+          Type = ELF::R_386_PLT32;
+          break;
+        }
+        break;
+      }
+    } else {
+      switch ((unsigned)Fixup.getKind()) {
+      default: llvm_unreachable("invalid fixup kind!");
+
+      case X86::reloc_global_offset_table:
+        Type = ELF::R_386_GOTPC;
+        break;
+
+      // FIXME: Should we avoid selecting reloc_signed_4byte in 32 bit mode
+      // instead?
+      case X86::reloc_signed_4byte:
+      case FK_PCRel_4:
+      case FK_Data_4:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_386_32;
+          break;
+        case MCSymbolRefExpr::VK_GOT:
+          Type = ELF::R_386_GOT32;
+          break;
+        case MCSymbolRefExpr::VK_GOTOFF:
+          Type = ELF::R_386_GOTOFF;
+          break;
+        case MCSymbolRefExpr::VK_TLSGD:
+          Type = ELF::R_386_TLS_GD;
+          break;
+        case MCSymbolRefExpr::VK_TPOFF:
+          Type = ELF::R_386_TLS_LE_32;
+          break;
+        case MCSymbolRefExpr::VK_INDNTPOFF:
+          Type = ELF::R_386_TLS_IE;
+          break;
+        case MCSymbolRefExpr::VK_NTPOFF:
+          Type = ELF::R_386_TLS_LE;
+          break;
+        case MCSymbolRefExpr::VK_GOTNTPOFF:
+          Type = ELF::R_386_TLS_GOTIE;
+          break;
+        case MCSymbolRefExpr::VK_TLSLDM:
+          Type = ELF::R_386_TLS_LDM;
+          break;
+        case MCSymbolRefExpr::VK_DTPOFF:
+          Type = ELF::R_386_TLS_LDO_32;
+          break;
+        case MCSymbolRefExpr::VK_GOTTPOFF:
+          Type = ELF::R_386_TLS_IE_32;
+          break;
+        }
+        break;
+      case FK_Data_2: Type = ELF::R_386_16; break;
+      case FK_PCRel_1:
+      case FK_Data_1: Type = ELF::R_386_8; break;
+      }
+    }
+  } else
+    llvm_unreachable("Unsupported ELF machine type.");
+
+  return Type;
+}
+
+MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS,
+                                               bool IsELF64,
+                                               uint8_t OSABI,
+                                               uint16_t EMachine) {
+  MCELFObjectTargetWriter *MOTW =
+    new X86ELFObjectWriter(IsELF64, OSABI, EMachine);
+  return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
new file mode 100644
index 000000000000..f2e34cbe0d65
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -0,0 +1,33 @@
+//===-- X86FixupKinds.h - X86 Specific Fixup Entries ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_X86_X86FIXUPKINDS_H
+#define LLVM_X86_X86FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace X86 {
+enum Fixups {
+  reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative
+  reloc_riprel_4byte_movq_load,              // 32-bit rip-relative in movq
+  reloc_signed_4byte,                        // 32-bit signed. Unlike FK_Data_4
+                                             // this will be sign extended at
+                                             // runtime.
+  reloc_global_offset_table,                 // 32-bit, relative to the start
+                                             // of the instruction. Used only
+                                             // for _GLOBAL_OFFSET_TABLE_.
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+}
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
new file mode 100644
index 000000000000..7815ae98c9bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -0,0 +1,158 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ELF.h"
+using namespace llvm;
+
+enum AsmWriterFlavorTy {
+  // Note: This numbering has to match the GCC assembler dialects for inline
+  // asm alternatives to work right.
+  ATT = 0, Intel = 1
+};
+
+static cl::opt<AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
+  cl::desc("Choose style of code to emit from X86 backend:"),
+  cl::values(clEnumValN(ATT,   "att",   "Emit AT&T-style assembly"),
+             clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
+             clEnumValEnd));
+
+static cl::opt<bool>
+MarkedJTDataRegions("mark-data-regions", cl::init(false),
+  cl::desc("Mark code section jump table data regions."),
+  cl::Hidden);
+
+void X86MCAsmInfoDarwin::anchor() { }
+
+X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::x86_64;
+  if (is64Bit)
+    PointerSize = CalleeSaveStackSlotSize = 8;
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  if (!is64Bit)
+    Data64bitsDirective = 0;       // we can't emit a 64-bit unit
+
+  // Use ## as a comment string so that .s files generated by llvm can go
+  // through the GCC preprocessor without causing an error.  This is needed
+  // because "clang foo.s" runs the C preprocessor, which is usually reserved
+  // for .S files on other systems.  Perhaps this is because the file system
+  // wasn't always case preserving or something.
+  CommentString = "##";
+  PCSymbol = ".";
+
+  SupportsDebugInformation = true;
+  DwarfUsesInlineInfoSection = true;
+  UseDataRegionDirectives = MarkedJTDataRegions;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
+  : X86MCAsmInfoDarwin(Triple) {
+}
+
+void X86ELFMCAsmInfo::anchor() { }
+
+X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
+  bool is64Bit = T.getArch() == Triple::x86_64;
+  bool isX32 = T.getEnvironment() == Triple::GNUX32;
+
+  // For ELF, x86-64 pointer size depends on the ABI.
+  // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64
+  // with the x32 ABI, pointer size remains the default 4.
+  PointerSize = (is64Bit && !isX32) ? 8 : 4;
+
+  // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI.
+  CalleeSaveStackSlotSize = is64Bit ? 8 : 4;
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  PrivateGlobalPrefix = ".L";
+  WeakRefDirective = "\t.weak\t";
+  PCSymbol = ".";
+
+  // Set up DWARF directives
+  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+
+  // Debug Information
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
+  // OpenBSD and Bitrig have buggy support for .quad in 32-bit mode, just split
+  // into two .words.
+  if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) &&
+       T.getArch() == Triple::x86)
+    Data64bitsDirective = 0;
+}
+
+const MCExpr *
+X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                                   unsigned Encoding,
+                                                   MCStreamer &Streamer) const {
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Res =
+    MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Context);
+  const MCExpr *Four = MCConstantExpr::Create(4, Context);
+  return MCBinaryExpr::CreateAdd(Res, Four, Context);
+}
+
+const MCSection *X86ELFMCAsmInfo::
+getNonexecutableStackSection(MCContext &Ctx) const {
+  return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS,
+                           0, SectionKind::getMetadata());
+}
+
+void X86MCAsmInfoMicrosoft::anchor() { }
+
+X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
+  if (Triple.getArch() == Triple::x86_64) {
+    GlobalPrefix = "";
+    PrivateGlobalPrefix = ".L";
+  }
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+}
+
+void X86MCAsmInfoGNUCOFF::anchor() { }
+
+X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
+  if (Triple.getArch() == Triple::x86_64) {
+    GlobalPrefix = "";
+    PrivateGlobalPrefix = ".L";
+  }
+
+  AssemblerDialect = AsmWriterFlavor;
+
+  TextAlignFillValue = 0x90;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
new file mode 100644
index 000000000000..b6b70fd3e855
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -0,0 +1,58 @@
+//===-- X86MCAsmInfo.h - X86 asm properties --------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+  class Triple;
+
+  class X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+    virtual void anchor();
+  public:
+    explicit X86MCAsmInfoDarwin(const Triple &Triple);
+  };
+
+  struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin {
+    explicit X86_64MCAsmInfoDarwin(const Triple &Triple);
+    virtual const MCExpr *
+    getExprForPersonalitySymbol(const MCSymbol *Sym,
+                                unsigned Encoding,
+                                MCStreamer &Streamer) const;
+  };
+
+  class X86ELFMCAsmInfo : public MCAsmInfo {
+    virtual void anchor();
+  public:
+    explicit X86ELFMCAsmInfo(const Triple &Triple);
+    virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
+  };
+
+  class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
+    virtual void anchor();
+  public:
+    explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
+  };
+
+  class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
+    virtual void anchor();
+  public:
+    explicit X86MCAsmInfoGNUCOFF(const Triple &Triple);
+  };
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
new file mode 100644
index 000000000000..776cee1e35cc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -0,0 +1,1233 @@
+//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class X86MCCodeEmitter : public MCCodeEmitter {
+  X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+  const MCInstrInfo &MCII;
+  const MCSubtargetInfo &STI;
+  MCContext &Ctx;
+public:
+  X86MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                   MCContext &ctx)
+    : MCII(mcii), STI(sti), Ctx(ctx) {
+  }
+
+  ~X86MCCodeEmitter() {}
+
+  bool is64BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
+  }
+
+  bool is32BitMode() const {
+    // FIXME: Can tablegen auto-generate this?
+    return (STI.getFeatureBits() & X86::Mode64Bit) == 0;
+  }
+
+  unsigned GetX86RegNum(const MCOperand &MO) const {
+    return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7;
+  }
+
+  // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+  // 0-7 and the difference between the 2 groups is given by the REX prefix.
+  // In the VEX prefix, registers are seen sequencially from 0-15 and encoded
+  // in 1's complement form, example:
+  //
+  //  ModRM field => XMM9 => 1
+  //  VEX.VVVV    => XMM9 => ~9
+  //
+  // See table 4-35 of Intel AVX Programming Reference for details.
+  unsigned char getVEXRegisterEncoding(const MCInst &MI,
+                                       unsigned OpNum) const {
+    unsigned SrcReg = MI.getOperand(OpNum).getReg();
+    unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
+    if (X86II::isX86_64ExtendedReg(SrcReg))
+      SrcRegNum |= 8;
+
+    // The registers represented through VEX_VVVV should
+    // be encoded in 1's complement form.
+    return (~SrcRegNum) & 0xf;
+  }
+
+  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+    OS << (char)C;
+    ++CurByte;
+  }
+
+  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, CurByte, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
+                     unsigned ImmSize, MCFixupKind FixupKind,
+                     unsigned &CurByte, raw_ostream &OS,
+                     SmallVectorImpl<MCFixup> &Fixups,
+                     int ImmOffset = 0) const;
+
+  inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                        unsigned RM) {
+    assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+    return RM | (RegOpcode << 3) | (Mod << 6);
+  }
+
+  void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+                        unsigned &CurByte, raw_ostream &OS) const {
+    EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+  }
+
+  void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+                   unsigned &CurByte, raw_ostream &OS) const {
+    // SIB byte is in the same format as the ModRMByte.
+    EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+  }
+
+
+  void EmitMemModRMByte(const MCInst &MI, unsigned Op,
+                        unsigned RegOpcodeField,
+                        uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS,
+                        SmallVectorImpl<MCFixup> &Fixups) const;
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+
+  void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                           const MCInst &MI, const MCInstrDesc &Desc,
+                           raw_ostream &OS) const;
+
+  void EmitSegmentOverridePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                 int MemOperand, const MCInst &MI,
+                                 raw_ostream &OS) const;
+
+  void EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+                        const MCInst &MI, const MCInstrDesc &Desc,
+                        raw_ostream &OS) const;
+};
+
+} // end anonymous namespace
+
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            const MCSubtargetInfo &STI,
+                                            MCContext &Ctx) {
+  return new X86MCCodeEmitter(MCII, STI, Ctx);
+}
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
+/// in an instruction with the specified TSFlags.
+static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
+  unsigned Size = X86II::getSizeOfImm(TSFlags);
+  bool isPCRel = X86II::isImmPCRel(TSFlags);
+
+  return MCFixup::getKindForSize(Size, isPCRel);
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+#endif
+
+/// Is16BitMemOperand - Return true if the specified instruction has
+/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is16BitMemOperand(const MCInst &MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// StartsWithGlobalOffsetTable - Check if this expression starts with
+///  _GLOBAL_OFFSET_TABLE_ and if it is of the form
+///  _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
+/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that
+/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
+/// of a binary expression.
+enum GlobalOffsetTableExprKind {
+  GOT_None,
+  GOT_Normal,
+  GOT_SymDiff
+};
+static GlobalOffsetTableExprKind
+StartsWithGlobalOffsetTable(const MCExpr *Expr) {
+  const MCExpr *RHS = 0;
+  if (Expr->getKind() == MCExpr::Binary) {
+    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
+    Expr = BE->getLHS();
+    RHS = BE->getRHS();
+  }
+
+  if (Expr->getKind() != MCExpr::SymbolRef)
+    return GOT_None;
+
+  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+  const MCSymbol &S = Ref->getSymbol();
+  if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
+    return GOT_None;
+  if (RHS && RHS->getKind() == MCExpr::SymbolRef)
+    return GOT_SymDiff;
+  return GOT_Normal;
+}
+
+void X86MCCodeEmitter::
+EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
+              MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+              SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
+  const MCExpr *Expr = NULL;
+  if (DispOp.isImm()) {
+    // If this is a simple integer displacement that doesn't require a
+    // relocation, emit it now.
+    if (FixupKind != FK_PCRel_1 &&
+        FixupKind != FK_PCRel_2 &&
+        FixupKind != FK_PCRel_4) {
+      EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
+      return;
+    }
+    Expr = MCConstantExpr::Create(DispOp.getImm(), Ctx);
+  } else {
+    Expr = DispOp.getExpr();
+  }
+
+  // If we have an immoffset, add it to the expression.
+  if ((FixupKind == FK_Data_4 ||
+       FixupKind == FK_Data_8 ||
+       FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
+    GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr);
+    if (Kind != GOT_None) {
+      assert(ImmOffset == 0);
+
+      FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+      if (Kind == GOT_Normal)
+        ImmOffset = CurByte;
+    } else if (Expr->getKind() == MCExpr::SymbolRef) {
+      const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+      if (Ref->getKind() == MCSymbolRefExpr::VK_SECREL) {
+        FixupKind = MCFixupKind(FK_SecRel_4);
+      }
+    }
+  }
+
+  // If the fixup is pc-relative, we need to bias the value to be relative to
+  // the start of the field, not the end of the field.
+  if (FixupKind == FK_PCRel_4 ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte) ||
+      FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load))
+    ImmOffset -= 4;
+  if (FixupKind == FK_PCRel_2)
+    ImmOffset -= 2;
+  if (FixupKind == FK_PCRel_1)
+    ImmOffset -= 1;
+
+  if (ImmOffset)
+    Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(ImmOffset, Ctx),
+                                   Ctx);
+
+  // Emit a symbolic constant as a fixup and 4 zeros.
+  Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind, Loc));
+  EmitConstant(0, Size, CurByte, OS);
+}
+
+void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
+                                        unsigned RegOpcodeField,
+                                        uint64_t TSFlags, unsigned &CurByte,
+                                        raw_ostream &OS,
+                                        SmallVectorImpl<MCFixup> &Fixups) const{
+  const MCOperand &Disp     = MI.getOperand(Op+X86::AddrDisp);
+  const MCOperand &Base     = MI.getOperand(Op+X86::AddrBaseReg);
+  const MCOperand &Scale    = MI.getOperand(Op+X86::AddrScaleAmt);
+  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+  unsigned BaseReg = Base.getReg();
+
+  // Handle %rip relative addressing.
+  if (BaseReg == X86::RIP) {    // [disp32+RIP] in X86-64 mode
+    assert(is64BitMode() && "Rip-relative addressing requires 64-bit mode");
+    assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
+    EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+
+    unsigned FixupKind = X86::reloc_riprel_4byte;
+
+    // movq loads are handled with a special relocation form which allows the
+    // linker to eliminate some loads for GOT references which end up in the
+    // same linkage unit.
+    if (MI.getOpcode() == X86::MOV64rm)
+      FixupKind = X86::reloc_riprel_4byte_movq_load;
+
+    // rip-relative addressing is actually relative to the *next* instruction.
+    // Since an immediate can follow the mod/rm byte for an instruction, this
+    // means that we need to bias the immediate field of the instruction with
+    // the size of the immediate field.  If we have this case, add it into the
+    // expression to emit.
+    int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
+
+    EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
+                  CurByte, OS, Fixups, -ImmSize);
+    return;
+  }
+
+  unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
+
+  // Determine whether a SIB byte is needed.
+  // If no BaseReg, issue a RIP relative instruction only if the MCE can
+  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+  // 2-7) and absolute references.
+
+  if (// The SIB byte must be used if there is an index register.
+      IndexReg.getReg() == 0 &&
+      // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+      // encode to an R/M value of 4, which indicates that a SIB byte is
+      // present.
+      BaseRegNo != N86::ESP &&
+      // If there is no base register and we're in 64-bit mode, we need a SIB
+      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+      (!is64BitMode() || BaseReg != 0)) {
+
+    if (BaseReg == 0) {          // [disp32]     in X86-32 mode
+      EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+      EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
+      return;
+    }
+
+    // If the base is not EBP/ESP and there is no displacement, use simple
+    // indirect register encoding, this handles addresses like [EAX].  The
+    // encoding for [EBP] with no displacement means [disp32] so we handle it
+    // by emitting a displacement of 0 below.
+    if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+      EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+      return;
+    }
+
+    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+    if (Disp.isImm() && isDisp8(Disp.getImm())) {
+      EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+      EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+      return;
+    }
+
+    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+    EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+    EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS,
+                  Fixups);
+    return;
+  }
+
+  // We need a SIB byte, so start by outputting the ModR/M byte first
+  assert(IndexReg.getReg() != X86::ESP &&
+         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+  bool ForceDisp32 = false;
+  bool ForceDisp8  = false;
+  if (BaseReg == 0) {
+    // If there is no base register, we emit the special case SIB byte with
+    // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (!Disp.isImm()) {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (Disp.getImm() == 0 &&
+             // Base reg can't be anything that ends up with '5' as the base
+             // reg, it is the magic [*] nomenclature that indicates no base.
+             BaseRegNo != N86::EBP) {
+    // Emit no displacement ModR/M byte
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+  } else if (isDisp8(Disp.getImm())) {
+    // Emit the disp8 encoding.
+    EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+  } else {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+  }
+
+  // Calculate what the SS field value should be...
+  static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+  unsigned SS = SSTable[Scale.getImm()];
+
+  if (BaseReg == 0) {
+    // Handle the SIB byte for the case where there is no base, see Intel
+    // Manual 2A, table 2-7. The displacement has already been output.
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+      IndexRegNo = 4;
+    EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+  } else {
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else
+      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+    EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+  }
+
+  // Do we need to output a displacement?
+  if (ForceDisp8)
+    EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+  else if (ForceDisp32 || Disp.getImm() != 0)
+    EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+                  CurByte, OS, Fixups);
+}
+
+/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix
+/// called VEX.
+void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                           int MemOperand, const MCInst &MI,
+                                           const MCInstrDesc &Desc,
+                                           raw_ostream &OS) const {
+  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+
+  // VEX_R: opcode externsion equivalent to REX.R in
+  // 1's complement (inverted) form
+  //
+  //  1: Same as REX_R=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX_R=1 (64 bit mode only)
+  //
+  unsigned char VEX_R = 0x1;
+
+  // VEX_X: equivalent to REX.X, only used when a
+  // register is used for index in SIB Byte.
+  //
+  //  1: Same as REX.X=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX.X=1 (64-bit mode only)
+  unsigned char VEX_X = 0x1;
+
+  // VEX_B:
+  //
+  //  1: Same as REX_B=0 (ignored in 32-bit mode)
+  //  0: Same as REX_B=1 (64 bit mode only)
+  //
+  unsigned char VEX_B = 0x1;
+
+  // VEX_W: opcode specific (use like REX.W, or used for
+  // opcode extension, or ignored, depending on the opcode byte)
+  unsigned char VEX_W = 0;
+
+  // XOP: Use XOP prefix byte 0x8f instead of VEX.
+  unsigned char XOP = 0;
+
+  // VEX_5M (VEX m-mmmmm field):
+  //
+  //  0b00000: Reserved for future use
+  //  0b00001: implied 0F leading opcode
+  //  0b00010: implied 0F 38 leading opcode bytes
+  //  0b00011: implied 0F 3A leading opcode bytes
+  //  0b00100-0b11111: Reserved for future use
+  //  0b01000: XOP map select - 08h instructions with imm byte
+  //  0b10001: XOP map select - 09h instructions with no imm byte
+  unsigned char VEX_5M = 0x1;
+
+  // VEX_4V (VEX vvvv field): a register specifier
+  // (in 1's complement form) or 1111 if unused.
+  unsigned char VEX_4V = 0xf;
+
+  // VEX_L (Vector Length):
+  //
+  //  0: scalar or 128-bit vector
+  //  1: 256-bit vector
+  //
+  unsigned char VEX_L = 0;
+
+  // VEX_PP: opcode extension providing equivalent
+  // functionality of a SIMD prefix
+  //
+  //  0b00: None
+  //  0b01: 66
+  //  0b10: F3
+  //  0b11: F2
+  //
+  unsigned char VEX_PP = 0;
+
+  // Encode the operand size opcode prefix as needed.
+  if (TSFlags & X86II::OpSize)
+    VEX_PP = 0x01;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+    VEX_W = 1;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
+    XOP = 1;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+    VEX_L = 1;
+
+  switch (TSFlags & X86II::Op0Mask) {
+  default: llvm_unreachable("Invalid prefix!");
+  case X86II::T8:  // 0F 38
+    VEX_5M = 0x2;
+    break;
+  case X86II::TA:  // 0F 3A
+    VEX_5M = 0x3;
+    break;
+  case X86II::T8XS: // F3 0F 38
+    VEX_PP = 0x2;
+    VEX_5M = 0x2;
+    break;
+  case X86II::T8XD: // F2 0F 38
+    VEX_PP = 0x3;
+    VEX_5M = 0x2;
+    break;
+  case X86II::TAXD: // F2 0F 3A
+    VEX_PP = 0x3;
+    VEX_5M = 0x3;
+    break;
+  case X86II::XS:  // F3 0F
+    VEX_PP = 0x2;
+    break;
+  case X86II::XD:  // F2 0F
+    VEX_PP = 0x3;
+    break;
+  case X86II::XOP8:
+    VEX_5M = 0x8;
+    break;
+  case X86II::XOP9:
+    VEX_5M = 0x9;
+    break;
+  case X86II::A6:  // Bypass: Not used by VEX
+  case X86II::A7:  // Bypass: Not used by VEX
+  case X86II::TB:  // Bypass: Not used by VEX
+  case 0:
+    break;  // No prefix!
+  }
+
+
+  // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+    ++CurOp;
+  else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+    assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+    // Special case for GATHER with 2 TIED_TO operands
+    // Skip the first 2 operands: dst, mask_wb
+    CurOp += 2;
+  }
+
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
+  case X86II::MRMDestMem: {
+    // MRMDestMem instructions forms:
+    //  MemAddr, src1(ModR/M)
+    //  MemAddr, src1(VEX_4V), src2(ModR/M)
+    //  MemAddr, src1(ModR/M), imm8
+    //
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+
+    CurOp = X86::AddrNumOperands;
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+    const MCOperand &MO = MI.getOperand(CurOp);
+    if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+      VEX_R = 0x0;
+    break;
+  }
+  case X86II::MRMSrcMem:
+    // MRMSrcMem instructions forms:
+    //  src1(ModR/M), MemAddr
+    //  src1(ModR/M), src2(VEX_4V), MemAddr
+    //  src1(ModR/M), MemAddr, imm8
+    //  src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+    //
+    //  FMA4:
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg()))
+      VEX_R = 0x0;
+
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+
+    if (HasVEX_4VOp3)
+      // Instruction format for 4VOp3:
+      //   src1(ModR/M), MemAddr, src3(VEX_4V)
+      // CurOp points to start of the MemoryOperand,
+      //   it skips TIED_TO operands if exist, then increments past src1.
+      // CurOp + X86::AddrNumOperands will point to src3.
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
+    break;
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    // MRM[0-9]m instructions forms:
+    //  MemAddr
+    //  src1(VEX_4V), MemAddr
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, 0);
+
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+      VEX_B = 0x0;
+    if (X86II::isX86_64ExtendedReg(
+               MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+      VEX_X = 0x0;
+    break;
+  }
+  case X86II::MRMSrcReg:
+    // MRMSrcReg instructions forms:
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M), src1(ModR/M)
+    //  dst(ModR/M), src1(ModR/M), imm8
+    //
+    //  FMA4:
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+    //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_R = 0x0;
+    CurOp++;
+
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+      CurOp++;
+
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_B = 0x0;
+    CurOp++;
+    if (HasVEX_4VOp3)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+    break;
+  case X86II::MRMDestReg:
+    // MRMDestReg instructions forms:
+    //  dst(ModR/M), src(ModR/M)
+    //  dst(ModR/M), src(ModR/M), imm8
+    //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_B = 0x0;
+    CurOp++;
+
+    if (HasVEX_4V)
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_R = 0x0;
+    break;
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r:
+    // MRM0r-MRM7r instructions forms:
+    //  dst(VEX_4V), src(ModR/M), imm8
+    VEX_4V = getVEXRegisterEncoding(MI, 0);
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+      VEX_B = 0x0;
+    break;
+  default: // RawFrm
+    break;
+  }
+
+  // Emit segment override opcode prefix as needed.
+  EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
+
+  // VEX opcode prefix can have 2 or 3 bytes
+  //
+  //  3 bytes:
+  //    +-----+ +--------------+ +-------------------+
+  //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+  //    +-----+ +--------------+ +-------------------+
+  //  2 bytes:
+  //    +-----+ +-------------------+
+  //    | C5h | | R | vvvv | L | pp |
+  //    +-----+ +-------------------+
+  //
+  unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+  if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+    EmitByte(0xC5, CurByte, OS);
+    EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+    return;
+  }
+
+  // 3 byte VEX prefix
+  EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
+  EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+  EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+}
+
+/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+                                   const MCInstrDesc &Desc) {
+  unsigned REX = 0;
+  if (TSFlags & X86II::REX_W)
+    REX |= 1 << 3; // set REX.W
+
+  if (MI.getNumOperands() == 0) return REX;
+
+  unsigned NumOps = MI.getNumOperands();
+  // FIXME: MCInst should explicitize the two-addrness.
+  bool isTwoAddr = NumOps > 1 &&
+                      Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+  // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+  unsigned i = isTwoAddr ? 1 : 0;
+  for (; i != NumOps; ++i) {
+    const MCOperand &MO = MI.getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue;
+    // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+    // that returns non-zero.
+    REX |= 0x40; // REX fixed encoding prefix
+    break;
+  }
+
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
+  case X86II::MRMSrcReg:
+    if (MI.getOperand(0).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 2; // set REX.R
+    i = isTwoAddr ? 2 : 1;
+    for (; i != NumOps; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+        REX |= 1 << 0; // set REX.B
+    }
+    break;
+  case X86II::MRMSrcMem: {
+    if (MI.getOperand(0).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 2; // set REX.R
+    unsigned Bit = 0;
+    i = isTwoAddr ? 2 : 1;
+    for (; i != NumOps; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        if (X86II::isX86_64ExtendedReg(MO.getReg()))
+          REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1)
+        Bit++;
+      }
+    }
+    break;
+  }
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m:
+  case X86II::MRMDestMem: {
+    unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
+    i = isTwoAddr ? 1 : 0;
+    if (NumOps > e && MI.getOperand(e).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
+      REX |= 1 << 2; // set REX.R
+    unsigned Bit = 0;
+    for (; i != e; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        if (X86II::isX86_64ExtendedReg(MO.getReg()))
+          REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1)
+        Bit++;
+      }
+    }
+    break;
+  }
+  default:
+    if (MI.getOperand(0).isReg() &&
+        X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 0; // set REX.B
+    i = isTwoAddr ? 2 : 1;
+    for (unsigned e = NumOps; i != e; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+        REX |= 1 << 2; // set REX.R
+    }
+    break;
+  }
+  return REX;
+}
+
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+void X86MCCodeEmitter::EmitSegmentOverridePrefix(uint64_t TSFlags,
+                                        unsigned &CurByte, int MemOperand,
+                                        const MCInst &MI,
+                                        raw_ostream &OS) const {
+  switch (TSFlags & X86II::SegOvrMask) {
+  default: llvm_unreachable("Invalid segment!");
+  case 0:
+    // No segment override, check for explicit one on memory operand.
+    if (MemOperand != -1) {   // If the instruction has a memory operand.
+      switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+      default: llvm_unreachable("Unknown segment register!");
+      case 0: break;
+      case X86::CS: EmitByte(0x2E, CurByte, OS); break;
+      case X86::SS: EmitByte(0x36, CurByte, OS); break;
+      case X86::DS: EmitByte(0x3E, CurByte, OS); break;
+      case X86::ES: EmitByte(0x26, CurByte, OS); break;
+      case X86::FS: EmitByte(0x64, CurByte, OS); break;
+      case X86::GS: EmitByte(0x65, CurByte, OS); break;
+      }
+    }
+    break;
+  case X86II::FS:
+    EmitByte(0x64, CurByte, OS);
+    break;
+  case X86II::GS:
+    EmitByte(0x65, CurByte, OS);
+    break;
+  }
+}
+
+/// EmitOpcodePrefix - Emit all instruction prefixes prior to the opcode.
+///
+/// MemOperand is the operand # of the start of a memory operand if present.  If
+/// Not present, it is -1.
+void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+                                        int MemOperand, const MCInst &MI,
+                                        const MCInstrDesc &Desc,
+                                        raw_ostream &OS) const {
+
+  // Emit the lock opcode prefix as needed.
+  if (TSFlags & X86II::LOCK)
+    EmitByte(0xF0, CurByte, OS);
+
+  // Emit segment override opcode prefix as needed.
+  EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
+
+  // Emit the repeat opcode prefix as needed.
+  if ((TSFlags & X86II::Op0Mask) == X86II::REP)
+    EmitByte(0xF3, CurByte, OS);
+
+  // Emit the address size opcode prefix as needed.
+  bool need_address_override;
+  if (TSFlags & X86II::AdSize) {
+    need_address_override = true;
+  } else if (MemOperand == -1) {
+    need_address_override = false;
+  } else if (is64BitMode()) {
+    assert(!Is16BitMemOperand(MI, MemOperand));
+    need_address_override = Is32BitMemOperand(MI, MemOperand);
+  } else if (is32BitMode()) {
+    assert(!Is64BitMemOperand(MI, MemOperand));
+    need_address_override = Is16BitMemOperand(MI, MemOperand);
+  } else {
+    need_address_override = false;
+  }
+
+  if (need_address_override)
+    EmitByte(0x67, CurByte, OS);
+
+  // Emit the operand size opcode prefix as needed.
+  if (TSFlags & X86II::OpSize)
+    EmitByte(0x66, CurByte, OS);
+
+  bool Need0FPrefix = false;
+  switch (TSFlags & X86II::Op0Mask) {
+  default: llvm_unreachable("Invalid prefix!");
+  case 0: break;  // No prefix!
+  case X86II::REP: break; // already handled.
+  case X86II::TB:  // Two-byte opcode prefix
+  case X86II::T8:  // 0F 38
+  case X86II::TA:  // 0F 3A
+  case X86II::A6:  // 0F A6
+  case X86II::A7:  // 0F A7
+    Need0FPrefix = true;
+    break;
+  case X86II::T8XS: // F3 0F 38
+    EmitByte(0xF3, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::T8XD: // F2 0F 38
+    EmitByte(0xF2, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::TAXD: // F2 0F 3A
+    EmitByte(0xF2, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::XS:   // F3 0F
+    EmitByte(0xF3, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::XD:   // F2 0F
+    EmitByte(0xF2, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::D8: EmitByte(0xD8, CurByte, OS); break;
+  case X86II::D9: EmitByte(0xD9, CurByte, OS); break;
+  case X86II::DA: EmitByte(0xDA, CurByte, OS); break;
+  case X86II::DB: EmitByte(0xDB, CurByte, OS); break;
+  case X86II::DC: EmitByte(0xDC, CurByte, OS); break;
+  case X86II::DD: EmitByte(0xDD, CurByte, OS); break;
+  case X86II::DE: EmitByte(0xDE, CurByte, OS); break;
+  case X86II::DF: EmitByte(0xDF, CurByte, OS); break;
+  }
+
+  // Handle REX prefix.
+  // FIXME: Can this come before F2 etc to simplify emission?
+  if (is64BitMode()) {
+    if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
+      EmitByte(0x40 | REX, CurByte, OS);
+  }
+
+  // 0x0F escape code must be emitted just before the opcode.
+  if (Need0FPrefix)
+    EmitByte(0x0F, CurByte, OS);
+
+  // FIXME: Pull this up into previous switch if REX can be moved earlier.
+  switch (TSFlags & X86II::Op0Mask) {
+  case X86II::T8XS:  // F3 0F 38
+  case X86II::T8XD:  // F2 0F 38
+  case X86II::T8:    // 0F 38
+    EmitByte(0x38, CurByte, OS);
+    break;
+  case X86II::TAXD:  // F2 0F 3A
+  case X86II::TA:    // 0F 3A
+    EmitByte(0x3A, CurByte, OS);
+    break;
+  case X86II::A6:    // 0F A6
+    EmitByte(0xA6, CurByte, OS);
+    break;
+  case X86II::A7:    // 0F A7
+    EmitByte(0xA7, CurByte, OS);
+    break;
+  }
+}
+
+void X86MCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MCII.get(Opcode);
+  uint64_t TSFlags = Desc.TSFlags;
+
+  // Pseudo instructions don't get encoded.
+  if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return;
+
+  // If this is a two-address instruction, skip one of the register operands.
+  // FIXME: This should be handled during MCInst lowering.
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
+    ++CurOp;
+  else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+    assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+    // Special case for GATHER with 2 TIED_TO operands
+    // Skip the first 2 operands: dst, mask_wb
+    CurOp += 2;
+  }
+
+  // Keep track of the current byte being emitted.
+  unsigned CurByte = 0;
+
+  // Is this instruction encoded using the AVX VEX prefix?
+  bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX;
+
+  // It uses the VEX.VVVV field?
+  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+  const unsigned MemOp4_I8IMMOperand = 2;
+
+  // Determine where the memory operand starts, if present.
+  int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+  if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+  if (!HasVEXPrefix)
+    EmitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+  else
+    EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+
+  unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+    BaseOpcode = 0x0F;   // Weird 3DNow! encoding.
+
+  unsigned SrcRegNum = 0;
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::MRMInitReg:
+    llvm_unreachable("FIXME: Remove this form when the JIT moves to MCCodeEmitter!");
+  default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
+    llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
+  case X86II::Pseudo:
+    llvm_unreachable("Pseudo instruction shouldn't be emitted");
+  case X86II::RawFrm:
+    EmitByte(BaseOpcode, CurByte, OS);
+    break;
+  case X86II::RawFrmImm8:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+                  OS, Fixups);
+    break;
+  case X86II::RawFrmImm16:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                  X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
+                  CurByte, OS, Fixups);
+    EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+                  OS, Fixups);
+    break;
+
+  case X86II::AddRegFrm:
+    EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+    break;
+
+  case X86II::MRMDestReg:
+    EmitByte(BaseOpcode, CurByte, OS);
+    SrcRegNum = CurOp + 1;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    EmitRegModRMByte(MI.getOperand(CurOp),
+                     GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+    CurOp = SrcRegNum + 1;
+    break;
+
+  case X86II::MRMDestMem:
+    EmitByte(BaseOpcode, CurByte, OS);
+    SrcRegNum = CurOp + X86::AddrNumOperands;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    EmitMemModRMByte(MI, CurOp,
+                     GetX86RegNum(MI.getOperand(SrcRegNum)),
+                     TSFlags, CurByte, OS, Fixups);
+    CurOp = SrcRegNum + 1;
+    break;
+
+  case X86II::MRMSrcReg:
+    EmitByte(BaseOpcode, CurByte, OS);
+    SrcRegNum = CurOp + 1;
+
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+      ++SrcRegNum;
+
+    EmitRegModRMByte(MI.getOperand(SrcRegNum),
+                     GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+
+    // 2 operands skipped with HasMemOp4, compensate accordingly
+    CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+    if (HasVEX_4VOp3)
+      ++CurOp;
+    break;
+
+  case X86II::MRMSrcMem: {
+    int AddrOperands = X86::AddrNumOperands;
+    unsigned FirstMemOp = CurOp+1;
+    if (HasVEX_4V) {
+      ++AddrOperands;
+      ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
+    }
+    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+      ++FirstMemOp;
+
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+                     TSFlags, CurByte, OS, Fixups);
+    CurOp += AddrOperands + 1;
+    if (HasVEX_4VOp3)
+      ++CurOp;
+    break;
+  }
+
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r:
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitRegModRMByte(MI.getOperand(CurOp++),
+                     (TSFlags & X86II::FormMask)-X86II::MRM0r,
+                     CurByte, OS);
+    break;
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m:
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
+                     TSFlags, CurByte, OS, Fixups);
+    CurOp += X86::AddrNumOperands;
+    break;
+  case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3:
+  case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9:
+  case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4:
+  case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8:
+  case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB:
+  case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE:
+  case X86II::MRM_DF: case X86II::MRM_E8: case X86II::MRM_F0:
+  case X86II::MRM_F8: case X86II::MRM_F9:
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    unsigned char MRM;
+    switch (TSFlags & X86II::FormMask) {
+    default: llvm_unreachable("Invalid Form");
+    case X86II::MRM_C1: MRM = 0xC1; break;
+    case X86II::MRM_C2: MRM = 0xC2; break;
+    case X86II::MRM_C3: MRM = 0xC3; break;
+    case X86II::MRM_C4: MRM = 0xC4; break;
+    case X86II::MRM_C8: MRM = 0xC8; break;
+    case X86II::MRM_C9: MRM = 0xC9; break;
+    case X86II::MRM_D0: MRM = 0xD0; break;
+    case X86II::MRM_D1: MRM = 0xD1; break;
+    case X86II::MRM_D4: MRM = 0xD4; break;
+    case X86II::MRM_D5: MRM = 0xD5; break;
+    case X86II::MRM_D6: MRM = 0xD6; break;
+    case X86II::MRM_D8: MRM = 0xD8; break;
+    case X86II::MRM_D9: MRM = 0xD9; break;
+    case X86II::MRM_DA: MRM = 0xDA; break;
+    case X86II::MRM_DB: MRM = 0xDB; break;
+    case X86II::MRM_DC: MRM = 0xDC; break;
+    case X86II::MRM_DD: MRM = 0xDD; break;
+    case X86II::MRM_DE: MRM = 0xDE; break;
+    case X86II::MRM_DF: MRM = 0xDF; break;
+    case X86II::MRM_E8: MRM = 0xE8; break;
+    case X86II::MRM_F0: MRM = 0xF0; break;
+    case X86II::MRM_F8: MRM = 0xF8; break;
+    case X86II::MRM_F9: MRM = 0xF9; break;
+    }
+    EmitByte(MRM, CurByte, OS);
+    break;
+  }
+
+  // If there is a remaining operand, it must be a trailing immediate.  Emit it
+  // according to the right size for the instruction. Some instructions
+  // (SSE4a extrq and insertq) have two trailing immediates.
+  while (CurOp != NumOps && NumOps - CurOp <= 2) {
+    // The last source register of a 4 operand instruction in AVX is encoded
+    // in bits[7:4] of a immediate byte.
+    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+      const MCOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+                                                    : CurOp);
+      ++CurOp;
+      unsigned RegNum = GetX86RegNum(MO) << 4;
+      if (X86II::isX86_64ExtendedReg(MO.getReg()))
+        RegNum |= 1 << 7;
+      // If there is an additional 5th operand it must be an immediate, which
+      // is encoded in bits[3:0]
+      if (CurOp != NumOps) {
+        const MCOperand &MIMM = MI.getOperand(CurOp++);
+        if (MIMM.isImm()) {
+          unsigned Val = MIMM.getImm();
+          assert(Val < 16 && "Immediate operand value out of range");
+          RegNum |= Val;
+        }
+      }
+      EmitImmediate(MCOperand::CreateImm(RegNum), MI.getLoc(), 1, FK_Data_1,
+                    CurByte, OS, Fixups);
+    } else {
+      unsigned FixupKind;
+      // FIXME: Is there a better way to know that we need a signed relocation?
+      if (MI.getOpcode() == X86::ADD64ri32 ||
+          MI.getOpcode() == X86::MOV64ri32 ||
+          MI.getOpcode() == X86::MOV64mi32 ||
+          MI.getOpcode() == X86::PUSH64i32)
+        FixupKind = X86::reloc_signed_4byte;
+      else
+        FixupKind = getImmFixupKind(TSFlags);
+      EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+                    X86II::getSizeOfImm(TSFlags), MCFixupKind(FixupKind),
+                    CurByte, OS, Fixups);
+    }
+  }
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
+    EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+
+#ifndef NDEBUG
+  // FIXME: Verify.
+  if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+    errs() << "Cannot encode all operands of: ";
+    MI.dump();
+    errs() << '\n';
+    abort();
+  }
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
new file mode 100644
index 000000000000..5e84530cd729
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -0,0 +1,442 @@
+//===-- X86MCTargetDesc.cpp - X86 Target Descriptions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86IntelInstPrinter.h"
+#include "X86MCAsmInfo.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_REGINFO_MC_DESC
+#include "X86GenRegisterInfo.inc"
+
+#define GET_INSTRINFO_MC_DESC
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "X86GenSubtargetInfo.inc"
+
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+
+std::string X86_MC::ParseX86Triple(StringRef TT) {
+  Triple TheTriple(TT);
+  std::string FS;
+  if (TheTriple.getArch() == Triple::x86_64)
+    FS = "+64bit-mode";
+  else
+    FS = "-64bit-mode";
+  return FS;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments.  If we can't run cpuid on the host, return true.
+bool X86_MC::GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+                             unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+  #if defined(__GNUC__)
+    // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
+    asm ("movq\t%%rbx, %%rsi\n\t"
+         "cpuid\n\t"
+         "xchgq\t%%rbx, %%rsi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(_MSC_VER)
+    int registers[4];
+    __cpuid(registers, value);
+    *rEAX = registers[0];
+    *rEBX = registers[1];
+    *rECX = registers[2];
+    *rEDX = registers[3];
+    return false;
+  #else
+    return true;
+  #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+  #if defined(__GNUC__)
+    asm ("movl\t%%ebx, %%esi\n\t"
+         "cpuid\n\t"
+         "xchgl\t%%ebx, %%esi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(_MSC_VER)
+    __asm {
+      mov   eax,value
+      cpuid
+      mov   esi,rEAX
+      mov   dword ptr [esi],eax
+      mov   esi,rEBX
+      mov   dword ptr [esi],ebx
+      mov   esi,rECX
+      mov   dword ptr [esi],ecx
+      mov   esi,rEDX
+      mov   dword ptr [esi],edx
+    }
+    return false;
+  #else
+    return true;
+  #endif
+#else
+  return true;
+#endif
+}
+
+/// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the
+/// 4 values in the specified arguments.  If we can't run cpuid on the host,
+/// return true.
+bool X86_MC::GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
+                               unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+  #if defined(__GNUC__)
+    // gcc desn't know cpuid would clobber ebx/rbx. Preseve it manually.
+    asm ("movq\t%%rbx, %%rsi\n\t"
+         "cpuid\n\t"
+         "xchgq\t%%rbx, %%rsi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value),
+            "c" (subleaf));
+    return false;
+  #elif defined(_MSC_VER)
+    // __cpuidex was added in MSVC++ 9.0 SP1
+    #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
+      int registers[4];
+      __cpuidex(registers, value, subleaf);
+      *rEAX = registers[0];
+      *rEBX = registers[1];
+      *rECX = registers[2];
+      *rEDX = registers[3];
+      return false;
+    #else
+      return true;
+    #endif
+  #else
+    return true;
+  #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+  #if defined(__GNUC__)
+    asm ("movl\t%%ebx, %%esi\n\t"
+         "cpuid\n\t"
+         "xchgl\t%%ebx, %%esi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value),
+            "c" (subleaf));
+    return false;
+  #elif defined(_MSC_VER)
+    __asm {
+      mov   eax,value
+      mov   ecx,subleaf
+      cpuid
+      mov   esi,rEAX
+      mov   dword ptr [esi],eax
+      mov   esi,rEBX
+      mov   dword ptr [esi],ebx
+      mov   esi,rECX
+      mov   dword ptr [esi],ecx
+      mov   esi,rEDX
+      mov   dword ptr [esi],edx
+    }
+    return false;
+  #else
+    return true;
+  #endif
+#else
+  return true;
+#endif
+}
+
+void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family,
+                               unsigned &Model) {
+  Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+  Model  = (EAX >> 4) & 0xf; // Bits 4 - 7
+  if (Family == 6 || Family == 0xf) {
+    if (Family == 0xf)
+      // Examine extended family ID if family ID is F.
+      Family += (EAX >> 20) & 0xff;    // Bits 20 - 27
+    // Examine extended model ID if family ID is 6 or F.
+    Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
+  }
+}
+
+unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) {
+  Triple TheTriple(TT);
+  if (TheTriple.getArch() == Triple::x86_64)
+    return DWARFFlavour::X86_64;
+
+  if (TheTriple.isOSDarwin())
+    return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
+  if (TheTriple.getOS() == Triple::MinGW32 ||
+      TheTriple.getOS() == Triple::Cygwin)
+    // Unsupported by now, just quick fallback
+    return DWARFFlavour::X86_32_Generic;
+  return DWARFFlavour::X86_32_Generic;
+}
+
+void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
+  // FIXME: TableGen these.
+  for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+    unsigned SEH = MRI->getEncodingValue(Reg);
+    MRI->mapLLVMRegToSEHReg(Reg, SEH);
+  }
+}
+
+MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                  StringRef FS) {
+  std::string ArchFS = X86_MC::ParseX86Triple(TT);
+  if (!FS.empty()) {
+    if (!ArchFS.empty())
+      ArchFS = ArchFS + "," + FS.str();
+    else
+      ArchFS = FS;
+  }
+
+  std::string CPUName = CPU;
+  if (CPUName.empty()) {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+    || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+    CPUName = sys::getHostCPUName();
+#else
+    CPUName = "generic";
+#endif
+  }
+
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS);
+  return X;
+}
+
+static MCInstrInfo *createX86MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitX86MCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
+  Triple TheTriple(TT);
+  unsigned RA = (TheTriple.getArch() == Triple::x86_64)
+    ? X86::RIP     // Should have dwarf #16.
+    : X86::EIP;    // Should have dwarf #8.
+
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitX86MCRegisterInfo(X, RA,
+                        X86_MC::getDwarfRegFlavour(TT, false),
+                        X86_MC::getDwarfRegFlavour(TT, true),
+                        RA);
+  X86_MC::InitLLVM2SEHRegisterMapping(X);
+  return X;
+}
+
+static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+  bool is64Bit = TheTriple.getArch() == Triple::x86_64;
+
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
+    if (is64Bit)
+      MAI = new X86_64MCAsmInfoDarwin(TheTriple);
+    else
+      MAI = new X86MCAsmInfoDarwin(TheTriple);
+  } else if (TheTriple.getEnvironment() == Triple::ELF) {
+    // Force the use of an ELF container.
+    MAI = new X86ELFMCAsmInfo(TheTriple);
+  } else if (TheTriple.getOS() == Triple::Win32) {
+    MAI = new X86MCAsmInfoMicrosoft(TheTriple);
+  } else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) {
+    MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
+  } else {
+    // The default is ELF.
+    MAI = new X86ELFMCAsmInfo(TheTriple);
+  }
+
+  // Initialize initial frame state.
+  // Calculate amount of bytes used for return address storing
+  int stackGrowth = is64Bit ? -8 : -4;
+
+  // Initial state of the frame pointer is esp+stackGrowth.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  // Add return address to move list
+  MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+  MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP);
+  MAI->addInitialFrameState(0, CSDst, CSSrc);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+
+  Triple T(TT);
+  bool is64Bit = T.getArch() == Triple::x86_64;
+
+  if (RM == Reloc::Default) {
+    // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
+    // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
+    // use static relocation model by default.
+    if (T.isOSDarwin()) {
+      if (is64Bit)
+        RM = Reloc::PIC_;
+      else
+        RM = Reloc::DynamicNoPIC;
+    } else if (T.isOSWindows() && is64Bit)
+      RM = Reloc::PIC_;
+    else
+      RM = Reloc::Static;
+  }
+
+  // ELF and X86-64 don't have a distinct DynamicNoPIC model.  DynamicNoPIC
+  // is defined as a model for code which may be used in static or dynamic
+  // executables but not necessarily a shared library. On X86-32 we just
+  // compile in -static mode, in x86-64 we use PIC.
+  if (RM == Reloc::DynamicNoPIC) {
+    if (is64Bit)
+      RM = Reloc::PIC_;
+    else if (!T.isOSDarwin())
+      RM = Reloc::Static;
+  }
+
+  // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+  // the Mach-O file format doesn't support it.
+  if (RM == Reloc::Static && T.isOSDarwin() && is64Bit)
+    RM = Reloc::PIC_;
+
+  // For static codegen, if we're not already set, use Small codegen.
+  if (CM == CodeModel::Default)
+    CM = CodeModel::Small;
+  else if (CM == CodeModel::JITDefault)
+    // 64-bit JIT places everything in the same buffer except external funcs.
+    CM = is64Bit ? CodeModel::Large : CodeModel::Small;
+
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &MAB,
+                                    raw_ostream &_OS,
+                                    MCCodeEmitter *_Emitter,
+                                    bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
+
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+    return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
+
+  if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+    return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
+
+  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createX86MCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             const MCInstrInfo &MII,
+                                             const MCRegisterInfo &MRI,
+                                             const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new X86ATTInstPrinter(MAI, MII, MRI);
+  if (SyntaxVariant == 1)
+    return new X86IntelInstPrinter(MAI, MII, MRI);
+  return 0;
+}
+
+static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
+  return new MCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86TargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo);
+  RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo);
+
+  // Register the MC codegen info.
+  RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo);
+  RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target,
+                                          X86_MC::createX86MCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target,
+                                          X86_MC::createX86MCSubtargetInfo);
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target,
+                                          createX86MCInstrAnalysis);
+  TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target,
+                                          createX86MCInstrAnalysis);
+
+  // Register the code emitter.
+  TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target,
+                                        createX86MCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target,
+                                        createX86MCCodeEmitter);
+
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheX86_32Target,
+                                       createX86_32AsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheX86_64Target,
+                                       createX86_64AsmBackend);
+
+  // Register the object streamer.
+  TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target,
+                                           createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target,
+                                           createMCStreamer);
+
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,
+                                        createX86MCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,
+                                        createX86MCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
new file mode 100644
index 000000000000..981aa1a2b911
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -0,0 +1,114 @@
+//===-- X86MCTargetDesc.h - X86 Target Descriptions -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides X86 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MCTARGETDESC_H
+#define X86MCTARGETDESC_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+
+namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
+class MCSubtargetInfo;
+class Target;
+class StringRef;
+class raw_ostream;
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+/// DWARFFlavour - Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+  enum {
+    X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+  };
+} 
+  
+/// N86 namespace - Native X86 register numbers
+///
+namespace N86 {
+  enum {
+    EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+  };
+}
+
+namespace X86_MC {
+  std::string ParseX86Triple(StringRef TT);
+
+  /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+  /// the specified arguments.  If we can't run cpuid on the host, return true.
+  bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+                       unsigned *rEBX, unsigned *rECX, unsigned *rEDX);
+  /// GetCpuIDAndInfoEx - Execute the specified cpuid with subleaf and return
+  /// the 4 values in the specified arguments.  If we can't run cpuid on the
+  /// host, return true.
+  bool GetCpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
+                       unsigned *rEBX, unsigned *rECX, unsigned *rEDX);
+
+  void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model);
+
+  unsigned getDwarfRegFlavour(StringRef TT, bool isEH);
+
+  void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
+
+  /// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance.
+  /// This is exposed so Asm parser, etc. do not need to go through
+  /// TargetRegistry.
+  MCSubtargetInfo *createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
+                                            StringRef FS);
+}
+
+MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                      const MCRegisterInfo &MRI,
+                                      const MCSubtargetInfo &STI,
+                                      MCContext &Ctx);
+
+MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+
+/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
+MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
+                                          bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
+
+/// createX86ELFObjectWriter - Construct an X86 ELF object writer.
+MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS,
+                                         bool IsELF64,
+                                         uint8_t OSABI,
+                                         uint16_t EMachine);
+/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer.
+MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
+} // End llvm namespace
+
+
+// Defines symbolic names for X86 registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "X86GenRegisterInfo.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "X86GenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
new file mode 100644
index 000000000000..64f005c469bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -0,0 +1,586 @@
+//===-- X86MachObjectWriter.cpp - X86 Mach-O Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+namespace {
+class X86MachObjectWriter : public MCMachObjectTargetWriter {
+  bool RecordScatteredRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup,
+                                 MCValue Target,
+                                 unsigned Log2Size,
+                                 uint64_t &FixedValue);
+  void RecordTLVPRelocation(MachObjectWriter *Writer,
+                            const MCAssembler &Asm,
+                            const MCAsmLayout &Layout,
+                            const MCFragment *Fragment,
+                            const MCFixup &Fixup,
+                            MCValue Target,
+                            uint64_t &FixedValue);
+
+  void RecordX86Relocation(MachObjectWriter *Writer,
+                              const MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCFragment *Fragment,
+                              const MCFixup &Fixup,
+                              MCValue Target,
+                              uint64_t &FixedValue);
+  void RecordX86_64Relocation(MachObjectWriter *Writer,
+                              const MCAssembler &Asm,
+                              const MCAsmLayout &Layout,
+                              const MCFragment *Fragment,
+                              const MCFixup &Fixup,
+                              MCValue Target,
+                              uint64_t &FixedValue);
+public:
+  X86MachObjectWriter(bool Is64Bit, uint32_t CPUType,
+                      uint32_t CPUSubtype)
+    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                               /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+
+  void RecordRelocation(MachObjectWriter *Writer,
+                        const MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, uint64_t &FixedValue) {
+    if (Writer->is64Bit())
+      RecordX86_64Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                             FixedValue);
+    else
+      RecordX86Relocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                          FixedValue);
+  }
+};
+}
+
+static bool isFixupKindRIPRel(unsigned Kind) {
+  return Kind == X86::reloc_riprel_4byte ||
+    Kind == X86::reloc_riprel_4byte_movq_load;
+}
+
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("invalid fixup kind!");
+  case FK_PCRel_1:
+  case FK_Data_1: return 0;
+  case FK_PCRel_2:
+  case FK_Data_2: return 1;
+  case FK_PCRel_4:
+    // FIXME: Remove these!!!
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+  case X86::reloc_signed_4byte:
+  case FK_Data_4: return 2;
+  case FK_Data_8: return 3;
+  }
+}
+
+void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
+                                                 const MCAssembler &Asm,
+                                                 const MCAsmLayout &Layout,
+                                                 const MCFragment *Fragment,
+                                                 const MCFixup &Fixup,
+                                                 MCValue Target,
+                                                 uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned IsRIPRel = isFixupKindRIPRel(Fixup.getKind());
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+  // See <reloc.h>.
+  uint32_t FixupOffset =
+    Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  uint32_t FixupAddress =
+    Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+  int64_t Value = 0;
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+
+  Value = Target.getConstant();
+
+  if (IsPCRel) {
+    // Compensate for the relocation offset, Darwin x86_64 relocations only have
+    // the addend and appear to have attempted to define it to be the actual
+    // expression addend without the PCrel bias. However, instructions with data
+    // following the relocation are not accommodated for (see comment below
+    // regarding SIGNED{1,2,4}), so it isn't exactly that either.
+    Value += 1LL << Log2Size;
+  }
+
+  if (Target.isAbsolute()) { // constant
+    // SymbolNum of 0 indicates the absolute section.
+    Type = macho::RIT_X86_64_Unsigned;
+    Index = 0;
+
+    // FIXME: I believe this is broken, I don't think the linker can understand
+    // it. I think it would require a local relocation, but I'm not sure if that
+    // would work either. The official way to get an absolute PCrel relocation
+    // is to use an absolute symbol (which we don't support yet).
+    if (IsPCRel) {
+      IsExtern = 1;
+      Type = macho::RIT_X86_64_Branch;
+    }
+  } else if (Target.getSymB()) { // A - B + constant
+    const MCSymbol *A = &Target.getSymA()->getSymbol();
+    MCSymbolData &A_SD = Asm.getSymbolData(*A);
+    const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
+
+    const MCSymbol *B = &Target.getSymB()->getSymbol();
+    MCSymbolData &B_SD = Asm.getSymbolData(*B);
+    const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
+
+    // Neither symbol can be modified.
+    if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
+        Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
+      report_fatal_error("unsupported relocation of modified symbol");
+
+    // We don't support PCrel relocations of differences. Darwin 'as' doesn't
+    // implement most of these correctly.
+    if (IsPCRel)
+      report_fatal_error("unsupported pc-relative relocation of difference");
+
+    // The support for the situation where one or both of the symbols would
+    // require a local relocation is handled just like if the symbols were
+    // external.  This is certainly used in the case of debug sections where the
+    // section has only temporary symbols and thus the symbols don't have base
+    // symbols.  This is encoded using the section ordinal and non-extern
+    // relocation entries.
+
+    // Darwin 'as' doesn't emit correct relocations for this (it ends up with a
+    // single SIGNED relocation); reject it for now.  Except the case where both
+    // symbols don't have a base, equal but both NULL.
+    if (A_Base == B_Base && A_Base)
+      report_fatal_error("unsupported relocation with identical base");
+
+    Value += Writer->getSymbolAddress(&A_SD, Layout) -
+      (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
+    Value -= Writer->getSymbolAddress(&B_SD, Layout) -
+      (B_Base == NULL ? 0 : Writer->getSymbolAddress(B_Base, Layout));
+
+    if (A_Base) {
+      Index = A_Base->getIndex();
+      IsExtern = 1;
+    }
+    else {
+      Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
+      IsExtern = 0;
+    }
+    Type = macho::RIT_X86_64_Unsigned;
+
+    macho::RelocationEntry MRE;
+    MRE.Word0 = FixupOffset;
+    MRE.Word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
+    Writer->addRelocation(Fragment->getParent(), MRE);
+
+    if (B_Base) {
+      Index = B_Base->getIndex();
+      IsExtern = 1;
+    }
+    else {
+      Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
+      IsExtern = 0;
+    }
+    Type = macho::RIT_X86_64_Subtractor;
+  } else {
+    const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
+    MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+    const MCSymbolData *Base = Asm.getAtom(&SD);
+
+    // Relocations inside debug sections always use local relocations when
+    // possible. This seems to be done because the debugger doesn't fully
+    // understand x86_64 relocation entries, and expects to find values that
+    // have already been fixed up.
+    if (Symbol->isInSection()) {
+      const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
+        Fragment->getParent()->getSection());
+      if (Section.hasAttribute(MCSectionMachO::S_ATTR_DEBUG))
+        Base = 0;
+    }
+
+    // x86_64 almost always uses external relocations, except when there is no
+    // symbol to use as a base address (a local symbol with no preceding
+    // non-local symbol).
+    if (Base) {
+      Index = Base->getIndex();
+      IsExtern = 1;
+
+      // Add the local offset, if needed.
+      if (Base != &SD)
+        Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
+    } else if (Symbol->isInSection() && !Symbol->isVariable()) {
+      // The index is the section ordinal (1-based).
+      Index = SD.getFragment()->getParent()->getOrdinal() + 1;
+      IsExtern = 0;
+      Value += Writer->getSymbolAddress(&SD, Layout);
+
+      if (IsPCRel)
+        Value -= FixupAddress + (1 << Log2Size);
+    } else if (Symbol->isVariable()) {
+      const MCExpr *Value = Symbol->getVariableValue();
+      int64_t Res;
+      bool isAbs = Value->EvaluateAsAbsolute(Res, Layout,
+                                             Writer->getSectionAddressMap());
+      if (isAbs) {
+        FixedValue = Res;
+        return;
+      } else {
+        report_fatal_error("unsupported relocation of variable '" +
+                           Symbol->getName() + "'");
+      }
+    } else {
+      report_fatal_error("unsupported relocation of undefined symbol '" +
+                         Symbol->getName() + "'");
+    }
+
+    MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
+    if (IsPCRel) {
+      if (IsRIPRel) {
+        if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+          // x86_64 distinguishes movq foo@GOTPCREL so that the linker can
+          // rewrite the movq to an leaq at link time if the symbol ends up in
+          // the same linkage unit.
+          if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
+            Type = macho::RIT_X86_64_GOTLoad;
+          else
+            Type = macho::RIT_X86_64_GOT;
+        }  else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+          Type = macho::RIT_X86_64_TLV;
+        }  else if (Modifier != MCSymbolRefExpr::VK_None) {
+          report_fatal_error("unsupported symbol modifier in relocation");
+        } else {
+          Type = macho::RIT_X86_64_Signed;
+
+          // The Darwin x86_64 relocation format has a problem where it cannot
+          // encode an address (L<foo> + <constant>) which is outside the atom
+          // containing L<foo>. Generally, this shouldn't occur but it does
+          // happen when we have a RIPrel instruction with data following the
+          // relocation entry (e.g., movb $012, L0(%rip)). Even with the PCrel
+          // adjustment Darwin x86_64 uses, the offset is still negative and the
+          // linker has no way to recognize this.
+          //
+          // To work around this, Darwin uses several special relocation types
+          // to indicate the offsets. However, the specification or
+          // implementation of these seems to also be incomplete; they should
+          // adjust the addend as well based on the actual encoded instruction
+          // (the additional bias), but instead appear to just look at the final
+          // offset.
+          switch (-(Target.getConstant() + (1LL << Log2Size))) {
+          case 1: Type = macho::RIT_X86_64_Signed1; break;
+          case 2: Type = macho::RIT_X86_64_Signed2; break;
+          case 4: Type = macho::RIT_X86_64_Signed4; break;
+          }
+        }
+      } else {
+        if (Modifier != MCSymbolRefExpr::VK_None)
+          report_fatal_error("unsupported symbol modifier in branch "
+                             "relocation");
+
+        Type = macho::RIT_X86_64_Branch;
+      }
+    } else {
+      if (Modifier == MCSymbolRefExpr::VK_GOT) {
+        Type = macho::RIT_X86_64_GOT;
+      } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
+        // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
+        // case all we do is set the PCrel bit in the relocation entry; this is
+        // used with exception handling, for example. The source is required to
+        // include any necessary offset directly.
+        Type = macho::RIT_X86_64_GOT;
+        IsPCRel = 1;
+      } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
+        report_fatal_error("TLVP symbol modifier should have been rip-rel");
+      } else if (Modifier != MCSymbolRefExpr::VK_None)
+        report_fatal_error("unsupported symbol modifier in relocation");
+      else
+        Type = macho::RIT_X86_64_Unsigned;
+    }
+  }
+
+  // x86_64 always writes custom values into the fixups.
+  FixedValue = Value;
+
+  // struct relocation_info (8 bytes)
+  macho::RelocationEntry MRE;
+  MRE.Word0 = FixupOffset;
+  MRE.Word1 = ((Index     <<  0) |
+               (IsPCRel   << 24) |
+               (Log2Size  << 25) |
+               (IsExtern  << 27) |
+               (Type      << 28));
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
+                                                    const MCAssembler &Asm,
+                                                    const MCAsmLayout &Layout,
+                                                    const MCFragment *Fragment,
+                                                    const MCFixup &Fixup,
+                                                    MCValue Target,
+                                                    unsigned Log2Size,
+                                                    uint64_t &FixedValue) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Type = macho::RIT_Vanilla;
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    report_fatal_error("symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      report_fatal_error("symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // Select the appropriate difference relocation type.
+    //
+    // Note that there is no longer any semantic difference between these two
+    // relocation types from the linkers point of view, this is done solely for
+    // pedantic compatibility with 'as'.
+    Type = A_SD->isExternal() ? (unsigned)macho::RIT_Difference :
+      (unsigned)macho::RIT_Generic_LocalDifference;
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == macho::RIT_Difference ||
+      Type == macho::RIT_Generic_LocalDifference) {
+    // If the offset is too large to fit in a scattered relocation,
+    // we're hosed. It's an unfortunate limitation of the MachO format.
+    if (FixupOffset > 0xffffff) {
+      char Buffer[32];
+      format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                         Twine("Section too large, can't encode "
+                                "r_address (") + Buffer +
+                         ") into 24 bits of scattered "
+                         "relocation entry.");
+      llvm_unreachable("fatal error returned?!");
+    }
+
+    macho::RelocationEntry MRE;
+    MRE.Word0 = ((0         <<  0) |
+                 (macho::RIT_Pair  << 24) |
+                 (Log2Size  << 28) |
+                 (IsPCRel   << 30) |
+                 macho::RF_Scattered);
+    MRE.Word1 = Value2;
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  } else {
+    // If the offset is more than 24-bits, it won't fit in a scattered
+    // relocation offset field, so we fall back to using a non-scattered
+    // relocation. This is a bit risky, as if the offset reaches out of
+    // the block and the linker is doing scattered loading on this
+    // symbol, things can go badly.
+    //
+    // Required for 'as' compatibility.
+    if (FixupOffset > 0xffffff)
+      return false;
+  }
+
+  macho::RelocationEntry MRE;
+  MRE.Word0 = ((FixupOffset <<  0) |
+               (Type        << 24) |
+               (Log2Size    << 28) |
+               (IsPCRel     << 30) |
+               macho::RF_Scattered);
+  MRE.Word1 = Value;
+  Writer->addRelocation(Fragment->getParent(), MRE);
+  return true;
+}
+
+void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
+                                               const MCAssembler &Asm,
+                                               const MCAsmLayout &Layout,
+                                               const MCFragment *Fragment,
+                                               const MCFixup &Fixup,
+                                               MCValue Target,
+                                               uint64_t &FixedValue) {
+  assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
+         !is64Bit() &&
+         "Should only be called with a 32-bit TLVP relocation!");
+
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+  uint32_t Value = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned IsPCRel = 0;
+
+  // Get the symbol data.
+  MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+  unsigned Index = SD_A->getIndex();
+
+  // We're only going to have a second symbol in pic mode and it'll be a
+  // subtraction from the picbase. For 32-bit pic the addend is the difference
+  // between the picbase and the next address.  For 32-bit static the addend is
+  // zero.
+  if (Target.getSymB()) {
+    // If this is a subtraction then we're pcrel.
+    uint32_t FixupAddress =
+      Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
+    MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol());
+    IsPCRel = 1;
+    FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) +
+                  Target.getConstant());
+    FixedValue += 1ULL << Log2Size;
+  } else {
+    FixedValue = 0;
+  }
+
+  // struct relocation_info (8 bytes)
+  macho::RelocationEntry MRE;
+  MRE.Word0 = Value;
+  MRE.Word1 = ((Index                  <<  0) |
+               (IsPCRel                << 24) |
+               (Log2Size               << 25) |
+               (1                      << 27) | // Extern
+               (macho::RIT_Generic_TLV << 28)); // Type
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
+                                              const MCAssembler &Asm,
+                                              const MCAsmLayout &Layout,
+                                              const MCFragment *Fragment,
+                                              const MCFixup &Fixup,
+                                              MCValue Target,
+                                              uint64_t &FixedValue) {
+  unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
+  unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
+
+  // If this is a 32-bit TLVP reloc it's handled a bit differently.
+  if (Target.getSymA() &&
+      Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP) {
+    RecordTLVPRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                         FixedValue);
+    return;
+  }
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry. Differences always require scattered
+  // relocations.
+  if (Target.getSymB()) {
+    RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                              Target, Log2Size, FixedValue);
+    return;
+  }
+
+  // Get the symbol data, if any.
+  MCSymbolData *SD = 0;
+  if (Target.getSymA())
+    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+  // If this is an internal relocation with an offset, it also needs a scattered
+  // relocation entry.
+  uint32_t Offset = Target.getConstant();
+  if (IsPCRel)
+    Offset += 1 << Log2Size;
+  // Try to record the scattered relocation if needed. Fall back to non
+  // scattered if necessary (see comments in RecordScatteredRelocation()
+  // for details).
+  if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD) &&
+      RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+                                Target, Log2Size, FixedValue))
+    return;
+
+  // See <reloc.h>.
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = 0;
+
+  if (Target.isAbsolute()) { // constant
+    // SymbolNum of 0 indicates the absolute section.
+    //
+    // FIXME: Currently, these are never generated (see code below). I cannot
+    // find a case where they are actually emitted.
+    Type = macho::RIT_Vanilla;
+  } else {
+    // Resolve constant variables.
+    if (SD->getSymbol().isVariable()) {
+      int64_t Res;
+      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+            Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (Writer->doesSymbolRequireExternRelocation(SD)) {
+      IsExtern = 1;
+      Index = SD->getIndex();
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!SD->Symbol->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(SD);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD = Asm.getSectionData(
+        SD->getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&SymSD);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+
+    Type = macho::RIT_Vanilla;
+  }
+
+  // struct relocation_info (8 bytes)
+  macho::RelocationEntry MRE;
+  MRE.Word0 = FixupOffset;
+  MRE.Word1 = ((Index     <<  0) |
+               (IsPCRel   << 24) |
+               (Log2Size  << 25) |
+               (IsExtern  << 27) |
+               (Type      << 28));
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createX86MachObjectWriter(raw_ostream &OS,
+                                                bool Is64Bit,
+                                                uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(new X86MachObjectWriter(Is64Bit,
+                                                        CPUType,
+                                                        CPUSubtype),
+                                OS, /*IsLittleEndian=*/true);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
new file mode 100644
index 000000000000..bc272efcc9ce
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -0,0 +1,65 @@
+//===-- X86WinCOFFObjectWriter.cpp - X86 Win COFF Writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace llvm {
+  class MCObjectWriter;
+}
+
+namespace {
+  class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+    const bool Is64Bit;
+
+  public:
+    X86WinCOFFObjectWriter(bool Is64Bit_);
+    ~X86WinCOFFObjectWriter();
+
+    virtual unsigned getRelocType(unsigned FixupKind) const;
+  };
+}
+
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_)
+  : MCWinCOFFObjectTargetWriter(Is64Bit_ ? COFF::IMAGE_FILE_MACHINE_AMD64 :
+                                COFF::IMAGE_FILE_MACHINE_I386),
+    Is64Bit(Is64Bit_) {}
+
+X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
+
+unsigned X86WinCOFFObjectWriter::getRelocType(unsigned FixupKind) const {
+  switch (FixupKind) {
+  case FK_PCRel_4:
+  case X86::reloc_riprel_4byte:
+  case X86::reloc_riprel_4byte_movq_load:
+    return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32;
+  case FK_Data_4:
+  case X86::reloc_signed_4byte:
+    return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32;
+  case FK_Data_8:
+    if (Is64Bit)
+      return COFF::IMAGE_REL_AMD64_ADDR64;
+    llvm_unreachable("unsupported relocation type");
+  case FK_SecRel_4:
+    return Is64Bit ? COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL;
+  default:
+    llvm_unreachable("unsupported relocation type");
+  }
+}
+
+MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS,
+                                                   bool Is64Bit) {
+  MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit);
+  return createWinCOFFObjectWriter(MOTW, OS);
+}
diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 000000000000..815d23588f11
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheX86_32Target, llvm::TheX86_64Target;
+
+extern "C" void LLVMInitializeX86TargetInfo() { 
+  RegisterTarget<Triple::x86, /*HasJIT=*/true>
+    X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
+
+  RegisterTarget<Triple::x86_64, /*HasJIT=*/true>
+    Y(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
new file mode 100644
index 000000000000..bbd490411f2d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -0,0 +1,217 @@
+//===-- X86ShuffleDecode.cpp - X86 shuffle decode logic -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ShuffleDecode.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  // Defaults the copying the dest value.
+  ShuffleMask.push_back(0);
+  ShuffleMask.push_back(1);
+  ShuffleMask.push_back(2);
+  ShuffleMask.push_back(3);
+
+  // Decode the immediate.
+  unsigned ZMask = Imm & 15;
+  unsigned CountD = (Imm >> 4) & 3;
+  unsigned CountS = (Imm >> 6) & 3;
+
+  // CountS selects which input element to use.
+  unsigned InVal = 4+CountS;
+  // CountD specifies which element of destination to update.
+  ShuffleMask[CountD] = InVal;
+  // ZMask zaps values, potentially overriding the CountD elt.
+  if (ZMask & 1) ShuffleMask[0] = SM_SentinelZero;
+  if (ZMask & 2) ShuffleMask[1] = SM_SentinelZero;
+  if (ZMask & 4) ShuffleMask[2] = SM_SentinelZero;
+  if (ZMask & 8) ShuffleMask[3] = SM_SentinelZero;
+}
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = NElts/2; i != NElts; ++i)
+    ShuffleMask.push_back(NElts+i);
+
+  for (unsigned i = NElts/2; i != NElts; ++i)
+    ShuffleMask.push_back(i);
+}
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = 0; i != NElts/2; ++i)
+    ShuffleMask.push_back(i);
+
+  for (unsigned i = 0; i != NElts/2; ++i)
+    ShuffleMask.push_back(NElts+i);
+}
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      unsigned Base = i + Offset;
+      // if i+offset is out of this lane then we actually need the other source
+      if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+      ShuffleMask.push_back(Base + l);
+    }
+  }
+}
+
+/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
+/// VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  unsigned NewImm = Imm;
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      ShuffleMask.push_back(NewImm % NumLaneElts + l);
+      NewImm /= NumLaneElts;
+    }
+    if (NumLaneElts == 4) NewImm = Imm; // reload imm
+  }
+}
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    unsigned NewImm = Imm;
+    for (unsigned i = 0, e = 4; i != e; ++i) {
+      ShuffleMask.push_back(l + i);
+    }
+    for (unsigned i = 4, e = 8; i != e; ++i) {
+      ShuffleMask.push_back(l + 4 + (NewImm & 3));
+      NewImm >>= 2;
+    }
+  }
+}
+
+void DecodePSHUFLWMask(MVT VT, unsigned Imm,
+                       SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    unsigned NewImm = Imm;
+    for (unsigned i = 0, e = 4; i != e; ++i) {
+      ShuffleMask.push_back(l + (NewImm & 3));
+      NewImm >>= 2;
+    }
+    for (unsigned i = 4, e = 8; i != e; ++i) {
+      ShuffleMask.push_back(l + i);
+    }
+  }
+}
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  unsigned NewImm = Imm;
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    // each half of a lane comes from different source
+    for (unsigned s = 0; s != NumElts*2; s += NumElts) {
+      for (unsigned i = 0; i != NumLaneElts/2; ++i) {
+        ShuffleMask.push_back(NewImm % NumLaneElts + s + l);
+        NewImm /= NumLaneElts;
+      }
+    }
+    if (NumLaneElts == 4) NewImm = Imm; // reload imm
+  }
+}
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = l + NumLaneElts/2, e = l + NumLaneElts; i != e; ++i) {
+      ShuffleMask.push_back(i);          // Reads from dest/src1
+      ShuffleMask.push_back(i+NumElts);  // Reads from src/src2
+    }
+  }
+}
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits() / 128;
+  if (NumLanes == 0 ) NumLanes = 1;  // Handle MMX
+  unsigned NumLaneElts = NumElts / NumLanes;
+
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = l, e = l + NumLaneElts/2; i != e; ++i) {
+      ShuffleMask.push_back(i);          // Reads from dest/src1
+      ShuffleMask.push_back(i+NumElts);  // Reads from src/src2
+    }
+  }
+}
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+                          SmallVectorImpl<int> &ShuffleMask) {
+  if (Imm & 0x88)
+    return; // Not a shuffle
+
+  unsigned HalfSize = VT.getVectorNumElements()/2;
+
+  for (unsigned l = 0; l != 2; ++l) {
+    unsigned HalfBegin = ((Imm >> (l*4)) & 0x3) * HalfSize;
+    for (unsigned i = HalfBegin, e = HalfBegin+HalfSize; i != e; ++i)
+      ShuffleMask.push_back(i);
+  }
+}
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  for (unsigned i = 0; i != 4; ++i) {
+    ShuffleMask.push_back((Imm >> (2*i)) & 3);
+  }
+}
+
+} // llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
new file mode 100644
index 000000000000..017ab325ec51
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -0,0 +1,71 @@
+//===-- X86ShuffleDecode.h - X86 shuffle decode logic -----------*-C++-*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Define several functions to decode x86 specific shuffle semantics into a
+// generic vector mask.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_SHUFFLE_DECODE_H
+#define X86_SHUFFLE_DECODE_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+//===----------------------------------------------------------------------===//
+//  Vector Mask Decoding
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+enum {
+  SM_SentinelZero = -1
+};
+
+void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+// <3,1> or <6,7,2,3>
+void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+// <0,2> or <0,1,4,5>
+void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
+/// the type of the vector allowing it to handle different datatypes and vector
+/// widths.
+void DecodeSHUFPMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
+/// and punpckh*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
+/// and punpckl*. VT indicates the type of the vector allowing it to handle
+/// different datatypes and vector widths.
+void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
+
+
+void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
+                          SmallVectorImpl<int> &ShuffleMask);
+
+/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
+/// No VT provided since it only works on 256-bit, 4 element vectors.
+void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
+} // llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
new file mode 100644
index 000000000000..1f9919f15955
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -0,0 +1,75 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class FunctionPass;
+class JITCodeEmitter;
+class X86TargetMachine;
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+                               CodeGenOpt::Level OptLevel);
+
+/// createGlobalBaseRegPass - This pass initializes a global base
+/// register for PIC on x86-32.
+FunctionPass* createGlobalBaseRegPass();
+
+/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
+/// to local-dynamic TLS variables so that the TLS base address for the module
+/// is only fetched once per execution path through the function.
+FunctionPass *createCleanupLocalDynamicTLSPass();
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
+/// before each call to avoid transition penalty between functions encoded with
+/// AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+                                          JITCodeEmitter &JCE);
+
+/// createX86EmitCodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+/// \brief Creates an X86-specific Target Transformation Info pass.
+ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);
+
+/// createX86PadShortFunctions - Return a pass that pads short functions
+/// with NOOPs. This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
new file mode 100644
index 000000000000..1dcc344e7f0d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -0,0 +1,343 @@
+//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget state
+//
+
+def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
+                                  "64-bit mode (x86_64)">;
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features
+//===----------------------------------------------------------------------===//
+
+def FeatureCMOV    : SubtargetFeature<"cmov","HasCMov", "true",
+                                      "Enable conditional move instructions">;
+
+def FeaturePOPCNT   : SubtargetFeature<"popcnt", "HasPOPCNT", "true",
+                                       "Support POPCNT instruction">;
+
+
+def FeatureMMX     : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+                                      "Enable MMX instructions">;
+def FeatureSSE1    : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+                                      "Enable SSE instructions",
+                                      // SSE codegen depends on cmovs, and all
+                                      // SSE1+ processors support them.
+                                      [FeatureMMX, FeatureCMOV]>;
+def FeatureSSE2    : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+                                      "Enable SSE2 instructions",
+                                      [FeatureSSE1]>;
+def FeatureSSE3    : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+                                      "Enable SSE3 instructions",
+                                      [FeatureSSE2]>;
+def FeatureSSSE3   : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+                                      "Enable SSSE3 instructions",
+                                      [FeatureSSE3]>;
+def FeatureSSE41   : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+                                      "Enable SSE 4.1 instructions",
+                                      [FeatureSSSE3]>;
+def FeatureSSE42   : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+                                      "Enable SSE 4.2 instructions",
+                                      [FeatureSSE41]>;
+def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+                                      "Enable 3DNow! instructions",
+                                      [FeatureMMX]>;
+def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+                                      "Enable 3DNow! Athlon instructions",
+                                      [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
+                                      "Support 64-bit instructions",
+                                      [FeatureCMOV]>;
+def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true",
+                                      "64-bit with cmpxchg16b",
+                                      [Feature64Bit]>;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+                                       "Bit testing of memory is slow">;
+def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
+                                        "IsUAMemFast", "true",
+                                        "Fast unaligned memory access">;
+def FeatureSSE4A   : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+                                      "Support SSE 4a instructions",
+                                      [FeatureSSE3]>;
+
+def FeatureAVX     : SubtargetFeature<"avx", "X86SSELevel", "AVX",
+                                      "Enable AVX instructions",
+                                      [FeatureSSE42]>;
+def FeatureAVX2    : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
+                                      "Enable AVX2 instructions",
+                                      [FeatureAVX]>;
+def FeaturePCLMUL  : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
+                         "Enable packed carry-less multiplication instructions",
+                               [FeatureSSE2]>;
+def FeatureFMA     : SubtargetFeature<"fma", "HasFMA", "true",
+                                      "Enable three-operand fused multiple-add",
+                                      [FeatureAVX]>;
+def FeatureFMA4    : SubtargetFeature<"fma4", "HasFMA4", "true",
+                                      "Enable four-operand fused multiple-add",
+                                      [FeatureAVX, FeatureSSE4A]>;
+def FeatureXOP     : SubtargetFeature<"xop", "HasXOP", "true",
+                                      "Enable XOP instructions",
+                                      [FeatureFMA4]>;
+def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
+                                          "HasVectorUAMem", "true",
+                 "Allow unaligned memory operands on vector/SIMD instructions">;
+def FeatureAES     : SubtargetFeature<"aes", "HasAES", "true",
+                                      "Enable AES instructions",
+                                      [FeatureSSE2]>;
+def FeatureMOVBE   : SubtargetFeature<"movbe", "HasMOVBE", "true",
+                                      "Support MOVBE instruction">;
+def FeatureRDRAND  : SubtargetFeature<"rdrand", "HasRDRAND", "true",
+                                      "Support RDRAND instruction">;
+def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
+                       "Support 16-bit floating point conversion instructions">;
+def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
+                                       "Support FS/GS Base instructions">;
+def FeatureLZCNT   : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+                                      "Support LZCNT instruction">;
+def FeatureBMI     : SubtargetFeature<"bmi", "HasBMI", "true",
+                                      "Support BMI instructions">;
+def FeatureBMI2    : SubtargetFeature<"bmi2", "HasBMI2", "true",
+                                      "Support BMI2 instructions">;
+def FeatureRTM     : SubtargetFeature<"rtm", "HasRTM", "true",
+                                      "Support RTM instructions">;
+def FeatureHLE     : SubtargetFeature<"hle", "HasHLE", "true",
+                                      "Support HLE">;
+def FeatureADX     : SubtargetFeature<"adx", "HasADX", "true",
+                                      "Support ADX instructions">;
+def FeaturePRFCHW  : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
+                                      "Support PRFCHW instructions">;
+def FeatureRDSEED  : SubtargetFeature<"rdseed", "HasRDSEED", "true",
+                                      "Support RDSEED instruction">;
+def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
+                                     "Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
+                                     "HasSlowDivide", "true",
+                                     "Use small divide for positive values less than 256">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+                                     "PadShortFunctions", "true",
+                                     "Pad short functions">;
+def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
+                                     "CallRegIndirect", "true",
+                                     "Call register indirect">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+include "X86Schedule.td"
+
+def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
+                    "Intel Atom processors">;
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : ProcessorModel<Name, GenericModel, Features>;
+
+def : Proc<"generic",         []>;
+def : Proc<"i386",            []>;
+def : Proc<"i486",            []>;
+def : Proc<"i586",            []>;
+def : Proc<"pentium",         []>;
+def : Proc<"pentium-mmx",     [FeatureMMX]>;
+def : Proc<"i686",            []>;
+def : Proc<"pentiumpro",      [FeatureCMOV]>;
+def : Proc<"pentium2",        [FeatureMMX, FeatureCMOV]>;
+def : Proc<"pentium3",        [FeatureSSE1]>;
+def : Proc<"pentium3m",       [FeatureSSE1, FeatureSlowBTMem]>;
+def : Proc<"pentium-m",       [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"pentium4",        [FeatureSSE2]>;
+def : Proc<"pentium4m",       [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"x86-64",          [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
+                               FeatureFastUAMem]>;
+// Intel Core Duo.
+def : ProcessorModel<"yonah", SandyBridgeModel,
+                     [FeatureSSE3, FeatureSlowBTMem]>;
+
+// NetBurst.
+def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona",   [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+
+// Intel Core 2 Solo/Duo.
+def : ProcessorModel<"core2", SandyBridgeModel,
+                     [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+def : ProcessorModel<"penryn", SandyBridgeModel,
+                     [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>;
+
+// Atom.
+def : ProcessorModel<"atom", AtomModel,
+                     [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
+                      FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
+                      FeatureSlowDivide,
+                      FeatureCallRegIndirect,
+                      FeaturePadShortFunctions]>;
+
+// "Arrandale" along with corei3 and corei5
+def : ProcessorModel<"corei7", SandyBridgeModel,
+                     [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+                      FeatureFastUAMem, FeaturePOPCNT, FeatureAES]>;
+
+def : ProcessorModel<"nehalem", SandyBridgeModel,
+                     [FeatureSSE42,  FeatureCMPXCHG16B, FeatureSlowBTMem,
+                      FeatureFastUAMem, FeaturePOPCNT]>;
+// Westmere is a similar machine to nehalem with some additional features.
+// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
+def : ProcessorModel<"westmere", SandyBridgeModel,
+                     [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
+                      FeatureFastUAMem, FeaturePOPCNT, FeatureAES,
+                      FeaturePCLMUL]>;
+// Sandy Bridge
+// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
+// rather than a superset.
+def : ProcessorModel<"corei7-avx", SandyBridgeModel,
+                     [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
+                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
+// Ivy Bridge
+def : ProcessorModel<"core-avx-i", SandyBridgeModel,
+                     [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
+                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
+                      FeatureF16C, FeatureFSGSBase]>;
+
+// Haswell
+def : ProcessorModel<"core-avx2", HaswellModel,
+                     [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem,
+                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
+                      FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT,
+                      FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
+                      FeatureHLE]>;
+
+def : Proc<"k6",              [FeatureMMX]>;
+def : Proc<"k6-2",            [Feature3DNow]>;
+def : Proc<"k6-3",            [Feature3DNow]>;
+def : Proc<"athlon",          [Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-tbird",    [Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-mp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"k8",              [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"opteron",         [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"athlon64",        [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"athlon-fx",       [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"k8-sse3",         [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem]>;
+def : Proc<"opteron-sse3",    [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem]>;
+def : Proc<"athlon64-sse3",   [FeatureSSE3,   Feature3DNowA, FeatureCMPXCHG16B,
+                               FeatureSlowBTMem]>;
+def : Proc<"amdfam10",        [FeatureSSE4A,
+                               Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureSlowBTMem]>;
+// Bobcat
+def : Proc<"btver1",          [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
+                               FeatureLZCNT, FeaturePOPCNT]>;
+// Bulldozer
+def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+                               FeatureAES, FeaturePCLMUL,
+                               FeatureLZCNT, FeaturePOPCNT]>;
+// Enhanced Bulldozer
+def : Proc<"bdver2",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+                               FeatureAES, FeaturePCLMUL,
+                               FeatureF16C, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureBMI, FeatureFMA]>;
+def : Proc<"geode",           [Feature3DNowA]>;
+
+def : Proc<"winchip-c6",      [FeatureMMX]>;
+def : Proc<"winchip2",        [Feature3DNow]>;
+def : Proc<"c3",              [Feature3DNow]>;
+def : Proc<"c3-2",            [FeatureSSE1]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Parser
+//===----------------------------------------------------------------------===//
+
+def ATTAsmParser : AsmParser {
+  string AsmParserClassName = "AsmParser";
+}
+
+def ATTAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // Discard comments in assembly strings.
+  string CommentDelimiter = "#";
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "%";
+}
+
+def IntelAsmParserVariant : AsmParserVariant {
+  int Variant = 1;
+
+  // Discard comments in assembly strings.
+  string CommentDelimiter = ";";
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "";
+}
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel}
+def ATTAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "ATTInstPrinter";
+  int Variant = 0;
+  bit isMCAsmWriter = 1;
+}
+def IntelAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "IntelInstPrinter";
+  int Variant = 1;
+  bit isMCAsmWriter = 1;
+}
+
+def X86 : Target {
+  // Information about the instructions...
+  let InstructionSet = X86InstrInfo;
+  let AssemblyParsers = [ATTAsmParser];
+  let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant];
+  let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 000000000000..6b228b0b0329
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,755 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to X86 machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "X86.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+/// runOnMachineFunction - Emit the function body.
+///
+bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SetupMachineFunction(MF);
+
+  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) {
+    bool Intrn = MF.getFunction()->hasInternalLinkage();
+    OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
+    OutStreamer.EmitCOFFSymbolStorageClass(Intrn ? COFF::IMAGE_SYM_CLASS_STATIC
+                                              : COFF::IMAGE_SYM_CLASS_EXTERNAL);
+    OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+                                               << COFF::SCT_COMPLEX_TYPE_SHIFT);
+    OutStreamer.EndCOFFSymbolDef();
+  }
+
+  // Have common code print out the function header with linkage info etc.
+  EmitFunctionHeader();
+
+  // Emit the rest of the function body.
+  EmitFunctionBody();
+
+  // We didn't modify anything.
+  return false;
+}
+
+/// printSymbolOperand - Print a raw symbol reference operand.  This handles
+/// jump tables, constant pools, global address and external symbols, all of
+/// which print to a label with various suffixes for relocation types etc.
+void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
+                                       raw_ostream &O) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown symbol type!");
+  case MachineOperand::MO_JumpTableIndex:
+    O << *GetJTISymbol(MO.getIndex());
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << *GetCPISymbol(MO.getIndex());
+    printOffset(MO.getOffset(), O);
+    break;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+
+    MCSymbol *GVSym;
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB)
+      GVSym = GetSymbolWithGlobalValueBase(GV, "$stub");
+    else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+             MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+             MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+      GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+    else
+      GVSym = Mang->getSymbol(GV);
+
+    // Handle dllimport linkage.
+    if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+      GVSym = OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+      MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+      MachineModuleInfoImpl::StubValueTy &StubSym =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+      if (StubSym.getPointer() == 0)
+        StubSym = MachineModuleInfoImpl::
+          StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+    } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
+      MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+      MachineModuleInfoImpl::StubValueTy &StubSym =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym);
+      if (StubSym.getPointer() == 0)
+        StubSym = MachineModuleInfoImpl::
+          StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+    } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+      MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub");
+      MachineModuleInfoImpl::StubValueTy &StubSym =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+      if (StubSym.getPointer() == 0)
+        StubSym = MachineModuleInfoImpl::
+          StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+    }
+
+    // If the name begins with a dollar-sign, enclose it in parens.  We do this
+    // to avoid having it look like an integer immediate to the assembler.
+    if (GVSym->getName()[0] != '$')
+      O << *GVSym;
+    else
+      O << '(' << *GVSym << ')';
+    printOffset(MO.getOffset(), O);
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    const MCSymbol *SymToPrint;
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+      SmallString<128> TempNameStr;
+      TempNameStr += StringRef(MO.getSymbolName());
+      TempNameStr += StringRef("$stub");
+
+      MCSymbol *Sym = GetExternalSymbolSymbol(TempNameStr.str());
+      MachineModuleInfoImpl::StubValueTy &StubSym =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+      if (StubSym.getPointer() == 0) {
+        TempNameStr.erase(TempNameStr.end()-5, TempNameStr.end());
+        StubSym = MachineModuleInfoImpl::
+          StubValueTy(OutContext.GetOrCreateSymbol(TempNameStr.str()),
+                      true);
+      }
+      SymToPrint = StubSym.getPointer();
+    } else {
+      SymToPrint = GetExternalSymbolSymbol(MO.getSymbolName());
+    }
+
+    // If the name begins with a dollar-sign, enclose it in parens.  We do this
+    // to avoid having it look like an integer immediate to the assembler.
+    if (SymToPrint->getName()[0] != '$')
+      O << *SymToPrint;
+    else
+      O << '(' << *SymToPrint << '(';
+    break;
+  }
+  }
+
+  switch (MO.getTargetFlags()) {
+  default:
+    llvm_unreachable("Unknown target flag on GV operand");
+  case X86II::MO_NO_FLAG:    // No flag.
+    break;
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DLLIMPORT:
+  case X86II::MO_DARWIN_STUB:
+    // These affect the name of the symbol, not any suffix.
+    break;
+  case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+    O << " + [.-" << *MF->getPICBaseSymbol() << ']';
+    break;
+  case X86II::MO_PIC_BASE_OFFSET:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+    O << '-' << *MF->getPICBaseSymbol();
+    break;
+  case X86II::MO_TLSGD:     O << "@TLSGD";     break;
+  case X86II::MO_TLSLD:     O << "@TLSLD";     break;
+  case X86II::MO_TLSLDM:    O << "@TLSLDM";    break;
+  case X86II::MO_GOTTPOFF:  O << "@GOTTPOFF";  break;
+  case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+  case X86II::MO_TPOFF:     O << "@TPOFF";     break;
+  case X86II::MO_DTPOFF:    O << "@DTPOFF";    break;
+  case X86II::MO_NTPOFF:    O << "@NTPOFF";    break;
+  case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
+  case X86II::MO_GOTPCREL:  O << "@GOTPCREL";  break;
+  case X86II::MO_GOT:       O << "@GOT";       break;
+  case X86II::MO_GOTOFF:    O << "@GOTOFF";    break;
+  case X86II::MO_PLT:       O << "@PLT";       break;
+  case X86II::MO_TLVP:      O << "@TLVP";      break;
+  case X86II::MO_TLVP_PIC_BASE:
+    O << "@TLVP" << '-' << *MF->getPICBaseSymbol();
+    break;
+  case X86II::MO_SECREL:    O << "@SECREL32";  break;
+  }
+}
+
+/// printPCRelImm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.  These print slightly differently, for
+/// example, a $ is not emitted.
+void X86AsmPrinter::printPCRelImm(const MachineInstr *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  switch (MO.getType()) {
+  default: llvm_unreachable("Unknown pcrel immediate operand");
+  case MachineOperand::MO_Register:
+    // pc-relativeness was handled when computing the value in the reg.
+    printOperand(MI, OpNo, O);
+    return;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+    printSymbolOperand(MO, O);
+    return;
+  }
+}
+
+
+void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                 raw_ostream &O, const char *Modifier,
+                                 unsigned AsmVariant) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type!");
+  case MachineOperand::MO_Register: {
+    // FIXME: Enumerating AsmVariant, so we can remove magic number.
+    if (AsmVariant == 0) O << '%';
+    unsigned Reg = MO.getReg();
+    if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+      MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ?
+        MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+                    ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+      Reg = getX86SubSuperRegister(Reg, VT);
+    }
+    O << X86ATTInstPrinter::getRegisterName(Reg);
+    return;
+  }
+
+  case MachineOperand::MO_Immediate:
+    if (AsmVariant == 0) O << '$';
+    O << MO.getImm();
+    return;
+
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol: {
+    if (AsmVariant == 0) O << '$';
+    printSymbolOperand(MO, O);
+    break;
+  }
+  }
+}
+
+void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
+                                         raw_ostream &O, const char *Modifier) {
+  const MachineOperand &BaseReg  = MI->getOperand(Op);
+  const MachineOperand &IndexReg = MI->getOperand(Op+2);
+  const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+  // If we really don't want to print out (rip), don't.
+  bool HasBaseReg = BaseReg.getReg() != 0;
+  if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+      BaseReg.getReg() == X86::RIP)
+    HasBaseReg = false;
+
+  // HasParenPart - True if we will print out the () part of the mem ref.
+  bool HasParenPart = IndexReg.getReg() || HasBaseReg;
+
+  if (DispSpec.isImm()) {
+    int DispVal = DispSpec.getImm();
+    if (DispVal || !HasParenPart)
+      O << DispVal;
+  } else {
+    assert(DispSpec.isGlobal() || DispSpec.isCPI() ||
+           DispSpec.isJTI() || DispSpec.isSymbol());
+    printSymbolOperand(MI->getOperand(Op+3), O);
+  }
+
+  if (Modifier && strcmp(Modifier, "H") == 0)
+    O << "+8";
+
+  if (HasParenPart) {
+    assert(IndexReg.getReg() != X86::ESP &&
+           "X86 doesn't allow scaling by ESP");
+
+    O << '(';
+    if (HasBaseReg)
+      printOperand(MI, Op, O, Modifier);
+
+    if (IndexReg.getReg()) {
+      O << ',';
+      printOperand(MI, Op+2, O, Modifier);
+      unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+      if (ScaleVal != 1)
+        O << ',' << ScaleVal;
+    }
+    O << ')';
+  }
+}
+
+void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+                                      raw_ostream &O, const char *Modifier) {
+  assert(isMem(MI, Op) && "Invalid memory reference!");
+  const MachineOperand &Segment = MI->getOperand(Op+4);
+  if (Segment.getReg()) {
+    printOperand(MI, Op+4, O, Modifier);
+    O << ':';
+  }
+  printLeaMemReference(MI, Op, O, Modifier);
+}
+
+void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
+                                           raw_ostream &O, const char *Modifier,
+                                           unsigned AsmVariant){
+  const MachineOperand &BaseReg  = MI->getOperand(Op);
+  unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+  const MachineOperand &IndexReg = MI->getOperand(Op+2);
+  const MachineOperand &DispSpec = MI->getOperand(Op+3);
+  const MachineOperand &SegReg   = MI->getOperand(Op+4);
+  
+  // If this has a segment register, print it.
+  if (SegReg.getReg()) {
+    printOperand(MI, Op+4, O, Modifier, AsmVariant);
+    O << ':';
+  }
+  
+  O << '[';
+  
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOperand(MI, Op, O, Modifier, AsmVariant);
+    NeedPlus = true;
+  }
+  
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << '*';
+    printOperand(MI, Op+2, O, Modifier, AsmVariant);
+    NeedPlus = true;
+  }
+
+  if (!DispSpec.isImm()) {
+    if (NeedPlus) O << " + ";
+    printOperand(MI, Op+3, O, Modifier, AsmVariant);
+  } else {
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+      if (NeedPlus) {
+        if (DispVal > 0)
+          O << " + ";
+        else {
+          O << " - ";
+          DispVal = -DispVal;
+        }
+      }
+      O << DispVal;
+    }
+  }
+  O << ']';
+}
+
+bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+                                      raw_ostream &O) {
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default: return true;  // Unknown mode.
+  case 'b': // Print QImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8);
+    break;
+  case 'h': // Print QImode high register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+    break;
+  case 'w': // Print HImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i16);
+    break;
+  case 'k': // Print SImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i32);
+    break;
+  case 'q': // Print DImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i64);
+    break;
+  }
+
+  O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+  return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                    unsigned AsmVariant,
+                                    const char *ExtraCode, raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    const MachineOperand &MO = MI->getOperand(OpNo);
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'a': // This is an address.  Currently only 'i' and 'r' are expected.
+      if (MO.isImm()) {
+        O << MO.getImm();
+        return false;
+      }
+      if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) {
+        printSymbolOperand(MO, O);
+        if (Subtarget->isPICStyleRIPRel())
+          O << "(%rip)";
+        return false;
+      }
+      if (MO.isReg()) {
+        O << '(';
+        printOperand(MI, OpNo, O);
+        O << ')';
+        return false;
+      }
+      return true;
+
+    case 'c': // Don't print "$" before a global var name or constant.
+      if (MO.isImm())
+        O << MO.getImm();
+      else if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol())
+        printSymbolOperand(MO, O);
+      else
+        printOperand(MI, OpNo, O);
+      return false;
+
+    case 'A': // Print '*' before a register (it must be a register)
+      if (MO.isReg()) {
+        O << '*';
+        printOperand(MI, OpNo, O);
+        return false;
+      }
+      return true;
+
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+    case 'q': // Print DImode register
+      if (MO.isReg())
+        return printAsmMRegister(MO, ExtraCode[0], O);
+      printOperand(MI, OpNo, O);
+      return false;
+
+    case 'P': // This is the operand of a call, treat specially.
+      printPCRelImm(MI, OpNo, O);
+      return false;
+
+    case 'n':  // Negate the immediate or print a '-' before the operand.
+      // Note: this is a temporary solution. It should be handled target
+      // independently as part of the 'MC' work.
+      if (MO.isImm()) {
+        O << -MO.getImm();
+        return false;
+      }
+      O << '-';
+    }
+  }
+
+  printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
+  return false;
+}
+
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNo, unsigned AsmVariant,
+                                          const char *ExtraCode,
+                                          raw_ostream &O) {
+  if (AsmVariant) {
+    printIntelMemReference(MI, OpNo, O);
+    return false;
+  }
+
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+    case 'q': // Print SImode register
+      // These only apply to registers, ignore on mem.
+      break;
+    case 'H':
+      printMemReference(MI, OpNo, O, "H");
+      return false;
+    case 'P': // Don't print @PLT, but do print as memory.
+      printMemReference(MI, OpNo, O, "no-rip");
+      return false;
+    }
+  }
+  printMemReference(MI, OpNo, O);
+  return false;
+}
+
+void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (Subtarget->isTargetEnvMacho())
+    OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+}
+
+
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetEnvMacho()) {
+    // All darwin targets use mach-o.
+    MachineModuleInfoMachO &MMIMacho =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    // Output stubs for dynamically-linked functions.
+    MachineModuleInfoMachO::SymbolListTy Stubs;
+
+    Stubs = MMIMacho.GetFnStubList();
+    if (!Stubs.empty()) {
+      const MCSection *TheSection =
+        OutContext.getMachOSection("__IMPORT", "__jump_table",
+                                   MCSectionMachO::S_SYMBOL_STUBS |
+                                   MCSectionMachO::S_ATTR_SELF_MODIFYING_CODE |
+                                   MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                                   5, SectionKind::getMetadata());
+      OutStreamer.SwitchSection(TheSection);
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        // L_foo$stub:
+        OutStreamer.EmitLabel(Stubs[i].first);
+        //   .indirect_symbol _foo
+        OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(),
+                                        MCSA_IndirectSymbol);
+        // hlt; hlt; hlt; hlt; hlt     hlt = 0xf4.
+        const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
+        OutStreamer.EmitBytes(StringRef(HltInsts, 5));
+      }
+
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    // Output stubs for external and common global variables.
+    Stubs = MMIMacho.GetGVStubList();
+    if (!Stubs.empty()) {
+      const MCSection *TheSection =
+        OutContext.getMachOSection("__IMPORT", "__pointers",
+                                   MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+                                   SectionKind::getMetadata());
+      OutStreamer.SwitchSection(TheSection);
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        // L_foo$non_lazy_ptr:
+        OutStreamer.EmitLabel(Stubs[i].first);
+        // .indirect_symbol _foo
+        MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
+        OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),
+                                        MCSA_IndirectSymbol);
+        // .long 0
+        if (MCSym.getInt())
+          // External to current translation unit.
+          OutStreamer.EmitIntValue(0, 4/*size*/);
+        else
+          // Internal to current translation unit.
+          //
+          // When we place the LSDA into the TEXT section, the type info
+          // pointers need to be indirect and pc-rel. We accomplish this by
+          // using NLPs.  However, sometimes the types are local to the file. So
+          // we need to fill in the value for the NLP in those cases.
+          OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
+                                                        OutContext), 4/*size*/);
+      }
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    Stubs = MMIMacho.GetHiddenGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+      EmitAlignment(2);
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        // L_foo$non_lazy_ptr:
+        OutStreamer.EmitLabel(Stubs[i].first);
+        // .long _foo
+        OutStreamer.EmitValue(MCSymbolRefExpr::
+                              Create(Stubs[i].second.getPointer(),
+                                     OutContext), 4/*size*/);
+      }
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+  }
+
+  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing() &&
+      MMI->usesVAFloatArgument()) {
+    StringRef SymbolName = Subtarget->is64Bit() ? "_fltused" : "__fltused";
+    MCSymbol *S = MMI->getContext().GetOrCreateSymbol(SymbolName);
+    OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
+  }
+
+  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) {
+    X86COFFMachineModuleInfo &COFFMMI =
+      MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+
+    // Emit type information for external functions
+    typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator;
+    for (externals_iterator I = COFFMMI.externals_begin(),
+                            E = COFFMMI.externals_end();
+                            I != E; ++I) {
+      OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
+      OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL);
+      OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+                                               << COFF::SCT_COMPLEX_TYPE_SHIFT);
+      OutStreamer.EndCOFFSymbolDef();
+    }
+
+    // Necessary for dllexport support
+    std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals;
+
+    const TargetLoweringObjectFileCOFF &TLOFCOFF =
+      static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+    for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
+      if (I->hasDLLExportLinkage())
+        DLLExportedFns.push_back(Mang->getSymbol(I));
+
+    for (Module::const_global_iterator I = M.global_begin(),
+           E = M.global_end(); I != E; ++I)
+      if (I->hasDLLExportLinkage())
+        DLLExportedGlobals.push_back(Mang->getSymbol(I));
+
+    // Output linker support code for dllexported globals on windows.
+    if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
+      OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection());
+      SmallString<128> name;
+      for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) {
+        if (Subtarget->isTargetWindows())
+          name = " /EXPORT:";
+        else
+          name = " -export:";
+        name += DLLExportedGlobals[i]->getName();
+        if (Subtarget->isTargetWindows())
+          name += ",DATA";
+        else
+        name += ",data";
+        OutStreamer.EmitBytes(name);
+      }
+
+      for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) {
+        if (Subtarget->isTargetWindows())
+          name = " /EXPORT:";
+        else
+          name = " -export:";
+        name += DLLExportedFns[i]->getName();
+        OutStreamer.EmitBytes(name);
+      }
+    }
+  }
+
+  if (Subtarget->isTargetELF()) {
+    const TargetLoweringObjectFileELF &TLOFELF =
+      static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    // Output stubs for external and common global variables.
+    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+      const DataLayout *TD = TM.getDataLayout();
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        OutStreamer.EmitLabel(Stubs[i].first);
+        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+                                    TD->getPointerSize());
+      }
+      Stubs.clear();
+    }
+  }
+}
+
+MachineLocation
+X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+  MachineLocation Location;
+  assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!");
+  // Frame address.  Currently handles register +- offset only.
+
+  if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm())
+    Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm());
+  else {
+    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+  }
+  return Location;
+}
+
+void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                           raw_ostream &O) {
+  // Only the target-dependent form of DBG_VALUE should get here.
+  // Referencing the offset and metadata as NOps-2 and NOps-1 is
+  // probably portable to other targets; frame pointer location is not.
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps==7);
+  O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+  if (V.getContext().isSubprogram())
+    O << DISubprogram(V.getContext()).getDisplayName() << ":";
+  O << V.getName();
+  O << " <- ";
+  // Frame address.  Currently handles register +- offset only.
+  O << '[';
+  if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg())
+    printOperand(MI, 0, O);
+  else
+    O << "undef";
+  O << '+'; printOperand(MI, 3, O);
+  O << ']';
+  O << "+";
+  printOperand(MI, NOps-2, O);
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmPrinter() {
+  RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
+  RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
new file mode 100644
index 000000000000..bc7496bad144
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,79 @@
+//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ASMPRINTER_H
+#define X86ASMPRINTER_H
+
+#include "X86.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MCStreamer;
+
+class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
+  const X86Subtarget *Subtarget;
+ public:
+  explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer) {
+    Subtarget = &TM.getSubtarget<X86Subtarget>();
+  }
+
+  virtual const char *getPassName() const LLVM_OVERRIDE {
+    return "X86 Assembly / Object Emitter";
+  }
+
+  const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+  virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE;
+
+  virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
+
+  virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
+
+  void printSymbolOperand(const MachineOperand &MO, raw_ostream &O);
+
+  // These methods are used by the tablegen'erated instruction printer.
+  void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
+                    const char *Modifier = 0, unsigned AsmVariant = 0);
+  void printPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+
+  bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+  virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode,
+                               raw_ostream &OS) LLVM_OVERRIDE;
+  virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                     unsigned AsmVariant, const char *ExtraCode,
+                                     raw_ostream &OS) LLVM_OVERRIDE;
+
+  void printMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
+                         const char *Modifier=NULL);
+  void printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
+                            const char *Modifier=NULL);
+
+  void printIntelMemReference(const MachineInstr *MI, unsigned Op,
+                              raw_ostream &O, const char *Modifier=NULL,
+                              unsigned AsmVariant = 1);
+
+  virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
+
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+  virtual MachineLocation
+    getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp
new file mode 100644
index 000000000000..6a6125bb6b2c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.cpp
@@ -0,0 +1,19 @@
+//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Impl ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86COFFMachineModuleInfo.h"
+using namespace llvm;
+
+
+X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() {
+}
diff --git a/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h
new file mode 100644
index 000000000000..0dfeb42f1a4d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -0,0 +1,46 @@
+//===-- X86coffmachinemoduleinfo.h - X86 COFF MMI Impl ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_MACHINEMODULEINFO_H
+#define X86COFF_MACHINEMODULEINFO_H
+
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+
+namespace llvm {
+  class X86MachineFunctionInfo;
+  class DataLayout;
+
+/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
+/// for X86 COFF targets.
+class X86COFFMachineModuleInfo : public MachineModuleInfoImpl {
+  DenseSet<MCSymbol const *> Externals;
+public:
+  X86COFFMachineModuleInfo(const MachineModuleInfo &) {}
+  virtual ~X86COFFMachineModuleInfo();
+
+  void addExternalFunction(MCSymbol* Symbol) {
+    Externals.insert(Symbol);
+  }
+
+  typedef DenseSet<MCSymbol const *>::const_iterator externals_iterator;
+  externals_iterator externals_begin() const { return Externals.begin(); }
+  externals_iterator externals_end() const { return Externals.end(); }
+};
+
+
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 000000000000..9eafbd55a5ae
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,537 @@
+//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+  // Scalar values are returned in AX first, then DX.  For i8, the ABI
+  // requires the values to be in AL and AH, however this code uses AL and DL
+  // instead. This is because using AH for the second register conflicts with
+  // the way LLVM does multiple return values -- a return of {i16,i8} would end
+  // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+  // for functions that return two i8 values are currently expected to pack the
+  // values into an i16 (which uses AX, and thus AL:AH).
+  //
+  // For code that doesn't care about the ABI, we allow returning more than two
+  // integer values in registers.
+  CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+  CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>,
+
+  // Vector types are returned in XMM0 and XMM1, when they fit.  XMM2 and XMM3
+  // can only be used by ABI non-compliant code. If the target doesn't have XMM
+  // registers, it won't have vector types.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3
+  // can only be used by ABI non-compliant code. This vector type is only
+  // supported while using the AVX target feature.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // MMX vector types are always returned in MM0. If the target doesn't have
+  // MM0, it doesn't support these vector types.
+  CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
+
+  // Long double types are always returned in ST0 (even with SSE).
+  CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+  // The X86-32 calling convention returns FP values in ST0, unless marked
+  // with "inreg" (used here to distinguish one kind of reg from another,
+  // weirdly; this is really the sse-regparm calling convention) in which
+  // case they use XMM0, otherwise it is the same as the common X86 calling
+  // conv.
+  CCIfInReg<CCIfSubtarget<"hasSSE2()",
+    CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+  CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+  // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+  // SSE2.
+  // This can happen when a float, 2 x float, or 3 x float vector is split by
+  // target lowering, and is returned in 1-3 sse regs.
+  CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+  CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+  // For integers, ECX can be used as an extra return register
+  CCIfType<[i8],  CCAssignToReg<[AL, DL, CL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+  // Otherwise, it is the same as the common X86 calling convention.
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+  // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // 256-bit FP vectors
+  // No more than 4 registers
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+  // i32, i64 in the standard way
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 HiPE return-value convention.
+def RetCC_X86_32_HiPE : CallingConv<[
+  // Promote all types to i32
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Return: HP, P, VAL1, VAL2
+  CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+  // The X86-64 calling convention always returns FP values in XMM0.
+  CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+  CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+
+  // MMX vector types are always returned in XMM0.
+  CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+  // The X86-Win64 calling convention always returns __m64 values in RAX.
+  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+  // Otherwise, everything is the same as 'normal' X86-64 C CC.
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// X86-64 HiPE return-value convention.
+def RetCC_X86_64_HiPE : CallingConv<[
+  // Promote all types to i64
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Return: HP, P, VAL1, VAL2
+  CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
+]>;
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+  // If FastCC, use RetCC_X86_32_Fast.
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+  // If HiPE, use RetCC_X86_32_HiPE.
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
+
+  // Otherwise, use RetCC_X86_32_C.
+  CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+  // HiPE uses RetCC_X86_64_HiPE
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+  // Mingw64 and native Win64 use Win64 CC
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+  // Otherwise, drop to normal X86-64 CC
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+
+  // Check if this is the Intel OpenCL built-ins calling convention
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
+  CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+  CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // The first 6 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+  // The first 8 MMX vector arguments are passed in XMM registers on Darwin.
+  CCIfType<[x86mmx],
+            CCIfSubtarget<"isTargetDarwin()",
+            CCIfSubtarget<"hasSSE2()",
+            CCPromoteToType<v2i64>>>>,
+
+  // The first 8 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCIfSubtarget<"hasSSE1()",
+            CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+
+  // The first 8 256-bit vector arguments are passed in YMM registers, unless
+  // this is a vararg function.
+  // FIXME: This isn't precisely correct; the x86-64 ABI document says that
+  // fixed arguments to vararg functions are supposed to be passed in
+  // registers.  Actually modeling that would be a lot of work, though.
+  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                          CCIfSubtarget<"hasAVX()",
+                          CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
+                                         YMM4, YMM5, YMM6, YMM7]>>>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Long doubles get stack slots whose size and alignment depends on the
+  // subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToStack<32, 32>>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+  // FIXME: Handle byval stuff.
+  // FIXME: Handle varargs.
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // 128 bit vectors are passed by pointer
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+
+  // 256 bit vectors are passed by pointer
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
+
+  // The first 4 MMX vector arguments are passed in GPRs.
+  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+                                          [XMM0, XMM1, XMM2, XMM3]>>,
+  
+  // Do not pass the sret argument in RCX, the Win64 thiscall calling
+  // convention requires "this" to be passed in RCX.                                        
+  CCIfCC<"CallingConv::X86_ThisCall", 
+    CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8  , R9  ],
+                                                     [XMM1, XMM2, XMM3]>>>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8  , R9  ],
+                                          [XMM0, XMM1, XMM2, XMM3]>>,
+
+  // The first 4 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+                                   [RCX , RDX , R8  , R9  ]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Long doubles get stack slots whose size and alignment depends on the
+  // subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 0>>
+]>;
+
+def CC_X86_64_GHC : CallingConv<[
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim
+  CCIfType<[i64],
+            CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>,
+
+  // Pass in STG registers: F1, F2, F3, F4, D1, D2
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCIfSubtarget<"hasSSE1()",
+            CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
+]>;
+
+def CC_X86_64_HiPE : CallingConv<[
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3
+  CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.
+def CC_X86_32_Common : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // The first 3 float or double arguments, if marked 'inreg' and if the call
+  // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+                CCIfSubtarget<"hasSSE2()",
+                CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+  // The first 3 __m64 vector arguments are passed in mmx registers if the
+  // call is not a vararg call.
+  CCIfNotVarArg<CCIfType<[x86mmx],
+                CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+  // Integer/Float values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  
+  // Doubles get 8-byte slots that are 4-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+  // Long doubles get slots whose size depends on the subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+  // The first 4 SSE vector arguments are passed in XMM registers.
+  CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+  // The first 4 AVX 256-bit vector arguments are passed in YMM registers.
+  CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                CCIfSubtarget<"hasAVX()",
+                CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
+
+  // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToStack<32, 32>>,
+
+  // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+  // passed in the parameter area.
+  CCIfType<[x86mmx], CCAssignToStack<8, 4>>]>;
+
+def CC_X86_32_C : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in ECX.
+  CCIfNest<CCAssignToReg<[ECX]>>,
+
+  // The first 3 integer arguments, if marked 'inreg' and if the call is not
+  // a vararg call, are passed in integer registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in EAX.
+  CCIfNest<CCAssignToReg<[EAX]>>,
+
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_ThisCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass sret arguments indirectly through stack.
+  CCIfSRet<CCAssignToStack<4, 4>>,
+
+  // The first integer argument is passed in ECX
+  CCIfType<[i32], CCAssignToReg<[ECX]>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_FastCC : CallingConv<[
+  // Handles byval parameters.  Note that we can't rely on the delegation
+  // to CC_X86_32_Common for this because that happens after code that
+  // puts arguments in registers.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in EAX.
+  CCIfNest<CCAssignToReg<[EAX]>>,
+
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+  // The first 3 float or double arguments, if the call is not a vararg
+  // call and if SSE2 is available, are passed in SSE registers.
+  CCIfNotVarArg<CCIfType<[f32,f64],
+                CCIfSubtarget<"hasSSE2()",
+                CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+  // Doubles get 8-byte slots that are 8-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+def CC_X86_32_GHC : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass in STG registers: Base, Sp, Hp, R1
+  CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>>
+]>;
+
+def CC_X86_32_HiPE : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2
+  CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>,
+
+  // Integer/Float values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>
+]>;
+
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+
+  CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+  CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8,  R9 ]>>>,
+
+  CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>,
+  CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>,
+
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+
+  // The SSE vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // The 256-bit vector arguments are passed in YMM registers.
+  CCIfType<[v8f32, v4f64, v8i32, v4i64],
+           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+  CCIfSubtarget<"is64Bit()",       CCDelegateTo<CC_X86_64_C>>,
+  CCDelegateTo<CC_X86_32_C>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Root Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// This is the root argument convention for the X86-32 backend.
+def CC_X86_32 : CallingConv<[
+  CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
+  CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
+  CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+
+  // Otherwise, drop to normal X86-32 CC
+  CCDelegateTo<CC_X86_32_C>
+]>;
+
+// This is the root argument convention for the X86-64 backend.
+def CC_X86_64 : CallingConv<[
+  CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
+  CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+
+  // Mingw64 and native Win64 use Win64 CC
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+
+  // Otherwise, drop to normal X86-64 CC
+  CCDelegateTo<CC_X86_64_C>
+]>;
+
+// This is the argument convention used for the entire X86 backend.
+def CC_X86 : CallingConv<[
+  CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
+  CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
+  CCDelegateTo<CC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Callee-saved Registers.
+//===----------------------------------------------------------------------===//
+
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>;
+def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>;
+
+def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>;
+def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
+
+def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
+                                     (sequence "XMM%u", 6, 15))>;
+
+def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
+                                           R11, R12, R13, R14, R15, RBP,
+                                           (sequence "XMM%u", 0, 15))>;
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+                                                  R13, R14, R15, 
+                                                  (sequence "YMM%u", 6, 15))>;
+
+//Standard C + XMM 8-15
+def CSR_64_Intel_OCL_BI       : CalleeSavedRegs<(add CSR_64,
+                                                 (sequence "XMM%u", 8, 15))>;
+
+//Standard C + YMM 8-15
+def CSR_64_Intel_OCL_BI_AVX    : CalleeSavedRegs<(add CSR_64,
+                                                  (sequence "YMM%u", 8, 15))>;
diff --git a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 000000000000..2518e02e2a40
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,1499 @@
+//===-- X86CodeEmitter.cpp - Convert X86 code to machine code -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+  template<class CodeEmitter>
+  class Emitter : public MachineFunctionPass {
+    const X86InstrInfo  *II;
+    const DataLayout    *TD;
+    X86TargetMachine    &TM;
+    CodeEmitter         &MCE;
+    MachineModuleInfo   *MMI;
+    intptr_t PICBaseOffset;
+    bool Is64BitMode;
+    bool IsPIC;
+  public:
+    static char ID;
+    explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
+      : MachineFunctionPass(ID), II(0), TD(0), TM(tm),
+      MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+    Emitter(X86TargetMachine &tm, CodeEmitter &mce,
+            const X86InstrInfo &ii, const DataLayout &td, bool is64)
+      : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
+      MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
+      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "X86 Machine Code Emitter";
+    }
+
+    void emitOpcodePrefix(uint64_t TSFlags, int MemOperand,
+                          const MachineInstr &MI,
+                          const MCInstrDesc *Desc) const;
+
+    void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand,
+                             const MachineInstr &MI,
+                             const MCInstrDesc *Desc) const;
+
+    void emitSegmentOverridePrefix(uint64_t TSFlags,
+                                   int MemOperand,
+                                   const MachineInstr &MI) const;
+
+    void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc);
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequired<MachineModuleInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+    void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+                           intptr_t Disp = 0, intptr_t PCAdj = 0,
+                           bool Indirect = false);
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0,
+                              intptr_t PCAdj = 0);
+    void emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+                              intptr_t PCAdj = 0);
+
+    void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+                               intptr_t Adj = 0, bool IsPCRel = true);
+
+    void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+    void emitRegModRMByte(unsigned RegOpcodeField);
+    void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+    void emitConstant(uint64_t Val, unsigned Size);
+
+    void emitMemModRMByte(const MachineInstr &MI,
+                          unsigned Op, unsigned RegOpcodeField,
+                          intptr_t PCAdj = 0);
+
+    unsigned getX86RegNum(unsigned RegNo) const {
+      const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+      return TRI->getEncodingValue(RegNo) & 0x7;
+    }
+
+    unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
+                                         unsigned OpNum) const;
+  };
+
+template<class CodeEmitter>
+  char Emitter<CodeEmitter>::ID = 0;
+} // end anonymous namespace.
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified JITCodeEmitter object.
+FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+  MMI = &getAnalysis<MachineModuleInfo>();
+  MCE.setModuleInfo(MMI);
+
+  II = TM.getInstrInfo();
+  TD = TM.getDataLayout();
+  Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
+  IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+
+  do {
+    DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n");
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+         MBB != E; ++MBB) {
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I) {
+        const MCInstrDesc &Desc = I->getDesc();
+        emitInstruction(*I, &Desc);
+        // MOVPC32r is basically a call plus a pop instruction.
+        if (Desc.getOpcode() == X86::MOVPC32r)
+          emitInstruction(*I, &II->get(X86::POP32r));
+        ++NumEmitted;  // Keep track of the # of mi's emitted
+      }
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+static unsigned determineREX(const MachineInstr &MI) {
+  unsigned REX = 0;
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  // Pseudo instructions do not need REX prefix byte.
+  if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return 0;
+  if (Desc.TSFlags & X86II::REX_W)
+    REX |= 1 << 3;
+
+  unsigned NumOps = Desc.getNumOperands();
+  if (NumOps) {
+    bool isTwoAddr = NumOps > 1 &&
+      Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+    // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+    unsigned i = isTwoAddr ? 1 : 0;
+    for (unsigned e = NumOps; i != e; ++i) {
+      const MachineOperand& MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        unsigned Reg = MO.getReg();
+        if (X86II::isX86_64NonExtLowByteReg(Reg))
+          REX |= 0x40;
+      }
+    }
+
+    switch (Desc.TSFlags & X86II::FormMask) {
+      case X86II::MRMInitReg:
+        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+          REX |= (1 << 0) | (1 << 2);
+        break;
+      case X86II::MRMSrcReg: {
+        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+          REX |= 1 << 2;
+        i = isTwoAddr ? 2 : 1;
+        for (unsigned e = NumOps; i != e; ++i) {
+          const MachineOperand& MO = MI.getOperand(i);
+          if (X86InstrInfo::isX86_64ExtendedReg(MO))
+            REX |= 1 << 0;
+        }
+        break;
+      }
+      case X86II::MRMSrcMem: {
+        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+          REX |= 1 << 2;
+        unsigned Bit = 0;
+        i = isTwoAddr ? 2 : 1;
+        for (; i != NumOps; ++i) {
+          const MachineOperand& MO = MI.getOperand(i);
+          if (MO.isReg()) {
+            if (X86InstrInfo::isX86_64ExtendedReg(MO))
+              REX |= 1 << Bit;
+            Bit++;
+          }
+        }
+        break;
+      }
+      case X86II::MRM0m: case X86II::MRM1m:
+      case X86II::MRM2m: case X86II::MRM3m:
+      case X86II::MRM4m: case X86II::MRM5m:
+      case X86II::MRM6m: case X86II::MRM7m:
+      case X86II::MRMDestMem: {
+        unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
+        i = isTwoAddr ? 1 : 0;
+        if (NumOps > e && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e)))
+          REX |= 1 << 2;
+        unsigned Bit = 0;
+        for (; i != e; ++i) {
+          const MachineOperand& MO = MI.getOperand(i);
+          if (MO.isReg()) {
+            if (X86InstrInfo::isX86_64ExtendedReg(MO))
+              REX |= 1 << Bit;
+            Bit++;
+          }
+        }
+        break;
+      }
+      default: {
+        if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0)))
+          REX |= 1 << 0;
+        i = isTwoAddr ? 2 : 1;
+        for (unsigned e = NumOps; i != e; ++i) {
+          const MachineOperand& MO = MI.getOperand(i);
+          if (X86InstrInfo::isX86_64ExtendedReg(MO))
+            REX |= 1 << 2;
+        }
+        break;
+      }
+    }
+  }
+  return REX;
+}
+
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+  // Remember where this reference was and where it is to so we can
+  // deal with it later.
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             X86::reloc_pcrel_word, MBB));
+  MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream assuming
+/// this is part of a "take the address of a global" instruction.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitGlobalAddress(const GlobalValue *GV,
+                                unsigned Reloc,
+                                intptr_t Disp /* = 0 */,
+                                intptr_t PCAdj /* = 0 */,
+                                bool Indirect /* = false */) {
+  intptr_t RelocCST = Disp;
+  if (Reloc == X86::reloc_picrel_word)
+    RelocCST = PICBaseOffset;
+  else if (Reloc == X86::reloc_pcrel_word)
+    RelocCST = PCAdj;
+  MachineRelocation MR = Indirect
+    ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+                                           const_cast<GlobalValue *>(GV),
+                                           RelocCST, false)
+    : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                               const_cast<GlobalValue *>(GV), RelocCST, false);
+  MCE.addRelocation(MR);
+  // The relocated value will be added to the displacement
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(Disp);
+  else
+    MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
+                                                     unsigned Reloc) {
+  intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0;
+
+  // X86 never needs stubs because instruction selection will always pick
+  // an instruction sequence that is large enough to hold any address
+  // to a symbol.
+  // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall)
+  bool NeedStub = false;
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES, RelocCST,
+                                                 0, NeedStub));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(0);
+  else
+    MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of an constant pool
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+                                   intptr_t Disp /* = 0 */,
+                                   intptr_t PCAdj /* = 0 */) {
+  intptr_t RelocCST = 0;
+  if (Reloc == X86::reloc_picrel_word)
+    RelocCST = PICBaseOffset;
+  else if (Reloc == X86::reloc_pcrel_word)
+    RelocCST = PCAdj;
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, RelocCST));
+  // The relocated value will be added to the displacement
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(Disp);
+  else
+    MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+                                   intptr_t PCAdj /* = 0 */) {
+  intptr_t RelocCST = 0;
+  if (Reloc == X86::reloc_picrel_word)
+    RelocCST = PICBaseOffset;
+  else if (Reloc == X86::reloc_pcrel_word)
+    RelocCST = PCAdj;
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTI, RelocCST));
+  // The relocated value will be added to the displacement
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(0);
+  else
+    MCE.emitWordLE(0);
+}
+
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                      unsigned RM) {
+  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+  return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
+                                            unsigned RegOpcodeFld){
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS,
+                                       unsigned Index,
+                                       unsigned Base) {
+  // SIB byte is in the same format as the ModRMByte...
+  MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
+  // Output the constant in little endian byte order...
+  for (unsigned i = 0; i != Size; ++i) {
+    MCE.emitByte(Val & 255);
+    Val >>= 8;
+  }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit
+/// sign-extended field.
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
+                              const TargetMachine &TM) {
+  // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
+  // mechanism as 32-bit mode.
+  if (TM.getSubtarget<X86Subtarget>().is64Bit() &&
+      !TM.getSubtarget<X86Subtarget>().isTargetDarwin())
+    return false;
+
+  // Return true if this is a reference to a stub containing the address of the
+  // global, not the global itself.
+  return isGlobalStubReference(GVOp.getTargetFlags());
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
+                                                 int DispVal,
+                                                 intptr_t Adj /* = 0 */,
+                                                 bool IsPCRel /* = true */) {
+  // If this is a simple integer displacement that doesn't require a relocation,
+  // emit it now.
+  if (!RelocOp) {
+    emitConstant(DispVal, 4);
+    return;
+  }
+
+  // Otherwise, this is something that requires a relocation.  Emit it as such
+  // now.
+  unsigned RelocType = Is64BitMode ?
+    (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext)
+    : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+  if (RelocOp->isGlobal()) {
+    // In 64-bit static small code model, we could potentially emit absolute.
+    // But it's probably not beneficial. If the MCE supports using RIP directly
+    // do it, otherwise fallback to absolute (this is determined by IsPCRel).
+    //  89 05 00 00 00 00     mov    %eax,0(%rip)  # PC-relative
+    //  89 04 25 00 00 00 00  mov    %eax,0x0      # Absolute
+    bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
+    emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(),
+                      Adj, Indirect);
+  } else if (RelocOp->isSymbol()) {
+    emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType);
+  } else if (RelocOp->isCPI()) {
+    emitConstPoolAddress(RelocOp->getIndex(), RelocType,
+                         RelocOp->getOffset(), Adj);
+  } else {
+    assert(RelocOp->isJTI() && "Unexpected machine operand!");
+    emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj);
+  }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
+                                            unsigned Op,unsigned RegOpcodeField,
+                                            intptr_t PCAdj) {
+  const MachineOperand &Op3 = MI.getOperand(Op+3);
+  int DispVal = 0;
+  const MachineOperand *DispForReloc = 0;
+
+  // Figure out what sort of displacement we have to handle here.
+  if (Op3.isGlobal()) {
+    DispForReloc = &Op3;
+  } else if (Op3.isSymbol()) {
+    DispForReloc = &Op3;
+  } else if (Op3.isCPI()) {
+    if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
+      DispVal += Op3.getOffset();
+    }
+  } else if (Op3.isJTI()) {
+    if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
+    }
+  } else {
+    DispVal = Op3.getImm();
+  }
+
+  const MachineOperand &Base     = MI.getOperand(Op);
+  const MachineOperand &Scale    = MI.getOperand(Op+1);
+  const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+  unsigned BaseReg = Base.getReg();
+
+  // Handle %rip relative addressing.
+  if (BaseReg == X86::RIP ||
+      (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode
+    assert(IndexReg.getReg() == 0 && Is64BitMode &&
+           "Invalid rip-relative address");
+    MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+    emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+    return;
+  }
+
+  // Indicate that the displacement will use an pcrel or absolute reference
+  // by default. MCEs able to resolve addresses on-the-fly use pcrel by default
+  // while others, unless explicit asked to use RIP, use absolute references.
+  bool IsPCRel = MCE.earlyResolveAddresses() ? true : false;
+
+  // Is a SIB byte needed?
+  // If no BaseReg, issue a RIP relative instruction only if the MCE can
+  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+  // 2-7) and absolute references.
+  unsigned BaseRegNo = -1U;
+  if (BaseReg != 0 && BaseReg != X86::RIP)
+    BaseRegNo = getX86RegNum(BaseReg);
+
+  if (// The SIB byte must be used if there is an index register.
+      IndexReg.getReg() == 0 &&
+      // The SIB byte must be used if the base is ESP/RSP/R12, all of which
+      // encode to an R/M value of 4, which indicates that a SIB byte is
+      // present.
+      BaseRegNo != N86::ESP &&
+      // If there is no base register and we're in 64-bit mode, we need a SIB
+      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+      (!Is64BitMode || BaseReg != 0)) {
+    if (BaseReg == 0 ||          // [disp32]     in X86-32 mode
+        BaseReg == X86::RIP) {   // [disp32+RIP] in X86-64 mode
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+      emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+      return;
+    }
+
+    // If the base is not EBP/ESP and there is no displacement, use simple
+    // indirect register encoding, this handles addresses like [EAX].  The
+    // encoding for [EBP] with no displacement means [disp32] so we handle it
+    // by emitting a displacement of 0 below.
+    if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+      return;
+    }
+
+    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+    if (!DispForReloc && isDisp8(DispVal)) {
+      MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+      emitConstant(DispVal, 1);
+      return;
+    }
+
+    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+    MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+    emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+    return;
+  }
+
+  // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first.
+  assert(IndexReg.getReg() != X86::ESP &&
+         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+  bool ForceDisp32 = false;
+  bool ForceDisp8  = false;
+  if (BaseReg == 0) {
+    // If there is no base register, we emit the special case SIB byte with
+    // MOD=0, BASE=4, to JUST get the index, scale, and displacement.
+    MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+    ForceDisp32 = true;
+  } else if (DispForReloc) {
+    // Emit the normal disp32 encoding.
+    MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+    ForceDisp32 = true;
+  } else if (DispVal == 0 && BaseRegNo != N86::EBP) {
+    // Emit no displacement ModR/M byte
+    MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+  } else if (isDisp8(DispVal)) {
+    // Emit the disp8 encoding...
+    MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+  } else {
+    // Emit the normal disp32 encoding...
+    MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+  }
+
+  // Calculate what the SS field value should be...
+  static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+  unsigned SS = SSTable[Scale.getImm()];
+
+  if (BaseReg == 0) {
+    // Handle the SIB byte for the case where there is no base, see Intel
+    // Manual 2A, table 2-7. The displacement has already been output.
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = getX86RegNum(IndexReg.getReg());
+    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+      IndexRegNo = 4;
+    emitSIBByte(SS, IndexRegNo, 5);
+  } else {
+    unsigned BaseRegNo = getX86RegNum(BaseReg);
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = getX86RegNum(IndexReg.getReg());
+    else
+      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+    emitSIBByte(SS, IndexRegNo, BaseRegNo);
+  }
+
+  // Do we need to output a displacement?
+  if (ForceDisp8) {
+    emitConstant(DispVal, 1);
+  } else if (DispVal != 0 || ForceDisp32) {
+    emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+  }
+}
+
+static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II,
+                                   unsigned Opcode) {
+  const MCInstrDesc *Desc = &II->get(Opcode);
+  MI.setDesc(*Desc);
+  return Desc;
+}
+
+/// Is16BitMemOperand - Return true if the specified instruction has
+/// a 16-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) {
+  const MachineOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// Is32BitMemOperand - Return true if the specified instruction has
+/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
+static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) {
+  const MachineOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+
+/// Is64BitMemOperand - Return true if the specified instruction has
+/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+#ifndef NDEBUG
+static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) {
+  const MachineOperand &BaseReg  = MI.getOperand(Op+X86::AddrBaseReg);
+  const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+
+  if ((BaseReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
+      (IndexReg.getReg() != 0 &&
+       X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg())))
+    return true;
+  return false;
+}
+#endif
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
+                                            int MemOperand,
+                                            const MachineInstr &MI,
+                                            const MCInstrDesc *Desc) const {
+  // Emit the lock opcode prefix as needed.
+  if (Desc->TSFlags & X86II::LOCK)
+    MCE.emitByte(0xF0);
+
+  // Emit segment override opcode prefix as needed.
+  emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
+
+  // Emit the repeat opcode prefix as needed.
+  if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP)
+    MCE.emitByte(0xF3);
+
+  // Emit the address size opcode prefix as needed.
+  bool need_address_override;
+  if (TSFlags & X86II::AdSize) {
+    need_address_override = true;
+  } else if (MemOperand == -1) {
+    need_address_override = false;
+  } else if (Is64BitMode) {
+    assert(!Is16BitMemOperand(MI, MemOperand));
+    need_address_override = Is32BitMemOperand(MI, MemOperand);
+  } else {
+    assert(!Is64BitMemOperand(MI, MemOperand));
+    need_address_override = Is16BitMemOperand(MI, MemOperand);
+  }
+
+  if (need_address_override)
+    MCE.emitByte(0x67);
+
+  // Emit the operand size opcode prefix as needed.
+  if (TSFlags & X86II::OpSize)
+    MCE.emitByte(0x66);
+
+  bool Need0FPrefix = false;
+  switch (Desc->TSFlags & X86II::Op0Mask) {
+    case X86II::TB:  // Two-byte opcode prefix
+    case X86II::T8:  // 0F 38
+    case X86II::TA:  // 0F 3A
+    case X86II::A6:  // 0F A6
+    case X86II::A7:  // 0F A7
+      Need0FPrefix = true;
+      break;
+    case X86II::REP: break; // already handled.
+    case X86II::T8XS: // F3 0F 38
+    case X86II::XS:   // F3 0F
+      MCE.emitByte(0xF3);
+      Need0FPrefix = true;
+      break;
+    case X86II::T8XD: // F2 0F 38
+    case X86II::TAXD: // F2 0F 3A
+    case X86II::XD:   // F2 0F
+      MCE.emitByte(0xF2);
+      Need0FPrefix = true;
+      break;
+    case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+    case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+      MCE.emitByte(0xD8+
+                   (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+                    >> X86II::Op0Shift));
+      break; // Two-byte opcode prefix
+    default: llvm_unreachable("Invalid prefix!");
+    case 0: break;  // No prefix!
+  }
+
+  // Handle REX prefix.
+  if (Is64BitMode) {
+    if (unsigned REX = determineREX(MI))
+      MCE.emitByte(0x40 | REX);
+  }
+
+  // 0x0F escape code must be emitted just before the opcode.
+  if (Need0FPrefix)
+    MCE.emitByte(0x0F);
+
+  switch (Desc->TSFlags & X86II::Op0Mask) {
+    case X86II::T8XD:  // F2 0F 38
+    case X86II::T8XS:  // F3 0F 38
+    case X86II::T8:    // 0F 38
+      MCE.emitByte(0x38);
+      break;
+    case X86II::TAXD:  // F2 0F 38
+    case X86II::TA:    // 0F 3A
+      MCE.emitByte(0x3A);
+      break;
+    case X86II::A6:    // 0F A6
+      MCE.emitByte(0xA6);
+      break;
+    case X86II::A7:    // 0F A7
+      MCE.emitByte(0xA7);
+      break;
+  }
+}
+
+// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
+// 0-7 and the difference between the 2 groups is given by the REX prefix.
+// In the VEX prefix, registers are seen sequencially from 0-15 and encoded
+// in 1's complement form, example:
+//
+//  ModRM field => XMM9 => 1
+//  VEX.VVVV    => XMM9 => ~9
+//
+// See table 4-35 of Intel AVX Programming Reference for details.
+template<class CodeEmitter>
+unsigned char
+Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI,
+                                             unsigned OpNum) const {
+  unsigned SrcReg = MI.getOperand(OpNum).getReg();
+  unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg());
+  if (X86II::isX86_64ExtendedReg(SrcReg))
+    SrcRegNum |= 8;
+
+  // The registers represented through VEX_VVVV should
+  // be encoded in 1's complement form.
+  return (~SrcRegNum) & 0xf;
+}
+
+/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags,
+                                                 int MemOperand,
+                                                 const MachineInstr &MI) const {
+  switch (TSFlags & X86II::SegOvrMask) {
+    default: llvm_unreachable("Invalid segment!");
+    case 0:
+      // No segment override, check for explicit one on memory operand.
+      if (MemOperand != -1) {   // If the instruction has a memory operand.
+        switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) {
+          default: llvm_unreachable("Unknown segment register!");
+          case 0: break;
+          case X86::CS: MCE.emitByte(0x2E); break;
+          case X86::SS: MCE.emitByte(0x36); break;
+          case X86::DS: MCE.emitByte(0x3E); break;
+          case X86::ES: MCE.emitByte(0x26); break;
+          case X86::FS: MCE.emitByte(0x64); break;
+          case X86::GS: MCE.emitByte(0x65); break;
+        }
+      }
+      break;
+    case X86II::FS:
+      MCE.emitByte(0x64);
+      break;
+    case X86II::GS:
+      MCE.emitByte(0x65);
+      break;
+  }
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
+                                               int MemOperand,
+                                               const MachineInstr &MI,
+                                               const MCInstrDesc *Desc) const {
+  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+
+  // VEX_R: opcode externsion equivalent to REX.R in
+  // 1's complement (inverted) form
+  //
+  //  1: Same as REX_R=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX_R=1 (64 bit mode only)
+  //
+  unsigned char VEX_R = 0x1;
+
+  // VEX_X: equivalent to REX.X, only used when a
+  // register is used for index in SIB Byte.
+  //
+  //  1: Same as REX.X=0 (must be 1 in 32-bit mode)
+  //  0: Same as REX.X=1 (64-bit mode only)
+  unsigned char VEX_X = 0x1;
+
+  // VEX_B:
+  //
+  //  1: Same as REX_B=0 (ignored in 32-bit mode)
+  //  0: Same as REX_B=1 (64 bit mode only)
+  //
+  unsigned char VEX_B = 0x1;
+
+  // VEX_W: opcode specific (use like REX.W, or used for
+  // opcode extension, or ignored, depending on the opcode byte)
+  unsigned char VEX_W = 0;
+
+  // XOP: Use XOP prefix byte 0x8f instead of VEX.
+  unsigned char XOP = 0;
+
+  // VEX_5M (VEX m-mmmmm field):
+  //
+  //  0b00000: Reserved for future use
+  //  0b00001: implied 0F leading opcode
+  //  0b00010: implied 0F 38 leading opcode bytes
+  //  0b00011: implied 0F 3A leading opcode bytes
+  //  0b00100-0b11111: Reserved for future use
+  //  0b01000: XOP map select - 08h instructions with imm byte
+  //  0b10001: XOP map select - 09h instructions with no imm byte
+  unsigned char VEX_5M = 0x1;
+
+  // VEX_4V (VEX vvvv field): a register specifier
+  // (in 1's complement form) or 1111 if unused.
+  unsigned char VEX_4V = 0xf;
+
+  // VEX_L (Vector Length):
+  //
+  //  0: scalar or 128-bit vector
+  //  1: 256-bit vector
+  //
+  unsigned char VEX_L = 0;
+
+  // VEX_PP: opcode extension providing equivalent
+  // functionality of a SIMD prefix
+  //
+  //  0b00: None
+  //  0b01: 66
+  //  0b10: F3
+  //  0b11: F2
+  //
+  unsigned char VEX_PP = 0;
+
+  // Encode the operand size opcode prefix as needed.
+  if (TSFlags & X86II::OpSize)
+    VEX_PP = 0x01;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W)
+    VEX_W = 1;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
+    XOP = 1;
+
+  if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
+    VEX_L = 1;
+
+  switch (TSFlags & X86II::Op0Mask) {
+    default: llvm_unreachable("Invalid prefix!");
+    case X86II::T8:  // 0F 38
+      VEX_5M = 0x2;
+      break;
+    case X86II::TA:  // 0F 3A
+      VEX_5M = 0x3;
+      break;
+    case X86II::T8XS: // F3 0F 38
+      VEX_PP = 0x2;
+      VEX_5M = 0x2;
+      break;
+    case X86II::T8XD: // F2 0F 38
+      VEX_PP = 0x3;
+      VEX_5M = 0x2;
+      break;
+    case X86II::TAXD: // F2 0F 3A
+      VEX_PP = 0x3;
+      VEX_5M = 0x3;
+      break;
+    case X86II::XS:  // F3 0F
+      VEX_PP = 0x2;
+      break;
+    case X86II::XD:  // F2 0F
+      VEX_PP = 0x3;
+      break;
+    case X86II::XOP8:
+      VEX_5M = 0x8;
+      break;
+    case X86II::XOP9:
+      VEX_5M = 0x9;
+      break;
+    case X86II::A6:  // Bypass: Not used by VEX
+    case X86II::A7:  // Bypass: Not used by VEX
+    case X86II::TB:  // Bypass: Not used by VEX
+    case 0:
+      break;  // No prefix!
+  }
+
+
+  // Classify VEX_B, VEX_4V, VEX_R, VEX_X
+  unsigned NumOps = Desc->getNumOperands();
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
+    ++CurOp;
+  else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+    assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+    // Special case for GATHER with 2 TIED_TO operands
+    // Skip the first 2 operands: dst, mask_wb
+    CurOp += 2;
+  }
+
+  switch (TSFlags & X86II::FormMask) {
+    case X86II::MRMInitReg:
+      // Duplicate register.
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_R = 0x0;
+
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_B = 0x0;
+      if (HasVEX_4VOp3)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      break;
+    case X86II::MRMDestMem: {
+      // MRMDestMem instructions forms:
+      //  MemAddr, src1(ModR/M)
+      //  MemAddr, src1(VEX_4V), src2(ModR/M)
+      //  MemAddr, src1(ModR/M), imm8
+      //
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+        VEX_B = 0x0;
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+        VEX_X = 0x0;
+
+      CurOp = X86::AddrNumOperands;
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+      const MachineOperand &MO = MI.getOperand(CurOp);
+      if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+        VEX_R = 0x0;
+      break;
+    }
+    case X86II::MRMSrcMem:
+      // MRMSrcMem instructions forms:
+      //  src1(ModR/M), MemAddr
+      //  src1(ModR/M), src2(VEX_4V), MemAddr
+      //  src1(ModR/M), MemAddr, imm8
+      //  src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+      //
+      //  FMA4:
+      //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+      //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+        VEX_R = 0x0;
+
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, 1);
+
+      if (X86II::isX86_64ExtendedReg(
+                          MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+        VEX_B = 0x0;
+      if (X86II::isX86_64ExtendedReg(
+                          MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+        VEX_X = 0x0;
+
+      if (HasVEX_4VOp3)
+        VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+      break;
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m: {
+      // MRM[0-9]m instructions forms:
+      //  MemAddr
+      //  src1(VEX_4V), MemAddr
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, 0);
+
+      if (X86II::isX86_64ExtendedReg(
+                          MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
+        VEX_B = 0x0;
+      if (X86II::isX86_64ExtendedReg(
+                          MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
+        VEX_X = 0x0;
+      break;
+    }
+    case X86II::MRMSrcReg:
+      // MRMSrcReg instructions forms:
+      //  dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+      //  dst(ModR/M), src1(ModR/M)
+      //  dst(ModR/M), src1(ModR/M), imm8
+      //
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_R = 0x0;
+      CurOp++;
+
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+      if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+        CurOp++;
+
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_B = 0x0;
+      CurOp++;
+      if (HasVEX_4VOp3)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      break;
+    case X86II::MRMDestReg:
+      // MRMDestReg instructions forms:
+      //  dst(ModR/M), src(ModR/M)
+      //  dst(ModR/M), src(ModR/M), imm8
+      //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_B = 0x0;
+      CurOp++;
+
+      if (HasVEX_4V)
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+        VEX_R = 0x0;
+      break;
+    case X86II::MRM0r: case X86II::MRM1r:
+    case X86II::MRM2r: case X86II::MRM3r:
+    case X86II::MRM4r: case X86II::MRM5r:
+    case X86II::MRM6r: case X86II::MRM7r:
+      // MRM0r-MRM7r instructions forms:
+      //  dst(VEX_4V), src(ModR/M), imm8
+      VEX_4V = getVEXRegisterEncoding(MI, 0);
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+        VEX_B = 0x0;
+      break;
+    default: // RawFrm
+      break;
+  }
+
+  // Emit segment override opcode prefix as needed.
+  emitSegmentOverridePrefix(TSFlags, MemOperand, MI);
+
+  // VEX opcode prefix can have 2 or 3 bytes
+  //
+  //  3 bytes:
+  //    +-----+ +--------------+ +-------------------+
+  //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+  //    +-----+ +--------------+ +-------------------+
+  //  2 bytes:
+  //    +-----+ +-------------------+
+  //    | C5h | | R | vvvv | L | pp |
+  //    +-----+ +-------------------+
+  //
+  unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+
+  if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+    MCE.emitByte(0xC5);
+    MCE.emitByte(LastByte | (VEX_R << 7));
+    return;
+  }
+
+  // 3 byte VEX prefix
+  MCE.emitByte(XOP ? 0x8F : 0xC4);
+  MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M);
+  MCE.emitByte(LastByte | (VEX_W << 7));
+}
+
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
+                                           const MCInstrDesc *Desc) {
+  DEBUG(dbgs() << MI);
+
+  // If this is a pseudo instruction, lower it.
+  switch (Desc->getOpcode()) {
+  case X86::ADD16rr_DB:      Desc = UpdateOp(MI, II, X86::OR16rr); break;
+  case X86::ADD32rr_DB:      Desc = UpdateOp(MI, II, X86::OR32rr); break;
+  case X86::ADD64rr_DB:      Desc = UpdateOp(MI, II, X86::OR64rr); break;
+  case X86::ADD16ri_DB:      Desc = UpdateOp(MI, II, X86::OR16ri); break;
+  case X86::ADD32ri_DB:      Desc = UpdateOp(MI, II, X86::OR32ri); break;
+  case X86::ADD64ri32_DB:    Desc = UpdateOp(MI, II, X86::OR64ri32); break;
+  case X86::ADD16ri8_DB:     Desc = UpdateOp(MI, II, X86::OR16ri8); break;
+  case X86::ADD32ri8_DB:     Desc = UpdateOp(MI, II, X86::OR32ri8); break;
+  case X86::ADD64ri8_DB:     Desc = UpdateOp(MI, II, X86::OR64ri8); break;
+  case X86::ACQUIRE_MOV8rm:  Desc = UpdateOp(MI, II, X86::MOV8rm); break;
+  case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break;
+  case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break;
+  case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break;
+  case X86::RELEASE_MOV8mr:  Desc = UpdateOp(MI, II, X86::MOV8mr); break;
+  case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break;
+  case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break;
+  case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break;
+  }
+
+
+  MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+  unsigned Opcode = Desc->Opcode;
+
+  // If this is a two-address instruction, skip one of the register operands.
+  unsigned NumOps = Desc->getNumOperands();
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0)
+    ++CurOp;
+  else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) {
+    assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+    // Special case for GATHER with 2 TIED_TO operands
+    // Skip the first 2 operands: dst, mask_wb
+    CurOp += 2;
+  }
+
+  uint64_t TSFlags = Desc->TSFlags;
+
+  // Is this instruction encoded using the AVX VEX prefix?
+  bool HasVEXPrefix = (TSFlags >> X86II::VEXShift) & X86II::VEX;
+  // It uses the VEX.VVVV field?
+  bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+  bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
+  bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+  const unsigned MemOp4_I8IMMOperand = 2;
+
+  // Determine where the memory operand starts, if present.
+  int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
+  if (MemoryOperand != -1) MemoryOperand += CurOp;
+
+  if (!HasVEXPrefix)
+    emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
+  else
+    emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc);
+
+  unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags);
+  switch (TSFlags & X86II::FormMask) {
+  default:
+    llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
+  case X86II::Pseudo:
+    // Remember the current PC offset, this is the PIC relocation
+    // base address.
+    switch (Opcode) {
+    default:
+      llvm_unreachable("pseudo instructions should be removed before code"
+                       " emission");
+    // Do nothing for Int_MemBarrier - it's just a comment.  Add a debug
+    // to make it slightly easier to see.
+    case X86::Int_MemBarrier:
+      DEBUG(dbgs() << "#MEMBARRIER\n");
+      break;
+
+    case TargetOpcode::INLINEASM:
+      // We allow inline assembler nodes with empty bodies - they can
+      // implicitly define registers, which is ok for JIT.
+      if (MI.getOperand(0).getSymbolName()[0])
+        report_fatal_error("JIT does not support inline asm!");
+      break;
+    case TargetOpcode::PROLOG_LABEL:
+    case TargetOpcode::GC_LABEL:
+    case TargetOpcode::EH_LABEL:
+      MCE.emitLabel(MI.getOperand(0).getMCSymbol());
+      break;
+
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+      break;
+    case X86::MOVPC32r: {
+      // This emits the "call" portion of this pseudo instruction.
+      MCE.emitByte(BaseOpcode);
+      emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags));
+      // Remember PIC base.
+      PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset();
+      X86JITInfo *JTI = TM.getJITInfo();
+      JTI->setPICBase(MCE.getCurrentPCValue());
+      break;
+    }
+    }
+    CurOp = NumOps;
+    break;
+  case X86II::RawFrm: {
+    MCE.emitByte(BaseOpcode);
+
+    if (CurOp == NumOps)
+      break;
+
+    const MachineOperand &MO = MI.getOperand(CurOp++);
+
+    DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n");
+    DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n");
+    DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n");
+    DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n");
+    DEBUG(dbgs() << "isImm " << MO.isImm() << "\n");
+
+    if (MO.isMBB()) {
+      emitPCRelativeBlockAddress(MO.getMBB());
+      break;
+    }
+
+    if (MO.isGlobal()) {
+      emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
+                        MO.getOffset(), 0);
+      break;
+    }
+
+    if (MO.isSymbol()) {
+      emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+      break;
+    }
+
+    // FIXME: Only used by hackish MCCodeEmitter, remove when dead.
+    if (MO.isJTI()) {
+      emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word);
+      break;
+    }
+
+    assert(MO.isImm() && "Unknown RawFrm operand!");
+    if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
+      // Fix up immediate operand for pc relative calls.
+      intptr_t Imm = (intptr_t)MO.getImm();
+      Imm = Imm - MCE.getCurrentPCValue() - 4;
+      emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags));
+    } else
+      emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags));
+    break;
+  }
+
+  case X86II::AddRegFrm: {
+    MCE.emitByte(BaseOpcode +
+                 getX86RegNum(MI.getOperand(CurOp++).getReg()));
+
+    if (CurOp == NumOps)
+      break;
+
+    const MachineOperand &MO1 = MI.getOperand(CurOp++);
+    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+    if (MO1.isImm()) {
+      emitConstant(MO1.getImm(), Size);
+      break;
+    }
+
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64ri64i32)
+      rt = X86::reloc_absolute_word;  // FIXME: add X86II flag?
+    // This should not occur on Darwin for relocatable objects.
+    if (Opcode == X86::MOV64ri)
+      rt = X86::reloc_absolute_dword;  // FIXME: add X86II flag?
+    if (MO1.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+      emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+                        Indirect);
+    } else if (MO1.isSymbol())
+      emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+    else if (MO1.isCPI())
+      emitConstPoolAddress(MO1.getIndex(), rt);
+    else if (MO1.isJTI())
+      emitJumpTableAddress(MO1.getIndex(), rt);
+    break;
+  }
+
+  case X86II::MRMDestReg: {
+    MCE.emitByte(BaseOpcode);
+
+    unsigned SrcRegNum = CurOp+1;
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      SrcRegNum++;
+
+    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+                     getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+    CurOp = SrcRegNum + 1;
+    break;
+  }
+  case X86II::MRMDestMem: {
+    MCE.emitByte(BaseOpcode);
+
+    unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      SrcRegNum++;
+    emitMemModRMByte(MI, CurOp,
+                     getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+    CurOp = SrcRegNum + 1;
+    break;
+  }
+
+  case X86II::MRMSrcReg: {
+    MCE.emitByte(BaseOpcode);
+
+    unsigned SrcRegNum = CurOp+1;
+    if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
+      ++SrcRegNum;
+
+    if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM)
+      ++SrcRegNum;
+
+    emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+    // 2 operands skipped with HasMemOp4, compensate accordingly
+    CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
+    if (HasVEX_4VOp3)
+      ++CurOp;
+    break;
+  }
+  case X86II::MRMSrcMem: {
+    int AddrOperands = X86::AddrNumOperands;
+    unsigned FirstMemOp = CurOp+1;
+    if (HasVEX_4V) {
+      ++AddrOperands;
+      ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
+    }
+    if (HasMemOp4) // Skip second register source (encoded in I8IMM)
+      ++FirstMemOp;
+
+    MCE.emitByte(BaseOpcode);
+
+    intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
+      X86II::getSizeOfImm(Desc->TSFlags) : 0;
+    emitMemModRMByte(MI, FirstMemOp,
+                     getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
+    CurOp += AddrOperands + 1;
+    if (HasVEX_4VOp3)
+      ++CurOp;
+    break;
+  }
+
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r: {
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    MCE.emitByte(BaseOpcode);
+    emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+                     (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+
+    if (CurOp == NumOps)
+      break;
+
+    const MachineOperand &MO1 = MI.getOperand(CurOp++);
+    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+    if (MO1.isImm()) {
+      emitConstant(MO1.getImm(), Size);
+      break;
+    }
+
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64ri32)
+      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
+    if (MO1.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+      emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+                        Indirect);
+    } else if (MO1.isSymbol())
+      emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+    else if (MO1.isCPI())
+      emitConstPoolAddress(MO1.getIndex(), rt);
+    else if (MO1.isJTI())
+      emitJumpTableAddress(MO1.getIndex(), rt);
+    break;
+  }
+
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
+      ++CurOp;
+    intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ?
+      (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ?
+          X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0;
+
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+                     PCAdj);
+    CurOp += X86::AddrNumOperands;
+
+    if (CurOp == NumOps)
+      break;
+
+    const MachineOperand &MO = MI.getOperand(CurOp++);
+    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+    if (MO.isImm()) {
+      emitConstant(MO.getImm(), Size);
+      break;
+    }
+
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64mi32)
+      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
+    if (MO.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO, TM);
+      emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
+                        Indirect);
+    } else if (MO.isSymbol())
+      emitExternalSymbolAddress(MO.getSymbolName(), rt);
+    else if (MO.isCPI())
+      emitConstPoolAddress(MO.getIndex(), rt);
+    else if (MO.isJTI())
+      emitJumpTableAddress(MO.getIndex(), rt);
+    break;
+  }
+
+  case X86II::MRMInitReg:
+    MCE.emitByte(BaseOpcode);
+    // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+    ++CurOp;
+    break;
+
+  case X86II::MRM_C1:
+    MCE.emitByte(BaseOpcode);
+    MCE.emitByte(0xC1);
+    break;
+  case X86II::MRM_C8:
+    MCE.emitByte(BaseOpcode);
+    MCE.emitByte(0xC8);
+    break;
+  case X86II::MRM_C9:
+    MCE.emitByte(BaseOpcode);
+    MCE.emitByte(0xC9);
+    break;
+  case X86II::MRM_E8:
+    MCE.emitByte(BaseOpcode);
+    MCE.emitByte(0xE8);
+    break;
+  case X86II::MRM_F0:
+    MCE.emitByte(BaseOpcode);
+    MCE.emitByte(0xF0);
+    break;
+  }
+
+  while (CurOp != NumOps && NumOps - CurOp <= 2) {
+    // The last source register of a 4 operand instruction in AVX is encoded
+    // in bits[7:4] of a immediate byte.
+    if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
+      const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
+                                                         : CurOp);
+      ++CurOp;
+      unsigned RegNum = getX86RegNum(MO.getReg()) << 4;
+      if (X86II::isX86_64ExtendedReg(MO.getReg()))
+        RegNum |= 1 << 7;
+      // If there is an additional 5th operand it must be an immediate, which
+      // is encoded in bits[3:0]
+      if (CurOp != NumOps) {
+        const MachineOperand &MIMM = MI.getOperand(CurOp++);
+        if (MIMM.isImm()) {
+          unsigned Val = MIMM.getImm();
+          assert(Val < 16 && "Immediate operand value out of range");
+          RegNum |= Val;
+        }
+      }
+      emitConstant(RegNum, 1);
+    } else {
+      emitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(Desc->TSFlags));
+    }
+  }
+
+  if (!MI.isVariadic() && CurOp != NumOps) {
+#ifndef NDEBUG
+    dbgs() << "Cannot encode all operands of: " << MI << "\n";
+#endif
+    llvm_unreachable(0);
+  }
+
+  MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 000000000000..cadec682a435
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,2311 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86ISelLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+
+class X86FastISel : public FastISel {
+  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const X86Subtarget *Subtarget;
+
+  /// RegInfo - X86 register info.
+  ///
+  const X86RegisterInfo *RegInfo;
+
+  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+  /// floating point ops.
+  /// When SSE is available, use it for f32 operations.
+  /// When SSE2 is available, use it for f64 operations.
+  bool X86ScalarSSEf64;
+  bool X86ScalarSSEf32;
+
+public:
+  explicit X86FastISel(FunctionLoweringInfo &funcInfo,
+                       const TargetLibraryInfo *libInfo)
+    : FastISel(funcInfo, libInfo) {
+    Subtarget = &TM.getSubtarget<X86Subtarget>();
+    X86ScalarSSEf64 = Subtarget->hasSSE2();
+    X86ScalarSSEf32 = Subtarget->hasSSE1();
+    RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
+  }
+
+  virtual bool TargetSelectInstruction(const Instruction *I);
+
+  /// TryToFoldLoad - The specified machine instr operand is a vreg, and that
+  /// vreg is being provided by the specified load instruction.  If possible,
+  /// try to fold the load as an operand to the instruction, returning true if
+  /// possible.
+  virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
+                             const LoadInst *LI);
+
+  virtual bool FastLowerArguments();
+
+#include "X86GenFastISel.inc"
+
+private:
+  bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT);
+
+  bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
+
+  bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM);
+  bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM);
+
+  bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+                         unsigned &ResultReg);
+
+  bool X86SelectAddress(const Value *V, X86AddressMode &AM);
+  bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
+
+  bool X86SelectLoad(const Instruction *I);
+
+  bool X86SelectStore(const Instruction *I);
+
+  bool X86SelectRet(const Instruction *I);
+
+  bool X86SelectCmp(const Instruction *I);
+
+  bool X86SelectZExt(const Instruction *I);
+
+  bool X86SelectBranch(const Instruction *I);
+
+  bool X86SelectShift(const Instruction *I);
+
+  bool X86SelectSelect(const Instruction *I);
+
+  bool X86SelectTrunc(const Instruction *I);
+
+  bool X86SelectFPExt(const Instruction *I);
+  bool X86SelectFPTrunc(const Instruction *I);
+
+  bool X86VisitIntrinsicCall(const IntrinsicInst &I);
+  bool X86SelectCall(const Instruction *I);
+
+  bool DoSelectCall(const Instruction *I, const char *MemIntName);
+
+  const X86InstrInfo *getInstrInfo() const {
+    return getTargetMachine()->getInstrInfo();
+  }
+  const X86TargetMachine *getTargetMachine() const {
+    return static_cast<const X86TargetMachine *>(&TM);
+  }
+
+  unsigned TargetMaterializeConstant(const Constant *C);
+
+  unsigned TargetMaterializeAlloca(const AllocaInst *C);
+
+  unsigned TargetMaterializeFloatZero(const ConstantFP *CF);
+
+  /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+  /// computed in an SSE register, not on the X87 floating point stack.
+  bool isScalarFPTypeInSSEReg(EVT VT) const {
+    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+      (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
+  }
+
+  bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
+
+  bool IsMemcpySmall(uint64_t Len);
+
+  bool TryEmitSmallMemcpy(X86AddressMode DestAM,
+                          X86AddressMode SrcAM, uint64_t Len);
+};
+
+} // end anonymous namespace.
+
+bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
+  EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
+  if (evt == MVT::Other || !evt.isSimple())
+    // Unhandled type. Halt "fast" selection and bail.
+    return false;
+
+  VT = evt.getSimpleVT();
+  // For now, require SSE/SSE2 for performing floating-point operations,
+  // since x87 requires additional work.
+  if (VT == MVT::f64 && !X86ScalarSSEf64)
+    return false;
+  if (VT == MVT::f32 && !X86ScalarSSEf32)
+    return false;
+  // Similarly, no f80 support yet.
+  if (VT == MVT::f80)
+    return false;
+  // We only handle legal types. For example, on x86-32 the instruction
+  // selector contains all of the 64-bit instructions from x86-64,
+  // under the assumption that i64 won't be used if the target doesn't
+  // support it.
+  return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
+/// Return true and the result register by reference if it is possible.
+bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
+                                  unsigned &ResultReg) {
+  // Get opcode and regclass of the output for the given load instruction.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+    Opc = X86::MOV8rm;
+    RC  = &X86::GR8RegClass;
+    break;
+  case MVT::i16:
+    Opc = X86::MOV16rm;
+    RC  = &X86::GR16RegClass;
+    break;
+  case MVT::i32:
+    Opc = X86::MOV32rm;
+    RC  = &X86::GR32RegClass;
+    break;
+  case MVT::i64:
+    // Must be in x86-64 mode.
+    Opc = X86::MOV64rm;
+    RC  = &X86::GR64RegClass;
+    break;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp32m;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp64m;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return false;
+  }
+
+  ResultReg = createResultReg(RC);
+  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                         DL, TII.get(Opc), ResultReg), AM);
+  return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
+/// and a displacement offset, or a GlobalAddress,
+/// i.e. V. Return true if it is possible.
+bool
+X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
+  // Get opcode and regclass of the output for the given store instruction.
+  unsigned Opc = 0;
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f80: // No f80 support yet.
+  default: return false;
+  case MVT::i1: {
+    // Mask out all but lowest bit.
+    unsigned AndResult = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
+    Val = AndResult;
+  }
+  // FALLTHROUGH, handling i1 as i8.
+  case MVT::i8:  Opc = X86::MOV8mr;  break;
+  case MVT::i16: Opc = X86::MOV16mr; break;
+  case MVT::i32: Opc = X86::MOV32mr; break;
+  case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
+  case MVT::f32:
+    Opc = X86ScalarSSEf32 ?
+          (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
+    break;
+  case MVT::f64:
+    Opc = X86ScalarSSEf64 ?
+          (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
+    break;
+  case MVT::v4f32:
+    Opc = X86::MOVAPSmr;
+    break;
+  case MVT::v2f64:
+    Opc = X86::MOVAPDmr;
+    break;
+  case MVT::v4i32:
+  case MVT::v2i64:
+  case MVT::v8i16:
+  case MVT::v16i8:
+    Opc = X86::MOVDQAmr;
+    break;
+  }
+
+  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                         DL, TII.get(Opc)), AM).addReg(Val);
+  return true;
+}
+
+bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
+                                   const X86AddressMode &AM) {
+  // Handle 'null' like i32/i64 0.
+  if (isa<ConstantPointerNull>(Val))
+    Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext()));
+
+  // If this is a store of a simple constant, fold the constant into the store.
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    unsigned Opc = 0;
+    bool Signed = true;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: break;
+    case MVT::i1:  Signed = false;     // FALLTHROUGH to handle as i8.
+    case MVT::i8:  Opc = X86::MOV8mi;  break;
+    case MVT::i16: Opc = X86::MOV16mi; break;
+    case MVT::i32: Opc = X86::MOV32mi; break;
+    case MVT::i64:
+      // Must be a 32-bit sign extended value.
+      if (isInt<32>(CI->getSExtValue()))
+        Opc = X86::MOV64mi32;
+      break;
+    }
+
+    if (Opc) {
+      addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+                             DL, TII.get(Opc)), AM)
+                             .addImm(Signed ? (uint64_t) CI->getSExtValue() :
+                                              CI->getZExtValue());
+      return true;
+    }
+  }
+
+  unsigned ValReg = getRegForValue(Val);
+  if (ValReg == 0)
+    return false;
+
+  return X86FastEmitStore(VT, ValReg, AM);
+}
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+                                    unsigned Src, EVT SrcVT,
+                                    unsigned &ResultReg) {
+  unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
+                           Src, /*TODO: Kill=*/false);
+  if (RR == 0)
+    return false;
+
+  ResultReg = RR;
+  return true;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+  const User *U = NULL;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    // Don't walk into other basic blocks; it's possible we haven't
+    // visited them yet, so the instructions may not yet be assigned
+    // virtual registers.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
+    if (Ty->getAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts.
+    return X86SelectAddress(U->getOperand(0), AM);
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs.
+    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+      return X86SelectAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints.
+    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+      return X86SelectAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::Alloca: {
+    // Do static allocas.
+    const AllocaInst *A = cast<AllocaInst>(V);
+    DenseMap<const AllocaInst*, int>::iterator SI =
+      FuncInfo.StaticAllocaMap.find(A);
+    if (SI != FuncInfo.StaticAllocaMap.end()) {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = SI->second;
+      return true;
+    }
+    break;
+  }
+
+  case Instruction::Add: {
+    // Adds of constants are common and easy enough.
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+      // They have to fit in the 32-bit signed displacement field though.
+      if (isInt<32>(Disp)) {
+        AM.Disp = (uint32_t)Disp;
+        return X86SelectAddress(U->getOperand(0), AM);
+      }
+    }
+    break;
+  }
+
+  case Instruction::GetElementPtr: {
+    X86AddressMode SavedAM = AM;
+
+    // Pattern-match simple GEPs.
+    uint64_t Disp = (int32_t)AM.Disp;
+    unsigned IndexReg = AM.IndexReg;
+    unsigned Scale = AM.Scale;
+    gep_type_iterator GTI = gep_type_begin(U);
+    // Iterate through the indices, folding what we can. Constants can be
+    // folded, and one dynamic index can be handled, if the scale is supported.
+    for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
+         i != e; ++i, ++GTI) {
+      const Value *Op = *i;
+      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+        const StructLayout *SL = TD.getStructLayout(STy);
+        Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
+        continue;
+      }
+
+      // A array/variable index is always of the form i*S where S is the
+      // constant scale size.  See if we can push the scale into immediates.
+      uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+      for (;;) {
+        if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+          // Constant-offset addressing.
+          Disp += CI->getSExtValue() * S;
+          break;
+        }
+        if (isa<AddOperator>(Op) &&
+            (!isa<Instruction>(Op) ||
+             FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
+               == FuncInfo.MBB) &&
+            isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
+          // An add (in the same block) with a constant operand. Fold the
+          // constant.
+          ConstantInt *CI =
+            cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+          Disp += CI->getSExtValue() * S;
+          // Iterate on the other operand.
+          Op = cast<AddOperator>(Op)->getOperand(0);
+          continue;
+        }
+        if (IndexReg == 0 &&
+            (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+            (S == 1 || S == 2 || S == 4 || S == 8)) {
+          // Scaled-index addressing.
+          Scale = S;
+          IndexReg = getRegForGEPIndex(Op).first;
+          if (IndexReg == 0)
+            return false;
+          break;
+        }
+        // Unsupported.
+        goto unsupported_gep;
+      }
+    }
+    // Check for displacement overflow.
+    if (!isInt<32>(Disp))
+      break;
+    // Ok, the GEP indices were covered by constant-offset and scaled-index
+    // addressing. Update the address state and move on to examining the base.
+    AM.IndexReg = IndexReg;
+    AM.Scale = Scale;
+    AM.Disp = (uint32_t)Disp;
+    if (X86SelectAddress(U->getOperand(0), AM))
+      return true;
+
+    // If we couldn't merge the gep value into this addr mode, revert back to
+    // our address and just match the value instead of completely failing.
+    AM = SavedAM;
+    break;
+  unsupported_gep:
+    // Ok, the GEP indices weren't all covered.
+    break;
+  }
+  }
+
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // Can't handle TLS yet.
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal())
+        return false;
+
+    // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
+    // it works...).
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      if (const GlobalVariable *GVar =
+            dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
+        if (GVar->isThreadLocal())
+          return false;
+
+    // RIP-relative addresses can't have additional register operands, so if
+    // we've already folded stuff into the addressing mode, just force the
+    // global value into its own register, which we can use as the basereg.
+    if (!Subtarget->isPICStyleRIPRel() ||
+        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+      // Okay, we've committed to selecting this global. Set up the address.
+      AM.GV = GV;
+
+      // Allow the subtarget to classify the global.
+      unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+      // If this reference is relative to the pic base, set it now.
+      if (isGlobalRelativeToPICBase(GVFlags)) {
+        // FIXME: How do we know Base.Reg is free??
+        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+      }
+
+      // Unless the ABI requires an extra load, return a direct reference to
+      // the global.
+      if (!isGlobalStubReference(GVFlags)) {
+        if (Subtarget->isPICStyleRIPRel()) {
+          // Use rip-relative addressing if we can.  Above we verified that the
+          // base and index registers are unused.
+          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+          AM.Base.Reg = X86::RIP;
+        }
+        AM.GVOpFlags = GVFlags;
+        return true;
+      }
+
+      // Ok, we need to do a load from a stub.  If we've already loaded from
+      // this stub, reuse the loaded pointer, otherwise emit the load now.
+      DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+      unsigned LoadReg;
+      if (I != LocalValueMap.end() && I->second != 0) {
+        LoadReg = I->second;
+      } else {
+        // Issue load from stub.
+        unsigned Opc = 0;
+        const TargetRegisterClass *RC = NULL;
+        X86AddressMode StubAM;
+        StubAM.Base.Reg = AM.Base.Reg;
+        StubAM.GV = GV;
+        StubAM.GVOpFlags = GVFlags;
+
+        // Prepare for inserting code in the local-value area.
+        SavePoint SaveInsertPt = enterLocalValueArea();
+
+        if (TLI.getPointerTy() == MVT::i64) {
+          Opc = X86::MOV64rm;
+          RC  = &X86::GR64RegClass;
+
+          if (Subtarget->isPICStyleRIPRel())
+            StubAM.Base.Reg = X86::RIP;
+        } else {
+          Opc = X86::MOV32rm;
+          RC  = &X86::GR32RegClass;
+        }
+
+        LoadReg = createResultReg(RC);
+        MachineInstrBuilder LoadMI =
+          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
+        addFullAddress(LoadMI, StubAM);
+
+        // Ok, back to normal mode.
+        leaveLocalValueArea(SaveInsertPt);
+
+        // Prevent loading GV stub multiple times in same MBB.
+        LocalValueMap[V] = LoadReg;
+      }
+
+      // Now construct the final address. Note that the Disp, Scale,
+      // and Index values may already be set here.
+      AM.Base.Reg = LoadReg;
+      AM.GV = 0;
+      return true;
+    }
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
+  const User *U = NULL;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    Opcode = I->getOpcode();
+    U = I;
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts.
+    return X86SelectCallAddress(U->getOperand(0), AM);
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs.
+    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints.
+    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+  }
+
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // RIP-relative addresses can't have additional register operands.
+    if (Subtarget->isPICStyleRIPRel() &&
+        (AM.Base.Reg != 0 || AM.IndexReg != 0))
+      return false;
+
+    // Can't handle DLLImport.
+    if (GV->hasDLLImportLinkage())
+      return false;
+
+    // Can't handle TLS.
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal())
+        return false;
+
+    // Okay, we've committed to selecting this global. Set up the basic address.
+    AM.GV = GV;
+
+    // No ABI requires an extra load for anything other than DLLImport, which
+    // we rejected above. Return a direct reference to the global.
+    if (Subtarget->isPICStyleRIPRel()) {
+      // Use rip-relative addressing if we can.  Above we verified that the
+      // base and index registers are unused.
+      assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+      AM.Base.Reg = X86::RIP;
+    } else if (Subtarget->isPICStyleStubPIC()) {
+      AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
+    } else if (Subtarget->isPICStyleGOT()) {
+      AM.GVOpFlags = X86II::MO_GOTOFF;
+    }
+
+    return true;
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(const Instruction *I) {
+  // Atomic stores need special handling.
+  const StoreInst *S = cast<StoreInst>(I);
+
+  if (S->isAtomic())
+    return false;
+
+  unsigned SABIAlignment =
+    TD.getABITypeAlignment(S->getValueOperand()->getType());
+  if (S->getAlignment() != 0 && S->getAlignment() < SABIAlignment)
+    return false;
+
+  MVT VT;
+  if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
+    return false;
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(I->getOperand(1), AM))
+    return false;
+
+  return X86FastEmitStore(VT, I->getOperand(0), AM);
+}
+
+/// X86SelectRet - Select and emit code to implement ret instructions.
+bool X86FastISel::X86SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+  const X86MachineFunctionInfo *X86MFInfo =
+      FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC != CallingConv::C &&
+      CC != CallingConv::Fast &&
+      CC != CallingConv::X86_FastCall)
+    return false;
+
+  if (Subtarget->isTargetWin64())
+    return false;
+
+  // Don't handle popping bytes on return for now.
+  if (X86MFInfo->getBytesToPopOnReturn() != 0)
+    return false;
+
+  // fastcc with -tailcallopt is intended to provide a guaranteed
+  // tail call optimization. Fastisel doesn't know how to do that.
+  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+    return false;
+
+  // Let SDISel handle vararg functions.
+  if (F.isVarArg())
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+                   I->getContext());
+    CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+    const Value *RV = Ret->getOperand(0);
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+
+    // The calling-convention tables for x87 returns don't tell
+    // the whole story.
+    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    EVT SrcVT = TLI.getValueType(RV->getType());
+    EVT DstVT = VA.getValVT();
+    // Special handling for extended integers.
+    if (SrcVT != DstVT) {
+      if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
+        return false;
+
+      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
+        return false;
+
+      assert(DstVT == MVT::i32 && "X86 should always ext to i32");
+
+      if (SrcVT == MVT::i1) {
+        if (Outs[0].Flags.isSExt())
+          return false;
+        SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
+        SrcVT = MVT::i8;
+      }
+      unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
+                                             ISD::SIGN_EXTEND;
+      SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
+                          SrcReg, /*TODO: Kill=*/false);
+    }
+
+    // Make the copy.
+    unsigned DstReg = VA.getLocReg();
+    const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg);
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!SrcRC->contains(DstReg))
+      return false;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            DstReg).addReg(SrcReg);
+
+    // Add register to return instruction.
+    RetRegs.push_back(VA.getLocReg());
+  }
+
+  // The x86-64 ABI for returning structs by value requires that we copy
+  // the sret argument into %rax for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into %rax. We also do the same with %eax for Win32.
+  if (F.hasStructRetAttr() &&
+      (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+    unsigned Reg = X86MFInfo->getSRetReturnReg();
+    assert(Reg &&
+           "SRetReturnReg should have been set in LowerFormalArguments()!");
+    unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            RetReg).addReg(Reg);
+    RetRegs.push_back(RetReg);
+  }
+
+  // Now emit the RET.
+  MachineInstrBuilder MIB =
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET));
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+  return true;
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(const Instruction *I)  {
+  // Atomic loads need special handling.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
+    return false;
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(I->getOperand(0), AM))
+    return false;
+
+  unsigned ResultReg = 0;
+  if (X86FastEmitLoad(VT, AM, ResultReg)) {
+    UpdateValueMap(I, ResultReg);
+    return true;
+  }
+  return false;
+}
+
+static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+  bool HasAVX = Subtarget->hasAVX();
+  bool X86ScalarSSEf32 = Subtarget->hasSSE1();
+  bool X86ScalarSSEf64 = Subtarget->hasSSE2();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:       return 0;
+  case MVT::i8:  return X86::CMP8rr;
+  case MVT::i16: return X86::CMP16rr;
+  case MVT::i32: return X86::CMP32rr;
+  case MVT::i64: return X86::CMP64rr;
+  case MVT::f32:
+    return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+  case MVT::f64:
+    return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
+  }
+}
+
+/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS
+/// of the comparison, return an opcode that works for the compare (e.g.
+/// CMP32ri) otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  // Otherwise, we can't fold the immediate into this comparison.
+  default: return 0;
+  case MVT::i8: return X86::CMP8ri;
+  case MVT::i16: return X86::CMP16ri;
+  case MVT::i32: return X86::CMP32ri;
+  case MVT::i64:
+    // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+    // field.
+    if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
+      return X86::CMP64ri32;
+    return 0;
+  }
+}
+
+bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
+                                     EVT VT) {
+  unsigned Op0Reg = getRegForValue(Op0);
+  if (Op0Reg == 0) return false;
+
+  // Handle 'null' like i32/i64 0.
+  if (isa<ConstantPointerNull>(Op1))
+    Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext()));
+
+  // We have two options: compare with register or immediate.  If the RHS of
+  // the compare is an immediate that we can fold into this compare, use
+  // CMPri, otherwise use CMPrr.
+  if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+    if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareImmOpc))
+        .addReg(Op0Reg)
+        .addImm(Op1C->getSExtValue());
+      return true;
+    }
+  }
+
+  unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
+  if (CompareOpc == 0) return false;
+
+  unsigned Op1Reg = getRegForValue(Op1);
+  if (Op1Reg == 0) return false;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CompareOpc))
+    .addReg(Op0Reg)
+    .addReg(Op1Reg);
+
+  return true;
+}
+
+bool X86FastISel::X86SelectCmp(const Instruction *I) {
+  const CmpInst *CI = cast<CmpInst>(I);
+
+  MVT VT;
+  if (!isTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  unsigned ResultReg = createResultReg(&X86::GR8RegClass);
+  unsigned SetCCOpc;
+  bool SwapArgs;  // false -> compare Op0, Op1.  true -> compare Op1, Op0.
+  switch (CI->getPredicate()) {
+  case CmpInst::FCMP_OEQ: {
+    if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+      return false;
+
+    unsigned EReg = createResultReg(&X86::GR8RegClass);
+    unsigned NPReg = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETEr), EReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(X86::SETNPr), NPReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg);
+    UpdateValueMap(I, ResultReg);
+    return true;
+  }
+  case CmpInst::FCMP_UNE: {
+    if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
+      return false;
+
+    unsigned NEReg = createResultReg(&X86::GR8RegClass);
+    unsigned PReg = createResultReg(&X86::GR8RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETNEr), NEReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SETPr), PReg);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::OR8rr),ResultReg)
+      .addReg(PReg).addReg(NEReg);
+    UpdateValueMap(I, ResultReg);
+    return true;
+  }
+  case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr;  break;
+  case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+  case CmpInst::FCMP_OLT: SwapArgs = true;  SetCCOpc = X86::SETAr;  break;
+  case CmpInst::FCMP_OLE: SwapArgs = true;  SetCCOpc = X86::SETAEr; break;
+  case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+  case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break;
+  case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr;  break;
+  case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr;  break;
+  case CmpInst::FCMP_UGT: SwapArgs = true;  SetCCOpc = X86::SETBr;  break;
+  case CmpInst::FCMP_UGE: SwapArgs = true;  SetCCOpc = X86::SETBEr; break;
+  case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr;  break;
+  case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+
+  case CmpInst::ICMP_EQ:  SwapArgs = false; SetCCOpc = X86::SETEr;  break;
+  case CmpInst::ICMP_NE:  SwapArgs = false; SetCCOpc = X86::SETNEr; break;
+  case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr;  break;
+  case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
+  case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr;  break;
+  case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
+  case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr;  break;
+  case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break;
+  case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr;  break;
+  case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break;
+  default:
+    return false;
+  }
+
+  const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+  if (SwapArgs)
+    std::swap(Op0, Op1);
+
+  // Emit a compare of Op0/Op1.
+  if (!X86FastEmitCompare(Op0, Op1, VT))
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(SetCCOpc), ResultReg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectZExt(const Instruction *I) {
+  // Handle zero-extension from i1 to i8, which is common.
+  if (!I->getOperand(0)->getType()->isIntegerTy(1))
+    return false;
+
+  EVT DstVT = TLI.getValueType(I->getType());
+  if (!TLI.isTypeLegal(DstVT))
+    return false;
+
+  unsigned ResultReg = getRegForValue(I->getOperand(0));
+  if (ResultReg == 0)
+    return false;
+
+  // Set the high bits to zero.
+  ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+  if (ResultReg == 0)
+    return false;
+
+  if (DstVT != MVT::i8) {
+    ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
+                           ResultReg, /*Kill=*/true);
+    if (ResultReg == 0)
+      return false;
+  }
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+
+bool X86FastISel::X86SelectBranch(const Instruction *I) {
+  // Unconditional branches are selected by tablegen-generated code.
+  // Handle a conditional branch.
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // Fold the common case of a conditional branch with a comparison
+  // in the same block (values defined on other blocks may not have
+  // initialized registers).
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+      EVT VT = TLI.getValueType(CI->getOperand(0)->getType());
+
+      // Try to take advantage of fallthrough opportunities.
+      CmpInst::Predicate Predicate = CI->getPredicate();
+      if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+        std::swap(TrueMBB, FalseMBB);
+        Predicate = CmpInst::getInversePredicate(Predicate);
+      }
+
+      bool SwapArgs;  // false -> compare Op0, Op1.  true -> compare Op1, Op0.
+      unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA"
+
+      switch (Predicate) {
+      case CmpInst::FCMP_OEQ:
+        std::swap(TrueMBB, FalseMBB);
+        Predicate = CmpInst::FCMP_UNE;
+        // FALL THROUGH
+      case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
+      case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA_4;  break;
+      case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
+      case CmpInst::FCMP_OLT: SwapArgs = true;  BranchOpc = X86::JA_4;  break;
+      case CmpInst::FCMP_OLE: SwapArgs = true;  BranchOpc = X86::JAE_4; break;
+      case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE_4; break;
+      case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP_4; break;
+      case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP_4;  break;
+      case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE_4;  break;
+      case CmpInst::FCMP_UGT: SwapArgs = true;  BranchOpc = X86::JB_4;  break;
+      case CmpInst::FCMP_UGE: SwapArgs = true;  BranchOpc = X86::JBE_4; break;
+      case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4;  break;
+      case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
+
+      case CmpInst::ICMP_EQ:  SwapArgs = false; BranchOpc = X86::JE_4;  break;
+      case CmpInst::ICMP_NE:  SwapArgs = false; BranchOpc = X86::JNE_4; break;
+      case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA_4;  break;
+      case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE_4; break;
+      case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB_4;  break;
+      case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE_4; break;
+      case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG_4;  break;
+      case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE_4; break;
+      case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL_4;  break;
+      case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE_4; break;
+      default:
+        return false;
+      }
+
+      const Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+      if (SwapArgs)
+        std::swap(Op0, Op1);
+
+      // Emit a compare of the LHS and RHS, setting the flags.
+      if (!X86FastEmitCompare(Op0, Op1, VT))
+        return false;
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(BranchOpc))
+        .addMBB(TrueMBB);
+
+      if (Predicate == CmpInst::FCMP_UNE) {
+        // X86 requires a second branch to handle UNE (and OEQ,
+        // which is mapped to UNE above).
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JP_4))
+          .addMBB(TrueMBB);
+      }
+
+      FastEmitBranch(FalseMBB, DL);
+      FuncInfo.MBB->addSuccessor(TrueMBB);
+      return true;
+    }
+  } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
+    // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
+    // typically happen for _Bool and C++ bools.
+    MVT SourceVT;
+    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
+        isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
+      unsigned TestOpc = 0;
+      switch (SourceVT.SimpleTy) {
+      default: break;
+      case MVT::i8:  TestOpc = X86::TEST8ri; break;
+      case MVT::i16: TestOpc = X86::TEST16ri; break;
+      case MVT::i32: TestOpc = X86::TEST32ri; break;
+      case MVT::i64: TestOpc = X86::TEST64ri32; break;
+      }
+      if (TestOpc) {
+        unsigned OpReg = getRegForValue(TI->getOperand(0));
+        if (OpReg == 0) return false;
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TestOpc))
+          .addReg(OpReg).addImm(1);
+
+        unsigned JmpOpc = X86::JNE_4;
+        if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
+          std::swap(TrueMBB, FalseMBB);
+          JmpOpc = X86::JE_4;
+        }
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(JmpOpc))
+          .addMBB(TrueMBB);
+        FastEmitBranch(FalseMBB, DL);
+        FuncInfo.MBB->addSuccessor(TrueMBB);
+        return true;
+      }
+    }
+  }
+
+  // Otherwise do a clumsy setcc and re-test it.
+  // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
+  // in an explicit cast, so make sure to handle that correctly.
+  unsigned OpReg = getRegForValue(BI->getCondition());
+  if (OpReg == 0) return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8ri))
+    .addReg(OpReg).addImm(1);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::JNE_4))
+    .addMBB(TrueMBB);
+  FastEmitBranch(FalseMBB, DL);
+  FuncInfo.MBB->addSuccessor(TrueMBB);
+  return true;
+}
+
+bool X86FastISel::X86SelectShift(const Instruction *I) {
+  unsigned CReg = 0, OpReg = 0;
+  const TargetRegisterClass *RC = NULL;
+  if (I->getType()->isIntegerTy(8)) {
+    CReg = X86::CL;
+    RC = &X86::GR8RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR8rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR8rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL8rCL; break;
+    default: return false;
+    }
+  } else if (I->getType()->isIntegerTy(16)) {
+    CReg = X86::CX;
+    RC = &X86::GR16RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR16rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR16rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL16rCL; break;
+    default: return false;
+    }
+  } else if (I->getType()->isIntegerTy(32)) {
+    CReg = X86::ECX;
+    RC = &X86::GR32RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR32rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR32rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL32rCL; break;
+    default: return false;
+    }
+  } else if (I->getType()->isIntegerTy(64)) {
+    CReg = X86::RCX;
+    RC = &X86::GR64RegClass;
+    switch (I->getOpcode()) {
+    case Instruction::LShr: OpReg = X86::SHR64rCL; break;
+    case Instruction::AShr: OpReg = X86::SAR64rCL; break;
+    case Instruction::Shl:  OpReg = X86::SHL64rCL; break;
+    default: return false;
+    }
+  } else {
+    return false;
+  }
+
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (Op0Reg == 0) return false;
+
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0) return false;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+          CReg).addReg(Op1Reg);
+
+  // The shift instruction uses X86::CL. If we defined a super-register
+  // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
+  if (CReg != X86::CL)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(TargetOpcode::KILL), X86::CL)
+      .addReg(CReg, RegState::Kill);
+
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpReg), ResultReg)
+    .addReg(Op0Reg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectSelect(const Instruction *I) {
+  MVT VT;
+  if (!isTypeLegal(I->getType(), VT))
+    return false;
+
+  // We only use cmov here, if we don't have a cmov instruction bail.
+  if (!Subtarget->hasCMov()) return false;
+
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  if (VT == MVT::i16) {
+    Opc = X86::CMOVE16rr;
+    RC = &X86::GR16RegClass;
+  } else if (VT == MVT::i32) {
+    Opc = X86::CMOVE32rr;
+    RC = &X86::GR32RegClass;
+  } else if (VT == MVT::i64) {
+    Opc = X86::CMOVE64rr;
+    RC = &X86::GR64RegClass;
+  } else {
+    return false;
+  }
+
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (Op0Reg == 0) return false;
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0) return false;
+  unsigned Op2Reg = getRegForValue(I->getOperand(2));
+  if (Op2Reg == 0) return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TEST8rr))
+    .addReg(Op0Reg).addReg(Op0Reg);
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+    .addReg(Op1Reg).addReg(Op2Reg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectFPExt(const Instruction *I) {
+  // fpext from float to double.
+  if (X86ScalarSSEf64 &&
+      I->getType()->isDoubleTy()) {
+    const Value *V = I->getOperand(0);
+    if (V->getType()->isFloatTy()) {
+      unsigned OpReg = getRegForValue(V);
+      if (OpReg == 0) return false;
+      unsigned ResultReg = createResultReg(&X86::FR64RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(X86::CVTSS2SDrr), ResultReg)
+        .addReg(OpReg);
+      UpdateValueMap(I, ResultReg);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
+  if (X86ScalarSSEf64) {
+    if (I->getType()->isFloatTy()) {
+      const Value *V = I->getOperand(0);
+      if (V->getType()->isDoubleTy()) {
+        unsigned OpReg = getRegForValue(V);
+        if (OpReg == 0) return false;
+        unsigned ResultReg = createResultReg(&X86::FR32RegClass);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                TII.get(X86::CVTSD2SSrr), ResultReg)
+          .addReg(OpReg);
+        UpdateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+bool X86FastISel::X86SelectTrunc(const Instruction *I) {
+  EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(I->getType());
+
+  // This code only handles truncation to byte.
+  if (DstVT != MVT::i8 && DstVT != MVT::i1)
+    return false;
+  if (!TLI.isTypeLegal(SrcVT))
+    return false;
+
+  unsigned InputReg = getRegForValue(I->getOperand(0));
+  if (!InputReg)
+    // Unhandled operand.  Halt "fast" selection and bail.
+    return false;
+
+  if (SrcVT == MVT::i8) {
+    // Truncate from i8 to i1; no code needed.
+    UpdateValueMap(I, InputReg);
+    return true;
+  }
+
+  if (!Subtarget->is64Bit()) {
+    // If we're on x86-32; we can't extract an i8 from a general register.
+    // First issue a copy to GR16_ABCD or GR32_ABCD.
+    const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ?
+      (const TargetRegisterClass*)&X86::GR16_ABCDRegClass :
+      (const TargetRegisterClass*)&X86::GR32_ABCDRegClass;
+    unsigned CopyReg = createResultReg(CopyRC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            CopyReg).addReg(InputReg);
+    InputReg = CopyReg;
+  }
+
+  // Issue an extract_subreg.
+  unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
+                                                  InputReg, /*Kill=*/true,
+                                                  X86::sub_8bit);
+  if (!ResultReg)
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::IsMemcpySmall(uint64_t Len) {
+  return Len <= (Subtarget->is64Bit() ? 32 : 16);
+}
+
+bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
+                                     X86AddressMode SrcAM, uint64_t Len) {
+
+  // Make sure we don't bloat code by inlining very large memcpy's.
+  if (!IsMemcpySmall(Len))
+    return false;
+
+  bool i64Legal = Subtarget->is64Bit();
+
+  // We don't care about alignment here since we just emit integer accesses.
+  while (Len) {
+    MVT VT;
+    if (Len >= 8 && i64Legal)
+      VT = MVT::i64;
+    else if (Len >= 4)
+      VT = MVT::i32;
+    else if (Len >= 2)
+      VT = MVT::i16;
+    else {
+      VT = MVT::i8;
+    }
+
+    unsigned Reg;
+    bool RV = X86FastEmitLoad(VT, SrcAM, Reg);
+    RV &= X86FastEmitStore(VT, Reg, DestAM);
+    assert(RV && "Failed to emit load or store??");
+
+    unsigned Size = VT.getSizeInBits()/8;
+    Len -= Size;
+    DestAM.Disp += Size;
+    SrcAM.Disp += Size;
+  }
+
+  return true;
+}
+
+bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
+  // FIXME: Handle more intrinsics.
+  switch (I.getIntrinsicID()) {
+  default: return false;
+  case Intrinsic::memcpy: {
+    const MemCpyInst &MCI = cast<MemCpyInst>(I);
+    // Don't handle volatile or variable length memcpys.
+    if (MCI.isVolatile())
+      return false;
+
+    if (isa<ConstantInt>(MCI.getLength())) {
+      // Small memcpy's are common enough that we want to do them
+      // without a call if possible.
+      uint64_t Len = cast<ConstantInt>(MCI.getLength())->getZExtValue();
+      if (IsMemcpySmall(Len)) {
+        X86AddressMode DestAM, SrcAM;
+        if (!X86SelectAddress(MCI.getRawDest(), DestAM) ||
+            !X86SelectAddress(MCI.getRawSource(), SrcAM))
+          return false;
+        TryEmitSmallMemcpy(DestAM, SrcAM, Len);
+        return true;
+      }
+    }
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MCI.getLength()->getType()->isIntegerTy(SizeWidth))
+      return false;
+
+    if (MCI.getSourceAddressSpace() > 255 || MCI.getDestAddressSpace() > 255)
+      return false;
+
+    return DoSelectCall(&I, "memcpy");
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+
+    if (MSI.isVolatile())
+      return false;
+
+    unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
+    if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth))
+      return false;
+
+    if (MSI.getDestAddressSpace() > 255)
+      return false;
+
+    return DoSelectCall(&I, "memset");
+  }
+  case Intrinsic::stackprotector: {
+    // Emit code to store the stack guard onto the stack.
+    EVT PtrTy = TLI.getPointerTy();
+
+    const Value *Op1 = I.getArgOperand(0); // The guard's value.
+    const AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
+
+    // Grab the frame index.
+    X86AddressMode AM;
+    if (!X86SelectAddress(Slot, AM)) return false;
+    if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
+    return true;
+  }
+  case Intrinsic::dbg_declare: {
+    const DbgDeclareInst *DI = cast<DbgDeclareInst>(&I);
+    X86AddressMode AM;
+    assert(DI->getAddress() && "Null address should be checked earlier!");
+    if (!X86SelectAddress(DI->getAddress(), AM))
+      return false;
+    const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+    // FIXME may need to add RegState::Debug to any registers produced,
+    // although ESP/EBP should be the only ones at the moment.
+    addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II), AM).
+      addImm(0).addMetadata(DI->getVariable());
+    return true;
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::TRAP));
+    return true;
+  }
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow: {
+    // FIXME: Should fold immediates.
+
+    // Replace "add with overflow" intrinsics with an "add" instruction followed
+    // by a seto/setc instruction.
+    const Function *Callee = I.getCalledFunction();
+    Type *RetTy =
+      cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
+
+    MVT VT;
+    if (!isTypeLegal(RetTy, VT))
+      return false;
+
+    const Value *Op1 = I.getArgOperand(0);
+    const Value *Op2 = I.getArgOperand(1);
+    unsigned Reg1 = getRegForValue(Op1);
+    unsigned Reg2 = getRegForValue(Op2);
+
+    if (Reg1 == 0 || Reg2 == 0)
+      // FIXME: Handle values *not* in registers.
+      return false;
+
+    unsigned OpC = 0;
+    if (VT == MVT::i32)
+      OpC = X86::ADD32rr;
+    else if (VT == MVT::i64)
+      OpC = X86::ADD64rr;
+    else
+      return false;
+
+    // The call to CreateRegs builds two sequential registers, to store the
+    // both the returned values.
+    unsigned ResultReg = FuncInfo.CreateRegs(I.getType());
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(OpC), ResultReg)
+      .addReg(Reg1).addReg(Reg2);
+
+    unsigned Opc = X86::SETBr;
+    if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
+      Opc = X86::SETOr;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg+1);
+
+    UpdateValueMap(&I, ResultReg, 2);
+    return true;
+  }
+  }
+}
+
+bool X86FastISel::FastLowerArguments() {
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (Subtarget->isTargetWin64())
+    return false;
+
+  const Function *F = FuncInfo.Fn;
+  if (F->isVarArg())
+    return false;
+
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC != CallingConv::C)
+    return false;
+  
+  if (!Subtarget->is64Bit())
+    return false;
+  
+  // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
+  unsigned Idx = 1;
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    if (Idx > 6)
+      return false;
+
+    if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+        F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+      return false;
+
+    Type *ArgTy = I->getType();
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+      return false;
+
+    EVT ArgVT = TLI.getValueType(ArgTy);
+    if (!ArgVT.isSimple()) return false;
+    switch (ArgVT.getSimpleVT().SimpleTy) {
+    case MVT::i32:
+    case MVT::i64:
+      break;
+    default:
+      return false;
+    }
+  }
+
+  static const uint16_t GPR32ArgRegs[] = {
+    X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
+  };
+  static const uint16_t GPR64ArgRegs[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
+  };
+
+  Idx = 0;
+  const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32);
+  const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64);
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++Idx) {
+    if (I->use_empty())
+      continue;
+    bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32;
+    const TargetRegisterClass *RC = is32Bit ? RC32 : RC64;
+    unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx];
+    unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+    // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+    // Without this, EmitLiveInCopies may eliminate the livein if its only
+    // use is a bitcast (which isn't turned into an instruction).
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(DstReg, getKillRegState(true));
+    UpdateValueMap(I, ResultReg);
+  }
+  return true;
+}
+
+bool X86FastISel::X86SelectCall(const Instruction *I) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm yet.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Handle intrinsic calls.
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI))
+    return X86VisitIntrinsicCall(*II);
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (cast<CallInst>(I)->isTailCall())
+    return false;
+
+  return DoSelectCall(I, 0);
+}
+
+static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget,
+                                           const ImmutableCallSite &CS) {
+  if (Subtarget.is64Bit())
+    return 0;
+  if (Subtarget.isTargetWindows())
+    return 0;
+  CallingConv::ID CC = CS.getCallingConv();
+  if (CC == CallingConv::Fast || CC == CallingConv::GHC)
+    return 0;
+  if (!CS.paramHasAttr(1, Attribute::StructRet))
+    return 0;
+  if (CS.paramHasAttr(1, Attribute::InReg))
+    return 0;
+  return 4;
+}
+
+// Select either a call, or an llvm.memcpy/memmove/memset intrinsic
+bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Handle only C and fastcc calling conventions for now.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+  if (CC != CallingConv::C && CC != CallingConv::Fast &&
+      CC != CallingConv::X86_FastCall)
+    return false;
+
+  // fastcc with -tailcallopt is intended to provide a guaranteed
+  // tail call optimization. Fastisel doesn't know how to do that.
+  if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
+    return false;
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  bool isVarArg = FTy->isVarArg();
+
+  // Don't know how to handle Win64 varargs yet.  Nothing special needed for
+  // x86-32.  Special handling for x86-64 is implemented.
+  if (isVarArg && Subtarget->isTargetWin64())
+    return false;
+
+  // Fast-isel doesn't know about callee-pop yet.
+  if (X86::isCalleePop(CC, Subtarget->is64Bit(), isVarArg,
+                       TM.Options.GuaranteedTailCallOpt))
+    return false;
+
+  // Check whether the function can return without sret-demotion.
+  SmallVector<ISD::OutputArg, 4> Outs;
+  GetReturnInfo(I->getType(), CS.getAttributes(), Outs, TLI);
+  bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
+                                           *FuncInfo.MF, FTy->isVarArg(),
+                                           Outs, FTy->getContext());
+  if (!CanLowerReturn)
+    return false;
+
+  // Materialize callee address in a register. FIXME: GV address can be
+  // handled with a CALLpcrel32 instead.
+  X86AddressMode CalleeAM;
+  if (!X86SelectCallAddress(Callee, CalleeAM))
+    return false;
+  unsigned CalleeOp = 0;
+  const GlobalValue *GV = 0;
+  if (CalleeAM.GV != 0) {
+    GV = CalleeAM.GV;
+  } else if (CalleeAM.Base.Reg != 0) {
+    CalleeOp = CalleeAM.Base.Reg;
+  } else
+    return false;
+
+  // Deal with call operands first.
+  SmallVector<const Value *, 8> ArgVals;
+  SmallVector<unsigned, 8> Args;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  unsigned arg_size = CS.arg_size();
+  Args.reserve(arg_size);
+  ArgVals.reserve(arg_size);
+  ArgVTs.reserve(arg_size);
+  ArgFlags.reserve(arg_size);
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    // If we're lowering a mem intrinsic instead of a regular call, skip the
+    // last two arguments, which should not passed to the underlying functions.
+    if (MemIntName && e-i <= 2)
+      break;
+    Value *ArgVal = *i;
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+    if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
+      PointerType *Ty = cast<PointerType>(ArgVal->getType());
+      Type *ElementTy = Ty->getElementType();
+      unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
+      unsigned FrameAlign = CS.getParamAlignment(AttrInd);
+      if (!FrameAlign)
+        FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+      Flags.setByVal();
+      Flags.setByValSize(FrameSize);
+      Flags.setByValAlign(FrameAlign);
+      if (!IsMemcpySmall(FrameSize))
+        return false;
+    }
+
+    if (CS.paramHasAttr(AttrInd, Attribute::InReg))
+      Flags.setInReg();
+    if (CS.paramHasAttr(AttrInd, Attribute::Nest))
+      Flags.setNest();
+
+    // If this is an i1/i8/i16 argument, promote to i32 to avoid an extra
+    // instruction.  This is safe because it is common to all fastisel supported
+    // calling conventions on x86.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(ArgVal)) {
+      if (CI->getBitWidth() == 1 || CI->getBitWidth() == 8 ||
+          CI->getBitWidth() == 16) {
+        if (Flags.isSExt())
+          ArgVal = ConstantExpr::getSExt(CI,Type::getInt32Ty(CI->getContext()));
+        else
+          ArgVal = ConstantExpr::getZExt(CI,Type::getInt32Ty(CI->getContext()));
+      }
+    }
+
+    unsigned ArgReg;
+
+    // Passing bools around ends up doing a trunc to i1 and passing it.
+    // Codegen this as an argument + "and 1".
+    if (ArgVal->getType()->isIntegerTy(1) && isa<TruncInst>(ArgVal) &&
+        cast<TruncInst>(ArgVal)->getParent() == I->getParent() &&
+        ArgVal->hasOneUse()) {
+      ArgVal = cast<TruncInst>(ArgVal)->getOperand(0);
+      ArgReg = getRegForValue(ArgVal);
+      if (ArgReg == 0) return false;
+
+      MVT ArgVT;
+      if (!isTypeLegal(ArgVal->getType(), ArgVT)) return false;
+
+      ArgReg = FastEmit_ri(ArgVT, ArgVT, ISD::AND, ArgReg,
+                           ArgVal->hasOneUse(), 1);
+    } else {
+      ArgReg = getRegForValue(ArgVal);
+    }
+
+    if (ArgReg == 0) return false;
+
+    Type *ArgTy = ArgVal->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT))
+      return false;
+    if (ArgVT == MVT::x86mmx)
+      return false;
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(ArgReg);
+    ArgVals.push_back(ArgVal);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, *FuncInfo.MF, TM, ArgLocs,
+                 I->getParent()->getContext());
+
+  // Allocate shadow area for Win64
+  if (Subtarget->isTargetWin64())
+    CCInfo.AllocateStack(32, 8);
+
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START
+  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackDown))
+    .addImm(NumBytes);
+
+  // Process argument: walk the register/memloc assignments, inserting
+  // copies / loads.
+  SmallVector<unsigned, 4> RegArgs;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    unsigned Arg = Args[VA.getValNo()];
+    EVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
+      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+                                       Arg, ArgVT, Arg);
+      assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::ZExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
+      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+                                       Arg, ArgVT, Arg);
+      assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::AExt: {
+      assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
+             "Unexpected extend");
+      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
+                                       Arg, ArgVT, Arg);
+      if (!Emitted)
+        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
+                                    Arg, ArgVT, Arg);
+      if (!Emitted)
+        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
+                                    Arg, ArgVT, Arg);
+
+      assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::BCvt: {
+      unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT(),
+                               ISD::BITCAST, Arg, /*TODO: Kill=*/false);
+      assert(BC != 0 && "Failed to emit a bitcast!");
+      Arg = BC;
+      ArgVT = VA.getLocVT();
+      break;
+    }
+    case CCValAssign::VExt: 
+      // VExt has not been implemented, so this should be impossible to reach
+      // for now.  However, fallback to Selection DAG isel once implemented.
+      return false;
+    case CCValAssign::Indirect:
+      // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
+      // support this.
+      return false;
+    }
+
+    if (VA.isRegLoc()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+              VA.getLocReg()).addReg(Arg);
+      RegArgs.push_back(VA.getLocReg());
+    } else {
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      X86AddressMode AM;
+      AM.Base.Reg = RegInfo->getStackRegister();
+      AM.Disp = LocMemOffset;
+      const Value *ArgVal = ArgVals[VA.getValNo()];
+      ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()];
+
+      if (Flags.isByVal()) {
+        X86AddressMode SrcAM;
+        SrcAM.Base.Reg = Arg;
+        bool Res = TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize());
+        assert(Res && "memcpy length already checked!"); (void)Res;
+      } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
+        // If this is a really simple value, emit this with the Value* version
+        // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
+        // as it can cause us to reevaluate the argument.
+        if (!X86FastEmitStore(ArgVT, ArgVal, AM))
+          return false;
+      } else {
+        if (!X86FastEmitStore(ArgVT, Arg, AM))
+          return false;
+      }
+    }
+  }
+
+  // ELF / PIC requires GOT in the EBX register before function calls via PLT
+  // GOT pointer.
+  if (Subtarget->isPICStyleGOT()) {
+    unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            X86::EBX).addReg(Base);
+  }
+
+  if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) {
+    // Count the number of XMM registers allocated.
+    static const uint16_t XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::MOV8ri),
+            X86::AL).addImm(NumXMMRegs);
+  }
+
+  // Issue the call.
+  MachineInstrBuilder MIB;
+  if (CalleeOp) {
+    // Register-indirect call.
+    unsigned CallOpc;
+    if (Subtarget->is64Bit())
+      CallOpc = X86::CALL64r;
+    else
+      CallOpc = X86::CALL32r;
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc))
+      .addReg(CalleeOp);
+
+  } else {
+    // Direct call.
+    assert(GV && "Not a direct call");
+    unsigned CallOpc;
+    if (Subtarget->is64Bit())
+      CallOpc = X86::CALL64pcrel32;
+    else
+      CallOpc = X86::CALLpcrel32;
+
+    // See if we need any target-specific flags on the GV operand.
+    unsigned char OpFlags = 0;
+
+    // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+    // external symbols most go through the PLT in PIC mode.  If the symbol
+    // has hidden or protected visibility, or if it is static or local, then
+    // we don't need to use the PLT - we can directly call it.
+    if (Subtarget->isTargetELF() &&
+        TM.getRelocationModel() == Reloc::PIC_ &&
+        GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+      OpFlags = X86II::MO_PLT;
+    } else if (Subtarget->isPICStyleStubAny() &&
+               (GV->isDeclaration() || GV->isWeakForLinker()) &&
+               (!Subtarget->getTargetTriple().isMacOSX() ||
+                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = X86II::MO_DARWIN_STUB;
+    }
+
+
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc));
+    if (MemIntName)
+      MIB.addExternalSymbol(MemIntName, OpFlags);
+    else
+      MIB.addGlobalAddress(GV, 0, OpFlags);
+  }
+
+  // Add a register mask with the call-preserved registers.
+  // Proper defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
+
+  // Add an implicit use GOT pointer in EBX.
+  if (Subtarget->isPICStyleGOT())
+    MIB.addReg(X86::EBX, RegState::Implicit);
+
+  if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64())
+    MIB.addReg(X86::AL, RegState::Implicit);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
+    MIB.addReg(RegArgs[i], RegState::Implicit);
+
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  const unsigned NumBytesCallee = computeBytesPoppedByCallee(*Subtarget, CS);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(AdjStackUp))
+    .addImm(NumBytes).addImm(NumBytesCallee);
+
+  // Build info for return calling conv lowering code.
+  // FIXME: This is practically a copy-paste from TargetLowering::LowerCallTo.
+  SmallVector<ISD::InputArg, 32> Ins;
+  SmallVector<EVT, 4> RetTys;
+  ComputeValueVTs(TLI, I->getType(), RetTys);
+  for (unsigned i = 0, e = RetTys.size(); i != e; ++i) {
+    EVT VT = RetTys[i];
+    MVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT);
+    unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT);
+    for (unsigned j = 0; j != NumRegs; ++j) {
+      ISD::InputArg MyFlags;
+      MyFlags.VT = RegisterVT;
+      MyFlags.Used = !CS.getInstruction()->use_empty();
+      if (CS.paramHasAttr(0, Attribute::SExt))
+        MyFlags.Flags.setSExt();
+      if (CS.paramHasAttr(0, Attribute::ZExt))
+        MyFlags.Flags.setZExt();
+      if (CS.paramHasAttr(0, Attribute::InReg))
+        MyFlags.Flags.setInReg();
+      Ins.push_back(MyFlags);
+    }
+  }
+
+  // Now handle call return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CC, false, *FuncInfo.MF, TM, RVLocs,
+                    I->getParent()->getContext());
+  unsigned ResultReg = FuncInfo.CreateRegs(I->getType());
+  CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    EVT CopyVT = RVLocs[i].getValVT();
+    unsigned CopyReg = ResultReg + i;
+
+    // If this is a call to a function that returns an fp value on the x87 fp
+    // stack, but where we prefer to use the value in xmm registers, copy it
+    // out as F80 and use a truncate to move it from fp stack reg to xmm reg.
+    if ((RVLocs[i].getLocReg() == X86::ST0 ||
+         RVLocs[i].getLocReg() == X86::ST1)) {
+      if (isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) {
+        CopyVT = MVT::f80;
+        CopyReg = createResultReg(&X86::RFP80RegClass);
+      }
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::FpPOP_RETVAL),
+              CopyReg);
+    } else {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+              CopyReg).addReg(RVLocs[i].getLocReg());
+      UsedRegs.push_back(RVLocs[i].getLocReg());
+    }
+
+    if (CopyVT != RVLocs[i].getValVT()) {
+      // Round the F80 the right size, which also moves to the appropriate xmm
+      // register. This is accomplished by storing the F80 value in memory and
+      // then loading it back. Ewww...
+      EVT ResVT = RVLocs[i].getValVT();
+      unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
+      unsigned MemSize = ResVT.getSizeInBits()/8;
+      int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                TII.get(Opc)), FI)
+        .addReg(CopyReg);
+      Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
+      addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                TII.get(Opc), ResultReg + i), FI);
+    }
+  }
+
+  if (RVLocs.size())
+    UpdateValueMap(I, ResultReg, RVLocs.size());
+
+  // Set all unused physreg defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+
+bool
+X86FastISel::TargetSelectInstruction(const Instruction *I)  {
+  switch (I->getOpcode()) {
+  default: break;
+  case Instruction::Load:
+    return X86SelectLoad(I);
+  case Instruction::Store:
+    return X86SelectStore(I);
+  case Instruction::Ret:
+    return X86SelectRet(I);
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return X86SelectCmp(I);
+  case Instruction::ZExt:
+    return X86SelectZExt(I);
+  case Instruction::Br:
+    return X86SelectBranch(I);
+  case Instruction::Call:
+    return X86SelectCall(I);
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::Shl:
+    return X86SelectShift(I);
+  case Instruction::Select:
+    return X86SelectSelect(I);
+  case Instruction::Trunc:
+    return X86SelectTrunc(I);
+  case Instruction::FPExt:
+    return X86SelectFPExt(I);
+  case Instruction::FPTrunc:
+    return X86SelectFPTrunc(I);
+  case Instruction::IntToPtr: // Deliberate fall-through.
+  case Instruction::PtrToInt: {
+    EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+    EVT DstVT = TLI.getValueType(I->getType());
+    if (DstVT.bitsGT(SrcVT))
+      return X86SelectZExt(I);
+    if (DstVT.bitsLT(SrcVT))
+      return X86SelectTrunc(I);
+    unsigned Reg = getRegForValue(I->getOperand(0));
+    if (Reg == 0) return false;
+    UpdateValueMap(I, Reg);
+    return true;
+  }
+  }
+
+  return false;
+}
+
+unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
+  MVT VT;
+  if (!isTypeLegal(C->getType(), VT))
+    return 0;
+
+  // Can't handle alternate code models yet.
+  if (TM.getCodeModel() != CodeModel::Small)
+    return 0;
+
+  // Get opcode and regclass of the output for the given load instruction.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  switch (VT.SimpleTy) {
+  default: return 0;
+  case MVT::i8:
+    Opc = X86::MOV8rm;
+    RC  = &X86::GR8RegClass;
+    break;
+  case MVT::i16:
+    Opc = X86::MOV16rm;
+    RC  = &X86::GR16RegClass;
+    break;
+  case MVT::i32:
+    Opc = X86::MOV32rm;
+    RC  = &X86::GR32RegClass;
+    break;
+  case MVT::i64:
+    // Must be in x86-64 mode.
+    Opc = X86::MOV64rm;
+    RC  = &X86::GR64RegClass;
+    break;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp32m;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp64m;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return 0;
+  }
+
+  // Materialize addresses with LEA instructions.
+  if (isa<GlobalValue>(C)) {
+    X86AddressMode AM;
+    if (X86SelectAddress(C, AM)) {
+      // If the expression is just a basereg, then we're done, otherwise we need
+      // to emit an LEA.
+      if (AM.BaseType == X86AddressMode::RegBase &&
+          AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0)
+        return AM.Base.Reg;
+
+      Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
+      unsigned ResultReg = createResultReg(RC);
+      addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                             TII.get(Opc), ResultReg), AM);
+      return ResultReg;
+    }
+    return 0;
+  }
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = TD.getPrefTypeAlignment(C->getType());
+  if (Align == 0) {
+    // Alignment of vector types.  FIXME!
+    Align = TD.getTypeAllocSize(C->getType());
+  }
+
+  // x86-32 PIC requires a PIC base register for constant pools.
+  unsigned PICBase = 0;
+  unsigned char OpFlag = 0;
+  if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOTOFF;
+    PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+  } else if (Subtarget->isPICStyleRIPRel() &&
+             TM.getCodeModel() == CodeModel::Small) {
+    PICBase = X86::RIP;
+  }
+
+  // Create the load from the constant pool.
+  unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
+  unsigned ResultReg = createResultReg(RC);
+  addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                   TII.get(Opc), ResultReg),
+                           MCPOffset, PICBase, OpFlag);
+
+  return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) {
+  // Fail on dynamic allocas. At this point, getRegForValue has already
+  // checked its CSE maps, so if we're here trying to handle a dynamic
+  // alloca, we're not going to succeed. X86SelectAddress has a
+  // check for dynamic allocas, because it's called directly from
+  // various places, but TargetMaterializeAlloca also needs a check
+  // in order to avoid recursion between getRegForValue,
+  // X86SelectAddrss, and TargetMaterializeAlloca.
+  if (!FuncInfo.StaticAllocaMap.count(C))
+    return 0;
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(C, AM))
+    return 0;
+  unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+  const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+  unsigned ResultReg = createResultReg(RC);
+  addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                         TII.get(Opc), ResultReg), AM);
+  return ResultReg;
+}
+
+unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
+  MVT VT;
+  if (!isTypeLegal(CF->getType(), VT))
+    return 0;
+
+  // Get opcode and regclass for the given zero.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  switch (VT.SimpleTy) {
+  default: return 0;
+  case MVT::f32:
+    if (X86ScalarSSEf32) {
+      Opc = X86::FsFLD0SS;
+      RC  = &X86::FR32RegClass;
+    } else {
+      Opc = X86::LD_Fp032;
+      RC  = &X86::RFP32RegClass;
+    }
+    break;
+  case MVT::f64:
+    if (X86ScalarSSEf64) {
+      Opc = X86::FsFLD0SD;
+      RC  = &X86::FR64RegClass;
+    } else {
+      Opc = X86::LD_Fp064;
+      RC  = &X86::RFP64RegClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return 0;
+  }
+
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg);
+  return ResultReg;
+}
+
+
+/// TryToFoldLoad - The specified machine instr operand is a vreg, and that
+/// vreg is being provided by the specified load instruction.  If possible,
+/// try to fold the load as an operand to the instruction, returning true if
+/// possible.
+bool X86FastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
+                                const LoadInst *LI) {
+  X86AddressMode AM;
+  if (!X86SelectAddress(LI->getOperand(0), AM))
+    return false;
+
+  const X86InstrInfo &XII = (const X86InstrInfo&)TII;
+
+  unsigned Size = TD.getTypeAllocSize(LI->getType());
+  unsigned Alignment = LI->getAlignment();
+
+  SmallVector<MachineOperand, 8> AddrOps;
+  AM.getFullAddress(AddrOps);
+
+  MachineInstr *Result =
+    XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
+  if (Result == 0) return false;
+
+  FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
+  MI->eraseFromParent();
+  return true;
+}
+
+
+namespace llvm {
+  FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo,
+                                const TargetLibraryInfo *libInfo) {
+    return new X86FastISel(funcInfo, libInfo);
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 000000000000..0585b43a4640
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1768 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// pseudo registers into register stack instructions.  This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// The x87 hardware tracks liveness of the stack registers, so it is necessary
+// to implement exact liveness tracking between basic blocks. The CFG edges are
+// partitioned into bundles where the same FP registers must be live in
+// identical stack positions. Instructions are inserted at the end of each basic
+// block to rearrange the live registers to match the outgoing bundle.
+//
+// This approach avoids splitting critical edges at the potential cost of more
+// live register shuffling instructions when critical edges are present.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP  , "Number of floating point instructions");
+
+namespace {
+  struct FPS : public MachineFunctionPass {
+    static char ID;
+    FPS() : MachineFunctionPass(ID) {
+      initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
+      // This is really only to keep valgrind quiet.
+      // The logic in isLive() is too much for it.
+      memset(Stack, 0, sizeof(Stack));
+      memset(RegMap, 0, sizeof(RegMap));
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<EdgeBundles>();
+      AU.addPreservedID(MachineLoopInfoID);
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+
+    // Two CFG edges are related if they leave the same block, or enter the same
+    // block. The transitive closure of an edge under this relation is a
+    // LiveBundle. It represents a set of CFG edges where the live FP stack
+    // registers must be allocated identically in the x87 stack.
+    //
+    // A LiveBundle is usually all the edges leaving a block, or all the edges
+    // entering a block, but it can contain more edges if critical edges are
+    // present.
+    //
+    // The set of live FP registers in a LiveBundle is calculated by bundleCFG,
+    // but the exact mapping of FP registers to stack slots is fixed later.
+    struct LiveBundle {
+      // Bit mask of live FP registers. Bit 0 = FP0, bit 1 = FP1, &c.
+      unsigned Mask;
+
+      // Number of pre-assigned live registers in FixStack. This is 0 when the
+      // stack order has not yet been fixed.
+      unsigned FixCount;
+
+      // Assigned stack order for live-in registers.
+      // FixStack[i] == getStackEntry(i) for all i < FixCount.
+      unsigned char FixStack[8];
+
+      LiveBundle() : Mask(0), FixCount(0) {}
+
+      // Have the live registers been assigned a stack order yet?
+      bool isFixed() const { return !Mask || FixCount; }
+    };
+
+    // Numbered LiveBundle structs. LiveBundles[0] is used for all CFG edges
+    // with no live FP registers.
+    SmallVector<LiveBundle, 8> LiveBundles;
+
+    // The edge bundle analysis provides indices into the LiveBundles vector.
+    EdgeBundles *Bundles;
+
+    // Return a bitmask of FP registers in block's live-in list.
+    static unsigned calcLiveInMask(MachineBasicBlock *MBB) {
+      unsigned Mask = 0;
+      for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
+           E = MBB->livein_end(); I != E; ++I) {
+        unsigned Reg = *I - X86::FP0;
+        if (Reg < 8)
+          Mask |= 1 << Reg;
+      }
+      return Mask;
+    }
+
+    // Partition all the CFG edges into LiveBundles.
+    void bundleCFG(MachineFunction &MF);
+
+    MachineBasicBlock *MBB;     // Current basic block
+
+    // The hardware keeps track of how many FP registers are live, so we have
+    // to model that exactly. Usually, each live register corresponds to an
+    // FP<n> register, but when dealing with calls, returns, and inline
+    // assembly, it is sometimes necessary to have live scratch registers.
+    unsigned Stack[8];          // FP<n> Registers in each stack slot...
+    unsigned StackTop;          // The current top of the FP stack.
+
+    enum {
+      NumFPRegs = 16            // Including scratch pseudo-registers.
+    };
+
+    // For each live FP<n> register, point to its Stack[] entry.
+    // The first entries correspond to FP0-FP6, the rest are scratch registers
+    // used when we need slightly different live registers than what the
+    // register allocator thinks.
+    unsigned RegMap[NumFPRegs];
+
+    // Pending fixed registers - Inline assembly needs FP registers to appear
+    // in fixed stack slot positions. This is handled by copying FP registers
+    // to ST registers before the instruction, and copying back after the
+    // instruction.
+    //
+    // This is modeled with pending ST registers. NumPendingSTs is the number
+    // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP
+    // register that holds the ST value. The ST registers are not moved into
+    // place until immediately before the instruction that needs them.
+    //
+    // It can happen that we need an ST register to be live when no FP register
+    // holds the value:
+    //
+    //   %ST0 = COPY %FP4<kill>
+    //
+    // When that happens, we allocate a scratch FP register to hold the ST
+    // value. That means every register in PendingST must be live.
+
+    unsigned NumPendingSTs;
+    unsigned char PendingST[8];
+
+    // Set up our stack model to match the incoming registers to MBB.
+    void setupBlockStack();
+
+    // Shuffle live registers to match the expectations of successor blocks.
+    void finishBlockStack();
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dumpStack() const {
+      dbgs() << "Stack contents:";
+      for (unsigned i = 0; i != StackTop; ++i) {
+        dbgs() << " FP" << Stack[i];
+        assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+      }
+      for (unsigned i = 0; i != NumPendingSTs; ++i)
+        dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
+      dbgs() << "\n";
+    }
+#endif
+
+    /// getSlot - Return the stack slot number a particular register number is
+    /// in.
+    unsigned getSlot(unsigned RegNo) const {
+      assert(RegNo < NumFPRegs && "Regno out of range!");
+      return RegMap[RegNo];
+    }
+
+    /// isLive - Is RegNo currently live in the stack?
+    bool isLive(unsigned RegNo) const {
+      unsigned Slot = getSlot(RegNo);
+      return Slot < StackTop && Stack[Slot] == RegNo;
+    }
+
+    /// getScratchReg - Return an FP register that is not currently in use.
+    unsigned getScratchReg() const {
+      for (int i = NumFPRegs - 1; i >= 8; --i)
+        if (!isLive(i))
+          return i;
+      llvm_unreachable("Ran out of scratch FP registers");
+    }
+
+    /// isScratchReg - Returns trus if RegNo is a scratch FP register.
+    static bool isScratchReg(unsigned RegNo) {
+      return RegNo > 8 && RegNo < NumFPRegs;
+    }
+
+    /// getStackEntry - Return the X86::FP<n> register in register ST(i).
+    unsigned getStackEntry(unsigned STi) const {
+      if (STi >= StackTop)
+        report_fatal_error("Access past stack top!");
+      return Stack[StackTop-1-STi];
+    }
+
+    /// getSTReg - Return the X86::ST(i) register which contains the specified
+    /// FP<RegNo> register.
+    unsigned getSTReg(unsigned RegNo) const {
+      return StackTop - 1 - getSlot(RegNo) + X86::ST0;
+    }
+
+    // pushReg - Push the specified FP<n> register onto the stack.
+    void pushReg(unsigned Reg) {
+      assert(Reg < NumFPRegs && "Register number out of range!");
+      if (StackTop >= 8)
+        report_fatal_error("Stack overflow!");
+      Stack[StackTop] = Reg;
+      RegMap[Reg] = StackTop++;
+    }
+
+    bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+    void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+      DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+      if (isAtTop(RegNo)) return;
+
+      unsigned STReg = getSTReg(RegNo);
+      unsigned RegOnTop = getStackEntry(0);
+
+      // Swap the slots the regs are in.
+      std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+      // Swap stack slot contents.
+      if (RegMap[RegOnTop] >= StackTop)
+        report_fatal_error("Access past stack top!");
+      std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+      // Emit an fxch to update the runtime processors version of the state.
+      BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+      ++NumFXCH;
+    }
+
+    void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+      DebugLoc dl = I == MBB->end() ? DebugLoc() : I->getDebugLoc();
+      unsigned STReg = getSTReg(RegNo);
+      pushReg(AsReg);   // New register on top of stack
+
+      BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+    }
+
+    /// duplicatePendingSTBeforeKill - The instruction at I is about to kill
+    /// RegNo. If any PendingST registers still need the RegNo value, duplicate
+    /// them to new scratch registers.
+    void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) {
+      for (unsigned i = 0; i != NumPendingSTs; ++i) {
+        if (PendingST[i] != RegNo)
+          continue;
+        unsigned SR = getScratchReg();
+        DEBUG(dbgs() << "Duplicating pending ST" << i
+                     << " in FP" << RegNo << " to FP" << SR << '\n');
+        duplicateToTop(RegNo, SR, I);
+        PendingST[i] = SR;
+      }
+    }
+
+    /// popStackAfter - Pop the current value off of the top of the FP stack
+    /// after the specified instruction.
+    void popStackAfter(MachineBasicBlock::iterator &I);
+
+    /// freeStackSlotAfter - Free the specified register from the register
+    /// stack, so that it is no longer in a register.  If the register is
+    /// currently at the top of the stack, we just pop the current instruction,
+    /// otherwise we store the current top-of-stack into the specified slot,
+    /// then pop the top of stack.
+    void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+    /// freeStackSlotBefore - Just the pop, no folding. Return the inserted
+    /// instruction.
+    MachineBasicBlock::iterator
+    freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo);
+
+    /// Adjust the live registers to be the set in Mask.
+    void adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I);
+
+    /// Shuffle the top FixCount stack entries such that FP reg FixStack[0] is
+    /// st(0), FP reg FixStack[1] is st(1) etc.
+    void shuffleStackTop(const unsigned char *FixStack, unsigned FixCount,
+                         MachineBasicBlock::iterator I);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    void handleZeroArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+    void handleTwoArgFP(MachineBasicBlock::iterator &I);
+    void handleCompareFP(MachineBasicBlock::iterator &I);
+    void handleCondMovFP(MachineBasicBlock::iterator &I);
+    void handleSpecialFP(MachineBasicBlock::iterator &I);
+
+    // Check if a COPY instruction is using FP registers.
+    static bool isFPCopy(MachineInstr *MI) {
+      unsigned DstReg = MI->getOperand(0).getReg();
+      unsigned SrcReg = MI->getOperand(1).getReg();
+
+      return X86::RFP80RegClass.contains(DstReg) ||
+        X86::RFP80RegClass.contains(SrcReg);
+    }
+  };
+  char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Return the X86::FPx register number for the specified operand.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+  assert(MO.isReg() && "Expected an FP register!");
+  unsigned Reg = MO.getReg();
+  assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+  return Reg - X86::FP0;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+  // We only need to run this pass if there are any FP registers used in this
+  // function.  If it is all integer, there is nothing for us to do!
+  bool FPIsUsed = false;
+
+  assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+  for (unsigned i = 0; i <= 6; ++i)
+    if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
+      FPIsUsed = true;
+      break;
+    }
+
+  // Early exit.
+  if (!FPIsUsed) return false;
+
+  Bundles = &getAnalysis<EdgeBundles>();
+  TII = MF.getTarget().getInstrInfo();
+
+  // Prepare cross-MBB liveness.
+  bundleCFG(MF);
+
+  StackTop = 0;
+
+  // Process the function in depth first order so that we process at least one
+  // of the predecessors for every reachable block in the function.
+  SmallPtrSet<MachineBasicBlock*, 8> Processed;
+  MachineBasicBlock *Entry = MF.begin();
+
+  bool Changed = false;
+  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> >
+         I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+       I != E; ++I)
+    Changed |= processBasicBlock(MF, **I);
+
+  // Process any unreachable blocks in arbitrary order now.
+  if (MF.size() != Processed.size())
+    for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+      if (Processed.insert(BB))
+        Changed |= processBasicBlock(MF, *BB);
+
+  LiveBundles.clear();
+
+  return Changed;
+}
+
+/// bundleCFG - Scan all the basic blocks to determine consistent live-in and
+/// live-out sets for the FP registers. Consistent means that the set of
+/// registers live-out from a block is identical to the live-in set of all
+/// successors. This is not enforced by the normal live-in lists since
+/// registers may be implicitly defined, or not used by all successors.
+void FPS::bundleCFG(MachineFunction &MF) {
+  assert(LiveBundles.empty() && "Stale data in LiveBundles");
+  LiveBundles.resize(Bundles->getNumBundles());
+
+  // Gather the actual live-in masks for all MBBs.
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    MachineBasicBlock *MBB = I;
+    const unsigned Mask = calcLiveInMask(MBB);
+    if (!Mask)
+      continue;
+    // Update MBB ingoing bundle mask.
+    LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask;
+  }
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+  NumPendingSTs = 0;
+
+  setupBlockStack();
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    uint64_t Flags = MI->getDesc().TSFlags;
+
+    unsigned FPInstClass = Flags & X86II::FPTypeMask;
+    if (MI->isInlineAsm())
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI->isCopy() && isFPCopy(MI))
+      FPInstClass = X86II::SpecialFP;
+
+    if (MI->isImplicitDef() &&
+        X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
+      FPInstClass = X86II::SpecialFP;
+
+    if (FPInstClass == X86II::NotFP)
+      continue;  // Efficiently ignore non-fp insts!
+
+    MachineInstr *PrevMI = 0;
+    if (I != BB.begin())
+      PrevMI = prior(I);
+
+    ++NumFP;  // Keep track of # of pseudo instrs
+    DEBUG(dbgs() << "\nFPInst:\t" << *MI);
+
+    // Get dead variables list now because the MI pointer may be deleted as part
+    // of processing!
+    SmallVector<unsigned, 8> DeadRegs;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isDead())
+        DeadRegs.push_back(MO.getReg());
+    }
+
+    switch (FPInstClass) {
+    case X86II::ZeroArgFP:  handleZeroArgFP(I); break;
+    case X86II::OneArgFP:   handleOneArgFP(I);  break;  // fstp ST(0)
+    case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+    case X86II::TwoArgFP:   handleTwoArgFP(I);  break;
+    case X86II::CompareFP:  handleCompareFP(I); break;
+    case X86II::CondMovFP:  handleCondMovFP(I); break;
+    case X86II::SpecialFP:  handleSpecialFP(I); break;
+    default: llvm_unreachable("Unknown FP Type!");
+    }
+
+    // Check to see if any of the values defined by this instruction are dead
+    // after definition.  If so, pop them.
+    for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+      unsigned Reg = DeadRegs[i];
+      if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+        DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+        freeStackSlotAfter(I, Reg-X86::FP0);
+      }
+    }
+
+    // Print out all of the instructions expanded to if -debug
+    DEBUG(
+      MachineBasicBlock::iterator PrevI(PrevMI);
+      if (I == PrevI) {
+        dbgs() << "Just deleted pseudo instruction\n";
+      } else {
+        MachineBasicBlock::iterator Start = I;
+        // Rewind to first instruction newly inserted.
+        while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+        dbgs() << "Inserted instructions:\n\t";
+        Start->print(dbgs(), &MF.getTarget());
+        while (++Start != llvm::next(I)) {}
+      }
+      dumpStack();
+    );
+    (void)PrevMI;
+
+    Changed = true;
+  }
+
+  finishBlockStack();
+
+  return Changed;
+}
+
+/// setupBlockStack - Use the live bundles to set up our model of the stack
+/// to match predecessors' live out stack.
+void FPS::setupBlockStack() {
+  DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber()
+               << " derived from " << MBB->getName() << ".\n");
+  StackTop = 0;
+  // Get the live-in bundle for MBB.
+  const LiveBundle &Bundle =
+    LiveBundles[Bundles->getBundle(MBB->getNumber(), false)];
+
+  if (!Bundle.Mask) {
+    DEBUG(dbgs() << "Block has no FP live-ins.\n");
+    return;
+  }
+
+  // Depth-first iteration should ensure that we always have an assigned stack.
+  assert(Bundle.isFixed() && "Reached block before any predecessors");
+
+  // Push the fixed live-in registers.
+  for (unsigned i = Bundle.FixCount; i > 0; --i) {
+    MBB->addLiveIn(X86::ST0+i-1);
+    DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP"
+                 << unsigned(Bundle.FixStack[i-1]) << '\n');
+    pushReg(Bundle.FixStack[i-1]);
+  }
+
+  // Kill off unwanted live-ins. This can happen with a critical edge.
+  // FIXME: We could keep these live registers around as zombies. They may need
+  // to be revived at the end of a short block. It might save a few instrs.
+  adjustLiveRegs(calcLiveInMask(MBB), MBB->begin());
+  DEBUG(MBB->dump());
+}
+
+/// finishBlockStack - Revive live-outs that are implicitly defined out of
+/// MBB. Shuffle live registers to match the expected fixed stack of any
+/// predecessors, and ensure that all predecessors are expecting the same
+/// stack.
+void FPS::finishBlockStack() {
+  // The RET handling below takes care of return blocks for us.
+  if (MBB->succ_empty())
+    return;
+
+  DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber()
+               << " derived from " << MBB->getName() << ".\n");
+
+  // Get MBB's live-out bundle.
+  unsigned BundleIdx = Bundles->getBundle(MBB->getNumber(), true);
+  LiveBundle &Bundle = LiveBundles[BundleIdx];
+
+  // We may need to kill and define some registers to match successors.
+  // FIXME: This can probably be combined with the shuffle below.
+  MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+  adjustLiveRegs(Bundle.Mask, Term);
+
+  if (!Bundle.Mask) {
+    DEBUG(dbgs() << "No live-outs.\n");
+    return;
+  }
+
+  // Has the stack order been fixed yet?
+  DEBUG(dbgs() << "LB#" << BundleIdx << ": ");
+  if (Bundle.isFixed()) {
+    DEBUG(dbgs() << "Shuffling stack to match.\n");
+    shuffleStackTop(Bundle.FixStack, Bundle.FixCount, Term);
+  } else {
+    // Not fixed yet, we get to choose.
+    DEBUG(dbgs() << "Fixing stack order now.\n");
+    Bundle.FixCount = StackTop;
+    for (unsigned i = 0; i < StackTop; ++i)
+      Bundle.FixStack[i] = getStackEntry(i);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct TableEntry {
+    uint16_t from;
+    uint16_t to;
+    bool operator<(const TableEntry &TE) const { return from < TE.from; }
+    friend bool operator<(const TableEntry &TE, unsigned V) {
+      return TE.from < V;
+    }
+    friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+                                                const TableEntry &TE) {
+      return V < TE.from;
+    }
+  };
+}
+
+#ifndef NDEBUG
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+  for (unsigned i = 0; i != NumEntries-1; ++i)
+    if (!(Table[i] < Table[i+1])) return false;
+  return true;
+}
+#endif
+
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+  const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+  if (I != Table+N && I->from == Opcode)
+    return I->to;
+  return -1;
+}
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE)                                              \
+  { static bool TABLE##Checked = false;                                   \
+    if (!TABLE##Checked) {                                                \
+       assert(TableIsSorted(TABLE, array_lengthof(TABLE)) &&              \
+              "All lookup tables must be sorted for efficient access!");  \
+       TABLE##Checked = true;                                             \
+    }                                                                     \
+  }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is an register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+  { X86::ABS_Fp32     , X86::ABS_F     },
+  { X86::ABS_Fp64     , X86::ABS_F     },
+  { X86::ABS_Fp80     , X86::ABS_F     },
+  { X86::ADD_Fp32m    , X86::ADD_F32m  },
+  { X86::ADD_Fp64m    , X86::ADD_F64m  },
+  { X86::ADD_Fp64m32  , X86::ADD_F32m  },
+  { X86::ADD_Fp80m32  , X86::ADD_F32m  },
+  { X86::ADD_Fp80m64  , X86::ADD_F64m  },
+  { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+  { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+  { X86::CHS_Fp32     , X86::CHS_F     },
+  { X86::CHS_Fp64     , X86::CHS_F     },
+  { X86::CHS_Fp80     , X86::CHS_F     },
+  { X86::CMOVBE_Fp32  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp64  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp80  , X86::CMOVBE_F  },
+  { X86::CMOVB_Fp32   , X86::CMOVB_F   },
+  { X86::CMOVB_Fp64   , X86::CMOVB_F  },
+  { X86::CMOVB_Fp80   , X86::CMOVB_F  },
+  { X86::CMOVE_Fp32   , X86::CMOVE_F  },
+  { X86::CMOVE_Fp64   , X86::CMOVE_F   },
+  { X86::CMOVE_Fp80   , X86::CMOVE_F   },
+  { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+  { X86::CMOVNB_Fp32  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp64  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp80  , X86::CMOVNB_F  },
+  { X86::CMOVNE_Fp32  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp64  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp80  , X86::CMOVNE_F  },
+  { X86::CMOVNP_Fp32  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp64  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp80  , X86::CMOVNP_F  },
+  { X86::CMOVP_Fp32   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp64   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp80   , X86::CMOVP_F   },
+  { X86::COS_Fp32     , X86::COS_F     },
+  { X86::COS_Fp64     , X86::COS_F     },
+  { X86::COS_Fp80     , X86::COS_F     },
+  { X86::DIVR_Fp32m   , X86::DIVR_F32m },
+  { X86::DIVR_Fp64m   , X86::DIVR_F64m },
+  { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+  { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+  { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+  { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+  { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+  { X86::DIV_Fp32m    , X86::DIV_F32m  },
+  { X86::DIV_Fp64m    , X86::DIV_F64m  },
+  { X86::DIV_Fp64m32  , X86::DIV_F32m  },
+  { X86::DIV_Fp80m32  , X86::DIV_F32m  },
+  { X86::DIV_Fp80m64  , X86::DIV_F64m  },
+  { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+  { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+  { X86::ILD_Fp16m32  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m64  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m80  , X86::ILD_F16m  },
+  { X86::ILD_Fp32m32  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m64  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m80  , X86::ILD_F32m  },
+  { X86::ILD_Fp64m32  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m64  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m80  , X86::ILD_F64m  },
+  { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+  { X86::IST_Fp16m32  , X86::IST_F16m  },
+  { X86::IST_Fp16m64  , X86::IST_F16m  },
+  { X86::IST_Fp16m80  , X86::IST_F16m  },
+  { X86::IST_Fp32m32  , X86::IST_F32m  },
+  { X86::IST_Fp32m64  , X86::IST_F32m  },
+  { X86::IST_Fp32m80  , X86::IST_F32m  },
+  { X86::IST_Fp64m32  , X86::IST_FP64m },
+  { X86::IST_Fp64m64  , X86::IST_FP64m },
+  { X86::IST_Fp64m80  , X86::IST_FP64m },
+  { X86::LD_Fp032     , X86::LD_F0     },
+  { X86::LD_Fp064     , X86::LD_F0     },
+  { X86::LD_Fp080     , X86::LD_F0     },
+  { X86::LD_Fp132     , X86::LD_F1     },
+  { X86::LD_Fp164     , X86::LD_F1     },
+  { X86::LD_Fp180     , X86::LD_F1     },
+  { X86::LD_Fp32m     , X86::LD_F32m   },
+  { X86::LD_Fp32m64   , X86::LD_F32m   },
+  { X86::LD_Fp32m80   , X86::LD_F32m   },
+  { X86::LD_Fp64m     , X86::LD_F64m   },
+  { X86::LD_Fp64m80   , X86::LD_F64m   },
+  { X86::LD_Fp80m     , X86::LD_F80m   },
+  { X86::MUL_Fp32m    , X86::MUL_F32m  },
+  { X86::MUL_Fp64m    , X86::MUL_F64m  },
+  { X86::MUL_Fp64m32  , X86::MUL_F32m  },
+  { X86::MUL_Fp80m32  , X86::MUL_F32m  },
+  { X86::MUL_Fp80m64  , X86::MUL_F64m  },
+  { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+  { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+  { X86::SIN_Fp32     , X86::SIN_F     },
+  { X86::SIN_Fp64     , X86::SIN_F     },
+  { X86::SIN_Fp80     , X86::SIN_F     },
+  { X86::SQRT_Fp32    , X86::SQRT_F    },
+  { X86::SQRT_Fp64    , X86::SQRT_F    },
+  { X86::SQRT_Fp80    , X86::SQRT_F    },
+  { X86::ST_Fp32m     , X86::ST_F32m   },
+  { X86::ST_Fp64m     , X86::ST_F64m   },
+  { X86::ST_Fp64m32   , X86::ST_F32m   },
+  { X86::ST_Fp80m32   , X86::ST_F32m   },
+  { X86::ST_Fp80m64   , X86::ST_F64m   },
+  { X86::ST_FpP80m    , X86::ST_FP80m  },
+  { X86::SUBR_Fp32m   , X86::SUBR_F32m },
+  { X86::SUBR_Fp64m   , X86::SUBR_F64m },
+  { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+  { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+  { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+  { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+  { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+  { X86::SUB_Fp32m    , X86::SUB_F32m  },
+  { X86::SUB_Fp64m    , X86::SUB_F64m  },
+  { X86::SUB_Fp64m32  , X86::SUB_F32m  },
+  { X86::SUB_Fp80m32  , X86::SUB_F32m  },
+  { X86::SUB_Fp80m64  , X86::SUB_F64m  },
+  { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+  { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+  { X86::TST_Fp32     , X86::TST_F     },
+  { X86::TST_Fp64     , X86::TST_F     },
+  { X86::TST_Fp80     , X86::TST_F     },
+  { X86::UCOM_FpIr32  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr64  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr80  , X86::UCOM_FIr  },
+  { X86::UCOM_Fpr32   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr64   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr80   , X86::UCOM_Fr   },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+  ASSERT_SORTED(OpcodeTable);
+  int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode);
+  assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+  return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version.  The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+  { X86::ADD_FrST0 , X86::ADD_FPrST0  },
+
+  { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+  { X86::DIV_FrST0 , X86::DIV_FPrST0  },
+
+  { X86::IST_F16m  , X86::IST_FP16m   },
+  { X86::IST_F32m  , X86::IST_FP32m   },
+
+  { X86::MUL_FrST0 , X86::MUL_FPrST0  },
+
+  { X86::ST_F32m   , X86::ST_FP32m    },
+  { X86::ST_F64m   , X86::ST_FP64m    },
+  { X86::ST_Frr    , X86::ST_FPrr     },
+
+  { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+  { X86::SUB_FrST0 , X86::SUB_FPrST0  },
+
+  { X86::UCOM_FIr  , X86::UCOM_FIPr   },
+
+  { X86::UCOM_FPr  , X86::UCOM_FPPr   },
+  { X86::UCOM_Fr   , X86::UCOM_FPr    },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction.  This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible.  The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+  MachineInstr* MI = I;
+  DebugLoc dl = MI->getDebugLoc();
+  ASSERT_SORTED(PopTable);
+  if (StackTop == 0)
+    report_fatal_error("Cannot pop empty stack!");
+  RegMap[Stack[--StackTop]] = ~0;     // Update state
+
+  // Check to see if there is a popping version of this instruction...
+  int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode());
+  if (Opcode != -1) {
+    I->setDesc(TII->get(Opcode));
+    if (Opcode == X86::UCOM_FPPr)
+      I->RemoveOperand(0);
+  } else {    // Insert an explicit pop
+    I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+  }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register.  If the register is currently at the top
+/// of the stack, we just pop the current instruction, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+  if (getStackEntry(0) == FPRegNo) {  // already at the top of stack? easy.
+    popStackAfter(I);
+    return;
+  }
+
+  // Otherwise, store the top of stack into the dead slot, killing the operand
+  // without having to add in an explicit xchg then pop.
+  //
+  I = freeStackSlotBefore(++I, FPRegNo);
+}
+
+/// freeStackSlotBefore - Free the specified register without trying any
+/// folding.
+MachineBasicBlock::iterator
+FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) {
+  unsigned STReg    = getSTReg(FPRegNo);
+  unsigned OldSlot  = getSlot(FPRegNo);
+  unsigned TopReg   = Stack[StackTop-1];
+  Stack[OldSlot]    = TopReg;
+  RegMap[TopReg]    = OldSlot;
+  RegMap[FPRegNo]   = ~0;
+  Stack[--StackTop] = ~0;
+  return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)).addReg(STReg);
+}
+
+/// adjustLiveRegs - Kill and revive registers such that exactly the FP
+/// registers with a bit in Mask are live.
+void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
+  unsigned Defs = Mask;
+  unsigned Kills = 0;
+  for (unsigned i = 0; i < StackTop; ++i) {
+    unsigned RegNo = Stack[i];
+    if (!(Defs & (1 << RegNo)))
+      // This register is live, but we don't want it.
+      Kills |= (1 << RegNo);
+    else
+      // We don't need to imp-def this live register.
+      Defs &= ~(1 << RegNo);
+  }
+  assert((Kills & Defs) == 0 && "Register needs killing and def'ing?");
+
+  // Produce implicit-defs for free by using killed registers.
+  while (Kills && Defs) {
+    unsigned KReg = CountTrailingZeros_32(Kills);
+    unsigned DReg = CountTrailingZeros_32(Defs);
+    DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
+    std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
+    std::swap(RegMap[KReg], RegMap[DReg]);
+    Kills &= ~(1 << KReg);
+    Defs &= ~(1 << DReg);
+  }
+
+  // Kill registers by popping.
+  if (Kills && I != MBB->begin()) {
+    MachineBasicBlock::iterator I2 = llvm::prior(I);
+    while (StackTop) {
+      unsigned KReg = getStackEntry(0);
+      if (!(Kills & (1 << KReg)))
+        break;
+      DEBUG(dbgs() << "Popping %FP" << KReg << "\n");
+      popStackAfter(I2);
+      Kills &= ~(1 << KReg);
+    }
+  }
+
+  // Manually kill the rest.
+  while (Kills) {
+    unsigned KReg = CountTrailingZeros_32(Kills);
+    DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
+    freeStackSlotBefore(I, KReg);
+    Kills &= ~(1 << KReg);
+  }
+
+  // Load zeros for all the imp-defs.
+  while(Defs) {
+    unsigned DReg = CountTrailingZeros_32(Defs);
+    DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
+    BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
+    pushReg(DReg);
+    Defs &= ~(1 << DReg);
+  }
+
+  // Now we should have the correct registers live.
+  DEBUG(dumpStack());
+  assert(StackTop == CountPopulation_32(Mask) && "Live count mismatch");
+}
+
+/// shuffleStackTop - emit fxch instructions before I to shuffle the top
+/// FixCount entries into the order given by FixStack.
+/// FIXME: Is there a better algorithm than insertion sort?
+void FPS::shuffleStackTop(const unsigned char *FixStack,
+                          unsigned FixCount,
+                          MachineBasicBlock::iterator I) {
+  // Move items into place, starting from the desired stack bottom.
+  while (FixCount--) {
+    // Old register at position FixCount.
+    unsigned OldReg = getStackEntry(FixCount);
+    // Desired register at position FixCount.
+    unsigned Reg = FixStack[FixCount];
+    if (Reg == OldReg)
+      continue;
+    // (Reg st0) (OldReg st0) = (Reg OldReg st0)
+    moveToTop(Reg, I);
+    if (FixCount > 0)
+      moveToTop(OldReg, I);
+  }
+  DEBUG(dumpStack());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned DestReg = getFPReg(MI->getOperand(0));
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);   // Remove the explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // Result gets pushed on the stack.
+  pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  assert((NumOps == X86::AddrNumOperands + 1 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+  bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  if (KillsSrc)
+    duplicatePendingSTBeforeKill(Reg, I);
+
+  // FISTP64m is strange because there isn't a non-popping versions.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it.  This ensure that popping the value is
+  // always ok.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+  //
+  if (!KillsSrc &&
+      (MI->getOpcode() == X86::IST_Fp64m32 ||
+       MI->getOpcode() == X86::ISTT_Fp16m32 ||
+       MI->getOpcode() == X86::ISTT_Fp32m32 ||
+       MI->getOpcode() == X86::ISTT_Fp64m32 ||
+       MI->getOpcode() == X86::IST_Fp64m64 ||
+       MI->getOpcode() == X86::ISTT_Fp16m64 ||
+       MI->getOpcode() == X86::ISTT_Fp32m64 ||
+       MI->getOpcode() == X86::ISTT_Fp64m64 ||
+       MI->getOpcode() == X86::IST_Fp64m80 ||
+       MI->getOpcode() == X86::ISTT_Fp16m80 ||
+       MI->getOpcode() == X86::ISTT_Fp32m80 ||
+       MI->getOpcode() == X86::ISTT_Fp64m80 ||
+       MI->getOpcode() == X86::ST_FpP80m)) {
+    duplicateToTop(Reg, getScratchReg(), I);
+  } else {
+    moveToTop(Reg, I);            // Move to the top of the stack...
+  }
+
+  // Convert from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(NumOps-1);    // Remove explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  if (MI->getOpcode() == X86::IST_FP64m ||
+      MI->getOpcode() == X86::ISTT_FP16m ||
+      MI->getOpcode() == X86::ISTT_FP32m ||
+      MI->getOpcode() == X86::ISTT_FP64m ||
+      MI->getOpcode() == X86::ST_FP80m) {
+    if (StackTop == 0)
+      report_fatal_error("Stack empty??");
+    --StackTop;
+  } else if (KillsSrc) { // Last use of operand?
+    popStackAfter(I);
+  }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value.  These instructions may have
+/// non-fp operands after their FP operands.
+///
+///  Examples:
+///     R1 = fchs R2
+///     R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+#ifndef NDEBUG
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(1));
+  bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  if (KillsSrc) {
+    duplicatePendingSTBeforeKill(Reg, I);
+    // If this is the last use of the source register, just make sure it's on
+    // the top of the stack.
+    moveToTop(Reg, I);
+    if (StackTop == 0)
+      report_fatal_error("Stack cannot be empty!");
+    --StackTop;
+    pushReg(getFPReg(MI->getOperand(0)));
+  } else {
+    // If this is not the last use of the source register, _copy_ it to the top
+    // of the stack.
+    duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+  }
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(1);   // Drop the source operand.
+  MI->RemoveOperand(0);   // Drop the destination operand.
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C  into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r },
+  { X86::ADD_Fp64  , X86::ADD_FST0r },
+  { X86::ADD_Fp80  , X86::ADD_FST0r },
+  { X86::DIV_Fp32  , X86::DIV_FST0r },
+  { X86::DIV_Fp64  , X86::DIV_FST0r },
+  { X86::DIV_Fp80  , X86::DIV_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r },
+  { X86::MUL_Fp64  , X86::MUL_FST0r },
+  { X86::MUL_Fp80  , X86::MUL_FST0r },
+  { X86::SUB_Fp32  , X86::SUB_FST0r },
+  { X86::SUB_Fp64  , X86::SUB_FST0r },
+  { X86::SUB_Fp80  , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C  into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp80  , X86::ADD_FST0r  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FST0r },
+  { X86::DIV_Fp64  , X86::DIVR_FST0r },
+  { X86::DIV_Fp80  , X86::DIVR_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp80  , X86::MUL_FST0r  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FST0r },
+  { X86::SUB_Fp64  , X86::SUBR_FST0r },
+  { X86::SUB_Fp80  , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C  into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp80  , X86::ADD_FrST0  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp64  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp80  , X86::DIVR_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp80  , X86::MUL_FrST0  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp64  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp80  , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C  into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0 },
+  { X86::ADD_Fp64  , X86::ADD_FrST0 },
+  { X86::ADD_Fp80  , X86::ADD_FrST0 },
+  { X86::DIV_Fp32  , X86::DIV_FrST0 },
+  { X86::DIV_Fp64  , X86::DIV_FrST0 },
+  { X86::DIV_Fp80  , X86::DIV_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0 },
+  { X86::MUL_Fp64  , X86::MUL_FrST0 },
+  { X86::MUL_Fp80  , X86::MUL_FrST0 },
+  { X86::SUB_Fp32  , X86::SUB_FrST0 },
+  { X86::SUB_Fp64  , X86::SUB_FrST0 },
+  { X86::SUB_Fp80  , X86::SUB_FrST0 },
+};
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub  ST(0), ST(i)
+///         ST(i) = fsub  ST(0), ST(i)
+///         ST(0) = fsubr ST(0), ST(i)
+///         ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+  unsigned Dest = getFPReg(MI->getOperand(0));
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+  bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned TOS = getStackEntry(0);
+
+  // One of our operands must be on the top of the stack.  If neither is yet, we
+  // need to move one.
+  if (Op0 != TOS && Op1 != TOS) {   // No operand at TOS?
+    // We can choose to move either operand to the top of the stack.  If one of
+    // the operands is killed by this instruction, we want that one so that we
+    // can update right on top of the old version.
+    if (KillsOp0) {
+      moveToTop(Op0, I);         // Move dead operand to TOS.
+      TOS = Op0;
+    } else if (KillsOp1) {
+      moveToTop(Op1, I);
+      TOS = Op1;
+    } else {
+      // All of the operands are live after this instruction executes, so we
+      // cannot update on top of any operand.  Because of this, we must
+      // duplicate one of the stack elements to the top.  It doesn't matter
+      // which one we pick.
+      //
+      duplicateToTop(Op0, Dest, I);
+      Op0 = TOS = Dest;
+      KillsOp0 = true;
+    }
+  } else if (!KillsOp0 && !KillsOp1) {
+    // If we DO have one of our operands at the top of the stack, but we don't
+    // have a dead operand, we must duplicate one of the operands to a new slot
+    // on the stack.
+    duplicateToTop(Op0, Dest, I);
+    Op0 = TOS = Dest;
+    KillsOp0 = true;
+  }
+
+  // Now we know that one of our operands is on the top of the stack, and at
+  // least one of our operands is killed by this instruction.
+  assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+         "Stack conditions not set up right!");
+
+  // We decide which form to use based on what is on the top of the stack, and
+  // which operand is killed by this instruction.
+  const TableEntry *InstTable;
+  bool isForward = TOS == Op0;
+  bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+  if (updateST0) {
+    if (isForward)
+      InstTable = ForwardST0Table;
+    else
+      InstTable = ReverseST0Table;
+  } else {
+    if (isForward)
+      InstTable = ForwardSTiTable;
+    else
+      InstTable = ReverseSTiTable;
+  }
+
+  int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table),
+                      MI->getOpcode());
+  assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+  // NotTOS - The register which is not on the top of stack...
+  unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+  // Replace the old instruction with a new instruction
+  MBB->remove(I++);
+  I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+  // If both operands are killed, pop one off of the stack in addition to
+  // overwriting the other one.
+  if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+    assert(!updateST0 && "Should have updated other operand!");
+    popStackAfter(I);   // Pop the top of stack
+  }
+
+  // Update stack information so that we know the destination register is now on
+  // the stack.
+  unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+  assert(UpdatedSlot < StackTop && Dest < 7);
+  Stack[UpdatedSlot]   = Dest;
+  RegMap[Dest]         = UpdatedSlot;
+  MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+  bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+  // Make sure the first operand is on the top of stack, the other one can be
+  // anywhere.
+  moveToTop(Op0, I);
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->RemoveOperand(1);
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // If any of the operands are killed by this instruction, free them.
+  if (KillsOp0) freeStackSlotAfter(I, Op0);
+  if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions.  These
+/// instructions move a st(i) register to st(0) iff a condition is true.  These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+
+  unsigned Op0 = getFPReg(MI->getOperand(0));
+  unsigned Op1 = getFPReg(MI->getOperand(2));
+  bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+  // The first operand *must* be on the top of the stack.
+  moveToTop(Op0, I);
+
+  // Change the second operand to the stack register that the operand is in.
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);
+  MI->RemoveOperand(1);
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // If we kill the second operand, make sure to pop it from the stack.
+  if (Op0 != Op1 && KillsOp1) {
+    // Get this value off of the register stack.
+    freeStackSlotAfter(I, Op1);
+  }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions.  This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown SpecialFP instruction!");
+  case TargetOpcode::COPY: {
+    // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP.
+    const MachineOperand &MO1 = MI->getOperand(1);
+    const MachineOperand &MO0 = MI->getOperand(0);
+    unsigned DstST = MO0.getReg() - X86::ST0;
+    unsigned SrcST = MO1.getReg() - X86::ST0;
+    bool KillsSrc = MI->killsRegister(MO1.getReg());
+
+    // ST = COPY FP. Set up a pending ST register.
+    if (DstST < 8) {
+      unsigned SrcFP = getFPReg(MO1);
+      assert(isLive(SrcFP) && "Cannot copy dead register");
+      assert(!MO0.isDead() && "Cannot copy to dead ST register");
+
+      // Unallocated STs are marked as the nonexistent FP255.
+      while (NumPendingSTs <= DstST)
+        PendingST[NumPendingSTs++] = NumFPRegs;
+
+      // STi could still be live from a previous inline asm.
+      if (isScratchReg(PendingST[DstST])) {
+        DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST])
+                     << '\n');
+        freeStackSlotBefore(MI, PendingST[DstST]);
+      }
+
+      // When the source is killed, allocate a scratch FP register.
+      if (KillsSrc) {
+        duplicatePendingSTBeforeKill(SrcFP, I);
+        unsigned Slot = getSlot(SrcFP);
+        unsigned SR = getScratchReg();
+        PendingST[DstST] = SR;
+        Stack[Slot] = SR;
+        RegMap[SR] = Slot;
+      } else
+        PendingST[DstST] = SrcFP;
+      break;
+    }
+
+    // FP = COPY ST. Extract fixed stack value.
+    // Any instruction defining ST registers must have assigned them to a
+    // scratch register.
+    if (SrcST < 8) {
+      unsigned DstFP = getFPReg(MO0);
+      assert(!isLive(DstFP) && "Cannot copy ST to live FP register");
+      assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register");
+      unsigned SrcFP = PendingST[SrcST];
+      assert(isScratchReg(SrcFP) && "Expected ST in a scratch register");
+      assert(isLive(SrcFP) && "Scratch holding ST is dead");
+
+      // DstFP steals the stack slot from SrcFP.
+      unsigned Slot = getSlot(SrcFP);
+      Stack[Slot] = DstFP;
+      RegMap[DstFP] = Slot;
+
+      // Always treat the ST as killed.
+      PendingST[SrcST] = NumFPRegs;
+      while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
+        --NumPendingSTs;
+      break;
+    }
+
+    // FP <- FP copy.
+    unsigned DstFP = getFPReg(MO0);
+    unsigned SrcFP = getFPReg(MO1);
+    assert(isLive(SrcFP) && "Cannot copy dead register");
+    if (KillsSrc) {
+      // If the input operand is killed, we can just change the owner of the
+      // incoming stack slot into the result.
+      unsigned Slot = getSlot(SrcFP);
+      Stack[Slot] = DstFP;
+      RegMap[DstFP] = Slot;
+    } else {
+      // For COPY we just duplicate the specified value to a new stack slot.
+      // This could be made better, but would require substantial changes.
+      duplicateToTop(SrcFP, DstFP, I);
+    }
+    break;
+  }
+
+  case TargetOpcode::IMPLICIT_DEF: {
+    // All FP registers must be explicitly defined, so load a 0 instead.
+    unsigned Reg = MI->getOperand(0).getReg() - X86::FP0;
+    DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+    BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+    pushReg(Reg);
+    break;
+  }
+
+  case X86::FpPOP_RETVAL: {
+    // The FpPOP_RETVAL instruction is used after calls that return a value on
+    // the floating point stack. We cannot model this with ST defs since CALL
+    // instructions have fixed clobber lists. This instruction is interpreted
+    // to mean that there is one more live register on the stack than we
+    // thought.
+    //
+    // This means that StackTop does not match the hardware stack between a
+    // call and the FpPOP_RETVAL instructions.  We do tolerate FP instructions
+    // between CALL and FpPOP_RETVAL as long as they don't overflow the
+    // hardware stack.
+    unsigned DstFP = getFPReg(MI->getOperand(0));
+
+    // Move existing stack elements up to reflect reality.
+    assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL");
+    if (StackTop) {
+      std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1);
+      for (unsigned i = 0; i != NumFPRegs; ++i)
+        ++RegMap[i];
+    }
+    ++StackTop;
+
+    // DstFP is the new bottom of the stack.
+    Stack[0] = DstFP;
+    RegMap[DstFP] = 0;
+
+    // DstFP will be killed by processBasicBlock if this was a dead def.
+    break;
+  }
+
+  case TargetOpcode::INLINEASM: {
+    // The inline asm MachineInstr currently only *uses* FP registers for the
+    // 'f' constraint.  These should be turned into the current ST(x) register
+    // in the machine instr.
+    //
+    // There are special rules for x87 inline assembly. The compiler must know
+    // exactly how many registers are popped and pushed implicitly by the asm.
+    // Otherwise it is not possible to restore the stack state after the inline
+    // asm.
+    //
+    // There are 3 kinds of input operands:
+    //
+    // 1. Popped inputs. These must appear at the stack top in ST0-STn. A
+    //    popped input operand must be in a fixed stack slot, and it is either
+    //    tied to an output operand, or in the clobber list. The MI has ST use
+    //    and def operands for these inputs.
+    //
+    // 2. Fixed inputs. These inputs appear in fixed stack slots, but are
+    //    preserved by the inline asm. The fixed stack slots must be STn-STm
+    //    following the popped inputs. A fixed input operand cannot be tied to
+    //    an output or appear in the clobber list. The MI has ST use operands
+    //    and no defs for these inputs.
+    //
+    // 3. Preserved inputs. These inputs use the "f" constraint which is
+    //    represented as an FP register. The inline asm won't change these
+    //    stack slots.
+    //
+    // Outputs must be in ST registers, FP outputs are not allowed. Clobbered
+    // registers do not count as output operands. The inline asm changes the
+    // stack as if it popped all the popped inputs and then pushed all the
+    // output operands.
+
+    // Scan the assembly for ST registers used, defined and clobbered. We can
+    // only tell clobbers from defs by looking at the asm descriptor.
+    unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0;
+    unsigned NumOps = 0;
+    for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands();
+         i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) {
+      unsigned Flags = MI->getOperand(i).getImm();
+      NumOps = InlineAsm::getNumOperandRegisters(Flags);
+      if (NumOps != 1)
+        continue;
+      const MachineOperand &MO = MI->getOperand(i + 1);
+      if (!MO.isReg())
+        continue;
+      unsigned STReg = MO.getReg() - X86::ST0;
+      if (STReg >= 8)
+        continue;
+
+      switch (InlineAsm::getKind(Flags)) {
+      case InlineAsm::Kind_RegUse:
+        STUses |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_RegDef:
+      case InlineAsm::Kind_RegDefEarlyClobber:
+        STDefs |= (1u << STReg);
+        if (MO.isDead())
+          STDeadDefs |= (1u << STReg);
+        break;
+      case InlineAsm::Kind_Clobber:
+        STClobbers |= (1u << STReg);
+        break;
+      default:
+        break;
+      }
+    }
+
+    if (STUses && !isMask_32(STUses))
+      MI->emitError("fixed input regs must be last on the x87 stack");
+    unsigned NumSTUses = CountTrailingOnes_32(STUses);
+
+    // Defs must be contiguous from the stack top. ST0-STn.
+    if (STDefs && !isMask_32(STDefs)) {
+      MI->emitError("output regs must be last on the x87 stack");
+      STDefs = NextPowerOf2(STDefs) - 1;
+    }
+    unsigned NumSTDefs = CountTrailingOnes_32(STDefs);
+
+    // So must the clobbered stack slots. ST0-STm, m >= n.
+    if (STClobbers && !isMask_32(STDefs | STClobbers))
+      MI->emitError("clobbers must be last on the x87 stack");
+
+    // Popped inputs are the ones that are also clobbered or defined.
+    unsigned STPopped = STUses & (STDefs | STClobbers);
+    if (STPopped && !isMask_32(STPopped))
+      MI->emitError("implicitly popped regs must be last on the x87 stack");
+    unsigned NumSTPopped = CountTrailingOnes_32(STPopped);
+
+    DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops "
+                 << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n");
+
+    // Scan the instruction for FP uses corresponding to "f" constraints.
+    // Collect FP registers to kill afer the instruction.
+    // Always kill all the scratch regs.
+    unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff;
+    unsigned FPUsed = 0;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      if (!Op.isUse())
+        MI->emitError("illegal \"f\" output constraint");
+      unsigned FPReg = getFPReg(Op);
+      FPUsed |= 1U << FPReg;
+
+      // If we kill this operand, make sure to pop it from the stack after the
+      // asm.  We just remember it for now, and pop them all off at the end in
+      // a batch.
+      if (Op.isKill())
+        FPKills |= 1U << FPReg;
+    }
+
+    // The popped inputs will be killed by the instruction, so duplicate them
+    // if the FP register needs to be live after the instruction, or if it is
+    // used in the instruction itself. We effectively treat the popped inputs
+    // as early clobbers.
+    for (unsigned i = 0; i < NumSTPopped; ++i) {
+      if ((FPKills & ~FPUsed) & (1u << PendingST[i]))
+        continue;
+      unsigned SR = getScratchReg();
+      duplicateToTop(PendingST[i], SR, I);
+      DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
+                   << unsigned(PendingST[i]) << " to avoid clobbering it.\n");
+      PendingST[i] = SR;
+    }
+
+    // Make sure we have a unique live register for every fixed use. Some of
+    // them could be undef uses, and we need to emit LD_F0 instructions.
+    for (unsigned i = 0; i < NumSTUses; ++i) {
+      if (i < NumPendingSTs && PendingST[i] < NumFPRegs) {
+        // Check for shared assignments.
+        for (unsigned j = 0; j < i; ++j) {
+          if (PendingST[j] != PendingST[i])
+            continue;
+          // STi and STj are inn the same register, create a copy.
+          unsigned SR = getScratchReg();
+          duplicateToTop(PendingST[i], SR, I);
+          DEBUG(dbgs() << "Duplicating ST" << i << " in FP"
+                       << unsigned(PendingST[i])
+                       << " to avoid collision with ST" << j << '\n');
+          PendingST[i] = SR;
+        }
+        continue;
+      }
+      unsigned SR = getScratchReg();
+      DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n');
+      BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+      pushReg(SR);
+      PendingST[i] = SR;
+      if (NumPendingSTs == i)
+        ++NumPendingSTs;
+    }
+    assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned");
+
+    // Now we can rearrange the live registers to match what was requested.
+    shuffleStackTop(PendingST, NumPendingSTs, I);
+    DEBUG({dbgs() << "Before asm: "; dumpStack();});
+
+    // With the stack layout fixed, rewrite the FP registers.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      unsigned FPReg = getFPReg(Op);
+      Op.setReg(getSTReg(FPReg));
+    }
+
+    // Simulate the inline asm popping its inputs and pushing its outputs.
+    StackTop -= NumSTPopped;
+
+    // Hold the fixed output registers in scratch FP registers. They will be
+    // transferred to real FP registers by copies.
+    NumPendingSTs = 0;
+    for (unsigned i = 0; i < NumSTDefs; ++i) {
+      unsigned SR = getScratchReg();
+      pushReg(SR);
+      FPKills &= ~(1u << SR);
+    }
+    for (unsigned i = 0; i < NumSTDefs; ++i)
+      PendingST[NumPendingSTs++] = getStackEntry(i);
+    DEBUG({dbgs() << "After asm: "; dumpStack();});
+
+    // If any of the ST defs were dead, pop them immediately. Our caller only
+    // handles dead FP defs.
+    MachineBasicBlock::iterator InsertPt = MI;
+    for (unsigned i = 0; STDefs & (1u << i); ++i) {
+      if (!(STDeadDefs & (1u << i)))
+        continue;
+      freeStackSlotAfter(InsertPt, PendingST[i]);
+      PendingST[i] = NumFPRegs;
+    }
+    while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs)
+      --NumPendingSTs;
+
+    // If this asm kills any FP registers (is the last use of them) we must
+    // explicitly emit pop instructions for them.  Do this now after the asm has
+    // executed so that the ST(x) numbers are not off (which would happen if we
+    // did this inline with operand rewriting).
+    //
+    // Note: this might be a non-optimal pop sequence.  We might be able to do
+    // better by trying to pop in stack order or something.
+    while (FPKills) {
+      unsigned FPReg = CountTrailingZeros_32(FPKills);
+      if (isLive(FPReg))
+        freeStackSlotAfter(InsertPt, FPReg);
+      FPKills &= ~(1U << FPReg);
+    }
+    // Don't delete the inline asm!
+    return;
+  }
+
+  case X86::WIN_FTOL_32:
+  case X86::WIN_FTOL_64: {
+    // Push the operand into ST0.
+    MachineOperand &Op = MI->getOperand(0);
+    assert(Op.isUse() && Op.isReg() &&
+      Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6);
+    unsigned FPReg = getFPReg(Op);
+    if (Op.isKill())
+      moveToTop(FPReg, I);
+    else
+      duplicateToTop(FPReg, FPReg, I);
+
+    // Emit the call. This will pop the operand.
+    BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
+      .addExternalSymbol("_ftol2")
+      .addReg(X86::ST0, RegState::ImplicitKill)
+      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+      .addReg(X86::EDX, RegState::Define | RegState::Implicit)
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+    --StackTop;
+
+    break;
+  }
+
+  case X86::RET:
+  case X86::RETI:
+    // If RET has an FP register use operand, pass the first one in ST(0) and
+    // the second one in ST(1).
+
+    // Find the register operands.
+    unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+    unsigned LiveMask = 0;
+
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      // FP Register uses must be kills unless there are two uses of the same
+      // register, in which case only one will be a kill.
+      assert(Op.isUse() &&
+             (Op.isKill() ||                        // Marked kill.
+              getFPReg(Op) == FirstFPRegOp ||       // Second instance.
+              MI->killsRegister(Op.getReg())) &&    // Later use is marked kill.
+             "Ret only defs operands, and values aren't live beyond it");
+
+      if (FirstFPRegOp == ~0U)
+        FirstFPRegOp = getFPReg(Op);
+      else {
+        assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+        SecondFPRegOp = getFPReg(Op);
+      }
+      LiveMask |= (1 << getFPReg(Op));
+
+      // Remove the operand so that later passes don't see it.
+      MI->RemoveOperand(i);
+      --i, --e;
+    }
+
+    // We may have been carrying spurious live-ins, so make sure only the returned
+    // registers are left live.
+    adjustLiveRegs(LiveMask, MI);
+    if (!LiveMask) return;  // Quick check to see if any are possible.
+
+    // There are only four possibilities here:
+    // 1) we are returning a single FP value.  In this case, it has to be in
+    //    ST(0) already, so just declare success by removing the value from the
+    //    FP Stack.
+    if (SecondFPRegOp == ~0U) {
+      // Assert that the top of stack contains the right FP register.
+      assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+             "Top of stack not the right register for RET!");
+
+      // Ok, everything is good, mark the value as not being on the stack
+      // anymore so that our assertion about the stack being empty at end of
+      // block doesn't fire.
+      StackTop = 0;
+      return;
+    }
+
+    // Otherwise, we are returning two values:
+    // 2) If returning the same value for both, we only have one thing in the FP
+    //    stack.  Consider:  RET FP1, FP1
+    if (StackTop == 1) {
+      assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+             "Stack misconfiguration for RET!");
+
+      // Duplicate the TOS so that we return it twice.  Just pick some other FPx
+      // register to hold it.
+      unsigned NewReg = getScratchReg();
+      duplicateToTop(FirstFPRegOp, NewReg, MI);
+      FirstFPRegOp = NewReg;
+    }
+
+    /// Okay we know we have two different FPx operands now:
+    assert(StackTop == 2 && "Must have two values live!");
+
+    /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+    ///    in ST(1).  In this case, emit an fxch.
+    if (getStackEntry(0) == SecondFPRegOp) {
+      assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+      moveToTop(FirstFPRegOp, MI);
+    }
+
+    /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+    /// ST(1).  Just remove both from our understanding of the stack and return.
+    assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+    assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+    StackTop = 0;
+    return;
+  }
+
+  I = MBB->erase(I);  // Remove the pseudo instruction
+
+  // We want to leave I pointing to the previous instruction, but what if we
+  // just erased the first instruction?
+  if (I == MBB->begin()) {
+    DEBUG(dbgs() << "Inserting dummy KILL\n");
+    I = BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL));
+  } else
+    --I;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
new file mode 100644
index 000000000000..54cbd40274a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -0,0 +1,1837 @@
+//===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86FrameLowering.h"
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+// FIXME: completely move here.
+extern cl::opt<bool> ForceStackAlign;
+
+bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.
+bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const MachineModuleInfo &MMI = MF.getMMI();
+  const TargetRegisterInfo *RegInfo = TM.getRegisterInfo();
+
+  return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
+          RegInfo->needsStackRealignment(MF) ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken() || MF.hasMSInlineAsm() ||
+          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+          MMI.callsUnwindInit() || MMI.callsEHReturn());
+}
+
+static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
+  if (IsLP64) {
+    if (isInt<8>(Imm))
+      return X86::SUB64ri8;
+    return X86::SUB64ri32;
+  } else {
+    if (isInt<8>(Imm))
+      return X86::SUB32ri8;
+    return X86::SUB32ri;
+  }
+}
+
+static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
+  if (IsLP64) {
+    if (isInt<8>(Imm))
+      return X86::ADD64ri8;
+    return X86::ADD64ri32;
+  } else {
+    if (isInt<8>(Imm))
+      return X86::ADD32ri8;
+    return X86::ADD32ri;
+  }
+}
+
+static unsigned getLEArOpcode(unsigned IsLP64) {
+  return IsLP64 ? X86::LEA64r : X86::LEA32r;
+}
+
+/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
+/// when it reaches the "return" instruction. We can then pop a stack object
+/// to this register without worry about clobbering it.
+static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator &MBBI,
+                                       const TargetRegisterInfo &TRI,
+                                       bool Is64Bit) {
+  const MachineFunction *MF = MBB.getParent();
+  const Function *F = MF->getFunction();
+  if (!F || MF->getMMI().callsEHReturn())
+    return 0;
+
+  static const uint16_t CallerSavedRegs32Bit[] = {
+    X86::EAX, X86::EDX, X86::ECX, 0
+  };
+
+  static const uint16_t CallerSavedRegs64Bit[] = {
+    X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
+    X86::R8,  X86::R9,  X86::R10, X86::R11, 0
+  };
+
+  unsigned Opc = MBBI->getOpcode();
+  switch (Opc) {
+  default: return 0;
+  case X86::RET:
+  case X86::RETI:
+  case X86::TCRETURNdi:
+  case X86::TCRETURNri:
+  case X86::TCRETURNmi:
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    SmallSet<uint16_t, 8> Uses;
+    for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MBBI->getOperand(i);
+      if (!MO.isReg() || MO.isDef())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!Reg)
+        continue;
+      for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+        Uses.insert(*AI);
+    }
+
+    const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
+    for (; *CS; ++CS)
+      if (!Uses.count(*CS))
+        return *CS;
+  }
+  }
+
+  return 0;
+}
+
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                  unsigned StackPtr, int64_t NumBytes,
+                  bool Is64Bit, bool IsLP64, bool UseLEA,
+                  const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) {
+  bool isSub = NumBytes < 0;
+  uint64_t Offset = isSub ? -NumBytes : NumBytes;
+  unsigned Opc;
+  if (UseLEA)
+    Opc = getLEArOpcode(IsLP64);
+  else
+    Opc = isSub
+      ? getSUBriOpcode(IsLP64, Offset)
+      : getADDriOpcode(IsLP64, Offset);
+
+  uint64_t Chunk = (1LL << 31) - 1;
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  while (Offset) {
+    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+    if (ThisVal == (Is64Bit ? 8 : 4)) {
+      // Use push / pop instead.
+      unsigned Reg = isSub
+        ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
+        : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+      if (Reg) {
+        Opc = isSub
+          ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
+          : (Is64Bit ? X86::POP64r  : X86::POP32r);
+        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
+          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
+        if (isSub)
+          MI->setFlag(MachineInstr::FrameSetup);
+        Offset -= ThisVal;
+        continue;
+      }
+    }
+
+    MachineInstr *MI = NULL;
+
+    if (UseLEA) {
+      MI =  addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+                          StackPtr, false, isSub ? -ThisVal : ThisVal);
+    } else {
+      MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+            .addReg(StackPtr)
+            .addImm(ThisVal);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+    }
+
+    if (isSub)
+      MI->setFlag(MachineInstr::FrameSetup);
+
+    Offset -= ThisVal;
+  }
+}
+
+/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator.
+static
+void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                      unsigned StackPtr, uint64_t *NumBytes = NULL) {
+  if (MBBI == MBB.begin()) return;
+
+  MachineBasicBlock::iterator PI = prior(MBBI);
+  unsigned Opc = PI->getOpcode();
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+       Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+      PI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes += PI->getOperand(2).getImm();
+    MBB.erase(PI);
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             PI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes -= PI->getOperand(2).getImm();
+    MBB.erase(PI);
+  }
+}
+
+/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower iterator.
+static
+void mergeSPUpdatesDown(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator &MBBI,
+                        unsigned StackPtr, uint64_t *NumBytes = NULL) {
+  // FIXME:  THIS ISN'T RUN!!!
+  return;
+
+  if (MBBI == MBB.end()) return;
+
+  MachineBasicBlock::iterator NI = llvm::next(MBBI);
+  if (NI == MBB.end()) return;
+
+  unsigned Opc = NI->getOpcode();
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+      NI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes -= NI->getOperand(2).getImm();
+    MBB.erase(NI);
+    MBBI = NI;
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             NI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes += NI->getOperand(2).getImm();
+    MBB.erase(NI);
+    MBBI = NI;
+  }
+}
+
+/// mergeSPUpdates - Checks the instruction before/after the passed
+/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and the
+/// stack adjustment is returned as a positive value for ADD/LEA and a negative for
+/// SUB.
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator &MBBI,
+                           unsigned StackPtr,
+                           bool doMergeWithPrevious) {
+  if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+      (!doMergeWithPrevious && MBBI == MBB.end()))
+    return 0;
+
+  MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
+  MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : llvm::next(MBBI);
+  unsigned Opc = PI->getOpcode();
+  int Offset = 0;
+
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8 ||
+       Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
+      PI->getOperand(0).getReg() == StackPtr){
+    Offset += PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             PI->getOperand(0).getReg() == StackPtr) {
+    Offset -= PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  }
+
+  return Offset;
+}
+
+static bool isEAXLiveIn(MachineFunction &MF) {
+  for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(),
+       EE = MF.getRegInfo().livein_end(); II != EE; ++II) {
+    unsigned Reg = II->first;
+
+    if (Reg == X86::EAX || Reg == X86::AX ||
+        Reg == X86::AH || Reg == X86::AL)
+      return true;
+  }
+
+  return false;
+}
+
+void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
+                                                 MCSymbol *Label,
+                                                 unsigned FramePtr) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+
+  // Add callee saved registers to move list.
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  if (CSI.empty()) return;
+
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+  bool HasFP = hasFP(MF);
+
+  // Calculate amount of bytes used for return address storing.
+  int stackGrowth = -RegInfo->getSlotSize();
+
+  // FIXME: This is dirty hack. The code itself is pretty mess right now.
+  // It should be rewritten from scratch and generalized sometimes.
+
+  // Determine maximum offset (minimum due to stack growth).
+  int64_t MaxOffset = 0;
+  for (std::vector<CalleeSavedInfo>::const_iterator
+         I = CSI.begin(), E = CSI.end(); I != E; ++I)
+    MaxOffset = std::min(MaxOffset,
+                         MFI->getObjectOffset(I->getFrameIdx()));
+
+  // Calculate offsets.
+  int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth;
+  for (std::vector<CalleeSavedInfo>::const_iterator
+         I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+    int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+    unsigned Reg = I->getReg();
+    Offset = MaxOffset - Offset + saveAreaOffset;
+
+    // Don't output a new machine move if we're re-saving the frame
+    // pointer. This happens when the PrologEpilogInserter has inserted an extra
+    // "PUSH" of the frame pointer -- the "emitPrologue" method automatically
+    // generates one when frame pointers are used. If we generate a "machine
+    // move" for this extra "PUSH", the linker will lose track of the fact that
+    // the frame pointer should have the value of the first "PUSH" when it's
+    // trying to unwind.
+    //
+    // FIXME: This looks inelegant. It's possibly correct, but it's covering up
+    //        another bug. I.e., one where we generate a prolog like this:
+    //
+    //          pushl  %ebp
+    //          movl   %esp, %ebp
+    //          pushl  %ebp
+    //          pushl  %esi
+    //           ...
+    //
+    //        The immediate re-push of EBP is unnecessary. At the least, it's an
+    //        optimization bug. EBP can be used as a scratch register in certain
+    //        cases, but probably not when we have a frame pointer.
+    if (HasFP && FramePtr == Reg)
+      continue;
+
+    MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+    MachineLocation CSSrc(Reg);
+    Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+  }
+}
+
+/// getCompactUnwindRegNum - Get the compact unwind number for a given
+/// register. The number corresponds to the enum lists in
+/// compact_unwind_encoding.h.
+static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) {
+  for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+    if (*CURegs == Reg)
+      return Idx;
+
+  return -1;
+}
+
+// Number of registers that can be saved in a compact unwind encoding.
+#define CU_NUM_SAVED_REGS 6
+
+/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding
+/// used with frameless stacks. It is passed the number of registers to be saved
+/// and an array of the registers saved.
+static uint32_t
+encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
+                                         unsigned RegCount, bool Is64Bit) {
+  // The saved registers are numbered from 1 to 6. In order to encode the order
+  // in which they were saved, we re-number them according to their place in the
+  // register order. The re-numbering is relative to the last re-numbered
+  // register. E.g., if we have registers {6, 2, 4, 5} saved in that order:
+  //
+  //    Orig  Re-Num
+  //    ----  ------
+  //     6       6
+  //     2       2
+  //     4       3
+  //     5       3
+  //
+  static const uint16_t CU32BitRegs[] = {
+    X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+  };
+  static const uint16_t CU64BitRegs[] = {
+    X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+  };
+  const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+
+  for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
+    int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]);
+    if (CUReg == -1) return ~0U;
+    SavedRegs[i] = CUReg;
+  }
+
+  // Reverse the list.
+  std::swap(SavedRegs[0], SavedRegs[5]);
+  std::swap(SavedRegs[1], SavedRegs[4]);
+  std::swap(SavedRegs[2], SavedRegs[3]);
+
+  uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+  for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) {
+    unsigned Countless = 0;
+    for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+      if (SavedRegs[j] < SavedRegs[i])
+        ++Countless;
+
+    RenumRegs[i] = SavedRegs[i] - Countless - 1;
+  }
+
+  // Take the renumbered values and encode them into a 10-bit number.
+  uint32_t permutationEncoding = 0;
+  switch (RegCount) {
+  case 6:
+    permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+                           + 6 * RenumRegs[2] +  2 * RenumRegs[3]
+                           +     RenumRegs[4];
+    break;
+  case 5:
+    permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+                           + 6 * RenumRegs[3] +  2 * RenumRegs[4]
+                           +     RenumRegs[5];
+    break;
+  case 4:
+    permutationEncoding |=  60 * RenumRegs[2] + 12 * RenumRegs[3]
+                           + 3 * RenumRegs[4] +      RenumRegs[5];
+    break;
+  case 3:
+    permutationEncoding |=  20 * RenumRegs[3] +  4 * RenumRegs[4]
+                           +     RenumRegs[5];
+    break;
+  case 2:
+    permutationEncoding |=   5 * RenumRegs[4] +      RenumRegs[5];
+    break;
+  case 1:
+    permutationEncoding |=       RenumRegs[5];
+    break;
+  }
+
+  assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+         "Invalid compact register encoding!");
+  return permutationEncoding;
+}
+
+/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a
+/// compact encoding with a frame pointer.
+static uint32_t
+encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
+                                      bool Is64Bit) {
+  static const uint16_t CU32BitRegs[] = {
+    X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+  };
+  static const uint16_t CU64BitRegs[] = {
+    X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+  };
+  const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+
+  // Encode the registers in the order they were saved, 3-bits per register. The
+  // registers are numbered from 1 to CU_NUM_SAVED_REGS.
+  uint32_t RegEnc = 0;
+  for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) {
+    unsigned Reg = SavedRegs[I];
+    if (Reg == 0) continue;
+
+    int CURegNum = getCompactUnwindRegNum(CURegs, Reg);
+    if (CURegNum == -1) return ~0U;
+
+    // Encode the 3-bit register number in order, skipping over 3-bits for each
+    // register.
+    RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+  }
+
+  assert((RegEnc & 0x3FFFF) == RegEnc && "Invalid compact register encoding!");
+  return RegEnc;
+}
+
+uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
+  const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned StackPtr = RegInfo->getStackRegister();
+
+  bool Is64Bit = STI.is64Bit();
+  bool HasFP = hasFP(MF);
+
+  unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 };
+  unsigned SavedRegIdx = 0;
+
+  unsigned OffsetSize = (Is64Bit ? 8 : 4);
+
+  unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r);
+  unsigned PushInstrSize = 1;
+  unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+  unsigned MoveInstrSize = (Is64Bit ? 3 : 2);
+  unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2);
+
+  unsigned StackDivide = (Is64Bit ? 8 : 4);
+
+  unsigned InstrOffset = 0;
+  unsigned StackAdjust = 0;
+  unsigned StackSize = 0;
+
+  MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB.
+  bool ExpectEnd = false;
+  for (MachineBasicBlock::iterator
+         MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) {
+    MachineInstr &MI = *MBBI;
+    unsigned Opc = MI.getOpcode();
+    if (Opc == X86::PROLOG_LABEL) continue;
+    if (!MI.getFlag(MachineInstr::FrameSetup)) break;
+
+    // We don't exect any more prolog instructions.
+    if (ExpectEnd) return 0;
+
+    if (Opc == PushInstr) {
+      // If there are too many saved registers, we cannot use compact encoding.
+      if (SavedRegIdx >= CU_NUM_SAVED_REGS) return 0;
+
+      SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg();
+      StackAdjust += OffsetSize;
+      InstrOffset += PushInstrSize;
+    } else if (Opc == MoveInstr) {
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      unsigned DstReg = MI.getOperand(0).getReg();
+
+      if (DstReg != FramePtr || SrcReg != StackPtr)
+        return 0;
+
+      StackAdjust = 0;
+      memset(SavedRegs, 0, sizeof(SavedRegs));
+      SavedRegIdx = 0;
+      InstrOffset += MoveInstrSize;
+    } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+               Opc == X86::SUB32ri || Opc == X86::SUB32ri8) {
+      if (StackSize)
+        // We already have a stack size.
+        return 0;
+
+      if (!MI.getOperand(0).isReg() ||
+          MI.getOperand(0).getReg() != MI.getOperand(1).getReg() ||
+          MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm())
+        // We need this to be a stack adjustment pointer. Something like:
+        //
+        //   %RSP<def> = SUB64ri8 %RSP, 48
+        return 0;
+
+      StackSize = MI.getOperand(2).getImm() / StackDivide;
+      SubtractInstrIdx += InstrOffset;
+      ExpectEnd = true;
+    }
+  }
+
+  // Encode that we are using EBP/RBP as the frame pointer.
+  uint32_t CompactUnwindEncoding = 0;
+  StackAdjust /= StackDivide;
+  if (HasFP) {
+    if ((StackAdjust & 0xFF) != StackAdjust)
+      // Offset was too big for compact encoding.
+      return 0;
+
+    // Get the encoding of the saved registers when we have a frame pointer.
+    uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit);
+    if (RegEnc == ~0U) return 0;
+
+    CompactUnwindEncoding |= 0x01000000;
+    CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+    CompactUnwindEncoding |= RegEnc & 0x7FFF;
+  } else {
+    ++StackAdjust;
+    uint32_t TotalStackSize = StackAdjust + StackSize;
+    if ((TotalStackSize & 0xFF) == TotalStackSize) {
+      // Frameless stack with a small stack size.
+      CompactUnwindEncoding |= 0x02000000;
+
+      // Encode the stack size.
+      CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16;
+    } else {
+      if ((StackAdjust & 0x7) != StackAdjust)
+        // The extra stack adjustments are too big for us to handle.
+        return 0;
+
+      // Frameless stack with an offset too large for us to encode compactly.
+      CompactUnwindEncoding |= 0x03000000;
+
+      // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+      // instruction.
+      CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+      // Encode any extra stack stack adjustments (done via push instructions).
+      CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+    }
+
+    // Encode the number of registers saved.
+    CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+    // Get the encoding of the saved registers when we don't have a frame
+    // pointer.
+    uint32_t RegEnc =
+      encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx,
+                                               Is64Bit);
+    if (RegEnc == ~0U) return 0;
+
+    // Encode the register encoding.
+    CompactUnwindEncoding |= RegEnc & 0x3FF;
+  }
+
+  return CompactUnwindEncoding;
+}
+
+/// usesTheStack - This function checks if any of the users of EFLAGS
+/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
+/// to use the stack, and if we don't adjust the stack we clobber the first
+/// frame index.
+/// See X86InstrInfo::copyPhysReg.
+static bool usesTheStack(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS),
+       re = MRI.reg_end(); ri != re; ++ri)
+    if (ri->isCopy())
+      return true;
+
+  return false;
+}
+
+/// emitPrologue - Push callee-saved registers onto the stack, which
+/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
+/// space for local variables. Also emit labels used by the exception handler to
+/// generate the exception handling frames.
+void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *Fn = MF.getFunction();
+  const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+  const X86InstrInfo &TII = *TM.getInstrInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  bool needsFrameMoves = MMI.hasDebugInfo() ||
+    Fn->needsUnwindTableEntry();
+  uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment.
+  uint64_t StackSize = MFI->getStackSize();    // Number of bytes to allocate.
+  bool HasFP = hasFP(MF);
+  bool Is64Bit = STI.is64Bit();
+  bool IsLP64 = STI.isTarget64BitLP64();
+  bool IsWin64 = STI.isTargetWin64();
+  bool UseLEA = STI.useLeaForSP();
+  unsigned StackAlign = getStackAlignment();
+  unsigned SlotSize = RegInfo->getSlotSize();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned StackPtr = RegInfo->getStackRegister();
+  unsigned BasePtr = RegInfo->getBaseRegister();
+  DebugLoc DL;
+
+  // If we're forcing a stack realignment we can't rely on just the frame
+  // info, we need to know the ABI stack alignment as well in case we
+  // have a call out.  Otherwise just make sure we have some alignment - we'll
+  // go with the minimum SlotSize.
+  if (ForceStackAlign) {
+    if (MFI->hasCalls())
+      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+    else if (MaxAlign < SlotSize)
+      MaxAlign = SlotSize;
+  }
+
+  // Add RETADDR move area to callee saved frame size.
+  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+  if (TailCallReturnAddrDelta < 0)
+    X86FI->setCalleeSavedFrameSize(
+      X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
+
+  // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
+  // function, and use up to 128 bytes of stack space, don't have a frame
+  // pointer, calls, or dynamic alloca then we do not need to adjust the
+  // stack pointer (we fit in the Red Zone). We also check that we don't
+  // push and pop from the stack.
+  if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                                   Attribute::NoRedZone) &&
+      !RegInfo->needsStackRealignment(MF) &&
+      !MFI->hasVarSizedObjects() &&                     // No dynamic alloca.
+      !MFI->adjustsStack() &&                           // No calls.
+      !IsWin64 &&                                       // Win64 has no Red Zone
+      !usesTheStack(MF) &&                              // Don't push and pop.
+      !MF.getTarget().Options.EnableSegmentedStacks) {  // Regular stack
+    uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
+    if (HasFP) MinSize += SlotSize;
+    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
+    MFI->setStackSize(StackSize);
+  }
+
+  // Insert stack pointer adjustment for later moving of return addr.  Only
+  // applies to tail call optimized functions where the callee argument stack
+  // size is bigger than the callers.
+  if (TailCallReturnAddrDelta < 0) {
+    MachineInstr *MI =
+      BuildMI(MBB, MBBI, DL,
+              TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)),
+              StackPtr)
+        .addReg(StackPtr)
+        .addImm(-TailCallReturnAddrDelta)
+        .setMIFlag(MachineInstr::FrameSetup);
+    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+  }
+
+  // Mapping for machine moves:
+  //
+  //   DST: VirtualFP AND
+  //        SRC: VirtualFP              => DW_CFA_def_cfa_offset
+  //        ELSE                        => DW_CFA_def_cfa
+  //
+  //   SRC: VirtualFP AND
+  //        DST: Register               => DW_CFA_def_cfa_register
+  //
+  //   ELSE
+  //        OFFSET < 0                  => DW_CFA_offset_extended_sf
+  //        REG < 64                    => DW_CFA_offset + Reg
+  //        ELSE                        => DW_CFA_offset_extended
+
+  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  uint64_t NumBytes = 0;
+  int stackGrowth = -SlotSize;
+
+  if (HasFP) {
+    // Calculate required stack adjustment.
+    uint64_t FrameSize = StackSize - SlotSize;
+    if (RegInfo->needsStackRealignment(MF)) {
+      // Callee-saved registers are pushed on stack before the stack
+      // is realigned.
+      FrameSize -= X86FI->getCalleeSavedFrameSize();
+      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+    } else {
+      NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
+    }
+
+    // Get the offset of the stack slot for the EBP register, which is
+    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+    // Update the frame offset adjustment.
+    MFI->setOffsetAdjustment(-NumBytes);
+
+    // Save EBP/RBP into the appropriate stack slot.
+    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+      .addReg(FramePtr, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    if (needsFrameMoves) {
+      // Mark the place where EBP/RBP was saved.
+      MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+        .addSym(FrameLabel);
+
+      // Define the current CFA rule to use the provided offset.
+      if (StackSize) {
+        MachineLocation SPDst(MachineLocation::VirtualFP);
+        MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth);
+        Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+      } else {
+        MachineLocation SPDst(StackPtr);
+        MachineLocation SPSrc(StackPtr, stackGrowth);
+        Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+      }
+
+      // Change the rule for the FramePtr to be an "offset" rule.
+      MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth);
+      MachineLocation FPSrc(FramePtr);
+      Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+    }
+
+    // Update EBP with the new base value.
+    BuildMI(MBB, MBBI, DL,
+            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+        .addReg(StackPtr)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+    if (needsFrameMoves) {
+      // Mark effective beginning of when frame pointer becomes valid.
+      MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+        .addSym(FrameLabel);
+
+      // Define the current CFA to use the EBP/RBP register.
+      MachineLocation FPDst(FramePtr);
+      MachineLocation FPSrc(MachineLocation::VirtualFP);
+      Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+    }
+
+    // Mark the FramePtr as live-in in every block except the entry.
+    for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+         I != E; ++I)
+      I->addLiveIn(FramePtr);
+  } else {
+    NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+  }
+
+  // Skip the callee-saved push instructions.
+  bool PushedRegs = false;
+  int StackOffset = 2 * stackGrowth;
+
+  while (MBBI != MBB.end() &&
+         (MBBI->getOpcode() == X86::PUSH32r ||
+          MBBI->getOpcode() == X86::PUSH64r)) {
+    PushedRegs = true;
+    MBBI->setFlag(MachineInstr::FrameSetup);
+    ++MBBI;
+
+    if (!HasFP && needsFrameMoves) {
+      // Mark callee-saved push instruction.
+      MCSymbol *Label = MMI.getContext().CreateTempSymbol();
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
+
+      // Define the current CFA rule to use the provided offset.
+      unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr;
+      MachineLocation SPDst(Ptr);
+      MachineLocation SPSrc(Ptr, StackOffset);
+      Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+      StackOffset += stackGrowth;
+    }
+  }
+
+  // Realign stack after we pushed callee-saved registers (so that we'll be
+  // able to calculate their offsets from the frame pointer).
+
+  // NOTE: We push the registers before realigning the stack, so
+  // vector callee-saved (xmm) registers may be saved w/o proper
+  // alignment in this way. However, currently these regs are saved in
+  // stack slots (see X86FrameLowering::spillCalleeSavedRegisters()), so
+  // this shouldn't be a problem.
+  if (RegInfo->needsStackRealignment(MF)) {
+    assert(HasFP && "There should be a frame pointer if stack is realigned.");
+    MachineInstr *MI =
+      BuildMI(MBB, MBBI, DL,
+              TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+      .addReg(StackPtr)
+      .addImm(-MaxAlign)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    // The EFLAGS implicit def is dead.
+    MI->getOperand(3).setIsDead();
+  }
+
+  // If there is an SUB32ri of ESP immediately before this instruction, merge
+  // the two. This can be the case when tail call elimination is enabled and
+  // the callee has more arguments then the caller.
+  NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
+
+  // If there is an ADD32ri or SUB32ri of ESP immediately after this
+  // instruction, merge the two instructions.
+  mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
+
+  // Adjust stack pointer: ESP -= numbytes.
+
+  // Windows and cygwin/mingw require a prologue helper routine when allocating
+  // more than 4K bytes on the stack.  Windows uses __chkstk and cygwin/mingw
+  // uses __alloca.  __alloca and the 32-bit version of __chkstk will probe the
+  // stack and adjust the stack pointer in one go.  The 64-bit version of
+  // __chkstk is only responsible for probing the stack.  The 64-bit prologue is
+  // responsible for adjusting the stack pointer.  Touching the stack at 4K
+  // increments is necessary to ensure that the guard pages used by the OS
+  // virtual memory manager are allocated in correct sequence.
+  if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) {
+    const char *StackProbeSymbol;
+    bool isSPUpdateNeeded = false;
+
+    if (Is64Bit) {
+      if (STI.isTargetCygMing())
+        StackProbeSymbol = "___chkstk";
+      else {
+        StackProbeSymbol = "__chkstk";
+        isSPUpdateNeeded = true;
+      }
+    } else if (STI.isTargetCygMing())
+      StackProbeSymbol = "_alloca";
+    else
+      StackProbeSymbol = "_chkstk";
+
+    // Check whether EAX is livein for this function.
+    bool isEAXAlive = isEAXLiveIn(MF);
+
+    if (isEAXAlive) {
+      // Sanity check that EAX is not livein for this function.
+      // It should not be, so throw an assert.
+      assert(!Is64Bit && "EAX is livein in x64 case!");
+
+      // Save EAX
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
+        .addReg(X86::EAX, RegState::Kill)
+        .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    if (Is64Bit) {
+      // Handle the 64-bit Windows ABI case where we need to call __chkstk.
+      // Function prologue is responsible for adjusting the stack pointer.
+      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
+        .addImm(NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
+    } else {
+      // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
+      // We'll also use 4 already allocated bytes for EAX.
+      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+        .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+        .setMIFlag(MachineInstr::FrameSetup);
+    }
+
+    BuildMI(MBB, MBBI, DL,
+            TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32))
+      .addExternalSymbol(StackProbeSymbol)
+      .addReg(StackPtr,    RegState::Define | RegState::Implicit)
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
+      .setMIFlag(MachineInstr::FrameSetup);
+
+    // MSVC x64's __chkstk needs to adjust %rsp.
+    // FIXME: %rax preserves the offset and should be available.
+    if (isSPUpdateNeeded)
+      emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
+                   UseLEA, TII, *RegInfo);
+
+    if (isEAXAlive) {
+        // Restore EAX
+        MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
+                                                X86::EAX),
+                                        StackPtr, false, NumBytes - 4);
+        MI->setFlag(MachineInstr::FrameSetup);
+        MBB.insert(MBBI, MI);
+    }
+  } else if (NumBytes)
+    emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
+                 UseLEA, TII, *RegInfo);
+
+  // If we need a base pointer, set it up here. It's whatever the value
+  // of the stack pointer is at this point. Any variable size objects
+  // will be allocated after this, so we can still use the base pointer
+  // to reference locals.
+  if (RegInfo->hasBasePointer(MF)) {
+    // Update the frame pointer with the current stack pointer.
+    unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr;
+    BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
+      .addReg(StackPtr)
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
+    // Mark end of stack pointer adjustment.
+    MCSymbol *Label = MMI.getContext().CreateTempSymbol();
+    BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+      .addSym(Label);
+
+    if (!HasFP && NumBytes) {
+      // Define the current CFA rule to use the provided offset.
+      if (StackSize) {
+        MachineLocation SPDst(MachineLocation::VirtualFP);
+        MachineLocation SPSrc(MachineLocation::VirtualFP,
+                              -StackSize + stackGrowth);
+        Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+      } else {
+        MachineLocation SPDst(StackPtr);
+        MachineLocation SPSrc(StackPtr, stackGrowth);
+        Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+      }
+    }
+
+    // Emit DWARF info specifying the offsets of the callee-saved registers.
+    if (PushedRegs)
+      emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr);
+  }
+
+  // Darwin 10.7 and greater has support for compact unwind encoding.
+  if (STI.getTargetTriple().isMacOSX() &&
+      !STI.getTargetTriple().isMacOSXVersionLT(10, 7))
+    MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF));
+}
+
+void X86FrameLowering::emitEpilogue(MachineFunction &MF,
+                                    MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+  const X86InstrInfo &TII = *TM.getInstrInfo();
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI != MBB.end() && "Returning block has no instructions");
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc DL = MBBI->getDebugLoc();
+  bool Is64Bit = STI.is64Bit();
+  bool IsLP64 = STI.isTarget64BitLP64();
+  bool UseLEA = STI.useLeaForSP();
+  unsigned StackAlign = getStackAlignment();
+  unsigned SlotSize = RegInfo->getSlotSize();
+  unsigned FramePtr = RegInfo->getFrameRegister(MF);
+  unsigned StackPtr = RegInfo->getStackRegister();
+
+  switch (RetOpcode) {
+  default:
+    llvm_unreachable("Can only insert epilog into returning blocks");
+  case X86::RET:
+  case X86::RETI:
+  case X86::TCRETURNdi:
+  case X86::TCRETURNri:
+  case X86::TCRETURNmi:
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64:
+    break;  // These are ok
+  }
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  uint64_t StackSize = MFI->getStackSize();
+  uint64_t MaxAlign  = MFI->getMaxAlignment();
+  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = 0;
+
+  // If we're forcing a stack realignment we can't rely on just the frame
+  // info, we need to know the ABI stack alignment as well in case we
+  // have a call out.  Otherwise just make sure we have some alignment - we'll
+  // go with the minimum.
+  if (ForceStackAlign) {
+    if (MFI->hasCalls())
+      MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
+    else
+      MaxAlign = MaxAlign ? MaxAlign : 4;
+  }
+
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment.
+    uint64_t FrameSize = StackSize - SlotSize;
+    if (RegInfo->needsStackRealignment(MF)) {
+      // Callee-saved registers were pushed on stack before the stack
+      // was realigned.
+      FrameSize -= CSSize;
+      NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
+    } else {
+      NumBytes = FrameSize - CSSize;
+    }
+
+    // Pop EBP.
+    BuildMI(MBB, MBBI, DL,
+            TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+  } else {
+    NumBytes = StackSize - CSSize;
+  }
+
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = prior(MBBI);
+    unsigned Opc = PI->getOpcode();
+
+    if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE &&
+        !PI->isTerminator())
+      break;
+
+    --MBBI;
+  }
+  MachineBasicBlock::iterator FirstCSPop = MBBI;
+
+  DL = MBBI->getDebugLoc();
+
+  // If there is an ADD32ri or SUB32ri of ESP immediately before this
+  // instruction, merge the two instructions.
+  if (NumBytes || MFI->hasVarSizedObjects())
+    mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+  // If dynamic alloca is used, then reset esp to point to the last callee-saved
+  // slot before popping them off! Same applies for the case, when stack was
+  // realigned.
+  if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
+    if (RegInfo->needsStackRealignment(MF))
+      MBBI = FirstCSPop;
+    if (CSSize != 0) {
+      unsigned Opc = getLEArOpcode(IsLP64);
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
+                   FramePtr, false, -CSSize);
+    } else {
+      unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+        .addReg(FramePtr);
+    }
+  } else if (NumBytes) {
+    // Adjust stack pointer back: ESP += numbytes.
+    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA,
+                 TII, *RegInfo);
+  }
+
+  // We're returning from function via eh_return.
+  if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &DestAddr  = MBBI->getOperand(0);
+    assert(DestAddr.isReg() && "Offset should be in register!");
+    BuildMI(MBB, MBBI, DL,
+            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
+            StackPtr).addReg(DestAddr.getReg());
+  } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
+             RetOpcode == X86::TCRETURNmi ||
+             RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 ||
+             RetOpcode == X86::TCRETURNmi64) {
+    bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64;
+    // Tail call return: adjust the stack pointer and jump to callee.
+    MBBI = MBB.getLastNonDebugInstr();
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
+    assert(StackAdjust.isImm() && "Expecting immediate value.");
+
+    // Adjust stack pointer.
+    int StackAdj = StackAdjust.getImm();
+    int MaxTCDelta = X86FI->getTCReturnAddrDelta();
+    int Offset = 0;
+    assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
+
+    // Incoporate the retaddr area.
+    Offset = StackAdj-MaxTCDelta;
+    assert(Offset >= 0 && "Offset should never be negative");
+
+    if (Offset) {
+      // Check for possible merge with preceding ADD instruction.
+      Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+      emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64,
+                   UseLEA, TII, *RegInfo);
+    }
+
+    // Jump to label or value in register.
+    if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) {
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi)
+                                       ? X86::TAILJMPd : X86::TAILJMPd64));
+      if (JumpTarget.isGlobal())
+        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
+                             JumpTarget.getTargetFlags());
+      else {
+        assert(JumpTarget.isSymbol());
+        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
+                              JumpTarget.getTargetFlags());
+      }
+    } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) {
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi)
+                                       ? X86::TAILJMPm : X86::TAILJMPm64));
+      for (unsigned i = 0; i != 5; ++i)
+        MIB.addOperand(MBBI->getOperand(i));
+    } else if (RetOpcode == X86::TCRETURNri64) {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)).
+        addReg(JumpTarget.getReg(), RegState::Kill);
+    } else {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)).
+        addReg(JumpTarget.getReg(), RegState::Kill);
+    }
+
+    MachineInstr *NewMI = prior(MBBI);
+    NewMI->copyImplicitOps(MF, MBBI);
+
+    // Delete the pseudo instruction TCRETURN.
+    MBB.erase(MBBI);
+  } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
+             (X86FI->getTCReturnAddrDelta() < 0)) {
+    // Add the return addr area delta back since we are not tail calling.
+    int delta = -1*X86FI->getTCReturnAddrDelta();
+    MBBI = MBB.getLastNonDebugInstr();
+
+    // Check for possible merge with preceding ADD instruction.
+    delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
+    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII,
+                 *RegInfo);
+  }
+}
+
+int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
+  uint64_t StackSize = MFI->getStackSize();
+
+  if (RegInfo->hasBasePointer(MF)) {
+    assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
+    if (FI < 0) {
+      // Skip the saved EBP.
+      return Offset + RegInfo->getSlotSize();
+    } else {
+      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+      return Offset + StackSize;
+    }
+  } else if (RegInfo->needsStackRealignment(MF)) {
+    if (FI < 0) {
+      // Skip the saved EBP.
+      return Offset + RegInfo->getSlotSize();
+    } else {
+      assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
+      return Offset + StackSize;
+    }
+    // FIXME: Support tail calls
+  } else {
+    if (!hasFP(MF))
+      return Offset + StackSize;
+
+    // Skip the saved EBP.
+    Offset += RegInfo->getSlotSize();
+
+    // Skip the RETADDR move area
+    const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+    if (TailCallReturnAddrDelta < 0)
+      Offset -= TailCallReturnAddrDelta;
+  }
+
+  return Offset;
+}
+
+int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                             unsigned &FrameReg) const {
+  const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
+  // We can't calculate offset from frame pointer if the stack is realigned,
+  // so enforce usage of stack/base pointer.  The base pointer is used when we
+  // have dynamic allocas in addition to dynamic realignment.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  else if (RegInfo->needsStackRealignment(MF))
+    FrameReg = RegInfo->getStackRegister();
+  else
+    FrameReg = RegInfo->getFrameRegister(MF);
+  return getFrameIndexOffset(MF, FI);
+}
+
+bool X86FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  MachineFunction &MF = *MBB.getParent();
+
+  unsigned SlotSize = STI.is64Bit() ? 8 : 4;
+  unsigned FPReg = TRI->getFrameRegister(MF);
+  unsigned CalleeFrameSize = 0;
+
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+
+  // Push GPRs. It increases frame size.
+  unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (!X86::GR64RegClass.contains(Reg) &&
+        !X86::GR32RegClass.contains(Reg))
+      continue;
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    if (Reg == FPReg)
+      // X86RegisterInfo::emitPrologue will handle spilling of frame register.
+      continue;
+    CalleeFrameSize += SlotSize;
+    BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill)
+      .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  X86FI->setCalleeSavedFrameSize(CalleeFrameSize);
+
+  // Make XMM regs spilled. X86 does not have ability of push/pop XMM.
+  // It can be done by spilling XMMs to stack frame.
+  // Note that only Win64 ABI might spill XMMs.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (X86::GR64RegClass.contains(Reg) ||
+        X86::GR32RegClass.contains(Reg))
+      continue;
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i-1].getFrameIdx(),
+                            RC, TRI);
+  }
+
+  return true;
+}
+
+bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  // Reload XMMs from stack frame.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (X86::GR64RegClass.contains(Reg) ||
+        X86::GR32RegClass.contains(Reg))
+      continue;
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(),
+                             RC, TRI);
+  }
+
+  // POP GPRs.
+  unsigned FPReg = TRI->getFrameRegister(MF);
+  unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    if (!X86::GR64RegClass.contains(Reg) &&
+        !X86::GR32RegClass.contains(Reg))
+      continue;
+    if (Reg == FPReg)
+      // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
+      continue;
+    BuildMI(MBB, MI, DL, TII.get(Opc), Reg);
+  }
+  return true;
+}
+
+void
+X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                   RegScavenger *RS) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+  unsigned SlotSize = RegInfo->getSlotSize();
+
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+  if (TailCallReturnAddrDelta < 0) {
+    // create RETURNADDR area
+    //   arg
+    //   arg
+    //   RETADDR
+    //   { ...
+    //     RETADDR area
+    //     ...
+    //   }
+    //   [EBP]
+    MFI->CreateFixedObject(-TailCallReturnAddrDelta,
+                           (-1U*SlotSize)+TailCallReturnAddrDelta, true);
+  }
+
+  if (hasFP(MF)) {
+    assert((TailCallReturnAddrDelta <= 0) &&
+           "The Delta should always be zero or negative");
+    const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
+
+    // Create a frame entry for the EBP register that must be saved.
+    int FrameIdx = MFI->CreateFixedObject(SlotSize,
+                                          -(int)SlotSize +
+                                          TFI.getOffsetOfLocalArea() +
+                                          TailCallReturnAddrDelta,
+                                          true);
+    assert(FrameIdx == MFI->getObjectIndexBegin() &&
+           "Slot for EBP register must be last in order to be found!");
+    (void)FrameIdx;
+  }
+
+  // Spill the BasePtr if it's used.
+  if (RegInfo->hasBasePointer(MF))
+    MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
+}
+
+static bool
+HasNestArgument(const MachineFunction *MF) {
+  const Function *F = MF->getFunction();
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; I++) {
+    if (I->hasNestAttr())
+      return true;
+  }
+  return false;
+}
+
+/// GetScratchRegister - Get a temp register for performing work in the
+/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
+/// and the properties of the function either one or two registers will be
+/// needed. Set primary to true for the first register, false for the second.
+static unsigned
+GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) {
+  CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+
+  // Erlang stuff.
+  if (CallingConvention == CallingConv::HiPE) {
+    if (Is64Bit)
+      return Primary ? X86::R14 : X86::R13;
+    else
+      return Primary ? X86::EBX : X86::EDI;
+  }
+
+  if (Is64Bit)
+    return Primary ? X86::R11 : X86::R12;
+
+  bool IsNested = HasNestArgument(&MF);
+
+  if (CallingConvention == CallingConv::X86_FastCall ||
+      CallingConvention == CallingConv::Fast) {
+    if (IsNested)
+      report_fatal_error("Segmented stacks does not support fastcall with "
+                         "nested function.");
+    return Primary ? X86::EAX : X86::ECX;
+  }
+  if (IsNested)
+    return Primary ? X86::EDX : X86::EAX;
+  return Primary ? X86::ECX : X86::EAX;
+}
+
+// The stack limit in the TCB is set to this many bytes above the actual stack
+// limit.
+static const uint64_t kSplitStackAvailable = 256;
+
+void
+X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+  MachineBasicBlock &prologueMBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const X86InstrInfo &TII = *TM.getInstrInfo();
+  uint64_t StackSize;
+  bool Is64Bit = STI.is64Bit();
+  unsigned TlsReg, TlsOffset;
+  DebugLoc DL;
+
+  unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+  assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+         "Scratch register is live-in");
+
+  if (MF.getFunction()->isVarArg())
+    report_fatal_error("Segmented stacks do not support vararg functions.");
+  if (!STI.isTargetLinux() && !STI.isTargetDarwin() &&
+      !STI.isTargetWin32() && !STI.isTargetFreeBSD())
+    report_fatal_error("Segmented stacks not supported on this platform.");
+
+  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  bool IsNested = false;
+
+  // We need to know if the function has a nest argument only in 64 bit mode.
+  if (Is64Bit)
+    IsNested = HasNestArgument(&MF);
+
+  // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+  // allocMBB needs to be last (terminating) instruction.
+
+  for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
+         e = prologueMBB.livein_end(); i != e; i++) {
+    allocMBB->addLiveIn(*i);
+    checkMBB->addLiveIn(*i);
+  }
+
+  if (IsNested)
+    allocMBB->addLiveIn(X86::R10);
+
+  MF.push_front(allocMBB);
+  MF.push_front(checkMBB);
+
+  // Eventually StackSize will be calculated by a link-time pass; which will
+  // also decide whether checking code needs to be injected into this particular
+  // prologue.
+  StackSize = MFI->getStackSize();
+
+  // When the frame size is less than 256 we just compare the stack
+  // boundary directly to the value of the stack pointer, per gcc.
+  bool CompareStackPointer = StackSize < kSplitStackAvailable;
+
+  // Read the limit off the current stacklet off the stack_guard location.
+  if (Is64Bit) {
+    if (STI.isTargetLinux()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x70;
+    } else if (STI.isTargetDarwin()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
+    } else if (STI.isTargetFreeBSD()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x18;
+    } else {
+      report_fatal_error("Segmented stacks not supported on this platform.");
+    }
+
+    if (CompareStackPointer)
+      ScratchReg = X86::RSP;
+    else
+      BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP)
+        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+    BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg)
+      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+  } else {
+    if (STI.isTargetLinux()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x30;
+    } else if (STI.isTargetDarwin()) {
+      TlsReg = X86::GS;
+      TlsOffset = 0x48 + 90*4;
+    } else if (STI.isTargetWin32()) {
+      TlsReg = X86::FS;
+      TlsOffset = 0x14; // pvArbitrary, reserved for application use
+    } else if (STI.isTargetFreeBSD()) {
+      report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
+    } else {
+      report_fatal_error("Segmented stacks not supported on this platform.");
+    }
+
+    if (CompareStackPointer)
+      ScratchReg = X86::ESP;
+    else
+      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
+        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
+
+    if (STI.isTargetLinux() || STI.isTargetWin32()) {
+      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+        .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+    } else if (STI.isTargetDarwin()) {
+
+      // TlsOffset doesn't fit into a mod r/m byte so we need an extra register
+      unsigned ScratchReg2;
+      bool SaveScratch2;
+      if (CompareStackPointer) {
+        // The primary scratch register is available for holding the TLS offset
+        ScratchReg2 = GetScratchRegister(Is64Bit, MF, true);
+        SaveScratch2 = false;
+      } else {
+        // Need to use a second register to hold the TLS offset
+        ScratchReg2 = GetScratchRegister(Is64Bit, MF, false);
+
+        // Unfortunately, with fastcc the second scratch register may hold an arg
+        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
+      }
+
+      // If Scratch2 is live-in then it needs to be saved
+      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
+             "Scratch register is live-in and not saved");
+
+      if (SaveScratch2)
+        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
+          .addReg(ScratchReg2, RegState::Kill);
+
+      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
+        .addImm(TlsOffset);
+      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
+        .addReg(ScratchReg)
+        .addReg(ScratchReg2).addImm(1).addReg(0)
+        .addImm(0)
+        .addReg(TlsReg);
+
+      if (SaveScratch2)
+        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
+    }
+  }
+
+  // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
+  // It jumps to normal execution of the function body.
+  BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB);
+
+  // On 32 bit we first push the arguments size and then the frame size. On 64
+  // bit, we pass the stack frame size in r10 and the argument size in r11.
+  if (Is64Bit) {
+    // Functions with nested arguments use R10, so it needs to be saved across
+    // the call to _morestack
+
+    if (IsNested)
+      BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10);
+
+    BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10)
+      .addImm(StackSize);
+    BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11)
+      .addImm(X86FI->getArgumentStackSize());
+    MF.getRegInfo().setPhysRegUsed(X86::R10);
+    MF.getRegInfo().setPhysRegUsed(X86::R11);
+  } else {
+    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+      .addImm(X86FI->getArgumentStackSize());
+    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+      .addImm(StackSize);
+  }
+
+  // __morestack is in libgcc
+  if (Is64Bit)
+    BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+      .addExternalSymbol("__morestack");
+  else
+    BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+      .addExternalSymbol("__morestack");
+
+  if (IsNested)
+    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
+  else
+    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
+
+  allocMBB->addSuccessor(&prologueMBB);
+
+  checkMBB->addSuccessor(allocMBB);
+  checkMBB->addSuccessor(&prologueMBB);
+
+#ifdef XDEBUG
+  MF.verify();
+#endif
+}
+
+/// Erlang programs may need a special prologue to handle the stack size they
+/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom implementation of hybrid stack/heap architecture.
+/// (for more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+///
+/// CheckStack:
+///	  temp0 = sp - MaxStack
+///	  if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// OldStart:
+///	  ...
+/// IncStack:
+///	  call inc_stack   # doubles the stack space
+///	  temp0 = sp - MaxStack
+///	  if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
+  const X86InstrInfo &TII = *TM.getInstrInfo();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize();
+  const bool Is64Bit = STI.is64Bit();
+  DebugLoc DL;
+  // HiPE-specific values
+  const unsigned HipeLeafWords = 24;
+  const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+  const unsigned Guaranteed = HipeLeafWords * SlotSize;
+  unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
+                            MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
+  unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
+
+  assert(STI.isTargetLinux() &&
+         "HiPE prologue is only supported on Linux operating systems.");
+
+  // Compute the largest caller's frame that is needed to fit the callees'
+  // frames. This 'MaxStack' is computed from:
+  //
+  // a) the fixed frame size, which is the space needed for all spilled temps,
+  // b) outgoing on-stack parameter areas, and
+  // c) the minimum stack space this function needs to make available for the
+  //    functions it calls (a tunable ABI property).
+  if (MFI->hasCalls()) {
+    unsigned MoreStackForCalls = 0;
+
+    for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
+         MBBI != MBBE; ++MBBI)
+      for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
+           MI != ME; ++MI) {
+        if (!MI->isCall())
+          continue;
+
+        // Get callee operand.
+        const MachineOperand &MO = MI->getOperand(0);
+
+        // Only take account of global function calls (no closures etc.).
+        if (!MO.isGlobal())
+          continue;
+
+        const Function *F = dyn_cast<Function>(MO.getGlobal());
+        if (!F)
+          continue;
+
+        // Do not update 'MaxStack' for primitive and built-in functions
+        // (encoded with names either starting with "erlang."/"bif_" or not
+        // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+        // "_", such as the BIF "suspend_0") as they are executed on another
+        // stack.
+        if (F->getName().find("erlang.") != StringRef::npos ||
+            F->getName().find("bif_") != StringRef::npos ||
+            F->getName().find_first_of("._") == StringRef::npos)
+          continue;
+
+        unsigned CalleeStkArity =
+          F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+        if (HipeLeafWords - 1 > CalleeStkArity)
+          MoreStackForCalls = std::max(MoreStackForCalls,
+                               (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+      }
+    MaxStack += MoreStackForCalls;
+  }
+
+  // If the stack frame needed is larger than the guaranteed then runtime checks
+  // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue.
+  if (MaxStack > Guaranteed) {
+    MachineBasicBlock &prologueMBB = MF.front();
+    MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+    MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+    for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(),
+           E = prologueMBB.livein_end(); I != E; I++) {
+      stackCheckMBB->addLiveIn(*I);
+      incStackMBB->addLiveIn(*I);
+    }
+
+    MF.push_front(incStackMBB);
+    MF.push_front(stackCheckMBB);
+
+    unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+    unsigned LEAop, CMPop, CALLop;
+    if (Is64Bit) {
+      SPReg = X86::RSP;
+      PReg  = X86::RBP;
+      LEAop = X86::LEA64r;
+      CMPop = X86::CMP64rm;
+      CALLop = X86::CALL64pcrel32;
+      SPLimitOffset = 0x90;
+    } else {
+      SPReg = X86::ESP;
+      PReg  = X86::EBP;
+      LEAop = X86::LEA32r;
+      CMPop = X86::CMP32rm;
+      CALLop = X86::CALLpcrel32;
+      SPLimitOffset = 0x4c;
+    }
+
+    ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+    assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+           "HiPE prologue scratch register is live-in");
+
+    // Create new MBB for StackCheck:
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    // SPLimitOffset is in a fixed heap location (pointed by BP).
+    addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB);
+
+    // Create new MBB for IncStack:
+    BuildMI(incStackMBB, DL, TII.get(CALLop)).
+      addExternalSymbol("inc_stack_0");
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+                 SPReg, false, -MaxStack);
+    addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+                 .addReg(ScratchReg), PReg, false, SPLimitOffset);
+    BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB);
+
+    stackCheckMBB->addSuccessor(&prologueMBB, 99);
+    stackCheckMBB->addSuccessor(incStackMBB, 1);
+    incStackMBB->addSuccessor(&prologueMBB, 99);
+    incStackMBB->addSuccessor(incStackMBB, 1);
+  }
+#ifdef XDEBUG
+  MF.verify();
+#endif
+}
+
+void X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const X86InstrInfo &TII = *TM.getInstrInfo();
+  const X86RegisterInfo &RegInfo = *TM.getRegisterInfo();
+  unsigned StackPtr = RegInfo.getStackRegister();
+  bool reseveCallFrame = hasReservedCallFrame(MF);
+  int Opcode = I->getOpcode();
+  bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+  bool IsLP64 = STI.isTarget64BitLP64();
+  DebugLoc DL = I->getDebugLoc();
+  uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
+  uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+  I = MBB.erase(I);
+
+  if (!reseveCallFrame) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackup instruction into a 'sub ESP, <amt>' and the
+    // adjcallstackdown instruction into 'add ESP, <amt>'
+    // TODO: consider using push / pop instead of sub + store / add
+    if (Amount == 0)
+      return;
+
+    // We need to keep the stack aligned properly.  To do this, we round the
+    // amount of space needed for the outgoing arguments up to the next
+    // alignment boundary.
+    unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+    Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
+
+    MachineInstr *New = 0;
+    if (Opcode == TII.getCallFrameSetupOpcode()) {
+      New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
+                    StackPtr)
+        .addReg(StackPtr)
+        .addImm(Amount);
+    } else {
+      assert(Opcode == TII.getCallFrameDestroyOpcode());
+
+      // Factor out the amount the callee already popped.
+      Amount -= CalleeAmt;
+
+      if (Amount) {
+        unsigned Opc = getADDriOpcode(IsLP64, Amount);
+        New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+          .addReg(StackPtr).addImm(Amount);
+      }
+    }
+
+    if (New) {
+      // The EFLAGS implicit def is dead.
+      New->getOperand(3).setIsDead();
+
+      // Replace the pseudo instruction with a new instruction.
+      MBB.insert(I, New);
+    }
+
+    return;
+  }
+
+  if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.  We do this until we have
+    // more advanced stack pointer tracking ability.
+    unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+    MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+      .addReg(StackPtr).addImm(CalleeAmt);
+
+    // The EFLAGS implicit def is dead.
+    New->getOperand(3).setIsDead();
+
+    // We are not tracking the stack pointer adjustment by the callee, so make
+    // sure we restore the stack pointer immediately after the call, there may
+    // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+    MachineBasicBlock::iterator B = MBB.begin();
+    while (I != B && !llvm::prior(I)->isCall())
+      --I;
+    MBB.insert(I, New);
+  }
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
new file mode 100644
index 000000000000..3f08b9a2e8d2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -0,0 +1,76 @@
+//===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements X86-specific bits of TargetFrameLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_FRAMELOWERING_H
+#define X86_FRAMELOWERING_H
+
+#include "X86Subtarget.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+  class MCSymbol;
+  class X86TargetMachine;
+
+class X86FrameLowering : public TargetFrameLowering {
+  const X86TargetMachine &TM;
+  const X86Subtarget &STI;
+public:
+  explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti)
+    : TargetFrameLowering(StackGrowsDown,
+                          sti.getStackAlignment(),
+                          (sti.is64Bit() ? -8 : -4)),
+      TM(tm), STI(sti) {
+  }
+
+  void emitCalleeSavedFrameMoves(MachineFunction &MF, MCSymbol *Label,
+                                 unsigned FramePtr) const;
+
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  void adjustForSegmentedStacks(MachineFunction &MF) const;
+
+  void adjustForHiPEPrologue(MachineFunction &MF) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS = NULL) const;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
+
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const;
+  uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 000000000000..6041669f8182
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,2728 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+//                      Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+  /// SDValue's instead of register numbers for the leaves of the matched
+  /// tree.
+  struct X86ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    // This is really a union, discriminated by BaseType!
+    SDValue Base_Reg;
+    int Base_FrameIndex;
+
+    unsigned Scale;
+    SDValue IndexReg;
+    int32_t Disp;
+    SDValue Segment;
+    const GlobalValue *GV;
+    const Constant *CP;
+    const BlockAddress *BlockAddr;
+    const char *ES;
+    int JT;
+    unsigned Align;    // CP alignment.
+    unsigned char SymbolFlags;  // X86II::MO_*
+
+    X86ISelAddressMode()
+      : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
+        Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0),
+        SymbolFlags(X86II::MO_NO_FLAG) {
+    }
+
+    bool hasSymbolicDisplacement() const {
+      return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0;
+    }
+
+    bool hasBaseOrIndexReg() const {
+      return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
+    }
+
+    /// isRIPRelative - Return true if this addressing mode is already RIP
+    /// relative.
+    bool isRIPRelative() const {
+      if (BaseType != RegBase) return false;
+      if (RegisterSDNode *RegNode =
+            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
+        return RegNode->getReg() == X86::RIP;
+      return false;
+    }
+
+    void setBaseReg(SDValue Reg) {
+      BaseType = RegBase;
+      Base_Reg = Reg;
+    }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump() {
+      dbgs() << "X86ISelAddressMode " << this << '\n';
+      dbgs() << "Base_Reg ";
+      if (Base_Reg.getNode() != 0)
+        Base_Reg.getNode()->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
+             << " Scale" << Scale << '\n'
+             << "IndexReg ";
+      if (IndexReg.getNode() != 0)
+        IndexReg.getNode()->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " Disp " << Disp << '\n'
+             << "GV ";
+      if (GV)
+        GV->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " CP ";
+      if (CP)
+        CP->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << '\n'
+             << "ES ";
+      if (ES)
+        dbgs() << ES;
+      else
+        dbgs() << "nul";
+      dbgs() << " JT" << JT << " Align" << Align << '\n';
+    }
+#endif
+  };
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// ISel - X86 specific code to select X86 machine instructions for
+  /// SelectionDAG operations.
+  ///
+  class X86DAGToDAGISel : public SelectionDAGISel {
+    /// X86Lowering - This object fully describes how to lower LLVM code to an
+    /// X86-specific SelectionDAG.
+    const X86TargetLowering &X86Lowering;
+
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+
+    /// OptForSize - If true, selector should try to optimize for code size
+    /// instead of performance.
+    bool OptForSize;
+
+  public:
+    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel),
+        X86Lowering(*tm.getTargetLowering()),
+        Subtarget(&tm.getSubtarget<X86Subtarget>()),
+        OptForSize(false) {}
+
+    virtual const char *getPassName() const {
+      return "X86 DAG->DAG Instruction Selection";
+    }
+
+    virtual void EmitFunctionEntryCode();
+
+    virtual bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const;
+
+    virtual void PreprocessISelDAG();
+
+    inline bool immSext8(SDNode *N) const {
+      return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue());
+    }
+
+    // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+    // sign extended field.
+    inline bool i64immSExt32(SDNode *N) const {
+      uint64_t v = cast<ConstantSDNode>(N)->getZExtValue();
+      return (int64_t)v == (int32_t)v;
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+  private:
+    SDNode *Select(SDNode *N);
+    SDNode *SelectGather(SDNode *N, unsigned Opc);
+    SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+    SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
+
+    bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
+    bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
+    bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
+    bool MatchAddress(SDValue N, X86ISelAddressMode &AM);
+    bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+                                 unsigned Depth);
+    bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
+    bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                    SDValue &Scale, SDValue &Index, SDValue &Disp,
+                    SDValue &Segment);
+    bool SelectLEAAddr(SDValue N, SDValue &Base,
+                       SDValue &Scale, SDValue &Index, SDValue &Disp,
+                       SDValue &Segment);
+    bool SelectTLSADDRAddr(SDValue N, SDValue &Base,
+                           SDValue &Scale, SDValue &Index, SDValue &Disp,
+                           SDValue &Segment);
+    bool SelectScalarSSELoad(SDNode *Root, SDValue N,
+                             SDValue &Base, SDValue &Scale,
+                             SDValue &Index, SDValue &Disp,
+                             SDValue &Segment,
+                             SDValue &NodeWithChain);
+
+    bool TryFoldLoad(SDNode *P, SDValue N,
+                     SDValue &Base, SDValue &Scale,
+                     SDValue &Index, SDValue &Disp,
+                     SDValue &Segment);
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDValue> &OutOps);
+
+    void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+    inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base,
+                                   SDValue &Scale, SDValue &Index,
+                                   SDValue &Disp, SDValue &Segment) {
+      Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+        CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) :
+        AM.Base_Reg;
+      Scale = getI8Imm(AM.Scale);
+      Index = AM.IndexReg;
+      // These are 32-bit even in 64-bit mode since RIP relative offset
+      // is 32-bit.
+      if (AM.GV)
+        Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(),
+                                              MVT::i32, AM.Disp,
+                                              AM.SymbolFlags);
+      else if (AM.CP)
+        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+                                             AM.Align, AM.Disp, AM.SymbolFlags);
+      else if (AM.ES) {
+        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
+        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+      } else if (AM.JT != -1) {
+        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
+        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+      } else if (AM.BlockAddr)
+        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+                                             AM.SymbolFlags);
+      else
+        Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
+
+      if (AM.Segment.getNode())
+        Segment = AM.Segment;
+      else
+        Segment = CurDAG->getRegister(0, MVT::i32);
+    }
+
+    /// getI8Imm - Return a target constant with the specified value, of type
+    /// i8.
+    inline SDValue getI8Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i8);
+    }
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getGlobalBaseReg - Return an SDNode that returns the value of
+    /// the global base register. Output instructions required to
+    /// initialize the global base register, if necessary.
+    ///
+    SDNode *getGlobalBaseReg();
+
+    /// getTargetMachine - Return a reference to the TargetMachine, casted
+    /// to the target-specific type.
+    const X86TargetMachine &getTargetMachine() const {
+      return static_cast<const X86TargetMachine &>(TM);
+    }
+
+    /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
+    /// to the target-specific type.
+    const X86InstrInfo *getInstrInfo() const {
+      return getTargetMachine().getInstrInfo();
+    }
+  };
+}
+
+
+bool
+X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
+  if (OptLevel == CodeGenOpt::None) return false;
+
+  if (!N.hasOneUse())
+    return false;
+
+  if (N.getOpcode() != ISD::LOAD)
+    return true;
+
+  // If N is a load, do additional profitability checks.
+  if (U == Root) {
+    switch (U->getOpcode()) {
+    default: break;
+    case X86ISD::ADD:
+    case X86ISD::SUB:
+    case X86ISD::AND:
+    case X86ISD::XOR:
+    case X86ISD::OR:
+    case ISD::ADD:
+    case ISD::ADDC:
+    case ISD::ADDE:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR: {
+      SDValue Op1 = U->getOperand(1);
+
+      // If the other operand is a 8-bit immediate we should fold the immediate
+      // instead. This reduces code size.
+      // e.g.
+      // movl 4(%esp), %eax
+      // addl $4, %eax
+      // vs.
+      // movl $4, %eax
+      // addl 4(%esp), %eax
+      // The former is 2 bytes shorter. In case where the increment is 1, then
+      // the saving can be 4 bytes (by using incl %eax).
+      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+        if (Imm->getAPIntValue().isSignedIntN(8))
+          return false;
+
+      // If the other operand is a TLS address, we should fold it instead.
+      // This produces
+      // movl    %gs:0, %eax
+      // leal    i@NTPOFF(%eax), %eax
+      // instead of
+      // movl    $i@NTPOFF, %eax
+      // addl    %gs:0, %eax
+      // if the block also has an access to a second TLS address this will save
+      // a load.
+      // FIXME: This is probably also true for non TLS addresses.
+      if (Op1.getOpcode() == X86ISD::Wrapper) {
+        SDValue Val = Op1.getOperand(0);
+        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+          return false;
+      }
+    }
+    }
+  }
+
+  return true;
+}
+
+/// MoveBelowCallOrigChain - Replace the original chain operand of the call with
+/// load's chain operand and move load below the call's chain operand.
+static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
+                               SDValue Call, SDValue OrigChain) {
+  SmallVector<SDValue, 8> Ops;
+  SDValue Chain = OrigChain.getOperand(0);
+  if (Chain.getNode() == Load.getNode())
+    Ops.push_back(Load.getOperand(0));
+  else {
+    assert(Chain.getOpcode() == ISD::TokenFactor &&
+           "Unexpected chain operand");
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+      if (Chain.getOperand(i).getNode() == Load.getNode())
+        Ops.push_back(Load.getOperand(0));
+      else
+        Ops.push_back(Chain.getOperand(i));
+    SDValue NewChain =
+      CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+                      MVT::Other, &Ops[0], Ops.size());
+    Ops.clear();
+    Ops.push_back(NewChain);
+  }
+  for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i)
+    Ops.push_back(OrigChain.getOperand(i));
+  CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size());
+  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
+                             Load.getOperand(1), Load.getOperand(2));
+
+  unsigned NumOps = Call.getNode()->getNumOperands();
+  Ops.clear();
+  Ops.push_back(SDValue(Load.getNode(), 1));
+  for (unsigned i = 1, e = NumOps; i != e; ++i)
+    Ops.push_back(Call.getOperand(i));
+  CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps);
+}
+
+/// isCalleeLoad - Return true if call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+/// In the case of a tail call, there isn't a callseq node between the call
+/// chain and the load.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+  // The transformation is somewhat dangerous if the call's chain was glued to
+  // the call. After MoveBelowOrigChain the load is moved between the call and
+  // the chain, this can create a cycle if the load is not folded. So it is
+  // *really* important that we are sure the load will be folded.
+  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+    return false;
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+  if (!LD ||
+      LD->isVolatile() ||
+      LD->getAddressingMode() != ISD::UNINDEXED ||
+      LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  // Now let's find the callseq_start.
+  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
+    if (!Chain.hasOneUse())
+      return false;
+    Chain = Chain.getOperand(0);
+  }
+
+  if (!Chain.getNumOperands())
+    return false;
+  // Since we are not checking for AA here, conservatively abort if the chain
+  // writes to memory. It's not safe to move the callee (a load) across a store.
+  if (isa<MemSDNode>(Chain.getNode()) &&
+      cast<MemSDNode>(Chain.getNode())->writeMem())
+    return false;
+  if (Chain.getOperand(0).getNode() == Callee.getNode())
+    return true;
+  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+      Callee.getValue(1).hasOneUse())
+    return true;
+  return false;
+}
+
+void X86DAGToDAGISel::PreprocessISelDAG() {
+  // OptForSize is used in pattern predicates that isel is matching.
+  OptForSize = MF->getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
+
+    if (OptLevel != CodeGenOpt::None &&
+        // Only does this when target favors doesn't favor register indirect
+        // call.
+        ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
+         (N->getOpcode() == X86ISD::TC_RETURN &&
+          // Only does this if load can be folded into TC_RETURN.
+          (Subtarget->is64Bit() ||
+           getTargetMachine().getRelocationModel() != Reloc::PIC_)))) {
+      /// Also try moving call address load from outside callseq_start to just
+      /// before the call to allow it to be folded.
+      ///
+      ///     [Load chain]
+      ///         ^
+      ///         |
+      ///       [Load]
+      ///       ^    ^
+      ///       |    |
+      ///      /      \--
+      ///     /          |
+      ///[CALLSEQ_START] |
+      ///     ^          |
+      ///     |          |
+      /// [LOAD/C2Reg]   |
+      ///     |          |
+      ///      \        /
+      ///       \      /
+      ///       [CALL]
+      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
+      SDValue Chain = N->getOperand(0);
+      SDValue Load  = N->getOperand(1);
+      if (!isCalleeLoad(Load, Chain, HasCallSeq))
+        continue;
+      MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
+      ++NumLoadMoved;
+      continue;
+    }
+
+    // Lower fpround and fpextend nodes that target the FP stack to be store and
+    // load to the stack.  This is a gross hack.  We would like to simply mark
+    // these as being illegal, but when we do that, legalize produces these when
+    // it expands calls, then expands these in the same legalize pass.  We would
+    // like dag combine to be able to hack on these between the call expansion
+    // and the node legalization.  As such this pass basically does "really
+    // late" legalization of these inline with the X86 isel pass.
+    // FIXME: This should only happen when not compiled with -O0.
+    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+      continue;
+
+    EVT SrcVT = N->getOperand(0).getValueType();
+    EVT DstVT = N->getValueType(0);
+
+    // If any of the sources are vectors, no fp stack involved.
+    if (SrcVT.isVector() || DstVT.isVector())
+      continue;
+
+    // If the source and destination are SSE registers, then this is a legal
+    // conversion that should not be lowered.
+    bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
+    bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+    if (SrcIsSSE && DstIsSSE)
+      continue;
+
+    if (!SrcIsSSE && !DstIsSSE) {
+      // If this is an FPStack extension, it is a noop.
+      if (N->getOpcode() == ISD::FP_EXTEND)
+        continue;
+      // If this is a value-preserving FPStack truncation, it is a noop.
+      if (N->getConstantOperandVal(1))
+        continue;
+    }
+
+    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+    // FPStack has extload and truncstore.  SSE can fold direct loads into other
+    // operations.  Based on this, decide what we want to do.
+    EVT MemVT;
+    if (N->getOpcode() == ISD::FP_ROUND)
+      MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+    else
+      MemVT = SrcIsSSE ? SrcVT : DstVT;
+
+    SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+    DebugLoc dl = N->getDebugLoc();
+
+    // FIXME: optimize the case where the src/dest is a load or store?
+    SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
+                                          N->getOperand(0),
+                                          MemTmp, MachinePointerInfo(), MemVT,
+                                          false, false, 0);
+    SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+                                        MachinePointerInfo(),
+                                        MemVT, false, false, 0);
+
+    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+    // extload we created.  This will cause general havok on the dag because
+    // anything below the conversion could be folded into other existing nodes.
+    // To avoid invalidating 'I', back it up to the convert node.
+    --I;
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+
+    // Now that we did that, the node is dead.  Increment the iterator to the
+    // next node to process, then delete N.
+    ++I;
+    CurDAG->DeleteNode(N);
+  }
+}
+
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+                                             MachineFrameInfo *MFI) {
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  if (Subtarget->isTargetCygMing()) {
+    unsigned CallOp =
+      Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+    BuildMI(BB, DebugLoc(),
+            TII->get(CallOp)).addExternalSymbol("__main");
+  }
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode() {
+  // If this is main, emit special code for main.
+  if (const Function *Fn = MF->getFunction())
+    if (Fn->hasExternalLinkage() && Fn->getName() == "main")
+      EmitSpecialCodeForMain(MF->begin(), MF->getFrameInfo());
+}
+
+static bool isDispSafeForFrameIndex(int64_t Val) {
+  // On 64-bit platforms, we can run into an issue where a frame index
+  // includes a displacement that, when added to the explicit displacement,
+  // will overflow the displacement field. Assuming that the frame index
+  // displacement fits into a 31-bit integer  (which is only slightly more
+  // aggressive than the current fundamental assumption that it fits into
+  // a 32-bit integer), a 31-bit disp should always be safe.
+  return isInt<31>(Val);
+}
+
+bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset,
+                                            X86ISelAddressMode &AM) {
+  int64_t Val = AM.Disp + Offset;
+  CodeModel::Model M = TM.getCodeModel();
+  if (Subtarget->is64Bit()) {
+    if (!X86::isOffsetSuitableForCodeModel(Val, M,
+                                           AM.hasSymbolicDisplacement()))
+      return true;
+    // In addition to the checks required for a register base, check that
+    // we do not try to use an unsafe Disp with a frame index.
+    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
+        !isDispSafeForFrameIndex(Val))
+      return true;
+  }
+  AM.Disp = Val;
+  return false;
+
+}
+
+bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
+  SDValue Address = N->getOperand(1);
+
+  // load gs:0 -> GS segment register.
+  // load fs:0 -> FS segment register.
+  //
+  // This optimization is valid because the GNU TLS model defines that
+  // gs:0 (or fs:0 on X86-64) contains its own address.
+  // For more information see http://people.redhat.com/drepper/tls.pdf
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
+    if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 &&
+        Subtarget->isTargetLinux())
+      switch (N->getPointerInfo().getAddrSpace()) {
+      case 256:
+        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+        return false;
+      case 257:
+        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+        return false;
+      }
+
+  return true;
+}
+
+/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes
+/// into an addressing mode.  These wrap things that will resolve down into a
+/// symbol reference.  If no match is possible, this returns true, otherwise it
+/// returns false.
+bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
+  // If the addressing mode already has a symbol as the displacement, we can
+  // never match another symbol.
+  if (AM.hasSymbolicDisplacement())
+    return true;
+
+  SDValue N0 = N.getOperand(0);
+  CodeModel::Model M = TM.getCodeModel();
+
+  // Handle X86-64 rip-relative addresses.  We check this before checking direct
+  // folding because RIP is preferable to non-RIP accesses.
+  if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
+      // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+      // they cannot be folded into immediate fields.
+      // FIXME: This can be improved for kernel and other models?
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    // Base and index reg must be 0 in order to use %rip as base.
+    if (AM.hasBaseOrIndexReg())
+      return true;
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.GV = G->getGlobal();
+      AM.SymbolFlags = G->getTargetFlags();
+      if (FoldOffsetIntoAddress(G->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.CP = CP->getConstVal();
+      AM.Align = CP->getAlignment();
+      AM.SymbolFlags = CP->getTargetFlags();
+      if (FoldOffsetIntoAddress(CP->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+      AM.ES = S->getSymbol();
+      AM.SymbolFlags = S->getTargetFlags();
+    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+      AM.JT = J->getIndex();
+      AM.SymbolFlags = J->getTargetFlags();
+    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+      X86ISelAddressMode Backup = AM;
+      AM.BlockAddr = BA->getBlockAddress();
+      AM.SymbolFlags = BA->getTargetFlags();
+      if (FoldOffsetIntoAddress(BA->getOffset(), AM)) {
+        AM = Backup;
+        return true;
+      }
+    } else
+      llvm_unreachable("Unhandled symbol reference node.");
+
+    if (N.getOpcode() == X86ISD::WrapperRIP)
+      AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+    return false;
+  }
+
+  // Handle the case when globals fit in our immediate field: This is true for
+  // X86-32 always and X86-64 when in -mcmodel=small mode.  In 64-bit
+  // mode, this only applies to a non-RIP-relative computation.
+  if (!Subtarget->is64Bit() ||
+      M == CodeModel::Small || M == CodeModel::Kernel) {
+    assert(N.getOpcode() != X86ISD::WrapperRIP &&
+           "RIP-relative addressing already handled");
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+      AM.GV = G->getGlobal();
+      AM.Disp += G->getOffset();
+      AM.SymbolFlags = G->getTargetFlags();
+    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+      AM.CP = CP->getConstVal();
+      AM.Align = CP->getAlignment();
+      AM.Disp += CP->getOffset();
+      AM.SymbolFlags = CP->getTargetFlags();
+    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+      AM.ES = S->getSymbol();
+      AM.SymbolFlags = S->getTargetFlags();
+    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+      AM.JT = J->getIndex();
+      AM.SymbolFlags = J->getTargetFlags();
+    } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+      AM.BlockAddr = BA->getBlockAddress();
+      AM.Disp += BA->getOffset();
+      AM.SymbolFlags = BA->getTargetFlags();
+    } else
+      llvm_unreachable("Unhandled symbol reference node.");
+    return false;
+  }
+
+  return true;
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done.  This just pattern matches for the
+/// addressing mode.
+bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
+  if (MatchAddressRecursively(N, AM, 0))
+    return true;
+
+  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+  // a smaller encoding and avoids a scaled-index.
+  if (AM.Scale == 2 &&
+      AM.BaseType == X86ISelAddressMode::RegBase &&
+      AM.Base_Reg.getNode() == 0) {
+    AM.Base_Reg = AM.IndexReg;
+    AM.Scale = 1;
+  }
+
+  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+  // because it has a smaller encoding.
+  // TODO: Which other code models can use this?
+  if (TM.getCodeModel() == CodeModel::Small &&
+      Subtarget->is64Bit() &&
+      AM.Scale == 1 &&
+      AM.BaseType == X86ISelAddressMode::RegBase &&
+      AM.Base_Reg.getNode() == 0 &&
+      AM.IndexReg.getNode() == 0 &&
+      AM.SymbolFlags == X86II::MO_NO_FLAG &&
+      AM.hasSymbolicDisplacement())
+    AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+
+  return false;
+}
+
+// Insert a node into the DAG at least before the Pos node's position. This
+// will reposition the node as needed, and will assign it a node ID that is <=
+// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
+// IDs! The selection DAG must no longer depend on their uniqueness when this
+// is used.
+static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
+  if (N.getNode()->getNodeId() == -1 ||
+      N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
+    DAG.RepositionNode(Pos.getNode(), N.getNode());
+    N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+  }
+}
+
+// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This
+// allows us to convert the shift and and into an h-register extract and
+// a scaled index. Returns false if the simplification is performed.
+static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
+                                      uint64_t Mask,
+                                      SDValue Shift, SDValue X,
+                                      X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SRL ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+      !Shift.hasOneUse())
+    return true;
+
+  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
+  if (ScaleLog <= 0 || ScaleLog >= 4 ||
+      Mask != (0xffu << ScaleLog))
+    return true;
+
+  EVT VT = N.getValueType();
+  DebugLoc DL = N.getDebugLoc();
+  SDValue Eight = DAG.getConstant(8, MVT::i8);
+  SDValue NewMask = DAG.getConstant(0xff, VT);
+  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
+  SDValue ShlCount = DAG.getConstant(ScaleLog, MVT::i8);
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, Eight);
+  InsertDAGNode(DAG, N, Srl);
+  InsertDAGNode(DAG, N, NewMask);
+  InsertDAGNode(DAG, N, And);
+  InsertDAGNode(DAG, N, ShlCount);
+  InsertDAGNode(DAG, N, Shl);
+  DAG.ReplaceAllUsesWith(N, Shl);
+  AM.IndexReg = And;
+  AM.Scale = (1 << ScaleLog);
+  return false;
+}
+
+// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
+// allows us to fold the shift into this addressing mode. Returns false if the
+// transform succeeded.
+static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
+                                        uint64_t Mask,
+                                        SDValue Shift, SDValue X,
+                                        X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SHL ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)))
+    return true;
+
+  // Not likely to be profitable if either the AND or SHIFT node has more
+  // than one use (unless all uses are for address computation). Besides,
+  // isel mechanism requires their node ids to be reused.
+  if (!N.hasOneUse() || !Shift.hasOneUse())
+    return true;
+
+  // Verify that the shift amount is something we can fold.
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
+    return true;
+
+  EVT VT = N.getValueType();
+  DebugLoc DL = N.getDebugLoc();
+  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
+  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, NewMask);
+  InsertDAGNode(DAG, N, NewAnd);
+  InsertDAGNode(DAG, N, NewShift);
+  DAG.ReplaceAllUsesWith(N, NewShift);
+
+  AM.Scale = 1 << ShiftAmt;
+  AM.IndexReg = NewAnd;
+  return false;
+}
+
+// Implement some heroics to detect shifts of masked values where the mask can
+// be replaced by extending the shift and undoing that in the addressing mode
+// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
+// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
+// the addressing mode. This results in code such as:
+//
+//   int f(short *y, int *lookup_table) {
+//     ...
+//     return *y + lookup_table[*y >> 11];
+//   }
+//
+// Turning into:
+//   movzwl (%rdi), %eax
+//   movl %eax, %ecx
+//   shrl $11, %ecx
+//   addl (%rsi,%rcx,4), %eax
+//
+// Instead of:
+//   movzwl (%rdi), %eax
+//   movl %eax, %ecx
+//   shrl $9, %ecx
+//   andl $124, %rcx
+//   addl (%rsi,%rcx), %eax
+//
+// Note that this function assumes the mask is provided as a mask *after* the
+// value is shifted. The input chain may or may not match that, but computing
+// such a mask is trivial.
+static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
+                                    uint64_t Mask,
+                                    SDValue Shift, SDValue X,
+                                    X86ISelAddressMode &AM) {
+  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)))
+    return true;
+
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+  unsigned MaskLZ = CountLeadingZeros_64(Mask);
+  unsigned MaskTZ = CountTrailingZeros_64(Mask);
+
+  // The amount of shift we're trying to fit into the addressing mode is taken
+  // from the trailing zeros of the mask.
+  unsigned AMShiftAmt = MaskTZ;
+
+  // There is nothing we can do here unless the mask is removing some bits.
+  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+
+  // We also need to ensure that mask is a continuous run of bits.
+  if (CountTrailingOnes_64(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
+
+  // Scale the leading zero count down based on the actual size of the value.
+  // Also scale it down based on the size of the shift.
+  MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt;
+
+  // The final check is to ensure that any masked out high bits of X are
+  // already known to be zero. Otherwise, the mask has a semantic impact
+  // other than masking out a couple of low bits. Unfortunately, because of
+  // the mask, zero extensions will be removed from operands in some cases.
+  // This code works extra hard to look through extensions because we can
+  // replace them with zero extensions cheaply if necessary.
+  bool ReplacingAnyExtend = false;
+  if (X.getOpcode() == ISD::ANY_EXTEND) {
+    unsigned ExtendBits =
+      X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits();
+    // Assume that we'll replace the any-extend with a zero-extend, and
+    // narrow the search to the extended value.
+    X = X.getOperand(0);
+    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
+    ReplacingAnyExtend = true;
+  }
+  APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(),
+                                               MaskLZ);
+  APInt KnownZero, KnownOne;
+  DAG.ComputeMaskedBits(X, KnownZero, KnownOne);
+  if (MaskedHighBits != KnownZero) return true;
+
+  // We've identified a pattern that can be transformed into a single shift
+  // and an addressing mode. Make it so.
+  EVT VT = N.getValueType();
+  if (ReplacingAnyExtend) {
+    assert(X.getValueType() != VT);
+    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
+    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X);
+    InsertDAGNode(DAG, N, NewX);
+    X = NewX;
+  }
+  DebugLoc DL = N.getDebugLoc();
+  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8);
+  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
+  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  InsertDAGNode(DAG, N, NewSRLAmt);
+  InsertDAGNode(DAG, N, NewSRL);
+  InsertDAGNode(DAG, N, NewSHLAmt);
+  InsertDAGNode(DAG, N, NewSHL);
+  DAG.ReplaceAllUsesWith(N, NewSHL);
+
+  AM.Scale = 1 << AMShiftAmt;
+  AM.IndexReg = NewSRL;
+  return false;
+}
+
+bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+                                              unsigned Depth) {
+  DebugLoc dl = N.getDebugLoc();
+  DEBUG({
+      dbgs() << "MatchAddress: ";
+      AM.dump();
+    });
+  // Limit recursion.
+  if (Depth > 5)
+    return MatchAddressBase(N, AM);
+
+  // If this is already a %rip relative address, we can only merge immediates
+  // into it.  Instead of handling this in every case, we handle it here.
+  // RIP relative addressing: %rip + 32-bit displacement!
+  if (AM.isRIPRelative()) {
+    // FIXME: JumpTable and ExternalSymbol address currently don't like
+    // displacements.  It isn't very important, but this should be fixed for
+    // consistency.
+    if (!AM.ES && AM.JT != -1) return true;
+
+    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
+      if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM))
+        return false;
+    return true;
+  }
+
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    if (!FoldOffsetIntoAddress(Val, AM))
+      return false;
+    break;
+  }
+
+  case X86ISD::Wrapper:
+  case X86ISD::WrapperRIP:
+    if (!MatchWrapper(N, AM))
+      return false;
+    break;
+
+  case ISD::LOAD:
+    if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM))
+      return false;
+    break;
+
+  case ISD::FrameIndex:
+    if (AM.BaseType == X86ISelAddressMode::RegBase &&
+        AM.Base_Reg.getNode() == 0 &&
+        (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
+      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::SHL:
+    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1)
+      break;
+
+    if (ConstantSDNode
+          *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+      unsigned Val = CN->getZExtValue();
+      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
+      // that the base operand remains free for further matching. If
+      // the base doesn't end up getting used, a post-processing step
+      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
+      if (Val == 1 || Val == 2 || Val == 3) {
+        AM.Scale = 1 << Val;
+        SDValue ShVal = N.getNode()->getOperand(0);
+
+        // Okay, we know that we have a scale by now.  However, if the scaled
+        // value is an add of something and a constant, we can fold the
+        // constant into the disp field here.
+        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
+          AM.IndexReg = ShVal.getNode()->getOperand(0);
+          ConstantSDNode *AddVal =
+            cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
+          if (!FoldOffsetIntoAddress(Disp, AM))
+            return false;
+        }
+
+        AM.IndexReg = ShVal;
+        return false;
+      }
+    }
+    break;
+
+  case ISD::SRL: {
+    // Scale must not be used already.
+    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+
+    SDValue And = N.getOperand(0);
+    if (And.getOpcode() != ISD::AND) break;
+    SDValue X = And.getOperand(0);
+
+    // We only handle up to 64-bit values here as those are what matter for
+    // addressing mode optimizations.
+    if (X.getValueSizeInBits() > 64) break;
+
+    // The mask used for the transform is expected to be post-shift, but we
+    // found the shift first so just apply the shift to the mask before passing
+    // it down.
+    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
+        !isa<ConstantSDNode>(And.getOperand(1)))
+      break;
+    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
+
+    // Try to fold the mask and shift into the scale, and return false if we
+    // succeed.
+    if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
+      return false;
+    break;
+  }
+
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI:
+    // A mul_lohi where we need the low part can be folded as a plain multiply.
+    if (N.getResNo() != 0) break;
+    // FALL THROUGH
+  case ISD::MUL:
+  case X86ISD::MUL_IMM:
+    // X*[3,5,9] -> X+X*[2,4,8]
+    if (AM.BaseType == X86ISelAddressMode::RegBase &&
+        AM.Base_Reg.getNode() == 0 &&
+        AM.IndexReg.getNode() == 0) {
+      if (ConstantSDNode
+            *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
+            CN->getZExtValue() == 9) {
+          AM.Scale = unsigned(CN->getZExtValue())-1;
+
+          SDValue MulVal = N.getNode()->getOperand(0);
+          SDValue Reg;
+
+          // Okay, we know that we have a scale by now.  However, if the scaled
+          // value is an add of something and a constant, we can fold the
+          // constant into the disp field here.
+          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+              isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
+            Reg = MulVal.getNode()->getOperand(0);
+            ConstantSDNode *AddVal =
+              cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
+            if (FoldOffsetIntoAddress(Disp, AM))
+              Reg = N.getNode()->getOperand(0);
+          } else {
+            Reg = N.getNode()->getOperand(0);
+          }
+
+          AM.IndexReg = AM.Base_Reg = Reg;
+          return false;
+        }
+    }
+    break;
+
+  case ISD::SUB: {
+    // Given A-B, if A can be completely folded into the address and
+    // the index field with the index field unused, use -B as the index.
+    // This is a win if a has multiple parts that can be folded into
+    // the address. Also, this saves a mov if the base register has
+    // other uses, since it avoids a two-address sub instruction, however
+    // it costs an additional mov if the index register has other uses.
+
+    // Add an artificial use to this node so that we can keep track of
+    // it if it gets CSE'd with a different node.
+    HandleSDNode Handle(N);
+
+    // Test if the LHS of the sub can be folded.
+    X86ISelAddressMode Backup = AM;
+    if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+      AM = Backup;
+      break;
+    }
+    // Test if the index field is free for use.
+    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
+      AM = Backup;
+      break;
+    }
+
+    int Cost = 0;
+    SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+    // If the RHS involves a register with multiple uses, this
+    // transformation incurs an extra mov, due to the neg instruction
+    // clobbering its operand.
+    if (!RHS.getNode()->hasOneUse() ||
+        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
+        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
+        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
+        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
+         RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+      ++Cost;
+    // If the base is a register with multiple uses, this
+    // transformation may save a mov.
+    if ((AM.BaseType == X86ISelAddressMode::RegBase &&
+         AM.Base_Reg.getNode() &&
+         !AM.Base_Reg.getNode()->hasOneUse()) ||
+        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+      --Cost;
+    // If the folded LHS was interesting, this transformation saves
+    // address arithmetic.
+    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
+        ((AM.Disp != 0) && (Backup.Disp == 0)) +
+        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
+      --Cost;
+    // If it doesn't look like it may be an overall win, don't do it.
+    if (Cost >= 0) {
+      AM = Backup;
+      break;
+    }
+
+    // Ok, the transformation is legal and appears profitable. Go for it.
+    SDValue Zero = CurDAG->getConstant(0, N.getValueType());
+    SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+    AM.IndexReg = Neg;
+    AM.Scale = 1;
+
+    // Insert the new nodes into the topological ordering.
+    InsertDAGNode(*CurDAG, N, Zero);
+    InsertDAGNode(*CurDAG, N, Neg);
+    return false;
+  }
+
+  case ISD::ADD: {
+    // Add an artificial use to this node so that we can keep track of
+    // it if it gets CSE'd with a different node.
+    HandleSDNode Handle(N);
+
+    X86ISelAddressMode Backup = AM;
+    if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+        !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
+      return false;
+    AM = Backup;
+
+    // Try again after commuting the operands.
+    if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&&
+        !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
+      return false;
+    AM = Backup;
+
+    // If we couldn't fold both operands into the address at the same time,
+    // see if we can just put each operand into a register and fold at least
+    // the add.
+    if (AM.BaseType == X86ISelAddressMode::RegBase &&
+        !AM.Base_Reg.getNode() &&
+        !AM.IndexReg.getNode()) {
+      N = Handle.getValue();
+      AM.Base_Reg = N.getOperand(0);
+      AM.IndexReg = N.getOperand(1);
+      AM.Scale = 1;
+      return false;
+    }
+    N = Handle.getValue();
+    break;
+  }
+
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (CurDAG->isBaseWithConstantOffset(N)) {
+      X86ISelAddressMode Backup = AM;
+      ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1));
+
+      // Start with the LHS as an addr mode.
+      if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
+          !FoldOffsetIntoAddress(CN->getSExtValue(), AM))
+        return false;
+      AM = Backup;
+    }
+    break;
+
+  case ISD::AND: {
+    // Perform some heroic transforms on an and of a constant-count shift
+    // with a constant to enable use of the scaled offset field.
+
+    // Scale must not be used already.
+    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+
+    SDValue Shift = N.getOperand(0);
+    if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
+    SDValue X = Shift.getOperand(0);
+
+    // We only handle up to 64-bit values here as those are what matter for
+    // addressing mode optimizations.
+    if (X.getValueSizeInBits() > 64) break;
+
+    if (!isa<ConstantSDNode>(N.getOperand(1)))
+      break;
+    uint64_t Mask = N.getConstantOperandVal(1);
+
+    // Try to fold the mask and shift into an extract and scale.
+    if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
+      return false;
+
+    // Try to fold the mask and shift directly into the scale.
+    if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
+      return false;
+
+    // Try to swap the mask and shift to place shifts which can be done as
+    // a scale on the outside of the mask.
+    if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
+      return false;
+    break;
+  }
+  }
+
+  return MatchAddressBase(N, AM);
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+  // Is the base register already occupied?
+  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
+    // If so, check to see if the scale index register is set.
+    if (AM.IndexReg.getNode() == 0) {
+      AM.IndexReg = N;
+      AM.Scale = 1;
+      return false;
+    }
+
+    // Otherwise, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = X86ISelAddressMode::RegBase;
+  AM.Base_Reg = N;
+  return false;
+}
+
+/// SelectAddr - returns true if it is able pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+///
+/// Parent is the parent node of the addr operand that is being matched.  It
+/// is always a load, store, atomic node, or null.  It is only null when
+/// checking memory operands for inline asm nodes.
+bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
+                                 SDValue &Scale, SDValue &Index,
+                                 SDValue &Disp, SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  if (Parent &&
+      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
+      // that are not a MemSDNode, and thus don't have proper addrspace info.
+      Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
+      Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
+      Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+      Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
+      Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
+    unsigned AddrSpace =
+      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
+    // AddrSpace 256 -> GS, 257 -> FS.
+    if (AddrSpace == 256)
+      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
+    if (AddrSpace == 257)
+      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
+  }
+
+  if (MatchAddress(N, AM))
+    return false;
+
+  EVT VT = N.getValueType();
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (!AM.Base_Reg.getNode())
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  }
+
+  if (!AM.IndexReg.getNode())
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// SelectScalarSSELoad - Match a scalar SSE load.  In particular, we want to
+/// match a load whose top elements are either undef or zeros.  The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
+///
+/// We also return:
+///   PatternChainNode: this is the matched node that has a chain input and
+///   output.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
+                                          SDValue N, SDValue &Base,
+                                          SDValue &Scale, SDValue &Index,
+                                          SDValue &Disp, SDValue &Segment,
+                                          SDValue &PatternNodeWithChain) {
+  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    PatternNodeWithChain = N.getOperand(0);
+    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
+        PatternNodeWithChain.hasOneUse() &&
+        IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+        IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
+      if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+        return false;
+      return true;
+    }
+  }
+
+  // Also handle the case where we explicitly require zeros in the top
+  // elements.  This is a vector shuffle from the zero vector.
+  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+      // Check to see if the top elements are all zeros (or bitcast of zeros).
+      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      N.getOperand(0).getNode()->hasOneUse() &&
+      ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
+      N.getOperand(0).getOperand(0).hasOneUse() &&
+      IsProfitableToFold(N.getOperand(0), N.getNode(), Root) &&
+      IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) {
+    // Okay, this is a zero extending load.  Fold it.
+    LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+    if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+      return false;
+    PatternNodeWithChain = SDValue(LD, 0);
+    return true;
+  }
+  return false;
+}
+
+
+/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
+                                    SDValue &Base, SDValue &Scale,
+                                    SDValue &Index, SDValue &Disp,
+                                    SDValue &Segment) {
+  X86ISelAddressMode AM;
+
+  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
+  // segments.
+  SDValue Copy = AM.Segment;
+  SDValue T = CurDAG->getRegister(0, MVT::i32);
+  AM.Segment = T;
+  if (MatchAddress(N, AM))
+    return false;
+  assert (T == AM.Segment);
+  AM.Segment = Copy;
+
+  EVT VT = N.getValueType();
+  unsigned Complexity = 0;
+  if (AM.BaseType == X86ISelAddressMode::RegBase)
+    if (AM.Base_Reg.getNode())
+      Complexity = 1;
+    else
+      AM.Base_Reg = CurDAG->getRegister(0, VT);
+  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+    Complexity = 4;
+
+  if (AM.IndexReg.getNode())
+    Complexity++;
+  else
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
+  // a simple shift.
+  if (AM.Scale > 1)
+    Complexity++;
+
+  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+  // to a LEA. This is determined with some expermentation but is by no means
+  // optimal (especially for code size consideration). LEA is nice because of
+  // its three-address nature. Tweak the cost function again when we can run
+  // convertToThreeAddress() at register allocation time.
+  if (AM.hasSymbolicDisplacement()) {
+    // For X86-64, we should always use lea to materialize RIP relative
+    // addresses.
+    if (Subtarget->is64Bit())
+      Complexity = 4;
+    else
+      Complexity += 2;
+  }
+
+  if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
+    Complexity++;
+
+  // If it isn't worth using an LEA, reject it.
+  if (Complexity <= 2)
+    return false;
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base,
+                                        SDValue &Scale, SDValue &Index,
+                                        SDValue &Disp, SDValue &Segment) {
+  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+
+  X86ISelAddressMode AM;
+  AM.GV = GA->getGlobal();
+  AM.Disp += GA->getOffset();
+  AM.Base_Reg = CurDAG->getRegister(0, N.getValueType());
+  AM.SymbolFlags = GA->getTargetFlags();
+
+  if (N.getValueType() == MVT::i32) {
+    AM.Scale = 1;
+    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+  } else {
+    AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
+  }
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+
+bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
+                                  SDValue &Base, SDValue &Scale,
+                                  SDValue &Index, SDValue &Disp,
+                                  SDValue &Segment) {
+  if (!ISD::isNON_EXTLoad(N.getNode()) ||
+      !IsProfitableToFold(N, P, P) ||
+      !IsLegalToFold(N, P, P, OptLevel))
+    return false;
+
+  return SelectAddr(N.getNode(),
+                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
+/// getGlobalBaseReg - Return an SDNode that returns the value of
+/// the global base register. Output instructions required to
+/// initialize the global base register, if necessary.
+///
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+  SDValue Chain = Node->getOperand(0);
+  SDValue In1 = Node->getOperand(1);
+  SDValue In2L = Node->getOperand(2);
+  SDValue In2H = Node->getOperand(3);
+
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+    return NULL;
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+                                           MVT::i32, MVT::i32, MVT::Other, Ops,
+                                           array_lengthof(Ops));
+  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+  return ResNode;
+}
+
+/// Atomic opcode table
+///
+enum AtomicOpc {
+  ADD,
+  SUB,
+  INC,
+  DEC,
+  OR,
+  AND,
+  XOR,
+  AtomicOpcEnd
+};
+
+enum AtomicSz {
+  ConstantI8,
+  I8,
+  SextConstantI16,
+  ConstantI16,
+  I16,
+  SextConstantI32,
+  ConstantI32,
+  I32,
+  SextConstantI64,
+  ConstantI64,
+  I64,
+  AtomicSzEnd
+};
+
+static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
+  {
+    X86::LOCK_ADD8mi,
+    X86::LOCK_ADD8mr,
+    X86::LOCK_ADD16mi8,
+    X86::LOCK_ADD16mi,
+    X86::LOCK_ADD16mr,
+    X86::LOCK_ADD32mi8,
+    X86::LOCK_ADD32mi,
+    X86::LOCK_ADD32mr,
+    X86::LOCK_ADD64mi8,
+    X86::LOCK_ADD64mi32,
+    X86::LOCK_ADD64mr,
+  },
+  {
+    X86::LOCK_SUB8mi,
+    X86::LOCK_SUB8mr,
+    X86::LOCK_SUB16mi8,
+    X86::LOCK_SUB16mi,
+    X86::LOCK_SUB16mr,
+    X86::LOCK_SUB32mi8,
+    X86::LOCK_SUB32mi,
+    X86::LOCK_SUB32mr,
+    X86::LOCK_SUB64mi8,
+    X86::LOCK_SUB64mi32,
+    X86::LOCK_SUB64mr,
+  },
+  {
+    0,
+    X86::LOCK_INC8m,
+    0,
+    0,
+    X86::LOCK_INC16m,
+    0,
+    0,
+    X86::LOCK_INC32m,
+    0,
+    0,
+    X86::LOCK_INC64m,
+  },
+  {
+    0,
+    X86::LOCK_DEC8m,
+    0,
+    0,
+    X86::LOCK_DEC16m,
+    0,
+    0,
+    X86::LOCK_DEC32m,
+    0,
+    0,
+    X86::LOCK_DEC64m,
+  },
+  {
+    X86::LOCK_OR8mi,
+    X86::LOCK_OR8mr,
+    X86::LOCK_OR16mi8,
+    X86::LOCK_OR16mi,
+    X86::LOCK_OR16mr,
+    X86::LOCK_OR32mi8,
+    X86::LOCK_OR32mi,
+    X86::LOCK_OR32mr,
+    X86::LOCK_OR64mi8,
+    X86::LOCK_OR64mi32,
+    X86::LOCK_OR64mr,
+  },
+  {
+    X86::LOCK_AND8mi,
+    X86::LOCK_AND8mr,
+    X86::LOCK_AND16mi8,
+    X86::LOCK_AND16mi,
+    X86::LOCK_AND16mr,
+    X86::LOCK_AND32mi8,
+    X86::LOCK_AND32mi,
+    X86::LOCK_AND32mr,
+    X86::LOCK_AND64mi8,
+    X86::LOCK_AND64mi32,
+    X86::LOCK_AND64mr,
+  },
+  {
+    X86::LOCK_XOR8mi,
+    X86::LOCK_XOR8mr,
+    X86::LOCK_XOR16mi8,
+    X86::LOCK_XOR16mi,
+    X86::LOCK_XOR16mr,
+    X86::LOCK_XOR32mi8,
+    X86::LOCK_XOR32mi,
+    X86::LOCK_XOR32mr,
+    X86::LOCK_XOR64mi8,
+    X86::LOCK_XOR64mi32,
+    X86::LOCK_XOR64mr,
+  }
+};
+
+// Return the target constant operand for atomic-load-op and do simple
+// translations, such as from atomic-load-add to lock-sub. The return value is
+// one of the following 3 cases:
+// + target-constant, the operand could be supported as a target constant.
+// + empty, the operand is not needed any more with the new op selected.
+// + non-empty, otherwise.
+static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
+                                                DebugLoc dl,
+                                                enum AtomicOpc &Op, EVT NVT,
+                                                SDValue Val) {
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
+    int64_t CNVal = CN->getSExtValue();
+    // Quit if not 32-bit imm.
+    if ((int32_t)CNVal != CNVal)
+      return Val;
+    // For atomic-load-add, we could do some optimizations.
+    if (Op == ADD) {
+      // Translate to INC/DEC if ADD by 1 or -1.
+      if ((CNVal == 1) || (CNVal == -1)) {
+        Op = (CNVal == 1) ? INC : DEC;
+        // No more constant operand after being translated into INC/DEC.
+        return SDValue();
+      }
+      // Translate to SUB if ADD by negative value.
+      if (CNVal < 0) {
+        Op = SUB;
+        CNVal = -CNVal;
+      }
+    }
+    return CurDAG->getTargetConstant(CNVal, NVT);
+  }
+
+  // If the value operand is single-used, try to optimize it.
+  if (Op == ADD && Val.hasOneUse()) {
+    // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
+    if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
+      Op = SUB;
+      return Val.getOperand(1);
+    }
+    // A special case for i16, which needs truncating as, in most cases, it's
+    // promoted to i32. We will translate
+    // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x))
+    if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
+        Val.getOperand(0).getOpcode() == ISD::SUB &&
+        X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
+      Op = SUB;
+      Val = Val.getOperand(0);
+      return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
+                                            Val.getOperand(1));
+    }
+  }
+
+  return Val;
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
+  if (Node->hasAnyUseOfValue(0))
+    return 0;
+
+  DebugLoc dl = Node->getDebugLoc();
+
+  // Optimize common patterns for __sync_or_and_fetch and similar arith
+  // operations where the result is not used. This allows us to use the "lock"
+  // version of the arithmetic instruction.
+  SDValue Chain = Node->getOperand(0);
+  SDValue Ptr = Node->getOperand(1);
+  SDValue Val = Node->getOperand(2);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+    return 0;
+
+  // Which index into the table.
+  enum AtomicOpc Op;
+  switch (Node->getOpcode()) {
+    default:
+      return 0;
+    case ISD::ATOMIC_LOAD_OR:
+      Op = OR;
+      break;
+    case ISD::ATOMIC_LOAD_AND:
+      Op = AND;
+      break;
+    case ISD::ATOMIC_LOAD_XOR:
+      Op = XOR;
+      break;
+    case ISD::ATOMIC_LOAD_ADD:
+      Op = ADD;
+      break;
+  }
+  
+  Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val);
+  bool isUnOp = !Val.getNode();
+  bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
+
+  unsigned Opc = 0;
+  switch (NVT.getSimpleVT().SimpleTy) {
+    default: return 0;
+    case MVT::i8:
+      if (isCN)
+        Opc = AtomicOpcTbl[Op][ConstantI8];
+      else
+        Opc = AtomicOpcTbl[Op][I8];
+      break;
+    case MVT::i16:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI16];
+        else
+          Opc = AtomicOpcTbl[Op][ConstantI16];
+      } else
+        Opc = AtomicOpcTbl[Op][I16];
+      break;
+    case MVT::i32:
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI32];
+        else
+          Opc = AtomicOpcTbl[Op][ConstantI32];
+      } else
+        Opc = AtomicOpcTbl[Op][I32];
+      break;
+    case MVT::i64:
+      Opc = AtomicOpcTbl[Op][I64];
+      if (isCN) {
+        if (immSext8(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][SextConstantI64];
+        else if (i64immSExt32(Val.getNode()))
+          Opc = AtomicOpcTbl[Op][ConstantI64];
+      }
+      break;
+  }
+
+  assert(Opc != 0 && "Invalid arith lock transform!");
+
+  SDValue Ret;
+  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                 dl, NVT), 0);
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  if (isUnOp) {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
+    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
+                                         array_lengthof(Ops)), 0);
+  } else {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+    Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
+                                         array_lengthof(Ops)), 0);
+  }
+  cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
+  SDValue RetVals[] = { Undef, Ret };
+  return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+}
+
+/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
+/// any uses which require the SF or OF bits to be accurate.
+static bool HasNoSignedComparisonUses(SDNode *N) {
+  // Examine each user of the node.
+  for (SDNode::use_iterator UI = N->use_begin(),
+         UE = N->use_end(); UI != UE; ++UI) {
+    // Only examine CopyToReg uses.
+    if (UI->getOpcode() != ISD::CopyToReg)
+      return false;
+    // Only examine CopyToReg uses that copy to EFLAGS.
+    if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
+          X86::EFLAGS)
+      return false;
+    // Examine each user of the CopyToReg use.
+    for (SDNode::use_iterator FlagUI = UI->use_begin(),
+           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
+      // Only examine the Flag result.
+      if (FlagUI.getUse().getResNo() != 1) continue;
+      // Anything unusual: assume conservatively.
+      if (!FlagUI->isMachineOpcode()) return false;
+      // Examine the opcode of the user.
+      switch (FlagUI->getMachineOpcode()) {
+      // These comparisons don't treat the most significant bit specially.
+      case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
+      case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
+      case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
+      case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
+      case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4:
+      case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4:
+      case X86::CMOVA16rr: case X86::CMOVA16rm:
+      case X86::CMOVA32rr: case X86::CMOVA32rm:
+      case X86::CMOVA64rr: case X86::CMOVA64rm:
+      case X86::CMOVAE16rr: case X86::CMOVAE16rm:
+      case X86::CMOVAE32rr: case X86::CMOVAE32rm:
+      case X86::CMOVAE64rr: case X86::CMOVAE64rm:
+      case X86::CMOVB16rr: case X86::CMOVB16rm:
+      case X86::CMOVB32rr: case X86::CMOVB32rm:
+      case X86::CMOVB64rr: case X86::CMOVB64rm:
+      case X86::CMOVBE16rr: case X86::CMOVBE16rm:
+      case X86::CMOVBE32rr: case X86::CMOVBE32rm:
+      case X86::CMOVBE64rr: case X86::CMOVBE64rm:
+      case X86::CMOVE16rr: case X86::CMOVE16rm:
+      case X86::CMOVE32rr: case X86::CMOVE32rm:
+      case X86::CMOVE64rr: case X86::CMOVE64rm:
+      case X86::CMOVNE16rr: case X86::CMOVNE16rm:
+      case X86::CMOVNE32rr: case X86::CMOVNE32rm:
+      case X86::CMOVNE64rr: case X86::CMOVNE64rm:
+      case X86::CMOVNP16rr: case X86::CMOVNP16rm:
+      case X86::CMOVNP32rr: case X86::CMOVNP32rm:
+      case X86::CMOVNP64rr: case X86::CMOVNP64rm:
+      case X86::CMOVP16rr: case X86::CMOVP16rm:
+      case X86::CMOVP32rr: case X86::CMOVP32rm:
+      case X86::CMOVP64rr: case X86::CMOVP64rm:
+        continue;
+      // Anything else: assume conservatively.
+      default: return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode
+/// is suitable for doing the {load; increment or decrement; store} to modify
+/// transformation.
+static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
+                                SDValue StoredVal, SelectionDAG *CurDAG,
+                                LoadSDNode* &LoadNode, SDValue &InputChain) {
+
+  // is the value stored the result of a DEC or INC?
+  if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
+
+  // is the stored value result 0 of the load?
+  if (StoredVal.getResNo() != 0) return false;
+
+  // are there other uses of the loaded value than the inc or dec?
+  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
+
+  // is the store non-extending and non-indexed?
+  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
+    return false;
+
+  SDValue Load = StoredVal->getOperand(0);
+  // Is the stored value a non-extending and non-indexed load?
+  if (!ISD::isNormalLoad(Load.getNode())) return false;
+
+  // Return LoadNode by reference.
+  LoadNode = cast<LoadSDNode>(Load);
+  // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
+  EVT LdVT = LoadNode->getMemoryVT();
+  if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
+      LdVT != MVT::i8)
+    return false;
+
+  // Is store the only read of the loaded value?
+  if (!Load.hasOneUse())
+    return false;
+
+  // Is the address of the store the same as the load?
+  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
+      LoadNode->getOffset() != StoreNode->getOffset())
+    return false;
+
+  // Check if the chain is produced by the load or is a TokenFactor with
+  // the load output chain as an operand. Return InputChain by reference.
+  SDValue Chain = StoreNode->getChain();
+
+  bool ChainCheck = false;
+  if (Chain == Load.getValue(1)) {
+    ChainCheck = true;
+    InputChain = LoadNode->getChain();
+  } else if (Chain.getOpcode() == ISD::TokenFactor) {
+    SmallVector<SDValue, 4> ChainOps;
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
+      SDValue Op = Chain.getOperand(i);
+      if (Op == Load.getValue(1)) {
+        ChainCheck = true;
+        continue;
+      }
+
+      // Make sure using Op as part of the chain would not cause a cycle here.
+      // In theory, we could check whether the chain node is a predecessor of
+      // the load. But that can be very expensive. Instead visit the uses and
+      // make sure they all have smaller node id than the load.
+      int LoadId = LoadNode->getNodeId();
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = UI->use_end(); UI != UE; ++UI) {
+        if (UI.getUse().getResNo() != 0)
+          continue;
+        if (UI->getNodeId() > LoadId)
+          return false;
+      }
+
+      ChainOps.push_back(Op);
+    }
+
+    if (ChainCheck)
+      // Make a new TokenFactor with all the other input chains except
+      // for the load.
+      InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(),
+                                   MVT::Other, &ChainOps[0], ChainOps.size());
+  }
+  if (!ChainCheck)
+    return false;
+
+  return true;
+}
+
+/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory
+/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC.
+static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
+  if (Opc == X86ISD::DEC) {
+    if (LdVT == MVT::i64) return X86::DEC64m;
+    if (LdVT == MVT::i32) return X86::DEC32m;
+    if (LdVT == MVT::i16) return X86::DEC16m;
+    if (LdVT == MVT::i8)  return X86::DEC8m;
+  } else {
+    assert(Opc == X86ISD::INC && "unrecognized opcode");
+    if (LdVT == MVT::i64) return X86::INC64m;
+    if (LdVT == MVT::i32) return X86::INC32m;
+    if (LdVT == MVT::i16) return X86::INC16m;
+    if (LdVT == MVT::i8)  return X86::INC8m;
+  }
+  llvm_unreachable("unrecognized size for LdVT");
+}
+
+/// SelectGather - Customized ISel for GATHER operations.
+///
+SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
+  // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
+  SDValue Chain = Node->getOperand(0);
+  SDValue VSrc = Node->getOperand(2);
+  SDValue Base = Node->getOperand(3);
+  SDValue VIdx = Node->getOperand(4);
+  SDValue VMask = Node->getOperand(5);
+  ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
+  if (!Scale)
+    return 0;
+
+  SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
+                                   MVT::Other);
+
+  // Memory Operands: Base, Scale, Index, Disp, Segment
+  SDValue Disp = CurDAG->getTargetConstant(0, MVT::i32);
+  SDValue Segment = CurDAG->getRegister(0, MVT::i32);
+  const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
+                          Disp, Segment, VMask, Chain};
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+                                           VTs, Ops, array_lengthof(Ops));
+  // Node has 2 outputs: VDst and MVT::Other.
+  // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
+  // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
+  // of ResNode.
+  ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+  ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
+  return ResNode;
+}
+
+SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
+  EVT NVT = Node->getValueType(0);
+  unsigned Opc, MOpc;
+  unsigned Opcode = Node->getOpcode();
+  DebugLoc dl = Node->getDebugLoc();
+
+  DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
+
+  if (Node->isMachineOpcode()) {
+    DEBUG(dbgs() << "== ";  Node->dump(CurDAG); dbgs() << '\n');
+    return NULL;   // Already selected.
+  }
+
+  switch (Opcode) {
+  default: break;
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: break;
+    case Intrinsic::x86_avx2_gather_d_pd:
+    case Intrinsic::x86_avx2_gather_d_pd_256:
+    case Intrinsic::x86_avx2_gather_q_pd:
+    case Intrinsic::x86_avx2_gather_q_pd_256:
+    case Intrinsic::x86_avx2_gather_d_ps:
+    case Intrinsic::x86_avx2_gather_d_ps_256:
+    case Intrinsic::x86_avx2_gather_q_ps:
+    case Intrinsic::x86_avx2_gather_q_ps_256:
+    case Intrinsic::x86_avx2_gather_d_q:
+    case Intrinsic::x86_avx2_gather_d_q_256:
+    case Intrinsic::x86_avx2_gather_q_q:
+    case Intrinsic::x86_avx2_gather_q_q_256:
+    case Intrinsic::x86_avx2_gather_d_d:
+    case Intrinsic::x86_avx2_gather_d_d_256:
+    case Intrinsic::x86_avx2_gather_q_d:
+    case Intrinsic::x86_avx2_gather_q_d_256: {
+      unsigned Opc;
+      switch (IntNo) {
+      default: llvm_unreachable("Impossible intrinsic");
+      case Intrinsic::x86_avx2_gather_d_pd:     Opc = X86::VGATHERDPDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_pd:     Opc = X86::VGATHERQPDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
+      case Intrinsic::x86_avx2_gather_d_ps:     Opc = X86::VGATHERDPSrm;  break;
+      case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
+      case Intrinsic::x86_avx2_gather_q_ps:     Opc = X86::VGATHERQPSrm;  break;
+      case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
+      case Intrinsic::x86_avx2_gather_d_q:      Opc = X86::VPGATHERDQrm;  break;
+      case Intrinsic::x86_avx2_gather_d_q_256:  Opc = X86::VPGATHERDQYrm; break;
+      case Intrinsic::x86_avx2_gather_q_q:      Opc = X86::VPGATHERQQrm;  break;
+      case Intrinsic::x86_avx2_gather_q_q_256:  Opc = X86::VPGATHERQQYrm; break;
+      case Intrinsic::x86_avx2_gather_d_d:      Opc = X86::VPGATHERDDrm;  break;
+      case Intrinsic::x86_avx2_gather_d_d_256:  Opc = X86::VPGATHERDDYrm; break;
+      case Intrinsic::x86_avx2_gather_q_d:      Opc = X86::VPGATHERQDrm;  break;
+      case Intrinsic::x86_avx2_gather_q_d_256:  Opc = X86::VPGATHERQDYrm; break;
+      }
+      SDNode *RetVal = SelectGather(Node, Opc);
+      if (RetVal)
+        // We already called ReplaceUses inside SelectGather.
+        return NULL;
+      break;
+    }
+    }
+    break;
+  }
+  case X86ISD::GlobalBaseReg:
+    return getGlobalBaseReg();
+
+
+  case X86ISD::ATOMOR64_DAG:
+  case X86ISD::ATOMXOR64_DAG:
+  case X86ISD::ATOMADD64_DAG:
+  case X86ISD::ATOMSUB64_DAG:
+  case X86ISD::ATOMNAND64_DAG:
+  case X86ISD::ATOMAND64_DAG:
+  case X86ISD::ATOMMAX64_DAG:
+  case X86ISD::ATOMMIN64_DAG:
+  case X86ISD::ATOMUMAX64_DAG:
+  case X86ISD::ATOMUMIN64_DAG:
+  case X86ISD::ATOMSWAP64_DAG: {
+    unsigned Opc;
+    switch (Opcode) {
+    default: llvm_unreachable("Impossible opcode");
+    case X86ISD::ATOMOR64_DAG:   Opc = X86::ATOMOR6432;   break;
+    case X86ISD::ATOMXOR64_DAG:  Opc = X86::ATOMXOR6432;  break;
+    case X86ISD::ATOMADD64_DAG:  Opc = X86::ATOMADD6432;  break;
+    case X86ISD::ATOMSUB64_DAG:  Opc = X86::ATOMSUB6432;  break;
+    case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
+    case X86ISD::ATOMAND64_DAG:  Opc = X86::ATOMAND6432;  break;
+    case X86ISD::ATOMMAX64_DAG:  Opc = X86::ATOMMAX6432;  break;
+    case X86ISD::ATOMMIN64_DAG:  Opc = X86::ATOMMIN6432;  break;
+    case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break;
+    case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break;
+    case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
+    }
+    SDNode *RetVal = SelectAtomic64(Node, Opc);
+    if (RetVal)
+      return RetVal;
+    break;
+  }
+
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_ADD: {
+    SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
+    if (RetVal)
+      return RetVal;
+    break;
+  }
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    // For operations of the form (x << C1) op C2, check if we can use a smaller
+    // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse())
+      break;
+
+    // i8 is unshrinkable, i16 should be promoted to i32.
+    if (NVT != MVT::i32 && NVT != MVT::i64)
+      break;
+
+    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
+    ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (!Cst || !ShlCst)
+      break;
+
+    int64_t Val = Cst->getSExtValue();
+    uint64_t ShlVal = ShlCst->getZExtValue();
+
+    // Make sure that we don't change the operation by removing bits.
+    // This only matters for OR and XOR, AND is unaffected.
+    uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+    if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
+      break;
+
+    unsigned ShlOp, Op;
+    EVT CstVT = NVT;
+
+    // Check the minimum bitwidth for the new constant.
+    // TODO: AND32ri is the same as AND64ri32 with zext imm.
+    // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr
+    // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
+    if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal))
+      CstVT = MVT::i8;
+    else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal))
+      CstVT = MVT::i32;
+
+    // Bail if there is no smaller encoding.
+    if (NVT == CstVT)
+      break;
+
+    switch (NVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i32:
+      assert(CstVT == MVT::i8);
+      ShlOp = X86::SHL32ri;
+
+      switch (Opcode) {
+      default: llvm_unreachable("Impossible opcode");
+      case ISD::AND: Op = X86::AND32ri8; break;
+      case ISD::OR:  Op =  X86::OR32ri8; break;
+      case ISD::XOR: Op = X86::XOR32ri8; break;
+      }
+      break;
+    case MVT::i64:
+      assert(CstVT == MVT::i8 || CstVT == MVT::i32);
+      ShlOp = X86::SHL64ri;
+
+      switch (Opcode) {
+      default: llvm_unreachable("Impossible opcode");
+      case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break;
+      case ISD::OR:  Op = CstVT==MVT::i8?  X86::OR64ri8 :  X86::OR64ri32; break;
+      case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break;
+      }
+      break;
+    }
+
+    // Emit the smaller op and the shift.
+    SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, CstVT);
+    SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst);
+    return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0),
+                                getI8Imm(ShlVal));
+  }
+  case X86ISD::UMUL: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    unsigned LoReg;
+    switch (NVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i8:  LoReg = X86::AL;  Opc = X86::MUL8r; break;
+    case MVT::i16: LoReg = X86::AX;  Opc = X86::MUL16r; break;
+    case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
+    case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
+    }
+
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+                                          N0, SDValue()).getValue(1);
+
+    SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
+    SDValue Ops[] = {N1, InFlag};
+    SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops, 2);
+
+    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+    ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
+    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2));
+    return NULL;
+  }
+
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    bool isSigned = Opcode == ISD::SMUL_LOHI;
+    bool hasBMI2 = Subtarget->hasBMI2();
+    if (!isSigned) {
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
+      case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+      case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
+                     MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
+      case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
+                     MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
+      }
+    } else {
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
+      case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+      }
+    }
+
+    unsigned SrcReg, LoReg, HiReg;
+    switch (Opc) {
+    default: llvm_unreachable("Unknown MUL opcode!");
+    case X86::IMUL8r:
+    case X86::MUL8r:
+      SrcReg = LoReg = X86::AL; HiReg = X86::AH;
+      break;
+    case X86::IMUL16r:
+    case X86::MUL16r:
+      SrcReg = LoReg = X86::AX; HiReg = X86::DX;
+      break;
+    case X86::IMUL32r:
+    case X86::MUL32r:
+      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+      break;
+    case X86::IMUL64r:
+    case X86::MUL64r:
+      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+      break;
+    case X86::MULX32rr:
+      SrcReg = X86::EDX; LoReg = HiReg = 0;
+      break;
+    case X86::MULX64rr:
+      SrcReg = X86::RDX; LoReg = HiReg = 0;
+      break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    // Multiply is commmutative.
+    if (!foldedLoad) {
+      foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+      if (foldedLoad)
+        std::swap(N0, N1);
+    }
+
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
+                                          N0, SDValue()).getValue(1);
+    SDValue ResHi, ResLo;
+
+    if (foldedLoad) {
+      SDValue Chain;
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops,
+                                               array_lengthof(Ops));
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        Chain = SDValue(CNode, 2);
+        InFlag = SDValue(CNode, 3);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops,
+                                               array_lengthof(Ops));
+        Chain = SDValue(CNode, 0);
+        InFlag = SDValue(CNode, 1);
+      }
+
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), Chain);
+    } else {
+      SDValue Ops[] = { N1, InFlag };
+      if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
+        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops,
+                                               array_lengthof(Ops));
+        ResHi = SDValue(CNode, 0);
+        ResLo = SDValue(CNode, 1);
+        InFlag = SDValue(CNode, 2);
+      } else {
+        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops,
+                                               array_lengthof(Ops));
+        InFlag = SDValue(CNode, 0);
+      }
+    }
+
+    // Prevent use of AH in a REX instruction by referencing AX instead.
+    if (HiReg == X86::AH && Subtarget->is64Bit() &&
+        !SDValue(Node, 1).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                              X86::AX, MVT::i16, InFlag);
+      InFlag = Result.getValue(2);
+      // Get the low part if needed. Don't use getCopyFromReg for aliasing
+      // registers.
+      if (!SDValue(Node, 0).use_empty())
+        ReplaceUses(SDValue(Node, 1),
+          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+      // Shift AX down 8 bits.
+      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+                                              Result,
+                                     CurDAG->getTargetConstant(8, MVT::i8)), 0);
+      // Then truncate it down to i8.
+      ReplaceUses(SDValue(Node, 1),
+        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+    }
+    // Copy the low half of the result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      if (ResLo.getNode() == 0) {
+        assert(LoReg && "Register for low half is not defined!");
+        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
+                                       InFlag);
+        InFlag = ResLo.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 0), ResLo);
+      DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    // Copy the high half of the result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      if (ResHi.getNode() == 0) {
+        assert(HiReg && "Register for high half is not defined!");
+        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
+                                       InFlag);
+        InFlag = ResHi.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 1), ResHi);
+      DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+
+    return NULL;
+  }
+
+  case ISD::SDIVREM:
+  case ISD::UDIVREM: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    bool isSigned = Opcode == ISD::SDIVREM;
+    if (!isSigned) {
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
+      case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+      case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+      case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+      }
+    } else {
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
+      case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+      case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+      case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+      }
+    }
+
+    unsigned LoReg, HiReg, ClrReg;
+    unsigned ClrOpcode, SExtOpcode;
+    switch (NVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i8:
+      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
+      ClrOpcode  = 0;
+      SExtOpcode = X86::CBW;
+      break;
+    case MVT::i16:
+      LoReg = X86::AX;  HiReg = X86::DX;
+      ClrOpcode  = X86::MOV16r0; ClrReg = X86::DX;
+      SExtOpcode = X86::CWD;
+      break;
+    case MVT::i32:
+      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
+      ClrOpcode  = X86::MOV32r0;
+      SExtOpcode = X86::CDQ;
+      break;
+    case MVT::i64:
+      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
+      ClrOpcode  = X86::MOV64r0;
+      SExtOpcode = X86::CQO;
+      break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+    SDValue InFlag;
+    if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+      // Special case for div8, just use a move with zero extension to AX to
+      // clear the upper 8 bits (AH).
+      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+      if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+        Move =
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
+                                         MVT::Other, Ops,
+                                         array_lengthof(Ops)), 0);
+        Chain = Move.getValue(1);
+        ReplaceUses(N0.getValue(1), Chain);
+      } else {
+        Move =
+          SDValue(CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0),0);
+        Chain = CurDAG->getEntryNode();
+      }
+      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::EAX, Move, SDValue());
+      InFlag = Chain.getValue(1);
+    } else {
+      InFlag =
+        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+                             LoReg, N0, SDValue()).getValue(1);
+      if (isSigned && !signBitIsZero) {
+        // Sign extend the low part into the high part.
+        InFlag =
+          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
+      } else {
+        // Zero out the high part, effectively zero extending the input.
+        SDValue ClrNode =
+          SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0);
+        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
+                                      ClrNode, InFlag).getValue(1);
+      }
+    }
+
+    if (foldedLoad) {
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      SDNode *CNode =
+        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops,
+                               array_lengthof(Ops));
+      InFlag = SDValue(CNode, 1);
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+    } else {
+      InFlag =
+        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
+    }
+
+    // Prevent use of AH in a REX instruction by referencing AX instead.
+    // Shift it down 8 bits.
+    if (HiReg == X86::AH && Subtarget->is64Bit() &&
+        !SDValue(Node, 1).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                              X86::AX, MVT::i16, InFlag);
+      InFlag = Result.getValue(2);
+
+      // If we also need AL (the quotient), get it by extracting a subreg from
+      // Result. The fast register allocator does not like multiple CopyFromReg
+      // nodes using aliasing registers.
+      if (!SDValue(Node, 0).use_empty())
+        ReplaceUses(SDValue(Node, 0),
+          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+
+      // Shift AX right by 8 bits instead of using AH.
+      Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+                                         Result,
+                                         CurDAG->getTargetConstant(8, MVT::i8)),
+                       0);
+      ReplaceUses(SDValue(Node, 1),
+        CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
+    }
+    // Copy the division (low) result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                LoReg, NVT, InFlag);
+      InFlag = Result.getValue(2);
+      ReplaceUses(SDValue(Node, 0), Result);
+      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    // Copy the remainder (high) result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                              HiReg, NVT, InFlag);
+      InFlag = Result.getValue(2);
+      ReplaceUses(SDValue(Node, 1), Result);
+      DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+    }
+    return NULL;
+  }
+
+  case X86ISD::CMP:
+  case X86ISD::SUB: {
+    // Sometimes a SUB is used to perform comparison.
+    if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
+      // This node is not a CMP.
+      break;
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+    // use a smaller encoding.
+    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
+        HasNoSignedComparisonUses(Node))
+      // Look past the truncate if CMP is the only use of it.
+      N0 = N0.getOperand(0);
+    if ((N0.getNode()->getOpcode() == ISD::AND ||
+         (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
+        N0.getNode()->hasOneUse() &&
+        N0.getValueType() != MVT::i8 &&
+        X86::isZeroNode(N1)) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+      if (!C) break;
+
+      // For example, convert "testl %eax, $8" to "testb %al, $8"
+      if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
+          (!(C->getZExtValue() & 0x80) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // On x86-32, only the ABCD registers have 8-bit subregisters.
+        if (!Subtarget->is64Bit()) {
+          const TargetRegisterClass *TRC;
+          switch (N0.getValueType().getSimpleVT().SimpleTy) {
+          case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+          case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+          default: llvm_unreachable("Unsupported TEST operand type!");
+          }
+          SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+          Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+                                               Reg.getValueType(), Reg, RC), 0);
+        }
+
+        // Extract the l-register.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
+                                                        MVT::i8, Reg);
+
+        // Emit a testb.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
+      }
+
+      // For example, "testl %eax, $2048" to "testb %ah, $8".
+      if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
+          (!(C->getZExtValue() & 0x8000) ||
+           HasNoSignedComparisonUses(Node))) {
+        // Shift the immediate right by 8 bits.
+        SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
+                                                       MVT::i8);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Put the value in an ABCD register.
+        const TargetRegisterClass *TRC;
+        switch (N0.getValueType().getSimpleVT().SimpleTy) {
+        case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
+        case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+        case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+        default: llvm_unreachable("Unsupported TEST operand type!");
+        }
+        SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+        Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+                                             Reg.getValueType(), Reg, RC), 0);
+
+        // Extract the h-register.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
+                                                        MVT::i8, Reg);
+
+        // Emit a testb.  The EXTRACT_SUBREG becomes a COPY that can only
+        // target GR8_NOREX registers, so make sure the register class is
+        // forced.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
+                                                 MVT::i32, Subreg, ShiftedImm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
+      }
+
+      // For example, "testl %eax, $32776" to "testw %ax, $32776".
+      if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+          N0.getValueType() != MVT::i16 &&
+          (!(C->getZExtValue() & 0x8000) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 16-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
+                                                        MVT::i16, Reg);
+
+        // Emit a testw.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
+      }
+
+      // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+      if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+          N0.getValueType() == MVT::i64 &&
+          (!(C->getZExtValue() & 0x80000000) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 32-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
+                                                        MVT::i32, Reg);
+
+        // Emit a testl.
+        SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+                                                 Subreg, Imm);
+        // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
+        // one, do not call ReplaceAllUsesWith.
+        ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+                    SDValue(NewNode, 0));
+        return NULL;
+      }
+    }
+    break;
+  }
+  case ISD::STORE: {
+    // Change a chain of {load; incr or dec; store} of the same value into
+    // a simple increment or decrement through memory of that value, if the
+    // uses of the modified value and its address are suitable.
+    // The DEC64m tablegen pattern is currently not able to match the case where
+    // the EFLAGS on the original DEC are used. (This also applies to
+    // {INC,DEC}X{64,32,16,8}.)
+    // We'll need to improve tablegen to allow flags to be transferred from a
+    // node in the pattern to the result node.  probably with a new keyword
+    // for example, we have this
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //   (implicit EFLAGS)]>;
+    // but maybe need something like this
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //   (transferrable EFLAGS)]>;
+
+    StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+    SDValue StoredVal = StoreNode->getOperand(1);
+    unsigned Opc = StoredVal->getOpcode();
+
+    LoadSDNode *LoadNode = 0;
+    SDValue InputChain;
+    if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
+                             LoadNode, InputChain))
+      break;
+
+    SDValue Base, Scale, Index, Disp, Segment;
+    if (!SelectAddr(LoadNode, LoadNode->getBasePtr(),
+                    Base, Scale, Index, Disp, Segment))
+      break;
+
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+    MemOp[0] = StoreNode->getMemOperand();
+    MemOp[1] = LoadNode->getMemOperand();
+    const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
+    EVT LdVT = LoadNode->getMemoryVT();
+    unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
+    MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
+                                                   Node->getDebugLoc(),
+                                                   MVT::i32, MVT::Other, Ops,
+                                                   array_lengthof(Ops));
+    Result->setMemRefs(MemOp, MemOp + 2);
+
+    ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+    ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+
+    return Result;
+  }
+  }
+
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(dbgs() << "=> ";
+        if (ResNode == NULL || ResNode == Node)
+          Node->dump(CurDAG);
+        else
+          ResNode->dump(CurDAG);
+        dbgs() << '\n');
+
+  return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1, Op2, Op3, Op4;
+  switch (ConstraintCode) {
+  case 'o':   // offsetable        ??
+  case 'v':   // not offsetable    ??
+  default: return true;
+  case 'm':   // memory
+    if (!SelectAddr(0, Op, Op0, Op1, Op2, Op3, Op4))
+      return true;
+    break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  OutOps.push_back(Op2);
+  OutOps.push_back(Op3);
+  OutOps.push_back(Op4);
+  return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+                                     CodeGenOpt::Level OptLevel) {
+  return new X86DAGToDAGISel(TM, OptLevel);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 000000000000..69341869aa3e
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,18512 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86ISelLowering.h"
+#include "Utils/X86ShuffleDecode.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/VariadicFunction.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include <bitset>
+#include <cctype>
+using namespace llvm;
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+// Forward declarations.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                       SDValue V2);
+
+/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
+/// sets things up to match to an AVX VEXTRACTF128 instruction or a
+/// simple subregister reference.  Idx is an index in the 128 bits we
+/// want.  It need not be aligned to a 128-bit bounday.  That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, DebugLoc dl) {
+  EVT VT = Vec.getValueType();
+  assert(VT.is256BitVector() && "Unexpected vector size!");
+  EVT ElVT = VT.getVectorElementType();
+  unsigned Factor = VT.getSizeInBits()/128;
+  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+                                  VT.getVectorNumElements()/Factor);
+
+  // Extract from UNDEF is UNDEF.
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return DAG.getUNDEF(ResultVT);
+
+  // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
+  // we can match to VEXTRACTF128.
+  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+
+  // This is the index of the first element of the 128-bit chunk
+  // we want.
+  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+                               * ElemsPerChunk);
+
+  // If the input is a buildvector just emit a smaller one.
+  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+
+  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+  SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
+                               VecIdx);
+
+  return Result;
+}
+
+/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
+/// sets things up to match to an AVX VINSERTF128 instruction or a
+/// simple superregister reference.  Idx is an index in the 128 bits
+/// we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+                                  unsigned IdxVal, SelectionDAG &DAG,
+                                  DebugLoc dl) {
+  // Inserting UNDEF is Result
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return Result;
+
+  EVT VT = Vec.getValueType();
+  assert(VT.is128BitVector() && "Unexpected vector size!");
+
+  EVT ElVT = VT.getVectorElementType();
+  EVT ResultVT = Result.getValueType();
+
+  // Insert the relevant 128 bits.
+  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
+
+  // This is the index of the first element of the 128-bit chunk
+  // we want.
+  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
+                               * ElemsPerChunk);
+
+  SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
+                     VecIdx);
+}
+
+/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
+/// instructions. This is used because creating CONCAT_VECTOR nodes of
+/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
+/// large BUILD_VECTORS.
+static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
+                                   unsigned NumElems, SelectionDAG &DAG,
+                                   DebugLoc dl) {
+  SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+  return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
+static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  bool is64Bit = Subtarget->is64Bit();
+
+  if (Subtarget->isTargetEnvMacho()) {
+    if (is64Bit)
+      return new X86_64MachoTargetObjectFile();
+    return new TargetLoweringObjectFileMachO();
+  }
+
+  if (Subtarget->isTargetLinux())
+    return new X86LinuxTargetObjectFile();
+  if (Subtarget->isTargetELF())
+    return new TargetLoweringObjectFileELF();
+  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+    return new TargetLoweringObjectFileCOFF();
+  llvm_unreachable("unknown subtarget type");
+}
+
+X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
+  : TargetLowering(TM, createTLOF(TM)) {
+  Subtarget = &TM.getSubtarget<X86Subtarget>();
+  X86ScalarSSEf64 = Subtarget->hasSSE2();
+  X86ScalarSSEf32 = Subtarget->hasSSE1();
+
+  RegInfo = TM.getRegisterInfo();
+  TD = getDataLayout();
+
+  // Set up the TargetLowering object.
+  static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
+
+  // X86 is weird, it always uses i8 for shift amounts and setcc results.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+  // For 64-bit since we have so many registers use the ILP scheduler, for
+  // 32-bit code use the register pressure specific scheduling.
+  // For Atom, always use ILP scheduling.
+  if (Subtarget->isAtom())
+    setSchedulingPreference(Sched::ILP);
+  else if (Subtarget->is64Bit())
+    setSchedulingPreference(Sched::ILP);
+  else
+    setSchedulingPreference(Sched::RegPressure);
+  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+  // Bypass expensive divides on Atom when compiling with O2
+  if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
+    addBypassSlowDiv(32, 8);
+    if (Subtarget->is64Bit())
+      addBypassSlowDiv(64, 16);
+  }
+
+  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
+    // Setup Windows compiler runtime calls.
+    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
+    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
+    setLibcallName(RTLIB::SREM_I64, "_allrem");
+    setLibcallName(RTLIB::UREM_I64, "_aullrem");
+    setLibcallName(RTLIB::MUL_I64, "_allmul");
+    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
+    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
+
+    // The _ftol2 runtime function has an unusual calling conv, which
+    // is modeled by a special pseudo-instruction.
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
+  }
+
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+    setUseUnderscoreSetJmp(false);
+    setUseUnderscoreLongJmp(false);
+  } else if (Subtarget->isTargetMingw()) {
+    // MS runtime is weird: it exports _setjmp, but longjmp!
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(false);
+  } else {
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(true);
+  }
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i8, &X86::GR8RegClass);
+  addRegisterClass(MVT::i16, &X86::GR16RegClass);
+  addRegisterClass(MVT::i32, &X86::GR32RegClass);
+  if (Subtarget->is64Bit())
+    addRegisterClass(MVT::i64, &X86::GR64RegClass);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  // We don't accept any truncstore of integer registers.
+  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
+
+  // SETOEQ and SETUNE require checking two conditions.
+  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+  // operation.
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
+  } else if (!TM.Options.UseSoftFloat) {
+    // We have an algorithm for SSE2->double, and we turn this into a
+    // 64-bit FILD followed by conditional FADD for other targets.
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
+    // We have an algorithm for SSE2, and we turn this into a 64-bit
+    // FILD for other targets.
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
+  }
+
+  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
+
+  if (!TM.Options.UseSoftFloat) {
+    // SSE has no i16 to fp conversion, only i32
+    if (X86ScalarSSEf32) {
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+      // f32 and f64 cases are Legal, f80 case is not
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+    } else {
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+    }
+  } else {
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
+  }
+
+  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
+  // are Legal, f80 is custom lowered.
+  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
+  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
+
+  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
+
+  if (X86ScalarSSEf32) {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+    // f32 and f64 cases are Legal, f80 case is not
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  } else {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  }
+
+  // Handle FP_TO_UINT by promoting the destination to a larger signed
+  // conversion.
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
+  } else if (!TM.Options.UseSoftFloat) {
+    // Since AVX is a superset of SSE3, only check for SSE here.
+    if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
+      // Expand FP_TO_UINT into a select.
+      // FIXME: We would like to use a Custom expander here eventually to do
+      // the optimal thing for SSE vs. the default expansion in the legalizer.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
+    else
+      // With SSE3 we can use fisttpll to convert to a signed i64; without
+      // SSE, we're stuck with a fistpll.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
+  }
+
+  if (isTargetFTOL()) {
+    // Use the _ftol2 runtime function, which has a pseudo-instruction
+    // to handle its weird calling convention.
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
+  }
+
+  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+  if (!X86ScalarSSEf64) {
+    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
+    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
+      // Without SSE, i64->f64 goes through memory.
+      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
+    }
+  }
+
+  // Scalar integer divide and remainder are lowered to use operations that
+  // produce two results, to match the available instructions. This exposes
+  // the two-result form to trivial CSE, which is able to combine x/y and x%y
+  // into a single instruction.
+  //
+  // Scalar integer multiply-high is also lowered to use two-result
+  // operations, to match the available instructions. However, plain multiply
+  // (low) operations are left as Legal, as there are single-result
+  // instructions for this in x86. Using the two-result multiply instructions
+  // when both high and low results are needed must be arranged by dagcombine.
+  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
+    MVT VT = IntVTs[i];
+    setOperationAction(ISD::MULHS, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+
+    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
+    setOperationAction(ISD::ADDC, VT, Custom);
+    setOperationAction(ISD::ADDE, VT, Custom);
+    setOperationAction(ISD::SUBC, VT, Custom);
+    setOperationAction(ISD::SUBE, VT, Custom);
+  }
+
+  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
+  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
+  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
+
+  // Promote the i8 variants and force them on up to i32 which has a shorter
+  // encoding.
+  setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
+  AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
+  AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
+  if (Subtarget->hasBMI()) {
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  } else {
+    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
+  }
+
+  if (Subtarget->hasLZCNT()) {
+    // When promoting the i8 variants, force them to i32 for a shorter
+    // encoding.
+    setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
+    AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
+    AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+  } else {
+    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
+      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+    }
+  }
+
+  if (Subtarget->hasPOPCNT()) {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
+  } else {
+    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
+    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
+    if (Subtarget->is64Bit())
+      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
+  }
+
+  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
+
+  // These should be promoted to a larger select which is supported.
+  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
+  // X86 wants to expand cmov itself.
+  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
+  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
+    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
+  }
+  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+  // support continuation, user-level threading, and etc.. As a result, no
+  // other SjLj exception interfaces are implemented and please don't build
+  // your own exception handling based on them.
+  // LLVM/Clang supports zero-cost DWARF exception handling.
+  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
+
+  // Darwin ABI issue.
+  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
+  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
+  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
+    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
+    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
+    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
+    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
+  }
+  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
+    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
+    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
+  }
+
+  if (Subtarget->hasSSE1())
+    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
+
+  setOperationAction(ISD::MEMBARRIER    , MVT::Other, Custom);
+  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
+
+  // On X86 and X86-64, atomic operations are lowered to locked instructions.
+  // Locked instructions, in turn, have implicit fence semantics (all memory
+  // operations are flushed before issuing the locked instruction, and they
+  // are not buffered), so we can fold away the common pattern of
+  // fence-atomic-fence.
+  setShouldFoldAtomicFences(true);
+
+  // Expand certain atomics
+  for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
+    MVT VT = IntVTs[i];
+    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
+  }
+
+  if (!Subtarget->is64Bit()) {
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
+  }
+
+  if (Subtarget->hasCmpxchg16b()) {
+    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+  }
+
+  // FIXME - use subtarget debug flags
+  if (!Subtarget->isTargetDarwin() &&
+      !Subtarget->isTargetELF() &&
+      !Subtarget->isTargetCygMing()) {
+    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  }
+
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+  if (Subtarget->is64Bit()) {
+    setExceptionPointerRegister(X86::RAX);
+    setExceptionSelectorRegister(X86::RDX);
+  } else {
+    setExceptionPointerRegister(X86::EAX);
+    setExceptionSelectorRegister(X86::EDX);
+  }
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
+    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
+  } else {
+    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
+    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
+  }
+
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+
+  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+                       MVT::i64 : MVT::i32, Custom);
+  else if (TM.Options.EnableSegmentedStacks)
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+                       MVT::i64 : MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+                       MVT::i64 : MVT::i32, Expand);
+
+  if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
+    // f32 and f64 use SSE.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, &X86::FR32RegClass);
+    addRegisterClass(MVT::f64, &X86::FR64RegClass);
+
+    // Use ANDPD to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f64, Custom);
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f64, Custom);
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    // Use ANDPD and ORPD to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // Lower this to FGETSIGNx86 plus an AND.
+    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
+    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
+    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+    // Expand FP immediates into loads from the stack, except for the special
+    // cases we handle.
+    addLegalFPImmediate(APFloat(+0.0)); // xorpd
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
+  } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
+    // Use SSE for f32, x87 for f64.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, &X86::FR32RegClass);
+    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+
+    // Use ANDPS to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+
+    // Use ANDPS and ORPS to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+
+    // Special cases we handle for FP constants.
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
+    addLegalFPImmediate(APFloat(+0.0)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+    if (!TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    }
+  } else if (!TM.Options.UseSoftFloat) {
+    // f32 and f64 in x87.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+    if (!TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
+      setOperationAction(ISD::FSIN   , MVT::f32, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f32, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
+    }
+    addLegalFPImmediate(APFloat(+0.0)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+  }
+
+  // We don't support FMA.
+  setOperationAction(ISD::FMA, MVT::f64, Expand);
+  setOperationAction(ISD::FMA, MVT::f32, Expand);
+
+  // Long double always uses X87.
+  if (!TM.Options.UseSoftFloat) {
+    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
+    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+    {
+      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
+      addLegalFPImmediate(TmpFlt);  // FLD0
+      TmpFlt.changeSign();
+      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
+
+      bool ignored;
+      APFloat TmpFlt2(+1.0);
+      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+                      &ignored);
+      addLegalFPImmediate(TmpFlt2);  // FLD1
+      TmpFlt2.changeSign();
+      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
+    }
+
+    if (!TM.Options.UnsafeFPMath) {
+      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
+      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
+    }
+
+    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
+    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
+    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
+    setOperationAction(ISD::FMA, MVT::f80, Expand);
+  }
+
+  // Always use a library call for pow.
+  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
+  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
+  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
+
+  setOperationAction(ISD::FLOG, MVT::f80, Expand);
+  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+  setOperationAction(ISD::FEXP, MVT::f80, Expand);
+  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+
+  // First set operation action for all vector types to either promote
+  // (for widening) or expand (for scalarization). Then we will selectively
+  // turn on ones that can be effectively codegen'd.
+  for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+           i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+    MVT VT = (MVT::SimpleValueType)i;
+    setOperationAction(ISD::ADD , VT, Expand);
+    setOperationAction(ISD::SUB , VT, Expand);
+    setOperationAction(ISD::FADD, VT, Expand);
+    setOperationAction(ISD::FNEG, VT, Expand);
+    setOperationAction(ISD::FSUB, VT, Expand);
+    setOperationAction(ISD::MUL , VT, Expand);
+    setOperationAction(ISD::FMUL, VT, Expand);
+    setOperationAction(ISD::SDIV, VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::LOAD, VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
+    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
+    setOperationAction(ISD::FABS, VT, Expand);
+    setOperationAction(ISD::FSIN, VT, Expand);
+    setOperationAction(ISD::FSINCOS, VT, Expand);
+    setOperationAction(ISD::FCOS, VT, Expand);
+    setOperationAction(ISD::FSINCOS, VT, Expand);
+    setOperationAction(ISD::FREM, VT, Expand);
+    setOperationAction(ISD::FMA,  VT, Expand);
+    setOperationAction(ISD::FPOWI, VT, Expand);
+    setOperationAction(ISD::FSQRT, VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+    setOperationAction(ISD::FCEIL, VT, Expand);
+    setOperationAction(ISD::FTRUNC, VT, Expand);
+    setOperationAction(ISD::FRINT, VT, Expand);
+    setOperationAction(ISD::FNEARBYINT, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+    setOperationAction(ISD::FPOW, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+    setOperationAction(ISD::SHL, VT, Expand);
+    setOperationAction(ISD::SRA, VT, Expand);
+    setOperationAction(ISD::SRL, VT, Expand);
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
+    setOperationAction(ISD::SETCC, VT, Expand);
+    setOperationAction(ISD::FLOG, VT, Expand);
+    setOperationAction(ISD::FLOG2, VT, Expand);
+    setOperationAction(ISD::FLOG10, VT, Expand);
+    setOperationAction(ISD::FEXP, VT, Expand);
+    setOperationAction(ISD::FEXP2, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
+    setOperationAction(ISD::TRUNCATE, VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
+    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
+    setOperationAction(ISD::VSELECT, VT, Expand);
+    for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
+             InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+      setTruncStoreAction(VT,
+                          (MVT::SimpleValueType)InnerVT, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+    setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+  }
+
+  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
+  // with -msoft-float, disable use of MMX as well.
+  if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
+    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
+    // No operations on x86mmx supported, everything uses intrinsics.
+  }
+
+  // MMX-sized vectors (other than x86mmx) are expected to be expanded
+  // into smaller operations.
+  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
+  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
+  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
+  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
+  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
+  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
+  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
+  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
+  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
+  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
+  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
+  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
+  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
+  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
+  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
+  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
+  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
+  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
+  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
+  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
+  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
+  setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
+    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
+
+    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
+    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+  }
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
+    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
+
+    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
+    // registers cannot be used even for integer operations.
+    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
+    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
+    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
+    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
+
+    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
+    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
+    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
+    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
+    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
+
+    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+      // Do not attempt to custom lower non-power-of-2 vectors
+      if (!isPowerOf2_32(VT.getVectorNumElements()))
+        continue;
+      // Do not attempt to custom lower non-128-bit vectors
+      if (!VT.is128BitVector())
+        continue;
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    }
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+    }
+
+    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+    for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Do not attempt to promote non-128-bit vectors
+      if (!VT.is128BitVector())
+        continue;
+
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
+    }
+
+    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+    // Custom lower v2i64 and v2f64 selects.
+    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
+    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
+
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
+    // As there is no 64-bit GPR available, we need build a special custom
+    // sequence to convert from v2i32 to v2f32.
+    if (!Subtarget->is64Bit())
+      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
+
+    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
+    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
+  }
+
+  if (Subtarget->hasSSE41()) {
+    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
+    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
+    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
+    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
+    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
+
+    setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
+
+    // FIXME: Do we need to handle scalar-to-vector here?
+    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
+
+    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
+
+    // i8 and i16 vectors are custom , because the source register and source
+    // source memory operand types are not the same width.  f32 vectors are
+    // custom since the immediate controlling the insert encodes additional
+    // information.
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+    // FIXME: these should be Legal but thats only for the case where
+    // the index is constant.  For now custom expand to deal with that.
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+    }
+  }
+
+  if (Subtarget->hasSSE2()) {
+    setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
+    setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
+    setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
+    setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
+
+    // In the customized shift lowering, the legal cases in AVX2 will be
+    // recognized.
+    setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
+    setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
+    setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
+
+    setOperationAction(ISD::SDIV,              MVT::v8i16, Custom);
+    setOperationAction(ISD::SDIV,              MVT::v4i32, Custom);
+  }
+
+  if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
+    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
+    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
+    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
+
+    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
+
+    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
+
+    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
+    setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
+    setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
+
+    setOperationAction(ISD::TRUNCATE,           MVT::v8i16, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v4i32, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
+
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
+
+    setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
+    setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
+
+    setOperationAction(ISD::SDIV,              MVT::v16i16, Custom);
+
+    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
+    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
+
+    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
+    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
+
+    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
+    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
+    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
+    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
+
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
+
+    if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
+      setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
+      setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
+      setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
+      setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
+      setOperationAction(ISD::FMA,             MVT::f32, Legal);
+      setOperationAction(ISD::FMA,             MVT::f64, Legal);
+    }
+
+    if (Subtarget->hasInt256()) {
+      setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
+      setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
+      setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
+      setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
+
+      setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
+      setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
+      setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
+      setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
+
+      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
+      setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
+      setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
+      // Don't lower v32i8 because there is no 128-bit byte mul
+
+      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
+
+      setOperationAction(ISD::SDIV,            MVT::v8i32, Custom);
+    } else {
+      setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
+      setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
+      setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
+      setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
+
+      setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
+      setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
+      setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
+      setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
+
+      setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
+      setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
+      setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
+      // Don't lower v32i8 because there is no 128-bit byte mul
+    }
+
+    // In the customized shift lowering, the legal cases in AVX2 will be
+    // recognized.
+    setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
+    setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
+
+    setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
+    setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
+
+    setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
+
+    // Custom lower several nodes for 256-bit types.
+    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Extract subvector is special because the value type
+      // (result) is 128-bit but the source is 256-bit wide.
+      if (VT.is128BitVector())
+        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+      // Do not attempt to custom lower other non-256-bit vectors
+      if (!VT.is256BitVector())
+        continue;
+
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
+      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
+    }
+
+    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
+    for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Do not attempt to promote non-256-bit vectors
+      if (!VT.is256BitVector())
+        continue;
+
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
+    }
+  }
+
+  // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
+  // of this type with custom code.
+  for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
+           VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+                       Custom);
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
+  // handle type legalization for these operations here.
+  //
+  // FIXME: We really should do custom legalization for addition and
+  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
+  // than generic legalization for 64-bit multiplication-with-overflow, though.
+  for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
+    // Add/Sub/Mul with overflow operations are custom lowered.
+    MVT VT = IntVTs[i];
+    setOperationAction(ISD::SADDO, VT, Custom);
+    setOperationAction(ISD::UADDO, VT, Custom);
+    setOperationAction(ISD::SSUBO, VT, Custom);
+    setOperationAction(ISD::USUBO, VT, Custom);
+    setOperationAction(ISD::SMULO, VT, Custom);
+    setOperationAction(ISD::UMULO, VT, Custom);
+  }
+
+  // There are no 8-bit 3-address imul/mul instructions
+  setOperationAction(ISD::SMULO, MVT::i8, Expand);
+  setOperationAction(ISD::UMULO, MVT::i8, Expand);
+
+  if (!Subtarget->is64Bit()) {
+    // These libcalls are not available in 32-bit.
+    setLibcallName(RTLIB::SHL_I128, 0);
+    setLibcallName(RTLIB::SRL_I128, 0);
+    setLibcallName(RTLIB::SRA_I128, 0);
+  }
+
+  // Combine sin / cos into one node or libcall if possible.
+  if (Subtarget->hasSinCos()) {
+    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+    setLibcallName(RTLIB::SINCOS_F64, "sincos");
+    if (Subtarget->isTargetDarwin()) {
+      // For MacOSX, we don't want to the normal expansion of a libcall to
+      // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+      // traffic.
+      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+    }
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::VSELECT);
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SRA);
+  setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::FADD);
+  setTargetDAGCombine(ISD::FSUB);
+  setTargetDAGCombine(ISD::FMA);
+  setTargetDAGCombine(ISD::SUB);
+  setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+  setTargetDAGCombine(ISD::TRUNCATE);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::SETCC);
+  if (Subtarget->is64Bit())
+    setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::XOR);
+
+  computeRegisterProperties();
+
+  // On Darwin, -Os means optimize for size without hurting performance,
+  // do not reduce the limit.
+  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+  MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
+  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+  MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+  MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+  setPrefLoopAlignment(4); // 2^4 bytes.
+
+  // Predictable cmov don't hurt on atom because it's in-order.
+  PredictableSelectIsExpensive = !Subtarget->isAtom();
+
+  setPrefFunctionAlignment(4); // 2^4 bytes.
+}
+
+EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
+  if (!VT.isVector()) return MVT::i8;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+  if (MaxAlign == 16)
+    return;
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (VTy->getBitWidth() == 128)
+      MaxAlign = 16;
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    unsigned EltAlign = 0;
+    getMaxByValAlign(ATy->getElementType(), EltAlign);
+    if (EltAlign > MaxAlign)
+      MaxAlign = EltAlign;
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      unsigned EltAlign = 0;
+      getMaxByValAlign(STy->getElementType(i), EltAlign);
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+      if (MaxAlign == 16)
+        break;
+    }
+  }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
+  if (Subtarget->is64Bit()) {
+    // Max of 8 and alignment of type.
+    unsigned TyAlign = TD->getABITypeAlignment(Ty);
+    if (TyAlign > 8)
+      return TyAlign;
+    return 8;
+  }
+
+  unsigned Align = 4;
+  if (Subtarget->hasSSE1())
+    getMaxByValAlign(Ty, Align);
+  return Align;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. If DstAlign is zero that means it's safe to destination
+/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+/// means there isn't a need to check it against alignment requirement,
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
+/// It returns EVT::Other if the type should be determined using generic
+/// target-independent logic.
+EVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size,
+                                       unsigned DstAlign, unsigned SrcAlign,
+                                       bool IsMemset, bool ZeroMemset,
+                                       bool MemcpyStrSrc,
+                                       MachineFunction &MF) const {
+  const Function *F = MF.getFunction();
+  if ((!IsMemset || ZeroMemset) &&
+      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::NoImplicitFloat)) {
+    if (Size >= 16 &&
+        (Subtarget->isUnalignedMemAccessFast() ||
+         ((DstAlign == 0 || DstAlign >= 16) &&
+          (SrcAlign == 0 || SrcAlign >= 16)))) {
+      if (Size >= 32) {
+        if (Subtarget->hasInt256())
+          return MVT::v8i32;
+        if (Subtarget->hasFp256())
+          return MVT::v8f32;
+      }
+      if (Subtarget->hasSSE2())
+        return MVT::v4i32;
+      if (Subtarget->hasSSE1())
+        return MVT::v4f32;
+    } else if (!MemcpyStrSrc && Size >= 8 &&
+               !Subtarget->is64Bit() &&
+               Subtarget->hasSSE2()) {
+      // Do not use f64 to lower memcpy if source is string constant. It's
+      // better to use i32 to avoid the loads.
+      return MVT::f64;
+    }
+  }
+  if (Subtarget->is64Bit() && Size >= 8)
+    return MVT::i64;
+  return MVT::i32;
+}
+
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+  if (VT == MVT::f32)
+    return X86ScalarSSEf32;
+  else if (VT == MVT::f64)
+    return X86ScalarSSEf64;
+  return true;
+}
+
+bool
+X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
+  if (Fast)
+    *Fast = Subtarget->isUnalignedMemAccessFast();
+  return true;
+}
+
+/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// current function.  The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+  // symbol.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT())
+    return MachineJumpTableInfo::EK_Custom32;
+
+  // Otherwise, use the normal jump table encoding heuristics.
+  return TargetLowering::getJumpTableEncoding();
+}
+
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                             const MachineBasicBlock *MBB,
+                                             unsigned uid,MCContext &Ctx) const{
+  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+         Subtarget->isPICStyleGOT());
+  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+  // entries.
+  return MCSymbolRefExpr::Create(MBB->getSymbol(),
+                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
+}
+
+/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
+/// jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                    SelectionDAG &DAG) const {
+  if (!Subtarget->is64Bit())
+    // This doesn't have DebugLoc associated with it, but is not really the
+    // same as a Register.
+    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
+  return Table;
+}
+
+/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+/// MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+                             MCContext &Ctx) const {
+  // X86-64 uses RIP relative addressing based on the jump table label.
+  if (Subtarget->isPICStyleRIPRel())
+    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+  // Otherwise, the reference is relative to the PIC base.
+  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
+}
+
+// FIXME: Why this routine is here? Move to RegInfo!
+std::pair<const TargetRegisterClass*, uint8_t>
+X86TargetLowering::findRepresentativeClass(MVT VT) const{
+  const TargetRegisterClass *RRC = 0;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(VT);
+  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
+    RRC = Subtarget->is64Bit() ?
+      (const TargetRegisterClass*)&X86::GR64RegClass :
+      (const TargetRegisterClass*)&X86::GR32RegClass;
+    break;
+  case MVT::x86mmx:
+    RRC = &X86::VR64RegClass;
+    break;
+  case MVT::f32: case MVT::f64:
+  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
+  case MVT::v4f32: case MVT::v2f64:
+  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
+  case MVT::v4f64:
+    RRC = &X86::VR128RegClass;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
+bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
+                                               unsigned &Offset) const {
+  if (!Subtarget->isTargetLinux())
+    return false;
+
+  if (Subtarget->is64Bit()) {
+    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
+    Offset = 0x28;
+    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
+      AddressSpace = 256;
+    else
+      AddressSpace = 257;
+  } else {
+    // %gs:0x14 on i386
+    Offset = 0x14;
+    AddressSpace = 256;
+  }
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+bool
+X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                  MachineFunction &MF, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+                 RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_X86);
+}
+
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<SDValue> &OutVals,
+                               DebugLoc dl, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+  SDValue Flag;
+  SmallVector<SDValue, 6> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  // Operand #1 = Bytes To Pop
+  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
+                   MVT::i16));
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue ValToCopy = OutVals[i];
+    EVT ValVT = ValToCopy.getValueType();
+
+    // Promote values to the appropriate types
+    if (VA.getLocInfo() == CCValAssign::SExt)
+      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::ZExt)
+      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::AExt)
+      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
+    else if (VA.getLocInfo() == CCValAssign::BCvt)
+      ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values,
+    // or SSE or MMX vectors.
+    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
+         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
+          (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
+    }
+    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
+    // llvm-gcc has never done it right and no one has noticed, so this
+    // should be OK for now.
+    if (ValVT == MVT::f64 &&
+        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
+      report_fatal_error("SSE2 register return with SSE2 disabled");
+
+    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+    // the RET instruction and handled by the FP Stackifier.
+    if (VA.getLocReg() == X86::ST0 ||
+        VA.getLocReg() == X86::ST1) {
+      // If this is a copy from an xmm register to ST(0), use an FPExtend to
+      // change the value to the FP stack register class.
+      if (isScalarFPTypeInSSEReg(VA.getValVT()))
+        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+      RetOps.push_back(ValToCopy);
+      // Don't emit a copytoreg.
+      continue;
+    }
+
+    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+    // which is returned in RAX / RDX.
+    if (Subtarget->is64Bit()) {
+      if (ValVT == MVT::x86mmx) {
+        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+          ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
+          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                                  ValToCopy);
+          // If we don't have SSE2 available, convert to v4f32 so the generated
+          // register is legal.
+          if (!Subtarget->hasSSE2())
+            ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
+        }
+      }
+    }
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // The x86-64 ABIs require that for returning structs by value we copy
+  // the sret argument into %rax/%eax (depending on ABI) for the return.
+  // Win32 requires us to put the sret argument to %eax as well.
+  // We saved the argument into a virtual register in the entry block,
+  // so now we copy the value out and into %rax/%eax.
+  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
+      (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned Reg = FuncInfo->getSRetReturnReg();
+    assert(Reg &&
+           "SRetReturnReg should have been set in LowerFormalArguments().");
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+
+    unsigned RetValReg
+        = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
+          X86::RAX : X86::EAX;
+    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
+    Flag = Chain.getValue(1);
+
+    // RAX/EAX now acts like a return value.
+    RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(X86ISD::RET_FLAG, dl,
+                     MVT::Other, &RetOps[0], RetOps.size());
+}
+
+bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
+  if (N->getNumValues() != 1)
+    return false;
+  if (!N->hasNUsesOfValue(1, 0))
+    return false;
+
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
+      return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+    return false;
+
+  bool HasRet = false;
+  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
+       UI != UE; ++UI) {
+    if (UI->getOpcode() != X86ISD::RET_FLAG)
+      return false;
+    HasRet = true;
+  }
+
+  if (!HasRet)
+    return false;
+
+  Chain = TCChain;
+  return true;
+}
+
+MVT
+X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
+                                            ISD::NodeType ExtendKind) const {
+  MVT ReturnMVT;
+  // TODO: Is this also valid on 32-bit?
+  if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
+    ReturnMVT = MVT::i8;
+  else
+    ReturnMVT = MVT::i32;
+
+  MVT MinVT = getRegisterType(ReturnMVT);
+  return VT.bitsLT(MinVT) ? MinVT : VT;
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   DebugLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  bool Is64Bit = Subtarget->is64Bit();
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    EVT CopyVT = VA.getValVT();
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+      report_fatal_error("SSE register return with SSE disabled");
+    }
+
+    SDValue Val;
+
+    // If this is a call to a function that returns an fp value on the floating
+    // point stack, we must guarantee the value is popped from the stack, so
+    // a CopyFromReg is not good enough - the copy instruction may be eliminated
+    // if the return value is not used. We use the FpPOP_RETVAL instruction
+    // instead.
+    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
+      // If we prefer to use the value in xmm registers, copy it out as f80 and
+      // use a truncate to move it from fp stack reg to xmm reg.
+      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
+      SDValue Ops[] = { Chain, InFlag };
+      Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT,
+                                         MVT::Other, MVT::Glue, Ops, 2), 1);
+      Val = Chain.getValue(0);
+
+      // Round the f80 to the right size, which also moves it to the appropriate
+      // xmm register.
+      if (CopyVT != VA.getValVT())
+        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                          // This truncation won't change the value.
+                          DAG.getIntPtrConstant(1));
+    } else {
+      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                                 CopyVT, InFlag).getValue(1);
+      Val = Chain.getValue(0);
+    }
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//                C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//  StdCall calling convention seems to be standard for many Windows' API
+//  routines and around. It differs from C calling convention just a little:
+//  callee should clean up the stack, not caller. Symbols should be also
+//  decorated in some fancy way :) It doesn't support any vector arguments.
+//  For info on fast calling convention see Fast Calling Convention (tail call)
+//  implementation LowerX86_32FastCCCallTo.
+
+/// CallIsStructReturn - Determines whether a call uses struct return
+/// semantics.
+enum StructReturnType {
+  NotStructReturn,
+  RegStructReturn,
+  StackStructReturn
+};
+static StructReturnType
+callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  if (Outs.empty())
+    return NotStructReturn;
+
+  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
+  if (!Flags.isSRet())
+    return NotStructReturn;
+  if (Flags.isInReg())
+    return RegStructReturn;
+  return StackStructReturn;
+}
+
+/// ArgsAreStructReturn - Determines whether a function uses struct
+/// return semantics.
+static StructReturnType
+argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+  if (Ins.empty())
+    return NotStructReturn;
+
+  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
+  if (!Flags.isSRet())
+    return NotStructReturn;
+  if (Flags.isInReg())
+    return RegStructReturn;
+  return StackStructReturn;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" with size and alignment information specified by
+/// the specific parameter attribute. The copy will be passed as a byval
+/// function parameter.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                          DebugLoc dl) {
+  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       /*isVolatile*/false, /*AlwaysInline=*/true,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
+/// IsTailCallConvention - Return true if the calling convention is one that
+/// supports tail call optimization.
+static bool IsTailCallConvention(CallingConv::ID CC) {
+  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
+          CC == CallingConv::HiPE);
+}
+
+bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
+    return false;
+
+  CallSite CS(CI);
+  CallingConv::ID CalleeCC = CS.getCallingConv();
+  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
+    return false;
+
+  return true;
+}
+
+/// FuncIsMadeTailCallSafe - Return true if the function is being made into
+/// a tailcall target by changing its ABI.
+static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
+                                   bool GuaranteedTailCallOpt) {
+  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
+}
+
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain,
+                                    CallingConv::ID CallConv,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    MachineFrameInfo *MFI,
+                                    unsigned i) const {
+  // Create the nodes corresponding to a load from this parameter slot.
+  ISD::ArgFlagsTy Flags = Ins[i].Flags;
+  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv,
+                              getTargetMachine().Options.GuaranteedTailCallOpt);
+  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+  EVT ValVT;
+
+  // If value is passed by pointer we have address passed instead of the value
+  // itself.
+  if (VA.getLocInfo() == CCValAssign::Indirect)
+    ValVT = VA.getLocVT();
+  else
+    ValVT = VA.getValVT();
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can be
+  // changed with more analysis.
+  // In case of tail call optimization mark all arguments mutable. Since they
+  // could be overwritten by lowering of arguments in case of a tail call.
+  if (Flags.isByVal()) {
+    unsigned Bytes = Flags.getByValSize();
+    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
+    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
+    return DAG.getFrameIndex(FI, getPointerTy());
+  } else {
+    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+                                    VA.getLocMemOffset(), isImmutable);
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+    return DAG.getLoad(ValVT, dl, Chain, FIN,
+                       MachinePointerInfo::getFixedStack(FI),
+                       false, false, false, 0);
+  }
+}
+
+SDValue
+X86TargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv,
+                                        bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                        DebugLoc dl,
+                                        SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals)
+                                          const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  const Function* Fn = MF.getFunction();
+  if (Fn->hasExternalLinkage() &&
+      Subtarget->isTargetCygMing() &&
+      Fn->getName() == "main")
+    FuncInfo->setForceFramePointer(true);
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool Is64Bit = Subtarget->is64Bit();
+  bool IsWindows = Subtarget->isTargetWindows();
+  bool IsWin64 = Subtarget->isTargetWin64();
+
+  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+         "Var args not supported with calling convention fastcc, ghc or hipe");
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+
+  // Allocate shadow area for Win64
+  if (IsWin64) {
+    CCInfo.AllocateStack(32, 8);
+  }
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+
+  unsigned LastVal = ~0U;
+  SDValue ArgValue;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    (void)LastVal;
+    LastVal = VA.getValNo();
+
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+      const TargetRegisterClass *RC;
+      if (RegVT == MVT::i32)
+        RC = &X86::GR32RegClass;
+      else if (Is64Bit && RegVT == MVT::i64)
+        RC = &X86::GR64RegClass;
+      else if (RegVT == MVT::f32)
+        RC = &X86::FR32RegClass;
+      else if (RegVT == MVT::f64)
+        RC = &X86::FR64RegClass;
+      else if (RegVT.is256BitVector())
+        RC = &X86::VR256RegClass;
+      else if (RegVT.is128BitVector())
+        RC = &X86::VR128RegClass;
+      else if (RegVT == MVT::x86mmx)
+        RC = &X86::VR64RegClass;
+      else
+        llvm_unreachable("Unknown argument type!");
+
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::BCvt)
+        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
+
+      if (VA.isExtInLoc()) {
+        // Handle MMX values passed in XMM regs.
+        if (RegVT.isVector())
+          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
+        else
+          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+      }
+    } else {
+      assert(VA.isMemLoc());
+      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
+    }
+
+    // If value is passed via pointer - do a load.
+    if (VA.getLocInfo() == CCValAssign::Indirect)
+      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
+                             MachinePointerInfo(), false, false, false, 0);
+
+    InVals.push_back(ArgValue);
+  }
+
+  // The x86-64 ABIs require that for returning structs by value we copy
+  // the sret argument into %rax/%eax (depending on ABI) for the return.
+  // Win32 requires us to put the sret argument to %eax as well.
+  // Save the argument into a virtual register so that we can access it
+  // from the return points.
+  if (MF.getFunction()->hasStructRetAttr() &&
+      (Subtarget->is64Bit() || Subtarget->isTargetWindows())) {
+    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned Reg = FuncInfo->getSRetReturnReg();
+    if (!Reg) {
+      MVT PtrTy = getPointerTy();
+      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+      FuncInfo->setSRetReturnReg(Reg);
+    }
+    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+  }
+
+  unsigned StackSize = CCInfo.getNextStackOffset();
+  // Align stack specially for tail calls.
+  if (FuncIsMadeTailCallSafe(CallConv,
+                             MF.getTarget().Options.GuaranteedTailCallOpt))
+    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
+                    CallConv != CallingConv::X86_ThisCall)) {
+      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
+    }
+    if (Is64Bit) {
+      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
+
+      // FIXME: We should really autogenerate these arrays
+      static const uint16_t GPR64ArgRegsWin64[] = {
+        X86::RCX, X86::RDX, X86::R8,  X86::R9
+      };
+      static const uint16_t GPR64ArgRegs64Bit[] = {
+        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+      };
+      static const uint16_t XMMArgRegs64Bit[] = {
+        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+      };
+      const uint16_t *GPR64ArgRegs;
+      unsigned NumXMMRegs = 0;
+
+      if (IsWin64) {
+        // The XMM registers which might contain var arg parameters are shadowed
+        // in their paired GPR.  So we only need to save the GPR to their home
+        // slots.
+        TotalNumIntRegs = 4;
+        GPR64ArgRegs = GPR64ArgRegsWin64;
+      } else {
+        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
+        GPR64ArgRegs = GPR64ArgRegs64Bit;
+
+        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
+                                                TotalNumXMMRegs);
+      }
+      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
+                                                       TotalNumIntRegs);
+
+      bool NoImplicitFloatOps = Fn->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+             "SSE register cannot be used when SSE is disabled!");
+      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
+               NoImplicitFloatOps) &&
+             "SSE register cannot be used when SSE is disabled!");
+      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
+          !Subtarget->hasSSE1())
+        // Kernel mode asks for SSE to be disabled, so don't push them
+        // on the stack.
+        TotalNumXMMRegs = 0;
+
+      if (IsWin64) {
+        const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+        // Get to the caller-allocated home save location.  Add 8 to account
+        // for the return address.
+        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
+        FuncInfo->setRegSaveFrameIndex(
+          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+        // Fixup to set vararg frame on shadow area (4 x i64).
+        if (NumIntRegs < 4)
+          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+      } else {
+        // For X86-64, if there are vararg parameters that are passed via
+        // registers, then we must store them to their spots on the stack so
+        // they may be loaded by deferencing the result of va_next.
+        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
+        FuncInfo->setRegSaveFrameIndex(
+          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
+                               false));
+      }
+
+      // Store the integer parameter registers.
+      SmallVector<SDValue, 8> MemOps;
+      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+                                        getPointerTy());
+      unsigned Offset = FuncInfo->getVarArgsGPOffset();
+      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+                                  DAG.getIntPtrConstant(Offset));
+        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
+                                     &X86::GR64RegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+        SDValue Store =
+          DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                       MachinePointerInfo::getFixedStack(
+                         FuncInfo->getRegSaveFrameIndex(), Offset),
+                       false, false, 0);
+        MemOps.push_back(Store);
+        Offset += 8;
+      }
+
+      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+        // Now store the XMM (fp + vector) parameter registers.
+        SmallVector<SDValue, 11> SaveXMMOps;
+        SaveXMMOps.push_back(Chain);
+
+        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
+        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+        SaveXMMOps.push_back(ALVal);
+
+        SaveXMMOps.push_back(DAG.getIntPtrConstant(
+                               FuncInfo->getRegSaveFrameIndex()));
+        SaveXMMOps.push_back(DAG.getIntPtrConstant(
+                               FuncInfo->getVarArgsFPOffset()));
+
+        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
+          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
+                                       &X86::VR128RegClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
+          SaveXMMOps.push_back(Val);
+        }
+        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+                                     MVT::Other,
+                                     &SaveXMMOps[0], SaveXMMOps.size()));
+      }
+
+      if (!MemOps.empty())
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                            &MemOps[0], MemOps.size());
+    }
+  }
+
+  // Some CCs need callee pop.
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
+    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
+  } else {
+    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
+    // If this is an sret function, the return should pop the hidden pointer.
+    if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+        argsAreStructReturn(Ins) == StackStructReturn)
+      FuncInfo->setBytesToPopOnReturn(4);
+  }
+
+  if (!Is64Bit) {
+    // RegSaveFrameIndex is X86-64 only.
+    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+    if (CallConv == CallingConv::X86_FastCall ||
+        CallConv == CallingConv::X86_ThisCall)
+      // fastcc functions can't have varargs.
+      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+  }
+
+  FuncInfo->setArgumentStackSize(StackSize);
+
+  return Chain;
+}
+
+SDValue
+X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
+                                    SDValue StackPtr, SDValue Arg,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    ISD::ArgFlagsTy Flags) const {
+  unsigned LocMemOffset = VA.getLocMemOffset();
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+  if (Flags.isByVal())
+    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+
+  return DAG.getStore(Chain, dl, Arg, PtrOff,
+                      MachinePointerInfo::getStack(LocMemOffset),
+                      false, false, 0);
+}
+
+/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
+/// optimization is performed and it is required.
+SDValue
+X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
+                                           SDValue &OutRetAddr, SDValue Chain,
+                                           bool IsTailCall, bool Is64Bit,
+                                           int FPDiff, DebugLoc dl) const {
+  // Adjust the Return address stack slot.
+  EVT VT = getPointerTy();
+  OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+  // Load the "old" Return address.
+  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
+                           false, false, false, 0);
+  return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
+/// optimization is performed and it is required (FPDiff!=0).
+static SDValue
+EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
+                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
+                         unsigned SlotSize, int FPDiff, DebugLoc dl) {
+  // Store the return address to the appropriate stack slot.
+  if (!FPDiff) return Chain;
+  // Calculate the new stack slot for the return address.
+  int NewReturnAddrFI =
+    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
+  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
+  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
+                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
+                       false, false, 0);
+  return Chain;
+}
+
+SDValue
+X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                             SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool &isTailCall                      = CLI.IsTailCall;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool Is64Bit        = Subtarget->is64Bit();
+  bool IsWin64        = Subtarget->isTargetWin64();
+  bool IsWindows      = Subtarget->isTargetWindows();
+  StructReturnType SR = callIsStructReturn(Outs);
+  bool IsSibcall      = false;
+
+  if (MF.getTarget().Options.DisableTailCalls)
+    isTailCall = false;
+
+  if (isTailCall) {
+    // Check if it's really possible to do a tail call.
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
+                    isVarArg, SR != NotStructReturn,
+                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
+                    Outs, OutVals, Ins, DAG);
+
+    // Sibcalls are automatically detected tailcalls which do not require
+    // ABI changes.
+    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
+      IsSibcall = true;
+
+    if (isTailCall)
+      ++NumTailCalls;
+  }
+
+  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
+         "Var args not supported with calling convention fastcc, ghc or hipe");
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+
+  // Allocate shadow area for Win64
+  if (IsWin64) {
+    CCInfo.AllocateStack(32, 8);
+  }
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  if (IsSibcall)
+    // This is a sibcall. The memory operands are available in caller's
+    // own caller's stack.
+    NumBytes = 0;
+  else if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+           IsTailCallConvention(CallConv))
+    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+  int FPDiff = 0;
+  if (isTailCall && !IsSibcall) {
+    // Lower arguments at fp - stackoffset + fpdiff.
+    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
+    FPDiff = NumBytesCallerPushed - NumBytes;
+
+    // Set the delta of movement of the returnaddr stackslot.
+    // But only set if delta is greater than previous delta.
+    if (FPDiff < X86Info->getTCReturnAddrDelta())
+      X86Info->setTCReturnAddrDelta(FPDiff);
+  }
+
+  if (!IsSibcall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+  SDValue RetAddrFrIdx;
+  // Load return address for tail calls.
+  if (isTailCall && FPDiff)
+    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+                                    Is64Bit, FPDiff, dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.  In the case
+  // of tail call optimization arguments are handle later.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    EVT RegVT = VA.getLocVT();
+    SDValue Arg = OutVals[i];
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    bool isByVal = Flags.isByVal();
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::AExt:
+      if (RegVT.is128BitVector()) {
+        // Special case: passing MMX values in XMM registers.
+        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
+        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+      } else
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
+      break;
+    case CCValAssign::Indirect: {
+      // Store the argument.
+      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
+                           MachinePointerInfo::getFixedStack(FI),
+                           false, false, 0);
+      Arg = SpillSlot;
+      break;
+    }
+    }
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      if (isVarArg && IsWin64) {
+        // Win64 ABI requires argument XMM reg to be copied to the corresponding
+        // shadow reg if callee is a varargs function.
+        unsigned ShadowReg = 0;
+        switch (VA.getLocReg()) {
+        case X86::XMM0: ShadowReg = X86::RCX; break;
+        case X86::XMM1: ShadowReg = X86::RDX; break;
+        case X86::XMM2: ShadowReg = X86::R8; break;
+        case X86::XMM3: ShadowReg = X86::R9; break;
+        }
+        if (ShadowReg)
+          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
+      }
+    } else if (!IsSibcall && (!isTailCall || isByVal)) {
+      assert(VA.isMemLoc());
+      if (StackPtr.getNode() == 0)
+        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                                      getPointerTy());
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  if (Subtarget->isPICStyleGOT()) {
+    // ELF / PIC requires GOT in the EBX register before function calls via PLT
+    // GOT pointer.
+    if (!isTailCall) {
+      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
+               DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
+    } else {
+      // If we are tail calling and generating PIC/GOT style code load the
+      // address of the callee into ECX. The value in ecx is used as target of
+      // the tail jump. This is done to circumvent the ebx/callee-saved problem
+      // for tail calls on PIC/GOT architectures. Normally we would just put the
+      // address of GOT into ebx and then call target@PLT. But for tail calls
+      // ebx would be restored (since ebx is callee saved) before jumping to the
+      // target@PLT.
+
+      // Note: The actual moving to ECX is done further down.
+      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+      if (G && !G->getGlobal()->hasHiddenVisibility() &&
+          !G->getGlobal()->hasProtectedVisibility())
+        Callee = LowerGlobalAddress(Callee, DAG);
+      else if (isa<ExternalSymbolSDNode>(Callee))
+        Callee = LowerExternalSymbol(Callee, DAG);
+    }
+  }
+
+  if (Is64Bit && isVarArg && !IsWin64) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an ubound on the number of SSE
+    // registers used and is in the range 0 - 8 inclusive.
+
+    // Count the number of XMM registers allocated.
+    static const uint16_t XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
+           && "SSE registers cannot be used when SSE is disabled");
+
+    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
+  }
+
+  // For tail calls lower the arguments to the 'real' stack slot.
+  if (isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+    SmallVector<SDValue, 8> MemOpChains2;
+    SDValue FIN;
+    int FI = 0;
+    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+        CCValAssign &VA = ArgLocs[i];
+        if (VA.isRegLoc())
+          continue;
+        assert(VA.isMemLoc());
+        SDValue Arg = OutVals[i];
+        ISD::ArgFlagsTy Flags = Outs[i].Flags;
+        // Create frame index.
+        int32_t Offset = VA.getLocMemOffset()+FPDiff;
+        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+        FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+        if (Flags.isByVal()) {
+          // Copy relative to framepointer.
+          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+          if (StackPtr.getNode() == 0)
+            StackPtr = DAG.getCopyFromReg(Chain, dl,
+                                          RegInfo->getStackRegister(),
+                                          getPointerTy());
+          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+                                                           ArgChain,
+                                                           Flags, DAG, dl));
+        } else {
+          // Store relative to framepointer.
+          MemOpChains2.push_back(
+            DAG.getStore(ArgChain, dl, Arg, FIN,
+                         MachinePointerInfo::getFixedStack(FI),
+                         false, false, 0));
+        }
+      }
+    }
+
+    if (!MemOpChains2.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                          &MemOpChains2[0], MemOpChains2.size());
+
+    // Store the return address to the appropriate stack slot.
+    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+                                     getPointerTy(), RegInfo->getSlotSize(),
+                                     FPDiff, dl);
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+    // In the 64-bit large code model, we have to make all calls
+    // through a register, since the call instruction's 32-bit
+    // pc-relative offset may not be large enough to hold the whole
+    // address.
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // If the callee is a GlobalAddress node (quite common, every direct call
+    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+    // it.
+
+    // We should use extra load for direct calls to dllimported functions in
+    // non-JIT mode.
+    const GlobalValue *GV = G->getGlobal();
+    if (!GV->hasDLLImportLinkage()) {
+      unsigned char OpFlags = 0;
+      bool ExtraLoad = false;
+      unsigned WrapperKind = ISD::DELETED_NODE;
+
+      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+      // external symbols most go through the PLT in PIC mode.  If the symbol
+      // has hidden or protected visibility, or if it is static or local, then
+      // we don't need to use the PLT - we can directly call it.
+      if (Subtarget->isTargetELF() &&
+          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+        OpFlags = X86II::MO_PLT;
+      } else if (Subtarget->isPICStyleStubAny() &&
+                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
+                 (!Subtarget->getTargetTriple().isMacOSX() ||
+                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+        // PC-relative references to external symbols should go through $stub,
+        // unless we're building with the leopard linker or later, which
+        // automatically synthesizes these stubs.
+        OpFlags = X86II::MO_DARWIN_STUB;
+      } else if (Subtarget->isPICStyleRIPRel() &&
+                 isa<Function>(GV) &&
+                 cast<Function>(GV)->getAttributes().
+                   hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::NonLazyBind)) {
+        // If the function is marked as non-lazy, generate an indirect call
+        // which loads from the GOT directly. This avoids runtime overhead
+        // at the cost of eager binding (and one extra byte of encoding).
+        OpFlags = X86II::MO_GOTPCREL;
+        WrapperKind = X86ISD::WrapperRIP;
+        ExtraLoad = true;
+      }
+
+      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
+                                          G->getOffset(), OpFlags);
+
+      // Add a wrapper if needed.
+      if (WrapperKind != ISD::DELETED_NODE)
+        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
+      // Add extra indirection if needed.
+      if (ExtraLoad)
+        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
+                             MachinePointerInfo::getGOT(),
+                             false, false, false, 0);
+    }
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    unsigned char OpFlags = 0;
+
+    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
+    // external symbols should go through the PLT.
+    if (Subtarget->isTargetELF() &&
+        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+      OpFlags = X86II::MO_PLT;
+    } else if (Subtarget->isPICStyleStubAny() &&
+               (!Subtarget->getTargetTriple().isMacOSX() ||
+                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = X86II::MO_DARWIN_STUB;
+    }
+
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+                                         OpFlags);
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+
+  if (!IsSibcall && isTailCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                           DAG.getIntPtrConstant(0, true), InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (isTailCall)
+    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  if (isTailCall) {
+    // We used to do:
+    //// If this is the first return lowered for this function, add the regs
+    //// to the liveout set for the function.
+    // This isn't right, although it's probably harmless on x86; liveouts
+    // should be computed from returns not tail calls.  Consider a void
+    // function making a tail call to a function returning int.
+    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+  }
+
+  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  unsigned NumBytesForCalleeToPush;
+  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+                       getTargetMachine().Options.GuaranteedTailCallOpt))
+    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
+  else if (!Is64Bit && !IsTailCallConvention(CallConv) && !IsWindows &&
+           SR == StackStructReturn)
+    // If this is a call to a struct-return function, the callee
+    // pops the hidden struct pointer, so we have to push it back.
+    // This is common for Darwin/X86, Linux & Mingw32 targets.
+    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
+    NumBytesForCalleeToPush = 4;
+  else
+    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
+
+  // Returns a flag for retval copy to use.
+  if (!IsSibcall) {
+    Chain = DAG.getCALLSEQ_END(Chain,
+                               DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+                                                     true),
+                               InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+//===----------------------------------------------------------------------===//
+//                Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+//  Like std call, callee cleans arguments, convention except that ECX is
+//  reserved for storing the tail called function address. Only 2 registers are
+//  free for argument passing (inreg). Tail call optimization is performed
+//  provided:
+//                * tailcallopt is enabled
+//                * caller/callee are fastcc
+//  On X86_64 architecture with GOT-style position independent code only local
+//  (within module) calls are supported at the moment.
+//  To keep the stack aligned according to platform abi the function
+//  GetAlignedArgumentStackSize ensures that argument delta is always multiples
+//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
+//  If a tail called function callee has more arguments than the caller the
+//  caller needs to make sure that there is room to move the RETADDR to. This is
+//  achieved by reserving an area the size of the argument delta right after the
+//  original REtADDR, but before the saved framepointer or the spilled registers
+//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+//  stack layout:
+//    arg1
+//    arg2
+//    RETADDR
+//    [ new RETADDR
+//      move area ]
+//    (possible EBP)
+//    ESI
+//    EDI
+//    local1 ..
+
+/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
+/// for a 16 byte align requirement.
+unsigned
+X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+                                               SelectionDAG& DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetMachine &TM = MF.getTarget();
+  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  unsigned StackAlignment = TFI.getStackAlignment();
+  uint64_t AlignMask = StackAlignment - 1;
+  int64_t Offset = StackSize;
+  unsigned SlotSize = RegInfo->getSlotSize();
+  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
+    // Number smaller than 12 so just add the difference.
+    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+  } else {
+    // Mask out lower bits, add stackalignment once plus the 12 bytes.
+    Offset = ((~AlignMask) & Offset) + StackAlignment +
+      (StackAlignment-SlotSize);
+  }
+  return Offset;
+}
+
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+                         const X86InstrInfo *TII) {
+  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
+  int FI = INT_MAX;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(VR))
+      return false;
+    MachineInstr *Def = MRI->getVRegDef(VR);
+    if (!Def)
+      return false;
+    if (!Flags.isByVal()) {
+      if (!TII->isLoadFromStackSlot(Def, FI))
+        return false;
+    } else {
+      unsigned Opcode = Def->getOpcode();
+      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
+          Def->getOperand(1).isFI()) {
+        FI = Def->getOperand(1).getIndex();
+        Bytes = Flags.getByValSize();
+      } else
+        return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    if (Flags.isByVal())
+      // ByVal argument is passed in as a pointer but it's now being
+      // dereferenced. e.g.
+      // define @foo(%struct.X* %A) {
+      //   tail call @bar(%struct.X* byval %A)
+      // }
+      return false;
+    SDValue Ptr = Ld->getBasePtr();
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
+    if (!FINode)
+      return false;
+    FI = FINode->getIndex();
+  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
+    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
+    FI = FINode->getIndex();
+    Bytes = Flags.getByValSize();
+  } else
+    return false;
+
+  assert(FI != INT_MAX);
+  if (!MFI->isFixedObjectIndex(FI))
+    return false;
+  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                                     bool isCalleeStructRet,
+                                                     bool isCallerStructRet,
+                                                     Type *RetTy,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG &DAG) const {
+  if (!IsTailCallConvention(CalleeCC) &&
+      CalleeCC != CallingConv::C)
+    return false;
+
+  // If -tailcallopt is specified, make fastcc functions tail-callable.
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = DAG.getMachineFunction().getFunction();
+
+  // If the function return type is x86_fp80 and the callee return type is not,
+  // then the FP_EXTEND of the call result is not a nop. It's not safe to
+  // perform a tailcall optimization here.
+  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+    return false;
+
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  bool CCMatch = CallerCC == CalleeCC;
+
+  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
+    if (IsTailCallConvention(CalleeCC) && CCMatch)
+      return true;
+    return false;
+  }
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
+  // emit a special epilogue.
+  if (RegInfo->needsStackRealignment(MF))
+    return false;
+
+  // Also avoid sibcall optimization if either caller or callee uses struct
+  // return semantics.
+  if (isCalleeStructRet || isCallerStructRet)
+    return false;
+
+  // An stdcall caller is expected to clean up its arguments; the callee
+  // isn't going to do that.
+  if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
+    return false;
+
+  // Do not sibcall optimize vararg calls unless all arguments are passed via
+  // registers.
+  if (isVarArg && !Outs.empty()) {
+
+    // Optimizing for varargs on Win64 is unlikely to be safe without
+    // additional testing.
+    if (Subtarget->isTargetWin64())
+      return false;
+
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                   getTargetMachine(), ArgLocs, *DAG.getContext());
+
+    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
+      if (!ArgLocs[i].isRegLoc())
+        return false;
+  }
+
+  // If the call result is in ST0 / ST1, it needs to be popped off the x87
+  // stack.  Therefore, if it's not used by the call it is not safe to optimize
+  // this into a sibcall.
+  bool Unused = false;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    if (!Ins[i].Used) {
+      Unused = true;
+      break;
+    }
+  }
+  if (Unused) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(),
+                   getTargetMachine(), RVLocs, *DAG.getContext());
+    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+      CCValAssign &VA = RVLocs[i];
+      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
+        return false;
+    }
+  }
+
+  // If the calling conventions do not match, then we'd better make sure the
+  // results are returned in the same way as what the caller expects.
+  if (!CCMatch) {
+    SmallVector<CCValAssign, 16> RVLocs1;
+    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs1, *DAG.getContext());
+    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
+
+    SmallVector<CCValAssign, 16> RVLocs2;
+    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
+                    getTargetMachine(), RVLocs2, *DAG.getContext());
+    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
+
+    if (RVLocs1.size() != RVLocs2.size())
+      return false;
+    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
+      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
+        return false;
+      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
+        return false;
+      if (RVLocs1[i].isRegLoc()) {
+        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
+          return false;
+      } else {
+        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
+          return false;
+      }
+    }
+  }
+
+  // If the callee takes no arguments then go on to check the results of the
+  // call.
+  if (!Outs.empty()) {
+    // Check if stack adjustment is needed. For now, do not do this if any
+    // argument is passed on the stack.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
+                   getTargetMachine(), ArgLocs, *DAG.getContext());
+
+    // Allocate shadow area for Win64
+    if (Subtarget->isTargetWin64()) {
+      CCInfo.AllocateStack(32, 8);
+    }
+
+    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+    if (CCInfo.getNextStackOffset()) {
+      MachineFunction &MF = DAG.getMachineFunction();
+      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
+        return false;
+
+      // Check if the arguments are already laid out in the right way as
+      // the caller's fixed stack objects.
+      MachineFrameInfo *MFI = MF.getFrameInfo();
+      const MachineRegisterInfo *MRI = &MF.getRegInfo();
+      const X86InstrInfo *TII =
+        ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
+      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+        CCValAssign &VA = ArgLocs[i];
+        SDValue Arg = OutVals[i];
+        ISD::ArgFlagsTy Flags = Outs[i].Flags;
+        if (VA.getLocInfo() == CCValAssign::Indirect)
+          return false;
+        if (!VA.isRegLoc()) {
+          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                   MFI, MRI, TII))
+            return false;
+        }
+      }
+    }
+
+    // If the tailcall address may be in a register, then make sure it's
+    // possible to register allocate for it. In 32-bit, the call address can
+    // only target EAX, EDX, or ECX since the tail call must be scheduled after
+    // callee-saved registers are restored. These happen to be the same
+    // registers used to pass 'inreg' arguments so watch out for those.
+    if (!Subtarget->is64Bit() &&
+        ((!isa<GlobalAddressSDNode>(Callee) &&
+          !isa<ExternalSymbolSDNode>(Callee)) ||
+         getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
+      unsigned NumInRegs = 0;
+      // In PIC we need an extra register to formulate the address computation
+      // for the callee.
+      unsigned MaxInRegs =
+          (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+
+      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+        CCValAssign &VA = ArgLocs[i];
+        if (!VA.isRegLoc())
+          continue;
+        unsigned Reg = VA.getLocReg();
+        switch (Reg) {
+        default: break;
+        case X86::EAX: case X86::EDX: case X86::ECX:
+          if (++NumInRegs == MaxInRegs)
+            return false;
+          break;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+FastISel *
+X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  return X86::createFastISel(funcInfo, libInfo);
+}
+
+//===----------------------------------------------------------------------===//
+//                           Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+static bool MayFoldLoad(SDValue Op) {
+  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
+}
+
+static bool MayFoldIntoStore(SDValue Op) {
+  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
+}
+
+static bool isTargetShuffle(unsigned Opcode) {
+  switch(Opcode) {
+  default: return false;
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::SHUFP:
+  case X86ISD::PALIGNR:
+  case X86ISD::MOVLHPS:
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLPS:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+  case X86ISD::MOVDDUP:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
+  case X86ISD::VPERMILP:
+  case X86ISD::VPERM2X128:
+  case X86ISD::VPERMI:
+    return true;
+  }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+                                    SDValue V1, SelectionDAG &DAG) {
+  switch(Opc) {
+  default: llvm_unreachable("Unknown x86 shuffle node");
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+  case X86ISD::MOVDDUP:
+    return DAG.getNode(Opc, dl, VT, V1);
+  }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+                                    SDValue V1, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
+  switch(Opc) {
+  default: llvm_unreachable("Unknown x86 shuffle node");
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::VPERMILP:
+  case X86ISD::VPERMI:
+    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
+  }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+                                    SDValue V1, SDValue V2, unsigned TargetMask,
+                                    SelectionDAG &DAG) {
+  switch(Opc) {
+  default: llvm_unreachable("Unknown x86 shuffle node");
+  case X86ISD::PALIGNR:
+  case X86ISD::SHUFP:
+  case X86ISD::VPERM2X128:
+    return DAG.getNode(Opc, dl, VT, V1, V2,
+                       DAG.getConstant(TargetMask, MVT::i8));
+  }
+}
+
+static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
+  switch(Opc) {
+  default: llvm_unreachable("Unknown x86 shuffle node");
+  case X86ISD::MOVLHPS:
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLPS:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::UNPCKL:
+  case X86ISD::UNPCKH:
+    return DAG.getNode(Opc, dl, VT, V1, V2);
+  }
+}
+
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  int ReturnAddrIndex = FuncInfo->getRAIndex();
+
+  if (ReturnAddrIndex == 0) {
+    // Set up a frame object for the return address.
+    unsigned SlotSize = RegInfo->getSlotSize();
+    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+                                                           false);
+    FuncInfo->setRAIndex(ReturnAddrIndex);
+  }
+
+  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                       bool hasSymbolicDisplacement) {
+  // Offset should fit into 32 bit immediate field.
+  if (!isInt<32>(Offset))
+    return false;
+
+  // If we don't have a symbolic displacement - we don't have any extra
+  // restrictions.
+  if (!hasSymbolicDisplacement)
+    return true;
+
+  // FIXME: Some tweaks might be needed for medium code model.
+  if (M != CodeModel::Small && M != CodeModel::Kernel)
+    return false;
+
+  // For small code model we assume that latest object is 16MB before end of 31
+  // bits boundary. We may also accept pretty large negative constants knowing
+  // that all objects are in the positive half of address space.
+  if (M == CodeModel::Small && Offset < 16*1024*1024)
+    return true;
+
+  // For kernel code model we know that all object resist in the negative half
+  // of 32bits address space. We may not accept negative offsets, since they may
+  // be just off and we may accept pretty large positive ones.
+  if (M == CodeModel::Kernel && Offset > 0)
+    return true;
+
+  return false;
+}
+
+/// isCalleePop - Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool X86::isCalleePop(CallingConv::ID CallingConv,
+                      bool is64Bit, bool IsVarArg, bool TailCallOpt) {
+  if (IsVarArg)
+    return false;
+
+  switch (CallingConv) {
+  default:
+    return false;
+  case CallingConv::X86_StdCall:
+    return !is64Bit;
+  case CallingConv::X86_FastCall:
+    return !is64Bit;
+  case CallingConv::X86_ThisCall:
+    return !is64Bit;
+  case CallingConv::Fast:
+    return TailCallOpt;
+  case CallingConv::GHC:
+    return TailCallOpt;
+  case CallingConv::HiPE:
+    return TailCallOpt;
+  }
+}
+
+/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
+/// specific condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+  if (!isFP) {
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+        // X > -1   -> X == 0, jump !sign.
+        RHS = DAG.getConstant(0, RHS.getValueType());
+        return X86::COND_NS;
+      }
+      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+        // X < 0   -> X == 0, jump on sign.
+        return X86::COND_S;
+      }
+      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+        // X < 1   -> X <= 0
+        RHS = DAG.getConstant(0, RHS.getValueType());
+        return X86::COND_LE;
+      }
+    }
+
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Invalid integer condition!");
+    case ISD::SETEQ:  return X86::COND_E;
+    case ISD::SETGT:  return X86::COND_G;
+    case ISD::SETGE:  return X86::COND_GE;
+    case ISD::SETLT:  return X86::COND_L;
+    case ISD::SETLE:  return X86::COND_LE;
+    case ISD::SETNE:  return X86::COND_NE;
+    case ISD::SETULT: return X86::COND_B;
+    case ISD::SETUGT: return X86::COND_A;
+    case ISD::SETULE: return X86::COND_BE;
+    case ISD::SETUGE: return X86::COND_AE;
+    }
+  }
+
+  // First determine if it is required or is profitable to flip the operands.
+
+  // If LHS is a foldable load, but RHS is not, flip the condition.
+  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
+      !ISD::isNON_EXTLoad(RHS.getNode())) {
+    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+    std::swap(LHS, RHS);
+  }
+
+  switch (SetCCOpcode) {
+  default: break;
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    std::swap(LHS, RHS);
+    break;
+  }
+
+  // On a floating point condition, the flags are set as follows:
+  // ZF  PF  CF   op
+  //  0 | 0 | 0 | X > Y
+  //  0 | 0 | 1 | X < Y
+  //  1 | 0 | 0 | X == Y
+  //  1 | 1 | 1 | unordered
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Condcode should be pre-legalized away");
+  case ISD::SETUEQ:
+  case ISD::SETEQ:   return X86::COND_E;
+  case ISD::SETOLT:              // flipped
+  case ISD::SETOGT:
+  case ISD::SETGT:   return X86::COND_A;
+  case ISD::SETOLE:              // flipped
+  case ISD::SETOGE:
+  case ISD::SETGE:   return X86::COND_AE;
+  case ISD::SETUGT:              // flipped
+  case ISD::SETULT:
+  case ISD::SETLT:   return X86::COND_B;
+  case ISD::SETUGE:              // flipped
+  case ISD::SETULE:
+  case ISD::SETLE:   return X86::COND_BE;
+  case ISD::SETONE:
+  case ISD::SETNE:   return X86::COND_NE;
+  case ISD::SETUO:   return X86::COND_P;
+  case ISD::SETO:    return X86::COND_NP;
+  case ISD::SETOEQ:
+  case ISD::SETUNE:  return X86::COND_INVALID;
+  }
+}
+
+/// hasFPCMov - is there a floating point cmov for the specific X86 condition
+/// code. Current x86 isa includes the following FP cmov instructions:
+/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+  switch (X86CC) {
+  default:
+    return false;
+  case X86::COND_B:
+  case X86::COND_BE:
+  case X86::COND_E:
+  case X86::COND_P:
+  case X86::COND_A:
+  case X86::COND_AE:
+  case X86::COND_NE:
+  case X86::COND_NP:
+    return true;
+  }
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
+    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+      return true;
+  }
+  return false;
+}
+
+/// isUndefOrInRange - Return true if Val is undef or if its value falls within
+/// the specified range (L, H].
+static bool isUndefOrInRange(int Val, int Low, int Hi) {
+  return (Val < 0) || (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
+/// specified value.
+static bool isUndefOrEqual(int Val, int CmpVal) {
+  return (Val < 0 || Val == CmpVal);
+}
+
+/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
+/// from position Pos and ending in Pos+Size, falls within the specified
+/// sequential range (L, L+Pos]. or is undef.
+static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
+                                       unsigned Pos, unsigned Size, int Low) {
+  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+    if (!isUndefOrEqual(Mask[i], Low))
+      return false;
+  return true;
+}
+
+/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
+/// the second operand.
+static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
+  if (VT == MVT::v4f32 || VT == MVT::v4i32 )
+    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
+  if (VT == MVT::v2f64 || VT == MVT::v2i64)
+    return (Mask[0] < 2 && Mask[1] < 2);
+  return false;
+}
+
+/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFHW.
+static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
+    return false;
+
+  // Lower quadword copied in order or undef.
+  if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
+    return false;
+
+  // Upper quadword shuffled.
+  for (unsigned i = 4; i != 8; ++i)
+    if (!isUndefOrInRange(Mask[i], 4, 8))
+      return false;
+
+  if (VT == MVT::v16i16) {
+    // Lower quadword copied in order or undef.
+    if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
+      return false;
+
+    // Upper quadword shuffled.
+    for (unsigned i = 12; i != 16; ++i)
+      if (!isUndefOrInRange(Mask[i], 12, 16))
+        return false;
+  }
+
+  return true;
+}
+
+/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFLW.
+static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+  if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
+    return false;
+
+  // Upper quadword copied in order.
+  if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
+    return false;
+
+  // Lower quadword shuffled.
+  for (unsigned i = 0; i != 4; ++i)
+    if (!isUndefOrInRange(Mask[i], 0, 4))
+      return false;
+
+  if (VT == MVT::v16i16) {
+    // Upper quadword copied in order.
+    if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
+      return false;
+
+    // Lower quadword shuffled.
+    for (unsigned i = 8; i != 12; ++i)
+      if (!isUndefOrInRange(Mask[i], 8, 12))
+        return false;
+  }
+
+  return true;
+}
+
+/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PALIGNR.
+static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+                          const X86Subtarget *Subtarget) {
+  if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
+      (VT.is256BitVector() && !Subtarget->hasInt256()))
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  // Do not handle 64-bit element shuffles with palignr.
+  if (NumLaneElts == 2)
+    return false;
+
+  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
+    unsigned i;
+    for (i = 0; i != NumLaneElts; ++i) {
+      if (Mask[i+l] >= 0)
+        break;
+    }
+
+    // Lane is all undef, go to next lane
+    if (i == NumLaneElts)
+      continue;
+
+    int Start = Mask[i+l];
+
+    // Make sure its in this lane in one of the sources
+    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
+        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
+      return false;
+
+    // If not lane 0, then we must match lane 0
+    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
+      return false;
+
+    // Correct second source to be contiguous with first source
+    if (Start >= (int)NumElts)
+      Start -= NumElts - NumLaneElts;
+
+    // Make sure we're shifting in the right direction.
+    if (Start <= (int)(i+l))
+      return false;
+
+    Start -= i;
+
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != NumLaneElts; ++i) {
+      int Idx = Mask[i+l];
+
+      // Make sure its in this lane
+      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
+          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
+        return false;
+
+      // If not lane 0, then we must match lane 0
+      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
+        return false;
+
+      if (Idx >= (int)NumElts)
+        Idx -= NumElts - NumLaneElts;
+
+      if (!isUndefOrEqual(Idx, Start+i))
+        return false;
+
+    }
+  }
+
+  return true;
+}
+
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
+                                     unsigned NumElems) {
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int idx = Mask[i];
+    if (idx < 0)
+      continue;
+    else if (idx < (int)NumElems)
+      Mask[i] = idx + NumElems;
+    else
+      Mask[i] = idx - NumElems;
+  }
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 128/256-bit
+/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
+/// reverse of what x86 shuffles want.
+static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
+                        bool Commuted = false) {
+  if (!HasFp256 && VT.is256BitVector())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElems = NumElems/NumLanes;
+
+  if (NumLaneElems != 2 && NumLaneElems != 4)
+    return false;
+
+  // VSHUFPSY divides the resulting vector into 4 chunks.
+  // The sources are also splitted into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
+  //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
+  //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
+  //
+  //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
+  //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
+  //
+  // VSHUFPDY divides the resulting vector into 4 chunks.
+  // The sources are also splitted into 4 chunks, and each destination
+  // chunk must come from a different source chunk.
+  //
+  //  SRC1 =>      X3       X2       X1       X0
+  //  SRC2 =>      Y3       Y2       Y1       Y0
+  //
+  //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
+  //
+  unsigned HalfLaneElems = NumLaneElems/2;
+  for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
+    for (unsigned i = 0; i != NumLaneElems; ++i) {
+      int Idx = Mask[i+l];
+      unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
+      if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
+        return false;
+      // For VSHUFPSY, the mask of the second half must be the same as the
+      // first but with the appropriate offsets. This works in the same way as
+      // VPERMILPS works with masks.
+      if (NumElems != 8 || l == 0 || Mask[i] < 0)
+        continue;
+      if (!isUndefOrEqual(Idx, Mask[i]+l))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 4)
+    return false;
+
+  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
+  return isUndefOrEqual(Mask[0], 6) &&
+         isUndefOrEqual(Mask[1], 7) &&
+         isUndefOrEqual(Mask[2], 2) &&
+         isUndefOrEqual(Mask[3], 3);
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 4)
+    return false;
+
+  return isUndefOrEqual(Mask[0], 2) &&
+         isUndefOrEqual(Mask[1], 3) &&
+         isUndefOrEqual(Mask[2], 2) &&
+         isUndefOrEqual(Mask[3], 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i + NumElems))
+      return false;
+
+  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+
+  return true;
+}
+
+/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
+static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i + e], i + NumElems))
+      return false;
+
+  return true;
+}
+
+//
+// Some special combinations that can be optimized.
+//
+static
+SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
+                               SelectionDAG &DAG) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  DebugLoc dl = SVOp->getDebugLoc();
+
+  if (VT != MVT::v8i32 && VT != MVT::v8f32)
+    return SDValue();
+
+  ArrayRef<int> Mask = SVOp->getMask();
+
+  // These are the special masks that may be optimized.
+  static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
+  static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
+  bool MatchEvenMask = true;
+  bool MatchOddMask  = true;
+  for (int i=0; i<8; ++i) {
+    if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
+      MatchEvenMask = false;
+    if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
+      MatchOddMask = false;
+  }
+
+  if (!MatchEvenMask && !MatchOddMask)
+    return SDValue();
+
+  SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
+
+  SDValue Op0 = SVOp->getOperand(0);
+  SDValue Op1 = SVOp->getOperand(1);
+
+  if (MatchEvenMask) {
+    // Shift the second operand right to 32 bits.
+    static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
+    Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
+  } else {
+    // Shift the first operand left to 32 bits.
+    static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
+    Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
+  }
+  static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
+  return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
+static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
+                         bool HasInt256, bool V2IsSplat = false) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+    return false;
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+         i != (l+1)*NumLaneElts;
+         i += 2, ++j) {
+      int BitI  = Mask[i];
+      int BitI1 = Mask[i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (V2IsSplat) {
+        if (!isUndefOrEqual(BitI1, NumElts))
+          return false;
+      } else {
+        if (!isUndefOrEqual(BitI1, j + NumElts))
+          return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
+static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
+                         bool HasInt256, bool V2IsSplat = false) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+    return false;
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+         i != (l+1)*NumLaneElts; i += 2, ++j) {
+      int BitI  = Mask[i];
+      int BitI1 = Mask[i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (V2IsSplat) {
+        if (isUndefOrEqual(BitI1, NumElts))
+          return false;
+      } else {
+        if (!isUndefOrEqual(BitI1, j+NumElts))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+  unsigned NumElts = VT.getVectorNumElements();
+  bool Is256BitVec = VT.is256BitVector();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+    return false;
+
+  // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
+  // FIXME: Need a better way to get rid of this, there's no latency difference
+  // between UNPCKLPD and MOVDDUP, the later should always be checked first and
+  // the former later. We should also remove the "_undef" special mask.
+  if (NumElts == 4 && Is256BitVec)
+    return false;
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
+         i != (l+1)*NumLaneElts;
+         i += 2, ++j) {
+      int BitI  = Mask[i];
+      int BitI1 = Mask[i+1];
+
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (!isUndefOrEqual(BitI1, j))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for unpckh");
+
+  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
+      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+    return false;
+
+  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+  // independently on 128-bit lanes.
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  for (unsigned l = 0; l != NumLanes; ++l) {
+    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
+         i != (l+1)*NumLaneElts; i += 2, ++j) {
+      int BitI  = Mask[i];
+      int BitI1 = Mask[i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (!isUndefOrEqual(BitI1, j))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
+static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
+  if (VT.getVectorElementType().getSizeInBits() < 32)
+    return false;
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  if (!isUndefOrEqual(Mask[0], NumElts))
+    return false;
+
+  for (unsigned i = 1; i != NumElts; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+
+  return true;
+}
+
+/// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example: this
+/// shuffle bellow:
+///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half from the
+/// the second half of V2.
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+  if (!HasFp256 || !VT.is256BitVector())
+    return false;
+
+  // The shuffle result is divided into half A and half B. In total the two
+  // sources have 4 halves, namely: C, D, E, F. The final values of A and
+  // B must come from C, D, E or F.
+  unsigned HalfSize = VT.getVectorNumElements()/2;
+  bool MatchA = false, MatchB = false;
+
+  // Check if A comes from one of C, D, E, F.
+  for (unsigned Half = 0; Half != 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+      MatchA = true;
+      break;
+    }
+  }
+
+  // Check if B comes from one of C, D, E, F.
+  for (unsigned Half = 0; Half != 4; ++Half) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+      MatchB = true;
+      break;
+    }
+  }
+
+  return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
+static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+
+  unsigned HalfSize = VT.getVectorNumElements()/2;
+
+  unsigned FstHalf = 0, SndHalf = 0;
+  for (unsigned i = 0; i < HalfSize; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      FstHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+  for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
+    if (SVOp->getMaskElt(i) > 0) {
+      SndHalf = SVOp->getMaskElt(i)/HalfSize;
+      break;
+    }
+  }
+
+  return (FstHalf | (SndHalf << 4));
+}
+
+/// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
+/// Note that VPERMIL mask matching is different depending whether theunderlying
+/// type is 32 or 64. In the VPERMILPS the high half of the mask should point
+/// to the same elements of the low, but to the higher half of the source.
+/// In VPERMILPD the two lanes could be shuffled independently of each other
+/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
+static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+  if (!HasFp256)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  // Only match 256-bit with 32/64-bit types
+  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
+    return false;
+
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned LaneSize = NumElts/NumLanes;
+  for (unsigned l = 0; l != NumElts; l += LaneSize) {
+    for (unsigned i = 0; i != LaneSize; ++i) {
+      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+        return false;
+      if (NumElts != 8 || l == 0)
+        continue;
+      // VPERMILPS handling
+      if (Mask[i] < 0)
+        continue;
+      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
+/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
+/// element of vector 2 and the other elements to come from vector 1 in order.
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
+                               bool V2IsSplat = false, bool V2IsUndef = false) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned NumOps = VT.getVectorNumElements();
+  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+    return false;
+
+  if (!isUndefOrEqual(Mask[0], 0))
+    return false;
+
+  for (unsigned i = 1; i != NumOps; ++i)
+    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
+          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
+          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
+      return false;
+
+  return true;
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
+static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
+                           const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasSSE3())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if ((VT.is128BitVector() && NumElems != 4) ||
+      (VT.is256BitVector() && NumElems != 8))
+    return false;
+
+  // "i+1" is the value the indexed mask element must have
+  for (unsigned i = 0; i != NumElems; i += 2)
+    if (!isUndefOrEqual(Mask[i], i+1) ||
+        !isUndefOrEqual(Mask[i+1], i+1))
+      return false;
+
+  return true;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
+static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
+                           const X86Subtarget *Subtarget) {
+  if (!Subtarget->hasSSE3())
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if ((VT.is128BitVector() && NumElems != 4) ||
+      (VT.is256BitVector() && NumElems != 8))
+    return false;
+
+  // "i" is the value the indexed mask element must have
+  for (unsigned i = 0; i != NumElems; i += 2)
+    if (!isUndefOrEqual(Mask[i], i) ||
+        !isUndefOrEqual(Mask[i+1], i))
+      return false;
+
+  return true;
+}
+
+/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+  if (!HasFp256 || !VT.is256BitVector())
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts != 4)
+    return false;
+
+  for (unsigned i = 0; i != NumElts/2; ++i)
+    if (!isUndefOrEqual(Mask[i], 0))
+      return false;
+  for (unsigned i = NumElts/2; i != NumElts; ++i)
+    if (!isUndefOrEqual(Mask[i], NumElts/2))
+      return false;
+  return true;
+}
+
+/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  unsigned e = VT.getVectorNumElements() / 2;
+  for (unsigned i = 0; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+  for (unsigned i = 0; i != e; ++i)
+    if (!isUndefOrEqual(Mask[e+i], i))
+      return false;
+  return true;
+}
+
+/// isVEXTRACTF128Index - Return true if the specified
+/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+/// suitable for input to VEXTRACTF128.
+bool X86::isVEXTRACTF128Index(SDNode *N) {
+  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+    return false;
+
+  // The index should be aligned on a 128-bit boundary.
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+  MVT VT = N->getValueType(0).getSimpleVT();
+  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+  bool Result = (Index * ElSize) % 128 == 0;
+
+  return Result;
+}
+
+/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
+/// operand specifies a subvector insert that is suitable for input to
+/// VINSERTF128.
+bool X86::isVINSERTF128Index(SDNode *N) {
+  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+    return false;
+
+  // The index should be aligned on a 128-bit boundary.
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+  MVT VT = N->getValueType(0).getSimpleVT();
+  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
+  bool Result = (Index * ElSize) % 128 == 0;
+
+  return Result;
+}
+
+/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
+/// Handles 128-bit and 256-bit.
+static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getValueType(0).getSimpleVT();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for PSHUF/SHUFP");
+
+  // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
+  // independently on 128-bit lanes.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
+         "Only supports 2 or 4 elements per lane");
+
+  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
+  unsigned Mask = 0;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int Elt = N->getMaskElt(i);
+    if (Elt < 0) continue;
+    Elt &= NumLaneElts - 1;
+    unsigned ShAmt = (i << Shift) % 8;
+    Mask |= Elt << ShAmt;
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
+static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getValueType(0).getSimpleVT();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+         "Unsupported vector type for PSHUFHW");
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned Mask = 0;
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    // 8 nodes per lane, but we only care about the last 4.
+    for (unsigned i = 0; i < 4; ++i) {
+      int Elt = N->getMaskElt(l+i+4);
+      if (Elt < 0) continue;
+      Elt &= 0x3; // only 2-bits.
+      Mask |= Elt << (i * 2);
+    }
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
+static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getValueType(0).getSimpleVT();
+
+  assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
+         "Unsupported vector type for PSHUFHW");
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  unsigned Mask = 0;
+  for (unsigned l = 0; l != NumElts; l += 8) {
+    // 8 nodes per lane, but we only care about the first 4.
+    for (unsigned i = 0; i < 4; ++i) {
+      int Elt = N->getMaskElt(l+i);
+      if (Elt < 0) continue;
+      Elt &= 0x3; // only 2-bits
+      Mask |= Elt << (i * 2);
+    }
+  }
+
+  return Mask;
+}
+
+/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
+static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts/NumLanes;
+
+  int Val = 0;
+  unsigned i;
+  for (i = 0; i != NumElts; ++i) {
+    Val = SVOp->getMaskElt(i);
+    if (Val >= 0)
+      break;
+  }
+  if (Val >= (int)NumElts)
+    Val -= NumElts - NumLaneElts;
+
+  assert(Val - i > 0 && "PALIGNR imm should be positive");
+  return (Val - i) * EltSize;
+}
+
+/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
+/// instructions.
+unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
+  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
+    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
+
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+
+  MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
+  MVT ElVT = VecVT.getVectorElementType();
+
+  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+  return Index / NumElemsPerChunk;
+}
+
+/// getInsertVINSERTF128Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
+/// instructions.
+unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
+  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
+    llvm_unreachable("Illegal insert subvector for VINSERTF128");
+
+  uint64_t Index =
+    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+
+  MVT VecVT = N->getValueType(0).getSimpleVT();
+  MVT ElVT = VecVT.getVectorElementType();
+
+  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+  return Index / NumElemsPerChunk;
+}
+
+/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
+/// Handles 256-bit.
+static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
+  MVT VT = N->getValueType(0).getSimpleVT();
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  assert((VT.is256BitVector() && NumElts == 4) &&
+         "Unsupported vector type for VPERMQ/VPERMPD");
+
+  unsigned Mask = 0;
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int Elt = N->getMaskElt(i);
+    if (Elt < 0)
+      continue;
+    Mask |= Elt << (i*2);
+  }
+
+  return Mask;
+}
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
+    return CN->isNullValue();
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
+    return CFP->getValueAPF().isPosZero();
+  return false;
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
+/// their permute mask.
+static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
+                                    SelectionDAG &DAG) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> MaskVec;
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int Idx = SVOp->getMaskElt(i);
+    if (Idx >= 0) {
+      if (Idx < (int)NumElems)
+        Idx += NumElems;
+      else
+        Idx -= NumElems;
+    }
+    MaskVec.push_back(Idx);
+  }
+  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+                              SVOp->getOperand(0), &MaskVec[0]);
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+  if (VT.getVectorNumElements() != 4)
+    return false;
+  for (unsigned i = 0, e = 2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i+2))
+      return false;
+  for (unsigned i = 2; i != 4; ++i)
+    if (!isUndefOrEqual(Mask[i], i+4))
+      return false;
+  return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
+    return false;
+  N = N->getOperand(0).getNode();
+  if (!ISD::isNON_EXTLoad(N))
+    return false;
+  if (LD)
+    *LD = cast<LoadSDNode>(N);
+  return true;
+}
+
+// Test whether the given value is a vector value which will be legalized
+// into a load.
+static bool WillBeConstantPoolLoad(SDNode *N) {
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  // Check for any non-constant elements.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    switch (N->getOperand(i).getNode()->getOpcode()) {
+    case ISD::UNDEF:
+    case ISD::ConstantFP:
+    case ISD::Constant:
+      break;
+    default:
+      return false;
+    }
+
+  // Vectors of all-zeros and all-ones are materialized with special
+  // instructions rather than being loaded.
+  return !ISD::isBuildVectorAllZeros(N) &&
+         !ISD::isBuildVectorAllOnes(N);
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
+                               ArrayRef<int> Mask, EVT VT) {
+  if (!VT.is128BitVector())
+    return false;
+
+  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+    return false;
+  // Is V2 is a vector load, don't do this transformation. We will try to use
+  // load folding shufps op.
+  if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
+    return false;
+
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+  for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
+    if (!isUndefOrEqual(Mask[i], i+NumElems))
+      return false;
+  return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  SDValue SplatValue = N->getOperand(0);
+  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+    if (N->getOperand(i) != SplatValue)
+      return false;
+  return true;
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to an zero vector.
+/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
+static bool isZeroShuffle(ShuffleVectorSDNode *N) {
+  SDValue V1 = N->getOperand(0);
+  SDValue V2 = N->getOperand(1);
+  unsigned NumElems = N->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int Idx = N->getMaskElt(i);
+    if (Idx >= (int)NumElems) {
+      unsigned Opc = V2.getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
+        return false;
+    } else if (Idx >= 0) {
+      unsigned Opc = V1.getOpcode();
+      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
+        continue;
+      if (Opc != ISD::BUILD_VECTOR ||
+          !X86::isZeroNode(V1.getOperand(Idx)))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG, DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // Always build SSE zero vectors as <4 x i32> bitcasted
+  // to their dest type. This ensures they get CSE'd.
+  SDValue Vec;
+  if (VT.is128BitVector()) {  // SSE
+    if (Subtarget->hasSSE2()) {  // SSE2
+      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+    } else { // SSE1
+      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
+    }
+  } else if (VT.is256BitVector()) { // AVX
+    if (Subtarget->hasInt256()) { // AVX2
+      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+    } else {
+      // 256-bit logic and arithmetic instructions in AVX are all
+      // floating-point, no support for integer ops. Emit fp zeroed vectors.
+      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
+    }
+  } else
+    llvm_unreachable("Unexpected vector type");
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
+static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
+                             DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDValue Vec;
+  if (VT.is256BitVector()) {
+    if (HasInt256) { // AVX2
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
+    } else { // AVX
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+      Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
+    }
+  } else if (VT.is128BitVector()) {
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+  } else
+    llvm_unreachable("Unexpected vector type");
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+}
+
+/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
+/// that point to V2 points to its first element.
+static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
+  for (unsigned i = 0; i != NumElems; ++i) {
+    if (Mask[i] > (int)NumElems) {
+      Mask[i] = NumElems;
+    }
+  }
+}
+
+/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
+/// operation of specified width.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                       SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  Mask.push_back(NumElems);
+  for (unsigned i = 1; i != NumElems; ++i)
+    Mask.push_back(i);
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
+static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                          SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+    Mask.push_back(i);
+    Mask.push_back(i + NumElems);
+  }
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
+static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                          SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
+    Mask.push_back(i + Half);
+    Mask.push_back(i + NumElems + Half);
+  }
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+// PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
+// a generic shuffle instruction because the target has no such instructions.
+// Generate shuffles which repeat i16 and i8 several times until they can be
+// represented by v4f32 and then be manipulated by target suported shuffles.
+static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+  EVT VT = V.getValueType();
+  int NumElems = VT.getVectorNumElements();
+  DebugLoc dl = V.getDebugLoc();
+
+  while (NumElems > 4) {
+    if (EltNo < NumElems/2) {
+      V = getUnpackl(DAG, dl, VT, V, V);
+    } else {
+      V = getUnpackh(DAG, dl, VT, V, V);
+      EltNo -= NumElems/2;
+    }
+    NumElems >>= 1;
+  }
+  return V;
+}
+
+/// getLegalSplat - Generate a legal splat with supported x86 shuffles
+static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
+  EVT VT = V.getValueType();
+  DebugLoc dl = V.getDebugLoc();
+
+  if (VT.is128BitVector()) {
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
+    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+    V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+                             &SplatMask[0]);
+  } else if (VT.is256BitVector()) {
+    // To use VPERMILPS to splat scalars, the second half of indicies must
+    // refer to the higher part, which is a duplication of the lower one,
+    // because VPERMILPS can only handle in-lane permutations.
+    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
+                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
+
+    V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+    V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+                             &SplatMask[0]);
+  } else
+    llvm_unreachable("Vector size not supported");
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
+  EVT SrcVT = SV->getValueType(0);
+  SDValue V1 = SV->getOperand(0);
+  DebugLoc dl = SV->getDebugLoc();
+
+  int EltNo = SV->getSplatIndex();
+  int NumElems = SrcVT.getVectorNumElements();
+  bool Is256BitVec = SrcVT.is256BitVector();
+
+  assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
+         "Unknown how to promote splat for type");
+
+  // Extract the 128-bit part containing the splat element and update
+  // the splat element index when it refers to the higher register.
+  if (Is256BitVec) {
+    V1 = Extract128BitVector(V1, EltNo, DAG, dl);
+    if (EltNo >= NumElems/2)
+      EltNo -= NumElems/2;
+  }
+
+  // All i16 and i8 vector types can't be used directly by a generic shuffle
+  // instruction because the target has no such instruction. Generate shuffles
+  // which repeat i16 and i8 several times until they fit in i32, and then can
+  // be manipulated by target suported shuffles.
+  EVT EltVT = SrcVT.getVectorElementType();
+  if (EltVT == MVT::i8 || EltVT == MVT::i16)
+    V1 = PromoteSplati8i16(V1, DAG, EltNo);
+
+  // Recreate the 256-bit vector and place the same 128-bit vector
+  // into the low and high part. This is necessary because we want
+  // to use VPERM* to shuffle the vectors
+  if (Is256BitVec) {
+    V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
+  }
+
+  return getLegalSplat(DAG, V1, EltNo);
+}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
+/// vector of zero or undef vector.  This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+                                           bool IsZero,
+                                           const X86Subtarget *Subtarget,
+                                           SelectionDAG &DAG) {
+  EVT VT = V2.getValueType();
+  SDValue V1 = IsZero
+    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 16> MaskVec;
+  for (unsigned i = 0; i != NumElems; ++i)
+    // If this is the insertion idx, put the low elt of V2 here.
+    MaskVec.push_back(i == Idx ? NumElems : i);
+  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+}
+
+/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
+/// target specific opcode. Returns true if the Mask could be calculated.
+/// Sets IsUnary to true if only uses one source.
+static bool getTargetShuffleMask(SDNode *N, MVT VT,
+                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SDValue ImmN;
+
+  IsUnary = false;
+  switch(N->getOpcode()) {
+  case X86ISD::SHUFP:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
+  case X86ISD::UNPCKH:
+    DecodeUNPCKHMask(VT, Mask);
+    break;
+  case X86ISD::UNPCKL:
+    DecodeUNPCKLMask(VT, Mask);
+    break;
+  case X86ISD::MOVHLPS:
+    DecodeMOVHLPSMask(NumElems, Mask);
+    break;
+  case X86ISD::MOVLHPS:
+    DecodeMOVLHPSMask(NumElems, Mask);
+    break;
+  case X86ISD::PALIGNR:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    break;
+  case X86ISD::PSHUFD:
+  case X86ISD::VPERMILP:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFHW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::PSHUFLW:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::VPERMI:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = true;
+    break;
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD: {
+    // The index 0 always comes from the first element of the second source,
+    // this is why MOVSS and MOVSD are used in the first place. The other
+    // elements come from the other positions of the first source vector
+    Mask.push_back(NumElems);
+    for (unsigned i = 1; i != NumElems; ++i) {
+      Mask.push_back(i);
+    }
+    break;
+  }
+  case X86ISD::VPERM2X128:
+    ImmN = N->getOperand(N->getNumOperands()-1);
+    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    if (Mask.empty()) return false;
+    break;
+  case X86ISD::MOVDDUP:
+  case X86ISD::MOVLHPD:
+  case X86ISD::MOVLPD:
+  case X86ISD::MOVLPS:
+  case X86ISD::MOVSHDUP:
+  case X86ISD::MOVSLDUP:
+    // Not yet implemented
+    return false;
+  default: llvm_unreachable("unknown target shuffle node");
+  }
+
+  return true;
+}
+
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
+                                   unsigned Depth) {
+  if (Depth == 6)
+    return SDValue();  // Limit search depth.
+
+  SDValue V = SDValue(N, 0);
+  EVT VT = V.getValueType();
+  unsigned Opcode = V.getOpcode();
+
+  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
+  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+    int Elt = SV->getMaskElt(Index);
+
+    if (Elt < 0)
+      return DAG.getUNDEF(VT.getVectorElementType());
+
+    unsigned NumElems = VT.getVectorNumElements();
+    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
+                                         : SV->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
+  }
+
+  // Recurse into target specific vector shuffles to find scalars.
+  if (isTargetShuffle(Opcode)) {
+    MVT ShufVT = V.getValueType().getSimpleVT();
+    unsigned NumElems = ShufVT.getVectorNumElements();
+    SmallVector<int, 16> ShuffleMask;
+    bool IsUnary;
+
+    if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
+      return SDValue();
+
+    int Elt = ShuffleMask[Index];
+    if (Elt < 0)
+      return DAG.getUNDEF(ShufVT.getVectorElementType());
+
+    SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
+                                         : N->getOperand(1);
+    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
+                               Depth+1);
+  }
+
+  // Actual nodes that may contain scalar elements
+  if (Opcode == ISD::BITCAST) {
+    V = V.getOperand(0);
+    EVT SrcVT = V.getValueType();
+    unsigned NumElems = VT.getVectorNumElements();
+
+    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
+      return SDValue();
+  }
+
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    return (Index == 0) ? V.getOperand(0)
+                        : DAG.getUNDEF(VT.getVectorElementType());
+
+  if (V.getOpcode() == ISD::BUILD_VECTOR)
+    return V.getOperand(Index);
+
+  return SDValue();
+}
+
+/// getNumOfConsecutiveZeros - Return the number of elements of a vector
+/// shuffle operation which come from a consecutively from a zero. The
+/// search can start in two different directions, from left or right.
+static
+unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
+                                  bool ZerosFromLeft, SelectionDAG &DAG) {
+  unsigned i;
+  for (i = 0; i != NumElems; ++i) {
+    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
+    SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
+    if (!(Elt.getNode() &&
+         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
+      break;
+  }
+
+  return i;
+}
+
+/// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
+/// correspond consecutively to elements from one of the vector operands,
+/// starting from its index OpIdx. Also tell OpNum which source vector operand.
+static
+bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
+                              unsigned MaskI, unsigned MaskE, unsigned OpIdx,
+                              unsigned NumElems, unsigned &OpNum) {
+  bool SeenV1 = false;
+  bool SeenV2 = false;
+
+  for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
+    int Idx = SVOp->getMaskElt(i);
+    // Ignore undef indicies
+    if (Idx < 0)
+      continue;
+
+    if (Idx < (int)NumElems)
+      SeenV1 = true;
+    else
+      SeenV2 = true;
+
+    // Only accept consecutive elements from the same vector
+    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
+      return false;
+  }
+
+  OpNum = SeenV1 ? 0 : 1;
+  return true;
+}
+
+/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
+/// logical left shift of a vector.
+static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
+              false /* check zeros from right */, DAG);
+  unsigned OpSrc;
+
+  if (!NumZeros)
+    return false;
+
+  // Considering the elements in the mask that are not consecutive zeros,
+  // check if they consecutively come from only one of the source vectors.
+  //
+  //               V1 = {X, A, B, C}     0
+  //                         \  \  \    /
+  //   vector_shuffle V1, V2 <1, 2, 3, X>
+  //
+  if (!isShuffleMaskConsecutive(SVOp,
+            0,                   // Mask Start Index
+            NumElems-NumZeros,   // Mask End Index(exclusive)
+            NumZeros,            // Where to start looking in the src vector
+            NumElems,            // Number of elements in vector
+            OpSrc))              // Which source operand ?
+    return false;
+
+  isLeft = false;
+  ShAmt = NumZeros;
+  ShVal = SVOp->getOperand(OpSrc);
+  return true;
+}
+
+/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
+/// logical left shift of a vector.
+static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
+  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
+              true /* check zeros from left */, DAG);
+  unsigned OpSrc;
+
+  if (!NumZeros)
+    return false;
+
+  // Considering the elements in the mask that are not consecutive zeros,
+  // check if they consecutively come from only one of the source vectors.
+  //
+  //                           0    { A, B, X, X } = V2
+  //                          / \    /  /
+  //   vector_shuffle V1, V2 <X, X, 4, 5>
+  //
+  if (!isShuffleMaskConsecutive(SVOp,
+            NumZeros,     // Mask Start Index
+            NumElems,     // Mask End Index(exclusive)
+            0,            // Where to start looking in the src vector
+            NumElems,     // Number of elements in vector
+            OpSrc))       // Which source operand ?
+    return false;
+
+  isLeft = true;
+  ShAmt = NumZeros;
+  ShVal = SVOp->getOperand(OpSrc);
+  return true;
+}
+
+/// isVectorShift - Returns true if the shuffle can be implemented as a
+/// logical left or right shift of a vector.
+static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
+                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+  // Although the logic below support any bitwidth size, there are no
+  // shift instructions which handle more than 128-bit vectors.
+  if (!SVOp->getValueType(0).is128BitVector())
+    return false;
+
+  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
+      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
+    return true;
+
+  return false;
+}
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
+///
+static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
+                                       unsigned NumNonZero, unsigned NumZero,
+                                       SelectionDAG &DAG,
+                                       const X86Subtarget* Subtarget,
+                                       const TargetLowering &TLI) {
+  if (NumNonZero > 8)
+    return SDValue();
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue V(0, 0);
+  bool First = true;
+  for (unsigned i = 0; i < 16; ++i) {
+    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+    if (ThisIsNonZero && First) {
+      if (NumZero)
+        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+      else
+        V = DAG.getUNDEF(MVT::v8i16);
+      First = false;
+    }
+
+    if ((i & 1) != 0) {
+      SDValue ThisElt(0, 0), LastElt(0, 0);
+      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+      if (LastIsNonZero) {
+        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
+                              MVT::i16, Op.getOperand(i-1));
+      }
+      if (ThisIsNonZero) {
+        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
+        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
+                              ThisElt, DAG.getConstant(8, MVT::i8));
+        if (LastIsNonZero)
+          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
+      } else
+        ThisElt = LastElt;
+
+      if (ThisElt.getNode())
+        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+                        DAG.getIntPtrConstant(i/2));
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
+}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+                                     unsigned NumNonZero, unsigned NumZero,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget* Subtarget,
+                                     const TargetLowering &TLI) {
+  if (NumNonZero > 4)
+    return SDValue();
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue V(0, 0);
+  bool First = true;
+  for (unsigned i = 0; i < 8; ++i) {
+    bool isNonZero = (NonZeros & (1 << i)) != 0;
+    if (isNonZero) {
+      if (First) {
+        if (NumZero)
+          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+        else
+          V = DAG.getUNDEF(MVT::v8i16);
+        First = false;
+      }
+      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+                      MVT::v8i16, V, Op.getOperand(i),
+                      DAG.getIntPtrConstant(i));
+    }
+  }
+
+  return V;
+}
+
+/// getVShift - Return a vector logical shift node.
+///
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
+                         unsigned NumBits, SelectionDAG &DAG,
+                         const TargetLowering &TLI, DebugLoc dl) {
+  assert(VT.is128BitVector() && "Unknown type for VShift");
+  EVT ShVT = MVT::v2i64;
+  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
+  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(Opc, dl, ShVT, SrcOp,
+                             DAG.getConstant(NumBits,
+                                  TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
+}
+
+SDValue
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                          SelectionDAG &DAG) const {
+
+  // Check if the scalar load can be widened into a vector load. And if
+  // the address is "base + cst" see if the cst can be "absorbed" into
+  // the shuffle mask.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+    SDValue Ptr = LD->getBasePtr();
+    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+      return SDValue();
+    EVT PVT = LD->getValueType(0);
+    if (PVT != MVT::i32 && PVT != MVT::f32)
+      return SDValue();
+
+    int FI = -1;
+    int64_t Offset = 0;
+    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+      FI = FINode->getIndex();
+      Offset = 0;
+    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
+               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+      Offset = Ptr.getConstantOperandVal(1);
+      Ptr = Ptr.getOperand(0);
+    } else {
+      return SDValue();
+    }
+
+    // FIXME: 256-bit vector instructions don't require a strict alignment,
+    // improve this code to support it better.
+    unsigned RequiredAlign = VT.getSizeInBits()/8;
+    SDValue Chain = LD->getChain();
+    // Make sure the stack object alignment is at least 16 or 32.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+      if (MFI->isFixedObjectIndex(FI)) {
+        // Can't change the alignment. FIXME: It's possible to compute
+        // the exact stack offset and reference FI + adjust offset instead.
+        // If someone *really* cares about this. That's the way to implement it.
+        return SDValue();
+      } else {
+        MFI->setObjectAlignment(FI, RequiredAlign);
+      }
+    }
+
+    // (Offset % 16 or 32) must be multiple of 4. Then address is then
+    // Ptr + (Offset & ~15).
+    if (Offset < 0)
+      return SDValue();
+    if ((Offset % RequiredAlign) & 3)
+      return SDValue();
+    int64_t StartOffset = Offset & ~(RequiredAlign-1);
+    if (StartOffset)
+      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
+
+    int EltNo = (Offset - StartOffset) >> 2;
+    unsigned NumElems = VT.getVectorNumElements();
+
+    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
+                             LD->getPointerInfo().getWithOffset(StartOffset),
+                             false, false, false, 0);
+
+    SmallVector<int, 8> Mask;
+    for (unsigned i = 0; i != NumElems; ++i)
+      Mask.push_back(EltNo);
+
+    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
+  }
+
+  return SDValue();
+}
+
+/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
+/// vector of type 'VT', see if the elements can be replaced by a single large
+/// load which has the same value as a build_vector whose operands are 'elts'.
+///
+/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
+///
+/// FIXME: we'd also like to handle the case where the last elements are zero
+/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
+/// There's even a handy isZeroNode for that purpose.
+static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
+                                        DebugLoc &DL, SelectionDAG &DAG) {
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = Elts.size();
+
+  LoadSDNode *LDBase = NULL;
+  unsigned LastLoadedElt = -1U;
+
+  // For each element in the initializer, see if we've found a load or an undef.
+  // If we don't find an initial load element, or later load elements are
+  // non-consecutive, bail out.
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Elts[i];
+
+    if (!Elt.getNode() ||
+        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
+      return SDValue();
+    if (!LDBase) {
+      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
+        return SDValue();
+      LDBase = cast<LoadSDNode>(Elt.getNode());
+      LastLoadedElt = i;
+      continue;
+    }
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+
+    LoadSDNode *LD = cast<LoadSDNode>(Elt);
+    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
+      return SDValue();
+    LastLoadedElt = i;
+  }
+
+  // If we have found an entire vector of loads and undefs, then return a large
+  // load of the entire vector width starting at the base pointer.  If we found
+  // consecutive loads for the low half, generate a vzext_load node.
+  if (LastLoadedElt == NumElems - 1) {
+    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
+      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+                         LDBase->getPointerInfo(),
+                         LDBase->isVolatile(), LDBase->isNonTemporal(),
+                         LDBase->isInvariant(), 0);
+    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+                       LDBase->getPointerInfo(),
+                       LDBase->isVolatile(), LDBase->isNonTemporal(),
+                       LDBase->isInvariant(), LDBase->getAlignment());
+  }
+  if (NumElems == 4 && LastLoadedElt == 1 &&
+      DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
+    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+    SDValue ResNode =
+        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
+                                LDBase->getPointerInfo(),
+                                LDBase->getAlignment(),
+                                false/*isVolatile*/, true/*ReadMem*/,
+                                false/*WriteMem*/);
+
+    // Make sure the newly-created LOAD is in the same position as LDBase in
+    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
+    // update uses of LDBase's output chain to use the TokenFactor.
+    if (LDBase->hasAnyUseOfValue(1)) {
+      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                             SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                             SDValue(ResNode.getNode(), 1));
+    }
+
+    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
+  }
+  return SDValue();
+}
+
+/// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
+/// to generate a splat value for the following cases:
+/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+/// a scalar load, or a constant.
+/// The VBROADCAST node is returned when a pattern is found,
+/// or SDValue() otherwise.
+SDValue
+X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
+  if (!Subtarget->hasFp256())
+    return SDValue();
+
+  MVT VT = Op.getValueType().getSimpleVT();
+  DebugLoc dl = Op.getDebugLoc();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for broadcast.");
+
+  SDValue Ld;
+  bool ConstSplatVal;
+
+  switch (Op.getOpcode()) {
+    default:
+      // Unknown pattern found.
+      return SDValue();
+
+    case ISD::BUILD_VECTOR: {
+      // The BUILD_VECTOR node must be a splat.
+      if (!isSplatVector(Op.getNode()))
+        return SDValue();
+
+      Ld = Op.getOperand(0);
+      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+                     Ld.getOpcode() == ISD::ConstantFP);
+
+      // The suspected load node has several users. Make sure that all
+      // of its users are from the BUILD_VECTOR node.
+      // Constants may have multiple users.
+      if (!ConstSplatVal && !Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0))
+        return SDValue();
+      break;
+    }
+
+    case ISD::VECTOR_SHUFFLE: {
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+      // Shuffles must have a splat mask where the first element is
+      // broadcasted.
+      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
+        return SDValue();
+
+      SDValue Sc = Op.getOperand(0);
+      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
+          Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+        if (!Subtarget->hasInt256())
+          return SDValue();
+
+        // Use the register form of the broadcast instruction available on AVX2.
+        if (VT.is256BitVector())
+          Sc = Extract128BitVector(Sc, 0, DAG, dl);
+        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+      }
+
+      Ld = Sc.getOperand(0);
+      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
+                       Ld.getOpcode() == ISD::ConstantFP);
+
+      // The scalar_to_vector node and the suspected
+      // load node must have exactly one user.
+      // Constants may have multiple users.
+      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
+        return SDValue();
+      break;
+    }
+  }
+
+  bool Is256 = VT.is256BitVector();
+
+  // Handle the broadcasting a single constant scalar from the constant pool
+  // into a vector. On Sandybridge it is still better to load a constant vector
+  // from the constant pool and not to broadcast it from a scalar.
+  if (ConstSplatVal && Subtarget->hasInt256()) {
+    EVT CVT = Ld.getValueType();
+    assert(!CVT.isVector() && "Must not broadcast a vector type");
+    unsigned ScalarSize = CVT.getSizeInBits();
+
+    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
+      const Constant *C = 0;
+      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
+        C = CI->getConstantIntValue();
+      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
+        C = CF->getConstantFPValue();
+
+      assert(C && "Invalid constant type");
+
+      SDValue CP = DAG.getConstantPool(C, getPointerTy());
+      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+      Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
+                       MachinePointerInfo::getConstantPool(),
+                       false, false, false, Alignment);
+
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+    }
+  }
+
+  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
+  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
+
+  // Handle AVX2 in-register broadcasts.
+  if (!IsLoad && Subtarget->hasInt256() &&
+      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The scalar source must be a normal load.
+  if (!IsLoad)
+    return SDValue();
+
+  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
+    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+
+  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
+  // double since there is no vbroadcastsd xmm
+  if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
+    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
+      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+  }
+
+  // Unsupported broadcast.
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  // Skip if insert_vec_elt is not supported.
+  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+    return SDValue();
+
+  DebugLoc DL = Op.getDebugLoc();
+  unsigned NumElems = Op.getNumOperands();
+
+  SDValue VecIn1;
+  SDValue VecIn2;
+  SmallVector<unsigned, 4> InsertIndices;
+  SmallVector<int, 8> Mask(NumElems, -1);
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Opc = Op.getOperand(i).getOpcode();
+
+    if (Opc == ISD::UNDEF)
+      continue;
+
+    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+      // Quit if more than 1 elements need inserting.
+      if (InsertIndices.size() > 1)
+        return SDValue();
+
+      InsertIndices.push_back(i);
+      continue;
+    }
+
+    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
+    // Quit if extracted from vector of different type.
+    if (ExtractedFromVec.getValueType() != VT)
+      return SDValue();
+
+    // Quit if non-constant index.
+    if (!isa<ConstantSDNode>(ExtIdx))
+      return SDValue();
+
+    if (VecIn1.getNode() == 0)
+      VecIn1 = ExtractedFromVec;
+    else if (VecIn1 != ExtractedFromVec) {
+      if (VecIn2.getNode() == 0)
+        VecIn2 = ExtractedFromVec;
+      else if (VecIn2 != ExtractedFromVec)
+        // Quit if more than 2 vectors to shuffle
+        return SDValue();
+    }
+
+    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+
+    if (ExtractedFromVec == VecIn1)
+      Mask[i] = Idx;
+    else if (ExtractedFromVec == VecIn2)
+      Mask[i] = Idx + NumElems;
+  }
+
+  if (VecIn1.getNode() == 0)
+    return SDValue();
+
+  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+    unsigned Idx = InsertIndices[i];
+    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+                     DAG.getIntPtrConstant(Idx));
+  }
+
+  return NV;
+}
+
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT ExtVT = VT.getVectorElementType();
+  unsigned NumElems = Op.getNumOperands();
+
+  // Vectors containing all zeros can be matched by pxor and xorps later
+  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+    if (VT == MVT::v4i32 || VT == MVT::v8i32)
+      return Op;
+
+    return getZeroVector(VT, Subtarget, DAG, dl);
+  }
+
+  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
+  // vpcmpeqd on 256-bit vectors.
+  if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
+    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
+      return Op;
+
+    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+  }
+
+  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+  if (Broadcast.getNode())
+    return Broadcast;
+
+  unsigned EVTBits = ExtVT.getSizeInBits();
+
+  unsigned NumZero  = 0;
+  unsigned NumNonZero = 0;
+  unsigned NonZeros = 0;
+  bool IsAllConstants = true;
+  SmallSet<SDValue, 8> Values;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Op.getOperand(i);
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+    Values.insert(Elt);
+    if (Elt.getOpcode() != ISD::Constant &&
+        Elt.getOpcode() != ISD::ConstantFP)
+      IsAllConstants = false;
+    if (X86::isZeroNode(Elt))
+      NumZero++;
+    else {
+      NonZeros |= (1 << i);
+      NumNonZero++;
+    }
+  }
+
+  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
+  if (NumNonZero == 0)
+    return DAG.getUNDEF(VT);
+
+  // Special case for single non-zero, non-undef, element.
+  if (NumNonZero == 1) {
+    unsigned Idx = CountTrailingZeros_32(NonZeros);
+    SDValue Item = Op.getOperand(Idx);
+
+    // If this is an insertion of an i64 value on x86-32, and if the top bits of
+    // the value are obviously zero, truncate the value to i32 and do the
+    // insertion that way.  Only do this if the value is non-constant or if the
+    // value is a constant being inserted into element 0.  It is cheaper to do
+    // a constant pool load than it is to do a movd + shuffle.
+    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
+        (!IsAllConstants || Idx == 0)) {
+      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+        // Handle SSE only.
+        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
+        EVT VecVT = MVT::v4i32;
+        unsigned VecElts = 4;
+
+        // Truncate the value (which may itself be a constant) to i32, and
+        // convert it to a vector with movd (S2V+shuffle to zero extend).
+        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+
+        // Now we have our 32-bit value zero extended in the low element of
+        // a vector.  If Idx != 0, swizzle it into place.
+        if (Idx != 0) {
+          SmallVector<int, 4> Mask;
+          Mask.push_back(Idx);
+          for (unsigned i = 1; i != VecElts; ++i)
+            Mask.push_back(i);
+          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
+                                      &Mask[0]);
+        }
+        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+      }
+    }
+
+    // If we have a constant or non-constant insertion into the low element of
+    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
+    // depending on what the source datatype is.
+    if (Idx == 0) {
+      if (NumZero == 0)
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
+        if (VT.is256BitVector()) {
+          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
+          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
+                             Item, DAG.getIntPtrConstant(0));
+        }
+        assert(VT.is128BitVector() && "Expected an SSE value type!");
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+      }
+
+      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
+        if (VT.is256BitVector()) {
+          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
+          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
+        } else {
+          assert(VT.is128BitVector() && "Expected an SSE value type!");
+          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
+        }
+        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
+      }
+    }
+
+    // Is it a vector logical left shift?
+    if (NumElems == 2 && Idx == 1 &&
+        X86::isZeroNode(Op.getOperand(0)) &&
+        !X86::isZeroNode(Op.getOperand(1))) {
+      unsigned NumBits = VT.getSizeInBits();
+      return getVShift(true, VT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                   VT, Op.getOperand(1)),
+                       NumBits/2, DAG, *this, dl);
+    }
+
+    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+      return SDValue();
+
+    // Otherwise, if this is a vector with i32 or f32 elements, and the element
+    // is a non-constant being inserted into an element other than the low one,
+    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
+    // movd/movss) to move this into the low element, then shuffle it into
+    // place.
+    if (EVTBits == 32) {
+      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+      // Turn it into a shuffle of zero and zero-extended scalar to vector.
+      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
+      SmallVector<int, 8> MaskVec;
+      for (unsigned i = 0; i != NumElems; ++i)
+        MaskVec.push_back(i == Idx ? 0 : 1);
+      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+    }
+  }
+
+  // Splat is obviously ok. Let legalizer expand it to a shuffle.
+  if (Values.size() == 1) {
+    if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // Check if it's possible to issue this instead.
+      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue Item = Op.getOperand(Idx);
+      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+    }
+    return SDValue();
+  }
+
+  // A vector full of immediates; various special cases are already
+  // handled, so this is best done with a single constant-pool load.
+  if (IsAllConstants)
+    return SDValue();
+
+  // For AVX-length vectors, build the individual 128-bit pieces and use
+  // shuffles to put them in place.
+  if (VT.is256BitVector()) {
+    SmallVector<SDValue, 32> V;
+    for (unsigned i = 0; i != NumElems; ++i)
+      V.push_back(Op.getOperand(i));
+
+    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+    // Build both the lower and upper subvector.
+    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
+    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
+                                NumElems/2);
+
+    // Recreate the wider vector with the lower and upper part.
+    return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
+  }
+
+  // Let legalizer expand 2-wide build_vectors.
+  if (EVTBits == 64) {
+    if (NumNonZero == 1) {
+      // One half is zero or undef.
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+                                 Op.getOperand(Idx));
+      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
+    }
+    return SDValue();
+  }
+
+  // If element VT is < 32 bits, convert it to inserts into a zero vector.
+  if (EVTBits == 8 && NumElems == 16) {
+    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                        Subtarget, *this);
+    if (V.getNode()) return V;
+  }
+
+  if (EVTBits == 16 && NumElems == 8) {
+    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                      Subtarget, *this);
+    if (V.getNode()) return V;
+  }
+
+  // If element VT is == 32 bits, turn it into a number of shuffles.
+  SmallVector<SDValue, 8> V(NumElems);
+  if (NumElems == 4 && NumZero > 0) {
+    for (unsigned i = 0; i < 4; ++i) {
+      bool isZero = !(NonZeros & (1 << i));
+      if (isZero)
+        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
+      else
+        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {
+      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+        default: break;
+        case 0:
+          V[i] = V[i*2];  // Must be a zero vector.
+          break;
+        case 1:
+          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+          break;
+        case 2:
+          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+          break;
+        case 3:
+          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+          break;
+      }
+    }
+
+    bool Reverse1 = (NonZeros & 0x3) == 2;
+    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+    int MaskVec[] = {
+      Reverse1 ? 1 : 0,
+      Reverse1 ? 0 : 1,
+      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
+      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
+    };
+    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+  }
+
+  if (Values.size() > 1 && VT.is128BitVector()) {
+    // Check for a build vector of consecutive loads.
+    for (unsigned i = 0; i < NumElems; ++i)
+      V[i] = Op.getOperand(i);
+
+    // Check for elements which are consecutive loads.
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+    if (LD.getNode())
+      return LD;
+
+    // Check for a build vector from mostly shuffle plus few inserting.
+    SDValue Sh = buildFromShuffleMostly(Op, DAG);
+    if (Sh.getNode())
+      return Sh;
+
+    // For SSE 4.1, use insertps to put the high elements into the low element.
+    if (getSubtarget()->hasSSE41()) {
+      SDValue Result;
+      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
+        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+      else
+        Result = DAG.getUNDEF(VT);
+
+      for (unsigned i = 1; i < NumElems; ++i) {
+        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
+        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
+                             Op.getOperand(i), DAG.getIntPtrConstant(i));
+      }
+      return Result;
+    }
+
+    // Otherwise, expand into a number of unpckl*, start by extending each of
+    // our (non-undef) elements to the full vector width with the element in the
+    // bottom slot of the vector (which generates no code for SSE).
+    for (unsigned i = 0; i < NumElems; ++i) {
+      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
+        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+      else
+        V[i] = DAG.getUNDEF(VT);
+    }
+
+    // Next, we iteratively mix elements, e.g. for v4f32:
+    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
+    unsigned EltStride = NumElems >> 1;
+    while (EltStride != 0) {
+      for (unsigned i = 0; i < EltStride; ++i) {
+        // If V[i+EltStride] is undef and this is the first round of mixing,
+        // then it is safe to just drop this shuffle: V[i] is already in the
+        // right place, the one element (since it's the first round) being
+        // inserted as undef can be dropped.  This isn't safe for successive
+        // rounds because they will permute elements within both vectors.
+        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
+            EltStride == NumElems/2)
+          continue;
+
+        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
+      }
+      EltStride >>= 1;
+    }
+    return V[0];
+  }
+  return SDValue();
+}
+
+// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  MVT ResVT = Op.getValueType().getSimpleVT();
+
+  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
+
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = ResVT.getVectorNumElements();
+
+  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+}
+
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 2);
+
+  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+  // from two other 128-bit ones.
+  return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
+// Try to lower a shuffle node into a simple blend instruction.
+static SDValue
+LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+                           const X86Subtarget *Subtarget, SelectionDAG &DAG) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems-1)/8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // Blend for v16i16 should be symetric for the both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+
+    int SndLaneEltIdx = (NumLanes == 2) ?
+      SVOp->getMaskElt(i + NumElemsInLane) : -1;
+    int EltIdx = SVOp->getMaskElt(i);
+
+    if ((EltIdx < 0 || EltIdx == (int)i) &&
+        (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+      continue;
+
+    if (((unsigned)EltIdx == (i + NumElems)) &&
+        (SndLaneEltIdx < 0 ||
+         (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
+      MaskValue |= (1<<i);
+    else
+      return SDValue();
+  }
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
+    V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all]   pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static SDValue
+LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
+                         SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  SmallVector<int, 8> MaskVals;
+
+  // Determine if more than 1 of the words in each of the low and high quadwords
+  // of the result come from the same quadword of one of the two inputs.  Undef
+  // mask values count as coming from any quadword, for better codegen.
+  unsigned LoQuad[] = { 0, 0, 0, 0 };
+  unsigned HiQuad[] = { 0, 0, 0, 0 };
+  std::bitset<4> InputQuads;
+  for (unsigned i = 0; i < 8; ++i) {
+    unsigned *Quad = i < 4 ? LoQuad : HiQuad;
+    int EltIdx = SVOp->getMaskElt(i);
+    MaskVals.push_back(EltIdx);
+    if (EltIdx < 0) {
+      ++Quad[0];
+      ++Quad[1];
+      ++Quad[2];
+      ++Quad[3];
+      continue;
+    }
+    ++Quad[EltIdx / 4];
+    InputQuads.set(EltIdx / 4);
+  }
+
+  int BestLoQuad = -1;
+  unsigned MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (LoQuad[i] > MaxQuad) {
+      BestLoQuad = i;
+      MaxQuad = LoQuad[i];
+    }
+  }
+
+  int BestHiQuad = -1;
+  MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (HiQuad[i] > MaxQuad) {
+      BestHiQuad = i;
+      MaxQuad = HiQuad[i];
+    }
+  }
+
+  // For SSSE3, If all 8 words of the result come from only 1 quadword of each
+  // of the two input vectors, shuffle them into one input vector so only a
+  // single pshufb instruction is necessary. If There are more than 2 input
+  // quads, disable the next transformation since it does not help SSSE3.
+  bool V1Used = InputQuads[0] || InputQuads[1];
+  bool V2Used = InputQuads[2] || InputQuads[3];
+  if (Subtarget->hasSSSE3()) {
+    if (InputQuads.count() == 2 && V1Used && V2Used) {
+      BestLoQuad = InputQuads[0] ? 0 : 1;
+      BestHiQuad = InputQuads[2] ? 2 : 3;
+    }
+    if (InputQuads.count() > 2) {
+      BestLoQuad = -1;
+      BestHiQuad = -1;
+    }
+  }
+
+  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+  // the shuffle mask.  If a quad is scored as -1, that means that it contains
+  // words from all 4 input quadwords.
+  SDValue NewV;
+  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+    int MaskV[] = {
+      BestLoQuad < 0 ? 0 : BestLoQuad,
+      BestHiQuad < 0 ? 1 : BestHiQuad
+    };
+    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
+                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
+    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
+
+    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+    // source words for the shuffle, to aid later transformations.
+    bool AllWordsInNewV = true;
+    bool InOrder[2] = { true, true };
+    for (unsigned i = 0; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx != (int)i)
+        InOrder[i/4] = false;
+      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+        continue;
+      AllWordsInNewV = false;
+      break;
+    }
+
+    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+    if (AllWordsInNewV) {
+      for (int i = 0; i != 8; ++i) {
+        int idx = MaskVals[i];
+        if (idx < 0)
+          continue;
+        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+        if ((idx != i) && idx < 4)
+          pshufhw = false;
+        if ((idx != i) && idx > 3)
+          pshuflw = false;
+      }
+      V1 = NewV;
+      V2Used = false;
+      BestLoQuad = 0;
+      BestHiQuad = 1;
+    }
+
+    // If we've eliminated the use of V2, and the new mask is a pshuflw or
+    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
+    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
+      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
+      unsigned TargetMask = 0;
+      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
+                             getShufflePSHUFLWImmediate(SVOp);
+      V1 = NewV.getOperand(0);
+      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
+    }
+  }
+
+  // Promote splats to a larger type which usually leads to more efficient code.
+  // FIXME: Is this true if pshufb is available?
+  if (SVOp->isSplat())
+    return PromoteSplat(SVOp, DAG);
+
+  // If we have SSSE3, and all words of the result are from 1 input vector,
+  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
+  // is present, fall back to case 4.
+  if (Subtarget->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
+
+    // If we have elements from both input vectors, set the high bit of the
+    // shuffle mask element to zero out elements that come from V2 in the V1
+    // mask, and elements that come from V1 in the V2 mask, so that the two
+    // results can be OR'd together.
+    bool TwoInputs = V1Used && V2Used;
+    for (unsigned i = 0; i != 8; ++i) {
+      int EltIdx = MaskVals[i] * 2;
+      int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
+      int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
+      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
+    }
+    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
+    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    if (!TwoInputs)
+      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    pshufbMask.clear();
+    for (unsigned i = 0; i != 8; ++i) {
+      int EltIdx = MaskVals[i] * 2;
+      int Idx0 = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+      int Idx1 = (EltIdx < 16) ? 0x80 : EltIdx - 15;
+      pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+      pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
+    }
+    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
+    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+  }
+
+  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
+  // and update MaskVals with new element order.
+  std::bitset<8> InOrder;
+  if (BestLoQuad >= 0) {
+    int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
+    for (int i = 0; i != 4; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        InOrder.set(i);
+      } else if ((idx / 4) == BestLoQuad) {
+        MaskV[i] = idx & 3;
+        InOrder.set(i);
+      }
+    }
+    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+                                &MaskV[0]);
+
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
+                                  NewV.getOperand(0),
+                                  getShufflePSHUFLWImmediate(SVOp), DAG);
+    }
+  }
+
+  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
+  // and update MaskVals with the new element order.
+  if (BestHiQuad >= 0) {
+    int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
+    for (unsigned i = 4; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        InOrder.set(i);
+      } else if ((idx / 4) == BestHiQuad) {
+        MaskV[i] = (idx & 3) + 4;
+        InOrder.set(i);
+      }
+    }
+    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+                                &MaskV[0]);
+
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
+      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
+                                  NewV.getOperand(0),
+                                  getShufflePSHUFHWImmediate(SVOp), DAG);
+    }
+  }
+
+  // In case BestHi & BestLo were both -1, which means each quadword has a word
+  // from each of the four input quadwords, calculate the InOrder bitvector now
+  // before falling through to the insert/extract cleanup.
+  if (BestLoQuad == -1 && BestHiQuad == -1) {
+    NewV = V1;
+    for (int i = 0; i != 8; ++i)
+      if (MaskVals[i] < 0 || MaskVals[i] == i)
+        InOrder.set(i);
+  }
+
+  // The other elements are put in the right place using pextrw and pinsrw.
+  for (unsigned i = 0; i != 8; ++i) {
+    if (InOrder[i])
+      continue;
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0)
+      continue;
+    SDValue ExtOp = (EltIdx < 8) ?
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+                  DAG.getIntPtrConstant(EltIdx)) :
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+                  DAG.getIntPtrConstant(EltIdx - 8));
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
+                       DAG.getIntPtrConstant(i));
+  }
+  return NewV;
+}
+
+// v16i8 shuffles - Prefer shuffles in the following order:
+// 1. [ssse3] 1 x pshufb
+// 2. [ssse3] 2 x pshufb + 1 x por
+// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
+static
+SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+                                 SelectionDAG &DAG,
+                                 const X86TargetLowering &TLI) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  ArrayRef<int> MaskVals = SVOp->getMask();
+
+  // Promote splats to a larger type which usually leads to more efficient code.
+  // FIXME: Is this true if pshufb is available?
+  if (SVOp->isSplat())
+    return PromoteSplat(SVOp, DAG);
+
+  // If we have SSSE3, case 1 is generated when all result bytes come from
+  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
+  // present, fall back to case 3.
+
+  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
+  if (TLI.getSubtarget()->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
+
+    // If all result elements are from one input vector, then only translate
+    // undef mask values to 0x80 (zero out result) in the pshufb mask.
+    //
+    // Otherwise, we have elements from both input vectors, and must zero out
+    // elements that come from V2 in the first mask, and V1 in the second mask
+    // so that we can OR them together.
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      if (EltIdx < 0 || EltIdx >= 16)
+        EltIdx = 0x80;
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+    }
+    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+
+    // As PSHUFB will zero elements with negative indices, it's safe to ignore
+    // the 2nd operand if it's undefined or zero.
+    if (V2.getOpcode() == ISD::UNDEF ||
+        ISD::isBuildVectorAllZeros(V2.getNode()))
+      return V1;
+
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    pshufbMask.clear();
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+    }
+    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+  }
+
+  // No SSSE3 - Calculate in place words and then fix all out of place words
+  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
+  // the 16 different words that comprise the two doublequadword input vectors.
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
+  SDValue NewV = V1;
+  for (int i = 0; i != 8; ++i) {
+    int Elt0 = MaskVals[i*2];
+    int Elt1 = MaskVals[i*2+1];
+
+    // This word of the result is all undef, skip it.
+    if (Elt0 < 0 && Elt1 < 0)
+      continue;
+
+    // This word of the result is already in the correct place, skip it.
+    if ((Elt0 == i*2) && (Elt1 == i*2+1))
+      continue;
+
+    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+    SDValue InsElt;
+
+    // If Elt0 and Elt1 are defined, are consecutive, and can be load
+    // using a single extract together, load it and store it.
+    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                        DAG.getIntPtrConstant(i));
+      continue;
+    }
+
+    // If Elt1 is defined, extract it from the appropriate source.  If the
+    // source byte is not also odd, shift the extracted word left 8 bits
+    // otherwise clear the bottom 8 bits if we need to do an or.
+    if (Elt1 >= 0) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      if ((Elt1 & 1) == 0)
+        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+                             DAG.getConstant(8,
+                                  TLI.getShiftAmountTy(InsElt.getValueType())));
+      else if (Elt0 >= 0)
+        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+                             DAG.getConstant(0xFF00, MVT::i16));
+    }
+    // If Elt0 is defined, extract it from the appropriate source.  If the
+    // source byte is not also even, shift the extracted word right 8 bits. If
+    // Elt1 was also defined, OR the extracted values together before
+    // inserting them in the result.
+    if (Elt0 >= 0) {
+      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+      if ((Elt0 & 1) != 0)
+        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+                              DAG.getConstant(8,
+                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
+      else if (Elt1 >= 0)
+        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+                             DAG.getConstant(0x00FF, MVT::i16));
+      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+                         : InsElt0;
+    }
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                       DAG.getIntPtrConstant(i));
+  }
+  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
+}
+
+// v32i8 shuffles - Translate to VPSHUFB if possible.
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+                                 const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+  // VPSHUFB may be generated if
+  // (1) one of input vector is undefined or zeroinitializer.
+  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
+  // And (2) the mask indexes don't cross the 128-bit lane.
+  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
+      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+    return SDValue();
+
+  if (V1IsAllZero && !V2IsAllZero) {
+    CommuteVectorShuffleMask(MaskVals, 32);
+    V1 = V2;
+  }
+  SmallVector<SDValue, 32> pshufbMask;
+  for (unsigned i = 0; i != 32; i++) {
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0 || EltIdx >= 32)
+      EltIdx = 0x80;
+    else {
+      if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
+        // Cross lane is not allowed.
+        return SDValue();
+      EltIdx &= 0xf;
+    }
+    pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+  }
+  return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
+                      DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                  MVT::v32i8, &pshufbMask[0], 32));
+}
+
+/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
+/// done when every pair / quad of shuffle mask elements point to elements in
+/// the right sequence. e.g.
+/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
+static
+SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
+                                 SelectionDAG &DAG) {
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  DebugLoc dl = SVOp->getDebugLoc();
+  unsigned NumElems = VT.getVectorNumElements();
+  MVT NewVT;
+  unsigned Scale;
+  switch (VT.SimpleTy) {
+  default: llvm_unreachable("Unexpected!");
+  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
+  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
+  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
+  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
+  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
+  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
+  }
+
+  SmallVector<int, 8> MaskVec;
+  for (unsigned i = 0; i != NumElems; i += Scale) {
+    int StartIdx = -1;
+    for (unsigned j = 0; j != Scale; ++j) {
+      int EltIdx = SVOp->getMaskElt(i+j);
+      if (EltIdx < 0)
+        continue;
+      if (StartIdx < 0)
+        StartIdx = (EltIdx / Scale);
+      if (EltIdx != (int)(StartIdx*Scale + j))
+        return SDValue();
+    }
+    MaskVec.push_back(StartIdx);
+  }
+
+  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
+  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
+  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+}
+
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+static SDValue getVZextMovL(MVT VT, EVT OpVT,
+                            SDValue SrcOp, SelectionDAG &DAG,
+                            const X86Subtarget *Subtarget, DebugLoc dl) {
+  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    LoadSDNode *LD = NULL;
+    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
+      LD = dyn_cast<LoadSDNode>(SrcOp);
+    if (!LD) {
+      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+      // instead.
+      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
+          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
+          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
+        // PR2108
+        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+        return DAG.getNode(ISD::BITCAST, dl, VT,
+                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   OpVT,
+                                                   SrcOp.getOperand(0)
+                                                          .getOperand(0))));
+      }
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                 DAG.getNode(ISD::BITCAST, dl,
+                                             OpVT, SrcOp)));
+}
+
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
+/// which could not be matched by any known target speficic shuffle
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+
+  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned NumLaneElems = NumElems / 2;
+
+  DebugLoc dl = SVOp->getDebugLoc();
+  MVT EltVT = VT.getVectorElementType();
+  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
+  SDValue Output[2];
+
+  SmallVector<int, 16> Mask;
+  for (unsigned l = 0; l < 2; ++l) {
+    // Build a shuffle mask for the output, discovering on the fly which
+    // input vectors to use as shuffle operands (recorded in InputUsed).
+    // If building a suitable shuffle vector proves too hard, then bail
+    // out with UseBuildVector set.
+    bool UseBuildVector = false;
+    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
+    unsigned LaneStart = l * NumLaneElems;
+    for (unsigned i = 0; i != NumLaneElems; ++i) {
+      // The mask element.  This indexes into the input.
+      int Idx = SVOp->getMaskElt(i+LaneStart);
+      if (Idx < 0) {
+        // the mask element does not index into any input vector.
+        Mask.push_back(-1);
+        continue;
+      }
+
+      // The input vector this mask element indexes into.
+      int Input = Idx / NumLaneElems;
+
+      // Turn the index into an offset from the start of the input vector.
+      Idx -= Input * NumLaneElems;
+
+      // Find or create a shuffle vector operand to hold this input.
+      unsigned OpNo;
+      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+        if (InputUsed[OpNo] == Input)
+          // This input vector is already an operand.
+          break;
+        if (InputUsed[OpNo] < 0) {
+          // Create a new operand for this input vector.
+          InputUsed[OpNo] = Input;
+          break;
+        }
+      }
+
+      if (OpNo >= array_lengthof(InputUsed)) {
+        // More than two input vectors used!  Give up on trying to create a
+        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
+        UseBuildVector = true;
+        break;
+      }
+
+      // Add the mask index for the new shuffle vector.
+      Mask.push_back(Idx + OpNo * NumLaneElems);
+    }
+
+    if (UseBuildVector) {
+      SmallVector<SDValue, 16> SVOps;
+      for (unsigned i = 0; i != NumLaneElems; ++i) {
+        // The mask element.  This indexes into the input.
+        int Idx = SVOp->getMaskElt(i+LaneStart);
+        if (Idx < 0) {
+          SVOps.push_back(DAG.getUNDEF(EltVT));
+          continue;
+        }
+
+        // The input vector this mask element indexes into.
+        int Input = Idx / NumElems;
+
+        // Turn the index into an offset from the start of the input vector.
+        Idx -= Input * NumElems;
+
+        // Extract the vector element by hand.
+        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+                                    SVOp->getOperand(Input),
+                                    DAG.getIntPtrConstant(Idx)));
+      }
+
+      // Construct the output using a BUILD_VECTOR.
+      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
+                              SVOps.size());
+    } else if (InputUsed[0] < 0) {
+      // No input vectors were used! The result is undefined.
+      Output[l] = DAG.getUNDEF(NVT);
+    } else {
+      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
+                                        (InputUsed[0] % 2) * NumLaneElems,
+                                        DAG, dl);
+      // If only one input was used, use an undefined vector for the other.
+      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
+        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
+                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
+      // At least one input vector was used. Create a new shuffle vector.
+      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
+    }
+
+    Mask.clear();
+  }
+
+  // Concatenate the result back
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
+}
+
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
+static SDValue
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  MVT VT = SVOp->getValueType(0).getSimpleVT();
+
+  assert(VT.is128BitVector() && "Unsupported vector size");
+
+  std::pair<int, int> Locs[4];
+  int Mask1[] = { -1, -1, -1, -1 };
+  SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  unsigned NumHi = 0;
+  unsigned NumLo = 0;
+  for (unsigned i = 0; i != 4; ++i) {
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else {
+      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
+      if (Idx < 4) {
+        Locs[i] = std::make_pair(0, NumLo);
+        Mask1[NumLo] = Idx;
+        NumLo++;
+      } else {
+        Locs[i] = std::make_pair(1, NumHi);
+        if (2+NumHi < 4)
+          Mask1[2+NumHi] = Idx;
+        NumHi++;
+      }
+    }
+  }
+
+  if (NumLo <= 2 && NumHi <= 2) {
+    // If no more than two elements come from either vector. This can be
+    // implemented with two shuffles. First shuffle gather the elements.
+    // The second shuffle, which takes the first shuffle as both of its
+    // vector operands, put the elements into the right order.
+    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+    int Mask2[] = { -1, -1, -1, -1 };
+
+    for (unsigned i = 0; i != 4; ++i)
+      if (Locs[i].first != -1) {
+        unsigned Idx = (i < 2) ? 0 : 4;
+        Idx += Locs[i].first * 2 + Locs[i].second;
+        Mask2[i] = Idx;
+      }
+
+    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
+  }
+
+  if (NumLo == 3 || NumHi == 3) {
+    // Otherwise, we must have three elements from one vector, call it X, and
+    // one element from the other, call it Y.  First, use a shufps to build an
+    // intermediate vector with the one element from Y and the element from X
+    // that will be in the same half in the final destination (the indexes don't
+    // matter). Then, use a shufps to build the final vector, taking the half
+    // containing the element from Y from the intermediate, and the other half
+    // from X.
+    if (NumHi == 3) {
+      // Normalize it so the 3 elements come from V1.
+      CommuteVectorShuffleMask(PermMask, 4);
+      std::swap(V1, V2);
+    }
+
+    // Find the element from V2.
+    unsigned HiIndex;
+    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
+      int Val = PermMask[HiIndex];
+      if (Val < 0)
+        continue;
+      if (Val >= 4)
+        break;
+    }
+
+    Mask1[0] = PermMask[HiIndex];
+    Mask1[1] = -1;
+    Mask1[2] = PermMask[HiIndex^1];
+    Mask1[3] = -1;
+    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+    if (HiIndex >= 2) {
+      Mask1[0] = PermMask[0];
+      Mask1[1] = PermMask[1];
+      Mask1[2] = HiIndex & 1 ? 6 : 4;
+      Mask1[3] = HiIndex & 1 ? 4 : 6;
+      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+    }
+
+    Mask1[0] = HiIndex & 1 ? 2 : 0;
+    Mask1[1] = HiIndex & 1 ? 0 : 2;
+    Mask1[2] = PermMask[2];
+    Mask1[3] = PermMask[3];
+    if (Mask1[2] >= 0)
+      Mask1[2] += 4;
+    if (Mask1[3] >= 0)
+      Mask1[3] += 4;
+    return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+  }
+
+  // Break it into (shuffle shuffle_hi, shuffle_lo).
+  int LoMask[] = { -1, -1, -1, -1 };
+  int HiMask[] = { -1, -1, -1, -1 };
+
+  int *MaskPtr = LoMask;
+  unsigned MaskIdx = 0;
+  unsigned LoIdx = 0;
+  unsigned HiIdx = 2;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (i == 2) {
+      MaskPtr = HiMask;
+      MaskIdx = 1;
+      LoIdx = 0;
+      HiIdx = 2;
+    }
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else if (Idx < 4) {
+      Locs[i] = std::make_pair(MaskIdx, LoIdx);
+      MaskPtr[LoIdx] = Idx;
+      LoIdx++;
+    } else {
+      Locs[i] = std::make_pair(MaskIdx, HiIdx);
+      MaskPtr[HiIdx] = Idx;
+      HiIdx++;
+    }
+  }
+
+  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
+  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
+  int MaskOps[] = { -1, -1, -1, -1 };
+  for (unsigned i = 0; i != 4; ++i)
+    if (Locs[i].first != -1)
+      MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
+  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
+}
+
+static bool MayFoldVectorLoad(SDValue V) {
+  while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+    V = V.getOperand(0);
+
+  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
+    V = V.getOperand(0);
+  if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
+      V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
+    // BUILD_VECTOR (load), undef
+    V = V.getOperand(0);
+
+  return MayFoldLoad(V);
+}
+
+static
+SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  // Canonizalize to v2f64.
+  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
+                                          V1, DAG));
+}
+
+static
+SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
+                        bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+
+  assert(VT != MVT::v2i64 && "unsupported shuffle type");
+
+  if (HasSSE2 && VT == MVT::v2f64)
+    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
+
+  // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
+  return DAG.getNode(ISD::BITCAST, dl, VT,
+                     getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
+}
+
+static
+SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
+         "unsupported shuffle type");
+
+  if (V2.getOpcode() == ISD::UNDEF)
+    V2 = V1;
+
+  // v4i32 or v4f32
+  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
+}
+
+static
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
+  // operand of these instructions is only memory, so check if there's a
+  // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
+  // same masks.
+  bool CanFoldLoad = false;
+
+  // Trivial case, when V2 comes from a load.
+  if (MayFoldVectorLoad(V2))
+    CanFoldLoad = true;
+
+  // When V1 is a load, it can be folded later into a store in isel, example:
+  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
+  //    turns into:
+  //  (MOVLPSmr addr:$src1, VR128:$src2)
+  // So, recognize this potential and also use MOVLPS or MOVLPD
+  else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
+    CanFoldLoad = true;
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  if (CanFoldLoad) {
+    if (HasSSE2 && NumElems == 2)
+      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
+
+    if (NumElems == 4)
+      // If we don't care about the second element, proceed to use movss.
+      if (SVOp->getMaskElt(1) != -1)
+        return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
+  }
+
+  // movl and movlp will both match v2i64, but v2i64 is never matched by
+  // movl earlier because we make it strict to avoid messing with the movlp load
+  // folding logic (see the code above getMOVLP call). Match it here then,
+  // this is horrible, but will stay like this until we move all shuffle
+  // matching to x86 specific nodes. Note that for the 1st condition all
+  // types are matched with movsd.
+  if (HasSSE2) {
+    // FIXME: isMOVLMask should be checked and matched before getMOVLP,
+    // as to remove this logic from here, as much as possible
+    if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
+      return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+  }
+
+  assert(VT != MVT::v4i32 && "unsupported shuffle type");
+
+  // Invert the operand order and use SHUFPS to match it.
+  return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
+                              getShuffleSHUFImmediate(SVOp), DAG);
+}
+
+// Reduce a vector shuffle to zext.
+SDValue
+X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+  // PMOVZX is only available from SSE41.
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  EVT VT = Op.getValueType();
+
+  // Only AVX2 support 256-bit vector integer extending.
+  if (!Subtarget->hasInt256() && VT.is256BitVector())
+    return SDValue();
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // Extending is an unary operation and the element type of the source vector
+  // won't be equal to or larger than i64.
+  if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
+      VT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
+  unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
+  while ((1U << Shift) < NumElems) {
+    if (SVOp->getMaskElt(1U << Shift) == 1)
+      break;
+    Shift += 1;
+    // The maximal ratio is 8, i.e. from i8 to i64.
+    if (Shift > 3)
+      return SDValue();
+  }
+
+  // Check the shuffle mask.
+  unsigned Mask = (1U << Shift) - 1;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int EltIdx = SVOp->getMaskElt(i);
+    if ((i & Mask) != 0 && EltIdx != -1)
+      return SDValue();
+    if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
+      return SDValue();
+  }
+
+  LLVMContext *Context = DAG.getContext();
+  unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
+  EVT NeVT = EVT::getIntegerVT(*Context, NBits);
+  EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
+
+  if (!isTypeLegal(NVT))
+    return SDValue();
+
+  // Simplify the operand as it's prepared to be fed into shuffle.
+  unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
+  if (V1.getOpcode() == ISD::BITCAST &&
+      V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      V1.getOperand(0)
+        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+    // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+    SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
+    ConstantSDNode *CIdx =
+      dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
+    // If it's foldable, i.e. normal load with single use, we will let code
+    // selection to fold it. Otherwise, we will short the conversion sequence.
+    if (CIdx && CIdx->getZExtValue() == 0 &&
+        (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
+      if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
+        // The "ext_vec_elt" node is wider than the result node.
+        // In this case we should extract subvector from V.
+        // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
+        unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
+        EVT FullVT = V.getValueType();
+        EVT SubVecVT = EVT::getVectorVT(*Context, 
+                                        FullVT.getVectorElementType(),
+                                        FullVT.getVectorNumElements()/Ratio);
+        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, 
+                        DAG.getIntPtrConstant(0));
+      }
+      V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+    }
+  }
+
+  return DAG.getNode(ISD::BITCAST, DL, VT,
+                     DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
+}
+
+SDValue
+X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  MVT VT = Op.getValueType().getSimpleVT();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+
+  if (isZeroShuffle(SVOp))
+    return getZeroVector(VT, Subtarget, DAG, dl);
+
+  // Handle splat operations
+  if (SVOp->isSplat()) {
+    // Use vbroadcast whenever the splat comes from a foldable load
+    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+    if (Broadcast.getNode())
+      return Broadcast;
+  }
+
+  // Check integer expanding shuffles.
+  SDValue NewOp = LowerVectorIntExtend(Op, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
+  // If the shuffle can be profitably rewritten as a narrower shuffle, then
+  // do it!
+  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
+      VT == MVT::v16i16 || VT == MVT::v32i8) {
+    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+    if (NewOp.getNode())
+      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
+  } else if ((VT == MVT::v4i32 ||
+             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+    // FIXME: Figure out a cleaner way to do this.
+    // Try to make use of movq to zero out the top part.
+    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+      if (NewOp.getNode()) {
+        MVT NewVT = NewOp.getValueType().getSimpleVT();
+        if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
+                               NewVT, true, false))
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
+                              DAG, Subtarget, dl);
+      }
+    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
+      if (NewOp.getNode()) {
+        MVT NewVT = NewOp.getValueType().getSimpleVT();
+        if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
+                              DAG, Subtarget, dl);
+      }
+    }
+  }
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  MVT VT = Op.getValueType().getSimpleVT();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned NumElems = VT.getVectorNumElements();
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsSplat = false;
+  bool V2IsSplat = false;
+  bool HasSSE2 = Subtarget->hasSSE2();
+  bool HasFp256    = Subtarget->hasFp256();
+  bool HasInt256   = Subtarget->hasInt256();
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool OptForSize = MF.getFunction()->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+  assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
+
+  if (V1IsUndef && V2IsUndef)
+    return DAG.getUNDEF(VT);
+
+  assert(!V1IsUndef && "Op 1 of shuffle should not be undef");
+
+  // Vector shuffle lowering takes 3 steps:
+  //
+  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
+  //    narrowing and commutation of operands should be handled.
+  // 2) Matching of shuffles with known shuffle masks to x86 target specific
+  //    shuffle nodes.
+  // 3) Rewriting of unmatched masks into new generic shuffle operations,
+  //    so the shuffle can be broken into other shuffles and the legalizer can
+  //    try the lowering again.
+  //
+  // The general idea is that no vector_shuffle operation should be left to
+  // be matched during isel, all of them must be converted to a target specific
+  // node here.
+
+  // Normalize the input vectors. Here splats, zeroed vectors, profitable
+  // narrowing and commutation of operands should be handled. The actual code
+  // doesn't include all of those, work in progress...
+  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
+  if (NewOp.getNode())
+    return NewOp;
+
+  SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
+
+  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
+  // unpckh_undef). Only use pshufd if speed is more important than size.
+  if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+
+  if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
+      V2IsUndef && MayFoldVectorLoad(V1))
+    return getMOVDDup(Op, dl, V1, DAG);
+
+  if (isMOVHLPS_v_undef_Mask(M, VT))
+    return getMOVHighToLow(Op, dl, DAG);
+
+  // Use to match splats
+  if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
+      (VT == MVT::v2f64 || VT == MVT::v2i64))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+
+  if (isPSHUFDMask(M, VT)) {
+    // The actual implementation will match the mask in the if above and then
+    // during isel it can match several different instructions, not only pshufd
+    // as its name says, sad but true, emulate the behavior for now...
+    if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
+      return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
+
+    unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
+
+    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
+      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
+
+    if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
+      return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask,
+                                  DAG);
+
+    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
+                                TargetMask, DAG);
+  }
+
+  // Check if this can be converted into a logical shift.
+  bool isLeft = false;
+  unsigned ShAmt = 0;
+  SDValue ShVal;
+  bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+  if (isShift && ShVal.hasOneUse()) {
+    // If the shifted value has multiple uses, it may be cheaper to use
+    // v_set0 + movlhps or movhlps, etc.
+    MVT EltVT = VT.getVectorElementType();
+    ShAmt *= EltVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+  }
+
+  if (isMOVLMask(M, VT)) {
+    if (ISD::isBuildVectorAllZeros(V1.getNode()))
+      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
+    if (!isMOVLPMask(M, VT)) {
+      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
+        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
+
+      if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
+    }
+  }
+
+  // FIXME: fold these into legal mask.
+  if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
+    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
+
+  if (isMOVHLPSMask(M, VT))
+    return getMOVHighToLow(Op, dl, DAG);
+
+  if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
+
+  if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
+
+  if (isMOVLPMask(M, VT))
+    return getMOVLP(Op, dl, DAG, HasSSE2);
+
+  if (ShouldXformToMOVHLPS(M, VT) ||
+      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
+    return CommuteVectorShuffle(SVOp, DAG);
+
+  if (isShift) {
+    // No better options. Use a vshldq / vsrldq.
+    MVT EltVT = VT.getVectorElementType();
+    ShAmt *= EltVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+  }
+
+  bool Commuted = false;
+  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
+  // 1,1,1,1 -> v8i16 though.
+  V1IsSplat = isSplatVector(V1.getNode());
+  V2IsSplat = isSplatVector(V2.getNode());
+
+  // Canonicalize the splat or undef, if present, to be on the RHS.
+  if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+    Commuted = true;
+  }
+
+  if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
+    // Shuffling low element of v1 into undef, just return v1.
+    if (V2IsUndef)
+      return V1;
+    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
+    // the instruction selector will not match, so get a canonical MOVL with
+    // swapped operands to undo the commute.
+    return getMOVL(DAG, dl, VT, V2, V1);
+  }
+
+  if (isUNPCKLMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+
+  if (isUNPCKHMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+
+  if (V2IsSplat) {
+    // Normalize mask so all entries that point to V2 points to its first
+    // element then try to match unpck{h|l} again. If match, return a
+    // new vector_shuffle with the corrected mask.p
+    SmallVector<int, 8> NewMask(M.begin(), M.end());
+    NormalizeMask(NewMask, NumElems);
+    if (isUNPCKLMask(NewMask, VT, HasInt256, true))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+    if (isUNPCKHMask(NewMask, VT, HasInt256, true))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  }
+
+  if (Commuted) {
+    // Commute is back and try unpck* again.
+    // FIXME: this seems wrong.
+    CommuteVectorShuffleMask(M, NumElems);
+    std::swap(V1, V2);
+    std::swap(V1IsSplat, V2IsSplat);
+    Commuted = false;
+
+    if (isUNPCKLMask(M, VT, HasInt256))
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
+
+    if (isUNPCKHMask(M, VT, HasInt256))
+      return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
+  }
+
+  // Normalize the node to match x86 shuffle ops if needed
+  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+    return CommuteVectorShuffle(SVOp, DAG);
+
+  // The checks below are all present in isShuffleMaskLegal, but they are
+  // inlined here right now to enable us to directly emit target specific
+  // nodes, and remove one by one until they don't return Op anymore.
+
+  if (isPALIGNRMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
+                                getShufflePALIGNRImmediate(SVOp),
+                                DAG);
+
+  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
+      SVOp->getSplatIndex() == 0 && V2IsUndef) {
+    if (VT == MVT::v2f64 || VT == MVT::v2i64)
+      return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  }
+
+  if (isPSHUFHWMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
+                                getShufflePSHUFHWImmediate(SVOp),
+                                DAG);
+
+  if (isPSHUFLWMask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
+                                getShufflePSHUFLWImmediate(SVOp),
+                                DAG);
+
+  if (isSHUFPMask(M, VT, HasFp256))
+    return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
+                                getShuffleSHUFImmediate(SVOp), DAG);
+
+  if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
+  if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
+    return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
+
+  //===--------------------------------------------------------------------===//
+  // Generate target specific nodes for 128 or 256-bit shuffles only
+  // supported in the AVX instruction set.
+  //
+
+  // Handle VMOVDDUPY permutations
+  if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
+    return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
+
+  // Handle VPERMILPS/D* permutations
+  if (isVPERMILPMask(M, VT, HasFp256)) {
+    if (HasInt256 && VT == MVT::v8i32)
+      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
+                                  getShuffleSHUFImmediate(SVOp), DAG);
+    return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
+                                getShuffleSHUFImmediate(SVOp), DAG);
+  }
+
+  // Handle VPERM2F128/VPERM2I128 permutations
+  if (isVPERM2X128Mask(M, VT, HasFp256))
+    return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
+                                V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
+
+  SDValue BlendOp = LowerVECTOR_SHUFFLEtoBlend(SVOp, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
+  if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
+    SmallVector<SDValue, 8> permclMask;
+    for (unsigned i = 0; i != 8; ++i) {
+      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
+    }
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
+                               &permclMask[0], 8);
+    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+    return DAG.getNode(X86ISD::VPERMV, dl, VT,
+                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+  }
+
+  if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
+    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
+                                getShuffleCLImmediate(SVOp), DAG);
+
+  //===--------------------------------------------------------------------===//
+  // Since no target specific shuffle was selected for this generic one,
+  // lower it into other known shuffles. FIXME: this isn't true yet, but
+  // this is the plan.
+  //
+
+  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
+  if (VT == MVT::v8i16) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  if (VT == MVT::v16i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  if (VT == MVT::v32i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  // Handle all 128-bit wide vectors with 4 elements, and match them with
+  // several different shuffle types.
+  if (NumElems == 4 && VT.is128BitVector())
+    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
+
+  // Handle general 256-bit shuffles
+  if (VT.is256BitVector())
+    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
+
+  return SDValue();
+}
+
+static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getValueType().getSimpleVT();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
+    return SDValue();
+
+  if (VT.getSizeInBits() == 8) {
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
+
+  if (VT.getSizeInBits() == 16) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getNode(ISD::BITCAST, dl,
+                                                 MVT::v4i32,
+                                                 Op.getOperand(0)),
+                                     Op.getOperand(1)));
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
+
+  if (VT == MVT::f32) {
+    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+    // the result back to FR32 register. It's only worth matching if the
+    // result has a single use which is a store or a bitcast to i32.  And in
+    // the case of a store, it's not worth it if the index is a constant 0,
+    // because a MOVSSmr can be used instead, which is smaller and faster.
+    if (!Op.hasOneUse())
+      return SDValue();
+    SDNode *User = *Op.getNode()->use_begin();
+    if ((User->getOpcode() != ISD::STORE ||
+         (isa<ConstantSDNode>(Op.getOperand(1)) &&
+          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+        (User->getOpcode() != ISD::BITCAST ||
+         User->getValueType(0) != MVT::i32))
+      return SDValue();
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
+                                              Op.getOperand(0)),
+                                              Op.getOperand(1));
+    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
+  }
+
+  if (VT == MVT::i32 || VT == MVT::i64) {
+    // ExtractPS/pextrq works with constant index.
+    if (isa<ConstantSDNode>(Op.getOperand(1)))
+      return Op;
+  }
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  if (!isa<ConstantSDNode>(Op.getOperand(1)))
+    return SDValue();
+
+  SDValue Vec = Op.getOperand(0);
+  MVT VecVT = Vec.getValueType().getSimpleVT();
+
+  // If this is a 256-bit vector result, first extract the 128-bit vector and
+  // then extract the element from the 128-bit vector.
+  if (VecVT.is256BitVector()) {
+    DebugLoc dl = Op.getNode()->getDebugLoc();
+    unsigned NumElems = VecVT.getVectorNumElements();
+    SDValue Idx = Op.getOperand(1);
+    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+    // Get the 128-bit vector.
+    Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+
+    if (IdxVal >= NumElems/2)
+      IdxVal -= NumElems/2;
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
+                       DAG.getConstant(IdxVal, MVT::i32));
+  }
+
+  assert(VecVT.is128BitVector() && "Unexpected vector length");
+
+  if (Subtarget->hasSSE41()) {
+    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
+  MVT VT = Op.getValueType().getSimpleVT();
+  DebugLoc dl = Op.getDebugLoc();
+  // TODO: handle v16i8.
+  if (VT.getSizeInBits() == 16) {
+    SDValue Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getNode(ISD::BITCAST, dl,
+                                                 MVT::v4i32, Vec),
+                                     Op.getOperand(1)));
+    // Transform it so it match pextrw which produces a 32-bit result.
+    MVT EltVT = MVT::i32;
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
+                                  Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
+                                  DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  }
+
+  if (VT.getSizeInBits() == 32) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
+
+    // SHUFPS the element to the lowest double word, then movss.
+    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
+    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  if (VT.getSizeInBits() == 64) {
+    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+    //        to match extract_elt for f64.
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
+
+    // UNPCKHPD the element to the lowest double word, then movsd.
+    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+    int Mask[2] = { 1, -1 };
+    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2 = Op.getOperand(2);
+
+  if (!VT.is128BitVector())
+    return SDValue();
+
+  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
+      isa<ConstantSDNode>(N2)) {
+    unsigned Opc;
+    if (VT == MVT::v8i16)
+      Opc = X86ISD::PINSRW;
+    else if (VT == MVT::v16i8)
+      Opc = X86ISD::PINSRB;
+    else
+      Opc = X86ISD::PINSRB;
+
+    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
+    // argument.
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+  }
+
+  if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
+    // Bits [7:6] of the constant are the source select.  This will always be
+    //  zero here.  The DAG Combiner may combine an extract_elt index into these
+    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
+    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
+    // Bits [5:4] of the constant are the destination select.  This is the
+    //  value of the incoming immediate.
+    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
+    //   combine either bitwise AND or insert of float 0.0 to set these bits.
+    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
+    // Create this as a scalar to vector..
+    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
+  }
+
+  if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) {
+    // PINSR* works with constant index.
+    return Op;
+  }
+  return SDValue();
+}
+
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT EltVT = VT.getVectorElementType();
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue N2 = Op.getOperand(2);
+
+  // If this is a 256-bit vector result, first extract the 128-bit vector,
+  // insert the element into the extracted half and then place it back.
+  if (VT.is256BitVector()) {
+    if (!isa<ConstantSDNode>(N2))
+      return SDValue();
+
+    // Get the desired 128-bit vector half.
+    unsigned NumElems = VT.getVectorNumElements();
+    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
+    SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
+
+    // Insert the element into the desired half.
+    bool Upper = IdxVal >= NumElems/2;
+    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
+                 DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
+
+    // Insert the changed part back to the 256-bit vector
+    return Insert128BitVector(N0, V, IdxVal, DAG, dl);
+  }
+
+  if (Subtarget->hasSSE41())
+    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+
+  if (EltVT == MVT::i8)
+    return SDValue();
+
+  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
+    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
+    // as its second argument.
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
+    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
+  }
+  return SDValue();
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  LLVMContext *Context = DAG.getContext();
+  DebugLoc dl = Op.getDebugLoc();
+  MVT OpVT = Op.getValueType().getSimpleVT();
+
+  // If this is a 256-bit vector result, first insert into a 128-bit
+  // vector and then insert into the 256-bit vector.
+  if (!OpVT.is128BitVector()) {
+    // Insert into a 128-bit vector.
+    EVT VT128 = EVT::getVectorVT(*Context,
+                                 OpVT.getVectorElementType(),
+                                 OpVT.getVectorNumElements() / 2);
+
+    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
+
+    // Insert the 128-bit vector.
+    return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
+  }
+
+  if (OpVT == MVT::v1i64 &&
+      Op.getOperand(0).getValueType() == MVT::i64)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
+
+  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
+  assert(OpVT.is128BitVector() && "Expected an SSE type!");
+  return DAG.getNode(ISD::BITCAST, dl, OpVT,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
+}
+
+// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
+// a simple subregister reference or explicit instructions to grab
+// upper bits of a vector.
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  if (Subtarget->hasFp256()) {
+    DebugLoc dl = Op.getNode()->getDebugLoc();
+    SDValue Vec = Op.getNode()->getOperand(0);
+    SDValue Idx = Op.getNode()->getOperand(1);
+
+    if (Op.getNode()->getValueType(0).is128BitVector() &&
+        Vec.getNode()->getValueType(0).is256BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Extract128BitVector(Vec, IdxVal, DAG, dl);
+    }
+  }
+  return SDValue();
+}
+
+// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
+// simple superregister reference or explicit instructions to insert
+// the upper bits of a vector.
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+                                     SelectionDAG &DAG) {
+  if (Subtarget->hasFp256()) {
+    DebugLoc dl = Op.getNode()->getDebugLoc();
+    SDValue Vec = Op.getNode()->getOperand(0);
+    SDValue SubVec = Op.getNode()->getOperand(1);
+    SDValue Idx = Op.getNode()->getOperand(2);
+
+    if (Op.getNode()->getValueType(0).is256BitVector() &&
+        SubVec.getNode()->getValueType(0).is128BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
+    }
+  }
+  return SDValue();
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+                                             CP->getAlignment(),
+                                             CP->getOffset(), OpFlag);
+  DebugLoc DL = CP->getDebugLoc();
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc(), getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
+                                          OpFlag);
+  DebugLoc DL = JT->getDebugLoc();
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag)
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc(), getPointerTy()),
+                         Result);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+
+  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+  // global base reg.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel)) {
+    if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+      OpFlag = X86II::MO_GOTPCREL;
+    WrapperKind = X86ISD::WrapperRIP;
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOT;
+  } else if (Subtarget->isPICStyleStubPIC()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+  } else if (Subtarget->isPICStyleStubNoDynamic()) {
+    OpFlag = X86II::MO_DARWIN_NONLAZY;
+  }
+
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
+
+  DebugLoc DL = Op.getDebugLoc();
+  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->is64Bit()) {
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc(), getPointerTy()),
+                         Result);
+  }
+
+  // For symbols that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlag))
+    Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
+  // Create the TargetBlockAddressAddress node.
+  unsigned char OpFlags =
+    Subtarget->ClassifyBlockAddressReference();
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
+                                             OpFlags);
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+  else
+    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+                                      int64_t Offset, SelectionDAG &DAG) const {
+  // Create the TargetGlobalAddress node, folding in the constant
+  // offset if it is legal.
+  unsigned char OpFlags =
+    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+  SDValue Result;
+  if (OpFlags == X86II::MO_NO_FLAG &&
+      X86::isOffsetSuitableForCodeModel(Offset, M)) {
+    // A direct static reference to a global.
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
+    Offset = 0;
+  } else {
+    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
+  }
+
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
+  else
+    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags)) {
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+  }
+
+  // For globals that require a load from a stub to get the address, emit the
+  // load.
+  if (isGlobalStubReference(OpFlags))
+    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
+
+  // If there was a non-zero offset that we didn't fold, create an explicit
+  // addition for it.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+                         DAG.getConstant(Offset, getPointerTy()));
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+           unsigned char OperandFlags, bool LocalDynamic = false) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  DebugLoc dl = GA->getDebugLoc();
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(),
+                                           OperandFlags);
+
+  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
+                                           : X86ISD::TLSADDR;
+
+  if (InFlag) {
+    SDValue Ops[] = { Chain,  TGA, *InFlag };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3);
+  } else {
+    SDValue Ops[]  = { Chain, TGA };
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2);
+  }
+
+  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
+  MFI->setAdjustsStack(true);
+
+  SDValue Flag = Chain.getValue(1);
+  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  SDValue InFlag;
+  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
+  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+                                   DAG.getNode(X86ISD::GlobalBaseReg,
+                                               DebugLoc(), PtrVT), InFlag);
+  InFlag = Chain.getValue(1);
+
+  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+                    X86::RAX, X86II::MO_TLSGD);
+}
+
+static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
+                                           SelectionDAG &DAG,
+                                           const EVT PtrVT,
+                                           bool is64Bit) {
+  DebugLoc dl = GA->getDebugLoc();
+
+  // Get the start address of the TLS block for this module.
+  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
+      .getInfo<X86MachineFunctionInfo>();
+  MFI->incNumLocalDynamicTLSAccesses();
+
+  SDValue Base;
+  if (is64Bit) {
+    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
+  } else {
+    SDValue InFlag;
+    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+        DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+    InFlag = Chain.getValue(1);
+    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
+                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
+  }
+
+  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
+  // of Base.
+
+  // Build x@dtpoff.
+  unsigned char OperandFlags = X86II::MO_DTPOFF;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  // Add x@dtpoff with the base.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                   const EVT PtrVT, TLSModel::Model model,
+                                   bool is64Bit, bool isPIC) {
+  DebugLoc dl = GA->getDebugLoc();
+
+  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
+  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
+                                                         is64Bit ? 257 : 256));
+
+  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                                      DAG.getIntPtrConstant(0),
+                                      MachinePointerInfo(Ptr),
+                                      false, false, false, 0);
+
+  unsigned char OperandFlags = 0;
+  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
+  // initialexec.
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (model == TLSModel::LocalExec) {
+    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+  } else if (model == TLSModel::InitialExec) {
+    if (is64Bit) {
+      OperandFlags = X86II::MO_GOTTPOFF;
+      WrapperKind = X86ISD::WrapperRIP;
+    } else {
+      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
+    }
+  } else {
+    llvm_unreachable("Unexpected model");
+  }
+
+  // emit "addl x@ntpoff,%eax" (local exec)
+  // or "addl x@indntpoff,%eax" (initial exec)
+  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                           GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  if (model == TLSModel::InitialExec) {
+    if (isPIC && !is64Bit) {
+      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
+                          DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+                           Offset);
+    }
+
+    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+                         MachinePointerInfo::getGOT(), false, false, false,
+                         0);
+  }
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GA->getGlobal();
+
+  if (Subtarget->isTargetELF()) {
+    TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+    switch (model) {
+      case TLSModel::GeneralDynamic:
+        if (Subtarget->is64Bit())
+          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+      case TLSModel::LocalDynamic:
+        return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
+                                           Subtarget->is64Bit());
+      case TLSModel::InitialExec:
+      case TLSModel::LocalExec:
+        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
+                                   Subtarget->is64Bit(),
+                        getTargetMachine().getRelocationModel() == Reloc::PIC_);
+    }
+    llvm_unreachable("Unknown TLS model.");
+  }
+
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin only has one model of TLS.  Lower to that.
+    unsigned char OpFlag = 0;
+    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
+                           X86ISD::WrapperRIP : X86ISD::Wrapper;
+
+    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
+    // global base reg.
+    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
+                  !Subtarget->is64Bit();
+    if (PIC32)
+      OpFlag = X86II::MO_TLVP_PIC_BASE;
+    else
+      OpFlag = X86II::MO_TLVP;
+    DebugLoc DL = Op.getDebugLoc();
+    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
+                                                GA->getValueType(0),
+                                                GA->getOffset(), OpFlag);
+    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
+
+    // With PIC32, the address is actually $g + Offset.
+    if (PIC32)
+      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                           DAG.getNode(X86ISD::GlobalBaseReg,
+                                       DebugLoc(), getPointerTy()),
+                           Offset);
+
+    // Lowering the machine isd will make sure everything is in the right
+    // location.
+    SDValue Chain = DAG.getEntryNode();
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Args[] = { Chain, Offset };
+    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
+
+    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setAdjustsStack(true);
+
+    // And our return value (tls address) is in the standard call return value
+    // location.
+    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+                              Chain.getValue(1));
+  }
+
+  if (Subtarget->isTargetWindows() || Subtarget->isTargetMingw()) {
+    // Just use the implicit TLS architecture
+    // Need to generate someting similar to:
+    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
+    //                                  ; from TEB
+    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
+    //   mov     rcx, qword [rdx+rcx*8]
+    //   mov     eax, .tls$:tlsvar
+    //   [rax+rcx] contains the address
+    // Windows 64bit: gs:0x58
+    // Windows 32bit: fs:__tls_array
+
+    // If GV is an alias then use the aliasee for determining
+    // thread-localness.
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GV = GA->resolveAliasedGlobal(false);
+    DebugLoc dl = GA->getDebugLoc();
+    SDValue Chain = DAG.getEntryNode();
+
+    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
+    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
+    // use its literal value of 0x2C.
+    Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
+                                        ? Type::getInt8PtrTy(*DAG.getContext(),
+                                                             256)
+                                        : Type::getInt32PtrTy(*DAG.getContext(),
+                                                              257));
+
+    SDValue TlsArray = Subtarget->is64Bit() ? DAG.getIntPtrConstant(0x58) :
+      (Subtarget->isTargetMingw() ? DAG.getIntPtrConstant(0x2C) :
+        DAG.getExternalSymbol("_tls_array", getPointerTy()));
+
+    SDValue ThreadPointer = DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
+                                        MachinePointerInfo(Ptr),
+                                        false, false, false, 0);
+
+    // Load the _tls_index variable
+    SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
+    if (Subtarget->is64Bit())
+      IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
+                           IDX, MachinePointerInfo(), MVT::i32,
+                           false, false, 0);
+    else
+      IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
+                        false, false, false, 0);
+
+    SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
+                                    getPointerTy());
+    IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
+
+    SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
+    res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
+                      false, false, false, 0);
+
+    // Get the offset of start of .tls section
+    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
+                                             GA->getValueType(0),
+                                             GA->getOffset(), X86II::MO_SECREL);
+    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
+
+    // The address of the thread local variable is the add of the thread
+    // pointer with the offset of the variable.
+    return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
+  }
+
+  llvm_unreachable("TLS not implemented for this target.");
+}
+
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
+/// and take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+                                     DAG.getConstant(VTBits - 1, MVT::i8))
+                       : DAG.getConstant(0, VT);
+
+  SDValue Tmp2, Tmp3;
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  } else {
+    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+  }
+
+  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                DAG.getConstant(VTBits, MVT::i8));
+  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                             AndNode, DAG.getConstant(0, MVT::i8));
+
+  SDValue Hi, Lo;
+  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+  } else {
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+  }
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  EVT SrcVT = Op.getOperand(0).getValueType();
+
+  if (SrcVT.isVector())
+    return SDValue();
+
+  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
+         "Unknown SINT_TO_FP to lower!");
+
+  // These are really Legal; return the operand so the caller accepts it as
+  // Legal.
+  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+    return Op;
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      Subtarget->is64Bit()) {
+    return Op;
+  }
+
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Size = SrcVT.getSizeInBits()/8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                               StackSlot,
+                               MachinePointerInfo::getFixedStack(SSFI),
+                               false, false, 0);
+  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+                                     SDValue StackSlot,
+                                     SelectionDAG &DAG) const {
+  // Build the FILD
+  DebugLoc DL = Op.getDebugLoc();
+  SDVTList Tys;
+  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+  if (useSSE)
+    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+  else
+    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+
+  unsigned ByteSize = SrcVT.getSizeInBits()/8;
+
+  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
+  MachineMemOperand *MMO;
+  if (FI) {
+    int SSFI = FI->getIndex();
+    MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
+  } else {
+    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
+    StackSlot = StackSlot.getOperand(1);
+  }
+  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
+                                           X86ISD::FILD, DL,
+                                           Tys, Ops, array_lengthof(Ops),
+                                           SrcVT, MMO);
+
+  if (useSSE) {
+    Chain = Result.getValue(1);
+    SDValue InFlag = Result.getValue(2);
+
+    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+    // shouldn't be necessary except that RFP cannot be live across
+    // multiple blocks. When stackifier is fixed, they can be uncoupled.
+    MachineFunction &MF = DAG.getMachineFunction();
+    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
+    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
+    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+    Tys = DAG.getVTList(MVT::Other);
+    SDValue Ops[] = {
+      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
+    };
+    MachineMemOperand *MMO =
+      DAG.getMachineFunction()
+      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOStore, SSFISize, SSFISize);
+
+    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
+                                    Ops, array_lengthof(Ops),
+                                    Op.getValueType(), MMO);
+    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
+                         MachinePointerInfo::getFixedStack(SSFI),
+                         false, false, false, 0);
+  }
+
+  return Result;
+}
+
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // This algorithm is not obvious. Here it is what we're trying to output:
+  /*
+     movq       %rax,  %xmm0
+     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+     #ifdef __SSE3__
+       haddpd   %xmm0, %xmm0
+     #else
+       pshufd   $0x4e, %xmm0, %xmm1
+       addpd    %xmm1, %xmm0
+     #endif
+  */
+
+  DebugLoc dl = Op.getDebugLoc();
+  LLVMContext *Context = DAG.getContext();
+
+  // Build some magic constants.
+  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  Constant *C0 = ConstantDataVector::get(*Context, CV0);
+  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+
+  SmallVector<Constant*,2> CV1;
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                      APInt(64, 0x4330000000000000ULL))));
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                      APInt(64, 0x4530000000000000ULL))));
+  Constant *C1 = ConstantVector::get(CV1);
+  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+
+  // Load the 64-bit value into an XMM register.
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                            Op.getOperand(0));
+  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
+                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
+                              CLod0);
+
+  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+  SDValue Result;
+
+  if (Subtarget->hasSSE3()) {
+    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+  } else {
+    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+                                           S2F, 0x4E, DAG);
+    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
+                         Sub);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+                     DAG.getIntPtrConstant(0));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  // FP constant to bias correct the final result.
+  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+                                   MVT::f64);
+
+  // Load the 32-bit value into an XMM register.
+  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+                             Op.getOperand(0));
+
+  // Zero out the upper parts of the register.
+  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
+
+  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
+                     DAG.getIntPtrConstant(0));
+
+  // Or the load with the bias.
+  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Load)),
+                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Bias)));
+  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
+                   DAG.getIntPtrConstant(0));
+
+  // Subtract the bias.
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+  // Handle final rounding.
+  EVT DestVT = Op.getValueType();
+
+  if (DestVT.bitsLT(MVT::f64))
+    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+                       DAG.getIntPtrConstant(0));
+  if (DestVT.bitsGT(MVT::f64))
+    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+
+  // Handle final rounding.
+  return Sub;
+}
+
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  EVT SVT = N0.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
+          SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
+         "Custom UINT_TO_FP is not supported!");
+
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                             SVT.getVectorNumElements());
+  return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+                     DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue N0 = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Op.getValueType().isVector())
+    return lowerUINT_TO_FP_vec(Op, DAG);
+
+  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
+  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
+  // the optimization here.
+  if (DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+  EVT SrcVT = N0.getValueType();
+  EVT DstVT = Op.getValueType();
+  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i64(Op, DAG);
+  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+    return LowerUINT_TO_FP_i32(Op, DAG);
+  if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
+    return SDValue();
+
+  // Make a 64-bit buffer, and use it to build an FILD.
+  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+  if (SrcVT == MVT::i32) {
+    SDValue WordOff = DAG.getConstant(4, getPointerTy());
+    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
+                                     getPointerTy(), StackSlot, WordOff);
+    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                                  StackSlot, MachinePointerInfo(),
+                                  false, false, 0);
+    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+                                  OffsetSlot, MachinePointerInfo(),
+                                  false, false, 0);
+    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+    return Fild;
+  }
+
+  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                               StackSlot, MachinePointerInfo(),
+                               false, false, 0);
+  // For i64 source, we need to add the appropriate power of 2 if the input
+  // was negative.  This is the same as the optimization in
+  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
+  // we must be careful to do the computation in x87 extended precision, not
+  // in SSE. (The generic code can't know it's OK to do this, or how to.)
+  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction()
+    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                          MachineMemOperand::MOLoad, 8, 8);
+
+  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
+  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
+                                         MVT::i64, MMO);
+
+  APInt FF(32, 0x5F800000ULL);
+
+  // Check whether the sign bit is set.
+  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
+                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
+                                 ISD::SETLT);
+
+  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+  SDValue FudgePtr = DAG.getConstantPool(
+                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+                                         getPointerTy());
+
+  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+  SDValue Zero = DAG.getIntPtrConstant(0);
+  SDValue Four = DAG.getIntPtrConstant(4);
+  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
+                               Zero, Four);
+  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
+
+  // Load the value out, extending it from f32 to f80.
+  // FIXME: Avoid the extend by constructing the right constant pool?
+  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
+                                 FudgePtr, MachinePointerInfo::getConstantPool(),
+                                 MVT::f32, false, false, 4);
+  // Extend everything to 80 bits to force it to be done on x87.
+  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
+  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
+}
+
+std::pair<SDValue,SDValue>
+X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                    bool IsSigned, bool IsReplace) const {
+  DebugLoc DL = Op.getDebugLoc();
+
+  EVT DstTy = Op.getValueType();
+
+  if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
+    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+    DstTy = MVT::i64;
+  }
+
+  assert(DstTy.getSimpleVT() <= MVT::i64 &&
+         DstTy.getSimpleVT() >= MVT::i16 &&
+         "Unknown FP_TO_INT to lower!");
+
+  // These are really Legal.
+  if (DstTy == MVT::i32 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+  if (Subtarget->is64Bit() &&
+      DstTy == MVT::i64 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+
+  // We lower FP->int64 either into FISTP64 followed by a load from a temporary
+  // stack slot, or into the FTOL runtime function.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MemSize = DstTy.getSizeInBits()/8;
+  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+  unsigned Opc;
+  if (!IsSigned && isIntegerTypeFTOL(DstTy))
+    Opc = X86ISD::WIN_FTOL;
+  else
+    switch (DstTy.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+    }
+
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Value = Op.getOperand(0);
+  EVT TheVT = Op.getOperand(0).getValueType();
+  // FIXME This causes a redundant load/store if the SSE-class value is already
+  // in memory, such as if it is on the callstack.
+  if (isScalarFPTypeInSSEReg(TheVT)) {
+    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
+                         MachinePointerInfo::getFixedStack(SSFI),
+                         false, false, 0);
+    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+    SDValue Ops[] = {
+      Chain, StackSlot, DAG.getValueType(TheVT)
+    };
+
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                              MachineMemOperand::MOLoad, MemSize, MemSize);
+    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
+                                    DstTy, MMO);
+    Chain = Value.getValue(1);
+    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  }
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                            MachineMemOperand::MOStore, MemSize, MemSize);
+
+  if (Opc != X86ISD::WIN_FTOL) {
+    // Build the FP_TO_INT*_IN_MEM
+    SDValue Ops[] = { Chain, Value, StackSlot };
+    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
+                                           Ops, 3, DstTy, MMO);
+    return std::make_pair(FIST, StackSlot);
+  } else {
+    SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
+      DAG.getVTList(MVT::Other, MVT::Glue),
+      Chain, Value);
+    SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
+      MVT::i32, ftol.getValue(1));
+    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
+      MVT::i32, eax.getValue(2));
+    SDValue Ops[] = { eax, edx };
+    SDValue pair = IsReplace
+      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, 2)
+      : DAG.getMergeValues(Ops, 2, DL);
+    return std::make_pair(pair, SDValue());
+  }
+}
+
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+                              const X86Subtarget *Subtarget) {
+  MVT VT = Op->getValueType(0).getSimpleVT();
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getValueType().getSimpleVT();
+  DebugLoc dl = Op->getDebugLoc();
+
+  // Optimize vectors in AVX mode:
+  //
+  //   v8i16 -> v8i32
+  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
+  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
+  //   Concat upper and lower parts.
+  //
+  //   v4i32 -> v4i64
+  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
+  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
+  //   Concat upper and lower parts.
+  //
+
+  if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
+    return SDValue();
+
+  if (Subtarget->hasInt256())
+    return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);
+
+  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+  SDValue Undef = DAG.getUNDEF(InVT);
+  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+
+  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
+                             VT.getVectorNumElements()/2);
+
+  OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+  OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  if (Subtarget->hasFp256()) {
+    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+    if (Res.getNode())
+      return Res;
+  }
+
+  return SDValue();
+}
+SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getValueType().getSimpleVT();
+
+  if (Subtarget->hasFp256()) {
+    SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+    if (Res.getNode())
+      return Res;
+  }
+
+  if (!VT.is256BitVector() || !SVT.is128BitVector() ||
+      VT.getVectorNumElements() != SVT.getVectorNumElements())
+    return SDValue();
+
+  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
+
+  // AVX2 has better support of integer extending.
+  if (Subtarget->hasInt256())
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
+  static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
+  SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
+                           DAG.getVectorShuffle(MVT::v8i16, DL, In,
+                                                DAG.getUNDEF(MVT::v8i16),
+                                                &Mask[0]));
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+}
+
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getValueType().getSimpleVT();
+
+  if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
+    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+    if (Subtarget->hasInt256()) {
+      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
+      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
+                                ShufMask);
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
+                         DAG.getIntPtrConstant(0));
+    }
+
+    // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(0));
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                               DAG.getIntPtrConstant(2));
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+    // The PSHUFD mask:
+    static const int ShufMask1[] = {0, 2, 0, 0};
+    SDValue Undef = DAG.getUNDEF(VT);
+    OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
+
+    // The MOVLHPS mask:
+    static const int ShufMask2[] = {0, 1, 4, 5};
+    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
+  }
+
+  if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
+    // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+    if (Subtarget->hasInt256()) {
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+
+      SmallVector<SDValue,32> pshufbMask;
+      for (unsigned i = 0; i < 2; ++i) {
+        pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+        for (unsigned j = 0; j < 8; ++j)
+          pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+      }
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
+                               &pshufbMask[0], 32);
+      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+      In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+
+      static const int ShufMask[] = {0,  2,  -1,  -1};
+      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
+                                &ShufMask[0]);
+      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+                       DAG.getIntPtrConstant(0));
+      return DAG.getNode(ISD::BITCAST, DL, VT, In);
+    }
+
+    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(0));
+
+    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+                               DAG.getIntPtrConstant(4));
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+
+    // The PSHUFB mask:
+    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
+                                   -1, -1, -1, -1, -1, -1, -1, -1};
+
+    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+
+    OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+    OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+    // The MOVLHPS Mask:
+    static const int ShufMask2[] = {0, 1, 4, 5};
+    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
+    return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+  }
+
+  // Handle truncation of V256 to V128 using shuffles.
+  if (!VT.is128BitVector() || !SVT.is256BitVector())
+    return SDValue();
+
+  assert(VT.getVectorNumElements() != SVT.getVectorNumElements() &&
+         "Invalid op");
+  assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                             NumElems * 2);
+
+  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
+  // Prepare truncation shuffle mask
+  for (unsigned i = 0; i != NumElems; ++i)
+    MaskVec[i] = i * 2;
+  SDValue V = DAG.getVectorShuffle(NVT, DL,
+                                   DAG.getNode(ISD::BITCAST, DL, NVT, In),
+                                   DAG.getUNDEF(NVT), &MaskVec[0]);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+                     DAG.getIntPtrConstant(0));
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MVT VT = Op.getValueType().getSimpleVT();
+  if (VT.isVector()) {
+    if (VT == MVT::v8i16)
+      return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT,
+                         DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
+                                     MVT::v8i32, Op.getOperand(0)));
+    return SDValue();
+  }
+
+  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+    /*IsSigned=*/ true, /*IsReplace=*/ false);
+  SDValue FIST = Vals.first, StackSlot = Vals.second;
+  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
+  if (FIST.getNode() == 0) return Op;
+
+  if (StackSlot.getNode())
+    // Load the result.
+    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+                       FIST, StackSlot, MachinePointerInfo(),
+                       false, false, false, 0);
+
+  // The node is the result.
+  return FIST;
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
+    /*IsSigned=*/ false, /*IsReplace=*/ false);
+  SDValue FIST = Vals.first, StackSlot = Vals.second;
+  assert(FIST.getNode() && "Unexpected failure");
+
+  if (StackSlot.getNode())
+    // Load the result.
+    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+                       FIST, StackSlot, MachinePointerInfo(),
+                       false, false, false, 0);
+
+  // The node is the result.
+  return FIST;
+}
+
+static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc DL = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getValueType().getSimpleVT();
+
+  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+                                 In, DAG.getUNDEF(SVT)));
+}
+
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
+  LLVMContext *Context = DAG.getContext();
+  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT EltVT = VT;
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
+    EltVT = VT.getVectorElementType();
+    NumElts = VT.getVectorNumElements();
+  }
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                          APInt(64, ~(1ULL << 63))));
+  else
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+                                          APInt(32, ~(1U << 31))));
+  C = ConstantVector::getSplat(NumElts, C);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             MachinePointerInfo::getConstantPool(),
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::AND, dl, ANDVT,
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+  }
+  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
+}
+
+SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
+  LLVMContext *Context = DAG.getContext();
+  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT EltVT = VT;
+  unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+  if (VT.isVector()) {
+    EltVT = VT.getVectorElementType();
+    NumElts = VT.getVectorNumElements();
+  }
+  Constant *C;
+  if (EltVT == MVT::f64)
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+                                          APInt(64, 1ULL << 63)));
+  else
+    C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+                                          APInt(32, 1U << 31)));
+  C = ConstantVector::getSplat(NumElts, C);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             MachinePointerInfo::getConstantPool(),
+                             false, false, false, Alignment);
+  if (VT.isVector()) {
+    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    return DAG.getNode(ISD::BITCAST, dl, VT,
+                       DAG.getNode(ISD::XOR, dl, XORVT,
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT,
+                                               Op.getOperand(0)),
+                                   DAG.getNode(ISD::BITCAST, dl, XORVT, Mask)));
+  }
+
+  return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+}
+
+SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  LLVMContext *Context = DAG.getContext();
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+  MVT SrcVT = Op1.getValueType().getSimpleVT();
+
+  // If second operand is smaller, extend it first.
+  if (SrcVT.bitsLT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+    SrcVT = VT;
+  }
+  // And if it is bigger, shrink it first.
+  if (SrcVT.bitsGT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+    SrcVT = VT;
+  }
+
+  // At this point the operands and the result should have the same
+  // type, and that won't be f80 since that is not custom lowered.
+
+  // First get the sign bit of second operand.
+  SmallVector<Constant*,4> CV;
+  if (SrcVT == MVT::f64) {
+    const fltSemantics &Sem = APFloat::IEEEdouble;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  } else {
+    const fltSemantics &Sem = APFloat::IEEEsingle;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+  }
+  Constant *C = ConstantVector::get(CV);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+
+  // Shift sign bit right or left if the two operands have different types.
+  if (SrcVT.bitsGT(VT)) {
+    // Op0 is MVT::f32, Op1 is MVT::f64.
+    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
+    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
+                          DAG.getConstant(32, MVT::i32));
+    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
+    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
+                          DAG.getIntPtrConstant(0));
+  }
+
+  // Clear first operand sign bit.
+  CV.clear();
+  if (VT == MVT::f64) {
+    const fltSemantics &Sem = APFloat::IEEEdouble;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+                                                   APInt(64, ~(1ULL << 63)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
+  } else {
+    const fltSemantics &Sem = APFloat::IEEEsingle;
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+                                                   APInt(32, ~(1U << 31)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+  }
+  C = ConstantVector::get(CV);
+  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                              MachinePointerInfo::getConstantPool(),
+                              false, false, false, 16);
+  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+}
+
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
+  SDValue N0 = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getValueType().getSimpleVT();
+
+  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
+  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
+                                  DAG.getConstant(1, VT));
+  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
+}
+
+// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+//
+SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  if (!Op->hasOneUse())
+    return SDValue();
+
+  SDNode *N = Op.getNode();
+  DebugLoc DL = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Opnds;
+  DenseMap<SDValue, unsigned> VecInMap;
+  EVT VT = MVT::Other;
+
+  // Recognize a special case where a vector is casted into wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+    // BFS traverse all OR'd operands.
+    if (I->getOpcode() == ISD::OR) {
+      Opnds.push_back(I->getOperand(0));
+      Opnds.push_back(I->getOperand(1));
+      // Re-evaluate the number of nodes to be traversed.
+      e += 2; // 2 more nodes (LHS and RHS) are pushed.
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    SDValue Idx = I->getOperand(1);
+    if (!isa<ConstantSDNode>(Idx))
+      return SDValue();
+
+    SDValue ExtractedFromVec = I->getOperand(0);
+    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+    if (M == VecInMap.end()) {
+      VT = ExtractedFromVec.getValueType();
+      // Quit if not 128/256-bit vector.
+      if (!VT.is128BitVector() && !VT.is256BitVector())
+        return SDValue();
+      // Quit if not the same type.
+      if (VecInMap.begin() != VecInMap.end() &&
+          VT != VecInMap.begin()->first.getValueType())
+        return SDValue();
+      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+    }
+    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+  }
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Not extracted from 128-/256-bit vector.");
+
+  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+  SmallVector<SDValue, 8> VecIns;
+
+  for (DenseMap<SDValue, unsigned>::const_iterator
+        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+    // Quit if not all elements are used.
+    if (I->second != FullMask)
+      return SDValue();
+    VecIns.push_back(I->first);
+  }
+
+  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+  // Cast all vectors into TestVT for PTEST.
+  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+  // If more than one full vectors are evaluated, OR them first before PTEST.
+  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+    // Each iteration will OR 2 nodes and append the result until there is only
+    // 1 node left, i.e. the final OR'd value of all vectors.
+    SDValue LHS = VecIns[Slot];
+    SDValue RHS = VecIns[Slot + 1];
+    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+  }
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+                     VecIns.back(), VecIns.back());
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+                                    SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // CF and OF aren't always set the way we want. Determine which
+  // of these we need.
+  bool NeedCF = false;
+  bool NeedOF = false;
+  switch (X86CC) {
+  default: break;
+  case X86::COND_A: case X86::COND_AE:
+  case X86::COND_B: case X86::COND_BE:
+    NeedCF = true;
+    break;
+  case X86::COND_G: case X86::COND_GE:
+  case X86::COND_L: case X86::COND_LE:
+  case X86::COND_O: case X86::COND_NO:
+    NeedOF = true;
+    break;
+  }
+
+  // See if we can use the EFLAGS value from the operand instead of
+  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+  // we prove that the arithmetic won't overflow, we can't use OF or CF.
+  if (Op.getResNo() != 0 || NeedOF || NeedCF)
+    // Emit a CMP with 0, which is the TEST pattern.
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));
+
+  unsigned Opcode = 0;
+  unsigned NumOperands = 0;
+
+  // Truncate operations may prevent the merge of the SETCC instruction
+  // and the arithmetic intruction before it. Attempt to truncate the operands
+  // of the arithmetic instruction and use a reduced bit-width instruction.
+  bool NeedTruncation = false;
+  SDValue ArithOp = Op;
+  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+    SDValue Arith = Op->getOperand(0);
+    // Both the trunc and the arithmetic op need to have one user each.
+    if (Arith->hasOneUse())
+      switch (Arith.getOpcode()) {
+        default: break;
+        case ISD::ADD:
+        case ISD::SUB:
+        case ISD::AND:
+        case ISD::OR:
+        case ISD::XOR: {
+          NeedTruncation = true;
+          ArithOp = Arith;
+        }
+      }
+  }
+
+  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
+  // which may be the result of a CAST.  We use the variable 'Op', which is the
+  // non-casted variable when we check for possible users.
+  switch (ArithOp.getOpcode()) {
+  case ISD::ADD:
+    // Due to an isel shortcoming, be conservative if this add is likely to be
+    // selected as part of a load-modify-store instruction. When the root node
+    // in a match is a store, isel doesn't know how to remap non-chain non-flag
+    // uses of other nodes in the match, such as the ADD in this case. This
+    // leads to the ADD being left around and reselected, with the result being
+    // two adds in the output.  Alas, even if none our users are stores, that
+    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
+    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
+    // climbing the DAG back to the root, and it doesn't seem to be worth the
+    // effort.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+         UE = Op.getNode()->use_end(); UI != UE; ++UI)
+      if (UI->getOpcode() != ISD::CopyToReg &&
+          UI->getOpcode() != ISD::SETCC &&
+          UI->getOpcode() != ISD::STORE)
+        goto default_case;
+
+    if (ConstantSDNode *C =
+        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
+      // An add of one will be selected as an INC.
+      if (C->getAPIntValue() == 1) {
+        Opcode = X86ISD::INC;
+        NumOperands = 1;
+        break;
+      }
+
+      // An add of negative one (subtract of one) will be selected as a DEC.
+      if (C->getAPIntValue().isAllOnesValue()) {
+        Opcode = X86ISD::DEC;
+        NumOperands = 1;
+        break;
+      }
+    }
+
+    // Otherwise use a regular EFLAGS-setting add.
+    Opcode = X86ISD::ADD;
+    NumOperands = 2;
+    break;
+  case ISD::AND: {
+    // If the primary and result isn't used, don't bother using X86ISD::AND,
+    // because a TEST instruction will be better.
+    bool NonFlagUse = false;
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+      SDNode *User = *UI;
+      unsigned UOpNo = UI.getOperandNo();
+      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+        // Look pass truncate.
+        UOpNo = User->use_begin().getOperandNo();
+        User = *User->use_begin();
+      }
+
+      if (User->getOpcode() != ISD::BRCOND &&
+          User->getOpcode() != ISD::SETCC &&
+          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
+        NonFlagUse = true;
+        break;
+      }
+    }
+
+    if (!NonFlagUse)
+      break;
+  }
+    // FALL THROUGH
+  case ISD::SUB:
+  case ISD::OR:
+  case ISD::XOR:
+    // Due to the ISEL shortcoming noted above, be conservative if this op is
+    // likely to be selected as part of a load-modify-store instruction.
+    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+           UE = Op.getNode()->use_end(); UI != UE; ++UI)
+      if (UI->getOpcode() == ISD::STORE)
+        goto default_case;
+
+    // Otherwise use a regular EFLAGS-setting instruction.
+    switch (ArithOp.getOpcode()) {
+    default: llvm_unreachable("unexpected operator!");
+    case ISD::SUB: Opcode = X86ISD::SUB; break;
+    case ISD::XOR: Opcode = X86ISD::XOR; break;
+    case ISD::AND: Opcode = X86ISD::AND; break;
+    case ISD::OR: {
+      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+        if (EFLAGS.getNode())
+          return EFLAGS;
+      }
+      Opcode = X86ISD::OR;
+      break;
+    }
+    }
+
+    NumOperands = 2;
+    break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    return SDValue(Op.getNode(), 1);
+  default:
+  default_case:
+    break;
+  }
+
+  // If we found that truncation is beneficial, perform the truncation and
+  // update 'Op'.
+  if (NeedTruncation) {
+    EVT VT = Op.getValueType();
+    SDValue WideVal = Op->getOperand(0);
+    EVT WideVT = WideVal.getValueType();
+    unsigned ConvertedOp = 0;
+    // Use a target machine opcode to prevent further DAGCombine
+    // optimizations that may separate the arithmetic operations
+    // from the setcc node.
+    switch (WideVal.getOpcode()) {
+      default: break;
+      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+      case ISD::AND: ConvertedOp = X86ISD::AND; break;
+      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
+      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+    }
+
+    if (ConvertedOp) {
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+      }
+    }
+  }
+
+  if (Opcode == 0)
+    // Emit a CMP with 0, which is the TEST pattern.
+    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                       DAG.getConstant(0, Op.getValueType()));
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0; i != NumOperands; ++i)
+    Ops.push_back(Op.getOperand(i));
+
+  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+  DAG.ReplaceAllUsesWith(Op, New);
+  return SDValue(New.getNode(), 1);
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+                                   SelectionDAG &DAG) const {
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
+    if (C->getAPIntValue() == 0)
+      return EmitTest(Op0, X86CC, DAG);
+
+  DebugLoc dl = Op0.getDebugLoc();
+  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
+       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
+    // Use SUB instead of CMP to enable CSE between SUB and CMP.
+    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
+                              Op0, Op1);
+    return SDValue(Sub.getNode(), 1);
+  }
+  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+}
+
+/// Convert a comparison if required by the subtarget.
+SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
+                                                 SelectionDAG &DAG) const {
+  // If the subtarget does not support the FUCOMI instruction, floating-point
+  // comparisons have to be converted.
+  if (Subtarget->hasCMov() ||
+      Cmp.getOpcode() != X86ISD::CMP ||
+      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
+      !Cmp.getOperand(1).getValueType().isFloatingPoint())
+    return Cmp;
+
+  // The instruction selector will select an FUCOM instruction instead of
+  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
+  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
+  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+  DebugLoc dl = Cmp.getDebugLoc();
+  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
+  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
+  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
+                            DAG.getConstant(8, MVT::i8));
+  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
+  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+}
+
+static bool isAllOnes(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isAllOnesValue();
+}
+
+/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
+/// if it's possible.
+SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
+                                     DebugLoc dl, SelectionDAG &DAG) const {
+  SDValue Op0 = And.getOperand(0);
+  SDValue Op1 = And.getOperand(1);
+  if (Op0.getOpcode() == ISD::TRUNCATE)
+    Op0 = Op0.getOperand(0);
+  if (Op1.getOpcode() == ISD::TRUNCATE)
+    Op1 = Op1.getOperand(0);
+
+  SDValue LHS, RHS;
+  if (Op1.getOpcode() == ISD::SHL)
+    std::swap(Op0, Op1);
+  if (Op0.getOpcode() == ISD::SHL) {
+    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
+      if (And00C->getZExtValue() == 1) {
+        // If we looked past a truncate, check that it's only truncating away
+        // known zeros.
+        unsigned BitWidth = Op0.getValueSizeInBits();
+        unsigned AndBitWidth = And.getValueSizeInBits();
+        if (BitWidth > AndBitWidth) {
+          APInt Zeros, Ones;
+          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
+          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+            return SDValue();
+        }
+        LHS = Op1;
+        RHS = Op0.getOperand(1);
+      }
+  } else if (Op1.getOpcode() == ISD::Constant) {
+    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
+    uint64_t AndRHSVal = AndRHS->getZExtValue();
+    SDValue AndLHS = Op0;
+
+    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
+      LHS = AndLHS.getOperand(0);
+      RHS = AndLHS.getOperand(1);
+    }
+
+    // Use BT if the immediate can't be encoded in a TEST instruction.
+    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
+      LHS = AndLHS;
+      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
+    }
+  }
+
+  if (LHS.getNode()) {
+    // If the LHS is of the form (x ^ -1) then replace the LHS with x and flip
+    // the condition code later.
+    bool Invert = false;
+    if (LHS.getOpcode() == ISD::XOR && isAllOnes(LHS.getOperand(1))) {
+      Invert = true;
+      LHS = LHS.getOperand(0);
+    }
+
+    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
+    // instruction.  Since the shift amount is in-range-or-undefined, we know
+    // that doing a bittest on the i32 value is ok.  We extend to i32 because
+    // the encoding for the i16 version is larger than the i32 version.
+    // Also promote i16 to i32 for performance / code size reason.
+    if (LHS.getValueType() == MVT::i8 ||
+        LHS.getValueType() == MVT::i16)
+      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
+
+    // If the operand types disagree, extend the shift amount to match.  Since
+    // BT ignores high bits (like shifts) we can use anyextend.
+    if (LHS.getValueType() != RHS.getValueType())
+      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
+
+    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
+    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+    // Flip the condition if the LHS was a not instruction
+    if (Invert)
+      Cond = X86::GetOppositeBranchCondition(Cond);
+    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                       DAG.getConstant(Cond, MVT::i8), BT);
+  }
+
+  return SDValue();
+}
+
+// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
+// ones, and then concatenate the result back.
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getValueType().getSimpleVT();
+
+  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
+         "Unsupported value type for operation");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue CC = Op.getOperand(2);
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+
+  // Issue the operation on the smaller types and concatenate the result back
+  MVT EltVT = VT.getVectorElementType();
+  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
+
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG) {
+  SDValue Cond;
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  MVT VT = Op.getValueType().getSimpleVT();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (isFP) {
+#ifndef NDEBUG
+    MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT();
+    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+#endif
+
+    unsigned SSECC;
+    bool Swap = false;
+
+    // SSE Condition code mapping:
+    //  0 - EQ
+    //  1 - LT
+    //  2 - LE
+    //  3 - UNORD
+    //  4 - NEQ
+    //  5 - NLT
+    //  6 - NLE
+    //  7 - ORD
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Unexpected SETCC condition");
+    case ISD::SETOEQ:
+    case ISD::SETEQ:  SSECC = 0; break;
+    case ISD::SETOGT:
+    case ISD::SETGT: Swap = true; // Fallthrough
+    case ISD::SETLT:
+    case ISD::SETOLT: SSECC = 1; break;
+    case ISD::SETOGE:
+    case ISD::SETGE: Swap = true; // Fallthrough
+    case ISD::SETLE:
+    case ISD::SETOLE: SSECC = 2; break;
+    case ISD::SETUO:  SSECC = 3; break;
+    case ISD::SETUNE:
+    case ISD::SETNE:  SSECC = 4; break;
+    case ISD::SETULE: Swap = true; // Fallthrough
+    case ISD::SETUGE: SSECC = 5; break;
+    case ISD::SETULT: Swap = true; // Fallthrough
+    case ISD::SETUGT: SSECC = 6; break;
+    case ISD::SETO:   SSECC = 7; break;
+    case ISD::SETUEQ:
+    case ISD::SETONE: SSECC = 8; break;
+    }
+    if (Swap)
+      std::swap(Op0, Op1);
+
+    // In the two special cases we can't handle, emit two comparisons.
+    if (SSECC == 8) {
+      unsigned CC0, CC1;
+      unsigned CombineOpc;
+      if (SetCCOpcode == ISD::SETUEQ) {
+        CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
+      } else {
+        assert(SetCCOpcode == ISD::SETONE);
+        CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
+      }
+
+      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC0, MVT::i8));
+      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+                                 DAG.getConstant(CC1, MVT::i8));
+      return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
+    }
+    // Handle all other FP comparisons here.
+    return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+                       DAG.getConstant(SSECC, MVT::i8));
+  }
+
+  // Break 256-bit integer vector compare into smaller ones.
+  if (VT.is256BitVector() && !Subtarget->hasInt256())
+    return Lower256IntVSETCC(Op, DAG);
+
+  // We are handling one of the integer comparisons here.  Since SSE only has
+  // GT and EQ comparisons for integer, swapping operands and multiple
+  // operations may be required for some comparisons.
+  unsigned Opc;
+  bool Swap = false, Invert = false, FlipSigns = false;
+
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:  Invert = true;
+  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
+  case ISD::SETLT:  Swap = true;
+  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
+  case ISD::SETGE:  Swap = true;
+  case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
+  case ISD::SETULT: Swap = true;
+  case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
+  case ISD::SETUGE: Swap = true;
+  case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
+  }
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // Check that the operation in question is available (most are plain SSE2,
+  // but PCMPGTQ and PCMPEQQ have different requirements).
+  if (VT == MVT::v2i64) {
+    if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
+      return SDValue();
+    if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
+      // pcmpeqd + pshufd + pand.
+      assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+      // First cast everything to the right type,
+      Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+      Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+      // Do the compare.
+      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+      // Make sure the lower and upper halves are both all-ones.
+      const int Mask[] = { 1, 0, 3, 2 };
+      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+      if (Invert)
+        Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+    }
+  }
+
+  // Since SSE has no unsigned integer comparisons, we need to flip  the sign
+  // bits of the inputs before performing those operations.
+  if (FlipSigns) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
+                                      EltVT);
+    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
+    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
+                                    SignBits.size());
+    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
+    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
+  }
+
+  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+  // If the logical-not of the result is required, perform that now.
+  if (Invert)
+    Result = DAG.getNOT(dl, Result, VT);
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+  MVT VT = Op.getValueType().getSimpleVT();
+
+  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+
+  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  // Optimize to BT if possible.
+  // Lower (X & (1 << N)) == 0 to BT(X, N).
+  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
+      Op1.getOpcode() == ISD::Constant &&
+      cast<ConstantSDNode>(Op1)->isNullValue() &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
+    if (NewSetCC.getNode())
+      return NewSetCC;
+  }
+
+  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
+  // these.
+  if (Op1.getOpcode() == ISD::Constant &&
+      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
+       cast<ConstantSDNode>(Op1)->isNullValue()) &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+    // If the input is a setcc, then reuse the input setcc or use a new one with
+    // the inverted condition.
+    if (Op0.getOpcode() == X86ISD::SETCC) {
+      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+      bool Invert = (CC == ISD::SETNE) ^
+        cast<ConstantSDNode>(Op1)->isNullValue();
+      if (!Invert) return Op0;
+
+      CCode = X86::GetOppositeBranchCondition(CCode);
+      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
+    }
+  }
+
+  bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint();
+  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+  if (X86CC == X86::COND_INVALID)
+    return SDValue();
+
+  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
+}
+
+// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
+static bool isX86LogicalCmp(SDValue Op) {
+  unsigned Opc = Op.getNode()->getOpcode();
+  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
+      Opc == X86ISD::SAHF)
+    return true;
+  if (Op.getResNo() == 1 &&
+      (Opc == X86ISD::ADD ||
+       Opc == X86ISD::SUB ||
+       Opc == X86ISD::ADC ||
+       Opc == X86ISD::SBB ||
+       Opc == X86ISD::SMUL ||
+       Opc == X86ISD::UMUL ||
+       Opc == X86ISD::INC ||
+       Opc == X86ISD::DEC ||
+       Opc == X86ISD::OR ||
+       Opc == X86ISD::XOR ||
+       Opc == X86ISD::AND))
+    return true;
+
+  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
+    return true;
+
+  return false;
+}
+
+static bool isZero(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isNullValue();
+}
+
+static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
+  if (V.getOpcode() != ISD::TRUNCATE)
+    return false;
+
+  SDValue VOp0 = V.getOperand(0);
+  unsigned InBits = VOp0.getValueSizeInBits();
+  unsigned Bits = V.getValueSizeInBits();
+  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
+}
+
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
+  bool addTest = true;
+  SDValue Cond  = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue CC;
+
+  if (Cond.getOpcode() == ISD::SETCC) {
+    SDValue NewCond = LowerSETCC(Cond, DAG);
+    if (NewCond.getNode())
+      Cond = NewCond;
+  }
+
+  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
+  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
+  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
+  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+  if (Cond.getOpcode() == X86ISD::SETCC &&
+      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
+      isZero(Cond.getOperand(1).getOperand(1))) {
+    SDValue Cmp = Cond.getOperand(1);
+
+    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+
+    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
+        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
+      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
+
+      SDValue CmpOp0 = Cmp.getOperand(0);
+      // Apply further optimizations for special cases
+      // (select (x != 0), -1, 0) -> neg & sbb
+      // (select (x == 0), 0, -1) -> neg & sbb
+      if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
+        if (YC->isNullValue() &&
+            (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
+          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
+                                    DAG.getConstant(0, CmpOp0.getValueType()),
+                                    CmpOp0);
+          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                                    DAG.getConstant(X86::COND_B, MVT::i8),
+                                    SDValue(Neg.getNode(), 1));
+          return Res;
+        }
+
+      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+                        CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+
+      SDValue Res =   // Res = 0 or -1.
+        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                    DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
+
+      if (isAllOnes(Op1) != (CondCode == X86::COND_E))
+        Res = DAG.getNOT(DL, Res, Res.getValueType());
+
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
+      if (N2C == 0 || !N2C->isNullValue())
+        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
+      return Res;
+    }
+  }
+
+  // Look past (and (setcc_carry (cmp ...)), 1).
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+    if (C && C->getAPIntValue() == 1)
+      Cond = Cond.getOperand(0);
+  }
+
+  // If condition flag is set by a X86ISD::CMP, then use it as the condition
+  // setting operand in place of the X86ISD::SETCC.
+  unsigned CondOpcode = Cond.getOpcode();
+  if (CondOpcode == X86ISD::SETCC ||
+      CondOpcode == X86ISD::SETCC_CARRY) {
+    CC = Cond.getOperand(0);
+
+    SDValue Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    MVT VT = Op.getValueType().getSimpleVT();
+
+    bool IllegalFPCMov = false;
+    if (VT.isFloatingPoint() && !VT.isVector() &&
+        !isScalarFPTypeInSSEReg(VT))  // FPStack?
+      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+        Opc == X86ISD::BT) { // FIXME
+      Cond = Cmp;
+      addTest = false;
+    }
+  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+              Cond.getOperand(0).getValueType() != MVT::i8)) {
+    SDValue LHS = Cond.getOperand(0);
+    SDValue RHS = Cond.getOperand(1);
+    unsigned X86Opcode;
+    unsigned X86Cond;
+    SDVTList VTs;
+    switch (CondOpcode) {
+    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+    default: llvm_unreachable("unexpected overflowing operator");
+    }
+    if (CondOpcode == ISD::UMULO)
+      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+                          MVT::i32);
+    else
+      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
+
+    if (CondOpcode == ISD::UMULO)
+      Cond = X86Op.getValue(2);
+    else
+      Cond = X86Op.getValue(1);
+
+    CC = DAG.getConstant(X86Cond, MVT::i8);
+    addTest = false;
+  }
+
+  if (addTest) {
+    // Look pass the truncate if the high bits are known zero.
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
+
+    // We know the result of AND is compared against zero. Try to match
+    // it to BT.
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
+      if (NewSetCC.getNode()) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    Cond = EmitTest(Cond, X86::COND_NE, DAG);
+  }
+
+  // a <  b ? -1 :  0 -> RES = ~setcc_carry
+  // a <  b ?  0 : -1 -> RES = setcc_carry
+  // a >= b ? -1 :  0 -> RES = setcc_carry
+  // a >= b ?  0 : -1 -> RES = ~setcc_carry
+  if (Cond.getOpcode() == X86ISD::SUB) {
+    Cond = ConvertCmpIfNecessary(Cond, DAG);
+    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
+
+    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
+        (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
+      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+                                DAG.getConstant(X86::COND_B, MVT::i8), Cond);
+      if (isAllOnes(Op1) != (CondCode == X86::COND_B))
+        return DAG.getNOT(DL, Res, Res.getValueType());
+      return Res;
+    }
+  }
+
+  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
+  // widen the cmov and push the truncate through. This avoids introducing a new
+  // branch during isel and doesn't add any extensions.
+  if (Op.getValueType() == MVT::i8 &&
+      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+    if (T1.getValueType() == T2.getValueType() &&
+        // Blacklist CopyFromReg to avoid partial register stalls.
+        T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
+      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+    }
+  }
+
+  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+  // condition is true.
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
+  SDValue Ops[] = { Op2, Op1, CC, Cond };
+  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
+}
+
+SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  MVT VT = Op->getValueType(0).getSimpleVT();
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getValueType().getSimpleVT();
+  DebugLoc dl = Op->getDebugLoc();
+
+  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
+      (VT != MVT::v8i32 || InVT != MVT::v8i16))
+    return SDValue();
+
+  if (Subtarget->hasInt256())
+    return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);
+
+  // Optimize vectors in AVX mode
+  // Sign extend  v8i16 to v8i32 and
+  //              v4i32 to v4i64
+  //
+  // Divide input vector into two parts
+  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
+  // concat the vectors to original VT
+
+  unsigned NumElems = InVT.getVectorNumElements();
+  SDValue Undef = DAG.getUNDEF(InVT);
+
+  SmallVector<int,8> ShufMask1(NumElems, -1);
+  for (unsigned i = 0; i != NumElems/2; ++i)
+    ShufMask1[i] = i;
+
+  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
+
+  SmallVector<int,8> ShufMask2(NumElems, -1);
+  for (unsigned i = 0; i != NumElems/2; ++i)
+    ShufMask2[i] = i + NumElems/2;
+
+  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
+
+  MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
+                                VT.getVectorNumElements()/2);
+
+  OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
+  OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
+// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
+// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
+// from the AND / OR.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+  Opc = Op.getOpcode();
+  if (Opc != ISD::OR && Opc != ISD::AND)
+    return false;
+  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+          Op.getOperand(0).hasOneUse() &&
+          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
+          Op.getOperand(1).hasOneUse());
+}
+
+// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
+// 1 and that the SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+  if (Op.getOpcode() != ISD::XOR)
+    return false;
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (N1C && N1C->getAPIntValue() == 1) {
+    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
+      Op.getOperand(0).hasOneUse();
+  }
+  return false;
+}
+
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  bool addTest = true;
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Dest  = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue CC;
+  bool Inverted = false;
+
+  if (Cond.getOpcode() == ISD::SETCC) {
+    // Check for setcc([su]{add,sub,mul}o == 0).
+    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+        isa<ConstantSDNode>(Cond.getOperand(1)) &&
+        cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
+        Cond.getOperand(0).getResNo() == 1 &&
+        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
+         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
+         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
+         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
+         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
+         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
+      Inverted = true;
+      Cond = Cond.getOperand(0);
+    } else {
+      SDValue NewCond = LowerSETCC(Cond, DAG);
+      if (NewCond.getNode())
+        Cond = NewCond;
+    }
+  }
+#if 0
+  // FIXME: LowerXALUO doesn't handle these!!
+  else if (Cond.getOpcode() == X86ISD::ADD  ||
+           Cond.getOpcode() == X86ISD::SUB  ||
+           Cond.getOpcode() == X86ISD::SMUL ||
+           Cond.getOpcode() == X86ISD::UMUL)
+    Cond = LowerXALUO(Cond, DAG);
+#endif
+
+  // Look pass (and (setcc_carry (cmp ...)), 1).
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+    if (C && C->getAPIntValue() == 1)
+      Cond = Cond.getOperand(0);
+  }
+
+  // If condition flag is set by a X86ISD::CMP, then use it as the condition
+  // setting operand in place of the X86ISD::SETCC.
+  unsigned CondOpcode = Cond.getOpcode();
+  if (CondOpcode == X86ISD::SETCC ||
+      CondOpcode == X86ISD::SETCC_CARRY) {
+    CC = Cond.getOperand(0);
+
+    SDValue Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+      Cond = Cmp;
+      addTest = false;
+    } else {
+      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+      default: break;
+      case X86::COND_O:
+      case X86::COND_B:
+        // These can only come from an arithmetic instruction with overflow,
+        // e.g. SADDO, UADDO.
+        Cond = Cond.getNode()->getOperand(1);
+        addTest = false;
+        break;
+      }
+    }
+  }
+  CondOpcode = Cond.getOpcode();
+  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
+      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
+      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
+       Cond.getOperand(0).getValueType() != MVT::i8)) {
+    SDValue LHS = Cond.getOperand(0);
+    SDValue RHS = Cond.getOperand(1);
+    unsigned X86Opcode;
+    unsigned X86Cond;
+    SDVTList VTs;
+    switch (CondOpcode) {
+    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
+    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
+    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
+    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
+    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
+    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
+    default: llvm_unreachable("unexpected overflowing operator");
+    }
+    if (Inverted)
+      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
+    if (CondOpcode == ISD::UMULO)
+      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
+                          MVT::i32);
+    else
+      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
+
+    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
+
+    if (CondOpcode == ISD::UMULO)
+      Cond = X86Op.getValue(2);
+    else
+      Cond = X86Op.getValue(1);
+
+    CC = DAG.getConstant(X86Cond, MVT::i8);
+    addTest = false;
+  } else {
+    unsigned CondOpc;
+    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+      SDValue Cmp = Cond.getOperand(0).getOperand(1);
+      if (CondOpc == ISD::OR) {
+        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+        // two branches instead of an explicit OR instruction with a
+        // separate test.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Cmp)) {
+          CC = Cond.getOperand(0).getOperand(0);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = Cond.getOperand(1).getOperand(0);
+          Cond = Cmp;
+          addTest = false;
+        }
+      } else { // ISD::AND
+        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+        // two branches instead of an explicit AND instruction with a
+        // separate test. However, we only do this if this block doesn't
+        // have a fall-through edge, because this requires an explicit
+        // jmp when the condition is false.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Cmp) &&
+            Op.getNode()->hasOneUse()) {
+          X86::CondCode CCode =
+            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+          CCode = X86::GetOppositeBranchCondition(CCode);
+          CC = DAG.getConstant(CCode, MVT::i8);
+          SDNode *User = *Op.getNode()->use_begin();
+          // Look for an unconditional branch following this conditional branch.
+          // We need this because we need to reverse the successors in order
+          // to implement FCMP_OEQ.
+          if (User->getOpcode() == ISD::BR) {
+            SDValue FalseBB = User->getOperand(1);
+            SDNode *NewBR =
+              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+            assert(NewBR == User);
+            (void)NewBR;
+            Dest = FalseBB;
+
+            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                                Chain, Dest, CC, Cmp);
+            X86::CondCode CCode =
+              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+            CCode = X86::GetOppositeBranchCondition(CCode);
+            CC = DAG.getConstant(CCode, MVT::i8);
+            Cond = Cmp;
+            addTest = false;
+          }
+        }
+      }
+    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
+      // It should be transformed during dag combiner except when the condition
+      // is set by a arithmetics with overflow node.
+      X86::CondCode CCode =
+        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+      CCode = X86::GetOppositeBranchCondition(CCode);
+      CC = DAG.getConstant(CCode, MVT::i8);
+      Cond = Cond.getOperand(0).getOperand(1);
+      addTest = false;
+    } else if (Cond.getOpcode() == ISD::SETCC &&
+               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+      // For FCMP_OEQ, we can emit
+      // two branches instead of an explicit AND instruction with a
+      // separate test. However, we only do this if this block doesn't
+      // have a fall-through edge, because this requires an explicit
+      // jmp when the condition is false.
+      if (Op.getNode()->hasOneUse()) {
+        SDNode *User = *Op.getNode()->use_begin();
+        // Look for an unconditional branch following this conditional branch.
+        // We need this because we need to reverse the successors in order
+        // to implement FCMP_OEQ.
+        if (User->getOpcode() == ISD::BR) {
+          SDValue FalseBB = User->getOperand(1);
+          SDNode *NewBR =
+            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+          assert(NewBR == User);
+          (void)NewBR;
+          Dest = FalseBB;
+
+          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                                    Cond.getOperand(0), Cond.getOperand(1));
+          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = DAG.getConstant(X86::COND_P, MVT::i8);
+          Cond = Cmp;
+          addTest = false;
+        }
+      }
+    } else if (Cond.getOpcode() == ISD::SETCC &&
+               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+      // For FCMP_UNE, we can emit
+      // two branches instead of an explicit AND instruction with a
+      // separate test. However, we only do this if this block doesn't
+      // have a fall-through edge, because this requires an explicit
+      // jmp when the condition is false.
+      if (Op.getNode()->hasOneUse()) {
+        SDNode *User = *Op.getNode()->use_begin();
+        // Look for an unconditional branch following this conditional branch.
+        // We need this because we need to reverse the successors in order
+        // to implement FCMP_UNE.
+        if (User->getOpcode() == ISD::BR) {
+          SDValue FalseBB = User->getOperand(1);
+          SDNode *NewBR =
+            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
+          assert(NewBR == User);
+          (void)NewBR;
+
+          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
+                                    Cond.getOperand(0), Cond.getOperand(1));
+          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
+          CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = DAG.getConstant(X86::COND_NP, MVT::i8);
+          Cond = Cmp;
+          addTest = false;
+          Dest = FalseBB;
+        }
+      }
+    }
+  }
+
+  if (addTest) {
+    // Look pass the truncate if the high bits are known zero.
+    if (isTruncWithZeroHighBitsInput(Cond, DAG))
+        Cond = Cond.getOperand(0);
+
+    // We know the result of AND is compared against zero. Try to match
+    // it to BT.
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+      if (NewSetCC.getNode()) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    Cond = EmitTest(Cond, X86::COND_NE, DAG);
+  }
+  Cond = ConvertCmpIfNecessary(Cond, DAG);
+  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                     Chain, Dest, CC, Cond);
+}
+
+// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca is needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// correct sequence.
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
+          getTargetMachine().Options.EnableSegmentedStacks) &&
+         "This should be used only on Windows targets or when segmented stacks "
+         "are being used");
+  assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  // FIXME: Ensure alignment here
+
+  bool Is64Bit = Subtarget->is64Bit();
+  EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
+
+  if (getTargetMachine().Options.EnableSegmentedStacks) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    if (Is64Bit) {
+      // The 64 bit implementation of segmented stacks needs to clobber both r10
+      // r11. This makes it impossible to use it along with nested parameters.
+      const Function *F = MF.getFunction();
+
+      for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+           I != E; ++I)
+        if (I->hasNestAttr())
+          report_fatal_error("Cannot use segmented stacks with functions that "
+                             "have nested arguments.");
+    }
+
+    const TargetRegisterClass *AddrRegClass =
+      getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+    SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+                                DAG.getRegister(Vreg, SPTy));
+    SDValue Ops1[2] = { Value, Chain };
+    return DAG.getMergeValues(Ops1, 2, dl);
+  } else {
+    SDValue Flag;
+    unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
+    Flag = Chain.getValue(1);
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
+    Flag = Chain.getValue(1);
+
+    Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+                               SPTy).getValue(1);
+
+    SDValue Ops1[2] = { Chain.getValue(0), Chain };
+    return DAG.getMergeValues(Ops1, 2, dl);
+  }
+}
+
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  DebugLoc DL = Op.getDebugLoc();
+
+  if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                   getPointerTy());
+    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+                        MachinePointerInfo(SV), false, false, 0);
+  }
+
+  // __va_list_tag:
+  //   gp_offset         (0 - 6 * 8)
+  //   fp_offset         (48 - 48 + 8 * 16)
+  //   overflow_arg_area (point to parameters coming in memory).
+  //   reg_save_area
+  SmallVector<SDValue, 8> MemOps;
+  SDValue FIN = Op.getOperand(1);
+  // Store gp_offset
+  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
+                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
+                                               MVT::i32),
+                               FIN, MachinePointerInfo(SV), false, false, 0);
+  MemOps.push_back(Store);
+
+  // Store fp_offset
+  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                    FIN, DAG.getIntPtrConstant(4));
+  Store = DAG.getStore(Op.getOperand(0), DL,
+                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
+                                       MVT::i32),
+                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
+  MemOps.push_back(Store);
+
+  // Store ptr to overflow_arg_area
+  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                    FIN, DAG.getIntPtrConstant(4));
+  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+                                    getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
+                       MachinePointerInfo(SV, 8),
+                       false, false, 0);
+  MemOps.push_back(Store);
+
+  // Store ptr to reg_save_area.
+  FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                    FIN, DAG.getIntPtrConstant(8));
+  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+                                    getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
+                       MachinePointerInfo(SV, 16), false, false, 0);
+  MemOps.push_back(Store);
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                     &MemOps[0], MemOps.size());
+}
+
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->is64Bit() &&
+         "LowerVAARG only handles 64-bit va_arg!");
+  assert((Subtarget->isTargetLinux() ||
+          Subtarget->isTargetDarwin()) &&
+          "Unhandled target in LowerVAARG");
+  assert(Op.getNode()->getNumOperands() == 4);
+  SDValue Chain = Op.getOperand(0);
+  SDValue SrcPtr = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  unsigned Align = Op.getConstantOperandVal(3);
+  DebugLoc dl = Op.getDebugLoc();
+
+  EVT ArgVT = Op.getNode()->getValueType(0);
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+  uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+  uint8_t ArgMode;
+
+  // Decide which area this value should be read from.
+  // TODO: Implement the AMD64 ABI in its entirety. This simple
+  // selection mechanism works only for the basic types.
+  if (ArgVT == MVT::f80) {
+    llvm_unreachable("va_arg for f80 not yet implemented");
+  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
+    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
+  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
+    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
+  } else {
+    llvm_unreachable("Unhandled argument type in LowerVAARG");
+  }
+
+  if (ArgMode == 2) {
+    // Sanity Check: Make sure using fp_offset makes sense.
+    assert(!getTargetMachine().Options.UseSoftFloat &&
+           !(DAG.getMachineFunction()
+                .getFunction()->getAttributes()
+                .hasAttribute(AttributeSet::FunctionIndex,
+                              Attribute::NoImplicitFloat)) &&
+           Subtarget->hasSSE1());
+  }
+
+  // Insert VAARG_64 node into the DAG
+  // VAARG_64 returns two values: Variable Argument Address, Chain
+  SmallVector<SDValue, 11> InstOps;
+  InstOps.push_back(Chain);
+  InstOps.push_back(SrcPtr);
+  InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
+  InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
+  InstOps.push_back(DAG.getConstant(Align, MVT::i32));
+  SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
+  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
+                                          VTs, &InstOps[0], InstOps.size(),
+                                          MVT::i64,
+                                          MachinePointerInfo(SV),
+                                          /*Align=*/0,
+                                          /*Volatile=*/false,
+                                          /*ReadMem=*/true,
+                                          /*WriteMem=*/true);
+  Chain = VAARG.getValue(1);
+
+  // Load the next argument and return it
+  return DAG.getLoad(ArgVT, dl,
+                     Chain,
+                     VAARG,
+                     MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
+                           SelectionDAG &DAG) {
+  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+  SDValue Chain = Op.getOperand(0);
+  SDValue DstPtr = Op.getOperand(1);
+  SDValue SrcPtr = Op.getOperand(2);
+  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+  DebugLoc DL = Op.getDebugLoc();
+
+  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
+                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
+                       false,
+                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+}
+
+// getTargetVShiftNode - Handle vector element shifts where the shift amount
+// may or may not be a constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
+                                   SDValue SrcOp, SDValue ShAmt,
+                                   SelectionDAG &DAG) {
+  assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
+
+  if (isa<ConstantSDNode>(ShAmt)) {
+    // Constant may be a TargetConstant. Use a regular constant.
+    uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+    switch (Opc) {
+      default: llvm_unreachable("Unknown target vector shift node");
+      case X86ISD::VSHLI:
+      case X86ISD::VSRLI:
+      case X86ISD::VSRAI:
+        return DAG.getNode(Opc, dl, VT, SrcOp,
+                           DAG.getConstant(ShiftAmt, MVT::i32));
+    }
+  }
+
+  // Change opcode to non-immediate version
+  switch (Opc) {
+    default: llvm_unreachable("Unknown target vector shift node");
+    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
+    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
+    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
+  }
+
+  // Need to build a vector containing shift amount
+  // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0
+  SDValue ShOps[4];
+  ShOps[0] = ShAmt;
+  ShOps[1] = DAG.getConstant(0, MVT::i32);
+  ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
+  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
+
+  // The return type has to be a 128-bit type with the same element
+  // type as the input type.
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
+
+  ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
+  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
+}
+
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+  // Comparison intrinsics.
+  case Intrinsic::x86_sse_comieq_ss:
+  case Intrinsic::x86_sse_comilt_ss:
+  case Intrinsic::x86_sse_comile_ss:
+  case Intrinsic::x86_sse_comigt_ss:
+  case Intrinsic::x86_sse_comige_ss:
+  case Intrinsic::x86_sse_comineq_ss:
+  case Intrinsic::x86_sse_ucomieq_ss:
+  case Intrinsic::x86_sse_ucomilt_ss:
+  case Intrinsic::x86_sse_ucomile_ss:
+  case Intrinsic::x86_sse_ucomigt_ss:
+  case Intrinsic::x86_sse_ucomige_ss:
+  case Intrinsic::x86_sse_ucomineq_ss:
+  case Intrinsic::x86_sse2_comieq_sd:
+  case Intrinsic::x86_sse2_comilt_sd:
+  case Intrinsic::x86_sse2_comile_sd:
+  case Intrinsic::x86_sse2_comigt_sd:
+  case Intrinsic::x86_sse2_comige_sd:
+  case Intrinsic::x86_sse2_comineq_sd:
+  case Intrinsic::x86_sse2_ucomieq_sd:
+  case Intrinsic::x86_sse2_ucomilt_sd:
+  case Intrinsic::x86_sse2_ucomile_sd:
+  case Intrinsic::x86_sse2_ucomigt_sd:
+  case Intrinsic::x86_sse2_ucomige_sd:
+  case Intrinsic::x86_sse2_ucomineq_sd: {
+    unsigned Opc;
+    ISD::CondCode CC;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse_comieq_ss:
+    case Intrinsic::x86_sse2_comieq_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETEQ;
+      break;
+    case Intrinsic::x86_sse_comilt_ss:
+    case Intrinsic::x86_sse2_comilt_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETLT;
+      break;
+    case Intrinsic::x86_sse_comile_ss:
+    case Intrinsic::x86_sse2_comile_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETLE;
+      break;
+    case Intrinsic::x86_sse_comigt_ss:
+    case Intrinsic::x86_sse2_comigt_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETGT;
+      break;
+    case Intrinsic::x86_sse_comige_ss:
+    case Intrinsic::x86_sse2_comige_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETGE;
+      break;
+    case Intrinsic::x86_sse_comineq_ss:
+    case Intrinsic::x86_sse2_comineq_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETNE;
+      break;
+    case Intrinsic::x86_sse_ucomieq_ss:
+    case Intrinsic::x86_sse2_ucomieq_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETEQ;
+      break;
+    case Intrinsic::x86_sse_ucomilt_ss:
+    case Intrinsic::x86_sse2_ucomilt_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETLT;
+      break;
+    case Intrinsic::x86_sse_ucomile_ss:
+    case Intrinsic::x86_sse2_ucomile_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETLE;
+      break;
+    case Intrinsic::x86_sse_ucomigt_ss:
+    case Intrinsic::x86_sse2_ucomigt_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETGT;
+      break;
+    case Intrinsic::x86_sse_ucomige_ss:
+    case Intrinsic::x86_sse2_ucomige_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETGE;
+      break;
+    case Intrinsic::x86_sse_ucomineq_ss:
+    case Intrinsic::x86_sse2_ucomineq_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETNE;
+      break;
+    }
+
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
+    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
+    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                DAG.getConstant(X86CC, MVT::i8), Cond);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+
+  // Arithmetic intrinsics.
+  case Intrinsic::x86_sse2_pmulu_dq:
+  case Intrinsic::x86_avx2_pmulu_dq:
+    return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  // SSE2/AVX2 sub with unsigned saturation intrinsics
+  case Intrinsic::x86_sse2_psubus_b:
+  case Intrinsic::x86_sse2_psubus_w:
+  case Intrinsic::x86_avx2_psubus_b:
+  case Intrinsic::x86_avx2_psubus_w:
+    return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  // SSE3/AVX horizontal add/sub intrinsics
+  case Intrinsic::x86_sse3_hadd_ps:
+  case Intrinsic::x86_sse3_hadd_pd:
+  case Intrinsic::x86_avx_hadd_ps_256:
+  case Intrinsic::x86_avx_hadd_pd_256:
+  case Intrinsic::x86_sse3_hsub_ps:
+  case Intrinsic::x86_sse3_hsub_pd:
+  case Intrinsic::x86_avx_hsub_ps_256:
+  case Intrinsic::x86_avx_hsub_pd_256:
+  case Intrinsic::x86_ssse3_phadd_w_128:
+  case Intrinsic::x86_ssse3_phadd_d_128:
+  case Intrinsic::x86_avx2_phadd_w:
+  case Intrinsic::x86_avx2_phadd_d:
+  case Intrinsic::x86_ssse3_phsub_w_128:
+  case Intrinsic::x86_ssse3_phsub_d_128:
+  case Intrinsic::x86_avx2_phsub_w:
+  case Intrinsic::x86_avx2_phsub_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse3_hadd_ps:
+    case Intrinsic::x86_sse3_hadd_pd:
+    case Intrinsic::x86_avx_hadd_ps_256:
+    case Intrinsic::x86_avx_hadd_pd_256:
+      Opcode = X86ISD::FHADD;
+      break;
+    case Intrinsic::x86_sse3_hsub_ps:
+    case Intrinsic::x86_sse3_hsub_pd:
+    case Intrinsic::x86_avx_hsub_ps_256:
+    case Intrinsic::x86_avx_hsub_pd_256:
+      Opcode = X86ISD::FHSUB;
+      break;
+    case Intrinsic::x86_ssse3_phadd_w_128:
+    case Intrinsic::x86_ssse3_phadd_d_128:
+    case Intrinsic::x86_avx2_phadd_w:
+    case Intrinsic::x86_avx2_phadd_d:
+      Opcode = X86ISD::HADD;
+      break;
+    case Intrinsic::x86_ssse3_phsub_w_128:
+    case Intrinsic::x86_ssse3_phsub_d_128:
+    case Intrinsic::x86_avx2_phsub_w:
+    case Intrinsic::x86_avx2_phsub_d:
+      Opcode = X86ISD::HSUB;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // SSE2/SSE41/AVX2 integer max/min intrinsics.
+  case Intrinsic::x86_sse2_pmaxu_b:
+  case Intrinsic::x86_sse41_pmaxuw:
+  case Intrinsic::x86_sse41_pmaxud:
+  case Intrinsic::x86_avx2_pmaxu_b:
+  case Intrinsic::x86_avx2_pmaxu_w:
+  case Intrinsic::x86_avx2_pmaxu_d:
+  case Intrinsic::x86_sse2_pminu_b:
+  case Intrinsic::x86_sse41_pminuw:
+  case Intrinsic::x86_sse41_pminud:
+  case Intrinsic::x86_avx2_pminu_b:
+  case Intrinsic::x86_avx2_pminu_w:
+  case Intrinsic::x86_avx2_pminu_d:
+  case Intrinsic::x86_sse41_pmaxsb:
+  case Intrinsic::x86_sse2_pmaxs_w:
+  case Intrinsic::x86_sse41_pmaxsd:
+  case Intrinsic::x86_avx2_pmaxs_b:
+  case Intrinsic::x86_avx2_pmaxs_w:
+  case Intrinsic::x86_avx2_pmaxs_d:
+  case Intrinsic::x86_sse41_pminsb:
+  case Intrinsic::x86_sse2_pmins_w:
+  case Intrinsic::x86_sse41_pminsd:
+  case Intrinsic::x86_avx2_pmins_b:
+  case Intrinsic::x86_avx2_pmins_w:
+  case Intrinsic::x86_avx2_pmins_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_pmaxu_b:
+    case Intrinsic::x86_sse41_pmaxuw:
+    case Intrinsic::x86_sse41_pmaxud:
+    case Intrinsic::x86_avx2_pmaxu_b:
+    case Intrinsic::x86_avx2_pmaxu_w:
+    case Intrinsic::x86_avx2_pmaxu_d:
+      Opcode = X86ISD::UMAX;
+      break;
+    case Intrinsic::x86_sse2_pminu_b:
+    case Intrinsic::x86_sse41_pminuw:
+    case Intrinsic::x86_sse41_pminud:
+    case Intrinsic::x86_avx2_pminu_b:
+    case Intrinsic::x86_avx2_pminu_w:
+    case Intrinsic::x86_avx2_pminu_d:
+      Opcode = X86ISD::UMIN;
+      break;
+    case Intrinsic::x86_sse41_pmaxsb:
+    case Intrinsic::x86_sse2_pmaxs_w:
+    case Intrinsic::x86_sse41_pmaxsd:
+    case Intrinsic::x86_avx2_pmaxs_b:
+    case Intrinsic::x86_avx2_pmaxs_w:
+    case Intrinsic::x86_avx2_pmaxs_d:
+      Opcode = X86ISD::SMAX;
+      break;
+    case Intrinsic::x86_sse41_pminsb:
+    case Intrinsic::x86_sse2_pmins_w:
+    case Intrinsic::x86_sse41_pminsd:
+    case Intrinsic::x86_avx2_pmins_b:
+    case Intrinsic::x86_avx2_pmins_w:
+    case Intrinsic::x86_avx2_pmins_d:
+      Opcode = X86ISD::SMIN;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // SSE/SSE2/AVX floating point max/min intrinsics.
+  case Intrinsic::x86_sse_max_ps:
+  case Intrinsic::x86_sse2_max_pd:
+  case Intrinsic::x86_avx_max_ps_256:
+  case Intrinsic::x86_avx_max_pd_256:
+  case Intrinsic::x86_sse_min_ps:
+  case Intrinsic::x86_sse2_min_pd:
+  case Intrinsic::x86_avx_min_ps_256:
+  case Intrinsic::x86_avx_min_pd_256: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse_max_ps:
+    case Intrinsic::x86_sse2_max_pd:
+    case Intrinsic::x86_avx_max_ps_256:
+    case Intrinsic::x86_avx_max_pd_256:
+      Opcode = X86ISD::FMAX;
+      break;
+    case Intrinsic::x86_sse_min_ps:
+    case Intrinsic::x86_sse2_min_pd:
+    case Intrinsic::x86_avx_min_ps_256:
+    case Intrinsic::x86_avx_min_pd_256:
+      Opcode = X86ISD::FMIN;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // AVX2 variable shift intrinsics
+  case Intrinsic::x86_avx2_psllv_d:
+  case Intrinsic::x86_avx2_psllv_q:
+  case Intrinsic::x86_avx2_psllv_d_256:
+  case Intrinsic::x86_avx2_psllv_q_256:
+  case Intrinsic::x86_avx2_psrlv_d:
+  case Intrinsic::x86_avx2_psrlv_q:
+  case Intrinsic::x86_avx2_psrlv_d_256:
+  case Intrinsic::x86_avx2_psrlv_q_256:
+  case Intrinsic::x86_avx2_psrav_d:
+  case Intrinsic::x86_avx2_psrav_d_256: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_avx2_psllv_d:
+    case Intrinsic::x86_avx2_psllv_q:
+    case Intrinsic::x86_avx2_psllv_d_256:
+    case Intrinsic::x86_avx2_psllv_q_256:
+      Opcode = ISD::SHL;
+      break;
+    case Intrinsic::x86_avx2_psrlv_d:
+    case Intrinsic::x86_avx2_psrlv_q:
+    case Intrinsic::x86_avx2_psrlv_d_256:
+    case Intrinsic::x86_avx2_psrlv_q_256:
+      Opcode = ISD::SRL;
+      break;
+    case Intrinsic::x86_avx2_psrav_d:
+    case Intrinsic::x86_avx2_psrav_d_256:
+      Opcode = ISD::SRA;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  case Intrinsic::x86_ssse3_pshuf_b_128:
+  case Intrinsic::x86_avx2_pshuf_b:
+    return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_ssse3_psign_b_128:
+  case Intrinsic::x86_ssse3_psign_w_128:
+  case Intrinsic::x86_ssse3_psign_d_128:
+  case Intrinsic::x86_avx2_psign_b:
+  case Intrinsic::x86_avx2_psign_w:
+  case Intrinsic::x86_avx2_psign_d:
+    return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse41_insertps:
+    return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::x86_avx_vperm2f128_ps_256:
+  case Intrinsic::x86_avx_vperm2f128_pd_256:
+  case Intrinsic::x86_avx_vperm2f128_si_256:
+  case Intrinsic::x86_avx2_vperm2i128:
+    return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+  case Intrinsic::x86_avx2_permd:
+  case Intrinsic::x86_avx2_permps:
+    // Operands intentionally swapped. Mask is last operand to intrinsic,
+    // but second operand for node/intruction.
+    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
+                       Op.getOperand(2), Op.getOperand(1));
+
+  case Intrinsic::x86_sse_sqrt_ps:
+  case Intrinsic::x86_sse2_sqrt_pd:
+  case Intrinsic::x86_avx_sqrt_ps_256:
+  case Intrinsic::x86_avx_sqrt_pd_256:
+    return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
+
+  // ptest and testp intrinsics. The intrinsic these come from are designed to
+  // return an integer value, not just an instruction so lower it to the ptest
+  // or testp pattern and a setcc for the result.
+  case Intrinsic::x86_sse41_ptestz:
+  case Intrinsic::x86_sse41_ptestc:
+  case Intrinsic::x86_sse41_ptestnzc:
+  case Intrinsic::x86_avx_ptestz_256:
+  case Intrinsic::x86_avx_ptestc_256:
+  case Intrinsic::x86_avx_ptestnzc_256:
+  case Intrinsic::x86_avx_vtestz_ps:
+  case Intrinsic::x86_avx_vtestc_ps:
+  case Intrinsic::x86_avx_vtestnzc_ps:
+  case Intrinsic::x86_avx_vtestz_pd:
+  case Intrinsic::x86_avx_vtestc_pd:
+  case Intrinsic::x86_avx_vtestnzc_pd:
+  case Intrinsic::x86_avx_vtestz_ps_256:
+  case Intrinsic::x86_avx_vtestc_ps_256:
+  case Intrinsic::x86_avx_vtestnzc_ps_256:
+  case Intrinsic::x86_avx_vtestz_pd_256:
+  case Intrinsic::x86_avx_vtestc_pd_256:
+  case Intrinsic::x86_avx_vtestnzc_pd_256: {
+    bool IsTestPacked = false;
+    unsigned X86CC;
+    switch (IntNo) {
+    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
+    case Intrinsic::x86_avx_vtestz_ps:
+    case Intrinsic::x86_avx_vtestz_pd:
+    case Intrinsic::x86_avx_vtestz_ps_256:
+    case Intrinsic::x86_avx_vtestz_pd_256:
+      IsTestPacked = true; // Fallthrough
+    case Intrinsic::x86_sse41_ptestz:
+    case Intrinsic::x86_avx_ptestz_256:
+      // ZF = 1
+      X86CC = X86::COND_E;
+      break;
+    case Intrinsic::x86_avx_vtestc_ps:
+    case Intrinsic::x86_avx_vtestc_pd:
+    case Intrinsic::x86_avx_vtestc_ps_256:
+    case Intrinsic::x86_avx_vtestc_pd_256:
+      IsTestPacked = true; // Fallthrough
+    case Intrinsic::x86_sse41_ptestc:
+    case Intrinsic::x86_avx_ptestc_256:
+      // CF = 1
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_avx_vtestnzc_ps:
+    case Intrinsic::x86_avx_vtestnzc_pd:
+    case Intrinsic::x86_avx_vtestnzc_ps_256:
+    case Intrinsic::x86_avx_vtestnzc_pd_256:
+      IsTestPacked = true; // Fallthrough
+    case Intrinsic::x86_sse41_ptestnzc:
+    case Intrinsic::x86_avx_ptestnzc_256:
+      // ZF and CF = 0
+      X86CC = X86::COND_A;
+      break;
+    }
+
+    SDValue LHS = Op.getOperand(1);
+    SDValue RHS = Op.getOperand(2);
+    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
+    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
+    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+
+  // SSE/AVX shift intrinsics
+  case Intrinsic::x86_sse2_psll_w:
+  case Intrinsic::x86_sse2_psll_d:
+  case Intrinsic::x86_sse2_psll_q:
+  case Intrinsic::x86_avx2_psll_w:
+  case Intrinsic::x86_avx2_psll_d:
+  case Intrinsic::x86_avx2_psll_q:
+  case Intrinsic::x86_sse2_psrl_w:
+  case Intrinsic::x86_sse2_psrl_d:
+  case Intrinsic::x86_sse2_psrl_q:
+  case Intrinsic::x86_avx2_psrl_w:
+  case Intrinsic::x86_avx2_psrl_d:
+  case Intrinsic::x86_avx2_psrl_q:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psra_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_psll_w:
+    case Intrinsic::x86_sse2_psll_d:
+    case Intrinsic::x86_sse2_psll_q:
+    case Intrinsic::x86_avx2_psll_w:
+    case Intrinsic::x86_avx2_psll_d:
+    case Intrinsic::x86_avx2_psll_q:
+      Opcode = X86ISD::VSHL;
+      break;
+    case Intrinsic::x86_sse2_psrl_w:
+    case Intrinsic::x86_sse2_psrl_d:
+    case Intrinsic::x86_sse2_psrl_q:
+    case Intrinsic::x86_avx2_psrl_w:
+    case Intrinsic::x86_avx2_psrl_d:
+    case Intrinsic::x86_avx2_psrl_q:
+      Opcode = X86ISD::VSRL;
+      break;
+    case Intrinsic::x86_sse2_psra_w:
+    case Intrinsic::x86_sse2_psra_d:
+    case Intrinsic::x86_avx2_psra_w:
+    case Intrinsic::x86_avx2_psra_d:
+      Opcode = X86ISD::VSRA;
+      break;
+    }
+    return DAG.getNode(Opcode, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
+
+  // SSE/AVX immediate shift intrinsics
+  case Intrinsic::x86_sse2_pslli_w:
+  case Intrinsic::x86_sse2_pslli_d:
+  case Intrinsic::x86_sse2_pslli_q:
+  case Intrinsic::x86_avx2_pslli_w:
+  case Intrinsic::x86_avx2_pslli_d:
+  case Intrinsic::x86_avx2_pslli_q:
+  case Intrinsic::x86_sse2_psrli_w:
+  case Intrinsic::x86_sse2_psrli_d:
+  case Intrinsic::x86_sse2_psrli_q:
+  case Intrinsic::x86_avx2_psrli_w:
+  case Intrinsic::x86_avx2_psrli_d:
+  case Intrinsic::x86_avx2_psrli_q:
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+  case Intrinsic::x86_avx2_psrai_d: {
+    unsigned Opcode;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse2_pslli_w:
+    case Intrinsic::x86_sse2_pslli_d:
+    case Intrinsic::x86_sse2_pslli_q:
+    case Intrinsic::x86_avx2_pslli_w:
+    case Intrinsic::x86_avx2_pslli_d:
+    case Intrinsic::x86_avx2_pslli_q:
+      Opcode = X86ISD::VSHLI;
+      break;
+    case Intrinsic::x86_sse2_psrli_w:
+    case Intrinsic::x86_sse2_psrli_d:
+    case Intrinsic::x86_sse2_psrli_q:
+    case Intrinsic::x86_avx2_psrli_w:
+    case Intrinsic::x86_avx2_psrli_d:
+    case Intrinsic::x86_avx2_psrli_q:
+      Opcode = X86ISD::VSRLI;
+      break;
+    case Intrinsic::x86_sse2_psrai_w:
+    case Intrinsic::x86_sse2_psrai_d:
+    case Intrinsic::x86_avx2_psrai_w:
+    case Intrinsic::x86_avx2_psrai_d:
+      Opcode = X86ISD::VSRAI;
+      break;
+    }
+    return getTargetVShiftNode(Opcode, dl, Op.getValueType(),
+                               Op.getOperand(1), Op.getOperand(2), DAG);
+  }
+
+  case Intrinsic::x86_sse42_pcmpistria128:
+  case Intrinsic::x86_sse42_pcmpestria128:
+  case Intrinsic::x86_sse42_pcmpistric128:
+  case Intrinsic::x86_sse42_pcmpestric128:
+  case Intrinsic::x86_sse42_pcmpistrio128:
+  case Intrinsic::x86_sse42_pcmpestrio128:
+  case Intrinsic::x86_sse42_pcmpistris128:
+  case Intrinsic::x86_sse42_pcmpestris128:
+  case Intrinsic::x86_sse42_pcmpistriz128:
+  case Intrinsic::x86_sse42_pcmpestriz128: {
+    unsigned Opcode;
+    unsigned X86CC;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_sse42_pcmpistria128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_A;
+      break;
+    case Intrinsic::x86_sse42_pcmpestria128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_A;
+      break;
+    case Intrinsic::x86_sse42_pcmpistric128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_sse42_pcmpestric128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_B;
+      break;
+    case Intrinsic::x86_sse42_pcmpistrio128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_O;
+      break;
+    case Intrinsic::x86_sse42_pcmpestrio128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_O;
+      break;
+    case Intrinsic::x86_sse42_pcmpistris128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_S;
+      break;
+    case Intrinsic::x86_sse42_pcmpestris128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_S;
+      break;
+    case Intrinsic::x86_sse42_pcmpistriz128:
+      Opcode = X86ISD::PCMPISTRI;
+      X86CC = X86::COND_E;
+      break;
+    case Intrinsic::x86_sse42_pcmpestriz128:
+      Opcode = X86ISD::PCMPESTRI;
+      X86CC = X86::COND_E;
+      break;
+    }
+    SmallVector<SDValue, 5> NewOps;
+    NewOps.append(Op->op_begin()+1, Op->op_end());
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                DAG.getConstant(X86CC, MVT::i8),
+                                SDValue(PCMP.getNode(), 1));
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
+
+  case Intrinsic::x86_sse42_pcmpistri128:
+  case Intrinsic::x86_sse42_pcmpestri128: {
+    unsigned Opcode;
+    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
+      Opcode = X86ISD::PCMPISTRI;
+    else
+      Opcode = X86ISD::PCMPESTRI;
+
+    SmallVector<SDValue, 5> NewOps;
+    NewOps.append(Op->op_begin()+1, Op->op_end());
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+    return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+  }
+  case Intrinsic::x86_fma_vfmadd_ps:
+  case Intrinsic::x86_fma_vfmadd_pd:
+  case Intrinsic::x86_fma_vfmsub_ps:
+  case Intrinsic::x86_fma_vfmsub_pd:
+  case Intrinsic::x86_fma_vfnmadd_ps:
+  case Intrinsic::x86_fma_vfnmadd_pd:
+  case Intrinsic::x86_fma_vfnmsub_ps:
+  case Intrinsic::x86_fma_vfnmsub_pd:
+  case Intrinsic::x86_fma_vfmaddsub_ps:
+  case Intrinsic::x86_fma_vfmaddsub_pd:
+  case Intrinsic::x86_fma_vfmsubadd_ps:
+  case Intrinsic::x86_fma_vfmsubadd_pd:
+  case Intrinsic::x86_fma_vfmadd_ps_256:
+  case Intrinsic::x86_fma_vfmadd_pd_256:
+  case Intrinsic::x86_fma_vfmsub_ps_256:
+  case Intrinsic::x86_fma_vfmsub_pd_256:
+  case Intrinsic::x86_fma_vfnmadd_ps_256:
+  case Intrinsic::x86_fma_vfnmadd_pd_256:
+  case Intrinsic::x86_fma_vfnmsub_ps_256:
+  case Intrinsic::x86_fma_vfnmsub_pd_256:
+  case Intrinsic::x86_fma_vfmaddsub_ps_256:
+  case Intrinsic::x86_fma_vfmaddsub_pd_256:
+  case Intrinsic::x86_fma_vfmsubadd_ps_256:
+  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+    unsigned Opc;
+    switch (IntNo) {
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    case Intrinsic::x86_fma_vfmadd_ps:
+    case Intrinsic::x86_fma_vfmadd_pd:
+    case Intrinsic::x86_fma_vfmadd_ps_256:
+    case Intrinsic::x86_fma_vfmadd_pd_256:
+      Opc = X86ISD::FMADD;
+      break;
+    case Intrinsic::x86_fma_vfmsub_ps:
+    case Intrinsic::x86_fma_vfmsub_pd:
+    case Intrinsic::x86_fma_vfmsub_ps_256:
+    case Intrinsic::x86_fma_vfmsub_pd_256:
+      Opc = X86ISD::FMSUB;
+      break;
+    case Intrinsic::x86_fma_vfnmadd_ps:
+    case Intrinsic::x86_fma_vfnmadd_pd:
+    case Intrinsic::x86_fma_vfnmadd_ps_256:
+    case Intrinsic::x86_fma_vfnmadd_pd_256:
+      Opc = X86ISD::FNMADD;
+      break;
+    case Intrinsic::x86_fma_vfnmsub_ps:
+    case Intrinsic::x86_fma_vfnmsub_pd:
+    case Intrinsic::x86_fma_vfnmsub_ps_256:
+    case Intrinsic::x86_fma_vfnmsub_pd_256:
+      Opc = X86ISD::FNMSUB;
+      break;
+    case Intrinsic::x86_fma_vfmaddsub_ps:
+    case Intrinsic::x86_fma_vfmaddsub_pd:
+    case Intrinsic::x86_fma_vfmaddsub_ps_256:
+    case Intrinsic::x86_fma_vfmaddsub_pd_256:
+      Opc = X86ISD::FMADDSUB;
+      break;
+    case Intrinsic::x86_fma_vfmsubadd_ps:
+    case Intrinsic::x86_fma_vfmsubadd_pd:
+    case Intrinsic::x86_fma_vfmsubadd_ps_256:
+    case Intrinsic::x86_fma_vfmsubadd_pd_256:
+      Opc = X86ISD::FMSUBADD;
+      break;
+    }
+
+    return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+                       Op.getOperand(2), Op.getOperand(3));
+  }
+  }
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+
+  // RDRAND/RDSEED intrinsics.
+  case Intrinsic::x86_rdrand_16:
+  case Intrinsic::x86_rdrand_32:
+  case Intrinsic::x86_rdrand_64:
+  case Intrinsic::x86_rdseed_16:
+  case Intrinsic::x86_rdseed_32:
+  case Intrinsic::x86_rdseed_64: {
+    unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
+                       IntNo == Intrinsic::x86_rdseed_32 ||
+                       IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
+                                                            X86ISD::RDRAND;
+    // Emit the node with the right value type.
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+    SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
+
+    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
+    // Otherwise return the value from Rand, which is always 0, casted to i32.
+    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
+                      DAG.getConstant(1, Op->getValueType(1)),
+                      DAG.getConstant(X86::COND_B, MVT::i32),
+                      SDValue(Result.getNode(), 1) };
+    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
+                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
+                                  Ops, 4);
+
+    // Return { result, isValid, chain }.
+    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
+                       SDValue(Result.getNode(), 2));
+  }
+
+  // XTEST intrinsics.
+  case Intrinsic::x86_xtest: {
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
+    SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                                DAG.getConstant(X86::COND_NE, MVT::i8),
+                                InTrans);
+    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
+    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+                       Ret, SDValue(InTrans.getNode(), 1));
+  }
+  }
+}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  DebugLoc dl = Op.getDebugLoc();
+  EVT PtrVT = getPointerTy();
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset =
+      DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, PtrVT,
+                                   FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Just load the return address.
+  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
+}
+
+SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain     = Op.getOperand(0);
+  SDValue Offset    = Op.getOperand(1);
+  SDValue Handler   = Op.getOperand(2);
+  DebugLoc dl       = Op.getDebugLoc();
+
+  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                     Subtarget->is64Bit() ? X86::RBP : X86::EBP,
+                                     getPointerTy());
+  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
+
+  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
+                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
+  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
+  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
+                       false, false, 0);
+  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
+
+  return DAG.getNode(X86ISD::EH_RETURN, dl,
+                     MVT::Other,
+                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
+                     DAG.getVTList(MVT::i32, MVT::Other),
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+                     Op.getOperand(0), Op.getOperand(1));
+}
+
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
+  return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDValue Root = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  DebugLoc dl  = Op.getDebugLoc();
+
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+
+  if (Subtarget->is64Bit()) {
+    SDValue OutChains[6];
+
+    // Large code-model.
+    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
+    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
+
+    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
+    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
+
+    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
+
+    // Load the pointer to the nested function into R11.
+    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
+    SDValue Addr = Trmp;
+    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+                                Addr, MachinePointerInfo(TrmpAddr),
+                                false, false, 0);
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(2, MVT::i64));
+    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
+                                MachinePointerInfo(TrmpAddr, 2),
+                                false, false, 2);
+
+    // Load the 'nest' parameter value into R10.
+    // R10 is specified in X86CallingConv.td
+    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(10, MVT::i64));
+    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+                                Addr, MachinePointerInfo(TrmpAddr, 10),
+                                false, false, 0);
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(12, MVT::i64));
+    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
+                                MachinePointerInfo(TrmpAddr, 12),
+                                false, false, 2);
+
+    // Jump to the nested function.
+    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(20, MVT::i64));
+    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
+                                Addr, MachinePointerInfo(TrmpAddr, 20),
+                                false, false, 0);
+
+    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                       DAG.getConstant(22, MVT::i64));
+    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
+                                MachinePointerInfo(TrmpAddr, 22),
+                                false, false, 0);
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
+  } else {
+    const Function *Func =
+      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+    CallingConv::ID CC = Func->getCallingConv();
+    unsigned NestReg;
+
+    switch (CC) {
+    default:
+      llvm_unreachable("Unsupported calling convention");
+    case CallingConv::C:
+    case CallingConv::X86_StdCall: {
+      // Pass 'nest' parameter in ECX.
+      // Must be kept in sync with X86CallingConv.td
+      NestReg = X86::ECX;
+
+      // Check that ECX wasn't needed by an 'inreg' parameter.
+      FunctionType *FTy = Func->getFunctionType();
+      const AttributeSet &Attrs = Func->getAttributes();
+
+      if (!Attrs.isEmpty() && !Func->isVarArg()) {
+        unsigned InRegCount = 0;
+        unsigned Idx = 1;
+
+        for (FunctionType::param_iterator I = FTy->param_begin(),
+             E = FTy->param_end(); I != E; ++I, ++Idx)
+          if (Attrs.hasAttribute(Idx, Attribute::InReg))
+            // FIXME: should only count parameters that are lowered to integers.
+            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
+
+        if (InRegCount > 2) {
+          report_fatal_error("Nest register in use - reduce number of inreg"
+                             " parameters!");
+        }
+      }
+      break;
+    }
+    case CallingConv::X86_FastCall:
+    case CallingConv::X86_ThisCall:
+    case CallingConv::Fast:
+      // Pass 'nest' parameter in EAX.
+      // Must be kept in sync with X86CallingConv.td
+      NestReg = X86::EAX;
+      break;
+    }
+
+    SDValue OutChains[4];
+    SDValue Addr, Disp;
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(10, MVT::i32));
+    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
+
+    // This is storing the opcode for MOV32ri.
+    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
+    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
+    OutChains[0] = DAG.getStore(Root, dl,
+                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
+                                Trmp, MachinePointerInfo(TrmpAddr),
+                                false, false, 0);
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(1, MVT::i32));
+    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
+                                MachinePointerInfo(TrmpAddr, 1),
+                                false, false, 1);
+
+    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(5, MVT::i32));
+    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
+                                MachinePointerInfo(TrmpAddr, 5),
+                                false, false, 1);
+
+    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                       DAG.getConstant(6, MVT::i32));
+    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
+                                MachinePointerInfo(TrmpAddr, 6),
+                                false, false, 1);
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
+  }
+}
+
+SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  /*
+   The rounding mode is in bits 11:10 of FPSR, and has the following
+   settings:
+     00 Round to nearest
+     01 Round to -inf
+     10 Round to +inf
+     11 Round to 0
+
+  FLT_ROUNDS, on the other hand, expects the following:
+    -1 Undefined
+     0 Round to 0
+     1 Round to nearest
+     2 Round to +inf
+     3 Round to -inf
+
+  To perform the conversion, we do:
+    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
+  */
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetMachine &TM = MF.getTarget();
+  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  unsigned StackAlignment = TFI.getStackAlignment();
+  EVT VT = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+
+  // Save FP Control Word to stack slot
+  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+  MachineMemOperand *MMO =
+   MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
+                           MachineMemOperand::MOStore, 2, 2);
+
+  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
+  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+                                          DAG.getVTList(MVT::Other),
+                                          Ops, 2, MVT::i16, MMO);
+
+  // Load FP Control Word from stack slot
+  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
+                            MachinePointerInfo(), false, false, false, 0);
+
+  // Transform as necessary
+  SDValue CWD1 =
+    DAG.getNode(ISD::SRL, DL, MVT::i16,
+                DAG.getNode(ISD::AND, DL, MVT::i16,
+                            CWD, DAG.getConstant(0x800, MVT::i16)),
+                DAG.getConstant(11, MVT::i8));
+  SDValue CWD2 =
+    DAG.getNode(ISD::SRL, DL, MVT::i16,
+                DAG.getNode(ISD::AND, DL, MVT::i16,
+                            CWD, DAG.getConstant(0x400, MVT::i16)),
+                DAG.getConstant(9, MVT::i8));
+
+  SDValue RetVal =
+    DAG.getNode(ISD::AND, DL, MVT::i16,
+                DAG.getNode(ISD::ADD, DL, MVT::i16,
+                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
+                            DAG.getConstant(1, MVT::i16)),
+                DAG.getConstant(3, MVT::i16));
+
+  return DAG.getNode((VT.getSizeInBits() < 16 ?
+                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
+}
+
+static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  EVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+
+  Op = Op.getOperand(0);
+  if (VT == MVT::i8) {
+    // Zero extend to i32 since there is not an i8 bsr.
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+  }
+
+  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+  // If src is zero (i.e. bsr sets ZF), returns NumBits.
+  SDValue Ops[] = {
+    Op,
+    DAG.getConstant(NumBits+NumBits-1, OpVT),
+    DAG.getConstant(X86::COND_E, MVT::i8),
+    Op.getValue(1)
+  };
+  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+
+  // Finally xor with NumBits-1.
+  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+  return Op;
+}
+
+static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  EVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+
+  Op = Op.getOperand(0);
+  if (VT == MVT::i8) {
+    // Zero extend to i32 since there is not an i8 bsr.
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+  }
+
+  // Issue a bsr (scan bits in reverse).
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+  // And xor with NumBits-1.
+  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+  return Op;
+}
+
+static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  unsigned NumBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  Op = Op.getOperand(0);
+
+  // Issue a bsf (scan bits forward) which also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
+
+  // If src is zero (i.e. bsf sets ZF), returns NumBits.
+  SDValue Ops[] = {
+    Op,
+    DAG.getConstant(NumBits, VT),
+    DAG.getConstant(X86::COND_E, MVT::i8),
+    Op.getValue(1)
+  };
+  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
+}
+
+// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
+// ones, and then concatenate the result back.
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  assert(VT.is256BitVector() && VT.isInteger() &&
+         "Unsupported value type for operation");
+
+  unsigned NumElems = VT.getVectorNumElements();
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Extract the LHS vectors
+  SDValue LHS = Op.getOperand(0);
+  SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+  SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+  // Extract the RHS vectors
+  SDValue RHS = Op.getOperand(1);
+  SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
+  SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
+
+  MVT EltVT = VT.getVectorElementType().getSimpleVT();
+  EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getValueType().is256BitVector() &&
+         Op.getValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getValueType().is256BitVector() &&
+         Op.getValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  return Lower256IntArith(Op, DAG);
+}
+
+static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
+                        SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget->hasInt256())
+    return Lower256IntArith(Op, DAG);
+
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+  if (VT == MVT::v4i32) {
+    assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+           "Should not custom lower when pmuldq is available!");
+
+    // Extract the odd parts.
+    const int UnpackMask[] = { 1, -1, 3, -1 };
+    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+    // Multiply the even parts.
+    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+    // Now multiply odd parts.
+    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
+    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+
+    // Merge the two vectors back together with a shuffle. This expands into 2
+    // shuffles.
+    const int ShufMask[] = { 0, 4, 2, 6 };
+    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+  }
+
+  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+         "Only know how to lower V2I64/V4I64 multiply");
+
+  //  Ahi = psrlqi(a, 32);
+  //  Bhi = psrlqi(b, 32);
+  //
+  //  AloBlo = pmuludq(a, b);
+  //  AloBhi = pmuludq(a, Bhi);
+  //  AhiBlo = pmuludq(Ahi, b);
+
+  //  AloBhi = psllqi(AloBhi, 32);
+  //  AhiBlo = psllqi(AhiBlo, 32);
+  //  return AloBlo + AloBhi + AhiBlo;
+
+  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
+
+  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
+  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
+
+  // Bit cast to 32-bit vectors for MULUDQ
+  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
+  A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
+  B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
+  Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
+  Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
+
+  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
+  SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
+  SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
+
+  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
+  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
+
+  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
+  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
+}
+
+SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  EVT EltTy = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  SDValue N0 = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Lower sdiv X, pow2-const.
+  BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
+  if (!C)
+    return SDValue();
+
+  APInt SplatValue, SplatUndef;
+  unsigned MinSplatBits;
+  bool HasAnyUndefs;
+  if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs))
+    return SDValue();
+
+  if ((SplatValue != 0) &&
+      (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
+    unsigned lg2 = SplatValue.countTrailingZeros();
+    // Splat the sign bit.
+    SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
+    SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
+    // Add (N0 < 0) ? abs2 - 1 : 0;
+    SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
+    SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
+    SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
+    SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
+    SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
+
+    // If we're dividing by a positive value, we're done.  Otherwise, we must
+    // negate the result.
+    if (SplatValue.isNonNegative())
+      return SRA;
+
+    SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
+    SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
+    return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
+  }
+  return SDValue();
+}
+
+static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
+                                         const X86Subtarget *Subtarget) {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+
+  // Optimize shl/srl/sra with constant shift amount.
+  if (isSplatVector(Amt.getNode())) {
+    SDValue SclrAmt = Amt->getOperand(0);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+      uint64_t ShiftAmt = C->getZExtValue();
+
+      if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
+          (Subtarget->hasInt256() &&
+           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+        if (Op.getOpcode() == ISD::SHL)
+          return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
+                             DAG.getConstant(ShiftAmt, MVT::i32));
+        if (Op.getOpcode() == ISD::SRL)
+          return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
+                             DAG.getConstant(ShiftAmt, MVT::i32));
+        if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
+          return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
+                             DAG.getConstant(ShiftAmt, MVT::i32));
+      }
+
+      if (VT == MVT::v16i8) {
+        if (Op.getOpcode() == ISD::SHL) {
+          // Make a large shift.
+          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
+                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+          // Zero out the rightmost bits.
+          SmallVector<SDValue, 16> V(16,
+                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
+                                                     MVT::i8));
+          return DAG.getNode(ISD::AND, dl, VT, SHL,
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+        }
+        if (Op.getOpcode() == ISD::SRL) {
+          // Make a large shift.
+          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
+                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+          // Zero out the leftmost bits.
+          SmallVector<SDValue, 16> V(16,
+                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+                                                     MVT::i8));
+          return DAG.getNode(ISD::AND, dl, VT, SRL,
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+        }
+        if (Op.getOpcode() == ISD::SRA) {
+          if (ShiftAmt == 7) {
+            // R s>> 7  ===  R s< 0
+            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+          }
+
+          // R s>> a === ((R u>> a) ^ m) - m
+          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+          SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
+                                                         MVT::i8));
+          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+          return Res;
+        }
+        llvm_unreachable("Unknown shift opcode.");
+      }
+
+      if (Subtarget->hasInt256() && VT == MVT::v32i8) {
+        if (Op.getOpcode() == ISD::SHL) {
+          // Make a large shift.
+          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
+                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
+          // Zero out the rightmost bits.
+          SmallVector<SDValue, 32> V(32,
+                                     DAG.getConstant(uint8_t(-1U << ShiftAmt),
+                                                     MVT::i8));
+          return DAG.getNode(ISD::AND, dl, VT, SHL,
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+        }
+        if (Op.getOpcode() == ISD::SRL) {
+          // Make a large shift.
+          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
+                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
+          // Zero out the leftmost bits.
+          SmallVector<SDValue, 32> V(32,
+                                     DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+                                                     MVT::i8));
+          return DAG.getNode(ISD::AND, dl, VT, SRL,
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+        }
+        if (Op.getOpcode() == ISD::SRA) {
+          if (ShiftAmt == 7) {
+            // R s>> 7  ===  R s< 0
+            SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+            return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
+          }
+
+          // R s>> a === ((R u>> a) ^ m) - m
+          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+          SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
+                                                         MVT::i8));
+          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+          return Res;
+        }
+        llvm_unreachable("Unknown shift opcode.");
+      }
+    }
+  }
+
+  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+  if (!Subtarget->is64Bit() &&
+      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+      Amt.getOpcode() == ISD::BITCAST &&
+      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+    Amt = Amt.getOperand(0);
+    unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+                     VT.getVectorNumElements();
+    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
+    uint64_t ShiftAmt = 0;
+    for (unsigned i = 0; i != Ratio; ++i) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
+      if (C == 0)
+        return SDValue();
+      // 6 == Log2(64)
+      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
+    }
+    // Check remaining shift amounts.
+    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+      uint64_t ShAmt = 0;
+      for (unsigned j = 0; j != Ratio; ++j) {
+        ConstantSDNode *C =
+          dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
+        if (C == 0)
+          return SDValue();
+        // 6 == Log2(64)
+        ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
+      }
+      if (ShAmt != ShiftAmt)
+        return SDValue();
+    }
+    switch (Op.getOpcode()) {
+    default:
+      llvm_unreachable("Unknown shift opcode!");
+    case ISD::SHL:
+      return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
+                         DAG.getConstant(ShiftAmt, MVT::i32));
+    case ISD::SRL:
+      return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
+                         DAG.getConstant(ShiftAmt, MVT::i32));
+    case ISD::SRA:
+      return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
+                         DAG.getConstant(ShiftAmt, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
+                                        const X86Subtarget* Subtarget) {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+
+  if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
+      VT == MVT::v4i32 || VT == MVT::v8i16 ||
+      (Subtarget->hasInt256() &&
+       ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
+        VT == MVT::v8i32 || VT == MVT::v16i16))) {
+    SDValue BaseShAmt;
+    EVT EltVT = VT.getVectorElementType();
+
+    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+      unsigned NumElts = VT.getVectorNumElements();
+      unsigned i, j;
+      for (i = 0; i != NumElts; ++i) {
+        if (Amt.getOperand(i).getOpcode() == ISD::UNDEF)
+          continue;
+        break;
+      }
+      for (j = i; j != NumElts; ++j) {
+        SDValue Arg = Amt.getOperand(j);
+        if (Arg.getOpcode() == ISD::UNDEF) continue;
+        if (Arg != Amt.getOperand(i))
+          break;
+      }
+      if (i != NumElts && j == NumElts)
+        BaseShAmt = Amt.getOperand(i);
+    } else {
+      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+        Amt = Amt.getOperand(0);
+      if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE &&
+               cast<ShuffleVectorSDNode>(Amt)->isSplat()) {
+        SDValue InVec = Amt.getOperand(0);
+        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+          unsigned NumElts = InVec.getValueType().getVectorNumElements();
+          unsigned i = 0;
+          for (; i != NumElts; ++i) {
+            SDValue Arg = InVec.getOperand(i);
+            if (Arg.getOpcode() == ISD::UNDEF) continue;
+            BaseShAmt = Arg;
+            break;
+          }
+        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+           if (ConstantSDNode *C =
+               dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
+             unsigned SplatIdx =
+               cast<ShuffleVectorSDNode>(Amt)->getSplatIndex();
+             if (C->getZExtValue() == SplatIdx)
+               BaseShAmt = InVec.getOperand(1);
+           }
+        }
+        if (BaseShAmt.getNode() == 0)
+          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
+                                  DAG.getIntPtrConstant(0));
+      }
+    }
+
+    if (BaseShAmt.getNode()) {
+      if (EltVT.bitsGT(MVT::i32))
+        BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt);
+      else if (EltVT.bitsLT(MVT::i32))
+        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
+
+      switch (Op.getOpcode()) {
+      default:
+        llvm_unreachable("Unknown shift opcode!");
+      case ISD::SHL:
+        switch (VT.getSimpleVT().SimpleTy) {
+        default: return SDValue();
+        case MVT::v2i64:
+        case MVT::v4i32:
+        case MVT::v8i16:
+        case MVT::v4i64:
+        case MVT::v8i32:
+        case MVT::v16i16:
+          return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
+        }
+      case ISD::SRA:
+        switch (VT.getSimpleVT().SimpleTy) {
+        default: return SDValue();
+        case MVT::v4i32:
+        case MVT::v8i16:
+        case MVT::v8i32:
+        case MVT::v16i16:
+          return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
+        }
+      case ISD::SRL:
+        switch (VT.getSimpleVT().SimpleTy) {
+        default: return SDValue();
+        case MVT::v2i64:
+        case MVT::v4i32:
+        case MVT::v8i16:
+        case MVT::v4i64:
+        case MVT::v8i32:
+        case MVT::v16i16:
+          return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
+        }
+      }
+    }
+  }
+
+  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+  if (!Subtarget->is64Bit() &&
+      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+      Amt.getOpcode() == ISD::BITCAST &&
+      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+    Amt = Amt.getOperand(0);
+    unsigned Ratio = Amt.getValueType().getVectorNumElements() /
+                     VT.getVectorNumElements();
+    std::vector<SDValue> Vals(Ratio);
+    for (unsigned i = 0; i != Ratio; ++i)
+      Vals[i] = Amt.getOperand(i);
+    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
+      for (unsigned j = 0; j != Ratio; ++j)
+        if (Vals[j] != Amt.getOperand(i + j))
+          return SDValue();
+    }
+    switch (Op.getOpcode()) {
+    default:
+      llvm_unreachable("Unknown shift opcode!");
+    case ISD::SHL:
+      return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
+    case ISD::SRL:
+      return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
+    case ISD::SRA:
+      return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue R = Op.getOperand(0);
+  SDValue Amt = Op.getOperand(1);
+  SDValue V;
+
+  if (!Subtarget->hasSSE2())
+    return SDValue();
+
+  V = LowerScalarImmediateShift(Op, DAG, Subtarget);
+  if (V.getNode())
+    return V;
+
+  V = LowerScalarVariableShift(Op, DAG, Subtarget);
+  if (V.getNode())
+      return V;
+
+  // AVX2 has VPSLLV/VPSRAV/VPSRLV.
+  if (Subtarget->hasInt256()) {
+    if (Op.getOpcode() == ISD::SRL &&
+        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+         VT == MVT::v4i64 || VT == MVT::v8i32))
+      return Op;
+    if (Op.getOpcode() == ISD::SHL &&
+        (VT == MVT::v2i64 || VT == MVT::v4i32 ||
+         VT == MVT::v4i64 || VT == MVT::v8i32))
+      return Op;
+    if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
+      return Op;
+  }
+
+  // Lower SHL with variable shift amount.
+  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
+    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
+
+    Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
+    Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
+    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
+    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
+  }
+  if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
+    assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
+
+    // a = a << 5;
+    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
+    Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+    // Turn 'a' into a mask suitable for VSELECT
+    SDValue VSelM = DAG.getConstant(0x80, VT);
+    SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+    SDValue CM1 = DAG.getConstant(0x0f, VT);
+    SDValue CM2 = DAG.getConstant(0x3f, VT);
+
+    // r = VSELECT(r, psllw(r & (char16)15, 4), a);
+    SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
+    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
+                            DAG.getConstant(4, MVT::i32), DAG);
+    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
+    // a += a
+    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+    // r = VSELECT(r, psllw(r & (char16)63, 2), a);
+    M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
+    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
+                            DAG.getConstant(2, MVT::i32), DAG);
+    M = DAG.getNode(ISD::BITCAST, dl, VT, M);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
+
+    // a += a
+    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
+    OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
+    OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
+
+    // return VSELECT(r, r+r, a);
+    R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
+                    DAG.getNode(ISD::ADD, dl, VT, R, R), R);
+    return R;
+  }
+
+  // Decompose 256-bit shifts into smaller 128-bit shifts.
+  if (VT.is256BitVector()) {
+    unsigned NumElems = VT.getVectorNumElements();
+    MVT EltVT = VT.getVectorElementType().getSimpleVT();
+    EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+    // Extract the two vectors
+    SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
+    SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
+
+    // Recreate the shift amount vectors
+    SDValue Amt1, Amt2;
+    if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+      // Constant shift amount
+      SmallVector<SDValue, 4> Amt1Csts;
+      SmallVector<SDValue, 4> Amt2Csts;
+      for (unsigned i = 0; i != NumElems/2; ++i)
+        Amt1Csts.push_back(Amt->getOperand(i));
+      for (unsigned i = NumElems/2; i != NumElems; ++i)
+        Amt2Csts.push_back(Amt->getOperand(i));
+
+      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                                 &Amt1Csts[0], NumElems/2);
+      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+                                 &Amt2Csts[0], NumElems/2);
+    } else {
+      // Variable shift amount
+      Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
+      Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
+    }
+
+    // Issue new vector shifts for the smaller types
+    V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+    V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+    // Concatenate the result back
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+  }
+
+  return SDValue();
+}
+
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
+  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
+  // looks for this combo and may remove the "setcc" instruction if the "setcc"
+  // has only one use.
+  SDNode *N = Op.getNode();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  unsigned BaseOp = 0;
+  unsigned Cond = 0;
+  DebugLoc DL = Op.getDebugLoc();
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Unknown ovf instruction!");
+  case ISD::SADDO:
+    // A subtract of one will be selected as a INC. Note that INC doesn't
+    // set CF, so we can't do this for UADDO.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+      if (C->isOne()) {
+        BaseOp = X86ISD::INC;
+        Cond = X86::COND_O;
+        break;
+      }
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UADDO:
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_B;
+    break;
+  case ISD::SSUBO:
+    // A subtract of one will be selected as a DEC. Note that DEC doesn't
+    // set CF, so we can't do this for USUBO.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
+      if (C->isOne()) {
+        BaseOp = X86ISD::DEC;
+        Cond = X86::COND_O;
+        break;
+      }
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_O;
+    break;
+  case ISD::USUBO:
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_B;
+    break;
+  case ISD::SMULO:
+    BaseOp = X86ISD::SMUL;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
+    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
+                                 MVT::i32);
+    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
+
+    SDValue SetCC =
+      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                  DAG.getConstant(X86::COND_O, MVT::i32),
+                  SDValue(Sum.getNode(), 2));
+
+    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+  }
+  }
+
+  // Also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
+
+  SDValue SetCC =
+    DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
+                DAG.getConstant(Cond, MVT::i32),
+                SDValue(Sum.getNode(), 1));
+
+  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
+}
+
+SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  EVT VT = Op.getValueType();
+
+  if (!Subtarget->hasSSE2() || !VT.isVector())
+    return SDValue();
+
+  unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+                      ExtraVT.getScalarType().getSizeInBits();
+  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
+
+  switch (VT.getSimpleVT().SimpleTy) {
+    default: return SDValue();
+    case MVT::v8i32:
+    case MVT::v16i16:
+      if (!Subtarget->hasFp256())
+        return SDValue();
+      if (!Subtarget->hasInt256()) {
+        // needs to be split
+        unsigned NumElems = VT.getVectorNumElements();
+
+        // Extract the LHS vectors
+        SDValue LHS = Op.getOperand(0);
+        SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
+        SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
+
+        MVT EltVT = VT.getVectorElementType().getSimpleVT();
+        EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+        EVT ExtraEltVT = ExtraVT.getVectorElementType();
+        unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
+        ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
+                                   ExtraNumElems/2);
+        SDValue Extra = DAG.getValueType(ExtraVT);
+
+        LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
+        LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
+
+        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
+      }
+      // fall through
+    case MVT::v4i32:
+    case MVT::v8i16: {
+      // (sext (vzext x)) -> (vsext x)
+      SDValue Op0 = Op.getOperand(0);
+      SDValue Op00 = Op0.getOperand(0);
+      SDValue Tmp1;
+      // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
+      if (Op0.getOpcode() == ISD::BITCAST &&
+          Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
+        Tmp1 = LowerVectorIntExtend(Op00, DAG);
+      if (Tmp1.getNode()) {
+        SDValue Tmp1Op0 = Tmp1.getOperand(0);
+        assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+               "This optimization is invalid without a VZEXT.");
+        return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+      }
+
+      // If the above didn't work, then just use Shift-Left + Shift-Right.
+      Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG);
+      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
+    }
+  }
+}
+
+static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
+                              SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
+  // There isn't any reason to disable it if the target processor supports it.
+  if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
+    SDValue Chain = Op.getOperand(0);
+    SDValue Zero = DAG.getConstant(0, MVT::i32);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::ESP, MVT::i32), // Base
+      DAG.getTargetConstant(1, MVT::i8),   // Scale
+      DAG.getRegister(0, MVT::i32),        // Index
+      DAG.getTargetConstant(0, MVT::i32),  // Disp
+      DAG.getRegister(0, MVT::i32),        // Segment.
+      Zero,
+      Chain
+    };
+    SDNode *Res =
+      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
+                          array_lengthof(Ops));
+    return SDValue(Res, 0);
+  }
+
+  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
+  if (!isDev)
+    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+
+  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+
+  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
+  if (!Op1 && !Op2 && !Op3 && Op4)
+    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));
+
+  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
+  if (Op1 && !Op2 && !Op3 && !Op4)
+    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));
+
+  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
+  //           (MFENCE)>;
+  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+}
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
+                                 SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // The only fence that needs an instruction is a sequentially-consistent
+  // cross-thread fence.
+  if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
+    // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+    // no-sse2). There isn't any reason to disable it if the target processor
+    // supports it.
+    if (Subtarget->hasSSE2() || Subtarget->is64Bit())
+      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
+    SDValue Chain = Op.getOperand(0);
+    SDValue Zero = DAG.getConstant(0, MVT::i32);
+    SDValue Ops[] = {
+      DAG.getRegister(X86::ESP, MVT::i32), // Base
+      DAG.getTargetConstant(1, MVT::i8),   // Scale
+      DAG.getRegister(0, MVT::i32),        // Index
+      DAG.getTargetConstant(0, MVT::i32),  // Disp
+      DAG.getRegister(0, MVT::i32),        // Segment.
+      Zero,
+      Chain
+    };
+    SDNode *Res =
+      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
+                         array_lengthof(Ops));
+    return SDValue(Res, 0);
+  }
+
+  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
+
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG) {
+  EVT T = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+  unsigned Reg = 0;
+  unsigned size = 0;
+  switch(T.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Invalid value type!");
+  case MVT::i8:  Reg = X86::AL;  size = 1; break;
+  case MVT::i16: Reg = X86::AX;  size = 2; break;
+  case MVT::i32: Reg = X86::EAX; size = 4; break;
+  case MVT::i64:
+    assert(Subtarget->is64Bit() && "Node not type legal!");
+    Reg = X86::RAX; size = 8;
+    break;
+  }
+  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
+                                    Op.getOperand(2), SDValue());
+  SDValue Ops[] = { cpIn.getValue(0),
+                    Op.getOperand(1),
+                    Op.getOperand(3),
+                    DAG.getTargetConstant(size, MVT::i8),
+                    cpIn.getValue(1) };
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
+  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
+                                           Ops, 5, T, MMO);
+  SDValue cpOut =
+    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
+  return cpOut;
+}
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+                                     SelectionDAG &DAG) {
+  assert(Subtarget->is64Bit() && "Result not type legalized?");
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue TheChain = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
+  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
+                                   rax.getValue(2));
+  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
+                            DAG.getConstant(32, MVT::i8));
+  SDValue Ops[] = {
+    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
+    rdx.getValue(1)
+  };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
+  EVT SrcVT = Op.getOperand(0).getValueType();
+  EVT DstVT = Op.getValueType();
+  assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
+         Subtarget->hasMMX() && "Unexpected custom BITCAST");
+  assert((DstVT == MVT::i64 ||
+          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
+         "Unexpected custom BITCAST");
+  // i64 <=> MMX conversions are Legal.
+  if (SrcVT==MVT::i64 && DstVT.isVector())
+    return Op;
+  if (DstVT==MVT::i64 && SrcVT.isVector())
+    return Op;
+  // MMX <=> MMX conversions are Legal.
+  if (SrcVT.isVector() && DstVT.isVector())
+    return Op;
+  // All other conversions need to be expanded.
+  return SDValue();
+}
+
+static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  EVT T = Node->getValueType(0);
+  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
+                              DAG.getConstant(0, T), Node->getOperand(2));
+  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
+                       cast<AtomicSDNode>(Node)->getMemoryVT(),
+                       Node->getOperand(0),
+                       Node->getOperand(1), negOp,
+                       cast<AtomicSDNode>(Node)->getSrcValue(),
+                       cast<AtomicSDNode>(Node)->getAlignment(),
+                       cast<AtomicSDNode>(Node)->getOrdering(),
+                       cast<AtomicSDNode>(Node)->getSynchScope());
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+  // Convert seq_cst store -> xchg
+  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+  // FIXME: On 32-bit, store -> fist or movq would be more efficient
+  //        (The only way to get a 16-byte store is cmpxchg16b)
+  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+  if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
+                                 Node->getOperand(0),
+                                 Node->getOperand(1), Node->getOperand(2),
+                                 cast<AtomicSDNode>(Node)->getMemOperand(),
+                                 cast<AtomicSDNode>(Node)->getOrdering(),
+                                 cast<AtomicSDNode>(Node)->getSynchScope());
+    return Swap.getValue(1);
+  }
+  // Other atomic stores have a simple pattern.
+  return Op;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getNode()->getValueType(0);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid code");
+  case ISD::ADDC: Opc = X86ISD::ADD; break;
+  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
+  case ISD::SUBC: Opc = X86ISD::SUB; break;
+  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
+  }
+
+  if (!ExtraOp)
+    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+                       Op.getOperand(1));
+  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+                     Op.getOperand(1), Op.getOperand(2));
+}
+
+SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
+
+  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
+  // which returns the values in two XMM registers.
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  ArgListTy Args;
+  ArgListEntry Entry;
+
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
+  // the small struct {f32, f32} is returned in (eax, edx). For f64,
+  // the results are returned via SRet in memory.
+  const char *LibcallName = (ArgVT == MVT::f64)
+    ? "__sincos_stret" : "__sincosf_stret";
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+  TargetLowering::
+    CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
+                         false, false, false, false, 0,
+                         CallingConv::C, /*isTaillCall=*/false,
+                         /*doesNotRet=*/false, /*isReturnValueUsed*/true,
+                         Callee, Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.first;
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Should not custom lower this!");
+  case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
+  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op, Subtarget, DAG);
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, Subtarget, DAG);
+  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
+  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::SHL_PARTS:
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
+  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
+  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, DAG);
+  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
+  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
+  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
+  case ISD::FABS:               return LowerFABS(Op, DAG);
+  case ISD::FNEG:               return LowerFNEG(Op, DAG);
+  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::SELECT:             return LowerSELECT(Op, DAG);
+  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  case ISD::FRAME_TO_ARGS_OFFSET:
+                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
+  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
+  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
+  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
+  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
+  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:                return LowerShift(Op, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:              return LowerXALUO(Op, DAG);
+  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
+  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::ADD:                return LowerADD(Op, DAG);
+  case ISD::SUB:                return LowerSUB(Op, DAG);
+  case ISD::SDIV:               return LowerSDIV(Op, DAG);
+  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
+  }
+}
+
+static void ReplaceATOMIC_LOAD(SDNode *Node,
+                                  SmallVectorImpl<SDValue> &Results,
+                                  SelectionDAG &DAG) {
+  DebugLoc dl = Node->getDebugLoc();
+  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+  // Convert wide load -> cmpxchg8b/cmpxchg16b
+  // FIXME: On 32-bit, load -> fild or movq would be more efficient
+  //        (The only way to get a 16-byte load is cmpxchg16b)
+  // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
+  SDValue Zero = DAG.getConstant(0, VT);
+  SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
+                               Node->getOperand(0),
+                               Node->getOperand(1), Zero, Zero,
+                               cast<AtomicSDNode>(Node)->getMemOperand(),
+                               cast<AtomicSDNode>(Node)->getOrdering(),
+                               cast<AtomicSDNode>(Node)->getSynchScope());
+  Results.push_back(Swap.getValue(0));
+  Results.push_back(Swap.getValue(1));
+}
+
+static void
+ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
+                        SelectionDAG &DAG, unsigned NewOp) {
+  DebugLoc dl = Node->getDebugLoc();
+  assert (Node->getValueType(0) == MVT::i64 &&
+          "Only know how to expand i64 atomics");
+
+  SDValue Chain = Node->getOperand(0);
+  SDValue In1 = Node->getOperand(1);
+  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             Node->getOperand(2), DAG.getIntPtrConstant(0));
+  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             Node->getOperand(2), DAG.getIntPtrConstant(1));
+  SDValue Ops[] = { Chain, In1, In2L, In2H };
+  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+  SDValue Result =
+    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
+                            cast<MemSDNode>(Node)->getMemOperand());
+  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+  Results.push_back(Result.getValue(2));
+}
+
+/// ReplaceNodeResults - Replace a node with an illegal result type
+/// with a new node built out of custom code.
+void X86TargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) const {
+  DebugLoc dl = N->getDebugLoc();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::SIGN_EXTEND_INREG:
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:
+    // We don't want to expand or promote these.
+    return;
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT: {
+    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+    if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
+      return;
+
+    std::pair<SDValue,SDValue> Vals =
+        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
+    SDValue FIST = Vals.first, StackSlot = Vals.second;
+    if (FIST.getNode() != 0) {
+      EVT VT = N->getValueType(0);
+      // Return a load from the stack slot.
+      if (StackSlot.getNode() != 0)
+        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
+                                      MachinePointerInfo(),
+                                      false, false, false, 0));
+      else
+        Results.push_back(FIST);
+    }
+    return;
+  }
+  case ISD::UINT_TO_FP: {
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
+        N->getValueType(0) != MVT::v2f32)
+      return;
+    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
+                                 N->getOperand(0));
+    SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+                                     MVT::f64);
+    SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
+    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+                             DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
+    Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
+    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+    return;
+  }
+  case ISD::FP_ROUND: {
+    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+        return;
+    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+    Results.push_back(V);
+    return;
+  }
+  case ISD::READCYCLECOUNTER: {
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue TheChain = N->getOperand(0);
+    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
+                                     rd.getValue(1));
+    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
+                                     eax.getValue(2));
+    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+    SDValue Ops[] = { eax, edx };
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
+    Results.push_back(edx.getValue(1));
+    return;
+  }
+  case ISD::ATOMIC_CMP_SWAP: {
+    EVT T = N->getValueType(0);
+    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+    bool Regs64bit = T == MVT::i128;
+    EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
+    SDValue cpInL, cpInH;
+    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+                        DAG.getConstant(0, HalfT));
+    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+                        DAG.getConstant(1, HalfT));
+    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+                             Regs64bit ? X86::RAX : X86::EAX,
+                             cpInL, SDValue());
+    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+                             Regs64bit ? X86::RDX : X86::EDX,
+                             cpInH, cpInL.getValue(1));
+    SDValue swapInL, swapInH;
+    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+                          DAG.getConstant(0, HalfT));
+    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+                          DAG.getConstant(1, HalfT));
+    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
+                               Regs64bit ? X86::RBX : X86::EBX,
+                               swapInL, cpInH.getValue(1));
+    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
+                               Regs64bit ? X86::RCX : X86::ECX,
+                               swapInH, swapInL.getValue(1));
+    SDValue Ops[] = { swapInH.getValue(0),
+                      N->getOperand(1),
+                      swapInH.getValue(1) };
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
+    unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
+                                  X86ISD::LCMPXCHG8_DAG;
+    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
+                                             Ops, 3, T, MMO);
+    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+                                        Regs64bit ? X86::RAX : X86::EAX,
+                                        HalfT, Result.getValue(1));
+    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+                                        Regs64bit ? X86::RDX : X86::EDX,
+                                        HalfT, cpOutL.getValue(2));
+    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
+    Results.push_back(cpOutH.getValue(1));
+    return;
+  }
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_AND:
+  case ISD::ATOMIC_LOAD_NAND:
+  case ISD::ATOMIC_LOAD_OR:
+  case ISD::ATOMIC_LOAD_SUB:
+  case ISD::ATOMIC_LOAD_XOR:
+  case ISD::ATOMIC_LOAD_MAX:
+  case ISD::ATOMIC_LOAD_MIN:
+  case ISD::ATOMIC_LOAD_UMAX:
+  case ISD::ATOMIC_LOAD_UMIN:
+  case ISD::ATOMIC_SWAP: {
+    unsigned Opc;
+    switch (N->getOpcode()) {
+    default: llvm_unreachable("Unexpected opcode");
+    case ISD::ATOMIC_LOAD_ADD:
+      Opc = X86ISD::ATOMADD64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_AND:
+      Opc = X86ISD::ATOMAND64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_NAND:
+      Opc = X86ISD::ATOMNAND64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_OR:
+      Opc = X86ISD::ATOMOR64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_SUB:
+      Opc = X86ISD::ATOMSUB64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_XOR:
+      Opc = X86ISD::ATOMXOR64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_MAX:
+      Opc = X86ISD::ATOMMAX64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_MIN:
+      Opc = X86ISD::ATOMMIN64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_UMAX:
+      Opc = X86ISD::ATOMUMAX64_DAG;
+      break;
+    case ISD::ATOMIC_LOAD_UMIN:
+      Opc = X86ISD::ATOMUMIN64_DAG;
+      break;
+    case ISD::ATOMIC_SWAP:
+      Opc = X86ISD::ATOMSWAP64_DAG;
+      break;
+    }
+    ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
+    return;
+  }
+  case ISD::ATOMIC_LOAD:
+    ReplaceATOMIC_LOAD(N, Results, DAG);
+  }
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return NULL;
+  case X86ISD::BSF:                return "X86ISD::BSF";
+  case X86ISD::BSR:                return "X86ISD::BSR";
+  case X86ISD::SHLD:               return "X86ISD::SHLD";
+  case X86ISD::SHRD:               return "X86ISD::SHRD";
+  case X86ISD::FAND:               return "X86ISD::FAND";
+  case X86ISD::FOR:                return "X86ISD::FOR";
+  case X86ISD::FXOR:               return "X86ISD::FXOR";
+  case X86ISD::FSRL:               return "X86ISD::FSRL";
+  case X86ISD::FILD:               return "X86ISD::FILD";
+  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
+  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+  case X86ISD::FLD:                return "X86ISD::FLD";
+  case X86ISD::FST:                return "X86ISD::FST";
+  case X86ISD::CALL:               return "X86ISD::CALL";
+  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
+  case X86ISD::BT:                 return "X86ISD::BT";
+  case X86ISD::CMP:                return "X86ISD::CMP";
+  case X86ISD::COMI:               return "X86ISD::COMI";
+  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
+  case X86ISD::SETCC:              return "X86ISD::SETCC";
+  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
+  case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
+  case X86ISD::FSETCCss:           return "X86ISD::FSETCCss";
+  case X86ISD::CMOV:               return "X86ISD::CMOV";
+  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
+  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
+  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
+  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
+  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
+  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
+  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
+  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
+  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
+  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
+  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
+  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
+  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
+  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
+  case X86ISD::PSIGN:              return "X86ISD::PSIGN";
+  case X86ISD::BLENDV:             return "X86ISD::BLENDV";
+  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
+  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
+  case X86ISD::HADD:               return "X86ISD::HADD";
+  case X86ISD::HSUB:               return "X86ISD::HSUB";
+  case X86ISD::FHADD:              return "X86ISD::FHADD";
+  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
+  case X86ISD::UMAX:               return "X86ISD::UMAX";
+  case X86ISD::UMIN:               return "X86ISD::UMIN";
+  case X86ISD::SMAX:               return "X86ISD::SMAX";
+  case X86ISD::SMIN:               return "X86ISD::SMIN";
+  case X86ISD::FMAX:               return "X86ISD::FMAX";
+  case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
+  case X86ISD::FMINC:              return "X86ISD::FMINC";
+  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
+  case X86ISD::FRCP:               return "X86ISD::FRCP";
+  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
+  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
+  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
+  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
+  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
+  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
+  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
+  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
+  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
+  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
+  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
+  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
+  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
+  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
+  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
+  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
+  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
+  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
+  case X86ISD::VSEXT_MOVL:         return "X86ISD::VSEXT_MOVL";
+  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
+  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
+  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
+  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
+  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
+  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
+  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
+  case X86ISD::VSHL:               return "X86ISD::VSHL";
+  case X86ISD::VSRL:               return "X86ISD::VSRL";
+  case X86ISD::VSRA:               return "X86ISD::VSRA";
+  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
+  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
+  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
+  case X86ISD::CMPP:               return "X86ISD::CMPP";
+  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
+  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
+  case X86ISD::ADD:                return "X86ISD::ADD";
+  case X86ISD::SUB:                return "X86ISD::SUB";
+  case X86ISD::ADC:                return "X86ISD::ADC";
+  case X86ISD::SBB:                return "X86ISD::SBB";
+  case X86ISD::SMUL:               return "X86ISD::SMUL";
+  case X86ISD::UMUL:               return "X86ISD::UMUL";
+  case X86ISD::INC:                return "X86ISD::INC";
+  case X86ISD::DEC:                return "X86ISD::DEC";
+  case X86ISD::OR:                 return "X86ISD::OR";
+  case X86ISD::XOR:                return "X86ISD::XOR";
+  case X86ISD::AND:                return "X86ISD::AND";
+  case X86ISD::BLSI:               return "X86ISD::BLSI";
+  case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
+  case X86ISD::BLSR:               return "X86ISD::BLSR";
+  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
+  case X86ISD::PTEST:              return "X86ISD::PTEST";
+  case X86ISD::TESTP:              return "X86ISD::TESTP";
+  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
+  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
+  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
+  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
+  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
+  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
+  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
+  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
+  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
+  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
+  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
+  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
+  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
+  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
+  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
+  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
+  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
+  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
+  case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
+  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
+  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
+  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
+  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
+  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
+  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
+  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
+  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
+  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
+  case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
+  case X86ISD::SAHF:               return "X86ISD::SAHF";
+  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
+  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
+  case X86ISD::FMADD:              return "X86ISD::FMADD";
+  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
+  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
+  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
+  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
+  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
+  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
+  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
+  case X86ISD::XTEST:              return "X86ISD::XTEST";
+  }
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              Type *Ty) const {
+  // X86 supports extremely general addressing modes.
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+  Reloc::Model R = getTargetMachine().getRelocationModel();
+
+  // X86 allows a sign-extended 32-bit immediate field as a displacement.
+  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
+    return false;
+
+  if (AM.BaseGV) {
+    unsigned GVFlags =
+      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
+
+    // If a reference to this global requires an extra load, we can't fold it.
+    if (isGlobalStubReference(GVFlags))
+      return false;
+
+    // If BaseGV requires a register for the PIC base, we cannot also have a
+    // BaseReg specified.
+    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
+      return false;
+
+    // If lower 4G is not available, then we must use rip-relative addressing.
+    if ((M != CodeModel::Small || R != Reloc::Static) &&
+        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+      return false;
+  }
+
+  switch (AM.Scale) {
+  case 0:
+  case 1:
+  case 2:
+  case 4:
+  case 8:
+    // These scales always work.
+    break;
+  case 3:
+  case 5:
+  case 9:
+    // These scales are formed with basereg+scalereg.  Only accept if there is
+    // no basereg yet.
+    if (AM.HasBaseReg)
+      return false;
+    break;
+  default:  // Other stuff never works.
+    return false;
+  }
+
+  return true;
+}
+
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  // Can also use sub to handle negated immediates.
+  return isInt<32>(Imm);
+}
+
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 > NumBits2;
+}
+
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  EVT VT1 = Val.getValueType();
+  if (isZExtFree(VT1, VT2))
+    return true;
+
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    // X86 has 8, 16, and 32-bit zero-extending loads.
+    return true;
+  }
+
+  return false;
+}
+
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
+  // i16 instructions are longer (0x66 prefix) and potentially slower.
+  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                      EVT VT) const {
+  // Very little shuffling can be done for 64-bit vectors right now.
+  if (VT.getSizeInBits() == 64)
+    return false;
+
+  // FIXME: pshufb, blends, shifts.
+  return (VT.getVectorNumElements() == 2 ||
+          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+          isMOVLMask(M, VT) ||
+          isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
+          isPSHUFDMask(M, VT) ||
+          isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
+          isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
+          isPALIGNRMask(M, VT, Subtarget) ||
+          isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
+          isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
+          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
+          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
+}
+
+bool
+X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+                                          EVT VT) const {
+  unsigned NumElts = VT.getVectorNumElements();
+  // FIXME: This collection of masks seems suspect.
+  if (NumElts == 2)
+    return true;
+  if (NumElts == 4 && VT.is128BitVector()) {
+    return (isMOVLMask(Mask, VT)  ||
+            isCommutedMOVLMask(Mask, VT, true) ||
+            isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
+            isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+/// Utility function to emit xbegin specifying the start of an RTM region.
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+                                     const TargetInstrInfo *TII) {
+  DebugLoc DL = MI->getDebugLoc();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  // For the v = xbegin(), we generate
+  //
+  // thisMBB:
+  //  xbegin sinkMBB
+  //
+  // mainMBB:
+  //  eax = -1
+  //
+  // sinkMBB:
+  //  v = eax
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  //  xbegin sinkMBB
+  //  # fallthrough to mainMBB
+  //  # abortion to sinkMBB
+  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(sinkMBB);
+
+  // mainMBB:
+  //  EAX = -1
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  // EAX is live into the sinkMBB
+  sinkMBB->addLiveIn(X86::EAX);
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+    .addReg(X86::EAX);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
+
+// Get CMPXCHG opcode for the specified data type.
+static unsigned getCmpXChgOpcode(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i8:  return X86::LCMPXCHG8;
+  case MVT::i16: return X86::LCMPXCHG16;
+  case MVT::i32: return X86::LCMPXCHG32;
+  case MVT::i64: return X86::LCMPXCHG64;
+  default:
+    break;
+  }
+  llvm_unreachable("Invalid operand size!");
+}
+
+// Get LOAD opcode for the specified data type.
+static unsigned getLoadOpcode(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i8:  return X86::MOV8rm;
+  case MVT::i16: return X86::MOV16rm;
+  case MVT::i32: return X86::MOV32rm;
+  case MVT::i64: return X86::MOV64rm;
+  default:
+    break;
+  }
+  llvm_unreachable("Invalid operand size!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction.
+static unsigned getNonAtomicOpcode(unsigned Opc) {
+  switch (Opc) {
+  case X86::ATOMAND8:  return X86::AND8rr;
+  case X86::ATOMAND16: return X86::AND16rr;
+  case X86::ATOMAND32: return X86::AND32rr;
+  case X86::ATOMAND64: return X86::AND64rr;
+  case X86::ATOMOR8:   return X86::OR8rr;
+  case X86::ATOMOR16:  return X86::OR16rr;
+  case X86::ATOMOR32:  return X86::OR32rr;
+  case X86::ATOMOR64:  return X86::OR64rr;
+  case X86::ATOMXOR8:  return X86::XOR8rr;
+  case X86::ATOMXOR16: return X86::XOR16rr;
+  case X86::ATOMXOR32: return X86::XOR32rr;
+  case X86::ATOMXOR64: return X86::XOR64rr;
+  }
+  llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction with
+// extra opcode.
+static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
+                                               unsigned &ExtraOpc) {
+  switch (Opc) {
+  case X86::ATOMNAND8:  ExtraOpc = X86::NOT8r;   return X86::AND8rr;
+  case X86::ATOMNAND16: ExtraOpc = X86::NOT16r;  return X86::AND16rr;
+  case X86::ATOMNAND32: ExtraOpc = X86::NOT32r;  return X86::AND32rr;
+  case X86::ATOMNAND64: ExtraOpc = X86::NOT64r;  return X86::AND64rr;
+  case X86::ATOMMAX8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVL32rr;
+  case X86::ATOMMAX16:  ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
+  case X86::ATOMMAX32:  ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
+  case X86::ATOMMAX64:  ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
+  case X86::ATOMMIN8:   ExtraOpc = X86::CMP8rr;  return X86::CMOVG32rr;
+  case X86::ATOMMIN16:  ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
+  case X86::ATOMMIN32:  ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
+  case X86::ATOMMIN64:  ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
+  case X86::ATOMUMAX8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVB32rr;
+  case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
+  case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
+  case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
+  case X86::ATOMUMIN8:  ExtraOpc = X86::CMP8rr;  return X86::CMOVA32rr;
+  case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
+  case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
+  case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
+  }
+  llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction for
+// 64-bit data type on 32-bit target.
+static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
+  switch (Opc) {
+  case X86::ATOMAND6432:  HiOpc = X86::AND32rr; return X86::AND32rr;
+  case X86::ATOMOR6432:   HiOpc = X86::OR32rr;  return X86::OR32rr;
+  case X86::ATOMXOR6432:  HiOpc = X86::XOR32rr; return X86::XOR32rr;
+  case X86::ATOMADD6432:  HiOpc = X86::ADC32rr; return X86::ADD32rr;
+  case X86::ATOMSUB6432:  HiOpc = X86::SBB32rr; return X86::SUB32rr;
+  case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
+  case X86::ATOMMAX6432:  HiOpc = X86::SETLr;   return X86::SETLr;
+  case X86::ATOMMIN6432:  HiOpc = X86::SETGr;   return X86::SETGr;
+  case X86::ATOMUMAX6432: HiOpc = X86::SETBr;   return X86::SETBr;
+  case X86::ATOMUMIN6432: HiOpc = X86::SETAr;   return X86::SETAr;
+  }
+  llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction for
+// 64-bit data type on 32-bit target with extra opcode.
+static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
+                                                   unsigned &HiOpc,
+                                                   unsigned &ExtraOpc) {
+  switch (Opc) {
+  case X86::ATOMNAND6432:
+    ExtraOpc = X86::NOT32r;
+    HiOpc = X86::AND32rr;
+    return X86::AND32rr;
+  }
+  llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get pseudo CMOV opcode from the specified data type.
+static unsigned getPseudoCMOVOpc(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i8:  return X86::CMOV_GR8;
+  case MVT::i16: return X86::CMOV_GR16;
+  case MVT::i32: return X86::CMOV_GR32;
+  default:
+    break;
+  }
+  llvm_unreachable("Unknown CMOV opcode!");
+}
+
+// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
+// They will be translated into a spin-loop or compare-exchange loop from
+//
+//    ...
+//    dst = atomic-fetch-op MI.addr, MI.val
+//    ...
+//
+// to
+//
+//    ...
+//    t1 = LOAD MI.addr
+// loop:
+//    t4 = phi(t1, t3 / loop)
+//    t2 = OP MI.val, t4
+//    EAX = t4
+//    LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
+//    t3 = EAX
+//    JNE loop
+// sink:
+//    dst = t3
+//    ...
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
+                                       MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
+         "Unexpected number of operands");
+
+  assert(MI->hasOneMemOperand() &&
+         "Expected atomic-load-op to have one memoperand");
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstReg, SrcReg;
+  unsigned MemOpndSlot;
+
+  unsigned CurOp = 0;
+
+  DstReg = MI->getOperand(CurOp++).getReg();
+  MemOpndSlot = CurOp;
+  CurOp += X86::AddrNumOperands;
+  SrcReg = MI->getOperand(CurOp++).getReg();
+
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  MVT::SimpleValueType VT = *RC->vt_begin();
+  unsigned t1 = MRI.createVirtualRegister(RC);
+  unsigned t2 = MRI.createVirtualRegister(RC);
+  unsigned t3 = MRI.createVirtualRegister(RC);
+  unsigned t4 = MRI.createVirtualRegister(RC);
+  unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
+
+  unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
+  unsigned LOADOpc = getLoadOpcode(VT);
+
+  // For the atomic load-arith operator, we generate
+  //
+  //  thisMBB:
+  //    t1 = LOAD [MI.addr]
+  //  mainMBB:
+  //    t4 = phi(t1 / thisMBB, t3 / mainMBB)
+  //    t1 = OP MI.val, EAX
+  //    EAX = t4
+  //    LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+  //    t3 = EAX
+  //    JNE mainMBB
+  //  sinkMBB:
+  //    dst = t3
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+    unsigned flags = (*MMOI)->getFlags();
+    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+    MachineMemOperand *MMO =
+      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+                               (*MMOI)->getSize(),
+                               (*MMOI)->getBaseAlignment(),
+                               (*MMOI)->getTBAAInfo(),
+                               (*MMOI)->getRanges());
+    MIB.addMemOperand(MMO);
+  }
+
+  thisMBB->addSuccessor(mainMBB);
+
+  // mainMBB:
+  MachineBasicBlock *origMainMBB = mainMBB;
+
+  // Add a PHI.
+  MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
+                        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic-load-op opcode!");
+  case X86::ATOMAND8:
+  case X86::ATOMAND16:
+  case X86::ATOMAND32:
+  case X86::ATOMAND64:
+  case X86::ATOMOR8:
+  case X86::ATOMOR16:
+  case X86::ATOMOR32:
+  case X86::ATOMOR64:
+  case X86::ATOMXOR8:
+  case X86::ATOMXOR16:
+  case X86::ATOMXOR32:
+  case X86::ATOMXOR64: {
+    unsigned ARITHOpc = getNonAtomicOpcode(Opc);
+    BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
+      .addReg(t4);
+    break;
+  }
+  case X86::ATOMNAND8:
+  case X86::ATOMNAND16:
+  case X86::ATOMNAND32:
+  case X86::ATOMNAND64: {
+    unsigned Tmp = MRI.createVirtualRegister(RC);
+    unsigned NOTOpc;
+    unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
+    BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
+      .addReg(t4);
+    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
+    break;
+  }
+  case X86::ATOMMAX8:
+  case X86::ATOMMAX16:
+  case X86::ATOMMAX32:
+  case X86::ATOMMAX64:
+  case X86::ATOMMIN8:
+  case X86::ATOMMIN16:
+  case X86::ATOMMIN32:
+  case X86::ATOMMIN64:
+  case X86::ATOMUMAX8:
+  case X86::ATOMUMAX16:
+  case X86::ATOMUMAX32:
+  case X86::ATOMUMAX64:
+  case X86::ATOMUMIN8:
+  case X86::ATOMUMIN16:
+  case X86::ATOMUMIN32:
+  case X86::ATOMUMIN64: {
+    unsigned CMPOpc;
+    unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
+
+    BuildMI(mainMBB, DL, TII->get(CMPOpc))
+      .addReg(SrcReg)
+      .addReg(t4);
+
+    if (Subtarget->hasCMov()) {
+      if (VT != MVT::i8) {
+        // Native support
+        BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
+          .addReg(SrcReg)
+          .addReg(t4);
+      } else {
+        // Promote i8 to i32 to use CMOV32
+        const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+        const TargetRegisterClass *RC32 =
+          TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
+        unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
+        unsigned AccReg32 = MRI.createVirtualRegister(RC32);
+        unsigned Tmp = MRI.createVirtualRegister(RC32);
+
+        unsigned Undef = MRI.createVirtualRegister(RC32);
+        BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
+
+        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
+          .addReg(Undef)
+          .addReg(SrcReg)
+          .addImm(X86::sub_8bit);
+        BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
+          .addReg(Undef)
+          .addReg(t4)
+          .addImm(X86::sub_8bit);
+
+        BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
+          .addReg(SrcReg32)
+          .addReg(AccReg32);
+
+        BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
+          .addReg(Tmp, 0, X86::sub_8bit);
+      }
+    } else {
+      // Use pseudo select and lower them.
+      assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
+             "Invalid atomic-load-op transformation!");
+      unsigned SelOpc = getPseudoCMOVOpc(VT);
+      X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
+      assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
+      MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
+              .addReg(SrcReg).addReg(t4)
+              .addImm(CC);
+      mainMBB = EmitLoweredSelect(MIB, mainMBB);
+      // Replace the original PHI node as mainMBB is changed after CMOV
+      // lowering.
+      BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
+        .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
+      Phi->eraseFromParent();
+    }
+    break;
+  }
+  }
+
+  // Copy PhyReg back from virtual register.
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
+    .addReg(t4);
+
+  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  MIB.addReg(t2);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Copy PhyReg back to virtual register.
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
+    .addReg(PhyReg);
+
+  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
+
+  mainMBB->addSuccessor(origMainMBB);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), DstReg)
+    .addReg(t3);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
+
+// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
+// instructions. They will be translated into a spin-loop or compare-exchange
+// loop from
+//
+//    ...
+//    dst = atomic-fetch-op MI.addr, MI.val
+//    ...
+//
+// to
+//
+//    ...
+//    t1L = LOAD [MI.addr + 0]
+//    t1H = LOAD [MI.addr + 4]
+// loop:
+//    t4L = phi(t1L, t3L / loop)
+//    t4H = phi(t1H, t3H / loop)
+//    t2L = OP MI.val.lo, t4L
+//    t2H = OP MI.val.hi, t4H
+//    EAX = t4L
+//    EDX = t4H
+//    EBX = t2L
+//    ECX = t2H
+//    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+//    t3L = EAX
+//    t3H = EDX
+//    JNE loop
+// sink:
+//    dstL = t3L
+//    dstH = t3H
+//    ...
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
+         "Unexpected number of operands");
+
+  assert(MI->hasOneMemOperand() &&
+         "Expected atomic-load-op32 to have one memoperand");
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstLoReg, DstHiReg;
+  unsigned SrcLoReg, SrcHiReg;
+  unsigned MemOpndSlot;
+
+  unsigned CurOp = 0;
+
+  DstLoReg = MI->getOperand(CurOp++).getReg();
+  DstHiReg = MI->getOperand(CurOp++).getReg();
+  MemOpndSlot = CurOp;
+  CurOp += X86::AddrNumOperands;
+  SrcLoReg = MI->getOperand(CurOp++).getReg();
+  SrcHiReg = MI->getOperand(CurOp++).getReg();
+
+  const TargetRegisterClass *RC = &X86::GR32RegClass;
+  const TargetRegisterClass *RC8 = &X86::GR8RegClass;
+
+  unsigned t1L = MRI.createVirtualRegister(RC);
+  unsigned t1H = MRI.createVirtualRegister(RC);
+  unsigned t2L = MRI.createVirtualRegister(RC);
+  unsigned t2H = MRI.createVirtualRegister(RC);
+  unsigned t3L = MRI.createVirtualRegister(RC);
+  unsigned t3H = MRI.createVirtualRegister(RC);
+  unsigned t4L = MRI.createVirtualRegister(RC);
+  unsigned t4H = MRI.createVirtualRegister(RC);
+
+  unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
+  unsigned LOADOpc = X86::MOV32rm;
+
+  // For the atomic load-arith operator, we generate
+  //
+  //  thisMBB:
+  //    t1L = LOAD [MI.addr + 0]
+  //    t1H = LOAD [MI.addr + 4]
+  //  mainMBB:
+  //    t4L = phi(t1L / thisMBB, t3L / mainMBB)
+  //    t4H = phi(t1H / thisMBB, t3H / mainMBB)
+  //    t2L = OP MI.val.lo, t4L
+  //    t2H = OP MI.val.hi, t4H
+  //    EBX = t2L
+  //    ECX = t2H
+  //    LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+  //    t3L = EAX
+  //    t3H = EDX
+  //    JNE loop
+  //  sinkMBB:
+  //    dstL = t3L
+  //    dstH = t3H
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  // Lo
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+    unsigned flags = (*MMOI)->getFlags();
+    flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+    MachineMemOperand *MMO =
+      MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+                               (*MMOI)->getSize(),
+                               (*MMOI)->getBaseAlignment(),
+                               (*MMOI)->getTBAAInfo(),
+                               (*MMOI)->getRanges());
+    MIB.addMemOperand(MMO);
+  };
+  MachineInstr *LowMI = MIB;
+
+  // Hi
+  MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp) {
+      MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
+    } else {
+      MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+      if (NewMO.isReg())
+        NewMO.setIsKill(false);
+      MIB.addOperand(NewMO);
+    }
+  }
+  MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
+
+  thisMBB->addSuccessor(mainMBB);
+
+  // mainMBB:
+  MachineBasicBlock *origMainMBB = mainMBB;
+
+  // Add PHIs.
+  MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
+                        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
+  MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
+                        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
+  case X86::ATOMAND6432:
+  case X86::ATOMOR6432:
+  case X86::ATOMXOR6432:
+  case X86::ATOMADD6432:
+  case X86::ATOMSUB6432: {
+    unsigned HiOpc;
+    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
+      .addReg(SrcLoReg);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
+      .addReg(SrcHiReg);
+    break;
+  }
+  case X86::ATOMNAND6432: {
+    unsigned HiOpc, NOTOpc;
+    unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
+    unsigned TmpL = MRI.createVirtualRegister(RC);
+    unsigned TmpH = MRI.createVirtualRegister(RC);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
+      .addReg(t4L);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
+      .addReg(t4H);
+    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
+    BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
+    break;
+  }
+  case X86::ATOMMAX6432:
+  case X86::ATOMMIN6432:
+  case X86::ATOMUMAX6432:
+  case X86::ATOMUMIN6432: {
+    unsigned HiOpc;
+    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+    unsigned cL = MRI.createVirtualRegister(RC8);
+    unsigned cH = MRI.createVirtualRegister(RC8);
+    unsigned cL32 = MRI.createVirtualRegister(RC);
+    unsigned cH32 = MRI.createVirtualRegister(RC);
+    unsigned cc = MRI.createVirtualRegister(RC);
+    // cl := cmp src_lo, lo
+    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+      .addReg(SrcLoReg).addReg(t4L);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
+    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
+    // ch := cmp src_hi, hi
+    BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+      .addReg(SrcHiReg).addReg(t4H);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
+    BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
+    // cc := if (src_hi == hi) ? cl : ch;
+    if (Subtarget->hasCMov()) {
+      BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
+        .addReg(cH32).addReg(cL32);
+    } else {
+      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
+              .addReg(cH32).addReg(cL32)
+              .addImm(X86::COND_E);
+      mainMBB = EmitLoweredSelect(MIB, mainMBB);
+    }
+    BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
+    if (Subtarget->hasCMov()) {
+      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
+        .addReg(SrcLoReg).addReg(t4L);
+      BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
+        .addReg(SrcHiReg).addReg(t4H);
+    } else {
+      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
+              .addReg(SrcLoReg).addReg(t4L)
+              .addImm(X86::COND_NE);
+      mainMBB = EmitLoweredSelect(MIB, mainMBB);
+      // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
+      // 2nd CMOV lowering.
+      mainMBB->addLiveIn(X86::EFLAGS);
+      MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
+              .addReg(SrcHiReg).addReg(t4H)
+              .addImm(X86::COND_NE);
+      mainMBB = EmitLoweredSelect(MIB, mainMBB);
+      // Replace the original PHI node as mainMBB is changed after CMOV
+      // lowering.
+      BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
+        .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
+      BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
+        .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
+      PhiL->eraseFromParent();
+      PhiH->eraseFromParent();
+    }
+    break;
+  }
+  case X86::ATOMSWAP6432: {
+    unsigned HiOpc;
+    unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+    BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
+    BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
+    break;
+  }
+  }
+
+  // Copy EDX:EAX back from HiReg:LoReg
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
+  // Copy ECX:EBX from t1H:t1L
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
+
+  MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+    if (NewMO.isReg())
+      NewMO.setIsKill(false);
+    MIB.addOperand(NewMO);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
+  // Copy EDX:EAX back to t3H:t3L
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
+  BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
+
+  BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
+
+  mainMBB->addSuccessor(origMainMBB);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), DstLoReg)
+    .addReg(t3L);
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(TargetOpcode::COPY), DstHiReg)
+    .addReg(t3H);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
+
+// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
+// or XMM0_V32I8 in AVX all of this code can be replaced with that
+// in the .td file.
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
+  unsigned Opc;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
+  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
+  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
+  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
+  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+  }
+
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+  unsigned NumArgs = MI->getNumOperands();
+  for (unsigned i = 1; i < NumArgs; ++i) {
+    MachineOperand &Op = MI->getOperand(i);
+    if (!(Op.isReg() && Op.isImplicit()))
+      MIB.addOperand(Op);
+  }
+  if (MI->hasOneMemOperand())
+    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  BuildMI(*BB, MI, dl,
+    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+    .addReg(X86::XMM0);
+
+  MI->eraseFromParent();
+  return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern
+static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII) {
+  unsigned Opc;
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("illegal opcode!");
+  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
+  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
+  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
+  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
+  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
+  }
+
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+  unsigned NumArgs = MI->getNumOperands(); // remove the results
+  for (unsigned i = 1; i < NumArgs; ++i) {
+    MachineOperand &Op = MI->getOperand(i);
+    if (!(Op.isReg() && Op.isImplicit()))
+      MIB.addOperand(Op);
+  }
+  if (MI->hasOneMemOperand())
+    MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  BuildMI(*BB, MI, dl,
+    TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+    .addReg(X86::ECX);
+
+  MI->eraseFromParent();
+  return BB;
+}
+
+static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+                                       const TargetInstrInfo *TII,
+                                       const X86Subtarget* Subtarget) {
+  DebugLoc dl = MI->getDebugLoc();
+
+  // Address into RAX/EAX, other two args into ECX, EDX.
+  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+  for (int i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(i));
+
+  unsigned ValOps = X86::AddrNumOperands;
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
+    .addReg(MI->getOperand(ValOps).getReg());
+  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
+    .addReg(MI->getOperand(ValOps+1).getReg());
+
+  // The instruction doesn't actually take any operands though.
+  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
+
+  MI->eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAARG64WithCustomInserter(
+                   MachineInstr *MI,
+                   MachineBasicBlock *MBB) const {
+  // Emit va_arg instruction on X86-64.
+
+  // Operands to this pseudo-instruction:
+  // 0  ) Output        : destination address (reg)
+  // 1-5) Input         : va_list address (addr, i64mem)
+  // 6  ) ArgSize       : Size (in bytes) of vararg type
+  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
+  // 8  ) Align         : Alignment of type
+  // 9  ) EFLAGS (implicit-def)
+
+  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
+  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
+
+  unsigned DestReg = MI->getOperand(0).getReg();
+  MachineOperand &Base = MI->getOperand(1);
+  MachineOperand &Scale = MI->getOperand(2);
+  MachineOperand &Index = MI->getOperand(3);
+  MachineOperand &Disp = MI->getOperand(4);
+  MachineOperand &Segment = MI->getOperand(5);
+  unsigned ArgSize = MI->getOperand(6).getImm();
+  unsigned ArgMode = MI->getOperand(7).getImm();
+  unsigned Align = MI->getOperand(8).getImm();
+
+  // Memory Reference
+  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  // Machine Information
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
+  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // struct va_list {
+  //   i32   gp_offset
+  //   i32   fp_offset
+  //   i64   overflow_area (address)
+  //   i64   reg_save_area (address)
+  // }
+  // sizeof(va_list) = 24
+  // alignment(va_list) = 8
+
+  unsigned TotalNumIntRegs = 6;
+  unsigned TotalNumXMMRegs = 8;
+  bool UseGPOffset = (ArgMode == 1);
+  bool UseFPOffset = (ArgMode == 2);
+  unsigned MaxOffset = TotalNumIntRegs * 8 +
+                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
+
+  /* Align ArgSize to a multiple of 8 */
+  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
+  bool NeedsAlign = (Align > 8);
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *overflowMBB;
+  MachineBasicBlock *offsetMBB;
+  MachineBasicBlock *endMBB;
+
+  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
+  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
+  unsigned OffsetReg = 0;
+
+  if (!UseGPOffset && !UseFPOffset) {
+    // If we only pull from the overflow region, we don't create a branch.
+    // We don't need to alter control flow.
+    OffsetDestReg = 0; // unused
+    OverflowDestReg = DestReg;
+
+    offsetMBB = NULL;
+    overflowMBB = thisMBB;
+    endMBB = thisMBB;
+  } else {
+    // First emit code to check if gp_offset (or fp_offset) is below the bound.
+    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
+    // If not, pull from overflow_area. (branch to overflowMBB)
+    //
+    //       thisMBB
+    //         |     .
+    //         |        .
+    //     offsetMBB   overflowMBB
+    //         |        .
+    //         |     .
+    //        endMBB
+
+    // Registers for the PHI in endMBB
+    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
+    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
+
+    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+    MachineFunction *MF = MBB->getParent();
+    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+    MachineFunction::iterator MBBIter = MBB;
+    ++MBBIter;
+
+    // Insert the new basic blocks
+    MF->insert(MBBIter, offsetMBB);
+    MF->insert(MBBIter, overflowMBB);
+    MF->insert(MBBIter, endMBB);
+
+    // Transfer the remainder of MBB and its successor edges to endMBB.
+    endMBB->splice(endMBB->begin(), thisMBB,
+                    llvm::next(MachineBasicBlock::iterator(MI)),
+                    thisMBB->end());
+    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
+
+    // Make offsetMBB and overflowMBB successors of thisMBB
+    thisMBB->addSuccessor(offsetMBB);
+    thisMBB->addSuccessor(overflowMBB);
+
+    // endMBB is a successor of both offsetMBB and overflowMBB
+    offsetMBB->addSuccessor(endMBB);
+    overflowMBB->addSuccessor(endMBB);
+
+    // Load the offset value into a register
+    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, UseFPOffset ? 4 : 0)
+      .addOperand(Segment)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Check if there is enough room left to pull this argument.
+    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
+      .addReg(OffsetReg)
+      .addImm(MaxOffset + 8 - ArgSizeA8);
+
+    // Branch to "overflowMBB" if offset >= max
+    // Fall through to "offsetMBB" otherwise
+    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
+      .addMBB(overflowMBB);
+  }
+
+  // In offsetMBB, emit code to use the reg_save_area.
+  if (offsetMBB) {
+    assert(OffsetReg != 0);
+
+    // Read the reg_save_area address.
+    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
+    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, 16)
+      .addOperand(Segment)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Zero-extend the offset
+    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
+      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
+        .addImm(0)
+        .addReg(OffsetReg)
+        .addImm(X86::sub_32bit);
+
+    // Add the offset to the reg_save_area to get the final address.
+    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
+      .addReg(OffsetReg64)
+      .addReg(RegSaveReg);
+
+    // Compute the offset for the next argument
+    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
+    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
+      .addReg(OffsetReg)
+      .addImm(UseFPOffset ? 16 : 8);
+
+    // Store it back into the va_list.
+    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
+      .addOperand(Base)
+      .addOperand(Scale)
+      .addOperand(Index)
+      .addDisp(Disp, UseFPOffset ? 4 : 0)
+      .addOperand(Segment)
+      .addReg(NextOffsetReg)
+      .setMemRefs(MMOBegin, MMOEnd);
+
+    // Jump to endMBB
+    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
+      .addMBB(endMBB);
+  }
+
+  //
+  // Emit code to use overflow area
+  //
+
+  // Load the overflow_area address into a register.
+  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
+  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
+    .addOperand(Base)
+    .addOperand(Scale)
+    .addOperand(Index)
+    .addDisp(Disp, 8)
+    .addOperand(Segment)
+    .setMemRefs(MMOBegin, MMOEnd);
+
+  // If we need to align it, do so. Otherwise, just copy the address
+  // to OverflowDestReg.
+  if (NeedsAlign) {
+    // Align the overflow address
+    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
+    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
+
+    // aligned_addr = (addr + (align-1)) & ~(align-1)
+    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
+      .addReg(OverflowAddrReg)
+      .addImm(Align-1);
+
+    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
+      .addReg(TmpReg)
+      .addImm(~(uint64_t)(Align-1));
+  } else {
+    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
+      .addReg(OverflowAddrReg);
+  }
+
+  // Compute the next overflow address after this argument.
+  // (the overflow address should be kept 8-byte aligned)
+  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
+  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
+    .addReg(OverflowDestReg)
+    .addImm(ArgSizeA8);
+
+  // Store the new overflow address.
+  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
+    .addOperand(Base)
+    .addOperand(Scale)
+    .addOperand(Index)
+    .addDisp(Disp, 8)
+    .addOperand(Segment)
+    .addReg(NextAddrReg)
+    .setMemRefs(MMOBegin, MMOEnd);
+
+  // If we branched, emit the PHI to the front of endMBB.
+  if (offsetMBB) {
+    BuildMI(*endMBB, endMBB->begin(), DL,
+            TII->get(X86::PHI), DestReg)
+      .addReg(OffsetDestReg).addMBB(offsetMBB)
+      .addReg(OverflowDestReg).addMBB(overflowMBB);
+  }
+
+  // Erase the pseudo instruction
+  MI->eraseFromParent();
+
+  return endMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+                                                 MachineInstr *MI,
+                                                 MachineBasicBlock *MBB) const {
+  // Emit code to save XMM registers to the stack. The ABI says that the
+  // number of registers to save is given in %al, so it's theoretically
+  // possible to do an indirect jump trick to avoid saving all of them,
+  // however this code takes a simpler approach and just executes all
+  // of the stores if %al is non-zero. It's less code, and it's probably
+  // easier on the hardware branch predictor, and stores aren't all that
+  // expensive anyway.
+
+  // Create the new basic blocks. One block contains all the XMM stores,
+  // and one block is the final destination regardless of whether any
+  // stores were performed.
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction *F = MBB->getParent();
+  MachineFunction::iterator MBBIter = MBB;
+  ++MBBIter;
+  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(MBBIter, XMMSaveMBB);
+  F->insert(MBBIter, EndMBB);
+
+  // Transfer the remainder of MBB and its successor edges to EndMBB.
+  EndMBB->splice(EndMBB->begin(), MBB,
+                 llvm::next(MachineBasicBlock::iterator(MI)),
+                 MBB->end());
+  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // The original block will now fall through to the XMM save block.
+  MBB->addSuccessor(XMMSaveMBB);
+  // The XMMSaveMBB will fall through to the end block.
+  XMMSaveMBB->addSuccessor(EndMBB);
+
+  // Now add the instructions.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  unsigned CountReg = MI->getOperand(0).getReg();
+  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
+  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
+
+  if (!Subtarget->isTargetWin64()) {
+    // If %al is 0, branch around the XMM save block.
+    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
+    MBB->addSuccessor(EndMBB);
+  }
+
+  unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+  // In the XMM save block, save all the XMM argument registers.
+  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
+    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+    MachineMemOperand *MMO =
+      F->getMachineMemOperand(
+          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
+        MachineMemOperand::MOStore,
+        /*Size=*/16, /*Align=*/16);
+    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
+      .addFrameIndex(RegSaveFrameIndex)
+      .addImm(/*Scale=*/1)
+      .addReg(/*IndexReg=*/0)
+      .addImm(/*Disp=*/Offset)
+      .addReg(/*Segment=*/0)
+      .addReg(MI->getOperand(i).getReg())
+      .addMemOperand(MMO);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+
+  return EndMBB;
+}
+
+// The EFLAGS operand of SelectItr might be missing a kill marker
+// because there were multiple uses of EFLAGS, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
+                                     MachineBasicBlock* BB,
+                                     const TargetRegisterInfo* TRI) {
+  // Scan forward through BB for a use/def of EFLAGS.
+  MachineBasicBlock::iterator miI(llvm::next(SelectItr));
+  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(X86::EFLAGS))
+      return false;
+    if (mi.definesRegister(X86::EFLAGS))
+      break; // Should have kill-flag - update below.
+  }
+
+  // If we hit the end of the block, check whether EFLAGS is live into a
+  // successor.
+  if (miI == BB->end()) {
+    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                          sEnd = BB->succ_end();
+         sItr != sEnd; ++sItr) {
+      MachineBasicBlock* succ = *sItr;
+      if (succ->isLiveIn(X86::EFLAGS))
+        return false;
+    }
+  }
+
+  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
+  // out. SelectMI should have a kill flag on EFLAGS.
+  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
+  return true;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the
+  // diamond control-flow pattern.  The incoming instruction knows the
+  // destination vreg to set, the condition code register to branch on, the
+  // true/false values to select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // If the EFLAGS register isn't dead in the terminator, then claim that it's
+  // live into the sink and copy blocks.
+  const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+  if (!MI->killsRegister(X86::EFLAGS) &&
+      !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
+    copy0MBB->addLiveIn(X86::EFLAGS);
+    sinkMBB->addLiveIn(X86::EFLAGS);
+  }
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  // Create the conditional branch instruction.
+  unsigned Opc =
+    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  copy0MBB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(X86::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
+                                        bool Is64Bit) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *MF = BB->getParent();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+  assert(getTargetMachine().Options.EnableSegmentedStacks);
+
+  unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+  unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+
+  // BB:
+  //  ... [Till the alloca]
+  // If stacklet is not large enough, jump to mallocMBB
+  //
+  // bumpMBB:
+  //  Allocate by subtracting from RSP
+  //  Jump to continueMBB
+  //
+  // mallocMBB:
+  //  Allocate by call to runtime
+  //
+  // continueMBB:
+  //  ...
+  //  [rest of original BB]
+  //
+
+  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *AddrRegClass =
+    getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+
+  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+    bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+    tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+    SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
+    sizeVReg = MI->getOperand(1).getReg(),
+    physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+
+  MachineFunction::iterator MBBIter = BB;
+  ++MBBIter;
+
+  MF->insert(MBBIter, bumpMBB);
+  MF->insert(MBBIter, mallocMBB);
+  MF->insert(MBBIter, continueMBB);
+
+  continueMBB->splice(continueMBB->begin(), BB, llvm::next
+                      (MachineBasicBlock::iterator(MI)), BB->end());
+  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add code to the main basic block to check if the stack limit has been hit,
+  // and if so, jump to mallocMBB otherwise to bumpMBB.
+  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+  BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
+    .addReg(tmpSPVReg).addReg(sizeVReg);
+  BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+    .addReg(SPLimitVReg);
+  BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+
+  // bumpMBB simply decreases the stack pointer, since we know the current
+  // stacklet has enough space.
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+    .addReg(SPLimitVReg);
+  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+    .addReg(SPLimitVReg);
+  BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+  // Calls into a routine in libgcc to allocate more space from the heap.
+  const uint32_t *RegMask =
+    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+  if (Is64Bit) {
+    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+      .addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::RDI, RegState::Implicit)
+      .addReg(X86::RAX, RegState::ImplicitDefine);
+  } else {
+    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+      .addImm(12);
+    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+      .addExternalSymbol("__morestack_allocate_stack_space")
+      .addRegMask(RegMask)
+      .addReg(X86::EAX, RegState::ImplicitDefine);
+  }
+
+  if (!Is64Bit)
+    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+      .addImm(16);
+
+  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+    .addReg(Is64Bit ? X86::RAX : X86::EAX);
+  BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+  // Set up the CFG correctly.
+  BB->addSuccessor(bumpMBB);
+  BB->addSuccessor(mallocMBB);
+  mallocMBB->addSuccessor(continueMBB);
+  bumpMBB->addSuccessor(continueMBB);
+
+  // Take care of the PHI nodes.
+  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(mallocPtrVReg).addMBB(mallocMBB)
+    .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+
+  // Delete the original pseudo instruction.
+  MI->eraseFromParent();
+
+  // And we're done.
+  return continueMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  assert(!Subtarget->isTargetEnvMacho());
+
+  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
+  // non-trivial part is impdef of ESP.
+
+  if (Subtarget->isTargetWin64()) {
+    if (Subtarget->isTargetCygMing()) {
+      // ___chkstk(Mingw64):
+      // Clobbers R10, R11, RAX and EFLAGS.
+      // Updates RSP.
+      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+        .addExternalSymbol("___chkstk")
+        .addReg(X86::RAX, RegState::Implicit)
+        .addReg(X86::RSP, RegState::Implicit)
+        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
+        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
+        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+    } else {
+      // __chkstk(MSVCRT): does not update stack pointer.
+      // Clobbers R10, R11 and EFLAGS.
+      // FIXME: RAX(allocated size) might be reused and not killed.
+      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
+        .addExternalSymbol("__chkstk")
+        .addReg(X86::RAX, RegState::Implicit)
+        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+      // RAX has the offset to subtracted from RSP.
+      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
+        .addReg(X86::RSP)
+        .addReg(X86::RAX);
+    }
+  } else {
+    const char *StackProbeSymbol =
+      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";
+
+    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
+      .addExternalSymbol(StackProbeSymbol)
+      .addReg(X86::EAX, RegState::Implicit)
+      .addReg(X86::ESP, RegState::Implicit)
+      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
+      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
+      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
+                                      MachineBasicBlock *BB) const {
+  // This is pretty easy.  We're taking the value that we received from
+  // our load from the relocation, sticking it in either RDI (x86-64)
+  // or EAX and doing an indirect call.  The return value will then
+  // be in the normal return register.
+  const X86InstrInfo *TII
+    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
+  DebugLoc DL = MI->getDebugLoc();
+  MachineFunction *F = BB->getParent();
+
+  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
+  assert(MI->getOperand(3).isGlobal() && "This should be a global");
+
+  // Get a register mask for the lowered call.
+  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
+  // proper register mask.
+  const uint32_t *RegMask =
+    getTargetMachine().getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+  if (Subtarget->is64Bit()) {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV64rm), X86::RDI)
+    .addReg(X86::RIP)
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
+    addDirectMem(MIB, X86::RDI);
+    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
+    .addReg(0)
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  } else {
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
+                                      TII->get(X86::MOV32rm), X86::EAX)
+    .addReg(TII->getGlobalBaseReg(F))
+    .addImm(0).addReg(0)
+    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
+                      MI->getOperand(3).getTargetFlags())
+    .addReg(0);
+    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
+    addDirectMem(MIB, X86::EAX);
+    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
+  }
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+                                    MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  const BasicBlock *BB = MBB->getBasicBlock();
+  MachineFunction::iterator I = MBB;
+  ++I;
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  unsigned DstReg;
+  unsigned MemOpndSlot = 0;
+
+  unsigned CurOp = 0;
+
+  DstReg = MI->getOperand(CurOp++).getReg();
+  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+  assert(RC->hasType(MVT::i32) && "Invalid destination!");
+  unsigned mainDstReg = MRI.createVirtualRegister(RC);
+  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+  MemOpndSlot = CurOp;
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  // For v = setjmp(buf), we generate
+  //
+  // thisMBB:
+  //  buf[LabelOffset] = restoreMBB
+  //  SjLjSetup restoreMBB
+  //
+  // mainMBB:
+  //  v_main = 0
+  //
+  // sinkMBB:
+  //  v = phi(main, restore)
+  //
+  // restoreMBB:
+  //  v_restore = 1
+
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(I, mainMBB);
+  MF->insert(I, sinkMBB);
+  MF->push_back(restoreMBB);
+
+  MachineInstrBuilder MIB;
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), MBB,
+                  llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  // thisMBB:
+  unsigned PtrStoreOpc = 0;
+  unsigned LabelReg = 0;
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  Reloc::Model RM = getTargetMachine().getRelocationModel();
+  bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
+                     (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+
+  // Prepare IP either in reg or imm.
+  if (!UseImmLabel) {
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+    LabelReg = MRI.createVirtualRegister(PtrRC);
+    if (Subtarget->is64Bit()) {
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+              .addReg(X86::RIP)
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB)
+              .addReg(0);
+    } else {
+      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+              .addReg(XII->getGlobalBaseReg(MF))
+              .addImm(0)
+              .addReg(0)
+              .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+              .addReg(0);
+    }
+  } else
+    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+  // Store IP
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+    else
+      MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+  }
+  if (!UseImmLabel)
+    MIB.addReg(LabelReg);
+  else
+    MIB.addMBB(restoreMBB);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Setup
+  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+          .addMBB(restoreMBB);
+  MIB.addRegMask(RegInfo->getNoPreservedMask());
+  thisMBB->addSuccessor(mainMBB);
+  thisMBB->addSuccessor(restoreMBB);
+
+  // mainMBB:
+  //  EAX = 0
+  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+  mainMBB->addSuccessor(sinkMBB);
+
+  // sinkMBB:
+  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+          TII->get(X86::PHI), DstReg)
+    .addReg(mainDstReg).addMBB(mainMBB)
+    .addReg(restoreDstReg).addMBB(restoreMBB);
+
+  // restoreMBB:
+  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+  BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+  restoreMBB->addSuccessor(sinkMBB);
+
+  MI->eraseFromParent();
+  return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+                                     MachineBasicBlock *MBB) const {
+  DebugLoc DL = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  MachineFunction *MF = MBB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  // Memory Reference
+  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+  MVT PVT = getPointerTy();
+  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+         "Invalid Pointer Size!");
+
+  const TargetRegisterClass *RC =
+    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+  unsigned Tmp = MRI.createVirtualRegister(RC);
+  // Since FP is only updated here but NOT referenced, it's treated as GPR.
+  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+  unsigned SP = RegInfo->getStackRegister();
+
+  MachineInstrBuilder MIB;
+
+  const int64_t LabelOffset = 1 * PVT.getStoreSize();
+  const int64_t SPOffset = 2 * PVT.getStoreSize();
+
+  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+
+  // Reload FP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+    MIB.addOperand(MI->getOperand(i));
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload IP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(i), LabelOffset);
+    else
+      MIB.addOperand(MI->getOperand(i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Reload SP
+  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+    if (i == X86::AddrDisp)
+      MIB.addDisp(MI->getOperand(i), SPOffset);
+    else
+      MIB.addOperand(MI->getOperand(i));
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+  // Jump
+  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+
+  MI->eraseFromParent();
+  return MBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB) const {
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unexpected instr type to insert");
+  case X86::TAILJMPd64:
+  case X86::TAILJMPr64:
+  case X86::TAILJMPm64:
+    llvm_unreachable("TAILJMP64 would not be touched here.");
+  case X86::TCRETURNdi64:
+  case X86::TCRETURNri64:
+  case X86::TCRETURNmi64:
+    return BB;
+  case X86::WIN_ALLOCA:
+    return EmitLoweredWinAlloca(MI, BB);
+  case X86::SEG_ALLOCA_32:
+    return EmitLoweredSegAlloca(MI, BB, false);
+  case X86::SEG_ALLOCA_64:
+    return EmitLoweredSegAlloca(MI, BB, true);
+  case X86::TLSCall_32:
+  case X86::TLSCall_64:
+    return EmitLoweredTLSCall(MI, BB);
+  case X86::CMOV_GR8:
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64:
+  case X86::CMOV_V8F32:
+  case X86::CMOV_V4F64:
+  case X86::CMOV_V4I64:
+  case X86::CMOV_GR16:
+  case X86::CMOV_GR32:
+  case X86::CMOV_RFP32:
+  case X86::CMOV_RFP64:
+  case X86::CMOV_RFP80:
+    return EmitLoweredSelect(MI, BB);
+
+  case X86::FP32_TO_INT16_IN_MEM:
+  case X86::FP32_TO_INT32_IN_MEM:
+  case X86::FP32_TO_INT64_IN_MEM:
+  case X86::FP64_TO_INT16_IN_MEM:
+  case X86::FP64_TO_INT32_IN_MEM:
+  case X86::FP64_TO_INT64_IN_MEM:
+  case X86::FP80_TO_INT16_IN_MEM:
+  case X86::FP80_TO_INT32_IN_MEM:
+  case X86::FP80_TO_INT64_IN_MEM: {
+    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+    DebugLoc DL = MI->getDebugLoc();
+
+    // Change the floating point control register to use "round towards zero"
+    // mode when truncating to an integer value.
+    MachineFunction *F = BB->getParent();
+    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+    // Load the old value of the high byte of the control word...
+    unsigned OldCW =
+      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
+                      CWFrameIdx);
+
+    // Set the high part to be round to zero...
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+      .addImm(0xC7F);
+
+    // Reload the modified control word now...
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    // Restore the memory image of control word to original value
+    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+      .addReg(OldCW);
+
+    // Get the X86 opcode to use.
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+    }
+
+    X86AddressMode AM;
+    MachineOperand &Op = MI->getOperand(0);
+    if (Op.isReg()) {
+      AM.BaseType = X86AddressMode::RegBase;
+      AM.Base.Reg = Op.getReg();
+    } else {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = Op.getIndex();
+    }
+    Op = MI->getOperand(1);
+    if (Op.isImm())
+      AM.Scale = Op.getImm();
+    Op = MI->getOperand(2);
+    if (Op.isImm())
+      AM.IndexReg = Op.getImm();
+    Op = MI->getOperand(3);
+    if (Op.isGlobal()) {
+      AM.GV = Op.getGlobal();
+    } else {
+      AM.Disp = Op.getImm();
+    }
+    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
+                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
+
+    // Reload the original control word now.
+    addFrameReference(BuildMI(*BB, MI, DL,
+                              TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    MI->eraseFromParent();   // The pseudo instruction is gone now.
+    return BB;
+  }
+    // String/text processing lowering.
+  case X86::PCMPISTRM128REG:
+  case X86::VPCMPISTRM128REG:
+  case X86::PCMPISTRM128MEM:
+  case X86::VPCMPISTRM128MEM:
+  case X86::PCMPESTRM128REG:
+  case X86::VPCMPESTRM128REG:
+  case X86::PCMPESTRM128MEM:
+  case X86::VPCMPESTRM128MEM:
+    assert(Subtarget->hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
+
+  // String/text processing lowering.
+  case X86::PCMPISTRIREG:
+  case X86::VPCMPISTRIREG:
+  case X86::PCMPISTRIMEM:
+  case X86::VPCMPISTRIMEM:
+  case X86::PCMPESTRIREG:
+  case X86::VPCMPESTRIREG:
+  case X86::PCMPESTRIMEM:
+  case X86::VPCMPESTRIMEM:
+    assert(Subtarget->hasSSE42() &&
+           "Target must have SSE4.2 or AVX features enabled");
+    return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
+
+  // Thread synchronization.
+  case X86::MONITOR:
+    return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
+
+  // xbegin
+  case X86::XBEGIN:
+    return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
+
+  // Atomic Lowering.
+  case X86::ATOMAND8:
+  case X86::ATOMAND16:
+  case X86::ATOMAND32:
+  case X86::ATOMAND64:
+    // Fall through
+  case X86::ATOMOR8:
+  case X86::ATOMOR16:
+  case X86::ATOMOR32:
+  case X86::ATOMOR64:
+    // Fall through
+  case X86::ATOMXOR16:
+  case X86::ATOMXOR8:
+  case X86::ATOMXOR32:
+  case X86::ATOMXOR64:
+    // Fall through
+  case X86::ATOMNAND8:
+  case X86::ATOMNAND16:
+  case X86::ATOMNAND32:
+  case X86::ATOMNAND64:
+    // Fall through
+  case X86::ATOMMAX8:
+  case X86::ATOMMAX16:
+  case X86::ATOMMAX32:
+  case X86::ATOMMAX64:
+    // Fall through
+  case X86::ATOMMIN8:
+  case X86::ATOMMIN16:
+  case X86::ATOMMIN32:
+  case X86::ATOMMIN64:
+    // Fall through
+  case X86::ATOMUMAX8:
+  case X86::ATOMUMAX16:
+  case X86::ATOMUMAX32:
+  case X86::ATOMUMAX64:
+    // Fall through
+  case X86::ATOMUMIN8:
+  case X86::ATOMUMIN16:
+  case X86::ATOMUMIN32:
+  case X86::ATOMUMIN64:
+    return EmitAtomicLoadArith(MI, BB);
+
+  // This group does 64-bit operations on a 32-bit host.
+  case X86::ATOMAND6432:
+  case X86::ATOMOR6432:
+  case X86::ATOMXOR6432:
+  case X86::ATOMNAND6432:
+  case X86::ATOMADD6432:
+  case X86::ATOMSUB6432:
+  case X86::ATOMMAX6432:
+  case X86::ATOMMIN6432:
+  case X86::ATOMUMAX6432:
+  case X86::ATOMUMIN6432:
+  case X86::ATOMSWAP6432:
+    return EmitAtomicLoadArith6432(MI, BB);
+
+  case X86::VASTART_SAVE_XMM_REGS:
+    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+
+  case X86::VAARG_64:
+    return EmitVAARG64WithCustomInserter(MI, BB);
+
+  case X86::EH_SjLj_SetJmp32:
+  case X86::EH_SjLj_SetJmp64:
+    return emitEHSjLjSetJmp(MI, BB);
+
+  case X86::EH_SjLj_LongJmp32:
+  case X86::EH_SjLj_LongJmp64:
+    return emitEHSjLjLongJmp(MI, BB);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                       APInt &KnownZero,
+                                                       APInt &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  unsigned BitWidth = KnownZero.getBitWidth();
+  unsigned Opc = Op.getOpcode();
+  assert((Opc >= ISD::BUILTIN_OP_END ||
+          Opc == ISD::INTRINSIC_WO_CHAIN ||
+          Opc == ISD::INTRINSIC_W_CHAIN ||
+          Opc == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
+
+  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
+  switch (Opc) {
+  default: break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::ADC:
+  case X86ISD::SBB:
+  case X86ISD::SMUL:
+  case X86ISD::UMUL:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    // These nodes' second result is a boolean.
+    if (Op.getResNo() == 0)
+      break;
+    // Fallthrough
+  case X86ISD::SETCC:
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    unsigned NumLoBits = 0;
+    switch (IntId) {
+    default: break;
+    case Intrinsic::x86_sse_movmsk_ps:
+    case Intrinsic::x86_avx_movmsk_ps_256:
+    case Intrinsic::x86_sse2_movmsk_pd:
+    case Intrinsic::x86_avx_movmsk_pd_256:
+    case Intrinsic::x86_mmx_pmovmskb:
+    case Intrinsic::x86_sse2_pmovmskb_128:
+    case Intrinsic::x86_avx2_pmovmskb: {
+      // High bits of movmskp{s|d}, pmovmskb are known zero.
+      switch (IntId) {
+        default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+        case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
+        case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
+        case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
+        case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
+        case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
+        case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
+        case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
+      }
+      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+      break;
+    }
+    }
+    break;
+  }
+  }
+}
+
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                         unsigned Depth) const {
+  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
+    return Op.getValueType().getScalarType().getSizeInBits();
+
+  // Fallback case.
+  return 1;
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+                                       const GlobalValue* &GA,
+                                       int64_t &Offset) const {
+  if (N->getOpcode() == X86ISD::Wrapper) {
+    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+      return true;
+    }
+  }
+  return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
+
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
+    if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+        SVOp->getMaskElt(j) >= 0)
+      return false;
+
+  return true;
+}
+
+/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget* Subtarget) {
+  DebugLoc dl = N->getDebugLoc();
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+      V2.getOpcode() == ISD::CONCAT_VECTORS) {
+    //
+    //                   0,0,0,...
+    //                      |
+    //    V      UNDEF    BUILD_VECTOR    UNDEF
+    //     \      /           \           /
+    //  CONCAT_VECTOR         CONCAT_VECTOR
+    //         \                  /
+    //          \                /
+    //          RESULT: V + zero extended
+    //
+    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
+        V2.getOperand(1).getOpcode() != ISD::UNDEF ||
+        V1.getOperand(1).getOpcode() != ISD::UNDEF)
+      return SDValue();
+
+    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
+      return SDValue();
+
+    // To match the shuffle mask, the first half of the mask should
+    // be exactly the first vector, and all the rest a splat with the
+    // first element of the second one.
+    for (unsigned i = 0; i != NumElems/2; ++i)
+      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
+          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+        return SDValue();
+
+    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
+    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
+      if (Ld->hasNUsesOfValue(1, 0)) {
+        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
+        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
+        SDValue ResNode =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2,
+                                  Ld->getMemoryVT(),
+                                  Ld->getPointerInfo(),
+                                  Ld->getAlignment(),
+                                  false/*isVolatile*/, true/*ReadMem*/,
+                                  false/*WriteMem*/);
+
+        // Make sure the newly-created LOAD is in the same position as Ld in
+        // terms of dependency. We create a TokenFactor for Ld and ResNode,
+        // and update uses of Ld's output chain to use the TokenFactor.
+        if (Ld->hasAnyUseOfValue(1)) {
+          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
+          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
+          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
+                                 SDValue(ResNode.getNode(), 1));
+        }
+
+        return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
+      }
+    }
+
+    // Emit a zeroed vector and insert the desired subvector on its
+    // first half.
+    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
+    SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Combine some shuffles into subvector extracts and inserts:
+  //
+
+  // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+  if (isShuffleHigh128VectorInsertLow(SVOp)) {
+    SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+  if (isShuffleLow128VectorInsertHigh(SVOp)) {
+    SDValue V = Extract128BitVector(V1, 0, DAG, dl);
+    SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
+    return DCI.CombineTo(N, InsV);
+  }
+
+  return SDValue();
+}
+
+/// PerformShuffleCombine - Performs several different shuffle combines.
+static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget *Subtarget) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  // Don't create instructions with illegal types after legalize types has run.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
+    return SDValue();
+
+  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+  if (Subtarget->hasFp256() && VT.is256BitVector() &&
+      N->getOpcode() == ISD::VECTOR_SHUFFLE)
+    return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
+
+  // Only handle 128 wide vector from here on.
+  if (!VT.is128BitVector())
+    return SDValue();
+
+  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+  // consecutive, non-overlapping, and in the right order.
+  SmallVector<SDValue, 16> Elts;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
+    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+}
+
+/// PerformTruncateCombine - Converts truncate operation to
+/// a sequence of vector shuffle operations.
+/// It is possible when we truncate 256-bit vector to 128-bit vector
+static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const X86Subtarget *Subtarget)  {
+  return SDValue();
+}
+
+/// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
+/// specific shuffle of a load can be folded into a single element load.
+/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
+/// shuffles have been customed lowered so we need to handle those here.
+static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue InVec = N->getOperand(0);
+  SDValue EltNo = N->getOperand(1);
+
+  if (!isa<ConstantSDNode>(EltNo))
+    return SDValue();
+
+  EVT VT = InVec.getValueType();
+
+  bool HasShuffleIntoBitcast = false;
+  if (InVec.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!InVec.hasOneUse())
+      return SDValue();
+    EVT BCVT = InVec.getOperand(0).getValueType();
+    if (BCVT.getVectorNumElements() != VT.getVectorNumElements())
+      return SDValue();
+    InVec = InVec.getOperand(0);
+    HasShuffleIntoBitcast = true;
+  }
+
+  if (!isTargetShuffle(InVec.getOpcode()))
+    return SDValue();
+
+  // Don't duplicate a load with other uses.
+  if (!InVec.hasOneUse())
+    return SDValue();
+
+  SmallVector<int, 16> ShuffleMask;
+  bool UnaryShuffle;
+  if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask,
+                            UnaryShuffle))
+    return SDValue();
+
+  // Select the input vector, guarding against out of range extract vector.
+  unsigned NumElems = VT.getVectorNumElements();
+  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+  int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
+  SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
+                                         : InVec.getOperand(1);
+
+  // If inputs to shuffle are the same for both ops, then allow 2 uses
+  unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
+
+  if (LdNode.getOpcode() == ISD::BITCAST) {
+    // Don't duplicate a load with other uses.
+    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
+      return SDValue();
+
+    AllowedUses = 1; // only allow 1 load use if we have a bitcast
+    LdNode = LdNode.getOperand(0);
+  }
+
+  if (!ISD::isNormalLoad(LdNode.getNode()))
+    return SDValue();
+
+  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
+
+  if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
+    return SDValue();
+
+  if (HasShuffleIntoBitcast) {
+    // If there's a bitcast before the shuffle, check if the load type and
+    // alignment is valid.
+    unsigned Align = LN0->getAlignment();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NewAlign = TLI.getDataLayout()->
+      getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
+    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
+      return SDValue();
+  }
+
+  // All checks match so transform back to vector_shuffle so that DAG combiner
+  // can finish the job
+  DebugLoc dl = N->getDebugLoc();
+
+  // Create shuffle node taking into account the case that its a unary shuffle
+  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
+  Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl,
+                                 InVec.getOperand(0), Shuffle,
+                                 &ShuffleMask[0]);
+  Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
+                     EltNo);
+}
+
+/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
+/// generation and convert it from being a bunch of shuffles and extracts
+/// to a simple store and scalar loads to extract the elements.
+static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
+  if (NewOp.getNode())
+    return NewOp;
+
+  SDValue InputVector = N->getOperand(0);
+  // Detect whether we are trying to convert from mmx to i32 and the bitcast
+  // from mmx to v2i32 has a single usage.
+  if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
+      InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
+      InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
+    return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
+                       N->getValueType(0),
+                       InputVector.getNode()->getOperand(0));
+
+  // Only operate on vectors of 4 elements, where the alternative shuffling
+  // gets to be more expensive.
+  if (InputVector.getValueType() != MVT::v4i32)
+    return SDValue();
+
+  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
+  // single use which is a sign-extend or zero-extend, and all elements are
+  // used.
+  SmallVector<SDNode *, 4> Uses;
+  unsigned ExtractedElements = 0;
+  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
+       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
+    if (UI.getUse().getResNo() != InputVector.getResNo())
+      return SDValue();
+
+    SDNode *Extract = *UI;
+    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    if (Extract->getValueType(0) != MVT::i32)
+      return SDValue();
+    if (!Extract->hasOneUse())
+      return SDValue();
+    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
+        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
+      return SDValue();
+
+    // Record which element was extracted.
+    ExtractedElements |=
+      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
+
+    Uses.push_back(Extract);
+  }
+
+  // If not all the elements were used, this may not be worthwhile.
+  if (ExtractedElements != 15)
+    return SDValue();
+
+  // Ok, we've now decided to do the transformation.
+  DebugLoc dl = InputVector.getDebugLoc();
+
+  // Store the value to a temporary stack slot.
+  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
+                            MachinePointerInfo(), false, false, 0);
+
+  // Replace each use (extract) with a load of the appropriate element.
+  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
+       UE = Uses.end(); UI != UE; ++UI) {
+    SDNode *Extract = *UI;
+
+    // cOMpute the element's address.
+    SDValue Idx = Extract->getOperand(1);
+    unsigned EltSize =
+        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
+    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
+
+    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+                                     StackPtr, OffsetVal);
+
+    // Load the scalar.
+    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
+                                     ScalarAddr, MachinePointerInfo(),
+                                     false, false, false, 0);
+
+    // Replace the exact with the load.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
+  }
+
+  // The replacement was made in place; don't return anything.
+  return SDValue();
+}
+
+/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
+static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
+                                   SDValue RHS, SelectionDAG &DAG,
+                                   const X86Subtarget *Subtarget) {
+  if (!VT.isVector())
+    return 0;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return 0;
+  case MVT::v32i8:
+  case MVT::v16i16:
+  case MVT::v8i32:
+    if (!Subtarget->hasAVX2())
+      return 0;
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+    if (!Subtarget->hasSSE2())
+      return 0;
+  }
+
+  // SSE2 has only a small subset of the operations.
+  bool hasUnsigned = Subtarget->hasSSE41() ||
+                     (Subtarget->hasSSE2() && VT == MVT::v16i8);
+  bool hasSigned = Subtarget->hasSSE41() ||
+                   (Subtarget->hasSSE2() && VT == MVT::v8i16);
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+  // Check for x CC y ? x : y.
+  if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    switch (CC) {
+    default: break;
+    case ISD::SETULT:
+    case ISD::SETULE:
+      return hasUnsigned ? X86ISD::UMIN : 0;
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+      return hasUnsigned ? X86ISD::UMAX : 0;
+    case ISD::SETLT:
+    case ISD::SETLE:
+      return hasSigned ? X86ISD::SMIN : 0;
+    case ISD::SETGT:
+    case ISD::SETGE:
+      return hasSigned ? X86ISD::SMAX : 0;
+    }
+  // Check for x CC y ? y : x -- a min/max with reversed arms.
+  } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+             DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+    switch (CC) {
+    default: break;
+    case ISD::SETULT:
+    case ISD::SETULE:
+      return hasUnsigned ? X86ISD::UMAX : 0;
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+      return hasUnsigned ? X86ISD::UMIN : 0;
+    case ISD::SETLT:
+    case ISD::SETLE:
+      return hasSigned ? X86ISD::SMAX : 0;
+    case ISD::SETGT:
+    case ISD::SETGE:
+      return hasSigned ? X86ISD::SMIN : 0;
+    }
+  }
+
+  return 0;
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
+static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget *Subtarget) {
+  DebugLoc DL = N->getDebugLoc();
+  SDValue Cond = N->getOperand(0);
+  // Get the LHS/RHS of the select.
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  EVT VT = LHS.getValueType();
+
+  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
+  // instructions match the semantics of the common C idiom x<y?x:y but not
+  // x<=y?x:y, because of how they handle negative zero (which can be
+  // ignored in unsafe-math mode).
+  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+      (Subtarget->hasSSE2() ||
+       (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    unsigned Opcode = 0;
+    // Check for x CC y ? x : y.
+    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+      switch (CC) {
+      default: break;
+      case ISD::SETULT:
+        // Converting this to a min would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+          if (!DAG.getTarget().Options.UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETOLE:
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETULE:
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOLT:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETOGE:
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
+          break;
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETUGT:
+        // Converting this to a max would handle NaNs incorrectly, and swapping
+        // the operands would cause it to handle comparisons between positive
+        // and negative zero incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
+          if (!DAG.getTarget().Options.UnsafeFPMath &&
+              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETUGE:
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOGT:
+      case ISD::SETGT:
+      case ISD::SETGE:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    // Check for x CC y ? y : x -- a min/max with reversed arms.
+    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+      switch (CC) {
+      default: break;
+      case ISD::SETOGE:
+        // Converting this to a min would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETUGT:
+        // Converting this to a min would handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+          break;
+        Opcode = X86ISD::FMIN;
+        break;
+      case ISD::SETUGE:
+        // Converting this to a min would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOGT:
+      case ISD::SETGT:
+      case ISD::SETGE:
+        Opcode = X86ISD::FMIN;
+        break;
+
+      case ISD::SETULT:
+        // Converting this to a max would handle NaNs incorrectly.
+        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+          break;
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETOLE:
+        // Converting this to a max would handle comparisons between positive
+        // and negative zero incorrectly, and swapping the operands would
+        // cause it to handle NaNs incorrectly.
+        if (!DAG.getTarget().Options.UnsafeFPMath &&
+            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
+          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
+            break;
+          std::swap(LHS, RHS);
+        }
+        Opcode = X86ISD::FMAX;
+        break;
+      case ISD::SETULE:
+        // Converting this to a max would handle both negative zeros and NaNs
+        // incorrectly, but we can swap the operands to fix both.
+        std::swap(LHS, RHS);
+      case ISD::SETOLT:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        Opcode = X86ISD::FMAX;
+        break;
+      }
+    }
+
+    if (Opcode)
+      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
+  }
+
+  // If this is a select between two integer constants, try to do some
+  // optimizations.
+  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
+    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
+      // Don't do this for crazy integer types.
+      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
+        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
+        // so that TrueC (the true value) is larger than FalseC.
+        bool NeedsCondInvert = false;
+
+        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
+            // Efficiently invertible.
+            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
+             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
+              isa<ConstantSDNode>(Cond.getOperand(1))))) {
+          NeedsCondInvert = true;
+          std::swap(TrueC, FalseC);
+        }
+
+        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
+        if (FalseC->getAPIntValue() == 0 &&
+            TrueC->getAPIntValue().isPowerOf2()) {
+          if (NeedsCondInvert) // Invert the condition if needed.
+            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(1, Cond.getValueType()));
+
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+
+          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
+                             DAG.getConstant(ShAmt, MVT::i8));
+        }
+
+        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
+        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+          if (NeedsCondInvert) // Invert the condition if needed.
+            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(1, Cond.getValueType()));
+
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                             FalseC->getValueType(0), Cond);
+          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                             SDValue(FalseC, 0));
+        }
+
+        // Optimize cases that will turn into an LEA instruction.  This requires
+        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+          bool isFastMultiplier = false;
+          if (Diff < 10) {
+            switch ((unsigned char)Diff) {
+              default: break;
+              case 1:  // result = add base, cond
+              case 2:  // result = lea base(    , cond*2)
+              case 3:  // result = lea base(cond, cond*2)
+              case 4:  // result = lea base(    , cond*4)
+              case 5:  // result = lea base(cond, cond*4)
+              case 8:  // result = lea base(    , cond*8)
+              case 9:  // result = lea base(cond, cond*8)
+                isFastMultiplier = true;
+                break;
+            }
+          }
+
+          if (isFastMultiplier) {
+            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+            if (NeedsCondInvert) // Invert the condition if needed.
+              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                                 DAG.getConstant(1, Cond.getValueType()));
+
+            // Zero extend the condition if needed.
+            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+                               Cond);
+            // Scale the condition by the difference.
+            if (Diff != 1)
+              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                                 DAG.getConstant(Diff, Cond.getValueType()));
+
+            // Add the base if non-zero.
+            if (FalseC->getAPIntValue() != 0)
+              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                                 SDValue(FalseC, 0));
+            return Cond;
+          }
+        }
+      }
+  }
+
+  // Canonicalize max and min:
+  // (x > y) ? x : y -> (x >= y) ? x : y
+  // (x < y) ? x : y -> (x <= y) ? x : y
+  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
+  // the need for an extra compare
+  // against zero. e.g.
+  // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
+  // subl   %esi, %edi
+  // testl  %edi, %edi
+  // movl   $0, %eax
+  // cmovgl %edi, %eax
+  // =>
+  // xorl   %eax, %eax
+  // subl   %esi, $edi
+  // cmovsl %eax, %edi
+  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
+      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    switch (CC) {
+    default: break;
+    case ISD::SETLT:
+    case ISD::SETGT: {
+      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
+      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
+                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
+      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+    }
+    }
+  }
+
+  // Match VSELECTs into subs with unsigned saturation.
+  if (!DCI.isBeforeLegalize() &&
+      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+      ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+       (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+    // left side invert the predicate to simplify logic below.
+    SDValue Other;
+    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+      Other = RHS;
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+      Other = LHS;
+    }
+
+    if (Other.getNode() && Other->getNumOperands() == 2 &&
+        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+      SDValue CondRHS = Cond->getOperand(1);
+
+      // Look for a general sub with unsigned saturation first.
+      // x >= y ? x-y : 0 --> subus x, y
+      // x >  y ? x-y : 0 --> subus x, y
+      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+      // If the RHS is a constant we have to reverse the const canonicalization.
+      // x > C-1 ? x+-C : 0 --> subus x, C
+      if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+          isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
+        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+        if (CondRHS.getConstantOperandVal(0) == -A-1)
+          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
+                             DAG.getConstant(-A, VT));
+      }
+
+      // Another special case: If C was a sign bit, the sub has been
+      // canonicalized into a xor.
+      // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+      //        it's safe to decanonicalize the xor?
+      // x s< 0 ? x^C : 0 --> subus x, C
+      if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+          ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+          isSplatVector(OpRHS.getNode())) {
+        APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+        if (A.isSignBit())
+          return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+      }
+    }
+  }
+
+  // Try to match a min/max vector operation.
+  if (!DCI.isBeforeLegalize() &&
+      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
+    if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
+      return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+
+  // If we know that this node is legal then we know that it is going to be
+  // matched by one of the SSE/AVX BLEND instructions. These instructions only
+  // depend on the highest bit in each word. Try to use SimplifyDemandedBits
+  // to simplify previous instructions.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
+      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
+    unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
+
+    // Don't optimize vector selects that map to mask-registers.
+    if (BitWidth == 1)
+      return SDValue();
+
+    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
+    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
+                                          DCI.isBeforeLegalizeOps());
+    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+
+  return SDValue();
+}
+
+// Check whether a boolean test is testing a boolean value generated by
+// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
+// code.
+//
+// Simplify the following patterns:
+// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
+// to (Op EFLAGS Cond)
+//
+// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
+// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
+// to (Op EFLAGS !Cond)
+//
+// where Op could be BRCOND or CMOV.
+//
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+  // Quit if not CMP and SUB with its value result used.
+  if (Cmp.getOpcode() != X86ISD::CMP &&
+      (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
+      return SDValue();
+
+  // Quit if not used as a boolean value.
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  // Check CMP operands. One of them should be 0 or 1 and the other should be
+  // an SetCC or extended from it.
+  SDValue Op1 = Cmp.getOperand(0);
+  SDValue Op2 = Cmp.getOperand(1);
+
+  SDValue SetCC;
+  const ConstantSDNode* C = 0;
+  bool needOppositeCond = (CC == X86::COND_E);
+
+  if ((C = dyn_cast<ConstantSDNode>(Op1)))
+    SetCC = Op2;
+  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
+    SetCC = Op1;
+  else // Quit if all operands are not constants.
+    return SDValue();
+
+  if (C->getZExtValue() == 1)
+    needOppositeCond = !needOppositeCond;
+  else if (C->getZExtValue() != 0)
+    // Quit if the constant is neither 0 or 1.
+    return SDValue();
+
+  // Skip 'zext' or 'trunc' node.
+  if (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
+      SetCC.getOpcode() == ISD::TRUNCATE)
+    SetCC = SetCC.getOperand(0);
+
+  switch (SetCC.getOpcode()) {
+  case X86ISD::SETCC:
+    // Set the condition code or opposite one if necessary.
+    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(1);
+  case X86ISD::CMOV: {
+    // Check whether false/true value has canonical one, i.e. 0 or 1.
+    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+    // Quit if true value is not a constant.
+    if (!TVal)
+      return SDValue();
+    // Quit if false value is not a constant.
+    if (!FVal) {
+      SDValue Op = SetCC.getOperand(0);
+      // Skip 'zext' or 'trunc' node.
+      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
+          Op.getOpcode() == ISD::TRUNCATE)
+        Op = Op.getOperand(0);
+      // A special case for rdrand/rdseed, where 0 is set if false cond is
+      // found.
+      if ((Op.getOpcode() != X86ISD::RDRAND &&
+           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
+        return SDValue();
+    }
+    // Quit if false value is not the constant 0 or 1.
+    bool FValIsFalse = true;
+    if (FVal && FVal->getZExtValue() != 0) {
+      if (FVal->getZExtValue() != 1)
+        return SDValue();
+      // If FVal is 1, opposite cond is needed.
+      needOppositeCond = !needOppositeCond;
+      FValIsFalse = false;
+    }
+    // Quit if TVal is not the constant opposite of FVal.
+    if (FValIsFalse && TVal->getZExtValue() != 1)
+      return SDValue();
+    if (!FValIsFalse && TVal->getZExtValue() != 0)
+      return SDValue();
+    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+    if (needOppositeCond)
+      CC = X86::GetOppositeBranchCondition(CC);
+    return SetCC.getOperand(3);
+  }
+  }
+
+  return SDValue();
+}
+
+/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
+static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  DebugLoc DL = N->getDebugLoc();
+
+  // If the flag operand isn't dead, don't touch this CMOV.
+  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
+    return SDValue();
+
+  SDValue FalseOp = N->getOperand(0);
+  SDValue TrueOp = N->getOperand(1);
+  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
+  SDValue Cond = N->getOperand(3);
+
+  if (CC == X86::COND_E || CC == X86::COND_NE) {
+    switch (Cond.getOpcode()) {
+    default: break;
+    case X86ISD::BSR:
+    case X86ISD::BSF:
+      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
+      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
+        return (CC == X86::COND_E) ? FalseOp : TrueOp;
+    }
+  }
+
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(Cond, CC);
+  if (Flags.getNode() &&
+      // Extra check as FCMOV only supports a subset of X86 cond.
+      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
+    SDValue Ops[] = { FalseOp, TrueOp,
+                      DAG.getConstant(CC, MVT::i8), Flags };
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+                       Ops, array_lengthof(Ops));
+  }
+
+  // If this is a select between two integer constants, try to do some
+  // optimizations.  Note that the operands are ordered the opposite of SELECT
+  // operands.
+  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
+    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
+      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
+      // larger than FalseC (the false value).
+      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueC, FalseC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
+      // This is efficient for any integer data type (including i8/i16) and
+      // shift amount.
+      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
+        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                           DAG.getConstant(CC, MVT::i8), Cond);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
+
+        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
+        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
+                           DAG.getConstant(ShAmt, MVT::i8));
+        if (N->getNumValues() == 2)  // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
+      // for any integer data type, including i8/i16.
+      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
+        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                           DAG.getConstant(CC, MVT::i8), Cond);
+
+        // Zero extend the condition if needed.
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                           FalseC->getValueType(0), Cond);
+        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                           SDValue(FalseC, 0));
+
+        if (N->getNumValues() == 2)  // Dead flag value?
+          return DCI.CombineTo(N, Cond, SDValue());
+        return Cond;
+      }
+
+      // Optimize cases that will turn into an LEA instruction.  This requires
+      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
+      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
+        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
+        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
+
+        bool isFastMultiplier = false;
+        if (Diff < 10) {
+          switch ((unsigned char)Diff) {
+          default: break;
+          case 1:  // result = add base, cond
+          case 2:  // result = lea base(    , cond*2)
+          case 3:  // result = lea base(cond, cond*2)
+          case 4:  // result = lea base(    , cond*4)
+          case 5:  // result = lea base(cond, cond*4)
+          case 8:  // result = lea base(    , cond*8)
+          case 9:  // result = lea base(cond, cond*8)
+            isFastMultiplier = true;
+            break;
+          }
+        }
+
+        if (isFastMultiplier) {
+          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
+          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
+                             DAG.getConstant(CC, MVT::i8), Cond);
+          // Zero extend the condition if needed.
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
+                             Cond);
+          // Scale the condition by the difference.
+          if (Diff != 1)
+            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
+                               DAG.getConstant(Diff, Cond.getValueType()));
+
+          // Add the base if non-zero.
+          if (FalseC->getAPIntValue() != 0)
+            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
+                               SDValue(FalseC, 0));
+          if (N->getNumValues() == 2)  // Dead flag value?
+            return DCI.CombineTo(N, Cond, SDValue());
+          return Cond;
+        }
+      }
+    }
+  }
+
+  // Handle these cases:
+  //   (select (x != c), e, c) -> select (x != c), e, x),
+  //   (select (x == c), c, e) -> select (x == c), x, e)
+  // where the c is an integer constant, and the "select" is the combination
+  // of CMOV and CMP.
+  //
+  // The rationale for this change is that the conditional-move from a constant
+  // needs two instructions, however, conditional-move from a register needs
+  // only one instruction.
+  //
+  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
+  //  some instruction-combining opportunities. This opt needs to be
+  //  postponed as late as possible.
+  //
+  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+    // the DCI.xxxx conditions are provided to postpone the optimization as
+    // late as possible.
+
+    ConstantSDNode *CmpAgainst = 0;
+    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+        !isa<ConstantSDNode>(Cond.getOperand(0))) {
+
+      if (CC == X86::COND_NE &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+        CC = X86::GetOppositeBranchCondition(CC);
+        std::swap(TrueOp, FalseOp);
+      }
+
+      if (CC == X86::COND_E &&
+          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
+                          DAG.getConstant(CC, MVT::i8), Cond };
+        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
+                           array_lengthof(Ops));
+      }
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformMulCombine - Optimize a single multiply with constant into two
+/// in order to implement it with two cheaper instructions, e.g.
+/// LEA + SHL, LEA + LEA.
+static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!C)
+    return SDValue();
+  uint64_t MulAmt = C->getZExtValue();
+  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+    return SDValue();
+
+  uint64_t MulAmt1 = 0;
+  uint64_t MulAmt2 = 0;
+  if ((MulAmt % 9) == 0) {
+    MulAmt1 = 9;
+    MulAmt2 = MulAmt / 9;
+  } else if ((MulAmt % 5) == 0) {
+    MulAmt1 = 5;
+    MulAmt2 = MulAmt / 5;
+  } else if ((MulAmt % 3) == 0) {
+    MulAmt1 = 3;
+    MulAmt2 = MulAmt / 3;
+  }
+  if (MulAmt2 &&
+      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
+    DebugLoc DL = N->getDebugLoc();
+
+    if (isPowerOf2_64(MulAmt2) &&
+        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
+      // If second multiplifer is pow2, issue it first. We want the multiply by
+      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
+      // is an add.
+      std::swap(MulAmt1, MulAmt2);
+
+    SDValue NewMul;
+    if (isPowerOf2_64(MulAmt1))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
+    else
+      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                           DAG.getConstant(MulAmt1, VT));
+
+    if (isPowerOf2_64(MulAmt2))
+      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
+    else
+      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+                           DAG.getConstant(MulAmt2, VT));
+
+    // Do not add new nodes to DAG combiner worklist.
+    DCI.CombineTo(N, NewMul, false);
+  }
+  return SDValue();
+}
+
+static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+  // since the result of setcc_c is all zero's or all ones.
+  if (VT.isInteger() && !VT.isVector() &&
+      N1C && N0.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
+        ((N00.getOpcode() == ISD::ANY_EXTEND ||
+          N00.getOpcode() == ISD::ZERO_EXTEND) &&
+         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
+      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+      APInt ShAmt = N1C->getAPIntValue();
+      Mask = Mask.shl(ShAmt);
+      if (Mask != 0)
+        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                           N00, DAG.getConstant(Mask, VT));
+    }
+  }
+
+  // Hardware support for vector shifts is sparse which makes us scalarize the
+  // vector operations in many cases. Also, on sandybridge ADD is faster than
+  // shl.
+  // (shl V, 1) -> add V,V
+  if (isSplatVector(N1.getNode())) {
+    assert(N0.getValueType().isVector() && "Invalid vector shift type");
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1->getOperand(0));
+    // We shift all of the values by one. In many cases we do not have
+    // hardware support for this operation. This is better expressed as an ADD
+    // of two values.
+    if (N1C && (1 == N1C->getZExtValue())) {
+      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+    }
+  }
+
+  return SDValue();
+}
+
+/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
+///                       when possible.
+static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  if (N->getOpcode() == ISD::SHL) {
+    SDValue V = PerformSHLCombine(N, DAG);
+    if (V.getNode()) return V;
+  }
+
+  return SDValue();
+}
+
+// CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
+// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
+// and friends.  Likewise for OR -> CMPNEQSS.
+static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            const X86Subtarget *Subtarget) {
+  unsigned opcode;
+
+  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
+  // we're requiring SSE2 for both.
+  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue CMP0 = N0->getOperand(1);
+    SDValue CMP1 = N1->getOperand(1);
+    DebugLoc DL = N->getDebugLoc();
+
+    // The SETCCs should both refer to the same CMP.
+    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+      return SDValue();
+
+    SDValue CMP00 = CMP0->getOperand(0);
+    SDValue CMP01 = CMP0->getOperand(1);
+    EVT     VT    = CMP00.getValueType();
+
+    if (VT == MVT::f32 || VT == MVT::f64) {
+      bool ExpectingFlags = false;
+      // Check for any users that want flags:
+      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+           !ExpectingFlags && UI != UE; ++UI)
+        switch (UI->getOpcode()) {
+        default:
+        case ISD::BR_CC:
+        case ISD::BRCOND:
+        case ISD::SELECT:
+          ExpectingFlags = true;
+          break;
+        case ISD::CopyToReg:
+        case ISD::SIGN_EXTEND:
+        case ISD::ZERO_EXTEND:
+        case ISD::ANY_EXTEND:
+          break;
+        }
+
+      if (!ExpectingFlags) {
+        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
+        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
+
+        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
+          X86::CondCode tmp = cc0;
+          cc0 = cc1;
+          cc1 = tmp;
+        }
+
+        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
+            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
+          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
+          X86ISD::NodeType NTOperator = is64BitFP ?
+            X86ISD::FSETCCsd : X86ISD::FSETCCss;
+          // FIXME: need symbolic constants for these magic numbers.
+          // See X86ATTInstPrinter.cpp:printSSECC().
+          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
+          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
+                                              DAG.getConstant(x86cc, MVT::i8));
+          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
+                                              OnesOrZeroesF);
+          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
+                                      DAG.getConstant(1, MVT::i32));
+          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
+          return OneBitOfTruth;
+        }
+      }
+    }
+  }
+  return SDValue();
+}
+
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+  EVT VT = N->getValueType(0);
+
+  // Match direct AllOnes for 128 and 256-bit vectors
+  if (ISD::isBuildVectorAllOnes(N))
+    return true;
+
+  // Look through a bit convert.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0).getNode();
+
+  // Sometimes the operand may come from a insert_subvector building a 256-bit
+  // allones vector
+  if (VT.is256BitVector() &&
+      N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+    SDValue V1 = N->getOperand(0);
+    SDValue V2 = N->getOperand(1);
+
+    if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+        ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+        ISD::isBuildVectorAllOnes(V2.getNode()))
+      return true;
+  }
+
+  return false;
+}
+
+// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
+// register. In most cases we actually compare or select YMM-sized registers
+// and mixing the two types creates horrible code. This method optimizes
+// some of the transition sequences.
+static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.is256BitVector())
+    return SDValue();
+
+  assert((N->getOpcode() == ISD::ANY_EXTEND ||
+          N->getOpcode() == ISD::ZERO_EXTEND ||
+          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+
+  SDValue Narrow = N->getOperand(0);
+  EVT NarrowVT = Narrow->getValueType(0);
+  if (!NarrowVT.is128BitVector())
+    return SDValue();
+
+  if (Narrow->getOpcode() != ISD::XOR &&
+      Narrow->getOpcode() != ISD::AND &&
+      Narrow->getOpcode() != ISD::OR)
+    return SDValue();
+
+  SDValue N0  = Narrow->getOperand(0);
+  SDValue N1  = Narrow->getOperand(1);
+  DebugLoc DL = Narrow->getDebugLoc();
+
+  // The Left side has to be a trunc.
+  if (N0.getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+
+  // The type of the truncated inputs.
+  EVT WideVT = N0->getOperand(0)->getValueType(0);
+  if (WideVT != VT)
+    return SDValue();
+
+  // The right side has to be a 'trunc' or a constant vector.
+  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
+  bool RHSConst = (isSplatVector(N1.getNode()) &&
+                   isa<ConstantSDNode>(N1->getOperand(0)));
+  if (!RHSTrunc && !RHSConst)
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
+    return SDValue();
+
+  // Set N0 and N1 to hold the inputs to the new wide operation.
+  N0 = N0->getOperand(0);
+  if (RHSConst) {
+    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
+                     N1->getOperand(0));
+    SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
+    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
+  } else if (RHSTrunc) {
+    N1 = N1->getOperand(0);
+  }
+
+  // Generate the wide operation.
+  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
+  unsigned Opcode = N->getOpcode();
+  switch (Opcode) {
+  case ISD::ANY_EXTEND:
+    return Op;
+  case ISD::ZERO_EXTEND: {
+    unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
+    APInt Mask = APInt::getAllOnesValue(InBits);
+    Mask = Mask.zext(VT.getScalarType().getSizeInBits());
+    return DAG.getNode(ISD::AND, DL, VT,
+                       Op, DAG.getConstant(Mask, VT));
+  }
+  case ISD::SIGN_EXTEND:
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
+                       Op, DAG.getValueType(NarrowVT));
+  default:
+    llvm_unreachable("Unexpected opcode");
+  }
+}
+
+static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+  if (R.getNode())
+    return R;
+
+  // Create BLSI, and BLSR instructions
+  // BLSI is X & (-X)
+  // BLSR is X & (X-1)
+  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    DebugLoc DL = N->getDebugLoc();
+
+    // Check LHS for neg
+    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
+        isZero(N0.getOperand(0)))
+      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
+
+    // Check RHS for neg
+    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
+        isZero(N1.getOperand(0)))
+      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+
+    // Check LHS for X-1
+    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+        isAllOnes(N0.getOperand(1)))
+      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+
+    // Check RHS for X-1
+    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+        isAllOnes(N1.getOperand(1)))
+      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+
+    return SDValue();
+  }
+
+  // Want to form ANDNP nodes:
+  // 1) In the hopes of then easily combining them with OR and AND nodes
+  //    to form PBLEND/PSIGN.
+  // 2) To match ANDN packed intrinsics
+  if (VT != MVT::v2i64 && VT != MVT::v4i64)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Check LHS for vnot
+  if (N0.getOpcode() == ISD::XOR &&
+      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
+
+  // Check RHS for vnot
+  if (N1.getOpcode() == ISD::XOR &&
+      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
+    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
+
+  return SDValue();
+}
+
+static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
+  if (R.getNode())
+    return R;
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // look for psign/blend
+  if (VT == MVT::v2i64 || VT == MVT::v4i64) {
+    if (!Subtarget->hasSSSE3() ||
+        (VT == MVT::v4i64 && !Subtarget->hasInt256()))
+      return SDValue();
+
+    // Canonicalize pandn to RHS
+    if (N0.getOpcode() == X86ISD::ANDNP)
+      std::swap(N0, N1);
+    // or (and (m, y), (pandn m, x))
+    if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
+      SDValue Mask = N1.getOperand(0);
+      SDValue X    = N1.getOperand(1);
+      SDValue Y;
+      if (N0.getOperand(0) == Mask)
+        Y = N0.getOperand(1);
+      if (N0.getOperand(1) == Mask)
+        Y = N0.getOperand(0);
+
+      // Check to see if the mask appeared in both the AND and ANDNP and
+      if (!Y.getNode())
+        return SDValue();
+
+      // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
+      // Look through mask bitcast.
+      if (Mask.getOpcode() == ISD::BITCAST)
+        Mask = Mask.getOperand(0);
+      if (X.getOpcode() == ISD::BITCAST)
+        X = X.getOperand(0);
+      if (Y.getOpcode() == ISD::BITCAST)
+        Y = Y.getOperand(0);
+
+      EVT MaskVT = Mask.getValueType();
+
+      // Validate that the Mask operand is a vector sra node.
+      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+      // there is no psrai.b
+      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+      unsigned SraAmt = ~0;
+      if (Mask.getOpcode() == ISD::SRA) {
+        SDValue Amt = Mask.getOperand(1);
+        if (isSplatVector(Amt.getNode())) {
+          SDValue SclrAmt = Amt->getOperand(0);
+          if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt))
+            SraAmt = C->getZExtValue();
+        }
+      } else if (Mask.getOpcode() == X86ISD::VSRAI) {
+        SDValue SraC = Mask.getOperand(1);
+        SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
+      }
+      if ((SraAmt + 1) != EltBits)
+        return SDValue();
+
+      DebugLoc DL = N->getDebugLoc();
+
+      // Now we know we at least have a plendvb with the mask val.  See if
+      // we can form a psignb/w/d.
+      // psign = x.type == y.type == mask.type && y = sub(0, x);
+      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
+          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
+          X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+        assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
+               "Unsupported VT for PSIGN");
+        Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+      }
+      // PBLENDVB only available on SSE 4.1
+      if (!Subtarget->hasSSE41())
+        return SDValue();
+
+      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+
+      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
+      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
+      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
+      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
+    }
+  }
+
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+    std::swap(N0, N1);
+  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+    return SDValue();
+  if (!N0.hasOneUse() || !N1.hasOneUse())
+    return SDValue();
+
+  SDValue ShAmt0 = N0.getOperand(1);
+  if (ShAmt0.getValueType() != MVT::i8)
+    return SDValue();
+  SDValue ShAmt1 = N1.getOperand(1);
+  if (ShAmt1.getValueType() != MVT::i8)
+    return SDValue();
+  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
+    ShAmt0 = ShAmt0.getOperand(0);
+  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
+    ShAmt1 = ShAmt1.getOperand(0);
+
+  DebugLoc DL = N->getDebugLoc();
+  unsigned Opc = X86ISD::SHLD;
+  SDValue Op0 = N0.getOperand(0);
+  SDValue Op1 = N1.getOperand(0);
+  if (ShAmt0.getOpcode() == ISD::SUB) {
+    Opc = X86ISD::SHRD;
+    std::swap(Op0, Op1);
+    std::swap(ShAmt0, ShAmt1);
+  }
+
+  unsigned Bits = VT.getSizeInBits();
+  if (ShAmt1.getOpcode() == ISD::SUB) {
+    SDValue Sum = ShAmt1.getOperand(0);
+    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
+      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
+      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
+        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
+      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
+        return DAG.getNode(Opc, DL, VT,
+                           Op0, Op1,
+                           DAG.getNode(ISD::TRUNCATE, DL,
+                                       MVT::i8, ShAmt0));
+    }
+  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
+    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
+    if (ShAmt0C &&
+        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
+      return DAG.getNode(Opc, DL, VT,
+                         N0.getOperand(0), N1.getOperand(0),
+                         DAG.getNode(ISD::TRUNCATE, DL,
+                                       MVT::i8, ShAmt0));
+  }
+
+  return SDValue();
+}
+
+// Generate NEG and CMOV for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  // Since X86 does not have CMOV for 8-bit integer, we don't convert
+  // 8-bit integer abs to NEG and CMOV.
+  if (VT.isInteger() && VT.getSizeInBits() == 8)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+  // and change it to SUB and CMOV.
+  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+      N0.getOpcode() == ISD::ADD &&
+      N0.getOperand(1) == N1 &&
+      N1.getOpcode() == ISD::SRA &&
+      N1.getOperand(0) == N0.getOperand(0))
+    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
+        // Generate SUB & CMOV.
+        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+                                  DAG.getConstant(0, VT), N0.getOperand(0));
+
+        SDValue Ops[] = { N0.getOperand(0), Neg,
+                          DAG.getConstant(X86::COND_GE, MVT::i8),
+                          SDValue(Neg.getNode(), 1) };
+        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
+                           Ops, array_lengthof(Ops));
+      }
+  return SDValue();
+}
+
+// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
+static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (Subtarget->hasCMov()) {
+    SDValue RV = performIntegerAbsCombine(N, DAG);
+    if (RV.getNode())
+      return RV;
+  }
+
+  // Try forming BMI if it is available.
+  if (!Subtarget->hasBMI())
+    return SDValue();
+
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  assert(Subtarget->hasBMI() && "Creating BLSMSK requires BMI instructions");
+
+  // Create BLSMSK instructions by finding X ^ (X-1)
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  DebugLoc DL = N->getDebugLoc();
+
+  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+      isAllOnes(N0.getOperand(1)))
+    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N1);
+
+  if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+      isAllOnes(N1.getOperand(1)))
+    return DAG.getNode(X86ISD::BLSMSK, DL, VT, N0);
+
+  return SDValue();
+}
+
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  EVT RegVT = Ld->getValueType(0);
+  EVT MemVT = Ld->getMemoryVT();
+  DebugLoc dl = Ld->getDebugLoc();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned RegSz = RegVT.getSizeInBits();
+
+  // On Sandybridge unaligned 256bit loads are inefficient.
+  ISD::LoadExtType Ext = Ld->getExtensionType();
+  unsigned Alignment = Ld->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
+  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+      !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
+    unsigned NumElems = RegVT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+
+    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                                  NumElems/2);
+    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                Alignment);
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+                                Ld->getPointerInfo(), Ld->isVolatile(),
+                                Ld->isNonTemporal(), Ld->isInvariant(),
+                                std::min(16U, Alignment));
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             Load1.getValue(1),
+                             Load2.getValue(1));
+
+    SDValue NewVec = DAG.getUNDEF(RegVT);
+    NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+    NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+    return DCI.CombineTo(N, NewVec, TF, true);
+  }
+
+  // If this is a vector EXT Load then attempt to optimize it using a
+  // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+  // expansion is still better than scalar code.
+  // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+  // emit a shuffle and a arithmetic shift.
+  // TODO: It is possible to support ZExt by zeroing the undef values
+  // during the shuffle phase or after the shuffle.
+  if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+      (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
+    assert(MemVT != RegVT && "Cannot extend to the same type");
+    assert(MemVT.isVector() && "Must load a vector from memory");
+
+    unsigned NumElems = RegVT.getVectorNumElements();
+    unsigned MemSz = MemVT.getSizeInBits();
+    assert(RegSz > MemSz && "Register size must be greater than the mem size");
+
+    if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+      return SDValue();
+
+    // All sizes must be a power of two.
+    if (!isPowerOf2_32(RegSz * MemSz * NumElems))
+      return SDValue();
+
+    // Attempt to load the original value using scalar loads.
+    // Find the largest scalar type that divides the total loaded size.
+    MVT SclrLoadTy = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
+        SclrLoadTy = Tp;
+      }
+    }
+
+    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+    if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
+        (64 <= MemSz))
+      SclrLoadTy = MVT::f64;
+
+    // Calculate the number of scalar loads that we need to perform
+    // in order to load our vector from memory.
+    unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+    if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+      return SDValue();
+
+    unsigned loadRegZize = RegSz;
+    if (Ext == ISD::SEXTLOAD && RegSz == 256)
+      loadRegZize /= 2;
+
+    // Represent our vector as a sequence of elements which are the
+    // largest scalar that we can load.
+    EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+      loadRegZize/SclrLoadTy.getSizeInBits());
+
+    // Represent the data using the same element type that is stored in
+    // memory. In practice, we ''widen'' MemVT.
+    EVT WideVecVT =
+          EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+                       loadRegZize/MemVT.getScalarType().getSizeInBits());
+
+    assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
+      "Invalid vector type");
+
+    // We can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    SmallVector<SDValue, 8> Chains;
+    SDValue Ptr = Ld->getBasePtr();
+    SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
+
+    for (unsigned i = 0; i < NumLoads; ++i) {
+      // Perform a single load.
+      SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+                                       Ptr, Ld->getPointerInfo(),
+                                       Ld->isVolatile(), Ld->isNonTemporal(),
+                                       Ld->isInvariant(), Ld->getAlignment());
+      Chains.push_back(ScalarLoad.getValue(1));
+      // Create the first element type using SCALAR_TO_VECTOR in order to avoid
+      // another round of DAGCombining.
+      if (i == 0)
+        Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
+      else
+        Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
+                          ScalarLoad, DAG.getIntPtrConstant(i));
+
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+    }
+
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+                               Chains.size());
+
+    // Bitcast the loaded value to a vector of the original element type, in
+    // the size of the target vector type.
+    SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
+    unsigned SizeRatio = RegSz/MemSz;
+
+    if (Ext == ISD::SEXTLOAD) {
+      // If we have SSE4.1 we can directly emit a VSEXT node.
+      if (Subtarget->hasSSE41()) {
+        SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+        return DCI.CombineTo(N, Sext, TF, true);
+      }
+
+      // Otherwise we'll shuffle the small elements in the high bits of the
+      // larger type and perform an arithmetic shift. If the shift is not legal
+      // it's better to scalarize.
+      if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+        return SDValue();
+
+      // Redistribute the loaded elements into the different locations.
+      SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+      for (unsigned i = 0; i != NumElems; ++i)
+        ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+      SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                           DAG.getUNDEF(WideVecVT),
+                                           &ShuffleVec[0]);
+
+      Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+      // Build the arithmetic shift.
+      unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+                     MemVT.getVectorElementType().getSizeInBits();
+      Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
+                          DAG.getConstant(Amt, RegVT));
+
+      return DCI.CombineTo(N, Shuff, TF, true);
+    }
+
+    // Redistribute the loaded elements into the different locations.
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i*SizeRatio] = i;
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                         DAG.getUNDEF(WideVecVT),
+                                         &ShuffleVec[0]);
+
+    // Bitcast to the requested type.
+    Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+    // Replace the original load with the new sequence
+    // and return the new chain.
+    return DCI.CombineTo(N, Shuff, TF, true);
+  }
+
+  return SDValue();
+}
+
+/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
+static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
+                                   const X86Subtarget *Subtarget) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  EVT VT = St->getValue().getValueType();
+  EVT StVT = St->getMemoryVT();
+  DebugLoc dl = St->getDebugLoc();
+  SDValue StoredVal = St->getOperand(1);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // If we are saving a concatenation of two XMM registers, perform two stores.
+  // On Sandy Bridge, 256-bit memory operations are executed by two
+  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
+  // memory  operation.
+  unsigned Alignment = St->getAlignment();
+  bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
+  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+      StVT == VT && !IsAligned) {
+    unsigned NumElems = VT.getVectorNumElements();
+    if (NumElems < 2)
+      return SDValue();
+
+    SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
+    SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
+
+    SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+    SDValue Ptr0 = St->getBasePtr();
+    SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), Alignment);
+    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(),
+                                std::min(16U, Alignment));
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+  }
+
+  // Optimize trunc store (of multiple scalars) to shuffle and store.
+  // First, pack all of the elements in one place. Next, store to memory
+  // in fewer chunks.
+  if (St->isTruncatingStore() && VT.isVector()) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    unsigned NumElems = VT.getVectorNumElements();
+    assert(StVT != VT && "Cannot truncate to the same type");
+    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+    // From, To sizes and ElemCount must be pow of two
+    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+    // We are going to use the original vector elt for storing.
+    // Accumulated smaller vector elements must be a multiple of the store size.
+    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+    unsigned SizeRatio  = FromSz / ToSz;
+
+    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+    // Create a type on which we perform the shuffle
+    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StVT.getScalarType(), NumElems*SizeRatio);
+
+    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+    for (unsigned i = 0; i != NumElems; ++i)
+      ShuffleVec[i] = i * SizeRatio;
+
+    // Can't shuffle using an illegal type.
+    if (!TLI.isTypeLegal(WideVecVT))
+      return SDValue();
+
+    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+                                         DAG.getUNDEF(WideVecVT),
+                                         &ShuffleVec[0]);
+    // At this point all of the data is stored at the bottom of the
+    // register. We now need to save it to mem.
+
+    // Find the largest store unit
+    MVT StoreType = MVT::i8;
+    for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+         tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+      MVT Tp = (MVT::SimpleValueType)tp;
+      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
+        StoreType = Tp;
+    }
+
+    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
+    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
+        (64 <= NumElems * ToSz))
+      StoreType = MVT::f64;
+
+    // Bitcast the original vector into a vector of store-size units
+    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
+    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+    SmallVector<SDValue, 8> Chains;
+    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+                                        TLI.getPointerTy());
+    SDValue Ptr = St->getBasePtr();
+
+    // Perform one or more big stores into memory.
+    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
+      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                                   StoreType, ShuffWide,
+                                   DAG.getIntPtrConstant(i));
+      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+                                St->getPointerInfo(), St->isVolatile(),
+                                St->isNonTemporal(), St->getAlignment());
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+      Chains.push_back(Ch);
+    }
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+                               Chains.size());
+  }
+
+  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
+  // the FP state in cases where an emms may be missing.
+  // A preferable solution to the general problem is to figure out the right
+  // places to insert EMMS.  This qualifies as a quick hack.
+
+  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
+  if (VT.getSizeInBits() != 64)
+    return SDValue();
+
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool NoImplicitFloatOps = F->getAttributes().
+    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
+                     && Subtarget->hasSSE2();
+  if ((VT.isVector() ||
+       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
+      isa<LoadSDNode>(St->getValue()) &&
+      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
+      St->getChain().hasOneUse() && !St->isVolatile()) {
+    SDNode* LdVal = St->getValue().getNode();
+    LoadSDNode *Ld = 0;
+    int TokenFactorIndex = -1;
+    SmallVector<SDValue, 8> Ops;
+    SDNode* ChainVal = St->getChain().getNode();
+    // Must be a store of a load.  We currently handle two cases:  the load
+    // is a direct child, and it's under an intervening TokenFactor.  It is
+    // possible to dig deeper under nested TokenFactors.
+    if (ChainVal == LdVal)
+      Ld = cast<LoadSDNode>(St->getChain());
+    else if (St->getValue().hasOneUse() &&
+             ChainVal->getOpcode() == ISD::TokenFactor) {
+      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
+        if (ChainVal->getOperand(i).getNode() == LdVal) {
+          TokenFactorIndex = i;
+          Ld = cast<LoadSDNode>(St->getValue());
+        } else
+          Ops.push_back(ChainVal->getOperand(i));
+      }
+    }
+
+    if (!Ld || !ISD::isNormalLoad(Ld))
+      return SDValue();
+
+    // If this is not the MMX case, i.e. we are just turning i64 load/store
+    // into f64 load/store, avoid the transformation if there are multiple
+    // uses of the loaded value.
+    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+      return SDValue();
+
+    DebugLoc LdDL = Ld->getDebugLoc();
+    DebugLoc StDL = N->getDebugLoc();
+    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
+    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
+    // pair instead.
+    if (Subtarget->is64Bit() || F64IsLegal) {
+      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
+      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
+                                  Ld->getPointerInfo(), Ld->isVolatile(),
+                                  Ld->isNonTemporal(), Ld->isInvariant(),
+                                  Ld->getAlignment());
+      SDValue NewChain = NewLd.getValue(1);
+      if (TokenFactorIndex != -1) {
+        Ops.push_back(NewChain);
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+                               Ops.size());
+      }
+      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
+                          St->getPointerInfo(),
+                          St->isVolatile(), St->isNonTemporal(),
+                          St->getAlignment());
+    }
+
+    // Otherwise, lower to two pairs of 32-bit loads / stores.
+    SDValue LoAddr = Ld->getBasePtr();
+    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
+                                 DAG.getConstant(4, MVT::i32));
+
+    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
+                               Ld->getPointerInfo(),
+                               Ld->isVolatile(), Ld->isNonTemporal(),
+                               Ld->isInvariant(), Ld->getAlignment());
+    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
+                               Ld->getPointerInfo().getWithOffset(4),
+                               Ld->isVolatile(), Ld->isNonTemporal(),
+                               Ld->isInvariant(),
+                               MinAlign(Ld->getAlignment(), 4));
+
+    SDValue NewChain = LoLd.getValue(1);
+    if (TokenFactorIndex != -1) {
+      Ops.push_back(LoLd);
+      Ops.push_back(HiLd);
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
+                             Ops.size());
+    }
+
+    LoAddr = St->getBasePtr();
+    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
+                         DAG.getConstant(4, MVT::i32));
+
+    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
+                                St->getPointerInfo(),
+                                St->isVolatile(), St->isNonTemporal(),
+                                St->getAlignment());
+    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
+                                St->getPointerInfo().getWithOffset(4),
+                                St->isVolatile(),
+                                St->isNonTemporal(),
+                                MinAlign(St->getAlignment(), 4));
+    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+  }
+  return SDValue();
+}
+
+/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS.  A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector.  For example, if
+///   A = < float a0, float a1, float a2, float a3 >
+/// and
+///   B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+  // Look for the following pattern: if
+  //   A = < float a0, float a1, float a2, float a3 >
+  //   B = < float b0, float b1, float b2, float b3 >
+  // and
+  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+  // which is A horizontal-op B.
+
+  // At least one of the operands should be a vector shuffle.
+  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  EVT VT = LHS.getValueType();
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unsupported vector type for horizontal add/sub");
+
+  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
+  // operate independently on 128-bit lanes.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLaneElts = NumElts / NumLanes;
+  assert((NumLaneElts % 2 == 0) &&
+         "Vector type should have an even number of elements in each lane");
+  unsigned HalfLaneElts = NumLaneElts/2;
+
+  // View LHS in the form
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  // If LHS is not a shuffle then pretend it is the shuffle
+  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+  // type VT.
+  SDValue A, B;
+  SmallVector<int, 16> LMask(NumElts);
+  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      A = LHS.getOperand(0);
+    if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      B = LHS.getOperand(1);
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
+    std::copy(Mask.begin(), Mask.end(), LMask.begin());
+  } else {
+    if (LHS.getOpcode() != ISD::UNDEF)
+      A = LHS;
+    for (unsigned i = 0; i != NumElts; ++i)
+      LMask[i] = i;
+  }
+
+  // Likewise, view RHS in the form
+  //   RHS = VECTOR_SHUFFLE C, D, RMask
+  SDValue C, D;
+  SmallVector<int, 16> RMask(NumElts);
+  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+      C = RHS.getOperand(0);
+    if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+      D = RHS.getOperand(1);
+    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
+    std::copy(Mask.begin(), Mask.end(), RMask.begin());
+  } else {
+    if (RHS.getOpcode() != ISD::UNDEF)
+      C = RHS;
+    for (unsigned i = 0; i != NumElts; ++i)
+      RMask[i] = i;
+  }
+
+  // Check that the shuffles are both shuffling the same vectors.
+  if (!(A == C && B == D) && !(A == D && B == C))
+    return false;
+
+  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+  if (!A.getNode() && !B.getNode())
+    return false;
+
+  // If A and B occur in reverse order in RHS, then "swap" them (which means
+  // rewriting the mask).
+  if (A != C)
+    CommuteVectorShuffleMask(RMask, NumElts);
+
+  // At this point LHS and RHS are equivalent to
+  //   LHS = VECTOR_SHUFFLE A, B, LMask
+  //   RHS = VECTOR_SHUFFLE A, B, RMask
+  // Check that the masks correspond to performing a horizontal operation.
+  for (unsigned i = 0; i != NumElts; ++i) {
+    int LIdx = LMask[i], RIdx = RMask[i];
+
+    // Ignore any UNDEF components.
+    if (LIdx < 0 || RIdx < 0 ||
+        (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+        (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+      continue;
+
+    // Check that successive elements are being operated on.  If not, this is
+    // not a horizontal operation.
+    unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
+    unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
+    int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
+    if (!(LIdx == Index && RIdx == Index + 1) &&
+        !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
+      return false;
+  }
+
+  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+  return true;
+}
+
+/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+      isHorizontalBinOp(LHS, RHS, true))
+    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+  return SDValue();
+}
+
+/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+                                  const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  // Try to synthesize horizontal subs from subs of shuffles.
+  if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+       (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
+      isHorizontalBinOp(LHS, RHS, false))
+    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+  return SDValue();
+}
+
+/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
+/// X86ISD::FXOR nodes.
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+  // F[X]OR(0.0, x) -> x
+  // F[X]OR(x, 0.0) -> x
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(0);
+  return SDValue();
+}
+
+/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
+/// X86ISD::FMAX nodes.
+static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+  // Only perform optimizations if UnsafeMath is used.
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
+  // into FMINC and FMAXC, which are Commutative operations.
+  unsigned NewOp = 0;
+  switch (N->getOpcode()) {
+    default: llvm_unreachable("unknown opcode");
+    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
+    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
+  }
+
+  return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
+                     N->getOperand(0), N->getOperand(1));
+}
+
+/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+  // FAND(0.0, x) -> 0.0
+  // FAND(x, 0.0) -> 0.0
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(0);
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  return SDValue();
+}
+
+static SDValue PerformBTCombine(SDNode *N,
+                                SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  // BT ignores high bits in the bit index operand.
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.hasOneUse()) {
+    unsigned BitWidth = Op1.getValueSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+  return SDValue();
+}
+
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  if (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
+  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+      VT.getVectorElementType().getSizeInBits() ==
+      OpVT.getVectorElementType().getSizeInBits()) {
+    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+  }
+  return SDValue();
+}
+
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, 
+                                               const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+  DebugLoc dl = N->getDebugLoc();
+
+  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
+  // both SSE and AVX2 since there is no sign-extended shift right
+  // operation on a vector with 64-bit elements.
+  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
+  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
+  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
+      N0.getOpcode() == ISD::SIGN_EXTEND)) {
+    SDValue N00 = N0.getOperand(0);
+
+    // EXTLOAD has a better solution on AVX2, 
+    // it may be replaced with X86ISD::VSEXT node.
+    if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
+      if (!ISD::isNormalLoad(N00.getNode()))
+        return SDValue();
+
+    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
+        SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, 
+                                  N00, N1);
+      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
+    }
+  }
+  return SDValue();
+}
+
+static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  if (!Subtarget->hasFp256())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT.isVector() && VT.getSizeInBits() == 256) {
+    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+    if (R.getNode())
+      return R;
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget* Subtarget) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  EVT ScalarVT = VT.getScalarType();
+  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+      (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
+    return SDValue();
+
+  SDValue A = N->getOperand(0);
+  SDValue B = N->getOperand(1);
+  SDValue C = N->getOperand(2);
+
+  bool NegA = (A.getOpcode() == ISD::FNEG);
+  bool NegB = (B.getOpcode() == ISD::FNEG);
+  bool NegC = (C.getOpcode() == ISD::FNEG);
+
+  // Negative multiplication when NegA xor NegB
+  bool NegMul = (NegA != NegB);
+  if (NegA)
+    A = A.getOperand(0);
+  if (NegB)
+    B = B.getOperand(0);
+  if (NegC)
+    C = C.getOperand(0);
+
+  unsigned Opcode;
+  if (!NegMul)
+    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+  else
+    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
+  return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
+static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const X86Subtarget *Subtarget) {
+  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
+  //           (and (i32 x86isd::setcc_carry), 1)
+  // This eliminates the zext. This transformation is necessary because
+  // ISD::SETCC is always legalized to i8.
+  DebugLoc dl = N->getDebugLoc();
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (N0.getOpcode() == ISD::AND &&
+      N0.hasOneUse() &&
+      N0.getOperand(0).hasOneUse()) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (!C || C->getZExtValue() != 1)
+        return SDValue();
+      return DAG.getNode(ISD::AND, dl, VT,
+                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+                                     N00.getOperand(0), N00.getOperand(1)),
+                         DAG.getConstant(1, VT));
+    }
+  }
+
+  if (VT.is256BitVector()) {
+    SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+    if (R.getNode())
+      return R;
+  }
+
+  return SDValue();
+}
+
+// Optimize x == -y --> x+y == 0
+//          x != -y --> x+y != 0
+static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
+      if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
+        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+                                   LHS.getValueType(), RHS, LHS.getOperand(1));
+        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+                            addV, DAG.getConstant(0, addV.getValueType()), CC);
+      }
+  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
+      if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
+        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+                                   RHS.getValueType(), LHS, RHS.getOperand(1));
+        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+                            addV, DAG.getConstant(0, addV.getValueType()), CC);
+      }
+  return SDValue();
+}
+
+// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
+// as "sbb reg,reg", since it can be extended without zext and produces
+// an all-ones bit which is more useful than 0/1 in some cases.
+static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
+  return DAG.getNode(ISD::AND, DL, MVT::i8,
+                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+                                 DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
+                     DAG.getConstant(1, MVT::i8));
+}
+
+// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  DebugLoc DL = N->getDebugLoc();
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
+  SDValue EFLAGS = N->getOperand(1);
+
+  if (CC == X86::COND_A) {
+    // Try to convert COND_A into COND_B in an attempt to facilitate
+    // materializing "setb reg".
+    //
+    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+    // cannot take an immediate as its first operand.
+    //
+    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+        EFLAGS.getValueType().isInteger() &&
+        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+      SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
+                                   EFLAGS.getNode()->getVTList(),
+                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+      return MaterializeSETB(DL, NewEFLAGS, DAG);
+    }
+  }
+
+  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
+  // a zext and produces an all-ones bit which is more useful than 0/1 in some
+  // cases.
+  if (CC == X86::COND_B)
+    return MaterializeSETB(DL, EFLAGS, DAG);
+
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+  }
+
+  return SDValue();
+}
+
+// Optimize branch condition evaluation.
+//
+static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const X86Subtarget *Subtarget) {
+  DebugLoc DL = N->getDebugLoc();
+  SDValue Chain = N->getOperand(0);
+  SDValue Dest = N->getOperand(1);
+  SDValue EFLAGS = N->getOperand(3);
+  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
+
+  SDValue Flags;
+
+  Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+  if (Flags.getNode()) {
+    SDValue Cond = DAG.getConstant(CC, MVT::i8);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+                       Flags);
+  }
+
+  return SDValue();
+}
+
+static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
+                                        const X86TargetLowering *XTLI) {
+  SDValue Op0 = N->getOperand(0);
+  EVT InVT = Op0->getValueType(0);
+
+  // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
+  if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
+    DebugLoc dl = N->getDebugLoc();
+    MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
+    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+    return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
+  }
+
+  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
+  // a 32-bit target where SSE doesn't support i64->FP operations.
+  if (Op0.getOpcode() == ISD::LOAD) {
+    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
+    EVT VT = Ld->getValueType(0);
+    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
+        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
+        !XTLI->getSubtarget()->is64Bit() &&
+        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
+                                          Ld->getChain(), Op0, DAG);
+      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
+      return FILDChain;
+    }
+  }
+  return SDValue();
+}
+
+// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
+static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
+                                 X86TargetLowering::DAGCombinerInfo &DCI) {
+  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
+  // the result is either zero or one (depending on the input carry bit).
+  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
+  if (X86::isZeroNode(N->getOperand(0)) &&
+      X86::isZeroNode(N->getOperand(1)) &&
+      // We don't have a good way to replace an EFLAGS use, so only do this when
+      // dead right now.
+      SDValue(N, 1).use_empty()) {
+    DebugLoc DL = N->getDebugLoc();
+    EVT VT = N->getValueType(0);
+    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
+    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
+                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+                                           DAG.getConstant(X86::COND_B,MVT::i8),
+                                           N->getOperand(2)),
+                               DAG.getConstant(1, VT));
+    return DCI.CombineTo(N, Res1, CarryOut);
+  }
+
+  return SDValue();
+}
+
+// fold (add Y, (sete  X, 0)) -> adc  0, Y
+//      (add Y, (setne X, 0)) -> sbb -1, Y
+//      (sub (sete  X, 0), Y) -> sbb  0, Y
+//      (sub (setne X, 0), Y) -> adc -1, Y
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+  DebugLoc DL = N->getDebugLoc();
+
+  // Look through ZExts.
+  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
+  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
+    return SDValue();
+
+  SDValue SetCC = Ext.getOperand(0);
+  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+    return SDValue();
+
+  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+  if (CC != X86::COND_E && CC != X86::COND_NE)
+    return SDValue();
+
+  SDValue Cmp = SetCC.getOperand(1);
+  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
+      !X86::isZeroNode(Cmp.getOperand(1)) ||
+      !Cmp.getOperand(0).getValueType().isInteger())
+    return SDValue();
+
+  SDValue CmpOp0 = Cmp.getOperand(0);
+  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
+                               DAG.getConstant(1, CmpOp0.getValueType()));
+
+  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+  if (CC == X86::COND_NE)
+    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
+                       DL, OtherVal.getValueType(), OtherVal,
+                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
+  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
+                     DL, OtherVal.getValueType(), OtherVal,
+                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
+}
+
+/// PerformADDCombine - Do target-specific dag combines on integer adds.
+static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // X86 can't encode an immediate LHS of a sub. See if we can push the
+  // negation into a preceding instruction.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+    // If the RHS of the sub is a XOR with one use and a constant, invert the
+    // immediate. Then add one to the LHS of the sub so we can turn
+    // X-Y -> X+~Y+1, saving one register.
+    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+        isa<ConstantSDNode>(Op1.getOperand(1))) {
+      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+      EVT VT = Op0.getValueType();
+      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
+                                   Op1.getOperand(0),
+                                   DAG.getConstant(~XorC, VT));
+      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
+                         DAG.getConstant(C->getAPIntValue()+1, VT));
+    }
+  }
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  EVT VT = N->getValueType(0);
+  if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
+       (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+/// performVZEXTCombine - Performs build vector combines
+static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const X86Subtarget *Subtarget) {
+  // (vzext (bitcast (vzext (x)) -> (vzext x)
+  SDValue In = N->getOperand(0);
+  while (In.getOpcode() == ISD::BITCAST)
+    In = In.getOperand(0);
+
+  if (In.getOpcode() != X86ISD::VZEXT)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
+                     In.getOperand(0));
+}
+
+SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::EXTRACT_VECTOR_ELT:
+    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
+  case ISD::VSELECT:
+  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
+  case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
+  case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
+  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
+  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
+  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
+  case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
+  case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
+  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
+  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
+  case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
+  case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
+  case X86ISD::FXOR:
+  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
+  case X86ISD::FMIN:
+  case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
+  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
+  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
+  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+  case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
+  case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG);
+  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
+  case X86ISD::SHUFP:       // Handle all target specific shuffles
+  case X86ISD::PALIGNR:
+  case X86ISD::UNPCKH:
+  case X86ISD::UNPCKL:
+  case X86ISD::MOVHLPS:
+  case X86ISD::MOVLHPS:
+  case X86ISD::PSHUFD:
+  case X86ISD::PSHUFHW:
+  case X86ISD::PSHUFLW:
+  case X86ISD::MOVSS:
+  case X86ISD::MOVSD:
+  case X86ISD::VPERMILP:
+  case X86ISD::VPERM2X128:
+  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+  case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  }
+
+  return SDValue();
+}
+
+/// isTypeDesirableForOp - Return true if the target has native support for
+/// the specified value type and it is 'desirable' to use the type for the
+/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+/// instruction encodings are longer and some i16 instructions are slow.
+bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
+  if (!isTypeLegal(VT))
+    return false;
+  if (VT != MVT::i16)
+    return true;
+
+  switch (Opc) {
+  default:
+    return true;
+  case ISD::LOAD:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SUB:
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    return false;
+  }
+}
+
+/// IsDesirableToPromoteOp - This method query the target whether it is
+/// beneficial for dag combiner to promote the specified node. If true, it
+/// should return the desired promotion type by reference.
+bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
+  EVT VT = Op.getValueType();
+  if (VT != MVT::i16)
+    return false;
+
+  bool Promote = false;
+  bool Commute = false;
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(Op);
+    // If the non-extending load has a single use and it's not live out, then it
+    // might be folded.
+    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
+                                                     Op.hasOneUse()*/) {
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+        // The only case where we'd want to promote LOAD (rather then it being
+        // promoted as an operand is when it's only use is liveout.
+        if (UI->getOpcode() != ISD::CopyToReg)
+          return false;
+      }
+    }
+    Promote = true;
+    break;
+  }
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    Promote = true;
+    break;
+  case ISD::SHL:
+  case ISD::SRL: {
+    SDValue N0 = Op.getOperand(0);
+    // Look out for (store (shl (load), x)).
+    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
+      return false;
+    Promote = true;
+    break;
+  }
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    Commute = true;
+    // fallthrough
+  case ISD::SUB: {
+    SDValue N0 = Op.getOperand(0);
+    SDValue N1 = Op.getOperand(1);
+    if (!Commute && MayFoldLoad(N1))
+      return false;
+    // Avoid disabling potential load folding opportunities.
+    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
+      return false;
+    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
+      return false;
+    Promote = true;
+  }
+  }
+
+  PVT = MVT::i32;
+  return Promote;
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  // Helper to match a string separated by whitespace.
+  bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
+    s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
+
+    for (unsigned i = 0, e = args.size(); i != e; ++i) {
+      StringRef piece(*args[i]);
+      if (!s.startswith(piece)) // Check if the piece matches.
+        return false;
+
+      s = s.substr(piece.size());
+      StringRef::size_type pos = s.find_first_not_of(" \t");
+      if (pos == 0) // We matched a prefix.
+        return false;
+
+      s = s.substr(pos);
+    }
+
+    return s.empty();
+  }
+  const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
+}
+
+bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+
+  std::string AsmStr = IA->getAsmString();
+
+  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty || Ty->getBitWidth() % 16 != 0)
+    return false;
+
+  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+  SmallVector<StringRef, 4> AsmPieces;
+  SplitString(AsmStr, AsmPieces, ";\n");
+
+  switch (AsmPieces.size()) {
+  default: return false;
+  case 1:
+    // FIXME: this should verify that we are targeting a 486 or better.  If not,
+    // we will turn this bswap into something that will be lowered to logical
+    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
+    // lower so don't worry about this.
+    // bswap $0
+    if (matchAsm(AsmPieces[0], "bswap", "$0") ||
+        matchAsm(AsmPieces[0], "bswapl", "$0") ||
+        matchAsm(AsmPieces[0], "bswapq", "$0") ||
+        matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
+        matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
+        matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
+      // No need to check constraints, nothing other than the equivalent of
+      // "=r,0" would be valid here.
+      return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+
+    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
+    if (CI->getType()->isIntegerTy(16) &&
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
+         matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
+      AsmPieces.clear();
+      const std::string &ConstraintsStr = IA->getConstraintString();
+      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+      if (AsmPieces.size() == 4 &&
+          AsmPieces[0] == "~{cc}" &&
+          AsmPieces[1] == "~{dirflag}" &&
+          AsmPieces[2] == "~{flags}" &&
+          AsmPieces[3] == "~{fpsr}")
+      return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+    break;
+  case 3:
+    if (CI->getType()->isIntegerTy(32) &&
+        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
+        matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
+        matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
+        matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
+      AsmPieces.clear();
+      const std::string &ConstraintsStr = IA->getConstraintString();
+      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
+      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
+      if (AsmPieces.size() == 4 &&
+          AsmPieces[0] == "~{cc}" &&
+          AsmPieces[1] == "~{dirflag}" &&
+          AsmPieces[2] == "~{flags}" &&
+          AsmPieces[3] == "~{fpsr}")
+        return IntrinsicLowering::LowerToByteSwap(CI);
+    }
+
+    if (CI->getType()->isIntegerTy(64)) {
+      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+      if (Constraints.size() >= 2 &&
+          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
+        if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
+            matchAsm(AsmPieces[1], "bswap", "%edx") &&
+            matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
+          return IntrinsicLowering::LowerToByteSwap(CI);
+      }
+    }
+    break;
+  }
+  return false;
+}
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'R':
+    case 'q':
+    case 'Q':
+    case 'f':
+    case 't':
+    case 'u':
+    case 'y':
+    case 'x':
+    case 'Y':
+    case 'l':
+      return C_RegisterClass;
+    case 'a':
+    case 'b':
+    case 'c':
+    case 'd':
+    case 'S':
+    case 'D':
+    case 'A':
+      return C_Register;
+    case 'I':
+    case 'J':
+    case 'K':
+    case 'L':
+    case 'M':
+    case 'N':
+    case 'G':
+    case 'C':
+    case 'e':
+    case 'Z':
+      return C_Other;
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+  X86TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+    // If we don't have a value, we can't do a match,
+    // but allow it at the lowest weight.
+  if (CallOperandVal == NULL)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
+  default:
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+  case 'R':
+  case 'q':
+  case 'Q':
+  case 'a':
+  case 'b':
+  case 'c':
+  case 'd':
+  case 'S':
+  case 'D':
+  case 'A':
+    if (CallOperandVal->getType()->isIntegerTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'f':
+  case 't':
+  case 'u':
+    if (type->isFloatingPointTy())
+      weight = CW_SpecificReg;
+    break;
+  case 'y':
+    if (type->isX86_MMXTy() && Subtarget->hasMMX())
+      weight = CW_SpecificReg;
+    break;
+  case 'x':
+  case 'Y':
+    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
+        ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
+      weight = CW_Register;
+    break;
+  case 'I':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
+      if (C->getZExtValue() <= 31)
+        weight = CW_Constant;
+    }
+    break;
+  case 'J':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 63)
+        weight = CW_Constant;
+    }
+    break;
+  case 'K':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
+        weight = CW_Constant;
+    }
+    break;
+  case 'L':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
+        weight = CW_Constant;
+    }
+    break;
+  case 'M':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 3)
+        weight = CW_Constant;
+    }
+    break;
+  case 'N':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xff)
+        weight = CW_Constant;
+    }
+    break;
+  case 'G':
+  case 'C':
+    if (dyn_cast<ConstantFP>(CallOperandVal)) {
+      weight = CW_Constant;
+    }
+    break;
+  case 'e':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if ((C->getSExtValue() >= -0x80000000LL) &&
+          (C->getSExtValue() <= 0x7fffffffLL))
+        weight = CW_Constant;
+    }
+    break;
+  case 'Z':
+    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
+      if (C->getZExtValue() <= 0xffffffff)
+        weight = CW_Constant;
+    }
+    break;
+  }
+  return weight;
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+  // 'f' like normal targets.
+  if (ConstraintVT.isFloatingPoint()) {
+    if (Subtarget->hasSSE2())
+      return "Y";
+    if (Subtarget->hasSSE1())
+      return "x";
+  }
+
+  return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     std::string &Constraint,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+
+  // Only support length 1 constraints for now.
+  if (Constraint.length() > 1) return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default: break;
+  case 'I':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 31) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'J':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 63) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'K':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (isInt<8>(C->getSExtValue())) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'N':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getZExtValue() <= 255) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    return;
+  case 'e': {
+    // 32-bit signed value
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+                                           C->getSExtValue())) {
+        // Widen to 64 bits here to get it sign extended.
+        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
+        break;
+      }
+    // FIXME gcc accepts some relocatable values here too, but only in certain
+    // memory models; it's complicated.
+    }
+    return;
+  }
+  case 'Z': {
+    // 32-bit unsigned value
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
+                                           C->getZExtValue())) {
+        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
+        break;
+      }
+    }
+    // FIXME gcc accepts some relocatable values here too, but only in certain
+    // memory models; it's complicated.
+    return;
+  }
+  case 'i': {
+    // Literal immediates are always ok.
+    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
+      // Widen to 64 bits here to get it sign extended.
+      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
+      break;
+    }
+
+    // In any sort of PIC mode addresses need to be computed at runtime by
+    // adding in a register or some sort of table lookup.  These can't
+    // be used as immediates.
+    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
+      return;
+
+    // If we are in non-pic codegen mode, we allow the address of a global (with
+    // an optional displacement) to be used with 'i'.
+    GlobalAddressSDNode *GA = 0;
+    int64_t Offset = 0;
+
+    // Match either (GA), (GA+C), (GA+C1+C2), etc.
+    while (1) {
+      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
+        Offset += GA->getOffset();
+        break;
+      } else if (Op.getOpcode() == ISD::ADD) {
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+          Offset += C->getZExtValue();
+          Op = Op.getOperand(0);
+          continue;
+        }
+      } else if (Op.getOpcode() == ISD::SUB) {
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+          Offset += -C->getZExtValue();
+          Op = Op.getOperand(0);
+          continue;
+        }
+      }
+
+      // Otherwise, this isn't something we can handle, reject it.
+      return;
+    }
+
+    const GlobalValue *GV = GA->getGlobal();
+    // If we require an extra load to get this address, as in PIC mode, we
+    // can't accept it.
+    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
+                                                        getTargetMachine())))
+      return;
+
+    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+                                        GA->getValueType(0), Offset);
+    break;
+  }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                EVT VT) const {
+  // First, see if this is a constraint that directly corresponds to an LLVM
+  // register class.
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+      // TODO: Slight differences here in allocation order and leaving
+      // RIP in the class. Do they matter any more here than they do
+      // in the normal allocation?
+    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+      if (Subtarget->is64Bit()) {
+        if (VT == MVT::i32 || VT == MVT::f32)
+          return std::make_pair(0U, &X86::GR32RegClass);
+        if (VT == MVT::i16)
+          return std::make_pair(0U, &X86::GR16RegClass);
+        if (VT == MVT::i8 || VT == MVT::i1)
+          return std::make_pair(0U, &X86::GR8RegClass);
+        if (VT == MVT::i64 || VT == MVT::f64)
+          return std::make_pair(0U, &X86::GR64RegClass);
+        break;
+      }
+      // 32-bit fallthrough
+    case 'Q':   // Q_REGS
+      if (VT == MVT::i32 || VT == MVT::f32)
+        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
+      if (VT == MVT::i64)
+        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
+      break;
+    case 'r':   // GENERAL_REGS
+    case 'l':   // INDEX_REGS
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8RegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16RegClass);
+      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
+        return std::make_pair(0U, &X86::GR32RegClass);
+      return std::make_pair(0U, &X86::GR64RegClass);
+    case 'R':   // LEGACY_REGS
+      if (VT == MVT::i8 || VT == MVT::i1)
+        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
+      if (VT == MVT::i16)
+        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
+      if (VT == MVT::i32 || !Subtarget->is64Bit())
+        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
+      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+    case 'f':  // FP Stack registers.
+      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
+      // value to the correct fpstack register class.
+      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
+        return std::make_pair(0U, &X86::RFP32RegClass);
+      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
+        return std::make_pair(0U, &X86::RFP64RegClass);
+      return std::make_pair(0U, &X86::RFP80RegClass);
+    case 'y':   // MMX_REGS if MMX allowed.
+      if (!Subtarget->hasMMX()) break;
+      return std::make_pair(0U, &X86::VR64RegClass);
+    case 'Y':   // SSE_REGS if SSE2 allowed
+      if (!Subtarget->hasSSE2()) break;
+      // FALL THROUGH.
+    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
+      if (!Subtarget->hasSSE1()) break;
+
+      switch (VT.getSimpleVT().SimpleTy) {
+      default: break;
+      // Scalar SSE types.
+      case MVT::f32:
+      case MVT::i32:
+        return std::make_pair(0U, &X86::FR32RegClass);
+      case MVT::f64:
+      case MVT::i64:
+        return std::make_pair(0U, &X86::FR64RegClass);
+      // Vector types.
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+      case MVT::v4f32:
+      case MVT::v2f64:
+        return std::make_pair(0U, &X86::VR128RegClass);
+      // AVX types.
+      case MVT::v32i8:
+      case MVT::v16i16:
+      case MVT::v8i32:
+      case MVT::v4i64:
+      case MVT::v8f32:
+      case MVT::v4f64:
+        return std::make_pair(0U, &X86::VR256RegClass);
+      }
+      break;
+    }
+  }
+
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass*> Res;
+  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+  // Not found as a standard register?
+  if (Res.second == 0) {
+    // Map st(0) -> st(7) -> ST0
+    if (Constraint.size() == 7 && Constraint[0] == '{' &&
+        tolower(Constraint[1]) == 's' &&
+        tolower(Constraint[2]) == 't' &&
+        Constraint[3] == '(' &&
+        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
+        Constraint[5] == ')' &&
+        Constraint[6] == '}') {
+
+      Res.first = X86::ST0+Constraint[4]-'0';
+      Res.second = &X86::RFP80RegClass;
+      return Res;
+    }
+
+    // GCC allows "st(0)" to be called just plain "st".
+    if (StringRef("{st}").equals_lower(Constraint)) {
+      Res.first = X86::ST0;
+      Res.second = &X86::RFP80RegClass;
+      return Res;
+    }
+
+    // flags -> EFLAGS
+    if (StringRef("{flags}").equals_lower(Constraint)) {
+      Res.first = X86::EFLAGS;
+      Res.second = &X86::CCRRegClass;
+      return Res;
+    }
+
+    // 'A' means EAX + EDX.
+    if (Constraint == "A") {
+      Res.first = X86::EAX;
+      Res.second = &X86::GR32_ADRegClass;
+      return Res;
+    }
+    return Res;
+  }
+
+  // Otherwise, check to see if this is a register class of the wrong value
+  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+  // turn into {ax},{dx}.
+  if (Res.second->hasType(VT))
+    return Res;   // Correct type already, nothing to do.
+
+  // All of the single-register GCC register classes map their values onto
+  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
+  // really want an 8-bit or 32-bit register, map to the appropriate register
+  // class and return the appropriate register.
+  if (Res.second == &X86::GR16RegClass) {
+    if (VT == MVT::i8 || VT == MVT::i1) {
+      unsigned DestReg = 0;
+      switch (Res.first) {
+      default: break;
+      case X86::AX: DestReg = X86::AL; break;
+      case X86::DX: DestReg = X86::DL; break;
+      case X86::CX: DestReg = X86::CL; break;
+      case X86::BX: DestReg = X86::BL; break;
+      }
+      if (DestReg) {
+        Res.first = DestReg;
+        Res.second = &X86::GR8RegClass;
+      }
+    } else if (VT == MVT::i32 || VT == MVT::f32) {
+      unsigned DestReg = 0;
+      switch (Res.first) {
+      default: break;
+      case X86::AX: DestReg = X86::EAX; break;
+      case X86::DX: DestReg = X86::EDX; break;
+      case X86::CX: DestReg = X86::ECX; break;
+      case X86::BX: DestReg = X86::EBX; break;
+      case X86::SI: DestReg = X86::ESI; break;
+      case X86::DI: DestReg = X86::EDI; break;
+      case X86::BP: DestReg = X86::EBP; break;
+      case X86::SP: DestReg = X86::ESP; break;
+      }
+      if (DestReg) {
+        Res.first = DestReg;
+        Res.second = &X86::GR32RegClass;
+      }
+    } else if (VT == MVT::i64 || VT == MVT::f64) {
+      unsigned DestReg = 0;
+      switch (Res.first) {
+      default: break;
+      case X86::AX: DestReg = X86::RAX; break;
+      case X86::DX: DestReg = X86::RDX; break;
+      case X86::CX: DestReg = X86::RCX; break;
+      case X86::BX: DestReg = X86::RBX; break;
+      case X86::SI: DestReg = X86::RSI; break;
+      case X86::DI: DestReg = X86::RDI; break;
+      case X86::BP: DestReg = X86::RBP; break;
+      case X86::SP: DestReg = X86::RSP; break;
+      }
+      if (DestReg) {
+        Res.first = DestReg;
+        Res.second = &X86::GR64RegClass;
+      }
+    }
+  } else if (Res.second == &X86::FR32RegClass ||
+             Res.second == &X86::FR64RegClass ||
+             Res.second == &X86::VR128RegClass) {
+    // Handle references to XMM physical registers that got mapped into the
+    // wrong class.  This can happen with constraints like {xmm0} where the
+    // target independent register mapper will just pick the first match it can
+    // find, ignoring the required type.
+
+    if (VT == MVT::f32 || VT == MVT::i32)
+      Res.second = &X86::FR32RegClass;
+    else if (VT == MVT::f64 || VT == MVT::i64)
+      Res.second = &X86::FR64RegClass;
+    else if (X86::VR128RegClass.hasType(VT))
+      Res.second = &X86::VR128RegClass;
+    else if (X86::VR256RegClass.hasType(VT))
+      Res.second = &X86::VR256RegClass;
+  }
+
+  return Res;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 000000000000..5725f7aea581
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,950 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+namespace llvm {
+  namespace X86ISD {
+    // X86 Specific DAG Nodes
+    enum NodeType {
+      // Start the numbering where the builtin ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// BSF - Bit scan forward.
+      /// BSR - Bit scan reverse.
+      BSF,
+      BSR,
+
+      /// SHLD, SHRD - Double shift instructions. These correspond to
+      /// X86::SHLDxx and X86::SHRDxx instructions.
+      SHLD,
+      SHRD,
+
+      /// FAND - Bitwise logical AND of floating point values. This corresponds
+      /// to X86::ANDPS or X86::ANDPD.
+      FAND,
+
+      /// FOR - Bitwise logical OR of floating point values. This corresponds
+      /// to X86::ORPS or X86::ORPD.
+      FOR,
+
+      /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+      /// to X86::XORPS or X86::XORPD.
+      FXOR,
+
+      /// FSRL - Bitwise logical right shift of floating point values. These
+      /// corresponds to X86::PSRLDQ.
+      FSRL,
+
+      /// CALL - These operations represent an abstract X86 call
+      /// instruction, which includes a bunch of information.  In particular the
+      /// operands of these node are:
+      ///
+      ///     #0 - The incoming token chain
+      ///     #1 - The callee
+      ///     #2 - The number of arg bytes the caller pushes on the stack.
+      ///     #3 - The number of arg bytes the callee pops off the stack.
+      ///     #4 - The value to pass in AL/AX/EAX (optional)
+      ///     #5 - The value to pass in DL/DX/EDX (optional)
+      ///
+      /// The result values of these nodes are:
+      ///
+      ///     #0 - The outgoing token chain
+      ///     #1 - The first register result value (optional)
+      ///     #2 - The second register result value (optional)
+      ///
+      CALL,
+
+      /// RDTSC_DAG - This operation implements the lowering for
+      /// readcyclecounter
+      RDTSC_DAG,
+
+      /// X86 compare and logical compare instructions.
+      CMP, COMI, UCOMI,
+
+      /// X86 bit-test instructions.
+      BT,
+
+      /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+      /// operand, usually produced by a CMP instruction.
+      SETCC,
+
+      // Same as SETCC except it's materialized with a sbb and the value is all
+      // one's or all zero's.
+      SETCC_CARRY,  // R = carry_bit ? ~0 : 0
+
+      /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+      /// Operands are two FP values to compare; result is a mask of
+      /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
+      FSETCCss, FSETCCsd,
+
+      /// X86 MOVMSK{pd|ps}, extracts sign bits of two or four FP values,
+      /// result in an integer GPR.  Needs masking for scalar result.
+      FGETSIGNx86,
+
+      /// X86 conditional moves. Operand 0 and operand 1 are the two values
+      /// to select from. Operand 2 is the condition code, and operand 3 is the
+      /// flag operand produced by a CMP or TEST instruction. It also writes a
+      /// flag result.
+      CMOV,
+
+      /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+      /// is the block to branch if condition is true, operand 2 is the
+      /// condition code, and operand 3 is the flag operand produced by a CMP
+      /// or TEST instruction.
+      BRCOND,
+
+      /// Return with a flag operand. Operand 0 is the chain operand, operand
+      /// 1 is the number of bytes of stack to pop.
+      RET_FLAG,
+
+      /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+      REP_STOS,
+
+      /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+      REP_MOVS,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// Wrapper - A wrapper node for TargetConstantPool,
+      /// TargetExternalSymbol, and TargetGlobalAddress.
+      Wrapper,
+
+      /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+      /// relative displacements.
+      WrapperRIP,
+
+      /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector
+      /// to an MMX vector.  If you think this is too close to the previous
+      /// mnemonic, so do I; blame Intel.
+      MOVDQ2Q,
+
+      /// MMX_MOVD2W - Copies a 32-bit value from the low word of a MMX
+      /// vector to a GPR.
+      MMX_MOVD2W,
+
+      /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRB.
+      PEXTRB,
+
+      /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRW.
+      PEXTRW,
+
+      /// INSERTPS - Insert any element of a 4 x float vector into any element
+      /// of a destination 4 x floatvector.
+      INSERTPS,
+
+      /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRB.
+      PINSRB,
+
+      /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRW.
+      PINSRW, MMX_PINSRW,
+
+      /// PSHUFB - Shuffle 16 8-bit values within a vector.
+      PSHUFB,
+
+      /// ANDNP - Bitwise Logical AND NOT of Packed FP values.
+      ANDNP,
+
+      /// PSIGN - Copy integer sign.
+      PSIGN,
+
+      /// BLENDV - Blend where the selector is a register.
+      BLENDV,
+
+      /// BLENDI - Blend where the selector is an immediate.
+      BLENDI,
+
+      // SUBUS - Integer sub with unsigned saturation.
+      SUBUS,
+
+      /// HADD - Integer horizontal add.
+      HADD,
+
+      /// HSUB - Integer horizontal sub.
+      HSUB,
+
+      /// FHADD - Floating point horizontal add.
+      FHADD,
+
+      /// FHSUB - Floating point horizontal sub.
+      FHSUB,
+
+      /// UMAX, UMIN - Unsigned integer max and min.
+      UMAX, UMIN,
+
+      /// SMAX, SMIN - Signed integer max and min.
+      SMAX, SMIN,
+
+      /// FMAX, FMIN - Floating point max and min.
+      ///
+      FMAX, FMIN,
+
+      /// FMAXC, FMINC - Commutative FMIN and FMAX.
+      FMAXC, FMINC,
+
+      /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+      /// approximation.  Note that these typically require refinement
+      /// in order to obtain suitable precision.
+      FRSQRT, FRCP,
+
+      // TLSADDR - Thread Local Storage.
+      TLSADDR,
+
+      // TLSBASEADDR - Thread Local Storage. A call to get the start address
+      // of the TLS block for the current module.
+      TLSBASEADDR,
+
+      // TLSCALL - Thread Local Storage.  When calling to an OS provided
+      // thunk at the address from an earlier relocation.
+      TLSCALL,
+
+      // EH_RETURN - Exception Handling helpers.
+      EH_RETURN,
+
+      // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+      EH_SJLJ_SETJMP,
+
+      // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+      EH_SJLJ_LONGJMP,
+
+      /// TC_RETURN - Tail call return. See X86TargetLowering::LowerCall for
+      /// the list of operands.
+      TC_RETURN,
+
+      // VZEXT_MOVL - Vector move low and zero extend.
+      VZEXT_MOVL,
+
+      // VSEXT_MOVL - Vector move low and sign extend.
+      VSEXT_MOVL,
+
+      // VZEXT - Vector integer zero-extend.
+      VZEXT,
+
+      // VSEXT - Vector integer signed-extend.
+      VSEXT,
+
+      // VFPEXT - Vector FP extend.
+      VFPEXT,
+
+      // VFPROUND - Vector FP round.
+      VFPROUND,
+
+      // VSHL, VSRL - 128-bit vector logical left / right shift
+      VSHLDQ, VSRLDQ,
+
+      // VSHL, VSRL, VSRA - Vector shift elements
+      VSHL, VSRL, VSRA,
+
+      // VSHLI, VSRLI, VSRAI - Vector shift elements by immediate
+      VSHLI, VSRLI, VSRAI,
+
+      // CMPP - Vector packed double/float comparison.
+      CMPP,
+
+      // PCMP* - Vector integer comparisons.
+      PCMPEQ, PCMPGT,
+
+      // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
+      ADD, SUB, ADC, SBB, SMUL,
+      INC, DEC, OR, XOR, AND,
+
+      BLSI,   // BLSI - Extract lowest set isolated bit
+      BLSMSK, // BLSMSK - Get mask up to lowest set bit
+      BLSR,   // BLSR - Reset lowest set bit
+
+      UMUL, // LOW, HI, FLAGS = umul LHS, RHS
+
+      // MUL_IMM - X86 specific multiply by immediate.
+      MUL_IMM,
+
+      // PTEST - Vector bitwise comparisons
+      PTEST,
+
+      // TESTP - Vector packed fp sign bitwise comparisons
+      TESTP,
+
+      // Several flavors of instructions with vector shuffle behaviors.
+      PALIGNR,
+      PSHUFD,
+      PSHUFHW,
+      PSHUFLW,
+      SHUFP,
+      MOVDDUP,
+      MOVSHDUP,
+      MOVSLDUP,
+      MOVLHPS,
+      MOVLHPD,
+      MOVHLPS,
+      MOVLPS,
+      MOVLPD,
+      MOVSD,
+      MOVSS,
+      UNPCKL,
+      UNPCKH,
+      VPERMILP,
+      VPERMV,
+      VPERMI,
+      VPERM2X128,
+      VBROADCAST,
+
+      // PMULUDQ - Vector multiply packed unsigned doubleword integers
+      PMULUDQ,
+
+      // FMA nodes
+      FMADD,
+      FNMADD,
+      FMSUB,
+      FNMSUB,
+      FMADDSUB,
+      FMSUBADD,
+
+      // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
+      // according to %al. An operator is needed so that this can be expanded
+      // with control flow.
+      VASTART_SAVE_XMM_REGS,
+
+      // WIN_ALLOCA - Windows's _chkstk call to do stack probing.
+      WIN_ALLOCA,
+
+      // SEG_ALLOCA - For allocating variable amounts of stack space when using
+      // segmented stacks. Check if the current stacklet has enough space, and
+      // falls back to heap allocation if not.
+      SEG_ALLOCA,
+
+      // WIN_FTOL - Windows's _ftol2 runtime routine to do fptoui.
+      WIN_FTOL,
+
+      // Memory barrier
+      MEMBARRIER,
+      MFENCE,
+      SFENCE,
+      LFENCE,
+
+      // FNSTSW16r - Store FP status word into i16 register.
+      FNSTSW16r,
+
+      // SAHF - Store contents of %ah into %eflags.
+      SAHF,
+
+      // RDRAND - Get a random integer and indicate whether it is valid in CF.
+      RDRAND,
+
+      // RDSEED - Get a NIST SP800-90B & C compliant random integer and
+      // indicate whether it is valid in CF.
+      RDSEED,
+
+      // PCMP*STRI
+      PCMPISTRI,
+      PCMPESTRI,
+
+      // XTEST - Test if in transactional execution.
+      XTEST,
+
+      // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
+      // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
+      // Atomic 64-bit binary operations.
+      ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      ATOMSUB64_DAG,
+      ATOMOR64_DAG,
+      ATOMXOR64_DAG,
+      ATOMAND64_DAG,
+      ATOMNAND64_DAG,
+      ATOMMAX64_DAG,
+      ATOMMIN64_DAG,
+      ATOMUMAX64_DAG,
+      ATOMUMIN64_DAG,
+      ATOMSWAP64_DAG,
+
+      // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
+      LCMPXCHG_DAG,
+      LCMPXCHG8_DAG,
+      LCMPXCHG16_DAG,
+
+      // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
+      VZEXT_LOAD,
+
+      // FNSTCW16m - Store FP control world into i16 memory.
+      FNSTCW16m,
+
+      /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+      /// integer destination in memory and a FP reg source.  This corresponds
+      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+      /// has two inputs (token chain and address) and two outputs (int value
+      /// and token chain).
+      FP_TO_INT16_IN_MEM,
+      FP_TO_INT32_IN_MEM,
+      FP_TO_INT64_IN_MEM,
+
+      /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+      /// integer source in memory and FP reg result.  This corresponds to the
+      /// X86::FILD*m instructions. It has three inputs (token chain, address,
+      /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+      /// also produces a flag).
+      FILD,
+      FILD_FLAG,
+
+      /// FLD - This instruction implements an extending load to FP stack slots.
+      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+      /// operand, ptr to load from, and a ValueType node indicating the type
+      /// to load to.
+      FLD,
+
+      /// FST - This instruction implements a truncating store to FP stack
+      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+      /// chain operand, value to store, address, and a ValueType to store it
+      /// as.
+      FST,
+
+      /// VAARG_64 - This instruction grabs the address of the next argument
+      /// from a va_list. (reads and modifies the va_list in memory)
+      VAARG_64
+
+      // WARNING: Do not add anything in the end unless you want the node to
+      // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be
+      // thought as target memory ops!
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace X86 {
+    /// isVEXTRACTF128Index - Return true if the specified
+    /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+    /// suitable for input to VEXTRACTF128.
+    bool isVEXTRACTF128Index(SDNode *N);
+
+    /// isVINSERTF128Index - Return true if the specified
+    /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+    /// suitable for input to VINSERTF128.
+    bool isVINSERTF128Index(SDNode *N);
+
+    /// getExtractVEXTRACTF128Immediate - Return the appropriate
+    /// immediate to extract the specified EXTRACT_SUBVECTOR index
+    /// with VEXTRACTF128 instructions.
+    unsigned getExtractVEXTRACTF128Immediate(SDNode *N);
+
+    /// getInsertVINSERTF128Immediate - Return the appropriate
+    /// immediate to insert at the specified INSERT_SUBVECTOR index
+    /// with VINSERTF128 instructions.
+    unsigned getInsertVINSERTF128Immediate(SDNode *N);
+
+    /// isZeroNode - Returns true if Elt is a constant zero or a floating point
+    /// constant +0.0.
+    bool isZeroNode(SDValue Elt);
+
+    /// isOffsetSuitableForCodeModel - Returns true of the given offset can be
+    /// fit into displacement field of the instruction.
+    bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                      bool hasSymbolicDisplacement = true);
+
+
+    /// isCalleePop - Determines whether the callee is required to pop its
+    /// own arguments. Callee pop is necessary to support tail calls.
+    bool isCalleePop(CallingConv::ID CallingConv,
+                     bool is64Bit, bool IsVarArg, bool TailCallOpt);
+  }
+
+  //===--------------------------------------------------------------------===//
+  //  X86TargetLowering - X86 Implementation of the TargetLowering interface
+  class X86TargetLowering : public TargetLowering {
+  public:
+    explicit X86TargetLowering(X86TargetMachine &TM);
+
+    virtual unsigned getJumpTableEncoding() const;
+
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+
+    virtual const MCExpr *
+    LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                              const MachineBasicBlock *MBB, unsigned uid,
+                              MCContext &Ctx) const;
+
+    /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC
+    /// jumptable.
+    virtual SDValue getPICJumpTableRelocBase(SDValue Table,
+                                             SelectionDAG &DAG) const;
+    virtual const MCExpr *
+    getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                 unsigned JTI, MCContext &Ctx) const;
+
+    /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area. For X86, aggregates
+    /// that contains are placed at 16-byte boundaries while the rest are at
+    /// 4-byte boundaries.
+    virtual unsigned getByValTypeAlignment(Type *Ty) const;
+
+    /// getOptimalMemOpType - Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. If DstAlign is zero that means it's safe to destination
+    /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
+    /// means there isn't a need to check it against alignment requirement,
+    /// probably because the source does not need to be loaded. If 'IsMemset' is
+    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
+    /// It returns EVT::Other if the type should be determined using generic
+    /// target-independent logic.
+    virtual EVT
+    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, 
+                        bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                        MachineFunction &MF) const;
+
+    /// isSafeMemOpType - Returns true if it's safe to use load / store of the
+    /// specified type to expand memcpy / memset inline. This is mostly true
+    /// for all types except for some special cases. For example, on X86
+    /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+    /// also does type conversion. Note the specified type doesn't have to be
+    /// legal as the hook is used before type legalization.
+    virtual bool isSafeMemOpType(MVT VT) const;
+
+    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// unaligned memory accesses. of the specified type. Returns whether it
+    /// is "fast" by reference in the second argument.
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG) const;
+
+
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    /// isTypeDesirableForOp - Return true if the target has native support for
+    /// the specified value type and it is 'desirable' to use the type for the
+    /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
+    /// instruction encodings are longer and some i16 instructions are slow.
+    virtual bool isTypeDesirableForOp(unsigned Opc, EVT VT) const;
+
+    /// isTypeDesirable - Return true if the target has native support for the
+    /// specified value type and it is 'desirable' to use the type. e.g. On x86
+    /// i16 is legal, but undesirable since i16 instruction encodings are longer
+    /// and some i16 instructions are slow.
+    virtual bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const;
+
+    virtual MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const;
+
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+    virtual EVT getSetCCResultType(EVT VT) const;
+
+    /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+    /// in Mask are known to be either zero or one and return them in the
+    /// KnownZero/KnownOne bitsets.
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                APInt &KnownZero,
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+
+    // ComputeNumSignBitsForTargetNode - Determine the number of bits in the
+    // operation that are sign bits.
+    virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                     unsigned Depth) const;
+
+    virtual bool
+    isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const;
+
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+
+    virtual bool ExpandInlineAsm(CallInst *CI) const;
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    /// Examine constraint string and operand type and determine a weight value.
+    /// The operand object must already have been set up with the operand type.
+    virtual ConstraintWeight getSingleConstraintMatchWeight(
+      AsmOperandInfo &info, const char *constraint) const;
+
+    virtual const char *LowerXConstraint(EVT ConstraintVT) const;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    virtual void LowerAsmOperandForConstraint(SDValue Op,
+                                              std::string &Constraint,
+                                              std::vector<SDValue> &Ops,
+                                              SelectionDAG &DAG) const;
+
+    /// getRegForInlineAsmConstraint - Given a physical register constraint
+    /// (e.g. {edx}), return the register number and the register class for the
+    /// register.  This should only be used for C_Register constraints.  On
+    /// error, this returns a register number of 0.
+    std::pair<unsigned, const TargetRegisterClass*>
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   EVT VT) const;
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    virtual bool isLegalICmpImmediate(int64_t Imm) const;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    virtual bool isLegalAddImmediate(int64_t Imm) const;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
+    /// register EAX to i16 by referencing its sub-register AX.
+    virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
+    virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+
+    /// isZExtFree - Return true if any actual instruction that defines a
+    /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in
+    /// unknown ways, such as incoming arguments, or copies from unknown
+    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicit zero-extend the
+    /// result out to 64 bits.
+    virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
+    virtual bool isZExtFree(EVT VT1, EVT VT2) const;
+    virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+
+    /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+    /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+    /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+    /// is expanded to mul + add.
+    virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+
+    /// isNarrowingProfitable - Return true if it's profitable to narrow
+    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+    /// from i32 to i8 but not from i32 to i16.
+    virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+    /// isShuffleMaskLegal - Targets can use this to indicate that they only
+    /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+    /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+    /// values are assumed to be legal.
+    virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                                    EVT VT) const;
+
+    /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. This is
+    /// used by Targets can use this to indicate if there is a suitable
+    /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
+    /// pool entry.
+    virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+                                        EVT VT) const;
+
+    /// ShouldShrinkFPConstant - If true, then instruction selection should
+    /// seek to shrink the FP constant of the specified type to a smaller type
+    /// in order to save space and / or reduce runtime.
+    virtual bool ShouldShrinkFPConstant(EVT VT) const {
+      // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
+      // expensive than a straight movsd. On the other hand, it's important to
+      // shrink long double fp constant since fldt is very slow.
+      return !X86ScalarSSEf64 || VT == MVT::f80;
+    }
+
+    const X86Subtarget* getSubtarget() const {
+      return Subtarget;
+    }
+
+    /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+    /// computed in an SSE register, not on the X87 floating point stack.
+    bool isScalarFPTypeInSSEReg(EVT VT) const {
+      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+      (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
+    }
+
+    /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine
+    /// for fptoui.
+    bool isTargetFTOL() const {
+      return Subtarget->isTargetWindows() && !Subtarget->is64Bit();
+    }
+
+    /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be
+    /// used for fptoui to the given type.
+    bool isIntegerTypeFTOL(EVT VT) const {
+      return isTargetFTOL() && VT == MVT::i64;
+    }
+
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                                     const TargetLibraryInfo *libInfo) const;
+
+    /// getStackCookieLocation - Return true if the target stores stack
+    /// protector cookies at a fixed offset in some non-standard address
+    /// space, and populates the address space and offset as
+    /// appropriate.
+    virtual bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const;
+
+    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+                      SelectionDAG &DAG) const;
+
+  protected:
+    std::pair<const TargetRegisterClass*, uint8_t>
+    findRepresentativeClass(MVT VT) const;
+
+  private:
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+    const X86RegisterInfo *RegInfo;
+    const DataLayout *TD;
+
+    /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+    /// floating point ops.
+    /// When SSE is available, use it for f32 operations.
+    /// When SSE2 is available, use it for f64 operations.
+    bool X86ScalarSSEf32;
+    bool X86ScalarSSEf64;
+
+    /// LegalFPImmediates - A list of legal fp immediates.
+    std::vector<APFloat> LegalFPImmediates;
+
+    /// addLegalFPImmediate - Indicate that this x86 target can instruction
+    /// select the specified FP immediate natively.
+    void addLegalFPImmediate(const APFloat& Imm) {
+      LegalFPImmediates.push_back(Imm);
+    }
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerMemArgument(SDValue Chain,
+                             CallingConv::ID CallConv,
+                             const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,  MachineFrameInfo *MFI,
+                              unsigned i) const;
+    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,
+                             ISD::ArgFlagsTy Flags) const;
+
+    // Call lowering helpers.
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool IsEligibleForTailCallOptimization(SDValue Callee,
+                                           CallingConv::ID CalleeCC,
+                                           bool isVarArg,
+                                           bool isCalleeStructRet,
+                                           bool isCallerStructRet,
+                                           Type *RetTy,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                           SelectionDAG& DAG) const;
+    bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const;
+    SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+                                SDValue Chain, bool IsTailCall, bool Is64Bit,
+                                int FPDiff, DebugLoc dl) const;
+
+    unsigned GetAlignedArgumentStackSize(unsigned StackSize,
+                                         SelectionDAG &DAG) const;
+
+    std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                               bool isSigned,
+                                               bool isReplace) const;
+
+    SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                   SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+                               int64_t Offset, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
+    SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerToBT(SDValue And, ISD::CondCode CC,
+                      DebugLoc dl, SelectionDAG &DAG) const;
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+
+    // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR
+    SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
+    SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
+    SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+    virtual SDValue
+      LowerCall(CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl, SelectionDAG &DAG) const;
+
+    virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
+
+    virtual bool mayBeEmittedAsTailCall(CallInst *CI) const;
+
+    virtual MVT
+    getTypeForExtArgOrReturn(MVT VT, ISD::NodeType ExtendKind) const;
+
+    virtual bool
+    CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                   bool isVarArg,
+                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                   LLVMContext &Context) const;
+
+    /// Utility function to emit atomic-load-arith operations (and, or, xor,
+    /// nand, max, min, umax, umin). It takes the corresponding instruction to
+    /// expand, the associated machine basic block, and the associated X86
+    /// opcodes for reg/reg.
+    MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) const;
+
+    /// Utility function to emit atomic-load-arith operations (and, or, xor,
+    /// nand, add, sub, swap) for 64-bit operands on 32-bit target.
+    MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
+                                               MachineBasicBlock *MBB) const;
+
+    // Utility function to emit the low-level va_arg code for X86-64.
+    MachineBasicBlock *EmitVAARG64WithCustomInserter(
+                       MachineInstr *MI,
+                       MachineBasicBlock *MBB) const;
+
+    /// Utility function to emit the xmm reg save portion of va_start.
+    MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
+                                                   MachineInstr *BInstr,
+                                                   MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
+                                         MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
+                                              MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
+                                            MachineBasicBlock *BB,
+                                            bool Is64Bit) const;
+
+    MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI,
+                                          MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+                                        MachineBasicBlock *MBB) const;
+
+    MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+                                         MachineBasicBlock *MBB) const;
+
+    /// Emit nodes that will be selected as "test Op0,Op0", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;
+
+    /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+                    SelectionDAG &DAG) const;
+
+    /// Convert a comparison if required by the subtarget.
+    SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
+  };
+
+  namespace X86 {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  }
+}
+
+#endif    // X86ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
new file mode 100644
index 000000000000..ba1aede3c1a0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -0,0 +1,103 @@
+//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the 3DNow! instruction set, which extends MMX to support
+// floating point and also adds a few more random instructions for good measure.
+//
+//===----------------------------------------------------------------------===//
+
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
+      : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
+}
+
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+  let Constraints = "$src1 = $dst";
+}
+
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+      : I3DNow<o, F, (outs VR64:$dst), ins,
+          !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+        Has3DNow0F0FOpcode {
+  // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
+  let isAsmParserOnly = 1;
+}
+
+multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
+}
+
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+  def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
+        (bitconvert (load_mmx addr:$src2))))]>;
+}
+
+multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>;
+}
+
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+  def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+  def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
+    [(set VR64:$dst, (!cast<Intrinsic>(
+      !strconcat("int_x86_3dnow", Ver, "_", Mn))
+        (bitconvert (load_mmx addr:$src))))]>;
+}
+
+defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PF2ID    : I3DNow_conv_rm_int<0x1D, "pf2id">;
+defm PFACC    : I3DNow_binop_rm_int<0xAE, "pfacc">;
+defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd">;
+defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFCMPGE  : I3DNow_binop_rm_int<0x90, "pfcmpge">;
+defm PFCMPGT  : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
+defm PFMAX    : I3DNow_binop_rm_int<0xA4, "pfmax">;
+defm PFMIN    : I3DNow_binop_rm_int<0x94, "pfmin">;
+defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFRCP    : I3DNow_conv_rm_int<0x96, "pfrcp">;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
+defm PFRSQRT  : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
+defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub">;
+defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PI2FD    : I3DNow_conv_rm_int<0x0D, "pi2fd">;
+defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+
+
+def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
+                   [(int_x86_mmx_femms)]>;
+
+def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
+                      "prefetch\t$addr",
+                      [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>;
+
+def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
+                  [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB,
+                Requires<[HasPrefetchW]>;
+
+// "3DNowA" instructions
+defm PF2IW    : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
+defm PI2FW    : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
+defm PFNACC   : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
+defm PFPNACC  : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PSWAPD   : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
new file mode 100644
index 000000000000..225e9720da0c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -0,0 +1,1361 @@
+//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the integer arithmetic instructions in the X86
+// architecture.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// LEA - Load Effective Address
+let SchedRW = [WriteLEA] in {
+let neverHasSideEffects = 1 in
+def LEA16r   : I<0x8D, MRMSrcMem,
+                 (outs GR16:$dst), (ins i32mem:$src),
+                 "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize;
+let isReMaterializable = 1 in
+def LEA32r   : I<0x8D, MRMSrcMem,
+                 (outs GR32:$dst), (ins i32mem:$src),
+                 "lea{l}\t{$src|$dst}, {$dst|$src}",
+                 [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+                 Requires<[In32BitMode]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+                  (outs GR32:$dst), (ins lea64_32mem:$src),
+                  "lea{l}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
+                  Requires<[In64BitMode]>;
+
+let isReMaterializable = 1 in
+def LEA64r   : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+                  "lea{q}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Fixed-Register Multiplication and Division Instructions.
+//
+
+// SchedModel info for instruction that loads one value and gets the second
+// (and possibly third) value from a register.
+// This is used for instructions that put the memory operands before other
+// uses.
+class SchedLoadReg<SchedWrite SW> : Sched<[SW,
+  // Memory operand.
+  ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+  // Register reads (implicit or explicit).
+  ReadAfterLd, ReadAfterLd]>;
+
+// Extra precision multiplication
+
+// AL is really implied by AX, but the registers in Defs must match the
+// SDNode results (i8, i32).
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8r  : I<0xF6, MRM4r, (outs),  (ins GR8:$src), "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, GR8:$src)),
+                (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
+def MUL16r : I<0xF7, MRM4r, (outs),  (ins GR16:$src),
+               "mul{w}\t$src",
+               [], IIC_MUL16_REG>, OpSize, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
+def MUL32r : I<0xF7, MRM4r, (outs),  (ins GR32:$src),
+               "mul{l}\t$src",
+               [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/],
+               IIC_MUL32_REG>, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+                "mul{q}\t$src",
+                [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/],
+                IIC_MUL64>, Sched<[WriteIMul]>;
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def MUL8m  : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+               "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, (loadi8 addr:$src))),
+                (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let mayLoad = 1, neverHasSideEffects = 1 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+               "mul{w}\t$src",
+               [], IIC_MUL16_MEM>, OpSize, SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+              "mul{l}\t$src",
+              [], IIC_MUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+                "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
+}
+
+let neverHasSideEffects = 1 in {
+// AL,AH = AL*GR8
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8r  : I<0xF6, MRM5r, (outs),  (ins GR8:$src), "imul{b}\t$src", [],
+              IIC_IMUL8>, Sched<[WriteIMul]>;
+// AX,DX = AX*GR16
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs),  (ins GR16:$src), "imul{w}\t$src", [],
+              IIC_IMUL16_RR>, OpSize, Sched<[WriteIMul]>;
+// EAX,EDX = EAX*GR32
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs),  (ins GR32:$src), "imul{l}\t$src", [],
+              IIC_IMUL32_RR>, Sched<[WriteIMul]>;
+// RAX,RDX = RAX*GR64
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", [],
+              IIC_IMUL64_RR>, Sched<[WriteIMul]>;
+
+let mayLoad = 1 in {
+// AL,AH = AL*[mem8]
+let Defs = [AL,EFLAGS,AX], Uses = [AL] in
+def IMUL8m  : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+                "imul{b}\t$src", [], IIC_IMUL8>, SchedLoadReg<WriteIMulLd>;
+// AX,DX = AX*[mem16]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+                "imul{w}\t$src", [], IIC_IMUL16_MEM>, OpSize,
+              SchedLoadReg<WriteIMulLd>;
+// EAX,EDX = EAX*[mem32]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+                "imul{l}\t$src", [], IIC_IMUL32_MEM>, SchedLoadReg<WriteIMulLd>;
+// RAX,RDX = RAX*[mem64]
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+                 "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
+}
+} // neverHasSideEffects
+
+
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst" in {
+
+let isCommutable = 1, SchedRW = [WriteIMul] in {
+// X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, EFLAGS,
+                       (X86smul_flag GR16:$src1, GR16:$src2))], IIC_IMUL16_RR>,
+                       TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, EFLAGS,
+                       (X86smul_flag GR32:$src1, GR32:$src2))], IIC_IMUL32_RR>,
+                 TB;
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+                                   (ins GR64:$src1, GR64:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, EFLAGS,
+                        (X86smul_flag GR64:$src1, GR64:$src2))], IIC_IMUL64_RR>,
+                 TB;
+} // isCommutable, SchedRW
+
+// Register-Memory Signed Integer Multiply
+let SchedRW = [WriteIMulLd, ReadAfterLd] in {
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+                                  (ins GR16:$src1, i16mem:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, EFLAGS,
+                       (X86smul_flag GR16:$src1, (load addr:$src2)))],
+                       IIC_IMUL16_RM>,
+               TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+                 (ins GR32:$src1, i32mem:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, EFLAGS,
+                       (X86smul_flag GR32:$src1, (load addr:$src2)))],
+                       IIC_IMUL32_RM>,
+               TB;
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+                                   (ins GR64:$src1, i64mem:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, EFLAGS,
+                        (X86smul_flag GR64:$src1, (load addr:$src2)))],
+                        IIC_IMUL64_RM>,
+               TB;
+} // SchedRW
+} // Constraints = "$src1 = $dst"
+
+} // Defs = [EFLAGS]
+
+// Surprisingly enough, these are not two address instructions!
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteIMul] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
+                      (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, EFLAGS,
+                            (X86smul_flag GR16:$src1, imm:$src2))],
+                            IIC_IMUL16_RRI>, OpSize;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg,                       // GR16 = GR16*I8
+                     (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, EFLAGS,
+                           (X86smul_flag GR16:$src1, i16immSExt8:$src2))],
+                           IIC_IMUL16_RRI>,
+                 OpSize;
+def IMUL32rri  : Ii32<0x69, MRMSrcReg,                      // GR32 = GR32*I32
+                      (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, EFLAGS,
+                            (X86smul_flag GR32:$src1, imm:$src2))],
+                            IIC_IMUL32_RRI>;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg,                       // GR32 = GR32*I8
+                     (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, EFLAGS,
+                           (X86smul_flag GR32:$src1, i32immSExt8:$src2))],
+                           IIC_IMUL32_RRI>;
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg,                    // GR64 = GR64*I32
+                        (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+                        "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set GR64:$dst, EFLAGS,
+                             (X86smul_flag GR64:$src1, i64immSExt32:$src2))],
+                             IIC_IMUL64_RRI>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg,                      // GR64 = GR64*I8
+                      (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, EFLAGS,
+                            (X86smul_flag GR64:$src1, i64immSExt8:$src2))],
+                            IIC_IMUL64_RRI>;
+} // SchedRW
+
+// Memory-Integer Signed Integer Multiply
+let SchedRW = [WriteIMulLd] in {
+def IMUL16rmi  : Ii16<0x69, MRMSrcMem,                     // GR16 = [mem16]*I16
+                      (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1), imm:$src2))],
+                            IIC_IMUL16_RMI>,
+                 OpSize;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR16 = [mem16]*I8
+                     (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, EFLAGS,
+                           (X86smul_flag (load addr:$src1),
+                                         i16immSExt8:$src2))], IIC_IMUL16_RMI>,
+                                         OpSize;
+def IMUL32rmi  : Ii32<0x69, MRMSrcMem,                     // GR32 = [mem32]*I32
+                      (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1), imm:$src2))],
+                            IIC_IMUL32_RMI>;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR32 = [mem32]*I8
+                     (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, EFLAGS,
+                           (X86smul_flag (load addr:$src1),
+                                         i32immSExt8:$src2))],
+                                         IIC_IMUL32_RMI>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem,                   // GR64 = [mem64]*I32
+                        (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+                        "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                        [(set GR64:$dst, EFLAGS,
+                              (X86smul_flag (load addr:$src1),
+                                            i64immSExt32:$src2))],
+                                            IIC_IMUL64_RMI>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
+                      (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, EFLAGS,
+                            (X86smul_flag (load addr:$src1),
+                                          i64immSExt8:$src2))],
+                                          IIC_IMUL64_RMI>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+
+
+// unsigned division/remainder
+let hasSideEffects = 1 in { // so that we don't speculatively execute
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def DIV8r  : I<0xF6, MRM6r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
+               "div{b}\t$src", [], IIC_DIV8_REG>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs),  (ins GR16:$src),   // DX:AX/r16 = AX,DX
+               "div{w}\t$src", [], IIC_DIV16>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "div{l}\t$src", [], IIC_DIV32>;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+                "div{q}\t$src", [], IIC_DIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def DIV8m  : I<0xF6, MRM6m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
+               "div{b}\t$src", [], IIC_DIV8_MEM>,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
+               "div{w}\t$src", [], IIC_DIV16>, OpSize,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+               "div{l}\t$src", [], IIC_DIV32>,
+             SchedLoadReg<WriteIDivLd>;
+// RDX:RAX/[mem64] = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+                "div{q}\t$src", [], IIC_DIV64>,
+             SchedLoadReg<WriteIDivLd>;
+}
+
+// Signed division/remainder.
+let SchedRW = [WriteIDiv] in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
+               "idiv{b}\t$src", [], IIC_IDIV8>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs),  (ins GR16:$src),   // DX:AX/r16 = AX,DX
+               "idiv{w}\t$src", [], IIC_IDIV16>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "idiv{l}\t$src", [], IIC_IDIV32>;
+// RDX:RAX/r64 = RAX,RDX
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+                "idiv{q}\t$src", [], IIC_IDIV64>;
+} // SchedRW
+
+let mayLoad = 1 in {
+let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
+               "idiv{b}\t$src", [], IIC_IDIV8>,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
+               "idiv{w}\t$src", [], IIC_IDIV16>, OpSize,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in    // EDX:EAX/[mem32] = EAX,EDX
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+               "idiv{l}\t$src", [], IIC_IDIV32>,
+             SchedLoadReg<WriteIDivLd>;
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+                "idiv{q}\t$src", [], IIC_IDIV64>,
+             SchedLoadReg<WriteIDivLd>;
+}
+} // hasSideEffects = 0
+
+//===----------------------------------------------------------------------===//
+//  Two address Instructions.
+//
+
+// unary instructions
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+def NEG8r  : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "neg{b}\t$dst",
+               [(set GR8:$dst, (ineg GR8:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+               "neg{w}\t$dst",
+               [(set GR16:$dst, (ineg GR16:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+               "neg{l}\t$dst",
+               [(set GR32:$dst, (ineg GR32:$src1)),
+                (implicit EFLAGS)], IIC_UNARY_REG>;
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst",
+                [(set GR64:$dst, (ineg GR64:$src1)),
+                 (implicit EFLAGS)], IIC_UNARY_REG>;
+} // Constraints = "$src1 = $dst", SchedRW
+
+// Read-modify-write negate.
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NEG8m  : I<0xF6, MRM3m, (outs), (ins i8mem :$dst),
+               "neg{b}\t$dst",
+               [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst),
+               "neg{w}\t$dst",
+               [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize;
+def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
+               "neg{l}\t$dst",
+               [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+                [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+                 (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // SchedRW
+} // Defs = [EFLAGS]
+
+
+// Note: NOT does not set EFLAGS!
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+// Match xor -1 to not. Favors these over a move imm + xor to save code size.
+let AddedComplexity = 15 in {
+def NOT8r  : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "not{b}\t$dst",
+               [(set GR8:$dst, (not GR8:$src1))], IIC_UNARY_REG>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+               "not{w}\t$dst",
+               [(set GR16:$dst, (not GR16:$src1))], IIC_UNARY_REG>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+               "not{l}\t$dst",
+               [(set GR32:$dst, (not GR32:$src1))], IIC_UNARY_REG>;
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst",
+                [(set GR64:$dst, (not GR64:$src1))], IIC_UNARY_REG>;
+}
+} // Constraints = "$src1 = $dst", SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+def NOT8m  : I<0xF6, MRM2m, (outs), (ins i8mem :$dst),
+               "not{b}\t$dst",
+               [(store (not (loadi8 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst),
+               "not{w}\t$dst",
+               [(store (not (loadi16 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+               OpSize;
+def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
+               "not{l}\t$dst",
+               [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+                [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+} // SchedRW
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let Defs = [EFLAGS] in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def INC8r  : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "inc{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))],
+               IIC_UNARY_REG>;
+
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+               "inc{w}\t$dst",
+               [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>,
+             OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+               "inc{l}\t$dst",
+               [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+               IIC_UNARY_REG>,
+             Requires<[In32BitMode]>;
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
+                [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))],
+                IIC_UNARY_REG>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 1
+
+
+// In 64-bit mode, single byte INC and DEC cannot be encoded.
+let isConvertibleToThreeAddress = 1, CodeSize = 2 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                  "inc{w}\t$dst",
+                  [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
+                  IIC_UNARY_REG>,
+                OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                  "inc{l}\t$dst",
+                  [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
+                  IIC_UNARY_REG>,
+                Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                  "dec{w}\t$dst",
+                  [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+                  IIC_UNARY_REG>,
+                OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                  "dec{l}\t$dst",
+                  [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+                  IIC_UNARY_REG>,
+                Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress = 1, CodeSize = 2
+
+} // Constraints = "$src1 = $dst", SchedRW
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+  def INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+               [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+  def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+               [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>,
+               OpSize, Requires<[In32BitMode]>;
+  def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+               [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>,
+               Requires<[In32BitMode]>;
+  def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+                  [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>;
+
+// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
+// how to unfold them.
+// FIXME: What is this for??
+def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+                  [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+                    (implicit EFLAGS)], IIC_UNARY_MEM>,
+                OpSize, Requires<[In64BitMode]>;
+def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+                  [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+                    (implicit EFLAGS)], IIC_UNARY_MEM>,
+                Requires<[In64BitMode]>;
+def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+                  [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+                    (implicit EFLAGS)], IIC_UNARY_MEM>,
+                OpSize, Requires<[In64BitMode]>;
+def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+                  [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+                    (implicit EFLAGS)], IIC_UNARY_MEM>,
+                Requires<[In64BitMode]>;
+} // CodeSize = 2, SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
+let CodeSize = 2 in
+def DEC8r  : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+               "dec{b}\t$dst",
+               [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
+               IIC_UNARY_REG>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {   // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+               "dec{w}\t$dst",
+               [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
+               IIC_UNARY_REG>,
+             OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+               "dec{l}\t$dst",
+               [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
+               IIC_UNARY_REG>,
+             Requires<[In32BitMode]>;
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst",
+                [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))],
+                IIC_UNARY_REG>;
+} // CodeSize = 2
+} // Constraints = "$src1 = $dst", SchedRW
+
+
+let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+  def DEC8m  : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+               [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>;
+  def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+               [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>,
+               OpSize, Requires<[In32BitMode]>;
+  def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+               [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)], IIC_UNARY_MEM>,
+               Requires<[In32BitMode]>;
+  def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+                  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+                   (implicit EFLAGS)], IIC_UNARY_MEM>;
+} // CodeSize = 2, SchedRW
+} // Defs = [EFLAGS]
+
+
+/// X86TypeInfo - This is a bunch of information that describes relevant X86
+/// information about value types.  For example, it can tell you what the
+/// register class and preferred load to use.
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
+                  PatFrag loadnode, X86MemOperand memoperand, ImmType immkind,
+                  Operand immoperand, SDPatternOperator immoperator,
+                  Operand imm8operand, SDPatternOperator imm8operator,
+                  bit hasOddOpcode, bit hasOpSizePrefix, bit hasREX_WPrefix> {
+  /// VT - This is the value type itself.
+  ValueType VT = vt;
+
+  /// InstrSuffix - This is the suffix used on instructions with this type.  For
+  /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q".
+  string InstrSuffix = instrsuffix;
+
+  /// RegClass - This is the register class associated with this type.  For
+  /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64.
+  RegisterClass RegClass = regclass;
+
+  /// LoadNode - This is the load node associated with this type.  For
+  /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64.
+  PatFrag LoadNode = loadnode;
+
+  /// MemOperand - This is the memory operand associated with this type.  For
+  /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem.
+  X86MemOperand MemOperand = memoperand;
+
+  /// ImmEncoding - This is the encoding of an immediate of this type.  For
+  /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32.  Note that i64 -> Imm32
+  /// since the immediate fields of i64 instructions is a 32-bit sign extended
+  /// value.
+  ImmType ImmEncoding = immkind;
+
+  /// ImmOperand - This is the operand kind of an immediate of this type.  For
+  /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm.  Note that i64 ->
+  /// i64i32imm since the immediate fields of i64 instructions is a 32-bit sign
+  /// extended value.
+  Operand ImmOperand = immoperand;
+
+  /// ImmOperator - This is the operator that should be used to match an
+  /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32).
+  SDPatternOperator ImmOperator = immoperator;
+
+  /// Imm8Operand - This is the operand kind to use for an imm8 of this type.
+  /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm.  This is
+  /// only used for instructions that have a sign-extended imm8 field form.
+  Operand Imm8Operand = imm8operand;
+
+  /// Imm8Operator - This is the operator that should be used to match an 8-bit
+  /// sign extended immediate of this kind in a pattern (e.g. imm16immSExt8).
+  SDPatternOperator Imm8Operator = imm8operator;
+
+  /// HasOddOpcode - This bit is true if the instruction should have an odd (as
+  /// opposed to even) opcode.  Operations on i8 are usually even, operations on
+  /// other datatypes are odd.
+  bit HasOddOpcode = hasOddOpcode;
+
+  /// HasOpSizePrefix - This bit is set to true if the instruction should have
+  /// the 0x66 operand size prefix.  This is set for i16 types.
+  bit HasOpSizePrefix = hasOpSizePrefix;
+
+  /// HasREX_WPrefix - This bit is set to true if the instruction should have
+  /// the 0x40 REX prefix.  This is set for i64 types.
+  bit HasREX_WPrefix = hasREX_WPrefix;
+}
+
+def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
+
+
+def Xi8  : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem ,
+                       Imm8 , i8imm ,    imm,          i8imm   , invalid_node,
+                       0, 0, 0>;
+def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
+                       Imm16, i16imm,    imm,          i16i8imm, i16immSExt8,
+                       1, 1, 0>;
+def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
+                       Imm32, i32imm,    imm,          i32i8imm, i32immSExt8,
+                       1, 0, 0>;
+def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
+                       Imm32, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8,
+                       1, 0, 1>;
+
+/// ITy - This instruction base class takes the type info for the instruction.
+/// Using this, it:
+/// 1. Concatenates together the instruction mnemonic with the appropriate
+///    suffix letter, a tab, and the arguments.
+/// 2. Infers whether the instruction should have a 0x66 prefix byte.
+/// 3. Infers whether the instruction should have a 0x40 REX_W prefix.
+/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
+///    or 1 (for i16,i32,i64 operations).
+class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
+          string mnemonic, string args, list<dag> pattern,
+          InstrItinClass itin = IIC_BIN_NONMEM>
+  : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
+       opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
+      f, outs, ins,
+      !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
+      itin> {
+
+  // Infer instruction prefixes from type info.
+  let hasOpSizePrefix = typeinfo.HasOpSizePrefix;
+  let hasREX_WPrefix  = typeinfo.HasREX_WPrefix;
+}
+
+// BinOpRR - Instructions like "add reg, reg, reg".
+class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              dag outlist, list<dag> pattern, InstrItinClass itin,
+              Format f = MRMDestReg>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
+    Sched<[WriteALU]>;
+
+// BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has
+// just a regclass (no eflags) as a result.
+class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+                  IIC_BIN_NONMEM>;
+
+// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has
+// just a EFLAGS as a result.
+class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f = MRMDestReg>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs),
+            [(set EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+            IIC_BIN_NONMEM, f>;
+
+// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result.
+class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
+                  IIC_BIN_NONMEM>;
+
+// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has
+// both a regclass and EFLAGS as a result, and has EFLAGS as input.
+class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                  (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
+                          EFLAGS))], IIC_BIN_NONMEM>;
+
+// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : ITy<opcode, MRMSrcReg, typeinfo,
+        (outs typeinfo.RegClass:$dst),
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  // The disassembler should know about this, but not the asmparser.
+  let isCodeGenOnly = 1;
+  let hasSideEffects = 0;
+}
+
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+        (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  // The disassembler should know about this, but not the asmparser.
+  let isCodeGenOnly = 1;
+  let hasSideEffects = 0;
+}
+
+// BinOpRM - Instructions like "add reg, reg, [mem]".
+class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              dag outlist, list<dag> pattern>
+  : ITy<opcode, MRMSrcMem, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+    Sched<[WriteALULd, ReadAfterLd]>;
+
+// BinOpRM_R - Instructions like "add reg, reg, [mem]".
+class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_F - Instructions like "cmp reg, [mem]".
+class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              SDPatternOperator opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs),
+            [(set EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RF - Instructions like "add reg, reg, [mem]".
+class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
+
+// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]".
+class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode>
+  : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+            (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
+                    EFLAGS))]>;
+
+// BinOpRI - Instructions like "add reg, reg, imm".
+class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              Format f, dag outlist, list<dag> pattern>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpRI_R - Instructions like "add reg, reg, imm".
+class BinOpRI_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_F - Instructions like "cmp reg, imm".
+class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs),
+            [(set EFLAGS,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+
+// BinOpRI_RF - Instructions like "add reg, reg, imm".
+class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
+// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
+class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+            [(set typeinfo.RegClass:$dst, EFLAGS,
+                (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
+                        EFLAGS))]>;
+
+// BinOpRI8 - Instructions like "add reg, reg, imm8".
+class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+               Format f, dag outlist, list<dag> pattern>
+  : ITy<opcode, f, typeinfo, outlist,
+        (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+    Sched<[WriteALU]> {
+  let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpRI8_R - Instructions like "add reg, reg, imm8".
+class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_F - Instructions like "cmp reg, imm8".
+class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs),
+             [(set EFLAGS,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RF - Instructions like "add reg, reg, imm8".
+class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst, EFLAGS,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
+
+// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8".
+class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                   SDNode opnode, Format f>
+  : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
+             [(set typeinfo.RegClass:$dst, EFLAGS,
+               (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
+                       EFLAGS))]>;
+
+// BinOpMR - Instructions like "add [mem], reg".
+class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              list<dag> pattern>
+  : ITy<opcode, MRMDestMem, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+    Sched<[WriteALULd, WriteRMW]>;
+
+// BinOpMR_RMW - Instructions like "add [mem], reg".
+class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+          [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst),
+           (implicit EFLAGS)]>;
+
+// BinOpMR_RMW_FF - Instructions like "adc [mem], reg".
+class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                    SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+          [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
+                  addr:$dst),
+           (implicit EFLAGS)]>;
+
+// BinOpMR_F - Instructions like "cmp [mem], reg".
+class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode>
+  : BinOpMR<opcode, mnemonic, typeinfo,
+            [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
+
+// BinOpMI - Instructions like "add [mem], imm".
+class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
+              Format f, list<dag> pattern, bits<8> opcode = 0x80>
+  : ITy<opcode, f, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+    Sched<[WriteALULd, WriteRMW]> {
+  let ImmT = typeinfo.ImmEncoding;
+}
+
+// BinOpMI_RMW - Instructions like "add [mem], imm".
+class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpMI<mnemonic, typeinfo, f,
+            [(store (opnode (typeinfo.VT (load addr:$dst)),
+                            typeinfo.ImmOperator:$src), addr:$dst),
+             (implicit EFLAGS)]>;
+// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
+class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+                  SDNode opnode, Format f>
+  : BinOpMI<mnemonic, typeinfo, f,
+            [(store (opnode (typeinfo.VT (load addr:$dst)),
+                            typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
+             (implicit EFLAGS)]>;
+
+// BinOpMI_F - Instructions like "cmp [mem], imm".
+class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
+                SDPatternOperator opnode, Format f, bits<8> opcode = 0x80>
+  : BinOpMI<mnemonic, typeinfo, f,
+            [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
+                                               typeinfo.ImmOperator:$src))],
+            opcode>;
+
+// BinOpMI8 - Instructions like "add [mem], imm8".
+class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
+               Format f, list<dag> pattern>
+  : ITy<0x82, f, typeinfo,
+        (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+    Sched<[WriteALULd, WriteRMW]> {
+  let ImmT = Imm8; // Always 8-bit immediate.
+}
+
+// BinOpMI8_RMW - Instructions like "add [mem], imm8".
+class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo,
+                   SDNode opnode, Format f>
+  : BinOpMI8<mnemonic, typeinfo, f,
+             [(store (opnode (load addr:$dst),
+                             typeinfo.Imm8Operator:$src), addr:$dst),
+              (implicit EFLAGS)]>;
+
+// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8".
+class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
+                   SDNode opnode, Format f>
+  : BinOpMI8<mnemonic, typeinfo, f,
+             [(store (opnode (load addr:$dst),
+                             typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
+              (implicit EFLAGS)]>;
+
+// BinOpMI8_F - Instructions like "cmp [mem], imm8".
+class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
+                 SDNode opnode, Format f>
+  : BinOpMI8<mnemonic, typeinfo, f,
+             [(set EFLAGS, (opnode (load addr:$dst),
+                                   typeinfo.Imm8Operator:$src))]>;
+
+// BinOpAI - Instructions like "add %eax, %eax, imm".
+class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+              Register areg, string operands>
+  : ITy<opcode, RawFrm, typeinfo,
+        (outs), (ins typeinfo.ImmOperand:$src),
+        mnemonic, operands, []>, Sched<[WriteALU]> {
+  let ImmT = typeinfo.ImmEncoding;
+  let Uses = [areg];
+  let Defs = [areg];
+  let hasSideEffects = 0;
+}
+
+/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (...".
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+                         string mnemonic, Format RegMRM, Format MemMRM,
+                         SDNode opnodeflag, SDNode opnode,
+                         bit CommutableRR, bit ConvertibleToThreeAddress> {
+  let Defs = [EFLAGS] in {
+    let Constraints = "$src1 = $dst" in {
+      let isCommutable = CommutableRR,
+          isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        def NAME#8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+        def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
+        def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
+        def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+      } // isCommutable
+
+      def NAME#8rr_REV  : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+      def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+      def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+      def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+
+      def NAME#8rm   : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+      def NAME#16rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
+      def NAME#32rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
+      def NAME#64rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+
+      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        // NOTE: These are order specific, we want the ri8 forms to be listed
+        // first so that they are slightly preferred to the ri forms.
+        def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
+        def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
+        def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
+
+        def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+        def NAME#16ri  : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
+        def NAME#32ri  : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
+        def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
+      }
+    } // Constraints = "$src1 = $dst"
+
+    def NAME#8mr    : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
+    def NAME#16mr   : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
+    def NAME#32mr   : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
+    def NAME#64mr   : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+
+    // NOTE: These are order specific, we want the mi8 forms to be listed
+    // first so that they are slightly preferred to the mi forms.
+    def NAME#16mi8  : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi8  : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi8  : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8mi    : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                             "{$src, %al|AL, $src}">;
+    def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                             "{$src, %ax|AX, $src}">;
+    def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                             "{$src, %eax|EAX, $src}">;
+    def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                             "{$src, %rax|RAX, $src}">;
+  }
+}
+
+/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
+/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and
+/// SBB.
+///
+/// It would be nice to get rid of the second and third argument here, but
+/// tblgen can't handle dependent type references aggressively enough: PR8330
+multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+                          string mnemonic, Format RegMRM, Format MemMRM,
+                          SDNode opnode, bit CommutableRR,
+                           bit ConvertibleToThreeAddress> {
+  let Defs = [EFLAGS] in {
+    let Constraints = "$src1 = $dst" in {
+      let isCommutable = CommutableRR,
+          isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        def NAME#8rr  : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
+        def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
+        def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
+        def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+      } // isCommutable
+
+      def NAME#8rr_REV  : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+      def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+      def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+      def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+
+      def NAME#8rm   : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
+      def NAME#16rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
+      def NAME#32rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
+      def NAME#64rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
+
+      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+        // NOTE: These are order specific, we want the ri8 forms to be listed
+        // first so that they are slightly preferred to the ri forms.
+        def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>;
+        def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
+        def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+        def NAME#8ri   : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+        def NAME#16ri  : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
+        def NAME#32ri  : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
+        def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
+      }
+    } // Constraints = "$src1 = $dst"
+
+    def NAME#8mr    : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
+    def NAME#16mr   : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
+    def NAME#32mr   : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
+    def NAME#64mr   : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
+
+    // NOTE: These are order specific, we want the mi8 forms to be listed
+    // first so that they are slightly preferred to the mi forms.
+    def NAME#16mi8  : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi8  : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi8  : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8mi    : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                             "{$src, %al|AL, $src}">;
+    def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                             "{$src, %ax|AX, $src}">;
+    def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                             "{$src, %eax|EAX, $src}">;
+    def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                             "{$src, %rax|RAX, $src}">;
+  }
+}
+
+/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
+/// defined with "(set EFLAGS, (...".  It would be really nice to find a way
+/// to factor this with the other ArithBinOp_*.
+///
+multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
+                        string mnemonic, Format RegMRM, Format MemMRM,
+                        SDNode opnode,
+                        bit CommutableRR, bit ConvertibleToThreeAddress> {
+  let Defs = [EFLAGS] in {
+    let isCommutable = CommutableRR,
+        isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+      def NAME#8rr  : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+      def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
+      def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
+      def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+    } // isCommutable
+
+    def NAME#8rr_REV  : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+    def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
+    def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
+    def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+
+    def NAME#8rm   : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+    def NAME#16rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
+    def NAME#32rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
+    def NAME#64rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+
+    let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
+      // NOTE: These are order specific, we want the ri8 forms to be listed
+      // first so that they are slightly preferred to the ri forms.
+      def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
+      def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
+      def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+      def NAME#8ri   : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+      def NAME#16ri  : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
+      def NAME#32ri  : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
+      def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
+    }
+
+    def NAME#8mr    : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+    def NAME#16mr   : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
+    def NAME#32mr   : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
+    def NAME#64mr   : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+
+    // NOTE: These are order specific, we want the mi8 forms to be listed
+    // first so that they are slightly preferred to the mi forms.
+    def NAME#16mi8  : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi8  : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi8  : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8mi    : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>;
+    def NAME#16mi   : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
+    def NAME#32mi   : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
+    def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
+
+    def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                             "{$src, %al|AL, $src}">;
+    def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                             "{$src, %ax|AX, $src}">;
+    def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                             "{$src, %eax|EAX, $src}">;
+    def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                             "{$src, %rax|RAX, $src}">;
+  }
+}
+
+
+defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m,
+                         X86and_flag, and, 1, 0>;
+defm OR  : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m,
+                         X86or_flag, or, 1, 0>;
+defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m,
+                         X86xor_flag, xor, 1, 0>;
+defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m,
+                         X86add_flag, add, 1, 1>;
+let isCompare = 1 in {
+defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
+                         X86sub_flag, sub, 0, 0>;
+}
+
+// Arithmetic.
+let Uses = [EFLAGS] in {
+  defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+                            1, 0>;
+  defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+                            0, 0>;
+}
+
+let isCompare = 1 in {
+defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Semantically, test instructions are similar like AND, except they don't
+// generate a result.  From an encoding perspective, they are very different:
+// they don't have all the usual imm8 and REV forms, and are encoded into a
+// different space.
+def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
+                         (X86cmp (and_su node:$lhs, node:$rhs), 0)>;
+
+let isCompare = 1, Defs = [EFLAGS] in {
+  let isCommutable = 1 in {
+    def TEST8rr  : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
+    def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
+    def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>;
+    def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>;
+  } // isCommutable
+
+  def TEST8rm    : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
+  def TEST16rm   : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
+  def TEST32rm   : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
+  def TEST64rm   : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+
+  def TEST8ri    : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+  def TEST16ri   : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+  def TEST32ri   : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+  def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+  def TEST8mi    : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
+  def TEST16mi   : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
+  def TEST32mi   : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
+  def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+
+  def TEST8i8    : BinOpAI<0xA8, "test", Xi8 , AL,
+                           "{$src, %al|AL, $src}">;
+  def TEST16i16  : BinOpAI<0xA8, "test", Xi16, AX,
+                           "{$src, %ax|AX, $src}">;
+  def TEST32i32  : BinOpAI<0xA8, "test", Xi32, EAX,
+                           "{$src, %eax|EAX, $src}">;
+  def TEST64i32  : BinOpAI<0xA8, "test", Xi64, RAX,
+                           "{$src, %rax|RAX, $src}">;
+
+  // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+  // register class is constrained to GR8_NOREX.
+  let isPseudo = 1 in
+  def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+                        "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+}
+
+//===----------------------------------------------------------------------===//
+// ANDN Instruction
+//
+multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+                    PatFrag ld_frag> {
+  def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
+            IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+  def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+            !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+            [(set RC:$dst, EFLAGS,
+             (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>,
+           Sched<[WriteALULd, ReadAfterLd]>;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8, VEX_4V;
+  defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W;
+}
+
+let Predicates = [HasBMI] in {
+  def : Pat<(and (not GR32:$src1), GR32:$src2),
+            (ANDN32rr GR32:$src1, GR32:$src2)>;
+  def : Pat<(and (not GR64:$src1), GR64:$src2),
+            (ANDN64rr GR64:$src1, GR64:$src2)>;
+  def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
+            (ANDN32rm GR32:$src1, addr:$src2)>;
+  def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
+            (ANDN64rm GR64:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// MULX Instruction
+//
+multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in {
+  let isCommutable = 1 in
+  def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
+             !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul]>;
+
+  let mayLoad = 1 in
+  def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
+             !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+  let Uses = [EDX] in
+    defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem>;
+  let Uses = [RDX] in
+    defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// ADCX Instruction
+//
+let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+  let SchedRW = [WriteALU] in {
+  def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+             "adcx{l}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_NONMEM>, T8, OpSize;
+
+  def ADCX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+             "adcx{q}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_NONMEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
+  } // SchedRW
+
+  let mayLoad = 1, SchedRW = [WriteALULd] in {
+  def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+             "adcx{l}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_MEM>, T8, OpSize;
+ 
+  def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+             "adcx{q}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// ADOX Instruction
+//
+let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+  let SchedRW = [WriteALU] in {
+  def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+             "adox{l}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_NONMEM>, T8XS;
+
+  def ADOX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+             "adox{q}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_NONMEM>, T8XS, REX_W, Requires<[In64BitMode]>;
+  } // SchedRW
+
+  let mayLoad = 1, SchedRW = [WriteALULd] in {
+  def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+             "adox{l}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_MEM>, T8XS;
+ 
+  def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+             "adox{q}\t{$src, $dst|$dst, $src}",
+             [], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>;
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 000000000000..aaef4a466ded
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,183 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call.  X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense.  Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP and Disp being offsetted accordingly.  The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;
+    int FrameIndex;
+  } Base;
+
+  unsigned Scale;
+  unsigned IndexReg;
+  int Disp;
+  const GlobalValue *GV;
+  unsigned GVOpFlags;
+
+  X86AddressMode()
+    : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) {
+    Base.Reg = 0;
+  }
+
+
+  void getFullAddress(SmallVectorImpl<MachineOperand> &MO) {
+    assert(Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8);
+
+    if (BaseType == X86AddressMode::RegBase)
+      MO.push_back(MachineOperand::CreateReg(Base.Reg, false, false,
+                                             false, false, false, 0, false));
+    else {
+      assert(BaseType == X86AddressMode::FrameIndexBase);
+      MO.push_back(MachineOperand::CreateFI(Base.FrameIndex));
+    }
+
+    MO.push_back(MachineOperand::CreateImm(Scale));
+    MO.push_back(MachineOperand::CreateReg(IndexReg, false, false,
+                                           false, false, false, 0, false));
+
+    if (GV)
+      MO.push_back(MachineOperand::CreateGA(GV, Disp, GVOpFlags));
+    else
+      MO.push_back(MachineOperand::CreateImm(Disp));
+
+    MO.push_back(MachineOperand::CreateReg(0, false, false,
+                                           false, false, false, 0, false));
+  }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+  // Because memory references are always represented with five
+  // values, this adds: Reg, 1, NoReg, 0, NoReg to the instruction.
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0).addReg(0);
+}
+
+
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+  return MIB.addImm(1).addReg(0).addImm(Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+             unsigned Reg, bool isKill, int Offset) {
+  return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg1, bool isKill1,
+                                            unsigned Reg2, bool isKill2) {
+  return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1)
+    .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0);
+}
+
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+               const X86AddressMode &AM) {
+  assert(AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+  if (AM.BaseType == X86AddressMode::RegBase)
+    MIB.addReg(AM.Base.Reg);
+  else {
+    assert(AM.BaseType == X86AddressMode::FrameIndexBase);
+    MIB.addFrameIndex(AM.Base.FrameIndex);
+  }
+
+  MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+  if (AM.GV)
+    MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+  else
+    MIB.addImm(AM.Disp);
+
+  return MIB.addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+  MachineInstr *MI = MIB;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  const MCInstrDesc &MCID = MI->getDesc();
+  unsigned Flags = 0;
+  if (MCID.mayLoad())
+    Flags |= MachineMemOperand::MOLoad;
+  if (MCID.mayStore())
+    Flags |= MachineMemOperand::MOStore;
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset),
+                            Flags, MFI.getObjectSize(FI),
+                            MFI.getObjectAlignment(FI));
+  return addOffset(MIB.addFrameIndex(FI), Offset)
+            .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool.  The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         unsigned GlobalBaseReg, unsigned char OpFlags) {
+  //FIXME: factor this
+  return MIB.addReg(GlobalBaseReg).addImm(1).addReg(0)
+    .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
new file mode 100644
index 000000000000..a967a4da5cf7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -0,0 +1,111 @@
+//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 conditional move and set on condition
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+
+// SetCC instructions.
+multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
+  let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+      isCommutable = 1, SchedRW = [WriteALU] in {
+    def NAME#16rr
+      : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+          !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR16:$dst,
+                (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
+                IIC_CMOV16_RR>,TB,OpSize;
+    def NAME#32rr
+      : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+          !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR32:$dst,
+                (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
+                IIC_CMOV32_RR>, TB;
+    def NAME#64rr
+      :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+          !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR64:$dst,
+                (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))],
+                IIC_CMOV32_RR>, TB;
+  }
+
+  let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
+      SchedRW = [WriteALULd, ReadAfterLd] in {
+    def NAME#16rm
+      : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+          !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                    CondNode, EFLAGS))], IIC_CMOV16_RM>,
+                                    TB, OpSize;
+    def NAME#32rm
+      : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+          !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                    CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+    def NAME#64rm
+      :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
+          !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
+          [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
+  } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst"
+} // end multiclass
+
+
+// Conditional Moves.
+defm CMOVO  : CMOV<0x40, "cmovo" , X86_COND_O>;
+defm CMOVNO : CMOV<0x41, "cmovno", X86_COND_NO>;
+defm CMOVB  : CMOV<0x42, "cmovb" , X86_COND_B>;
+defm CMOVAE : CMOV<0x43, "cmovae", X86_COND_AE>;
+defm CMOVE  : CMOV<0x44, "cmove" , X86_COND_E>;
+defm CMOVNE : CMOV<0x45, "cmovne", X86_COND_NE>;
+defm CMOVBE : CMOV<0x46, "cmovbe", X86_COND_BE>;
+defm CMOVA  : CMOV<0x47, "cmova" , X86_COND_A>;
+defm CMOVS  : CMOV<0x48, "cmovs" , X86_COND_S>;
+defm CMOVNS : CMOV<0x49, "cmovns", X86_COND_NS>;
+defm CMOVP  : CMOV<0x4A, "cmovp" , X86_COND_P>;
+defm CMOVNP : CMOV<0x4B, "cmovnp", X86_COND_NP>;
+defm CMOVL  : CMOV<0x4C, "cmovl" , X86_COND_L>;
+defm CMOVGE : CMOV<0x4D, "cmovge", X86_COND_GE>;
+defm CMOVLE : CMOV<0x4E, "cmovle", X86_COND_LE>;
+defm CMOVG  : CMOV<0x4F, "cmovg" , X86_COND_G>;
+
+
+// SetCC instructions.
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> {
+  let Uses = [EFLAGS] in {
+    def r    : I<opc, MRM0r,  (outs GR8:$dst), (ins),
+                     !strconcat(Mnemonic, "\t$dst"),
+                     [(set GR8:$dst, (X86setcc OpNode, EFLAGS))],
+                     IIC_SET_R>, TB, Sched<[WriteALU]>;
+    def m    : I<opc, MRM0m,  (outs), (ins i8mem:$dst),
+                     !strconcat(Mnemonic, "\t$dst"),
+                     [(store (X86setcc OpNode, EFLAGS), addr:$dst)],
+                     IIC_SET_M>, TB, Sched<[WriteALU, WriteStore]>;
+  } // Uses = [EFLAGS]
+}
+
+defm SETO  : SETCC<0x90, "seto",  X86_COND_O>;   // is overflow bit set
+defm SETNO : SETCC<0x91, "setno", X86_COND_NO>;  // is overflow bit not set
+defm SETB  : SETCC<0x92, "setb",  X86_COND_B>;   // unsigned less than
+defm SETAE : SETCC<0x93, "setae", X86_COND_AE>;  // unsigned greater or equal
+defm SETE  : SETCC<0x94, "sete",  X86_COND_E>;   // equal to
+defm SETNE : SETCC<0x95, "setne", X86_COND_NE>;  // not equal to
+defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>;  // unsigned less than or equal
+defm SETA  : SETCC<0x97, "seta",  X86_COND_A>;   // unsigned greater than
+defm SETS  : SETCC<0x98, "sets",  X86_COND_S>;   // is signed bit set
+defm SETNS : SETCC<0x99, "setns", X86_COND_NS>;  // is not signed
+defm SETP  : SETCC<0x9A, "setp",  X86_COND_P>;   // is parity bit set
+defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>;  // is parity bit not set
+defm SETL  : SETCC<0x9C, "setl",  X86_COND_L>;   // signed less than
+defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>;  // signed greater or equal
+defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>;  // signed less than or equal
+defm SETG  : SETCC<0x9F, "setg",  X86_COND_G>;   // signed greater than
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
new file mode 100644
index 000000000000..d9ff0c63c55f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -0,0 +1,1839 @@
+//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various pseudo instructions used by the compiler,
+// as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pattern Matching Support
+
+def GetLo32XForm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 32 bits.
+  return getI32Imm((unsigned)N->getZExtValue());
+}]>;
+
+def GetLo8XForm : SDNodeXForm<imm, [{
+  // Transformation function: get the low 8 bits.
+  return getI8Imm((uint8_t)N->getZExtValue());
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Random Pseudo Instructions.
+
+// PIC base construction.  This expands to code that looks like this:
+//     call  $next_inst
+//     popl %destreg"
+let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
+  def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+                      "", []>;
+
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+                           "#ADJCALLSTACKDOWN",
+                           [(X86callseq_start timm:$amt)]>,
+                          Requires<[In32BitMode]>;
+def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKUP",
+                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                          Requires<[In32BitMode]>;
+}
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+                           "#ADJCALLSTACKDOWN",
+                           [(X86callseq_start timm:$amt)]>,
+                          Requires<[In64BitMode]>;
+def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKUP",
+                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                          Requires<[In64BitMode]>;
+}
+
+
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1 in {
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+                              (outs),
+                              (ins GR8:$al,
+                                   i64imm:$regsavefi, i64imm:$offset,
+                                   variable_ops),
+                              "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+                              [(X86vastart_save_xmm_regs GR8:$al,
+                                                         imm:$regsavefi,
+                                                         imm:$offset)]>;
+
+// The VAARG_64 pseudo-instruction takes the address of the va_list,
+// and places the address of the next argument into a register.
+let Defs = [EFLAGS] in
+def VAARG_64 : I<0, Pseudo,
+                 (outs GR64:$dst),
+                 (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
+                 "#VAARG_64 $dst, $ap, $size, $mode, $align",
+                 [(set GR64:$dst,
+                    (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
+                  (implicit EFLAGS)]>;
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets.  These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in correct sequence.
+// The main point of having separate instruction are extra unmodelled effects
+// (compared to ordinary calls) like stack pointer change.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+  def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
+                     "# dynamic stack allocation",
+                     [(X86WinAlloca)]>;
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from 
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR32:$dst,
+                         (X86SegAlloca GR32:$size))]>,
+                    Requires<[In32BitMode]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR64:$dst,
+                         (X86SegAlloca GR64:$size))]>,
+                    Requires<[In64BitMode]>;
+}
+
+// The MSVC runtime contains an _ftol2 routine for converting floating-point
+// to integer values. It has a strange calling convention: the input is
+// popped from the x87 stack, and the return value is given in EDX:EAX. No
+// other registers (aside from flags) are touched.
+// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
+// variant is unnecessary.
+
+let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in {
+  def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
+                      "# win32 fptoui",
+                      [(X86WinFTOL RFP32:$src)]>,
+                    Requires<[In32BitMode]>;
+
+  def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src),
+                      "# win32 fptoui",
+                      [(X86WinFTOL RFP64:$src)]>,
+                    Requires<[In32BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let SchedRW = [WriteSystem] in {
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN   : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+                    "ret\t#eh_return, addr: $addr",
+                    [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64   : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+                     "ret\t#eh_return, addr: $addr",
+                     [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
+
+}
+
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in {
+  def EH_SjLj_SetJmp32  : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
+                            "#EH_SJLJ_SETJMP32",
+                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In32BitMode]>;
+  def EH_SjLj_SetJmp64  : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
+                            "#EH_SJLJ_SETJMP64",
+                            [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+                          Requires<[In64BitMode]>;
+  let isTerminator = 1 in {
+  def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
+                            "#EH_SJLJ_LONGJMP32",
+                            [(X86eh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In32BitMode]>;
+  def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
+                            "#EH_SJLJ_LONGJMP64",
+                            [(X86eh_sjlj_longjmp addr:$buf)]>,
+                          Requires<[In64BitMode]>;
+  }
+}
+} // SchedRW
+
+let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
+  def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
+                        "#EH_SjLj_Setup\t$dst", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions used by segmented stacks.
+//
+
+// This is lowered into a RET instruction by MCInstLower.  We need
+// this so that we don't have to have a MachineBasicBlock which ends
+// with a RET and also has successors.
+let isPseudo = 1 in {
+def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
+                          "", []>;
+
+// This instruction is lowered to a RET followed by a MOV.  The two
+// instructions are not generated on a higher level since then the
+// verifier sees a MachineBasicBlock ending with a non-terminator.
+def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
+                                  "", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+// FIXME: Set encoding to pseudo.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+    isCodeGenOnly = 1 in {
+def MOV8r0   : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "",
+                 [(set GR8:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller
+// encoding and avoids a partial-register update sometimes, but doing so
+// at isel time interferes with rematerialization in the current register
+// allocator. For now, this is rewritten when the instruction is lowered
+// to an MCInst.
+def MOV16r0   : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
+                 "",
+                 [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize,
+                 Sched<[WriteZero]>;
+
+// FIXME: Set encoding to pseudo.
+def MOV32r0  : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+}
+
+// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a
+// smaller encoding, but doing so at isel time interferes with rematerialization
+// in the current register allocator. For now, this is rewritten when the
+// instruction is lowered to an MCInst.
+// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+let Defs = [EFLAGS], isCodeGenOnly=1,
+    AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64r0   : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "",
+                 [(set GR64:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+
+// Materialize i64 constant where top 32-bits are zero. This could theoretically
+// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
+// that would make it more difficult to rematerialize.
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
+    isCodeGenOnly = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
+                        "", [(set GR64:$dst, i64immZExt32:$src)],
+                        IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+
+// Use sbb to materialize carry bit.
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // isCodeGenOnly
+
+
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and
+// will be eliminated and that the sbb can be extended up to a wider type.  When
+// this happens, it is great.  However, if we are left with an 8-bit sbb and an
+// and, we might as well just match it as a setb.
+def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
+          (SETBr)>;
+
+// (add OP, SETB) -> (adc OP, 0)
+def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
+          (ADC64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETB) -> (sbb OP, 0)
+def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETCC_CARRY) -> (adc OP, 0)
+def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC64ri8 GR64:$op, 0)>;
+
+//===----------------------------------------------------------------------===//
+// String Pseudo Instructions
+//
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+                   Requires<[In32BitMode]>;
+def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize,
+                   Requires<[In32BitMode]>;
+def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP,
+                   Requires<[In32BitMode]>;
+}
+
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
+def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                    [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
+                   Requires<[In64BitMode]>;
+def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                    [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize,
+                   Requires<[In64BitMode]>;
+def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                    [(X86rep_movs i32)], IIC_REP_MOVS>, REP,
+                   Requires<[In64BitMode]>;
+def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+                    [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
+                   Requires<[In64BitMode]>;
+}
+
+// FIXME: Should use "(X86rep_stos AL)" as the pattern.
+let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
+  let Uses = [AL,ECX,EDI] in
+  def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+                      [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+                     Requires<[In32BitMode]>;
+  let Uses = [AX,ECX,EDI] in
+  def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize,
+                     Requires<[In32BitMode]>;
+  let Uses = [EAX,ECX,EDI] in
+  def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+                      [(X86rep_stos i32)], IIC_REP_STOS>, REP,
+                     Requires<[In32BitMode]>;
+}
+
+let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
+  let Uses = [AL,RCX,RDI] in
+  def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+                      [(X86rep_stos i8)], IIC_REP_STOS>, REP,
+                     Requires<[In64BitMode]>;
+  let Uses = [AX,RCX,RDI] in
+  def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+                      [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize,
+                     Requires<[In64BitMode]>;
+  let Uses = [RAX,RCX,RDI] in
+  def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+                      [(X86rep_stos i32)], IIC_REP_STOS>, REP,
+                     Requires<[In64BitMode]>;
+ 
+  let Uses = [RAX,RCX,RDI] in
+  def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+                      [(X86rep_stos i64)], IIC_REP_STOS>, REP,
+                     Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// ELF TLS Support
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+    Uses = [ESP] in {
+def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                  "# TLS_addr32",
+                  [(X86tlsaddr tls32addr:$sym)]>,
+                  Requires<[In32BitMode]>;
+def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                  "# TLS_base_addr32",
+                  [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
+                  Requires<[In32BitMode]>;
+}
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+            FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+    Uses = [RSP] in {
+def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                   "# TLS_addr64",
+                  [(X86tlsaddr tls64addr:$sym)]>,
+                  Requires<[In64BitMode]>;
+def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                   "# TLS_base_addr64",
+                  [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
+                  Requires<[In64BitMode]>;
+}
+
+// Darwin TLS Support
+// For i386, the address of the thunk is passed on the stack, on return the
+// address of the variable is in %eax.  %ecx is trashed during the function
+// call.  All other registers are preserved.
+let Defs = [EAX, ECX, EFLAGS],
+    Uses = [ESP],
+    usesCustomInserter = 1 in
+def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
+                "# TLSCall_32",
+                [(X86TLSCall addr:$sym)]>,
+                Requires<[In32BitMode]>;
+
+// For x86_64, the address of the thunk is passed in %rdi, on return
+// the address of the variable is in %rax.  All other registers are preserved.
+let Defs = [RAX, EFLAGS],
+    Uses = [RSP, RDI],
+    usesCustomInserter = 1 in
+def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
+                  "# TLSCall_64",
+                  [(X86TLSCall addr:$sym)]>,
+                  Requires<[In64BitMode]>;
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions
+
+// X86 doesn't have 8-bit conditional moves. Use a customInserter to
+// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
+// however that requires promoting the operands, and can induce additional
+// i8 register pressure.
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
+def CMOV_GR8 : I<0, Pseudo,
+                 (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
+                 "#CMOV_GR8 PSEUDO!",
+                 [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
+                                          imm:$cond, EFLAGS))]>;
+
+let Predicates = [NoCMov] in {
+def CMOV_GR32 : I<0, Pseudo,
+                    (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
+                    "#CMOV_GR32* PSEUDO!",
+                    [(set GR32:$dst,
+                      (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
+def CMOV_GR16 : I<0, Pseudo,
+                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
+                    "#CMOV_GR16* PSEUDO!",
+                    [(set GR16:$dst,
+                      (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
+} // Predicates = [NoCMov]
+
+// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
+// SSE1.
+let Predicates = [FPStackf32] in
+def CMOV_RFP32 : I<0, Pseudo,
+                    (outs RFP32:$dst),
+                    (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
+                    "#CMOV_RFP32 PSEUDO!",
+                    [(set RFP32:$dst,
+                      (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
+                                                  EFLAGS))]>;
+// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
+// SSE2.
+let Predicates = [FPStackf64] in
+def CMOV_RFP64 : I<0, Pseudo,
+                    (outs RFP64:$dst),
+                    (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
+                    "#CMOV_RFP64 PSEUDO!",
+                    [(set RFP64:$dst,
+                      (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
+                                                  EFLAGS))]>;
+def CMOV_RFP80 : I<0, Pseudo,
+                    (outs RFP80:$dst),
+                    (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
+                    "#CMOV_RFP80 PSEUDO!",
+                    [(set RFP80:$dst,
+                      (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
+                                                  EFLAGS))]>;
+} // UsesCustomInserter = 1, Uses = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Atomic Instruction Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo atomic instructions
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> {
+  let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in {
+    let Defs = [EFLAGS, AL] in
+    def NAME#8  : I<0, Pseudo, (outs GR8:$dst),
+                    (ins i8mem:$ptr, GR8:$val),
+                    !strconcat(mnemonic, "8 PSEUDO!"), []>;
+    let Defs = [EFLAGS, AX] in
+    def NAME#16 : I<0, Pseudo,(outs GR16:$dst),
+                    (ins i16mem:$ptr, GR16:$val),
+                    !strconcat(mnemonic, "16 PSEUDO!"), []>;
+    let Defs = [EFLAGS, EAX] in
+    def NAME#32 : I<0, Pseudo, (outs GR32:$dst),
+                    (ins i32mem:$ptr, GR32:$val),
+                    !strconcat(mnemonic, "32 PSEUDO!"), []>;
+    let Defs = [EFLAGS, RAX] in
+    def NAME#64 : I<0, Pseudo, (outs GR64:$dst),
+                    (ins i64mem:$ptr, GR64:$val),
+                    !strconcat(mnemonic, "64 PSEUDO!"), []>;
+  }
+}
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> {
+  def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val),
+            (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>;
+  def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val),
+            (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>;
+  def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val),
+            (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>;
+  def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val),
+            (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>;
+}
+
+// Atomic exchange, and, or, xor
+defm ATOMAND  : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">;
+defm ATOMOR   : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">;
+defm ATOMXOR  : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">;
+defm ATOMMAX  : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">;
+defm ATOMMIN  : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">;
+
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND",  "atomic_load_and">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR",   "atomic_load_or">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR",  "atomic_load_xor">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX",  "atomic_load_max">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN",  "atomic_load_min">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">;
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> {
+  let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX],
+      mayLoad = 1, mayStore = 1, hasSideEffects = 0 in
+    def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+                      (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+                      !strconcat(mnemonic, "6432 PSEUDO!"), []>;
+}
+
+defm ATOMAND  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">;
+defm ATOMOR   : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">;
+defm ATOMXOR  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">;
+defm ATOMADD  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">;
+defm ATOMSUB  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">;
+defm ATOMMAX  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">;
+defm ATOMMIN  : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">;
+defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">;
+
+//===----------------------------------------------------------------------===//
+// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+// FIXME: Use normal instructions and add lock prefix dynamically.
+
+// Memory barriers
+
+// TODO: Get this to fold the constant into the instruction.
+let isCodeGenOnly = 1, Defs = [EFLAGS] in
+def OR32mrLocked  : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
+                      "or{l}\t{$zero, $dst|$dst, $zero}",
+                      [], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK,
+                    Sched<[WriteALULd, WriteRMW]>;
+
+let hasSideEffects = 1 in
+def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
+                     "#MEMBARRIER",
+                     [(X86MemBarrier)]>, Sched<[WriteLoad]>;
+
+// RegOpc corresponds to the mr version of the instruction
+// ImmOpc corresponds to the mi version of the instruction
+// ImmOpc8 corresponds to the mi8 version of the instruction
+// ImmMod corresponds to the instruction format of the mi and mi8 versions
+multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
+                           Format ImmMod, string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+    SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                  RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+                  MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+                  !strconcat(mnemonic, "{b}\t",
+                             "{$src2, $dst|$dst, $src2}"),
+                  [], IIC_ALU_NONMEM>, LOCK;
+def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                   MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                   !strconcat(mnemonic, "{w}\t",
+                              "{$src2, $dst|$dst, $src2}"),
+                   [], IIC_ALU_NONMEM>, OpSize, LOCK;
+def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                   MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                   !strconcat(mnemonic, "{l}\t",
+                              "{$src2, $dst|$dst, $src2}"),
+                   [], IIC_ALU_NONMEM>, LOCK;
+def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+                    MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                    !strconcat(mnemonic, "{q}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    [], IIC_ALU_NONMEM>, LOCK;
+
+def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                    ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+                    ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+                    !strconcat(mnemonic, "{b}\t",
+                               "{$src2, $dst|$dst, $src2}"),
+                    [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                      ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+                      !strconcat(mnemonic, "{w}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [], IIC_ALU_MEM>, OpSize, LOCK;
+
+def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                      ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+                      !strconcat(mnemonic, "{l}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [], IIC_ALU_MEM>, LOCK;
+
+def NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+                         ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+                         ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+                         !strconcat(mnemonic, "{q}\t",
+                                    "{$src2, $dst|$dst, $src2}"),
+                         [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                      ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+                      !strconcat(mnemonic, "{w}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [], IIC_ALU_MEM>, OpSize, LOCK;
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                      ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+                      !strconcat(mnemonic, "{l}\t",
+                                 "{$src2, $dst|$dst, $src2}"),
+                      [], IIC_ALU_MEM>, LOCK;
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+                       ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+                       !strconcat(mnemonic, "{q}\t",
+                                  "{$src2, $dst|$dst, $src2}"),
+                       [], IIC_ALU_MEM>, LOCK;
+
+}
+
+}
+
+defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
+defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
+defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
+
+// Optimized codegen when the non-memory output is not used.
+multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
+                          string mnemonic> {
+let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+    SchedRW = [WriteALULd, WriteRMW] in {
+
+def NAME#8m  : I<Opc8, Form, (outs), (ins i8mem :$dst),
+                 !strconcat(mnemonic, "{b}\t$dst"),
+                 [], IIC_UNARY_MEM>, LOCK;
+def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+                 !strconcat(mnemonic, "{w}\t$dst"),
+                 [], IIC_UNARY_MEM>, OpSize, LOCK;
+def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+                 !strconcat(mnemonic, "{l}\t$dst"),
+                 [], IIC_UNARY_MEM>, LOCK;
+def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+                  !strconcat(mnemonic, "{q}\t$dst"),
+                  [], IIC_UNARY_MEM>, LOCK;
+}
+}
+
+defm LOCK_INC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
+defm LOCK_DEC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
+
+// Atomic compare and swap.
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+                         SDPatternOperator frag, X86MemOperand x86memop,
+                         InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+  def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
+               !strconcat(mnemonic, "\t$ptr"),
+               [(frag addr:$ptr)], itin>, TB, LOCK;
+}
+}
+
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+                          string mnemonic, SDPatternOperator frag,
+                          InstrItinClass itin8, InstrItinClass itin> {
+let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
+  let Defs = [AL, EFLAGS], Uses = [AL] in
+  def NAME#8  : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+                  !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+  let Defs = [AX, EFLAGS], Uses = [AX] in
+  def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+                  !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK;
+  let Defs = [EAX, EFLAGS], Uses = [EAX] in
+  def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+                  !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK;
+  let Defs = [RAX, EFLAGS], Uses = [RAX] in
+  def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+                   !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+                   [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+}
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
+    SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
+                                X86cas8, i64mem,
+                                IIC_CMPX_LOCK_8B>;
+}
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+    Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+                                 X86cas16, i128mem,
+                                 IIC_CMPX_LOCK_16B>, REX_W;
+}
+
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
+                               X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+
+// Atomic exchange and add
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+                             string frag,
+                             InstrItinClass itin8, InstrItinClass itin> {
+  let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
+      SchedRW = [WriteALULd, WriteRMW] in {
+    def NAME#8  : I<opc8, MRMSrcMem, (outs GR8:$dst),
+                    (ins GR8:$val, i8mem:$ptr),
+                    !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+                    [(set GR8:$dst,
+                          (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+                    itin8>;
+    def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+                    (ins GR16:$val, i16mem:$ptr),
+                    !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+                    [(set
+                       GR16:$dst,
+                       (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+                    itin>, OpSize;
+    def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+                    (ins GR32:$val, i32mem:$ptr),
+                    !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+                    [(set
+                       GR32:$dst,
+                       (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+                    itin>;
+    def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+                     (ins GR64:$val, i64mem:$ptr),
+                     !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+                     [(set
+                        GR64:$dst,
+                        (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+                     itin>;
+  }
+}
+
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
+                               IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
+             TB, LOCK;
+
+def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR8:$dst,  (atomic_load_8  addr:$src))]>;
+def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
+def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
+def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
+                      "#ACQUIRE_MOV PSEUDO!",
+                      [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+
+def RELEASE_MOV8mr  : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_8  addr:$dst, GR8 :$src)]>;
+def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_16 addr:$dst, GR16:$src)]>;
+def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_32 addr:$dst, GR32:$src)]>;
+def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+                        "#RELEASE_MOV PSEUDO!",
+                        [(atomic_store_64 addr:$dst, GR64:$src)]>;
+
+//===----------------------------------------------------------------------===//
+// Conditional Move Pseudo Instructions.
+//===----------------------------------------------------------------------===//
+
+
+// CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let Uses = [EFLAGS], usesCustomInserter = 1 in {
+  def CMOV_FR32 : I<0, Pseudo,
+                    (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
+                    "#CMOV_FR32 PSEUDO!",
+                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
+                                                  EFLAGS))]>;
+  def CMOV_FR64 : I<0, Pseudo,
+                    (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
+                    "#CMOV_FR64 PSEUDO!",
+                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
+                                                  EFLAGS))]>;
+  def CMOV_V4F32 : I<0, Pseudo,
+                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V4F32 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V2F64 : I<0, Pseudo,
+                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2F64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V2I64 : I<0, Pseudo,
+                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2I64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V8F32 : I<0, Pseudo,
+                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+                    "#CMOV_V8F32 PSEUDO!",
+                    [(set VR256:$dst,
+                      (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V4F64 : I<0, Pseudo,
+                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+                    "#CMOV_V4F64 PSEUDO!",
+                    [(set VR256:$dst,
+                      (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V4I64 : I<0, Pseudo,
+                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+                    "#CMOV_V4I64 PSEUDO!",
+                    [(set VR256:$dst,
+                      (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+                                          EFLAGS)))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DAG Pattern Matching Rules
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i32 (X86Wrapper tconstpool  :$dst)), (MOV32ri tconstpool  :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable  :$dst)), (MOV32ri tjumptable  :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+          (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+          (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+          (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+          (ADD32ri GR32:$src1, texternalsym:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
+          (ADD32ri GR32:$src1, tblockaddress:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV32mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tblockaddress:$src)>;
+
+
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
+// code model mode, should use 'movabs'.  FIXME: This is really a hack, the
+//  'movabs' predicate should handle this sort of thing.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri tconstpool  :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri tjumptable  :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
+
+// In static codegen with small code model, we can get the address of a label
+// into a register with 'movl'.  FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri64i32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri64i32 tconstpool  :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri64i32 tjumptable  :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>;
+
+// In kernel code model, we can get the address of a label
+// into a register with 'movq'.  FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri32 should accept these.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri32 tconstpool  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri32 tjumptable  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// If we have small model and -static mode, it is safe to store global addresses
+// directly as immediates.  FIXME: This is really a hack, the 'imm' predicate
+// for MOV64mi32 should handle this sort of thing.
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tconstpool:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tjumptable:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, texternalsym:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tblockaddress:$src)>,
+          Requires<[NearData, IsStatic]>;
+
+
+
+// Calls
+
+// tls has some funny stuff here...
+// This corresponds to movabs $foo@tpoff, %rax
+def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
+          (MOV64ri tglobaltlsaddr :$dst)>;
+// This corresponds to add $foo@tpoff, %rax
+def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
+          (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
+
+
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+          (CALL64pcrel32 texternalsym:$dst)>;
+
+// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that is never used by the calling convention is
+// %r11. This happens when calling a vararg function with 6 arguments.
+//
+// Match an X86tcret that uses less than 7 volatile registers.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+                             (X86tcret node:$ptr, node:$off), [{
+  // X86tcret args: (*chain, ptr, imm, regs..., glue)
+  unsigned NumRegs = 0;
+  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+    if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+      return false;
+  return true;
+}]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+          (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
+          Requires<[In32BitMode]>;
+
+// FIXME: This is disabled for 32-bit PIC mode because the global base
+// register which is part of the address mode may be assigned a
+// callee-saved register.
+def : Pat<(X86tcret (load addr:$dst), imm:$off),
+          (TCRETURNmi addr:$dst, imm:$off)>,
+          Requires<[In32BitMode, IsNotPIC]>;
+
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi texternalsym:$dst, imm:$off)>,
+          Requires<[In32BitMode]>;
+
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+          (TCRETURNdi texternalsym:$dst, imm:$off)>,
+          Requires<[In32BitMode]>;
+
+def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
+          (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+// Don't fold loads into X86tcret requiring more than 6 regs.
+// There wouldn't be enough scratch registers for base+index.
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
+          (TCRETURNmi64 addr:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+          (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
+          Requires<[In64BitMode]>;
+
+// Normal calls, with various flavors of addresses.
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+          (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+          (CALLpcrel32 texternalsym:$dst)>;
+def : Pat<(X86call (i32 imm:$dst)),
+          (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(X86cmp GR8:$src1, 0),
+          (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+          (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+          (TEST32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(X86cmp GR64:$src1, 0),
+          (TEST64rr GR64:$src1, GR64:$src1)>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
+                  Instruction Inst64> {
+  let Predicates = [HasCMov] in {
+    def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
+              (Inst16 GR16:$src2, addr:$src1)>;
+    def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
+              (Inst32 GR32:$src2, addr:$src1)>;
+    def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
+              (Inst64 GR64:$src2, addr:$src1)>;
+  }
+}
+
+defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
+defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
+defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
+defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
+defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
+defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
+defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
+defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
+defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
+defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
+defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
+defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
+defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
+defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
+defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
+defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+
+// extload bool -> extload byte
+// When extloading from 16-bit and smaller memory locations into 64-bit
+// registers, use zero-extending loads so that the entire 64-bit register is
+// defined, avoiding partial-register updates.
+
+def : Pat<(extloadi8i1 addr:$src),   (MOV8rm      addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+def : Pat<(extloadi64i1 addr:$src),  (MOVZX64rm8  addr:$src)>;
+def : Pat<(extloadi64i8 addr:$src),  (MOVZX64rm8  addr:$src)>;
+def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+def : Pat<(extloadi64i32 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src),
+                         sub_32bit)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
+                                     (MOVZX32rr8 GR8 :$src), sub_16bit)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
+
+// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32.
+def : Pat<(i32 (anyext GR16:$src)),
+          (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8  GR8  :$src)>;
+def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>;
+def : Pat<(i64 (anyext GR32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+
+// Any instruction that defines a 32-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. And x86's cmov doesn't do anything if the
+// condition is false. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg &&
+         N->getOpcode() != X86ISD::CMOV;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
+// 3-addressified into an LEA instruction to avoid copies.  However, we also
+// want to finally emit these instructions as an or at the end of the code
+// generator to make the generated code easier to read.  To do this, we select
+// into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node is as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+  APInt KnownZero0, KnownOne0;
+  CurDAG->ComputeMaskedBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+  APInt KnownZero1, KnownOne1;
+  CurDAG->ComputeMaskedBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+  return (~KnownZero0 & ~KnownZero1) == 0;
+}]>;
+
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+// Try this before the selecting to OR.
+let AddedComplexity = 5, SchedRW = [WriteALU] in {
+
+let isConvertibleToThreeAddress = 1,
+    Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+let isCommutable = 1 in {
+def ADD16rr_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "", // orw/addw REG, REG
+                    [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                    "", // orl/addl REG, REG
+                    [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB  : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                    "", // orq/addq REG, REG
+                    [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+} // isCommutable
+
+// NOTE: These are order specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
+
+def ADD16ri8_DB : I<0, Pseudo,
+                    (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                    "", // orw/addw REG, imm8
+                    [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
+def ADD16ri_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "", // orw/addw REG, imm
+                    [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
+
+def ADD32ri8_DB : I<0, Pseudo,
+                    (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                    "", // orl/addl REG, imm8
+                    [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
+def ADD32ri_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                    "", // orl/addl REG, imm
+                    [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
+
+
+def ADD64ri8_DB : I<0, Pseudo,
+                    (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+                    "", // orq/addq REG, imm8
+                    [(set GR64:$dst, (or_is_add GR64:$src1,
+                                                i64immSExt8:$src2))]>;
+def ADD64ri32_DB : I<0, Pseudo,
+                     (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+                      "", // orq/addq REG, imm
+                      [(set GR64:$dst, (or_is_add GR64:$src1,
+                                                  i64immSExt32:$src2))]>;
+}
+} // AddedComplexity, SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR16:$src1, 128),
+          (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+          (SUB16mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR32:$src1, 128),
+          (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+          (SUB32mi8 addr:$dst, -128)>;
+
+def : Pat<(add GR64:$src1, 128),
+          (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+          (SUB64mi8 addr:$dst, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+          (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+def : Pat<(store (add (loadi64 addr:$dst), 0x00000000800000000), addr:$dst),
+          (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// To avoid needing to materialize an immediate in a register, use a 32-bit and
+// with implicit zero-extension instead of a 64-bit and if the immediate has at
+// least 32 bits of leading zeros. If in addition the last 32 bits can be
+// represented with a sign extension of a 8 bit constant, use that.
+
+def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
+          (SUBREG_TO_REG
+            (i64 0),
+            (AND32ri8
+              (EXTRACT_SUBREG GR64:$src, sub_32bit),
+              (i32 (GetLo8XForm imm:$imm))),
+            sub_32bit)>;
+
+def : Pat<(and GR64:$src, i64immZExt32:$imm),
+          (SUBREG_TO_REG
+            (i64 0),
+            (AND32ri
+              (EXTRACT_SUBREG GR64:$src, sub_32bit),
+              (i32 (GetLo32XForm imm:$imm))),
+            sub_32bit)>;
+
+
+// r & (2^16-1) ==> movz
+def : Pat<(and GR32:$src1, 0xffff),
+          (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
+                                                             GR32_ABCD)),
+                                      sub_8bit))>,
+      Requires<[In32BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+           (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
+            (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
+             sub_16bit)>,
+      Requires<[In32BitMode]>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+          (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+          (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+          (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+           (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
+      Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+           (EXTRACT_SUBREG (MOVZX32rr8 (i8
+            (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
+      Requires<[In64BitMode]>;
+
+
+// sext_inreg patterns
+def : Pat<(sext_inreg GR32:$src, i16),
+          (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+          (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                             GR32_ABCD)),
+                                      sub_8bit))>,
+      Requires<[In32BitMode]>;
+
+def : Pat<(sext_inreg GR16:$src, i8),
+           (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
+            (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
+             sub_16bit)>,
+      Requires<[In32BitMode]>;
+
+def : Pat<(sext_inreg GR64:$src, i32),
+          (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+          (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+          (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+          (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
+      Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+           (EXTRACT_SUBREG (MOVSX32rr8
+            (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
+      Requires<[In64BitMode]>;
+
+// sext, sext_load, zext, zext_load
+def: Pat<(i16 (sext GR8:$src)),
+          (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(sextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
+def: Pat<(i16 (zext GR8:$src)),
+          (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
+def: Pat<(zextloadi16i8 addr:$src),
+          (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
+
+// trunc patterns
+def : Pat<(i16 (trunc GR32:$src)),
+          (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                          sub_8bit)>,
+      Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                          sub_8bit)>,
+      Requires<[In32BitMode]>;
+def : Pat<(i32 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+          (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
+      Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
+      Requires<[In64BitMode]>;
+
+// h-register tricks
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                          sub_8bit_hi)>,
+      Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                          sub_8bit_hi)>,
+      Requires<[In32BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+          (EXTRACT_SUBREG
+            (MOVZX32rr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_16bit)>,
+      Requires<[In32BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+                                                             GR16_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[In32BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
+                                                             GR16_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[In32BitMode]>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                             GR32_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[In32BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                             GR32_ABCD)),
+                                      sub_8bit_hi))>,
+      Requires<[In32BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+                              sub_8bit_hi)),
+            sub_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
+          (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
+                                                                   GR32_ABCD)),
+                                             sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+          (EXTRACT_SUBREG
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_16bit)>,
+      Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              sub_8bit_hi)),
+            sub_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+                            sub_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            sub_8bit_hi))>,
+      Requires<[In64BitMode]>;
+
+
+// (shl x, 1) ==> (add x, x)
+// Note that if x is undef (immediate or otherwise), we could theoretically
+// end up with the two uses of x getting different values, producing a result
+// where the least significant bit is not 0. However, the probability of this
+// happening is considered low enough that this is officially not a
+// "real problem".
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// Helper imms that check if a mask doesn't change significant shift bits.
+def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
+def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
+
+// (shl x (and y, 31)) ==> (shl x, y)
+def : Pat<(shl GR8:$src1, (and CL, immShift32)),
+          (SHL8rCL GR8:$src1)>;
+def : Pat<(shl GR16:$src1, (and CL, immShift32)),
+          (SHL16rCL GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (and CL, immShift32)),
+          (SHL32rCL GR32:$src1)>;
+def : Pat<(store (shl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+          (SHL8mCL addr:$dst)>;
+def : Pat<(store (shl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+          (SHL16mCL addr:$dst)>;
+def : Pat<(store (shl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+          (SHL32mCL addr:$dst)>;
+
+def : Pat<(srl GR8:$src1, (and CL, immShift32)),
+          (SHR8rCL GR8:$src1)>;
+def : Pat<(srl GR16:$src1, (and CL, immShift32)),
+          (SHR16rCL GR16:$src1)>;
+def : Pat<(srl GR32:$src1, (and CL, immShift32)),
+          (SHR32rCL GR32:$src1)>;
+def : Pat<(store (srl (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+          (SHR8mCL addr:$dst)>;
+def : Pat<(store (srl (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+          (SHR16mCL addr:$dst)>;
+def : Pat<(store (srl (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+          (SHR32mCL addr:$dst)>;
+
+def : Pat<(sra GR8:$src1, (and CL, immShift32)),
+          (SAR8rCL GR8:$src1)>;
+def : Pat<(sra GR16:$src1, (and CL, immShift32)),
+          (SAR16rCL GR16:$src1)>;
+def : Pat<(sra GR32:$src1, (and CL, immShift32)),
+          (SAR32rCL GR32:$src1)>;
+def : Pat<(store (sra (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
+          (SAR8mCL addr:$dst)>;
+def : Pat<(store (sra (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
+          (SAR16mCL addr:$dst)>;
+def : Pat<(store (sra (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+          (SAR32mCL addr:$dst)>;
+
+// (shl x (and y, 63)) ==> (shl x, y)
+def : Pat<(shl GR64:$src1, (and CL, immShift64)),
+          (SHL64rCL GR64:$src1)>;
+def : Pat<(store (shl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+          (SHL64mCL addr:$dst)>;
+
+def : Pat<(srl GR64:$src1, (and CL, immShift64)),
+          (SHR64rCL GR64:$src1)>;
+def : Pat<(store (srl (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+          (SHR64mCL addr:$dst)>;
+
+def : Pat<(sra GR64:$src1, (and CL, immShift64)),
+          (SAR64rCL GR64:$src1)>;
+def : Pat<(store (sra (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+          (SAR64mCL addr:$dst)>;
+
+
+// (anyext (setcc_carry)) -> (setcc_carry)
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+
+
+
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+
+// add reg, reg
+def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// add reg, mem
+def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
+          (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
+          (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
+          (ADD32rm GR32:$src1, addr:$src2)>;
+
+// add reg, imm
+def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri  GR8:$src1 , imm:$src2)>;
+def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(add GR16:$src1, i16immSExt8:$src2),
+          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(add GR32:$src1, i32immSExt8:$src2),
+          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub reg, reg
+def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// sub reg, mem
+def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
+          (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
+          (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
+          (SUB32rm GR32:$src1, addr:$src2)>;
+
+// sub reg, imm
+def : Pat<(sub GR8:$src1, imm:$src2),
+          (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, imm:$src2),
+          (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(sub GR32:$src1, imm:$src2),
+          (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
+          (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
+          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// sub 0, reg
+def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r  GR8 :$src)>;
+def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
+def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
+def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+
+// mul reg, reg
+def : Pat<(mul GR16:$src1, GR16:$src2),
+          (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(mul GR32:$src1, GR32:$src2),
+          (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// mul reg, mem
+def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
+          (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
+          (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// mul reg, imm
+def : Pat<(mul GR16:$src1, imm:$src2),
+          (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(mul GR32:$src1, imm:$src2),
+          (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
+          (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
+          (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// reg = mul mem, imm
+def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
+          (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
+          (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
+          (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
+          (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Patterns for nodes that do not produce flags, for instructions that do.
+
+// addition
+def : Pat<(add GR64:$src1, GR64:$src2),
+          (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt8:$src2),
+          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(add GR64:$src1, i64immSExt32:$src2),
+          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
+          (ADD64rm GR64:$src1, addr:$src2)>;
+
+// subtraction
+def : Pat<(sub GR64:$src1, GR64:$src2),
+          (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
+          (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
+          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Multiply
+def : Pat<(mul GR64:$src1, GR64:$src2),
+          (IMUL64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
+          (IMUL64rm GR64:$src1, addr:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
+          (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
+          (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
+          (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
+          (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// Increment reg.
+def : Pat<(add GR8 :$src, 1), (INC8r     GR8 :$src)>;
+def : Pat<(add GR16:$src, 1), (INC16r    GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR16:$src, 1), (INC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR32:$src, 1), (INC32r    GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src, 1), (INC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR64:$src, 1), (INC64r    GR64:$src)>;
+
+// Decrement reg.
+def : Pat<(add GR8 :$src, -1), (DEC8r     GR8 :$src)>;
+def : Pat<(add GR16:$src, -1), (DEC16r    GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR16:$src, -1), (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR32:$src, -1), (DEC32r    GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(add GR32:$src, -1), (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(add GR64:$src, -1), (DEC64r    GR64:$src)>;
+
+// or reg/reg.
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
+
+// or reg/mem
+def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
+          (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
+          (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
+          (OR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
+          (OR64rm GR64:$src1, addr:$src2)>;
+
+// or reg/imm
+def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri  GR8 :$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(or GR16:$src1, i16immSExt8:$src2),
+          (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(or GR32:$src1, i32immSExt8:$src2),
+          (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt8:$src2),
+          (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(or GR64:$src1, i64immSExt32:$src2),
+          (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// xor reg/reg
+def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
+
+// xor reg/mem
+def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
+          (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
+          (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
+          (XOR32rm GR32:$src1, addr:$src2)>;
+def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
+          (XOR64rm GR64:$src1, addr:$src2)>;
+
+// xor reg/imm
+def : Pat<(xor GR8:$src1, imm:$src2),
+          (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, imm:$src2),
+          (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(xor GR32:$src1, imm:$src2),
+          (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
+          (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
+          (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
+          (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
+          (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// and reg/reg
+def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr  GR8 :$src1, GR8 :$src2)>;
+def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
+
+// and reg/mem
+def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
+          (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
+          (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
+          (AND32rm GR32:$src1, addr:$src2)>;
+def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
+          (AND64rm GR64:$src1, addr:$src2)>;
+
+// and reg/imm
+def : Pat<(and GR8:$src1, imm:$src2),
+          (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, imm:$src2),
+          (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(and GR32:$src1, imm:$src2),
+          (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(and GR16:$src1, i16immSExt8:$src2),
+          (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(and GR32:$src1, i32immSExt8:$src2),
+          (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt8:$src2),
+          (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(and GR64:$src1, i64immSExt32:$src2),
+          (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Bit scan instruction patterns to match explicit zero-undef behavior.
+def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
+def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
+def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
+def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
+def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
new file mode 100644
index 000000000000..0e696513d47c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -0,0 +1,287 @@
+//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 jump, return, call, and related instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
+  def RET    : I   <0xC3, RawFrm, (outs), (ins variable_ops),
+                    "ret",
+                    [(X86retflag 0)], IIC_RET>;
+  def RETW   : I   <0xC3, RawFrm, (outs), (ins),
+                    "ret{w}",
+                    [], IIC_RET>, OpSize;
+  def RETI   : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+                    "ret\t$amt",
+                    [(X86retflag timm:$amt)], IIC_RET_IMM>;
+  def RETIW  : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+                    "ret{w}\t$amt",
+                    [], IIC_RET_IMM>, OpSize;
+  def LRETL  : I   <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{l|f}", [], IIC_RET>;
+  def LRETW  : I   <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{w|f}", [], IIC_RET>, OpSize;
+  def LRETQ  : RI  <0xCB, RawFrm, (outs), (ins),
+                    "{l}ret{q|f}", [], IIC_RET>;
+  def LRETI  : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{l|f}\t$amt", [], IIC_RET>;
+  def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize;
+}
+
+// Unconditional branches.
+let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+  def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+                        "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+  def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
+                       "jmp\t$dst", [], IIC_JMP_REL>;
+  // FIXME : Intel syntax for JMP64pcrel32 such that it is not ambiguious
+  // with JMP_1.
+  def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
+                       "jmpq\t$dst", [], IIC_JMP_REL>;
+}
+
+// Conditional Branches.
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
+  multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+    def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
+                       IIC_Jcc>;
+    def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
+                       [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB;
+  }
+}
+
+defm JO  : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
+defm JNO : ICBr<0x71, 0x81, "jno\t$dst" , X86_COND_NO>;
+defm JB  : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
+defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
+defm JE  : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
+defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
+defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
+defm JA  : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
+defm JS  : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
+defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
+defm JP  : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
+defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
+defm JL  : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
+defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
+defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
+defm JG  : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
+
+// jcx/jecx/jrcx instructions.
+let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+  // These are the 32-bit versions of this instruction for the asmparser.  In
+  // 32-bit mode, the address size prefix is jcxz and the unprefixed version is
+  // jecxz.
+  let Uses = [CX] in
+    def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                        "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In32BitMode]>;
+  let Uses = [ECX] in
+    def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                           "jecxz\t$dst", [], IIC_JCXZ>, Requires<[In32BitMode]>;
+
+  // J*CXZ instruction: 64-bit versions of this instruction for the asmparser.
+  // In 64-bit mode, the address size prefix is jecxz and the unprefixed version
+  // is jrcxz.
+  let Uses = [ECX] in
+    def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                            "jecxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In64BitMode]>;
+  let Uses = [RCX] in
+    def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
+                           "jrcxz\t$dst", [], IIC_JCXZ>, Requires<[In64BitMode]>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def JMP32r     : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+                     [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>,
+                   Sched<[WriteJump]>;
+  def JMP32m     : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+                     [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
+                   Requires<[In32BitMode]>, Sched<[WriteJumpLd]>;
+
+  def JMP64r     : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+                     [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
+                   Sched<[WriteJump]>;
+  def JMP64m     : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+                     [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
+                   Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
+
+  def FARJMP16i  : Iseg16<0xEA, RawFrmImm16, (outs),
+                          (ins i16imm:$off, i16imm:$seg),
+                          "ljmp{w}\t{$seg, $off|$off, $seg}", [],
+                          IIC_JMP_FAR_PTR>, OpSize, Sched<[WriteJump]>;
+  def FARJMP32i  : Iseg32<0xEA, RawFrmImm16, (outs),
+                          (ins i32imm:$off, i16imm:$seg),
+                          "ljmp{l}\t{$seg, $off|$off, $seg}", [],
+                          IIC_JMP_FAR_PTR>, Sched<[WriteJump]>;
+  def FARJMP64   : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+                      "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+                   Sched<[WriteJump]>;
+
+  def FARJMP16m  : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
+                     "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize,
+                   Sched<[WriteJumpLd]>;
+  def FARJMP32m  : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+                     "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
+                   Sched<[WriteJumpLd]>;
+}
+
+
+// Loop instructions
+let SchedRW = [WriteJump] in {
+def LOOP   : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
+def LOOPE  : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
+def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. ESP is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Uses = [ESP] in {
+    def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
+                           (outs), (ins i32imm_pcrel:$dst),
+                           "call{l}\t$dst", [], IIC_CALL_RI>,
+                      Requires<[In32BitMode]>, Sched<[WriteJump]>;
+    def CALL32r     : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
+                        "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
+                      Requires<[In32BitMode]>, Sched<[WriteJump]>;
+    def CALL32m     : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
+                        "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
+                        IIC_CALL_MEM>,
+                      Requires<[In32BitMode,FavorMemIndirectCall]>,
+                      Sched<[WriteJumpLd]>;
+
+    def FARCALL16i  : Iseg16<0x9A, RawFrmImm16, (outs),
+                             (ins i16imm:$off, i16imm:$seg),
+                             "lcall{w}\t{$seg, $off|$off, $seg}", [],
+                             IIC_CALL_FAR_PTR>, OpSize, Sched<[WriteJump]>;
+    def FARCALL32i  : Iseg32<0x9A, RawFrmImm16, (outs),
+                             (ins i32imm:$off, i16imm:$seg),
+                             "lcall{l}\t{$seg, $off|$off, $seg}", [],
+                             IIC_CALL_FAR_PTR>, Sched<[WriteJump]>;
+
+    def FARCALL16m  : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+                        "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize,
+                      Sched<[WriteJumpLd]>;
+    def FARCALL32m  : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+                        "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>,
+                      Sched<[WriteJumpLd]>;
+
+    // callw for 16 bit code for the assembler.
+    let isAsmParserOnly = 1 in
+      def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
+                       (outs), (ins i16imm_pcrel:$dst),
+                       "callw\t$dst", []>, OpSize;
+  }
+
+
+// Tail call stuff.
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+  let Uses = [ESP] in {
+  def TCRETURNdi : PseudoI<(outs),
+                     (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
+  def TCRETURNri : PseudoI<(outs),
+                     (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+  let mayLoad = 1 in
+  def TCRETURNmi : PseudoI<(outs),
+                     (ins i32mem_TC:$dst, i32imm:$offset), []>;
+
+  // FIXME: The should be pseudo instructions that are lowered when going to
+  // mcinst.
+  def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
+                           (ins i32imm_pcrel:$dst),
+                           "jmp\t$dst  # TAILCALL",
+                           [], IIC_JMP_REL>;
+  def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+                   "", [], IIC_JMP_REG>;  // FIXME: Remove encoding when JIT is dead.
+  let mayLoad = 1 in
+  def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
+                   "jmp{l}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+
+// RSP is marked as a use to prevent stack-pointer assignments that appear
+// immediately before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
+  // NOTE: this pattern doesn't match "X86call imm", because we do not know
+  // that the offset between an arbitrary immediate and the call will fit in
+  // the 32-bit pcrel field that we have.
+  def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
+                        (outs), (ins i64i32imm_pcrel:$dst),
+                        "call{q}\t$dst", [], IIC_CALL_RI>,
+                      Requires<[In64BitMode]>;
+  def CALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
+                        "call{q}\t{*}$dst", [(X86call GR64:$dst)],
+                        IIC_CALL_RI>,
+                      Requires<[In64BitMode]>;
+  def CALL64m       : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
+                        "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
+                        IIC_CALL_MEM>,
+                      Requires<[In64BitMode,FavorMemIndirectCall]>;
+
+  def FARCALL64   : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+                       "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
+}
+
+let isCall = 1, isCodeGenOnly = 1 in
+  // __chkstk(MSVC):     clobber R10, R11 and EFLAGS.
+  // ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP.
+  let Defs = [RAX, R10, R11, RSP, EFLAGS],
+      Uses = [RSP] in {
+    def W64ALLOCA : Ii32PCRel<0xE8, RawFrm,
+                      (outs), (ins i64i32imm_pcrel:$dst),
+                      "call{q}\t$dst", [], IIC_CALL_RI>,
+                    Requires<[IsWin64]>, Sched<[WriteJump]>;
+  }
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
+    isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
+    SchedRW = [WriteJump] in {
+  def TCRETURNdi64 : PseudoI<(outs),
+                      (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+                      []>;
+  def TCRETURNri64 : PseudoI<(outs),
+                      (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+  let mayLoad = 1 in
+  def TCRETURNmi64 : PseudoI<(outs),
+                       (ins i64mem_TC:$dst, i32imm:$offset), []>;
+
+  def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs),
+                                      (ins i64i32imm_pcrel:$dst),
+                   "jmp\t$dst  # TAILCALL", [], IIC_JMP_REL>;
+  def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
+                     "jmp{q}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+
+  let mayLoad = 1 in
+  def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
+                     "jmp{q}\t{*}$dst  # TAILCALL", [], IIC_JMP_MEM>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
new file mode 100644
index 000000000000..6dc7175357b3
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td
@@ -0,0 +1,186 @@
+//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the sign and zero extension operations.
+//
+//===----------------------------------------------------------------------===//
+
+let neverHasSideEffects = 1 in {
+  let Defs = [AX], Uses = [AL] in
+  def CBW : I<0x98, RawFrm, (outs), (ins),
+              "{cbtw|cbw}", []>, OpSize;   // AX = signext(AL)
+  let Defs = [EAX], Uses = [AX] in
+  def CWDE : I<0x98, RawFrm, (outs), (ins),
+              "{cwtl|cwde}", []>;   // EAX = signext(AX)
+
+  let Defs = [AX,DX], Uses = [AX] in
+  def CWD : I<0x99, RawFrm, (outs), (ins),
+              "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX)
+  let Defs = [EAX,EDX], Uses = [EAX] in
+  def CDQ : I<0x99, RawFrm, (outs), (ins),
+              "{cltd|cdq}", []>; // EDX:EAX = signext(EAX)
+
+
+  let Defs = [RAX], Uses = [EAX] in
+  def CDQE : RI<0x98, RawFrm, (outs), (ins),
+               "{cltq|cdqe}", []>;     // RAX = signext(EAX)
+
+  let Defs = [RAX,RDX], Uses = [RAX] in
+  def CQO  : RI<0x99, RawFrm, (outs), (ins),
+                "{cqto|cqo}", []>; // RDX:RAX = signext(RAX)
+}
+
+
+
+// Sign/Zero extenders
+let neverHasSideEffects = 1 in {
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>,
+                   TB, OpSize, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>,
+                   TB, OpSize, Sched<[WriteALULd]>;
+} // neverHasSideEffects = 1
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src),
+                   "movs{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+                   Sched<[WriteALU]>;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+                   "movs{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i8 addr:$src))], IIC_MOVSX>, TB,
+                   Sched<[WriteALULd]>;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+                   "movs{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+                   Sched<[WriteALU]>;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+                   "movs{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>,
+                   TB, Sched<[WriteALULd]>;
+
+let neverHasSideEffects = 1 in {
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>,
+                   TB, OpSize, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                   "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>,
+                   TB, OpSize, Sched<[WriteALULd]>;
+} // neverHasSideEffects = 1
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+                   "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+                   Sched<[WriteALU]>;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+                   "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i8 addr:$src))], IIC_MOVZX>, TB,
+                   Sched<[WriteALULd]>;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+                   "movz{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+                   Sched<[WriteALU]>;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+                   "movz{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i16 addr:$src))], IIC_MOVZX>,
+                   TB, Sched<[WriteALULd]>;
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
+let neverHasSideEffects = 1, isCodeGenOnly = 1 in {
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+                         (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                         [], IIC_MOVZX>, TB, Sched<[WriteALU]>;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+                         (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                         [], IIC_MOVZX>, TB, Sched<[WriteALULd]>;
+}
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR8:$src))], IIC_MOVSX>, TB,
+                    Sched<[WriteALU]>;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i8 addr:$src))], IIC_MOVSX>,
+                    TB, Sched<[WriteALULd]>;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR16:$src))], IIC_MOVSX>, TB,
+                    Sched<[WriteALU]>;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i16 addr:$src))], IIC_MOVSX>,
+                    TB, Sched<[WriteALULd]>;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR32:$src))], IIC_MOVSX>,
+                    Sched<[WriteALU]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i32 addr:$src))], IIC_MOVSX>,
+                    Sched<[WriteALULd]>;
+
+// movzbq and movzwq encodings for the disassembler
+def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+                       "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALU]>;
+def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+                       "movz{bq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALULd]>;
+def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                       "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALU]>;
+def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                       "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
+                       TB, Sched<[WriteALULd]>;
+
+// FIXME: These should be Pat patterns.
+let isCodeGenOnly = 1 in {
+
+// Use movzbl instead of movzbq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+                   "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
+                   Sched<[WriteALU]>;
+def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+                   "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>,
+                   TB, Sched<[WriteALULd]>;
+// Use movzwl instead of movzwq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                   "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
+                   Sched<[WriteALU]>;
+def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                   "", [(set GR64:$dst, (zextloadi64i16 addr:$src))],
+                   IIC_MOVZX>, TB, Sched<[WriteALULd]>;
+
+// There's no movzlq instruction, but movl can be used for this purpose, using
+// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
+// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
+// zero-extension, however this isn't possible when the 32-bit value is
+// defined by a truncate or is copied from something where the high bits aren't
+// necessarily all zero. In such cases, we fall back to these explicit zext
+// instructions.
+def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
+                    "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>,
+                    Sched<[WriteALU]>;
+def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+                    "", [(set GR64:$dst, (zextloadi64i32 addr:$src))],
+                    IIC_MOVZX>, Sched<[WriteALULd]>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
new file mode 100644
index 000000000000..7759a8a2dabb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
@@ -0,0 +1,368 @@
+//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes FMA (Fused Multiply-Add) instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FMA3 - Intel 3 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst" in {
+multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
+                    PatFrag MemFrag128, PatFrag MemFrag256,
+                    ValueType OpVT128, ValueType OpVT256,
+                    SDPatternOperator Op = null_frag> {
+  let isCommutable = 1 in
+  def r     : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
+                                               VR128:$src1, VR128:$src3)))]>;
+
+  let mayLoad = 1 in
+  def m     : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
+                                               (MemFrag128 addr:$src3))))]>;
+
+  let isCommutable = 1 in
+  def rY    : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+                   (ins VR256:$src1, VR256:$src2, VR256:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
+                                               VR256:$src3)))]>, VEX_L;
+
+  let mayLoad = 1 in
+  def mY    : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+                   (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR256:$dst,
+                     (OpVT256 (Op VR256:$src2, VR256:$src1,
+                               (MemFrag256 addr:$src3))))]>, VEX_L;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                       string OpcodeStr, string PackTy,
+                       PatFrag MemFrag128, PatFrag MemFrag256,
+                       SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+  defm r213 : fma3p_rm<opc213,
+                       !strconcat(OpcodeStr, "213", PackTy),
+                       MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+let neverHasSideEffects = 1 in {
+  defm r132 : fma3p_rm<opc132,
+                       !strconcat(OpcodeStr, "132", PackTy),
+                       MemFrag128, MemFrag256, OpTy128, OpTy256>;
+  defm r231 : fma3p_rm<opc231,
+                       !strconcat(OpcodeStr, "231", PackTy),
+                       MemFrag128, MemFrag256, OpTy128, OpTy256>;
+} // neverHasSideEffects = 1
+}
+
+// Fused Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADDPS    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
+                                 memopv8f32, X86Fmadd, v4f32, v8f32>;
+  defm VFMSUBPS    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
+                                 memopv8f32, X86Fmsub, v4f32, v8f32>;
+  defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
+                                 memopv4f32, memopv8f32, X86Fmaddsub,
+                                 v4f32, v8f32>;
+  defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
+                                 memopv4f32, memopv8f32, X86Fmsubadd,
+                                 v4f32, v8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADDPD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
+                                 memopv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
+  defm VFMSUBPD    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
+                                 memopv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
+  defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
+                                 memopv2f64, memopv4f64, X86Fmaddsub,
+                                 v2f64, v4f64>, VEX_W;
+  defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
+                                 memopv2f64, memopv4f64, X86Fmsubadd,
+                                 v2f64, v4f64>, VEX_W;
+}
+
+// Fused Negative Multiply-Add
+let ExeDomain = SSEPackedSingle in {
+  defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps",  memopv4f32,
+                               memopv8f32, X86Fnmadd, v4f32, v8f32>;
+  defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps",  memopv4f32,
+                               memopv8f32, X86Fnmsub, v4f32, v8f32>;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
+                               memopv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+  defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
+                               memopv2f64, memopv4f64, X86Fnmsub, v2f64,
+                               v4f64>, VEX_W;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
+                    RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
+                    SDPatternOperator OpNode = null_frag> {
+  let isCommutable = 1 in
+  def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+  let mayLoad = 1 in
+  def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, x86memop:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1,
+                            (mem_frag addr:$src3))))]>;
+}
+
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
+                        ComplexPattern mem_cpat, Intrinsic IntId,
+                        RegisterClass RC> {
+  let isCommutable = 1 in
+  def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1,
+                     VR128:$src3))]>;
+  def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, memop:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst,
+                     (IntId VR128:$src2, VR128:$src1, mem_cpat:$src3))]>;
+}
+} // Constraints = "$src1 = $dst"
+
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                       string OpStr, string PackTy, Intrinsic Int,
+                       SDNode OpNode, RegisterClass RC, ValueType OpVT,
+                       X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
+                       ComplexPattern mem_cpat> {
+let neverHasSideEffects = 1 in {
+  defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
+                       x86memop, RC, OpVT, mem_frag>;
+  defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
+                       x86memop, RC, OpVT, mem_frag>;
+}
+
+defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+                     x86memop, RC, OpVT, mem_frag, OpNode>,
+            fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+                         memop, mem_cpat, Int, RC>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+                 string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+                 SDNode OpNode> {
+  defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", IntF32, OpNode,
+                        FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
+  defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", IntF64, OpNode,
+                        FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
+}
+
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+                    int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+                    int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+                     int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+                     int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+
+
+//===----------------------------------------------------------------------===//
+// FMA4 - AMD 4 operand Fused Multiply-Add instructions
+//===----------------------------------------------------------------------===//
+
+
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                 X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+                 PatFrag mem_frag> {
+  let isCommutable = 1 in
+  def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, RC:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set RC:$dst,
+             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4;
+  def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, x86memop:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+                           (mem_frag addr:$src3)))]>, VEX_W, MemOp4;
+  def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, x86memop:$src2, RC:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set RC:$dst,
+             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
+// For disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0 in
+  def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+               (ins RC:$src1, RC:$src2, RC:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+}
+
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+                     ComplexPattern mem_cpat, Intrinsic Int> {
+  let isCommutable = 1 in
+  def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+  def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, memop:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+                                  mem_cpat:$src3))]>, VEX_W, MemOp4;
+  def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, memop:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+               [(set VR128:$dst,
+                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                 ValueType OpVT128, ValueType OpVT256,
+                 PatFrag ld_frag128, PatFrag ld_frag256> {
+  let isCommutable = 1 in
+  def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+           VEX_W, MemOp4;
+  def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
+                              (ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
+  def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+  let isCommutable = 1 in
+  def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst,
+             (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+           VEX_W, MemOp4, VEX_L;
+  def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
+                              (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L;
+  def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+           (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst, (OpNode VR256:$src1,
+                              (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
+// For disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
+  def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2, VR128:$src3),
+               !strconcat(OpcodeStr,
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+  def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+                (ins VR256:$src1, VR256:$src2, VR256:$src3),
+                !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+                VEX_L;
+} // isCodeGenOnly = 1
+}
+
+defm VFMADDSS4  : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+                  fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+                            int_x86_fma_vfmadd_ss>;
+defm VFMADDSD4  : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+                  fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfmadd_sd>;
+defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+                  fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+                            int_x86_fma_vfmsub_ss>;
+defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+                  fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfmsub_sd>;
+defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+                        X86Fnmadd, loadf32>,
+                  fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+                            int_x86_fma_vfnmadd_ss>;
+defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+                        X86Fnmadd, loadf64>,
+                  fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfnmadd_sd>;
+defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+                        X86Fnmsub, loadf32>,
+                  fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+                            int_x86_fma_vfnmsub_ss>;
+defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+                        X86Fnmsub, loadf64>,
+                  fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+                            int_x86_fma_vfnmsub_sd>;
+
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+                            memopv4f32, memopv8f32>;
+  defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+                            memopv4f32, memopv8f32>;
+  defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+                            memopv4f32, memopv8f32>;
+  defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+                            memopv4f32, memopv8f32>;
+  defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+                            memopv4f32, memopv8f32>;
+  defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+                            memopv4f32, memopv8f32>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+                            memopv2f64, memopv4f64>;
+  defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+                            memopv2f64, memopv4f64>;
+  defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+                            memopv2f64, memopv4f64>;
+  defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+                            memopv2f64, memopv4f64>;
+  defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+                            memopv2f64, memopv4f64>;
+  defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+                            memopv2f64, memopv4f64>;
+}
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 000000000000..2224a08d59f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,692 @@
+//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet2    : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, 
+                                           SDTCisVT<1, f80>]>;
+def SDTX86Fld       : SDTypeProfile<1, 2, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst       : SDTypeProfile<0, 3, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild      : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fnstsw    : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+def SDTX86FpToIMem  : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore  : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld          : SDNode<"X86ISD::FLD", SDTX86Fld,
+                             [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fst          : SDNode<"X86ISD::FST", SDTX86Fst,
+                             [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+                              SDNPMemOperand]>;
+def X86fild         : SDNode<"X86ISD::FILD", SDTX86Fild,
+                             [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86fildflag     : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+                             [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
+                              SDNPMemOperand]>;
+def X86fp_stsw      : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m",          SDTX86CwdStore,
+                             [SDNPHasChain, SDNPMayStore, SDNPSideEffect,
+                              SDNPMemOperand]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomInserter = 1 in {  // Expanded after instruction selection.
+  def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
+                              [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
+                              [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src),
+                              [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+  def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src),
+                              [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src),
+                              [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src),
+                              [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+  def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src),
+                              [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+  def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src),
+                              [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+  def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
+                              [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+// All FP Stack operations are represented with four instructions here.  The
+// first three instructions, generated by the instruction selector, use "RFP32"
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values.  These sizes apply to the values, 
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information.  These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The second instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler.  These use "RST" registers, although frequently
+// the actual register(s) used are implicit.  These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation 
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// Pseudo Instruction for FP stack return values.
+def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>;
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+                [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+                [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+                [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m  : FpIf32<(outs RFP32:$dst), 
+                     (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP32:$dst, 
+                    (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m  : FpIf64<(outs RFP64:$dst), 
+                     (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpIf64<(outs RFP64:$dst), 
+                     (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst), 
+                   (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP80:$dst, 
+                    (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst), 
+                   (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+                  [(set RFP80:$dst, 
+                    (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
+def _F32m  : FPI<0xD8, fp, (outs), (ins f32mem:$src), 
+                 !strconcat("f", asmstring, "{s}\t$src")> { 
+  let mayLoad = 1; 
+}
+def _F64m  : FPI<0xDC, fp, (outs), (ins f64mem:$src), 
+                 !strconcat("f", asmstring, "{l}\t$src")> { 
+  let mayLoad = 1; 
+}
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP80:$dst, (OpNode RFP80:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP80:$dst, (OpNode RFP80:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FI16m  : FPI<0xDE, fp, (outs), (ins i16mem:$src), 
+                  !strconcat("fi", asmstring, "{s}\t$src")> { 
+  let mayLoad = 1; 
+}
+def _FI32m  : FPI<0xDA, fp, (outs), (ins i32mem:$src), 
+                  !strconcat("fi", asmstring, "{l}\t$src")> { 
+  let mayLoad = 1; 
+}
+}
+
+let Defs = [FPSW] in {
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+}
+
+class FPST0rInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8;
+class FPrST0Inst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DC;
+class FPrST0PInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions.  As such,
+// we have to put some 'r's in and take them out of weird places.
+def ADD_FST0r   : FPST0rInst <0xC0, "fadd\t$op">;
+def ADD_FrST0   : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">;
+def ADD_FPrST0  : FPrST0PInst<0xC0, "faddp\t$op">;
+def SUBR_FST0r  : FPST0rInst <0xE8, "fsubr\t$op">;
+def SUB_FrST0   : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">;
+def SUB_FPrST0  : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
+def SUB_FST0r   : FPST0rInst <0xE0, "fsub\t$op">;
+def SUBR_FrST0  : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
+def MUL_FST0r   : FPST0rInst <0xC8, "fmul\t$op">;
+def MUL_FrST0   : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">;
+def MUL_FPrST0  : FPrST0PInst<0xC8, "fmulp\t$op">;
+def DIVR_FST0r  : FPST0rInst <0xF8, "fdivr\t$op">;
+def DIV_FrST0   : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">;
+def DIV_FPrST0  : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
+def DIV_FST0r   : FPST0rInst <0xF0, "fdiv\t$op">;
+def DIVR_FrST0  : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
+
+def COM_FST0r   : FPST0rInst <0xD0, "fcom\t$op">;
+def COMP_FST0r  : FPST0rInst <0xD8, "fcomp\t$op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32  : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+                 [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64  : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+                 [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+                 [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F     : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9;
+}
+
+let Defs = [FPSW] in {
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
+
+let neverHasSideEffects = 1 in {
+def TST_Fp32  : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64  : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80  : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F  : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+} // Defs = [FPSW]
+
+// Versions of FP instructions that take a single memory operand.  Added for the
+//   disassembler; remove as they are included with patterns elsewhere.
+def FCOM32m  : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
+
+def FLDENVm  : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm  : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+
+def FCOM64m  : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+def FRSTORm  : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
+def FSAVEm   : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">;
+def FNSTSWm  : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
+
+def FBLDm    : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">;
+def FBSTPm   : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">;
+
+// Floating point cmovs.
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+
+multiclass FPCMov<PatLeaf cc> {
+  def _Fp32  : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+                       CondMovFP,
+                     [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp64  : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+                       CondMovFP,
+                     [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+                     CondMovFP,
+                     [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+                                        cc, EFLAGS))]>,
+                                        Requires<[HasCMov]>;
+}
+
+let Defs = [FPSW] in {
+let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
+defm CMOVB  : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE  : FPCMov<X86_COND_E>;
+defm CMOVP  : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+} // Uses = [EFLAGS], Constraints = "$src1 = $dst"
+
+let Predicates = [HasCMov] in {
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F  : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVE_F  : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmove\t{$op, %st(0)|ST(0), $op}">, DA;
+def CMOVP_F  : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB;
+} // Predicates = [HasCMov]
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m   : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1 in
+  def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m   : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m   : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+                  [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+                  [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m   : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+                  [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+                  [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+                  [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+def ST_FpP32m    : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32  : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m    : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32  : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64  : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+def ST_FpP80m    : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+                    [(store RFP80:$src, addr:$op)]>;
+let mayStore = 1, neverHasSideEffects = 1 in {
+def IST_Fp16m32  : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32  : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32  : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64  : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64  : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64  : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80  : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80  : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80  : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def LD_F32m   : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
+                    IIC_FLD>;
+def LD_F64m   : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src",
+                    IIC_FLD>;
+def LD_F80m   : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src",
+                    IIC_FLD80>;
+def ILD_F16m  : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src",
+                    IIC_FILD>;
+def ILD_F32m  : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src",
+                    IIC_FILD>;
+def ILD_F64m  : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src",
+                    IIC_FILD>;
+}
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ST_F32m   : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst",
+                    IIC_FST>;
+def ST_F64m   : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst",
+                    IIC_FST>;
+def ST_FP32m  : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst",
+                    IIC_FST>;
+def ST_FP64m  : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst",
+                    IIC_FST>;
+def ST_FP80m  : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst",
+                    IIC_FST80>;
+def IST_F16m  : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst",
+                    IIC_FIST>;
+def IST_F32m  : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst",
+                    IIC_FIST>;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst",
+                    IIC_FIST>;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst",
+                    IIC_FIST>;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
+                    IIC_FIST>;
+}
+
+// FISTTP requires SSE3 even though it's a FPStack op.
+let Predicates = [HasSSE3] in {
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP32:$src, addr:$op)]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP64:$src, addr:$op)]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP80:$src, addr:$op)]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP80:$src, addr:$op)]>;
+} // Predicates = [HasSSE3]
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst",
+  IIC_FST>;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst",
+  IIC_FST>;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), 
+  "fisttp{ll}\t$dst", IIC_FST>;
+}
+
+// FP Stack manipulation instructions.
+let SchedRW = [WriteMove] in {
+def LD_Frr   : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op",
+                   IIC_FLD>, D9;
+def ST_Frr   : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op",
+                   IIC_FST>, DD;
+def ST_FPrr  : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op",
+                   IIC_FST>, DD;
+def XCH_F    : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op",
+                   IIC_FXCH>, D9;
+}
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+                [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+                [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+                [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+                [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+                [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+                [(set RFP80:$dst, fpimm1)]>;
+}
+
+let SchedRW = [WriteZero] in {
+def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz", IIC_FLDZ>, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1", IIC_FIST>, D9;
+}
+
+// Floating point compares.
+let SchedRW = [WriteFAdd] in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
+def UCOM_Fpr80 : FpI_  <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                        [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+} // SchedRW
+} // Defs = [FPSW]
+
+let SchedRW = [WriteFAdd] in {
+// CC = ST(0) cmp ST(i)
+let Defs = [EFLAGS, FPSW] in {
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                  [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>;
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                  [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>;
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                  [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>;
+}
+
+let Defs = [FPSW], Uses = [ST0] in {
+def UCOM_Fr    : FPI<0xE0, AddRegFrm,    // FPSW = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg),
+                    "fucom\t$reg", IIC_FUCOM>, DD;
+def UCOM_FPr   : FPI<0xE8, AddRegFrm,    // FPSW = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg),
+                    "fucomp\t$reg", IIC_FUCOM>, DD;
+def UCOM_FPPr  : FPI<0xE9, RawFrm,       // cmp ST(0) with ST(1), pop, pop
+                    (outs), (ins),
+                    "fucompp", IIC_FUCOM>, DA;
+}
+
+let Defs = [EFLAGS, FPSW], Uses = [ST0] in {
+def UCOM_FIr   : FPI<0xE8, AddRegFrm,     // CC = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg),
+                    "fucomi\t$reg", IIC_FUCOMI>, DB;
+def UCOM_FIPr  : FPI<0xE8, AddRegFrm,     // CC = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg),
+                    "fucompi\t$reg", IIC_FUCOMI>, DF;
+}
+
+let Defs = [EFLAGS, FPSW] in {
+def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+                  "fcomi\t$reg", IIC_FCOMI>, DB;
+def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+                   "fcompi\t$reg", IIC_FCOMI>, DF;
+}
+} // SchedRW
+
+// Floating point flag ops.
+let SchedRW = [WriteALU] in {
+let Defs = [AX], Uses = [FPSW] in
+def FNSTSW16r : I<0xE0, RawFrm,                  // AX = fp flags
+                  (outs), (ins), "fnstsw %ax",
+                  [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF;
+
+def FNSTCW16m : I<0xD9, MRM7m,                   // [mem16] = X87 control world
+                  (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+                  [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
+} // SchedRW
+let mayLoad = 1 in
+def FLDCW16m  : I<0xD9, MRM5m,                   // X87 control world = [mem16]
+                  (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
+                Sched<[WriteLoad]>;
+
+// FPU control instructions
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW] in
+def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", [], IIC_FNINIT>, DB;
+def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
+                "ffree\t$reg", IIC_FFREE>, DD;
+// Clear exceptions
+
+let Defs = [FPSW] in
+def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", [], IIC_FNCLEX>, DB;
+} // SchedRW
+
+// Operandless floating-point instructions for the disassembler.
+let SchedRW = [WriteMicrocoded] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
+
+def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", [], IIC_FNOP>, D9;
+def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", [], IIC_FXAM>, D9;
+def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", [], IIC_FLDL>, D9;
+def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", [], IIC_FLDL>, D9;
+def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", [], IIC_FLDL>, D9;
+def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", [], IIC_FLDL>, D9;
+def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", [], IIC_FLDL>, D9;
+def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", [], IIC_F2XM1>, D9;
+def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", [], IIC_FYL2X>, D9;
+def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", [], IIC_FPTAN>, D9;
+def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", [], IIC_FPATAN>, D9;
+def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", [], IIC_FXTRACT>, D9;
+def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", [], IIC_FPREM1>, D9;
+def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", [], IIC_FPSTP>, D9;
+def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", [], IIC_FPSTP>, D9;
+def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", [], IIC_FPREM>, D9;
+def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", [], IIC_FYL2XP1>, D9;
+def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", [], IIC_FSINCOS>, D9;
+def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", [], IIC_FRNDINT>, D9;
+def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", [], IIC_FSCALE>, D9;
+def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", [], IIC_FCOMPP>, DE;
+
+def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+               "fxsave\t$dst", [], IIC_FXSAVE>, TB;
+def FXSAVE64 : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+                 "fxsaveq\t$dst", [], IIC_FXSAVE>, TB, REX_W,
+                 Requires<[In64BitMode]>;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+                "fxrstor\t$src", [], IIC_FXRSTOR>, TB;
+def FXRSTOR64 : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+                  "fxrstorq\t$src", [], IIC_FXRSTOR>, TB, REX_W,
+                  Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALL which return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, 
+                                                          RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, 
+                                                          RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, 
+                                                          RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+                                                         RFP80:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to conv. i64 to f64 since there isn't a SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+           Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+           Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack.  We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f32 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+           Requires<[FPStackf32]>;
+def : Pat<(f64 (fround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+           Requires<[FPStackf64]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 000000000000..0ef9491eb7fc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,658 @@
+//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+  bits<6> Value = val;
+}
+
+def Pseudo     : Format<0>; def RawFrm     : Format<1>;
+def AddRegFrm  : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg  : Format<5>;
+def MRMSrcMem  : Format<6>;
+def MRM0r  : Format<16>; def MRM1r  : Format<17>; def MRM2r  : Format<18>;
+def MRM3r  : Format<19>; def MRM4r  : Format<20>; def MRM5r  : Format<21>;
+def MRM6r  : Format<22>; def MRM7r  : Format<23>;
+def MRM0m  : Format<24>; def MRM1m  : Format<25>; def MRM2m  : Format<26>;
+def MRM3m  : Format<27>; def MRM4m  : Format<28>; def MRM5m  : Format<29>;
+def MRM6m  : Format<30>; def MRM7m  : Format<31>;
+def MRMInitReg : Format<32>;
+def MRM_C1 : Format<33>;
+def MRM_C2 : Format<34>;
+def MRM_C3 : Format<35>;
+def MRM_C4 : Format<36>;
+def MRM_C8 : Format<37>;
+def MRM_C9 : Format<38>;
+def MRM_E8 : Format<39>;
+def MRM_F0 : Format<40>;
+def MRM_F8 : Format<41>;
+def MRM_F9 : Format<42>;
+def RawFrmImm8 : Format<43>;
+def RawFrmImm16 : Format<44>;
+def MRM_D0 : Format<45>;
+def MRM_D1 : Format<46>;
+def MRM_D4 : Format<47>;
+def MRM_D5 : Format<48>;
+def MRM_D6 : Format<49>;
+def MRM_D8 : Format<50>;
+def MRM_D9 : Format<51>;
+def MRM_DA : Format<52>;
+def MRM_DB : Format<53>;
+def MRM_DC : Format<54>;
+def MRM_DD : Format<55>;
+def MRM_DE : Format<56>;
+def MRM_DF : Format<57>;
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<3> val> {
+  bits<3> Value = val;
+}
+def NoImm      : ImmType<0>;
+def Imm8       : ImmType<1>;
+def Imm8PCRel  : ImmType<2>;
+def Imm16      : ImmType<3>;
+def Imm16PCRel : ImmType<4>;
+def Imm32      : ImmType<5>;
+def Imm32PCRel : ImmType<6>;
+def Imm64      : ImmType<7>;
+
+// FPFormat - This specifies what form this FP instruction has.  This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+  bits<3> Value = val;
+}
+def NotFP      : FPFormat<0>;
+def ZeroArgFP  : FPFormat<1>;
+def OneArgFP   : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP   : FPFormat<4>;
+def CompareFP  : FPFormat<5>;
+def CondMovFP  : FPFormat<6>;
+def SpecialFP  : FPFormat<7>;
+
+// Class specifying the SSE execution domain, used by the SSEDomainFix pass.
+// Keep in sync with tables in X86InstrInfo.cpp.
+class Domain<bits<2> val> {
+  bits<2> Value = val;
+}
+def GenericDomain   : Domain<0>;
+def SSEPackedSingle : Domain<1>;
+def SSEPackedDouble : Domain<2>;
+def SSEPackedInt    : Domain<3>;
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W  { bit hasREX_WPrefix = 1; }
+class LOCK   { bit hasLockPrefix = 1; }
+class SegFS  { bits<2> SegOvrBits = 1; }
+class SegGS  { bits<2> SegOvrBits = 2; }
+class TB     { bits<5> Prefix = 1; }
+class REP    { bits<5> Prefix = 2; }
+class D8     { bits<5> Prefix = 3; }
+class D9     { bits<5> Prefix = 4; }
+class DA     { bits<5> Prefix = 5; }
+class DB     { bits<5> Prefix = 6; }
+class DC     { bits<5> Prefix = 7; }
+class DD     { bits<5> Prefix = 8; }
+class DE     { bits<5> Prefix = 9; }
+class DF     { bits<5> Prefix = 10; }
+class XD     { bits<5> Prefix = 11; }
+class XS     { bits<5> Prefix = 12; }
+class T8     { bits<5> Prefix = 13; }
+class TA     { bits<5> Prefix = 14; }
+class A6     { bits<5> Prefix = 15; }
+class A7     { bits<5> Prefix = 16; }
+class T8XD   { bits<5> Prefix = 17; }
+class T8XS   { bits<5> Prefix = 18; }
+class TAXD   { bits<5> Prefix = 19; }
+class XOP8   { bits<5> Prefix = 20; }
+class XOP9   { bits<5> Prefix = 21; }
+class VEX    { bit hasVEXPrefix = 1; }
+class VEX_W  { bit hasVEX_WPrefix = 1; }
+class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; }
+class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; }
+class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
+class VEX_L  { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
+class Has3DNow0F0FOpcode  { bit has3DNow0F0FOpcode = 1; }
+class MemOp4 { bit hasMemOp4Prefix = 1; }
+class XOP { bit hasXOP_Prefix = 1; }
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+              string AsmStr,
+              InstrItinClass itin,
+              Domain d = GenericDomain>
+  : Instruction {
+  let Namespace = "X86";
+
+  bits<8> Opcode = opcod;
+  Format Form = f;
+  bits<6> FormBits = Form.Value;
+  ImmType ImmT = i;
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  string AsmString = AsmStr;
+
+  // If this is a pseudo instruction, mark it isCodeGenOnly.
+  let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
+
+  let Itinerary = itin;
+
+  //
+  // Attributes specific to X86 instructions...
+  //
+  bit hasOpSizePrefix = 0;  // Does this inst have a 0x66 prefix?
+  bit hasAdSizePrefix = 0;  // Does this inst have a 0x67 prefix?
+
+  bits<5> Prefix = 0;       // Which prefix byte does this inst have?
+  bit hasREX_WPrefix  = 0;  // Does this inst require the REX.W prefix?
+  FPFormat FPForm = NotFP;  // What flavor of FP instruction is this?
+  bit hasLockPrefix = 0;    // Does this inst have a 0xF0 prefix?
+  bits<2> SegOvrBits = 0;   // Segment override prefix.
+  Domain ExeDomain = d;
+  bit hasVEXPrefix = 0;     // Does this inst require a VEX prefix?
+  bit hasVEX_WPrefix = 0;   // Does this inst set the VEX_W field?
+  bit hasVEX_4VPrefix = 0;  // Does this inst require the VEX.VVVV field?
+  bit hasVEX_4VOp3Prefix = 0;  // Does this inst require the VEX.VVVV field to
+                               // encode the third operand?
+  bit hasVEX_i8ImmReg = 0;  // Does this inst require the last source register
+                            // to be encoded in a immediate field?
+  bit hasVEX_L = 0;         // Does this inst use large (256-bit) registers?
+  bit ignoresVEX_L = 0;     // Does this instruction ignore the L-bit
+  bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
+  bit hasMemOp4Prefix = 0;  // Same bit as VEX_W, but used for swapping operands
+  bit hasXOP_Prefix = 0;    // Does this inst require an XOP prefix?
+
+  // TSFlags layout should be kept in sync with X86InstrInfo.h.
+  let TSFlags{5-0}   = FormBits;
+  let TSFlags{6}     = hasOpSizePrefix;
+  let TSFlags{7}     = hasAdSizePrefix;
+  let TSFlags{12-8}  = Prefix;
+  let TSFlags{13}    = hasREX_WPrefix;
+  let TSFlags{16-14} = ImmT.Value;
+  let TSFlags{19-17} = FPForm.Value;
+  let TSFlags{20}    = hasLockPrefix;
+  let TSFlags{22-21} = SegOvrBits;
+  let TSFlags{24-23} = ExeDomain.Value;
+  let TSFlags{32-25} = Opcode;
+  let TSFlags{33}    = hasVEXPrefix;
+  let TSFlags{34}    = hasVEX_WPrefix;
+  let TSFlags{35}    = hasVEX_4VPrefix;
+  let TSFlags{36}    = hasVEX_4VOp3Prefix;
+  let TSFlags{37}    = hasVEX_i8ImmReg;
+  let TSFlags{38}    = hasVEX_L;
+  let TSFlags{39}    = ignoresVEX_L;
+  let TSFlags{40}    = has3DNow0F0FOpcode;
+  let TSFlags{41}    = hasMemOp4Prefix;
+  let TSFlags{42}    = hasXOP_Prefix;
+}
+
+class PseudoI<dag oops, dag iops, list<dag> pattern>
+  : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> {
+  let Pattern = pattern;
+}
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm,
+        list<dag> pattern, InstrItinClass itin = NoItinerary,
+        Domain d = GenericDomain>
+  : X86Inst<o, f, NoImm, outs, ins, asm, itin, d> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary,
+           Domain d = GenericDomain>
+  : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, 
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+           : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          InstrItinClass itin = NoItinerary>
+  : I<o, F, outs, ins, asm, [], itin> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+           InstrItinClass itin = NoItinerary>
+  : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> {
+  let FPForm = fp;
+  let Pattern = pattern;
+}
+
+// Templates for instructions that use a 16- or 32-bit segmented address as
+//  their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+//   Iseg16 - 16-bit segment selector, 16-bit offset
+//   Iseg32 - 16-bit segment selector, 32-bit offset
+
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : X86Inst<o, f, Imm16, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : X86Inst<o, f, Imm32, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+def __xs : XS;
+
+// SI - SSE 1 & 2 scalar instructions
+class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+         list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin> {
+  let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
+            !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+}
+
+// SIi8 - SSE 1 & 2 scalar instructions
+class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin> {
+  let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
+            !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+}
+
+// PI - SSE 1 & 2 packed instructions
+class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+         InstrItinClass itin, Domain d>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
+        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
+}
+
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+            InstrItinClass itin, Domain d>
+      : I<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]);
+}
+
+// PIi8 - SSE 1 & 2 packed instructions with immediate
+class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin, Domain d>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
+  let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
+        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
+}
+
+// SSE1 Instruction Templates:
+// 
+//   SSI   - SSE1 instructions with XS prefix.
+//   PSI   - SSE1 instructions with TB prefix.
+//   PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+//   VSSI  - SSE1 instructions with XS prefix in AVX form.
+//   VPSI  - SSE1 instructions with TB prefix in AVX form.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+        Requires<[UseSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+        Requires<[UseSSE1]>;
+class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+        Requires<[HasAVX]>;
+class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedSingle>, TB,
+        Requires<[HasAVX]>;
+
+// SSE2 Instruction Templates:
+// 
+//   SDI    - SSE2 instructions with XD prefix.
+//   SDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix.
+//   S2SI   - SSE2 instructions with XS prefix.
+//   SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+//   PDI    - SSE2 instructions with TB and OpSize prefixes.
+//   PDIi8  - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+//   VSDI   - SSE2 instructions with XD prefix in AVX form.
+//   VPDI   - SSE2 instructions with TB and OpSize prefixes in AVX form.
+//   MMXSDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+//               MMX operands.
+//   MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+//               MMX operands.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
+class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
+class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+        Requires<[UseSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+        Requires<[UseSSE2]>;
+class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
+        Requires<[HasAVX]>;
+class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
+        Requires<[HasAVX]>;
+class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
+        OpSize, Requires<[HasAVX]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+                list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+// 
+//   S3I   - SSE3 instructions with TB and OpSize prefixes.
+//   S3SI  - SSE3 instructions with XS prefix.
+//   S3DI  - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
+        Requires<[UseSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
+        Requires<[UseSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
+        Requires<[UseSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+// 
+//   SS38I - SSSE3 instructions with T8 prefix.
+//   SS3AI - SSSE3 instructions with TA prefix.
+//   MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+//   MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
+// uses the MMX registers. The 64-bit versions are grouped with the MMX
+// classes. They need to be enabled even if AVX is enabled.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[UseSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+// 
+//   SS48I - SSE 4.1 instructions with T8 prefix.
+//   SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[UseSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[UseSSE41]>;
+
+// SSE4.2 Instruction Templates:
+// 
+//   SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[UseSSE42]>;
+
+//   SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
+
+//   SS42AI = SSE 4.2 instructions with TA prefix
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[UseSSE42]>;
+
+// AVX Instruction Templates:
+//   Instructions introduced in AVX (no SSE equivalent forms)
+//
+//   AVX8I - AVX instructions with T8 and OpSize prefix.
+//   AVXAIi8 - AVX instructions with TA, OpSize prefix and ImmT = Imm8.
+class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+        Requires<[HasAVX]>;
+class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+        Requires<[HasAVX]>;
+
+// AVX2 Instruction Templates:
+//   Instructions introduced in AVX2 (no SSE equivalent forms)
+//
+//   AVX28I - AVX2 instructions with T8 and OpSize prefix.
+//   AVX2AIi8 - AVX2 instructions with TA, OpSize prefix and ImmT = Imm8.
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+        Requires<[HasAVX2]>;
+class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+        Requires<[HasAVX2]>;
+
+// AES Instruction Templates:
+//
+// AES8I
+// These use the same encoding as the SSE4.2 T8 and TA encodings.
+class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+        Requires<[HasAES]>;
+
+class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        Requires<[HasAES]>;
+
+// PCLMUL Instruction Templates
+class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+               list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        OpSize, Requires<[HasPCLMUL]>;
+
+class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+                  list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        OpSize, VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
+
+// FMA3 Instruction Templates
+class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8,
+        OpSize, VEX_4V, FMASC, Requires<[HasFMA]>;
+
+// FMA4 Instruction Templates
+class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, TA,
+        OpSize, VEX_4V, VEX_I8IMM, FMASC, Requires<[HasFMA4]>;
+
+// XOP 2, 3 and 4 Operand Instruction Template
+class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+         XOP, XOP9, Requires<[HasXOP]>;
+
+// XOP 2, 3 and 4 Operand Instruction Templates with imm byte
+class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>,
+         XOP, XOP8, Requires<[HasXOP]>;
+
+//  XOP 5 operand instruction (VEX encoding!)
+class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+        OpSize, VEX_4V, VEX_I8IMM, Requires<[HasXOP]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm,
+         list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii32<o, F, outs, ins, asm, pattern, itin>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm64, outs, ins, asm, itin>, REX_W {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : SDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : PDI<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI   - MMX instructions with TB prefix.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMX2I  - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXID  - MMX instructions with XD prefix.
+// MMXIS  - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, 
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, 
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
new file mode 100644
index 000000000000..2a72fb6f7b2a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -0,0 +1,432 @@
+//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides pattern fragments useful for SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
+def bc_mmx  : PatFrag<(ops node:$in), (x86mmx  (bitconvert node:$in))>;
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+                                            SDTCisFP<0>, SDTCisInt<2> ]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+                                       SDTCisFP<1>, SDTCisVT<3, i8>]>;
+
+def X86umin    : SDNode<"X86ISD::UMIN",      SDTIntBinOp>;
+def X86umax    : SDNode<"X86ISD::UMAX",      SDTIntBinOp>;
+def X86smin    : SDNode<"X86ISD::SMIN",      SDTIntBinOp>;
+def X86smax    : SDNode<"X86ISD::SMAX",      SDTIntBinOp>;
+
+def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
+def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
+
+// Commutative and Associative FMIN and FMAX.
+def X86fminc    : SDNode<"X86ISD::FMINC", SDTFPBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+def X86fmaxc    : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+
+def X86fand    : SDNode<"X86ISD::FAND",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
+def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
+def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
+def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
+def X86fhadd   : SDNode<"X86ISD::FHADD",     SDTFPBinOp>;
+def X86fhsub   : SDNode<"X86ISD::FHSUB",     SDTFPBinOp>;
+def X86hadd    : SDNode<"X86ISD::HADD",      SDTIntBinOp>;
+def X86hsub    : SDNode<"X86ISD::HSUB",      SDTIntBinOp>;
+def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
+def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
+def X86cmpss   : SDNode<"X86ISD::FSETCCss",    SDTX86Cmpss>;
+def X86cmpsd   : SDNode<"X86ISD::FSETCCsd",    SDTX86Cmpsd>;
+def X86pshufb  : SDNode<"X86ISD::PSHUFB",
+                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86andnp   : SDNode<"X86ISD::ANDNP",
+                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86psign   : SDNode<"X86ISD::PSIGN",
+                 SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+def X86pextrb  : SDNode<"X86ISD::PEXTRB",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pextrw  : SDNode<"X86ISD::PEXTRW",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pinsrb  : SDNode<"X86ISD::PINSRB",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw  : SDNode<"X86ISD::PINSRW",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insrtps : SDNode<"X86ISD::INSERTPS",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
+def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
+                 SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+
+def X86vzmovly  : SDNode<"X86ISD::VZEXT_MOVL",
+                 SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                      SDTCisOpSmallerThanOp<1, 0> ]>>;
+
+def X86vsmovl  : SDNode<"X86ISD::VSEXT_MOVL",
+                 SDTypeProfile<1, 1,
+                 [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
+
+def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86vzext   : SDNode<"X86ISD::VZEXT",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>]>>;
+
+def X86vsext   : SDNode<"X86ISD::VSEXT",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>]>>;
+
+def X86vfpext  : SDNode<"X86ISD::VFPEXT",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisFP<0>, SDTCisFP<1>]>>;
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+                        SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                             SDTCisFP<0>, SDTCisFP<1>]>>;
+
+def X86vshldq  : SDNode<"X86ISD::VSHLDQ",    SDTIntShiftOp>;
+def X86vshrdq  : SDNode<"X86ISD::VSRLDQ",    SDTIntShiftOp>;
+def X86cmpp    : SDNode<"X86ISD::CMPP",      SDTX86VFCMP>;
+def X86pcmpeq  : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgt  : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+
+def X86vshl    : SDNode<"X86ISD::VSHL",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisVec<2>]>>;
+def X86vsrl    : SDNode<"X86ISD::VSRL",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisVec<2>]>>;
+def X86vsra    : SDNode<"X86ISD::VSRA",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                      SDTCisVec<2>]>>;
+
+def X86vshli   : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
+def X86vsrli   : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
+def X86vsrai   : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+
+def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+                                          SDTCisVec<1>,
+                                          SDTCisSameAs<2, 1>]>;
+def X86subus   : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
+def X86ptest   : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
+def X86testp   : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+
+def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
+                        SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                      SDTCisSameAs<1,2>]>>;
+
+// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
+// translated into one of the target nodes below during lowering.
+// Note: this is a work in progress...
+def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                SDTCisSameAs<0,2>]>;
+
+def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                 SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                 SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                             SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
+
+def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+                           SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+
+def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
+
+def X86PShufd  : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
+def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
+def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;
+
+def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>;
+
+def X86Movddup  : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>;
+def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>;
+def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>;
+
+def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2Op>;
+def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2Op>;
+
+def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2Op>;
+def X86Movlhpd : SDNode<"X86ISD::MOVLHPD", SDTShuff2Op>;
+def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
+
+def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
+def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
+
+def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
+def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
+
+def X86VPermilp  : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
+def X86VPermv    : SDNode<"X86ISD::VPERMV",   SDTShuff2Op>;
+def X86VPermi    : SDNode<"X86ISD::VPERMI",   SDTShuff2OpI>;
+
+def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
+
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+
+def X86Blendi    : SDNode<"X86ISD::BLENDI",   SDTBlend>;
+def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
+def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
+def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFma>;
+def X86Fnmsub    : SDNode<"X86ISD::FNMSUB",    SDTFma>;
+def X86Fmaddsub  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
+def X86Fmsubadd  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
+
+def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                         SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
+                                         SDTCisVT<4, i8>]>;
+def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+                                         SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
+                                         SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
+                                         SDTCisVT<6, i8>]>;
+
+def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>;
+def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements.  These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+                                   SDNPWantRoot]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
+                                   SDNPWantRoot]>;
+
+def ssmem : Operand<v4f32> {
+  let PrintMethod = "printf32mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+def sdmem : Operand<v2f64> {
+  let PrintMethod = "printf64mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+// 128-bit load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// 256-bit load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
+def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
+def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+
+// 128-/256-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+
+// Like 'store', but always requires 128-bit vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                           (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'store', but always requires 256-bit vector alignment.
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'load', but always requires 128-bit vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'X86vzload', but always requires 128-bit vector alignment.
+def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{
+  return cast<MemSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires 256-bit vector alignment.
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+def alignedloadfsf32 : PatFrag<(ops node:$ptr),
+                               (f32 (alignedload node:$ptr))>;
+def alignedloadfsf64 : PatFrag<(ops node:$ptr),
+                               (f64 (alignedload node:$ptr))>;
+
+// 128-bit aligned load pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def alignedloadv4f32 : PatFrag<(ops node:$ptr),
+                               (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr),
+                               (v2f64 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr),
+                               (v2i64 (alignedload node:$ptr))>;
+
+// 256-bit aligned load pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def alignedloadv8f32 : PatFrag<(ops node:$ptr),
+                               (v8f32 (alignedload256 node:$ptr))>;
+def alignedloadv4f64 : PatFrag<(ops node:$ptr),
+                               (v4f64 (alignedload256 node:$ptr))>;
+def alignedloadv4i64 : PatFrag<(ops node:$ptr),
+                               (v4i64 (alignedload256 node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others.  If the subtarget
+// allows unaligned accesses, match any load, though this may require
+// setting a feature bit in the processor (on startup, for example).
+// Opteron 10h and later implement such a feature.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasVectorUAMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+def memopfsf32 : PatFrag<(ops node:$ptr), (f32   (memop node:$ptr))>;
+def memopfsf64 : PatFrag<(ops node:$ptr), (f64   (memop node:$ptr))>;
+
+// 128-bit memop pattern fragments
+// NOTE: all 128-bit integer vector loads are promoted to v2i64
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+
+// 256-bit memop pattern fragments
+// NOTE: all 256-bit integer vector loads are promoted to v4i64
+def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
+def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
+def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
+
+// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
+// 16-byte boundary.
+// FIXME: 8 byte alignment for mmx reads is not required
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopmmx  : PatFrag<(ops node:$ptr), (x86mmx  (memop64 node:$ptr))>;
+
+// MOVNT Support
+// Like 'store', but requires the non-temporal bit to be set
+def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+                           (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isNonTemporal();
+  return false;
+}]>;
+
+def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+                                    (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isNonTemporal() && !ST->isTruncatingStore() &&
+           ST->getAddressingMode() == ISD::UNINDEXED &&
+           ST->getAlignment() >= 16;
+  return false;
+}]>;
+
+def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
+                                      (st node:$val, node:$ptr), [{
+  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
+    return ST->isNonTemporal() &&
+           ST->getAlignment() < 16;
+  return false;
+}]>;
+
+// 128-bit bitconvert pattern fragments
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+// 256-bit bitconvert pattern fragments
+def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>;
+def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
+def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
+def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
+
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+                           (bitconvert (v2i64 (X86vzmovl
+                             (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+                           (bitconvert (v4i32 (X86vzmovl
+                             (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+                           (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+// BYTE_imm - Transform bit immediates into byte immediates.
+def BYTE_imm  : SDNodeXForm<imm, [{
+  // Transformation function: imm >> 3
+  return getI32Imm(N->getZExtValue() >> 3);
+}]>;
+
+// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128 imm.
+def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{
+  return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N));
+}]>;
+
+// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to
+// VINSERTF128 imm.
+def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{
+  return getI8Imm(X86::getInsertVINSERTF128Immediate(N));
+}]>;
+
+def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index),
+                                   (extract_subvector node:$bigvec,
+                                                      node:$index), [{
+  return X86::isVEXTRACTF128Index(N);
+}], EXTRACT_get_vextractf128_imm>;
+
+def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+                                      node:$index),
+                                 (insert_subvector node:$bigvec, node:$smallvec,
+                                                   node:$index), [{
+  return X86::isVINSERTF128Index(N);
+}], INSERT_get_vinsertf128_imm>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 000000000000..7ba542c87520
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,4898 @@
+//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include <limits>
+
+#define GET_INSTRINFO_CTOR
+#include "X86GenInstrInfo.inc"
+
+using namespace llvm;
+
+static cl::opt<bool>
+NoFusing("disable-spill-fusing",
+         cl::desc("Disable fusing of spill code into instructions"));
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+                  cl::desc("Print instructions that the allocator wants to"
+                           " fuse, but the X86 backend currently can't"),
+                  cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+                 cl::desc("Re-materialize load from stub in PIC mode"),
+                 cl::init(false), cl::Hidden);
+
+enum {
+  // Select which memory operand is being unfolded.
+  // (stored in bits 0 - 3)
+  TB_INDEX_0    = 0,
+  TB_INDEX_1    = 1,
+  TB_INDEX_2    = 2,
+  TB_INDEX_3    = 3,
+  TB_INDEX_MASK = 0xf,
+
+  // Do not insert the reverse map (MemOp -> RegOp) into the table.
+  // This may be needed because there is a many -> one mapping.
+  TB_NO_REVERSE   = 1 << 4,
+
+  // Do not insert the forward map (RegOp -> MemOp) into the table.
+  // This is needed for Native Client, which prohibits branch
+  // instructions from using a memory operand.
+  TB_NO_FORWARD   = 1 << 5,
+
+  TB_FOLDED_LOAD  = 1 << 6,
+  TB_FOLDED_STORE = 1 << 7,
+
+  // Minimum alignment required for load/store.
+  // Used for RegOp->MemOp conversion.
+  // (stored in bits 8 - 15)
+  TB_ALIGN_SHIFT = 8,
+  TB_ALIGN_NONE  =    0 << TB_ALIGN_SHIFT,
+  TB_ALIGN_16    =   16 << TB_ALIGN_SHIFT,
+  TB_ALIGN_32    =   32 << TB_ALIGN_SHIFT,
+  TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT
+};
+
+struct X86OpTblEntry {
+  uint16_t RegOp;
+  uint16_t MemOp;
+  uint16_t Flags;
+};
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+  : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
+                     ? X86::ADJCALLSTACKDOWN64
+                     : X86::ADJCALLSTACKDOWN32),
+                    (tm.getSubtarget<X86Subtarget>().is64Bit()
+                     ? X86::ADJCALLSTACKUP64
+                     : X86::ADJCALLSTACKUP32)),
+    TM(tm), RI(tm, *this) {
+
+  static const X86OpTblEntry OpTbl2Addr[] = {
+    { X86::ADC32ri,     X86::ADC32mi,    0 },
+    { X86::ADC32ri8,    X86::ADC32mi8,   0 },
+    { X86::ADC32rr,     X86::ADC32mr,    0 },
+    { X86::ADC64ri32,   X86::ADC64mi32,  0 },
+    { X86::ADC64ri8,    X86::ADC64mi8,   0 },
+    { X86::ADC64rr,     X86::ADC64mr,    0 },
+    { X86::ADD16ri,     X86::ADD16mi,    0 },
+    { X86::ADD16ri8,    X86::ADD16mi8,   0 },
+    { X86::ADD16ri_DB,  X86::ADD16mi,    TB_NO_REVERSE },
+    { X86::ADD16ri8_DB, X86::ADD16mi8,   TB_NO_REVERSE },
+    { X86::ADD16rr,     X86::ADD16mr,    0 },
+    { X86::ADD16rr_DB,  X86::ADD16mr,    TB_NO_REVERSE },
+    { X86::ADD32ri,     X86::ADD32mi,    0 },
+    { X86::ADD32ri8,    X86::ADD32mi8,   0 },
+    { X86::ADD32ri_DB,  X86::ADD32mi,    TB_NO_REVERSE },
+    { X86::ADD32ri8_DB, X86::ADD32mi8,   TB_NO_REVERSE },
+    { X86::ADD32rr,     X86::ADD32mr,    0 },
+    { X86::ADD32rr_DB,  X86::ADD32mr,    TB_NO_REVERSE },
+    { X86::ADD64ri32,   X86::ADD64mi32,  0 },
+    { X86::ADD64ri8,    X86::ADD64mi8,   0 },
+    { X86::ADD64ri32_DB,X86::ADD64mi32,  TB_NO_REVERSE },
+    { X86::ADD64ri8_DB, X86::ADD64mi8,   TB_NO_REVERSE },
+    { X86::ADD64rr,     X86::ADD64mr,    0 },
+    { X86::ADD64rr_DB,  X86::ADD64mr,    TB_NO_REVERSE },
+    { X86::ADD8ri,      X86::ADD8mi,     0 },
+    { X86::ADD8rr,      X86::ADD8mr,     0 },
+    { X86::AND16ri,     X86::AND16mi,    0 },
+    { X86::AND16ri8,    X86::AND16mi8,   0 },
+    { X86::AND16rr,     X86::AND16mr,    0 },
+    { X86::AND32ri,     X86::AND32mi,    0 },
+    { X86::AND32ri8,    X86::AND32mi8,   0 },
+    { X86::AND32rr,     X86::AND32mr,    0 },
+    { X86::AND64ri32,   X86::AND64mi32,  0 },
+    { X86::AND64ri8,    X86::AND64mi8,   0 },
+    { X86::AND64rr,     X86::AND64mr,    0 },
+    { X86::AND8ri,      X86::AND8mi,     0 },
+    { X86::AND8rr,      X86::AND8mr,     0 },
+    { X86::DEC16r,      X86::DEC16m,     0 },
+    { X86::DEC32r,      X86::DEC32m,     0 },
+    { X86::DEC64_16r,   X86::DEC64_16m,  0 },
+    { X86::DEC64_32r,   X86::DEC64_32m,  0 },
+    { X86::DEC64r,      X86::DEC64m,     0 },
+    { X86::DEC8r,       X86::DEC8m,      0 },
+    { X86::INC16r,      X86::INC16m,     0 },
+    { X86::INC32r,      X86::INC32m,     0 },
+    { X86::INC64_16r,   X86::INC64_16m,  0 },
+    { X86::INC64_32r,   X86::INC64_32m,  0 },
+    { X86::INC64r,      X86::INC64m,     0 },
+    { X86::INC8r,       X86::INC8m,      0 },
+    { X86::NEG16r,      X86::NEG16m,     0 },
+    { X86::NEG32r,      X86::NEG32m,     0 },
+    { X86::NEG64r,      X86::NEG64m,     0 },
+    { X86::NEG8r,       X86::NEG8m,      0 },
+    { X86::NOT16r,      X86::NOT16m,     0 },
+    { X86::NOT32r,      X86::NOT32m,     0 },
+    { X86::NOT64r,      X86::NOT64m,     0 },
+    { X86::NOT8r,       X86::NOT8m,      0 },
+    { X86::OR16ri,      X86::OR16mi,     0 },
+    { X86::OR16ri8,     X86::OR16mi8,    0 },
+    { X86::OR16rr,      X86::OR16mr,     0 },
+    { X86::OR32ri,      X86::OR32mi,     0 },
+    { X86::OR32ri8,     X86::OR32mi8,    0 },
+    { X86::OR32rr,      X86::OR32mr,     0 },
+    { X86::OR64ri32,    X86::OR64mi32,   0 },
+    { X86::OR64ri8,     X86::OR64mi8,    0 },
+    { X86::OR64rr,      X86::OR64mr,     0 },
+    { X86::OR8ri,       X86::OR8mi,      0 },
+    { X86::OR8rr,       X86::OR8mr,      0 },
+    { X86::ROL16r1,     X86::ROL16m1,    0 },
+    { X86::ROL16rCL,    X86::ROL16mCL,   0 },
+    { X86::ROL16ri,     X86::ROL16mi,    0 },
+    { X86::ROL32r1,     X86::ROL32m1,    0 },
+    { X86::ROL32rCL,    X86::ROL32mCL,   0 },
+    { X86::ROL32ri,     X86::ROL32mi,    0 },
+    { X86::ROL64r1,     X86::ROL64m1,    0 },
+    { X86::ROL64rCL,    X86::ROL64mCL,   0 },
+    { X86::ROL64ri,     X86::ROL64mi,    0 },
+    { X86::ROL8r1,      X86::ROL8m1,     0 },
+    { X86::ROL8rCL,     X86::ROL8mCL,    0 },
+    { X86::ROL8ri,      X86::ROL8mi,     0 },
+    { X86::ROR16r1,     X86::ROR16m1,    0 },
+    { X86::ROR16rCL,    X86::ROR16mCL,   0 },
+    { X86::ROR16ri,     X86::ROR16mi,    0 },
+    { X86::ROR32r1,     X86::ROR32m1,    0 },
+    { X86::ROR32rCL,    X86::ROR32mCL,   0 },
+    { X86::ROR32ri,     X86::ROR32mi,    0 },
+    { X86::ROR64r1,     X86::ROR64m1,    0 },
+    { X86::ROR64rCL,    X86::ROR64mCL,   0 },
+    { X86::ROR64ri,     X86::ROR64mi,    0 },
+    { X86::ROR8r1,      X86::ROR8m1,     0 },
+    { X86::ROR8rCL,     X86::ROR8mCL,    0 },
+    { X86::ROR8ri,      X86::ROR8mi,     0 },
+    { X86::SAR16r1,     X86::SAR16m1,    0 },
+    { X86::SAR16rCL,    X86::SAR16mCL,   0 },
+    { X86::SAR16ri,     X86::SAR16mi,    0 },
+    { X86::SAR32r1,     X86::SAR32m1,    0 },
+    { X86::SAR32rCL,    X86::SAR32mCL,   0 },
+    { X86::SAR32ri,     X86::SAR32mi,    0 },
+    { X86::SAR64r1,     X86::SAR64m1,    0 },
+    { X86::SAR64rCL,    X86::SAR64mCL,   0 },
+    { X86::SAR64ri,     X86::SAR64mi,    0 },
+    { X86::SAR8r1,      X86::SAR8m1,     0 },
+    { X86::SAR8rCL,     X86::SAR8mCL,    0 },
+    { X86::SAR8ri,      X86::SAR8mi,     0 },
+    { X86::SBB32ri,     X86::SBB32mi,    0 },
+    { X86::SBB32ri8,    X86::SBB32mi8,   0 },
+    { X86::SBB32rr,     X86::SBB32mr,    0 },
+    { X86::SBB64ri32,   X86::SBB64mi32,  0 },
+    { X86::SBB64ri8,    X86::SBB64mi8,   0 },
+    { X86::SBB64rr,     X86::SBB64mr,    0 },
+    { X86::SHL16rCL,    X86::SHL16mCL,   0 },
+    { X86::SHL16ri,     X86::SHL16mi,    0 },
+    { X86::SHL32rCL,    X86::SHL32mCL,   0 },
+    { X86::SHL32ri,     X86::SHL32mi,    0 },
+    { X86::SHL64rCL,    X86::SHL64mCL,   0 },
+    { X86::SHL64ri,     X86::SHL64mi,    0 },
+    { X86::SHL8rCL,     X86::SHL8mCL,    0 },
+    { X86::SHL8ri,      X86::SHL8mi,     0 },
+    { X86::SHLD16rrCL,  X86::SHLD16mrCL, 0 },
+    { X86::SHLD16rri8,  X86::SHLD16mri8, 0 },
+    { X86::SHLD32rrCL,  X86::SHLD32mrCL, 0 },
+    { X86::SHLD32rri8,  X86::SHLD32mri8, 0 },
+    { X86::SHLD64rrCL,  X86::SHLD64mrCL, 0 },
+    { X86::SHLD64rri8,  X86::SHLD64mri8, 0 },
+    { X86::SHR16r1,     X86::SHR16m1,    0 },
+    { X86::SHR16rCL,    X86::SHR16mCL,   0 },
+    { X86::SHR16ri,     X86::SHR16mi,    0 },
+    { X86::SHR32r1,     X86::SHR32m1,    0 },
+    { X86::SHR32rCL,    X86::SHR32mCL,   0 },
+    { X86::SHR32ri,     X86::SHR32mi,    0 },
+    { X86::SHR64r1,     X86::SHR64m1,    0 },
+    { X86::SHR64rCL,    X86::SHR64mCL,   0 },
+    { X86::SHR64ri,     X86::SHR64mi,    0 },
+    { X86::SHR8r1,      X86::SHR8m1,     0 },
+    { X86::SHR8rCL,     X86::SHR8mCL,    0 },
+    { X86::SHR8ri,      X86::SHR8mi,     0 },
+    { X86::SHRD16rrCL,  X86::SHRD16mrCL, 0 },
+    { X86::SHRD16rri8,  X86::SHRD16mri8, 0 },
+    { X86::SHRD32rrCL,  X86::SHRD32mrCL, 0 },
+    { X86::SHRD32rri8,  X86::SHRD32mri8, 0 },
+    { X86::SHRD64rrCL,  X86::SHRD64mrCL, 0 },
+    { X86::SHRD64rri8,  X86::SHRD64mri8, 0 },
+    { X86::SUB16ri,     X86::SUB16mi,    0 },
+    { X86::SUB16ri8,    X86::SUB16mi8,   0 },
+    { X86::SUB16rr,     X86::SUB16mr,    0 },
+    { X86::SUB32ri,     X86::SUB32mi,    0 },
+    { X86::SUB32ri8,    X86::SUB32mi8,   0 },
+    { X86::SUB32rr,     X86::SUB32mr,    0 },
+    { X86::SUB64ri32,   X86::SUB64mi32,  0 },
+    { X86::SUB64ri8,    X86::SUB64mi8,   0 },
+    { X86::SUB64rr,     X86::SUB64mr,    0 },
+    { X86::SUB8ri,      X86::SUB8mi,     0 },
+    { X86::SUB8rr,      X86::SUB8mr,     0 },
+    { X86::XOR16ri,     X86::XOR16mi,    0 },
+    { X86::XOR16ri8,    X86::XOR16mi8,   0 },
+    { X86::XOR16rr,     X86::XOR16mr,    0 },
+    { X86::XOR32ri,     X86::XOR32mi,    0 },
+    { X86::XOR32ri8,    X86::XOR32mi8,   0 },
+    { X86::XOR32rr,     X86::XOR32mr,    0 },
+    { X86::XOR64ri32,   X86::XOR64mi32,  0 },
+    { X86::XOR64ri8,    X86::XOR64mi8,   0 },
+    { X86::XOR64rr,     X86::XOR64mr,    0 },
+    { X86::XOR8ri,      X86::XOR8mi,     0 },
+    { X86::XOR8rr,      X86::XOR8mr,     0 }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
+    unsigned RegOp = OpTbl2Addr[i].RegOp;
+    unsigned MemOp = OpTbl2Addr[i].MemOp;
+    unsigned Flags = OpTbl2Addr[i].Flags;
+    AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 0, folded load and store, no alignment requirement.
+                  Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+  }
+
+  static const X86OpTblEntry OpTbl0[] = {
+    { X86::BT16ri8,     X86::BT16mi8,       TB_FOLDED_LOAD },
+    { X86::BT32ri8,     X86::BT32mi8,       TB_FOLDED_LOAD },
+    { X86::BT64ri8,     X86::BT64mi8,       TB_FOLDED_LOAD },
+    { X86::CALL32r,     X86::CALL32m,       TB_FOLDED_LOAD },
+    { X86::CALL64r,     X86::CALL64m,       TB_FOLDED_LOAD },
+    { X86::CMP16ri,     X86::CMP16mi,       TB_FOLDED_LOAD },
+    { X86::CMP16ri8,    X86::CMP16mi8,      TB_FOLDED_LOAD },
+    { X86::CMP16rr,     X86::CMP16mr,       TB_FOLDED_LOAD },
+    { X86::CMP32ri,     X86::CMP32mi,       TB_FOLDED_LOAD },
+    { X86::CMP32ri8,    X86::CMP32mi8,      TB_FOLDED_LOAD },
+    { X86::CMP32rr,     X86::CMP32mr,       TB_FOLDED_LOAD },
+    { X86::CMP64ri32,   X86::CMP64mi32,     TB_FOLDED_LOAD },
+    { X86::CMP64ri8,    X86::CMP64mi8,      TB_FOLDED_LOAD },
+    { X86::CMP64rr,     X86::CMP64mr,       TB_FOLDED_LOAD },
+    { X86::CMP8ri,      X86::CMP8mi,        TB_FOLDED_LOAD },
+    { X86::CMP8rr,      X86::CMP8mr,        TB_FOLDED_LOAD },
+    { X86::DIV16r,      X86::DIV16m,        TB_FOLDED_LOAD },
+    { X86::DIV32r,      X86::DIV32m,        TB_FOLDED_LOAD },
+    { X86::DIV64r,      X86::DIV64m,        TB_FOLDED_LOAD },
+    { X86::DIV8r,       X86::DIV8m,         TB_FOLDED_LOAD },
+    { X86::EXTRACTPSrr, X86::EXTRACTPSmr,   TB_FOLDED_STORE },
+    { X86::FsMOVAPDrr,  X86::MOVSDmr,       TB_FOLDED_STORE | TB_NO_REVERSE },
+    { X86::FsMOVAPSrr,  X86::MOVSSmr,       TB_FOLDED_STORE | TB_NO_REVERSE },
+    { X86::IDIV16r,     X86::IDIV16m,       TB_FOLDED_LOAD },
+    { X86::IDIV32r,     X86::IDIV32m,       TB_FOLDED_LOAD },
+    { X86::IDIV64r,     X86::IDIV64m,       TB_FOLDED_LOAD },
+    { X86::IDIV8r,      X86::IDIV8m,        TB_FOLDED_LOAD },
+    { X86::IMUL16r,     X86::IMUL16m,       TB_FOLDED_LOAD },
+    { X86::IMUL32r,     X86::IMUL32m,       TB_FOLDED_LOAD },
+    { X86::IMUL64r,     X86::IMUL64m,       TB_FOLDED_LOAD },
+    { X86::IMUL8r,      X86::IMUL8m,        TB_FOLDED_LOAD },
+    { X86::JMP32r,      X86::JMP32m,        TB_FOLDED_LOAD },
+    { X86::JMP64r,      X86::JMP64m,        TB_FOLDED_LOAD },
+    { X86::MOV16ri,     X86::MOV16mi,       TB_FOLDED_STORE },
+    { X86::MOV16rr,     X86::MOV16mr,       TB_FOLDED_STORE },
+    { X86::MOV32ri,     X86::MOV32mi,       TB_FOLDED_STORE },
+    { X86::MOV32rr,     X86::MOV32mr,       TB_FOLDED_STORE },
+    { X86::MOV64ri32,   X86::MOV64mi32,     TB_FOLDED_STORE },
+    { X86::MOV64rr,     X86::MOV64mr,       TB_FOLDED_STORE },
+    { X86::MOV8ri,      X86::MOV8mi,        TB_FOLDED_STORE },
+    { X86::MOV8rr,      X86::MOV8mr,        TB_FOLDED_STORE },
+    { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+    { X86::MOVAPDrr,    X86::MOVAPDmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVAPSrr,    X86::MOVAPSmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVDQArr,    X86::MOVDQAmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::MOVPDI2DIrr, X86::MOVPDI2DImr,   TB_FOLDED_STORE },
+    { X86::MOVPQIto64rr,X86::MOVPQI2QImr,   TB_FOLDED_STORE },
+    { X86::MOVSDto64rr, X86::MOVSDto64mr,   TB_FOLDED_STORE },
+    { X86::MOVSS2DIrr,  X86::MOVSS2DImr,    TB_FOLDED_STORE },
+    { X86::MOVUPDrr,    X86::MOVUPDmr,      TB_FOLDED_STORE },
+    { X86::MOVUPSrr,    X86::MOVUPSmr,      TB_FOLDED_STORE },
+    { X86::MUL16r,      X86::MUL16m,        TB_FOLDED_LOAD },
+    { X86::MUL32r,      X86::MUL32m,        TB_FOLDED_LOAD },
+    { X86::MUL64r,      X86::MUL64m,        TB_FOLDED_LOAD },
+    { X86::MUL8r,       X86::MUL8m,         TB_FOLDED_LOAD },
+    { X86::SETAEr,      X86::SETAEm,        TB_FOLDED_STORE },
+    { X86::SETAr,       X86::SETAm,         TB_FOLDED_STORE },
+    { X86::SETBEr,      X86::SETBEm,        TB_FOLDED_STORE },
+    { X86::SETBr,       X86::SETBm,         TB_FOLDED_STORE },
+    { X86::SETEr,       X86::SETEm,         TB_FOLDED_STORE },
+    { X86::SETGEr,      X86::SETGEm,        TB_FOLDED_STORE },
+    { X86::SETGr,       X86::SETGm,         TB_FOLDED_STORE },
+    { X86::SETLEr,      X86::SETLEm,        TB_FOLDED_STORE },
+    { X86::SETLr,       X86::SETLm,         TB_FOLDED_STORE },
+    { X86::SETNEr,      X86::SETNEm,        TB_FOLDED_STORE },
+    { X86::SETNOr,      X86::SETNOm,        TB_FOLDED_STORE },
+    { X86::SETNPr,      X86::SETNPm,        TB_FOLDED_STORE },
+    { X86::SETNSr,      X86::SETNSm,        TB_FOLDED_STORE },
+    { X86::SETOr,       X86::SETOm,         TB_FOLDED_STORE },
+    { X86::SETPr,       X86::SETPm,         TB_FOLDED_STORE },
+    { X86::SETSr,       X86::SETSm,         TB_FOLDED_STORE },
+    { X86::TAILJMPr,    X86::TAILJMPm,      TB_FOLDED_LOAD },
+    { X86::TAILJMPr64,  X86::TAILJMPm64,    TB_FOLDED_LOAD },
+    { X86::TEST16ri,    X86::TEST16mi,      TB_FOLDED_LOAD },
+    { X86::TEST32ri,    X86::TEST32mi,      TB_FOLDED_LOAD },
+    { X86::TEST64ri32,  X86::TEST64mi32,    TB_FOLDED_LOAD },
+    { X86::TEST8ri,     X86::TEST8mi,       TB_FOLDED_LOAD },
+    // AVX 128-bit versions of foldable instructions
+    { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr,  TB_FOLDED_STORE  },
+    { X86::FsVMOVAPDrr, X86::VMOVSDmr,      TB_FOLDED_STORE | TB_NO_REVERSE },
+    { X86::FsVMOVAPSrr, X86::VMOVSSmr,      TB_FOLDED_STORE | TB_NO_REVERSE },
+    { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPDrr,   X86::VMOVAPDmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPSrr,   X86::VMOVAPSmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVDQArr,   X86::VMOVDQAmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr,  TB_FOLDED_STORE },
+    { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
+    { X86::VMOVSDto64rr,X86::VMOVSDto64mr,  TB_FOLDED_STORE },
+    { X86::VMOVSS2DIrr, X86::VMOVSS2DImr,   TB_FOLDED_STORE },
+    { X86::VMOVUPDrr,   X86::VMOVUPDmr,     TB_FOLDED_STORE },
+    { X86::VMOVUPSrr,   X86::VMOVUPSmr,     TB_FOLDED_STORE },
+    // AVX 256-bit foldable instructions
+    { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
+    { X86::VMOVAPDYrr,  X86::VMOVAPDYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVAPSYrr,  X86::VMOVAPSYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
+    { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
+    unsigned RegOp      = OpTbl0[i].RegOp;
+    unsigned MemOp      = OpTbl0[i].MemOp;
+    unsigned Flags      = OpTbl0[i].Flags;
+    AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
+                  RegOp, MemOp, TB_INDEX_0 | Flags);
+  }
+
+  static const X86OpTblEntry OpTbl1[] = {
+    { X86::CMP16rr,         X86::CMP16rm,             0 },
+    { X86::CMP32rr,         X86::CMP32rm,             0 },
+    { X86::CMP64rr,         X86::CMP64rm,             0 },
+    { X86::CMP8rr,          X86::CMP8rm,              0 },
+    { X86::CVTSD2SSrr,      X86::CVTSD2SSrm,          0 },
+    { X86::CVTSI2SD64rr,    X86::CVTSI2SD64rm,        0 },
+    { X86::CVTSI2SDrr,      X86::CVTSI2SDrm,          0 },
+    { X86::CVTSI2SS64rr,    X86::CVTSI2SS64rm,        0 },
+    { X86::CVTSI2SSrr,      X86::CVTSI2SSrm,          0 },
+    { X86::CVTSS2SDrr,      X86::CVTSS2SDrm,          0 },
+    { X86::CVTTSD2SI64rr,   X86::CVTTSD2SI64rm,       0 },
+    { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm,         0 },
+    { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm,       0 },
+    { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm,         0 },
+    { X86::FsMOVAPDrr,      X86::MOVSDrm,             TB_NO_REVERSE },
+    { X86::FsMOVAPSrr,      X86::MOVSSrm,             TB_NO_REVERSE },
+    { X86::IMUL16rri,       X86::IMUL16rmi,           0 },
+    { X86::IMUL16rri8,      X86::IMUL16rmi8,          0 },
+    { X86::IMUL32rri,       X86::IMUL32rmi,           0 },
+    { X86::IMUL32rri8,      X86::IMUL32rmi8,          0 },
+    { X86::IMUL64rri32,     X86::IMUL64rmi32,         0 },
+    { X86::IMUL64rri8,      X86::IMUL64rmi8,          0 },
+    { X86::Int_COMISDrr,    X86::Int_COMISDrm,        0 },
+    { X86::Int_COMISSrr,    X86::Int_COMISSrm,        0 },
+    { X86::CVTSD2SI64rr,    X86::CVTSD2SI64rm,        0 },
+    { X86::CVTSD2SIrr,      X86::CVTSD2SIrm,          0 },
+    { X86::CVTSS2SI64rr,    X86::CVTSS2SI64rm,        0 },
+    { X86::CVTSS2SIrr,      X86::CVTSS2SIrm,          0 },
+    { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm,         TB_ALIGN_16 },
+    { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm,         TB_ALIGN_16 },
+    { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm,  0 },
+    { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm,     0 },
+    { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm,  0 },
+    { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm,     0 },
+    { X86::Int_UCOMISDrr,   X86::Int_UCOMISDrm,       0 },
+    { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm,       0 },
+    { X86::MOV16rr,         X86::MOV16rm,             0 },
+    { X86::MOV32rr,         X86::MOV32rm,             0 },
+    { X86::MOV64rr,         X86::MOV64rm,             0 },
+    { X86::MOV64toPQIrr,    X86::MOVQI2PQIrm,         0 },
+    { X86::MOV64toSDrr,     X86::MOV64toSDrm,         0 },
+    { X86::MOV8rr,          X86::MOV8rm,              0 },
+    { X86::MOVAPDrr,        X86::MOVAPDrm,            TB_ALIGN_16 },
+    { X86::MOVAPSrr,        X86::MOVAPSrm,            TB_ALIGN_16 },
+    { X86::MOVDDUPrr,       X86::MOVDDUPrm,           0 },
+    { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm,         0 },
+    { X86::MOVDI2SSrr,      X86::MOVDI2SSrm,          0 },
+    { X86::MOVDQArr,        X86::MOVDQArm,            TB_ALIGN_16 },
+    { X86::MOVSHDUPrr,      X86::MOVSHDUPrm,          TB_ALIGN_16 },
+    { X86::MOVSLDUPrr,      X86::MOVSLDUPrm,          TB_ALIGN_16 },
+    { X86::MOVSX16rr8,      X86::MOVSX16rm8,          0 },
+    { X86::MOVSX32rr16,     X86::MOVSX32rm16,         0 },
+    { X86::MOVSX32rr8,      X86::MOVSX32rm8,          0 },
+    { X86::MOVSX64rr16,     X86::MOVSX64rm16,         0 },
+    { X86::MOVSX64rr32,     X86::MOVSX64rm32,         0 },
+    { X86::MOVSX64rr8,      X86::MOVSX64rm8,          0 },
+    { X86::MOVUPDrr,        X86::MOVUPDrm,            TB_ALIGN_16 },
+    { X86::MOVUPSrr,        X86::MOVUPSrm,            0 },
+    { X86::MOVZDI2PDIrr,    X86::MOVZDI2PDIrm,        0 },
+    { X86::MOVZQI2PQIrr,    X86::MOVZQI2PQIrm,        0 },
+    { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm,     TB_ALIGN_16 },
+    { X86::MOVZX16rr8,      X86::MOVZX16rm8,          0 },
+    { X86::MOVZX32rr16,     X86::MOVZX32rm16,         0 },
+    { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8,   0 },
+    { X86::MOVZX32rr8,      X86::MOVZX32rm8,          0 },
+    { X86::MOVZX64rr16,     X86::MOVZX64rm16,         0 },
+    { X86::MOVZX64rr32,     X86::MOVZX64rm32,         0 },
+    { X86::MOVZX64rr8,      X86::MOVZX64rm8,          0 },
+    { X86::PABSBrr128,      X86::PABSBrm128,          TB_ALIGN_16 },
+    { X86::PABSDrr128,      X86::PABSDrm128,          TB_ALIGN_16 },
+    { X86::PABSWrr128,      X86::PABSWrm128,          TB_ALIGN_16 },
+    { X86::PSHUFDri,        X86::PSHUFDmi,            TB_ALIGN_16 },
+    { X86::PSHUFHWri,       X86::PSHUFHWmi,           TB_ALIGN_16 },
+    { X86::PSHUFLWri,       X86::PSHUFLWmi,           TB_ALIGN_16 },
+    { X86::RCPPSr,          X86::RCPPSm,              TB_ALIGN_16 },
+    { X86::RCPPSr_Int,      X86::RCPPSm_Int,          TB_ALIGN_16 },
+    { X86::RSQRTPSr,        X86::RSQRTPSm,            TB_ALIGN_16 },
+    { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int,        TB_ALIGN_16 },
+    { X86::RSQRTSSr,        X86::RSQRTSSm,            0 },
+    { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int,        0 },
+    { X86::SQRTPDr,         X86::SQRTPDm,             TB_ALIGN_16 },
+    { X86::SQRTPSr,         X86::SQRTPSm,             TB_ALIGN_16 },
+    { X86::SQRTSDr,         X86::SQRTSDm,             0 },
+    { X86::SQRTSDr_Int,     X86::SQRTSDm_Int,         0 },
+    { X86::SQRTSSr,         X86::SQRTSSm,             0 },
+    { X86::SQRTSSr_Int,     X86::SQRTSSm_Int,         0 },
+    { X86::TEST16rr,        X86::TEST16rm,            0 },
+    { X86::TEST32rr,        X86::TEST32rm,            0 },
+    { X86::TEST64rr,        X86::TEST64rm,            0 },
+    { X86::TEST8rr,         X86::TEST8rm,             0 },
+    // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+    { X86::UCOMISDrr,       X86::UCOMISDrm,           0 },
+    { X86::UCOMISSrr,       X86::UCOMISSrm,           0 },
+    // AVX 128-bit versions of foldable instructions
+    { X86::Int_VCOMISDrr,   X86::Int_VCOMISDrm,       0 },
+    { X86::Int_VCOMISSrr,   X86::Int_VCOMISSrm,       0 },
+    { X86::Int_VUCOMISDrr,  X86::Int_VUCOMISDrm,      0 },
+    { X86::Int_VUCOMISSrr,  X86::Int_VUCOMISSrm,      0 },
+    { X86::VCVTTSD2SI64rr,  X86::VCVTTSD2SI64rm,      0 },
+    { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
+    { X86::VCVTTSD2SIrr,    X86::VCVTTSD2SIrm,        0 },
+    { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm,    0 },
+    { X86::VCVTTSS2SI64rr,  X86::VCVTTSS2SI64rm,      0 },
+    { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
+    { X86::VCVTTSS2SIrr,    X86::VCVTTSS2SIrm,        0 },
+    { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm,    0 },
+    { X86::VCVTSD2SI64rr,   X86::VCVTSD2SI64rm,       0 },
+    { X86::VCVTSD2SIrr,     X86::VCVTSD2SIrm,         0 },
+    { X86::VCVTSS2SI64rr,   X86::VCVTSS2SI64rm,       0 },
+    { X86::VCVTSS2SIrr,     X86::VCVTSS2SIrm,         0 },
+    { X86::FsVMOVAPDrr,     X86::VMOVSDrm,            TB_NO_REVERSE },
+    { X86::FsVMOVAPSrr,     X86::VMOVSSrm,            TB_NO_REVERSE },
+    { X86::VMOV64toPQIrr,   X86::VMOVQI2PQIrm,        0 },
+    { X86::VMOV64toSDrr,    X86::VMOV64toSDrm,        0 },
+    { X86::VMOVAPDrr,       X86::VMOVAPDrm,           TB_ALIGN_16 },
+    { X86::VMOVAPSrr,       X86::VMOVAPSrm,           TB_ALIGN_16 },
+    { X86::VMOVDDUPrr,      X86::VMOVDDUPrm,          0 },
+    { X86::VMOVDI2PDIrr,    X86::VMOVDI2PDIrm,        0 },
+    { X86::VMOVDI2SSrr,     X86::VMOVDI2SSrm,         0 },
+    { X86::VMOVDQArr,       X86::VMOVDQArm,           TB_ALIGN_16 },
+    { X86::VMOVSLDUPrr,     X86::VMOVSLDUPrm,         TB_ALIGN_16 },
+    { X86::VMOVSHDUPrr,     X86::VMOVSHDUPrm,         TB_ALIGN_16 },
+    { X86::VMOVUPDrr,       X86::VMOVUPDrm,           0 },
+    { X86::VMOVUPSrr,       X86::VMOVUPSrm,           0 },
+    { X86::VMOVZDI2PDIrr,   X86::VMOVZDI2PDIrm,       0 },
+    { X86::VMOVZQI2PQIrr,   X86::VMOVZQI2PQIrm,       0 },
+    { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm,    TB_ALIGN_16 },
+    { X86::VPABSBrr128,     X86::VPABSBrm128,         0 },
+    { X86::VPABSDrr128,     X86::VPABSDrm128,         0 },
+    { X86::VPABSWrr128,     X86::VPABSWrm128,         0 },
+    { X86::VPERMILPDri,     X86::VPERMILPDmi,         0 },
+    { X86::VPERMILPSri,     X86::VPERMILPSmi,         0 },
+    { X86::VPSHUFDri,       X86::VPSHUFDmi,           0 },
+    { X86::VPSHUFHWri,      X86::VPSHUFHWmi,          0 },
+    { X86::VPSHUFLWri,      X86::VPSHUFLWmi,          0 },
+    { X86::VRCPPSr,         X86::VRCPPSm,             0 },
+    { X86::VRCPPSr_Int,     X86::VRCPPSm_Int,         0 },
+    { X86::VRSQRTPSr,       X86::VRSQRTPSm,           0 },
+    { X86::VRSQRTPSr_Int,   X86::VRSQRTPSm_Int,       0 },
+    { X86::VSQRTPDr,        X86::VSQRTPDm,            0 },
+    { X86::VSQRTPSr,        X86::VSQRTPSm,            0 },
+    { X86::VUCOMISDrr,      X86::VUCOMISDrm,          0 },
+    { X86::VUCOMISSrr,      X86::VUCOMISSrm,          0 },
+    { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
+
+    // AVX 256-bit foldable instructions
+    { X86::VMOVAPDYrr,      X86::VMOVAPDYrm,          TB_ALIGN_32 },
+    { X86::VMOVAPSYrr,      X86::VMOVAPSYrm,          TB_ALIGN_32 },
+    { X86::VMOVDQAYrr,      X86::VMOVDQAYrm,          TB_ALIGN_32 },
+    { X86::VMOVUPDYrr,      X86::VMOVUPDYrm,          0 },
+    { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
+    { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        0 },
+    { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        0 },
+
+    // AVX2 foldable instructions
+    { X86::VPABSBrr256,     X86::VPABSBrm256,         0 },
+    { X86::VPABSDrr256,     X86::VPABSDrm256,         0 },
+    { X86::VPABSWrr256,     X86::VPABSWrm256,         0 },
+    { X86::VPSHUFDYri,      X86::VPSHUFDYmi,          0 },
+    { X86::VPSHUFHWYri,     X86::VPSHUFHWYmi,         0 },
+    { X86::VPSHUFLWYri,     X86::VPSHUFLWYmi,         0 },
+    { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
+    { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
+    { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
+    { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
+    { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
+    { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
+    { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
+
+    // BMI/BMI2/LZCNT/POPCNT foldable instructions
+    { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
+    { X86::BEXTR64rr,       X86::BEXTR64rm,           0 },
+    { X86::BLSI32rr,        X86::BLSI32rm,            0 },
+    { X86::BLSI64rr,        X86::BLSI64rm,            0 },
+    { X86::BLSMSK32rr,      X86::BLSMSK32rm,          0 },
+    { X86::BLSMSK64rr,      X86::BLSMSK64rm,          0 },
+    { X86::BLSR32rr,        X86::BLSR32rm,            0 },
+    { X86::BLSR64rr,        X86::BLSR64rm,            0 },
+    { X86::BZHI32rr,        X86::BZHI32rm,            0 },
+    { X86::BZHI64rr,        X86::BZHI64rm,            0 },
+    { X86::LZCNT16rr,       X86::LZCNT16rm,           0 },
+    { X86::LZCNT32rr,       X86::LZCNT32rm,           0 },
+    { X86::LZCNT64rr,       X86::LZCNT64rm,           0 },
+    { X86::POPCNT16rr,      X86::POPCNT16rm,          0 },
+    { X86::POPCNT32rr,      X86::POPCNT32rm,          0 },
+    { X86::POPCNT64rr,      X86::POPCNT64rm,          0 },
+    { X86::RORX32ri,        X86::RORX32mi,            0 },
+    { X86::RORX64ri,        X86::RORX64mi,            0 },
+    { X86::SARX32rr,        X86::SARX32rm,            0 },
+    { X86::SARX64rr,        X86::SARX64rm,            0 },
+    { X86::SHRX32rr,        X86::SHRX32rm,            0 },
+    { X86::SHRX64rr,        X86::SHRX64rm,            0 },
+    { X86::SHLX32rr,        X86::SHLX32rm,            0 },
+    { X86::SHLX64rr,        X86::SHLX64rm,            0 },
+    { X86::TZCNT16rr,       X86::TZCNT16rm,           0 },
+    { X86::TZCNT32rr,       X86::TZCNT32rm,           0 },
+    { X86::TZCNT64rr,       X86::TZCNT64rm,           0 },
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
+    unsigned RegOp = OpTbl1[i].RegOp;
+    unsigned MemOp = OpTbl1[i].MemOp;
+    unsigned Flags = OpTbl1[i].Flags;
+    AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 1, folded load
+                  Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
+  }
+
+  static const X86OpTblEntry OpTbl2[] = {
+    { X86::ADC32rr,         X86::ADC32rm,       0 },
+    { X86::ADC64rr,         X86::ADC64rm,       0 },
+    { X86::ADD16rr,         X86::ADD16rm,       0 },
+    { X86::ADD16rr_DB,      X86::ADD16rm,       TB_NO_REVERSE },
+    { X86::ADD32rr,         X86::ADD32rm,       0 },
+    { X86::ADD32rr_DB,      X86::ADD32rm,       TB_NO_REVERSE },
+    { X86::ADD64rr,         X86::ADD64rm,       0 },
+    { X86::ADD64rr_DB,      X86::ADD64rm,       TB_NO_REVERSE },
+    { X86::ADD8rr,          X86::ADD8rm,        0 },
+    { X86::ADDPDrr,         X86::ADDPDrm,       TB_ALIGN_16 },
+    { X86::ADDPSrr,         X86::ADDPSrm,       TB_ALIGN_16 },
+    { X86::ADDSDrr,         X86::ADDSDrm,       0 },
+    { X86::ADDSSrr,         X86::ADDSSrm,       0 },
+    { X86::ADDSUBPDrr,      X86::ADDSUBPDrm,    TB_ALIGN_16 },
+    { X86::ADDSUBPSrr,      X86::ADDSUBPSrm,    TB_ALIGN_16 },
+    { X86::AND16rr,         X86::AND16rm,       0 },
+    { X86::AND32rr,         X86::AND32rm,       0 },
+    { X86::AND64rr,         X86::AND64rm,       0 },
+    { X86::AND8rr,          X86::AND8rm,        0 },
+    { X86::ANDNPDrr,        X86::ANDNPDrm,      TB_ALIGN_16 },
+    { X86::ANDNPSrr,        X86::ANDNPSrm,      TB_ALIGN_16 },
+    { X86::ANDPDrr,         X86::ANDPDrm,       TB_ALIGN_16 },
+    { X86::ANDPSrr,         X86::ANDPSrm,       TB_ALIGN_16 },
+    { X86::BLENDPDrri,      X86::BLENDPDrmi,    TB_ALIGN_16 },
+    { X86::BLENDPSrri,      X86::BLENDPSrmi,    TB_ALIGN_16 },
+    { X86::BLENDVPDrr0,     X86::BLENDVPDrm0,   TB_ALIGN_16 },
+    { X86::BLENDVPSrr0,     X86::BLENDVPSrm0,   TB_ALIGN_16 },
+    { X86::CMOVA16rr,       X86::CMOVA16rm,     0 },
+    { X86::CMOVA32rr,       X86::CMOVA32rm,     0 },
+    { X86::CMOVA64rr,       X86::CMOVA64rm,     0 },
+    { X86::CMOVAE16rr,      X86::CMOVAE16rm,    0 },
+    { X86::CMOVAE32rr,      X86::CMOVAE32rm,    0 },
+    { X86::CMOVAE64rr,      X86::CMOVAE64rm,    0 },
+    { X86::CMOVB16rr,       X86::CMOVB16rm,     0 },
+    { X86::CMOVB32rr,       X86::CMOVB32rm,     0 },
+    { X86::CMOVB64rr,       X86::CMOVB64rm,     0 },
+    { X86::CMOVBE16rr,      X86::CMOVBE16rm,    0 },
+    { X86::CMOVBE32rr,      X86::CMOVBE32rm,    0 },
+    { X86::CMOVBE64rr,      X86::CMOVBE64rm,    0 },
+    { X86::CMOVE16rr,       X86::CMOVE16rm,     0 },
+    { X86::CMOVE32rr,       X86::CMOVE32rm,     0 },
+    { X86::CMOVE64rr,       X86::CMOVE64rm,     0 },
+    { X86::CMOVG16rr,       X86::CMOVG16rm,     0 },
+    { X86::CMOVG32rr,       X86::CMOVG32rm,     0 },
+    { X86::CMOVG64rr,       X86::CMOVG64rm,     0 },
+    { X86::CMOVGE16rr,      X86::CMOVGE16rm,    0 },
+    { X86::CMOVGE32rr,      X86::CMOVGE32rm,    0 },
+    { X86::CMOVGE64rr,      X86::CMOVGE64rm,    0 },
+    { X86::CMOVL16rr,       X86::CMOVL16rm,     0 },
+    { X86::CMOVL32rr,       X86::CMOVL32rm,     0 },
+    { X86::CMOVL64rr,       X86::CMOVL64rm,     0 },
+    { X86::CMOVLE16rr,      X86::CMOVLE16rm,    0 },
+    { X86::CMOVLE32rr,      X86::CMOVLE32rm,    0 },
+    { X86::CMOVLE64rr,      X86::CMOVLE64rm,    0 },
+    { X86::CMOVNE16rr,      X86::CMOVNE16rm,    0 },
+    { X86::CMOVNE32rr,      X86::CMOVNE32rm,    0 },
+    { X86::CMOVNE64rr,      X86::CMOVNE64rm,    0 },
+    { X86::CMOVNO16rr,      X86::CMOVNO16rm,    0 },
+    { X86::CMOVNO32rr,      X86::CMOVNO32rm,    0 },
+    { X86::CMOVNO64rr,      X86::CMOVNO64rm,    0 },
+    { X86::CMOVNP16rr,      X86::CMOVNP16rm,    0 },
+    { X86::CMOVNP32rr,      X86::CMOVNP32rm,    0 },
+    { X86::CMOVNP64rr,      X86::CMOVNP64rm,    0 },
+    { X86::CMOVNS16rr,      X86::CMOVNS16rm,    0 },
+    { X86::CMOVNS32rr,      X86::CMOVNS32rm,    0 },
+    { X86::CMOVNS64rr,      X86::CMOVNS64rm,    0 },
+    { X86::CMOVO16rr,       X86::CMOVO16rm,     0 },
+    { X86::CMOVO32rr,       X86::CMOVO32rm,     0 },
+    { X86::CMOVO64rr,       X86::CMOVO64rm,     0 },
+    { X86::CMOVP16rr,       X86::CMOVP16rm,     0 },
+    { X86::CMOVP32rr,       X86::CMOVP32rm,     0 },
+    { X86::CMOVP64rr,       X86::CMOVP64rm,     0 },
+    { X86::CMOVS16rr,       X86::CMOVS16rm,     0 },
+    { X86::CMOVS32rr,       X86::CMOVS32rm,     0 },
+    { X86::CMOVS64rr,       X86::CMOVS64rm,     0 },
+    { X86::CMPPDrri,        X86::CMPPDrmi,      TB_ALIGN_16 },
+    { X86::CMPPSrri,        X86::CMPPSrmi,      TB_ALIGN_16 },
+    { X86::CMPSDrr,         X86::CMPSDrm,       0 },
+    { X86::CMPSSrr,         X86::CMPSSrm,       0 },
+    { X86::DIVPDrr,         X86::DIVPDrm,       TB_ALIGN_16 },
+    { X86::DIVPSrr,         X86::DIVPSrm,       TB_ALIGN_16 },
+    { X86::DIVSDrr,         X86::DIVSDrm,       0 },
+    { X86::DIVSSrr,         X86::DIVSSrm,       0 },
+    { X86::FsANDNPDrr,      X86::FsANDNPDrm,    TB_ALIGN_16 },
+    { X86::FsANDNPSrr,      X86::FsANDNPSrm,    TB_ALIGN_16 },
+    { X86::FsANDPDrr,       X86::FsANDPDrm,     TB_ALIGN_16 },
+    { X86::FsANDPSrr,       X86::FsANDPSrm,     TB_ALIGN_16 },
+    { X86::FsORPDrr,        X86::FsORPDrm,      TB_ALIGN_16 },
+    { X86::FsORPSrr,        X86::FsORPSrm,      TB_ALIGN_16 },
+    { X86::FsXORPDrr,       X86::FsXORPDrm,     TB_ALIGN_16 },
+    { X86::FsXORPSrr,       X86::FsXORPSrm,     TB_ALIGN_16 },
+    { X86::HADDPDrr,        X86::HADDPDrm,      TB_ALIGN_16 },
+    { X86::HADDPSrr,        X86::HADDPSrm,      TB_ALIGN_16 },
+    { X86::HSUBPDrr,        X86::HSUBPDrm,      TB_ALIGN_16 },
+    { X86::HSUBPSrr,        X86::HSUBPSrm,      TB_ALIGN_16 },
+    { X86::IMUL16rr,        X86::IMUL16rm,      0 },
+    { X86::IMUL32rr,        X86::IMUL32rm,      0 },
+    { X86::IMUL64rr,        X86::IMUL64rm,      0 },
+    { X86::Int_CMPSDrr,     X86::Int_CMPSDrm,   0 },
+    { X86::Int_CMPSSrr,     X86::Int_CMPSSrm,   0 },
+    { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm,      0 },
+    { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm,    0 },
+    { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm,      0 },
+    { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm,    0 },
+    { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm,      0 },
+    { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm,      0 },
+    { X86::MAXPDrr,         X86::MAXPDrm,       TB_ALIGN_16 },
+    { X86::MAXPSrr,         X86::MAXPSrm,       TB_ALIGN_16 },
+    { X86::MAXSDrr,         X86::MAXSDrm,       0 },
+    { X86::MAXSSrr,         X86::MAXSSrm,       0 },
+    { X86::MINPDrr,         X86::MINPDrm,       TB_ALIGN_16 },
+    { X86::MINPSrr,         X86::MINPSrm,       TB_ALIGN_16 },
+    { X86::MINSDrr,         X86::MINSDrm,       0 },
+    { X86::MINSSrr,         X86::MINSSrm,       0 },
+    { X86::MPSADBWrri,      X86::MPSADBWrmi,    TB_ALIGN_16 },
+    { X86::MULPDrr,         X86::MULPDrm,       TB_ALIGN_16 },
+    { X86::MULPSrr,         X86::MULPSrm,       TB_ALIGN_16 },
+    { X86::MULSDrr,         X86::MULSDrm,       0 },
+    { X86::MULSSrr,         X86::MULSSrm,       0 },
+    { X86::OR16rr,          X86::OR16rm,        0 },
+    { X86::OR32rr,          X86::OR32rm,        0 },
+    { X86::OR64rr,          X86::OR64rm,        0 },
+    { X86::OR8rr,           X86::OR8rm,         0 },
+    { X86::ORPDrr,          X86::ORPDrm,        TB_ALIGN_16 },
+    { X86::ORPSrr,          X86::ORPSrm,        TB_ALIGN_16 },
+    { X86::PACKSSDWrr,      X86::PACKSSDWrm,    TB_ALIGN_16 },
+    { X86::PACKSSWBrr,      X86::PACKSSWBrm,    TB_ALIGN_16 },
+    { X86::PACKUSDWrr,      X86::PACKUSDWrm,    TB_ALIGN_16 },
+    { X86::PACKUSWBrr,      X86::PACKUSWBrm,    TB_ALIGN_16 },
+    { X86::PADDBrr,         X86::PADDBrm,       TB_ALIGN_16 },
+    { X86::PADDDrr,         X86::PADDDrm,       TB_ALIGN_16 },
+    { X86::PADDQrr,         X86::PADDQrm,       TB_ALIGN_16 },
+    { X86::PADDSBrr,        X86::PADDSBrm,      TB_ALIGN_16 },
+    { X86::PADDSWrr,        X86::PADDSWrm,      TB_ALIGN_16 },
+    { X86::PADDUSBrr,       X86::PADDUSBrm,     TB_ALIGN_16 },
+    { X86::PADDUSWrr,       X86::PADDUSWrm,     TB_ALIGN_16 },
+    { X86::PADDWrr,         X86::PADDWrm,       TB_ALIGN_16 },
+    { X86::PALIGNR128rr,    X86::PALIGNR128rm,  TB_ALIGN_16 },
+    { X86::PANDNrr,         X86::PANDNrm,       TB_ALIGN_16 },
+    { X86::PANDrr,          X86::PANDrm,        TB_ALIGN_16 },
+    { X86::PAVGBrr,         X86::PAVGBrm,       TB_ALIGN_16 },
+    { X86::PAVGWrr,         X86::PAVGWrm,       TB_ALIGN_16 },
+    { X86::PBLENDWrri,      X86::PBLENDWrmi,    TB_ALIGN_16 },
+    { X86::PCMPEQBrr,       X86::PCMPEQBrm,     TB_ALIGN_16 },
+    { X86::PCMPEQDrr,       X86::PCMPEQDrm,     TB_ALIGN_16 },
+    { X86::PCMPEQQrr,       X86::PCMPEQQrm,     TB_ALIGN_16 },
+    { X86::PCMPEQWrr,       X86::PCMPEQWrm,     TB_ALIGN_16 },
+    { X86::PCMPGTBrr,       X86::PCMPGTBrm,     TB_ALIGN_16 },
+    { X86::PCMPGTDrr,       X86::PCMPGTDrm,     TB_ALIGN_16 },
+    { X86::PCMPGTQrr,       X86::PCMPGTQrm,     TB_ALIGN_16 },
+    { X86::PCMPGTWrr,       X86::PCMPGTWrm,     TB_ALIGN_16 },
+    { X86::PHADDDrr,        X86::PHADDDrm,      TB_ALIGN_16 },
+    { X86::PHADDWrr,        X86::PHADDWrm,      TB_ALIGN_16 },
+    { X86::PHADDSWrr128,    X86::PHADDSWrm128,  TB_ALIGN_16 },
+    { X86::PHSUBDrr,        X86::PHSUBDrm,      TB_ALIGN_16 },
+    { X86::PHSUBSWrr128,    X86::PHSUBSWrm128,  TB_ALIGN_16 },
+    { X86::PHSUBWrr,        X86::PHSUBWrm,      TB_ALIGN_16 },
+    { X86::PINSRWrri,       X86::PINSRWrmi,     TB_ALIGN_16 },
+    { X86::PMADDUBSWrr128,  X86::PMADDUBSWrm128, TB_ALIGN_16 },
+    { X86::PMADDWDrr,       X86::PMADDWDrm,     TB_ALIGN_16 },
+    { X86::PMAXSWrr,        X86::PMAXSWrm,      TB_ALIGN_16 },
+    { X86::PMAXUBrr,        X86::PMAXUBrm,      TB_ALIGN_16 },
+    { X86::PMINSWrr,        X86::PMINSWrm,      TB_ALIGN_16 },
+    { X86::PMINUBrr,        X86::PMINUBrm,      TB_ALIGN_16 },
+    { X86::PMINSBrr,        X86::PMINSBrm,      TB_ALIGN_16 },
+    { X86::PMINSDrr,        X86::PMINSDrm,      TB_ALIGN_16 },
+    { X86::PMINUDrr,        X86::PMINUDrm,      TB_ALIGN_16 },
+    { X86::PMINUWrr,        X86::PMINUWrm,      TB_ALIGN_16 },
+    { X86::PMAXSBrr,        X86::PMAXSBrm,      TB_ALIGN_16 },
+    { X86::PMAXSDrr,        X86::PMAXSDrm,      TB_ALIGN_16 },
+    { X86::PMAXUDrr,        X86::PMAXUDrm,      TB_ALIGN_16 },
+    { X86::PMAXUWrr,        X86::PMAXUWrm,      TB_ALIGN_16 },
+    { X86::PMULDQrr,        X86::PMULDQrm,      TB_ALIGN_16 },
+    { X86::PMULHRSWrr128,   X86::PMULHRSWrm128, TB_ALIGN_16 },
+    { X86::PMULHUWrr,       X86::PMULHUWrm,     TB_ALIGN_16 },
+    { X86::PMULHWrr,        X86::PMULHWrm,      TB_ALIGN_16 },
+    { X86::PMULLDrr,        X86::PMULLDrm,      TB_ALIGN_16 },
+    { X86::PMULLWrr,        X86::PMULLWrm,      TB_ALIGN_16 },
+    { X86::PMULUDQrr,       X86::PMULUDQrm,     TB_ALIGN_16 },
+    { X86::PORrr,           X86::PORrm,         TB_ALIGN_16 },
+    { X86::PSADBWrr,        X86::PSADBWrm,      TB_ALIGN_16 },
+    { X86::PSHUFBrr,        X86::PSHUFBrm,      TB_ALIGN_16 },
+    { X86::PSIGNBrr,        X86::PSIGNBrm,      TB_ALIGN_16 },
+    { X86::PSIGNWrr,        X86::PSIGNWrm,      TB_ALIGN_16 },
+    { X86::PSIGNDrr,        X86::PSIGNDrm,      TB_ALIGN_16 },
+    { X86::PSLLDrr,         X86::PSLLDrm,       TB_ALIGN_16 },
+    { X86::PSLLQrr,         X86::PSLLQrm,       TB_ALIGN_16 },
+    { X86::PSLLWrr,         X86::PSLLWrm,       TB_ALIGN_16 },
+    { X86::PSRADrr,         X86::PSRADrm,       TB_ALIGN_16 },
+    { X86::PSRAWrr,         X86::PSRAWrm,       TB_ALIGN_16 },
+    { X86::PSRLDrr,         X86::PSRLDrm,       TB_ALIGN_16 },
+    { X86::PSRLQrr,         X86::PSRLQrm,       TB_ALIGN_16 },
+    { X86::PSRLWrr,         X86::PSRLWrm,       TB_ALIGN_16 },
+    { X86::PSUBBrr,         X86::PSUBBrm,       TB_ALIGN_16 },
+    { X86::PSUBDrr,         X86::PSUBDrm,       TB_ALIGN_16 },
+    { X86::PSUBSBrr,        X86::PSUBSBrm,      TB_ALIGN_16 },
+    { X86::PSUBSWrr,        X86::PSUBSWrm,      TB_ALIGN_16 },
+    { X86::PSUBWrr,         X86::PSUBWrm,       TB_ALIGN_16 },
+    { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm,   TB_ALIGN_16 },
+    { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm,   TB_ALIGN_16 },
+    { X86::PUNPCKHQDQrr,    X86::PUNPCKHQDQrm,  TB_ALIGN_16 },
+    { X86::PUNPCKHWDrr,     X86::PUNPCKHWDrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLBWrr,     X86::PUNPCKLBWrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLDQrr,     X86::PUNPCKLDQrm,   TB_ALIGN_16 },
+    { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm,  TB_ALIGN_16 },
+    { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm,   TB_ALIGN_16 },
+    { X86::PXORrr,          X86::PXORrm,        TB_ALIGN_16 },
+    { X86::SBB32rr,         X86::SBB32rm,       0 },
+    { X86::SBB64rr,         X86::SBB64rm,       0 },
+    { X86::SHUFPDrri,       X86::SHUFPDrmi,     TB_ALIGN_16 },
+    { X86::SHUFPSrri,       X86::SHUFPSrmi,     TB_ALIGN_16 },
+    { X86::SUB16rr,         X86::SUB16rm,       0 },
+    { X86::SUB32rr,         X86::SUB32rm,       0 },
+    { X86::SUB64rr,         X86::SUB64rm,       0 },
+    { X86::SUB8rr,          X86::SUB8rm,        0 },
+    { X86::SUBPDrr,         X86::SUBPDrm,       TB_ALIGN_16 },
+    { X86::SUBPSrr,         X86::SUBPSrm,       TB_ALIGN_16 },
+    { X86::SUBSDrr,         X86::SUBSDrm,       0 },
+    { X86::SUBSSrr,         X86::SUBSSrm,       0 },
+    // FIXME: TEST*rr -> swapped operand of TEST*mr.
+    { X86::UNPCKHPDrr,      X86::UNPCKHPDrm,    TB_ALIGN_16 },
+    { X86::UNPCKHPSrr,      X86::UNPCKHPSrm,    TB_ALIGN_16 },
+    { X86::UNPCKLPDrr,      X86::UNPCKLPDrm,    TB_ALIGN_16 },
+    { X86::UNPCKLPSrr,      X86::UNPCKLPSrm,    TB_ALIGN_16 },
+    { X86::XOR16rr,         X86::XOR16rm,       0 },
+    { X86::XOR32rr,         X86::XOR32rm,       0 },
+    { X86::XOR64rr,         X86::XOR64rm,       0 },
+    { X86::XOR8rr,          X86::XOR8rm,        0 },
+    { X86::XORPDrr,         X86::XORPDrm,       TB_ALIGN_16 },
+    { X86::XORPSrr,         X86::XORPSrm,       TB_ALIGN_16 },
+    // AVX 128-bit versions of foldable instructions
+    { X86::VCVTSD2SSrr,       X86::VCVTSD2SSrm,        0 },
+    { X86::Int_VCVTSD2SSrr,   X86::Int_VCVTSD2SSrm,    0 },
+    { X86::VCVTSI2SD64rr,     X86::VCVTSI2SD64rm,      0 },
+    { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm,  0 },
+    { X86::VCVTSI2SDrr,       X86::VCVTSI2SDrm,        0 },
+    { X86::Int_VCVTSI2SDrr,   X86::Int_VCVTSI2SDrm,    0 },
+    { X86::VCVTSI2SS64rr,     X86::VCVTSI2SS64rm,      0 },
+    { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm,  0 },
+    { X86::VCVTSI2SSrr,       X86::VCVTSI2SSrm,        0 },
+    { X86::Int_VCVTSI2SSrr,   X86::Int_VCVTSI2SSrm,    0 },
+    { X86::VCVTSS2SDrr,       X86::VCVTSS2SDrm,        0 },
+    { X86::Int_VCVTSS2SDrr,   X86::Int_VCVTSS2SDrm,    0 },
+    { X86::VCVTTPD2DQrr,      X86::VCVTTPD2DQXrm,      0 },
+    { X86::VCVTTPS2DQrr,      X86::VCVTTPS2DQrm,       0 },
+    { X86::VRSQRTSSr,         X86::VRSQRTSSm,          0 },
+    { X86::VSQRTSDr,          X86::VSQRTSDm,           0 },
+    { X86::VSQRTSSr,          X86::VSQRTSSm,           0 },
+    { X86::VADDPDrr,          X86::VADDPDrm,           0 },
+    { X86::VADDPSrr,          X86::VADDPSrm,           0 },
+    { X86::VADDSDrr,          X86::VADDSDrm,           0 },
+    { X86::VADDSSrr,          X86::VADDSSrm,           0 },
+    { X86::VADDSUBPDrr,       X86::VADDSUBPDrm,        0 },
+    { X86::VADDSUBPSrr,       X86::VADDSUBPSrm,        0 },
+    { X86::VANDNPDrr,         X86::VANDNPDrm,          0 },
+    { X86::VANDNPSrr,         X86::VANDNPSrm,          0 },
+    { X86::VANDPDrr,          X86::VANDPDrm,           0 },
+    { X86::VANDPSrr,          X86::VANDPSrm,           0 },
+    { X86::VBLENDPDrri,       X86::VBLENDPDrmi,        0 },
+    { X86::VBLENDPSrri,       X86::VBLENDPSrmi,        0 },
+    { X86::VBLENDVPDrr,       X86::VBLENDVPDrm,        0 },
+    { X86::VBLENDVPSrr,       X86::VBLENDVPSrm,        0 },
+    { X86::VCMPPDrri,         X86::VCMPPDrmi,          0 },
+    { X86::VCMPPSrri,         X86::VCMPPSrmi,          0 },
+    { X86::VCMPSDrr,          X86::VCMPSDrm,           0 },
+    { X86::VCMPSSrr,          X86::VCMPSSrm,           0 },
+    { X86::VDIVPDrr,          X86::VDIVPDrm,           0 },
+    { X86::VDIVPSrr,          X86::VDIVPSrm,           0 },
+    { X86::VDIVSDrr,          X86::VDIVSDrm,           0 },
+    { X86::VDIVSSrr,          X86::VDIVSSrm,           0 },
+    { X86::VFsANDNPDrr,       X86::VFsANDNPDrm,        TB_ALIGN_16 },
+    { X86::VFsANDNPSrr,       X86::VFsANDNPSrm,        TB_ALIGN_16 },
+    { X86::VFsANDPDrr,        X86::VFsANDPDrm,         TB_ALIGN_16 },
+    { X86::VFsANDPSrr,        X86::VFsANDPSrm,         TB_ALIGN_16 },
+    { X86::VFsORPDrr,         X86::VFsORPDrm,          TB_ALIGN_16 },
+    { X86::VFsORPSrr,         X86::VFsORPSrm,          TB_ALIGN_16 },
+    { X86::VFsXORPDrr,        X86::VFsXORPDrm,         TB_ALIGN_16 },
+    { X86::VFsXORPSrr,        X86::VFsXORPSrm,         TB_ALIGN_16 },
+    { X86::VHADDPDrr,         X86::VHADDPDrm,          0 },
+    { X86::VHADDPSrr,         X86::VHADDPSrm,          0 },
+    { X86::VHSUBPDrr,         X86::VHSUBPDrm,          0 },
+    { X86::VHSUBPSrr,         X86::VHSUBPSrm,          0 },
+    { X86::Int_VCMPSDrr,      X86::Int_VCMPSDrm,       0 },
+    { X86::Int_VCMPSSrr,      X86::Int_VCMPSSrm,       0 },
+    { X86::VMAXPDrr,          X86::VMAXPDrm,           0 },
+    { X86::VMAXPSrr,          X86::VMAXPSrm,           0 },
+    { X86::VMAXSDrr,          X86::VMAXSDrm,           0 },
+    { X86::VMAXSSrr,          X86::VMAXSSrm,           0 },
+    { X86::VMINPDrr,          X86::VMINPDrm,           0 },
+    { X86::VMINPSrr,          X86::VMINPSrm,           0 },
+    { X86::VMINSDrr,          X86::VMINSDrm,           0 },
+    { X86::VMINSSrr,          X86::VMINSSrm,           0 },
+    { X86::VMPSADBWrri,       X86::VMPSADBWrmi,        0 },
+    { X86::VMULPDrr,          X86::VMULPDrm,           0 },
+    { X86::VMULPSrr,          X86::VMULPSrm,           0 },
+    { X86::VMULSDrr,          X86::VMULSDrm,           0 },
+    { X86::VMULSSrr,          X86::VMULSSrm,           0 },
+    { X86::VORPDrr,           X86::VORPDrm,            0 },
+    { X86::VORPSrr,           X86::VORPSrm,            0 },
+    { X86::VPACKSSDWrr,       X86::VPACKSSDWrm,        0 },
+    { X86::VPACKSSWBrr,       X86::VPACKSSWBrm,        0 },
+    { X86::VPACKUSDWrr,       X86::VPACKUSDWrm,        0 },
+    { X86::VPACKUSWBrr,       X86::VPACKUSWBrm,        0 },
+    { X86::VPADDBrr,          X86::VPADDBrm,           0 },
+    { X86::VPADDDrr,          X86::VPADDDrm,           0 },
+    { X86::VPADDQrr,          X86::VPADDQrm,           0 },
+    { X86::VPADDSBrr,         X86::VPADDSBrm,          0 },
+    { X86::VPADDSWrr,         X86::VPADDSWrm,          0 },
+    { X86::VPADDUSBrr,        X86::VPADDUSBrm,         0 },
+    { X86::VPADDUSWrr,        X86::VPADDUSWrm,         0 },
+    { X86::VPADDWrr,          X86::VPADDWrm,           0 },
+    { X86::VPALIGNR128rr,     X86::VPALIGNR128rm,      0 },
+    { X86::VPANDNrr,          X86::VPANDNrm,           0 },
+    { X86::VPANDrr,           X86::VPANDrm,            0 },
+    { X86::VPAVGBrr,          X86::VPAVGBrm,           0 },
+    { X86::VPAVGWrr,          X86::VPAVGWrm,           0 },
+    { X86::VPBLENDWrri,       X86::VPBLENDWrmi,        0 },
+    { X86::VPCMPEQBrr,        X86::VPCMPEQBrm,         0 },
+    { X86::VPCMPEQDrr,        X86::VPCMPEQDrm,         0 },
+    { X86::VPCMPEQQrr,        X86::VPCMPEQQrm,         0 },
+    { X86::VPCMPEQWrr,        X86::VPCMPEQWrm,         0 },
+    { X86::VPCMPGTBrr,        X86::VPCMPGTBrm,         0 },
+    { X86::VPCMPGTDrr,        X86::VPCMPGTDrm,         0 },
+    { X86::VPCMPGTQrr,        X86::VPCMPGTQrm,         0 },
+    { X86::VPCMPGTWrr,        X86::VPCMPGTWrm,         0 },
+    { X86::VPHADDDrr,         X86::VPHADDDrm,          0 },
+    { X86::VPHADDSWrr128,     X86::VPHADDSWrm128,      0 },
+    { X86::VPHADDWrr,         X86::VPHADDWrm,          0 },
+    { X86::VPHSUBDrr,         X86::VPHSUBDrm,          0 },
+    { X86::VPHSUBSWrr128,     X86::VPHSUBSWrm128,      0 },
+    { X86::VPHSUBWrr,         X86::VPHSUBWrm,          0 },
+    { X86::VPERMILPDrr,       X86::VPERMILPDrm,        0 },
+    { X86::VPERMILPSrr,       X86::VPERMILPSrm,        0 },
+    { X86::VPINSRWrri,        X86::VPINSRWrmi,         0 },
+    { X86::VPMADDUBSWrr128,   X86::VPMADDUBSWrm128,    0 },
+    { X86::VPMADDWDrr,        X86::VPMADDWDrm,         0 },
+    { X86::VPMAXSWrr,         X86::VPMAXSWrm,          0 },
+    { X86::VPMAXUBrr,         X86::VPMAXUBrm,          0 },
+    { X86::VPMINSWrr,         X86::VPMINSWrm,          0 },
+    { X86::VPMINUBrr,         X86::VPMINUBrm,          0 },
+    { X86::VPMINSBrr,         X86::VPMINSBrm,          0 },
+    { X86::VPMINSDrr,         X86::VPMINSDrm,          0 },
+    { X86::VPMINUDrr,         X86::VPMINUDrm,          0 },
+    { X86::VPMINUWrr,         X86::VPMINUWrm,          0 },
+    { X86::VPMAXSBrr,         X86::VPMAXSBrm,          0 },
+    { X86::VPMAXSDrr,         X86::VPMAXSDrm,          0 },
+    { X86::VPMAXUDrr,         X86::VPMAXUDrm,          0 },
+    { X86::VPMAXUWrr,         X86::VPMAXUWrm,          0 },
+    { X86::VPMULDQrr,         X86::VPMULDQrm,          0 },
+    { X86::VPMULHRSWrr128,    X86::VPMULHRSWrm128,     0 },
+    { X86::VPMULHUWrr,        X86::VPMULHUWrm,         0 },
+    { X86::VPMULHWrr,         X86::VPMULHWrm,          0 },
+    { X86::VPMULLDrr,         X86::VPMULLDrm,          0 },
+    { X86::VPMULLWrr,         X86::VPMULLWrm,          0 },
+    { X86::VPMULUDQrr,        X86::VPMULUDQrm,         0 },
+    { X86::VPORrr,            X86::VPORrm,             0 },
+    { X86::VPSADBWrr,         X86::VPSADBWrm,          0 },
+    { X86::VPSHUFBrr,         X86::VPSHUFBrm,          0 },
+    { X86::VPSIGNBrr,         X86::VPSIGNBrm,          0 },
+    { X86::VPSIGNWrr,         X86::VPSIGNWrm,          0 },
+    { X86::VPSIGNDrr,         X86::VPSIGNDrm,          0 },
+    { X86::VPSLLDrr,          X86::VPSLLDrm,           0 },
+    { X86::VPSLLQrr,          X86::VPSLLQrm,           0 },
+    { X86::VPSLLWrr,          X86::VPSLLWrm,           0 },
+    { X86::VPSRADrr,          X86::VPSRADrm,           0 },
+    { X86::VPSRAWrr,          X86::VPSRAWrm,           0 },
+    { X86::VPSRLDrr,          X86::VPSRLDrm,           0 },
+    { X86::VPSRLQrr,          X86::VPSRLQrm,           0 },
+    { X86::VPSRLWrr,          X86::VPSRLWrm,           0 },
+    { X86::VPSUBBrr,          X86::VPSUBBrm,           0 },
+    { X86::VPSUBDrr,          X86::VPSUBDrm,           0 },
+    { X86::VPSUBSBrr,         X86::VPSUBSBrm,          0 },
+    { X86::VPSUBSWrr,         X86::VPSUBSWrm,          0 },
+    { X86::VPSUBWrr,          X86::VPSUBWrm,           0 },
+    { X86::VPUNPCKHBWrr,      X86::VPUNPCKHBWrm,       0 },
+    { X86::VPUNPCKHDQrr,      X86::VPUNPCKHDQrm,       0 },
+    { X86::VPUNPCKHQDQrr,     X86::VPUNPCKHQDQrm,      0 },
+    { X86::VPUNPCKHWDrr,      X86::VPUNPCKHWDrm,       0 },
+    { X86::VPUNPCKLBWrr,      X86::VPUNPCKLBWrm,       0 },
+    { X86::VPUNPCKLDQrr,      X86::VPUNPCKLDQrm,       0 },
+    { X86::VPUNPCKLQDQrr,     X86::VPUNPCKLQDQrm,      0 },
+    { X86::VPUNPCKLWDrr,      X86::VPUNPCKLWDrm,       0 },
+    { X86::VPXORrr,           X86::VPXORrm,            0 },
+    { X86::VSHUFPDrri,        X86::VSHUFPDrmi,         0 },
+    { X86::VSHUFPSrri,        X86::VSHUFPSrmi,         0 },
+    { X86::VSUBPDrr,          X86::VSUBPDrm,           0 },
+    { X86::VSUBPSrr,          X86::VSUBPSrm,           0 },
+    { X86::VSUBSDrr,          X86::VSUBSDrm,           0 },
+    { X86::VSUBSSrr,          X86::VSUBSSrm,           0 },
+    { X86::VUNPCKHPDrr,       X86::VUNPCKHPDrm,        0 },
+    { X86::VUNPCKHPSrr,       X86::VUNPCKHPSrm,        0 },
+    { X86::VUNPCKLPDrr,       X86::VUNPCKLPDrm,        0 },
+    { X86::VUNPCKLPSrr,       X86::VUNPCKLPSrm,        0 },
+    { X86::VXORPDrr,          X86::VXORPDrm,           0 },
+    { X86::VXORPSrr,          X86::VXORPSrm,           0 },
+    // AVX 256-bit foldable instructions
+    { X86::VADDPDYrr,         X86::VADDPDYrm,          0 },
+    { X86::VADDPSYrr,         X86::VADDPSYrm,          0 },
+    { X86::VADDSUBPDYrr,      X86::VADDSUBPDYrm,       0 },
+    { X86::VADDSUBPSYrr,      X86::VADDSUBPSYrm,       0 },
+    { X86::VANDNPDYrr,        X86::VANDNPDYrm,         0 },
+    { X86::VANDNPSYrr,        X86::VANDNPSYrm,         0 },
+    { X86::VANDPDYrr,         X86::VANDPDYrm,          0 },
+    { X86::VANDPSYrr,         X86::VANDPSYrm,          0 },
+    { X86::VBLENDPDYrri,      X86::VBLENDPDYrmi,       0 },
+    { X86::VBLENDPSYrri,      X86::VBLENDPSYrmi,       0 },
+    { X86::VBLENDVPDYrr,      X86::VBLENDVPDYrm,       0 },
+    { X86::VBLENDVPSYrr,      X86::VBLENDVPSYrm,       0 },
+    { X86::VCMPPDYrri,        X86::VCMPPDYrmi,         0 },
+    { X86::VCMPPSYrri,        X86::VCMPPSYrmi,         0 },
+    { X86::VDIVPDYrr,         X86::VDIVPDYrm,          0 },
+    { X86::VDIVPSYrr,         X86::VDIVPSYrm,          0 },
+    { X86::VHADDPDYrr,        X86::VHADDPDYrm,         0 },
+    { X86::VHADDPSYrr,        X86::VHADDPSYrm,         0 },
+    { X86::VHSUBPDYrr,        X86::VHSUBPDYrm,         0 },
+    { X86::VHSUBPSYrr,        X86::VHSUBPSYrm,         0 },
+    { X86::VINSERTF128rr,     X86::VINSERTF128rm,      0 },
+    { X86::VMAXPDYrr,         X86::VMAXPDYrm,          0 },
+    { X86::VMAXPSYrr,         X86::VMAXPSYrm,          0 },
+    { X86::VMINPDYrr,         X86::VMINPDYrm,          0 },
+    { X86::VMINPSYrr,         X86::VMINPSYrm,          0 },
+    { X86::VMULPDYrr,         X86::VMULPDYrm,          0 },
+    { X86::VMULPSYrr,         X86::VMULPSYrm,          0 },
+    { X86::VORPDYrr,          X86::VORPDYrm,           0 },
+    { X86::VORPSYrr,          X86::VORPSYrm,           0 },
+    { X86::VPERM2F128rr,      X86::VPERM2F128rm,       0 },
+    { X86::VPERMILPDYrr,      X86::VPERMILPDYrm,       0 },
+    { X86::VPERMILPSYrr,      X86::VPERMILPSYrm,       0 },
+    { X86::VSHUFPDYrri,       X86::VSHUFPDYrmi,        0 },
+    { X86::VSHUFPSYrri,       X86::VSHUFPSYrmi,        0 },
+    { X86::VSUBPDYrr,         X86::VSUBPDYrm,          0 },
+    { X86::VSUBPSYrr,         X86::VSUBPSYrm,          0 },
+    { X86::VUNPCKHPDYrr,      X86::VUNPCKHPDYrm,       0 },
+    { X86::VUNPCKHPSYrr,      X86::VUNPCKHPSYrm,       0 },
+    { X86::VUNPCKLPDYrr,      X86::VUNPCKLPDYrm,       0 },
+    { X86::VUNPCKLPSYrr,      X86::VUNPCKLPSYrm,       0 },
+    { X86::VXORPDYrr,         X86::VXORPDYrm,          0 },
+    { X86::VXORPSYrr,         X86::VXORPSYrm,          0 },
+    // AVX2 foldable instructions
+    { X86::VINSERTI128rr,     X86::VINSERTI128rm,      0 },
+    { X86::VPACKSSDWYrr,      X86::VPACKSSDWYrm,       0 },
+    { X86::VPACKSSWBYrr,      X86::VPACKSSWBYrm,       0 },
+    { X86::VPACKUSDWYrr,      X86::VPACKUSDWYrm,       0 },
+    { X86::VPACKUSWBYrr,      X86::VPACKUSWBYrm,       0 },
+    { X86::VPADDBYrr,         X86::VPADDBYrm,          0 },
+    { X86::VPADDDYrr,         X86::VPADDDYrm,          0 },
+    { X86::VPADDQYrr,         X86::VPADDQYrm,          0 },
+    { X86::VPADDSBYrr,        X86::VPADDSBYrm,         0 },
+    { X86::VPADDSWYrr,        X86::VPADDSWYrm,         0 },
+    { X86::VPADDUSBYrr,       X86::VPADDUSBYrm,        0 },
+    { X86::VPADDUSWYrr,       X86::VPADDUSWYrm,        0 },
+    { X86::VPADDWYrr,         X86::VPADDWYrm,          0 },
+    { X86::VPALIGNR256rr,     X86::VPALIGNR256rm,      0 },
+    { X86::VPANDNYrr,         X86::VPANDNYrm,          0 },
+    { X86::VPANDYrr,          X86::VPANDYrm,           0 },
+    { X86::VPAVGBYrr,         X86::VPAVGBYrm,          0 },
+    { X86::VPAVGWYrr,         X86::VPAVGWYrm,          0 },
+    { X86::VPBLENDDrri,       X86::VPBLENDDrmi,        0 },
+    { X86::VPBLENDDYrri,      X86::VPBLENDDYrmi,       0 },
+    { X86::VPBLENDWYrri,      X86::VPBLENDWYrmi,       0 },
+    { X86::VPCMPEQBYrr,       X86::VPCMPEQBYrm,        0 },
+    { X86::VPCMPEQDYrr,       X86::VPCMPEQDYrm,        0 },
+    { X86::VPCMPEQQYrr,       X86::VPCMPEQQYrm,        0 },
+    { X86::VPCMPEQWYrr,       X86::VPCMPEQWYrm,        0 },
+    { X86::VPCMPGTBYrr,       X86::VPCMPGTBYrm,        0 },
+    { X86::VPCMPGTDYrr,       X86::VPCMPGTDYrm,        0 },
+    { X86::VPCMPGTQYrr,       X86::VPCMPGTQYrm,        0 },
+    { X86::VPCMPGTWYrr,       X86::VPCMPGTWYrm,        0 },
+    { X86::VPERM2I128rr,      X86::VPERM2I128rm,       0 },
+    { X86::VPERMDYrr,         X86::VPERMDYrm,          0 },
+    { X86::VPERMPDYri,        X86::VPERMPDYmi,         0 },
+    { X86::VPERMPSYrr,        X86::VPERMPSYrm,         0 },
+    { X86::VPERMQYri,         X86::VPERMQYmi,          0 },
+    { X86::VPHADDDYrr,        X86::VPHADDDYrm,         0 },
+    { X86::VPHADDSWrr256,     X86::VPHADDSWrm256,      0 },
+    { X86::VPHADDWYrr,        X86::VPHADDWYrm,         0 },
+    { X86::VPHSUBDYrr,        X86::VPHSUBDYrm,         0 },
+    { X86::VPHSUBSWrr256,     X86::VPHSUBSWrm256,      0 },
+    { X86::VPHSUBWYrr,        X86::VPHSUBWYrm,         0 },
+    { X86::VPMADDUBSWrr256,   X86::VPMADDUBSWrm256,    0 },
+    { X86::VPMADDWDYrr,       X86::VPMADDWDYrm,        0 },
+    { X86::VPMAXSWYrr,        X86::VPMAXSWYrm,         0 },
+    { X86::VPMAXUBYrr,        X86::VPMAXUBYrm,         0 },
+    { X86::VPMINSWYrr,        X86::VPMINSWYrm,         0 },
+    { X86::VPMINUBYrr,        X86::VPMINUBYrm,         0 },
+    { X86::VPMINSBYrr,        X86::VPMINSBYrm,         0 },
+    { X86::VPMINSDYrr,        X86::VPMINSDYrm,         0 },
+    { X86::VPMINUDYrr,        X86::VPMINUDYrm,         0 },
+    { X86::VPMINUWYrr,        X86::VPMINUWYrm,         0 },
+    { X86::VPMAXSBYrr,        X86::VPMAXSBYrm,         0 },
+    { X86::VPMAXSDYrr,        X86::VPMAXSDYrm,         0 },
+    { X86::VPMAXUDYrr,        X86::VPMAXUDYrm,         0 },
+    { X86::VPMAXUWYrr,        X86::VPMAXUWYrm,         0 },
+    { X86::VMPSADBWYrri,      X86::VMPSADBWYrmi,       0 },
+    { X86::VPMULDQYrr,        X86::VPMULDQYrm,         0 },
+    { X86::VPMULHRSWrr256,    X86::VPMULHRSWrm256,     0 },
+    { X86::VPMULHUWYrr,       X86::VPMULHUWYrm,        0 },
+    { X86::VPMULHWYrr,        X86::VPMULHWYrm,         0 },
+    { X86::VPMULLDYrr,        X86::VPMULLDYrm,         0 },
+    { X86::VPMULLWYrr,        X86::VPMULLWYrm,         0 },
+    { X86::VPMULUDQYrr,       X86::VPMULUDQYrm,        0 },
+    { X86::VPORYrr,           X86::VPORYrm,            0 },
+    { X86::VPSADBWYrr,        X86::VPSADBWYrm,         0 },
+    { X86::VPSHUFBYrr,        X86::VPSHUFBYrm,         0 },
+    { X86::VPSIGNBYrr,        X86::VPSIGNBYrm,         0 },
+    { X86::VPSIGNWYrr,        X86::VPSIGNWYrm,         0 },
+    { X86::VPSIGNDYrr,        X86::VPSIGNDYrm,         0 },
+    { X86::VPSLLDYrr,         X86::VPSLLDYrm,          0 },
+    { X86::VPSLLQYrr,         X86::VPSLLQYrm,          0 },
+    { X86::VPSLLWYrr,         X86::VPSLLWYrm,          0 },
+    { X86::VPSLLVDrr,         X86::VPSLLVDrm,          0 },
+    { X86::VPSLLVDYrr,        X86::VPSLLVDYrm,         0 },
+    { X86::VPSLLVQrr,         X86::VPSLLVQrm,          0 },
+    { X86::VPSLLVQYrr,        X86::VPSLLVQYrm,         0 },
+    { X86::VPSRADYrr,         X86::VPSRADYrm,          0 },
+    { X86::VPSRAWYrr,         X86::VPSRAWYrm,          0 },
+    { X86::VPSRAVDrr,         X86::VPSRAVDrm,          0 },
+    { X86::VPSRAVDYrr,        X86::VPSRAVDYrm,         0 },
+    { X86::VPSRLDYrr,         X86::VPSRLDYrm,          0 },
+    { X86::VPSRLQYrr,         X86::VPSRLQYrm,          0 },
+    { X86::VPSRLWYrr,         X86::VPSRLWYrm,          0 },
+    { X86::VPSRLVDrr,         X86::VPSRLVDrm,          0 },
+    { X86::VPSRLVDYrr,        X86::VPSRLVDYrm,         0 },
+    { X86::VPSRLVQrr,         X86::VPSRLVQrm,          0 },
+    { X86::VPSRLVQYrr,        X86::VPSRLVQYrm,         0 },
+    { X86::VPSUBBYrr,         X86::VPSUBBYrm,          0 },
+    { X86::VPSUBDYrr,         X86::VPSUBDYrm,          0 },
+    { X86::VPSUBSBYrr,        X86::VPSUBSBYrm,         0 },
+    { X86::VPSUBSWYrr,        X86::VPSUBSWYrm,         0 },
+    { X86::VPSUBWYrr,         X86::VPSUBWYrm,          0 },
+    { X86::VPUNPCKHBWYrr,     X86::VPUNPCKHBWYrm,      0 },
+    { X86::VPUNPCKHDQYrr,     X86::VPUNPCKHDQYrm,      0 },
+    { X86::VPUNPCKHQDQYrr,    X86::VPUNPCKHQDQYrm,     0 },
+    { X86::VPUNPCKHWDYrr,     X86::VPUNPCKHWDYrm,      0 },
+    { X86::VPUNPCKLBWYrr,     X86::VPUNPCKLBWYrm,      0 },
+    { X86::VPUNPCKLDQYrr,     X86::VPUNPCKLDQYrm,      0 },
+    { X86::VPUNPCKLQDQYrr,    X86::VPUNPCKLQDQYrm,     0 },
+    { X86::VPUNPCKLWDYrr,     X86::VPUNPCKLWDYrm,      0 },
+    { X86::VPXORYrr,          X86::VPXORYrm,           0 },
+    // FIXME: add AVX 256-bit foldable instructions
+
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        0           },
+    { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        0           },
+    { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        TB_ALIGN_16 },
+    { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        TB_ALIGN_16 },
+    { X86::VFMADDPS4rrY,      X86::VFMADDPS4mrY,       TB_ALIGN_32 },
+    { X86::VFMADDPD4rrY,      X86::VFMADDPD4mrY,       TB_ALIGN_32 },
+    { X86::VFNMADDSS4rr,      X86::VFNMADDSS4mr,       0           },
+    { X86::VFNMADDSD4rr,      X86::VFNMADDSD4mr,       0           },
+    { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       TB_ALIGN_16 },
+    { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       TB_ALIGN_16 },
+    { X86::VFNMADDPS4rrY,     X86::VFNMADDPS4mrY,      TB_ALIGN_32 },
+    { X86::VFNMADDPD4rrY,     X86::VFNMADDPD4mrY,      TB_ALIGN_32 },
+    { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        0           },
+    { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        0           },
+    { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        TB_ALIGN_16 },
+    { X86::VFMSUBPS4rrY,      X86::VFMSUBPS4mrY,       TB_ALIGN_32 },
+    { X86::VFMSUBPD4rrY,      X86::VFMSUBPD4mrY,       TB_ALIGN_32 },
+    { X86::VFNMSUBSS4rr,      X86::VFNMSUBSS4mr,       0           },
+    { X86::VFNMSUBSD4rr,      X86::VFNMSUBSD4mr,       0           },
+    { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       TB_ALIGN_16 },
+    { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       TB_ALIGN_16 },
+    { X86::VFNMSUBPS4rrY,     X86::VFNMSUBPS4mrY,      TB_ALIGN_32 },
+    { X86::VFNMSUBPD4rrY,     X86::VFNMSUBPD4mrY,      TB_ALIGN_32 },
+    { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     TB_ALIGN_16 },
+    { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     TB_ALIGN_16 },
+    { X86::VFMADDSUBPS4rrY,   X86::VFMADDSUBPS4mrY,    TB_ALIGN_32 },
+    { X86::VFMADDSUBPD4rrY,   X86::VFMADDSUBPD4mrY,    TB_ALIGN_32 },
+    { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     TB_ALIGN_16 },
+    { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     TB_ALIGN_16 },
+    { X86::VFMSUBADDPS4rrY,   X86::VFMSUBADDPS4mrY,    TB_ALIGN_32 },
+    { X86::VFMSUBADDPD4rrY,   X86::VFMSUBADDPD4mrY,    TB_ALIGN_32 },
+
+    // BMI/BMI2 foldable instructions
+    { X86::ANDN32rr,          X86::ANDN32rm,            0 },
+    { X86::ANDN64rr,          X86::ANDN64rm,            0 },
+    { X86::MULX32rr,          X86::MULX32rm,            0 },
+    { X86::MULX64rr,          X86::MULX64rm,            0 },
+    { X86::PDEP32rr,          X86::PDEP32rm,            0 },
+    { X86::PDEP64rr,          X86::PDEP64rm,            0 },
+    { X86::PEXT32rr,          X86::PEXT32rm,            0 },
+    { X86::PEXT64rr,          X86::PEXT64rm,            0 },
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
+    unsigned RegOp = OpTbl2[i].RegOp;
+    unsigned MemOp = OpTbl2[i].MemOp;
+    unsigned Flags = OpTbl2[i].Flags;
+    AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 2, folded load
+                  Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+  }
+
+  static const X86OpTblEntry OpTbl3[] = {
+    // FMA foldable instructions
+    { X86::VFMADDSSr231r,         X86::VFMADDSSr231m,         0 },
+    { X86::VFMADDSDr231r,         X86::VFMADDSDr231m,         0 },
+    { X86::VFMADDSSr132r,         X86::VFMADDSSr132m,         0 },
+    { X86::VFMADDSDr132r,         X86::VFMADDSDr132m,         0 },
+    { X86::VFMADDSSr213r,         X86::VFMADDSSr213m,         0 },
+    { X86::VFMADDSDr213r,         X86::VFMADDSDr213m,         0 },
+    { X86::VFMADDSSr213r_Int,     X86::VFMADDSSr213m_Int,     0 },
+    { X86::VFMADDSDr213r_Int,     X86::VFMADDSDr213m_Int,     0 },
+
+    { X86::VFMADDPSr231r,         X86::VFMADDPSr231m,         TB_ALIGN_16 },
+    { X86::VFMADDPDr231r,         X86::VFMADDPDr231m,         TB_ALIGN_16 },
+    { X86::VFMADDPSr132r,         X86::VFMADDPSr132m,         TB_ALIGN_16 },
+    { X86::VFMADDPDr132r,         X86::VFMADDPDr132m,         TB_ALIGN_16 },
+    { X86::VFMADDPSr213r,         X86::VFMADDPSr213m,         TB_ALIGN_16 },
+    { X86::VFMADDPDr213r,         X86::VFMADDPDr213m,         TB_ALIGN_16 },
+    { X86::VFMADDPSr231rY,        X86::VFMADDPSr231mY,        TB_ALIGN_32 },
+    { X86::VFMADDPDr231rY,        X86::VFMADDPDr231mY,        TB_ALIGN_32 },
+    { X86::VFMADDPSr132rY,        X86::VFMADDPSr132mY,        TB_ALIGN_32 },
+    { X86::VFMADDPDr132rY,        X86::VFMADDPDr132mY,        TB_ALIGN_32 },
+    { X86::VFMADDPSr213rY,        X86::VFMADDPSr213mY,        TB_ALIGN_32 },
+    { X86::VFMADDPDr213rY,        X86::VFMADDPDr213mY,        TB_ALIGN_32 },
+
+    { X86::VFNMADDSSr231r,        X86::VFNMADDSSr231m,        0 },
+    { X86::VFNMADDSDr231r,        X86::VFNMADDSDr231m,        0 },
+    { X86::VFNMADDSSr132r,        X86::VFNMADDSSr132m,        0 },
+    { X86::VFNMADDSDr132r,        X86::VFNMADDSDr132m,        0 },
+    { X86::VFNMADDSSr213r,        X86::VFNMADDSSr213m,        0 },
+    { X86::VFNMADDSDr213r,        X86::VFNMADDSDr213m,        0 },
+    { X86::VFNMADDSSr213r_Int,    X86::VFNMADDSSr213m_Int,    0 },
+    { X86::VFNMADDSDr213r_Int,    X86::VFNMADDSDr213m_Int,    0 },
+
+    { X86::VFNMADDPSr231r,        X86::VFNMADDPSr231m,        TB_ALIGN_16 },
+    { X86::VFNMADDPDr231r,        X86::VFNMADDPDr231m,        TB_ALIGN_16 },
+    { X86::VFNMADDPSr132r,        X86::VFNMADDPSr132m,        TB_ALIGN_16 },
+    { X86::VFNMADDPDr132r,        X86::VFNMADDPDr132m,        TB_ALIGN_16 },
+    { X86::VFNMADDPSr213r,        X86::VFNMADDPSr213m,        TB_ALIGN_16 },
+    { X86::VFNMADDPDr213r,        X86::VFNMADDPDr213m,        TB_ALIGN_16 },
+    { X86::VFNMADDPSr231rY,       X86::VFNMADDPSr231mY,       TB_ALIGN_32 },
+    { X86::VFNMADDPDr231rY,       X86::VFNMADDPDr231mY,       TB_ALIGN_32 },
+    { X86::VFNMADDPSr132rY,       X86::VFNMADDPSr132mY,       TB_ALIGN_32 },
+    { X86::VFNMADDPDr132rY,       X86::VFNMADDPDr132mY,       TB_ALIGN_32 },
+    { X86::VFNMADDPSr213rY,       X86::VFNMADDPSr213mY,       TB_ALIGN_32 },
+    { X86::VFNMADDPDr213rY,       X86::VFNMADDPDr213mY,       TB_ALIGN_32 },
+
+    { X86::VFMSUBSSr231r,         X86::VFMSUBSSr231m,         0 },
+    { X86::VFMSUBSDr231r,         X86::VFMSUBSDr231m,         0 },
+    { X86::VFMSUBSSr132r,         X86::VFMSUBSSr132m,         0 },
+    { X86::VFMSUBSDr132r,         X86::VFMSUBSDr132m,         0 },
+    { X86::VFMSUBSSr213r,         X86::VFMSUBSSr213m,         0 },
+    { X86::VFMSUBSDr213r,         X86::VFMSUBSDr213m,         0 },
+    { X86::VFMSUBSSr213r_Int,     X86::VFMSUBSSr213m_Int,     0 },
+    { X86::VFMSUBSDr213r_Int,     X86::VFMSUBSDr213m_Int,     0 },
+
+    { X86::VFMSUBPSr231r,         X86::VFMSUBPSr231m,         TB_ALIGN_16 },
+    { X86::VFMSUBPDr231r,         X86::VFMSUBPDr231m,         TB_ALIGN_16 },
+    { X86::VFMSUBPSr132r,         X86::VFMSUBPSr132m,         TB_ALIGN_16 },
+    { X86::VFMSUBPDr132r,         X86::VFMSUBPDr132m,         TB_ALIGN_16 },
+    { X86::VFMSUBPSr213r,         X86::VFMSUBPSr213m,         TB_ALIGN_16 },
+    { X86::VFMSUBPDr213r,         X86::VFMSUBPDr213m,         TB_ALIGN_16 },
+    { X86::VFMSUBPSr231rY,        X86::VFMSUBPSr231mY,        TB_ALIGN_32 },
+    { X86::VFMSUBPDr231rY,        X86::VFMSUBPDr231mY,        TB_ALIGN_32 },
+    { X86::VFMSUBPSr132rY,        X86::VFMSUBPSr132mY,        TB_ALIGN_32 },
+    { X86::VFMSUBPDr132rY,        X86::VFMSUBPDr132mY,        TB_ALIGN_32 },
+    { X86::VFMSUBPSr213rY,        X86::VFMSUBPSr213mY,        TB_ALIGN_32 },
+    { X86::VFMSUBPDr213rY,        X86::VFMSUBPDr213mY,        TB_ALIGN_32 },
+
+    { X86::VFNMSUBSSr231r,        X86::VFNMSUBSSr231m,        0 },
+    { X86::VFNMSUBSDr231r,        X86::VFNMSUBSDr231m,        0 },
+    { X86::VFNMSUBSSr132r,        X86::VFNMSUBSSr132m,        0 },
+    { X86::VFNMSUBSDr132r,        X86::VFNMSUBSDr132m,        0 },
+    { X86::VFNMSUBSSr213r,        X86::VFNMSUBSSr213m,        0 },
+    { X86::VFNMSUBSDr213r,        X86::VFNMSUBSDr213m,        0 },
+    { X86::VFNMSUBSSr213r_Int,    X86::VFNMSUBSSr213m_Int,    0 },
+    { X86::VFNMSUBSDr213r_Int,    X86::VFNMSUBSDr213m_Int,    0 },
+
+    { X86::VFNMSUBPSr231r,        X86::VFNMSUBPSr231m,        TB_ALIGN_16 },
+    { X86::VFNMSUBPDr231r,        X86::VFNMSUBPDr231m,        TB_ALIGN_16 },
+    { X86::VFNMSUBPSr132r,        X86::VFNMSUBPSr132m,        TB_ALIGN_16 },
+    { X86::VFNMSUBPDr132r,        X86::VFNMSUBPDr132m,        TB_ALIGN_16 },
+    { X86::VFNMSUBPSr213r,        X86::VFNMSUBPSr213m,        TB_ALIGN_16 },
+    { X86::VFNMSUBPDr213r,        X86::VFNMSUBPDr213m,        TB_ALIGN_16 },
+    { X86::VFNMSUBPSr231rY,       X86::VFNMSUBPSr231mY,       TB_ALIGN_32 },
+    { X86::VFNMSUBPDr231rY,       X86::VFNMSUBPDr231mY,       TB_ALIGN_32 },
+    { X86::VFNMSUBPSr132rY,       X86::VFNMSUBPSr132mY,       TB_ALIGN_32 },
+    { X86::VFNMSUBPDr132rY,       X86::VFNMSUBPDr132mY,       TB_ALIGN_32 },
+    { X86::VFNMSUBPSr213rY,       X86::VFNMSUBPSr213mY,       TB_ALIGN_32 },
+    { X86::VFNMSUBPDr213rY,       X86::VFNMSUBPDr213mY,       TB_ALIGN_32 },
+
+    { X86::VFMADDSUBPSr231r,      X86::VFMADDSUBPSr231m,      TB_ALIGN_16 },
+    { X86::VFMADDSUBPDr231r,      X86::VFMADDSUBPDr231m,      TB_ALIGN_16 },
+    { X86::VFMADDSUBPSr132r,      X86::VFMADDSUBPSr132m,      TB_ALIGN_16 },
+    { X86::VFMADDSUBPDr132r,      X86::VFMADDSUBPDr132m,      TB_ALIGN_16 },
+    { X86::VFMADDSUBPSr213r,      X86::VFMADDSUBPSr213m,      TB_ALIGN_16 },
+    { X86::VFMADDSUBPDr213r,      X86::VFMADDSUBPDr213m,      TB_ALIGN_16 },
+    { X86::VFMADDSUBPSr231rY,     X86::VFMADDSUBPSr231mY,     TB_ALIGN_32 },
+    { X86::VFMADDSUBPDr231rY,     X86::VFMADDSUBPDr231mY,     TB_ALIGN_32 },
+    { X86::VFMADDSUBPSr132rY,     X86::VFMADDSUBPSr132mY,     TB_ALIGN_32 },
+    { X86::VFMADDSUBPDr132rY,     X86::VFMADDSUBPDr132mY,     TB_ALIGN_32 },
+    { X86::VFMADDSUBPSr213rY,     X86::VFMADDSUBPSr213mY,     TB_ALIGN_32 },
+    { X86::VFMADDSUBPDr213rY,     X86::VFMADDSUBPDr213mY,     TB_ALIGN_32 },
+
+    { X86::VFMSUBADDPSr231r,      X86::VFMSUBADDPSr231m,      TB_ALIGN_16 },
+    { X86::VFMSUBADDPDr231r,      X86::VFMSUBADDPDr231m,      TB_ALIGN_16 },
+    { X86::VFMSUBADDPSr132r,      X86::VFMSUBADDPSr132m,      TB_ALIGN_16 },
+    { X86::VFMSUBADDPDr132r,      X86::VFMSUBADDPDr132m,      TB_ALIGN_16 },
+    { X86::VFMSUBADDPSr213r,      X86::VFMSUBADDPSr213m,      TB_ALIGN_16 },
+    { X86::VFMSUBADDPDr213r,      X86::VFMSUBADDPDr213m,      TB_ALIGN_16 },
+    { X86::VFMSUBADDPSr231rY,     X86::VFMSUBADDPSr231mY,     TB_ALIGN_32 },
+    { X86::VFMSUBADDPDr231rY,     X86::VFMSUBADDPDr231mY,     TB_ALIGN_32 },
+    { X86::VFMSUBADDPSr132rY,     X86::VFMSUBADDPSr132mY,     TB_ALIGN_32 },
+    { X86::VFMSUBADDPDr132rY,     X86::VFMSUBADDPDr132mY,     TB_ALIGN_32 },
+    { X86::VFMSUBADDPSr213rY,     X86::VFMSUBADDPSr213mY,     TB_ALIGN_32 },
+    { X86::VFMSUBADDPDr213rY,     X86::VFMSUBADDPDr213mY,     TB_ALIGN_32 },
+
+    // FMA4 foldable patterns
+    { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           0           },
+    { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           0           },
+    { X86::VFMADDPS4rr,           X86::VFMADDPS4rm,           TB_ALIGN_16 },
+    { X86::VFMADDPD4rr,           X86::VFMADDPD4rm,           TB_ALIGN_16 },
+    { X86::VFMADDPS4rrY,          X86::VFMADDPS4rmY,          TB_ALIGN_32 },
+    { X86::VFMADDPD4rrY,          X86::VFMADDPD4rmY,          TB_ALIGN_32 },
+    { X86::VFNMADDSS4rr,          X86::VFNMADDSS4rm,          0           },
+    { X86::VFNMADDSD4rr,          X86::VFNMADDSD4rm,          0           },
+    { X86::VFNMADDPS4rr,          X86::VFNMADDPS4rm,          TB_ALIGN_16 },
+    { X86::VFNMADDPD4rr,          X86::VFNMADDPD4rm,          TB_ALIGN_16 },
+    { X86::VFNMADDPS4rrY,         X86::VFNMADDPS4rmY,         TB_ALIGN_32 },
+    { X86::VFNMADDPD4rrY,         X86::VFNMADDPD4rmY,         TB_ALIGN_32 },
+    { X86::VFMSUBSS4rr,           X86::VFMSUBSS4rm,           0           },
+    { X86::VFMSUBSD4rr,           X86::VFMSUBSD4rm,           0           },
+    { X86::VFMSUBPS4rr,           X86::VFMSUBPS4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBPD4rr,           X86::VFMSUBPD4rm,           TB_ALIGN_16 },
+    { X86::VFMSUBPS4rrY,          X86::VFMSUBPS4rmY,          TB_ALIGN_32 },
+    { X86::VFMSUBPD4rrY,          X86::VFMSUBPD4rmY,          TB_ALIGN_32 },
+    { X86::VFNMSUBSS4rr,          X86::VFNMSUBSS4rm,          0           },
+    { X86::VFNMSUBSD4rr,          X86::VFNMSUBSD4rm,          0           },
+    { X86::VFNMSUBPS4rr,          X86::VFNMSUBPS4rm,          TB_ALIGN_16 },
+    { X86::VFNMSUBPD4rr,          X86::VFNMSUBPD4rm,          TB_ALIGN_16 },
+    { X86::VFNMSUBPS4rrY,         X86::VFNMSUBPS4rmY,         TB_ALIGN_32 },
+    { X86::VFNMSUBPD4rrY,         X86::VFNMSUBPD4rmY,         TB_ALIGN_32 },
+    { X86::VFMADDSUBPS4rr,        X86::VFMADDSUBPS4rm,        TB_ALIGN_16 },
+    { X86::VFMADDSUBPD4rr,        X86::VFMADDSUBPD4rm,        TB_ALIGN_16 },
+    { X86::VFMADDSUBPS4rrY,       X86::VFMADDSUBPS4rmY,       TB_ALIGN_32 },
+    { X86::VFMADDSUBPD4rrY,       X86::VFMADDSUBPD4rmY,       TB_ALIGN_32 },
+    { X86::VFMSUBADDPS4rr,        X86::VFMSUBADDPS4rm,        TB_ALIGN_16 },
+    { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_16 },
+    { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_32 },
+    { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_32 },
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
+    unsigned RegOp = OpTbl3[i].RegOp;
+    unsigned MemOp = OpTbl3[i].MemOp;
+    unsigned Flags = OpTbl3[i].Flags;
+    AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+                  RegOp, MemOp,
+                  // Index 3, folded load
+                  Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
+  }
+
+}
+
+void
+X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            unsigned RegOp, unsigned MemOp, unsigned Flags) {
+    if ((Flags & TB_NO_FORWARD) == 0) {
+      assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+      R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+    }
+    if ((Flags & TB_NO_REVERSE) == 0) {
+      assert(!M2RTable.count(MemOp) &&
+           "Duplicated entries in unfolding maps?");
+      M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+    }
+}
+
+bool
+X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                    unsigned &SrcReg, unsigned &DstReg,
+                                    unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default: break;
+  case X86::MOVSX16rr8:
+  case X86::MOVZX16rr8:
+  case X86::MOVSX32rr8:
+  case X86::MOVZX32rr8:
+  case X86::MOVSX64rr8:
+  case X86::MOVZX64rr8:
+    if (!TM.getSubtarget<X86Subtarget>().is64Bit())
+      // It's not always legal to reference the low 8-bit of the larger
+      // register in 32-bit mode.
+      return false;
+  case X86::MOVSX32rr16:
+  case X86::MOVZX32rr16:
+  case X86::MOVSX64rr16:
+  case X86::MOVZX64rr16:
+  case X86::MOVSX64rr32:
+  case X86::MOVZX64rr32: {
+    if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
+      // Be conservative.
+      return false;
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::MOVSX16rr8:
+    case X86::MOVZX16rr8:
+    case X86::MOVSX32rr8:
+    case X86::MOVZX32rr8:
+    case X86::MOVSX64rr8:
+    case X86::MOVZX64rr8:
+      SubIdx = X86::sub_8bit;
+      break;
+    case X86::MOVSX32rr16:
+    case X86::MOVZX32rr16:
+    case X86::MOVSX64rr16:
+    case X86::MOVZX64rr16:
+      SubIdx = X86::sub_16bit;
+      break;
+    case X86::MOVSX64rr32:
+    case X86::MOVZX64rr32:
+      SubIdx = X86::sub_32bit;
+      break;
+    }
+    return true;
+  }
+  }
+  return false;
+}
+
+/// isFrameOperand - Return true and the FrameIndex if the specified
+/// operand and follow operands form a reference to the stack frame.
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
+                                  int &FrameIndex) const {
+  if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() &&
+      MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() &&
+      MI->getOperand(Op+1).getImm() == 1 &&
+      MI->getOperand(Op+2).getReg() == 0 &&
+      MI->getOperand(Op+3).getImm() == 0) {
+    FrameIndex = MI->getOperand(Op).getIndex();
+    return true;
+  }
+  return false;
+}
+
+static bool isFrameLoadOpcode(int Opcode) {
+  switch (Opcode) {
+  default:
+    return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    return true;
+  }
+}
+
+static bool isFrameStoreOpcode(int Opcode) {
+  switch (Opcode) {
+  default: break;
+  case X86::MOV8mr:
+  case X86::MOV16mr:
+  case X86::MOV32mr:
+  case X86::MOV64mr:
+  case X86::ST_FpP64m:
+  case X86::MOVSSmr:
+  case X86::MOVSDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDmr:
+  case X86::MOVDQAmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVAPSmr:
+  case X86::VMOVAPDmr:
+  case X86::VMOVDQAmr:
+  case X86::VMOVAPSYmr:
+  case X86::VMOVAPDYmr:
+  case X86::VMOVDQAYmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+    return true;
+  }
+  return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                           int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI->getOpcode()))
+    if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
+      return MI->getOperand(0).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                                 int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI->getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                          int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI->getOpcode()))
+    if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
+        isFrameOperand(MI, 0, FrameIndex))
+      return MI->getOperand(X86::AddrNumOperands).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                                int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI->getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasStoreToStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+/// regIsPICBase - Return true if register is PIC base (i.e.g defined by
+/// X86::MOVPC32r.
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+  // Don't waste compile time scanning use-def chains of physregs.
+  if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+    return false;
+  bool isPICBase = false;
+  for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+         E = MRI.def_end(); I != E; ++I) {
+    MachineInstr *DefMI = I.getOperand().getParent();
+    if (DefMI->getOpcode() != X86::MOVPC32r)
+      return false;
+    assert(!isPICBase && "More than one PIC base?");
+    isPICBase = true;
+  }
+  return isPICBase;
+}
+
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+                                                AliasAnalysis *AA) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm: {
+    // Loads from constant pools are trivially rematerializable.
+    if (MI->getOperand(1).isReg() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+        MI->isInvariantLoad(AA)) {
+      unsigned BaseReg = MI->getOperand(1).getReg();
+      if (BaseReg == 0 || BaseReg == X86::RIP)
+        return true;
+      // Allow re-materialization of PIC load.
+      if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+        return false;
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
+    }
+    return false;
+  }
+
+  case X86::LEA32r:
+  case X86::LEA64r: {
+    if (MI->getOperand(2).isImm() &&
+        MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+        !MI->getOperand(4).isReg()) {
+      // lea fi#, lea GV, etc. are all rematerializable.
+      if (!MI->getOperand(1).isReg())
+        return true;
+      unsigned BaseReg = MI->getOperand(1).getReg();
+      if (BaseReg == 0)
+        return true;
+      // Allow re-materialization of lea PICBase + x.
+      const MachineFunction &MF = *MI->getParent()->getParent();
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      return regIsPICBase(BaseReg, MRI);
+    }
+    return false;
+  }
+  }
+
+  // All other instructions marked M_REMATERIALIZABLE are always trivially
+  // rematerializable.
+  return true;
+}
+
+/// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction that
+/// would clobber the EFLAGS condition register. Note the result may be
+/// conservative. If it cannot definitely determine the safety after visiting
+/// a few instructions in each direction it assumes it's not safe.
+static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator E = MBB.end();
+
+  // For compile time consideration, if we are not able to determine the
+  // safety after visiting 4 instructions in each direction, we will assume
+  // it's not safe.
+  MachineBasicBlock::iterator Iter = I;
+  for (unsigned i = 0; Iter != E && i < 4; ++i) {
+    bool SeenDef = false;
+    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+      MachineOperand &MO = Iter->getOperand(j);
+      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+        SeenDef = true;
+      if (!MO.isReg())
+        continue;
+      if (MO.getReg() == X86::EFLAGS) {
+        if (MO.isUse())
+          return false;
+        SeenDef = true;
+      }
+    }
+
+    if (SeenDef)
+      // This instruction defines EFLAGS, no need to look any further.
+      return true;
+    ++Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != E && Iter->isDebugValue())
+      ++Iter;
+  }
+
+  // It is safe to clobber EFLAGS at the end of a block of no successor has it
+  // live in.
+  if (Iter == E) {
+    for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+           SE = MBB.succ_end(); SI != SE; ++SI)
+      if ((*SI)->isLiveIn(X86::EFLAGS))
+        return false;
+    return true;
+  }
+
+  MachineBasicBlock::iterator B = MBB.begin();
+  Iter = I;
+  for (unsigned i = 0; i < 4; ++i) {
+    // If we make it to the beginning of the block, it's safe to clobber
+    // EFLAGS iff EFLAGS is not live-in.
+    if (Iter == B)
+      return !MBB.isLiveIn(X86::EFLAGS);
+
+    --Iter;
+    // Skip over DBG_VALUE.
+    while (Iter != B && Iter->isDebugValue())
+      --Iter;
+
+    bool SawKill = false;
+    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
+      MachineOperand &MO = Iter->getOperand(j);
+      // A register mask may clobber EFLAGS, but we should still look for a
+      // live EFLAGS def.
+      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
+        SawKill = true;
+      if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
+        if (MO.isDef()) return MO.isDead();
+        if (MO.isKill()) SawKill = true;
+      }
+    }
+
+    if (SawKill)
+      // This instruction kills EFLAGS and doesn't redefine it, so
+      // there's no need to look further.
+      return true;
+  }
+
+  // Conservative answer.
+  return false;
+}
+
+void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I,
+                                 unsigned DestReg, unsigned SubIdx,
+                                 const MachineInstr *Orig,
+                                 const TargetRegisterInfo &TRI) const {
+  DebugLoc DL = Orig->getDebugLoc();
+
+  // MOV32r0 etc. are implemented with xor which clobbers condition code.
+  // Re-materialize them as movri instructions to avoid side effects.
+  bool Clone = true;
+  unsigned Opc = Orig->getOpcode();
+  switch (Opc) {
+  default: break;
+  case X86::MOV8r0:
+  case X86::MOV16r0:
+  case X86::MOV32r0:
+  case X86::MOV64r0: {
+    if (!isSafeToClobberEFLAGS(MBB, I)) {
+      switch (Opc) {
+      default: llvm_unreachable("Unreachable!");
+      case X86::MOV8r0:  Opc = X86::MOV8ri;  break;
+      case X86::MOV16r0: Opc = X86::MOV16ri; break;
+      case X86::MOV32r0: Opc = X86::MOV32ri; break;
+      case X86::MOV64r0: Opc = X86::MOV64ri64i32; break;
+      }
+      Clone = false;
+    }
+    break;
+  }
+  }
+
+  if (Clone) {
+    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+    MBB.insert(I, MI);
+  } else {
+    BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0);
+  }
+
+  MachineInstr *NewMI = prior(I);
+  NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+}
+
+/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
+/// is not marked dead.
+static bool hasLiveCondCodeDef(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef() &&
+        MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
+/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
+/// to a 32-bit superregister and then truncating back down to a 16-bit
+/// subregister.
+MachineInstr *
+X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
+                                           MachineFunction::iterator &MFI,
+                                           MachineBasicBlock::iterator &MBBI,
+                                           LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Src = MI->getOperand(1).getReg();
+  bool isDead = MI->getOperand(0).isDead();
+  bool isKill = MI->getOperand(1).isKill();
+
+  unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
+    ? X86::LEA64_32r : X86::LEA32r;
+  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
+  unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+
+  // Build and insert into an implicit UNDEF value. This is OK because
+  // well be shifting and then extracting the lower 16-bits.
+  // This has the potential to cause partial register stall. e.g.
+  //   movw    (%rbp,%rcx,2), %dx
+  //   leal    -65(%rdx), %esi
+  // But testing has shown this *does* help performance in 64-bit mode (at
+  // least on modern x86 machines).
+  BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
+  MachineInstr *InsMI =
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+    .addReg(leaInReg, RegState::Define, X86::sub_16bit)
+    .addReg(Src, getKillRegState(isKill));
+
+  MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
+                                    get(Opc), leaOutReg);
+  switch (MIOpc) {
+  default: llvm_unreachable("Unreachable!");
+  case X86::SHL16ri: {
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    MIB.addReg(0).addImm(1 << ShAmt)
+       .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
+    break;
+  }
+  case X86::INC16r:
+  case X86::INC64_16r:
+    addRegOffset(MIB, leaInReg, true, 1);
+    break;
+  case X86::DEC16r:
+  case X86::DEC64_16r:
+    addRegOffset(MIB, leaInReg, true, -1);
+    break;
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri_DB:
+  case X86::ADD16ri8_DB:
+    addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB: {
+    unsigned Src2 = MI->getOperand(2).getReg();
+    bool isKill2 = MI->getOperand(2).isKill();
+    unsigned leaInReg2 = 0;
+    MachineInstr *InsMI2 = 0;
+    if (Src == Src2) {
+      // ADD16rr %reg1028<kill>, %reg1028
+      // just a single insert_subreg.
+      addRegReg(MIB, leaInReg, true, leaInReg, false);
+    } else {
+      leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+      // Build and insert into an implicit UNDEF value. This is OK because
+      // well be shifting and then extracting the lower 16-bits.
+      BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
+      InsMI2 =
+        BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
+        .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
+        .addReg(Src2, getKillRegState(isKill2));
+      addRegReg(MIB, leaInReg, true, leaInReg2, true);
+    }
+    if (LV && isKill2 && InsMI2)
+      LV->replaceKillInstruction(Src2, MI, InsMI2);
+    break;
+  }
+  }
+
+  MachineInstr *NewMI = MIB;
+  MachineInstr *ExtMI =
+    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
+    .addReg(Dest, RegState::Define | getDeadRegState(isDead))
+    .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
+
+  if (LV) {
+    // Update live variables
+    LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
+    LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
+    if (isKill)
+      LV->replaceKillInstruction(Src, MI, InsMI);
+    if (isDead)
+      LV->replaceKillInstruction(Dest, MI, ExtMI);
+  }
+
+  return ExtMI;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand.  This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  // All instructions input are two-addr instructions.  Get the known operands.
+  const MachineOperand &Dest = MI->getOperand(0);
+  const MachineOperand &Src = MI->getOperand(1);
+
+  MachineInstr *NewMI = NULL;
+  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
+  // we have better subtarget support, enable the 16-bit LEA generation here.
+  // 16-bit LEA is also slow on Core2.
+  bool DisableLEA16 = true;
+  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();
+
+  unsigned MIOpc = MI->getOpcode();
+  switch (MIOpc) {
+  case X86::SHUFPSrri: {
+    assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    if (B != C) return 0;
+    unsigned M = MI->getOperand(3).getImm();
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+      .addOperand(Dest).addOperand(Src).addImm(M);
+    break;
+  }
+  case X86::SHUFPDrri: {
+    assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
+    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    if (B != C) return 0;
+    unsigned M = MI->getOperand(3).getImm();
+
+    // Convert to PSHUFD mask.
+    M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44;
+
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
+      .addOperand(Dest).addOperand(Src).addImm(M);
+    break;
+  }
+  case X86::SHL64ri: {
+    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+    // LEA can't handle RSP.
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR64_NOSPRegClass))
+      return 0;
+
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+    break;
+  }
+  case X86::SHL32ri: {
+    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+    // LEA can't handle ESP.
+    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+        !MF.getRegInfo().constrainRegClass(Src.getReg(),
+                                           &X86::GR32_NOSPRegClass))
+      return 0;
+
+    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+    break;
+  }
+  case X86::SHL16ri: {
+    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+    // the flags produced by a shift yet, so this is safe.
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+    if (DisableLEA16)
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+      .addOperand(Dest)
+      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+    break;
+  }
+  default: {
+    // The following opcodes also sets the condition code register(s). Only
+    // convert them to equivalent lea if the condition code register def's
+    // are dead!
+    if (hasLiveCondCodeDef(MI))
+      return 0;
+
+    switch (MIOpc) {
+    default: return 0;
+    case X86::INC64r:
+    case X86::INC32r:
+    case X86::INC64_32r: {
+      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+      unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
+        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+      const TargetRegisterClass *RC = MIOpc == X86::INC64r ?
+        (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+        (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
+
+      // LEA can't handle RSP.
+      if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+          !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+        return 0;
+
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src), 1);
+      break;
+    }
+    case X86::INC16r:
+    case X86::INC64_16r:
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src), 1);
+      break;
+    case X86::DEC64r:
+    case X86::DEC32r:
+    case X86::DEC64_32r: {
+      assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+      unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
+        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
+      const TargetRegisterClass *RC = MIOpc == X86::DEC64r ?
+        (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
+        (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
+      // LEA can't handle RSP.
+      if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+          !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+        return 0;
+
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src), -1);
+      break;
+    }
+    case X86::DEC16r:
+    case X86::DEC64_16r:
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+      assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src), -1);
+      break;
+    case X86::ADD64rr:
+    case X86::ADD64rr_DB:
+    case X86::ADD32rr:
+    case X86::ADD32rr_DB: {
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      unsigned Opc;
+      const TargetRegisterClass *RC;
+      if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) {
+        Opc = X86::LEA64r;
+        RC = &X86::GR64_NOSPRegClass;
+      } else {
+        Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+        RC = &X86::GR32_NOSPRegClass;
+      }
+
+
+      unsigned Src2 = MI->getOperand(2).getReg();
+      bool isKill2 = MI->getOperand(2).isKill();
+
+      // LEA can't handle RSP.
+      if (TargetRegisterInfo::isVirtualRegister(Src2) &&
+          !MF.getRegInfo().constrainRegClass(Src2, RC))
+        return 0;
+
+      NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest),
+                        Src.getReg(), Src.isKill(), Src2, isKill2);
+
+      // Preserve undefness of the operands.
+      bool isUndef = MI->getOperand(1).isUndef();
+      bool isUndef2 = MI->getOperand(2).isUndef();
+      NewMI->getOperand(1).setIsUndef(isUndef);
+      NewMI->getOperand(3).setIsUndef(isUndef2);
+
+      if (LV && isKill2)
+        LV->replaceKillInstruction(Src2, MI, NewMI);
+      break;
+    }
+    case X86::ADD16rr:
+    case X86::ADD16rr_DB: {
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      unsigned Src2 = MI->getOperand(2).getReg();
+      bool isKill2 = MI->getOperand(2).isKill();
+      NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest),
+                        Src.getReg(), Src.isKill(), Src2, isKill2);
+
+      // Preserve undefness of the operands.
+      bool isUndef = MI->getOperand(1).isUndef();
+      bool isUndef2 = MI->getOperand(2).isUndef();
+      NewMI->getOperand(1).setIsUndef(isUndef);
+      NewMI->getOperand(3).setIsUndef(isUndef2);
+
+      if (LV && isKill2)
+        LV->replaceKillInstruction(Src2, MI, NewMI);
+      break;
+    }
+    case X86::ADD64ri32:
+    case X86::ADD64ri8:
+    case X86::ADD64ri32_DB:
+    case X86::ADD64ri8_DB:
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
+      break;
+    case X86::ADD32ri:
+    case X86::ADD32ri8:
+    case X86::ADD32ri_DB:
+    case X86::ADD32ri8_DB: {
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
+      break;
+    }
+    case X86::ADD16ri:
+    case X86::ADD16ri8:
+    case X86::ADD16ri_DB:
+    case X86::ADD16ri8_DB:
+      if (DisableLEA16)
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
+      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+                        .addOperand(Dest).addOperand(Src),
+                        MI->getOperand(2).getImm());
+      break;
+    }
+  }
+  }
+
+  if (!NewMI) return 0;
+
+  if (LV) {  // Update live variables
+    if (Src.isKill())
+      LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
+    if (Dest.isDead())
+      LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
+  }
+
+  MFI->insert(MBBI, NewMI);          // Insert the new inst
+  return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
+///
+MachineInstr *
+X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
+  switch (MI->getOpcode()) {
+  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
+  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
+    unsigned Opc;
+    unsigned Size;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+    case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
+    case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
+    }
+    unsigned Amt = MI->getOperand(3).getImm();
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
+    }
+    MI->setDesc(get(Opc));
+    MI->getOperand(3).setImm(Size-Amt);
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
+  case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
+  case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
+  case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
+  case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+  case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
+  case X86::CMOVA16rr:  case X86::CMOVA32rr:  case X86::CMOVA64rr:
+  case X86::CMOVL16rr:  case X86::CMOVL32rr:  case X86::CMOVL64rr:
+  case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+  case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+  case X86::CMOVG16rr:  case X86::CMOVG32rr:  case X86::CMOVG64rr:
+  case X86::CMOVS16rr:  case X86::CMOVS32rr:  case X86::CMOVS64rr:
+  case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+  case X86::CMOVP16rr:  case X86::CMOVP32rr:  case X86::CMOVP64rr:
+  case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+  case X86::CMOVO16rr:  case X86::CMOVO32rr:  case X86::CMOVO64rr:
+  case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::CMOVB16rr:  Opc = X86::CMOVAE16rr; break;
+    case X86::CMOVB32rr:  Opc = X86::CMOVAE32rr; break;
+    case X86::CMOVB64rr:  Opc = X86::CMOVAE64rr; break;
+    case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
+    case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
+    case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
+    case X86::CMOVE16rr:  Opc = X86::CMOVNE16rr; break;
+    case X86::CMOVE32rr:  Opc = X86::CMOVNE32rr; break;
+    case X86::CMOVE64rr:  Opc = X86::CMOVNE64rr; break;
+    case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
+    case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
+    case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
+    case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
+    case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
+    case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
+    case X86::CMOVA16rr:  Opc = X86::CMOVBE16rr; break;
+    case X86::CMOVA32rr:  Opc = X86::CMOVBE32rr; break;
+    case X86::CMOVA64rr:  Opc = X86::CMOVBE64rr; break;
+    case X86::CMOVL16rr:  Opc = X86::CMOVGE16rr; break;
+    case X86::CMOVL32rr:  Opc = X86::CMOVGE32rr; break;
+    case X86::CMOVL64rr:  Opc = X86::CMOVGE64rr; break;
+    case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
+    case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
+    case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
+    case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
+    case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
+    case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
+    case X86::CMOVG16rr:  Opc = X86::CMOVLE16rr; break;
+    case X86::CMOVG32rr:  Opc = X86::CMOVLE32rr; break;
+    case X86::CMOVG64rr:  Opc = X86::CMOVLE64rr; break;
+    case X86::CMOVS16rr:  Opc = X86::CMOVNS16rr; break;
+    case X86::CMOVS32rr:  Opc = X86::CMOVNS32rr; break;
+    case X86::CMOVS64rr:  Opc = X86::CMOVNS64rr; break;
+    case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
+    case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
+    case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
+    case X86::CMOVP16rr:  Opc = X86::CMOVNP16rr; break;
+    case X86::CMOVP32rr:  Opc = X86::CMOVNP32rr; break;
+    case X86::CMOVP64rr:  Opc = X86::CMOVNP64rr; break;
+    case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
+    case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
+    case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
+    case X86::CMOVO16rr:  Opc = X86::CMOVNO16rr; break;
+    case X86::CMOVO32rr:  Opc = X86::CMOVNO32rr; break;
+    case X86::CMOVO64rr:  Opc = X86::CMOVNO64rr; break;
+    case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
+    case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
+    case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
+    }
+    if (NewMI) {
+      MachineFunction &MF = *MI->getParent()->getParent();
+      MI = MF.CloneMachineInstr(MI);
+      NewMI = false;
+    }
+    MI->setDesc(get(Opc));
+    // Fallthrough intended.
+  }
+  default:
+    return TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
+}
+
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
+  switch (BrOpc) {
+  default: return X86::COND_INVALID;
+  case X86::JE_4:  return X86::COND_E;
+  case X86::JNE_4: return X86::COND_NE;
+  case X86::JL_4:  return X86::COND_L;
+  case X86::JLE_4: return X86::COND_LE;
+  case X86::JG_4:  return X86::COND_G;
+  case X86::JGE_4: return X86::COND_GE;
+  case X86::JB_4:  return X86::COND_B;
+  case X86::JBE_4: return X86::COND_BE;
+  case X86::JA_4:  return X86::COND_A;
+  case X86::JAE_4: return X86::COND_AE;
+  case X86::JS_4:  return X86::COND_S;
+  case X86::JNS_4: return X86::COND_NS;
+  case X86::JP_4:  return X86::COND_P;
+  case X86::JNP_4: return X86::COND_NP;
+  case X86::JO_4:  return X86::COND_O;
+  case X86::JNO_4: return X86::COND_NO;
+  }
+}
+
+/// getCondFromSETOpc - return condition code of a SET opcode.
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
+  case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
+  case X86::SETBr:  case X86::SETBm:  return X86::COND_B;
+  case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
+  case X86::SETEr:  case X86::SETEm:  return X86::COND_E;
+  case X86::SETGr:  case X86::SETGm:  return X86::COND_G;
+  case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
+  case X86::SETLr:  case X86::SETLm:  return X86::COND_L;
+  case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
+  case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
+  case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
+  case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
+  case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
+  case X86::SETOr:  case X86::SETOm:  return X86::COND_O;
+  case X86::SETPr:  case X86::SETPm:  return X86::COND_P;
+  case X86::SETSr:  case X86::SETSm:  return X86::COND_S;
+  }
+}
+
+/// getCondFromCmovOpc - return condition code of a CMov opcode.
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
+  switch (Opc) {
+  default: return X86::COND_INVALID;
+  case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
+  case X86::CMOVA32rr:  case X86::CMOVA64rm:  case X86::CMOVA64rr:
+    return X86::COND_A;
+  case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
+  case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
+    return X86::COND_AE;
+  case X86::CMOVB16rm:  case X86::CMOVB16rr:  case X86::CMOVB32rm:
+  case X86::CMOVB32rr:  case X86::CMOVB64rm:  case X86::CMOVB64rr:
+    return X86::COND_B;
+  case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
+  case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
+    return X86::COND_BE;
+  case X86::CMOVE16rm:  case X86::CMOVE16rr:  case X86::CMOVE32rm:
+  case X86::CMOVE32rr:  case X86::CMOVE64rm:  case X86::CMOVE64rr:
+    return X86::COND_E;
+  case X86::CMOVG16rm:  case X86::CMOVG16rr:  case X86::CMOVG32rm:
+  case X86::CMOVG32rr:  case X86::CMOVG64rm:  case X86::CMOVG64rr:
+    return X86::COND_G;
+  case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
+  case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
+    return X86::COND_GE;
+  case X86::CMOVL16rm:  case X86::CMOVL16rr:  case X86::CMOVL32rm:
+  case X86::CMOVL32rr:  case X86::CMOVL64rm:  case X86::CMOVL64rr:
+    return X86::COND_L;
+  case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
+  case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
+    return X86::COND_LE;
+  case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
+  case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
+    return X86::COND_NE;
+  case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
+  case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
+    return X86::COND_NO;
+  case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
+  case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
+    return X86::COND_NP;
+  case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
+  case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
+    return X86::COND_NS;
+  case X86::CMOVO16rm:  case X86::CMOVO16rr:  case X86::CMOVO32rm:
+  case X86::CMOVO32rr:  case X86::CMOVO64rm:  case X86::CMOVO64rr:
+    return X86::COND_O;
+  case X86::CMOVP16rm:  case X86::CMOVP16rr:  case X86::CMOVP32rm:
+  case X86::CMOVP32rr:  case X86::CMOVP64rm:  case X86::CMOVP64rr:
+    return X86::COND_P;
+  case X86::CMOVS16rm:  case X86::CMOVS16rr:  case X86::CMOVS32rm:
+  case X86::CMOVS32rr:  case X86::CMOVS64rm:  case X86::CMOVS64rr:
+    return X86::COND_S;
+  }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case X86::COND_E:  return X86::JE_4;
+  case X86::COND_NE: return X86::JNE_4;
+  case X86::COND_L:  return X86::JL_4;
+  case X86::COND_LE: return X86::JLE_4;
+  case X86::COND_G:  return X86::JG_4;
+  case X86::COND_GE: return X86::JGE_4;
+  case X86::COND_B:  return X86::JB_4;
+  case X86::COND_BE: return X86::JBE_4;
+  case X86::COND_A:  return X86::JA_4;
+  case X86::COND_AE: return X86::JAE_4;
+  case X86::COND_S:  return X86::JS_4;
+  case X86::COND_NS: return X86::JNS_4;
+  case X86::COND_P:  return X86::JP_4;
+  case X86::COND_NP: return X86::JNP_4;
+  case X86::COND_O:  return X86::JO_4;
+  case X86::COND_NO: return X86::JNO_4;
+  }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case X86::COND_E:  return X86::COND_NE;
+  case X86::COND_NE: return X86::COND_E;
+  case X86::COND_L:  return X86::COND_GE;
+  case X86::COND_LE: return X86::COND_G;
+  case X86::COND_G:  return X86::COND_LE;
+  case X86::COND_GE: return X86::COND_L;
+  case X86::COND_B:  return X86::COND_AE;
+  case X86::COND_BE: return X86::COND_A;
+  case X86::COND_A:  return X86::COND_BE;
+  case X86::COND_AE: return X86::COND_B;
+  case X86::COND_S:  return X86::COND_NS;
+  case X86::COND_NS: return X86::COND_S;
+  case X86::COND_P:  return X86::COND_NP;
+  case X86::COND_NP: return X86::COND_P;
+  case X86::COND_O:  return X86::COND_NO;
+  case X86::COND_NO: return X86::COND_O;
+  }
+}
+
+/// getSwappedCondition - assume the flags are set by MI(a,b), return
+/// the condition code if we modify the instructions such that flags are
+/// set by MI(b,a).
+static X86::CondCode getSwappedCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: return X86::COND_INVALID;
+  case X86::COND_E:  return X86::COND_E;
+  case X86::COND_NE: return X86::COND_NE;
+  case X86::COND_L:  return X86::COND_G;
+  case X86::COND_LE: return X86::COND_GE;
+  case X86::COND_G:  return X86::COND_L;
+  case X86::COND_GE: return X86::COND_LE;
+  case X86::COND_B:  return X86::COND_A;
+  case X86::COND_BE: return X86::COND_AE;
+  case X86::COND_A:  return X86::COND_B;
+  case X86::COND_AE: return X86::COND_BE;
+  }
+}
+
+/// getSETFromCond - Return a set opcode for the given condition and
+/// whether it has memory operand.
+static unsigned getSETFromCond(X86::CondCode CC,
+                               bool HasMemoryOperand) {
+  static const uint16_t Opc[16][2] = {
+    { X86::SETAr,  X86::SETAm  },
+    { X86::SETAEr, X86::SETAEm },
+    { X86::SETBr,  X86::SETBm  },
+    { X86::SETBEr, X86::SETBEm },
+    { X86::SETEr,  X86::SETEm  },
+    { X86::SETGr,  X86::SETGm  },
+    { X86::SETGEr, X86::SETGEm },
+    { X86::SETLr,  X86::SETLm  },
+    { X86::SETLEr, X86::SETLEm },
+    { X86::SETNEr, X86::SETNEm },
+    { X86::SETNOr, X86::SETNOm },
+    { X86::SETNPr, X86::SETNPm },
+    { X86::SETNSr, X86::SETNSm },
+    { X86::SETOr,  X86::SETOm  },
+    { X86::SETPr,  X86::SETPm  },
+    { X86::SETSr,  X86::SETSm  }
+  };
+
+  assert(CC < 16 && "Can only handle standard cond codes");
+  return Opc[CC][HasMemoryOperand ? 1 : 0];
+}
+
+/// getCMovFromCond - Return a cmov opcode for the given condition,
+/// register size in bytes, and operand type.
+static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes,
+                                bool HasMemoryOperand) {
+  static const uint16_t Opc[32][3] = {
+    { X86::CMOVA16rr,  X86::CMOVA32rr,  X86::CMOVA64rr  },
+    { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
+    { X86::CMOVB16rr,  X86::CMOVB32rr,  X86::CMOVB64rr  },
+    { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
+    { X86::CMOVE16rr,  X86::CMOVE32rr,  X86::CMOVE64rr  },
+    { X86::CMOVG16rr,  X86::CMOVG32rr,  X86::CMOVG64rr  },
+    { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
+    { X86::CMOVL16rr,  X86::CMOVL32rr,  X86::CMOVL64rr  },
+    { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
+    { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
+    { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
+    { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
+    { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
+    { X86::CMOVO16rr,  X86::CMOVO32rr,  X86::CMOVO64rr  },
+    { X86::CMOVP16rr,  X86::CMOVP32rr,  X86::CMOVP64rr  },
+    { X86::CMOVS16rr,  X86::CMOVS32rr,  X86::CMOVS64rr  },
+    { X86::CMOVA16rm,  X86::CMOVA32rm,  X86::CMOVA64rm  },
+    { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
+    { X86::CMOVB16rm,  X86::CMOVB32rm,  X86::CMOVB64rm  },
+    { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
+    { X86::CMOVE16rm,  X86::CMOVE32rm,  X86::CMOVE64rm  },
+    { X86::CMOVG16rm,  X86::CMOVG32rm,  X86::CMOVG64rm  },
+    { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
+    { X86::CMOVL16rm,  X86::CMOVL32rm,  X86::CMOVL64rm  },
+    { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
+    { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
+    { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
+    { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
+    { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
+    { X86::CMOVO16rm,  X86::CMOVO32rm,  X86::CMOVO64rm  },
+    { X86::CMOVP16rm,  X86::CMOVP32rm,  X86::CMOVP64rm  },
+    { X86::CMOVS16rm,  X86::CMOVS32rm,  X86::CMOVS64rm  }
+  };
+
+  assert(CC < 16 && "Can only handle standard cond codes");
+  unsigned Idx = HasMemoryOperand ? 16+CC : CC;
+  switch(RegBytes) {
+  default: llvm_unreachable("Illegal register size!");
+  case 2: return Opc[Idx][0];
+  case 4: return Opc[Idx][1];
+  case 8: return Opc[Idx][2];
+  }
+}
+
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  if (!MI->isTerminator()) return false;
+
+  // Conditional branch is a special case.
+  if (MI->isBranch() && !MI->isBarrier())
+    return true;
+  if (!MI->isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 SmallVectorImpl<MachineOperand> &Cond,
+                                 bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+
+    // Working from the bottom, when we see a non-terminator instruction, we're
+    // done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled by this
+    // analysis.
+    if (!I->isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == X86::JMP_4) {
+      UnCondBrIter = I;
+
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (llvm::next(I) != MBB.end())
+        llvm::next(I)->eraseFromParent();
+
+      Cond.clear();
+      FBB = 0;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        UnCondBrIter = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditional destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
+    if (BranchCode == X86::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
+      if (AllowModify && UnCondBrIter != MBB.end() &&
+          MBB.isLayoutSuccessor(TargetBB)) {
+        // If we can modify the code and it ends in something like:
+        //
+        //     jCC L1
+        //     jmp L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Then we can change this to:
+        //
+        //     jnCC L2
+        //   L1:
+        //     ...
+        //   L2:
+        //
+        // Which is a bit more efficient.
+        // We conditionally jump to the fall-through block.
+        BranchCode = GetOppositeBranchCondition(BranchCode);
+        unsigned JNCC = GetCondBranchFromCond(BranchCode);
+        MachineBasicBlock::iterator OldInst = I;
+
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
+          .addMBB(UnCondBrIter->getOperand(0).getMBB());
+        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4))
+          .addMBB(TargetBB);
+
+        OldInst->eraseFromParent();
+        UnCondBrIter->eraseFromParent();
+
+        // Restart the analysis.
+        UnCondBrIter = MBB.end();
+        I = MBB.end();
+        continue;
+      }
+
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination and their condition
+    // opcodes fit one of the special multi-branch idioms.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // Only handle the case where all conditional branches branch to the same
+    // destination.
+    if (TBB != I->getOperand(0).getMBB())
+      return true;
+
+    // If the conditions are the same, we can leave them alone.
+    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
+    if (OldBranchCode == BranchCode)
+      continue;
+
+    // If they differ, see if they fit one of the known patterns. Theoretically,
+    // we could handle more patterns here, but we shouldn't expect to see them
+    // if instruction selection has done a reasonable job.
+    if ((OldBranchCode == X86::COND_NP &&
+         BranchCode == X86::COND_E) ||
+        (OldBranchCode == X86::COND_E &&
+         BranchCode == X86::COND_NP))
+      BranchCode = X86::COND_NP_OR_E;
+    else if ((OldBranchCode == X86::COND_P &&
+              BranchCode == X86::COND_NE) ||
+             (OldBranchCode == X86::COND_NE &&
+              BranchCode == X86::COND_P))
+      BranchCode = X86::COND_NE_OR_P;
+    else
+      return true;
+
+    // Update the MachineOperand.
+    Cond[0].setImm(BranchCode);
+  }
+
+  return false;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->isDebugValue())
+      continue;
+    if (I->getOpcode() != X86::JMP_4 &&
+        getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           MachineBasicBlock *FBB,
+                           const SmallVectorImpl<MachineOperand> &Cond,
+                           DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "X86 branch conditions have one component!");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
+  switch (CC) {
+  case X86::COND_NP_OR_E:
+    // Synthesize NP_OR_E with two branches.
+    BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB);
+    ++Count;
+    BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB);
+    ++Count;
+    break;
+  case X86::COND_NE_OR_P:
+    // Synthesize NE_OR_P with two branches.
+    BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB);
+    ++Count;
+    BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB);
+    ++Count;
+    break;
+  default: {
+    unsigned Opc = GetCondBranchFromCond(CC);
+    BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
+    ++Count;
+  }
+  }
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+bool X86InstrInfo::
+canInsertSelect(const MachineBasicBlock &MBB,
+                const SmallVectorImpl<MachineOperand> &Cond,
+                unsigned TrueReg, unsigned FalseReg,
+                int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+  // Not all subtargets have cmov instructions.
+  if (!TM.getSubtarget<X86Subtarget>().hasCMov())
+    return false;
+  if (Cond.size() != 1)
+    return false;
+  // We cannot do the composite conditions, at least not in SSA form.
+  if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
+    return false;
+
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
+    return false;
+
+  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
+  if (X86::GR16RegClass.hasSubClassEq(RC) ||
+      X86::GR32RegClass.hasSubClassEq(RC) ||
+      X86::GR64RegClass.hasSubClassEq(RC)) {
+    // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
+    // Bridge. Probably Ivy Bridge as well.
+    CondCycles = 2;
+    TrueCycles = 2;
+    FalseCycles = 2;
+    return true;
+  }
+
+  // Can't do vectors.
+  return false;
+}
+
+void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I, DebugLoc DL,
+                                unsigned DstReg,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                unsigned TrueReg, unsigned FalseReg) const {
+   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+   assert(Cond.size() == 1 && "Invalid Cond array");
+   unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
+                                  MRI.getRegClass(DstReg)->getSize(),
+                                  false/*HasMemoryOperand*/);
+   BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
+}
+
+/// isHReg - Test if the given register is a physical h register.
+static bool isHReg(unsigned Reg) {
+  return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+// Try and copy between VR128/VR64 and GR64 registers.
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+                                        bool HasAVX) {
+  // SrcReg(VR128) -> DestReg(GR64)
+  // SrcReg(VR64)  -> DestReg(GR64)
+  // SrcReg(GR64)  -> DestReg(VR128)
+  // SrcReg(GR64)  -> DestReg(VR64)
+
+  if (X86::GR64RegClass.contains(DestReg)) {
+    if (X86::VR128RegClass.contains(SrcReg))
+      // Copy from a VR128 register to a GR64 register.
+      return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
+    if (X86::VR64RegClass.contains(SrcReg))
+      // Copy from a VR64 register to a GR64 register.
+      return X86::MOVSDto64rr;
+  } else if (X86::GR64RegClass.contains(SrcReg)) {
+    // Copy from a GR64 register to a VR128 register.
+    if (X86::VR128RegClass.contains(DestReg))
+      return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
+    // Copy from a GR64 register to a VR64 register.
+    if (X86::VR64RegClass.contains(DestReg))
+      return X86::MOV64toSDrr;
+  }
+
+  // SrcReg(FR32) -> DestReg(GR32)
+  // SrcReg(GR32) -> DestReg(FR32)
+
+  if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
+    // Copy from a FR32 register to a GR32 register.
+    return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+
+  if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+    // Copy from a GR32 register to a FR32 register.
+    return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+
+  return 0;
+}
+
+void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI, DebugLoc DL,
+                               unsigned DestReg, unsigned SrcReg,
+                               bool KillSrc) const {
+  // First deal with the normal symmetric copies.
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+  unsigned Opc;
+  if (X86::GR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV64rr;
+  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV32rr;
+  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MOV16rr;
+  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
+    // Copying to or from a physical H register on x86-64 requires a NOREX
+    // move.  Otherwise use a normal move.
+    if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+        TM.getSubtarget<X86Subtarget>().is64Bit()) {
+      Opc = X86::MOV8rr_NOREX;
+      // Both operands must be encodable without an REX prefix.
+      assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+             "8-bit H register can not be copied outside GR8_NOREX");
+    } else
+      Opc = X86::MOV8rr;
+  } else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+    Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
+  else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+    Opc = X86::VMOVAPSYrr;
+  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MMX_MOVQ64rr;
+  else
+    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX);
+
+  if (Opc) {
+    BuildMI(MBB, MI, DL, get(Opc), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+
+  // Moving EFLAGS to / from another register requires a push and a pop.
+  // Notice that we have to adjust the stack if we don't want to clobber the
+  // first frame index. See X86FrameLowering.cpp - colobbersTheStack.
+  if (SrcReg == X86::EFLAGS) {
+    if (X86::GR64RegClass.contains(DestReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSHF64));
+      BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
+      return;
+    }
+    if (X86::GR32RegClass.contains(DestReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSHF32));
+      BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
+      return;
+    }
+  }
+  if (DestReg == X86::EFLAGS) {
+    if (X86::GR64RegClass.contains(SrcReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSH64r))
+        .addReg(SrcReg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::POPF64));
+      return;
+    }
+    if (X86::GR32RegClass.contains(SrcReg)) {
+      BuildMI(MBB, MI, DL, get(X86::PUSH32r))
+        .addReg(SrcReg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::POPF32));
+      return;
+    }
+  }
+
+  DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
+               << " to " << RI.getName(DestReg) << '\n');
+  llvm_unreachable("Cannot emit physreg copy instruction");
+}
+
+static unsigned getLoadStoreRegOpcode(unsigned Reg,
+                                      const TargetRegisterClass *RC,
+                                      bool isStackAligned,
+                                      const TargetMachine &TM,
+                                      bool load) {
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+  switch (RC->getSize()) {
+  default:
+    llvm_unreachable("Unknown spill size");
+  case 1:
+    assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
+    if (TM.getSubtarget<X86Subtarget>().is64Bit())
+      // Copying to or from a physical H register on x86-64 requires a NOREX
+      // move.  Otherwise use a normal move.
+      if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
+        return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
+    return load ? X86::MOV8rm : X86::MOV8mr;
+  case 2:
+    assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
+    return load ? X86::MOV16rm : X86::MOV16mr;
+  case 4:
+    if (X86::GR32RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV32rm : X86::MOV32mr;
+    if (X86::FR32RegClass.hasSubClassEq(RC))
+      return load ?
+        (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
+        (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+    if (X86::RFP32RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp32m : X86::ST_Fp32m;
+    llvm_unreachable("Unknown 4-byte regclass");
+  case 8:
+    if (X86::GR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MOV64rm : X86::MOV64mr;
+    if (X86::FR64RegClass.hasSubClassEq(RC))
+      return load ?
+        (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
+        (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+    if (X86::VR64RegClass.hasSubClassEq(RC))
+      return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
+    if (X86::RFP64RegClass.hasSubClassEq(RC))
+      return load ? X86::LD_Fp64m : X86::ST_Fp64m;
+    llvm_unreachable("Unknown 8-byte regclass");
+  case 10:
+    assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
+    return load ? X86::LD_Fp80m : X86::ST_FpP80m;
+  case 16: {
+    assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
+    // If stack is realigned we can use aligned stores.
+    if (isStackAligned)
+      return load ?
+        (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) :
+        (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+    else
+      return load ?
+        (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) :
+        (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+  }
+  case 32:
+    assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
+    // If stack is realigned we can use aligned stores.
+    if (isStackAligned)
+      return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
+    else
+      return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
+  }
+}
+
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+                                  const TargetRegisterClass *RC,
+                                  bool isStackAligned,
+                                  TargetMachine &TM) {
+  return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, TM, false);
+}
+
+
+static unsigned getLoadRegOpcode(unsigned DestReg,
+                                 const TargetRegisterClass *RC,
+                                 bool isStackAligned,
+                                 const TargetMachine &TM) {
+  return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, TM, true);
+}
+
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       unsigned SrcReg, bool isKill, int FrameIdx,
+                                       const TargetRegisterClass *RC,
+                                       const TargetRegisterInfo *TRI) const {
+  const MachineFunction &MF = *MBB.getParent();
+  assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
+         "Stack slot too small for store");
+  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
+    RI.canRealignStack(MF);
+  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+    .addReg(SrcReg, getKillRegState(isKill));
+}
+
+void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+                                  bool isKill,
+                                  SmallVectorImpl<MachineOperand> &Addr,
+                                  const TargetRegisterClass *RC,
+                                  MachineInstr::mmo_iterator MMOBegin,
+                                  MachineInstr::mmo_iterator MMOEnd,
+                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  bool isAligned = MMOBegin != MMOEnd &&
+                   (*MMOBegin)->getAlignment() >= Alignment;
+  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
+  DebugLoc DL;
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
+  for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+    MIB.addOperand(Addr[i]);
+  MIB.addReg(SrcReg, getKillRegState(isKill));
+  (*MIB).setMemRefs(MMOBegin, MMOEnd);
+  NewMIs.push_back(MIB);
+}
+
+
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC,
+                                        const TargetRegisterInfo *TRI) const {
+  const MachineFunction &MF = *MBB.getParent();
+  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
+    RI.canRealignStack(MF);
+  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                                 SmallVectorImpl<MachineOperand> &Addr,
+                                 const TargetRegisterClass *RC,
+                                 MachineInstr::mmo_iterator MMOBegin,
+                                 MachineInstr::mmo_iterator MMOEnd,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  bool isAligned = MMOBegin != MMOEnd &&
+                   (*MMOBegin)->getAlignment() >= Alignment;
+  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
+  DebugLoc DL;
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
+  for (unsigned i = 0, e = Addr.size(); i != e; ++i)
+    MIB.addOperand(Addr[i]);
+  (*MIB).setMemRefs(MMOBegin, MMOEnd);
+  NewMIs.push_back(MIB);
+}
+
+bool X86InstrInfo::
+analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
+               int &CmpMask, int &CmpValue) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP8ri:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(1).getImm();
+    return true;
+  // A SUB can be used to perform comparison.
+  case X86::SUB64rm:
+  case X86::SUB32rm:
+  case X86::SUB16rm:
+  case X86::SUB8rm:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::SUB64rr:
+  case X86::SUB32rr:
+  case X86::SUB16rr:
+  case X86::SUB8rr:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = MI->getOperand(2).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB8ri:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(2).getImm();
+    return true;
+  case X86::CMP64rr:
+  case X86::CMP32rr:
+  case X86::CMP16rr:
+  case X86::CMP8rr:
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = MI->getOperand(1).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+    SrcReg = MI->getOperand(0).getReg();
+    if (MI->getOperand(1).getReg() != SrcReg) return false;
+    // Compare against zero.
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  }
+  return false;
+}
+
+/// isRedundantFlagInstr - check whether the first instruction, whose only
+/// purpose is to update flags, can be made redundant.
+/// CMPrr can be made redundant by SUBrr if the operands are the same.
+/// This function can be extended later on.
+/// SrcReg, SrcRegs: register operands for FlagI.
+/// ImmValue: immediate for FlagI if it takes an immediate.
+inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
+                                        unsigned SrcReg2, int ImmValue,
+                                        MachineInstr *OI) {
+  if (((FlagI->getOpcode() == X86::CMP64rr &&
+        OI->getOpcode() == X86::SUB64rr) ||
+       (FlagI->getOpcode() == X86::CMP32rr &&
+        OI->getOpcode() == X86::SUB32rr)||
+       (FlagI->getOpcode() == X86::CMP16rr &&
+        OI->getOpcode() == X86::SUB16rr)||
+       (FlagI->getOpcode() == X86::CMP8rr &&
+        OI->getOpcode() == X86::SUB8rr)) &&
+      ((OI->getOperand(1).getReg() == SrcReg &&
+        OI->getOperand(2).getReg() == SrcReg2) ||
+       (OI->getOperand(1).getReg() == SrcReg2 &&
+        OI->getOperand(2).getReg() == SrcReg)))
+    return true;
+
+  if (((FlagI->getOpcode() == X86::CMP64ri32 &&
+        OI->getOpcode() == X86::SUB64ri32) ||
+       (FlagI->getOpcode() == X86::CMP64ri8 &&
+        OI->getOpcode() == X86::SUB64ri8) ||
+       (FlagI->getOpcode() == X86::CMP32ri &&
+        OI->getOpcode() == X86::SUB32ri) ||
+       (FlagI->getOpcode() == X86::CMP32ri8 &&
+        OI->getOpcode() == X86::SUB32ri8) ||
+       (FlagI->getOpcode() == X86::CMP16ri &&
+        OI->getOpcode() == X86::SUB16ri) ||
+       (FlagI->getOpcode() == X86::CMP16ri8 &&
+        OI->getOpcode() == X86::SUB16ri8) ||
+       (FlagI->getOpcode() == X86::CMP8ri &&
+        OI->getOpcode() == X86::SUB8ri)) &&
+      OI->getOperand(1).getReg() == SrcReg &&
+      OI->getOperand(2).getImm() == ImmValue)
+    return true;
+  return false;
+}
+
+/// isDefConvertible - check whether the definition can be converted
+/// to remove a comparison against zero.
+inline static bool isDefConvertible(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return false;
+  case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
+  case X86::SUB32ri8:  case X86::SUB16ri:  case X86::SUB16ri8:
+  case X86::SUB8ri:    case X86::SUB64rr:  case X86::SUB32rr:
+  case X86::SUB16rr:   case X86::SUB8rr:   case X86::SUB64rm:
+  case X86::SUB32rm:   case X86::SUB16rm:  case X86::SUB8rm:
+  case X86::DEC64r:    case X86::DEC32r:   case X86::DEC16r: case X86::DEC8r:
+  case X86::DEC64_32r: case X86::DEC64_16r:
+  case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
+  case X86::ADD32ri8:  case X86::ADD16ri:  case X86::ADD16ri8:
+  case X86::ADD8ri:    case X86::ADD64rr:  case X86::ADD32rr:
+  case X86::ADD16rr:   case X86::ADD8rr:   case X86::ADD64rm:
+  case X86::ADD32rm:   case X86::ADD16rm:  case X86::ADD8rm:
+  case X86::INC64r:    case X86::INC32r:   case X86::INC16r: case X86::INC8r:
+  case X86::INC64_32r: case X86::INC64_16r:
+  case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
+  case X86::AND32ri8:  case X86::AND16ri:  case X86::AND16ri8:
+  case X86::AND8ri:    case X86::AND64rr:  case X86::AND32rr:
+  case X86::AND16rr:   case X86::AND8rr:   case X86::AND64rm:
+  case X86::AND32rm:   case X86::AND16rm:  case X86::AND8rm:
+  case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
+  case X86::XOR32ri8:  case X86::XOR16ri:  case X86::XOR16ri8:
+  case X86::XOR8ri:    case X86::XOR64rr:  case X86::XOR32rr:
+  case X86::XOR16rr:   case X86::XOR8rr:   case X86::XOR64rm:
+  case X86::XOR32rm:   case X86::XOR16rm:  case X86::XOR8rm:
+  case X86::OR64ri32:  case X86::OR64ri8:  case X86::OR32ri:
+  case X86::OR32ri8:   case X86::OR16ri:   case X86::OR16ri8:
+  case X86::OR8ri:     case X86::OR64rr:   case X86::OR32rr:
+  case X86::OR16rr:    case X86::OR8rr:    case X86::OR64rm:
+  case X86::OR32rm:    case X86::OR16rm:   case X86::OR8rm:
+  case X86::ANDN32rr:  case X86::ANDN32rm:
+  case X86::ANDN64rr:  case X86::ANDN64rm:
+    return true;
+  }
+}
+
+/// optimizeCompareInstr - Check if there exists an earlier instruction that
+/// operates on the same source operands and sets flags in the same way as
+/// Compare; remove Compare if possible.
+bool X86InstrInfo::
+optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
+                     int CmpMask, int CmpValue,
+                     const MachineRegisterInfo *MRI) const {
+  // Check whether we can replace SUB with CMP.
+  unsigned NewOpcode = 0;
+  switch (CmpInstr->getOpcode()) {
+  default: break;
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB8ri:
+  case X86::SUB64rm:
+  case X86::SUB32rm:
+  case X86::SUB16rm:
+  case X86::SUB8rm:
+  case X86::SUB64rr:
+  case X86::SUB32rr:
+  case X86::SUB16rr:
+  case X86::SUB8rr: {
+    if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+      return false;
+    // There is no use of the destination register, we can replace SUB with CMP.
+    switch (CmpInstr->getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::SUB64rm:   NewOpcode = X86::CMP64rm;   break;
+    case X86::SUB32rm:   NewOpcode = X86::CMP32rm;   break;
+    case X86::SUB16rm:   NewOpcode = X86::CMP16rm;   break;
+    case X86::SUB8rm:    NewOpcode = X86::CMP8rm;    break;
+    case X86::SUB64rr:   NewOpcode = X86::CMP64rr;   break;
+    case X86::SUB32rr:   NewOpcode = X86::CMP32rr;   break;
+    case X86::SUB16rr:   NewOpcode = X86::CMP16rr;   break;
+    case X86::SUB8rr:    NewOpcode = X86::CMP8rr;    break;
+    case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
+    case X86::SUB64ri8:  NewOpcode = X86::CMP64ri8;  break;
+    case X86::SUB32ri:   NewOpcode = X86::CMP32ri;   break;
+    case X86::SUB32ri8:  NewOpcode = X86::CMP32ri8;  break;
+    case X86::SUB16ri:   NewOpcode = X86::CMP16ri;   break;
+    case X86::SUB16ri8:  NewOpcode = X86::CMP16ri8;  break;
+    case X86::SUB8ri:    NewOpcode = X86::CMP8ri;    break;
+    }
+    CmpInstr->setDesc(get(NewOpcode));
+    CmpInstr->RemoveOperand(0);
+    // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
+    if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
+        NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
+      return false;
+  }
+  }
+
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI) return false;
+
+  // CmpInstr is the first instruction of the BB.
+  MachineBasicBlock::iterator I = CmpInstr, Def = MI;
+
+  // If we are comparing against zero, check whether we can use MI to update
+  // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
+  bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+  if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() ||
+      !isDefConvertible(MI)))
+    return false;
+
+  // We are searching for an earlier instruction that can make CmpInstr
+  // redundant and that instruction will be saved in Sub.
+  MachineInstr *Sub = NULL;
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  // We iterate backward, starting from the instruction before CmpInstr and
+  // stop when reaching the definition of a source register or done with the BB.
+  // RI points to the instruction before CmpInstr.
+  // If the definition is in this basic block, RE points to the definition;
+  // otherwise, RE is the rend of the basic block.
+  MachineBasicBlock::reverse_iterator
+      RI = MachineBasicBlock::reverse_iterator(I),
+      RE = CmpInstr->getParent() == MI->getParent() ?
+           MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
+           CmpInstr->getParent()->rend();
+  MachineInstr *Movr0Inst = 0;
+  for (; RI != RE; ++RI) {
+    MachineInstr *Instr = &*RI;
+    // Check whether CmpInstr can be made redundant by the current instruction.
+    if (!IsCmpZero &&
+        isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+      Sub = Instr;
+      break;
+    }
+
+    if (Instr->modifiesRegister(X86::EFLAGS, TRI) ||
+        Instr->readsRegister(X86::EFLAGS, TRI)) {
+      // This instruction modifies or uses EFLAGS.
+
+      // MOV32r0 etc. are implemented with xor which clobbers condition code.
+      // They are safe to move up, if the definition to EFLAGS is dead and
+      // earlier instructions do not read or write EFLAGS.
+      if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 ||
+           Instr->getOpcode() == X86::MOV16r0 ||
+           Instr->getOpcode() == X86::MOV32r0 ||
+           Instr->getOpcode() == X86::MOV64r0) &&
+          Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
+        Movr0Inst = Instr;
+        continue;
+      }
+
+      // We can't remove CmpInstr.
+      return false;
+    }
+  }
+
+  // Return false if no candidates exist.
+  if (!IsCmpZero && !Sub)
+    return false;
+
+  bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+                    Sub->getOperand(2).getReg() == SrcReg);
+
+  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
+  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
+  // If we are done with the basic block, we need to check whether EFLAGS is
+  // live-out.
+  bool IsSafe = false;
+  SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
+  MachineBasicBlock::iterator E = CmpInstr->getParent()->end();
+  for (++I; I != E; ++I) {
+    const MachineInstr &Instr = *I;
+    bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
+    bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
+    // We should check the usage if this instruction uses and updates EFLAGS.
+    if (!UseEFLAGS && ModifyEFLAGS) {
+      // It is safe to remove CmpInstr if EFLAGS is updated again.
+      IsSafe = true;
+      break;
+    }
+    if (!UseEFLAGS && !ModifyEFLAGS)
+      continue;
+
+    // EFLAGS is used by this instruction.
+    X86::CondCode OldCC;
+    bool OpcIsSET = false;
+    if (IsCmpZero || IsSwapped) {
+      // We decode the condition code from opcode.
+      if (Instr.isBranch())
+        OldCC = getCondFromBranchOpc(Instr.getOpcode());
+      else {
+        OldCC = getCondFromSETOpc(Instr.getOpcode());
+        if (OldCC != X86::COND_INVALID)
+          OpcIsSET = true;
+        else
+          OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
+      }
+      if (OldCC == X86::COND_INVALID) return false;
+    }
+    if (IsCmpZero) {
+      switch (OldCC) {
+      default: break;
+      case X86::COND_A: case X86::COND_AE:
+      case X86::COND_B: case X86::COND_BE:
+      case X86::COND_G: case X86::COND_GE:
+      case X86::COND_L: case X86::COND_LE:
+      case X86::COND_O: case X86::COND_NO:
+        // CF and OF are used, we can't perform this optimization.
+        return false;
+      }
+    } else if (IsSwapped) {
+      // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
+      // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+      // We swap the condition code and synthesize the new opcode.
+      X86::CondCode NewCC = getSwappedCondition(OldCC);
+      if (NewCC == X86::COND_INVALID) return false;
+
+      // Synthesize the new opcode.
+      bool HasMemoryOperand = Instr.hasOneMemOperand();
+      unsigned NewOpc;
+      if (Instr.isBranch())
+        NewOpc = GetCondBranchFromCond(NewCC);
+      else if(OpcIsSET)
+        NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
+      else {
+        unsigned DstReg = Instr.getOperand(0).getReg();
+        NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
+                                 HasMemoryOperand);
+      }
+
+      // Push the MachineInstr to OpsToUpdate.
+      // If it is safe to remove CmpInstr, the condition code of these
+      // instructions will be modified.
+      OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
+    }
+    if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
+      // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
+      IsSafe = true;
+      break;
+    }
+  }
+
+  // If EFLAGS is not killed nor re-defined, we should check whether it is
+  // live-out. If it is live-out, do not optimize.
+  if ((IsCmpZero || IsSwapped) && !IsSafe) {
+    MachineBasicBlock *MBB = CmpInstr->getParent();
+    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+             SE = MBB->succ_end(); SI != SE; ++SI)
+      if ((*SI)->isLiveIn(X86::EFLAGS))
+        return false;
+  }
+
+  // The instruction to be updated is either Sub or MI.
+  Sub = IsCmpZero ? MI : Sub;
+  // Move Movr0Inst to the place right before Sub.
+  if (Movr0Inst) {
+    Sub->getParent()->remove(Movr0Inst);
+    Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
+  }
+
+  // Make sure Sub instruction defines EFLAGS and mark the def live.
+  unsigned LastOperand = Sub->getNumOperands() - 1;
+  assert(Sub->getNumOperands() >= 2 &&
+         Sub->getOperand(LastOperand).isReg() &&
+         Sub->getOperand(LastOperand).getReg() == X86::EFLAGS &&
+         "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
+  Sub->getOperand(LastOperand).setIsDef(true);
+  Sub->getOperand(LastOperand).setIsDead(false);
+  CmpInstr->eraseFromParent();
+
+  // Modify the condition code of instructions in OpsToUpdate.
+  for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++)
+    OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second));
+  return true;
+}
+
+/// optimizeLoadInstr - Try to remove the load by folding it to a register
+/// operand at the use. We fold the load instructions if load defines a virtual
+/// register, the virtual register is used once in the same BB, and the
+/// instructions in-between do not load or store, and have no side effects.
+MachineInstr* X86InstrInfo::
+optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
+                  unsigned &FoldAsLoadDefReg,
+                  MachineInstr *&DefMI) const {
+  if (FoldAsLoadDefReg == 0)
+    return 0;
+  // To be conservative, if there exists another load, clear the load candidate.
+  if (MI->mayLoad()) {
+    FoldAsLoadDefReg = 0;
+    return 0;
+  }
+
+  // Check whether we can move DefMI here.
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(this, 0, SawStore))
+    return 0;
+
+  // We try to commute MI if possible.
+  unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
+  for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
+    // Collect information about virtual register operands of MI.
+    unsigned SrcOperandId = 0;
+    bool FoundSrcOperand = false;
+    for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (Reg != FoldAsLoadDefReg)
+        continue;
+      // Do not fold if we have a subreg use or a def or multiple uses.
+      if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+        return 0;
+
+      SrcOperandId = i;
+      FoundSrcOperand = true;
+    }
+    if (!FoundSrcOperand) return 0;
+
+    // Check whether we can fold the def into SrcOperandId.
+    SmallVector<unsigned, 8> Ops;
+    Ops.push_back(SrcOperandId);
+    MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+    if (FoldMI) {
+      FoldAsLoadDefReg = 0;
+      return FoldMI;
+    }
+
+    if (Idx == 1) {
+      // MI was changed but it didn't help, commute it back!
+      commuteInstruction(MI, false);
+      return 0;
+    }
+
+    // Check whether we can commute MI and enable folding.
+    if (MI->isCommutable()) {
+      MachineInstr *NewMI = commuteInstruction(MI, false);
+      // Unable to commute.
+      if (!NewMI) return 0;
+      if (NewMI != MI) {
+        // New instruction. It doesn't need to be kept.
+        NewMI->eraseFromParent();
+        return 0;
+      }
+    }
+  }
+  return 0;
+}
+
+/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined.  This is
+/// used for mapping:
+///   %xmm4 = V_SET0
+/// to:
+///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+///
+static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
+                             const MCInstrDesc &Desc) {
+  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+  unsigned Reg = MIB->getOperand(0).getReg();
+  MIB->setDesc(Desc);
+
+  // MachineInstr::addOperand() will insert explicit operands before any
+  // implicit operands.
+  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+  // But we don't trust that.
+  assert(MIB->getOperand(1).getReg() == Reg &&
+         MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
+  return true;
+}
+
+bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+  MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+  switch (MI->getOpcode()) {
+  case X86::SETB_C8r:
+    return Expand2AddrUndef(MIB, get(X86::SBB8rr));
+  case X86::SETB_C16r:
+    return Expand2AddrUndef(MIB, get(X86::SBB16rr));
+  case X86::SETB_C32r:
+    return Expand2AddrUndef(MIB, get(X86::SBB32rr));
+  case X86::SETB_C64r:
+    return Expand2AddrUndef(MIB, get(X86::SBB64rr));
+  case X86::V_SET0:
+  case X86::FsFLD0SS:
+  case X86::FsFLD0SD:
+    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+  case X86::AVX_SET0:
+    assert(HasAVX && "AVX not supported");
+    return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+  case X86::V_SETALLONES:
+    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+  case X86::AVX2_SETALLONES:
+    return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+  case X86::TEST8ri_NOREX:
+    MI->setDesc(get(X86::TEST8ri));
+    return true;
+  }
+  return false;
+}
+
+MachineInstr*
+X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+                                       int FrameIx, uint64_t Offset,
+                                       const MDNode *MDPtr,
+                                       DebugLoc DL) const {
+  X86AddressMode AM;
+  AM.BaseType = X86AddressMode::FrameIndexBase;
+  AM.Base.FrameIndex = FrameIx;
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE));
+  addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+                                     const SmallVectorImpl<MachineOperand> &MOs,
+                                     MachineInstr *MI,
+                                     const TargetInstrInfo &TII) {
+  // Create the base instruction with the memory operand as the first part.
+  // Omit the implicit operands, something BuildMI can't do.
+  MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+                                              MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+  unsigned NumAddrOps = MOs.size();
+  for (unsigned i = 0; i != NumAddrOps; ++i)
+    MIB.addOperand(MOs[i]);
+  if (NumAddrOps < 4)  // FrameIndex only
+    addOffset(MIB, 0);
+
+  // Loop over the rest of the ri operands, converting them over.
+  unsigned NumOps = MI->getDesc().getNumOperands()-2;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    MachineOperand &MO = MI->getOperand(i+2);
+    MIB.addOperand(MO);
+  }
+  for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    MIB.addOperand(MO);
+  }
+  return MIB;
+}
+
+static MachineInstr *FuseInst(MachineFunction &MF,
+                              unsigned Opcode, unsigned OpNo,
+                              const SmallVectorImpl<MachineOperand> &MOs,
+                              MachineInstr *MI, const TargetInstrInfo &TII) {
+  // Omit the implicit operands, something BuildMI can't do.
+  MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+                                              MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (i == OpNo) {
+      assert(MO.isReg() && "Expected to fold into reg operand!");
+      unsigned NumAddrOps = MOs.size();
+      for (unsigned i = 0; i != NumAddrOps; ++i)
+        MIB.addOperand(MOs[i]);
+      if (NumAddrOps < 4)  // FrameIndex only
+        addOffset(MIB, 0);
+    } else {
+      MIB.addOperand(MO);
+    }
+  }
+  return MIB;
+}
+
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+                                const SmallVectorImpl<MachineOperand> &MOs,
+                                MachineInstr *MI) {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
+
+  unsigned NumAddrOps = MOs.size();
+  for (unsigned i = 0; i != NumAddrOps; ++i)
+    MIB.addOperand(MOs[i]);
+  if (NumAddrOps < 4)  // FrameIndex only
+    addOffset(MIB, 0);
+  return MIB.addImm(0);
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                    MachineInstr *MI, unsigned i,
+                                    const SmallVectorImpl<MachineOperand> &MOs,
+                                    unsigned Size, unsigned Align) const {
+  const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0;
+  bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect();
+  bool isTwoAddrFold = false;
+
+  // Atom favors register form of call. So, we do not fold loads into calls
+  // when X86Subtarget is Atom.
+  if (isCallRegIndirect &&
+    (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) {
+    return NULL;
+  }
+
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  bool isTwoAddr = NumOps > 1 &&
+    MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+  // FIXME: AsmPrinter doesn't know how to handle
+  // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+  if (MI->getOpcode() == X86::ADD32ri &&
+      MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+    return NULL;
+
+  MachineInstr *NewMI = NULL;
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different than folding it other places.  It requires
+  // replacing the *two* registers with the memory location.
+  if (isTwoAddr && NumOps >= 2 && i < 2 &&
+      MI->getOperand(0).isReg() &&
+      MI->getOperand(1).isReg() &&
+      MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+    isTwoAddrFold = true;
+  } else if (i == 0) { // If operand 0
+    unsigned Opc = 0;
+    switch (MI->getOpcode()) {
+    default: break;
+    case X86::MOV64r0: Opc = X86::MOV64mi32; break;
+    case X86::MOV32r0: Opc = X86::MOV32mi;   break;
+    case X86::MOV16r0: Opc = X86::MOV16mi;   break;
+    case X86::MOV8r0:  Opc = X86::MOV8mi;    break;
+    }
+    if (Opc)
+       NewMI = MakeM0Inst(*this, Opc, MOs, MI);
+    if (NewMI)
+      return NewMI;
+
+    OpcodeTablePtr = &RegOp2MemOpTable0;
+  } else if (i == 1) {
+    OpcodeTablePtr = &RegOp2MemOpTable1;
+  } else if (i == 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2;
+  } else if (i == 3) {
+    OpcodeTablePtr = &RegOp2MemOpTable3;
+  }
+
+  // If table selected...
+  if (OpcodeTablePtr) {
+    // Find the Opcode to fuse
+    DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+      OpcodeTablePtr->find(MI->getOpcode());
+    if (I != OpcodeTablePtr->end()) {
+      unsigned Opcode = I->second.first;
+      unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
+      if (Align < MinAlign)
+        return NULL;
+      bool NarrowToMOV32rm = false;
+      if (Size) {
+        unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
+        if (Size < RCSize) {
+          // Check if it's safe to fold the load. If the size of the object is
+          // narrower than the load width, then it's not.
+          if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+            return NULL;
+          // If this is a 64-bit load, but the spill slot is 32, then we can do
+          // a 32-bit load which is implicitly zero-extended. This likely is due
+          // to liveintervalanalysis remat'ing a load from stack slot.
+          if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
+            return NULL;
+          Opcode = X86::MOV32rm;
+          NarrowToMOV32rm = true;
+        }
+      }
+
+      if (isTwoAddrFold)
+        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
+      else
+        NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
+
+      if (NarrowToMOV32rm) {
+        // If this is the special case where we use a MOV32rm to load a 32-bit
+        // value and zero-extend the top bits. Change the destination register
+        // to a 32-bit one.
+        unsigned DstReg = NewMI->getOperand(0).getReg();
+        if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
+                                                   X86::sub_32bit));
+        else
+          NewMI->getOperand(0).setSubReg(X86::sub_32bit);
+      }
+      return NewMI;
+    }
+  }
+
+  // No fusion
+  if (PrintFailedFusing && !MI->isCopy())
+    dbgs() << "We failed to fuse operand " << i << " in " << *MI;
+  return NULL;
+}
+
+/// hasPartialRegUpdate - Return true for all instructions that only update
+/// the first 32 or 64-bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+///   movss (%rdi), %xmm0
+///   cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+///   cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::CVTSI2SSrr:
+  case X86::CVTSI2SS64rr:
+  case X86::CVTSI2SDrr:
+  case X86::CVTSI2SD64rr:
+  case X86::CVTSD2SSrr:
+  case X86::Int_CVTSD2SSrr:
+  case X86::CVTSS2SDrr:
+  case X86::Int_CVTSS2SDrr:
+  case X86::RCPSSr:
+  case X86::RCPSSr_Int:
+  case X86::ROUNDSDr:
+  case X86::ROUNDSDr_Int:
+  case X86::ROUNDSSr:
+  case X86::ROUNDSSr_Int:
+  case X86::RSQRTSSr:
+  case X86::RSQRTSSr_Int:
+  case X86::SQRTSSr:
+  case X86::SQRTSSr_Int:
+  // AVX encoded versions
+  case X86::VCVTSD2SSrr:
+  case X86::Int_VCVTSD2SSrr:
+  case X86::VCVTSS2SDrr:
+  case X86::Int_VCVTSS2SDrr:
+  case X86::VRCPSSr:
+  case X86::VROUNDSDr:
+  case X86::VROUNDSDr_Int:
+  case X86::VROUNDSSr:
+  case X86::VROUNDSSr_Int:
+  case X86::VRSQRTSSr:
+  case X86::VSQRTSSr:
+    return true;
+  }
+
+  return false;
+}
+
+/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle
+/// instructions we would like before a partial register update.
+unsigned X86InstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+                             const TargetRegisterInfo *TRI) const {
+  if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
+    return 0;
+
+  // If MI is marked as reading Reg, the partial register update is wanted.
+  const MachineOperand &MO = MI->getOperand(0);
+  unsigned Reg = MO.getReg();
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    if (MO.readsReg() || MI->readsVirtualRegister(Reg))
+      return 0;
+  } else {
+    if (MI->readsRegister(Reg, TRI))
+      return 0;
+  }
+
+  // If any of the preceding 16 instructions are reading Reg, insert a
+  // dependency breaking instruction.  The magic number is based on a few
+  // Nehalem experiments.
+  return 16;
+}
+
+void X86InstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+                          const TargetRegisterInfo *TRI) const {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  if (X86::VR128RegClass.contains(Reg)) {
+    // These instructions are all floating point domain, so xorps is the best
+    // choice.
+    bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+    unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
+      .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+  } else if (X86::VR256RegClass.contains(Reg)) {
+    // Use vxorps to clear the full ymm register.
+    // It wants to read and write the xmm sub-register.
+    unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
+      .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef)
+      .addReg(Reg, RegState::ImplicitDefine);
+  } else
+    return;
+  MI->addRegisterKilled(Reg, TRI, true);
+}
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                  MachineInstr *MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                                  int FrameIndex) const {
+  // Check switch flag
+  if (NoFusing) return NULL;
+
+  // Unless optimizing for size, don't fold to avoid partial
+  // register update stalls
+  if (!MF.getFunction()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+      hasPartialRegUpdate(MI->getOpcode()))
+    return 0;
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Size = MFI->getObjectSize(FrameIndex);
+  unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    unsigned NewOpc = 0;
+    unsigned RCSize = 0;
+    switch (MI->getOpcode()) {
+    default: return NULL;
+    case X86::TEST8rr:  NewOpc = X86::CMP8ri; RCSize = 1; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
+    }
+    // Check if it's safe to fold the load. If the size of the object is
+    // narrower than the load width, then it's not.
+    if (Size < RCSize)
+      return NULL;
+    // Change to CMPXXri r, 0 first.
+    MI->setDesc(get(NewOpc));
+    MI->getOperand(1).ChangeToImmediate(0);
+  } else if (Ops.size() != 1)
+    return NULL;
+
+  SmallVector<MachineOperand,4> MOs;
+  MOs.push_back(MachineOperand::CreateFI(FrameIndex));
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
+}
+
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                  MachineInstr *MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                                  MachineInstr *LoadMI) const {
+  // Check switch flag
+  if (NoFusing) return NULL;
+
+  // Unless optimizing for size, don't fold to avoid partial
+  // register update stalls
+  if (!MF.getFunction()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+      hasPartialRegUpdate(MI->getOpcode()))
+    return 0;
+
+  // Determine the alignment of the load.
+  unsigned Alignment = 0;
+  if (LoadMI->hasOneMemOperand())
+    Alignment = (*LoadMI->memoperands_begin())->getAlignment();
+  else
+    switch (LoadMI->getOpcode()) {
+    case X86::AVX2_SETALLONES:
+    case X86::AVX_SET0:
+      Alignment = 32;
+      break;
+    case X86::V_SET0:
+    case X86::V_SETALLONES:
+      Alignment = 16;
+      break;
+    case X86::FsFLD0SD:
+      Alignment = 8;
+      break;
+    case X86::FsFLD0SS:
+      Alignment = 4;
+      break;
+    default:
+      return 0;
+    }
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    unsigned NewOpc = 0;
+    switch (MI->getOpcode()) {
+    default: return NULL;
+    case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
+    }
+    // Change to CMPXXri r, 0 first.
+    MI->setDesc(get(NewOpc));
+    MI->getOperand(1).ChangeToImmediate(0);
+  } else if (Ops.size() != 1)
+    return NULL;
+
+  // Make sure the subregisters match.
+  // Otherwise we risk changing the size of the load.
+  if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg())
+    return NULL;
+
+  SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
+  switch (LoadMI->getOpcode()) {
+  case X86::V_SET0:
+  case X86::V_SETALLONES:
+  case X86::AVX2_SETALLONES:
+  case X86::AVX_SET0:
+  case X86::FsFLD0SD:
+  case X86::FsFLD0SS: {
+    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+    // Create a constant-pool entry and operands to load from it.
+
+    // Medium and large mode can't fold loads this way.
+    if (TM.getCodeModel() != CodeModel::Small &&
+        TM.getCodeModel() != CodeModel::Kernel)
+      return NULL;
+
+    // x86-32 PIC requires a PIC base register for constant pools.
+    unsigned PICBase = 0;
+    if (TM.getRelocationModel() == Reloc::PIC_) {
+      if (TM.getSubtarget<X86Subtarget>().is64Bit())
+        PICBase = X86::RIP;
+      else
+        // FIXME: PICBase = getGlobalBaseReg(&MF);
+        // This doesn't work for several reasons.
+        // 1. GlobalBaseReg may have been spilled.
+        // 2. It may not be live at MI.
+        return NULL;
+    }
+
+    // Create a constant-pool entry.
+    MachineConstantPool &MCP = *MF.getConstantPool();
+    Type *Ty;
+    unsigned Opc = LoadMI->getOpcode();
+    if (Opc == X86::FsFLD0SS)
+      Ty = Type::getFloatTy(MF.getFunction()->getContext());
+    else if (Opc == X86::FsFLD0SD)
+      Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+    else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
+    else
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+
+    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
+    const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+                                    Constant::getNullValue(Ty);
+    unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
+
+    // Create operands to load from the constant pool entry.
+    MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+    MOs.push_back(MachineOperand::CreateImm(1));
+    MOs.push_back(MachineOperand::CreateReg(0, false));
+    MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+    MOs.push_back(MachineOperand::CreateReg(0, false));
+    break;
+  }
+  default: {
+    if ((LoadMI->getOpcode() == X86::MOVSSrm ||
+         LoadMI->getOpcode() == X86::VMOVSSrm) &&
+        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
+          > 4)
+      // These instructions only load 32 bits, we can't fold them if the
+      // destination register is wider than 32 bits (4 bytes).
+      return NULL;
+    if ((LoadMI->getOpcode() == X86::MOVSDrm ||
+         LoadMI->getOpcode() == X86::VMOVSDrm) &&
+        MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
+          > 8)
+      // These instructions only load 64 bits, we can't fold them if the
+      // destination register is wider than 64 bits (8 bytes).
+      return NULL;
+
+    // Folding a normal load. Just copy the load's address operands.
+    unsigned NumOps = LoadMI->getDesc().getNumOperands();
+    for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+      MOs.push_back(LoadMI->getOperand(i));
+    break;
+  }
+  }
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
+}
+
+
+bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+                                  const SmallVectorImpl<unsigned> &Ops) const {
+  // Check switch flag
+  if (NoFusing) return 0;
+
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    switch (MI->getOpcode()) {
+    default: return false;
+    case X86::TEST8rr:
+    case X86::TEST16rr:
+    case X86::TEST32rr:
+    case X86::TEST64rr:
+      return true;
+    case X86::ADD32ri:
+      // FIXME: AsmPrinter doesn't know how to handle
+      // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
+      if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
+        return false;
+      break;
+    }
+  }
+
+  if (Ops.size() != 1)
+    return false;
+
+  unsigned OpNum = Ops[0];
+  unsigned Opc = MI->getOpcode();
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  bool isTwoAddr = NumOps > 1 &&
+    MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
+
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different than folding it other places.  It requires
+  // replacing the *two* registers with the memory location.
+  const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0;
+  if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+  } else if (OpNum == 0) { // If operand 0
+    switch (Opc) {
+    case X86::MOV8r0:
+    case X86::MOV16r0:
+    case X86::MOV32r0:
+    case X86::MOV64r0: return true;
+    default: break;
+    }
+    OpcodeTablePtr = &RegOp2MemOpTable0;
+  } else if (OpNum == 1) {
+    OpcodeTablePtr = &RegOp2MemOpTable1;
+  } else if (OpNum == 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2;
+  } else if (OpNum == 3) {
+    OpcodeTablePtr = &RegOp2MemOpTable3;
+  }
+
+  if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
+    return true;
+  return TargetInstrInfo::canFoldMemoryOperand(MI, Ops);
+}
+
+bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                                unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+                                SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find(MI->getOpcode());
+  if (I == MemOp2RegOpTable.end())
+    return false;
+  unsigned Opc = I->second.first;
+  unsigned Index = I->second.second & TB_INDEX_MASK;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  if (UnfoldLoad && !FoldedLoad)
+    return false;
+  UnfoldLoad &= FoldedLoad;
+  if (UnfoldStore && !FoldedStore)
+    return false;
+  UnfoldStore &= FoldedStore;
+
+  const MCInstrDesc &MCID = get(Opc);
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+  if (!MI->hasOneMemOperand() &&
+      RC == &X86::VR128RegClass &&
+      !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+    // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
+    // conservatively assume the address is unaligned. That's bad for
+    // performance.
+    return false;
+  SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
+  SmallVector<MachineOperand,2> BeforeOps;
+  SmallVector<MachineOperand,2> AfterOps;
+  SmallVector<MachineOperand,4> ImpOps;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &Op = MI->getOperand(i);
+    if (i >= Index && i < Index + X86::AddrNumOperands)
+      AddrOps.push_back(Op);
+    else if (Op.isReg() && Op.isImplicit())
+      ImpOps.push_back(Op);
+    else if (i < Index)
+      BeforeOps.push_back(Op);
+    else if (i > Index)
+      AfterOps.push_back(Op);
+  }
+
+  // Emit the load instruction.
+  if (UnfoldLoad) {
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractLoadMemRefs(MI->memoperands_begin(),
+                            MI->memoperands_end());
+    loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
+    if (UnfoldStore) {
+      // Address operands cannot be marked isKill.
+      for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
+        MachineOperand &MO = NewMIs[0]->getOperand(i);
+        if (MO.isReg())
+          MO.setIsKill(false);
+      }
+    }
+  }
+
+  // Emit the data processing instruction.
+  MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, DataMI);
+
+  if (FoldedStore)
+    MIB.addReg(Reg, RegState::Define);
+  for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i)
+    MIB.addOperand(BeforeOps[i]);
+  if (FoldedLoad)
+    MIB.addReg(Reg);
+  for (unsigned i = 0, e = AfterOps.size(); i != e; ++i)
+    MIB.addOperand(AfterOps[i]);
+  for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) {
+    MachineOperand &MO = ImpOps[i];
+    MIB.addReg(MO.getReg(),
+               getDefRegState(MO.isDef()) |
+               RegState::Implicit |
+               getKillRegState(MO.isKill()) |
+               getDeadRegState(MO.isDead()) |
+               getUndefRegState(MO.isUndef()));
+  }
+  // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+  switch (DataMI->getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP8ri: {
+    MachineOperand &MO0 = DataMI->getOperand(0);
+    MachineOperand &MO1 = DataMI->getOperand(1);
+    if (MO1.getImm() == 0) {
+      unsigned NewOpc;
+      switch (DataMI->getOpcode()) {
+      default: llvm_unreachable("Unreachable!");
+      case X86::CMP64ri8:
+      case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+      case X86::CMP32ri8:
+      case X86::CMP32ri:   NewOpc = X86::TEST32rr; break;
+      case X86::CMP16ri8:
+      case X86::CMP16ri:   NewOpc = X86::TEST16rr; break;
+      case X86::CMP8ri:    NewOpc = X86::TEST8rr; break;
+      }
+      DataMI->setDesc(get(NewOpc));
+      MO1.ChangeToRegister(MO0.getReg(), false);
+    }
+  }
+  }
+  NewMIs.push_back(DataMI);
+
+  // Emit the store instruction.
+  if (UnfoldStore) {
+    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractStoreMemRefs(MI->memoperands_begin(),
+                             MI->memoperands_end());
+    storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
+  }
+
+  return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                                  SmallVectorImpl<SDNode*> &NewNodes) const {
+  if (!N->isMachineOpcode())
+    return false;
+
+  DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find(N->getMachineOpcode());
+  if (I == MemOp2RegOpTable.end())
+    return false;
+  unsigned Opc = I->second.first;
+  unsigned Index = I->second.second & TB_INDEX_MASK;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  const MCInstrDesc &MCID = get(Opc);
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
+  unsigned NumDefs = MCID.NumDefs;
+  std::vector<SDValue> AddrOps;
+  std::vector<SDValue> BeforeOps;
+  std::vector<SDValue> AfterOps;
+  DebugLoc dl = N->getDebugLoc();
+  unsigned NumOps = N->getNumOperands();
+  for (unsigned i = 0; i != NumOps-1; ++i) {
+    SDValue Op = N->getOperand(i);
+    if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
+      AddrOps.push_back(Op);
+    else if (i < Index-NumDefs)
+      BeforeOps.push_back(Op);
+    else if (i > Index-NumDefs)
+      AfterOps.push_back(Op);
+  }
+  SDValue Chain = N->getOperand(NumOps-1);
+  AddrOps.push_back(Chain);
+
+  // Emit the load instruction.
+  SDNode *Load = 0;
+  if (FoldedLoad) {
+    EVT VT = *RC->vt_begin();
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                            cast<MachineSDNode>(N)->memoperands_end());
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned load.
+      return false;
+    unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+    bool isAligned = (*MMOs.first) &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
+    Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
+                              VT, MVT::Other, &AddrOps[0], AddrOps.size());
+    NewNodes.push_back(Load);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  // Emit the data processing instruction.
+  std::vector<EVT> VTs;
+  const TargetRegisterClass *DstRC = 0;
+  if (MCID.getNumDefs() > 0) {
+    DstRC = getRegClass(MCID, 0, &RI, MF);
+    VTs.push_back(*DstRC->vt_begin());
+  }
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
+      VTs.push_back(VT);
+  }
+  if (Load)
+    BeforeOps.push_back(SDValue(Load, 0));
+  std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
+  SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, &BeforeOps[0],
+                                      BeforeOps.size());
+  NewNodes.push_back(NewNode);
+
+  // Emit the store instruction.
+  if (FoldedStore) {
+    AddrOps.pop_back();
+    AddrOps.push_back(SDValue(NewNode, 0));
+    AddrOps.push_back(Chain);
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                             cast<MachineSDNode>(N)->memoperands_end());
+    if (!(*MMOs.first) &&
+        RC == &X86::VR128RegClass &&
+        !TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
+      // Do not introduce a slow unaligned store.
+      return false;
+    unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+    bool isAligned = (*MMOs.first) &&
+                     (*MMOs.first)->getAlignment() >= Alignment;
+    SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
+                                                         isAligned, TM),
+                                       dl, MVT::Other,
+                                       &AddrOps[0], AddrOps.size());
+    NewNodes.push_back(Store);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                      bool UnfoldLoad, bool UnfoldStore,
+                                      unsigned *LoadRegIndex) const {
+  DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find(Opc);
+  if (I == MemOp2RegOpTable.end())
+    return 0;
+  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
+  if (UnfoldLoad && !FoldedLoad)
+    return 0;
+  if (UnfoldStore && !FoldedStore)
+    return 0;
+  if (LoadRegIndex)
+    *LoadRegIndex = I->second.second & TB_INDEX_MASK;
+  return I->second.first;
+}
+
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                     int64_t &Offset1, int64_t &Offset2) const {
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  switch (Opc1) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+    break;
+  }
+  switch (Opc2) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  // AVX load instructions
+  case X86::VMOVSSrm:
+  case X86::VMOVSDrm:
+  case X86::FsVMOVAPSrm:
+  case X86::FsVMOVAPDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPSrm:
+  case X86::VMOVAPDrm:
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPSYrm:
+  case X86::VMOVAPDYrm:
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+    break;
+  }
+
+  // Check if chain operands and base addresses match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(5) != Load2->getOperand(5))
+    return false;
+  // Segment operands should match as well.
+  if (Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+  // Scale should be 1, Index should be Reg0.
+  if (Load1->getOperand(1) == Load2->getOperand(1) &&
+      Load1->getOperand(2) == Load2->getOperand(2)) {
+    if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+      return false;
+
+    // Now let's examine the displacements.
+    if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+        isa<ConstantSDNode>(Load2->getOperand(3))) {
+      Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+      Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+      return true;
+    }
+  }
+  return false;
+}
+
+bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                           int64_t Offset1, int64_t Offset2,
+                                           unsigned NumLoads) const {
+  assert(Offset2 > Offset1);
+  if ((Offset2 - Offset1) / 8 > 64)
+    return false;
+
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  if (Opc1 != Opc2)
+    return false;  // FIXME: overly conservative?
+
+  switch (Opc1) {
+  default: break;
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    return false;
+  }
+
+  EVT VT = Load1->getValueType(0);
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
+    // have 16 of them to play with.
+    if (TM.getSubtargetImpl()->is64Bit()) {
+      if (NumLoads >= 3)
+        return false;
+    } else if (NumLoads) {
+      return false;
+    }
+    break;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+  case MVT::f32:
+  case MVT::f64:
+    if (NumLoads)
+      return false;
+    break;
+  }
+
+  return true;
+}
+
+
+bool X86InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+  X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+  if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+    return true;
+  Cond[0].setImm(GetOppositeBranchCondition(CC));
+  return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  // FIXME: Return false for x87 stack register classes for now. We can't
+  // allow any loads of these registers before FpGet_ST0_80.
+  return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+           RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the
+/// the global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+///
+/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
+         "X86-64 PIC uses RIP relative addressing");
+
+  X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+  unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Create the register. The code to initialize it is inserted
+  // later, by the CGBR pass (below).
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  X86FI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
+
+// These are the replaceable SSE instructions. Some of these have Int variants
+// that we don't include here. We don't want to replace instructions selected
+// by intrinsics.
+static const uint16_t ReplaceableInstrs[][3] = {
+  //PackedSingle     PackedDouble    PackedInt
+  { X86::MOVAPSmr,   X86::MOVAPDmr,  X86::MOVDQAmr  },
+  { X86::MOVAPSrm,   X86::MOVAPDrm,  X86::MOVDQArm  },
+  { X86::MOVAPSrr,   X86::MOVAPDrr,  X86::MOVDQArr  },
+  { X86::MOVUPSmr,   X86::MOVUPDmr,  X86::MOVDQUmr  },
+  { X86::MOVUPSrm,   X86::MOVUPDrm,  X86::MOVDQUrm  },
+  { X86::MOVNTPSmr,  X86::MOVNTPDmr, X86::MOVNTDQmr },
+  { X86::ANDNPSrm,   X86::ANDNPDrm,  X86::PANDNrm   },
+  { X86::ANDNPSrr,   X86::ANDNPDrr,  X86::PANDNrr   },
+  { X86::ANDPSrm,    X86::ANDPDrm,   X86::PANDrm    },
+  { X86::ANDPSrr,    X86::ANDPDrr,   X86::PANDrr    },
+  { X86::ORPSrm,     X86::ORPDrm,    X86::PORrm     },
+  { X86::ORPSrr,     X86::ORPDrr,    X86::PORrr     },
+  { X86::XORPSrm,    X86::XORPDrm,   X86::PXORrm    },
+  { X86::XORPSrr,    X86::XORPDrr,   X86::PXORrr    },
+  // AVX 128-bit support
+  { X86::VMOVAPSmr,  X86::VMOVAPDmr,  X86::VMOVDQAmr  },
+  { X86::VMOVAPSrm,  X86::VMOVAPDrm,  X86::VMOVDQArm  },
+  { X86::VMOVAPSrr,  X86::VMOVAPDrr,  X86::VMOVDQArr  },
+  { X86::VMOVUPSmr,  X86::VMOVUPDmr,  X86::VMOVDQUmr  },
+  { X86::VMOVUPSrm,  X86::VMOVUPDrm,  X86::VMOVDQUrm  },
+  { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
+  { X86::VANDNPSrm,  X86::VANDNPDrm,  X86::VPANDNrm   },
+  { X86::VANDNPSrr,  X86::VANDNPDrr,  X86::VPANDNrr   },
+  { X86::VANDPSrm,   X86::VANDPDrm,   X86::VPANDrm    },
+  { X86::VANDPSrr,   X86::VANDPDrr,   X86::VPANDrr    },
+  { X86::VORPSrm,    X86::VORPDrm,    X86::VPORrm     },
+  { X86::VORPSrr,    X86::VORPDrr,    X86::VPORrr     },
+  { X86::VXORPSrm,   X86::VXORPDrm,   X86::VPXORrm    },
+  { X86::VXORPSrr,   X86::VXORPDrr,   X86::VPXORrr    },
+  // AVX 256-bit support
+  { X86::VMOVAPSYmr,   X86::VMOVAPDYmr,   X86::VMOVDQAYmr  },
+  { X86::VMOVAPSYrm,   X86::VMOVAPDYrm,   X86::VMOVDQAYrm  },
+  { X86::VMOVAPSYrr,   X86::VMOVAPDYrr,   X86::VMOVDQAYrr  },
+  { X86::VMOVUPSYmr,   X86::VMOVUPDYmr,   X86::VMOVDQUYmr  },
+  { X86::VMOVUPSYrm,   X86::VMOVUPDYrm,   X86::VMOVDQUYrm  },
+  { X86::VMOVNTPSYmr,  X86::VMOVNTPDYmr,  X86::VMOVNTDQYmr }
+};
+
+static const uint16_t ReplaceableInstrsAVX2[][3] = {
+  //PackedSingle       PackedDouble       PackedInt
+  { X86::VANDNPSYrm,   X86::VANDNPDYrm,   X86::VPANDNYrm   },
+  { X86::VANDNPSYrr,   X86::VANDNPDYrr,   X86::VPANDNYrr   },
+  { X86::VANDPSYrm,    X86::VANDPDYrm,    X86::VPANDYrm    },
+  { X86::VANDPSYrr,    X86::VANDPDYrr,    X86::VPANDYrr    },
+  { X86::VORPSYrm,     X86::VORPDYrm,     X86::VPORYrm     },
+  { X86::VORPSYrr,     X86::VORPDYrr,     X86::VPORYrr     },
+  { X86::VXORPSYrm,    X86::VXORPDYrm,    X86::VPXORYrm    },
+  { X86::VXORPSYrr,    X86::VXORPDYrr,    X86::VPXORYrr    },
+  { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+  { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+  { X86::VINSERTF128rm,  X86::VINSERTF128rm,  X86::VINSERTI128rm },
+  { X86::VINSERTF128rr,  X86::VINSERTF128rr,  X86::VINSERTI128rr },
+  { X86::VPERM2F128rm,   X86::VPERM2F128rm,   X86::VPERM2I128rm },
+  { X86::VPERM2F128rr,   X86::VPERM2F128rr,   X86::VPERM2I128rr }
+};
+
+// FIXME: Some shuffle and unpack instructions have equivalents in different
+// domains, but they require a bit more work than just switching opcodes.
+
+static const uint16_t *lookup(unsigned opcode, unsigned domain) {
+  for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
+    if (ReplaceableInstrs[i][domain-1] == opcode)
+      return ReplaceableInstrs[i];
+  return 0;
+}
+
+static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
+  for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
+    if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
+      return ReplaceableInstrsAVX2[i];
+  return 0;
+}
+
+std::pair<uint16_t, uint16_t>
+X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+  uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2();
+  uint16_t validDomains = 0;
+  if (domain && lookup(MI->getOpcode(), domain))
+    validDomains = 0xe;
+  else if (domain && lookupAVX2(MI->getOpcode(), domain))
+    validDomains = hasAVX2 ? 0xe : 0x6;
+  return std::make_pair(domain, validDomains);
+}
+
+void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+  assert(Domain>0 && Domain<4 && "Invalid execution domain");
+  uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
+  assert(dom && "Not an SSE instruction");
+  const uint16_t *table = lookup(MI->getOpcode(), dom);
+  if (!table) { // try the other table
+    assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) &&
+           "256-bit vector operations only available in AVX2");
+    table = lookupAVX2(MI->getOpcode(), dom);
+  }
+  assert(table && "Cannot change domain");
+  MI->setDesc(get(table[Domain-1]));
+}
+
+/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
+void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(X86::NOOP);
+}
+
+bool X86InstrInfo::isHighLatencyDef(int opc) const {
+  switch (opc) {
+  default: return false;
+  case X86::DIVSDrm:
+  case X86::DIVSDrm_Int:
+  case X86::DIVSDrr:
+  case X86::DIVSDrr_Int:
+  case X86::DIVSSrm:
+  case X86::DIVSSrm_Int:
+  case X86::DIVSSrr:
+  case X86::DIVSSrr_Int:
+  case X86::SQRTPDm:
+  case X86::SQRTPDr:
+  case X86::SQRTPSm:
+  case X86::SQRTPSr:
+  case X86::SQRTSDm:
+  case X86::SQRTSDm_Int:
+  case X86::SQRTSDr:
+  case X86::SQRTSDr_Int:
+  case X86::SQRTSSm:
+  case X86::SQRTSSm_Int:
+  case X86::SQRTSSr:
+  case X86::SQRTSSr_Int:
+  // AVX instructions with high latency
+  case X86::VDIVSDrm:
+  case X86::VDIVSDrm_Int:
+  case X86::VDIVSDrr:
+  case X86::VDIVSDrr_Int:
+  case X86::VDIVSSrm:
+  case X86::VDIVSSrm_Int:
+  case X86::VDIVSSrr:
+  case X86::VDIVSSrr_Int:
+  case X86::VSQRTPDm:
+  case X86::VSQRTPDr:
+  case X86::VSQRTPSm:
+  case X86::VSQRTPSr:
+  case X86::VSQRTSDm:
+  case X86::VSQRTSDm_Int:
+  case X86::VSQRTSDr:
+  case X86::VSQRTSSm:
+  case X86::VSQRTSSm_Int:
+  case X86::VSQRTSSr:
+    return true;
+  }
+}
+
+bool X86InstrInfo::
+hasHighOperandLatency(const InstrItineraryData *ItinData,
+                      const MachineRegisterInfo *MRI,
+                      const MachineInstr *DefMI, unsigned DefIdx,
+                      const MachineInstr *UseMI, unsigned UseIdx) const {
+  return isHighLatencyDef(DefMI->getOpcode());
+}
+
+namespace {
+  /// CGBR - Create Global Base Reg pass. This initializes the PIC
+  /// global base register for x86-32.
+  struct CGBR : public MachineFunctionPass {
+    static char ID;
+    CGBR() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      const X86TargetMachine *TM =
+        static_cast<const X86TargetMachine *>(&MF.getTarget());
+
+      assert(!TM->getSubtarget<X86Subtarget>().is64Bit() &&
+             "X86-64 PIC uses RIP relative addressing");
+
+      // Only emit a global base reg in PIC mode.
+      if (TM->getRelocationModel() != Reloc::PIC_)
+        return false;
+
+      X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+      unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+
+      // If we didn't need a GlobalBaseReg, don't insert code.
+      if (GlobalBaseReg == 0)
+        return false;
+
+      // Insert the set of GlobalBaseReg into the first MBB of the function
+      MachineBasicBlock &FirstMBB = MF.front();
+      MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+      DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+      MachineRegisterInfo &RegInfo = MF.getRegInfo();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      unsigned PC;
+      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
+        PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+      else
+        PC = GlobalBaseReg;
+
+      // Operand of MovePCtoStack is completely ignored by asm printer. It's
+      // only used in JIT code emission as displacement to pc.
+      BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+
+      // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+      // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+      if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+        // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+          .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                                        X86II::MO_GOT_ABSOLUTE_ADDRESS);
+      }
+
+      return true;
+    }
+
+    virtual const char *getPassName() const {
+      return "X86 PIC Global Base Reg Initialization";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char CGBR::ID = 0;
+FunctionPass*
+llvm::createGlobalBaseRegPass() { return new CGBR(); }
+
+namespace {
+  struct LDTLSCleanup : public MachineFunctionPass {
+    static char ID;
+    LDTLSCleanup() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
+      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
+        // No point folding accesses if there isn't at least two.
+        return false;
+      }
+
+      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+      return VisitNode(DT->getRootNode(), 0);
+    }
+
+    // Visit the dominator subtree rooted at Node in pre-order.
+    // If TLSBaseAddrReg is non-null, then use that to replace any
+    // TLS_base_addr instructions. Otherwise, create the register
+    // when the first such instruction is seen, and then use it
+    // as we encounter more instructions.
+    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
+      MachineBasicBlock *BB = Node->getBlock();
+      bool Changed = false;
+
+      // Traverse the current block.
+      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+           ++I) {
+        switch (I->getOpcode()) {
+          case X86::TLS_base_addr32:
+          case X86::TLS_base_addr64:
+            if (TLSBaseAddrReg)
+              I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
+            else
+              I = SetRegister(I, &TLSBaseAddrReg);
+            Changed = true;
+            break;
+          default:
+            break;
+        }
+      }
+
+      // Visit the children of this block in the dominator tree.
+      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
+           I != E; ++I) {
+        Changed |= VisitNode(*I, TLSBaseAddrReg);
+      }
+
+      return Changed;
+    }
+
+    // Replace the TLS_base_addr instruction I with a copy from
+    // TLSBaseAddrReg, returning the new instruction.
+    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
+                                         unsigned TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
+      MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   is64Bit ? X86::RAX : X86::EAX)
+                                   .addReg(TLSBaseAddrReg);
+
+      // Erase the TLS_base_addr instruction.
+      I->eraseFromParent();
+
+      return Copy;
+    }
+
+    // Create a virtal register in *TLSBaseAddrReg, and populate it by
+    // inserting a copy instruction after I. Returns the new instruction.
+    MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
+      MachineFunction *MF = I->getParent()->getParent();
+      const X86TargetMachine *TM =
+          static_cast<const X86TargetMachine *>(&MF->getTarget());
+      const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
+      const X86InstrInfo *TII = TM->getInstrInfo();
+
+      // Create a virtual register for the TLS base address.
+      MachineRegisterInfo &RegInfo = MF->getRegInfo();
+      *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
+                                                      ? &X86::GR64RegClass
+                                                      : &X86::GR32RegClass);
+
+      // Insert a copy from RAX/EAX to TLSBaseAddrReg.
+      MachineInstr *Next = I->getNextNode();
+      MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
+                                   TII->get(TargetOpcode::COPY),
+                                   *TLSBaseAddrReg)
+                                   .addReg(is64Bit ? X86::RAX : X86::EAX);
+
+      return Copy;
+    }
+
+    virtual const char *getPassName() const {
+      return "Local Dynamic TLS Access Clean-up";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  };
+}
+
+char LDTLSCleanup::ID = 0;
+FunctionPass*
+llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 000000000000..260f054d69cb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,419 @@
+//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "X86GenInstrInfo.inc"
+
+namespace llvm {
+  class X86RegisterInfo;
+  class X86TargetMachine;
+
+namespace X86 {
+  // X86 specific condition code. These correspond to X86_*_COND in
+  // X86InstrInfo.td. They must be kept in synch.
+  enum CondCode {
+    COND_A  = 0,
+    COND_AE = 1,
+    COND_B  = 2,
+    COND_BE = 3,
+    COND_E  = 4,
+    COND_G  = 5,
+    COND_GE = 6,
+    COND_L  = 7,
+    COND_LE = 8,
+    COND_NE = 9,
+    COND_NO = 10,
+    COND_NP = 11,
+    COND_NS = 12,
+    COND_O  = 13,
+    COND_P  = 14,
+    COND_S  = 15,
+
+    // Artificial condition codes. These are used by AnalyzeBranch
+    // to indicate a block terminated with two conditional branches to
+    // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
+    // which can't be represented on x86 with a single condition. These
+    // are never used in MachineInstrs.
+    COND_NE_OR_P,
+    COND_NP_OR_E,
+
+    COND_INVALID
+  };
+
+  // Turn condition code into conditional branch opcode.
+  unsigned GetCondBranchFromCond(CondCode CC);
+
+  // Turn CMov opcode into condition code.
+  CondCode getCondFromCMovOpc(unsigned Opc);
+
+  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+  /// e.g. turning COND_E to COND_NE.
+  CondCode GetOppositeBranchCondition(X86::CondCode CC);
+}  // end namespace X86;
+
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_DLLIMPORT: // dllimport stub.
+  case X86II::MO_GOTPCREL:  // rip-relative GOT reference.
+  case X86II::MO_GOT:       // normal GOT reference.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:        // Normal $non_lazy_ptr ref.
+  case X86II::MO_DARWIN_NONLAZY:                 // Normal $non_lazy_ptr ref.
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref.
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg).  If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_GOTOFF:                         // isPICStyleGOT: local global.
+  case X86II::MO_GOT:                            // isPICStyleGOT: other global.
+  case X86II::MO_PIC_BASE_OFFSET:                // Darwin local global.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:        // Darwin/32 external global.
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
+  case X86II::MO_TLVP:                           // ??? Pretty sure..
+    return true;
+  default:
+    return false;
+  }
+}
+
+inline static bool isScale(const MachineOperand &MO) {
+  return MO.isImm() &&
+    (MO.getImm() == 1 || MO.getImm() == 2 ||
+     MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
+  if (MI->getOperand(Op).isFI()) return true;
+  return Op+4 <= MI->getNumOperands() &&
+    MI->getOperand(Op  ).isReg() && isScale(MI->getOperand(Op+1)) &&
+    MI->getOperand(Op+2).isReg() &&
+    (MI->getOperand(Op+3).isImm() ||
+     MI->getOperand(Op+3).isGlobal() ||
+     MI->getOperand(Op+3).isCPI() ||
+     MI->getOperand(Op+3).isJTI());
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+  if (MI->getOperand(Op).isFI()) return true;
+  return Op+5 <= MI->getNumOperands() &&
+    MI->getOperand(Op+4).isReg() &&
+    isLeaMem(MI, Op);
+}
+
+class X86InstrInfo : public X86GenInstrInfo {
+  X86TargetMachine &TM;
+  const X86RegisterInfo RI;
+
+  /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
+  /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
+  ///
+  typedef DenseMap<unsigned,
+                   std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
+  RegOp2MemOpTableType RegOp2MemOpTable2Addr;
+  RegOp2MemOpTableType RegOp2MemOpTable0;
+  RegOp2MemOpTableType RegOp2MemOpTable1;
+  RegOp2MemOpTableType RegOp2MemOpTable2;
+  RegOp2MemOpTableType RegOp2MemOpTable3;
+
+  /// MemOp2RegOpTable - Load / store unfolding opcode map.
+  ///
+  typedef DenseMap<unsigned,
+                   std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
+  MemOp2RegOpTableType MemOp2RegOpTable;
+
+  static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+                            MemOp2RegOpTableType &M2RTable,
+                            unsigned RegOp, unsigned MemOp, unsigned Flags);
+
+public:
+  explicit X86InstrInfo(X86TargetMachine &tm);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const X86RegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
+  /// extension instruction. That is, it's like a copy where it's legal for the
+  /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
+  /// true, then it's expected the pre-extension value is available as a subreg
+  /// of the result register. This also returns the sub-register index in
+  /// SubIdx.
+  virtual bool isCoalescableExtInstr(const MachineInstr &MI,
+                                     unsigned &SrcReg, unsigned &DstReg,
+                                     unsigned &SubIdx) const;
+
+  unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
+  /// stack locations as well.  This uses a heuristic so it isn't
+  /// reliable for correctness.
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                     int &FrameIndex) const;
+
+  unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
+  /// stack locations as well.  This uses a heuristic so it isn't
+  /// reliable for correctness.
+  unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                    int &FrameIndex) const;
+
+  bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
+                                         AliasAnalysis *AA) const;
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, unsigned SubIdx,
+                     const MachineInstr *Orig,
+                     const TargetRegisterInfo &TRI) const;
+
+  /// convertToThreeAddress - This method must be implemented by targets that
+  /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+  /// may be able to convert a two-address instruction into a true
+  /// three-address instruction on demand.  This allows the X86 target (for
+  /// example) to convert ADD and SHL instructions into LEA instructions if they
+  /// would require register copies due to two-addressness.
+  ///
+  /// This method returns a null pointer if the transformation cannot be
+  /// performed, otherwise it returns the new instruction.
+  ///
+  virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                              MachineBasicBlock::iterator &MBBI,
+                                              LiveVariables *LV) const;
+
+  /// commuteInstruction - We have a few instructions that must be hacked on to
+  /// commute them.
+  ///
+  virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+
+  // Branch analysis.
+  virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+  virtual bool canInsertSelect(const MachineBasicBlock&,
+                               const SmallVectorImpl<MachineOperand> &Cond,
+                               unsigned, unsigned, int&, int&, int&) const;
+  virtual void insertSelect(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI, DebugLoc DL,
+                            unsigned DstReg,
+                            const SmallVectorImpl<MachineOperand> &Cond,
+                            unsigned TrueReg, unsigned FalseReg) const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const;
+
+  virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+                              SmallVectorImpl<MachineOperand> &Addr,
+                              const TargetRegisterClass *RC,
+                              MachineInstr::mmo_iterator MMOBegin,
+                              MachineInstr::mmo_iterator MMOEnd,
+                              SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+
+  virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                               SmallVectorImpl<MachineOperand> &Addr,
+                               const TargetRegisterClass *RC,
+                               MachineInstr::mmo_iterator MMOBegin,
+                               MachineInstr::mmo_iterator MMOEnd,
+                               SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
+  virtual
+  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+                                         int FrameIx, uint64_t Offset,
+                                         const MDNode *MDPtr,
+                                         DebugLoc DL) const;
+
+  /// foldMemoryOperand - If this target supports it, fold a load or store of
+  /// the specified stack slot into the specified machine instruction for the
+  /// specified operand(s).  If this is possible, the target should perform the
+  /// folding and return true, otherwise it should return false.  If it folds
+  /// the instruction, it is likely that the MachineInstruction the iterator
+  /// references has been changed.
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              int FrameIndex) const;
+
+  /// foldMemoryOperand - Same as the previous version except it allows folding
+  /// of any load and store from / to any address, not just from a specific
+  /// stack slot.
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              MachineInstr* LoadMI) const;
+
+  /// canFoldMemoryOperand - Returns true if the specified load / store is
+  /// folding is possible.
+  virtual bool canFoldMemoryOperand(const MachineInstr*,
+                                    const SmallVectorImpl<unsigned> &) const;
+
+  /// unfoldMemoryOperand - Separate a single instruction which folded a load or
+  /// a store or a load and a store into two or more instruction. If this is
+  /// possible, returns true as well as the new instructions by reference.
+  virtual bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                           unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+                           SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                           SmallVectorImpl<SDNode*> &NewNodes) const;
+
+  /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
+  /// instruction after load / store are unfolded from an instruction of the
+  /// specified opcode. It returns zero if the specified unfolding is not
+  /// possible. If LoadRegIndex is non-null, it is filled in with the operand
+  /// index of the operand which will hold the register holding the loaded
+  /// value.
+  virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                      bool UnfoldLoad, bool UnfoldStore,
+                                      unsigned *LoadRegIndex = 0) const;
+
+  /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
+  /// to determine if two loads are loading from the same base address. It
+  /// should only return true if the base pointers are the same and the
+  /// only differences between the two addresses are the offset. It also returns
+  /// the offsets by reference.
+  virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                       int64_t &Offset1, int64_t &Offset2) const;
+
+  /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
+  /// be scheduled togther. On some targets if two loads are loading from
+  /// addresses in the same cache line, it's better if they are scheduled
+  /// together. This function takes two integers that represent the load offsets
+  /// from the common base address. It returns true if it decides it's desirable
+  /// to schedule the two loads together. "NumLoads" is the number of loads that
+  /// have already been scheduled after Load1.
+  virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                       int64_t Offset1, int64_t Offset2,
+                                       unsigned NumLoads) const;
+
+  virtual void getNoopForMachoTarget(MCInst &NopInst) const;
+
+  virtual
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
+  /// instruction that defines the specified register class.
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+
+  static bool isX86_64ExtendedReg(const MachineOperand &MO) {
+    if (!MO.isReg()) return false;
+    return X86II::isX86_64ExtendedReg(MO.getReg());
+  }
+
+  /// getGlobalBaseReg - Return a virtual register initialized with the
+  /// the global base register value. Output instructions required to
+  /// initialize the register in the function entry block, if necessary.
+  ///
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+  std::pair<uint16_t, uint16_t>
+  getExecutionDomain(const MachineInstr *MI) const;
+
+  void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+
+  unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+                                        const TargetRegisterInfo *TRI) const;
+  void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+                                 const TargetRegisterInfo *TRI) const;
+
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      unsigned OpNum,
+                                      const SmallVectorImpl<MachineOperand> &MOs,
+                                      unsigned Size, unsigned Alignment) const;
+
+  bool isHighLatencyDef(int opc) const;
+
+  bool hasHighOperandLatency(const InstrItineraryData *ItinData,
+                             const MachineRegisterInfo *MRI,
+                             const MachineInstr *DefMI, unsigned DefIdx,
+                             const MachineInstr *UseMI, unsigned UseIdx) const;
+
+  /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2 if having two register operands, and the value it
+  /// compares against in CmpValue. Return true if the comparison instruction
+  /// can be analyzed.
+  virtual bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                              unsigned &SrcReg2,
+                              int &CmpMask, int &CmpValue) const;
+
+  /// optimizeCompareInstr - Check if there exists an earlier instruction that
+  /// operates on the same source operands and sets flags in the same way as
+  /// Compare; remove Compare if possible.
+  virtual bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                                    unsigned SrcReg2, int CmpMask, int CmpValue,
+                                    const MachineRegisterInfo *MRI) const;
+
+  /// optimizeLoadInstr - Try to remove the load by folding it to a register
+  /// operand at the use. We fold the load instructions if and only if the
+  /// def and use are in the same BB. We only look at one load and see
+  /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
+  /// defined by the load we are trying to fold. DefMI returns the machine
+  /// instruction that defines FoldAsLoadDefReg, and the function returns
+  /// the machine instruction generated due to folding.
+  virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+                        const MachineRegisterInfo *MRI,
+                        unsigned &FoldAsLoadDefReg,
+                        MachineInstr *&DefMI) const;
+
+private:
+  MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
+                                              MachineFunction::iterator &MFI,
+                                              MachineBasicBlock::iterator &MBBI,
+                                              LiveVariables *LV) const;
+
+  /// isFrameOperand - Return true and the FrameIndex if the specified
+  /// operand and follow operands form a reference to the stack frame.
+  bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
+                      int &FrameIndex) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 000000000000..ccc1aa2e35a5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,2197 @@
+//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                   SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
+
+def SDTX86Cmpsd : SDTypeProfile<1, 3, [SDTCisVT<0, f64>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+
+def SDTX86Cmov    : SDTypeProfile<1, 4,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                   SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1,
+                                           [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
+// RES1, RES2, FLAGS = op LHS, RHS
+def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>, SDTCisVT<1, i32>]>;
+def SDTX86BrCond  : SDTypeProfile<0, 3,
+                                  [SDTCisVT<0, OtherVT>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC   : SDTypeProfile<1, 2,
+                                  [SDTCisVT<0, i8>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86SetCC_C : SDTypeProfile<1, 2,
+                                  [SDTCisInt<0>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>;
+
+def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
+                                     SDTCisVT<2, i8>]>;
+def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
+                                SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
+def SDTX86Ret     : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32>]>;
+
+def SDT_X86Call   : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+                                                         SDTCisVT<1, iPTR>,
+                                                         SDTCisVT<2, iPTR>]>;
+
+def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>,
+                                            SDTCisPtrTy<1>,
+                                            SDTCisVT<2, i32>,
+                                            SDTCisVT<3, i8>,
+                                            SDTCisVT<4, i32>]>;
+
+def SDTX86RepStr  : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86Void    : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
+def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
+                            [SDNPHasChain,SDNPSideEffect]>;
+def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
+                        [SDNPHasChain]>;
+def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
+                        [SDNPHasChain]>;
+def X86LFence : SDNode<"X86ISD::LFENCE", SDT_X86MEMBARRIER,
+                        [SDNPHasChain]>;
+
+
+def X86bsf     : SDNode<"X86ISD::BSF",      SDTUnaryArithWithFlags>;
+def X86bsr     : SDNode<"X86ISD::BSR",      SDTUnaryArithWithFlags>;
+def X86shld    : SDNode<"X86ISD::SHLD",     SDTIntShiftDOp>;
+def X86shrd    : SDNode<"X86ISD::SHRD",     SDTIntShiftDOp>;
+
+def X86cmp     : SDNode<"X86ISD::CMP" ,     SDTX86CmpTest>;
+def X86bt      : SDNode<"X86ISD::BT",       SDTX86CmpTest>;
+
+def X86cmov    : SDNode<"X86ISD::CMOV",     SDTX86Cmov>;
+def X86brcond  : SDNode<"X86ISD::BRCOND",   SDTX86BrCond,
+                        [SDNPHasChain]>;
+def X86setcc   : SDNode<"X86ISD::SETCC",    SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+
+def X86sahf    : SDNode<"X86ISD::SAHF",     SDTX86sahf>;
+
+def X86rdrand  : SDNode<"X86ISD::RDRAND",   SDTX86rdrand,
+                        [SDNPHasChain, SDNPSideEffect]>;
+
+def X86rdseed  : SDNode<"X86ISD::RDSEED",   SDTX86rdrand,
+                        [SDNPHasChain, SDNPSideEffect]>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+
+def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore,
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def X86vastart_save_xmm_regs :
+                 SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+                        SDT_X86VASTART_SAVE_XMM_REGS,
+                        [SDNPHasChain, SDNPVariadic]>;
+def X86vaarg64 :
+                 SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+                         SDNPMemOperand]>;
+def X86callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+                        [SDNPHasChain, SDNPOutGlue]>;
+def X86callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_X86CallSeqEnd,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86call    : SDNode<"X86ISD::CALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                         SDNPVariadic]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+                         SDNPMayLoad]>;
+
+def X86rdtsc   : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
+                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+
+def X86Wrapper    : SDNode<"X86ISD::Wrapper",     SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP",  SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+                        [SDNPHasChain]>;
+
+def X86eh_sjlj_setjmp  : SDNode<"X86ISD::EH_SJLJ_SETJMP",
+                                SDTypeProfile<1, 1, [SDTCisInt<0>,
+                                                     SDTCisPtrTy<1>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
+                                SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPSideEffect]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
+                        [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def X86add_flag  : SDNode<"X86ISD::ADD",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86sub_flag  : SDNode<"X86ISD::SUB",  SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86adc_flag  : SDNode<"X86ISD::ADC",  SDTBinaryArithWithFlagsInOut>;
+def X86sbb_flag  : SDNode<"X86ISD::SBB",  SDTBinaryArithWithFlagsInOut>;
+
+def X86inc_flag  : SDNode<"X86ISD::INC",  SDTUnaryArithWithFlags>;
+def X86dec_flag  : SDNode<"X86ISD::DEC",  SDTUnaryArithWithFlags>;
+def X86or_flag   : SDNode<"X86ISD::OR",   SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86xor_flag  : SDNode<"X86ISD::XOR",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86and_flag  : SDNode<"X86ISD::AND",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>;
+
+def X86blsi   : SDNode<"X86ISD::BLSI",   SDTIntUnaryOp>;
+def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>;
+def X86blsr   : SDNode<"X86ISD::BLSR",   SDTIntUnaryOp>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void,
+                          [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+
+def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
+                          [SDNPHasChain]>;
+
+def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
+                        [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL,
+                        [SDNPHasChain, SDNPOutGlue]>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+def X86MemAsmOperand : AsmOperandClass { 
+ let Name = "Mem"; let PredicateMethod = "isMem"; 
+}
+def X86Mem8AsmOperand : AsmOperandClass { 
+  let Name = "Mem8"; let PredicateMethod = "isMem8";
+}
+def X86Mem16AsmOperand : AsmOperandClass { 
+  let Name = "Mem16"; let PredicateMethod = "isMem16";
+}
+def X86Mem32AsmOperand : AsmOperandClass { 
+  let Name = "Mem32"; let PredicateMethod = "isMem32";
+}
+def X86Mem64AsmOperand : AsmOperandClass { 
+  let Name = "Mem64"; let PredicateMethod = "isMem64";
+}
+def X86Mem80AsmOperand : AsmOperandClass { 
+  let Name = "Mem80"; let PredicateMethod = "isMem80";
+}
+def X86Mem128AsmOperand : AsmOperandClass { 
+  let Name = "Mem128"; let PredicateMethod = "isMem128";
+}
+def X86Mem256AsmOperand : AsmOperandClass { 
+  let Name = "Mem256"; let PredicateMethod = "isMem256";
+}
+
+// Gather mem operands
+def X86MemVX32Operand : AsmOperandClass {
+  let Name = "MemVX32"; let PredicateMethod = "isMemVX32";
+}
+def X86MemVY32Operand : AsmOperandClass {
+  let Name = "MemVY32"; let PredicateMethod = "isMemVY32";
+}
+def X86MemVX64Operand : AsmOperandClass {
+  let Name = "MemVX64"; let PredicateMethod = "isMemVX64";
+}
+def X86MemVY64Operand : AsmOperandClass {
+  let Name = "MemVY64"; let PredicateMethod = "isMemVY64";
+}
+
+def X86AbsMemAsmOperand : AsmOperandClass {
+  let Name = "AbsMem";
+  let SuperClasses = [X86MemAsmOperand];
+}
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+  let PrintMethod = printMethod;
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+let OperandType = "OPERAND_MEMORY" in {
+def opaque32mem : X86MemOperand<"printopaquemem">;
+def opaque48mem : X86MemOperand<"printopaquemem">;
+def opaque80mem : X86MemOperand<"printopaquemem">;
+def opaque512mem : X86MemOperand<"printopaquemem">;
+
+def i8mem   : X86MemOperand<"printi8mem"> { 
+  let ParserMatchClass = X86Mem8AsmOperand; }
+def i16mem  : X86MemOperand<"printi16mem"> { 
+  let ParserMatchClass = X86Mem16AsmOperand; }
+def i32mem  : X86MemOperand<"printi32mem"> { 
+  let ParserMatchClass = X86Mem32AsmOperand; }
+def i64mem  : X86MemOperand<"printi64mem"> { 
+  let ParserMatchClass = X86Mem64AsmOperand; }
+def i128mem : X86MemOperand<"printi128mem"> { 
+  let ParserMatchClass = X86Mem128AsmOperand; }
+def i256mem : X86MemOperand<"printi256mem"> { 
+  let ParserMatchClass = X86Mem256AsmOperand; }
+def f32mem  : X86MemOperand<"printf32mem"> { 
+  let ParserMatchClass = X86Mem32AsmOperand; }
+def f64mem  : X86MemOperand<"printf64mem"> { 
+  let ParserMatchClass = X86Mem64AsmOperand; }
+def f80mem  : X86MemOperand<"printf80mem"> { 
+  let ParserMatchClass = X86Mem80AsmOperand; }
+def f128mem : X86MemOperand<"printf128mem"> { 
+  let ParserMatchClass = X86Mem128AsmOperand; }
+def f256mem : X86MemOperand<"printf256mem">{ 
+  let ParserMatchClass = X86Mem256AsmOperand; }
+
+// Gather mem operands
+def vx32mem : X86MemOperand<"printi32mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+  let ParserMatchClass = X86MemVX32Operand; }
+def vy32mem : X86MemOperand<"printi32mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+  let ParserMatchClass = X86MemVY32Operand; }
+def vx64mem : X86MemOperand<"printi64mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm);
+  let ParserMatchClass = X86MemVX64Operand; }
+def vy64mem : X86MemOperand<"printi64mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
+  let ParserMatchClass = X86MemVY64Operand; }
+}
+
+// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
+// plain GR64, so that it doesn't potentially require a REX prefix.
+def i8mem_NOREX : Operand<i64> {
+  let PrintMethod = "printi8mem";
+  let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm);
+  let ParserMatchClass = X86Mem8AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// GPRs available for tailcall.
+// It represents GR32_TC, GR64_TC or GR64_TCW64.
+def ptr_rc_tailcall : PointerLikeRegClass<2>;
+
+// Special i32mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved register are popped.
+def i32mem_TC : Operand<i32> {
+  let PrintMethod = "printi32mem";
+  let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall,
+                       i32imm, i8imm);
+  let ParserMatchClass = X86Mem32AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+// Special i64mem for addresses of load folding tail calls. These are not
+// allowed to use callee-saved registers since they must be scheduled
+// after callee-saved register are popped.
+def i64mem_TC : Operand<i64> {
+  let PrintMethod = "printi64mem";
+  let MIOperandInfo = (ops ptr_rc_tailcall, i8imm,
+                       ptr_rc_tailcall, i32imm, i8imm);
+  let ParserMatchClass = X86Mem64AsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+let OperandType = "OPERAND_PCREL",
+    ParserMatchClass = X86AbsMemAsmOperand,
+    PrintMethod = "printPCRelImm" in {
+def i32imm_pcrel : Operand<i32>;
+def i16imm_pcrel : Operand<i16>;
+
+def offset8 : Operand<i64>;
+def offset16 : Operand<i64>;
+def offset32 : Operand<i64>;
+def offset64 : Operand<i64>;
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def brtarget : Operand<OtherVT>;
+def brtarget8 : Operand<OtherVT>;
+
+}
+
+def SSECC : Operand<i8> {
+  let PrintMethod = "printSSECC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def AVXCC : Operand<i8> {
+  let PrintMethod = "printAVXCC";
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+class ImmSExtAsmOperandClass : AsmOperandClass {
+  let SuperClasses = [ImmAsmOperand];
+  let RenderMethod = "addImmOperands";
+}
+
+class ImmZExtAsmOperandClass : AsmOperandClass {
+  let SuperClasses = [ImmAsmOperand];
+  let RenderMethod = "addImmOperands";
+}
+
+// Sign-extended immediate classes. We don't need to define the full lattice
+// here because there is no instruction with an ambiguity between ImmSExti64i32
+// and ImmSExti32i8.
+//
+// The strange ranges come from the fact that the assembler always works with
+// 64-bit immediates, but for a 16-bit target value we want to accept both "-1"
+// (which will be a -1ULL), and "0xFF" (-1 in 16-bits).
+
+// [0, 0x7FFFFFFF]                                            |
+//   [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti64i32";
+}
+
+// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti16i8";
+  let SuperClasses = [ImmSExti64i32AsmOperand];
+}
+
+// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti32i8";
+}
+
+// [0, 0x000000FF]
+def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass {
+  let Name = "ImmZExtu32u8";
+}
+
+
+// [0, 0x0000007F]                                            |
+//   [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
+def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
+  let Name = "ImmSExti64i8";
+  let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand,
+                      ImmSExti64i32AsmOperand];
+}
+
+// A couple of more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm  : Operand<i16> {
+  let ParserMatchClass = ImmSExti16i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant.
+def i32i8imm  : Operand<i32> {
+  let ParserMatchClass = ImmSExti32i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+// 32-bits but only 8 bits are significant, and those 8 bits are unsigned.
+def u32u8imm  : Operand<i32> {
+  let ParserMatchClass = ImmZExtu32u8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant.
+def i64i32imm  : Operand<i64> {
+  let ParserMatchClass = ImmSExti64i32AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+// 64-bits but only 32 bits are significant, and those bits are treated as being
+// pc relative.
+def i64i32imm_pcrel : Operand<i64> {
+  let PrintMethod = "printPCRelImm";
+  let ParserMatchClass = X86AbsMemAsmOperand;
+  let OperandType = "OPERAND_PCREL";
+}
+
+// 64-bits but only 8 bits are significant.
+def i64i8imm   : Operand<i64> {
+  let ParserMatchClass = ImmSExti64i8AsmOperand;
+  let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def lea64_32mem : Operand<i32> {
+  let PrintMethod = "printi32mem";
+  let AsmOperandLowerMethod = "lower_lea64_32mem";
+  let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+// Memory operands that use 64-bit pointers in both ILP32 and LP64.
+def lea64mem : Operand<i64> {
+  let PrintMethod = "printi64mem";
+  let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr      : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>;
+def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+                               [add, sub, mul, X86mul_imm, shl, or, frameindex],
+                               []>;
+// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
+def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+                                  [add, sub, mul, X86mul_imm, shl, or,
+                                   frameindex, X86WrapperRIP],
+                                  []>;
+
+def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr",
+                        [add, sub, mul, X86mul_imm, shl, or, frameindex,
+                         X86WrapperRIP], []>;
+
+def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasCMov      : Predicate<"Subtarget->hasCMov()">;
+def NoCMov       : Predicate<"!Subtarget->hasCMov()">;
+
+def HasMMX       : Predicate<"Subtarget->hasMMX()">;
+def Has3DNow     : Predicate<"Subtarget->has3DNow()">;
+def Has3DNowA    : Predicate<"Subtarget->has3DNowA()">;
+def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1      : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
+def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2      : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
+def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3      : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
+def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3     : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
+def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
+def UseSSE41     : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
+def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42     : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
+def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
+def HasAVX       : Predicate<"Subtarget->hasAVX()">;
+def HasAVX2      : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only  : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+
+def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
+def HasAES       : Predicate<"Subtarget->hasAES()">;
+def HasPCLMUL    : Predicate<"Subtarget->hasPCLMUL()">;
+def HasFMA       : Predicate<"Subtarget->hasFMA()">;
+def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
+def HasXOP       : Predicate<"Subtarget->hasXOP()">;
+def HasMOVBE     : Predicate<"Subtarget->hasMOVBE()">;
+def HasRDRAND    : Predicate<"Subtarget->hasRDRAND()">;
+def HasF16C      : Predicate<"Subtarget->hasF16C()">;
+def HasFSGSBase  : Predicate<"Subtarget->hasFSGSBase()">;
+def HasLZCNT     : Predicate<"Subtarget->hasLZCNT()">;
+def HasBMI       : Predicate<"Subtarget->hasBMI()">;
+def HasBMI2      : Predicate<"Subtarget->hasBMI2()">;
+def HasRTM       : Predicate<"Subtarget->hasRTM()">;
+def HasHLE       : Predicate<"Subtarget->hasHLE()">;
+def HasTSX       : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
+def HasADX       : Predicate<"Subtarget->hasADX()">;
+def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;
+def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
+def HasPrefetchW : Predicate<"Subtarget->has3DNow() || Subtarget->hasPRFCHW()">;
+def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
+def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
+def In32BitMode  : Predicate<"!Subtarget->is64Bit()">,
+                             AssemblerPredicate<"!Mode64Bit", "32-bit mode">;
+def In64BitMode  : Predicate<"Subtarget->is64Bit()">,
+                             AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+def IsWin64      : Predicate<"Subtarget->isTargetWin64()">;
+def IsNaCl       : Predicate<"Subtarget->isTargetNaCl()">;
+def NotNaCl      : Predicate<"!Subtarget->isTargetNaCl()">;
+def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode   : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+def FarData      : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
+                             "TM.getCodeModel() != CodeModel::Kernel">;
+def NearData     : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+                             "TM.getCodeModel() == CodeModel::Kernel">;
+def IsStatic     : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def IsNotPIC     : Predicate<"TM.getRelocationModel() != Reloc::PIC_">;
+def OptForSize   : Predicate<"OptForSize">;
+def OptForSpeed  : Predicate<"!OptForSize">;
+def FastBTMem    : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr  : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+def FavorMemIndirectCall  : Predicate<"!Subtarget->callRegIndirect()">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in synch.
+def X86_COND_A   : PatLeaf<(i8 0)>;  // alt. COND_NBE
+def X86_COND_AE  : PatLeaf<(i8 1)>;  // alt. COND_NC
+def X86_COND_B   : PatLeaf<(i8 2)>;  // alt. COND_C
+def X86_COND_BE  : PatLeaf<(i8 3)>;  // alt. COND_NA
+def X86_COND_E   : PatLeaf<(i8 4)>;  // alt. COND_Z
+def X86_COND_G   : PatLeaf<(i8 5)>;  // alt. COND_NLE
+def X86_COND_GE  : PatLeaf<(i8 6)>;  // alt. COND_NL
+def X86_COND_L   : PatLeaf<(i8 7)>;  // alt. COND_NGE
+def X86_COND_LE  : PatLeaf<(i8 8)>;  // alt. COND_NG
+def X86_COND_NE  : PatLeaf<(i8 9)>;  // alt. COND_NZ
+def X86_COND_NO  : PatLeaf<(i8 10)>;
+def X86_COND_NP  : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS  : PatLeaf<(i8 12)>;
+def X86_COND_O   : PatLeaf<(i8 13)>;
+def X86_COND_P   : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S   : PatLeaf<(i8 15)>;
+
+let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs.
+  def i16immSExt8  : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>;
+  def i32immSExt8  : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>;
+  def i64immSExt8  : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>;
+}
+
+def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>;
+
+
+// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+// unsigned field.
+def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>;
+
+def i64immZExt32SExt8 : ImmLeaf<i64, [{
+  return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm;
+}]>;
+
+// Helper fragments for loads.
+// It's always safe to treat a anyext i16 load as a i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 2 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),[{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 2 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 4 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi8  : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+
+def sextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+def sextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi8i1   : PatFrag<(ops node:$ptr), (i8  (zextloadi1 node:$ptr))>;
+def zextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+def zextloadi64i1  : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi8i1    : PatFrag<(ops node:$ptr), (i8  (extloadi1 node:$ptr))>;
+def extloadi16i1   : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1   : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8   : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8   : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16  : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+def extloadi64i1   : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8   : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16  : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32  : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+// An 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+  return N->hasOneUse();
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list.
+//
+
+// Nop
+let neverHasSideEffects = 1, SchedRW = [WriteZero] in {
+  def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>;
+  def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero),
+                "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize;
+  def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero),
+                "nop{l}\t$zero", [], IIC_NOP>, TB;
+}
+
+
+// Constructing a stack frame.
+def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl),
+                 "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>;
+
+let SchedRW = [WriteALU] in {
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
+def LEAVE    : I<0xC9, RawFrm,
+                 (outs), (ins), "leave", [], IIC_LEAVE>,
+                 Requires<[In32BitMode]>;
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
+def LEAVE64  : I<0xC9, RawFrm,
+                 (outs), (ins), "leave", [], IIC_LEAVE>,
+                 Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP16r  : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+                IIC_POP_REG16>, OpSize;
+def POP32r  : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+                IIC_POP_REG>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
+                IIC_POP_REG>, OpSize;
+def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [],
+                IIC_POP_MEM>, OpSize;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
+                IIC_POP_REG>;
+def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [],
+                IIC_POP_MEM>;
+
+def POPF16   : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize;
+def POPF32   : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>,
+               Requires<[In32BitMode]>;
+} // mayLoad, SchedRW
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH16r  : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize;
+def PUSH32r  : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+                 IIC_PUSH_REG>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
+                 IIC_PUSH_REG>, OpSize;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[],
+                 IIC_PUSH_MEM>,
+  OpSize;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[],
+                 IIC_PUSH_REG>;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[],
+                 IIC_PUSH_MEM>;
+
+def PUSHi8   : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm),
+                      "push{l}\t$imm", [], IIC_PUSH_IMM>;
+def PUSHi16  : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+                      "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize;
+def PUSHi32  : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm),
+                      "push{l}\t$imm", [], IIC_PUSH_IMM>;
+
+def PUSHF16  : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>,
+                 OpSize;
+def PUSHF32  : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>,
+               Requires<[In32BitMode]>;
+
+} // mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
+let mayLoad = 1, SchedRW = [WriteLoad] in {
+def POP64r   : I<0x58, AddRegFrm,
+                 (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>;
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
+                IIC_POP_REG>;
+def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [],
+                IIC_POP_MEM>;
+} // mayLoad, SchedRW
+let mayStore = 1, SchedRW = [WriteStore] in {
+def PUSH64r  : I<0x50, AddRegFrm,
+                 (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
+                 IIC_PUSH_REG>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [],
+                 IIC_PUSH_MEM>;
+} // mayStore, SchedRW
+}
+
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1,
+    SchedRW = [WriteStore] in {
+def PUSH64i8   : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm),
+                     "push{q}\t$imm", [], IIC_PUSH_IMM>;
+def PUSH64i16  : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm),
+                      "push{q}\t$imm", [], IIC_PUSH_IMM>;
+def PUSH64i32  : Ii32<0x68, RawFrm, (outs), (ins i64i32imm:$imm),
+                      "push{q}\t$imm", [], IIC_PUSH_IMM>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in
+def POPF64   : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>,
+               Requires<[In64BitMode]>, Sched<[WriteLoad]>;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
+def PUSHF64    : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
+                 Requires<[In64BitMode]>, Sched<[WriteStore]>;
+
+let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
+    mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def POPA32   : I<0x61, RawFrm, (outs), (ins), "popa{l|d}", [], IIC_POP_A>,
+               Requires<[In32BitMode]>;
+}
+let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
+    mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+def PUSHA32  : I<0x60, RawFrm, (outs), (ins), "pusha{l|d}", [], IIC_PUSH_A>,
+               Requires<[In32BitMode]>;
+}
+
+let Constraints = "$src = $dst", SchedRW = [WriteALU] in {
+// GR32 = bswap GR32
+def BSWAP32r : I<0xC8, AddRegFrm,
+                 (outs GR32:$dst), (ins GR32:$src),
+                 "bswap{l}\t$dst",
+                 [(set GR32:$dst, (bswap GR32:$src))], IIC_BSWAP>, TB;
+
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+                  "bswap{q}\t$dst",
+                  [(set GR64:$dst, (bswap GR64:$src))], IIC_BSWAP>, TB;
+} // Constraints = "$src = $dst", SchedRW
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr  : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                 "bsf{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
+                  IIC_BSF>, TB, OpSize, Sched<[WriteShift]>;
+def BSF16rm  : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                 "bsf{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
+                  IIC_BSF>, TB, OpSize, Sched<[WriteShiftLd]>;
+def BSF32rr  : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                 "bsf{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB,
+               Sched<[WriteShift]>;
+def BSF32rm  : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                 "bsf{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
+                 IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+def BSF64rr  : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                  "bsf{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
+                  IIC_BSF>, TB, Sched<[WriteShift]>;
+def BSF64rm  : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                  "bsf{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
+                  IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+
+def BSR16rr  : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                 "bsr{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>,
+                 TB, OpSize, Sched<[WriteShift]>;
+def BSR16rm  : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                 "bsr{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
+                 IIC_BSR>, TB,
+                 OpSize, Sched<[WriteShiftLd]>;
+def BSR32rr  : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                 "bsr{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB,
+               Sched<[WriteShift]>;
+def BSR32rm  : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                 "bsr{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
+                 IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+def BSR64rr  : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                  "bsr{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB,
+               Sched<[WriteShift]>;
+def BSR64rm  : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                  "bsr{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
+                  IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+} // Defs = [EFLAGS]
+
+let SchedRW = [WriteMicrocoded] in {
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", [], IIC_MOVS>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", [], IIC_MOVS>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", [], IIC_MOVS>;
+def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", [], IIC_MOVS>;
+}
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", [], IIC_STOS>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", [], IIC_STOS>, OpSize;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", [], IIC_STOS>;
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
+def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", [], IIC_STOS>;
+
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", [], IIC_SCAS>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", [], IIC_SCAS>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", [], IIC_SCAS>;
+def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", [], IIC_SCAS>;
+
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", [], IIC_CMPS>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", [], IIC_CMPS>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", [], IIC_CMPS>;
+def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", [], IIC_CMPS>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+let SchedRW = [WriteMove] in {
+let neverHasSideEffects = 1 in {
+def MOV8rr  : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri  : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}",
+                   [(set GR8:$dst, imm:$src)], IIC_MOV>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, imm:$src)], IIC_MOV>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, imm:$src)], IIC_MOV>;
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+                    "movabs{q}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, imm:$src)], IIC_MOV>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+                      "mov{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, i64immSExt32:$src)], IIC_MOV>;
+}
+} // SchedRW
+
+let SchedRW = [WriteStore] in {
+def MOV8mi  : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}",
+                   [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+                      "mov{q}\t{$src, $dst|$dst, $src}",
+                      [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+/// moffs8, moffs16 and moffs32 versions of moves.  The immediate is a
+/// 32-bit offset from the PC.  These are only valid in x86-32 mode.
+let SchedRW = [WriteALU] in {
+def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
+                   "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>,
+                   Requires<[In32BitMode]>;
+def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
+                      "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize,
+                     Requires<[In32BitMode]>;
+def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
+                      "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>,
+                     Requires<[In32BitMode]>;
+def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
+                   "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>,
+                  Requires<[In32BitMode]>;
+def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
+                      "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize,
+                     Requires<[In32BitMode]>;
+def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
+                      "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>,
+                     Requires<[In32BitMode]>;
+}
+
+// FIXME: These definitions are utterly broken
+// Just leave them commented out for now because they're useless outside
+// of the large code model, and most compilers won't generate the instructions
+// in question.
+/*
+def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
+                      "mov{q}\t{$src, %rax|RAX, $src}", []>;
+def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
+                       "mov{q}\t{$src, %rax|RAX, $src}", []>;
+def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
+                       "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
+def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
+                       "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
+*/
+
+
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+}
+
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+def MOV8rm  : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}",
+                [(set GR8:$dst, (loadi8 addr:$src))], IIC_MOV_MEM>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}",
+                [(set GR16:$dst, (loadi16 addr:$src))], IIC_MOV_MEM>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}",
+                [(set GR32:$dst, (loadi32 addr:$src))], IIC_MOV_MEM>;
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}",
+                 [(set GR64:$dst, (load addr:$src))], IIC_MOV_MEM>;
+}
+
+let SchedRW = [WriteStore] in {
+def MOV8mr  : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}",
+                [(store GR8:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}",
+                [(store GR16:$src, addr:$dst)], IIC_MOV_MEM>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}",
+                [(store GR32:$src, addr:$dst)], IIC_MOV_MEM>;
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}",
+                 [(store GR64:$src, addr:$dst)], IIC_MOV_MEM>;
+} // SchedRW
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
+let isCodeGenOnly = 1 in {
+let neverHasSideEffects = 1 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+                     (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [], IIC_MOV>,
+                   Sched<[WriteMove]>;
+let mayStore = 1 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+                     (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [],
+                     IIC_MOV_MEM>, Sched<[WriteStore]>;
+let mayLoad = 1, neverHasSideEffects = 1,
+    canFoldAsLoad = 1, isReMaterializable = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+                     (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [],
+                     IIC_MOV_MEM>, Sched<[WriteLoad]>;
+}
+
+
+// Condition code ops, incl. set if equal/not equal/...
+let SchedRW = [WriteALU] in {
+let Defs = [EFLAGS], Uses = [AH] in
+def SAHF     : I<0x9E, RawFrm, (outs),  (ins), "sahf",
+                 [(set EFLAGS, (X86sahf AH))], IIC_AHF>;
+let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
+def LAHF     : I<0x9F, RawFrm, (outs),  (ins), "lahf", [],
+                IIC_AHF>;  // AH = flags
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Bit tests instructions: BT, BTS, BTR, BTC.
+
+let Defs = [EFLAGS] in {
+let SchedRW = [WriteALU] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+               "bt{w}\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
+               OpSize, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+               "bt{l}\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, TB;
+def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+               "bt{q}\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
+} // SchedRW
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Make these instructions disassembly
+// only for now.
+
+let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
+  def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                 "bt{w}\t{$src2, $src1|$src1, $src2}",
+  //               [(X86bt (loadi16 addr:$src1), GR16:$src2),
+  //                (implicit EFLAGS)]
+                 [], IIC_BT_MR
+                 >, OpSize, TB, Requires<[FastBTMem]>;
+  def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                 "bt{l}\t{$src2, $src1|$src1, $src2}",
+  //               [(X86bt (loadi32 addr:$src1), GR32:$src2),
+  //                (implicit EFLAGS)]
+                 [], IIC_BT_MR
+                 >, TB, Requires<[FastBTMem]>;
+  def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "bt{q}\t{$src2, $src1|$src1, $src2}",
+  //               [(X86bt (loadi64 addr:$src1), GR64:$src2),
+  //                (implicit EFLAGS)]
+                  [], IIC_BT_MR
+                  >, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                "bt{w}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))],
+                IIC_BT_RI>, OpSize, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                "bt{l}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))],
+                IIC_BT_RI>, TB;
+def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                "bt{q}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))],
+                IIC_BT_RI>, TB;
+} // SchedRW
+
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+let SchedRW = [WriteALU] in {
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                "bt{w}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt (loadi16 addr:$src1), i16immSExt8:$src2))
+                 ], IIC_BT_MI>, OpSize, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                "bt{l}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt (loadi32 addr:$src1), i32immSExt8:$src2))
+                 ], IIC_BT_MI>, TB;
+def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                "bt{q}\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86bt (loadi64 addr:$src1),
+                                     i64immSExt8:$src2))], IIC_BT_MI>, TB;
+} // SchedRW
+
+let hasSideEffects = 0 in {
+let SchedRW = [WriteALU] in {
+def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize, TB;
+def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+                 "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize, TB;
+def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize, TB;
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                    "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize, TB;
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                    "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize, TB;
+def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+                 "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize, TB;
+def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize, TB;
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                    "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize, TB;
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                    "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
+                OpSize, TB;
+def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
+                 "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
+                OpSize, TB;
+def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+                 "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
+let SchedRW = [WriteALU] in {
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
+                    OpSize, TB;
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+                    "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
+                    OpSize, TB;
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
+                    "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+} // hasSideEffects = 0
+} // Defs = [EFLAGS]
+
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions. But since a memory
+// operand is referenced, the atomicity is ensured.
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
+                       InstrItinClass itin> {
+  let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in {
+    def NAME#8rm  : I<opc8, MRMSrcMem, (outs GR8:$dst),
+                      (ins GR8:$val, i8mem:$ptr),
+                      !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR8:$dst,
+                         (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+                      itin>;
+    def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+                      (ins GR16:$val, i16mem:$ptr),
+                      !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR16:$dst,
+                         (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+                      itin>, OpSize;
+    def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+                      (ins GR32:$val, i32mem:$ptr),
+                      !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+                      [(set
+                         GR32:$dst,
+                         (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+                      itin>;
+    def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+                       (ins GR64:$val, i64mem:$ptr),
+                       !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+                       [(set
+                         GR64:$dst,
+                         (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+                       itin>;
+  }
+}
+
+defm XCHG    : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
+
+// Swap between registers.
+let SchedRW = [WriteALU] in {
+let Constraints = "$val = $dst" in {
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
+                "xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
+                 "xchg{w}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>, OpSize;
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
+                 "xchg{l}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
+                  "xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
+}
+
+// Swap between EAX and other registers.
+def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
+                  "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
+def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
+                  "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+                  Requires<[In32BitMode]>;
+// Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
+// xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
+def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
+                   "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+                   Requires<[In64BitMode]>;
+def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
+                  "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>;
+} // SchedRW
+
+let SchedRW = [WriteALU] in {
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+                "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                 "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB,
+                 OpSize;
+def XADD32rr  : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                 "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+def XADD64rr  : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                   "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_REG>, TB;
+} // SchedRW
+
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
+def XADD8rm   : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+                 "xadd{b}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD16rm  : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                 "xadd{w}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB,
+                 OpSize;
+def XADD32rm  : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                 "xadd{l}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+def XADD64rm  : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                   "xadd{q}\t{$src, $dst|$dst, $src}", [], IIC_XADD_MEM>, TB;
+
+}
+
+let SchedRW = [WriteALU] in {
+def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+                   "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+                   IIC_CMPXCHG_REG8>, TB;
+def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                    "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+                    IIC_CMPXCHG_REG>, TB, OpSize;
+def CMPXCHG32rr  : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                     "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_REG>, TB;
+def CMPXCHG64rr  : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                      "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+                      IIC_CMPXCHG_REG>, TB;
+} // SchedRW
+
+let SchedRW = [WriteALULd, WriteRMW] in {
+let mayLoad = 1, mayStore = 1 in {
+def CMPXCHG8rm   : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+                     "cmpxchg{b}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_MEM8>, TB;
+def CMPXCHG16rm  : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                     "cmpxchg{w}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_MEM>, TB, OpSize;
+def CMPXCHG32rm  : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                     "cmpxchg{l}\t{$src, $dst|$dst, $src}", [],
+                     IIC_CMPXCHG_MEM>, TB;
+def CMPXCHG64rm  : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                      "cmpxchg{q}\t{$src, $dst|$dst, $src}", [],
+                      IIC_CMPXCHG_MEM>, TB;
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
+def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
+                  "cmpxchg8b\t$dst", [], IIC_CMPXCHG_8B>, TB;
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
+                    "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
+                    TB, Requires<[HasCmpxchg16b]>;
+} // SchedRW
+
+
+// Lock instruction prefix
+def LOCK_PREFIX : I<0xF0, RawFrm, (outs),  (ins), "lock", []>;
+
+// Rex64 instruction prefix
+def REX64_PREFIX : I<0x48, RawFrm, (outs),  (ins), "rex64", []>;
+
+// Data16 instruction prefix
+def DATA16_PREFIX : I<0x66, RawFrm, (outs),  (ins), "data16", []>;
+
+// Repeat string operation instruction prefixes
+// These uses the DF flag in the EFLAGS register to inc or dec ECX
+let Defs = [ECX], Uses = [ECX,EFLAGS] in {
+// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
+def REP_PREFIX : I<0xF3, RawFrm, (outs),  (ins), "rep", []>;
+// Repeat while not equal (used with CMPS and SCAS)
+def REPNE_PREFIX : I<0xF2, RawFrm, (outs),  (ins), "repne", []>;
+}
+
+
+// String manipulation instructions
+let SchedRW = [WriteMicrocoded] in {
+def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", [], IIC_LODS>;
+def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", [], IIC_LODS>, OpSize;
+def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", [], IIC_LODS>;
+def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", [], IIC_LODS>;
+}
+
+let SchedRW = [WriteSystem] in {
+def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", [], IIC_OUTS>;
+def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", [], IIC_OUTS>, OpSize;
+def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", [], IIC_OUTS>;
+}
+
+// Flag instructions
+let SchedRW = [WriteALU] in {
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", [], IIC_CLC>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", [], IIC_STC>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", [], IIC_CLI>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", [], IIC_STI>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", [], IIC_CLD>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", [], IIC_STD>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", [], IIC_CMC>;
+
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB;
+}
+
+// Table lookup instructions
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>,
+           Sched<[WriteLoad]>;
+
+let SchedRW = [WriteMicrocoded] in {
+// ASCII Adjust After Addition
+// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>,
+            Requires<[In32BitMode]>;
+
+// ASCII Adjust AX Before Division
+// sets AL, AH and EFLAGS and uses AL and AH
+def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src),
+                 "aad\t$src", [], IIC_AAD>, Requires<[In32BitMode]>;
+
+// ASCII Adjust AX After Multiply
+// sets AL, AH and EFLAGS and uses AL
+def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src),
+                 "aam\t$src", [], IIC_AAM>, Requires<[In32BitMode]>;
+
+// ASCII Adjust AL After Subtraction - sets
+// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS
+def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>,
+            Requires<[In32BitMode]>;
+
+// Decimal Adjust AL after Addition
+// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>,
+            Requires<[In32BitMode]>;
+
+// Decimal Adjust AL after Subtraction
+// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS
+def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>,
+            Requires<[In32BitMode]>;
+} // SchedRW
+
+let SchedRW = [WriteSystem] in {
+// Check Array Index Against Bounds
+def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                   "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>, OpSize,
+                   Requires<[In32BitMode]>;
+def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                   "bound\t{$src, $dst|$dst, $src}", [], IIC_BOUND>,
+                   Requires<[In32BitMode]>;
+
+// Adjust RPL Field of Segment Selector
+def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                 "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
+                 Requires<[In32BitMode]>;
+def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                 "arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
+                 Requires<[In32BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// MOVBE Instructions
+//
+let Predicates = [HasMOVBE] in {
+  let SchedRW = [WriteALULd] in {
+  def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "movbe{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (bswap (loadi16 addr:$src)))], IIC_MOVBE>,
+                    OpSize, T8;
+  def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "movbe{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (bswap (loadi32 addr:$src)))], IIC_MOVBE>,
+                    T8;
+  def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "movbe{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (bswap (loadi64 addr:$src)))], IIC_MOVBE>,
+                     T8;
+  }
+  let SchedRW = [WriteStore] in {
+  def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                    "movbe{w}\t{$src, $dst|$dst, $src}",
+                    [(store (bswap GR16:$src), addr:$dst)], IIC_MOVBE>,
+                    OpSize, T8;
+  def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                    "movbe{l}\t{$src, $dst|$dst, $src}",
+                    [(store (bswap GR32:$src), addr:$dst)], IIC_MOVBE>,
+                    T8;
+  def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                     "movbe{q}\t{$src, $dst|$dst, $src}",
+                     [(store (bswap GR64:$src), addr:$dst)], IIC_MOVBE>,
+                     T8;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// RDRAND Instruction
+//
+let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
+  def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
+                    "rdrand{w}\t$dst",
+                    [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize, TB;
+  def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
+                    "rdrand{l}\t$dst",
+                    [(set GR32:$dst, EFLAGS, (X86rdrand))]>, TB;
+  def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
+                     "rdrand{q}\t$dst",
+                     [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// RDSEED Instruction
+//
+let Predicates = [HasRDSEED], Defs = [EFLAGS] in {
+  def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
+                    "rdseed{w}\t$dst",
+                    [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize, TB;
+  def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
+                    "rdseed{l}\t$dst",
+                    [(set GR32:$dst, EFLAGS, (X86rdseed))]>, TB;
+  def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
+                     "rdseed{q}\t$dst",
+                     [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
+}
+
+//===----------------------------------------------------------------------===//
+// LZCNT Instruction
+//
+let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
+  def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "lzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize;
+  def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "lzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize;
+
+  def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "lzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS;
+  def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "lzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
+                     (implicit EFLAGS)]>, XS;
+
+  def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "lzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "lzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// BMI Instructions
+//
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "tzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
+                    OpSize;
+  def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                    "tzcnt{w}\t{$src, $dst|$dst, $src}",
+                    [(set GR16:$dst, (cttz (loadi16 addr:$src))),
+                     (implicit EFLAGS)]>, XS, OpSize;
+
+  def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "tzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS;
+  def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                    "tzcnt{l}\t{$src, $dst|$dst, $src}",
+                    [(set GR32:$dst, (cttz (loadi32 addr:$src))),
+                     (implicit EFLAGS)]>, XS;
+
+  def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "tzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                     "tzcnt{q}\t{$src, $dst|$dst, $src}",
+                     [(set GR64:$dst, (cttz (loadi64 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+}
+
+multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
+                  RegisterClass RC, X86MemOperand x86memop, SDNode OpNode,
+                  PatFrag ld_frag> {
+  def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
+             !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+             [(set RC:$dst, (OpNode RC:$src)), (implicit EFLAGS)]>, T8, VEX_4V;
+  def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
+             !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
+             [(set RC:$dst, (OpNode (ld_frag addr:$src))), (implicit EFLAGS)]>,
+             T8, VEX_4V;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem,
+                        X86blsr, loadi32>;
+  defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem,
+                        X86blsr, loadi64>, VEX_W;
+  defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem,
+                          X86blsmsk, loadi32>;
+  defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem,
+                          X86blsmsk, loadi64>, VEX_W;
+  defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem,
+                        X86blsi, loadi32>;
+  defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem,
+                        X86blsi, loadi64>, VEX_W;
+}
+
+multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
+                          X86MemOperand x86memop, Intrinsic Int,
+                          PatFrag ld_frag> {
+  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
+             T8, VEX_4VOp3;
+  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
+              (implicit EFLAGS)]>, T8, VEX_4VOp3;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+  defm BEXTR32 : bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem,
+                                int_x86_bmi_bextr_32, loadi32>;
+  defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem,
+                                int_x86_bmi_bextr_64, loadi64>, VEX_W;
+}
+
+let Predicates = [HasBMI2], Defs = [EFLAGS] in {
+  defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
+                               int_x86_bmi_bzhi_32, loadi32>;
+  defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
+                               int_x86_bmi_bzhi_64, loadi64>, VEX_W;
+}
+
+multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
+                         X86MemOperand x86memop, Intrinsic Int,
+                         PatFrag ld_frag> {
+  def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
+             VEX_4V;
+  def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+             !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V;
+}
+
+let Predicates = [HasBMI2] in {
+  defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
+                               int_x86_bmi_pdep_32, loadi32>, T8XD;
+  defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
+                               int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
+  defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
+                               int_x86_bmi_pext_32, loadi32>, T8XS;
+  defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
+                               int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Subsystems.
+//===----------------------------------------------------------------------===//
+
+include "X86InstrArithmetic.td"
+include "X86InstrCMovSetCC.td"
+include "X86InstrExtension.td"
+include "X86InstrControl.td"
+include "X86InstrShiftRotate.td"
+
+// X87 Floating Point Stack.
+include "X86InstrFPStack.td"
+
+// SIMD support (SSE, MMX and AVX)
+include "X86InstrFragmentsSIMD.td"
+
+// FMA - Fused Multiply-Add support (requires FMA)
+include "X86InstrFMA.td"
+
+// XOP
+include "X86InstrXOP.td"
+
+// SSE, MMX and 3DNow! vector support.
+include "X86InstrSSE.td"
+include "X86InstrMMX.td"
+include "X86Instr3DNow.td"
+
+include "X86InstrVMX.td"
+include "X86InstrSVM.td"
+
+include "X86InstrTSX.td"
+
+// System instructions.
+include "X86InstrSystem.td"
+
+// Compiler Pseudo Instructions and Pat Patterns
+include "X86InstrCompiler.td"
+
+//===----------------------------------------------------------------------===//
+// Assembler Mnemonic Aliases
+//===----------------------------------------------------------------------===//
+
+def : MnemonicAlias<"call", "calll">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"call", "callq">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"cbw",  "cbtw">;
+def : MnemonicAlias<"cwde", "cwtl">;
+def : MnemonicAlias<"cwd",  "cwtd">;
+def : MnemonicAlias<"cdq", "cltd">;
+def : MnemonicAlias<"cdqe", "cltq">;
+def : MnemonicAlias<"cqo", "cqto">;
+
+// lret maps to lretl, it is not ambiguous with lretq.
+def : MnemonicAlias<"lret", "lretl">;
+
+def : MnemonicAlias<"leavel", "leave">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"leaveq", "leave">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"loopz", "loope">;
+def : MnemonicAlias<"loopnz", "loopne">;
+
+def : MnemonicAlias<"pop", "popl">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pop", "popq">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf", "popfl">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popf", "popfq">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popfd",  "popfl">;
+
+// FIXME: This is wrong for "push reg".  "push %bx" should turn into pushw in
+// all modes.  However: "push (addr)" and "push $42" should default to
+// pushl/pushq depending on the current mode.  Similar for "pop %bx"
+def : MnemonicAlias<"push", "pushl">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"push", "pushq">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf", "pushfl">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushf", "pushfq">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushfd", "pushfl">;
+
+def : MnemonicAlias<"repe", "rep">;
+def : MnemonicAlias<"repz", "rep">;
+def : MnemonicAlias<"repnz", "repne">;
+
+def : MnemonicAlias<"retl", "ret">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"retq", "ret">, Requires<[In64BitMode]>;
+
+def : MnemonicAlias<"salb", "shlb">;
+def : MnemonicAlias<"salw", "shlw">;
+def : MnemonicAlias<"sall", "shll">;
+def : MnemonicAlias<"salq", "shlq">;
+
+def : MnemonicAlias<"smovb", "movsb">;
+def : MnemonicAlias<"smovw", "movsw">;
+def : MnemonicAlias<"smovl", "movsl">;
+def : MnemonicAlias<"smovq", "movsq">;
+
+def : MnemonicAlias<"ud2a", "ud2">;
+def : MnemonicAlias<"verrw", "verr">;
+
+// System instruction aliases.
+def : MnemonicAlias<"iret", "iretl">;
+def : MnemonicAlias<"sysret", "sysretl">;
+def : MnemonicAlias<"sysexit", "sysexitl">;
+
+def : MnemonicAlias<"lgdtl", "lgdt">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lgdtq", "lgdt">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"lidtl", "lidt">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"lidtq", "lidt">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sgdtl", "sgdt">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sgdtq", "sgdt">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"sidtl", "sidt">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"sidtq", "sidt">, Requires<[In64BitMode]>;
+
+
+// Floating point stack aliases.
+def : MnemonicAlias<"fcmovz",   "fcmove">;
+def : MnemonicAlias<"fcmova",   "fcmovnbe">;
+def : MnemonicAlias<"fcmovnae", "fcmovb">;
+def : MnemonicAlias<"fcmovna",  "fcmovbe">;
+def : MnemonicAlias<"fcmovae",  "fcmovnb">;
+def : MnemonicAlias<"fcomip",   "fcompi">;
+def : MnemonicAlias<"fildq",    "fildll">;
+def : MnemonicAlias<"fistpq",   "fistpll">;
+def : MnemonicAlias<"fisttpq",  "fisttpll">;
+def : MnemonicAlias<"fldcww",   "fldcw">;
+def : MnemonicAlias<"fnstcww", "fnstcw">;
+def : MnemonicAlias<"fnstsww", "fnstsw">;
+def : MnemonicAlias<"fucomip",  "fucompi">;
+def : MnemonicAlias<"fwait",    "wait">;
+
+
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond>
+  : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
+                  !strconcat(Prefix, NewCond, Suffix)>;
+
+/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
+/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
+/// example "setz" -> "sete".
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix> {
+  def C   : CondCodeAlias<Prefix, Suffix, "c",   "b">;   // setc   -> setb
+  def Z   : CondCodeAlias<Prefix, Suffix, "z" ,  "e">;   // setz   -> sete
+  def NA  : CondCodeAlias<Prefix, Suffix, "na",  "be">;  // setna  -> setbe
+  def NB  : CondCodeAlias<Prefix, Suffix, "nb",  "ae">;  // setnb  -> setae
+  def NC  : CondCodeAlias<Prefix, Suffix, "nc",  "ae">;  // setnc  -> setae
+  def NG  : CondCodeAlias<Prefix, Suffix, "ng",  "le">;  // setng  -> setle
+  def NL  : CondCodeAlias<Prefix, Suffix, "nl",  "ge">;  // setnl  -> setge
+  def NZ  : CondCodeAlias<Prefix, Suffix, "nz",  "ne">;  // setnz  -> setne
+  def PE  : CondCodeAlias<Prefix, Suffix, "pe",  "p">;   // setpe  -> setp
+  def PO  : CondCodeAlias<Prefix, Suffix, "po",  "np">;  // setpo  -> setnp
+
+  def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b">;   // setnae -> setb
+  def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a">;   // setnbe -> seta
+  def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l">;   // setnge -> setl
+  def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g">;   // setnle -> setg
+}
+
+// Aliases for set<CC>
+defm : IntegerCondCodeMnemonicAlias<"set", "">;
+// Aliases for j<CC>
+defm : IntegerCondCodeMnemonicAlias<"j", "">;
+// Aliases for cmov<CC>{w,l,q}
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q">;
+
+
+//===----------------------------------------------------------------------===//
+// Assembler Instruction Aliases
+//===----------------------------------------------------------------------===//
+
+// aad/aam default to base 10 if no operand is specified.
+def : InstAlias<"aad", (AAD8i8 10)>;
+def : InstAlias<"aam", (AAM8i8 10)>;
+
+// Disambiguate the mem/imm form of bt-without-a-suffix as btl.
+def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>;
+
+// clr aliases.
+def : InstAlias<"clrb $reg", (XOR8rr  GR8 :$reg, GR8 :$reg)>;
+def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>;
+def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>;
+def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>;
+
+// div and idiv aliases for explicit A register.
+def : InstAlias<"divb $src, %al",  (DIV8r  GR8 :$src)>;
+def : InstAlias<"divw $src, %ax",  (DIV16r GR16:$src)>;
+def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>;
+def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>;
+def : InstAlias<"divb $src, %al",  (DIV8m  i8mem :$src)>;
+def : InstAlias<"divw $src, %ax",  (DIV16m i16mem:$src)>;
+def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>;
+def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>;
+def : InstAlias<"idivb $src, %al",  (IDIV8r  GR8 :$src)>;
+def : InstAlias<"idivw $src, %ax",  (IDIV16r GR16:$src)>;
+def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>;
+def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>;
+def : InstAlias<"idivb $src, %al",  (IDIV8m  i8mem :$src)>;
+def : InstAlias<"idivw $src, %ax",  (IDIV16m i16mem:$src)>;
+def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>;
+
+
+
+// Various unary fpstack operations default to operating on on ST1.
+// For example, "fxch" -> "fxch %st(1)"
+def : InstAlias<"faddp",        (ADD_FPrST0  ST1), 0>;
+def : InstAlias<"fsubp",        (SUBR_FPrST0 ST1)>;
+def : InstAlias<"fsubrp",       (SUB_FPrST0  ST1)>;
+def : InstAlias<"fmulp",        (MUL_FPrST0  ST1)>;
+def : InstAlias<"fdivp",        (DIVR_FPrST0 ST1)>;
+def : InstAlias<"fdivrp",       (DIV_FPrST0  ST1)>;
+def : InstAlias<"fxch",         (XCH_F       ST1)>;
+def : InstAlias<"fcom",         (COM_FST0r   ST1)>;
+def : InstAlias<"fcomp",        (COMP_FST0r  ST1)>;
+def : InstAlias<"fcomi",        (COM_FIr     ST1)>;
+def : InstAlias<"fcompi",       (COM_FIPr    ST1)>;
+def : InstAlias<"fucom",        (UCOM_Fr     ST1)>;
+def : InstAlias<"fucomp",       (UCOM_FPr    ST1)>;
+def : InstAlias<"fucomi",       (UCOM_FIr    ST1)>;
+def : InstAlias<"fucompi",      (UCOM_FIPr   ST1)>;
+
+// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
+// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)".  We also disambiguate
+// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
+// gas.
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
+ def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"),
+                 (Inst RST:$op), EmitAlias>;
+ def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"),
+                 (Inst ST0), EmitAlias>;
+}
+
+defm : FpUnaryAlias<"fadd",   ADD_FST0r>;
+defm : FpUnaryAlias<"faddp",  ADD_FPrST0, 0>;
+defm : FpUnaryAlias<"fsub",   SUB_FST0r>;
+defm : FpUnaryAlias<"fsubp",  SUBR_FPrST0>;
+defm : FpUnaryAlias<"fsubr",  SUBR_FST0r>;
+defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>;
+defm : FpUnaryAlias<"fmul",   MUL_FST0r>;
+defm : FpUnaryAlias<"fmulp",  MUL_FPrST0>;
+defm : FpUnaryAlias<"fdiv",   DIV_FST0r>;
+defm : FpUnaryAlias<"fdivp",  DIVR_FPrST0>;
+defm : FpUnaryAlias<"fdivr",  DIVR_FST0r>;
+defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>;
+defm : FpUnaryAlias<"fcomi",   COM_FIr, 0>;
+defm : FpUnaryAlias<"fucomi",  UCOM_FIr, 0>;
+defm : FpUnaryAlias<"fcompi",   COM_FIPr>;
+defm : FpUnaryAlias<"fucompi",  UCOM_FIPr>;
+
+
+// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
+// commute.  We also allow fdiv[r]p/fsubrp even though they don't commute,
+// solely because gas supports it.
+def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>;
+def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>;
+def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>;
+def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>;
+def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>;
+
+// We accept "fnstsw %eax" even though it only writes %ax.
+def : InstAlias<"fnstsw %eax", (FNSTSW16r)>;
+def : InstAlias<"fnstsw %al" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw"     , (FNSTSW16r)>;
+
+// lcall and ljmp aliases.  This seems to be an odd mapping in 64-bit mode, but
+// this is compatible with what GAS does.
+def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"ljmp $seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"lcall *$dst",      (FARCALL32m opaque48mem:$dst)>;
+def : InstAlias<"ljmp *$dst",       (FARJMP32m  opaque48mem:$dst)>;
+
+// "imul <imm>, B" is an alias for "imul <imm>, B, B".
+def : InstAlias<"imulw $imm, $r", (IMUL16rri  GR16:$r, GR16:$r, i16imm:$imm)>;
+def : InstAlias<"imulw $imm, $r", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm)>;
+def : InstAlias<"imull $imm, $r", (IMUL32rri  GR32:$r, GR32:$r, i32imm:$imm)>;
+def : InstAlias<"imull $imm, $r", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm)>;
+def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>;
+def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>;
+
+// inb %dx -> inb %al, %dx
+def : InstAlias<"inb %dx", (IN8rr)>;
+def : InstAlias<"inw %dx", (IN16rr)>;
+def : InstAlias<"inl %dx", (IN32rr)>;
+def : InstAlias<"inb $port", (IN8ri i8imm:$port)>;
+def : InstAlias<"inw $port", (IN16ri i8imm:$port)>;
+def : InstAlias<"inl $port", (IN32ri i8imm:$port)>;
+
+
+// jmp and call aliases for lcall and ljmp.  jmp $42,$5 -> ljmp
+def : InstAlias<"call $seg, $off",  (FARCALL32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmp $seg, $off",   (FARJMP32i  i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"callw $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmpw $seg, $off",  (FARJMP16i  i16imm:$off, i16imm:$seg)>;
+def : InstAlias<"calll $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>;
+def : InstAlias<"jmpl $seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg)>;
+
+// Force mov without a suffix with a segment and mem to prefer the 'l' form of
+// the move.  All segment/mem forms are equivalent, this has the shortest
+// encoding.
+def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem)>;
+def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>;
+
+// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
+def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>;
+
+// Match 'movq GR64, MMX' as an alias for movd.
+def : InstAlias<"movq $src, $dst",
+                (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movq $src, $dst",
+                (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>;
+
+// movsd with no operands (as opposed to the SSE scalar move of a double) is an
+// alias for movsl. (as in rep; movsd)
+def : InstAlias<"movsd", (MOVSD)>;
+
+// movsx aliases
+def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>;
+def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>;
+
+// movzx aliases
+def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>;
+def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
+// Note: No GR32->GR64 movzx form.
+
+// outb %dx -> outb %al, %dx
+def : InstAlias<"outb %dx", (OUT8rr)>;
+def : InstAlias<"outw %dx", (OUT16rr)>;
+def : InstAlias<"outl %dx", (OUT32rr)>;
+def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>;
+def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>;
+def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>;
+
+// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
+// effect (both store to a 16-bit mem).  Force to sldtw to avoid ambiguity
+// errors, since its encoding is the most compact.
+def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>;
+
+// shld/shrd op,op -> shld op, op, CL
+def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>;
+def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>;
+def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>;
+def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>;
+def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>;
+def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>;
+
+def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>;
+def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>;
+def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>;
+def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>;
+def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>;
+def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>;
+
+/*  FIXME: This is disabled because the asm matcher is currently incapable of
+ *  matching a fixed immediate like $1.
+// "shl X, $1" is an alias for "shl X".
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> {
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>;
+ def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"),
+                 (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>;
+}
+
+defm : ShiftRotateByOneAlias<"rcl", "RCL">;
+defm : ShiftRotateByOneAlias<"rcr", "RCR">;
+defm : ShiftRotateByOneAlias<"rol", "ROL">;
+defm : ShiftRotateByOneAlias<"ror", "ROR">;
+FIXME */
+
+// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
+def : InstAlias<"testb $val, $mem", (TEST8rm  GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>;
+
+// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
+def : InstAlias<"xchgb $mem, $val", (XCHG8rm  GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>;
+
+// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
+def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>;
+def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
+def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
+def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 000000000000..49721df7c118
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,624 @@
+//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+// All instructions that use MMX should be in this file, even if they also use
+// SSE.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def MMX_INTALU_ITINS : OpndItins<
+  IIC_MMX_ALU_RR, IIC_MMX_ALU_RM
+>;
+
+def MMX_INTALUQ_ITINS : OpndItins<
+  IIC_MMX_ALUQ_RR, IIC_MMX_ALUQ_RM
+>;
+
+def MMX_PHADDSUBW : OpndItins<
+  IIC_MMX_PHADDSUBW_RR, IIC_MMX_PHADDSUBW_RM
+>;
+
+def MMX_PHADDSUBD : OpndItins<
+  IIC_MMX_PHADDSUBD_RR, IIC_MMX_PHADDSUBD_RM
+>;
+}
+
+let Sched = WriteVecIMul in
+def MMX_PMUL_ITINS : OpndItins<
+  IIC_MMX_PMUL, IIC_MMX_PMUL
+>;
+
+let Sched = WriteVecALU in {
+def MMX_PSADBW_ITINS : OpndItins<
+  IIC_MMX_PSADBW, IIC_MMX_PSADBW
+>;
+
+def MMX_MISC_FUNC_ITINS : OpndItins<
+  IIC_MMX_MISC_FUNC_MEM, IIC_MMX_MISC_FUNC_REG
+>;
+}
+
+def MMX_SHIFT_ITINS : ShiftOpndItins<
+  IIC_MMX_SHIFT_RR, IIC_MMX_SHIFT_RM, IIC_MMX_SHIFT_RI
+>;
+
+let Sched = WriteShuffle in {
+def MMX_UNPCK_H_ITINS : OpndItins<
+  IIC_MMX_UNPCK_H_RR, IIC_MMX_UNPCK_H_RM
+>;
+
+def MMX_UNPCK_L_ITINS : OpndItins<
+  IIC_MMX_UNPCK_L, IIC_MMX_UNPCK_L
+>;
+
+def MMX_PCK_ITINS : OpndItins<
+  IIC_MMX_PCK_RR, IIC_MMX_PCK_RM
+>;
+
+def MMX_PSHUF_ITINS : OpndItins<
+  IIC_MMX_PSHUF, IIC_MMX_PSHUF
+>;
+} // Sched
+
+let Sched = WriteCvtF2I in {
+def MMX_CVT_PD_ITINS : OpndItins<
+  IIC_MMX_CVT_PD_RR, IIC_MMX_CVT_PD_RM
+>;
+
+def MMX_CVT_PS_ITINS : OpndItins<
+  IIC_MMX_CVT_PS_RR, IIC_MMX_CVT_PS_RM
+>;
+}
+
+let Constraints = "$src1 = $dst" in {
+  // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
+  // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
+  multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins, bit Commutable = 0> {
+    def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                 (ins VR64:$src1, VR64:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+              Sched<[itins.Sched]> {
+      let isCommutable = Commutable;
+    }
+    def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                 (ins VR64:$src1, i64mem:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1,
+                                   (bitconvert (load_mmx addr:$src2))))],
+                 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+
+  multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                                string OpcodeStr, Intrinsic IntId,
+                                Intrinsic IntId2, ShiftOpndItins itins> {
+    def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                                  (ins VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))], itins.rr>,
+             Sched<[WriteVecShift]>;
+    def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                                  (ins VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1,
+                                    (bitconvert (load_mmx addr:$src2))))],
+                  itins.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
+    def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+                                   (ins VR64:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))], itins.ri>,
+           Sched<[WriteVecShift]>;
+  }
+}
+
+/// Unary MMX instructions requiring SSSE3.
+multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
+                               Intrinsic IntId64, OpndItins itins> {
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>,
+             Sched<[itins.Sched]>;
+
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst,
+                     (IntId64 (bitconvert (memopmmx addr:$src))))],
+                   itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+/// Binary MMX instructions requiring SSSE3.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
+                             Intrinsic IntId64, OpndItins itins> {
+  let isCommutable = 0 in
+  def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
+       (ins VR64:$src1, VR64:$src2),
+        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+       [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>,
+      Sched<[itins.Sched]>;
+  def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
+       (ins VR64:$src1, i64mem:$src2),
+        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+       [(set VR64:$dst,
+         (IntId64 VR64:$src1,
+          (bitconvert (memopmmx addr:$src2))))], itins.rm>,
+      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+}
+
+/// PALIGN MMX instructions (require SSSE3).
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
+  def R64irr  : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+      (ins VR64:$src1, VR64:$src2, i8imm:$src3),
+      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 
+      [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>;
+  def R64irm  : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+      (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
+      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+      [(set VR64:$dst, (IntId VR64:$src1,
+                       (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>;
+}
+
+multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                         Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
+                         string asm, OpndItins itins, Domain d> {
+  def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+                  [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>,
+            Sched<[itins.Sched]>;
+  def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+                  [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>,
+            Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
+                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+                    PatFrag ld_frag, string asm, Domain d> {
+  def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2),
+              asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], 
+              NoItinerary, d>;
+  def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst),
+                   (ins DstRC:$src1, x86memop:$src2), asm,
+              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], 
+              NoItinerary, d>;
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS Instruction
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS  : MMXI<0x77, RawFrm, (outs), (ins), "emms",
+                     [(int_x86_mmx_emms)]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, 
+                         (x86mmx (scalar_to_vector GR32:$src)))],
+                        IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+let canFoldAsLoad = 1 in
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst,
+                        (x86mmx (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+                        "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>,
+                   Sched<[WriteStore]>;
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
+                         "movd\t{$src, $dst|$dst, $src}",
+                         [(set GR32:$dst,
+                          (MMX_X86movd2w (x86mmx VR64:$src)))],
+                          IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+                             "movd\t{$src, $dst|$dst, $src}",
+                             [], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+
+// These are 64 bit moves, but since the OS X assembler doesn't
+// recognize a register-register movq, we write them as
+// movd.
+let SchedRW = [WriteMove] in {
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
+                               (outs GR64:$dst), (ins VR64:$src),
+                               "movd\t{$src, $dst|$dst, $src}", 
+                             [(set GR64:$dst,
+                              (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
+def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+                             "movd\t{$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                              (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>;
+let neverHasSideEffects = 1 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}", [],
+                        IIC_MMX_MOVQ_RR>;
+} // SchedRW
+
+let SchedRW = [WriteLoad] in {
+let canFoldAsLoad = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (load_mmx addr:$src))],
+                        IIC_MMX_MOVQ_RM>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(store (x86mmx VR64:$src), addr:$dst)],
+                        IIC_MMX_MOVQ_RM>;
+} // SchedRW
+
+let SchedRW = [WriteMove] in {
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                             (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (x86mmx (bitconvert
+                               (i64 (vector_extract (v2i64 VR128:$src),
+                                     (iPTR 0))))))],
+                             IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+                              (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                              [(set VR128:$dst,
+                                (v2i64
+                                  (scalar_to_vector
+                                    (i64 (bitconvert (x86mmx VR64:$src))))))],
+                              IIC_MMX_MOVQ_RR>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+                               (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+                               [], IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+                              (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+                              [], IIC_MMX_MOVQ_RR>;
+} // SchedRW
+
+def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+                         "movntq\t{$src, $dst|$dst, $src}",
+                         [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
+                         IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
+
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+                             "movd\t{$src, $dst|$dst, $src}",
+              [(set VR64:$dst,
+                    (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))],
+                            IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
+                           (ins i32mem:$src),
+                             "movd\t{$src, $dst|$dst, $src}",
+          [(set VR64:$dst,
+                (x86mmx (X86vzmovl (x86mmx
+                                   (scalar_to_vector (loadi32 addr:$src))))))],
+                            IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+
+// Arithmetic Instructions
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d,
+                                     MMX_INTALU_ITINS>;
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q,
+                                   MMX_INTALUQ_ITINS, 1>;
+defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w,
+                                   MMX_INTALU_ITINS, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w,
+                                   MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHADDW  : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w,
+                                   MMX_PHADDSUBW>;
+defm MMX_PHADD   : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
+                                   MMX_PHADDSUBD>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
+                                   MMX_PHADDSUBW>;
+
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
+                                   MMX_INTALU_ITINS>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
+                                   MMX_INTALUQ_ITINS, 1>;
+
+defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
+                                   MMX_INTALU_ITINS, 1>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
+                                   MMX_INTALU_ITINS, 1>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
+                                   MMX_INTALU_ITINS, 1>;
+
+defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
+                                   MMX_PHADDSUBW>;
+defm MMX_PHSUBD  : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d,
+                                   MMX_PHADDSUBD>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw,
+                                   MMX_PHADDSUBW>;
+
+// -- Multiplication
+defm MMX_PMULLW  : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w,
+                                     MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,
+                                     MMX_PMUL_ITINS, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w,
+                                     MMX_PMUL_ITINS, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq,
+                                     MMX_PMUL_ITINS, 1>;
+let isCommutable = 1 in
+defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
+                                     int_x86_ssse3_pmul_hr_sw, MMX_PMUL_ITINS>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd,
+                                     MMX_PMUL_ITINS, 1>;
+
+defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
+                                     int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>;
+defm MMX_PAVGB   : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PAVGW   : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMINUB  : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMINSW  : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PMAXUB  : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+defm MMX_PMAXSW  : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w,
+                                     MMX_MISC_FUNC_ITINS, 1>;
+
+defm MMX_PSADBW  : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw,
+                                     MMX_PSADBW_ITINS, 1>;
+
+defm MMX_PSIGNB :  SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b,
+                                        MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGNW :  SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w,
+                                        MMX_MISC_FUNC_ITINS>;
+defm MMX_PSIGND :  SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d,
+                                        MMX_MISC_FUNC_ITINS>;
+let Constraints = "$src1 = $dst" in
+  defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand,
+                                  MMX_INTALU_ITINS, 1>;
+defm MMX_POR  : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,
+                                  MMX_INTALU_ITINS, 1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor,
+                                  MMX_INTALU_ITINS, 1>;
+defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn,
+                                  MMX_INTALU_ITINS>;
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+                                    int_x86_mmx_psrl_w, int_x86_mmx_psrli_w,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+                                    int_x86_mmx_psrl_d, int_x86_mmx_psrli_d,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+                                    int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
+                                    MMX_SHIFT_ITINS>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+                                    int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+                                    int_x86_mmx_psll_d, int_x86_mmx_pslli_d,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+                                    int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
+                                    MMX_SHIFT_ITINS>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+                                    int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
+                                    MMX_SHIFT_ITINS>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+                                    int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
+                                    MMX_SHIFT_ITINS>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d,
+                                     MMX_INTALU_ITINS>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w,
+                                     MMX_INTALU_ITINS>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d,
+                                     MMX_INTALU_ITINS>;
+
+// -- Unpack Instructions
+defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", 
+                                       int_x86_mmx_punpckhbw,
+                                       MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", 
+                                       int_x86_mmx_punpckhwd,
+                                       MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", 
+                                       int_x86_mmx_punpckhdq,
+                                       MMX_UNPCK_H_ITINS>;
+defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", 
+                                       int_x86_mmx_punpcklbw,
+                                       MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", 
+                                       int_x86_mmx_punpcklwd,
+                                       MMX_UNPCK_L_ITINS>;
+defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq",
+                                       int_x86_mmx_punpckldq,
+                                       MMX_UNPCK_L_ITINS>;
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb,
+                                      MMX_PCK_ITINS>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw,
+                                      MMX_PCK_ITINS>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb,
+                                      MMX_PCK_ITINS>;
+
+// -- Shuffle Instructions
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b,
+                                       MMX_PSHUF_ITINS>;
+
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+                          (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
+                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                             (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))],
+                          IIC_MMX_PSHUF>, Sched<[WriteShuffle]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+                          (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
+                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                             (int_x86_sse_pshuf_w (load_mmx addr:$src1),
+                                                   imm:$src2))],
+                          IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
+
+
+
+
+// -- Conversion Instructions
+defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
+                      f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
+                      MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
+defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
+                      f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
+                      MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
+defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
+                       f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
+                       MMX_CVT_PS_ITINS, SSEPackedSingle>, TB;
+defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
+                       f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
+                       MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
+defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
+                         i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
+                         MMX_CVT_PD_ITINS, SSEPackedDouble>, TB, OpSize;
+let Constraints = "$src1 = $dst" in {
+  defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
+                         int_x86_sse_cvtpi2ps,
+                         i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+                          SSEPackedSingle>, TB;
+}
+
+// Extract / Insert
+def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
+                           (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2),
+                           "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                           [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+                                             (iPTR imm:$src2)))],
+                           IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+let Constraints = "$src1 = $dst" in {
+  def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
+                      (outs VR64:$dst), 
+                      (ins VR64:$src1, GR32:$src2, i32i8imm:$src3),
+                      "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+                                        GR32:$src2, (iPTR imm:$src3)))],
+                      IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
+
+  def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
+                     (outs VR64:$dst),
+                     (ins VR64:$src1, i16mem:$src2, i32i8imm:$src3),
+                     "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
+                                         (i32 (anyext (loadi16 addr:$src2))),
+                                       (iPTR imm:$src3)))],
+                     IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+                          "pmovmskb\t{$src, $dst|$dst, $src}",
+                          [(set GR32:$dst, 
+                                (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+
+// Low word of XMM to MMX.
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
+          (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
+          (x86mmx (MMX_MOVQ64rm addr:$src))>;
+
+// Misc.
+let SchedRW = [WriteShuffle] in {
+let Uses = [EDI] in
+def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                        "maskmovq\t{$mask, $src|$src, $mask}",
+                        [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+                        IIC_MMX_MASKMOV>;
+let Uses = [RDI] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                           "maskmovq\t{$mask, $src|$src, $mask}",
+                           [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)],
+                           IIC_MMX_MASKMOV>;
+}
+
+// 64-bit bit convert.
+let Predicates = [HasSSE2] in {
+def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
+          (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
+          (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
+          (MMX_MOVFR642Qrr FR64:$src)>;
+}
+
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 000000000000..384238741b18
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,8328 @@
+//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
+  InstrItinClass rr = arg_rr;
+  InstrItinClass rm = arg_rm;
+  // InstrSchedModel info.
+  X86FoldableSchedWrite Sched = WriteFAdd;
+}
+
+class SizeItins<OpndItins arg_s, OpndItins arg_d> {
+  OpndItins s = arg_s;
+  OpndItins d = arg_d;
+}
+
+
+class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+  InstrItinClass arg_ri> {
+  InstrItinClass rr = arg_rr;
+  InstrItinClass rm = arg_rm;
+  InstrItinClass ri = arg_ri;
+}
+
+
+// scalar
+let Sched = WriteFAdd in {
+def SSE_ALU_F32S : OpndItins<
+  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
+>;
+
+def SSE_ALU_F64S : OpndItins<
+  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
+>;
+}
+
+def SSE_ALU_ITINS_S : SizeItins<
+  SSE_ALU_F32S, SSE_ALU_F64S
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32S : OpndItins<
+  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
+>;
+
+def SSE_MUL_F64S : OpndItins<
+  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
+>;
+}
+
+def SSE_MUL_ITINS_S : SizeItins<
+  SSE_MUL_F32S, SSE_MUL_F64S
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32S : OpndItins<
+  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
+>;
+
+def SSE_DIV_F64S : OpndItins<
+  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
+>;
+}
+
+def SSE_DIV_ITINS_S : SizeItins<
+  SSE_DIV_F32S, SSE_DIV_F64S
+>;
+
+// parallel
+let Sched = WriteFAdd in {
+def SSE_ALU_F32P : OpndItins<
+  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
+>;
+
+def SSE_ALU_F64P : OpndItins<
+  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
+>;
+}
+
+def SSE_ALU_ITINS_P : SizeItins<
+  SSE_ALU_F32P, SSE_ALU_F64P
+>;
+
+let Sched = WriteFMul in {
+def SSE_MUL_F32P : OpndItins<
+  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
+>;
+
+def SSE_MUL_F64P : OpndItins<
+  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
+>;
+}
+
+def SSE_MUL_ITINS_P : SizeItins<
+  SSE_MUL_F32P, SSE_MUL_F64P
+>;
+
+let Sched = WriteFDiv in {
+def SSE_DIV_F32P : OpndItins<
+  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
+>;
+
+def SSE_DIV_F64P : OpndItins<
+  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
+>;
+}
+
+def SSE_DIV_ITINS_P : SizeItins<
+  SSE_DIV_F32P, SSE_DIV_F64P
+>;
+
+def SSE_BIT_ITINS_P : OpndItins<
+  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
+>;
+
+let Sched = WriteVecALU in {
+def SSE_INTALU_ITINS_P : OpndItins<
+  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+def SSE_INTALUQ_ITINS_P : OpndItins<
+  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
+>;
+}
+
+let Sched = WriteVecIMul in
+def SSE_INTMUL_ITINS_P : OpndItins<
+  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
+>;
+
+def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
+  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
+>;
+
+def SSE_MOVA_ITINS : OpndItins<
+  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
+>;
+
+def SSE_MOVU_ITINS : OpndItins<
+  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
+>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 Instructions Classes
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
+multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, X86MemOperand x86memop,
+                           OpndItins itins,
+                           bit Is2Addr = 1> {
+  let isCommutable = 1 in {
+    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>,
+       Sched<[itins.Sched]>;
+  }
+  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
+multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                             string asm, string SSEVer, string FPSizeStr,
+                             Operand memopr, ComplexPattern mem_cpat,
+                             OpndItins itins,
+                             bit Is2Addr = 1> {
+  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (!cast<Intrinsic>(
+                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
+             RC:$src1, RC:$src2))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
+       !if(Is2Addr,
+           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
+                                          SSEVer, "_", OpcodeStr, FPSizeStr))
+             RC:$src1, mem_cpat:$src2))], itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_packed - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, ValueType vt,
+                           X86MemOperand x86memop, PatFrag mem_frag,
+                           Domain d, OpndItins itins, bit Is2Addr = 1> {
+  let isCommutable = 1 in
+    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+       Sched<[itins.Sched]>;
+  let mayLoad = 1 in
+    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+          itins.rm, d>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
+multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
+                                      string OpcodeStr, X86MemOperand x86memop,
+                                      list<dag> pat_rr, list<dag> pat_rm,
+                                      bit Is2Addr = 1> {
+  let isCommutable = 1, hasSideEffects = 0 in
+    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       pat_rr, NoItinerary, d>,
+       Sched<[WriteVecLogic]>;
+  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       pat_rm, NoItinerary, d>,
+       Sched<[WriteVecLogicLd, ReadAfterLd]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+// A vector extract of the first f32/f64 position is a subregister copy
+def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+
+// A 128-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
+          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
+          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
+
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
+          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
+          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
+
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
+          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
+          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
+
+// A 128-bit subvector insert to the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
+          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+          (COPY_TO_REGCLASS FR32:$src, VR128)>;
+def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
+          (COPY_TO_REGCLASS FR32:$src, VR128)>;
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+          (COPY_TO_REGCLASS FR64:$src, VR128)>;
+def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
+          (COPY_TO_REGCLASS FR64:$src, VR128)>;
+
+// Bitcasts between 128-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasSSE2] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
+  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
+  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
+  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
+  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
+  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
+}
+
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
+                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
+  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
+                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX & SSE - Zero/One Vectors
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor / xorp* for sse.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+}
+
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+
+
+// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
+// and doesn't need it because on sandy bridge the register is set to zero
+// at the rename stage without using any execution unit, so SET0PSY
+// and SET0PDY can be used for vector int instructions without penalty
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
+}
+
+let Predicates = [HasAVX] in
+  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
+}
+
+// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
+// VPXOR instruction writes zero to its upper part, it's safe build zeros.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
+          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+}
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero] in {
+  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+  let Predicates = [HasAVX2] in
+  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
+// is used instead. Register-to-register movss/movsd is not modeled as an
+// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
+// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
+                         X86MemOperand x86memop, string base_opc,
+                         string asm_opr> {
+  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
+              (ins VR128:$src1, RC:$src2),
+              !strconcat(base_opc, asm_opr),
+              [(set VR128:$dst, (vt (OpNode VR128:$src1,
+                                 (scalar_to_vector RC:$src2))))],
+              IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;
+
+  // For the disassembler
+  let isCodeGenOnly = 1, hasSideEffects = 0 in
+  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+                  (ins VR128:$src1, RC:$src2),
+                  !strconcat(base_opc, asm_opr),
+                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteMove]>;
+}
+
+multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
+                      X86MemOperand x86memop, string OpcodeStr> {
+  // AVX
+  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
+                              VEX_4V, VEX_LIG;
+
+  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+                     VEX, VEX_LIG, Sched<[WriteStore]>;
+  // SSE1 & 2
+  let Constraints = "$src1 = $dst" in {
+    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+                              "\t{$src2, $dst|$dst, $src2}">;
+  }
+
+  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+                  Sched<[WriteStore]>;
+}
+
+// Loading from memory automatically zeroing upper bits.
+multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
+                         PatFrag mem_pat, string OpcodeStr> {
+  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set RC:$dst, (mem_pat addr:$src))],
+                     IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set RC:$dst, (mem_pat addr:$src))],
+                     IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>;
+}
+
+defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
+defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD;
+
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;
+
+  let AddedComplexity = 20 in
+    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
+}
+
+// Patterns
+let Predicates = [HasAVX] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSrr (v4f32 (V_SET0)),
+                       (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSrr (v4i32 (V_SET0)),
+                       (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 256-bit types
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+  }
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0),
+                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
+                           sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0),
+                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
+                           sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDrr (v2f64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
+
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDrr (v2i64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;
+
+  // Shuffle with VMOVSS
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4i32 VR128:$src1),
+                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (VMOVSSrr (v4f32 VR128:$src1),
+                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;
+
+  // 256-bit variants
+  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  // Shuffle with VMOVSD
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+  // 256-bit variants
+  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
+              sub_xmm)>;
+
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+let Predicates = [UseSSE1] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSS to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm already zeros the high parts of the register.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
+  }
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;
+
+  // Shuffle with MOVSS
+  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+}
+
+let Predicates = [UseSSE2] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+  // MOVSD to the lower bits.
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSDrm already zeros the high parts of the register.
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
+  }
+
+  // Extract and store.
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+                   addr:$dst),
+            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;
+
+  // Shuffle with MOVSD
+  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+
+  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
+  // is during lowering, where it's not possible to recognize the fold cause
+  // it has two uses through a bitcast. One use disappears at isel time and the
+  // fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
+                            X86MemOperand x86memop, PatFrag ld_frag,
+                            string asm, Domain d,
+                            OpndItins itins,
+                            bit IsReMaterializable = 1> {
+let neverHasSideEffects = 1 in
+  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
+           Sched<[WriteMove]>;
+let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
+  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
+           Sched<[WriteLoad]>;
+}
+
+defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+                              TB, VEX;
+defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              TB, OpSize, VEX;
+defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+                              TB, VEX;
+defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+                              TB, OpSize, VEX;
+
+defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
+                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+                              TB, VEX, VEX_L;
+defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              TB, OpSize, VEX, VEX_L;
+defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
+                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+                              TB, VEX, VEX_L;
+defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
+                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+                              TB, OpSize, VEX, VEX_L;
+defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
+                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
+                              TB;
+defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
+                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
+                              TB, OpSize;
+defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
+                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
+                              TB;
+defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
+                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
+                              TB, OpSize;
+
+let SchedRW = [WriteStore] in {
+def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX;
+def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX;
+def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v8f32 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v4f64 VR256:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
+                          (ins VR128:$src),
+                          "movaps\t{$src, $dst|$dst, $src}", [],
+                          IIC_SSE_MOVA_P_RR>, VEX;
+  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movapd\t{$src, $dst|$dst, $src}", [],
+                           IIC_SSE_MOVA_P_RR>, VEX;
+  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movups\t{$src, $dst|$dst, $src}", [],
+                           IIC_SSE_MOVU_P_RR>, VEX;
+  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
+                           (ins VR128:$src),
+                           "movupd\t{$src, $dst|$dst, $src}", [],
+                           IIC_SSE_MOVU_P_RR>, VEX;
+  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movaps\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movapd\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movups\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
+                            (ins VR256:$src),
+                            "movupd\t{$src, $dst|$dst, $src}", [],
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v8i32 (X86vzmovl
+                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+def : Pat<(v4i64 (X86vzmovl
+                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+def : Pat<(v8f32 (X86vzmovl
+                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+def : Pat<(v4f64 (X86vzmovl
+                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+}
+
+
+def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
+          (VMOVUPSYmr addr:$dst, VR256:$src)>;
+def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
+          (VMOVUPDYmr addr:$dst, VR256:$src)>;
+
+let SchedRW = [WriteStore] in {
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>;
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVA_P_MR>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v4f32 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v2f64 VR128:$src), addr:$dst)],
+                   IIC_SSE_MOVU_P_MR>;
+} // SchedRW
+
+// For disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movaps\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVA_P_RR>;
+  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movapd\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVA_P_RR>;
+  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movups\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVU_P_RR>;
+  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                         "movupd\t{$src, $dst|$dst, $src}", [],
+                         IIC_SSE_MOVU_P_RR>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+            (VMOVUPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE1] in
+  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+let Predicates = [UseSSE2] in
+  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+            (MOVUPDmr addr:$dst, VR128:$src)>;
+
+// Use vmovaps/vmovups for AVX integer load/store.
+let Predicates = [HasAVX] in {
+  // 128-bit load/store
+  def : Pat<(alignedloadv2i64 addr:$src),
+            (VMOVAPSrm addr:$src)>;
+  def : Pat<(loadv2i64 addr:$src),
+            (VMOVUPSrm addr:$src)>;
+
+  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+            (VMOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+            (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+  // 256-bit load/store
+  def : Pat<(alignedloadv4i64 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(loadv4i64 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+  // Special patterns for storing subvector extracts of lower 128-bits
+  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+  def : Pat<(alignedstore (v2f64 (extract_subvector
+                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4f32 (extract_subvector
+                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v2i64 (extract_subvector
+                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4i32 (extract_subvector
+                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v8i16 (extract_subvector
+                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v16i8 (extract_subvector
+                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+  def : Pat<(store (v2f64 (extract_subvector
+                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4f32 (extract_subvector
+                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v2i64 (extract_subvector
+                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4i32 (extract_subvector
+                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v8i16 (extract_subvector
+                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v16i8 (extract_subvector
+                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+}
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [UseSSE1] in {
+  def : Pat<(alignedloadv2i64 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(loadv2i64 addr:$src),
+            (MOVUPSrm addr:$src)>;
+
+  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+            (MOVAPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+            (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
+// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
+def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                       "movaps\t{$src, $dst|$dst, $src}", [],
+                       IIC_SSE_MOVA_P_RR>, VEX;
+def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                       "movapd\t{$src, $dst|$dst, $src}", [],
+                       IIC_SSE_MOVA_P_RR>, VEX;
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                     "movaps\t{$src, $dst|$dst, $src}", [],
+                     IIC_SSE_MOVA_P_RR>;
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                     "movapd\t{$src, $dst|$dst, $src}", [],
+                     IIC_SSE_MOVA_P_RR>;
+}
+
+// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
+let isCodeGenOnly = 1 in {
+  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+                         "movaps\t{$src, $dst|$dst, $src}",
+                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+                         IIC_SSE_MOVA_P_RM>, VEX;
+  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+                         "movapd\t{$src, $dst|$dst, $src}",
+                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+                         IIC_SSE_MOVA_P_RM>, VEX;
+}
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+                     "movaps\t{$src, $dst|$dst, $src}",
+                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+                     IIC_SSE_MOVA_P_RM>;
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+                     "movapd\t{$src, $dst|$dst, $src}",
+                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+                     IIC_SSE_MOVA_P_RM>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
+                                      string base_opc, string asm_opr,
+                                      InstrItinClass itin> {
+  def PSrm : PI<opc, MRMSrcMem,
+         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+         !strconcat(base_opc, "s", asm_opr),
+     [(set VR128:$dst,
+       (psnode VR128:$src1,
+              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
+              itin, SSEPackedSingle>, TB,
+     Sched<[WriteShuffleLd, ReadAfterLd]>;
+
+  def PDrm : PI<opc, MRMSrcMem,
+         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+         !strconcat(base_opc, "d", asm_opr),
+     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
+                              (scalar_to_vector (loadf64 addr:$src2)))))],
+              itin, SSEPackedDouble>, TB, OpSize,
+     Sched<[WriteShuffleLd, ReadAfterLd]>;
+
+}
+
+multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
+                                 string base_opc, InstrItinClass itin> {
+  defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                                    itin>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+  defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+                                    "\t{$src2, $dst|$dst, $src2}",
+                                    itin>;
+}
+
+let AddedComplexity = 20 in {
+  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
+                                    IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>, VEX;
+def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>, VEX;
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>;
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)],
+                                 IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [HasAVX] in {
+  // Shuffle with VMOVLPS
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (VMOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (VMOVLPSrm VR128:$src1, addr:$src2)>;
+
+  // Shuffle with VMOVLPD
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (VMOVLPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (X86Movlps
+                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
+            (VMOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                   addr:$src1),
+            (VMOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+  def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
+                                 (iPTR 0))), addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+
+  // Shuffle with MOVLPS
+  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlps VR128:$src1,
+                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+                                      addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v4i32 (X86Movlps
+                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+                              addr:$src1),
+            (MOVLPSmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Shuffle with MOVLPD
+  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+            (MOVLPDrm VR128:$src1, addr:$src2)>;
+
+  // Store patterns
+  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                           addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+                           addr:$src1),
+            (MOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Hi packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
+                                    IIC_SSE_MOV_LH>;
+}
+
+let SchedRW = [WriteStore] in {
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+                                            (bc_v2f64 (v4f32 VR128:$src))),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
+                                            (bc_v2f64 (v4f32 VR128:$src))),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
+} // SchedRW
+
+let Predicates = [HasAVX] in {
+  // VMOVHPS patterns
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+            (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
+  // is during lowering, where it's not possible to recognize the load fold
+  // cause it has two uses through a bitcast. One use disappears at isel time
+  // and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (VMOVHPDrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // MOVHPS patterns
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86Movlhps VR128:$src1,
+                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+            (MOVHPSrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
+  // is during lowering, where it's not possible to recognize the load fold
+  // cause it has two uses through a bitcast. One use disappears at isel time
+  // and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
+                      (scalar_to_vector (loadf64 addr:$src2)))),
+            (MOVHPDrm VR128:$src1, addr:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>,
+                      VEX_4V, Sched<[WriteShuffle]>;
+  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>,
+                      VEX_4V, Sched<[WriteShuffle]>;
+}
+let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
+  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movlhps\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+                                       (ins VR128:$src1, VR128:$src2),
+                      "movhlps\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
+                        IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+}
+
+let Predicates = [HasAVX] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [UseSSE1] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
+
+def SSE_CVT_PD : OpndItins<
+  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_PS : OpndItins<
+  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
+
+let Sched = WriteCvtI2F in
+def SSE_CVT_Scalar : OpndItins<
+  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_32 : OpndItins<
+  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_64 : OpndItins<
+  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
+>;
+
+let Sched = WriteCvtF2I in
+def SSE_CVT_SD2SI : OpndItins<
+  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
+>;
+
+multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+                     string asm, OpndItins itins> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
+                        itins.rr>, Sched<[itins.Sched]>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
+                        itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                       X86MemOperand x86memop, string asm, Domain d,
+                       OpndItins itins> {
+let neverHasSideEffects = 1 in {
+  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+             [], itins.rr, d>, Sched<[itins.Sched]>;
+  let mayLoad = 1 in
+  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
+}
+}
+
+multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          X86MemOperand x86memop, string asm> {
+let neverHasSideEffects = 1 in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+           Sched<[WriteCvtI2F]>;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+           Sched<[WriteCvtI2FLd, ReadAfterLd]>;
+} // neverHasSideEffects = 1
+}
+
+defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+                                "cvttss2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SS2SI_32>,
+                                XS, VEX, VEX_LIG;
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+                                "cvttss2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SS2SI_64>,
+                                XS, VEX, VEX_W, VEX_LIG;
+defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+                                "cvttsd2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SD2SI>,
+                                XD, VEX, VEX_LIG;
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+                                "cvttsd2si\t{$src, $dst|$dst, $src}",
+                                SSE_CVT_SD2SI>,
+                                XD, VEX, VEX_W, VEX_LIG;
+
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+// The assembler can recognize rr 64-bit instructions by seeing a rxx
+// register, but the same isn't true when only using memory operands,
+// provide other assembly "l" and "q" forms to address this explicitly
+// where appropriate to do so.
+defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
+                                  XS, VEX_4V, VEX_LIG;
+defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
+                                  XS, VEX_4V, VEX_W, VEX_LIG;
+defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
+                                  XD, VEX_4V, VEX_LIG;
+defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
+                                  XD, VEX_4V, VEX_W, VEX_LIG;
+
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+  def : Pat<(f32 (sint_to_fp GR32:$src)),
+            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+  def : Pat<(f32 (sint_to_fp GR64:$src)),
+            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+  def : Pat<(f64 (sint_to_fp GR32:$src)),
+            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+  def : Pat<(f64 (sint_to_fp GR64:$src)),
+            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SS2SI_32>, XS;
+defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SS2SI_64>, XS, REX_W;
+defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SD2SI>, XD;
+defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_SD2SI>, XD, REX_W;
+defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
+                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XS;
+defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
+                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XS, REX_W;
+defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
+                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XD;
+defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
+                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+                      SSE_CVT_Scalar>, XD, REX_W;
+
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+                (CVTSI2SSrm FR64:$dst, i32mem:$src)>;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+                (CVTSI2SDrm FR64:$dst, i32mem:$src)>;
+
+// Conversion Instructions Intrinsics - Match intrinsics which expect MM
+// and/or XMM operand(s).
+
+multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+                         string asm, OpndItins itins> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
+           Sched<[itins.Sched]>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
+           Sched<[itins.Sched.Folded]>;
+}
+
+multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
+                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
+                    PatFrag ld_frag, string asm, OpndItins itins,
+                    bit Is2Addr = 1> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
+              !if(Is2Addr,
+                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+              itins.rr>, Sched<[itins.Sched]>;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src2),
+              !if(Is2Addr,
+                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
+                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
+                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
+                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
+                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
+
+defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
+                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
+defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
+                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+
+
+defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+          int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
+          SSE_CVT_Scalar, 0>, XS, VEX_4V;
+defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+          int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
+          SSE_CVT_Scalar, 0>, XS, VEX_4V,
+          VEX_W;
+defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+          int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
+          SSE_CVT_Scalar, 0>, XD, VEX_4V;
+defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+          int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
+          SSE_CVT_Scalar, 0>, XD,
+          VEX_4V, VEX_W;
+
+let Constraints = "$src1 = $dst" in {
+  defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                        int_x86_sse_cvtsi2ss, i32mem, loadi32,
+                        "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
+  defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                        int_x86_sse_cvtsi642ss, i64mem, loadi64,
+                        "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
+  defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+                        int_x86_sse2_cvtsi2sd, i32mem, loadi32,
+                        "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
+  defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+                        int_x86_sse2_cvtsi642sd, i64mem, loadi64,
+                        "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
+}
+
+/// SSE 1 Only
+
+// Aliases for intrinsics
+defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+                                    ssmem, sse_load_f32, "cvttss2si",
+                                    SSE_CVT_SS2SI_32>, XS, VEX;
+defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+                                   "cvttss2si", SSE_CVT_SS2SI_64>,
+                                   XS, VEX, VEX_W;
+defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+                                    sdmem, sse_load_f64, "cvttsd2si",
+                                    SSE_CVT_SD2SI>, XD, VEX;
+defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+                                  "cvttsd2si", SSE_CVT_SD2SI>,
+                                  XD, VEX, VEX_W;
+defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+                                    ssmem, sse_load_f32, "cvttss2si",
+                                    SSE_CVT_SS2SI_32>, XS;
+defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
+defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+                                    sdmem, sse_load_f64, "cvttsd2si",
+                                    SSE_CVT_SD2SI>, XD;
+defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
+
+defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+                                  ssmem, sse_load_f32, "cvtss2si",
+                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+                                  ssmem, sse_load_f32, "cvtss2si",
+                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
+
+defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
+                               ssmem, sse_load_f32, "cvtss2si",
+                               SSE_CVT_SS2SI_32>, XS;
+defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
+                                 ssmem, sse_load_f32, "cvtss2si",
+                                 SSE_CVT_SS2SI_64>, XS, REX_W;
+
+defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+                               SSEPackedSingle, SSE_CVT_PS>,
+                               TB, VEX, Requires<[HasAVX]>;
+defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
+                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
+                               SSEPackedSingle, SSE_CVT_PS>,
+                               TB, VEX, VEX_L, Requires<[HasAVX]>;
+
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
+                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
+                            SSEPackedSingle, SSE_CVT_PS>,
+                            TB, Requires<[UseSSE2]>;
+
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                (CVTSD2SI64rm GR64:$dst, sdmem:$src)>;
+
+/// SSE 2 Only
+
+// Convert scalar double to scalar single
+let neverHasSideEffects = 1 in {
+def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
+                       (ins FR64:$src1, FR64:$src2),
+                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
+                      Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
+                       (ins FR64:$src1, f64mem:$src2),
+                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [], IIC_SSE_CVT_Scalar_RM>,
+                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
+                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+          Requires<[HasAVX]>;
+
+def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround FR64:$src))],
+                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
+def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
+                      IIC_SSE_CVT_Scalar_RM>,
+                      XD,
+                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
+                       Sched<[WriteCvtF2F]>;
+def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+                                          VR128:$src1, sse_load_f64:$src2))],
+                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
+                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
+                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
+                       Sched<[WriteCvtF2F]>;
+def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
+                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
+                                          VR128:$src1, sse_load_f64:$src2))],
+                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
+                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// Convert scalar single to scalar double
+// SSE2 instructions with XS prefix
+let neverHasSideEffects = 1 in {
+def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
+                    (ins FR32:$src1, FR32:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [], IIC_SSE_CVT_Scalar_RR>,
+                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
+                    Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
+                    (ins FR32:$src1, f32mem:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [], IIC_SSE_CVT_Scalar_RM>,
+                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+def : Pat<(f64 (fextend FR32:$src)),
+    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+    Requires<[HasAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+    Requires<[HasAVX, OptForSpeed]>;
+
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+                   "cvtss2sd\t{$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (fextend FR32:$src))],
+                   IIC_SSE_CVT_Scalar_RR>, XS,
+                 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+                   "cvtss2sd\t{$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (extloadf32 addr:$src))],
+                   IIC_SSE_CVT_Scalar_RM>, XS,
+                 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
+
+// extload f32 -> f64.  This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.
+// Since these loads aren't folded into the fextend, we have to match it
+// explicitly here.
+def : Pat<(fextend (loadf32 addr:$src)),
+          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
+def : Pat<(extloadf32 addr:$src),
+          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
+
+def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
+                    Sched<[WriteCvtF2F]>;
+def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
+                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
+                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
+                    Sched<[WriteCvtF2F]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
+                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
+                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+}
+
+// Convert packed single/double fp to doubleword
+def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvtps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                        "cvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR256:$dst,
+                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
+                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                        "cvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR256:$dst,
+                          (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
+                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
+                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+
+// Convert Packed Double FP to Packed DW Integers
+let Predicates = [HasAVX] in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+                       VEX, Sched<[WriteCvtF2I]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQrr VR128:$dst, VR128:$src)>;
+def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX,
+                       Sched<[WriteCvtF2ILd]>;
+
+// YMM only
+def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
+                       Sched<[WriteCvtF2I]>;
+def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>,
+                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
+                (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
+}
+
+def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
+                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
+def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
+                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+
+// Convert with truncation packed single/double fp to doubleword
+// SSE2 packed instructions with XS prefix
+def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                         "cvttps2dq\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                           (int_x86_sse2_cvttps2dq VR128:$src))],
+                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                         "cvttps2dq\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+                                            (memopv4f32 addr:$src)))],
+                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                          "cvttps2dq\t{$src, $dst|$dst, $src}",
+                          [(set VR256:$dst,
+                            (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
+                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                          "cvttps2dq\t{$src, $dst|$dst, $src}",
+                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
+                                             (memopv8f32 addr:$src)))],
+                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
+                          Sched<[WriteCvtF2ILd]>;
+
+def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvttps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
+                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvttps2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
+                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+            (VCVTDQ2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+            (VCVTDQ2PSrm addr:$src)>;
+
+  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+            (VCVTDQ2PSrr VR128:$src)>;
+  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+            (VCVTDQ2PSrm addr:$src)>;
+
+  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+            (VCVTTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+            (VCVTTPS2DQrm addr:$src)>;
+
+  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
+            (VCVTDQ2PSYrr VR256:$src)>;
+  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))),
+            (VCVTDQ2PSYrm addr:$src)>;
+
+  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
+            (VCVTTPS2DQYrr VR256:$src)>;
+  def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))),
+            (VCVTTPS2DQYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+            (CVTDQ2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+            (CVTDQ2PSrm addr:$src)>;
+
+  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
+            (CVTDQ2PSrr VR128:$src)>;
+  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+            (CVTDQ2PSrm addr:$src)>;
+
+  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+            (CVTTPS2DQrr VR128:$src)>;
+  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+            (CVTTPS2DQrm addr:$src)>;
+}
+
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                              (int_x86_sse2_cvttpd2dq VR128:$src))],
+                              IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+
+// XMM only
+def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
+                (VCVTTPD2DQrr VR128:$dst, VR128:$src)>;
+def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+                                            (memopv2f64 addr:$src)))],
+                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+
+// YMM only
+def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                           (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
+                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst,
+                          (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))],
+                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
+                (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+            (VCVTTPD2DQYrr VR256:$src)>;
+  def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
+            (VCVTTPD2DQYrm addr:$src)>;
+} // Predicates = [HasAVX]
+
+def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
+                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
+def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+                                        (memopv2f64 addr:$src)))],
+                                        IIC_SSE_CVT_PD_RM>,
+                      Sched<[WriteCvtF2ILd]>;
+
+// Convert packed single to packed double
+let Predicates = [HasAVX] in {
+                  // SSE2 instructions without OpSize prefix
+def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+                     IIC_SSE_CVT_PD_RR>, TB, VEX, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                    IIC_SSE_CVT_PD_RM>, TB, VEX, Sched<[WriteCvtF2FLd]>;
+def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
+                     IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
+                     IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtps2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
+                       IIC_SSE_CVT_PD_RR>, TB, Sched<[WriteCvtF2F]>;
+def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                   "cvtps2pd\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+                   IIC_SSE_CVT_PD_RM>, TB, Sched<[WriteCvtF2FLd]>;
+}
+
+// Convert Packed DW Integers to Packed Double FP
+let Predicates = [HasAVX] in {
+let neverHasSideEffects = 1, mayLoad = 1 in
+def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     []>, VEX, Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
+                   Sched<[WriteCvtI2F]>;
+def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvtdq2_pd_256
+                        (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L,
+                    Sched<[WriteCvtI2FLd]>;
+def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR256:$dst,
+                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
+                    Sched<[WriteCvtI2F]>;
+}
+
+let neverHasSideEffects = 1, mayLoad = 1 in
+def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                       "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
+                       IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
+def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
+                       IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2F]>;
+
+// AVX 256-bit register conversion intrinsics
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
+            (VCVTDQ2PDYrr VR128:$src)>;
+  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+            (VCVTDQ2PDYrm addr:$src)>;
+} // Predicates = [HasAVX]
+
+// Convert packed double to packed single
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
+def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+
+// XMM only
+def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
+def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+
+// YMM only
+def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
+                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))],
+                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
+                (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
+                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
+
+
+// AVX 256-bit register conversion intrinsics
+// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
+// whenever possible to avoid declaring two versions of each one.
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
+            (VCVTDQ2PSYrr VR256:$src)>;
+  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
+            (VCVTDQ2PSYrm addr:$src)>;
+
+  // Match fround and fextend for 128/256-bit conversions
+  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+            (VCVTPD2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+            (VCVTPD2PSXrm addr:$src)>;
+  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
+            (VCVTPD2PSYrr VR256:$src)>;
+  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
+            (VCVTPD2PSYrm addr:$src)>;
+
+  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+            (VCVTPS2PDrr VR128:$src)>;
+  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
+            (VCVTPS2PDYrr VR128:$src)>;
+  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
+            (VCVTPS2PDYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Match fround and fextend for 128 conversions
+  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+            (CVTPD2PSrr VR128:$src)>;
+  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+            (CVTPD2PSrm addr:$src)>;
+
+  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
+            (CVTPS2PDrr VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
+multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+                            Operand CC, SDNode OpNode, ValueType VT,
+                            PatFrag ld_frag, string asm, string asm_alt,
+                            OpndItins itins> {
+  def rr : SIi8<0xC2, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
+                itins.rr>, Sched<[itins.Sched]>;
+  def rm : SIi8<0xC2, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+                [(set RC:$dst, (OpNode (VT RC:$src1),
+                                         (ld_frag addr:$src2), imm:$cc))],
+                                         itins.rm>,
+           Sched<[itins.Sched.Folded, ReadAfterLd]>;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let neverHasSideEffects = 1 in {
+    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
+                      (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
+                      IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
+    let mayLoad = 1 in
+    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
+                      (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
+                      IIC_SSE_ALU_F32S_RM>,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmpss, f32, loadf32,
+                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+                 SSE_ALU_F32S>,
+                 XS, VEX_4V, VEX_LIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmpsd, f64, loadf64,
+                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+                 SSE_ALU_F32S>, // same latency as 32 bit compare
+                 XD, VEX_4V, VEX_LIG;
+
+let Constraints = "$src1 = $dst" in {
+  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmpss, f32, loadf32,
+                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
+                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
+                  XS;
+  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
+                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
+                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                  SSE_ALU_F32S>, // same latency as 32 bit compare
+                  XD;
+}
+
+multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
+                         Intrinsic Int, string asm, OpndItins itins> {
+  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
+                        [(set VR128:$dst, (Int VR128:$src1,
+                                               VR128:$src, imm:$cc))],
+                                               itins.rr>,
+           Sched<[itins.Sched]>;
+  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
+                        [(set VR128:$dst, (Int VR128:$src1,
+                                               (load addr:$src), imm:$cc))],
+                                               itins.rm>,
+           Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+// Aliases to match intrinsics which expect XMM operand(s).
+defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
+                     "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+                     SSE_ALU_F32S>,
+                     XS, VEX_4V;
+defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
+                     "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+                     SSE_ALU_F32S>, // same latency as f32
+                     XD, VEX_4V;
+let Constraints = "$src1 = $dst" in {
+  defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
+                       "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+                       SSE_ALU_F32S>, XS;
+  defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
+                       "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+                       SSE_ALU_F32S>, // same latency as f32
+                       XD;
+}
+
+
+// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
+multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
+                            ValueType vt, X86MemOperand x86memop,
+                            PatFrag ld_frag, string OpcodeStr, Domain d> {
+  def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
+                     IIC_SSE_COMIS_RR, d>,
+          Sched<[WriteFAdd]>;
+  def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+                     [(set EFLAGS, (OpNode (vt RC:$src1),
+                                           (ld_frag addr:$src2)))],
+                                           IIC_SSE_COMIS_RM, d>,
+          Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Defs = [EFLAGS] in {
+  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+                                  "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
+  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
+                                  VEX_LIG;
+  let Pattern = []<dag> in {
+    defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
+                                    "comiss", SSEPackedSingle>, TB, VEX,
+                                    VEX_LIG;
+    defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
+                                    "comisd", SSEPackedDouble>, TB, OpSize, VEX,
+                                    VEX_LIG;
+  }
+
+  defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+                            load, "ucomiss", SSEPackedSingle>, TB, VEX;
+  defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+                            load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;
+
+  defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
+                            load, "comiss", SSEPackedSingle>, TB, VEX;
+  defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
+                            load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
+  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
+                                  "ucomiss", SSEPackedSingle>, TB;
+  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd", SSEPackedDouble>, TB, OpSize;
+
+  let Pattern = []<dag> in {
+    defm COMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
+                                    "comiss", SSEPackedSingle>, TB;
+    defm COMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
+                                    "comisd", SSEPackedDouble>, TB, OpSize;
+  }
+
+  defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
+                              load, "ucomiss", SSEPackedSingle>, TB;
+  defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
+                              load, "ucomisd", SSEPackedDouble>, TB, OpSize;
+
+  defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
+                                  "comiss", SSEPackedSingle>, TB;
+  defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
+                                  "comisd", SSEPackedDouble>, TB, OpSize;
+} // Defs = [EFLAGS]
+
+// sse12_cmp_packed - sse 1 & 2 compare packed instructions
+multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
+                            Operand CC, Intrinsic Int, string asm,
+                            string asm_alt, Domain d> {
+  def rri : PIi8<0xC2, MRMSrcReg,
+             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+             [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
+             IIC_SSE_CMPP_RR, d>,
+            Sched<[WriteFAdd]>;
+  def rmi : PIi8<0xC2, MRMSrcMem,
+             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+             [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
+             IIC_SSE_CMPP_RM, d>,
+            Sched<[WriteFAddLd, ReadAfterLd]>;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let neverHasSideEffects = 1 in {
+    def rri_alt : PIi8<0xC2, MRMSrcReg,
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_CMPP_RR, d>, Sched<[WriteFAdd]>;
+    def rmi_alt : PIi8<0xC2, MRMSrcMem,
+               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_CMPP_RM, d>,
+               Sched<[WriteFAddLd, ReadAfterLd]>;
+  }
+}
+
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
+               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle>, TB, VEX_4V;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
+               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
+               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle>, TB, VEX_4V, VEX_L;
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
+               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in {
+  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
+                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                 SSEPackedSingle>, TB;
+  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
+                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+                 SSEPackedDouble>, TB, OpSize;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
+def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
+          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE1] in {
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE2] in {
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Shuffle Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_shuffle - sse 1 & 2 shuffle instructions
+multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
+                         ValueType vt, string asm, PatFrag mem_frag,
+                         Domain d, bit IsConvertibleToThreeAddress = 0> {
+  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+            Sched<[WriteShuffleLd, ReadAfterLd]>;
+  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
+    def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
+              Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           memopv4f32, SSEPackedSingle>, TB, VEX_4V;
+defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
+           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
+defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
+           memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
+           memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in {
+  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
+                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>,
+                    TB;
+  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
+                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>,
+                    TB, OpSize;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (X86Shufp VR128:$src1,
+                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  def : Pat<(v2i64 (X86Shufp VR128:$src1,
+                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+  // 256-bit patterns
+  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v8i32 (X86Shufp VR256:$src1,
+                      (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+  def : Pat<(v4i64 (X86Shufp VR256:$src1,
+                              (memopv4i64 addr:$src2), (i8 imm:$imm))),
+            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE1] in {
+  def : Pat<(v4i32 (X86Shufp VR128:$src1,
+                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // Generic SHUFPD patterns
+  def : Pat<(v2i64 (X86Shufp VR128:$src1,
+                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
+            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_unpack_interleave - sse 1 & 2 unpack and interleave
+multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
+                                   PatFrag mem_frag, RegisterClass RC,
+                                   X86MemOperand x86memop, string asm,
+                                   Domain d> {
+    def rr : PI<opc, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1, RC:$src2)))],
+                           IIC_SSE_UNPCK, d>, Sched<[WriteShuffle]>;
+    def rm : PI<opc, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1,
+                                       (mem_frag addr:$src2))))],
+                                       IIC_SSE_UNPCK, d>,
+             Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, TB, VEX_4V;
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, TB, OpSize, VEX_4V;
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, TB, VEX_4V;
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, TB, OpSize, VEX_4V;
+
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
+      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, TB, VEX_4V, VEX_L;
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
+      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
+      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedSingle>, TB, VEX_4V, VEX_L;
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
+      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in {
+  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedSingle>, TB;
+  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedDouble>, TB, OpSize;
+  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedSingle>, TB;
+  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
+                       SSEPackedDouble>, TB, OpSize;
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
+  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
+  // problem is during lowering, where it's not possible to recognize the load
+  // fold cause it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movddup VR128:$src)),
+            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
+  // problem is during lowering, where it's not possible to recognize the load
+  // fold cause it has two uses through a bitcast. One use disappears at isel
+  // time and the fold opportunity reappears.
+  def : Pat<(v2f64 (X86Movddup VR128:$src)),
+            (UNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Extract Floating-Point Sign mask
+//===----------------------------------------------------------------------===//
+
+/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
+multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
+                                Domain d> {
+  def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
+                !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+                     [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
+             Sched<[WriteVecLogic]>;
+  def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src),
+                !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [],
+                IIC_SSE_MOVMSK, d>, REX_W, Sched<[WriteVecLogic]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
+                                        "movmskps", SSEPackedSingle>, TB, VEX;
+  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
+                                        "movmskpd", SSEPackedDouble>, TB,
+                                        OpSize, VEX;
+  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
+                                        "movmskps", SSEPackedSingle>, TB,
+                                        VEX, VEX_L;
+  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
+                                        "movmskpd", SSEPackedDouble>, TB,
+                                        OpSize, VEX, VEX_L;
+
+  def : Pat<(i32 (X86fgetsign FR32:$src)),
+            (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(i64 (X86fgetsign FR32:$src)),
+            (VMOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+  def : Pat<(i32 (X86fgetsign FR64:$src)),
+            (VMOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>;
+  def : Pat<(i64 (X86fgetsign FR64:$src)),
+            (VMOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+  // Assembler Only
+  def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+             "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+             SSEPackedSingle>, TB, VEX, Sched<[WriteVecLogic]>;
+  def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+             "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+             SSEPackedDouble>, TB,
+             OpSize, VEX, Sched<[WriteVecLogic]>;
+  def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+             "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+             SSEPackedSingle>, TB, VEX, VEX_L, Sched<[WriteVecLogic]>;
+  def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+             "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
+             SSEPackedDouble>, TB,
+             OpSize, VEX, VEX_L, Sched<[WriteVecLogic]>;
+}
+
+defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
+                                     SSEPackedSingle>, TB;
+defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
+                                     SSEPackedDouble>, TB, OpSize;
+
+def : Pat<(i32 (X86fgetsign FR32:$src)),
+          (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+      Requires<[UseSSE1]>;
+def : Pat<(i64 (X86fgetsign FR32:$src)),
+          (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+      Requires<[UseSSE1]>;
+def : Pat<(i32 (X86fgetsign FR64:$src)),
+          (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+      Requires<[UseSSE2]>;
+def : Pat<(i64 (X86fgetsign FR64:$src)),
+          (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+      Requires<[UseSSE2]>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                        X86MemOperand x86memop, OpndItins itins,
+                        bit IsCommutable, bit Is2Addr> {
+  let isCommutable = IsCommutable in
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1,
+                                     (bitconvert (memop_frag addr:$src2)))))],
+                                     itins.rm>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
+                         ValueType OpVT128, ValueType OpVT256,
+                         OpndItins itins, bit IsCommutable = 0> {
+let Predicates = [HasAVX] in
+  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
+                    VR128, memopv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
+                           memopv2i64, i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2] in
+  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
+                               OpVT256, VR256, memopv4i64, i256mem, itins,
+                               IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
+// These are ordered here for pattern ordering requirements with the fp versions
+
+defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
+                           SSE_BIT_ITINS_P, 0>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Logical Instructions
+//===----------------------------------------------------------------------===//
+
+/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
+///
+multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
+                                       SDNode OpNode, OpndItins itins> {
+  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
+              TB, VEX_4V;
+
+  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
+        TB, OpSize, VEX_4V;
+
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
+                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
+                TB;
+
+    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
+                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
+                TB, OpSize;
+  }
+}
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+              SSE_BIT_ITINS_P>;
+defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+              SSE_BIT_ITINS_P>;
+defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+              SSE_BIT_ITINS_P>;
+
+let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
+  defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
+                SSE_BIT_ITINS_P>;
+
+/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
+///
+multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
+                                   SDNode OpNode> {
+  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
+        !strconcat(OpcodeStr, "ps"), f256mem,
+        [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
+                           (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
+
+  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
+        !strconcat(OpcodeStr, "pd"), f256mem,
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+                                  (bc_v4i64 (v4f64 VR256:$src2))))],
+        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+                                  (memopv4i64 addr:$src2)))], 0>,
+                                  TB, OpSize, VEX_4V, VEX_L;
+
+  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
+  // are all promoted to v2i64, and the patterns are covered by the int
+  // version. This is needed in SSE only, because v2i64 isn't supported on
+  // SSE1, but only on SSE2.
+  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+       !strconcat(OpcodeStr, "ps"), f128mem, [],
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+                                 (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V;
+
+  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+       !strconcat(OpcodeStr, "pd"), f128mem,
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                 (bc_v2i64 (v2f64 VR128:$src2))))],
+       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                 (memopv2i64 addr:$src2)))], 0>,
+                                                 TB, OpSize, VEX_4V;
+
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+         !strconcat(OpcodeStr, "ps"), f128mem,
+         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+                                   (memopv2i64 addr:$src2)))]>, TB;
+
+    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+         !strconcat(OpcodeStr, "pd"), f128mem,
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                   (bc_v2i64 (v2f64 VR128:$src2))))],
+         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+                                   (memopv2i64 addr:$src2)))]>, TB, OpSize;
+  }
+}
+
+defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
+defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
+defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
+let isCommutable = 0 in
+  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Arithmetic Instructions
+//===----------------------------------------------------------------------===//
+
+/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
+/// vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem.
+///
+
+/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
+/// classes below
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SizeItins itins,
+                                  bit Is2Addr = 1> {
+  defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+                            OpNode, FR32, f32mem,
+                            itins.s, Is2Addr>, XS;
+  defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+                            OpNode, FR64, f64mem,
+                            itins.d, Is2Addr>, XD;
+}
+
+multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, SizeItins itins> {
+let Predicates = [HasAVX] in {
+  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+                               VR128, v4f32, f128mem, memopv4f32,
+                               SSEPackedSingle, itins.s, 0>, TB, VEX_4V;
+  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+                               VR128, v2f64, f128mem, memopv2f64,
+                               SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V;
+
+  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
+                        OpNode, VR256, v8f32, f256mem, memopv8f32,
+                        SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L;
+  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
+                        OpNode, VR256, v4f64, f256mem, memopv4f64,
+                        SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+                            v4f32, f128mem, memopv4f32, SSEPackedSingle,
+                            itins.s, 1>, TB;
+  defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+                            v2f64, f128mem, memopv2f64, SSEPackedDouble,
+                            itins.d, 1>, TB, OpSize;
+}
+}
+
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+                                      SizeItins itins,
+                                      bit Is2Addr = 1> {
+  defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+     !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+     itins.s, Is2Addr>, XS;
+  defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+     !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+     itins.d, Is2Addr>, XD;
+}
+
+// Binary Arithmetic instructions
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>;
+let isCommutable = 0 in {
+  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>;
+  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>;
+  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>;
+  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>;
+}
+
+let isCodeGenOnly = 1 in {
+  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
+  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
+}
+
+defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>,
+            basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>,
+              VEX_4V, VEX_LIG;
+defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>,
+            basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>,
+              VEX_4V, VEX_LIG;
+
+let isCommutable = 0 in {
+  defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>,
+              basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
+                VEX_4V, VEX_LIG;
+  defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
+              basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
+                VEX_4V, VEX_LIG;
+  defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>,
+              basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>,
+                VEX_4V, VEX_LIG;
+  defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>,
+              basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>,
+                VEX_4V, VEX_LIG;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+  defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
+
+  let isCommutable = 0 in {
+    defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+               basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+    defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+               basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+    defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+               basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
+    defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+               basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
+  }
+}
+
+let isCodeGenOnly = 1 in {
+  defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
+       VEX_4V, VEX_LIG;
+  defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>,
+       VEX_4V, VEX_LIG;
+  let Constraints = "$src1 = $dst" in {
+    defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
+    defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
+  }
+}
+
+/// Unop Arithmetic
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+
+let Sched = WriteFSqrt in {
+def SSE_SQRTP : OpndItins<
+  IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM
+>;
+
+def SSE_SQRTS : OpndItins<
+  IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM
+>;
+}
+
+let Sched = WriteFRcp in {
+def SSE_RCPP : OpndItins<
+  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
+>;
+
+def SSE_RCPS : OpndItins<
+  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
+>;
+}
+
+/// sse1_fp_unop_s - SSE1 unops in scalar form.
+multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
+                          SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
+                      (ins FR32:$src1, FR32:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+  let mayLoad = 1 in {
+  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+                      (ins FR32:$src1,f32mem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, ssmem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
+  // For scalar unary operations, fold a load into the operation
+  // only in OptForSize mode. It eliminates an instruction, but it also
+  // eliminates a whole-register clobber (the load), so it introduces a
+  // partial register update condition.
+  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
+  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>,
+                Sched<[itins.Sched]>;
+  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
+                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>,
+                Sched<[itins.Sched.Folded]>;
+}
+
+/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand.
+multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
+                       (ins FR32:$src1, FR32:$src2),
+                       !strconcat("v", OpcodeStr,
+                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+  let mayLoad = 1 in {
+  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+                      (ins FR32:$src1,f32mem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, ssmem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
+  // For scalar unary operations, fold a load into the operation
+  // only in OptForSize mode. It eliminates an instruction, but it also
+  // eliminates a whole-register clobber (the load), so it introduces a
+  // partial register update condition.
+  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
+  let Constraints = "$src1 = $dst" in {
+    def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src2),
+                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                      [], itins.rr>, Sched<[itins.Sched]>;
+    let mayLoad = 1, hasSideEffects = 0 in
+    def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, ssmem:$src2),
+                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                      [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+/// sse1_fp_unop_p - SSE1 unops in packed form.
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          OpndItins itins> {
+let Predicates = [HasAVX] in {
+  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "ps\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
+                       itins.rr>, VEX, Sched<[itins.Sched]>;
+  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "ps\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))],
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
+            Sched<[itins.Sched]>;
+  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
+            Sched<[itins.Sched.Folded]>;
+}
+
+/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
+multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic V4F32Int, Intrinsic V8F32Int,
+                              OpndItins itins> {
+let Predicates = [HasAVX] in {
+  def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                           !strconcat("v", OpcodeStr,
+                                      "ps\t{$src, $dst|$dst, $src}"),
+                           [(set VR128:$dst, (V4F32Int VR128:$src))],
+                           itins.rr>, VEX, Sched<[itins.Sched]>;
+  def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                          !strconcat("v", OpcodeStr,
+                          "ps\t{$src, $dst|$dst, $src}"),
+                          [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+                          itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+  def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                            !strconcat("v", OpcodeStr,
+                                       "ps\t{$src, $dst|$dst, $src}"),
+                            [(set VR256:$dst, (V8F32Int VR256:$src))],
+                            itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+  def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
+                          (ins f256mem:$src),
+                          !strconcat("v", OpcodeStr,
+                                    "ps\t{$src, $dst|$dst, $src}"),
+                          [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))],
+                          itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int VR128:$src))],
+                    itins.rr>, Sched<[itins.Sched]>;
+  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+                    itins.rm>, Sched<[itins.Sched.Folded]>;
+}
+
+/// sse2_fp_unop_s - SSE2 unops in scalar form.
+multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
+                          SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+  def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst),
+                      (ins FR64:$src1, FR64:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
+  let mayLoad = 1 in {
+  def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+                      (ins FR64:$src1,f64mem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, sdmem:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, VEX_4V, VEX_LIG,
+                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
+  }
+}
+
+  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>,
+            Sched<[itins.Sched]>;
+  // See the comments in sse1_fp_unop_s for why this is OptForSize.
+  def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
+            Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
+  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
+                Sched<[itins.Sched]>;
+  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
+                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
+                Sched<[itins.Sched.Folded]>;
+}
+
+/// sse2_fp_unop_p - SSE2 unops in vector forms.
+multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
+                          SDNode OpNode, OpndItins itins> {
+let Predicates = [HasAVX] in {
+  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "pd\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
+                       itins.rr>, VEX, Sched<[itins.Sched]>;
+  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       !strconcat("v", OpcodeStr,
+                                  "pd\t{$src, $dst|$dst, $src}"),
+                       [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))],
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                        !strconcat("v", OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+}
+
+  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
+            Sched<[itins.Sched]>;
+  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
+            Sched<[itins.Sched.Folded]>;
+}
+
+// Square root.
+defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
+                            SSE_SQRTS>,
+             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>,
+             sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd,
+                            SSE_SQRTS>,
+             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
+             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>,
+             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
+                                int_x86_avx_rsqrt_ps_256, SSE_SQRTP>;
+defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
+             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
+             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
+                                int_x86_avx_rcp_ps_256, SSE_RCPP>;
+
+def : Pat<(f32 (fsqrt FR32:$src)),
+          (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (fsqrt (load addr:$src))),
+          (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+          Requires<[HasAVX, OptForSize]>;
+def : Pat<(f64 (fsqrt FR64:$src)),
+          (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
+def : Pat<(f64 (fsqrt (load addr:$src))),
+          (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
+          Requires<[HasAVX, OptForSize]>;
+
+def : Pat<(f32 (X86frsqrt FR32:$src)),
+          (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (X86frsqrt (load addr:$src))),
+          (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+          Requires<[HasAVX, OptForSize]>;
+
+def : Pat<(f32 (X86frcp FR32:$src)),
+          (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (X86frcp (load addr:$src))),
+          (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+          Requires<[HasAVX, OptForSize]>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
+            (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128:$src, FR32)),
+                              VR128)>;
+  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
+            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
+            (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128:$src, FR64)),
+                              VR128)>;
+  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
+            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+
+  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+            (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
+                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
+                              VR128)>;
+  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
+            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+            (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
+                                       (COPY_TO_REGCLASS VR128:$src, FR32)),
+                              VR128)>;
+  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
+            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+}
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+let Predicates = [UseSSE1] in {
+  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+            (RSQRTSSr_Int VR128:$src, VR128:$src)>;
+  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+            (RCPSSr_Int VR128:$src, VR128:$src)>;
+}
+
+// There is no f64 version of the reciprocal approximation instructions.
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Non-temporal stores
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 400 in { // Prefer non-temporal versions
+let SchedRW = [WriteStore] in {
+def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
+                     (ins f128mem:$dst, VR128:$src),
+                     "movntps\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v4f32 VR128:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX;
+def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
+                     (ins f128mem:$dst, VR128:$src),
+                     "movntpd\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v2f64 VR128:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX;
+
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
+                         (ins f128mem:$dst, VR128:$src),
+                         "movntdq\t{$src, $dst|$dst, $src}",
+                         [(alignednontemporalstore (v2i64 VR128:$src),
+                                                   addr:$dst)],
+                                                   IIC_SSE_MOVNT>, VEX;
+
+def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
+                     (ins f256mem:$dst, VR256:$src),
+                     "movntps\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v8f32 VR256:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
+                     (ins f256mem:$dst, VR256:$src),
+                     "movntpd\t{$src, $dst|$dst, $src}",
+                     [(alignednontemporalstore (v4f64 VR256:$src),
+                                               addr:$dst)],
+                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+let ExeDomain = SSEPackedInt in
+def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
+                    (ins f256mem:$dst, VR256:$src),
+                    "movntdq\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v4i64 VR256:$src),
+                                              addr:$dst)],
+                                              IIC_SSE_MOVNT>, VEX, VEX_L;
+
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntps\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
+                    IIC_SSE_MOVNT>;
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntpd\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
+                    IIC_SSE_MOVNT>;
+
+let ExeDomain = SSEPackedInt in
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntdq\t{$src, $dst|$dst, $src}",
+                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
+                    IIC_SSE_MOVNT>;
+
+// There is no AVX form for instructions below this point
+def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                 "movnti{l}\t{$src, $dst|$dst, $src}",
+                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
+                 IIC_SSE_MOVNT>,
+               TB, Requires<[HasSSE2]>;
+def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                     "movnti{q}\t{$src, $dst|$dst, $src}",
+                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
+                     IIC_SSE_MOVNT>,
+                  TB, Requires<[HasSSE2]>;
+} // SchedRW = [WriteStore]
+
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+          (VMOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasAVX]>;
+
+def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
+          (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Prefetch and memory fence
+//===----------------------------------------------------------------------===//
+
+// Prefetch intrinsic.
+let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
+def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
+    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
+    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
+    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
+    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
+    IIC_SSE_PREFETCH>, TB;
+}
+
+// FIXME: How should these memory instructions be modeled?
+let SchedRW = [WriteLoad] in {
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
+               IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;
+
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", [], IIC_SSE_PAUSE>, REP;
+
+// Load, store, and memory fence
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
+               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
+               TB, Requires<[HasSSE1]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
+               TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
+               TB, Requires<[HasSSE2]>;
+} // SchedRW
+
+def : Pat<(X86SFence), (SFENCE)>;
+def : Pat<(X86LFence), (LFENCE)>;
+def : Pat<(X86MFence), (MFENCE)>;
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Load/Store XCSR register
+//===----------------------------------------------------------------------===//
+
+def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
+def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
+
+def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+                  IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>;
+def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+                  IIC_SSE_STMXCSR>, Sched<[WriteStore]>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
+def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+                    VEX;
+def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
+                    VEX, VEX_L;
+def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+                    VEX;
+def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
+                    VEX, VEX_L;
+}
+
+// For Disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
+def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movdqa\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVA_P_RR>,
+                        VEX;
+def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+                        "movdqa\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movdqu\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVU_P_RR>,
+                        VEX;
+def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+                        "movdqu\t{$src, $dst|$dst, $src}", [],
+                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+}
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+                   VEX;
+def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
+                   VEX, VEX_L;
+let Predicates = [HasAVX] in {
+  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+                    XS, VEX;
+  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+                    XS, VEX, VEX_L;
+}
+}
+
+let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
+def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
+                     (ins i128mem:$dst, VR128:$src),
+                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+                     VEX;
+def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
+                     (ins i256mem:$dst, VR256:$src),
+                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
+                     VEX, VEX_L;
+let Predicates = [HasAVX] in {
+def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+                  XS, VEX;
+def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
+                  XS, VEX, VEX_L;
+}
+}
+
+let SchedRW = [WriteMove] in {
+let neverHasSideEffects = 1 in
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+
+// For Disassembler
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
+def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                       "movdqa\t{$src, $dst|$dst, $src}", [],
+                       IIC_SSE_MOVA_P_RR>;
+
+def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+                       "movdqu\t{$src, $dst|$dst, $src}",
+                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+}
+} // SchedRW
+
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}",
+                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
+                   IIC_SSE_MOVA_P_RM>;
+def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
+                   IIC_SSE_MOVU_P_RM>,
+                 XS, Requires<[UseSSE2]>;
+}
+
+let mayStore = 1, SchedRW = [WriteStore] in {
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}",
+                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
+                   IIC_SSE_MOVA_P_MR>;
+def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
+                   IIC_SSE_MOVU_P_MR>,
+                 XS, Requires<[UseSSE2]>;
+}
+
+} // ExeDomain = SSEPackedInt
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
+            (VMOVDQUmr addr:$dst, VR128:$src)>;
+  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
+            (VMOVDQUYmr addr:$dst, VR256:$src)>;
+}
+let Predicates = [UseSSE2] in
+def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
+          (MOVDQUmr addr:$dst, VR128:$src)>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Arithmetic Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecIMul in
+def SSE_PMADD : OpndItins<
+  IIC_SSE_PMADD, IIC_SSE_PMADD
+>;
+
+let ExeDomain = SSEPackedInt in { // SSE integer instructions
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                            RegisterClass RC, PatFrag memop_frag,
+                            X86MemOperand x86memop,
+                            OpndItins itins,
+                            bit IsCommutable = 0,
+                            bit Is2Addr = 1> {
+  let isCommutable = IsCommutable in
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
+      Sched<[itins.Sched]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
+       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+                             Intrinsic IntId256, OpndItins itins,
+                             bit IsCommutable = 0> {
+let Predicates = [HasAVX] in
+  defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
+                                 VR128, memopv2i64, i128mem, itins,
+                                 IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+  defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
+                               i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2] in
+  defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
+                                   VR256, memopv4i64, i256mem, itins,
+                                   IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
+multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
+                         string OpcodeStr, SDNode OpNode,
+                         SDNode OpNode2, RegisterClass RC,
+                         ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
+                         ShiftOpndItins itins,
+                         bit Is2Addr = 1> {
+  // src2 is always 128-bit
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, VR128:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
+        itins.rr>, Sched<[WriteVecShift]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, i128mem:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode RC:$src1,
+                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
+      Sched<[WriteVecShiftLd, ReadAfterLd]>;
+  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
+       (ins RC:$src1, i32i8imm:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>,
+       Sched<[WriteVecShift]>;
+}
+
+/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
+multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                         PatFrag memop_frag, X86MemOperand x86memop,
+                         OpndItins itins,
+                         bit IsCommutable = 0, bit Is2Addr = 1> {
+  let isCommutable = IsCommutable in
+  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+       Sched<[itins.Sched]>;
+  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+} // ExeDomain = SSEPackedInt
+
+defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
+                             SSE_INTALUQ_ITINS_P, 1>;
+defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1>;
+defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
+                             SSE_INTALUQ_ITINS_P, 0>;
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PMINUB  : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PMINSW  : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1>;
+
+// Intrinsic forms
+defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
+                                 int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
+defm PSUBSW  : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
+                                 int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
+defm PADDSB  : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
+                                 int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDSW  : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
+                                 int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
+                                 int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
+                                 int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
+defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
+                                 int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>;
+defm PMULHW  : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
+                                 int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>;
+defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
+                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
+defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
+                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
+defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
+                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
+defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
+                                 int_x86_avx2_psad_bw, SSE_INTALU_ITINS_P, 1>;
+
+let Predicates = [HasAVX] in
+defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
+                              memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+                              VEX_4V;
+let Predicates = [HasAVX2] in
+defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
+                               VR256, memopv4i64, i256mem,
+                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst" in
+defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
+                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Logical Instructions
+//===---------------------------------------------------------------------===//
+
+let Predicates = [HasAVX] in {
+defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+                            VR128, v8i16, v8i16, bc_v8i16,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+                            VR128, v4i32, v4i32, bc_v4i32,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+                            VR128, v2i64, v2i64, bc_v2i64,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+                            VR128, v8i16, v8i16, bc_v8i16,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+                            VR128, v4i32, v4i32, bc_v4i32,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+                            VR128, v2i64, v2i64, bc_v2i64,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+                            VR128, v8i16, v8i16, bc_v8i16,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+                            VR128, v4i32, v4i32, bc_v4i32,
+                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+  // 128-bit logical shifts.
+  def VPSLLDQri : PDIi8<0x73, MRM7r,
+                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
+                    VEX_4V;
+  def VPSRLDQri : PDIi8<0x73, MRM3r,
+                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst,
+                      (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
+                    VEX_4V;
+  // PSRADQri doesn't exist in SSE[1-3].
+}
+} // Predicates = [HasAVX]
+
+let Predicates = [HasAVX2] in {
+defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
+                             VR256, v16i16, v8i16, bc_v8i16,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
+                             VR256, v8i32, v4i32, bc_v4i32,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
+                             VR256, v4i64, v2i64, bc_v2i64,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
+                             VR256, v16i16, v8i16, bc_v8i16,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
+                             VR256, v8i32, v4i32, bc_v4i32,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
+                             VR256, v4i64, v2i64, bc_v2i64,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
+                             VR256, v16i16, v8i16, bc_v8i16,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
+                             VR256, v8i32, v4i32, bc_v4i32,
+                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+  // 256-bit logical shifts.
+  def VPSLLDQYri : PDIi8<0x73, MRM7r,
+                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR256:$dst,
+                      (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
+                    VEX_4V, VEX_L;
+  def VPSRLDQYri : PDIi8<0x73, MRM3r,
+                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
+                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR256:$dst,
+                      (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
+                    VEX_4V, VEX_L;
+  // PSRADQYri doesn't exist in SSE[1-3].
+}
+} // Predicates = [HasAVX2]
+
+let Constraints = "$src1 = $dst" in {
+defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
+                           VR128, v8i16, v8i16, bc_v8i16,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
+                           VR128, v4i32, v4i32, bc_v4i32,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
+                           VR128, v2i64, v2i64, bc_v2i64,
+                           SSE_INTSHIFT_ITINS_P>;
+
+defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
+                           VR128, v8i16, v8i16, bc_v8i16,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
+                           VR128, v4i32, v4i32, bc_v4i32,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
+                           VR128, v2i64, v2i64, bc_v2i64,
+                           SSE_INTSHIFT_ITINS_P>;
+
+defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
+                           VR128, v8i16, v8i16, bc_v8i16,
+                           SSE_INTSHIFT_ITINS_P>;
+defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
+                           VR128, v4i32, v4i32, bc_v4i32,
+                           SSE_INTSHIFT_ITINS_P>;
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
+  // 128-bit logical shifts.
+  def PSLLDQri : PDIi8<0x73, MRM7r,
+                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                       "pslldq\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>;
+  def PSRLDQri : PDIi8<0x73, MRM3r,
+                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                       "psrldq\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>;
+  // PSRADQri doesn't exist in SSE[1-3].
+}
+} // Constraints = "$src1 = $dst"
+
+let Predicates = [HasAVX] in {
+  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+            (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+
+  // Shift up / down and insert zero's.
+  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
+            (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
+            (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+}
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
+            (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
+            (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+            (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
+
+  // Shift up / down and insert zero's.
+  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
+            (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
+            (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Comparison Instructions
+//===---------------------------------------------------------------------===//
+
+defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 1>;
+defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+                             SSE_INTALU_ITINS_P, 0>;
+defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+                             SSE_INTALU_ITINS_P, 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Pack Instructions
+//===---------------------------------------------------------------------===//
+
+defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
+                                  int_x86_avx2_packsswb, SSE_INTALU_ITINS_P, 0>;
+defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
+                                  int_x86_avx2_packssdw, SSE_INTALU_ITINS_P, 0>;
+defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
+                                  int_x86_avx2_packuswb, SSE_INTALU_ITINS_P, 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Shuffle Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
+                         SDNode OpNode> {
+let Predicates = [HasAVX] in {
+  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, i8imm:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [(set VR128:$dst,
+                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+                      IIC_SSE_PSHUF>, VEX, Sched<[WriteShuffle]>;
+  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
+                      (ins i128mem:$src1, i8imm:$src2),
+                      !strconcat("v", OpcodeStr,
+                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set VR128:$dst,
+                       (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+                        (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX,
+                  Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX2] in {
+  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
+                       (ins VR256:$src1, i8imm:$src2),
+                       !strconcat("v", OpcodeStr,
+                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                       [(set VR256:$dst,
+                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
+                       IIC_SSE_PSHUF>, VEX, VEX_L, Sched<[WriteShuffle]>;
+  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
+                       (ins i256mem:$src1, i8imm:$src2),
+                       !strconcat("v", OpcodeStr,
+                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [(set VR256:$dst,
+                        (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)),
+                         (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L,
+                   Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [UseSSE2] in {
+  def ri : Ii8<0x70, MRMSrcReg,
+               (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR128:$dst,
+                  (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+                IIC_SSE_PSHUF>, Sched<[WriteShuffle]>;
+  def mi : Ii8<0x70, MRMSrcMem,
+               (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                [(set VR128:$dst,
+                  (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+                          (i8 imm:$src2))))], IIC_SSE_PSHUF>,
+           Sched<[WriteShuffleLd]>;
+}
+}
+} // ExeDomain = SSEPackedInt
+
+defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, TB, OpSize;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+            (VPSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (VPSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+            (PSHUFDmi addr:$src1, imm:$imm)>;
+  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+            (PSHUFDri VR128:$src1, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Unpack Instructions
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
+                       SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
+  def rr : PDI<opc, MRMSrcReg,
+      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+      !if(Is2Addr,
+          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
+      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
+  def rm : PDI<opc, MRMSrcMem,
+      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+      !if(Is2Addr,
+          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
+          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set VR128:$dst, (OpNode VR128:$src1,
+                                  (bc_frag (memopv2i64
+                                               addr:$src2))))],
+                                               IIC_SSE_UNPCK>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
+                         SDNode OpNode, PatFrag bc_frag> {
+  def Yrr : PDI<opc, MRMSrcReg,
+      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
+      Sched<[WriteShuffle]>;
+  def Yrm : PDI<opc, MRMSrcMem,
+      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+      [(set VR256:$dst, (OpNode VR256:$src1,
+                                  (bc_frag (memopv4i64 addr:$src2))))]>,
+      Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
+                                 bc_v16i8, 0>, VEX_4V;
+  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
+                                 bc_v8i16, 0>, VEX_4V;
+  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
+                                 bc_v4i32, 0>, VEX_4V;
+  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
+                                 bc_v2i64, 0>, VEX_4V;
+
+  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
+                                 bc_v16i8, 0>, VEX_4V;
+  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
+                                 bc_v8i16, 0>, VEX_4V;
+  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
+                                 bc_v4i32, 0>, VEX_4V;
+  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
+                                 bc_v2i64, 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
+                                   bc_v32i8>, VEX_4V, VEX_L;
+  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
+                                   bc_v16i16>, VEX_4V, VEX_L;
+  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
+                                   bc_v8i32>, VEX_4V, VEX_L;
+  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
+                                   bc_v4i64>, VEX_4V, VEX_L;
+
+  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
+                                   bc_v32i8>, VEX_4V, VEX_L;
+  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
+                                   bc_v16i16>, VEX_4V, VEX_L;
+  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
+                                   bc_v8i32>, VEX_4V, VEX_L;
+  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
+                                   bc_v4i64>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
+                                bc_v16i8>;
+  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
+                                bc_v8i16>;
+  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
+                                bc_v4i32>;
+  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
+                                bc_v2i64>;
+
+  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
+                                bc_v16i8>;
+  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
+                                bc_v8i16>;
+  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
+                                bc_v4i32>;
+  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
+                                bc_v2i64>;
+}
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Integer Extract and Insert
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt in {
+multiclass sse2_pinsrw<bit Is2Addr = 1> {
+  def rri : Ii8<0xC4, MRMSrcReg,
+       (outs VR128:$dst), (ins VR128:$src1,
+        GR32:$src2, i32i8imm:$src3),
+       !if(Is2Addr,
+           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+       [(set VR128:$dst,
+         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>,
+       Sched<[WriteShuffle]>;
+  def rmi : Ii8<0xC4, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1,
+                        i16mem:$src2, i32i8imm:$src3),
+       !if(Is2Addr,
+           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+       [(set VR128:$dst,
+         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+                    imm:$src3))], IIC_SSE_PINSRW>,
+       Sched<[WriteShuffleLd, ReadAfterLd]>;
+}
+
+// Extract
+let Predicates = [HasAVX] in
+def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
+                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                                imm:$src2))]>, TB, OpSize, VEX,
+                Sched<[WriteShuffle]>;
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                                imm:$src2))], IIC_SSE_PEXTRW>,
+               Sched<[WriteShuffleLd, ReadAfterLd]>;
+
+// Insert
+let Predicates = [HasAVX] in {
+  defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
+  def  VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
+       (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+       "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+       []>, TB, OpSize, VEX_4V, Sched<[WriteShuffle]>;
+}
+
+let Constraints = "$src1 = $dst" in
+  defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Packed Mask Creation
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
+
+def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+           IIC_SSE_MOVMSK>, VEX;
+def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX;
+
+let Predicates = [HasAVX2] in {
+def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L;
+def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
+}
+
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+           IIC_SSE_MOVMSK>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Conditional Store
+//===---------------------------------------------------------------------===//
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+
+let Uses = [EDI] in
+def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
+           (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+           IIC_SSE_MASKMOV>, VEX;
+let Uses = [RDI] in
+def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
+           (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+           IIC_SSE_MASKMOV>, VEX;
+
+let Uses = [EDI] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
+           IIC_SSE_MASKMOV>;
+let Uses = [RDI] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+           "maskmovdqu\t{$mask, $src|$src, $mask}",
+           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
+           IIC_SSE_MASKMOV>;
+
+} // ExeDomain = SSEPackedInt
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Doubleword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Packed Double Int
+//
+def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                        VEX, Sched<[WriteMove]>;
+def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>,
+                      VEX, Sched<[WriteLoad]>;
+def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                  Sched<[WriteMove]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+//===---------------------------------------------------------------------===//
+// Move Int Doubleword to Single Scalar
+//
+def VMOVDI2SSrr  : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert GR32:$src))],
+                      IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+def VMOVDI2SSrm  : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                      IIC_SSE_MOVDQ>,
+                      VEX, Sched<[WriteLoad]>;
+def MOVDI2SSrr  : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert GR32:$src))],
+                      IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                      IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int to Packed Double Int
+//
+def VMOVPDI2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
+                    Sched<[WriteMove]>;
+def VMOVPDI2DImr  : VPDI<0x7E, MRMDestMem, (outs),
+                       (ins i32mem:$dst, VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128:$src),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+                                     VEX, Sched<[WriteLoad]>;
+def MOVPDI2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+                   Sched<[WriteMove]>;
+def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128:$src),
+                                     (iPTR 0))), addr:$dst)],
+                                     IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+
+//===---------------------------------------------------------------------===//
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+let SchedRW = [WriteMove] in {
+def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                          "vmov{d|q}\t{$src, $dst|$dst, $src}",
+                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+                                                           (iPTR 0)))],
+                                                           IIC_SSE_MOVD_ToGP>,
+                      TB, OpSize, VEX, VEX_W, Requires<[HasAVX, In64BitMode]>;
+
+def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+                                                         (iPTR 0)))],
+                                                         IIC_SSE_MOVD_ToGP>;
+} //SchedRW
+
+//===---------------------------------------------------------------------===//
+// Bitcast FR64 <-> GR64
+//
+let Predicates = [HasAVX] in
+def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                        "vmovq\t{$src, $dst|$dst, $src}",
+                        [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+                        VEX, Sched<[WriteLoad]>;
+def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                         "mov{d|q}\t{$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (bitconvert FR64:$src))],
+                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                         "movq\t{$src, $dst|$dst, $src}",
+                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+
+def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                       "movq\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+                       IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set GR64:$dst, (bitconvert FR64:$src))],
+                       IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                       "movq\t{$src, $dst|$dst, $src}",
+                       [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                       IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+//===---------------------------------------------------------------------===//
+// Move Scalar Single to Double Int
+//
+def VMOVSS2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (bitconvert FR32:$src))],
+                      IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+def VMOVSS2DImr  : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                      IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+def MOVSS2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (bitconvert FR32:$src))],
+                      IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+def MOVSS2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+//===---------------------------------------------------------------------===//
+// Patterns and instructions to describe movd/movq to XMM register zero-extends
+//
+let SchedRW = [WriteMove] in {
+let AddedComplexity = 15 in {
+def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (v4i32 (X86vzmovl
+                                      (v4i32 (scalar_to_vector GR32:$src)))))],
+                                      IIC_SSE_MOVDQ>, VEX;
+def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
+                       [(set VR128:$dst, (v2i64 (X86vzmovl
+                                      (v2i64 (scalar_to_vector GR64:$src)))))],
+                                      IIC_SSE_MOVDQ>,
+                                      VEX, VEX_W;
+}
+let AddedComplexity = 15 in {
+def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (v4i32 (X86vzmovl
+                                      (v4i32 (scalar_to_vector GR32:$src)))))],
+                                      IIC_SSE_MOVDQ>;
+def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
+                       [(set VR128:$dst, (v2i64 (X86vzmovl
+                                      (v2i64 (scalar_to_vector GR64:$src)))))],
+                                      IIC_SSE_MOVDQ>;
+}
+} // SchedRW
+
+let AddedComplexity = 20, SchedRW = [WriteLoad] in {
+def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
+                                                   (loadi32 addr:$src))))))],
+                                                   IIC_SSE_MOVDQ>, VEX;
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
+                                                   (loadi32 addr:$src))))))],
+                                                   IIC_SSE_MOVDQ>;
+} // AddedComplexity, SchedRW
+
+let Predicates = [HasAVX] in {
+  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (VMOVZDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (VMOVZDI2PDIrm addr:$src)>;
+  }
+  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
+}
+
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+            (MOVZDI2PDIrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+            (MOVZDI2PDIrm addr:$src)>;
+}
+
+// These are the correct encodings of the instructions so that we know how to
+// read correct assembly, even though we continue to emit the wrong ones for
+// compatibility with Darwin's buggy assembler.
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MOV64toSDrr FR64:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MOVSDto64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+                (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 - Move Quadword
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Move Quadword Int to Packed Quadword Int
+//
+
+let SchedRW = [WriteLoad] in {
+def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                    "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+                    VEX, Requires<[HasAVX]>;
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                    "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
+                      IIC_SSE_MOVDQ>, XS,
+                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
+} // SchedRW
+
+//===---------------------------------------------------------------------===//
+// Move Packed Quadword Int to Quadword Int
+//
+let SchedRW = [WriteStore] in {
+def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}",
+                      [(store (i64 (vector_extract (v2i64 VR128:$src),
+                                    (iPTR 0))), addr:$dst)],
+                                    IIC_SSE_MOVDQ>, VEX;
+def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}",
+                      [(store (i64 (vector_extract (v2i64 VR128:$src),
+                                    (iPTR 0))), addr:$dst)],
+                                    IIC_SSE_MOVDQ>;
+} // SchedRW
+
+//===---------------------------------------------------------------------===//
+// Store / copy lower 64-bits of a XMM register.
+//
+def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                     "movq\t{$src, $dst|$dst, $src}",
+                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX,
+                  Sched<[WriteStore]>;
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                     "movq\t{$src, $dst|$dst, $src}",
+                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
+                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+
+let AddedComplexity = 20 in
+def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                     "vmovq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+                                                 (loadi64 addr:$src))))))],
+                                                 IIC_SSE_MOVDQ>,
+                     XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>;
+
+let AddedComplexity = 20 in
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                     "movq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+                                                 (loadi64 addr:$src))))))],
+                                                 IIC_SSE_MOVDQ>,
+                     XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
+
+let Predicates = [HasAVX], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (VMOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (VMOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)),
+            (VMOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
+  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (MOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (MOVZQI2PQIrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i64 (alignedX86vzload addr:$src)),
+          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
+def : Pat<(v4i64 (X86vzload addr:$src)),
+          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
+// IA32 document. movq xmm1, xmm2 does clear the high bits.
+//
+let SchedRW = [WriteVecLogic] in {
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+                    IIC_SSE_MOVQ_RR>,
+                      XS, VEX, Requires<[HasAVX]>;
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
+                    IIC_SSE_MOVQ_RR>,
+                      XS, Requires<[UseSSE2]>;
+} // SchedRW
+
+let SchedRW = [WriteVecLogicLd] in {
+let AddedComplexity = 20 in
+def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                        "vmovq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl
+                                             (loadv2i64 addr:$src))))],
+                                             IIC_SSE_MOVDQ>,
+                      XS, VEX, Requires<[HasAVX]>;
+let AddedComplexity = 20 in {
+def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl
+                                             (loadv2i64 addr:$src))))],
+                                             IIC_SSE_MOVDQ>,
+                      XS, Requires<[UseSSE2]>;
+}
+} // SchedRW
+
+let AddedComplexity = 20 in {
+  let Predicates = [HasAVX] in {
+    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+              (VMOVZPQILo2PQIrm addr:$src)>;
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+              (VMOVZPQILo2PQIrr VR128:$src)>;
+  }
+  let Predicates = [UseSSE2] in {
+    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+              (MOVZPQILo2PQIrm addr:$src)>;
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+              (MOVZPQILo2PQIrr VR128:$src)>;
+  }
+}
+
+// Instructions to match in the assembler
+let SchedRW = [WriteMove] in {
+def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                      "movq\t{$src, $dst|$dst, $src}", [],
+                      IIC_SSE_MOVDQ>, VEX, VEX_W;
+def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}", [],
+                      IIC_SSE_MOVDQ>, VEX, VEX_W;
+// Recognize "movd" with GR64 destination, but encode as a "movq"
+def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                          "movd\t{$src, $dst|$dst, $src}", [],
+                          IIC_SSE_MOVDQ>, VEX, VEX_W;
+} // SchedRW
+
+// Instructions for the disassembler
+// xr = XMM register
+// xm = mem64
+
+let SchedRW = [WriteMove] in {
+let Predicates = [HasAVX] in
+def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
+def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
+} // SchedRW
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
+                              X86MemOperand x86memop> {
+def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (vt (OpNode RC:$src)))],
+                      IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
+                      IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
+  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
+  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+}
+defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
+                                   memopv4f32, f128mem>;
+defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
+                                   memopv4f32, f128mem>;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+            (VMOVSHDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (VMOVSHDUPrm addr:$src)>;
+  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+            (VMOVSLDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (VMOVSLDUPrm addr:$src)>;
+  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
+            (VMOVSHDUPYrr VR256:$src)>;
+  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
+            (VMOVSHDUPYrm addr:$src)>;
+  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
+            (VMOVSLDUPYrr VR256:$src)>;
+  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
+            (VMOVSLDUPYrm addr:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+            (MOVSHDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (MOVSHDUPrm addr:$src)>;
+  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+            (MOVSLDUPrr VR128:$src)>;
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+            (MOVSLDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Double FP - MOVDDUP
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_replicate_dfp<string OpcodeStr> {
+let neverHasSideEffects = 1 in
+def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [], IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
+def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst,
+                      (v2f64 (X86Movddup
+                              (scalar_to_vector (loadf64 addr:$src)))))],
+                              IIC_SSE_MOV_LH>, Sched<[WriteShuffleLd]>;
+}
+
+// FIXME: Merge with above classe when there're patterns for the ymm version
+multiclass sse3_replicate_dfp_y<string OpcodeStr> {
+def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
+                    Sched<[WriteShuffle]>;
+def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst,
+                      (v4f64 (X86Movddup
+                              (scalar_to_vector (loadf64 addr:$src)))))]>,
+                    Sched<[WriteShuffleLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
+  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
+}
+
+defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+
+let Predicates = [HasAVX] in {
+  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+  def : Pat<(X86Movddup (bc_v2f64
+                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+  // 256-bit version
+  def : Pat<(X86Movddup (memopv4f64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (memopv4i64 addr:$src)),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
+            (VMOVDDUPYrm addr:$src)>;
+  def : Pat<(X86Movddup (v4i64 VR256:$src)),
+            (VMOVDDUPYrr VR256:$src)>;
+}
+
+let Predicates = [UseSSE3] in {
+  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+            (MOVDDUPrm addr:$src)>;
+  def : Pat<(X86Movddup (bc_v2f64
+                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+            (MOVDDUPrm addr:$src)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX] in {
+  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "vlddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                   "vlddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+                   VEX, VEX_L;
+}
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "lddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
+                   IIC_SSE_LDDQU>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 - Arithmetic
+//===---------------------------------------------------------------------===//
+
+multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop, OpndItins itins,
+                       bit Is2Addr = 1> {
+  def rr : I<0xD0, MRMSrcReg,
+       (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
+       Sched<[itins.Sched]>;
+  def rm : I<0xD0, MRMSrcMem,
+       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rr>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  let ExeDomain = SSEPackedSingle in {
+    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
+                                 f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
+    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
+                               f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L;
+  }
+  let ExeDomain = SSEPackedDouble in {
+    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
+                                 f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
+    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
+                           f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L;
+  }
+}
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
+  let ExeDomain = SSEPackedSingle in
+  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
+                              f128mem, SSE_ALU_F32P>, TB, XD;
+  let ExeDomain = SSEPackedDouble in
+  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
+                              f128mem, SSE_ALU_F64P>, TB, OpSize;
+}
+
+//===---------------------------------------------------------------------===//
+// SSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+// Horizontal ops
+multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
+  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+      Sched<[WriteFAdd]>;
+
+  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
+                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
+  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
+      Sched<[WriteFAdd]>;
+
+  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
+        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX] in {
+  let ExeDomain = SSEPackedSingle in {
+    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
+                            X86fhadd, 0>, VEX_4V;
+    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
+                            X86fhsub, 0>, VEX_4V;
+    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
+                            X86fhadd, 0>, VEX_4V, VEX_L;
+    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
+                            X86fhsub, 0>, VEX_4V, VEX_L;
+  }
+  let ExeDomain = SSEPackedDouble in {
+    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
+                            X86fhadd, 0>, VEX_4V;
+    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
+                            X86fhsub, 0>, VEX_4V;
+    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
+                            X86fhadd, 0>, VEX_4V, VEX_L;
+    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
+                            X86fhsub, 0>, VEX_4V, VEX_L;
+  }
+}
+
+let Constraints = "$src1 = $dst" in {
+  let ExeDomain = SSEPackedSingle in {
+    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
+    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
+  }
+  let ExeDomain = SSEPackedDouble in {
+    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
+    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
+  }
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Absolute Instructions
+//===---------------------------------------------------------------------===//
+
+
+/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
+                            Intrinsic IntId128> {
+  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
+                    OpSize, Sched<[WriteVecALU]>;
+
+  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins i128mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst,
+                      (IntId128
+                       (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
+                    OpSize, Sched<[WriteVecALULd]>;
+}
+
+/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
+                              Intrinsic IntId256> {
+  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+                    (ins VR256:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
+                    OpSize, Sched<[WriteVecALU]>;
+
+  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+                    (ins i256mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst,
+                      (IntId256
+                       (bitconvert (memopv4i64 addr:$src))))]>, OpSize,
+                    Sched<[WriteVecALULd]>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb",
+                                  int_x86_ssse3_pabs_b_128>, VEX;
+  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw",
+                                  int_x86_ssse3_pabs_w_128>, VEX;
+  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd",
+                                  int_x86_ssse3_pabs_d_128>, VEX;
+}
+
+let Predicates = [HasAVX2] in {
+  defm VPABSB  : SS3I_unop_rm_int_y<0x1C, "vpabsb",
+                                    int_x86_avx2_pabs_b>, VEX, VEX_L;
+  defm VPABSW  : SS3I_unop_rm_int_y<0x1D, "vpabsw",
+                                    int_x86_avx2_pabs_w>, VEX, VEX_L;
+  defm VPABSD  : SS3I_unop_rm_int_y<0x1E, "vpabsd",
+                                    int_x86_avx2_pabs_d>, VEX, VEX_L;
+}
+
+defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
+                              int_x86_ssse3_pabs_b_128>;
+defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
+                              int_x86_ssse3_pabs_w_128>;
+defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
+                              int_x86_ssse3_pabs_d_128>;
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Binary Operator Instructions
+//===---------------------------------------------------------------------===//
+
+let Sched = WriteVecALU in {
+def SSE_PHADDSUBD : OpndItins<
+  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
+>;
+def SSE_PHADDSUBSW : OpndItins<
+  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
+>;
+def SSE_PHADDSUBW : OpndItins<
+  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
+>;
+}
+let Sched = WriteShuffle in
+def SSE_PSHUFB : OpndItins<
+  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
+>;
+let Sched = WriteVecALU in
+def SSE_PSIGN : OpndItins<
+  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
+>;
+let Sched = WriteVecIMul in
+def SSE_PMULHRSW : OpndItins<
+  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
+>;
+
+/// SS3I_binop_rm - Simple SSSE3 bin op
+multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                         X86MemOperand x86memop, OpndItins itins,
+                         bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+       OpSize, Sched<[itins.Sched]>;
+  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst,
+         (OpVT (OpNode RC:$src1,
+          (bitconvert (memop_frag addr:$src2)))))], itins.rm>, OpSize,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
+                             Intrinsic IntId128, OpndItins itins,
+                             bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+       (ins VR128:$src1, VR128:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+       OpSize, Sched<[itins.Sched]>;
+  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+       (ins VR128:$src1, i128mem:$src2),
+       !if(Is2Addr,
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst,
+         (IntId128 VR128:$src1,
+          (bitconvert (memopv2i64 addr:$src2))))]>, OpSize,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+                               Intrinsic IntId256> {
+  let isCommutable = 1 in
+  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+       (ins VR256:$src1, VR256:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+       OpSize;
+  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+       (ins VR256:$src1, i256mem:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst,
+         (IntId256 VR256:$src1,
+          (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX] in {
+let isCommutable = 0 in {
+  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
+                                  memopv2i64, i128mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V;
+  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
+                                  memopv2i64, i128mem,
+                                  SSE_PHADDSUBD, 0>, VEX_4V;
+  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
+                                  memopv2i64, i128mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V;
+  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
+                                  memopv2i64, i128mem,
+                                  SSE_PHADDSUBD, 0>, VEX_4V;
+  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
+                                  memopv2i64, i128mem,
+                                  SSE_PSIGN, 0>, VEX_4V;
+  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
+                                  memopv2i64, i128mem,
+                                  SSE_PSIGN, 0>, VEX_4V;
+  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
+                                  memopv2i64, i128mem,
+                                  SSE_PSIGN, 0>, VEX_4V;
+  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
+                                  memopv2i64, i128mem,
+                                  SSE_PSHUFB, 0>, VEX_4V;
+  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
+                                      int_x86_ssse3_phadd_sw_128,
+                                      SSE_PHADDSUBSW, 0>, VEX_4V;
+  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
+                                      int_x86_ssse3_phsub_sw_128,
+                                      SSE_PHADDSUBSW, 0>, VEX_4V;
+  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
+                                      int_x86_ssse3_pmadd_ub_sw_128,
+                                      SSE_PMADD, 0>, VEX_4V;
+}
+defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
+                                      int_x86_ssse3_pmul_hr_sw_128,
+                                      SSE_PMULHRSW, 0>, VEX_4V;
+}
+
+let ImmT = NoImm, Predicates = [HasAVX2] in {
+let isCommutable = 0 in {
+  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
+                                  memopv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
+                                  memopv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
+                                  memopv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
+                                  memopv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
+                                  memopv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
+                                  memopv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
+                                  memopv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
+                                  memopv4i64, i256mem,
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
+                                        int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
+  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
+                                        int_x86_avx2_phsub_sw>, VEX_4V, VEX_L;
+  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
+                                       int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L;
+}
+defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
+                                        int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L;
+}
+
+// None of these have i8 immediate fields.
+let ImmT = NoImm, Constraints = "$src1 = $dst" in {
+let isCommutable = 0 in {
+  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
+  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
+  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
+  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
+                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
+  defm PSIGNB    : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
+                                 memopv2i64, i128mem, SSE_PSIGN>;
+  defm PSIGNW    : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
+                                 memopv2i64, i128mem, SSE_PSIGN>;
+  defm PSIGND    : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
+                                 memopv2i64, i128mem, SSE_PSIGN>;
+  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
+                                 memopv2i64, i128mem, SSE_PSHUFB>;
+  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
+                                     int_x86_ssse3_phadd_sw_128,
+                                     SSE_PHADDSUBSW>;
+  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
+                                     int_x86_ssse3_phsub_sw_128,
+                                     SSE_PHADDSUBSW>;
+  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
+                                     int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
+}
+defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
+                                     int_x86_ssse3_pmul_hr_sw_128,
+                                     SSE_PMULHRSW>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Packed Align Instruction Patterns
+//===---------------------------------------------------------------------===//
+
+multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
+  let neverHasSideEffects = 1 in {
+  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffle]>;
+  let mayLoad = 1 in
+  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  }
+}
+
+multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
+  let neverHasSideEffects = 1 in {
+  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
+      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+      !strconcat(asm,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, OpSize, Sched<[WriteShuffle]>;
+  let mayLoad = 1 in
+  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
+      (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
+      !strconcat(asm,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  }
+}
+
+let Predicates = [HasAVX] in
+  defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+let Predicates = [HasAVX2] in
+  defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
+  defm PALIGN : ssse3_palignr<"palignr">;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [UseSSSE3] in {
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// SSSE3 - Thread synchronization
+//===---------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let usesCustomInserter = 1 in {
+def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
+                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
+                Requires<[HasSSE3]>;
+}
+
+let Uses = [EAX, ECX, EDX] in
+def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
+                 TB, Requires<[HasSSE3]>;
+let Uses = [ECX, EAX] in
+def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
+                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
+                TB, Requires<[HasSSE3]>;
+} // SchedRW
+
+def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
+def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;
+
+def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
+      Requires<[In32BitMode]>;
+def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
+      Requires<[In64BitMode]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Move with Sign/Zero Extend
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+       [(set VR128:$dst,
+         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
+       OpSize;
+}
+
+multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId> {
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId (load addr:$src)))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in {
+defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>,
+                                     VEX;
+defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>,
+                                     VEX;
+defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>,
+                                     VEX;
+defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>,
+                                     VEX;
+defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>,
+                                     VEX;
+defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>,
+                                     VEX;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
+                                        int_x86_avx2_pmovsxbw>, VEX, VEX_L;
+defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
+                                        int_x86_avx2_pmovsxwd>, VEX, VEX_L;
+defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
+                                        int_x86_avx2_pmovsxdq>, VEX, VEX_L;
+defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
+                                        int_x86_avx2_pmovzxbw>, VEX, VEX_L;
+defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
+                                        int_x86_avx2_pmovzxwd>, VEX, VEX_L;
+defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
+                                        int_x86_avx2_pmovzxdq>, VEX, VEX_L;
+}
+
+defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
+defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
+defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
+defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
+defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
+defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
+
+let Predicates = [HasAVX] in {
+  // Common patterns involving scalar load.
+  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+            (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (VPMOVSXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (VPMOVSXWDrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (VPMOVSXDQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (VPMOVZXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (VPMOVZXWDrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (VPMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  // Common patterns involving scalar load.
+  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+            (PMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+            (PMOVSXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (PMOVSXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (PMOVSXWDrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (PMOVSXDQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+            (PMOVZXBWrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+            (PMOVZXWDrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+            (PMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+  let AddedComplexity = 15 in {
+    def : Pat<(v4i64 (X86vzmovly (v4i32 VR128:$src))),
+              (VPMOVZXDQYrr VR128:$src)>;
+    def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
+              (VPMOVZXWDYrr VR128:$src)>;
+  }
+
+  def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+  def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+}
+
+
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+       [(set VR128:$dst,
+         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+          OpSize;
+}
+
+multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId> {
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+       [(set VR256:$dst,
+         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
+          OpSize;
+}
+
+let Predicates = [HasAVX] in {
+defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>,
+                                     VEX;
+defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>,
+                                     VEX;
+defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd>,
+                                     VEX;
+defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>,
+                                     VEX;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
+                                       int_x86_avx2_pmovsxbd>, VEX, VEX_L;
+defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
+                                       int_x86_avx2_pmovsxwq>, VEX, VEX_L;
+defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
+                                       int_x86_avx2_pmovzxbd>, VEX, VEX_L;
+defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
+                                       int_x86_avx2_pmovzxwq>, VEX, VEX_L;
+}
+
+defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
+defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
+defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
+defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
+
+let Predicates = [HasAVX] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+            (VPMOVSXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+            (VPMOVSXWQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+            (VPMOVZXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+            (VPMOVZXWQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+            (PMOVSXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+            (PMOVSXWQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+            (PMOVZXBDrm addr:$src)>;
+  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+            (PMOVZXWQrm addr:$src)>;
+}
+
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+  // Expecting a i16 load any extended to i32 value.
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId (bitconvert
+                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
+                 OpSize;
+}
+
+multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId> {
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+
+  // Expecting a i16 load any extended to i32 value.
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId (bitconvert
+                      (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+                  OpSize;
+}
+
+let Predicates = [HasAVX] in {
+defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>,
+                                     VEX;
+defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
+                                     VEX;
+}
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
+                                       int_x86_avx2_pmovsxbq>, VEX, VEX_L;
+defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
+                                       int_x86_avx2_pmovzxbq>, VEX, VEX_L;
+}
+defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
+defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
+  def : Pat<(v8i32  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>;
+
+  def : Pat<(v8i32  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>;
+
+  def : Pat<(v4i64  (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+
+  def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))),
+            (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))),
+            (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))),
+            (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))),
+            (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))),
+            (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
+            (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
+            (VPMOVSXWDYrm addr:$src)>;
+  def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
+            (VPMOVSXDQYrm addr:$src)>;
+
+  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXBDYrm addr:$src)>;
+  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXBDYrm addr:$src)>;
+
+  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXWQYrm addr:$src)>;
+  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXWQYrm addr:$src)>;
+
+  def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBQYrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXBQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+
+  // Common patterns involving scalar load
+  def : Pat<(int_x86_sse41_pmovsxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXBQrm addr:$src)>;
+
+  def : Pat<(int_x86_sse41_pmovzxbq
+              (bitconvert (v4i32 (X86vzmovl
+                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVSXWQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
+            (PMOVSXBQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXDQrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVSXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVSXBWrm addr:$src)>;
+}
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
+  def : Pat<(v8i32  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;
+
+  def : Pat<(v8i32  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
+  def : Pat<(v4i64  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;
+
+  def : Pat<(v4i64  (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;
+
+  def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))),
+            (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))),
+            (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))),
+            (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))),
+            (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))),
+            (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))),
+            (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;
+
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXBWrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+            (VPMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVZXWQrm addr:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+            (VPMOVZXDQrm addr:$src)>;
+
+  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXDQrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+                    (scalar_to_vector (loadi64 addr:$src))))))),
+            (VPMOVSXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+                    (scalar_to_vector (loadf64 addr:$src))))))),
+            (VPMOVSXBWrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+                    (scalar_to_vector (loadi32 addr:$src))))))),
+            (VPMOVSXWQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
+            (VPMOVSXBQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;
+
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXBWrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXBDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+            (PMOVZXBQrm addr:$src)>;
+
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXWDrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+            (PMOVZXWQrm addr:$src)>;
+
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+            (PMOVZXDQrm addr:$src)>;
+  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+            (PMOVZXDQrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Extract Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+                 OpSize;
+  let neverHasSideEffects = 1, mayStore = 1 in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+let Predicates = [HasAVX] in {
+  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
+  def  VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst),
+         (ins VR128:$src1, i32i8imm:$src2),
+         "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX;
+}
+
+defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+  let neverHasSideEffects = 1, mayStore = 1 in
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
+
+defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32:$dst,
+                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+                          addr:$dst)]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
+
+defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR64:$dst,
+                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+                          addr:$dst)]>, OpSize, REX_W;
+}
+
+let Predicates = [HasAVX] in
+  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
+
+defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
+
+/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
+/// destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32:$dst,
+                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+           OpSize;
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+                          addr:$dst)]>, OpSize;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  let Predicates = [HasAVX] in {
+    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+    def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
+                    (ins VR128:$src1, i32i8imm:$src2),
+                    "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    []>, OpSize, VEX;
+  }
+  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
+}
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+                                              imm:$src2))),
+                 addr:$dst),
+          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+          Requires<[HasAVX]>;
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+                                              imm:$src2))),
+                 addr:$dst),
+          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+          Requires<[UseSSE41]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Insert Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+                   imm:$src3))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
+
+multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+      OpSize;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+                          imm:$src3)))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
+
+multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+      OpSize;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+                          imm:$src3)))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
+let Constraints = "$src1 = $dst" in
+  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
+
+// insertps has a few different modes, there's the first two here below which
+// are optimized inserts that won't zero arbitrary elements in the destination
+// vector. The next one matches the intrinsic and could zero arbitrary elements
+// in the target vector.
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
+  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
+      OpSize;
+  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
+      !if(Is2Addr,
+        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        !strconcat(asm,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      [(set VR128:$dst,
+        (X86insrtps VR128:$src1,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+                    imm:$src3))]>, OpSize;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  let Predicates = [HasAVX] in
+    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+  let Constraints = "$src1 = $dst" in
+    defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Round Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+                            X86MemOperand x86memop, RegisterClass RC,
+                            PatFrag mem_frag32, PatFrag mem_frag64,
+                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
+let ExeDomain = SSEPackedSingle in {
+  // Intrinsic operation, reg.
+  // Vector intrinsic operation, reg
+  def PSr : SS4AIi8<opcps, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>,
+                    OpSize;
+
+  // Vector intrinsic operation, mem
+  def PSm : SS4AIi8<opcps, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
+                    OpSize;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+  // Vector intrinsic operation, reg
+  def PDr : SS4AIi8<opcpd, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>,
+                    OpSize;
+
+  // Vector intrinsic operation, mem
+  def PDm : SS4AIi8<opcpd, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
+                    OpSize;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+                            string OpcodeStr,
+                            Intrinsic F32Int,
+                            Intrinsic F64Int, bit Is2Addr = 1> {
+let ExeDomain = GenericDomain in {
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SSr : SS4AIi8<opcss, MRMSrcReg,
+      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
+      !if(Is2Addr,
+          !strconcat(OpcodeStr,
+              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          !strconcat(OpcodeStr,
+              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+      []>, OpSize;
+
+  // Intrinsic operation, reg.
+  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+        OpSize;
+
+  // Intrinsic operation, mem.
+  def SSm : SS4AIi8<opcss, MRMSrcMem,
+        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst,
+             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+        OpSize;
+
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SDr : SS4AIi8<opcsd, MRMSrcReg,
+        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        []>, OpSize;
+
+  // Intrinsic operation, reg.
+  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+        OpSize;
+
+  // Intrinsic operation, mem.
+  def SDm : SS4AIi8<opcsd, MRMSrcMem,
+        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set VR128:$dst,
+              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+        OpSize;
+} // ExeDomain = GenericDomain
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
+let Predicates = [HasAVX] in {
+  // Intrinsic form
+  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
+                                  memopv4f32, memopv2f64,
+                                  int_x86_sse41_round_ps,
+                                  int_x86_sse41_round_pd>, VEX;
+  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
+                                  memopv8f32, memopv4f64,
+                                  int_x86_avx_round_ps_256,
+                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
+  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
+                                  int_x86_sse41_round_ss,
+                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+
+  def : Pat<(ffloor FR32:$src),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+  def : Pat<(f64 (ffloor FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+  def : Pat<(f32 (fnearbyint FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+  def : Pat<(f64 (fnearbyint FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+  def : Pat<(f32 (fceil FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+  def : Pat<(f64 (fceil FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+  def : Pat<(f32 (frint FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+  def : Pat<(f64 (frint FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+  def : Pat<(f32 (ftrunc FR32:$src)),
+            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+  def : Pat<(f64 (ftrunc FR64:$src)),
+            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x1))>;
+  def : Pat<(v4f32 (fnearbyint VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0xC))>;
+  def : Pat<(v4f32 (fceil VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x2))>;
+  def : Pat<(v4f32 (frint VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x4))>;
+  def : Pat<(v4f32 (ftrunc VR128:$src)),
+            (VROUNDPSr VR128:$src, (i32 0x3))>;
+
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x1))>;
+  def : Pat<(v2f64 (fnearbyint VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0xC))>;
+  def : Pat<(v2f64 (fceil VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x2))>;
+  def : Pat<(v2f64 (frint VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x4))>;
+  def : Pat<(v2f64 (ftrunc VR128:$src)),
+            (VROUNDPDr VR128:$src, (i32 0x3))>;
+
+  def : Pat<(v8f32 (ffloor VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x1))>;
+  def : Pat<(v8f32 (fnearbyint VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0xC))>;
+  def : Pat<(v8f32 (fceil VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x2))>;
+  def : Pat<(v8f32 (frint VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x4))>;
+  def : Pat<(v8f32 (ftrunc VR256:$src)),
+            (VROUNDYPSr VR256:$src, (i32 0x3))>;
+
+  def : Pat<(v4f64 (ffloor VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x1))>;
+  def : Pat<(v4f64 (fnearbyint VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0xC))>;
+  def : Pat<(v4f64 (fceil VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x2))>;
+  def : Pat<(v4f64 (frint VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x4))>;
+  def : Pat<(v4f64 (ftrunc VR256:$src)),
+            (VROUNDYPDr VR256:$src, (i32 0x3))>;
+}
+
+defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
+                               memopv4f32, memopv2f64,
+                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+let Constraints = "$src1 = $dst" in
+defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(ffloor FR32:$src),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+  def : Pat<(f64 (ffloor FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+  def : Pat<(f32 (fnearbyint FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+  def : Pat<(f64 (fnearbyint FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+  def : Pat<(f32 (fceil FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+  def : Pat<(f64 (fceil FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+  def : Pat<(f32 (frint FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+  def : Pat<(f64 (frint FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+  def : Pat<(f32 (ftrunc FR32:$src)),
+            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+  def : Pat<(f64 (ftrunc FR64:$src)),
+            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+  def : Pat<(v4f32 (ffloor VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x1))>;
+  def : Pat<(v4f32 (fnearbyint VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0xC))>;
+  def : Pat<(v4f32 (fceil VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x2))>;
+  def : Pat<(v4f32 (frint VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x4))>;
+  def : Pat<(v4f32 (ftrunc VR128:$src)),
+            (ROUNDPSr VR128:$src, (i32 0x3))>;
+
+  def : Pat<(v2f64 (ffloor VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x1))>;
+  def : Pat<(v2f64 (fnearbyint VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0xC))>;
+  def : Pat<(v2f64 (fceil VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x2))>;
+  def : Pat<(v2f64 (frint VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x4))>;
+  def : Pat<(v2f64 (ftrunc VR128:$src)),
+            (ROUNDPDr VR128:$src, (i32 0x3))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Bit Test
+//===----------------------------------------------------------------------===//
+
+// ptest instruction we'll lower to this in X86ISelLowering primarily from
+// the intel intrinsic that corresponds to this.
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+                OpSize, VEX;
+def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS,(X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+                OpSize, VEX;
+
+def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
+                OpSize, VEX, VEX_L;
+def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}",
+                [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
+                OpSize, VEX, VEX_L;
+}
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+              "ptest\t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
+              OpSize;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+              "ptest\t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+              OpSize;
+}
+
+// The bit test instructions below are AVX only
+multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
+  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, OpSize, VEX;
+  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
+            OpSize, VEX;
+}
+
+let Defs = [EFLAGS], Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedSingle in {
+defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
+                            VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
+                            VEX_L;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Misc Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+                     OpSize, XS;
+  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                     "popcnt{w}\t{$src, $dst|$dst, $src}",
+                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+                      (implicit EFLAGS)]>, OpSize, XS;
+
+  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                     "popcnt{l}\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+                     XS;
+  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                     "popcnt{l}\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
+                      (implicit EFLAGS)]>, XS;
+
+  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                      "popcnt{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+                      XS;
+  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                      "popcnt{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
+                       (implicit EFLAGS)]>, XS;
+}
+
+
+
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId128> {
+  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
+  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                     (ins i128mem:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set VR128:$dst,
+                       (IntId128
+                        (bitconvert (memopv2i64 addr:$src))))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
+                                         int_x86_sse41_phminposuw>, VEX;
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+                                         int_x86_sse41_phminposuw>;
+
+/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic IntId128, bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+       (ins VR128:$src1, VR128:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize;
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+       (ins VR128:$src1, i128mem:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst,
+         (IntId128 VR128:$src1,
+          (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
+}
+
+/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
+multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+                                Intrinsic IntId256> {
+  let isCommutable = 1 in
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
+       (ins VR256:$src1, VR256:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, OpSize;
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
+       (ins VR256:$src1, i256mem:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst,
+         (IntId256 VR256:$src1,
+          (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+}
+
+
+/// SS48I_binop_rm - Simple SSE41 binary operator.
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                          X86MemOperand x86memop, bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, OpSize;
+  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst,
+         (OpVT (OpNode RC:$src1,
+          (bitconvert (memop_frag addr:$src2)))))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in {
+  let isCommutable = 0 in
+  defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
+                                                         0>, VEX_4V;
+  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
+                                  memopv2i64, i128mem, 0>, VEX_4V;
+  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
+                                  memopv2i64, i128mem, 0>, VEX_4V;
+  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
+                                  memopv2i64, i128mem, 0>, VEX_4V;
+  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
+                                  memopv2i64, i128mem, 0>, VEX_4V;
+  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
+                                  memopv2i64, i128mem, 0>, VEX_4V;
+  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
+                                  memopv2i64, i128mem, 0>, VEX_4V;
+  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
+                                  memopv2i64, i128mem, 0>, VEX_4V;
+  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
+                                  memopv2i64, i128mem, 0>, VEX_4V;
+  defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq",   int_x86_sse41_pmuldq,
+                                                         0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+  let isCommutable = 0 in
+  defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
+                                        int_x86_avx2_packusdw>, VEX_4V, VEX_L;
+  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VPMULDQ   : SS41I_binop_rm_int_y<0x28, "vpmuldq",
+                                        int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  let isCommutable = 0 in
+  defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
+  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
+                                 memopv2i64, i128mem>;
+  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
+                                 memopv2i64, i128mem>;
+  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
+                                 memopv2i64, i128mem>;
+  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
+                                 memopv2i64, i128mem>;
+  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
+                                 memopv2i64, i128mem>;
+  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
+                                 memopv2i64, i128mem>;
+  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
+                                 memopv2i64, i128mem>;
+  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
+                                 memopv2i64, i128mem>;
+  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq>;
+}
+
+let Predicates = [HasAVX] in {
+  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
+                                memopv2i64, i128mem, 0>, VEX_4V;
+  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
+                                 memopv2i64, i128mem, 0>, VEX_4V;
+}
+let Predicates = [HasAVX2] in {
+  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
+                                memopv2i64, i128mem>;
+  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
+                                memopv2i64, i128mem>;
+}
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+                 X86MemOperand x86memop, bit Is2Addr = 1> {
+  let isCommutable = 1 in
+  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+        OpSize;
+  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
+        !if(Is2Addr,
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+            !strconcat(OpcodeStr,
+                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+        [(set RC:$dst,
+          (IntId RC:$src1,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+        OpSize;
+}
+
+let Predicates = [HasAVX] in {
+  let isCommutable = 0 in {
+    let ExeDomain = SSEPackedSingle in {
+    defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
+                                        VR128, memopv4f32, f128mem, 0>, VEX_4V;
+    defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
+                                    int_x86_avx_blend_ps_256, VR256, memopv8f32,
+                                    f256mem, 0>, VEX_4V, VEX_L;
+    }
+    let ExeDomain = SSEPackedDouble in {
+    defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
+                                        VR128, memopv2f64, f128mem, 0>, VEX_4V;
+    defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
+                                     int_x86_avx_blend_pd_256,VR256, memopv4f64,
+                                     f256mem, 0>, VEX_4V, VEX_L;
+    }
+  defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
+                                      VR128, memopv2i64, i128mem, 0>, VEX_4V;
+  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
+                                      VR128, memopv2i64, i128mem, 0>, VEX_4V;
+  }
+  let ExeDomain = SSEPackedSingle in
+  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
+                                   VR128, memopv4f32, f128mem, 0>, VEX_4V;
+  let ExeDomain = SSEPackedDouble in
+  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
+                                   VR128, memopv2f64, f128mem, 0>, VEX_4V;
+  let ExeDomain = SSEPackedSingle in
+  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
+                                  VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2] in {
+  let isCommutable = 0 in {
+  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
+                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
+                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+  }
+}
+
+let Constraints = "$src1 = $dst" in {
+  let isCommutable = 0 in {
+  let ExeDomain = SSEPackedSingle in
+  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
+                                     VR128, memopv4f32, f128mem>;
+  let ExeDomain = SSEPackedDouble in
+  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
+                                     VR128, memopv2f64, f128mem>;
+  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
+                                     VR128, memopv2i64, i128mem>;
+  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
+                                     VR128, memopv2i64, i128mem>;
+  }
+  let ExeDomain = SSEPackedSingle in
+  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
+                                  VR128, memopv4f32, f128mem>;
+  let ExeDomain = SSEPackedDouble in
+  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
+                                  VR128, memopv2f64, f128mem>;
+}
+
+/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
+multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
+                                    RegisterClass RC, X86MemOperand x86memop,
+                                    PatFrag mem_frag, Intrinsic IntId> {
+  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
+                  (ins RC:$src1, RC:$src2, RC:$src3),
+                  !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+                  NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+
+  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
+                  (ins RC:$src1, x86memop:$src2, RC:$src3),
+                  !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                  [(set RC:$dst,
+                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+                               RC:$src3))],
+                  NoItinerary, SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
+}
+
+let Predicates = [HasAVX] in {
+let ExeDomain = SSEPackedDouble in {
+defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
+                                           memopv2f64, int_x86_sse41_blendvpd>;
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
+                                  memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
+} // ExeDomain = SSEPackedDouble
+let ExeDomain = SSEPackedSingle in {
+defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
+                                           memopv4f32, int_x86_sse41_blendvps>;
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
+                                  memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
+} // ExeDomain = SSEPackedSingle
+defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
+                                           memopv2i64, int_x86_sse41_pblendvb>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
+                                      memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+                            (v4i32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
+                            (v4f32 VR128:$src2))),
+            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+                            (v2i64 VR128:$src2))),
+            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
+                            (v2f64 VR128:$src2))),
+            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+                            (v8i32 VR256:$src2))),
+            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
+                            (v8f32 VR256:$src2))),
+            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+                            (v4i64 VR256:$src2))),
+            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
+                            (v4f64 VR256:$src2))),
+            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+
+  def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2),
+                               (imm:$mask))),
+            (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+  def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2),
+                               (imm:$mask))),
+            (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+
+  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+                               (imm:$mask))),
+            (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+                               (imm:$mask))),
+            (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+                               (imm:$mask))),
+            (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+}
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
+                            (v32i8 VR256:$src2))),
+            (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>;
+  def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
+                               (imm:$mask))),
+            (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+}
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+                               X86MemOperand x86memop, Intrinsic IntId> {
+    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{$src2, $dst|$dst, $src2}"),
+                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+                    OpSize;
+
+    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins VR128:$src1, x86memop:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{$src2, $dst|$dst, $src2}"),
+                    [(set VR128:$dst,
+                      (IntId VR128:$src1,
+                       (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize;
+  }
+}
+
+let ExeDomain = SSEPackedDouble in
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
+                                  int_x86_sse41_blendvpd>;
+let ExeDomain = SSEPackedSingle in
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
+                                  int_x86_sse41_blendvps>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
+                                  int_x86_sse41_pblendvb>;
+
+// Aliases with the implicit xmm0 argument
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+
+let Predicates = [UseSSE41] in {
+  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
+                            (v16i8 VR128:$src2))),
+            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
+                            (v4i32 VR128:$src2))),
+            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
+                            (v4f32 VR128:$src2))),
+            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
+                            (v2i64 VR128:$src2))),
+            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
+                            (v2f64 VR128:$src2))),
+            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+
+  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
+                               (imm:$mask))),
+            (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
+                               (imm:$mask))),
+            (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
+  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
+                               (imm:$mask))),
+            (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+
+}
+
+let Predicates = [HasAVX] in
+def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                       "vmovntdqa\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+                       OpSize, VEX;
+let Predicates = [HasAVX2] in
+def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                         "vmovntdqa\t{$src, $dst|$dst, $src}",
+                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
+                         OpSize, VEX, VEX_L;
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                       "movntdqa\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+                       OpSize;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - Compare Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS42I_binop_rm - Simple SSE 4.2 binary operator
+multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                          X86MemOperand x86memop, bit Is2Addr = 1> {
+  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
+       OpSize;
+  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst,
+         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, OpSize;
+}
+
+let Predicates = [HasAVX] in
+  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
+                                 memopv2i64, i128mem, 0>, VEX_4V;
+
+let Predicates = [HasAVX2] in
+  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
+                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+
+let Constraints = "$src1 = $dst" in
+  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
+                                memopv2i64, i128mem>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - String/text Processing Instructions
+//===----------------------------------------------------------------------===//
+
+// Packed Compare Implicit Length Strings, Return Mask
+multiclass pseudo_pcmpistrm<string asm> {
+  def REG : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
+                                                  imm:$src3))]>;
+  def MEM : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+                       (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
+  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
+}
+
+multiclass pcmpistrm_SS42AI<string asm> {
+  def rr : SS42AI<0x62, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, OpSize;
+  let mayLoad = 1 in
+  def rm :SS42AI<0x62, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, OpSize;
+}
+
+let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
+}
+
+// Packed Compare Explicit Length Strings, Return Mask
+multiclass pseudo_pcmpestrm<string asm> {
+  def REG : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
+                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+  def MEM : PseudoI<(outs VR128:$dst),
+                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+                       (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
+  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestrm<string asm> {
+  def rr : SS42AI<0x60, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, OpSize;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x60, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, OpSize;
+}
+
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+  defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
+}
+
+// Packed Compare Implicit Length Strings, Return Index
+multiclass pseudo_pcmpistri<string asm> {
+  def REG : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+  def MEM : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+                              (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
+}
+
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
+  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+  def rr : SS42AI<0x63, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, OpSize;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x63, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+    []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
+}
+
+// Packed Compare Explicit Length Strings, Return Index
+multiclass pseudo_pcmpestri<string asm> {
+  def REG : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+  def MEM : PseudoI<(outs GR32:$dst),
+                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    [(set GR32:$dst, EFLAGS,
+      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
+       imm:$src5))]>;
+}
+
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
+  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+  def rr : SS42AI<0x61, MRMSrcReg, (outs),
+    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, OpSize;
+  let mayLoad = 1 in
+  def rm : SS42AI<0x61, MRMSrcMem, (outs),
+    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+    []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+  let Predicates = [HasAVX] in
+  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 - CRC Instructions
+//===----------------------------------------------------------------------===//
+
+// No CRC instructions have AVX equivalents
+
+// crc intrinsic instruction
+// This set of instructions are only rm, the only difference is the size
+// of r and m.
+let Constraints = "$src1 = $dst" in {
+  def CRC32r32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
+                      (ins GR32:$src1, i8mem:$src2),
+                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_32_8 GR32:$src1,
+                         (load addr:$src2)))]>;
+  def CRC32r32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
+                      (ins GR32:$src1, GR8:$src2),
+                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
+  def CRC32r32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+                      (ins GR32:$src1, i16mem:$src2),
+                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_32_16 GR32:$src1,
+                         (load addr:$src2)))]>,
+                         OpSize;
+  def CRC32r32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+                      (ins GR32:$src1, GR16:$src2),
+                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
+                         OpSize;
+  def CRC32r32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+                      (ins GR32:$src1, i32mem:$src2),
+                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_32_32 GR32:$src1,
+                         (load addr:$src2)))]>;
+  def CRC32r32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+                      (ins GR32:$src1, GR32:$src2),
+                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
+  def CRC32r64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
+                      (ins GR64:$src1, i8mem:$src2),
+                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR64:$dst,
+                         (int_x86_sse42_crc32_64_8 GR64:$src1,
+                         (load addr:$src2)))]>,
+                         REX_W;
+  def CRC32r64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
+                      (ins GR64:$src1, GR8:$src2),
+                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR64:$dst,
+                         (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
+                         REX_W;
+  def CRC32r64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
+                      (ins GR64:$src1, i64mem:$src2),
+                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR64:$dst,
+                         (int_x86_sse42_crc32_64_64 GR64:$src1,
+                         (load addr:$src2)))]>,
+                         REX_W;
+  def CRC32r64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
+                      (ins GR64:$src1, GR64:$src2),
+                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
+                       [(set GR64:$dst,
+                         (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
+                         REX_W;
+}
+
+//===----------------------------------------------------------------------===//
+// AES-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic IntId128, bit Is2Addr = 1> {
+  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
+       (ins VR128:$src1, VR128:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+       OpSize;
+  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
+       (ins VR128:$src1, i128mem:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set VR128:$dst,
+         (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>, OpSize;
+}
+
+// Perform One Round of an AES Encryption/Decryption Flow
+let Predicates = [HasAVX, HasAES] in {
+  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
+                         int_x86_aesni_aesenc, 0>, VEX_4V;
+  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
+                         int_x86_aesni_aesenclast, 0>, VEX_4V;
+  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
+                         int_x86_aesni_aesdec, 0>, VEX_4V;
+  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
+                         int_x86_aesni_aesdeclast, 0>, VEX_4V;
+}
+
+let Constraints = "$src1 = $dst" in {
+  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
+                         int_x86_aesni_aesenc>;
+  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
+                         int_x86_aesni_aesenclast>;
+  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
+                         int_x86_aesni_aesdec>;
+  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
+                         int_x86_aesni_aesdeclast>;
+}
+
+// Perform the AES InvMixColumn Transformation
+let Predicates = [HasAVX, HasAES] in {
+  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1),
+      "vaesimc\t{$src1, $dst|$dst, $src1}",
+      [(set VR128:$dst,
+        (int_x86_aesni_aesimc VR128:$src1))]>,
+      OpSize, VEX;
+  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+      (ins i128mem:$src1),
+      "vaesimc\t{$src1, $dst|$dst, $src1}",
+      [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+      OpSize, VEX;
+}
+def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
+  (ins VR128:$src1),
+  "aesimc\t{$src1, $dst|$dst, $src1}",
+  [(set VR128:$dst,
+    (int_x86_aesni_aesimc VR128:$src1))]>,
+  OpSize;
+def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
+  (ins i128mem:$src1),
+  "aesimc\t{$src1, $dst|$dst, $src1}",
+  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+  OpSize;
+
+// AES Round Key Generation Assist
+let Predicates = [HasAVX, HasAES] in {
+  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+      (ins VR128:$src1, i8imm:$src2),
+      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(set VR128:$dst,
+        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+      OpSize, VEX;
+  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+      (ins i128mem:$src1, i8imm:$src2),
+      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(set VR128:$dst,
+        (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+      OpSize, VEX;
+}
+def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
+  (ins VR128:$src1, i8imm:$src2),
+  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+  [(set VR128:$dst,
+    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
+  OpSize;
+def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
+  (ins i128mem:$src1, i8imm:$src2),
+  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+  [(set VR128:$dst,
+    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+  OpSize;
+
+//===----------------------------------------------------------------------===//
+// PCLMUL Instructions
+//===----------------------------------------------------------------------===//
+
+// AVX carry-less Multiplication instructions
+def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           [(set VR128:$dst,
+             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
+
+def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+                              (memopv2i64 addr:$src2), imm:$src3))]>;
+
+// Carry-less Multiplication instructions
+let Constraints = "$src1 = $dst" in {
+def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           [(set VR128:$dst,
+             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
+
+def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
+                              (memopv2i64 addr:$src2), imm:$src3))]>;
+} // Constraints = "$src1 = $dst"
+
+
+multiclass pclmul_alias<string asm, int immop> {
+  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;
+
+  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
+                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;
+
+  def : InstAlias<!strconcat("vpclmul", asm,
+                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;
+
+  def : InstAlias<!strconcat("vpclmul", asm,
+                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
+}
+defm : pclmul_alias<"hqhq", 0x11>;
+defm : pclmul_alias<"hqlq", 0x01>;
+defm : pclmul_alias<"lqhq", 0x10>;
+defm : pclmul_alias<"lqlq", 0x00>;
+
+//===----------------------------------------------------------------------===//
+// SSE4A Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasSSE4A] in {
+
+let Constraints = "$src = $dst" in {
+def EXTRQI : Ii8<0x78, MRM0r, (outs VR128:$dst),
+                 (ins VR128:$src, i8imm:$len, i8imm:$idx),
+                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
+                 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
+                                    imm:$idx))]>, TB, OpSize;
+def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
+              (ins VR128:$src, VR128:$mask),
+              "extrq\t{$mask, $src|$src, $mask}",
+              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
+                                 VR128:$mask))]>, TB, OpSize;
+
+def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
+                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
+                   [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
+                                      VR128:$src2, imm:$len, imm:$idx))]>, XD;
+def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
+                 (ins VR128:$src, VR128:$mask),
+                 "insertq\t{$mask, $src|$src, $mask}",
+                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
+                                    VR128:$mask))]>, XD;
+}
+
+def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+                "movntss\t{$src, $dst|$dst, $src}",
+                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
+
+def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                "movntsd\t{$src, $dst|$dst, $src}",
+                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VBROADCAST - Load from memory and broadcast to all elements of the
+//              destination operand
+//
+class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                    X86MemOperand x86memop, Intrinsic Int> :
+  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+        [(set RC:$dst, (Int addr:$src))]>, VEX;
+
+// AVX2 adds register forms
+class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                         Intrinsic Int> :
+  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+         [(set RC:$dst, (Int VR128:$src))]>, VEX;
+
+let ExeDomain = SSEPackedSingle in {
+  def VBROADCASTSSrm  : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
+                                      int_x86_avx_vbroadcast_ss>;
+  def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
+                                      int_x86_avx_vbroadcast_ss_256>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in
+def VBROADCASTSDYrm  : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
+                                    int_x86_avx_vbroadcast_sd_256>, VEX_L;
+def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
+                                   int_x86_avx_vbroadcastf128_pd_256>, VEX_L;
+
+let ExeDomain = SSEPackedSingle in {
+  def VBROADCASTSSrr  : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
+                                           int_x86_avx2_vbroadcast_ss_ps>;
+  def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
+                                      int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in
+def VBROADCASTSDYrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
+                                      int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L;
+
+let Predicates = [HasAVX2] in
+def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
+                                   int_x86_avx2_vbroadcasti128>, VEX_L;
+
+let Predicates = [HasAVX] in
+def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
+          (VBROADCASTF128 addr:$src)>;
+
+
+//===----------------------------------------------------------------------===//
+// VINSERTF128 - Insert packed floating-point values
+//
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
+          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
+          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTF128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTF128 - Extract packed floating-point values
+//
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
+          (ins VR256:$src1, i8imm:$src2),
+          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, VEX, VEX_L;
+let mayStore = 1 in
+def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
+          (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
+          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, VEX, VEX_L;
+}
+
+// AVX1 patterns
+let Predicates = [HasAVX] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v4f32 (VEXTRACTF128rr
+                    (v8f32 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v2f64 (VEXTRACTF128rr
+                    (v4f64 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v2i64 (VEXTRACTF128rr
+                  (v4i64 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v4i32 (VEXTRACTF128rr
+                  (v8i32 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v8i16 (VEXTRACTF128rr
+                  (v16i16 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v16i8 (VEXTRACTF128rr
+                  (v32i8 VR256:$src1),
+                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTF128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VMASKMOV - Conditional SIMD Packed Loads and Stores
+//
+multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
+                          Intrinsic IntLd, Intrinsic IntLd256,
+                          Intrinsic IntSt, Intrinsic IntSt256> {
+  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, f128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
+             VEX_4V;
+  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, f256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+             VEX_4V, VEX_L;
+  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
+             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
+             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
+                                 int_x86_avx_maskload_ps,
+                                 int_x86_avx_maskload_ps_256,
+                                 int_x86_avx_maskstore_ps,
+                                 int_x86_avx_maskstore_ps_256>;
+let ExeDomain = SSEPackedDouble in
+defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
+                                 int_x86_avx_maskload_pd,
+                                 int_x86_avx_maskload_pd_256,
+                                 int_x86_avx_maskstore_pd,
+                                 int_x86_avx_maskstore_pd_256>;
+
+//===----------------------------------------------------------------------===//
+// VPERMIL - Permute Single and Double Floating-Point Values
+//
+multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
+                      RegisterClass RC, X86MemOperand x86memop_f,
+                      X86MemOperand x86memop_i, PatFrag i_frag,
+                      Intrinsic IntVar, ValueType vt> {
+  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V;
+  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, x86memop_i:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (IntVar RC:$src1,
+                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V;
+
+  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, i8imm:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX;
+  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
+             (ins x86memop_f:$src1, i8imm:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+               (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX;
+}
+
+let ExeDomain = SSEPackedSingle in {
+  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
+                               memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
+  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
+                       memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
+                               memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
+  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
+                       memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
+          (VPERMILPDYri VR256:$src1, imm:$imm)>;
+def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)),
+                               (i8 imm:$imm))),
+          (VPERMILPSYmi addr:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))),
+          (VPERMILPDYmi addr:$src1, imm:$imm)>;
+
+def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
+          (VPERMILPDri VR128:$src1, imm:$imm)>;
+def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))),
+          (VPERMILPDmi addr:$src1, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
+let ExeDomain = SSEPackedSingle in {
+def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+                              (i8 imm:$src3))))]>, VEX_4V, VEX_L;
+def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
+          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
+                             (i8 imm:$src3)))]>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
+                  (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
+                  (memopv4i64 addr:$src2), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
+                  (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+                  (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+//===----------------------------------------------------------------------===//
+// VZERO - Zero YMM registers
+//
+let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
+  // Zero All YMM registers
+  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
+                  [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>;
+
+  // Zero Upper bits of YMM registers
+  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
+                     [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}",
+             [(set RC:$dst, (Int VR128:$src))]>,
+             T8, OpSize, VEX;
+  let neverHasSideEffects = 1, mayLoad = 1 in
+  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+}
+
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+               (ins RC:$src1, i32i8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+               TA, OpSize, VEX;
+  let neverHasSideEffects = 1, mayStore = 1 in
+  def mr : Ii8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+               TA, OpSize, VEX;
+}
+
+let Predicates = [HasAVX, HasF16C] in {
+  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
+  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
+  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
+  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX2 Instructions
+//===----------------------------------------------------------------------===//
+
+/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate
+multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
+                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+                 X86MemOperand x86memop> {
+  let isCommutable = 1 in
+  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
+        !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+        VEX_4V;
+  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
+        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
+        !strconcat(OpcodeStr,
+            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set RC:$dst,
+          (IntId RC:$src1,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+        VEX_4V;
+}
+
+let isCommutable = 0 in {
+defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
+                                   VR128, memopv2i64, i128mem>;
+defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
+                                    VR256, memopv4i64, i256mem>, VEX_L;
+}
+
+def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
+                  imm:$mask)),
+          (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>;
+def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2),
+                  imm:$mask)),
+          (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
+
+//===----------------------------------------------------------------------===//
+// VPBROADCAST - Load from memory and broadcast to all elements of the
+//               destination operand
+//
+multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
+                          X86MemOperand x86memop, PatFrag ld_frag,
+                          Intrinsic Int128, Intrinsic Int256> {
+  def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR128:$dst, (Int128 VR128:$src))]>, VEX;
+  def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR128:$dst,
+                    (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
+  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR256:$dst,
+                    (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
+                   VEX, VEX_L;
+}
+
+defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
+                                    int_x86_avx2_pbroadcastb_128,
+                                    int_x86_avx2_pbroadcastb_256>;
+defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
+                                    int_x86_avx2_pbroadcastw_128,
+                                    int_x86_avx2_pbroadcastw_256>;
+defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
+                                    int_x86_avx2_pbroadcastd_128,
+                                    int_x86_avx2_pbroadcastd_256>;
+defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
+                                    int_x86_avx2_pbroadcastq_128,
+                                    int_x86_avx2_pbroadcastq_256>;
+
+let Predicates = [HasAVX2] in {
+  def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
+          (VPBROADCASTBrm addr:$src)>;
+  def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
+          (VPBROADCASTBYrm addr:$src)>;
+  def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
+          (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
+          (VPBROADCASTWYrm addr:$src)>;
+  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VPBROADCASTDrm addr:$src)>;
+  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VPBROADCASTDYrm addr:$src)>;
+  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VPBROADCASTQrm addr:$src)>;
+  def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VPBROADCASTQYrm addr:$src)>;
+
+  def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
+          (VPBROADCASTBrr VR128:$src)>;
+  def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
+          (VPBROADCASTBYrr VR128:$src)>;
+  def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
+          (VPBROADCASTWrr VR128:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
+          (VPBROADCASTWYrr VR128:$src)>;
+  def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
+          (VPBROADCASTDrr VR128:$src)>;
+  def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
+          (VPBROADCASTDYrr VR128:$src)>;
+  def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
+          (VPBROADCASTQrr VR128:$src)>;
+  def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
+          (VPBROADCASTQYrr VR128:$src)>;
+  def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
+          (VBROADCASTSSrr VR128:$src)>;
+  def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
+          (VBROADCASTSSYrr VR128:$src)>;
+  def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
+          (VPBROADCASTQrr VR128:$src)>;
+  def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
+          (VBROADCASTSDYrr VR128:$src)>;
+
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+              (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+              (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
+    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+              (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+              (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+              (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+              (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+  }
+}
+
+// AVX1 broadcast patterns
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSYrm addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+          (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+          (VBROADCASTSSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
+          (VBROADCASTSSYrm addr:$src)>;
+def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
+          (VBROADCASTSSrm addr:$src)>;
+
+  // Provide fallback in case the load node that is used in the patterns above
+  // is used by additional users, which prevents the pattern selection.
+  let AddedComplexity = 20 in {
+  // 128bit broadcasts:
+  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
+            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+
+  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
+            (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
+  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
+  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
+              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// VPERM - Permute instructions
+//
+
+multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+                     ValueType OpVT> {
+  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+                   (ins VR256:$src1, VR256:$src2),
+                   !strconcat(OpcodeStr,
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set VR256:$dst,
+                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+                   VEX_4V, VEX_L;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+                   (ins VR256:$src1, i256mem:$src2),
+                   !strconcat(OpcodeStr,
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set VR256:$dst,
+                     (OpVT (X86VPermv VR256:$src1,
+                            (bitconvert (mem_frag addr:$src2)))))]>,
+                   VEX_4V, VEX_L;
+}
+
+defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, v8f32>;
+
+multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+                         ValueType OpVT> {
+  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
+                     (ins VR256:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set VR256:$dst,
+                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+                     VEX, VEX_L;
+  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
+                     (ins i256mem:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set VR256:$dst,
+                       (OpVT (X86VPermi (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>, VEX, VEX_L;
+}
+
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
+//
+def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+                            (i8 imm:$src3))))]>, VEX_4V, VEX_L;
+def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
+          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
+                             (i8 imm:$src3)))]>, VEX_4V, VEX_L;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
+                   (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
+                  (i8 imm:$imm))),
+          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// VINSERTI128 - Insert packed integer values
+//
+let neverHasSideEffects = 1 in {
+def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
+          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
+          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, VEX_4V, VEX_L;
+let mayLoad = 1 in
+def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
+          (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
+          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, VEX_4V, VEX_L;
+}
+
+let Predicates = [HasAVX2] in {
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rr VR256:$src1, VR128:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                                   (iPTR imm)),
+          (VINSERTI128rm VR256:$src1, addr:$src2,
+                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VEXTRACTI128 - Extract packed integer values
+//
+def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
+          (ins VR256:$src1, i8imm:$src2),
+          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128:$dst,
+            (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
+          VEX, VEX_L;
+let neverHasSideEffects = 1, mayStore = 1 in
+def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
+          (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
+          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+          VEX, VEX_L;
+
+let Predicates = [HasAVX2] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v2i64 (VEXTRACTI128rr
+                    (v4i64 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v4i32 (VEXTRACTI128rr
+                    (v8i32 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v8i16 (VEXTRACTI128rr
+                    (v16i16 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+          (v16i8 (VEXTRACTI128rr
+                    (v32i8 VR256:$src1),
+                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+                                (iPTR imm))), addr:$dst),
+          (VEXTRACTI128mr addr:$dst, VR256:$src1,
+           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
+//===----------------------------------------------------------------------===//
+// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
+//
+multiclass avx2_pmovmask<string OpcodeStr,
+                         Intrinsic IntLd128, Intrinsic IntLd256,
+                         Intrinsic IntSt128, Intrinsic IntSt256> {
+  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
+  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, i256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+             VEX_4V, VEX_L;
+  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
+             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
+             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+}
+
+defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
+                                int_x86_avx2_maskload_d,
+                                int_x86_avx2_maskload_d_256,
+                                int_x86_avx2_maskstore_d,
+                                int_x86_avx2_maskstore_d_256>;
+defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
+                                int_x86_avx2_maskload_q,
+                                int_x86_avx2_maskload_q_256,
+                                int_x86_avx2_maskstore_q,
+                                int_x86_avx2_maskstore_q_256>, VEX_W;
+
+
+//===----------------------------------------------------------------------===//
+// Variable Bit Shifts
+//
+multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          ValueType vt128, ValueType vt256> {
+  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst,
+               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
+             VEX_4V;
+  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR128:$dst,
+               (vt128 (OpNode VR128:$src1,
+                       (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>,
+             VEX_4V;
+  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
+             (ins VR256:$src1, VR256:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst,
+               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
+             VEX_4V, VEX_L;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
+             (ins VR256:$src1, i256mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set VR256:$dst,
+               (vt256 (OpNode VR256:$src1,
+                       (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>,
+             VEX_4V, VEX_L;
+}
+
+defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
+defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
+defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
+defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
+defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
+
+//===----------------------------------------------------------------------===//
+// VGATHER - GATHER Operations
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
+                       X86MemOperand memop128, X86MemOperand memop256> {
+  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
+            (ins VR128:$src1, memop128:$src2, VR128:$mask),
+            !strconcat(OpcodeStr,
+              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+            []>, VEX_4VOp3;
+  def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
+            (ins RC256:$src1, memop256:$src2, RC256:$mask),
+            !strconcat(OpcodeStr,
+              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
+            []>, VEX_4VOp3, VEX_L;
+}
+
+let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in {
+  defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
+  defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
+  defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
+  defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
+  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
+  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
+  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
+  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSVM.td b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
new file mode 100644
index 000000000000..757dcd0b5dcb
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSVM.td
@@ -0,0 +1,62 @@
+//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the AMD SVM instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SVM instructions
+
+// 0F 01 D9
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
+
+// 0F 01 DC
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
+
+// 0F 01 DD
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
+
+// 0F 01 DE
+let Uses = [EAX] in
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB;
+
+// 0F 01 D8
+let Uses = [EAX] in
+def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
+                "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX] in
+def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
+                "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DA
+let Uses = [EAX] in
+def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
+                "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX] in
+def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
+                "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DB
+let Uses = [EAX] in
+def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
+                "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX] in
+def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
+                "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+
+// 0F 01 DF
+let Uses = [EAX, ECX] in
+def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
+                "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>;
+let Uses = [RAX, ECX] in
+def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
+                "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
new file mode 100644
index 000000000000..5b6298b541bc
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -0,0 +1,978 @@
+//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the shift and rotate instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: Someone needs to smear multipattern goodness all over this file.
+
+let Defs = [EFLAGS] in {
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHL8rCL  : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "shl{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shl{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shl{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>;
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+                  "shl{q}\t{%cl, $dst|$dst, CL}",
+                  [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
+} // Uses = [CL]
+
+def SHL8ri   : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "shl{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+                   
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+def SHL16ri  : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "shl{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>,
+                   OpSize;
+def SHL32ri  : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "shl{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>;
+def SHL64ri  : RIi8<0xC1, MRM4r, (outs GR64:$dst), 
+                    (ins GR64:$src1, i8imm:$src2),
+                    "shl{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
+let hasSideEffects = 0 in {
+def SHL8r1   : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+                 "shl{b}\t$dst", [], IIC_SR>;
+def SHL16r1  : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shl{w}\t$dst", [], IIC_SR>, OpSize;
+def SHL32r1  : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shl{l}\t$dst", [], IIC_SR>;
+def SHL64r1  : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+                 "shl{q}\t$dst", [], IIC_SR>;
+} // hasSideEffects = 0
+} // isConvertibleToThreeAddress = 1
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern
+// using CL?
+let Uses = [CL] in {
+def SHL8mCL  : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+                 "shl{b}\t{%cl, $dst|$dst, CL}",
+                 [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+                 "shl{w}\t{%cl, $dst|$dst, CL}",
+                 [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize;
+def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+                 "shl{l}\t{%cl, $dst|$dst, CL}",
+                 [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+                  "shl{q}\t{%cl, $dst|$dst, CL}",
+                  [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHL8mi   : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
+                   "shl{b}\t{$src, $dst|$dst, $src}",
+                [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+def SHL16mi  : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
+                   "shl{w}\t{$src, $dst|$dst, $src}",
+               [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>,
+                   OpSize;
+def SHL32mi  : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
+                   "shl{l}\t{$src, $dst|$dst, $src}",
+               [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
+                  "shl{q}\t{$src, $dst|$dst, $src}",
+                 [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                 IIC_SR>;
+
+// Shift by 1
+def SHL8m1   : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+                 "shl{b}\t$dst",
+                [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+                IIC_SR>;
+def SHL16m1  : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+                 "shl{w}\t$dst",
+               [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>,
+                   OpSize;
+def SHL32m1  : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+                 "shl{l}\t$dst",
+               [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+                  "shl{q}\t$dst",
+                 [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+                 IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SHR8rCL  : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "shr{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shr{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shr{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>;
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+                  "shr{q}\t{%cl, $dst|$dst, CL}",
+                  [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
+}
+
+def SHR8ri   : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                   "shr{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def SHR16ri  : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "shr{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, OpSize;
+def SHR32ri  : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "shr{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+                  "shr{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>;
+
+// Shift right by 1
+def SHR8r1   : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+                 "shr{b}\t$dst",
+                 [(set GR8:$dst, (srl GR8:$src1, (i8 1)))], IIC_SR>;
+def SHR16r1  : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shr{w}\t$dst",
+                 [(set GR16:$dst, (srl GR16:$src1, (i8 1)))], IIC_SR>, OpSize;
+def SHR32r1  : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shr{l}\t$dst",
+                 [(set GR32:$dst, (srl GR32:$src1, (i8 1)))], IIC_SR>;
+def SHR64r1  : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+                 "shr{q}\t$dst",
+                 [(set GR64:$dst, (srl GR64:$src1, (i8 1)))], IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHR8mCL  : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+                 "shr{b}\t{%cl, $dst|$dst, CL}",
+                 [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+                 "shr{w}\t{%cl, $dst|$dst, CL}",
+                 [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
+                 OpSize;
+def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+                 "shr{l}\t{%cl, $dst|$dst, CL}",
+                 [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+                  "shr{q}\t{%cl, $dst|$dst, CL}",
+                  [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+}
+def SHR8mi   : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
+                   "shr{b}\t{$src, $dst|$dst, $src}",
+                [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+def SHR16mi  : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
+                   "shr{w}\t{$src, $dst|$dst, $src}",
+               [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>,
+                   OpSize;
+def SHR32mi  : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
+                   "shr{l}\t{$src, $dst|$dst, $src}",
+               [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
+                  "shr{q}\t{$src, $dst|$dst, $src}",
+                 [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                 IIC_SR>;
+
+// Shift by 1
+def SHR8m1   : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+                 "shr{b}\t$dst",
+                [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+                IIC_SR>;
+def SHR16m1  : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+                 "shr{w}\t$dst",
+               [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>,OpSize;
+def SHR32m1  : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+                 "shr{l}\t$dst",
+               [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+                  "shr{q}\t$dst",
+                 [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+                 IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def SAR8rCL  : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "sar{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (sra GR8:$src1, CL))],
+                 IIC_SR>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+                 "sar{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (sra GR16:$src1, CL))],
+                 IIC_SR>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+                 "sar{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (sra GR32:$src1, CL))],
+                 IIC_SR>;
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+                 "sar{q}\t{%cl, $dst|$dst, CL}",
+                 [(set GR64:$dst, (sra GR64:$src1, CL))],
+                 IIC_SR>;
+}
+
+def SAR8ri   : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "sar{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))],
+                   IIC_SR>;
+def SAR16ri  : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "sar{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>,
+                   OpSize;
+def SAR32ri  : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "sar{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>;
+def SAR64ri  : RIi8<0xC1, MRM7r, (outs GR64:$dst),
+                    (ins GR64:$src1, i8imm:$src2),
+                    "sar{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// Shift by 1
+def SAR8r1   : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "sar{b}\t$dst",
+                 [(set GR8:$dst, (sra GR8:$src1, (i8 1)))],
+                 IIC_SR>;
+def SAR16r1  : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+                 "sar{w}\t$dst",
+                 [(set GR16:$dst, (sra GR16:$src1, (i8 1)))],
+                 IIC_SR>, OpSize;
+def SAR32r1  : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+                 "sar{l}\t$dst",
+                 [(set GR32:$dst, (sra GR32:$src1, (i8 1)))],
+                 IIC_SR>;
+def SAR64r1  : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+                 "sar{q}\t$dst",
+                 [(set GR64:$dst, (sra GR64:$src1, (i8 1)))],
+                 IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SAR8mCL  : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+                 "sar{b}\t{%cl, $dst|$dst, CL}",
+                 [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+                 "sar{w}\t{%cl, $dst|$dst, CL}",
+                 [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize;
+def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), 
+                 "sar{l}\t{%cl, $dst|$dst, CL}",
+                 [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), 
+                 "sar{q}\t{%cl, $dst|$dst, CL}",
+                 [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+}
+def SAR8mi   : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
+                   "sar{b}\t{$src, $dst|$dst, $src}",
+                [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+def SAR16mi  : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
+                   "sar{w}\t{$src, $dst|$dst, $src}",
+               [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>,
+                   OpSize;
+def SAR32mi  : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
+                   "sar{l}\t{$src, $dst|$dst, $src}",
+               [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>;
+def SAR64mi  : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
+                    "sar{q}\t{$src, $dst|$dst, $src}",
+                 [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                 IIC_SR>;
+
+// Shift by 1
+def SAR8m1   : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+                 "sar{b}\t$dst",
+                [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+                IIC_SR>;
+def SAR16m1  : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+                 "sar{w}\t$dst",
+               [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>,
+                   OpSize;
+def SAR32m1  : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+                 "sar{l}\t$dst",
+               [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
+                  "sar{q}\t$dst",
+                 [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+                 IIC_SR>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Rotate instructions
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0 in {
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+               "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
+                 "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+                "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+  
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+                "rcl{w}\t$dst", [], IIC_SR>, OpSize;
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
+                  "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+let Uses = [CL] in
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+                "rcl{l}\t$dst", [], IIC_SR>;
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
+                  "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+
+def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+                 "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
+                   "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+               "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
+                 "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+                "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+  
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+                "rcr{w}\t$dst", [], IIC_SR>, OpSize;
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
+                  "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+let Uses = [CL] in
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+                "rcr{l}\t$dst", [], IIC_SR>;
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
+                  "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                 
+def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+                 "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
+                   "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+let Uses = [CL] in
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+} // Constraints = "$src = $dst"
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
+               "rcl{b}\t$dst", [], IIC_SR>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+                 "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst),
+                "rcl{w}\t$dst", [], IIC_SR>, OpSize;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+                  "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
+                "rcl{l}\t$dst", [], IIC_SR>;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+                  "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
+                 "rcl{q}\t$dst", [], IIC_SR>;
+def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+                   "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
+               "rcr{b}\t$dst", [], IIC_SR>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt),
+                 "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst),
+                "rcr{w}\t$dst", [], IIC_SR>, OpSize;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt),
+                  "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
+def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
+                "rcr{l}\t$dst", [], IIC_SR>;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt),
+                  "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
+                 "rcr{q}\t$dst", [], IIC_SR>;
+def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
+                   "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+
+let Uses = [CL] in {
+def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
+                "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
+                 "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
+                 "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
+                  "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+
+def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
+                "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
+                 "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
+                 "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
+                  "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+}
+} // SchedRW
+} // hasSideEffects = 0
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+// FIXME: provide shorter instructions when imm8 == 1
+let Uses = [CL] in {
+def ROL8rCL  : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "rol{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rol{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rol{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>;
+def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rol{q}\t{%cl, $dst|$dst, CL}",
+                  [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROL8ri   : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "rol{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROL16ri  : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "rol{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, 
+                   OpSize;
+def ROL32ri  : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "rol{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>;
+def ROL64ri  : RIi8<0xC1, MRM0r, (outs GR64:$dst), 
+                    (ins GR64:$src1, i8imm:$src2),
+                    "rol{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// Rotate by 1
+def ROL8r1   : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "rol{b}\t$dst",
+                 [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))],
+                 IIC_SR>;
+def ROL16r1  : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rol{w}\t$dst",
+                 [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))],
+                 IIC_SR>, OpSize;
+def ROL32r1  : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rol{l}\t$dst",
+                 [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))],
+                 IIC_SR>;
+def ROL64r1  : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
+                  "rol{q}\t$dst",
+                  [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))],
+                  IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROL8mCL  : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+                 "rol{b}\t{%cl, $dst|$dst, CL}",
+                 [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+                 "rol{w}\t{%cl, $dst|$dst, CL}",
+                 [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize;
+def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+                 "rol{l}\t{%cl, $dst|$dst, CL}",
+                 [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def ROL64mCL :  RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
+                   "rol{q}\t{%cl, $dst|$dst, %cl}",
+                   [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
+                   IIC_SR>;
+}
+def ROL8mi   : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1),
+                   "rol{b}\t{$src1, $dst|$dst, $src1}",
+               [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+               IIC_SR>;
+def ROL16mi  : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1),
+                   "rol{w}\t{$src1, $dst|$dst, $src1}",
+              [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+              IIC_SR>,
+                   OpSize;
+def ROL32mi  : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1),
+                   "rol{l}\t{$src1, $dst|$dst, $src1}",
+              [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+              IIC_SR>;
+def ROL64mi  : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1),
+                    "rol{q}\t{$src1, $dst|$dst, $src1}",
+                [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
+                IIC_SR>;
+
+// Rotate by 1
+def ROL8m1   : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+                 "rol{b}\t$dst",
+               [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+def ROL16m1  : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+                 "rol{w}\t$dst",
+              [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>,
+                   OpSize;
+def ROL32m1  : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+                 "rol{l}\t$dst",
+              [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>;
+def ROL64m1  : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
+                 "rol{q}\t$dst",
+               [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+} // SchedRW
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+let Uses = [CL] in {
+def ROR8rCL  : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "ror{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                 "ror{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                 "ror{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>;
+def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+                  "ror{q}\t{%cl, $dst|$dst, CL}",
+                  [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
+}
+
+def ROR8ri   : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "ror{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>;
+def ROR16ri  : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "ror{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))],
+                   IIC_SR>, 
+                   OpSize;
+def ROR32ri  : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "ror{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))],
+                   IIC_SR>;
+def ROR64ri  : RIi8<0xC1, MRM1r, (outs GR64:$dst), 
+                    (ins GR64:$src1, i8imm:$src2),
+                    "ror{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))],
+                    IIC_SR>;
+
+// Rotate by 1
+def ROR8r1   : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "ror{b}\t$dst",
+                 [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))],
+                 IIC_SR>;
+def ROR16r1  : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                 "ror{w}\t$dst",
+                 [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))],
+                 IIC_SR>, OpSize;
+def ROR32r1  : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                 "ror{l}\t$dst",
+                 [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))],
+                 IIC_SR>;
+def ROR64r1  : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
+                  "ror{q}\t$dst",
+                  [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))],
+                  IIC_SR>;
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def ROR8mCL  : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+                 "ror{b}\t{%cl, $dst|$dst, CL}",
+                 [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+                 "ror{w}\t{%cl, $dst|$dst, CL}",
+                 [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>, OpSize;
+def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), 
+                 "ror{l}\t{%cl, $dst|$dst, CL}",
+                 [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
+                 IIC_SR>;
+def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), 
+                  "ror{q}\t{%cl, $dst|$dst, CL}",
+                  [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
+                  IIC_SR>;
+}
+def ROR8mi   : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+                   "ror{b}\t{$src, $dst|$dst, $src}",
+               [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)],
+               IIC_SR>;
+def ROR16mi  : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
+                   "ror{w}\t{$src, $dst|$dst, $src}",
+              [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)],
+              IIC_SR>,
+                   OpSize;
+def ROR32mi  : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
+                   "ror{l}\t{$src, $dst|$dst, $src}",
+              [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)],
+              IIC_SR>;
+def ROR64mi  : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
+                    "ror{q}\t{$src, $dst|$dst, $src}",
+                [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
+                IIC_SR>;
+
+// Rotate by 1
+def ROR8m1   : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+                 "ror{b}\t$dst",
+               [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+def ROR16m1  : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+                 "ror{w}\t$dst",
+              [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>,
+                   OpSize;
+def ROR32m1  : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+                 "ror{l}\t$dst",
+              [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+              IIC_SR>;
+def ROR64m1  : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
+                 "ror{q}\t$dst",
+               [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+               IIC_SR>;
+} // SchedRW
+
+
+//===----------------------------------------------------------------------===//
+// Double shift instructions (generalizations of rotate)
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+
+let Uses = [CL] in {
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), 
+                   (ins GR16:$src1, GR16:$src2),
+                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
+                    IIC_SHD16_REG_CL>,
+                   TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), 
+                   (ins GR16:$src1, GR16:$src2),
+                   "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
+                    IIC_SHD16_REG_CL>,
+                   TB, OpSize;
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), 
+                   (ins GR32:$src1, GR32:$src2),
+                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
+                    IIC_SHD32_REG_CL>, TB;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
+                   (ins GR32:$src1, GR32:$src2),
+                   "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
+                   IIC_SHD32_REG_CL>, TB;
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), 
+                    (ins GR64:$src1, GR64:$src2),
+                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
+                    IIC_SHD64_REG_CL>, 
+                    TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), 
+                    (ins GR64:$src1, GR64:$src2),
+                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
+                    IIC_SHD64_REG_CL>, 
+                    TB;
+}
+
+let isCommutable = 1 in {  // These instructions commute to each other.
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+                     (outs GR16:$dst), 
+                     (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+                     TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+                     (outs GR16:$dst), 
+                     (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD16_REG_IM>,
+                     TB, OpSize;
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+                     (outs GR32:$dst), 
+                     (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+                 TB;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+                     (outs GR32:$dst), 
+                     (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))], IIC_SHD32_REG_IM>,
+                 TB;
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+                      (outs GR64:$dst), 
+                      (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+                      "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+                                       (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+                 TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+                      (outs GR64:$dst), 
+                      (ins GR64:$src1, GR64:$src2, i8imm:$src3),
+                      "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+                                       (i8 imm:$src3)))], IIC_SHD64_REG_IM>,
+                 TB;
+}
+} // Constraints = "$src = $dst", SchedRW
+
+let SchedRW = [WriteShiftLd, WriteRMW] in {
+let Uses = [CL] in {
+def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+                     addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
+def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                  "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                  [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+                    addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
+
+def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+                     addr:$dst)], IIC_SHD32_MEM_CL>, TB;
+def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                  "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                  [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+                    addr:$dst)], IIC_SHD32_MEM_CL>, TB;
+                    
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
+                      addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
+                      addr:$dst)], IIC_SHD64_MEM_CL>, TB;
+}
+
+def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+                    (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+                    "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+                                      (i8 imm:$src3)), addr:$dst)],
+                                      IIC_SHD16_MEM_IM>,
+                    TB, OpSize;
+def SHRD16mri8 : Ii8<0xAC, MRMDestMem, 
+                     (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+                     "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+                                      (i8 imm:$src3)), addr:$dst)],
+                                      IIC_SHD16_MEM_IM>,
+                     TB, OpSize;
+
+def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+                    (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+                    "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                    [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+                                      (i8 imm:$src3)), addr:$dst)],
+                                      IIC_SHD32_MEM_IM>,
+                    TB;
+def SHRD32mri8 : Ii8<0xAC, MRMDestMem, 
+                     (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+                     "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+                                       (i8 imm:$src3)), addr:$dst)],
+                                       IIC_SHD32_MEM_IM>,
+                     TB;
+
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+                      (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+                      "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
+                                       (i8 imm:$src3)), addr:$dst)],
+                                       IIC_SHD64_MEM_IM>,
+                 TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem, 
+                      (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
+                      "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
+                                       (i8 imm:$src3)), addr:$dst)],
+                                       IIC_SHD64_MEM_IM>,
+                 TB;
+} // SchedRW
+
+} // Defs = [EFLAGS]
+
+def ROT32L2R_imm8  : SDNodeXForm<imm, [{
+  // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
+  return getI8Imm(32 - N->getZExtValue());
+}]>;
+
+def ROT64L2R_imm8  : SDNodeXForm<imm, [{
+  // Convert a ROTL shamt to a ROTR shamt on 64-bit integer.
+  return getI8Imm(64 - N->getZExtValue());
+}]>;
+
+multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in {
+  def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2),
+               !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               []>, TAXD, VEX, Sched<[WriteShift]>;
+  let mayLoad = 1 in
+  def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst),
+               (ins x86memop:$src1, i8imm:$src2),
+               !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               []>, TAXD, VEX, Sched<[WriteShiftLd]>;
+}
+}
+
+multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in {
+  def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+             VEX_4VOp3, Sched<[WriteShift]>;
+  let mayLoad = 1 in
+  def rm : I<0xF7, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
+             !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
+             VEX_4VOp3,
+             Sched<[WriteShiftLd,
+                    // x86memop:$src1
+                    ReadDefault, ReadDefault, ReadDefault, ReadDefault,
+                    ReadDefault,
+                    // RC:$src1
+                    ReadAfterLd]>;
+}
+}
+
+let Predicates = [HasBMI2] in {
+  defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>;
+  defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W;
+  defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS;
+  defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W;
+  defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD;
+  defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
+  defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, OpSize;
+  defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, OpSize, VEX_W;
+
+  // Prefer RORX which is non-destructive and doesn't update EFLAGS.
+  let AddedComplexity = 10 in {
+    def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+              (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+    def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+              (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+  }
+
+  def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+            (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+  def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+            (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+  // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
+  // immedidate shift, i.e. the following code is considered better
+  //
+  //  mov %edi, %esi
+  //  shl $imm, %esi
+  //  ... %edi, ...
+  //
+  // than
+  //
+  //  movb $imm, %sil
+  //  shlx %sil, %edi, %esi
+  //  ... %edi, ...
+  //
+  let AddedComplexity = 1 in {
+    def : Pat<(sra GR32:$src1, GR8:$src2),
+              (SARX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(sra GR64:$src1, GR8:$src2),
+              (SARX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(srl GR32:$src1, GR8:$src2),
+              (SHRX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(srl GR64:$src1, GR8:$src2),
+              (SHRX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+    def : Pat<(shl GR32:$src1, GR8:$src2),
+              (SHLX32rr GR32:$src1,
+                        (INSERT_SUBREG
+                          (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+    def : Pat<(shl GR64:$src1, GR8:$src2),
+              (SHLX64rr GR64:$src1,
+                        (INSERT_SUBREG
+                          (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+  }
+
+  // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+  //
+  //  mov (%ecx), %esi
+  //  shl $imm, $esi
+  //
+  // over
+  //
+  //  movb $imm %al
+  //  shlx %al, (%ecx), %esi
+  //
+  // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole
+  // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
new file mode 100644
index 000000000000..053417ccde63
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -0,0 +1,522 @@
+//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instructions that are generally used in
+// privileged modes.  These are not typically used by the compiler, but are
+// supported for the assembler and disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+let SchedRW = [WriteSystem] in {
+let Defs = [RAX, RDX] in
+  def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)], IIC_RDTSC>,
+              TB;
+
+let Defs = [RAX, RCX, RDX] in
+  def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
+
+// CPU flow control instructions
+
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+  def TRAP    : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+  def UD2B    : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+}
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", [], IIC_HLT>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
+
+// Interrupt and SysCall Instructions.
+let Uses = [EFLAGS] in
+  def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
+              [(int_x86_int (i8 3))], IIC_INT3>;
+} // SchedRW
+
+def : Pat<(debugtrap),
+          (INT3)>;
+
+// The long form of "int $3" turns into int3 as a size optimization.
+// FIXME: This doesn't work because InstAlias can't match immediate constants.
+//def : InstAlias<"int\t$3", (INT3)>;
+
+let SchedRW = [WriteSystem] in {
+
+def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap",
+              [(int_x86_int imm:$trap)], IIC_INT>;
+
+
+def SYSCALL  : I<0x05, RawFrm, (outs), (ins), "syscall", [], IIC_SYSCALL>, TB;
+def SYSRET   : I<0x07, RawFrm, (outs), (ins), "sysret{l}", [], IIC_SYSCALL>, TB;
+def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysret{q}", [], IIC_SYSCALL>, TB,
+               Requires<[In64BitMode]>;
+
+def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", [],
+                 IIC_SYS_ENTER_EXIT>, TB;
+
+def SYSEXIT   : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [],
+                 IIC_SYS_ENTER_EXIT>, TB;
+def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", []>, TB,
+                Requires<[In64BitMode]>;
+
+def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize;
+def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>;
+def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
+             Requires<[In64BitMode]>;
+} // SchedRW
+
+
+//===----------------------------------------------------------------------===//
+//  Input/Output Instructions.
+//
+let SchedRW = [WriteSystem] in {
+let Defs = [AL], Uses = [DX] in
+def IN8rr  : I<0xEC, RawFrm, (outs), (ins),
+               "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+               "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>,  OpSize;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+               "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>;
+
+let Defs = [AL] in
+def IN8ri  : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
+                  "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
+                  "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
+                  "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>;
+
+let Uses = [DX, AL] in
+def OUT8rr  : I<0xEE, RawFrm, (outs), (ins),
+                "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+                "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+                "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>;
+
+let Uses = [AL] in
+def OUT8ir  : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
+                   "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
+                   "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
+                   "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>;
+
+def IN8  : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>;
+def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>,  OpSize;
+def IN32 : I<0x6D, RawFrm, (outs), (ins), "ins{l}", [], IIC_INS>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from debug registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
+def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_DR>, TB;
+                
+def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
+def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_DR_REG>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Moves to and from control registers
+
+let SchedRW = [WriteSystem] in {
+def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
+def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_CR>, TB;
+                
+def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
+def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_CR_REG>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segment override instruction prefixes
+
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
+
+
+//===----------------------------------------------------------------------===//
+// Moves to and from segment registers.
+//
+
+let SchedRW = [WriteMove] in {
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize;
+def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
+
+def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize;
+def MOV32ms : I<0x8C, MRMDestMem, (outs i32mem:$dst), (ins SEGMENT_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize;
+def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
+
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize;
+def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Segmentation support instructions.
+
+let SchedRW = [WriteSystem] in {
+def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 
+                "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize;
+
+// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), 
+                "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+// i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo.
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), 
+                 "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+                 "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+
+def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB, OpSize; 
+def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                "lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; 
+def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; 
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                 "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB;
+
+def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr",
+               [], IIC_INVLPG>, TB;
+
+def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins),
+               "str{w}\t$dst", [], IIC_STR>, TB, OpSize;
+def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
+               "str{l}\t$dst", [], IIC_STR>, TB;
+def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
+                "str{q}\t$dst", [], IIC_STR>, TB;
+def STRm   : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
+               "str{w}\t$dst", [], IIC_STR>, TB;
+
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
+             "ltr{w}\t$src", [], IIC_LTR>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
+             "ltr{w}\t$src", [], IIC_LTR>, TB;
+             
+def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
+                 "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+               OpSize;
+def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
+                 "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
+def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
+                 "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+               OpSize;
+def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
+                 "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
+                 "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+               OpSize;
+def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
+                 "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
+                 "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+               OpSize;
+def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
+                 "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+                 
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB;
+
+// No "pop cs" instruction.
+def POPSS16 : I<0x17, RawFrm, (outs), (ins),
+                "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+              OpSize, Requires<[In32BitMode]>;
+def POPSS32 : I<0x17, RawFrm, (outs), (ins),
+                "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+                      Requires<[In32BitMode]>;
+                
+def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
+                "pop{w}\t{%ds|DS}", [], IIC_POP_SR>,
+              OpSize, Requires<[In32BitMode]>;
+def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
+                "pop{l}\t{%ds|DS}", [], IIC_POP_SR>,
+                      Requires<[In32BitMode]>;
+                
+def POPES16 : I<0x07, RawFrm, (outs), (ins),
+                "pop{w}\t{%es|ES}", [], IIC_POP_SR>,
+              OpSize, Requires<[In32BitMode]>;
+def POPES32 : I<0x07, RawFrm, (outs), (ins),
+                "pop{l}\t{%es|ES}", [], IIC_POP_SR>,
+                      Requires<[In32BitMode]>;
+                
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB;
+                
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB;
+                 
+
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
+                
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+                
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize;
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>;
+                
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+                
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+                
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB;
+
+
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
+              "verr\t$seg", [], IIC_VERR>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+              "verr\t$seg", [], IIC_VERR>, TB;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
+              "verw\t$seg", [], IIC_VERW_MEM>, TB;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
+              "verw\t$seg", [], IIC_VERW_REG>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Descriptor-table support instructions
+
+let SchedRW = [WriteSystem] in {
+def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+              "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>;
+def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+              "sgdt\t$dst", [], IIC_SGDT>, TB;
+def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+              "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>;
+def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+              "sidt\t$dst", []>, TB;
+def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
+                "sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize;
+def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
+                "sldt{w}\t$dst", [], IIC_SLDT>, TB;
+def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
+                "sldt{l}\t$dst", [], IIC_SLDT>, TB;
+                
+// LLDT is not interpreted specially in 64-bit mode because there is no sign
+//   extension.
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
+                 "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
+                 "sldt{q}\t$dst", [], IIC_SLDT>, TB;
+
+def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+              "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>;
+def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+              "lgdt\t$src", [], IIC_LGDT>, TB;
+def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+              "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>;
+def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+              "lidt\t$src", [], IIC_LIDT>, TB;
+def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
+                "lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
+                "lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Specialized register support
+let SchedRW = [WriteSystem] in {
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", [], IIC_WRMSR>, TB;
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", [], IIC_RDMSR>, TB;
+def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [], IIC_RDPMC>, TB;
+
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), 
+                "smsw{w}\t$dst", [], IIC_SMSW>, OpSize, TB;
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), 
+                "smsw{l}\t$dst", [], IIC_SMSW>, TB;
+// no m form encodable; use SMSW16m
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), 
+                 "smsw{q}\t$dst", [], IIC_SMSW>, TB;
+
+// For memory operands, there is only a 16-bit form
+def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins),
+                "smsw{w}\t$dst", [], IIC_SMSW>, TB;
+
+def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
+                "lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
+def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
+                "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
+                
+def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Cache instructions
+let SchedRW = [WriteSystem] in {
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", [], IIC_INVD>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// XSAVE instructions
+let SchedRW = [WriteSystem] in {
+let Defs = [RDX, RAX], Uses = [RCX] in
+  def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
+
+let Uses = [RDX, RAX, RCX] in
+  def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB;
+
+let Uses = [RDX, RAX] in {
+  def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+               "xsave\t$dst", []>, TB;
+  def XSAVE64 : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+                 "xsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+  def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+               "xrstor\t$dst", []>, TB;
+  def XRSTOR64 : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+                 "xrstorq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+  def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+                  "xsaveopt\t$dst", []>, TB;
+  def XSAVEOPT64 : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+                    "xsaveoptq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+}
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// VIA PadLock crypto instructions
+let Defs = [RAX, RDI], Uses = [RDX, RDI] in
+  def XSTORE : I<0xc0, RawFrm, (outs), (ins), "xstore", []>, A7;
+
+def : InstAlias<"xstorerng", (XSTORE)>;
+
+let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
+  def XCRYPTECB : I<0xc8, RawFrm, (outs), (ins), "xcryptecb", []>, A7;
+  def XCRYPTCBC : I<0xd0, RawFrm, (outs), (ins), "xcryptcbc", []>, A7;
+  def XCRYPTCTR : I<0xd8, RawFrm, (outs), (ins), "xcryptctr", []>, A7;
+  def XCRYPTCFB : I<0xe0, RawFrm, (outs), (ins), "xcryptcfb", []>, A7;
+  def XCRYPTOFB : I<0xe8, RawFrm, (outs), (ins), "xcryptofb", []>, A7;
+}
+
+let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
+  def XSHA1 : I<0xc8, RawFrm, (outs), (ins), "xsha1", []>, A6;
+  def XSHA256 : I<0xd0, RawFrm, (outs), (ins), "xsha256", []>, A6;
+}
+let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
+  def MONTMUL : I<0xc0, RawFrm, (outs), (ins), "montmul", []>, A6;
+
+//===----------------------------------------------------------------------===//
+// FS/GS Base Instructions
+let Predicates = [HasFSGSBase, In64BitMode] in {
+  def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
+                   "rdfsbase{l}\t$dst",
+                   [(set GR32:$dst, (int_x86_rdfsbase_32))]>, TB, XS;
+  def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
+                     "rdfsbase{q}\t$dst",
+                     [(set GR64:$dst, (int_x86_rdfsbase_64))]>, TB, XS;
+  def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+                   "rdgsbase{l}\t$dst",
+                   [(set GR32:$dst, (int_x86_rdgsbase_32))]>, TB, XS;
+  def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+                     "rdgsbase{q}\t$dst",
+                     [(set GR64:$dst, (int_x86_rdgsbase_64))]>, TB, XS;
+  def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
+                   "wrfsbase{l}\t$src",
+                   [(int_x86_wrfsbase_32 GR32:$src)]>, TB, XS;
+  def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
+                      "wrfsbase{q}\t$src",
+                      [(int_x86_wrfsbase_64 GR64:$src)]>, TB, XS;
+  def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
+                   "wrgsbase{l}\t$src",
+                   [(int_x86_wrgsbase_32 GR32:$src)]>, TB, XS;
+  def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
+                      "wrgsbase{q}\t$src",
+                      [(int_x86_wrgsbase_64 GR64:$src)]>, TB, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// INVPCID Instruction
+def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+                "invpcid {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+                Requires<[In32BitMode]>;
+def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+                "invpcid {$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+                Requires<[In64BitMode]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 000000000000..363a190aa854
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,39 @@
+//===-- X86InstrVMX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TSX instructions
+
+def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
+                     [SDNPHasChain, SDNPSideEffect]>;
+
+let usesCustomInserter = 1 in
+def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
+               "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
+             Requires<[HasRTM]>;
+
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst),
+                         "xbegin\t$dst", []>, Requires<[HasRTM]>;
+
+def XEND : I<0x01, MRM_D5, (outs), (ins),
+             "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+
+let Defs = [EFLAGS] in
+def XTEST : I<0x01, MRM_D6, (outs), (ins),
+              "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+
+def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
+                 "xabort\t$imm",
+                 [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
new file mode 100644
index 000000000000..6d3548f09398
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
@@ -0,0 +1,66 @@
+//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel VMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VMX instructions
+
+// 66 0F 38 80
+def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+               "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+               Requires<[In32BitMode]>;
+def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+               "invept\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+               Requires<[In64BitMode]>;
+// 66 0F 38 81
+def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+                "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+                Requires<[In32BitMode]>;
+def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+                "invvpid\t{$src2, $src1|$src1, $src2}", []>, OpSize, T8,
+                Requires<[In64BitMode]>;
+// 0F 01 C1
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
+def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+  "vmclear\t$vmcs", []>, OpSize, TB;
+// OF 01 D4
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
+// 0F 01 C2
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
+// 0F 01 C3
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
+def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+  "vmptrld\t$vmcs", []>, TB;
+def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins),
+  "vmptrst\t$vmcs", []>, TB;
+def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src),
+  "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+  "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src),
+  "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+  "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In32BitMode]>;
+// 0F 01 C4
+def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
+def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
+  "vmxon\t$vmxon", []>, XS;
+
diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
new file mode 100644
index 000000000000..2aa08fad7836
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
@@ -0,0 +1,311 @@
+//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes XOP (eXtended OPerations)
+//
+//===----------------------------------------------------------------------===//
+
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
+  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
+  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
+  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
+  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
+  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
+  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
+  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
+  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
+  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
+  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
+  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
+  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
+  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
+  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
+}
+
+// Scalar load 2 addr operand instructions
+multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
+                     Operand memop, ComplexPattern mem_cpat> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VFRCZSS   : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+                   ssmem, sse_load_f32>;
+  defm VFRCZSD   : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+                   sdmem, sse_load_f64>;
+}
+
+multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
+                     PatFrag memop> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int VR128:$src))]>, VEX;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
+  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
+}
+
+multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
+                     PatFrag memop> {
+  def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L;
+  def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+           [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
+                           memopv8f32>;
+  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
+                           memopv4f64>;
+}
+
+multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, VEX_4VOp3;
+  def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>,
+           VEX_4V, VEX_W;
+  def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins i128mem:$src1, VR128:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+              (Int (bitconvert (memopv2i64 addr:$src1)), VR128:$src2))]>,
+             VEX_4VOp3;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
+  defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
+  defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
+  defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
+  defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
+  defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
+  defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
+  defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
+  defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
+  defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
+  defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
+  defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
+}
+
+multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, VEX;
+  def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins i128mem:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+           [(set VR128:$dst,
+             (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
+  defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
+  defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
+  defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
+}
+
+// Instruction where second source can be memory, but third must be register
+multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+              (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_4V, VEX_I8IMM;
+  def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+              (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+              VR128:$src3))]>, VEX_4V, VEX_I8IMM;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VPMADCSWD  : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+  defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+  defm VPMACSWW   : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+  defm VPMACSWD   : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+  defm VPMACSSWW  : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+  defm VPMACSSWD  : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+  defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+  defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+  defm VPMACSSDD  : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+  defm VPMACSDQL  : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+  defm VPMACSDQH  : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+  defm VPMACSDD   : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
+}
+
+// Instruction where second source can be memory, third must be imm8
+multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+           VEX_4V;
+  def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+              imm:$src3))]>, VEX_4V;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VPCOMB  : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
+  defm VPCOMW  : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
+  defm VPCOMD  : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
+  defm VPCOMQ  : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
+  defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
+  defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
+  defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
+  defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
+}
+
+// Instruction where either second or third source can be memory
+multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def rr : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, VR128:$src3))]>,
+           VEX_4V, VEX_I8IMM;
+  def rm : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (Int VR128:$src1, VR128:$src2,
+              (bitconvert (memopv2i64 addr:$src3))))]>,
+           VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
+  def mr : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
+           (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR128:$dst,
+             (Int VR128:$src1, (bitconvert (memopv2i64 addr:$src2)),
+              VR128:$src3))]>,
+           VEX_4V, VEX_I8IMM;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
+  defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
+}
+
+multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
+  def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
+           VEX_4V, VEX_I8IMM, VEX_L;
+  def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+           (ins VR256:$src1, VR256:$src2, i256mem:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst,
+             (Int VR256:$src1, VR256:$src2,
+              (bitconvert (memopv4i64 addr:$src3))))]>,
+           VEX_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
+  def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
+           (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+           !strconcat(OpcodeStr,
+           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+           [(set VR256:$dst,
+             (Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)),
+              VR256:$src3))]>,
+           VEX_4V, VEX_I8IMM, VEX_L;
+}
+
+let isAsmParserOnly = 1 in {
+  defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
+}
+
+multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
+                  Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
+  def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
+        (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR128:$dst,
+           (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>;
+  def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+        (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR128:$dst,
+           (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>,
+        VEX_W, MemOp4;
+  def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
+        (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR128:$dst,
+           (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>;
+  def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
+        (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR256:$dst,
+          (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
+  def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+        (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR256:$dst,
+          (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
+        VEX_W, MemOp4, VEX_L;
+  def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
+        (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4),
+        !strconcat(OpcodeStr,
+        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
+        [(set VR256:$dst,
+           (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
+        VEX_L;
+}
+
+defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
+                         int_x86_xop_vpermil2pd_256, memopv2f64, memopv4f64>;
+defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps,
+                         int_x86_xop_vpermil2ps_256, memopv4f32, memopv8f32>;
+
diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 000000000000..44d8cce05413
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,581 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Valgrind.h"
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+// Determine the platform we're running on
+#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
+# define X86_64_JIT
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+# define X86_32_JIT
+#endif
+
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  unsigned char *OldByte = (unsigned char *)Old;
+  *OldByte++ = 0xE9;                // Emit JMP opcode.
+  unsigned *OldWord = (unsigned *)OldByte;
+  unsigned NewAddr = (intptr_t)New;
+  unsigned OldAddr = (intptr_t)OldWord;
+  *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
+
+  // X86 doesn't need to invalidate the processor cache, so just invalidate
+  // Valgrind's cache directly.
+  sys::ValgrindDiscardTranslations(Old, 5);
+}
+
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// For ELF targets, use a .size and .type directive, to let tools
+// know the extent of functions defined in assembler.
+#if defined(__ELF__)
+# define SIZE(sym) ".size " #sym ", . - " #sym "\n"
+# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n"
+#else
+# define SIZE(sym)
+# define TYPE_FUNCTION(sym)
+#endif
+
+// Provide a convenient way for disabling usage of CFI directives.
+// This is needed for old/broken assemblers (for example, gas on
+// Darwin is pretty old and doesn't support these directives)
+#if defined(__APPLE__)
+# define CFI(x)
+#else
+// FIXME: Disable this until we really want to use it. Also, we will
+//        need to add some workarounds for compilers, which support
+//        only subset of these directives.
+# define CFI(x)
+#endif
+
+// Provide a wrapper for LLVMX86CompilationCallback2 that saves non-traditional
+// callee saved registers, for the fastcc calling convention.
+extern "C" {
+#if defined(X86_64_JIT)
+# ifndef _MSC_VER
+  // No need to save EAX/EDX for X86-64.
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback\n"
+    TYPE_FUNCTION(X86CompilationCallback)
+  ASMPREFIX "X86CompilationCallback:\n"
+    CFI(".cfi_startproc\n")
+    // Save RBP
+    "pushq   %rbp\n"
+    CFI(".cfi_def_cfa_offset 16\n")
+    CFI(".cfi_offset %rbp, -16\n")
+    // Save RSP
+    "movq    %rsp, %rbp\n"
+    CFI(".cfi_def_cfa_register %rbp\n")
+    // Save all int arg registers
+    "pushq   %rdi\n"
+    CFI(".cfi_rel_offset %rdi, 0\n")
+    "pushq   %rsi\n"
+    CFI(".cfi_rel_offset %rsi, 8\n")
+    "pushq   %rdx\n"
+    CFI(".cfi_rel_offset %rdx, 16\n")
+    "pushq   %rcx\n"
+    CFI(".cfi_rel_offset %rcx, 24\n")
+    "pushq   %r8\n"
+    CFI(".cfi_rel_offset %r8, 32\n")
+    "pushq   %r9\n"
+    CFI(".cfi_rel_offset %r9, 40\n")
+    // Align stack on 16-byte boundary. ESP might not be properly aligned
+    // (8 byte) if this is called from an indirect stub.
+    "andq    $-16, %rsp\n"
+    // Save all XMM arg registers
+    "subq    $128, %rsp\n"
+    "movaps  %xmm0, (%rsp)\n"
+    "movaps  %xmm1, 16(%rsp)\n"
+    "movaps  %xmm2, 32(%rsp)\n"
+    "movaps  %xmm3, 48(%rsp)\n"
+    "movaps  %xmm4, 64(%rsp)\n"
+    "movaps  %xmm5, 80(%rsp)\n"
+    "movaps  %xmm6, 96(%rsp)\n"
+    "movaps  %xmm7, 112(%rsp)\n"
+    // JIT callee
+#ifdef _WIN64
+    "subq    $32, %rsp\n"
+    "movq    %rbp, %rcx\n"    // Pass prev frame and return address
+    "movq    8(%rbp), %rdx\n"
+    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
+    "addq    $32, %rsp\n"
+#else
+    "movq    %rbp, %rdi\n"    // Pass prev frame and return address
+    "movq    8(%rbp), %rsi\n"
+    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
+#endif
+    // Restore all XMM arg registers
+    "movaps  112(%rsp), %xmm7\n"
+    "movaps  96(%rsp), %xmm6\n"
+    "movaps  80(%rsp), %xmm5\n"
+    "movaps  64(%rsp), %xmm4\n"
+    "movaps  48(%rsp), %xmm3\n"
+    "movaps  32(%rsp), %xmm2\n"
+    "movaps  16(%rsp), %xmm1\n"
+    "movaps  (%rsp), %xmm0\n"
+    // Restore RSP
+    "movq    %rbp, %rsp\n"
+    CFI(".cfi_def_cfa_register %rsp\n")
+    // Restore all int arg registers
+    "subq    $48, %rsp\n"
+    CFI(".cfi_adjust_cfa_offset 48\n")
+    "popq    %r9\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %r9\n")
+    "popq    %r8\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %r8\n")
+    "popq    %rcx\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rcx\n")
+    "popq    %rdx\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rdx\n")
+    "popq    %rsi\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rsi\n")
+    "popq    %rdi\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rdi\n")
+    // Restore RBP
+    "popq    %rbp\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rbp\n")
+    "ret\n"
+    CFI(".cfi_endproc\n")
+    SIZE(X86CompilationCallback)
+  );
+# else
+  // No inline assembler support on this platform. The routine is in external
+  // file.
+  void X86CompilationCallback();
+
+# endif
+#elif defined (X86_32_JIT)
+# ifndef _MSC_VER
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback\n"
+    TYPE_FUNCTION(X86CompilationCallback)
+  ASMPREFIX "X86CompilationCallback:\n"
+    CFI(".cfi_startproc\n")
+    "pushl   %ebp\n"
+    CFI(".cfi_def_cfa_offset 8\n")
+    CFI(".cfi_offset %ebp, -8\n")
+    "movl    %esp, %ebp\n"    // Standard prologue
+    CFI(".cfi_def_cfa_register %ebp\n")
+    "pushl   %eax\n"
+    CFI(".cfi_rel_offset %eax, 0\n")
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    CFI(".cfi_rel_offset %edx, 4\n")
+    "pushl   %ecx\n"
+    CFI(".cfi_rel_offset %ecx, 8\n")
+#  if defined(__APPLE__)
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+#  endif
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
+    "movl    %ebp, %esp\n"    // Restore ESP
+    CFI(".cfi_def_cfa_register %esp\n")
+    "subl    $12, %esp\n"
+    CFI(".cfi_adjust_cfa_offset 12\n")
+    "popl    %ecx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ecx\n")
+    "popl    %edx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %edx\n")
+    "popl    %eax\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %eax\n")
+    "popl    %ebp\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ebp\n")
+    "ret\n"
+    CFI(".cfi_endproc\n")
+    SIZE(X86CompilationCallback)
+  );
+
+  // Same as X86CompilationCallback but also saves XMM argument registers.
+  void X86CompilationCallback_SSE(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
+    TYPE_FUNCTION(X86CompilationCallback_SSE)
+  ASMPREFIX "X86CompilationCallback_SSE:\n"
+    CFI(".cfi_startproc\n")
+    "pushl   %ebp\n"
+    CFI(".cfi_def_cfa_offset 8\n")
+    CFI(".cfi_offset %ebp, -8\n")
+    "movl    %esp, %ebp\n"    // Standard prologue
+    CFI(".cfi_def_cfa_register %ebp\n")
+    "pushl   %eax\n"
+    CFI(".cfi_rel_offset %eax, 0\n")
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    CFI(".cfi_rel_offset %edx, 4\n")
+    "pushl   %ecx\n"
+    CFI(".cfi_rel_offset %ecx, 8\n")
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+    // Save all XMM arg registers
+    "subl    $64, %esp\n"
+    // FIXME: provide frame move information for xmm registers.
+    // This can be tricky, because CFA register is ebp (unaligned)
+    // and we need to produce offsets relative to it.
+    "movaps  %xmm0, (%esp)\n"
+    "movaps  %xmm1, 16(%esp)\n"
+    "movaps  %xmm2, 32(%esp)\n"
+    "movaps  %xmm3, 48(%esp)\n"
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "LLVMX86CompilationCallback2\n"
+    "addl    $16, %esp\n"
+    "movaps  48(%esp), %xmm3\n"
+    CFI(".cfi_restore %xmm3\n")
+    "movaps  32(%esp), %xmm2\n"
+    CFI(".cfi_restore %xmm2\n")
+    "movaps  16(%esp), %xmm1\n"
+    CFI(".cfi_restore %xmm1\n")
+    "movaps  (%esp), %xmm0\n"
+    CFI(".cfi_restore %xmm0\n")
+    "movl    %ebp, %esp\n"    // Restore ESP
+    CFI(".cfi_def_cfa_register esp\n")
+    "subl    $12, %esp\n"
+    CFI(".cfi_adjust_cfa_offset 12\n")
+    "popl    %ecx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ecx\n")
+    "popl    %edx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %edx\n")
+    "popl    %eax\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %eax\n")
+    "popl    %ebp\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ebp\n")
+    "ret\n"
+    CFI(".cfi_endproc\n")
+    SIZE(X86CompilationCallback_SSE)
+  );
+# else
+  void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
+
+  _declspec(naked) void X86CompilationCallback(void) {
+    __asm {
+      push  ebp
+      mov   ebp, esp
+      push  eax
+      push  edx
+      push  ecx
+      and   esp, -16
+      sub   esp, 16
+      mov   eax, dword ptr [ebp+4]
+      mov   dword ptr [esp+4], eax
+      mov   dword ptr [esp], ebp
+      call  LLVMX86CompilationCallback2
+      mov   esp, ebp
+      sub   esp, 12
+      pop   ecx
+      pop   edx
+      pop   eax
+      pop   ebp
+      ret
+    }
+  }
+
+# endif // _MSC_VER
+
+#else // Not an i386 host
+  void X86CompilationCallback() {
+    llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!");
+  }
+#endif
+}
+
+/// This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call.  This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+extern "C" {
+LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr,
+                                                         intptr_t RetAddr) {
+  intptr_t *RetAddrLoc = &StackPtr[1];
+  // We are reading raw stack data here. Tell MemorySanitizer that it is
+  // sufficiently initialized.
+  __msan_unpoison(RetAddrLoc, sizeof(*RetAddrLoc));
+  assert(*RetAddrLoc == RetAddr &&
+         "Could not find return address on the stack!");
+
+  // It's a stub if there is an interrupt marker after the call.
+  bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE;
+
+  // The call instruction should have pushed the return value onto the stack...
+#if defined (X86_64_JIT)
+  RetAddr--;     // Backtrack to the reference itself...
+#else
+  RetAddr -= 4;  // Backtrack to the reference itself...
+#endif
+
+#if 0
+  DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr
+               << " ESP=" << (void*)StackPtr
+               << ": Resolving call to function: "
+               << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n");
+#endif
+
+  // Sanity check to make sure this really is a call instruction.
+#if defined (X86_64_JIT)
+  assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+  assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+  assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+  // Rewrite the call target... so that we don't end up here every time we
+  // execute the call.
+#if defined (X86_64_JIT)
+  assert(isStub &&
+         "X86-64 doesn't support rewriting non-stub lazy compilation calls:"
+         " the call instruction varies too much.");
+#else
+  *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+  if (isStub) {
+    // If this is a stub, rewrite the call into an unconditional branch
+    // instruction so that two return addresses are not pushed onto the stack
+    // when the requested function finally gets called.  This also makes the
+    // 0xCE byte (interrupt) dead, so the marker doesn't effect anything.
+#if defined (X86_64_JIT)
+    // If the target address is within 32-bit range of the stub, use a
+    // PC-relative branch instead of loading the actual address.  (This is
+    // considerably shorter than the 64-bit immediate load already there.)
+    // We assume here intptr_t is 64 bits.
+    intptr_t diff = NewVal-RetAddr+7;
+    if (diff >= -2147483648LL && diff <= 2147483647LL) {
+      *(unsigned char*)(RetAddr-0xc) = 0xE9;
+      *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff;
+    } else {
+      *(intptr_t *)(RetAddr - 0xa) = NewVal;
+      ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+    }
+    sys::ValgrindDiscardTranslations((void*)(RetAddr-0xc), 0xd);
+#else
+    ((unsigned char*)RetAddr)[-1] = 0xE9;
+    sys::ValgrindDiscardTranslations((void*)(RetAddr-1), 5);
+#endif
+  }
+
+  // Change the return address to reexecute the call instruction...
+#if defined (X86_64_JIT)
+  *RetAddrLoc -= 0xd;
+#else
+  *RetAddrLoc -= 5;
+#endif
+}
+}
+
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  TsanIgnoreWritesBegin();
+  JITCompilerFunction = F;
+  TsanIgnoreWritesEnd();
+
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+  if (Subtarget->hasSSE1())
+    return X86CompilationCallback_SSE;
+#endif
+
+  return X86CompilationCallback;
+}
+
+X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
+  Subtarget = &TM.getSubtarget<X86Subtarget>();
+  useGOT = 0;
+  TLSOffset = 0;
+}
+
+void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+                                             JITCodeEmitter &JCE) {
+#if defined (X86_64_JIT)
+  const unsigned Alignment = 8;
+  uint8_t Buffer[8];
+  uint8_t *Cur = Buffer;
+  MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr);
+  MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32));
+#else
+  const unsigned Alignment = 4;
+  uint8_t Buffer[4];
+  uint8_t *Cur = Buffer;
+  MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr);
+#endif
+  return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment);
+}
+
+TargetJITInfo::StubLayout X86JITInfo::getStubLayout() {
+  // The 64-bit stub contains:
+  //   movabs r10 <- 8-byte-target-address  # 10 bytes
+  //   call|jmp *r10  # 3 bytes
+  // The 32-bit stub contains a 5-byte call|jmp.
+  // If the stub is a call to the compilation callback, an extra byte is added
+  // to mark it as a stub.
+  StubLayout Result = {14, 4};
+  return Result;
+}
+
+void *X86JITInfo::emitFunctionStub(const Function* F, void *Target,
+                                   JITCodeEmitter &JCE) {
+  // Note, we cast to intptr_t here to silence a -pedantic warning that
+  // complains about casting a function pointer to a normal pointer.
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+  bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback &&
+                Target != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+  bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback;
+#endif
+  JCE.emitAlignment(4);
+  void *Result = (void*)JCE.getCurrentPCValue();
+  if (NotCC) {
+#if defined (X86_64_JIT)
+    JCE.emitByte(0x49);          // REX prefix
+    JCE.emitByte(0xB8+2);        // movabsq r10
+    JCE.emitWordLE((unsigned)(intptr_t)Target);
+    JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+    JCE.emitByte(0x41);          // REX prefix
+    JCE.emitByte(0xFF);          // jmpq *r10
+    JCE.emitByte(2 | (4 << 3) | (3 << 6));
+#else
+    JCE.emitByte(0xE9);
+    JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+    return Result;
+  }
+
+#if defined (X86_64_JIT)
+  JCE.emitByte(0x49);          // REX prefix
+  JCE.emitByte(0xB8+2);        // movabsq r10
+  JCE.emitWordLE((unsigned)(intptr_t)Target);
+  JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+  JCE.emitByte(0x41);          // REX prefix
+  JCE.emitByte(0xFF);          // callq *r10
+  JCE.emitByte(2 | (2 << 3) | (3 << 6));
+#else
+  JCE.emitByte(0xE8);   // Call with 32 bit pc-rel destination...
+
+  JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+
+  // This used to use 0xCD, but that value is used by JITMemoryManager to
+  // initialize the buffer with garbage, which means it may follow a
+  // noreturn function call, confusing LLVMX86CompilationCallback2.  PR 4929.
+  JCE.emitByte(0xCE);   // Interrupt - Just a marker identifying the stub!
+  return Result;
+}
+
+/// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+/// specific basic block.
+uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
+#if defined(X86_64_JIT)
+  return BB - Entry;
+#else
+  return BB - PICBase;
+#endif
+}
+
+template<typename T> static void addUnaligned(void *Pos, T Delta) {
+  T Value;
+  std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos),
+              sizeof(T));
+  Value += Delta;
+  std::memcpy(reinterpret_cast<char*>(Pos), reinterpret_cast<char*>(&Value),
+              sizeof(T));
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+    switch ((X86::RelocationType)MR->getRelocationType()) {
+    case X86::reloc_pcrel_word: {
+      // PC relative relocation, add the relocated value to the value already in
+      // memory, after we adjust it for where the PC is.
+      ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
+      addUnaligned<unsigned>(RelocPos, ResultPtr);
+      break;
+    }
+    case X86::reloc_picrel_word: {
+      // PIC base relative relocation, add the relocated value to the value
+      // already in memory, after we adjust it for where the PIC base is.
+      ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
+      addUnaligned<unsigned>(RelocPos, ResultPtr);
+      break;
+    }
+    case X86::reloc_absolute_word:
+    case X86::reloc_absolute_word_sext:
+      // Absolute relocation, just add the relocated value to the value already
+      // in memory.
+      addUnaligned<unsigned>(RelocPos, ResultPtr);
+      break;
+    case X86::reloc_absolute_dword:
+      addUnaligned<intptr_t>(RelocPos, ResultPtr);
+      break;
+    }
+  }
+}
+
+char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
+#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER)
+  TLSOffset -= size;
+  return TLSOffset;
+#else
+  llvm_unreachable("Cannot allocate thread local storage on this arch!");
+#endif
+}
diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.h b/contrib/llvm/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 000000000000..f916327378a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,81 @@
+//===-- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class X86TargetMachine;
+  class X86Subtarget;
+
+  class X86JITInfo : public TargetJITInfo {
+    X86TargetMachine &TM;
+    const X86Subtarget *Subtarget;
+    uintptr_t PICBase;
+    char* TLSOffset;
+  public:
+    explicit X86JITInfo(X86TargetMachine &tm);
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+    /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+    /// to emit an indirect symbol which contains the address of the specified
+    /// ptr.
+    virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+                                             JITCodeEmitter &JCE);
+
+    // getStubLayout - Returns the size and alignment of the largest call stub
+    // on X86.
+    virtual StubLayout getStubLayout();
+
+    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    virtual void *emitFunctionStub(const Function* F, void *Target,
+                                   JITCodeEmitter &JCE);
+
+    /// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+    /// specific basic block.
+    virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase);
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+
+    /// allocateThreadLocalMemory - Each target has its own way of
+    /// handling thread local variables. This method returns a value only
+    /// meaningful to the target.
+    virtual char* allocateThreadLocalMemory(size_t size);
+
+    /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute
+    /// PIC jumptable entry.
+    void setPICBase(uintptr_t Base) { PICBase = Base; }
+    uintptr_t getPICBase() const { return PICBase; }
+  };
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
new file mode 100644
index 000000000000..a8a9fd8accde
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -0,0 +1,794 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/Mangler.h"
+using namespace llvm;
+
+namespace {
+
+/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
+class X86MCInstLower {
+  MCContext &Ctx;
+  Mangler *Mang;
+  const MachineFunction &MF;
+  const TargetMachine &TM;
+  const MCAsmInfo &MAI;
+  X86AsmPrinter &AsmPrinter;
+public:
+  X86MCInstLower(Mangler *mang, const MachineFunction &MF,
+                 X86AsmPrinter &asmprinter);
+
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+  MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+} // end anonymous namespace
+
+X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf,
+                               X86AsmPrinter &asmprinter)
+: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()),
+  MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {}
+
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
+  return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
+}
+
+
+/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
+/// operand to an MCSymbol.
+MCSymbol *X86MCInstLower::
+GetSymbolFromOperand(const MachineOperand &MO) const {
+  assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
+
+  SmallString<128> Name;
+
+  if (MO.isGlobal()) {
+    const GlobalValue *GV = MO.getGlobal();
+    bool isImplicitlyPrivate = false;
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+      isImplicitlyPrivate = true;
+
+    Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+  } else if (MO.isSymbol()) {
+    Name += MAI.getGlobalPrefix();
+    Name += MO.getSymbolName();
+  } else if (MO.isMBB()) {
+    Name += MO.getMBB()->getSymbol()->getName();
+  }
+
+  // If the target flags on the operand changes the name of the symbol, do that
+  // before we return the symbol.
+  switch (MO.getTargetFlags()) {
+  default: break;
+  case X86II::MO_DLLIMPORT: {
+    // Handle dllimport linkage.
+    const char *Prefix = "__imp_";
+    Name.insert(Name.begin(), Prefix, Prefix+strlen(Prefix));
+    break;
+  }
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+    Name += "$non_lazy_ptr";
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI().getGVStubEntry(Sym);
+    if (StubSym.getPointer() == 0) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym =
+        MachineModuleInfoImpl::
+        StubValueTy(Mang->getSymbol(MO.getGlobal()),
+                    !MO.getGlobal()->hasInternalLinkage());
+    }
+    return Sym;
+  }
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
+    Name += "$non_lazy_ptr";
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI().getHiddenGVStubEntry(Sym);
+    if (StubSym.getPointer() == 0) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym =
+        MachineModuleInfoImpl::
+        StubValueTy(Mang->getSymbol(MO.getGlobal()),
+                    !MO.getGlobal()->hasInternalLinkage());
+    }
+    return Sym;
+  }
+  case X86II::MO_DARWIN_STUB: {
+    Name += "$stub";
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+    MachineModuleInfoImpl::StubValueTy &StubSym =
+      getMachOMMI().getFnStubEntry(Sym);
+    if (StubSym.getPointer())
+      return Sym;
+
+    if (MO.isGlobal()) {
+      StubSym =
+        MachineModuleInfoImpl::
+        StubValueTy(Mang->getSymbol(MO.getGlobal()),
+                    !MO.getGlobal()->hasInternalLinkage());
+    } else {
+      Name.erase(Name.end()-5, Name.end());
+      StubSym =
+        MachineModuleInfoImpl::
+        StubValueTy(Ctx.GetOrCreateSymbol(Name.str()), false);
+    }
+    return Sym;
+  }
+  }
+
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                             MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = 0;
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case X86II::MO_NO_FLAG:    // No flag.
+  // These affect the name of the symbol, not any suffix.
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DLLIMPORT:
+  case X86II::MO_DARWIN_STUB:
+    break;
+
+  case X86II::MO_TLVP:      RefKind = MCSymbolRefExpr::VK_TLVP; break;
+  case X86II::MO_TLVP_PIC_BASE:
+    Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+    // Subtract the pic base.
+    Expr = MCBinaryExpr::CreateSub(Expr,
+                                  MCSymbolRefExpr::Create(MF.getPICBaseSymbol(),
+                                                           Ctx),
+                                   Ctx);
+    break;
+  case X86II::MO_SECREL:    RefKind = MCSymbolRefExpr::VK_SECREL; break;
+  case X86II::MO_TLSGD:     RefKind = MCSymbolRefExpr::VK_TLSGD; break;
+  case X86II::MO_TLSLD:     RefKind = MCSymbolRefExpr::VK_TLSLD; break;
+  case X86II::MO_TLSLDM:    RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
+  case X86II::MO_GOTTPOFF:  RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
+  case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
+  case X86II::MO_TPOFF:     RefKind = MCSymbolRefExpr::VK_TPOFF; break;
+  case X86II::MO_DTPOFF:    RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
+  case X86II::MO_NTPOFF:    RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
+  case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
+  case X86II::MO_GOTPCREL:  RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
+  case X86II::MO_GOT:       RefKind = MCSymbolRefExpr::VK_GOT; break;
+  case X86II::MO_GOTOFF:    RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
+  case X86II::MO_PLT:       RefKind = MCSymbolRefExpr::VK_PLT; break;
+  case X86II::MO_PIC_BASE_OFFSET:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+    Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+    // Subtract the pic base.
+    Expr = MCBinaryExpr::CreateSub(Expr,
+                            MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx),
+                                   Ctx);
+    if (MO.isJTI() && MAI.hasSetDirective()) {
+      // If .set directive is supported, use it to reduce the number of
+      // relocations the assembler will generate for differences between
+      // local labels. This is only safe when the symbols are in the same
+      // section so we are restricting it to jumptable references.
+      MCSymbol *Label = Ctx.CreateTempSymbol();
+      AsmPrinter.OutStreamer.EmitAssignment(Label, Expr);
+      Expr = MCSymbolRefExpr::Create(Label, Ctx);
+    }
+    break;
+  }
+
+  if (Expr == 0)
+    Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+
+  if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+
+
+static void lower_subreg32(MCInst *MI, unsigned OpNo) {
+  // Convert registers in the addr mode according to subreg32.
+  unsigned Reg = MI->getOperand(OpNo).getReg();
+  if (Reg != 0)
+    MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32));
+}
+
+static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
+  // Convert registers in the addr mode according to subreg64.
+  for (unsigned i = 0; i != 4; ++i) {
+    if (!MI->getOperand(OpNo+i).isReg()) continue;
+
+    unsigned Reg = MI->getOperand(OpNo+i).getReg();
+    // LEAs can use RIP-relative addressing, and RIP has no sub/super register.
+    if (Reg == 0 || Reg == X86::RIP) continue;
+
+    MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
+  }
+}
+
+/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8.
+static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) {
+  OutMI.setOpcode(NewOpc);
+  lower_subreg32(&OutMI, 0);
+}
+/// LowerUnaryToTwoAddr - R = setb   -> R = sbb R, R
+static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
+  OutMI.setOpcode(NewOpc);
+  OutMI.addOperand(OutMI.getOperand(0));
+  OutMI.addOperand(OutMI.getOperand(0));
+}
+
+/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
+/// a short fixed-register form.
+static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
+  unsigned ImmOp = Inst.getNumOperands() - 1;
+  assert(Inst.getOperand(0).isReg() &&
+         (Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
+         ((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
+           Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
+          Inst.getNumOperands() == 2) && "Unexpected instruction!");
+
+  // Check whether the destination register can be fixed.
+  unsigned Reg = Inst.getOperand(0).getReg();
+  if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+    return;
+
+  // If so, rewrite the instruction.
+  MCOperand Saved = Inst.getOperand(ImmOp);
+  Inst = MCInst();
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(Saved);
+}
+
+/// \brief Simplify things like MOV32rm to MOV32o32a.
+static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
+                                  unsigned Opcode) {
+  // Don't make these simplifications in 64-bit mode; other assemblers don't
+  // perform them because they make the code larger.
+  if (Printer.getSubtarget().is64Bit())
+    return;
+
+  bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
+  unsigned AddrBase = IsStore;
+  unsigned RegOp = IsStore ? 0 : 5;
+  unsigned AddrOp = AddrBase + 3;
+  assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
+         Inst.getOperand(AddrBase + 0).isReg() && // base
+         Inst.getOperand(AddrBase + 1).isImm() && // scale
+         Inst.getOperand(AddrBase + 2).isReg() && // index register
+         (Inst.getOperand(AddrOp).isExpr() ||     // address
+          Inst.getOperand(AddrOp).isImm())&&
+         Inst.getOperand(AddrBase + 4).isReg() && // segment
+         "Unexpected instruction!");
+
+  // Check whether the destination register can be fixed.
+  unsigned Reg = Inst.getOperand(RegOp).getReg();
+  if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
+    return;
+
+  // Check whether this is an absolute address.
+  // FIXME: We know TLVP symbol refs aren't, but there should be a better way
+  // to do this here.
+  bool Absolute = true;
+  if (Inst.getOperand(AddrOp).isExpr()) {
+    const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
+    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
+        Absolute = false;
+  }
+
+  if (Absolute &&
+      (Inst.getOperand(AddrBase + 0).getReg() != 0 ||
+       Inst.getOperand(AddrBase + 2).getReg() != 0 ||
+       Inst.getOperand(AddrBase + 4).getReg() != 0 ||
+       Inst.getOperand(AddrBase + 1).getImm() != 1))
+    return;
+
+  // If so, rewrite the instruction.
+  MCOperand Saved = Inst.getOperand(AddrOp);
+  Inst = MCInst();
+  Inst.setOpcode(Opcode);
+  Inst.addOperand(Saved);
+}
+
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO,
+                     AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+      break;
+    case MachineOperand::MO_RegisterMask:
+      // Ignore call clobbers.
+      continue;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+
+  // Handle a few special cases to eliminate operand modifiers.
+ReSimplify:
+  switch (OutMI.getOpcode()) {
+  case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand.
+    lower_lea64_32mem(&OutMI, 1);
+    // FALL THROUGH.
+  case X86::LEA64r:
+  case X86::LEA16r:
+  case X86::LEA32r:
+    // LEA should have a segment register, but it must be empty.
+    assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
+           "Unexpected # of LEA operands");
+    assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
+           "LEA has segment specified!");
+    break;
+  case X86::MOVZX64rr32:  LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
+  case X86::MOVZX64rm32:  LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
+  case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
+  case X86::MOVZX64rr8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
+  case X86::MOVZX64rm8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
+  case X86::MOVZX64rr16:  LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break;
+  case X86::MOVZX64rm16:  LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break;
+  case X86::MOV8r0:       LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
+  case X86::MOV32r0:      LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
+
+  case X86::MOV16r0:
+    LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0
+    LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+    break;
+  case X86::MOV64r0:
+    LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV64r0 -> MOV32r0
+    LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+    break;
+
+  // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
+  // if one of the registers is extended, but other isn't.
+  case X86::VMOVAPDrr:
+  case X86::VMOVAPDYrr:
+  case X86::VMOVAPSrr:
+  case X86::VMOVAPSYrr:
+  case X86::VMOVDQArr:
+  case X86::VMOVDQAYrr:
+  case X86::VMOVDQUrr:
+  case X86::VMOVDQUYrr:
+  case X86::VMOVUPDrr:
+  case X86::VMOVUPDYrr:
+  case X86::VMOVUPSrr:
+  case X86::VMOVUPSYrr: {
+    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+        X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
+      unsigned NewOpc;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::VMOVAPDrr:  NewOpc = X86::VMOVAPDrr_REV;  break;
+      case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+      case X86::VMOVAPSrr:  NewOpc = X86::VMOVAPSrr_REV;  break;
+      case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+      case X86::VMOVDQArr:  NewOpc = X86::VMOVDQArr_REV;  break;
+      case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+      case X86::VMOVDQUrr:  NewOpc = X86::VMOVDQUrr_REV;  break;
+      case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+      case X86::VMOVUPDrr:  NewOpc = X86::VMOVUPDrr_REV;  break;
+      case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+      case X86::VMOVUPSrr:  NewOpc = X86::VMOVUPSrr_REV;  break;
+      case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+      }
+      OutMI.setOpcode(NewOpc);
+    }
+    break;
+  }
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr: {
+    if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
+        X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+      unsigned NewOpc;
+      switch (OutMI.getOpcode()) {
+      default: llvm_unreachable("Invalid opcode");
+      case X86::VMOVSDrr:   NewOpc = X86::VMOVSDrr_REV;   break;
+      case X86::VMOVSSrr:   NewOpc = X86::VMOVSSrr_REV;   break;
+      }
+      OutMI.setOpcode(NewOpc);
+    }
+    break;
+  }
+
+  // TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
+  // inputs modeled as normal uses instead of implicit uses.  As such, truncate
+  // off all but the first operand (the callee).  FIXME: Change isel.
+  case X86::TAILJMPr64:
+  case X86::CALL64r:
+  case X86::CALL64pcrel32: {
+    unsigned Opcode = OutMI.getOpcode();
+    MCOperand Saved = OutMI.getOperand(0);
+    OutMI = MCInst();
+    OutMI.setOpcode(Opcode);
+    OutMI.addOperand(Saved);
+    break;
+  }
+
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    OutMI = MCInst();
+    OutMI.setOpcode(X86::RET);
+    break;
+  }
+
+  // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions.
+  case X86::TAILJMPr:
+  case X86::TAILJMPd:
+  case X86::TAILJMPd64: {
+    unsigned Opcode;
+    switch (OutMI.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::TAILJMPr: Opcode = X86::JMP32r; break;
+    case X86::TAILJMPd:
+    case X86::TAILJMPd64: Opcode = X86::JMP_1; break;
+    }
+
+    MCOperand Saved = OutMI.getOperand(0);
+    OutMI = MCInst();
+    OutMI.setOpcode(Opcode);
+    OutMI.addOperand(Saved);
+    break;
+  }
+
+  // These are pseudo-ops for OR to help with the OR->ADD transformation.  We do
+  // this with an ugly goto in case the resultant OR uses EAX and needs the
+  // short form.
+  case X86::ADD16rr_DB:   OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
+  case X86::ADD32rr_DB:   OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
+  case X86::ADD64rr_DB:   OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
+  case X86::ADD16ri_DB:   OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
+  case X86::ADD32ri_DB:   OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
+  case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
+  case X86::ADD16ri8_DB:  OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
+  case X86::ADD32ri8_DB:  OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
+  case X86::ADD64ri8_DB:  OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
+
+  // The assembler backend wants to see branches in their small form and relax
+  // them to their large form.  The JIT can only handle the large form because
+  // it does not do relaxation.  For now, translate the large form to the
+  // small one here.
+  case X86::JMP_4: OutMI.setOpcode(X86::JMP_1); break;
+  case X86::JO_4:  OutMI.setOpcode(X86::JO_1); break;
+  case X86::JNO_4: OutMI.setOpcode(X86::JNO_1); break;
+  case X86::JB_4:  OutMI.setOpcode(X86::JB_1); break;
+  case X86::JAE_4: OutMI.setOpcode(X86::JAE_1); break;
+  case X86::JE_4:  OutMI.setOpcode(X86::JE_1); break;
+  case X86::JNE_4: OutMI.setOpcode(X86::JNE_1); break;
+  case X86::JBE_4: OutMI.setOpcode(X86::JBE_1); break;
+  case X86::JA_4:  OutMI.setOpcode(X86::JA_1); break;
+  case X86::JS_4:  OutMI.setOpcode(X86::JS_1); break;
+  case X86::JNS_4: OutMI.setOpcode(X86::JNS_1); break;
+  case X86::JP_4:  OutMI.setOpcode(X86::JP_1); break;
+  case X86::JNP_4: OutMI.setOpcode(X86::JNP_1); break;
+  case X86::JL_4:  OutMI.setOpcode(X86::JL_1); break;
+  case X86::JGE_4: OutMI.setOpcode(X86::JGE_1); break;
+  case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break;
+  case X86::JG_4:  OutMI.setOpcode(X86::JG_1); break;
+
+  // Atomic load and store require a separate pseudo-inst because Acquire
+  // implies mayStore and Release implies mayLoad; fix these to regular MOV
+  // instructions here
+  case X86::ACQUIRE_MOV8rm:  OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+  case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+  case X86::RELEASE_MOV8mr:  OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+  case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+  case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+  case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+
+  // We don't currently select the correct instruction form for instructions
+  // which have a short %eax, etc. form. Handle this by custom lowering, for
+  // now.
+  //
+  // Note, we are currently not handling the following instructions:
+  // MOV64ao8, MOV64o8a
+  // XCHG16ar, XCHG32ar, XCHG64ar
+  case X86::MOV8mr_NOREX:
+  case X86::MOV8mr:     SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao8); break;
+  case X86::MOV8rm_NOREX:
+  case X86::MOV8rm:     SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o8a); break;
+  case X86::MOV16mr:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao16); break;
+  case X86::MOV16rm:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o16a); break;
+  case X86::MOV32mr:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break;
+  case X86::MOV32rm:    SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break;
+
+  case X86::ADC8ri:     SimplifyShortImmForm(OutMI, X86::ADC8i8);    break;
+  case X86::ADC16ri:    SimplifyShortImmForm(OutMI, X86::ADC16i16);  break;
+  case X86::ADC32ri:    SimplifyShortImmForm(OutMI, X86::ADC32i32);  break;
+  case X86::ADC64ri32:  SimplifyShortImmForm(OutMI, X86::ADC64i32);  break;
+  case X86::ADD8ri:     SimplifyShortImmForm(OutMI, X86::ADD8i8);    break;
+  case X86::ADD16ri:    SimplifyShortImmForm(OutMI, X86::ADD16i16);  break;
+  case X86::ADD32ri:    SimplifyShortImmForm(OutMI, X86::ADD32i32);  break;
+  case X86::ADD64ri32:  SimplifyShortImmForm(OutMI, X86::ADD64i32);  break;
+  case X86::AND8ri:     SimplifyShortImmForm(OutMI, X86::AND8i8);    break;
+  case X86::AND16ri:    SimplifyShortImmForm(OutMI, X86::AND16i16);  break;
+  case X86::AND32ri:    SimplifyShortImmForm(OutMI, X86::AND32i32);  break;
+  case X86::AND64ri32:  SimplifyShortImmForm(OutMI, X86::AND64i32);  break;
+  case X86::CMP8ri:     SimplifyShortImmForm(OutMI, X86::CMP8i8);    break;
+  case X86::CMP16ri:    SimplifyShortImmForm(OutMI, X86::CMP16i16);  break;
+  case X86::CMP32ri:    SimplifyShortImmForm(OutMI, X86::CMP32i32);  break;
+  case X86::CMP64ri32:  SimplifyShortImmForm(OutMI, X86::CMP64i32);  break;
+  case X86::OR8ri:      SimplifyShortImmForm(OutMI, X86::OR8i8);     break;
+  case X86::OR16ri:     SimplifyShortImmForm(OutMI, X86::OR16i16);   break;
+  case X86::OR32ri:     SimplifyShortImmForm(OutMI, X86::OR32i32);   break;
+  case X86::OR64ri32:   SimplifyShortImmForm(OutMI, X86::OR64i32);   break;
+  case X86::SBB8ri:     SimplifyShortImmForm(OutMI, X86::SBB8i8);    break;
+  case X86::SBB16ri:    SimplifyShortImmForm(OutMI, X86::SBB16i16);  break;
+  case X86::SBB32ri:    SimplifyShortImmForm(OutMI, X86::SBB32i32);  break;
+  case X86::SBB64ri32:  SimplifyShortImmForm(OutMI, X86::SBB64i32);  break;
+  case X86::SUB8ri:     SimplifyShortImmForm(OutMI, X86::SUB8i8);    break;
+  case X86::SUB16ri:    SimplifyShortImmForm(OutMI, X86::SUB16i16);  break;
+  case X86::SUB32ri:    SimplifyShortImmForm(OutMI, X86::SUB32i32);  break;
+  case X86::SUB64ri32:  SimplifyShortImmForm(OutMI, X86::SUB64i32);  break;
+  case X86::TEST8ri:    SimplifyShortImmForm(OutMI, X86::TEST8i8);   break;
+  case X86::TEST16ri:   SimplifyShortImmForm(OutMI, X86::TEST16i16); break;
+  case X86::TEST32ri:   SimplifyShortImmForm(OutMI, X86::TEST32i32); break;
+  case X86::TEST64ri32: SimplifyShortImmForm(OutMI, X86::TEST64i32); break;
+  case X86::XOR8ri:     SimplifyShortImmForm(OutMI, X86::XOR8i8);    break;
+  case X86::XOR16ri:    SimplifyShortImmForm(OutMI, X86::XOR16i16);  break;
+  case X86::XOR32ri:    SimplifyShortImmForm(OutMI, X86::XOR32i32);  break;
+  case X86::XOR64ri32:  SimplifyShortImmForm(OutMI, X86::XOR64i32);  break;
+
+  case X86::MORESTACK_RET:
+    OutMI.setOpcode(X86::RET);
+    break;
+
+  case X86::MORESTACK_RET_RESTORE_R10:
+    OutMI.setOpcode(X86::MOV64rr);
+    OutMI.addOperand(MCOperand::CreateReg(X86::R10));
+    OutMI.addOperand(MCOperand::CreateReg(X86::RAX));
+
+    AsmPrinter.OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+    break;
+  }
+}
+
+static void LowerTlsAddr(MCStreamer &OutStreamer,
+                         X86MCInstLower &MCInstLowering,
+                         const MachineInstr &MI) {
+
+  bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
+                  MI.getOpcode() == X86::TLS_base_addr64;
+
+  bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
+
+  MCContext &context = OutStreamer.getContext();
+
+  if (needsPadding)
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+
+  MCSymbolRefExpr::VariantKind SRVK;
+  switch (MI.getOpcode()) {
+    case X86::TLS_addr32:
+    case X86::TLS_addr64:
+      SRVK = MCSymbolRefExpr::VK_TLSGD;
+      break;
+    case X86::TLS_base_addr32:
+      SRVK = MCSymbolRefExpr::VK_TLSLDM;
+      break;
+    case X86::TLS_base_addr64:
+      SRVK = MCSymbolRefExpr::VK_TLSLD;
+      break;
+    default:
+      llvm_unreachable("unexpected opcode");
+  }
+
+  MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
+  const MCSymbolRefExpr *symRef = MCSymbolRefExpr::Create(sym, SRVK, context);
+
+  MCInst LEA;
+  if (is64Bits) {
+    LEA.setOpcode(X86::LEA64r);
+    LEA.addOperand(MCOperand::CreateReg(X86::RDI)); // dest
+    LEA.addOperand(MCOperand::CreateReg(X86::RIP)); // base
+    LEA.addOperand(MCOperand::CreateImm(1));        // scale
+    LEA.addOperand(MCOperand::CreateReg(0));        // index
+    LEA.addOperand(MCOperand::CreateExpr(symRef));  // disp
+    LEA.addOperand(MCOperand::CreateReg(0));        // seg
+  } else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
+    LEA.setOpcode(X86::LEA32r);
+    LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
+    LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // base
+    LEA.addOperand(MCOperand::CreateImm(1));        // scale
+    LEA.addOperand(MCOperand::CreateReg(0));        // index
+    LEA.addOperand(MCOperand::CreateExpr(symRef));  // disp
+    LEA.addOperand(MCOperand::CreateReg(0));        // seg
+  } else {
+    LEA.setOpcode(X86::LEA32r);
+    LEA.addOperand(MCOperand::CreateReg(X86::EAX)); // dest
+    LEA.addOperand(MCOperand::CreateReg(0));        // base
+    LEA.addOperand(MCOperand::CreateImm(1));        // scale
+    LEA.addOperand(MCOperand::CreateReg(X86::EBX)); // index
+    LEA.addOperand(MCOperand::CreateExpr(symRef));  // disp
+    LEA.addOperand(MCOperand::CreateReg(0));        // seg
+  }
+  OutStreamer.EmitInstruction(LEA);
+
+  if (needsPadding) {
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX));
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX));
+  }
+
+  StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
+  MCSymbol *tlsGetAddr = context.GetOrCreateSymbol(name);
+  const MCSymbolRefExpr *tlsRef =
+    MCSymbolRefExpr::Create(tlsGetAddr,
+                            MCSymbolRefExpr::VK_PLT,
+                            context);
+
+  OutStreamer.EmitInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
+                                                     : X86::CALLpcrel32)
+    .addExpr(tlsRef));
+}
+
+void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  X86MCInstLower MCInstLowering(Mang, *MF, *this);
+  switch (MI->getOpcode()) {
+  case TargetOpcode::DBG_VALUE:
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      std::string TmpStr;
+      raw_string_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer.EmitRawText(StringRef(OS.str()));
+    }
+    return;
+
+  // Emit nothing here but a comment if we can.
+  case X86::Int_MemBarrier:
+    if (OutStreamer.hasRawTextSupport())
+      OutStreamer.EmitRawText(StringRef("\t#MEMBARRIER"));
+    return;
+
+
+  case X86::EH_RETURN:
+  case X86::EH_RETURN64: {
+    // Lower these as normal, but add some comments.
+    unsigned Reg = MI->getOperand(0).getReg();
+    OutStreamer.AddComment(StringRef("eh_return, addr: %") +
+                           X86ATTInstPrinter::getRegisterName(Reg));
+    break;
+  }
+  case X86::TAILJMPr:
+  case X86::TAILJMPd:
+  case X86::TAILJMPd64:
+    // Lower these as normal, but add some comments.
+    OutStreamer.AddComment("TAILCALL");
+    break;
+
+  case X86::TLS_addr32:
+  case X86::TLS_addr64:
+  case X86::TLS_base_addr32:
+  case X86::TLS_base_addr64:
+    return LowerTlsAddr(OutStreamer, MCInstLowering, *MI);
+
+  case X86::MOVPC32r: {
+    // This is a pseudo op for a two instruction sequence with a label, which
+    // looks like:
+    //     call "L1$pb"
+    // "L1$pb":
+    //     popl %esi
+
+    // Emit the call.
+    MCSymbol *PICBase = MF->getPICBaseSymbol();
+    // FIXME: We would like an efficient form for this, so we don't have to do a
+    // lot of extra uniquing.
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::CALLpcrel32)
+      .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext)));
+
+    // Emit the label.
+    OutStreamer.EmitLabel(PICBase);
+
+    // popl $reg
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::POP32r)
+      .addReg(MI->getOperand(0).getReg()));
+    return;
+  }
+
+  case X86::ADD32ri: {
+    // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+    if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+      break;
+
+    // Okay, we have something like:
+    //  EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+    // For this, we want to print something like:
+    //   MYGLOBAL + (. - PICBASE)
+    // However, we can't generate a ".", so just emit a new label here and refer
+    // to it.
+    MCSymbol *DotSym = OutContext.CreateTempSymbol();
+    OutStreamer.EmitLabel(DotSym);
+
+    // Now that we have emitted the label, lower the complex operand expression.
+    MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+    const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+    const MCExpr *PICBase =
+      MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), OutContext);
+    DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext);
+
+    DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext),
+                                      DotExpr, OutContext);
+
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::ADD32ri)
+      .addReg(MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg())
+      .addExpr(DotExpr));
+    return;
+  }
+  }
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  OutStreamer.EmitInstruction(TmpInst);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..568dc222d9d1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MachineFunctionInfo.h"
+
+using namespace llvm;
+
+void X86MachineFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 000000000000..78d20ce870f1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,145 @@
+//===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+
+  /// ForceFramePointer - True if the function is required to use of frame
+  /// pointer for reasons other than it containing dynamic allocation or
+  /// that FP eliminatation is turned off. For example, Cygwin main function
+  /// contains stack pointer re-alignment code which requires FP.
+  bool ForceFramePointer;
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
+  /// to the space used by the return address).
+  /// Used on windows platform for stdcall & fastcall name decoration
+  unsigned BytesToPopOnReturn;
+
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex;
+
+  /// TailCallReturnAddrDelta - The number of bytes by which return address
+  /// stack slot is moved as the result of tail call optimization.
+  int TailCallReturnAddrDelta;
+
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+  /// RegSaveFrameIndex - X86-64 vararg func register save area.
+  int RegSaveFrameIndex;
+  /// VarArgsGPOffset - X86-64 vararg func int reg offset.
+  unsigned VarArgsGPOffset;
+  /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
+  unsigned VarArgsFPOffset;
+  /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
+  /// being passed on the stack.
+  unsigned ArgumentStackSize;
+  /// NumLocalDynamics - Number of local-dynamic TLS accesses.
+  unsigned NumLocalDynamics;
+
+public:
+  X86MachineFunctionInfo() : ForceFramePointer(false),
+                             CalleeSavedFrameSize(0),
+                             BytesToPopOnReturn(0),
+                             ReturnAddrIndex(0),
+                             TailCallReturnAddrDelta(0),
+                             SRetReturnReg(0),
+                             GlobalBaseReg(0),
+                             VarArgsFrameIndex(0),
+                             RegSaveFrameIndex(0),
+                             VarArgsGPOffset(0),
+                             VarArgsFPOffset(0),
+                             ArgumentStackSize(0),
+                             NumLocalDynamics(0) {}
+
+  explicit X86MachineFunctionInfo(MachineFunction &MF)
+    : ForceFramePointer(false),
+      CalleeSavedFrameSize(0),
+      BytesToPopOnReturn(0),
+      ReturnAddrIndex(0),
+      TailCallReturnAddrDelta(0),
+      SRetReturnReg(0),
+      GlobalBaseReg(0),
+      VarArgsFrameIndex(0),
+      RegSaveFrameIndex(0),
+      VarArgsGPOffset(0),
+      VarArgsFPOffset(0),
+      ArgumentStackSize(0),
+      NumLocalDynamics(0) {}
+
+  bool getForceFramePointer() const { return ForceFramePointer;}
+  void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+  void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+  int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+  void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
+
+  int getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
+  void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; }
+
+  unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; }
+  void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; }
+
+  unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
+  void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+
+  unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+  void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
+
+  unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
new file mode 100644
index 000000000000..83e75ea994ca
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -0,0 +1,212 @@
+//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will pad short functions to prevent
+// a stall if a function returns before the return address is ready. This
+// is needed for some Intel Atom processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#define DEBUG_TYPE "x86-pad-short-functions"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+STATISTIC(NumBBsPadded, "Number of basic blocks padded");
+
+namespace {
+  struct VisitedBBInfo {
+    // HasReturn - Whether the BB contains a return instruction
+    bool HasReturn;
+
+    // Cycles - Number of cycles until return if HasReturn is true, otherwise
+    // number of cycles until end of the BB
+    unsigned int Cycles;
+
+    VisitedBBInfo() : HasReturn(false), Cycles(0) {}
+    VisitedBBInfo(bool HasReturn, unsigned int Cycles)
+      : HasReturn(HasReturn), Cycles(Cycles) {}
+  };
+
+  struct PadShortFunc : public MachineFunctionPass {
+    static char ID;
+    PadShortFunc() : MachineFunctionPass(ID)
+                   , Threshold(4), TM(0), TII(0) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "X86 Atom pad short functions";
+    }
+
+  private:
+    void findReturns(MachineBasicBlock *MBB,
+                     unsigned int Cycles = 0);
+
+    bool cyclesUntilReturn(MachineBasicBlock *MBB,
+                           unsigned int &Cycles);
+
+    void addPadding(MachineBasicBlock *MBB,
+                    MachineBasicBlock::iterator &MBBI,
+                    unsigned int NOOPsToAdd);
+
+    const unsigned int Threshold;
+
+    // ReturnBBs - Maps basic blocks that return to the minimum number of
+    // cycles until the return, starting from the entry block.
+    DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;
+
+    // VisitedBBs - Cache of previously visited BBs.
+    DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
+
+    const TargetMachine *TM;
+    const TargetInstrInfo *TII;
+  };
+
+  char PadShortFunc::ID = 0;
+}
+
+FunctionPass *llvm::createX86PadShortFunctions() {
+  return new PadShortFunc();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// NOOP instructions before early exits.
+bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+  const AttributeSet &FnAttrs = MF.getFunction()->getAttributes();
+  if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                           Attribute::OptimizeForSize) ||
+      FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                           Attribute::MinSize)) {
+    return false;
+  }
+
+  TM = &MF.getTarget();
+  TII = TM->getInstrInfo();
+
+  // Search through basic blocks and mark the ones that have early returns
+  ReturnBBs.clear();
+  VisitedBBs.clear();
+  findReturns(MF.begin());
+
+  bool MadeChange = false;
+
+  MachineBasicBlock *MBB;
+  unsigned int Cycles = 0;
+
+  // Pad the identified basic blocks with NOOPs
+  for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
+       I != ReturnBBs.end(); ++I) {
+    MBB = I->first;
+    Cycles = I->second;
+
+    if (Cycles < Threshold) {
+      // BB ends in a return. Skip over any DBG_VALUE instructions
+      // trailing the terminator.
+      assert(MBB->size() > 0 &&
+             "Basic block should contain at least a RET but is empty");
+      MachineBasicBlock::iterator ReturnLoc = --MBB->end();
+
+      while (ReturnLoc->isDebugValue())
+        --ReturnLoc;
+      assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
+             "Basic block does not end with RET");
+
+      addPadding(MBB, ReturnLoc, Threshold - Cycles);
+      NumBBsPadded++;
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// findReturn - Starting at MBB, follow control flow and add all
+/// basic blocks that contain a return to ReturnBBs.
+void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
+  // If this BB has a return, note how many cycles it takes to get there.
+  bool hasReturn = cyclesUntilReturn(MBB, Cycles);
+  if (Cycles >= Threshold)
+    return;
+
+  if (hasReturn) {
+    ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles);
+    return;
+  }
+
+  // Follow branches in BB and look for returns
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
+       I != MBB->succ_end(); ++I) {
+    if (*I == MBB)
+      continue;
+    findReturns(*I, Cycles);
+  }
+}
+
+/// cyclesUntilReturn - return true if the MBB has a return instruction,
+/// and return false otherwise.
+/// Cycles will be incremented by the number of cycles taken to reach the
+/// return or the end of the BB, whichever occurs first.
+bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
+                                     unsigned int &Cycles) {
+  // Return cached result if BB was previously visited
+  DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it
+    = VisitedBBs.find(MBB);
+  if (it != VisitedBBs.end()) {
+    VisitedBBInfo BBInfo = it->second;
+    Cycles += BBInfo.Cycles;
+    return BBInfo.HasReturn;
+  }
+
+  unsigned int CyclesToEnd = 0;
+
+  for (MachineBasicBlock::iterator MBBI = MBB->begin();
+        MBBI != MBB->end(); ++MBBI) {
+    MachineInstr *MI = MBBI;
+    // Mark basic blocks with a return instruction. Calls to other
+    // functions do not count because the called function will be padded,
+    // if necessary.
+    if (MI->isReturn() && !MI->isCall()) {
+      VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd);
+      Cycles += CyclesToEnd;
+      return true;
+    }
+
+    CyclesToEnd += TII->getInstrLatency(TM->getInstrItineraryData(), MI);
+  }
+
+  VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
+  Cycles += CyclesToEnd;
+  return false;
+}
+
+/// addPadding - Add the given number of NOOP instructions to the function
+/// just prior to the return at MBBI
+void PadShortFunc::addPadding(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned int NOOPsToAdd) {
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  while (NOOPsToAdd-- > 0) {
+    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+  }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 000000000000..16886e432d19
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,691 @@
+//===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterInfo.h"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "X86GenRegisterInfo.inc"
+
+using namespace llvm;
+
+cl::opt<bool>
+ForceStackAlign("force-align-stack",
+                 cl::desc("Force align the stack to the minimum alignment"
+                           " needed for the function."),
+                 cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
+          cl::desc("Enable use of a base pointer for complex stack frames"));
+
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
+                                 const TargetInstrInfo &tii)
+  : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
+                         ? X86::RIP : X86::EIP),
+                       X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
+                       X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true),
+                       (tm.getSubtarget<X86Subtarget>().is64Bit()
+                         ? X86::RIP : X86::EIP)),
+                       TM(tm), TII(tii) {
+  X86_MC::InitLLVM2SEHRegisterMapping(this);
+
+  // Cache some information.
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  Is64Bit = Subtarget->is64Bit();
+  IsWin64 = Subtarget->isTargetWin64();
+
+  if (Is64Bit) {
+    SlotSize = 8;
+    StackPtr = X86::RSP;
+    FramePtr = X86::RBP;
+  } else {
+    SlotSize = 4;
+    StackPtr = X86::ESP;
+    FramePtr = X86::EBP;
+  }
+  // Use a callee-saved register as the base pointer.  These registers must
+  // not conflict with any ABI requirements.  For example, in 32-bit mode PIC
+  // requires GOT in the EBX register before function calls via PLT GOT pointer.
+  BasePtr = Is64Bit ? X86::RBX : X86::ESI;
+}
+
+/// getCompactUnwindRegNum - This function maps the register to the number for
+/// compact unwind encoding. Return -1 if the register isn't valid.
+int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
+  switch (getLLVMRegNum(RegNum, isEH)) {
+  case X86::EBX: case X86::RBX: return 1;
+  case X86::ECX: case X86::R12: return 2;
+  case X86::EDX: case X86::R13: return 3;
+  case X86::EDI: case X86::R14: return 4;
+  case X86::ESI: case X86::R15: return 5;
+  case X86::EBP: case X86::RBP: return 6;
+  }
+
+  return -1;
+}
+
+bool
+X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  // Only enable when post-RA scheduling is enabled and this is needed.
+  return TM.getSubtargetImpl()->postRAScheduler();
+}
+
+int
+X86RegisterInfo::getSEHRegNum(unsigned i) const {
+  return getEncodingValue(i);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+                                       unsigned Idx) const {
+  // The sub_8bit sub-register index is more constrained in 32-bit mode.
+  // It behaves just like the sub_8bit_hi index.
+  if (!Is64Bit && Idx == X86::sub_8bit)
+    Idx = X86::sub_8bit_hi;
+
+  // Forward to TableGen's default version.
+  return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                          const TargetRegisterClass *B,
+                                          unsigned SubIdx) const {
+  // The sub_8bit sub-register index is more constrained in 32-bit mode.
+  if (!Is64Bit && SubIdx == X86::sub_8bit) {
+    A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
+    if (!A)
+      return 0;
+  }
+  return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
+}
+
+const TargetRegisterClass*
+X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
+  // Don't allow super-classes of GR8_NOREX.  This class is only used after
+  // extrating sub_8bit_hi sub-registers.  The H sub-registers cannot be copied
+  // to the full GR8 register class in 64-bit mode, so we cannot allow the
+  // reigster class inflation.
+  //
+  // The GR8_NOREX class is always used in a way that won't be constrained to a
+  // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
+  // full GR8 class.
+  if (RC == &X86::GR8_NOREXRegClass)
+    return RC;
+
+  const TargetRegisterClass *Super = RC;
+  TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
+  do {
+    switch (Super->getID()) {
+    case X86::GR8RegClassID:
+    case X86::GR16RegClassID:
+    case X86::GR32RegClassID:
+    case X86::GR64RegClassID:
+    case X86::FR32RegClassID:
+    case X86::FR64RegClassID:
+    case X86::RFP32RegClassID:
+    case X86::RFP64RegClassID:
+    case X86::RFP80RegClassID:
+    case X86::VR128RegClassID:
+    case X86::VR256RegClassID:
+      // Don't return a super-class that would shrink the spill size.
+      // That can happen with the vector and float classes.
+      if (Super->getSize() == RC->getSize())
+        return Super;
+    }
+    Super = *I++;
+  } while (Super);
+  return RC;
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
+                                                                         const {
+  const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+  switch (Kind) {
+  default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+  case 0: // Normal GPRs.
+    if (Subtarget.isTarget64BitLP64())
+      return &X86::GR64RegClass;
+    return &X86::GR32RegClass;
+  case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+    if (Subtarget.isTarget64BitLP64())
+      return &X86::GR64_NOSPRegClass;
+    return &X86::GR32_NOSPRegClass;
+  case 2: // Available for tailcall (not callee-saved GPRs).
+    if (Subtarget.isTargetWin64())
+      return &X86::GR64_TCW64RegClass;
+    else if (Subtarget.is64Bit())
+      return &X86::GR64_TCRegClass;
+
+    const Function *F = MF.getFunction();
+    bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false);
+    if (hasHipeCC)
+      return &X86::GR32RegClass;
+    return &X86::GR32_TCRegClass;
+  }
+}
+
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &X86::CCRRegClass) {
+    if (Is64Bit)
+      return &X86::GR64RegClass;
+    else
+      return &X86::GR32RegClass;
+  }
+  return RC;
+}
+
+unsigned
+X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                     MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case X86::GR32RegClassID:
+    return 4 - FPDiff;
+  case X86::GR64RegClassID:
+    return 12 - FPDiff;
+  case X86::VR128RegClassID:
+    return TM.getSubtarget<X86Subtarget>().is64Bit() ? 10 : 4;
+  case X86::VR64RegClassID:
+    return 4;
+  }
+}
+
+const uint16_t *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  switch (MF->getFunction()->getCallingConv()) {
+  case CallingConv::GHC:
+  case CallingConv::HiPE:
+    return CSR_NoRegs_SaveList;
+
+  case CallingConv::Intel_OCL_BI: {
+    bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+    if (HasAVX && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+    if (HasAVX && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX_SaveList;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_SaveList;
+    break;
+  }
+
+  case CallingConv::Cold:
+    if (Is64Bit)
+      return CSR_MostRegs_64_SaveList;
+    break;
+
+  default:
+    break;
+  }
+
+  bool CallsEHReturn = MF->getMMI().callsEHReturn();
+  if (Is64Bit) {
+    if (IsWin64)
+      return CSR_Win64_SaveList;
+    if (CallsEHReturn)
+      return CSR_64EHRet_SaveList;
+    return CSR_64_SaveList;
+  }
+  if (CallsEHReturn)
+    return CSR_32EHRet_SaveList;
+  return CSR_32_SaveList;
+}
+
+const uint32_t*
+X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+
+  if (CC == CallingConv::Intel_OCL_BI) {
+    if (IsWin64 && HasAVX)
+      return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+    if (Is64Bit && HasAVX)
+      return CSR_64_Intel_OCL_BI_AVX_RegMask;
+    if (!HasAVX && !IsWin64 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_RegMask;
+  }
+  if (CC == CallingConv::GHC || CC == CallingConv::HiPE)
+    return CSR_NoRegs_RegMask;
+  if (!Is64Bit)
+    return CSR_32_RegMask;
+  if (CC == CallingConv::Cold)
+    return CSR_MostRegs_64_RegMask;
+  if (IsWin64)
+    return CSR_Win64_RegMask;
+  return CSR_64_RegMask;
+}
+
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // Set the stack-pointer register and its aliases as reserved.
+  Reserved.set(X86::RSP);
+  for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I)
+    Reserved.set(*I);
+
+  // Set the instruction pointer register and its aliases as reserved.
+  Reserved.set(X86::RIP);
+  for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I)
+    Reserved.set(*I);
+
+  // Set the frame-pointer register and its aliases as reserved if needed.
+  if (TFI->hasFP(MF)) {
+    Reserved.set(X86::RBP);
+    for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I)
+      Reserved.set(*I);
+  }
+
+  // Set the base-pointer register and its aliases as reserved if needed.
+  if (hasBasePointer(MF)) {
+    CallingConv::ID CC = MF.getFunction()->getCallingConv();
+    const uint32_t* RegMask = getCallPreservedMask(CC);
+    if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
+      report_fatal_error(
+        "Stack realignment in presence of dynamic allocas is not supported with"
+        "this calling convention.");
+
+    Reserved.set(getBaseRegister());
+    for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I)
+      Reserved.set(*I);
+  }
+
+  // Mark the segment registers as reserved.
+  Reserved.set(X86::CS);
+  Reserved.set(X86::SS);
+  Reserved.set(X86::DS);
+  Reserved.set(X86::ES);
+  Reserved.set(X86::FS);
+  Reserved.set(X86::GS);
+
+  // Mark the floating point stack registers as reserved.
+  Reserved.set(X86::ST0);
+  Reserved.set(X86::ST1);
+  Reserved.set(X86::ST2);
+  Reserved.set(X86::ST3);
+  Reserved.set(X86::ST4);
+  Reserved.set(X86::ST5);
+  Reserved.set(X86::ST6);
+  Reserved.set(X86::ST7);
+
+  // Reserve the registers that only exist in 64-bit mode.
+  if (!Is64Bit) {
+    // These 8-bit registers are part of the x86-64 extension even though their
+    // super-registers are old 32-bits.
+    Reserved.set(X86::SIL);
+    Reserved.set(X86::DIL);
+    Reserved.set(X86::BPL);
+    Reserved.set(X86::SPL);
+
+    for (unsigned n = 0; n != 8; ++n) {
+      // R8, R9, ...
+      static const uint16_t GPR64[] = {
+        X86::R8,  X86::R9,  X86::R10, X86::R11,
+        X86::R12, X86::R13, X86::R14, X86::R15
+      };
+      for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+
+      // XMM8, XMM9, ...
+      assert(X86::XMM15 == X86::XMM8+7);
+      for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+   const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+   if (!EnableBasePointer)
+     return false;
+
+   // When we need stack realignment and there are dynamic allocas, we can't
+   // reference off of the stack pointer, so we reserve a base pointer.
+   //
+   // This is also true if the function contain MS-style inline assembly.  We
+   // do this because if any stack changes occur in the inline assembly, e.g.,
+   // "pusha", then any C local variable or C argument references in the
+   // inline assembly will be wrong because the SP is not properly tracked.
+   if ((needsStackRealignment(MF) && MFI->hasVarSizedObjects()) ||
+       MF.hasMSInlineAsm())
+     return true;
+
+   return false;
+}
+
+bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  if (!MF.getTarget().Options.RealignStack)
+    return false;
+
+  // Stack realignment requires a frame pointer.  If we already started
+  // register allocation with frame pointer elimination, it is too late now.
+  if (!MRI->canReserveReg(FramePtr))
+    return false;
+
+  // If a base pointer is necessary.  Check that it isn't too late to reserve
+  // it.
+  if (MFI->hasVarSizedObjects())
+    return MRI->canReserveReg(BasePtr);
+  return true;
+}
+
+bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *F = MF.getFunction();
+  unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+  bool requiresRealignment =
+    ((MFI->getMaxAlignment() > StackAlign) ||
+     F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackAlignment));
+
+  // If we've requested that we force align the stack do so now.
+  if (ForceStackAlign)
+    return canRealignStack(MF);
+
+  return requiresRealignment && canRealignStack(MF);
+}
+
+bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+                                           unsigned Reg, int &FrameIdx) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  if (Reg == FramePtr && TFI->hasFP(MF)) {
+    FrameIdx = MF.getFrameInfo()->getObjectIndexBegin();
+    return true;
+  }
+  return false;
+}
+
+void
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                     int SPAdj, unsigned FIOperandNum,
+                                     RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned BasePtr;
+
+  unsigned Opc = MI.getOpcode();
+  bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm;
+  if (hasBasePointer(MF))
+    BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
+  else if (needsStackRealignment(MF))
+    BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+  else if (AfterFPPop)
+    BasePtr = StackPtr;
+  else
+    BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
+
+  // This must be part of a four operand memory reference.  Replace the
+  // FrameIndex with base register with EBP.  Add an offset to the offset.
+  MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
+
+  // Now add the frame object offset to the offset from EBP.
+  int FIOffset;
+  if (AfterFPPop) {
+    // Tail call jmp happens after FP is popped.
+    const MachineFrameInfo *MFI = MF.getFrameInfo();
+    FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
+  } else
+    FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
+
+  if (MI.getOperand(FIOperandNum+3).isImm()) {
+    // Offset is a 32-bit integer.
+    int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
+    int Offset = FIOffset + Imm;
+    assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
+           "Requesting 64-bit offset in 32-bit immediate!");
+    MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
+  } else {
+    // Offset is symbolic. This is extremely rare.
+    uint64_t Offset = FIOffset +
+      (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
+    MI.getOperand(FIOperandNum + 3).setOffset(Offset);
+  }
+}
+
+unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  return TFI->hasFP(MF) ? FramePtr : StackPtr;
+}
+
+unsigned X86RegisterInfo::getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+}
+
+unsigned X86RegisterInfo::getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+}
+
+namespace llvm {
+unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
+                                bool High) {
+  switch (VT) {
+  default: llvm_unreachable("Unexpected VT");
+  case MVT::i8:
+    if (High) {
+      switch (Reg) {
+      default: return getX86SubSuperRegister(Reg, MVT::i64);
+      case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+        return X86::SI;
+      case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+        return X86::DI;
+      case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+        return X86::BP;
+      case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+        return X86::SP;
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AH;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DH;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CH;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BH;
+      }
+    } else {
+      switch (Reg) {
+      default: llvm_unreachable("Unexpected register");
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AL;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DL;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CL;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BL;
+      case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+        return X86::SIL;
+      case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+        return X86::DIL;
+      case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+        return X86::BPL;
+      case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+        return X86::SPL;
+      case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+        return X86::R8B;
+      case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+        return X86::R9B;
+      case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+        return X86::R10B;
+      case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+        return X86::R11B;
+      case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+        return X86::R12B;
+      case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+        return X86::R13B;
+      case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+        return X86::R14B;
+      case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+        return X86::R15B;
+      }
+    }
+  case MVT::i16:
+    switch (Reg) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::AX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::DX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::CX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::BX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::SI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::DI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::BP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::SP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8W;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9W;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10W;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11W;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12W;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13W;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14W;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15W;
+    }
+  case MVT::i32:
+    switch (Reg) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::EAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::EDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::ECX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::EBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::ESI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::EDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::EBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::ESP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8D;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9D;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10D;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11D;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12D;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13D;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14D;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15D;
+    }
+  case MVT::i64:
+    switch (Reg) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::RAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::RDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::RCX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::RBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::RSI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::RDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::RBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::RSP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15;
+    }
+  }
+}
+}
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 000000000000..b9d7b8cf8b9a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,143 @@
+//===-- X86RegisterInfo.h - X86 Register Information Impl -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "X86GenRegisterInfo.inc"
+
+namespace llvm {
+  class Type;
+  class TargetInstrInfo;
+  class X86TargetMachine;
+
+class X86RegisterInfo : public X86GenRegisterInfo {
+public:
+  X86TargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+private:
+  /// Is64Bit - Is the target 64-bits.
+  ///
+  bool Is64Bit;
+
+  /// IsWin64 - Is the target on of win64 flavours
+  ///
+  bool IsWin64;
+
+  /// SlotSize - Stack slot size in bytes.
+  ///
+  unsigned SlotSize;
+
+  /// StackPtr - X86 physical register used as stack ptr.
+  ///
+  unsigned StackPtr;
+
+  /// FramePtr - X86 physical register used as frame ptr.
+  ///
+  unsigned FramePtr;
+
+  /// BasePtr - X86 physical register used as a base ptr in complex stack
+  /// frames. I.e., when we need a 3rd base, not just SP and FP, due to
+  /// variable size stack objects.
+  unsigned BasePtr;
+
+public:
+  X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+
+  // FIXME: This should be tablegen'd like getDwarfRegNum is
+  int getSEHRegNum(unsigned i) const;
+
+  /// getCompactUnwindRegNum - This function maps the register to the number for
+  /// compact unwind encoding. Return -1 if the register isn't valid.
+  int getCompactUnwindRegNum(unsigned RegNum, bool isEH) const;
+
+  /// Code Generation virtual methods...
+  ///
+  virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+
+  /// getMatchingSuperRegClass - Return a subclass of the specified register
+  /// class A so that each register in it has a sub-register of the
+  /// specified sub-register index which is in the specified register class B.
+  virtual const TargetRegisterClass *
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B, unsigned Idx) const;
+
+  virtual const TargetRegisterClass *
+  getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const;
+
+  const TargetRegisterClass*
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+
+  /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+  /// values.
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
+
+  /// getCrossCopyRegClass - Returns a legal register class to copy a register
+  /// in the specified class to or from. Returns NULL if it is possible to copy
+  /// between a two registers of the specified class.
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const;
+
+  /// getCalleeSavedRegs - Return a null-terminated list of all of the
+  /// callee-save registers on this target.
+  const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+  const uint32_t *getNoPreservedMask() const;
+
+  /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses and
+  /// should be considered unavailable at all times, e.g. SP, RA. This is used by
+  /// register scavenger to determine what registers are free.
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasBasePointer(const MachineFunction &MF) const;
+
+  bool canRealignStack(const MachineFunction &MF) const;
+
+  bool needsStackRealignment(const MachineFunction &MF) const;
+
+  bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+                            int &FrameIdx) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getStackRegister() const { return StackPtr; }
+  unsigned getBaseRegister() const { return BasePtr; }
+  // FIXME: Move to FrameInfok
+  unsigned getSlotSize() const { return SlotSize; }
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub or super
+// register of a specific X86 register.
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
+unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 000000000000..be6282a643bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,423 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> {
+  let Namespace = "X86";
+  let HWEncoding = Enc;
+  let SubRegs = subregs;
+}
+
+// Subregister indices.
+let Namespace = "X86" in {
+  def sub_8bit    : SubRegIndex;
+  def sub_8bit_hi : SubRegIndex;
+  def sub_16bit   : SubRegIndex;
+  def sub_32bit   : SubRegIndex;
+  def sub_xmm     : SubRegIndex;
+}
+
+//===----------------------------------------------------------------------===//
+//  Register definitions...
+//
+
+// In the register alias definitions below, we define which registers alias
+// which others.  We only specify which registers the small registers alias,
+// because the register file generator is smart enough to figure out that
+// AL aliases AX if we tell it that AX aliased AL (for example).
+
+// Dwarf numbering is different for 32-bit and 64-bit, and there are
+// variations by target as well. Currently the first entry is for X86-64,
+// second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux
+// and debug information on X86-32/Darwin)
+
+// 8-bit registers
+// Low registers
+def AL : X86Reg<"al", 0>;
+def DL : X86Reg<"dl", 2>;
+def CL : X86Reg<"cl", 1>;
+def BL : X86Reg<"bl", 3>;
+
+// High registers. On x86-64, these cannot be used in any instruction
+// with a REX prefix.
+def AH : X86Reg<"ah", 4>;
+def DH : X86Reg<"dh", 6>;
+def CH : X86Reg<"ch", 5>;
+def BH : X86Reg<"bh", 7>;
+
+// X86-64 only, requires REX.
+let CostPerUse = 1 in {
+def SIL  : X86Reg<"sil",   6>;
+def DIL  : X86Reg<"dil",   7>;
+def BPL  : X86Reg<"bpl",   5>;
+def SPL  : X86Reg<"spl",   4>;
+def R8B  : X86Reg<"r8b",   8>;
+def R9B  : X86Reg<"r9b",   9>;
+def R10B : X86Reg<"r10b", 10>;
+def R11B : X86Reg<"r11b", 11>;
+def R12B : X86Reg<"r12b", 12>;
+def R13B : X86Reg<"r13b", 13>;
+def R14B : X86Reg<"r14b", 14>;
+def R15B : X86Reg<"r15b", 15>;
+}
+
+// 16-bit registers
+let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
+def AX : X86Reg<"ax", 0, [AL,AH]>;
+def DX : X86Reg<"dx", 2, [DL,DH]>;
+def CX : X86Reg<"cx", 1, [CL,CH]>;
+def BX : X86Reg<"bx", 3, [BL,BH]>;
+}
+let SubRegIndices = [sub_8bit] in {
+def SI : X86Reg<"si", 6, [SIL]>;
+def DI : X86Reg<"di", 7, [DIL]>;
+def BP : X86Reg<"bp", 5, [BPL]>;
+def SP : X86Reg<"sp", 4, [SPL]>;
+}
+def IP : X86Reg<"ip", 0>;
+
+// X86-64 only, requires REX.
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
+def R8W  : X86Reg<"r8w",   8, [R8B]>;
+def R9W  : X86Reg<"r9w",   9, [R9B]>;
+def R10W : X86Reg<"r10w", 10, [R10B]>;
+def R11W : X86Reg<"r11w", 11, [R11B]>;
+def R12W : X86Reg<"r12w", 12, [R12B]>;
+def R13W : X86Reg<"r13w", 13, [R13B]>;
+def R14W : X86Reg<"r14w", 14, [R14B]>;
+def R15W : X86Reg<"r15w", 15, [R15B]>;
+}
+
+// 32-bit registers
+let SubRegIndices = [sub_16bit] in {
+def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+
+// X86-64 only, requires REX
+let CostPerUse = 1 in {
+def R8D  : X86Reg<"r8d",   8, [R8W]>;
+def R9D  : X86Reg<"r9d",   9, [R9W]>;
+def R10D : X86Reg<"r10d", 10, [R10W]>;
+def R11D : X86Reg<"r11d", 11, [R11W]>;
+def R12D : X86Reg<"r12d", 12, [R12W]>;
+def R13D : X86Reg<"r13d", 13, [R13W]>;
+def R14D : X86Reg<"r14d", 14, [R14W]>;
+def R15D : X86Reg<"r15d", 15, [R15W]>;
+}}
+
+// 64-bit registers, X86-64 only
+let SubRegIndices = [sub_32bit] in {
+def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>;
+def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>;
+def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>;
+def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>;
+def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>;
+def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>;
+def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
+def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+// These also require REX.
+let CostPerUse = 1 in {
+def R8  : X86Reg<"r8",   8, [R8D]>,  DwarfRegNum<[ 8, -2, -2]>;
+def R9  : X86Reg<"r9",   9, [R9D]>,  DwarfRegNum<[ 9, -2, -2]>;
+def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
+def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>;
+def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>;
+def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
+def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
+def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
+def RIP : X86Reg<"rip",  0, [EIP]>,  DwarfRegNum<[16, -2, -2]>;
+}}
+
+// MMX Registers. These are actually aliased to ST0 .. ST7
+def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
+def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>;
+def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>;
+def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>;
+def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>;
+def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>;
+def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>;
+def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>;
+
+// Pseudo Floating Point registers
+def FP0 : X86Reg<"fp0", 0>;
+def FP1 : X86Reg<"fp1", 0>;
+def FP2 : X86Reg<"fp2", 0>;
+def FP3 : X86Reg<"fp3", 0>;
+def FP4 : X86Reg<"fp4", 0>;
+def FP5 : X86Reg<"fp5", 0>;
+def FP6 : X86Reg<"fp6", 0>;
+
+// XMM Registers, used by the various SSE instruction set extensions.
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
+def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>;
+def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>;
+def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>;
+def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>;
+def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>;
+def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
+def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
+
+// X86-64 only
+let CostPerUse = 1 in {
+def XMM8:  X86Reg<"xmm8",   8>, DwarfRegNum<[25, -2, -2]>;
+def XMM9:  X86Reg<"xmm9",   9>, DwarfRegNum<[26, -2, -2]>;
+def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
+def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>;
+def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
+def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
+def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
+def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+} // CostPerUse
+
+// YMM Registers, used by AVX instructions
+let SubRegIndices = [sub_xmm] in {
+def YMM0:  X86Reg<"ymm0",   0, [XMM0]>,  DwarfRegAlias<XMM0>;
+def YMM1:  X86Reg<"ymm1",   1, [XMM1]>,  DwarfRegAlias<XMM1>;
+def YMM2:  X86Reg<"ymm2",   2, [XMM2]>,  DwarfRegAlias<XMM2>;
+def YMM3:  X86Reg<"ymm3",   3, [XMM3]>,  DwarfRegAlias<XMM3>;
+def YMM4:  X86Reg<"ymm4",   4, [XMM4]>,  DwarfRegAlias<XMM4>;
+def YMM5:  X86Reg<"ymm5",   5, [XMM5]>,  DwarfRegAlias<XMM5>;
+def YMM6:  X86Reg<"ymm6",   6, [XMM6]>,  DwarfRegAlias<XMM6>;
+def YMM7:  X86Reg<"ymm7",   7, [XMM7]>,  DwarfRegAlias<XMM7>;
+def YMM8:  X86Reg<"ymm8",   8, [XMM8]>,  DwarfRegAlias<XMM8>;
+def YMM9:  X86Reg<"ymm9",   9, [XMM9]>,  DwarfRegAlias<XMM9>;
+def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>;
+def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>;
+def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>;
+def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>;
+def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>;
+def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>;
+}
+
+class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
+  let Aliases = A;
+}
+
+// Floating point stack registers. These don't map one-to-one to the FP
+// pseudo registers, but we still mark them as aliasing FP registers. That
+// way both kinds can be live without exceeding the stack depth. ST registers
+// are only live around inline assembly.
+def ST0 : STRegister<"st(0)", 0, []>,    DwarfRegNum<[33, 12, 11]>;
+def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>;
+
+// Floating-point status word
+def FPSW : X86Reg<"fpsw", 0>;
+
+// Status flags register
+def EFLAGS : X86Reg<"flags", 0>;
+
+// Segment registers
+def CS : X86Reg<"cs", 1>;
+def DS : X86Reg<"ds", 3>;
+def SS : X86Reg<"ss", 2>;
+def ES : X86Reg<"es", 0>;
+def FS : X86Reg<"fs", 4>;
+def GS : X86Reg<"gs", 5>;
+
+// Debug registers
+def DR0 : X86Reg<"dr0", 0>;
+def DR1 : X86Reg<"dr1", 1>;
+def DR2 : X86Reg<"dr2", 2>;
+def DR3 : X86Reg<"dr3", 3>;
+def DR4 : X86Reg<"dr4", 4>;
+def DR5 : X86Reg<"dr5", 5>;
+def DR6 : X86Reg<"dr6", 6>;
+def DR7 : X86Reg<"dr7", 7>;
+
+// Control registers
+def CR0  : X86Reg<"cr0",   0>;
+def CR1  : X86Reg<"cr1",   1>;
+def CR2  : X86Reg<"cr2",   2>;
+def CR3  : X86Reg<"cr3",   3>;
+def CR4  : X86Reg<"cr4",   4>;
+def CR5  : X86Reg<"cr5",   5>;
+def CR6  : X86Reg<"cr6",   6>;
+def CR7  : X86Reg<"cr7",   7>;
+def CR8  : X86Reg<"cr8",   8>;
+def CR9  : X86Reg<"cr9",   9>;
+def CR10 : X86Reg<"cr10", 10>;
+def CR11 : X86Reg<"cr11", 11>;
+def CR12 : X86Reg<"cr12", 12>;
+def CR13 : X86Reg<"cr13", 13>;
+def CR14 : X86Reg<"cr14", 14>;
+def CR15 : X86Reg<"cr15", 15>;
+
+// Pseudo index registers
+def EIZ : X86Reg<"eiz", 4>;
+def RIZ : X86Reg<"riz", 4>;
+
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes.  The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and
+// R8B, ... R15B.
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8],  8,
+                        (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+                             R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> {
+  let AltOrders = [(sub GR8, AH, BH, CH, DH)];
+  let AltOrderSelect = [{
+    return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit();
+  }];
+}
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+                         (add AX, CX, DX, SI, DI, BX, BP, SP,
+                              R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>;
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+                         (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+                              R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>;
+
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
+def GR64 : RegisterClass<"X86", [i64], 64,
+                         (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+                              RBX, R14, R15, R12, R13, RBP, RSP, RIP)>;
+
+// Segment registers for use by MOV instructions (and others) that have a
+//   segment register as one operand.  Always contain a 16-bit segment
+//   descriptor.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>;
+
+// Debug registers.
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>;
+
+// Control registers.
+def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>;
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
+def GR32_TC   : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
+def GR64_TC   : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
+                                                     R8, R9, R11, RIP)>;
+def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
+                                                      R8, R9, R11)>;
+
+// GR8_NOREX - GR8 registers which do not require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+                              (add AL, CL, DL, AH, CH, DH, BL, BH)> {
+  let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)];
+  let AltOrderSelect = [{
+    return MF.getTarget().getSubtarget<X86Subtarget>().is64Bit();
+  }];
+}
+// GR16_NOREX - GR16 registers which do not require a REX prefix.
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+                               (add AX, CX, DX, SI, DI, BX, BP, SP)>;
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+                               (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>;
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+                            (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>;
+
+// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
+// mode to prevent encoding using the 0x90 NOP encoding. xchg %eax, %eax needs
+// to clear upper 32-bits of RAX so is not a NOP.
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)>;
+
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>;
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>;
+
+// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except
+// ESP.
+def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32,
+                                    (and GR32_NOREX, GR32_NOSP)>;
+
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+                                    (and GR64_NOREX, GR64_NOSP)>;
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>;
+
+def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values.  This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware.  In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>;
+def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>;
+def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> {
+  let isAllocatable = 0;
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                          128, (add FR32)>;
+def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                          256, (sequence "YMM%u", 0, 15)>;
+
+// Status flags registers.
+def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+  let isAllocatable = 0;
+}
+def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+  let isAllocatable = 0;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86Relocations.h b/contrib/llvm/lib/Target/X86/X86Relocations.h
new file mode 100644
index 000000000000..03330566161d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,52 @@
+//===-- X86Relocations.h - X86 Code Relocations -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace X86 {
+    /// RelocationType - An enum for the x86 relocation codes. Note that
+    /// the terminology here doesn't follow x86 convention - word means
+    /// 32-bit and dword means 64-bit. The relocations will be treated
+    /// by JIT or ObjectCode emitters, this is transparent to the x86 code
+    /// emitter but JIT and ObjectCode will treat them differently
+    enum RelocationType {
+      /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+      /// the value already in memory, after we adjust it for where the PC is.
+      reloc_pcrel_word = 0,
+
+      /// reloc_picrel_word - PIC base relative relocation, add the relocated
+      /// value to the value already in memory, after we adjust it for where the
+      /// PIC base is.
+      reloc_picrel_word = 1,
+
+      /// reloc_absolute_word - absolute relocation, just add the relocated
+      /// value to the value already in memory.
+      reloc_absolute_word = 2,
+
+      /// reloc_absolute_word_sext - absolute relocation, just add the relocated
+      /// value to the value already in memory. In object files, it represents a
+      /// value which must be sign-extended when resolving the relocation.
+      reloc_absolute_word_sext = 3,
+
+      /// reloc_absolute_dword - absolute relocation, just add the relocated
+      /// value to the value already in memory.
+      reloc_absolute_dword = 4
+    };
+  }
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
new file mode 100644
index 000000000000..7de6791f2e48
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -0,0 +1,126 @@
+//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Haswell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def HaswellModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+  // instructions per cycle.
+  let IssueWidth = 4;
+  let MinLatency = 0; // 0 = Out-of-order execution.
+  let LoadLatency = 4;
+  let ILPWindow = 40;
+  let MispredictPenalty = 16;
+}
+
+let SchedModel = HaswellModel in {
+
+// Haswell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, 6 and 7 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def HWPort0 : ProcResource<1>;
+def HWPort1 : ProcResource<1>;
+def HWPort2 : ProcResource<1>;
+def HWPort3 : ProcResource<1>;
+def HWPort4 : ProcResource<1>;
+def HWPort5 : ProcResource<1>;
+def HWPort6 : ProcResource<1>;
+def HWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def HWPort23  : ProcResGroup<[HWPort2, HWPort3]>;
+def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>;
+def HWPort05  : ProcResGroup<[HWPort0, HWPort5]>;
+def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>;
+def HWPort15  : ProcResGroup<[HWPort1, HWPort5]>;
+def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
+def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
+
+// Integer division issued on port 0.
+def HWDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
+     let Latency = !add(Lat, 4);
+  }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [HWPort4]>;
+
+def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteLoad,  [HWPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove,  [HWPort0156]>;
+def : WriteRes<WriteZero,  []>;
+
+defm : HWWriteResPair<WriteALU,   HWPort0156, 1>;
+defm : HWWriteResPair<WriteIMul,  HWPort1,   3>;
+defm : HWWriteResPair<WriteShift, HWPort056,  1>;
+defm : HWWriteResPair<WriteJump,  HWPort5,   1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [HWPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [HWPort0, HWDivider]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
+  let Latency = 29;
+  let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : HWWriteResPair<WriteFAdd,   HWPort1, 3>;
+defm : HWWriteResPair<WriteFMul,   HWPort0, 5>;
+defm : HWWriteResPair<WriteFDiv,   HWPort0, 12>; // 10-14 cycles.
+defm : HWWriteResPair<WriteFRcp,   HWPort0, 5>;
+defm : HWWriteResPair<WriteFSqrt,  HWPort0, 15>;
+defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
+defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
+defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
+
+// Vector integer operations.
+defm : HWWriteResPair<WriteVecShift, HWPort05,  1>;
+defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
+defm : HWWriteResPair<WriteVecALU,   HWPort15,  1>;
+defm : HWWriteResPair<WriteVecIMul,  HWPort0,   5>;
+defm : HWWriteResPair<WriteShuffle,  HWPort15,  1>;
+
+def : WriteRes<WriteSystem,     [HWPort0156]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; }
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
new file mode 100644
index 000000000000..74d5f1b6eba8
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -0,0 +1,122 @@
+//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Sandy Bridge to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SandyBridgeModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and SB can decode 4
+  // instructions per cycle.
+  // FIXME: Identify instructions that aren't a single fused micro-op.
+  let IssueWidth = 4;
+  let MinLatency = 0; // 0 = Out-of-order execution.
+  let LoadLatency = 4;
+  let ILPWindow = 30;
+  let MispredictPenalty = 16;
+}
+
+let SchedModel = SandyBridgeModel in {
+
+// Sandy Bridge can issue micro-ops to 6 different ports in one cycle.
+
+// Ports 0, 1, and 5 handle all computation.
+def SBPort0 : ProcResource<1>;
+def SBPort1 : ProcResource<1>;
+def SBPort5 : ProcResource<1>;
+
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores.
+def SBPort23 : ProcResource<2>;
+
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+def SBPort4 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SBPort05  : ProcResGroup<[SBPort0, SBPort5]>;
+def SBPort15  : ProcResGroup<[SBPort1, SBPort5]>;
+def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
+
+// Integer division issued on port 0.
+def SBDivider : ProcResource<1>;
+
+// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
+     let Latency = !add(Lat, 4);
+  }
+}
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SBPort4]>;
+
+def : WriteRes<WriteStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteLoad,  [SBPort23]> { let Latency = 4; }
+def : WriteRes<WriteMove,  [SBPort015]>;
+def : WriteRes<WriteZero,  []>;
+
+defm : SBWriteResPair<WriteALU,   SBPort015, 1>;
+defm : SBWriteResPair<WriteIMul,  SBPort1,   3>;
+defm : SBWriteResPair<WriteShift, SBPort05,  1>;
+defm : SBWriteResPair<WriteJump,  SBPort5,   1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [SBPort15]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
+  let Latency = 29;
+  let ResourceCycles = [1, 1, 10];
+}
+
+// Scalar and vector floating point.
+defm : SBWriteResPair<WriteFAdd,   SBPort1, 3>;
+defm : SBWriteResPair<WriteFMul,   SBPort0, 5>;
+defm : SBWriteResPair<WriteFDiv,   SBPort0, 12>; // 10-14 cycles.
+defm : SBWriteResPair<WriteFRcp,   SBPort0, 5>;
+defm : SBWriteResPair<WriteFSqrt,  SBPort0, 15>;
+defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
+defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
+defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
+
+// Vector integer operations.
+defm : SBWriteResPair<WriteVecShift, SBPort05,  1>;
+defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>;
+defm : SBWriteResPair<WriteVecALU,   SBPort15,  1>;
+defm : SBWriteResPair<WriteVecIMul,  SBPort0,   5>;
+defm : SBWriteResPair<WriteShuffle,  SBPort15,  1>;
+
+def : WriteRes<WriteSystem,     [SBPort015]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
+} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td
new file mode 100644
index 000000000000..9fbde88b7100
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Schedule.td
@@ -0,0 +1,573 @@
+//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// InstrSchedModel annotations for out-of-order CPUs.
+//
+// These annotations are independent of the itinerary classes defined below.
+
+// Instructions with folded loads need to read the memory operand immediately,
+// but other register operands don't have to be read until the load is ready.
+// These operands are marked with ReadAfterLd.
+def ReadAfterLd : SchedRead;
+
+// Instructions with both a load and a store folded are modeled as a folded
+// load + WriteRMW.
+def WriteRMW : SchedWrite;
+
+// Most instructions can fold loads, so almost every SchedWrite comes in two
+// variants: With and without a folded load.
+// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite
+// with a folded load.
+class X86FoldableSchedWrite : SchedWrite {
+  // The SchedWrite to use when a load is folded into the instruction.
+  SchedWrite Folded;
+}
+
+// Multiclass that produces a linked pair of SchedWrites.
+multiclass X86SchedWritePair {
+  // Register-Memory operation.
+  def Ld : SchedWrite;
+  // Register-Register operation.
+  def NAME : X86FoldableSchedWrite {
+    let Folded = !cast<SchedWrite>(NAME#"Ld");
+  }
+}
+
+// Arithmetic.
+defm WriteALU  : X86SchedWritePair; // Simple integer ALU op.
+defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+defm WriteIDiv : X86SchedWritePair; // Integer division.
+def  WriteLEA  : SchedWrite;        // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm WriteShift : X86SchedWritePair;
+
+// Loads, stores, and moves, not folded with other operations.
+def WriteLoad  : SchedWrite;
+def WriteStore : SchedWrite;
+def WriteMove  : SchedWrite;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def WriteZero : SchedWrite;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm WriteJump : X86SchedWritePair;
+
+// Floating point. This covers both scalar and vector operations.
+defm WriteFAdd  : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul  : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv  : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp  : X86SchedWritePair; // Floating point reciprocal.
+defm WriteFMA   : X86SchedWritePair; // Fused Multiply Add.
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm WriteVecALU   : X86SchedWritePair; // Vector integer ALU op, no logicals.
+defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
+defm WriteVecIMul  : X86SchedWritePair; // Vector integer multiply.
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor.
+defm WriteShuffle  : X86SchedWritePair; // Vector shuffles and blends.
+
+// Conversion between integer and float.
+defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer.
+defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float.
+defm WriteCvtF2F : X86SchedWritePair; // Float -> Float size conversion.
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// Old microcoded instructions that nobody use.
+def WriteMicrocoded : SchedWrite;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for X86
+def IIC_ALU_MEM     : InstrItinClass;
+def IIC_ALU_NONMEM  : InstrItinClass;
+def IIC_LEA         : InstrItinClass;
+def IIC_LEA_16      : InstrItinClass;
+def IIC_MUL8        : InstrItinClass;
+def IIC_MUL16_MEM   : InstrItinClass;
+def IIC_MUL16_REG   : InstrItinClass;
+def IIC_MUL32_MEM   : InstrItinClass;
+def IIC_MUL32_REG   : InstrItinClass;
+def IIC_MUL64       : InstrItinClass;
+// imul by al, ax, eax, tax
+def IIC_IMUL8       : InstrItinClass;
+def IIC_IMUL16_MEM  : InstrItinClass;
+def IIC_IMUL16_REG  : InstrItinClass;
+def IIC_IMUL32_MEM  : InstrItinClass;
+def IIC_IMUL32_REG  : InstrItinClass;
+def IIC_IMUL64      : InstrItinClass;
+// imul reg by reg|mem
+def IIC_IMUL16_RM   : InstrItinClass;
+def IIC_IMUL16_RR   : InstrItinClass;
+def IIC_IMUL32_RM   : InstrItinClass;
+def IIC_IMUL32_RR   : InstrItinClass;
+def IIC_IMUL64_RM   : InstrItinClass;
+def IIC_IMUL64_RR   : InstrItinClass;
+// imul reg = reg/mem * imm
+def IIC_IMUL16_RMI  : InstrItinClass;
+def IIC_IMUL16_RRI  : InstrItinClass;
+def IIC_IMUL32_RMI  : InstrItinClass;
+def IIC_IMUL32_RRI  : InstrItinClass;
+def IIC_IMUL64_RMI  : InstrItinClass;
+def IIC_IMUL64_RRI  : InstrItinClass;
+// div
+def IIC_DIV8_MEM    : InstrItinClass;
+def IIC_DIV8_REG    : InstrItinClass;
+def IIC_DIV16       : InstrItinClass;
+def IIC_DIV32       : InstrItinClass;
+def IIC_DIV64       : InstrItinClass;
+// idiv
+def IIC_IDIV8       : InstrItinClass;
+def IIC_IDIV16      : InstrItinClass;
+def IIC_IDIV32      : InstrItinClass;
+def IIC_IDIV64      : InstrItinClass;
+// neg/not/inc/dec
+def IIC_UNARY_REG   : InstrItinClass;
+def IIC_UNARY_MEM   : InstrItinClass;
+// add/sub/and/or/xor/adc/sbc/cmp/test
+def IIC_BIN_MEM     : InstrItinClass;
+def IIC_BIN_NONMEM  : InstrItinClass;
+// shift/rotate
+def IIC_SR          : InstrItinClass;
+// shift double
+def IIC_SHD16_REG_IM : InstrItinClass;
+def IIC_SHD16_REG_CL : InstrItinClass;
+def IIC_SHD16_MEM_IM : InstrItinClass;
+def IIC_SHD16_MEM_CL : InstrItinClass;
+def IIC_SHD32_REG_IM : InstrItinClass;
+def IIC_SHD32_REG_CL : InstrItinClass;
+def IIC_SHD32_MEM_IM : InstrItinClass;
+def IIC_SHD32_MEM_CL : InstrItinClass;
+def IIC_SHD64_REG_IM : InstrItinClass;
+def IIC_SHD64_REG_CL : InstrItinClass;
+def IIC_SHD64_MEM_IM : InstrItinClass;
+def IIC_SHD64_MEM_CL : InstrItinClass;
+// cmov
+def IIC_CMOV16_RM : InstrItinClass;
+def IIC_CMOV16_RR : InstrItinClass;
+def IIC_CMOV32_RM : InstrItinClass;
+def IIC_CMOV32_RR : InstrItinClass;
+def IIC_CMOV64_RM : InstrItinClass;
+def IIC_CMOV64_RR : InstrItinClass;
+// set
+def IIC_SET_R : InstrItinClass;
+def IIC_SET_M : InstrItinClass;
+// jmp/jcc/jcxz
+def IIC_Jcc : InstrItinClass;
+def IIC_JCXZ : InstrItinClass;
+def IIC_JMP_REL : InstrItinClass;
+def IIC_JMP_REG : InstrItinClass;
+def IIC_JMP_MEM : InstrItinClass;
+def IIC_JMP_FAR_MEM : InstrItinClass;
+def IIC_JMP_FAR_PTR : InstrItinClass;
+// loop
+def IIC_LOOP : InstrItinClass;
+def IIC_LOOPE : InstrItinClass;
+def IIC_LOOPNE : InstrItinClass;
+// call
+def IIC_CALL_RI : InstrItinClass;
+def IIC_CALL_MEM : InstrItinClass;
+def IIC_CALL_FAR_MEM : InstrItinClass;
+def IIC_CALL_FAR_PTR : InstrItinClass;
+// ret
+def IIC_RET : InstrItinClass;
+def IIC_RET_IMM : InstrItinClass;
+//sign extension movs
+def IIC_MOVSX : InstrItinClass;
+def IIC_MOVSX_R16_R8 : InstrItinClass;
+def IIC_MOVSX_R16_M8 : InstrItinClass;
+def IIC_MOVSX_R16_R16 : InstrItinClass;
+def IIC_MOVSX_R32_R32 : InstrItinClass;
+//zero extension movs
+def IIC_MOVZX : InstrItinClass;
+def IIC_MOVZX_R16_R8 : InstrItinClass;
+def IIC_MOVZX_R16_M8 : InstrItinClass;
+
+def IIC_REP_MOVS : InstrItinClass;
+def IIC_REP_STOS : InstrItinClass;
+
+// SSE scalar/parallel binary operations
+def IIC_SSE_ALU_F32S_RR : InstrItinClass;
+def IIC_SSE_ALU_F32S_RM : InstrItinClass;
+def IIC_SSE_ALU_F64S_RR : InstrItinClass;
+def IIC_SSE_ALU_F64S_RM : InstrItinClass;
+def IIC_SSE_MUL_F32S_RR : InstrItinClass;
+def IIC_SSE_MUL_F32S_RM : InstrItinClass;
+def IIC_SSE_MUL_F64S_RR : InstrItinClass;
+def IIC_SSE_MUL_F64S_RM : InstrItinClass;
+def IIC_SSE_DIV_F32S_RR : InstrItinClass;
+def IIC_SSE_DIV_F32S_RM : InstrItinClass;
+def IIC_SSE_DIV_F64S_RR : InstrItinClass;
+def IIC_SSE_DIV_F64S_RM : InstrItinClass;
+def IIC_SSE_ALU_F32P_RR : InstrItinClass;
+def IIC_SSE_ALU_F32P_RM : InstrItinClass;
+def IIC_SSE_ALU_F64P_RR : InstrItinClass;
+def IIC_SSE_ALU_F64P_RM : InstrItinClass;
+def IIC_SSE_MUL_F32P_RR : InstrItinClass;
+def IIC_SSE_MUL_F32P_RM : InstrItinClass;
+def IIC_SSE_MUL_F64P_RR : InstrItinClass;
+def IIC_SSE_MUL_F64P_RM : InstrItinClass;
+def IIC_SSE_DIV_F32P_RR : InstrItinClass;
+def IIC_SSE_DIV_F32P_RM : InstrItinClass;
+def IIC_SSE_DIV_F64P_RR : InstrItinClass;
+def IIC_SSE_DIV_F64P_RM : InstrItinClass;
+
+def IIC_SSE_COMIS_RR : InstrItinClass;
+def IIC_SSE_COMIS_RM : InstrItinClass;
+
+def IIC_SSE_HADDSUB_RR : InstrItinClass;
+def IIC_SSE_HADDSUB_RM : InstrItinClass;
+
+def IIC_SSE_BIT_P_RR  : InstrItinClass;
+def IIC_SSE_BIT_P_RM  : InstrItinClass;
+
+def IIC_SSE_INTALU_P_RR  : InstrItinClass;
+def IIC_SSE_INTALU_P_RM  : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RR  : InstrItinClass;
+def IIC_SSE_INTALUQ_P_RM  : InstrItinClass;
+
+def IIC_SSE_INTMUL_P_RR : InstrItinClass;
+def IIC_SSE_INTMUL_P_RM : InstrItinClass;
+
+def IIC_SSE_INTSH_P_RR : InstrItinClass;
+def IIC_SSE_INTSH_P_RM : InstrItinClass;
+def IIC_SSE_INTSH_P_RI : InstrItinClass;
+
+def IIC_SSE_CMPP_RR : InstrItinClass;
+def IIC_SSE_CMPP_RM : InstrItinClass;
+
+def IIC_SSE_SHUFP : InstrItinClass;
+def IIC_SSE_PSHUF : InstrItinClass;
+
+def IIC_SSE_UNPCK : InstrItinClass;
+
+def IIC_SSE_MOVMSK : InstrItinClass;
+def IIC_SSE_MASKMOV : InstrItinClass;
+
+def IIC_SSE_PEXTRW : InstrItinClass;
+def IIC_SSE_PINSRW : InstrItinClass;
+
+def IIC_SSE_PABS_RR : InstrItinClass;
+def IIC_SSE_PABS_RM : InstrItinClass;
+
+def IIC_SSE_SQRTP_RR : InstrItinClass;
+def IIC_SSE_SQRTP_RM : InstrItinClass;
+def IIC_SSE_SQRTS_RR : InstrItinClass;
+def IIC_SSE_SQRTS_RM : InstrItinClass;
+
+def IIC_SSE_RCPP_RR : InstrItinClass;
+def IIC_SSE_RCPP_RM : InstrItinClass;
+def IIC_SSE_RCPS_RR : InstrItinClass;
+def IIC_SSE_RCPS_RM : InstrItinClass;
+
+def IIC_SSE_MOV_S_RR : InstrItinClass;
+def IIC_SSE_MOV_S_RM : InstrItinClass;
+def IIC_SSE_MOV_S_MR : InstrItinClass;
+
+def IIC_SSE_MOVA_P_RR : InstrItinClass;
+def IIC_SSE_MOVA_P_RM : InstrItinClass;
+def IIC_SSE_MOVA_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVU_P_RR : InstrItinClass;
+def IIC_SSE_MOVU_P_RM : InstrItinClass;
+def IIC_SSE_MOVU_P_MR : InstrItinClass;
+
+def IIC_SSE_MOVDQ : InstrItinClass;
+def IIC_SSE_MOVD_ToGP : InstrItinClass;
+def IIC_SSE_MOVQ_RR : InstrItinClass;
+
+def IIC_SSE_MOV_LH : InstrItinClass;
+
+def IIC_SSE_LDDQU : InstrItinClass;
+
+def IIC_SSE_MOVNT : InstrItinClass;
+
+def IIC_SSE_PHADDSUBD_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBD_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBSW_RM : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RR : InstrItinClass;
+def IIC_SSE_PHADDSUBW_RM : InstrItinClass;
+def IIC_SSE_PSHUFB_RR : InstrItinClass;
+def IIC_SSE_PSHUFB_RM : InstrItinClass;
+def IIC_SSE_PSIGN_RR : InstrItinClass;
+def IIC_SSE_PSIGN_RM : InstrItinClass;
+
+def IIC_SSE_PMADD : InstrItinClass;
+def IIC_SSE_PMULHRSW : InstrItinClass;
+def IIC_SSE_PALIGNR : InstrItinClass;
+def IIC_SSE_MWAIT : InstrItinClass;
+def IIC_SSE_MONITOR : InstrItinClass;
+
+def IIC_SSE_PREFETCH : InstrItinClass;
+def IIC_SSE_PAUSE : InstrItinClass;
+def IIC_SSE_LFENCE : InstrItinClass;
+def IIC_SSE_MFENCE : InstrItinClass;
+def IIC_SSE_SFENCE : InstrItinClass;
+def IIC_SSE_LDMXCSR : InstrItinClass;
+def IIC_SSE_STMXCSR : InstrItinClass;
+
+def IIC_SSE_CVT_PD_RR : InstrItinClass;
+def IIC_SSE_CVT_PD_RM : InstrItinClass;
+def IIC_SSE_CVT_PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PS_RM : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RR : InstrItinClass;
+def IIC_SSE_CVT_PI2PS_RM : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
+def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI32_RR : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RM : InstrItinClass;
+def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
+def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+
+// MMX
+def IIC_MMX_MOV_MM_RM : InstrItinClass;
+def IIC_MMX_MOV_REG_MM : InstrItinClass;
+def IIC_MMX_MOVQ_RM : InstrItinClass;
+def IIC_MMX_MOVQ_RR : InstrItinClass;
+
+def IIC_MMX_ALU_RM : InstrItinClass;
+def IIC_MMX_ALU_RR : InstrItinClass;
+def IIC_MMX_ALUQ_RM : InstrItinClass;
+def IIC_MMX_ALUQ_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBW_RR : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RM : InstrItinClass;
+def IIC_MMX_PHADDSUBD_RR : InstrItinClass;
+def IIC_MMX_PMUL : InstrItinClass;
+def IIC_MMX_MISC_FUNC_MEM : InstrItinClass;
+def IIC_MMX_MISC_FUNC_REG : InstrItinClass;
+def IIC_MMX_PSADBW : InstrItinClass;
+def IIC_MMX_SHIFT_RI : InstrItinClass;
+def IIC_MMX_SHIFT_RM : InstrItinClass;
+def IIC_MMX_SHIFT_RR : InstrItinClass;
+def IIC_MMX_UNPCK_H_RM : InstrItinClass;
+def IIC_MMX_UNPCK_H_RR : InstrItinClass;
+def IIC_MMX_UNPCK_L : InstrItinClass;
+def IIC_MMX_PCK_RM : InstrItinClass;
+def IIC_MMX_PCK_RR : InstrItinClass;
+def IIC_MMX_PSHUF : InstrItinClass;
+def IIC_MMX_PEXTR : InstrItinClass;
+def IIC_MMX_PINSRW : InstrItinClass;
+def IIC_MMX_MASKMOV : InstrItinClass;
+
+def IIC_MMX_CVT_PD_RR : InstrItinClass;
+def IIC_MMX_CVT_PD_RM : InstrItinClass;
+def IIC_MMX_CVT_PS_RR : InstrItinClass;
+def IIC_MMX_CVT_PS_RM : InstrItinClass;
+
+def IIC_CMPX_LOCK : InstrItinClass;
+def IIC_CMPX_LOCK_8 : InstrItinClass;
+def IIC_CMPX_LOCK_8B : InstrItinClass;
+def IIC_CMPX_LOCK_16B : InstrItinClass;
+
+def IIC_XADD_LOCK_MEM : InstrItinClass;
+def IIC_XADD_LOCK_MEM8 : InstrItinClass;
+
+def IIC_FILD : InstrItinClass;
+def IIC_FLD : InstrItinClass;
+def IIC_FLD80 : InstrItinClass;
+def IIC_FST : InstrItinClass;
+def IIC_FST80 : InstrItinClass;
+def IIC_FIST : InstrItinClass;
+def IIC_FLDZ : InstrItinClass;
+def IIC_FUCOM : InstrItinClass;
+def IIC_FUCOMI : InstrItinClass;
+def IIC_FCOMI : InstrItinClass;
+def IIC_FNSTSW : InstrItinClass;
+def IIC_FNSTCW : InstrItinClass;
+def IIC_FLDCW : InstrItinClass;
+def IIC_FNINIT : InstrItinClass;
+def IIC_FFREE : InstrItinClass;
+def IIC_FNCLEX : InstrItinClass;
+def IIC_WAIT : InstrItinClass;
+def IIC_FXAM : InstrItinClass;
+def IIC_FNOP : InstrItinClass;
+def IIC_FLDL : InstrItinClass;
+def IIC_F2XM1 : InstrItinClass;
+def IIC_FYL2X : InstrItinClass;
+def IIC_FPTAN : InstrItinClass;
+def IIC_FPATAN : InstrItinClass;
+def IIC_FXTRACT : InstrItinClass;
+def IIC_FPREM1 : InstrItinClass;
+def IIC_FPSTP : InstrItinClass;
+def IIC_FPREM : InstrItinClass;
+def IIC_FYL2XP1 : InstrItinClass;
+def IIC_FSINCOS : InstrItinClass;
+def IIC_FRNDINT : InstrItinClass;
+def IIC_FSCALE : InstrItinClass;
+def IIC_FCOMPP : InstrItinClass;
+def IIC_FXSAVE : InstrItinClass;
+def IIC_FXRSTOR : InstrItinClass;
+
+def IIC_FXCH : InstrItinClass;
+
+// System instructions
+def IIC_CPUID : InstrItinClass;
+def IIC_INT : InstrItinClass;
+def IIC_INT3 : InstrItinClass;
+def IIC_INVD : InstrItinClass;
+def IIC_INVLPG : InstrItinClass;
+def IIC_IRET : InstrItinClass;
+def IIC_HLT : InstrItinClass;
+def IIC_LXS : InstrItinClass;
+def IIC_LTR : InstrItinClass;
+def IIC_RDTSC : InstrItinClass;
+def IIC_RSM : InstrItinClass;
+def IIC_SIDT : InstrItinClass;
+def IIC_SGDT : InstrItinClass;
+def IIC_SLDT : InstrItinClass;
+def IIC_STR : InstrItinClass;
+def IIC_SWAPGS : InstrItinClass;
+def IIC_SYSCALL : InstrItinClass;
+def IIC_SYS_ENTER_EXIT : InstrItinClass;
+def IIC_IN_RR : InstrItinClass;
+def IIC_IN_RI : InstrItinClass;
+def IIC_OUT_RR : InstrItinClass;
+def IIC_OUT_IR : InstrItinClass;
+def IIC_INS : InstrItinClass;
+def IIC_MOV_REG_DR : InstrItinClass;
+def IIC_MOV_DR_REG : InstrItinClass;
+def IIC_MOV_REG_CR : InstrItinClass;
+def IIC_MOV_CR_REG : InstrItinClass;
+def IIC_MOV_REG_SR : InstrItinClass;
+def IIC_MOV_MEM_SR : InstrItinClass;
+def IIC_MOV_SR_REG : InstrItinClass;
+def IIC_MOV_SR_MEM : InstrItinClass;
+def IIC_LAR_RM : InstrItinClass;
+def IIC_LAR_RR : InstrItinClass;
+def IIC_LSL_RM : InstrItinClass;
+def IIC_LSL_RR : InstrItinClass;
+def IIC_LGDT : InstrItinClass;
+def IIC_LIDT : InstrItinClass;
+def IIC_LLDT_REG : InstrItinClass;
+def IIC_LLDT_MEM : InstrItinClass;
+def IIC_PUSH_CS : InstrItinClass;
+def IIC_PUSH_SR : InstrItinClass;
+def IIC_POP_SR : InstrItinClass;
+def IIC_POP_SR_SS : InstrItinClass;
+def IIC_VERR : InstrItinClass;
+def IIC_VERW_REG : InstrItinClass;
+def IIC_VERW_MEM : InstrItinClass;
+def IIC_WRMSR : InstrItinClass;
+def IIC_RDMSR : InstrItinClass;
+def IIC_RDPMC : InstrItinClass;
+def IIC_SMSW : InstrItinClass;
+def IIC_LMSW_REG : InstrItinClass;
+def IIC_LMSW_MEM : InstrItinClass;
+def IIC_ENTER : InstrItinClass;
+def IIC_LEAVE : InstrItinClass;
+def IIC_POP_MEM : InstrItinClass;
+def IIC_POP_REG16 : InstrItinClass;
+def IIC_POP_REG : InstrItinClass;
+def IIC_POP_F : InstrItinClass;
+def IIC_POP_FD : InstrItinClass;
+def IIC_POP_A : InstrItinClass;
+def IIC_PUSH_IMM : InstrItinClass;
+def IIC_PUSH_MEM : InstrItinClass;
+def IIC_PUSH_REG : InstrItinClass;
+def IIC_PUSH_F : InstrItinClass;
+def IIC_PUSH_A : InstrItinClass;
+def IIC_BSWAP : InstrItinClass;
+def IIC_BSF : InstrItinClass;
+def IIC_BSR : InstrItinClass;
+def IIC_MOVS : InstrItinClass;
+def IIC_STOS : InstrItinClass;
+def IIC_SCAS : InstrItinClass;
+def IIC_CMPS : InstrItinClass;
+def IIC_MOV : InstrItinClass;
+def IIC_MOV_MEM : InstrItinClass;
+def IIC_AHF : InstrItinClass;
+def IIC_BT_MI : InstrItinClass;
+def IIC_BT_MR : InstrItinClass;
+def IIC_BT_RI : InstrItinClass;
+def IIC_BT_RR : InstrItinClass;
+def IIC_BTX_MI : InstrItinClass;
+def IIC_BTX_MR : InstrItinClass;
+def IIC_BTX_RI : InstrItinClass;
+def IIC_BTX_RR : InstrItinClass;
+def IIC_XCHG_REG : InstrItinClass;
+def IIC_XCHG_MEM : InstrItinClass;
+def IIC_XADD_REG : InstrItinClass;
+def IIC_XADD_MEM : InstrItinClass;
+def IIC_CMPXCHG_MEM : InstrItinClass;
+def IIC_CMPXCHG_REG : InstrItinClass;
+def IIC_CMPXCHG_MEM8 : InstrItinClass;
+def IIC_CMPXCHG_REG8 : InstrItinClass;
+def IIC_CMPXCHG_8B : InstrItinClass;
+def IIC_CMPXCHG_16B : InstrItinClass;
+def IIC_LODS : InstrItinClass;
+def IIC_OUTS : InstrItinClass;
+def IIC_CLC : InstrItinClass;
+def IIC_CLD : InstrItinClass;
+def IIC_CLI : InstrItinClass;
+def IIC_CMC : InstrItinClass;
+def IIC_CLTS : InstrItinClass;
+def IIC_STC : InstrItinClass;
+def IIC_STI : InstrItinClass;
+def IIC_STD : InstrItinClass;
+def IIC_XLAT : InstrItinClass;
+def IIC_AAA : InstrItinClass;
+def IIC_AAD : InstrItinClass;
+def IIC_AAM : InstrItinClass;
+def IIC_AAS : InstrItinClass;
+def IIC_DAA : InstrItinClass;
+def IIC_DAS : InstrItinClass;
+def IIC_BOUND : InstrItinClass;
+def IIC_ARPL_REG : InstrItinClass;
+def IIC_ARPL_MEM : InstrItinClass;
+def IIC_MOVBE : InstrItinClass;
+
+def IIC_NOP : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+// IssueWidth is analagous to the number of decode units. Core and its
+// descendents, including Nehalem and SandyBridge have 4 decoders.
+// Resources beyond the decoder operate on micro-ops and are bufferred
+// so adjacent micro-ops don't directly compete.
+//
+// MinLatency=0 indicates that RAW dependencies can be decoded in the
+// same cycle.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// ILPWindow=10 is an arbitrary threshold that approximates cycles of
+// latency hidden by instruction buffers. The actual value is not very
+// important but should be zero for inorder and nonzero for OOO processors.
+//
+// The GenericModel contains no instruciton itineraries.
+def GenericModel : SchedMachineModel {
+  let IssueWidth = 4;
+  let MinLatency = 0;
+  let LoadLatency = 4;
+  let HighLatency = 10;
+  let ILPWindow = 10;
+}
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
new file mode 100644
index 000000000000..cce8f1b11436
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -0,0 +1,530 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom (Bonnell)
+// processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+// Functional Units
+//    Port 0
+def Port0 : FuncUnit; // ALU: ALU0, shift/rotate, load/store
+                      // SIMD/FP: SIMD ALU, Shuffle,SIMD/FP multiply, divide
+def Port1 : FuncUnit; // ALU: ALU1, bit processing, jump, and LEA
+                      // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomItineraries : ProcessorItineraries<
+  [ Port0, Port1 ],
+  [], [
+  // P0 only
+  // InstrItinData<class, [InstrStage<N, [P0]>] >,
+  // P0 or P1
+  // InstrItinData<class, [InstrStage<N, [P0, P1]>] >,
+  // P0 and P1
+  // InstrItinData<class, [InstrStage<N, [P0], 0>,  InstrStage<N, [P1]>] >,
+  //
+  // Default is 1 cycle, port0 or port1
+  InstrItinData<IIC_ALU_MEM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_LEA, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_LEA_16, [InstrStage<2, [Port0, Port1]>] >,
+  // mul
+  InstrItinData<IIC_MUL8, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_MUL64, [InstrStage<12, [Port0, Port1]>] >,
+  // imul by al, ax, eax, rax
+  InstrItinData<IIC_IMUL8, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_MEM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_REG, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_MEM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL64, [InstrStage<12, [Port0, Port1]>] >,
+  // imul reg by reg|mem
+  InstrItinData<IIC_IMUL16_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_RM, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL32_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL64_RM, [InstrStage<12, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL64_RR, [InstrStage<12, [Port0, Port1]>] >,
+  // imul reg = reg/mem * imm
+  InstrItinData<IIC_IMUL16_RRI, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_RRI, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL64_RRI, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL16_RMI, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_IMUL32_RMI, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_IMUL64_RMI, [InstrStage<14, [Port0, Port1]>] >,
+  // idiv
+  InstrItinData<IIC_IDIV8, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_IDIV16, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_IDIV32, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_IDIV64, [InstrStage<130, [Port0, Port1]>] >,
+  // div
+  InstrItinData<IIC_DIV8_REG, [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV8_MEM, [InstrStage<68, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV16, [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV32, [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_DIV64, [InstrStage<130, [Port0, Port1]>] >,
+  // neg/not/inc/dec
+  InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
+  // add/sub/and/or/xor/adc/sbc/cmp/test
+  InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+  // shift/rotate
+  InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
+  // shift double
+  InstrItinData<IIC_SHD16_REG_IM, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD16_REG_CL, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_REG_CL, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_REG_IM, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_REG_CL, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<9, [Port0, Port1]>] >,
+  // cmov
+  InstrItinData<IIC_CMOV16_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CMOV16_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMOV32_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CMOV32_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMOV64_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CMOV64_RR, [InstrStage<1, [Port0, Port1]>] >,
+  // set
+  InstrItinData<IIC_SET_M, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SET_R, [InstrStage<1, [Port0, Port1]>] >,
+  // jcc
+  InstrItinData<IIC_Jcc, [InstrStage<1, [Port1]>] >,
+  // jcxz/jecxz/jrcxz
+  InstrItinData<IIC_JCXZ, [InstrStage<4, [Port0, Port1]>] >,
+  // jmp rel
+  InstrItinData<IIC_JMP_REL, [InstrStage<1, [Port1]>] >,
+  // jmp indirect
+  InstrItinData<IIC_JMP_REG, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_JMP_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  // jmp far
+  InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<32, [Port0, Port1]>] >,
+  InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<31, [Port0, Port1]>] >,
+  // loop/loope/loopne
+  InstrItinData<IIC_LOOP, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_LOOPE, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_LOOPNE, [InstrStage<17, [Port0, Port1]>] >,
+  // call - all but reg/imm
+  InstrItinData<IIC_CALL_RI, [InstrStage<1, [Port0], 0>,
+                              InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_CALL_MEM, [InstrStage<15, [Port0, Port1]>] >,
+  InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<40, [Port0, Port1]>] >,
+  InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<39, [Port0, Port1]>] >,
+  //ret
+  InstrItinData<IIC_RET, [InstrStage<79, [Port0, Port1]>] >,
+  InstrItinData<IIC_RET_IMM, [InstrStage<1, [Port0], 0>,  InstrStage<1, [Port1]>] >,
+  //sign extension movs
+  InstrItinData<IIC_MOVSX,[InstrStage<1, [Port0] >] >,
+  InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [Port0, Port1]>] >,
+  //zero extension movs
+  InstrItinData<IIC_MOVZX,[InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_REP_MOVS, [InstrStage<75, [Port0, Port1]>] >,
+  InstrItinData<IIC_REP_STOS, [InstrStage<74, [Port0, Port1]>] >,
+
+  // SSE binary operations
+  // arithmetic fp scalar
+  InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<5, [Port0], 0>,
+                                   InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<5, [Port0], 0>,
+                                   InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<34, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<34, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<62, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<10, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+  // arithmetic fp parallel
+  InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<5, [Port0], 0>,
+                                   InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<125, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<125, [Port0, Port1]>] >,
+
+  // bitwise parallel
+  InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [Port0]>] >,
+
+  // arithmetic int parallel
+  InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+
+  // multiply int parallel
+  InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [Port0]>] >,
+
+  // shift parallel
+  InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_CMPP_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CMPP_RM, [InstrStage<7, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PSHUF, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOVMSK, [InstrStage<3, [Port0]>] >,
+  InstrItinData<IIC_SSE_MASKMOV, [InstrStage<2, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<2, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_LDDQU, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<3, [Port0]>] >,
+  InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PAUSE, [InstrStage<17, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_STMXCSR, [InstrStage<15, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
+  InstrItinData<IIC_SSE_PALIGNR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
+
+  // conversions
+  // to/from PD ...
+  InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+  // to/from PS except to/from PD and PS2PI
+  InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<9, [Port0, Port1]>] >,
+
+  // MMX MOVs
+  InstrItinData<IIC_MMX_MOV_MM_RM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<3, [Port0]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [Port0, Port1]>] >,
+  // other MMX
+  InstrItinData<IIC_MMX_ALU_RM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_ALU_RR,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PMUL, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PSADBW,   [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_PCK_RM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_PCK_RR,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PSHUF,   [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_PEXTR,   [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_PINSRW,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
+  // conversions
+  // from/to PD
+  InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<8, [Port0, Port1]>] >,
+  // from/to PI
+  InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<5, [Port0], 0>,
+                                    InstrStage<5, [Port1]>]>,
+
+  InstrItinData<IIC_CMPX_LOCK, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<22, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<3, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FILD, [InstrStage<5, [Port0], 0>, InstrStage<5, [Port1]>] >,
+  InstrItinData<IIC_FLD,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FLD80, [InstrStage<4, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FST,   [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FIST,  [InstrStage<6, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_FLDZ,   [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FUCOM,  [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMI,  [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTSW, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNSTCW, [InstrStage<8, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDCW,  [InstrStage<5, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNINIT, [InstrStage<63, [Port0, Port1]>] >,
+  InstrItinData<IIC_FFREE,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FNCLEX, [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_WAIT,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXAM,  [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_FNOP,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FLDL,  [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_F2XM1,  [InstrStage<99, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2X,  [InstrStage<146, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPTAN,  [InstrStage<168, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPATAN,  [InstrStage<183, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXTRACT,  [InstrStage<25, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM1,  [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPSTP,  [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_FPREM,  [InstrStage<55, [Port0, Port1]>] >,
+  InstrItinData<IIC_FYL2XP1,  [InstrStage<147, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSINCOS,  [InstrStage<174, [Port0, Port1]>] >,
+  InstrItinData<IIC_FRNDINT,  [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_FSCALE,  [InstrStage<77, [Port0, Port1]>] >,
+  InstrItinData<IIC_FCOMPP,  [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_FXSAVE,  [InstrStage<140, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXRSTOR,  [InstrStage<141, [Port0, Port1]>] >,
+  InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+
+  // System instructions
+  InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT,   [InstrStage<127, [Port0, Port1]>] >,
+  InstrItinData<IIC_INT3,  [InstrStage<130, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVD,  [InstrStage<1003, [Port0, Port1]>] >,
+  InstrItinData<IIC_INVLPG, [InstrStage<71, [Port0, Port1]>] >,
+  InstrItinData<IIC_IRET,  [InstrStage<109, [Port0, Port1]>] >,
+  InstrItinData<IIC_HLT,   [InstrStage<121, [Port0, Port1]>] >,
+  InstrItinData<IIC_LXS,   [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_LTR,   [InstrStage<83, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+  InstrItinData<IIC_RSM,   [InstrStage<741, [Port0, Port1]>] >,
+  InstrItinData<IIC_SIDT,  [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SGDT,  [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_SLDT,  [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_STR,    [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_SWAPGS, [InstrStage<22, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYSCALL, [InstrStage<96, [Port0, Port1]>] >,
+  InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<88, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_IN_RR,  [InstrStage<94, [Port0, Port1]>] >,
+  InstrItinData<IIC_IN_RI,  [InstrStage<92, [Port0, Port1]>] >,
+  InstrItinData<IIC_OUT_RR, [InstrStage<68, [Port0, Port1]>] >,
+  InstrItinData<IIC_OUT_IR, [InstrStage<72, [Port0, Port1]>] >,
+  InstrItinData<IIC_INS,    [InstrStage<59, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_MOV_REG_DR, [InstrStage<88, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_DR_REG, [InstrStage<123, [Port0, Port1]>] >,
+  // worst case for mov REG_CRx
+  InstrItinData<IIC_MOV_REG_CR, [InstrStage<12, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_CR_REG, [InstrStage<136, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_MOV_MEM_SR, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_SR_REG, [InstrStage<21, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_SR_MEM, [InstrStage<26, [Port0, Port1]>] >,
+  // LAR
+  InstrItinData<IIC_LAR_RM,  [InstrStage<50, [Port0, Port1]>] >,
+  InstrItinData<IIC_LAR_RR,  [InstrStage<54, [Port0, Port1]>] >,
+  // LSL
+  InstrItinData<IIC_LSL_RM,  [InstrStage<46, [Port0, Port1]>] >,
+  InstrItinData<IIC_LSL_RR,  [InstrStage<49, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_LGDT, [InstrStage<44, [Port0, Port1]>] >,
+  InstrItinData<IIC_LIDT, [InstrStage<44, [Port0, Port1]>] >,
+  InstrItinData<IIC_LLDT_REG, [InstrStage<60, [Port0, Port1]>] >,
+  InstrItinData<IIC_LLDT_MEM, [InstrStage<64, [Port0, Port1]>] >,
+  // push control register, segment registers
+  InstrItinData<IIC_PUSH_CS, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_PUSH_SR, [InstrStage<2, [Port0, Port1]>] >,
+  // pop control register, segment registers
+  InstrItinData<IIC_POP_SR,    [InstrStage<29, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_SR_SS, [InstrStage<48, [Port0, Port1]>] >,
+  // VERR, VERW
+  InstrItinData<IIC_VERR,     [InstrStage<41, [Port0, Port1]>] >,
+  InstrItinData<IIC_VERW_REG, [InstrStage<51, [Port0, Port1]>] >,
+  InstrItinData<IIC_VERW_MEM, [InstrStage<50, [Port0, Port1]>] >,
+  // WRMSR, RDMSR
+  InstrItinData<IIC_WRMSR, [InstrStage<202, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDMSR, [InstrStage<78, [Port0, Port1]>] >,
+  InstrItinData<IIC_RDPMC, [InstrStage<46, [Port0, Port1]>] >,
+  // SMSW, LMSW
+  InstrItinData<IIC_SMSW, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_LMSW_REG, [InstrStage<69, [Port0, Port1]>] >,
+  InstrItinData<IIC_LMSW_MEM, [InstrStage<67, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_ENTER, [InstrStage<32, [Port0, Port1]>] >,
+  InstrItinData<IIC_LEAVE, [InstrStage<2, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_POP_MEM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_REG16, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_REG, [InstrStage<1, [Port0], 0>,
+                            InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_POP_F, [InstrStage<32, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_FD, [InstrStage<26, [Port0, Port1]>] >,
+  InstrItinData<IIC_POP_A, [InstrStage<9, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [Port0], 0>,
+                               InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_PUSH_MEM, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_PUSH_REG, [InstrStage<1, [Port0], 0>,
+                               InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_PUSH_F, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPS, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOV_MEM, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_AHF, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BT_MI, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BT_MR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_BT_RI, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_BT_RR, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_BTX_MI, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_BTX_MR, [InstrStage<11, [Port0, Port1]>] >,
+  InstrItinData<IIC_BTX_RI, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_BTX_RR, [InstrStage<1, [Port1]>] >,
+  InstrItinData<IIC_XCHG_REG, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XCHG_MEM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_REG, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_XADD_MEM, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_REG, [InstrStage<15, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_8B, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMPXCHG_16B, [InstrStage<22, [Port0, Port1]>] >,
+  InstrItinData<IIC_LODS, [InstrStage<2, [Port0, Port1]>] >,
+  InstrItinData<IIC_OUTS, [InstrStage<74, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLC, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLD, [InstrStage<3, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLI, [InstrStage<14, [Port0, Port1]>] >,
+  InstrItinData<IIC_CMC, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_CLTS, [InstrStage<33, [Port0, Port1]>] >,
+  InstrItinData<IIC_STC, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_STI, [InstrStage<17, [Port0, Port1]>] >,
+  InstrItinData<IIC_STD, [InstrStage<21, [Port0, Port1]>] >,
+  InstrItinData<IIC_XLAT, [InstrStage<6, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAA, [InstrStage<13, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAD, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAM, [InstrStage<21, [Port0, Port1]>] >,
+  InstrItinData<IIC_AAS, [InstrStage<13, [Port0, Port1]>] >,
+  InstrItinData<IIC_DAA, [InstrStage<18, [Port0, Port1]>] >,
+  InstrItinData<IIC_DAS, [InstrStage<20, [Port0, Port1]>] >,
+  InstrItinData<IIC_BOUND, [InstrStage<11, [Port0, Port1]>] >,
+  InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
+  InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
+  InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+
+  InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
+  ]>;
+
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+  let IssueWidth = 2;  // Allows 2 instructions per scheduling group.
+  let MinLatency = 1;  // InstrStage cycles overrides MinLatency.
+                       // OperandCycles may be used for expected latency.
+  let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
+  let HighLatency = 30;// Expected, may be overriden by OperandCycles.
+  let ILPWindow = 0; // Always try to hide expected latency.
+
+  let Itineraries = AtomItineraries;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
new file mode 100644
index 000000000000..f934fdd85914
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -0,0 +1,270 @@
+//===-- X86SelectionDAGInfo.cpp - X86 SelectionDAG Info -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86SelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-selectiondag-info"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DerivedTypes.h"
+using namespace llvm;
+
+X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) :
+  TargetSelectionDAGInfo(TM),
+  Subtarget(&TM.getSubtarget<X86Subtarget>()),
+  TLI(*TM.getTargetLowering()) {
+}
+
+X86SelectionDAGInfo::~X86SelectionDAGInfo() {
+}
+
+SDValue
+X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                             SDValue Chain,
+                                             SDValue Dst, SDValue Src,
+                                             SDValue Size, unsigned Align,
+                                             bool isVolatile,
+                                         MachinePointerInfo DstPtrInfo) const {
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If not DWORD aligned or size is more than the threshold, call the library.
+  // The libc version is likely to be faster for these cases. It can use the
+  // address value and run time information about the CPU.
+  if ((Align & 3) != 0 ||
+      !ConstantSize ||
+      ConstantSize->getZExtValue() >
+        Subtarget->getMaxInlineSizeThreshold()) {
+    SDValue InFlag(0, 0);
+
+    // Check to see if there is a specialized entry-point for memory zeroing.
+    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+    if (const char *bzeroEntry =  V &&
+        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+      EVT IntPtr = TLI.getPointerTy();
+      Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+      TargetLowering::ArgListTy Args;
+      TargetLowering::ArgListEntry Entry;
+      Entry.Node = Dst;
+      Entry.Ty = IntPtrTy;
+      Args.push_back(Entry);
+      Entry.Node = Size;
+      Args.push_back(Entry);
+      TargetLowering::
+      CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()),
+                        false, false, false, false,
+                        0, CallingConv::C, /*isTailCall=*/false,
+                        /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
+                        DAG.getExternalSymbol(bzeroEntry, IntPtr), Args,
+                        DAG, dl);
+      std::pair<SDValue,SDValue> CallResult =
+        TLI.LowerCallTo(CLI);
+      return CallResult.second;
+    }
+
+    // Otherwise have the target-independent code call memset.
+    return SDValue();
+  }
+
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  SDValue InFlag(0, 0);
+  EVT AVT;
+  SDValue Count;
+  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+  unsigned BytesLeft = 0;
+  bool TwoRepStos = false;
+  if (ValC) {
+    unsigned ValReg;
+    uint64_t Val = ValC->getZExtValue() & 255;
+
+    // If the value is a constant, then we can potentially use larger sets.
+    switch (Align & 3) {
+    case 2:   // WORD aligned
+      AVT = MVT::i16;
+      ValReg = X86::AX;
+      Val = (Val << 8) | Val;
+      break;
+    case 0:  // DWORD aligned
+      AVT = MVT::i32;
+      ValReg = X86::EAX;
+      Val = (Val << 8)  | Val;
+      Val = (Val << 16) | Val;
+      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
+        AVT = MVT::i64;
+        ValReg = X86::RAX;
+        Val = (Val << 32) | Val;
+      }
+      break;
+    default:  // Byte aligned
+      AVT = MVT::i8;
+      ValReg = X86::AL;
+      Count = DAG.getIntPtrConstant(SizeVal);
+      break;
+    }
+
+    if (AVT.bitsGT(MVT::i8)) {
+      unsigned UBytes = AVT.getSizeInBits() / 8;
+      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
+      BytesLeft = SizeVal % UBytes;
+    }
+
+    Chain  = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
+                              InFlag);
+    InFlag = Chain.getValue(1);
+  } else {
+    AVT = MVT::i8;
+    Count  = DAG.getIntPtrConstant(SizeVal);
+    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+                                                              X86::ECX,
+                            Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+                                                              X86::EDI,
+                            Dst, InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+
+  if (TwoRepStos) {
+    InFlag = Chain.getValue(1);
+    Count  = Size;
+    EVT CVT = Count.getValueType();
+    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
+                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+    Chain  = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
+                                                             X86::ECX,
+                              Left, InFlag);
+    InFlag = Chain.getValue(1);
+    Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
+    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+  } else if (BytesLeft) {
+    // Handle the last 1 - 7 bytes.
+    unsigned Offset = SizeVal - BytesLeft;
+    EVT AddrVT = Dst.getValueType();
+    EVT SizeVT = Size.getValueType();
+
+    Chain = DAG.getMemset(Chain, dl,
+                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+                                      DAG.getConstant(Offset, AddrVT)),
+                          Src,
+                          DAG.getConstant(BytesLeft, SizeVT),
+                          Align, isVolatile, DstPtrInfo.getWithOffset(Offset));
+  }
+
+  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
+  return Chain;
+}
+
+SDValue
+X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                        SDValue Chain, SDValue Dst, SDValue Src,
+                                        SDValue Size, unsigned Align,
+                                        bool isVolatile, bool AlwaysInline,
+                                         MachinePointerInfo DstPtrInfo,
+                                         MachinePointerInfo SrcPtrInfo) const {
+  // This requires the copy size to be a constant, preferably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold())
+    return SDValue();
+
+  /// If not DWORD aligned, it is more efficient to call the library.  However
+  /// if calling the library is not allowed (AlwaysInline), then soldier on as
+  /// the code generated here is better than the long load-store sequence we
+  /// would otherwise get.
+  if (!AlwaysInline && (Align & 3) != 0)
+    return SDValue();
+
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256 ||
+      SrcPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // ESI might be used as a base pointer, in that case we can't simply overwrite
+  // the register.  Fall back to generic code.
+  const X86RegisterInfo *TRI =
+      static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo());
+  if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
+      TRI->getBaseRegister() == X86::ESI)
+    return SDValue();
+
+  MVT AVT;
+  if (Align & 1)
+    AVT = MVT::i8;
+  else if (Align & 2)
+    AVT = MVT::i16;
+  else if (Align & 4)
+    // DWORD aligned
+    AVT = MVT::i32;
+  else
+    // QWORD aligned
+    AVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+  unsigned UBytes = AVT.getSizeInBits() / 8;
+  unsigned CountVal = SizeVal / UBytes;
+  SDValue Count = DAG.getIntPtrConstant(CountVal);
+  unsigned BytesLeft = SizeVal % UBytes;
+
+  SDValue InFlag(0, 0);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+                                                              X86::ECX,
+                            Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+                                                              X86::EDI,
+                            Dst, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
+                                                              X86::ESI,
+                            Src, InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
+                                array_lengthof(Ops));
+
+  SmallVector<SDValue, 4> Results;
+  Results.push_back(RepMovs);
+  if (BytesLeft) {
+    // Handle the last 1 - 7 bytes.
+    unsigned Offset = SizeVal - BytesLeft;
+    EVT DstVT = Dst.getValueType();
+    EVT SrcVT = Src.getValueType();
+    EVT SizeVT = Size.getValueType();
+    Results.push_back(DAG.getMemcpy(Chain, dl,
+                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+                                                DAG.getConstant(Offset, DstVT)),
+                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+                                                DAG.getConstant(Offset, SrcVT)),
+                                    DAG.getConstant(BytesLeft, SizeVT),
+                                    Align, isVolatile, AlwaysInline,
+                                    DstPtrInfo.getWithOffset(Offset),
+                                    SrcPtrInfo.getWithOffset(Offset)));
+  }
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &Results[0], Results.size());
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
new file mode 100644
index 000000000000..d1d66fe76e94
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -0,0 +1,56 @@
+//===-- X86SelectionDAGInfo.h - X86 SelectionDAG Info -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SELECTIONDAGINFO_H
+#define X86SELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class X86TargetLowering;
+class X86TargetMachine;
+class X86Subtarget;
+
+class X86SelectionDAGInfo : public TargetSelectionDAGInfo {
+  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const X86Subtarget *Subtarget;
+
+  const X86TargetLowering &TLI;
+
+public:
+  explicit X86SelectionDAGInfo(const X86TargetMachine &TM);
+  ~X86SelectionDAGInfo();
+
+  virtual
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                  SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const;
+
+  virtual
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                  SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool isVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 000000000000..14619b63927b
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,495 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "subtarget"
+#include "X86Subtarget.h"
+#include "X86InstrInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "X86GenSubtargetInfo.inc"
+
+using namespace llvm;
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+/// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context.
+unsigned char X86Subtarget::ClassifyBlockAddressReference() const {
+  if (isPICStyleGOT())    // 32-bit ELF targets.
+    return X86II::MO_GOTOFF;
+
+  if (isPICStyleStubPIC())   // Darwin/32 in PIC mode.
+    return X86II::MO_PIC_BASE_OFFSET;
+
+  // Direct static reference to label.
+  return X86II::MO_NO_FLAG;
+}
+
+/// ClassifyGlobalReference - Classify a global variable reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context.
+unsigned char X86Subtarget::
+ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
+  // DLLImport only exists on windows, it is implemented as a load from a
+  // DLLIMPORT stub.
+  if (GV->hasDLLImportLinkage())
+    return X86II::MO_DLLIMPORT;
+
+  // Determine whether this is a reference to a definition or a declaration.
+  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+  // load from stub.
+  bool isDecl = GV->hasAvailableExternallyLinkage();
+  if (GV->isDeclaration() && !GV->isMaterializable())
+    isDecl = true;
+
+  // X86-64 in PIC mode.
+  if (isPICStyleRIPRel()) {
+    // Large model never uses stubs.
+    if (TM.getCodeModel() == CodeModel::Large)
+      return X86II::MO_NO_FLAG;
+
+    if (isTargetDarwin()) {
+      // If symbol visibility is hidden, the extra load is not needed if
+      // target is x86-64 or the symbol is definitely defined in the current
+      // translation unit.
+      if (GV->hasDefaultVisibility() &&
+          (isDecl || GV->isWeakForLinker()))
+        return X86II::MO_GOTPCREL;
+    } else if (!isTargetWin64()) {
+      assert(isTargetELF() && "Unknown rip-relative target");
+
+      // Extra load is needed for all externally visible.
+      if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility())
+        return X86II::MO_GOTPCREL;
+    }
+
+    return X86II::MO_NO_FLAG;
+  }
+
+  if (isPICStyleGOT()) {   // 32-bit ELF targets.
+    // Extra load is needed for all externally visible.
+    if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+      return X86II::MO_GOTOFF;
+    return X86II::MO_GOT;
+  }
+
+  if (isPICStyleStubPIC()) {  // Darwin/32 in PIC mode.
+    // Determine whether we have a stub reference and/or whether the reference
+    // is relative to the PIC base or not.
+
+    // If this is a strong reference to a definition, it is definitely not
+    // through a stub.
+    if (!isDecl && !GV->isWeakForLinker())
+      return X86II::MO_PIC_BASE_OFFSET;
+
+    // Unless we have a symbol with hidden visibility, we have to go through a
+    // normal $non_lazy_ptr stub because this symbol might be resolved late.
+    if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+      return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+
+    // If symbol visibility is hidden, we have a stub for common symbol
+    // references and external declarations.
+    if (isDecl || GV->hasCommonLinkage()) {
+      // Hidden $non_lazy_ptr reference.
+      return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
+    }
+
+    // Otherwise, no stub.
+    return X86II::MO_PIC_BASE_OFFSET;
+  }
+
+  if (isPICStyleStubNoDynamic()) {  // Darwin/32 in -mdynamic-no-pic mode.
+    // Determine whether we have a stub reference.
+
+    // If this is a strong reference to a definition, it is definitely not
+    // through a stub.
+    if (!isDecl && !GV->isWeakForLinker())
+      return X86II::MO_NO_FLAG;
+
+    // Unless we have a symbol with hidden visibility, we have to go through a
+    // normal $non_lazy_ptr stub because this symbol might be resolved late.
+    if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+      return X86II::MO_DARWIN_NONLAZY;
+
+    // Otherwise, no stub.
+    return X86II::MO_NO_FLAG;
+  }
+
+  // Direct static reference to global.
+  return X86II::MO_NO_FLAG;
+}
+
+
+/// getBZeroEntry - This function returns the name of a function which has an
+/// interface like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered prefereable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+  // Darwin 10 has a __bzero entry point for this purpose.
+  if (getTargetTriple().isMacOSX() &&
+      !getTargetTriple().isMacOSXVersionLT(10, 6))
+    return "__bzero";
+
+  return 0;
+}
+
+bool X86Subtarget::hasSinCos() const {
+  return getTargetTriple().isMacOSX() &&
+    !getTargetTriple().isMacOSXVersionLT(10, 9) &&
+    is64Bit();
+}
+
+/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+/// to immediate address.
+bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
+  if (In64BitMode)
+    return false;
+  return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+void X86Subtarget::AutoDetectSubtargetFeatures() {
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  unsigned MaxLevel;
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+
+  if (X86_MC::GetCpuIDAndInfo(0, &MaxLevel, text.u+0, text.u+2, text.u+1) ||
+      MaxLevel < 1)
+    return;
+
+  X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+
+  if ((EDX >> 15) & 1) { HasCMov = true;      ToggleFeature(X86::FeatureCMOV); }
+  if ((EDX >> 23) & 1) { X86SSELevel = MMX;   ToggleFeature(X86::FeatureMMX);  }
+  if ((EDX >> 25) & 1) { X86SSELevel = SSE1;  ToggleFeature(X86::FeatureSSE1); }
+  if ((EDX >> 26) & 1) { X86SSELevel = SSE2;  ToggleFeature(X86::FeatureSSE2); }
+  if (ECX & 0x1)       { X86SSELevel = SSE3;  ToggleFeature(X86::FeatureSSE3); }
+  if ((ECX >> 9)  & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
+  if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
+  if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
+  if ((ECX >> 28) & 1) { X86SSELevel = AVX;   ToggleFeature(X86::FeatureAVX); }
+
+  bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
+  bool IsAMD   = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
+
+  if ((ECX >> 1) & 0x1) {
+    HasPCLMUL = true;
+    ToggleFeature(X86::FeaturePCLMUL);
+  }
+  if ((ECX >> 12) & 0x1) {
+    HasFMA = true;
+    ToggleFeature(X86::FeatureFMA);
+  }
+  if (IsIntel && ((ECX >> 22) & 0x1)) {
+    HasMOVBE = true;
+    ToggleFeature(X86::FeatureMOVBE);
+  }
+  if ((ECX >> 23) & 0x1) {
+    HasPOPCNT = true;
+    ToggleFeature(X86::FeaturePOPCNT);
+  }
+  if ((ECX >> 25) & 0x1) {
+    HasAES = true;
+    ToggleFeature(X86::FeatureAES);
+  }
+  if ((ECX >> 29) & 0x1) {
+    HasF16C = true;
+    ToggleFeature(X86::FeatureF16C);
+  }
+  if (IsIntel && ((ECX >> 30) & 0x1)) {
+    HasRDRAND = true;
+    ToggleFeature(X86::FeatureRDRAND);
+  }
+
+  if ((ECX >> 13) & 0x1) {
+    HasCmpxchg16b = true;
+    ToggleFeature(X86::FeatureCMPXCHG16B);
+  }
+
+  if (IsIntel || IsAMD) {
+    // Determine if bit test memory instructions are slow.
+    unsigned Family = 0;
+    unsigned Model  = 0;
+    X86_MC::DetectFamilyModel(EAX, Family, Model);
+    if (IsAMD || (Family == 6 && Model >= 13)) {
+      IsBTMemSlow = true;
+      ToggleFeature(X86::FeatureSlowBTMem);
+    }
+
+    // If it's an Intel chip since Nehalem and not an Atom chip, unaligned
+    // memory access is fast. We hard code model numbers here because they
+    // aren't strictly increasing for Intel chips it seems.
+    if (IsIntel &&
+        ((Family == 6 && Model == 0x1E) || // Nehalem: Clarksfield, Lynnfield,
+                                           //          Jasper Froest
+         (Family == 6 && Model == 0x1A) || // Nehalem: Bloomfield, Nehalem-EP
+         (Family == 6 && Model == 0x2E) || // Nehalem: Nehalem-EX
+         (Family == 6 && Model == 0x25) || // Westmere: Arrandale, Clarksdale
+         (Family == 6 && Model == 0x2C) || // Westmere: Gulftown, Westmere-EP
+         (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX
+         (Family == 6 && Model == 0x2A) || // SandyBridge
+         (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E*
+         (Family == 6 && Model == 0x3A))) {// IvyBridge
+      IsUAMemFast = true;
+      ToggleFeature(X86::FeatureFastUAMem);
+    }
+
+    // Set processor type. Currently only Atom is detected.
+    if (Family == 6 &&
+        (Model == 28 || Model == 38 || Model == 39
+         || Model == 53 || Model == 54)) {
+      X86ProcFamily = IntelAtom;
+
+      UseLeaForSP = true;
+      ToggleFeature(X86::FeatureLeaForSP);
+    }
+
+    unsigned MaxExtLevel;
+    X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
+
+    if (MaxExtLevel >= 0x80000001) {
+      X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+      if ((EDX >> 29) & 0x1) {
+        HasX86_64 = true;
+        ToggleFeature(X86::Feature64Bit);
+      }
+      if ((ECX >> 5) & 0x1) {
+        HasLZCNT = true;
+        ToggleFeature(X86::FeatureLZCNT);
+      }
+      if (IsIntel && ((ECX >> 8) & 0x1)) {
+        HasPRFCHW = true;
+        ToggleFeature(X86::FeaturePRFCHW);
+      }
+      if (IsAMD) {
+        if ((ECX >> 6) & 0x1) {
+          HasSSE4A = true;
+          ToggleFeature(X86::FeatureSSE4A);
+        }
+        if ((ECX >> 11) & 0x1) {
+          HasXOP = true;
+          ToggleFeature(X86::FeatureXOP);
+        }
+        if ((ECX >> 16) & 0x1) {
+          HasFMA4 = true;
+          ToggleFeature(X86::FeatureFMA4);
+        }
+      }
+    }
+  }
+
+  if (MaxLevel >= 7) {
+    if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) {
+      if (IsIntel && (EBX & 0x1)) {
+        HasFSGSBase = true;
+        ToggleFeature(X86::FeatureFSGSBase);
+      }
+      if ((EBX >> 3) & 0x1) {
+        HasBMI = true;
+        ToggleFeature(X86::FeatureBMI);
+      }
+      if ((EBX >> 4) & 0x1) {
+        HasHLE = true;
+        ToggleFeature(X86::FeatureHLE);
+      }
+      if (IsIntel && ((EBX >> 5) & 0x1)) {
+        X86SSELevel = AVX2;
+        ToggleFeature(X86::FeatureAVX2);
+      }
+      if (IsIntel && ((EBX >> 8) & 0x1)) {
+        HasBMI2 = true;
+        ToggleFeature(X86::FeatureBMI2);
+      }
+      if (IsIntel && ((EBX >> 11) & 0x1)) {
+        HasRTM = true;
+        ToggleFeature(X86::FeatureRTM);
+      }
+      if (IsIntel && ((EBX >> 19) & 0x1)) {
+        HasADX = true;
+        ToggleFeature(X86::FeatureADX);
+      }
+      if (IsIntel && ((EBX >> 18) & 0x1)) {
+        HasRDSEED = true;
+        ToggleFeature(X86::FeatureRDSEED);
+      }
+    }
+  }
+}
+
+void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
+  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+  Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                           "target-cpu");
+  Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                          "target-features");
+  std::string CPU =
+    !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : "";
+  std::string FS =
+    !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+  if (!FS.empty()) {
+    initializeEnvironment();
+    resetSubtargetFeatures(CPU, FS);
+  }
+}
+
+void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
+  std::string CPUName = CPU;
+  if (!FS.empty() || !CPU.empty()) {
+    if (CPUName.empty()) {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
+    || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+      CPUName = sys::getHostCPUName();
+#else
+      CPUName = "generic";
+#endif
+    }
+
+    // Make sure 64-bit features are available in 64-bit mode. (But make sure
+    // SSE2 can be turned off explicitly.)
+    std::string FullFS = FS;
+    if (In64BitMode) {
+      if (!FullFS.empty())
+        FullFS = "+64bit,+sse2," + FullFS;
+      else
+        FullFS = "+64bit,+sse2";
+    }
+
+    // If feature string is not empty, parse features string.
+    ParseSubtargetFeatures(CPUName, FullFS);
+  } else {
+    if (CPUName.empty()) {
+#if defined (__x86_64__) || defined(__i386__)
+      CPUName = sys::getHostCPUName();
+#else
+      CPUName = "generic";
+#endif
+    }
+    // Otherwise, use CPUID to auto-detect feature set.
+    AutoDetectSubtargetFeatures();
+
+    // Make sure 64-bit features are available in 64-bit mode.
+    if (In64BitMode) {
+      HasX86_64 = true; ToggleFeature(X86::Feature64Bit);
+      HasCMov = true;   ToggleFeature(X86::FeatureCMOV);
+
+      if (X86SSELevel < SSE2) {
+        X86SSELevel = SSE2;
+        ToggleFeature(X86::FeatureSSE1);
+        ToggleFeature(X86::FeatureSSE2);
+      }
+    }
+  }
+
+  // CPUName may have been set by the CPU detection code. Make sure the
+  // new MCSchedModel is used.
+  InitMCProcessorInfo(CPUName, FS);
+
+  if (X86ProcFamily == IntelAtom)
+    PostRAScheduler = true;
+
+  InstrItins = getInstrItineraryForCPU(CPUName);
+
+  // It's important to keep the MCSubtargetInfo feature bits in sync with
+  // target data structure which is shared with MC code emitter, etc.
+  if (In64BitMode)
+    ToggleFeature(X86::Mode64Bit);
+
+  DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+               << ", 3DNowLevel " << X863DNowLevel
+               << ", 64bit " << HasX86_64 << "\n");
+  assert((!In64BitMode || HasX86_64) &&
+         "64-bit code requested on a subtarget that doesn't support it!");
+
+  // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both
+  // 32 and 64 bit) and for all 64-bit targets.
+  if (StackAlignOverride)
+    stackAlignment = StackAlignOverride;
+  else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
+           In64BitMode)
+    stackAlignment = 16;
+}
+
+void X86Subtarget::initializeEnvironment() {
+  X86SSELevel = NoMMXSSE;
+  X863DNowLevel = NoThreeDNow;
+  HasCMov = false;
+  HasX86_64 = false;
+  HasPOPCNT = false;
+  HasSSE4A = false;
+  HasAES = false;
+  HasPCLMUL = false;
+  HasFMA = false;
+  HasFMA4 = false;
+  HasXOP = false;
+  HasMOVBE = false;
+  HasRDRAND = false;
+  HasF16C = false;
+  HasFSGSBase = false;
+  HasLZCNT = false;
+  HasBMI = false;
+  HasBMI2 = false;
+  HasRTM = false;
+  HasHLE = false;
+  HasADX = false;
+  HasPRFCHW = false;
+  HasRDSEED = false;
+  IsBTMemSlow = false;
+  IsUAMemFast = false;
+  HasVectorUAMem = false;
+  HasCmpxchg16b = false;
+  UseLeaForSP = false;
+  HasSlowDivide = false;
+  PostRAScheduler = false;
+  PadShortFunctions = false;
+  CallRegIndirect = false;
+  stackAlignment = 4;
+  // FIXME: this is a known good value for Yonah. How about others?
+  MaxInlineSizeThreshold = 128;
+}
+
+X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
+                           const std::string &FS,
+                           unsigned StackAlignOverride, bool is64Bit)
+  : X86GenSubtargetInfo(TT, CPU, FS)
+  , X86ProcFamily(Others)
+  , PICStyle(PICStyles::None)
+  , TargetTriple(TT)
+  , StackAlignOverride(StackAlignOverride)
+  , In64BitMode(is64Bit) {
+  initializeEnvironment();
+  resetSubtargetFeatures(CPU, FS);
+}
+
+bool X86Subtarget::enablePostRAScheduler(
+           CodeGenOpt::Level OptLevel,
+           TargetSubtargetInfo::AntiDepBreakMode& Mode,
+           RegClassVector& CriticalPathRCs) const {
+  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  CriticalPathRCs.clear();
+  return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 000000000000..6fbdb1d5f00f
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,379 @@
+//===-- X86Subtarget.h - Define Subtarget for the X86 ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "X86GenSubtargetInfo.inc"
+
+namespace llvm {
+class GlobalValue;
+class StringRef;
+class TargetMachine;
+
+/// PICStyles - The X86 backend supports a number of different styles of PIC.
+///
+namespace PICStyles {
+enum Style {
+  StubPIC,          // Used on i386-darwin in -fPIC mode.
+  StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode.
+  GOT,              // Used on many 32-bit unices in -fPIC mode.
+  RIPRel,           // Used on X86-64 when not in -static mode.
+  None              // Set when in -static mode (not PIC or DynamicNoPIC mode).
+};
+}
+
+class X86Subtarget : public X86GenSubtargetInfo {
+protected:
+  enum X86SSEEnum {
+    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2
+  };
+
+  enum X863DNowEnum {
+    NoThreeDNow, ThreeDNow, ThreeDNowA
+  };
+
+  enum X86ProcFamilyEnum {
+    Others, IntelAtom
+  };
+
+  /// X86ProcFamily - X86 processor family: Intel Atom, and others
+  X86ProcFamilyEnum X86ProcFamily;
+
+  /// PICStyle - Which PIC style to use
+  ///
+  PICStyles::Style PICStyle;
+
+  /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or
+  /// none supported.
+  X86SSEEnum X86SSELevel;
+
+  /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+  ///
+  X863DNowEnum X863DNowLevel;
+
+  /// HasCMov - True if this processor has conditional move instructions
+  /// (generally pentium pro+).
+  bool HasCMov;
+
+  /// HasX86_64 - True if the processor supports X86-64 instructions.
+  ///
+  bool HasX86_64;
+
+  /// HasPOPCNT - True if the processor supports POPCNT.
+  bool HasPOPCNT;
+
+  /// HasSSE4A - True if the processor supports SSE4A instructions.
+  bool HasSSE4A;
+
+  /// HasAES - Target has AES instructions
+  bool HasAES;
+
+  /// HasPCLMUL - Target has carry-less multiplication
+  bool HasPCLMUL;
+
+  /// HasFMA - Target has 3-operand fused multiply-add
+  bool HasFMA;
+
+  /// HasFMA4 - Target has 4-operand fused multiply-add
+  bool HasFMA4;
+
+  /// HasXOP - Target has XOP instructions
+  bool HasXOP;
+
+  /// HasMOVBE - True if the processor has the MOVBE instruction.
+  bool HasMOVBE;
+
+  /// HasRDRAND - True if the processor has the RDRAND instruction.
+  bool HasRDRAND;
+
+  /// HasF16C - Processor has 16-bit floating point conversion instructions.
+  bool HasF16C;
+
+  /// HasFSGSBase - Processor has FS/GS base insturctions.
+  bool HasFSGSBase;
+
+  /// HasLZCNT - Processor has LZCNT instruction.
+  bool HasLZCNT;
+
+  /// HasBMI - Processor has BMI1 instructions.
+  bool HasBMI;
+
+  /// HasBMI2 - Processor has BMI2 instructions.
+  bool HasBMI2;
+
+  /// HasRTM - Processor has RTM instructions.
+  bool HasRTM;
+
+  /// HasHLE - Processor has HLE.
+  bool HasHLE;
+
+  /// HasADX - Processor has ADX instructions.
+  bool HasADX;
+
+  /// HasPRFCHW - Processor has PRFCHW instructions.
+  bool HasPRFCHW;
+
+  /// HasRDSEED - Processor has RDSEED instructions.
+  bool HasRDSEED;
+
+  /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
+  bool IsBTMemSlow;
+
+  /// IsUAMemFast - True if unaligned memory access is fast.
+  bool IsUAMemFast;
+
+  /// HasVectorUAMem - True if SIMD operations can have unaligned memory
+  /// operands. This may require setting a feature bit in the processor.
+  bool HasVectorUAMem;
+
+  /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
+  /// this is true for most x86-64 chips, but not the first AMD chips.
+  bool HasCmpxchg16b;
+
+  /// UseLeaForSP - True if the LEA instruction should be used for adjusting
+  /// the stack pointer. This is an optimization for Intel Atom processors.
+  bool UseLeaForSP;
+
+  /// HasSlowDivide - True if smaller divides are significantly faster than
+  /// full divides and should be used when possible.
+  bool HasSlowDivide;
+
+  /// PostRAScheduler - True if using post-register-allocation scheduler.
+  bool PostRAScheduler;
+
+  /// PadShortFunctions - True if the short functions should be padded to prevent
+  /// a stall when returning too early.
+  bool PadShortFunctions;
+
+  /// CallRegIndirect - True if the Calls with memory reference should be converted
+  /// to a register-based indirect call.
+  bool CallRegIndirect;
+
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned stackAlignment;
+
+  /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+  ///
+  unsigned MaxInlineSizeThreshold;
+
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
+
+  /// Instruction itineraries for scheduling
+  InstrItineraryData InstrItins;
+
+private:
+  /// StackAlignOverride - Override the stack alignment.
+  unsigned StackAlignOverride;
+
+  /// In64BitMode - True if compiling for 64-bit, false for 32-bit.
+  bool In64BitMode;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  X86Subtarget(const std::string &TT, const std::string &CPU,
+               const std::string &FS,
+               unsigned StackAlignOverride, bool is64Bit);
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
+  /// instruction.
+  void AutoDetectSubtargetFeatures();
+
+  /// \brief Reset the features for the X86 target.
+  virtual void resetSubtargetFeatures(const MachineFunction *MF);
+private:
+  void initializeEnvironment();
+  void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+public:
+  /// Is this x86_64? (disregarding specific ABI / programming model)
+  bool is64Bit() const {
+    return In64BitMode;
+  }
+
+  /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
+  bool isTarget64BitILP32() const {
+    return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32);
+  }
+
+  /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
+  bool isTarget64BitLP64() const {
+    return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32);
+  }
+
+  PICStyles::Style getPICStyle() const { return PICStyle; }
+  void setPICStyle(PICStyles::Style Style)  { PICStyle = Style; }
+
+  bool hasCMov() const { return HasCMov; }
+  bool hasMMX() const { return X86SSELevel >= MMX; }
+  bool hasSSE1() const { return X86SSELevel >= SSE1; }
+  bool hasSSE2() const { return X86SSELevel >= SSE2; }
+  bool hasSSE3() const { return X86SSELevel >= SSE3; }
+  bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+  bool hasSSE41() const { return X86SSELevel >= SSE41; }
+  bool hasSSE42() const { return X86SSELevel >= SSE42; }
+  bool hasAVX() const { return X86SSELevel >= AVX; }
+  bool hasAVX2() const { return X86SSELevel >= AVX2; }
+  bool hasFp256() const { return hasAVX(); }
+  bool hasInt256() const { return hasAVX2(); }
+  bool hasSSE4A() const { return HasSSE4A; }
+  bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+  bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+  bool hasPOPCNT() const { return HasPOPCNT; }
+  bool hasAES() const { return HasAES; }
+  bool hasPCLMUL() const { return HasPCLMUL; }
+  bool hasFMA() const { return HasFMA; }
+  // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
+  bool hasFMA4() const { return HasFMA4 && !HasFMA; }
+  bool hasXOP() const { return HasXOP; }
+  bool hasMOVBE() const { return HasMOVBE; }
+  bool hasRDRAND() const { return HasRDRAND; }
+  bool hasF16C() const { return HasF16C; }
+  bool hasFSGSBase() const { return HasFSGSBase; }
+  bool hasLZCNT() const { return HasLZCNT; }
+  bool hasBMI() const { return HasBMI; }
+  bool hasBMI2() const { return HasBMI2; }
+  bool hasRTM() const { return HasRTM; }
+  bool hasHLE() const { return HasHLE; }
+  bool hasADX() const { return HasADX; }
+  bool hasPRFCHW() const { return HasPRFCHW; }
+  bool hasRDSEED() const { return HasRDSEED; }
+  bool isBTMemSlow() const { return IsBTMemSlow; }
+  bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
+  bool hasVectorUAMem() const { return HasVectorUAMem; }
+  bool hasCmpxchg16b() const { return HasCmpxchg16b; }
+  bool useLeaForSP() const { return UseLeaForSP; }
+  bool hasSlowDivide() const { return HasSlowDivide; }
+  bool padShortFunctions() const { return PadShortFunctions; }
+  bool callRegIndirect() const { return CallRegIndirect; }
+
+  bool isAtom() const { return X86ProcFamily == IntelAtom; }
+
+  const Triple &getTargetTriple() const { return TargetTriple; }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+  bool isTargetFreeBSD() const {
+    return TargetTriple.getOS() == Triple::FreeBSD;
+  }
+  bool isTargetSolaris() const {
+    return TargetTriple.getOS() == Triple::Solaris;
+  }
+  bool isTargetELF() const {
+    return (TargetTriple.getEnvironment() == Triple::ELF ||
+            TargetTriple.isOSBinFormatELF());
+  }
+  bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+  bool isTargetNaCl() const {
+    return TargetTriple.getOS() == Triple::NaCl;
+  }
+  bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
+  bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
+  bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; }
+  bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; }
+  bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; }
+  bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
+  bool isTargetCOFF() const {
+    return (TargetTriple.getEnvironment() != Triple::ELF &&
+            TargetTriple.isOSBinFormatCOFF());
+  }
+  bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); }
+
+  bool isTargetWin64() const {
+    // FIXME: x86_64-cygwin has not been released yet.
+    return In64BitMode && TargetTriple.isOSWindows();
+  }
+
+  bool isTargetWin32() const {
+    // FIXME: Cygwin is included for isTargetWin64 -- should it be included
+    // here too?
+    return !In64BitMode && (isTargetMingw() || isTargetWindows());
+  }
+
+  bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
+  bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+  bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+
+  bool isPICStyleStubPIC() const {
+    return PICStyle == PICStyles::StubPIC;
+  }
+
+  bool isPICStyleStubNoDynamic() const {
+    return PICStyle == PICStyles::StubDynamicNoPIC;
+  }
+  bool isPICStyleStubAny() const {
+    return PICStyle == PICStyles::StubDynamicNoPIC ||
+           PICStyle == PICStyles::StubPIC; }
+
+  /// ClassifyGlobalReference - Classify a global variable reference for the
+  /// current subtarget according to how we should reference it in a non-pcrel
+  /// context.
+  unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM)const;
+
+  /// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+  /// current subtarget according to how we should reference it in a non-pcrel
+  /// context.
+  unsigned char ClassifyBlockAddressReference() const;
+
+  /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+  /// to immediate address.
+  bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+
+  /// This function returns the name of a function which has an interface
+  /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered prefereable over
+  /// memset with zero passed as the second argument. Otherwise it
+  /// returns null.
+  const char *getBZeroEntry() const;
+  
+  /// This function returns true if the target has sincos() routine in its
+  /// compiler runtime or math libraries.
+  bool hasSinCos() const;
+
+  /// enablePostRAScheduler - run for Atom optimization.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+
+  bool postRAScheduler() const { return PostRAScheduler; }
+
+  /// getInstrItins = Return the instruction itineraries based on the
+  /// subtarget selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 000000000000..8aa58a204260
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,227 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeX86Target() {
+  // Register the target.
+  RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
+  RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
+}
+
+void X86_32TargetMachine::anchor() { }
+
+X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         Reloc::Model RM, CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+  : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false),
+    DL(getSubtargetImpl()->isTargetDarwin() ?
+               "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
+               "n8:16:32-S128" :
+               (getSubtargetImpl()->isTargetCygMing() ||
+                getSubtargetImpl()->isTargetWindows()) ?
+               "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-"
+               "n8:16:32-S32" :
+               "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-"
+               "n8:16:32-S128"),
+    InstrInfo(*this),
+    TLInfo(*this),
+    TSInfo(*this),
+    JITInfo(*this) {
+}
+
+void X86_64TargetMachine::anchor() { }
+
+X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
+                                         StringRef CPU, StringRef FS,
+                                         const TargetOptions &Options,
+                                         Reloc::Model RM, CodeModel::Model CM,
+                                         CodeGenOpt::Level OL)
+  : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true),
+    // The x32 ABI dictates the ILP32 programming model for x64.
+    DL(getSubtargetImpl()->isTarget64BitILP32() ?
+        "e-p:32:32-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+        "n8:16:32:64-S128" :
+        "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+        "n8:16:32:64-S128"),
+    InstrInfo(*this),
+    TLInfo(*this),
+    TSInfo(*this),
+    JITInfo(*this) {
+}
+
+/// X86TargetMachine ctor - Create an X86 target.
+///
+X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
+                                   StringRef CPU, StringRef FS,
+                                   const TargetOptions &Options,
+                                   Reloc::Model RM, CodeModel::Model CM,
+                                   CodeGenOpt::Level OL,
+                                   bool is64Bit)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit),
+    FrameLowering(*this, Subtarget),
+    InstrItins(Subtarget.getInstrItineraryData()){
+  // Determine the PICStyle based on the target selected.
+  if (getRelocationModel() == Reloc::Static) {
+    // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
+    Subtarget.setPICStyle(PICStyles::None);
+  } else if (Subtarget.is64Bit()) {
+    // PIC in 64 bit mode is always rip-rel.
+    Subtarget.setPICStyle(PICStyles::RIPRel);
+  } else if (Subtarget.isTargetCygMing()) {
+    Subtarget.setPICStyle(PICStyles::None);
+  } else if (Subtarget.isTargetDarwin()) {
+    if (getRelocationModel() == Reloc::PIC_)
+      Subtarget.setPICStyle(PICStyles::StubPIC);
+    else {
+      assert(getRelocationModel() == Reloc::DynamicNoPIC);
+      Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC);
+    }
+  } else if (Subtarget.isTargetELF()) {
+    Subtarget.setPICStyle(PICStyles::GOT);
+  }
+
+  // default to hard float ABI
+  if (Options.FloatABIType == FloatABI::Default)
+    this->Options.FloatABIType = FloatABI::Hard;
+}
+
+//===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper",
+  cl::desc("Minimize AVX to SSE transition penalty"),
+  cl::init(true));
+
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt",
+	       cl::desc("Enable early if-conversion on X86"));
+
+//===----------------------------------------------------------------------===//
+// X86 Analysis Pass Setup
+//===----------------------------------------------------------------------===//
+
+void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our X86 pass. This
+  // allows the X86 pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+  PM.add(createX86TargetTransformInfoPass(this));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// X86 Code Generator Pass Configuration Options.
+class X86PassConfig : public TargetPassConfig {
+public:
+  X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  X86TargetMachine &getX86TargetMachine() const {
+    return getTM<X86TargetMachine>();
+  }
+
+  const X86Subtarget &getX86Subtarget() const {
+    return *getX86TargetMachine().getSubtargetImpl();
+  }
+
+  virtual bool addInstSelector();
+  virtual bool addILPOpts();
+  virtual bool addPreRegAlloc();
+  virtual bool addPostRegAlloc();
+  virtual bool addPreEmitPass();
+};
+} // namespace
+
+TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new X86PassConfig(this, PM);
+}
+
+bool X86PassConfig::addInstSelector() {
+  // Install an instruction selector.
+  addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
+
+  // For ELF, cleanup any local-dynamic TLS accesses.
+  if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+    addPass(createCleanupLocalDynamicTLSPass());
+
+  // For 32-bit, prepend instructions to set the "global base reg" for PIC.
+  if (!getX86Subtarget().is64Bit())
+    addPass(createGlobalBaseRegPass());
+
+  return false;
+}
+
+bool X86PassConfig::addILPOpts() {
+  if (X86EarlyIfConv && getX86Subtarget().hasCMov()) {
+    addPass(&EarlyIfConverterID);
+    return true;
+  }
+  return false;
+}
+
+bool X86PassConfig::addPreRegAlloc() {
+  return false;  // -print-machineinstr shouldn't print after this.
+}
+
+bool X86PassConfig::addPostRegAlloc() {
+  addPass(createX86FloatingPointStackifierPass());
+  return true;  // -print-machineinstr should print after this.
+}
+
+bool X86PassConfig::addPreEmitPass() {
+  bool ShouldPrint = false;
+  if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) {
+    addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
+    ShouldPrint = true;
+  }
+
+  if (getX86Subtarget().hasAVX() && UseVZeroUpper) {
+    addPass(createX86IssueVZeroUpperPass());
+    ShouldPrint = true;
+  }
+
+  if (getOptLevel() != CodeGenOpt::None &&
+      getX86Subtarget().padShortFunctions()) {
+    addPass(createX86PadShortFunctions());
+    ShouldPrint = true;
+  }
+
+  return ShouldPrint;
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                      JITCodeEmitter &JCE) {
+  PM.add(createX86JITCodeEmitterPass(*this, JCE));
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 000000000000..174d3918318d
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,137 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+
+#include "X86.h"
+#include "X86FrameLowering.h"
+#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86SelectionDAGInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class StringRef;
+
+class X86TargetMachine : public LLVMTargetMachine {
+  X86Subtarget       Subtarget;
+  X86FrameLowering   FrameLowering;
+  InstrItineraryData InstrItins;
+
+public:
+  X86TargetMachine(const Target &T, StringRef TT,
+                   StringRef CPU, StringRef FS, const TargetOptions &Options,
+                   Reloc::Model RM, CodeModel::Model CM,
+                   CodeGenOpt::Level OL,
+                   bool is64Bit);
+
+  virtual const X86InstrInfo     *getInstrInfo() const {
+    llvm_unreachable("getInstrInfo not implemented");
+  }
+  virtual const TargetFrameLowering  *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  virtual       X86JITInfo       *getJITInfo()         {
+    llvm_unreachable("getJITInfo not implemented");
+  }
+  virtual const X86Subtarget     *getSubtargetImpl() const{ return &Subtarget; }
+  virtual const X86TargetLowering *getTargetLowering() const {
+    llvm_unreachable("getTargetLowering not implemented");
+  }
+  virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+    llvm_unreachable("getSelectionDAGInfo not implemented");
+  }
+  virtual const X86RegisterInfo  *getRegisterInfo() const {
+    return &getInstrInfo()->getRegisterInfo();
+  }
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return &InstrItins;
+  }
+
+  /// \brief Register X86 analysis passes with a pass manager.
+  virtual void addAnalysisPasses(PassManagerBase &PM);
+
+  // Set up the pass pipeline.
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+  virtual bool addCodeEmitter(PassManagerBase &PM,
+                              JITCodeEmitter &JCE);
+};
+
+/// X86_32TargetMachine - X86 32-bit target machine.
+///
+class X86_32TargetMachine : public X86TargetMachine {
+  virtual void anchor();
+  const DataLayout  DL; // Calculates type size & alignment
+  X86InstrInfo      InstrInfo;
+  X86TargetLowering TLInfo;
+  X86SelectionDAGInfo TSInfo;
+  X86JITInfo        JITInfo;
+public:
+  X86_32TargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
+                      Reloc::Model RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+  virtual const DataLayout *getDataLayout() const { return &DL; }
+  virtual const X86TargetLowering *getTargetLowering() const {
+    return &TLInfo;
+  }
+  virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+  virtual const X86InstrInfo     *getInstrInfo() const {
+    return &InstrInfo;
+  }
+  virtual       X86JITInfo       *getJITInfo()         {
+    return &JITInfo;
+  }
+};
+
+/// X86_64TargetMachine - X86 64-bit target machine.
+///
+class X86_64TargetMachine : public X86TargetMachine {
+  virtual void anchor();
+  const DataLayout  DL; // Calculates type size & alignment
+  X86InstrInfo      InstrInfo;
+  X86TargetLowering TLInfo;
+  X86SelectionDAGInfo TSInfo;
+  X86JITInfo        JITInfo;
+public:
+  X86_64TargetMachine(const Target &T, StringRef TT,
+                      StringRef CPU, StringRef FS, const TargetOptions &Options,
+                      Reloc::Model RM, CodeModel::Model CM,
+                      CodeGenOpt::Level OL);
+  virtual const DataLayout *getDataLayout() const { return &DL; }
+  virtual const X86TargetLowering *getTargetLowering() const {
+    return &TLInfo;
+  }
+  virtual const X86SelectionDAGInfo *getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+  virtual const X86InstrInfo     *getInstrInfo() const {
+    return &InstrInfo;
+  }
+  virtual       X86JITInfo       *getJITInfo()         {
+    return &JITInfo;
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 000000000000..871dacd6a1c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,49 @@
+//===-- X86TargetObjectFile.cpp - X86 Object Info -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+using namespace dwarf;
+
+const MCExpr *X86_64MachoTargetObjectFile::
+getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI, unsigned Encoding,
+                        MCStreamer &Streamer) const {
+
+  // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
+  // is an indirect pc-relative reference.
+  if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+    const MCSymbol *Sym = Mang->getSymbol(GV);
+    const MCExpr *Res =
+      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
+    const MCExpr *Four = MCConstantExpr::Create(4, getContext());
+    return MCBinaryExpr::CreateAdd(Res, Four, getContext());
+  }
+
+  return TargetLoweringObjectFileMachO::
+    getTTypeGlobalReference(GV, Mang, MMI, Encoding, Streamer);
+}
+
+MCSymbol *X86_64MachoTargetObjectFile::
+getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                        MachineModuleInfo *MMI) const {
+  return Mang->getSymbol(GV);
+}
+
+void
+X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 000000000000..9d26d389d4de
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,43 @@
+//===-- X86TargetObjectFile.h - X86 Object Info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H
+#define LLVM_TARGET_X86_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+  /// X86_64MachoTargetObjectFile - This TLOF implementation is used for Darwin
+  /// x86-64.
+  class X86_64MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+  public:
+    virtual const MCExpr *
+    getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                            MachineModuleInfo *MMI, unsigned Encoding,
+                            MCStreamer &Streamer) const;
+
+    // getCFIPersonalitySymbol - The symbol that gets passed to
+    // .cfi_personality.
+    virtual MCSymbol *
+    getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
+                            MachineModuleInfo *MMI) const;
+  };
+
+  /// X86LinuxTargetObjectFile - This implementation is used for linux x86
+  /// and x86-64.
+  class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+    virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
new file mode 100644
index 000000000000..a98c6991192c
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -0,0 +1,495 @@
+//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86tti"
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't havve a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeX86TTIPass(PassRegistry &);
+}
+
+namespace {
+
+class X86TTI : public ImmutablePass, public TargetTransformInfo {
+  const X86TargetMachine *TM;
+  const X86Subtarget *ST;
+  const X86TargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  X86TTI(const X86TargetMachine *TM)
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
+    initializeX86TTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() {
+    pushTTIStack(this);
+  }
+
+  virtual void finalizePass() {
+    popTTIStack();
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  virtual unsigned getNumberOfRegisters(bool Vector) const;
+  virtual unsigned getRegisterBitWidth(bool Vector) const;
+  virtual unsigned getMaximumUnrollFactor() const;
+  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                          OperandValueKind,
+                                          OperandValueKind) const;
+  virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+                                  int Index, Type *SubTp) const;
+  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const;
+  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) const;
+  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) const;
+  virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Alignment,
+                                   unsigned AddressSpace) const;
+
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
+                   "X86 Target Transform Info", true, true, false)
+char X86TTI::ID = 0;
+
+ImmutablePass *
+llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
+  return new X86TTI(TM);
+}
+
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  // TODO: Currently the __builtin_popcount() implementation using SSE3
+  //   instructions is inefficient. Once the problem is fixed, we should
+  //   call ST->hasSSE3() instead of ST->hasSSE4().
+  return ST->hasSSE41() ? PSK_FastHardware : PSK_Software;
+}
+
+unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
+  if (Vector && !ST->hasSSE1())
+    return 0;
+
+  if (ST->is64Bit())
+    return 16;
+  return 8;
+}
+
+unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
+  if (Vector) {
+    if (ST->hasAVX()) return 256;
+    if (ST->hasSSE1()) return 128;
+    return 0;
+  }
+
+  if (ST->is64Bit())
+    return 64;
+  return 32;
+
+}
+
+unsigned X86TTI::getMaximumUnrollFactor() const {
+  if (ST->isAtom())
+    return 1;
+
+  // Sandybridge and Haswell have multiple execution ports and pipelined
+  // vector units.
+  if (ST->hasAVX())
+    return 4;
+
+  return 2;
+}
+
+unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                        OperandValueKind Op1Info,
+                                        OperandValueKind Op2Info) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  static const CostTblEntry<MVT> AVX2CostTable[] = {
+    // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
+    // customize them to detect the cases where shift amount is a scalar one.
+    { ISD::SHL,     MVT::v4i32,    1 },
+    { ISD::SRL,     MVT::v4i32,    1 },
+    { ISD::SRA,     MVT::v4i32,    1 },
+    { ISD::SHL,     MVT::v8i32,    1 },
+    { ISD::SRL,     MVT::v8i32,    1 },
+    { ISD::SRA,     MVT::v8i32,    1 },
+    { ISD::SHL,     MVT::v2i64,    1 },
+    { ISD::SRL,     MVT::v2i64,    1 },
+    { ISD::SHL,     MVT::v4i64,    1 },
+    { ISD::SRL,     MVT::v4i64,    1 },
+
+    { ISD::SHL,  MVT::v32i8,  42 }, // cmpeqb sequence.
+    { ISD::SHL,  MVT::v16i16,  16*10 }, // Scalarized.
+
+    { ISD::SRL,  MVT::v32i8,  32*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v16i16,  8*10 }, // Scalarized.
+
+    { ISD::SRA,  MVT::v32i8,  32*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v16i16,  16*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v4i64,  4*10 }, // Scalarized.
+  };
+
+  // Look for AVX2 lowering tricks.
+  if (ST->hasAVX2()) {
+    int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable),
+                                   ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX2CostTable[Idx].Cost;
+  }
+
+  static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = {
+    // We don't correctly identify costs of casts because they are marked as
+    // custom.
+    // Constant splats are cheaper for the following instructions.
+    { ISD::SHL,  MVT::v16i8,  1 }, // psllw.
+    { ISD::SHL,  MVT::v8i16,  1 }, // psllw.
+    { ISD::SHL,  MVT::v4i32,  1 }, // pslld
+    { ISD::SHL,  MVT::v2i64,  1 }, // psllq.
+
+    { ISD::SRL,  MVT::v16i8,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v8i16,  1 }, // psrlw.
+    { ISD::SRL,  MVT::v4i32,  1 }, // psrld.
+    { ISD::SRL,  MVT::v2i64,  1 }, // psrlq.
+
+    { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
+    { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
+    { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasSSE2()) {
+    int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable,
+                                   array_lengthof(SSE2UniformConstCostTable),
+                                   ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
+  }
+
+
+  static const CostTblEntry<MVT> SSE2CostTable[] = {
+    // We don't correctly identify costs of casts because they are marked as
+    // custom.
+    // For some cases, where the shift amount is a scalar we would be able
+    // to generate better code. Unfortunately, when this is the case the value
+    // (the splat) will get hoisted out of the loop, thereby making it invisible
+    // to ISel. The cost model must return worst case assumptions because it is
+    // used for vectorization and we don't want to make vectorized code worse
+    // than scalar code.
+    { ISD::SHL,  MVT::v16i8,  30 }, // cmpeqb sequence.
+    { ISD::SHL,  MVT::v8i16,  8*10 }, // Scalarized.
+    { ISD::SHL,  MVT::v4i32,  2*5 }, // We optimized this using mul.
+    { ISD::SHL,  MVT::v2i64,  2*10 }, // Scalarized.
+
+    { ISD::SRL,  MVT::v16i8,  16*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v8i16,  8*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v4i32,  4*10 }, // Scalarized.
+    { ISD::SRL,  MVT::v2i64,  2*10 }, // Scalarized.
+
+    { ISD::SRA,  MVT::v16i8,  16*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v8i16,  8*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v4i32,  4*10 }, // Scalarized.
+    { ISD::SRA,  MVT::v2i64,  2*10 }, // Scalarized.
+  };
+
+  if (ST->hasSSE2()) {
+    int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable),
+                                   ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2CostTable[Idx].Cost;
+  }
+
+  static const CostTblEntry<MVT> AVX1CostTable[] = {
+    // We don't have to scalarize unsupported ops. We can issue two half-sized
+    // operations and we only need to extract the upper YMM half.
+    // Two ops + 1 extract + 1 insert = 4.
+    { ISD::MUL,     MVT::v8i32,    4 },
+    { ISD::SUB,     MVT::v8i32,    4 },
+    { ISD::ADD,     MVT::v8i32,    4 },
+    { ISD::SUB,     MVT::v4i64,    4 },
+    { ISD::ADD,     MVT::v4i64,    4 },
+    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+    // are lowered as a series of long multiplies(3), shifts(4) and adds(2)
+    // Because we believe v4i64 to be a legal type, we must also include the
+    // split factor of two in the cost table. Therefore, the cost here is 18
+    // instead of 9.
+    { ISD::MUL,     MVT::v4i64,    18 },
+  };
+
+  // Look for AVX1 lowering tricks.
+  if (ST->hasAVX() && !ST->hasAVX2()) {
+    int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
+                                   ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX1CostTable[Idx].Cost;
+  }
+
+  // Custom lowering of vectors.
+  static const CostTblEntry<MVT> CustomLowered[] = {
+    // A v2i64/v4i64 and multiply is custom lowered as a series of long
+    // multiplies(3), shifts(4) and adds(2).
+    { ISD::MUL,     MVT::v2i64,    9 },
+    { ISD::MUL,     MVT::v4i64,    9 },
+  };
+  int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
+                                 ISD, LT.second);
+  if (Idx != -1)
+    return LT.first * CustomLowered[Idx].Cost;
+
+  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+  // 2x pmuludq, 2x shuffle.
+  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+      !ST->hasSSE41())
+    return 6;
+
+  // Fallback to the default implementation.
+  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
+                                                     Op2Info);
+}
+
+unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+                                Type *SubTp) const {
+  // We only estimate the cost of reverse shuffles.
+  if (Kind != SK_Reverse)
+    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+  unsigned Cost = 1;
+  if (LT.second.getSizeInBits() > 128)
+    Cost = 3; // Extract + insert + copy.
+
+  // Multiple by the number of parts.
+  return Cost * LT.first;
+}
+
+unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+  static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+    { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 1 },
+    { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32, 1 },
+
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
+    { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i1,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i8,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i1,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i8,  3 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i16, 3 },
+    { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i32, 1 },
+
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i1,  6 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i8,  5 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i16, 5 },
+    { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i1,  7 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i8,  2 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i1,  7 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i8,  2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i16, 2 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
+
+    { ISD::FP_TO_SINT,  MVT::v8i8,  MVT::v8f32, 1 },
+    { ISD::FP_TO_SINT,  MVT::v4i8,  MVT::v4f32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1,  6 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1,  9 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1,  8 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8,  6 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+    { ISD::TRUNCATE,    MVT::v8i32, MVT::v8i64, 3 },
+  };
+
+  if (ST->hasAVX()) {
+    int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl,
+                                 array_lengthof(AVXConversionTbl),
+                                 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    if (Idx != -1)
+      return AVXConversionTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                    Type *CondTy) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+
+  MVT MTy = LT.second;
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  static const CostTblEntry<MVT> SSE42CostTbl[] = {
+    { ISD::SETCC,   MVT::v2f64,   1 },
+    { ISD::SETCC,   MVT::v4f32,   1 },
+    { ISD::SETCC,   MVT::v2i64,   1 },
+    { ISD::SETCC,   MVT::v4i32,   1 },
+    { ISD::SETCC,   MVT::v8i16,   1 },
+    { ISD::SETCC,   MVT::v16i8,   1 },
+  };
+
+  static const CostTblEntry<MVT> AVX1CostTbl[] = {
+    { ISD::SETCC,   MVT::v4f64,   1 },
+    { ISD::SETCC,   MVT::v8f32,   1 },
+    // AVX1 does not support 8-wide integer compare.
+    { ISD::SETCC,   MVT::v4i64,   4 },
+    { ISD::SETCC,   MVT::v8i32,   4 },
+    { ISD::SETCC,   MVT::v16i16,  4 },
+    { ISD::SETCC,   MVT::v32i8,   4 },
+  };
+
+  static const CostTblEntry<MVT> AVX2CostTbl[] = {
+    { ISD::SETCC,   MVT::v4i64,   1 },
+    { ISD::SETCC,   MVT::v8i32,   1 },
+    { ISD::SETCC,   MVT::v16i16,  1 },
+    { ISD::SETCC,   MVT::v32i8,   1 },
+  };
+
+  if (ST->hasAVX2()) {
+    int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX2CostTbl[Idx].Cost;
+  }
+
+  if (ST->hasAVX()) {
+    int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX1CostTbl[Idx].Cost;
+  }
+
+  if (ST->hasSSE42()) {
+    int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+    if (Idx != -1)
+      return LT.first * SSE42CostTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                    unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+    // This type is legalized to a scalar type.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // Floating point scalars are already located in index #0.
+    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+      return 0;
+  }
+
+  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+         "Invalid Opcode");
+
+  // Each load/store unit costs 1.
+  unsigned Cost = LT.first * 1;
+
+  // On Sandybridge 256bit load/stores are double pumped
+  // (but not on Haswell).
+  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
+    Cost*=2;
+
+  return Cost;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 000000000000..0f77948c0eff
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,293 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids transition latency
+// penalty when tranfering control between AVX encoded instructions and old
+// SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-vzeroupper"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+  struct VZeroUpperInserter : public MachineFunctionPass {
+    static char ID;
+    VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    virtual const char *getPassName() const { return "X86 vzeroupper inserter";}
+
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+
+    // Any YMM register live-in to this function?
+    bool FnHasLiveInYmm;
+
+    // BBState - Contains the state of each MBB: unknown, clean, dirty
+    SmallVector<uint8_t, 8> BBState;
+
+    // BBSolved - Keep track of all MBB which had been already analyzed
+    // and there is no further processing required.
+    BitVector BBSolved;
+
+    // Machine Basic Blocks are classified according this pass:
+    //
+    //  ST_UNKNOWN - The MBB state is unknown, meaning from the entry state
+    //    until the MBB exit there isn't a instruction using YMM to change
+    //    the state to dirty, or one of the incoming predecessors is unknown
+    //    and there's not a dirty predecessor between them.
+    //
+    //  ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have
+    //    instructions using YMM and be marked ST_CLEAN, as long as the state
+    //    is cleaned by a vzeroupper before any call.
+    //
+    //  ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a
+    //    vzeroupper instruction.
+    //
+    //  ST_INIT - Placeholder for an empty state set
+    //
+    enum {
+      ST_UNKNOWN = 0,
+      ST_CLEAN   = 1,
+      ST_DIRTY   = 2,
+      ST_INIT    = 3
+    };
+
+    // computeState - Given two states, compute the resulting state, in
+    // the following way
+    //
+    //  1) One dirty state yields another dirty state
+    //  2) All states must be clean for the result to be clean
+    //  3) If none above and one unknown, the result state is also unknown
+    //
+    static unsigned computeState(unsigned PrevState, unsigned CurState) {
+      if (PrevState == ST_INIT)
+        return CurState;
+
+      if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
+        return ST_DIRTY;
+
+      if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
+        return ST_CLEAN;
+
+      return ST_UNKNOWN;
+    }
+
+  };
+  char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+  return new VZeroUpperInserter();
+}
+
+static bool isYmmReg(unsigned Reg) {
+  if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
+    return true;
+
+  return false;
+}
+
+static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+  for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
+       E = MRI.livein_end(); I != E; ++I)
+    if (isYmmReg(I->first))
+      return true;
+
+  return false;
+}
+
+static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+  for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
+    if (!MO.clobbersPhysReg(reg))
+      return false;
+  }
+  return true;
+}
+
+static bool hasYmmReg(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+      return true;
+    if (!MO.isReg())
+      continue;
+    if (MO.isDebug())
+      continue;
+    if (isYmmReg(MO.getReg()))
+      return true;
+  }
+  return false;
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzero upper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool EverMadeChange = false;
+
+  // Fast check: if the function doesn't use any ymm registers, we don't need
+  // to insert any VZEROUPPER instructions.  This is constant-time, so it is
+  // cheap in the common case of no ymm use.
+  bool YMMUsed = false;
+  const TargetRegisterClass *RC = &X86::VR256RegClass;
+  for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
+       i != e; i++) {
+    if (!MRI.reg_nodbg_empty(*i)) {
+      YMMUsed = true;
+      break;
+    }
+  }
+  if (!YMMUsed)
+    return EverMadeChange;
+
+  // Pre-compute the existence of any live-in YMM registers to this function
+  FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
+
+  assert(BBState.empty());
+  BBState.resize(MF.getNumBlockIDs(), 0);
+  BBSolved.resize(MF.getNumBlockIDs(), 0);
+
+  // Each BB state depends on all predecessors, loop over until everything
+  // converges.  (Once we converge, we can implicitly mark everything that is
+  // still ST_UNKNOWN as ST_CLEAN.)
+  while (1) {
+    bool MadeChange = false;
+
+    // Process all basic blocks.
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      MadeChange |= processBasicBlock(MF, *I);
+
+    // If this iteration over the code changed anything, keep iterating.
+    if (!MadeChange) break;
+    EverMadeChange = true;
+  }
+
+  BBState.clear();
+  BBSolved.clear();
+  return EverMadeChange;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// inserting vzero upper instructions before function calls.
+bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
+                                           MachineBasicBlock &BB) {
+  bool Changed = false;
+  unsigned BBNum = BB.getNumber();
+
+  // Don't process already solved BBs
+  if (BBSolved[BBNum])
+    return false; // No changes
+
+  // Check the state of all predecessors
+  unsigned EntryState = ST_INIT;
+  for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(),
+       PE = BB.pred_end(); PI != PE; ++PI) {
+    EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]);
+    if (EntryState == ST_DIRTY)
+      break;
+  }
+
+
+  // The entry MBB for the function may set the initial state to dirty if
+  // the function receives any YMM incoming arguments
+  if (&BB == MF.begin()) {
+    EntryState = ST_CLEAN;
+    if (FnHasLiveInYmm)
+      EntryState = ST_DIRTY;
+  }
+
+  // The current state is initialized according to the predecessors
+  unsigned CurState = EntryState;
+  bool BBHasCall = false;
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    DebugLoc dl = I->getDebugLoc();
+    bool isControlFlow = MI->isCall() || MI->isReturn();
+
+    // Shortcut: don't need to check regular instructions in dirty state.
+    if (!isControlFlow && CurState == ST_DIRTY)
+      continue;
+
+    if (hasYmmReg(MI)) {
+      // We found a ymm-using instruction; this could be an AVX instruction,
+      // or it could be control flow.
+      CurState = ST_DIRTY;
+      continue;
+    }
+
+    // Check for control-flow out of the current function (which might
+    // indirectly execute SSE instructions).
+    if (!isControlFlow)
+      continue;
+
+    BBHasCall = true;
+
+    // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
+    // registers. This instruction has zero latency. In addition, the processor
+    // changes back to Clean state, after which execution of Intel SSE
+    // instructions or Intel AVX instructions has no transition penalty. Add
+    // the VZEROUPPER instruction before any function call/return that might
+    // execute SSE code.
+    // FIXME: In some cases, we may want to move the VZEROUPPER into a
+    // predecessor block.
+    if (CurState == ST_DIRTY) {
+      // Only insert the VZEROUPPER in case the entry state isn't unknown.
+      // When unknown, only compute the information within the block to have
+      // it available in the exit if possible, but don't change the block.
+      if (EntryState != ST_UNKNOWN) {
+        BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
+        ++NumVZU;
+      }
+
+      // After the inserted VZEROUPPER the state becomes clean again, but
+      // other YMM may appear before other subsequent calls or even before
+      // the end of the BB.
+      CurState = ST_CLEAN;
+    }
+  }
+
+  DEBUG(dbgs() << "MBB #" << BBNum
+               << ", current state: " << CurState << '\n');
+
+  // A BB can only be considered solved when we both have done all the
+  // necessary transformations, and have computed the exit state.  This happens
+  // in two cases:
+  //  1) We know the entry state: this immediately implies the exit state and
+  //     all the necessary transformations.
+  //  2) There are no calls, and and a non-call instruction marks this block:
+  //     no transformations are necessary, and we know the exit state.
+  if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN))
+    BBSolved[BBNum] = true;
+
+  if (CurState != BBState[BBNum])
+    Changed = true;
+
+  BBState[BBNum] = CurState;
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
new file mode 100644
index 000000000000..7b99967c4f32
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -0,0 +1,800 @@
+//===- XCoreDisassembler.cpp - Disassembler for XCore -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file is part of the XCore Disassembler.
+///
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreRegisterInfo.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// \brief A disassembler class for XCore.
+class XCoreDisassembler : public MCDisassembler {
+  const MCRegisterInfo *RegInfo;
+public:
+  XCoreDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) :
+    MCDisassembler(STI), RegInfo(Info) {}
+
+  /// \brief See MCDisassembler.
+  virtual DecodeStatus getInstruction(MCInst &instr,
+                                      uint64_t &size,
+                                      const MemoryObject &region,
+                                      uint64_t address,
+                                      raw_ostream &vStream,
+                                      raw_ostream &cStream) const;
+
+  const MCRegisterInfo *getRegInfo() const { return RegInfo; }
+};
+}
+
+static bool readInstruction16(const MemoryObject &region,
+                              uint64_t address,
+                              uint64_t &size,
+                              uint16_t &insn) {
+  uint8_t Bytes[4];
+
+  // We want to read exactly 2 Bytes of data.
+  if (region.readBytes(address, 2, Bytes, NULL) == -1) {
+    size = 0;
+    return false;
+  }
+  // Encoded as a little-endian 16-bit word in the stream.
+  insn = (Bytes[0] <<  0) | (Bytes[1] <<  8);
+  return true;
+}
+
+static bool readInstruction32(const MemoryObject &region,
+                              uint64_t address,
+                              uint64_t &size,
+                              uint32_t &insn) {
+  uint8_t Bytes[4];
+
+  // We want to read exactly 4 Bytes of data.
+  if (region.readBytes(address, 4, Bytes, NULL) == -1) {
+    size = 0;
+    return false;
+  }
+  // Encoded as a little-endian 32-bit word in the stream.
+  insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) |
+         (Bytes[3] << 24);
+  return true;
+}
+
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const XCoreDisassembler *Dis = static_cast<const XCoreDisassembler*>(D);
+  return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo);
+}
+
+static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMEMiiOperand(MCInst &Inst, unsigned Val,
+                                       uint64_t Address, const void *Decoder);
+
+static DecodeStatus Decode2RInstruction(MCInst &Inst,
+                                        unsigned Insn,
+                                        uint64_t Address,
+                                        const void *Decoder);
+
+static DecodeStatus Decode2RImmInstruction(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeR2RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeRUSInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst,
+                                             unsigned Insn,
+                                             uint64_t Address,
+                                             const void *Decoder);
+
+static DecodeStatus DecodeRUSSrcDstBitpInstruction(MCInst &Inst,
+                                                   unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+
+static DecodeStatus DecodeL2RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeLR2RInstruction(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus Decode3RInstruction(MCInst &Inst,
+                                        unsigned Insn,
+                                        uint64_t Address,
+                                        const void *Decoder);
+
+static DecodeStatus Decode3RImmInstruction(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus Decode2RUSInstruction(MCInst &Inst,
+                                          unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus Decode2RUSBitpInstruction(MCInst &Inst,
+                                              unsigned Insn,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
+static DecodeStatus DecodeL3RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeL3RSrcDstInstruction(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeL2RUSInstruction(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeL2RUSBitpInstruction(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeL6RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeL5RInstruction(MCInst &Inst,
+                                         unsigned Insn,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeL4RSrcDstInstruction(MCInst &Inst,
+                                               unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst,
+                                                     unsigned Insn,
+                                                     uint64_t Address,
+                                                     const void *Decoder);
+
+#include "XCoreGenDisassemblerTables.inc"
+
+static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder)
+{
+  if (RegNo > 11)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRRegsRegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder)
+{
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, XCore::RRegsRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val,
+                                      uint64_t Address, const void *Decoder) {
+  if (Val > 11)
+    return MCDisassembler::Fail;
+  static unsigned Values[] = {
+    32 /*bpw*/, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24, 32
+  };
+  Inst.addOperand(MCOperand::CreateImm(Values[Val]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMEMiiOperand(MCInst &Inst, unsigned Val,
+                                       uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  Inst.addOperand(MCOperand::CreateImm(0));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+Decode2OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2) {
+  unsigned Combined = fieldFromInstruction(Insn, 6, 5);
+  if (Combined < 27)
+    return MCDisassembler::Fail;
+  if (fieldFromInstruction(Insn, 5, 1)) {
+    if (Combined == 31)
+      return MCDisassembler::Fail;
+    Combined += 5;
+  }
+  Combined -= 27;
+  unsigned Op1High = Combined % 3;
+  unsigned Op2High = Combined / 3;
+  Op1 = (Op1High << 2) | fieldFromInstruction(Insn, 2, 2);
+  Op2 = (Op2High << 2) | fieldFromInstruction(Insn, 0, 2);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+Decode3OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2,
+                     unsigned &Op3) {
+  unsigned Combined = fieldFromInstruction(Insn, 6, 5);
+  if (Combined >= 27)
+    return MCDisassembler::Fail;
+
+  unsigned Op1High = Combined % 3;
+  unsigned Op2High = (Combined / 3) % 3;
+  unsigned Op3High = Combined / 9;
+  Op1 = (Op1High << 2) | fieldFromInstruction(Insn, 4, 2);
+  Op2 = (Op2High << 2) | fieldFromInstruction(Insn, 2, 2);
+  Op3 = (Op3High << 2) | fieldFromInstruction(Insn, 0, 2);
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus
+Decode2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                         const void *Decoder) {
+  // Try and decode as a 3R instruction.
+  unsigned Opcode = fieldFromInstruction(Insn, 11, 5);
+  switch (Opcode) {
+  case 0x0:
+    Inst.setOpcode(XCore::STW_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x1:
+    Inst.setOpcode(XCore::LDW_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x2:
+    Inst.setOpcode(XCore::ADD_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x3:
+    Inst.setOpcode(XCore::SUB_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x4:
+    Inst.setOpcode(XCore::SHL_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x5:
+    Inst.setOpcode(XCore::SHR_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x6:
+    Inst.setOpcode(XCore::EQ_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x7:
+    Inst.setOpcode(XCore::AND_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x8:
+    Inst.setOpcode(XCore::OR_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x9:
+    Inst.setOpcode(XCore::LDW_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x10:
+    Inst.setOpcode(XCore::LD16S_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x11:
+    Inst.setOpcode(XCore::LD8U_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x12:
+    Inst.setOpcode(XCore::ADD_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x13:
+    Inst.setOpcode(XCore::SUB_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x14:
+    Inst.setOpcode(XCore::SHL_2rus);
+    return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x15:
+    Inst.setOpcode(XCore::SHR_2rus);
+    return Decode2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x16:
+    Inst.setOpcode(XCore::EQ_2rus);
+    return Decode2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x17:
+    Inst.setOpcode(XCore::TSETR_3r);
+    return Decode3RImmInstruction(Inst, Insn, Address, Decoder);
+  case 0x18:
+    Inst.setOpcode(XCore::LSS_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x19:
+    Inst.setOpcode(XCore::LSU_3r);
+    return Decode3RInstruction(Inst, Insn, Address, Decoder);
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                    const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+Decode2RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  Inst.addOperand(MCOperand::CreateImm(Op1));
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op2, Op1);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                          const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeRUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(Op2));
+  return S;
+}
+
+static DecodeStatus
+DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                         const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeBitpOperand(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return Decode2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeBitpOperand(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeL2OpInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                          const void *Decoder) {
+  // Try and decode as a L3R / L2RUS instruction.
+  unsigned Opcode = fieldFromInstruction(Insn, 16, 4) |
+                    fieldFromInstruction(Insn, 27, 5) << 4;
+  switch (Opcode) {
+  case 0x0c:
+    Inst.setOpcode(XCore::STW_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x1c:
+    Inst.setOpcode(XCore::XOR_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x2c:
+    Inst.setOpcode(XCore::ASHR_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x3c:
+    Inst.setOpcode(XCore::LDAWF_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x4c:
+    Inst.setOpcode(XCore::LDAWB_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x5c:
+    Inst.setOpcode(XCore::LDA16F_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x6c:
+    Inst.setOpcode(XCore::LDA16B_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x7c:
+    Inst.setOpcode(XCore::MUL_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x8c:
+    Inst.setOpcode(XCore::DIVS_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x9c:
+    Inst.setOpcode(XCore::DIVU_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x10c:
+    Inst.setOpcode(XCore::ST16_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x11c:
+    Inst.setOpcode(XCore::ST8_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x12c:
+    Inst.setOpcode(XCore::ASHR_l2rus);
+    return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x12d:
+    Inst.setOpcode(XCore::OUTPW_l2rus);
+    return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x12e:
+    Inst.setOpcode(XCore::INPW_l2rus);
+    return DecodeL2RUSBitpInstruction(Inst, Insn, Address, Decoder);
+  case 0x13c:
+    Inst.setOpcode(XCore::LDAWF_l2rus);
+    return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x14c:
+    Inst.setOpcode(XCore::LDAWB_l2rus);
+    return DecodeL2RUSInstruction(Inst, Insn, Address, Decoder);
+  case 0x15c:
+    Inst.setOpcode(XCore::CRC_l3r);
+    return DecodeL3RSrcDstInstruction(Inst, Insn, Address, Decoder);
+  case 0x18c:
+    Inst.setOpcode(XCore::REMS_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  case 0x19c:
+    Inst.setOpcode(XCore::REMU_l3r);
+    return DecodeL3RInstruction(Inst, Insn, Address, Decoder);
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+DecodeL2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16),
+                                        Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                               const void *Decoder) {
+  unsigned Op1, Op2;
+  DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16),
+                                        Op1, Op2);
+  if (S != MCDisassembler::Success)
+    return DecodeL2OpInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+Decode3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                    const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+Decode3RImmInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    Inst.addOperand(MCOperand::CreateImm(Op1));
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+Decode2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                      const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    Inst.addOperand(MCOperand::CreateImm(Op3));
+  }
+  return S;
+}
+
+static DecodeStatus
+Decode2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                      const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S = Decode3OpInstruction(Insn, Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeBitpOperand(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL3RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL3RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL2RUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                       const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    Inst.addOperand(MCOperand::CreateImm(Op3));
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL2RUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeBitpOperand(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL6RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2, Op3, Op4, Op5, Op6;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S != MCDisassembler::Success)
+    return S;
+  S = Decode3OpInstruction(fieldFromInstruction(Insn, 16, 16), Op4, Op5, Op6);
+  if (S != MCDisassembler::Success)
+    return S;
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op5, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op6, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeL5RInstructionFail(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  // Try and decode as a L6R instruction.
+  Inst.clear();
+  unsigned Opcode = fieldFromInstruction(Insn, 27, 5);
+  switch (Opcode) {
+  case 0x00:
+    Inst.setOpcode(XCore::LMUL_l6r);
+    return DecodeL6RInstruction(Inst, Insn, Address, Decoder);
+  }
+  return MCDisassembler::Fail;
+}
+
+static DecodeStatus
+DecodeL5RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                     const void *Decoder) {
+  unsigned Op1, Op2, Op3, Op4, Op5;
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S != MCDisassembler::Success)
+    return DecodeL5RInstructionFail(Inst, Insn, Address, Decoder);
+  S = Decode2OpInstruction(fieldFromInstruction(Insn, 16, 16), Op4, Op5);
+  if (S != MCDisassembler::Success)
+    return DecodeL5RInstructionFail(Inst, Insn, Address, Decoder);
+
+  DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  DecodeGRRegsRegisterClass(Inst, Op5, Address, Decoder);
+  return S;
+}
+
+static DecodeStatus
+DecodeL4RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                           const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  unsigned Op4 = fieldFromInstruction(Insn, 16, 4);
+  DecodeStatus S =
+    Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    S = DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  }
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+static DecodeStatus
+DecodeL4RSrcDstSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address,
+                                 const void *Decoder) {
+  unsigned Op1, Op2, Op3;
+  unsigned Op4 = fieldFromInstruction(Insn, 16, 4);
+  DecodeStatus S =
+  Decode3OpInstruction(fieldFromInstruction(Insn, 0, 16), Op1, Op2, Op3);
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    S = DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+  }
+  if (S == MCDisassembler::Success) {
+    DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op4, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder);
+    DecodeGRRegsRegisterClass(Inst, Op3, Address, Decoder);
+  }
+  return S;
+}
+
+MCDisassembler::DecodeStatus
+XCoreDisassembler::getInstruction(MCInst &instr,
+                                  uint64_t &Size,
+                                  const MemoryObject &Region,
+                                  uint64_t Address,
+                                  raw_ostream &vStream,
+                                  raw_ostream &cStream) const {
+  uint16_t insn16;
+
+  if (!readInstruction16(Region, Address, Size, insn16)) {
+    return Fail;
+  }
+
+  // Calling the auto-generated decoder function.
+  DecodeStatus Result = decodeInstruction(DecoderTable16, instr, insn16,
+                                          Address, this, STI);
+  if (Result != Fail) {
+    Size = 2;
+    return Result;
+  }
+
+  uint32_t insn32;
+
+  if (!readInstruction32(Region, Address, Size, insn32)) {
+    return Fail;
+  }
+
+  // Calling the auto-generated decoder function.
+  Result = decodeInstruction(DecoderTable32, instr, insn32, Address, this, STI);
+  if (Result != Fail) {
+    Size = 4;
+    return Result;
+  }
+
+  return Fail;
+}
+
+namespace llvm {
+  extern Target TheXCoreTarget;
+}
+
+static MCDisassembler *createXCoreDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI) {
+  return new XCoreDisassembler(STI, T.createMCRegInfo(""));
+}
+
+extern "C" void LLVMInitializeXCoreDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheXCoreTarget,
+                                         createXCoreDisassembler);
+}
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
new file mode 100644
index 000000000000..1592351c3861
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -0,0 +1,97 @@
+//===-- XCoreInstPrinter.cpp - Convert XCore MCInst to assembly syntax ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an XCore MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "XCoreInstPrinter.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#include "XCoreGenAsmWriter.inc"
+
+void XCoreInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  OS << StringRef(getRegisterName(RegNo)).lower();
+}
+
+void XCoreInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                 StringRef Annot) {
+  printInstruction(MI, O);
+  printAnnotation(O, Annot);
+}
+
+void XCoreInstPrinter::
+printInlineJT(const MCInst *MI, int opNum, raw_ostream &O) {
+  report_fatal_error("can't handle InlineJT");
+}
+
+void XCoreInstPrinter::
+printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O) {
+  report_fatal_error("can't handle InlineJT32");
+}
+
+static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
+  int Offset = 0;
+  const MCSymbolRefExpr *SRE;
+
+  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
+    SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
+    assert(SRE && CE && "Binary expression must be sym+const.");
+    Offset = CE->getValue();
+  } else {
+    SRE = dyn_cast<MCSymbolRefExpr>(Expr);
+    assert(SRE && "Unexpected MCExpr type.");
+  }
+  assert(SRE->getKind() == MCSymbolRefExpr::VK_None);
+
+  OS << SRE->getSymbol();
+
+  if (Offset) {
+    if (Offset > 0)
+      OS << '+';
+    OS << Offset;
+  }
+}
+
+void XCoreInstPrinter::
+printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    printRegName(O, Op.getReg());
+    return;
+  }
+
+  if (Op.isImm()) {
+    O << Op.getImm();
+    return;
+  }
+
+  assert(Op.isExpr() && "unknown operand kind in printOperand");
+  printExpr(Op.getExpr(), O);
+}
+
+void XCoreInstPrinter::
+printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
+  printOperand(MI, opNum, O);
+
+  if (MI->getOperand(opNum+1).isImm() && MI->getOperand(opNum+1).getImm() == 0)
+    return;
+
+  O << "+";
+  printOperand(MI, opNum+1, O);
+}
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
new file mode 100644
index 000000000000..772c515b5c9e
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -0,0 +1,44 @@
+//== XCoreInstPrinter.h - Convert XCore MCInst to assembly syntax -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains the declaration of the XCoreInstPrinter class,
+/// which is used to print XCore MCInst to a .s file.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREINSTPRINTER_H
+#define XCOREINSTPRINTER_H
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+
+class TargetMachine;
+
+class XCoreInstPrinter : public MCInstPrinter {
+public:
+  XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                  const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+private:
+  void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O);
+  void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
+};
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
new file mode 100644
index 000000000000..1cfdbda003b5
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -0,0 +1,32 @@
+//===-- XCoreMCAsmInfo.cpp - XCore asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCAsmInfo.h"
+#include "llvm/ADT/StringRef.h"
+using namespace llvm;
+
+void XCoreMCAsmInfo::anchor() { }
+
+XCoreMCAsmInfo::XCoreMCAsmInfo(const Target &T, StringRef TT) {
+  SupportsDebugInformation = true;
+  Data16bitsDirective = "\t.short\t";
+  Data32bitsDirective = "\t.long\t";
+  Data64bitsDirective = 0;
+  ZeroDirective = "\t.space\t";
+  CommentString = "#";
+    
+  PrivateGlobalPrefix = ".L";
+  AscizDirective = ".asciiz";
+  WeakDefDirective = "\t.weak\t";
+  WeakRefDirective = "\t.weak\t";
+
+  // Debug
+  HasLEB128 = true;
+}
+
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
new file mode 100644
index 000000000000..076777541e33
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -0,0 +1,31 @@
+//===-- XCoreMCAsmInfo.h - XCore asm properties ----------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the XCoreMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETASMINFO_H
+#define XCORETARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class StringRef;
+  class Target;
+
+  class XCoreMCAsmInfo : public MCAsmInfo {
+    virtual void anchor();
+  public:
+    explicit XCoreMCAsmInfo(const Target &T, StringRef TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
new file mode 100644
index 000000000000..b5b072dcbda6
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -0,0 +1,104 @@
+//===-- XCoreMCTargetDesc.cpp - XCore Target Descriptions -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides XCore specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCTargetDesc.h"
+#include "InstPrinter/XCoreInstPrinter.h"
+#include "XCoreMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "XCoreGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "XCoreGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "XCoreGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createXCoreMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitXCoreMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitXCoreMCRegisterInfo(X, XCore::LR);
+  return X;
+}
+
+static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                   StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
+  InitXCoreMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCAsmInfo *createXCoreMCAsmInfo(const Target &T, StringRef TT) {
+  MCAsmInfo *MAI = new XCoreMCAsmInfo(T, TT);
+
+  // Initial state of the frame pointer is SP.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(XCore::SP, 0);
+  MAI->addInitialFrameState(0, Dst, Src);
+
+  return MAI;
+}
+
+static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM,
+                                               CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+static MCInstPrinter *createXCoreMCInstPrinter(const Target &T,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI,
+                                               const MCSubtargetInfo &STI) {
+  return new XCoreInstPrinter(MAI, MII, MRI);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTargetMC() {
+  // Register the MC asm info.
+  RegisterMCAsmInfoFn X(TheXCoreTarget, createXCoreMCAsmInfo);
+
+  // Register the MC codegen info.
+  TargetRegistry::RegisterMCCodeGenInfo(TheXCoreTarget,
+                                        createXCoreMCCodeGenInfo);
+
+  // Register the MC instruction info.
+  TargetRegistry::RegisterMCInstrInfo(TheXCoreTarget, createXCoreMCInstrInfo);
+
+  // Register the MC register info.
+  TargetRegistry::RegisterMCRegInfo(TheXCoreTarget, createXCoreMCRegisterInfo);
+
+  // Register the MC subtarget info.
+  TargetRegistry::RegisterMCSubtargetInfo(TheXCoreTarget,
+                                          createXCoreMCSubtargetInfo);
+
+  // Register the MCInstPrinter
+  TargetRegistry::RegisterMCInstPrinter(TheXCoreTarget,
+                                        createXCoreMCInstPrinter);
+}
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
new file mode 100644
index 000000000000..a255adb2e0f2
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -0,0 +1,38 @@
+//===-- XCoreMCTargetDesc.h - XCore Target Descriptions ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides XCore specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREMCTARGETDESC_H
+#define XCOREMCTARGETDESC_H
+
+namespace llvm {
+class Target;
+
+extern Target TheXCoreTarget;
+
+} // End llvm namespace
+
+// Defines symbolic names for XCore registers.  This defines a mapping from
+// register name to register number.
+//
+#define GET_REGINFO_ENUM
+#include "XCoreGenRegisterInfo.inc"
+
+// Defines symbolic names for the XCore instructions.
+//
+#define GET_INSTRINFO_ENUM
+#include "XCoreGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "XCoreGenSubtargetInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
new file mode 100644
index 000000000000..00e34e04fbe5
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- XCoreTargetInfo.cpp - XCore Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheXCoreTarget;
+
+extern "C" void LLVMInitializeXCoreTargetInfo() { 
+  RegisterTarget<Triple::xcore> X(TheXCoreTarget, "xcore", "XCore");
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCore.h b/contrib/llvm/lib/Target/XCore/XCore.h
new file mode 100644
index 000000000000..08f091e5b870
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCore.h
@@ -0,0 +1,32 @@
+//===-- XCore.h - Top-level interface for XCore representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// XCore back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_XCORE_H
+#define TARGET_XCORE_H
+
+#include "MCTargetDesc/XCoreMCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class FunctionPass;
+  class TargetMachine;
+  class XCoreTargetMachine;
+  class formatted_raw_ostream;
+
+  FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM,
+                                   CodeGenOpt::Level OptLevel);
+
+} // end namespace llvm;
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCore.td b/contrib/llvm/lib/Target/XCore/XCore.td
new file mode 100644
index 000000000000..e9a6d88fd68e
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCore.td
@@ -0,0 +1,53 @@
+//===-- XCore.td - Describe the XCore Target Machine -------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Descriptions
+//===----------------------------------------------------------------------===//
+
+include "XCoreRegisterInfo.td"
+include "XCoreInstrInfo.td"
+include "XCoreCallingConv.td"
+
+def XCoreInstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// XCore processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",      []>;
+def : Proc<"xs1b-generic", []>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def XCoreAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  bit isMCAsmWriter = 1;
+}
+
+def XCore : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = XCoreInstrInfo;
+  let AssemblyWriters = [XCoreAsmWriter];
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
new file mode 100644
index 000000000000..0d146ba4d98d
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -0,0 +1,342 @@
+//===-- XCoreAsmPrinter.cpp - XCore LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the XAS-format XCore assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "XCore.h"
+#include "InstPrinter/XCoreInstPrinter.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMCInstLower.h"
+#include "XCoreSubtarget.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+static cl::opt<unsigned> MaxThreads("xcore-max-threads", cl::Optional,
+  cl::desc("Maximum number of threads (for emulation thread-local storage)"),
+  cl::Hidden,
+  cl::value_desc("number"),
+  cl::init(8));
+
+namespace {
+  class XCoreAsmPrinter : public AsmPrinter {
+    const XCoreSubtarget &Subtarget;
+    XCoreMCInstLower MCInstLowering;
+    void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+  public:
+    explicit XCoreAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()),
+        MCInstLowering(*this) {}
+
+    virtual const char *getPassName() const {
+      return "XCore Assembly Printer";
+    }
+
+    void printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
+                       const std::string &directive = ".jmptable");
+    void printInlineJT32(const MachineInstr *MI, int opNum, raw_ostream &O) {
+      printInlineJT(MI, opNum, O, ".jmptable32");
+    }
+    void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode,
+                         raw_ostream &O);
+
+    void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV);
+    virtual void EmitGlobalVariable(const GlobalVariable *GV);
+
+    void EmitFunctionEntryLabel();
+    void EmitInstruction(const MachineInstr *MI);
+    void EmitFunctionBodyStart();
+    void EmitFunctionBodyEnd();
+    virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+  };
+} // end of anonymous namespace
+
+void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
+  assert(((GV->hasExternalLinkage() ||
+    GV->hasWeakLinkage()) ||
+    GV->hasLinkOnceLinkage()) && "Unexpected linkage");
+  if (ArrayType *ATy = dyn_cast<ArrayType>(
+    cast<PointerType>(GV->getType())->getElementType())) {
+    OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
+    // FIXME: MCStreamerize.
+    OutStreamer.EmitRawText(StringRef(".globound"));
+    OutStreamer.EmitRawText("\t.set\t" + Twine(Sym->getName()));
+    OutStreamer.EmitRawText(".globound," + Twine(ATy->getNumElements()));
+    if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
+      // TODO Use COMDAT groups for LinkOnceLinkage
+      OutStreamer.EmitRawText(MAI->getWeakDefDirective() +Twine(Sym->getName())+
+                              ".globound");
+    }
+  }
+}
+
+void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  // Check to see if this is a special global used by LLVM, if so, emit it.
+  if (!GV->hasInitializer() ||
+      EmitSpecialLLVMGlobal(GV))
+    return;
+
+  const DataLayout *TD = TM.getDataLayout();
+  OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GV, Mang,TM));
+
+  
+  MCSymbol *GVSym = Mang->getSymbol(GV);
+  const Constant *C = GV->getInitializer();
+  unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
+  
+  // Mark the start of the global
+  OutStreamer.EmitRawText("\t.cc_top " + Twine(GVSym->getName()) + ".data," +
+                          GVSym->getName());
+
+  switch (GV->getLinkage()) {
+  case GlobalValue::AppendingLinkage:
+    report_fatal_error("AppendingLinkage is not supported by this target!");
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage:
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:
+  case GlobalValue::ExternalLinkage:
+    emitArrayBound(GVSym, GV);
+    OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
+
+    // TODO Use COMDAT groups for LinkOnceLinkage
+    if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage())
+      OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Weak);
+    // FALL THROUGH
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::PrivateLinkage:
+    break;
+  case GlobalValue::DLLImportLinkage:
+    llvm_unreachable("DLLImport linkage is not supported by this target!");
+  case GlobalValue::DLLExportLinkage:
+    llvm_unreachable("DLLExport linkage is not supported by this target!");
+  default:
+    llvm_unreachable("Unknown linkage type!");
+  }
+
+  EmitAlignment(Align > 2 ? Align : 2, GV);
+  
+  unsigned Size = TD->getTypeAllocSize(C->getType());
+  if (GV->isThreadLocal()) {
+    Size *= MaxThreads;
+  }
+  if (MAI->hasDotTypeDotSizeDirective()) {
+    OutStreamer.EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
+    OutStreamer.EmitRawText("\t.size " + Twine(GVSym->getName()) + "," +
+                            Twine(Size));
+  }
+  OutStreamer.EmitLabel(GVSym);
+  
+  EmitGlobalConstant(C);
+  if (GV->isThreadLocal()) {
+    for (unsigned i = 1; i < MaxThreads; ++i)
+      EmitGlobalConstant(C);
+  }
+  // The ABI requires that unsigned scalar types smaller than 32 bits
+  // are padded to 32 bits.
+  if (Size < 4)
+    OutStreamer.EmitZeros(4 - Size);
+  
+  // Mark the end of the global
+  OutStreamer.EmitRawText("\t.cc_bottom " + Twine(GVSym->getName()) + ".data");
+}
+
+void XCoreAsmPrinter::EmitFunctionBodyStart() {
+  MCInstLowering.Initialize(Mang, &MF->getContext());
+}
+
+/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
+/// the last basic block in the function.
+void XCoreAsmPrinter::EmitFunctionBodyEnd() {
+  // Emit function end directives
+  OutStreamer.EmitRawText("\t.cc_bottom " + Twine(CurrentFnSym->getName()) +
+                          ".function");
+}
+
+void XCoreAsmPrinter::EmitFunctionEntryLabel() {
+  // Mark the start of the function
+  OutStreamer.EmitRawText("\t.cc_top " + Twine(CurrentFnSym->getName()) +
+                          ".function," + CurrentFnSym->getName());
+  OutStreamer.EmitLabel(CurrentFnSym);
+}
+
+void XCoreAsmPrinter::
+printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O,
+              const std::string &directive) {
+  unsigned JTI = MI->getOperand(opNum).getIndex();
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+  O << "\t" << directive << " ";
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    if (i > 0)
+      O << ",";
+    O << *MBB->getSymbol();
+  }
+}
+
+void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << XCoreInstPrinter::getRegisterName(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    O << *Mang->getSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    break;
+  case MachineOperand::MO_BlockAddress:
+    O << *GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+  default:
+    llvm_unreachable("not implemented");
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,const char *ExtraCode,
+                                      raw_ostream &O) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0])
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    }
+
+  printOperand(MI, OpNo, O);
+  return false;
+}
+
+void XCoreAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                             raw_ostream &OS) {
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps == 4);
+  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+  OS << V.getName();
+  OS << " <- ";
+  // Frame address.  Currently handles register +- offset only.
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+  OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS);
+  OS << ']';
+  OS << "+";
+  printOperand(MI, NOps-2, OS);
+}
+
+MachineLocation XCoreAsmPrinter::
+getDebugValueLocation(const MachineInstr *MI) const {
+  // Handles frame addresses emitted in XCoreInstrInfo::emitFrameIndexDebugValue.
+  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
+         "Unexpected MachineOperand types");
+  return MachineLocation(MI->getOperand(0).getReg(),
+                         MI->getOperand(1).getImm());
+}
+
+void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  SmallString<128> Str;
+  raw_svector_ostream O(Str);
+
+  switch (MI->getOpcode()) {
+  case XCore::DBG_VALUE: {
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      SmallString<128> TmpStr;
+      raw_svector_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer.EmitRawText(StringRef(OS.str()));
+    }
+    return;
+  }
+  case XCore::ADD_2rus:
+    if (MI->getOperand(2).getImm() == 0) {
+      O << "\tmov "
+        << XCoreInstPrinter::getRegisterName(MI->getOperand(0).getReg()) << ", "
+        << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg());
+      OutStreamer.EmitRawText(O.str());
+      return;
+    }
+    break;
+  case XCore::BR_JT:
+  case XCore::BR_JT32:
+    O << "\tbru "
+      << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg()) << '\n';
+    if (MI->getOpcode() == XCore::BR_JT)
+      printInlineJT(MI, 0, O);
+    else
+      printInlineJT32(MI, 0, O);
+    O << '\n';
+    OutStreamer.EmitRawText(O.str());
+    return;
+  }
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreAsmPrinter() { 
+  RegisterAsmPrinter<XCoreAsmPrinter> X(TheXCoreTarget);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td b/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td
new file mode 100644
index 000000000000..b20d71f49cfd
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreCallingConv.td
@@ -0,0 +1,36 @@
+//===- XCoreCallingConv.td - Calling Conventions for XCore -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for XCore architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XCore Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_XCore : CallingConv<[
+  // i32 are returned in registers R0, R1, R2, R3
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// XCore Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_XCore : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R11.
+  CCIfNest<CCAssignToReg<[R11]>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
new file mode 100644
index 000000000000..beeb07f831c6
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -0,0 +1,423 @@
+//===-- XCoreFrameLowering.cpp - Frame info for XCore Target --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreFrameLowering.h"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+
+// helper functions. FIXME: Eliminate.
+static inline bool isImmUs(unsigned val) {
+  return val <= 11;
+}
+
+static inline bool isImmU6(unsigned val) {
+  return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+  return val < (1 << 16);
+}
+
+static void loadFromStack(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I,
+                          unsigned DstReg, int Offset, DebugLoc dl,
+                          const TargetInstrInfo &TII) {
+  assert(Offset%4 == 0 && "Misaligned stack offset");
+  Offset/=4;
+  bool isU6 = isImmU6(Offset);
+  if (!isU6 && !isImmU16(Offset))
+    report_fatal_error("loadFromStack offset too big " + Twine(Offset));
+  int Opcode = isU6 ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+  BuildMI(MBB, I, dl, TII.get(Opcode), DstReg)
+    .addImm(Offset);
+}
+
+
+static void storeToStack(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator I,
+                         unsigned SrcReg, int Offset, DebugLoc dl,
+                         const TargetInstrInfo &TII) {
+  assert(Offset%4 == 0 && "Misaligned stack offset");
+  Offset/=4;
+  bool isU6 = isImmU6(Offset);
+  if (!isU6 && !isImmU16(Offset))
+    report_fatal_error("storeToStack offset too big " + Twine(Offset));
+  int Opcode = isU6 ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+  BuildMI(MBB, I, dl, TII.get(Opcode))
+    .addReg(SrcReg)
+    .addImm(Offset);
+}
+
+
+//===----------------------------------------------------------------------===//
+// XCoreFrameLowering:
+//===----------------------------------------------------------------------===//
+
+XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti)
+  : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 4, 0) {
+  // Do nothing
+}
+
+bool XCoreFrameLowering::hasFP(const MachineFunction &MF) const {
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+    MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = &MF.getMMI();
+  const XCoreInstrInfo &TII =
+    *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  bool FP = hasFP(MF);
+  const AttributeSet &PAL = MF.getFunction()->getAttributes();
+
+  if (PAL.hasAttrSomewhere(Attribute::Nest))
+    loadFromStack(MBB, MBBI, XCore::R11, 0, dl, TII);
+
+  // Work out frame sizes.
+  int FrameSize = MFI->getStackSize();
+  assert(FrameSize%4 == 0 && "Misaligned frame size");
+  FrameSize/=4;
+
+  bool isU6 = isImmU6(FrameSize);
+
+  if (!isU6 && !isImmU16(FrameSize)) {
+    // FIXME could emit multiple instructions.
+    report_fatal_error("emitPrologue Frame size too big: " + Twine(FrameSize));
+  }
+  bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(MF);
+
+  // Do we need to allocate space on the stack?
+  if (FrameSize) {
+    bool saveLR = XFI->getUsesLR();
+    bool LRSavedOnEntry = false;
+    int Opcode;
+    if (saveLR && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0)) {
+      Opcode = (isU6) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
+      MBB.addLiveIn(XCore::LR);
+      saveLR = false;
+      LRSavedOnEntry = true;
+    } else {
+      Opcode = (isU6) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+    }
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
+
+    if (emitFrameMoves) {
+      std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+
+      // Show update of SP.
+      MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
+      BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
+
+      MachineLocation SPDst(MachineLocation::VirtualFP);
+      MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize * 4);
+      Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+
+      if (LRSavedOnEntry) {
+        MachineLocation CSDst(MachineLocation::VirtualFP, 0);
+        MachineLocation CSSrc(XCore::LR);
+        Moves.push_back(MachineMove(FrameLabel, CSDst, CSSrc));
+      }
+    }
+    if (saveLR) {
+      int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+      storeToStack(MBB, MBBI, XCore::LR, LRSpillOffset + FrameSize*4, dl, TII);
+      MBB.addLiveIn(XCore::LR);
+
+      if (emitFrameMoves) {
+        MCSymbol *SaveLRLabel = MMI->getContext().CreateTempSymbol();
+        BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLRLabel);
+        MachineLocation CSDst(MachineLocation::VirtualFP, LRSpillOffset);
+        MachineLocation CSSrc(XCore::LR);
+        MMI->getFrameMoves().push_back(MachineMove(SaveLRLabel, CSDst, CSSrc));
+      }
+    }
+  }
+
+  if (FP) {
+    // Save R10 to the stack.
+    int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+    storeToStack(MBB, MBBI, XCore::R10, FPSpillOffset + FrameSize*4, dl, TII);
+    // R10 is live-in. It is killed at the spill.
+    MBB.addLiveIn(XCore::R10);
+    if (emitFrameMoves) {
+      MCSymbol *SaveR10Label = MMI->getContext().CreateTempSymbol();
+      BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveR10Label);
+      MachineLocation CSDst(MachineLocation::VirtualFP, FPSpillOffset);
+      MachineLocation CSSrc(XCore::R10);
+      MMI->getFrameMoves().push_back(MachineMove(SaveR10Label, CSDst, CSSrc));
+    }
+    // Set the FP from the SP.
+    unsigned FramePtr = XCore::R10;
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr)
+      .addImm(0);
+    if (emitFrameMoves) {
+      // Show FP is now valid.
+      MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
+      BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
+      MachineLocation SPDst(FramePtr);
+      MachineLocation SPSrc(MachineLocation::VirtualFP);
+      MMI->getFrameMoves().push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+    }
+  }
+
+  if (emitFrameMoves) {
+    // Frame moves for callee saved.
+    std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+    std::vector<std::pair<MCSymbol*, CalleeSavedInfo> >&SpillLabels =
+        XFI->getSpillLabels();
+    for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
+      MCSymbol *SpillLabel = SpillLabels[I].first;
+      CalleeSavedInfo &CSI = SpillLabels[I].second;
+      int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
+      unsigned Reg = CSI.getReg();
+      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+      MachineLocation CSSrc(Reg);
+      Moves.push_back(MachineMove(SpillLabel, CSDst, CSSrc));
+    }
+  }
+}
+
+void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  MachineFrameInfo *MFI            = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  const XCoreInstrInfo &TII =
+    *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+  DebugLoc dl = MBBI->getDebugLoc();
+
+  bool FP = hasFP(MF);
+  if (FP) {
+    // Restore the stack pointer.
+    unsigned FramePtr = XCore::R10;
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r))
+      .addReg(FramePtr);
+  }
+
+  // Work out frame sizes.
+  int FrameSize = MFI->getStackSize();
+
+  assert(FrameSize%4 == 0 && "Misaligned frame size");
+
+  FrameSize/=4;
+
+  bool isU6 = isImmU6(FrameSize);
+
+  if (!isU6 && !isImmU16(FrameSize)) {
+    // FIXME could emit multiple instructions.
+    report_fatal_error("emitEpilogue Frame size too big: " + Twine(FrameSize));
+  }
+
+  if (FrameSize) {
+    XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+
+    if (FP) {
+      // Restore R10
+      int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+      FPSpillOffset += FrameSize*4;
+      loadFromStack(MBB, MBBI, XCore::R10, FPSpillOffset, dl, TII);
+    }
+    bool restoreLR = XFI->getUsesLR();
+    if (restoreLR && MFI->getObjectOffset(XFI->getLRSpillSlot()) != 0) {
+      int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+      LRSpillOffset += FrameSize*4;
+      loadFromStack(MBB, MBBI, XCore::LR, LRSpillOffset, dl, TII);
+      restoreLR = false;
+    }
+    if (restoreLR) {
+      // Fold prologue into return instruction
+      assert(MBBI->getOpcode() == XCore::RETSP_u6
+        || MBBI->getOpcode() == XCore::RETSP_lu6);
+      int Opcode = (isU6) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
+      BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
+      MBB.erase(MBBI);
+    } else {
+      int Opcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+      BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(FrameSize);
+    }
+  }
+}
+
+bool XCoreFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                          const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return true;
+
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+
+  XCoreFunctionInfo *XFI = MF->getInfo<XCoreFunctionInfo>();
+  bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
+
+  DebugLoc DL;
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+                                                    it != CSI.end(); ++it) {
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(it->getReg());
+
+    unsigned Reg = it->getReg();
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.storeRegToStackSlot(MBB, MI, Reg, true,
+                            it->getFrameIdx(), RC, TRI);
+    if (emitFrameMoves) {
+      MCSymbol *SaveLabel = MF->getContext().CreateTempSymbol();
+      BuildMI(MBB, MI, DL, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLabel);
+      XFI->getSpillLabels().push_back(std::make_pair(SaveLabel, *it));
+    }
+  }
+  return true;
+}
+
+bool XCoreFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                 MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                            const TargetRegisterInfo *TRI) const{
+  MachineFunction *MF = MBB.getParent();
+  const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+
+  bool AtStart = MI == MBB.begin();
+  MachineBasicBlock::iterator BeforeI = MI;
+  if (!AtStart)
+    --BeforeI;
+  for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+                                                    it != CSI.end(); ++it) {
+    unsigned Reg = it->getReg();
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    TII.loadRegFromStackSlot(MBB, MI, it->getReg(), it->getFrameIdx(),
+                             RC, TRI);
+    assert(MI != MBB.begin() &&
+           "loadRegFromStackSlot didn't insert any code!");
+    // Insert in reverse order.  loadRegFromStackSlot can insert multiple
+    // instructions.
+    if (AtStart)
+      MI = MBB.begin();
+    else {
+      MI = BeforeI;
+      ++MI;
+    }
+  }
+  return true;
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void XCoreFrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  const XCoreInstrInfo &TII =
+    *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+  if (!hasReservedCallFrame(MF)) {
+    // Turn the adjcallstackdown instruction into 'extsp <amt>' and the
+    // adjcallstackup instruction into 'ldaw sp, sp[<amt>]'
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      assert(Amount%4 == 0);
+      Amount /= 4;
+
+      bool isU6 = isImmU6(Amount);
+      if (!isU6 && !isImmU16(Amount)) {
+        // FIX could emit multiple instructions in this case.
+#ifndef NDEBUG
+        errs() << "eliminateCallFramePseudoInstr size too big: "
+               << Amount << "\n";
+#endif
+        llvm_unreachable(0);
+      }
+
+      MachineInstr *New;
+      if (Old->getOpcode() == XCore::ADJCALLSTACKDOWN) {
+        int Opcode = isU6 ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+        New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode))
+          .addImm(Amount);
+      } else {
+        assert(Old->getOpcode() == XCore::ADJCALLSTACKUP);
+        int Opcode = isU6 ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+        New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode), XCore::SP)
+          .addImm(Amount);
+      }
+
+      // Replace the pseudo instruction with a new instruction...
+      MBB.insert(I, New);
+    }
+  }
+  
+  MBB.erase(I);
+}
+
+void
+XCoreFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                     RegScavenger *RS) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR);
+  const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  if (LRUsed) {
+    MF.getRegInfo().setPhysRegUnused(XCore::LR);
+
+    bool isVarArg = MF.getFunction()->isVarArg();
+    int FrameIdx;
+    if (! isVarArg) {
+      // A fixed offset of 0 allows us to save / restore LR using entsp / retsp.
+      FrameIdx = MFI->CreateFixedObject(RC->getSize(), 0, true);
+    } else {
+      FrameIdx = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(),
+                                        false);
+    }
+    XFI->setUsesLR(FrameIdx);
+    XFI->setLRSpillSlot(FrameIdx);
+  }
+  if (RegInfo->requiresRegisterScavenging(MF)) {
+    // Reserve a slot close to SP or frame pointer.
+    RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                       RC->getAlignment(),
+                                                       false));
+  }
+  if (hasFP(MF)) {
+    // A callee save register is used to hold the FP.
+    // This needs saving / restoring in the epilogue / prologue.
+    XFI->setFPSpillSlot(MFI->CreateStackObject(RC->getSize(),
+                                               RC->getAlignment(),
+                                               false));
+  }
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
new file mode 100644
index 000000000000..ebad62f2fa53
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.h
@@ -0,0 +1,58 @@
+//===-- XCoreFrameLowering.h - Frame info for XCore Target ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREFRAMEINFO_H
+#define XCOREFRAMEINFO_H
+
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class XCoreSubtarget;
+
+  class XCoreFrameLowering: public TargetFrameLowering {
+  public:
+    XCoreFrameLowering(const XCoreSubtarget &STI);
+
+    /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+    /// the function.
+    void emitPrologue(MachineFunction &MF) const;
+    void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+    bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI,
+                                   const TargetRegisterInfo *TRI) const;
+    bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     const std::vector<CalleeSavedInfo> &CSI,
+                                     const TargetRegisterInfo *TRI) const;
+
+    void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I) const;
+
+    bool hasFP(const MachineFunction &MF) const;
+
+    void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                              RegScavenger *RS = NULL) const;
+
+    //! Stack slot size (4 bytes)
+    static int stackSlotSize() {
+      return 4;
+    }
+  };
+}
+
+#endif // XCOREFRAMEINFO_H
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
new file mode 100644
index 000000000000..fbf86c523054
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -0,0 +1,296 @@
+//===-- XCoreISelDAGToDAG.cpp - A dag to dag inst selector for XCore ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+/// XCoreDAGToDAGISel - XCore specific code to select XCore machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+  class XCoreDAGToDAGISel : public SelectionDAGISel {
+    const XCoreTargetLowering &Lowering;
+    const XCoreSubtarget &Subtarget;
+
+  public:
+    XCoreDAGToDAGISel(XCoreTargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel),
+        Lowering(*TM.getTargetLowering()), 
+        Subtarget(*TM.getSubtargetImpl()) { }
+
+    SDNode *Select(SDNode *N);
+    SDNode *SelectBRIND(SDNode *N);
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    inline bool immMskBitp(SDNode *inN) const {
+      ConstantSDNode *N = cast<ConstantSDNode>(inN);
+      uint32_t value = (uint32_t)N->getZExtValue();
+      if (!isMask_32(value)) {
+        return false;
+      }
+      int msksize = 32 - CountLeadingZeros_32(value);
+      return (msksize >= 1 && msksize <= 8) ||
+              msksize == 16 || msksize == 24 || msksize == 32;
+    }
+
+    // Complex Pattern Selectors.
+    bool SelectADDRspii(SDValue Addr, SDValue &Base, SDValue &Offset);
+    bool SelectADDRdpii(SDValue Addr, SDValue &Base, SDValue &Offset);
+    bool SelectADDRcpii(SDValue Addr, SDValue &Base, SDValue &Offset);
+    
+    virtual const char *getPassName() const {
+      return "XCore DAG->DAG Pattern Instruction Selection";
+    } 
+    
+    // Include the pieces autogenerated from the target description.
+  #include "XCoreGenDAGISel.inc"
+  };
+}  // end anonymous namespace
+
+/// createXCoreISelDag - This pass converts a legalized DAG into a 
+/// XCore-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM,
+                                       CodeGenOpt::Level OptLevel) {
+  return new XCoreDAGToDAGISel(TM, OptLevel);
+}
+
+bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) {
+  FrameIndexSDNode *FIN = 0;
+  if ((FIN = dyn_cast<FrameIndexSDNode>(Addr))) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    ConstantSDNode *CN = 0;
+    if ((FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+      && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
+      // Constant positive word offset from frame index
+      Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool XCoreDAGToDAGISel::SelectADDRdpii(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) {
+  if (Addr.getOpcode() == XCoreISD::DPRelativeWrapper) {
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    ConstantSDNode *CN = 0;
+    if ((Addr.getOperand(0).getOpcode() == XCoreISD::DPRelativeWrapper)
+      && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
+      // Constant word offset from a object in the data region
+      Base = Addr.getOperand(0).getOperand(0);
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool XCoreDAGToDAGISel::SelectADDRcpii(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) {
+  if (Addr.getOpcode() == XCoreISD::CPRelativeWrapper) {
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    ConstantSDNode *CN = 0;
+    if ((Addr.getOperand(0).getOpcode() == XCoreISD::CPRelativeWrapper)
+      && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
+      // Constant word offset from a object in the data region
+      Base = Addr.getOperand(0).getOperand(0);
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+      return true;
+    }
+  }
+  return false;
+}
+
+SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue();
+    if (immMskBitp(N)) {
+      // Transformation function: get the size of a mask
+      // Look for the first non-zero bit
+      SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(Val));
+      return CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
+                                    MVT::i32, MskSize);
+    }
+    else if (!isUInt<16>(Val)) {
+      SDValue CPIdx =
+        CurDAG->getTargetConstantPool(ConstantInt::get(
+                              Type::getInt32Ty(*CurDAG->getContext()), Val),
+                                      TLI.getPointerTy());
+      SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
+                                            MVT::Other, CPIdx,
+                                            CurDAG->getEntryNode());
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = MF->getMachineMemOperand(
+        MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, 4, 4);      
+      cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1);
+      return node;
+    }
+    break;
+  }
+  case XCoreISD::LADD: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        N->getOperand(2) };
+    return CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32,
+                                  Ops, 3);
+  }
+  case XCoreISD::LSUB: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        N->getOperand(2) };
+    return CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32,
+                                  Ops, 3);
+  }
+  case XCoreISD::MACCU: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3) };
+    return CurDAG->getMachineNode(XCore::MACCU_l4r, dl, MVT::i32, MVT::i32,
+                                  Ops, 4);
+  }
+  case XCoreISD::MACCS: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3) };
+    return CurDAG->getMachineNode(XCore::MACCS_l4r, dl, MVT::i32, MVT::i32,
+                                  Ops, 4);
+  }
+  case XCoreISD::LMUL: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3) };
+    return CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32, MVT::i32,
+                                  Ops, 4);
+  }
+  case XCoreISD::CRC8: {
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+    return CurDAG->getMachineNode(XCore::CRC8_l4r, dl, MVT::i32, MVT::i32,
+                                  Ops, 3);
+  }
+  case ISD::BRIND:
+    if (SDNode *ResNode = SelectBRIND(N))
+      return ResNode;
+    break;
+  // Other cases are autogenerated.
+  }
+  return SelectCode(N);
+}
+
+/// Given a chain return a new chain where any appearance of Old is replaced
+/// by New. There must be at most one instruction between Old and Chain and
+/// this instruction must be a TokenFactor. Returns an empty SDValue if 
+/// these conditions don't hold.
+static SDValue
+replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New)
+{
+  if (Chain == Old)
+    return New;
+  if (Chain->getOpcode() != ISD::TokenFactor)
+    return SDValue();
+  SmallVector<SDValue, 8> Ops;
+  bool found = false;
+  for (unsigned i = 0, e = Chain->getNumOperands(); i != e; ++i) {
+    if (Chain->getOperand(i) == Old) {
+      Ops.push_back(New);
+      found = true;
+    } else {
+      Ops.push_back(Chain->getOperand(i));
+    }
+  }
+  if (!found)
+    return SDValue();
+  return CurDAG->getNode(ISD::TokenFactor, Chain->getDebugLoc(), MVT::Other,
+                         &Ops[0], Ops.size());
+}
+
+SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  // (brind (int_xcore_checkevent (addr)))
+  SDValue Chain = N->getOperand(0);
+  SDValue Addr = N->getOperand(1);
+  if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return 0;
+  unsigned IntNo = cast<ConstantSDNode>(Addr->getOperand(1))->getZExtValue();
+  if (IntNo != Intrinsic::xcore_checkevent)
+    return 0;
+  SDValue nextAddr = Addr->getOperand(2);
+  SDValue CheckEventChainOut(Addr.getNode(), 1);
+  if (!CheckEventChainOut.use_empty()) {
+    // If the chain out of the checkevent intrinsic is an operand of the
+    // indirect branch or used in a TokenFactor which is the operand of the
+    // indirect branch then build a new chain which uses the chain coming into
+    // the checkevent intrinsic instead.
+    SDValue CheckEventChainIn = Addr->getOperand(0);
+    SDValue NewChain = replaceInChain(CurDAG, Chain, CheckEventChainOut,
+                                      CheckEventChainIn);
+    if (!NewChain.getNode())
+      return 0;
+    Chain = NewChain;
+  }
+  // Enable events on the thread using setsr 1 and then disable them immediately
+  // after with clrsr 1. If any resources owned by the thread are ready an event
+  // will be taken. If no resource is ready we branch to the address which was
+  // the operand to the checkevent intrinsic.
+  SDValue constOne = getI32Imm(1);
+  SDValue Glue =
+    SDValue(CurDAG->getMachineNode(XCore::SETSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Chain), 0);
+  Glue =
+    SDValue(CurDAG->getMachineNode(XCore::CLRSR_branch_u6, dl, MVT::Glue,
+                                   constOne, Glue), 0);
+  if (nextAddr->getOpcode() == XCoreISD::PCRelativeWrapper &&
+      nextAddr->getOperand(0)->getOpcode() == ISD::TargetBlockAddress) {
+    return CurDAG->SelectNodeTo(N, XCore::BRFU_lu6, MVT::Other,
+                                nextAddr->getOperand(0), Glue);
+  }
+  return CurDAG->SelectNodeTo(N, XCore::BAU_1r, MVT::Other, nextAddr, Glue);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
new file mode 100644
index 000000000000..a5d2be88db7d
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -0,0 +1,1644 @@
+//===-- XCoreISelLowering.cpp - XCore DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "xcore-lower"
+
+#include "XCoreISelLowering.h"
+#include "XCore.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreSubtarget.h"
+#include "XCoreTargetMachine.h"
+#include "XCoreTargetObjectFile.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+const char *XCoreTargetLowering::
+getTargetNodeName(unsigned Opcode) const
+{
+  switch (Opcode)
+  {
+    case XCoreISD::BL                : return "XCoreISD::BL";
+    case XCoreISD::PCRelativeWrapper : return "XCoreISD::PCRelativeWrapper";
+    case XCoreISD::DPRelativeWrapper : return "XCoreISD::DPRelativeWrapper";
+    case XCoreISD::CPRelativeWrapper : return "XCoreISD::CPRelativeWrapper";
+    case XCoreISD::STWSP             : return "XCoreISD::STWSP";
+    case XCoreISD::RETSP             : return "XCoreISD::RETSP";
+    case XCoreISD::LADD              : return "XCoreISD::LADD";
+    case XCoreISD::LSUB              : return "XCoreISD::LSUB";
+    case XCoreISD::LMUL              : return "XCoreISD::LMUL";
+    case XCoreISD::MACCU             : return "XCoreISD::MACCU";
+    case XCoreISD::MACCS             : return "XCoreISD::MACCS";
+    case XCoreISD::CRC8              : return "XCoreISD::CRC8";
+    case XCoreISD::BR_JT             : return "XCoreISD::BR_JT";
+    case XCoreISD::BR_JT32           : return "XCoreISD::BR_JT32";
+    default                          : return NULL;
+  }
+}
+
+XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
+  : TargetLowering(XTM, new XCoreTargetObjectFile()),
+    TM(XTM),
+    Subtarget(*XTM.getSubtargetImpl()) {
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &XCore::GRRegsRegClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+
+  // Division is expensive
+  setIntDivIsCheap(false);
+
+  setStackPointerRegisterToSaveRestore(XCore::SP);
+
+  setSchedulingPreference(Sched::RegPressure);
+
+  // Use i32 for setcc operations results (slt, sgt, ...).
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
+
+  // XCore does not have the NodeTypes below.
+  setOperationAction(ISD::BR_CC,     MVT::i32,   Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32,   Custom);
+  setOperationAction(ISD::ADDC, MVT::i32, Expand);
+  setOperationAction(ISD::ADDE, MVT::i32, Expand);
+  setOperationAction(ISD::SUBC, MVT::i32, Expand);
+  setOperationAction(ISD::SUBE, MVT::i32, Expand);
+
+  // Stop the combiner recombining select and set_cc
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+
+  // 64bit
+  setOperationAction(ISD::ADD, MVT::i64, Custom);
+  setOperationAction(ISD::SUB, MVT::i64, Custom);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
+  setOperationAction(ISD::MULHS, MVT::i32, Expand);
+  setOperationAction(ISD::MULHU, MVT::i32, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  // Bit Manipulation
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // Jump tables.
+  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32 , Custom);
+
+  // Thread Local Storage
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+
+  // Conversion of i64 -> double produces constantpool nodes
+  setOperationAction(ISD::ConstantPool, MVT::i32,   Custom);
+
+  // Loads
+  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
+
+  // Custom expand misaligned loads / stores.
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+
+  // Varargs
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+
+  // Dynamic stack
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+  // TRAMPOLINE is custom lowered.
+  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 4;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize
+    = MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 2;
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::ADD);
+
+  setMinFunctionAlignment(1);
+}
+
+SDValue XCoreTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode())
+  {
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::BR_JT:              return LowerBR_JT(Op, DAG);
+  case ISD::LOAD:               return LowerLOAD(Op, DAG);
+  case ISD::STORE:              return LowerSTORE(Op, DAG);
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::SMUL_LOHI:          return LowerSMUL_LOHI(Op, DAG);
+  case ISD::UMUL_LOHI:          return LowerUMUL_LOHI(Op, DAG);
+  // FIXME: Remove these when LegalizeDAGTypes lands.
+  case ISD::ADD:
+  case ISD::SUB:                return ExpandADDSUB(Op.getNode(), DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+  }
+}
+
+/// ReplaceNodeResults - Replace the results of node with an illegal result
+/// type with new values built out of custom code.
+void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>&Results,
+                                             SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this!");
+  case ISD::ADD:
+  case ISD::SUB:
+    Results.push_back(ExpandADDSUB(N, DAG));
+    return;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::
+LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i32, Op.getOperand(2),
+                             Op.getOperand(3), Op.getOperand(4));
+  return DAG.getNode(ISD::SELECT, dl, MVT::i32, Cond, Op.getOperand(0),
+                     Op.getOperand(1));
+}
+
+SDValue XCoreTargetLowering::
+getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
+                        SelectionDAG &DAG) const
+{
+  // FIXME there is no actual debug info here
+  DebugLoc dl = GA.getDebugLoc();
+  const GlobalValue *UnderlyingGV = GV;
+  // If GV is an alias then use the aliasee to determine the wrapper type
+  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+    UnderlyingGV = GA->resolveAliasedGlobal();
+  if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(UnderlyingGV)) {
+    if (GVar->isConstant())
+      return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
+    return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
+  }
+  return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
+}
+
+SDValue XCoreTargetLowering::
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), MVT::i32);
+  return getGlobalAddressWrapper(GA, GV, DAG);
+}
+
+static inline SDValue BuildGetId(SelectionDAG &DAG, DebugLoc dl) {
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
+                     DAG.getConstant(Intrinsic::xcore_getid, MVT::i32));
+}
+
+static inline bool isZeroLengthArray(Type *Ty) {
+  ArrayType *AT = dyn_cast_or_null<ArrayType>(Ty);
+  return AT && (AT->getNumElements() == 0);
+}
+
+SDValue XCoreTargetLowering::
+LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  // FIXME there isn't really debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  // transform to label + getid() * size
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar) {
+    // If GV is an alias then use the aliasee to determine size
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal());
+  }
+  if (!GVar) {
+    llvm_unreachable("Thread local object not a GlobalVariable?");
+  }
+  Type *Ty = cast<PointerType>(GV->getType())->getElementType();
+  if (!Ty->isSized() || isZeroLengthArray(Ty)) {
+#ifndef NDEBUG
+    errs() << "Size of thread local object " << GVar->getName()
+           << " is unknown\n";
+#endif
+    llvm_unreachable(0);
+  }
+  SDValue base = getGlobalAddressWrapper(GA, GV, DAG);
+  const DataLayout *TD = TM.getDataLayout();
+  unsigned Size = TD->getTypeAllocSize(Ty);
+  SDValue offset = DAG.getNode(ISD::MUL, dl, MVT::i32, BuildGetId(DAG, dl),
+                       DAG.getConstant(Size, MVT::i32));
+  return DAG.getNode(ISD::ADD, dl, MVT::i32, base, offset);
+}
+
+SDValue XCoreTargetLowering::
+LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
+
+  return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, getPointerTy(), Result);
+}
+
+SDValue XCoreTargetLowering::
+LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  // FIXME there isn't really debug info here
+  DebugLoc dl = CP->getDebugLoc();
+  EVT PtrVT = Op.getValueType();
+  SDValue Res;
+  if (CP->isMachineConstantPoolEntry()) {
+    Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                    CP->getAlignment());
+  } else {
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                    CP->getAlignment());
+  }
+  return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, Res);
+}
+
+unsigned XCoreTargetLowering::getJumpTableEncoding() const {
+  return MachineJumpTableInfo::EK_Inline;
+}
+
+SDValue XCoreTargetLowering::
+LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  unsigned JTI = JT->getIndex();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
+
+  unsigned NumEntries = MJTI->getJumpTables()[JTI].MBBs.size();
+  if (NumEntries <= 32) {
+    return DAG.getNode(XCoreISD::BR_JT, dl, MVT::Other, Chain, TargetJT, Index);
+  }
+  assert((NumEntries >> 31) == 0);
+  SDValue ScaledIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index,
+                                    DAG.getConstant(1, MVT::i32));
+  return DAG.getNode(XCoreISD::BR_JT32, dl, MVT::Other, Chain, TargetJT,
+                     ScaledIndex);
+}
+
+static bool
+IsWordAlignedBasePlusConstantOffset(SDValue Addr, SDValue &AlignedBase,
+                                    int64_t &Offset)
+{
+  if (Addr.getOpcode() != ISD::ADD) {
+    return false;
+  }
+  ConstantSDNode *CN = 0;
+  if (!(CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+    return false;
+  }
+  int64_t off = CN->getSExtValue();
+  const SDValue &Base = Addr.getOperand(0);
+  const SDValue *Root = &Base;
+  if (Base.getOpcode() == ISD::ADD &&
+      Base.getOperand(1).getOpcode() == ISD::SHL) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Base.getOperand(1)
+                                                      .getOperand(1));
+    if (CN && (CN->getSExtValue() >= 2)) {
+      Root = &Base.getOperand(0);
+    }
+  }
+  if (isa<FrameIndexSDNode>(*Root)) {
+    // All frame indicies are word aligned
+    AlignedBase = Base;
+    Offset = off;
+    return true;
+  }
+  if (Root->getOpcode() == XCoreISD::DPRelativeWrapper ||
+      Root->getOpcode() == XCoreISD::CPRelativeWrapper) {
+    // All dp / cp relative addresses are word aligned
+    AlignedBase = Base;
+    Offset = off;
+    return true;
+  }
+  // Check for an aligned global variable.
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(*Root)) {
+    const GlobalValue *GV = GA->getGlobal();
+    if (GA->getOffset() == 0 && GV->getAlignment() >= 4) {
+      AlignedBase = Base;
+      Offset = off;
+      return true;
+    }
+  }
+  return false;
+}
+
+SDValue XCoreTargetLowering::
+LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Unexpected extension type");
+  assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
+  if (allowsUnalignedMemoryAccesses(LD->getMemoryVT()))
+    return SDValue();
+
+  unsigned ABIAlignment = getDataLayout()->
+    getABITypeAlignment(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  // Leave aligned load alone.
+  if (LD->getAlignment() >= ABIAlignment)
+    return SDValue();
+
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  DebugLoc DL = Op.getDebugLoc();
+
+  SDValue Base;
+  int64_t Offset;
+  if (!LD->isVolatile() &&
+      IsWordAlignedBasePlusConstantOffset(BasePtr, Base, Offset)) {
+    if (Offset % 4 == 0) {
+      // We've managed to infer better alignment information than the load
+      // already has. Use an aligned load.
+      //
+      return DAG.getLoad(getPointerTy(), DL, Chain, BasePtr,
+                         MachinePointerInfo(),
+                         false, false, false, 0);
+    }
+    // Lower to
+    // ldw low, base[offset >> 2]
+    // ldw high, base[(offset >> 2) + 1]
+    // shr low_shifted, low, (offset & 0x3) * 8
+    // shl high_shifted, high, 32 - (offset & 0x3) * 8
+    // or result, low_shifted, high_shifted
+    SDValue LowOffset = DAG.getConstant(Offset & ~0x3, MVT::i32);
+    SDValue HighOffset = DAG.getConstant((Offset & ~0x3) + 4, MVT::i32);
+    SDValue LowShift = DAG.getConstant((Offset & 0x3) * 8, MVT::i32);
+    SDValue HighShift = DAG.getConstant(32 - (Offset & 0x3) * 8, MVT::i32);
+
+    SDValue LowAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base, LowOffset);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, Base, HighOffset);
+
+    SDValue Low = DAG.getLoad(getPointerTy(), DL, Chain,
+                              LowAddr, MachinePointerInfo(),
+                              false, false, false, 0);
+    SDValue High = DAG.getLoad(getPointerTy(), DL, Chain,
+                               HighAddr, MachinePointerInfo(),
+                               false, false, false, 0);
+    SDValue LowShifted = DAG.getNode(ISD::SRL, DL, MVT::i32, Low, LowShift);
+    SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High, HighShift);
+    SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, LowShifted, HighShifted);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
+                             High.getValue(1));
+    SDValue Ops[] = { Result, Chain };
+    return DAG.getMergeValues(Ops, 2, DL);
+  }
+
+  if (LD->getAlignment() == 2) {
+    SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, DL, MVT::i32, Chain,
+                                 BasePtr, LD->getPointerInfo(), MVT::i16,
+                                 LD->isVolatile(), LD->isNonTemporal(), 2);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, MVT::i32));
+    SDValue High = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+                                  HighAddr,
+                                  LD->getPointerInfo().getWithOffset(2),
+                                  MVT::i16, LD->isVolatile(),
+                                  LD->isNonTemporal(), 2);
+    SDValue HighShifted = DAG.getNode(ISD::SHL, DL, MVT::i32, High,
+                                      DAG.getConstant(16, MVT::i32));
+    SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i32, Low, HighShifted);
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
+                             High.getValue(1));
+    SDValue Ops[] = { Result, Chain };
+    return DAG.getMergeValues(Ops, 2, DL);
+  }
+
+  // Lower to a call to __misaligned_load(BasePtr).
+  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = BasePtr;
+  Args.push_back(Entry);
+
+  TargetLowering::CallLoweringInfo CLI(Chain, IntPtrTy, false, false,
+                    false, false, 0, CallingConv::C, /*isTailCall=*/false,
+                    /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
+                    DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
+                    Args, DAG, DL);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  SDValue Ops[] =
+    { CallResult.first, CallResult.second };
+
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
+SDValue XCoreTargetLowering::
+LowerSTORE(SDValue Op, SelectionDAG &DAG) const
+{
+  StoreSDNode *ST = cast<StoreSDNode>(Op);
+  assert(!ST->isTruncatingStore() && "Unexpected store type");
+  assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
+  if (allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+    return SDValue();
+  }
+  unsigned ABIAlignment = getDataLayout()->
+    getABITypeAlignment(ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  // Leave aligned store alone.
+  if (ST->getAlignment() >= ABIAlignment) {
+    return SDValue();
+  }
+  SDValue Chain = ST->getChain();
+  SDValue BasePtr = ST->getBasePtr();
+  SDValue Value = ST->getValue();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (ST->getAlignment() == 2) {
+    SDValue Low = Value;
+    SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
+                                      DAG.getConstant(16, MVT::i32));
+    SDValue StoreLow = DAG.getTruncStore(Chain, dl, Low, BasePtr,
+                                         ST->getPointerInfo(), MVT::i16,
+                                         ST->isVolatile(), ST->isNonTemporal(),
+                                         2);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, MVT::i32));
+    SDValue StoreHigh = DAG.getTruncStore(Chain, dl, High, HighAddr,
+                                          ST->getPointerInfo().getWithOffset(2),
+                                          MVT::i16, ST->isVolatile(),
+                                          ST->isNonTemporal(), 2);
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh);
+  }
+
+  // Lower to a call to __misaligned_store(BasePtr, Value).
+  Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = BasePtr;
+  Args.push_back(Entry);
+
+  Entry.Node = Value;
+  Args.push_back(Entry);
+
+  TargetLowering::CallLoweringInfo CLI(Chain,
+                    Type::getVoidTy(*DAG.getContext()), false, false,
+                    false, false, 0, CallingConv::C, /*isTailCall=*/false,
+                    /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
+                    DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
+                    Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  return CallResult.second;
+}
+
+SDValue XCoreTargetLowering::
+LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::SMUL_LOHI &&
+         "Unexpected operand to lower!");
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue Hi = DAG.getNode(XCoreISD::MACCS, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), Zero, Zero,
+                           LHS, RHS);
+  SDValue Lo(Hi.getNode(), 1);
+  SDValue Ops[] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue XCoreTargetLowering::
+LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
+{
+  assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::UMUL_LOHI &&
+         "Unexpected operand to lower!");
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue Hi = DAG.getNode(XCoreISD::LMUL, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), LHS, RHS,
+                           Zero, Zero);
+  SDValue Lo(Hi.getNode(), 1);
+  SDValue Ops[] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+/// isADDADDMUL - Return whether Op is in a form that is equivalent to
+/// add(add(mul(x,y),a),b). If requireIntermediatesHaveOneUse is true then
+/// each intermediate result in the calculation must also have a single use.
+/// If the Op is in the correct form the constituent parts are written to Mul0,
+/// Mul1, Addend0 and Addend1.
+static bool
+isADDADDMUL(SDValue Op, SDValue &Mul0, SDValue &Mul1, SDValue &Addend0,
+            SDValue &Addend1, bool requireIntermediatesHaveOneUse)
+{
+  if (Op.getOpcode() != ISD::ADD)
+    return false;
+  SDValue N0 = Op.getOperand(0);
+  SDValue N1 = Op.getOperand(1);
+  SDValue AddOp;
+  SDValue OtherOp;
+  if (N0.getOpcode() == ISD::ADD) {
+    AddOp = N0;
+    OtherOp = N1;
+  } else if (N1.getOpcode() == ISD::ADD) {
+    AddOp = N1;
+    OtherOp = N0;
+  } else {
+    return false;
+  }
+  if (requireIntermediatesHaveOneUse && !AddOp.hasOneUse())
+    return false;
+  if (OtherOp.getOpcode() == ISD::MUL) {
+    // add(add(a,b),mul(x,y))
+    if (requireIntermediatesHaveOneUse && !OtherOp.hasOneUse())
+      return false;
+    Mul0 = OtherOp.getOperand(0);
+    Mul1 = OtherOp.getOperand(1);
+    Addend0 = AddOp.getOperand(0);
+    Addend1 = AddOp.getOperand(1);
+    return true;
+  }
+  if (AddOp.getOperand(0).getOpcode() == ISD::MUL) {
+    // add(add(mul(x,y),a),b)
+    if (requireIntermediatesHaveOneUse && !AddOp.getOperand(0).hasOneUse())
+      return false;
+    Mul0 = AddOp.getOperand(0).getOperand(0);
+    Mul1 = AddOp.getOperand(0).getOperand(1);
+    Addend0 = AddOp.getOperand(1);
+    Addend1 = OtherOp;
+    return true;
+  }
+  if (AddOp.getOperand(1).getOpcode() == ISD::MUL) {
+    // add(add(a,mul(x,y)),b)
+    if (requireIntermediatesHaveOneUse && !AddOp.getOperand(1).hasOneUse())
+      return false;
+    Mul0 = AddOp.getOperand(1).getOperand(0);
+    Mul1 = AddOp.getOperand(1).getOperand(1);
+    Addend0 = AddOp.getOperand(0);
+    Addend1 = OtherOp;
+    return true;
+  }
+  return false;
+}
+
+SDValue XCoreTargetLowering::
+TryExpandADDWithMul(SDNode *N, SelectionDAG &DAG) const
+{
+  SDValue Mul;
+  SDValue Other;
+  if (N->getOperand(0).getOpcode() == ISD::MUL) {
+    Mul = N->getOperand(0);
+    Other = N->getOperand(1);
+  } else if (N->getOperand(1).getOpcode() == ISD::MUL) {
+    Mul = N->getOperand(1);
+    Other = N->getOperand(0);
+  } else {
+    return SDValue();
+  }
+  DebugLoc dl = N->getDebugLoc();
+  SDValue LL, RL, AddendL, AddendH;
+  LL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(0),  DAG.getConstant(0, MVT::i32));
+  RL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(1),  DAG.getConstant(0, MVT::i32));
+  AddendL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                        Other,  DAG.getConstant(0, MVT::i32));
+  AddendH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                        Other,  DAG.getConstant(1, MVT::i32));
+  APInt HighMask = APInt::getHighBitsSet(64, 32);
+  unsigned LHSSB = DAG.ComputeNumSignBits(Mul.getOperand(0));
+  unsigned RHSSB = DAG.ComputeNumSignBits(Mul.getOperand(1));
+  if (DAG.MaskedValueIsZero(Mul.getOperand(0), HighMask) &&
+      DAG.MaskedValueIsZero(Mul.getOperand(1), HighMask)) {
+    // The inputs are both zero-extended.
+    SDValue Hi = DAG.getNode(XCoreISD::MACCU, dl,
+                             DAG.getVTList(MVT::i32, MVT::i32), AddendH,
+                             AddendL, LL, RL);
+    SDValue Lo(Hi.getNode(), 1);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+  }
+  if (LHSSB > 32 && RHSSB > 32) {
+    // The inputs are both sign-extended.
+    SDValue Hi = DAG.getNode(XCoreISD::MACCS, dl,
+                             DAG.getVTList(MVT::i32, MVT::i32), AddendH,
+                             AddendL, LL, RL);
+    SDValue Lo(Hi.getNode(), 1);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+  }
+  SDValue LH, RH;
+  LH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(0),  DAG.getConstant(1, MVT::i32));
+  RH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                   Mul.getOperand(1),  DAG.getConstant(1, MVT::i32));
+  SDValue Hi = DAG.getNode(XCoreISD::MACCU, dl,
+                           DAG.getVTList(MVT::i32, MVT::i32), AddendH,
+                           AddendL, LL, RL);
+  SDValue Lo(Hi.getNode(), 1);
+  RH = DAG.getNode(ISD::MUL, dl, MVT::i32, LL, RH);
+  LH = DAG.getNode(ISD::MUL, dl, MVT::i32, LH, RL);
+  Hi = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, RH);
+  Hi = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, LH);
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+SDValue XCoreTargetLowering::
+ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const
+{
+  assert(N->getValueType(0) == MVT::i64 &&
+         (N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+        "Unknown operand to lower!");
+
+  if (N->getOpcode() == ISD::ADD) {
+    SDValue Result = TryExpandADDWithMul(N, DAG);
+    if (Result.getNode() != 0)
+      return Result;
+  }
+
+  DebugLoc dl = N->getDebugLoc();
+
+  // Extract components
+  SDValue LHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            N->getOperand(0),  DAG.getConstant(0, MVT::i32));
+  SDValue LHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            N->getOperand(0),  DAG.getConstant(1, MVT::i32));
+  SDValue RHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1), DAG.getConstant(0, MVT::i32));
+  SDValue RHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1), DAG.getConstant(1, MVT::i32));
+
+  // Expand
+  unsigned Opcode = (N->getOpcode() == ISD::ADD) ? XCoreISD::LADD :
+                                                   XCoreISD::LSUB;
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                           LHSL, RHSL, Zero);
+  SDValue Carry(Lo.getNode(), 1);
+
+  SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                           LHSH, RHSH, Carry);
+  SDValue Ignored(Hi.getNode(), 1);
+  // Merge the pieces
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+SDValue XCoreTargetLowering::
+LowerVAARG(SDValue Op, SelectionDAG &DAG) const
+{
+  llvm_unreachable("unimplemented");
+  // FIXME Arguments passed by reference need a extra dereference.
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  EVT VT = Node->getValueType(0);
+  SDValue VAList = DAG.getLoad(getPointerTy(), dl, Node->getOperand(0),
+                               Node->getOperand(1), MachinePointerInfo(V),
+                               false, false, false, 0);
+  // Increment the pointer, VAList, to the next vararg
+  SDValue Tmp3 = DAG.getNode(ISD::ADD, dl, getPointerTy(), VAList,
+                     DAG.getConstant(VT.getSizeInBits(),
+                                     getPointerTy()));
+  // Store the incremented VAList to the legalized pointer
+  Tmp3 = DAG.getStore(VAList.getValue(1), dl, Tmp3, Node->getOperand(1),
+                      MachinePointerInfo(V), false, false, 0);
+  // Load the actual argument out of the pointer VAList
+  return DAG.getLoad(VT, dl, Tmp3, VAList, MachinePointerInfo(),
+                     false, false, false, 0);
+}
+
+SDValue XCoreTargetLowering::
+LowerVASTART(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc dl = Op.getDebugLoc();
+  // vastart stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument
+  MachineFunction &MF = DAG.getMachineFunction();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  SDValue Addr = DAG.getFrameIndex(XFI->getVarArgsFrameIndex(), MVT::i32);
+  return DAG.getStore(Op.getOperand(0), dl, Addr, Op.getOperand(1),
+                      MachinePointerInfo(), false, false, 0);
+}
+
+SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  DebugLoc dl = Op.getDebugLoc();
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+    return SDValue();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo();
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                            RegInfo->getFrameRegister(MF), MVT::i32);
+}
+
+SDValue XCoreTargetLowering::
+LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+  return Op.getOperand(0);
+}
+
+SDValue XCoreTargetLowering::
+LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  // .align 4
+  // LDAPF_u10 r11, nest
+  // LDW_2rus r11, r11[0]
+  // STWSP_ru6 r11, sp[0]
+  // LDAPF_u10 r11, fptr
+  // LDW_2rus r11, r11[0]
+  // BAU_1r r11
+  // nest:
+  // .word nest
+  // fptr:
+  // .word fptr
+  SDValue OutChains[5];
+
+  SDValue Addr = Trmp;
+
+  DebugLoc dl = Op.getDebugLoc();
+  OutChains[0] = DAG.getStore(Chain, dl, DAG.getConstant(0x0a3cd805, MVT::i32),
+                              Addr, MachinePointerInfo(TrmpAddr), false, false,
+                              0);
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(4, MVT::i32));
+  OutChains[1] = DAG.getStore(Chain, dl, DAG.getConstant(0xd80456c0, MVT::i32),
+                              Addr, MachinePointerInfo(TrmpAddr, 4), false,
+                              false, 0);
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(8, MVT::i32));
+  OutChains[2] = DAG.getStore(Chain, dl, DAG.getConstant(0x27fb0a3c, MVT::i32),
+                              Addr, MachinePointerInfo(TrmpAddr, 8), false,
+                              false, 0);
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(12, MVT::i32));
+  OutChains[3] = DAG.getStore(Chain, dl, Nest, Addr,
+                              MachinePointerInfo(TrmpAddr, 12), false, false,
+                              0);
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
+                     DAG.getConstant(16, MVT::i32));
+  OutChains[4] = DAG.getStore(Chain, dl, FPtr, Addr,
+                              MachinePointerInfo(TrmpAddr, 16), false, false,
+                              0);
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 5);
+}
+
+SDValue XCoreTargetLowering::
+LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  switch (IntNo) {
+    case Intrinsic::xcore_crc8:
+      EVT VT = Op.getValueType();
+      SDValue Data =
+        DAG.getNode(XCoreISD::CRC8, DL, DAG.getVTList(VT, VT),
+                    Op.getOperand(1), Op.getOperand(2) , Op.getOperand(3));
+      SDValue Crc(Data.getNode(), 1);
+      SDValue Results[] = { Crc, Data };
+      return DAG.getMergeValues(Results, 2, DL);
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "XCoreGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// XCore call implementation
+SDValue
+XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                               SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG                     = CLI.DAG;
+  DebugLoc &dl                          = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDValue Chain                         = CLI.Chain;
+  SDValue Callee                        = CLI.Callee;
+  bool &isTailCall                      = CLI.IsTailCall;
+  CallingConv::ID CallConv              = CLI.CallConv;
+  bool isVarArg                         = CLI.IsVarArg;
+
+  // XCore target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // For now, only CallingConv::C implemented
+  switch (CallConv)
+  {
+    default:
+      llvm_unreachable("Unsupported calling convention");
+    case CallingConv::Fast:
+    case CallingConv::C:
+      return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+                            Outs, OutVals, Ins, dl, DAG, InVals);
+  }
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual
+/// regs to (physical regs)/(stack frame), CALLSEQ_START and
+/// CALLSEQ_END are emitted.
+/// TODO: isTailCall, sret.
+SDValue
+XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<SDValue> &OutVals,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) const {
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // The ABI dictates there should be one stack slot available to the callee
+  // on function entry (for saving lr).
+  CCInfo.AllocateStack(4, 4);
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_XCore);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes,
+                                 getPointerTy(), true));
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      int Offset = VA.getLocMemOffset();
+
+      MemOpChains.push_back(DAG.getNode(XCoreISD::STWSP, dl, MVT::Other,
+                                        Chain, Arg,
+                                        DAG.getConstant(Offset/4, MVT::i32)));
+    }
+  }
+
+  // Transform all store nodes into one single node because
+  // all store nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag in necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, MVT::i32);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+  // XCoreBranchLink = #chain, #target_address, #opt_in_flags...
+  //             = Chain, Callee, Reg#1, Reg#2, ...
+  //
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain  = DAG.getNode(XCoreISD::BL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, getPointerTy(), true),
+                             DAG.getConstant(0, getPointerTy(), true),
+                             InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                     CallingConv::ID CallConv, bool isVarArg,
+                                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                                     DebugLoc dl, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &InVals) const {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+                                 RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// XCore formal arguments implementation
+SDValue
+XCoreTargetLowering::LowerFormalArguments(SDValue Chain,
+                                          CallingConv::ID CallConv,
+                                          bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                          DebugLoc dl,
+                                          SelectionDAG &DAG,
+                                          SmallVectorImpl<SDValue> &InVals)
+                                            const {
+  switch (CallConv)
+  {
+    default:
+      llvm_unreachable("Unsupported calling convention");
+    case CallingConv::C:
+    case CallingConv::Fast:
+      return LowerCCCArguments(Chain, CallConv, isVarArg,
+                               Ins, dl, DAG, InVals);
+  }
+}
+
+/// LowerCCCArguments - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments places on the stack.
+/// TODO: sret
+SDValue
+XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
+                                       CallingConv::ID CallConv,
+                                       bool isVarArg,
+                                       const SmallVectorImpl<ISD::InputArg>
+                                         &Ins,
+                                       DebugLoc dl,
+                                       SelectionDAG &DAG,
+                                       SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
+
+  unsigned StackSlotSize = XCoreFrameLowering::stackSlotSize();
+
+  unsigned LRSaveSize = StackSlotSize;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+
+    CCValAssign &VA = ArgLocs[i];
+
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default:
+        {
+#ifndef NDEBUG
+          errs() << "LowerFormalArguments Unhandled argument type: "
+                 << RegVT.getSimpleVT().SimpleTy << "\n";
+#endif
+          llvm_unreachable(0);
+        }
+      case MVT::i32:
+        unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      }
+    } else {
+      // sanity check
+      assert(VA.isMemLoc());
+      // Load the argument to a virtual register
+      unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+      if (ObjSize > StackSlotSize) {
+        errs() << "LowerFormalArguments Unhandled argument type: "
+               << EVT(VA.getLocVT()).getEVTString()
+               << "\n";
+      }
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI->CreateFixedObject(ObjSize,
+                                      LRSaveSize + VA.getLocMemOffset(),
+                                      true);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      //from this parameter
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+      InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                                   MachinePointerInfo::getFixedStack(FI),
+                                   false, false, false, 0));
+    }
+  }
+
+  if (isVarArg) {
+    /* Argument registers */
+    static const uint16_t ArgRegs[] = {
+      XCore::R0, XCore::R1, XCore::R2, XCore::R3
+    };
+    XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+    unsigned FirstVAReg = CCInfo.getFirstUnallocated(ArgRegs,
+                                                     array_lengthof(ArgRegs));
+    if (FirstVAReg < array_lengthof(ArgRegs)) {
+      SmallVector<SDValue, 4> MemOps;
+      int offset = 0;
+      // Save remaining registers, storing higher register numbers at a higher
+      // address
+      for (int i = array_lengthof(ArgRegs) - 1; i >= (int)FirstVAReg; --i) {
+        // Create a stack slot
+        int FI = MFI->CreateFixedObject(4, offset, true);
+        if (i == (int)FirstVAReg) {
+          XFI->setVarArgsFrameIndex(FI);
+        }
+        offset -= StackSlotSize;
+        SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+        // Move argument from phys reg -> virt reg
+        unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
+        RegInfo.addLiveIn(ArgRegs[i], VReg);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+        // Move argument from virt reg -> stack
+        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                                     MachinePointerInfo(), false, false, 0);
+        MemOps.push_back(Store);
+      }
+      if (!MemOps.empty())
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                            &MemOps[0], MemOps.size());
+    } else {
+      // This will point to the next argument passed via stack.
+      XFI->setVarArgsFrameIndex(
+        MFI->CreateFixedObject(4, LRSaveSize + CCInfo.getNextStackOffset(),
+                               true));
+    }
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool XCoreTargetLowering::
+CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+               bool isVarArg,
+               const SmallVectorImpl<ISD::OutputArg> &Outs,
+               LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_XCore);
+}
+
+SDValue
+XCoreTargetLowering::LowerReturn(SDValue Chain,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 DebugLoc dl, SelectionDAG &DAG) const {
+
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
+
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+
+  // Return on XCore is always a "retsp 0"
+  RetOps.push_back(DAG.getConstant(0, MVT::i32));
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             OutVals[i], Flag);
+
+    // guarantee that all emitted copies are
+    // stuck together, avoiding something bad
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other,
+                     &RetOps[0], RetOps.size());
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB) const {
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  assert((MI->getOpcode() == XCore::SELECT_CC) &&
+         "Unexpected instr type to insert");
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Transfer the remainder of BB and its successor edges to sinkMBB.
+  sinkMBB->splice(sinkMBB->begin(), BB,
+                  llvm::next(MachineBasicBlock::iterator(MI)),
+                  BB->end());
+  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  BuildMI(BB, dl, TII.get(XCore::BRFT_lru6))
+    .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(*BB, BB->begin(), dl,
+          TII.get(XCore::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc dl = N->getDebugLoc();
+  switch (N->getOpcode()) {
+  default: break;
+  case XCoreISD::LADD: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    EVT VT = N0.getValueType();
+
+    // canonicalize constant to RHS
+    if (N0C && !N1C)
+      return DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N1, N0, N2);
+
+    // fold (ladd 0, 0, x) -> 0, x & 1
+    if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+      SDValue Carry = DAG.getConstant(0, VT);
+      SDValue Result = DAG.getNode(ISD::AND, dl, VT, N2,
+                                   DAG.getConstant(1, VT));
+      SDValue Ops[] = { Result, Carry };
+      return DAG.getMergeValues(Ops, 2, dl);
+    }
+
+    // fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the
+    // low bit set
+    if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+      APInt KnownZero, KnownOne;
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - 1);
+      DAG.ComputeMaskedBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
+        SDValue Carry = DAG.getConstant(0, VT);
+        SDValue Result = DAG.getNode(ISD::ADD, dl, VT, N0, N2);
+        SDValue Ops[] = { Result, Carry };
+        return DAG.getMergeValues(Ops, 2, dl);
+      }
+    }
+  }
+  break;
+  case XCoreISD::LSUB: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    EVT VT = N0.getValueType();
+
+    // fold (lsub 0, 0, x) -> x, -x iff x has only the low bit set
+    if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
+      APInt KnownZero, KnownOne;
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - 1);
+      DAG.ComputeMaskedBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
+        SDValue Borrow = N2;
+        SDValue Result = DAG.getNode(ISD::SUB, dl, VT,
+                                     DAG.getConstant(0, VT), N2);
+        SDValue Ops[] = { Result, Borrow };
+        return DAG.getMergeValues(Ops, 2, dl);
+      }
+    }
+
+    // fold (lsub x, 0, y) -> 0, sub x, y iff borrow is unused and y has only the
+    // low bit set
+    if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
+      APInt KnownZero, KnownOne;
+      APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+                                         VT.getSizeInBits() - 1);
+      DAG.ComputeMaskedBits(N2, KnownZero, KnownOne);
+      if ((KnownZero & Mask) == Mask) {
+        SDValue Borrow = DAG.getConstant(0, VT);
+        SDValue Result = DAG.getNode(ISD::SUB, dl, VT, N0, N2);
+        SDValue Ops[] = { Result, Borrow };
+        return DAG.getMergeValues(Ops, 2, dl);
+      }
+    }
+  }
+  break;
+  case XCoreISD::LMUL: {
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    SDValue N3 = N->getOperand(3);
+    ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    EVT VT = N0.getValueType();
+    // Canonicalize multiplicative constant to RHS. If both multiplicative
+    // operands are constant canonicalize smallest to RHS.
+    if ((N0C && !N1C) ||
+        (N0C && N1C && N0C->getZExtValue() < N1C->getZExtValue()))
+      return DAG.getNode(XCoreISD::LMUL, dl, DAG.getVTList(VT, VT),
+                         N1, N0, N2, N3);
+
+    // lmul(x, 0, a, b)
+    if (N1C && N1C->isNullValue()) {
+      // If the high result is unused fold to add(a, b)
+      if (N->hasNUsesOfValue(0, 0)) {
+        SDValue Lo = DAG.getNode(ISD::ADD, dl, VT, N2, N3);
+        SDValue Ops[] = { Lo, Lo };
+        return DAG.getMergeValues(Ops, 2, dl);
+      }
+      // Otherwise fold to ladd(a, b, 0)
+      SDValue Result =
+        DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N2, N3, N1);
+      SDValue Carry(Result.getNode(), 1);
+      SDValue Ops[] = { Carry, Result };
+      return DAG.getMergeValues(Ops, 2, dl);
+    }
+  }
+  break;
+  case ISD::ADD: {
+    // Fold 32 bit expressions such as add(add(mul(x,y),a),b) ->
+    // lmul(x, y, a, b). The high result of lmul will be ignored.
+    // This is only profitable if the intermediate results are unused
+    // elsewhere.
+    SDValue Mul0, Mul1, Addend0, Addend1;
+    if (N->getValueType(0) == MVT::i32 &&
+        isADDADDMUL(SDValue(N, 0), Mul0, Mul1, Addend0, Addend1, true)) {
+      SDValue Ignored = DAG.getNode(XCoreISD::LMUL, dl,
+                                    DAG.getVTList(MVT::i32, MVT::i32), Mul0,
+                                    Mul1, Addend0, Addend1);
+      SDValue Result(Ignored.getNode(), 1);
+      return Result;
+    }
+    APInt HighMask = APInt::getHighBitsSet(64, 32);
+    // Fold 64 bit expression such as add(add(mul(x,y),a),b) ->
+    // lmul(x, y, a, b) if all operands are zero-extended. We do this
+    // before type legalization as it is messy to match the operands after
+    // that.
+    if (N->getValueType(0) == MVT::i64 &&
+        isADDADDMUL(SDValue(N, 0), Mul0, Mul1, Addend0, Addend1, false) &&
+        DAG.MaskedValueIsZero(Mul0, HighMask) &&
+        DAG.MaskedValueIsZero(Mul1, HighMask) &&
+        DAG.MaskedValueIsZero(Addend0, HighMask) &&
+        DAG.MaskedValueIsZero(Addend1, HighMask)) {
+      SDValue Mul0L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                  Mul0, DAG.getConstant(0, MVT::i32));
+      SDValue Mul1L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                  Mul1, DAG.getConstant(0, MVT::i32));
+      SDValue Addend0L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                     Addend0, DAG.getConstant(0, MVT::i32));
+      SDValue Addend1L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                     Addend1, DAG.getConstant(0, MVT::i32));
+      SDValue Hi = DAG.getNode(XCoreISD::LMUL, dl,
+                               DAG.getVTList(MVT::i32, MVT::i32), Mul0L, Mul1L,
+                               Addend0L, Addend1L);
+      SDValue Lo(Hi.getNode(), 1);
+      return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+    }
+  }
+  break;
+  case ISD::STORE: {
+    // Replace unaligned store of unaligned load with memmove.
+    StoreSDNode *ST  = cast<StoreSDNode>(N);
+    if (!DCI.isBeforeLegalize() ||
+        allowsUnalignedMemoryAccesses(ST->getMemoryVT()) ||
+        ST->isVolatile() || ST->isIndexed()) {
+      break;
+    }
+    SDValue Chain = ST->getChain();
+
+    unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits();
+    if (StoreBits % 8) {
+      break;
+    }
+    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(
+        ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
+    unsigned Alignment = ST->getAlignment();
+    if (Alignment >= ABIAlignment) {
+      break;
+    }
+
+    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(ST->getValue())) {
+      if (LD->hasNUsesOfValue(1, 0) && ST->getMemoryVT() == LD->getMemoryVT() &&
+        LD->getAlignment() == Alignment &&
+        !LD->isVolatile() && !LD->isIndexed() &&
+        Chain.reachesChainWithoutSideEffects(SDValue(LD, 1))) {
+        return DAG.getMemmove(Chain, dl, ST->getBasePtr(),
+                              LD->getBasePtr(),
+                              DAG.getConstant(StoreBits/8, MVT::i32),
+                              Alignment, false, ST->getPointerInfo(),
+                              LD->getPointerInfo());
+      }
+    }
+    break;
+  }
+  }
+  return SDValue();
+}
+
+void XCoreTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                         APInt &KnownZero,
+                                                         APInt &KnownOne,
+                                                         const SelectionDAG &DAG,
+                                                         unsigned Depth) const {
+  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case XCoreISD::LADD:
+  case XCoreISD::LSUB:
+    if (Op.getResNo() == 1) {
+      // Top bits of carry / borrow are clear.
+      KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
+                                        KnownZero.getBitWidth() - 1);
+    }
+    break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Addressing mode description hooks
+//===----------------------------------------------------------------------===//
+
+static inline bool isImmUs(int64_t val)
+{
+  return (val >= 0 && val <= 11);
+}
+
+static inline bool isImmUs2(int64_t val)
+{
+  return (val%2 == 0 && isImmUs(val/2));
+}
+
+static inline bool isImmUs4(int64_t val)
+{
+  return (val%4 == 0 && isImmUs(val/4));
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool
+XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              Type *Ty) const {
+  if (Ty->getTypeID() == Type::VoidTyID)
+    return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
+
+  const DataLayout *TD = TM.getDataLayout();
+  unsigned Size = TD->getTypeAllocSize(Ty);
+  if (AM.BaseGV) {
+    return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
+                 AM.BaseOffs%4 == 0;
+  }
+
+  switch (Size) {
+  case 1:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs(AM.BaseOffs);
+    }
+    // reg + reg
+    return AM.Scale == 1 && AM.BaseOffs == 0;
+  case 2:
+  case 3:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs2(AM.BaseOffs);
+    }
+    // reg + reg<<1
+    return AM.Scale == 2 && AM.BaseOffs == 0;
+  default:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs4(AM.BaseOffs);
+    }
+    // reg + reg<<2
+    return AM.Scale == 4 && AM.BaseOffs == 0;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           XCore Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+std::pair<unsigned, const TargetRegisterClass*>
+XCoreTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+                             EVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default : break;
+    case 'r':
+      return std::make_pair(0U, &XCore::GRRegsRegClass);
+    }
+  }
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
new file mode 100644
index 000000000000..8d258f5054c1
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
@@ -0,0 +1,199 @@
+//===-- XCoreISelLowering.h - XCore DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that XCore uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREISELLOWERING_H
+#define XCOREISELLOWERING_H
+
+#include "XCore.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+  // Forward delcarations
+  class XCoreSubtarget;
+  class XCoreTargetMachine;
+
+  namespace XCoreISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      // Branch and link (call)
+      BL,
+
+      // pc relative address
+      PCRelativeWrapper,
+
+      // dp relative address
+      DPRelativeWrapper,
+
+      // cp relative address
+      CPRelativeWrapper,
+
+      // Store word to stack
+      STWSP,
+
+      // Corresponds to retsp instruction
+      RETSP,
+
+      // Corresponds to LADD instruction
+      LADD,
+
+      // Corresponds to LSUB instruction
+      LSUB,
+
+      // Corresponds to LMUL instruction
+      LMUL,
+
+      // Corresponds to MACCU instruction
+      MACCU,
+
+      // Corresponds to MACCS instruction
+      MACCS,
+
+      // Corresponds to CRC8 instruction
+      CRC8,
+
+      // Jumptable branch.
+      BR_JT,
+
+      // Jumptable branch using long branches for each entry.
+      BR_JT32
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  class XCoreTargetLowering : public TargetLowering
+  {
+  public:
+
+    explicit XCoreTargetLowering(XCoreTargetMachine &TM);
+
+    virtual unsigned getJumpTableEncoding() const;
+    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG) const;
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    //  DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    virtual MachineBasicBlock *
+      EmitInstrWithCustomInserter(MachineInstr *MI,
+                                  MachineBasicBlock *MBB) const;
+
+    virtual bool isLegalAddressingMode(const AddrMode &AM,
+                                       Type *Ty) const;
+
+  private:
+    const XCoreTargetMachine &TM;
+    const XCoreSubtarget &Subtarget;
+
+    // Lower Operand helpers
+    SDValue LowerCCCArguments(SDValue Chain,
+                              CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              DebugLoc dl, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           bool isTailCall,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<SDValue> &OutVals,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals) const;
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
+    SDValue getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
+                                    SelectionDAG &DAG) const;
+
+    // Lower Operand specifics
+    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+
+    // Inline asm support
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint,
+                                 EVT VT) const;
+
+    // Expand specifics
+    SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
+    SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG) const;
+
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                APInt &KnownZero,
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv,
+                           bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                SmallVectorImpl<SDValue> &InVals) const;
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  const SmallVectorImpl<SDValue> &OutVals,
+                  DebugLoc dl, SelectionDAG &DAG) const;
+
+    virtual bool
+      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                     bool isVarArg,
+                     const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
+                     LLVMContext &Context) const;
+  };
+}
+
+#endif // XCOREISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td b/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td
new file mode 100644
index 000000000000..379cc39aa617
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrFormats.td
@@ -0,0 +1,277 @@
+//===-- XCoreInstrFormats.td - XCore Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+class InstXCore<int sz, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : Instruction {
+  field bits<32> Inst;
+
+  let Namespace = "XCore";
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+  let Size = sz;
+  field bits<32> SoftFail = 0;
+}
+
+// XCore pseudo instructions format
+class PseudoInstXCore<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstXCore<0, outs, ins, asmstr, pattern> {
+  let isPseudo = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+
+class _F3R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc;
+  let DecoderMethod = "Decode3RInstruction";
+}
+
+// 3R with first operand as an immediate. Used for TSETR where the first
+// operand is treated as an immediate since it refers to a register number in
+// another thread.
+class _F3RImm<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _F3R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode3RImmInstruction";
+}
+
+class _FL3R<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{8-4};
+  let Inst{26-20} = 0b1111110;
+  let Inst{19-16} = opc{3-0};
+
+  let Inst{15-11} = 0b11111;
+  let DecoderMethod = "DecodeL3RInstruction";
+}
+
+// L3R with first operand as both a source and a destination.
+class _FL3RSrcDst<bits<9> opc, dag outs, dag ins, string asmstr,
+                  list<dag> pattern> : _FL3R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL3RSrcDstInstruction";
+}
+
+class _F2RUS<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc;
+  let DecoderMethod = "Decode2RUSInstruction";
+}
+
+// 2RUS with bitp operand
+class _F2RUSBitp<bits<5> opc, dag outs, dag ins, string asmstr,
+                 list<dag> pattern>
+    : _F2RUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode2RUSBitpInstruction";
+}
+
+class _FL2RUS<bits<9> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{8-4};
+  let Inst{26-20} = 0b1111110;
+  let Inst{19-16} = opc{3-0};
+
+  let Inst{15-11} = 0b11111;
+  let DecoderMethod = "DecodeL2RUSInstruction";
+}
+
+// L2RUS with bitp operand
+class _FL2RUSBitp<bits<9> opc, dag outs, dag ins, string asmstr,
+                  list<dag> pattern>
+    : _FL2RUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL2RUSBitpInstruction";
+}
+
+class _FRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<4> a;
+  bits<6> b;
+
+  let Inst{15-10} = opc;
+  let Inst{9-6} = a;
+  let Inst{5-0} = b;
+}
+
+class _FLRU6<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<4> a;
+  bits<16> b;
+
+  let Inst{31-26} = opc;
+  let Inst{25-22} = a;
+  let Inst{21-16} = b{5-0};
+  let Inst{15-10} = 0b111100;
+  let Inst{9-0} = b{15-6};
+}
+
+class _FU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<6> a;
+
+  let Inst{15-6} = opc;
+  let Inst{5-0} = a;
+}
+
+class _FLU6<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<16> a;
+
+  let Inst{31-22} = opc;
+  let Inst{21-16} = a{5-0};
+  let Inst{15-10} = 0b111100;
+  let Inst{9-0} = a{15-6};
+}
+
+class _FU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<10> a;
+
+  let Inst{15-10} = opc;
+  let Inst{9-0} = a;
+}
+
+class _FLU10<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<20> a;
+
+  let Inst{31-26} = opc;
+  let Inst{25-16} = a{9-0};
+  let Inst{15-10} = 0b111100;
+  let Inst{9-0} = a{19-10};
+}
+
+class _F2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc{5-1};
+  let Inst{4} = opc{0};
+  let DecoderMethod = "Decode2RInstruction";
+}
+
+// 2R with first operand as an immediate. Used for TSETMR where the first
+// operand is treated as an immediate since it refers to a register number in
+// another thread.
+class _F2RImm<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _F2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode2RImmInstruction";
+}
+
+// 2R with first operand as both a source and a destination.
+class _F2RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+                 list<dag> pattern> : _F2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "Decode2RSrcDstInstruction";
+}
+
+// Same as 2R with last two operands swapped
+class _FR2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _F2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeR2RInstruction";
+}
+
+class _FRUS<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc{5-1};
+  let Inst{4} = opc{0};
+  let DecoderMethod = "DecodeRUSInstruction";
+}
+
+// RUS with bitp operand
+class _FRUSBitp<bits<6> opc, dag outs, dag ins, string asmstr,
+                list<dag> pattern>
+    : _FRUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeRUSBitpInstruction";
+}
+
+// RUS with first operand as both a source and a destination and a bitp second
+// operand
+class _FRUSSrcDstBitp<bits<6> opc, dag outs, dag ins, string asmstr,
+                      list<dag> pattern>
+    : _FRUS<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeRUSSrcDstBitpInstruction";
+}
+
+class _FL2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{9-5};
+  let Inst{26-20} = 0b1111110;
+  let Inst{19-16} = opc{4-1};
+
+  let Inst{15-11} = 0b11111;
+  let Inst{4} = opc{0};
+  let DecoderMethod = "DecodeL2RInstruction";
+}
+
+// Same as L2R with last two operands swapped
+class _FLR2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : _FL2R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeLR2RInstruction";
+}
+
+class _F1R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  bits<4> a;
+
+  let Inst{15-11} = opc{5-1};
+  let Inst{10-5} = 0b111111;
+  let Inst{4} = opc{0};
+  let Inst{3-0} = a;
+}
+
+class _F0R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<2, outs, ins, asmstr, pattern> {
+  let Inst{15-11} = opc{9-5};
+  let Inst{10-5} = 0b111111;
+  let Inst{4-0} = opc{4-0};
+}
+
+class _FL4R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  bits<4> d;
+
+  let Inst{31-27} = opc{5-1};
+  let Inst{26-21} = 0b111111;
+  let Inst{20} = opc{0};
+  let Inst{19-16} = d;
+  let Inst{15-11} = 0b11111;
+}
+
+// L4R with 4th operand as both a source and a destination.
+class _FL4RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+                  list<dag> pattern>
+    : _FL4R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL4RSrcDstInstruction";
+}
+
+// L4R with 1st and 4th operand as both a source and a destination.
+class _FL4RSrcDstSrcDst<bits<6> opc, dag outs, dag ins, string asmstr,
+                        list<dag> pattern>
+    : _FL4R<opc, outs, ins, asmstr, pattern> {
+  let DecoderMethod = "DecodeL4RSrcDstSrcDstInstruction";
+}
+
+class _FL5R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc{5-1};
+  let Inst{20} = opc{0};
+  let Inst{15-11} = 0b11111;
+
+  let DecoderMethod = "DecodeL5RInstruction";
+}
+
+class _FL6R<bits<5> opc, dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<4, outs, ins, asmstr, pattern> {
+  let Inst{31-27} = opc;
+  let Inst{15-11} = 0b11111;
+
+  let DecoderMethod = "DecodeL6RInstruction";
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
new file mode 100644
index 000000000000..e457e0dbf027
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -0,0 +1,406 @@
+//===-- XCoreInstrInfo.cpp - XCore Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreInstrInfo.h"
+#include "XCore.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_CTOR
+#include "XCoreGenInstrInfo.inc"
+
+namespace llvm {
+namespace XCore {
+
+  // XCore Condition Codes
+  enum CondCode {
+    COND_TRUE,
+    COND_FALSE,
+    COND_INVALID
+  };
+}
+}
+
+using namespace llvm;
+
+XCoreInstrInfo::XCoreInstrInfo()
+  : XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+    RI(*this) {
+}
+
+static bool isZeroImm(const MachineOperand &op) {
+  return op.isImm() && op.getImm() == 0;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned
+XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const{
+  int Opcode = MI->getOpcode();
+  if (Opcode == XCore::LDWFI) 
+  {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2)))) 
+    {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+unsigned
+XCoreInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                   int &FrameIndex) const {
+  int Opcode = MI->getOpcode();
+  if (Opcode == XCore::STWFI)
+  {
+    if ((MI->getOperand(1).isFI()) && // is a stack slot
+        (MI->getOperand(2).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(2))))
+    {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+static inline bool IsBRU(unsigned BrOpc) {
+  return BrOpc == XCore::BRFU_u6
+      || BrOpc == XCore::BRFU_lu6
+      || BrOpc == XCore::BRBU_u6
+      || BrOpc == XCore::BRBU_lu6;
+}
+
+static inline bool IsBRT(unsigned BrOpc) {
+  return BrOpc == XCore::BRFT_ru6
+      || BrOpc == XCore::BRFT_lru6
+      || BrOpc == XCore::BRBT_ru6
+      || BrOpc == XCore::BRBT_lru6;
+}
+
+static inline bool IsBRF(unsigned BrOpc) {
+  return BrOpc == XCore::BRFF_ru6
+      || BrOpc == XCore::BRFF_lru6
+      || BrOpc == XCore::BRBF_ru6
+      || BrOpc == XCore::BRBF_lru6;
+}
+
+static inline bool IsCondBranch(unsigned BrOpc) {
+  return IsBRF(BrOpc) || IsBRT(BrOpc);
+}
+
+static inline bool IsBR_JT(unsigned BrOpc) {
+  return BrOpc == XCore::BR_JT
+      || BrOpc == XCore::BR_JT32;
+}
+
+/// GetCondFromBranchOpc - Return the XCore CC that matches 
+/// the correspondent Branch instruction opcode.
+static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc) 
+{
+  if (IsBRT(BrOpc)) {
+    return XCore::COND_TRUE;
+  } else if (IsBRF(BrOpc)) {
+    return XCore::COND_FALSE;
+  } else {
+    return XCore::COND_INVALID;
+  }
+}
+
+/// GetCondBranchFromCond - Return the Branch instruction
+/// opcode that matches the cc.
+static inline unsigned GetCondBranchFromCond(XCore::CondCode CC) 
+{
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case XCore::COND_TRUE   : return XCore::BRFT_lru6;
+  case XCore::COND_FALSE  : return XCore::BRFF_lru6;
+  }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified 
+/// condition, e.g. turning COND_E to COND_NE.
+static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC)
+{
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case XCore::COND_TRUE   : return XCore::COND_FALSE;
+  case XCore::COND_FALSE  : return XCore::COND_TRUE;
+  }
+}
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target).  Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+///    just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+///    the destination block.
+/// 3. If this block ends with an conditional branch and it falls through to
+///    an successor block, it sets TBB to be the branch destination block and a
+///    list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+/// 4. If this block ends with an conditional branch and an unconditional
+///    block, it returns the 'true' destination in TBB, the 'false' destination
+///    in FBB, and a list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+///
+/// Note that RemoveBranch and InsertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool
+XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                              MachineBasicBlock *&FBB,
+                              SmallVectorImpl<MachineOperand> &Cond,
+                              bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return false;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return false;
+    --I;
+  }
+  if (!isUnpredicatedTerminator(I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (IsBRU(LastInst->getOpcode())) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    
+    XCore::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+    if (BranchCode == XCore::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+    
+    // Conditional branch
+    // Block ends with fall-through condbranch.
+
+    TBB = LastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    Cond.push_back(LastInst->getOperand(0));
+    return false;
+  }
+  
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+  
+  unsigned SecondLastOpc    = SecondLastInst->getOpcode();
+  XCore::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc);
+  
+  // If the block ends with conditional branch followed by unconditional,
+  // handle it.
+  if (BranchCode != XCore::COND_INVALID
+    && IsBRU(LastInst->getOpcode())) {
+
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    Cond.push_back(SecondLastInst->getOperand(0));
+
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+  
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (IsBRU(SecondLastInst->getOpcode()) && 
+      IsBRU(LastInst->getOpcode())) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Likewise if it ends with a branch table followed by an unconditional branch.
+  if (IsBR_JT(SecondLastInst->getOpcode()) && IsBRU(LastInst->getOpcode())) {
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned
+XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond,
+                             DebugLoc DL)const{
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "Unexpected number of components!");
+  
+  if (FBB == 0) { // One way branch.
+    if (Cond.empty()) {
+      // Unconditional branch
+      BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(TBB);
+    } else {
+      // Conditional branch.
+      unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+      BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg())
+                             .addMBB(TBB);
+    }
+    return 1;
+  }
+  
+  // Two-way Conditional branch.
+  assert(Cond.size() == 2 && "Unexpected number of components!");
+  unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+  BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg())
+                         .addMBB(TBB);
+  BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(FBB);
+  return 2;
+}
+
+unsigned
+XCoreInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  while (I->isDebugValue()) {
+    if (I == MBB.begin())
+      return 0;
+    --I;
+  }
+  if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode()))
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (!IsCondBranch(I->getOpcode()))
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator I, DebugLoc DL,
+                                 unsigned DestReg, unsigned SrcReg,
+                                 bool KillSrc) const {
+  bool GRDest = XCore::GRRegsRegClass.contains(DestReg);
+  bool GRSrc  = XCore::GRRegsRegClass.contains(SrcReg);
+
+  if (GRDest && GRSrc) {
+    BuildMI(MBB, I, DL, get(XCore::ADD_2rus), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc))
+      .addImm(0);
+    return;
+  }
+  
+  if (GRDest && SrcReg == XCore::SP) {
+    BuildMI(MBB, I, DL, get(XCore::LDAWSP_ru6), DestReg).addImm(0);
+    return;
+  }
+
+  if (DestReg == XCore::SP && GRSrc) {
+    BuildMI(MBB, I, DL, get(XCore::SETSP_1r))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  llvm_unreachable("Impossible reg-to-reg copy");
+}
+
+void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned SrcReg, bool isKill,
+                                         int FrameIndex,
+                                         const TargetRegisterClass *RC,
+                                         const TargetRegisterInfo *TRI) const
+{
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  BuildMI(MBB, I, DL, get(XCore::STWFI))
+    .addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FrameIndex)
+    .addImm(0);
+}
+
+void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          unsigned DestReg, int FrameIndex,
+                                          const TargetRegisterClass *RC,
+                                          const TargetRegisterInfo *TRI) const
+{
+  DebugLoc DL;
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg)
+    .addFrameIndex(FrameIndex)
+    .addImm(0);
+}
+
+MachineInstr*
+XCoreInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+                                         uint64_t Offset, const MDNode *MDPtr,
+                                         DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(XCore::DBG_VALUE))
+    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
+  return &*MIB;
+}
+
+/// ReverseBranchCondition - Return the inverse opcode of the 
+/// specified Branch instruction.
+bool XCoreInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert((Cond.size() == 2) && 
+          "Invalid XCore branch condition!");
+  Cond[0].setImm(GetOppositeBranchCondition((XCore::CondCode)Cond[0].getImm()));
+  return false;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
new file mode 100644
index 000000000000..42eeed8370f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -0,0 +1,93 @@
+//===-- XCoreInstrInfo.h - XCore Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREINSTRUCTIONINFO_H
+#define XCOREINSTRUCTIONINFO_H
+
+#include "XCoreRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "XCoreGenInstrInfo.inc"
+
+namespace llvm {
+
+class XCoreInstrInfo : public XCoreGenInstrInfo {
+  const XCoreRegisterInfo RI;
+public:
+  XCoreInstrInfo();
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+  
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const SmallVectorImpl<MachineOperand> &Cond,
+                                DebugLoc DL) const;
+  
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC,
+                                   const TargetRegisterInfo *TRI) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const;
+
+  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+                                                 int FrameIx,
+                                                 uint64_t Offset,
+                                                 const MDNode *MDPtr,
+                                                 DebugLoc DL) const;
+
+  virtual bool ReverseBranchCondition(
+                            SmallVectorImpl<MachineOperand> &Cond) const;
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
new file mode 100644
index 000000000000..03653cb2b3de
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
@@ -0,0 +1,1267 @@
+//===-- XCoreInstrInfo.td - Target Description for XCore ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the XCore instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Uses of CP, DP are not currently reflected in the patterns, since
+// having a physical register as an operand prevents loop hoisting and
+// since the value of these registers never changes during the life of the
+// function.
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass.
+//===----------------------------------------------------------------------===//
+
+include "XCoreInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// XCore specific DAG Nodes.
+//
+
+// Call
+def SDT_XCoreBranchLink : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def XCoreBranchLink     : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                             SDNPVariadic]>;
+
+def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind,
+                      [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad, SDNPVariadic]>;
+
+def SDT_XCoreBR_JT    : SDTypeProfile<0, 2,
+                                      [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+def XCoreBR_JT : SDNode<"XCoreISD::BR_JT", SDT_XCoreBR_JT,
+                        [SDNPHasChain]>;
+
+def XCoreBR_JT32 : SDNode<"XCoreISD::BR_JT32", SDT_XCoreBR_JT,
+                        [SDNPHasChain]>;
+
+def SDT_XCoreAddress    : SDTypeProfile<1, 1,
+                            [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def pcrelwrapper : SDNode<"XCoreISD::PCRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def dprelwrapper : SDNode<"XCoreISD::DPRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def SDT_XCoreStwsp    : SDTypeProfile<0, 2, [SDTCisInt<1>]>;
+def XCoreStwsp        : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
+                               [SDNPHasChain, SDNPMayStore]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_XCoreCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
+                           [SDNPHasChain, SDNPOutGlue]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_XCoreCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def div4_xform : SDNodeXForm<imm, [{
+  // Transformation function: imm/4
+  assert(N->getZExtValue() % 4 == 0);
+  return getI32Imm(N->getZExtValue()/4);
+}]>;
+
+def msksize_xform : SDNodeXForm<imm, [{
+  // Transformation function: get the size of a mask
+  assert(isMask_32(N->getZExtValue()));
+  // look for the first non-zero bit
+  return getI32Imm(32 - CountLeadingZeros_32(N->getZExtValue()));
+}]>;
+
+def neg_xform : SDNodeXForm<imm, [{
+  // Transformation function: -imm
+  uint32_t value = N->getZExtValue();
+  return getI32Imm(-value);
+}]>;
+
+def bpwsub_xform : SDNodeXForm<imm, [{
+  // Transformation function: 32-imm
+  uint32_t value = N->getZExtValue();
+  return getI32Imm(32-value);
+}]>;
+
+def div4neg_xform : SDNodeXForm<imm, [{
+  // Transformation function: -imm/4
+  uint32_t value = N->getZExtValue();
+  assert(-value % 4 == 0);
+  return getI32Imm(-value/4);
+}]>;
+
+def immUs4Neg : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (-value)%4 == 0 && (-value)/4 <= 11;
+}]>;
+
+def immUs4 : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return value%4 == 0 && value/4 <= 11;
+}]>;
+
+def immUsNeg : PatLeaf<(imm), [{
+  return -((uint32_t)N->getZExtValue()) <= 11;
+}]>;
+
+def immUs : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() <= 11;
+}]>;
+
+def immU6 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 6);
+}]>;
+
+def immU10 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 10);
+}]>;
+
+def immU16 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 16);
+}]>;
+
+def immU20 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 20);
+}]>;
+
+def immMskBitp : PatLeaf<(imm), [{ return immMskBitp(N); }]>;
+
+def immBitp : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (value >= 1 && value <= 8)
+          || value == 16
+          || value == 24
+          || value == 32;
+}]>;
+
+def immBpwSubBitp : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (value >= 24 && value <= 31)
+          || value == 16
+          || value == 8
+          || value == 0;
+}]>;
+
+def lda16f : PatFrag<(ops node:$addr, node:$offset),
+                     (add node:$addr, (shl node:$offset, 1))>;
+def lda16b : PatFrag<(ops node:$addr, node:$offset),
+                     (sub node:$addr, (shl node:$offset, 1))>;
+def ldawf : PatFrag<(ops node:$addr, node:$offset),
+                     (add node:$addr, (shl node:$offset, 2))>;
+def ldawb : PatFrag<(ops node:$addr, node:$offset),
+                     (sub node:$addr, (shl node:$offset, 2))>;
+
+// Instruction operand types
+def calltarget  : Operand<i32>;
+def brtarget : Operand<OtherVT>;
+def pclabel : Operand<i32>;
+
+// Addressing modes
+def ADDRspii : ComplexPattern<i32, 2, "SelectADDRspii", [add, frameindex], []>;
+def ADDRdpii : ComplexPattern<i32, 2, "SelectADDRdpii", [add, dprelwrapper],
+                 []>;
+def ADDRcpii : ComplexPattern<i32, 2, "SelectADDRcpii", [add, cprelwrapper],
+                 []>;
+
+// Address operands
+def MEMii : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let DecoderMethod = "DecodeMEMiiOperand";
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+// Jump tables.
+def InlineJT : Operand<i32> {
+  let PrintMethod = "printInlineJT";
+}
+
+def InlineJT32 : Operand<i32> {
+  let PrintMethod = "printInlineJT32";
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+
+multiclass F3R_2RUS<bits<5> opc1, bits<5> opc2, string OpcStr, SDNode OpNode> {
+  def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                !strconcat(OpcStr, " $dst, $b, $c"),
+                [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                     !strconcat(OpcStr, " $dst, $b, $c"),
+                     [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+multiclass F3R_2RUS_np<bits<5> opc1, bits<5> opc2, string OpcStr> {
+  def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                !strconcat(OpcStr, " $dst, $b, $c"), []>;
+  def _2rus : _F2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                     !strconcat(OpcStr, " $dst, $b, $c"), []>;
+}
+
+multiclass F3R_2RBITP<bits<5> opc1, bits<5> opc2, string OpcStr,
+                      SDNode OpNode> {
+  def _3r: _F3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                !strconcat(OpcStr, " $dst, $b, $c"),
+                [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _2rus : _F2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                         !strconcat(OpcStr, " $dst, $b, $c"),
+                         [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class F3R<bits<5> opc, string OpcStr, SDNode OpNode> :
+  _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+       !strconcat(OpcStr, " $dst, $b, $c"),
+       [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+class F3R_np<bits<5> opc, string OpcStr> :
+  _F3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+       !strconcat(OpcStr, " $dst, $b, $c"), []>;
+// Three operand long
+
+/// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot.
+multiclass FL3R_L2RUS<bits<9> opc1, bits<9> opc2, string OpcStr,
+                      SDNode OpNode> {
+  def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                  !strconcat(OpcStr, " $dst, $b, $c"),
+                  [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _l2rus : _FL2RUS<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                       !strconcat(OpcStr, " $dst, $b, $c"),
+                       [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+/// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot.
+multiclass FL3R_L2RBITP<bits<9> opc1, bits<9> opc2, string OpcStr,
+                        SDNode OpNode> {
+  def _l3r: _FL3R<opc1, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                  !strconcat(OpcStr, " $dst, $b, $c"),
+                  [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _l2rus : _FL2RUSBitp<opc2, (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                           !strconcat(OpcStr, " $dst, $b, $c"),
+                           [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class FL3R<bits<9> opc, string OpcStr, SDNode OpNode> :
+  _FL3R<opc, (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+        !strconcat(OpcStr, " $dst, $b, $c"),
+        [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+// Register - U6
+// Operand register - U6
+multiclass FRU6_LRU6_branch<bits<6> opc, string OpcStr> {
+  def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+                  !strconcat(OpcStr, " $a, $b"), []>;
+  def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+                    !strconcat(OpcStr, " $a, $b"), []>;
+}
+
+multiclass FRU6_LRU6_backwards_branch<bits<6> opc, string OpcStr> {
+  def _ru6: _FRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+                  !strconcat(OpcStr, " $a, -$b"), []>;
+  def _lru6: _FLRU6<opc, (outs), (ins GRRegs:$a, brtarget:$b),
+                    !strconcat(OpcStr, " $a, -$b"), []>;
+}
+
+multiclass FRU6_LRU6_cp<bits<6> opc, string OpcStr> {
+  def _ru6: _FRU6<opc, (outs RRegs:$a), (ins i32imm:$b),
+                  !strconcat(OpcStr, " $a, cp[$b]"), []>;
+  def _lru6: _FLRU6<opc, (outs RRegs:$a), (ins i32imm:$b),
+                    !strconcat(OpcStr, " $a, cp[$b]"), []>;
+}
+
+// U6
+multiclass FU6_LU6<bits<10> opc, string OpcStr, SDNode OpNode> {
+  def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                [(OpNode immU6:$a)]>;
+  def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                  [(OpNode immU16:$a)]>;
+}
+
+multiclass FU6_LU6_int<bits<10> opc, string OpcStr, Intrinsic Int> {
+  def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                [(Int immU6:$a)]>;
+  def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"),
+                  [(Int immU16:$a)]>;
+}
+
+multiclass FU6_LU6_np<bits<10> opc, string OpcStr> {
+  def _u6: _FU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>;
+  def _lu6: _FLU6<opc, (outs), (ins i32imm:$a), !strconcat(OpcStr, " $a"), []>;
+}
+
+// Two operand short
+
+class F2R_np<bits<6> opc, string OpcStr> :
+  _F2R<opc, (outs GRRegs:$dst), (ins GRRegs:$b),
+       !strconcat(OpcStr, " $dst, $b"), []>;
+
+// Two operand long
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
+                               "# ADJCALLSTACKDOWN $amt",
+                               [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "# ADJCALLSTACKUP $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+def LDWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+                             "# LDWFI $dst, $addr",
+                             [(set GRRegs:$dst, (load ADDRspii:$addr))]>;
+
+def LDAWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+                             "# LDAWFI $dst, $addr",
+                             [(set GRRegs:$dst, ADDRspii:$addr)]>;
+
+def STWFI : PseudoInstXCore<(outs), (ins GRRegs:$src, MEMii:$addr),
+                            "# STWFI $src, $addr",
+                            [(store GRRegs:$src, ADDRspii:$addr)]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1 in {
+  def SELECT_CC : PseudoInstXCore<(outs GRRegs:$dst),
+                              (ins GRRegs:$cond, GRRegs:$T, GRRegs:$F),
+                              "# SELECT_CC PSEUDO!",
+                              [(set GRRegs:$dst,
+                                 (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+defm ADD : F3R_2RUS<0b00010, 0b10010, "add", add>;
+defm SUB : F3R_2RUS<0b00011, 0b10011, "sub", sub>;
+let neverHasSideEffects = 1 in {
+defm EQ : F3R_2RUS_np<0b00110, 0b10110, "eq">;
+def LSS_3r : F3R_np<0b11000, "lss">;
+def LSU_3r : F3R_np<0b11001, "lsu">;
+}
+def AND_3r : F3R<0b00111, "and", and>;
+def OR_3r : F3R<0b01000, "or", or>;
+
+let mayLoad=1 in {
+def LDW_3r : _F3R<0b01001, (outs GRRegs:$dst),
+                  (ins GRRegs:$addr, GRRegs:$offset),
+                  "ldw $dst, $addr[$offset]", []>;
+
+def LDW_2rus : _F2RUS<0b00001, (outs GRRegs:$dst),
+                      (ins GRRegs:$addr, i32imm:$offset),
+                      "ldw $dst, $addr[$offset]", []>;
+
+def LD16S_3r :  _F3R<0b10000, (outs GRRegs:$dst),
+                     (ins GRRegs:$addr, GRRegs:$offset),
+                     "ld16s $dst, $addr[$offset]", []>;
+
+def LD8U_3r :  _F3R<0b10001, (outs GRRegs:$dst),
+                    (ins GRRegs:$addr, GRRegs:$offset),
+                    "ld8u $dst, $addr[$offset]", []>;
+}
+
+let mayStore=1 in {
+def STW_l3r : _FL3R<0b000001100, (outs),
+                    (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                    "stw $val, $addr[$offset]", []>;
+
+def STW_2rus : _F2RUS<0b0000, (outs),
+                      (ins GRRegs:$val, GRRegs:$addr, i32imm:$offset),
+                      "stw $val, $addr[$offset]", []>;
+}
+
+defm SHL : F3R_2RBITP<0b00100, 0b10100, "shl", shl>;
+defm SHR : F3R_2RBITP<0b00101, 0b10101, "shr", srl>;
+
+// The first operand is treated as an immediate since it refers to a register
+// number in another thread.
+def TSETR_3r : _F3RImm<0b10111, (outs), (ins i32imm:$a, GRRegs:$b, GRRegs:$c),
+                       "set t[$c]:r$a, $b", []>;
+
+// Three operand long
+def LDAWF_l3r : _FL3R<0b000111100, (outs GRRegs:$dst),
+                      (ins GRRegs:$addr, GRRegs:$offset),
+                      "ldaw $dst, $addr[$offset]",
+                      [(set GRRegs:$dst,
+                         (ldawf GRRegs:$addr, GRRegs:$offset))]>;
+
+let neverHasSideEffects = 1 in
+def LDAWF_l2rus : _FL2RUS<0b100111100, (outs GRRegs:$dst),
+                          (ins GRRegs:$addr, i32imm:$offset),
+                          "ldaw $dst, $addr[$offset]", []>;
+
+def LDAWB_l3r : _FL3R<0b001001100, (outs GRRegs:$dst),
+                      (ins GRRegs:$addr, GRRegs:$offset),
+                      "ldaw $dst, $addr[-$offset]",
+                      [(set GRRegs:$dst,
+                         (ldawb GRRegs:$addr, GRRegs:$offset))]>;
+
+let neverHasSideEffects = 1 in
+def LDAWB_l2rus : _FL2RUS<0b101001100, (outs GRRegs:$dst),
+                         (ins GRRegs:$addr, i32imm:$offset),
+                         "ldaw $dst, $addr[-$offset]", []>;
+
+def LDA16F_l3r : _FL3R<0b001011100, (outs GRRegs:$dst),
+                       (ins GRRegs:$addr, GRRegs:$offset),
+                       "lda16 $dst, $addr[$offset]",
+                       [(set GRRegs:$dst,
+                          (lda16f GRRegs:$addr, GRRegs:$offset))]>;
+
+def LDA16B_l3r : _FL3R<0b001101100, (outs GRRegs:$dst),
+                       (ins GRRegs:$addr, GRRegs:$offset),
+                       "lda16 $dst, $addr[-$offset]",
+                       [(set GRRegs:$dst,
+                          (lda16b GRRegs:$addr, GRRegs:$offset))]>;
+
+def MUL_l3r : FL3R<0b001111100, "mul", mul>;
+// Instructions which may trap are marked as side effecting.
+let hasSideEffects = 1 in {
+def DIVS_l3r : FL3R<0b010001100, "divs", sdiv>;
+def DIVU_l3r : FL3R<0b010011100, "divu", udiv>;
+def REMS_l3r : FL3R<0b110001100, "rems", srem>;
+def REMU_l3r : FL3R<0b110011100, "remu", urem>;
+}
+def XOR_l3r : FL3R<0b000011100, "xor", xor>;
+defm ASHR : FL3R_L2RBITP<0b000101100, 0b100101100, "ashr", sra>;
+
+let Constraints = "$src1 = $dst" in
+def CRC_l3r : _FL3RSrcDst<0b101011100, (outs GRRegs:$dst),
+                          (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                          "crc32 $dst, $src2, $src3",
+                          [(set GRRegs:$dst,
+                             (int_xcore_crc32 GRRegs:$src1, GRRegs:$src2,
+                                              GRRegs:$src3))]>;
+
+let mayStore=1 in {
+def ST16_l3r : _FL3R<0b100001100, (outs),
+                     (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                     "st16 $val, $addr[$offset]", []>;
+
+def ST8_l3r : _FL3R<0b100011100, (outs),
+                    (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                    "st8 $val, $addr[$offset]", []>;
+}
+
+def INPW_l2rus : _FL2RUSBitp<0b100101110, (outs GRRegs:$a),
+                             (ins GRRegs:$b, i32imm:$c), "inpw $a, res[$b], $c",
+                             []>;
+
+def OUTPW_l2rus : _FL2RUSBitp<0b100101101, (outs),
+                              (ins GRRegs:$a, GRRegs:$b, i32imm:$c),
+                              "outpw res[$b], $a, $c", []>;
+
+// Four operand long
+let Constraints = "$e = $a,$f = $b" in {
+def MACCU_l4r : _FL4RSrcDstSrcDst<
+  0b000001, (outs GRRegs:$a, GRRegs:$b),
+  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccu $a, $b, $c, $d", []>;
+
+def MACCS_l4r : _FL4RSrcDstSrcDst<
+  0b000010, (outs GRRegs:$a, GRRegs:$b),
+  (ins GRRegs:$e, GRRegs:$f, GRRegs:$c, GRRegs:$d), "maccs $a, $b, $c, $d", []>;
+}
+
+let Constraints = "$e = $b" in
+def CRC8_l4r : _FL4RSrcDst<0b000000, (outs GRRegs:$a, GRRegs:$b),
+                           (ins GRRegs:$e, GRRegs:$c, GRRegs:$d),
+                           "crc8 $b, $a, $c, $d", []>;
+
+// Five operand long
+
+def LADD_l5r : _FL5R<0b000001, (outs GRRegs:$dst1, GRRegs:$dst2),
+                     (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                     "ladd $dst2, $dst1, $src1, $src2, $src3",
+                     []>;
+
+def LSUB_l5r : _FL5R<0b000010, (outs GRRegs:$dst1, GRRegs:$dst2),
+                     (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                     "lsub $dst2, $dst1, $src1, $src2, $src3", []>;
+
+def LDIVU_l5r : _FL5R<0b000000, (outs GRRegs:$dst1, GRRegs:$dst2),
+                      (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                      "ldivu $dst1, $dst2, $src3, $src1, $src2", []>;
+
+// Six operand long
+
+def LMUL_l6r : _FL6R<
+  0b00000, (outs GRRegs:$dst1, GRRegs:$dst2),
+  (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3, GRRegs:$src4),
+  "lmul $dst1, $dst2, $src1, $src2, $src3, $src4", []>;
+
+// Register - U6
+
+//let Uses = [DP] in ...
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def LDAWDP_ru6: _FRU6<0b011000, (outs RRegs:$a), (ins MEMii:$b),
+                      "ldaw $a, dp[$b]", []>;
+
+let isReMaterializable = 1 in                    
+def LDAWDP_lru6: _FLRU6<0b011000, (outs RRegs:$a), (ins MEMii:$b),
+                        "ldaw $a, dp[$b]",
+                        [(set RRegs:$a, ADDRdpii:$b)]>;
+
+let mayLoad=1 in
+def LDWDP_ru6: _FRU6<0b010110, (outs RRegs:$a), (ins MEMii:$b),
+                     "ldw $a, dp[$b]", []>;
+
+def LDWDP_lru6: _FLRU6<0b010110, (outs RRegs:$a), (ins MEMii:$b),
+                       "ldw $a, dp[$b]",
+                       [(set RRegs:$a, (load ADDRdpii:$b))]>;
+
+let mayStore=1 in
+def STWDP_ru6 : _FRU6<0b010100, (outs), (ins RRegs:$a, MEMii:$b),
+                      "stw $a, dp[$b]", []>;
+
+def STWDP_lru6 : _FLRU6<0b010100, (outs), (ins RRegs:$a, MEMii:$b),
+                        "stw $a, dp[$b]",
+                        [(store RRegs:$a, ADDRdpii:$b)]>;
+
+//let Uses = [CP] in ..
+let mayLoad = 1, isReMaterializable = 1, neverHasSideEffects = 1 in
+defm LDWCP : FRU6_LRU6_cp<0b011011, "ldw">;
+
+let Uses = [SP] in {
+let mayStore=1 in {
+def STWSP_ru6 : _FRU6<0b010101, (outs), (ins RRegs:$a, i32imm:$b),
+                      "stw $a, sp[$b]",
+                      [(XCoreStwsp RRegs:$a, immU6:$b)]>;
+
+def STWSP_lru6 : _FLRU6<0b010101, (outs), (ins RRegs:$a, i32imm:$b),
+                        "stw $a, sp[$b]",
+                        [(XCoreStwsp RRegs:$a, immU16:$b)]>;
+}
+
+let mayLoad=1 in {
+def LDWSP_ru6 : _FRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldw $a, sp[$b]", []>;
+
+def LDWSP_lru6 : _FLRU6<0b010111, (outs RRegs:$a), (ins i32imm:$b),
+                        "ldw $a, sp[$b]", []>;
+}
+
+let neverHasSideEffects = 1 in {
+def LDAWSP_ru6 : _FRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b),
+                       "ldaw $a, sp[$b]", []>;
+
+def LDAWSP_lru6 : _FLRU6<0b011001, (outs RRegs:$a), (ins i32imm:$b),
+                         "ldaw $a, sp[$b]", []>;
+}
+}
+
+let isReMaterializable = 1 in {
+def LDC_ru6 : _FRU6<0b011010, (outs RRegs:$a), (ins i32imm:$b),
+                    "ldc $a, $b", [(set RRegs:$a, immU6:$b)]>;
+
+def LDC_lru6 : _FLRU6<0b011010, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldc $a, $b", [(set RRegs:$a, immU16:$b)]>;
+}
+
+def SETC_ru6 : _FRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b),
+                     "setc res[$a], $b",
+                     [(int_xcore_setc GRRegs:$a, immU6:$b)]>;
+
+def SETC_lru6 : _FLRU6<0b111010, (outs), (ins GRRegs:$a, i32imm:$b),
+                       "setc res[$a], $b",
+                       [(int_xcore_setc GRRegs:$a, immU16:$b)]>;
+
+// Operand register - U6
+let isBranch = 1, isTerminator = 1 in {
+defm BRFT: FRU6_LRU6_branch<0b011100, "bt">;
+defm BRBT: FRU6_LRU6_backwards_branch<0b011101, "bt">;
+defm BRFF: FRU6_LRU6_branch<0b011110, "bf">;
+defm BRBF: FRU6_LRU6_backwards_branch<0b011111, "bf">;
+}
+
+// U6
+let Defs = [SP], Uses = [SP] in {
+let neverHasSideEffects = 1 in
+defm EXTSP : FU6_LU6_np<0b0111011110, "extsp">;
+
+let mayStore = 1 in
+defm ENTSP : FU6_LU6_np<0b0111011101, "entsp">;
+
+let isReturn = 1, isTerminator = 1, mayLoad = 1, isBarrier = 1 in {
+defm RETSP : FU6_LU6<0b0111011111, "retsp", XCoreRetsp>;
+}
+}
+
+let neverHasSideEffects = 1 in
+defm EXTDP : FU6_LU6_np<0b0111001110, "extdp">;
+
+let Uses = [R11], isCall=1 in
+defm BLAT : FU6_LU6_np<0b0111001101, "blat">;
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def BRBU_u6 : _FU6<0b0111011100, (outs), (ins brtarget:$a), "bu -$a", []>;
+
+def BRBU_lu6 : _FLU6<0b0111011100, (outs), (ins brtarget:$a), "bu -$a", []>;
+
+def BRFU_u6 : _FU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
+
+def BRFU_lu6 : _FLU6<0b0111001100, (outs), (ins brtarget:$a), "bu $a", []>;
+}
+
+//let Uses = [CP] in ...
+let Defs = [R11], neverHasSideEffects = 1, isReMaterializable = 1 in
+def LDAWCP_u6: _FU6<0b0111111101, (outs), (ins MEMii:$a), "ldaw r11, cp[$a]",
+                    []>;
+
+let Defs = [R11], isReMaterializable = 1 in
+def LDAWCP_lu6: _FLU6<0b0111111101, (outs), (ins MEMii:$a), "ldaw r11, cp[$a]",
+                      [(set R11, ADDRcpii:$a)]>;
+
+let Defs = [R11] in
+defm GETSR : FU6_LU6_np<0b0111111100, "getsr r11,">;
+
+defm SETSR : FU6_LU6_int<0b0111101101, "setsr", int_xcore_setsr>;
+
+defm CLRSR : FU6_LU6_int<0b0111101100, "clrsr", int_xcore_clrsr>;
+
+// setsr may cause a branch if it is used to enable events. clrsr may
+// branch if it is executed while events are enabled.
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
+    isCodeGenOnly = 1 in {
+defm SETSR_branch : FU6_LU6_np<0b0111101101, "setsr">;
+defm CLRSR_branch : FU6_LU6_np<0b0111101100, "clrsr">;
+}
+
+defm KCALL : FU6_LU6_np<0b0111001111, "kcall">;
+
+let Uses = [SP], Defs = [SP], mayStore = 1 in
+defm KENTSP : FU6_LU6_np<0b0111101110, "kentsp">;
+
+let Uses = [SP], Defs = [SP], mayLoad = 1 in
+defm KRESTSP : FU6_LU6_np<0b0111101111, "krestsp">;
+
+// U10
+
+let Defs = [R11], isReMaterializable = 1, neverHasSideEffects = 1 in
+def LDAPF_u10 : _FU10<0b110110, (outs), (ins i32imm:$a), "ldap r11, $a", []>;
+
+let Defs = [R11], isReMaterializable = 1 in
+def LDAPF_lu10 : _FLU10<0b110110, (outs), (ins i32imm:$a), "ldap r11, $a",
+                        [(set R11, (pcrelwrapper tglobaladdr:$a))]>;
+
+let Defs = [R11], isReMaterializable = 1, isCodeGenOnly = 1 in
+def LDAPF_lu10_ba : _FLU10<0b110110, (outs), (ins i32imm:$a), "ldap r11, $a",
+                           [(set R11, (pcrelwrapper tblockaddress:$a))]>;
+
+let isCall=1,
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
+def BLACP_u10 : _FU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
+
+def BLACP_lu10 : _FLU10<0b111000, (outs), (ins i32imm:$a), "bla cp[$a]", []>;
+
+def BLRF_u10 : _FU10<0b110100, (outs), (ins calltarget:$a), "bl $a",
+                     [(XCoreBranchLink immU10:$a)]>;
+
+def BLRF_lu10 : _FLU10<0b110100, (outs), (ins calltarget:$a), "bl $a",
+                       [(XCoreBranchLink immU20:$a)]>;
+}
+
+let Defs = [R11], mayLoad = 1, isReMaterializable = 1,
+    neverHasSideEffects = 1 in {
+def LDWCP_u10 : _FU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]", []>;
+
+def LDWCP_lu10 : _FLU10<0b111001, (outs), (ins i32imm:$a), "ldw r11, cp[$a]",
+                        []>;
+}
+
+// Two operand short
+def NOT : _F2R<0b100010, (outs GRRegs:$dst), (ins GRRegs:$b),
+                "not $dst, $b", [(set GRRegs:$dst, (not GRRegs:$b))]>;
+
+def NEG : _F2R<0b100100, (outs GRRegs:$dst), (ins GRRegs:$b),
+                "neg $dst, $b", [(set GRRegs:$dst, (ineg GRRegs:$b))]>;
+
+let Constraints = "$src1 = $dst" in {
+def SEXT_rus :
+  _FRUSSrcDstBitp<0b001101, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+                  "sext $dst, $src2",
+                  [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1,
+                                                     immBitp:$src2))]>;
+
+def SEXT_2r :
+  _F2RSrcDst<0b001100, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+             "sext $dst, $src2",
+             [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1, GRRegs:$src2))]>;
+
+def ZEXT_rus :
+  _FRUSSrcDstBitp<0b010001, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+                  "zext $dst, $src2",
+                  [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1,
+                                                     immBitp:$src2))]>;
+
+def ZEXT_2r :
+  _F2RSrcDst<0b010000, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+             "zext $dst, $src2",
+             [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1, GRRegs:$src2))]>;
+
+def ANDNOT_2r :
+  _F2RSrcDst<0b001010, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+             "andnot $dst, $src2",
+             [(set GRRegs:$dst, (and GRRegs:$src1, (not GRRegs:$src2)))]>;
+}
+
+let isReMaterializable = 1, neverHasSideEffects = 1 in
+def MKMSK_rus : _FRUSBitp<0b101001, (outs GRRegs:$dst), (ins i32imm:$size),
+                          "mkmsk $dst, $size", []>;
+
+def MKMSK_2r : _F2R<0b101000, (outs GRRegs:$dst), (ins GRRegs:$size),
+                    "mkmsk $dst, $size",
+                    [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), -1))]>;
+
+def GETR_rus : _FRUS<0b100000, (outs GRRegs:$dst), (ins i32imm:$type),
+                     "getr $dst, $type",
+                     [(set GRRegs:$dst, (int_xcore_getr immUs:$type))]>;
+
+def GETTS_2r : _F2R<0b001110, (outs GRRegs:$dst), (ins GRRegs:$r),
+                    "getts $dst, res[$r]",
+                    [(set GRRegs:$dst, (int_xcore_getts GRRegs:$r))]>;
+
+def SETPT_2r : _FR2R<0b001111, (outs), (ins GRRegs:$r, GRRegs:$val),
+                     "setpt res[$r], $val",
+                     [(int_xcore_setpt GRRegs:$r, GRRegs:$val)]>;
+
+def OUTCT_2r : _F2R<0b010010, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "outct res[$r], $val",
+                    [(int_xcore_outct GRRegs:$r, GRRegs:$val)]>;
+
+def OUTCT_rus : _FRUS<0b010011, (outs), (ins GRRegs:$r, i32imm:$val),
+                       "outct res[$r], $val",
+                       [(int_xcore_outct GRRegs:$r, immUs:$val)]>;
+
+def OUTT_2r : _FR2R<0b000011, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "outt res[$r], $val",
+                    [(int_xcore_outt GRRegs:$r, GRRegs:$val)]>;
+
+def OUT_2r : _FR2R<0b101010, (outs), (ins GRRegs:$r, GRRegs:$val),
+                   "out res[$r], $val",
+                   [(int_xcore_out GRRegs:$r, GRRegs:$val)]>;
+
+let Constraints = "$src = $dst" in
+def OUTSHR_2r :
+  _F2RSrcDst<0b101011, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r),
+             "outshr res[$r], $src",
+             [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r, GRRegs:$src))]>;
+
+def INCT_2r : _F2R<0b100001, (outs GRRegs:$dst), (ins GRRegs:$r),
+                   "inct $dst, res[$r]",
+                   [(set GRRegs:$dst, (int_xcore_inct GRRegs:$r))]>;
+
+def INT_2r : _F2R<0b100011, (outs GRRegs:$dst), (ins GRRegs:$r),
+                  "int $dst, res[$r]",
+                  [(set GRRegs:$dst, (int_xcore_int GRRegs:$r))]>;
+
+def IN_2r : _F2R<0b101100, (outs GRRegs:$dst), (ins GRRegs:$r),
+                 "in $dst, res[$r]",
+                 [(set GRRegs:$dst, (int_xcore_in GRRegs:$r))]>;
+
+let Constraints = "$src = $dst" in
+def INSHR_2r :
+  _F2RSrcDst<0b101101, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r),
+             "inshr $dst, res[$r]",
+             [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r, GRRegs:$src))]>;
+
+def CHKCT_2r : _F2R<0b110010, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "chkct res[$r], $val",
+                    [(int_xcore_chkct GRRegs:$r, GRRegs:$val)]>;
+
+def CHKCT_rus : _FRUSBitp<0b110011, (outs), (ins GRRegs:$r, i32imm:$val),
+                          "chkct res[$r], $val",
+                          [(int_xcore_chkct GRRegs:$r, immUs:$val)]>;
+
+def TESTCT_2r : _F2R<0b101111, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "testct $dst, res[$src]",
+                     [(set GRRegs:$dst, (int_xcore_testct GRRegs:$src))]>;
+
+def TESTWCT_2r : _F2R<0b110001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                      "testwct $dst, res[$src]",
+                      [(set GRRegs:$dst, (int_xcore_testwct GRRegs:$src))]>;
+
+def SETD_2r : _FR2R<0b000101, (outs), (ins GRRegs:$r, GRRegs:$val),
+                    "setd res[$r], $val",
+                    [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>;
+
+def SETPSC_2r : _FR2R<0b110000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                      "setpsc res[$src1], $src2",
+                      [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>;
+
+def GETST_2r : _F2R<0b000001, (outs GRRegs:$dst), (ins GRRegs:$r),
+                    "getst $dst, res[$r]",
+                    [(set GRRegs:$dst, (int_xcore_getst GRRegs:$r))]>;
+
+def INITSP_2r : _F2R<0b000100, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:sp, $src",
+                     [(int_xcore_initsp GRRegs:$t, GRRegs:$src)]>;
+
+def INITPC_2r : _F2R<0b000000, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:pc, $src",
+                     [(int_xcore_initpc GRRegs:$t, GRRegs:$src)]>;
+
+def INITCP_2r : _F2R<0b000110, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:cp, $src",
+                     [(int_xcore_initcp GRRegs:$t, GRRegs:$src)]>;
+
+def INITDP_2r : _F2R<0b000010, (outs), (ins GRRegs:$src, GRRegs:$t),
+                     "init t[$t]:dp, $src",
+                     [(int_xcore_initdp GRRegs:$t, GRRegs:$src)]>;
+
+def PEEK_2r : _F2R<0b101110, (outs GRRegs:$dst), (ins GRRegs:$src),
+                    "peek $dst, res[$src]",
+                    [(set GRRegs:$dst, (int_xcore_peek GRRegs:$src))]>;
+
+def ENDIN_2r : _F2R<0b100101, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "endin $dst, res[$src]",
+                     [(set GRRegs:$dst, (int_xcore_endin GRRegs:$src))]>;
+
+def EEF_2r : _F2R<0b001011, (outs), (ins GRRegs:$a, GRRegs:$b),
+                  "eef $a, res[$b]", []>;
+
+def EET_2r : _F2R<0b001001, (outs), (ins GRRegs:$a, GRRegs:$b),
+                  "eet $a, res[$b]", []>;
+
+def TSETMR_2r : _F2RImm<0b000111, (outs), (ins i32imm:$a, GRRegs:$b),
+                        "tsetmr r$a, $b", []>;
+
+// Two operand long
+def BITREV_l2r : _FL2R<0b0000011000, (outs GRRegs:$dst), (ins GRRegs:$src),
+                       "bitrev $dst, $src",
+                       [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>;
+
+def BYTEREV_l2r : _FL2R<0b0000011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                        "byterev $dst, $src",
+                        [(set GRRegs:$dst, (bswap GRRegs:$src))]>;
+
+def CLZ_l2r : _FL2R<0b000111000, (outs GRRegs:$dst), (ins GRRegs:$src),
+                    "clz $dst, $src",
+                    [(set GRRegs:$dst, (ctlz GRRegs:$src))]>;
+
+def GETD_l2r : _FL2R<0b0001111001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "getd $dst, res[$src]", []>;
+
+def GETN_l2r : _FL2R<0b0011011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                     "getn $dst, res[$src]", []>;
+
+def SETC_l2r : _FL2R<0b0010111001, (outs), (ins GRRegs:$r, GRRegs:$val),
+                     "setc res[$r], $val",
+                     [(int_xcore_setc GRRegs:$r, GRRegs:$val)]>;
+
+def SETTW_l2r : _FLR2R<0b0010011001, (outs), (ins GRRegs:$r, GRRegs:$val),
+                       "settw res[$r], $val",
+                       [(int_xcore_settw GRRegs:$r, GRRegs:$val)]>;
+
+def GETPS_l2r : _FL2R<0b0001011001, (outs GRRegs:$dst), (ins GRRegs:$src),
+                      "get $dst, ps[$src]",
+                      [(set GRRegs:$dst, (int_xcore_getps GRRegs:$src))]>;
+
+def SETPS_l2r : _FLR2R<0b0001111000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                       "set ps[$src1], $src2",
+                       [(int_xcore_setps GRRegs:$src1, GRRegs:$src2)]>;
+
+def INITLR_l2r : _FL2R<0b0001011000, (outs), (ins GRRegs:$src, GRRegs:$t),
+                       "init t[$t]:lr, $src",
+                       [(int_xcore_initlr GRRegs:$t, GRRegs:$src)]>;
+
+def SETCLK_l2r : _FLR2R<0b0000111001, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                        "setclk res[$src1], $src2",
+                        [(int_xcore_setclk GRRegs:$src1, GRRegs:$src2)]>;
+
+def SETN_l2r : _FLR2R<0b0011011000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                      "setn res[$src1], $src2", []>;
+
+def SETRDY_l2r : _FLR2R<0b0010111000, (outs), (ins GRRegs:$src1, GRRegs:$src2),
+                        "setrdy res[$src1], $src2",
+                        [(int_xcore_setrdy GRRegs:$src1, GRRegs:$src2)]>;
+
+def TESTLCL_l2r : _FL2R<0b0010011000, (outs GRRegs:$dst), (ins GRRegs:$src),
+                        "testlcl $dst, res[$src]", []>;
+
+// One operand short
+def MSYNC_1r : _F1R<0b000111, (outs), (ins GRRegs:$a),
+                    "msync res[$a]",
+                    [(int_xcore_msync GRRegs:$a)]>;
+def MJOIN_1r : _F1R<0b000101, (outs), (ins GRRegs:$a),
+                    "mjoin res[$a]",
+                    [(int_xcore_mjoin GRRegs:$a)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BAU_1r : _F1R<0b001001, (outs), (ins GRRegs:$a),
+                 "bau $a",
+                 [(brind GRRegs:$a)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BR_JT : PseudoInstXCore<(outs), (ins InlineJT:$t, GRRegs:$i),
+                            "bru $i\n$t",
+                            [(XCoreBR_JT tjumptable:$t, GRRegs:$i)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BR_JT32 : PseudoInstXCore<(outs), (ins InlineJT32:$t, GRRegs:$i),
+                              "bru $i\n$t",
+                              [(XCoreBR_JT32 tjumptable:$t, GRRegs:$i)]>;
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in
+def BRU_1r : _F1R<0b001010, (outs), (ins GRRegs:$a), "bru $a", []>;
+
+let Defs=[SP], neverHasSideEffects=1 in
+def SETSP_1r : _F1R<0b001011, (outs), (ins GRRegs:$a), "set sp, $a", []>;
+
+let neverHasSideEffects=1 in
+def SETDP_1r : _F1R<0b001100, (outs), (ins GRRegs:$a), "set dp, $a", []>;
+
+let neverHasSideEffects=1 in
+def SETCP_1r : _F1R<0b001101, (outs), (ins GRRegs:$a), "set cp, $a", []>;
+
+let hasCtrlDep = 1 in 
+def ECALLT_1r : _F1R<0b010011, (outs), (ins GRRegs:$a),
+                 "ecallt $a",
+                 []>;
+
+let hasCtrlDep = 1 in 
+def ECALLF_1r : _F1R<0b010010, (outs), (ins GRRegs:$a),
+                 "ecallf $a",
+                 []>;
+
+let isCall=1, 
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
+def BLA_1r : _F1R<0b001000, (outs), (ins GRRegs:$a),
+                 "bla $a",
+                 [(XCoreBranchLink GRRegs:$a)]>;
+}
+
+def SYNCR_1r : _F1R<0b100001, (outs), (ins GRRegs:$a),
+                 "syncr res[$a]",
+                 [(int_xcore_syncr GRRegs:$a)]>;
+
+def FREER_1r : _F1R<0b000100, (outs), (ins GRRegs:$a),
+               "freer res[$a]",
+               [(int_xcore_freer GRRegs:$a)]>;
+
+let Uses=[R11] in {
+def SETV_1r : _F1R<0b010001, (outs), (ins GRRegs:$a),
+                   "setv res[$a], r11",
+                   [(int_xcore_setv GRRegs:$a, R11)]>;
+
+def SETEV_1r : _F1R<0b001111, (outs), (ins GRRegs:$a),
+                    "setev res[$a], r11",
+                    [(int_xcore_setev GRRegs:$a, R11)]>;
+}
+
+def DGETREG_1r : _F1R<0b001110, (outs GRRegs:$a), (ins), "dgetreg $a", []>;
+
+def EDU_1r : _F1R<0b000000, (outs), (ins GRRegs:$a), "edu res[$a]", []>;
+
+def EEU_1r : _F1R<0b000001, (outs), (ins GRRegs:$a),
+               "eeu res[$a]",
+               [(int_xcore_eeu GRRegs:$a)]>;
+
+def KCALL_1r : _F1R<0b010000, (outs), (ins GRRegs:$a), "kcall $a", []>;
+
+def WAITEF_1R : _F1R<0b000011, (outs), (ins GRRegs:$a), "waitef $a", []>;
+
+def WAITET_1R : _F1R<0b000010, (outs), (ins GRRegs:$a), "waitet $a", []>;
+
+def TSTART_1R : _F1R<0b000110, (outs), (ins GRRegs:$a), "start t[$a]", []>;
+
+def CLRPT_1R : _F1R<0b100000, (outs), (ins GRRegs:$a), "clrpt res[$a]", []>;
+
+// Zero operand short
+
+def CLRE_0R : _F0R<0b0000001101, (outs), (ins), "clre", [(int_xcore_clre)]>;
+
+def DCALL_0R : _F0R<0b0000011100, (outs), (ins), "dcall", []>;
+
+let Defs = [SP], Uses = [SP] in
+def DENTSP_0R : _F0R<0b0001001100, (outs), (ins), "dentsp", []>;
+
+let Defs = [SP] in
+def DRESTSP_0R : _F0R<0b0001001101, (outs), (ins), "drestsp", []>;
+
+def DRET_0R : _F0R<0b0000011110, (outs), (ins), "dret", []>;
+
+def FREET_0R : _F0R<0b0000001111, (outs), (ins), "freet", []>;
+
+let Defs = [R11] in {
+def GETID_0R : _F0R<0b0001001110, (outs), (ins),
+                    "get r11, id",
+                    [(set R11, (int_xcore_getid))]>;
+
+def GETED_0R : _F0R<0b0000111110, (outs), (ins),
+                    "get r11, ed",
+                    [(set R11, (int_xcore_geted))]>;
+
+def GETET_0R : _F0R<0b0000111111, (outs), (ins),
+                    "get r11, et",
+                    [(set R11, (int_xcore_getet))]>;
+
+def GETKEP_0R : _F0R<0b0001001111, (outs), (ins),
+                     "get r11, kep", []>;
+
+def GETKSP_0R : _F0R<0b0001011100, (outs), (ins),
+                     "get r11, ksp", []>;
+}
+
+let Defs = [SP] in
+def KRET_0R : _F0R<0b0000011101, (outs), (ins), "kret", []>;
+
+let Uses = [SP], mayLoad = 1 in {
+def LDET_0R : _F0R<0b0001011110, (outs), (ins), "ldw et, sp[4]", []>;
+
+def LDSED_0R : _F0R<0b0001011101, (outs), (ins), "ldw sed, sp[3]", []>;
+
+def LDSPC_0R : _F0R<0b0000101100, (outs), (ins), "ldw spc, sp[1]", []>;
+
+def LDSSR_0R : _F0R<0b0000101110, (outs), (ins), "ldw ssr, sp[2]", []>;
+}
+
+let Uses=[R11] in
+def SETKEP_0R : _F0R<0b0000011111, (outs), (ins), "set kep, r11", []>;
+
+def SSYNC_0r : _F0R<0b0000001110, (outs), (ins),
+                    "ssync",
+                    [(int_xcore_ssync)]>;
+
+let Uses = [SP], mayStore = 1 in {
+def STET_0R : _F0R<0b0000111101, (outs), (ins), "stw et, sp[4]", []>;
+
+def STSED_0R : _F0R<0b0000111100, (outs), (ins), "stw sed, sp[3]", []>;
+
+def STSPC_0R : _F0R<0b0000101101, (outs), (ins), "stw spc, sp[1]", []>;
+
+def STSSR_0R : _F0R<0b0000101111, (outs), (ins), "stw ssr, sp[2]", []>;
+}
+
+let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1,
+    hasSideEffects = 1 in
+def WAITEU_0R : _F0R<0b0000001100, (outs), (ins),
+                     "waiteu",
+                     [(brind (int_xcore_waitevent))]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat<(XCoreBranchLink tglobaladdr:$addr), (BLRF_lu10 tglobaladdr:$addr)>;
+def : Pat<(XCoreBranchLink texternalsym:$addr), (BLRF_lu10 texternalsym:$addr)>;
+
+/// sext_inreg
+def : Pat<(sext_inreg GRRegs:$b, i1), (SEXT_rus GRRegs:$b, 1)>;
+def : Pat<(sext_inreg GRRegs:$b, i8), (SEXT_rus GRRegs:$b, 8)>;
+def : Pat<(sext_inreg GRRegs:$b, i16), (SEXT_rus GRRegs:$b, 16)>;
+
+/// loads
+def : Pat<(zextloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+          (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(zextloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(sextloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(sextloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(load (ldawf GRRegs:$addr, GRRegs:$offset)),
+          (LDW_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(load (add GRRegs:$addr, immUs4:$offset)),
+          (LDW_2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(load GRRegs:$addr), (LDW_2rus GRRegs:$addr, 0)>;
+
+/// anyext
+def : Pat<(extloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+          (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+def : Pat<(extloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+/// stores
+def : Pat<(truncstorei8 GRRegs:$val, (add GRRegs:$addr, GRRegs:$offset)),
+          (ST8_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei8 GRRegs:$val, GRRegs:$addr),
+          (ST8_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+          
+def : Pat<(truncstorei16 GRRegs:$val, (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (ST16_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei16 GRRegs:$val, GRRegs:$addr),
+          (ST16_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(store GRRegs:$val, (ldawf GRRegs:$addr, GRRegs:$offset)),
+          (STW_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(store GRRegs:$val, (add GRRegs:$addr, immUs4:$offset)),
+          (STW_2rus GRRegs:$val, GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(store GRRegs:$val, GRRegs:$addr),
+          (STW_2rus GRRegs:$val, GRRegs:$addr, 0)>;
+
+/// cttz
+def : Pat<(cttz GRRegs:$src), (CLZ_l2r (BITREV_l2r GRRegs:$src))>;
+
+/// trap
+def : Pat<(trap), (ECALLF_1r (LDC_ru6 0))>;
+
+///
+/// branch patterns
+///
+
+// unconditional branch
+def : Pat<(br bb:$addr), (BRFU_lu6 bb:$addr)>;
+
+// direct match equal/notequal zero brcond
+def : Pat<(brcond (setne GRRegs:$lhs, 0), bb:$dst),
+          (BRFT_lru6 GRRegs:$lhs, bb:$dst)>;
+def : Pat<(brcond (seteq GRRegs:$lhs, 0), bb:$dst),
+          (BRFF_lru6 GRRegs:$lhs, bb:$dst)>;
+
+def : Pat<(brcond (setle GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSS_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setule GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSU_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSS_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setuge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSU_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (EQ_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, immUs:$rhs), bb:$dst),
+          (BRFF_lru6 (EQ_2rus GRRegs:$lhs, immUs:$rhs), bb:$dst)>;
+
+// generic brcond pattern
+def : Pat<(brcond GRRegs:$cond, bb:$addr), (BRFT_lru6 GRRegs:$cond, bb:$addr)>;
+
+
+///
+/// Select patterns
+///
+
+// direct match equal/notequal zero select
+def : Pat<(select (setne GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+        (SELECT_CC GRRegs:$lhs, GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (seteq GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+        (SELECT_CC GRRegs:$lhs, GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(select (setle GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSS_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setule GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSU_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSS_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setuge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSU_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (EQ_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, immUs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (EQ_2rus GRRegs:$lhs, immUs:$rhs), GRRegs:$F, GRRegs:$T)>;
+
+///
+/// setcc patterns, only matched when none of the above brcond
+/// patterns match
+///
+
+// setcc 2 register operands
+def : Pat<(setle GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSS_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+def : Pat<(setule GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSU_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+
+def : Pat<(setgt GRRegs:$lhs, GRRegs:$rhs),
+          (LSS_3r GRRegs:$rhs, GRRegs:$lhs)>;
+def : Pat<(setugt GRRegs:$lhs, GRRegs:$rhs),
+          (LSU_3r GRRegs:$rhs, GRRegs:$lhs)>;
+
+def : Pat<(setge GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSS_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+def : Pat<(setuge GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSU_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(setlt GRRegs:$lhs, GRRegs:$rhs),
+          (LSS_3r GRRegs:$lhs, GRRegs:$rhs)>;
+def : Pat<(setult GRRegs:$lhs, GRRegs:$rhs),
+          (LSU_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+def : Pat<(setne GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (EQ_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(seteq GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+// setcc reg/imm operands
+def : Pat<(seteq GRRegs:$lhs, immUs:$rhs),
+          (EQ_2rus GRRegs:$lhs, immUs:$rhs)>;
+def : Pat<(setne GRRegs:$lhs, immUs:$rhs),
+          (EQ_2rus (EQ_2rus GRRegs:$lhs, immUs:$rhs), 0)>;
+
+// misc
+def : Pat<(add GRRegs:$addr, immUs4:$offset),
+          (LDAWF_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(sub GRRegs:$addr, immUs4:$offset),
+          (LDAWB_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(and GRRegs:$val, immMskBitp:$mask),
+          (ZEXT_rus GRRegs:$val, (msksize_xform immMskBitp:$mask))>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+def : Pat<(add GRRegs:$src1, immUsNeg:$src2),
+          (SUB_2rus GRRegs:$src1, (neg_xform immUsNeg:$src2))>;
+
+def : Pat<(add GRRegs:$src1, immUs4Neg:$src2),
+          (LDAWB_l2rus GRRegs:$src1, (div4neg_xform immUs4Neg:$src2))>;
+
+///
+/// Some peepholes
+///
+
+def : Pat<(mul GRRegs:$src, 3),
+          (LDA16F_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, 5),
+          (LDAWF_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, -3),
+          (LDAWB_l3r GRRegs:$src, GRRegs:$src)>;
+
+// ashr X, 32 is equivalent to ashr X, 31 on the XCore.
+def : Pat<(sra GRRegs:$src, 31),
+          (ASHR_l2rus GRRegs:$src, 32)>;
+
+def : Pat<(brcond (setlt GRRegs:$lhs, 0), bb:$dst),
+          (BRFT_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+// setge X, 0 is canonicalized to setgt X, -1
+def : Pat<(brcond (setgt GRRegs:$lhs, -1), bb:$dst),
+          (BRFF_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+def : Pat<(select (setlt GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (setgt GRRegs:$lhs, -1), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(setgt GRRegs:$lhs, -1),
+          (EQ_2rus (ASHR_l2rus GRRegs:$lhs, 32), 0)>;
+
+def : Pat<(sra (shl GRRegs:$src, immBpwSubBitp:$imm), immBpwSubBitp:$imm),
+          (SEXT_rus GRRegs:$src, (bpwsub_xform immBpwSubBitp:$imm))>;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
new file mode 100644
index 000000000000..f96eda9fcb9f
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -0,0 +1,117 @@
+//===-- XCoreMCInstLower.cpp - Convert XCore MachineInstr to MCInst -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains code to lower XCore MachineInstrs to their
+/// corresponding MCInst records.
+///
+//===----------------------------------------------------------------------===//
+#include "XCoreMCInstLower.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+
+XCoreMCInstLower::XCoreMCInstLower(class AsmPrinter &asmprinter)
+: Printer(asmprinter) {}
+
+void XCoreMCInstLower::Initialize(Mangler *M, MCContext *C) {
+  Mang = M;
+  Ctx = C;
+}
+
+MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                               MachineOperandType MOTy,
+                                               unsigned Offset) const {
+  MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;
+  const MCSymbol *Symbol;
+
+  switch (MOTy) {
+    case MachineOperand::MO_MachineBasicBlock:
+      Symbol = MO.getMBB()->getSymbol();
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      Symbol = Mang->getSymbol(MO.getGlobal());
+      Offset += MO.getOffset();
+      break;
+    case MachineOperand::MO_BlockAddress:
+      Symbol = Printer.GetBlockAddressSymbol(MO.getBlockAddress());
+      Offset += MO.getOffset();
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      Symbol = Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+      Offset += MO.getOffset();
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      Symbol = Printer.GetJTISymbol(MO.getIndex());
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      Symbol = Printer.GetCPISymbol(MO.getIndex());
+      Offset += MO.getOffset();
+      break;
+    default:
+      llvm_unreachable("<unknown operand type>");
+  }
+
+  const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, *Ctx);
+
+  if (!Offset)
+    return MCOperand::CreateExpr(MCSym);
+
+  // Assume offset is never negative.
+  assert(Offset > 0);
+
+  const MCConstantExpr *OffsetExpr =  MCConstantExpr::Create(Offset, *Ctx);
+  const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx);
+  return MCOperand::CreateExpr(Add);
+}
+
+MCOperand XCoreMCInstLower::LowerOperand(const MachineOperand &MO,
+                                         unsigned offset) const {
+  MachineOperandType MOTy = MO.getType();
+
+  switch (MOTy) {
+    default: llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) break;
+      return MCOperand::CreateReg(MO.getReg());
+    case MachineOperand::MO_Immediate:
+      return MCOperand::CreateImm(MO.getImm() + offset);
+    case MachineOperand::MO_MachineBasicBlock:
+    case MachineOperand::MO_GlobalAddress:
+    case MachineOperand::MO_ExternalSymbol:
+    case MachineOperand::MO_JumpTableIndex:
+    case MachineOperand::MO_ConstantPoolIndex:
+    case MachineOperand::MO_BlockAddress:
+      return LowerSymbolOperand(MO, MOTy, offset);
+    case MachineOperand::MO_RegisterMask:
+      break;
+  }
+
+  return MCOperand();
+}
+
+void XCoreMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    MCOperand MCOp = LowerOperand(MO);
+
+    if (MCOp.isValid())
+      OutMI.addOperand(MCOp);
+  }
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
new file mode 100644
index 000000000000..28e702bb9884
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMCInstLower.h
@@ -0,0 +1,42 @@
+//===-- XCoreMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREMCINSTLOWER_H
+#define XCOREMCINSTLOWER_H
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MachineInstr;
+  class MachineFunction;
+  class Mangler;
+  class AsmPrinter;
+
+/// \brief This class is used to lower an MachineInstr into an MCInst.
+class LLVM_LIBRARY_VISIBILITY XCoreMCInstLower {
+  typedef MachineOperand::MachineOperandType MachineOperandType;
+  MCContext *Ctx;
+  Mangler *Mang;
+  AsmPrinter &Printer;
+public:
+  XCoreMCInstLower(class AsmPrinter &asmprinter);
+  void Initialize(Mangler *mang, MCContext *C);
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+  MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
+
+private:
+  MCOperand LowerSymbolOperand(const MachineOperand &MO,
+                               MachineOperandType MOTy, unsigned Offset) const;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..7ca06729120e
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
@@ -0,0 +1,14 @@
+//===-- XCoreMachineFuctionInfo.cpp - XCore machine function info ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void XCoreFunctionInfo::anchor() { }
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
new file mode 100644
index 000000000000..69d5de3e03ad
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -0,0 +1,69 @@
+//===-- XCoreMachineFuctionInfo.h - XCore machine function info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares XCore-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREMACHINEFUNCTIONINFO_H
+#define XCOREMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include <vector>
+
+namespace llvm {
+
+// Forward declarations
+class Function;
+
+/// XCoreFunctionInfo - This class is derived from MachineFunction private
+/// XCore target-specific information for each MachineFunction.
+class XCoreFunctionInfo : public MachineFunctionInfo {
+  virtual void anchor();
+  bool UsesLR;
+  int LRSpillSlot;
+  int FPSpillSlot;
+  int VarArgsFrameIndex;
+  std::vector<std::pair<MCSymbol*, CalleeSavedInfo> > SpillLabels;
+
+public:
+  XCoreFunctionInfo() :
+    UsesLR(false),
+    LRSpillSlot(0),
+    FPSpillSlot(0),
+    VarArgsFrameIndex(0) {}
+  
+  explicit XCoreFunctionInfo(MachineFunction &MF) :
+    UsesLR(false),
+    LRSpillSlot(0),
+    FPSpillSlot(0),
+    VarArgsFrameIndex(0) {}
+  
+  ~XCoreFunctionInfo() {}
+  
+  void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; }
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  
+  void setUsesLR(bool val) { UsesLR = val; }
+  bool getUsesLR() const { return UsesLR; }
+  
+  void setLRSpillSlot(int off) { LRSpillSlot = off; }
+  int getLRSpillSlot() const { return LRSpillSlot; }
+  
+  void setFPSpillSlot(int off) { FPSpillSlot = off; }
+  int getFPSpillSlot() const { return FPSpillSlot; }
+  
+  std::vector<std::pair<MCSymbol*, CalleeSavedInfo> > &getSpillLabels() {
+    return SpillLabels;
+  }
+};
+} // End llvm namespace
+
+#endif // XCOREMACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
new file mode 100644
index 000000000000..49b563497c0b
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -0,0 +1,259 @@
+//===-- XCoreRegisterInfo.cpp - XCore Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreRegisterInfo.h"
+#include "XCore.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+
+#define GET_REGINFO_TARGET_DESC
+#include "XCoreGenRegisterInfo.inc"
+
+using namespace llvm;
+
+XCoreRegisterInfo::XCoreRegisterInfo(const TargetInstrInfo &tii)
+  : XCoreGenRegisterInfo(XCore::LR), TII(tii) {
+}
+
+// helper functions
+static inline bool isImmUs(unsigned val) {
+  return val <= 11;
+}
+
+static inline bool isImmU6(unsigned val) {
+  return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+  return val < (1 << 16);
+}
+
+bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
+  return MF.getMMI().hasDebugInfo() ||
+    MF.getFunction()->needsUnwindTableEntry();
+}
+
+const uint16_t* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const uint16_t CalleeSavedRegs[] = {
+    XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+    XCore::R8, XCore::R9, XCore::R10, XCore::LR,
+    0
+  };
+  return CalleeSavedRegs;
+}
+
+BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  Reserved.set(XCore::CP);
+  Reserved.set(XCore::DP);
+  Reserved.set(XCore::SP);
+  Reserved.set(XCore::LR);
+  if (TFI->hasFP(MF)) {
+    Reserved.set(XCore::R10);
+  }
+  return Reserved;
+}
+
+bool
+XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  // TODO can we estimate stack size?
+  return TFI->hasFP(MF);
+}
+
+bool
+XCoreRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  return requiresRegisterScavenging(MF);
+}
+
+bool
+XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  return false;
+}
+
+void
+XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                       int SPAdj, unsigned FIOperandNum,
+                                       RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+  MachineInstr &MI = *II;
+  DebugLoc dl = MI.getDebugLoc();
+  MachineOperand &FrameOp = MI.getOperand(FIOperandNum);
+  int FrameIndex = FrameOp.getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+  int StackSize = MF.getFrameInfo()->getStackSize();
+
+  #ifndef NDEBUG
+  DEBUG(errs() << "\nFunction         : " 
+        << MF.getName() << "\n");
+  DEBUG(errs() << "<--------->\n");
+  DEBUG(MI.print(errs()));
+  DEBUG(errs() << "FrameIndex         : " << FrameIndex << "\n");
+  DEBUG(errs() << "FrameOffset        : " << Offset << "\n");
+  DEBUG(errs() << "StackSize          : " << StackSize << "\n");
+  #endif
+
+  Offset += StackSize;
+
+  unsigned FrameReg = getFrameRegister(MF);
+
+  // Special handling of DBG_VALUE instructions.
+  if (MI.isDebugValue()) {
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // fold constant into offset.
+  Offset += MI.getOperand(FIOperandNum + 1).getImm();
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+  
+  assert(Offset%4 == 0 && "Misaligned stack offset");
+
+  DEBUG(errs() << "Offset             : " << Offset << "\n" << "<--------->\n");
+  
+  Offset/=4;
+  
+  bool FP = TFI->hasFP(MF);
+
+  unsigned Reg = MI.getOperand(0).getReg();
+  bool isKill = MI.getOpcode() == XCore::STWFI && MI.getOperand(0).isKill();
+
+  assert(XCore::GRRegsRegClass.contains(Reg) && "Unexpected register operand");
+  
+  MachineBasicBlock &MBB = *MI.getParent();
+  
+  if (FP) {
+    bool isUs = isImmUs(Offset);
+    
+    if (!isUs) {
+      if (!RS)
+        report_fatal_error("eliminateFrameIndex Frame size too big: " +
+                           Twine(Offset));
+      unsigned ScratchReg = RS->scavengeRegister(&XCore::GRRegsRegClass, II,
+                                                 SPAdj);
+      loadConstant(MBB, II, ScratchReg, Offset, dl);
+      switch (MI.getOpcode()) {
+      case XCore::LDWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+              .addReg(FrameReg)
+              .addReg(ScratchReg, RegState::Kill);
+        break;
+      case XCore::STWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::STW_l3r))
+              .addReg(Reg, getKillRegState(isKill))
+              .addReg(FrameReg)
+              .addReg(ScratchReg, RegState::Kill);
+        break;
+      case XCore::LDAWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+              .addReg(FrameReg)
+              .addReg(ScratchReg, RegState::Kill);
+        break;
+      default:
+        llvm_unreachable("Unexpected Opcode");
+      }
+    } else {
+      switch (MI.getOpcode()) {
+      case XCore::LDWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
+              .addReg(FrameReg)
+              .addImm(Offset);
+        break;
+      case XCore::STWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
+              .addReg(Reg, getKillRegState(isKill))
+              .addReg(FrameReg)
+              .addImm(Offset);
+        break;
+      case XCore::LDAWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
+              .addReg(FrameReg)
+              .addImm(Offset);
+        break;
+      default:
+        llvm_unreachable("Unexpected Opcode");
+      }
+    }
+  } else {
+    bool isU6 = isImmU6(Offset);
+    if (!isU6 && !isImmU16(Offset))
+      report_fatal_error("eliminateFrameIndex Frame size too big: " +
+                         Twine(Offset));
+
+    switch (MI.getOpcode()) {
+    int NewOpcode;
+    case XCore::LDWFI:
+      NewOpcode = (isU6) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+      BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+            .addImm(Offset);
+      break;
+    case XCore::STWFI:
+      NewOpcode = (isU6) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+      BuildMI(MBB, II, dl, TII.get(NewOpcode))
+            .addReg(Reg, getKillRegState(isKill))
+            .addImm(Offset);
+      break;
+    case XCore::LDAWFI:
+      NewOpcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+      BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+            .addImm(Offset);
+      break;
+    default:
+      llvm_unreachable("Unexpected Opcode");
+    }
+  }
+  // Erase old instruction.
+  MBB.erase(II);
+}
+
+void XCoreRegisterInfo::
+loadConstant(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+            unsigned DstReg, int64_t Value, DebugLoc dl) const {
+  // TODO use mkmsk if possible.
+  if (!isImmU16(Value)) {
+    // TODO use constant pool.
+    report_fatal_error("loadConstant value too big " + Twine(Value));
+  }
+  int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
+  BuildMI(MBB, I, dl, TII.get(Opcode), DstReg).addImm(Value);
+}
+
+unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
new file mode 100644
index 000000000000..1db32489cf8d
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.h
@@ -0,0 +1,70 @@
+//===-- XCoreRegisterInfo.h - XCore Register Information Impl ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREREGISTERINFO_H
+#define XCOREREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "XCoreGenRegisterInfo.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+
+struct XCoreRegisterInfo : public XCoreGenRegisterInfo {
+private:
+  const TargetInstrInfo &TII;
+
+  void loadConstant(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  unsigned DstReg, int64_t Value, DebugLoc dl) const;
+
+  void storeToStack(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  unsigned SrcReg, int Offset, DebugLoc dl) const;
+
+  void loadFromStack(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  unsigned DstReg, int Offset, DebugLoc dl) const;
+
+public:
+  XCoreRegisterInfo(const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+
+  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+  
+  bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+
+  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator II,
+                           int SPAdj, unsigned FIOperandNum,
+                           RegScavenger *RS = NULL) const;
+
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  //! Return whether to emit frame moves
+  static bool needsFrameMoves(const MachineFunction &MF);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td
new file mode 100644
index 000000000000..6694b2882aca
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreRegisterInfo.td
@@ -0,0 +1,59 @@
+//===-- XCoreRegisterInfo.td - XCore Register defs ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the XCore register file 
+//===----------------------------------------------------------------------===//
+
+class XCoreReg<string n> : Register<n> {
+  field bits<4> Num;
+  let Namespace = "XCore";
+}
+
+// Registers are identified with 4-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<4> num, string n> : XCoreReg<n> {
+  let Num = num;
+}
+
+// CPU registers
+def R0  : Ri< 0, "r0">, DwarfRegNum<[0]>;
+def R1  : Ri< 1, "r1">, DwarfRegNum<[1]>;
+def R2  : Ri< 2, "r2">, DwarfRegNum<[2]>; 
+def R3  : Ri< 3, "r3">, DwarfRegNum<[3]>;
+def R4  : Ri< 4, "r4">, DwarfRegNum<[4]>;
+def R5  : Ri< 5, "r5">, DwarfRegNum<[5]>; 
+def R6  : Ri< 6, "r6">, DwarfRegNum<[6]>;
+def R7  : Ri< 7, "r7">, DwarfRegNum<[7]>;
+def R8  : Ri< 8, "r8">, DwarfRegNum<[8]>;
+def R9  : Ri< 9, "r9">, DwarfRegNum<[9]>; 
+def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+def CP : Ri<12, "cp">, DwarfRegNum<[12]>; 
+def DP : Ri<13, "dp">, DwarfRegNum<[13]>;
+def SP : Ri<14, "sp">, DwarfRegNum<[14]>;
+def LR : Ri<15, "lr">, DwarfRegNum<[15]>;
+
+// Register classes.
+//
+def GRRegs : RegisterClass<"XCore", [i32], 32,
+  // Return values and arguments
+  (add R0, R1, R2, R3,
+  // Callee save
+  R4, R5, R6, R7, R8, R9, R10,
+  // Not preserved across procedure calls
+  R11)>;
+
+// Reserved
+def RRegs : RegisterClass<"XCore", [i32], 32,
+  (add R0, R1, R2, R3,
+   R4, R5, R6, R7, R8, R9, R10,
+   R11, CP, DP, SP, LR)> {
+  let isAllocatable = 0;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
new file mode 100644
index 000000000000..44aeb6057ccb
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -0,0 +1,23 @@
+//===-- XCoreSelectionDAGInfo.cpp - XCore SelectionDAG Info ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "xcore-selectiondag-info"
+#include "XCoreTargetMachine.h"
+using namespace llvm;
+
+XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM)
+  : TargetSelectionDAGInfo(TM) {
+}
+
+XCoreSelectionDAGInfo::~XCoreSelectionDAGInfo() {
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
new file mode 100644
index 000000000000..0386968638bd
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -0,0 +1,31 @@
+//===-- XCoreSelectionDAGInfo.h - XCore SelectionDAG Info -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the XCore subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORESELECTIONDAGINFO_H
+#define XCORESELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class XCoreTargetMachine;
+
+class XCoreSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit XCoreSelectionDAGInfo(const XCoreTargetMachine &TM);
+  ~XCoreSelectionDAGInfo();
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
new file mode 100644
index 000000000000..8cfb77089f31
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -0,0 +1,30 @@
+//===-- XCoreSubtarget.cpp - XCore Subtarget Information ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCore specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreSubtarget.h"
+#include "XCore.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "XCoreGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+void XCoreSubtarget::anchor() { }
+
+XCoreSubtarget::XCoreSubtarget(const std::string &TT,
+                               const std::string &CPU, const std::string &FS)
+  : XCoreGenSubtargetInfo(TT, CPU, FS)
+{
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
new file mode 100644
index 000000000000..5ac4dbc4bc07
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreSubtarget.h
@@ -0,0 +1,43 @@
+//===-- XCoreSubtarget.h - Define Subtarget for the XCore -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetSubtargetInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORESUBTARGET_H
+#define XCORESUBTARGET_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "XCoreGenSubtargetInfo.inc"
+
+namespace llvm {
+class StringRef;
+
+class XCoreSubtarget : public XCoreGenSubtargetInfo {
+  virtual void anchor();
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  XCoreSubtarget(const std::string &TT, const std::string &CPU,
+                 const std::string &FS);
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+};
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
new file mode 100644
index 000000000000..28c3d12c05fe
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -0,0 +1,65 @@
+//===-- XCoreTargetMachine.cpp - Define TargetMachine for XCore -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetMachine.h"
+#include "XCore.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
+/// XCoreTargetMachine ctor - Create an ILP32 architecture model
+///
+XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
+                                       StringRef CPU, StringRef FS,
+                                       const TargetOptions &Options,
+                                       Reloc::Model RM, CodeModel::Model CM,
+                                       CodeGenOpt::Level OL)
+  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+    Subtarget(TT, CPU, FS),
+    DL("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
+               "i16:16:32-i32:32:32-i64:32:32-n32"),
+    InstrInfo(),
+    FrameLowering(Subtarget),
+    TLInfo(*this),
+    TSInfo(*this) {
+}
+
+namespace {
+/// XCore Code Generator Pass Configuration Options.
+class XCorePassConfig : public TargetPassConfig {
+public:
+  XCorePassConfig(XCoreTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  XCoreTargetMachine &getXCoreTargetMachine() const {
+    return getTM<XCoreTargetMachine>();
+  }
+
+  virtual bool addInstSelector();
+};
+} // namespace
+
+TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new XCorePassConfig(this, PM);
+}
+
+bool XCorePassConfig::addInstSelector() {
+  addPass(createXCoreISelDag(getXCoreTargetMachine(), getOptLevel()));
+  return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTarget() {
+  RegisterTargetMachine<XCoreTargetMachine> X(TheXCoreTarget);
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
new file mode 100644
index 000000000000..eb9a1aa420eb
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
@@ -0,0 +1,64 @@
+//===-- XCoreTargetMachine.h - Define TargetMachine for XCore ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETMACHINE_H
+#define XCORETARGETMACHINE_H
+
+#include "XCoreFrameLowering.h"
+#include "XCoreISelLowering.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSelectionDAGInfo.h"
+#include "XCoreSubtarget.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class XCoreTargetMachine : public LLVMTargetMachine {
+  XCoreSubtarget Subtarget;
+  const DataLayout DL;       // Calculates type size & alignment
+  XCoreInstrInfo InstrInfo;
+  XCoreFrameLowering FrameLowering;
+  XCoreTargetLowering TLInfo;
+  XCoreSelectionDAGInfo TSInfo;
+public:
+  XCoreTargetMachine(const Target &T, StringRef TT,
+                     StringRef CPU, StringRef FS, const TargetOptions &Options,
+                     Reloc::Model RM, CodeModel::Model CM,
+                     CodeGenOpt::Level OL);
+
+  virtual const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  virtual const XCoreFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  virtual const XCoreSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  virtual const XCoreTargetLowering *getTargetLowering() const {
+    return &TLInfo;
+  }
+
+  virtual const XCoreSelectionDAGInfo* getSelectionDAGInfo() const {
+    return &TSInfo;
+  }
+
+  virtual const TargetRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual const DataLayout       *getDataLayout() const { return &DL; }
+
+  // Pass Pipeline Configuration
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
new file mode 100644
index 000000000000..820389935b38
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -0,0 +1,65 @@
+//===-- XCoreTargetObjectFile.cpp - XCore object files --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetObjectFile.h"
+#include "XCoreSubtarget.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+
+void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+  DataSection =
+    Ctx.getELFSection(".dp.data", ELF::SHT_PROGBITS, 
+                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                      ELF::XCORE_SHF_DP_SECTION,
+                      SectionKind::getDataRel());
+  BSSSection =
+    Ctx.getELFSection(".dp.bss", ELF::SHT_NOBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_WRITE |
+                      ELF::XCORE_SHF_DP_SECTION,
+                      SectionKind::getBSS());
+  
+  MergeableConst4Section = 
+    Ctx.getELFSection(".cp.rodata.cst4", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_MERGE |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getMergeableConst4());
+  MergeableConst8Section = 
+    Ctx.getELFSection(".cp.rodata.cst8", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_MERGE |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getMergeableConst8());
+  MergeableConst16Section = 
+    Ctx.getELFSection(".cp.rodata.cst16", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC | ELF::SHF_MERGE |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getMergeableConst16());
+  
+  // TLS globals are lowered in the backend to arrays indexed by the current
+  // thread id. After lowering they require no special handling by the linker
+  // and can be placed in the standard data / bss sections.
+  TLSDataSection = DataSection;
+  TLSBSSSection = BSSSection;
+
+  ReadOnlySection = 
+    Ctx.getELFSection(".cp.rodata", ELF::SHT_PROGBITS,
+                      ELF::SHF_ALLOC |
+                      ELF::XCORE_SHF_CP_SECTION,
+                      SectionKind::getReadOnlyWithRel());
+
+  // Dynamic linking is not supported. Data with relocations is placed in the
+  // same section as data without relocations.
+  DataRelSection = DataRelLocalSection = DataSection;
+  DataRelROSection = DataRelROLocalSection = ReadOnlySection;
+}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
new file mode 100644
index 000000000000..27875e783b33
--- /dev/null
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -0,0 +1,25 @@
+//===-- XCoreTargetObjectFile.h - XCore Object Info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
+#define LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+
+namespace llvm {
+
+  class XCoreTargetObjectFile : public TargetLoweringObjectFileELF {
+  public:
+    void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+    // TODO: Classify globals as xcore wishes.
+  };
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
new file mode 100644
index 000000000000..e6fa4edf612e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -0,0 +1,897 @@
+//===-- ArgumentPromotion.cpp - Promote by-reference arguments ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass promotes "by reference" arguments to be "by value" arguments.  In
+// practice, this means looking for internal functions that have pointer
+// arguments.  If it can prove, through the use of alias analysis, that an
+// argument is *only* loaded, then it can pass the value into the function
+// instead of the address of the value.  This can cause recursive simplification
+// of code and lead to the elimination of allocas (especially in C++ template
+// code like the STL).
+//
+// This pass also handles aggregate arguments that are passed into a function,
+// scalarizing them if the elements of the aggregate are only loaded.  Note that
+// by default it refuses to scalarize aggregates which would require passing in
+// more than three operands to the function, because passing thousands of
+// operands for a large array or structure is unprofitable! This limit can be
+// configured or disabled, however.
+//
+// Note that this transformation could also be done for arguments that are only
+// stored to (returning the value instead), but does not currently.  This case
+// would be best handled when and if LLVM begins supporting multiple return
+// values from functions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "argpromotion"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted");
+STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
+STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted");
+STATISTIC(NumArgumentsDead     , "Number of dead pointer args eliminated");
+
+namespace {
+  /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
+  ///
+  struct ArgPromotion : public CallGraphSCCPass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<AliasAnalysis>();
+      CallGraphSCCPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnSCC(CallGraphSCC &SCC);
+    static char ID; // Pass identification, replacement for typeid
+    explicit ArgPromotion(unsigned maxElements = 3)
+        : CallGraphSCCPass(ID), maxElements(maxElements) {
+      initializeArgPromotionPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// A vector used to hold the indices of a single GEP instruction
+    typedef std::vector<uint64_t> IndicesVector;
+
+  private:
+    CallGraphNode *PromoteArguments(CallGraphNode *CGN);
+    bool isSafeToPromoteArgument(Argument *Arg, bool isByVal) const;
+    CallGraphNode *DoPromotion(Function *F,
+                               SmallPtrSet<Argument*, 8> &ArgsToPromote,
+                               SmallPtrSet<Argument*, 8> &ByValArgsToTransform);
+    /// The maximum number of elements to expand, or 0 for unlimited.
+    unsigned maxElements;
+  };
+}
+
+char ArgPromotion::ID = 0;
+INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
+                "Promote 'by reference' arguments to scalars", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
+                "Promote 'by reference' arguments to scalars", false, false)
+
+Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
+  return new ArgPromotion(maxElements);
+}
+
+bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
+  bool Changed = false, LocalChange;
+
+  do {  // Iterate until we stop promoting from this SCC.
+    LocalChange = false;
+    // Attempt to promote arguments from all functions in this SCC.
+    for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+      if (CallGraphNode *CGN = PromoteArguments(*I)) {
+        LocalChange = true;
+        SCC.ReplaceNode(*I, CGN);
+      }
+    }
+    Changed |= LocalChange;               // Remember that we changed something.
+  } while (LocalChange);
+  
+  return Changed;
+}
+
+/// PromoteArguments - This method checks the specified function to see if there
+/// are any promotable arguments and if it is safe to promote the function (for
+/// example, all callers are direct).  If safe to promote some arguments, it
+/// calls the DoPromotion method.
+///
+CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
+  Function *F = CGN->getFunction();
+
+  // Make sure that it is local to this module.
+  if (!F || !F->hasLocalLinkage()) return 0;
+
+  // First check: see if there are any pointer arguments!  If not, quick exit.
+  SmallVector<std::pair<Argument*, unsigned>, 16> PointerArgs;
+  unsigned ArgNo = 0;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++ArgNo)
+    if (I->getType()->isPointerTy())
+      PointerArgs.push_back(std::pair<Argument*, unsigned>(I, ArgNo));
+  if (PointerArgs.empty()) return 0;
+
+  // Second check: make sure that all callers are direct callers.  We can't
+  // transform functions that have indirect callers.  Also see if the function
+  // is self-recursive.
+  bool isSelfRecursive = false;
+  for (Value::use_iterator UI = F->use_begin(), E = F->use_end();
+       UI != E; ++UI) {
+    CallSite CS(*UI);
+    // Must be a direct call.
+    if (CS.getInstruction() == 0 || !CS.isCallee(UI)) return 0;
+    
+    if (CS.getInstruction()->getParent()->getParent() == F)
+      isSelfRecursive = true;
+  }
+  
+  // Check to see which arguments are promotable.  If an argument is promotable,
+  // add it to ArgsToPromote.
+  SmallPtrSet<Argument*, 8> ArgsToPromote;
+  SmallPtrSet<Argument*, 8> ByValArgsToTransform;
+  for (unsigned i = 0; i != PointerArgs.size(); ++i) {
+    bool isByVal=F->getAttributes().
+      hasAttribute(PointerArgs[i].second+1, Attribute::ByVal);
+    Argument *PtrArg = PointerArgs[i].first;
+    Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+
+    // If this is a byval argument, and if the aggregate type is small, just
+    // pass the elements, which is always safe.
+    if (isByVal) {
+      if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+        if (maxElements > 0 && STy->getNumElements() > maxElements) {
+          DEBUG(dbgs() << "argpromotion disable promoting argument '"
+                << PtrArg->getName() << "' because it would require adding more"
+                << " than " << maxElements << " arguments to the function.\n");
+          continue;
+        }
+        
+        // If all the elements are single-value types, we can promote it.
+        bool AllSimple = true;
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          if (!STy->getElementType(i)->isSingleValueType()) {
+            AllSimple = false;
+            break;
+          }
+        }
+
+        // Safe to transform, don't even bother trying to "promote" it.
+        // Passing the elements as a scalar will allow scalarrepl to hack on
+        // the new alloca we introduce.
+        if (AllSimple) {
+          ByValArgsToTransform.insert(PtrArg);
+          continue;
+        }
+      }
+    }
+
+    // If the argument is a recursive type and we're in a recursive
+    // function, we could end up infinitely peeling the function argument.
+    if (isSelfRecursive) {
+      if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+        bool RecursiveType = false;
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          if (STy->getElementType(i) == PtrArg->getType()) {
+            RecursiveType = true;
+            break;
+          }
+        }
+        if (RecursiveType)
+          continue;
+      }
+    }
+    
+    // Otherwise, see if we can promote the pointer to its value.
+    if (isSafeToPromoteArgument(PtrArg, isByVal))
+      ArgsToPromote.insert(PtrArg);
+  }
+
+  // No promotable pointer arguments.
+  if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) 
+    return 0;
+
+  return DoPromotion(F, ArgsToPromote, ByValArgsToTransform);
+}
+
+/// AllCallersPassInValidPointerForArgument - Return true if we can prove that
+/// all callees pass in a valid pointer for the specified function argument.
+static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
+  Function *Callee = Arg->getParent();
+
+  unsigned ArgNo = std::distance(Callee->arg_begin(),
+                                 Function::arg_iterator(Arg));
+
+  // Look at all call sites of the function.  At this pointer we know we only
+  // have direct callees.
+  for (Value::use_iterator UI = Callee->use_begin(), E = Callee->use_end();
+       UI != E; ++UI) {
+    CallSite CS(*UI);
+    assert(CS && "Should only have direct calls!");
+
+    if (!CS.getArgument(ArgNo)->isDereferenceablePointer())
+      return false;
+  }
+  return true;
+}
+
+/// Returns true if Prefix is a prefix of longer. That means, Longer has a size
+/// that is greater than or equal to the size of prefix, and each of the
+/// elements in Prefix is the same as the corresponding elements in Longer.
+///
+/// This means it also returns true when Prefix and Longer are equal!
+static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix,
+                     const ArgPromotion::IndicesVector &Longer) {
+  if (Prefix.size() > Longer.size())
+    return false;
+  return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
+}
+
+
+/// Checks if Indices, or a prefix of Indices, is in Set.
+static bool PrefixIn(const ArgPromotion::IndicesVector &Indices,
+                     std::set<ArgPromotion::IndicesVector> &Set) {
+    std::set<ArgPromotion::IndicesVector>::iterator Low;
+    Low = Set.upper_bound(Indices);
+    if (Low != Set.begin())
+      Low--;
+    // Low is now the last element smaller than or equal to Indices. This means
+    // it points to a prefix of Indices (possibly Indices itself), if such
+    // prefix exists.
+    //
+    // This load is safe if any prefix of its operands is safe to load.
+    return Low != Set.end() && IsPrefix(*Low, Indices);
+}
+
+/// Mark the given indices (ToMark) as safe in the given set of indices
+/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
+/// is already a prefix of Indices in Safe, Indices are implicitely marked safe
+/// already. Furthermore, any indices that Indices is itself a prefix of, are
+/// removed from Safe (since they are implicitely safe because of Indices now).
+static void MarkIndicesSafe(const ArgPromotion::IndicesVector &ToMark,
+                            std::set<ArgPromotion::IndicesVector> &Safe) {
+  std::set<ArgPromotion::IndicesVector>::iterator Low;
+  Low = Safe.upper_bound(ToMark);
+  // Guard against the case where Safe is empty
+  if (Low != Safe.begin())
+    Low--;
+  // Low is now the last element smaller than or equal to Indices. This
+  // means it points to a prefix of Indices (possibly Indices itself), if
+  // such prefix exists.
+  if (Low != Safe.end()) {
+    if (IsPrefix(*Low, ToMark))
+      // If there is already a prefix of these indices (or exactly these
+      // indices) marked a safe, don't bother adding these indices
+      return;
+
+    // Increment Low, so we can use it as a "insert before" hint
+    ++Low;
+  }
+  // Insert
+  Low = Safe.insert(Low, ToMark);
+  ++Low;
+  // If there we're a prefix of longer index list(s), remove those
+  std::set<ArgPromotion::IndicesVector>::iterator End = Safe.end();
+  while (Low != End && IsPrefix(ToMark, *Low)) {
+    std::set<ArgPromotion::IndicesVector>::iterator Remove = Low;
+    ++Low;
+    Safe.erase(Remove);
+  }
+}
+
+/// isSafeToPromoteArgument - As you might guess from the name of this method,
+/// it checks to see if it is both safe and useful to promote the argument.
+/// This method limits promotion of aggregates to only promote up to three
+/// elements of the aggregate in order to avoid exploding the number of
+/// arguments passed in.
+bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
+  typedef std::set<IndicesVector> GEPIndicesSet;
+
+  // Quick exit for unused arguments
+  if (Arg->use_empty())
+    return true;
+
+  // We can only promote this argument if all of the uses are loads, or are GEP
+  // instructions (with constant indices) that are subsequently loaded.
+  //
+  // Promoting the argument causes it to be loaded in the caller
+  // unconditionally. This is only safe if we can prove that either the load
+  // would have happened in the callee anyway (ie, there is a load in the entry
+  // block) or the pointer passed in at every call site is guaranteed to be
+  // valid.
+  // In the former case, invalid loads can happen, but would have happened
+  // anyway, in the latter case, invalid loads won't happen. This prevents us
+  // from introducing an invalid load that wouldn't have happened in the
+  // original code.
+  //
+  // This set will contain all sets of indices that are loaded in the entry
+  // block, and thus are safe to unconditionally load in the caller.
+  GEPIndicesSet SafeToUnconditionallyLoad;
+
+  // This set contains all the sets of indices that we are planning to promote.
+  // This makes it possible to limit the number of arguments added.
+  GEPIndicesSet ToPromote;
+
+  // If the pointer is always valid, any load with first index 0 is valid.
+  if (isByVal || AllCallersPassInValidPointerForArgument(Arg))
+    SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
+
+  // First, iterate the entry block and mark loads of (geps of) arguments as
+  // safe.
+  BasicBlock *EntryBlock = Arg->getParent()->begin();
+  // Declare this here so we can reuse it
+  IndicesVector Indices;
+  for (BasicBlock::iterator I = EntryBlock->begin(), E = EntryBlock->end();
+       I != E; ++I)
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      Value *V = LI->getPointerOperand();
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+        V = GEP->getPointerOperand();
+        if (V == Arg) {
+          // This load actually loads (part of) Arg? Check the indices then.
+          Indices.reserve(GEP->getNumIndices());
+          for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+               II != IE; ++II)
+            if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
+              Indices.push_back(CI->getSExtValue());
+            else
+              // We found a non-constant GEP index for this argument? Bail out
+              // right away, can't promote this argument at all.
+              return false;
+
+          // Indices checked out, mark them as safe
+          MarkIndicesSafe(Indices, SafeToUnconditionallyLoad);
+          Indices.clear();
+        }
+      } else if (V == Arg) {
+        // Direct loads are equivalent to a GEP with a single 0 index.
+        MarkIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
+      }
+    }
+
+  // Now, iterate all uses of the argument to see if there are any uses that are
+  // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
+  SmallVector<LoadInst*, 16> Loads;
+  IndicesVector Operands;
+  for (Value::use_iterator UI = Arg->use_begin(), E = Arg->use_end();
+       UI != E; ++UI) {
+    User *U = *UI;
+    Operands.clear();
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      // Don't hack volatile/atomic loads
+      if (!LI->isSimple()) return false;
+      Loads.push_back(LI);
+      // Direct loads are equivalent to a GEP with a zero index and then a load.
+      Operands.push_back(0);
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      if (GEP->use_empty()) {
+        // Dead GEP's cause trouble later.  Just remove them if we run into
+        // them.
+        getAnalysis<AliasAnalysis>().deleteValue(GEP);
+        GEP->eraseFromParent();
+        // TODO: This runs the above loop over and over again for dead GEPs
+        // Couldn't we just do increment the UI iterator earlier and erase the
+        // use?
+        return isSafeToPromoteArgument(Arg, isByVal);
+      }
+
+      // Ensure that all of the indices are constants.
+      for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end();
+        i != e; ++i)
+        if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
+          Operands.push_back(C->getSExtValue());
+        else
+          return false;  // Not a constant operand GEP!
+
+      // Ensure that the only users of the GEP are load instructions.
+      for (Value::use_iterator UI = GEP->use_begin(), E = GEP->use_end();
+           UI != E; ++UI)
+        if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+          // Don't hack volatile/atomic loads
+          if (!LI->isSimple()) return false;
+          Loads.push_back(LI);
+        } else {
+          // Other uses than load?
+          return false;
+        }
+    } else {
+      return false;  // Not a load or a GEP.
+    }
+
+    // Now, see if it is safe to promote this load / loads of this GEP. Loading
+    // is safe if Operands, or a prefix of Operands, is marked as safe.
+    if (!PrefixIn(Operands, SafeToUnconditionallyLoad))
+      return false;
+
+    // See if we are already promoting a load with these indices. If not, check
+    // to make sure that we aren't promoting too many elements.  If so, nothing
+    // to do.
+    if (ToPromote.find(Operands) == ToPromote.end()) {
+      if (maxElements > 0 && ToPromote.size() == maxElements) {
+        DEBUG(dbgs() << "argpromotion not promoting argument '"
+              << Arg->getName() << "' because it would require adding more "
+              << "than " << maxElements << " arguments to the function.\n");
+        // We limit aggregate promotion to only promoting up to a fixed number
+        // of elements of the aggregate.
+        return false;
+      }
+      ToPromote.insert(Operands);
+    }
+  }
+
+  if (Loads.empty()) return true;  // No users, this is a dead argument.
+
+  // Okay, now we know that the argument is only used by load instructions and
+  // it is safe to unconditionally perform all of them. Use alias analysis to
+  // check to see if the pointer is guaranteed to not be modified from entry of
+  // the function to each of the load instructions.
+
+  // Because there could be several/many load instructions, remember which
+  // blocks we know to be transparent to the load.
+  SmallPtrSet<BasicBlock*, 16> TranspBlocks;
+
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  for (unsigned i = 0, e = Loads.size(); i != e; ++i) {
+    // Check to see if the load is invalidated from the start of the block to
+    // the load itself.
+    LoadInst *Load = Loads[i];
+    BasicBlock *BB = Load->getParent();
+
+    AliasAnalysis::Location Loc = AA.getLocation(Load);
+    if (AA.canInstructionRangeModify(BB->front(), *Load, Loc))
+      return false;  // Pointer is invalidated!
+
+    // Now check every path from the entry block to the load for transparency.
+    // To do this, we perform a depth first search on the inverse CFG from the
+    // loading block.
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *P = *PI;
+      for (idf_ext_iterator<BasicBlock*, SmallPtrSet<BasicBlock*, 16> >
+             I = idf_ext_begin(P, TranspBlocks),
+             E = idf_ext_end(P, TranspBlocks); I != E; ++I)
+        if (AA.canBasicBlockModify(**I, Loc))
+          return false;
+    }
+  }
+
+  // If the path from the entry of the function to each load is free of
+  // instructions that potentially invalidate the load, we can make the
+  // transformation!
+  return true;
+}
+
+/// DoPromotion - This method actually performs the promotion of the specified
+/// arguments, and returns the new function.  At this point, we know that it's
+/// safe to do so.
+CallGraphNode *ArgPromotion::DoPromotion(Function *F,
+                               SmallPtrSet<Argument*, 8> &ArgsToPromote,
+                              SmallPtrSet<Argument*, 8> &ByValArgsToTransform) {
+
+  // Start by computing a new prototype for the function, which is the same as
+  // the old function, but has modified arguments.
+  FunctionType *FTy = F->getFunctionType();
+  std::vector<Type*> Params;
+
+  typedef std::set<IndicesVector> ScalarizeTable;
+
+  // ScalarizedElements - If we are promoting a pointer that has elements
+  // accessed out of it, keep track of which elements are accessed so that we
+  // can add one argument for each.
+  //
+  // Arguments that are directly loaded will have a zero element value here, to
+  // handle cases where there are both a direct load and GEP accesses.
+  //
+  std::map<Argument*, ScalarizeTable> ScalarizedElements;
+
+  // OriginalLoads - Keep track of a representative load instruction from the
+  // original function so that we can tell the alias analysis implementation
+  // what the new GEP/Load instructions we are inserting look like.
+  std::map<IndicesVector, LoadInst*> OriginalLoads;
+
+  // Attribute - Keep track of the parameter attributes for the arguments
+  // that we are *not* promoting. For the ones that we do promote, the parameter
+  // attributes are lost
+  SmallVector<AttributeSet, 8> AttributesVec;
+  const AttributeSet &PAL = F->getAttributes();
+
+  // Add any return attributes.
+  if (PAL.hasAttributes(AttributeSet::ReturnIndex))
+    AttributesVec.push_back(AttributeSet::get(F->getContext(),
+                                              PAL.getRetAttributes()));
+
+  // First, determine the new argument list
+  unsigned ArgIndex = 1;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+       ++I, ++ArgIndex) {
+    if (ByValArgsToTransform.count(I)) {
+      // Simple byval argument? Just add all the struct element types.
+      Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+      StructType *STy = cast<StructType>(AgTy);
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        Params.push_back(STy->getElementType(i));
+      ++NumByValArgsPromoted;
+    } else if (!ArgsToPromote.count(I)) {
+      // Unchanged argument
+      Params.push_back(I->getType());
+      AttributeSet attrs = PAL.getParamAttributes(ArgIndex);
+      if (attrs.hasAttributes(ArgIndex)) {
+        AttrBuilder B(attrs, ArgIndex);
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Params.size(), B));
+      }
+    } else if (I->use_empty()) {
+      // Dead argument (which are always marked as promotable)
+      ++NumArgumentsDead;
+    } else {
+      // Okay, this is being promoted. This means that the only uses are loads
+      // or GEPs which are only used by loads
+
+      // In this table, we will track which indices are loaded from the argument
+      // (where direct loads are tracked as no indices).
+      ScalarizeTable &ArgIndices = ScalarizedElements[I];
+      for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
+           ++UI) {
+        Instruction *User = cast<Instruction>(*UI);
+        assert(isa<LoadInst>(User) || isa<GetElementPtrInst>(User));
+        IndicesVector Indices;
+        Indices.reserve(User->getNumOperands() - 1);
+        // Since loads will only have a single operand, and GEPs only a single
+        // non-index operand, this will record direct loads without any indices,
+        // and gep+loads with the GEP indices.
+        for (User::op_iterator II = User->op_begin() + 1, IE = User->op_end();
+             II != IE; ++II)
+          Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
+        // GEPs with a single 0 index can be merged with direct loads
+        if (Indices.size() == 1 && Indices.front() == 0)
+          Indices.clear();
+        ArgIndices.insert(Indices);
+        LoadInst *OrigLoad;
+        if (LoadInst *L = dyn_cast<LoadInst>(User))
+          OrigLoad = L;
+        else
+          // Take any load, we will use it only to update Alias Analysis
+          OrigLoad = cast<LoadInst>(User->use_back());
+        OriginalLoads[Indices] = OrigLoad;
+      }
+
+      // Add a parameter to the function for each element passed in.
+      for (ScalarizeTable::iterator SI = ArgIndices.begin(),
+             E = ArgIndices.end(); SI != E; ++SI) {
+        // not allowed to dereference ->begin() if size() is 0
+        Params.push_back(GetElementPtrInst::getIndexedType(I->getType(), *SI));
+        assert(Params.back());
+      }
+
+      if (ArgIndices.size() == 1 && ArgIndices.begin()->empty())
+        ++NumArgumentsPromoted;
+      else
+        ++NumAggregatesPromoted;
+    }
+  }
+
+  // Add any function attributes.
+  if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+    AttributesVec.push_back(AttributeSet::get(FTy->getContext(),
+                                              PAL.getFnAttributes()));
+
+  Type *RetTy = FTy->getReturnType();
+
+  // Construct the new function type using the new arguments.
+  FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
+
+  // Create the new function body and insert it into the module.
+  Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
+  NF->copyAttributesFrom(F);
+
+  
+  DEBUG(dbgs() << "ARG PROMOTION:  Promoting to:" << *NF << "\n"
+        << "From: " << *F);
+  
+  // Recompute the parameter attributes list based on the new arguments for
+  // the function.
+  NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec));
+  AttributesVec.clear();
+
+  F->getParent()->getFunctionList().insert(F, NF);
+  NF->takeName(F);
+
+  // Get the alias analysis information that we need to update to reflect our
+  // changes.
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  // Get the callgraph information that we need to update to reflect our
+  // changes.
+  CallGraph &CG = getAnalysis<CallGraph>();
+  
+  // Get a new callgraph node for NF.
+  CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF);
+
+  // Loop over all of the callers of the function, transforming the call sites
+  // to pass in the loaded pointers.
+  //
+  SmallVector<Value*, 16> Args;
+  while (!F->use_empty()) {
+    CallSite CS(F->use_back());
+    assert(CS.getCalledFunction() == F);
+    Instruction *Call = CS.getInstruction();
+    const AttributeSet &CallPAL = CS.getAttributes();
+
+    // Add any return attributes.
+    if (CallPAL.hasAttributes(AttributeSet::ReturnIndex))
+      AttributesVec.push_back(AttributeSet::get(F->getContext(),
+                                                CallPAL.getRetAttributes()));
+
+    // Loop over the operands, inserting GEP and loads in the caller as
+    // appropriate.
+    CallSite::arg_iterator AI = CS.arg_begin();
+    ArgIndex = 1;
+    for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+         I != E; ++I, ++AI, ++ArgIndex)
+      if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
+        Args.push_back(*AI);          // Unmodified argument
+
+        if (CallPAL.hasAttributes(ArgIndex)) {
+          AttrBuilder B(CallPAL, ArgIndex);
+          AttributesVec.
+            push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+        }
+      } else if (ByValArgsToTransform.count(I)) {
+        // Emit a GEP and load for each element of the struct.
+        Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+        StructType *STy = cast<StructType>(AgTy);
+        Value *Idxs[2] = {
+              ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 };
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+          Value *Idx = GetElementPtrInst::Create(*AI, Idxs,
+                                                 (*AI)->getName()+"."+utostr(i),
+                                                 Call);
+          // TODO: Tell AA about the new values?
+          Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call));
+        }
+      } else if (!I->use_empty()) {
+        // Non-dead argument: insert GEPs and loads as appropriate.
+        ScalarizeTable &ArgIndices = ScalarizedElements[I];
+        // Store the Value* version of the indices in here, but declare it now
+        // for reuse.
+        std::vector<Value*> Ops;
+        for (ScalarizeTable::iterator SI = ArgIndices.begin(),
+               E = ArgIndices.end(); SI != E; ++SI) {
+          Value *V = *AI;
+          LoadInst *OrigLoad = OriginalLoads[*SI];
+          if (!SI->empty()) {
+            Ops.reserve(SI->size());
+            Type *ElTy = V->getType();
+            for (IndicesVector::const_iterator II = SI->begin(),
+                 IE = SI->end(); II != IE; ++II) {
+              // Use i32 to index structs, and i64 for others (pointers/arrays).
+              // This satisfies GEP constraints.
+              Type *IdxTy = (ElTy->isStructTy() ?
+                    Type::getInt32Ty(F->getContext()) : 
+                    Type::getInt64Ty(F->getContext()));
+              Ops.push_back(ConstantInt::get(IdxTy, *II));
+              // Keep track of the type we're currently indexing.
+              ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(*II);
+            }
+            // And create a GEP to extract those indices.
+            V = GetElementPtrInst::Create(V, Ops, V->getName()+".idx", Call);
+            Ops.clear();
+            AA.copyValue(OrigLoad->getOperand(0), V);
+          }
+          // Since we're replacing a load make sure we take the alignment
+          // of the previous load.
+          LoadInst *newLoad = new LoadInst(V, V->getName()+".val", Call);
+          newLoad->setAlignment(OrigLoad->getAlignment());
+          // Transfer the TBAA info too.
+          newLoad->setMetadata(LLVMContext::MD_tbaa,
+                               OrigLoad->getMetadata(LLVMContext::MD_tbaa));
+          Args.push_back(newLoad);
+          AA.copyValue(OrigLoad, Args.back());
+        }
+      }
+
+    // Push any varargs arguments on the list.
+    for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
+      Args.push_back(*AI);
+      if (CallPAL.hasAttributes(ArgIndex)) {
+        AttrBuilder B(CallPAL, ArgIndex);
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+      }
+    }
+
+    // Add any function attributes.
+    if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
+      AttributesVec.push_back(AttributeSet::get(Call->getContext(),
+                                                CallPAL.getFnAttributes()));
+
+    Instruction *New;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args, "", Call);
+      cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<InvokeInst>(New)->setAttributes(AttributeSet::get(II->getContext(),
+                                                            AttributesVec));
+    } else {
+      New = CallInst::Create(NF, Args, "", Call);
+      cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(),
+                                                          AttributesVec));
+      if (cast<CallInst>(Call)->isTailCall())
+        cast<CallInst>(New)->setTailCall();
+    }
+    Args.clear();
+    AttributesVec.clear();
+
+    // Update the alias analysis implementation to know that we are replacing
+    // the old call with a new one.
+    AA.replaceWithNewValue(Call, New);
+
+    // Update the callgraph to know that the callsite has been transformed.
+    CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()];
+    CalleeNode->replaceCallEdge(Call, New, NF_CGN);
+
+    if (!Call->use_empty()) {
+      Call->replaceAllUsesWith(New);
+      New->takeName(Call);
+    }
+
+    // Finally, remove the old call from the program, reducing the use-count of
+    // F.
+    Call->eraseFromParent();
+  }
+
+  // Since we have now created the new function, splice the body of the old
+  // function right into the new function, leaving the old rotting hulk of the
+  // function empty.
+  NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Loop over the argument list, transferring uses of the old arguments over to
+  // the new arguments, also transferring over the names as well.
+  //
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+       I2 = NF->arg_begin(); I != E; ++I) {
+    if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
+      // If this is an unmodified argument, move the name and users over to the
+      // new version.
+      I->replaceAllUsesWith(I2);
+      I2->takeName(I);
+      AA.replaceWithNewValue(I, I2);
+      ++I2;
+      continue;
+    }
+
+    if (ByValArgsToTransform.count(I)) {
+      // In the callee, we create an alloca, and store each of the new incoming
+      // arguments into the alloca.
+      Instruction *InsertPt = NF->begin()->begin();
+
+      // Just add all the struct element types.
+      Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+      Value *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt);
+      StructType *STy = cast<StructType>(AgTy);
+      Value *Idxs[2] = {
+            ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 };
+
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+        Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+        Value *Idx = 
+          GetElementPtrInst::Create(TheAlloca, Idxs,
+                                    TheAlloca->getName()+"."+Twine(i), 
+                                    InsertPt);
+        I2->setName(I->getName()+"."+Twine(i));
+        new StoreInst(I2++, Idx, InsertPt);
+      }
+
+      // Anything that used the arg should now use the alloca.
+      I->replaceAllUsesWith(TheAlloca);
+      TheAlloca->takeName(I);
+      AA.replaceWithNewValue(I, TheAlloca);
+      continue;
+    }
+
+    if (I->use_empty()) {
+      AA.deleteValue(I);
+      continue;
+    }
+
+    // Otherwise, if we promoted this argument, then all users are load
+    // instructions (or GEPs with only load users), and all loads should be
+    // using the new argument that we added.
+    ScalarizeTable &ArgIndices = ScalarizedElements[I];
+
+    while (!I->use_empty()) {
+      if (LoadInst *LI = dyn_cast<LoadInst>(I->use_back())) {
+        assert(ArgIndices.begin()->empty() &&
+               "Load element should sort to front!");
+        I2->setName(I->getName()+".val");
+        LI->replaceAllUsesWith(I2);
+        AA.replaceWithNewValue(LI, I2);
+        LI->eraseFromParent();
+        DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
+              << "' in function '" << F->getName() << "'\n");
+      } else {
+        GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->use_back());
+        IndicesVector Operands;
+        Operands.reserve(GEP->getNumIndices());
+        for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+             II != IE; ++II)
+          Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+
+        // GEPs with a single 0 index can be merged with direct loads
+        if (Operands.size() == 1 && Operands.front() == 0)
+          Operands.clear();
+
+        Function::arg_iterator TheArg = I2;
+        for (ScalarizeTable::iterator It = ArgIndices.begin();
+             *It != Operands; ++It, ++TheArg) {
+          assert(It != ArgIndices.end() && "GEP not handled??");
+        }
+
+        std::string NewName = I->getName();
+        for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+            NewName += "." + utostr(Operands[i]);
+        }
+        NewName += ".val";
+        TheArg->setName(NewName);
+
+        DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
+              << "' of function '" << NF->getName() << "'\n");
+
+        // All of the uses must be load instructions.  Replace them all with
+        // the argument specified by ArgNo.
+        while (!GEP->use_empty()) {
+          LoadInst *L = cast<LoadInst>(GEP->use_back());
+          L->replaceAllUsesWith(TheArg);
+          AA.replaceWithNewValue(L, TheArg);
+          L->eraseFromParent();
+        }
+        AA.deleteValue(GEP);
+        GEP->eraseFromParent();
+      }
+    }
+
+    // Increment I2 past all of the arguments added for this promoted pointer.
+    std::advance(I2, ArgIndices.size());
+  }
+
+  // Tell the alias analysis that the old function is about to disappear.
+  AA.replaceWithNewValue(F, NF);
+
+  
+  NF_CGN->stealCalledFunctionsFrom(CG[F]);
+  
+  // Now that the old function is dead, delete it.  If there is a dangling
+  // reference to the CallgraphNode, just leave the dead function around for
+  // someone else to nuke.
+  CallGraphNode *CGN = CG[F];
+  if (CGN->getNumReferences() == 0)
+    delete CG.removeFunctionFromModule(CGN);
+  else
+    F->setLinkage(Function::ExternalLinkage);
+  
+  return NF_CGN;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp b/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
new file mode 100644
index 000000000000..2e32240621f9
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp
@@ -0,0 +1,47 @@
+//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE: DO NOT USE THIS IF AVOIDABLE
+//
+// This pass is a nonce pass intended to allow manipulation of the implicitly
+// nesting pass manager. For example, it can be used to cause a CGSCC pass
+// manager to be closed prior to running a new collection of function passes.
+//
+// FIXME: This is a huge HACK. This should be removed when the pass manager's
+// nesting is made explicit instead of implicit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+namespace {
+/// \brief A nonce module pass used to place a barrier in a pass manager.
+///
+/// There is no mechanism for ending a CGSCC pass manager once one is started.
+/// This prevents extension points from having clear deterministic ordering
+/// when they are phrased as non-module passes.
+class BarrierNoop : public ModulePass {
+public:
+  static char ID; // Pass identification.
+
+  BarrierNoop() : ModulePass(ID) {
+    initializeBarrierNoopPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) { return false; }
+};
+}
+
+ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); }
+
+char BarrierNoop::ID = 0;
+INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass",
+                false, false)
diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
new file mode 100644
index 000000000000..8336d3ad3479
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -0,0 +1,229 @@
+//===- ConstantMerge.cpp - Merge duplicate global constants ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface to a pass that merges duplicate global
+// constants together into a single constant that is shared.  This is useful
+// because some passes (ie TraceValues) insert a lot of string constants into
+// the program, regardless of whether or not an existing string is available.
+//
+// Algorithm: ConstantMerge is designed to build up a map of available constants
+// and eliminate duplicates when it is initialized.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "constmerge"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+STATISTIC(NumMerged, "Number of global constants merged");
+
+namespace {
+  struct ConstantMerge : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    ConstantMerge() : ModulePass(ID) {
+      initializeConstantMergePass(*PassRegistry::getPassRegistry());
+    }
+
+    // For this pass, process all of the globals in the module, eliminating
+    // duplicate constants.
+    bool runOnModule(Module &M);
+
+    // Return true iff we can determine the alignment of this global variable.
+    bool hasKnownAlignment(GlobalVariable *GV) const;
+
+    // Return the alignment of the global, including converting the default
+    // alignment to a concrete value.
+    unsigned getAlignment(GlobalVariable *GV) const;
+
+    const DataLayout *TD;
+  };
+}
+
+char ConstantMerge::ID = 0;
+INITIALIZE_PASS(ConstantMerge, "constmerge",
+                "Merge Duplicate Global Constants", false, false)
+
+ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); }
+
+
+
+/// Find values that are marked as llvm.used.
+static void FindUsedValues(GlobalVariable *LLVMUsed,
+                           SmallPtrSet<const GlobalValue*, 8> &UsedValues) {
+  if (LLVMUsed == 0) return;
+  ConstantArray *Inits = dyn_cast<ConstantArray>(LLVMUsed->getInitializer());
+  if (Inits == 0) return;
+  
+  for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
+    if (GlobalValue *GV = 
+        dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
+      UsedValues.insert(GV);
+}
+
+// True if A is better than B.
+static bool IsBetterCannonical(const GlobalVariable &A,
+                               const GlobalVariable &B) {
+  if (!A.hasLocalLinkage() && B.hasLocalLinkage())
+    return true;
+
+  if (A.hasLocalLinkage() && !B.hasLocalLinkage())
+    return false;
+
+  return A.hasUnnamedAddr();
+}
+
+bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const {
+  return TD || GV->getAlignment() != 0;
+}
+
+unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
+  if (TD)
+    return TD->getPreferredAlignment(GV);
+  return GV->getAlignment();
+}
+
+bool ConstantMerge::runOnModule(Module &M) {
+  TD = getAnalysisIfAvailable<DataLayout>();
+
+  // Find all the globals that are marked "used".  These cannot be merged.
+  SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
+  FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
+  FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
+  
+  // Map unique <constants, has-unknown-alignment> pairs to globals.  We don't
+  // want to merge globals of unknown alignment with those of explicit
+  // alignment.  If we have DataLayout, we always know the alignment.
+  DenseMap<PointerIntPair<Constant*, 1, bool>, GlobalVariable*> CMap;
+
+  // Replacements - This vector contains a list of replacements to perform.
+  SmallVector<std::pair<GlobalVariable*, GlobalVariable*>, 32> Replacements;
+
+  bool MadeChange = false;
+
+  // Iterate constant merging while we are still making progress.  Merging two
+  // constants together may allow us to merge other constants together if the
+  // second level constants have initializers which point to the globals that
+  // were just merged.
+  while (1) {
+
+    // First: Find the canonical constants others will be merged with.
+    for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+         GVI != E; ) {
+      GlobalVariable *GV = GVI++;
+
+      // If this GV is dead, remove it.
+      GV->removeDeadConstantUsers();
+      if (GV->use_empty() && GV->hasLocalLinkage()) {
+        GV->eraseFromParent();
+        continue;
+      }
+
+      // Only process constants with initializers in the default address space.
+      if (!GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+          GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
+          // Don't touch values marked with attribute(used).
+          UsedGlobals.count(GV))
+        continue;
+
+      // This transformation is legal for weak ODR globals in the sense it
+      // doesn't change semantics, but we really don't want to perform it
+      // anyway; it's likely to pessimize code generation, and some tools
+      // (like the Darwin linker in cases involving CFString) don't expect it.
+      if (GV->isWeakForLinker())
+        continue;
+
+      Constant *Init = GV->getInitializer();
+
+      // Check to see if the initializer is already known.
+      PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV));
+      GlobalVariable *&Slot = CMap[Pair];
+
+      // If this is the first constant we find or if the old one is local,
+      // replace with the current one. If the current is externally visible
+      // it cannot be replace, but can be the canonical constant we merge with.
+      if (Slot == 0 || IsBetterCannonical(*GV, *Slot))
+        Slot = GV;
+    }
+
+    // Second: identify all globals that can be merged together, filling in
+    // the Replacements vector.  We cannot do the replacement in this pass
+    // because doing so may cause initializers of other globals to be rewritten,
+    // invalidating the Constant* pointers in CMap.
+    for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+         GVI != E; ) {
+      GlobalVariable *GV = GVI++;
+
+      // Only process constants with initializers in the default address space.
+      if (!GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+          GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
+          // Don't touch values marked with attribute(used).
+          UsedGlobals.count(GV))
+        continue;
+
+      // We can only replace constant with local linkage.
+      if (!GV->hasLocalLinkage())
+        continue;
+
+      Constant *Init = GV->getInitializer();
+
+      // Check to see if the initializer is already known.
+      PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV));
+      GlobalVariable *Slot = CMap[Pair];
+
+      if (!Slot || Slot == GV)
+        continue;
+
+      if (!Slot->hasUnnamedAddr() && !GV->hasUnnamedAddr())
+        continue;
+
+      if (!GV->hasUnnamedAddr())
+        Slot->setUnnamedAddr(false);
+
+      // Make all uses of the duplicate constant use the canonical version.
+      Replacements.push_back(std::make_pair(GV, Slot));
+    }
+
+    if (Replacements.empty())
+      return MadeChange;
+    CMap.clear();
+
+    // Now that we have figured out which replacements must be made, do them all
+    // now.  This avoid invalidating the pointers in CMap, which are unneeded
+    // now.
+    for (unsigned i = 0, e = Replacements.size(); i != e; ++i) {
+      // Bump the alignment if necessary.
+      if (Replacements[i].first->getAlignment() ||
+          Replacements[i].second->getAlignment()) {
+        Replacements[i].second->setAlignment(std::max(
+            Replacements[i].first->getAlignment(),
+            Replacements[i].second->getAlignment()));
+      }
+
+      // Eliminate any uses of the dead global.
+      Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
+
+      // Delete the global value from the module.
+      assert(Replacements[i].first->hasLocalLinkage() &&
+             "Refusing to delete an externally visible global variable.");
+      Replacements[i].first->eraseFromParent();
+    }
+
+    NumMerged += Replacements.size();
+    Replacements.clear();
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
new file mode 100644
index 000000000000..49ef1e75f1cd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -0,0 +1,1078 @@
+//===-- DeadArgumentElimination.cpp - Eliminate dead arguments ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass deletes dead arguments from internal functions.  Dead argument
+// elimination removes arguments which are directly dead, as well as arguments
+// only passed into function calls as dead arguments of other functions.  This
+// pass also deletes dead return values in a similar way.
+//
+// This pass is often useful as a cleanup pass to run after aggressive
+// interprocedural passes, which add possibly-dead arguments or return values.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "deadargelim"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
+STATISTIC(NumRetValsEliminated  , "Number of unused return values removed");
+STATISTIC(NumArgumentsReplacedWithUndef, 
+          "Number of unread args replaced with undef");
+namespace {
+  /// DAE - The dead argument elimination pass.
+  ///
+  class DAE : public ModulePass {
+  public:
+
+    /// Struct that represents (part of) either a return value or a function
+    /// argument.  Used so that arguments and return values can be used
+    /// interchangeably.
+    struct RetOrArg {
+      RetOrArg(const Function *F, unsigned Idx, bool IsArg) : F(F), Idx(Idx),
+               IsArg(IsArg) {}
+      const Function *F;
+      unsigned Idx;
+      bool IsArg;
+
+      /// Make RetOrArg comparable, so we can put it into a map.
+      bool operator<(const RetOrArg &O) const {
+        if (F != O.F)
+          return F < O.F;
+        else if (Idx != O.Idx)
+          return Idx < O.Idx;
+        else
+          return IsArg < O.IsArg;
+      }
+
+      /// Make RetOrArg comparable, so we can easily iterate the multimap.
+      bool operator==(const RetOrArg &O) const {
+        return F == O.F && Idx == O.Idx && IsArg == O.IsArg;
+      }
+
+      std::string getDescription() const {
+        return std::string((IsArg ? "Argument #" : "Return value #"))
+               + utostr(Idx) + " of function " + F->getName().str();
+      }
+    };
+
+    /// Liveness enum - During our initial pass over the program, we determine
+    /// that things are either alive or maybe alive. We don't mark anything
+    /// explicitly dead (even if we know they are), since anything not alive
+    /// with no registered uses (in Uses) will never be marked alive and will
+    /// thus become dead in the end.
+    enum Liveness { Live, MaybeLive };
+
+    /// Convenience wrapper
+    RetOrArg CreateRet(const Function *F, unsigned Idx) {
+      return RetOrArg(F, Idx, false);
+    }
+    /// Convenience wrapper
+    RetOrArg CreateArg(const Function *F, unsigned Idx) {
+      return RetOrArg(F, Idx, true);
+    }
+
+    typedef std::multimap<RetOrArg, RetOrArg> UseMap;
+    /// This maps a return value or argument to any MaybeLive return values or
+    /// arguments it uses. This allows the MaybeLive values to be marked live
+    /// when any of its users is marked live.
+    /// For example (indices are left out for clarity):
+    ///  - Uses[ret F] = ret G
+    ///    This means that F calls G, and F returns the value returned by G.
+    ///  - Uses[arg F] = ret G
+    ///    This means that some function calls G and passes its result as an
+    ///    argument to F.
+    ///  - Uses[ret F] = arg F
+    ///    This means that F returns one of its own arguments.
+    ///  - Uses[arg F] = arg G
+    ///    This means that G calls F and passes one of its own (G's) arguments
+    ///    directly to F.
+    UseMap Uses;
+
+    typedef std::set<RetOrArg> LiveSet;
+    typedef std::set<const Function*> LiveFuncSet;
+
+    /// This set contains all values that have been determined to be live.
+    LiveSet LiveValues;
+    /// This set contains all values that are cannot be changed in any way.
+    LiveFuncSet LiveFunctions;
+
+    typedef SmallVector<RetOrArg, 5> UseVector;
+
+    // Map each LLVM function to corresponding metadata with debug info. If
+    // the function is replaced with another one, we should patch the pointer
+    // to LLVM function in metadata.
+    // As the code generation for module is finished (and DIBuilder is
+    // finalized) we assume that subprogram descriptors won't be changed, and
+    // they are stored in map for short duration anyway.
+    typedef DenseMap<Function*, DISubprogram> FunctionDIMap;
+    FunctionDIMap FunctionDIs;
+
+  protected:
+    // DAH uses this to specify a different ID.
+    explicit DAE(char &ID) : ModulePass(ID) {}
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    DAE() : ModulePass(ID) {
+      initializeDAEPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnModule(Module &M);
+
+    virtual bool ShouldHackArguments() const { return false; }
+
+  private:
+    Liveness MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses);
+    Liveness SurveyUse(Value::const_use_iterator U, UseVector &MaybeLiveUses,
+                       unsigned RetValNum = 0);
+    Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses);
+
+    void CollectFunctionDIs(Module &M);
+    void SurveyFunction(const Function &F);
+    void MarkValue(const RetOrArg &RA, Liveness L,
+                   const UseVector &MaybeLiveUses);
+    void MarkLive(const RetOrArg &RA);
+    void MarkLive(const Function &F);
+    void PropagateLiveness(const RetOrArg &RA);
+    bool RemoveDeadStuffFromFunction(Function *F);
+    bool DeleteDeadVarargs(Function &Fn);
+    bool RemoveDeadArgumentsFromCallers(Function &Fn);
+  };
+}
+
+
+char DAE::ID = 0;
+INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false)
+
+namespace {
+  /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
+  /// deletes arguments to functions which are external.  This is only for use
+  /// by bugpoint.
+  struct DAH : public DAE {
+    static char ID;
+    DAH() : DAE(ID) {}
+
+    virtual bool ShouldHackArguments() const { return true; }
+  };
+}
+
+char DAH::ID = 0;
+INITIALIZE_PASS(DAH, "deadarghaX0r", 
+                "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)",
+                false, false)
+
+/// createDeadArgEliminationPass - This pass removes arguments from functions
+/// which are not used by the body of the function.
+///
+ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
+ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
+
+/// CollectFunctionDIs - Map each function in the module to its debug info
+/// descriptor.
+void DAE::CollectFunctionDIs(Module &M) {
+  FunctionDIs.clear();
+
+  for (Module::named_metadata_iterator I = M.named_metadata_begin(),
+       E = M.named_metadata_end(); I != E; ++I) {
+    NamedMDNode &NMD = *I;
+    for (unsigned MDIndex = 0, MDNum = NMD.getNumOperands();
+         MDIndex < MDNum; ++MDIndex) {
+      MDNode *Node = NMD.getOperand(MDIndex);
+      if (!DIDescriptor(Node).isCompileUnit())
+        continue;
+      DICompileUnit CU(Node);
+      const DIArray &SPs = CU.getSubprograms();
+      for (unsigned SPIndex = 0, SPNum = SPs.getNumElements();
+           SPIndex < SPNum; ++SPIndex) {
+        DISubprogram SP(SPs.getElement(SPIndex));
+        if (!SP.Verify())
+          continue;
+        if (Function *F = SP.getFunction())
+          FunctionDIs[F] = SP;
+      }
+    }
+  }
+}
+
+/// DeleteDeadVarargs - If this is an function that takes a ... list, and if
+/// llvm.vastart is never called, the varargs list is dead for the function.
+bool DAE::DeleteDeadVarargs(Function &Fn) {
+  assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!");
+  if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false;
+
+  // Ensure that the function is only directly called.
+  if (Fn.hasAddressTaken())
+    return false;
+
+  // Okay, we know we can transform this function if safe.  Scan its body
+  // looking for calls to llvm.vastart.
+  for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+        if (II->getIntrinsicID() == Intrinsic::vastart)
+          return false;
+      }
+    }
+  }
+
+  // If we get here, there are no calls to llvm.vastart in the function body,
+  // remove the "..." and adjust all the calls.
+
+  // Start by computing a new prototype for the function, which is the same as
+  // the old function, but doesn't have isVarArg set.
+  FunctionType *FTy = Fn.getFunctionType();
+
+  std::vector<Type*> Params(FTy->param_begin(), FTy->param_end());
+  FunctionType *NFTy = FunctionType::get(FTy->getReturnType(),
+                                                Params, false);
+  unsigned NumArgs = Params.size();
+
+  // Create the new function body and insert it into the module...
+  Function *NF = Function::Create(NFTy, Fn.getLinkage());
+  NF->copyAttributesFrom(&Fn);
+  Fn.getParent()->getFunctionList().insert(&Fn, NF);
+  NF->takeName(&Fn);
+
+  // Loop over all of the callers of the function, transforming the call sites
+  // to pass in a smaller number of arguments into the new function.
+  //
+  std::vector<Value*> Args;
+  while (!Fn.use_empty()) {
+    CallSite CS(Fn.use_back());
+    Instruction *Call = CS.getInstruction();
+
+    // Pass all the same arguments.
+    Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs);
+
+    // Drop any attributes that were on the vararg arguments.
+    AttributeSet PAL = CS.getAttributes();
+    if (!PAL.isEmpty() && PAL.getSlotIndex(PAL.getNumSlots() - 1) > NumArgs) {
+      SmallVector<AttributeSet, 8> AttributesVec;
+      for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i)
+        AttributesVec.push_back(PAL.getSlotAttributes(i));
+      if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+        AttributesVec.push_back(AttributeSet::get(Fn.getContext(),
+                                                  PAL.getFnAttributes()));
+      PAL = AttributeSet::get(Fn.getContext(), AttributesVec);
+    }
+
+    Instruction *New;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args, "", Call);
+      cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<InvokeInst>(New)->setAttributes(PAL);
+    } else {
+      New = CallInst::Create(NF, Args, "", Call);
+      cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<CallInst>(New)->setAttributes(PAL);
+      if (cast<CallInst>(Call)->isTailCall())
+        cast<CallInst>(New)->setTailCall();
+    }
+    New->setDebugLoc(Call->getDebugLoc());
+
+    Args.clear();
+
+    if (!Call->use_empty())
+      Call->replaceAllUsesWith(New);
+
+    New->takeName(Call);
+
+    // Finally, remove the old call from the program, reducing the use-count of
+    // F.
+    Call->eraseFromParent();
+  }
+
+  // Since we have now created the new function, splice the body of the old
+  // function right into the new function, leaving the old rotting hulk of the
+  // function empty.
+  NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList());
+
+  // Loop over the argument list, transferring uses of the old arguments over to
+  // the new arguments, also transferring over the names as well.  While we're at
+  // it, remove the dead arguments from the DeadArguments list.
+  //
+  for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
+       I2 = NF->arg_begin(); I != E; ++I, ++I2) {
+    // Move the name and users over to the new version.
+    I->replaceAllUsesWith(I2);
+    I2->takeName(I);
+  }
+
+  // Patch the pointer to LLVM function in debug info descriptor.
+  FunctionDIMap::iterator DI = FunctionDIs.find(&Fn);
+  if (DI != FunctionDIs.end())
+    DI->second.replaceFunction(NF);
+
+  // Finally, nuke the old function.
+  Fn.eraseFromParent();
+  return true;
+}
+
+/// RemoveDeadArgumentsFromCallers - Checks if the given function has any 
+/// arguments that are unused, and changes the caller parameters to be undefined
+/// instead.
+bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
+{
+  if (Fn.isDeclaration() || Fn.mayBeOverridden())
+    return false;
+
+  // Functions with local linkage should already have been handled.
+  if (Fn.hasLocalLinkage())
+    return false;
+
+  if (Fn.use_empty())
+    return false;
+
+  SmallVector<unsigned, 8> UnusedArgs;
+  for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(); 
+       I != E; ++I) {
+    Argument *Arg = I;
+
+    if (Arg->use_empty() && !Arg->hasByValAttr())
+      UnusedArgs.push_back(Arg->getArgNo());
+  }
+
+  if (UnusedArgs.empty())
+    return false;
+
+  bool Changed = false;
+
+  for (Function::use_iterator I = Fn.use_begin(), E = Fn.use_end(); 
+       I != E; ++I) {
+    CallSite CS(*I);
+    if (!CS || !CS.isCallee(I))
+      continue;
+
+    // Now go through all unused args and replace them with "undef".
+    for (unsigned I = 0, E = UnusedArgs.size(); I != E; ++I) {
+      unsigned ArgNo = UnusedArgs[I];
+
+      Value *Arg = CS.getArgument(ArgNo);
+      CS.setArgument(ArgNo, UndefValue::get(Arg->getType()));
+      ++NumArgumentsReplacedWithUndef;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+/// Convenience function that returns the number of return values. It returns 0
+/// for void functions and 1 for functions not returning a struct. It returns
+/// the number of struct elements for functions returning a struct.
+static unsigned NumRetVals(const Function *F) {
+  if (F->getReturnType()->isVoidTy())
+    return 0;
+  else if (StructType *STy = dyn_cast<StructType>(F->getReturnType()))
+    return STy->getNumElements();
+  else
+    return 1;
+}
+
+/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
+/// live, it adds Use to the MaybeLiveUses argument. Returns the determined
+/// liveness of Use.
+DAE::Liveness DAE::MarkIfNotLive(RetOrArg Use, UseVector &MaybeLiveUses) {
+  // We're live if our use or its Function is already marked as live.
+  if (LiveFunctions.count(Use.F) || LiveValues.count(Use))
+    return Live;
+
+  // We're maybe live otherwise, but remember that we must become live if
+  // Use becomes live.
+  MaybeLiveUses.push_back(Use);
+  return MaybeLive;
+}
+
+
+/// SurveyUse - This looks at a single use of an argument or return value
+/// and determines if it should be alive or not. Adds this use to MaybeLiveUses
+/// if it causes the used value to become MaybeLive.
+///
+/// RetValNum is the return value number to use when this use is used in a
+/// return instruction. This is used in the recursion, you should always leave
+/// it at 0.
+DAE::Liveness DAE::SurveyUse(Value::const_use_iterator U,
+                             UseVector &MaybeLiveUses, unsigned RetValNum) {
+    const User *V = *U;
+    if (const ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
+      // The value is returned from a function. It's only live when the
+      // function's return value is live. We use RetValNum here, for the case
+      // that U is really a use of an insertvalue instruction that uses the
+      // original Use.
+      RetOrArg Use = CreateRet(RI->getParent()->getParent(), RetValNum);
+      // We might be live, depending on the liveness of Use.
+      return MarkIfNotLive(Use, MaybeLiveUses);
+    }
+    if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
+      if (U.getOperandNo() != InsertValueInst::getAggregateOperandIndex()
+          && IV->hasIndices())
+        // The use we are examining is inserted into an aggregate. Our liveness
+        // depends on all uses of that aggregate, but if it is used as a return
+        // value, only index at which we were inserted counts.
+        RetValNum = *IV->idx_begin();
+
+      // Note that if we are used as the aggregate operand to the insertvalue,
+      // we don't change RetValNum, but do survey all our uses.
+
+      Liveness Result = MaybeLive;
+      for (Value::const_use_iterator I = IV->use_begin(),
+           E = V->use_end(); I != E; ++I) {
+        Result = SurveyUse(I, MaybeLiveUses, RetValNum);
+        if (Result == Live)
+          break;
+      }
+      return Result;
+    }
+
+    if (ImmutableCallSite CS = V) {
+      const Function *F = CS.getCalledFunction();
+      if (F) {
+        // Used in a direct call.
+
+        // Find the argument number. We know for sure that this use is an
+        // argument, since if it was the function argument this would be an
+        // indirect call and the we know can't be looking at a value of the
+        // label type (for the invoke instruction).
+        unsigned ArgNo = CS.getArgumentNo(U);
+
+        if (ArgNo >= F->getFunctionType()->getNumParams())
+          // The value is passed in through a vararg! Must be live.
+          return Live;
+
+        assert(CS.getArgument(ArgNo)
+               == CS->getOperand(U.getOperandNo())
+               && "Argument is not where we expected it");
+
+        // Value passed to a normal call. It's only live when the corresponding
+        // argument to the called function turns out live.
+        RetOrArg Use = CreateArg(F, ArgNo);
+        return MarkIfNotLive(Use, MaybeLiveUses);
+      }
+    }
+    // Used in any other way? Value must be live.
+    return Live;
+}
+
+/// SurveyUses - This looks at all the uses of the given value
+/// Returns the Liveness deduced from the uses of this value.
+///
+/// Adds all uses that cause the result to be MaybeLive to MaybeLiveRetUses. If
+/// the result is Live, MaybeLiveUses might be modified but its content should
+/// be ignored (since it might not be complete).
+DAE::Liveness DAE::SurveyUses(const Value *V, UseVector &MaybeLiveUses) {
+  // Assume it's dead (which will only hold if there are no uses at all..).
+  Liveness Result = MaybeLive;
+  // Check each use.
+  for (Value::const_use_iterator I = V->use_begin(),
+       E = V->use_end(); I != E; ++I) {
+    Result = SurveyUse(I, MaybeLiveUses);
+    if (Result == Live)
+      break;
+  }
+  return Result;
+}
+
+// SurveyFunction - This performs the initial survey of the specified function,
+// checking out whether or not it uses any of its incoming arguments or whether
+// any callers use the return value.  This fills in the LiveValues set and Uses
+// map.
+//
+// We consider arguments of non-internal functions to be intrinsically alive as
+// well as arguments to functions which have their "address taken".
+//
+void DAE::SurveyFunction(const Function &F) {
+  unsigned RetCount = NumRetVals(&F);
+  // Assume all return values are dead
+  typedef SmallVector<Liveness, 5> RetVals;
+  RetVals RetValLiveness(RetCount, MaybeLive);
+
+  typedef SmallVector<UseVector, 5> RetUses;
+  // These vectors map each return value to the uses that make it MaybeLive, so
+  // we can add those to the Uses map if the return value really turns out to be
+  // MaybeLive. Initialized to a list of RetCount empty lists.
+  RetUses MaybeLiveRetUses(RetCount);
+
+  for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+      if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType()
+          != F.getFunctionType()->getReturnType()) {
+        // We don't support old style multiple return values.
+        MarkLive(F);
+        return;
+      }
+
+  if (!F.hasLocalLinkage() && (!ShouldHackArguments() || F.isIntrinsic())) {
+    MarkLive(F);
+    return;
+  }
+
+  DEBUG(dbgs() << "DAE - Inspecting callers for fn: " << F.getName() << "\n");
+  // Keep track of the number of live retvals, so we can skip checks once all
+  // of them turn out to be live.
+  unsigned NumLiveRetVals = 0;
+  Type *STy = dyn_cast<StructType>(F.getReturnType());
+  // Loop all uses of the function.
+  for (Value::const_use_iterator I = F.use_begin(), E = F.use_end();
+       I != E; ++I) {
+    // If the function is PASSED IN as an argument, its address has been
+    // taken.
+    ImmutableCallSite CS(*I);
+    if (!CS || !CS.isCallee(I)) {
+      MarkLive(F);
+      return;
+    }
+
+    // If this use is anything other than a call site, the function is alive.
+    const Instruction *TheCall = CS.getInstruction();
+    if (!TheCall) {   // Not a direct call site?
+      MarkLive(F);
+      return;
+    }
+
+    // If we end up here, we are looking at a direct call to our function.
+
+    // Now, check how our return value(s) is/are used in this caller. Don't
+    // bother checking return values if all of them are live already.
+    if (NumLiveRetVals != RetCount) {
+      if (STy) {
+        // Check all uses of the return value.
+        for (Value::const_use_iterator I = TheCall->use_begin(),
+             E = TheCall->use_end(); I != E; ++I) {
+          const ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(*I);
+          if (Ext && Ext->hasIndices()) {
+            // This use uses a part of our return value, survey the uses of
+            // that part and store the results for this index only.
+            unsigned Idx = *Ext->idx_begin();
+            if (RetValLiveness[Idx] != Live) {
+              RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
+              if (RetValLiveness[Idx] == Live)
+                NumLiveRetVals++;
+            }
+          } else {
+            // Used by something else than extractvalue. Mark all return
+            // values as live.
+            for (unsigned i = 0; i != RetCount; ++i )
+              RetValLiveness[i] = Live;
+            NumLiveRetVals = RetCount;
+            break;
+          }
+        }
+      } else {
+        // Single return value
+        RetValLiveness[0] = SurveyUses(TheCall, MaybeLiveRetUses[0]);
+        if (RetValLiveness[0] == Live)
+          NumLiveRetVals = RetCount;
+      }
+    }
+  }
+
+  // Now we've inspected all callers, record the liveness of our return values.
+  for (unsigned i = 0; i != RetCount; ++i)
+    MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]);
+
+  DEBUG(dbgs() << "DAE - Inspecting args for fn: " << F.getName() << "\n");
+
+  // Now, check all of our arguments.
+  unsigned i = 0;
+  UseVector MaybeLiveArgUses;
+  for (Function::const_arg_iterator AI = F.arg_begin(),
+       E = F.arg_end(); AI != E; ++AI, ++i) {
+    // See what the effect of this use is (recording any uses that cause
+    // MaybeLive in MaybeLiveArgUses).
+    Liveness Result = SurveyUses(AI, MaybeLiveArgUses);
+    // Mark the result.
+    MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses);
+    // Clear the vector again for the next iteration.
+    MaybeLiveArgUses.clear();
+  }
+}
+
+/// MarkValue - This function marks the liveness of RA depending on L. If L is
+/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses,
+/// such that RA will be marked live if any use in MaybeLiveUses gets marked
+/// live later on.
+void DAE::MarkValue(const RetOrArg &RA, Liveness L,
+                    const UseVector &MaybeLiveUses) {
+  switch (L) {
+    case Live: MarkLive(RA); break;
+    case MaybeLive:
+    {
+      // Note any uses of this value, so this return value can be
+      // marked live whenever one of the uses becomes live.
+      for (UseVector::const_iterator UI = MaybeLiveUses.begin(),
+           UE = MaybeLiveUses.end(); UI != UE; ++UI)
+        Uses.insert(std::make_pair(*UI, RA));
+      break;
+    }
+  }
+}
+
+/// MarkLive - Mark the given Function as alive, meaning that it cannot be
+/// changed in any way. Additionally,
+/// mark any values that are used as this function's parameters or by its return
+/// values (according to Uses) live as well.
+void DAE::MarkLive(const Function &F) {
+  DEBUG(dbgs() << "DAE - Intrinsically live fn: " << F.getName() << "\n");
+  // Mark the function as live.
+  LiveFunctions.insert(&F);
+  // Mark all arguments as live.
+  for (unsigned i = 0, e = F.arg_size(); i != e; ++i)
+    PropagateLiveness(CreateArg(&F, i));
+  // Mark all return values as live.
+  for (unsigned i = 0, e = NumRetVals(&F); i != e; ++i)
+    PropagateLiveness(CreateRet(&F, i));
+}
+
+/// MarkLive - Mark the given return value or argument as live. Additionally,
+/// mark any values that are used by this value (according to Uses) live as
+/// well.
+void DAE::MarkLive(const RetOrArg &RA) {
+  if (LiveFunctions.count(RA.F))
+    return; // Function was already marked Live.
+
+  if (!LiveValues.insert(RA).second)
+    return; // We were already marked Live.
+
+  DEBUG(dbgs() << "DAE - Marking " << RA.getDescription() << " live\n");
+  PropagateLiveness(RA);
+}
+
+/// PropagateLiveness - Given that RA is a live value, propagate it's liveness
+/// to any other values it uses (according to Uses).
+void DAE::PropagateLiveness(const RetOrArg &RA) {
+  // We don't use upper_bound (or equal_range) here, because our recursive call
+  // to ourselves is likely to cause the upper_bound (which is the first value
+  // not belonging to RA) to become erased and the iterator invalidated.
+  UseMap::iterator Begin = Uses.lower_bound(RA);
+  UseMap::iterator E = Uses.end();
+  UseMap::iterator I;
+  for (I = Begin; I != E && I->first == RA; ++I)
+    MarkLive(I->second);
+
+  // Erase RA from the Uses map (from the lower bound to wherever we ended up
+  // after the loop).
+  Uses.erase(Begin, I);
+}
+
+// RemoveDeadStuffFromFunction - Remove any arguments and return values from F
+// that are not in LiveValues. Transform the function and all of the callees of
+// the function to not have these arguments and return values.
+//
+bool DAE::RemoveDeadStuffFromFunction(Function *F) {
+  // Don't modify fully live functions
+  if (LiveFunctions.count(F))
+    return false;
+
+  // Start by computing a new prototype for the function, which is the same as
+  // the old function, but has fewer arguments and a different return type.
+  FunctionType *FTy = F->getFunctionType();
+  std::vector<Type*> Params;
+
+  // Set up to build a new list of parameter attributes.
+  SmallVector<AttributeSet, 8> AttributesVec;
+  const AttributeSet &PAL = F->getAttributes();
+
+  // Find out the new return value.
+  Type *RetTy = FTy->getReturnType();
+  Type *NRetTy = NULL;
+  unsigned RetCount = NumRetVals(F);
+
+  // -1 means unused, other numbers are the new index
+  SmallVector<int, 5> NewRetIdxs(RetCount, -1);
+  std::vector<Type*> RetTypes;
+  if (RetTy->isVoidTy()) {
+    NRetTy = RetTy;
+  } else {
+    StructType *STy = dyn_cast<StructType>(RetTy);
+    if (STy)
+      // Look at each of the original return values individually.
+      for (unsigned i = 0; i != RetCount; ++i) {
+        RetOrArg Ret = CreateRet(F, i);
+        if (LiveValues.erase(Ret)) {
+          RetTypes.push_back(STy->getElementType(i));
+          NewRetIdxs[i] = RetTypes.size() - 1;
+        } else {
+          ++NumRetValsEliminated;
+          DEBUG(dbgs() << "DAE - Removing return value " << i << " from "
+                << F->getName() << "\n");
+        }
+      }
+    else
+      // We used to return a single value.
+      if (LiveValues.erase(CreateRet(F, 0))) {
+        RetTypes.push_back(RetTy);
+        NewRetIdxs[0] = 0;
+      } else {
+        DEBUG(dbgs() << "DAE - Removing return value from " << F->getName()
+              << "\n");
+        ++NumRetValsEliminated;
+      }
+    if (RetTypes.size() > 1)
+      // More than one return type? Return a struct with them. Also, if we used
+      // to return a struct and didn't change the number of return values,
+      // return a struct again. This prevents changing {something} into
+      // something and {} into void.
+      // Make the new struct packed if we used to return a packed struct
+      // already.
+      NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked());
+    else if (RetTypes.size() == 1)
+      // One return type? Just a simple value then, but only if we didn't use to
+      // return a struct with that simple value before.
+      NRetTy = RetTypes.front();
+    else if (RetTypes.size() == 0)
+      // No return types? Make it void, but only if we didn't use to return {}.
+      NRetTy = Type::getVoidTy(F->getContext());
+  }
+
+  assert(NRetTy && "No new return type found?");
+
+  // The existing function return attributes.
+  AttributeSet RAttrs = PAL.getRetAttributes();
+
+  // Remove any incompatible attributes, but only if we removed all return
+  // values. Otherwise, ensure that we don't have any conflicting attributes
+  // here. Currently, this should not be possible, but special handling might be
+  // required when new return value attributes are added.
+  if (NRetTy->isVoidTy())
+    RAttrs =
+      AttributeSet::get(NRetTy->getContext(), AttributeSet::ReturnIndex,
+                        AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
+         removeAttributes(AttributeFuncs::
+                          typeIncompatible(NRetTy, AttributeSet::ReturnIndex),
+                          AttributeSet::ReturnIndex));
+  else
+    assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
+             hasAttributes(AttributeFuncs::
+                           typeIncompatible(NRetTy, AttributeSet::ReturnIndex),
+                           AttributeSet::ReturnIndex) &&
+           "Return attributes no longer compatible?");
+
+  if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
+    AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs));
+
+  // Remember which arguments are still alive.
+  SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
+  // Construct the new parameter list from non-dead arguments. Also construct
+  // a new set of parameter attributes to correspond. Skip the first parameter
+  // attribute, since that belongs to the return value.
+  unsigned i = 0;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++i) {
+    RetOrArg Arg = CreateArg(F, i);
+    if (LiveValues.erase(Arg)) {
+      Params.push_back(I->getType());
+      ArgAlive[i] = true;
+
+      // Get the original parameter attributes (skipping the first one, that is
+      // for the return value.
+      if (PAL.hasAttributes(i + 1)) {
+        AttrBuilder B(PAL, i + 1);
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Params.size(), B));
+      }
+    } else {
+      ++NumArgumentsEliminated;
+      DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName()
+            << ") from " << F->getName() << "\n");
+    }
+  }
+
+  if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+    AttributesVec.push_back(AttributeSet::get(F->getContext(),
+                                              PAL.getFnAttributes()));
+
+  // Reconstruct the AttributesList based on the vector we constructed.
+  AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec);
+
+  // Create the new function type based on the recomputed parameters.
+  FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
+
+  // No change?
+  if (NFTy == FTy)
+    return false;
+
+  // Create the new function body and insert it into the module...
+  Function *NF = Function::Create(NFTy, F->getLinkage());
+  NF->copyAttributesFrom(F);
+  NF->setAttributes(NewPAL);
+  // Insert the new function before the old function, so we won't be processing
+  // it again.
+  F->getParent()->getFunctionList().insert(F, NF);
+  NF->takeName(F);
+
+  // Loop over all of the callers of the function, transforming the call sites
+  // to pass in a smaller number of arguments into the new function.
+  //
+  std::vector<Value*> Args;
+  while (!F->use_empty()) {
+    CallSite CS(F->use_back());
+    Instruction *Call = CS.getInstruction();
+
+    AttributesVec.clear();
+    const AttributeSet &CallPAL = CS.getAttributes();
+
+    // The call return attributes.
+    AttributeSet RAttrs = CallPAL.getRetAttributes();
+
+    // Adjust in case the function was changed to return void.
+    RAttrs =
+      AttributeSet::get(NF->getContext(), AttributeSet::ReturnIndex,
+                        AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
+        removeAttributes(AttributeFuncs::
+                         typeIncompatible(NF->getReturnType(),
+                                          AttributeSet::ReturnIndex),
+                         AttributeSet::ReturnIndex));
+    if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
+      AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs));
+
+    // Declare these outside of the loops, so we can reuse them for the second
+    // loop, which loops the varargs.
+    CallSite::arg_iterator I = CS.arg_begin();
+    unsigned i = 0;
+    // Loop over those operands, corresponding to the normal arguments to the
+    // original function, and add those that are still alive.
+    for (unsigned e = FTy->getNumParams(); i != e; ++I, ++i)
+      if (ArgAlive[i]) {
+        Args.push_back(*I);
+        // Get original parameter attributes, but skip return attributes.
+        if (CallPAL.hasAttributes(i + 1)) {
+          AttrBuilder B(CallPAL, i + 1);
+          AttributesVec.
+            push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+        }
+      }
+
+    // Push any varargs arguments on the list. Don't forget their attributes.
+    for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
+      Args.push_back(*I);
+      if (CallPAL.hasAttributes(i + 1)) {
+        AttrBuilder B(CallPAL, i + 1);
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+      }
+    }
+
+    if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
+      AttributesVec.push_back(AttributeSet::get(Call->getContext(),
+                                                CallPAL.getFnAttributes()));
+
+    // Reconstruct the AttributesList based on the vector we constructed.
+    AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec);
+
+    Instruction *New;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args, "", Call);
+      cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<InvokeInst>(New)->setAttributes(NewCallPAL);
+    } else {
+      New = CallInst::Create(NF, Args, "", Call);
+      cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
+      cast<CallInst>(New)->setAttributes(NewCallPAL);
+      if (cast<CallInst>(Call)->isTailCall())
+        cast<CallInst>(New)->setTailCall();
+    }
+    New->setDebugLoc(Call->getDebugLoc());
+
+    Args.clear();
+
+    if (!Call->use_empty()) {
+      if (New->getType() == Call->getType()) {
+        // Return type not changed? Just replace users then.
+        Call->replaceAllUsesWith(New);
+        New->takeName(Call);
+      } else if (New->getType()->isVoidTy()) {
+        // Our return value has uses, but they will get removed later on.
+        // Replace by null for now.
+        if (!Call->getType()->isX86_MMXTy())
+          Call->replaceAllUsesWith(Constant::getNullValue(Call->getType()));
+      } else {
+        assert(RetTy->isStructTy() &&
+               "Return type changed, but not into a void. The old return type"
+               " must have been a struct!");
+        Instruction *InsertPt = Call;
+        if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+          BasicBlock::iterator IP = II->getNormalDest()->begin();
+          while (isa<PHINode>(IP)) ++IP;
+          InsertPt = IP;
+        }
+
+        // We used to return a struct. Instead of doing smart stuff with all the
+        // uses of this struct, we will just rebuild it using
+        // extract/insertvalue chaining and let instcombine clean that up.
+        //
+        // Start out building up our return value from undef
+        Value *RetVal = UndefValue::get(RetTy);
+        for (unsigned i = 0; i != RetCount; ++i)
+          if (NewRetIdxs[i] != -1) {
+            Value *V;
+            if (RetTypes.size() > 1)
+              // We are still returning a struct, so extract the value from our
+              // return value
+              V = ExtractValueInst::Create(New, NewRetIdxs[i], "newret",
+                                           InsertPt);
+            else
+              // We are now returning a single element, so just insert that
+              V = New;
+            // Insert the value at the old position
+            RetVal = InsertValueInst::Create(RetVal, V, i, "oldret", InsertPt);
+          }
+        // Now, replace all uses of the old call instruction with the return
+        // struct we built
+        Call->replaceAllUsesWith(RetVal);
+        New->takeName(Call);
+      }
+    }
+
+    // Finally, remove the old call from the program, reducing the use-count of
+    // F.
+    Call->eraseFromParent();
+  }
+
+  // Since we have now created the new function, splice the body of the old
+  // function right into the new function, leaving the old rotting hulk of the
+  // function empty.
+  NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+  // Loop over the argument list, transferring uses of the old arguments over to
+  // the new arguments, also transferring over the names as well.
+  i = 0;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+       I2 = NF->arg_begin(); I != E; ++I, ++i)
+    if (ArgAlive[i]) {
+      // If this is a live argument, move the name and users over to the new
+      // version.
+      I->replaceAllUsesWith(I2);
+      I2->takeName(I);
+      ++I2;
+    } else {
+      // If this argument is dead, replace any uses of it with null constants
+      // (these are guaranteed to become unused later on).
+      if (!I->getType()->isX86_MMXTy())
+        I->replaceAllUsesWith(Constant::getNullValue(I->getType()));
+    }
+
+  // If we change the return value of the function we must rewrite any return
+  // instructions.  Check this now.
+  if (F->getReturnType() != NF->getReturnType())
+    for (Function::iterator BB = NF->begin(), E = NF->end(); BB != E; ++BB)
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+        Value *RetVal;
+
+        if (NFTy->getReturnType()->isVoidTy()) {
+          RetVal = 0;
+        } else {
+          assert (RetTy->isStructTy());
+          // The original return value was a struct, insert
+          // extractvalue/insertvalue chains to extract only the values we need
+          // to return and insert them into our new result.
+          // This does generate messy code, but we'll let it to instcombine to
+          // clean that up.
+          Value *OldRet = RI->getOperand(0);
+          // Start out building up our return value from undef
+          RetVal = UndefValue::get(NRetTy);
+          for (unsigned i = 0; i != RetCount; ++i)
+            if (NewRetIdxs[i] != -1) {
+              ExtractValueInst *EV = ExtractValueInst::Create(OldRet, i,
+                                                              "oldret", RI);
+              if (RetTypes.size() > 1) {
+                // We're still returning a struct, so reinsert the value into
+                // our new return value at the new index
+
+                RetVal = InsertValueInst::Create(RetVal, EV, NewRetIdxs[i],
+                                                 "newret", RI);
+              } else {
+                // We are now only returning a simple value, so just return the
+                // extracted value.
+                RetVal = EV;
+              }
+            }
+        }
+        // Replace the return instruction with one returning the new return
+        // value (possibly 0 if we became void).
+        ReturnInst::Create(F->getContext(), RetVal, RI);
+        BB->getInstList().erase(RI);
+      }
+
+  // Patch the pointer to LLVM function in debug info descriptor.
+  FunctionDIMap::iterator DI = FunctionDIs.find(F);
+  if (DI != FunctionDIs.end())
+    DI->second.replaceFunction(NF);
+
+  // Now that the old function is dead, delete it.
+  F->eraseFromParent();
+
+  return true;
+}
+
+bool DAE::runOnModule(Module &M) {
+  bool Changed = false;
+
+  // Collect debug info descriptors for functions.
+  CollectFunctionDIs(M);
+
+  // First pass: Do a simple check to see if any functions can have their "..."
+  // removed.  We can do this if they never call va_start.  This loop cannot be
+  // fused with the next loop, because deleting a function invalidates
+  // information computed while surveying other functions.
+  DEBUG(dbgs() << "DAE - Deleting dead varargs\n");
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+    Function &F = *I++;
+    if (F.getFunctionType()->isVarArg())
+      Changed |= DeleteDeadVarargs(F);
+  }
+
+  // Second phase:loop through the module, determining which arguments are live.
+  // We assume all arguments are dead unless proven otherwise (allowing us to
+  // determine that dead arguments passed into recursive functions are dead).
+  //
+  DEBUG(dbgs() << "DAE - Determining liveness\n");
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    SurveyFunction(*I);
+
+  // Now, remove all dead arguments and return values from each function in
+  // turn.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+    // Increment now, because the function will probably get removed (ie.
+    // replaced by a new one).
+    Function *F = I++;
+    Changed |= RemoveDeadStuffFromFunction(F);
+  }
+
+  // Finally, look for any unused parameters in functions with non-local
+  // linkage and replace the passed in parameters with undef.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    Function& F = *I;
+
+    Changed |= RemoveDeadArgumentsFromCallers(F);
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
new file mode 100644
index 000000000000..fa3d72ddcf16
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -0,0 +1,135 @@
+//===-- ExtractGV.cpp - Global Value extraction pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts global values
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+  /// @brief A pass to extract specific functions and their dependencies.
+  class GVExtractorPass : public ModulePass {
+    SetVector<GlobalValue *> Named;
+    bool deleteStuff;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+
+    /// FunctionExtractorPass - If deleteFn is true, this pass deletes as the
+    /// specified function. Otherwise, it deletes as much of the module as
+    /// possible, except for the function specified.
+    ///
+    explicit GVExtractorPass(std::vector<GlobalValue*>& GVs, bool deleteS = true)
+      : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS) {}
+
+    bool runOnModule(Module &M) {
+      // Visit the global inline asm.
+      if (!deleteStuff)
+        M.setModuleInlineAsm("");
+
+      // For simplicity, just give all GlobalValues ExternalLinkage. A trickier
+      // implementation could figure out which GlobalValues are actually
+      // referenced by the Named set, and which GlobalValues in the rest of
+      // the module are referenced by the NamedSet, and get away with leaving
+      // more internal and private things internal and private. But for now,
+      // be conservative and simple.
+
+      // Visit the GlobalVariables.
+      for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+           I != E; ++I) {
+        bool Delete =
+          deleteStuff == (bool)Named.count(I) && !I->isDeclaration();
+        if (!Delete) {
+          if (I->hasAvailableExternallyLinkage())
+            continue;
+          if (I->getName() == "llvm.global_ctors")
+            continue;
+        }
+
+        bool Local = I->isDiscardableIfUnused();
+        if (Local)
+          I->setVisibility(GlobalValue::HiddenVisibility);
+
+        if (Local || Delete)
+          I->setLinkage(GlobalValue::ExternalLinkage);
+
+        if (Delete)
+          I->setInitializer(0);
+      }
+
+      // Visit the Functions.
+      for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+        bool Delete =
+          deleteStuff == (bool)Named.count(I) && !I->isDeclaration();
+        if (!Delete) {
+          if (I->hasAvailableExternallyLinkage())
+            continue;
+        }
+
+        bool Local = I->isDiscardableIfUnused();
+        if (Local)
+          I->setVisibility(GlobalValue::HiddenVisibility);
+
+        if (Local || Delete)
+          I->setLinkage(GlobalValue::ExternalLinkage);
+
+        if (Delete)
+          I->deleteBody();
+      }
+
+      // Visit the Aliases.
+      for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+           I != E;) {
+        Module::alias_iterator CurI = I;
+        ++I;
+
+        if (CurI->isDiscardableIfUnused()) {
+          CurI->setVisibility(GlobalValue::HiddenVisibility);
+          CurI->setLinkage(GlobalValue::ExternalLinkage);
+        }
+
+        if (deleteStuff == (bool)Named.count(CurI)) {
+          Type *Ty =  CurI->getType()->getElementType();
+
+          CurI->removeFromParent();
+          llvm::Value *Declaration;
+          if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
+            Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage,
+                                           CurI->getName(), &M);
+
+          } else {
+            Declaration =
+              new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
+                                 0, CurI->getName());
+
+          }
+          CurI->replaceAllUsesWith(Declaration);
+          delete CurI;
+        }
+      }
+
+      return true;
+    }
+  };
+
+  char GVExtractorPass::ID = 0;
+}
+
+ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue*>& GVs, 
+                                         bool deleteFn) {
+  return new GVExtractorPass(GVs, deleteFn);
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
new file mode 100644
index 000000000000..bc5109b4d48d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -0,0 +1,1345 @@
+//===- FunctionAttrs.cpp - Pass which marks functions attributes ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple interprocedural pass which walks the
+// call-graph, looking for functions which do not access or only read
+// non-local memory, and marking them readnone/readonly.  In addition,
+// it marks function arguments (of pointer type) 'nocapture' if a call
+// to the function does not create any copies of the pointer value that
+// outlive the call.  This more or less means that the pointer is only
+// dereferenced, and not returned from the function or stored in a global.
+// Finally, well-known library call declarations are marked with all
+// attributes that are consistent with the function's standard definition.
+// This pass is implemented as a bottom-up traversal of the call-graph.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "functionattrs"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+using namespace llvm;
+
+STATISTIC(NumReadNone, "Number of functions marked readnone");
+STATISTIC(NumReadOnly, "Number of functions marked readonly");
+STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
+STATISTIC(NumNoAlias, "Number of function returns marked noalias");
+STATISTIC(NumAnnotated, "Number of attributes added to library functions");
+
+namespace {
+  struct FunctionAttrs : public CallGraphSCCPass {
+    static char ID; // Pass identification, replacement for typeid
+    FunctionAttrs() : CallGraphSCCPass(ID), AA(0) {
+      initializeFunctionAttrsPass(*PassRegistry::getPassRegistry());
+    }
+
+    // runOnSCC - Analyze the SCC, performing the transformation if possible.
+    bool runOnSCC(CallGraphSCC &SCC);
+
+    // AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
+    bool AddReadAttrs(const CallGraphSCC &SCC);
+
+    // AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
+    bool AddNoCaptureAttrs(const CallGraphSCC &SCC);
+
+    // IsFunctionMallocLike - Does this function allocate new memory?
+    bool IsFunctionMallocLike(Function *F,
+                              SmallPtrSet<Function*, 8> &) const;
+
+    // AddNoAliasAttrs - Deduce noalias attributes for the SCC.
+    bool AddNoAliasAttrs(const CallGraphSCC &SCC);
+
+    // Utility methods used by inferPrototypeAttributes to add attributes
+    // and maintain annotation statistics.
+
+    void setDoesNotAccessMemory(Function &F) {
+      if (!F.doesNotAccessMemory()) {
+	F.setDoesNotAccessMemory();
+	++NumAnnotated;
+      }
+    }
+
+    void setOnlyReadsMemory(Function &F) {
+      if (!F.onlyReadsMemory()) {
+	F.setOnlyReadsMemory();
+	++NumAnnotated;
+      }
+    }
+
+    void setDoesNotThrow(Function &F) {
+      if (!F.doesNotThrow()) {
+	F.setDoesNotThrow();
+	++NumAnnotated;
+      }
+    }
+
+    void setDoesNotCapture(Function &F, unsigned n) {
+      if (!F.doesNotCapture(n)) {
+	F.setDoesNotCapture(n);
+	++NumAnnotated;
+      }
+    }
+
+    void setDoesNotAlias(Function &F, unsigned n) {
+      if (!F.doesNotAlias(n)) {
+	F.setDoesNotAlias(n);
+	++NumAnnotated;
+      }
+    }
+
+    // inferPrototypeAttributes - Analyze the name and prototype of the
+    // given function and set any applicable attributes.  Returns true
+    // if any attributes were set and false otherwise.
+    bool inferPrototypeAttributes(Function &F);
+
+    // annotateLibraryCalls - Adds attributes to well-known standard library
+    // call declarations.
+    bool annotateLibraryCalls(const CallGraphSCC &SCC);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetLibraryInfo>();
+      CallGraphSCCPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    AliasAnalysis *AA;
+    TargetLibraryInfo *TLI;
+  };
+}
+
+char FunctionAttrs::ID = 0;
+INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
+                "Deduce function attributes", false, false)
+INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
+                "Deduce function attributes", false, false)
+
+Pass *llvm::createFunctionAttrsPass() { return new FunctionAttrs(); }
+
+
+/// AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
+bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
+  SmallPtrSet<Function*, 8> SCCNodes;
+
+  // Fill SCCNodes with the elements of the SCC.  Used for quickly
+  // looking up whether a given CallGraphNode is in this SCC.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
+    SCCNodes.insert((*I)->getFunction());
+
+  // Check if any of the functions in the SCC read or write memory.  If they
+  // write memory then they can't be marked readnone or readonly.
+  bool ReadsMemory = false;
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+
+    if (F == 0)
+      // External node - may write memory.  Just give up.
+      return false;
+
+    AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(F);
+    if (MRB == AliasAnalysis::DoesNotAccessMemory)
+      // Already perfect!
+      continue;
+
+    // Definitions with weak linkage may be overridden at linktime with
+    // something that writes memory, so treat them like declarations.
+    if (F->isDeclaration() || F->mayBeOverridden()) {
+      if (!AliasAnalysis::onlyReadsMemory(MRB))
+        // May write memory.  Just give up.
+        return false;
+
+      ReadsMemory = true;
+      continue;
+    }
+
+    // Scan the function body for instructions that may read or write memory.
+    for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+      Instruction *I = &*II;
+
+      // Some instructions can be ignored even if they read or write memory.
+      // Detect these now, skipping to the next instruction if one is found.
+      CallSite CS(cast<Value>(I));
+      if (CS) {
+        // Ignore calls to functions in the same SCC.
+        if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
+          continue;
+        AliasAnalysis::ModRefBehavior MRB = AA->getModRefBehavior(CS);
+        // If the call doesn't access arbitrary memory, we may be able to
+        // figure out something.
+        if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+          // If the call does access argument pointees, check each argument.
+          if (AliasAnalysis::doesAccessArgPointees(MRB))
+            // Check whether all pointer arguments point to local memory, and
+            // ignore calls that only access local memory.
+            for (CallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
+                 CI != CE; ++CI) {
+              Value *Arg = *CI;
+              if (Arg->getType()->isPointerTy()) {
+                AliasAnalysis::Location Loc(Arg,
+                                            AliasAnalysis::UnknownSize,
+                                            I->getMetadata(LLVMContext::MD_tbaa));
+                if (!AA->pointsToConstantMemory(Loc, /*OrLocal=*/true)) {
+                  if (MRB & AliasAnalysis::Mod)
+                    // Writes non-local memory.  Give up.
+                    return false;
+                  if (MRB & AliasAnalysis::Ref)
+                    // Ok, it reads non-local memory.
+                    ReadsMemory = true;
+                }
+              }
+            }
+          continue;
+        }
+        // The call could access any memory. If that includes writes, give up.
+        if (MRB & AliasAnalysis::Mod)
+          return false;
+        // If it reads, note it.
+        if (MRB & AliasAnalysis::Ref)
+          ReadsMemory = true;
+        continue;
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        // Ignore non-volatile loads from local memory. (Atomic is okay here.)
+        if (!LI->isVolatile()) {
+          AliasAnalysis::Location Loc = AA->getLocation(LI);
+          if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
+            continue;
+        }
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Ignore non-volatile stores to local memory. (Atomic is okay here.)
+        if (!SI->isVolatile()) {
+          AliasAnalysis::Location Loc = AA->getLocation(SI);
+          if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
+            continue;
+        }
+      } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {
+        // Ignore vaargs on local memory.
+        AliasAnalysis::Location Loc = AA->getLocation(VI);
+        if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
+          continue;
+      }
+
+      // Any remaining instructions need to be taken seriously!  Check if they
+      // read or write memory.
+      if (I->mayWriteToMemory())
+        // Writes memory.  Just give up.
+        return false;
+
+      // If this instruction may read memory, remember that.
+      ReadsMemory |= I->mayReadFromMemory();
+    }
+  }
+
+  // Success!  Functions in this SCC do not access memory, or only read memory.
+  // Give them the appropriate attribute.
+  bool MadeChange = false;
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+
+    if (F->doesNotAccessMemory())
+      // Already perfect!
+      continue;
+
+    if (F->onlyReadsMemory() && ReadsMemory)
+      // No change.
+      continue;
+
+    MadeChange = true;
+
+    // Clear out any existing attributes.
+    AttrBuilder B;
+    B.addAttribute(Attribute::ReadOnly)
+      .addAttribute(Attribute::ReadNone);
+    F->removeAttributes(AttributeSet::FunctionIndex,
+                        AttributeSet::get(F->getContext(),
+                                          AttributeSet::FunctionIndex, B));
+
+    // Add in the new attribute.
+    F->addAttribute(AttributeSet::FunctionIndex,
+                    ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
+
+    if (ReadsMemory)
+      ++NumReadOnly;
+    else
+      ++NumReadNone;
+  }
+
+  return MadeChange;
+}
+
+namespace {
+  // For a given pointer Argument, this retains a list of Arguments of functions
+  // in the same SCC that the pointer data flows into. We use this to build an
+  // SCC of the arguments.
+  struct ArgumentGraphNode {
+    Argument *Definition;
+    SmallVector<ArgumentGraphNode*, 4> Uses;
+  };
+
+  class ArgumentGraph {
+    // We store pointers to ArgumentGraphNode objects, so it's important that
+    // that they not move around upon insert.
+    typedef std::map<Argument*, ArgumentGraphNode> ArgumentMapTy;
+
+    ArgumentMapTy ArgumentMap;
+
+    // There is no root node for the argument graph, in fact:
+    //   void f(int *x, int *y) { if (...) f(x, y); }
+    // is an example where the graph is disconnected. The SCCIterator requires a
+    // single entry point, so we maintain a fake ("synthetic") root node that
+    // uses every node. Because the graph is directed and nothing points into
+    // the root, it will not participate in any SCCs (except for its own).
+    ArgumentGraphNode SyntheticRoot;
+
+  public:
+    ArgumentGraph() { SyntheticRoot.Definition = 0; }
+
+    typedef SmallVectorImpl<ArgumentGraphNode*>::iterator iterator;
+
+    iterator begin() { return SyntheticRoot.Uses.begin(); }
+    iterator end() { return SyntheticRoot.Uses.end(); }
+    ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; }
+
+    ArgumentGraphNode *operator[](Argument *A) {
+      ArgumentGraphNode &Node = ArgumentMap[A];
+      Node.Definition = A;
+      SyntheticRoot.Uses.push_back(&Node);
+      return &Node;
+    }
+  };
+
+  // This tracker checks whether callees are in the SCC, and if so it does not
+  // consider that a capture, instead adding it to the "Uses" list and
+  // continuing with the analysis.
+  struct ArgumentUsesTracker : public CaptureTracker {
+    ArgumentUsesTracker(const SmallPtrSet<Function*, 8> &SCCNodes)
+      : Captured(false), SCCNodes(SCCNodes) {}
+
+    void tooManyUses() { Captured = true; }
+
+    bool captured(Use *U) {
+      CallSite CS(U->getUser());
+      if (!CS.getInstruction()) { Captured = true; return true; }
+
+      Function *F = CS.getCalledFunction();
+      if (!F || !SCCNodes.count(F)) { Captured = true; return true; }
+
+      Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+      for (CallSite::arg_iterator PI = CS.arg_begin(), PE = CS.arg_end();
+           PI != PE; ++PI, ++AI) {
+        if (AI == AE) {
+          assert(F->isVarArg() && "More params than args in non-varargs call");
+          Captured = true;
+          return true;
+        }
+        if (PI == U) {
+          Uses.push_back(AI);
+          break;
+        }
+      }
+      assert(!Uses.empty() && "Capturing call-site captured nothing?");
+      return false;
+    }
+
+    bool Captured;  // True only if certainly captured (used outside our SCC).
+    SmallVector<Argument*, 4> Uses;  // Uses within our SCC.
+
+    const SmallPtrSet<Function*, 8> &SCCNodes;
+  };
+}
+
+namespace llvm {
+  template<> struct GraphTraits<ArgumentGraphNode*> {
+    typedef ArgumentGraphNode NodeType;
+    typedef SmallVectorImpl<ArgumentGraphNode*>::iterator ChildIteratorType;
+
+    static inline NodeType *getEntryNode(NodeType *A) { return A; }
+    static inline ChildIteratorType child_begin(NodeType *N) {
+      return N->Uses.begin();
+    }
+    static inline ChildIteratorType child_end(NodeType *N) {
+      return N->Uses.end();
+    }
+  };
+  template<> struct GraphTraits<ArgumentGraph*>
+    : public GraphTraits<ArgumentGraphNode*> {
+    static NodeType *getEntryNode(ArgumentGraph *AG) {
+      return AG->getEntryNode();
+    }
+    static ChildIteratorType nodes_begin(ArgumentGraph *AG) {
+      return AG->begin();
+    }
+    static ChildIteratorType nodes_end(ArgumentGraph *AG) {
+      return AG->end();
+    }
+  };
+}
+
+/// AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
+bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
+  bool Changed = false;
+
+  SmallPtrSet<Function*, 8> SCCNodes;
+
+  // Fill SCCNodes with the elements of the SCC.  Used for quickly
+  // looking up whether a given CallGraphNode is in this SCC.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+    if (F && !F->isDeclaration() && !F->mayBeOverridden())
+      SCCNodes.insert(F);
+  }
+
+  ArgumentGraph AG;
+
+  AttrBuilder B;
+  B.addAttribute(Attribute::NoCapture);
+
+  // Check each function in turn, determining which pointer arguments are not
+  // captured.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+
+    if (F == 0)
+      // External node - only a problem for arguments that we pass to it.
+      continue;
+
+    // Definitions with weak linkage may be overridden at linktime with
+    // something that captures pointers, so treat them like declarations.
+    if (F->isDeclaration() || F->mayBeOverridden())
+      continue;
+
+    // Functions that are readonly (or readnone) and nounwind and don't return
+    // a value can't capture arguments. Don't analyze them.
+    if (F->onlyReadsMemory() && F->doesNotThrow() &&
+        F->getReturnType()->isVoidTy()) {
+      for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end();
+           A != E; ++A) {
+        if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
+          A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+          ++NumNoCapture;
+          Changed = true;
+        }
+      }
+      continue;
+    }
+
+    for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A!=E; ++A)
+      if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
+        ArgumentUsesTracker Tracker(SCCNodes);
+        PointerMayBeCaptured(A, &Tracker);
+        if (!Tracker.Captured) {
+          if (Tracker.Uses.empty()) {
+            // If it's trivially not captured, mark it nocapture now.
+            A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo()+1, B));
+            ++NumNoCapture;
+            Changed = true;
+          } else {
+            // If it's not trivially captured and not trivially not captured,
+            // then it must be calling into another function in our SCC. Save
+            // its particulars for Argument-SCC analysis later.
+            ArgumentGraphNode *Node = AG[A];
+            for (SmallVectorImpl<Argument*>::iterator UI = Tracker.Uses.begin(),
+                   UE = Tracker.Uses.end(); UI != UE; ++UI)
+              Node->Uses.push_back(AG[*UI]);
+          }
+        }
+        // Otherwise, it's captured. Don't bother doing SCC analysis on it.
+      }
+  }
+
+  // The graph we've collected is partial because we stopped scanning for
+  // argument uses once we solved the argument trivially. These partial nodes
+  // show up as ArgumentGraphNode objects with an empty Uses list, and for
+  // these nodes the final decision about whether they capture has already been
+  // made.  If the definition doesn't have a 'nocapture' attribute by now, it
+  // captures.
+
+  for (scc_iterator<ArgumentGraph*> I = scc_begin(&AG), E = scc_end(&AG);
+       I != E; ++I) {
+    std::vector<ArgumentGraphNode*> &ArgumentSCC = *I;
+    if (ArgumentSCC.size() == 1) {
+      if (!ArgumentSCC[0]->Definition) continue;  // synthetic root node
+
+      // eg. "void f(int* x) { if (...) f(x); }"
+      if (ArgumentSCC[0]->Uses.size() == 1 &&
+          ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
+        ArgumentSCC[0]->
+          Definition->
+          addAttr(AttributeSet::get(ArgumentSCC[0]->Definition->getContext(),
+                                    ArgumentSCC[0]->Definition->getArgNo() + 1,
+                                    B));
+        ++NumNoCapture;
+        Changed = true;
+      }
+      continue;
+    }
+
+    bool SCCCaptured = false;
+    for (std::vector<ArgumentGraphNode*>::iterator I = ArgumentSCC.begin(),
+           E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) {
+      ArgumentGraphNode *Node = *I;
+      if (Node->Uses.empty()) {
+        if (!Node->Definition->hasNoCaptureAttr())
+          SCCCaptured = true;
+      }
+    }
+    if (SCCCaptured) continue;
+
+    SmallPtrSet<Argument*, 8> ArgumentSCCNodes;
+    // Fill ArgumentSCCNodes with the elements of the ArgumentSCC.  Used for
+    // quickly looking up whether a given Argument is in this ArgumentSCC.
+    for (std::vector<ArgumentGraphNode*>::iterator I = ArgumentSCC.begin(),
+           E = ArgumentSCC.end(); I != E; ++I) {
+      ArgumentSCCNodes.insert((*I)->Definition);
+    }
+
+    for (std::vector<ArgumentGraphNode*>::iterator I = ArgumentSCC.begin(),
+           E = ArgumentSCC.end(); I != E && !SCCCaptured; ++I) {
+      ArgumentGraphNode *N = *I;
+      for (SmallVectorImpl<ArgumentGraphNode*>::iterator UI = N->Uses.begin(),
+             UE = N->Uses.end(); UI != UE; ++UI) {
+        Argument *A = (*UI)->Definition;
+        if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A))
+          continue;
+        SCCCaptured = true;
+        break;
+      }
+    }
+    if (SCCCaptured) continue;
+
+    for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+      Argument *A = ArgumentSCC[i]->Definition;
+      A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+      ++NumNoCapture;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+/// IsFunctionMallocLike - A function is malloc-like if it returns either null
+/// or a pointer that doesn't alias any other pointer visible to the caller.
+bool FunctionAttrs::IsFunctionMallocLike(Function *F,
+                              SmallPtrSet<Function*, 8> &SCCNodes) const {
+  SmallSetVector<Value *, 8> FlowsToReturn;
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I)
+    if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator()))
+      FlowsToReturn.insert(Ret->getReturnValue());
+
+  for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
+    Value *RetVal = FlowsToReturn[i];
+
+    if (Constant *C = dyn_cast<Constant>(RetVal)) {
+      if (!C->isNullValue() && !isa<UndefValue>(C))
+        return false;
+
+      continue;
+    }
+
+    if (isa<Argument>(RetVal))
+      return false;
+
+    if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
+      switch (RVI->getOpcode()) {
+        // Extend the analysis by looking upwards.
+        case Instruction::BitCast:
+        case Instruction::GetElementPtr:
+          FlowsToReturn.insert(RVI->getOperand(0));
+          continue;
+        case Instruction::Select: {
+          SelectInst *SI = cast<SelectInst>(RVI);
+          FlowsToReturn.insert(SI->getTrueValue());
+          FlowsToReturn.insert(SI->getFalseValue());
+          continue;
+        }
+        case Instruction::PHI: {
+          PHINode *PN = cast<PHINode>(RVI);
+          for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+            FlowsToReturn.insert(PN->getIncomingValue(i));
+          continue;
+        }
+
+        // Check whether the pointer came from an allocation.
+        case Instruction::Alloca:
+          break;
+        case Instruction::Call:
+        case Instruction::Invoke: {
+          CallSite CS(RVI);
+          if (CS.paramHasAttr(0, Attribute::NoAlias))
+            break;
+          if (CS.getCalledFunction() &&
+              SCCNodes.count(CS.getCalledFunction()))
+            break;
+        } // fall-through
+        default:
+          return false;  // Did not come from an allocation.
+      }
+
+    if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false))
+      return false;
+  }
+
+  return true;
+}
+
+/// AddNoAliasAttrs - Deduce noalias attributes for the SCC.
+bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) {
+  SmallPtrSet<Function*, 8> SCCNodes;
+
+  // Fill SCCNodes with the elements of the SCC.  Used for quickly
+  // looking up whether a given CallGraphNode is in this SCC.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
+    SCCNodes.insert((*I)->getFunction());
+
+  // Check each function in turn, determining which functions return noalias
+  // pointers.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+
+    if (F == 0)
+      // External node - skip it;
+      return false;
+
+    // Already noalias.
+    if (F->doesNotAlias(0))
+      continue;
+
+    // Definitions with weak linkage may be overridden at linktime, so
+    // treat them like declarations.
+    if (F->isDeclaration() || F->mayBeOverridden())
+      return false;
+
+    // We annotate noalias return values, which are only applicable to 
+    // pointer types.
+    if (!F->getReturnType()->isPointerTy())
+      continue;
+
+    if (!IsFunctionMallocLike(F, SCCNodes))
+      return false;
+  }
+
+  bool MadeChange = false;
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+    if (F->doesNotAlias(0) || !F->getReturnType()->isPointerTy())
+      continue;
+
+    F->setDoesNotAlias(0);
+    ++NumNoAlias;
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+/// inferPrototypeAttributes - Analyze the name and prototype of the
+/// given function and set any applicable attributes.  Returns true
+/// if any attributes were set and false otherwise.
+bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
+  FunctionType *FTy = F.getFunctionType();
+  LibFunc::Func TheLibFunc;
+  if (!(TLI->getLibFunc(F.getName(), TheLibFunc) && TLI->has(TheLibFunc)))
+    return false;
+
+  switch (TheLibFunc) {
+  case LibFunc::strlen:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::strchr:
+  case LibFunc::strrchr:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isIntegerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    break;
+  case LibFunc::strcpy:
+  case LibFunc::stpcpy:
+  case LibFunc::strcat:
+  case LibFunc::strtol:
+  case LibFunc::strtod:
+  case LibFunc::strtof:
+  case LibFunc::strtoul:
+  case LibFunc::strtoll:
+  case LibFunc::strtold:
+  case LibFunc::strncat:
+  case LibFunc::strncpy:
+  case LibFunc::stpncpy:
+  case LibFunc::strtoull:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::strxfrm:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::strcmp:
+  case LibFunc::strspn:
+  case LibFunc::strncmp:
+  case LibFunc::strcspn:
+  case LibFunc::strcoll:
+  case LibFunc::strcasecmp:
+  case LibFunc::strncasecmp:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::strstr:
+  case LibFunc::strpbrk:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::strtok:
+  case LibFunc::strtok_r:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::scanf:
+  case LibFunc::setbuf:
+  case LibFunc::setvbuf:
+    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::strdup:
+  case LibFunc::strndup:
+    if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::stat:
+  case LibFunc::sscanf:
+  case LibFunc::sprintf:
+  case LibFunc::statvfs:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::snprintf:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 3);
+    break;
+  case LibFunc::setitimer:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    setDoesNotCapture(F, 3);
+    break;
+  case LibFunc::system:
+    if (FTy->getNumParams() != 1 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    // May throw; "system" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::malloc:
+    if (FTy->getNumParams() != 1 ||
+        !FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::memcmp:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::memchr:
+  case LibFunc::memrchr:
+    if (FTy->getNumParams() != 3)
+      return false;
+    setOnlyReadsMemory(F);
+    setDoesNotThrow(F);
+    break;
+  case LibFunc::modf:
+  case LibFunc::modff:
+  case LibFunc::modfl:
+  case LibFunc::memcpy:
+  case LibFunc::memccpy:
+  case LibFunc::memmove:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::memalign:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::mkdir:
+  case LibFunc::mktime:
+    if (FTy->getNumParams() == 0 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::realloc:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::read:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // May throw; "read" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::rmdir:
+  case LibFunc::rewind:
+  case LibFunc::remove:
+  case LibFunc::realpath:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::rename:
+  case LibFunc::readlink:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::write:
+    if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // May throw; "write" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::bcopy:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::bcmp:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setOnlyReadsMemory(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::bzero:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::calloc:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::chmod:
+  case LibFunc::chown:
+  case LibFunc::ctermid:
+  case LibFunc::clearerr:
+  case LibFunc::closedir:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::atoi:
+  case LibFunc::atol:
+  case LibFunc::atof:
+  case LibFunc::atoll:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setOnlyReadsMemory(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::access:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::fopen:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::fdopen:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::feof:
+  case LibFunc::free:
+  case LibFunc::fseek:
+  case LibFunc::ftell:
+  case LibFunc::fgetc:
+  case LibFunc::fseeko:
+  case LibFunc::ftello:
+  case LibFunc::fileno:
+  case LibFunc::fflush:
+  case LibFunc::fclose:
+  case LibFunc::fsetpos:
+  case LibFunc::flockfile:
+  case LibFunc::funlockfile:
+  case LibFunc::ftrylockfile:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::ferror:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F);
+    break;
+  case LibFunc::fputc:
+  case LibFunc::fstat:
+  case LibFunc::frexp:
+  case LibFunc::frexpf:
+  case LibFunc::frexpl:
+  case LibFunc::fstatvfs:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::fgets:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 3);
+  case LibFunc::fread:
+  case LibFunc::fwrite:
+    if (FTy->getNumParams() != 4 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(3)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 4);
+  case LibFunc::fputs:
+  case LibFunc::fscanf:
+  case LibFunc::fprintf:
+  case LibFunc::fgetpos:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::getc:
+  case LibFunc::getlogin_r:
+  case LibFunc::getc_unlocked:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::getenv:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setOnlyReadsMemory(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::gets:
+  case LibFunc::getchar:
+    setDoesNotThrow(F);
+    break;
+  case LibFunc::getitimer:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::getpwnam:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::ungetc:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::uname:
+  case LibFunc::unlink:
+  case LibFunc::unsetenv:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::utime:
+  case LibFunc::utimes:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::putc:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::puts:
+  case LibFunc::printf:
+  case LibFunc::perror:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::pread:
+  case LibFunc::pwrite:
+    if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // May throw; these are valid pthread cancellation points.
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::putchar:
+    setDoesNotThrow(F);
+    break;
+  case LibFunc::popen:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::pclose:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::vscanf:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::vsscanf:
+  case LibFunc::vfscanf:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::valloc:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::vprintf:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::vfprintf:
+  case LibFunc::vsprintf:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::vsnprintf:
+    if (FTy->getNumParams() != 4 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 3);
+    break;
+  case LibFunc::open:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    // May throw; "open" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::opendir:
+    if (FTy->getNumParams() != 1 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::tmpfile:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::times:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::htonl:
+  case LibFunc::htons:
+  case LibFunc::ntohl:
+  case LibFunc::ntohs:
+    setDoesNotThrow(F);
+    setDoesNotAccessMemory(F);
+    break;
+  case LibFunc::lstat:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::lchown:
+    if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::qsort:
+    if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
+      return false;
+    // May throw; places call through function pointer.
+    setDoesNotCapture(F, 4);
+    break;
+  case LibFunc::dunder_strdup:
+  case LibFunc::dunder_strndup:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::dunder_strtok_r:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::under_IO_getc:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::under_IO_putc:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::dunder_isoc99_scanf:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::stat64:
+  case LibFunc::lstat64:
+  case LibFunc::statvfs64:
+  case LibFunc::dunder_isoc99_sscanf:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::fopen64:
+    if (FTy->getNumParams() != 2 ||
+        !FTy->getReturnType()->isPointerTy() ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::fseeko64:
+  case LibFunc::ftello64:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::tmpfile64:
+    if (!FTy->getReturnType()->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotAlias(F, 0);
+    break;
+  case LibFunc::fstat64:
+  case LibFunc::fstatvfs64:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
+  case LibFunc::open64:
+    if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    // May throw; "open" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 1);
+    break;
+  default:
+    // Didn't mark any attributes.
+    return false;
+  }
+
+  return true;
+}
+
+/// annotateLibraryCalls - Adds attributes to well-known standard library
+/// call declarations.
+bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) {
+  bool MadeChange = false;
+
+  // Check each function in turn annotating well-known library function
+  // declarations with attributes.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+
+    if (F != 0 && F->isDeclaration())
+      MadeChange |= inferPrototypeAttributes(*F);
+  }
+
+  return MadeChange;
+}
+
+bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
+  AA = &getAnalysis<AliasAnalysis>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+
+  bool Changed = annotateLibraryCalls(SCC);
+  Changed |= AddReadAttrs(SCC);
+  Changed |= AddNoCaptureAttrs(SCC);
+  Changed |= AddNoAliasAttrs(SCC);
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
new file mode 100644
index 000000000000..dc99492990a3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -0,0 +1,211 @@
+//===-- GlobalDCE.cpp - DCE unreachable internal functions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate unreachable internal globals from the
+// program.  It uses an aggressive algorithm, searching out globals that are
+// known to be alive.  After it finds all of the globals which are needed, it
+// deletes whatever is left over.  This allows it to delete recursive chunks of
+// the program which are unreachable.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "globaldce"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+STATISTIC(NumAliases  , "Number of global aliases removed");
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+namespace {
+  struct GlobalDCE : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    GlobalDCE() : ModulePass(ID) {
+      initializeGlobalDCEPass(*PassRegistry::getPassRegistry());
+    }
+
+    // run - Do the GlobalDCE pass on the specified module, optionally updating
+    // the specified callgraph to reflect the changes.
+    //
+    bool runOnModule(Module &M);
+
+  private:
+    SmallPtrSet<GlobalValue*, 32> AliveGlobals;
+
+    /// GlobalIsNeeded - mark the specific global value as needed, and
+    /// recursively mark anything that it uses as also needed.
+    void GlobalIsNeeded(GlobalValue *GV);
+    void MarkUsedGlobalsAsNeeded(Constant *C);
+
+    bool RemoveUnusedGlobalValue(GlobalValue &GV);
+  };
+}
+
+char GlobalDCE::ID = 0;
+INITIALIZE_PASS(GlobalDCE, "globaldce",
+                "Dead Global Elimination", false, false)
+
+ModulePass *llvm::createGlobalDCEPass() { return new GlobalDCE(); }
+
+bool GlobalDCE::runOnModule(Module &M) {
+  bool Changed = false;
+  
+  // Loop over the module, adding globals which are obviously necessary.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    Changed |= RemoveUnusedGlobalValue(*I);
+    // Functions with external linkage are needed if they have a body
+    if (!I->isDiscardableIfUnused() &&
+        !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
+      GlobalIsNeeded(I);
+  }
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    Changed |= RemoveUnusedGlobalValue(*I);
+    // Externally visible & appending globals are needed, if they have an
+    // initializer.
+    if (!I->isDiscardableIfUnused() &&
+        !I->isDeclaration() && !I->hasAvailableExternallyLinkage())
+      GlobalIsNeeded(I);
+  }
+
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+       I != E; ++I) {
+    Changed |= RemoveUnusedGlobalValue(*I);
+    // Externally visible aliases are needed.
+    if (!I->isDiscardableIfUnused())
+      GlobalIsNeeded(I);
+  }
+
+  // Now that all globals which are needed are in the AliveGlobals set, we loop
+  // through the program, deleting those which are not alive.
+  //
+
+  // The first pass is to drop initializers of global variables which are dead.
+  std::vector<GlobalVariable*> DeadGlobalVars;   // Keep track of dead globals
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    if (!AliveGlobals.count(I)) {
+      DeadGlobalVars.push_back(I);         // Keep track of dead globals
+      I->setInitializer(0);
+    }
+
+  // The second pass drops the bodies of functions which are dead...
+  std::vector<Function*> DeadFunctions;
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!AliveGlobals.count(I)) {
+      DeadFunctions.push_back(I);         // Keep track of dead globals
+      if (!I->isDeclaration())
+        I->deleteBody();
+    }
+
+  // The third pass drops targets of aliases which are dead...
+  std::vector<GlobalAlias*> DeadAliases;
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;
+       ++I)
+    if (!AliveGlobals.count(I)) {
+      DeadAliases.push_back(I);
+      I->setAliasee(0);
+    }
+
+  if (!DeadFunctions.empty()) {
+    // Now that all interferences have been dropped, delete the actual objects
+    // themselves.
+    for (unsigned i = 0, e = DeadFunctions.size(); i != e; ++i) {
+      RemoveUnusedGlobalValue(*DeadFunctions[i]);
+      M.getFunctionList().erase(DeadFunctions[i]);
+    }
+    NumFunctions += DeadFunctions.size();
+    Changed = true;
+  }
+
+  if (!DeadGlobalVars.empty()) {
+    for (unsigned i = 0, e = DeadGlobalVars.size(); i != e; ++i) {
+      RemoveUnusedGlobalValue(*DeadGlobalVars[i]);
+      M.getGlobalList().erase(DeadGlobalVars[i]);
+    }
+    NumVariables += DeadGlobalVars.size();
+    Changed = true;
+  }
+
+  // Now delete any dead aliases.
+  if (!DeadAliases.empty()) {
+    for (unsigned i = 0, e = DeadAliases.size(); i != e; ++i) {
+      RemoveUnusedGlobalValue(*DeadAliases[i]);
+      M.getAliasList().erase(DeadAliases[i]);
+    }
+    NumAliases += DeadAliases.size();
+    Changed = true;
+  }
+
+  // Make sure that all memory is released
+  AliveGlobals.clear();
+
+  return Changed;
+}
+
+/// GlobalIsNeeded - the specific global value as needed, and
+/// recursively mark anything that it uses as also needed.
+void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
+  // If the global is already in the set, no need to reprocess it.
+  if (!AliveGlobals.insert(G))
+    return;
+  
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) {
+    // If this is a global variable, we must make sure to add any global values
+    // referenced by the initializer to the alive set.
+    if (GV->hasInitializer())
+      MarkUsedGlobalsAsNeeded(GV->getInitializer());
+  } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(G)) {
+    // The target of a global alias is needed.
+    MarkUsedGlobalsAsNeeded(GA->getAliasee());
+  } else {
+    // Otherwise this must be a function object.  We have to scan the body of
+    // the function looking for constants and global values which are used as
+    // operands.  Any operands of these types must be processed to ensure that
+    // any globals used will be marked as needed.
+    Function *F = cast<Function>(G);
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U)
+          if (GlobalValue *GV = dyn_cast<GlobalValue>(*U))
+            GlobalIsNeeded(GV);
+          else if (Constant *C = dyn_cast<Constant>(*U))
+            MarkUsedGlobalsAsNeeded(C);
+  }
+}
+
+void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) {
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return GlobalIsNeeded(GV);
+  
+  // Loop over all of the operands of the constant, adding any globals they
+  // use to the list of needed globals.
+  for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I)
+    if (Constant *OpC = dyn_cast<Constant>(*I))
+      MarkUsedGlobalsAsNeeded(OpC);
+}
+
+// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
+// GlobalValue, looking for the constant pointer ref that may be pointing to it.
+// If found, check to see if the constant pointer ref is safe to destroy, and if
+// so, nuke it.  This will reduce the reference count on the global value, which
+// might make it deader.
+//
+bool GlobalDCE::RemoveUnusedGlobalValue(GlobalValue &GV) {
+  if (GV.use_empty()) return false;
+  GV.removeDeadConstantUsers();
+  return GV.use_empty();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
new file mode 100644
index 000000000000..b035a821b4cf
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -0,0 +1,3256 @@
+//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms simple global variables that never have their address
+// taken.  If obviously true, it marks read/write globals as constant, deletes
+// variables only stored to, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "globalopt"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumMarked    , "Number of globals marked constant");
+STATISTIC(NumUnnamed   , "Number of globals marked unnamed_addr");
+STATISTIC(NumSRA       , "Number of aggregate globals broken into scalars");
+STATISTIC(NumHeapSRA   , "Number of heap objects SRA'd");
+STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");
+STATISTIC(NumDeleted   , "Number of globals deleted");
+STATISTIC(NumFnDeleted , "Number of functions deleted");
+STATISTIC(NumGlobUses  , "Number of global uses devirtualized");
+STATISTIC(NumLocalized , "Number of globals localized");
+STATISTIC(NumShrunkToBool  , "Number of global vars shrunk to booleans");
+STATISTIC(NumFastCallFns   , "Number of functions converted to fastcc");
+STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
+STATISTIC(NumNestRemoved   , "Number of nest attributes removed");
+STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
+STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
+STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
+
+namespace {
+  struct GlobalStatus;
+  struct GlobalOpt : public ModulePass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetLibraryInfo>();
+    }
+    static char ID; // Pass identification, replacement for typeid
+    GlobalOpt() : ModulePass(ID) {
+      initializeGlobalOptPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnModule(Module &M);
+
+  private:
+    GlobalVariable *FindGlobalCtors(Module &M);
+    bool OptimizeFunctions(Module &M);
+    bool OptimizeGlobalVars(Module &M);
+    bool OptimizeGlobalAliases(Module &M);
+    bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
+    bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI);
+    bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI,
+                               const SmallPtrSet<const PHINode*, 16> &PHIUsers,
+                               const GlobalStatus &GS);
+    bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
+
+    DataLayout *TD;
+    TargetLibraryInfo *TLI;
+  };
+}
+
+char GlobalOpt::ID = 0;
+INITIALIZE_PASS_BEGIN(GlobalOpt, "globalopt",
+                "Global Variable Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(GlobalOpt, "globalopt",
+                "Global Variable Optimizer", false, false)
+
+ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
+
+namespace {
+
+/// GlobalStatus - As we analyze each global, keep track of some information
+/// about it.  If we find out that the address of the global is taken, none of
+/// this info will be accurate.
+struct GlobalStatus {
+  /// isCompared - True if the global's address is used in a comparison.
+  bool isCompared;
+
+  /// isLoaded - True if the global is ever loaded.  If the global isn't ever
+  /// loaded it can be deleted.
+  bool isLoaded;
+
+  /// StoredType - Keep track of what stores to the global look like.
+  ///
+  enum StoredType {
+    /// NotStored - There is no store to this global.  It can thus be marked
+    /// constant.
+    NotStored,
+
+    /// isInitializerStored - This global is stored to, but the only thing
+    /// stored is the constant it was initialized with.  This is only tracked
+    /// for scalar globals.
+    isInitializerStored,
+
+    /// isStoredOnce - This global is stored to, but only its initializer and
+    /// one other value is ever stored to it.  If this global isStoredOnce, we
+    /// track the value stored to it in StoredOnceValue below.  This is only
+    /// tracked for scalar globals.
+    isStoredOnce,
+
+    /// isStored - This global is stored to by multiple values or something else
+    /// that we cannot track.
+    isStored
+  } StoredType;
+
+  /// StoredOnceValue - If only one value (besides the initializer constant) is
+  /// ever stored to this global, keep track of what value it is.
+  Value *StoredOnceValue;
+
+  /// AccessingFunction/HasMultipleAccessingFunctions - These start out
+  /// null/false.  When the first accessing function is noticed, it is recorded.
+  /// When a second different accessing function is noticed,
+  /// HasMultipleAccessingFunctions is set to true.
+  const Function *AccessingFunction;
+  bool HasMultipleAccessingFunctions;
+
+  /// HasNonInstructionUser - Set to true if this global has a user that is not
+  /// an instruction (e.g. a constant expr or GV initializer).
+  bool HasNonInstructionUser;
+
+  /// AtomicOrdering - Set to the strongest atomic ordering requirement.
+  AtomicOrdering Ordering;
+
+  GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored),
+                   StoredOnceValue(0), AccessingFunction(0),
+                   HasMultipleAccessingFunctions(false),
+                   HasNonInstructionUser(false), Ordering(NotAtomic) {}
+};
+
+}
+
+/// StrongerOrdering - Return the stronger of the two ordering. If the two
+/// orderings are acquire and release, then return AcquireRelease.
+///
+static AtomicOrdering StrongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
+  if (X == Acquire && Y == Release) return AcquireRelease;
+  if (Y == Acquire && X == Release) return AcquireRelease;
+  return (AtomicOrdering)std::max(X, Y);
+}
+
+/// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used
+/// by constants itself.  Note that constants cannot be cyclic, so this test is
+/// pretty easy to implement recursively.
+///
+static bool SafeToDestroyConstant(const Constant *C) {
+  if (isa<GlobalValue>(C)) return false;
+
+  for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
+       ++UI)
+    if (const Constant *CU = dyn_cast<Constant>(*UI)) {
+      if (!SafeToDestroyConstant(CU)) return false;
+    } else
+      return false;
+  return true;
+}
+
+
+/// AnalyzeGlobal - Look at all uses of the global and fill in the GlobalStatus
+/// structure.  If the global has its address taken, return true to indicate we
+/// can't do anything with it.
+///
+static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
+                          SmallPtrSet<const PHINode*, 16> &PHIUsers) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+       ++UI) {
+    const User *U = *UI;
+    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      GS.HasNonInstructionUser = true;
+
+      // If the result of the constantexpr isn't pointer type, then we won't
+      // know to expect it in various places.  Just reject early.
+      if (!isa<PointerType>(CE->getType())) return true;
+
+      if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
+    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
+      if (!GS.HasMultipleAccessingFunctions) {
+        const Function *F = I->getParent()->getParent();
+        if (GS.AccessingFunction == 0)
+          GS.AccessingFunction = F;
+        else if (GS.AccessingFunction != F)
+          GS.HasMultipleAccessingFunctions = true;
+      }
+      if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        GS.isLoaded = true;
+        // Don't hack on volatile loads.
+        if (LI->isVolatile()) return true;
+        GS.Ordering = StrongerOrdering(GS.Ordering, LI->getOrdering());
+      } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Don't allow a store OF the address, only stores TO the address.
+        if (SI->getOperand(0) == V) return true;
+
+        // Don't hack on volatile stores.
+        if (SI->isVolatile()) return true;
+
+        GS.Ordering = StrongerOrdering(GS.Ordering, SI->getOrdering());
+
+        // If this is a direct store to the global (i.e., the global is a scalar
+        // value, not an aggregate), keep more specific information about
+        // stores.
+        if (GS.StoredType != GlobalStatus::isStored) {
+          if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(
+                                                           SI->getOperand(1))) {
+            Value *StoredVal = SI->getOperand(0);
+
+            if (Constant *C = dyn_cast<Constant>(StoredVal)) {
+              if (C->isThreadDependent()) {
+                // The stored value changes between threads; don't track it.
+                return true;
+              }
+            }
+
+            if (StoredVal == GV->getInitializer()) {
+              if (GS.StoredType < GlobalStatus::isInitializerStored)
+                GS.StoredType = GlobalStatus::isInitializerStored;
+            } else if (isa<LoadInst>(StoredVal) &&
+                       cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+              if (GS.StoredType < GlobalStatus::isInitializerStored)
+                GS.StoredType = GlobalStatus::isInitializerStored;
+            } else if (GS.StoredType < GlobalStatus::isStoredOnce) {
+              GS.StoredType = GlobalStatus::isStoredOnce;
+              GS.StoredOnceValue = StoredVal;
+            } else if (GS.StoredType == GlobalStatus::isStoredOnce &&
+                       GS.StoredOnceValue == StoredVal) {
+              // noop.
+            } else {
+              GS.StoredType = GlobalStatus::isStored;
+            }
+          } else {
+            GS.StoredType = GlobalStatus::isStored;
+          }
+        }
+      } else if (isa<BitCastInst>(I)) {
+        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+      } else if (isa<GetElementPtrInst>(I)) {
+        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+      } else if (isa<SelectInst>(I)) {
+        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+      } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
+        // PHI nodes we can check just like select or GEP instructions, but we
+        // have to be careful about infinite recursion.
+        if (PHIUsers.insert(PN))  // Not already visited.
+          if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
+      } else if (isa<CmpInst>(I)) {
+        GS.isCompared = true;
+      } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+        if (MTI->isVolatile()) return true;
+        if (MTI->getArgOperand(0) == V)
+          GS.StoredType = GlobalStatus::isStored;
+        if (MTI->getArgOperand(1) == V)
+          GS.isLoaded = true;
+      } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+        assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
+        if (MSI->isVolatile()) return true;
+        GS.StoredType = GlobalStatus::isStored;
+      } else {
+        return true;  // Any other non-load instruction might take address!
+      }
+    } else if (const Constant *C = dyn_cast<Constant>(U)) {
+      GS.HasNonInstructionUser = true;
+      // We might have a dead and dangling constant hanging off of here.
+      if (!SafeToDestroyConstant(C))
+        return true;
+    } else {
+      GS.HasNonInstructionUser = true;
+      // Otherwise must be some other user.
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker
+/// as a root?  If so, we might not really want to eliminate the stores to it.
+static bool isLeakCheckerRoot(GlobalVariable *GV) {
+  // A global variable is a root if it is a pointer, or could plausibly contain
+  // a pointer.  There are two challenges; one is that we could have a struct
+  // the has an inner member which is a pointer.  We recurse through the type to
+  // detect these (up to a point).  The other is that we may actually be a union
+  // of a pointer and another type, and so our LLVM type is an integer which
+  // gets converted into a pointer, or our type is an [i8 x #] with a pointer
+  // potentially contained here.
+
+  if (GV->hasPrivateLinkage())
+    return false;
+
+  SmallVector<Type *, 4> Types;
+  Types.push_back(cast<PointerType>(GV->getType())->getElementType());
+
+  unsigned Limit = 20;
+  do {
+    Type *Ty = Types.pop_back_val();
+    switch (Ty->getTypeID()) {
+      default: break;
+      case Type::PointerTyID: return true;
+      case Type::ArrayTyID:
+      case Type::VectorTyID: {
+        SequentialType *STy = cast<SequentialType>(Ty);
+        Types.push_back(STy->getElementType());
+        break;
+      }
+      case Type::StructTyID: {
+        StructType *STy = cast<StructType>(Ty);
+        if (STy->isOpaque()) return true;
+        for (StructType::element_iterator I = STy->element_begin(),
+                 E = STy->element_end(); I != E; ++I) {
+          Type *InnerTy = *I;
+          if (isa<PointerType>(InnerTy)) return true;
+          if (isa<CompositeType>(InnerTy))
+            Types.push_back(InnerTy);
+        }
+        break;
+      }
+    }
+    if (--Limit == 0) return true;
+  } while (!Types.empty());
+  return false;
+}
+
+/// Given a value that is stored to a global but never read, determine whether
+/// it's safe to remove the store and the chain of computation that feeds the
+/// store.
+static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) {
+  do {
+    if (isa<Constant>(V))
+      return true;
+    if (!V->hasOneUse())
+      return false;
+    if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
+        isa<GlobalValue>(V))
+      return false;
+    if (isAllocationFn(V, TLI))
+      return true;
+
+    Instruction *I = cast<Instruction>(V);
+    if (I->mayHaveSideEffects())
+      return false;
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+      if (!GEP->hasAllConstantIndices())
+        return false;
+    } else if (I->getNumOperands() != 1) {
+      return false;
+    }
+
+    V = I->getOperand(0);
+  } while (1);
+}
+
+/// CleanupPointerRootUsers - This GV is a pointer root.  Loop over all users
+/// of the global and clean up any that obviously don't assign the global a
+/// value that isn't dynamically allocated.
+///
+static bool CleanupPointerRootUsers(GlobalVariable *GV,
+                                    const TargetLibraryInfo *TLI) {
+  // A brief explanation of leak checkers.  The goal is to find bugs where
+  // pointers are forgotten, causing an accumulating growth in memory
+  // usage over time.  The common strategy for leak checkers is to whitelist the
+  // memory pointed to by globals at exit.  This is popular because it also
+  // solves another problem where the main thread of a C++ program may shut down
+  // before other threads that are still expecting to use those globals.  To
+  // handle that case, we expect the program may create a singleton and never
+  // destroy it.
+
+  bool Changed = false;
+
+  // If Dead[n].first is the only use of a malloc result, we can delete its
+  // chain of computation and the store to the global in Dead[n].second.
+  SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;
+
+  // Constants can't be pointers to dynamically allocated memory.
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E;) {
+    User *U = *UI++;
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      Value *V = SI->getValueOperand();
+      if (isa<Constant>(V)) {
+        Changed = true;
+        SI->eraseFromParent();
+      } else if (Instruction *I = dyn_cast<Instruction>(V)) {
+        if (I->hasOneUse())
+          Dead.push_back(std::make_pair(I, SI));
+      }
+    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) {
+      if (isa<Constant>(MSI->getValue())) {
+        Changed = true;
+        MSI->eraseFromParent();
+      } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) {
+        if (I->hasOneUse())
+          Dead.push_back(std::make_pair(I, MSI));
+      }
+    } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) {
+      GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource());
+      if (MemSrc && MemSrc->isConstant()) {
+        Changed = true;
+        MTI->eraseFromParent();
+      } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) {
+        if (I->hasOneUse())
+          Dead.push_back(std::make_pair(I, MTI));
+      }
+    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      if (CE->use_empty()) {
+        CE->destroyConstant();
+        Changed = true;
+      }
+    } else if (Constant *C = dyn_cast<Constant>(U)) {
+      if (SafeToDestroyConstant(C)) {
+        C->destroyConstant();
+        // This could have invalidated UI, start over from scratch.
+        Dead.clear();
+        CleanupPointerRootUsers(GV, TLI);
+        return true;
+      }
+    }
+  }
+
+  for (int i = 0, e = Dead.size(); i != e; ++i) {
+    if (IsSafeComputationToRemove(Dead[i].first, TLI)) {
+      Dead[i].second->eraseFromParent();
+      Instruction *I = Dead[i].first;
+      do {
+        if (isAllocationFn(I, TLI))
+          break;
+        Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
+        if (!J)
+          break;
+        I->eraseFromParent();
+        I = J;
+      } while (1);
+      I->eraseFromParent();
+    }
+  }
+
+  return Changed;
+}
+
+/// CleanupConstantGlobalUsers - We just marked GV constant.  Loop over all
+/// users of the global, cleaning up the obvious ones.  This is largely just a
+/// quick scan over the use list to clean up the easy and obvious cruft.  This
+/// returns true if it made a change.
+static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
+                                       DataLayout *TD, TargetLibraryInfo *TLI) {
+  bool Changed = false;
+  SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end());
+  while (!WorkList.empty()) {
+    User *U = WorkList.pop_back_val();
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (Init) {
+        // Replace the load with the initializer.
+        LI->replaceAllUsesWith(Init);
+        LI->eraseFromParent();
+        Changed = true;
+      }
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // Store must be unreachable or storing Init into the global.
+      SI->eraseFromParent();
+      Changed = true;
+    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      if (CE->getOpcode() == Instruction::GetElementPtr) {
+        Constant *SubInit = 0;
+        if (Init)
+          SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+        Changed |= CleanupConstantGlobalUsers(CE, SubInit, TD, TLI);
+      } else if (CE->getOpcode() == Instruction::BitCast &&
+                 CE->getType()->isPointerTy()) {
+        // Pointer cast, delete any stores and memsets to the global.
+        Changed |= CleanupConstantGlobalUsers(CE, 0, TD, TLI);
+      }
+
+      if (CE->use_empty()) {
+        CE->destroyConstant();
+        Changed = true;
+      }
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      // Do not transform "gepinst (gep constexpr (GV))" here, because forming
+      // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
+      // and will invalidate our notion of what Init is.
+      Constant *SubInit = 0;
+      if (!isa<ConstantExpr>(GEP->getOperand(0))) {
+        ConstantExpr *CE =
+          dyn_cast_or_null<ConstantExpr>(ConstantFoldInstruction(GEP, TD, TLI));
+        if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
+          SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+
+        // If the initializer is an all-null value and we have an inbounds GEP,
+        // we already know what the result of any load from that GEP is.
+        // TODO: Handle splats.
+        if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
+          SubInit = Constant::getNullValue(GEP->getType()->getElementType());
+      }
+      Changed |= CleanupConstantGlobalUsers(GEP, SubInit, TD, TLI);
+
+      if (GEP->use_empty()) {
+        GEP->eraseFromParent();
+        Changed = true;
+      }
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
+      if (MI->getRawDest() == V) {
+        MI->eraseFromParent();
+        Changed = true;
+      }
+
+    } else if (Constant *C = dyn_cast<Constant>(U)) {
+      // If we have a chain of dead constantexprs or other things dangling from
+      // us, and if they are all dead, nuke them without remorse.
+      if (SafeToDestroyConstant(C)) {
+        C->destroyConstant();
+        CleanupConstantGlobalUsers(V, Init, TD, TLI);
+        return true;
+      }
+    }
+  }
+  return Changed;
+}
+
+/// isSafeSROAElementUse - Return true if the specified instruction is a safe
+/// user of a derived expression from a global that we want to SROA.
+static bool isSafeSROAElementUse(Value *V) {
+  // We might have a dead and dangling constant hanging off of here.
+  if (Constant *C = dyn_cast<Constant>(V))
+    return SafeToDestroyConstant(C);
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // Loads are ok.
+  if (isa<LoadInst>(I)) return true;
+
+  // Stores *to* the pointer are ok.
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getOperand(0) != V;
+
+  // Otherwise, it must be a GEP.
+  GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
+  if (GEPI == 0) return false;
+
+  if (GEPI->getNumOperands() < 3 || !isa<Constant>(GEPI->getOperand(1)) ||
+      !cast<Constant>(GEPI->getOperand(1))->isNullValue())
+    return false;
+
+  for (Value::use_iterator I = GEPI->use_begin(), E = GEPI->use_end();
+       I != E; ++I)
+    if (!isSafeSROAElementUse(*I))
+      return false;
+  return true;
+}
+
+
+/// IsUserOfGlobalSafeForSRA - U is a direct user of the specified global value.
+/// Look at it and its uses and decide whether it is safe to SROA this global.
+///
+static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
+  // The user of the global must be a GEP Inst or a ConstantExpr GEP.
+  if (!isa<GetElementPtrInst>(U) &&
+      (!isa<ConstantExpr>(U) ||
+       cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
+    return false;
+
+  // Check to see if this ConstantExpr GEP is SRA'able.  In particular, we
+  // don't like < 3 operand CE's, and we don't like non-constant integer
+  // indices.  This enforces that all uses are 'gep GV, 0, C, ...' for some
+  // value of C.
+  if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
+      !cast<Constant>(U->getOperand(1))->isNullValue() ||
+      !isa<ConstantInt>(U->getOperand(2)))
+    return false;
+
+  gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
+  ++GEPI;  // Skip over the pointer index.
+
+  // If this is a use of an array allocation, do a bit more checking for sanity.
+  if (ArrayType *AT = dyn_cast<ArrayType>(*GEPI)) {
+    uint64_t NumElements = AT->getNumElements();
+    ConstantInt *Idx = cast<ConstantInt>(U->getOperand(2));
+
+    // Check to make sure that index falls within the array.  If not,
+    // something funny is going on, so we won't do the optimization.
+    //
+    if (Idx->getZExtValue() >= NumElements)
+      return false;
+
+    // We cannot scalar repl this level of the array unless any array
+    // sub-indices are in-range constants.  In particular, consider:
+    // A[0][i].  We cannot know that the user isn't doing invalid things like
+    // allowing i to index an out-of-range subscript that accesses A[1].
+    //
+    // Scalar replacing *just* the outer index of the array is probably not
+    // going to be a win anyway, so just give up.
+    for (++GEPI; // Skip array index.
+         GEPI != E;
+         ++GEPI) {
+      uint64_t NumElements;
+      if (ArrayType *SubArrayTy = dyn_cast<ArrayType>(*GEPI))
+        NumElements = SubArrayTy->getNumElements();
+      else if (VectorType *SubVectorTy = dyn_cast<VectorType>(*GEPI))
+        NumElements = SubVectorTy->getNumElements();
+      else {
+        assert((*GEPI)->isStructTy() &&
+               "Indexed GEP type is not array, vector, or struct!");
+        continue;
+      }
+
+      ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
+      if (!IdxVal || IdxVal->getZExtValue() >= NumElements)
+        return false;
+    }
+  }
+
+  for (Value::use_iterator I = U->use_begin(), E = U->use_end(); I != E; ++I)
+    if (!isSafeSROAElementUse(*I))
+      return false;
+  return true;
+}
+
+/// GlobalUsersSafeToSRA - Look at all uses of the global and decide whether it
+/// is safe for us to perform this transformation.
+///
+static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E; ++UI) {
+    if (!IsUserOfGlobalSafeForSRA(*UI, GV))
+      return false;
+  }
+  return true;
+}
+
+
+/// SRAGlobal - Perform scalar replacement of aggregates on the specified global
+/// variable.  This opens the door for other optimizations by exposing the
+/// behavior of the program in a more fine-grained way.  We have determined that
+/// this transformation is safe already.  We return the first global variable we
+/// insert so that the caller can reprocess it.
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
+  // Make sure this global only has simple uses that we can SRA.
+  if (!GlobalUsersSafeToSRA(GV))
+    return 0;
+
+  assert(GV->hasLocalLinkage() && !GV->isConstant());
+  Constant *Init = GV->getInitializer();
+  Type *Ty = Init->getType();
+
+  std::vector<GlobalVariable*> NewGlobals;
+  Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
+
+  // Get the alignment of the global, either explicit or target-specific.
+  unsigned StartAlignment = GV->getAlignment();
+  if (StartAlignment == 0)
+    StartAlignment = TD.getABITypeAlignment(GV->getType());
+
+  if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    NewGlobals.reserve(STy->getNumElements());
+    const StructLayout &Layout = *TD.getStructLayout(STy);
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      Constant *In = Init->getAggregateElement(i);
+      assert(In && "Couldn't get element of initializer?");
+      GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false,
+                                               GlobalVariable::InternalLinkage,
+                                               In, GV->getName()+"."+Twine(i),
+                                               GV->getThreadLocalMode(),
+                                              GV->getType()->getAddressSpace());
+      Globals.insert(GV, NGV);
+      NewGlobals.push_back(NGV);
+
+      // Calculate the known alignment of the field.  If the original aggregate
+      // had 256 byte alignment for example, something might depend on that:
+      // propagate info to each field.
+      uint64_t FieldOffset = Layout.getElementOffset(i);
+      unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset);
+      if (NewAlign > TD.getABITypeAlignment(STy->getElementType(i)))
+        NGV->setAlignment(NewAlign);
+    }
+  } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
+    unsigned NumElements = 0;
+    if (ArrayType *ATy = dyn_cast<ArrayType>(STy))
+      NumElements = ATy->getNumElements();
+    else
+      NumElements = cast<VectorType>(STy)->getNumElements();
+
+    if (NumElements > 16 && GV->hasNUsesOrMore(16))
+      return 0; // It's not worth it.
+    NewGlobals.reserve(NumElements);
+
+    uint64_t EltSize = TD.getTypeAllocSize(STy->getElementType());
+    unsigned EltAlign = TD.getABITypeAlignment(STy->getElementType());
+    for (unsigned i = 0, e = NumElements; i != e; ++i) {
+      Constant *In = Init->getAggregateElement(i);
+      assert(In && "Couldn't get element of initializer?");
+
+      GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false,
+                                               GlobalVariable::InternalLinkage,
+                                               In, GV->getName()+"."+Twine(i),
+                                               GV->getThreadLocalMode(),
+                                              GV->getType()->getAddressSpace());
+      Globals.insert(GV, NGV);
+      NewGlobals.push_back(NGV);
+
+      // Calculate the known alignment of the field.  If the original aggregate
+      // had 256 byte alignment for example, something might depend on that:
+      // propagate info to each field.
+      unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i);
+      if (NewAlign > EltAlign)
+        NGV->setAlignment(NewAlign);
+    }
+  }
+
+  if (NewGlobals.empty())
+    return 0;
+
+  DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV);
+
+  Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
+
+  // Loop over all of the uses of the global, replacing the constantexpr geps,
+  // with smaller constantexpr geps or direct references.
+  while (!GV->use_empty()) {
+    User *GEP = GV->use_back();
+    assert(((isa<ConstantExpr>(GEP) &&
+             cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
+            isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
+
+    // Ignore the 1th operand, which has to be zero or else the program is quite
+    // broken (undefined).  Get the 2nd operand, which is the structure or array
+    // index.
+    unsigned Val = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+    if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access.
+
+    Value *NewPtr = NewGlobals[Val];
+
+    // Form a shorter GEP if needed.
+    if (GEP->getNumOperands() > 3) {
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
+        SmallVector<Constant*, 8> Idxs;
+        Idxs.push_back(NullInt);
+        for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
+          Idxs.push_back(CE->getOperand(i));
+        NewPtr = ConstantExpr::getGetElementPtr(cast<Constant>(NewPtr), Idxs);
+      } else {
+        GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
+        SmallVector<Value*, 8> Idxs;
+        Idxs.push_back(NullInt);
+        for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
+          Idxs.push_back(GEPI->getOperand(i));
+        NewPtr = GetElementPtrInst::Create(NewPtr, Idxs,
+                                           GEPI->getName()+"."+Twine(Val),GEPI);
+      }
+    }
+    GEP->replaceAllUsesWith(NewPtr);
+
+    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
+      GEPI->eraseFromParent();
+    else
+      cast<ConstantExpr>(GEP)->destroyConstant();
+  }
+
+  // Delete the old global, now that it is dead.
+  Globals.erase(GV);
+  ++NumSRA;
+
+  // Loop over the new globals array deleting any globals that are obviously
+  // dead.  This can arise due to scalarization of a structure or an array that
+  // has elements that are dead.
+  unsigned FirstGlobal = 0;
+  for (unsigned i = 0, e = NewGlobals.size(); i != e; ++i)
+    if (NewGlobals[i]->use_empty()) {
+      Globals.erase(NewGlobals[i]);
+      if (FirstGlobal == i) ++FirstGlobal;
+    }
+
+  return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : 0;
+}
+
+/// AllUsesOfValueWillTrapIfNull - Return true if all users of the specified
+/// value will trap if the value is dynamically null.  PHIs keeps track of any
+/// phi nodes we've seen to avoid reprocessing them.
+static bool AllUsesOfValueWillTrapIfNull(const Value *V,
+                                         SmallPtrSet<const PHINode*, 8> &PHIs) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+       ++UI) {
+    const User *U = *UI;
+
+    if (isa<LoadInst>(U)) {
+      // Will trap.
+    } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      if (SI->getOperand(0) == V) {
+        //cerr << "NONTRAPPING USE: " << *U;
+        return false;  // Storing the value.
+      }
+    } else if (const CallInst *CI = dyn_cast<CallInst>(U)) {
+      if (CI->getCalledValue() != V) {
+        //cerr << "NONTRAPPING USE: " << *U;
+        return false;  // Not calling the ptr
+      }
+    } else if (const InvokeInst *II = dyn_cast<InvokeInst>(U)) {
+      if (II->getCalledValue() != V) {
+        //cerr << "NONTRAPPING USE: " << *U;
+        return false;  // Not calling the ptr
+      }
+    } else if (const BitCastInst *CI = dyn_cast<BitCastInst>(U)) {
+      if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false;
+    } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+      if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false;
+    } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
+      // If we've already seen this phi node, ignore it, it has already been
+      // checked.
+      if (PHIs.insert(PN) && !AllUsesOfValueWillTrapIfNull(PN, PHIs))
+        return false;
+    } else if (isa<ICmpInst>(U) &&
+               isa<ConstantPointerNull>(UI->getOperand(1))) {
+      // Ignore icmp X, null
+    } else {
+      //cerr << "NONTRAPPING USE: " << *U;
+      return false;
+    }
+  }
+  return true;
+}
+
+/// AllUsesOfLoadedValueWillTrapIfNull - Return true if all uses of any loads
+/// from GV will trap if the loaded value is null.  Note that this also permits
+/// comparisons of the loaded value against null, as a special case.
+static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
+  for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E; ++UI) {
+    const User *U = *UI;
+
+    if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      SmallPtrSet<const PHINode*, 8> PHIs;
+      if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
+        return false;
+    } else if (isa<StoreInst>(U)) {
+      // Ignore stores to the global.
+    } else {
+      // We don't know or understand this user, bail out.
+      //cerr << "UNKNOWN USER OF GLOBAL!: " << *U;
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
+  bool Changed = false;
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ) {
+    Instruction *I = cast<Instruction>(*UI++);
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      LI->setOperand(0, NewV);
+      Changed = true;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      if (SI->getOperand(1) == V) {
+        SI->setOperand(1, NewV);
+        Changed = true;
+      }
+    } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+      CallSite CS(I);
+      if (CS.getCalledValue() == V) {
+        // Calling through the pointer!  Turn into a direct call, but be careful
+        // that the pointer is not also being passed as an argument.
+        CS.setCalledFunction(NewV);
+        Changed = true;
+        bool PassedAsArg = false;
+        for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+          if (CS.getArgument(i) == V) {
+            PassedAsArg = true;
+            CS.setArgument(i, NewV);
+          }
+
+        if (PassedAsArg) {
+          // Being passed as an argument also.  Be careful to not invalidate UI!
+          UI = V->use_begin();
+        }
+      }
+    } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+      Changed |= OptimizeAwayTrappingUsesOfValue(CI,
+                                ConstantExpr::getCast(CI->getOpcode(),
+                                                      NewV, CI->getType()));
+      if (CI->use_empty()) {
+        Changed = true;
+        CI->eraseFromParent();
+      }
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+      // Should handle GEP here.
+      SmallVector<Constant*, 8> Idxs;
+      Idxs.reserve(GEPI->getNumOperands()-1);
+      for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
+           i != e; ++i)
+        if (Constant *C = dyn_cast<Constant>(*i))
+          Idxs.push_back(C);
+        else
+          break;
+      if (Idxs.size() == GEPI->getNumOperands()-1)
+        Changed |= OptimizeAwayTrappingUsesOfValue(GEPI,
+                          ConstantExpr::getGetElementPtr(NewV, Idxs));
+      if (GEPI->use_empty()) {
+        Changed = true;
+        GEPI->eraseFromParent();
+      }
+    }
+  }
+
+  return Changed;
+}
+
+
+/// OptimizeAwayTrappingUsesOfLoads - The specified global has only one non-null
+/// value stored into it.  If there are uses of the loaded value that would trap
+/// if the loaded value is dynamically null, then we know that they cannot be
+/// reachable with a null optimize away the load.
+static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
+                                            DataLayout *TD,
+                                            TargetLibraryInfo *TLI) {
+  bool Changed = false;
+
+  // Keep track of whether we are able to remove all the uses of the global
+  // other than the store that defines it.
+  bool AllNonStoreUsesGone = true;
+
+  // Replace all uses of loads with uses of uses of the stored value.
+  for (Value::use_iterator GUI = GV->use_begin(), E = GV->use_end(); GUI != E;){
+    User *GlobalUser = *GUI++;
+    if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
+      Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
+      // If we were able to delete all uses of the loads
+      if (LI->use_empty()) {
+        LI->eraseFromParent();
+        Changed = true;
+      } else {
+        AllNonStoreUsesGone = false;
+      }
+    } else if (isa<StoreInst>(GlobalUser)) {
+      // Ignore the store that stores "LV" to the global.
+      assert(GlobalUser->getOperand(1) == GV &&
+             "Must be storing *to* the global");
+    } else {
+      AllNonStoreUsesGone = false;
+
+      // If we get here we could have other crazy uses that are transitively
+      // loaded.
+      assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
+              isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser) ||
+              isa<BitCastInst>(GlobalUser) ||
+              isa<GetElementPtrInst>(GlobalUser)) &&
+             "Only expect load and stores!");
+    }
+  }
+
+  if (Changed) {
+    DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV);
+    ++NumGlobUses;
+  }
+
+  // If we nuked all of the loads, then none of the stores are needed either,
+  // nor is the global.
+  if (AllNonStoreUsesGone) {
+    if (isLeakCheckerRoot(GV)) {
+      Changed |= CleanupPointerRootUsers(GV, TLI);
+    } else {
+      Changed = true;
+      CleanupConstantGlobalUsers(GV, 0, TD, TLI);
+    }
+    if (GV->use_empty()) {
+      DEBUG(dbgs() << "  *** GLOBAL NOW DEAD!\n");
+      Changed = true;
+      GV->eraseFromParent();
+      ++NumDeleted;
+    }
+  }
+  return Changed;
+}
+
+/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the
+/// instructions that are foldable.
+static void ConstantPropUsersOf(Value *V,
+                                DataLayout *TD, TargetLibraryInfo *TLI) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; )
+    if (Instruction *I = dyn_cast<Instruction>(*UI++))
+      if (Constant *NewC = ConstantFoldInstruction(I, TD, TLI)) {
+        I->replaceAllUsesWith(NewC);
+
+        // Advance UI to the next non-I use to avoid invalidating it!
+        // Instructions could multiply use V.
+        while (UI != E && *UI == I)
+          ++UI;
+        I->eraseFromParent();
+      }
+}
+
+/// OptimizeGlobalAddressOfMalloc - This function takes the specified global
+/// variable, and transforms the program as if it always contained the result of
+/// the specified malloc.  Because it is always the result of the specified
+/// malloc, there is no reason to actually DO the malloc.  Instead, turn the
+/// malloc into a global, and any loads of GV as uses of the new global.
+static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
+                                                     CallInst *CI,
+                                                     Type *AllocTy,
+                                                     ConstantInt *NElements,
+                                                     DataLayout *TD,
+                                                     TargetLibraryInfo *TLI) {
+  DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << "  CALL = " << *CI << '\n');
+
+  Type *GlobalType;
+  if (NElements->getZExtValue() == 1)
+    GlobalType = AllocTy;
+  else
+    // If we have an array allocation, the global variable is of an array.
+    GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
+
+  // Create the new global variable.  The contents of the malloc'd memory is
+  // undefined, so initialize with an undef value.
+  GlobalVariable *NewGV = new GlobalVariable(*GV->getParent(),
+                                             GlobalType, false,
+                                             GlobalValue::InternalLinkage,
+                                             UndefValue::get(GlobalType),
+                                             GV->getName()+".body",
+                                             GV,
+                                             GV->getThreadLocalMode());
+
+  // If there are bitcast users of the malloc (which is typical, usually we have
+  // a malloc + bitcast) then replace them with uses of the new global.  Update
+  // other users to use the global as well.
+  BitCastInst *TheBC = 0;
+  while (!CI->use_empty()) {
+    Instruction *User = cast<Instruction>(CI->use_back());
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+      if (BCI->getType() == NewGV->getType()) {
+        BCI->replaceAllUsesWith(NewGV);
+        BCI->eraseFromParent();
+      } else {
+        BCI->setOperand(0, NewGV);
+      }
+    } else {
+      if (TheBC == 0)
+        TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI);
+      User->replaceUsesOfWith(CI, TheBC);
+    }
+  }
+
+  Constant *RepValue = NewGV;
+  if (NewGV->getType() != GV->getType()->getElementType())
+    RepValue = ConstantExpr::getBitCast(RepValue,
+                                        GV->getType()->getElementType());
+
+  // If there is a comparison against null, we will insert a global bool to
+  // keep track of whether the global was initialized yet or not.
+  GlobalVariable *InitBool =
+    new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
+                       GlobalValue::InternalLinkage,
+                       ConstantInt::getFalse(GV->getContext()),
+                       GV->getName()+".init", GV->getThreadLocalMode());
+  bool InitBoolUsed = false;
+
+  // Loop over all uses of GV, processing them in turn.
+  while (!GV->use_empty()) {
+    if (StoreInst *SI = dyn_cast<StoreInst>(GV->use_back())) {
+      // The global is initialized when the store to it occurs.
+      new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0,
+                    SI->getOrdering(), SI->getSynchScope(), SI);
+      SI->eraseFromParent();
+      continue;
+    }
+
+    LoadInst *LI = cast<LoadInst>(GV->use_back());
+    while (!LI->use_empty()) {
+      Use &LoadUse = LI->use_begin().getUse();
+      if (!isa<ICmpInst>(LoadUse.getUser())) {
+        LoadUse = RepValue;
+        continue;
+      }
+
+      ICmpInst *ICI = cast<ICmpInst>(LoadUse.getUser());
+      // Replace the cmp X, 0 with a use of the bool value.
+      // Sink the load to where the compare was, if atomic rules allow us to.
+      Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", false, 0,
+                               LI->getOrdering(), LI->getSynchScope(),
+                               LI->isUnordered() ? (Instruction*)ICI : LI);
+      InitBoolUsed = true;
+      switch (ICI->getPredicate()) {
+      default: llvm_unreachable("Unknown ICmp Predicate!");
+      case ICmpInst::ICMP_ULT:
+      case ICmpInst::ICMP_SLT:   // X < null -> always false
+        LV = ConstantInt::getFalse(GV->getContext());
+        break;
+      case ICmpInst::ICMP_ULE:
+      case ICmpInst::ICMP_SLE:
+      case ICmpInst::ICMP_EQ:
+        LV = BinaryOperator::CreateNot(LV, "notinit", ICI);
+        break;
+      case ICmpInst::ICMP_NE:
+      case ICmpInst::ICMP_UGE:
+      case ICmpInst::ICMP_SGE:
+      case ICmpInst::ICMP_UGT:
+      case ICmpInst::ICMP_SGT:
+        break;  // no change.
+      }
+      ICI->replaceAllUsesWith(LV);
+      ICI->eraseFromParent();
+    }
+    LI->eraseFromParent();
+  }
+
+  // If the initialization boolean was used, insert it, otherwise delete it.
+  if (!InitBoolUsed) {
+    while (!InitBool->use_empty())  // Delete initializations
+      cast<StoreInst>(InitBool->use_back())->eraseFromParent();
+    delete InitBool;
+  } else
+    GV->getParent()->getGlobalList().insert(GV, InitBool);
+
+  // Now the GV is dead, nuke it and the malloc..
+  GV->eraseFromParent();
+  CI->eraseFromParent();
+
+  // To further other optimizations, loop over all users of NewGV and try to
+  // constant prop them.  This will promote GEP instructions with constant
+  // indices into GEP constant-exprs, which will allow global-opt to hack on it.
+  ConstantPropUsersOf(NewGV, TD, TLI);
+  if (RepValue != NewGV)
+    ConstantPropUsersOf(RepValue, TD, TLI);
+
+  return NewGV;
+}
+
+/// ValueIsOnlyUsedLocallyOrStoredToOneGlobal - Scan the use-list of V checking
+/// to make sure that there are no complex uses of V.  We permit simple things
+/// like dereferencing the pointer, but not storing through the address, unless
+/// it is to the specified global.
+static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
+                                                      const GlobalVariable *GV,
+                                         SmallPtrSet<const PHINode*, 8> &PHIs) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    const Instruction *Inst = cast<Instruction>(*UI);
+
+    if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
+      continue; // Fine, ignore.
+    }
+
+    if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
+        return false;  // Storing the pointer itself... bad.
+      continue; // Otherwise, storing through it, or storing into GV... fine.
+    }
+
+    // Must index into the array and into the struct.
+    if (isa<GetElementPtrInst>(Inst) && Inst->getNumOperands() >= 3) {
+      if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
+        return false;
+      continue;
+    }
+
+    if (const PHINode *PN = dyn_cast<PHINode>(Inst)) {
+      // PHIs are ok if all uses are ok.  Don't infinitely recurse through PHI
+      // cycles.
+      if (PHIs.insert(PN))
+        if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
+          return false;
+      continue;
+    }
+
+    if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
+      if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
+        return false;
+      continue;
+    }
+
+    return false;
+  }
+  return true;
+}
+
+/// ReplaceUsesOfMallocWithGlobal - The Alloc pointer is stored into GV
+/// somewhere.  Transform all uses of the allocation into loads from the
+/// global and uses of the resultant pointer.  Further, delete the store into
+/// GV.  This assumes that these value pass the
+/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
+static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
+                                          GlobalVariable *GV) {
+  while (!Alloc->use_empty()) {
+    Instruction *U = cast<Instruction>(*Alloc->use_begin());
+    Instruction *InsertPt = U;
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // If this is the store of the allocation into the global, remove it.
+      if (SI->getOperand(1) == GV) {
+        SI->eraseFromParent();
+        continue;
+      }
+    } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
+      // Insert the load in the corresponding predecessor, not right before the
+      // PHI.
+      InsertPt = PN->getIncomingBlock(Alloc->use_begin())->getTerminator();
+    } else if (isa<BitCastInst>(U)) {
+      // Must be bitcast between the malloc and store to initialize the global.
+      ReplaceUsesOfMallocWithGlobal(U, GV);
+      U->eraseFromParent();
+      continue;
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+      // If this is a "GEP bitcast" and the user is a store to the global, then
+      // just process it as a bitcast.
+      if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
+        if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->use_back()))
+          if (SI->getOperand(1) == GV) {
+            // Must be bitcast GEP between the malloc and store to initialize
+            // the global.
+            ReplaceUsesOfMallocWithGlobal(GEPI, GV);
+            GEPI->eraseFromParent();
+            continue;
+          }
+    }
+
+    // Insert a load from the global, and use it instead of the malloc.
+    Value *NL = new LoadInst(GV, GV->getName()+".val", InsertPt);
+    U->replaceUsesOfWith(Alloc, NL);
+  }
+}
+
+/// LoadUsesSimpleEnoughForHeapSRA - Verify that all uses of V (a load, or a phi
+/// of a load) are simple enough to perform heap SRA on.  This permits GEP's
+/// that index through the array and struct field, icmps of null, and PHIs.
+static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
+                        SmallPtrSet<const PHINode*, 32> &LoadUsingPHIs,
+                        SmallPtrSet<const PHINode*, 32> &LoadUsingPHIsPerLoad) {
+  // We permit two users of the load: setcc comparing against the null
+  // pointer, and a getelementptr of a specific form.
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+       ++UI) {
+    const Instruction *User = cast<Instruction>(*UI);
+
+    // Comparison against null is ok.
+    if (const ICmpInst *ICI = dyn_cast<ICmpInst>(User)) {
+      if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+        return false;
+      continue;
+    }
+
+    // getelementptr is also ok, but only a simple form.
+    if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      // Must index into the array and into the struct.
+      if (GEPI->getNumOperands() < 3)
+        return false;
+
+      // Otherwise the GEP is ok.
+      continue;
+    }
+
+    if (const PHINode *PN = dyn_cast<PHINode>(User)) {
+      if (!LoadUsingPHIsPerLoad.insert(PN))
+        // This means some phi nodes are dependent on each other.
+        // Avoid infinite looping!
+        return false;
+      if (!LoadUsingPHIs.insert(PN))
+        // If we have already analyzed this PHI, then it is safe.
+        continue;
+
+      // Make sure all uses of the PHI are simple enough to transform.
+      if (!LoadUsesSimpleEnoughForHeapSRA(PN,
+                                          LoadUsingPHIs, LoadUsingPHIsPerLoad))
+        return false;
+
+      continue;
+    }
+
+    // Otherwise we don't know what this is, not ok.
+    return false;
+  }
+
+  return true;
+}
+
+
+/// AllGlobalLoadUsesSimpleEnoughForHeapSRA - If all users of values loaded from
+/// GV are simple enough to perform HeapSRA, return true.
+static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
+                                                    Instruction *StoredVal) {
+  SmallPtrSet<const PHINode*, 32> LoadUsingPHIs;
+  SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad;
+  for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E; ++UI)
+    if (const LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+      if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs,
+                                          LoadUsingPHIsPerLoad))
+        return false;
+      LoadUsingPHIsPerLoad.clear();
+    }
+
+  // If we reach here, we know that all uses of the loads and transitive uses
+  // (through PHI nodes) are simple enough to transform.  However, we don't know
+  // that all inputs the to the PHI nodes are in the same equivalence sets.
+  // Check to verify that all operands of the PHIs are either PHIS that can be
+  // transformed, loads from GV, or MI itself.
+  for (SmallPtrSet<const PHINode*, 32>::const_iterator I = LoadUsingPHIs.begin()
+       , E = LoadUsingPHIs.end(); I != E; ++I) {
+    const PHINode *PN = *I;
+    for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
+      Value *InVal = PN->getIncomingValue(op);
+
+      // PHI of the stored value itself is ok.
+      if (InVal == StoredVal) continue;
+
+      if (const PHINode *InPN = dyn_cast<PHINode>(InVal)) {
+        // One of the PHIs in our set is (optimistically) ok.
+        if (LoadUsingPHIs.count(InPN))
+          continue;
+        return false;
+      }
+
+      // Load from GV is ok.
+      if (const LoadInst *LI = dyn_cast<LoadInst>(InVal))
+        if (LI->getOperand(0) == GV)
+          continue;
+
+      // UNDEF? NULL?
+
+      // Anything else is rejected.
+      return false;
+    }
+  }
+
+  return true;
+}
+
+static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
+               DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+  std::vector<Value*> &FieldVals = InsertedScalarizedValues[V];
+
+  if (FieldNo >= FieldVals.size())
+    FieldVals.resize(FieldNo+1);
+
+  // If we already have this value, just reuse the previously scalarized
+  // version.
+  if (Value *FieldVal = FieldVals[FieldNo])
+    return FieldVal;
+
+  // Depending on what instruction this is, we have several cases.
+  Value *Result;
+  if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
+    // This is a scalarized version of the load from the global.  Just create
+    // a new Load of the scalarized global.
+    Result = new LoadInst(GetHeapSROAValue(LI->getOperand(0), FieldNo,
+                                           InsertedScalarizedValues,
+                                           PHIsToRewrite),
+                          LI->getName()+".f"+Twine(FieldNo), LI);
+  } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    // PN's type is pointer to struct.  Make a new PHI of pointer to struct
+    // field.
+    StructType *ST =
+      cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
+
+    PHINode *NewPN =
+     PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)),
+                     PN->getNumIncomingValues(),
+                     PN->getName()+".f"+Twine(FieldNo), PN);
+    Result = NewPN;
+    PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
+  } else {
+    llvm_unreachable("Unknown usable value");
+  }
+
+  return FieldVals[FieldNo] = Result;
+}
+
+/// RewriteHeapSROALoadUser - Given a load instruction and a value derived from
+/// the load, rewrite the derived value to use the HeapSRoA'd load.
+static void RewriteHeapSROALoadUser(Instruction *LoadUser,
+             DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+  // If this is a comparison against null, handle it.
+  if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
+    assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
+    // If we have a setcc of the loaded pointer, we can use a setcc of any
+    // field.
+    Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
+                                   InsertedScalarizedValues, PHIsToRewrite);
+
+    Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr,
+                              Constant::getNullValue(NPtr->getType()),
+                              SCI->getName());
+    SCI->replaceAllUsesWith(New);
+    SCI->eraseFromParent();
+    return;
+  }
+
+  // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
+    assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
+           && "Unexpected GEPI!");
+
+    // Load the pointer for this field.
+    unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+    Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
+                                     InsertedScalarizedValues, PHIsToRewrite);
+
+    // Create the new GEP idx vector.
+    SmallVector<Value*, 8> GEPIdx;
+    GEPIdx.push_back(GEPI->getOperand(1));
+    GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
+
+    Value *NGEPI = GetElementPtrInst::Create(NewPtr, GEPIdx,
+                                             GEPI->getName(), GEPI);
+    GEPI->replaceAllUsesWith(NGEPI);
+    GEPI->eraseFromParent();
+    return;
+  }
+
+  // Recursively transform the users of PHI nodes.  This will lazily create the
+  // PHIs that are needed for individual elements.  Keep track of what PHIs we
+  // see in InsertedScalarizedValues so that we don't get infinite loops (very
+  // antisocial).  If the PHI is already in InsertedScalarizedValues, it has
+  // already been seen first by another load, so its uses have already been
+  // processed.
+  PHINode *PN = cast<PHINode>(LoadUser);
+  if (!InsertedScalarizedValues.insert(std::make_pair(PN,
+                                              std::vector<Value*>())).second)
+    return;
+
+  // If this is the first time we've seen this PHI, recursively process all
+  // users.
+  for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+  }
+}
+
+/// RewriteUsesOfLoadForHeapSRoA - We are performing Heap SRoA on a global.  Ptr
+/// is a value loaded from the global.  Eliminate all uses of Ptr, making them
+/// use FieldGlobals instead.  All uses of loaded values satisfy
+/// AllGlobalLoadUsesSimpleEnoughForHeapSRA.
+static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
+               DenseMap<Value*, std::vector<Value*> > &InsertedScalarizedValues,
+                   std::vector<std::pair<PHINode*, unsigned> > &PHIsToRewrite) {
+  for (Value::use_iterator UI = Load->use_begin(), E = Load->use_end();
+       UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+  }
+
+  if (Load->use_empty()) {
+    Load->eraseFromParent();
+    InsertedScalarizedValues.erase(Load);
+  }
+}
+
+/// PerformHeapAllocSRoA - CI is an allocation of an array of structures.  Break
+/// it up into multiple allocations of arrays of the fields.
+static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
+                                            Value *NElems, DataLayout *TD,
+                                            const TargetLibraryInfo *TLI) {
+  DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << "  MALLOC = " << *CI << '\n');
+  Type *MAT = getMallocAllocatedType(CI, TLI);
+  StructType *STy = cast<StructType>(MAT);
+
+  // There is guaranteed to be at least one use of the malloc (storing
+  // it into GV).  If there are other uses, change them to be uses of
+  // the global to simplify later code.  This also deletes the store
+  // into GV.
+  ReplaceUsesOfMallocWithGlobal(CI, GV);
+
+  // Okay, at this point, there are no users of the malloc.  Insert N
+  // new mallocs at the same place as CI, and N globals.
+  std::vector<Value*> FieldGlobals;
+  std::vector<Value*> FieldMallocs;
+
+  for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
+    Type *FieldTy = STy->getElementType(FieldNo);
+    PointerType *PFieldTy = PointerType::getUnqual(FieldTy);
+
+    GlobalVariable *NGV =
+      new GlobalVariable(*GV->getParent(),
+                         PFieldTy, false, GlobalValue::InternalLinkage,
+                         Constant::getNullValue(PFieldTy),
+                         GV->getName() + ".f" + Twine(FieldNo), GV,
+                         GV->getThreadLocalMode());
+    FieldGlobals.push_back(NGV);
+
+    unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
+    if (StructType *ST = dyn_cast<StructType>(FieldTy))
+      TypeSize = TD->getStructLayout(ST)->getSizeInBytes();
+    Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+    Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
+                                        ConstantInt::get(IntPtrTy, TypeSize),
+                                        NElems, 0,
+                                        CI->getName() + ".f" + Twine(FieldNo));
+    FieldMallocs.push_back(NMI);
+    new StoreInst(NMI, NGV, CI);
+  }
+
+  // The tricky aspect of this transformation is handling the case when malloc
+  // fails.  In the original code, malloc failing would set the result pointer
+  // of malloc to null.  In this case, some mallocs could succeed and others
+  // could fail.  As such, we emit code that looks like this:
+  //    F0 = malloc(field0)
+  //    F1 = malloc(field1)
+  //    F2 = malloc(field2)
+  //    if (F0 == 0 || F1 == 0 || F2 == 0) {
+  //      if (F0) { free(F0); F0 = 0; }
+  //      if (F1) { free(F1); F1 = 0; }
+  //      if (F2) { free(F2); F2 = 0; }
+  //    }
+  // The malloc can also fail if its argument is too large.
+  Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0);
+  Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0),
+                                  ConstantZero, "isneg");
+  for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
+    Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i],
+                             Constant::getNullValue(FieldMallocs[i]->getType()),
+                               "isnull");
+    RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
+  }
+
+  // Split the basic block at the old malloc.
+  BasicBlock *OrigBB = CI->getParent();
+  BasicBlock *ContBB = OrigBB->splitBasicBlock(CI, "malloc_cont");
+
+  // Create the block to check the first condition.  Put all these blocks at the
+  // end of the function as they are unlikely to be executed.
+  BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(),
+                                                "malloc_ret_null",
+                                                OrigBB->getParent());
+
+  // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
+  // branch on RunningOr.
+  OrigBB->getTerminator()->eraseFromParent();
+  BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
+
+  // Within the NullPtrBlock, we need to emit a comparison and branch for each
+  // pointer, because some may be null while others are not.
+  for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+    Value *GVVal = new LoadInst(FieldGlobals[i], "tmp", NullPtrBlock);
+    Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
+                              Constant::getNullValue(GVVal->getType()));
+    BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
+                                               OrigBB->getParent());
+    BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
+                                               OrigBB->getParent());
+    Instruction *BI = BranchInst::Create(FreeBlock, NextBlock,
+                                         Cmp, NullPtrBlock);
+
+    // Fill in FreeBlock.
+    CallInst::CreateFree(GVVal, BI);
+    new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
+                  FreeBlock);
+    BranchInst::Create(NextBlock, FreeBlock);
+
+    NullPtrBlock = NextBlock;
+  }
+
+  BranchInst::Create(ContBB, NullPtrBlock);
+
+  // CI is no longer needed, remove it.
+  CI->eraseFromParent();
+
+  /// InsertedScalarizedLoads - As we process loads, if we can't immediately
+  /// update all uses of the load, keep track of what scalarized loads are
+  /// inserted for a given load.
+  DenseMap<Value*, std::vector<Value*> > InsertedScalarizedValues;
+  InsertedScalarizedValues[GV] = FieldGlobals;
+
+  std::vector<std::pair<PHINode*, unsigned> > PHIsToRewrite;
+
+  // Okay, the malloc site is completely handled.  All of the uses of GV are now
+  // loads, and all uses of those loads are simple.  Rewrite them to use loads
+  // of the per-field globals instead.
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;) {
+    Instruction *User = cast<Instruction>(*UI++);
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
+      continue;
+    }
+
+    // Must be a store of null.
+    StoreInst *SI = cast<StoreInst>(User);
+    assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
+           "Unexpected heap-sra user!");
+
+    // Insert a store of null into each global.
+    for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+      PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
+      Constant *Null = Constant::getNullValue(PT->getElementType());
+      new StoreInst(Null, FieldGlobals[i], SI);
+    }
+    // Erase the original store.
+    SI->eraseFromParent();
+  }
+
+  // While we have PHIs that are interesting to rewrite, do it.
+  while (!PHIsToRewrite.empty()) {
+    PHINode *PN = PHIsToRewrite.back().first;
+    unsigned FieldNo = PHIsToRewrite.back().second;
+    PHIsToRewrite.pop_back();
+    PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
+    assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
+
+    // Add all the incoming values.  This can materialize more phis.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      Value *InVal = PN->getIncomingValue(i);
+      InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
+                               PHIsToRewrite);
+      FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
+    }
+  }
+
+  // Drop all inter-phi links and any loads that made it this far.
+  for (DenseMap<Value*, std::vector<Value*> >::iterator
+       I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+       I != E; ++I) {
+    if (PHINode *PN = dyn_cast<PHINode>(I->first))
+      PN->dropAllReferences();
+    else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+      LI->dropAllReferences();
+  }
+
+  // Delete all the phis and loads now that inter-references are dead.
+  for (DenseMap<Value*, std::vector<Value*> >::iterator
+       I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+       I != E; ++I) {
+    if (PHINode *PN = dyn_cast<PHINode>(I->first))
+      PN->eraseFromParent();
+    else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+      LI->eraseFromParent();
+  }
+
+  // The old global is now dead, remove it.
+  GV->eraseFromParent();
+
+  ++NumHeapSRA;
+  return cast<GlobalVariable>(FieldGlobals[0]);
+}
+
+/// TryToOptimizeStoreOfMallocToGlobal - This function is called when we see a
+/// pointer global variable with a single value stored it that is a malloc or
+/// cast of malloc.
+static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
+                                               CallInst *CI,
+                                               Type *AllocTy,
+                                               AtomicOrdering Ordering,
+                                               Module::global_iterator &GVI,
+                                               DataLayout *TD,
+                                               TargetLibraryInfo *TLI) {
+  if (!TD)
+    return false;
+
+  // If this is a malloc of an abstract type, don't touch it.
+  if (!AllocTy->isSized())
+    return false;
+
+  // We can't optimize this global unless all uses of it are *known* to be
+  // of the malloc value, not of the null initializer value (consider a use
+  // that compares the global's value against zero to see if the malloc has
+  // been reached).  To do this, we check to see if all uses of the global
+  // would trap if the global were null: this proves that they must all
+  // happen after the malloc.
+  if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
+    return false;
+
+  // We can't optimize this if the malloc itself is used in a complex way,
+  // for example, being stored into multiple globals.  This allows the
+  // malloc to be stored into the specified global, loaded icmp'd, and
+  // GEP'd.  These are all things we could transform to using the global
+  // for.
+  SmallPtrSet<const PHINode*, 8> PHIs;
+  if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs))
+    return false;
+
+  // If we have a global that is only initialized with a fixed size malloc,
+  // transform the program to use global memory instead of malloc'd memory.
+  // This eliminates dynamic allocation, avoids an indirection accessing the
+  // data, and exposes the resultant global to further GlobalOpt.
+  // We cannot optimize the malloc if we cannot determine malloc array size.
+  Value *NElems = getMallocArraySize(CI, TD, TLI, true);
+  if (!NElems)
+    return false;
+
+  if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
+    // Restrict this transformation to only working on small allocations
+    // (2048 bytes currently), as we don't want to introduce a 16M global or
+    // something.
+    if (NElements->getZExtValue() * TD->getTypeAllocSize(AllocTy) < 2048) {
+      GVI = OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, TD, TLI);
+      return true;
+    }
+
+  // If the allocation is an array of structures, consider transforming this
+  // into multiple malloc'd arrays, one for each field.  This is basically
+  // SRoA for malloc'd memory.
+
+  if (Ordering != NotAtomic)
+    return false;
+
+  // If this is an allocation of a fixed size array of structs, analyze as a
+  // variable size array.  malloc [100 x struct],1 -> malloc struct, 100
+  if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
+    if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
+      AllocTy = AT->getElementType();
+
+  StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
+  if (!AllocSTy)
+    return false;
+
+  // This the structure has an unreasonable number of fields, leave it
+  // alone.
+  if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
+      AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) {
+
+    // If this is a fixed size array, transform the Malloc to be an alloc of
+    // structs.  malloc [100 x struct],1 -> malloc struct, 100
+    if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
+      Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+      unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes();
+      Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
+      Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
+      Instruction *Malloc = CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy,
+                                                   AllocSize, NumElements,
+                                                   0, CI->getName());
+      Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI);
+      CI->replaceAllUsesWith(Cast);
+      CI->eraseFromParent();
+      if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc))
+        CI = cast<CallInst>(BCI->getOperand(0));
+      else
+        CI = cast<CallInst>(Malloc);
+    }
+
+    GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, TLI, true),
+                               TD, TLI);
+    return true;
+  }
+
+  return false;
+}
+
+// OptimizeOnceStoredGlobal - Try to optimize globals based on the knowledge
+// that only one value (besides its initializer) is ever stored to the global.
+static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
+                                     AtomicOrdering Ordering,
+                                     Module::global_iterator &GVI,
+                                     DataLayout *TD, TargetLibraryInfo *TLI) {
+  // Ignore no-op GEPs and bitcasts.
+  StoredOnceVal = StoredOnceVal->stripPointerCasts();
+
+  // If we are dealing with a pointer global that is initialized to null and
+  // only has one (non-null) value stored into it, then we can optimize any
+  // users of the loaded value (often calls and loads) that would trap if the
+  // value was null.
+  if (GV->getInitializer()->getType()->isPointerTy() &&
+      GV->getInitializer()->isNullValue()) {
+    if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
+      if (GV->getInitializer()->getType() != SOVC->getType())
+        SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
+
+      // Optimize away any trapping uses of the loaded value.
+      if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, TD, TLI))
+        return true;
+    } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) {
+      Type *MallocType = getMallocAllocatedType(CI, TLI);
+      if (MallocType &&
+          TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI,
+                                             TD, TLI))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// TryToShrinkGlobalToBoolean - At this point, we have learned that the only
+/// two values ever stored into GV are its initializer and OtherVal.  See if we
+/// can shrink the global into a boolean and select between the two values
+/// whenever it is used.  This exposes the values to other scalar optimizations.
+static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
+  Type *GVElType = GV->getType()->getElementType();
+
+  // If GVElType is already i1, it is already shrunk.  If the type of the GV is
+  // an FP value, pointer or vector, don't do this optimization because a select
+  // between them is very expensive and unlikely to lead to later
+  // simplification.  In these cases, we typically end up with "cond ? v1 : v2"
+  // where v1 and v2 both require constant pool loads, a big loss.
+  if (GVElType == Type::getInt1Ty(GV->getContext()) ||
+      GVElType->isFloatingPointTy() ||
+      GVElType->isPointerTy() || GVElType->isVectorTy())
+    return false;
+
+  // Walk the use list of the global seeing if all the uses are load or store.
+  // If there is anything else, bail out.
+  for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I){
+    User *U = *I;
+    if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
+      return false;
+  }
+
+  DEBUG(dbgs() << "   *** SHRINKING TO BOOL: " << *GV);
+
+  // Create the new global, initializing it to false.
+  GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
+                                             false,
+                                             GlobalValue::InternalLinkage,
+                                        ConstantInt::getFalse(GV->getContext()),
+                                             GV->getName()+".b",
+                                             GV->getThreadLocalMode(),
+                                             GV->getType()->getAddressSpace());
+  GV->getParent()->getGlobalList().insert(GV, NewGV);
+
+  Constant *InitVal = GV->getInitializer();
+  assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
+         "No reason to shrink to bool!");
+
+  // If initialized to zero and storing one into the global, we can use a cast
+  // instead of a select to synthesize the desired value.
+  bool IsOneZero = false;
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal))
+    IsOneZero = InitVal->isNullValue() && CI->isOne();
+
+  while (!GV->use_empty()) {
+    Instruction *UI = cast<Instruction>(GV->use_back());
+    if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+      // Change the store into a boolean store.
+      bool StoringOther = SI->getOperand(0) == OtherVal;
+      // Only do this if we weren't storing a loaded value.
+      Value *StoreVal;
+      if (StoringOther || SI->getOperand(0) == InitVal) {
+        StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
+                                    StoringOther);
+      } else {
+        // Otherwise, we are storing a previously loaded copy.  To do this,
+        // change the copy from copying the original value to just copying the
+        // bool.
+        Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));
+
+        // If we've already replaced the input, StoredVal will be a cast or
+        // select instruction.  If not, it will be a load of the original
+        // global.
+        if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+          assert(LI->getOperand(0) == GV && "Not a copy!");
+          // Insert a new load, to preserve the saved value.
+          StoreVal = new LoadInst(NewGV, LI->getName()+".b", false, 0,
+                                  LI->getOrdering(), LI->getSynchScope(), LI);
+        } else {
+          assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
+                 "This is not a form that we understand!");
+          StoreVal = StoredVal->getOperand(0);
+          assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
+        }
+      }
+      new StoreInst(StoreVal, NewGV, false, 0,
+                    SI->getOrdering(), SI->getSynchScope(), SI);
+    } else {
+      // Change the load into a load of bool then a select.
+      LoadInst *LI = cast<LoadInst>(UI);
+      LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", false, 0,
+                                   LI->getOrdering(), LI->getSynchScope(), LI);
+      Value *NSI;
+      if (IsOneZero)
+        NSI = new ZExtInst(NLI, LI->getType(), "", LI);
+      else
+        NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
+      NSI->takeName(LI);
+      LI->replaceAllUsesWith(NSI);
+    }
+    UI->eraseFromParent();
+  }
+
+  // Retain the name of the old global variable. People who are debugging their
+  // programs may expect these variables to be named the same.
+  NewGV->takeName(GV);
+  GV->eraseFromParent();
+  return true;
+}
+
+
+/// ProcessGlobal - Analyze the specified global variable and optimize it if
+/// possible.  If we make a change, return true.
+bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
+                              Module::global_iterator &GVI) {
+  if (!GV->isDiscardableIfUnused())
+    return false;
+
+  // Do more involved optimizations if the global is internal.
+  GV->removeDeadConstantUsers();
+
+  if (GV->use_empty()) {
+    DEBUG(dbgs() << "GLOBAL DEAD: " << *GV);
+    GV->eraseFromParent();
+    ++NumDeleted;
+    return true;
+  }
+
+  if (!GV->hasLocalLinkage())
+    return false;
+
+  SmallPtrSet<const PHINode*, 16> PHIUsers;
+  GlobalStatus GS;
+
+  if (AnalyzeGlobal(GV, GS, PHIUsers))
+    return false;
+
+  if (!GS.isCompared && !GV->hasUnnamedAddr()) {
+    GV->setUnnamedAddr(true);
+    NumUnnamed++;
+  }
+
+  if (GV->isConstant() || !GV->hasInitializer())
+    return false;
+
+  return ProcessInternalGlobal(GV, GVI, PHIUsers, GS);
+}
+
+/// ProcessInternalGlobal - Analyze the specified global variable and optimize
+/// it if possible.  If we make a change, return true.
+bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
+                                      Module::global_iterator &GVI,
+                                const SmallPtrSet<const PHINode*, 16> &PHIUsers,
+                                      const GlobalStatus &GS) {
+  // If this is a first class global and has only one accessing function
+  // and this function is main (which we know is not recursive we can make
+  // this global a local variable) we replace the global with a local alloca
+  // in this function.
+  //
+  // NOTE: It doesn't make sense to promote non single-value types since we
+  // are just replacing static memory to stack memory.
+  //
+  // If the global is in different address space, don't bring it to stack.
+  if (!GS.HasMultipleAccessingFunctions &&
+      GS.AccessingFunction && !GS.HasNonInstructionUser &&
+      GV->getType()->getElementType()->isSingleValueType() &&
+      GS.AccessingFunction->getName() == "main" &&
+      GS.AccessingFunction->hasExternalLinkage() &&
+      GV->getType()->getAddressSpace() == 0) {
+    DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV);
+    Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
+                                                   ->getEntryBlock().begin());
+    Type *ElemTy = GV->getType()->getElementType();
+    // FIXME: Pass Global's alignment when globals have alignment
+    AllocaInst *Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), &FirstI);
+    if (!isa<UndefValue>(GV->getInitializer()))
+      new StoreInst(GV->getInitializer(), Alloca, &FirstI);
+
+    GV->replaceAllUsesWith(Alloca);
+    GV->eraseFromParent();
+    ++NumLocalized;
+    return true;
+  }
+
+  // If the global is never loaded (but may be stored to), it is dead.
+  // Delete it now.
+  if (!GS.isLoaded) {
+    DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV);
+
+    bool Changed;
+    if (isLeakCheckerRoot(GV)) {
+      // Delete any constant stores to the global.
+      Changed = CleanupPointerRootUsers(GV, TLI);
+    } else {
+      // Delete any stores we can find to the global.  We may not be able to
+      // make it completely dead though.
+      Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI);
+    }
+
+    // If the global is dead now, delete it.
+    if (GV->use_empty()) {
+      GV->eraseFromParent();
+      ++NumDeleted;
+      Changed = true;
+    }
+    return Changed;
+
+  } else if (GS.StoredType <= GlobalStatus::isInitializerStored) {
+    DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
+    GV->setConstant(true);
+
+    // Clean up any obviously simplifiable users now.
+    CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI);
+
+    // If the global is dead now, just nuke it.
+    if (GV->use_empty()) {
+      DEBUG(dbgs() << "   *** Marking constant allowed us to simplify "
+            << "all users and delete global!\n");
+      GV->eraseFromParent();
+      ++NumDeleted;
+    }
+
+    ++NumMarked;
+    return true;
+  } else if (!GV->getInitializer()->getType()->isSingleValueType()) {
+    if (DataLayout *TD = getAnalysisIfAvailable<DataLayout>())
+      if (GlobalVariable *FirstNewGV = SRAGlobal(GV, *TD)) {
+        GVI = FirstNewGV;  // Don't skip the newly produced globals!
+        return true;
+      }
+  } else if (GS.StoredType == GlobalStatus::isStoredOnce) {
+    // If the initial value for the global was an undef value, and if only
+    // one other value was stored into it, we can just change the
+    // initializer to be the stored value, then delete all stores to the
+    // global.  This allows us to mark it constant.
+    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+      if (isa<UndefValue>(GV->getInitializer())) {
+        // Change the initial value here.
+        GV->setInitializer(SOVConstant);
+
+        // Clean up any obviously simplifiable users now.
+        CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI);
+
+        if (GV->use_empty()) {
+          DEBUG(dbgs() << "   *** Substituting initializer allowed us to "
+                       << "simplify all users and delete global!\n");
+          GV->eraseFromParent();
+          ++NumDeleted;
+        } else {
+          GVI = GV;
+        }
+        ++NumSubstitute;
+        return true;
+      }
+
+    // Try to optimize globals based on the knowledge that only one value
+    // (besides its initializer) is ever stored to the global.
+    if (OptimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, GVI,
+                                 TD, TLI))
+      return true;
+
+    // Otherwise, if the global was not a boolean, we can shrink it to be a
+    // boolean.
+    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+      if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+        ++NumShrunkToBool;
+        return true;
+      }
+  }
+
+  return false;
+}
+
+/// ChangeCalleesToFastCall - Walk all of the direct calls of the specified
+/// function, changing them to FastCC.
+static void ChangeCalleesToFastCall(Function *F) {
+  for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+    if (isa<BlockAddress>(*UI))
+      continue;
+    CallSite User(cast<Instruction>(*UI));
+    User.setCallingConv(CallingConv::Fast);
+  }
+}
+
+static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) {
+  for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
+    unsigned Index = Attrs.getSlotIndex(i);
+    if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest))
+      continue;
+
+    // There can be only one.
+    return Attrs.removeAttribute(C, Index, Attribute::Nest);
+  }
+
+  return Attrs;
+}
+
+static void RemoveNestAttribute(Function *F) {
+  F->setAttributes(StripNest(F->getContext(), F->getAttributes()));
+  for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
+    if (isa<BlockAddress>(*UI))
+      continue;
+    CallSite User(cast<Instruction>(*UI));
+    User.setAttributes(StripNest(F->getContext(), User.getAttributes()));
+  }
+}
+
+bool GlobalOpt::OptimizeFunctions(Module &M) {
+  bool Changed = false;
+  // Optimize functions.
+  for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
+    Function *F = FI++;
+    // Functions without names cannot be referenced outside this module.
+    if (!F->hasName() && !F->isDeclaration())
+      F->setLinkage(GlobalValue::InternalLinkage);
+    F->removeDeadConstantUsers();
+    if (F->isDefTriviallyDead()) {
+      F->eraseFromParent();
+      Changed = true;
+      ++NumFnDeleted;
+    } else if (F->hasLocalLinkage()) {
+      if (F->getCallingConv() == CallingConv::C && !F->isVarArg() &&
+          !F->hasAddressTaken()) {
+        // If this function has C calling conventions, is not a varargs
+        // function, and is only called directly, promote it to use the Fast
+        // calling convention.
+        F->setCallingConv(CallingConv::Fast);
+        ChangeCalleesToFastCall(F);
+        ++NumFastCallFns;
+        Changed = true;
+      }
+
+      if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
+          !F->hasAddressTaken()) {
+        // The function is not used by a trampoline intrinsic, so it is safe
+        // to remove the 'nest' attribute.
+        RemoveNestAttribute(F);
+        ++NumNestRemoved;
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+bool GlobalOpt::OptimizeGlobalVars(Module &M) {
+  bool Changed = false;
+  for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+       GVI != E; ) {
+    GlobalVariable *GV = GVI++;
+    // Global variables without names cannot be referenced outside this module.
+    if (!GV->hasName() && !GV->isDeclaration())
+      GV->setLinkage(GlobalValue::InternalLinkage);
+    // Simplify the initializer.
+    if (GV->hasInitializer())
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GV->getInitializer())) {
+        Constant *New = ConstantFoldConstantExpression(CE, TD, TLI);
+        if (New && New != CE)
+          GV->setInitializer(New);
+      }
+
+    Changed |= ProcessGlobal(GV, GVI);
+  }
+  return Changed;
+}
+
+/// FindGlobalCtors - Find the llvm.global_ctors list, verifying that all
+/// initializers have an init priority of 65535.
+GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) {
+  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+  if (GV == 0) return 0;
+
+  // Verify that the initializer is simple enough for us to handle. We are
+  // only allowed to optimize the initializer if it is unique.
+  if (!GV->hasUniqueInitializer()) return 0;
+
+  if (isa<ConstantAggregateZero>(GV->getInitializer()))
+    return GV;
+  ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+
+  for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
+    if (isa<ConstantAggregateZero>(*i))
+      continue;
+    ConstantStruct *CS = cast<ConstantStruct>(*i);
+    if (isa<ConstantPointerNull>(CS->getOperand(1)))
+      continue;
+
+    // Must have a function or null ptr.
+    if (!isa<Function>(CS->getOperand(1)))
+      return 0;
+
+    // Init priority must be standard.
+    ConstantInt *CI = cast<ConstantInt>(CS->getOperand(0));
+    if (CI->getZExtValue() != 65535)
+      return 0;
+  }
+
+  return GV;
+}
+
+/// ParseGlobalCtors - Given a llvm.global_ctors list that we can understand,
+/// return a list of the functions and null terminator as a vector.
+static std::vector<Function*> ParseGlobalCtors(GlobalVariable *GV) {
+  if (GV->getInitializer()->isNullValue())
+    return std::vector<Function*>();
+  ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+  std::vector<Function*> Result;
+  Result.reserve(CA->getNumOperands());
+  for (User::op_iterator i = CA->op_begin(), e = CA->op_end(); i != e; ++i) {
+    ConstantStruct *CS = cast<ConstantStruct>(*i);
+    Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
+  }
+  return Result;
+}
+
+/// InstallGlobalCtors - Given a specified llvm.global_ctors list, install the
+/// specified array, returning the new global to use.
+static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
+                                          const std::vector<Function*> &Ctors) {
+  // If we made a change, reassemble the initializer list.
+  Constant *CSVals[2];
+  CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()), 65535);
+  CSVals[1] = 0;
+
+  StructType *StructTy =
+    cast <StructType>(
+    cast<ArrayType>(GCL->getType()->getElementType())->getElementType());
+
+  // Create the new init list.
+  std::vector<Constant*> CAList;
+  for (unsigned i = 0, e = Ctors.size(); i != e; ++i) {
+    if (Ctors[i]) {
+      CSVals[1] = Ctors[i];
+    } else {
+      Type *FTy = FunctionType::get(Type::getVoidTy(GCL->getContext()),
+                                          false);
+      PointerType *PFTy = PointerType::getUnqual(FTy);
+      CSVals[1] = Constant::getNullValue(PFTy);
+      CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()),
+                                   0x7fffffff);
+    }
+    CAList.push_back(ConstantStruct::get(StructTy, CSVals));
+  }
+
+  // Create the array initializer.
+  Constant *CA = ConstantArray::get(ArrayType::get(StructTy,
+                                                   CAList.size()), CAList);
+
+  // If we didn't change the number of elements, don't create a new GV.
+  if (CA->getType() == GCL->getInitializer()->getType()) {
+    GCL->setInitializer(CA);
+    return GCL;
+  }
+
+  // Create the new global and insert it next to the existing list.
+  GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(),
+                                           GCL->getLinkage(), CA, "",
+                                           GCL->getThreadLocalMode());
+  GCL->getParent()->getGlobalList().insert(GCL, NGV);
+  NGV->takeName(GCL);
+
+  // Nuke the old list, replacing any uses with the new one.
+  if (!GCL->use_empty()) {
+    Constant *V = NGV;
+    if (V->getType() != GCL->getType())
+      V = ConstantExpr::getBitCast(V, GCL->getType());
+    GCL->replaceAllUsesWith(V);
+  }
+  GCL->eraseFromParent();
+
+  if (Ctors.size())
+    return NGV;
+  else
+    return 0;
+}
+
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+                            SmallPtrSet<Constant*, 8> &SimpleConstants,
+                            const DataLayout *TD);
+
+
+/// isSimpleEnoughValueToCommit - Return true if the specified constant can be
+/// handled by the code generator.  We don't want to generate something like:
+///   void *X = &X/42;
+/// because the code generator doesn't have a relocation that can handle that.
+///
+/// This function should be called if C was not found (but just got inserted)
+/// in SimpleConstants to avoid having to rescan the same constants all the
+/// time.
+static bool isSimpleEnoughValueToCommitHelper(Constant *C,
+                                   SmallPtrSet<Constant*, 8> &SimpleConstants,
+                                   const DataLayout *TD) {
+  // Simple integer, undef, constant aggregate zero, global addresses, etc are
+  // all supported.
+  if (C->getNumOperands() == 0 || isa<BlockAddress>(C) ||
+      isa<GlobalValue>(C))
+    return true;
+
+  // Aggregate values are safe if all their elements are.
+  if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
+      isa<ConstantVector>(C)) {
+    for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i) {
+      Constant *Op = cast<Constant>(C->getOperand(i));
+      if (!isSimpleEnoughValueToCommit(Op, SimpleConstants, TD))
+        return false;
+    }
+    return true;
+  }
+
+  // We don't know exactly what relocations are allowed in constant expressions,
+  // so we allow &global+constantoffset, which is safe and uniformly supported
+  // across targets.
+  ConstantExpr *CE = cast<ConstantExpr>(C);
+  switch (CE->getOpcode()) {
+  case Instruction::BitCast:
+    // Bitcast is fine if the casted value is fine.
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
+
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+    // int <=> ptr is fine if the int type is the same size as the
+    // pointer type.
+    if (!TD || TD->getTypeSizeInBits(CE->getType()) !=
+               TD->getTypeSizeInBits(CE->getOperand(0)->getType()))
+      return false;
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
+
+  // GEP is fine if it is simple + constant offset.
+  case Instruction::GetElementPtr:
+    for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
+      if (!isa<ConstantInt>(CE->getOperand(i)))
+        return false;
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
+
+  case Instruction::Add:
+    // We allow simple+cst.
+    if (!isa<ConstantInt>(CE->getOperand(1)))
+      return false;
+    return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
+  }
+  return false;
+}
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+                            SmallPtrSet<Constant*, 8> &SimpleConstants,
+                            const DataLayout *TD) {
+  // If we already checked this constant, we win.
+  if (!SimpleConstants.insert(C)) return true;
+  // Check the constant.
+  return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, TD);
+}
+
+
+/// isSimpleEnoughPointerToCommit - Return true if this constant is simple
+/// enough for us to understand.  In particular, if it is a cast to anything
+/// other than from one pointer type to another pointer type, we punt.
+/// We basically just support direct accesses to globals and GEP's of
+/// globals.  This should be kept up to date with CommitValueTo.
+static bool isSimpleEnoughPointerToCommit(Constant *C) {
+  // Conservatively, avoid aggregate types. This is because we don't
+  // want to worry about them partially overlapping other stores.
+  if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
+    return false;
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+    // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+    // external globals.
+    return GV->hasUniqueInitializer();
+
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+    // Handle a constantexpr gep.
+    if (CE->getOpcode() == Instruction::GetElementPtr &&
+        isa<GlobalVariable>(CE->getOperand(0)) &&
+        cast<GEPOperator>(CE)->isInBounds()) {
+      GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+      // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+      // external globals.
+      if (!GV->hasUniqueInitializer())
+        return false;
+
+      // The first index must be zero.
+      ConstantInt *CI = dyn_cast<ConstantInt>(*llvm::next(CE->op_begin()));
+      if (!CI || !CI->isZero()) return false;
+
+      // The remaining indices must be compile-time known integers within the
+      // notional bounds of the corresponding static array types.
+      if (!CE->isGEPWithNoNotionalOverIndexing())
+        return false;
+
+      return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+
+    // A constantexpr bitcast from a pointer to another pointer is a no-op,
+    // and we know how to evaluate it by moving the bitcast from the pointer
+    // operand to the value operand.
+    } else if (CE->getOpcode() == Instruction::BitCast &&
+               isa<GlobalVariable>(CE->getOperand(0))) {
+      // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+      // external globals.
+      return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
+    }
+  }
+
+  return false;
+}
+
+/// EvaluateStoreInto - Evaluate a piece of a constantexpr store into a global
+/// initializer.  This returns 'Init' modified to reflect 'Val' stored into it.
+/// At this point, the GEP operands of Addr [0, OpNo) have been stepped into.
+static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
+                                   ConstantExpr *Addr, unsigned OpNo) {
+  // Base case of the recursion.
+  if (OpNo == Addr->getNumOperands()) {
+    assert(Val->getType() == Init->getType() && "Type mismatch!");
+    return Val;
+  }
+
+  SmallVector<Constant*, 32> Elts;
+  if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
+    // Break up the constant into its elements.
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+      Elts.push_back(Init->getAggregateElement(i));
+
+    // Replace the element that we are supposed to.
+    ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
+    unsigned Idx = CU->getZExtValue();
+    assert(Idx < STy->getNumElements() && "Struct index out of range!");
+    Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
+
+    // Return the modified struct.
+    return ConstantStruct::get(STy, Elts);
+  }
+
+  ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
+  SequentialType *InitTy = cast<SequentialType>(Init->getType());
+
+  uint64_t NumElts;
+  if (ArrayType *ATy = dyn_cast<ArrayType>(InitTy))
+    NumElts = ATy->getNumElements();
+  else
+    NumElts = InitTy->getVectorNumElements();
+
+  // Break up the array into elements.
+  for (uint64_t i = 0, e = NumElts; i != e; ++i)
+    Elts.push_back(Init->getAggregateElement(i));
+
+  assert(CI->getZExtValue() < NumElts);
+  Elts[CI->getZExtValue()] =
+    EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
+
+  if (Init->getType()->isArrayTy())
+    return ConstantArray::get(cast<ArrayType>(InitTy), Elts);
+  return ConstantVector::get(Elts);
+}
+
+/// CommitValueTo - We have decided that Addr (which satisfies the predicate
+/// isSimpleEnoughPointerToCommit) should get Val as its value.  Make it happen.
+static void CommitValueTo(Constant *Val, Constant *Addr) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    assert(GV->hasInitializer());
+    GV->setInitializer(Val);
+    return;
+  }
+
+  ConstantExpr *CE = cast<ConstantExpr>(Addr);
+  GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+  GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
+}
+
+namespace {
+
+/// Evaluator - This class evaluates LLVM IR, producing the Constant
+/// representing each SSA instruction.  Changes to global variables are stored
+/// in a mapping that can be iterated over after the evaluation is complete.
+/// Once an evaluation call fails, the evaluation object should not be reused.
+class Evaluator {
+public:
+  Evaluator(const DataLayout *TD, const TargetLibraryInfo *TLI)
+    : TD(TD), TLI(TLI) {
+    ValueStack.push_back(new DenseMap<Value*, Constant*>);
+  }
+
+  ~Evaluator() {
+    DeleteContainerPointers(ValueStack);
+    while (!AllocaTmps.empty()) {
+      GlobalVariable *Tmp = AllocaTmps.back();
+      AllocaTmps.pop_back();
+
+      // If there are still users of the alloca, the program is doing something
+      // silly, e.g. storing the address of the alloca somewhere and using it
+      // later.  Since this is undefined, we'll just make it be null.
+      if (!Tmp->use_empty())
+        Tmp->replaceAllUsesWith(Constant::getNullValue(Tmp->getType()));
+      delete Tmp;
+    }
+  }
+
+  /// EvaluateFunction - Evaluate a call to function F, returning true if
+  /// successful, false if we can't evaluate it.  ActualArgs contains the formal
+  /// arguments for the function.
+  bool EvaluateFunction(Function *F, Constant *&RetVal,
+                        const SmallVectorImpl<Constant*> &ActualArgs);
+
+  /// EvaluateBlock - Evaluate all instructions in block BB, returning true if
+  /// successful, false if we can't evaluate it.  NewBB returns the next BB that
+  /// control flows into, or null upon return.
+  bool EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB);
+
+  Constant *getVal(Value *V) {
+    if (Constant *CV = dyn_cast<Constant>(V)) return CV;
+    Constant *R = ValueStack.back()->lookup(V);
+    assert(R && "Reference to an uncomputed value!");
+    return R;
+  }
+
+  void setVal(Value *V, Constant *C) {
+    ValueStack.back()->operator[](V) = C;
+  }
+
+  const DenseMap<Constant*, Constant*> &getMutatedMemory() const {
+    return MutatedMemory;
+  }
+
+  const SmallPtrSet<GlobalVariable*, 8> &getInvariants() const {
+    return Invariants;
+  }
+
+private:
+  Constant *ComputeLoadResult(Constant *P);
+
+  /// ValueStack - As we compute SSA register values, we store their contents
+  /// here. The back of the vector contains the current function and the stack
+  /// contains the values in the calling frames.
+  SmallVector<DenseMap<Value*, Constant*>*, 4> ValueStack;
+
+  /// CallStack - This is used to detect recursion.  In pathological situations
+  /// we could hit exponential behavior, but at least there is nothing
+  /// unbounded.
+  SmallVector<Function*, 4> CallStack;
+
+  /// MutatedMemory - For each store we execute, we update this map.  Loads
+  /// check this to get the most up-to-date value.  If evaluation is successful,
+  /// this state is committed to the process.
+  DenseMap<Constant*, Constant*> MutatedMemory;
+
+  /// AllocaTmps - To 'execute' an alloca, we create a temporary global variable
+  /// to represent its body.  This vector is needed so we can delete the
+  /// temporary globals when we are done.
+  SmallVector<GlobalVariable*, 32> AllocaTmps;
+
+  /// Invariants - These global variables have been marked invariant by the
+  /// static constructor.
+  SmallPtrSet<GlobalVariable*, 8> Invariants;
+
+  /// SimpleConstants - These are constants we have checked and know to be
+  /// simple enough to live in a static initializer of a global.
+  SmallPtrSet<Constant*, 8> SimpleConstants;
+
+  const DataLayout *TD;
+  const TargetLibraryInfo *TLI;
+};
+
+}  // anonymous namespace
+
+/// ComputeLoadResult - Return the value that would be computed by a load from
+/// P after the stores reflected by 'memory' have been performed.  If we can't
+/// decide, return null.
+Constant *Evaluator::ComputeLoadResult(Constant *P) {
+  // If this memory location has been recently stored, use the stored value: it
+  // is the most up-to-date.
+  DenseMap<Constant*, Constant*>::const_iterator I = MutatedMemory.find(P);
+  if (I != MutatedMemory.end()) return I->second;
+
+  // Access it.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+    if (GV->hasDefinitiveInitializer())
+      return GV->getInitializer();
+    return 0;
+  }
+
+  // Handle a constantexpr getelementptr.
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P))
+    if (CE->getOpcode() == Instruction::GetElementPtr &&
+        isa<GlobalVariable>(CE->getOperand(0))) {
+      GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+      if (GV->hasDefinitiveInitializer())
+        return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+    }
+
+  return 0;  // don't know how to evaluate.
+}
+
+/// EvaluateBlock - Evaluate all instructions in block BB, returning true if
+/// successful, false if we can't evaluate it.  NewBB returns the next BB that
+/// control flows into, or null upon return.
+bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
+                              BasicBlock *&NextBB) {
+  // This is the main evaluation loop.
+  while (1) {
+    Constant *InstResult = 0;
+
+    DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
+      if (!SI->isSimple()) {
+        DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
+        return false;  // no volatile/atomic accesses.
+      }
+      Constant *Ptr = getVal(SI->getOperand(1));
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+        DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
+        Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
+        DEBUG(dbgs() << "; To: " << *Ptr << "\n");
+      }
+      if (!isSimpleEnoughPointerToCommit(Ptr)) {
+        // If this is too complex for us to commit, reject it.
+        DEBUG(dbgs() << "Pointer is too complex for us to evaluate store.");
+        return false;
+      }
+
+      Constant *Val = getVal(SI->getOperand(0));
+
+      // If this might be too difficult for the backend to handle (e.g. the addr
+      // of one global variable divided by another) then we can't commit it.
+      if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD)) {
+        DEBUG(dbgs() << "Store value is too complex to evaluate store. " << *Val
+              << "\n");
+        return false;
+      }
+
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+        if (CE->getOpcode() == Instruction::BitCast) {
+          DEBUG(dbgs() << "Attempting to resolve bitcast on constant ptr.\n");
+          // If we're evaluating a store through a bitcast, then we need
+          // to pull the bitcast off the pointer type and push it onto the
+          // stored value.
+          Ptr = CE->getOperand(0);
+
+          Type *NewTy = cast<PointerType>(Ptr->getType())->getElementType();
+
+          // In order to push the bitcast onto the stored value, a bitcast
+          // from NewTy to Val's type must be legal.  If it's not, we can try
+          // introspecting NewTy to find a legal conversion.
+          while (!Val->getType()->canLosslesslyBitCastTo(NewTy)) {
+            // If NewTy is a struct, we can convert the pointer to the struct
+            // into a pointer to its first member.
+            // FIXME: This could be extended to support arrays as well.
+            if (StructType *STy = dyn_cast<StructType>(NewTy)) {
+              NewTy = STy->getTypeAtIndex(0U);
+
+              IntegerType *IdxTy = IntegerType::get(NewTy->getContext(), 32);
+              Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
+              Constant * const IdxList[] = {IdxZero, IdxZero};
+
+              Ptr = ConstantExpr::getGetElementPtr(Ptr, IdxList);
+              if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+                Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
+
+            // If we can't improve the situation by introspecting NewTy,
+            // we have to give up.
+            } else {
+              DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
+                    "evaluate.\n");
+              return false;
+            }
+          }
+
+          // If we found compatible types, go ahead and push the bitcast
+          // onto the stored value.
+          Val = ConstantExpr::getBitCast(Val, NewTy);
+
+          DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
+        }
+      }
+
+      MutatedMemory[Ptr] = Val;
+    } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
+      InstResult = ConstantExpr::get(BO->getOpcode(),
+                                     getVal(BO->getOperand(0)),
+                                     getVal(BO->getOperand(1)));
+      DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: " << *InstResult
+            << "\n");
+    } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
+      InstResult = ConstantExpr::getCompare(CI->getPredicate(),
+                                            getVal(CI->getOperand(0)),
+                                            getVal(CI->getOperand(1)));
+      DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
+            << "\n");
+    } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
+      InstResult = ConstantExpr::getCast(CI->getOpcode(),
+                                         getVal(CI->getOperand(0)),
+                                         CI->getType());
+      DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
+            << "\n");
+    } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
+      InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
+                                           getVal(SI->getOperand(1)),
+                                           getVal(SI->getOperand(2)));
+      DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
+            << "\n");
+    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
+      Constant *P = getVal(GEP->getOperand(0));
+      SmallVector<Constant*, 8> GEPOps;
+      for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+           i != e; ++i)
+        GEPOps.push_back(getVal(*i));
+      InstResult =
+        ConstantExpr::getGetElementPtr(P, GEPOps,
+                                       cast<GEPOperator>(GEP)->isInBounds());
+      DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult
+            << "\n");
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
+
+      if (!LI->isSimple()) {
+        DEBUG(dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
+        return false;  // no volatile/atomic accesses.
+      }
+
+      Constant *Ptr = getVal(LI->getOperand(0));
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+        Ptr = ConstantFoldConstantExpression(CE, TD, TLI);
+        DEBUG(dbgs() << "Found a constant pointer expression, constant "
+              "folding: " << *Ptr << "\n");
+      }
+      InstResult = ComputeLoadResult(Ptr);
+      if (InstResult == 0) {
+        DEBUG(dbgs() << "Failed to compute load result. Can not evaluate load."
+              "\n");
+        return false; // Could not evaluate load.
+      }
+
+      DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
+    } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
+      if (AI->isArrayAllocation()) {
+        DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
+        return false;  // Cannot handle array allocs.
+      }
+      Type *Ty = AI->getType()->getElementType();
+      AllocaTmps.push_back(new GlobalVariable(Ty, false,
+                                              GlobalValue::InternalLinkage,
+                                              UndefValue::get(Ty),
+                                              AI->getName()));
+      InstResult = AllocaTmps.back();
+      DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
+    } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
+      CallSite CS(CurInst);
+
+      // Debug info can safely be ignored here.
+      if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
+        DEBUG(dbgs() << "Ignoring debug info.\n");
+        ++CurInst;
+        continue;
+      }
+
+      // Cannot handle inline asm.
+      if (isa<InlineAsm>(CS.getCalledValue())) {
+        DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
+        return false;
+      }
+
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+        if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
+          if (MSI->isVolatile()) {
+            DEBUG(dbgs() << "Can not optimize a volatile memset " <<
+                  "intrinsic.\n");
+            return false;
+          }
+          Constant *Ptr = getVal(MSI->getDest());
+          Constant *Val = getVal(MSI->getValue());
+          Constant *DestVal = ComputeLoadResult(getVal(Ptr));
+          if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
+            // This memset is a no-op.
+            DEBUG(dbgs() << "Ignoring no-op memset.\n");
+            ++CurInst;
+            continue;
+          }
+        }
+
+        if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+            II->getIntrinsicID() == Intrinsic::lifetime_end) {
+          DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
+          ++CurInst;
+          continue;
+        }
+
+        if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+          // We don't insert an entry into Values, as it doesn't have a
+          // meaningful return value.
+          if (!II->use_empty()) {
+            DEBUG(dbgs() << "Found unused invariant_start. Cant evaluate.\n");
+            return false;
+          }
+          ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
+          Value *PtrArg = getVal(II->getArgOperand(1));
+          Value *Ptr = PtrArg->stripPointerCasts();
+          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+            Type *ElemTy = cast<PointerType>(GV->getType())->getElementType();
+            if (!Size->isAllOnesValue() &&
+                Size->getValue().getLimitedValue() >=
+                TD->getTypeStoreSize(ElemTy)) {
+              Invariants.insert(GV);
+              DEBUG(dbgs() << "Found a global var that is an invariant: " << *GV
+                    << "\n");
+            } else {
+              DEBUG(dbgs() << "Found a global var, but can not treat it as an "
+                    "invariant.\n");
+            }
+          }
+          // Continue even if we do nothing.
+          ++CurInst;
+          continue;
+        }
+
+        DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
+        return false;
+      }
+
+      // Resolve function pointers.
+      Function *Callee = dyn_cast<Function>(getVal(CS.getCalledValue()));
+      if (!Callee || Callee->mayBeOverridden()) {
+        DEBUG(dbgs() << "Can not resolve function pointer.\n");
+        return false;  // Cannot resolve.
+      }
+
+      SmallVector<Constant*, 8> Formals;
+      for (User::op_iterator i = CS.arg_begin(), e = CS.arg_end(); i != e; ++i)
+        Formals.push_back(getVal(*i));
+
+      if (Callee->isDeclaration()) {
+        // If this is a function we can constant fold, do it.
+        if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) {
+          InstResult = C;
+          DEBUG(dbgs() << "Constant folded function call. Result: " <<
+                *InstResult << "\n");
+        } else {
+          DEBUG(dbgs() << "Can not constant fold function call.\n");
+          return false;
+        }
+      } else {
+        if (Callee->getFunctionType()->isVarArg()) {
+          DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
+          return false;
+        }
+
+        Constant *RetVal = 0;
+        // Execute the call, if successful, use the return value.
+        ValueStack.push_back(new DenseMap<Value*, Constant*>);
+        if (!EvaluateFunction(Callee, RetVal, Formals)) {
+          DEBUG(dbgs() << "Failed to evaluate function.\n");
+          return false;
+        }
+        delete ValueStack.pop_back_val();
+        InstResult = RetVal;
+
+        if (InstResult != NULL) {
+          DEBUG(dbgs() << "Successfully evaluated function. Result: " <<
+                InstResult << "\n\n");
+        } else {
+          DEBUG(dbgs() << "Successfully evaluated function. Result: 0\n\n");
+        }
+      }
+    } else if (isa<TerminatorInst>(CurInst)) {
+      DEBUG(dbgs() << "Found a terminator instruction.\n");
+
+      if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
+        if (BI->isUnconditional()) {
+          NextBB = BI->getSuccessor(0);
+        } else {
+          ConstantInt *Cond =
+            dyn_cast<ConstantInt>(getVal(BI->getCondition()));
+          if (!Cond) return false;  // Cannot determine.
+
+          NextBB = BI->getSuccessor(!Cond->getZExtValue());
+        }
+      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
+        ConstantInt *Val =
+          dyn_cast<ConstantInt>(getVal(SI->getCondition()));
+        if (!Val) return false;  // Cannot determine.
+        NextBB = SI->findCaseValue(Val).getCaseSuccessor();
+      } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
+        Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
+        if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
+          NextBB = BA->getBasicBlock();
+        else
+          return false;  // Cannot determine.
+      } else if (isa<ReturnInst>(CurInst)) {
+        NextBB = 0;
+      } else {
+        // invoke, unwind, resume, unreachable.
+        DEBUG(dbgs() << "Can not handle terminator.");
+        return false;  // Cannot handle this terminator.
+      }
+
+      // We succeeded at evaluating this block!
+      DEBUG(dbgs() << "Successfully evaluated block.\n");
+      return true;
+    } else {
+      // Did not know how to evaluate this!
+      DEBUG(dbgs() << "Failed to evaluate block due to unhandled instruction."
+            "\n");
+      return false;
+    }
+
+    if (!CurInst->use_empty()) {
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult))
+        InstResult = ConstantFoldConstantExpression(CE, TD, TLI);
+
+      setVal(CurInst, InstResult);
+    }
+
+    // If we just processed an invoke, we finished evaluating the block.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
+      NextBB = II->getNormalDest();
+      DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
+      return true;
+    }
+
+    // Advance program counter.
+    ++CurInst;
+  }
+}
+
+/// EvaluateFunction - Evaluate a call to function F, returning true if
+/// successful, false if we can't evaluate it.  ActualArgs contains the formal
+/// arguments for the function.
+bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
+                                 const SmallVectorImpl<Constant*> &ActualArgs) {
+  // Check to see if this function is already executing (recursion).  If so,
+  // bail out.  TODO: we might want to accept limited recursion.
+  if (std::find(CallStack.begin(), CallStack.end(), F) != CallStack.end())
+    return false;
+
+  CallStack.push_back(F);
+
+  // Initialize arguments to the incoming values specified.
+  unsigned ArgNo = 0;
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+       ++AI, ++ArgNo)
+    setVal(AI, ActualArgs[ArgNo]);
+
+  // ExecutedBlocks - We only handle non-looping, non-recursive code.  As such,
+  // we can only evaluate any one basic block at most once.  This set keeps
+  // track of what we have executed so we can detect recursive cases etc.
+  SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
+
+  // CurBB - The current basic block we're evaluating.
+  BasicBlock *CurBB = F->begin();
+
+  BasicBlock::iterator CurInst = CurBB->begin();
+
+  while (1) {
+    BasicBlock *NextBB = 0; // Initialized to avoid compiler warnings.
+    DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
+
+    if (!EvaluateBlock(CurInst, NextBB))
+      return false;
+
+    if (NextBB == 0) {
+      // Successfully running until there's no next block means that we found
+      // the return.  Fill it the return value and pop the call stack.
+      ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator());
+      if (RI->getNumOperands())
+        RetVal = getVal(RI->getOperand(0));
+      CallStack.pop_back();
+      return true;
+    }
+
+    // Okay, we succeeded in evaluating this control flow.  See if we have
+    // executed the new block before.  If so, we have a looping function,
+    // which we cannot evaluate in reasonable time.
+    if (!ExecutedBlocks.insert(NextBB))
+      return false;  // looped!
+
+    // Okay, we have never been in this block before.  Check to see if there
+    // are any PHI nodes.  If so, evaluate them with information about where
+    // we came from.
+    PHINode *PN = 0;
+    for (CurInst = NextBB->begin();
+         (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
+      setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB)));
+
+    // Advance to the next block.
+    CurBB = NextBB;
+  }
+}
+
+/// EvaluateStaticConstructor - Evaluate static constructors in the function, if
+/// we can.  Return true if we can, false otherwise.
+static bool EvaluateStaticConstructor(Function *F, const DataLayout *TD,
+                                      const TargetLibraryInfo *TLI) {
+  // Call the function.
+  Evaluator Eval(TD, TLI);
+  Constant *RetValDummy;
+  bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
+                                           SmallVector<Constant*, 0>());
+
+  if (EvalSuccess) {
+    // We succeeded at evaluation: commit the result.
+    DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
+          << F->getName() << "' to " << Eval.getMutatedMemory().size()
+          << " stores.\n");
+    for (DenseMap<Constant*, Constant*>::const_iterator I =
+           Eval.getMutatedMemory().begin(), E = Eval.getMutatedMemory().end();
+         I != E; ++I)
+      CommitValueTo(I->second, I->first);
+    for (SmallPtrSet<GlobalVariable*, 8>::const_iterator I =
+           Eval.getInvariants().begin(), E = Eval.getInvariants().end();
+         I != E; ++I)
+      (*I)->setConstant(true);
+  }
+
+  return EvalSuccess;
+}
+
+/// OptimizeGlobalCtorsList - Simplify and evaluation global ctors if possible.
+/// Return true if anything changed.
+bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) {
+  std::vector<Function*> Ctors = ParseGlobalCtors(GCL);
+  bool MadeChange = false;
+  if (Ctors.empty()) return false;
+
+  // Loop over global ctors, optimizing them when we can.
+  for (unsigned i = 0; i != Ctors.size(); ++i) {
+    Function *F = Ctors[i];
+    // Found a null terminator in the middle of the list, prune off the rest of
+    // the list.
+    if (F == 0) {
+      if (i != Ctors.size()-1) {
+        Ctors.resize(i+1);
+        MadeChange = true;
+      }
+      break;
+    }
+    DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
+
+    // We cannot simplify external ctor functions.
+    if (F->empty()) continue;
+
+    // If we can evaluate the ctor at compile time, do.
+    if (EvaluateStaticConstructor(F, TD, TLI)) {
+      Ctors.erase(Ctors.begin()+i);
+      MadeChange = true;
+      --i;
+      ++NumCtorsEvaluated;
+      continue;
+    }
+  }
+
+  if (!MadeChange) return false;
+
+  GCL = InstallGlobalCtors(GCL, Ctors);
+  return true;
+}
+
+bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
+  bool Changed = false;
+
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+       I != E;) {
+    Module::alias_iterator J = I++;
+    // Aliases without names cannot be referenced outside this module.
+    if (!J->hasName() && !J->isDeclaration())
+      J->setLinkage(GlobalValue::InternalLinkage);
+    // If the aliasee may change at link time, nothing can be done - bail out.
+    if (J->mayBeOverridden())
+      continue;
+
+    Constant *Aliasee = J->getAliasee();
+    GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
+    Target->removeDeadConstantUsers();
+    bool hasOneUse = Target->hasOneUse() && Aliasee->hasOneUse();
+
+    // Make all users of the alias use the aliasee instead.
+    if (!J->use_empty()) {
+      J->replaceAllUsesWith(Aliasee);
+      ++NumAliasesResolved;
+      Changed = true;
+    }
+
+    // If the alias is externally visible, we may still be able to simplify it.
+    if (!J->hasLocalLinkage()) {
+      // If the aliasee has internal linkage, give it the name and linkage
+      // of the alias, and delete the alias.  This turns:
+      //   define internal ... @f(...)
+      //   @a = alias ... @f
+      // into:
+      //   define ... @a(...)
+      if (!Target->hasLocalLinkage())
+        continue;
+
+      // Do not perform the transform if multiple aliases potentially target the
+      // aliasee. This check also ensures that it is safe to replace the section
+      // and other attributes of the aliasee with those of the alias.
+      if (!hasOneUse)
+        continue;
+
+      // Give the aliasee the name, linkage and other attributes of the alias.
+      Target->takeName(J);
+      Target->setLinkage(J->getLinkage());
+      Target->GlobalValue::copyAttributesFrom(J);
+    }
+
+    // Delete the alias.
+    M.getAliasList().erase(J);
+    ++NumAliasesRemoved;
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::cxa_atexit))
+    return 0;
+
+  Function *Fn = M.getFunction(TLI->getName(LibFunc::cxa_atexit));
+
+  if (!Fn)
+    return 0;
+
+  FunctionType *FTy = Fn->getFunctionType();
+
+  // Checking that the function has the right return type, the right number of
+  // parameters and that they all have pointer types should be enough.
+  if (!FTy->getReturnType()->isIntegerTy() ||
+      FTy->getNumParams() != 3 ||
+      !FTy->getParamType(0)->isPointerTy() ||
+      !FTy->getParamType(1)->isPointerTy() ||
+      !FTy->getParamType(2)->isPointerTy())
+    return 0;
+
+  return Fn;
+}
+
+/// cxxDtorIsEmpty - Returns whether the given function is an empty C++
+/// destructor and can therefore be eliminated.
+/// Note that we assume that other optimization passes have already simplified
+/// the code so we only look for a function with a single basic block, where
+/// the only allowed instructions are 'ret', 'call' to an empty C++ dtor and
+/// other side-effect free instructions.
+static bool cxxDtorIsEmpty(const Function &Fn,
+                           SmallPtrSet<const Function *, 8> &CalledFunctions) {
+  // FIXME: We could eliminate C++ destructors if they're readonly/readnone and
+  // nounwind, but that doesn't seem worth doing.
+  if (Fn.isDeclaration())
+    return false;
+
+  if (++Fn.begin() != Fn.end())
+    return false;
+
+  const BasicBlock &EntryBlock = Fn.getEntryBlock();
+  for (BasicBlock::const_iterator I = EntryBlock.begin(), E = EntryBlock.end();
+       I != E; ++I) {
+    if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+      // Ignore debug intrinsics.
+      if (isa<DbgInfoIntrinsic>(CI))
+        continue;
+
+      const Function *CalledFn = CI->getCalledFunction();
+
+      if (!CalledFn)
+        return false;
+
+      SmallPtrSet<const Function *, 8> NewCalledFunctions(CalledFunctions);
+
+      // Don't treat recursive functions as empty.
+      if (!NewCalledFunctions.insert(CalledFn))
+        return false;
+
+      if (!cxxDtorIsEmpty(*CalledFn, NewCalledFunctions))
+        return false;
+    } else if (isa<ReturnInst>(*I))
+      return true; // We're done.
+    else if (I->mayHaveSideEffects())
+      return false; // Destructor with side effects, bail.
+  }
+
+  return false;
+}
+
+bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
+  /// Itanium C++ ABI p3.3.5:
+  ///
+  ///   After constructing a global (or local static) object, that will require
+  ///   destruction on exit, a termination function is registered as follows:
+  ///
+  ///   extern "C" int __cxa_atexit ( void (*f)(void *), void *p, void *d );
+  ///
+  ///   This registration, e.g. __cxa_atexit(f,p,d), is intended to cause the
+  ///   call f(p) when DSO d is unloaded, before all such termination calls
+  ///   registered before this one. It returns zero if registration is
+  ///   successful, nonzero on failure.
+
+  // This pass will look for calls to __cxa_atexit where the function is trivial
+  // and remove them.
+  bool Changed = false;
+
+  for (Function::use_iterator I = CXAAtExitFn->use_begin(),
+       E = CXAAtExitFn->use_end(); I != E;) {
+    // We're only interested in calls. Theoretically, we could handle invoke
+    // instructions as well, but neither llvm-gcc nor clang generate invokes
+    // to __cxa_atexit.
+    CallInst *CI = dyn_cast<CallInst>(*I++);
+    if (!CI)
+      continue;
+
+    Function *DtorFn =
+      dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
+    if (!DtorFn)
+      continue;
+
+    SmallPtrSet<const Function *, 8> CalledFunctions;
+    if (!cxxDtorIsEmpty(*DtorFn, CalledFunctions))
+      continue;
+
+    // Just remove the call.
+    CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+    CI->eraseFromParent();
+
+    ++NumCXXDtorsRemoved;
+
+    Changed |= true;
+  }
+
+  return Changed;
+}
+
+bool GlobalOpt::runOnModule(Module &M) {
+  bool Changed = false;
+
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+
+  // Try to find the llvm.globalctors list.
+  GlobalVariable *GlobalCtors = FindGlobalCtors(M);
+
+  Function *CXAAtExitFn = FindCXAAtExit(M, TLI);
+
+  bool LocalChange = true;
+  while (LocalChange) {
+    LocalChange = false;
+
+    // Delete functions that are trivially dead, ccc -> fastcc
+    LocalChange |= OptimizeFunctions(M);
+
+    // Optimize global_ctors list.
+    if (GlobalCtors)
+      LocalChange |= OptimizeGlobalCtorsList(GlobalCtors);
+
+    // Optimize non-address-taken globals.
+    LocalChange |= OptimizeGlobalVars(M);
+
+    // Resolve aliases, when possible.
+    LocalChange |= OptimizeGlobalAliases(M);
+
+    // Try to remove trivial global destructors.
+    if (CXAAtExitFn)
+      LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
+
+    Changed |= LocalChange;
+  }
+
+  // TODO: Move all global ctors functions to the end of the module for code
+  // layout.
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
new file mode 100644
index 000000000000..4ac1dfc09682
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -0,0 +1,279 @@
+//===-- IPConstantPropagation.cpp - Propagate constants through calls -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an _extremely_ simple interprocedural constant
+// propagation pass.  It could certainly be improved in many different ways,
+// like using a worklist.  This pass makes arguments dead, but does not remove
+// them.  The existing dead argument elimination pass should be run after this
+// to clean up the mess.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ipconstprop"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+using namespace llvm;
+
+STATISTIC(NumArgumentsProped, "Number of args turned into constants");
+STATISTIC(NumReturnValProped, "Number of return values turned into constants");
+
+namespace {
+  /// IPCP - The interprocedural constant propagation pass
+  ///
+  struct IPCP : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    IPCP() : ModulePass(ID) {
+      initializeIPCPPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnModule(Module &M);
+  private:
+    bool PropagateConstantsIntoArguments(Function &F);
+    bool PropagateConstantReturn(Function &F);
+  };
+}
+
+char IPCP::ID = 0;
+INITIALIZE_PASS(IPCP, "ipconstprop",
+                "Interprocedural constant propagation", false, false)
+
+ModulePass *llvm::createIPConstantPropagationPass() { return new IPCP(); }
+
+bool IPCP::runOnModule(Module &M) {
+  bool Changed = false;
+  bool LocalChange = true;
+
+  // FIXME: instead of using smart algorithms, we just iterate until we stop
+  // making changes.
+  while (LocalChange) {
+    LocalChange = false;
+    for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+      if (!I->isDeclaration()) {
+        // Delete any klingons.
+        I->removeDeadConstantUsers();
+        if (I->hasLocalLinkage())
+          LocalChange |= PropagateConstantsIntoArguments(*I);
+        Changed |= PropagateConstantReturn(*I);
+      }
+    Changed |= LocalChange;
+  }
+  return Changed;
+}
+
+/// PropagateConstantsIntoArguments - Look at all uses of the specified
+/// function.  If all uses are direct call sites, and all pass a particular
+/// constant in for an argument, propagate that constant in as the argument.
+///
+bool IPCP::PropagateConstantsIntoArguments(Function &F) {
+  if (F.arg_empty() || F.use_empty()) return false; // No arguments? Early exit.
+
+  // For each argument, keep track of its constant value and whether it is a
+  // constant or not.  The bool is driven to true when found to be non-constant.
+  SmallVector<std::pair<Constant*, bool>, 16> ArgumentConstants;
+  ArgumentConstants.resize(F.arg_size());
+
+  unsigned NumNonconstant = 0;
+  for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) {
+    User *U = *UI;
+    // Ignore blockaddress uses.
+    if (isa<BlockAddress>(U)) continue;
+    
+    // Used by a non-instruction, or not the callee of a function, do not
+    // transform.
+    if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
+      return false;
+    
+    CallSite CS(cast<Instruction>(U));
+    if (!CS.isCallee(UI))
+      return false;
+
+    // Check out all of the potentially constant arguments.  Note that we don't
+    // inspect varargs here.
+    CallSite::arg_iterator AI = CS.arg_begin();
+    Function::arg_iterator Arg = F.arg_begin();
+    for (unsigned i = 0, e = ArgumentConstants.size(); i != e;
+         ++i, ++AI, ++Arg) {
+      
+      // If this argument is known non-constant, ignore it.
+      if (ArgumentConstants[i].second)
+        continue;
+      
+      Constant *C = dyn_cast<Constant>(*AI);
+      if (C && ArgumentConstants[i].first == 0) {
+        ArgumentConstants[i].first = C;   // First constant seen.
+      } else if (C && ArgumentConstants[i].first == C) {
+        // Still the constant value we think it is.
+      } else if (*AI == &*Arg) {
+        // Ignore recursive calls passing argument down.
+      } else {
+        // Argument became non-constant.  If all arguments are non-constant now,
+        // give up on this function.
+        if (++NumNonconstant == ArgumentConstants.size())
+          return false;
+        ArgumentConstants[i].second = true;
+      }
+    }
+  }
+
+  // If we got to this point, there is a constant argument!
+  assert(NumNonconstant != ArgumentConstants.size());
+  bool MadeChange = false;
+  Function::arg_iterator AI = F.arg_begin();
+  for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI) {
+    // Do we have a constant argument?
+    if (ArgumentConstants[i].second || AI->use_empty() ||
+        (AI->hasByValAttr() && !F.onlyReadsMemory()))
+      continue;
+  
+    Value *V = ArgumentConstants[i].first;
+    if (V == 0) V = UndefValue::get(AI->getType());
+    AI->replaceAllUsesWith(V);
+    ++NumArgumentsProped;
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+
+// Check to see if this function returns one or more constants. If so, replace
+// all callers that use those return values with the constant value. This will
+// leave in the actual return values and instructions, but deadargelim will
+// clean that up.
+//
+// Additionally if a function always returns one of its arguments directly,
+// callers will be updated to use the value they pass in directly instead of
+// using the return value.
+bool IPCP::PropagateConstantReturn(Function &F) {
+  if (F.getReturnType()->isVoidTy())
+    return false; // No return value.
+
+  // If this function could be overridden later in the link stage, we can't
+  // propagate information about its results into callers.
+  if (F.mayBeOverridden())
+    return false;
+    
+  // Check to see if this function returns a constant.
+  SmallVector<Value *,4> RetVals;
+  StructType *STy = dyn_cast<StructType>(F.getReturnType());
+  if (STy)
+    for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i) 
+      RetVals.push_back(UndefValue::get(STy->getElementType(i)));
+  else
+    RetVals.push_back(UndefValue::get(F.getReturnType()));
+
+  unsigned NumNonConstant = 0;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      for (unsigned i = 0, e = RetVals.size(); i != e; ++i) {
+        // Already found conflicting return values?
+        Value *RV = RetVals[i];
+        if (!RV)
+          continue;
+
+        // Find the returned value
+        Value *V;
+        if (!STy)
+          V = RI->getOperand(0);
+        else
+          V = FindInsertedValue(RI->getOperand(0), i);
+
+        if (V) {
+          // Ignore undefs, we can change them into anything
+          if (isa<UndefValue>(V))
+            continue;
+          
+          // Try to see if all the rets return the same constant or argument.
+          if (isa<Constant>(V) || isa<Argument>(V)) {
+            if (isa<UndefValue>(RV)) {
+              // No value found yet? Try the current one.
+              RetVals[i] = V;
+              continue;
+            }
+            // Returning the same value? Good.
+            if (RV == V)
+              continue;
+          }
+        }
+        // Different or no known return value? Don't propagate this return
+        // value.
+        RetVals[i] = 0;
+        // All values non constant? Stop looking.
+        if (++NumNonConstant == RetVals.size())
+          return false;
+      }
+    }
+
+  // If we got here, the function returns at least one constant value.  Loop
+  // over all users, replacing any uses of the return value with the returned
+  // constant.
+  bool MadeChange = false;
+  for (Value::use_iterator UI = F.use_begin(), E = F.use_end(); UI != E; ++UI) {
+    CallSite CS(*UI);
+    Instruction* Call = CS.getInstruction();
+
+    // Not a call instruction or a call instruction that's not calling F
+    // directly?
+    if (!Call || !CS.isCallee(UI))
+      continue;
+    
+    // Call result not used?
+    if (Call->use_empty())
+      continue;
+
+    MadeChange = true;
+
+    if (STy == 0) {
+      Value* New = RetVals[0];
+      if (Argument *A = dyn_cast<Argument>(New))
+        // Was an argument returned? Then find the corresponding argument in
+        // the call instruction and use that.
+        New = CS.getArgument(A->getArgNo());
+      Call->replaceAllUsesWith(New);
+      continue;
+    }
+   
+    for (Value::use_iterator I = Call->use_begin(), E = Call->use_end();
+         I != E;) {
+      Instruction *Ins = cast<Instruction>(*I);
+
+      // Increment now, so we can remove the use
+      ++I;
+
+      // Find the index of the retval to replace with
+      int index = -1;
+      if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Ins))
+        if (EV->hasIndices())
+          index = *EV->idx_begin();
+
+      // If this use uses a specific return value, and we have a replacement,
+      // replace it.
+      if (index != -1) {
+        Value *New = RetVals[index];
+        if (New) {
+          if (Argument *A = dyn_cast<Argument>(New))
+            // Was an argument returned? Then find the corresponding argument in
+            // the call instruction and use that.
+            New = CS.getArgument(A->getArgNo());
+          Ins->replaceAllUsesWith(New);
+          Ins->eraseFromParent();
+        }
+      }
+    }
+  }
+
+  if (MadeChange) ++NumReturnValProped;
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
new file mode 100644
index 000000000000..5d563d8bbf51
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
@@ -0,0 +1,110 @@
+//===-- IPO.cpp -----------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common infrastructure (including C bindings) for 
+// libLLVMIPO.a, which implements several transformations over the LLVM 
+// intermediate representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/IPO.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
+#include "llvm/Transforms/IPO.h"
+
+using namespace llvm;
+
+void llvm::initializeIPO(PassRegistry &Registry) {
+  initializeArgPromotionPass(Registry);
+  initializeConstantMergePass(Registry);
+  initializeDAEPass(Registry);
+  initializeDAHPass(Registry);
+  initializeFunctionAttrsPass(Registry);
+  initializeGlobalDCEPass(Registry);
+  initializeGlobalOptPass(Registry);
+  initializeIPCPPass(Registry);
+  initializeAlwaysInlinerPass(Registry);
+  initializeSimpleInlinerPass(Registry);
+  initializeInternalizePassPass(Registry);
+  initializeLoopExtractorPass(Registry);
+  initializeBlockExtractorPassPass(Registry);
+  initializeSingleLoopExtractorPass(Registry);
+  initializeMergeFunctionsPass(Registry);
+  initializePartialInlinerPass(Registry);
+  initializePruneEHPass(Registry);
+  initializeStripDeadPrototypesPassPass(Registry);
+  initializeStripSymbolsPass(Registry);
+  initializeStripDebugDeclarePass(Registry);
+  initializeStripDeadDebugInfoPass(Registry);
+  initializeStripNonDebugSymbolsPass(Registry);
+}
+
+void LLVMInitializeIPO(LLVMPassRegistryRef R) {
+  initializeIPO(*unwrap(R));
+}
+
+void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createArgumentPromotionPass());
+}
+
+void LLVMAddConstantMergePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createConstantMergePass());
+}
+
+void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDeadArgEliminationPass());
+}
+
+void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createFunctionAttrsPass());
+}
+
+void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createFunctionInliningPass());
+}
+
+void LLVMAddAlwaysInlinerPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(llvm::createAlwaysInlinerPass());
+}
+
+void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createGlobalDCEPass());
+}
+
+void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createGlobalOptimizerPass());
+}
+
+void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createIPConstantPropagationPass());
+}
+
+void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createPruneEHPass());
+}
+
+void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createIPSCCPPass());
+}
+
+void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
+  std::vector<const char *> Export;
+  if (AllButMain)
+    Export.push_back("main");
+  unwrap(PM)->add(createInternalizePass(Export));
+}
+
+void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createStripDeadPrototypesPass());
+}
+
+void LLVMAddStripSymbolsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createStripSymbolsPass());
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp
new file mode 100644
index 000000000000..a0095dad1af7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/InlineAlways.cpp
@@ -0,0 +1,112 @@
+//===- InlineAlways.cpp - Code to inline always_inline functions ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a custom inliner that handles only functions that
+// are marked as "always inline".
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Inliner pass which only handles "always inline" functions.
+class AlwaysInliner : public Inliner {
+  InlineCostAnalysis *ICA;
+
+public:
+  // Use extremely low threshold.
+  AlwaysInliner() : Inliner(ID, -2000000000, /*InsertLifetime*/ true), ICA(0) {
+    initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  AlwaysInliner(bool InsertLifetime)
+      : Inliner(ID, -2000000000, InsertLifetime), ICA(0) {
+    initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  static char ID; // Pass identification, replacement for typeid
+
+  virtual InlineCost getInlineCost(CallSite CS);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  virtual bool runOnSCC(CallGraphSCC &SCC);
+
+  using llvm::Pass::doFinalization;
+  virtual bool doFinalization(CallGraph &CG) {
+    return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/ true);
+  }
+};
+
+}
+
+char AlwaysInliner::ID = 0;
+INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
+                "Inliner for always_inline functions", false, false)
+INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
+INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
+                "Inliner for always_inline functions", false, false)
+
+Pass *llvm::createAlwaysInlinerPass() { return new AlwaysInliner(); }
+
+Pass *llvm::createAlwaysInlinerPass(bool InsertLifetime) {
+  return new AlwaysInliner(InsertLifetime);
+}
+
+/// \brief Get the inline cost for the always-inliner.
+///
+/// The always inliner *only* handles functions which are marked with the
+/// attribute to force inlining. As such, it is dramatically simpler and avoids
+/// using the powerful (but expensive) inline cost analysis. Instead it uses
+/// a very simple and boring direct walk of the instructions looking for
+/// impossible-to-inline constructs.
+///
+/// Note, it would be possible to go to some lengths to cache the information
+/// computed here, but as we only expect to do this for relatively few and
+/// small functions which have the explicit attribute to force inlining, it is
+/// likely not worth it in practice.
+InlineCost AlwaysInliner::getInlineCost(CallSite CS) {
+  Function *Callee = CS.getCalledFunction();
+
+  // Only inline direct calls to functions with always-inline attributes
+  // that are viable for inlining. FIXME: We shouldn't even get here for
+  // declarations.
+  if (Callee && !Callee->isDeclaration() &&
+      Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                           Attribute::AlwaysInline) &&
+      ICA->isInlineViable(*Callee))
+    return InlineCost::getAlways();
+
+  return InlineCost::getNever();
+}
+
+bool AlwaysInliner::runOnSCC(CallGraphSCC &SCC) {
+  ICA = &getAnalysis<InlineCostAnalysis>();
+  return Inliner::runOnSCC(SCC);
+}
+
+void AlwaysInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<InlineCostAnalysis>();
+  Inliner::getAnalysisUsage(AU);
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
new file mode 100644
index 000000000000..a4f702604188
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
@@ -0,0 +1,83 @@
+//===- InlineSimple.cpp - Code to perform simple function inlining --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bottom-up inlining of functions into callees.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Actaul inliner pass implementation.
+///
+/// The common implementation of the inlining logic is shared between this
+/// inliner pass and the always inliner pass. The two passes use different cost
+/// analyses to determine when to inline.
+class SimpleInliner : public Inliner {
+  InlineCostAnalysis *ICA;
+
+public:
+  SimpleInliner() : Inliner(ID), ICA(0) {
+    initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  SimpleInliner(int Threshold)
+      : Inliner(ID, Threshold, /*InsertLifetime*/ true), ICA(0) {
+    initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  static char ID; // Pass identification, replacement for typeid
+
+  InlineCost getInlineCost(CallSite CS) {
+    return ICA->getInlineCost(CS, getInlineThreshold(CS));
+  }
+
+  virtual bool runOnSCC(CallGraphSCC &SCC);
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+};
+
+} // end anonymous namespace
+
+char SimpleInliner::ID = 0;
+INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
+                "Function Integration/Inlining", false, false)
+INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
+INITIALIZE_PASS_END(SimpleInliner, "inline",
+                "Function Integration/Inlining", false, false)
+
+Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
+
+Pass *llvm::createFunctionInliningPass(int Threshold) {
+  return new SimpleInliner(Threshold);
+}
+
+bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
+  ICA = &getAnalysis<InlineCostAnalysis>();
+  return Inliner::runOnSCC(SCC);
+}
+
+void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<InlineCostAnalysis>();
+  Inliner::getAnalysisUsage(AU);
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
new file mode 100644
index 000000000000..663ddb75f423
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -0,0 +1,613 @@
+//===- Inliner.cpp - Code common to all inliners --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the mechanics required to implement inlining without
+// missing any calls and updating the call graph.  The decisions of which calls
+// are profitable to inline are implemented elsewhere.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "inline"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumInlined, "Number of functions inlined");
+STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
+STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
+STATISTIC(NumMergedAllocas, "Number of allocas merged together");
+
+// This weirdly named statistic tracks the number of times that, when attempting
+// to inline a function A into B, we analyze the callers of B in order to see
+// if those would be more profitable and blocked inline steps.
+STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
+
+static cl::opt<int>
+InlineLimit("inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
+        cl::desc("Control the amount of inlining to perform (default = 225)"));
+
+static cl::opt<int>
+HintThreshold("inlinehint-threshold", cl::Hidden, cl::init(325),
+              cl::desc("Threshold for inlining functions with inline hint"));
+
+// Threshold to use when optsize is specified (and there is no -inline-limit).
+const int OptSizeThreshold = 75;
+
+Inliner::Inliner(char &ID) 
+  : CallGraphSCCPass(ID), InlineThreshold(InlineLimit), InsertLifetime(true) {}
+
+Inliner::Inliner(char &ID, int Threshold, bool InsertLifetime)
+  : CallGraphSCCPass(ID), InlineThreshold(InlineLimit.getNumOccurrences() > 0 ?
+                                          InlineLimit : Threshold),
+    InsertLifetime(InsertLifetime) {}
+
+/// getAnalysisUsage - For this class, we declare that we require and preserve
+/// the call graph.  If the derived class implements this method, it should
+/// always explicitly call the implementation here.
+void Inliner::getAnalysisUsage(AnalysisUsage &AU) const {
+  CallGraphSCCPass::getAnalysisUsage(AU);
+}
+
+
+typedef DenseMap<ArrayType*, std::vector<AllocaInst*> >
+InlinedArrayAllocasTy;
+
+/// \brief If the inlined function had a higher stack protection level than the
+/// calling function, then bump up the caller's stack protection level.
+static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) {
+  // If upgrading the SSP attribute, clear out the old SSP Attributes first.
+  // Having multiple SSP attributes doesn't actually hurt, but it adds useless
+  // clutter to the IR.
+  AttrBuilder B;
+  B.addAttribute(Attribute::StackProtect)
+    .addAttribute(Attribute::StackProtectStrong);
+  AttributeSet OldSSPAttr = AttributeSet::get(Caller->getContext(),
+                                              AttributeSet::FunctionIndex,
+                                              B);
+  AttributeSet CallerAttr = Caller->getAttributes(),
+               CalleeAttr = Callee->getAttributes();
+
+  if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
+                              Attribute::StackProtectReq)) {
+    Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller->addFnAttr(Attribute::StackProtectReq);
+  } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackProtectStrong) &&
+             !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::StackProtectReq)) {
+    Caller->removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller->addFnAttr(Attribute::StackProtectStrong);
+  } else if (CalleeAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackProtect) &&
+           !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                    Attribute::StackProtectReq) &&
+           !CallerAttr.hasAttribute(AttributeSet::FunctionIndex,
+                                    Attribute::StackProtectStrong))
+    Caller->addFnAttr(Attribute::StackProtect);
+}
+
+/// InlineCallIfPossible - If it is possible to inline the specified call site,
+/// do so and update the CallGraph for this operation.
+///
+/// This function also does some basic book-keeping to update the IR.  The
+/// InlinedArrayAllocas map keeps track of any allocas that are already
+/// available from other  functions inlined into the caller.  If we are able to
+/// inline this call site we attempt to reuse already available allocas or add
+/// any new allocas to the set if not possible.
+static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
+                                 InlinedArrayAllocasTy &InlinedArrayAllocas,
+                                 int InlineHistory, bool InsertLifetime) {
+  Function *Callee = CS.getCalledFunction();
+  Function *Caller = CS.getCaller();
+
+  // Try to inline the function.  Get the list of static allocas that were
+  // inlined.
+  if (!InlineFunction(CS, IFI, InsertLifetime))
+    return false;
+
+  AdjustCallerSSPLevel(Caller, Callee);
+
+  // Look at all of the allocas that we inlined through this call site.  If we
+  // have already inlined other allocas through other calls into this function,
+  // then we know that they have disjoint lifetimes and that we can merge them.
+  //
+  // There are many heuristics possible for merging these allocas, and the
+  // different options have different tradeoffs.  One thing that we *really*
+  // don't want to hurt is SRoA: once inlining happens, often allocas are no
+  // longer address taken and so they can be promoted.
+  //
+  // Our "solution" for that is to only merge allocas whose outermost type is an
+  // array type.  These are usually not promoted because someone is using a
+  // variable index into them.  These are also often the most important ones to
+  // merge.
+  //
+  // A better solution would be to have real memory lifetime markers in the IR
+  // and not have the inliner do any merging of allocas at all.  This would
+  // allow the backend to do proper stack slot coloring of all allocas that
+  // *actually make it to the backend*, which is really what we want.
+  //
+  // Because we don't have this information, we do this simple and useful hack.
+  //
+  SmallPtrSet<AllocaInst*, 16> UsedAllocas;
+  
+  // When processing our SCC, check to see if CS was inlined from some other
+  // call site.  For example, if we're processing "A" in this code:
+  //   A() { B() }
+  //   B() { x = alloca ... C() }
+  //   C() { y = alloca ... }
+  // Assume that C was not inlined into B initially, and so we're processing A
+  // and decide to inline B into A.  Doing this makes an alloca available for
+  // reuse and makes a callsite (C) available for inlining.  When we process
+  // the C call site we don't want to do any alloca merging between X and Y
+  // because their scopes are not disjoint.  We could make this smarter by
+  // keeping track of the inline history for each alloca in the
+  // InlinedArrayAllocas but this isn't likely to be a significant win.
+  if (InlineHistory != -1)  // Only do merging for top-level call sites in SCC.
+    return true;
+  
+  // Loop over all the allocas we have so far and see if they can be merged with
+  // a previously inlined alloca.  If not, remember that we had it.
+  for (unsigned AllocaNo = 0, e = IFI.StaticAllocas.size();
+       AllocaNo != e; ++AllocaNo) {
+    AllocaInst *AI = IFI.StaticAllocas[AllocaNo];
+    
+    // Don't bother trying to merge array allocations (they will usually be
+    // canonicalized to be an allocation *of* an array), or allocations whose
+    // type is not itself an array (because we're afraid of pessimizing SRoA).
+    ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
+    if (ATy == 0 || AI->isArrayAllocation())
+      continue;
+    
+    // Get the list of all available allocas for this array type.
+    std::vector<AllocaInst*> &AllocasForType = InlinedArrayAllocas[ATy];
+    
+    // Loop over the allocas in AllocasForType to see if we can reuse one.  Note
+    // that we have to be careful not to reuse the same "available" alloca for
+    // multiple different allocas that we just inlined, we use the 'UsedAllocas'
+    // set to keep track of which "available" allocas are being used by this
+    // function.  Also, AllocasForType can be empty of course!
+    bool MergedAwayAlloca = false;
+    for (unsigned i = 0, e = AllocasForType.size(); i != e; ++i) {
+      AllocaInst *AvailableAlloca = AllocasForType[i];
+      
+      // The available alloca has to be in the right function, not in some other
+      // function in this SCC.
+      if (AvailableAlloca->getParent() != AI->getParent())
+        continue;
+      
+      // If the inlined function already uses this alloca then we can't reuse
+      // it.
+      if (!UsedAllocas.insert(AvailableAlloca))
+        continue;
+      
+      // Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
+      // success!
+      DEBUG(dbgs() << "    ***MERGED ALLOCA: " << *AI << "\n\t\tINTO: "
+                   << *AvailableAlloca << '\n');
+      
+      AI->replaceAllUsesWith(AvailableAlloca);
+      AI->eraseFromParent();
+      MergedAwayAlloca = true;
+      ++NumMergedAllocas;
+      IFI.StaticAllocas[AllocaNo] = 0;
+      break;
+    }
+
+    // If we already nuked the alloca, we're done with it.
+    if (MergedAwayAlloca)
+      continue;
+    
+    // If we were unable to merge away the alloca either because there are no
+    // allocas of the right type available or because we reused them all
+    // already, remember that this alloca came from an inlined function and mark
+    // it used so we don't reuse it for other allocas from this inline
+    // operation.
+    AllocasForType.push_back(AI);
+    UsedAllocas.insert(AI);
+  }
+  
+  return true;
+}
+
+unsigned Inliner::getInlineThreshold(CallSite CS) const {
+  int thres = InlineThreshold; // -inline-threshold or else selected by
+                               // overall opt level
+
+  // If -inline-threshold is not given, listen to the optsize attribute when it
+  // would decrease the threshold.
+  Function *Caller = CS.getCaller();
+  bool OptSize = Caller && !Caller->isDeclaration() &&
+    Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                         Attribute::OptimizeForSize);
+  if (!(InlineLimit.getNumOccurrences() > 0) && OptSize &&
+      OptSizeThreshold < thres)
+    thres = OptSizeThreshold;
+
+  // Listen to the inlinehint attribute when it would increase the threshold
+  // and the caller does not need to minimize its size.
+  Function *Callee = CS.getCalledFunction();
+  bool InlineHint = Callee && !Callee->isDeclaration() &&
+    Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                         Attribute::InlineHint);
+  if (InlineHint && HintThreshold > thres
+      && !Caller->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                               Attribute::MinSize))
+    thres = HintThreshold;
+
+  return thres;
+}
+
+/// shouldInline - Return true if the inliner should attempt to inline
+/// at the given CallSite.
+bool Inliner::shouldInline(CallSite CS) {
+  InlineCost IC = getInlineCost(CS);
+  
+  if (IC.isAlways()) {
+    DEBUG(dbgs() << "    Inlining: cost=always"
+          << ", Call: " << *CS.getInstruction() << "\n");
+    return true;
+  }
+  
+  if (IC.isNever()) {
+    DEBUG(dbgs() << "    NOT Inlining: cost=never"
+          << ", Call: " << *CS.getInstruction() << "\n");
+    return false;
+  }
+  
+  Function *Caller = CS.getCaller();
+  if (!IC) {
+    DEBUG(dbgs() << "    NOT Inlining: cost=" << IC.getCost()
+          << ", thres=" << (IC.getCostDelta() + IC.getCost())
+          << ", Call: " << *CS.getInstruction() << "\n");
+    return false;
+  }
+  
+  // Try to detect the case where the current inlining candidate caller (call
+  // it B) is a static or linkonce-ODR function and is an inlining candidate
+  // elsewhere, and the current candidate callee (call it C) is large enough
+  // that inlining it into B would make B too big to inline later. In these
+  // circumstances it may be best not to inline C into B, but to inline B into
+  // its callers.
+  //
+  // This only applies to static and linkonce-ODR functions because those are
+  // expected to be available for inlining in the translation units where they
+  // are used. Thus we will always have the opportunity to make local inlining
+  // decisions. Importantly the linkonce-ODR linkage covers inline functions
+  // and templates in C++.
+  //
+  // FIXME: All of this logic should be sunk into getInlineCost. It relies on
+  // the internal implementation of the inline cost metrics rather than
+  // treating them as truly abstract units etc.
+  if (Caller->hasLocalLinkage() ||
+      Caller->getLinkage() == GlobalValue::LinkOnceODRLinkage) {
+    int TotalSecondaryCost = 0;
+    // The candidate cost to be imposed upon the current function.
+    int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1);
+    // This bool tracks what happens if we do NOT inline C into B.
+    bool callerWillBeRemoved = Caller->hasLocalLinkage();
+    // This bool tracks what happens if we DO inline C into B.
+    bool inliningPreventsSomeOuterInline = false;
+    for (Value::use_iterator I = Caller->use_begin(), E =Caller->use_end(); 
+         I != E; ++I) {
+      CallSite CS2(*I);
+
+      // If this isn't a call to Caller (it could be some other sort
+      // of reference) skip it.  Such references will prevent the caller
+      // from being removed.
+      if (!CS2 || CS2.getCalledFunction() != Caller) {
+        callerWillBeRemoved = false;
+        continue;
+      }
+
+      InlineCost IC2 = getInlineCost(CS2);
+      ++NumCallerCallersAnalyzed;
+      if (!IC2) {
+        callerWillBeRemoved = false;
+        continue;
+      }
+      if (IC2.isAlways())
+        continue;
+
+      // See if inlining or original callsite would erase the cost delta of
+      // this callsite. We subtract off the penalty for the call instruction,
+      // which we would be deleting.
+      if (IC2.getCostDelta() <= CandidateCost) {
+        inliningPreventsSomeOuterInline = true;
+        TotalSecondaryCost += IC2.getCost();
+      }
+    }
+    // If all outer calls to Caller would get inlined, the cost for the last
+    // one is set very low by getInlineCost, in anticipation that Caller will
+    // be removed entirely.  We did not account for this above unless there
+    // is only one caller of Caller.
+    if (callerWillBeRemoved && Caller->use_begin() != Caller->use_end())
+      TotalSecondaryCost += InlineConstants::LastCallToStaticBonus;
+
+    if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) {
+      DEBUG(dbgs() << "    NOT Inlining: " << *CS.getInstruction() <<
+           " Cost = " << IC.getCost() <<
+           ", outer Cost = " << TotalSecondaryCost << '\n');
+      return false;
+    }
+  }
+
+  DEBUG(dbgs() << "    Inlining: cost=" << IC.getCost()
+        << ", thres=" << (IC.getCostDelta() + IC.getCost())
+        << ", Call: " << *CS.getInstruction() << '\n');
+  return true;
+}
+
+/// InlineHistoryIncludes - Return true if the specified inline history ID
+/// indicates an inline history that includes the specified function.
+static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
+            const SmallVectorImpl<std::pair<Function*, int> > &InlineHistory) {
+  while (InlineHistoryID != -1) {
+    assert(unsigned(InlineHistoryID) < InlineHistory.size() &&
+           "Invalid inline history ID");
+    if (InlineHistory[InlineHistoryID].first == F)
+      return true;
+    InlineHistoryID = InlineHistory[InlineHistoryID].second;
+  }
+  return false;
+}
+
+bool Inliner::runOnSCC(CallGraphSCC &SCC) {
+  CallGraph &CG = getAnalysis<CallGraph>();
+  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+
+  SmallPtrSet<Function*, 8> SCCFunctions;
+  DEBUG(dbgs() << "Inliner visiting SCC:");
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+    if (F) SCCFunctions.insert(F);
+    DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
+  }
+
+  // Scan through and identify all call sites ahead of time so that we only
+  // inline call sites in the original functions, not call sites that result
+  // from inlining other functions.
+  SmallVector<std::pair<CallSite, int>, 16> CallSites;
+  
+  // When inlining a callee produces new call sites, we want to keep track of
+  // the fact that they were inlined from the callee.  This allows us to avoid
+  // infinite inlining in some obscure cases.  To represent this, we use an
+  // index into the InlineHistory vector.
+  SmallVector<std::pair<Function*, int>, 8> InlineHistory;
+
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    Function *F = (*I)->getFunction();
+    if (!F) continue;
+    
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+        CallSite CS(cast<Value>(I));
+        // If this isn't a call, or it is a call to an intrinsic, it can
+        // never be inlined.
+        if (!CS || isa<IntrinsicInst>(I))
+          continue;
+        
+        // If this is a direct call to an external function, we can never inline
+        // it.  If it is an indirect call, inlining may resolve it to be a
+        // direct call, so we keep it.
+        if (CS.getCalledFunction() && CS.getCalledFunction()->isDeclaration())
+          continue;
+        
+        CallSites.push_back(std::make_pair(CS, -1));
+      }
+  }
+
+  DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
+
+  // If there are no calls in this function, exit early.
+  if (CallSites.empty())
+    return false;
+  
+  // Now that we have all of the call sites, move the ones to functions in the
+  // current SCC to the end of the list.
+  unsigned FirstCallInSCC = CallSites.size();
+  for (unsigned i = 0; i < FirstCallInSCC; ++i)
+    if (Function *F = CallSites[i].first.getCalledFunction())
+      if (SCCFunctions.count(F))
+        std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
+
+  
+  InlinedArrayAllocasTy InlinedArrayAllocas;
+  InlineFunctionInfo InlineInfo(&CG, TD);
+  
+  // Now that we have all of the call sites, loop over them and inline them if
+  // it looks profitable to do so.
+  bool Changed = false;
+  bool LocalChange;
+  do {
+    LocalChange = false;
+    // Iterate over the outer loop because inlining functions can cause indirect
+    // calls to become direct calls.
+    for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
+      CallSite CS = CallSites[CSi].first;
+      
+      Function *Caller = CS.getCaller();
+      Function *Callee = CS.getCalledFunction();
+
+      // If this call site is dead and it is to a readonly function, we should
+      // just delete the call instead of trying to inline it, regardless of
+      // size.  This happens because IPSCCP propagates the result out of the
+      // call and then we're left with the dead call.
+      if (isInstructionTriviallyDead(CS.getInstruction(), TLI)) {
+        DEBUG(dbgs() << "    -> Deleting dead call: "
+                     << *CS.getInstruction() << "\n");
+        // Update the call graph by deleting the edge from Callee to Caller.
+        CG[Caller]->removeCallEdgeFor(CS);
+        CS.getInstruction()->eraseFromParent();
+        ++NumCallsDeleted;
+      } else {
+        // We can only inline direct calls to non-declarations.
+        if (Callee == 0 || Callee->isDeclaration()) continue;
+      
+        // If this call site was obtained by inlining another function, verify
+        // that the include path for the function did not include the callee
+        // itself.  If so, we'd be recursively inlining the same function,
+        // which would provide the same callsites, which would cause us to
+        // infinitely inline.
+        int InlineHistoryID = CallSites[CSi].second;
+        if (InlineHistoryID != -1 &&
+            InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
+          continue;
+        
+        
+        // If the policy determines that we should inline this function,
+        // try to do so.
+        if (!shouldInline(CS))
+          continue;
+
+        // Attempt to inline the function.
+        if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
+                                  InlineHistoryID, InsertLifetime))
+          continue;
+        ++NumInlined;
+        
+        // If inlining this function gave us any new call sites, throw them
+        // onto our worklist to process.  They are useful inline candidates.
+        if (!InlineInfo.InlinedCalls.empty()) {
+          // Create a new inline history entry for this, so that we remember
+          // that these new callsites came about due to inlining Callee.
+          int NewHistoryID = InlineHistory.size();
+          InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID));
+
+          for (unsigned i = 0, e = InlineInfo.InlinedCalls.size();
+               i != e; ++i) {
+            Value *Ptr = InlineInfo.InlinedCalls[i];
+            CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID));
+          }
+        }
+      }
+      
+      // If we inlined or deleted the last possible call site to the function,
+      // delete the function body now.
+      if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
+          // TODO: Can remove if in SCC now.
+          !SCCFunctions.count(Callee) &&
+          
+          // The function may be apparently dead, but if there are indirect
+          // callgraph references to the node, we cannot delete it yet, this
+          // could invalidate the CGSCC iterator.
+          CG[Callee]->getNumReferences() == 0) {
+        DEBUG(dbgs() << "    -> Deleting dead function: "
+              << Callee->getName() << "\n");
+        CallGraphNode *CalleeNode = CG[Callee];
+        
+        // Remove any call graph edges from the callee to its callees.
+        CalleeNode->removeAllCalledFunctions();
+        
+        // Removing the node for callee from the call graph and delete it.
+        delete CG.removeFunctionFromModule(CalleeNode);
+        ++NumDeleted;
+      }
+
+      // Remove this call site from the list.  If possible, use 
+      // swap/pop_back for efficiency, but do not use it if doing so would
+      // move a call site to a function in this SCC before the
+      // 'FirstCallInSCC' barrier.
+      if (SCC.isSingular()) {
+        CallSites[CSi] = CallSites.back();
+        CallSites.pop_back();
+      } else {
+        CallSites.erase(CallSites.begin()+CSi);
+      }
+      --CSi;
+
+      Changed = true;
+      LocalChange = true;
+    }
+  } while (LocalChange);
+
+  return Changed;
+}
+
+// doFinalization - Remove now-dead linkonce functions at the end of
+// processing to avoid breaking the SCC traversal.
+bool Inliner::doFinalization(CallGraph &CG) {
+  return removeDeadFunctions(CG);
+}
+
+/// removeDeadFunctions - Remove dead functions that are not included in
+/// DNR (Do Not Remove) list.
+bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
+  SmallVector<CallGraphNode*, 16> FunctionsToRemove;
+
+  // Scan for all of the functions, looking for ones that should now be removed
+  // from the program.  Insert the dead ones in the FunctionsToRemove set.
+  for (CallGraph::iterator I = CG.begin(), E = CG.end(); I != E; ++I) {
+    CallGraphNode *CGN = I->second;
+    Function *F = CGN->getFunction();
+    if (!F || F->isDeclaration())
+      continue;
+
+    // Handle the case when this function is called and we only want to care
+    // about always-inline functions. This is a bit of a hack to share code
+    // between here and the InlineAlways pass.
+    if (AlwaysInlineOnly &&
+        !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                         Attribute::AlwaysInline))
+      continue;
+
+    // If the only remaining users of the function are dead constants, remove
+    // them.
+    F->removeDeadConstantUsers();
+
+    if (!F->isDefTriviallyDead())
+      continue;
+    
+    // Remove any call graph edges from the function to its callees.
+    CGN->removeAllCalledFunctions();
+
+    // Remove any edges from the external node to the function's call graph
+    // node.  These edges might have been made irrelegant due to
+    // optimization of the program.
+    CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN);
+
+    // Removing the node for callee from the call graph and delete it.
+    FunctionsToRemove.push_back(CGN);
+  }
+  if (FunctionsToRemove.empty())
+    return false;
+
+  // Now that we know which functions to delete, do so.  We didn't want to do
+  // this inline, because that would invalidate our CallGraph::iterator
+  // objects. :(
+  //
+  // Note that it doesn't matter that we are iterating over a non-stable order
+  // here to do this, it doesn't matter which order the functions are deleted
+  // in.
+  array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
+  FunctionsToRemove.erase(std::unique(FunctionsToRemove.begin(),
+                                      FunctionsToRemove.end()),
+                          FunctionsToRemove.end());
+  for (SmallVectorImpl<CallGraphNode *>::iterator I = FunctionsToRemove.begin(),
+                                                  E = FunctionsToRemove.end();
+       I != E; ++I) {
+    delete CG.removeFunctionFromModule(*I);
+    ++NumDeleted;
+  }
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
new file mode 100644
index 000000000000..4bfab5b0afbd
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -0,0 +1,188 @@
+//===-- Internalize.cpp - Mark functions internal -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions and variables in the input module.
+// If the function or variable is not in the list of external names given to
+// the pass it is marked as internal.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "internalize"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumAliases  , "Number of aliases internalized");
+STATISTIC(NumFunctions, "Number of functions internalized");
+STATISTIC(NumGlobals  , "Number of global vars internalized");
+
+// APIFile - A file which contains a list of symbols that should not be marked
+// external.
+static cl::opt<std::string>
+APIFile("internalize-public-api-file", cl::value_desc("filename"),
+        cl::desc("A file containing list of symbol names to preserve"));
+
+// APIList - A list of symbols that should not be marked internal.
+static cl::list<std::string>
+APIList("internalize-public-api-list", cl::value_desc("list"),
+        cl::desc("A list of symbol names to preserve"),
+        cl::CommaSeparated);
+
+namespace {
+  class InternalizePass : public ModulePass {
+    std::set<std::string> ExternalNames;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit InternalizePass();
+    explicit InternalizePass(ArrayRef<const char *> exportList);
+    void LoadFile(const char *Filename);
+    void ClearExportList();
+    void AddToExportList(const std::string &val);
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addPreserved<CallGraph>();
+    }
+  };
+} // end anonymous namespace
+
+char InternalizePass::ID = 0;
+INITIALIZE_PASS(InternalizePass, "internalize",
+                "Internalize Global Symbols", false, false)
+
+InternalizePass::InternalizePass()
+  : ModulePass(ID) {
+  initializeInternalizePassPass(*PassRegistry::getPassRegistry());
+  if (!APIFile.empty())           // If a filename is specified, use it.
+    LoadFile(APIFile.c_str());
+  if (!APIList.empty())           // If a list is specified, use it as well.
+    ExternalNames.insert(APIList.begin(), APIList.end());
+}
+
+InternalizePass::InternalizePass(ArrayRef<const char *> exportList)
+  : ModulePass(ID){
+  initializeInternalizePassPass(*PassRegistry::getPassRegistry());
+  for(ArrayRef<const char *>::const_iterator itr = exportList.begin();
+        itr != exportList.end(); itr++) {
+    ExternalNames.insert(*itr);
+  }
+}
+
+void InternalizePass::LoadFile(const char *Filename) {
+  // Load the APIFile...
+  std::ifstream In(Filename);
+  if (!In.good()) {
+    errs() << "WARNING: Internalize couldn't load file '" << Filename
+         << "'! Continuing as if it's empty.\n";
+    return; // Just continue as if the file were empty
+  }
+  while (In) {
+    std::string Symbol;
+    In >> Symbol;
+    if (!Symbol.empty())
+      ExternalNames.insert(Symbol);
+  }
+}
+
+void InternalizePass::ClearExportList() {
+  ExternalNames.clear();
+}
+
+void InternalizePass::AddToExportList(const std::string &val) {
+  ExternalNames.insert(val);
+}
+
+bool InternalizePass::runOnModule(Module &M) {
+  CallGraph *CG = getAnalysisIfAvailable<CallGraph>();
+  CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;
+  bool Changed = false;
+
+  // Never internalize functions which code-gen might insert.
+  // FIXME: We should probably add this (and the __stack_chk_guard) via some
+  // type of call-back in CodeGen.
+  ExternalNames.insert("__stack_chk_fail");
+
+  // Mark all functions not in the api as internal.
+  // FIXME: maybe use private linkage?
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!I->isDeclaration() &&         // Function must be defined here
+        // Available externally is really just a "declaration with a body".
+        !I->hasAvailableExternallyLinkage() &&
+        !I->hasLocalLinkage() &&  // Can't already have internal linkage
+        !ExternalNames.count(I->getName())) {// Not marked to keep external?
+      I->setLinkage(GlobalValue::InternalLinkage);
+      // Remove a callgraph edge from the external node to this function.
+      if (ExternalNode) ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
+      Changed = true;
+      ++NumFunctions;
+      DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n");
+    }
+
+  // Never internalize the llvm.used symbol.  It is used to implement
+  // attribute((used)).
+  // FIXME: Shouldn't this just filter on llvm.metadata section??
+  ExternalNames.insert("llvm.used");
+  ExternalNames.insert("llvm.compiler.used");
+
+  // Never internalize anchors used by the machine module info, else the info
+  // won't find them.  (see MachineModuleInfo.)
+  ExternalNames.insert("llvm.global_ctors");
+  ExternalNames.insert("llvm.global_dtors");
+  ExternalNames.insert("llvm.global.annotations");
+
+  // Never internalize symbols code-gen inserts.
+  ExternalNames.insert("__stack_chk_guard");
+
+  // Mark all global variables with initializers that are not in the api as
+  // internal as well.
+  // FIXME: maybe use private linkage?
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I)
+    if (!I->isDeclaration() && !I->hasLocalLinkage() &&
+        // Available externally is really just a "declaration with a body".
+        !I->hasAvailableExternallyLinkage() &&
+        !ExternalNames.count(I->getName())) {
+      I->setLinkage(GlobalValue::InternalLinkage);
+      Changed = true;
+      ++NumGlobals;
+      DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");
+    }
+
+  // Mark all aliases that are not in the api as internal as well.
+  for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+       I != E; ++I)
+    if (!I->isDeclaration() && !I->hasInternalLinkage() &&
+        // Available externally is really just a "declaration with a body".
+        !I->hasAvailableExternallyLinkage() &&
+        !ExternalNames.count(I->getName())) {
+      I->setLinkage(GlobalValue::InternalLinkage);
+      Changed = true;
+      ++NumAliases;
+      DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");
+    }
+
+  return Changed;
+}
+
+ModulePass *llvm::createInternalizePass() {
+  return new InternalizePass();
+}
+
+ModulePass *llvm::createInternalizePass(ArrayRef<const char *> el) {
+  return new InternalizePass(el);
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
new file mode 100644
index 000000000000..8282a8e6fabc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
@@ -0,0 +1,304 @@
+//===- LoopExtractor.cpp - Extract each loop into a new function ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass wrapper around the ExtractLoop() scalar transformation to extract each
+// top-level loop into its own new function. If the loop is the ONLY loop in a
+// given function, it is not touched. This is a pass most useful for debugging
+// via bugpoint.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-extract"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumExtracted, "Number of loops extracted");
+
+namespace {
+  struct LoopExtractor : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    unsigned NumLoops;
+
+    explicit LoopExtractor(unsigned numLoops = ~0) 
+      : LoopPass(ID), NumLoops(numLoops) {
+        initializeLoopExtractorPass(*PassRegistry::getPassRegistry());
+      }
+
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequired<DominatorTree>();
+    }
+  };
+}
+
+char LoopExtractor::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopExtractor, "loop-extract",
+                      "Extract loops into new functions", false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(LoopExtractor, "loop-extract",
+                    "Extract loops into new functions", false, false)
+
+namespace {
+  /// SingleLoopExtractor - For bugpoint.
+  struct SingleLoopExtractor : public LoopExtractor {
+    static char ID; // Pass identification, replacement for typeid
+    SingleLoopExtractor() : LoopExtractor(1) {}
+  };
+} // End anonymous namespace
+
+char SingleLoopExtractor::ID = 0;
+INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
+                "Extract at most one loop into a new function", false, false)
+
+// createLoopExtractorPass - This pass extracts all natural loops from the
+// program into a function if it can.
+//
+Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); }
+
+bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
+  // Only visit top-level loops.
+  if (L->getParentLoop())
+    return false;
+
+  // If LoopSimplify form is not available, stay out of trouble.
+  if (!L->isLoopSimplifyForm())
+    return false;
+
+  DominatorTree &DT = getAnalysis<DominatorTree>();
+  bool Changed = false;
+
+  // If there is more than one top-level loop in this function, extract all of
+  // the loops. Otherwise there is exactly one top-level loop; in this case if
+  // this function is more than a minimal wrapper around the loop, extract
+  // the loop.
+  bool ShouldExtractLoop = false;
+
+  // Extract the loop if the entry block doesn't branch to the loop header.
+  TerminatorInst *EntryTI =
+    L->getHeader()->getParent()->getEntryBlock().getTerminator();
+  if (!isa<BranchInst>(EntryTI) ||
+      !cast<BranchInst>(EntryTI)->isUnconditional() ||
+      EntryTI->getSuccessor(0) != L->getHeader()) {
+    ShouldExtractLoop = true;
+  } else {
+    // Check to see if any exits from the loop are more than just return
+    // blocks.
+    SmallVector<BasicBlock*, 8> ExitBlocks;
+    L->getExitBlocks(ExitBlocks);
+    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+      if (!isa<ReturnInst>(ExitBlocks[i]->getTerminator())) {
+        ShouldExtractLoop = true;
+        break;
+      }
+  }
+
+  if (ShouldExtractLoop) {
+    // We must omit landing pads. Landing pads must accompany the invoke
+    // instruction. But this would result in a loop in the extracted
+    // function. An infinite cycle occurs when it tries to extract that loop as
+    // well.
+    SmallVector<BasicBlock*, 8> ExitBlocks;
+    L->getExitBlocks(ExitBlocks);
+    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+      if (ExitBlocks[i]->isLandingPad()) {
+        ShouldExtractLoop = false;
+        break;
+      }
+  }
+
+  if (ShouldExtractLoop) {
+    if (NumLoops == 0) return Changed;
+    --NumLoops;
+    CodeExtractor Extractor(DT, *L);
+    if (Extractor.extractCodeRegion() != 0) {
+      Changed = true;
+      // After extraction, the loop is replaced by a function call, so
+      // we shouldn't try to run any more loop passes on it.
+      LPM.deleteLoopFromQueue(L);
+    }
+    ++NumExtracted;
+  }
+
+  return Changed;
+}
+
+// createSingleLoopExtractorPass - This pass extracts one natural loop from the
+// program into a function if it can.  This is used by bugpoint.
+//
+Pass *llvm::createSingleLoopExtractorPass() {
+  return new SingleLoopExtractor();
+}
+
+
+// BlockFile - A file which contains a list of blocks that should not be
+// extracted.
+static cl::opt<std::string>
+BlockFile("extract-blocks-file", cl::value_desc("filename"),
+          cl::desc("A file containing list of basic blocks to not extract"),
+          cl::Hidden);
+
+namespace {
+  /// BlockExtractorPass - This pass is used by bugpoint to extract all blocks
+  /// from the module into their own functions except for those specified by the
+  /// BlocksToNotExtract list.
+  class BlockExtractorPass : public ModulePass {
+    void LoadFile(const char *Filename);
+    void SplitLandingPadPreds(Function *F);
+
+    std::vector<BasicBlock*> BlocksToNotExtract;
+    std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    BlockExtractorPass() : ModulePass(ID) {
+      if (!BlockFile.empty())
+        LoadFile(BlockFile.c_str());
+    }
+
+    bool runOnModule(Module &M);
+  };
+}
+
+char BlockExtractorPass::ID = 0;
+INITIALIZE_PASS(BlockExtractorPass, "extract-blocks",
+                "Extract Basic Blocks From Module (for bugpoint use)",
+                false, false)
+
+// createBlockExtractorPass - This pass extracts all blocks (except those
+// specified in the argument list) from the functions in the module.
+//
+ModulePass *llvm::createBlockExtractorPass() {
+  return new BlockExtractorPass();
+}
+
+void BlockExtractorPass::LoadFile(const char *Filename) {
+  // Load the BlockFile...
+  std::ifstream In(Filename);
+  if (!In.good()) {
+    errs() << "WARNING: BlockExtractor couldn't load file '" << Filename
+           << "'!\n";
+    return;
+  }
+  while (In) {
+    std::string FunctionName, BlockName;
+    In >> FunctionName;
+    In >> BlockName;
+    if (!BlockName.empty())
+      BlocksToNotExtractByName.push_back(
+          std::make_pair(FunctionName, BlockName));
+  }
+}
+
+/// SplitLandingPadPreds - The landing pad needs to be extracted with the invoke
+/// instruction. The critical edge breaker will refuse to break critical edges
+/// to a landing pad. So do them here. After this method runs, all landing pads
+/// should have only one predecessor.
+void BlockExtractorPass::SplitLandingPadPreds(Function *F) {
+  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+    InvokeInst *II = dyn_cast<InvokeInst>(I);
+    if (!II) continue;
+    BasicBlock *Parent = II->getParent();
+    BasicBlock *LPad = II->getUnwindDest();
+
+    // Look through the landing pad's predecessors. If one of them ends in an
+    // 'invoke', then we want to split the landing pad.
+    bool Split = false;
+    for (pred_iterator
+           PI = pred_begin(LPad), PE = pred_end(LPad); PI != PE; ++PI) {
+      BasicBlock *BB = *PI;
+      if (BB->isLandingPad() && BB != Parent &&
+          isa<InvokeInst>(Parent->getTerminator())) {
+        Split = true;
+        break;
+      }
+    }
+
+    if (!Split) continue;
+
+    SmallVector<BasicBlock*, 2> NewBBs;
+    SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", 0, NewBBs);
+  }
+}
+
+bool BlockExtractorPass::runOnModule(Module &M) {
+  std::set<BasicBlock*> TranslatedBlocksToNotExtract;
+  for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
+    BasicBlock *BB = BlocksToNotExtract[i];
+    Function *F = BB->getParent();
+
+    // Map the corresponding function in this module.
+    Function *MF = M.getFunction(F->getName());
+    assert(MF->getFunctionType() == F->getFunctionType() && "Wrong function?");
+
+    // Figure out which index the basic block is in its function.
+    Function::iterator BBI = MF->begin();
+    std::advance(BBI, std::distance(F->begin(), Function::iterator(BB)));
+    TranslatedBlocksToNotExtract.insert(BBI);
+  }
+
+  while (!BlocksToNotExtractByName.empty()) {
+    // There's no way to find BBs by name without looking at every BB inside
+    // every Function. Fortunately, this is always empty except when used by
+    // bugpoint in which case correctness is more important than performance.
+
+    std::string &FuncName  = BlocksToNotExtractByName.back().first;
+    std::string &BlockName = BlocksToNotExtractByName.back().second;
+
+    for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
+      Function &F = *FI;
+      if (F.getName() != FuncName) continue;
+
+      for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+        BasicBlock &BB = *BI;
+        if (BB.getName() != BlockName) continue;
+
+        TranslatedBlocksToNotExtract.insert(BI);
+      }
+    }
+
+    BlocksToNotExtractByName.pop_back();
+  }
+
+  // Now that we know which blocks to not extract, figure out which ones we WANT
+  // to extract.
+  std::vector<BasicBlock*> BlocksToExtract;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    SplitLandingPadPreds(&*F);
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      if (!TranslatedBlocksToNotExtract.count(BB))
+        BlocksToExtract.push_back(BB);
+  }
+
+  for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i) {
+    SmallVector<BasicBlock*, 2> BlocksToExtractVec;
+    BlocksToExtractVec.push_back(BlocksToExtract[i]);
+    if (const InvokeInst *II =
+        dyn_cast<InvokeInst>(BlocksToExtract[i]->getTerminator()))
+      BlocksToExtractVec.push_back(II->getUnwindDest());
+    CodeExtractor(BlocksToExtractVec).extractCodeRegion();
+  }
+
+  return !BlocksToExtract.empty();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
new file mode 100644
index 000000000000..892100f0585a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -0,0 +1,870 @@
+//===- MergeFunctions.cpp - Merge identical functions ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for equivalent functions that are mergable and folds them.
+//
+// A hash is computed from the function, based on its type and number of
+// basic blocks.
+//
+// Once all hashes are computed, we perform an expensive equality comparison
+// on each function pair. This takes n^2/2 comparisons per bucket, so it's
+// important that the hash function be high quality. The equality comparison
+// iterates through each instruction in each basic block.
+//
+// When a match is found the functions are folded. If both functions are
+// overridable, we move the functionality into a new internal function and
+// leave two overridable thunks to it.
+//
+//===----------------------------------------------------------------------===//
+//
+// Future work:
+//
+// * virtual functions.
+//
+// Many functions have their address taken by the virtual function table for
+// the object they belong to. However, as long as it's only used for a lookup
+// and call, this is irrelevant, and we'd like to fold such functions.
+//
+// * switch from n^2 pair-wise comparisons to an n-way comparison for each
+// bucket.
+//
+// * be smarter about bitcasts.
+//
+// In order to fold functions, we will sometimes add either bitcast instructions
+// or bitcast constant expressions. Unfortunately, this can confound further
+// analysis since the two functions differ where one has a bitcast and the
+// other doesn't. We should learn to look through bitcasts.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mergefunc"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+using namespace llvm;
+
+STATISTIC(NumFunctionsMerged, "Number of functions merged");
+STATISTIC(NumThunksWritten, "Number of thunks generated");
+STATISTIC(NumAliasesWritten, "Number of aliases generated");
+STATISTIC(NumDoubleWeak, "Number of new functions created");
+
+/// Creates a hash-code for the function which is the same for any two
+/// functions that will compare equal, without looking at the instructions
+/// inside the function.
+static unsigned profileFunction(const Function *F) {
+  FunctionType *FTy = F->getFunctionType();
+
+  FoldingSetNodeID ID;
+  ID.AddInteger(F->size());
+  ID.AddInteger(F->getCallingConv());
+  ID.AddBoolean(F->hasGC());
+  ID.AddBoolean(FTy->isVarArg());
+  ID.AddInteger(FTy->getReturnType()->getTypeID());
+  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
+    ID.AddInteger(FTy->getParamType(i)->getTypeID());
+  return ID.ComputeHash();
+}
+
+namespace {
+
+/// ComparableFunction - A struct that pairs together functions with a
+/// DataLayout so that we can keep them together as elements in the DenseSet.
+class ComparableFunction {
+public:
+  static const ComparableFunction EmptyKey;
+  static const ComparableFunction TombstoneKey;
+  static DataLayout * const LookupOnly;
+
+  ComparableFunction(Function *Func, DataLayout *TD)
+    : Func(Func), Hash(profileFunction(Func)), TD(TD) {}
+
+  Function *getFunc() const { return Func; }
+  unsigned getHash() const { return Hash; }
+  DataLayout *getTD() const { return TD; }
+
+  // Drops AssertingVH reference to the function. Outside of debug mode, this
+  // does nothing.
+  void release() {
+    assert(Func &&
+           "Attempted to release function twice, or release empty/tombstone!");
+    Func = NULL;
+  }
+
+private:
+  explicit ComparableFunction(unsigned Hash)
+    : Func(NULL), Hash(Hash), TD(NULL) {}
+
+  AssertingVH<Function> Func;
+  unsigned Hash;
+  DataLayout *TD;
+};
+
+const ComparableFunction ComparableFunction::EmptyKey = ComparableFunction(0);
+const ComparableFunction ComparableFunction::TombstoneKey =
+    ComparableFunction(1);
+DataLayout *const ComparableFunction::LookupOnly = (DataLayout*)(-1);
+
+}
+
+namespace llvm {
+  template <>
+  struct DenseMapInfo<ComparableFunction> {
+    static ComparableFunction getEmptyKey() {
+      return ComparableFunction::EmptyKey;
+    }
+    static ComparableFunction getTombstoneKey() {
+      return ComparableFunction::TombstoneKey;
+    }
+    static unsigned getHashValue(const ComparableFunction &CF) {
+      return CF.getHash();
+    }
+    static bool isEqual(const ComparableFunction &LHS,
+                        const ComparableFunction &RHS);
+  };
+}
+
+namespace {
+
+/// FunctionComparator - Compares two functions to determine whether or not
+/// they will generate machine code with the same behaviour. DataLayout is
+/// used if available. The comparator always fails conservatively (erring on the
+/// side of claiming that two functions are different).
+class FunctionComparator {
+public:
+  FunctionComparator(const DataLayout *TD, const Function *F1,
+                     const Function *F2)
+    : F1(F1), F2(F2), TD(TD) {}
+
+  /// Test whether the two functions have equivalent behaviour.
+  bool compare();
+
+private:
+  /// Test whether two basic blocks have equivalent behaviour.
+  bool compare(const BasicBlock *BB1, const BasicBlock *BB2);
+
+  /// Assign or look up previously assigned numbers for the two values, and
+  /// return whether the numbers are equal. Numbers are assigned in the order
+  /// visited.
+  bool enumerate(const Value *V1, const Value *V2);
+
+  /// Compare two Instructions for equivalence, similar to
+  /// Instruction::isSameOperationAs but with modifications to the type
+  /// comparison.
+  bool isEquivalentOperation(const Instruction *I1,
+                             const Instruction *I2) const;
+
+  /// Compare two GEPs for equivalent pointer arithmetic.
+  bool isEquivalentGEP(const GEPOperator *GEP1, const GEPOperator *GEP2);
+  bool isEquivalentGEP(const GetElementPtrInst *GEP1,
+                       const GetElementPtrInst *GEP2) {
+    return isEquivalentGEP(cast<GEPOperator>(GEP1), cast<GEPOperator>(GEP2));
+  }
+
+  /// Compare two Types, treating all pointer types as equal.
+  bool isEquivalentType(Type *Ty1, Type *Ty2) const;
+
+  // The two functions undergoing comparison.
+  const Function *F1, *F2;
+
+  const DataLayout *TD;
+
+  DenseMap<const Value *, const Value *> id_map;
+  DenseSet<const Value *> seen_values;
+};
+
+}
+
+// Any two pointers in the same address space are equivalent, intptr_t and
+// pointers are equivalent. Otherwise, standard type equivalence rules apply.
+bool FunctionComparator::isEquivalentType(Type *Ty1,
+                                          Type *Ty2) const {
+  if (Ty1 == Ty2)
+    return true;
+  if (Ty1->getTypeID() != Ty2->getTypeID()) {
+    if (TD) {
+      LLVMContext &Ctx = Ty1->getContext();
+      if (isa<PointerType>(Ty1) && Ty2 == TD->getIntPtrType(Ctx)) return true;
+      if (isa<PointerType>(Ty2) && Ty1 == TD->getIntPtrType(Ctx)) return true;
+    }
+    return false;
+  }
+
+  switch (Ty1->getTypeID()) {
+  default:
+    llvm_unreachable("Unknown type!");
+    // Fall through in Release mode.
+  case Type::IntegerTyID:
+  case Type::VectorTyID:
+    // Ty1 == Ty2 would have returned true earlier.
+    return false;
+
+  case Type::VoidTyID:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+  case Type::MetadataTyID:
+    return true;
+
+  case Type::PointerTyID: {
+    PointerType *PTy1 = cast<PointerType>(Ty1);
+    PointerType *PTy2 = cast<PointerType>(Ty2);
+    return PTy1->getAddressSpace() == PTy2->getAddressSpace();
+  }
+
+  case Type::StructTyID: {
+    StructType *STy1 = cast<StructType>(Ty1);
+    StructType *STy2 = cast<StructType>(Ty2);
+    if (STy1->getNumElements() != STy2->getNumElements())
+      return false;
+
+    if (STy1->isPacked() != STy2->isPacked())
+      return false;
+
+    for (unsigned i = 0, e = STy1->getNumElements(); i != e; ++i) {
+      if (!isEquivalentType(STy1->getElementType(i), STy2->getElementType(i)))
+        return false;
+    }
+    return true;
+  }
+
+  case Type::FunctionTyID: {
+    FunctionType *FTy1 = cast<FunctionType>(Ty1);
+    FunctionType *FTy2 = cast<FunctionType>(Ty2);
+    if (FTy1->getNumParams() != FTy2->getNumParams() ||
+        FTy1->isVarArg() != FTy2->isVarArg())
+      return false;
+
+    if (!isEquivalentType(FTy1->getReturnType(), FTy2->getReturnType()))
+      return false;
+
+    for (unsigned i = 0, e = FTy1->getNumParams(); i != e; ++i) {
+      if (!isEquivalentType(FTy1->getParamType(i), FTy2->getParamType(i)))
+        return false;
+    }
+    return true;
+  }
+
+  case Type::ArrayTyID: {
+    ArrayType *ATy1 = cast<ArrayType>(Ty1);
+    ArrayType *ATy2 = cast<ArrayType>(Ty2);
+    return ATy1->getNumElements() == ATy2->getNumElements() &&
+           isEquivalentType(ATy1->getElementType(), ATy2->getElementType());
+  }
+  }
+}
+
+// Determine whether the two operations are the same except that pointer-to-A
+// and pointer-to-B are equivalent. This should be kept in sync with
+// Instruction::isSameOperationAs.
+bool FunctionComparator::isEquivalentOperation(const Instruction *I1,
+                                               const Instruction *I2) const {
+  // Differences from Instruction::isSameOperationAs:
+  //  * replace type comparison with calls to isEquivalentType.
+  //  * we test for I->hasSameSubclassOptionalData (nuw/nsw/tail) at the top
+  //  * because of the above, we don't test for the tail bit on calls later on
+  if (I1->getOpcode() != I2->getOpcode() ||
+      I1->getNumOperands() != I2->getNumOperands() ||
+      !isEquivalentType(I1->getType(), I2->getType()) ||
+      !I1->hasSameSubclassOptionalData(I2))
+    return false;
+
+  // We have two instructions of identical opcode and #operands.  Check to see
+  // if all operands are the same type
+  for (unsigned i = 0, e = I1->getNumOperands(); i != e; ++i)
+    if (!isEquivalentType(I1->getOperand(i)->getType(),
+                          I2->getOperand(i)->getType()))
+      return false;
+
+  // Check special state that is a part of some instructions.
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I1))
+    return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() &&
+           LI->getAlignment() == cast<LoadInst>(I2)->getAlignment() &&
+           LI->getOrdering() == cast<LoadInst>(I2)->getOrdering() &&
+           LI->getSynchScope() == cast<LoadInst>(I2)->getSynchScope();
+  if (const StoreInst *SI = dyn_cast<StoreInst>(I1))
+    return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() &&
+           SI->getAlignment() == cast<StoreInst>(I2)->getAlignment() &&
+           SI->getOrdering() == cast<StoreInst>(I2)->getOrdering() &&
+           SI->getSynchScope() == cast<StoreInst>(I2)->getSynchScope();
+  if (const CmpInst *CI = dyn_cast<CmpInst>(I1))
+    return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate();
+  if (const CallInst *CI = dyn_cast<CallInst>(I1))
+    return CI->getCallingConv() == cast<CallInst>(I2)->getCallingConv() &&
+           CI->getAttributes() == cast<CallInst>(I2)->getAttributes();
+  if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1))
+    return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() &&
+           CI->getAttributes() == cast<InvokeInst>(I2)->getAttributes();
+  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1))
+    return IVI->getIndices() == cast<InsertValueInst>(I2)->getIndices();
+  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1))
+    return EVI->getIndices() == cast<ExtractValueInst>(I2)->getIndices();
+  if (const FenceInst *FI = dyn_cast<FenceInst>(I1))
+    return FI->getOrdering() == cast<FenceInst>(I2)->getOrdering() &&
+           FI->getSynchScope() == cast<FenceInst>(I2)->getSynchScope();
+  if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I1))
+    return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I2)->isVolatile() &&
+           CXI->getOrdering() == cast<AtomicCmpXchgInst>(I2)->getOrdering() &&
+           CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I2)->getSynchScope();
+  if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I1))
+    return RMWI->getOperation() == cast<AtomicRMWInst>(I2)->getOperation() &&
+           RMWI->isVolatile() == cast<AtomicRMWInst>(I2)->isVolatile() &&
+           RMWI->getOrdering() == cast<AtomicRMWInst>(I2)->getOrdering() &&
+           RMWI->getSynchScope() == cast<AtomicRMWInst>(I2)->getSynchScope();
+
+  return true;
+}
+
+// Determine whether two GEP operations perform the same underlying arithmetic.
+bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1,
+                                         const GEPOperator *GEP2) {
+  // When we have target data, we can reduce the GEP down to the value in bytes
+  // added to the address.
+  unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 1;
+  APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0);
+  if (TD &&
+      GEP1->accumulateConstantOffset(*TD, Offset1) &&
+      GEP2->accumulateConstantOffset(*TD, Offset2)) {
+    return Offset1 == Offset2;
+  }
+
+  if (GEP1->getPointerOperand()->getType() !=
+      GEP2->getPointerOperand()->getType())
+    return false;
+
+  if (GEP1->getNumOperands() != GEP2->getNumOperands())
+    return false;
+
+  for (unsigned i = 0, e = GEP1->getNumOperands(); i != e; ++i) {
+    if (!enumerate(GEP1->getOperand(i), GEP2->getOperand(i)))
+      return false;
+  }
+
+  return true;
+}
+
+// Compare two values used by the two functions under pair-wise comparison. If
+// this is the first time the values are seen, they're added to the mapping so
+// that we will detect mismatches on next use.
+bool FunctionComparator::enumerate(const Value *V1, const Value *V2) {
+  // Check for function @f1 referring to itself and function @f2 referring to
+  // itself, or referring to each other, or both referring to either of them.
+  // They're all equivalent if the two functions are otherwise equivalent.
+  if (V1 == F1 && V2 == F2)
+    return true;
+  if (V1 == F2 && V2 == F1)
+    return true;
+
+  if (const Constant *C1 = dyn_cast<Constant>(V1)) {
+    if (V1 == V2) return true;
+    const Constant *C2 = dyn_cast<Constant>(V2);
+    if (!C2) return false;
+    // TODO: constant expressions with GEP or references to F1 or F2.
+    if (C1->isNullValue() && C2->isNullValue() &&
+        isEquivalentType(C1->getType(), C2->getType()))
+      return true;
+    // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1
+    // then they must have equal bit patterns.
+    return C1->getType()->canLosslesslyBitCastTo(C2->getType()) &&
+      C1 == ConstantExpr::getBitCast(const_cast<Constant*>(C2), C1->getType());
+  }
+
+  if (isa<InlineAsm>(V1) || isa<InlineAsm>(V2))
+    return V1 == V2;
+
+  // Check that V1 maps to V2. If we find a value that V1 maps to then we simply
+  // check whether it's equal to V2. When there is no mapping then we need to
+  // ensure that V2 isn't already equivalent to something else. For this
+  // purpose, we track the V2 values in a set.
+
+  const Value *&map_elem = id_map[V1];
+  if (map_elem)
+    return map_elem == V2;
+  if (!seen_values.insert(V2).second)
+    return false;
+  map_elem = V2;
+  return true;
+}
+
+// Test whether two basic blocks have equivalent behaviour.
+bool FunctionComparator::compare(const BasicBlock *BB1, const BasicBlock *BB2) {
+  BasicBlock::const_iterator F1I = BB1->begin(), F1E = BB1->end();
+  BasicBlock::const_iterator F2I = BB2->begin(), F2E = BB2->end();
+
+  do {
+    if (!enumerate(F1I, F2I))
+      return false;
+
+    if (const GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(F1I)) {
+      const GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(F2I);
+      if (!GEP2)
+        return false;
+
+      if (!enumerate(GEP1->getPointerOperand(), GEP2->getPointerOperand()))
+        return false;
+
+      if (!isEquivalentGEP(GEP1, GEP2))
+        return false;
+    } else {
+      if (!isEquivalentOperation(F1I, F2I))
+        return false;
+
+      assert(F1I->getNumOperands() == F2I->getNumOperands());
+      for (unsigned i = 0, e = F1I->getNumOperands(); i != e; ++i) {
+        Value *OpF1 = F1I->getOperand(i);
+        Value *OpF2 = F2I->getOperand(i);
+
+        if (!enumerate(OpF1, OpF2))
+          return false;
+
+        if (OpF1->getValueID() != OpF2->getValueID() ||
+            !isEquivalentType(OpF1->getType(), OpF2->getType()))
+          return false;
+      }
+    }
+
+    ++F1I, ++F2I;
+  } while (F1I != F1E && F2I != F2E);
+
+  return F1I == F1E && F2I == F2E;
+}
+
+// Test whether the two functions have equivalent behaviour.
+bool FunctionComparator::compare() {
+  // We need to recheck everything, but check the things that weren't included
+  // in the hash first.
+
+  if (F1->getAttributes() != F2->getAttributes())
+    return false;
+
+  if (F1->hasGC() != F2->hasGC())
+    return false;
+
+  if (F1->hasGC() && F1->getGC() != F2->getGC())
+    return false;
+
+  if (F1->hasSection() != F2->hasSection())
+    return false;
+
+  if (F1->hasSection() && F1->getSection() != F2->getSection())
+    return false;
+
+  if (F1->isVarArg() != F2->isVarArg())
+    return false;
+
+  // TODO: if it's internal and only used in direct calls, we could handle this
+  // case too.
+  if (F1->getCallingConv() != F2->getCallingConv())
+    return false;
+
+  if (!isEquivalentType(F1->getFunctionType(), F2->getFunctionType()))
+    return false;
+
+  assert(F1->arg_size() == F2->arg_size() &&
+         "Identically typed functions have different numbers of args!");
+
+  // Visit the arguments so that they get enumerated in the order they're
+  // passed in.
+  for (Function::const_arg_iterator f1i = F1->arg_begin(),
+         f2i = F2->arg_begin(), f1e = F1->arg_end(); f1i != f1e; ++f1i, ++f2i) {
+    if (!enumerate(f1i, f2i))
+      llvm_unreachable("Arguments repeat!");
+  }
+
+  // We do a CFG-ordered walk since the actual ordering of the blocks in the
+  // linked list is immaterial. Our walk starts at the entry block for both
+  // functions, then takes each block from each terminator in order. As an
+  // artifact, this also means that unreachable blocks are ignored.
+  SmallVector<const BasicBlock *, 8> F1BBs, F2BBs;
+  SmallSet<const BasicBlock *, 128> VisitedBBs; // in terms of F1.
+
+  F1BBs.push_back(&F1->getEntryBlock());
+  F2BBs.push_back(&F2->getEntryBlock());
+
+  VisitedBBs.insert(F1BBs[0]);
+  while (!F1BBs.empty()) {
+    const BasicBlock *F1BB = F1BBs.pop_back_val();
+    const BasicBlock *F2BB = F2BBs.pop_back_val();
+
+    if (!enumerate(F1BB, F2BB) || !compare(F1BB, F2BB))
+      return false;
+
+    const TerminatorInst *F1TI = F1BB->getTerminator();
+    const TerminatorInst *F2TI = F2BB->getTerminator();
+
+    assert(F1TI->getNumSuccessors() == F2TI->getNumSuccessors());
+    for (unsigned i = 0, e = F1TI->getNumSuccessors(); i != e; ++i) {
+      if (!VisitedBBs.insert(F1TI->getSuccessor(i)))
+        continue;
+
+      F1BBs.push_back(F1TI->getSuccessor(i));
+      F2BBs.push_back(F2TI->getSuccessor(i));
+    }
+  }
+  return true;
+}
+
+namespace {
+
+/// MergeFunctions finds functions which will generate identical machine code,
+/// by considering all pointer types to be equivalent. Once identified,
+/// MergeFunctions will fold them by replacing a call to one to a call to a
+/// bitcast of the other.
+///
+class MergeFunctions : public ModulePass {
+public:
+  static char ID;
+  MergeFunctions()
+    : ModulePass(ID), HasGlobalAliases(false) {
+    initializeMergeFunctionsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M);
+
+private:
+  typedef DenseSet<ComparableFunction> FnSetType;
+
+  /// A work queue of functions that may have been modified and should be
+  /// analyzed again.
+  std::vector<WeakVH> Deferred;
+
+  /// Insert a ComparableFunction into the FnSet, or merge it away if it's
+  /// equal to one that's already present.
+  bool insert(ComparableFunction &NewF);
+
+  /// Remove a Function from the FnSet and queue it up for a second sweep of
+  /// analysis.
+  void remove(Function *F);
+
+  /// Find the functions that use this Value and remove them from FnSet and
+  /// queue the functions.
+  void removeUsers(Value *V);
+
+  /// Replace all direct calls of Old with calls of New. Will bitcast New if
+  /// necessary to make types match.
+  void replaceDirectCallers(Function *Old, Function *New);
+
+  /// Merge two equivalent functions. Upon completion, G may be deleted, or may
+  /// be converted into a thunk. In either case, it should never be visited
+  /// again.
+  void mergeTwoFunctions(Function *F, Function *G);
+
+  /// Replace G with a thunk or an alias to F. Deletes G.
+  void writeThunkOrAlias(Function *F, Function *G);
+
+  /// Replace G with a simple tail call to bitcast(F). Also replace direct uses
+  /// of G with bitcast(F). Deletes G.
+  void writeThunk(Function *F, Function *G);
+
+  /// Replace G with an alias to F. Deletes G.
+  void writeAlias(Function *F, Function *G);
+
+  /// The set of all distinct functions. Use the insert() and remove() methods
+  /// to modify it.
+  FnSetType FnSet;
+
+  /// DataLayout for more accurate GEP comparisons. May be NULL.
+  DataLayout *TD;
+
+  /// Whether or not the target supports global aliases.
+  bool HasGlobalAliases;
+};
+
+}  // end anonymous namespace
+
+char MergeFunctions::ID = 0;
+INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false)
+
+ModulePass *llvm::createMergeFunctionsPass() {
+  return new MergeFunctions();
+}
+
+bool MergeFunctions::runOnModule(Module &M) {
+  bool Changed = false;
+  TD = getAnalysisIfAvailable<DataLayout>();
+
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage())
+      Deferred.push_back(WeakVH(I));
+  }
+  FnSet.resize(Deferred.size());
+
+  do {
+    std::vector<WeakVH> Worklist;
+    Deferred.swap(Worklist);
+
+    DEBUG(dbgs() << "size of module: " << M.size() << '\n');
+    DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
+
+    // Insert only strong functions and merge them. Strong function merging
+    // always deletes one of them.
+    for (std::vector<WeakVH>::iterator I = Worklist.begin(),
+           E = Worklist.end(); I != E; ++I) {
+      if (!*I) continue;
+      Function *F = cast<Function>(*I);
+      if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() &&
+          !F->mayBeOverridden()) {
+        ComparableFunction CF = ComparableFunction(F, TD);
+        Changed |= insert(CF);
+      }
+    }
+
+    // Insert only weak functions and merge them. By doing these second we
+    // create thunks to the strong function when possible. When two weak
+    // functions are identical, we create a new strong function with two weak
+    // weak thunks to it which are identical but not mergable.
+    for (std::vector<WeakVH>::iterator I = Worklist.begin(),
+           E = Worklist.end(); I != E; ++I) {
+      if (!*I) continue;
+      Function *F = cast<Function>(*I);
+      if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage() &&
+          F->mayBeOverridden()) {
+        ComparableFunction CF = ComparableFunction(F, TD);
+        Changed |= insert(CF);
+      }
+    }
+    DEBUG(dbgs() << "size of FnSet: " << FnSet.size() << '\n');
+  } while (!Deferred.empty());
+
+  FnSet.clear();
+
+  return Changed;
+}
+
+bool DenseMapInfo<ComparableFunction>::isEqual(const ComparableFunction &LHS,
+                                               const ComparableFunction &RHS) {
+  if (LHS.getFunc() == RHS.getFunc() &&
+      LHS.getHash() == RHS.getHash())
+    return true;
+  if (!LHS.getFunc() || !RHS.getFunc())
+    return false;
+
+  // One of these is a special "underlying pointer comparison only" object.
+  if (LHS.getTD() == ComparableFunction::LookupOnly ||
+      RHS.getTD() == ComparableFunction::LookupOnly)
+    return false;
+
+  assert(LHS.getTD() == RHS.getTD() &&
+         "Comparing functions for different targets");
+
+  return FunctionComparator(LHS.getTD(), LHS.getFunc(),
+                            RHS.getFunc()).compare();
+}
+
+// Replace direct callers of Old with New.
+void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
+  Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType());
+  for (Value::use_iterator UI = Old->use_begin(), UE = Old->use_end();
+       UI != UE;) {
+    Value::use_iterator TheIter = UI;
+    ++UI;
+    CallSite CS(*TheIter);
+    if (CS && CS.isCallee(TheIter)) {
+      remove(CS.getInstruction()->getParent()->getParent());
+      TheIter.getUse().set(BitcastNew);
+    }
+  }
+}
+
+// Replace G with an alias to F if possible, or else a thunk to F. Deletes G.
+void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
+  if (HasGlobalAliases && G->hasUnnamedAddr()) {
+    if (G->hasExternalLinkage() || G->hasLocalLinkage() ||
+        G->hasWeakLinkage()) {
+      writeAlias(F, G);
+      return;
+    }
+  }
+
+  writeThunk(F, G);
+}
+
+// Replace G with a simple tail call to bitcast(F). Also replace direct uses
+// of G with bitcast(F). Deletes G.
+void MergeFunctions::writeThunk(Function *F, Function *G) {
+  if (!G->mayBeOverridden()) {
+    // Redirect direct callers of G to F.
+    replaceDirectCallers(G, F);
+  }
+
+  // If G was internal then we may have replaced all uses of G with F. If so,
+  // stop here and delete G. There's no need for a thunk.
+  if (G->hasLocalLinkage() && G->use_empty()) {
+    G->eraseFromParent();
+    return;
+  }
+
+  Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "",
+                                    G->getParent());
+  BasicBlock *BB = BasicBlock::Create(F->getContext(), "", NewG);
+  IRBuilder<false> Builder(BB);
+
+  SmallVector<Value *, 16> Args;
+  unsigned i = 0;
+  FunctionType *FFTy = F->getFunctionType();
+  for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end();
+       AI != AE; ++AI) {
+    Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i)));
+    ++i;
+  }
+
+  CallInst *CI = Builder.CreateCall(F, Args);
+  CI->setTailCall();
+  CI->setCallingConv(F->getCallingConv());
+  if (NewG->getReturnType()->isVoidTy()) {
+    Builder.CreateRetVoid();
+  } else {
+    Builder.CreateRet(Builder.CreateBitCast(CI, NewG->getReturnType()));
+  }
+
+  NewG->copyAttributesFrom(G);
+  NewG->takeName(G);
+  removeUsers(G);
+  G->replaceAllUsesWith(NewG);
+  G->eraseFromParent();
+
+  DEBUG(dbgs() << "writeThunk: " << NewG->getName() << '\n');
+  ++NumThunksWritten;
+}
+
+// Replace G with an alias to F and delete G.
+void MergeFunctions::writeAlias(Function *F, Function *G) {
+  Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+  GlobalAlias *GA = new GlobalAlias(G->getType(), G->getLinkage(), "",
+                                    BitcastF, G->getParent());
+  F->setAlignment(std::max(F->getAlignment(), G->getAlignment()));
+  GA->takeName(G);
+  GA->setVisibility(G->getVisibility());
+  removeUsers(G);
+  G->replaceAllUsesWith(GA);
+  G->eraseFromParent();
+
+  DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n');
+  ++NumAliasesWritten;
+}
+
+// Merge two equivalent functions. Upon completion, Function G is deleted.
+void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
+  if (F->mayBeOverridden()) {
+    assert(G->mayBeOverridden());
+
+    if (HasGlobalAliases) {
+      // Make them both thunks to the same internal function.
+      Function *H = Function::Create(F->getFunctionType(), F->getLinkage(), "",
+                                     F->getParent());
+      H->copyAttributesFrom(F);
+      H->takeName(F);
+      removeUsers(F);
+      F->replaceAllUsesWith(H);
+
+      unsigned MaxAlignment = std::max(G->getAlignment(), H->getAlignment());
+
+      writeAlias(F, G);
+      writeAlias(F, H);
+
+      F->setAlignment(MaxAlignment);
+      F->setLinkage(GlobalValue::PrivateLinkage);
+    } else {
+      // We can't merge them. Instead, pick one and update all direct callers
+      // to call it and hope that we improve the instruction cache hit rate.
+      replaceDirectCallers(G, F);
+    }
+
+    ++NumDoubleWeak;
+  } else {
+    writeThunkOrAlias(F, G);
+  }
+
+  ++NumFunctionsMerged;
+}
+
+// Insert a ComparableFunction into the FnSet, or merge it away if equal to one
+// that was already inserted.
+bool MergeFunctions::insert(ComparableFunction &NewF) {
+  std::pair<FnSetType::iterator, bool> Result = FnSet.insert(NewF);
+  if (Result.second) {
+    DEBUG(dbgs() << "Inserting as unique: " << NewF.getFunc()->getName() << '\n');
+    return false;
+  }
+
+  const ComparableFunction &OldF = *Result.first;
+
+  // Never thunk a strong function to a weak function.
+  assert(!OldF.getFunc()->mayBeOverridden() ||
+         NewF.getFunc()->mayBeOverridden());
+
+  DEBUG(dbgs() << "  " << OldF.getFunc()->getName() << " == "
+               << NewF.getFunc()->getName() << '\n');
+
+  Function *DeleteF = NewF.getFunc();
+  NewF.release();
+  mergeTwoFunctions(OldF.getFunc(), DeleteF);
+  return true;
+}
+
+// Remove a function from FnSet. If it was already in FnSet, add it to Deferred
+// so that we'll look at it in the next round.
+void MergeFunctions::remove(Function *F) {
+  // We need to make sure we remove F, not a function "equal" to F per the
+  // function equality comparator.
+  //
+  // The special "lookup only" ComparableFunction bypasses the expensive
+  // function comparison in favour of a pointer comparison on the underlying
+  // Function*'s.
+  ComparableFunction CF = ComparableFunction(F, ComparableFunction::LookupOnly);
+  if (FnSet.erase(CF)) {
+    DEBUG(dbgs() << "Removed " << F->getName() << " from set and deferred it.\n");
+    Deferred.push_back(F);
+  }
+}
+
+// For each instruction used by the value, remove() the function that contains
+// the instruction. This should happen right before a call to RAUW.
+void MergeFunctions::removeUsers(Value *V) {
+  std::vector<Value *> Worklist;
+  Worklist.push_back(V);
+  while (!Worklist.empty()) {
+    Value *V = Worklist.back();
+    Worklist.pop_back();
+
+    for (Value::use_iterator UI = V->use_begin(), UE = V->use_end();
+         UI != UE; ++UI) {
+      Use &U = UI.getUse();
+      if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
+        remove(I->getParent()->getParent());
+      } else if (isa<GlobalValue>(U.getUser())) {
+        // do nothing
+      } else if (Constant *C = dyn_cast<Constant>(U.getUser())) {
+        for (Value::use_iterator CUI = C->use_begin(), CUE = C->use_end();
+             CUI != CUE; ++CUI)
+          Worklist.push_back(*CUI);
+      }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
new file mode 100644
index 000000000000..fa518cb0abb6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -0,0 +1,183 @@
+//===- PartialInlining.cpp - Inline parts of functions --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs partial inlining, typically by inlining an if statement
+// that surrounds the body of the function.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "partialinlining"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+using namespace llvm;
+
+STATISTIC(NumPartialInlined, "Number of functions partially inlined");
+
+namespace {
+  struct PartialInliner : public ModulePass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const { }
+    static char ID; // Pass identification, replacement for typeid
+    PartialInliner() : ModulePass(ID) {
+      initializePartialInlinerPass(*PassRegistry::getPassRegistry());
+    }
+    
+    bool runOnModule(Module& M);
+    
+  private:
+    Function* unswitchFunction(Function* F);
+  };
+}
+
+char PartialInliner::ID = 0;
+INITIALIZE_PASS(PartialInliner, "partial-inliner",
+                "Partial Inliner", false, false)
+
+ModulePass* llvm::createPartialInliningPass() { return new PartialInliner(); }
+
+Function* PartialInliner::unswitchFunction(Function* F) {
+  // First, verify that this function is an unswitching candidate...
+  BasicBlock* entryBlock = F->begin();
+  BranchInst *BR = dyn_cast<BranchInst>(entryBlock->getTerminator());
+  if (!BR || BR->isUnconditional())
+    return 0;
+  
+  BasicBlock* returnBlock = 0;
+  BasicBlock* nonReturnBlock = 0;
+  unsigned returnCount = 0;
+  for (succ_iterator SI = succ_begin(entryBlock), SE = succ_end(entryBlock);
+       SI != SE; ++SI)
+    if (isa<ReturnInst>((*SI)->getTerminator())) {
+      returnBlock = *SI;
+      returnCount++;
+    } else
+      nonReturnBlock = *SI;
+  
+  if (returnCount != 1)
+    return 0;
+  
+  // Clone the function, so that we can hack away on it.
+  ValueToValueMapTy VMap;
+  Function* duplicateFunction = CloneFunction(F, VMap,
+                                              /*ModuleLevelChanges=*/false);
+  duplicateFunction->setLinkage(GlobalValue::InternalLinkage);
+  F->getParent()->getFunctionList().push_back(duplicateFunction);
+  BasicBlock* newEntryBlock = cast<BasicBlock>(VMap[entryBlock]);
+  BasicBlock* newReturnBlock = cast<BasicBlock>(VMap[returnBlock]);
+  BasicBlock* newNonReturnBlock = cast<BasicBlock>(VMap[nonReturnBlock]);
+  
+  // Go ahead and update all uses to the duplicate, so that we can just
+  // use the inliner functionality when we're done hacking.
+  F->replaceAllUsesWith(duplicateFunction);
+  
+  // Special hackery is needed with PHI nodes that have inputs from more than
+  // one extracted block.  For simplicity, just split the PHIs into a two-level
+  // sequence of PHIs, some of which will go in the extracted region, and some
+  // of which will go outside.
+  BasicBlock* preReturn = newReturnBlock;
+  newReturnBlock = newReturnBlock->splitBasicBlock(
+                                              newReturnBlock->getFirstNonPHI());
+  BasicBlock::iterator I = preReturn->begin();
+  BasicBlock::iterator Ins = newReturnBlock->begin();
+  while (I != preReturn->end()) {
+    PHINode* OldPhi = dyn_cast<PHINode>(I);
+    if (!OldPhi) break;
+    
+    PHINode* retPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins);
+    OldPhi->replaceAllUsesWith(retPhi);
+    Ins = newReturnBlock->getFirstNonPHI();
+    
+    retPhi->addIncoming(I, preReturn);
+    retPhi->addIncoming(OldPhi->getIncomingValueForBlock(newEntryBlock),
+                        newEntryBlock);
+    OldPhi->removeIncomingValue(newEntryBlock);
+    
+    ++I;
+  }
+  newEntryBlock->getTerminator()->replaceUsesOfWith(preReturn, newReturnBlock);
+  
+  // Gather up the blocks that we're going to extract.
+  std::vector<BasicBlock*> toExtract;
+  toExtract.push_back(newNonReturnBlock);
+  for (Function::iterator FI = duplicateFunction->begin(),
+       FE = duplicateFunction->end(); FI != FE; ++FI)
+    if (&*FI != newEntryBlock && &*FI != newReturnBlock &&
+        &*FI != newNonReturnBlock)
+      toExtract.push_back(FI);
+      
+  // The CodeExtractor needs a dominator tree.
+  DominatorTree DT;
+  DT.runOnFunction(*duplicateFunction);
+  
+  // Extract the body of the if.
+  Function* extractedFunction
+    = CodeExtractor(toExtract, &DT).extractCodeRegion();
+  
+  InlineFunctionInfo IFI;
+  
+  // Inline the top-level if test into all callers.
+  std::vector<User*> Users(duplicateFunction->use_begin(), 
+                           duplicateFunction->use_end());
+  for (std::vector<User*>::iterator UI = Users.begin(), UE = Users.end();
+       UI != UE; ++UI)
+    if (CallInst *CI = dyn_cast<CallInst>(*UI))
+      InlineFunction(CI, IFI);
+    else if (InvokeInst *II = dyn_cast<InvokeInst>(*UI))
+      InlineFunction(II, IFI);
+  
+  // Ditch the duplicate, since we're done with it, and rewrite all remaining
+  // users (function pointers, etc.) back to the original function.
+  duplicateFunction->replaceAllUsesWith(F);
+  duplicateFunction->eraseFromParent();
+  
+  ++NumPartialInlined;
+  
+  return extractedFunction;
+}
+
+bool PartialInliner::runOnModule(Module& M) {
+  std::vector<Function*> worklist;
+  worklist.reserve(M.size());
+  for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI)
+    if (!FI->use_empty() && !FI->isDeclaration())
+      worklist.push_back(&*FI);
+    
+  bool changed = false;
+  while (!worklist.empty()) {
+    Function* currFunc = worklist.back();
+    worklist.pop_back();
+  
+    if (currFunc->use_empty()) continue;
+    
+    bool recursive = false;
+    for (Function::use_iterator UI = currFunc->use_begin(),
+         UE = currFunc->use_end(); UI != UE; ++UI)
+      if (Instruction* I = dyn_cast<Instruction>(*UI))
+        if (I->getParent()->getParent() == currFunc) {
+          recursive = true;
+          break;
+        }
+    if (recursive) continue;
+          
+    
+    if (Function* newFunc = unswitchFunction(currFunc)) {
+      worklist.push_back(newFunc);
+      changed = true;
+    }
+    
+  }
+  
+  return changed;
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
new file mode 100644
index 000000000000..027a9f2a6871
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -0,0 +1,399 @@
+//===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PassManagerBuilder class, which is used to set up a
+// "standard" optimization sequence suitable for languages like C and C++.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm-c/Transforms/PassManagerBuilder.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+RunLoopVectorization("vectorize-loops",
+                     cl::desc("Run the Loop vectorization passes"));
+
+static cl::opt<bool>
+RunBBVectorization("vectorize", cl::desc("Run the BB vectorization passes"));
+
+static cl::opt<bool>
+UseGVNAfterVectorization("use-gvn-after-vectorization",
+  cl::init(false), cl::Hidden,
+  cl::desc("Run GVN instead of Early CSE after vectorization passes"));
+
+static cl::opt<bool> UseNewSROA("use-new-sroa",
+  cl::init(true), cl::Hidden,
+  cl::desc("Enable the new, experimental SROA pass"));
+
+PassManagerBuilder::PassManagerBuilder() {
+    OptLevel = 2;
+    SizeLevel = 0;
+    LibraryInfo = 0;
+    Inliner = 0;
+    DisableSimplifyLibCalls = false;
+    DisableUnitAtATime = false;
+    DisableUnrollLoops = false;
+    Vectorize = RunBBVectorization;
+    LoopVectorize = RunLoopVectorization;
+}
+
+PassManagerBuilder::~PassManagerBuilder() {
+  delete LibraryInfo;
+  delete Inliner;
+}
+
+/// Set of global extensions, automatically added as part of the standard set.
+static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy,
+   PassManagerBuilder::ExtensionFn>, 8> > GlobalExtensions;
+
+void PassManagerBuilder::addGlobalExtension(
+    PassManagerBuilder::ExtensionPointTy Ty,
+    PassManagerBuilder::ExtensionFn Fn) {
+  GlobalExtensions->push_back(std::make_pair(Ty, Fn));
+}
+
+void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
+  Extensions.push_back(std::make_pair(Ty, Fn));
+}
+
+void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
+                                           PassManagerBase &PM) const {
+  for (unsigned i = 0, e = GlobalExtensions->size(); i != e; ++i)
+    if ((*GlobalExtensions)[i].first == ETy)
+      (*GlobalExtensions)[i].second(*this, PM);
+  for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
+    if (Extensions[i].first == ETy)
+      Extensions[i].second(*this, PM);
+}
+
+void
+PassManagerBuilder::addInitialAliasAnalysisPasses(PassManagerBase &PM) const {
+  // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+  // BasicAliasAnalysis wins if they disagree. This is intended to help
+  // support "obvious" type-punning idioms.
+  PM.add(createTypeBasedAliasAnalysisPass());
+  PM.add(createBasicAliasAnalysisPass());
+}
+
+void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) {
+  addExtensionsToPM(EP_EarlyAsPossible, FPM);
+
+  // Add LibraryInfo if we have some.
+  if (LibraryInfo) FPM.add(new TargetLibraryInfo(*LibraryInfo));
+
+  if (OptLevel == 0) return;
+
+  addInitialAliasAnalysisPasses(FPM);
+
+  FPM.add(createCFGSimplificationPass());
+  if (UseNewSROA)
+    FPM.add(createSROAPass());
+  else
+    FPM.add(createScalarReplAggregatesPass());
+  FPM.add(createEarlyCSEPass());
+  FPM.add(createLowerExpectIntrinsicPass());
+}
+
+void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
+  // If all optimizations are disabled, just run the always-inline pass.
+  if (OptLevel == 0) {
+    if (Inliner) {
+      MPM.add(Inliner);
+      Inliner = 0;
+    }
+
+    // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
+    // pass manager, but we don't want to add extensions into that pass manager.
+    // To prevent this we must insert a no-op module pass to reset the pass
+    // manager to get the same behavior as EP_OptimizerLast in non-O0 builds.
+    if (!GlobalExtensions->empty() || !Extensions.empty())
+      MPM.add(createBarrierNoopPass());
+
+    addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
+    return;
+  }
+
+  // Add LibraryInfo if we have some.
+  if (LibraryInfo) MPM.add(new TargetLibraryInfo(*LibraryInfo));
+
+  addInitialAliasAnalysisPasses(MPM);
+
+  if (!DisableUnitAtATime) {
+    addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
+
+    MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
+
+    MPM.add(createIPSCCPPass());              // IP SCCP
+    MPM.add(createDeadArgEliminationPass());  // Dead argument elimination
+
+    MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
+    MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
+  }
+
+  // Start of CallGraph SCC passes.
+  if (!DisableUnitAtATime)
+    MPM.add(createPruneEHPass());             // Remove dead EH info
+  if (Inliner) {
+    MPM.add(Inliner);
+    Inliner = 0;
+  }
+  if (!DisableUnitAtATime)
+    MPM.add(createFunctionAttrsPass());       // Set readonly/readnone attrs
+  if (OptLevel > 2)
+    MPM.add(createArgumentPromotionPass());   // Scalarize uninlined fn args
+
+  // Start of function pass.
+  // Break up aggregate allocas, using SSAUpdater.
+  if (UseNewSROA)
+    MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+  else
+    MPM.add(createScalarReplAggregatesPass(-1, false));
+  MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
+  if (!DisableSimplifyLibCalls)
+    MPM.add(createSimplifyLibCallsPass());    // Library Call Optimizations
+  MPM.add(createJumpThreadingPass());         // Thread jumps.
+  MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+  MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+  MPM.add(createInstructionCombiningPass());  // Combine silly seq's
+
+  MPM.add(createTailCallEliminationPass());   // Eliminate tail calls
+  MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+  MPM.add(createReassociatePass());           // Reassociate expressions
+  MPM.add(createLoopRotatePass());            // Rotate Loop
+  MPM.add(createLICMPass());                  // Hoist loop invariants
+  MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+  MPM.add(createInstructionCombiningPass());
+  MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
+  MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
+  MPM.add(createLoopDeletionPass());          // Delete dead loops
+
+  if (LoopVectorize && OptLevel > 2)
+    MPM.add(createLoopVectorizePass());
+
+  if (!DisableUnrollLoops)
+    MPM.add(createLoopUnrollPass());          // Unroll small loops
+  addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+
+  if (OptLevel > 1)
+    MPM.add(createGVNPass());                 // Remove redundancies
+  MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
+  MPM.add(createSCCPPass());                  // Constant prop with SCCP
+
+  // Run instcombine after redundancy elimination to exploit opportunities
+  // opened up by them.
+  MPM.add(createInstructionCombiningPass());
+  MPM.add(createJumpThreadingPass());         // Thread jumps
+  MPM.add(createCorrelatedValuePropagationPass());
+  MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+
+  addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
+
+  if (Vectorize) {
+    MPM.add(createBBVectorizePass());
+    MPM.add(createInstructionCombiningPass());
+    if (OptLevel > 1 && UseGVNAfterVectorization)
+      MPM.add(createGVNPass());                   // Remove redundancies
+    else
+      MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
+
+    // BBVectorize may have significantly shortened a loop body; unroll again.
+    if (!DisableUnrollLoops)
+      MPM.add(createLoopUnrollPass());
+  }
+
+  MPM.add(createAggressiveDCEPass());         // Delete dead instructions
+  MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+  MPM.add(createInstructionCombiningPass());  // Clean up after everything.
+
+  if (!DisableUnitAtATime) {
+    // FIXME: We shouldn't bother with this anymore.
+    MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+
+    // GlobalOpt already deletes dead functions and globals, at -O2 try a
+    // late pass of GlobalDCE.  It is capable of deleting dead cycles.
+    if (OptLevel > 1) {
+      MPM.add(createGlobalDCEPass());         // Remove dead fns and globals.
+      MPM.add(createConstantMergePass());     // Merge dup global constants
+    }
+  }
+  addExtensionsToPM(EP_OptimizerLast, MPM);
+}
+
+void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
+                                                bool Internalize,
+                                                bool RunInliner,
+                                                bool DisableGVNLoadPRE) {
+  // Provide AliasAnalysis services for optimizations.
+  addInitialAliasAnalysisPasses(PM);
+
+  // Now that composite has been compiled, scan through the module, looking
+  // for a main function.  If main is defined, mark all other functions
+  // internal.
+  if (Internalize) {
+    std::vector<const char*> E;
+    E.push_back("main");
+    PM.add(createInternalizePass(E));
+  }
+
+  // Propagate constants at call sites into the functions they call.  This
+  // opens opportunities for globalopt (and inlining) by substituting function
+  // pointers passed as arguments to direct uses of functions.
+  PM.add(createIPSCCPPass());
+
+  // Now that we internalized some globals, see if we can hack on them!
+  PM.add(createGlobalOptimizerPass());
+
+  // Linking modules together can lead to duplicated global constants, only
+  // keep one copy of each constant.
+  PM.add(createConstantMergePass());
+
+  // Remove unused arguments from functions.
+  PM.add(createDeadArgEliminationPass());
+
+  // Reduce the code after globalopt and ipsccp.  Both can open up significant
+  // simplification opportunities, and both can propagate functions through
+  // function pointers.  When this happens, we often have to resolve varargs
+  // calls, etc, so let instcombine do this.
+  PM.add(createInstructionCombiningPass());
+
+  // Inline small functions
+  if (RunInliner)
+    PM.add(createFunctionInliningPass());
+
+  PM.add(createPruneEHPass());   // Remove dead EH info.
+
+  // Optimize globals again if we ran the inliner.
+  if (RunInliner)
+    PM.add(createGlobalOptimizerPass());
+  PM.add(createGlobalDCEPass()); // Remove dead functions.
+
+  // If we didn't decide to inline a function, check to see if we can
+  // transform it to pass arguments by value instead of by reference.
+  PM.add(createArgumentPromotionPass());
+
+  // The IPO passes may leave cruft around.  Clean up after them.
+  PM.add(createInstructionCombiningPass());
+  PM.add(createJumpThreadingPass());
+  // Break up allocas
+  if (UseNewSROA)
+    PM.add(createSROAPass());
+  else
+    PM.add(createScalarReplAggregatesPass());
+
+  // Run a few AA driven optimizations here and now, to cleanup the code.
+  PM.add(createFunctionAttrsPass()); // Add nocapture.
+  PM.add(createGlobalsModRefPass()); // IP alias analysis.
+
+  PM.add(createLICMPass());                 // Hoist loop invariants.
+  PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
+  PM.add(createMemCpyOptPass());            // Remove dead memcpys.
+  // Nuke dead stores.
+  PM.add(createDeadStoreEliminationPass());
+
+  // Cleanup and simplify the code after the scalar optimizations.
+  PM.add(createInstructionCombiningPass());
+
+  PM.add(createJumpThreadingPass());
+
+  // Delete basic blocks, which optimization passes may have killed.
+  PM.add(createCFGSimplificationPass());
+
+  // Now that we have optimized the program, discard unreachable functions.
+  PM.add(createGlobalDCEPass());
+}
+
+LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() {
+  PassManagerBuilder *PMB = new PassManagerBuilder();
+  return wrap(PMB);
+}
+
+void LLVMPassManagerBuilderDispose(LLVMPassManagerBuilderRef PMB) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  delete Builder;
+}
+
+void
+LLVMPassManagerBuilderSetOptLevel(LLVMPassManagerBuilderRef PMB,
+                                  unsigned OptLevel) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  Builder->OptLevel = OptLevel;
+}
+
+void
+LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB,
+                                   unsigned SizeLevel) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  Builder->SizeLevel = SizeLevel;
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB,
+                                            LLVMBool Value) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  Builder->DisableUnitAtATime = Value;
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB,
+                                            LLVMBool Value) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  Builder->DisableUnrollLoops = Value;
+}
+
+void
+LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB,
+                                                 LLVMBool Value) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  Builder->DisableSimplifyLibCalls = Value;
+}
+
+void
+LLVMPassManagerBuilderUseInlinerWithThreshold(LLVMPassManagerBuilderRef PMB,
+                                              unsigned Threshold) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  Builder->Inliner = createFunctionInliningPass(Threshold);
+}
+
+void
+LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB,
+                                                  LLVMPassManagerRef PM) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  FunctionPassManager *FPM = unwrap<FunctionPassManager>(PM);
+  Builder->populateFunctionPassManager(*FPM);
+}
+
+void
+LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB,
+                                                LLVMPassManagerRef PM) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  PassManagerBase *MPM = unwrap(PM);
+  Builder->populateModulePassManager(*MPM);
+}
+
+void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB,
+                                                  LLVMPassManagerRef PM,
+                                                  LLVMBool Internalize,
+                                                  LLVMBool RunInliner) {
+  PassManagerBuilder *Builder = unwrap(PMB);
+  PassManagerBase *LPM = unwrap(PM);
+  Builder->populateLTOPassManager(*LPM, Internalize != 0, RunInliner != 0);
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
new file mode 100644
index 000000000000..73d9323195bb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
@@ -0,0 +1,260 @@
+//===- PruneEH.cpp - Pass which deletes unused exception handlers ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple interprocedural pass which walks the
+// call-graph, turning invoke instructions into calls, iff the callee cannot
+// throw an exception, and marking functions 'nounwind' if they cannot throw.
+// It implements this as a bottom-up traversal of the call-graph.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "prune-eh"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CFG.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumRemoved, "Number of invokes removed");
+STATISTIC(NumUnreach, "Number of noreturn calls optimized");
+
+namespace {
+  struct PruneEH : public CallGraphSCCPass {
+    static char ID; // Pass identification, replacement for typeid
+    PruneEH() : CallGraphSCCPass(ID) {
+      initializePruneEHPass(*PassRegistry::getPassRegistry());
+    }
+
+    // runOnSCC - Analyze the SCC, performing the transformation if possible.
+    bool runOnSCC(CallGraphSCC &SCC);
+
+    bool SimplifyFunction(Function *F);
+    void DeleteBasicBlock(BasicBlock *BB);
+  };
+}
+
+char PruneEH::ID = 0;
+INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
+                "Remove unused exception handling info", false, false)
+INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_END(PruneEH, "prune-eh",
+                "Remove unused exception handling info", false, false)
+
+Pass *llvm::createPruneEHPass() { return new PruneEH(); }
+
+
+bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
+  SmallPtrSet<CallGraphNode *, 8> SCCNodes;
+  CallGraph &CG = getAnalysis<CallGraph>();
+  bool MadeChange = false;
+
+  // Fill SCCNodes with the elements of the SCC.  Used for quickly
+  // looking up whether a given CallGraphNode is in this SCC.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
+    SCCNodes.insert(*I);
+
+  // First pass, scan all of the functions in the SCC, simplifying them
+  // according to what we know.
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
+    if (Function *F = (*I)->getFunction())
+      MadeChange |= SimplifyFunction(F);
+
+  // Next, check to see if any callees might throw or if there are any external
+  // functions in this SCC: if so, we cannot prune any functions in this SCC.
+  // Definitions that are weak and not declared non-throwing might be 
+  // overridden at linktime with something that throws, so assume that.
+  // If this SCC includes the unwind instruction, we KNOW it throws, so
+  // obviously the SCC might throw.
+  //
+  bool SCCMightUnwind = false, SCCMightReturn = false;
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); 
+       (!SCCMightUnwind || !SCCMightReturn) && I != E; ++I) {
+    Function *F = (*I)->getFunction();
+    if (F == 0) {
+      SCCMightUnwind = true;
+      SCCMightReturn = true;
+    } else if (F->isDeclaration() || F->mayBeOverridden()) {
+      SCCMightUnwind |= !F->doesNotThrow();
+      SCCMightReturn |= !F->doesNotReturn();
+    } else {
+      bool CheckUnwind = !SCCMightUnwind && !F->doesNotThrow();
+      bool CheckReturn = !SCCMightReturn && !F->doesNotReturn();
+
+      if (!CheckUnwind && !CheckReturn)
+        continue;
+
+      // Check to see if this function performs an unwind or calls an
+      // unwinding function.
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+        if (CheckUnwind && isa<ResumeInst>(BB->getTerminator())) {
+          // Uses unwind / resume!
+          SCCMightUnwind = true;
+        } else if (CheckReturn && isa<ReturnInst>(BB->getTerminator())) {
+          SCCMightReturn = true;
+        }
+
+        // Invoke instructions don't allow unwinding to continue, so we are
+        // only interested in call instructions.
+        if (CheckUnwind && !SCCMightUnwind)
+          for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+            if (CallInst *CI = dyn_cast<CallInst>(I)) {
+              if (CI->doesNotThrow()) {
+                // This call cannot throw.
+              } else if (Function *Callee = CI->getCalledFunction()) {
+                CallGraphNode *CalleeNode = CG[Callee];
+                // If the callee is outside our current SCC then we may
+                // throw because it might.
+                if (!SCCNodes.count(CalleeNode)) {
+                  SCCMightUnwind = true;
+                  break;
+                }
+              } else {
+                // Indirect call, it might throw.
+                SCCMightUnwind = true;
+                break;
+              }
+            }
+        if (SCCMightUnwind && SCCMightReturn) break;
+      }
+    }
+  }
+
+  // If the SCC doesn't unwind or doesn't throw, note this fact.
+  if (!SCCMightUnwind || !SCCMightReturn)
+    for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+      AttrBuilder NewAttributes;
+
+      if (!SCCMightUnwind)
+        NewAttributes.addAttribute(Attribute::NoUnwind);
+      if (!SCCMightReturn)
+        NewAttributes.addAttribute(Attribute::NoReturn);
+
+      Function *F = (*I)->getFunction();
+      const AttributeSet &PAL = F->getAttributes();
+      const AttributeSet &NPAL =
+        PAL.addAttributes(F->getContext(), AttributeSet::FunctionIndex,
+                          AttributeSet::get(F->getContext(),
+                                            AttributeSet::FunctionIndex,
+                                            NewAttributes));
+      if (PAL != NPAL) {
+        MadeChange = true;
+        F->setAttributes(NPAL);
+      }
+    }
+
+  for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+    // Convert any invoke instructions to non-throwing functions in this node
+    // into call instructions with a branch.  This makes the exception blocks
+    // dead.
+    if (Function *F = (*I)->getFunction())
+      MadeChange |= SimplifyFunction(F);
+  }
+
+  return MadeChange;
+}
+
+
+// SimplifyFunction - Given information about callees, simplify the specified
+// function if we have invokes to non-unwinding functions or code after calls to
+// no-return functions.
+bool PruneEH::SimplifyFunction(Function *F) {
+  bool MadeChange = false;
+  for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+      if (II->doesNotThrow()) {
+        SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
+        // Insert a call instruction before the invoke.
+        CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II);
+        Call->takeName(II);
+        Call->setCallingConv(II->getCallingConv());
+        Call->setAttributes(II->getAttributes());
+        Call->setDebugLoc(II->getDebugLoc());
+
+        // Anything that used the value produced by the invoke instruction
+        // now uses the value produced by the call instruction.  Note that we
+        // do this even for void functions and calls with no uses so that the
+        // callgraph edge is updated.
+        II->replaceAllUsesWith(Call);
+        BasicBlock *UnwindBlock = II->getUnwindDest();
+        UnwindBlock->removePredecessor(II->getParent());
+
+        // Insert a branch to the normal destination right before the
+        // invoke.
+        BranchInst::Create(II->getNormalDest(), II);
+
+        // Finally, delete the invoke instruction!
+        BB->getInstList().pop_back();
+
+        // If the unwind block is now dead, nuke it.
+        if (pred_begin(UnwindBlock) == pred_end(UnwindBlock))
+          DeleteBasicBlock(UnwindBlock);  // Delete the new BB.
+
+        ++NumRemoved;
+        MadeChange = true;
+      }
+
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
+      if (CallInst *CI = dyn_cast<CallInst>(I++))
+        if (CI->doesNotReturn() && !isa<UnreachableInst>(I)) {
+          // This call calls a function that cannot return.  Insert an
+          // unreachable instruction after it and simplify the code.  Do this
+          // by splitting the BB, adding the unreachable, then deleting the
+          // new BB.
+          BasicBlock *New = BB->splitBasicBlock(I);
+
+          // Remove the uncond branch and add an unreachable.
+          BB->getInstList().pop_back();
+          new UnreachableInst(BB->getContext(), BB);
+
+          DeleteBasicBlock(New);  // Delete the new BB.
+          MadeChange = true;
+          ++NumUnreach;
+          break;
+        }
+  }
+
+  return MadeChange;
+}
+
+/// DeleteBasicBlock - remove the specified basic block from the program,
+/// updating the callgraph to reflect any now-obsolete edges due to calls that
+/// exist in the BB.
+void PruneEH::DeleteBasicBlock(BasicBlock *BB) {
+  assert(pred_begin(BB) == pred_end(BB) && "BB is not dead!");
+  CallGraph &CG = getAnalysis<CallGraph>();
+
+  CallGraphNode *CGN = CG[BB->getParent()];
+  for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
+    --I;
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      if (!isa<IntrinsicInst>(I))
+        CGN->removeCallEdgeFor(CI);
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+      CGN->removeCallEdgeFor(II);
+    if (!I->use_empty())
+      I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  }
+
+  // Get the list of successors of this block.
+  std::vector<BasicBlock*> Succs(succ_begin(BB), succ_end(BB));
+
+  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+    Succs[i]->removePredecessor(BB);
+
+  BB->eraseFromParent();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
new file mode 100644
index 000000000000..f00830aadaad
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -0,0 +1,73 @@
+//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions in the input module, looking for 
+// dead declarations and removes them. Dead declarations are declarations of
+// functions for which no implementation is available (i.e., declarations for
+// unused library functions).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "strip-dead-prototypes"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
+
+namespace {
+
+/// @brief Pass to remove unused function declarations.
+class StripDeadPrototypesPass : public ModulePass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  StripDeadPrototypesPass() : ModulePass(ID) {
+    initializeStripDeadPrototypesPassPass(*PassRegistry::getPassRegistry());
+  }
+  virtual bool runOnModule(Module &M);
+};
+
+} // end anonymous namespace
+
+char StripDeadPrototypesPass::ID = 0;
+INITIALIZE_PASS(StripDeadPrototypesPass, "strip-dead-prototypes",
+                "Strip Unused Function Prototypes", false, false)
+
+bool StripDeadPrototypesPass::runOnModule(Module &M) {
+  bool MadeChange = false;
+  
+  // Erase dead function prototypes.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+    Function *F = I++;
+    // Function must be a prototype and unused.
+    if (F->isDeclaration() && F->use_empty()) {
+      F->eraseFromParent();
+      ++NumDeadPrototypes;
+      MadeChange = true;
+    }
+  }
+
+  // Erase dead global var prototypes.
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ) {
+    GlobalVariable *GV = I++;
+    // Global must be a prototype and unused.
+    if (GV->isDeclaration() && GV->use_empty())
+      GV->eraseFromParent();
+  }
+  
+  // Return an indication of whether we changed anything or not.
+  return MadeChange;
+}
+
+ModulePass *llvm::createStripDeadPrototypesPass() {
+  return new StripDeadPrototypesPass();
+}
diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
new file mode 100644
index 000000000000..5f8681ff454e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
@@ -0,0 +1,414 @@
+//===- StripSymbols.cpp - Strip symbols and debug info from a module ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The StripSymbols transformation implements code stripping. Specifically, it
+// can delete:
+// 
+//   * names for virtual registers
+//   * symbols for internal globals and functions
+//   * debug information
+//
+// Note that this transformation makes code much less readable, so it should
+// only be used in situations where the 'strip' utility would be used, such as
+// reducing code size or making it harder to reverse engineer code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+namespace {
+  class StripSymbols : public ModulePass {
+    bool OnlyDebugInfo;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit StripSymbols(bool ODI = false) 
+      : ModulePass(ID), OnlyDebugInfo(ODI) {
+        initializeStripSymbolsPass(*PassRegistry::getPassRegistry());
+      }
+
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+
+  class StripNonDebugSymbols : public ModulePass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit StripNonDebugSymbols()
+      : ModulePass(ID) {
+        initializeStripNonDebugSymbolsPass(*PassRegistry::getPassRegistry());
+      }
+
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+
+  class StripDebugDeclare : public ModulePass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit StripDebugDeclare()
+      : ModulePass(ID) {
+        initializeStripDebugDeclarePass(*PassRegistry::getPassRegistry());
+      }
+
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+
+  class StripDeadDebugInfo : public ModulePass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit StripDeadDebugInfo()
+      : ModulePass(ID) {
+        initializeStripDeadDebugInfoPass(*PassRegistry::getPassRegistry());
+      }
+
+    virtual bool runOnModule(Module &M);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+  };
+}
+
+char StripSymbols::ID = 0;
+INITIALIZE_PASS(StripSymbols, "strip",
+                "Strip all symbols from a module", false, false)
+
+ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) {
+  return new StripSymbols(OnlyDebugInfo);
+}
+
+char StripNonDebugSymbols::ID = 0;
+INITIALIZE_PASS(StripNonDebugSymbols, "strip-nondebug",
+                "Strip all symbols, except dbg symbols, from a module",
+                false, false)
+
+ModulePass *llvm::createStripNonDebugSymbolsPass() {
+  return new StripNonDebugSymbols();
+}
+
+char StripDebugDeclare::ID = 0;
+INITIALIZE_PASS(StripDebugDeclare, "strip-debug-declare",
+                "Strip all llvm.dbg.declare intrinsics", false, false)
+
+ModulePass *llvm::createStripDebugDeclarePass() {
+  return new StripDebugDeclare();
+}
+
+char StripDeadDebugInfo::ID = 0;
+INITIALIZE_PASS(StripDeadDebugInfo, "strip-dead-debug-info",
+                "Strip debug info for unused symbols", false, false)
+
+ModulePass *llvm::createStripDeadDebugInfoPass() {
+  return new StripDeadDebugInfo();
+}
+
+/// OnlyUsedBy - Return true if V is only used by Usr.
+static bool OnlyUsedBy(Value *V, Value *Usr) {
+  for(Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) {
+    User *U = *I;
+    if (U != Usr)
+      return false;
+  }
+  return true;
+}
+
+static void RemoveDeadConstant(Constant *C) {
+  assert(C->use_empty() && "Constant is not dead!");
+  SmallPtrSet<Constant*, 4> Operands;
+  for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
+    if (OnlyUsedBy(C->getOperand(i), C)) 
+      Operands.insert(cast<Constant>(C->getOperand(i)));
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+    if (!GV->hasLocalLinkage()) return;   // Don't delete non static globals.
+    GV->eraseFromParent();
+  }
+  else if (!isa<Function>(C))
+    if (isa<CompositeType>(C->getType()))
+      C->destroyConstant();
+
+  // If the constant referenced anything, see if we can delete it as well.
+  for (SmallPtrSet<Constant*, 4>::iterator OI = Operands.begin(),
+         OE = Operands.end(); OI != OE; ++OI)
+    RemoveDeadConstant(*OI);
+}
+
+// Strip the symbol table of its names.
+//
+static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) {
+  for (ValueSymbolTable::iterator VI = ST.begin(), VE = ST.end(); VI != VE; ) {
+    Value *V = VI->getValue();
+    ++VI;
+    if (!isa<GlobalValue>(V) || cast<GlobalValue>(V)->hasLocalLinkage()) {
+      if (!PreserveDbgInfo || !V->getName().startswith("llvm.dbg"))
+        // Set name to "", removing from symbol table!
+        V->setName("");
+    }
+  }
+}
+
+// Strip any named types of their names.
+static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
+  TypeFinder StructTypes;
+  StructTypes.run(M, false);
+
+  for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
+    StructType *STy = StructTypes[i];
+    if (STy->isLiteral() || STy->getName().empty()) continue;
+    
+    if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
+      continue;
+
+    STy->setName("");
+  }
+}
+
+/// Find values that are marked as llvm.used.
+static void findUsedValues(GlobalVariable *LLVMUsed,
+                           SmallPtrSet<const GlobalValue*, 8> &UsedValues) {
+  if (LLVMUsed == 0) return;
+  UsedValues.insert(LLVMUsed);
+  
+  ConstantArray *Inits = dyn_cast<ConstantArray>(LLVMUsed->getInitializer());
+  if (Inits == 0) return;
+  
+  for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
+    if (GlobalValue *GV = 
+          dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
+      UsedValues.insert(GV);
+}
+
+/// StripSymbolNames - Strip symbol names.
+static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
+
+  SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
+  findUsedValues(M.getGlobalVariable("llvm.used"), llvmUsedValues);
+  findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues);
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
+      if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
+        I->setName("");     // Internal symbols can't participate in linkage
+  }
+  
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
+      if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
+        I->setName("");     // Internal symbols can't participate in linkage
+    StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo);
+  }
+  
+  // Remove all names from types.
+  StripTypeNames(M, PreserveDbgInfo);
+
+  return true;
+}
+
+// StripDebugInfo - Strip debug info in the module if it exists.  
+// To do this, we remove llvm.dbg.func.start, llvm.dbg.stoppoint, and 
+// llvm.dbg.region.end calls, and any globals they point to if now dead.
+static bool StripDebugInfo(Module &M) {
+
+  bool Changed = false;
+
+  // Remove all of the calls to the debugger intrinsics, and remove them from
+  // the module.
+  if (Function *Declare = M.getFunction("llvm.dbg.declare")) {
+    while (!Declare->use_empty()) {
+      CallInst *CI = cast<CallInst>(Declare->use_back());
+      CI->eraseFromParent();
+    }
+    Declare->eraseFromParent();
+    Changed = true;
+  }
+
+  if (Function *DbgVal = M.getFunction("llvm.dbg.value")) {
+    while (!DbgVal->use_empty()) {
+      CallInst *CI = cast<CallInst>(DbgVal->use_back());
+      CI->eraseFromParent();
+    }
+    DbgVal->eraseFromParent();
+    Changed = true;
+  }
+
+  for (Module::named_metadata_iterator NMI = M.named_metadata_begin(),
+         NME = M.named_metadata_end(); NMI != NME;) {
+    NamedMDNode *NMD = NMI;
+    ++NMI;
+    if (NMD->getName().startswith("llvm.dbg.")) {
+      NMD->eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI)
+    for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE;
+         ++FI)
+      for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
+           ++BI) {
+        if (!BI->getDebugLoc().isUnknown()) {
+          Changed = true;
+          BI->setDebugLoc(DebugLoc());
+        }
+      }
+
+  return Changed;
+}
+
+bool StripSymbols::runOnModule(Module &M) {
+  bool Changed = false;
+  Changed |= StripDebugInfo(M);
+  if (!OnlyDebugInfo)
+    Changed |= StripSymbolNames(M, false);
+  return Changed;
+}
+
+bool StripNonDebugSymbols::runOnModule(Module &M) {
+  return StripSymbolNames(M, true);
+}
+
+bool StripDebugDeclare::runOnModule(Module &M) {
+
+  Function *Declare = M.getFunction("llvm.dbg.declare");
+  std::vector<Constant*> DeadConstants;
+
+  if (Declare) {
+    while (!Declare->use_empty()) {
+      CallInst *CI = cast<CallInst>(Declare->use_back());
+      Value *Arg1 = CI->getArgOperand(0);
+      Value *Arg2 = CI->getArgOperand(1);
+      assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+      CI->eraseFromParent();
+      if (Arg1->use_empty()) {
+        if (Constant *C = dyn_cast<Constant>(Arg1)) 
+          DeadConstants.push_back(C);
+        else 
+          RecursivelyDeleteTriviallyDeadInstructions(Arg1);
+      }
+      if (Arg2->use_empty())
+        if (Constant *C = dyn_cast<Constant>(Arg2)) 
+          DeadConstants.push_back(C);
+    }
+    Declare->eraseFromParent();
+  }
+
+  while (!DeadConstants.empty()) {
+    Constant *C = DeadConstants.back();
+    DeadConstants.pop_back();
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+      if (GV->hasLocalLinkage())
+        RemoveDeadConstant(GV);
+    } else
+      RemoveDeadConstant(C);
+  }
+
+  return true;
+}
+
+/// getRealLinkageName - If special LLVM prefix that is used to inform the asm 
+/// printer to not emit usual symbol prefix before the symbol name is used then
+/// return linkage name after skipping this special LLVM prefix.
+static StringRef getRealLinkageName(StringRef LinkageName) {
+  char One = '\1';
+  if (LinkageName.startswith(StringRef(&One, 1)))
+    return LinkageName.substr(1);
+  return LinkageName;
+}
+
+bool StripDeadDebugInfo::runOnModule(Module &M) {
+  bool Changed = false;
+
+  // Debugging infomration is encoded in llvm IR using metadata. This is designed
+  // such a way that debug info for symbols preserved even if symbols are
+  // optimized away by the optimizer. This special pass removes debug info for 
+  // such symbols.
+
+  // llvm.dbg.gv keeps track of debug info for global variables.
+  if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv")) {
+    SmallVector<MDNode *, 8> MDs;
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
+      if (DIGlobalVariable(NMD->getOperand(i)).Verify())
+        MDs.push_back(NMD->getOperand(i));
+      else
+        Changed = true;
+    NMD->eraseFromParent();
+    NMD = NULL;
+
+    for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(),
+           E = MDs.end(); I != E; ++I) {
+      GlobalVariable *GV = DIGlobalVariable(*I).getGlobal();
+      if (GV && M.getGlobalVariable(GV->getName(), true)) {
+        if (!NMD)
+          NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv");
+        NMD->addOperand(*I);
+      }
+      else
+        Changed = true;
+    }
+  }
+
+  // llvm.dbg.sp keeps track of debug info for subprograms.
+  if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp")) {
+    SmallVector<MDNode *, 8> MDs;
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
+      if (DISubprogram(NMD->getOperand(i)).Verify())
+        MDs.push_back(NMD->getOperand(i));
+      else
+        Changed = true;
+    NMD->eraseFromParent();
+    NMD = NULL;
+
+    for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(),
+           E = MDs.end(); I != E; ++I) {
+      bool FnIsLive = false;
+      if (Function *F = DISubprogram(*I).getFunction())
+        if (M.getFunction(F->getName()))
+          FnIsLive = true;
+      if (FnIsLive) {
+          if (!NMD)
+            NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
+          NMD->addOperand(*I);
+      } else {
+        // Remove llvm.dbg.lv.fnname named mdnode which may have been used
+        // to hold debug info for dead function's local variables.
+        StringRef FName = DISubprogram(*I).getLinkageName();
+        if (FName.empty())
+          FName = DISubprogram(*I).getName();
+        if (NamedMDNode *LVNMD = 
+            M.getNamedMetadata(Twine("llvm.dbg.lv.", 
+                                     getRealLinkageName(FName)))) 
+          LVNMD->eraseFromParent();
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h
new file mode 100644
index 000000000000..1f6a3a5e335d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombine.h
@@ -0,0 +1,389 @@
+//===- InstCombine.h - Main InstCombine pass definition -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef INSTCOMBINE_INSTCOMBINE_H
+#define INSTCOMBINE_INSTCOMBINE_H
+
+#include "InstCombineWorklist.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/TargetFolder.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+
+namespace llvm {
+  class CallSite;
+  class DataLayout;
+  class TargetLibraryInfo;
+  class DbgDeclareInst;
+  class MemIntrinsic;
+  class MemSetInst;
+
+/// SelectPatternFlavor - We can match a variety of different patterns for
+/// select operations.
+enum SelectPatternFlavor {
+  SPF_UNKNOWN = 0,
+  SPF_SMIN, SPF_UMIN,
+  SPF_SMAX, SPF_UMAX
+  //SPF_ABS - TODO.
+};
+
+/// getComplexity:  Assign a complexity or rank value to LLVM Values...
+///   0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst
+static inline unsigned getComplexity(Value *V) {
+  if (isa<Instruction>(V)) {
+    if (BinaryOperator::isNeg(V) ||
+        BinaryOperator::isFNeg(V) ||
+        BinaryOperator::isNot(V))
+      return 3;
+    return 4;
+  }
+  if (isa<Argument>(V)) return 3;
+  return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
+}
+
+
+/// InstCombineIRInserter - This is an IRBuilder insertion helper that works
+/// just like the normal insertion helper, but also adds any new instructions
+/// to the instcombine worklist.
+class LLVM_LIBRARY_VISIBILITY InstCombineIRInserter
+    : public IRBuilderDefaultInserter<true> {
+  InstCombineWorklist &Worklist;
+public:
+  InstCombineIRInserter(InstCombineWorklist &WL) : Worklist(WL) {}
+
+  void InsertHelper(Instruction *I, const Twine &Name,
+                    BasicBlock *BB, BasicBlock::iterator InsertPt) const {
+    IRBuilderDefaultInserter<true>::InsertHelper(I, Name, BB, InsertPt);
+    Worklist.Add(I);
+  }
+};
+
+/// InstCombiner - The -instcombine pass.
+class LLVM_LIBRARY_VISIBILITY InstCombiner
+                             : public FunctionPass,
+                               public InstVisitor<InstCombiner, Instruction*> {
+  DataLayout *TD;
+  TargetLibraryInfo *TLI;
+  bool MadeIRChange;
+  LibCallSimplifier *Simplifier;
+  bool MinimizeSize;
+public:
+  /// Worklist - All of the instructions that need to be simplified.
+  InstCombineWorklist Worklist;
+
+  /// Builder - This is an IRBuilder that automatically inserts new
+  /// instructions into the worklist when they are created.
+  typedef IRBuilder<true, TargetFolder, InstCombineIRInserter> BuilderTy;
+  BuilderTy *Builder;
+
+  static char ID; // Pass identification, replacement for typeid
+  InstCombiner() : FunctionPass(ID), TD(0), Builder(0) {
+    MinimizeSize = false;
+    initializeInstCombinerPass(*PassRegistry::getPassRegistry());
+  }
+
+public:
+  virtual bool runOnFunction(Function &F);
+
+  bool DoOneIteration(Function &F, unsigned ItNum);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  DataLayout *getDataLayout() const { return TD; }
+
+  TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; }
+
+  // Visitation implementation - Implement instruction combining for different
+  // instruction types.  The semantics are as follows:
+  // Return Value:
+  //    null        - No change was made
+  //     I          - Change was made, I is still valid, I may be dead though
+  //   otherwise    - Change was made, replace I with returned instruction
+  //
+  Instruction *visitAdd(BinaryOperator &I);
+  Instruction *visitFAdd(BinaryOperator &I);
+  Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty);
+  Instruction *visitSub(BinaryOperator &I);
+  Instruction *visitFSub(BinaryOperator &I);
+  Instruction *visitMul(BinaryOperator &I);
+  Value *foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C,
+                       Instruction *InsertBefore);
+  Instruction *visitFMul(BinaryOperator &I);
+  Instruction *visitURem(BinaryOperator &I);
+  Instruction *visitSRem(BinaryOperator &I);
+  Instruction *visitFRem(BinaryOperator &I);
+  bool SimplifyDivRemOfSelect(BinaryOperator &I);
+  Instruction *commonRemTransforms(BinaryOperator &I);
+  Instruction *commonIRemTransforms(BinaryOperator &I);
+  Instruction *commonDivTransforms(BinaryOperator &I);
+  Instruction *commonIDivTransforms(BinaryOperator &I);
+  Instruction *visitUDiv(BinaryOperator &I);
+  Instruction *visitSDiv(BinaryOperator &I);
+  Instruction *visitFDiv(BinaryOperator &I);
+  Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+  Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+  Instruction *visitAnd(BinaryOperator &I);
+  Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+  Value *FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+  Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op,
+                                   Value *A, Value *B, Value *C);
+  Instruction *visitOr (BinaryOperator &I);
+  Instruction *visitXor(BinaryOperator &I);
+  Instruction *visitShl(BinaryOperator &I);
+  Instruction *visitAShr(BinaryOperator &I);
+  Instruction *visitLShr(BinaryOperator &I);
+  Instruction *commonShiftTransforms(BinaryOperator &I);
+  Instruction *FoldFCmp_IntToFP_Cst(FCmpInst &I, Instruction *LHSI,
+                                    Constant *RHSC);
+  Instruction *FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
+                                            GlobalVariable *GV, CmpInst &ICI,
+                                            ConstantInt *AndCst = 0);
+  Instruction *visitFCmpInst(FCmpInst &I);
+  Instruction *visitICmpInst(ICmpInst &I);
+  Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI);
+  Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
+                                              Instruction *LHS,
+                                              ConstantInt *RHS);
+  Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
+                              ConstantInt *DivRHS);
+  Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI,
+                              ConstantInt *DivRHS);
+  Instruction *FoldICmpAddOpCst(ICmpInst &ICI, Value *X, ConstantInt *CI,
+                                ICmpInst::Predicate Pred, Value *TheAdd);
+  Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+                           ICmpInst::Predicate Cond, Instruction &I);
+  Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
+                                   BinaryOperator &I);
+  Instruction *commonCastTransforms(CastInst &CI);
+  Instruction *commonPointerCastTransforms(CastInst &CI);
+  Instruction *visitTrunc(TruncInst &CI);
+  Instruction *visitZExt(ZExtInst &CI);
+  Instruction *visitSExt(SExtInst &CI);
+  Instruction *visitFPTrunc(FPTruncInst &CI);
+  Instruction *visitFPExt(CastInst &CI);
+  Instruction *visitFPToUI(FPToUIInst &FI);
+  Instruction *visitFPToSI(FPToSIInst &FI);
+  Instruction *visitUIToFP(CastInst &CI);
+  Instruction *visitSIToFP(CastInst &CI);
+  Instruction *visitPtrToInt(PtrToIntInst &CI);
+  Instruction *visitIntToPtr(IntToPtrInst &CI);
+  Instruction *visitBitCast(BitCastInst &CI);
+  Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI,
+                              Instruction *FI);
+  Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*);
+  Instruction *FoldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
+                            Value *A, Value *B, Instruction &Outer,
+                            SelectPatternFlavor SPF2, Value *C);
+  Instruction *visitSelectInst(SelectInst &SI);
+  Instruction *visitSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+  Instruction *visitCallInst(CallInst &CI);
+  Instruction *visitInvokeInst(InvokeInst &II);
+
+  Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
+  Instruction *visitPHINode(PHINode &PN);
+  Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+  Instruction *visitAllocaInst(AllocaInst &AI);
+  Instruction *visitAllocSite(Instruction &FI);
+  Instruction *visitFree(CallInst &FI);
+  Instruction *visitLoadInst(LoadInst &LI);
+  Instruction *visitStoreInst(StoreInst &SI);
+  Instruction *visitBranchInst(BranchInst &BI);
+  Instruction *visitSwitchInst(SwitchInst &SI);
+  Instruction *visitInsertElementInst(InsertElementInst &IE);
+  Instruction *visitExtractElementInst(ExtractElementInst &EI);
+  Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+  Instruction *visitExtractValueInst(ExtractValueInst &EV);
+  Instruction *visitLandingPadInst(LandingPadInst &LI);
+
+  // visitInstruction - Specify what to return for unhandled instructions...
+  Instruction *visitInstruction(Instruction &I) { return 0; }
+
+private:
+  bool ShouldChangeType(Type *From, Type *To) const;
+  Value *dyn_castNegVal(Value *V) const;
+  Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const;
+  Type *FindElementAtOffset(Type *Ty, int64_t Offset,
+                                  SmallVectorImpl<Value*> &NewIndices);
+  Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
+
+  /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
+  /// results in any code being generated and is interesting to optimize out. If
+  /// the cast can be eliminated by some other simple transformation, we prefer
+  /// to do the simplification first.
+  bool ShouldOptimizeCast(Instruction::CastOps opcode,const Value *V,
+                          Type *Ty);
+
+  Instruction *visitCallSite(CallSite CS);
+  Instruction *tryOptimizeCall(CallInst *CI, const DataLayout *TD);
+  bool transformConstExprCastCall(CallSite CS);
+  Instruction *transformCallThroughTrampoline(CallSite CS,
+                                              IntrinsicInst *Tramp);
+  Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
+                                 bool DoXform = true);
+  Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
+  bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS);
+  Value *EmitGEPOffset(User *GEP);
+
+public:
+  // InsertNewInstBefore - insert an instruction New before instruction Old
+  // in the program.  Add the new instruction to the worklist.
+  //
+  Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
+    assert(New && New->getParent() == 0 &&
+           "New instruction already inserted into a basic block!");
+    BasicBlock *BB = Old.getParent();
+    BB->getInstList().insert(&Old, New);  // Insert inst
+    Worklist.Add(New);
+    return New;
+  }
+
+  // InsertNewInstWith - same as InsertNewInstBefore, but also sets the
+  // debug loc.
+  //
+  Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
+    New->setDebugLoc(Old.getDebugLoc());
+    return InsertNewInstBefore(New, Old);
+  }
+
+  // ReplaceInstUsesWith - This method is to be used when an instruction is
+  // found to be dead, replacable with another preexisting expression.  Here
+  // we add all uses of I to the worklist, replace all uses of I with the new
+  // value, then return I, so that the inst combiner will know that I was
+  // modified.
+  //
+  Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
+    Worklist.AddUsersToWorkList(I);   // Add all modified instrs to worklist.
+
+    // If we are replacing the instruction with itself, this must be in a
+    // segment of unreachable code, so just clobber the instruction.
+    if (&I == V)
+      V = UndefValue::get(I.getType());
+
+    DEBUG(errs() << "IC: Replacing " << I << "\n"
+                    "    with " << *V << '\n');
+
+    I.replaceAllUsesWith(V);
+    return &I;
+  }
+
+  // EraseInstFromFunction - When dealing with an instruction that has side
+  // effects or produces a void value, we can't rely on DCE to delete the
+  // instruction.  Instead, visit methods should return the value returned by
+  // this function.
+  Instruction *EraseInstFromFunction(Instruction &I) {
+    DEBUG(errs() << "IC: ERASE " << I << '\n');
+
+    assert(I.use_empty() && "Cannot erase instruction that is used!");
+    // Make sure that we reprocess all operands now that we reduced their
+    // use counts.
+    if (I.getNumOperands() < 8) {
+      for (User::op_iterator i = I.op_begin(), e = I.op_end(); i != e; ++i)
+        if (Instruction *Op = dyn_cast<Instruction>(*i))
+          Worklist.Add(Op);
+    }
+    Worklist.Remove(&I);
+    I.eraseFromParent();
+    MadeIRChange = true;
+    return 0;  // Don't do anything with FI
+  }
+
+  void ComputeMaskedBits(Value *V, APInt &KnownZero,
+                         APInt &KnownOne, unsigned Depth = 0) const {
+    return llvm::ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth);
+  }
+
+  bool MaskedValueIsZero(Value *V, const APInt &Mask,
+                         unsigned Depth = 0) const {
+    return llvm::MaskedValueIsZero(V, Mask, TD, Depth);
+  }
+  unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0) const {
+    return llvm::ComputeNumSignBits(Op, TD, Depth);
+  }
+
+private:
+
+  /// SimplifyAssociativeOrCommutative - This performs a few simplifications for
+  /// operators which are associative or commutative.
+  bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
+
+  /// SimplifyUsingDistributiveLaws - This tries to simplify binary operations
+  /// which some other binary operation distributes over either by factorizing
+  /// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this
+  /// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is
+  /// a win).  Returns the simplified value, or null if it didn't simplify.
+  Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
+
+  /// SimplifyDemandedUseBits - Attempts to replace V with a simpler value
+  /// based on the demanded bits.
+  Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
+                                 APInt& KnownZero, APInt& KnownOne,
+                                 unsigned Depth);
+  bool SimplifyDemandedBits(Use &U, APInt DemandedMask,
+                            APInt& KnownZero, APInt& KnownOne,
+                            unsigned Depth=0);
+  /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
+  /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
+  Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl,
+                                    APInt DemandedMask, APInt &KnownZero,
+                                    APInt &KnownOne);
+
+  /// SimplifyDemandedInstructionBits - Inst is an integer instruction that
+  /// SimplifyDemandedBits knows about.  See if the instruction has any
+  /// properties that allow us to simplify its operands.
+  bool SimplifyDemandedInstructionBits(Instruction &Inst);
+
+  Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+                                    APInt& UndefElts, unsigned Depth = 0);
+
+  // FoldOpIntoPhi - Given a binary operator, cast instruction, or select
+  // which has a PHI node as operand #0, see if we can fold the instruction
+  // into the PHI (which is only possible if all operands to the PHI are
+  // constants).
+  //
+  Instruction *FoldOpIntoPhi(Instruction &I);
+
+  // FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
+  // operator and they all are only used by the PHI, PHI together their
+  // inputs, and do the operation once, to the result of the PHI.
+  Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
+  Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
+  Instruction *FoldPHIArgGEPIntoPHI(PHINode &PN);
+  Instruction *FoldPHIArgLoadIntoPHI(PHINode &PN);
+
+
+  Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
+                        ConstantInt *AndRHS, BinaryOperator &TheAnd);
+
+  Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
+                            bool isSub, Instruction &I);
+  Value *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
+                         bool isSigned, bool Inside);
+  Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
+  Instruction *MatchBSwap(BinaryOperator &I);
+  bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
+  Instruction *SimplifyMemSet(MemSetInst *MI);
+
+
+  Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
+
+  /// Descale - Return a value X such that Val = X * Scale, or null if none.  If
+  /// the multiplication is known not to overflow then NoSignedWrap is set.
+  Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
+};
+
+
+
+} // end namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
new file mode 100644
index 000000000000..7595da08d3e8
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -0,0 +1,1497 @@
+//===- InstCombineAddSub.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for add, fadd, sub, and fsub.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+namespace {
+
+  /// Class representing coefficient of floating-point addend.
+  /// This class needs to be highly efficient, which is especially true for
+  /// the constructor. As of I write this comment, the cost of the default
+  /// constructor is merely 4-byte-store-zero (Assuming compiler is able to 
+  /// perform write-merging).
+  /// 
+  class FAddendCoef {
+  public:
+    // The constructor has to initialize a APFloat, which is uncessary for
+    // most addends which have coefficient either 1 or -1. So, the constructor
+    // is expensive. In order to avoid the cost of the constructor, we should
+    // reuse some instances whenever possible. The pre-created instances
+    // FAddCombine::Add[0-5] embodies this idea.
+    //
+    FAddendCoef() : IsFp(false), BufHasFpVal(false), IntVal(0) {}
+    ~FAddendCoef();
+  
+    void set(short C) {
+      assert(!insaneIntVal(C) && "Insane coefficient");
+      IsFp = false; IntVal = C;
+    }
+  
+    void set(const APFloat& C);
+
+    void negate();
+  
+    bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
+    Value *getValue(Type *) const;
+  
+    // If possible, don't define operator+/operator- etc because these
+    // operators inevitably call FAddendCoef's constructor which is not cheap.
+    void operator=(const FAddendCoef &A);
+    void operator+=(const FAddendCoef &A);
+    void operator-=(const FAddendCoef &A);
+    void operator*=(const FAddendCoef &S);
+  
+    bool isOne() const { return isInt() && IntVal == 1; }
+    bool isTwo() const { return isInt() && IntVal == 2; }
+    bool isMinusOne() const { return isInt() && IntVal == -1; }
+    bool isMinusTwo() const { return isInt() && IntVal == -2; }
+  
+  private:
+    bool insaneIntVal(int V) { return V > 4 || V < -4; }
+    APFloat *getFpValPtr(void)
+      { return reinterpret_cast<APFloat*>(&FpValBuf.buffer[0]); }
+    const APFloat *getFpValPtr(void) const
+      { return reinterpret_cast<const APFloat*>(&FpValBuf.buffer[0]); }
+
+    const APFloat &getFpVal(void) const {
+      assert(IsFp && BufHasFpVal && "Incorret state");
+      return *getFpValPtr();
+    }
+
+    APFloat &getFpVal(void)
+      { assert(IsFp && BufHasFpVal && "Incorret state"); return *getFpValPtr(); }
+  
+    bool isInt() const { return !IsFp; }
+
+    // If the coefficient is represented by an integer, promote it to a
+    // floating point. 
+    void convertToFpType(const fltSemantics &Sem);
+
+    // Construct an APFloat from a signed integer.
+    // TODO: We should get rid of this function when APFloat can be constructed
+    //       from an *SIGNED* integer. 
+    APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
+  private:
+
+    bool IsFp;
+  
+    // True iff FpValBuf contains an instance of APFloat.
+    bool BufHasFpVal;
+  
+    // The integer coefficient of an individual addend is either 1 or -1,
+    // and we try to simplify at most 4 addends from neighboring at most
+    // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt
+    // is overkill of this end.
+    short IntVal;
+
+    AlignedCharArrayUnion<APFloat> FpValBuf;
+  };
+  
+  /// FAddend is used to represent floating-point addend. An addend is
+  /// represented as <C, V>, where the V is a symbolic value, and C is a
+  /// constant coefficient. A constant addend is represented as <C, 0>.
+  ///
+  class FAddend {
+  public:
+    FAddend() { Val = 0; }
+  
+    Value *getSymVal (void) const { return Val; }
+    const FAddendCoef &getCoef(void) const { return Coeff; }
+  
+    bool isConstant() const { return Val == 0; }
+    bool isZero() const { return Coeff.isZero(); }
+
+    void set(short Coefficient, Value *V) { Coeff.set(Coefficient), Val = V; }
+    void set(const APFloat& Coefficient, Value *V)
+      { Coeff.set(Coefficient); Val = V; }
+    void set(const ConstantFP* Coefficient, Value *V)
+      { Coeff.set(Coefficient->getValueAPF()); Val = V; }
+  
+    void negate() { Coeff.negate(); }
+  
+    /// Drill down the U-D chain one step to find the definition of V, and
+    /// try to break the definition into one or two addends.
+    static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
+  
+    /// Similar to FAddend::drillDownOneStep() except that the value being
+    /// splitted is the addend itself.
+    unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
+  
+    void operator+=(const FAddend &T) {
+      assert((Val == T.Val) && "Symbolic-values disagree");
+      Coeff += T.Coeff;
+    }
+
+  private:
+    void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
+  
+    // This addend has the value of "Coeff * Val".
+    Value *Val;
+    FAddendCoef Coeff;
+  };
+  
+  /// FAddCombine is the class for optimizing an unsafe fadd/fsub along
+  /// with its neighboring at most two instructions.
+  ///
+  class FAddCombine {
+  public:
+    FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {}
+    Value *simplify(Instruction *FAdd);
+  
+  private:
+    typedef SmallVector<const FAddend*, 4> AddendVect;
+  
+    Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
+
+    Value *performFactorization(Instruction *I);
+
+    /// Convert given addend to a Value
+    Value *createAddendVal(const FAddend &A, bool& NeedNeg);
+    
+    /// Return the number of instructions needed to emit the N-ary addition.
+    unsigned calcInstrNumber(const AddendVect& Vect);
+    Value *createFSub(Value *Opnd0, Value *Opnd1);
+    Value *createFAdd(Value *Opnd0, Value *Opnd1);
+    Value *createFMul(Value *Opnd0, Value *Opnd1);
+    Value *createFDiv(Value *Opnd0, Value *Opnd1);
+    Value *createFNeg(Value *V);
+    Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
+    void createInstPostProc(Instruction *NewInst);
+  
+    InstCombiner::BuilderTy *Builder;
+    Instruction *Instr;
+  
+  private:
+     // Debugging stuff are clustered here.
+    #ifndef NDEBUG
+      unsigned CreateInstrNum;
+      void initCreateInstNum() { CreateInstrNum = 0; }
+      void incCreateInstNum() { CreateInstrNum++; }
+    #else
+      void initCreateInstNum() {}
+      void incCreateInstNum() {}
+    #endif
+  };
+} 
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of
+//    {FAddendCoef, FAddend, FAddition, FAddCombine}.
+//
+//===----------------------------------------------------------------------===//
+FAddendCoef::~FAddendCoef() {
+  if (BufHasFpVal)
+    getFpValPtr()->~APFloat();
+}
+
+void FAddendCoef::set(const APFloat& C) {
+  APFloat *P = getFpValPtr();
+
+  if (isInt()) {
+    // As the buffer is meanless byte stream, we cannot call
+    // APFloat::operator=().
+    new(P) APFloat(C);
+  } else
+    *P = C;
+
+  IsFp = BufHasFpVal = true; 
+}
+
+void FAddendCoef::convertToFpType(const fltSemantics &Sem) {
+  if (!isInt())
+    return;
+
+  APFloat *P = getFpValPtr();
+  if (IntVal > 0)
+    new(P) APFloat(Sem, IntVal);
+  else {
+    new(P) APFloat(Sem, 0 - IntVal);
+    P->changeSign();
+  }
+  IsFp = BufHasFpVal = true; 
+}
+
+APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) {
+  if (Val >= 0)
+    return APFloat(Sem, Val);
+
+  APFloat T(Sem, 0 - Val);
+  T.changeSign();
+
+  return T;
+}
+
+void FAddendCoef::operator=(const FAddendCoef &That) {
+  if (That.isInt())
+    set(That.IntVal);
+  else
+    set(That.getFpVal());
+}
+
+void FAddendCoef::operator+=(const FAddendCoef &That) {
+  enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+  if (isInt() == That.isInt()) {
+    if (isInt())
+      IntVal += That.IntVal;
+    else
+      getFpVal().add(That.getFpVal(), RndMode);
+    return;
+  }
+  
+  if (isInt()) {
+    const APFloat &T = That.getFpVal();
+    convertToFpType(T.getSemantics());
+    getFpVal().add(T, RndMode);
+    return;
+  }
+  
+  APFloat &T = getFpVal();
+  T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
+}
+
+void FAddendCoef::operator-=(const FAddendCoef &That) {
+  enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+  if (isInt() == That.isInt()) {
+    if (isInt())
+      IntVal -= That.IntVal;
+    else
+      getFpVal().subtract(That.getFpVal(), RndMode);
+    return;
+  }
+  
+  if (isInt()) {
+    const APFloat &T = That.getFpVal();
+    convertToFpType(T.getSemantics());
+    getFpVal().subtract(T, RndMode);
+    return;
+  }
+
+  APFloat &T = getFpVal();
+  T.subtract(createAPFloatFromInt(T.getSemantics(), IntVal), RndMode);
+}
+
+void FAddendCoef::operator*=(const FAddendCoef &That) {
+  if (That.isOne())
+    return;
+
+  if (That.isMinusOne()) {
+    negate();
+    return;
+  }
+
+  if (isInt() && That.isInt()) {
+    int Res = IntVal * (int)That.IntVal;
+    assert(!insaneIntVal(Res) && "Insane int value");
+    IntVal = Res;
+    return;
+  }
+
+  const fltSemantics &Semantic = 
+    isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
+
+  if (isInt())
+    convertToFpType(Semantic);
+  APFloat &F0 = getFpVal();
+
+  if (That.isInt())
+    F0.multiply(createAPFloatFromInt(Semantic, That.IntVal),
+                APFloat::rmNearestTiesToEven);
+  else
+    F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
+
+  return;
+}
+
+void FAddendCoef::negate() {
+  if (isInt())
+    IntVal = 0 - IntVal;
+  else
+    getFpVal().changeSign();
+}
+
+Value *FAddendCoef::getValue(Type *Ty) const {
+  return isInt() ?
+    ConstantFP::get(Ty, float(IntVal)) :
+    ConstantFP::get(Ty->getContext(), getFpVal());
+}
+
+// The definition of <Val>     Addends
+// =========================================
+//  A + B                     <1, A>, <1,B>
+//  A - B                     <1, A>, <1,B>
+//  0 - B                     <-1, B>
+//  C * A,                    <C, A>
+//  A + C                     <1, A> <C, NULL> 
+//  0 +/- 0                   <0, NULL> (corner case)
+//
+// Legend: A and B are not constant, C is constant
+// 
+unsigned FAddend::drillValueDownOneStep
+  (Value *Val, FAddend &Addend0, FAddend &Addend1) {
+  Instruction *I = 0;
+  if (Val == 0 || !(I = dyn_cast<Instruction>(Val)))
+    return 0;
+
+  unsigned Opcode = I->getOpcode();
+
+  if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
+    ConstantFP *C0, *C1;
+    Value *Opnd0 = I->getOperand(0);
+    Value *Opnd1 = I->getOperand(1);
+    if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
+      Opnd0 = 0;
+
+    if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
+      Opnd1 = 0;
+
+    if (Opnd0) {
+      if (!C0)
+        Addend0.set(1, Opnd0);
+      else
+        Addend0.set(C0, 0);
+    }
+
+    if (Opnd1) {
+      FAddend &Addend = Opnd0 ? Addend1 : Addend0;
+      if (!C1)
+        Addend.set(1, Opnd1);
+      else
+        Addend.set(C1, 0);
+      if (Opcode == Instruction::FSub)
+        Addend.negate();
+    }
+
+    if (Opnd0 || Opnd1)
+      return Opnd0 && Opnd1 ? 2 : 1;
+
+    // Both operands are zero. Weird!
+    Addend0.set(APFloat(C0->getValueAPF().getSemantics()), 0);
+    return 1;
+  }
+
+  if (I->getOpcode() == Instruction::FMul) {
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+    if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
+      Addend0.set(C, V1);
+      return 1;
+    }
+
+    if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
+      Addend0.set(C, V0);
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Try to break *this* addend into two addends. e.g. Suppose this addend is
+// <2.3, V>, and V = X + Y, by calling this function, we obtain two addends,
+// i.e. <2.3, X> and <2.3, Y>.
+//
+unsigned FAddend::drillAddendDownOneStep
+  (FAddend &Addend0, FAddend &Addend1) const {
+  if (isConstant())
+    return 0;
+
+  unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
+  if (!BreakNum || Coeff.isOne()) 
+    return BreakNum;
+
+  Addend0.Scale(Coeff);
+
+  if (BreakNum == 2)
+    Addend1.Scale(Coeff);
+
+  return BreakNum;
+}
+
+// Try to perform following optimization on the input instruction I. Return the
+// simplified expression if was successful; otherwise, return 0.
+//
+//   Instruction "I" is                Simplified into
+// -------------------------------------------------------
+//   (x * y) +/- (x * z)               x * (y +/- z)
+//   (y / x) +/- (z / x)               (y +/- z) / x
+//
+Value *FAddCombine::performFactorization(Instruction *I) {
+  assert((I->getOpcode() == Instruction::FAdd ||
+          I->getOpcode() == Instruction::FSub) && "Expect add/sub");
+  
+  Instruction *I0 = dyn_cast<Instruction>(I->getOperand(0));
+  Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1));
+  
+  if (!I0 || !I1 || I0->getOpcode() != I1->getOpcode())
+    return 0;
+
+  bool isMpy = false;
+  if (I0->getOpcode() == Instruction::FMul)
+    isMpy = true;
+  else if (I0->getOpcode() != Instruction::FDiv)
+    return 0;
+
+  Value *Opnd0_0 = I0->getOperand(0);
+  Value *Opnd0_1 = I0->getOperand(1);
+  Value *Opnd1_0 = I1->getOperand(0);
+  Value *Opnd1_1 = I1->getOperand(1);
+
+  //  Input Instr I       Factor   AddSub0  AddSub1 
+  //  ----------------------------------------------
+  // (x*y) +/- (x*z)        x        y         z
+  // (y/x) +/- (z/x)        x        y         z
+  //
+  Value *Factor = 0;
+  Value *AddSub0 = 0, *AddSub1 = 0;
+  
+  if (isMpy) {
+    if (Opnd0_0 == Opnd1_0 || Opnd0_0 == Opnd1_1)
+      Factor = Opnd0_0;
+    else if (Opnd0_1 == Opnd1_0 || Opnd0_1 == Opnd1_1)
+      Factor = Opnd0_1;
+
+    if (Factor) {
+      AddSub0 = (Factor == Opnd0_0) ? Opnd0_1 : Opnd0_0;
+      AddSub1 = (Factor == Opnd1_0) ? Opnd1_1 : Opnd1_0;
+    }
+  } else if (Opnd0_1 == Opnd1_1) {
+    Factor = Opnd0_1;
+    AddSub0 = Opnd0_0;
+    AddSub1 = Opnd1_0;
+  }
+
+  if (!Factor)
+    return 0;
+
+  // Create expression "NewAddSub = AddSub0 +/- AddsSub1"
+  Value *NewAddSub = (I->getOpcode() == Instruction::FAdd) ?
+                      createFAdd(AddSub0, AddSub1) :
+                      createFSub(AddSub0, AddSub1);
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(NewAddSub)) {
+    const APFloat &F = CFP->getValueAPF();
+    if (!F.isNormal() || F.isDenormal())
+      return 0;
+  }
+
+  if (isMpy)
+    return createFMul(Factor, NewAddSub);
+ 
+  return createFDiv(NewAddSub, Factor);
+}
+
+Value *FAddCombine::simplify(Instruction *I) {
+  assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode");
+
+  // Currently we are not able to handle vector type.
+  if (I->getType()->isVectorTy())
+    return 0;
+
+  assert((I->getOpcode() == Instruction::FAdd ||
+          I->getOpcode() == Instruction::FSub) && "Expect add/sub");
+
+  // Save the instruction before calling other member-functions. 
+  Instr = I;
+
+  FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
+
+  unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1);
+
+  // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
+  unsigned Opnd0_ExpNum = 0;
+  unsigned Opnd1_ExpNum = 0;
+
+  if (!Opnd0.isConstant()) 
+    Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
+
+  // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
+  if (OpndNum == 2 && !Opnd1.isConstant())
+    Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1);
+
+  // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
+  if (Opnd0_ExpNum && Opnd1_ExpNum) {
+    AddendVect AllOpnds;
+    AllOpnds.push_back(&Opnd0_0);
+    AllOpnds.push_back(&Opnd1_0);
+    if (Opnd0_ExpNum == 2)
+      AllOpnds.push_back(&Opnd0_1);
+    if (Opnd1_ExpNum == 2)
+      AllOpnds.push_back(&Opnd1_1);
+
+    // Compute instruction quota. We should save at least one instruction.
+    unsigned InstQuota = 0;
+
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+    InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&  
+                 (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
+
+    if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
+      return R;
+  }
+
+  if (OpndNum != 2) {
+    // The input instruction is : "I=0.0 +/- V". If the "V" were able to be
+    // splitted into two addends, say "V = X - Y", the instruction would have
+    // been optimized into "I = Y - X" in the previous steps.
+    //
+    const FAddendCoef &CE = Opnd0.getCoef();
+    return CE.isOne() ? Opnd0.getSymVal() : 0;
+  }
+
+  // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
+  if (Opnd1_ExpNum) {
+    AddendVect AllOpnds;
+    AllOpnds.push_back(&Opnd0);
+    AllOpnds.push_back(&Opnd1_0);
+    if (Opnd1_ExpNum == 2)
+      AllOpnds.push_back(&Opnd1_1);
+
+    if (Value *R = simplifyFAdd(AllOpnds, 1))
+      return R;
+  }
+
+  // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
+  if (Opnd0_ExpNum) {
+    AddendVect AllOpnds;
+    AllOpnds.push_back(&Opnd1);
+    AllOpnds.push_back(&Opnd0_0);
+    if (Opnd0_ExpNum == 2)
+      AllOpnds.push_back(&Opnd0_1);
+
+    if (Value *R = simplifyFAdd(AllOpnds, 1))
+      return R;
+  }
+
+  // step 6: Try factorization as the last resort, 
+  return performFactorization(I);
+}
+
+Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
+
+  unsigned AddendNum = Addends.size();
+  assert(AddendNum <= 4 && "Too many addends");
+
+  // For saving intermediate results; 
+  unsigned NextTmpIdx = 0;
+  FAddend TmpResult[3];
+
+  // Points to the constant addend of the resulting simplified expression.
+  // If the resulting expr has constant-addend, this constant-addend is
+  // desirable to reside at the top of the resulting expression tree. Placing
+  // constant close to supper-expr(s) will potentially reveal some optimization
+  // opportunities in super-expr(s).
+  //
+  const FAddend *ConstAdd = 0;
+
+  // Simplified addends are placed <SimpVect>.
+  AddendVect SimpVect;
+
+  // The outer loop works on one symbolic-value at a time. Suppose the input
+  // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ... 
+  // The symbolic-values will be processed in this order: x, y, z.
+  //
+  for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
+
+    const FAddend *ThisAddend = Addends[SymIdx];
+    if (!ThisAddend) {
+      // This addend was processed before.
+      continue;
+    }
+
+    Value *Val = ThisAddend->getSymVal();
+    unsigned StartIdx = SimpVect.size();
+    SimpVect.push_back(ThisAddend);
+
+    // The inner loop collects addends sharing same symbolic-value, and these
+    // addends will be later on folded into a single addend. Following above
+    // example, if the symbolic value "y" is being processed, the inner loop
+    // will collect two addends "<b1,y>" and "<b2,Y>". These two addends will
+    // be later on folded into "<b1+b2, y>".
+    //
+    for (unsigned SameSymIdx = SymIdx + 1;
+         SameSymIdx < AddendNum; SameSymIdx++) {
+      const FAddend *T = Addends[SameSymIdx];
+      if (T && T->getSymVal() == Val) {
+        // Set null such that next iteration of the outer loop will not process
+        // this addend again.
+        Addends[SameSymIdx] = 0; 
+        SimpVect.push_back(T);
+      }
+    }
+
+    // If multiple addends share same symbolic value, fold them together.
+    if (StartIdx + 1 != SimpVect.size()) {
+      FAddend &R = TmpResult[NextTmpIdx ++];
+      R = *SimpVect[StartIdx];
+      for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++)
+        R += *SimpVect[Idx];
+
+      // Pop all addends being folded and push the resulting folded addend.
+      SimpVect.resize(StartIdx); 
+      if (Val != 0) {
+        if (!R.isZero()) {
+          SimpVect.push_back(&R);
+        }
+      } else {
+        // Don't push constant addend at this time. It will be the last element
+        // of <SimpVect>.
+        ConstAdd = &R;
+      }
+    }
+  }
+
+  assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) && 
+         "out-of-bound access");
+
+  if (ConstAdd)
+    SimpVect.push_back(ConstAdd);
+
+  Value *Result;
+  if (!SimpVect.empty())
+    Result = createNaryFAdd(SimpVect, InstrQuota);
+  else {
+    // The addition is folded to 0.0.
+    Result = ConstantFP::get(Instr->getType(), 0.0);
+  }
+
+  return Result;
+}
+
+Value *FAddCombine::createNaryFAdd
+  (const AddendVect &Opnds, unsigned InstrQuota) {
+  assert(!Opnds.empty() && "Expect at least one addend");
+
+  // Step 1: Check if the # of instructions needed exceeds the quota.
+  // 
+  unsigned InstrNeeded = calcInstrNumber(Opnds);
+  if (InstrNeeded > InstrQuota)
+    return 0;
+
+  initCreateInstNum();
+
+  // step 2: Emit the N-ary addition.
+  // Note that at most three instructions are involved in Fadd-InstCombine: the
+  // addition in question, and at most two neighboring instructions.
+  // The resulting optimized addition should have at least one less instruction
+  // than the original addition expression tree. This implies that the resulting
+  // N-ary addition has at most two instructions, and we don't need to worry
+  // about tree-height when constructing the N-ary addition.
+
+  Value *LastVal = 0;
+  bool LastValNeedNeg = false;
+
+  // Iterate the addends, creating fadd/fsub using adjacent two addends.
+  for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+       I != E; I++) {
+    bool NeedNeg; 
+    Value *V = createAddendVal(**I, NeedNeg);
+    if (!LastVal) {
+      LastVal = V;
+      LastValNeedNeg = NeedNeg;
+      continue;
+    }
+
+    if (LastValNeedNeg == NeedNeg) {
+      LastVal = createFAdd(LastVal, V);
+      continue;
+    }
+
+    if (LastValNeedNeg)
+      LastVal = createFSub(V, LastVal);
+    else
+      LastVal = createFSub(LastVal, V);
+
+    LastValNeedNeg = false;
+  }
+
+  if (LastValNeedNeg) {
+    LastVal = createFNeg(LastVal);
+  }
+
+  #ifndef NDEBUG
+    assert(CreateInstrNum == InstrNeeded && 
+           "Inconsistent in instruction numbers");
+  #endif
+
+  return LastVal;
+}
+
+Value *FAddCombine::createFSub
+  (Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFSub(Opnd0, Opnd1);
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    createInstPostProc(I);
+  return V;
+}
+
+Value *FAddCombine::createFNeg(Value *V) {
+  Value *Zero = cast<Value>(ConstantFP::get(V->getType(), 0.0));
+  return createFSub(Zero, V);
+}
+
+Value *FAddCombine::createFAdd
+  (Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFAdd(Opnd0, Opnd1);
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    createInstPostProc(I);
+  return V;
+}
+
+Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFMul(Opnd0, Opnd1);
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    createInstPostProc(I);
+  return V;
+}
+
+Value *FAddCombine::createFDiv(Value *Opnd0, Value *Opnd1) {
+  Value *V = Builder->CreateFDiv(Opnd0, Opnd1);
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    createInstPostProc(I);
+  return V;
+}
+
+void FAddCombine::createInstPostProc(Instruction *NewInstr) {
+  NewInstr->setDebugLoc(Instr->getDebugLoc());
+
+  // Keep track of the number of instruction created.
+  incCreateInstNum();
+
+  // Propagate fast-math flags
+  NewInstr->setFastMathFlags(Instr->getFastMathFlags());
+}
+
+// Return the number of instruction needed to emit the N-ary addition.
+// NOTE: Keep this function in sync with createAddendVal().
+unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
+  unsigned OpndNum = Opnds.size();
+  unsigned InstrNeeded = OpndNum - 1;
+
+  // The number of addends in the form of "(-1)*x". 
+  unsigned NegOpndNum = 0; 
+
+  // Adjust the number of instructions needed to emit the N-ary add.
+  for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end();
+       I != E; I++) {
+    const FAddend *Opnd = *I;
+    if (Opnd->isConstant())
+      continue;
+
+    const FAddendCoef &CE = Opnd->getCoef();
+    if (CE.isMinusOne() || CE.isMinusTwo())
+      NegOpndNum++;
+
+    // Let the addend be "c * x". If "c == +/-1", the value of the addend
+    // is immediately available; otherwise, it needs exactly one instruction
+    // to evaluate the value.
+    if (!CE.isMinusOne() && !CE.isOne())
+      InstrNeeded++;
+  }
+  if (NegOpndNum == OpndNum)
+    InstrNeeded++;
+  return InstrNeeded;
+}
+
+// Input Addend        Value           NeedNeg(output)
+// ================================================================
+// Constant C          C               false
+// <+/-1, V>           V               coefficient is -1
+// <2/-2, V>          "fadd V, V"      coefficient is -2
+// <C, V>             "fmul V, C"      false
+//
+// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber.
+Value *FAddCombine::createAddendVal
+  (const FAddend &Opnd, bool &NeedNeg) {
+  const FAddendCoef &Coeff = Opnd.getCoef();
+
+  if (Opnd.isConstant()) {
+    NeedNeg = false;
+    return Coeff.getValue(Instr->getType());
+  }
+
+  Value *OpndVal = Opnd.getSymVal();
+
+  if (Coeff.isMinusOne() || Coeff.isOne()) {
+    NeedNeg = Coeff.isMinusOne();
+    return OpndVal;
+  }
+
+  if (Coeff.isTwo() || Coeff.isMinusTwo()) {
+    NeedNeg = Coeff.isMinusTwo();
+    return createFAdd(OpndVal, OpndVal);
+  }
+
+  NeedNeg = false;
+  return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
+}
+
+/// AddOne - Add one to a ConstantInt.
+static Constant *AddOne(Constant *C) {
+  return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
+}
+
+/// SubOne - Subtract one from a ConstantInt.
+static Constant *SubOne(ConstantInt *C) {
+  return ConstantInt::get(C->getContext(), C->getValue()-1);
+}
+
+
+// dyn_castFoldableMul - If this value is a multiply that can be folded into
+// other computations (because it has a constant operand), return the
+// non-constant operand of the multiply, and set CST to point to the multiplier.
+// Otherwise, return null.
+//
+static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) {
+  if (!V->hasOneUse() || !V->getType()->isIntegerTy())
+    return 0;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return 0;
+
+  if (I->getOpcode() == Instruction::Mul)
+    if ((CST = dyn_cast<ConstantInt>(I->getOperand(1))))
+      return I->getOperand(0);
+  if (I->getOpcode() == Instruction::Shl)
+    if ((CST = dyn_cast<ConstantInt>(I->getOperand(1)))) {
+      // The multiplier is really 1 << CST.
+      uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+      uint32_t CSTVal = CST->getLimitedValue(BitWidth);
+      CST = ConstantInt::get(V->getType()->getContext(),
+                             APInt(BitWidth, 1).shl(CSTVal));
+      return I->getOperand(0);
+    }
+  return 0;
+}
+
+
+/// WillNotOverflowSignedAdd - Return true if we can prove that:
+///    (sext (add LHS, RHS))  === (add (sext LHS), (sext RHS))
+/// This basically requires proving that the add in the original type would not
+/// overflow to change the sign bit or have a carry out.
+bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) {
+  // There are different heuristics we can use for this.  Here are some simple
+  // ones.
+
+  // Add has the property that adding any two 2's complement numbers can only
+  // have one carry bit which can change a sign.  As such, if LHS and RHS each
+  // have at least two sign bits, we know that the addition of the two values
+  // will sign extend fine.
+  if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1)
+    return true;
+
+
+  // If one of the operands only has one non-zero bit, and if the other operand
+  // has a known-zero bit in a more significant place than it (not including the
+  // sign bit) the ripple may go up to and fill the zero, but won't change the
+  // sign.  For example, (X & ~4) + 1.
+
+  // TODO: Implement.
+
+  return false;
+}
+
+Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
+  bool Changed = SimplifyAssociativeOrCommutative(I);
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+  if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
+                                 I.hasNoUnsignedWrap(), TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // (A*B)+(A*C) -> A*(B+C) etc
+  if (Value *V = SimplifyUsingDistributiveLaws(I))
+    return ReplaceInstUsesWith(I, V);
+
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+    // X + (signbit) --> X ^ signbit
+    const APInt &Val = CI->getValue();
+    if (Val.isSignBit())
+      return BinaryOperator::CreateXor(LHS, RHS);
+
+    // See if SimplifyDemandedBits can simplify this.  This handles stuff like
+    // (X & 254)+1 -> (X&254)|1
+    if (SimplifyDemandedInstructionBits(I))
+      return &I;
+
+    // zext(bool) + C -> bool ? C + 1 : C
+    if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
+      if (ZI->getSrcTy()->isIntegerTy(1))
+        return SelectInst::Create(ZI->getOperand(0), AddOne(CI), CI);
+
+    Value *XorLHS = 0; ConstantInt *XorRHS = 0;
+    if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) {
+      uint32_t TySizeBits = I.getType()->getScalarSizeInBits();
+      const APInt &RHSVal = CI->getValue();
+      unsigned ExtendAmt = 0;
+      // If we have ADD(XOR(AND(X, 0xFF), 0x80), 0xF..F80), it's a sext.
+      // If we have ADD(XOR(AND(X, 0xFF), 0xF..F80), 0x80), it's a sext.
+      if (XorRHS->getValue() == -RHSVal) {
+        if (RHSVal.isPowerOf2())
+          ExtendAmt = TySizeBits - RHSVal.logBase2() - 1;
+        else if (XorRHS->getValue().isPowerOf2())
+          ExtendAmt = TySizeBits - XorRHS->getValue().logBase2() - 1;
+      }
+
+      if (ExtendAmt) {
+        APInt Mask = APInt::getHighBitsSet(TySizeBits, ExtendAmt);
+        if (!MaskedValueIsZero(XorLHS, Mask))
+          ExtendAmt = 0;
+      }
+
+      if (ExtendAmt) {
+        Constant *ShAmt = ConstantInt::get(I.getType(), ExtendAmt);
+        Value *NewShl = Builder->CreateShl(XorLHS, ShAmt, "sext");
+        return BinaryOperator::CreateAShr(NewShl, ShAmt);
+      }
+
+      // If this is a xor that was canonicalized from a sub, turn it back into
+      // a sub and fuse this add with it.
+      if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) {
+        IntegerType *IT = cast<IntegerType>(I.getType());
+        APInt LHSKnownOne(IT->getBitWidth(), 0);
+        APInt LHSKnownZero(IT->getBitWidth(), 0);
+        ComputeMaskedBits(XorLHS, LHSKnownZero, LHSKnownOne);
+        if ((XorRHS->getValue() | LHSKnownZero).isAllOnesValue())
+          return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI),
+                                           XorLHS);
+      }
+    }
+  }
+
+  if (isa<Constant>(RHS) && isa<PHINode>(LHS))
+    if (Instruction *NV = FoldOpIntoPhi(I))
+      return NV;
+
+  if (I.getType()->isIntegerTy(1))
+    return BinaryOperator::CreateXor(LHS, RHS);
+
+  // X + X --> X << 1
+  if (LHS == RHS) {
+    BinaryOperator *New =
+      BinaryOperator::CreateShl(LHS, ConstantInt::get(I.getType(), 1));
+    New->setHasNoSignedWrap(I.hasNoSignedWrap());
+    New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+    return New;
+  }
+
+  // -A + B  -->  B - A
+  // -A + -B  -->  -(A + B)
+  if (Value *LHSV = dyn_castNegVal(LHS)) {
+    if (!isa<Constant>(RHS))
+      if (Value *RHSV = dyn_castNegVal(RHS)) {
+        Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum");
+        return BinaryOperator::CreateNeg(NewAdd);
+      }
+
+    return BinaryOperator::CreateSub(RHS, LHSV);
+  }
+
+  // A + -B  -->  A - B
+  if (!isa<Constant>(RHS))
+    if (Value *V = dyn_castNegVal(RHS))
+      return BinaryOperator::CreateSub(LHS, V);
+
+
+  ConstantInt *C2;
+  if (Value *X = dyn_castFoldableMul(LHS, C2)) {
+    if (X == RHS)   // X*C + X --> X * (C+1)
+      return BinaryOperator::CreateMul(RHS, AddOne(C2));
+
+    // X*C1 + X*C2 --> X * (C1+C2)
+    ConstantInt *C1;
+    if (X == dyn_castFoldableMul(RHS, C1))
+      return BinaryOperator::CreateMul(X, ConstantExpr::getAdd(C1, C2));
+  }
+
+  // X + X*C --> X * (C+1)
+  if (dyn_castFoldableMul(RHS, C2) == LHS)
+    return BinaryOperator::CreateMul(LHS, AddOne(C2));
+
+  // A+B --> A|B iff A and B have no bits set in common.
+  if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
+    APInt LHSKnownOne(IT->getBitWidth(), 0);
+    APInt LHSKnownZero(IT->getBitWidth(), 0);
+    ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne);
+    if (LHSKnownZero != 0) {
+      APInt RHSKnownOne(IT->getBitWidth(), 0);
+      APInt RHSKnownZero(IT->getBitWidth(), 0);
+      ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne);
+
+      // No bits in common -> bitwise or.
+      if ((LHSKnownZero|RHSKnownZero).isAllOnesValue())
+        return BinaryOperator::CreateOr(LHS, RHS);
+    }
+  }
+
+  // W*X + Y*Z --> W * (X+Z)  iff W == Y
+  {
+    Value *W, *X, *Y, *Z;
+    if (match(LHS, m_Mul(m_Value(W), m_Value(X))) &&
+        match(RHS, m_Mul(m_Value(Y), m_Value(Z)))) {
+      if (W != Y) {
+        if (W == Z) {
+          std::swap(Y, Z);
+        } else if (Y == X) {
+          std::swap(W, X);
+        } else if (X == Z) {
+          std::swap(Y, Z);
+          std::swap(W, X);
+        }
+      }
+
+      if (W == Y) {
+        Value *NewAdd = Builder->CreateAdd(X, Z, LHS->getName());
+        return BinaryOperator::CreateMul(W, NewAdd);
+      }
+    }
+  }
+
+  if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) {
+    Value *X = 0;
+    if (match(LHS, m_Not(m_Value(X))))    // ~X + C --> (C-1) - X
+      return BinaryOperator::CreateSub(SubOne(CRHS), X);
+
+    // (X & FF00) + xx00  -> (X+xx00) & FF00
+    if (LHS->hasOneUse() &&
+        match(LHS, m_And(m_Value(X), m_ConstantInt(C2))) &&
+        CRHS->getValue() == (CRHS->getValue() & C2->getValue())) {
+      // See if all bits from the first bit set in the Add RHS up are included
+      // in the mask.  First, get the rightmost bit.
+      const APInt &AddRHSV = CRHS->getValue();
+
+      // Form a mask of all bits from the lowest bit added through the top.
+      APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1));
+
+      // See if the and mask includes all of these bits.
+      APInt AddRHSHighBitsAnd(AddRHSHighBits & C2->getValue());
+
+      if (AddRHSHighBits == AddRHSHighBitsAnd) {
+        // Okay, the xform is safe.  Insert the new add pronto.
+        Value *NewAdd = Builder->CreateAdd(X, CRHS, LHS->getName());
+        return BinaryOperator::CreateAnd(NewAdd, C2);
+      }
+    }
+
+    // Try to fold constant add into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+  }
+
+  // add (select X 0 (sub n A)) A  -->  select X A n
+  {
+    SelectInst *SI = dyn_cast<SelectInst>(LHS);
+    Value *A = RHS;
+    if (!SI) {
+      SI = dyn_cast<SelectInst>(RHS);
+      A = LHS;
+    }
+    if (SI && SI->hasOneUse()) {
+      Value *TV = SI->getTrueValue();
+      Value *FV = SI->getFalseValue();
+      Value *N;
+
+      // Can we fold the add into the argument of the select?
+      // We check both true and false select arguments for a matching subtract.
+      if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
+        // Fold the add into the true select value.
+        return SelectInst::Create(SI->getCondition(), N, A);
+
+      if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
+        // Fold the add into the false select value.
+        return SelectInst::Create(SI->getCondition(), A, N);
+    }
+  }
+
+  // Check for (add (sext x), y), see if we can merge this into an
+  // integer add followed by a sext.
+  if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) {
+    // (add (sext x), cst) --> (sext (add x, cst'))
+    if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) {
+      Constant *CI =
+        ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
+      if (LHSConv->hasOneUse() &&
+          ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+        // Insert the new, smaller add.
+        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+                                              CI, "addconv");
+        return new SExtInst(NewAdd, I.getType());
+      }
+    }
+
+    // (add (sext x), (sext y)) --> (sext (add int x, y))
+    if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
+      // Only do this if x/y have the same type, if at last one of them has a
+      // single use (so we don't increase the number of sexts), and if the
+      // integer add will not overflow.
+      if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+          (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+                                   RHSConv->getOperand(0))) {
+        // Insert the new integer add.
+        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+                                             RHSConv->getOperand(0), "addconv");
+        return new SExtInst(NewAdd, I.getType());
+      }
+    }
+  }
+
+  // Check for (x & y) + (x ^ y)
+  {
+    Value *A = 0, *B = 0;
+    if (match(RHS, m_Xor(m_Value(A), m_Value(B))) &&
+        (match(LHS, m_And(m_Specific(A), m_Specific(B))) ||
+         match(LHS, m_And(m_Specific(B), m_Specific(A)))))
+      return BinaryOperator::CreateOr(A, B);
+
+    if (match(LHS, m_Xor(m_Value(A), m_Value(B))) &&
+        (match(RHS, m_And(m_Specific(A), m_Specific(B))) ||
+         match(RHS, m_And(m_Specific(B), m_Specific(A)))))
+      return BinaryOperator::CreateOr(A, B);
+  }
+
+  return Changed ? &I : 0;
+}
+
+Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
+  bool Changed = SimplifyAssociativeOrCommutative(I);
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+  if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), TD))
+    return ReplaceInstUsesWith(I, V);
+
+  if (isa<Constant>(RHS) && isa<PHINode>(LHS))
+    if (Instruction *NV = FoldOpIntoPhi(I))
+      return NV;
+
+  // -A + B  -->  B - A
+  // -A + -B  -->  -(A + B)
+  if (Value *LHSV = dyn_castFNegVal(LHS))
+    return BinaryOperator::CreateFSub(RHS, LHSV);
+
+  // A + -B  -->  A - B
+  if (!isa<Constant>(RHS))
+    if (Value *V = dyn_castFNegVal(RHS))
+      return BinaryOperator::CreateFSub(LHS, V);
+
+  // Check for (fadd double (sitofp x), y), see if we can merge this into an
+  // integer add followed by a promotion.
+  if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+    // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
+    // ... if the constant fits in the integer value.  This is useful for things
+    // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
+    // requires a constant pool load, and generally allows the add to be better
+    // instcombined.
+    if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
+      Constant *CI =
+      ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
+      if (LHSConv->hasOneUse() &&
+          ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) {
+        // Insert the new integer add.
+        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+                                              CI, "addconv");
+        return new SIToFPInst(NewAdd, I.getType());
+      }
+    }
+
+    // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
+    if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
+      // Only do this if x/y have the same type, if at last one of them has a
+      // single use (so we don't increase the number of int->fp conversions),
+      // and if the integer add will not overflow.
+      if (LHSConv->getOperand(0)->getType()==RHSConv->getOperand(0)->getType()&&
+          (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+          WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+                                   RHSConv->getOperand(0))) {
+        // Insert the new integer add.
+        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+                                              RHSConv->getOperand(0),"addconv");
+        return new SIToFPInst(NewAdd, I.getType());
+      }
+    }
+  }
+
+  if (I.hasUnsafeAlgebra()) {
+    if (Value *V = FAddCombine(Builder).simplify(&I))
+      return ReplaceInstUsesWith(I, V);
+  }
+
+  return Changed ? &I : 0;
+}
+
+
+/// Optimize pointer differences into the same array into a size.  Consider:
+///  &A[10] - &A[0]: we should compile this to "10".  LHS/RHS are the pointer
+/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
+///
+Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
+                                               Type *Ty) {
+  assert(TD && "Must have target data info for this");
+
+  // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
+  // this.
+  bool Swapped = false;
+  GEPOperator *GEP1 = 0, *GEP2 = 0;
+
+  // For now we require one side to be the base pointer "A" or a constant
+  // GEP derived from it.
+  if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) {
+    // (gep X, ...) - X
+    if (LHSGEP->getOperand(0) == RHS) {
+      GEP1 = LHSGEP;
+      Swapped = false;
+    } else if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
+      // (gep X, ...) - (gep X, ...)
+      if (LHSGEP->getOperand(0)->stripPointerCasts() ==
+            RHSGEP->getOperand(0)->stripPointerCasts()) {
+        GEP2 = RHSGEP;
+        GEP1 = LHSGEP;
+        Swapped = false;
+      }
+    }
+  }
+
+  if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
+    // X - (gep X, ...)
+    if (RHSGEP->getOperand(0) == LHS) {
+      GEP1 = RHSGEP;
+      Swapped = true;
+    } else if (GEPOperator *LHSGEP = dyn_cast<GEPOperator>(LHS)) {
+      // (gep X, ...) - (gep X, ...)
+      if (RHSGEP->getOperand(0)->stripPointerCasts() ==
+            LHSGEP->getOperand(0)->stripPointerCasts()) {
+        GEP2 = LHSGEP;
+        GEP1 = RHSGEP;
+        Swapped = true;
+      }
+    }
+  }
+
+  // Avoid duplicating the arithmetic if GEP2 has non-constant indices and
+  // multiple users.
+  if (GEP1 == 0 ||
+      (GEP2 != 0 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse()))
+    return 0;
+
+  // Emit the offset of the GEP and an intptr_t.
+  Value *Result = EmitGEPOffset(GEP1);
+
+  // If we had a constant expression GEP on the other side offsetting the
+  // pointer, subtract it from the offset we have.
+  if (GEP2) {
+    Value *Offset = EmitGEPOffset(GEP2);
+    Result = Builder->CreateSub(Result, Offset);
+  }
+
+  // If we have p - gep(p, ...)  then we have to negate the result.
+  if (Swapped)
+    Result = Builder->CreateNeg(Result, "diff.neg");
+
+  return Builder->CreateIntCast(Result, Ty, true);
+}
+
+
+Instruction *InstCombiner::visitSub(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(),
+                                 I.hasNoUnsignedWrap(), TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // (A*B)-(A*C) -> A*(B-C) etc
+  if (Value *V = SimplifyUsingDistributiveLaws(I))
+    return ReplaceInstUsesWith(I, V);
+
+  // If this is a 'B = x-(-A)', change to B = x+A.  This preserves NSW/NUW.
+  if (Value *V = dyn_castNegVal(Op1)) {
+    BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
+    Res->setHasNoSignedWrap(I.hasNoSignedWrap());
+    Res->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+    return Res;
+  }
+
+  if (I.getType()->isIntegerTy(1))
+    return BinaryOperator::CreateXor(Op0, Op1);
+
+  // Replace (-1 - A) with (~A).
+  if (match(Op0, m_AllOnes()))
+    return BinaryOperator::CreateNot(Op1);
+
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) {
+    // C - ~X == X + (1+C)
+    Value *X = 0;
+    if (match(Op1, m_Not(m_Value(X))))
+      return BinaryOperator::CreateAdd(X, AddOne(C));
+
+    // -(X >>u 31) -> (X >>s 31)
+    // -(X >>s 31) -> (X >>u 31)
+    if (C->isZero()) {
+      Value *X; ConstantInt *CI;
+      if (match(Op1, m_LShr(m_Value(X), m_ConstantInt(CI))) &&
+          // Verify we are shifting out everything but the sign bit.
+          CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1)
+        return BinaryOperator::CreateAShr(X, CI);
+
+      if (match(Op1, m_AShr(m_Value(X), m_ConstantInt(CI))) &&
+          // Verify we are shifting out everything but the sign bit.
+          CI->getValue() == I.getType()->getPrimitiveSizeInBits()-1)
+        return BinaryOperator::CreateLShr(X, CI);
+    }
+
+    // Try to fold constant sub into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
+    // C-(X+C2) --> (C-C2)-X
+    ConstantInt *C2;
+    if (match(Op1, m_Add(m_Value(X), m_ConstantInt(C2))))
+      return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
+
+    if (SimplifyDemandedInstructionBits(I))
+      return &I;
+
+    // Fold (sub 0, (zext bool to B)) --> (sext bool to B)
+    if (C->isZero() && match(Op1, m_ZExt(m_Value(X))))
+      if (X->getType()->isIntegerTy(1))
+        return CastInst::CreateSExtOrBitCast(X, Op1->getType());
+
+    // Fold (sub 0, (sext bool to B)) --> (zext bool to B)
+    if (C->isZero() && match(Op1, m_SExt(m_Value(X))))
+      if (X->getType()->isIntegerTy(1))
+        return CastInst::CreateZExtOrBitCast(X, Op1->getType());
+  }
+
+
+  { Value *Y;
+    // X-(X+Y) == -Y    X-(Y+X) == -Y
+    if (match(Op1, m_Add(m_Specific(Op0), m_Value(Y))) ||
+        match(Op1, m_Add(m_Value(Y), m_Specific(Op0))))
+      return BinaryOperator::CreateNeg(Y);
+
+    // (X-Y)-X == -Y
+    if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y))))
+      return BinaryOperator::CreateNeg(Y);
+  }
+
+  if (Op1->hasOneUse()) {
+    Value *X = 0, *Y = 0, *Z = 0;
+    Constant *C = 0;
+    ConstantInt *CI = 0;
+
+    // (X - (Y - Z))  -->  (X + (Z - Y)).
+    if (match(Op1, m_Sub(m_Value(Y), m_Value(Z))))
+      return BinaryOperator::CreateAdd(Op0,
+                                      Builder->CreateSub(Z, Y, Op1->getName()));
+
+    // (X - (X & Y))   -->   (X & ~Y)
+    //
+    if (match(Op1, m_And(m_Value(Y), m_Specific(Op0))) ||
+        match(Op1, m_And(m_Specific(Op0), m_Value(Y))))
+      return BinaryOperator::CreateAnd(Op0,
+                                  Builder->CreateNot(Y, Y->getName() + ".not"));
+
+    // 0 - (X sdiv C)  -> (X sdiv -C)
+    if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) &&
+        match(Op0, m_Zero()))
+      return BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(C));
+
+    // 0 - (X << Y)  -> (-X << Y)   when X is freely negatable.
+    if (match(Op1, m_Shl(m_Value(X), m_Value(Y))) && match(Op0, m_Zero()))
+      if (Value *XNeg = dyn_castNegVal(X))
+        return BinaryOperator::CreateShl(XNeg, Y);
+
+    // X - X*C --> X * (1-C)
+    if (match(Op1, m_Mul(m_Specific(Op0), m_ConstantInt(CI)))) {
+      Constant *CP1 = ConstantExpr::getSub(ConstantInt::get(I.getType(),1), CI);
+      return BinaryOperator::CreateMul(Op0, CP1);
+    }
+
+    // X - X<<C --> X * (1-(1<<C))
+    if (match(Op1, m_Shl(m_Specific(Op0), m_ConstantInt(CI)))) {
+      Constant *One = ConstantInt::get(I.getType(), 1);
+      C = ConstantExpr::getSub(One, ConstantExpr::getShl(One, CI));
+      return BinaryOperator::CreateMul(Op0, C);
+    }
+
+    // X - A*-B -> X + A*B
+    // X - -A*B -> X + A*B
+    Value *A, *B;
+    if (match(Op1, m_Mul(m_Value(A), m_Neg(m_Value(B)))) ||
+        match(Op1, m_Mul(m_Neg(m_Value(A)), m_Value(B))))
+      return BinaryOperator::CreateAdd(Op0, Builder->CreateMul(A, B));
+
+    // X - A*CI -> X + A*-CI
+    // X - CI*A -> X + A*-CI
+    if (match(Op1, m_Mul(m_Value(A), m_ConstantInt(CI))) ||
+        match(Op1, m_Mul(m_ConstantInt(CI), m_Value(A)))) {
+      Value *NewMul = Builder->CreateMul(A, ConstantExpr::getNeg(CI));
+      return BinaryOperator::CreateAdd(Op0, NewMul);
+    }
+  }
+
+  ConstantInt *C1;
+  if (Value *X = dyn_castFoldableMul(Op0, C1)) {
+    if (X == Op1)  // X*C - X --> X * (C-1)
+      return BinaryOperator::CreateMul(Op1, SubOne(C1));
+
+    ConstantInt *C2;   // X*C1 - X*C2 -> X * (C1-C2)
+    if (X == dyn_castFoldableMul(Op1, C2))
+      return BinaryOperator::CreateMul(X, ConstantExpr::getSub(C1, C2));
+  }
+
+  // Optimize pointer differences into the same array into a size.  Consider:
+  //  &A[10] - &A[0]: we should compile this to "10".
+  if (TD) {
+    Value *LHSOp, *RHSOp;
+    if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
+        match(Op1, m_PtrToInt(m_Value(RHSOp))))
+      if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
+        return ReplaceInstUsesWith(I, Res);
+
+    // trunc(p)-trunc(q) -> trunc(p-q)
+    if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
+        match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
+      if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
+        return ReplaceInstUsesWith(I, Res);
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // If this is a 'B = x-(-A)', change to B = x+A...
+  if (Value *V = dyn_castFNegVal(Op1))
+    return BinaryOperator::CreateFAdd(Op0, V);
+
+  if (I.hasUnsafeAlgebra()) {
+    if (Value *V = FAddCombine(Builder).simplify(&I))
+      return ReplaceInstUsesWith(I, V);
+  }
+
+  return 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
new file mode 100644
index 000000000000..990cbc3d594e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -0,0 +1,2355 @@
+//===- InstCombineAndOrXor.cpp --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitAnd, visitOr, and visitXor functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+
+/// AddOne - Add one to a ConstantInt.
+static Constant *AddOne(ConstantInt *C) {
+  return ConstantInt::get(C->getContext(), C->getValue() + 1);
+}
+/// SubOne - Subtract one from a ConstantInt.
+static Constant *SubOne(ConstantInt *C) {
+  return ConstantInt::get(C->getContext(), C->getValue()-1);
+}
+
+/// isFreeToInvert - Return true if the specified value is free to invert (apply
+/// ~ to).  This happens in cases where the ~ can be eliminated.
+static inline bool isFreeToInvert(Value *V) {
+  // ~(~(X)) -> X.
+  if (BinaryOperator::isNot(V))
+    return true;
+
+  // Constants can be considered to be not'ed values.
+  if (isa<ConstantInt>(V))
+    return true;
+
+  // Compares can be inverted if they have a single use.
+  if (CmpInst *CI = dyn_cast<CmpInst>(V))
+    return CI->hasOneUse();
+
+  return false;
+}
+
+static inline Value *dyn_castNotVal(Value *V) {
+  // If this is not(not(x)) don't return that this is a not: we want the two
+  // not's to be folded first.
+  if (BinaryOperator::isNot(V)) {
+    Value *Operand = BinaryOperator::getNotArgument(V);
+    if (!isFreeToInvert(Operand))
+      return Operand;
+  }
+
+  // Constants can be considered to be not'ed values...
+  if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+    return ConstantInt::get(C->getType(), ~C->getValue());
+  return 0;
+}
+
+/// getFCmpCode - Similar to getICmpCode but for FCmpInst. This encodes a fcmp
+/// predicate into a three bit mask. It also returns whether it is an ordered
+/// predicate by reference.
+static unsigned getFCmpCode(FCmpInst::Predicate CC, bool &isOrdered) {
+  isOrdered = false;
+  switch (CC) {
+  case FCmpInst::FCMP_ORD: isOrdered = true; return 0;  // 000
+  case FCmpInst::FCMP_UNO:                   return 0;  // 000
+  case FCmpInst::FCMP_OGT: isOrdered = true; return 1;  // 001
+  case FCmpInst::FCMP_UGT:                   return 1;  // 001
+  case FCmpInst::FCMP_OEQ: isOrdered = true; return 2;  // 010
+  case FCmpInst::FCMP_UEQ:                   return 2;  // 010
+  case FCmpInst::FCMP_OGE: isOrdered = true; return 3;  // 011
+  case FCmpInst::FCMP_UGE:                   return 3;  // 011
+  case FCmpInst::FCMP_OLT: isOrdered = true; return 4;  // 100
+  case FCmpInst::FCMP_ULT:                   return 4;  // 100
+  case FCmpInst::FCMP_ONE: isOrdered = true; return 5;  // 101
+  case FCmpInst::FCMP_UNE:                   return 5;  // 101
+  case FCmpInst::FCMP_OLE: isOrdered = true; return 6;  // 110
+  case FCmpInst::FCMP_ULE:                   return 6;  // 110
+    // True -> 7
+  default:
+    // Not expecting FCMP_FALSE and FCMP_TRUE;
+    llvm_unreachable("Unexpected FCmp predicate!");
+  }
+}
+
+/// getNewICmpValue - This is the complement of getICmpCode, which turns an
+/// opcode and two operands into either a constant true or false, or a brand
+/// new ICmp instruction. The sign is passed in to determine which kind
+/// of predicate to use in the new icmp instruction.
+static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
+                              InstCombiner::BuilderTy *Builder) {
+  ICmpInst::Predicate NewPred;
+  if (Value *NewConstant = getICmpValue(Sign, Code, LHS, RHS, NewPred))
+    return NewConstant;
+  return Builder->CreateICmp(NewPred, LHS, RHS);
+}
+
+/// getFCmpValue - This is the complement of getFCmpCode, which turns an
+/// opcode and two operands into either a FCmp instruction. isordered is passed
+/// in to determine which kind of predicate to use in the new fcmp instruction.
+static Value *getFCmpValue(bool isordered, unsigned code,
+                           Value *LHS, Value *RHS,
+                           InstCombiner::BuilderTy *Builder) {
+  CmpInst::Predicate Pred;
+  switch (code) {
+  default: llvm_unreachable("Illegal FCmp code!");
+  case 0: Pred = isordered ? FCmpInst::FCMP_ORD : FCmpInst::FCMP_UNO; break;
+  case 1: Pred = isordered ? FCmpInst::FCMP_OGT : FCmpInst::FCMP_UGT; break;
+  case 2: Pred = isordered ? FCmpInst::FCMP_OEQ : FCmpInst::FCMP_UEQ; break;
+  case 3: Pred = isordered ? FCmpInst::FCMP_OGE : FCmpInst::FCMP_UGE; break;
+  case 4: Pred = isordered ? FCmpInst::FCMP_OLT : FCmpInst::FCMP_ULT; break;
+  case 5: Pred = isordered ? FCmpInst::FCMP_ONE : FCmpInst::FCMP_UNE; break;
+  case 6: Pred = isordered ? FCmpInst::FCMP_OLE : FCmpInst::FCMP_ULE; break;
+  case 7:
+    if (!isordered) return ConstantInt::getTrue(LHS->getContext());
+    Pred = FCmpInst::FCMP_ORD; break;
+  }
+  return Builder->CreateFCmp(Pred, LHS, RHS);
+}
+
+// OptAndOp - This handles expressions of the form ((val OP C1) & C2).  Where
+// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.  Op is
+// guaranteed to be a binary operator.
+Instruction *InstCombiner::OptAndOp(Instruction *Op,
+                                    ConstantInt *OpRHS,
+                                    ConstantInt *AndRHS,
+                                    BinaryOperator &TheAnd) {
+  Value *X = Op->getOperand(0);
+  Constant *Together = 0;
+  if (!Op->isShift())
+    Together = ConstantExpr::getAnd(AndRHS, OpRHS);
+
+  switch (Op->getOpcode()) {
+  case Instruction::Xor:
+    if (Op->hasOneUse()) {
+      // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
+      Value *And = Builder->CreateAnd(X, AndRHS);
+      And->takeName(Op);
+      return BinaryOperator::CreateXor(And, Together);
+    }
+    break;
+  case Instruction::Or:
+    if (Op->hasOneUse()){
+      if (Together != OpRHS) {
+        // (X | C1) & C2 --> (X | (C1&C2)) & C2
+        Value *Or = Builder->CreateOr(X, Together);
+        Or->takeName(Op);
+        return BinaryOperator::CreateAnd(Or, AndRHS);
+      }
+
+      ConstantInt *TogetherCI = dyn_cast<ConstantInt>(Together);
+      if (TogetherCI && !TogetherCI->isZero()){
+        // (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1
+        // NOTE: This reduces the number of bits set in the & mask, which
+        // can expose opportunities for store narrowing.
+        Together = ConstantExpr::getXor(AndRHS, Together);
+        Value *And = Builder->CreateAnd(X, Together);
+        And->takeName(Op);
+        return BinaryOperator::CreateOr(And, OpRHS);
+      }
+    }
+
+    break;
+  case Instruction::Add:
+    if (Op->hasOneUse()) {
+      // Adding a one to a single bit bit-field should be turned into an XOR
+      // of the bit.  First thing to check is to see if this AND is with a
+      // single bit constant.
+      const APInt &AndRHSV = cast<ConstantInt>(AndRHS)->getValue();
+
+      // If there is only one bit set.
+      if (AndRHSV.isPowerOf2()) {
+        // Ok, at this point, we know that we are masking the result of the
+        // ADD down to exactly one bit.  If the constant we are adding has
+        // no bits set below this bit, then we can eliminate the ADD.
+        const APInt& AddRHS = cast<ConstantInt>(OpRHS)->getValue();
+
+        // Check to see if any bits below the one bit set in AndRHSV are set.
+        if ((AddRHS & (AndRHSV-1)) == 0) {
+          // If not, the only thing that can effect the output of the AND is
+          // the bit specified by AndRHSV.  If that bit is set, the effect of
+          // the XOR is to toggle the bit.  If it is clear, then the ADD has
+          // no effect.
+          if ((AddRHS & AndRHSV) == 0) { // Bit is not set, noop
+            TheAnd.setOperand(0, X);
+            return &TheAnd;
+          } else {
+            // Pull the XOR out of the AND.
+            Value *NewAnd = Builder->CreateAnd(X, AndRHS);
+            NewAnd->takeName(Op);
+            return BinaryOperator::CreateXor(NewAnd, AndRHS);
+          }
+        }
+      }
+    }
+    break;
+
+  case Instruction::Shl: {
+    // We know that the AND will not produce any of the bits shifted in, so if
+    // the anded constant includes them, clear them now!
+    //
+    uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+    uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+    APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal));
+    ConstantInt *CI = ConstantInt::get(AndRHS->getContext(),
+                                       AndRHS->getValue() & ShlMask);
+
+    if (CI->getValue() == ShlMask)
+      // Masking out bits that the shift already masks.
+      return ReplaceInstUsesWith(TheAnd, Op);   // No need for the and.
+
+    if (CI != AndRHS) {                  // Reducing bits set in and.
+      TheAnd.setOperand(1, CI);
+      return &TheAnd;
+    }
+    break;
+  }
+  case Instruction::LShr: {
+    // We know that the AND will not produce any of the bits shifted in, so if
+    // the anded constant includes them, clear them now!  This only applies to
+    // unsigned shifts, because a signed shr may bring in set bits!
+    //
+    uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+    uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+    APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+    ConstantInt *CI = ConstantInt::get(Op->getContext(),
+                                       AndRHS->getValue() & ShrMask);
+
+    if (CI->getValue() == ShrMask)
+      // Masking out bits that the shift already masks.
+      return ReplaceInstUsesWith(TheAnd, Op);
+
+    if (CI != AndRHS) {
+      TheAnd.setOperand(1, CI);  // Reduce bits set in and cst.
+      return &TheAnd;
+    }
+    break;
+  }
+  case Instruction::AShr:
+    // Signed shr.
+    // See if this is shifting in some sign extension, then masking it out
+    // with an and.
+    if (Op->hasOneUse()) {
+      uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+      uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+      APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+      Constant *C = ConstantInt::get(Op->getContext(),
+                                     AndRHS->getValue() & ShrMask);
+      if (C == AndRHS) {          // Masking out bits shifted in.
+        // (Val ashr C1) & C2 -> (Val lshr C1) & C2
+        // Make the argument unsigned.
+        Value *ShVal = Op->getOperand(0);
+        ShVal = Builder->CreateLShr(ShVal, OpRHS, Op->getName());
+        return BinaryOperator::CreateAnd(ShVal, AndRHS, TheAnd.getName());
+      }
+    }
+    break;
+  }
+  return 0;
+}
+
+
+/// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is
+/// true, otherwise (V < Lo || V >= Hi).  In practice, we emit the more efficient
+/// (V-Lo) \<u Hi-Lo.  This method expects that Lo <= Hi. isSigned indicates
+/// whether to treat the V, Lo and HI as signed or not. IB is the location to
+/// insert new instructions.
+Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
+                                     bool isSigned, bool Inside) {
+  assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ?
+            ICmpInst::ICMP_SLE:ICmpInst::ICMP_ULE), Lo, Hi))->getZExtValue() &&
+         "Lo is not <= Hi in range emission code!");
+
+  if (Inside) {
+    if (Lo == Hi)  // Trivially false.
+      return ConstantInt::getFalse(V->getContext());
+
+    // V >= Min && V < Hi --> V < Hi
+    if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
+      ICmpInst::Predicate pred = (isSigned ?
+        ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT);
+      return Builder->CreateICmp(pred, V, Hi);
+    }
+
+    // Emit V-Lo <u Hi-Lo
+    Constant *NegLo = ConstantExpr::getNeg(Lo);
+    Value *Add = Builder->CreateAdd(V, NegLo, V->getName()+".off");
+    Constant *UpperBound = ConstantExpr::getAdd(NegLo, Hi);
+    return Builder->CreateICmpULT(Add, UpperBound);
+  }
+
+  if (Lo == Hi)  // Trivially true.
+    return ConstantInt::getTrue(V->getContext());
+
+  // V < Min || V >= Hi -> V > Hi-1
+  Hi = SubOne(cast<ConstantInt>(Hi));
+  if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
+    ICmpInst::Predicate pred = (isSigned ?
+        ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
+    return Builder->CreateICmp(pred, V, Hi);
+  }
+
+  // Emit V-Lo >u Hi-1-Lo
+  // Note that Hi has already had one subtracted from it, above.
+  ConstantInt *NegLo = cast<ConstantInt>(ConstantExpr::getNeg(Lo));
+  Value *Add = Builder->CreateAdd(V, NegLo, V->getName()+".off");
+  Constant *LowerBound = ConstantExpr::getAdd(NegLo, Hi);
+  return Builder->CreateICmpUGT(Add, LowerBound);
+}
+
+// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with
+// any number of 0s on either side.  The 1s are allowed to wrap from LSB to
+// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.  0x0F0F0000 is
+// not, since all 1s are not contiguous.
+static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
+  const APInt& V = Val->getValue();
+  uint32_t BitWidth = Val->getType()->getBitWidth();
+  if (!APIntOps::isShiftedMask(BitWidth, V)) return false;
+
+  // look for the first zero bit after the run of ones
+  MB = BitWidth - ((V - 1) ^ V).countLeadingZeros();
+  // look for the first non-zero bit
+  ME = V.getActiveBits();
+  return true;
+}
+
+/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask,
+/// where isSub determines whether the operator is a sub.  If we can fold one of
+/// the following xforms:
+///
+/// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask
+/// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
+/// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
+///
+/// return (A +/- B).
+///
+Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS,
+                                        ConstantInt *Mask, bool isSub,
+                                        Instruction &I) {
+  Instruction *LHSI = dyn_cast<Instruction>(LHS);
+  if (!LHSI || LHSI->getNumOperands() != 2 ||
+      !isa<ConstantInt>(LHSI->getOperand(1))) return 0;
+
+  ConstantInt *N = cast<ConstantInt>(LHSI->getOperand(1));
+
+  switch (LHSI->getOpcode()) {
+  default: return 0;
+  case Instruction::And:
+    if (ConstantExpr::getAnd(N, Mask) == Mask) {
+      // If the AndRHS is a power of two minus one (0+1+), this is simple.
+      if ((Mask->getValue().countLeadingZeros() +
+           Mask->getValue().countPopulation()) ==
+          Mask->getValue().getBitWidth())
+        break;
+
+      // Otherwise, if Mask is 0+1+0+, and if B is known to have the low 0+
+      // part, we don't need any explicit masks to take them out of A.  If that
+      // is all N is, ignore it.
+      uint32_t MB = 0, ME = 0;
+      if (isRunOfOnes(Mask, MB, ME)) {  // begin/end bit of run, inclusive
+        uint32_t BitWidth = cast<IntegerType>(RHS->getType())->getBitWidth();
+        APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1));
+        if (MaskedValueIsZero(RHS, Mask))
+          break;
+      }
+    }
+    return 0;
+  case Instruction::Or:
+  case Instruction::Xor:
+    // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0
+    if ((Mask->getValue().countLeadingZeros() +
+         Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()
+        && ConstantExpr::getAnd(N, Mask)->isNullValue())
+      break;
+    return 0;
+  }
+
+  if (isSub)
+    return Builder->CreateSub(LHSI->getOperand(0), RHS, "fold");
+  return Builder->CreateAdd(LHSI->getOperand(0), RHS, "fold");
+}
+
+/// enum for classifying (icmp eq (A & B), C) and (icmp ne (A & B), C)
+/// One of A and B is considered the mask, the other the value. This is
+/// described as the "AMask" or "BMask" part of the enum. If the enum
+/// contains only "Mask", then both A and B can be considered masks.
+/// If A is the mask, then it was proven, that (A & C) == C. This
+/// is trivial if C == A, or C == 0. If both A and C are constants, this
+/// proof is also easy.
+/// For the following explanations we assume that A is the mask.
+/// The part "AllOnes" declares, that the comparison is true only
+/// if (A & B) == A, or all bits of A are set in B.
+///   Example: (icmp eq (A & 3), 3) -> FoldMskICmp_AMask_AllOnes
+/// The part "AllZeroes" declares, that the comparison is true only
+/// if (A & B) == 0, or all bits of A are cleared in B.
+///   Example: (icmp eq (A & 3), 0) -> FoldMskICmp_Mask_AllZeroes
+/// The part "Mixed" declares, that (A & B) == C and C might or might not
+/// contain any number of one bits and zero bits.
+///   Example: (icmp eq (A & 3), 1) -> FoldMskICmp_AMask_Mixed
+/// The Part "Not" means, that in above descriptions "==" should be replaced
+/// by "!=".
+///   Example: (icmp ne (A & 3), 3) -> FoldMskICmp_AMask_NotAllOnes
+/// If the mask A contains a single bit, then the following is equivalent:
+///    (icmp eq (A & B), A) equals (icmp ne (A & B), 0)
+///    (icmp ne (A & B), A) equals (icmp eq (A & B), 0)
+enum MaskedICmpType {
+  FoldMskICmp_AMask_AllOnes           =     1,
+  FoldMskICmp_AMask_NotAllOnes        =     2,
+  FoldMskICmp_BMask_AllOnes           =     4,
+  FoldMskICmp_BMask_NotAllOnes        =     8,
+  FoldMskICmp_Mask_AllZeroes          =    16,
+  FoldMskICmp_Mask_NotAllZeroes       =    32,
+  FoldMskICmp_AMask_Mixed             =    64,
+  FoldMskICmp_AMask_NotMixed          =   128,
+  FoldMskICmp_BMask_Mixed             =   256,
+  FoldMskICmp_BMask_NotMixed          =   512
+};
+
+/// return the set of pattern classes (from MaskedICmpType)
+/// that (icmp SCC (A & B), C) satisfies
+static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
+                                    ICmpInst::Predicate SCC)
+{
+  ConstantInt *ACst = dyn_cast<ConstantInt>(A);
+  ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+  ConstantInt *CCst = dyn_cast<ConstantInt>(C);
+  bool icmp_eq = (SCC == ICmpInst::ICMP_EQ);
+  bool icmp_abit = (ACst != 0 && !ACst->isZero() &&
+                    ACst->getValue().isPowerOf2());
+  bool icmp_bbit = (BCst != 0 && !BCst->isZero() &&
+                    BCst->getValue().isPowerOf2());
+  unsigned result = 0;
+  if (CCst != 0 && CCst->isZero()) {
+    // if C is zero, then both A and B qualify as mask
+    result |= (icmp_eq ? (FoldMskICmp_Mask_AllZeroes |
+                          FoldMskICmp_Mask_AllZeroes |
+                          FoldMskICmp_AMask_Mixed |
+                          FoldMskICmp_BMask_Mixed)
+                       : (FoldMskICmp_Mask_NotAllZeroes |
+                          FoldMskICmp_Mask_NotAllZeroes |
+                          FoldMskICmp_AMask_NotMixed |
+                          FoldMskICmp_BMask_NotMixed));
+    if (icmp_abit)
+      result |= (icmp_eq ? (FoldMskICmp_AMask_NotAllOnes |
+                            FoldMskICmp_AMask_NotMixed)
+                         : (FoldMskICmp_AMask_AllOnes |
+                            FoldMskICmp_AMask_Mixed));
+    if (icmp_bbit)
+      result |= (icmp_eq ? (FoldMskICmp_BMask_NotAllOnes |
+                            FoldMskICmp_BMask_NotMixed)
+                         : (FoldMskICmp_BMask_AllOnes |
+                            FoldMskICmp_BMask_Mixed));
+    return result;
+  }
+  if (A == C) {
+    result |= (icmp_eq ? (FoldMskICmp_AMask_AllOnes |
+                          FoldMskICmp_AMask_Mixed)
+                       : (FoldMskICmp_AMask_NotAllOnes |
+                          FoldMskICmp_AMask_NotMixed));
+    if (icmp_abit)
+      result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes |
+                            FoldMskICmp_AMask_NotMixed)
+                         : (FoldMskICmp_Mask_AllZeroes |
+                            FoldMskICmp_AMask_Mixed));
+  } else if (ACst != 0 && CCst != 0 &&
+             ConstantExpr::getAnd(ACst, CCst) == CCst) {
+    result |= (icmp_eq ? FoldMskICmp_AMask_Mixed
+                       : FoldMskICmp_AMask_NotMixed);
+  }
+  if (B == C) {
+    result |= (icmp_eq ? (FoldMskICmp_BMask_AllOnes |
+                          FoldMskICmp_BMask_Mixed)
+                       : (FoldMskICmp_BMask_NotAllOnes |
+                          FoldMskICmp_BMask_NotMixed));
+    if (icmp_bbit)
+      result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes |
+                            FoldMskICmp_BMask_NotMixed)
+                         : (FoldMskICmp_Mask_AllZeroes |
+                            FoldMskICmp_BMask_Mixed));
+  } else if (BCst != 0 && CCst != 0 &&
+             ConstantExpr::getAnd(BCst, CCst) == CCst) {
+    result |= (icmp_eq ? FoldMskICmp_BMask_Mixed
+                       : FoldMskICmp_BMask_NotMixed);
+  }
+  return result;
+}
+
+/// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z)
+/// if possible. The returned predicate is either == or !=. Returns false if
+/// decomposition fails.
+static bool decomposeBitTestICmp(const ICmpInst *I, ICmpInst::Predicate &Pred,
+                                 Value *&X, Value *&Y, Value *&Z) {
+  // X < 0 is equivalent to (X & SignBit) != 0.
+  if (I->getPredicate() == ICmpInst::ICMP_SLT)
+    if (ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
+      if (C->isZero()) {
+        X = I->getOperand(0);
+        Y = ConstantInt::get(I->getContext(),
+                             APInt::getSignBit(C->getBitWidth()));
+        Pred = ICmpInst::ICMP_NE;
+        Z = C;
+        return true;
+      }
+
+  // X > -1 is equivalent to (X & SignBit) == 0.
+  if (I->getPredicate() == ICmpInst::ICMP_SGT)
+    if (ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
+      if (C->isAllOnesValue()) {
+        X = I->getOperand(0);
+        Y = ConstantInt::get(I->getContext(),
+                             APInt::getSignBit(C->getBitWidth()));
+        Pred = ICmpInst::ICMP_EQ;
+        Z = ConstantInt::getNullValue(C->getType());
+        return true;
+      }
+
+  return false;
+}
+
+/// foldLogOpOfMaskedICmpsHelper:
+/// handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
+/// return the set of pattern classes (from MaskedICmpType)
+/// that both LHS and RHS satisfy
+static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
+                                             Value*& B, Value*& C,
+                                             Value*& D, Value*& E,
+                                             ICmpInst *LHS, ICmpInst *RHS,
+                                             ICmpInst::Predicate &LHSCC,
+                                             ICmpInst::Predicate &RHSCC) {
+  if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) return 0;
+  // vectors are not (yet?) supported
+  if (LHS->getOperand(0)->getType()->isVectorTy()) return 0;
+
+  // Here comes the tricky part:
+  // LHS might be of the form L11 & L12 == X, X == L21 & L22,
+  // and L11 & L12 == L21 & L22. The same goes for RHS.
+  // Now we must find those components L** and R**, that are equal, so
+  // that we can extract the parameters A, B, C, D, and E for the canonical
+  // above.
+  Value *L1 = LHS->getOperand(0);
+  Value *L2 = LHS->getOperand(1);
+  Value *L11,*L12,*L21,*L22;
+  // Check whether the icmp can be decomposed into a bit test.
+  if (decomposeBitTestICmp(LHS, LHSCC, L11, L12, L2)) {
+    L21 = L22 = L1 = 0;
+  } else {
+    // Look for ANDs in the LHS icmp.
+    if (match(L1, m_And(m_Value(L11), m_Value(L12)))) {
+      if (!match(L2, m_And(m_Value(L21), m_Value(L22))))
+        L21 = L22 = 0;
+    } else {
+      if (!match(L2, m_And(m_Value(L11), m_Value(L12))))
+        return 0;
+      std::swap(L1, L2);
+      L21 = L22 = 0;
+    }
+  }
+
+  // Bail if LHS was a icmp that can't be decomposed into an equality.
+  if (!ICmpInst::isEquality(LHSCC))
+    return 0;
+
+  Value *R1 = RHS->getOperand(0);
+  Value *R2 = RHS->getOperand(1);
+  Value *R11,*R12;
+  bool ok = false;
+  if (decomposeBitTestICmp(RHS, RHSCC, R11, R12, R2)) {
+    if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+      A = R11; D = R12;
+    } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+      A = R12; D = R11;
+    } else {
+      return 0;
+    }
+    E = R2; R1 = 0; ok = true;
+  } else if (match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+    if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+      A = R11; D = R12; E = R2; ok = true;
+    } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+      A = R12; D = R11; E = R2; ok = true;
+    }
+  }
+
+  // Bail if RHS was a icmp that can't be decomposed into an equality.
+  if (!ICmpInst::isEquality(RHSCC))
+    return 0;
+
+  // Look for ANDs in on the right side of the RHS icmp.
+  if (!ok && match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+    if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+      A = R11; D = R12; E = R1; ok = true;
+    } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+      A = R12; D = R11; E = R1; ok = true;
+    } else {
+      return 0;
+    }
+  }
+  if (!ok)
+    return 0;
+
+  if (L11 == A) {
+    B = L12; C = L2;
+  } else if (L12 == A) {
+    B = L11; C = L2;
+  } else if (L21 == A) {
+    B = L22; C = L1;
+  } else if (L22 == A) {
+    B = L21; C = L1;
+  }
+
+  unsigned left_type = getTypeOfMaskedICmp(A, B, C, LHSCC);
+  unsigned right_type = getTypeOfMaskedICmp(A, D, E, RHSCC);
+  return left_type & right_type;
+}
+/// foldLogOpOfMaskedICmps:
+/// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
+/// into a single (icmp(A & X) ==/!= Y)
+static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
+                                     ICmpInst::Predicate NEWCC,
+                                     llvm::InstCombiner::BuilderTy* Builder) {
+  Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0;
+  ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+  unsigned mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS,
+                                               LHSCC, RHSCC);
+  if (mask == 0) return 0;
+  assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
+         "foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
+
+  if (NEWCC == ICmpInst::ICMP_NE)
+    mask >>= 1; // treat "Not"-states as normal states
+
+  if (mask & FoldMskICmp_Mask_AllZeroes) {
+    // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
+    // -> (icmp eq (A & (B|D)), 0)
+    Value* newOr = Builder->CreateOr(B, D);
+    Value* newAnd = Builder->CreateAnd(A, newOr);
+    // we can't use C as zero, because we might actually handle
+    //   (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    // with B and D, having a single bit set
+    Value* zero = Constant::getNullValue(A->getType());
+    return Builder->CreateICmp(NEWCC, newAnd, zero);
+  }
+  if (mask & FoldMskICmp_BMask_AllOnes) {
+    // (icmp eq (A & B), B) & (icmp eq (A & D), D)
+    // -> (icmp eq (A & (B|D)), (B|D))
+    Value* newOr = Builder->CreateOr(B, D);
+    Value* newAnd = Builder->CreateAnd(A, newOr);
+    return Builder->CreateICmp(NEWCC, newAnd, newOr);
+  }
+  if (mask & FoldMskICmp_AMask_AllOnes) {
+    // (icmp eq (A & B), A) & (icmp eq (A & D), A)
+    // -> (icmp eq (A & (B&D)), A)
+    Value* newAnd1 = Builder->CreateAnd(B, D);
+    Value* newAnd = Builder->CreateAnd(A, newAnd1);
+    return Builder->CreateICmp(NEWCC, newAnd, A);
+  }
+  if (mask & FoldMskICmp_BMask_Mixed) {
+    // (icmp eq (A & B), C) & (icmp eq (A & D), E)
+    // We already know that B & C == C && D & E == E.
+    // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
+    // C and E, which are shared by both the mask B and the mask D, don't
+    // contradict, then we can transform to
+    // -> (icmp eq (A & (B|D)), (C|E))
+    // Currently, we only handle the case of B, C, D, and E being constant.
+    ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+    if (BCst == 0) return 0;
+    ConstantInt *DCst = dyn_cast<ConstantInt>(D);
+    if (DCst == 0) return 0;
+    // we can't simply use C and E, because we might actually handle
+    //   (icmp ne (A & B), B) & (icmp eq (A & D), D)
+    // with B and D, having a single bit set
+
+    ConstantInt *CCst = dyn_cast<ConstantInt>(C);
+    if (CCst == 0) return 0;
+    if (LHSCC != NEWCC)
+      CCst = dyn_cast<ConstantInt>( ConstantExpr::getXor(BCst, CCst) );
+    ConstantInt *ECst = dyn_cast<ConstantInt>(E);
+    if (ECst == 0) return 0;
+    if (RHSCC != NEWCC)
+      ECst = dyn_cast<ConstantInt>( ConstantExpr::getXor(DCst, ECst) );
+    ConstantInt* MCst = dyn_cast<ConstantInt>(
+      ConstantExpr::getAnd(ConstantExpr::getAnd(BCst, DCst),
+                           ConstantExpr::getXor(CCst, ECst)) );
+    // if there is a conflict we should actually return a false for the
+    // whole construct
+    if (!MCst->isZero())
+      return 0;
+    Value *newOr1 = Builder->CreateOr(B, D);
+    Value *newOr2 = ConstantExpr::getOr(CCst, ECst);
+    Value *newAnd = Builder->CreateAnd(A, newOr1);
+    return Builder->CreateICmp(NEWCC, newAnd, newOr2);
+  }
+  return 0;
+}
+
+/// FoldAndOfICmps - Fold (icmp)&(icmp) if possible.
+Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+  ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+
+  // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
+  if (PredicatesFoldable(LHSCC, RHSCC)) {
+    if (LHS->getOperand(0) == RHS->getOperand(1) &&
+        LHS->getOperand(1) == RHS->getOperand(0))
+      LHS->swapOperands();
+    if (LHS->getOperand(0) == RHS->getOperand(0) &&
+        LHS->getOperand(1) == RHS->getOperand(1)) {
+      Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+      unsigned Code = getICmpCode(LHS) & getICmpCode(RHS);
+      bool isSigned = LHS->isSigned() || RHS->isSigned();
+      return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+    }
+  }
+
+  // handle (roughly):  (icmp eq (A & B), C) & (icmp eq (A & D), E)
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_EQ, Builder))
+    return V;
+
+  // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
+  Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
+  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (LHSCst == 0 || RHSCst == 0) return 0;
+
+  if (LHSCst == RHSCst && LHSCC == RHSCC) {
+    // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
+    // where C is a power of 2
+    if (LHSCC == ICmpInst::ICMP_ULT &&
+        LHSCst->getValue().isPowerOf2()) {
+      Value *NewOr = Builder->CreateOr(Val, Val2);
+      return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
+    }
+
+    // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
+    if (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero()) {
+      Value *NewOr = Builder->CreateOr(Val, Val2);
+      return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
+    }
+  }
+
+  // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
+  // where CMAX is the all ones value for the truncated type,
+  // iff the lower bits of C2 and CA are zero.
+  if (LHSCC == ICmpInst::ICMP_EQ && LHSCC == RHSCC &&
+      LHS->hasOneUse() && RHS->hasOneUse()) {
+    Value *V;
+    ConstantInt *AndCst, *SmallCst = 0, *BigCst = 0;
+
+    // (trunc x) == C1 & (and x, CA) == C2
+    // (and x, CA) == C2 & (trunc x) == C1
+    if (match(Val2, m_Trunc(m_Value(V))) &&
+        match(Val, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
+      SmallCst = RHSCst;
+      BigCst = LHSCst;
+    } else if (match(Val, m_Trunc(m_Value(V))) &&
+               match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
+      SmallCst = LHSCst;
+      BigCst = RHSCst;
+    }
+
+    if (SmallCst && BigCst) {
+      unsigned BigBitSize = BigCst->getType()->getBitWidth();
+      unsigned SmallBitSize = SmallCst->getType()->getBitWidth();
+
+      // Check that the low bits are zero.
+      APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
+      if ((Low & AndCst->getValue()) == 0 && (Low & BigCst->getValue()) == 0) {
+        Value *NewAnd = Builder->CreateAnd(V, Low | AndCst->getValue());
+        APInt N = SmallCst->getValue().zext(BigBitSize) | BigCst->getValue();
+        Value *NewVal = ConstantInt::get(AndCst->getType()->getContext(), N);
+        return Builder->CreateICmp(LHSCC, NewAnd, NewVal);
+      }
+    }
+  }
+
+  // From here on, we only handle:
+  //    (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
+  if (Val != Val2) return 0;
+
+  // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
+  if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
+      RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
+      LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
+      RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+    return 0;
+
+  // Make a constant range that's the intersection of the two icmp ranges.
+  // If the intersection is empty, we know that the result is false.
+  ConstantRange LHSRange =
+    ConstantRange::makeICmpRegion(LHSCC, LHSCst->getValue());
+  ConstantRange RHSRange =
+    ConstantRange::makeICmpRegion(RHSCC, RHSCst->getValue());
+
+  if (LHSRange.intersectWith(RHSRange).isEmptySet())
+    return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+
+  // We can't fold (ugt x, C) & (sgt x, C2).
+  if (!PredicatesFoldable(LHSCC, RHSCC))
+    return 0;
+
+  // Ensure that the larger constant is on the RHS.
+  bool ShouldSwap;
+  if (CmpInst::isSigned(LHSCC) ||
+      (ICmpInst::isEquality(LHSCC) &&
+       CmpInst::isSigned(RHSCC)))
+    ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+  else
+    ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+
+  if (ShouldSwap) {
+    std::swap(LHS, RHS);
+    std::swap(LHSCst, RHSCst);
+    std::swap(LHSCC, RHSCC);
+  }
+
+  // At this point, we know we have two icmp instructions
+  // comparing a value against two constants and and'ing the result
+  // together.  Because of the above check, we know that we only have
+  // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know
+  // (from the icmp folding check above), that the two constants
+  // are not equal and that the larger constant is on the RHS
+  assert(LHSCst != RHSCst && "Compares not folded above?");
+
+  switch (LHSCC) {
+  default: llvm_unreachable("Unknown integer condition code!");
+  case ICmpInst::ICMP_EQ:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_NE:         // (X == 13 & X != 15) -> X == 13
+    case ICmpInst::ICMP_ULT:        // (X == 13 & X <  15) -> X == 13
+    case ICmpInst::ICMP_SLT:        // (X == 13 & X <  15) -> X == 13
+      return LHS;
+    }
+  case ICmpInst::ICMP_NE:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_ULT:
+      if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
+        return Builder->CreateICmpULT(Val, LHSCst);
+      break;                        // (X != 13 & X u< 15) -> no change
+    case ICmpInst::ICMP_SLT:
+      if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
+        return Builder->CreateICmpSLT(Val, LHSCst);
+      break;                        // (X != 13 & X s< 15) -> no change
+    case ICmpInst::ICMP_EQ:         // (X != 13 & X == 15) -> X == 15
+    case ICmpInst::ICMP_UGT:        // (X != 13 & X u> 15) -> X u> 15
+    case ICmpInst::ICMP_SGT:        // (X != 13 & X s> 15) -> X s> 15
+      return RHS;
+    case ICmpInst::ICMP_NE:
+      if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
+        Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+        Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
+        return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1));
+      }
+      break;                        // (X != 13 & X != 15) -> no change
+    }
+    break;
+  case ICmpInst::ICMP_ULT:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X u< 13 & X == 15) -> false
+    case ICmpInst::ICMP_UGT:        // (X u< 13 & X u> 15) -> false
+      return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+    case ICmpInst::ICMP_SGT:        // (X u< 13 & X s> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X u< 13 & X != 15) -> X u< 13
+    case ICmpInst::ICMP_ULT:        // (X u< 13 & X u< 15) -> X u< 13
+      return LHS;
+    case ICmpInst::ICMP_SLT:        // (X u< 13 & X s< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_SLT:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_UGT:        // (X s< 13 & X u> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X s< 13 & X != 15) -> X < 13
+    case ICmpInst::ICMP_SLT:        // (X s< 13 & X s< 15) -> X < 13
+      return LHS;
+    case ICmpInst::ICMP_ULT:        // (X s< 13 & X u< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_UGT:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X u> 13 & X == 15) -> X == 15
+    case ICmpInst::ICMP_UGT:        // (X u> 13 & X u> 15) -> X u> 15
+      return RHS;
+    case ICmpInst::ICMP_SGT:        // (X u> 13 & X s> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:
+      if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14
+        return Builder->CreateICmp(LHSCC, Val, RHSCst);
+      break;                        // (X u> 13 & X != 15) -> no change
+    case ICmpInst::ICMP_ULT:        // (X u> 13 & X u< 15) -> (X-14) <u 1
+      return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, false, true);
+    case ICmpInst::ICMP_SLT:        // (X u> 13 & X s< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_SGT:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X s> 13 & X == 15) -> X == 15
+    case ICmpInst::ICMP_SGT:        // (X s> 13 & X s> 15) -> X s> 15
+      return RHS;
+    case ICmpInst::ICMP_UGT:        // (X s> 13 & X u> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:
+      if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14
+        return Builder->CreateICmp(LHSCC, Val, RHSCst);
+      break;                        // (X s> 13 & X != 15) -> no change
+    case ICmpInst::ICMP_SLT:        // (X s> 13 & X s< 15) -> (X-14) s< 1
+      return InsertRangeTest(Val, AddOne(LHSCst), RHSCst, true, true);
+    case ICmpInst::ICMP_ULT:        // (X s> 13 & X u< 15) -> no change
+      break;
+    }
+    break;
+  }
+
+  return 0;
+}
+
+/// FoldAndOfFCmps - Optimize (fcmp)&(fcmp).  NOTE: Unlike the rest of
+/// instcombine, this returns a Value which should already be inserted into the
+/// function.
+Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+  if (LHS->getPredicate() == FCmpInst::FCMP_ORD &&
+      RHS->getPredicate() == FCmpInst::FCMP_ORD) {
+    // (fcmp ord x, c) & (fcmp ord y, c)  -> (fcmp ord x, y)
+    if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
+      if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
+        // If either of the constants are nans, then the whole thing returns
+        // false.
+        if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
+          return ConstantInt::getFalse(LHS->getContext());
+        return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
+      }
+
+    // Handle vector zeros.  This occurs because the canonical form of
+    // "fcmp ord x,x" is "fcmp ord x, 0".
+    if (isa<ConstantAggregateZero>(LHS->getOperand(1)) &&
+        isa<ConstantAggregateZero>(RHS->getOperand(1)))
+      return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
+    return 0;
+  }
+
+  Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
+  Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
+  FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
+
+
+  if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+    // Swap RHS operands to match LHS.
+    Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+    std::swap(Op1LHS, Op1RHS);
+  }
+
+  if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) {
+    // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
+    if (Op0CC == Op1CC)
+      return Builder->CreateFCmp((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS);
+    if (Op0CC == FCmpInst::FCMP_FALSE || Op1CC == FCmpInst::FCMP_FALSE)
+      return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+    if (Op0CC == FCmpInst::FCMP_TRUE)
+      return RHS;
+    if (Op1CC == FCmpInst::FCMP_TRUE)
+      return LHS;
+
+    bool Op0Ordered;
+    bool Op1Ordered;
+    unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
+    unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered);
+    // uno && ord -> false
+    if (Op0Pred == 0 && Op1Pred == 0 && Op0Ordered != Op1Ordered)
+        return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+    if (Op1Pred == 0) {
+      std::swap(LHS, RHS);
+      std::swap(Op0Pred, Op1Pred);
+      std::swap(Op0Ordered, Op1Ordered);
+    }
+    if (Op0Pred == 0) {
+      // uno && ueq -> uno && (uno || eq) -> uno
+      // ord && olt -> ord && (ord && lt) -> olt
+      if (!Op0Ordered && (Op0Ordered == Op1Ordered))
+        return LHS;
+      if (Op0Ordered && (Op0Ordered == Op1Ordered))
+        return RHS;
+
+      // uno && oeq -> uno && (ord && eq) -> false
+      if (!Op0Ordered)
+        return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+      // ord && ueq -> ord && (uno || eq) -> oeq
+      return getFCmpValue(true, Op1Pred, Op0LHS, Op0RHS, Builder);
+    }
+  }
+
+  return 0;
+}
+
+
+Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
+  bool Changed = SimplifyAssociativeOrCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyAndInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // (A|B)&(A|C) -> A|(B&C) etc
+  if (Value *V = SimplifyUsingDistributiveLaws(I))
+    return ReplaceInstUsesWith(I, V);
+
+  // See if we can simplify any instructions used by the instruction whose sole
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(I))
+    return &I;
+
+  if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
+    const APInt &AndRHSMask = AndRHS->getValue();
+
+    // Optimize a variety of ((val OP C1) & C2) combinations...
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+      Value *Op0LHS = Op0I->getOperand(0);
+      Value *Op0RHS = Op0I->getOperand(1);
+      switch (Op0I->getOpcode()) {
+      default: break;
+      case Instruction::Xor:
+      case Instruction::Or: {
+        // If the mask is only needed on one incoming arm, push it up.
+        if (!Op0I->hasOneUse()) break;
+
+        APInt NotAndRHS(~AndRHSMask);
+        if (MaskedValueIsZero(Op0LHS, NotAndRHS)) {
+          // Not masking anything out for the LHS, move to RHS.
+          Value *NewRHS = Builder->CreateAnd(Op0RHS, AndRHS,
+                                             Op0RHS->getName()+".masked");
+          return BinaryOperator::Create(Op0I->getOpcode(), Op0LHS, NewRHS);
+        }
+        if (!isa<Constant>(Op0RHS) &&
+            MaskedValueIsZero(Op0RHS, NotAndRHS)) {
+          // Not masking anything out for the RHS, move to LHS.
+          Value *NewLHS = Builder->CreateAnd(Op0LHS, AndRHS,
+                                             Op0LHS->getName()+".masked");
+          return BinaryOperator::Create(Op0I->getOpcode(), NewLHS, Op0RHS);
+        }
+
+        break;
+      }
+      case Instruction::Add:
+        // ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS.
+        // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
+        // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
+        if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I))
+          return BinaryOperator::CreateAnd(V, AndRHS);
+        if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I))
+          return BinaryOperator::CreateAnd(V, AndRHS);  // Add commutes
+        break;
+
+      case Instruction::Sub:
+        // ((A & N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == AndRHS.
+        // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
+        // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
+        if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I))
+          return BinaryOperator::CreateAnd(V, AndRHS);
+
+        // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS
+        // has 1's for all bits that the subtraction with A might affect.
+        if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) {
+          uint32_t BitWidth = AndRHSMask.getBitWidth();
+          uint32_t Zeros = AndRHSMask.countLeadingZeros();
+          APInt Mask = APInt::getLowBitsSet(BitWidth, BitWidth - Zeros);
+
+          if (MaskedValueIsZero(Op0LHS, Mask)) {
+            Value *NewNeg = Builder->CreateNeg(Op0RHS);
+            return BinaryOperator::CreateAnd(NewNeg, AndRHS);
+          }
+        }
+        break;
+
+      case Instruction::Shl:
+      case Instruction::LShr:
+        // (1 << x) & 1 --> zext(x == 0)
+        // (1 >> x) & 1 --> zext(x == 0)
+        if (AndRHSMask == 1 && Op0LHS == AndRHS) {
+          Value *NewICmp =
+            Builder->CreateICmpEQ(Op0RHS, Constant::getNullValue(I.getType()));
+          return new ZExtInst(NewICmp, I.getType());
+        }
+        break;
+      }
+
+      if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
+        if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I))
+          return Res;
+    }
+
+    // If this is an integer truncation, and if the source is an 'and' with
+    // immediate, transform it.  This frequently occurs for bitfield accesses.
+    {
+      Value *X = 0; ConstantInt *YC = 0;
+      if (match(Op0, m_Trunc(m_And(m_Value(X), m_ConstantInt(YC))))) {
+        // Change: and (trunc (and X, YC) to T), C2
+        // into  : and (trunc X to T), trunc(YC) & C2
+        // This will fold the two constants together, which may allow
+        // other simplifications.
+        Value *NewCast = Builder->CreateTrunc(X, I.getType(), "and.shrunk");
+        Constant *C3 = ConstantExpr::getTrunc(YC, I.getType());
+        C3 = ConstantExpr::getAnd(C3, AndRHS);
+        return BinaryOperator::CreateAnd(NewCast, C3);
+      }
+    }
+
+    // Try to fold constant and into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+
+  // (~A & ~B) == (~(A | B)) - De Morgan's Law
+  if (Value *Op0NotVal = dyn_castNotVal(Op0))
+    if (Value *Op1NotVal = dyn_castNotVal(Op1))
+      if (Op0->hasOneUse() && Op1->hasOneUse()) {
+        Value *Or = Builder->CreateOr(Op0NotVal, Op1NotVal,
+                                      I.getName()+".demorgan");
+        return BinaryOperator::CreateNot(Or);
+      }
+
+  {
+    Value *A = 0, *B = 0, *C = 0, *D = 0;
+    // (A|B) & ~(A&B) -> A^B
+    if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+        match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) &&
+        ((A == C && B == D) || (A == D && B == C)))
+      return BinaryOperator::CreateXor(A, B);
+
+    // ~(A&B) & (A|B) -> A^B
+    if (match(Op1, m_Or(m_Value(A), m_Value(B))) &&
+        match(Op0, m_Not(m_And(m_Value(C), m_Value(D)))) &&
+        ((A == C && B == D) || (A == D && B == C)))
+      return BinaryOperator::CreateXor(A, B);
+
+    // A&(A^B) => A & ~B
+    {
+      Value *tmpOp0 = Op0;
+      Value *tmpOp1 = Op1;
+      if (Op0->hasOneUse() &&
+          match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+        if (A == Op1 || B == Op1 ) {
+          tmpOp1 = Op0;
+          tmpOp0 = Op1;
+          // Simplify below
+        }
+      }
+
+      if (tmpOp1->hasOneUse() &&
+          match(tmpOp1, m_Xor(m_Value(A), m_Value(B)))) {
+        if (B == tmpOp0) {
+          std::swap(A, B);
+        }
+        // Notice that the patten (A&(~B)) is actually (A&(-1^B)), so if
+        // A is originally -1 (or a vector of -1 and undefs), then we enter
+        // an endless loop. By checking that A is non-constant we ensure that
+        // we will never get to the loop.
+        if (A == tmpOp0 && !isa<Constant>(A)) // A&(A^B) -> A & ~B
+          return BinaryOperator::CreateAnd(A, Builder->CreateNot(B));
+      }
+    }
+
+    // (A&((~A)|B)) -> A&B
+    if (match(Op0, m_Or(m_Not(m_Specific(Op1)), m_Value(A))) ||
+        match(Op0, m_Or(m_Value(A), m_Not(m_Specific(Op1)))))
+      return BinaryOperator::CreateAnd(A, Op1);
+    if (match(Op1, m_Or(m_Not(m_Specific(Op0)), m_Value(A))) ||
+        match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0)))))
+      return BinaryOperator::CreateAnd(A, Op0);
+  }
+
+  if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1))
+    if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0))
+      if (Value *Res = FoldAndOfICmps(LHS, RHS))
+        return ReplaceInstUsesWith(I, Res);
+
+  // If and'ing two fcmp, try combine them into one.
+  if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
+    if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
+      if (Value *Res = FoldAndOfFCmps(LHS, RHS))
+        return ReplaceInstUsesWith(I, Res);
+
+
+  // fold (and (cast A), (cast B)) -> (cast (and A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0))
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) {
+      Type *SrcTy = Op0C->getOperand(0)->getType();
+      if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ?
+          SrcTy == Op1C->getOperand(0)->getType() &&
+          SrcTy->isIntOrIntVectorTy()) {
+        Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0);
+
+        // Only do this if the casts both really cause code to be generated.
+        if (ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) &&
+            ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) {
+          Value *NewOp = Builder->CreateAnd(Op0COp, Op1COp, I.getName());
+          return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
+        }
+
+        // If this is and(cast(icmp), cast(icmp)), try to fold this even if the
+        // cast is otherwise not optimizable.  This happens for vector sexts.
+        if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp))
+          if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp))
+            if (Value *Res = FoldAndOfICmps(LHS, RHS))
+              return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
+
+        // If this is and(cast(fcmp), cast(fcmp)), try to fold this even if the
+        // cast is otherwise not optimizable.  This happens for vector sexts.
+        if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp))
+          if (FCmpInst *LHS = dyn_cast<FCmpInst>(Op0COp))
+            if (Value *Res = FoldAndOfFCmps(LHS, RHS))
+              return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
+      }
+    }
+
+  // (X >> Z) & (Y >> Z)  -> (X&Y) >> Z  for all shifts.
+  if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
+    if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
+      if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
+          SI0->getOperand(1) == SI1->getOperand(1) &&
+          (SI0->hasOneUse() || SI1->hasOneUse())) {
+        Value *NewOp =
+          Builder->CreateAnd(SI0->getOperand(0), SI1->getOperand(0),
+                             SI0->getName());
+        return BinaryOperator::Create(SI1->getOpcode(), NewOp,
+                                      SI1->getOperand(1));
+      }
+  }
+
+  {
+    Value *X = 0;
+    bool OpsSwapped = false;
+    // Canonicalize SExt or Not to the LHS
+    if (match(Op1, m_SExt(m_Value())) ||
+        match(Op1, m_Not(m_Value()))) {
+      std::swap(Op0, Op1);
+      OpsSwapped = true;
+    }
+
+    // Fold (and (sext bool to A), B) --> (select bool, B, 0)
+    if (match(Op0, m_SExt(m_Value(X))) &&
+        X->getType()->getScalarType()->isIntegerTy(1)) {
+      Value *Zero = Constant::getNullValue(Op1->getType());
+      return SelectInst::Create(X, Op1, Zero);
+    }
+
+    // Fold (and ~(sext bool to A), B) --> (select bool, 0, B)
+    if (match(Op0, m_Not(m_SExt(m_Value(X)))) &&
+        X->getType()->getScalarType()->isIntegerTy(1)) {
+      Value *Zero = Constant::getNullValue(Op0->getType());
+      return SelectInst::Create(X, Zero, Op1);
+    }
+
+    if (OpsSwapped)
+      std::swap(Op0, Op1);
+  }
+
+  return Changed ? &I : 0;
+}
+
+/// CollectBSwapParts - Analyze the specified subexpression and see if it is
+/// capable of providing pieces of a bswap.  The subexpression provides pieces
+/// of a bswap if it is proven that each of the non-zero bytes in the output of
+/// the expression came from the corresponding "byte swapped" byte in some other
+/// value.  For example, if the current subexpression is "(shl i32 %X, 24)" then
+/// we know that the expression deposits the low byte of %X into the high byte
+/// of the bswap result and that all other bytes are zero.  This expression is
+/// accepted, the high byte of ByteValues is set to X to indicate a correct
+/// match.
+///
+/// This function returns true if the match was unsuccessful and false if so.
+/// On entry to the function the "OverallLeftShift" is a signed integer value
+/// indicating the number of bytes that the subexpression is later shifted.  For
+/// example, if the expression is later right shifted by 16 bits, the
+/// OverallLeftShift value would be -2 on entry.  This is used to specify which
+/// byte of ByteValues is actually being set.
+///
+/// Similarly, ByteMask is a bitmask where a bit is clear if its corresponding
+/// byte is masked to zero by a user.  For example, in (X & 255), X will be
+/// processed with a bytemask of 1.  Because bytemask is 32-bits, this limits
+/// this function to working on up to 32-byte (256 bit) values.  ByteMask is
+/// always in the local (OverallLeftShift) coordinate space.
+///
+static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
+                              SmallVector<Value*, 8> &ByteValues) {
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    // If this is an or instruction, it may be an inner node of the bswap.
+    if (I->getOpcode() == Instruction::Or) {
+      return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
+                               ByteValues) ||
+             CollectBSwapParts(I->getOperand(1), OverallLeftShift, ByteMask,
+                               ByteValues);
+    }
+
+    // If this is a logical shift by a constant multiple of 8, recurse with
+    // OverallLeftShift and ByteMask adjusted.
+    if (I->isLogicalShift() && isa<ConstantInt>(I->getOperand(1))) {
+      unsigned ShAmt =
+        cast<ConstantInt>(I->getOperand(1))->getLimitedValue(~0U);
+      // Ensure the shift amount is defined and of a byte value.
+      if ((ShAmt & 7) || (ShAmt > 8*ByteValues.size()))
+        return true;
+
+      unsigned ByteShift = ShAmt >> 3;
+      if (I->getOpcode() == Instruction::Shl) {
+        // X << 2 -> collect(X, +2)
+        OverallLeftShift += ByteShift;
+        ByteMask >>= ByteShift;
+      } else {
+        // X >>u 2 -> collect(X, -2)
+        OverallLeftShift -= ByteShift;
+        ByteMask <<= ByteShift;
+        ByteMask &= (~0U >> (32-ByteValues.size()));
+      }
+
+      if (OverallLeftShift >= (int)ByteValues.size()) return true;
+      if (OverallLeftShift <= -(int)ByteValues.size()) return true;
+
+      return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
+                               ByteValues);
+    }
+
+    // If this is a logical 'and' with a mask that clears bytes, clear the
+    // corresponding bytes in ByteMask.
+    if (I->getOpcode() == Instruction::And &&
+        isa<ConstantInt>(I->getOperand(1))) {
+      // Scan every byte of the and mask, seeing if the byte is either 0 or 255.
+      unsigned NumBytes = ByteValues.size();
+      APInt Byte(I->getType()->getPrimitiveSizeInBits(), 255);
+      const APInt &AndMask = cast<ConstantInt>(I->getOperand(1))->getValue();
+
+      for (unsigned i = 0; i != NumBytes; ++i, Byte <<= 8) {
+        // If this byte is masked out by a later operation, we don't care what
+        // the and mask is.
+        if ((ByteMask & (1 << i)) == 0)
+          continue;
+
+        // If the AndMask is all zeros for this byte, clear the bit.
+        APInt MaskB = AndMask & Byte;
+        if (MaskB == 0) {
+          ByteMask &= ~(1U << i);
+          continue;
+        }
+
+        // If the AndMask is not all ones for this byte, it's not a bytezap.
+        if (MaskB != Byte)
+          return true;
+
+        // Otherwise, this byte is kept.
+      }
+
+      return CollectBSwapParts(I->getOperand(0), OverallLeftShift, ByteMask,
+                               ByteValues);
+    }
+  }
+
+  // Okay, we got to something that isn't a shift, 'or' or 'and'.  This must be
+  // the input value to the bswap.  Some observations: 1) if more than one byte
+  // is demanded from this input, then it could not be successfully assembled
+  // into a byteswap.  At least one of the two bytes would not be aligned with
+  // their ultimate destination.
+  if (!isPowerOf2_32(ByteMask)) return true;
+  unsigned InputByteNo = CountTrailingZeros_32(ByteMask);
+
+  // 2) The input and ultimate destinations must line up: if byte 3 of an i32
+  // is demanded, it needs to go into byte 0 of the result.  This means that the
+  // byte needs to be shifted until it lands in the right byte bucket.  The
+  // shift amount depends on the position: if the byte is coming from the high
+  // part of the value (e.g. byte 3) then it must be shifted right.  If from the
+  // low part, it must be shifted left.
+  unsigned DestByteNo = InputByteNo + OverallLeftShift;
+  if (ByteValues.size()-1-DestByteNo != InputByteNo)
+    return true;
+
+  // If the destination byte value is already defined, the values are or'd
+  // together, which isn't a bswap (unless it's an or of the same bits).
+  if (ByteValues[DestByteNo] && ByteValues[DestByteNo] != V)
+    return true;
+  ByteValues[DestByteNo] = V;
+  return false;
+}
+
+/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom.
+/// If so, insert the new bswap intrinsic and return it.
+Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
+  IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
+  if (!ITy || ITy->getBitWidth() % 16 ||
+      // ByteMask only allows up to 32-byte values.
+      ITy->getBitWidth() > 32*8)
+    return 0;   // Can only bswap pairs of bytes.  Can't do vectors.
+
+  /// ByteValues - For each byte of the result, we keep track of which value
+  /// defines each byte.
+  SmallVector<Value*, 8> ByteValues;
+  ByteValues.resize(ITy->getBitWidth()/8);
+
+  // Try to find all the pieces corresponding to the bswap.
+  uint32_t ByteMask = ~0U >> (32-ByteValues.size());
+  if (CollectBSwapParts(&I, 0, ByteMask, ByteValues))
+    return 0;
+
+  // Check to see if all of the bytes come from the same value.
+  Value *V = ByteValues[0];
+  if (V == 0) return 0;  // Didn't find a byte?  Must be zero.
+
+  // Check to make sure that all of the bytes come from the same value.
+  for (unsigned i = 1, e = ByteValues.size(); i != e; ++i)
+    if (ByteValues[i] != V)
+      return 0;
+  Module *M = I.getParent()->getParent()->getParent();
+  Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, ITy);
+  return CallInst::Create(F, V);
+}
+
+/// MatchSelectFromAndOr - We have an expression of the form (A&C)|(B&D).  Check
+/// If A is (cond?-1:0) and either B or D is ~(cond?-1,0) or (cond?0,-1), then
+/// we can simplify this expression to "cond ? C : D or B".
+static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
+                                         Value *C, Value *D) {
+  // If A is not a select of -1/0, this cannot match.
+  Value *Cond = 0;
+  if (!match(A, m_SExt(m_Value(Cond))) ||
+      !Cond->getType()->isIntegerTy(1))
+    return 0;
+
+  // ((cond?-1:0)&C) | (B&(cond?0:-1)) -> cond ? C : B.
+  if (match(D, m_Not(m_SExt(m_Specific(Cond)))))
+    return SelectInst::Create(Cond, C, B);
+  if (match(D, m_SExt(m_Not(m_Specific(Cond)))))
+    return SelectInst::Create(Cond, C, B);
+
+  // ((cond?-1:0)&C) | ((cond?0:-1)&D) -> cond ? C : D.
+  if (match(B, m_Not(m_SExt(m_Specific(Cond)))))
+    return SelectInst::Create(Cond, C, D);
+  if (match(B, m_SExt(m_Not(m_Specific(Cond)))))
+    return SelectInst::Create(Cond, C, D);
+  return 0;
+}
+
+/// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
+Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+  ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+
+  // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
+  if (PredicatesFoldable(LHSCC, RHSCC)) {
+    if (LHS->getOperand(0) == RHS->getOperand(1) &&
+        LHS->getOperand(1) == RHS->getOperand(0))
+      LHS->swapOperands();
+    if (LHS->getOperand(0) == RHS->getOperand(0) &&
+        LHS->getOperand(1) == RHS->getOperand(1)) {
+      Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+      unsigned Code = getICmpCode(LHS) | getICmpCode(RHS);
+      bool isSigned = LHS->isSigned() || RHS->isSigned();
+      return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+    }
+  }
+
+  // handle (roughly):
+  // (icmp ne (A & B), C) | (icmp ne (A & D), E)
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_NE, Builder))
+    return V;
+
+  // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
+  Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
+  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (LHSCst == 0 || RHSCst == 0) return 0;
+
+  if (LHSCst == RHSCst && LHSCC == RHSCC) {
+    // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0)
+    if (LHSCC == ICmpInst::ICMP_NE && LHSCst->isZero()) {
+      Value *NewOr = Builder->CreateOr(Val, Val2);
+      return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
+    }
+  }
+
+  // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
+  //   iff C2 + CA == C1.
+  if (LHSCC == ICmpInst::ICMP_ULT && RHSCC == ICmpInst::ICMP_EQ) {
+    ConstantInt *AddCst;
+    if (match(Val, m_Add(m_Specific(Val2), m_ConstantInt(AddCst))))
+      if (RHSCst->getValue() + AddCst->getValue() == LHSCst->getValue())
+        return Builder->CreateICmpULE(Val, LHSCst);
+  }
+
+  // From here on, we only handle:
+  //    (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
+  if (Val != Val2) return 0;
+
+  // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
+  if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
+      RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
+      LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
+      RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+    return 0;
+
+  // We can't fold (ugt x, C) | (sgt x, C2).
+  if (!PredicatesFoldable(LHSCC, RHSCC))
+    return 0;
+
+  // Ensure that the larger constant is on the RHS.
+  bool ShouldSwap;
+  if (CmpInst::isSigned(LHSCC) ||
+      (ICmpInst::isEquality(LHSCC) &&
+       CmpInst::isSigned(RHSCC)))
+    ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+  else
+    ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+
+  if (ShouldSwap) {
+    std::swap(LHS, RHS);
+    std::swap(LHSCst, RHSCst);
+    std::swap(LHSCC, RHSCC);
+  }
+
+  // At this point, we know we have two icmp instructions
+  // comparing a value against two constants and or'ing the result
+  // together.  Because of the above check, we know that we only have
+  // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
+  // icmp folding check above), that the two constants are not
+  // equal.
+  assert(LHSCst != RHSCst && "Compares not folded above?");
+
+  switch (LHSCC) {
+  default: llvm_unreachable("Unknown integer condition code!");
+  case ICmpInst::ICMP_EQ:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:
+      if (LHSCst == SubOne(RHSCst)) {
+        // (X == 13 | X == 14) -> X-13 <u 2
+        Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+        Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
+        AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst);
+        return Builder->CreateICmpULT(Add, AddCST);
+      }
+
+      if (LHS->getOperand(0) == RHS->getOperand(0)) {
+        // if LHSCst and RHSCst differ only by one bit:
+        // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1
+        assert(LHSCst->getValue().ule(LHSCst->getValue()));
+
+        APInt Xor = LHSCst->getValue() ^ RHSCst->getValue();
+        if (Xor.isPowerOf2()) {
+          Value *NegCst = Builder->getInt(~Xor);
+          Value *And = Builder->CreateAnd(LHS->getOperand(0), NegCst);
+          return Builder->CreateICmp(ICmpInst::ICMP_EQ, And, LHSCst);
+        }
+      }
+
+      break;                         // (X == 13 | X == 15) -> no change
+    case ICmpInst::ICMP_UGT:         // (X == 13 | X u> 14) -> no change
+    case ICmpInst::ICMP_SGT:         // (X == 13 | X s> 14) -> no change
+      break;
+    case ICmpInst::ICMP_NE:          // (X == 13 | X != 15) -> X != 15
+    case ICmpInst::ICMP_ULT:         // (X == 13 | X u< 15) -> X u< 15
+    case ICmpInst::ICMP_SLT:         // (X == 13 | X s< 15) -> X s< 15
+      return RHS;
+    }
+    break;
+  case ICmpInst::ICMP_NE:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:          // (X != 13 | X == 15) -> X != 13
+    case ICmpInst::ICMP_UGT:         // (X != 13 | X u> 15) -> X != 13
+    case ICmpInst::ICMP_SGT:         // (X != 13 | X s> 15) -> X != 13
+      return LHS;
+    case ICmpInst::ICMP_NE:          // (X != 13 | X != 15) -> true
+    case ICmpInst::ICMP_ULT:         // (X != 13 | X u< 15) -> true
+    case ICmpInst::ICMP_SLT:         // (X != 13 | X s< 15) -> true
+      return ConstantInt::getTrue(LHS->getContext());
+    }
+  case ICmpInst::ICMP_ULT:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X u< 13 | X == 14) -> no change
+      break;
+    case ICmpInst::ICMP_UGT:        // (X u< 13 | X u> 15) -> (X-13) u> 2
+      // If RHSCst is [us]MAXINT, it is always false.  Not handling
+      // this can cause overflow.
+      if (RHSCst->isMaxValue(false))
+        return LHS;
+      return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), false, false);
+    case ICmpInst::ICMP_SGT:        // (X u< 13 | X s> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X u< 13 | X != 15) -> X != 15
+    case ICmpInst::ICMP_ULT:        // (X u< 13 | X u< 15) -> X u< 15
+      return RHS;
+    case ICmpInst::ICMP_SLT:        // (X u< 13 | X s< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_SLT:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X s< 13 | X == 14) -> no change
+      break;
+    case ICmpInst::ICMP_SGT:        // (X s< 13 | X s> 15) -> (X-13) s> 2
+      // If RHSCst is [us]MAXINT, it is always false.  Not handling
+      // this can cause overflow.
+      if (RHSCst->isMaxValue(true))
+        return LHS;
+      return InsertRangeTest(Val, LHSCst, AddOne(RHSCst), true, false);
+    case ICmpInst::ICMP_UGT:        // (X s< 13 | X u> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X s< 13 | X != 15) -> X != 15
+    case ICmpInst::ICMP_SLT:        // (X s< 13 | X s< 15) -> X s< 15
+      return RHS;
+    case ICmpInst::ICMP_ULT:        // (X s< 13 | X u< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_UGT:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X u> 13 | X == 15) -> X u> 13
+    case ICmpInst::ICMP_UGT:        // (X u> 13 | X u> 15) -> X u> 13
+      return LHS;
+    case ICmpInst::ICMP_SGT:        // (X u> 13 | X s> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X u> 13 | X != 15) -> true
+    case ICmpInst::ICMP_ULT:        // (X u> 13 | X u< 15) -> true
+      return ConstantInt::getTrue(LHS->getContext());
+    case ICmpInst::ICMP_SLT:        // (X u> 13 | X s< 15) -> no change
+      break;
+    }
+    break;
+  case ICmpInst::ICMP_SGT:
+    switch (RHSCC) {
+    default: llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:         // (X s> 13 | X == 15) -> X > 13
+    case ICmpInst::ICMP_SGT:        // (X s> 13 | X s> 15) -> X > 13
+      return LHS;
+    case ICmpInst::ICMP_UGT:        // (X s> 13 | X u> 15) -> no change
+      break;
+    case ICmpInst::ICMP_NE:         // (X s> 13 | X != 15) -> true
+    case ICmpInst::ICMP_SLT:        // (X s> 13 | X s< 15) -> true
+      return ConstantInt::getTrue(LHS->getContext());
+    case ICmpInst::ICMP_ULT:        // (X s> 13 | X u< 15) -> no change
+      break;
+    }
+    break;
+  }
+  return 0;
+}
+
+/// FoldOrOfFCmps - Optimize (fcmp)|(fcmp).  NOTE: Unlike the rest of
+/// instcombine, this returns a Value which should already be inserted into the
+/// function.
+Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+  if (LHS->getPredicate() == FCmpInst::FCMP_UNO &&
+      RHS->getPredicate() == FCmpInst::FCMP_UNO &&
+      LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType()) {
+    if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1)))
+      if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) {
+        // If either of the constants are nans, then the whole thing returns
+        // true.
+        if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
+          return ConstantInt::getTrue(LHS->getContext());
+
+        // Otherwise, no need to compare the two constants, compare the
+        // rest.
+        return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
+      }
+
+    // Handle vector zeros.  This occurs because the canonical form of
+    // "fcmp uno x,x" is "fcmp uno x, 0".
+    if (isa<ConstantAggregateZero>(LHS->getOperand(1)) &&
+        isa<ConstantAggregateZero>(RHS->getOperand(1)))
+      return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
+
+    return 0;
+  }
+
+  Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
+  Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
+  FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
+
+  if (Op0LHS == Op1RHS && Op0RHS == Op1LHS) {
+    // Swap RHS operands to match LHS.
+    Op1CC = FCmpInst::getSwappedPredicate(Op1CC);
+    std::swap(Op1LHS, Op1RHS);
+  }
+  if (Op0LHS == Op1LHS && Op0RHS == Op1RHS) {
+    // Simplify (fcmp cc0 x, y) | (fcmp cc1 x, y).
+    if (Op0CC == Op1CC)
+      return Builder->CreateFCmp((FCmpInst::Predicate)Op0CC, Op0LHS, Op0RHS);
+    if (Op0CC == FCmpInst::FCMP_TRUE || Op1CC == FCmpInst::FCMP_TRUE)
+      return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
+    if (Op0CC == FCmpInst::FCMP_FALSE)
+      return RHS;
+    if (Op1CC == FCmpInst::FCMP_FALSE)
+      return LHS;
+    bool Op0Ordered;
+    bool Op1Ordered;
+    unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered);
+    unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered);
+    if (Op0Ordered == Op1Ordered) {
+      // If both are ordered or unordered, return a new fcmp with
+      // or'ed predicates.
+      return getFCmpValue(Op0Ordered, Op0Pred|Op1Pred, Op0LHS, Op0RHS, Builder);
+    }
+  }
+  return 0;
+}
+
+/// FoldOrWithConstants - This helper function folds:
+///
+///     ((A | B) & C1) | (B & C2)
+///
+/// into:
+///
+///     (A & C1) | B
+///
+/// when the XOR of the two constants is "all ones" (-1).
+Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op,
+                                               Value *A, Value *B, Value *C) {
+  ConstantInt *CI1 = dyn_cast<ConstantInt>(C);
+  if (!CI1) return 0;
+
+  Value *V1 = 0;
+  ConstantInt *CI2 = 0;
+  if (!match(Op, m_And(m_Value(V1), m_ConstantInt(CI2)))) return 0;
+
+  APInt Xor = CI1->getValue() ^ CI2->getValue();
+  if (!Xor.isAllOnesValue()) return 0;
+
+  if (V1 == A || V1 == B) {
+    Value *NewOp = Builder->CreateAnd((V1 == A) ? B : A, CI1);
+    return BinaryOperator::CreateOr(NewOp, V1);
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitOr(BinaryOperator &I) {
+  bool Changed = SimplifyAssociativeOrCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyOrInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // (A&B)|(A&C) -> A&(B|C) etc
+  if (Value *V = SimplifyUsingDistributiveLaws(I))
+    return ReplaceInstUsesWith(I, V);
+
+  // See if we can simplify any instructions used by the instruction whose sole
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(I))
+    return &I;
+
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    ConstantInt *C1 = 0; Value *X = 0;
+    // (X & C1) | C2 --> (X | C2) & (C1|C2)
+    // iff (C1 & C2) == 0.
+    if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) &&
+        (RHS->getValue() & C1->getValue()) != 0 &&
+        Op0->hasOneUse()) {
+      Value *Or = Builder->CreateOr(X, RHS);
+      Or->takeName(Op0);
+      return BinaryOperator::CreateAnd(Or,
+                         ConstantInt::get(I.getContext(),
+                                          RHS->getValue() | C1->getValue()));
+    }
+
+    // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
+    if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
+        Op0->hasOneUse()) {
+      Value *Or = Builder->CreateOr(X, RHS);
+      Or->takeName(Op0);
+      return BinaryOperator::CreateXor(Or,
+                 ConstantInt::get(I.getContext(),
+                                  C1->getValue() & ~RHS->getValue()));
+    }
+
+    // Try to fold constant and into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  Value *A = 0, *B = 0;
+  ConstantInt *C1 = 0, *C2 = 0;
+
+  // (A | B) | C  and  A | (B | C)                  -> bswap if possible.
+  // (A >> B) | (C << D)  and  (A << B) | (B >> C)  -> bswap if possible.
+  if (match(Op0, m_Or(m_Value(), m_Value())) ||
+      match(Op1, m_Or(m_Value(), m_Value())) ||
+      (match(Op0, m_LogicalShift(m_Value(), m_Value())) &&
+       match(Op1, m_LogicalShift(m_Value(), m_Value())))) {
+    if (Instruction *BSwap = MatchBSwap(I))
+      return BSwap;
+  }
+
+  // (X^C)|Y -> (X|Y)^C iff Y&C == 0
+  if (Op0->hasOneUse() &&
+      match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
+      MaskedValueIsZero(Op1, C1->getValue())) {
+    Value *NOr = Builder->CreateOr(A, Op1);
+    NOr->takeName(Op0);
+    return BinaryOperator::CreateXor(NOr, C1);
+  }
+
+  // Y|(X^C) -> (X|Y)^C iff Y&C == 0
+  if (Op1->hasOneUse() &&
+      match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
+      MaskedValueIsZero(Op0, C1->getValue())) {
+    Value *NOr = Builder->CreateOr(A, Op0);
+    NOr->takeName(Op0);
+    return BinaryOperator::CreateXor(NOr, C1);
+  }
+
+  // (A & C)|(B & D)
+  Value *C = 0, *D = 0;
+  if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
+      match(Op1, m_And(m_Value(B), m_Value(D)))) {
+    Value *V1 = 0, *V2 = 0;
+    C1 = dyn_cast<ConstantInt>(C);
+    C2 = dyn_cast<ConstantInt>(D);
+    if (C1 && C2) {  // (A & C1)|(B & C2)
+      // If we have: ((V + N) & C1) | (V & C2)
+      // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+      // replace with V+N.
+      if (C1->getValue() == ~C2->getValue()) {
+        if ((C2->getValue() & (C2->getValue()+1)) == 0 && // C2 == 0+1+
+            match(A, m_Add(m_Value(V1), m_Value(V2)))) {
+          // Add commutes, try both ways.
+          if (V1 == B && MaskedValueIsZero(V2, C2->getValue()))
+            return ReplaceInstUsesWith(I, A);
+          if (V2 == B && MaskedValueIsZero(V1, C2->getValue()))
+            return ReplaceInstUsesWith(I, A);
+        }
+        // Or commutes, try both ways.
+        if ((C1->getValue() & (C1->getValue()+1)) == 0 &&
+            match(B, m_Add(m_Value(V1), m_Value(V2)))) {
+          // Add commutes, try both ways.
+          if (V1 == A && MaskedValueIsZero(V2, C1->getValue()))
+            return ReplaceInstUsesWith(I, B);
+          if (V2 == A && MaskedValueIsZero(V1, C1->getValue()))
+            return ReplaceInstUsesWith(I, B);
+        }
+      }
+
+      if ((C1->getValue() & C2->getValue()) == 0) {
+        // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
+        // iff (C1&C2) == 0 and (N&~C1) == 0
+        if (match(A, m_Or(m_Value(V1), m_Value(V2))) &&
+            ((V1 == B && MaskedValueIsZero(V2, ~C1->getValue())) ||  // (V|N)
+             (V2 == B && MaskedValueIsZero(V1, ~C1->getValue()))))   // (N|V)
+          return BinaryOperator::CreateAnd(A,
+                               ConstantInt::get(A->getContext(),
+                                                C1->getValue()|C2->getValue()));
+        // Or commutes, try both ways.
+        if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
+            ((V1 == A && MaskedValueIsZero(V2, ~C2->getValue())) ||  // (V|N)
+             (V2 == A && MaskedValueIsZero(V1, ~C2->getValue()))))   // (N|V)
+          return BinaryOperator::CreateAnd(B,
+                               ConstantInt::get(B->getContext(),
+                                                C1->getValue()|C2->getValue()));
+
+        // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
+        // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
+        ConstantInt *C3 = 0, *C4 = 0;
+        if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) &&
+            (C3->getValue() & ~C1->getValue()) == 0 &&
+            match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) &&
+            (C4->getValue() & ~C2->getValue()) == 0) {
+          V2 = Builder->CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
+          return BinaryOperator::CreateAnd(V2,
+                               ConstantInt::get(B->getContext(),
+                                                C1->getValue()|C2->getValue()));
+        }
+      }
+    }
+
+    // (A & (C0?-1:0)) | (B & ~(C0?-1:0)) ->  C0 ? A : B, and commuted variants.
+    // Don't do this for vector select idioms, the code generator doesn't handle
+    // them well yet.
+    if (!I.getType()->isVectorTy()) {
+      if (Instruction *Match = MatchSelectFromAndOr(A, B, C, D))
+        return Match;
+      if (Instruction *Match = MatchSelectFromAndOr(B, A, D, C))
+        return Match;
+      if (Instruction *Match = MatchSelectFromAndOr(C, B, A, D))
+        return Match;
+      if (Instruction *Match = MatchSelectFromAndOr(D, A, B, C))
+        return Match;
+    }
+
+    // ((A&~B)|(~A&B)) -> A^B
+    if ((match(C, m_Not(m_Specific(D))) &&
+         match(B, m_Not(m_Specific(A)))))
+      return BinaryOperator::CreateXor(A, D);
+    // ((~B&A)|(~A&B)) -> A^B
+    if ((match(A, m_Not(m_Specific(D))) &&
+         match(B, m_Not(m_Specific(C)))))
+      return BinaryOperator::CreateXor(C, D);
+    // ((A&~B)|(B&~A)) -> A^B
+    if ((match(C, m_Not(m_Specific(B))) &&
+         match(D, m_Not(m_Specific(A)))))
+      return BinaryOperator::CreateXor(A, B);
+    // ((~B&A)|(B&~A)) -> A^B
+    if ((match(A, m_Not(m_Specific(B))) &&
+         match(D, m_Not(m_Specific(C)))))
+      return BinaryOperator::CreateXor(C, B);
+
+    // ((A|B)&1)|(B&-2) -> (A&1) | B
+    if (match(A, m_Or(m_Value(V1), m_Specific(B))) ||
+        match(A, m_Or(m_Specific(B), m_Value(V1)))) {
+      Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C);
+      if (Ret) return Ret;
+    }
+    // (B&-2)|((A|B)&1) -> (A&1) | B
+    if (match(B, m_Or(m_Specific(A), m_Value(V1))) ||
+        match(B, m_Or(m_Value(V1), m_Specific(A)))) {
+      Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D);
+      if (Ret) return Ret;
+    }
+  }
+
+  // (X >> Z) | (Y >> Z)  -> (X|Y) >> Z  for all shifts.
+  if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
+    if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
+      if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() &&
+          SI0->getOperand(1) == SI1->getOperand(1) &&
+          (SI0->hasOneUse() || SI1->hasOneUse())) {
+        Value *NewOp = Builder->CreateOr(SI0->getOperand(0), SI1->getOperand(0),
+                                         SI0->getName());
+        return BinaryOperator::Create(SI1->getOpcode(), NewOp,
+                                      SI1->getOperand(1));
+      }
+  }
+
+  // (~A | ~B) == (~(A & B)) - De Morgan's Law
+  if (Value *Op0NotVal = dyn_castNotVal(Op0))
+    if (Value *Op1NotVal = dyn_castNotVal(Op1))
+      if (Op0->hasOneUse() && Op1->hasOneUse()) {
+        Value *And = Builder->CreateAnd(Op0NotVal, Op1NotVal,
+                                        I.getName()+".demorgan");
+        return BinaryOperator::CreateNot(And);
+      }
+
+  // Canonicalize xor to the RHS.
+  bool SwappedForXor = false;
+  if (match(Op0, m_Xor(m_Value(), m_Value()))) {
+    std::swap(Op0, Op1);
+    SwappedForXor = true;
+  }
+
+  // A | ( A ^ B) -> A |  B
+  // A | (~A ^ B) -> A | ~B
+  // (A & B) | (A ^ B)
+  if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
+    if (Op0 == A || Op0 == B)
+      return BinaryOperator::CreateOr(A, B);
+
+    if (match(Op0, m_And(m_Specific(A), m_Specific(B))) ||
+        match(Op0, m_And(m_Specific(B), m_Specific(A))))
+      return BinaryOperator::CreateOr(A, B);
+
+    if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) {
+      Value *Not = Builder->CreateNot(B, B->getName()+".not");
+      return BinaryOperator::CreateOr(Not, Op0);
+    }
+    if (Op1->hasOneUse() && match(B, m_Not(m_Specific(Op0)))) {
+      Value *Not = Builder->CreateNot(A, A->getName()+".not");
+      return BinaryOperator::CreateOr(Not, Op0);
+    }
+  }
+
+  // A | ~(A | B) -> A | ~B
+  // A | ~(A ^ B) -> A | ~B
+  if (match(Op1, m_Not(m_Value(A))))
+    if (BinaryOperator *B = dyn_cast<BinaryOperator>(A))
+      if ((Op0 == B->getOperand(0) || Op0 == B->getOperand(1)) &&
+          Op1->hasOneUse() && (B->getOpcode() == Instruction::Or ||
+                               B->getOpcode() == Instruction::Xor)) {
+        Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) :
+                                                 B->getOperand(0);
+        Value *Not = Builder->CreateNot(NotOp, NotOp->getName()+".not");
+        return BinaryOperator::CreateOr(Not, Op0);
+      }
+
+  if (SwappedForXor)
+    std::swap(Op0, Op1);
+
+  if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+    if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+      if (Value *Res = FoldOrOfICmps(LHS, RHS))
+        return ReplaceInstUsesWith(I, Res);
+
+  // (fcmp uno x, c) | (fcmp uno y, c)  -> (fcmp uno x, y)
+  if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
+    if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
+      if (Value *Res = FoldOrOfFCmps(LHS, RHS))
+        return ReplaceInstUsesWith(I, Res);
+
+  // fold (or (cast A), (cast B)) -> (cast (or A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+    CastInst *Op1C = dyn_cast<CastInst>(Op1);
+    if (Op1C && Op0C->getOpcode() == Op1C->getOpcode()) {// same cast kind ?
+      Type *SrcTy = Op0C->getOperand(0)->getType();
+      if (SrcTy == Op1C->getOperand(0)->getType() &&
+          SrcTy->isIntOrIntVectorTy()) {
+        Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0);
+
+        if ((!isa<ICmpInst>(Op0COp) || !isa<ICmpInst>(Op1COp)) &&
+            // Only do this if the casts both really cause code to be
+            // generated.
+            ShouldOptimizeCast(Op0C->getOpcode(), Op0COp, I.getType()) &&
+            ShouldOptimizeCast(Op1C->getOpcode(), Op1COp, I.getType())) {
+          Value *NewOp = Builder->CreateOr(Op0COp, Op1COp, I.getName());
+          return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
+        }
+
+        // If this is or(cast(icmp), cast(icmp)), try to fold this even if the
+        // cast is otherwise not optimizable.  This happens for vector sexts.
+        if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1COp))
+          if (ICmpInst *LHS = dyn_cast<ICmpInst>(Op0COp))
+            if (Value *Res = FoldOrOfICmps(LHS, RHS))
+              return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
+
+        // If this is or(cast(fcmp), cast(fcmp)), try to fold this even if the
+        // cast is otherwise not optimizable.  This happens for vector sexts.
+        if (FCmpInst *RHS = dyn_cast<FCmpInst>(Op1COp))
+          if (FCmpInst *LHS = dyn_cast<FCmpInst>(Op0COp))
+            if (Value *Res = FoldOrOfFCmps(LHS, RHS))
+              return CastInst::Create(Op0C->getOpcode(), Res, I.getType());
+      }
+    }
+  }
+
+  // or(sext(A), B) -> A ? -1 : B where A is an i1
+  // or(A, sext(B)) -> B ? -1 : A where B is an i1
+  if (match(Op0, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1))
+    return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
+  if (match(Op1, m_SExt(m_Value(A))) && A->getType()->isIntegerTy(1))
+    return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
+
+  // Note: If we've gotten to the point of visiting the outer OR, then the
+  // inner one couldn't be simplified.  If it was a constant, then it won't
+  // be simplified by a later pass either, so we try swapping the inner/outer
+  // ORs in the hopes that we'll be able to simplify it this way.
+  // (X|C) | V --> (X|V) | C
+  if (Op0->hasOneUse() && !isa<ConstantInt>(Op1) &&
+      match(Op0, m_Or(m_Value(A), m_ConstantInt(C1)))) {
+    Value *Inner = Builder->CreateOr(A, Op1);
+    Inner->takeName(Op0);
+    return BinaryOperator::CreateOr(Inner, C1);
+  }
+
+  // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D))
+  // Since this OR statement hasn't been optimized further yet, we hope
+  // that this transformation will allow the new ORs to be optimized.
+  {
+    Value *X = 0, *Y = 0;
+    if (Op0->hasOneUse() && Op1->hasOneUse() &&
+        match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
+        match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
+      Value *orTrue = Builder->CreateOr(A, C);
+      Value *orFalse = Builder->CreateOr(B, D);
+      return SelectInst::Create(X, orTrue, orFalse);
+    }
+  }
+
+  return Changed ? &I : 0;
+}
+
+Instruction *InstCombiner::visitXor(BinaryOperator &I) {
+  bool Changed = SimplifyAssociativeOrCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyXorInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // (A&B)^(A&C) -> A&(B^C) etc
+  if (Value *V = SimplifyUsingDistributiveLaws(I))
+    return ReplaceInstUsesWith(I, V);
+
+  // See if we can simplify any instructions used by the instruction whose sole
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(I))
+    return &I;
+
+  // Is this a ~ operation?
+  if (Value *NotOp = dyn_castNotVal(&I)) {
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) {
+      if (Op0I->getOpcode() == Instruction::And ||
+          Op0I->getOpcode() == Instruction::Or) {
+        // ~(~X & Y) --> (X | ~Y) - De Morgan's Law
+        // ~(~X | Y) === (X & ~Y) - De Morgan's Law
+        if (dyn_castNotVal(Op0I->getOperand(1)))
+          Op0I->swapOperands();
+        if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) {
+          Value *NotY =
+            Builder->CreateNot(Op0I->getOperand(1),
+                               Op0I->getOperand(1)->getName()+".not");
+          if (Op0I->getOpcode() == Instruction::And)
+            return BinaryOperator::CreateOr(Op0NotVal, NotY);
+          return BinaryOperator::CreateAnd(Op0NotVal, NotY);
+        }
+
+        // ~(X & Y) --> (~X | ~Y) - De Morgan's Law
+        // ~(X | Y) === (~X & ~Y) - De Morgan's Law
+        if (isFreeToInvert(Op0I->getOperand(0)) &&
+            isFreeToInvert(Op0I->getOperand(1))) {
+          Value *NotX =
+            Builder->CreateNot(Op0I->getOperand(0), "notlhs");
+          Value *NotY =
+            Builder->CreateNot(Op0I->getOperand(1), "notrhs");
+          if (Op0I->getOpcode() == Instruction::And)
+            return BinaryOperator::CreateOr(NotX, NotY);
+          return BinaryOperator::CreateAnd(NotX, NotY);
+        }
+
+      } else if (Op0I->getOpcode() == Instruction::AShr) {
+        // ~(~X >>s Y) --> (X >>s Y)
+        if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0)))
+          return BinaryOperator::CreateAShr(Op0NotVal, Op0I->getOperand(1));
+      }
+    }
+  }
+
+
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    if (RHS->isOne() && Op0->hasOneUse())
+      // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
+      if (CmpInst *CI = dyn_cast<CmpInst>(Op0))
+        return CmpInst::Create(CI->getOpcode(),
+                               CI->getInversePredicate(),
+                               CI->getOperand(0), CI->getOperand(1));
+
+    // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp).
+    if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+      if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) {
+        if (CI->hasOneUse() && Op0C->hasOneUse()) {
+          Instruction::CastOps Opcode = Op0C->getOpcode();
+          if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+              (RHS == ConstantExpr::getCast(Opcode,
+                                           ConstantInt::getTrue(I.getContext()),
+                                            Op0C->getDestTy()))) {
+            CI->setPredicate(CI->getInversePredicate());
+            return CastInst::Create(Opcode, CI, Op0C->getType());
+          }
+        }
+      }
+    }
+
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+      // ~(c-X) == X-c-1 == X+(-c-1)
+      if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue())
+        if (Constant *Op0I0C = dyn_cast<Constant>(Op0I->getOperand(0))) {
+          Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C);
+          Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C,
+                                      ConstantInt::get(I.getType(), 1));
+          return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS);
+        }
+
+      if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
+        if (Op0I->getOpcode() == Instruction::Add) {
+          // ~(X-c) --> (-c-1)-X
+          if (RHS->isAllOnesValue()) {
+            Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
+            return BinaryOperator::CreateSub(
+                           ConstantExpr::getSub(NegOp0CI,
+                                      ConstantInt::get(I.getType(), 1)),
+                                      Op0I->getOperand(0));
+          } else if (RHS->getValue().isSignBit()) {
+            // (X + C) ^ signbit -> (X + C + signbit)
+            Constant *C = ConstantInt::get(I.getContext(),
+                                           RHS->getValue() + Op0CI->getValue());
+            return BinaryOperator::CreateAdd(Op0I->getOperand(0), C);
+
+          }
+        } else if (Op0I->getOpcode() == Instruction::Or) {
+          // (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
+          if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue())) {
+            Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
+            // Anything in both C1 and C2 is known to be zero, remove it from
+            // NewRHS.
+            Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS);
+            NewRHS = ConstantExpr::getAnd(NewRHS,
+                                       ConstantExpr::getNot(CommonBits));
+            Worklist.Add(Op0I);
+            I.setOperand(0, Op0I->getOperand(0));
+            I.setOperand(1, NewRHS);
+            return &I;
+          }
+        } else if (Op0I->getOpcode() == Instruction::LShr) {
+          // ((X^C1) >> C2) ^ C3 -> (X>>C2) ^ ((C1>>C2)^C3)
+          // E1 = "X ^ C1"
+          BinaryOperator *E1;
+          ConstantInt *C1;
+          if (Op0I->hasOneUse() &&
+              (E1 = dyn_cast<BinaryOperator>(Op0I->getOperand(0))) &&
+              E1->getOpcode() == Instruction::Xor &&
+              (C1 = dyn_cast<ConstantInt>(E1->getOperand(1)))) {
+            // fold (C1 >> C2) ^ C3
+            ConstantInt *C2 = Op0CI, *C3 = RHS;
+            APInt FoldConst = C1->getValue().lshr(C2->getValue());
+            FoldConst ^= C3->getValue();
+            // Prepare the two operands.
+            Value *Opnd0 = Builder->CreateLShr(E1->getOperand(0), C2);
+            Opnd0->takeName(Op0I);
+            cast<Instruction>(Opnd0)->setDebugLoc(I.getDebugLoc());
+            Value *FoldVal = ConstantInt::get(Opnd0->getType(), FoldConst);
+
+            return BinaryOperator::CreateXor(Opnd0, FoldVal);
+          }
+        }
+      }
+    }
+
+    // Try to fold constant and into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1);
+  if (Op1I) {
+    Value *A, *B;
+    if (match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
+      if (A == Op0) {              // B^(B|A) == (A|B)^B
+        Op1I->swapOperands();
+        I.swapOperands();
+        std::swap(Op0, Op1);
+      } else if (B == Op0) {       // B^(A|B) == (A|B)^B
+        I.swapOperands();     // Simplified below.
+        std::swap(Op0, Op1);
+      }
+    } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) &&
+               Op1I->hasOneUse()){
+      if (A == Op0) {                                      // A^(A&B) -> A^(B&A)
+        Op1I->swapOperands();
+        std::swap(A, B);
+      }
+      if (B == Op0) {                                      // A^(B&A) -> (B&A)^A
+        I.swapOperands();     // Simplified below.
+        std::swap(Op0, Op1);
+      }
+    }
+  }
+
+  BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0);
+  if (Op0I) {
+    Value *A, *B;
+    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
+        Op0I->hasOneUse()) {
+      if (A == Op1)                                  // (B|A)^B == (A|B)^B
+        std::swap(A, B);
+      if (B == Op1)                                  // (A|B)^B == A & ~B
+        return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1));
+    } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+               Op0I->hasOneUse()){
+      if (A == Op1)                                        // (A&B)^A -> (B&A)^A
+        std::swap(A, B);
+      if (B == Op1 &&                                      // (B&A)^A == ~B & A
+          !isa<ConstantInt>(Op1)) {  // Canonical form is (B&C)^C
+        return BinaryOperator::CreateAnd(Builder->CreateNot(A), Op1);
+      }
+    }
+  }
+
+  // (X >> Z) ^ (Y >> Z)  -> (X^Y) >> Z  for all shifts.
+  if (Op0I && Op1I && Op0I->isShift() &&
+      Op0I->getOpcode() == Op1I->getOpcode() &&
+      Op0I->getOperand(1) == Op1I->getOperand(1) &&
+      (Op0I->hasOneUse() || Op1I->hasOneUse())) {
+    Value *NewOp =
+      Builder->CreateXor(Op0I->getOperand(0), Op1I->getOperand(0),
+                         Op0I->getName());
+    return BinaryOperator::Create(Op1I->getOpcode(), NewOp,
+                                  Op1I->getOperand(1));
+  }
+
+  if (Op0I && Op1I) {
+    Value *A, *B, *C, *D;
+    // (A & B)^(A | B) -> A ^ B
+    if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_Or(m_Value(C), m_Value(D)))) {
+      if ((A == C && B == D) || (A == D && B == C))
+        return BinaryOperator::CreateXor(A, B);
+    }
+    // (A | B)^(A & B) -> A ^ B
+    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_And(m_Value(C), m_Value(D)))) {
+      if ((A == C && B == D) || (A == D && B == C))
+        return BinaryOperator::CreateXor(A, B);
+    }
+  }
+
+  // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
+  if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+    if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+      if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
+        if (LHS->getOperand(0) == RHS->getOperand(1) &&
+            LHS->getOperand(1) == RHS->getOperand(0))
+          LHS->swapOperands();
+        if (LHS->getOperand(0) == RHS->getOperand(0) &&
+            LHS->getOperand(1) == RHS->getOperand(1)) {
+          Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+          unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
+          bool isSigned = LHS->isSigned() || RHS->isSigned();
+          return ReplaceInstUsesWith(I,
+                               getNewICmpValue(isSigned, Code, Op0, Op1,
+                                               Builder));
+        }
+      }
+
+  // fold (xor (cast A), (cast B)) -> (cast (xor A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind?
+        Type *SrcTy = Op0C->getOperand(0)->getType();
+        if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntegerTy() &&
+            // Only do this if the casts both really cause code to be generated.
+            ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0),
+                               I.getType()) &&
+            ShouldOptimizeCast(Op1C->getOpcode(), Op1C->getOperand(0),
+                               I.getType())) {
+          Value *NewOp = Builder->CreateXor(Op0C->getOperand(0),
+                                            Op1C->getOperand(0), I.getName());
+          return CastInst::Create(Op0C->getOpcode(), NewOp, I.getType());
+        }
+      }
+  }
+
+  return Changed ? &I : 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
new file mode 100644
index 000000000000..64cd1bd27891
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -0,0 +1,1405 @@
+//===- InstCombineCalls.cpp -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitCall and visitInvoke functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+STATISTIC(NumSimplified, "Number of library calls simplified");
+
+/// getPromotedType - Return the specified type promoted as it would be to pass
+/// though a va_arg area.
+static Type *getPromotedType(Type *Ty) {
+  if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
+    if (ITy->getBitWidth() < 32)
+      return Type::getInt32Ty(Ty->getContext());
+  }
+  return Ty;
+}
+
+/// reduceToSingleValueType - Given an aggregate type which ultimately holds a
+/// single scalar element, like {{{type}}} or [1 x type], return type.
+static Type *reduceToSingleValueType(Type *T) {
+  while (!T->isSingleValueType()) {
+    if (StructType *STy = dyn_cast<StructType>(T)) {
+      if (STy->getNumElements() == 1)
+        T = STy->getElementType(0);
+      else
+        break;
+    } else if (ArrayType *ATy = dyn_cast<ArrayType>(T)) {
+      if (ATy->getNumElements() == 1)
+        T = ATy->getElementType();
+      else
+        break;
+    } else
+      break;
+  }
+
+  return T;
+}
+
+Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
+  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), TD);
+  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), TD);
+  unsigned MinAlign = std::min(DstAlign, SrcAlign);
+  unsigned CopyAlign = MI->getAlignment();
+
+  if (CopyAlign < MinAlign) {
+    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
+                                             MinAlign, false));
+    return MI;
+  }
+
+  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
+  // load/store.
+  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
+  if (MemOpLength == 0) return 0;
+
+  // Source and destination pointer types are always "i8*" for intrinsic.  See
+  // if the size is something we can handle with a single primitive load/store.
+  // A single load+store correctly handles overlapping memory in the memmove
+  // case.
+  uint64_t Size = MemOpLength->getLimitedValue();
+  assert(Size && "0-sized memory transfering should be removed already.");
+
+  if (Size > 8 || (Size&(Size-1)))
+    return 0;  // If not 1/2/4/8 bytes, exit.
+
+  // Use an integer load+store unless we can find something better.
+  unsigned SrcAddrSp =
+    cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
+  unsigned DstAddrSp =
+    cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();
+
+  IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
+  Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
+  Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
+
+  // Memcpy forces the use of i8* for the source and destination.  That means
+  // that if you're using memcpy to move one double around, you'll get a cast
+  // from double* to i8*.  We'd much rather use a double load+store rather than
+  // an i64 load+store, here because this improves the odds that the source or
+  // dest address will be promotable.  See if we can find a better type than the
+  // integer datatype.
+  Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts();
+  MDNode *CopyMD = 0;
+  if (StrippedDest != MI->getArgOperand(0)) {
+    Type *SrcETy = cast<PointerType>(StrippedDest->getType())
+                                    ->getElementType();
+    if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
+      // The SrcETy might be something like {{{double}}} or [1 x double].  Rip
+      // down through these levels if so.
+      SrcETy = reduceToSingleValueType(SrcETy);
+
+      if (SrcETy->isSingleValueType()) {
+        NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp);
+        NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp);
+
+        // If the memcpy has metadata describing the members, see if we can
+        // get the TBAA tag describing our copy.
+        if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+          if (M->getNumOperands() == 3 &&
+              M->getOperand(0) &&
+              isa<ConstantInt>(M->getOperand(0)) &&
+              cast<ConstantInt>(M->getOperand(0))->isNullValue() &&
+              M->getOperand(1) &&
+              isa<ConstantInt>(M->getOperand(1)) &&
+              cast<ConstantInt>(M->getOperand(1))->getValue() == Size &&
+              M->getOperand(2) &&
+              isa<MDNode>(M->getOperand(2)))
+            CopyMD = cast<MDNode>(M->getOperand(2));
+        }
+      }
+    }
+  }
+
+  // If the memcpy/memmove provides better alignment info than we can
+  // infer, use it.
+  SrcAlign = std::max(SrcAlign, CopyAlign);
+  DstAlign = std::max(DstAlign, CopyAlign);
+
+  Value *Src = Builder->CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
+  Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
+  LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile());
+  L->setAlignment(SrcAlign);
+  if (CopyMD)
+    L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+  StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile());
+  S->setAlignment(DstAlign);
+  if (CopyMD)
+    S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+
+  // Set the size of the copy to 0, it will be deleted on the next iteration.
+  MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
+  return MI;
+}
+
+Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
+  unsigned Alignment = getKnownAlignment(MI->getDest(), TD);
+  if (MI->getAlignment() < Alignment) {
+    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
+                                             Alignment, false));
+    return MI;
+  }
+
+  // Extract the length and alignment and fill if they are constant.
+  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
+  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
+  if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
+    return 0;
+  uint64_t Len = LenC->getLimitedValue();
+  Alignment = MI->getAlignment();
+  assert(Len && "0-sized memory setting should be removed already.");
+
+  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
+  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
+    Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.
+
+    Value *Dest = MI->getDest();
+    unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
+    Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
+    Dest = Builder->CreateBitCast(Dest, NewDstPtrTy);
+
+    // Alignment 0 is identity for alignment 1 for memset, but not store.
+    if (Alignment == 0) Alignment = 1;
+
+    // Extract the fill value and store.
+    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
+    StoreInst *S = Builder->CreateStore(ConstantInt::get(ITy, Fill), Dest,
+                                        MI->isVolatile());
+    S->setAlignment(Alignment);
+
+    // Set the size of the copy to 0, it will be deleted on the next iteration.
+    MI->setLength(Constant::getNullValue(LenC->getType()));
+    return MI;
+  }
+
+  return 0;
+}
+
+/// visitCallInst - CallInst simplification.  This mostly only handles folding
+/// of intrinsic instructions.  For normal calls, it allows visitCallSite to do
+/// the heavy lifting.
+///
+Instruction *InstCombiner::visitCallInst(CallInst &CI) {
+  if (isFreeCall(&CI, TLI))
+    return visitFree(CI);
+
+  // If the caller function is nounwind, mark the call as nounwind, even if the
+  // callee isn't.
+  if (CI.getParent()->getParent()->doesNotThrow() &&
+      !CI.doesNotThrow()) {
+    CI.setDoesNotThrow();
+    return &CI;
+  }
+
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
+  if (!II) return visitCallSite(&CI);
+
+  // Intrinsics cannot occur in an invoke, so handle them here instead of in
+  // visitCallSite.
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+    bool Changed = false;
+
+    // memmove/cpy/set of zero bytes is a noop.
+    if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
+      if (NumBytes->isNullValue())
+        return EraseInstFromFunction(CI);
+
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
+        if (CI->getZExtValue() == 1) {
+          // Replace the instruction with just byte operations.  We would
+          // transform other cases to loads/stores, but we don't know if
+          // alignment is sufficient.
+        }
+    }
+
+    // No other transformations apply to volatile transfers.
+    if (MI->isVolatile())
+      return 0;
+
+    // If we have a memmove and the source operation is a constant global,
+    // then the source and dest pointers can't alias, so we can change this
+    // into a call to memcpy.
+    if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+      if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
+        if (GVSrc->isConstant()) {
+          Module *M = CI.getParent()->getParent()->getParent();
+          Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+          Type *Tys[3] = { CI.getArgOperand(0)->getType(),
+                           CI.getArgOperand(1)->getType(),
+                           CI.getArgOperand(2)->getType() };
+          CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
+          Changed = true;
+        }
+    }
+
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+      // memmove(x,x,size) -> noop.
+      if (MTI->getSource() == MTI->getDest())
+        return EraseInstFromFunction(CI);
+    }
+
+    // If we can determine a pointer alignment that is bigger than currently
+    // set, update the alignment.
+    if (isa<MemTransferInst>(MI)) {
+      if (Instruction *I = SimplifyMemTransfer(MI))
+        return I;
+    } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
+      if (Instruction *I = SimplifyMemSet(MSI))
+        return I;
+    }
+
+    if (Changed) return II;
+  }
+
+  switch (II->getIntrinsicID()) {
+  default: break;
+  case Intrinsic::objectsize: {
+    uint64_t Size;
+    if (getObjectSize(II->getArgOperand(0), Size, TD, TLI))
+      return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size));
+    return 0;
+  }
+  case Intrinsic::bswap: {
+    Value *IIOperand = II->getArgOperand(0);
+    Value *X = 0;
+
+    // bswap(bswap(x)) -> x
+    if (match(IIOperand, m_BSwap(m_Value(X))))
+        return ReplaceInstUsesWith(CI, X);
+
+    // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
+    if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
+      unsigned C = X->getType()->getPrimitiveSizeInBits() -
+        IIOperand->getType()->getPrimitiveSizeInBits();
+      Value *CV = ConstantInt::get(X->getType(), C);
+      Value *V = Builder->CreateLShr(X, CV);
+      return new TruncInst(V, IIOperand->getType());
+    }
+    break;
+  }
+
+  case Intrinsic::powi:
+    if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      // powi(x, 0) -> 1.0
+      if (Power->isZero())
+        return ReplaceInstUsesWith(CI, ConstantFP::get(CI.getType(), 1.0));
+      // powi(x, 1) -> x
+      if (Power->isOne())
+        return ReplaceInstUsesWith(CI, II->getArgOperand(0));
+      // powi(x, -1) -> 1/x
+      if (Power->isAllOnesValue())
+        return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0),
+                                          II->getArgOperand(0));
+    }
+    break;
+  case Intrinsic::cttz: {
+    // If all bits below the first known one are known zero,
+    // this value is constant.
+    IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType());
+    // FIXME: Try to simplify vectors of integers.
+    if (!IT) break;
+    uint32_t BitWidth = IT->getBitWidth();
+    APInt KnownZero(BitWidth, 0);
+    APInt KnownOne(BitWidth, 0);
+    ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne);
+    unsigned TrailingZeros = KnownOne.countTrailingZeros();
+    APInt Mask(APInt::getLowBitsSet(BitWidth, TrailingZeros));
+    if ((Mask & KnownZero) == Mask)
+      return ReplaceInstUsesWith(CI, ConstantInt::get(IT,
+                                 APInt(BitWidth, TrailingZeros)));
+
+    }
+    break;
+  case Intrinsic::ctlz: {
+    // If all bits above the first known one are known zero,
+    // this value is constant.
+    IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType());
+    // FIXME: Try to simplify vectors of integers.
+    if (!IT) break;
+    uint32_t BitWidth = IT->getBitWidth();
+    APInt KnownZero(BitWidth, 0);
+    APInt KnownOne(BitWidth, 0);
+    ComputeMaskedBits(II->getArgOperand(0), KnownZero, KnownOne);
+    unsigned LeadingZeros = KnownOne.countLeadingZeros();
+    APInt Mask(APInt::getHighBitsSet(BitWidth, LeadingZeros));
+    if ((Mask & KnownZero) == Mask)
+      return ReplaceInstUsesWith(CI, ConstantInt::get(IT,
+                                 APInt(BitWidth, LeadingZeros)));
+
+    }
+    break;
+  case Intrinsic::uadd_with_overflow: {
+    Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+    IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType());
+    uint32_t BitWidth = IT->getBitWidth();
+    APInt LHSKnownZero(BitWidth, 0);
+    APInt LHSKnownOne(BitWidth, 0);
+    ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne);
+    bool LHSKnownNegative = LHSKnownOne[BitWidth - 1];
+    bool LHSKnownPositive = LHSKnownZero[BitWidth - 1];
+
+    if (LHSKnownNegative || LHSKnownPositive) {
+      APInt RHSKnownZero(BitWidth, 0);
+      APInt RHSKnownOne(BitWidth, 0);
+      ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne);
+      bool RHSKnownNegative = RHSKnownOne[BitWidth - 1];
+      bool RHSKnownPositive = RHSKnownZero[BitWidth - 1];
+      if (LHSKnownNegative && RHSKnownNegative) {
+        // The sign bit is set in both cases: this MUST overflow.
+        // Create a simple add instruction, and insert it into the struct.
+        Value *Add = Builder->CreateAdd(LHS, RHS);
+        Add->takeName(&CI);
+        Constant *V[] = {
+          UndefValue::get(LHS->getType()),
+          ConstantInt::getTrue(II->getContext())
+        };
+        StructType *ST = cast<StructType>(II->getType());
+        Constant *Struct = ConstantStruct::get(ST, V);
+        return InsertValueInst::Create(Struct, Add, 0);
+      }
+
+      if (LHSKnownPositive && RHSKnownPositive) {
+        // The sign bit is clear in both cases: this CANNOT overflow.
+        // Create a simple add instruction, and insert it into the struct.
+        Value *Add = Builder->CreateNUWAdd(LHS, RHS);
+        Add->takeName(&CI);
+        Constant *V[] = {
+          UndefValue::get(LHS->getType()),
+          ConstantInt::getFalse(II->getContext())
+        };
+        StructType *ST = cast<StructType>(II->getType());
+        Constant *Struct = ConstantStruct::get(ST, V);
+        return InsertValueInst::Create(Struct, Add, 0);
+      }
+    }
+  }
+  // FALL THROUGH uadd into sadd
+  case Intrinsic::sadd_with_overflow:
+    // Canonicalize constants into the RHS.
+    if (isa<Constant>(II->getArgOperand(0)) &&
+        !isa<Constant>(II->getArgOperand(1))) {
+      Value *LHS = II->getArgOperand(0);
+      II->setArgOperand(0, II->getArgOperand(1));
+      II->setArgOperand(1, LHS);
+      return II;
+    }
+
+    // X + undef -> undef
+    if (isa<UndefValue>(II->getArgOperand(1)))
+      return ReplaceInstUsesWith(CI, UndefValue::get(II->getType()));
+
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      // X + 0 -> {X, false}
+      if (RHS->isZero()) {
+        Constant *V[] = {
+          UndefValue::get(II->getArgOperand(0)->getType()),
+          ConstantInt::getFalse(II->getContext())
+        };
+        Constant *Struct =
+          ConstantStruct::get(cast<StructType>(II->getType()), V);
+        return InsertValueInst::Create(Struct, II->getArgOperand(0), 0);
+      }
+    }
+    break;
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+    // undef - X -> undef
+    // X - undef -> undef
+    if (isa<UndefValue>(II->getArgOperand(0)) ||
+        isa<UndefValue>(II->getArgOperand(1)))
+      return ReplaceInstUsesWith(CI, UndefValue::get(II->getType()));
+
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      // X - 0 -> {X, false}
+      if (RHS->isZero()) {
+        Constant *V[] = {
+          UndefValue::get(II->getArgOperand(0)->getType()),
+          ConstantInt::getFalse(II->getContext())
+        };
+        Constant *Struct =
+          ConstantStruct::get(cast<StructType>(II->getType()), V);
+        return InsertValueInst::Create(Struct, II->getArgOperand(0), 0);
+      }
+    }
+    break;
+  case Intrinsic::umul_with_overflow: {
+    Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+    unsigned BitWidth = cast<IntegerType>(LHS->getType())->getBitWidth();
+
+    APInt LHSKnownZero(BitWidth, 0);
+    APInt LHSKnownOne(BitWidth, 0);
+    ComputeMaskedBits(LHS, LHSKnownZero, LHSKnownOne);
+    APInt RHSKnownZero(BitWidth, 0);
+    APInt RHSKnownOne(BitWidth, 0);
+    ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne);
+
+    // Get the largest possible values for each operand.
+    APInt LHSMax = ~LHSKnownZero;
+    APInt RHSMax = ~RHSKnownZero;
+
+    // If multiplying the maximum values does not overflow then we can turn
+    // this into a plain NUW mul.
+    bool Overflow;
+    LHSMax.umul_ov(RHSMax, Overflow);
+    if (!Overflow) {
+      Value *Mul = Builder->CreateNUWMul(LHS, RHS, "umul_with_overflow");
+      Constant *V[] = {
+        UndefValue::get(LHS->getType()),
+        Builder->getFalse()
+      };
+      Constant *Struct = ConstantStruct::get(cast<StructType>(II->getType()),V);
+      return InsertValueInst::Create(Struct, Mul, 0);
+    }
+  } // FALL THROUGH
+  case Intrinsic::smul_with_overflow:
+    // Canonicalize constants into the RHS.
+    if (isa<Constant>(II->getArgOperand(0)) &&
+        !isa<Constant>(II->getArgOperand(1))) {
+      Value *LHS = II->getArgOperand(0);
+      II->setArgOperand(0, II->getArgOperand(1));
+      II->setArgOperand(1, LHS);
+      return II;
+    }
+
+    // X * undef -> undef
+    if (isa<UndefValue>(II->getArgOperand(1)))
+      return ReplaceInstUsesWith(CI, UndefValue::get(II->getType()));
+
+    if (ConstantInt *RHSI = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      // X*0 -> {0, false}
+      if (RHSI->isZero())
+        return ReplaceInstUsesWith(CI, Constant::getNullValue(II->getType()));
+
+      // X * 1 -> {X, false}
+      if (RHSI->equalsInt(1)) {
+        Constant *V[] = {
+          UndefValue::get(II->getArgOperand(0)->getType()),
+          ConstantInt::getFalse(II->getContext())
+        };
+        Constant *Struct =
+          ConstantStruct::get(cast<StructType>(II->getType()), V);
+        return InsertValueInst::Create(Struct, II->getArgOperand(0), 0);
+      }
+    }
+    break;
+  case Intrinsic::ppc_altivec_lvx:
+  case Intrinsic::ppc_altivec_lvxl:
+    // Turn PPC lvx -> load if the pointer is known aligned.
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) {
+      Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+                                         PointerType::getUnqual(II->getType()));
+      return new LoadInst(Ptr);
+    }
+    break;
+  case Intrinsic::ppc_altivec_stvx:
+  case Intrinsic::ppc_altivec_stvxl:
+    // Turn stvx -> store if the pointer is known aligned.
+    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, TD) >= 16) {
+      Type *OpPtrTy =
+        PointerType::getUnqual(II->getArgOperand(0)->getType());
+      Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+      return new StoreInst(II->getArgOperand(0), Ptr);
+    }
+    break;
+  case Intrinsic::x86_sse_storeu_ps:
+  case Intrinsic::x86_sse2_storeu_pd:
+  case Intrinsic::x86_sse2_storeu_dq:
+    // Turn X86 storeu -> store if the pointer is known aligned.
+    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) {
+      Type *OpPtrTy =
+        PointerType::getUnqual(II->getArgOperand(1)->getType());
+      Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
+      return new StoreInst(II->getArgOperand(1), Ptr);
+    }
+    break;
+
+  case Intrinsic::x86_sse_cvtss2si:
+  case Intrinsic::x86_sse_cvtss2si64:
+  case Intrinsic::x86_sse_cvttss2si:
+  case Intrinsic::x86_sse_cvttss2si64:
+  case Intrinsic::x86_sse2_cvtsd2si:
+  case Intrinsic::x86_sse2_cvtsd2si64:
+  case Intrinsic::x86_sse2_cvttsd2si:
+  case Intrinsic::x86_sse2_cvttsd2si64: {
+    // These intrinsics only demand the 0th element of their input vectors. If
+    // we can simplify the input based on that, do so now.
+    unsigned VWidth =
+      cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements();
+    APInt DemandedElts(VWidth, 1);
+    APInt UndefElts(VWidth, 0);
+    if (Value *V = SimplifyDemandedVectorElts(II->getArgOperand(0),
+                                              DemandedElts, UndefElts)) {
+      II->setArgOperand(0, V);
+      return II;
+    }
+    break;
+  }
+
+
+  case Intrinsic::x86_sse41_pmovsxbw:
+  case Intrinsic::x86_sse41_pmovsxwd:
+  case Intrinsic::x86_sse41_pmovsxdq:
+  case Intrinsic::x86_sse41_pmovzxbw:
+  case Intrinsic::x86_sse41_pmovzxwd:
+  case Intrinsic::x86_sse41_pmovzxdq: {
+    // pmov{s|z}x ignores the upper half of their input vectors.
+    unsigned VWidth =
+      cast<VectorType>(II->getArgOperand(0)->getType())->getNumElements();
+    unsigned LowHalfElts = VWidth / 2;
+    APInt InputDemandedElts(APInt::getBitsSet(VWidth, 0, LowHalfElts));
+    APInt UndefElts(VWidth, 0);
+    if (Value *TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0),
+                                                 InputDemandedElts,
+                                                 UndefElts)) {
+      II->setArgOperand(0, TmpV);
+      return II;
+    }
+    break;
+  }
+
+  case Intrinsic::ppc_altivec_vperm:
+    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+    if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
+      assert(Mask->getType()->getVectorNumElements() == 16 &&
+             "Bad type for intrinsic!");
+
+      // Check that all of the elements are integer constants or undefs.
+      bool AllEltsOk = true;
+      for (unsigned i = 0; i != 16; ++i) {
+        Constant *Elt = Mask->getAggregateElement(i);
+        if (Elt == 0 ||
+            !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
+          AllEltsOk = false;
+          break;
+        }
+      }
+
+      if (AllEltsOk) {
+        // Cast the input vectors to byte vectors.
+        Value *Op0 = Builder->CreateBitCast(II->getArgOperand(0),
+                                            Mask->getType());
+        Value *Op1 = Builder->CreateBitCast(II->getArgOperand(1),
+                                            Mask->getType());
+        Value *Result = UndefValue::get(Op0->getType());
+
+        // Only extract each element once.
+        Value *ExtractedElts[32];
+        memset(ExtractedElts, 0, sizeof(ExtractedElts));
+
+        for (unsigned i = 0; i != 16; ++i) {
+          if (isa<UndefValue>(Mask->getAggregateElement(i)))
+            continue;
+          unsigned Idx =
+            cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
+          Idx &= 31;  // Match the hardware behavior.
+
+          if (ExtractedElts[Idx] == 0) {
+            ExtractedElts[Idx] =
+              Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1,
+                                            Builder->getInt32(Idx&15));
+          }
+
+          // Insert this value into the result vector.
+          Result = Builder->CreateInsertElement(Result, ExtractedElts[Idx],
+                                                Builder->getInt32(i));
+        }
+        return CastInst::Create(Instruction::BitCast, Result, CI.getType());
+      }
+    }
+    break;
+
+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane:
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    unsigned MemAlign = getKnownAlignment(II->getArgOperand(0), TD);
+    unsigned AlignArg = II->getNumArgOperands() - 1;
+    ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
+    if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
+      II->setArgOperand(AlignArg,
+                        ConstantInt::get(Type::getInt32Ty(II->getContext()),
+                                         MemAlign, false));
+      return II;
+    }
+    break;
+  }
+
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    Value *Arg0 = II->getArgOperand(0);
+    Value *Arg1 = II->getArgOperand(1);
+
+    // Handle mul by zero first:
+    if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+      return ReplaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
+    }
+
+    // Check for constant LHS & RHS - in this case we just simplify.
+    bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu);
+    VectorType *NewVT = cast<VectorType>(II->getType());
+    unsigned NewWidth = NewVT->getElementType()->getIntegerBitWidth();
+    if (ConstantDataVector *CV0 = dyn_cast<ConstantDataVector>(Arg0)) {
+      if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) {
+        VectorType* VT = cast<VectorType>(CV0->getType());
+        SmallVector<Constant*, 4> NewElems;
+        for (unsigned i = 0; i < VT->getNumElements(); ++i) {
+          APInt CV0E =
+            (cast<ConstantInt>(CV0->getAggregateElement(i)))->getValue();
+          CV0E = Zext ? CV0E.zext(NewWidth) : CV0E.sext(NewWidth);
+          APInt CV1E =
+            (cast<ConstantInt>(CV1->getAggregateElement(i)))->getValue();
+          CV1E = Zext ? CV1E.zext(NewWidth) : CV1E.sext(NewWidth);
+          NewElems.push_back(
+            ConstantInt::get(NewVT->getElementType(), CV0E * CV1E));
+        }
+        return ReplaceInstUsesWith(CI, ConstantVector::get(NewElems));
+      }
+
+      // Couldn't simplify - cannonicalize constant to the RHS.
+      std::swap(Arg0, Arg1);
+    }
+
+    // Handle mul by one:
+    if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) {
+      if (ConstantInt *Splat =
+            dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) {
+        if (Splat->isOne()) {
+          if (Zext)
+            return CastInst::CreateZExtOrBitCast(Arg0, II->getType());
+          // else
+          return CastInst::CreateSExtOrBitCast(Arg0, II->getType());
+        }
+      }
+    }
+
+    break;
+  }
+
+  case Intrinsic::stackrestore: {
+    // If the save is right next to the restore, remove the restore.  This can
+    // happen when variable allocas are DCE'd.
+    if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
+      if (SS->getIntrinsicID() == Intrinsic::stacksave) {
+        BasicBlock::iterator BI = SS;
+        if (&*++BI == II)
+          return EraseInstFromFunction(CI);
+      }
+    }
+
+    // Scan down this block to see if there is another stack restore in the
+    // same block without an intervening call/alloca.
+    BasicBlock::iterator BI = II;
+    TerminatorInst *TI = II->getParent()->getTerminator();
+    bool CannotRemove = false;
+    for (++BI; &*BI != TI; ++BI) {
+      if (isa<AllocaInst>(BI)) {
+        CannotRemove = true;
+        break;
+      }
+      if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
+        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) {
+          // If there is a stackrestore below this one, remove this one.
+          if (II->getIntrinsicID() == Intrinsic::stackrestore)
+            return EraseInstFromFunction(CI);
+          // Otherwise, ignore the intrinsic.
+        } else {
+          // If we found a non-intrinsic call, we can't remove the stack
+          // restore.
+          CannotRemove = true;
+          break;
+        }
+      }
+    }
+
+    // If the stack restore is in a return, resume, or unwind block and if there
+    // are no allocas or calls between the restore and the return, nuke the
+    // restore.
+    if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
+      return EraseInstFromFunction(CI);
+    break;
+  }
+  }
+
+  return visitCallSite(II);
+}
+
+// InvokeInst simplification
+//
+Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
+  return visitCallSite(&II);
+}
+
+/// isSafeToEliminateVarargsCast - If this cast does not affect the value
+/// passed through the varargs area, we can eliminate the use of the cast.
+static bool isSafeToEliminateVarargsCast(const CallSite CS,
+                                         const CastInst * const CI,
+                                         const DataLayout * const TD,
+                                         const int ix) {
+  if (!CI->isLosslessCast())
+    return false;
+
+  // The size of ByVal arguments is derived from the type, so we
+  // can't change to a type with a different size.  If the size were
+  // passed explicitly we could avoid this check.
+  if (!CS.isByValArgument(ix))
+    return true;
+
+  Type* SrcTy =
+            cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
+  Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
+  if (!SrcTy->isSized() || !DstTy->isSized())
+    return false;
+  if (!TD || TD->getTypeAllocSize(SrcTy) != TD->getTypeAllocSize(DstTy))
+    return false;
+  return true;
+}
+
+// Try to fold some different type of calls here.
+// Currently we're only working with the checking functions, memcpy_chk,
+// mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk,
+// strcat_chk and strncat_chk.
+Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *TD) {
+  if (CI->getCalledFunction() == 0) return 0;
+
+  if (Value *With = Simplifier->optimizeCall(CI)) {
+    ++NumSimplified;
+    return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With);
+  }
+
+  return 0;
+}
+
+static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) {
+  // Strip off at most one level of pointer casts, looking for an alloca.  This
+  // is good enough in practice and simpler than handling any number of casts.
+  Value *Underlying = TrampMem->stripPointerCasts();
+  if (Underlying != TrampMem &&
+      (!Underlying->hasOneUse() || *Underlying->use_begin() != TrampMem))
+    return 0;
+  if (!isa<AllocaInst>(Underlying))
+    return 0;
+
+  IntrinsicInst *InitTrampoline = 0;
+  for (Value::use_iterator I = TrampMem->use_begin(), E = TrampMem->use_end();
+       I != E; I++) {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(*I);
+    if (!II)
+      return 0;
+    if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
+      if (InitTrampoline)
+        // More than one init_trampoline writes to this value.  Give up.
+        return 0;
+      InitTrampoline = II;
+      continue;
+    }
+    if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
+      // Allow any number of calls to adjust.trampoline.
+      continue;
+    return 0;
+  }
+
+  // No call to init.trampoline found.
+  if (!InitTrampoline)
+    return 0;
+
+  // Check that the alloca is being used in the expected way.
+  if (InitTrampoline->getOperand(0) != TrampMem)
+    return 0;
+
+  return InitTrampoline;
+}
+
+static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
+                                               Value *TrampMem) {
+  // Visit all the previous instructions in the basic block, and try to find a
+  // init.trampoline which has a direct path to the adjust.trampoline.
+  for (BasicBlock::iterator I = AdjustTramp,
+       E = AdjustTramp->getParent()->begin(); I != E; ) {
+    Instruction *Inst = --I;
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+      if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
+          II->getOperand(0) == TrampMem)
+        return II;
+    if (Inst->mayWriteToMemory())
+      return 0;
+  }
+  return 0;
+}
+
+// Given a call to llvm.adjust.trampoline, find and return the corresponding
+// call to llvm.init.trampoline if the call to the trampoline can be optimized
+// to a direct call to a function.  Otherwise return NULL.
+//
+static IntrinsicInst *FindInitTrampoline(Value *Callee) {
+  Callee = Callee->stripPointerCasts();
+  IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
+  if (!AdjustTramp ||
+      AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
+    return 0;
+
+  Value *TrampMem = AdjustTramp->getOperand(0);
+
+  if (IntrinsicInst *IT = FindInitTrampolineFromAlloca(TrampMem))
+    return IT;
+  if (IntrinsicInst *IT = FindInitTrampolineFromBB(AdjustTramp, TrampMem))
+    return IT;
+  return 0;
+}
+
+// visitCallSite - Improvements for call and invoke instructions.
+//
+Instruction *InstCombiner::visitCallSite(CallSite CS) {
+  if (isAllocLikeFn(CS.getInstruction(), TLI))
+    return visitAllocSite(*CS.getInstruction());
+
+  bool Changed = false;
+
+  // If the callee is a pointer to a function, attempt to move any casts to the
+  // arguments of the call/invoke.
+  Value *Callee = CS.getCalledValue();
+  if (!isa<Function>(Callee) && transformConstExprCastCall(CS))
+    return 0;
+
+  if (Function *CalleeF = dyn_cast<Function>(Callee))
+    // If the call and callee calling conventions don't match, this call must
+    // be unreachable, as the call is undefined.
+    if (CalleeF->getCallingConv() != CS.getCallingConv() &&
+        // Only do this for calls to a function with a body.  A prototype may
+        // not actually end up matching the implementation's calling conv for a
+        // variety of reasons (e.g. it may be written in assembly).
+        !CalleeF->isDeclaration()) {
+      Instruction *OldCall = CS.getInstruction();
+      new StoreInst(ConstantInt::getTrue(Callee->getContext()),
+                UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
+                                  OldCall);
+      // If OldCall does not return void then replaceAllUsesWith undef.
+      // This allows ValueHandlers and custom metadata to adjust itself.
+      if (!OldCall->getType()->isVoidTy())
+        ReplaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
+      if (isa<CallInst>(OldCall))
+        return EraseInstFromFunction(*OldCall);
+
+      // We cannot remove an invoke, because it would change the CFG, just
+      // change the callee to a null pointer.
+      cast<InvokeInst>(OldCall)->setCalledFunction(
+                                    Constant::getNullValue(CalleeF->getType()));
+      return 0;
+    }
+
+  if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+    // If CS does not return void then replaceAllUsesWith undef.
+    // This allows ValueHandlers and custom metadata to adjust itself.
+    if (!CS.getInstruction()->getType()->isVoidTy())
+      ReplaceInstUsesWith(*CS.getInstruction(),
+                          UndefValue::get(CS.getInstruction()->getType()));
+
+    if (isa<InvokeInst>(CS.getInstruction())) {
+      // Can't remove an invoke because we cannot change the CFG.
+      return 0;
+    }
+
+    // This instruction is not reachable, just remove it.  We insert a store to
+    // undef so that we know that this code is not reachable, despite the fact
+    // that we can't modify the CFG here.
+    new StoreInst(ConstantInt::getTrue(Callee->getContext()),
+                  UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
+                  CS.getInstruction());
+
+    return EraseInstFromFunction(*CS.getInstruction());
+  }
+
+  if (IntrinsicInst *II = FindInitTrampoline(Callee))
+    return transformCallThroughTrampoline(CS, II);
+
+  PointerType *PTy = cast<PointerType>(Callee->getType());
+  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+  if (FTy->isVarArg()) {
+    int ix = FTy->getNumParams();
+    // See if we can optimize any arguments passed through the varargs area of
+    // the call.
+    for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(),
+           E = CS.arg_end(); I != E; ++I, ++ix) {
+      CastInst *CI = dyn_cast<CastInst>(*I);
+      if (CI && isSafeToEliminateVarargsCast(CS, CI, TD, ix)) {
+        *I = CI->getOperand(0);
+        Changed = true;
+      }
+    }
+  }
+
+  if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) {
+    // Inline asm calls cannot throw - mark them 'nounwind'.
+    CS.setDoesNotThrow();
+    Changed = true;
+  }
+
+  // Try to optimize the call if possible, we require DataLayout for most of
+  // this.  None of these calls are seen as possibly dead so go ahead and
+  // delete the instruction now.
+  if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
+    Instruction *I = tryOptimizeCall(CI, TD);
+    // If we changed something return the result, etc. Otherwise let
+    // the fallthrough check.
+    if (I) return EraseInstFromFunction(*I);
+  }
+
+  return Changed ? CS.getInstruction() : 0;
+}
+
+// transformConstExprCastCall - If the callee is a constexpr cast of a function,
+// attempt to move the cast to the arguments of the call/invoke.
+//
+bool InstCombiner::transformConstExprCastCall(CallSite CS) {
+  Function *Callee =
+    dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+  if (Callee == 0)
+    return false;
+  Instruction *Caller = CS.getInstruction();
+  const AttributeSet &CallerPAL = CS.getAttributes();
+
+  // Okay, this is a cast from a function to a different type.  Unless doing so
+  // would cause a type conversion of one of our arguments, change this call to
+  // be a direct call with arguments casted to the appropriate types.
+  //
+  FunctionType *FT = Callee->getFunctionType();
+  Type *OldRetTy = Caller->getType();
+  Type *NewRetTy = FT->getReturnType();
+
+  if (NewRetTy->isStructTy())
+    return false; // TODO: Handle multiple return values.
+
+  // Check to see if we are changing the return type...
+  if (OldRetTy != NewRetTy) {
+    if (Callee->isDeclaration() &&
+        // Conversion is ok if changing from one pointer type to another or from
+        // a pointer to an integer of the same size.
+        !((OldRetTy->isPointerTy() || !TD ||
+           OldRetTy == TD->getIntPtrType(Caller->getContext())) &&
+          (NewRetTy->isPointerTy() || !TD ||
+           NewRetTy == TD->getIntPtrType(Caller->getContext()))))
+      return false;   // Cannot transform this return value.
+
+    if (!Caller->use_empty() &&
+        // void -> non-void is handled specially
+        !NewRetTy->isVoidTy() && !CastInst::isCastable(NewRetTy, OldRetTy))
+      return false;   // Cannot transform this return value.
+
+    if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
+      AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+      if (RAttrs.
+          hasAttributes(AttributeFuncs::
+                        typeIncompatible(NewRetTy, AttributeSet::ReturnIndex),
+                        AttributeSet::ReturnIndex))
+        return false;   // Attribute not compatible with transformed value.
+    }
+
+    // If the callsite is an invoke instruction, and the return value is used by
+    // a PHI node in a successor, we cannot change the return type of the call
+    // because there is no place to put the cast instruction (without breaking
+    // the critical edge).  Bail out in this case.
+    if (!Caller->use_empty())
+      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
+        for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
+             UI != E; ++UI)
+          if (PHINode *PN = dyn_cast<PHINode>(*UI))
+            if (PN->getParent() == II->getNormalDest() ||
+                PN->getParent() == II->getUnwindDest())
+              return false;
+  }
+
+  unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
+  unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
+
+  CallSite::arg_iterator AI = CS.arg_begin();
+  for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
+    Type *ParamTy = FT->getParamType(i);
+    Type *ActTy = (*AI)->getType();
+
+    if (!CastInst::isCastable(ActTy, ParamTy))
+      return false;   // Cannot transform this parameter value.
+
+    if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
+          hasAttributes(AttributeFuncs::
+                        typeIncompatible(ParamTy, i + 1), i + 1))
+      return false;   // Attribute not compatible with transformed value.
+
+    // If the parameter is passed as a byval argument, then we have to have a
+    // sized type and the sized type has to have the same size as the old type.
+    if (ParamTy != ActTy &&
+        CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1,
+                                                         Attribute::ByVal)) {
+      PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
+      if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
+        return false;
+
+      Type *CurElTy = cast<PointerType>(ActTy)->getElementType();
+      if (TD->getTypeAllocSize(CurElTy) !=
+          TD->getTypeAllocSize(ParamPTy->getElementType()))
+        return false;
+    }
+
+    // Converting from one pointer type to another or between a pointer and an
+    // integer of the same size is safe even if we do not have a body.
+    bool isConvertible = ActTy == ParamTy ||
+      (TD && ((ParamTy->isPointerTy() ||
+      ParamTy == TD->getIntPtrType(Caller->getContext())) &&
+              (ActTy->isPointerTy() ||
+              ActTy == TD->getIntPtrType(Caller->getContext()))));
+    if (Callee->isDeclaration() && !isConvertible) return false;
+  }
+
+  if (Callee->isDeclaration()) {
+    // Do not delete arguments unless we have a function body.
+    if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
+      return false;
+
+    // If the callee is just a declaration, don't change the varargsness of the
+    // call.  We don't want to introduce a varargs call where one doesn't
+    // already exist.
+    PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType());
+    if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
+      return false;
+
+    // If both the callee and the cast type are varargs, we still have to make
+    // sure the number of fixed parameters are the same or we have the same
+    // ABI issues as if we introduce a varargs call.
+    if (FT->isVarArg() &&
+        cast<FunctionType>(APTy->getElementType())->isVarArg() &&
+        FT->getNumParams() !=
+        cast<FunctionType>(APTy->getElementType())->getNumParams())
+      return false;
+  }
+
+  if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
+      !CallerPAL.isEmpty())
+    // In this case we have more arguments than the new function type, but we
+    // won't be dropping them.  Check that these extra arguments have attributes
+    // that are compatible with being a vararg call argument.
+    for (unsigned i = CallerPAL.getNumSlots(); i; --i) {
+      unsigned Index = CallerPAL.getSlotIndex(i - 1);
+      if (Index <= FT->getNumParams())
+        break;
+
+      // Check if it has an attribute that's incompatible with varargs.
+      AttributeSet PAttrs = CallerPAL.getSlotAttributes(i - 1);
+      if (PAttrs.hasAttribute(Index, Attribute::StructRet))
+        return false;
+    }
+
+
+  // Okay, we decided that this is a safe thing to do: go ahead and start
+  // inserting cast instructions as necessary.
+  std::vector<Value*> Args;
+  Args.reserve(NumActualArgs);
+  SmallVector<AttributeSet, 8> attrVec;
+  attrVec.reserve(NumCommonArgs);
+
+  // Get any return attributes.
+  AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+
+  // If the return value is not being used, the type may not be compatible
+  // with the existing attributes.  Wipe out any problematic attributes.
+  RAttrs.
+    removeAttributes(AttributeFuncs::
+                     typeIncompatible(NewRetTy, AttributeSet::ReturnIndex),
+                     AttributeSet::ReturnIndex);
+
+  // Add the new return attributes.
+  if (RAttrs.hasAttributes())
+    attrVec.push_back(AttributeSet::get(Caller->getContext(),
+                                        AttributeSet::ReturnIndex, RAttrs));
+
+  AI = CS.arg_begin();
+  for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
+    Type *ParamTy = FT->getParamType(i);
+    if ((*AI)->getType() == ParamTy) {
+      Args.push_back(*AI);
+    } else {
+      Instruction::CastOps opcode = CastInst::getCastOpcode(*AI,
+          false, ParamTy, false);
+      Args.push_back(Builder->CreateCast(opcode, *AI, ParamTy));
+    }
+
+    // Add any parameter attributes.
+    AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
+    if (PAttrs.hasAttributes())
+      attrVec.push_back(AttributeSet::get(Caller->getContext(), i + 1,
+                                          PAttrs));
+  }
+
+  // If the function takes more arguments than the call was taking, add them
+  // now.
+  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i)
+    Args.push_back(Constant::getNullValue(FT->getParamType(i)));
+
+  // If we are removing arguments to the function, emit an obnoxious warning.
+  if (FT->getNumParams() < NumActualArgs) {
+    // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
+    if (FT->isVarArg()) {
+      // Add all of the arguments in their promoted form to the arg list.
+      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
+        Type *PTy = getPromotedType((*AI)->getType());
+        if (PTy != (*AI)->getType()) {
+          // Must promote to pass through va_arg area!
+          Instruction::CastOps opcode =
+            CastInst::getCastOpcode(*AI, false, PTy, false);
+          Args.push_back(Builder->CreateCast(opcode, *AI, PTy));
+        } else {
+          Args.push_back(*AI);
+        }
+
+        // Add any parameter attributes.
+        AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
+        if (PAttrs.hasAttributes())
+          attrVec.push_back(AttributeSet::get(FT->getContext(), i + 1,
+                                              PAttrs));
+      }
+    }
+  }
+
+  AttributeSet FnAttrs = CallerPAL.getFnAttributes();
+  if (CallerPAL.hasAttributes(AttributeSet::FunctionIndex))
+    attrVec.push_back(AttributeSet::get(Callee->getContext(), FnAttrs));
+
+  if (NewRetTy->isVoidTy())
+    Caller->setName("");   // Void type should not have a name.
+
+  const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(),
+                                                       attrVec);
+
+  Instruction *NC;
+  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+    NC = Builder->CreateInvoke(Callee, II->getNormalDest(),
+                               II->getUnwindDest(), Args);
+    NC->takeName(II);
+    cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());
+    cast<InvokeInst>(NC)->setAttributes(NewCallerPAL);
+  } else {
+    CallInst *CI = cast<CallInst>(Caller);
+    NC = Builder->CreateCall(Callee, Args);
+    NC->takeName(CI);
+    if (CI->isTailCall())
+      cast<CallInst>(NC)->setTailCall();
+    cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
+    cast<CallInst>(NC)->setAttributes(NewCallerPAL);
+  }
+
+  // Insert a cast of the return type as necessary.
+  Value *NV = NC;
+  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
+    if (!NV->getType()->isVoidTy()) {
+      Instruction::CastOps opcode =
+        CastInst::getCastOpcode(NC, false, OldRetTy, false);
+      NV = NC = CastInst::Create(opcode, NC, OldRetTy);
+      NC->setDebugLoc(Caller->getDebugLoc());
+
+      // If this is an invoke instruction, we should insert it after the first
+      // non-phi, instruction in the normal successor block.
+      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+        BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
+        InsertNewInstBefore(NC, *I);
+      } else {
+        // Otherwise, it's a call, just insert cast right after the call.
+        InsertNewInstBefore(NC, *Caller);
+      }
+      Worklist.AddUsersToWorkList(*Caller);
+    } else {
+      NV = UndefValue::get(Caller->getType());
+    }
+  }
+
+  if (!Caller->use_empty())
+    ReplaceInstUsesWith(*Caller, NV);
+
+  EraseInstFromFunction(*Caller);
+  return true;
+}
+
+// transformCallThroughTrampoline - Turn a call to a function created by
+// init_trampoline / adjust_trampoline intrinsic pair into a direct call to the
+// underlying function.
+//
+Instruction *
+InstCombiner::transformCallThroughTrampoline(CallSite CS,
+                                             IntrinsicInst *Tramp) {
+  Value *Callee = CS.getCalledValue();
+  PointerType *PTy = cast<PointerType>(Callee->getType());
+  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+  const AttributeSet &Attrs = CS.getAttributes();
+
+  // If the call already has the 'nest' attribute somewhere then give up -
+  // otherwise 'nest' would occur twice after splicing in the chain.
+  if (Attrs.hasAttrSomewhere(Attribute::Nest))
+    return 0;
+
+  assert(Tramp &&
+         "transformCallThroughTrampoline called with incorrect CallSite.");
+
+  Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
+  PointerType *NestFPTy = cast<PointerType>(NestF->getType());
+  FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());
+
+  const AttributeSet &NestAttrs = NestF->getAttributes();
+  if (!NestAttrs.isEmpty()) {
+    unsigned NestIdx = 1;
+    Type *NestTy = 0;
+    AttributeSet NestAttr;
+
+    // Look for a parameter marked with the 'nest' attribute.
+    for (FunctionType::param_iterator I = NestFTy->param_begin(),
+         E = NestFTy->param_end(); I != E; ++NestIdx, ++I)
+      if (NestAttrs.hasAttribute(NestIdx, Attribute::Nest)) {
+        // Record the parameter type and any other attributes.
+        NestTy = *I;
+        NestAttr = NestAttrs.getParamAttributes(NestIdx);
+        break;
+      }
+
+    if (NestTy) {
+      Instruction *Caller = CS.getInstruction();
+      std::vector<Value*> NewArgs;
+      NewArgs.reserve(unsigned(CS.arg_end()-CS.arg_begin())+1);
+
+      SmallVector<AttributeSet, 8> NewAttrs;
+      NewAttrs.reserve(Attrs.getNumSlots() + 1);
+
+      // Insert the nest argument into the call argument list, which may
+      // mean appending it.  Likewise for attributes.
+
+      // Add any result attributes.
+      if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
+        NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
+                                             Attrs.getRetAttributes()));
+
+      {
+        unsigned Idx = 1;
+        CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+        do {
+          if (Idx == NestIdx) {
+            // Add the chain argument and attributes.
+            Value *NestVal = Tramp->getArgOperand(2);
+            if (NestVal->getType() != NestTy)
+              NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest");
+            NewArgs.push_back(NestVal);
+            NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
+                                                 NestAttr));
+          }
+
+          if (I == E)
+            break;
+
+          // Add the original argument and attributes.
+          NewArgs.push_back(*I);
+          AttributeSet Attr = Attrs.getParamAttributes(Idx);
+          if (Attr.hasAttributes(Idx)) {
+            AttrBuilder B(Attr, Idx);
+            NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
+                                                 Idx + (Idx >= NestIdx), B));
+          }
+
+          ++Idx, ++I;
+        } while (1);
+      }
+
+      // Add any function attributes.
+      if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+        NewAttrs.push_back(AttributeSet::get(FTy->getContext(),
+                                             Attrs.getFnAttributes()));
+
+      // The trampoline may have been bitcast to a bogus type (FTy).
+      // Handle this by synthesizing a new function type, equal to FTy
+      // with the chain parameter inserted.
+
+      std::vector<Type*> NewTypes;
+      NewTypes.reserve(FTy->getNumParams()+1);
+
+      // Insert the chain's type into the list of parameter types, which may
+      // mean appending it.
+      {
+        unsigned Idx = 1;
+        FunctionType::param_iterator I = FTy->param_begin(),
+          E = FTy->param_end();
+
+        do {
+          if (Idx == NestIdx)
+            // Add the chain's type.
+            NewTypes.push_back(NestTy);
+
+          if (I == E)
+            break;
+
+          // Add the original type.
+          NewTypes.push_back(*I);
+
+          ++Idx, ++I;
+        } while (1);
+      }
+
+      // Replace the trampoline call with a direct call.  Let the generic
+      // code sort out any function type mismatches.
+      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
+                                                FTy->isVarArg());
+      Constant *NewCallee =
+        NestF->getType() == PointerType::getUnqual(NewFTy) ?
+        NestF : ConstantExpr::getBitCast(NestF,
+                                         PointerType::getUnqual(NewFTy));
+      const AttributeSet &NewPAL = AttributeSet::get(FTy->getContext(), NewAttrs);
+
+      Instruction *NewCaller;
+      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+        NewCaller = InvokeInst::Create(NewCallee,
+                                       II->getNormalDest(), II->getUnwindDest(),
+                                       NewArgs);
+        cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
+        cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
+      } else {
+        NewCaller = CallInst::Create(NewCallee, NewArgs);
+        if (cast<CallInst>(Caller)->isTailCall())
+          cast<CallInst>(NewCaller)->setTailCall();
+        cast<CallInst>(NewCaller)->
+          setCallingConv(cast<CallInst>(Caller)->getCallingConv());
+        cast<CallInst>(NewCaller)->setAttributes(NewPAL);
+      }
+
+      return NewCaller;
+    }
+  }
+
+  // Replace the trampoline call with a direct call.  Since there is no 'nest'
+  // parameter, there is no need to adjust the argument list.  Let the generic
+  // code sort out any function type mismatches.
+  Constant *NewCallee =
+    NestF->getType() == PTy ? NestF :
+                              ConstantExpr::getBitCast(NestF, PTy);
+  CS.setCalledFunction(NewCallee);
+  return CS.getInstruction();
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
new file mode 100644
index 000000000000..2ee1278d23dc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -0,0 +1,1802 @@
+//===- InstCombineCasts.cpp -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for cast operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+/// DecomposeSimpleLinearExpr - Analyze 'Val', seeing if it is a simple linear
+/// expression.  If so, decompose it, returning some value X, such that Val is
+/// X*Scale+Offset.
+///
+static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
+                                        uint64_t &Offset) {
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    Offset = CI->getZExtValue();
+    Scale  = 0;
+    return ConstantInt::get(Val->getType(), 0);
+  }
+
+  if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
+    // Cannot look past anything that might overflow.
+    OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
+    if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) {
+      Scale = 1;
+      Offset = 0;
+      return Val;
+    }
+
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      if (I->getOpcode() == Instruction::Shl) {
+        // This is a value scaled by '1 << the shift amt'.
+        Scale = UINT64_C(1) << RHS->getZExtValue();
+        Offset = 0;
+        return I->getOperand(0);
+      }
+
+      if (I->getOpcode() == Instruction::Mul) {
+        // This value is scaled by 'RHS'.
+        Scale = RHS->getZExtValue();
+        Offset = 0;
+        return I->getOperand(0);
+      }
+
+      if (I->getOpcode() == Instruction::Add) {
+        // We have X+C.  Check to see if we really have (X*C2)+C1,
+        // where C1 is divisible by C2.
+        unsigned SubScale;
+        Value *SubVal =
+          DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
+        Offset += RHS->getZExtValue();
+        Scale = SubScale;
+        return SubVal;
+      }
+    }
+  }
+
+  // Otherwise, we can't look past this.
+  Scale = 1;
+  Offset = 0;
+  return Val;
+}
+
+/// PromoteCastOfAllocation - If we find a cast of an allocation instruction,
+/// try to eliminate the cast by moving the type information into the alloc.
+Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
+                                                   AllocaInst &AI) {
+  // This requires DataLayout to get the alloca alignment and size information.
+  if (!TD) return 0;
+
+  PointerType *PTy = cast<PointerType>(CI.getType());
+
+  BuilderTy AllocaBuilder(*Builder);
+  AllocaBuilder.SetInsertPoint(AI.getParent(), &AI);
+
+  // Get the type really allocated and the type casted to.
+  Type *AllocElTy = AI.getAllocatedType();
+  Type *CastElTy = PTy->getElementType();
+  if (!AllocElTy->isSized() || !CastElTy->isSized()) return 0;
+
+  unsigned AllocElTyAlign = TD->getABITypeAlignment(AllocElTy);
+  unsigned CastElTyAlign = TD->getABITypeAlignment(CastElTy);
+  if (CastElTyAlign < AllocElTyAlign) return 0;
+
+  // If the allocation has multiple uses, only promote it if we are strictly
+  // increasing the alignment of the resultant allocation.  If we keep it the
+  // same, we open the door to infinite loops of various kinds.
+  if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return 0;
+
+  uint64_t AllocElTySize = TD->getTypeAllocSize(AllocElTy);
+  uint64_t CastElTySize = TD->getTypeAllocSize(CastElTy);
+  if (CastElTySize == 0 || AllocElTySize == 0) return 0;
+
+  // If the allocation has multiple uses, only promote it if we're not
+  // shrinking the amount of memory being allocated.
+  uint64_t AllocElTyStoreSize = TD->getTypeStoreSize(AllocElTy);
+  uint64_t CastElTyStoreSize = TD->getTypeStoreSize(CastElTy);
+  if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return 0;
+
+  // See if we can satisfy the modulus by pulling a scale out of the array
+  // size argument.
+  unsigned ArraySizeScale;
+  uint64_t ArrayOffset;
+  Value *NumElements = // See if the array size is a decomposable linear expr.
+    DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
+
+  // If we can now satisfy the modulus, by using a non-1 scale, we really can
+  // do the xform.
+  if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
+      (AllocElTySize*ArrayOffset   ) % CastElTySize != 0) return 0;
+
+  unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
+  Value *Amt = 0;
+  if (Scale == 1) {
+    Amt = NumElements;
+  } else {
+    Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
+    // Insert before the alloca, not before the cast.
+    Amt = AllocaBuilder.CreateMul(Amt, NumElements);
+  }
+
+  if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
+    Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
+                                  Offset, true);
+    Amt = AllocaBuilder.CreateAdd(Amt, Off);
+  }
+
+  AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt);
+  New->setAlignment(AI.getAlignment());
+  New->takeName(&AI);
+
+  // If the allocation has multiple real uses, insert a cast and change all
+  // things that used it to use the new cast.  This will also hack on CI, but it
+  // will die soon.
+  if (!AI.hasOneUse()) {
+    // New is the allocation instruction, pointer typed. AI is the original
+    // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
+    Value *NewCast = AllocaBuilder.CreateBitCast(New, AI.getType(), "tmpcast");
+    ReplaceInstUsesWith(AI, NewCast);
+  }
+  return ReplaceInstUsesWith(CI, New);
+}
+
+/// EvaluateInDifferentType - Given an expression that
+/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually
+/// insert the code to evaluate the expression.
+Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
+                                             bool isSigned) {
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
+    // If we got a constantexpr back, try to simplify it with TD info.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+      C = ConstantFoldConstantExpression(CE, TD, TLI);
+    return C;
+  }
+
+  // Otherwise, it must be an instruction.
+  Instruction *I = cast<Instruction>(V);
+  Instruction *Res = 0;
+  unsigned Opc = I->getOpcode();
+  switch (Opc) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::AShr:
+  case Instruction::LShr:
+  case Instruction::Shl:
+  case Instruction::UDiv:
+  case Instruction::URem: {
+    Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
+    Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+    Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+    break;
+  }
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    // If the source type of the cast is the type we're trying for then we can
+    // just return the source.  There's no need to insert it because it is not
+    // new.
+    if (I->getOperand(0)->getType() == Ty)
+      return I->getOperand(0);
+
+    // Otherwise, must be the same type of cast, so just reinsert a new one.
+    // This also handles the case of zext(trunc(x)) -> zext(x).
+    Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
+                                      Opc == Instruction::SExt);
+    break;
+  case Instruction::Select: {
+    Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+    Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
+    Res = SelectInst::Create(I->getOperand(0), True, False);
+    break;
+  }
+  case Instruction::PHI: {
+    PHINode *OPN = cast<PHINode>(I);
+    PHINode *NPN = PHINode::Create(Ty, OPN->getNumIncomingValues());
+    for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
+      Value *V =EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
+      NPN->addIncoming(V, OPN->getIncomingBlock(i));
+    }
+    Res = NPN;
+    break;
+  }
+  default:
+    // TODO: Can handle more cases here.
+    llvm_unreachable("Unreachable!");
+  }
+
+  Res->takeName(I);
+  return InsertNewInstWith(Res, *I);
+}
+
+
+/// This function is a wrapper around CastInst::isEliminableCastPair. It
+/// simply extracts arguments and returns what that function returns.
+static Instruction::CastOps
+isEliminableCastPair(
+  const CastInst *CI, ///< The first cast instruction
+  unsigned opcode,       ///< The opcode of the second cast instruction
+  Type *DstTy,     ///< The target type for the second cast instruction
+  DataLayout *TD         ///< The target data for pointer size
+) {
+
+  Type *SrcTy = CI->getOperand(0)->getType();   // A from above
+  Type *MidTy = CI->getType();                  // B from above
+
+  // Get the opcodes of the two Cast instructions
+  Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode());
+  Instruction::CastOps secondOp = Instruction::CastOps(opcode);
+  Type *SrcIntPtrTy = TD && SrcTy->isPtrOrPtrVectorTy() ?
+    TD->getIntPtrType(SrcTy) : 0;
+  Type *MidIntPtrTy = TD && MidTy->isPtrOrPtrVectorTy() ?
+    TD->getIntPtrType(MidTy) : 0;
+  Type *DstIntPtrTy = TD && DstTy->isPtrOrPtrVectorTy() ?
+    TD->getIntPtrType(DstTy) : 0;
+  unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
+                                                DstTy, SrcIntPtrTy, MidIntPtrTy,
+                                                DstIntPtrTy);
+
+  // We don't want to form an inttoptr or ptrtoint that converts to an integer
+  // type that differs from the pointer size.
+  if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) ||
+      (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy))
+    Res = 0;
+
+  return Instruction::CastOps(Res);
+}
+
+/// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
+/// results in any code being generated and is interesting to optimize out. If
+/// the cast can be eliminated by some other simple transformation, we prefer
+/// to do the simplification first.
+bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V,
+                                      Type *Ty) {
+  // Noop casts and casts of constants should be eliminated trivially.
+  if (V->getType() == Ty || isa<Constant>(V)) return false;
+
+  // If this is another cast that can be eliminated, we prefer to have it
+  // eliminated.
+  if (const CastInst *CI = dyn_cast<CastInst>(V))
+    if (isEliminableCastPair(CI, opc, Ty, TD))
+      return false;
+
+  // If this is a vector sext from a compare, then we don't want to break the
+  // idiom where each element of the extended vector is either zero or all ones.
+  if (opc == Instruction::SExt && isa<CmpInst>(V) && Ty->isVectorTy())
+    return false;
+
+  return true;
+}
+
+
+/// @brief Implement the transforms common to all CastInst visitors.
+Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
+  Value *Src = CI.getOperand(0);
+
+  // Many cases of "cast of a cast" are eliminable. If it's eliminable we just
+  // eliminate it now.
+  if (CastInst *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
+    if (Instruction::CastOps opc =
+        isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) {
+      // The first cast (CSrc) is eliminable so we need to fix up or replace
+      // the second cast (CI). CSrc will then have a good chance of being dead.
+      return CastInst::Create(opc, CSrc->getOperand(0), CI.getType());
+    }
+  }
+
+  // If we are casting a select then fold the cast into the select
+  if (SelectInst *SI = dyn_cast<SelectInst>(Src))
+    if (Instruction *NV = FoldOpIntoSelect(CI, SI))
+      return NV;
+
+  // If we are casting a PHI then fold the cast into the PHI
+  if (isa<PHINode>(Src)) {
+    // We don't do this if this would create a PHI node with an illegal type if
+    // it is currently legal.
+    if (!Src->getType()->isIntegerTy() ||
+        !CI.getType()->isIntegerTy() ||
+        ShouldChangeType(CI.getType(), Src->getType()))
+      if (Instruction *NV = FoldOpIntoPhi(CI))
+        return NV;
+  }
+
+  return 0;
+}
+
+/// CanEvaluateTruncated - Return true if we can evaluate the specified
+/// expression tree as type Ty instead of its larger type, and arrive with the
+/// same value.  This is used by code that tries to eliminate truncates.
+///
+/// Ty will always be a type smaller than V.  We should return true if trunc(V)
+/// can be computed by computing V in the smaller type.  If V is an instruction,
+/// then trunc(inst(x,y)) can be computed as inst(trunc(x),trunc(y)), which only
+/// makes sense if x and y can be efficiently truncated.
+///
+/// This function works on both vectors and scalars.
+///
+static bool CanEvaluateTruncated(Value *V, Type *Ty) {
+  // We can always evaluate constants in another type.
+  if (isa<Constant>(V))
+    return true;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  Type *OrigTy = V->getType();
+
+  // If this is an extension from the dest type, we can eliminate it, even if it
+  // has multiple uses.
+  if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+      I->getOperand(0)->getType() == Ty)
+    return true;
+
+  // We can't extend or shrink something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
+
+  unsigned Opc = I->getOpcode();
+  switch (Opc) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // These operators can all arbitrarily be extended or truncated.
+    return CanEvaluateTruncated(I->getOperand(0), Ty) &&
+           CanEvaluateTruncated(I->getOperand(1), Ty);
+
+  case Instruction::UDiv:
+  case Instruction::URem: {
+    // UDiv and URem can be truncated if all the truncated bits are zero.
+    uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+    uint32_t BitWidth = Ty->getScalarSizeInBits();
+    if (BitWidth < OrigBitWidth) {
+      APInt Mask = APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth);
+      if (MaskedValueIsZero(I->getOperand(0), Mask) &&
+          MaskedValueIsZero(I->getOperand(1), Mask)) {
+        return CanEvaluateTruncated(I->getOperand(0), Ty) &&
+               CanEvaluateTruncated(I->getOperand(1), Ty);
+      }
+    }
+    break;
+  }
+  case Instruction::Shl:
+    // If we are truncating the result of this SHL, and if it's a shift of a
+    // constant amount, we can always perform a SHL in a smaller type.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint32_t BitWidth = Ty->getScalarSizeInBits();
+      if (CI->getLimitedValue(BitWidth) < BitWidth)
+        return CanEvaluateTruncated(I->getOperand(0), Ty);
+    }
+    break;
+  case Instruction::LShr:
+    // If this is a truncate of a logical shr, we can truncate it to a smaller
+    // lshr iff we know that the bits we would otherwise be shifting in are
+    // already zeros.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+      uint32_t BitWidth = Ty->getScalarSizeInBits();
+      if (MaskedValueIsZero(I->getOperand(0),
+            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
+          CI->getLimitedValue(BitWidth) < BitWidth) {
+        return CanEvaluateTruncated(I->getOperand(0), Ty);
+      }
+    }
+    break;
+  case Instruction::Trunc:
+    // trunc(trunc(x)) -> trunc(x)
+    return true;
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+    // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest
+    return true;
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    return CanEvaluateTruncated(SI->getTrueValue(), Ty) &&
+           CanEvaluateTruncated(SI->getFalseValue(), Ty);
+  }
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateTruncated(PN->getIncomingValue(i), Ty))
+        return false;
+    return true;
+  }
+  default:
+    // TODO: Can handle more cases here.
+    break;
+  }
+
+  return false;
+}
+
+Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
+  if (Instruction *Result = commonCastTransforms(CI))
+    return Result;
+
+  // See if we can simplify any instructions used by the input whose sole
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(CI))
+    return &CI;
+
+  Value *Src = CI.getOperand(0);
+  Type *DestTy = CI.getType(), *SrcTy = Src->getType();
+
+  // Attempt to truncate the entire input expression tree to the destination
+  // type.   Only do this if the dest type is a simple type, don't convert the
+  // expression tree to something weird like i93 unless the source is also
+  // strange.
+  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+      CanEvaluateTruncated(Src, DestTy)) {
+
+    // If this cast is a truncate, evaluting in a different type always
+    // eliminates the cast, so it is always a win.
+    DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+          " to avoid cast: " << CI << '\n');
+    Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+    assert(Res->getType() == DestTy);
+    return ReplaceInstUsesWith(CI, Res);
+  }
+
+  // Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
+  if (DestTy->getScalarSizeInBits() == 1) {
+    Constant *One = ConstantInt::get(Src->getType(), 1);
+    Src = Builder->CreateAnd(Src, One);
+    Value *Zero = Constant::getNullValue(Src->getType());
+    return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
+  }
+
+  // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
+  Value *A = 0; ConstantInt *Cst = 0;
+  if (Src->hasOneUse() &&
+      match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) {
+    // We have three types to worry about here, the type of A, the source of
+    // the truncate (MidSize), and the destination of the truncate. We know that
+    // ASize < MidSize   and MidSize > ResultSize, but don't know the relation
+    // between ASize and ResultSize.
+    unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+
+    // If the shift amount is larger than the size of A, then the result is
+    // known to be zero because all the input bits got shifted out.
+    if (Cst->getZExtValue() >= ASize)
+      return ReplaceInstUsesWith(CI, Constant::getNullValue(CI.getType()));
+
+    // Since we're doing an lshr and a zero extend, and know that the shift
+    // amount is smaller than ASize, it is always safe to do the shift in A's
+    // type, then zero extend or truncate to the result.
+    Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue());
+    Shift->takeName(Src);
+    return CastInst::CreateIntegerCast(Shift, CI.getType(), false);
+  }
+
+  // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest
+  // type isn't non-native.
+  if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) &&
+      ShouldChangeType(Src->getType(), CI.getType()) &&
+      match(Src, m_And(m_Value(A), m_ConstantInt(Cst)))) {
+    Value *NewTrunc = Builder->CreateTrunc(A, CI.getType(), A->getName()+".tr");
+    return BinaryOperator::CreateAnd(NewTrunc,
+                                     ConstantExpr::getTrunc(Cst, CI.getType()));
+  }
+
+  return 0;
+}
+
+/// transformZExtICmp - Transform (zext icmp) to bitwise / integer operations
+/// in order to eliminate the icmp.
+Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
+                                             bool DoXform) {
+  // If we are just checking for a icmp eq of a single bit and zext'ing it
+  // to an integer, then shift the bit to the appropriate place and then
+  // cast to integer to avoid the comparison.
+  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) {
+    const APInt &Op1CV = Op1C->getValue();
+
+    // zext (x <s  0) to i32 --> x>>u31      true if signbit set.
+    // zext (x >s -1) to i32 --> (x>>u31)^1  true if signbit clear.
+    if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
+        (ICI->getPredicate() == ICmpInst::ICMP_SGT &&Op1CV.isAllOnesValue())) {
+      if (!DoXform) return ICI;
+
+      Value *In = ICI->getOperand(0);
+      Value *Sh = ConstantInt::get(In->getType(),
+                                   In->getType()->getScalarSizeInBits()-1);
+      In = Builder->CreateLShr(In, Sh, In->getName()+".lobit");
+      if (In->getType() != CI.getType())
+        In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/);
+
+      if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
+        Constant *One = ConstantInt::get(In->getType(), 1);
+        In = Builder->CreateXor(In, One, In->getName()+".not");
+      }
+
+      return ReplaceInstUsesWith(CI, In);
+    }
+
+    // zext (X == 0) to i32 --> X^1      iff X has only the low bit set.
+    // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+    // zext (X == 1) to i32 --> X        iff X has only the low bit set.
+    // zext (X == 2) to i32 --> X>>1     iff X has only the 2nd bit set.
+    // zext (X != 0) to i32 --> X        iff X has only the low bit set.
+    // zext (X != 0) to i32 --> X>>1     iff X has only the 2nd bit set.
+    // zext (X != 1) to i32 --> X^1      iff X has only the low bit set.
+    // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+    if ((Op1CV == 0 || Op1CV.isPowerOf2()) &&
+        // This only works for EQ and NE
+        ICI->isEquality()) {
+      // If Op1C some other power of two, convert:
+      uint32_t BitWidth = Op1C->getType()->getBitWidth();
+      APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+      ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne);
+
+      APInt KnownZeroMask(~KnownZero);
+      if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
+        if (!DoXform) return ICI;
+
+        bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE;
+        if (Op1CV != 0 && (Op1CV != KnownZeroMask)) {
+          // (X&4) == 2 --> false
+          // (X&4) != 2 --> true
+          Constant *Res = ConstantInt::get(Type::getInt1Ty(CI.getContext()),
+                                           isNE);
+          Res = ConstantExpr::getZExt(Res, CI.getType());
+          return ReplaceInstUsesWith(CI, Res);
+        }
+
+        uint32_t ShiftAmt = KnownZeroMask.logBase2();
+        Value *In = ICI->getOperand(0);
+        if (ShiftAmt) {
+          // Perform a logical shr by shiftamt.
+          // Insert the shift to put the result in the low bit.
+          In = Builder->CreateLShr(In, ConstantInt::get(In->getType(),ShiftAmt),
+                                   In->getName()+".lobit");
+        }
+
+        if ((Op1CV != 0) == isNE) { // Toggle the low bit.
+          Constant *One = ConstantInt::get(In->getType(), 1);
+          In = Builder->CreateXor(In, One);
+        }
+
+        if (CI.getType() == In->getType())
+          return ReplaceInstUsesWith(CI, In);
+        return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/);
+      }
+    }
+  }
+
+  // icmp ne A, B is equal to xor A, B when A and B only really have one bit.
+  // It is also profitable to transform icmp eq into not(xor(A, B)) because that
+  // may lead to additional simplifications.
+  if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) {
+    if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) {
+      uint32_t BitWidth = ITy->getBitWidth();
+      Value *LHS = ICI->getOperand(0);
+      Value *RHS = ICI->getOperand(1);
+
+      APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0);
+      APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0);
+      ComputeMaskedBits(LHS, KnownZeroLHS, KnownOneLHS);
+      ComputeMaskedBits(RHS, KnownZeroRHS, KnownOneRHS);
+
+      if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) {
+        APInt KnownBits = KnownZeroLHS | KnownOneLHS;
+        APInt UnknownBit = ~KnownBits;
+        if (UnknownBit.countPopulation() == 1) {
+          if (!DoXform) return ICI;
+
+          Value *Result = Builder->CreateXor(LHS, RHS);
+
+          // Mask off any bits that are set and won't be shifted away.
+          if (KnownOneLHS.uge(UnknownBit))
+            Result = Builder->CreateAnd(Result,
+                                        ConstantInt::get(ITy, UnknownBit));
+
+          // Shift the bit we're testing down to the lsb.
+          Result = Builder->CreateLShr(
+               Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros()));
+
+          if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+            Result = Builder->CreateXor(Result, ConstantInt::get(ITy, 1));
+          Result->takeName(ICI);
+          return ReplaceInstUsesWith(CI, Result);
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+/// CanEvaluateZExtd - Determine if the specified value can be computed in the
+/// specified wider type and produce the same low bits.  If not, return false.
+///
+/// If this function returns true, it can also return a non-zero number of bits
+/// (in BitsToClear) which indicates that the value it computes is correct for
+/// the zero extend, but that the additional BitsToClear bits need to be zero'd
+/// out.  For example, to promote something like:
+///
+///   %B = trunc i64 %A to i32
+///   %C = lshr i32 %B, 8
+///   %E = zext i32 %C to i64
+///
+/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be
+/// set to 8 to indicate that the promoted value needs to have bits 24-31
+/// cleared in addition to bits 32-63.  Since an 'and' will be generated to
+/// clear the top bits anyway, doing this has no extra cost.
+///
+/// This function works on both vectors and scalars.
+static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
+  BitsToClear = 0;
+  if (isa<Constant>(V))
+    return true;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // If the input is a truncate from the destination type, we can trivially
+  // eliminate it.
+  if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+    return true;
+
+  // We can't extend or shrink something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
+
+  unsigned Opc = I->getOpcode(), Tmp;
+  switch (Opc) {
+  case Instruction::ZExt:  // zext(zext(x)) -> zext(x).
+  case Instruction::SExt:  // zext(sext(x)) -> sext(x).
+  case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
+    return true;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::Shl:
+    if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear) ||
+        !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp))
+      return false;
+    // These can all be promoted if neither operand has 'bits to clear'.
+    if (BitsToClear == 0 && Tmp == 0)
+      return true;
+
+    // If the operation is an AND/OR/XOR and the bits to clear are zero in the
+    // other side, BitsToClear is ok.
+    if (Tmp == 0 &&
+        (Opc == Instruction::And || Opc == Instruction::Or ||
+         Opc == Instruction::Xor)) {
+      // We use MaskedValueIsZero here for generality, but the case we care
+      // about the most is constant RHS.
+      unsigned VSize = V->getType()->getScalarSizeInBits();
+      if (MaskedValueIsZero(I->getOperand(1),
+                            APInt::getHighBitsSet(VSize, BitsToClear)))
+        return true;
+    }
+
+    // Otherwise, we don't know how to analyze this BitsToClear case yet.
+    return false;
+
+  case Instruction::LShr:
+    // We can promote lshr(x, cst) if we can promote x.  This requires the
+    // ultimate 'and' to clear out the high zero bits we're clearing out though.
+    if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear))
+        return false;
+      BitsToClear += Amt->getZExtValue();
+      if (BitsToClear > V->getType()->getScalarSizeInBits())
+        BitsToClear = V->getType()->getScalarSizeInBits();
+      return true;
+    }
+    // Cannot promote variable LSHR.
+    return false;
+  case Instruction::Select:
+    if (!CanEvaluateZExtd(I->getOperand(1), Ty, Tmp) ||
+        !CanEvaluateZExtd(I->getOperand(2), Ty, BitsToClear) ||
+        // TODO: If important, we could handle the case when the BitsToClear are
+        // known zero in the disagreeing side.
+        Tmp != BitsToClear)
+      return false;
+    return true;
+
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    if (!CanEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear))
+      return false;
+    for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp) ||
+          // TODO: If important, we could handle the case when the BitsToClear
+          // are known zero in the disagreeing input.
+          Tmp != BitsToClear)
+        return false;
+    return true;
+  }
+  default:
+    // TODO: Can handle more cases here.
+    return false;
+  }
+}
+
+Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
+  // If this zero extend is only used by a truncate, let the truncate be
+  // eliminated before we try to optimize this zext.
+  if (CI.hasOneUse() && isa<TruncInst>(CI.use_back()))
+    return 0;
+
+  // If one of the common conversion will work, do it.
+  if (Instruction *Result = commonCastTransforms(CI))
+    return Result;
+
+  // See if we can simplify any instructions used by the input whose sole
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(CI))
+    return &CI;
+
+  Value *Src = CI.getOperand(0);
+  Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+
+  // Attempt to extend the entire input expression tree to the destination
+  // type.   Only do this if the dest type is a simple type, don't convert the
+  // expression tree to something weird like i93 unless the source is also
+  // strange.
+  unsigned BitsToClear;
+  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+      CanEvaluateZExtd(Src, DestTy, BitsToClear)) {
+    assert(BitsToClear < SrcTy->getScalarSizeInBits() &&
+           "Unreasonable BitsToClear");
+
+    // Okay, we can transform this!  Insert the new expression now.
+    DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+          " to avoid zero extend: " << CI);
+    Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+    assert(Res->getType() == DestTy);
+
+    uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
+    uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+    // If the high bits are already filled with zeros, just replace this
+    // cast with the result.
+    if (MaskedValueIsZero(Res, APInt::getHighBitsSet(DestBitSize,
+                                                     DestBitSize-SrcBitsKept)))
+      return ReplaceInstUsesWith(CI, Res);
+
+    // We need to emit an AND to clear the high bits.
+    Constant *C = ConstantInt::get(Res->getType(),
+                               APInt::getLowBitsSet(DestBitSize, SrcBitsKept));
+    return BinaryOperator::CreateAnd(Res, C);
+  }
+
+  // If this is a TRUNC followed by a ZEXT then we are dealing with integral
+  // types and if the sizes are just right we can convert this into a logical
+  // 'and' which will be much cheaper than the pair of casts.
+  if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) {   // A->B->C cast
+    // TODO: Subsume this into EvaluateInDifferentType.
+
+    // Get the sizes of the types involved.  We know that the intermediate type
+    // will be smaller than A or C, but don't know the relation between A and C.
+    Value *A = CSrc->getOperand(0);
+    unsigned SrcSize = A->getType()->getScalarSizeInBits();
+    unsigned MidSize = CSrc->getType()->getScalarSizeInBits();
+    unsigned DstSize = CI.getType()->getScalarSizeInBits();
+    // If we're actually extending zero bits, then if
+    // SrcSize <  DstSize: zext(a & mask)
+    // SrcSize == DstSize: a & mask
+    // SrcSize  > DstSize: trunc(a) & mask
+    if (SrcSize < DstSize) {
+      APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+      Constant *AndConst = ConstantInt::get(A->getType(), AndValue);
+      Value *And = Builder->CreateAnd(A, AndConst, CSrc->getName()+".mask");
+      return new ZExtInst(And, CI.getType());
+    }
+
+    if (SrcSize == DstSize) {
+      APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+      return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(),
+                                                           AndValue));
+    }
+    if (SrcSize > DstSize) {
+      Value *Trunc = Builder->CreateTrunc(A, CI.getType());
+      APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
+      return BinaryOperator::CreateAnd(Trunc,
+                                       ConstantInt::get(Trunc->getType(),
+                                                        AndValue));
+    }
+  }
+
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
+    return transformZExtICmp(ICI, CI);
+
+  BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
+  if (SrcI && SrcI->getOpcode() == Instruction::Or) {
+    // zext (or icmp, icmp) --> or (zext icmp), (zext icmp) if at least one
+    // of the (zext icmp) will be transformed.
+    ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
+    ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
+    if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
+        (transformZExtICmp(LHS, CI, false) ||
+         transformZExtICmp(RHS, CI, false))) {
+      Value *LCast = Builder->CreateZExt(LHS, CI.getType(), LHS->getName());
+      Value *RCast = Builder->CreateZExt(RHS, CI.getType(), RHS->getName());
+      return BinaryOperator::Create(Instruction::Or, LCast, RCast);
+    }
+  }
+
+  // zext(trunc(t) & C) -> (t & zext(C)).
+  if (SrcI && SrcI->getOpcode() == Instruction::And && SrcI->hasOneUse())
+    if (ConstantInt *C = dyn_cast<ConstantInt>(SrcI->getOperand(1)))
+      if (TruncInst *TI = dyn_cast<TruncInst>(SrcI->getOperand(0))) {
+        Value *TI0 = TI->getOperand(0);
+        if (TI0->getType() == CI.getType())
+          return
+            BinaryOperator::CreateAnd(TI0,
+                                ConstantExpr::getZExt(C, CI.getType()));
+      }
+
+  // zext((trunc(t) & C) ^ C) -> ((t & zext(C)) ^ zext(C)).
+  if (SrcI && SrcI->getOpcode() == Instruction::Xor && SrcI->hasOneUse())
+    if (ConstantInt *C = dyn_cast<ConstantInt>(SrcI->getOperand(1)))
+      if (BinaryOperator *And = dyn_cast<BinaryOperator>(SrcI->getOperand(0)))
+        if (And->getOpcode() == Instruction::And && And->hasOneUse() &&
+            And->getOperand(1) == C)
+          if (TruncInst *TI = dyn_cast<TruncInst>(And->getOperand(0))) {
+            Value *TI0 = TI->getOperand(0);
+            if (TI0->getType() == CI.getType()) {
+              Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
+              Value *NewAnd = Builder->CreateAnd(TI0, ZC);
+              return BinaryOperator::CreateXor(NewAnd, ZC);
+            }
+          }
+
+  // zext (xor i1 X, true) to i32  --> xor (zext i1 X to i32), 1
+  Value *X;
+  if (SrcI && SrcI->hasOneUse() && SrcI->getType()->isIntegerTy(1) &&
+      match(SrcI, m_Not(m_Value(X))) &&
+      (!X->hasOneUse() || !isa<CmpInst>(X))) {
+    Value *New = Builder->CreateZExt(X, CI.getType());
+    return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1));
+  }
+
+  return 0;
+}
+
+/// transformSExtICmp - Transform (sext icmp) to bitwise / integer operations
+/// in order to eliminate the icmp.
+Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
+  Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+
+  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+    // (x <s  0) ? -1 : 0 -> ashr x, 31        -> all ones if negative
+    // (x >s -1) ? -1 : 0 -> not (ashr x, 31)  -> all ones if positive
+    if ((Pred == ICmpInst::ICMP_SLT && Op1C->isZero()) ||
+        (Pred == ICmpInst::ICMP_SGT && Op1C->isAllOnesValue())) {
+
+      Value *Sh = ConstantInt::get(Op0->getType(),
+                                   Op0->getType()->getScalarSizeInBits()-1);
+      Value *In = Builder->CreateAShr(Op0, Sh, Op0->getName()+".lobit");
+      if (In->getType() != CI.getType())
+        In = Builder->CreateIntCast(In, CI.getType(), true/*SExt*/);
+
+      if (Pred == ICmpInst::ICMP_SGT)
+        In = Builder->CreateNot(In, In->getName()+".not");
+      return ReplaceInstUsesWith(CI, In);
+    }
+
+    // If we know that only one bit of the LHS of the icmp can be set and we
+    // have an equality comparison with zero or a power of 2, we can transform
+    // the icmp and sext into bitwise/integer operations.
+    if (ICI->hasOneUse() &&
+        ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
+      unsigned BitWidth = Op1C->getType()->getBitWidth();
+      APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+      ComputeMaskedBits(Op0, KnownZero, KnownOne);
+
+      APInt KnownZeroMask(~KnownZero);
+      if (KnownZeroMask.isPowerOf2()) {
+        Value *In = ICI->getOperand(0);
+
+        // If the icmp tests for a known zero bit we can constant fold it.
+        if (!Op1C->isZero() && Op1C->getValue() != KnownZeroMask) {
+          Value *V = Pred == ICmpInst::ICMP_NE ?
+                       ConstantInt::getAllOnesValue(CI.getType()) :
+                       ConstantInt::getNullValue(CI.getType());
+          return ReplaceInstUsesWith(CI, V);
+        }
+
+        if (!Op1C->isZero() == (Pred == ICmpInst::ICMP_NE)) {
+          // sext ((x & 2^n) == 0)   -> (x >> n) - 1
+          // sext ((x & 2^n) != 2^n) -> (x >> n) - 1
+          unsigned ShiftAmt = KnownZeroMask.countTrailingZeros();
+          // Perform a right shift to place the desired bit in the LSB.
+          if (ShiftAmt)
+            In = Builder->CreateLShr(In,
+                                     ConstantInt::get(In->getType(), ShiftAmt));
+
+          // At this point "In" is either 1 or 0. Subtract 1 to turn
+          // {1, 0} -> {0, -1}.
+          In = Builder->CreateAdd(In,
+                                  ConstantInt::getAllOnesValue(In->getType()),
+                                  "sext");
+        } else {
+          // sext ((x & 2^n) != 0)   -> (x << bitwidth-n) a>> bitwidth-1
+          // sext ((x & 2^n) == 2^n) -> (x << bitwidth-n) a>> bitwidth-1
+          unsigned ShiftAmt = KnownZeroMask.countLeadingZeros();
+          // Perform a left shift to place the desired bit in the MSB.
+          if (ShiftAmt)
+            In = Builder->CreateShl(In,
+                                    ConstantInt::get(In->getType(), ShiftAmt));
+
+          // Distribute the bit over the whole bit width.
+          In = Builder->CreateAShr(In, ConstantInt::get(In->getType(),
+                                                        BitWidth - 1), "sext");
+        }
+
+        if (CI.getType() == In->getType())
+          return ReplaceInstUsesWith(CI, In);
+        return CastInst::CreateIntegerCast(In, CI.getType(), true/*SExt*/);
+      }
+    }
+  }
+
+  // vector (x <s 0) ? -1 : 0 -> ashr x, 31   -> all ones if signed.
+  if (VectorType *VTy = dyn_cast<VectorType>(CI.getType())) {
+    if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_Zero()) &&
+        Op0->getType() == CI.getType()) {
+      Type *EltTy = VTy->getElementType();
+
+      // splat the shift constant to a constant vector.
+      Constant *VSh = ConstantInt::get(VTy, EltTy->getScalarSizeInBits()-1);
+      Value *In = Builder->CreateAShr(Op0, VSh, Op0->getName()+".lobit");
+      return ReplaceInstUsesWith(CI, In);
+    }
+  }
+
+  return 0;
+}
+
+/// CanEvaluateSExtd - Return true if we can take the specified value
+/// and return it as type Ty without inserting any new casts and without
+/// changing the value of the common low bits.  This is used by code that tries
+/// to promote integer operations to a wider types will allow us to eliminate
+/// the extension.
+///
+/// This function works on both vectors and scalars.
+///
+static bool CanEvaluateSExtd(Value *V, Type *Ty) {
+  assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
+         "Can't sign extend type to a smaller type");
+  // If this is a constant, it can be trivially promoted.
+  if (isa<Constant>(V))
+    return true;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // If this is a truncate from the dest type, we can trivially eliminate it.
+  if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty)
+    return true;
+
+  // We can't extend or shrink something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
+
+  switch (I->getOpcode()) {
+  case Instruction::SExt:  // sext(sext(x)) -> sext(x)
+  case Instruction::ZExt:  // sext(zext(x)) -> zext(x)
+  case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x)
+    return true;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // These operators can all arbitrarily be extended if their inputs can.
+    return CanEvaluateSExtd(I->getOperand(0), Ty) &&
+           CanEvaluateSExtd(I->getOperand(1), Ty);
+
+  //case Instruction::Shl:   TODO
+  //case Instruction::LShr:  TODO
+
+  case Instruction::Select:
+    return CanEvaluateSExtd(I->getOperand(1), Ty) &&
+           CanEvaluateSExtd(I->getOperand(2), Ty);
+
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateSExtd(PN->getIncomingValue(i), Ty)) return false;
+    return true;
+  }
+  default:
+    // TODO: Can handle more cases here.
+    break;
+  }
+
+  return false;
+}
+
+Instruction *InstCombiner::visitSExt(SExtInst &CI) {
+  // If this sign extend is only used by a truncate, let the truncate be
+  // eliminated before we try to optimize this sext.
+  if (CI.hasOneUse() && isa<TruncInst>(CI.use_back()))
+    return 0;
+
+  if (Instruction *I = commonCastTransforms(CI))
+    return I;
+
+  // See if we can simplify any instructions used by the input whose sole
+  // purpose is to compute bits we don't care about.
+  if (SimplifyDemandedInstructionBits(CI))
+    return &CI;
+
+  Value *Src = CI.getOperand(0);
+  Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+
+  // Attempt to extend the entire input expression tree to the destination
+  // type.   Only do this if the dest type is a simple type, don't convert the
+  // expression tree to something weird like i93 unless the source is also
+  // strange.
+  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+      CanEvaluateSExtd(Src, DestTy)) {
+    // Okay, we can transform this!  Insert the new expression now.
+    DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+          " to avoid sign extend: " << CI);
+    Value *Res = EvaluateInDifferentType(Src, DestTy, true);
+    assert(Res->getType() == DestTy);
+
+    uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
+    uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+    // If the high bits are already filled with sign bit, just replace this
+    // cast with the result.
+    if (ComputeNumSignBits(Res) > DestBitSize - SrcBitSize)
+      return ReplaceInstUsesWith(CI, Res);
+
+    // We need to emit a shl + ashr to do the sign extend.
+    Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
+    return BinaryOperator::CreateAShr(Builder->CreateShl(Res, ShAmt, "sext"),
+                                      ShAmt);
+  }
+
+  // If this input is a trunc from our destination, then turn sext(trunc(x))
+  // into shifts.
+  if (TruncInst *TI = dyn_cast<TruncInst>(Src))
+    if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) {
+      uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
+      uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+      // We need to emit a shl + ashr to do the sign extend.
+      Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
+      Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext");
+      return BinaryOperator::CreateAShr(Res, ShAmt);
+    }
+
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
+    return transformSExtICmp(ICI, CI);
+
+  // If the input is a shl/ashr pair of a same constant, then this is a sign
+  // extension from a smaller value.  If we could trust arbitrary bitwidth
+  // integers, we could turn this into a truncate to the smaller bit and then
+  // use a sext for the whole extension.  Since we don't, look deeper and check
+  // for a truncate.  If the source and dest are the same type, eliminate the
+  // trunc and extend and just do shifts.  For example, turn:
+  //   %a = trunc i32 %i to i8
+  //   %b = shl i8 %a, 6
+  //   %c = ashr i8 %b, 6
+  //   %d = sext i8 %c to i32
+  // into:
+  //   %a = shl i32 %i, 30
+  //   %d = ashr i32 %a, 30
+  Value *A = 0;
+  // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
+  ConstantInt *BA = 0, *CA = 0;
+  if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_ConstantInt(BA)),
+                        m_ConstantInt(CA))) &&
+      BA == CA && A->getType() == CI.getType()) {
+    unsigned MidSize = Src->getType()->getScalarSizeInBits();
+    unsigned SrcDstSize = CI.getType()->getScalarSizeInBits();
+    unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
+    Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
+    A = Builder->CreateShl(A, ShAmtV, CI.getName());
+    return BinaryOperator::CreateAShr(A, ShAmtV);
+  }
+
+  return 0;
+}
+
+
+/// FitsInFPType - Return a Constant* for the specified FP constant if it fits
+/// in the specified FP type without changing its value.
+static Constant *FitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+  bool losesInfo;
+  APFloat F = CFP->getValueAPF();
+  (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
+  if (!losesInfo)
+    return ConstantFP::get(CFP->getContext(), F);
+  return 0;
+}
+
+/// LookThroughFPExtensions - If this is an fp extension instruction, look
+/// through it until we get the source value.
+static Value *LookThroughFPExtensions(Value *V) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (I->getOpcode() == Instruction::FPExt)
+      return LookThroughFPExtensions(I->getOperand(0));
+
+  // If this value is a constant, return the constant in the smallest FP type
+  // that can accurately represent it.  This allows us to turn
+  // (float)((double)X+2.0) into x+2.0f.
+  if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+    if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
+      return V;  // No constant folding of this.
+    // See if the value can be truncated to half and then reextended.
+    if (Value *V = FitsInFPType(CFP, APFloat::IEEEhalf))
+      return V;
+    // See if the value can be truncated to float and then reextended.
+    if (Value *V = FitsInFPType(CFP, APFloat::IEEEsingle))
+      return V;
+    if (CFP->getType()->isDoubleTy())
+      return V;  // Won't shrink.
+    if (Value *V = FitsInFPType(CFP, APFloat::IEEEdouble))
+      return V;
+    // Don't try to shrink to various long double types.
+  }
+
+  return V;
+}
+
+Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
+  if (Instruction *I = commonCastTransforms(CI))
+    return I;
+
+  // If we have fptrunc(fadd (fpextend x), (fpextend y)), where x and y are
+  // smaller than the destination type, we can eliminate the truncate by doing
+  // the add as the smaller type.  This applies to fadd/fsub/fmul/fdiv as well
+  // as many builtins (sqrt, etc).
+  BinaryOperator *OpI = dyn_cast<BinaryOperator>(CI.getOperand(0));
+  if (OpI && OpI->hasOneUse()) {
+    switch (OpI->getOpcode()) {
+    default: break;
+    case Instruction::FAdd:
+    case Instruction::FSub:
+    case Instruction::FMul:
+    case Instruction::FDiv:
+    case Instruction::FRem:
+      Type *SrcTy = OpI->getType();
+      Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0));
+      Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1));
+      if (LHSTrunc->getType() != SrcTy &&
+          RHSTrunc->getType() != SrcTy) {
+        unsigned DstSize = CI.getType()->getScalarSizeInBits();
+        // If the source types were both smaller than the destination type of
+        // the cast, do this xform.
+        if (LHSTrunc->getType()->getScalarSizeInBits() <= DstSize &&
+            RHSTrunc->getType()->getScalarSizeInBits() <= DstSize) {
+          LHSTrunc = Builder->CreateFPExt(LHSTrunc, CI.getType());
+          RHSTrunc = Builder->CreateFPExt(RHSTrunc, CI.getType());
+          return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc);
+        }
+      }
+      break;
+    }
+
+    // (fptrunc (fneg x)) -> (fneg (fptrunc x))
+    if (BinaryOperator::isFNeg(OpI)) {
+      Value *InnerTrunc = Builder->CreateFPTrunc(OpI->getOperand(1),
+                                                 CI.getType());
+      return BinaryOperator::CreateFNeg(InnerTrunc);
+    }
+  }
+
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
+  if (II) {
+    switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::fabs: {
+        // (fptrunc (fabs x)) -> (fabs (fptrunc x))
+        Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0),
+                                                   CI.getType());
+        Type *IntrinsicType[] = { CI.getType() };
+        Function *Overload =
+          Intrinsic::getDeclaration(CI.getParent()->getParent()->getParent(),
+                                    II->getIntrinsicID(), IntrinsicType);
+
+        Value *Args[] = { InnerTrunc };
+        return CallInst::Create(Overload, Args, II->getName());
+      }
+    }
+  }
+
+  // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x)
+  CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0));
+  if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) &&
+      Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) &&
+      Call->getNumArgOperands() == 1 &&
+      Call->hasOneUse()) {
+    CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0));
+    if (Arg && Arg->getOpcode() == Instruction::FPExt &&
+        CI.getType()->isFloatTy() &&
+        Call->getType()->isDoubleTy() &&
+        Arg->getType()->isDoubleTy() &&
+        Arg->getOperand(0)->getType()->isFloatTy()) {
+      Function *Callee = Call->getCalledFunction();
+      Module *M = CI.getParent()->getParent()->getParent();
+      Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf",
+                                                   Callee->getAttributes(),
+                                                   Builder->getFloatTy(),
+                                                   Builder->getFloatTy(),
+                                                   NULL);
+      CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0),
+                                       "sqrtfcall");
+      ret->setAttributes(Callee->getAttributes());
+
+
+      // Remove the old Call.  With -fmath-errno, it won't get marked readnone.
+      ReplaceInstUsesWith(*Call, UndefValue::get(Call->getType()));
+      EraseInstFromFunction(*Call);
+      return ret;
+    }
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitFPExt(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
+  Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
+  if (OpI == 0)
+    return commonCastTransforms(FI);
+
+  // fptoui(uitofp(X)) --> X
+  // fptoui(sitofp(X)) --> X
+  // This is safe if the intermediate type has enough bits in its mantissa to
+  // accurately represent all values of X.  For example, do not do this with
+  // i64->float->i64.  This is also safe for sitofp case, because any negative
+  // 'X' value would cause an undefined result for the fptoui.
+  if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
+      OpI->getOperand(0)->getType() == FI.getType() &&
+      (int)FI.getType()->getScalarSizeInBits() < /*extra bit for sign */
+                    OpI->getType()->getFPMantissaWidth())
+    return ReplaceInstUsesWith(FI, OpI->getOperand(0));
+
+  return commonCastTransforms(FI);
+}
+
+Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
+  Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
+  if (OpI == 0)
+    return commonCastTransforms(FI);
+
+  // fptosi(sitofp(X)) --> X
+  // fptosi(uitofp(X)) --> X
+  // This is safe if the intermediate type has enough bits in its mantissa to
+  // accurately represent all values of X.  For example, do not do this with
+  // i64->float->i64.  This is also safe for sitofp case, because any negative
+  // 'X' value would cause an undefined result for the fptoui.
+  if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) &&
+      OpI->getOperand(0)->getType() == FI.getType() &&
+      (int)FI.getType()->getScalarSizeInBits() <=
+                    OpI->getType()->getFPMantissaWidth())
+    return ReplaceInstUsesWith(FI, OpI->getOperand(0));
+
+  return commonCastTransforms(FI);
+}
+
+Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
+  // If the source integer type is not the intptr_t type for this target, do a
+  // trunc or zext to the intptr_t type, then inttoptr of it.  This allows the
+  // cast to be exposed to other transforms.
+  if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+      TD->getPointerSizeInBits()) {
+    Type *Ty = TD->getIntPtrType(CI.getContext());
+    if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+      Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+    Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
+    return new IntToPtrInst(P, CI.getType());
+  }
+
+  if (Instruction *I = commonCastTransforms(CI))
+    return I;
+
+  return 0;
+}
+
+/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
+Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
+  Value *Src = CI.getOperand(0);
+
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
+    // If casting the result of a getelementptr instruction with no offset, turn
+    // this into a cast of the original pointer!
+    if (GEP->hasAllZeroIndices()) {
+      // Changing the cast operand is usually not a good idea but it is safe
+      // here because the pointer operand is being replaced with another
+      // pointer operand so the opcode doesn't need to change.
+      Worklist.Add(GEP);
+      CI.setOperand(0, GEP->getOperand(0));
+      return &CI;
+    }
+
+    // If the GEP has a single use, and the base pointer is a bitcast, and the
+    // GEP computes a constant offset, see if we can convert these three
+    // instructions into fewer.  This typically happens with unions and other
+    // non-type-safe code.
+    APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
+    if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) &&
+        GEP->accumulateConstantOffset(*TD, Offset)) {
+      // Get the base pointer input of the bitcast, and the type it points to.
+      Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
+      Type *GEPIdxTy =
+      cast<PointerType>(OrigBase->getType())->getElementType();
+      SmallVector<Value*, 8> NewIndices;
+      if (FindElementAtOffset(GEPIdxTy, Offset.getSExtValue(), NewIndices)) {
+        // If we were able to index down into an element, create the GEP
+        // and bitcast the result.  This eliminates one bitcast, potentially
+        // two.
+        Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ?
+        Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
+        Builder->CreateGEP(OrigBase, NewIndices);
+        NGEP->takeName(GEP);
+
+        if (isa<BitCastInst>(CI))
+          return new BitCastInst(NGEP, CI.getType());
+        assert(isa<PtrToIntInst>(CI));
+        return new PtrToIntInst(NGEP, CI.getType());
+      }
+    }
+  }
+
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
+  // If the destination integer type is not the intptr_t type for this target,
+  // do a ptrtoint to intptr_t then do a trunc or zext.  This allows the cast
+  // to be exposed to other transforms.
+  if (TD && CI.getType()->getScalarSizeInBits() != TD->getPointerSizeInBits()) {
+    Type *Ty = TD->getIntPtrType(CI.getContext());
+    if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+      Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+    Value *P = Builder->CreatePtrToInt(CI.getOperand(0), Ty);
+    return CastInst::CreateIntegerCast(P, CI.getType(), /*isSigned=*/false);
+  }
+
+  return commonPointerCastTransforms(CI);
+}
+
+/// OptimizeVectorResize - This input value (which is known to have vector type)
+/// is being zero extended or truncated to the specified vector type.  Try to
+/// replace it with a shuffle (and vector/vector bitcast) if possible.
+///
+/// The source and destination vector types may have different element types.
+static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy,
+                                         InstCombiner &IC) {
+  // We can only do this optimization if the output is a multiple of the input
+  // element size, or the input is a multiple of the output element size.
+  // Convert the input type to have the same element type as the output.
+  VectorType *SrcTy = cast<VectorType>(InVal->getType());
+
+  if (SrcTy->getElementType() != DestTy->getElementType()) {
+    // The input types don't need to be identical, but for now they must be the
+    // same size.  There is no specific reason we couldn't handle things like
+    // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten
+    // there yet.
+    if (SrcTy->getElementType()->getPrimitiveSizeInBits() !=
+        DestTy->getElementType()->getPrimitiveSizeInBits())
+      return 0;
+
+    SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
+    InVal = IC.Builder->CreateBitCast(InVal, SrcTy);
+  }
+
+  // Now that the element types match, get the shuffle mask and RHS of the
+  // shuffle to use, which depends on whether we're increasing or decreasing the
+  // size of the input.
+  SmallVector<uint32_t, 16> ShuffleMask;
+  Value *V2;
+
+  if (SrcTy->getNumElements() > DestTy->getNumElements()) {
+    // If we're shrinking the number of elements, just shuffle in the low
+    // elements from the input and use undef as the second shuffle input.
+    V2 = UndefValue::get(SrcTy);
+    for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i)
+      ShuffleMask.push_back(i);
+
+  } else {
+    // If we're increasing the number of elements, shuffle in all of the
+    // elements from InVal and fill the rest of the result elements with zeros
+    // from a constant zero.
+    V2 = Constant::getNullValue(SrcTy);
+    unsigned SrcElts = SrcTy->getNumElements();
+    for (unsigned i = 0, e = SrcElts; i != e; ++i)
+      ShuffleMask.push_back(i);
+
+    // The excess elements reference the first element of the zero input.
+    for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i)
+      ShuffleMask.push_back(SrcElts);
+  }
+
+  return new ShuffleVectorInst(InVal, V2,
+                               ConstantDataVector::get(V2->getContext(),
+                                                       ShuffleMask));
+}
+
+static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) {
+  return Value % Ty->getPrimitiveSizeInBits() == 0;
+}
+
+static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
+  return Value / Ty->getPrimitiveSizeInBits();
+}
+
+/// CollectInsertionElements - V is a value which is inserted into a vector of
+/// VecEltTy.  Look through the value to see if we can decompose it into
+/// insertions into the vector.  See the example in the comment for
+/// OptimizeIntegerToVectorInsertions for the pattern this handles.
+/// The type of V is always a non-zero multiple of VecEltTy's size.
+///
+/// This returns false if the pattern can't be matched or true if it can,
+/// filling in Elements with the elements found here.
+static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
+                                     SmallVectorImpl<Value*> &Elements,
+                                     Type *VecEltTy) {
+  // Undef values never contribute useful bits to the result.
+  if (isa<UndefValue>(V)) return true;
+
+  // If we got down to a value of the right type, we win, try inserting into the
+  // right element.
+  if (V->getType() == VecEltTy) {
+    // Inserting null doesn't actually insert any elements.
+    if (Constant *C = dyn_cast<Constant>(V))
+      if (C->isNullValue())
+        return true;
+
+    // Fail if multiple elements are inserted into this slot.
+    if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0)
+      return false;
+
+    Elements[ElementIndex] = V;
+    return true;
+  }
+
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    // Figure out the # elements this provides, and bitcast it or slice it up
+    // as required.
+    unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(),
+                                        VecEltTy);
+    // If the constant is the size of a vector element, we just need to bitcast
+    // it to the right type so it gets properly inserted.
+    if (NumElts == 1)
+      return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
+                                      ElementIndex, Elements, VecEltTy);
+
+    // Okay, this is a constant that covers multiple elements.  Slice it up into
+    // pieces and insert each element-sized piece into the vector.
+    if (!isa<IntegerType>(C->getType()))
+      C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(),
+                                       C->getType()->getPrimitiveSizeInBits()));
+    unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
+    Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
+
+    for (unsigned i = 0; i != NumElts; ++i) {
+      Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
+                                                               i*ElementSize));
+      Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
+      if (!CollectInsertionElements(Piece, ElementIndex+i, Elements, VecEltTy))
+        return false;
+    }
+    return true;
+  }
+
+  if (!V->hasOneUse()) return false;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return false;
+  switch (I->getOpcode()) {
+  default: return false; // Unhandled case.
+  case Instruction::BitCast:
+    return CollectInsertionElements(I->getOperand(0), ElementIndex,
+                                    Elements, VecEltTy);
+  case Instruction::ZExt:
+    if (!isMultipleOfTypeSize(
+                          I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
+                              VecEltTy))
+      return false;
+    return CollectInsertionElements(I->getOperand(0), ElementIndex,
+                                    Elements, VecEltTy);
+  case Instruction::Or:
+    return CollectInsertionElements(I->getOperand(0), ElementIndex,
+                                    Elements, VecEltTy) &&
+           CollectInsertionElements(I->getOperand(1), ElementIndex,
+                                    Elements, VecEltTy);
+  case Instruction::Shl: {
+    // Must be shifting by a constant that is a multiple of the element size.
+    ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (CI == 0) return false;
+    if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false;
+    unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy);
+
+    return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift,
+                                    Elements, VecEltTy);
+  }
+
+  }
+}
+
+
+/// OptimizeIntegerToVectorInsertions - If the input is an 'or' instruction, we
+/// may be doing shifts and ors to assemble the elements of the vector manually.
+/// Try to rip the code out and replace it with insertelements.  This is to
+/// optimize code like this:
+///
+///    %tmp37 = bitcast float %inc to i32
+///    %tmp38 = zext i32 %tmp37 to i64
+///    %tmp31 = bitcast float %inc5 to i32
+///    %tmp32 = zext i32 %tmp31 to i64
+///    %tmp33 = shl i64 %tmp32, 32
+///    %ins35 = or i64 %tmp33, %tmp38
+///    %tmp43 = bitcast i64 %ins35 to <2 x float>
+///
+/// Into two insertelements that do "buildvector{%inc, %inc5}".
+static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
+                                                InstCombiner &IC) {
+  VectorType *DestVecTy = cast<VectorType>(CI.getType());
+  Value *IntInput = CI.getOperand(0);
+
+  SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
+  if (!CollectInsertionElements(IntInput, 0, Elements,
+                                DestVecTy->getElementType()))
+    return 0;
+
+  // If we succeeded, we know that all of the element are specified by Elements
+  // or are zero if Elements has a null entry.  Recast this as a set of
+  // insertions.
+  Value *Result = Constant::getNullValue(CI.getType());
+  for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
+    if (Elements[i] == 0) continue;  // Unset element.
+
+    Result = IC.Builder->CreateInsertElement(Result, Elements[i],
+                                             IC.Builder->getInt32(i));
+  }
+
+  return Result;
+}
+
+
+/// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double
+/// bitcast.  The various long double bitcasts can't get in here.
+static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
+  // We need to know the target byte order to perform this optimization.
+  if (!IC.getDataLayout()) return 0;
+
+  Value *Src = CI.getOperand(0);
+  Type *DestTy = CI.getType();
+
+  // If this is a bitcast from int to float, check to see if the int is an
+  // extraction from a vector.
+  Value *VecInput = 0;
+  // bitcast(trunc(bitcast(somevector)))
+  if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) &&
+      isa<VectorType>(VecInput->getType())) {
+    VectorType *VecTy = cast<VectorType>(VecInput->getType());
+    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
+
+    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) {
+      // If the element type of the vector doesn't match the result type,
+      // bitcast it to be a vector type we can extract from.
+      if (VecTy->getElementType() != DestTy) {
+        VecTy = VectorType::get(DestTy,
+                                VecTy->getPrimitiveSizeInBits() / DestWidth);
+        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
+      }
+
+      unsigned Elt = 0;
+      if (IC.getDataLayout()->isBigEndian())
+        Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1;
+      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
+    }
+  }
+
+  // bitcast(trunc(lshr(bitcast(somevector), cst))
+  ConstantInt *ShAmt = 0;
+  if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)),
+                                m_ConstantInt(ShAmt)))) &&
+      isa<VectorType>(VecInput->getType())) {
+    VectorType *VecTy = cast<VectorType>(VecInput->getType());
+    unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
+    if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 &&
+        ShAmt->getZExtValue() % DestWidth == 0) {
+      // If the element type of the vector doesn't match the result type,
+      // bitcast it to be a vector type we can extract from.
+      if (VecTy->getElementType() != DestTy) {
+        VecTy = VectorType::get(DestTy,
+                                VecTy->getPrimitiveSizeInBits() / DestWidth);
+        VecInput = IC.Builder->CreateBitCast(VecInput, VecTy);
+      }
+
+      unsigned Elt = ShAmt->getZExtValue() / DestWidth;
+      if (IC.getDataLayout()->isBigEndian())
+        Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt;
+      return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
+    }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
+  // If the operands are integer typed then apply the integer transforms,
+  // otherwise just apply the common ones.
+  Value *Src = CI.getOperand(0);
+  Type *SrcTy = Src->getType();
+  Type *DestTy = CI.getType();
+
+  // Get rid of casts from one type to the same type. These are useless and can
+  // be replaced by the operand.
+  if (DestTy == Src->getType())
+    return ReplaceInstUsesWith(CI, Src);
+
+  if (PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
+    PointerType *SrcPTy = cast<PointerType>(SrcTy);
+    Type *DstElTy = DstPTy->getElementType();
+    Type *SrcElTy = SrcPTy->getElementType();
+
+    // If the address spaces don't match, don't eliminate the bitcast, which is
+    // required for changing types.
+    if (SrcPTy->getAddressSpace() != DstPTy->getAddressSpace())
+      return 0;
+
+    // If we are casting a alloca to a pointer to a type of the same
+    // size, rewrite the allocation instruction to allocate the "right" type.
+    // There is no need to modify malloc calls because it is their bitcast that
+    // needs to be cleaned up.
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(Src))
+      if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
+        return V;
+
+    // If the source and destination are pointers, and this cast is equivalent
+    // to a getelementptr X, 0, 0, 0...  turn it into the appropriate gep.
+    // This can enhance SROA and other transforms that want type-safe pointers.
+    Constant *ZeroUInt =
+      Constant::getNullValue(Type::getInt32Ty(CI.getContext()));
+    unsigned NumZeros = 0;
+    while (SrcElTy != DstElTy &&
+           isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() &&
+           SrcElTy->getNumContainedTypes() /* not "{}" */) {
+      SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt);
+      ++NumZeros;
+    }
+
+    // If we found a path from the src to dest, create the getelementptr now.
+    if (SrcElTy == DstElTy) {
+      SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt);
+      return GetElementPtrInst::CreateInBounds(Src, Idxs);
+    }
+  }
+
+  // Try to optimize int -> float bitcasts.
+  if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy))
+    if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this))
+      return I;
+
+  if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
+    if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) {
+      Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType());
+      return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
+                     Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
+      // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast)
+    }
+
+    if (isa<IntegerType>(SrcTy)) {
+      // If this is a cast from an integer to vector, check to see if the input
+      // is a trunc or zext of a bitcast from vector.  If so, we can replace all
+      // the casts with a shuffle and (potentially) a bitcast.
+      if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) {
+        CastInst *SrcCast = cast<CastInst>(Src);
+        if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0)))
+          if (isa<VectorType>(BCIn->getOperand(0)->getType()))
+            if (Instruction *I = OptimizeVectorResize(BCIn->getOperand(0),
+                                               cast<VectorType>(DestTy), *this))
+              return I;
+      }
+
+      // If the input is an 'or' instruction, we may be doing shifts and ors to
+      // assemble the elements of the vector manually.  Try to rip the code out
+      // and replace it with insertelements.
+      if (Value *V = OptimizeIntegerToVectorInsertions(CI, *this))
+        return ReplaceInstUsesWith(CI, V);
+    }
+  }
+
+  if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
+    if (SrcVTy->getNumElements() == 1) {
+      // If our destination is not a vector, then make this a straight
+      // scalar-scalar cast.
+      if (!DestTy->isVectorTy()) {
+        Value *Elem =
+          Builder->CreateExtractElement(Src,
+                     Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
+        return CastInst::Create(Instruction::BitCast, Elem, DestTy);
+      }
+
+      // Otherwise, see if our source is an insert. If so, then use the scalar
+      // component directly.
+      if (InsertElementInst *IEI =
+            dyn_cast<InsertElementInst>(CI.getOperand(0)))
+        return CastInst::Create(Instruction::BitCast, IEI->getOperand(1),
+                                DestTy);
+    }
+  }
+
+  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) {
+    // Okay, we have (bitcast (shuffle ..)).  Check to see if this is
+    // a bitcast to a vector with the same # elts.
+    if (SVI->hasOneUse() && DestTy->isVectorTy() &&
+        cast<VectorType>(DestTy)->getNumElements() ==
+              SVI->getType()->getNumElements() &&
+        SVI->getType()->getNumElements() ==
+          cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements()) {
+      BitCastInst *Tmp;
+      // If either of the operands is a cast from CI.getType(), then
+      // evaluating the shuffle in the casted destination's type will allow
+      // us to eliminate at least one cast.
+      if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) &&
+           Tmp->getOperand(0)->getType() == DestTy) ||
+          ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) &&
+           Tmp->getOperand(0)->getType() == DestTy)) {
+        Value *LHS = Builder->CreateBitCast(SVI->getOperand(0), DestTy);
+        Value *RHS = Builder->CreateBitCast(SVI->getOperand(1), DestTy);
+        // Return a new shuffle vector.  Use the same element ID's, as we
+        // know the vector types match #elts.
+        return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2));
+      }
+    }
+  }
+
+  if (SrcTy->isPointerTy())
+    return commonPointerCastTransforms(CI);
+  return commonCastTransforms(CI);
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
new file mode 100644
index 000000000000..a96e754f3dd0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -0,0 +1,3167 @@
+//===- InstCombineCompares.cpp --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitICmp and visitFCmp functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+static ConstantInt *getOne(Constant *C) {
+  return ConstantInt::get(cast<IntegerType>(C->getType()), 1);
+}
+
+/// AddOne - Add one to a ConstantInt
+static Constant *AddOne(Constant *C) {
+  return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
+}
+/// SubOne - Subtract one from a ConstantInt
+static Constant *SubOne(Constant *C) {
+  return ConstantExpr::getSub(C, ConstantInt::get(C->getType(), 1));
+}
+
+static ConstantInt *ExtractElement(Constant *V, Constant *Idx) {
+  return cast<ConstantInt>(ConstantExpr::getExtractElement(V, Idx));
+}
+
+static bool HasAddOverflow(ConstantInt *Result,
+                           ConstantInt *In1, ConstantInt *In2,
+                           bool IsSigned) {
+  if (!IsSigned)
+    return Result->getValue().ult(In1->getValue());
+
+  if (In2->isNegative())
+    return Result->getValue().sgt(In1->getValue());
+  return Result->getValue().slt(In1->getValue());
+}
+
+/// AddWithOverflow - Compute Result = In1+In2, returning true if the result
+/// overflowed for this type.
+static bool AddWithOverflow(Constant *&Result, Constant *In1,
+                            Constant *In2, bool IsSigned = false) {
+  Result = ConstantExpr::getAdd(In1, In2);
+
+  if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
+    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
+      Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
+      if (HasAddOverflow(ExtractElement(Result, Idx),
+                         ExtractElement(In1, Idx),
+                         ExtractElement(In2, Idx),
+                         IsSigned))
+        return true;
+    }
+    return false;
+  }
+
+  return HasAddOverflow(cast<ConstantInt>(Result),
+                        cast<ConstantInt>(In1), cast<ConstantInt>(In2),
+                        IsSigned);
+}
+
+static bool HasSubOverflow(ConstantInt *Result,
+                           ConstantInt *In1, ConstantInt *In2,
+                           bool IsSigned) {
+  if (!IsSigned)
+    return Result->getValue().ugt(In1->getValue());
+
+  if (In2->isNegative())
+    return Result->getValue().slt(In1->getValue());
+
+  return Result->getValue().sgt(In1->getValue());
+}
+
+/// SubWithOverflow - Compute Result = In1-In2, returning true if the result
+/// overflowed for this type.
+static bool SubWithOverflow(Constant *&Result, Constant *In1,
+                            Constant *In2, bool IsSigned = false) {
+  Result = ConstantExpr::getSub(In1, In2);
+
+  if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
+    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
+      Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
+      if (HasSubOverflow(ExtractElement(Result, Idx),
+                         ExtractElement(In1, Idx),
+                         ExtractElement(In2, Idx),
+                         IsSigned))
+        return true;
+    }
+    return false;
+  }
+
+  return HasSubOverflow(cast<ConstantInt>(Result),
+                        cast<ConstantInt>(In1), cast<ConstantInt>(In2),
+                        IsSigned);
+}
+
+/// isSignBitCheck - Given an exploded icmp instruction, return true if the
+/// comparison only checks the sign bit.  If it only checks the sign bit, set
+/// TrueIfSigned if the result of the comparison is true when the input value is
+/// signed.
+static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
+                           bool &TrueIfSigned) {
+  switch (pred) {
+  case ICmpInst::ICMP_SLT:   // True if LHS s< 0
+    TrueIfSigned = true;
+    return RHS->isZero();
+  case ICmpInst::ICMP_SLE:   // True if LHS s<= RHS and RHS == -1
+    TrueIfSigned = true;
+    return RHS->isAllOnesValue();
+  case ICmpInst::ICMP_SGT:   // True if LHS s> -1
+    TrueIfSigned = false;
+    return RHS->isAllOnesValue();
+  case ICmpInst::ICMP_UGT:
+    // True if LHS u> RHS and RHS == high-bit-mask - 1
+    TrueIfSigned = true;
+    return RHS->isMaxValue(true);
+  case ICmpInst::ICMP_UGE:
+    // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc)
+    TrueIfSigned = true;
+    return RHS->getValue().isSignBit();
+  default:
+    return false;
+  }
+}
+
+/// Returns true if the exploded icmp can be expressed as a signed comparison
+/// to zero and updates the predicate accordingly.
+/// The signedness of the comparison is preserved.
+static bool isSignTest(ICmpInst::Predicate &pred, const ConstantInt *RHS) {
+  if (!ICmpInst::isSigned(pred))
+    return false;
+
+  if (RHS->isZero())
+    return ICmpInst::isRelational(pred);
+
+  if (RHS->isOne()) {
+    if (pred == ICmpInst::ICMP_SLT) {
+      pred = ICmpInst::ICMP_SLE;
+      return true;
+    }
+  } else if (RHS->isAllOnesValue()) {
+    if (pred == ICmpInst::ICMP_SGT) {
+      pred = ICmpInst::ICMP_SGE;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// isHighOnes - Return true if the constant is of the form 1+0+.
+// This is the same as lowones(~X).
+static bool isHighOnes(const ConstantInt *CI) {
+  return (~CI->getValue() + 1).isPowerOf2();
+}
+
+/// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a
+/// set of known zero and one bits, compute the maximum and minimum values that
+/// could have the specified known zero and known one bits, returning them in
+/// min/max.
+static void ComputeSignedMinMaxValuesFromKnownBits(const APInt& KnownZero,
+                                                   const APInt& KnownOne,
+                                                   APInt& Min, APInt& Max) {
+  assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
+         KnownZero.getBitWidth() == Min.getBitWidth() &&
+         KnownZero.getBitWidth() == Max.getBitWidth() &&
+         "KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+  APInt UnknownBits = ~(KnownZero|KnownOne);
+
+  // The minimum value is when all unknown bits are zeros, EXCEPT for the sign
+  // bit if it is unknown.
+  Min = KnownOne;
+  Max = KnownOne|UnknownBits;
+
+  if (UnknownBits.isNegative()) { // Sign bit is unknown
+    Min.setBit(Min.getBitWidth()-1);
+    Max.clearBit(Max.getBitWidth()-1);
+  }
+}
+
+// ComputeUnsignedMinMaxValuesFromKnownBits - Given an unsigned integer type and
+// a set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.
+static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
+                                                     const APInt &KnownOne,
+                                                     APInt &Min, APInt &Max) {
+  assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
+         KnownZero.getBitWidth() == Min.getBitWidth() &&
+         KnownZero.getBitWidth() == Max.getBitWidth() &&
+         "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+  APInt UnknownBits = ~(KnownZero|KnownOne);
+
+  // The minimum value is when the unknown bits are all zeros.
+  Min = KnownOne;
+  // The maximum value is when the unknown bits are all ones.
+  Max = KnownOne|UnknownBits;
+}
+
+
+
+/// FoldCmpLoadFromIndexedGlobal - Called we see this pattern:
+///   cmp pred (load (gep GV, ...)), cmpcst
+/// where GV is a global variable with a constant initializer.  Try to simplify
+/// this into some simple computation that does not need the load.  For example
+/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
+///
+/// If AndCst is non-null, then the loaded value is masked with that constant
+/// before doing the comparison.  This handles cases like "A[i]&4 == 0".
+Instruction *InstCombiner::
+FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
+                             CmpInst &ICI, ConstantInt *AndCst) {
+  // We need TD information to know the pointer size unless this is inbounds.
+  if (!GEP->isInBounds() && TD == 0) return 0;
+
+  Constant *Init = GV->getInitializer();
+  if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
+    return 0;
+  
+  uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+  if (ArrayElementCount > 1024) return 0;  // Don't blow up on huge arrays.
+
+  // There are many forms of this optimization we can handle, for now, just do
+  // the simple index into a single-dimensional array.
+  //
+  // Require: GEP GV, 0, i {{, constant indices}}
+  if (GEP->getNumOperands() < 3 ||
+      !isa<ConstantInt>(GEP->getOperand(1)) ||
+      !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
+      isa<Constant>(GEP->getOperand(2)))
+    return 0;
+
+  // Check that indices after the variable are constants and in-range for the
+  // type they index.  Collect the indices.  This is typically for arrays of
+  // structs.
+  SmallVector<unsigned, 4> LaterIndices;
+
+  Type *EltTy = Init->getType()->getArrayElementType();
+  for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
+    ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (Idx == 0) return 0;  // Variable index.
+
+    uint64_t IdxVal = Idx->getZExtValue();
+    if ((unsigned)IdxVal != IdxVal) return 0; // Too large array index.
+
+    if (StructType *STy = dyn_cast<StructType>(EltTy))
+      EltTy = STy->getElementType(IdxVal);
+    else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
+      if (IdxVal >= ATy->getNumElements()) return 0;
+      EltTy = ATy->getElementType();
+    } else {
+      return 0; // Unknown type.
+    }
+
+    LaterIndices.push_back(IdxVal);
+  }
+
+  enum { Overdefined = -3, Undefined = -2 };
+
+  // Variables for our state machines.
+
+  // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
+  // "i == 47 | i == 87", where 47 is the first index the condition is true for,
+  // and 87 is the second (and last) index.  FirstTrueElement is -2 when
+  // undefined, otherwise set to the first true element.  SecondTrueElement is
+  // -2 when undefined, -3 when overdefined and >= 0 when that index is true.
+  int FirstTrueElement = Undefined, SecondTrueElement = Undefined;
+
+  // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
+  // form "i != 47 & i != 87".  Same state transitions as for true elements.
+  int FirstFalseElement = Undefined, SecondFalseElement = Undefined;
+
+  /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
+  /// define a state machine that triggers for ranges of values that the index
+  /// is true or false for.  This triggers on things like "abbbbc"[i] == 'b'.
+  /// This is -2 when undefined, -3 when overdefined, and otherwise the last
+  /// index in the range (inclusive).  We use -2 for undefined here because we
+  /// use relative comparisons and don't want 0-1 to match -1.
+  int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;
+
+  // MagicBitvector - This is a magic bitvector where we set a bit if the
+  // comparison is true for element 'i'.  If there are 64 elements or less in
+  // the array, this will fully represent all the comparison results.
+  uint64_t MagicBitvector = 0;
+
+
+  // Scan the array and see if one of our patterns matches.
+  Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
+  for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
+    Constant *Elt = Init->getAggregateElement(i);
+    if (Elt == 0) return 0;
+
+    // If this is indexing an array of structures, get the structure element.
+    if (!LaterIndices.empty())
+      Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);
+
+    // If the element is masked, handle it.
+    if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);
+
+    // Find out if the comparison would be true or false for the i'th element.
+    Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
+                                                  CompareRHS, TD, TLI);
+    // If the result is undef for this element, ignore it.
+    if (isa<UndefValue>(C)) {
+      // Extend range state machines to cover this element in case there is an
+      // undef in the middle of the range.
+      if (TrueRangeEnd == (int)i-1)
+        TrueRangeEnd = i;
+      if (FalseRangeEnd == (int)i-1)
+        FalseRangeEnd = i;
+      continue;
+    }
+
+    // If we can't compute the result for any of the elements, we have to give
+    // up evaluating the entire conditional.
+    if (!isa<ConstantInt>(C)) return 0;
+
+    // Otherwise, we know if the comparison is true or false for this element,
+    // update our state machines.
+    bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();
+
+    // State machine for single/double/range index comparison.
+    if (IsTrueForElt) {
+      // Update the TrueElement state machine.
+      if (FirstTrueElement == Undefined)
+        FirstTrueElement = TrueRangeEnd = i;  // First true element.
+      else {
+        // Update double-compare state machine.
+        if (SecondTrueElement == Undefined)
+          SecondTrueElement = i;
+        else
+          SecondTrueElement = Overdefined;
+
+        // Update range state machine.
+        if (TrueRangeEnd == (int)i-1)
+          TrueRangeEnd = i;
+        else
+          TrueRangeEnd = Overdefined;
+      }
+    } else {
+      // Update the FalseElement state machine.
+      if (FirstFalseElement == Undefined)
+        FirstFalseElement = FalseRangeEnd = i; // First false element.
+      else {
+        // Update double-compare state machine.
+        if (SecondFalseElement == Undefined)
+          SecondFalseElement = i;
+        else
+          SecondFalseElement = Overdefined;
+
+        // Update range state machine.
+        if (FalseRangeEnd == (int)i-1)
+          FalseRangeEnd = i;
+        else
+          FalseRangeEnd = Overdefined;
+      }
+    }
+
+
+    // If this element is in range, update our magic bitvector.
+    if (i < 64 && IsTrueForElt)
+      MagicBitvector |= 1ULL << i;
+
+    // If all of our states become overdefined, bail out early.  Since the
+    // predicate is expensive, only check it every 8 elements.  This is only
+    // really useful for really huge arrays.
+    if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined &&
+        SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined &&
+        FalseRangeEnd == Overdefined)
+      return 0;
+  }
+
+  // Now that we've scanned the entire array, emit our new comparison(s).  We
+  // order the state machines in complexity of the generated code.
+  Value *Idx = GEP->getOperand(2);
+
+  // If the index is larger than the pointer size of the target, truncate the
+  // index down like the GEP would do implicitly.  We don't have to do this for
+  // an inbounds GEP because the index can't be out of range.
+  if (!GEP->isInBounds() &&
+      Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits())
+    Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext()));
+
+  // If the comparison is only true for one or two elements, emit direct
+  // comparisons.
+  if (SecondTrueElement != Overdefined) {
+    // None true -> false.
+    if (FirstTrueElement == Undefined)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(GEP->getContext()));
+
+    Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
+
+    // True for one element -> 'i == 47'.
+    if (SecondTrueElement == Undefined)
+      return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);
+
+    // True for two elements -> 'i == 47 | i == 72'.
+    Value *C1 = Builder->CreateICmpEQ(Idx, FirstTrueIdx);
+    Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
+    Value *C2 = Builder->CreateICmpEQ(Idx, SecondTrueIdx);
+    return BinaryOperator::CreateOr(C1, C2);
+  }
+
+  // If the comparison is only false for one or two elements, emit direct
+  // comparisons.
+  if (SecondFalseElement != Overdefined) {
+    // None false -> true.
+    if (FirstFalseElement == Undefined)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(GEP->getContext()));
+
+    Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
+
+    // False for one element -> 'i != 47'.
+    if (SecondFalseElement == Undefined)
+      return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);
+
+    // False for two elements -> 'i != 47 & i != 72'.
+    Value *C1 = Builder->CreateICmpNE(Idx, FirstFalseIdx);
+    Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
+    Value *C2 = Builder->CreateICmpNE(Idx, SecondFalseIdx);
+    return BinaryOperator::CreateAnd(C1, C2);
+  }
+
+  // If the comparison can be replaced with a range comparison for the elements
+  // where it is true, emit the range check.
+  if (TrueRangeEnd != Overdefined) {
+    assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");
+
+    // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
+    if (FirstTrueElement) {
+      Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
+      Idx = Builder->CreateAdd(Idx, Offs);
+    }
+
+    Value *End = ConstantInt::get(Idx->getType(),
+                                  TrueRangeEnd-FirstTrueElement+1);
+    return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
+  }
+
+  // False range check.
+  if (FalseRangeEnd != Overdefined) {
+    assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
+    // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
+    if (FirstFalseElement) {
+      Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
+      Idx = Builder->CreateAdd(Idx, Offs);
+    }
+
+    Value *End = ConstantInt::get(Idx->getType(),
+                                  FalseRangeEnd-FirstFalseElement);
+    return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
+  }
+
+
+  // If a magic bitvector captures the entire comparison state
+  // of this load, replace it with computation that does:
+  //   ((magic_cst >> i) & 1) != 0
+  {
+    Type *Ty = 0;
+
+    // Look for an appropriate type:
+    // - The type of Idx if the magic fits
+    // - The smallest fitting legal type if we have a DataLayout
+    // - Default to i32
+    if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
+      Ty = Idx->getType();
+    else if (TD)
+      Ty = TD->getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
+    else if (ArrayElementCount <= 32)
+      Ty = Type::getInt32Ty(Init->getContext());
+
+    if (Ty != 0) {
+      Value *V = Builder->CreateIntCast(Idx, Ty, false);
+      V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
+      V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V);
+      return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
+    }
+  }
+
+  return 0;
+}
+
+
+/// EvaluateGEPOffsetExpression - Return a value that can be used to compare
+/// the *offset* implied by a GEP to zero.  For example, if we have &A[i], we
+/// want to return 'i' for "icmp ne i, 0".  Note that, in general, indices can
+/// be complex, and scales are involved.  The above expression would also be
+/// legal to codegen as "icmp ne (i*4), 0" (assuming A is a pointer to i32).
+/// This later form is less amenable to optimization though, and we are allowed
+/// to generate the first by knowing that pointer arithmetic doesn't overflow.
+///
+/// If we can't emit an optimized form for this expression, this returns null.
+///
+static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
+  DataLayout &TD = *IC.getDataLayout();
+  gep_type_iterator GTI = gep_type_begin(GEP);
+
+  // Check to see if this gep only has a single variable index.  If so, and if
+  // any constant indices are a multiple of its scale, then we can compute this
+  // in terms of the scale of the variable index.  For example, if the GEP
+  // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
+  // because the expression will cross zero at the same point.
+  unsigned i, e = GEP->getNumOperands();
+  int64_t Offset = 0;
+  for (i = 1; i != e; ++i, ++GTI) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+      // Compute the aggregate offset of constant indices.
+      if (CI->isZero()) continue;
+
+      // Handle a struct index, which adds its field offset to the pointer.
+      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+        Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+      } else {
+        uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+        Offset += Size*CI->getSExtValue();
+      }
+    } else {
+      // Found our variable index.
+      break;
+    }
+  }
+
+  // If there are no variable indices, we must have a constant offset, just
+  // evaluate it the general way.
+  if (i == e) return 0;
+
+  Value *VariableIdx = GEP->getOperand(i);
+  // Determine the scale factor of the variable element.  For example, this is
+  // 4 if the variable index is into an array of i32.
+  uint64_t VariableScale = TD.getTypeAllocSize(GTI.getIndexedType());
+
+  // Verify that there are no other variable indices.  If so, emit the hard way.
+  for (++i, ++GTI; i != e; ++i, ++GTI) {
+    ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (!CI) return 0;
+
+    // Compute the aggregate offset of constant indices.
+    if (CI->isZero()) continue;
+
+    // Handle a struct index, which adds its field offset to the pointer.
+    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+      Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+    } else {
+      uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+      Offset += Size*CI->getSExtValue();
+    }
+  }
+
+  // Okay, we know we have a single variable index, which must be a
+  // pointer/array/vector index.  If there is no offset, life is simple, return
+  // the index.
+  unsigned IntPtrWidth = TD.getPointerSizeInBits();
+  if (Offset == 0) {
+    // Cast to intptrty in case a truncation occurs.  If an extension is needed,
+    // we don't need to bother extending: the extension won't affect where the
+    // computation crosses zero.
+    if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
+      Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
+      VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy);
+    }
+    return VariableIdx;
+  }
+
+  // Otherwise, there is an index.  The computation we will do will be modulo
+  // the pointer size, so get it.
+  uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
+
+  Offset &= PtrSizeMask;
+  VariableScale &= PtrSizeMask;
+
+  // To do this transformation, any constant index must be a multiple of the
+  // variable scale factor.  For example, we can evaluate "12 + 4*i" as "3 + i",
+  // but we can't evaluate "10 + 3*i" in terms of i.  Check that the offset is a
+  // multiple of the variable scale.
+  int64_t NewOffs = Offset / (int64_t)VariableScale;
+  if (Offset != NewOffs*(int64_t)VariableScale)
+    return 0;
+
+  // Okay, we can do this evaluation.  Start by converting the index to intptr.
+  Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
+  if (VariableIdx->getType() != IntPtrTy)
+    VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy,
+                                            true /*Signed*/);
+  Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
+  return IC.Builder->CreateAdd(VariableIdx, OffsetVal, "offset");
+}
+
+/// FoldGEPICmp - Fold comparisons between a GEP instruction and something
+/// else.  At this point we know that the GEP is on the LHS of the comparison.
+Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+                                       ICmpInst::Predicate Cond,
+                                       Instruction &I) {
+  // Don't transform signed compares of GEPs into index compares. Even if the
+  // GEP is inbounds, the final add of the base pointer can have signed overflow
+  // and would change the result of the icmp.
+  // e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be
+  // the maximum signed value for the pointer type.
+  if (ICmpInst::isSigned(Cond))
+    return 0;
+
+  // Look through bitcasts.
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(RHS))
+    RHS = BCI->getOperand(0);
+
+  Value *PtrBase = GEPLHS->getOperand(0);
+  if (TD && PtrBase == RHS && GEPLHS->isInBounds()) {
+    // ((gep Ptr, OFFSET) cmp Ptr)   ---> (OFFSET cmp 0).
+    // This transformation (ignoring the base and scales) is valid because we
+    // know pointers can't overflow since the gep is inbounds.  See if we can
+    // output an optimized form.
+    Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this);
+
+    // If not, synthesize the offset the hard way.
+    if (Offset == 0)
+      Offset = EmitGEPOffset(GEPLHS);
+    return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
+                        Constant::getNullValue(Offset->getType()));
+  } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
+    // If the base pointers are different, but the indices are the same, just
+    // compare the base pointer.
+    if (PtrBase != GEPRHS->getOperand(0)) {
+      bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
+      IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
+                        GEPRHS->getOperand(0)->getType();
+      if (IndicesTheSame)
+        for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+          if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+            IndicesTheSame = false;
+            break;
+          }
+
+      // If all indices are the same, just compare the base pointers.
+      if (IndicesTheSame)
+        return new ICmpInst(ICmpInst::getSignedPredicate(Cond),
+                            GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+
+      // If we're comparing GEPs with two base pointers that only differ in type
+      // and both GEPs have only constant indices or just one use, then fold
+      // the compare with the adjusted indices.
+      if (TD && GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
+          (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
+          (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
+          PtrBase->stripPointerCasts() ==
+            GEPRHS->getOperand(0)->stripPointerCasts()) {
+        Value *Cmp = Builder->CreateICmp(ICmpInst::getSignedPredicate(Cond),
+                                         EmitGEPOffset(GEPLHS),
+                                         EmitGEPOffset(GEPRHS));
+        return ReplaceInstUsesWith(I, Cmp);
+      }
+
+      // Otherwise, the base pointers are different and the indices are
+      // different, bail out.
+      return 0;
+    }
+
+    // If one of the GEPs has all zero indices, recurse.
+    bool AllZeros = true;
+    for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPLHS->getOperand(i)) ||
+          !cast<Constant>(GEPLHS->getOperand(i))->isNullValue()) {
+        AllZeros = false;
+        break;
+      }
+    if (AllZeros)
+      return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
+                          ICmpInst::getSwappedPredicate(Cond), I);
+
+    // If the other GEP has all zero indices, recurse.
+    AllZeros = true;
+    for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPRHS->getOperand(i)) ||
+          !cast<Constant>(GEPRHS->getOperand(i))->isNullValue()) {
+        AllZeros = false;
+        break;
+      }
+    if (AllZeros)
+      return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
+
+    bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
+    if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
+      // If the GEPs only differ by one index, compare it.
+      unsigned NumDifferences = 0;  // Keep track of # differences.
+      unsigned DiffOperand = 0;     // The operand that differs.
+      for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+        if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+          if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() !=
+                   GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) {
+            // Irreconcilable differences.
+            NumDifferences = 2;
+            break;
+          } else {
+            if (NumDifferences++) break;
+            DiffOperand = i;
+          }
+        }
+
+      if (NumDifferences == 0)   // SAME GEP?
+        return ReplaceInstUsesWith(I, // No comparison is needed here.
+                               ConstantInt::get(Type::getInt1Ty(I.getContext()),
+                                             ICmpInst::isTrueWhenEqual(Cond)));
+
+      else if (NumDifferences == 1 && GEPsInBounds) {
+        Value *LHSV = GEPLHS->getOperand(DiffOperand);
+        Value *RHSV = GEPRHS->getOperand(DiffOperand);
+        // Make sure we do a signed comparison here.
+        return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
+      }
+    }
+
+    // Only lower this if the icmp is the only user of the GEP or if we expect
+    // the result to fold to a constant!
+    if (TD &&
+        GEPsInBounds &&
+        (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
+        (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
+      // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)  --->  (OFFSET1 cmp OFFSET2)
+      Value *L = EmitGEPOffset(GEPLHS);
+      Value *R = EmitGEPOffset(GEPRHS);
+      return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
+    }
+  }
+  return 0;
+}
+
+/// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X".
+Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
+                                            Value *X, ConstantInt *CI,
+                                            ICmpInst::Predicate Pred,
+                                            Value *TheAdd) {
+  // If we have X+0, exit early (simplifying logic below) and let it get folded
+  // elsewhere.   icmp X+0, X  -> icmp X, X
+  if (CI->isZero()) {
+    bool isTrue = ICmpInst::isTrueWhenEqual(Pred);
+    return ReplaceInstUsesWith(ICI, ConstantInt::get(ICI.getType(), isTrue));
+  }
+
+  // (X+4) == X -> false.
+  if (Pred == ICmpInst::ICMP_EQ)
+    return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext()));
+
+  // (X+4) != X -> true.
+  if (Pred == ICmpInst::ICMP_NE)
+    return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext()));
+
+  // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
+  // so the values can never be equal.  Similarly for all other "or equals"
+  // operators.
+
+  // (X+1) <u X        --> X >u (MAXUINT-1)        --> X == 255
+  // (X+2) <u X        --> X >u (MAXUINT-2)        --> X > 253
+  // (X+MAXUINT) <u X  --> X >u (MAXUINT-MAXUINT)  --> X != 0
+  if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) {
+    Value *R =
+      ConstantExpr::getSub(ConstantInt::getAllOnesValue(CI->getType()), CI);
+    return new ICmpInst(ICmpInst::ICMP_UGT, X, R);
+  }
+
+  // (X+1) >u X        --> X <u (0-1)        --> X != 255
+  // (X+2) >u X        --> X <u (0-2)        --> X <u 254
+  // (X+MAXUINT) >u X  --> X <u (0-MAXUINT)  --> X <u 1  --> X == 0
+  if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)
+    return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantExpr::getNeg(CI));
+
+  unsigned BitWidth = CI->getType()->getPrimitiveSizeInBits();
+  ConstantInt *SMax = ConstantInt::get(X->getContext(),
+                                       APInt::getSignedMaxValue(BitWidth));
+
+  // (X+ 1) <s X       --> X >s (MAXSINT-1)          --> X == 127
+  // (X+ 2) <s X       --> X >s (MAXSINT-2)          --> X >s 125
+  // (X+MAXSINT) <s X  --> X >s (MAXSINT-MAXSINT)    --> X >s 0
+  // (X+MINSINT) <s X  --> X >s (MAXSINT-MINSINT)    --> X >s -1
+  // (X+ -2) <s X      --> X >s (MAXSINT- -2)        --> X >s 126
+  // (X+ -1) <s X      --> X >s (MAXSINT- -1)        --> X != 127
+  if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+    return new ICmpInst(ICmpInst::ICMP_SGT, X, ConstantExpr::getSub(SMax, CI));
+
+  // (X+ 1) >s X       --> X <s (MAXSINT-(1-1))       --> X != 127
+  // (X+ 2) >s X       --> X <s (MAXSINT-(2-1))       --> X <s 126
+  // (X+MAXSINT) >s X  --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
+  // (X+MINSINT) >s X  --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2
+  // (X+ -2) >s X      --> X <s (MAXSINT-(-2-1))      --> X <s -126
+  // (X+ -1) >s X      --> X <s (MAXSINT-(-1-1))      --> X == -128
+
+  assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
+  Constant *C = ConstantInt::get(X->getContext(), CI->getValue()-1);
+  return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C));
+}
+
+/// FoldICmpDivCst - Fold "icmp pred, ([su]div X, DivRHS), CmpRHS" where DivRHS
+/// and CmpRHS are both known to be integer constants.
+Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
+                                          ConstantInt *DivRHS) {
+  ConstantInt *CmpRHS = cast<ConstantInt>(ICI.getOperand(1));
+  const APInt &CmpRHSV = CmpRHS->getValue();
+
+  // FIXME: If the operand types don't match the type of the divide
+  // then don't attempt this transform. The code below doesn't have the
+  // logic to deal with a signed divide and an unsigned compare (and
+  // vice versa). This is because (x /s C1) <s C2  produces different
+  // results than (x /s C1) <u C2 or (x /u C1) <s C2 or even
+  // (x /u C1) <u C2.  Simply casting the operands and result won't
+  // work. :(  The if statement below tests that condition and bails
+  // if it finds it.
+  bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv;
+  if (!ICI.isEquality() && DivIsSigned != ICI.isSigned())
+    return 0;
+  if (DivRHS->isZero())
+    return 0; // The ProdOV computation fails on divide by zero.
+  if (DivIsSigned && DivRHS->isAllOnesValue())
+    return 0; // The overflow computation also screws up here
+  if (DivRHS->isOne()) {
+    // This eliminates some funny cases with INT_MIN.
+    ICI.setOperand(0, DivI->getOperand(0));   // X/1 == X.
+    return &ICI;
+  }
+
+  // Compute Prod = CI * DivRHS. We are essentially solving an equation
+  // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and
+  // C2 (CI). By solving for X we can turn this into a range check
+  // instead of computing a divide.
+  Constant *Prod = ConstantExpr::getMul(CmpRHS, DivRHS);
+
+  // Determine if the product overflows by seeing if the product is
+  // not equal to the divide. Make sure we do the same kind of divide
+  // as in the LHS instruction that we're folding.
+  bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS) :
+                 ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS;
+
+  // Get the ICmp opcode
+  ICmpInst::Predicate Pred = ICI.getPredicate();
+
+  /// If the division is known to be exact, then there is no remainder from the
+  /// divide, so the covered range size is unit, otherwise it is the divisor.
+  ConstantInt *RangeSize = DivI->isExact() ? getOne(Prod) : DivRHS;
+
+  // Figure out the interval that is being checked.  For example, a comparison
+  // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
+  // Compute this interval based on the constants involved and the signedness of
+  // the compare/divide.  This computes a half-open interval, keeping track of
+  // whether either value in the interval overflows.  After analysis each
+  // overflow variable is set to 0 if it's corresponding bound variable is valid
+  // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
+  int LoOverflow = 0, HiOverflow = 0;
+  Constant *LoBound = 0, *HiBound = 0;
+
+  if (!DivIsSigned) {  // udiv
+    // e.g. X/5 op 3  --> [15, 20)
+    LoBound = Prod;
+    HiOverflow = LoOverflow = ProdOV;
+    if (!HiOverflow) {
+      // If this is not an exact divide, then many values in the range collapse
+      // to the same result value.
+      HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false);
+    }
+
+  } else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0.
+    if (CmpRHSV == 0) {       // (X / pos) op 0
+      // Can't overflow.  e.g.  X/2 op 0 --> [-1, 2)
+      LoBound = ConstantExpr::getNeg(SubOne(RangeSize));
+      HiBound = RangeSize;
+    } else if (CmpRHSV.isStrictlyPositive()) {   // (X / pos) op pos
+      LoBound = Prod;     // e.g.   X/5 op 3 --> [15, 20)
+      HiOverflow = LoOverflow = ProdOV;
+      if (!HiOverflow)
+        HiOverflow = AddWithOverflow(HiBound, Prod, RangeSize, true);
+    } else {                       // (X / pos) op neg
+      // e.g. X/5 op -3  --> [-15-4, -15+1) --> [-19, -14)
+      HiBound = AddOne(Prod);
+      LoOverflow = HiOverflow = ProdOV ? -1 : 0;
+      if (!LoOverflow) {
+        ConstantInt *DivNeg =cast<ConstantInt>(ConstantExpr::getNeg(RangeSize));
+        LoOverflow = AddWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
+      }
+    }
+  } else if (DivRHS->isNegative()) { // Divisor is < 0.
+    if (DivI->isExact())
+      RangeSize = cast<ConstantInt>(ConstantExpr::getNeg(RangeSize));
+    if (CmpRHSV == 0) {       // (X / neg) op 0
+      // e.g. X/-5 op 0  --> [-4, 5)
+      LoBound = AddOne(RangeSize);
+      HiBound = cast<ConstantInt>(ConstantExpr::getNeg(RangeSize));
+      if (HiBound == DivRHS) {     // -INTMIN = INTMIN
+        HiOverflow = 1;            // [INTMIN+1, overflow)
+        HiBound = 0;               // e.g. X/INTMIN = 0 --> X > INTMIN
+      }
+    } else if (CmpRHSV.isStrictlyPositive()) {   // (X / neg) op pos
+      // e.g. X/-5 op 3  --> [-19, -14)
+      HiBound = AddOne(Prod);
+      HiOverflow = LoOverflow = ProdOV ? -1 : 0;
+      if (!LoOverflow)
+        LoOverflow = AddWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
+    } else {                       // (X / neg) op neg
+      LoBound = Prod;       // e.g. X/-5 op -3  --> [15, 20)
+      LoOverflow = HiOverflow = ProdOV;
+      if (!HiOverflow)
+        HiOverflow = SubWithOverflow(HiBound, Prod, RangeSize, true);
+    }
+
+    // Dividing by a negative swaps the condition.  LT <-> GT
+    Pred = ICmpInst::getSwappedPredicate(Pred);
+  }
+
+  Value *X = DivI->getOperand(0);
+  switch (Pred) {
+  default: llvm_unreachable("Unhandled icmp opcode!");
+  case ICmpInst::ICMP_EQ:
+    if (LoOverflow && HiOverflow)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext()));
+    if (HiOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+                          ICmpInst::ICMP_UGE, X, LoBound);
+    if (LoOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+                          ICmpInst::ICMP_ULT, X, HiBound);
+    return ReplaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
+                                                    DivIsSigned, true));
+  case ICmpInst::ICMP_NE:
+    if (LoOverflow && HiOverflow)
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext()));
+    if (HiOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+                          ICmpInst::ICMP_ULT, X, LoBound);
+    if (LoOverflow)
+      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+                          ICmpInst::ICMP_UGE, X, HiBound);
+    return ReplaceInstUsesWith(ICI, InsertRangeTest(X, LoBound, HiBound,
+                                                    DivIsSigned, false));
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+    if (LoOverflow == +1)   // Low bound is greater than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext()));
+    if (LoOverflow == -1)   // Low bound is less than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext()));
+    return new ICmpInst(Pred, X, LoBound);
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT:
+    if (HiOverflow == +1)       // High bound greater than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext()));
+    if (HiOverflow == -1)       // High bound less than input range.
+      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext()));
+    if (Pred == ICmpInst::ICMP_UGT)
+      return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
+    return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
+  }
+}
+
+/// FoldICmpShrCst - Handle "icmp(([al]shr X, cst1), cst2)".
+Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
+                                          ConstantInt *ShAmt) {
+  const APInt &CmpRHSV = cast<ConstantInt>(ICI.getOperand(1))->getValue();
+
+  // Check that the shift amount is in range.  If not, don't perform
+  // undefined shifts.  When the shift is visited it will be
+  // simplified.
+  uint32_t TypeBits = CmpRHSV.getBitWidth();
+  uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
+  if (ShAmtVal >= TypeBits || ShAmtVal == 0)
+    return 0;
+
+  if (!ICI.isEquality()) {
+    // If we have an unsigned comparison and an ashr, we can't simplify this.
+    // Similarly for signed comparisons with lshr.
+    if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr))
+      return 0;
+
+    // Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv
+    // by a power of 2.  Since we already have logic to simplify these,
+    // transform to div and then simplify the resultant comparison.
+    if (Shr->getOpcode() == Instruction::AShr &&
+        (!Shr->isExact() || ShAmtVal == TypeBits - 1))
+      return 0;
+
+    // Revisit the shift (to delete it).
+    Worklist.Add(Shr);
+
+    Constant *DivCst =
+      ConstantInt::get(Shr->getType(), APInt::getOneBitSet(TypeBits, ShAmtVal));
+
+    Value *Tmp =
+      Shr->getOpcode() == Instruction::AShr ?
+      Builder->CreateSDiv(Shr->getOperand(0), DivCst, "", Shr->isExact()) :
+      Builder->CreateUDiv(Shr->getOperand(0), DivCst, "", Shr->isExact());
+
+    ICI.setOperand(0, Tmp);
+
+    // If the builder folded the binop, just return it.
+    BinaryOperator *TheDiv = dyn_cast<BinaryOperator>(Tmp);
+    if (TheDiv == 0)
+      return &ICI;
+
+    // Otherwise, fold this div/compare.
+    assert(TheDiv->getOpcode() == Instruction::SDiv ||
+           TheDiv->getOpcode() == Instruction::UDiv);
+
+    Instruction *Res = FoldICmpDivCst(ICI, TheDiv, cast<ConstantInt>(DivCst));
+    assert(Res && "This div/cst should have folded!");
+    return Res;
+  }
+
+
+  // If we are comparing against bits always shifted out, the
+  // comparison cannot succeed.
+  APInt Comp = CmpRHSV << ShAmtVal;
+  ConstantInt *ShiftedCmpRHS = ConstantInt::get(ICI.getContext(), Comp);
+  if (Shr->getOpcode() == Instruction::LShr)
+    Comp = Comp.lshr(ShAmtVal);
+  else
+    Comp = Comp.ashr(ShAmtVal);
+
+  if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero.
+    bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+    Constant *Cst = ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
+                                     IsICMP_NE);
+    return ReplaceInstUsesWith(ICI, Cst);
+  }
+
+  // Otherwise, check to see if the bits shifted out are known to be zero.
+  // If so, we can compare against the unshifted value:
+  //  (X & 4) >> 1 == 2  --> (X & 4) == 4.
+  if (Shr->hasOneUse() && Shr->isExact())
+    return new ICmpInst(ICI.getPredicate(), Shr->getOperand(0), ShiftedCmpRHS);
+
+  if (Shr->hasOneUse()) {
+    // Otherwise strength reduce the shift into an and.
+    APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
+    Constant *Mask = ConstantInt::get(ICI.getContext(), Val);
+
+    Value *And = Builder->CreateAnd(Shr->getOperand(0),
+                                    Mask, Shr->getName()+".mask");
+    return new ICmpInst(ICI.getPredicate(), And, ShiftedCmpRHS);
+  }
+  return 0;
+}
+
+
+/// visitICmpInstWithInstAndIntCst - Handle "icmp (instr, intcst)".
+///
+Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
+                                                          Instruction *LHSI,
+                                                          ConstantInt *RHS) {
+  const APInt &RHSV = RHS->getValue();
+
+  switch (LHSI->getOpcode()) {
+  case Instruction::Trunc:
+    if (ICI.isEquality() && LHSI->hasOneUse()) {
+      // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
+      // of the high bits truncated out of x are known.
+      unsigned DstBits = LHSI->getType()->getPrimitiveSizeInBits(),
+             SrcBits = LHSI->getOperand(0)->getType()->getPrimitiveSizeInBits();
+      APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
+      ComputeMaskedBits(LHSI->getOperand(0), KnownZero, KnownOne);
+
+      // If all the high bits are known, we can do this xform.
+      if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) {
+        // Pull in the high bits from known-ones set.
+        APInt NewRHS = RHS->getValue().zext(SrcBits);
+        NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits);
+        return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+                            ConstantInt::get(ICI.getContext(), NewRHS));
+      }
+    }
+    break;
+
+  case Instruction::Xor:         // (icmp pred (xor X, XorCST), CI)
+    if (ConstantInt *XorCST = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
+      // If this is a comparison that tests the signbit (X < 0) or (x > -1),
+      // fold the xor.
+      if ((ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0) ||
+          (ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue())) {
+        Value *CompareVal = LHSI->getOperand(0);
+
+        // If the sign bit of the XorCST is not set, there is no change to
+        // the operation, just stop using the Xor.
+        if (!XorCST->isNegative()) {
+          ICI.setOperand(0, CompareVal);
+          Worklist.Add(LHSI);
+          return &ICI;
+        }
+
+        // Was the old condition true if the operand is positive?
+        bool isTrueIfPositive = ICI.getPredicate() == ICmpInst::ICMP_SGT;
+
+        // If so, the new one isn't.
+        isTrueIfPositive ^= true;
+
+        if (isTrueIfPositive)
+          return new ICmpInst(ICmpInst::ICMP_SGT, CompareVal,
+                              SubOne(RHS));
+        else
+          return new ICmpInst(ICmpInst::ICMP_SLT, CompareVal,
+                              AddOne(RHS));
+      }
+
+      if (LHSI->hasOneUse()) {
+        // (icmp u/s (xor A SignBit), C) -> (icmp s/u A, (xor C SignBit))
+        if (!ICI.isEquality() && XorCST->getValue().isSignBit()) {
+          const APInt &SignBit = XorCST->getValue();
+          ICmpInst::Predicate Pred = ICI.isSigned()
+                                         ? ICI.getUnsignedPredicate()
+                                         : ICI.getSignedPredicate();
+          return new ICmpInst(Pred, LHSI->getOperand(0),
+                              ConstantInt::get(ICI.getContext(),
+                                               RHSV ^ SignBit));
+        }
+
+        // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A)
+        if (!ICI.isEquality() && XorCST->isMaxValue(true)) {
+          const APInt &NotSignBit = XorCST->getValue();
+          ICmpInst::Predicate Pred = ICI.isSigned()
+                                         ? ICI.getUnsignedPredicate()
+                                         : ICI.getSignedPredicate();
+          Pred = ICI.getSwappedPredicate(Pred);
+          return new ICmpInst(Pred, LHSI->getOperand(0),
+                              ConstantInt::get(ICI.getContext(),
+                                               RHSV ^ NotSignBit));
+        }
+      }
+    }
+    break;
+  case Instruction::And:         // (icmp pred (and X, AndCST), RHS)
+    if (LHSI->hasOneUse() && isa<ConstantInt>(LHSI->getOperand(1)) &&
+        LHSI->getOperand(0)->hasOneUse()) {
+      ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1));
+
+      // If the LHS is an AND of a truncating cast, we can widen the
+      // and/compare to be the input width without changing the value
+      // produced, eliminating a cast.
+      if (TruncInst *Cast = dyn_cast<TruncInst>(LHSI->getOperand(0))) {
+        // We can do this transformation if either the AND constant does not
+        // have its sign bit set or if it is an equality comparison.
+        // Extending a relational comparison when we're checking the sign
+        // bit would not work.
+        if (ICI.isEquality() ||
+            (!AndCST->isNegative() && RHSV.isNonNegative())) {
+          Value *NewAnd =
+            Builder->CreateAnd(Cast->getOperand(0),
+                               ConstantExpr::getZExt(AndCST, Cast->getSrcTy()));
+          NewAnd->takeName(LHSI);
+          return new ICmpInst(ICI.getPredicate(), NewAnd,
+                              ConstantExpr::getZExt(RHS, Cast->getSrcTy()));
+        }
+      }
+
+      // If the LHS is an AND of a zext, and we have an equality compare, we can
+      // shrink the and/compare to the smaller type, eliminating the cast.
+      if (ZExtInst *Cast = dyn_cast<ZExtInst>(LHSI->getOperand(0))) {
+        IntegerType *Ty = cast<IntegerType>(Cast->getSrcTy());
+        // Make sure we don't compare the upper bits, SimplifyDemandedBits
+        // should fold the icmp to true/false in that case.
+        if (ICI.isEquality() && RHSV.getActiveBits() <= Ty->getBitWidth()) {
+          Value *NewAnd =
+            Builder->CreateAnd(Cast->getOperand(0),
+                               ConstantExpr::getTrunc(AndCST, Ty));
+          NewAnd->takeName(LHSI);
+          return new ICmpInst(ICI.getPredicate(), NewAnd,
+                              ConstantExpr::getTrunc(RHS, Ty));
+        }
+      }
+
+      // If this is: (X >> C1) & C2 != C3 (where any shift and any compare
+      // could exist), turn it into (X & (C2 << C1)) != (C3 << C1).  This
+      // happens a LOT in code produced by the C front-end, for bitfield
+      // access.
+      BinaryOperator *Shift = dyn_cast<BinaryOperator>(LHSI->getOperand(0));
+      if (Shift && !Shift->isShift())
+        Shift = 0;
+
+      ConstantInt *ShAmt;
+      ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0;
+      Type *Ty = Shift ? Shift->getType() : 0;  // Type of the shift.
+      Type *AndTy = AndCST->getType();          // Type of the and.
+
+      // We can fold this as long as we can't shift unknown bits
+      // into the mask.  This can only happen with signed shift
+      // rights, as they sign-extend.
+      if (ShAmt) {
+        bool CanFold = Shift->isLogicalShift();
+        if (!CanFold) {
+          // To test for the bad case of the signed shr, see if any
+          // of the bits shifted in could be tested after the mask.
+          uint32_t TyBits = Ty->getPrimitiveSizeInBits();
+          int ShAmtVal = TyBits - ShAmt->getLimitedValue(TyBits);
+
+          uint32_t BitWidth = AndTy->getPrimitiveSizeInBits();
+          if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
+               AndCST->getValue()) == 0)
+            CanFold = true;
+        }
+
+        if (CanFold) {
+          Constant *NewCst;
+          if (Shift->getOpcode() == Instruction::Shl)
+            NewCst = ConstantExpr::getLShr(RHS, ShAmt);
+          else
+            NewCst = ConstantExpr::getShl(RHS, ShAmt);
+
+          // Check to see if we are shifting out any of the bits being
+          // compared.
+          if (ConstantExpr::get(Shift->getOpcode(),
+                                       NewCst, ShAmt) != RHS) {
+            // If we shifted bits out, the fold is not going to work out.
+            // As a special case, check to see if this means that the
+            // result is always true or false now.
+            if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
+              return ReplaceInstUsesWith(ICI,
+                                       ConstantInt::getFalse(ICI.getContext()));
+            if (ICI.getPredicate() == ICmpInst::ICMP_NE)
+              return ReplaceInstUsesWith(ICI,
+                                       ConstantInt::getTrue(ICI.getContext()));
+          } else {
+            ICI.setOperand(1, NewCst);
+            Constant *NewAndCST;
+            if (Shift->getOpcode() == Instruction::Shl)
+              NewAndCST = ConstantExpr::getLShr(AndCST, ShAmt);
+            else
+              NewAndCST = ConstantExpr::getShl(AndCST, ShAmt);
+            LHSI->setOperand(1, NewAndCST);
+            LHSI->setOperand(0, Shift->getOperand(0));
+            Worklist.Add(Shift); // Shift is dead.
+            return &ICI;
+          }
+        }
+      }
+
+      // Turn ((X >> Y) & C) == 0  into  (X & (C << Y)) == 0.  The later is
+      // preferable because it allows the C<<Y expression to be hoisted out
+      // of a loop if Y is invariant and X is not.
+      if (Shift && Shift->hasOneUse() && RHSV == 0 &&
+          ICI.isEquality() && !Shift->isArithmeticShift() &&
+          !isa<Constant>(Shift->getOperand(0))) {
+        // Compute C << Y.
+        Value *NS;
+        if (Shift->getOpcode() == Instruction::LShr) {
+          NS = Builder->CreateShl(AndCST, Shift->getOperand(1));
+        } else {
+          // Insert a logical shift.
+          NS = Builder->CreateLShr(AndCST, Shift->getOperand(1));
+        }
+
+        // Compute X & (C << Y).
+        Value *NewAnd =
+          Builder->CreateAnd(Shift->getOperand(0), NS, LHSI->getName());
+
+        ICI.setOperand(0, NewAnd);
+        return &ICI;
+      }
+
+      // Replace ((X & AndCST) > RHSV) with ((X & AndCST) != 0), if any
+      // bit set in (X & AndCST) will produce a result greater than RHSV.
+      if (ICI.getPredicate() == ICmpInst::ICMP_UGT) {
+        unsigned NTZ = AndCST->getValue().countTrailingZeros();
+        if ((NTZ < AndCST->getBitWidth()) &&
+            APInt::getOneBitSet(AndCST->getBitWidth(), NTZ).ugt(RHSV))
+          return new ICmpInst(ICmpInst::ICMP_NE, LHSI,
+                              Constant::getNullValue(RHS->getType()));
+      }
+    }
+
+    // Try to optimize things like "A[i]&42 == 0" to index computations.
+    if (LoadInst *LI = dyn_cast<LoadInst>(LHSI->getOperand(0))) {
+      if (GetElementPtrInst *GEP =
+          dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
+        if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+          if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+              !LI->isVolatile() && isa<ConstantInt>(LHSI->getOperand(1))) {
+            ConstantInt *C = cast<ConstantInt>(LHSI->getOperand(1));
+            if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV,ICI, C))
+              return Res;
+          }
+    }
+    break;
+
+  case Instruction::Or: {
+    if (!ICI.isEquality() || !RHS->isNullValue() || !LHSI->hasOneUse())
+      break;
+    Value *P, *Q;
+    if (match(LHSI, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
+      // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
+      // -> and (icmp eq P, null), (icmp eq Q, null).
+      Value *ICIP = Builder->CreateICmp(ICI.getPredicate(), P,
+                                        Constant::getNullValue(P->getType()));
+      Value *ICIQ = Builder->CreateICmp(ICI.getPredicate(), Q,
+                                        Constant::getNullValue(Q->getType()));
+      Instruction *Op;
+      if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
+        Op = BinaryOperator::CreateAnd(ICIP, ICIQ);
+      else
+        Op = BinaryOperator::CreateOr(ICIP, ICIQ);
+      return Op;
+    }
+    break;
+  }
+
+  case Instruction::Mul: {       // (icmp pred (mul X, Val), CI)
+    ConstantInt *Val = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+    if (!Val) break;
+
+    // If this is a signed comparison to 0 and the mul is sign preserving,
+    // use the mul LHS operand instead.
+    ICmpInst::Predicate pred = ICI.getPredicate();
+    if (isSignTest(pred, RHS) && !Val->isZero() &&
+        cast<BinaryOperator>(LHSI)->hasNoSignedWrap())
+      return new ICmpInst(Val->isNegative() ?
+                          ICmpInst::getSwappedPredicate(pred) : pred,
+                          LHSI->getOperand(0),
+                          Constant::getNullValue(RHS->getType()));
+
+    break;
+  }
+
+  case Instruction::Shl: {       // (icmp pred (shl X, ShAmt), CI)
+    ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+    if (!ShAmt) break;
+
+    uint32_t TypeBits = RHSV.getBitWidth();
+
+    // Check that the shift amount is in range.  If not, don't perform
+    // undefined shifts.  When the shift is visited it will be
+    // simplified.
+    if (ShAmt->uge(TypeBits))
+      break;
+
+    if (ICI.isEquality()) {
+      // If we are comparing against bits always shifted out, the
+      // comparison cannot succeed.
+      Constant *Comp =
+        ConstantExpr::getShl(ConstantExpr::getLShr(RHS, ShAmt),
+                                                                 ShAmt);
+      if (Comp != RHS) {// Comparing against a bit that we know is zero.
+        bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+        Constant *Cst =
+          ConstantInt::get(Type::getInt1Ty(ICI.getContext()), IsICMP_NE);
+        return ReplaceInstUsesWith(ICI, Cst);
+      }
+
+      // If the shift is NUW, then it is just shifting out zeros, no need for an
+      // AND.
+      if (cast<BinaryOperator>(LHSI)->hasNoUnsignedWrap())
+        return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+                            ConstantExpr::getLShr(RHS, ShAmt));
+
+      // If the shift is NSW and we compare to 0, then it is just shifting out
+      // sign bits, no need for an AND either.
+      if (cast<BinaryOperator>(LHSI)->hasNoSignedWrap() && RHSV == 0)
+        return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
+                            ConstantExpr::getLShr(RHS, ShAmt));
+
+      if (LHSI->hasOneUse()) {
+        // Otherwise strength reduce the shift into an and.
+        uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
+        Constant *Mask =
+          ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits,
+                                                       TypeBits-ShAmtVal));
+
+        Value *And =
+          Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask");
+        return new ICmpInst(ICI.getPredicate(), And,
+                            ConstantExpr::getLShr(RHS, ShAmt));
+      }
+    }
+
+    // If this is a signed comparison to 0 and the shift is sign preserving,
+    // use the shift LHS operand instead.
+    ICmpInst::Predicate pred = ICI.getPredicate();
+    if (isSignTest(pred, RHS) &&
+        cast<BinaryOperator>(LHSI)->hasNoSignedWrap())
+      return new ICmpInst(pred,
+                          LHSI->getOperand(0),
+                          Constant::getNullValue(RHS->getType()));
+
+    // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
+    bool TrueIfSigned = false;
+    if (LHSI->hasOneUse() &&
+        isSignBitCheck(ICI.getPredicate(), RHS, TrueIfSigned)) {
+      // (X << 31) <s 0  --> (X&1) != 0
+      Constant *Mask = ConstantInt::get(LHSI->getOperand(0)->getType(),
+                                        APInt::getOneBitSet(TypeBits,
+                                            TypeBits-ShAmt->getZExtValue()-1));
+      Value *And =
+        Builder->CreateAnd(LHSI->getOperand(0), Mask, LHSI->getName()+".mask");
+      return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
+                          And, Constant::getNullValue(And->getType()));
+    }
+
+    // Transform (icmp pred iM (shl iM %v, N), CI)
+    // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (CI>>N))
+    // Transform the shl to a trunc if (trunc (CI>>N)) has no loss and M-N.
+    // This enables to get rid of the shift in favor of a trunc which can be
+    // free on the target. It has the additional benefit of comparing to a
+    // smaller constant, which will be target friendly.
+    unsigned Amt = ShAmt->getLimitedValue(TypeBits-1);
+    if (LHSI->hasOneUse() &&
+        Amt != 0 && RHSV.countTrailingZeros() >= Amt) {
+      Type *NTy = IntegerType::get(ICI.getContext(), TypeBits - Amt);
+      Constant *NCI = ConstantExpr::getTrunc(
+                        ConstantExpr::getAShr(RHS,
+                          ConstantInt::get(RHS->getType(), Amt)),
+                        NTy);
+      return new ICmpInst(ICI.getPredicate(),
+                          Builder->CreateTrunc(LHSI->getOperand(0), NTy),
+                          NCI);
+    }
+
+    break;
+  }
+
+  case Instruction::LShr:         // (icmp pred (shr X, ShAmt), CI)
+  case Instruction::AShr: {
+    // Handle equality comparisons of shift-by-constant.
+    BinaryOperator *BO = cast<BinaryOperator>(LHSI);
+    if (ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
+      if (Instruction *Res = FoldICmpShrCst(ICI, BO, ShAmt))
+        return Res;
+    }
+
+    // Handle exact shr's.
+    if (ICI.isEquality() && BO->isExact() && BO->hasOneUse()) {
+      if (RHSV.isMinValue())
+        return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), RHS);
+    }
+    break;
+  }
+
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+    // Fold: icmp pred ([us]div X, C1), C2 -> range test
+    // Fold this div into the comparison, producing a range check.
+    // Determine, based on the divide type, what the range is being
+    // checked.  If there is an overflow on the low or high side, remember
+    // it, otherwise compute the range [low, hi) bounding the new value.
+    // See: InsertRangeTest above for the kinds of replacements possible.
+    if (ConstantInt *DivRHS = dyn_cast<ConstantInt>(LHSI->getOperand(1)))
+      if (Instruction *R = FoldICmpDivCst(ICI, cast<BinaryOperator>(LHSI),
+                                          DivRHS))
+        return R;
+    break;
+
+  case Instruction::Add:
+    // Fold: icmp pred (add X, C1), C2
+    if (!ICI.isEquality()) {
+      ConstantInt *LHSC = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+      if (!LHSC) break;
+      const APInt &LHSV = LHSC->getValue();
+
+      ConstantRange CR = ICI.makeConstantRange(ICI.getPredicate(), RHSV)
+                            .subtract(LHSV);
+
+      if (ICI.isSigned()) {
+        if (CR.getLower().isSignBit()) {
+          return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0),
+                              ConstantInt::get(ICI.getContext(),CR.getUpper()));
+        } else if (CR.getUpper().isSignBit()) {
+          return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0),
+                              ConstantInt::get(ICI.getContext(),CR.getLower()));
+        }
+      } else {
+        if (CR.getLower().isMinValue()) {
+          return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0),
+                              ConstantInt::get(ICI.getContext(),CR.getUpper()));
+        } else if (CR.getUpper().isMinValue()) {
+          return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0),
+                              ConstantInt::get(ICI.getContext(),CR.getLower()));
+        }
+      }
+    }
+    break;
+  }
+
+  // Simplify icmp_eq and icmp_ne instructions with integer constant RHS.
+  if (ICI.isEquality()) {
+    bool isICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+
+    // If the first operand is (add|sub|and|or|xor|rem) with a constant, and
+    // the second operand is a constant, simplify a bit.
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(LHSI)) {
+      switch (BO->getOpcode()) {
+      case Instruction::SRem:
+        // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
+        if (RHSV == 0 && isa<ConstantInt>(BO->getOperand(1)) &&BO->hasOneUse()){
+          const APInt &V = cast<ConstantInt>(BO->getOperand(1))->getValue();
+          if (V.sgt(1) && V.isPowerOf2()) {
+            Value *NewRem =
+              Builder->CreateURem(BO->getOperand(0), BO->getOperand(1),
+                                  BO->getName());
+            return new ICmpInst(ICI.getPredicate(), NewRem,
+                                Constant::getNullValue(BO->getType()));
+          }
+        }
+        break;
+      case Instruction::Add:
+        // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
+        if (ConstantInt *BOp1C = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+          if (BO->hasOneUse())
+            return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                                ConstantExpr::getSub(RHS, BOp1C));
+        } else if (RHSV == 0) {
+          // Replace ((add A, B) != 0) with (A != -B) if A or B is
+          // efficiently invertible, or if the add has just this one use.
+          Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
+
+          if (Value *NegVal = dyn_castNegVal(BOp1))
+            return new ICmpInst(ICI.getPredicate(), BOp0, NegVal);
+          if (Value *NegVal = dyn_castNegVal(BOp0))
+            return new ICmpInst(ICI.getPredicate(), NegVal, BOp1);
+          if (BO->hasOneUse()) {
+            Value *Neg = Builder->CreateNeg(BOp1);
+            Neg->takeName(BO);
+            return new ICmpInst(ICI.getPredicate(), BOp0, Neg);
+          }
+        }
+        break;
+      case Instruction::Xor:
+        // For the xor case, we can xor two constants together, eliminating
+        // the explicit xor.
+        if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) {
+          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                              ConstantExpr::getXor(RHS, BOC));
+        } else if (RHSV == 0) {
+          // Replace ((xor A, B) != 0) with (A != B)
+          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                              BO->getOperand(1));
+        }
+        break;
+      case Instruction::Sub:
+        // Replace ((sub A, B) != C) with (B != A-C) if A & C are constants.
+        if (ConstantInt *BOp0C = dyn_cast<ConstantInt>(BO->getOperand(0))) {
+          if (BO->hasOneUse())
+            return new ICmpInst(ICI.getPredicate(), BO->getOperand(1),
+                                ConstantExpr::getSub(BOp0C, RHS));
+        } else if (RHSV == 0) {
+          // Replace ((sub A, B) != 0) with (A != B)
+          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                              BO->getOperand(1));
+        }
+        break;
+      case Instruction::Or:
+        // If bits are being or'd in that are not present in the constant we
+        // are comparing against, then the comparison could never succeed!
+        if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+          Constant *NotCI = ConstantExpr::getNot(RHS);
+          if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue())
+            return ReplaceInstUsesWith(ICI,
+                             ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
+                                       isICMP_NE));
+        }
+        break;
+
+      case Instruction::And:
+        if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+          // If bits are being compared against that are and'd out, then the
+          // comparison can never succeed!
+          if ((RHSV & ~BOC->getValue()) != 0)
+            return ReplaceInstUsesWith(ICI,
+                             ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
+                                       isICMP_NE));
+
+          // If we have ((X & C) == C), turn it into ((X & C) != 0).
+          if (RHS == BOC && RHSV.isPowerOf2())
+            return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ :
+                                ICmpInst::ICMP_NE, LHSI,
+                                Constant::getNullValue(RHS->getType()));
+
+          // Don't perform the following transforms if the AND has multiple uses
+          if (!BO->hasOneUse())
+            break;
+
+          // Replace (and X, (1 << size(X)-1) != 0) with x s< 0
+          if (BOC->getValue().isSignBit()) {
+            Value *X = BO->getOperand(0);
+            Constant *Zero = Constant::getNullValue(X->getType());
+            ICmpInst::Predicate pred = isICMP_NE ?
+              ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
+            return new ICmpInst(pred, X, Zero);
+          }
+
+          // ((X & ~7) == 0) --> X < 8
+          if (RHSV == 0 && isHighOnes(BOC)) {
+            Value *X = BO->getOperand(0);
+            Constant *NegX = ConstantExpr::getNeg(BOC);
+            ICmpInst::Predicate pred = isICMP_NE ?
+              ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+            return new ICmpInst(pred, X, NegX);
+          }
+        }
+        break;
+      case Instruction::Mul:
+        if (RHSV == 0 && BO->hasNoSignedWrap()) {
+          if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+            // The trivial case (mul X, 0) is handled by InstSimplify
+            // General case : (mul X, C) != 0 iff X != 0
+            //                (mul X, C) == 0 iff X == 0
+            if (!BOC->isZero())
+              return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                                  Constant::getNullValue(RHS->getType()));
+          }
+        }
+        break;
+      default: break;
+      }
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHSI)) {
+      // Handle icmp {eq|ne} <intrinsic>, intcst.
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::bswap:
+        Worklist.Add(II);
+        ICI.setOperand(0, II->getArgOperand(0));
+        ICI.setOperand(1, ConstantInt::get(II->getContext(), RHSV.byteSwap()));
+        return &ICI;
+      case Intrinsic::ctlz:
+      case Intrinsic::cttz:
+        // ctz(A) == bitwidth(a)  ->  A == 0 and likewise for !=
+        if (RHSV == RHS->getType()->getBitWidth()) {
+          Worklist.Add(II);
+          ICI.setOperand(0, II->getArgOperand(0));
+          ICI.setOperand(1, ConstantInt::get(RHS->getType(), 0));
+          return &ICI;
+        }
+        break;
+      case Intrinsic::ctpop:
+        // popcount(A) == 0  ->  A == 0 and likewise for !=
+        if (RHS->isZero()) {
+          Worklist.Add(II);
+          ICI.setOperand(0, II->getArgOperand(0));
+          ICI.setOperand(1, RHS);
+          return &ICI;
+        }
+        break;
+      default:
+        break;
+      }
+    }
+  }
+  return 0;
+}
+
+/// visitICmpInstWithCastAndCast - Handle icmp (cast x to y), (cast/cst).
+/// We only handle extending casts so far.
+///
+Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
+  const CastInst *LHSCI = cast<CastInst>(ICI.getOperand(0));
+  Value *LHSCIOp        = LHSCI->getOperand(0);
+  Type *SrcTy     = LHSCIOp->getType();
+  Type *DestTy    = LHSCI->getType();
+  Value *RHSCIOp;
+
+  // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
+  // integer type is the same size as the pointer type.
+  if (TD && LHSCI->getOpcode() == Instruction::PtrToInt &&
+      TD->getPointerSizeInBits() ==
+         cast<IntegerType>(DestTy)->getBitWidth()) {
+    Value *RHSOp = 0;
+    if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) {
+      RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
+    } else if (PtrToIntInst *RHSC = dyn_cast<PtrToIntInst>(ICI.getOperand(1))) {
+      RHSOp = RHSC->getOperand(0);
+      // If the pointer types don't match, insert a bitcast.
+      if (LHSCIOp->getType() != RHSOp->getType())
+        RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType());
+    }
+
+    if (RHSOp)
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp);
+  }
+
+  // The code below only handles extension cast instructions, so far.
+  // Enforce this.
+  if (LHSCI->getOpcode() != Instruction::ZExt &&
+      LHSCI->getOpcode() != Instruction::SExt)
+    return 0;
+
+  bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt;
+  bool isSignedCmp = ICI.isSigned();
+
+  if (CastInst *CI = dyn_cast<CastInst>(ICI.getOperand(1))) {
+    // Not an extension from the same type?
+    RHSCIOp = CI->getOperand(0);
+    if (RHSCIOp->getType() != LHSCIOp->getType())
+      return 0;
+
+    // If the signedness of the two casts doesn't agree (i.e. one is a sext
+    // and the other is a zext), then we can't handle this.
+    if (CI->getOpcode() != LHSCI->getOpcode())
+      return 0;
+
+    // Deal with equality cases early.
+    if (ICI.isEquality())
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp);
+
+    // A signed comparison of sign extended values simplifies into a
+    // signed comparison.
+    if (isSignedCmp && isSignedExt)
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp);
+
+    // The other three cases all fold into an unsigned comparison.
+    return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, RHSCIOp);
+  }
+
+  // If we aren't dealing with a constant on the RHS, exit early
+  ConstantInt *CI = dyn_cast<ConstantInt>(ICI.getOperand(1));
+  if (!CI)
+    return 0;
+
+  // Compute the constant that would happen if we truncated to SrcTy then
+  // reextended to DestTy.
+  Constant *Res1 = ConstantExpr::getTrunc(CI, SrcTy);
+  Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(),
+                                                Res1, DestTy);
+
+  // If the re-extended constant didn't change...
+  if (Res2 == CI) {
+    // Deal with equality cases early.
+    if (ICI.isEquality())
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1);
+
+    // A signed comparison of sign extended values simplifies into a
+    // signed comparison.
+    if (isSignedExt && isSignedCmp)
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1);
+
+    // The other three cases all fold into an unsigned comparison.
+    return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, Res1);
+  }
+
+  // The re-extended constant changed so the constant cannot be represented
+  // in the shorter type. Consequently, we cannot emit a simple comparison.
+  // All the cases that fold to true or false will have already been handled
+  // by SimplifyICmpInst, so only deal with the tricky case.
+
+  if (isSignedCmp || !isSignedExt)
+    return 0;
+
+  // Evaluate the comparison for LT (we invert for GT below). LE and GE cases
+  // should have been folded away previously and not enter in here.
+
+  // We're performing an unsigned comp with a sign extended value.
+  // This is true if the input is >= 0. [aka >s -1]
+  Constant *NegOne = Constant::getAllOnesValue(SrcTy);
+  Value *Result = Builder->CreateICmpSGT(LHSCIOp, NegOne, ICI.getName());
+
+  // Finally, return the value computed.
+  if (ICI.getPredicate() == ICmpInst::ICMP_ULT)
+    return ReplaceInstUsesWith(ICI, Result);
+
+  assert(ICI.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
+  return BinaryOperator::CreateNot(Result);
+}
+
+/// ProcessUGT_ADDCST_ADD - The caller has matched a pattern of the form:
+///   I = icmp ugt (add (add A, B), CI2), CI1
+/// If this is of the form:
+///   sum = a + b
+///   if (sum+128 >u 255)
+/// Then replace it with llvm.sadd.with.overflow.i8.
+///
+static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
+                                          ConstantInt *CI2, ConstantInt *CI1,
+                                          InstCombiner &IC) {
+  // The transformation we're trying to do here is to transform this into an
+  // llvm.sadd.with.overflow.  To do this, we have to replace the original add
+  // with a narrower add, and discard the add-with-constant that is part of the
+  // range check (if we can't eliminate it, this isn't profitable).
+
+  // In order to eliminate the add-with-constant, the compare can be its only
+  // use.
+  Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
+  if (!AddWithCst->hasOneUse()) return 0;
+
+  // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
+  if (!CI2->getValue().isPowerOf2()) return 0;
+  unsigned NewWidth = CI2->getValue().countTrailingZeros();
+  if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) return 0;
+
+  // The width of the new add formed is 1 more than the bias.
+  ++NewWidth;
+
+  // Check to see that CI1 is an all-ones value with NewWidth bits.
+  if (CI1->getBitWidth() == NewWidth ||
+      CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
+    return 0;
+
+  // This is only really a signed overflow check if the inputs have been
+  // sign-extended; check for that condition. For example, if CI2 is 2^31 and
+  // the operands of the add are 64 bits wide, we need at least 33 sign bits.
+  unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
+  if (IC.ComputeNumSignBits(A) < NeededSignBits ||
+      IC.ComputeNumSignBits(B) < NeededSignBits)
+    return 0;
+
+  // In order to replace the original add with a narrower
+  // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
+  // and truncates that discard the high bits of the add.  Verify that this is
+  // the case.
+  Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
+  for (Value::use_iterator UI = OrigAdd->use_begin(), E = OrigAdd->use_end();
+       UI != E; ++UI) {
+    if (*UI == AddWithCst) continue;
+
+    // Only accept truncates for now.  We would really like a nice recursive
+    // predicate like SimplifyDemandedBits, but which goes downwards the use-def
+    // chain to see which bits of a value are actually demanded.  If the
+    // original add had another add which was then immediately truncated, we
+    // could still do the transformation.
+    TruncInst *TI = dyn_cast<TruncInst>(*UI);
+    if (TI == 0 ||
+        TI->getType()->getPrimitiveSizeInBits() > NewWidth) return 0;
+  }
+
+  // If the pattern matches, truncate the inputs to the narrower type and
+  // use the sadd_with_overflow intrinsic to efficiently compute both the
+  // result and the overflow bit.
+  Module *M = I.getParent()->getParent()->getParent();
+
+  Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
+  Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow,
+                                       NewType);
+
+  InstCombiner::BuilderTy *Builder = IC.Builder;
+
+  // Put the new code above the original add, in case there are any uses of the
+  // add between the add and the compare.
+  Builder->SetInsertPoint(OrigAdd);
+
+  Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName()+".trunc");
+  Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName()+".trunc");
+  CallInst *Call = Builder->CreateCall2(F, TruncA, TruncB, "sadd");
+  Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result");
+  Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType());
+
+  // The inner add was the result of the narrow add, zero extended to the
+  // wider type.  Replace it with the result computed by the intrinsic.
+  IC.ReplaceInstUsesWith(*OrigAdd, ZExt);
+
+  // The original icmp gets replaced with the overflow value.
+  return ExtractValueInst::Create(Call, 1, "sadd.overflow");
+}
+
+static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV,
+                                     InstCombiner &IC) {
+  // Don't bother doing this transformation for pointers, don't do it for
+  // vectors.
+  if (!isa<IntegerType>(OrigAddV->getType())) return 0;
+
+  // If the add is a constant expr, then we don't bother transforming it.
+  Instruction *OrigAdd = dyn_cast<Instruction>(OrigAddV);
+  if (OrigAdd == 0) return 0;
+
+  Value *LHS = OrigAdd->getOperand(0), *RHS = OrigAdd->getOperand(1);
+
+  // Put the new code above the original add, in case there are any uses of the
+  // add between the add and the compare.
+  InstCombiner::BuilderTy *Builder = IC.Builder;
+  Builder->SetInsertPoint(OrigAdd);
+
+  Module *M = I.getParent()->getParent()->getParent();
+  Type *Ty = LHS->getType();
+  Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty);
+  CallInst *Call = Builder->CreateCall2(F, LHS, RHS, "uadd");
+  Value *Add = Builder->CreateExtractValue(Call, 0);
+
+  IC.ReplaceInstUsesWith(*OrigAdd, Add);
+
+  // The original icmp gets replaced with the overflow value.
+  return ExtractValueInst::Create(Call, 1, "uadd.overflow");
+}
+
+// DemandedBitsLHSMask - When performing a comparison against a constant,
+// it is possible that not all the bits in the LHS are demanded.  This helper
+// method computes the mask that IS demanded.
+static APInt DemandedBitsLHSMask(ICmpInst &I,
+                                 unsigned BitWidth, bool isSignCheck) {
+  if (isSignCheck)
+    return APInt::getSignBit(BitWidth);
+
+  ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(1));
+  if (!CI) return APInt::getAllOnesValue(BitWidth);
+  const APInt &RHS = CI->getValue();
+
+  switch (I.getPredicate()) {
+  // For a UGT comparison, we don't care about any bits that
+  // correspond to the trailing ones of the comparand.  The value of these
+  // bits doesn't impact the outcome of the comparison, because any value
+  // greater than the RHS must differ in a bit higher than these due to carry.
+  case ICmpInst::ICMP_UGT: {
+    unsigned trailingOnes = RHS.countTrailingOnes();
+    APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingOnes);
+    return ~lowBitsSet;
+  }
+
+  // Similarly, for a ULT comparison, we don't care about the trailing zeros.
+  // Any value less than the RHS must differ in a higher bit because of carries.
+  case ICmpInst::ICMP_ULT: {
+    unsigned trailingZeros = RHS.countTrailingZeros();
+    APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingZeros);
+    return ~lowBitsSet;
+  }
+
+  default:
+    return APInt::getAllOnesValue(BitWidth);
+  }
+
+}
+
+Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
+  bool Changed = false;
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  /// Orders the operands of the compare so that they are listed from most
+  /// complex to least complex.  This puts constants before unary operators,
+  /// before binary operators.
+  if (getComplexity(Op0) < getComplexity(Op1)) {
+    I.swapOperands();
+    std::swap(Op0, Op1);
+    Changed = true;
+  }
+
+  if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // comparing -val or val with non-zero is the same as just comparing val
+  // ie, abs(val) != 0 -> val != 0
+  if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero()))
+  {
+    Value *Cond, *SelectTrue, *SelectFalse;
+    if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue),
+                            m_Value(SelectFalse)))) {
+      if (Value *V = dyn_castNegVal(SelectTrue)) {
+        if (V == SelectFalse)
+          return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
+      }
+      else if (Value *V = dyn_castNegVal(SelectFalse)) {
+        if (V == SelectTrue)
+          return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
+      }
+    }
+  }
+
+  Type *Ty = Op0->getType();
+
+  // icmp's with boolean values can always be turned into bitwise operations
+  if (Ty->isIntegerTy(1)) {
+    switch (I.getPredicate()) {
+    default: llvm_unreachable("Invalid icmp instruction!");
+    case ICmpInst::ICMP_EQ: {               // icmp eq i1 A, B -> ~(A^B)
+      Value *Xor = Builder->CreateXor(Op0, Op1, I.getName()+"tmp");
+      return BinaryOperator::CreateNot(Xor);
+    }
+    case ICmpInst::ICMP_NE:                  // icmp eq i1 A, B -> A^B
+      return BinaryOperator::CreateXor(Op0, Op1);
+
+    case ICmpInst::ICMP_UGT:
+      std::swap(Op0, Op1);                   // Change icmp ugt -> icmp ult
+      // FALL THROUGH
+    case ICmpInst::ICMP_ULT:{               // icmp ult i1 A, B -> ~A & B
+      Value *Not = Builder->CreateNot(Op0, I.getName()+"tmp");
+      return BinaryOperator::CreateAnd(Not, Op1);
+    }
+    case ICmpInst::ICMP_SGT:
+      std::swap(Op0, Op1);                   // Change icmp sgt -> icmp slt
+      // FALL THROUGH
+    case ICmpInst::ICMP_SLT: {               // icmp slt i1 A, B -> A & ~B
+      Value *Not = Builder->CreateNot(Op1, I.getName()+"tmp");
+      return BinaryOperator::CreateAnd(Not, Op0);
+    }
+    case ICmpInst::ICMP_UGE:
+      std::swap(Op0, Op1);                   // Change icmp uge -> icmp ule
+      // FALL THROUGH
+    case ICmpInst::ICMP_ULE: {               //  icmp ule i1 A, B -> ~A | B
+      Value *Not = Builder->CreateNot(Op0, I.getName()+"tmp");
+      return BinaryOperator::CreateOr(Not, Op1);
+    }
+    case ICmpInst::ICMP_SGE:
+      std::swap(Op0, Op1);                   // Change icmp sge -> icmp sle
+      // FALL THROUGH
+    case ICmpInst::ICMP_SLE: {               //  icmp sle i1 A, B -> A | ~B
+      Value *Not = Builder->CreateNot(Op1, I.getName()+"tmp");
+      return BinaryOperator::CreateOr(Not, Op0);
+    }
+    }
+  }
+
+  unsigned BitWidth = 0;
+  if (Ty->isIntOrIntVectorTy())
+    BitWidth = Ty->getScalarSizeInBits();
+  else if (TD)  // Pointers require TD info to get their size.
+    BitWidth = TD->getTypeSizeInBits(Ty->getScalarType());
+
+  bool isSignBit = false;
+
+  // See if we are doing a comparison with a constant.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+    Value *A = 0, *B = 0;
+
+    // Match the following pattern, which is a common idiom when writing
+    // overflow-safe integer arithmetic function.  The source performs an
+    // addition in wider type, and explicitly checks for overflow using
+    // comparisons against INT_MIN and INT_MAX.  Simplify this by using the
+    // sadd_with_overflow intrinsic.
+    //
+    // TODO: This could probably be generalized to handle other overflow-safe
+    // operations if we worked out the formulas to compute the appropriate
+    // magic constants.
+    //
+    // sum = a + b
+    // if (sum+128 >u 255)  ...  -> llvm.sadd.with.overflow.i8
+    {
+    ConstantInt *CI2;    // I = icmp ugt (add (add A, B), CI2), CI
+    if (I.getPredicate() == ICmpInst::ICMP_UGT &&
+        match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
+      if (Instruction *Res = ProcessUGT_ADDCST_ADD(I, A, B, CI2, CI, *this))
+        return Res;
+    }
+
+    // (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B)
+    if (I.isEquality() && CI->isZero() &&
+        match(Op0, m_Sub(m_Value(A), m_Value(B)))) {
+      // (icmp cond A B) if cond is equality
+      return new ICmpInst(I.getPredicate(), A, B);
+    }
+
+    // If we have an icmp le or icmp ge instruction, turn it into the
+    // appropriate icmp lt or icmp gt instruction.  This allows us to rely on
+    // them being folded in the code below.  The SimplifyICmpInst code has
+    // already handled the edge cases for us, so we just assert on them.
+    switch (I.getPredicate()) {
+    default: break;
+    case ICmpInst::ICMP_ULE:
+      assert(!CI->isMaxValue(false));                 // A <=u MAX -> TRUE
+      return new ICmpInst(ICmpInst::ICMP_ULT, Op0,
+                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
+    case ICmpInst::ICMP_SLE:
+      assert(!CI->isMaxValue(true));                  // A <=s MAX -> TRUE
+      return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
+                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
+    case ICmpInst::ICMP_UGE:
+      assert(!CI->isMinValue(false));                 // A >=u MIN -> TRUE
+      return new ICmpInst(ICmpInst::ICMP_UGT, Op0,
+                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
+    case ICmpInst::ICMP_SGE:
+      assert(!CI->isMinValue(true));                  // A >=s MIN -> TRUE
+      return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
+                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
+    }
+
+    // If this comparison is a normal comparison, it demands all
+    // bits, if it is a sign bit comparison, it only demands the sign bit.
+    bool UnusedBit;
+    isSignBit = isSignBitCheck(I.getPredicate(), CI, UnusedBit);
+  }
+
+  // See if we can fold the comparison based on range information we can get
+  // by checking whether bits are known to be zero or one in the input.
+  if (BitWidth != 0) {
+    APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0);
+    APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0);
+
+    if (SimplifyDemandedBits(I.getOperandUse(0),
+                             DemandedBitsLHSMask(I, BitWidth, isSignBit),
+                             Op0KnownZero, Op0KnownOne, 0))
+      return &I;
+    if (SimplifyDemandedBits(I.getOperandUse(1),
+                             APInt::getAllOnesValue(BitWidth),
+                             Op1KnownZero, Op1KnownOne, 0))
+      return &I;
+
+    // Given the known and unknown bits, compute a range that the LHS could be
+    // in.  Compute the Min, Max and RHS values based on the known bits. For the
+    // EQ and NE we use unsigned values.
+    APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
+    APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
+    if (I.isSigned()) {
+      ComputeSignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
+                                             Op0Min, Op0Max);
+      ComputeSignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
+                                             Op1Min, Op1Max);
+    } else {
+      ComputeUnsignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne,
+                                               Op0Min, Op0Max);
+      ComputeUnsignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne,
+                                               Op1Min, Op1Max);
+    }
+
+    // If Min and Max are known to be the same, then SimplifyDemandedBits
+    // figured out that the LHS is a constant.  Just constant fold this now so
+    // that code below can assume that Min != Max.
+    if (!isa<Constant>(Op0) && Op0Min == Op0Max)
+      return new ICmpInst(I.getPredicate(),
+                          ConstantInt::get(Op0->getType(), Op0Min), Op1);
+    if (!isa<Constant>(Op1) && Op1Min == Op1Max)
+      return new ICmpInst(I.getPredicate(), Op0,
+                          ConstantInt::get(Op1->getType(), Op1Min));
+
+    // Based on the range information we know about the LHS, see if we can
+    // simplify this comparison.  For example, (x&4) < 8 is always true.
+    switch (I.getPredicate()) {
+    default: llvm_unreachable("Unknown icmp opcode!");
+    case ICmpInst::ICMP_EQ: {
+      if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+
+      // If all bits are known zero except for one, then we know at most one
+      // bit is set.   If the comparison is against zero, then this is a check
+      // to see if *that* bit is set.
+      APInt Op0KnownZeroInverted = ~Op0KnownZero;
+      if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) {
+        // If the LHS is an AND with the same constant, look through it.
+        Value *LHS = 0;
+        ConstantInt *LHSC = 0;
+        if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) ||
+            LHSC->getValue() != Op0KnownZeroInverted)
+          LHS = Op0;
+
+        // If the LHS is 1 << x, and we know the result is a power of 2 like 8,
+        // then turn "((1 << x)&8) == 0" into "x != 3".
+        Value *X = 0;
+        if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
+          unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros();
+          return new ICmpInst(ICmpInst::ICMP_NE, X,
+                              ConstantInt::get(X->getType(), CmpVal));
+        }
+
+        // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
+        // then turn "((8 >>u x)&1) == 0" into "x != 3".
+        const APInt *CI;
+        if (Op0KnownZeroInverted == 1 &&
+            match(LHS, m_LShr(m_Power2(CI), m_Value(X))))
+          return new ICmpInst(ICmpInst::ICMP_NE, X,
+                              ConstantInt::get(X->getType(),
+                                               CI->countTrailingZeros()));
+      }
+
+      break;
+    }
+    case ICmpInst::ICMP_NE: {
+      if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+
+      // If all bits are known zero except for one, then we know at most one
+      // bit is set.   If the comparison is against zero, then this is a check
+      // to see if *that* bit is set.
+      APInt Op0KnownZeroInverted = ~Op0KnownZero;
+      if (~Op1KnownZero == 0 && Op0KnownZeroInverted.isPowerOf2()) {
+        // If the LHS is an AND with the same constant, look through it.
+        Value *LHS = 0;
+        ConstantInt *LHSC = 0;
+        if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) ||
+            LHSC->getValue() != Op0KnownZeroInverted)
+          LHS = Op0;
+
+        // If the LHS is 1 << x, and we know the result is a power of 2 like 8,
+        // then turn "((1 << x)&8) != 0" into "x == 3".
+        Value *X = 0;
+        if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
+          unsigned CmpVal = Op0KnownZeroInverted.countTrailingZeros();
+          return new ICmpInst(ICmpInst::ICMP_EQ, X,
+                              ConstantInt::get(X->getType(), CmpVal));
+        }
+
+        // If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
+        // then turn "((8 >>u x)&1) != 0" into "x == 3".
+        const APInt *CI;
+        if (Op0KnownZeroInverted == 1 &&
+            match(LHS, m_LShr(m_Power2(CI), m_Value(X))))
+          return new ICmpInst(ICmpInst::ICMP_EQ, X,
+                              ConstantInt::get(X->getType(),
+                                               CI->countTrailingZeros()));
+      }
+
+      break;
+    }
+    case ICmpInst::ICMP_ULT:
+      if (Op0Max.ult(Op1Min))          // A <u B -> true if max(A) < min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+      if (Op0Min.uge(Op1Max))          // A <u B -> false if min(A) >= max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+      if (Op1Min == Op0Max)            // A <u B -> A != B if max(A) == min(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+        if (Op1Max == Op0Min+1)        // A <u C -> A == C-1 if min(A)+1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
+
+        // (x <u 2147483648) -> (x >s -1)  -> true if sign bit clear
+        if (CI->isMinValue(true))
+          return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
+                           Constant::getAllOnesValue(Op0->getType()));
+      }
+      break;
+    case ICmpInst::ICMP_UGT:
+      if (Op0Min.ugt(Op1Max))          // A >u B -> true if min(A) > max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+      if (Op0Max.ule(Op1Min))          // A >u B -> false if max(A) <= max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+
+      if (Op1Max == Op0Min)            // A >u B -> A != B if min(A) == max(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+        if (Op1Min == Op0Max-1)        // A >u C -> A == C+1 if max(a)-1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
+
+        // (x >u 2147483647) -> (x <s 0)  -> true if sign bit set
+        if (CI->isMaxValue(true))
+          return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
+                              Constant::getNullValue(Op0->getType()));
+      }
+      break;
+    case ICmpInst::ICMP_SLT:
+      if (Op0Max.slt(Op1Min))          // A <s B -> true if max(A) < min(C)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+      if (Op0Min.sge(Op1Max))          // A <s B -> false if min(A) >= max(C)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+      if (Op1Min == Op0Max)            // A <s B -> A != B if max(A) == min(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+        if (Op1Max == Op0Min+1)        // A <s C -> A == C-1 if min(A)+1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
+      }
+      break;
+    case ICmpInst::ICMP_SGT:
+      if (Op0Min.sgt(Op1Max))          // A >s B -> true if min(A) > max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+      if (Op0Max.sle(Op1Min))          // A >s B -> false if max(A) <= min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+
+      if (Op1Max == Op0Min)            // A >s B -> A != B if min(A) == max(B)
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+        if (Op1Min == Op0Max-1)        // A >s C -> A == C+1 if max(A)-1 == C
+          return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
+      }
+      break;
+    case ICmpInst::ICMP_SGE:
+      assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
+      if (Op0Min.sge(Op1Max))          // A >=s B -> true if min(A) >= max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+      if (Op0Max.slt(Op1Min))          // A >=s B -> false if max(A) < min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+      break;
+    case ICmpInst::ICMP_SLE:
+      assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
+      if (Op0Max.sle(Op1Min))          // A <=s B -> true if max(A) <= min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+      if (Op0Min.sgt(Op1Max))          // A <=s B -> false if min(A) > max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+      break;
+    case ICmpInst::ICMP_UGE:
+      assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
+      if (Op0Min.uge(Op1Max))          // A >=u B -> true if min(A) >= max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+      if (Op0Max.ult(Op1Min))          // A >=u B -> false if max(A) < min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+      break;
+    case ICmpInst::ICMP_ULE:
+      assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
+      if (Op0Max.ule(Op1Min))          // A <=u B -> true if max(A) <= min(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+      if (Op0Min.ugt(Op1Max))          // A <=u B -> false if min(A) > max(B)
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+      break;
+    }
+
+    // Turn a signed comparison into an unsigned one if both operands
+    // are known to have the same sign.
+    if (I.isSigned() &&
+        ((Op0KnownZero.isNegative() && Op1KnownZero.isNegative()) ||
+         (Op0KnownOne.isNegative() && Op1KnownOne.isNegative())))
+      return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
+  }
+
+  // Test if the ICmpInst instruction is used exclusively by a select as
+  // part of a minimum or maximum operation. If so, refrain from doing
+  // any other folding. This helps out other analyses which understand
+  // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+  // and CodeGen. And in this case, at least one of the comparison
+  // operands has at least one user besides the compare (the select),
+  // which would often largely negate the benefit of folding anyway.
+  if (I.hasOneUse())
+    if (SelectInst *SI = dyn_cast<SelectInst>(*I.use_begin()))
+      if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
+          (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
+        return 0;
+
+  // See if we are doing a comparison between a constant and an instruction that
+  // can be folded into the comparison.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+    // Since the RHS is a ConstantInt (CI), if the left hand side is an
+    // instruction, see if that instruction also has constants so that the
+    // instruction can be folded into the icmp
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      if (Instruction *Res = visitICmpInstWithInstAndIntCst(I, LHSI, CI))
+        return Res;
+  }
+
+  // Handle icmp with constant (but not simple integer constant) RHS
+  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      switch (LHSI->getOpcode()) {
+      case Instruction::GetElementPtr:
+          // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
+        if (RHSC->isNullValue() &&
+            cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
+          return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
+                  Constant::getNullValue(LHSI->getOperand(0)->getType()));
+        break;
+      case Instruction::PHI:
+        // Only fold icmp into the PHI if the phi and icmp are in the same
+        // block.  If in the same block, we're encouraging jump threading.  If
+        // not, we are just pessimizing the code by making an i1 phi.
+        if (LHSI->getParent() == I.getParent())
+          if (Instruction *NV = FoldOpIntoPhi(I))
+            return NV;
+        break;
+      case Instruction::Select: {
+        // If either operand of the select is a constant, we can fold the
+        // comparison into the select arms, which will cause one to be
+        // constant folded and the select turned into a bitwise or.
+        Value *Op1 = 0, *Op2 = 0;
+        if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1)))
+          Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+        if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2)))
+          Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+
+        // We only want to perform this transformation if it will not lead to
+        // additional code. This is true if either both sides of the select
+        // fold to a constant (in which case the icmp is replaced with a select
+        // which will usually simplify) or this is the only user of the
+        // select (in which case we are trading a select+icmp for a simpler
+        // select+icmp).
+        if ((Op1 && Op2) || (LHSI->hasOneUse() && (Op1 || Op2))) {
+          if (!Op1)
+            Op1 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(1),
+                                      RHSC, I.getName());
+          if (!Op2)
+            Op2 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(2),
+                                      RHSC, I.getName());
+          return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+        }
+        break;
+      }
+      case Instruction::IntToPtr:
+        // icmp pred inttoptr(X), null -> icmp pred X, 0
+        if (RHSC->isNullValue() && TD &&
+            TD->getIntPtrType(RHSC->getContext()) ==
+               LHSI->getOperand(0)->getType())
+          return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
+                        Constant::getNullValue(LHSI->getOperand(0)->getType()));
+        break;
+
+      case Instruction::Load:
+        // Try to optimize things like "A[i] > 4" to index computations.
+        if (GetElementPtrInst *GEP =
+              dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
+          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+            if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+                !cast<LoadInst>(LHSI)->isVolatile())
+              if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV, I))
+                return Res;
+        }
+        break;
+      }
+  }
+
+  // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
+    if (Instruction *NI = FoldGEPICmp(GEP, Op1, I.getPredicate(), I))
+      return NI;
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
+    if (Instruction *NI = FoldGEPICmp(GEP, Op0,
+                           ICmpInst::getSwappedPredicate(I.getPredicate()), I))
+      return NI;
+
+  // Test to see if the operands of the icmp are casted versions of other
+  // values.  If the ptr->ptr cast can be stripped off both arguments, we do so
+  // now.
+  if (BitCastInst *CI = dyn_cast<BitCastInst>(Op0)) {
+    if (Op0->getType()->isPointerTy() &&
+        (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
+      // We keep moving the cast from the left operand over to the right
+      // operand, where it can often be eliminated completely.
+      Op0 = CI->getOperand(0);
+
+      // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
+      // so eliminate it as well.
+      if (BitCastInst *CI2 = dyn_cast<BitCastInst>(Op1))
+        Op1 = CI2->getOperand(0);
+
+      // If Op1 is a constant, we can fold the cast into the constant.
+      if (Op0->getType() != Op1->getType()) {
+        if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+          Op1 = ConstantExpr::getBitCast(Op1C, Op0->getType());
+        } else {
+          // Otherwise, cast the RHS right before the icmp
+          Op1 = Builder->CreateBitCast(Op1, Op0->getType());
+        }
+      }
+      return new ICmpInst(I.getPredicate(), Op0, Op1);
+    }
+  }
+
+  if (isa<CastInst>(Op0)) {
+    // Handle the special case of: icmp (cast bool to X), <cst>
+    // This comes up when you have code like
+    //   int X = A < B;
+    //   if (X) ...
+    // For generality, we handle any zero-extension of any operand comparison
+    // with a constant or another cast from the same type.
+    if (isa<Constant>(Op1) || isa<CastInst>(Op1))
+      if (Instruction *R = visitICmpInstWithCastAndCast(I))
+        return R;
+  }
+
+  // Special logic for binary operators.
+  BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
+  BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
+  if (BO0 || BO1) {
+    CmpInst::Predicate Pred = I.getPredicate();
+    bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
+    if (BO0 && isa<OverflowingBinaryOperator>(BO0))
+      NoOp0WrapProblem = ICmpInst::isEquality(Pred) ||
+        (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
+        (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
+    if (BO1 && isa<OverflowingBinaryOperator>(BO1))
+      NoOp1WrapProblem = ICmpInst::isEquality(Pred) ||
+        (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
+        (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
+
+    // Analyze the case when either Op0 or Op1 is an add instruction.
+    // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
+    Value *A = 0, *B = 0, *C = 0, *D = 0;
+    if (BO0 && BO0->getOpcode() == Instruction::Add)
+      A = BO0->getOperand(0), B = BO0->getOperand(1);
+    if (BO1 && BO1->getOpcode() == Instruction::Add)
+      C = BO1->getOperand(0), D = BO1->getOperand(1);
+
+    // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
+    if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
+      return new ICmpInst(Pred, A == Op1 ? B : A,
+                          Constant::getNullValue(Op1->getType()));
+
+    // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow.
+    if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
+      return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
+                          C == Op0 ? D : C);
+
+    // icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow.
+    if (A && C && (A == C || A == D || B == C || B == D) &&
+        NoOp0WrapProblem && NoOp1WrapProblem &&
+        // Try not to increase register pressure.
+        BO0->hasOneUse() && BO1->hasOneUse()) {
+      // Determine Y and Z in the form icmp (X+Y), (X+Z).
+      Value *Y, *Z;
+      if (A == C) {
+        // C + B == C + D  ->  B == D
+        Y = B;
+        Z = D;
+      } else if (A == D) {
+        // D + B == C + D  ->  B == C
+        Y = B;
+        Z = C;
+      } else if (B == C) {
+        // A + C == C + D  ->  A == D
+        Y = A;
+        Z = D;
+      } else {
+        assert(B == D);
+        // A + D == C + D  ->  A == C
+        Y = A;
+        Z = C;
+      }
+      return new ICmpInst(Pred, Y, Z);
+    }
+
+    // Analyze the case when either Op0 or Op1 is a sub instruction.
+    // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
+    A = 0; B = 0; C = 0; D = 0;
+    if (BO0 && BO0->getOpcode() == Instruction::Sub)
+      A = BO0->getOperand(0), B = BO0->getOperand(1);
+    if (BO1 && BO1->getOpcode() == Instruction::Sub)
+      C = BO1->getOperand(0), D = BO1->getOperand(1);
+
+    // icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow.
+    if (A == Op1 && NoOp0WrapProblem)
+      return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
+
+    // icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow.
+    if (C == Op0 && NoOp1WrapProblem)
+      return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
+
+    // icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow.
+    if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem &&
+        // Try not to increase register pressure.
+        BO0->hasOneUse() && BO1->hasOneUse())
+      return new ICmpInst(Pred, A, C);
+
+    // icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow.
+    if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem &&
+        // Try not to increase register pressure.
+        BO0->hasOneUse() && BO1->hasOneUse())
+      return new ICmpInst(Pred, D, B);
+
+    BinaryOperator *SRem = NULL;
+    // icmp (srem X, Y), Y
+    if (BO0 && BO0->getOpcode() == Instruction::SRem &&
+        Op1 == BO0->getOperand(1))
+      SRem = BO0;
+    // icmp Y, (srem X, Y)
+    else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
+             Op0 == BO1->getOperand(1))
+      SRem = BO1;
+    if (SRem) {
+      // We don't check hasOneUse to avoid increasing register pressure because
+      // the value we use is the same value this instruction was already using.
+      switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
+        default: break;
+        case ICmpInst::ICMP_EQ:
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+        case ICmpInst::ICMP_NE:
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+        case ICmpInst::ICMP_SGT:
+        case ICmpInst::ICMP_SGE:
+          return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
+                              Constant::getAllOnesValue(SRem->getType()));
+        case ICmpInst::ICMP_SLT:
+        case ICmpInst::ICMP_SLE:
+          return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
+                              Constant::getNullValue(SRem->getType()));
+      }
+    }
+
+    if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() &&
+        BO0->hasOneUse() && BO1->hasOneUse() &&
+        BO0->getOperand(1) == BO1->getOperand(1)) {
+      switch (BO0->getOpcode()) {
+      default: break;
+      case Instruction::Add:
+      case Instruction::Sub:
+      case Instruction::Xor:
+        if (I.isEquality())    // a+x icmp eq/ne b+x --> a icmp b
+          return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
+                              BO1->getOperand(0));
+        // icmp u/s (a ^ signbit), (b ^ signbit) --> icmp s/u a, b
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
+          if (CI->getValue().isSignBit()) {
+            ICmpInst::Predicate Pred = I.isSigned()
+                                           ? I.getUnsignedPredicate()
+                                           : I.getSignedPredicate();
+            return new ICmpInst(Pred, BO0->getOperand(0),
+                                BO1->getOperand(0));
+          }
+
+          if (CI->isMaxValue(true)) {
+            ICmpInst::Predicate Pred = I.isSigned()
+                                           ? I.getUnsignedPredicate()
+                                           : I.getSignedPredicate();
+            Pred = I.getSwappedPredicate(Pred);
+            return new ICmpInst(Pred, BO0->getOperand(0),
+                                BO1->getOperand(0));
+          }
+        }
+        break;
+      case Instruction::Mul:
+        if (!I.isEquality())
+          break;
+
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
+          // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask
+          // Mask = -1 >> count-trailing-zeros(Cst).
+          if (!CI->isZero() && !CI->isOne()) {
+            const APInt &AP = CI->getValue();
+            ConstantInt *Mask = ConstantInt::get(I.getContext(),
+                                    APInt::getLowBitsSet(AP.getBitWidth(),
+                                                         AP.getBitWidth() -
+                                                    AP.countTrailingZeros()));
+            Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
+            Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
+            return new ICmpInst(I.getPredicate(), And1, And2);
+          }
+        }
+        break;
+      case Instruction::UDiv:
+      case Instruction::LShr:
+        if (I.isSigned())
+          break;
+        // fall-through
+      case Instruction::SDiv:
+      case Instruction::AShr:
+        if (!BO0->isExact() || !BO1->isExact())
+          break;
+        return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
+                            BO1->getOperand(0));
+      case Instruction::Shl: {
+        bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
+        bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
+        if (!NUW && !NSW)
+          break;
+        if (!NSW && I.isSigned())
+          break;
+        return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
+                            BO1->getOperand(0));
+      }
+      }
+    }
+  }
+
+  { Value *A, *B;
+    // ~x < ~y --> y < x
+    // ~x < cst --> ~cst < x
+    if (match(Op0, m_Not(m_Value(A)))) {
+      if (match(Op1, m_Not(m_Value(B))))
+        return new ICmpInst(I.getPredicate(), B, A);
+      if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1))
+        return new ICmpInst(I.getPredicate(), ConstantExpr::getNot(RHSC), A);
+    }
+
+    // (a+b) <u a  --> llvm.uadd.with.overflow.
+    // (a+b) <u b  --> llvm.uadd.with.overflow.
+    if (I.getPredicate() == ICmpInst::ICMP_ULT &&
+        match(Op0, m_Add(m_Value(A), m_Value(B))) &&
+        (Op1 == A || Op1 == B))
+      if (Instruction *R = ProcessUAddIdiom(I, Op0, *this))
+        return R;
+
+    // a >u (a+b)  --> llvm.uadd.with.overflow.
+    // b >u (a+b)  --> llvm.uadd.with.overflow.
+    if (I.getPredicate() == ICmpInst::ICMP_UGT &&
+        match(Op1, m_Add(m_Value(A), m_Value(B))) &&
+        (Op0 == A || Op0 == B))
+      if (Instruction *R = ProcessUAddIdiom(I, Op1, *this))
+        return R;
+  }
+
+  if (I.isEquality()) {
+    Value *A, *B, *C, *D;
+
+    if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+      if (A == Op1 || B == Op1) {    // (A^B) == A  ->  B == 0
+        Value *OtherVal = A == Op1 ? B : A;
+        return new ICmpInst(I.getPredicate(), OtherVal,
+                            Constant::getNullValue(A->getType()));
+      }
+
+      if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
+        // A^c1 == C^c2 --> A == C^(c1^c2)
+        ConstantInt *C1, *C2;
+        if (match(B, m_ConstantInt(C1)) &&
+            match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) {
+          Constant *NC = ConstantInt::get(I.getContext(),
+                                          C1->getValue() ^ C2->getValue());
+          Value *Xor = Builder->CreateXor(C, NC);
+          return new ICmpInst(I.getPredicate(), A, Xor);
+        }
+
+        // A^B == A^D -> B == D
+        if (A == C) return new ICmpInst(I.getPredicate(), B, D);
+        if (A == D) return new ICmpInst(I.getPredicate(), B, C);
+        if (B == C) return new ICmpInst(I.getPredicate(), A, D);
+        if (B == D) return new ICmpInst(I.getPredicate(), A, C);
+      }
+    }
+
+    if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+        (A == Op0 || B == Op0)) {
+      // A == (A^B)  ->  B == 0
+      Value *OtherVal = A == Op0 ? B : A;
+      return new ICmpInst(I.getPredicate(), OtherVal,
+                          Constant::getNullValue(A->getType()));
+    }
+
+    // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
+    if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
+        match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
+      Value *X = 0, *Y = 0, *Z = 0;
+
+      if (A == C) {
+        X = B; Y = D; Z = A;
+      } else if (A == D) {
+        X = B; Y = C; Z = A;
+      } else if (B == C) {
+        X = A; Y = D; Z = B;
+      } else if (B == D) {
+        X = A; Y = C; Z = B;
+      }
+
+      if (X) {   // Build (X^Y) & Z
+        Op1 = Builder->CreateXor(X, Y);
+        Op1 = Builder->CreateAnd(Op1, Z);
+        I.setOperand(0, Op1);
+        I.setOperand(1, Constant::getNullValue(Op1->getType()));
+        return &I;
+      }
+    }
+
+    // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
+    // and       (B & (1<<X)-1) == (zext A) --> A == (trunc B)
+    ConstantInt *Cst1;
+    if ((Op0->hasOneUse() &&
+         match(Op0, m_ZExt(m_Value(A))) &&
+         match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
+        (Op1->hasOneUse() &&
+         match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
+         match(Op1, m_ZExt(m_Value(A))))) {
+      APInt Pow2 = Cst1->getValue() + 1;
+      if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
+          Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
+        return new ICmpInst(I.getPredicate(), A,
+                            Builder->CreateTrunc(B, A->getType()));
+    }
+
+    // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
+    // "icmp (and X, mask), cst"
+    uint64_t ShAmt = 0;
+    if (Op0->hasOneUse() &&
+        match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A),
+                                           m_ConstantInt(ShAmt))))) &&
+        match(Op1, m_ConstantInt(Cst1)) &&
+        // Only do this when A has multiple uses.  This is most important to do
+        // when it exposes other optimizations.
+        !A->hasOneUse()) {
+      unsigned ASize =cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
+
+      if (ShAmt < ASize) {
+        APInt MaskV =
+          APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
+        MaskV <<= ShAmt;
+
+        APInt CmpV = Cst1->getValue().zext(ASize);
+        CmpV <<= ShAmt;
+
+        Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV));
+        return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV));
+      }
+    }
+  }
+
+  {
+    Value *X; ConstantInt *Cst;
+    // icmp X+Cst, X
+    if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X)
+      return FoldICmpAddOpCst(I, X, Cst, I.getPredicate(), Op0);
+
+    // icmp X, X+Cst
+    if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
+      return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate(), Op1);
+  }
+  return Changed ? &I : 0;
+}
+
+
+
+
+
+
+/// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
+///
+Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
+                                                Instruction *LHSI,
+                                                Constant *RHSC) {
+  if (!isa<ConstantFP>(RHSC)) return 0;
+  const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
+
+  // Get the width of the mantissa.  We don't want to hack on conversions that
+  // might lose information from the integer, e.g. "i64 -> float"
+  int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
+  if (MantissaWidth == -1) return 0;  // Unknown.
+
+  // Check to see that the input is converted from an integer type that is small
+  // enough that preserves all bits.  TODO: check here for "known" sign bits.
+  // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
+  unsigned InputSize = LHSI->getOperand(0)->getType()->getScalarSizeInBits();
+
+  // If this is a uitofp instruction, we need an extra bit to hold the sign.
+  bool LHSUnsigned = isa<UIToFPInst>(LHSI);
+  if (LHSUnsigned)
+    ++InputSize;
+
+  // If the conversion would lose info, don't hack on this.
+  if ((int)InputSize > MantissaWidth)
+    return 0;
+
+  // Otherwise, we can potentially simplify the comparison.  We know that it
+  // will always come through as an integer value and we know the constant is
+  // not a NAN (it would have been previously simplified).
+  assert(!RHS.isNaN() && "NaN comparison not already folded!");
+
+  ICmpInst::Predicate Pred;
+  switch (I.getPredicate()) {
+  default: llvm_unreachable("Unexpected predicate!");
+  case FCmpInst::FCMP_UEQ:
+  case FCmpInst::FCMP_OEQ:
+    Pred = ICmpInst::ICMP_EQ;
+    break;
+  case FCmpInst::FCMP_UGT:
+  case FCmpInst::FCMP_OGT:
+    Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
+    break;
+  case FCmpInst::FCMP_UGE:
+  case FCmpInst::FCMP_OGE:
+    Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
+    break;
+  case FCmpInst::FCMP_ULT:
+  case FCmpInst::FCMP_OLT:
+    Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
+    break;
+  case FCmpInst::FCMP_ULE:
+  case FCmpInst::FCMP_OLE:
+    Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
+    break;
+  case FCmpInst::FCMP_UNE:
+  case FCmpInst::FCMP_ONE:
+    Pred = ICmpInst::ICMP_NE;
+    break;
+  case FCmpInst::FCMP_ORD:
+    return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+  case FCmpInst::FCMP_UNO:
+    return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+  }
+
+  IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
+
+  // Now we know that the APFloat is a normal number, zero or inf.
+
+  // See if the FP constant is too large for the integer.  For example,
+  // comparing an i8 to 300.0.
+  unsigned IntWidth = IntTy->getScalarSizeInBits();
+
+  if (!LHSUnsigned) {
+    // If the RHS value is > SignedMax, fold the comparison.  This handles +INF
+    // and large values.
+    APFloat SMax(RHS.getSemantics(), APFloat::fcZero, false);
+    SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
+                          APFloat::rmNearestTiesToEven);
+    if (SMax.compare(RHS) == APFloat::cmpLessThan) {  // smax < 13123.0
+      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_SLT ||
+          Pred == ICmpInst::ICMP_SLE)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+    }
+  } else {
+    // If the RHS value is > UnsignedMax, fold the comparison. This handles
+    // +INF and large values.
+    APFloat UMax(RHS.getSemantics(), APFloat::fcZero, false);
+    UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
+                          APFloat::rmNearestTiesToEven);
+    if (UMax.compare(RHS) == APFloat::cmpLessThan) {  // umax < 13123.0
+      if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_ULT ||
+          Pred == ICmpInst::ICMP_ULE)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+    }
+  }
+
+  if (!LHSUnsigned) {
+    // See if the RHS value is < SignedMin.
+    APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false);
+    SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
+                          APFloat::rmNearestTiesToEven);
+    if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
+      if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
+          Pred == ICmpInst::ICMP_SGE)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+    }
+  } else {
+    // See if the RHS value is < UnsignedMin.
+    APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false);
+    SMin.convertFromAPInt(APInt::getMinValue(IntWidth), true,
+                          APFloat::rmNearestTiesToEven);
+    if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0
+      if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
+          Pred == ICmpInst::ICMP_UGE)
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+    }
+  }
+
+  // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
+  // [0, UMAX], but it may still be fractional.  See if it is fractional by
+  // casting the FP value to the integer value and back, checking for equality.
+  // Don't do this for zero, because -0.0 is not fractional.
+  Constant *RHSInt = LHSUnsigned
+    ? ConstantExpr::getFPToUI(RHSC, IntTy)
+    : ConstantExpr::getFPToSI(RHSC, IntTy);
+  if (!RHS.isZero()) {
+    bool Equal = LHSUnsigned
+      ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
+      : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
+    if (!Equal) {
+      // If we had a comparison against a fractional value, we have to adjust
+      // the compare predicate and sometimes the value.  RHSC is rounded towards
+      // zero at this point.
+      switch (Pred) {
+      default: llvm_unreachable("Unexpected integer comparison!");
+      case ICmpInst::ICMP_NE:  // (float)int != 4.4   --> true
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+      case ICmpInst::ICMP_EQ:  // (float)int == 4.4   --> false
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+      case ICmpInst::ICMP_ULE:
+        // (float)int <= 4.4   --> int <= 4
+        // (float)int <= -4.4  --> false
+        if (RHS.isNegative())
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+        break;
+      case ICmpInst::ICMP_SLE:
+        // (float)int <= 4.4   --> int <= 4
+        // (float)int <= -4.4  --> int < -4
+        if (RHS.isNegative())
+          Pred = ICmpInst::ICMP_SLT;
+        break;
+      case ICmpInst::ICMP_ULT:
+        // (float)int < -4.4   --> false
+        // (float)int < 4.4    --> int <= 4
+        if (RHS.isNegative())
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+        Pred = ICmpInst::ICMP_ULE;
+        break;
+      case ICmpInst::ICMP_SLT:
+        // (float)int < -4.4   --> int < -4
+        // (float)int < 4.4    --> int <= 4
+        if (!RHS.isNegative())
+          Pred = ICmpInst::ICMP_SLE;
+        break;
+      case ICmpInst::ICMP_UGT:
+        // (float)int > 4.4    --> int > 4
+        // (float)int > -4.4   --> true
+        if (RHS.isNegative())
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+        break;
+      case ICmpInst::ICMP_SGT:
+        // (float)int > 4.4    --> int > 4
+        // (float)int > -4.4   --> int >= -4
+        if (RHS.isNegative())
+          Pred = ICmpInst::ICMP_SGE;
+        break;
+      case ICmpInst::ICMP_UGE:
+        // (float)int >= -4.4   --> true
+        // (float)int >= 4.4    --> int > 4
+        if (RHS.isNegative())
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+        Pred = ICmpInst::ICMP_UGT;
+        break;
+      case ICmpInst::ICMP_SGE:
+        // (float)int >= -4.4   --> int >= -4
+        // (float)int >= 4.4    --> int > 4
+        if (!RHS.isNegative())
+          Pred = ICmpInst::ICMP_SGT;
+        break;
+      }
+    }
+  }
+
+  // Lower this FP comparison into an appropriate integer version of the
+  // comparison.
+  return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
+}
+
+Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
+  bool Changed = false;
+
+  /// Orders the operands of the compare so that they are listed from most
+  /// complex to least complex.  This puts constants before unary operators,
+  /// before binary operators.
+  if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
+    I.swapOperands();
+    Changed = true;
+  }
+
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // Simplify 'fcmp pred X, X'
+  if (Op0 == Op1) {
+    switch (I.getPredicate()) {
+    default: llvm_unreachable("Unknown predicate!");
+    case FCmpInst::FCMP_UNO:    // True if unordered: isnan(X) | isnan(Y)
+    case FCmpInst::FCMP_ULT:    // True if unordered or less than
+    case FCmpInst::FCMP_UGT:    // True if unordered or greater than
+    case FCmpInst::FCMP_UNE:    // True if unordered or not equal
+      // Canonicalize these to be 'fcmp uno %X, 0.0'.
+      I.setPredicate(FCmpInst::FCMP_UNO);
+      I.setOperand(1, Constant::getNullValue(Op0->getType()));
+      return &I;
+
+    case FCmpInst::FCMP_ORD:    // True if ordered (no nans)
+    case FCmpInst::FCMP_OEQ:    // True if ordered and equal
+    case FCmpInst::FCMP_OGE:    // True if ordered and greater than or equal
+    case FCmpInst::FCMP_OLE:    // True if ordered and less than or equal
+      // Canonicalize these to be 'fcmp ord %X, 0.0'.
+      I.setPredicate(FCmpInst::FCMP_ORD);
+      I.setOperand(1, Constant::getNullValue(Op0->getType()));
+      return &I;
+    }
+  }
+
+  // Handle fcmp with constant RHS
+  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      switch (LHSI->getOpcode()) {
+      case Instruction::FPExt: {
+        // fcmp (fpext x), C -> fcmp x, (fptrunc C) if fptrunc is lossless
+        FPExtInst *LHSExt = cast<FPExtInst>(LHSI);
+        ConstantFP *RHSF = dyn_cast<ConstantFP>(RHSC);
+        if (!RHSF)
+          break;
+
+        const fltSemantics *Sem;
+        // FIXME: This shouldn't be here.
+        if (LHSExt->getSrcTy()->isHalfTy())
+          Sem = &APFloat::IEEEhalf;
+        else if (LHSExt->getSrcTy()->isFloatTy())
+          Sem = &APFloat::IEEEsingle;
+        else if (LHSExt->getSrcTy()->isDoubleTy())
+          Sem = &APFloat::IEEEdouble;
+        else if (LHSExt->getSrcTy()->isFP128Ty())
+          Sem = &APFloat::IEEEquad;
+        else if (LHSExt->getSrcTy()->isX86_FP80Ty())
+          Sem = &APFloat::x87DoubleExtended;
+        else if (LHSExt->getSrcTy()->isPPC_FP128Ty())
+          Sem = &APFloat::PPCDoubleDouble;
+        else
+          break;
+
+        bool Lossy;
+        APFloat F = RHSF->getValueAPF();
+        F.convert(*Sem, APFloat::rmNearestTiesToEven, &Lossy);
+
+        // Avoid lossy conversions and denormals. Zero is a special case
+        // that's OK to convert.
+        APFloat Fabs = F;
+        Fabs.clearSign();
+        if (!Lossy &&
+            ((Fabs.compare(APFloat::getSmallestNormalized(*Sem)) !=
+                 APFloat::cmpLessThan) || Fabs.isZero()))
+
+          return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0),
+                              ConstantFP::get(RHSC->getContext(), F));
+        break;
+      }
+      case Instruction::PHI:
+        // Only fold fcmp into the PHI if the phi and fcmp are in the same
+        // block.  If in the same block, we're encouraging jump threading.  If
+        // not, we are just pessimizing the code by making an i1 phi.
+        if (LHSI->getParent() == I.getParent())
+          if (Instruction *NV = FoldOpIntoPhi(I))
+            return NV;
+        break;
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+        if (Instruction *NV = FoldFCmp_IntToFP_Cst(I, LHSI, RHSC))
+          return NV;
+        break;
+      case Instruction::Select: {
+        // If either operand of the select is a constant, we can fold the
+        // comparison into the select arms, which will cause one to be
+        // constant folded and the select turned into a bitwise or.
+        Value *Op1 = 0, *Op2 = 0;
+        if (LHSI->hasOneUse()) {
+          if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+            // Fold the known value into the constant operand.
+            Op1 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+            // Insert a new FCmp of the other select operand.
+            Op2 = Builder->CreateFCmp(I.getPredicate(),
+                                      LHSI->getOperand(2), RHSC, I.getName());
+          } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+            // Fold the known value into the constant operand.
+            Op2 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+            // Insert a new FCmp of the other select operand.
+            Op1 = Builder->CreateFCmp(I.getPredicate(), LHSI->getOperand(1),
+                                      RHSC, I.getName());
+          }
+        }
+
+        if (Op1)
+          return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+        break;
+      }
+      case Instruction::FSub: {
+        // fcmp pred (fneg x), C -> fcmp swap(pred) x, -C
+        Value *Op;
+        if (match(LHSI, m_FNeg(m_Value(Op))))
+          return new FCmpInst(I.getSwappedPredicate(), Op,
+                              ConstantExpr::getFNeg(RHSC));
+        break;
+      }
+      case Instruction::Load:
+        if (GetElementPtrInst *GEP =
+            dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
+          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+            if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+                !cast<LoadInst>(LHSI)->isVolatile())
+              if (Instruction *Res = FoldCmpLoadFromIndexedGlobal(GEP, GV, I))
+                return Res;
+        }
+        break;
+      case Instruction::Call: {
+        CallInst *CI = cast<CallInst>(LHSI);
+        LibFunc::Func Func;
+        // Various optimization for fabs compared with zero.
+        if (RHSC->isNullValue() && CI->getCalledFunction() &&
+            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
+            TLI->has(Func)) {
+          if (Func == LibFunc::fabs || Func == LibFunc::fabsf ||
+              Func == LibFunc::fabsl) {
+            switch (I.getPredicate()) {
+            default: break;
+            // fabs(x) < 0 --> false
+            case FCmpInst::FCMP_OLT:
+              return ReplaceInstUsesWith(I, Builder->getFalse());
+            // fabs(x) > 0 --> x != 0
+            case FCmpInst::FCMP_OGT:
+              return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0),
+                                  RHSC);
+            // fabs(x) <= 0 --> x == 0
+            case FCmpInst::FCMP_OLE:
+              return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0),
+                                  RHSC);
+            // fabs(x) >= 0 --> !isnan(x)
+            case FCmpInst::FCMP_OGE:
+              return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0),
+                                  RHSC);
+            // fabs(x) == 0 --> x == 0
+            // fabs(x) != 0 --> x != 0
+            case FCmpInst::FCMP_OEQ:
+            case FCmpInst::FCMP_UEQ:
+            case FCmpInst::FCMP_ONE:
+            case FCmpInst::FCMP_UNE:
+              return new FCmpInst(I.getPredicate(), CI->getArgOperand(0),
+                                  RHSC);
+            }
+          }
+        }
+      }
+      }
+  }
+
+  // fcmp pred (fneg x), (fneg y) -> fcmp swap(pred) x, y
+  Value *X, *Y;
+  if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
+    return new FCmpInst(I.getSwappedPredicate(), X, Y);
+
+  // fcmp (fpext x), (fpext y) -> fcmp x, y
+  if (FPExtInst *LHSExt = dyn_cast<FPExtInst>(Op0))
+    if (FPExtInst *RHSExt = dyn_cast<FPExtInst>(Op1))
+      if (LHSExt->getSrcTy() == RHSExt->getSrcTy())
+        return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0),
+                            RHSExt->getOperand(0));
+
+  return Changed ? &I : 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
new file mode 100644
index 000000000000..337cfe32a869
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -0,0 +1,816 @@
+//===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for load, store and alloca.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumDeadStore,    "Number of dead stores eliminated");
+STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
+
+/// pointsToConstantGlobal - Return true if V (possibly indirectly) points to
+/// some part of a constant global variable.  This intentionally only accepts
+/// constant expressions because we can't rewrite arbitrary instructions.
+static bool pointsToConstantGlobal(Value *V) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return GV->isConstant();
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::BitCast ||
+        CE->getOpcode() == Instruction::GetElementPtr)
+      return pointsToConstantGlobal(CE->getOperand(0));
+  return false;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
+/// pointer to an alloca.  Ignore any reads of the pointer, return false if we
+/// see any stores or other unknown uses.  If we see pointer arithmetic, keep
+/// track of whether it moves the pointer (with IsOffset) but otherwise traverse
+/// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
+/// the alloca, and if the source pointer is a pointer to a constant global, we
+/// can optimize this.
+static bool
+isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
+                               SmallVectorImpl<Instruction *> &ToDelete,
+                               bool IsOffset = false) {
+  // We track lifetime intrinsics as we encounter them.  If we decide to go
+  // ahead and replace the value with the global, this lets the caller quickly
+  // eliminate the markers.
+
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+    User *U = cast<Instruction>(*UI);
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      // Ignore non-volatile loads, they are always ok.
+      if (!LI->isSimple()) return false;
+      continue;
+    }
+
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      // If uses of the bitcast are ok, we are ok.
+      if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, ToDelete, IsOffset))
+        return false;
+      continue;
+    }
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      // If the GEP has all zero indices, it doesn't offset the pointer.  If it
+      // doesn't, it does.
+      if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy, ToDelete,
+                                          IsOffset || !GEP->hasAllZeroIndices()))
+        return false;
+      continue;
+    }
+
+    if (CallSite CS = U) {
+      // If this is the function being called then we treat it like a load and
+      // ignore it.
+      if (CS.isCallee(UI))
+        continue;
+
+      // If this is a readonly/readnone call site, then we know it is just a
+      // load (but one that potentially returns the value itself), so we can
+      // ignore it if we know that the value isn't captured.
+      unsigned ArgNo = CS.getArgumentNo(UI);
+      if (CS.onlyReadsMemory() &&
+          (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo)))
+        continue;
+
+      // If this is being passed as a byval argument, the caller is making a
+      // copy, so it is only a read of the alloca.
+      if (CS.isByValArgument(ArgNo))
+        continue;
+    }
+
+    // Lifetime intrinsics can be handled by the caller.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        assert(II->use_empty() && "Lifetime markers have no result to use!");
+        ToDelete.push_back(II);
+        continue;
+      }
+    }
+
+    // If this is isn't our memcpy/memmove, reject it as something we can't
+    // handle.
+    MemTransferInst *MI = dyn_cast<MemTransferInst>(U);
+    if (MI == 0)
+      return false;
+
+    // If the transfer is using the alloca as a source of the transfer, then
+    // ignore it since it is a load (unless the transfer is volatile).
+    if (UI.getOperandNo() == 1) {
+      if (MI->isVolatile()) return false;
+      continue;
+    }
+
+    // If we already have seen a copy, reject the second one.
+    if (TheCopy) return false;
+
+    // If the pointer has been offset from the start of the alloca, we can't
+    // safely handle this.
+    if (IsOffset) return false;
+
+    // If the memintrinsic isn't using the alloca as the dest, reject it.
+    if (UI.getOperandNo() != 0) return false;
+
+    // If the source of the memcpy/move is not a constant global, reject it.
+    if (!pointsToConstantGlobal(MI->getSource()))
+      return false;
+
+    // Otherwise, the transform is safe.  Remember the copy instruction.
+    TheCopy = MI;
+  }
+  return true;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
+/// modified by a copy from a constant global.  If we can prove this, we can
+/// replace any uses of the alloca with uses of the global directly.
+static MemTransferInst *
+isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
+                               SmallVectorImpl<Instruction *> &ToDelete) {
+  MemTransferInst *TheCopy = 0;
+  if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
+    return TheCopy;
+  return 0;
+}
+
+Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
+  // Ensure that the alloca array size argument has type intptr_t, so that
+  // any casting is exposed early.
+  if (TD) {
+    Type *IntPtrTy = TD->getIntPtrType(AI.getContext());
+    if (AI.getArraySize()->getType() != IntPtrTy) {
+      Value *V = Builder->CreateIntCast(AI.getArraySize(),
+                                        IntPtrTy, false);
+      AI.setOperand(0, V);
+      return &AI;
+    }
+  }
+
+  // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
+  if (AI.isArrayAllocation()) {  // Check C != 1
+    if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
+      Type *NewTy = 
+        ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
+      AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName());
+      New->setAlignment(AI.getAlignment());
+
+      // Scan to the end of the allocation instructions, to skip over a block of
+      // allocas if possible...also skip interleaved debug info
+      //
+      BasicBlock::iterator It = New;
+      while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It)) ++It;
+
+      // Now that I is pointing to the first non-allocation-inst in the block,
+      // insert our getelementptr instruction...
+      //
+      Value *NullIdx =Constant::getNullValue(Type::getInt32Ty(AI.getContext()));
+      Value *Idx[2];
+      Idx[0] = NullIdx;
+      Idx[1] = NullIdx;
+      Instruction *GEP =
+           GetElementPtrInst::CreateInBounds(New, Idx, New->getName()+".sub");
+      InsertNewInstBefore(GEP, *It);
+
+      // Now make everything use the getelementptr instead of the original
+      // allocation.
+      return ReplaceInstUsesWith(AI, GEP);
+    } else if (isa<UndefValue>(AI.getArraySize())) {
+      return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+    }
+  }
+
+  if (TD && AI.getAllocatedType()->isSized()) {
+    // If the alignment is 0 (unspecified), assign it the preferred alignment.
+    if (AI.getAlignment() == 0)
+      AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType()));
+
+    // Move all alloca's of zero byte objects to the entry block and merge them
+    // together.  Note that we only do this for alloca's, because malloc should
+    // allocate and return a unique pointer, even for a zero byte allocation.
+    if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) {
+      // For a zero sized alloca there is no point in doing an array allocation.
+      // This is helpful if the array size is a complicated expression not used
+      // elsewhere.
+      if (AI.isArrayAllocation()) {
+        AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1));
+        return &AI;
+      }
+
+      // Get the first instruction in the entry block.
+      BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
+      Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
+      if (FirstInst != &AI) {
+        // If the entry block doesn't start with a zero-size alloca then move
+        // this one to the start of the entry block.  There is no problem with
+        // dominance as the array size was forced to a constant earlier already.
+        AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
+        if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
+            TD->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
+          AI.moveBefore(FirstInst);
+          return &AI;
+        }
+
+        // If the alignment of the entry block alloca is 0 (unspecified),
+        // assign it the preferred alignment.
+        if (EntryAI->getAlignment() == 0)
+          EntryAI->setAlignment(
+            TD->getPrefTypeAlignment(EntryAI->getAllocatedType()));
+        // Replace this zero-sized alloca with the one at the start of the entry
+        // block after ensuring that the address will be aligned enough for both
+        // types.
+        unsigned MaxAlign = std::max(EntryAI->getAlignment(),
+                                     AI.getAlignment());
+        EntryAI->setAlignment(MaxAlign);
+        if (AI.getType() != EntryAI->getType())
+          return new BitCastInst(EntryAI, AI.getType());
+        return ReplaceInstUsesWith(AI, EntryAI);
+      }
+    }
+  }
+
+  if (AI.getAlignment()) {
+    // Check to see if this allocation is only modified by a memcpy/memmove from
+    // a constant global whose alignment is equal to or exceeds that of the
+    // allocation.  If this is the case, we can change all users to use
+    // the constant global instead.  This is commonly produced by the CFE by
+    // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
+    // is only subsequently read.
+    SmallVector<Instruction *, 4> ToDelete;
+    if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
+      unsigned SourceAlign = getOrEnforceKnownAlignment(Copy->getSource(),
+                                                        AI.getAlignment(), TD);
+      if (AI.getAlignment() <= SourceAlign) {
+        DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
+        DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
+        for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
+          EraseInstFromFunction(*ToDelete[i]);
+        Constant *TheSrc = cast<Constant>(Copy->getSource());
+        Instruction *NewI
+          = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc,
+                                                             AI.getType()));
+        EraseInstFromFunction(*Copy);
+        ++NumGlobalCopies;
+        return NewI;
+      }
+    }
+  }
+
+  // At last, use the generic allocation site handler to aggressively remove
+  // unused allocas.
+  return visitAllocSite(AI);
+}
+
+
+/// InstCombineLoadCast - Fold 'load (cast P)' -> cast (load P)' when possible.
+static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
+                                        const DataLayout *TD) {
+  User *CI = cast<User>(LI.getOperand(0));
+  Value *CastOp = CI->getOperand(0);
+
+  PointerType *DestTy = cast<PointerType>(CI->getType());
+  Type *DestPTy = DestTy->getElementType();
+  if (PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
+
+    // If the address spaces don't match, don't eliminate the cast.
+    if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
+      return 0;
+
+    Type *SrcPTy = SrcTy->getElementType();
+
+    if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() || 
+         DestPTy->isVectorTy()) {
+      // If the source is an array, the code below will not succeed.  Check to
+      // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
+      // constants.
+      if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
+        if (Constant *CSrc = dyn_cast<Constant>(CastOp))
+          if (ASrcTy->getNumElements() != 0) {
+            Value *Idxs[2];
+            Idxs[0] = Constant::getNullValue(Type::getInt32Ty(LI.getContext()));
+            Idxs[1] = Idxs[0];
+            CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs);
+            SrcTy = cast<PointerType>(CastOp->getType());
+            SrcPTy = SrcTy->getElementType();
+          }
+
+      if (IC.getDataLayout() &&
+          (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() || 
+            SrcPTy->isVectorTy()) &&
+          // Do not allow turning this into a load of an integer, which is then
+          // casted to a pointer, this pessimizes pointer analysis a lot.
+          (SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) &&
+          IC.getDataLayout()->getTypeSizeInBits(SrcPTy) ==
+               IC.getDataLayout()->getTypeSizeInBits(DestPTy)) {
+
+        // Okay, we are casting from one integer or pointer type to another of
+        // the same size.  Instead of casting the pointer before the load, cast
+        // the result of the loaded value.
+        LoadInst *NewLoad = 
+          IC.Builder->CreateLoad(CastOp, LI.isVolatile(), CI->getName());
+        NewLoad->setAlignment(LI.getAlignment());
+        NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope());
+        // Now cast the result of the load.
+        return new BitCastInst(NewLoad, LI.getType());
+      }
+    }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
+  Value *Op = LI.getOperand(0);
+
+  // Attempt to improve the alignment.
+  if (TD) {
+    unsigned KnownAlign =
+      getOrEnforceKnownAlignment(Op, TD->getPrefTypeAlignment(LI.getType()),TD);
+    unsigned LoadAlign = LI.getAlignment();
+    unsigned EffectiveLoadAlign = LoadAlign != 0 ? LoadAlign :
+      TD->getABITypeAlignment(LI.getType());
+
+    if (KnownAlign > EffectiveLoadAlign)
+      LI.setAlignment(KnownAlign);
+    else if (LoadAlign == 0)
+      LI.setAlignment(EffectiveLoadAlign);
+  }
+
+  // load (cast X) --> cast (load X) iff safe.
+  if (isa<CastInst>(Op))
+    if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+      return Res;
+
+  // None of the following transforms are legal for volatile/atomic loads.
+  // FIXME: Some of it is okay for atomic loads; needs refactoring.
+  if (!LI.isSimple()) return 0;
+  
+  // Do really simple store-to-load forwarding and load CSE, to catch cases
+  // where there are several consecutive memory accesses to the same location,
+  // separated by a few arithmetic operations.
+  BasicBlock::iterator BBI = &LI;
+  if (Value *AvailableVal = FindAvailableLoadedValue(Op, LI.getParent(), BBI,6))
+    return ReplaceInstUsesWith(LI, AvailableVal);
+
+  // load(gep null, ...) -> unreachable
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+    const Value *GEPI0 = GEPI->getOperand(0);
+    // TODO: Consider a target hook for valid address spaces for this xform.
+    if (isa<ConstantPointerNull>(GEPI0) && GEPI->getPointerAddressSpace() == 0){
+      // Insert a new store to null instruction before the load to indicate
+      // that this code is not reachable.  We do this instead of inserting
+      // an unreachable instruction directly because we cannot modify the
+      // CFG.
+      new StoreInst(UndefValue::get(LI.getType()),
+                    Constant::getNullValue(Op->getType()), &LI);
+      return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+    }
+  } 
+
+  // load null/undef -> unreachable
+  // TODO: Consider a target hook for valid address spaces for this xform.
+  if (isa<UndefValue>(Op) ||
+      (isa<ConstantPointerNull>(Op) && LI.getPointerAddressSpace() == 0)) {
+    // Insert a new store to null instruction before the load to indicate that
+    // this code is not reachable.  We do this instead of inserting an
+    // unreachable instruction directly because we cannot modify the CFG.
+    new StoreInst(UndefValue::get(LI.getType()),
+                  Constant::getNullValue(Op->getType()), &LI);
+    return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+  }
+
+  // Instcombine load (constantexpr_cast global) -> cast (load global)
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op))
+    if (CE->isCast())
+      if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
+        return Res;
+  
+  if (Op->hasOneUse()) {
+    // Change select and PHI nodes to select values instead of addresses: this
+    // helps alias analysis out a lot, allows many others simplifications, and
+    // exposes redundancy in the code.
+    //
+    // Note that we cannot do the transformation unless we know that the
+    // introduced loads cannot trap!  Something like this is valid as long as
+    // the condition is always false: load (select bool %C, int* null, int* %G),
+    // but it would not be valid if we transformed it to load from null
+    // unconditionally.
+    //
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
+      // load (select (Cond, &V1, &V2))  --> select(Cond, load &V1, load &V2).
+      unsigned Align = LI.getAlignment();
+      if (isSafeToLoadUnconditionally(SI->getOperand(1), SI, Align, TD) &&
+          isSafeToLoadUnconditionally(SI->getOperand(2), SI, Align, TD)) {
+        LoadInst *V1 = Builder->CreateLoad(SI->getOperand(1),
+                                           SI->getOperand(1)->getName()+".val");
+        LoadInst *V2 = Builder->CreateLoad(SI->getOperand(2),
+                                           SI->getOperand(2)->getName()+".val");
+        V1->setAlignment(Align);
+        V2->setAlignment(Align);
+        return SelectInst::Create(SI->getCondition(), V1, V2);
+      }
+
+      // load (select (cond, null, P)) -> load P
+      if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
+        if (C->isNullValue()) {
+          LI.setOperand(0, SI->getOperand(2));
+          return &LI;
+        }
+
+      // load (select (cond, P, null)) -> load P
+      if (Constant *C = dyn_cast<Constant>(SI->getOperand(2)))
+        if (C->isNullValue()) {
+          LI.setOperand(0, SI->getOperand(1));
+          return &LI;
+        }
+    }
+  }
+  return 0;
+}
+
+/// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P
+/// when possible.  This makes it generally easy to do alias analysis and/or
+/// SROA/mem2reg of the memory object.
+static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
+  User *CI = cast<User>(SI.getOperand(1));
+  Value *CastOp = CI->getOperand(0);
+
+  Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
+  PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
+  if (SrcTy == 0) return 0;
+  
+  Type *SrcPTy = SrcTy->getElementType();
+
+  if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy())
+    return 0;
+  
+  /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep"
+  /// to its first element.  This allows us to handle things like:
+  ///   store i32 xxx, (bitcast {foo*, float}* %P to i32*)
+  /// on 32-bit hosts.
+  SmallVector<Value*, 4> NewGEPIndices;
+  
+  // If the source is an array, the code below will not succeed.  Check to
+  // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
+  // constants.
+  if (SrcPTy->isArrayTy() || SrcPTy->isStructTy()) {
+    // Index through pointer.
+    Constant *Zero = Constant::getNullValue(Type::getInt32Ty(SI.getContext()));
+    NewGEPIndices.push_back(Zero);
+    
+    while (1) {
+      if (StructType *STy = dyn_cast<StructType>(SrcPTy)) {
+        if (!STy->getNumElements()) /* Struct can be empty {} */
+          break;
+        NewGEPIndices.push_back(Zero);
+        SrcPTy = STy->getElementType(0);
+      } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) {
+        NewGEPIndices.push_back(Zero);
+        SrcPTy = ATy->getElementType();
+      } else {
+        break;
+      }
+    }
+    
+    SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace());
+  }
+
+  if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy())
+    return 0;
+  
+  // If the pointers point into different address spaces or if they point to
+  // values with different sizes, we can't do the transformation.
+  if (!IC.getDataLayout() ||
+      SrcTy->getAddressSpace() != 
+        cast<PointerType>(CI->getType())->getAddressSpace() ||
+      IC.getDataLayout()->getTypeSizeInBits(SrcPTy) !=
+      IC.getDataLayout()->getTypeSizeInBits(DestPTy))
+    return 0;
+
+  // Okay, we are casting from one integer or pointer type to another of
+  // the same size.  Instead of casting the pointer before 
+  // the store, cast the value to be stored.
+  Value *NewCast;
+  Value *SIOp0 = SI.getOperand(0);
+  Instruction::CastOps opcode = Instruction::BitCast;
+  Type* CastSrcTy = SIOp0->getType();
+  Type* CastDstTy = SrcPTy;
+  if (CastDstTy->isPointerTy()) {
+    if (CastSrcTy->isIntegerTy())
+      opcode = Instruction::IntToPtr;
+  } else if (CastDstTy->isIntegerTy()) {
+    if (SIOp0->getType()->isPointerTy())
+      opcode = Instruction::PtrToInt;
+  }
+  
+  // SIOp0 is a pointer to aggregate and this is a store to the first field,
+  // emit a GEP to index into its first field.
+  if (!NewGEPIndices.empty())
+    CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices);
+  
+  NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy,
+                                   SIOp0->getName()+".c");
+  SI.setOperand(0, NewCast);
+  SI.setOperand(1, CastOp);
+  return &SI;
+}
+
+/// equivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+///   %t0 = getelementptr \@a, 0, 3
+///   store i32 0, i32* %t0
+///   %t1 = getelementptr \@a, 0, 3
+///   %t2 = load i32* %t1
+///
+static bool equivalentAddressValues(Value *A, Value *B) {
+  // Test if the values are trivially equivalent.
+  if (A == B) return true;
+  
+  // Test if the values come form identical arithmetic instructions.
+  // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
+  // its only used to compare two uses within the same basic block, which
+  // means that they'll always either have the same value or one of them
+  // will have an undefined value.
+  if (isa<BinaryOperator>(A) ||
+      isa<CastInst>(A) ||
+      isa<PHINode>(A) ||
+      isa<GetElementPtrInst>(A))
+    if (Instruction *BI = dyn_cast<Instruction>(B))
+      if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
+        return true;
+  
+  // Otherwise they may not be equivalent.
+  return false;
+}
+
+Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
+  Value *Val = SI.getOperand(0);
+  Value *Ptr = SI.getOperand(1);
+
+  // Attempt to improve the alignment.
+  if (TD) {
+    unsigned KnownAlign =
+      getOrEnforceKnownAlignment(Ptr, TD->getPrefTypeAlignment(Val->getType()),
+                                 TD);
+    unsigned StoreAlign = SI.getAlignment();
+    unsigned EffectiveStoreAlign = StoreAlign != 0 ? StoreAlign :
+      TD->getABITypeAlignment(Val->getType());
+
+    if (KnownAlign > EffectiveStoreAlign)
+      SI.setAlignment(KnownAlign);
+    else if (StoreAlign == 0)
+      SI.setAlignment(EffectiveStoreAlign);
+  }
+
+  // Don't hack volatile/atomic stores.
+  // FIXME: Some bits are legal for atomic stores; needs refactoring.
+  if (!SI.isSimple()) return 0;
+
+  // If the RHS is an alloca with a single use, zapify the store, making the
+  // alloca dead.
+  if (Ptr->hasOneUse()) {
+    if (isa<AllocaInst>(Ptr)) 
+      return EraseInstFromFunction(SI);
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+      if (isa<AllocaInst>(GEP->getOperand(0))) {
+        if (GEP->getOperand(0)->hasOneUse())
+          return EraseInstFromFunction(SI);
+      }
+    }
+  }
+
+  // Do really simple DSE, to catch cases where there are several consecutive
+  // stores to the same location, separated by a few arithmetic operations. This
+  // situation often occurs with bitfield accesses.
+  BasicBlock::iterator BBI = &SI;
+  for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
+       --ScanInsts) {
+    --BBI;
+    // Don't count debug info directives, lest they affect codegen,
+    // and we skip pointer-to-pointer bitcasts, which are NOPs.
+    if (isa<DbgInfoIntrinsic>(BBI) ||
+        (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+      ScanInsts++;
+      continue;
+    }    
+    
+    if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
+      // Prev store isn't volatile, and stores to the same location?
+      if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1),
+                                                        SI.getOperand(1))) {
+        ++NumDeadStore;
+        ++BBI;
+        EraseInstFromFunction(*PrevSI);
+        continue;
+      }
+      break;
+    }
+    
+    // If this is a load, we have to stop.  However, if the loaded value is from
+    // the pointer we're loading and is producing the pointer we're storing,
+    // then *this* store is dead (X = load P; store X -> P).
+    if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+      if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) &&
+          LI->isSimple())
+        return EraseInstFromFunction(SI);
+      
+      // Otherwise, this is a load from some other location.  Stores before it
+      // may not be dead.
+      break;
+    }
+    
+    // Don't skip over loads or things that can modify memory.
+    if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
+      break;
+  }
+
+  // store X, null    -> turns into 'unreachable' in SimplifyCFG
+  if (isa<ConstantPointerNull>(Ptr) && SI.getPointerAddressSpace() == 0) {
+    if (!isa<UndefValue>(Val)) {
+      SI.setOperand(0, UndefValue::get(Val->getType()));
+      if (Instruction *U = dyn_cast<Instruction>(Val))
+        Worklist.Add(U);  // Dropped a use.
+    }
+    return 0;  // Do not modify these!
+  }
+
+  // store undef, Ptr -> noop
+  if (isa<UndefValue>(Val))
+    return EraseInstFromFunction(SI);
+
+  // If the pointer destination is a cast, see if we can fold the cast into the
+  // source instead.
+  if (isa<CastInst>(Ptr))
+    if (Instruction *Res = InstCombineStoreToCast(*this, SI))
+      return Res;
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+    if (CE->isCast())
+      if (Instruction *Res = InstCombineStoreToCast(*this, SI))
+        return Res;
+
+  
+  // If this store is the last instruction in the basic block (possibly
+  // excepting debug info instructions), and if the block ends with an
+  // unconditional branch, try to move it to the successor block.
+  BBI = &SI; 
+  do {
+    ++BBI;
+  } while (isa<DbgInfoIntrinsic>(BBI) ||
+           (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy()));
+  if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
+    if (BI->isUnconditional())
+      if (SimplifyStoreAtEndOfBlock(SI))
+        return 0;  // xform done!
+  
+  return 0;
+}
+
+/// SimplifyStoreAtEndOfBlock - Turn things like:
+///   if () { *P = v1; } else { *P = v2 }
+/// into a phi node with a store in the successor.
+///
+/// Simplify things like:
+///   *P = v1; if () { *P = v2; }
+/// into a phi node with a store in the successor.
+///
+bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
+  BasicBlock *StoreBB = SI.getParent();
+  
+  // Check to see if the successor block has exactly two incoming edges.  If
+  // so, see if the other predecessor contains a store to the same location.
+  // if so, insert a PHI node (if needed) and move the stores down.
+  BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
+  
+  // Determine whether Dest has exactly two predecessors and, if so, compute
+  // the other predecessor.
+  pred_iterator PI = pred_begin(DestBB);
+  BasicBlock *P = *PI;
+  BasicBlock *OtherBB = 0;
+
+  if (P != StoreBB)
+    OtherBB = P;
+
+  if (++PI == pred_end(DestBB))
+    return false;
+  
+  P = *PI;
+  if (P != StoreBB) {
+    if (OtherBB)
+      return false;
+    OtherBB = P;
+  }
+  if (++PI != pred_end(DestBB))
+    return false;
+
+  // Bail out if all the relevant blocks aren't distinct (this can happen,
+  // for example, if SI is in an infinite loop)
+  if (StoreBB == DestBB || OtherBB == DestBB)
+    return false;
+
+  // Verify that the other block ends in a branch and is not otherwise empty.
+  BasicBlock::iterator BBI = OtherBB->getTerminator();
+  BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
+  if (!OtherBr || BBI == OtherBB->begin())
+    return false;
+  
+  // If the other block ends in an unconditional branch, check for the 'if then
+  // else' case.  there is an instruction before the branch.
+  StoreInst *OtherStore = 0;
+  if (OtherBr->isUnconditional()) {
+    --BBI;
+    // Skip over debugging info.
+    while (isa<DbgInfoIntrinsic>(BBI) ||
+           (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+      if (BBI==OtherBB->begin())
+        return false;
+      --BBI;
+    }
+    // If this isn't a store, isn't a store to the same location, or is not the
+    // right kind of store, bail out.
+    OtherStore = dyn_cast<StoreInst>(BBI);
+    if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
+        !SI.isSameOperationAs(OtherStore))
+      return false;
+  } else {
+    // Otherwise, the other block ended with a conditional branch. If one of the
+    // destinations is StoreBB, then we have the if/then case.
+    if (OtherBr->getSuccessor(0) != StoreBB && 
+        OtherBr->getSuccessor(1) != StoreBB)
+      return false;
+    
+    // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
+    // if/then triangle.  See if there is a store to the same ptr as SI that
+    // lives in OtherBB.
+    for (;; --BBI) {
+      // Check to see if we find the matching store.
+      if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
+        if (OtherStore->getOperand(1) != SI.getOperand(1) ||
+            !SI.isSameOperationAs(OtherStore))
+          return false;
+        break;
+      }
+      // If we find something that may be using or overwriting the stored
+      // value, or if we run out of instructions, we can't do the xform.
+      if (BBI->mayReadFromMemory() || BBI->mayWriteToMemory() ||
+          BBI == OtherBB->begin())
+        return false;
+    }
+    
+    // In order to eliminate the store in OtherBr, we have to
+    // make sure nothing reads or overwrites the stored value in
+    // StoreBB.
+    for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
+      // FIXME: This should really be AA driven.
+      if (I->mayReadFromMemory() || I->mayWriteToMemory())
+        return false;
+    }
+  }
+  
+  // Insert a PHI node now if we need it.
+  Value *MergedVal = OtherStore->getOperand(0);
+  if (MergedVal != SI.getOperand(0)) {
+    PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
+    PN->addIncoming(SI.getOperand(0), SI.getParent());
+    PN->addIncoming(OtherStore->getOperand(0), OtherBB);
+    MergedVal = InsertNewInstBefore(PN, DestBB->front());
+  }
+  
+  // Advance to a place where it is safe to insert the new store and
+  // insert it.
+  BBI = DestBB->getFirstInsertionPt();
+  StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1),
+                                   SI.isVolatile(),
+                                   SI.getAlignment(),
+                                   SI.getOrdering(),
+                                   SI.getSynchScope());
+  InsertNewInstBefore(NewSI, *BBI);
+  NewSI->setDebugLoc(OtherStore->getDebugLoc()); 
+
+  // If the two stores had the same TBAA tag, preserve it.
+  if (MDNode *TBAATag = SI.getMetadata(LLVMContext::MD_tbaa))
+    if ((TBAATag = MDNode::getMostGenericTBAA(TBAATag,
+                               OtherStore->getMetadata(LLVMContext::MD_tbaa))))
+      NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+  
+  // Nuke the old stores.
+  EraseInstFromFunction(SI);
+  EraseInstFromFunction(*OtherStore);
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
new file mode 100644
index 000000000000..173f2bf63304
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -0,0 +1,1117 @@
+//===- InstCombineMulDivRem.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for mul, fmul, sdiv, udiv, fdiv,
+// srem, urem, frem.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+
+/// simplifyValueKnownNonZero - The specific integer value is used in a context
+/// where it is known to be non-zero.  If this allows us to simplify the
+/// computation, do so and return the new operand, otherwise return null.
+static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
+  // If V has multiple uses, then we would have to do more analysis to determine
+  // if this is safe.  For example, the use could be in dynamically unreached
+  // code.
+  if (!V->hasOneUse()) return 0;
+  
+  bool MadeChange = false;
+
+  // ((1 << A) >>u B) --> (1 << (A-B))
+  // Because V cannot be zero, we know that B is less than A.
+  Value *A = 0, *B = 0, *PowerOf2 = 0;
+  if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))),
+                      m_Value(B))) &&
+      // The "1" can be any value known to be a power of 2.
+      isKnownToBeAPowerOfTwo(PowerOf2)) {
+    A = IC.Builder->CreateSub(A, B);
+    return IC.Builder->CreateShl(PowerOf2, A);
+  }
+  
+  // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
+  // inexact.  Similarly for <<.
+  if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
+    if (I->isLogicalShift() && isKnownToBeAPowerOfTwo(I->getOperand(0))) {
+      // We know that this is an exact/nuw shift and that the input is a
+      // non-zero context as well.
+      if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC)) {
+        I->setOperand(0, V2);
+        MadeChange = true;
+      }
+      
+      if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
+        I->setIsExact();
+        MadeChange = true;
+      }
+      
+      if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
+        I->setHasNoUnsignedWrap();
+        MadeChange = true;
+      }
+    }
+
+  // TODO: Lots more we could do here:
+  //    If V is a phi node, we can call this on each of its operands.
+  //    "select cond, X, 0" can simplify to "X".
+  
+  return MadeChange ? V : 0;
+}
+
+
+/// MultiplyOverflows - True if the multiply can not be expressed in an int
+/// this size.
+static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) {
+  uint32_t W = C1->getBitWidth();
+  APInt LHSExt = C1->getValue(), RHSExt = C2->getValue();
+  if (sign) {
+    LHSExt = LHSExt.sext(W * 2);
+    RHSExt = RHSExt.sext(W * 2);
+  } else {
+    LHSExt = LHSExt.zext(W * 2);
+    RHSExt = RHSExt.zext(W * 2);
+  }
+  
+  APInt MulExt = LHSExt * RHSExt;
+  
+  if (!sign)
+    return MulExt.ugt(APInt::getLowBitsSet(W * 2, W));
+  
+  APInt Min = APInt::getSignedMinValue(W).sext(W * 2);
+  APInt Max = APInt::getSignedMaxValue(W).sext(W * 2);
+  return MulExt.slt(Min) || MulExt.sgt(Max);
+}
+
+Instruction *InstCombiner::visitMul(BinaryOperator &I) {
+  bool Changed = SimplifyAssociativeOrCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyMulInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  if (Value *V = SimplifyUsingDistributiveLaws(I))
+    return ReplaceInstUsesWith(I, V);
+
+  if (match(Op1, m_AllOnes()))  // X * -1 == 0 - X
+    return BinaryOperator::CreateNeg(Op0, I.getName());
+  
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+    
+    // ((X << C1)*C2) == (X * (C2 << C1))
+    if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0))
+      if (SI->getOpcode() == Instruction::Shl)
+        if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1)))
+          return BinaryOperator::CreateMul(SI->getOperand(0),
+                                           ConstantExpr::getShl(CI, ShOp));
+    
+    const APInt &Val = CI->getValue();
+    if (Val.isPowerOf2()) {          // Replace X*(2^C) with X << C
+      Constant *NewCst = ConstantInt::get(Op0->getType(), Val.logBase2());
+      BinaryOperator *Shl = BinaryOperator::CreateShl(Op0, NewCst);
+      if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap();
+      if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap();
+      return Shl;
+    }
+    
+    // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
+    { Value *X; ConstantInt *C1;
+      if (Op0->hasOneUse() &&
+          match(Op0, m_Add(m_Value(X), m_ConstantInt(C1)))) {
+        Value *Add = Builder->CreateMul(X, CI);
+        return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, CI));
+      }
+    }
+
+    // (Y - X) * (-(2**n)) -> (X - Y) * (2**n), for positive nonzero n
+    // (Y + const) * (-(2**n)) -> (-constY) * (2**n), for positive nonzero n
+    // The "* (2**n)" thus becomes a potential shifting opportunity.
+    {
+      const APInt &   Val = CI->getValue();
+      const APInt &PosVal = Val.abs();
+      if (Val.isNegative() && PosVal.isPowerOf2()) {
+        Value *X = 0, *Y = 0;
+        if (Op0->hasOneUse()) {
+          ConstantInt *C1;
+          Value *Sub = 0;
+          if (match(Op0, m_Sub(m_Value(Y), m_Value(X))))
+            Sub = Builder->CreateSub(X, Y, "suba");
+          else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1))))
+            Sub = Builder->CreateSub(Builder->CreateNeg(C1), Y, "subc");
+          if (Sub)
+            return
+              BinaryOperator::CreateMul(Sub,
+                                        ConstantInt::get(Y->getType(), PosVal));
+        }
+      }
+    }
+  }
+  
+  // Simplify mul instructions with a constant RHS.
+  if (isa<Constant>(Op1)) {    
+    // Try to fold constant mul into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  if (Value *Op0v = dyn_castNegVal(Op0))     // -X * -Y = X*Y
+    if (Value *Op1v = dyn_castNegVal(Op1))
+      return BinaryOperator::CreateMul(Op0v, Op1v);
+
+  // (X / Y) *  Y = X - (X % Y)
+  // (X / Y) * -Y = (X % Y) - X
+  {
+    Value *Op1C = Op1;
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0);
+    if (!BO ||
+        (BO->getOpcode() != Instruction::UDiv && 
+         BO->getOpcode() != Instruction::SDiv)) {
+      Op1C = Op0;
+      BO = dyn_cast<BinaryOperator>(Op1);
+    }
+    Value *Neg = dyn_castNegVal(Op1C);
+    if (BO && BO->hasOneUse() &&
+        (BO->getOperand(1) == Op1C || BO->getOperand(1) == Neg) &&
+        (BO->getOpcode() == Instruction::UDiv ||
+         BO->getOpcode() == Instruction::SDiv)) {
+      Value *Op0BO = BO->getOperand(0), *Op1BO = BO->getOperand(1);
+
+      // If the division is exact, X % Y is zero, so we end up with X or -X.
+      if (PossiblyExactOperator *SDiv = dyn_cast<PossiblyExactOperator>(BO))
+        if (SDiv->isExact()) {
+          if (Op1BO == Op1C)
+            return ReplaceInstUsesWith(I, Op0BO);
+          return BinaryOperator::CreateNeg(Op0BO);
+        }
+
+      Value *Rem;
+      if (BO->getOpcode() == Instruction::UDiv)
+        Rem = Builder->CreateURem(Op0BO, Op1BO);
+      else
+        Rem = Builder->CreateSRem(Op0BO, Op1BO);
+      Rem->takeName(BO);
+
+      if (Op1BO == Op1C)
+        return BinaryOperator::CreateSub(Op0BO, Rem);
+      return BinaryOperator::CreateSub(Rem, Op0BO);
+    }
+  }
+
+  /// i1 mul -> i1 and.
+  if (I.getType()->isIntegerTy(1))
+    return BinaryOperator::CreateAnd(Op0, Op1);
+
+  // X*(1 << Y) --> X << Y
+  // (1 << Y)*X --> X << Y
+  {
+    Value *Y;
+    if (match(Op0, m_Shl(m_One(), m_Value(Y))))
+      return BinaryOperator::CreateShl(Op1, Y);
+    if (match(Op1, m_Shl(m_One(), m_Value(Y))))
+      return BinaryOperator::CreateShl(Op0, Y);
+  }
+  
+  // If one of the operands of the multiply is a cast from a boolean value, then
+  // we know the bool is either zero or one, so this is a 'masking' multiply.
+  //   X * Y (where Y is 0 or 1) -> X & (0-Y)
+  if (!I.getType()->isVectorTy()) {
+    // -2 is "-1 << 1" so it is all bits set except the low one.
+    APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true);
+    
+    Value *BoolCast = 0, *OtherOp = 0;
+    if (MaskedValueIsZero(Op0, Negative2))
+      BoolCast = Op0, OtherOp = Op1;
+    else if (MaskedValueIsZero(Op1, Negative2))
+      BoolCast = Op1, OtherOp = Op0;
+
+    if (BoolCast) {
+      Value *V = Builder->CreateSub(Constant::getNullValue(I.getType()),
+                                    BoolCast);
+      return BinaryOperator::CreateAnd(V, OtherOp);
+    }
+  }
+
+  return Changed ? &I : 0;
+}
+
+//
+// Detect pattern:
+//
+// log2(Y*0.5)
+//
+// And check for corresponding fast math flags
+//
+
+static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
+
+   if (!Op->hasOneUse())
+     return;
+
+   IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
+   if (!II)
+     return;
+   if (II->getIntrinsicID() != Intrinsic::log2 || !II->hasUnsafeAlgebra())
+     return;
+   Log2 = II;
+
+   Value *OpLog2Of = II->getArgOperand(0);
+   if (!OpLog2Of->hasOneUse())
+     return;
+
+   Instruction *I = dyn_cast<Instruction>(OpLog2Of);
+   if (!I)
+     return;
+   if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+     return;
+              
+   ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(0));
+   if (CFP && CFP->isExactlyValue(0.5)) {
+     Y = I->getOperand(1);
+     return;
+   }
+   CFP = dyn_cast<ConstantFP>(I->getOperand(1));
+   if (CFP && CFP->isExactlyValue(0.5))
+     Y = I->getOperand(0);
+} 
+
+/// Helper function of InstCombiner::visitFMul(BinaryOperator(). It returns
+/// true iff the given value is FMul or FDiv with one and only one operand
+/// being a normal constant (i.e. not Zero/NaN/Infinity).
+static bool isFMulOrFDivWithConstant(Value *V) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I || (I->getOpcode() != Instruction::FMul && 
+             I->getOpcode() != Instruction::FDiv))
+    return false;
+
+  ConstantFP *C0 = dyn_cast<ConstantFP>(I->getOperand(0));
+  ConstantFP *C1 = dyn_cast<ConstantFP>(I->getOperand(1));
+
+  if (C0 && C1)
+    return false;
+
+  return (C0 && C0->getValueAPF().isNormal()) ||
+         (C1 && C1->getValueAPF().isNormal());
+}
+
+static bool isNormalFp(const ConstantFP *C) {
+  const APFloat &Flt = C->getValueAPF();
+  return Flt.isNormal() && !Flt.isDenormal();
+}
+
+/// foldFMulConst() is a helper routine of InstCombiner::visitFMul().
+/// The input \p FMulOrDiv is a FMul/FDiv with one and only one operand
+/// being a constant (i.e. isFMulOrFDivWithConstant(FMulOrDiv) == true).
+/// This function is to simplify "FMulOrDiv * C" and returns the 
+/// resulting expression. Note that this function could return NULL in
+/// case the constants cannot be folded into a normal floating-point.
+/// 
+Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C,
+                                   Instruction *InsertBefore) {
+  assert(isFMulOrFDivWithConstant(FMulOrDiv) && "V is invalid");
+
+  Value *Opnd0 = FMulOrDiv->getOperand(0);
+  Value *Opnd1 = FMulOrDiv->getOperand(1);
+
+  ConstantFP *C0 = dyn_cast<ConstantFP>(Opnd0);
+  ConstantFP *C1 = dyn_cast<ConstantFP>(Opnd1);
+
+  BinaryOperator *R = 0;
+
+  // (X * C0) * C => X * (C0*C)
+  if (FMulOrDiv->getOpcode() == Instruction::FMul) {
+    Constant *F = ConstantExpr::getFMul(C1 ? C1 : C0, C);
+    if (isNormalFp(cast<ConstantFP>(F)))
+      R = BinaryOperator::CreateFMul(C1 ? Opnd0 : Opnd1, F);
+  } else {
+    if (C0) {
+      // (C0 / X) * C => (C0 * C) / X
+      ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C));
+      if (isNormalFp(F))
+        R = BinaryOperator::CreateFDiv(F, Opnd1);
+    } else {
+      // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal
+      ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFDiv(C, C1));
+      if (isNormalFp(F)) {
+        R = BinaryOperator::CreateFMul(Opnd0, F);
+      } else {
+        // (X / C1) * C => X / (C1/C) 
+        Constant *F = ConstantExpr::getFDiv(C1, C);
+        if (isNormalFp(cast<ConstantFP>(F)))
+          R = BinaryOperator::CreateFDiv(Opnd0, F);
+      }
+    }
+  }
+
+  if (R) {
+    R->setHasUnsafeAlgebra(true);
+    InsertNewInstWith(R, *InsertBefore);
+  }
+
+  return R;
+}
+
+Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
+  bool Changed = SimplifyAssociativeOrCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (isa<Constant>(Op0))
+    std::swap(Op0, Op1);
+
+  if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), TD))
+    return ReplaceInstUsesWith(I, V);
+
+  bool AllowReassociate = I.hasUnsafeAlgebra();
+
+  // Simplify mul instructions with a constant RHS.
+  if (isa<Constant>(Op1)) {
+    // Try to fold constant mul into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+
+    ConstantFP *C = dyn_cast<ConstantFP>(Op1);
+    if (C && AllowReassociate && C->getValueAPF().isNormal()) {
+      // Let MDC denote an expression in one of these forms:
+      // X * C, C/X, X/C, where C is a constant.
+      //
+      // Try to simplify "MDC * Constant"
+      if (isFMulOrFDivWithConstant(Op0)) {
+        Value *V = foldFMulConst(cast<Instruction>(Op0), C, &I);
+        if (V)
+          return ReplaceInstUsesWith(I, V);
+      }
+
+      // (MDC +/- C1) * C => (MDC * C) +/- (C1 * C)
+      Instruction *FAddSub = dyn_cast<Instruction>(Op0);
+      if (FAddSub &&
+          (FAddSub->getOpcode() == Instruction::FAdd ||
+           FAddSub->getOpcode() == Instruction::FSub)) {
+        Value *Opnd0 = FAddSub->getOperand(0);
+        Value *Opnd1 = FAddSub->getOperand(1);
+        ConstantFP *C0 = dyn_cast<ConstantFP>(Opnd0);
+        ConstantFP *C1 = dyn_cast<ConstantFP>(Opnd1);
+        bool Swap = false;
+        if (C0) {
+          std::swap(C0, C1);
+          std::swap(Opnd0, Opnd1);
+          Swap = true; 
+        }
+
+        if (C1 && C1->getValueAPF().isNormal() &&
+            isFMulOrFDivWithConstant(Opnd0)) {
+          Value *M1 = ConstantExpr::getFMul(C1, C);
+          Value *M0 = isNormalFp(cast<ConstantFP>(M1)) ? 
+                      foldFMulConst(cast<Instruction>(Opnd0), C, &I) :
+                      0;
+          if (M0 && M1) {
+            if (Swap && FAddSub->getOpcode() == Instruction::FSub)
+              std::swap(M0, M1);
+
+            Value *R = (FAddSub->getOpcode() == Instruction::FAdd) ?
+                        BinaryOperator::CreateFAdd(M0, M1) :
+                        BinaryOperator::CreateFSub(M0, M1);
+            Instruction *RI = cast<Instruction>(R);
+            RI->copyFastMathFlags(&I);
+            return RI;
+          }
+        }
+      }
+    }
+  }
+
+
+  // Under unsafe algebra do:
+  // X * log2(0.5*Y) = X*log2(Y) - X
+  if (I.hasUnsafeAlgebra()) {
+    Value *OpX = NULL;
+    Value *OpY = NULL;
+    IntrinsicInst *Log2;
+    detectLog2OfHalf(Op0, OpY, Log2);
+    if (OpY) {
+      OpX = Op1;
+    } else {
+      detectLog2OfHalf(Op1, OpY, Log2);
+      if (OpY) {
+        OpX = Op0;
+      }
+    }
+    // if pattern detected emit alternate sequence
+    if (OpX && OpY) {
+      Log2->setArgOperand(0, OpY);
+      Value *FMulVal = Builder->CreateFMul(OpX, Log2);
+      Instruction *FMul = cast<Instruction>(FMulVal);
+      FMul->copyFastMathFlags(Log2);
+      Instruction *FSub = BinaryOperator::CreateFSub(FMulVal, OpX);
+      FSub->copyFastMathFlags(Log2);
+      return FSub;
+    }
+  }
+
+  // Handle symmetric situation in a 2-iteration loop
+  Value *Opnd0 = Op0;
+  Value *Opnd1 = Op1;
+  for (int i = 0; i < 2; i++) {
+    bool IgnoreZeroSign = I.hasNoSignedZeros();
+    if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) {
+      Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign);
+      Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign);
+
+      // -X * -Y => X*Y
+      if (N1)
+        return BinaryOperator::CreateFMul(N0, N1);
+
+      if (Opnd0->hasOneUse()) {
+        // -X * Y => -(X*Y) (Promote negation as high as possible)
+        Value *T = Builder->CreateFMul(N0, Opnd1);
+        cast<Instruction>(T)->setDebugLoc(I.getDebugLoc());
+        Instruction *Neg = BinaryOperator::CreateFNeg(T);
+        if (I.getFastMathFlags().any()) {
+          cast<Instruction>(T)->copyFastMathFlags(&I);
+          Neg->copyFastMathFlags(&I);
+        }
+        return Neg;
+      }
+    }
+
+    // (X*Y) * X => (X*X) * Y where Y != X
+    //  The purpose is two-fold: 
+    //   1) to form a power expression (of X).
+    //   2) potentially shorten the critical path: After transformation, the
+    //  latency of the instruction Y is amortized by the expression of X*X,
+    //  and therefore Y is in a "less critical" position compared to what it
+    //  was before the transformation.
+    //
+    if (AllowReassociate) {
+      Value *Opnd0_0, *Opnd0_1;
+      if (Opnd0->hasOneUse() &&
+          match(Opnd0, m_FMul(m_Value(Opnd0_0), m_Value(Opnd0_1)))) {
+        Value *Y = 0;
+        if (Opnd0_0 == Opnd1 && Opnd0_1 != Opnd1)
+          Y = Opnd0_1;
+        else if (Opnd0_1 == Opnd1 && Opnd0_0 != Opnd1)
+          Y = Opnd0_0;
+
+        if (Y) {
+          Instruction *T = cast<Instruction>(Builder->CreateFMul(Opnd1, Opnd1));
+          T->copyFastMathFlags(&I);
+          T->setDebugLoc(I.getDebugLoc());
+
+          Instruction *R = BinaryOperator::CreateFMul(T, Y);
+          R->copyFastMathFlags(&I);
+          return R;
+        }
+      }
+    }
+
+    if (!isa<Constant>(Op1))
+      std::swap(Opnd0, Opnd1);
+    else
+      break;
+  }
+
+  return Changed ? &I : 0;
+}
+
+/// SimplifyDivRemOfSelect - Try to fold a divide or remainder of a select
+/// instruction.
+bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
+  SelectInst *SI = cast<SelectInst>(I.getOperand(1));
+  
+  // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
+  int NonNullOperand = -1;
+  if (Constant *ST = dyn_cast<Constant>(SI->getOperand(1)))
+    if (ST->isNullValue())
+      NonNullOperand = 2;
+  // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
+  if (Constant *ST = dyn_cast<Constant>(SI->getOperand(2)))
+    if (ST->isNullValue())
+      NonNullOperand = 1;
+  
+  if (NonNullOperand == -1)
+    return false;
+  
+  Value *SelectCond = SI->getOperand(0);
+  
+  // Change the div/rem to use 'Y' instead of the select.
+  I.setOperand(1, SI->getOperand(NonNullOperand));
+  
+  // Okay, we know we replace the operand of the div/rem with 'Y' with no
+  // problem.  However, the select, or the condition of the select may have
+  // multiple uses.  Based on our knowledge that the operand must be non-zero,
+  // propagate the known value for the select into other uses of it, and
+  // propagate a known value of the condition into its other users.
+  
+  // If the select and condition only have a single use, don't bother with this,
+  // early exit.
+  if (SI->use_empty() && SelectCond->hasOneUse())
+    return true;
+  
+  // Scan the current block backward, looking for other uses of SI.
+  BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin();
+  
+  while (BBI != BBFront) {
+    --BBI;
+    // If we found a call to a function, we can't assume it will return, so
+    // information from below it cannot be propagated above it.
+    if (isa<CallInst>(BBI) && !isa<IntrinsicInst>(BBI))
+      break;
+    
+    // Replace uses of the select or its condition with the known values.
+    for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
+         I != E; ++I) {
+      if (*I == SI) {
+        *I = SI->getOperand(NonNullOperand);
+        Worklist.Add(BBI);
+      } else if (*I == SelectCond) {
+        *I = NonNullOperand == 1 ? ConstantInt::getTrue(BBI->getContext()) :
+                                   ConstantInt::getFalse(BBI->getContext());
+        Worklist.Add(BBI);
+      }
+    }
+    
+    // If we past the instruction, quit looking for it.
+    if (&*BBI == SI)
+      SI = 0;
+    if (&*BBI == SelectCond)
+      SelectCond = 0;
+    
+    // If we ran out of things to eliminate, break out of the loop.
+    if (SelectCond == 0 && SI == 0)
+      break;
+    
+  }
+  return true;
+}
+
+
+/// This function implements the transforms common to both integer division
+/// instructions (udiv and sdiv). It is called by the visitors to those integer
+/// division instructions.
+/// @brief Common integer divide transforms
+Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // The RHS is known non-zero.
+  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) {
+    I.setOperand(1, V);
+    return &I;
+  }
+  
+  // Handle cases involving: [su]div X, (select Cond, Y, Z)
+  // This does not apply for fdiv.
+  if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+    return &I;
+
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // (X / C1) / C2  -> X / (C1*C2)
+    if (Instruction *LHS = dyn_cast<Instruction>(Op0))
+      if (Instruction::BinaryOps(LHS->getOpcode()) == I.getOpcode())
+        if (ConstantInt *LHSRHS = dyn_cast<ConstantInt>(LHS->getOperand(1))) {
+          if (MultiplyOverflows(RHS, LHSRHS,
+                                I.getOpcode()==Instruction::SDiv))
+            return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+          return BinaryOperator::Create(I.getOpcode(), LHS->getOperand(0),
+                                        ConstantExpr::getMul(RHS, LHSRHS));
+        }
+
+    if (!RHS->isZero()) { // avoid X udiv 0
+      if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+        if (Instruction *R = FoldOpIntoSelect(I, SI))
+          return R;
+      if (isa<PHINode>(Op0))
+        if (Instruction *NV = FoldOpIntoPhi(I))
+          return NV;
+    }
+  }
+
+  // See if we can fold away this div instruction.
+  if (SimplifyDemandedInstructionBits(I))
+    return &I;
+
+  // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
+  Value *X = 0, *Z = 0;
+  if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) { // (X - Z) / Y; Y = Op1
+    bool isSigned = I.getOpcode() == Instruction::SDiv;
+    if ((isSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
+        (!isSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
+      return BinaryOperator::Create(I.getOpcode(), X, Op1);
+  }
+
+  return 0;
+}
+
+/// dyn_castZExtVal - Checks if V is a zext or constant that can
+/// be truncated to Ty without losing bits.
+static Value *dyn_castZExtVal(Value *V, Type *Ty) {
+  if (ZExtInst *Z = dyn_cast<ZExtInst>(V)) {
+    if (Z->getSrcTy() == Ty)
+      return Z->getOperand(0);
+  } else if (ConstantInt *C = dyn_cast<ConstantInt>(V)) {
+    if (C->getValue().getActiveBits() <= cast<IntegerType>(Ty)->getBitWidth())
+      return ConstantExpr::getTrunc(C, Ty);
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyUDivInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // Handle the integer div common cases
+  if (Instruction *Common = commonIDivTransforms(I))
+    return Common;
+  
+  { 
+    // X udiv 2^C -> X >> C
+    // Check to see if this is an unsigned division with an exact power of 2,
+    // if so, convert to a right shift.
+    const APInt *C;
+    if (match(Op1, m_Power2(C))) {
+      BinaryOperator *LShr =
+      BinaryOperator::CreateLShr(Op0, 
+                                 ConstantInt::get(Op0->getType(), 
+                                                  C->logBase2()));
+      if (I.isExact()) LShr->setIsExact();
+      return LShr;
+    }
+  }
+
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
+    // X udiv C, where C >= signbit
+    if (C->getValue().isNegative()) {
+      Value *IC = Builder->CreateICmpULT(Op0, C);
+      return SelectInst::Create(IC, Constant::getNullValue(I.getType()),
+                                ConstantInt::get(I.getType(), 1));
+    }
+  }
+
+  // (x lshr C1) udiv C2 --> x udiv (C2 << C1)
+  if (ConstantInt *C2 = dyn_cast<ConstantInt>(Op1)) {
+    Value *X;
+    ConstantInt *C1;
+    if (match(Op0, m_LShr(m_Value(X), m_ConstantInt(C1)))) {
+      APInt NC = C2->getValue().shl(C1->getLimitedValue(C1->getBitWidth()-1));
+      return BinaryOperator::CreateUDiv(X, Builder->getInt(NC));
+    }
+  }
+
+  // X udiv (C1 << N), where C1 is "1<<C2"  -->  X >> (N+C2)
+  { const APInt *CI; Value *N;
+    if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) ||
+        match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) {
+      if (*CI != 1)
+        N = Builder->CreateAdd(N,
+                               ConstantInt::get(N->getType(), CI->logBase2()));
+      if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1))
+        N = Builder->CreateZExt(N, Z->getDestTy());
+      if (I.isExact())
+        return BinaryOperator::CreateExactLShr(Op0, N);
+      return BinaryOperator::CreateLShr(Op0, N);
+    }
+  }
+  
+  // udiv X, (Select Cond, C1, C2) --> Select Cond, (shr X, C1), (shr X, C2)
+  // where C1&C2 are powers of two.
+  { Value *Cond; const APInt *C1, *C2;
+    if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) {
+      // Construct the "on true" case of the select
+      Value *TSI = Builder->CreateLShr(Op0, C1->logBase2(), Op1->getName()+".t",
+                                       I.isExact());
+  
+      // Construct the "on false" case of the select
+      Value *FSI = Builder->CreateLShr(Op0, C2->logBase2(), Op1->getName()+".f",
+                                       I.isExact());
+      
+      // construct the select instruction and return it.
+      return SelectInst::Create(Cond, TSI, FSI);
+    }
+  }
+
+  // (zext A) udiv (zext B) --> zext (A udiv B)
+  if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
+    if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
+      return new ZExtInst(Builder->CreateUDiv(ZOp0->getOperand(0), ZOp1, "div",
+                                              I.isExact()),
+                          I.getType());
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifySDivInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // Handle the integer div common cases
+  if (Instruction *Common = commonIDivTransforms(I))
+    return Common;
+
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // sdiv X, -1 == -X
+    if (RHS->isAllOnesValue())
+      return BinaryOperator::CreateNeg(Op0);
+
+    // sdiv X, C  -->  ashr exact X, log2(C)
+    if (I.isExact() && RHS->getValue().isNonNegative() &&
+        RHS->getValue().isPowerOf2()) {
+      Value *ShAmt = llvm::ConstantInt::get(RHS->getType(),
+                                            RHS->getValue().exactLogBase2());
+      return BinaryOperator::CreateExactAShr(Op0, ShAmt, I.getName());
+    }
+
+    // -X/C  -->  X/-C  provided the negation doesn't overflow.
+    if (SubOperator *Sub = dyn_cast<SubOperator>(Op0))
+      if (match(Sub->getOperand(0), m_Zero()) && Sub->hasNoSignedWrap())
+        return BinaryOperator::CreateSDiv(Sub->getOperand(1),
+                                          ConstantExpr::getNeg(RHS));
+  }
+
+  // If the sign bits of both operands are zero (i.e. we can prove they are
+  // unsigned inputs), turn this into a udiv.
+  if (I.getType()->isIntegerTy()) {
+    APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
+    if (MaskedValueIsZero(Op0, Mask)) {
+      if (MaskedValueIsZero(Op1, Mask)) {
+        // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
+        return BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+      }
+      
+      if (match(Op1, m_Shl(m_Power2(), m_Value()))) {
+        // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
+        // Safe because the only negative value (1 << Y) can take on is
+        // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
+        // the sign bit set.
+        return BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+      }
+    }
+  }
+  
+  return 0;
+}
+
+/// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C not a special
+/// FP value and:
+///    1) 1/C is exact, or 
+///    2) reciprocal is allowed.
+/// If the convertion was successful, the simplified expression "X * 1/C" is
+/// returned; otherwise, NULL is returned.
+///
+static Instruction *CvtFDivConstToReciprocal(Value *Dividend,
+                                             ConstantFP *Divisor,
+                                             bool AllowReciprocal) {
+  const APFloat &FpVal = Divisor->getValueAPF();
+  APFloat Reciprocal(FpVal.getSemantics());
+  bool Cvt = FpVal.getExactInverse(&Reciprocal);
+    
+  if (!Cvt && AllowReciprocal && FpVal.isNormal()) {
+    Reciprocal = APFloat(FpVal.getSemantics(), 1.0f);
+    (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven);
+    Cvt = !Reciprocal.isDenormal();
+  }
+
+  if (!Cvt)
+    return 0;
+
+  ConstantFP *R;
+  R = ConstantFP::get(Dividend->getType()->getContext(), Reciprocal);
+  return BinaryOperator::CreateFMul(Dividend, R);
+}
+
+Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyFDivInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  bool AllowReassociate = I.hasUnsafeAlgebra();
+  bool AllowReciprocal = I.hasAllowReciprocal();
+
+  if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+    if (AllowReassociate) {
+      ConstantFP *C1 = 0;
+      ConstantFP *C2 = Op1C;
+      Value *X;
+      Instruction *Res = 0;
+
+      if (match(Op0, m_FMul(m_Value(X), m_ConstantFP(C1)))) {
+        // (X*C1)/C2 => X * (C1/C2)
+        //
+        Constant *C = ConstantExpr::getFDiv(C1, C2);
+        const APFloat &F = cast<ConstantFP>(C)->getValueAPF();
+        if (F.isNormal() && !F.isDenormal())
+          Res = BinaryOperator::CreateFMul(X, C);
+      } else if (match(Op0, m_FDiv(m_Value(X), m_ConstantFP(C1)))) {
+        // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed]
+        //
+        Constant *C = ConstantExpr::getFMul(C1, C2);
+        const APFloat &F = cast<ConstantFP>(C)->getValueAPF();
+        if (F.isNormal() && !F.isDenormal()) {
+          Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C), 
+                                         AllowReciprocal);
+          if (!Res)
+            Res = BinaryOperator::CreateFDiv(X, C); 
+        }
+      }
+
+      if (Res) {
+        Res->setFastMathFlags(I.getFastMathFlags());
+        return Res;
+      }
+    }
+
+    // X / C => X * 1/C
+    if (Instruction *T = CvtFDivConstToReciprocal(Op0, Op1C, AllowReciprocal))
+      return T;
+
+    return 0;
+  }
+
+  if (AllowReassociate && isa<ConstantFP>(Op0)) {
+    ConstantFP *C1 = cast<ConstantFP>(Op0), *C2;
+    Constant *Fold = 0;
+    Value *X;
+    bool CreateDiv = true;
+
+    // C1 / (X*C2) => (C1/C2) / X
+    if (match(Op1, m_FMul(m_Value(X), m_ConstantFP(C2))))
+      Fold = ConstantExpr::getFDiv(C1, C2);
+    else if (match(Op1, m_FDiv(m_Value(X), m_ConstantFP(C2)))) {
+      // C1 / (X/C2) => (C1*C2) / X
+      Fold = ConstantExpr::getFMul(C1, C2);
+    } else if (match(Op1, m_FDiv(m_ConstantFP(C2), m_Value(X)))) {
+      // C1 / (C2/X) => (C1/C2) * X
+      Fold = ConstantExpr::getFDiv(C1, C2);
+      CreateDiv = false;
+    }
+
+    if (Fold) {
+      const APFloat &FoldC = cast<ConstantFP>(Fold)->getValueAPF();
+      if (FoldC.isNormal() && !FoldC.isDenormal()) {
+        Instruction *R = CreateDiv ? 
+                         BinaryOperator::CreateFDiv(Fold, X) :
+                         BinaryOperator::CreateFMul(X, Fold);
+        R->setFastMathFlags(I.getFastMathFlags());
+        return R;
+      }
+    }
+    return 0;
+  }
+
+  if (AllowReassociate) {
+    Value *X, *Y;
+    Value *NewInst = 0;
+    Instruction *SimpR = 0;
+
+    if (Op0->hasOneUse() && match(Op0, m_FDiv(m_Value(X), m_Value(Y)))) {
+      // (X/Y) / Z => X / (Y*Z)
+      //
+      if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op1)) {
+        NewInst = Builder->CreateFMul(Y, Op1);
+        SimpR = BinaryOperator::CreateFDiv(X, NewInst);
+      }
+    } else if (Op1->hasOneUse() && match(Op1, m_FDiv(m_Value(X), m_Value(Y)))) {
+      // Z / (X/Y) => Z*Y / X
+      //
+      if (!isa<ConstantFP>(Y) || !isa<ConstantFP>(Op0)) {
+        NewInst = Builder->CreateFMul(Op0, Y);
+        SimpR = BinaryOperator::CreateFDiv(NewInst, X);
+      }
+    }
+
+    if (NewInst) {
+      if (Instruction *T = dyn_cast<Instruction>(NewInst))
+        T->setDebugLoc(I.getDebugLoc());
+      SimpR->setFastMathFlags(I.getFastMathFlags());
+      return SimpR;
+    }
+  }
+
+  return 0;
+}
+
+/// This function implements the transforms common to both integer remainder
+/// instructions (urem and srem). It is called by the visitors to those integer
+/// remainder instructions.
+/// @brief Common integer remainder transforms
+Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // The RHS is known non-zero.
+  if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this)) {
+    I.setOperand(1, V);
+    return &I;
+  }
+
+  // Handle cases involving: rem X, (select Cond, Y, Z)
+  if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+    return &I;
+
+  if (isa<ConstantInt>(Op1)) {
+    if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
+      if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
+        if (Instruction *R = FoldOpIntoSelect(I, SI))
+          return R;
+      } else if (isa<PHINode>(Op0I)) {
+        if (Instruction *NV = FoldOpIntoPhi(I))
+          return NV;
+      }
+
+      // See if we can fold away this rem instruction.
+      if (SimplifyDemandedInstructionBits(I))
+        return &I;
+    }
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitURem(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyURemInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  if (Instruction *common = commonIRemTransforms(I))
+    return common;
+  
+  // X urem C^2 -> X and C-1
+  { const APInt *C;
+    if (match(Op1, m_Power2(C)))
+      return BinaryOperator::CreateAnd(Op0,
+                                       ConstantInt::get(I.getType(), *C-1));
+  }
+
+  // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1)  
+  if (match(Op1, m_Shl(m_Power2(), m_Value()))) {
+    Constant *N1 = Constant::getAllOnesValue(I.getType());
+    Value *Add = Builder->CreateAdd(Op1, N1);
+    return BinaryOperator::CreateAnd(Op0, Add);
+  }
+
+  // urem X, (select Cond, 2^C1, 2^C2) -->
+  //    select Cond, (and X, C1-1), (and X, C2-1)
+  // when C1&C2 are powers of two.
+  { Value *Cond; const APInt *C1, *C2;
+    if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) {
+      Value *TrueAnd = Builder->CreateAnd(Op0, *C1-1, Op1->getName()+".t");
+      Value *FalseAnd = Builder->CreateAnd(Op0, *C2-1, Op1->getName()+".f");
+      return SelectInst::Create(Cond, TrueAnd, FalseAnd);
+    }
+  }
+
+  // (zext A) urem (zext B) --> zext (A urem B)
+  if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
+    if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
+      return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1),
+                          I.getType());
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifySRemInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // Handle the integer rem common cases
+  if (Instruction *Common = commonIRemTransforms(I))
+    return Common;
+  
+  if (Value *RHSNeg = dyn_castNegVal(Op1))
+    if (!isa<Constant>(RHSNeg) ||
+        (isa<ConstantInt>(RHSNeg) &&
+         cast<ConstantInt>(RHSNeg)->getValue().isStrictlyPositive())) {
+      // X % -Y -> X % Y
+      Worklist.AddValue(I.getOperand(1));
+      I.setOperand(1, RHSNeg);
+      return &I;
+    }
+
+  // If the sign bits of both operands are zero (i.e. we can prove they are
+  // unsigned inputs), turn this into a urem.
+  if (I.getType()->isIntegerTy()) {
+    APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
+    if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) {
+      // X srem Y -> X urem Y, iff X and Y don't have sign bit set
+      return BinaryOperator::CreateURem(Op0, Op1, I.getName());
+    }
+  }
+
+  // If it's a constant vector, flip any negative values positive.
+  if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) {
+    Constant *C = cast<Constant>(Op1);
+    unsigned VWidth = C->getType()->getVectorNumElements();
+
+    bool hasNegative = false;
+    bool hasMissing = false;
+    for (unsigned i = 0; i != VWidth; ++i) {
+      Constant *Elt = C->getAggregateElement(i);
+      if (Elt == 0) {
+        hasMissing = true;
+        break;
+      }
+
+      if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elt))
+        if (RHS->isNegative())
+          hasNegative = true;
+    }
+
+    if (hasNegative && !hasMissing) {
+      SmallVector<Constant *, 16> Elts(VWidth);
+      for (unsigned i = 0; i != VWidth; ++i) {
+        Elts[i] = C->getAggregateElement(i);  // Handle undef, etc.
+        if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elts[i])) {
+          if (RHS->isNegative())
+            Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
+        }
+      }
+
+      Constant *NewRHSV = ConstantVector::get(Elts);
+      if (NewRHSV != C) {  // Don't loop on -MININT
+        Worklist.AddValue(I.getOperand(1));
+        I.setOperand(1, NewRHSV);
+        return &I;
+      }
+    }
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyFRemInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // Handle cases involving: rem X, (select Cond, Y, Z)
+  if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I))
+    return &I;
+
+  return 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
new file mode 100644
index 000000000000..b0a998cca76e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -0,0 +1,904 @@
+//===- InstCombinePHI.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitPHINode function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/DataLayout.h"
+using namespace llvm;
+
+/// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(a,c)]
+/// and if a/b/c and the add's all have a single use, turn this into a phi
+/// and a single binop.
+Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
+  Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+  assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
+  unsigned Opc = FirstInst->getOpcode();
+  Value *LHSVal = FirstInst->getOperand(0);
+  Value *RHSVal = FirstInst->getOperand(1);
+    
+  Type *LHSType = LHSVal->getType();
+  Type *RHSType = RHSVal->getType();
+  
+  bool isNUW = false, isNSW = false, isExact = false;
+  if (OverflowingBinaryOperator *BO =
+        dyn_cast<OverflowingBinaryOperator>(FirstInst)) {
+    isNUW = BO->hasNoUnsignedWrap();
+    isNSW = BO->hasNoSignedWrap();
+  } else if (PossiblyExactOperator *PEO =
+               dyn_cast<PossiblyExactOperator>(FirstInst))
+    isExact = PEO->isExact();
+  
+  // Scan to see if all operands are the same opcode, and all have one use.
+  for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+    Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+    if (!I || I->getOpcode() != Opc || !I->hasOneUse() ||
+        // Verify type of the LHS matches so we don't fold cmp's of different
+        // types.
+        I->getOperand(0)->getType() != LHSType ||
+        I->getOperand(1)->getType() != RHSType)
+      return 0;
+
+    // If they are CmpInst instructions, check their predicates
+    if (CmpInst *CI = dyn_cast<CmpInst>(I))
+      if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate())
+        return 0;
+    
+    if (isNUW)
+      isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap();
+    if (isNSW)
+      isNSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
+    if (isExact)
+      isExact = cast<PossiblyExactOperator>(I)->isExact();
+    
+    // Keep track of which operand needs a phi node.
+    if (I->getOperand(0) != LHSVal) LHSVal = 0;
+    if (I->getOperand(1) != RHSVal) RHSVal = 0;
+  }
+
+  // If both LHS and RHS would need a PHI, don't do this transformation,
+  // because it would increase the number of PHIs entering the block,
+  // which leads to higher register pressure. This is especially
+  // bad when the PHIs are in the header of a loop.
+  if (!LHSVal && !RHSVal)
+    return 0;
+  
+  // Otherwise, this is safe to transform!
+  
+  Value *InLHS = FirstInst->getOperand(0);
+  Value *InRHS = FirstInst->getOperand(1);
+  PHINode *NewLHS = 0, *NewRHS = 0;
+  if (LHSVal == 0) {
+    NewLHS = PHINode::Create(LHSType, PN.getNumIncomingValues(),
+                             FirstInst->getOperand(0)->getName() + ".pn");
+    NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
+    InsertNewInstBefore(NewLHS, PN);
+    LHSVal = NewLHS;
+  }
+  
+  if (RHSVal == 0) {
+    NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(),
+                             FirstInst->getOperand(1)->getName() + ".pn");
+    NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
+    InsertNewInstBefore(NewRHS, PN);
+    RHSVal = NewRHS;
+  }
+  
+  // Add all operands to the new PHIs.
+  if (NewLHS || NewRHS) {
+    for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+      Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i));
+      if (NewLHS) {
+        Value *NewInLHS = InInst->getOperand(0);
+        NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
+      }
+      if (NewRHS) {
+        Value *NewInRHS = InInst->getOperand(1);
+        NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
+      }
+    }
+  }
+    
+  if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
+    CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+                                     LHSVal, RHSVal);
+    NewCI->setDebugLoc(FirstInst->getDebugLoc());
+    return NewCI;
+  }
+
+  BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst);
+  BinaryOperator *NewBinOp =
+    BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
+  if (isNUW) NewBinOp->setHasNoUnsignedWrap();
+  if (isNSW) NewBinOp->setHasNoSignedWrap();
+  if (isExact) NewBinOp->setIsExact();
+  NewBinOp->setDebugLoc(FirstInst->getDebugLoc());
+  return NewBinOp;
+}
+
+Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
+  GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
+  
+  SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(), 
+                                        FirstInst->op_end());
+  // This is true if all GEP bases are allocas and if all indices into them are
+  // constants.
+  bool AllBasePointersAreAllocas = true;
+
+  // We don't want to replace this phi if the replacement would require
+  // more than one phi, which leads to higher register pressure. This is
+  // especially bad when the PHIs are in the header of a loop.
+  bool NeededPhi = false;
+  
+  bool AllInBounds = true;
+  
+  // Scan to see if all operands are the same opcode, and all have one use.
+  for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+    GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i));
+    if (!GEP || !GEP->hasOneUse() || GEP->getType() != FirstInst->getType() ||
+      GEP->getNumOperands() != FirstInst->getNumOperands())
+      return 0;
+
+    AllInBounds &= GEP->isInBounds();
+    
+    // Keep track of whether or not all GEPs are of alloca pointers.
+    if (AllBasePointersAreAllocas &&
+        (!isa<AllocaInst>(GEP->getOperand(0)) ||
+         !GEP->hasAllConstantIndices()))
+      AllBasePointersAreAllocas = false;
+    
+    // Compare the operand lists.
+    for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
+      if (FirstInst->getOperand(op) == GEP->getOperand(op))
+        continue;
+      
+      // Don't merge two GEPs when two operands differ (introducing phi nodes)
+      // if one of the PHIs has a constant for the index.  The index may be
+      // substantially cheaper to compute for the constants, so making it a
+      // variable index could pessimize the path.  This also handles the case
+      // for struct indices, which must always be constant.
+      if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
+          isa<ConstantInt>(GEP->getOperand(op)))
+        return 0;
+      
+      if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
+        return 0;
+
+      // If we already needed a PHI for an earlier operand, and another operand
+      // also requires a PHI, we'd be introducing more PHIs than we're
+      // eliminating, which increases register pressure on entry to the PHI's
+      // block.
+      if (NeededPhi)
+        return 0;
+
+      FixedOperands[op] = 0;  // Needs a PHI.
+      NeededPhi = true;
+    }
+  }
+  
+  // If all of the base pointers of the PHI'd GEPs are from allocas, don't
+  // bother doing this transformation.  At best, this will just save a bit of
+  // offset calculation, but all the predecessors will have to materialize the
+  // stack address into a register anyway.  We'd actually rather *clone* the
+  // load up into the predecessors so that we have a load of a gep of an alloca,
+  // which can usually all be folded into the load.
+  if (AllBasePointersAreAllocas)
+    return 0;
+  
+  // Otherwise, this is safe to transform.  Insert PHI nodes for each operand
+  // that is variable.
+  SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
+  
+  bool HasAnyPHIs = false;
+  for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
+    if (FixedOperands[i]) continue;  // operand doesn't need a phi.
+    Value *FirstOp = FirstInst->getOperand(i);
+    PHINode *NewPN = PHINode::Create(FirstOp->getType(), e,
+                                     FirstOp->getName()+".pn");
+    InsertNewInstBefore(NewPN, PN);
+    
+    NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
+    OperandPhis[i] = NewPN;
+    FixedOperands[i] = NewPN;
+    HasAnyPHIs = true;
+  }
+
+  
+  // Add all operands to the new PHIs.
+  if (HasAnyPHIs) {
+    for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+      GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
+      BasicBlock *InBB = PN.getIncomingBlock(i);
+      
+      for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
+        if (PHINode *OpPhi = OperandPhis[op])
+          OpPhi->addIncoming(InGEP->getOperand(op), InBB);
+    }
+  }
+  
+  Value *Base = FixedOperands[0];
+  GetElementPtrInst *NewGEP = 
+    GetElementPtrInst::Create(Base, makeArrayRef(FixedOperands).slice(1));
+  if (AllInBounds) NewGEP->setIsInBounds();
+  NewGEP->setDebugLoc(FirstInst->getDebugLoc());
+  return NewGEP;
+}
+
+
+/// isSafeAndProfitableToSinkLoad - Return true if we know that it is safe to
+/// sink the load out of the block that defines it.  This means that it must be
+/// obvious the value of the load is not changed from the point of the load to
+/// the end of the block it is in.
+///
+/// Finally, it is safe, but not profitable, to sink a load targeting a
+/// non-address-taken alloca.  Doing so will cause us to not promote the alloca
+/// to a register.
+static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
+  BasicBlock::iterator BBI = L, E = L->getParent()->end();
+  
+  for (++BBI; BBI != E; ++BBI)
+    if (BBI->mayWriteToMemory())
+      return false;
+  
+  // Check for non-address taken alloca.  If not address-taken already, it isn't
+  // profitable to do this xform.
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
+    bool isAddressTaken = false;
+    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+         UI != E; ++UI) {
+      User *U = *UI;
+      if (isa<LoadInst>(U)) continue;
+      if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+        // If storing TO the alloca, then the address isn't taken.
+        if (SI->getOperand(1) == AI) continue;
+      }
+      isAddressTaken = true;
+      break;
+    }
+    
+    if (!isAddressTaken && AI->isStaticAlloca())
+      return false;
+  }
+  
+  // If this load is a load from a GEP with a constant offset from an alloca,
+  // then we don't want to sink it.  In its present form, it will be
+  // load [constant stack offset].  Sinking it will cause us to have to
+  // materialize the stack addresses in each predecessor in a register only to
+  // do a shared load from register in the successor.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
+      if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
+        return false;
+  
+  return true;
+}
+
+Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
+  LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
+
+  // FIXME: This is overconservative; this transform is allowed in some cases
+  // for atomic operations.
+  if (FirstLI->isAtomic())
+    return 0;
+
+  // When processing loads, we need to propagate two bits of information to the
+  // sunk load: whether it is volatile, and what its alignment is.  We currently
+  // don't sink loads when some have their alignment specified and some don't.
+  // visitLoadInst will propagate an alignment onto the load when TD is around,
+  // and if TD isn't around, we can't handle the mixed case.
+  bool isVolatile = FirstLI->isVolatile();
+  unsigned LoadAlignment = FirstLI->getAlignment();
+  unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
+  
+  // We can't sink the load if the loaded value could be modified between the
+  // load and the PHI.
+  if (FirstLI->getParent() != PN.getIncomingBlock(0) ||
+      !isSafeAndProfitableToSinkLoad(FirstLI))
+    return 0;
+  
+  // If the PHI is of volatile loads and the load block has multiple
+  // successors, sinking it would remove a load of the volatile value from
+  // the path through the other successor.
+  if (isVolatile && 
+      FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1)
+    return 0;
+  
+  // Check to see if all arguments are the same operation.
+  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+    LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i));
+    if (!LI || !LI->hasOneUse())
+      return 0;
+    
+    // We can't sink the load if the loaded value could be modified between 
+    // the load and the PHI.
+    if (LI->isVolatile() != isVolatile ||
+        LI->getParent() != PN.getIncomingBlock(i) ||
+        LI->getPointerAddressSpace() != LoadAddrSpace ||
+        !isSafeAndProfitableToSinkLoad(LI))
+      return 0;
+      
+    // If some of the loads have an alignment specified but not all of them,
+    // we can't do the transformation.
+    if ((LoadAlignment != 0) != (LI->getAlignment() != 0))
+      return 0;
+    
+    LoadAlignment = std::min(LoadAlignment, LI->getAlignment());
+    
+    // If the PHI is of volatile loads and the load block has multiple
+    // successors, sinking it would remove a load of the volatile value from
+    // the path through the other successor.
+    if (isVolatile &&
+        LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+      return 0;
+  }
+  
+  // Okay, they are all the same operation.  Create a new PHI node of the
+  // correct type, and PHI together all of the LHS's of the instructions.
+  PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(),
+                                   PN.getNumIncomingValues(),
+                                   PN.getName()+".in");
+  
+  Value *InVal = FirstLI->getOperand(0);
+  NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+  
+  // Add all operands to the new PHI.
+  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+    Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0);
+    if (NewInVal != InVal)
+      InVal = 0;
+    NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+  }
+  
+  Value *PhiVal;
+  if (InVal) {
+    // The new PHI unions all of the same values together.  This is really
+    // common, so we handle it intelligently here for compile-time speed.
+    PhiVal = InVal;
+    delete NewPN;
+  } else {
+    InsertNewInstBefore(NewPN, PN);
+    PhiVal = NewPN;
+  }
+  
+  // If this was a volatile load that we are merging, make sure to loop through
+  // and mark all the input loads as non-volatile.  If we don't do this, we will
+  // insert a new volatile load and the old ones will not be deletable.
+  if (isVolatile)
+    for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+      cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false);
+  
+  LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment);
+  NewLI->setDebugLoc(FirstLI->getDebugLoc());
+  return NewLI;
+}
+
+
+
+/// FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
+/// operator and they all are only used by the PHI, PHI together their
+/// inputs, and do the operation once, to the result of the PHI.
+Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
+  Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+
+  if (isa<GetElementPtrInst>(FirstInst))
+    return FoldPHIArgGEPIntoPHI(PN);
+  if (isa<LoadInst>(FirstInst))
+    return FoldPHIArgLoadIntoPHI(PN);
+  
+  // Scan the instruction, looking for input operations that can be folded away.
+  // If all input operands to the phi are the same instruction (e.g. a cast from
+  // the same type or "+42") we can pull the operation through the PHI, reducing
+  // code size and simplifying code.
+  Constant *ConstantOp = 0;
+  Type *CastSrcTy = 0;
+  bool isNUW = false, isNSW = false, isExact = false;
+  
+  if (isa<CastInst>(FirstInst)) {
+    CastSrcTy = FirstInst->getOperand(0)->getType();
+
+    // Be careful about transforming integer PHIs.  We don't want to pessimize
+    // the code by turning an i32 into an i1293.
+    if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
+      if (!ShouldChangeType(PN.getType(), CastSrcTy))
+        return 0;
+    }
+  } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
+    // Can fold binop, compare or shift here if the RHS is a constant, 
+    // otherwise call FoldPHIArgBinOpIntoPHI.
+    ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
+    if (ConstantOp == 0)
+      return FoldPHIArgBinOpIntoPHI(PN);
+    
+    if (OverflowingBinaryOperator *BO =
+        dyn_cast<OverflowingBinaryOperator>(FirstInst)) {
+      isNUW = BO->hasNoUnsignedWrap();
+      isNSW = BO->hasNoSignedWrap();
+    } else if (PossiblyExactOperator *PEO =
+               dyn_cast<PossiblyExactOperator>(FirstInst))
+      isExact = PEO->isExact();
+  } else {
+    return 0;  // Cannot fold this operation.
+  }
+
+  // Check to see if all arguments are the same operation.
+  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+    if (I == 0 || !I->hasOneUse() || !I->isSameOperationAs(FirstInst))
+      return 0;
+    if (CastSrcTy) {
+      if (I->getOperand(0)->getType() != CastSrcTy)
+        return 0;  // Cast operation must match.
+    } else if (I->getOperand(1) != ConstantOp) {
+      return 0;
+    }
+    
+    if (isNUW)
+      isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap();
+    if (isNSW)
+      isNSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
+    if (isExact)
+      isExact = cast<PossiblyExactOperator>(I)->isExact();
+  }
+
+  // Okay, they are all the same operation.  Create a new PHI node of the
+  // correct type, and PHI together all of the LHS's of the instructions.
+  PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
+                                   PN.getNumIncomingValues(),
+                                   PN.getName()+".in");
+
+  Value *InVal = FirstInst->getOperand(0);
+  NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+
+  // Add all operands to the new PHI.
+  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+    Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
+    if (NewInVal != InVal)
+      InVal = 0;
+    NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+  }
+
+  Value *PhiVal;
+  if (InVal) {
+    // The new PHI unions all of the same values together.  This is really
+    // common, so we handle it intelligently here for compile-time speed.
+    PhiVal = InVal;
+    delete NewPN;
+  } else {
+    InsertNewInstBefore(NewPN, PN);
+    PhiVal = NewPN;
+  }
+
+  // Insert and return the new operation.
+  if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) {
+    CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal,
+                                       PN.getType());
+    NewCI->setDebugLoc(FirstInst->getDebugLoc());
+    return NewCI;
+  }
+  
+  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
+    BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
+    if (isNUW) BinOp->setHasNoUnsignedWrap();
+    if (isNSW) BinOp->setHasNoSignedWrap();
+    if (isExact) BinOp->setIsExact();
+    BinOp->setDebugLoc(FirstInst->getDebugLoc());
+    return BinOp;
+  }
+  
+  CmpInst *CIOp = cast<CmpInst>(FirstInst);
+  CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+                                   PhiVal, ConstantOp);
+  NewCI->setDebugLoc(FirstInst->getDebugLoc());
+  return NewCI;
+}
+
+/// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle
+/// that is dead.
+static bool DeadPHICycle(PHINode *PN,
+                         SmallPtrSet<PHINode*, 16> &PotentiallyDeadPHIs) {
+  if (PN->use_empty()) return true;
+  if (!PN->hasOneUse()) return false;
+
+  // Remember this node, and if we find the cycle, return.
+  if (!PotentiallyDeadPHIs.insert(PN))
+    return true;
+  
+  // Don't scan crazily complex things.
+  if (PotentiallyDeadPHIs.size() == 16)
+    return false;
+
+  if (PHINode *PU = dyn_cast<PHINode>(PN->use_back()))
+    return DeadPHICycle(PU, PotentiallyDeadPHIs);
+
+  return false;
+}
+
+/// PHIsEqualValue - Return true if this phi node is always equal to
+/// NonPhiInVal.  This happens with mutually cyclic phi nodes like:
+///   z = some value; x = phi (y, z); y = phi (x, z)
+static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal, 
+                           SmallPtrSet<PHINode*, 16> &ValueEqualPHIs) {
+  // See if we already saw this PHI node.
+  if (!ValueEqualPHIs.insert(PN))
+    return true;
+  
+  // Don't scan crazily complex things.
+  if (ValueEqualPHIs.size() == 16)
+    return false;
+ 
+  // Scan the operands to see if they are either phi nodes or are equal to
+  // the value.
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    Value *Op = PN->getIncomingValue(i);
+    if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
+      if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
+        return false;
+    } else if (Op != NonPhiInVal)
+      return false;
+  }
+  
+  return true;
+}
+
+
+namespace {
+struct PHIUsageRecord {
+  unsigned PHIId;     // The ID # of the PHI (something determinstic to sort on)
+  unsigned Shift;     // The amount shifted.
+  Instruction *Inst;  // The trunc instruction.
+  
+  PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
+    : PHIId(pn), Shift(Sh), Inst(User) {}
+  
+  bool operator<(const PHIUsageRecord &RHS) const {
+    if (PHIId < RHS.PHIId) return true;
+    if (PHIId > RHS.PHIId) return false;
+    if (Shift < RHS.Shift) return true;
+    if (Shift > RHS.Shift) return false;
+    return Inst->getType()->getPrimitiveSizeInBits() <
+           RHS.Inst->getType()->getPrimitiveSizeInBits();
+  }
+};
+  
+struct LoweredPHIRecord {
+  PHINode *PN;        // The PHI that was lowered.
+  unsigned Shift;     // The amount shifted.
+  unsigned Width;     // The width extracted.
+  
+  LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty)
+    : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
+  
+  // Ctor form used by DenseMap.
+  LoweredPHIRecord(PHINode *pn, unsigned Sh)
+    : PN(pn), Shift(Sh), Width(0) {}
+};
+}
+
+namespace llvm {
+  template<>
+  struct DenseMapInfo<LoweredPHIRecord> {
+    static inline LoweredPHIRecord getEmptyKey() {
+      return LoweredPHIRecord(0, 0);
+    }
+    static inline LoweredPHIRecord getTombstoneKey() {
+      return LoweredPHIRecord(0, 1);
+    }
+    static unsigned getHashValue(const LoweredPHIRecord &Val) {
+      return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
+             (Val.Width>>3);
+    }
+    static bool isEqual(const LoweredPHIRecord &LHS,
+                        const LoweredPHIRecord &RHS) {
+      return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
+             LHS.Width == RHS.Width;
+    }
+  };
+  template <>
+  struct isPodLike<LoweredPHIRecord> { static const bool value = true; };
+}
+
+
+/// SliceUpIllegalIntegerPHI - This is an integer PHI and we know that it has an
+/// illegal type: see if it is only used by trunc or trunc(lshr) operations.  If
+/// so, we split the PHI into the various pieces being extracted.  This sort of
+/// thing is introduced when SROA promotes an aggregate to large integer values.
+///
+/// TODO: The user of the trunc may be an bitcast to float/double/vector or an
+/// inttoptr.  We should produce new PHIs in the right type.
+///
+Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
+  // PHIUsers - Keep track of all of the truncated values extracted from a set
+  // of PHIs, along with their offset.  These are the things we want to rewrite.
+  SmallVector<PHIUsageRecord, 16> PHIUsers;
+  
+  // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
+  // nodes which are extracted from. PHIsToSlice is a set we use to avoid
+  // revisiting PHIs, PHIsInspected is a ordered list of PHIs that we need to
+  // check the uses of (to ensure they are all extracts).
+  SmallVector<PHINode*, 8> PHIsToSlice;
+  SmallPtrSet<PHINode*, 8> PHIsInspected;
+  
+  PHIsToSlice.push_back(&FirstPhi);
+  PHIsInspected.insert(&FirstPhi);
+  
+  for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) {
+    PHINode *PN = PHIsToSlice[PHIId];
+    
+    // Scan the input list of the PHI.  If any input is an invoke, and if the
+    // input is defined in the predecessor, then we won't be split the critical
+    // edge which is required to insert a truncate.  Because of this, we have to
+    // bail out.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i));
+      if (II == 0) continue;
+      if (II->getParent() != PN->getIncomingBlock(i))
+        continue;
+     
+      // If we have a phi, and if it's directly in the predecessor, then we have
+      // a critical edge where we need to put the truncate.  Since we can't
+      // split the edge in instcombine, we have to bail out.
+      return 0;
+    }
+      
+    
+    for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end();
+         UI != E; ++UI) {
+      Instruction *User = cast<Instruction>(*UI);
+      
+      // If the user is a PHI, inspect its uses recursively.
+      if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+        if (PHIsInspected.insert(UserPN))
+          PHIsToSlice.push_back(UserPN);
+        continue;
+      }
+      
+      // Truncates are always ok.
+      if (isa<TruncInst>(User)) {
+        PHIUsers.push_back(PHIUsageRecord(PHIId, 0, User));
+        continue;
+      }
+      
+      // Otherwise it must be a lshr which can only be used by one trunc.
+      if (User->getOpcode() != Instruction::LShr ||
+          !User->hasOneUse() || !isa<TruncInst>(User->use_back()) ||
+          !isa<ConstantInt>(User->getOperand(1)))
+        return 0;
+      
+      unsigned Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue();
+      PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, User->use_back()));
+    }
+  }
+  
+  // If we have no users, they must be all self uses, just nuke the PHI.
+  if (PHIUsers.empty())
+    return ReplaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
+  
+  // If this phi node is transformable, create new PHIs for all the pieces
+  // extracted out of it.  First, sort the users by their offset and size.
+  array_pod_sort(PHIUsers.begin(), PHIUsers.end());
+  
+  DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n';
+            for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+              errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n';
+        );
+  
+  // PredValues - This is a temporary used when rewriting PHI nodes.  It is
+  // hoisted out here to avoid construction/destruction thrashing.
+  DenseMap<BasicBlock*, Value*> PredValues;
+  
+  // ExtractedVals - Each new PHI we introduce is saved here so we don't
+  // introduce redundant PHIs.
+  DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals;
+  
+  for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) {
+    unsigned PHIId = PHIUsers[UserI].PHIId;
+    PHINode *PN = PHIsToSlice[PHIId];
+    unsigned Offset = PHIUsers[UserI].Shift;
+    Type *Ty = PHIUsers[UserI].Inst->getType();
+    
+    PHINode *EltPHI;
+    
+    // If we've already lowered a user like this, reuse the previously lowered
+    // value.
+    if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == 0) {
+      
+      // Otherwise, Create the new PHI node for this user.
+      EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(),
+                               PN->getName()+".off"+Twine(Offset), PN);
+      assert(EltPHI->getType() != PN->getType() &&
+             "Truncate didn't shrink phi?");
+    
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        BasicBlock *Pred = PN->getIncomingBlock(i);
+        Value *&PredVal = PredValues[Pred];
+        
+        // If we already have a value for this predecessor, reuse it.
+        if (PredVal) {
+          EltPHI->addIncoming(PredVal, Pred);
+          continue;
+        }
+
+        // Handle the PHI self-reuse case.
+        Value *InVal = PN->getIncomingValue(i);
+        if (InVal == PN) {
+          PredVal = EltPHI;
+          EltPHI->addIncoming(PredVal, Pred);
+          continue;
+        }
+        
+        if (PHINode *InPHI = dyn_cast<PHINode>(PN)) {
+          // If the incoming value was a PHI, and if it was one of the PHIs we
+          // already rewrote it, just use the lowered value.
+          if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) {
+            PredVal = Res;
+            EltPHI->addIncoming(PredVal, Pred);
+            continue;
+          }
+        }
+        
+        // Otherwise, do an extract in the predecessor.
+        Builder->SetInsertPoint(Pred, Pred->getTerminator());
+        Value *Res = InVal;
+        if (Offset)
+          Res = Builder->CreateLShr(Res, ConstantInt::get(InVal->getType(),
+                                                          Offset), "extract");
+        Res = Builder->CreateTrunc(Res, Ty, "extract.t");
+        PredVal = Res;
+        EltPHI->addIncoming(Res, Pred);
+        
+        // If the incoming value was a PHI, and if it was one of the PHIs we are
+        // rewriting, we will ultimately delete the code we inserted.  This
+        // means we need to revisit that PHI to make sure we extract out the
+        // needed piece.
+        if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
+          if (PHIsInspected.count(OldInVal)) {
+            unsigned RefPHIId = std::find(PHIsToSlice.begin(),PHIsToSlice.end(),
+                                          OldInVal)-PHIsToSlice.begin();
+            PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset, 
+                                              cast<Instruction>(Res)));
+            ++UserE;
+          }
+      }
+      PredValues.clear();
+      
+      DEBUG(errs() << "  Made element PHI for offset " << Offset << ": "
+                   << *EltPHI << '\n');
+      ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
+    }
+    
+    // Replace the use of this piece with the PHI node.
+    ReplaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
+  }
+  
+  // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
+  // with undefs.
+  Value *Undef = UndefValue::get(FirstPhi.getType());
+  for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+    ReplaceInstUsesWith(*PHIsToSlice[i], Undef);
+  return ReplaceInstUsesWith(FirstPhi, Undef);
+}
+
+// PHINode simplification
+//
+Instruction *InstCombiner::visitPHINode(PHINode &PN) {
+  if (Value *V = SimplifyInstruction(&PN, TD))
+    return ReplaceInstUsesWith(PN, V);
+
+  // If all PHI operands are the same operation, pull them through the PHI,
+  // reducing code size.
+  if (isa<Instruction>(PN.getIncomingValue(0)) &&
+      isa<Instruction>(PN.getIncomingValue(1)) &&
+      cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
+      cast<Instruction>(PN.getIncomingValue(1))->getOpcode() &&
+      // FIXME: The hasOneUse check will fail for PHIs that use the value more
+      // than themselves more than once.
+      PN.getIncomingValue(0)->hasOneUse())
+    if (Instruction *Result = FoldPHIArgOpIntoPHI(PN))
+      return Result;
+
+  // If this is a trivial cycle in the PHI node graph, remove it.  Basically, if
+  // this PHI only has a single use (a PHI), and if that PHI only has one use (a
+  // PHI)... break the cycle.
+  if (PN.hasOneUse()) {
+    Instruction *PHIUser = cast<Instruction>(PN.use_back());
+    if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
+      SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
+      PotentiallyDeadPHIs.insert(&PN);
+      if (DeadPHICycle(PU, PotentiallyDeadPHIs))
+        return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+    }
+   
+    // If this phi has a single use, and if that use just computes a value for
+    // the next iteration of a loop, delete the phi.  This occurs with unused
+    // induction variables, e.g. "for (int j = 0; ; ++j);".  Detecting this
+    // common case here is good because the only other things that catch this
+    // are induction variable analysis (sometimes) and ADCE, which is only run
+    // late.
+    if (PHIUser->hasOneUse() &&
+        (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
+        PHIUser->use_back() == &PN) {
+      return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+    }
+  }
+
+  // We sometimes end up with phi cycles that non-obviously end up being the
+  // same value, for example:
+  //   z = some value; x = phi (y, z); y = phi (x, z)
+  // where the phi nodes don't necessarily need to be in the same block.  Do a
+  // quick check to see if the PHI node only contains a single non-phi value, if
+  // so, scan to see if the phi cycle is actually equal to that value.
+  {
+    unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues();
+    // Scan for the first non-phi operand.
+    while (InValNo != NumIncomingVals &&
+           isa<PHINode>(PN.getIncomingValue(InValNo)))
+      ++InValNo;
+
+    if (InValNo != NumIncomingVals) {
+      Value *NonPhiInVal = PN.getIncomingValue(InValNo);
+      
+      // Scan the rest of the operands to see if there are any conflicts, if so
+      // there is no need to recursively scan other phis.
+      for (++InValNo; InValNo != NumIncomingVals; ++InValNo) {
+        Value *OpVal = PN.getIncomingValue(InValNo);
+        if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
+          break;
+      }
+      
+      // If we scanned over all operands, then we have one unique value plus
+      // phi values.  Scan PHI nodes to see if they all merge in each other or
+      // the value.
+      if (InValNo == NumIncomingVals) {
+        SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
+        if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
+          return ReplaceInstUsesWith(PN, NonPhiInVal);
+      }
+    }
+  }
+
+  // If there are multiple PHIs, sort their operands so that they all list
+  // the blocks in the same order. This will help identical PHIs be eliminated
+  // by other passes. Other passes shouldn't depend on this for correctness
+  // however.
+  PHINode *FirstPN = cast<PHINode>(PN.getParent()->begin());
+  if (&PN != FirstPN)
+    for (unsigned i = 0, e = FirstPN->getNumIncomingValues(); i != e; ++i) {
+      BasicBlock *BBA = PN.getIncomingBlock(i);
+      BasicBlock *BBB = FirstPN->getIncomingBlock(i);
+      if (BBA != BBB) {
+        Value *VA = PN.getIncomingValue(i);
+        unsigned j = PN.getBasicBlockIndex(BBB);
+        Value *VB = PN.getIncomingValue(j);
+        PN.setIncomingBlock(i, BBB);
+        PN.setIncomingValue(i, VB);
+        PN.setIncomingBlock(j, BBA);
+        PN.setIncomingValue(j, VA);
+        // NOTE: Instcombine normally would want us to "return &PN" if we
+        // modified any of the operands of an instruction.  However, since we
+        // aren't adding or removing uses (just rearranging them) we don't do
+        // this in this case.
+      }
+    }
+
+  // If this is an integer PHI and we know that it has an illegal type, see if
+  // it is only used by trunc or trunc(lshr) operations.  If so, we split the
+  // PHI into the various pieces being extracted.  This sort of thing is
+  // introduced when SROA promotes an aggregate to a single large integer type.
+  if (PN.getType()->isIntegerTy() && TD &&
+      !TD->isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
+    if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
+      return Res;
+  
+  return 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
new file mode 100644
index 000000000000..121aa1f8d73f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -0,0 +1,945 @@
+//===- InstCombineSelect.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitSelect function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Support/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+/// MatchSelectPattern - Pattern match integer [SU]MIN, [SU]MAX, and ABS idioms,
+/// returning the kind and providing the out parameter results if we
+/// successfully match.
+static SelectPatternFlavor
+MatchSelectPattern(Value *V, Value *&LHS, Value *&RHS) {
+  SelectInst *SI = dyn_cast<SelectInst>(V);
+  if (SI == 0) return SPF_UNKNOWN;
+
+  ICmpInst *ICI = dyn_cast<ICmpInst>(SI->getCondition());
+  if (ICI == 0) return SPF_UNKNOWN;
+
+  LHS = ICI->getOperand(0);
+  RHS = ICI->getOperand(1);
+
+  // (icmp X, Y) ? X : Y
+  if (SI->getTrueValue() == ICI->getOperand(0) &&
+      SI->getFalseValue() == ICI->getOperand(1)) {
+    switch (ICI->getPredicate()) {
+    default: return SPF_UNKNOWN; // Equality.
+    case ICmpInst::ICMP_UGT:
+    case ICmpInst::ICMP_UGE: return SPF_UMAX;
+    case ICmpInst::ICMP_SGT:
+    case ICmpInst::ICMP_SGE: return SPF_SMAX;
+    case ICmpInst::ICMP_ULT:
+    case ICmpInst::ICMP_ULE: return SPF_UMIN;
+    case ICmpInst::ICMP_SLT:
+    case ICmpInst::ICMP_SLE: return SPF_SMIN;
+    }
+  }
+
+  // (icmp X, Y) ? Y : X
+  if (SI->getTrueValue() == ICI->getOperand(1) &&
+      SI->getFalseValue() == ICI->getOperand(0)) {
+    switch (ICI->getPredicate()) {
+      default: return SPF_UNKNOWN; // Equality.
+      case ICmpInst::ICMP_UGT:
+      case ICmpInst::ICMP_UGE: return SPF_UMIN;
+      case ICmpInst::ICMP_SGT:
+      case ICmpInst::ICMP_SGE: return SPF_SMIN;
+      case ICmpInst::ICMP_ULT:
+      case ICmpInst::ICMP_ULE: return SPF_UMAX;
+      case ICmpInst::ICMP_SLT:
+      case ICmpInst::ICMP_SLE: return SPF_SMAX;
+    }
+  }
+
+  // TODO: (X > 4) ? X : 5   -->  (X >= 5) ? X : 5  -->  MAX(X, 5)
+
+  return SPF_UNKNOWN;
+}
+
+
+/// GetSelectFoldableOperands - We want to turn code that looks like this:
+///   %C = or %A, %B
+///   %D = select %cond, %C, %A
+/// into:
+///   %C = select %cond, %B, 0
+///   %D = or %A, %C
+///
+/// Assuming that the specified instruction is an operand to the select, return
+/// a bitmask indicating which operands of this instruction are foldable if they
+/// equal the other incoming value of the select.
+///
+static unsigned GetSelectFoldableOperands(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return 3;              // Can fold through either operand.
+  case Instruction::Sub:   // Can only fold on the amount subtracted.
+  case Instruction::Shl:   // Can only fold on the shift amount.
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return 1;
+  default:
+    return 0;              // Cannot fold
+  }
+}
+
+/// GetSelectFoldableConstant - For the same transformation as the previous
+/// function, return the identity constant that goes into the select.
+static Constant *GetSelectFoldableConstant(Instruction *I) {
+  switch (I->getOpcode()) {
+  default: llvm_unreachable("This cannot happen!");
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return Constant::getNullValue(I->getType());
+  case Instruction::And:
+    return Constant::getAllOnesValue(I->getType());
+  case Instruction::Mul:
+    return ConstantInt::get(I->getType(), 1);
+  }
+}
+
+/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI
+/// have the same opcode and only one use each.  Try to simplify this.
+Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
+                                          Instruction *FI) {
+  if (TI->getNumOperands() == 1) {
+    // If this is a non-volatile load or a cast from the same type,
+    // merge.
+    if (TI->isCast()) {
+      Type *FIOpndTy = FI->getOperand(0)->getType();
+      if (TI->getOperand(0)->getType() != FIOpndTy)
+        return 0;
+      // The select condition may be a vector. We may only change the operand
+      // type if the vector width remains the same (and matches the condition).
+      Type *CondTy = SI.getCondition()->getType();
+      if (CondTy->isVectorTy() && (!FIOpndTy->isVectorTy() ||
+          CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements()))
+        return 0;
+    } else {
+      return 0;  // unknown unary op.
+    }
+
+    // Fold this by inserting a select from the input values.
+    Value *NewSI = Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),
+                                         FI->getOperand(0), SI.getName()+".v");
+    return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
+                            TI->getType());
+  }
+
+  // Only handle binary operators here.
+  if (!isa<BinaryOperator>(TI))
+    return 0;
+
+  // Figure out if the operations have any operands in common.
+  Value *MatchOp, *OtherOpT, *OtherOpF;
+  bool MatchIsOpZero;
+  if (TI->getOperand(0) == FI->getOperand(0)) {
+    MatchOp  = TI->getOperand(0);
+    OtherOpT = TI->getOperand(1);
+    OtherOpF = FI->getOperand(1);
+    MatchIsOpZero = true;
+  } else if (TI->getOperand(1) == FI->getOperand(1)) {
+    MatchOp  = TI->getOperand(1);
+    OtherOpT = TI->getOperand(0);
+    OtherOpF = FI->getOperand(0);
+    MatchIsOpZero = false;
+  } else if (!TI->isCommutative()) {
+    return 0;
+  } else if (TI->getOperand(0) == FI->getOperand(1)) {
+    MatchOp  = TI->getOperand(0);
+    OtherOpT = TI->getOperand(1);
+    OtherOpF = FI->getOperand(0);
+    MatchIsOpZero = true;
+  } else if (TI->getOperand(1) == FI->getOperand(0)) {
+    MatchOp  = TI->getOperand(1);
+    OtherOpT = TI->getOperand(0);
+    OtherOpF = FI->getOperand(1);
+    MatchIsOpZero = true;
+  } else {
+    return 0;
+  }
+
+  // If we reach here, they do have operations in common.
+  Value *NewSI = Builder->CreateSelect(SI.getCondition(), OtherOpT,
+                                       OtherOpF, SI.getName()+".v");
+
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) {
+    if (MatchIsOpZero)
+      return BinaryOperator::Create(BO->getOpcode(), MatchOp, NewSI);
+    else
+      return BinaryOperator::Create(BO->getOpcode(), NewSI, MatchOp);
+  }
+  llvm_unreachable("Shouldn't get here");
+}
+
+static bool isSelect01(Constant *C1, Constant *C2) {
+  ConstantInt *C1I = dyn_cast<ConstantInt>(C1);
+  if (!C1I)
+    return false;
+  ConstantInt *C2I = dyn_cast<ConstantInt>(C2);
+  if (!C2I)
+    return false;
+  if (!C1I->isZero() && !C2I->isZero()) // One side must be zero.
+    return false;
+  return C1I->isOne() || C1I->isAllOnesValue() ||
+         C2I->isOne() || C2I->isAllOnesValue();
+}
+
+/// FoldSelectIntoOp - Try fold the select into one of the operands to
+/// facilitate further optimization.
+Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
+                                            Value *FalseVal) {
+  // See the comment above GetSelectFoldableOperands for a description of the
+  // transformation we are doing here.
+  if (Instruction *TVI = dyn_cast<Instruction>(TrueVal)) {
+    if (TVI->hasOneUse() && TVI->getNumOperands() == 2 &&
+        !isa<Constant>(FalseVal)) {
+      if (unsigned SFO = GetSelectFoldableOperands(TVI)) {
+        unsigned OpToFold = 0;
+        if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
+          OpToFold = 1;
+        } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
+          OpToFold = 2;
+        }
+
+        if (OpToFold) {
+          Constant *C = GetSelectFoldableConstant(TVI);
+          Value *OOp = TVI->getOperand(2-OpToFold);
+          // Avoid creating select between 2 constants unless it's selecting
+          // between 0, 1 and -1.
+          if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
+            Value *NewSel = Builder->CreateSelect(SI.getCondition(), OOp, C);
+            NewSel->takeName(TVI);
+            BinaryOperator *TVI_BO = cast<BinaryOperator>(TVI);
+            BinaryOperator *BO = BinaryOperator::Create(TVI_BO->getOpcode(),
+                                                        FalseVal, NewSel);
+            if (isa<PossiblyExactOperator>(BO))
+              BO->setIsExact(TVI_BO->isExact());
+            if (isa<OverflowingBinaryOperator>(BO)) {
+              BO->setHasNoUnsignedWrap(TVI_BO->hasNoUnsignedWrap());
+              BO->setHasNoSignedWrap(TVI_BO->hasNoSignedWrap());
+            }
+            return BO;
+          }
+        }
+      }
+    }
+  }
+
+  if (Instruction *FVI = dyn_cast<Instruction>(FalseVal)) {
+    if (FVI->hasOneUse() && FVI->getNumOperands() == 2 &&
+        !isa<Constant>(TrueVal)) {
+      if (unsigned SFO = GetSelectFoldableOperands(FVI)) {
+        unsigned OpToFold = 0;
+        if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
+          OpToFold = 1;
+        } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
+          OpToFold = 2;
+        }
+
+        if (OpToFold) {
+          Constant *C = GetSelectFoldableConstant(FVI);
+          Value *OOp = FVI->getOperand(2-OpToFold);
+          // Avoid creating select between 2 constants unless it's selecting
+          // between 0, 1 and -1.
+          if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
+            Value *NewSel = Builder->CreateSelect(SI.getCondition(), C, OOp);
+            NewSel->takeName(FVI);
+            BinaryOperator *FVI_BO = cast<BinaryOperator>(FVI);
+            BinaryOperator *BO = BinaryOperator::Create(FVI_BO->getOpcode(),
+                                                        TrueVal, NewSel);
+            if (isa<PossiblyExactOperator>(BO))
+              BO->setIsExact(FVI_BO->isExact());
+            if (isa<OverflowingBinaryOperator>(BO)) {
+              BO->setHasNoUnsignedWrap(FVI_BO->hasNoUnsignedWrap());
+              BO->setHasNoSignedWrap(FVI_BO->hasNoSignedWrap());
+            }
+            return BO;
+          }
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
+/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is
+/// replaced with RepOp.
+static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
+                                     const DataLayout *TD,
+                                     const TargetLibraryInfo *TLI) {
+  // Trivial replacement.
+  if (V == Op)
+    return RepOp;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return 0;
+
+  // If this is a binary operator, try to simplify it with the replaced op.
+  if (BinaryOperator *B = dyn_cast<BinaryOperator>(I)) {
+    if (B->getOperand(0) == Op)
+      return SimplifyBinOp(B->getOpcode(), RepOp, B->getOperand(1), TD, TLI);
+    if (B->getOperand(1) == Op)
+      return SimplifyBinOp(B->getOpcode(), B->getOperand(0), RepOp, TD, TLI);
+  }
+
+  // Same for CmpInsts.
+  if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+    if (C->getOperand(0) == Op)
+      return SimplifyCmpInst(C->getPredicate(), RepOp, C->getOperand(1), TD,
+                             TLI);
+    if (C->getOperand(1) == Op)
+      return SimplifyCmpInst(C->getPredicate(), C->getOperand(0), RepOp, TD,
+                             TLI);
+  }
+
+  // TODO: We could hand off more cases to instsimplify here.
+
+  // If all operands are constant after substituting Op for RepOp then we can
+  // constant fold the instruction.
+  if (Constant *CRepOp = dyn_cast<Constant>(RepOp)) {
+    // Build a list of all constant operands.
+    SmallVector<Constant*, 8> ConstOps;
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      if (I->getOperand(i) == Op)
+        ConstOps.push_back(CRepOp);
+      else if (Constant *COp = dyn_cast<Constant>(I->getOperand(i)))
+        ConstOps.push_back(COp);
+      else
+        break;
+    }
+
+    // All operands were constants, fold it.
+    if (ConstOps.size() == I->getNumOperands()) {
+      if (CmpInst *C = dyn_cast<CmpInst>(I))
+        return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0],
+                                               ConstOps[1], TD, TLI);
+
+      if (LoadInst *LI = dyn_cast<LoadInst>(I))
+        if (!LI->isVolatile())
+          return ConstantFoldLoadFromConstPtr(ConstOps[0], TD);
+
+      return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
+                                      ConstOps, TD, TLI);
+    }
+  }
+
+  return 0;
+}
+
+/// visitSelectInstWithICmp - Visit a SelectInst that has an
+/// ICmpInst as its first operand.
+///
+Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
+                                                   ICmpInst *ICI) {
+  bool Changed = false;
+  ICmpInst::Predicate Pred = ICI->getPredicate();
+  Value *CmpLHS = ICI->getOperand(0);
+  Value *CmpRHS = ICI->getOperand(1);
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+
+  // Check cases where the comparison is with a constant that
+  // can be adjusted to fit the min/max idiom. We may move or edit ICI
+  // here, so make sure the select is the only user.
+  if (ICI->hasOneUse())
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS)) {
+      // X < MIN ? T : F  -->  F
+      if ((Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT)
+          && CI->isMinValue(Pred == ICmpInst::ICMP_SLT))
+        return ReplaceInstUsesWith(SI, FalseVal);
+      // X > MAX ? T : F  -->  F
+      else if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT)
+               && CI->isMaxValue(Pred == ICmpInst::ICMP_SGT))
+        return ReplaceInstUsesWith(SI, FalseVal);
+      switch (Pred) {
+      default: break;
+      case ICmpInst::ICMP_ULT:
+      case ICmpInst::ICMP_SLT:
+      case ICmpInst::ICMP_UGT:
+      case ICmpInst::ICMP_SGT: {
+        // These transformations only work for selects over integers.
+        IntegerType *SelectTy = dyn_cast<IntegerType>(SI.getType());
+        if (!SelectTy)
+          break;
+
+        Constant *AdjustedRHS;
+        if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT)
+          AdjustedRHS = ConstantInt::get(CI->getContext(), CI->getValue() + 1);
+        else // (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT)
+          AdjustedRHS = ConstantInt::get(CI->getContext(), CI->getValue() - 1);
+
+        // X > C ? X : C+1  -->  X < C+1 ? C+1 : X
+        // X < C ? X : C-1  -->  X > C-1 ? C-1 : X
+        if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
+            (CmpLHS == FalseVal && AdjustedRHS == TrueVal))
+          ; // Nothing to do here. Values match without any sign/zero extension.
+
+        // Types do not match. Instead of calculating this with mixed types
+        // promote all to the larger type. This enables scalar evolution to
+        // analyze this expression.
+        else if (CmpRHS->getType()->getScalarSizeInBits()
+                 < SelectTy->getBitWidth()) {
+          Constant *sextRHS = ConstantExpr::getSExt(AdjustedRHS, SelectTy);
+
+          // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X
+          // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X
+          // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X
+          // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X
+          if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) &&
+                sextRHS == FalseVal) {
+            CmpLHS = TrueVal;
+            AdjustedRHS = sextRHS;
+          } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) &&
+                     sextRHS == TrueVal) {
+            CmpLHS = FalseVal;
+            AdjustedRHS = sextRHS;
+          } else if (ICI->isUnsigned()) {
+            Constant *zextRHS = ConstantExpr::getZExt(AdjustedRHS, SelectTy);
+            // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X
+            // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X
+            // zext + signed compare cannot be changed:
+            //    0xff <s 0x00, but 0x00ff >s 0x0000
+            if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) &&
+                zextRHS == FalseVal) {
+              CmpLHS = TrueVal;
+              AdjustedRHS = zextRHS;
+            } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) &&
+                       zextRHS == TrueVal) {
+              CmpLHS = FalseVal;
+              AdjustedRHS = zextRHS;
+            } else
+              break;
+          } else
+            break;
+        } else
+          break;
+
+        Pred = ICmpInst::getSwappedPredicate(Pred);
+        CmpRHS = AdjustedRHS;
+        std::swap(FalseVal, TrueVal);
+        ICI->setPredicate(Pred);
+        ICI->setOperand(0, CmpLHS);
+        ICI->setOperand(1, CmpRHS);
+        SI.setOperand(1, TrueVal);
+        SI.setOperand(2, FalseVal);
+
+        // Move ICI instruction right before the select instruction. Otherwise
+        // the sext/zext value may be defined after the ICI instruction uses it.
+        ICI->moveBefore(&SI);
+
+        Changed = true;
+        break;
+      }
+      }
+    }
+
+  // Transform (X >s -1) ? C1 : C2 --> ((X >>s 31) & (C2 - C1)) + C1
+  // and       (X <s  0) ? C2 : C1 --> ((X >>s 31) & (C2 - C1)) + C1
+  // FIXME: Type and constness constraints could be lifted, but we have to
+  //        watch code size carefully. We should consider xor instead of
+  //        sub/add when we decide to do that.
+  if (IntegerType *Ty = dyn_cast<IntegerType>(CmpLHS->getType())) {
+    if (TrueVal->getType() == Ty) {
+      if (ConstantInt *Cmp = dyn_cast<ConstantInt>(CmpRHS)) {
+        ConstantInt *C1 = NULL, *C2 = NULL;
+        if (Pred == ICmpInst::ICMP_SGT && Cmp->isAllOnesValue()) {
+          C1 = dyn_cast<ConstantInt>(TrueVal);
+          C2 = dyn_cast<ConstantInt>(FalseVal);
+        } else if (Pred == ICmpInst::ICMP_SLT && Cmp->isNullValue()) {
+          C1 = dyn_cast<ConstantInt>(FalseVal);
+          C2 = dyn_cast<ConstantInt>(TrueVal);
+        }
+        if (C1 && C2) {
+          // This shift results in either -1 or 0.
+          Value *AShr = Builder->CreateAShr(CmpLHS, Ty->getBitWidth()-1);
+
+          // Check if we can express the operation with a single or.
+          if (C2->isAllOnesValue())
+            return ReplaceInstUsesWith(SI, Builder->CreateOr(AShr, C1));
+
+          Value *And = Builder->CreateAnd(AShr, C2->getValue()-C1->getValue());
+          return ReplaceInstUsesWith(SI, Builder->CreateAdd(And, C1));
+        }
+      }
+    }
+  }
+
+  // If we have an equality comparison then we know the value in one of the
+  // arms of the select. See if substituting this value into the arm and
+  // simplifying the result yields the same value as the other arm.
+  if (Pred == ICmpInst::ICMP_EQ) {
+    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD, TLI) == TrueVal ||
+        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD, TLI) == TrueVal)
+      return ReplaceInstUsesWith(SI, FalseVal);
+    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD, TLI) == FalseVal ||
+        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD, TLI) == FalseVal)
+      return ReplaceInstUsesWith(SI, FalseVal);
+  } else if (Pred == ICmpInst::ICMP_NE) {
+    if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD, TLI) == FalseVal ||
+        SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD, TLI) == FalseVal)
+      return ReplaceInstUsesWith(SI, TrueVal);
+    if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD, TLI) == TrueVal ||
+        SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD, TLI) == TrueVal)
+      return ReplaceInstUsesWith(SI, TrueVal);
+  }
+
+  // NOTE: if we wanted to, this is where to detect integer MIN/MAX
+
+  if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) {
+    if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
+      // Transform (X == C) ? X : Y -> (X == C) ? C : Y
+      SI.setOperand(1, CmpRHS);
+      Changed = true;
+    } else if (CmpLHS == FalseVal && Pred == ICmpInst::ICMP_NE) {
+      // Transform (X != C) ? Y : X -> (X != C) ? Y : C
+      SI.setOperand(2, CmpRHS);
+      Changed = true;
+    }
+  }
+
+  return Changed ? &SI : 0;
+}
+
+
+/// CanSelectOperandBeMappingIntoPredBlock - SI is a select whose condition is a
+/// PHI node (but the two may be in different blocks).  See if the true/false
+/// values (V) are live in all of the predecessor blocks of the PHI.  For
+/// example, cases like this cannot be mapped:
+///
+///   X = phi [ C1, BB1], [C2, BB2]
+///   Y = add
+///   Z = select X, Y, 0
+///
+/// because Y is not live in BB1/BB2.
+///
+static bool CanSelectOperandBeMappingIntoPredBlock(const Value *V,
+                                                   const SelectInst &SI) {
+  // If the value is a non-instruction value like a constant or argument, it
+  // can always be mapped.
+  const Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return true;
+
+  // If V is a PHI node defined in the same block as the condition PHI, we can
+  // map the arguments.
+  const PHINode *CondPHI = cast<PHINode>(SI.getCondition());
+
+  if (const PHINode *VP = dyn_cast<PHINode>(I))
+    if (VP->getParent() == CondPHI->getParent())
+      return true;
+
+  // Otherwise, if the PHI and select are defined in the same block and if V is
+  // defined in a different block, then we can transform it.
+  if (SI.getParent() == CondPHI->getParent() &&
+      I->getParent() != CondPHI->getParent())
+    return true;
+
+  // Otherwise we have a 'hard' case and we can't tell without doing more
+  // detailed dominator based analysis, punt.
+  return false;
+}
+
+/// FoldSPFofSPF - We have an SPF (e.g. a min or max) of an SPF of the form:
+///   SPF2(SPF1(A, B), C)
+Instruction *InstCombiner::FoldSPFofSPF(Instruction *Inner,
+                                        SelectPatternFlavor SPF1,
+                                        Value *A, Value *B,
+                                        Instruction &Outer,
+                                        SelectPatternFlavor SPF2, Value *C) {
+  if (C == A || C == B) {
+    // MAX(MAX(A, B), B) -> MAX(A, B)
+    // MIN(MIN(a, b), a) -> MIN(a, b)
+    if (SPF1 == SPF2)
+      return ReplaceInstUsesWith(Outer, Inner);
+
+    // MAX(MIN(a, b), a) -> a
+    // MIN(MAX(a, b), a) -> a
+    if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) ||
+        (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) ||
+        (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) ||
+        (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN))
+      return ReplaceInstUsesWith(Outer, C);
+  }
+
+  // TODO: MIN(MIN(A, 23), 97)
+  return 0;
+}
+
+
+/// foldSelectICmpAnd - If one of the constants is zero (we know they can't
+/// both be) and we have an icmp instruction with zero, and we have an 'and'
+/// with the non-constant value and a power of two we can turn the select
+/// into a shift on the result of the 'and'.
+static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal,
+                                ConstantInt *FalseVal,
+                                InstCombiner::BuilderTy *Builder) {
+  const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
+  if (!IC || !IC->isEquality())
+    return 0;
+
+  if (!match(IC->getOperand(1), m_Zero()))
+    return 0;
+
+  ConstantInt *AndRHS;
+  Value *LHS = IC->getOperand(0);
+  if (LHS->getType() != SI.getType() ||
+      !match(LHS, m_And(m_Value(), m_ConstantInt(AndRHS))))
+    return 0;
+
+  // If both select arms are non-zero see if we have a select of the form
+  // 'x ? 2^n + C : C'. Then we can offset both arms by C, use the logic
+  // for 'x ? 2^n : 0' and fix the thing up at the end.
+  ConstantInt *Offset = 0;
+  if (!TrueVal->isZero() && !FalseVal->isZero()) {
+    if ((TrueVal->getValue() - FalseVal->getValue()).isPowerOf2())
+      Offset = FalseVal;
+    else if ((FalseVal->getValue() - TrueVal->getValue()).isPowerOf2())
+      Offset = TrueVal;
+    else
+      return 0;
+
+    // Adjust TrueVal and FalseVal to the offset.
+    TrueVal = ConstantInt::get(Builder->getContext(),
+                               TrueVal->getValue() - Offset->getValue());
+    FalseVal = ConstantInt::get(Builder->getContext(),
+                                FalseVal->getValue() - Offset->getValue());
+  }
+
+  // Make sure the mask in the 'and' and one of the select arms is a power of 2.
+  if (!AndRHS->getValue().isPowerOf2() ||
+      (!TrueVal->getValue().isPowerOf2() &&
+       !FalseVal->getValue().isPowerOf2()))
+    return 0;
+
+  // Determine which shift is needed to transform result of the 'and' into the
+  // desired result.
+  ConstantInt *ValC = !TrueVal->isZero() ? TrueVal : FalseVal;
+  unsigned ValZeros = ValC->getValue().logBase2();
+  unsigned AndZeros = AndRHS->getValue().logBase2();
+
+  Value *V = LHS;
+  if (ValZeros > AndZeros)
+    V = Builder->CreateShl(V, ValZeros - AndZeros);
+  else if (ValZeros < AndZeros)
+    V = Builder->CreateLShr(V, AndZeros - ValZeros);
+
+  // Okay, now we know that everything is set up, we just don't know whether we
+  // have a icmp_ne or icmp_eq and whether the true or false val is the zero.
+  bool ShouldNotVal = !TrueVal->isZero();
+  ShouldNotVal ^= IC->getPredicate() == ICmpInst::ICMP_NE;
+  if (ShouldNotVal)
+    V = Builder->CreateXor(V, ValC);
+
+  // Apply an offset if needed.
+  if (Offset)
+    V = Builder->CreateAdd(V, Offset);
+  return V;
+}
+
+Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
+  Value *CondVal = SI.getCondition();
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+
+  if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal, TD))
+    return ReplaceInstUsesWith(SI, V);
+
+  if (SI.getType()->isIntegerTy(1)) {
+    if (ConstantInt *C = dyn_cast<ConstantInt>(TrueVal)) {
+      if (C->getZExtValue()) {
+        // Change: A = select B, true, C --> A = or B, C
+        return BinaryOperator::CreateOr(CondVal, FalseVal);
+      }
+      // Change: A = select B, false, C --> A = and !B, C
+      Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
+      return BinaryOperator::CreateAnd(NotCond, FalseVal);
+    } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
+      if (C->getZExtValue() == false) {
+        // Change: A = select B, C, false --> A = and B, C
+        return BinaryOperator::CreateAnd(CondVal, TrueVal);
+      }
+      // Change: A = select B, C, true --> A = or !B, C
+      Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
+      return BinaryOperator::CreateOr(NotCond, TrueVal);
+    }
+
+    // select a, b, a  -> a&b
+    // select a, a, b  -> a|b
+    if (CondVal == TrueVal)
+      return BinaryOperator::CreateOr(CondVal, FalseVal);
+    else if (CondVal == FalseVal)
+      return BinaryOperator::CreateAnd(CondVal, TrueVal);
+
+    // select a, ~a, b -> (~a)&b
+    // select a, b, ~a -> (~a)|b
+    if (match(TrueVal, m_Not(m_Specific(CondVal))))
+      return BinaryOperator::CreateAnd(TrueVal, FalseVal);
+    else if (match(FalseVal, m_Not(m_Specific(CondVal))))
+      return BinaryOperator::CreateOr(TrueVal, FalseVal);
+  }
+
+  // Selecting between two integer constants?
+  if (ConstantInt *TrueValC = dyn_cast<ConstantInt>(TrueVal))
+    if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal)) {
+      // select C, 1, 0 -> zext C to int
+      if (FalseValC->isZero() && TrueValC->getValue() == 1)
+        return new ZExtInst(CondVal, SI.getType());
+
+      // select C, -1, 0 -> sext C to int
+      if (FalseValC->isZero() && TrueValC->isAllOnesValue())
+        return new SExtInst(CondVal, SI.getType());
+
+      // select C, 0, 1 -> zext !C to int
+      if (TrueValC->isZero() && FalseValC->getValue() == 1) {
+        Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
+        return new ZExtInst(NotCond, SI.getType());
+      }
+
+      // select C, 0, -1 -> sext !C to int
+      if (TrueValC->isZero() && FalseValC->isAllOnesValue()) {
+        Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName());
+        return new SExtInst(NotCond, SI.getType());
+      }
+
+      if (Value *V = foldSelectICmpAnd(SI, TrueValC, FalseValC, Builder))
+        return ReplaceInstUsesWith(SI, V);
+    }
+
+  // See if we are selecting two values based on a comparison of the two values.
+  if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
+    if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) {
+      // Transform (X == Y) ? X : Y  -> Y
+      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+        // This is not safe in general for floating point:
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+              !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+        return ReplaceInstUsesWith(SI, FalseVal);
+      }
+      // Transform (X une Y) ? X : Y  -> X
+      if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
+        // This is not safe in general for floating point:
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+              !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+        return ReplaceInstUsesWith(SI, TrueVal);
+      }
+      // NOTE: if we wanted to, this is where to detect MIN/MAX
+
+    } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
+      // Transform (X == Y) ? Y : X  -> X
+      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
+        // This is not safe in general for floating point:
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+              !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+          return ReplaceInstUsesWith(SI, FalseVal);
+      }
+      // Transform (X une Y) ? Y : X  -> Y
+      if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
+        // This is not safe in general for floating point:
+        // consider X== -0, Y== +0.
+        // It becomes safe if either operand is a nonzero constant.
+        ConstantFP *CFPt, *CFPf;
+        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
+              !CFPt->getValueAPF().isZero()) ||
+            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
+             !CFPf->getValueAPF().isZero()))
+          return ReplaceInstUsesWith(SI, TrueVal);
+      }
+      // NOTE: if we wanted to, this is where to detect MIN/MAX
+    }
+    // NOTE: if we wanted to, this is where to detect ABS
+  }
+
+  // See if we are selecting two values based on a comparison of the two values.
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
+    if (Instruction *Result = visitSelectInstWithICmp(SI, ICI))
+      return Result;
+
+  if (Instruction *TI = dyn_cast<Instruction>(TrueVal))
+    if (Instruction *FI = dyn_cast<Instruction>(FalseVal))
+      if (TI->hasOneUse() && FI->hasOneUse()) {
+        Instruction *AddOp = 0, *SubOp = 0;
+
+        // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
+        if (TI->getOpcode() == FI->getOpcode())
+          if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
+            return IV;
+
+        // Turn select C, (X+Y), (X-Y) --> (X+(select C, Y, (-Y))).  This is
+        // even legal for FP.
+        if ((TI->getOpcode() == Instruction::Sub &&
+             FI->getOpcode() == Instruction::Add) ||
+            (TI->getOpcode() == Instruction::FSub &&
+             FI->getOpcode() == Instruction::FAdd)) {
+          AddOp = FI; SubOp = TI;
+        } else if ((FI->getOpcode() == Instruction::Sub &&
+                    TI->getOpcode() == Instruction::Add) ||
+                   (FI->getOpcode() == Instruction::FSub &&
+                    TI->getOpcode() == Instruction::FAdd)) {
+          AddOp = TI; SubOp = FI;
+        }
+
+        if (AddOp) {
+          Value *OtherAddOp = 0;
+          if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
+            OtherAddOp = AddOp->getOperand(1);
+          } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
+            OtherAddOp = AddOp->getOperand(0);
+          }
+
+          if (OtherAddOp) {
+            // So at this point we know we have (Y -> OtherAddOp):
+            //        select C, (add X, Y), (sub X, Z)
+            Value *NegVal;  // Compute -Z
+            if (SI.getType()->isFPOrFPVectorTy()) {
+              NegVal = Builder->CreateFNeg(SubOp->getOperand(1));
+            } else {
+              NegVal = Builder->CreateNeg(SubOp->getOperand(1));
+            }
+
+            Value *NewTrueOp = OtherAddOp;
+            Value *NewFalseOp = NegVal;
+            if (AddOp != TI)
+              std::swap(NewTrueOp, NewFalseOp);
+            Value *NewSel = 
+              Builder->CreateSelect(CondVal, NewTrueOp,
+                                    NewFalseOp, SI.getName() + ".p");
+
+            if (SI.getType()->isFPOrFPVectorTy())
+              return BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
+            else
+              return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
+          }
+        }
+      }
+
+  // See if we can fold the select into one of our operands.
+  if (SI.getType()->isIntegerTy()) {
+    if (Instruction *FoldI = FoldSelectIntoOp(SI, TrueVal, FalseVal))
+      return FoldI;
+
+    // MAX(MAX(a, b), a) -> MAX(a, b)
+    // MIN(MIN(a, b), a) -> MIN(a, b)
+    // MAX(MIN(a, b), a) -> a
+    // MIN(MAX(a, b), a) -> a
+    Value *LHS, *RHS, *LHS2, *RHS2;
+    if (SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS)) {
+      if (SelectPatternFlavor SPF2 = MatchSelectPattern(LHS, LHS2, RHS2))
+        if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, 
+                                          SI, SPF, RHS))
+          return R;
+      if (SelectPatternFlavor SPF2 = MatchSelectPattern(RHS, LHS2, RHS2))
+        if (Instruction *R = FoldSPFofSPF(cast<Instruction>(RHS),SPF2,LHS2,RHS2,
+                                          SI, SPF, LHS))
+          return R;
+    }
+
+    // TODO.
+    // ABS(-X) -> ABS(X)
+    // ABS(ABS(X)) -> ABS(X)
+  }
+
+  // See if we can fold the select into a phi node if the condition is a select.
+  if (isa<PHINode>(SI.getCondition()))
+    // The true/false values have to be live in the PHI predecessor's blocks.
+    if (CanSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
+        CanSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
+      if (Instruction *NV = FoldOpIntoPhi(SI))
+        return NV;
+
+  if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
+    if (TrueSI->getCondition() == CondVal) {
+      if (SI.getTrueValue() == TrueSI->getTrueValue())
+        return 0;
+      SI.setOperand(1, TrueSI->getTrueValue());
+      return &SI;
+    }
+  }
+  if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
+    if (FalseSI->getCondition() == CondVal) {
+      if (SI.getFalseValue() == FalseSI->getFalseValue())
+        return 0;
+      SI.setOperand(2, FalseSI->getFalseValue());
+      return &SI;
+    }
+  }
+
+  if (BinaryOperator::isNot(CondVal)) {
+    SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
+    SI.setOperand(1, FalseVal);
+    SI.setOperand(2, TrueVal);
+    return &SI;
+  }
+
+  if (VectorType *VecTy = dyn_cast<VectorType>(SI.getType())) {
+    unsigned VWidth = VecTy->getNumElements();
+    APInt UndefElts(VWidth, 0);
+    APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+    if (Value *V = SimplifyDemandedVectorElts(&SI, AllOnesEltMask, UndefElts)) {
+      if (V != &SI)
+        return ReplaceInstUsesWith(SI, V);
+      return &SI;
+    }
+
+    if (ConstantVector *CV = dyn_cast<ConstantVector>(CondVal)) {
+      // Form a shufflevector instruction.
+      SmallVector<Constant *, 8> Mask(VWidth);
+      Type *Int32Ty = Type::getInt32Ty(CV->getContext());
+      for (unsigned i = 0; i != VWidth; ++i) {
+        Constant *Elem = cast<Constant>(CV->getOperand(i));
+        if (ConstantInt *E = dyn_cast<ConstantInt>(Elem))
+          Mask[i] = ConstantInt::get(Int32Ty, i + (E->isZero() ? VWidth : 0));
+        else if (isa<UndefValue>(Elem))
+          Mask[i] = UndefValue::get(Int32Ty);
+        else
+          return 0;
+      }
+      Constant *MaskVal = ConstantVector::get(Mask);
+      Value *V = Builder->CreateShuffleVector(TrueVal, FalseVal, MaskVal);
+      return ReplaceInstUsesWith(SI, V);
+    }
+
+    if (isa<ConstantAggregateZero>(CondVal)) {
+      return ReplaceInstUsesWith(SI, FalseVal);
+    }
+  }
+
+  return 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
new file mode 100644
index 000000000000..8cf76e5e8a9f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -0,0 +1,810 @@
+//===- InstCombineShifts.cpp ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitShl, visitLShr, and visitAShr functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
+  assert(I.getOperand(1)->getType() == I.getOperand(0)->getType());
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // See if we can fold away this shift.
+  if (SimplifyDemandedInstructionBits(I))
+    return &I;
+
+  // Try to fold constant and into select arguments.
+  if (isa<Constant>(Op0))
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
+  if (ConstantInt *CUI = dyn_cast<ConstantInt>(Op1))
+    if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
+      return Res;
+
+  // X shift (A srem B) -> X shift (A and B-1) iff B is a power of 2.
+  // Because shifts by negative values (which could occur if A were negative)
+  // are undefined.
+  Value *A; const APInt *B;
+  if (Op1->hasOneUse() && match(Op1, m_SRem(m_Value(A), m_Power2(B)))) {
+    // FIXME: Should this get moved into SimplifyDemandedBits by saying we don't
+    // demand the sign bit (and many others) here??
+    Value *Rem = Builder->CreateAnd(A, ConstantInt::get(I.getType(), *B-1),
+                                    Op1->getName());
+    I.setOperand(1, Rem);
+    return &I;
+  }
+
+  return 0;
+}
+
+/// CanEvaluateShifted - See if we can compute the specified value, but shifted
+/// logically to the left or right by some number of bits.  This should return
+/// true if the expression can be computed for the same cost as the current
+/// expression tree.  This is used to eliminate extraneous shifting from things
+/// like:
+///      %C = shl i128 %A, 64
+///      %D = shl i128 %B, 96
+///      %E = or i128 %C, %D
+///      %F = lshr i128 %E, 64
+/// where the client will ask if E can be computed shifted right by 64-bits.  If
+/// this succeeds, the GetShiftedValue function will be called to produce the
+/// value.
+static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
+                               InstCombiner &IC) {
+  // We can always evaluate constants shifted.
+  if (isa<Constant>(V))
+    return true;
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // If this is the opposite shift, we can directly reuse the input of the shift
+  // if the needed bits are already zero in the input.  This allows us to reuse
+  // the value which means that we don't care if the shift has multiple uses.
+  //  TODO:  Handle opposite shift by exact value.
+  ConstantInt *CI = 0;
+  if ((isLeftShift && match(I, m_LShr(m_Value(), m_ConstantInt(CI)))) ||
+      (!isLeftShift && match(I, m_Shl(m_Value(), m_ConstantInt(CI))))) {
+    if (CI->getZExtValue() == NumBits) {
+      // TODO: Check that the input bits are already zero with MaskedValueIsZero
+#if 0
+      // If this is a truncate of a logical shr, we can truncate it to a smaller
+      // lshr iff we know that the bits we would otherwise be shifting in are
+      // already zeros.
+      uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+      uint32_t BitWidth = Ty->getScalarSizeInBits();
+      if (MaskedValueIsZero(I->getOperand(0),
+            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
+          CI->getLimitedValue(BitWidth) < BitWidth) {
+        return CanEvaluateTruncated(I->getOperand(0), Ty);
+      }
+#endif
+
+    }
+  }
+
+  // We can't mutate something that has multiple uses: doing so would
+  // require duplicating the instruction in general, which isn't profitable.
+  if (!I->hasOneUse()) return false;
+
+  switch (I->getOpcode()) {
+  default: return false;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // Bitwise operators can all arbitrarily be arbitrarily evaluated shifted.
+    return CanEvaluateShifted(I->getOperand(0), NumBits, isLeftShift, IC) &&
+           CanEvaluateShifted(I->getOperand(1), NumBits, isLeftShift, IC);
+
+  case Instruction::Shl: {
+    // We can often fold the shift into shifts-by-a-constant.
+    CI = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (CI == 0) return false;
+
+    // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
+    if (isLeftShift) return true;
+
+    // We can always turn shl(c)+shr(c) -> and(c2).
+    if (CI->getValue() == NumBits) return true;
+
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+
+    // We can turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but it isn't
+    // profitable unless we know the and'd out bits are already zero.
+    if (CI->getZExtValue() > NumBits) {
+      unsigned LowBits = TypeWidth - CI->getZExtValue();
+      if (MaskedValueIsZero(I->getOperand(0),
+                       APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
+        return true;
+    }
+
+    return false;
+  }
+  case Instruction::LShr: {
+    // We can often fold the shift into shifts-by-a-constant.
+    CI = dyn_cast<ConstantInt>(I->getOperand(1));
+    if (CI == 0) return false;
+
+    // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
+    if (!isLeftShift) return true;
+
+    // We can always turn lshr(c)+shl(c) -> and(c2).
+    if (CI->getValue() == NumBits) return true;
+
+    unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+
+    // We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't
+    // profitable unless we know the and'd out bits are already zero.
+    if (CI->getValue().ult(TypeWidth) && CI->getZExtValue() > NumBits) {
+      unsigned LowBits = CI->getZExtValue() - NumBits;
+      if (MaskedValueIsZero(I->getOperand(0),
+                          APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
+        return true;
+    }
+
+    return false;
+  }
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    return CanEvaluateShifted(SI->getTrueValue(), NumBits, isLeftShift, IC) &&
+           CanEvaluateShifted(SI->getFalseValue(), NumBits, isLeftShift, IC);
+  }
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!CanEvaluateShifted(PN->getIncomingValue(i), NumBits, isLeftShift,IC))
+        return false;
+    return true;
+  }
+  }
+}
+
+/// GetShiftedValue - When CanEvaluateShifted returned true for an expression,
+/// this value inserts the new computation that produces the shifted value.
+static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
+                              InstCombiner &IC) {
+  // We can always evaluate constants shifted.
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    if (isLeftShift)
+      V = IC.Builder->CreateShl(C, NumBits);
+    else
+      V = IC.Builder->CreateLShr(C, NumBits);
+    // If we got a constantexpr back, try to simplify it with TD info.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+      V = ConstantFoldConstantExpression(CE, IC.getDataLayout(),
+                                         IC.getTargetLibraryInfo());
+    return V;
+  }
+
+  Instruction *I = cast<Instruction>(V);
+  IC.Worklist.Add(I);
+
+  switch (I->getOpcode()) {
+  default: llvm_unreachable("Inconsistency with CanEvaluateShifted");
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // Bitwise operators can all arbitrarily be arbitrarily evaluated shifted.
+    I->setOperand(0, GetShiftedValue(I->getOperand(0), NumBits,isLeftShift,IC));
+    I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
+    return I;
+
+  case Instruction::Shl: {
+    BinaryOperator *BO = cast<BinaryOperator>(I);
+    unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
+
+    // We only accept shifts-by-a-constant in CanEvaluateShifted.
+    ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
+
+    // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
+    if (isLeftShift) {
+      // If this is oversized composite shift, then unsigned shifts get 0.
+      unsigned NewShAmt = NumBits+CI->getZExtValue();
+      if (NewShAmt >= TypeWidth)
+        return Constant::getNullValue(I->getType());
+
+      BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
+      BO->setHasNoUnsignedWrap(false);
+      BO->setHasNoSignedWrap(false);
+      return I;
+    }
+
+    // We turn shl(c)+lshr(c) -> and(c2) if the input doesn't already have
+    // zeros.
+    if (CI->getValue() == NumBits) {
+      APInt Mask(APInt::getLowBitsSet(TypeWidth, TypeWidth - NumBits));
+      V = IC.Builder->CreateAnd(BO->getOperand(0),
+                                ConstantInt::get(BO->getContext(), Mask));
+      if (Instruction *VI = dyn_cast<Instruction>(V)) {
+        VI->moveBefore(BO);
+        VI->takeName(BO);
+      }
+      return V;
+    }
+
+    // We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that
+    // the and won't be needed.
+    assert(CI->getZExtValue() > NumBits);
+    BO->setOperand(1, ConstantInt::get(BO->getType(),
+                                       CI->getZExtValue() - NumBits));
+    BO->setHasNoUnsignedWrap(false);
+    BO->setHasNoSignedWrap(false);
+    return BO;
+  }
+  case Instruction::LShr: {
+    BinaryOperator *BO = cast<BinaryOperator>(I);
+    unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
+    // We only accept shifts-by-a-constant in CanEvaluateShifted.
+    ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
+
+    // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
+    if (!isLeftShift) {
+      // If this is oversized composite shift, then unsigned shifts get 0.
+      unsigned NewShAmt = NumBits+CI->getZExtValue();
+      if (NewShAmt >= TypeWidth)
+        return Constant::getNullValue(BO->getType());
+
+      BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
+      BO->setIsExact(false);
+      return I;
+    }
+
+    // We turn lshr(c)+shl(c) -> and(c2) if the input doesn't already have
+    // zeros.
+    if (CI->getValue() == NumBits) {
+      APInt Mask(APInt::getHighBitsSet(TypeWidth, TypeWidth - NumBits));
+      V = IC.Builder->CreateAnd(I->getOperand(0),
+                                ConstantInt::get(BO->getContext(), Mask));
+      if (Instruction *VI = dyn_cast<Instruction>(V)) {
+        VI->moveBefore(I);
+        VI->takeName(I);
+      }
+      return V;
+    }
+
+    // We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know that
+    // the and won't be needed.
+    assert(CI->getZExtValue() > NumBits);
+    BO->setOperand(1, ConstantInt::get(BO->getType(),
+                                       CI->getZExtValue() - NumBits));
+    BO->setIsExact(false);
+    return BO;
+  }
+
+  case Instruction::Select:
+    I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
+    I->setOperand(2, GetShiftedValue(I->getOperand(2), NumBits,isLeftShift,IC));
+    return I;
+  case Instruction::PHI: {
+    // We can change a phi if we can change all operands.  Note that we never
+    // get into trouble with cyclic PHIs here because we only consider
+    // instructions with a single use.
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      PN->setIncomingValue(i, GetShiftedValue(PN->getIncomingValue(i),
+                                              NumBits, isLeftShift, IC));
+    return PN;
+  }
+  }
+}
+
+
+
+Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
+                                               BinaryOperator &I) {
+  bool isLeftShift = I.getOpcode() == Instruction::Shl;
+
+
+  // See if we can propagate this shift into the input, this covers the trivial
+  // cast of lshr(shl(x,c1),c2) as well as other more complex cases.
+  if (I.getOpcode() != Instruction::AShr &&
+      CanEvaluateShifted(Op0, Op1->getZExtValue(), isLeftShift, *this)) {
+    DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
+              " to eliminate shift:\n  IN: " << *Op0 << "\n  SH: " << I <<"\n");
+
+    return ReplaceInstUsesWith(I,
+                 GetShiftedValue(Op0, Op1->getZExtValue(), isLeftShift, *this));
+  }
+
+
+  // See if we can simplify any instructions used by the instruction whose sole
+  // purpose is to compute bits we don't care about.
+  uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
+
+  // shl i32 X, 32 = 0 and srl i8 Y, 9 = 0, ... just don't eliminate
+  // a signed shift.
+  //
+  if (Op1->uge(TypeBits)) {
+    if (I.getOpcode() != Instruction::AShr)
+      return ReplaceInstUsesWith(I, Constant::getNullValue(Op0->getType()));
+    // ashr i32 X, 32 --> ashr i32 X, 31
+    I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1));
+    return &I;
+  }
+
+  // ((X*C1) << C2) == (X * (C1 << C2))
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
+    if (BO->getOpcode() == Instruction::Mul && isLeftShift)
+      if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
+        return BinaryOperator::CreateMul(BO->getOperand(0),
+                                        ConstantExpr::getShl(BOOp, Op1));
+
+  // Try to fold constant and into select arguments.
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+    if (Instruction *R = FoldOpIntoSelect(I, SI))
+      return R;
+  if (isa<PHINode>(Op0))
+    if (Instruction *NV = FoldOpIntoPhi(I))
+      return NV;
+
+  // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
+  if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) {
+    Instruction *TrOp = dyn_cast<Instruction>(TI->getOperand(0));
+    // If 'shift2' is an ashr, we would have to get the sign bit into a funny
+    // place.  Don't try to do this transformation in this case.  Also, we
+    // require that the input operand is a shift-by-constant so that we have
+    // confidence that the shifts will get folded together.  We could do this
+    // xform in more cases, but it is unlikely to be profitable.
+    if (TrOp && I.isLogicalShift() && TrOp->isShift() &&
+        isa<ConstantInt>(TrOp->getOperand(1))) {
+      // Okay, we'll do this xform.  Make the shift of shift.
+      Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType());
+      // (shift2 (shift1 & 0x00FF), c2)
+      Value *NSh = Builder->CreateBinOp(I.getOpcode(), TrOp, ShAmt,I.getName());
+
+      // For logical shifts, the truncation has the effect of making the high
+      // part of the register be zeros.  Emulate this by inserting an AND to
+      // clear the top bits as needed.  This 'and' will usually be zapped by
+      // other xforms later if dead.
+      unsigned SrcSize = TrOp->getType()->getScalarSizeInBits();
+      unsigned DstSize = TI->getType()->getScalarSizeInBits();
+      APInt MaskV(APInt::getLowBitsSet(SrcSize, DstSize));
+
+      // The mask we constructed says what the trunc would do if occurring
+      // between the shifts.  We want to know the effect *after* the second
+      // shift.  We know that it is a logical shift by a constant, so adjust the
+      // mask as appropriate.
+      if (I.getOpcode() == Instruction::Shl)
+        MaskV <<= Op1->getZExtValue();
+      else {
+        assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift");
+        MaskV = MaskV.lshr(Op1->getZExtValue());
+      }
+
+      // shift1 & 0x00FF
+      Value *And = Builder->CreateAnd(NSh,
+                                      ConstantInt::get(I.getContext(), MaskV),
+                                      TI->getName());
+
+      // Return the value truncated to the interesting size.
+      return new TruncInst(And, I.getType());
+    }
+  }
+
+  if (Op0->hasOneUse()) {
+    if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
+      // Turn ((X >> C) + Y) << C  ->  (X + (Y << C)) & (~0 << C)
+      Value *V1, *V2;
+      ConstantInt *CC;
+      switch (Op0BO->getOpcode()) {
+      default: break;
+      case Instruction::Add:
+      case Instruction::And:
+      case Instruction::Or:
+      case Instruction::Xor: {
+        // These operators commute.
+        // Turn (Y + (X >> C)) << C  ->  (X + (Y << C)) & (~0 << C)
+        if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
+            match(Op0BO->getOperand(1), m_Shr(m_Value(V1),
+                  m_Specific(Op1)))) {
+          Value *YS =         // (Y << C)
+            Builder->CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
+          // (X + (Y << C))
+          Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), YS, V1,
+                                          Op0BO->getOperand(1)->getName());
+          uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
+          return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(),
+                     APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
+        }
+
+        // Turn (Y + ((X >> C) & CC)) << C  ->  ((X & (CC << C)) + (Y << C))
+        Value *Op0BOOp1 = Op0BO->getOperand(1);
+        if (isLeftShift && Op0BOOp1->hasOneUse() &&
+            match(Op0BOOp1,
+                  m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
+                        m_ConstantInt(CC)))) {
+          Value *YS =   // (Y << C)
+            Builder->CreateShl(Op0BO->getOperand(0), Op1,
+                                         Op0BO->getName());
+          // X & (CC << C)
+          Value *XM = Builder->CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+                                         V1->getName()+".mask");
+          return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
+        }
+      }
+
+      // FALL THROUGH.
+      case Instruction::Sub: {
+        // Turn ((X >> C) + Y) << C  ->  (X + (Y << C)) & (~0 << C)
+        if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+            match(Op0BO->getOperand(0), m_Shr(m_Value(V1),
+                  m_Specific(Op1)))) {
+          Value *YS =  // (Y << C)
+            Builder->CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
+          // (X + (Y << C))
+          Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), V1, YS,
+                                          Op0BO->getOperand(0)->getName());
+          uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
+          return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(),
+                     APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
+        }
+
+        // Turn (((X >> C)&CC) + Y) << C  ->  (X + (Y << C)) & (CC << C)
+        if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+            match(Op0BO->getOperand(0),
+                  m_And(m_OneUse(m_Shr(m_Value(V1), m_Value(V2))),
+                        m_ConstantInt(CC))) && V2 == Op1) {
+          Value *YS = // (Y << C)
+            Builder->CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
+          // X & (CC << C)
+          Value *XM = Builder->CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+                                         V1->getName()+".mask");
+
+          return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
+        }
+
+        break;
+      }
+      }
+
+
+      // If the operand is an bitwise operator with a constant RHS, and the
+      // shift is the only use, we can pull it out of the shift.
+      if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) {
+        bool isValid = true;     // Valid only for And, Or, Xor
+        bool highBitSet = false; // Transform if high bit of constant set?
+
+        switch (Op0BO->getOpcode()) {
+        default: isValid = false; break;   // Do not perform transform!
+        case Instruction::Add:
+          isValid = isLeftShift;
+          break;
+        case Instruction::Or:
+        case Instruction::Xor:
+          highBitSet = false;
+          break;
+        case Instruction::And:
+          highBitSet = true;
+          break;
+        }
+
+        // If this is a signed shift right, and the high bit is modified
+        // by the logical operation, do not perform the transformation.
+        // The highBitSet boolean indicates the value of the high bit of
+        // the constant which would cause it to be modified for this
+        // operation.
+        //
+        if (isValid && I.getOpcode() == Instruction::AShr)
+          isValid = Op0C->getValue()[TypeBits-1] == highBitSet;
+
+        if (isValid) {
+          Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1);
+
+          Value *NewShift =
+            Builder->CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
+          NewShift->takeName(Op0BO);
+
+          return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
+                                        NewRHS);
+        }
+      }
+    }
+  }
+
+  // Find out if this is a shift of a shift by a constant.
+  BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
+  if (ShiftOp && !ShiftOp->isShift())
+    ShiftOp = 0;
+
+  if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
+
+    // This is a constant shift of a constant shift. Be careful about hiding
+    // shl instructions behind bit masks. They are used to represent multiplies
+    // by a constant, and it is important that simple arithmetic expressions
+    // are still recognizable by scalar evolution.
+    //
+    // The transforms applied to shl are very similar to the transforms applied
+    // to mul by constant. We can be more aggressive about optimizing right
+    // shifts.
+    //
+    // Combinations of right and left shifts will still be optimized in
+    // DAGCombine where scalar evolution no longer applies.
+
+    ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1));
+    uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits);
+    uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits);
+    assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
+    if (ShiftAmt1 == 0) return 0;  // Will be simplified in the future.
+    Value *X = ShiftOp->getOperand(0);
+
+    IntegerType *Ty = cast<IntegerType>(I.getType());
+
+    // Check for (X << c1) << c2  and  (X >> c1) >> c2
+    if (I.getOpcode() == ShiftOp->getOpcode()) {
+      uint32_t AmtSum = ShiftAmt1+ShiftAmt2;   // Fold into one big shift.
+      // If this is oversized composite shift, then unsigned shifts get 0, ashr
+      // saturates.
+      if (AmtSum >= TypeBits) {
+        if (I.getOpcode() != Instruction::AShr)
+          return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+        AmtSum = TypeBits-1;  // Saturate to 31 for i32 ashr.
+      }
+
+      return BinaryOperator::Create(I.getOpcode(), X,
+                                    ConstantInt::get(Ty, AmtSum));
+    }
+
+    if (ShiftAmt1 == ShiftAmt2) {
+      // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
+      if (I.getOpcode() == Instruction::LShr &&
+          ShiftOp->getOpcode() == Instruction::Shl) {
+        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1));
+        return BinaryOperator::CreateAnd(X,
+                                        ConstantInt::get(I.getContext(), Mask));
+      }
+    } else if (ShiftAmt1 < ShiftAmt2) {
+      uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1;
+
+      // (X >>?,exact C1) << C2 --> X << (C2-C1)
+      // The inexact version is deferred to DAGCombine so we don't hide shl
+      // behind a bit mask.
+      if (I.getOpcode() == Instruction::Shl &&
+          ShiftOp->getOpcode() != Instruction::Shl &&
+          ShiftOp->isExact()) {
+        assert(ShiftOp->getOpcode() == Instruction::LShr ||
+               ShiftOp->getOpcode() == Instruction::AShr);
+        ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+        BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl,
+                                                        X, ShiftDiffCst);
+        NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+        NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
+        return NewShl;
+      }
+
+      // (X << C1) >>u C2  --> X >>u (C2-C1) & (-1 >> C2)
+      if (I.getOpcode() == Instruction::LShr &&
+          ShiftOp->getOpcode() == Instruction::Shl) {
+        ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+        // (X <<nuw C1) >>u C2 --> X >>u (C2-C1)
+        if (ShiftOp->hasNoUnsignedWrap()) {
+          BinaryOperator *NewLShr = BinaryOperator::Create(Instruction::LShr,
+                                                           X, ShiftDiffCst);
+          NewLShr->setIsExact(I.isExact());
+          return NewLShr;
+        }
+        Value *Shift = Builder->CreateLShr(X, ShiftDiffCst);
+
+        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::CreateAnd(Shift,
+                                         ConstantInt::get(I.getContext(),Mask));
+      }
+
+      // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
+      // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
+      if (I.getOpcode() == Instruction::AShr &&
+          ShiftOp->getOpcode() == Instruction::Shl) {
+        if (ShiftOp->hasNoSignedWrap()) {
+          // (X <<nsw C1) >>s C2 --> X >>s (C2-C1)
+          ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+          BinaryOperator *NewAShr = BinaryOperator::Create(Instruction::AShr,
+                                                           X, ShiftDiffCst);
+          NewAShr->setIsExact(I.isExact());
+          return NewAShr;
+        }
+      }
+    } else {
+      assert(ShiftAmt2 < ShiftAmt1);
+      uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2;
+
+      // (X >>?exact C1) << C2 --> X >>?exact (C1-C2)
+      // The inexact version is deferred to DAGCombine so we don't hide shl
+      // behind a bit mask.
+      if (I.getOpcode() == Instruction::Shl &&
+          ShiftOp->getOpcode() != Instruction::Shl &&
+          ShiftOp->isExact()) {
+        ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+        BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(),
+                                                        X, ShiftDiffCst);
+        NewShr->setIsExact(true);
+        return NewShr;
+      }
+
+      // (X << C1) >>u C2  --> X << (C1-C2) & (-1 >> C2)
+      if (I.getOpcode() == Instruction::LShr &&
+          ShiftOp->getOpcode() == Instruction::Shl) {
+        ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+        if (ShiftOp->hasNoUnsignedWrap()) {
+          // (X <<nuw C1) >>u C2 --> X <<nuw (C1-C2)
+          BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl,
+                                                          X, ShiftDiffCst);
+          NewShl->setHasNoUnsignedWrap(true);
+          return NewShl;
+        }
+        Value *Shift = Builder->CreateShl(X, ShiftDiffCst);
+
+        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::CreateAnd(Shift,
+                                         ConstantInt::get(I.getContext(),Mask));
+      }
+
+      // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
+      // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
+      if (I.getOpcode() == Instruction::AShr &&
+          ShiftOp->getOpcode() == Instruction::Shl) {
+        if (ShiftOp->hasNoSignedWrap()) {
+          // (X <<nsw C1) >>s C2 --> X <<nsw (C1-C2)
+          ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
+          BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl,
+                                                          X, ShiftDiffCst);
+          NewShl->setHasNoSignedWrap(true);
+          return NewShl;
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitShl(BinaryOperator &I) {
+  if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
+                                 I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+                                 TD))
+    return ReplaceInstUsesWith(I, V);
+
+  if (Instruction *V = commonShiftTransforms(I))
+    return V;
+
+  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) {
+    unsigned ShAmt = Op1C->getZExtValue();
+
+    // If the shifted-out value is known-zero, then this is a NUW shift.
+    if (!I.hasNoUnsignedWrap() &&
+        MaskedValueIsZero(I.getOperand(0),
+                          APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt))) {
+          I.setHasNoUnsignedWrap();
+          return &I;
+        }
+
+    // If the shifted out value is all signbits, this is a NSW shift.
+    if (!I.hasNoSignedWrap() &&
+        ComputeNumSignBits(I.getOperand(0)) > ShAmt) {
+      I.setHasNoSignedWrap();
+      return &I;
+    }
+  }
+
+  // (C1 << A) << C2 -> (C1 << C2) << A
+  Constant *C1, *C2;
+  Value *A;
+  if (match(I.getOperand(0), m_OneUse(m_Shl(m_Constant(C1), m_Value(A)))) &&
+      match(I.getOperand(1), m_Constant(C2)))
+    return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A);
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
+  if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1),
+                                  I.isExact(), TD))
+    return ReplaceInstUsesWith(I, V);
+
+  if (Instruction *R = commonShiftTransforms(I))
+    return R;
+
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+    unsigned ShAmt = Op1C->getZExtValue();
+
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0)) {
+      unsigned BitWidth = Op0->getType()->getScalarSizeInBits();
+      // ctlz.i32(x)>>5  --> zext(x == 0)
+      // cttz.i32(x)>>5  --> zext(x == 0)
+      // ctpop.i32(x)>>5 --> zext(x == -1)
+      if ((II->getIntrinsicID() == Intrinsic::ctlz ||
+           II->getIntrinsicID() == Intrinsic::cttz ||
+           II->getIntrinsicID() == Intrinsic::ctpop) &&
+          isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt) {
+        bool isCtPop = II->getIntrinsicID() == Intrinsic::ctpop;
+        Constant *RHS = ConstantInt::getSigned(Op0->getType(), isCtPop ? -1:0);
+        Value *Cmp = Builder->CreateICmpEQ(II->getArgOperand(0), RHS);
+        return new ZExtInst(Cmp, II->getType());
+      }
+    }
+
+    // If the shifted-out value is known-zero, then this is an exact shift.
+    if (!I.isExact() &&
+        MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
+      I.setIsExact();
+      return &I;
+    }
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
+  if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1),
+                                  I.isExact(), TD))
+    return ReplaceInstUsesWith(I, V);
+
+  if (Instruction *R = commonShiftTransforms(I))
+    return R;
+
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+    unsigned ShAmt = Op1C->getZExtValue();
+
+    // If the input is a SHL by the same constant (ashr (shl X, C), C), then we
+    // have a sign-extend idiom.
+    Value *X;
+    if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1)))) {
+      // If the left shift is just shifting out partial signbits, delete the
+      // extension.
+      if (cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap())
+        return ReplaceInstUsesWith(I, X);
+
+      // If the input is an extension from the shifted amount value, e.g.
+      //   %x = zext i8 %A to i32
+      //   %y = shl i32 %x, 24
+      //   %z = ashr %y, 24
+      // then turn this into "z = sext i8 A to i32".
+      if (ZExtInst *ZI = dyn_cast<ZExtInst>(X)) {
+        uint32_t SrcBits = ZI->getOperand(0)->getType()->getScalarSizeInBits();
+        uint32_t DestBits = ZI->getType()->getScalarSizeInBits();
+        if (Op1C->getZExtValue() == DestBits-SrcBits)
+          return new SExtInst(ZI->getOperand(0), ZI->getType());
+      }
+    }
+
+    // If the shifted-out value is known-zero, then this is an exact shift.
+    if (!I.isExact() &&
+        MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
+      I.setIsExact();
+      return &I;
+    }
+  }
+
+  // See if we can turn a signed shr into an unsigned shr.
+  if (MaskedValueIsZero(Op0,
+                        APInt::getSignBit(I.getType()->getScalarSizeInBits())))
+    return BinaryOperator::CreateLShr(Op0, Op1);
+
+  // Arithmetic shifting an all-sign-bit value is a no-op.
+  unsigned NumSignBits = ComputeNumSignBits(Op0);
+  if (NumSignBits == Op0->getType()->getScalarSizeInBits())
+    return ReplaceInstUsesWith(I, Op0);
+
+  return 0;
+}
+
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
new file mode 100644
index 000000000000..8add1ea618d3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -0,0 +1,1281 @@
+//===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains logic for simplifying instructions based on information
+// about how they are used.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "InstCombine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/PatternMatch.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+/// ShrinkDemandedConstant - Check to see if the specified operand of the
+/// specified instruction is a constant integer.  If so, check to see if there
+/// are any bits set in the constant that are not demanded.  If so, shrink the
+/// constant and return true.
+static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
+                                   APInt Demanded) {
+  assert(I && "No instruction?");
+  assert(OpNo < I->getNumOperands() && "Operand index too large");
+
+  // If the operand is not a constant integer, nothing to do.
+  ConstantInt *OpC = dyn_cast<ConstantInt>(I->getOperand(OpNo));
+  if (!OpC) return false;
+
+  // If there are no bits set that aren't demanded, nothing to do.
+  Demanded = Demanded.zextOrTrunc(OpC->getValue().getBitWidth());
+  if ((~Demanded & OpC->getValue()) == 0)
+    return false;
+
+  // This instruction is producing bits that are not demanded. Shrink the RHS.
+  Demanded &= OpC->getValue();
+  I->setOperand(OpNo, ConstantInt::get(OpC->getType(), Demanded));
+  return true;
+}
+
+
+
+/// SimplifyDemandedInstructionBits - Inst is an integer instruction that
+/// SimplifyDemandedBits knows about.  See if the instruction has any
+/// properties that allow us to simplify its operands.
+bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
+  unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
+  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+  APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
+
+  Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask,
+                                     KnownZero, KnownOne, 0);
+  if (V == 0) return false;
+  if (V == &Inst) return true;
+  ReplaceInstUsesWith(Inst, V);
+  return true;
+}
+
+/// SimplifyDemandedBits - This form of SimplifyDemandedBits simplifies the
+/// specified instruction operand if possible, updating it in place.  It returns
+/// true if it made any change and false otherwise.
+bool InstCombiner::SimplifyDemandedBits(Use &U, APInt DemandedMask,
+                                        APInt &KnownZero, APInt &KnownOne,
+                                        unsigned Depth) {
+  Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask,
+                                          KnownZero, KnownOne, Depth);
+  if (NewVal == 0) return false;
+  U = NewVal;
+  return true;
+}
+
+
+/// SimplifyDemandedUseBits - This function attempts to replace V with a simpler
+/// value based on the demanded bits.  When this function is called, it is known
+/// that only the bits set in DemandedMask of the result of V are ever used
+/// downstream. Consequently, depending on the mask and V, it may be possible
+/// to replace V with a constant or one of its operands. In such cases, this
+/// function does the replacement and returns true. In all other cases, it
+/// returns false after analyzing the expression and setting KnownOne and known
+/// to be one in the expression.  KnownZero contains all the bits that are known
+/// to be zero in the expression. These are provided to potentially allow the
+/// caller (which might recursively be SimplifyDemandedBits itself) to simplify
+/// the expression. KnownOne and KnownZero always follow the invariant that
+/// KnownOne & KnownZero == 0. That is, a bit can't be both 1 and 0. Note that
+/// the bits in KnownOne and KnownZero may only be accurate for those bits set
+/// in DemandedMask. Note also that the bitwidth of V, DemandedMask, KnownZero
+/// and KnownOne must all be the same.
+///
+/// This returns null if it did not change anything and it permits no
+/// simplification.  This returns V itself if it did some simplification of V's
+/// operands based on the information about what bits are demanded. This returns
+/// some other non-null value if it found out that V is equal to another value
+/// in the context where the specified bits are demanded, but not for all users.
+Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
+                                             APInt &KnownZero, APInt &KnownOne,
+                                             unsigned Depth) {
+  assert(V != 0 && "Null pointer of Value???");
+  assert(Depth <= 6 && "Limit Search Depth");
+  uint32_t BitWidth = DemandedMask.getBitWidth();
+  Type *VTy = V->getType();
+  assert((TD || !VTy->isPointerTy()) &&
+         "SimplifyDemandedBits needs to know bit widths!");
+  assert((!TD || TD->getTypeSizeInBits(VTy->getScalarType()) == BitWidth) &&
+         (!VTy->isIntOrIntVectorTy() ||
+          VTy->getScalarSizeInBits() == BitWidth) &&
+         KnownZero.getBitWidth() == BitWidth &&
+         KnownOne.getBitWidth() == BitWidth &&
+         "Value *V, DemandedMask, KnownZero and KnownOne "
+         "must have same BitWidth");
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    // We know all of the bits for a constant!
+    KnownOne = CI->getValue() & DemandedMask;
+    KnownZero = ~KnownOne & DemandedMask;
+    return 0;
+  }
+  if (isa<ConstantPointerNull>(V)) {
+    // We know all of the bits for a constant!
+    KnownOne.clearAllBits();
+    KnownZero = DemandedMask;
+    return 0;
+  }
+
+  KnownZero.clearAllBits();
+  KnownOne.clearAllBits();
+  if (DemandedMask == 0) {   // Not demanding any bits from V.
+    if (isa<UndefValue>(V))
+      return 0;
+    return UndefValue::get(VTy);
+  }
+
+  if (Depth == 6)        // Limit search depth.
+    return 0;
+
+  APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+  APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) {
+    ComputeMaskedBits(V, KnownZero, KnownOne, Depth);
+    return 0;        // Only analyze instructions.
+  }
+
+  // If there are multiple uses of this value and we aren't at the root, then
+  // we can't do any simplifications of the operands, because DemandedMask
+  // only reflects the bits demanded by *one* of the users.
+  if (Depth != 0 && !I->hasOneUse()) {
+    // Despite the fact that we can't simplify this instruction in all User's
+    // context, we can at least compute the knownzero/knownone bits, and we can
+    // do simplifications that apply to *just* the one user if we know that
+    // this instruction has a simpler value in that context.
+    if (I->getOpcode() == Instruction::And) {
+      // If either the LHS or the RHS are Zero, the result is zero.
+      ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
+      ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+
+      // If all of the demanded bits are known 1 on one side, return the other.
+      // These bits cannot contribute to the result of the 'and' in this
+      // context.
+      if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
+          (DemandedMask & ~LHSKnownZero))
+        return I->getOperand(0);
+      if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
+          (DemandedMask & ~RHSKnownZero))
+        return I->getOperand(1);
+
+      // If all of the demanded bits in the inputs are known zeros, return zero.
+      if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
+        return Constant::getNullValue(VTy);
+
+    } else if (I->getOpcode() == Instruction::Or) {
+      // We can simplify (X|Y) -> X or Y in the user's context if we know that
+      // only bits from X or Y are demanded.
+
+      // If either the LHS or the RHS are One, the result is One.
+      ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
+      ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+
+      // If all of the demanded bits are known zero on one side, return the
+      // other.  These bits cannot contribute to the result of the 'or' in this
+      // context.
+      if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
+          (DemandedMask & ~LHSKnownOne))
+        return I->getOperand(0);
+      if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
+          (DemandedMask & ~RHSKnownOne))
+        return I->getOperand(1);
+
+      // If all of the potentially set bits on one side are known to be set on
+      // the other side, just use the 'other' side.
+      if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
+          (DemandedMask & (~RHSKnownZero)))
+        return I->getOperand(0);
+      if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
+          (DemandedMask & (~LHSKnownZero)))
+        return I->getOperand(1);
+    } else if (I->getOpcode() == Instruction::Xor) {
+      // We can simplify (X^Y) -> X or Y in the user's context if we know that
+      // only bits from X or Y are demanded.
+
+      ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
+      ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+
+      // If all of the demanded bits are known zero on one side, return the
+      // other.
+      if ((DemandedMask & RHSKnownZero) == DemandedMask)
+        return I->getOperand(0);
+      if ((DemandedMask & LHSKnownZero) == DemandedMask)
+        return I->getOperand(1);
+    }
+
+    // Compute the KnownZero/KnownOne bits to simplify things downstream.
+    ComputeMaskedBits(I, KnownZero, KnownOne, Depth);
+    return 0;
+  }
+
+  // If this is the root being simplified, allow it to have multiple uses,
+  // just set the DemandedMask to all bits so that we can try to simplify the
+  // operands.  This allows visitTruncInst (for example) to simplify the
+  // operand of a trunc without duplicating all the logic below.
+  if (Depth == 0 && !V->hasOneUse())
+    DemandedMask = APInt::getAllOnesValue(BitWidth);
+
+  switch (I->getOpcode()) {
+  default:
+    ComputeMaskedBits(I, KnownZero, KnownOne, Depth);
+    break;
+  case Instruction::And:
+    // If either the LHS or the RHS are Zero, the result is zero.
+    if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero,
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return I;
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
+    // If all of the demanded bits are known 1 on one side, return the other.
+    // These bits cannot contribute to the result of the 'and'.
+    if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
+        (DemandedMask & ~LHSKnownZero))
+      return I->getOperand(0);
+    if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
+        (DemandedMask & ~RHSKnownZero))
+      return I->getOperand(1);
+
+    // If all of the demanded bits in the inputs are known zeros, return zero.
+    if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
+      return Constant::getNullValue(VTy);
+
+    // If the RHS is a constant, see if we can simplify it.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero))
+      return I;
+
+    // Output known-1 bits are only known if set in both the LHS & RHS.
+    KnownOne = RHSKnownOne & LHSKnownOne;
+    // Output known-0 are known to be clear if zero in either the LHS | RHS.
+    KnownZero = RHSKnownZero | LHSKnownZero;
+    break;
+  case Instruction::Or:
+    // If either the LHS or the RHS are One, the result is One.
+    if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne,
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return I;
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'or'.
+    if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
+        (DemandedMask & ~LHSKnownOne))
+      return I->getOperand(0);
+    if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
+        (DemandedMask & ~RHSKnownOne))
+      return I->getOperand(1);
+
+    // If all of the potentially set bits on one side are known to be set on
+    // the other side, just use the 'other' side.
+    if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
+        (DemandedMask & (~RHSKnownZero)))
+      return I->getOperand(0);
+    if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
+        (DemandedMask & (~LHSKnownZero)))
+      return I->getOperand(1);
+
+    // If the RHS is a constant, see if we can simplify it.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask))
+      return I;
+
+    // Output known-0 bits are only known if clear in both the LHS & RHS.
+    KnownZero = RHSKnownZero & LHSKnownZero;
+    // Output known-1 are known to be set if set in either the LHS | RHS.
+    KnownOne = RHSKnownOne | LHSKnownOne;
+    break;
+  case Instruction::Xor: {
+    if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return I;
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'xor'.
+    if ((DemandedMask & RHSKnownZero) == DemandedMask)
+      return I->getOperand(0);
+    if ((DemandedMask & LHSKnownZero) == DemandedMask)
+      return I->getOperand(1);
+
+    // If all of the demanded bits are known to be zero on one side or the
+    // other, turn this into an *inclusive* or.
+    //    e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+    if ((DemandedMask & ~RHSKnownZero & ~LHSKnownZero) == 0) {
+      Instruction *Or =
+        BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
+                                 I->getName());
+      return InsertNewInstWith(Or, *I);
+    }
+
+    // If all of the demanded bits on one side are known, and all of the set
+    // bits on that side are also known to be set on the other side, turn this
+    // into an AND, as we know the bits will be cleared.
+    //    e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+    if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) {
+      // all known
+      if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) {
+        Constant *AndC = Constant::getIntegerValue(VTy,
+                                                   ~RHSKnownOne & DemandedMask);
+        Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
+        return InsertNewInstWith(And, *I);
+      }
+    }
+
+    // If the RHS is a constant, see if we can simplify it.
+    // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask))
+      return I;
+
+    // If our LHS is an 'and' and if it has one use, and if any of the bits we
+    // are flipping are known to be set, then the xor is just resetting those
+    // bits to zero.  We can just knock out bits from the 'and' and the 'xor',
+    // simplifying both of them.
+    if (Instruction *LHSInst = dyn_cast<Instruction>(I->getOperand(0)))
+      if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() &&
+          isa<ConstantInt>(I->getOperand(1)) &&
+          isa<ConstantInt>(LHSInst->getOperand(1)) &&
+          (LHSKnownOne & RHSKnownOne & DemandedMask) != 0) {
+        ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1));
+        ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1));
+        APInt NewMask = ~(LHSKnownOne & RHSKnownOne & DemandedMask);
+
+        Constant *AndC =
+          ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
+        Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
+        InsertNewInstWith(NewAnd, *I);
+
+        Constant *XorC =
+          ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
+        Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
+        return InsertNewInstWith(NewXor, *I);
+      }
+
+    // Output known-0 bits are known if clear or set in both the LHS & RHS.
+    KnownZero= (RHSKnownZero & LHSKnownZero) | (RHSKnownOne & LHSKnownOne);
+    // Output known-1 are known to be set if set in only one of the LHS, RHS.
+    KnownOne = (RHSKnownZero & LHSKnownOne) | (RHSKnownOne & LHSKnownZero);
+    break;
+  }
+  case Instruction::Select:
+    if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(1), DemandedMask,
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return I;
+    assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
+    assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+
+    // If the operands are constants, see if we can simplify them.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask) ||
+        ShrinkDemandedConstant(I, 2, DemandedMask))
+      return I;
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne = RHSKnownOne & LHSKnownOne;
+    KnownZero = RHSKnownZero & LHSKnownZero;
+    break;
+  case Instruction::Trunc: {
+    unsigned truncBf = I->getOperand(0)->getType()->getScalarSizeInBits();
+    DemandedMask = DemandedMask.zext(truncBf);
+    KnownZero = KnownZero.zext(truncBf);
+    KnownOne = KnownOne.zext(truncBf);
+    if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+                             KnownZero, KnownOne, Depth+1))
+      return I;
+    DemandedMask = DemandedMask.trunc(BitWidth);
+    KnownZero = KnownZero.trunc(BitWidth);
+    KnownOne = KnownOne.trunc(BitWidth);
+    assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+    break;
+  }
+  case Instruction::BitCast:
+    if (!I->getOperand(0)->getType()->isIntOrIntVectorTy())
+      return 0;  // vector->int or fp->int?
+
+    if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
+      if (VectorType *SrcVTy =
+            dyn_cast<VectorType>(I->getOperand(0)->getType())) {
+        if (DstVTy->getNumElements() != SrcVTy->getNumElements())
+          // Don't touch a bitcast between vectors of different element counts.
+          return 0;
+      } else
+        // Don't touch a scalar-to-vector bitcast.
+        return 0;
+    } else if (I->getOperand(0)->getType()->isVectorTy())
+      // Don't touch a vector-to-scalar bitcast.
+      return 0;
+
+    if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+                             KnownZero, KnownOne, Depth+1))
+      return I;
+    assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+    break;
+  case Instruction::ZExt: {
+    // Compute the bits in the result that are not present in the input.
+    unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits();
+
+    DemandedMask = DemandedMask.trunc(SrcBitWidth);
+    KnownZero = KnownZero.trunc(SrcBitWidth);
+    KnownOne = KnownOne.trunc(SrcBitWidth);
+    if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask,
+                             KnownZero, KnownOne, Depth+1))
+      return I;
+    DemandedMask = DemandedMask.zext(BitWidth);
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+    assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+    // The top bits are known to be zero.
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+    break;
+  }
+  case Instruction::SExt: {
+    // Compute the bits in the result that are not present in the input.
+    unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits();
+
+    APInt InputDemandedBits = DemandedMask &
+                              APInt::getLowBitsSet(BitWidth, SrcBitWidth);
+
+    APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth));
+    // If any of the sign extended bits are demanded, we know that the sign
+    // bit is demanded.
+    if ((NewBits & DemandedMask) != 0)
+      InputDemandedBits.setBit(SrcBitWidth-1);
+
+    InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth);
+    KnownZero = KnownZero.trunc(SrcBitWidth);
+    KnownOne = KnownOne.trunc(SrcBitWidth);
+    if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits,
+                             KnownZero, KnownOne, Depth+1))
+      return I;
+    InputDemandedBits = InputDemandedBits.zext(BitWidth);
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+    assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+
+    // If the sign bit of the input is known set or clear, then we know the
+    // top bits of the result.
+
+    // If the input sign bit is known zero, or if the NewBits are not demanded
+    // convert this into a zero extension.
+    if (KnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) {
+      // Convert to ZExt cast
+      CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
+      return InsertNewInstWith(NewCast, *I);
+    } else if (KnownOne[SrcBitWidth-1]) {    // Input sign bit known set
+      KnownOne |= NewBits;
+    }
+    break;
+  }
+  case Instruction::Add: {
+    // Figure out what the input bits are.  If the top bits of the and result
+    // are not demanded, then the add doesn't demand them from its input
+    // either.
+    unsigned NLZ = DemandedMask.countLeadingZeros();
+
+    // If there is a constant on the RHS, there are a variety of xformations
+    // we can do.
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      // If null, this should be simplified elsewhere.  Some of the xforms here
+      // won't work if the RHS is zero.
+      if (RHS->isZero())
+        break;
+
+      // If the top bit of the output is demanded, demand everything from the
+      // input.  Otherwise, we demand all the input bits except NLZ top bits.
+      APInt InDemandedBits(APInt::getLowBitsSet(BitWidth, BitWidth - NLZ));
+
+      // Find information about known zero/one bits in the input.
+      if (SimplifyDemandedBits(I->getOperandUse(0), InDemandedBits,
+                               LHSKnownZero, LHSKnownOne, Depth+1))
+        return I;
+
+      // If the RHS of the add has bits set that can't affect the input, reduce
+      // the constant.
+      if (ShrinkDemandedConstant(I, 1, InDemandedBits))
+        return I;
+
+      // Avoid excess work.
+      if (LHSKnownZero == 0 && LHSKnownOne == 0)
+        break;
+
+      // Turn it into OR if input bits are zero.
+      if ((LHSKnownZero & RHS->getValue()) == RHS->getValue()) {
+        Instruction *Or =
+          BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
+                                   I->getName());
+        return InsertNewInstWith(Or, *I);
+      }
+
+      // We can say something about the output known-zero and known-one bits,
+      // depending on potential carries from the input constant and the
+      // unknowns.  For example if the LHS is known to have at most the 0x0F0F0
+      // bits set and the RHS constant is 0x01001, then we know we have a known
+      // one mask of 0x00001 and a known zero mask of 0xE0F0E.
+
+      // To compute this, we first compute the potential carry bits.  These are
+      // the bits which may be modified.  I'm not aware of a better way to do
+      // this scan.
+      const APInt &RHSVal = RHS->getValue();
+      APInt CarryBits((~LHSKnownZero + RHSVal) ^ (~LHSKnownZero ^ RHSVal));
+
+      // Now that we know which bits have carries, compute the known-1/0 sets.
+
+      // Bits are known one if they are known zero in one operand and one in the
+      // other, and there is no input carry.
+      KnownOne = ((LHSKnownZero & RHSVal) |
+                  (LHSKnownOne & ~RHSVal)) & ~CarryBits;
+
+      // Bits are known zero if they are known zero in both operands and there
+      // is no input carry.
+      KnownZero = LHSKnownZero & ~RHSVal & ~CarryBits;
+    } else {
+      // If the high-bits of this ADD are not demanded, then it does not demand
+      // the high bits of its LHS or RHS.
+      if (DemandedMask[BitWidth-1] == 0) {
+        // Right fill the mask of bits for this ADD to demand the most
+        // significant bit and all those below it.
+        APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+        if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
+                                 LHSKnownZero, LHSKnownOne, Depth+1) ||
+            SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
+                                 LHSKnownZero, LHSKnownOne, Depth+1))
+          return I;
+      }
+    }
+    break;
+  }
+  case Instruction::Sub:
+    // If the high-bits of this SUB are not demanded, then it does not demand
+    // the high bits of its LHS or RHS.
+    if (DemandedMask[BitWidth-1] == 0) {
+      // Right fill the mask of bits for this SUB to demand the most
+      // significant bit and all those below it.
+      uint32_t NLZ = DemandedMask.countLeadingZeros();
+      APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
+                               LHSKnownZero, LHSKnownOne, Depth+1) ||
+          SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
+                               LHSKnownZero, LHSKnownOne, Depth+1))
+        return I;
+    }
+
+    // Otherwise just hand the sub off to ComputeMaskedBits to fill in
+    // the known zeros and ones.
+    ComputeMaskedBits(V, KnownZero, KnownOne, Depth);
+
+    // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
+    // zero.
+    if (ConstantInt *C0 = dyn_cast<ConstantInt>(I->getOperand(0))) {
+      APInt I0 = C0->getValue();
+      if ((I0 + 1).isPowerOf2() && (I0 | KnownZero).isAllOnesValue()) {
+        Instruction *Xor = BinaryOperator::CreateXor(I->getOperand(1), C0);
+        return InsertNewInstWith(Xor, *I);
+      }
+    }
+    break;
+  case Instruction::Shl:
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      {
+        Value *VarX; ConstantInt *C1;
+        if (match(I->getOperand(0), m_Shr(m_Value(VarX), m_ConstantInt(C1)))) {
+          Instruction *Shr = cast<Instruction>(I->getOperand(0));
+          Value *R = SimplifyShrShlDemandedBits(Shr, I, DemandedMask,
+                                                KnownZero, KnownOne);
+          if (R)
+            return R;
+        }
+      }
+
+      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+      APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
+
+      // If the shift is NUW/NSW, then it does demand the high bits.
+      ShlOperator *IOp = cast<ShlOperator>(I);
+      if (IOp->hasNoSignedWrap())
+        DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
+      else if (IOp->hasNoUnsignedWrap())
+        DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+
+      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
+                               KnownZero, KnownOne, Depth+1))
+        return I;
+      assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+      KnownZero <<= ShiftAmt;
+      KnownOne  <<= ShiftAmt;
+      // low bits known zero.
+      if (ShiftAmt)
+        KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+    }
+    break;
+  case Instruction::LShr:
+    // For a logical shift right
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+
+      // Unsigned shift right.
+      APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+
+      // If the shift is exact, then it does demand the low bits (and knows that
+      // they are zero).
+      if (cast<LShrOperator>(I)->isExact())
+        DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+
+      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
+                               KnownZero, KnownOne, Depth+1))
+        return I;
+      assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
+      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
+      if (ShiftAmt) {
+        // Compute the new bits that are at the top now.
+        APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+        KnownZero |= HighBits;  // high bits known zero.
+      }
+    }
+    break;
+  case Instruction::AShr:
+    // If this is an arithmetic shift right and only the low-bit is set, we can
+    // always convert this into a logical shr, even if the shift amount is
+    // variable.  The low bit of the shift cannot be an input sign bit unless
+    // the shift amount is >= the size of the datatype, which is undefined.
+    if (DemandedMask == 1) {
+      // Perform the logical shift right.
+      Instruction *NewVal = BinaryOperator::CreateLShr(
+                        I->getOperand(0), I->getOperand(1), I->getName());
+      return InsertNewInstWith(NewVal, *I);
+    }
+
+    // If the sign bit is the only bit demanded by this ashr, then there is no
+    // need to do it, the shift doesn't change the high bit.
+    if (DemandedMask.isSignBit())
+      return I->getOperand(0);
+
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+
+      // Signed shift right.
+      APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+      // If any of the "high bits" are demanded, we should set the sign bit as
+      // demanded.
+      if (DemandedMask.countLeadingZeros() <= ShiftAmt)
+        DemandedMaskIn.setBit(BitWidth-1);
+
+      // If the shift is exact, then it does demand the low bits (and knows that
+      // they are zero).
+      if (cast<AShrOperator>(I)->isExact())
+        DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+
+      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn,
+                               KnownZero, KnownOne, Depth+1))
+        return I;
+      assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+      // Compute the new bits that are at the top now.
+      APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
+      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
+
+      // Handle the sign bits.
+      APInt SignBit(APInt::getSignBit(BitWidth));
+      // Adjust to where it is now in the mask.
+      SignBit = APIntOps::lshr(SignBit, ShiftAmt);
+
+      // If the input sign bit is known to be zero, or if none of the top bits
+      // are demanded, turn this into an unsigned shift right.
+      if (BitWidth <= ShiftAmt || KnownZero[BitWidth-ShiftAmt-1] ||
+          (HighBits & ~DemandedMask) == HighBits) {
+        // Perform the logical shift right.
+        BinaryOperator *NewVal = BinaryOperator::CreateLShr(I->getOperand(0),
+                                                            SA, I->getName());
+        NewVal->setIsExact(cast<BinaryOperator>(I)->isExact());
+        return InsertNewInstWith(NewVal, *I);
+      } else if ((KnownOne & SignBit) != 0) { // New bits are known one.
+        KnownOne |= HighBits;
+      }
+    }
+    break;
+  case Instruction::SRem:
+    if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      // X % -1 demands all the bits because we don't want to introduce
+      // INT_MIN % -1 (== undef) by accident.
+      if (Rem->isAllOnesValue())
+        break;
+      APInt RA = Rem->getValue().abs();
+      if (RA.isPowerOf2()) {
+        if (DemandedMask.ult(RA))    // srem won't affect demanded bits
+          return I->getOperand(0);
+
+        APInt LowBits = RA - 1;
+        APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
+        if (SimplifyDemandedBits(I->getOperandUse(0), Mask2,
+                                 LHSKnownZero, LHSKnownOne, Depth+1))
+          return I;
+
+        // The low bits of LHS are unchanged by the srem.
+        KnownZero = LHSKnownZero & LowBits;
+        KnownOne = LHSKnownOne & LowBits;
+
+        // If LHS is non-negative or has all low bits zero, then the upper bits
+        // are all zero.
+        if (LHSKnownZero[BitWidth-1] || ((LHSKnownZero & LowBits) == LowBits))
+          KnownZero |= ~LowBits;
+
+        // If LHS is negative and not all low bits are zero, then the upper bits
+        // are all one.
+        if (LHSKnownOne[BitWidth-1] && ((LHSKnownOne & LowBits) != 0))
+          KnownOne |= ~LowBits;
+
+        assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+      }
+    }
+
+    // The sign bit is the LHS's sign bit, except when the result of the
+    // remainder is zero.
+    if (DemandedMask.isNegative() && KnownZero.isNonNegative()) {
+      APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+      ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+      // If it's known zero, our sign bit is also zero.
+      if (LHSKnownZero.isNegative())
+        KnownZero |= LHSKnownZero;
+    }
+    break;
+  case Instruction::URem: {
+    APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
+    APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+    if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes,
+                             KnownZero2, KnownOne2, Depth+1) ||
+        SimplifyDemandedBits(I->getOperandUse(1), AllOnes,
+                             KnownZero2, KnownOne2, Depth+1))
+      return I;
+
+    unsigned Leaders = KnownZero2.countLeadingOnes();
+    Leaders = std::max(Leaders,
+                       KnownZero2.countLeadingOnes());
+    KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
+    break;
+  }
+  case Instruction::Call:
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::bswap: {
+        // If the only bits demanded come from one byte of the bswap result,
+        // just shift the input byte into position to eliminate the bswap.
+        unsigned NLZ = DemandedMask.countLeadingZeros();
+        unsigned NTZ = DemandedMask.countTrailingZeros();
+
+        // Round NTZ down to the next byte.  If we have 11 trailing zeros, then
+        // we need all the bits down to bit 8.  Likewise, round NLZ.  If we
+        // have 14 leading zeros, round to 8.
+        NLZ &= ~7;
+        NTZ &= ~7;
+        // If we need exactly one byte, we can do this transformation.
+        if (BitWidth-NLZ-NTZ == 8) {
+          unsigned ResultBit = NTZ;
+          unsigned InputBit = BitWidth-NTZ-8;
+
+          // Replace this with either a left or right shift to get the byte into
+          // the right place.
+          Instruction *NewVal;
+          if (InputBit > ResultBit)
+            NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
+                    ConstantInt::get(I->getType(), InputBit-ResultBit));
+          else
+            NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
+                    ConstantInt::get(I->getType(), ResultBit-InputBit));
+          NewVal->takeName(I);
+          return InsertNewInstWith(NewVal, *I);
+        }
+
+        // TODO: Could compute known zero/one bits based on the input.
+        break;
+      }
+      case Intrinsic::x86_sse42_crc32_64_8:
+      case Intrinsic::x86_sse42_crc32_64_64:
+        KnownZero = APInt::getHighBitsSet(64, 32);
+        return 0;
+      }
+    }
+    ComputeMaskedBits(V, KnownZero, KnownOne, Depth);
+    break;
+  }
+
+  // If the client is only demanding bits that we know, return the known
+  // constant.
+  if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask)
+    return Constant::getIntegerValue(VTy, KnownOne);
+  return 0;
+}
+
+/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
+/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
+/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
+/// of "C2-C1".
+///
+/// Suppose E1 and E2 are generally different in bits S={bm, bm+1,
+/// ..., bn}, without considering the specific value X is holding.
+/// This transformation is legal iff one of following conditions is hold:
+///  1) All the bit in S are 0, in this case E1 == E2.
+///  2) We don't care those bits in S, per the input DemandedMask.
+///  3) Combination of 1) and 2). Some bits in S are 0, and we don't care the
+///     rest bits.
+///
+/// Currently we only test condition 2).
+///
+/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
+/// not successful.
+Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
+  Instruction *Shl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne) {
+
+  unsigned ShlAmt = cast<ConstantInt>(Shl->getOperand(1))->getZExtValue();
+  unsigned ShrAmt = cast<ConstantInt>(Shr->getOperand(1))->getZExtValue();
+
+  KnownOne.clearAllBits();
+  KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
+  KnownZero &= DemandedMask;
+
+  if (ShlAmt == 0 || ShrAmt == 0)
+    return 0;
+
+  Value *VarX = Shr->getOperand(0);
+  Type *Ty = VarX->getType();
+
+  APInt BitMask1(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
+  APInt BitMask2(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
+
+  bool isLshr = (Shr->getOpcode() == Instruction::LShr);
+  BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
+                      (BitMask1.ashr(ShrAmt) << ShlAmt);
+
+  if (ShrAmt <= ShlAmt) {
+    BitMask2 <<= (ShlAmt - ShrAmt);
+  } else {
+    BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
+                        BitMask2.ashr(ShrAmt - ShlAmt);
+  }
+
+  // Check if condition-2 (see the comment to this function) is satified.
+  if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
+    if (ShrAmt == ShlAmt)
+      return VarX;
+
+    if (!Shr->hasOneUse())
+      return 0;
+
+    BinaryOperator *New;
+    if (ShrAmt < ShlAmt) {
+      Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
+      New = BinaryOperator::CreateShl(VarX, Amt);
+      BinaryOperator *Orig = cast<BinaryOperator>(Shl);
+      New->setHasNoSignedWrap(Orig->hasNoSignedWrap());
+      New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap());
+    } else {
+      Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
+      New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
+                     BinaryOperator::CreateAShr(VarX, Amt);
+      if (cast<BinaryOperator>(Shr)->isExact())
+        New->setIsExact(true);
+    }
+
+    return InsertNewInstWith(New, *Shl);
+  }
+
+  return 0;
+}
+
+/// SimplifyDemandedVectorElts - The specified value produces a vector with
+/// any number of elements. DemandedElts contains the set of elements that are
+/// actually used by the caller.  This method analyzes which elements of the
+/// operand are undef and returns that information in UndefElts.
+///
+/// If the information about demanded elements can be used to simplify the
+/// operation, the operation is simplified, then the resultant value is
+/// returned.  This returns null if no change was made.
+Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
+                                                APInt &UndefElts,
+                                                unsigned Depth) {
+  unsigned VWidth = cast<VectorType>(V->getType())->getNumElements();
+  APInt EltMask(APInt::getAllOnesValue(VWidth));
+  assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
+
+  if (isa<UndefValue>(V)) {
+    // If the entire vector is undefined, just return this info.
+    UndefElts = EltMask;
+    return 0;
+  }
+
+  if (DemandedElts == 0) { // If nothing is demanded, provide undef.
+    UndefElts = EltMask;
+    return UndefValue::get(V->getType());
+  }
+
+  UndefElts = 0;
+
+  // Handle ConstantAggregateZero, ConstantVector, ConstantDataSequential.
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    // Check if this is identity. If so, return 0 since we are not simplifying
+    // anything.
+    if (DemandedElts.isAllOnesValue())
+      return 0;
+
+    Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+    Constant *Undef = UndefValue::get(EltTy);
+
+    SmallVector<Constant*, 16> Elts;
+    for (unsigned i = 0; i != VWidth; ++i) {
+      if (!DemandedElts[i]) {   // If not demanded, set to undef.
+        Elts.push_back(Undef);
+        UndefElts.setBit(i);
+        continue;
+      }
+
+      Constant *Elt = C->getAggregateElement(i);
+      if (Elt == 0) return 0;
+
+      if (isa<UndefValue>(Elt)) {   // Already undef.
+        Elts.push_back(Undef);
+        UndefElts.setBit(i);
+      } else {                               // Otherwise, defined.
+        Elts.push_back(Elt);
+      }
+    }
+
+    // If we changed the constant, return it.
+    Constant *NewCV = ConstantVector::get(Elts);
+    return NewCV != C ? NewCV : 0;
+  }
+
+  // Limit search depth.
+  if (Depth == 10)
+    return 0;
+
+  // If multiple users are using the root value, proceed with
+  // simplification conservatively assuming that all elements
+  // are needed.
+  if (!V->hasOneUse()) {
+    // Quit if we find multiple users of a non-root value though.
+    // They'll be handled when it's their turn to be visited by
+    // the main instcombine process.
+    if (Depth != 0)
+      // TODO: Just compute the UndefElts information recursively.
+      return 0;
+
+    // Conservatively assume that all elements are needed.
+    DemandedElts = EltMask;
+  }
+
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return 0;        // Only analyze instructions.
+
+  bool MadeChange = false;
+  APInt UndefElts2(VWidth, 0);
+  Value *TmpV;
+  switch (I->getOpcode()) {
+  default: break;
+
+  case Instruction::InsertElement: {
+    // If this is a variable index, we don't know which element it overwrites.
+    // demand exactly the same input as we produce.
+    ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
+    if (Idx == 0) {
+      // Note that we can't propagate undef elt info, because we don't know
+      // which elt is getting updated.
+      TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+                                        UndefElts2, Depth+1);
+      if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+      break;
+    }
+
+    // If this is inserting an element that isn't demanded, remove this
+    // insertelement.
+    unsigned IdxNo = Idx->getZExtValue();
+    if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
+      Worklist.Add(I);
+      return I->getOperand(0);
+    }
+
+    // Otherwise, the element inserted overwrites whatever was there, so the
+    // input demanded set is simpler than the output set.
+    APInt DemandedElts2 = DemandedElts;
+    DemandedElts2.clearBit(IdxNo);
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts2,
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+    // The inserted element is defined.
+    UndefElts.clearBit(IdxNo);
+    break;
+  }
+  case Instruction::ShuffleVector: {
+    ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
+    uint64_t LHSVWidth =
+      cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements();
+    APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0);
+    for (unsigned i = 0; i < VWidth; i++) {
+      if (DemandedElts[i]) {
+        unsigned MaskVal = Shuffle->getMaskValue(i);
+        if (MaskVal != -1u) {
+          assert(MaskVal < LHSVWidth * 2 &&
+                 "shufflevector mask index out of range!");
+          if (MaskVal < LHSVWidth)
+            LeftDemanded.setBit(MaskVal);
+          else
+            RightDemanded.setBit(MaskVal - LHSVWidth);
+        }
+      }
+    }
+
+    APInt UndefElts4(LHSVWidth, 0);
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), LeftDemanded,
+                                      UndefElts4, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+    APInt UndefElts3(LHSVWidth, 0);
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(1), RightDemanded,
+                                      UndefElts3, Depth+1);
+    if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+    bool NewUndefElts = false;
+    for (unsigned i = 0; i < VWidth; i++) {
+      unsigned MaskVal = Shuffle->getMaskValue(i);
+      if (MaskVal == -1u) {
+        UndefElts.setBit(i);
+      } else if (!DemandedElts[i]) {
+        NewUndefElts = true;
+        UndefElts.setBit(i);
+      } else if (MaskVal < LHSVWidth) {
+        if (UndefElts4[MaskVal]) {
+          NewUndefElts = true;
+          UndefElts.setBit(i);
+        }
+      } else {
+        if (UndefElts3[MaskVal - LHSVWidth]) {
+          NewUndefElts = true;
+          UndefElts.setBit(i);
+        }
+      }
+    }
+
+    if (NewUndefElts) {
+      // Add additional discovered undefs.
+      SmallVector<Constant*, 16> Elts;
+      for (unsigned i = 0; i < VWidth; ++i) {
+        if (UndefElts[i])
+          Elts.push_back(UndefValue::get(Type::getInt32Ty(I->getContext())));
+        else
+          Elts.push_back(ConstantInt::get(Type::getInt32Ty(I->getContext()),
+                                          Shuffle->getMaskValue(i)));
+      }
+      I->setOperand(2, ConstantVector::get(Elts));
+      MadeChange = true;
+    }
+    break;
+  }
+  case Instruction::Select: {
+    APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts);
+    if (ConstantVector* CV = dyn_cast<ConstantVector>(I->getOperand(0))) {
+      for (unsigned i = 0; i < VWidth; i++) {
+        if (CV->getAggregateElement(i)->isNullValue())
+          LeftDemanded.clearBit(i);
+        else
+          RightDemanded.clearBit(i);
+      }
+    }
+
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(1), LeftDemanded,
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(2), RightDemanded,
+                                      UndefElts2, Depth+1);
+    if (TmpV) { I->setOperand(2, TmpV); MadeChange = true; }
+
+    // Output elements are undefined if both are undefined.
+    UndefElts &= UndefElts2;
+    break;
+  }
+  case Instruction::BitCast: {
+    // Vector->vector casts only.
+    VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
+    if (!VTy) break;
+    unsigned InVWidth = VTy->getNumElements();
+    APInt InputDemandedElts(InVWidth, 0);
+    unsigned Ratio;
+
+    if (VWidth == InVWidth) {
+      // If we are converting from <4 x i32> -> <4 x f32>, we demand the same
+      // elements as are demanded of us.
+      Ratio = 1;
+      InputDemandedElts = DemandedElts;
+    } else if (VWidth > InVWidth) {
+      // Untested so far.
+      break;
+
+      // If there are more elements in the result than there are in the source,
+      // then an input element is live if any of the corresponding output
+      // elements are live.
+      Ratio = VWidth/InVWidth;
+      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+        if (DemandedElts[OutIdx])
+          InputDemandedElts.setBit(OutIdx/Ratio);
+      }
+    } else {
+      // Untested so far.
+      break;
+
+      // If there are more elements in the source than there are in the result,
+      // then an input element is live if the corresponding output element is
+      // live.
+      Ratio = InVWidth/VWidth;
+      for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+        if (DemandedElts[InIdx/Ratio])
+          InputDemandedElts.setBit(InIdx);
+    }
+
+    // div/rem demand all inputs, because they don't want divide by zero.
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts,
+                                      UndefElts2, Depth+1);
+    if (TmpV) {
+      I->setOperand(0, TmpV);
+      MadeChange = true;
+    }
+
+    UndefElts = UndefElts2;
+    if (VWidth > InVWidth) {
+      llvm_unreachable("Unimp");
+      // If there are more elements in the result than there are in the source,
+      // then an output element is undef if the corresponding input element is
+      // undef.
+      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+        if (UndefElts2[OutIdx/Ratio])
+          UndefElts.setBit(OutIdx);
+    } else if (VWidth < InVWidth) {
+      llvm_unreachable("Unimp");
+      // If there are more elements in the source than there are in the result,
+      // then a result element is undef if all of the corresponding input
+      // elements are undef.
+      UndefElts = ~0ULL >> (64-VWidth);  // Start out all undef.
+      for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+        if (!UndefElts2[InIdx])            // Not undef?
+          UndefElts.clearBit(InIdx/Ratio);    // Clear undef bit.
+    }
+    break;
+  }
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // div/rem demand all inputs, because they don't want divide by zero.
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts,
+                                      UndefElts2, Depth+1);
+    if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+
+    // Output elements are undefined if both are undefined.  Consider things
+    // like undef&0.  The result is known zero, not undef.
+    UndefElts &= UndefElts2;
+    break;
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+    break;
+
+  case Instruction::Call: {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+    if (!II) break;
+    switch (II->getIntrinsicID()) {
+    default: break;
+
+    // Binary vector operations that work column-wise.  A dest element is a
+    // function of the corresponding input elements from the two inputs.
+    case Intrinsic::x86_sse_sub_ss:
+    case Intrinsic::x86_sse_mul_ss:
+    case Intrinsic::x86_sse_min_ss:
+    case Intrinsic::x86_sse_max_ss:
+    case Intrinsic::x86_sse2_sub_sd:
+    case Intrinsic::x86_sse2_mul_sd:
+    case Intrinsic::x86_sse2_min_sd:
+    case Intrinsic::x86_sse2_max_sd:
+      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(0), DemandedElts,
+                                        UndefElts, Depth+1);
+      if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+      TmpV = SimplifyDemandedVectorElts(II->getArgOperand(1), DemandedElts,
+                                        UndefElts2, Depth+1);
+      if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+
+      // If only the low elt is demanded and this is a scalarizable intrinsic,
+      // scalarize it now.
+      if (DemandedElts == 1) {
+        switch (II->getIntrinsicID()) {
+        default: break;
+        case Intrinsic::x86_sse_sub_ss:
+        case Intrinsic::x86_sse_mul_ss:
+        case Intrinsic::x86_sse2_sub_sd:
+        case Intrinsic::x86_sse2_mul_sd:
+          // TODO: Lower MIN/MAX/ABS/etc
+          Value *LHS = II->getArgOperand(0);
+          Value *RHS = II->getArgOperand(1);
+          // Extract the element as scalars.
+          LHS = InsertNewInstWith(ExtractElementInst::Create(LHS,
+            ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
+          RHS = InsertNewInstWith(ExtractElementInst::Create(RHS,
+            ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U)), *II);
+
+          switch (II->getIntrinsicID()) {
+          default: llvm_unreachable("Case stmts out of sync!");
+          case Intrinsic::x86_sse_sub_ss:
+          case Intrinsic::x86_sse2_sub_sd:
+            TmpV = InsertNewInstWith(BinaryOperator::CreateFSub(LHS, RHS,
+                                                        II->getName()), *II);
+            break;
+          case Intrinsic::x86_sse_mul_ss:
+          case Intrinsic::x86_sse2_mul_sd:
+            TmpV = InsertNewInstWith(BinaryOperator::CreateFMul(LHS, RHS,
+                                                         II->getName()), *II);
+            break;
+          }
+
+          Instruction *New =
+            InsertElementInst::Create(
+              UndefValue::get(II->getType()), TmpV,
+              ConstantInt::get(Type::getInt32Ty(I->getContext()), 0U, false),
+                                      II->getName());
+          InsertNewInstWith(New, *II);
+          return New;
+        }
+      }
+
+      // Output elements are undefined if both are undefined.  Consider things
+      // like undef&0.  The result is known zero, not undef.
+      UndefElts &= UndefElts2;
+      break;
+    }
+    break;
+  }
+  }
+  return MadeChange ? I : 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
new file mode 100644
index 000000000000..4f71db1a4b09
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -0,0 +1,682 @@
+//===- InstCombineVectorOps.cpp -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements instcombine for ExtractElement, InsertElement and
+// ShuffleVector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombine.h"
+#include "llvm/Support/PatternMatch.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+/// CheapToScalarize - Return true if the value is cheaper to scalarize than it
+/// is to leave as a vector operation.  isConstant indicates whether we're
+/// extracting one known element.  If false we're extracting a variable index.
+static bool CheapToScalarize(Value *V, bool isConstant) {
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    if (isConstant) return true;
+
+    // If all elts are the same, we can extract it and use any of the values.
+    Constant *Op0 = C->getAggregateElement(0U);
+    for (unsigned i = 1, e = V->getType()->getVectorNumElements(); i != e; ++i)
+      if (C->getAggregateElement(i) != Op0)
+        return false;
+    return true;
+  }
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // Insert element gets simplified to the inserted element or is deleted if
+  // this is constant idx extract element and its a constant idx insertelt.
+  if (I->getOpcode() == Instruction::InsertElement && isConstant &&
+      isa<ConstantInt>(I->getOperand(2)))
+    return true;
+  if (I->getOpcode() == Instruction::Load && I->hasOneUse())
+    return true;
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
+    if (BO->hasOneUse() &&
+        (CheapToScalarize(BO->getOperand(0), isConstant) ||
+         CheapToScalarize(BO->getOperand(1), isConstant)))
+      return true;
+  if (CmpInst *CI = dyn_cast<CmpInst>(I))
+    if (CI->hasOneUse() &&
+        (CheapToScalarize(CI->getOperand(0), isConstant) ||
+         CheapToScalarize(CI->getOperand(1), isConstant)))
+      return true;
+
+  return false;
+}
+
+/// FindScalarElement - Given a vector and an element number, see if the scalar
+/// value is already around as a register, for example if it were inserted then
+/// extracted from the vector.
+static Value *FindScalarElement(Value *V, unsigned EltNo) {
+  assert(V->getType()->isVectorTy() && "Not looking at a vector?");
+  VectorType *VTy = cast<VectorType>(V->getType());
+  unsigned Width = VTy->getNumElements();
+  if (EltNo >= Width)  // Out of range access.
+    return UndefValue::get(VTy->getElementType());
+
+  if (Constant *C = dyn_cast<Constant>(V))
+    return C->getAggregateElement(EltNo);
+
+  if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert to a variable element, we don't know what it is.
+    if (!isa<ConstantInt>(III->getOperand(2)))
+      return 0;
+    unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
+
+    // If this is an insert to the element we are looking for, return the
+    // inserted value.
+    if (EltNo == IIElt)
+      return III->getOperand(1);
+
+    // Otherwise, the insertelement doesn't modify the value, recurse on its
+    // vector input.
+    return FindScalarElement(III->getOperand(0), EltNo);
+  }
+
+  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
+    unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements();
+    int InEl = SVI->getMaskValue(EltNo);
+    if (InEl < 0)
+      return UndefValue::get(VTy->getElementType());
+    if (InEl < (int)LHSWidth)
+      return FindScalarElement(SVI->getOperand(0), InEl);
+    return FindScalarElement(SVI->getOperand(1), InEl - LHSWidth);
+  }
+
+  // Extract a value from a vector add operation with a constant zero.
+  Value *Val = 0; Constant *Con = 0;
+  if (match(V, m_Add(m_Value(Val), m_Constant(Con)))) {
+    if (Con->getAggregateElement(EltNo)->isNullValue())
+      return FindScalarElement(Val, EltNo);
+  }
+
+  // Otherwise, we don't know.
+  return 0;
+}
+
+Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+  // If vector val is constant with all elements the same, replace EI with
+  // that element.  We handle a known element # below.
+  if (Constant *C = dyn_cast<Constant>(EI.getOperand(0)))
+    if (CheapToScalarize(C, false))
+      return ReplaceInstUsesWith(EI, C->getAggregateElement(0U));
+
+  // If extracting a specified index from the vector, see if we can recursively
+  // find a previously computed scalar that was inserted into the vector.
+  if (ConstantInt *IdxC = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+    unsigned IndexVal = IdxC->getZExtValue();
+    unsigned VectorWidth = EI.getVectorOperandType()->getNumElements();
+
+    // If this is extracting an invalid index, turn this into undef, to avoid
+    // crashing the code below.
+    if (IndexVal >= VectorWidth)
+      return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+
+    // This instruction only demands the single element from the input vector.
+    // If the input vector has a single use, simplify it based on this use
+    // property.
+    if (EI.getOperand(0)->hasOneUse() && VectorWidth != 1) {
+      APInt UndefElts(VectorWidth, 0);
+      APInt DemandedMask(VectorWidth, 0);
+      DemandedMask.setBit(IndexVal);
+      if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
+                                                DemandedMask, UndefElts)) {
+        EI.setOperand(0, V);
+        return &EI;
+      }
+    }
+
+    if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal))
+      return ReplaceInstUsesWith(EI, Elt);
+
+    // If the this extractelement is directly using a bitcast from a vector of
+    // the same number of elements, see if we can find the source element from
+    // it.  In this case, we will end up needing to bitcast the scalars.
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
+      if (VectorType *VT = dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
+        if (VT->getNumElements() == VectorWidth)
+          if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
+            return new BitCastInst(Elt, EI.getType());
+    }
+  }
+
+  if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {
+    // Push extractelement into predecessor operation if legal and
+    // profitable to do so
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+      if (I->hasOneUse() &&
+          CheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) {
+        Value *newEI0 =
+          Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1),
+                                        EI.getName()+".lhs");
+        Value *newEI1 =
+          Builder->CreateExtractElement(BO->getOperand(1), EI.getOperand(1),
+                                        EI.getName()+".rhs");
+        return BinaryOperator::Create(BO->getOpcode(), newEI0, newEI1);
+      }
+    } else if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) {
+      // Extracting the inserted element?
+      if (IE->getOperand(2) == EI.getOperand(1))
+        return ReplaceInstUsesWith(EI, IE->getOperand(1));
+      // If the inserted and extracted elements are constants, they must not
+      // be the same value, extract from the pre-inserted value instead.
+      if (isa<Constant>(IE->getOperand(2)) && isa<Constant>(EI.getOperand(1))) {
+        Worklist.AddValue(EI.getOperand(0));
+        EI.setOperand(0, IE->getOperand(0));
+        return &EI;
+      }
+    } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+      // If this is extracting an element from a shufflevector, figure out where
+      // it came from and extract from the appropriate input element instead.
+      if (ConstantInt *Elt = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+        int SrcIdx = SVI->getMaskValue(Elt->getZExtValue());
+        Value *Src;
+        unsigned LHSWidth =
+          SVI->getOperand(0)->getType()->getVectorNumElements();
+
+        if (SrcIdx < 0)
+          return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+        if (SrcIdx < (int)LHSWidth)
+          Src = SVI->getOperand(0);
+        else {
+          SrcIdx -= LHSWidth;
+          Src = SVI->getOperand(1);
+        }
+        Type *Int32Ty = Type::getInt32Ty(EI.getContext());
+        return ExtractElementInst::Create(Src,
+                                          ConstantInt::get(Int32Ty,
+                                                           SrcIdx, false));
+      }
+    } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+      // Canonicalize extractelement(cast) -> cast(extractelement)
+      // bitcasts can change the number of vector elements and they cost nothing
+      if (CI->hasOneUse() && EI.hasOneUse() &&
+          (CI->getOpcode() != Instruction::BitCast)) {
+        Value *EE = Builder->CreateExtractElement(CI->getOperand(0),
+                                                  EI.getIndexOperand());
+        return CastInst::Create(CI->getOpcode(), EE, EI.getType());
+      }
+    }
+  }
+  return 0;
+}
+
+/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns
+/// elements from either LHS or RHS, return the shuffle mask and true.
+/// Otherwise, return false.
+static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+                                         SmallVectorImpl<Constant*> &Mask) {
+  assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
+         "Invalid CollectSingleShuffleElements");
+  unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+
+  if (isa<UndefValue>(V)) {
+    Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
+    return true;
+  }
+
+  if (V == LHS) {
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i));
+    return true;
+  }
+
+  if (V == RHS) {
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()),
+                                      i+NumElts));
+    return true;
+  }
+
+  if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert of an extract from some other vector, include it.
+    Value *VecOp    = IEI->getOperand(0);
+    Value *ScalarOp = IEI->getOperand(1);
+    Value *IdxOp    = IEI->getOperand(2);
+
+    if (!isa<ConstantInt>(IdxOp))
+      return false;
+    unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+    if (isa<UndefValue>(ScalarOp)) {  // inserting undef into vector.
+      // Okay, we can handle this if the vector we are insertinting into is
+      // transitively ok.
+      if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+        // If so, update the mask to reflect the inserted undef.
+        Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext()));
+        return true;
+      }
+    } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
+      if (isa<ConstantInt>(EI->getOperand(1)) &&
+          EI->getOperand(0)->getType() == V->getType()) {
+        unsigned ExtractedIdx =
+        cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+
+        // This must be extracting from either LHS or RHS.
+        if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
+          // Okay, we can handle this if the vector we are insertinting into is
+          // transitively ok.
+          if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+            // If so, update the mask to reflect the inserted value.
+            if (EI->getOperand(0) == LHS) {
+              Mask[InsertedIdx % NumElts] =
+              ConstantInt::get(Type::getInt32Ty(V->getContext()),
+                               ExtractedIdx);
+            } else {
+              assert(EI->getOperand(0) == RHS);
+              Mask[InsertedIdx % NumElts] =
+              ConstantInt::get(Type::getInt32Ty(V->getContext()),
+                               ExtractedIdx+NumElts);
+            }
+            return true;
+          }
+        }
+      }
+    }
+  }
+  // TODO: Handle shufflevector here!
+
+  return false;
+}
+
+/// CollectShuffleElements - We are building a shuffle of V, using RHS as the
+/// RHS of the shuffle instruction, if it is not null.  Return a shuffle mask
+/// that computes V and the LHS value of the shuffle.
+static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask,
+                                     Value *&RHS) {
+  assert(V->getType()->isVectorTy() &&
+         (RHS == 0 || V->getType() == RHS->getType()) &&
+         "Invalid shuffle!");
+  unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+
+  if (isa<UndefValue>(V)) {
+    Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
+    return V;
+  }
+
+  if (isa<ConstantAggregateZero>(V)) {
+    Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0));
+    return V;
+  }
+
+  if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert of an extract from some other vector, include it.
+    Value *VecOp    = IEI->getOperand(0);
+    Value *ScalarOp = IEI->getOperand(1);
+    Value *IdxOp    = IEI->getOperand(2);
+
+    if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+      if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+          EI->getOperand(0)->getType() == V->getType()) {
+        unsigned ExtractedIdx =
+          cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+        unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+        // Either the extracted from or inserted into vector must be RHSVec,
+        // otherwise we'd end up with a shuffle of three inputs.
+        if (EI->getOperand(0) == RHS || RHS == 0) {
+          RHS = EI->getOperand(0);
+          Value *V = CollectShuffleElements(VecOp, Mask, RHS);
+          Mask[InsertedIdx % NumElts] =
+            ConstantInt::get(Type::getInt32Ty(V->getContext()),
+                             NumElts+ExtractedIdx);
+          return V;
+        }
+
+        if (VecOp == RHS) {
+          Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS);
+          // Everything but the extracted element is replaced with the RHS.
+          for (unsigned i = 0; i != NumElts; ++i) {
+            if (i != InsertedIdx)
+              Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()),
+                                         NumElts+i);
+          }
+          return V;
+        }
+
+        // If this insertelement is a chain that comes from exactly these two
+        // vectors, return the vector and the effective shuffle.
+        if (CollectSingleShuffleElements(IEI, EI->getOperand(0), RHS, Mask))
+          return EI->getOperand(0);
+      }
+    }
+  }
+  // TODO: Handle shufflevector here!
+
+  // Otherwise, can't do anything fancy.  Return an identity vector.
+  for (unsigned i = 0; i != NumElts; ++i)
+    Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i));
+  return V;
+}
+
+Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
+  Value *VecOp    = IE.getOperand(0);
+  Value *ScalarOp = IE.getOperand(1);
+  Value *IdxOp    = IE.getOperand(2);
+
+  // Inserting an undef or into an undefined place, remove this.
+  if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp))
+    ReplaceInstUsesWith(IE, VecOp);
+
+  // If the inserted element was extracted from some other vector, and if the
+  // indexes are constant, try to turn this into a shufflevector operation.
+  if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+    if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+        EI->getOperand(0)->getType() == IE.getType()) {
+      unsigned NumVectorElts = IE.getType()->getNumElements();
+      unsigned ExtractedIdx =
+        cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+      unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+      if (ExtractedIdx >= NumVectorElts) // Out of range extract.
+        return ReplaceInstUsesWith(IE, VecOp);
+
+      if (InsertedIdx >= NumVectorElts)  // Out of range insert.
+        return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+
+      // If we are extracting a value from a vector, then inserting it right
+      // back into the same place, just use the input vector.
+      if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
+        return ReplaceInstUsesWith(IE, VecOp);
+
+      // If this insertelement isn't used by some other insertelement, turn it
+      // (and any insertelements it points to), into one big shuffle.
+      if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) {
+        SmallVector<Constant*, 16> Mask;
+        Value *RHS = 0;
+        Value *LHS = CollectShuffleElements(&IE, Mask, RHS);
+        if (RHS == 0) RHS = UndefValue::get(LHS->getType());
+        // We now have a shuffle of LHS, RHS, Mask.
+        return new ShuffleVectorInst(LHS, RHS, ConstantVector::get(Mask));
+      }
+    }
+  }
+
+  unsigned VWidth = cast<VectorType>(VecOp->getType())->getNumElements();
+  APInt UndefElts(VWidth, 0);
+  APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+  if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
+    if (V != &IE)
+      return ReplaceInstUsesWith(IE, V);
+    return &IE;
+  }
+
+  return 0;
+}
+
+
+Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  Value *LHS = SVI.getOperand(0);
+  Value *RHS = SVI.getOperand(1);
+  SmallVector<int, 16> Mask = SVI.getShuffleMask();
+
+  bool MadeChange = false;
+
+  // Undefined shuffle mask -> undefined value.
+  if (isa<UndefValue>(SVI.getOperand(2)))
+    return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+
+  unsigned VWidth = cast<VectorType>(SVI.getType())->getNumElements();
+
+  APInt UndefElts(VWidth, 0);
+  APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+  if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
+    if (V != &SVI)
+      return ReplaceInstUsesWith(SVI, V);
+    LHS = SVI.getOperand(0);
+    RHS = SVI.getOperand(1);
+    MadeChange = true;
+  }
+
+  unsigned LHSWidth = cast<VectorType>(LHS->getType())->getNumElements();
+
+  // Canonicalize shuffle(x    ,x,mask) -> shuffle(x, undef,mask')
+  // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask').
+  if (LHS == RHS || isa<UndefValue>(LHS)) {
+    if (isa<UndefValue>(LHS) && LHS == RHS) {
+      // shuffle(undef,undef,mask) -> undef.
+      Value* result = (VWidth == LHSWidth)
+                      ? LHS : UndefValue::get(SVI.getType());
+      return ReplaceInstUsesWith(SVI, result);
+    }
+
+    // Remap any references to RHS to use LHS.
+    SmallVector<Constant*, 16> Elts;
+    for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) {
+      if (Mask[i] < 0) {
+        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+        continue;
+      }
+
+      if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) ||
+          (Mask[i] <  (int)e && isa<UndefValue>(LHS))) {
+        Mask[i] = -1;     // Turn into undef.
+        Elts.push_back(UndefValue::get(Type::getInt32Ty(SVI.getContext())));
+      } else {
+        Mask[i] = Mask[i] % e;  // Force to LHS.
+        Elts.push_back(ConstantInt::get(Type::getInt32Ty(SVI.getContext()),
+                                        Mask[i]));
+      }
+    }
+    SVI.setOperand(0, SVI.getOperand(1));
+    SVI.setOperand(1, UndefValue::get(RHS->getType()));
+    SVI.setOperand(2, ConstantVector::get(Elts));
+    LHS = SVI.getOperand(0);
+    RHS = SVI.getOperand(1);
+    MadeChange = true;
+  }
+
+  if (VWidth == LHSWidth) {
+    // Analyze the shuffle, are the LHS or RHS and identity shuffles?
+    bool isLHSID = true, isRHSID = true;
+
+    for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+      if (Mask[i] < 0) continue;  // Ignore undef values.
+      // Is this an identity shuffle of the LHS value?
+      isLHSID &= (Mask[i] == (int)i);
+
+      // Is this an identity shuffle of the RHS value?
+      isRHSID &= (Mask[i]-e == i);
+    }
+
+    // Eliminate identity shuffles.
+    if (isLHSID) return ReplaceInstUsesWith(SVI, LHS);
+    if (isRHSID) return ReplaceInstUsesWith(SVI, RHS);
+  }
+
+  // If the LHS is a shufflevector itself, see if we can combine it with this
+  // one without producing an unusual shuffle.
+  // Cases that might be simplified:
+  // 1.
+  // x1=shuffle(v1,v2,mask1)
+  //  x=shuffle(x1,undef,mask)
+  //        ==>
+  //  x=shuffle(v1,undef,newMask)
+  // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1
+  // 2.
+  // x1=shuffle(v1,undef,mask1)
+  //  x=shuffle(x1,x2,mask)
+  // where v1.size() == mask1.size()
+  //        ==>
+  //  x=shuffle(v1,x2,newMask)
+  // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i]
+  // 3.
+  // x2=shuffle(v2,undef,mask2)
+  //  x=shuffle(x1,x2,mask)
+  // where v2.size() == mask2.size()
+  //        ==>
+  //  x=shuffle(x1,v2,newMask)
+  // newMask[i] = (mask[i] < x1.size())
+  //              ? mask[i] : mask2[mask[i]-x1.size()]+x1.size()
+  // 4.
+  // x1=shuffle(v1,undef,mask1)
+  // x2=shuffle(v2,undef,mask2)
+  //  x=shuffle(x1,x2,mask)
+  // where v1.size() == v2.size()
+  //        ==>
+  //  x=shuffle(v1,v2,newMask)
+  // newMask[i] = (mask[i] < x1.size())
+  //              ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size()
+  //
+  // Here we are really conservative:
+  // we are absolutely afraid of producing a shuffle mask not in the input
+  // program, because the code gen may not be smart enough to turn a merged
+  // shuffle into two specific shuffles: it may produce worse code.  As such,
+  // we only merge two shuffles if the result is either a splat or one of the
+  // input shuffle masks.  In this case, merging the shuffles just removes
+  // one instruction, which we know is safe.  This is good for things like
+  // turning: (splat(splat)) -> splat, or
+  // merge(V[0..n], V[n+1..2n]) -> V[0..2n]
+  ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS);
+  ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);
+  if (LHSShuffle)
+    if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS))
+      LHSShuffle = NULL;
+  if (RHSShuffle)
+    if (!isa<UndefValue>(RHSShuffle->getOperand(1)))
+      RHSShuffle = NULL;
+  if (!LHSShuffle && !RHSShuffle)
+    return MadeChange ? &SVI : 0;
+
+  Value* LHSOp0 = NULL;
+  Value* LHSOp1 = NULL;
+  Value* RHSOp0 = NULL;
+  unsigned LHSOp0Width = 0;
+  unsigned RHSOp0Width = 0;
+  if (LHSShuffle) {
+    LHSOp0 = LHSShuffle->getOperand(0);
+    LHSOp1 = LHSShuffle->getOperand(1);
+    LHSOp0Width = cast<VectorType>(LHSOp0->getType())->getNumElements();
+  }
+  if (RHSShuffle) {
+    RHSOp0 = RHSShuffle->getOperand(0);
+    RHSOp0Width = cast<VectorType>(RHSOp0->getType())->getNumElements();
+  }
+  Value* newLHS = LHS;
+  Value* newRHS = RHS;
+  if (LHSShuffle) {
+    // case 1
+    if (isa<UndefValue>(RHS)) {
+      newLHS = LHSOp0;
+      newRHS = LHSOp1;
+    }
+    // case 2 or 4
+    else if (LHSOp0Width == LHSWidth) {
+      newLHS = LHSOp0;
+    }
+  }
+  // case 3 or 4
+  if (RHSShuffle && RHSOp0Width == LHSWidth) {
+    newRHS = RHSOp0;
+  }
+  // case 4
+  if (LHSOp0 == RHSOp0) {
+    newLHS = LHSOp0;
+    newRHS = NULL;
+  }
+
+  if (newLHS == LHS && newRHS == RHS)
+    return MadeChange ? &SVI : 0;
+
+  SmallVector<int, 16> LHSMask;
+  SmallVector<int, 16> RHSMask;
+  if (newLHS != LHS)
+    LHSMask = LHSShuffle->getShuffleMask();
+  if (RHSShuffle && newRHS != RHS)
+    RHSMask = RHSShuffle->getShuffleMask();
+
+  unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth;
+  SmallVector<int, 16> newMask;
+  bool isSplat = true;
+  int SplatElt = -1;
+  // Create a new mask for the new ShuffleVectorInst so that the new
+  // ShuffleVectorInst is equivalent to the original one.
+  for (unsigned i = 0; i < VWidth; ++i) {
+    int eltMask;
+    if (Mask[i] < 0) {
+      // This element is an undef value.
+      eltMask = -1;
+    } else if (Mask[i] < (int)LHSWidth) {
+      // This element is from left hand side vector operand.
+      //
+      // If LHS is going to be replaced (case 1, 2, or 4), calculate the
+      // new mask value for the element.
+      if (newLHS != LHS) {
+        eltMask = LHSMask[Mask[i]];
+        // If the value selected is an undef value, explicitly specify it
+        // with a -1 mask value.
+        if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1))
+          eltMask = -1;
+      } else
+        eltMask = Mask[i];
+    } else {
+      // This element is from right hand side vector operand
+      //
+      // If the value selected is an undef value, explicitly specify it
+      // with a -1 mask value. (case 1)
+      if (isa<UndefValue>(RHS))
+        eltMask = -1;
+      // If RHS is going to be replaced (case 3 or 4), calculate the
+      // new mask value for the element.
+      else if (newRHS != RHS) {
+        eltMask = RHSMask[Mask[i]-LHSWidth];
+        // If the value selected is an undef value, explicitly specify it
+        // with a -1 mask value.
+        if (eltMask >= (int)RHSOp0Width) {
+          assert(isa<UndefValue>(RHSShuffle->getOperand(1))
+                 && "should have been check above");
+          eltMask = -1;
+        }
+      } else
+        eltMask = Mask[i]-LHSWidth;
+
+      // If LHS's width is changed, shift the mask value accordingly.
+      // If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any
+      // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
+      // If newRHS == newLHS, we want to remap any references from newRHS to
+      // newLHS so that we can properly identify splats that may occur due to
+      // obfuscation accross the two vectors.
+      if (eltMask >= 0 && newRHS != NULL && newLHS != newRHS)
+        eltMask += newLHSWidth;
+    }
+
+    // Check if this could still be a splat.
+    if (eltMask >= 0) {
+      if (SplatElt >= 0 && SplatElt != eltMask)
+        isSplat = false;
+      SplatElt = eltMask;
+    }
+
+    newMask.push_back(eltMask);
+  }
+
+  // If the result mask is equal to one of the original shuffle masks,
+  // or is a splat, do the replacement.
+  if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
+    SmallVector<Constant*, 16> Elts;
+    Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
+    for (unsigned i = 0, e = newMask.size(); i != e; ++i) {
+      if (newMask[i] < 0) {
+        Elts.push_back(UndefValue::get(Int32Ty));
+      } else {
+        Elts.push_back(ConstantInt::get(Int32Ty, newMask[i]));
+      }
+    }
+    if (newRHS == NULL)
+      newRHS = UndefValue::get(newLHS->getType());
+    return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts));
+  }
+
+  return MadeChange ? &SVI : 0;
+}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h
new file mode 100644
index 000000000000..49efce5c4f22
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineWorklist.h
@@ -0,0 +1,106 @@
+//===- InstCombineWorklist.h - Worklist for the InstCombine pass ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef INSTCOMBINE_WORKLIST_H
+#define INSTCOMBINE_WORKLIST_H
+
+#define DEBUG_TYPE "instcombine"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+/// InstCombineWorklist - This is the worklist management logic for
+/// InstCombine.
+class LLVM_LIBRARY_VISIBILITY InstCombineWorklist {
+  SmallVector<Instruction*, 256> Worklist;
+  DenseMap<Instruction*, unsigned> WorklistMap;
+
+  void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION;
+  InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION;
+public:
+  InstCombineWorklist() {}
+
+  bool isEmpty() const { return Worklist.empty(); }
+
+  /// Add - Add the specified instruction to the worklist if it isn't already
+  /// in it.
+  void Add(Instruction *I) {
+    if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) {
+      DEBUG(errs() << "IC: ADD: " << *I << '\n');
+      Worklist.push_back(I);
+    }
+  }
+
+  void AddValue(Value *V) {
+    if (Instruction *I = dyn_cast<Instruction>(V))
+      Add(I);
+  }
+
+  /// AddInitialGroup - Add the specified batch of stuff in reverse order.
+  /// which should only be done when the worklist is empty and when the group
+  /// has no duplicates.
+  void AddInitialGroup(Instruction *const *List, unsigned NumEntries) {
+    assert(Worklist.empty() && "Worklist must be empty to add initial group");
+    Worklist.reserve(NumEntries+16);
+    WorklistMap.resize(NumEntries);
+    DEBUG(errs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
+    for (unsigned Idx = 0; NumEntries; --NumEntries) {
+      Instruction *I = List[NumEntries-1];
+      WorklistMap.insert(std::make_pair(I, Idx++));
+      Worklist.push_back(I);
+    }
+  }
+
+  // Remove - remove I from the worklist if it exists.
+  void Remove(Instruction *I) {
+    DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I);
+    if (It == WorklistMap.end()) return; // Not in worklist.
+
+    // Don't bother moving everything down, just null out the slot.
+    Worklist[It->second] = 0;
+
+    WorklistMap.erase(It);
+  }
+
+  Instruction *RemoveOne() {
+    Instruction *I = Worklist.back();
+    Worklist.pop_back();
+    WorklistMap.erase(I);
+    return I;
+  }
+
+  /// AddUsersToWorkList - When an instruction is simplified, add all users of
+  /// the instruction to the work lists because they might get more simplified
+  /// now.
+  ///
+  void AddUsersToWorkList(Instruction &I) {
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+         UI != UE; ++UI)
+      Add(cast<Instruction>(*UI));
+  }
+
+
+  /// Zap - check that the worklist is empty and nuke the backing store for
+  /// the map if it is large.
+  void Zap() {
+    assert(WorklistMap.empty() && "Worklist empty, but map not?");
+
+    // Do an explicit clear, this shrinks the map if needed.
+    WorklistMap.clear();
+  }
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
new file mode 100644
index 000000000000..c6115e3e91fe
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -0,0 +1,2508 @@
+//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InstructionCombining - Combine instructions to form fewer, simple
+// instructions.  This pass does not modify the CFG.  This pass is where
+// algebraic simplification happens.
+//
+// This pass combines things like:
+//    %Y = add i32 %X, 1
+//    %Z = add i32 %Y, 1
+// into:
+//    %Z = add i32 %X, 2
+//
+// This is a simple worklist driven algorithm.
+//
+// This pass guarantees that the following canonicalizations are performed on
+// the program:
+//    1. If a binary operator has a constant operand, it is moved to the RHS
+//    2. Bitwise operators with constant operands are always grouped so that
+//       shifts are performed first, then or's, then and's, then xor's.
+//    3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
+//    4. All cmp instructions on boolean values are replaced with logical ops
+//    5. add X, X is represented as (X*2) => (X << 1)
+//    6. Multiplies with a power-of-two constant argument are transformed into
+//       shifts.
+//   ... etc.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Scalar.h"
+#include "InstCombine.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <climits>
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+STATISTIC(NumCombined , "Number of insts combined");
+STATISTIC(NumConstProp, "Number of constant folds");
+STATISTIC(NumDeadInst , "Number of dead inst eliminated");
+STATISTIC(NumSunkInst , "Number of instructions sunk");
+STATISTIC(NumExpand,    "Number of expansions");
+STATISTIC(NumFactor   , "Number of factorizations");
+STATISTIC(NumReassoc  , "Number of reassociations");
+
+static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
+                                   cl::init(false),
+                                   cl::desc("Enable unsafe double to float "
+                                            "shrinking for math lib calls"));
+
+// Initialization Routines
+void llvm::initializeInstCombine(PassRegistry &Registry) {
+  initializeInstCombinerPass(Registry);
+}
+
+void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
+  initializeInstCombine(*unwrap(R));
+}
+
+char InstCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(InstCombiner, "instcombine",
+                "Combine redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(InstCombiner, "instcombine",
+                "Combine redundant instructions", false, false)
+
+void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<TargetLibraryInfo>();
+}
+
+
+Value *InstCombiner::EmitGEPOffset(User *GEP) {
+  return llvm::EmitGEPOffset(Builder, *getDataLayout(), GEP);
+}
+
+/// ShouldChangeType - Return true if it is desirable to convert a computation
+/// from 'From' to 'To'.  We don't want to convert from a legal to an illegal
+/// type for example, or from a smaller to a larger illegal type.
+bool InstCombiner::ShouldChangeType(Type *From, Type *To) const {
+  assert(From->isIntegerTy() && To->isIntegerTy());
+
+  // If we don't have TD, we don't know if the source/dest are legal.
+  if (!TD) return false;
+
+  unsigned FromWidth = From->getPrimitiveSizeInBits();
+  unsigned ToWidth = To->getPrimitiveSizeInBits();
+  bool FromLegal = TD->isLegalInteger(FromWidth);
+  bool ToLegal = TD->isLegalInteger(ToWidth);
+
+  // If this is a legal integer from type, and the result would be an illegal
+  // type, don't do the transformation.
+  if (FromLegal && !ToLegal)
+    return false;
+
+  // Otherwise, if both are illegal, do not increase the size of the result. We
+  // do allow things like i160 -> i64, but not i64 -> i160.
+  if (!FromLegal && !ToLegal && ToWidth > FromWidth)
+    return false;
+
+  return true;
+}
+
+// Return true, if No Signed Wrap should be maintained for I.
+// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",
+// where both B and C should be ConstantInts, results in a constant that does
+// not overflow. This function only handles the Add and Sub opcodes. For
+// all other opcodes, the function conservatively returns false.
+static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
+  OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
+  if (!OBO || !OBO->hasNoSignedWrap()) {
+    return false;
+  }
+
+  // We reason about Add and Sub Only.
+  Instruction::BinaryOps Opcode = I.getOpcode();
+  if (Opcode != Instruction::Add &&
+      Opcode != Instruction::Sub) {
+    return false;
+  }
+
+  ConstantInt *CB = dyn_cast<ConstantInt>(B);
+  ConstantInt *CC = dyn_cast<ConstantInt>(C);
+
+  if (!CB || !CC) {
+    return false;
+  }
+
+  const APInt &BVal = CB->getValue();
+  const APInt &CVal = CC->getValue();
+  bool Overflow = false;
+
+  if (Opcode == Instruction::Add) {
+    BVal.sadd_ov(CVal, Overflow);
+  } else {
+    BVal.ssub_ov(CVal, Overflow);
+  }
+
+  return !Overflow;
+}
+
+/// Conservatively clears subclassOptionalData after a reassociation or
+/// commutation. We preserve fast-math flags when applicable as they can be
+/// preserved.
+static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
+  FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
+  if (!FPMO) {
+    I.clearSubclassOptionalData();
+    return;
+  }
+
+  FastMathFlags FMF = I.getFastMathFlags();
+  I.clearSubclassOptionalData();
+  I.setFastMathFlags(FMF);
+}
+
+/// SimplifyAssociativeOrCommutative - This performs a few simplifications for
+/// operators which are associative or commutative:
+//
+//  Commutative operators:
+//
+//  1. Order operands such that they are listed from right (least complex) to
+//     left (most complex).  This puts constants before unary operators before
+//     binary operators.
+//
+//  Associative operators:
+//
+//  2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+//  3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+//
+//  Associative and commutative operators:
+//
+//  4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+//  5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+//  6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+//     if C1 and C2 are constants.
+//
+bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
+  Instruction::BinaryOps Opcode = I.getOpcode();
+  bool Changed = false;
+
+  do {
+    // Order operands such that they are listed from right (least complex) to
+    // left (most complex).  This puts constants before unary operators before
+    // binary operators.
+    if (I.isCommutative() && getComplexity(I.getOperand(0)) <
+        getComplexity(I.getOperand(1)))
+      Changed = !I.swapOperands();
+
+    BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
+    BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));
+
+    if (I.isAssociative()) {
+      // Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+      if (Op0 && Op0->getOpcode() == Opcode) {
+        Value *A = Op0->getOperand(0);
+        Value *B = Op0->getOperand(1);
+        Value *C = I.getOperand(1);
+
+        // Does "B op C" simplify?
+        if (Value *V = SimplifyBinOp(Opcode, B, C, TD)) {
+          // It simplifies to V.  Form "A op V".
+          I.setOperand(0, A);
+          I.setOperand(1, V);
+          // Conservatively clear the optional flags, since they may not be
+          // preserved by the reassociation.
+          if (MaintainNoSignedWrap(I, B, C) &&
+              (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) {
+            // Note: this is only valid because SimplifyBinOp doesn't look at
+            // the operands to Op0.
+            I.clearSubclassOptionalData();
+            I.setHasNoSignedWrap(true);
+          } else {
+            ClearSubclassDataAfterReassociation(I);
+          }
+
+          Changed = true;
+          ++NumReassoc;
+          continue;
+        }
+      }
+
+      // Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+      if (Op1 && Op1->getOpcode() == Opcode) {
+        Value *A = I.getOperand(0);
+        Value *B = Op1->getOperand(0);
+        Value *C = Op1->getOperand(1);
+
+        // Does "A op B" simplify?
+        if (Value *V = SimplifyBinOp(Opcode, A, B, TD)) {
+          // It simplifies to V.  Form "V op C".
+          I.setOperand(0, V);
+          I.setOperand(1, C);
+          // Conservatively clear the optional flags, since they may not be
+          // preserved by the reassociation.
+          ClearSubclassDataAfterReassociation(I);
+          Changed = true;
+          ++NumReassoc;
+          continue;
+        }
+      }
+    }
+
+    if (I.isAssociative() && I.isCommutative()) {
+      // Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+      if (Op0 && Op0->getOpcode() == Opcode) {
+        Value *A = Op0->getOperand(0);
+        Value *B = Op0->getOperand(1);
+        Value *C = I.getOperand(1);
+
+        // Does "C op A" simplify?
+        if (Value *V = SimplifyBinOp(Opcode, C, A, TD)) {
+          // It simplifies to V.  Form "V op B".
+          I.setOperand(0, V);
+          I.setOperand(1, B);
+          // Conservatively clear the optional flags, since they may not be
+          // preserved by the reassociation.
+          ClearSubclassDataAfterReassociation(I);
+          Changed = true;
+          ++NumReassoc;
+          continue;
+        }
+      }
+
+      // Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+      if (Op1 && Op1->getOpcode() == Opcode) {
+        Value *A = I.getOperand(0);
+        Value *B = Op1->getOperand(0);
+        Value *C = Op1->getOperand(1);
+
+        // Does "C op A" simplify?
+        if (Value *V = SimplifyBinOp(Opcode, C, A, TD)) {
+          // It simplifies to V.  Form "B op V".
+          I.setOperand(0, B);
+          I.setOperand(1, V);
+          // Conservatively clear the optional flags, since they may not be
+          // preserved by the reassociation.
+          ClearSubclassDataAfterReassociation(I);
+          Changed = true;
+          ++NumReassoc;
+          continue;
+        }
+      }
+
+      // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+      // if C1 and C2 are constants.
+      if (Op0 && Op1 &&
+          Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
+          isa<Constant>(Op0->getOperand(1)) &&
+          isa<Constant>(Op1->getOperand(1)) &&
+          Op0->hasOneUse() && Op1->hasOneUse()) {
+        Value *A = Op0->getOperand(0);
+        Constant *C1 = cast<Constant>(Op0->getOperand(1));
+        Value *B = Op1->getOperand(0);
+        Constant *C2 = cast<Constant>(Op1->getOperand(1));
+
+        Constant *Folded = ConstantExpr::get(Opcode, C1, C2);
+        BinaryOperator *New = BinaryOperator::Create(Opcode, A, B);
+        InsertNewInstWith(New, I);
+        New->takeName(Op1);
+        I.setOperand(0, New);
+        I.setOperand(1, Folded);
+        // Conservatively clear the optional flags, since they may not be
+        // preserved by the reassociation.
+        ClearSubclassDataAfterReassociation(I);
+
+        Changed = true;
+        continue;
+      }
+    }
+
+    // No further simplifications.
+    return Changed;
+  } while (1);
+}
+
+/// LeftDistributesOverRight - Whether "X LOp (Y ROp Z)" is always equal to
+/// "(X LOp Y) ROp (X LOp Z)".
+static bool LeftDistributesOverRight(Instruction::BinaryOps LOp,
+                                     Instruction::BinaryOps ROp) {
+  switch (LOp) {
+  default:
+    return false;
+
+  case Instruction::And:
+    // And distributes over Or and Xor.
+    switch (ROp) {
+    default:
+      return false;
+    case Instruction::Or:
+    case Instruction::Xor:
+      return true;
+    }
+
+  case Instruction::Mul:
+    // Multiplication distributes over addition and subtraction.
+    switch (ROp) {
+    default:
+      return false;
+    case Instruction::Add:
+    case Instruction::Sub:
+      return true;
+    }
+
+  case Instruction::Or:
+    // Or distributes over And.
+    switch (ROp) {
+    default:
+      return false;
+    case Instruction::And:
+      return true;
+    }
+  }
+}
+
+/// RightDistributesOverLeft - Whether "(X LOp Y) ROp Z" is always equal to
+/// "(X ROp Z) LOp (Y ROp Z)".
+static bool RightDistributesOverLeft(Instruction::BinaryOps LOp,
+                                     Instruction::BinaryOps ROp) {
+  if (Instruction::isCommutative(ROp))
+    return LeftDistributesOverRight(ROp, LOp);
+  // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
+  // but this requires knowing that the addition does not overflow and other
+  // such subtleties.
+  return false;
+}
+
+/// SimplifyUsingDistributiveLaws - This tries to simplify binary operations
+/// which some other binary operation distributes over either by factorizing
+/// out common terms (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this
+/// results in simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is
+/// a win).  Returns the simplified value, or null if it didn't simplify.
+Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+  BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
+  BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
+  Instruction::BinaryOps TopLevelOpcode = I.getOpcode(); // op
+
+  // Factorization.
+  if (Op0 && Op1 && Op0->getOpcode() == Op1->getOpcode()) {
+    // The instruction has the form "(A op' B) op (C op' D)".  Try to factorize
+    // a common term.
+    Value *A = Op0->getOperand(0), *B = Op0->getOperand(1);
+    Value *C = Op1->getOperand(0), *D = Op1->getOperand(1);
+    Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
+
+    // Does "X op' Y" always equal "Y op' X"?
+    bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
+
+    // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
+    if (LeftDistributesOverRight(InnerOpcode, TopLevelOpcode))
+      // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
+      // commutative case, "(A op' B) op (C op' A)"?
+      if (A == C || (InnerCommutative && A == D)) {
+        if (A != C)
+          std::swap(C, D);
+        // Consider forming "A op' (B op D)".
+        // If "B op D" simplifies then it can be formed with no cost.
+        Value *V = SimplifyBinOp(TopLevelOpcode, B, D, TD);
+        // If "B op D" doesn't simplify then only go on if both of the existing
+        // operations "A op' B" and "C op' D" will be zapped as no longer used.
+        if (!V && Op0->hasOneUse() && Op1->hasOneUse())
+          V = Builder->CreateBinOp(TopLevelOpcode, B, D, Op1->getName());
+        if (V) {
+          ++NumFactor;
+          V = Builder->CreateBinOp(InnerOpcode, A, V);
+          V->takeName(&I);
+          return V;
+        }
+      }
+
+    // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
+    if (RightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
+      // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
+      // commutative case, "(A op' B) op (B op' D)"?
+      if (B == D || (InnerCommutative && B == C)) {
+        if (B != D)
+          std::swap(C, D);
+        // Consider forming "(A op C) op' B".
+        // If "A op C" simplifies then it can be formed with no cost.
+        Value *V = SimplifyBinOp(TopLevelOpcode, A, C, TD);
+        // If "A op C" doesn't simplify then only go on if both of the existing
+        // operations "A op' B" and "C op' D" will be zapped as no longer used.
+        if (!V && Op0->hasOneUse() && Op1->hasOneUse())
+          V = Builder->CreateBinOp(TopLevelOpcode, A, C, Op0->getName());
+        if (V) {
+          ++NumFactor;
+          V = Builder->CreateBinOp(InnerOpcode, V, B);
+          V->takeName(&I);
+          return V;
+        }
+      }
+  }
+
+  // Expansion.
+  if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
+    // The instruction has the form "(A op' B) op C".  See if expanding it out
+    // to "(A op C) op' (B op C)" results in simplifications.
+    Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
+    Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
+
+    // Do "A op C" and "B op C" both simplify?
+    if (Value *L = SimplifyBinOp(TopLevelOpcode, A, C, TD))
+      if (Value *R = SimplifyBinOp(TopLevelOpcode, B, C, TD)) {
+        // They do! Return "L op' R".
+        ++NumExpand;
+        // If "L op' R" equals "A op' B" then "L op' R" is just the LHS.
+        if ((L == A && R == B) ||
+            (Instruction::isCommutative(InnerOpcode) && L == B && R == A))
+          return Op0;
+        // Otherwise return "L op' R" if it simplifies.
+        if (Value *V = SimplifyBinOp(InnerOpcode, L, R, TD))
+          return V;
+        // Otherwise, create a new instruction.
+        C = Builder->CreateBinOp(InnerOpcode, L, R);
+        C->takeName(&I);
+        return C;
+      }
+  }
+
+  if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
+    // The instruction has the form "A op (B op' C)".  See if expanding it out
+    // to "(A op B) op' (A op C)" results in simplifications.
+    Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
+    Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
+
+    // Do "A op B" and "A op C" both simplify?
+    if (Value *L = SimplifyBinOp(TopLevelOpcode, A, B, TD))
+      if (Value *R = SimplifyBinOp(TopLevelOpcode, A, C, TD)) {
+        // They do! Return "L op' R".
+        ++NumExpand;
+        // If "L op' R" equals "B op' C" then "L op' R" is just the RHS.
+        if ((L == B && R == C) ||
+            (Instruction::isCommutative(InnerOpcode) && L == C && R == B))
+          return Op1;
+        // Otherwise return "L op' R" if it simplifies.
+        if (Value *V = SimplifyBinOp(InnerOpcode, L, R, TD))
+          return V;
+        // Otherwise, create a new instruction.
+        A = Builder->CreateBinOp(InnerOpcode, L, R);
+        A->takeName(&I);
+        return A;
+      }
+  }
+
+  return 0;
+}
+
+// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction
+// if the LHS is a constant zero (which is the 'negate' form).
+//
+Value *InstCombiner::dyn_castNegVal(Value *V) const {
+  if (BinaryOperator::isNeg(V))
+    return BinaryOperator::getNegArgument(V);
+
+  // Constants can be considered to be negated values if they can be folded.
+  if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+    return ConstantExpr::getNeg(C);
+
+  if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
+    if (C->getType()->getElementType()->isIntegerTy())
+      return ConstantExpr::getNeg(C);
+
+  return 0;
+}
+
+// dyn_castFNegVal - Given a 'fsub' instruction, return the RHS of the
+// instruction if the LHS is a constant negative zero (which is the 'negate'
+// form).
+//
+Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const {
+  if (BinaryOperator::isFNeg(V, IgnoreZeroSign))
+    return BinaryOperator::getFNegArgument(V);
+
+  // Constants can be considered to be negated values if they can be folded.
+  if (ConstantFP *C = dyn_cast<ConstantFP>(V))
+    return ConstantExpr::getFNeg(C);
+
+  if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
+    if (C->getType()->getElementType()->isFloatingPointTy())
+      return ConstantExpr::getFNeg(C);
+
+  return 0;
+}
+
+static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,
+                                             InstCombiner *IC) {
+  if (CastInst *CI = dyn_cast<CastInst>(&I)) {
+    return IC->Builder->CreateCast(CI->getOpcode(), SO, I.getType());
+  }
+
+  // Figure out if the constant is the left or the right argument.
+  bool ConstIsRHS = isa<Constant>(I.getOperand(1));
+  Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
+
+  if (Constant *SOC = dyn_cast<Constant>(SO)) {
+    if (ConstIsRHS)
+      return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
+    return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
+  }
+
+  Value *Op0 = SO, *Op1 = ConstOperand;
+  if (!ConstIsRHS)
+    std::swap(Op0, Op1);
+
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
+    return IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1,
+                                    SO->getName()+".op");
+  if (ICmpInst *CI = dyn_cast<ICmpInst>(&I))
+    return IC->Builder->CreateICmp(CI->getPredicate(), Op0, Op1,
+                                   SO->getName()+".cmp");
+  if (FCmpInst *CI = dyn_cast<FCmpInst>(&I))
+    return IC->Builder->CreateICmp(CI->getPredicate(), Op0, Op1,
+                                   SO->getName()+".cmp");
+  llvm_unreachable("Unknown binary instruction type!");
+}
+
+// FoldOpIntoSelect - Given an instruction with a select as one operand and a
+// constant as the other operand, try to fold the binary operator into the
+// select arguments.  This also works for Cast instructions, which obviously do
+// not have a second operand.
+Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
+  // Don't modify shared select instructions
+  if (!SI->hasOneUse()) return 0;
+  Value *TV = SI->getOperand(1);
+  Value *FV = SI->getOperand(2);
+
+  if (isa<Constant>(TV) || isa<Constant>(FV)) {
+    // Bool selects with constant operands can be folded to logical ops.
+    if (SI->getType()->isIntegerTy(1)) return 0;
+
+    // If it's a bitcast involving vectors, make sure it has the same number of
+    // elements on both sides.
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(&Op)) {
+      VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
+      VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
+
+      // Verify that either both or neither are vectors.
+      if ((SrcTy == NULL) != (DestTy == NULL)) return 0;
+      // If vectors, verify that they have the same number of elements.
+      if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements())
+        return 0;
+    }
+
+    Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this);
+    Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, this);
+
+    return SelectInst::Create(SI->getCondition(),
+                              SelectTrueVal, SelectFalseVal);
+  }
+  return 0;
+}
+
+
+/// FoldOpIntoPhi - Given a binary operator, cast instruction, or select which
+/// has a PHI node as operand #0, see if we can fold the instruction into the
+/// PHI (which is only possible if all operands to the PHI are constants).
+///
+Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
+  PHINode *PN = cast<PHINode>(I.getOperand(0));
+  unsigned NumPHIValues = PN->getNumIncomingValues();
+  if (NumPHIValues == 0)
+    return 0;
+
+  // We normally only transform phis with a single use.  However, if a PHI has
+  // multiple uses and they are all the same operation, we can fold *all* of the
+  // uses into the PHI.
+  if (!PN->hasOneUse()) {
+    // Walk the use list for the instruction, comparing them to I.
+    for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end();
+         UI != E; ++UI) {
+      Instruction *User = cast<Instruction>(*UI);
+      if (User != &I && !I.isIdenticalTo(User))
+        return 0;
+    }
+    // Otherwise, we can replace *all* users with the new PHI we form.
+  }
+
+  // Check to see if all of the operands of the PHI are simple constants
+  // (constantint/constantfp/undef).  If there is one non-constant value,
+  // remember the BB it is in.  If there is more than one or if *it* is a PHI,
+  // bail out.  We don't do arbitrary constant expressions here because moving
+  // their computation can be expensive without a cost model.
+  BasicBlock *NonConstBB = 0;
+  for (unsigned i = 0; i != NumPHIValues; ++i) {
+    Value *InVal = PN->getIncomingValue(i);
+    if (isa<Constant>(InVal) && !isa<ConstantExpr>(InVal))
+      continue;
+
+    if (isa<PHINode>(InVal)) return 0;  // Itself a phi.
+    if (NonConstBB) return 0;  // More than one non-const value.
+
+    NonConstBB = PN->getIncomingBlock(i);
+
+    // If the InVal is an invoke at the end of the pred block, then we can't
+    // insert a computation after it without breaking the edge.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(InVal))
+      if (II->getParent() == NonConstBB)
+        return 0;
+
+    // If the incoming non-constant value is in I's block, we will remove one
+    // instruction, but insert another equivalent one, leading to infinite
+    // instcombine.
+    if (NonConstBB == I.getParent())
+      return 0;
+  }
+
+  // If there is exactly one non-constant value, we can insert a copy of the
+  // operation in that block.  However, if this is a critical edge, we would be
+  // inserting the computation one some other paths (e.g. inside a loop).  Only
+  // do this if the pred block is unconditionally branching into the phi block.
+  if (NonConstBB != 0) {
+    BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
+    if (!BI || !BI->isUnconditional()) return 0;
+  }
+
+  // Okay, we can do the transformation: create the new PHI node.
+  PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
+  InsertNewInstBefore(NewPN, *PN);
+  NewPN->takeName(PN);
+
+  // If we are going to have to insert a new computation, do so right before the
+  // predecessors terminator.
+  if (NonConstBB)
+    Builder->SetInsertPoint(NonConstBB->getTerminator());
+
+  // Next, add all of the operands to the PHI.
+  if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
+    // We only currently try to fold the condition of a select when it is a phi,
+    // not the true/false values.
+    Value *TrueV = SI->getTrueValue();
+    Value *FalseV = SI->getFalseValue();
+    BasicBlock *PhiTransBB = PN->getParent();
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      BasicBlock *ThisBB = PN->getIncomingBlock(i);
+      Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
+      Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
+      Value *InV = 0;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+        InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
+      else
+        InV = Builder->CreateSelect(PN->getIncomingValue(i),
+                                    TrueVInPred, FalseVInPred, "phitmp");
+      NewPN->addIncoming(InV, ThisBB);
+    }
+  } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
+    Constant *C = cast<Constant>(I.getOperand(1));
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV = 0;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+        InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
+      else if (isa<ICmpInst>(CI))
+        InV = Builder->CreateICmp(CI->getPredicate(), PN->getIncomingValue(i),
+                                  C, "phitmp");
+      else
+        InV = Builder->CreateFCmp(CI->getPredicate(), PN->getIncomingValue(i),
+                                  C, "phitmp");
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  } else if (I.getNumOperands() == 2) {
+    Constant *C = cast<Constant>(I.getOperand(1));
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV = 0;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+        InV = ConstantExpr::get(I.getOpcode(), InC, C);
+      else
+        InV = Builder->CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
+                                   PN->getIncomingValue(i), C, "phitmp");
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  } else {
+    CastInst *CI = cast<CastInst>(&I);
+    Type *RetTy = CI->getType();
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+        InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
+      else
+        InV = Builder->CreateCast(CI->getOpcode(),
+                                PN->getIncomingValue(i), I.getType(), "phitmp");
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  }
+
+  for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end();
+       UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    if (User == &I) continue;
+    ReplaceInstUsesWith(*User, NewPN);
+    EraseInstFromFunction(*User);
+  }
+  return ReplaceInstUsesWith(I, NewPN);
+}
+
+/// FindElementAtOffset - Given a type and a constant offset, determine whether
+/// or not there is a sequence of GEP indices into the type that will land us at
+/// the specified offset.  If so, fill them into NewIndices and return the
+/// resultant element type, otherwise return null.
+Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
+                                          SmallVectorImpl<Value*> &NewIndices) {
+  if (!TD) return 0;
+  if (!Ty->isSized()) return 0;
+
+  // Start with the index over the outer type.  Note that the type size
+  // might be zero (even if the offset isn't zero) if the indexed type
+  // is something like [0 x {int, int}]
+  Type *IntPtrTy = TD->getIntPtrType(Ty->getContext());
+  int64_t FirstIdx = 0;
+  if (int64_t TySize = TD->getTypeAllocSize(Ty)) {
+    FirstIdx = Offset/TySize;
+    Offset -= FirstIdx*TySize;
+
+    // Handle hosts where % returns negative instead of values [0..TySize).
+    if (Offset < 0) {
+      --FirstIdx;
+      Offset += TySize;
+      assert(Offset >= 0);
+    }
+    assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
+  }
+
+  NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));
+
+  // Index into the types.  If we fail, set OrigBase to null.
+  while (Offset) {
+    // Indexing into tail padding between struct/array elements.
+    if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty))
+      return 0;
+
+    if (StructType *STy = dyn_cast<StructType>(Ty)) {
+      const StructLayout *SL = TD->getStructLayout(STy);
+      assert(Offset < (int64_t)SL->getSizeInBytes() &&
+             "Offset must stay within the indexed type");
+
+      unsigned Elt = SL->getElementContainingOffset(Offset);
+      NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
+                                            Elt));
+
+      Offset -= SL->getElementOffset(Elt);
+      Ty = STy->getElementType(Elt);
+    } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+      uint64_t EltSize = TD->getTypeAllocSize(AT->getElementType());
+      assert(EltSize && "Cannot index into a zero-sized array");
+      NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
+      Offset %= EltSize;
+      Ty = AT->getElementType();
+    } else {
+      // Otherwise, we can't index into the middle of this atomic type, bail.
+      return 0;
+    }
+  }
+
+  return Ty;
+}
+
+static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
+  // If this GEP has only 0 indices, it is the same pointer as
+  // Src. If Src is not a trivial GEP too, don't combine
+  // the indices.
+  if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() &&
+      !Src.hasOneUse())
+    return false;
+  return true;
+}
+
+/// Descale - Return a value X such that Val = X * Scale, or null if none.  If
+/// the multiplication is known not to overflow then NoSignedWrap is set.
+Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
+  assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
+  assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
+         Scale.getBitWidth() && "Scale not compatible with value!");
+
+  // If Val is zero or Scale is one then Val = Val * Scale.
+  if (match(Val, m_Zero()) || Scale == 1) {
+    NoSignedWrap = true;
+    return Val;
+  }
+
+  // If Scale is zero then it does not divide Val.
+  if (Scale.isMinValue())
+    return 0;
+
+  // Look through chains of multiplications, searching for a constant that is
+  // divisible by Scale.  For example, descaling X*(Y*(Z*4)) by a factor of 4
+  // will find the constant factor 4 and produce X*(Y*Z).  Descaling X*(Y*8) by
+  // a factor of 4 will produce X*(Y*2).  The principle of operation is to bore
+  // down from Val:
+  //
+  //     Val = M1 * X          ||   Analysis starts here and works down
+  //      M1 = M2 * Y          ||   Doesn't descend into terms with more
+  //      M2 =  Z * 4          \/   than one use
+  //
+  // Then to modify a term at the bottom:
+  //
+  //     Val = M1 * X
+  //      M1 =  Z * Y          ||   Replaced M2 with Z
+  //
+  // Then to work back up correcting nsw flags.
+
+  // Op - the term we are currently analyzing.  Starts at Val then drills down.
+  // Replaced with its descaled value before exiting from the drill down loop.
+  Value *Op = Val;
+
+  // Parent - initially null, but after drilling down notes where Op came from.
+  // In the example above, Parent is (Val, 0) when Op is M1, because M1 is the
+  // 0'th operand of Val.
+  std::pair<Instruction*, unsigned> Parent;
+
+  // RequireNoSignedWrap - Set if the transform requires a descaling at deeper
+  // levels that doesn't overflow.
+  bool RequireNoSignedWrap = false;
+
+  // logScale - log base 2 of the scale.  Negative if not a power of 2.
+  int32_t logScale = Scale.exactLogBase2();
+
+  for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down
+
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+      // If Op is a constant divisible by Scale then descale to the quotient.
+      APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth.
+      APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder);
+      if (!Remainder.isMinValue())
+        // Not divisible by Scale.
+        return 0;
+      // Replace with the quotient in the parent.
+      Op = ConstantInt::get(CI->getType(), Quotient);
+      NoSignedWrap = true;
+      break;
+    }
+
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) {
+
+      if (BO->getOpcode() == Instruction::Mul) {
+        // Multiplication.
+        NoSignedWrap = BO->hasNoSignedWrap();
+        if (RequireNoSignedWrap && !NoSignedWrap)
+          return 0;
+
+        // There are three cases for multiplication: multiplication by exactly
+        // the scale, multiplication by a constant different to the scale, and
+        // multiplication by something else.
+        Value *LHS = BO->getOperand(0);
+        Value *RHS = BO->getOperand(1);
+
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+          // Multiplication by a constant.
+          if (CI->getValue() == Scale) {
+            // Multiplication by exactly the scale, replace the multiplication
+            // by its left-hand side in the parent.
+            Op = LHS;
+            break;
+          }
+
+          // Otherwise drill down into the constant.
+          if (!Op->hasOneUse())
+            return 0;
+
+          Parent = std::make_pair(BO, 1);
+          continue;
+        }
+
+        // Multiplication by something else. Drill down into the left-hand side
+        // since that's where the reassociate pass puts the good stuff.
+        if (!Op->hasOneUse())
+          return 0;
+
+        Parent = std::make_pair(BO, 0);
+        continue;
+      }
+
+      if (logScale > 0 && BO->getOpcode() == Instruction::Shl &&
+          isa<ConstantInt>(BO->getOperand(1))) {
+        // Multiplication by a power of 2.
+        NoSignedWrap = BO->hasNoSignedWrap();
+        if (RequireNoSignedWrap && !NoSignedWrap)
+          return 0;
+
+        Value *LHS = BO->getOperand(0);
+        int32_t Amt = cast<ConstantInt>(BO->getOperand(1))->
+          getLimitedValue(Scale.getBitWidth());
+        // Op = LHS << Amt.
+
+        if (Amt == logScale) {
+          // Multiplication by exactly the scale, replace the multiplication
+          // by its left-hand side in the parent.
+          Op = LHS;
+          break;
+        }
+        if (Amt < logScale || !Op->hasOneUse())
+          return 0;
+
+        // Multiplication by more than the scale.  Reduce the multiplying amount
+        // by the scale in the parent.
+        Parent = std::make_pair(BO, 1);
+        Op = ConstantInt::get(BO->getType(), Amt - logScale);
+        break;
+      }
+    }
+
+    if (!Op->hasOneUse())
+      return 0;
+
+    if (CastInst *Cast = dyn_cast<CastInst>(Op)) {
+      if (Cast->getOpcode() == Instruction::SExt) {
+        // Op is sign-extended from a smaller type, descale in the smaller type.
+        unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+        APInt SmallScale = Scale.trunc(SmallSize);
+        // Suppose Op = sext X, and we descale X as Y * SmallScale.  We want to
+        // descale Op as (sext Y) * Scale.  In order to have
+        //   sext (Y * SmallScale) = (sext Y) * Scale
+        // some conditions need to hold however: SmallScale must sign-extend to
+        // Scale and the multiplication Y * SmallScale should not overflow.
+        if (SmallScale.sext(Scale.getBitWidth()) != Scale)
+          // SmallScale does not sign-extend to Scale.
+          return 0;
+        assert(SmallScale.exactLogBase2() == logScale);
+        // Require that Y * SmallScale must not overflow.
+        RequireNoSignedWrap = true;
+
+        // Drill down through the cast.
+        Parent = std::make_pair(Cast, 0);
+        Scale = SmallScale;
+        continue;
+      }
+
+      if (Cast->getOpcode() == Instruction::Trunc) {
+        // Op is truncated from a larger type, descale in the larger type.
+        // Suppose Op = trunc X, and we descale X as Y * sext Scale.  Then
+        //   trunc (Y * sext Scale) = (trunc Y) * Scale
+        // always holds.  However (trunc Y) * Scale may overflow even if
+        // trunc (Y * sext Scale) does not, so nsw flags need to be cleared
+        // from this point up in the expression (see later).
+        if (RequireNoSignedWrap)
+          return 0;
+
+        // Drill down through the cast.
+        unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+        Parent = std::make_pair(Cast, 0);
+        Scale = Scale.sext(LargeSize);
+        if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits())
+          logScale = -1;
+        assert(Scale.exactLogBase2() == logScale);
+        continue;
+      }
+    }
+
+    // Unsupported expression, bail out.
+    return 0;
+  }
+
+  // We know that we can successfully descale, so from here on we can safely
+  // modify the IR.  Op holds the descaled version of the deepest term in the
+  // expression.  NoSignedWrap is 'true' if multiplying Op by Scale is known
+  // not to overflow.
+
+  if (!Parent.first)
+    // The expression only had one term.
+    return Op;
+
+  // Rewrite the parent using the descaled version of its operand.
+  assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
+  assert(Op != Parent.first->getOperand(Parent.second) &&
+         "Descaling was a no-op?");
+  Parent.first->setOperand(Parent.second, Op);
+  Worklist.Add(Parent.first);
+
+  // Now work back up the expression correcting nsw flags.  The logic is based
+  // on the following observation: if X * Y is known not to overflow as a signed
+  // multiplication, and Y is replaced by a value Z with smaller absolute value,
+  // then X * Z will not overflow as a signed multiplication either.  As we work
+  // our way up, having NoSignedWrap 'true' means that the descaled value at the
+  // current level has strictly smaller absolute value than the original.
+  Instruction *Ancestor = Parent.first;
+  do {
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) {
+      // If the multiplication wasn't nsw then we can't say anything about the
+      // value of the descaled multiplication, and we have to clear nsw flags
+      // from this point on up.
+      bool OpNoSignedWrap = BO->hasNoSignedWrap();
+      NoSignedWrap &= OpNoSignedWrap;
+      if (NoSignedWrap != OpNoSignedWrap) {
+        BO->setHasNoSignedWrap(NoSignedWrap);
+        Worklist.Add(Ancestor);
+      }
+    } else if (Ancestor->getOpcode() == Instruction::Trunc) {
+      // The fact that the descaled input to the trunc has smaller absolute
+      // value than the original input doesn't tell us anything useful about
+      // the absolute values of the truncations.
+      NoSignedWrap = false;
+    }
+    assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) &&
+           "Failed to keep proper track of nsw flags while drilling down?");
+
+    if (Ancestor == Val)
+      // Got to the top, all done!
+      return Val;
+
+    // Move up one level in the expression.
+    assert(Ancestor->hasOneUse() && "Drilled down when more than one use!");
+    Ancestor = Ancestor->use_back();
+  } while (1);
+}
+
+Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+  SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
+
+  if (Value *V = SimplifyGEPInst(Ops, TD))
+    return ReplaceInstUsesWith(GEP, V);
+
+  Value *PtrOp = GEP.getOperand(0);
+
+  // Eliminate unneeded casts for indices, and replace indices which displace
+  // by multiples of a zero size type with zero.
+  if (TD) {
+    bool MadeChange = false;
+    Type *IntPtrTy = TD->getIntPtrType(GEP.getPointerOperandType());
+
+    gep_type_iterator GTI = gep_type_begin(GEP);
+    for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end();
+         I != E; ++I, ++GTI) {
+      // Skip indices into struct types.
+      SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI);
+      if (!SeqTy) continue;
+
+      // If the element type has zero size then any index over it is equivalent
+      // to an index of zero, so replace it with zero if it is not zero already.
+      if (SeqTy->getElementType()->isSized() &&
+          TD->getTypeAllocSize(SeqTy->getElementType()) == 0)
+        if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) {
+          *I = Constant::getNullValue(IntPtrTy);
+          MadeChange = true;
+        }
+
+      Type *IndexTy = (*I)->getType();
+      if (IndexTy != IntPtrTy) {
+        // If we are using a wider index than needed for this platform, shrink
+        // it to what we need.  If narrower, sign-extend it to what we need.
+        // This explicit cast can make subsequent optimizations more obvious.
+        *I = Builder->CreateIntCast(*I, IntPtrTy, true);
+        MadeChange = true;
+      }
+    }
+    if (MadeChange) return &GEP;
+  }
+
+  // Combine Indices - If the source pointer to this getelementptr instruction
+  // is a getelementptr instruction, combine the indices of the two
+  // getelementptr instructions into a single instruction.
+  //
+  if (GEPOperator *Src = dyn_cast<GEPOperator>(PtrOp)) {
+    if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
+      return 0;
+
+    // Note that if our source is a gep chain itself then we wait for that
+    // chain to be resolved before we perform this transformation.  This
+    // avoids us creating a TON of code in some cases.
+    if (GEPOperator *SrcGEP =
+          dyn_cast<GEPOperator>(Src->getOperand(0)))
+      if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
+        return 0;   // Wait until our source is folded to completion.
+
+    SmallVector<Value*, 8> Indices;
+
+    // Find out whether the last index in the source GEP is a sequential idx.
+    bool EndsWithSequential = false;
+    for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
+         I != E; ++I)
+      EndsWithSequential = !(*I)->isStructTy();
+
+    // Can we combine the two pointer arithmetics offsets?
+    if (EndsWithSequential) {
+      // Replace: gep (gep %P, long B), long A, ...
+      // With:    T = long A+B; gep %P, T, ...
+      //
+      Value *Sum;
+      Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
+      Value *GO1 = GEP.getOperand(1);
+      if (SO1 == Constant::getNullValue(SO1->getType())) {
+        Sum = GO1;
+      } else if (GO1 == Constant::getNullValue(GO1->getType())) {
+        Sum = SO1;
+      } else {
+        // If they aren't the same type, then the input hasn't been processed
+        // by the loop above yet (which canonicalizes sequential index types to
+        // intptr_t).  Just avoid transforming this until the input has been
+        // normalized.
+        if (SO1->getType() != GO1->getType())
+          return 0;
+        Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum");
+      }
+
+      // Update the GEP in place if possible.
+      if (Src->getNumOperands() == 2) {
+        GEP.setOperand(0, Src->getOperand(0));
+        GEP.setOperand(1, Sum);
+        return &GEP;
+      }
+      Indices.append(Src->op_begin()+1, Src->op_end()-1);
+      Indices.push_back(Sum);
+      Indices.append(GEP.op_begin()+2, GEP.op_end());
+    } else if (isa<Constant>(*GEP.idx_begin()) &&
+               cast<Constant>(*GEP.idx_begin())->isNullValue() &&
+               Src->getNumOperands() != 1) {
+      // Otherwise we can do the fold if the first index of the GEP is a zero
+      Indices.append(Src->op_begin()+1, Src->op_end());
+      Indices.append(GEP.idx_begin()+1, GEP.idx_end());
+    }
+
+    if (!Indices.empty())
+      return (GEP.isInBounds() && Src->isInBounds()) ?
+        GetElementPtrInst::CreateInBounds(Src->getOperand(0), Indices,
+                                          GEP.getName()) :
+        GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName());
+  }
+
+  // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
+  Value *StrippedPtr = PtrOp->stripPointerCasts();
+  PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType());
+
+  // We do not handle pointer-vector geps here.
+  if (!StrippedPtrTy)
+    return 0;
+
+  if (StrippedPtr != PtrOp &&
+    StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) {
+
+    bool HasZeroPointerIndex = false;
+    if (ConstantInt *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
+      HasZeroPointerIndex = C->isZero();
+
+    // Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
+    // into     : GEP [10 x i8]* X, i32 0, ...
+    //
+    // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
+    //           into     : GEP i8* X, ...
+    //
+    // This occurs when the program declares an array extern like "int X[];"
+    if (HasZeroPointerIndex) {
+      PointerType *CPTy = cast<PointerType>(PtrOp->getType());
+      if (ArrayType *CATy =
+          dyn_cast<ArrayType>(CPTy->getElementType())) {
+        // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
+        if (CATy->getElementType() == StrippedPtrTy->getElementType()) {
+          // -> GEP i8* X, ...
+          SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end());
+          GetElementPtrInst *Res =
+            GetElementPtrInst::Create(StrippedPtr, Idx, GEP.getName());
+          Res->setIsInBounds(GEP.isInBounds());
+          return Res;
+        }
+
+        if (ArrayType *XATy =
+              dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){
+          // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
+          if (CATy->getElementType() == XATy->getElementType()) {
+            // -> GEP [10 x i8]* X, i32 0, ...
+            // At this point, we know that the cast source type is a pointer
+            // to an array of the same type as the destination pointer
+            // array.  Because the array type is never stepped over (there
+            // is a leading zero) we can fold the cast into this GEP.
+            GEP.setOperand(0, StrippedPtr);
+            return &GEP;
+          }
+        }
+      }
+    } else if (GEP.getNumOperands() == 2) {
+      // Transform things like:
+      // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
+      // into:  %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
+      Type *SrcElTy = StrippedPtrTy->getElementType();
+      Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
+      if (TD && SrcElTy->isArrayTy() &&
+          TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
+          TD->getTypeAllocSize(ResElTy)) {
+        Value *Idx[2];
+        Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
+        Idx[1] = GEP.getOperand(1);
+        Value *NewGEP = GEP.isInBounds() ?
+          Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) :
+          Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
+        // V and GEP are both pointer types --> BitCast
+        return new BitCastInst(NewGEP, GEP.getType());
+      }
+
+      // Transform things like:
+      // %V = mul i64 %N, 4
+      // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
+      // into:  %t1 = getelementptr i32* %arr, i32 %N; bitcast
+      if (TD && ResElTy->isSized() && SrcElTy->isSized()) {
+        // Check that changing the type amounts to dividing the index by a scale
+        // factor.
+        uint64_t ResSize = TD->getTypeAllocSize(ResElTy);
+        uint64_t SrcSize = TD->getTypeAllocSize(SrcElTy);
+        if (ResSize && SrcSize % ResSize == 0) {
+          Value *Idx = GEP.getOperand(1);
+          unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+          uint64_t Scale = SrcSize / ResSize;
+
+          // Earlier transforms ensure that the index has type IntPtrType, which
+          // considerably simplifies the logic by eliminating implicit casts.
+          assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+                 "Index not cast to pointer width?");
+
+          bool NSW;
+          if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+            // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+            // If the multiplication NewIdx * Scale may overflow then the new
+            // GEP may not be "inbounds".
+            Value *NewGEP = GEP.isInBounds() && NSW ?
+              Builder->CreateInBoundsGEP(StrippedPtr, NewIdx, GEP.getName()) :
+              Builder->CreateGEP(StrippedPtr, NewIdx, GEP.getName());
+            // The NewGEP must be pointer typed, so must the old one -> BitCast
+            return new BitCastInst(NewGEP, GEP.getType());
+          }
+        }
+      }
+
+      // Similarly, transform things like:
+      // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
+      //   (where tmp = 8*tmp2) into:
+      // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
+      if (TD && ResElTy->isSized() && SrcElTy->isSized() &&
+          SrcElTy->isArrayTy()) {
+        // Check that changing to the array element type amounts to dividing the
+        // index by a scale factor.
+        uint64_t ResSize = TD->getTypeAllocSize(ResElTy);
+        uint64_t ArrayEltSize =
+          TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType());
+        if (ResSize && ArrayEltSize % ResSize == 0) {
+          Value *Idx = GEP.getOperand(1);
+          unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+          uint64_t Scale = ArrayEltSize / ResSize;
+
+          // Earlier transforms ensure that the index has type IntPtrType, which
+          // considerably simplifies the logic by eliminating implicit casts.
+          assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+                 "Index not cast to pointer width?");
+
+          bool NSW;
+          if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+            // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+            // If the multiplication NewIdx * Scale may overflow then the new
+            // GEP may not be "inbounds".
+            Value *Off[2];
+            Off[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
+            Off[1] = NewIdx;
+            Value *NewGEP = GEP.isInBounds() && NSW ?
+              Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) :
+              Builder->CreateGEP(StrippedPtr, Off, GEP.getName());
+            // The NewGEP must be pointer typed, so must the old one -> BitCast
+            return new BitCastInst(NewGEP, GEP.getType());
+          }
+        }
+      }
+    }
+  }
+
+  /// See if we can simplify:
+  ///   X = bitcast A* to B*
+  ///   Y = gep X, <...constant indices...>
+  /// into a gep of the original struct.  This is important for SROA and alias
+  /// analysis of unions.  If "A" is also a bitcast, wait for A/X to be merged.
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
+    APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
+    if (TD &&
+        !isa<BitCastInst>(BCI->getOperand(0)) &&
+        GEP.accumulateConstantOffset(*TD, Offset) &&
+        StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) {
+
+      // If this GEP instruction doesn't move the pointer, just replace the GEP
+      // with a bitcast of the real input to the dest type.
+      if (!Offset) {
+        // If the bitcast is of an allocation, and the allocation will be
+        // converted to match the type of the cast, don't touch this.
+        if (isa<AllocaInst>(BCI->getOperand(0)) ||
+            isAllocationFn(BCI->getOperand(0), TLI)) {
+          // See if the bitcast simplifies, if so, don't nuke this GEP yet.
+          if (Instruction *I = visitBitCast(*BCI)) {
+            if (I != BCI) {
+              I->takeName(BCI);
+              BCI->getParent()->getInstList().insert(BCI, I);
+              ReplaceInstUsesWith(*BCI, I);
+            }
+            return &GEP;
+          }
+        }
+        return new BitCastInst(BCI->getOperand(0), GEP.getType());
+      }
+
+      // Otherwise, if the offset is non-zero, we need to find out if there is a
+      // field at Offset in 'A's type.  If so, we can pull the cast through the
+      // GEP.
+      SmallVector<Value*, 8> NewIndices;
+      Type *InTy =
+        cast<PointerType>(BCI->getOperand(0)->getType())->getElementType();
+      if (FindElementAtOffset(InTy, Offset.getSExtValue(), NewIndices)) {
+        Value *NGEP = GEP.isInBounds() ?
+          Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) :
+          Builder->CreateGEP(BCI->getOperand(0), NewIndices);
+
+        if (NGEP->getType() == GEP.getType())
+          return ReplaceInstUsesWith(GEP, NGEP);
+        NGEP->takeName(&GEP);
+        return new BitCastInst(NGEP, GEP.getType());
+      }
+    }
+  }
+
+  return 0;
+}
+
+
+
+static bool
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
+                     const TargetLibraryInfo *TLI) {
+  SmallVector<Instruction*, 4> Worklist;
+  Worklist.push_back(AI);
+
+  do {
+    Instruction *PI = Worklist.pop_back_val();
+    for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); UI != UE;
+         ++UI) {
+      Instruction *I = cast<Instruction>(*UI);
+      switch (I->getOpcode()) {
+      default:
+        // Give up the moment we see something we can't handle.
+        return false;
+
+      case Instruction::BitCast:
+      case Instruction::GetElementPtr:
+        Users.push_back(I);
+        Worklist.push_back(I);
+        continue;
+
+      case Instruction::ICmp: {
+        ICmpInst *ICI = cast<ICmpInst>(I);
+        // We can fold eq/ne comparisons with null to false/true, respectively.
+        if (!ICI->isEquality() || !isa<ConstantPointerNull>(ICI->getOperand(1)))
+          return false;
+        Users.push_back(I);
+        continue;
+      }
+
+      case Instruction::Call:
+        // Ignore no-op and store intrinsics.
+        if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+          switch (II->getIntrinsicID()) {
+          default:
+            return false;
+
+          case Intrinsic::memmove:
+          case Intrinsic::memcpy:
+          case Intrinsic::memset: {
+            MemIntrinsic *MI = cast<MemIntrinsic>(II);
+            if (MI->isVolatile() || MI->getRawDest() != PI)
+              return false;
+          }
+          // fall through
+          case Intrinsic::dbg_declare:
+          case Intrinsic::dbg_value:
+          case Intrinsic::invariant_start:
+          case Intrinsic::invariant_end:
+          case Intrinsic::lifetime_start:
+          case Intrinsic::lifetime_end:
+          case Intrinsic::objectsize:
+            Users.push_back(I);
+            continue;
+          }
+        }
+
+        if (isFreeCall(I, TLI)) {
+          Users.push_back(I);
+          continue;
+        }
+        return false;
+
+      case Instruction::Store: {
+        StoreInst *SI = cast<StoreInst>(I);
+        if (SI->isVolatile() || SI->getPointerOperand() != PI)
+          return false;
+        Users.push_back(I);
+        continue;
+      }
+      }
+      llvm_unreachable("missing a return?");
+    }
+  } while (!Worklist.empty());
+  return true;
+}
+
+Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
+  // If we have a malloc call which is only used in any amount of comparisons
+  // to null and free calls, delete the calls and replace the comparisons with
+  // true or false as appropriate.
+  SmallVector<WeakVH, 64> Users;
+  if (isAllocSiteRemovable(&MI, Users, TLI)) {
+    for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+      Instruction *I = cast_or_null<Instruction>(&*Users[i]);
+      if (!I) continue;
+
+      if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
+        ReplaceInstUsesWith(*C,
+                            ConstantInt::get(Type::getInt1Ty(C->getContext()),
+                                             C->isFalseWhenEqual()));
+      } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
+        ReplaceInstUsesWith(*I, UndefValue::get(I->getType()));
+      } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+        if (II->getIntrinsicID() == Intrinsic::objectsize) {
+          ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
+          uint64_t DontKnow = CI->isZero() ? -1ULL : 0;
+          ReplaceInstUsesWith(*I, ConstantInt::get(I->getType(), DontKnow));
+        }
+      }
+      EraseInstFromFunction(*I);
+    }
+
+    if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
+      // Replace invoke with a NOP intrinsic to maintain the original CFG
+      Module *M = II->getParent()->getParent()->getParent();
+      Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
+      InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
+                         ArrayRef<Value *>(), "", II->getParent());
+    }
+    return EraseInstFromFunction(MI);
+  }
+  return 0;
+}
+
+/// \brief Move the call to free before a NULL test.
+///
+/// Check if this free is accessed after its argument has been test
+/// against NULL (property 0).
+/// If yes, it is legal to move this call in its predecessor block.
+///
+/// The move is performed only if the block containing the call to free
+/// will be removed, i.e.:
+/// 1. it has only one predecessor P, and P has two successors
+/// 2. it contains the call and an unconditional branch
+/// 3. its successor is the same as its predecessor's successor
+///
+/// The profitability is out-of concern here and this function should
+/// be called only if the caller knows this transformation would be
+/// profitable (e.g., for code size).
+static Instruction *
+tryToMoveFreeBeforeNullTest(CallInst &FI) {
+  Value *Op = FI.getArgOperand(0);
+  BasicBlock *FreeInstrBB = FI.getParent();
+  BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
+
+  // Validate part of constraint #1: Only one predecessor
+  // FIXME: We can extend the number of predecessor, but in that case, we
+  //        would duplicate the call to free in each predecessor and it may
+  //        not be profitable even for code size.
+  if (!PredBB)
+    return 0;
+
+  // Validate constraint #2: Does this block contains only the call to
+  //                         free and an unconditional branch?
+  // FIXME: We could check if we can speculate everything in the
+  //        predecessor block
+  if (FreeInstrBB->size() != 2)
+    return 0;
+  BasicBlock *SuccBB;
+  if (!match(FreeInstrBB->getTerminator(), m_UnconditionalBr(SuccBB)))
+    return 0;
+
+  // Validate the rest of constraint #1 by matching on the pred branch.
+  TerminatorInst *TI = PredBB->getTerminator();
+  BasicBlock *TrueBB, *FalseBB;
+  ICmpInst::Predicate Pred;
+  if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB)))
+    return 0;
+  if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
+    return 0;
+
+  // Validate constraint #3: Ensure the null case just falls through.
+  if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
+    return 0;
+  assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
+         "Broken CFG: missing edge from predecessor to successor");
+
+  FI.moveBefore(TI);
+  return &FI;
+}
+
+
+Instruction *InstCombiner::visitFree(CallInst &FI) {
+  Value *Op = FI.getArgOperand(0);
+
+  // free undef -> unreachable.
+  if (isa<UndefValue>(Op)) {
+    // Insert a new store to null because we cannot modify the CFG here.
+    Builder->CreateStore(ConstantInt::getTrue(FI.getContext()),
+                         UndefValue::get(Type::getInt1PtrTy(FI.getContext())));
+    return EraseInstFromFunction(FI);
+  }
+
+  // If we have 'free null' delete the instruction.  This can happen in stl code
+  // when lots of inlining happens.
+  if (isa<ConstantPointerNull>(Op))
+    return EraseInstFromFunction(FI);
+
+  // If we optimize for code size, try to move the call to free before the null
+  // test so that simplify cfg can remove the empty block and dead code
+  // elimination the branch. I.e., helps to turn something like:
+  // if (foo) free(foo);
+  // into
+  // free(foo);
+  if (MinimizeSize)
+    if (Instruction *I = tryToMoveFreeBeforeNullTest(FI))
+      return I;
+
+  return 0;
+}
+
+
+
+Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
+  // Change br (not X), label True, label False to: br X, label False, True
+  Value *X = 0;
+  BasicBlock *TrueDest;
+  BasicBlock *FalseDest;
+  if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) &&
+      !isa<Constant>(X)) {
+    // Swap Destinations and condition...
+    BI.setCondition(X);
+    BI.swapSuccessors();
+    return &BI;
+  }
+
+  // Cannonicalize fcmp_one -> fcmp_oeq
+  FCmpInst::Predicate FPred; Value *Y;
+  if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
+                             TrueDest, FalseDest)) &&
+      BI.getCondition()->hasOneUse())
+    if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
+        FPred == FCmpInst::FCMP_OGE) {
+      FCmpInst *Cond = cast<FCmpInst>(BI.getCondition());
+      Cond->setPredicate(FCmpInst::getInversePredicate(FPred));
+
+      // Swap Destinations and condition.
+      BI.swapSuccessors();
+      Worklist.Add(Cond);
+      return &BI;
+    }
+
+  // Cannonicalize icmp_ne -> icmp_eq
+  ICmpInst::Predicate IPred;
+  if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
+                      TrueDest, FalseDest)) &&
+      BI.getCondition()->hasOneUse())
+    if (IPred == ICmpInst::ICMP_NE  || IPred == ICmpInst::ICMP_ULE ||
+        IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
+        IPred == ICmpInst::ICMP_SGE) {
+      ICmpInst *Cond = cast<ICmpInst>(BI.getCondition());
+      Cond->setPredicate(ICmpInst::getInversePredicate(IPred));
+      // Swap Destinations and condition.
+      BI.swapSuccessors();
+      Worklist.Add(Cond);
+      return &BI;
+    }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
+  Value *Cond = SI.getCondition();
+  if (Instruction *I = dyn_cast<Instruction>(Cond)) {
+    if (I->getOpcode() == Instruction::Add)
+      if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+        // change 'switch (X+4) case 1:' into 'switch (X) case -3'
+        // Skip the first item since that's the default case.
+        for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end();
+             i != e; ++i) {
+          ConstantInt* CaseVal = i.getCaseValue();
+          Constant* NewCaseVal = ConstantExpr::getSub(cast<Constant>(CaseVal),
+                                                      AddRHS);
+          assert(isa<ConstantInt>(NewCaseVal) &&
+                 "Result of expression should be constant");
+          i.setValue(cast<ConstantInt>(NewCaseVal));
+        }
+        SI.setCondition(I->getOperand(0));
+        Worklist.Add(I);
+        return &SI;
+      }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
+  Value *Agg = EV.getAggregateOperand();
+
+  if (!EV.hasIndices())
+    return ReplaceInstUsesWith(EV, Agg);
+
+  if (Constant *C = dyn_cast<Constant>(Agg)) {
+    if (Constant *C2 = C->getAggregateElement(*EV.idx_begin())) {
+      if (EV.getNumIndices() == 0)
+        return ReplaceInstUsesWith(EV, C2);
+      // Extract the remaining indices out of the constant indexed by the
+      // first index
+      return ExtractValueInst::Create(C2, EV.getIndices().slice(1));
+    }
+    return 0; // Can't handle other constants
+  }
+
+  if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
+    // We're extracting from an insertvalue instruction, compare the indices
+    const unsigned *exti, *exte, *insi, *inse;
+    for (exti = EV.idx_begin(), insi = IV->idx_begin(),
+         exte = EV.idx_end(), inse = IV->idx_end();
+         exti != exte && insi != inse;
+         ++exti, ++insi) {
+      if (*insi != *exti)
+        // The insert and extract both reference distinctly different elements.
+        // This means the extract is not influenced by the insert, and we can
+        // replace the aggregate operand of the extract with the aggregate
+        // operand of the insert. i.e., replace
+        // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+        // %E = extractvalue { i32, { i32 } } %I, 0
+        // with
+        // %E = extractvalue { i32, { i32 } } %A, 0
+        return ExtractValueInst::Create(IV->getAggregateOperand(),
+                                        EV.getIndices());
+    }
+    if (exti == exte && insi == inse)
+      // Both iterators are at the end: Index lists are identical. Replace
+      // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+      // %C = extractvalue { i32, { i32 } } %B, 1, 0
+      // with "i32 42"
+      return ReplaceInstUsesWith(EV, IV->getInsertedValueOperand());
+    if (exti == exte) {
+      // The extract list is a prefix of the insert list. i.e. replace
+      // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+      // %E = extractvalue { i32, { i32 } } %I, 1
+      // with
+      // %X = extractvalue { i32, { i32 } } %A, 1
+      // %E = insertvalue { i32 } %X, i32 42, 0
+      // by switching the order of the insert and extract (though the
+      // insertvalue should be left in, since it may have other uses).
+      Value *NewEV = Builder->CreateExtractValue(IV->getAggregateOperand(),
+                                                 EV.getIndices());
+      return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
+                                     makeArrayRef(insi, inse));
+    }
+    if (insi == inse)
+      // The insert list is a prefix of the extract list
+      // We can simply remove the common indices from the extract and make it
+      // operate on the inserted value instead of the insertvalue result.
+      // i.e., replace
+      // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+      // %E = extractvalue { i32, { i32 } } %I, 1, 0
+      // with
+      // %E extractvalue { i32 } { i32 42 }, 0
+      return ExtractValueInst::Create(IV->getInsertedValueOperand(),
+                                      makeArrayRef(exti, exte));
+  }
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) {
+    // We're extracting from an intrinsic, see if we're the only user, which
+    // allows us to simplify multiple result intrinsics to simpler things that
+    // just get one value.
+    if (II->hasOneUse()) {
+      // Check if we're grabbing the overflow bit or the result of a 'with
+      // overflow' intrinsic.  If it's the latter we can remove the intrinsic
+      // and replace it with a traditional binary instruction.
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::uadd_with_overflow:
+      case Intrinsic::sadd_with_overflow:
+        if (*EV.idx_begin() == 0) {  // Normal result.
+          Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+          ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
+          EraseInstFromFunction(*II);
+          return BinaryOperator::CreateAdd(LHS, RHS);
+        }
+
+        // If the normal result of the add is dead, and the RHS is a constant,
+        // we can transform this into a range comparison.
+        // overflow = uadd a, -4  -->  overflow = icmp ugt a, 3
+        if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow)
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(II->getArgOperand(1)))
+            return new ICmpInst(ICmpInst::ICMP_UGT, II->getArgOperand(0),
+                                ConstantExpr::getNot(CI));
+        break;
+      case Intrinsic::usub_with_overflow:
+      case Intrinsic::ssub_with_overflow:
+        if (*EV.idx_begin() == 0) {  // Normal result.
+          Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+          ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
+          EraseInstFromFunction(*II);
+          return BinaryOperator::CreateSub(LHS, RHS);
+        }
+        break;
+      case Intrinsic::umul_with_overflow:
+      case Intrinsic::smul_with_overflow:
+        if (*EV.idx_begin() == 0) {  // Normal result.
+          Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
+          ReplaceInstUsesWith(*II, UndefValue::get(II->getType()));
+          EraseInstFromFunction(*II);
+          return BinaryOperator::CreateMul(LHS, RHS);
+        }
+        break;
+      default:
+        break;
+      }
+    }
+  }
+  if (LoadInst *L = dyn_cast<LoadInst>(Agg))
+    // If the (non-volatile) load only has one use, we can rewrite this to a
+    // load from a GEP. This reduces the size of the load.
+    // FIXME: If a load is used only by extractvalue instructions then this
+    //        could be done regardless of having multiple uses.
+    if (L->isSimple() && L->hasOneUse()) {
+      // extractvalue has integer indices, getelementptr has Value*s. Convert.
+      SmallVector<Value*, 4> Indices;
+      // Prefix an i32 0 since we need the first element.
+      Indices.push_back(Builder->getInt32(0));
+      for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end();
+            I != E; ++I)
+        Indices.push_back(Builder->getInt32(*I));
+
+      // We need to insert these at the location of the old load, not at that of
+      // the extractvalue.
+      Builder->SetInsertPoint(L->getParent(), L);
+      Value *GEP = Builder->CreateInBoundsGEP(L->getPointerOperand(), Indices);
+      // Returning the load directly will cause the main loop to insert it in
+      // the wrong spot, so use ReplaceInstUsesWith().
+      return ReplaceInstUsesWith(EV, Builder->CreateLoad(GEP));
+    }
+  // We could simplify extracts from other values. Note that nested extracts may
+  // already be simplified implicitly by the above: extract (extract (insert) )
+  // will be translated into extract ( insert ( extract ) ) first and then just
+  // the value inserted, if appropriate. Similarly for extracts from single-use
+  // loads: extract (extract (load)) will be translated to extract (load (gep))
+  // and if again single-use then via load (gep (gep)) to load (gep).
+  // However, double extracts from e.g. function arguments or return values
+  // aren't handled yet.
+  return 0;
+}
+
+enum Personality_Type {
+  Unknown_Personality,
+  GNU_Ada_Personality,
+  GNU_CXX_Personality,
+  GNU_ObjC_Personality
+};
+
+/// RecognizePersonality - See if the given exception handling personality
+/// function is one that we understand.  If so, return a description of it;
+/// otherwise return Unknown_Personality.
+static Personality_Type RecognizePersonality(Value *Pers) {
+  Function *F = dyn_cast<Function>(Pers->stripPointerCasts());
+  if (!F)
+    return Unknown_Personality;
+  return StringSwitch<Personality_Type>(F->getName())
+    .Case("__gnat_eh_personality", GNU_Ada_Personality)
+    .Case("__gxx_personality_v0",  GNU_CXX_Personality)
+    .Case("__objc_personality_v0", GNU_ObjC_Personality)
+    .Default(Unknown_Personality);
+}
+
+/// isCatchAll - Return 'true' if the given typeinfo will match anything.
+static bool isCatchAll(Personality_Type Personality, Constant *TypeInfo) {
+  switch (Personality) {
+  case Unknown_Personality:
+    return false;
+  case GNU_Ada_Personality:
+    // While __gnat_all_others_value will match any Ada exception, it doesn't
+    // match foreign exceptions (or didn't, before gcc-4.7).
+    return false;
+  case GNU_CXX_Personality:
+  case GNU_ObjC_Personality:
+    return TypeInfo->isNullValue();
+  }
+  llvm_unreachable("Unknown personality!");
+}
+
+static bool shorter_filter(const Value *LHS, const Value *RHS) {
+  return
+    cast<ArrayType>(LHS->getType())->getNumElements()
+  <
+    cast<ArrayType>(RHS->getType())->getNumElements();
+}
+
+Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
+  // The logic here should be correct for any real-world personality function.
+  // However if that turns out not to be true, the offending logic can always
+  // be conditioned on the personality function, like the catch-all logic is.
+  Personality_Type Personality = RecognizePersonality(LI.getPersonalityFn());
+
+  // Simplify the list of clauses, eg by removing repeated catch clauses
+  // (these are often created by inlining).
+  bool MakeNewInstruction = false; // If true, recreate using the following:
+  SmallVector<Value *, 16> NewClauses; // - Clauses for the new instruction;
+  bool CleanupFlag = LI.isCleanup();   // - The new instruction is a cleanup.
+
+  SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
+  for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) {
+    bool isLastClause = i + 1 == e;
+    if (LI.isCatch(i)) {
+      // A catch clause.
+      Value *CatchClause = LI.getClause(i);
+      Constant *TypeInfo = cast<Constant>(CatchClause->stripPointerCasts());
+
+      // If we already saw this clause, there is no point in having a second
+      // copy of it.
+      if (AlreadyCaught.insert(TypeInfo)) {
+        // This catch clause was not already seen.
+        NewClauses.push_back(CatchClause);
+      } else {
+        // Repeated catch clause - drop the redundant copy.
+        MakeNewInstruction = true;
+      }
+
+      // If this is a catch-all then there is no point in keeping any following
+      // clauses or marking the landingpad as having a cleanup.
+      if (isCatchAll(Personality, TypeInfo)) {
+        if (!isLastClause)
+          MakeNewInstruction = true;
+        CleanupFlag = false;
+        break;
+      }
+    } else {
+      // A filter clause.  If any of the filter elements were already caught
+      // then they can be dropped from the filter.  It is tempting to try to
+      // exploit the filter further by saying that any typeinfo that does not
+      // occur in the filter can't be caught later (and thus can be dropped).
+      // However this would be wrong, since typeinfos can match without being
+      // equal (for example if one represents a C++ class, and the other some
+      // class derived from it).
+      assert(LI.isFilter(i) && "Unsupported landingpad clause!");
+      Value *FilterClause = LI.getClause(i);
+      ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
+      unsigned NumTypeInfos = FilterType->getNumElements();
+
+      // An empty filter catches everything, so there is no point in keeping any
+      // following clauses or marking the landingpad as having a cleanup.  By
+      // dealing with this case here the following code is made a bit simpler.
+      if (!NumTypeInfos) {
+        NewClauses.push_back(FilterClause);
+        if (!isLastClause)
+          MakeNewInstruction = true;
+        CleanupFlag = false;
+        break;
+      }
+
+      bool MakeNewFilter = false; // If true, make a new filter.
+      SmallVector<Constant *, 16> NewFilterElts; // New elements.
+      if (isa<ConstantAggregateZero>(FilterClause)) {
+        // Not an empty filter - it contains at least one null typeinfo.
+        assert(NumTypeInfos > 0 && "Should have handled empty filter already!");
+        Constant *TypeInfo =
+          Constant::getNullValue(FilterType->getElementType());
+        // If this typeinfo is a catch-all then the filter can never match.
+        if (isCatchAll(Personality, TypeInfo)) {
+          // Throw the filter away.
+          MakeNewInstruction = true;
+          continue;
+        }
+
+        // There is no point in having multiple copies of this typeinfo, so
+        // discard all but the first copy if there is more than one.
+        NewFilterElts.push_back(TypeInfo);
+        if (NumTypeInfos > 1)
+          MakeNewFilter = true;
+      } else {
+        ConstantArray *Filter = cast<ConstantArray>(FilterClause);
+        SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements.
+        NewFilterElts.reserve(NumTypeInfos);
+
+        // Remove any filter elements that were already caught or that already
+        // occurred in the filter.  While there, see if any of the elements are
+        // catch-alls.  If so, the filter can be discarded.
+        bool SawCatchAll = false;
+        for (unsigned j = 0; j != NumTypeInfos; ++j) {
+          Value *Elt = Filter->getOperand(j);
+          Constant *TypeInfo = cast<Constant>(Elt->stripPointerCasts());
+          if (isCatchAll(Personality, TypeInfo)) {
+            // This element is a catch-all.  Bail out, noting this fact.
+            SawCatchAll = true;
+            break;
+          }
+          if (AlreadyCaught.count(TypeInfo))
+            // Already caught by an earlier clause, so having it in the filter
+            // is pointless.
+            continue;
+          // There is no point in having multiple copies of the same typeinfo in
+          // a filter, so only add it if we didn't already.
+          if (SeenInFilter.insert(TypeInfo))
+            NewFilterElts.push_back(cast<Constant>(Elt));
+        }
+        // A filter containing a catch-all cannot match anything by definition.
+        if (SawCatchAll) {
+          // Throw the filter away.
+          MakeNewInstruction = true;
+          continue;
+        }
+
+        // If we dropped something from the filter, make a new one.
+        if (NewFilterElts.size() < NumTypeInfos)
+          MakeNewFilter = true;
+      }
+      if (MakeNewFilter) {
+        FilterType = ArrayType::get(FilterType->getElementType(),
+                                    NewFilterElts.size());
+        FilterClause = ConstantArray::get(FilterType, NewFilterElts);
+        MakeNewInstruction = true;
+      }
+
+      NewClauses.push_back(FilterClause);
+
+      // If the new filter is empty then it will catch everything so there is
+      // no point in keeping any following clauses or marking the landingpad
+      // as having a cleanup.  The case of the original filter being empty was
+      // already handled above.
+      if (MakeNewFilter && !NewFilterElts.size()) {
+        assert(MakeNewInstruction && "New filter but not a new instruction!");
+        CleanupFlag = false;
+        break;
+      }
+    }
+  }
+
+  // If several filters occur in a row then reorder them so that the shortest
+  // filters come first (those with the smallest number of elements).  This is
+  // advantageous because shorter filters are more likely to match, speeding up
+  // unwinding, but mostly because it increases the effectiveness of the other
+  // filter optimizations below.
+  for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) {
+    unsigned j;
+    // Find the maximal 'j' s.t. the range [i, j) consists entirely of filters.
+    for (j = i; j != e; ++j)
+      if (!isa<ArrayType>(NewClauses[j]->getType()))
+        break;
+
+    // Check whether the filters are already sorted by length.  We need to know
+    // if sorting them is actually going to do anything so that we only make a
+    // new landingpad instruction if it does.
+    for (unsigned k = i; k + 1 < j; ++k)
+      if (shorter_filter(NewClauses[k+1], NewClauses[k])) {
+        // Not sorted, so sort the filters now.  Doing an unstable sort would be
+        // correct too but reordering filters pointlessly might confuse users.
+        std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j,
+                         shorter_filter);
+        MakeNewInstruction = true;
+        break;
+      }
+
+    // Look for the next batch of filters.
+    i = j + 1;
+  }
+
+  // If typeinfos matched if and only if equal, then the elements of a filter L
+  // that occurs later than a filter F could be replaced by the intersection of
+  // the elements of F and L.  In reality two typeinfos can match without being
+  // equal (for example if one represents a C++ class, and the other some class
+  // derived from it) so it would be wrong to perform this transform in general.
+  // However the transform is correct and useful if F is a subset of L.  In that
+  // case L can be replaced by F, and thus removed altogether since repeating a
+  // filter is pointless.  So here we look at all pairs of filters F and L where
+  // L follows F in the list of clauses, and remove L if every element of F is
+  // an element of L.  This can occur when inlining C++ functions with exception
+  // specifications.
+  for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) {
+    // Examine each filter in turn.
+    Value *Filter = NewClauses[i];
+    ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType());
+    if (!FTy)
+      // Not a filter - skip it.
+      continue;
+    unsigned FElts = FTy->getNumElements();
+    // Examine each filter following this one.  Doing this backwards means that
+    // we don't have to worry about filters disappearing under us when removed.
+    for (unsigned j = NewClauses.size() - 1; j != i; --j) {
+      Value *LFilter = NewClauses[j];
+      ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType());
+      if (!LTy)
+        // Not a filter - skip it.
+        continue;
+      // If Filter is a subset of LFilter, i.e. every element of Filter is also
+      // an element of LFilter, then discard LFilter.
+      SmallVector<Value *, 16>::iterator J = NewClauses.begin() + j;
+      // If Filter is empty then it is a subset of LFilter.
+      if (!FElts) {
+        // Discard LFilter.
+        NewClauses.erase(J);
+        MakeNewInstruction = true;
+        // Move on to the next filter.
+        continue;
+      }
+      unsigned LElts = LTy->getNumElements();
+      // If Filter is longer than LFilter then it cannot be a subset of it.
+      if (FElts > LElts)
+        // Move on to the next filter.
+        continue;
+      // At this point we know that LFilter has at least one element.
+      if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros.
+        // Filter is a subset of LFilter iff Filter contains only zeros (as we
+        // already know that Filter is not longer than LFilter).
+        if (isa<ConstantAggregateZero>(Filter)) {
+          assert(FElts <= LElts && "Should have handled this case earlier!");
+          // Discard LFilter.
+          NewClauses.erase(J);
+          MakeNewInstruction = true;
+        }
+        // Move on to the next filter.
+        continue;
+      }
+      ConstantArray *LArray = cast<ConstantArray>(LFilter);
+      if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros.
+        // Since Filter is non-empty and contains only zeros, it is a subset of
+        // LFilter iff LFilter contains a zero.
+        assert(FElts > 0 && "Should have eliminated the empty filter earlier!");
+        for (unsigned l = 0; l != LElts; ++l)
+          if (LArray->getOperand(l)->isNullValue()) {
+            // LFilter contains a zero - discard it.
+            NewClauses.erase(J);
+            MakeNewInstruction = true;
+            break;
+          }
+        // Move on to the next filter.
+        continue;
+      }
+      // At this point we know that both filters are ConstantArrays.  Loop over
+      // operands to see whether every element of Filter is also an element of
+      // LFilter.  Since filters tend to be short this is probably faster than
+      // using a method that scales nicely.
+      ConstantArray *FArray = cast<ConstantArray>(Filter);
+      bool AllFound = true;
+      for (unsigned f = 0; f != FElts; ++f) {
+        Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts();
+        AllFound = false;
+        for (unsigned l = 0; l != LElts; ++l) {
+          Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts();
+          if (LTypeInfo == FTypeInfo) {
+            AllFound = true;
+            break;
+          }
+        }
+        if (!AllFound)
+          break;
+      }
+      if (AllFound) {
+        // Discard LFilter.
+        NewClauses.erase(J);
+        MakeNewInstruction = true;
+      }
+      // Move on to the next filter.
+    }
+  }
+
+  // If we changed any of the clauses, replace the old landingpad instruction
+  // with a new one.
+  if (MakeNewInstruction) {
+    LandingPadInst *NLI = LandingPadInst::Create(LI.getType(),
+                                                 LI.getPersonalityFn(),
+                                                 NewClauses.size());
+    for (unsigned i = 0, e = NewClauses.size(); i != e; ++i)
+      NLI->addClause(NewClauses[i]);
+    // A landing pad with no clauses must have the cleanup flag set.  It is
+    // theoretically possible, though highly unlikely, that we eliminated all
+    // clauses.  If so, force the cleanup flag to true.
+    if (NewClauses.empty())
+      CleanupFlag = true;
+    NLI->setCleanup(CleanupFlag);
+    return NLI;
+  }
+
+  // Even if none of the clauses changed, we may nonetheless have understood
+  // that the cleanup flag is pointless.  Clear it if so.
+  if (LI.isCleanup() != CleanupFlag) {
+    assert(!CleanupFlag && "Adding a cleanup, not removing one?!");
+    LI.setCleanup(CleanupFlag);
+    return &LI;
+  }
+
+  return 0;
+}
+
+
+
+
+/// TryToSinkInstruction - Try to move the specified instruction from its
+/// current block into the beginning of DestBlock, which can only happen if it's
+/// safe to move the instruction past all of the instructions between it and the
+/// end of its block.
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+  assert(I->hasOneUse() && "Invariants didn't hold!");
+
+  // Cannot move control-flow-involving, volatile loads, vaarg, etc.
+  if (isa<PHINode>(I) || isa<LandingPadInst>(I) || I->mayHaveSideEffects() ||
+      isa<TerminatorInst>(I))
+    return false;
+
+  // Do not sink alloca instructions out of the entry block.
+  if (isa<AllocaInst>(I) && I->getParent() ==
+        &DestBlock->getParent()->getEntryBlock())
+    return false;
+
+  // We can only sink load instructions if there is nothing between the load and
+  // the end of block that could change the value.
+  if (I->mayReadFromMemory()) {
+    for (BasicBlock::iterator Scan = I, E = I->getParent()->end();
+         Scan != E; ++Scan)
+      if (Scan->mayWriteToMemory())
+        return false;
+  }
+
+  BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
+  I->moveBefore(InsertPos);
+  ++NumSunkInst;
+  return true;
+}
+
+
+/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding
+/// all reachable code to the worklist.
+///
+/// This has a couple of tricks to make the code faster and more powerful.  In
+/// particular, we constant fold and DCE instructions as we go, to avoid adding
+/// them to the worklist (this significantly speeds up instcombine on code where
+/// many instructions are dead or constant).  Additionally, if we find a branch
+/// whose condition is a known constant, we only visit the reachable successors.
+///
+static bool AddReachableCodeToWorklist(BasicBlock *BB,
+                                       SmallPtrSet<BasicBlock*, 64> &Visited,
+                                       InstCombiner &IC,
+                                       const DataLayout *TD,
+                                       const TargetLibraryInfo *TLI) {
+  bool MadeIRChange = false;
+  SmallVector<BasicBlock*, 256> Worklist;
+  Worklist.push_back(BB);
+
+  SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
+  DenseMap<ConstantExpr*, Constant*> FoldedConstants;
+
+  do {
+    BB = Worklist.pop_back_val();
+
+    // We have now visited this block!  If we've already been here, ignore it.
+    if (!Visited.insert(BB)) continue;
+
+    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+      Instruction *Inst = BBI++;
+
+      // DCE instruction if trivially dead.
+      if (isInstructionTriviallyDead(Inst, TLI)) {
+        ++NumDeadInst;
+        DEBUG(errs() << "IC: DCE: " << *Inst << '\n');
+        Inst->eraseFromParent();
+        continue;
+      }
+
+      // ConstantProp instruction if trivially constant.
+      if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0)))
+        if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) {
+          DEBUG(errs() << "IC: ConstFold to: " << *C << " from: "
+                       << *Inst << '\n');
+          Inst->replaceAllUsesWith(C);
+          ++NumConstProp;
+          Inst->eraseFromParent();
+          continue;
+        }
+
+      if (TD) {
+        // See if we can constant fold its operands.
+        for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end();
+             i != e; ++i) {
+          ConstantExpr *CE = dyn_cast<ConstantExpr>(i);
+          if (CE == 0) continue;
+
+          Constant*& FoldRes = FoldedConstants[CE];
+          if (!FoldRes)
+            FoldRes = ConstantFoldConstantExpression(CE, TD, TLI);
+          if (!FoldRes)
+            FoldRes = CE;
+
+          if (FoldRes != CE) {
+            *i = FoldRes;
+            MadeIRChange = true;
+          }
+        }
+      }
+
+      InstrsForInstCombineWorklist.push_back(Inst);
+    }
+
+    // Recursively visit successors.  If this is a branch or switch on a
+    // constant, only visit the reachable successor.
+    TerminatorInst *TI = BB->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
+        bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
+        BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
+        Worklist.push_back(ReachableBB);
+        continue;
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
+        // See if this is an explicit destination.
+        for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+             i != e; ++i)
+          if (i.getCaseValue() == Cond) {
+            BasicBlock *ReachableBB = i.getCaseSuccessor();
+            Worklist.push_back(ReachableBB);
+            continue;
+          }
+
+        // Otherwise it is the default destination.
+        Worklist.push_back(SI->getDefaultDest());
+        continue;
+      }
+    }
+
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      Worklist.push_back(TI->getSuccessor(i));
+  } while (!Worklist.empty());
+
+  // Once we've found all of the instructions to add to instcombine's worklist,
+  // add them in reverse order.  This way instcombine will visit from the top
+  // of the function down.  This jives well with the way that it adds all uses
+  // of instructions to the worklist after doing a transformation, thus avoiding
+  // some N^2 behavior in pathological cases.
+  IC.Worklist.AddInitialGroup(&InstrsForInstCombineWorklist[0],
+                              InstrsForInstCombineWorklist.size());
+
+  return MadeIRChange;
+}
+
+bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
+  MadeIRChange = false;
+
+  DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+               << F.getName() << "\n");
+
+  {
+    // Do a depth-first traversal of the function, populate the worklist with
+    // the reachable instructions.  Ignore blocks that are not reachable.  Keep
+    // track of which blocks we visit.
+    SmallPtrSet<BasicBlock*, 64> Visited;
+    MadeIRChange |= AddReachableCodeToWorklist(F.begin(), Visited, *this, TD,
+                                               TLI);
+
+    // Do a quick scan over the function.  If we find any blocks that are
+    // unreachable, remove any instructions inside of them.  This prevents
+    // the instcombine code from having to deal with some bad special cases.
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+      if (Visited.count(BB)) continue;
+
+      // Delete the instructions backwards, as it has a reduced likelihood of
+      // having to update as many def-use and use-def chains.
+      Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+      while (EndInst != BB->begin()) {
+        // Delete the next to last instruction.
+        BasicBlock::iterator I = EndInst;
+        Instruction *Inst = --I;
+        if (!Inst->use_empty())
+          Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+        if (isa<LandingPadInst>(Inst)) {
+          EndInst = Inst;
+          continue;
+        }
+        if (!isa<DbgInfoIntrinsic>(Inst)) {
+          ++NumDeadInst;
+          MadeIRChange = true;
+        }
+        Inst->eraseFromParent();
+      }
+    }
+  }
+
+  while (!Worklist.isEmpty()) {
+    Instruction *I = Worklist.RemoveOne();
+    if (I == 0) continue;  // skip null values.
+
+    // Check to see if we can DCE the instruction.
+    if (isInstructionTriviallyDead(I, TLI)) {
+      DEBUG(errs() << "IC: DCE: " << *I << '\n');
+      EraseInstFromFunction(*I);
+      ++NumDeadInst;
+      MadeIRChange = true;
+      continue;
+    }
+
+    // Instruction isn't dead, see if we can constant propagate it.
+    if (!I->use_empty() && isa<Constant>(I->getOperand(0)))
+      if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) {
+        DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
+
+        // Add operands to the worklist.
+        ReplaceInstUsesWith(*I, C);
+        ++NumConstProp;
+        EraseInstFromFunction(*I);
+        MadeIRChange = true;
+        continue;
+      }
+
+    // See if we can trivially sink this instruction to a successor basic block.
+    if (I->hasOneUse()) {
+      BasicBlock *BB = I->getParent();
+      Instruction *UserInst = cast<Instruction>(I->use_back());
+      BasicBlock *UserParent;
+
+      // Get the block the use occurs in.
+      if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+        UserParent = PN->getIncomingBlock(I->use_begin().getUse());
+      else
+        UserParent = UserInst->getParent();
+
+      if (UserParent != BB) {
+        bool UserIsSuccessor = false;
+        // See if the user is one of our successors.
+        for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+          if (*SI == UserParent) {
+            UserIsSuccessor = true;
+            break;
+          }
+
+        // If the user is one of our immediate successors, and if that successor
+        // only has us as a predecessors (we'd have to split the critical edge
+        // otherwise), we can keep going.
+        if (UserIsSuccessor && UserParent->getSinglePredecessor())
+          // Okay, the CFG is simple enough, try to sink this instruction.
+          MadeIRChange |= TryToSinkInstruction(I, UserParent);
+      }
+    }
+
+    // Now that we have an instruction, try combining it to simplify it.
+    Builder->SetInsertPoint(I->getParent(), I);
+    Builder->SetCurrentDebugLocation(I->getDebugLoc());
+
+#ifndef NDEBUG
+    std::string OrigI;
+#endif
+    DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
+    DEBUG(errs() << "IC: Visiting: " << OrigI << '\n');
+
+    if (Instruction *Result = visit(*I)) {
+      ++NumCombined;
+      // Should we replace the old instruction with a new one?
+      if (Result != I) {
+        DEBUG(errs() << "IC: Old = " << *I << '\n'
+                     << "    New = " << *Result << '\n');
+
+        if (!I->getDebugLoc().isUnknown())
+          Result->setDebugLoc(I->getDebugLoc());
+        // Everything uses the new instruction now.
+        I->replaceAllUsesWith(Result);
+
+        // Move the name to the new instruction first.
+        Result->takeName(I);
+
+        // Push the new instruction and any users onto the worklist.
+        Worklist.Add(Result);
+        Worklist.AddUsersToWorkList(*Result);
+
+        // Insert the new instruction into the basic block...
+        BasicBlock *InstParent = I->getParent();
+        BasicBlock::iterator InsertPos = I;
+
+        // If we replace a PHI with something that isn't a PHI, fix up the
+        // insertion point.
+        if (!isa<PHINode>(Result) && isa<PHINode>(InsertPos))
+          InsertPos = InstParent->getFirstInsertionPt();
+
+        InstParent->getInstList().insert(InsertPos, Result);
+
+        EraseInstFromFunction(*I);
+      } else {
+#ifndef NDEBUG
+        DEBUG(errs() << "IC: Mod = " << OrigI << '\n'
+                     << "    New = " << *I << '\n');
+#endif
+
+        // If the instruction was modified, it's possible that it is now dead.
+        // if so, remove it.
+        if (isInstructionTriviallyDead(I, TLI)) {
+          EraseInstFromFunction(*I);
+        } else {
+          Worklist.Add(I);
+          Worklist.AddUsersToWorkList(*I);
+        }
+      }
+      MadeIRChange = true;
+    }
+  }
+
+  Worklist.Zap();
+  return MadeIRChange;
+}
+
+namespace {
+class InstCombinerLibCallSimplifier : public LibCallSimplifier {
+  InstCombiner *IC;
+public:
+  InstCombinerLibCallSimplifier(const DataLayout *TD,
+                                const TargetLibraryInfo *TLI,
+                                InstCombiner *IC)
+    : LibCallSimplifier(TD, TLI, UnsafeFPShrink) {
+    this->IC = IC;
+  }
+
+  /// replaceAllUsesWith - override so that instruction replacement
+  /// can be defined in terms of the instruction combiner framework.
+  virtual void replaceAllUsesWith(Instruction *I, Value *With) const {
+    IC->ReplaceInstUsesWith(*I, With);
+  }
+};
+}
+
+bool InstCombiner::runOnFunction(Function &F) {
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+  // Minimizing size?
+  MinimizeSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                                Attribute::MinSize);
+
+  /// Builder - This is an IRBuilder that automatically inserts new
+  /// instructions into the worklist when they are created.
+  IRBuilder<true, TargetFolder, InstCombineIRInserter>
+    TheBuilder(F.getContext(), TargetFolder(TD),
+               InstCombineIRInserter(Worklist));
+  Builder = &TheBuilder;
+
+  InstCombinerLibCallSimplifier TheSimplifier(TD, TLI, this);
+  Simplifier = &TheSimplifier;
+
+  bool EverMadeChange = false;
+
+  // Lower dbg.declare intrinsics otherwise their value may be clobbered
+  // by instcombiner.
+  EverMadeChange = LowerDbgDeclare(F);
+
+  // Iterate while there is work to do.
+  unsigned Iteration = 0;
+  while (DoOneIteration(F, Iteration++))
+    EverMadeChange = true;
+
+  Builder = 0;
+  return EverMadeChange;
+}
+
+FunctionPass *llvm::createInstructionCombiningPass() {
+  return new InstCombiner();
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
new file mode 100644
index 000000000000..623c4705061e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -0,0 +1,1453 @@
+//===-- AddressSanitizer.cpp - memory error detector ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+// Details of the algorithm:
+//  http://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asan"
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BlackList.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <string>
+
+using namespace llvm;
+
+static const uint64_t kDefaultShadowScale = 3;
+static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
+static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
+static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000;  // < 2G.
+static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
+
+static const size_t kMaxStackMallocSize = 1 << 16;  // 64K
+static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
+static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
+
+static const char *kAsanModuleCtorName = "asan.module_ctor";
+static const char *kAsanModuleDtorName = "asan.module_dtor";
+static const int   kAsanCtorAndCtorPriority = 1;
+static const char *kAsanReportErrorTemplate = "__asan_report_";
+static const char *kAsanReportLoadN = "__asan_report_load_n";
+static const char *kAsanReportStoreN = "__asan_report_store_n";
+static const char *kAsanRegisterGlobalsName = "__asan_register_globals";
+static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals";
+static const char *kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
+static const char *kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
+static const char *kAsanInitName = "__asan_init_v3";
+static const char *kAsanHandleNoReturnName = "__asan_handle_no_return";
+static const char *kAsanMappingOffsetName = "__asan_mapping_offset";
+static const char *kAsanMappingScaleName = "__asan_mapping_scale";
+static const char *kAsanStackMallocName = "__asan_stack_malloc";
+static const char *kAsanStackFreeName = "__asan_stack_free";
+static const char *kAsanGenPrefix = "__asan_gen_";
+static const char *kAsanPoisonStackMemoryName = "__asan_poison_stack_memory";
+static const char *kAsanUnpoisonStackMemoryName =
+    "__asan_unpoison_stack_memory";
+
+static const int kAsanStackLeftRedzoneMagic = 0xf1;
+static const int kAsanStackMidRedzoneMagic = 0xf2;
+static const int kAsanStackRightRedzoneMagic = 0xf3;
+static const int kAsanStackPartialRedzoneMagic = 0xf4;
+
+// Accesses sizes are powers of two: 1, 2, 4, 8, 16.
+static const size_t kNumberOfAccessSizes = 5;
+
+// Command-line flags.
+
+// This flag may need to be replaced with -f[no-]asan-reads.
+static cl::opt<bool> ClInstrumentReads("asan-instrument-reads",
+       cl::desc("instrument read instructions"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInstrumentWrites("asan-instrument-writes",
+       cl::desc("instrument write instructions"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInstrumentAtomics("asan-instrument-atomics",
+       cl::desc("instrument atomic instructions (rmw, cmpxchg)"),
+       cl::Hidden, cl::init(true));
+static cl::opt<bool> ClAlwaysSlowPath("asan-always-slow-path",
+       cl::desc("use instrumentation with slow path for all accesses"),
+       cl::Hidden, cl::init(false));
+// This flag limits the number of instructions to be instrumented
+// in any given BB. Normally, this should be set to unlimited (INT_MAX),
+// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporary
+// set it to 10000.
+static cl::opt<int> ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb",
+       cl::init(10000),
+       cl::desc("maximal number of instructions to instrument in any given BB"),
+       cl::Hidden);
+// This flag may need to be replaced with -f[no]asan-stack.
+static cl::opt<bool> ClStack("asan-stack",
+       cl::desc("Handle stack memory"), cl::Hidden, cl::init(true));
+// This flag may need to be replaced with -f[no]asan-use-after-return.
+static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
+       cl::desc("Check return-after-free"), cl::Hidden, cl::init(false));
+// This flag may need to be replaced with -f[no]asan-globals.
+static cl::opt<bool> ClGlobals("asan-globals",
+       cl::desc("Handle global objects"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInitializers("asan-initialization-order",
+       cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false));
+static cl::opt<bool> ClMemIntrin("asan-memintrin",
+       cl::desc("Handle memset/memcpy/memmove"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClRealignStack("asan-realign-stack",
+       cl::desc("Realign stack to 32"), cl::Hidden, cl::init(true));
+static cl::opt<std::string> ClBlacklistFile("asan-blacklist",
+       cl::desc("File containing the list of objects to ignore "
+                "during instrumentation"), cl::Hidden);
+
+// These flags allow to change the shadow mapping.
+// The shadow mapping looks like
+//    Shadow = (Mem >> scale) + (1 << offset_log)
+static cl::opt<int> ClMappingScale("asan-mapping-scale",
+       cl::desc("scale of asan shadow mapping"), cl::Hidden, cl::init(0));
+static cl::opt<int> ClMappingOffsetLog("asan-mapping-offset-log",
+       cl::desc("offset of asan shadow mapping"), cl::Hidden, cl::init(-1));
+static cl::opt<bool> ClShort64BitOffset("asan-short-64bit-mapping-offset",
+       cl::desc("Use short immediate constant as the mapping offset for 64bit"),
+       cl::Hidden, cl::init(true));
+
+// Optimization flags. Not user visible, used mostly for testing
+// and benchmarking the tool.
+static cl::opt<bool> ClOpt("asan-opt",
+       cl::desc("Optimize instrumentation"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClOptSameTemp("asan-opt-same-temp",
+       cl::desc("Instrument the same temp just once"), cl::Hidden,
+       cl::init(true));
+static cl::opt<bool> ClOptGlobals("asan-opt-globals",
+       cl::desc("Don't instrument scalar globals"), cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClCheckLifetime("asan-check-lifetime",
+       cl::desc("Use llvm.lifetime intrinsics to insert extra checks"),
+       cl::Hidden, cl::init(false));
+
+// Debug flags.
+static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
+                            cl::init(0));
+static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"),
+                                 cl::Hidden, cl::init(0));
+static cl::opt<std::string> ClDebugFunc("asan-debug-func",
+                                        cl::Hidden, cl::desc("Debug func"));
+static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
+                               cl::Hidden, cl::init(-1));
+static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"),
+                               cl::Hidden, cl::init(-1));
+
+namespace {
+/// A set of dynamically initialized globals extracted from metadata.
+class SetOfDynamicallyInitializedGlobals {
+ public:
+  void Init(Module& M) {
+    // Clang generates metadata identifying all dynamically initialized globals.
+    NamedMDNode *DynamicGlobals =
+        M.getNamedMetadata("llvm.asan.dynamically_initialized_globals");
+    if (!DynamicGlobals)
+      return;
+    for (int i = 0, n = DynamicGlobals->getNumOperands(); i < n; ++i) {
+      MDNode *MDN = DynamicGlobals->getOperand(i);
+      assert(MDN->getNumOperands() == 1);
+      Value *VG = MDN->getOperand(0);
+      // The optimizer may optimize away a global entirely, in which case we
+      // cannot instrument access to it.
+      if (!VG)
+        continue;
+      DynInitGlobals.insert(cast<GlobalVariable>(VG));
+    }
+  }
+  bool Contains(GlobalVariable *G) { return DynInitGlobals.count(G) != 0; }
+ private:
+  SmallSet<GlobalValue*, 32> DynInitGlobals;
+};
+
+/// This struct defines the shadow mapping using the rule:
+///   shadow = (mem >> Scale) ADD-or-OR Offset.
+struct ShadowMapping {
+  int Scale;
+  uint64_t Offset;
+  bool OrShadowOffset;
+};
+
+static ShadowMapping getShadowMapping(const Module &M, int LongSize,
+                                      bool ZeroBaseShadow) {
+  llvm::Triple TargetTriple(M.getTargetTriple());
+  bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android;
+  bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX;
+  bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64;
+  bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
+
+  ShadowMapping Mapping;
+
+  // OR-ing shadow offset if more efficient (at least on x86),
+  // but on ppc64 we have to use add since the shadow offset is not neccesary
+  // 1/8-th of the address space.
+  Mapping.OrShadowOffset = !IsPPC64 && !ClShort64BitOffset;
+
+  Mapping.Offset = (IsAndroid || ZeroBaseShadow) ? 0 :
+      (LongSize == 32 ? kDefaultShadowOffset32 :
+       IsPPC64 ? kPPC64_ShadowOffset64 : kDefaultShadowOffset64);
+  if (!ZeroBaseShadow && ClShort64BitOffset && IsX86_64 && !IsMacOSX) {
+    assert(LongSize == 64);
+    Mapping.Offset = kDefaultShort64bitShadowOffset;
+  }
+  if (!ZeroBaseShadow && ClMappingOffsetLog >= 0) {
+    // Zero offset log is the special case.
+    Mapping.Offset = (ClMappingOffsetLog == 0) ? 0 : 1ULL << ClMappingOffsetLog;
+  }
+
+  Mapping.Scale = kDefaultShadowScale;
+  if (ClMappingScale) {
+    Mapping.Scale = ClMappingScale;
+  }
+
+  return Mapping;
+}
+
+static size_t RedzoneSizeForScale(int MappingScale) {
+  // Redzone used for stack and globals is at least 32 bytes.
+  // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
+  return std::max(32U, 1U << MappingScale);
+}
+
+/// AddressSanitizer: instrument the code in module to find memory bugs.
+struct AddressSanitizer : public FunctionPass {
+  AddressSanitizer(bool CheckInitOrder = true,
+                   bool CheckUseAfterReturn = false,
+                   bool CheckLifetime = false,
+                   StringRef BlacklistFile = StringRef(),
+                   bool ZeroBaseShadow = false)
+      : FunctionPass(ID),
+        CheckInitOrder(CheckInitOrder || ClInitializers),
+        CheckUseAfterReturn(CheckUseAfterReturn || ClUseAfterReturn),
+        CheckLifetime(CheckLifetime || ClCheckLifetime),
+        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+                                            : BlacklistFile),
+        ZeroBaseShadow(ZeroBaseShadow) {}
+  virtual const char *getPassName() const {
+    return "AddressSanitizerFunctionPass";
+  }
+  void instrumentMop(Instruction *I);
+  void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
+                         Value *Addr, uint32_t TypeSize, bool IsWrite,
+                         Value *SizeArgument);
+  Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+                           Value *ShadowValue, uint32_t TypeSize);
+  Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
+                                 bool IsWrite, size_t AccessSizeIndex,
+                                 Value *SizeArgument);
+  bool instrumentMemIntrinsic(MemIntrinsic *MI);
+  void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr,
+                                   Value *Size,
+                                   Instruction *InsertBefore, bool IsWrite);
+  Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+  bool runOnFunction(Function &F);
+  bool maybeInsertAsanInitAtFunctionEntry(Function &F);
+  void emitShadowMapping(Module &M, IRBuilder<> &IRB) const;
+  virtual bool doInitialization(Module &M);
+  static char ID;  // Pass identification, replacement for typeid
+
+ private:
+  void initializeCallbacks(Module &M);
+
+  bool ShouldInstrumentGlobal(GlobalVariable *G);
+  bool LooksLikeCodeInBug11395(Instruction *I);
+  void FindDynamicInitializers(Module &M);
+
+  bool CheckInitOrder;
+  bool CheckUseAfterReturn;
+  bool CheckLifetime;
+  SmallString<64> BlacklistFile;
+  bool ZeroBaseShadow;
+
+  LLVMContext *C;
+  DataLayout *TD;
+  int LongSize;
+  Type *IntptrTy;
+  ShadowMapping Mapping;
+  Function *AsanCtorFunction;
+  Function *AsanInitFunction;
+  Function *AsanHandleNoReturnFunc;
+  OwningPtr<BlackList> BL;
+  // This array is indexed by AccessIsWrite and log2(AccessSize).
+  Function *AsanErrorCallback[2][kNumberOfAccessSizes];
+  // This array is indexed by AccessIsWrite.
+  Function *AsanErrorCallbackSized[2];
+  InlineAsm *EmptyAsm;
+  SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
+
+  friend struct FunctionStackPoisoner;
+};
+
+class AddressSanitizerModule : public ModulePass {
+ public:
+  AddressSanitizerModule(bool CheckInitOrder = true,
+                         StringRef BlacklistFile = StringRef(),
+                         bool ZeroBaseShadow = false)
+      : ModulePass(ID),
+        CheckInitOrder(CheckInitOrder || ClInitializers),
+        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+                                            : BlacklistFile),
+        ZeroBaseShadow(ZeroBaseShadow) {}
+  bool runOnModule(Module &M);
+  static char ID;  // Pass identification, replacement for typeid
+  virtual const char *getPassName() const {
+    return "AddressSanitizerModule";
+  }
+
+ private:
+  void initializeCallbacks(Module &M);
+
+  bool ShouldInstrumentGlobal(GlobalVariable *G);
+  void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
+  size_t RedzoneSize() const {
+    return RedzoneSizeForScale(Mapping.Scale);
+  }
+
+  bool CheckInitOrder;
+  SmallString<64> BlacklistFile;
+  bool ZeroBaseShadow;
+
+  OwningPtr<BlackList> BL;
+  SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
+  Type *IntptrTy;
+  LLVMContext *C;
+  DataLayout *TD;
+  ShadowMapping Mapping;
+  Function *AsanPoisonGlobals;
+  Function *AsanUnpoisonGlobals;
+  Function *AsanRegisterGlobals;
+  Function *AsanUnregisterGlobals;
+};
+
+// Stack poisoning does not play well with exception handling.
+// When an exception is thrown, we essentially bypass the code
+// that unpoisones the stack. This is why the run-time library has
+// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire
+// stack in the interceptor. This however does not work inside the
+// actual function which catches the exception. Most likely because the
+// compiler hoists the load of the shadow value somewhere too high.
+// This causes asan to report a non-existing bug on 453.povray.
+// It sounds like an LLVM bug.
+struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
+  Function &F;
+  AddressSanitizer &ASan;
+  DIBuilder DIB;
+  LLVMContext *C;
+  Type *IntptrTy;
+  Type *IntptrPtrTy;
+  ShadowMapping Mapping;
+
+  SmallVector<AllocaInst*, 16> AllocaVec;
+  SmallVector<Instruction*, 8> RetVec;
+  uint64_t TotalStackSize;
+  unsigned StackAlignment;
+
+  Function *AsanStackMallocFunc, *AsanStackFreeFunc;
+  Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc;
+
+  // Stores a place and arguments of poisoning/unpoisoning call for alloca.
+  struct AllocaPoisonCall {
+    IntrinsicInst *InsBefore;
+    uint64_t Size;
+    bool DoPoison;
+  };
+  SmallVector<AllocaPoisonCall, 8> AllocaPoisonCallVec;
+
+  // Maps Value to an AllocaInst from which the Value is originated.
+  typedef DenseMap<Value*, AllocaInst*> AllocaForValueMapTy;
+  AllocaForValueMapTy AllocaForValue;
+
+  FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
+      : F(F), ASan(ASan), DIB(*F.getParent()), C(ASan.C),
+        IntptrTy(ASan.IntptrTy), IntptrPtrTy(PointerType::get(IntptrTy, 0)),
+        Mapping(ASan.Mapping),
+        TotalStackSize(0), StackAlignment(1 << Mapping.Scale) {}
+
+  bool runOnFunction() {
+    if (!ClStack) return false;
+    // Collect alloca, ret, lifetime instructions etc.
+    for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+         DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
+      BasicBlock *BB = *DI;
+      visit(*BB);
+    }
+    if (AllocaVec.empty()) return false;
+
+    initializeCallbacks(*F.getParent());
+
+    poisonStack();
+
+    if (ClDebugStack) {
+      DEBUG(dbgs() << F);
+    }
+    return true;
+  }
+
+  // Finds all static Alloca instructions and puts
+  // poisoned red zones around all of them.
+  // Then unpoison everything back before the function returns.
+  void poisonStack();
+
+  // ----------------------- Visitors.
+  /// \brief Collect all Ret instructions.
+  void visitReturnInst(ReturnInst &RI) {
+    RetVec.push_back(&RI);
+  }
+
+  /// \brief Collect Alloca instructions we want (and can) handle.
+  void visitAllocaInst(AllocaInst &AI) {
+    if (!isInterestingAlloca(AI)) return;
+
+    StackAlignment = std::max(StackAlignment, AI.getAlignment());
+    AllocaVec.push_back(&AI);
+    uint64_t AlignedSize =  getAlignedAllocaSize(&AI);
+    TotalStackSize += AlignedSize;
+  }
+
+  /// \brief Collect lifetime intrinsic calls to check for use-after-scope
+  /// errors.
+  void visitIntrinsicInst(IntrinsicInst &II) {
+    if (!ASan.CheckLifetime) return;
+    Intrinsic::ID ID = II.getIntrinsicID();
+    if (ID != Intrinsic::lifetime_start &&
+        ID != Intrinsic::lifetime_end)
+      return;
+    // Found lifetime intrinsic, add ASan instrumentation if necessary.
+    ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0));
+    // If size argument is undefined, don't do anything.
+    if (Size->isMinusOne()) return;
+    // Check that size doesn't saturate uint64_t and can
+    // be stored in IntptrTy.
+    const uint64_t SizeValue = Size->getValue().getLimitedValue();
+    if (SizeValue == ~0ULL ||
+        !ConstantInt::isValueValidForType(IntptrTy, SizeValue))
+      return;
+    // Find alloca instruction that corresponds to llvm.lifetime argument.
+    AllocaInst *AI = findAllocaForValue(II.getArgOperand(1));
+    if (!AI) return;
+    bool DoPoison = (ID == Intrinsic::lifetime_end);
+    AllocaPoisonCall APC = {&II, SizeValue, DoPoison};
+    AllocaPoisonCallVec.push_back(APC);
+  }
+
+  // ---------------------- Helpers.
+  void initializeCallbacks(Module &M);
+
+  // Check if we want (and can) handle this alloca.
+  bool isInterestingAlloca(AllocaInst &AI) {
+    return (!AI.isArrayAllocation() &&
+            AI.isStaticAlloca() &&
+            AI.getAllocatedType()->isSized());
+  }
+
+  size_t RedzoneSize() const {
+    return RedzoneSizeForScale(Mapping.Scale);
+  }
+  uint64_t getAllocaSizeInBytes(AllocaInst *AI) {
+    Type *Ty = AI->getAllocatedType();
+    uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty);
+    return SizeInBytes;
+  }
+  uint64_t getAlignedSize(uint64_t SizeInBytes) {
+    size_t RZ = RedzoneSize();
+    return ((SizeInBytes + RZ - 1) / RZ) * RZ;
+  }
+  uint64_t getAlignedAllocaSize(AllocaInst *AI) {
+    uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
+    return getAlignedSize(SizeInBytes);
+  }
+  /// Finds alloca where the value comes from.
+  AllocaInst *findAllocaForValue(Value *V);
+  void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB,
+                      Value *ShadowBase, bool DoPoison);
+  void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB, bool DoPoison);
+};
+
+}  // namespace
+
+char AddressSanitizer::ID = 0;
+INITIALIZE_PASS(AddressSanitizer, "asan",
+    "AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
+    false, false)
+FunctionPass *llvm::createAddressSanitizerFunctionPass(
+    bool CheckInitOrder, bool CheckUseAfterReturn, bool CheckLifetime,
+    StringRef BlacklistFile, bool ZeroBaseShadow) {
+  return new AddressSanitizer(CheckInitOrder, CheckUseAfterReturn,
+                              CheckLifetime, BlacklistFile, ZeroBaseShadow);
+}
+
+char AddressSanitizerModule::ID = 0;
+INITIALIZE_PASS(AddressSanitizerModule, "asan-module",
+    "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
+    "ModulePass", false, false)
+ModulePass *llvm::createAddressSanitizerModulePass(
+    bool CheckInitOrder, StringRef BlacklistFile, bool ZeroBaseShadow) {
+  return new AddressSanitizerModule(CheckInitOrder, BlacklistFile,
+                                    ZeroBaseShadow);
+}
+
+static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
+  size_t Res = CountTrailingZeros_32(TypeSize / 8);
+  assert(Res < kNumberOfAccessSizes);
+  return Res;
+}
+
+// Create a constant for Str so that we can pass it to the run-time lib.
+static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) {
+  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+  GlobalVariable *GV = new GlobalVariable(M, StrConst->getType(), true,
+                            GlobalValue::PrivateLinkage, StrConst,
+                            kAsanGenPrefix);
+  GV->setUnnamedAddr(true);  // Ok to merge these.
+  GV->setAlignment(1);  // Strings may not be merged w/o setting align 1.
+  return GV;
+}
+
+static bool GlobalWasGeneratedByAsan(GlobalVariable *G) {
+  return G->getName().find(kAsanGenPrefix) == 0;
+}
+
+Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
+  // Shadow >> scale
+  Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
+  if (Mapping.Offset == 0)
+    return Shadow;
+  // (Shadow >> scale) | offset
+  if (Mapping.OrShadowOffset)
+    return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset));
+  else
+    return IRB.CreateAdd(Shadow, ConstantInt::get(IntptrTy, Mapping.Offset));
+}
+
+void AddressSanitizer::instrumentMemIntrinsicParam(
+    Instruction *OrigIns,
+    Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) {
+  IRBuilder<> IRB(InsertBefore);
+  if (Size->getType() != IntptrTy)
+    Size = IRB.CreateIntCast(Size, IntptrTy, false);
+  // Check the first byte.
+  instrumentAddress(OrigIns, InsertBefore, Addr, 8, IsWrite, Size);
+  // Check the last byte.
+  IRB.SetInsertPoint(InsertBefore);
+  Value *SizeMinusOne = IRB.CreateSub(Size, ConstantInt::get(IntptrTy, 1));
+  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+  Value *AddrLast = IRB.CreateAdd(AddrLong, SizeMinusOne);
+  instrumentAddress(OrigIns, InsertBefore, AddrLast, 8, IsWrite, Size);
+}
+
+// Instrument memset/memmove/memcpy
+bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+  Value *Dst = MI->getDest();
+  MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI);
+  Value *Src = MemTran ? MemTran->getSource() : 0;
+  Value *Length = MI->getLength();
+
+  Constant *ConstLength = dyn_cast<Constant>(Length);
+  Instruction *InsertBefore = MI;
+  if (ConstLength) {
+    if (ConstLength->isNullValue()) return false;
+  } else {
+    // The size is not a constant so it could be zero -- check at run-time.
+    IRBuilder<> IRB(InsertBefore);
+
+    Value *Cmp = IRB.CreateICmpNE(Length,
+                                  Constant::getNullValue(Length->getType()));
+    InsertBefore = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+  }
+
+  instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true);
+  if (Src)
+    instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false);
+  return true;
+}
+
+// If I is an interesting memory access, return the PointerOperand
+// and set IsWrite. Otherwise return NULL.
+static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (!ClInstrumentReads) return NULL;
+    *IsWrite = false;
+    return LI->getPointerOperand();
+  }
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (!ClInstrumentWrites) return NULL;
+    *IsWrite = true;
+    return SI->getPointerOperand();
+  }
+  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+    if (!ClInstrumentAtomics) return NULL;
+    *IsWrite = true;
+    return RMW->getPointerOperand();
+  }
+  if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+    if (!ClInstrumentAtomics) return NULL;
+    *IsWrite = true;
+    return XCHG->getPointerOperand();
+  }
+  return NULL;
+}
+
+void AddressSanitizer::instrumentMop(Instruction *I) {
+  bool IsWrite = false;
+  Value *Addr = isInterestingMemoryAccess(I, &IsWrite);
+  assert(Addr);
+  if (ClOpt && ClOptGlobals) {
+    if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
+      // If initialization order checking is disabled, a simple access to a
+      // dynamically initialized global is always valid.
+      if (!CheckInitOrder)
+        return;
+      // If a global variable does not have dynamic initialization we don't
+      // have to instrument it.  However, if a global does not have initailizer
+      // at all, we assume it has dynamic initializer (in other TU).
+      if (G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G))
+        return;
+    }
+  }
+
+  Type *OrigPtrTy = Addr->getType();
+  Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+
+  assert(OrigTy->isSized());
+  uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy);
+
+  assert((TypeSize % 8) == 0);
+
+  // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check.
+  if (TypeSize == 8  || TypeSize == 16 ||
+      TypeSize == 32 || TypeSize == 64 || TypeSize == 128)
+    return instrumentAddress(I, I, Addr, TypeSize, IsWrite, 0);
+  // Instrument unusual size (but still multiple of 8).
+  // We can not do it with a single check, so we do 1-byte check for the first
+  // and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
+  // to report the actual access size.
+  IRBuilder<> IRB(I);
+  Value *LastByte =  IRB.CreateIntToPtr(
+      IRB.CreateAdd(IRB.CreatePointerCast(Addr, IntptrTy),
+                    ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
+      OrigPtrTy);
+  Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
+  instrumentAddress(I, I, Addr, 8, IsWrite, Size);
+  instrumentAddress(I, I, LastByte, 8, IsWrite, Size);
+}
+
+// Validate the result of Module::getOrInsertFunction called for an interface
+// function of AddressSanitizer. If the instrumented module defines a function
+// with the same name, their prototypes must match, otherwise
+// getOrInsertFunction returns a bitcast.
+static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
+  if (isa<Function>(FuncOrBitcast)) return cast<Function>(FuncOrBitcast);
+  FuncOrBitcast->dump();
+  report_fatal_error("trying to redefine an AddressSanitizer "
+                     "interface function");
+}
+
+Instruction *AddressSanitizer::generateCrashCode(
+    Instruction *InsertBefore, Value *Addr,
+    bool IsWrite, size_t AccessSizeIndex, Value *SizeArgument) {
+  IRBuilder<> IRB(InsertBefore);
+  CallInst *Call = SizeArgument
+    ? IRB.CreateCall2(AsanErrorCallbackSized[IsWrite], Addr, SizeArgument)
+    : IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr);
+
+  // We don't do Call->setDoesNotReturn() because the BB already has
+  // UnreachableInst at the end.
+  // This EmptyAsm is required to avoid callback merge.
+  IRB.CreateCall(EmptyAsm);
+  return Call;
+}
+
+Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+                                            Value *ShadowValue,
+                                            uint32_t TypeSize) {
+  size_t Granularity = 1 << Mapping.Scale;
+  // Addr & (Granularity - 1)
+  Value *LastAccessedByte = IRB.CreateAnd(
+      AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+  // (Addr & (Granularity - 1)) + size - 1
+  if (TypeSize / 8 > 1)
+    LastAccessedByte = IRB.CreateAdd(
+        LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
+  // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
+  LastAccessedByte = IRB.CreateIntCast(
+      LastAccessedByte, ShadowValue->getType(), false);
+  // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
+  return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
+}
+
+void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
+                                         Instruction *InsertBefore,
+                                         Value *Addr, uint32_t TypeSize,
+                                         bool IsWrite, Value *SizeArgument) {
+  IRBuilder<> IRB(InsertBefore);
+  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+
+  Type *ShadowTy  = IntegerType::get(
+      *C, std::max(8U, TypeSize >> Mapping.Scale));
+  Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+  Value *ShadowPtr = memToShadow(AddrLong, IRB);
+  Value *CmpVal = Constant::getNullValue(ShadowTy);
+  Value *ShadowValue = IRB.CreateLoad(
+      IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+
+  Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
+  size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
+  size_t Granularity = 1 << Mapping.Scale;
+  TerminatorInst *CrashTerm = 0;
+
+  if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
+    TerminatorInst *CheckTerm =
+        SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+    assert(dyn_cast<BranchInst>(CheckTerm)->isUnconditional());
+    BasicBlock *NextBB = CheckTerm->getSuccessor(0);
+    IRB.SetInsertPoint(CheckTerm);
+    Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
+    BasicBlock *CrashBlock =
+        BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
+    CrashTerm = new UnreachableInst(*C, CrashBlock);
+    BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
+    ReplaceInstWithInst(CheckTerm, NewTerm);
+  } else {
+    CrashTerm = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), true);
+  }
+
+  Instruction *Crash = generateCrashCode(
+      CrashTerm, AddrLong, IsWrite, AccessSizeIndex, SizeArgument);
+  Crash->setDebugLoc(OrigIns->getDebugLoc());
+}
+
+void AddressSanitizerModule::createInitializerPoisonCalls(
+    Module &M, GlobalValue *ModuleName) {
+  // We do all of our poisoning and unpoisoning within _GLOBAL__I_a.
+  Function *GlobalInit = M.getFunction("_GLOBAL__I_a");
+  // If that function is not present, this TU contains no globals, or they have
+  // all been optimized away
+  if (!GlobalInit)
+    return;
+
+  // Set up the arguments to our poison/unpoison functions.
+  IRBuilder<> IRB(GlobalInit->begin()->getFirstInsertionPt());
+
+  // Add a call to poison all external globals before the given function starts.
+  Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
+  IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
+
+  // Add calls to unpoison all globals before each return instruction.
+  for (Function::iterator I = GlobalInit->begin(), E = GlobalInit->end();
+      I != E; ++I) {
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) {
+      CallInst::Create(AsanUnpoisonGlobals, "", RI);
+    }
+  }
+}
+
+bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
+  Type *Ty = cast<PointerType>(G->getType())->getElementType();
+  DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
+
+  if (BL->isIn(*G)) return false;
+  if (!Ty->isSized()) return false;
+  if (!G->hasInitializer()) return false;
+  if (GlobalWasGeneratedByAsan(G)) return false;  // Our own global.
+  // Touch only those globals that will not be defined in other modules.
+  // Don't handle ODR type linkages since other modules may be built w/o asan.
+  if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
+      G->getLinkage() != GlobalVariable::PrivateLinkage &&
+      G->getLinkage() != GlobalVariable::InternalLinkage)
+    return false;
+  // Two problems with thread-locals:
+  //   - The address of the main thread's copy can't be computed at link-time.
+  //   - Need to poison all copies, not just the main thread's one.
+  if (G->isThreadLocal())
+    return false;
+  // For now, just ignore this Alloca if the alignment is large.
+  if (G->getAlignment() > RedzoneSize()) return false;
+
+  // Ignore all the globals with the names starting with "\01L_OBJC_".
+  // Many of those are put into the .cstring section. The linker compresses
+  // that section by removing the spare \0s after the string terminator, so
+  // our redzones get broken.
+  if ((G->getName().find("\01L_OBJC_") == 0) ||
+      (G->getName().find("\01l_OBJC_") == 0)) {
+    DEBUG(dbgs() << "Ignoring \\01L_OBJC_* global: " << *G);
+    return false;
+  }
+
+  if (G->hasSection()) {
+    StringRef Section(G->getSection());
+    // Ignore the globals from the __OBJC section. The ObjC runtime assumes
+    // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
+    // them.
+    if ((Section.find("__OBJC,") == 0) ||
+        (Section.find("__DATA, __objc_") == 0)) {
+      DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G);
+      return false;
+    }
+    // See http://code.google.com/p/address-sanitizer/issues/detail?id=32
+    // Constant CFString instances are compiled in the following way:
+    //  -- the string buffer is emitted into
+    //     __TEXT,__cstring,cstring_literals
+    //  -- the constant NSConstantString structure referencing that buffer
+    //     is placed into __DATA,__cfstring
+    // Therefore there's no point in placing redzones into __DATA,__cfstring.
+    // Moreover, it causes the linker to crash on OS X 10.7
+    if (Section.find("__DATA,__cfstring") == 0) {
+      DEBUG(dbgs() << "Ignoring CFString: " << *G);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+void AddressSanitizerModule::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(*C);
+  // Declare our poisoning and unpoisoning functions.
+  AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, NULL));
+  AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
+  AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanUnpoisonGlobalsName, IRB.getVoidTy(), NULL));
+  AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
+  // Declare functions that register/unregister globals.
+  AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanRegisterGlobalsName, IRB.getVoidTy(),
+      IntptrTy, IntptrTy, NULL));
+  AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
+  AsanUnregisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanUnregisterGlobalsName,
+      IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+  AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
+}
+
+// This function replaces all global variables with new variables that have
+// trailing redzones. It also creates a function that poisons
+// redzones and inserts this function into llvm.global_ctors.
+bool AddressSanitizerModule::runOnModule(Module &M) {
+  if (!ClGlobals) return false;
+  TD = getAnalysisIfAvailable<DataLayout>();
+  if (!TD)
+    return false;
+  BL.reset(new BlackList(BlacklistFile));
+  if (BL->isIn(M)) return false;
+  C = &(M.getContext());
+  int LongSize = TD->getPointerSizeInBits();
+  IntptrTy = Type::getIntNTy(*C, LongSize);
+  Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow);
+  initializeCallbacks(M);
+  DynamicallyInitializedGlobals.Init(M);
+
+  SmallVector<GlobalVariable *, 16> GlobalsToChange;
+
+  for (Module::GlobalListType::iterator G = M.global_begin(),
+       E = M.global_end(); G != E; ++G) {
+    if (ShouldInstrumentGlobal(G))
+      GlobalsToChange.push_back(G);
+  }
+
+  size_t n = GlobalsToChange.size();
+  if (n == 0) return false;
+
+  // A global is described by a structure
+  //   size_t beg;
+  //   size_t size;
+  //   size_t size_with_redzone;
+  //   const char *name;
+  //   const char *module_name;
+  //   size_t has_dynamic_init;
+  // We initialize an array of such structures and pass it to a run-time call.
+  StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy,
+                                               IntptrTy, IntptrTy,
+                                               IntptrTy, IntptrTy, NULL);
+  SmallVector<Constant *, 16> Initializers(n), DynamicInit;
+
+
+  Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
+  assert(CtorFunc);
+  IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
+
+  bool HasDynamicallyInitializedGlobals = false;
+
+  GlobalVariable *ModuleName = createPrivateGlobalForString(
+      M, M.getModuleIdentifier());
+  // We shouldn't merge same module names, as this string serves as unique
+  // module ID in runtime.
+  ModuleName->setUnnamedAddr(false);
+
+  for (size_t i = 0; i < n; i++) {
+    static const uint64_t kMaxGlobalRedzone = 1 << 18;
+    GlobalVariable *G = GlobalsToChange[i];
+    PointerType *PtrTy = cast<PointerType>(G->getType());
+    Type *Ty = PtrTy->getElementType();
+    uint64_t SizeInBytes = TD->getTypeAllocSize(Ty);
+    uint64_t MinRZ = RedzoneSize();
+    // MinRZ <= RZ <= kMaxGlobalRedzone
+    // and trying to make RZ to be ~ 1/4 of SizeInBytes.
+    uint64_t RZ = std::max(MinRZ,
+                         std::min(kMaxGlobalRedzone,
+                                  (SizeInBytes / MinRZ / 4) * MinRZ));
+    uint64_t RightRedzoneSize = RZ;
+    // Round up to MinRZ
+    if (SizeInBytes % MinRZ)
+      RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
+    assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
+    Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
+    // Determine whether this global should be poisoned in initialization.
+    bool GlobalHasDynamicInitializer =
+        DynamicallyInitializedGlobals.Contains(G);
+    // Don't check initialization order if this global is blacklisted.
+    GlobalHasDynamicInitializer &= !BL->isInInit(*G);
+
+    StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL);
+    Constant *NewInitializer = ConstantStruct::get(
+        NewTy, G->getInitializer(),
+        Constant::getNullValue(RightRedZoneTy), NULL);
+
+    GlobalVariable *Name = createPrivateGlobalForString(M, G->getName());
+
+    // Create a new global variable with enough space for a redzone.
+    GlobalVariable *NewGlobal = new GlobalVariable(
+        M, NewTy, G->isConstant(), G->getLinkage(),
+        NewInitializer, "", G, G->getThreadLocalMode());
+    NewGlobal->copyAttributesFrom(G);
+    NewGlobal->setAlignment(MinRZ);
+
+    Value *Indices2[2];
+    Indices2[0] = IRB.getInt32(0);
+    Indices2[1] = IRB.getInt32(0);
+
+    G->replaceAllUsesWith(
+        ConstantExpr::getGetElementPtr(NewGlobal, Indices2, true));
+    NewGlobal->takeName(G);
+    G->eraseFromParent();
+
+    Initializers[i] = ConstantStruct::get(
+        GlobalStructTy,
+        ConstantExpr::getPointerCast(NewGlobal, IntptrTy),
+        ConstantInt::get(IntptrTy, SizeInBytes),
+        ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
+        ConstantExpr::getPointerCast(Name, IntptrTy),
+        ConstantExpr::getPointerCast(ModuleName, IntptrTy),
+        ConstantInt::get(IntptrTy, GlobalHasDynamicInitializer),
+        NULL);
+
+    // Populate the first and last globals declared in this TU.
+    if (CheckInitOrder && GlobalHasDynamicInitializer)
+      HasDynamicallyInitializedGlobals = true;
+
+    DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
+  }
+
+  ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n);
+  GlobalVariable *AllGlobals = new GlobalVariable(
+      M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage,
+      ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
+
+  // Create calls for poisoning before initializers run and unpoisoning after.
+  if (CheckInitOrder && HasDynamicallyInitializedGlobals)
+    createInitializerPoisonCalls(M, ModuleName);
+  IRB.CreateCall2(AsanRegisterGlobals,
+                  IRB.CreatePointerCast(AllGlobals, IntptrTy),
+                  ConstantInt::get(IntptrTy, n));
+
+  // We also need to unregister globals at the end, e.g. when a shared library
+  // gets closed.
+  Function *AsanDtorFunction = Function::Create(
+      FunctionType::get(Type::getVoidTy(*C), false),
+      GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
+  BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
+  IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB));
+  IRB_Dtor.CreateCall2(AsanUnregisterGlobals,
+                       IRB.CreatePointerCast(AllGlobals, IntptrTy),
+                       ConstantInt::get(IntptrTy, n));
+  appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndCtorPriority);
+
+  DEBUG(dbgs() << M);
+  return true;
+}
+
+void AddressSanitizer::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(*C);
+  // Create __asan_report* callbacks.
+  for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+    for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+         AccessSizeIndex++) {
+      // IsWrite and TypeSize are encoded in the function name.
+      std::string FunctionName = std::string(kAsanReportErrorTemplate) +
+          (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex);
+      // If we are merging crash callbacks, they have two parameters.
+      AsanErrorCallback[AccessIsWrite][AccessSizeIndex] =
+          checkInterfaceFunction(M.getOrInsertFunction(
+              FunctionName, IRB.getVoidTy(), IntptrTy, NULL));
+    }
+  }
+  AsanErrorCallbackSized[0] = checkInterfaceFunction(M.getOrInsertFunction(
+              kAsanReportLoadN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+  AsanErrorCallbackSized[1] = checkInterfaceFunction(M.getOrInsertFunction(
+              kAsanReportStoreN, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+
+  AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
+  // We insert an empty inline asm after __asan_report* to avoid callback merge.
+  EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+                            StringRef(""), StringRef(""),
+                            /*hasSideEffects=*/true);
+}
+
+void AddressSanitizer::emitShadowMapping(Module &M, IRBuilder<> &IRB) const {
+  // Tell the values of mapping offset and scale to the run-time.
+  GlobalValue *asan_mapping_offset =
+      new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage,
+                     ConstantInt::get(IntptrTy, Mapping.Offset),
+                     kAsanMappingOffsetName);
+  // Read the global, otherwise it may be optimized away.
+  IRB.CreateLoad(asan_mapping_offset, true);
+
+  GlobalValue *asan_mapping_scale =
+      new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage,
+                         ConstantInt::get(IntptrTy, Mapping.Scale),
+                         kAsanMappingScaleName);
+  // Read the global, otherwise it may be optimized away.
+  IRB.CreateLoad(asan_mapping_scale, true);
+}
+
+// virtual
+bool AddressSanitizer::doInitialization(Module &M) {
+  // Initialize the private fields. No one has accessed them before.
+  TD = getAnalysisIfAvailable<DataLayout>();
+
+  if (!TD)
+    return false;
+  BL.reset(new BlackList(BlacklistFile));
+  DynamicallyInitializedGlobals.Init(M);
+
+  C = &(M.getContext());
+  LongSize = TD->getPointerSizeInBits();
+  IntptrTy = Type::getIntNTy(*C, LongSize);
+
+  AsanCtorFunction = Function::Create(
+      FunctionType::get(Type::getVoidTy(*C), false),
+      GlobalValue::InternalLinkage, kAsanModuleCtorName, &M);
+  BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction);
+  // call __asan_init in the module ctor.
+  IRBuilder<> IRB(ReturnInst::Create(*C, AsanCtorBB));
+  AsanInitFunction = checkInterfaceFunction(
+      M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL));
+  AsanInitFunction->setLinkage(Function::ExternalLinkage);
+  IRB.CreateCall(AsanInitFunction);
+
+  Mapping = getShadowMapping(M, LongSize, ZeroBaseShadow);
+  emitShadowMapping(M, IRB);
+
+  appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority);
+  return true;
+}
+
+bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
+  // For each NSObject descendant having a +load method, this method is invoked
+  // by the ObjC runtime before any of the static constructors is called.
+  // Therefore we need to instrument such methods with a call to __asan_init
+  // at the beginning in order to initialize our runtime before any access to
+  // the shadow memory.
+  // We cannot just ignore these methods, because they may call other
+  // instrumented functions.
+  if (F.getName().find(" load]") != std::string::npos) {
+    IRBuilder<> IRB(F.begin()->begin());
+    IRB.CreateCall(AsanInitFunction);
+    return true;
+  }
+  return false;
+}
+
+bool AddressSanitizer::runOnFunction(Function &F) {
+  if (BL->isIn(F)) return false;
+  if (&F == AsanCtorFunction) return false;
+  if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
+  DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
+  initializeCallbacks(*F.getParent());
+
+  // If needed, insert __asan_init before checking for SanitizeAddress attr.
+  maybeInsertAsanInitAtFunctionEntry(F);
+
+  if (!F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::SanitizeAddress))
+    return false;
+
+  if (!ClDebugFunc.empty() && ClDebugFunc != F.getName())
+    return false;
+
+  // We want to instrument every address only once per basic block (unless there
+  // are calls between uses).
+  SmallSet<Value*, 16> TempsToInstrument;
+  SmallVector<Instruction*, 16> ToInstrument;
+  SmallVector<Instruction*, 8> NoReturnCalls;
+  bool IsWrite;
+
+  // Fill the set of memory operations to instrument.
+  for (Function::iterator FI = F.begin(), FE = F.end();
+       FI != FE; ++FI) {
+    TempsToInstrument.clear();
+    int NumInsnsPerBB = 0;
+    for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
+         BI != BE; ++BI) {
+      if (LooksLikeCodeInBug11395(BI)) return false;
+      if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite)) {
+        if (ClOpt && ClOptSameTemp) {
+          if (!TempsToInstrument.insert(Addr))
+            continue;  // We've seen this temp in the current BB.
+        }
+      } else if (isa<MemIntrinsic>(BI) && ClMemIntrin) {
+        // ok, take it.
+      } else {
+        CallSite CS(BI);
+        if (CS) {
+          // A call inside BB.
+          TempsToInstrument.clear();
+          if (CS.doesNotReturn())
+            NoReturnCalls.push_back(CS.getInstruction());
+        }
+        continue;
+      }
+      ToInstrument.push_back(BI);
+      NumInsnsPerBB++;
+      if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB)
+        break;
+    }
+  }
+
+  // Instrument.
+  int NumInstrumented = 0;
+  for (size_t i = 0, n = ToInstrument.size(); i != n; i++) {
+    Instruction *Inst = ToInstrument[i];
+    if (ClDebugMin < 0 || ClDebugMax < 0 ||
+        (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
+      if (isInterestingMemoryAccess(Inst, &IsWrite))
+        instrumentMop(Inst);
+      else
+        instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+    }
+    NumInstrumented++;
+  }
+
+  FunctionStackPoisoner FSP(F, *this);
+  bool ChangedStack = FSP.runOnFunction();
+
+  // We must unpoison the stack before every NoReturn call (throw, _exit, etc).
+  // See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37
+  for (size_t i = 0, n = NoReturnCalls.size(); i != n; i++) {
+    Instruction *CI = NoReturnCalls[i];
+    IRBuilder<> IRB(CI);
+    IRB.CreateCall(AsanHandleNoReturnFunc);
+  }
+  DEBUG(dbgs() << "ASAN done instrumenting:\n" << F << "\n");
+
+  return NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty();
+}
+
+static uint64_t ValueForPoison(uint64_t PoisonByte, size_t ShadowRedzoneSize) {
+  if (ShadowRedzoneSize == 1) return PoisonByte;
+  if (ShadowRedzoneSize == 2) return (PoisonByte << 8) + PoisonByte;
+  if (ShadowRedzoneSize == 4)
+    return (PoisonByte << 24) + (PoisonByte << 16) +
+        (PoisonByte << 8) + (PoisonByte);
+  llvm_unreachable("ShadowRedzoneSize is either 1, 2 or 4");
+}
+
+static void PoisonShadowPartialRightRedzone(uint8_t *Shadow,
+                                            size_t Size,
+                                            size_t RZSize,
+                                            size_t ShadowGranularity,
+                                            uint8_t Magic) {
+  for (size_t i = 0; i < RZSize;
+       i+= ShadowGranularity, Shadow++) {
+    if (i + ShadowGranularity <= Size) {
+      *Shadow = 0;  // fully addressable
+    } else if (i >= Size) {
+      *Shadow = Magic;  // unaddressable
+    } else {
+      *Shadow = Size - i;  // first Size-i bytes are addressable
+    }
+  }
+}
+
+// Workaround for bug 11395: we don't want to instrument stack in functions
+// with large assembly blobs (32-bit only), otherwise reg alloc may crash.
+// FIXME: remove once the bug 11395 is fixed.
+bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
+  if (LongSize != 32) return false;
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->isInlineAsm()) return false;
+  if (CI->getNumArgOperands() <= 5) return false;
+  // We have inline assembly with quite a few arguments.
+  return true;
+}
+
+void FunctionStackPoisoner::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(*C);
+  AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL));
+  AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanStackFreeName, IRB.getVoidTy(),
+      IntptrTy, IntptrTy, IntptrTy, NULL));
+  AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+  AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+}
+
+void FunctionStackPoisoner::poisonRedZones(
+  const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase,
+  bool DoPoison) {
+  size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale;
+  assert(ShadowRZSize >= 1 && ShadowRZSize <= 4);
+  Type *RZTy = Type::getIntNTy(*C, ShadowRZSize * 8);
+  Type *RZPtrTy = PointerType::get(RZTy, 0);
+
+  Value *PoisonLeft  = ConstantInt::get(RZTy,
+    ValueForPoison(DoPoison ? kAsanStackLeftRedzoneMagic : 0LL, ShadowRZSize));
+  Value *PoisonMid   = ConstantInt::get(RZTy,
+    ValueForPoison(DoPoison ? kAsanStackMidRedzoneMagic : 0LL, ShadowRZSize));
+  Value *PoisonRight = ConstantInt::get(RZTy,
+    ValueForPoison(DoPoison ? kAsanStackRightRedzoneMagic : 0LL, ShadowRZSize));
+
+  // poison the first red zone.
+  IRB.CreateStore(PoisonLeft, IRB.CreateIntToPtr(ShadowBase, RZPtrTy));
+
+  // poison all other red zones.
+  uint64_t Pos = RedzoneSize();
+  for (size_t i = 0, n = AllocaVec.size(); i < n; i++) {
+    AllocaInst *AI = AllocaVec[i];
+    uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
+    uint64_t AlignedSize = getAlignedAllocaSize(AI);
+    assert(AlignedSize - SizeInBytes < RedzoneSize());
+    Value *Ptr = NULL;
+
+    Pos += AlignedSize;
+
+    assert(ShadowBase->getType() == IntptrTy);
+    if (SizeInBytes < AlignedSize) {
+      // Poison the partial redzone at right
+      Ptr = IRB.CreateAdd(
+          ShadowBase, ConstantInt::get(IntptrTy,
+                                       (Pos >> Mapping.Scale) - ShadowRZSize));
+      size_t AddressableBytes = RedzoneSize() - (AlignedSize - SizeInBytes);
+      uint32_t Poison = 0;
+      if (DoPoison) {
+        PoisonShadowPartialRightRedzone((uint8_t*)&Poison, AddressableBytes,
+                                        RedzoneSize(),
+                                        1ULL << Mapping.Scale,
+                                        kAsanStackPartialRedzoneMagic);
+      }
+      Value *PartialPoison = ConstantInt::get(RZTy, Poison);
+      IRB.CreateStore(PartialPoison, IRB.CreateIntToPtr(Ptr, RZPtrTy));
+    }
+
+    // Poison the full redzone at right.
+    Ptr = IRB.CreateAdd(ShadowBase,
+                        ConstantInt::get(IntptrTy, Pos >> Mapping.Scale));
+    bool LastAlloca = (i == AllocaVec.size() - 1);
+    Value *Poison = LastAlloca ? PoisonRight : PoisonMid;
+    IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, RZPtrTy));
+
+    Pos += RedzoneSize();
+  }
+}
+
+void FunctionStackPoisoner::poisonStack() {
+  uint64_t LocalStackSize = TotalStackSize +
+                            (AllocaVec.size() + 1) * RedzoneSize();
+
+  bool DoStackMalloc = ASan.CheckUseAfterReturn
+      && LocalStackSize <= kMaxStackMallocSize;
+
+  assert(AllocaVec.size() > 0);
+  Instruction *InsBefore = AllocaVec[0];
+  IRBuilder<> IRB(InsBefore);
+
+
+  Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize);
+  AllocaInst *MyAlloca =
+      new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore);
+  if (ClRealignStack && StackAlignment < RedzoneSize())
+    StackAlignment = RedzoneSize();
+  MyAlloca->setAlignment(StackAlignment);
+  assert(MyAlloca->isStaticAlloca());
+  Value *OrigStackBase = IRB.CreatePointerCast(MyAlloca, IntptrTy);
+  Value *LocalStackBase = OrigStackBase;
+
+  if (DoStackMalloc) {
+    LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc,
+        ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase);
+  }
+
+  // This string will be parsed by the run-time (DescribeAddressIfStack).
+  SmallString<2048> StackDescriptionStorage;
+  raw_svector_ostream StackDescription(StackDescriptionStorage);
+  StackDescription << AllocaVec.size() << " ";
+
+  // Insert poison calls for lifetime intrinsics for alloca.
+  bool HavePoisonedAllocas = false;
+  for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) {
+    const AllocaPoisonCall &APC = AllocaPoisonCallVec[i];
+    IntrinsicInst *II = APC.InsBefore;
+    AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
+    assert(AI);
+    IRBuilder<> IRB(II);
+    poisonAlloca(AI, APC.Size, IRB, APC.DoPoison);
+    HavePoisonedAllocas |= APC.DoPoison;
+  }
+
+  uint64_t Pos = RedzoneSize();
+  // Replace Alloca instructions with base+offset.
+  for (size_t i = 0, n = AllocaVec.size(); i < n; i++) {
+    AllocaInst *AI = AllocaVec[i];
+    uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
+    StringRef Name = AI->getName();
+    StackDescription << Pos << " " << SizeInBytes << " "
+                     << Name.size() << " " << Name << " ";
+    uint64_t AlignedSize = getAlignedAllocaSize(AI);
+    assert((AlignedSize % RedzoneSize()) == 0);
+    Value *NewAllocaPtr = IRB.CreateIntToPtr(
+            IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Pos)),
+            AI->getType());
+    replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB);
+    AI->replaceAllUsesWith(NewAllocaPtr);
+    Pos += AlignedSize + RedzoneSize();
+  }
+  assert(Pos == LocalStackSize);
+
+  // The left-most redzone has enough space for at least 4 pointers.
+  // Write the Magic value to redzone[0].
+  Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
+  IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
+                  BasePlus0);
+  // Write the frame description constant to redzone[1].
+  Value *BasePlus1 = IRB.CreateIntToPtr(
+    IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, ASan.LongSize/8)),
+    IntptrPtrTy);
+  GlobalVariable *StackDescriptionGlobal =
+      createPrivateGlobalForString(*F.getParent(), StackDescription.str());
+  Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal,
+                                             IntptrTy);
+  IRB.CreateStore(Description, BasePlus1);
+  // Write the PC to redzone[2].
+  Value *BasePlus2 = IRB.CreateIntToPtr(
+    IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy,
+                                                   2 * ASan.LongSize/8)),
+    IntptrPtrTy);
+  IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
+
+  // Poison the stack redzones at the entry.
+  Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
+  poisonRedZones(AllocaVec, IRB, ShadowBase, true);
+
+  // Unpoison the stack before all ret instructions.
+  for (size_t i = 0, n = RetVec.size(); i < n; i++) {
+    Instruction *Ret = RetVec[i];
+    IRBuilder<> IRBRet(Ret);
+    // Mark the current frame as retired.
+    IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
+                       BasePlus0);
+    // Unpoison the stack.
+    poisonRedZones(AllocaVec, IRBRet, ShadowBase, false);
+    if (DoStackMalloc) {
+      // In use-after-return mode, mark the whole stack frame unaddressable.
+      IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase,
+                         ConstantInt::get(IntptrTy, LocalStackSize),
+                         OrigStackBase);
+    } else if (HavePoisonedAllocas) {
+      // If we poisoned some allocas in llvm.lifetime analysis,
+      // unpoison whole stack frame now.
+      assert(LocalStackBase == OrigStackBase);
+      poisonAlloca(LocalStackBase, LocalStackSize, IRBRet, false);
+    }
+  }
+
+  // We are done. Remove the old unused alloca instructions.
+  for (size_t i = 0, n = AllocaVec.size(); i < n; i++)
+    AllocaVec[i]->eraseFromParent();
+}
+
+void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
+                                         IRBuilder<> IRB, bool DoPoison) {
+  // For now just insert the call to ASan runtime.
+  Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
+  Value *SizeArg = ConstantInt::get(IntptrTy, Size);
+  IRB.CreateCall2(DoPoison ? AsanPoisonStackMemoryFunc
+                           : AsanUnpoisonStackMemoryFunc,
+                  AddrArg, SizeArg);
+}
+
+// Handling llvm.lifetime intrinsics for a given %alloca:
+// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca.
+// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect
+//     invalid accesses) and unpoison it for llvm.lifetime.start (the memory
+//     could be poisoned by previous llvm.lifetime.end instruction, as the
+//     variable may go in and out of scope several times, e.g. in loops).
+// (3) if we poisoned at least one %alloca in a function,
+//     unpoison the whole stack frame at function exit.
+
+AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(V))
+    // We're intested only in allocas we can handle.
+    return isInterestingAlloca(*AI) ? AI : 0;
+  // See if we've already calculated (or started to calculate) alloca for a
+  // given value.
+  AllocaForValueMapTy::iterator I = AllocaForValue.find(V);
+  if (I != AllocaForValue.end())
+    return I->second;
+  // Store 0 while we're calculating alloca for value V to avoid
+  // infinite recursion if the value references itself.
+  AllocaForValue[V] = 0;
+  AllocaInst *Res = 0;
+  if (CastInst *CI = dyn_cast<CastInst>(V))
+    Res = findAllocaForValue(CI->getOperand(0));
+  else if (PHINode *PN = dyn_cast<PHINode>(V)) {
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      Value *IncValue = PN->getIncomingValue(i);
+      // Allow self-referencing phi-nodes.
+      if (IncValue == PN) continue;
+      AllocaInst *IncValueAI = findAllocaForValue(IncValue);
+      // AI for incoming values should exist and should all be equal.
+      if (IncValueAI == 0 || (Res != 0 && IncValueAI != Res))
+        return 0;
+      Res = IncValueAI;
+    }
+  }
+  if (Res != 0)
+    AllocaForValue[V] = Res;
+  return Res;
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp
new file mode 100644
index 000000000000..927982d2af47
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/BlackList.cpp
@@ -0,0 +1,125 @@
+//===-- BlackList.cpp - blacklist for sanitizers --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a utility class for instrumentation passes (like AddressSanitizer
+// or ThreadSanitizer) to avoid instrumenting some functions or global
+// variables based on a user-supplied blacklist.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BlackList.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <string>
+#include <utility>
+
+namespace llvm {
+
+BlackList::BlackList(const StringRef Path) {
+  // Validate and open blacklist file.
+  if (Path.empty()) return;
+  OwningPtr<MemoryBuffer> File;
+  if (error_code EC = MemoryBuffer::getFile(Path, File)) {
+    report_fatal_error("Can't open blacklist file: " + Path + ": " +
+                       EC.message());
+  }
+
+  // Iterate through each line in the blacklist file.
+  SmallVector<StringRef, 16> Lines;
+  SplitString(File.take()->getBuffer(), Lines, "\n\r");
+  StringMap<std::string> Regexps;
+  for (SmallVector<StringRef, 16>::iterator I = Lines.begin(), E = Lines.end();
+       I != E; ++I) {
+    // Ignore empty lines and lines starting with "#"
+    if (I->empty() || I->startswith("#"))
+      continue;
+    // Get our prefix and unparsed regexp.
+    std::pair<StringRef, StringRef> SplitLine = I->split(":");
+    StringRef Prefix = SplitLine.first;
+    std::string Regexp = SplitLine.second;
+    if (Regexp.empty()) {
+      // Missing ':' in the line.
+      report_fatal_error("malformed blacklist line: " + SplitLine.first);
+    }
+
+    // Replace * with .*
+    for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos;
+         pos += strlen(".*")) {
+      Regexp.replace(pos, strlen("*"), ".*");
+    }
+
+    // Check that the regexp is valid.
+    Regex CheckRE(Regexp);
+    std::string Error;
+    if (!CheckRE.isValid(Error)) {
+      report_fatal_error("malformed blacklist regex: " + SplitLine.second +
+          ": " + Error);
+    }
+
+    // Add this regexp into the proper group by its prefix.
+    if (!Regexps[Prefix].empty())
+      Regexps[Prefix] += "|";
+    Regexps[Prefix] += Regexp;
+  }
+
+  // Iterate through each of the prefixes, and create Regexs for them.
+  for (StringMap<std::string>::const_iterator I = Regexps.begin(),
+       E = Regexps.end(); I != E; ++I) {
+    Entries[I->getKey()] = new Regex(I->getValue());
+  }
+}
+
+bool BlackList::isIn(const Function &F) const {
+  return isIn(*F.getParent()) || inSection("fun", F.getName());
+}
+
+bool BlackList::isIn(const GlobalVariable &G) const {
+  return isIn(*G.getParent()) || inSection("global", G.getName());
+}
+
+bool BlackList::isIn(const Module &M) const {
+  return inSection("src", M.getModuleIdentifier());
+}
+
+static StringRef GetGVTypeString(const GlobalVariable &G) {
+  // Types of GlobalVariables are always pointer types.
+  Type *GType = G.getType()->getElementType();
+  // For now we support blacklisting struct types only.
+  if (StructType *SGType = dyn_cast<StructType>(GType)) {
+    if (!SGType->isLiteral())
+      return SGType->getName();
+  }
+  return "<unknown type>";
+}
+
+bool BlackList::isInInit(const GlobalVariable &G) const {
+  return (isIn(*G.getParent()) ||
+          inSection("global-init", G.getName()) ||
+          inSection("global-init-type", GetGVTypeString(G)));
+}
+
+bool BlackList::inSection(const StringRef Section,
+                          const StringRef Query) const {
+  StringMap<Regex*>::const_iterator I = Entries.find(Section);
+  if (I == Entries.end()) return false;
+
+  Regex *FunctionRegex = I->getValue();
+  return FunctionRegex->match(Query);
+}
+
+}  // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
new file mode 100644
index 000000000000..b094d42568f0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -0,0 +1,212 @@
+//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that instruments the code to perform run-time
+// bounds checking on loads, stores, and other memory intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "bounds-checking"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/TargetFolder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+using namespace llvm;
+
+static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
+                                  cl::desc("Use one trap block per function"));
+
+STATISTIC(ChecksAdded, "Bounds checks added");
+STATISTIC(ChecksSkipped, "Bounds checks skipped");
+STATISTIC(ChecksUnable, "Bounds checks unable to add");
+
+typedef IRBuilder<true, TargetFolder> BuilderTy;
+
+namespace {
+  struct BoundsChecking : public FunctionPass {
+    static char ID;
+
+    BoundsChecking() : FunctionPass(ID) {
+      initializeBoundsCheckingPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DataLayout>();
+      AU.addRequired<TargetLibraryInfo>();
+    }
+
+  private:
+    const DataLayout *TD;
+    const TargetLibraryInfo *TLI;
+    ObjectSizeOffsetEvaluator *ObjSizeEval;
+    BuilderTy *Builder;
+    Instruction *Inst;
+    BasicBlock *TrapBB;
+
+    BasicBlock *getTrapBB();
+    void emitBranchToTrap(Value *Cmp = 0);
+    bool computeAllocSize(Value *Ptr, APInt &Offset, Value* &OffsetValue,
+                          APInt &Size, Value* &SizeValue);
+    bool instrument(Value *Ptr, Value *Val);
+ };
+}
+
+char BoundsChecking::ID = 0;
+INITIALIZE_PASS(BoundsChecking, "bounds-checking", "Run-time bounds checking",
+                false, false)
+
+
+/// getTrapBB - create a basic block that traps. All overflowing conditions
+/// branch to this block. There's only one trap block per function.
+BasicBlock *BoundsChecking::getTrapBB() {
+  if (TrapBB && SingleTrapBB)
+    return TrapBB;
+
+  Function *Fn = Inst->getParent()->getParent();
+  BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint();
+  TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
+  Builder->SetInsertPoint(TrapBB);
+
+  llvm::Value *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap);
+  CallInst *TrapCall = Builder->CreateCall(F);
+  TrapCall->setDoesNotReturn();
+  TrapCall->setDoesNotThrow();
+  TrapCall->setDebugLoc(Inst->getDebugLoc());
+  Builder->CreateUnreachable();
+
+  Builder->SetInsertPoint(PrevInsertPoint);
+  return TrapBB;
+}
+
+
+/// emitBranchToTrap - emit a branch instruction to a trap block.
+/// If Cmp is non-null, perform a jump only if its value evaluates to true.
+void BoundsChecking::emitBranchToTrap(Value *Cmp) {
+  // check if the comparison is always false
+  ConstantInt *C = dyn_cast_or_null<ConstantInt>(Cmp);
+  if (C) {
+    ++ChecksSkipped;
+    if (!C->getZExtValue())
+      return;
+    else
+      Cmp = 0; // unconditional branch
+  }
+  ++ChecksAdded;
+
+  Instruction *Inst = Builder->GetInsertPoint();
+  BasicBlock *OldBB = Inst->getParent();
+  BasicBlock *Cont = OldBB->splitBasicBlock(Inst);
+  OldBB->getTerminator()->eraseFromParent();
+
+  if (Cmp)
+    BranchInst::Create(getTrapBB(), Cont, Cmp, OldBB);
+  else
+    BranchInst::Create(getTrapBB(), OldBB);
+}
+
+
+/// instrument - adds run-time bounds checks to memory accessing instructions.
+/// Ptr is the pointer that will be read/written, and InstVal is either the
+/// result from the load or the value being stored. It is used to determine the
+/// size of memory block that is touched.
+/// Returns true if any change was made to the IR, false otherwise.
+bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
+  uint64_t NeededSize = TD->getTypeStoreSize(InstVal->getType());
+  DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
+              << " bytes\n");
+
+  SizeOffsetEvalType SizeOffset = ObjSizeEval->compute(Ptr);
+
+  if (!ObjSizeEval->bothKnown(SizeOffset)) {
+    ++ChecksUnable;
+    return false;
+  }
+
+  Value *Size   = SizeOffset.first;
+  Value *Offset = SizeOffset.second;
+  ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
+
+  Type *IntTy = TD->getIntPtrType(Ptr->getType());
+  Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
+
+  // three checks are required to ensure safety:
+  // . Offset >= 0  (since the offset is given from the base ptr)
+  // . Size >= Offset  (unsigned)
+  // . Size - Offset >= NeededSize  (unsigned)
+  //
+  // optimization: if Size >= 0 (signed), skip 1st check
+  // FIXME: add NSW/NUW here?  -- we dont care if the subtraction overflows
+  Value *ObjSize = Builder->CreateSub(Size, Offset);
+  Value *Cmp2 = Builder->CreateICmpULT(Size, Offset);
+  Value *Cmp3 = Builder->CreateICmpULT(ObjSize, NeededSizeVal);
+  Value *Or = Builder->CreateOr(Cmp2, Cmp3);
+  if (!SizeCI || SizeCI->getValue().slt(0)) {
+    Value *Cmp1 = Builder->CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
+    Or = Builder->CreateOr(Cmp1, Or);
+  }
+  emitBranchToTrap(Or);
+
+  return true;
+}
+
+bool BoundsChecking::runOnFunction(Function &F) {
+  TD = &getAnalysis<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+
+  TrapBB = 0;
+  BuilderTy TheBuilder(F.getContext(), TargetFolder(TD));
+  Builder = &TheBuilder;
+  ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext());
+  ObjSizeEval = &TheObjSizeEval;
+
+  // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
+  // touching instructions
+  std::vector<Instruction*> WorkList;
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &*i;
+    if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<AtomicCmpXchgInst>(I) ||
+        isa<AtomicRMWInst>(I))
+        WorkList.push_back(I);
+  }
+
+  bool MadeChange = false;
+  for (std::vector<Instruction*>::iterator i = WorkList.begin(),
+       e = WorkList.end(); i != e; ++i) {
+    Inst = *i;
+
+    Builder->SetInsertPoint(Inst);
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+      MadeChange |= instrument(LI->getPointerOperand(), LI);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      MadeChange |= instrument(SI->getPointerOperand(), SI->getValueOperand());
+    } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+      MadeChange |= instrument(AI->getPointerOperand(),AI->getCompareOperand());
+    } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) {
+      MadeChange |= instrument(AI->getPointerOperand(), AI->getValOperand());
+    } else {
+      llvm_unreachable("unknown Instruction type");
+    }
+  }
+  return MadeChange;
+}
+
+FunctionPass *llvm::createBoundsCheckingPass() {
+  return new BoundsChecking();
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp
new file mode 100644
index 000000000000..a2459fbafe18
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/EdgeProfiling.cpp
@@ -0,0 +1,117 @@
+//===- EdgeProfiling.cpp - Insert counters for edge profiling -------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass instruments the specified program with counters for edge profiling.
+// Edge profiling can give a reasonable approximation of the hot paths through a
+// program, and is used for a wide variety of program transformations.
+//
+// Note that this implementation is very naive.  We insert a counter for *every*
+// edge in the program, instead of using control flow information to prune the
+// number of counters inserted.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "insert-edge-profiling"
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "ProfilingUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumEdgesInserted, "The # of edges inserted.");
+
+namespace {
+  class EdgeProfiler : public ModulePass {
+    bool runOnModule(Module &M);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    EdgeProfiler() : ModulePass(ID) {
+      initializeEdgeProfilerPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual const char *getPassName() const {
+      return "Edge Profiler";
+    }
+  };
+}
+
+char EdgeProfiler::ID = 0;
+INITIALIZE_PASS(EdgeProfiler, "insert-edge-profiling",
+                "Insert instrumentation for edge profiling", false, false)
+
+ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); }
+
+bool EdgeProfiler::runOnModule(Module &M) {
+  Function *Main = M.getFunction("main");
+  if (Main == 0) {
+    errs() << "WARNING: cannot insert edge profiling into a module"
+           << " with no main function!\n";
+    return false;  // No main, no instrumentation!
+  }
+
+  std::set<BasicBlock*> BlocksToInstrument;
+  unsigned NumEdges = 0;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration()) continue;
+    // Reserve space for (0,entry) edge.
+    ++NumEdges;
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      // Keep track of which blocks need to be instrumented.  We don't want to
+      // instrument blocks that are added as the result of breaking critical
+      // edges!
+      BlocksToInstrument.insert(BB);
+      NumEdges += BB->getTerminator()->getNumSuccessors();
+    }
+  }
+
+  Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges);
+  GlobalVariable *Counters =
+    new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
+                       Constant::getNullValue(ATy), "EdgeProfCounters");
+  NumEdgesInserted = NumEdges;
+
+  // Instrument all of the edges...
+  unsigned i = 0;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration()) continue;
+    // Create counter for (0,entry) edge.
+    IncrementCounterInBlock(&F->getEntryBlock(), i++, Counters);
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      if (BlocksToInstrument.count(BB)) {  // Don't instrument inserted blocks
+        // Okay, we have to add a counter of each outgoing edge.  If the
+        // outgoing edge is not critical don't split it, just insert the counter
+        // in the source or destination of the edge.
+        TerminatorInst *TI = BB->getTerminator();
+        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
+          // If the edge is critical, split it.
+          SplitCriticalEdge(TI, s, this);
+
+          // Okay, we are guaranteed that the edge is no longer critical.  If we
+          // only have a single successor, insert the counter in this block,
+          // otherwise insert it in the successor block.
+          if (TI->getNumSuccessors() == 1) {
+            // Insert counter at the start of the block
+            IncrementCounterInBlock(BB, i++, Counters, false);
+          } else {
+            // Insert counter at the start of the block
+            IncrementCounterInBlock(TI->getSuccessor(s), i++, Counters);
+          }
+        }
+      }
+  }
+
+  // Add the initialization call to main.
+  InsertProfilingInitCall(Main, "llvm_start_edge_profiling", Counters);
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
new file mode 100644
index 000000000000..2edd151869e0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -0,0 +1,855 @@
+//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements GCOV-style profiling. When this pass is run it emits
+// "gcno" files next to the existing source, and instruments the code that runs
+// to records the edges between blocks that run and emit a complementary "gcda"
+// file on exit.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "insert-gcov-profiling"
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "ProfilingUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/PathV2.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <string>
+#include <utility>
+using namespace llvm;
+
+static cl::opt<std::string>
+DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden,
+                   cl::ValueRequired);
+
+GCOVOptions GCOVOptions::getDefault() {
+  GCOVOptions Options;
+  Options.EmitNotes = true;
+  Options.EmitData = true;
+  Options.UseCfgChecksum = false;
+  Options.NoRedZone = false;
+  Options.FunctionNamesInData = true;
+
+  if (DefaultGCOVVersion.size() != 4) {
+    llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
+                             DefaultGCOVVersion);
+  }
+  memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4);
+  return Options;
+}
+
+namespace {
+  class GCOVProfiler : public ModulePass {
+  public:
+    static char ID;
+    GCOVProfiler() : ModulePass(ID), Options(GCOVOptions::getDefault()) {
+      ReversedVersion[0] = Options.Version[3];
+      ReversedVersion[1] = Options.Version[2];
+      ReversedVersion[2] = Options.Version[1];
+      ReversedVersion[3] = Options.Version[0];
+      ReversedVersion[4] = '\0';
+      initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
+    }
+    GCOVProfiler(const GCOVOptions &Options) : ModulePass(ID), Options(Options){
+      assert((Options.EmitNotes || Options.EmitData) &&
+             "GCOVProfiler asked to do nothing?");
+      ReversedVersion[0] = Options.Version[3];
+      ReversedVersion[1] = Options.Version[2];
+      ReversedVersion[2] = Options.Version[1];
+      ReversedVersion[3] = Options.Version[0];
+      ReversedVersion[4] = '\0';
+      initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
+    }
+    virtual const char *getPassName() const {
+      return "GCOV Profiler";
+    }
+
+  private:
+    bool runOnModule(Module &M);
+
+    // Create the .gcno files for the Module based on DebugInfo.
+    void emitProfileNotes();
+
+    // Modify the program to track transitions along edges and call into the
+    // profiling runtime to emit .gcda files when run.
+    bool emitProfileArcs();
+
+    // Get pointers to the functions in the runtime library.
+    Constant *getStartFileFunc();
+    Constant *getIncrementIndirectCounterFunc();
+    Constant *getEmitFunctionFunc();
+    Constant *getEmitArcsFunc();
+    Constant *getDeleteWriteoutFunctionListFunc();
+    Constant *getDeleteFlushFunctionListFunc();
+    Constant *getEndFileFunc();
+
+    // Create or retrieve an i32 state value that is used to represent the
+    // pred block number for certain non-trivial edges.
+    GlobalVariable *getEdgeStateValue();
+
+    // Produce a table of pointers to counters, by predecessor and successor
+    // block number.
+    GlobalVariable *buildEdgeLookupTable(Function *F,
+                                         GlobalVariable *Counter,
+                                         const UniqueVector<BasicBlock *>&Preds,
+                                         const UniqueVector<BasicBlock*>&Succs);
+
+    // Add the function to write out all our counters to the global destructor
+    // list.
+    Function *insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*,
+                                                       MDNode*> >);
+    Function *insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
+    void insertIndirectCounterIncrement();
+
+    std::string mangleName(DICompileUnit CU, const char *NewStem);
+
+    GCOVOptions Options;
+
+    // Reversed, NUL-terminated copy of Options.Version.
+    char ReversedVersion[5];  
+
+    Module *M;
+    LLVMContext *Ctx;
+  };
+}
+
+char GCOVProfiler::ID = 0;
+INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling",
+                "Insert instrumentation for GCOV profiling", false, false)
+
+ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
+  return new GCOVProfiler(Options);
+}
+
+static std::string getFunctionName(DISubprogram SP) {
+  if (!SP.getLinkageName().empty())
+    return SP.getLinkageName();
+  return SP.getName();
+}
+
+namespace {
+  class GCOVRecord {
+   protected:
+    static const char *LinesTag;
+    static const char *FunctionTag;
+    static const char *BlockTag;
+    static const char *EdgeTag;
+
+    GCOVRecord() {}
+
+    void writeBytes(const char *Bytes, int Size) {
+      os->write(Bytes, Size);
+    }
+
+    void write(uint32_t i) {
+      writeBytes(reinterpret_cast<char*>(&i), 4);
+    }
+
+    // Returns the length measured in 4-byte blocks that will be used to
+    // represent this string in a GCOV file
+    unsigned lengthOfGCOVString(StringRef s) {
+      // A GCOV string is a length, followed by a NUL, then between 0 and 3 NULs
+      // padding out to the next 4-byte word. The length is measured in 4-byte
+      // words including padding, not bytes of actual string.
+      return (s.size() / 4) + 1;
+    }
+
+    void writeGCOVString(StringRef s) {
+      uint32_t Len = lengthOfGCOVString(s);
+      write(Len);
+      writeBytes(s.data(), s.size());
+
+      // Write 1 to 4 bytes of NUL padding.
+      assert((unsigned)(4 - (s.size() % 4)) > 0);
+      assert((unsigned)(4 - (s.size() % 4)) <= 4);
+      writeBytes("\0\0\0\0", 4 - (s.size() % 4));
+    }
+
+    raw_ostream *os;
+  };
+  const char *GCOVRecord::LinesTag = "\0\0\x45\x01";
+  const char *GCOVRecord::FunctionTag = "\0\0\0\1";
+  const char *GCOVRecord::BlockTag = "\0\0\x41\x01";
+  const char *GCOVRecord::EdgeTag = "\0\0\x43\x01";
+
+  class GCOVFunction;
+  class GCOVBlock;
+
+  // Constructed only by requesting it from a GCOVBlock, this object stores a
+  // list of line numbers and a single filename, representing lines that belong
+  // to the block.
+  class GCOVLines : public GCOVRecord {
+   public:
+    void addLine(uint32_t Line) {
+      Lines.push_back(Line);
+    }
+
+    uint32_t length() {
+      // Here 2 = 1 for string length + 1 for '0' id#.
+      return lengthOfGCOVString(Filename) + 2 + Lines.size();
+    }
+
+    void writeOut() {
+      write(0);
+      writeGCOVString(Filename);
+      for (int i = 0, e = Lines.size(); i != e; ++i)
+        write(Lines[i]);
+    }
+
+    GCOVLines(StringRef F, raw_ostream *os) 
+      : Filename(F) {
+      this->os = os;
+    }
+
+   private:
+    StringRef Filename;
+    SmallVector<uint32_t, 32> Lines;
+  };
+
+  // Represent a basic block in GCOV. Each block has a unique number in the
+  // function, number of lines belonging to each block, and a set of edges to
+  // other blocks.
+  class GCOVBlock : public GCOVRecord {
+   public:
+    GCOVLines &getFile(StringRef Filename) {
+      GCOVLines *&Lines = LinesByFile[Filename];
+      if (!Lines) {
+        Lines = new GCOVLines(Filename, os);
+      }
+      return *Lines;
+    }
+
+    void addEdge(GCOVBlock &Successor) {
+      OutEdges.push_back(&Successor);
+    }
+
+    void writeOut() {
+      uint32_t Len = 3;
+      for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(),
+               E = LinesByFile.end(); I != E; ++I) {
+        Len += I->second->length();
+      }
+
+      writeBytes(LinesTag, 4);
+      write(Len);
+      write(Number);
+      for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(),
+               E = LinesByFile.end(); I != E; ++I) 
+        I->second->writeOut();
+      write(0);
+      write(0);
+    }
+
+    ~GCOVBlock() {
+      DeleteContainerSeconds(LinesByFile);
+    }
+
+   private:
+    friend class GCOVFunction;
+
+    GCOVBlock(uint32_t Number, raw_ostream *os)
+        : Number(Number) {
+      this->os = os;
+    }
+
+    uint32_t Number;
+    StringMap<GCOVLines *> LinesByFile;
+    SmallVector<GCOVBlock *, 4> OutEdges;
+  };
+
+  // A function has a unique identifier, a checksum (we leave as zero) and a
+  // set of blocks and a map of edges between blocks. This is the only GCOV
+  // object users can construct, the blocks and lines will be rooted here.
+  class GCOVFunction : public GCOVRecord {
+   public:
+    GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident,
+                 bool UseCfgChecksum) {
+      this->os = os;
+
+      Function *F = SP.getFunction();
+      DEBUG(dbgs() << "Function: " << F->getName() << "\n");
+      uint32_t i = 0;
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+        Blocks[BB] = new GCOVBlock(i++, os);
+      }
+      ReturnBlock = new GCOVBlock(i++, os);
+
+      writeBytes(FunctionTag, 4);
+      uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) +
+          1 + lengthOfGCOVString(SP.getFilename()) + 1;
+      if (UseCfgChecksum)
+        ++BlockLen;
+      write(BlockLen);
+      write(Ident);
+      write(0);  // lineno checksum
+      if (UseCfgChecksum)
+        write(0);  // cfg checksum
+      writeGCOVString(getFunctionName(SP));
+      writeGCOVString(SP.getFilename());
+      write(SP.getLineNumber());
+    }
+
+    ~GCOVFunction() {
+      DeleteContainerSeconds(Blocks);
+      delete ReturnBlock;
+    }
+
+    GCOVBlock &getBlock(BasicBlock *BB) {
+      return *Blocks[BB];
+    }
+
+    GCOVBlock &getReturnBlock() {
+      return *ReturnBlock;
+    }
+
+    void writeOut() {
+      // Emit count of blocks.
+      writeBytes(BlockTag, 4);
+      write(Blocks.size() + 1);
+      for (int i = 0, e = Blocks.size() + 1; i != e; ++i) {
+        write(0);  // No flags on our blocks.
+      }
+      DEBUG(dbgs() << Blocks.size() << " blocks.\n");
+
+      // Emit edges between blocks.
+      for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(),
+               E = Blocks.end(); I != E; ++I) {
+        GCOVBlock &Block = *I->second;
+        if (Block.OutEdges.empty()) continue;
+
+        writeBytes(EdgeTag, 4);
+        write(Block.OutEdges.size() * 2 + 1);
+        write(Block.Number);
+        for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) {
+          DEBUG(dbgs() << Block.Number << " -> " << Block.OutEdges[i]->Number
+                       << "\n");
+          write(Block.OutEdges[i]->Number);
+          write(0);  // no flags
+        }
+      }
+
+      // Emit lines for each block.
+      for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(),
+               E = Blocks.end(); I != E; ++I) {
+        I->second->writeOut();
+      }
+    }
+
+   private:
+    DenseMap<BasicBlock *, GCOVBlock *> Blocks;
+    GCOVBlock *ReturnBlock;
+  };
+}
+
+std::string GCOVProfiler::mangleName(DICompileUnit CU, const char *NewStem) {
+  if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
+    for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
+      MDNode *N = GCov->getOperand(i);
+      if (N->getNumOperands() != 2) continue;
+      MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
+      MDNode *CompileUnit = dyn_cast<MDNode>(N->getOperand(1));
+      if (!GCovFile || !CompileUnit) continue;
+      if (CompileUnit == CU) {
+        SmallString<128> Filename = GCovFile->getString();
+        sys::path::replace_extension(Filename, NewStem);
+        return Filename.str();
+      }
+    }
+  }
+
+  SmallString<128> Filename = CU.getFilename();
+  sys::path::replace_extension(Filename, NewStem);
+  StringRef FName = sys::path::filename(Filename);
+  SmallString<128> CurPath;
+  if (sys::fs::current_path(CurPath)) return FName;
+  sys::path::append(CurPath, FName.str());
+  return CurPath.str();
+}
+
+bool GCOVProfiler::runOnModule(Module &M) {
+  this->M = &M;
+  Ctx = &M.getContext();
+
+  if (Options.EmitNotes) emitProfileNotes();
+  if (Options.EmitData) return emitProfileArcs();
+  return false;
+}
+
+void GCOVProfiler::emitProfileNotes() {
+  NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+  if (!CU_Nodes) return;
+
+  for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+    // Each compile unit gets its own .gcno file. This means that whether we run
+    // this pass over the original .o's as they're produced, or run it after
+    // LTO, we'll generate the same .gcno files.
+
+    DICompileUnit CU(CU_Nodes->getOperand(i));
+    std::string ErrorInfo;
+    raw_fd_ostream out(mangleName(CU, "gcno").c_str(), ErrorInfo,
+                       raw_fd_ostream::F_Binary);
+    out.write("oncg", 4);
+    out.write(ReversedVersion, 4);
+    out.write("MVLL", 4);
+
+    DIArray SPs = CU.getSubprograms();
+    for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+      DISubprogram SP(SPs.getElement(i));
+      if (!SP.Verify()) continue;
+
+      Function *F = SP.getFunction();
+      if (!F) continue;
+      GCOVFunction Func(SP, &out, i, Options.UseCfgChecksum);
+
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+        GCOVBlock &Block = Func.getBlock(BB);
+        TerminatorInst *TI = BB->getTerminator();
+        if (int successors = TI->getNumSuccessors()) {
+          for (int i = 0; i != successors; ++i) {
+            Block.addEdge(Func.getBlock(TI->getSuccessor(i)));
+          }
+        } else if (isa<ReturnInst>(TI)) {
+          Block.addEdge(Func.getReturnBlock());
+        }
+
+        uint32_t Line = 0;
+        for (BasicBlock::iterator I = BB->begin(), IE = BB->end();
+             I != IE; ++I) {
+          const DebugLoc &Loc = I->getDebugLoc();
+          if (Loc.isUnknown()) continue;
+          if (Line == Loc.getLine()) continue;
+          Line = Loc.getLine();
+          if (SP != getDISubprogram(Loc.getScope(*Ctx))) continue;
+
+          GCOVLines &Lines = Block.getFile(SP.getFilename());
+          Lines.addLine(Loc.getLine());
+        }
+      }
+      Func.writeOut();
+    }
+    out.write("\0\0\0\0\0\0\0\0", 8);  // EOF
+    out.close();
+  }
+}
+
+bool GCOVProfiler::emitProfileArcs() {
+  NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+  if (!CU_Nodes) return false;
+
+  bool Result = false;  
+  bool InsertIndCounterIncrCode = false;
+  for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+    DICompileUnit CU(CU_Nodes->getOperand(i));
+    DIArray SPs = CU.getSubprograms();
+    SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
+    for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+      DISubprogram SP(SPs.getElement(i));
+      if (!SP.Verify()) continue;
+      Function *F = SP.getFunction();
+      if (!F) continue;
+      if (!Result) Result = true;
+      unsigned Edges = 0;
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+        TerminatorInst *TI = BB->getTerminator();
+        if (isa<ReturnInst>(TI))
+          ++Edges;
+        else
+          Edges += TI->getNumSuccessors();
+      }
+      
+      ArrayType *CounterTy =
+        ArrayType::get(Type::getInt64Ty(*Ctx), Edges);
+      GlobalVariable *Counters =
+        new GlobalVariable(*M, CounterTy, false,
+                           GlobalValue::InternalLinkage,
+                           Constant::getNullValue(CounterTy),
+                           "__llvm_gcov_ctr");
+      CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP));
+      
+      UniqueVector<BasicBlock *> ComplexEdgePreds;
+      UniqueVector<BasicBlock *> ComplexEdgeSuccs;
+      
+      unsigned Edge = 0;
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+        TerminatorInst *TI = BB->getTerminator();
+        int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
+        if (Successors) {
+          IRBuilder<> Builder(TI);
+          
+          if (Successors == 1) {
+            Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
+                                                                Edge);
+            Value *Count = Builder.CreateLoad(Counter);
+            Count = Builder.CreateAdd(Count, Builder.getInt64(1));
+            Builder.CreateStore(Count, Counter);
+          } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+            Value *Sel = Builder.CreateSelect(BI->getCondition(),
+                                              Builder.getInt64(Edge),
+                                              Builder.getInt64(Edge + 1));
+            SmallVector<Value *, 2> Idx;
+            Idx.push_back(Builder.getInt64(0));
+            Idx.push_back(Sel);
+            Value *Counter = Builder.CreateInBoundsGEP(Counters, Idx);
+            Value *Count = Builder.CreateLoad(Counter);
+            Count = Builder.CreateAdd(Count, Builder.getInt64(1));
+            Builder.CreateStore(Count, Counter);
+          } else {
+            ComplexEdgePreds.insert(BB);
+            for (int i = 0; i != Successors; ++i)
+              ComplexEdgeSuccs.insert(TI->getSuccessor(i));
+          }
+          Edge += Successors;
+        }
+      }
+      
+      if (!ComplexEdgePreds.empty()) {
+        GlobalVariable *EdgeTable =
+          buildEdgeLookupTable(F, Counters,
+                               ComplexEdgePreds, ComplexEdgeSuccs);
+        GlobalVariable *EdgeState = getEdgeStateValue();
+        
+        for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
+          IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator());
+          Builder.CreateStore(Builder.getInt32(i), EdgeState);
+        }
+        for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
+          // call runtime to perform increment
+          BasicBlock::iterator InsertPt =
+            ComplexEdgeSuccs[i+1]->getFirstInsertionPt();
+          IRBuilder<> Builder(InsertPt);
+          Value *CounterPtrArray =
+            Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,
+                                               i * ComplexEdgePreds.size());
+
+          // Build code to increment the counter.
+          InsertIndCounterIncrCode = true;
+          Builder.CreateCall2(getIncrementIndirectCounterFunc(),
+                              EdgeState, CounterPtrArray);
+        }
+      }
+    }
+
+    Function *WriteoutF = insertCounterWriteout(CountersBySP);
+    Function *FlushF = insertFlush(CountersBySP);
+
+    // Create a small bit of code that registers the "__llvm_gcov_writeout" to
+    // be executed at exit and the "__llvm_gcov_flush" function to be executed
+    // when "__gcov_flush" is called.
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+    Function *F = Function::Create(FTy, GlobalValue::InternalLinkage,
+                                   "__llvm_gcov_init", M);
+    F->setUnnamedAddr(true);
+    F->setLinkage(GlobalValue::InternalLinkage);
+    F->addFnAttr(Attribute::NoInline);
+    if (Options.NoRedZone)
+      F->addFnAttr(Attribute::NoRedZone);
+
+    BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+    IRBuilder<> Builder(BB);
+
+    FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+    Type *Params[] = {
+      PointerType::get(FTy, 0),
+      PointerType::get(FTy, 0)
+    };
+    FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
+
+    // Inialize the environment and register the local writeout and flush
+    // functions.
+    Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
+    Builder.CreateCall2(GCOVInit, WriteoutF, FlushF);
+    Builder.CreateRetVoid();
+
+    appendToGlobalCtors(*M, F, 0);
+  }
+
+  if (InsertIndCounterIncrCode)
+    insertIndirectCounterIncrement();
+
+  return Result;
+}
+
+// All edges with successors that aren't branches are "complex", because it
+// requires complex logic to pick which counter to update.
+GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
+    Function *F,
+    GlobalVariable *Counters,
+    const UniqueVector<BasicBlock *> &Preds,
+    const UniqueVector<BasicBlock *> &Succs) {
+  // TODO: support invoke, threads. We rely on the fact that nothing can modify
+  // the whole-Module pred edge# between the time we set it and the time we next
+  // read it. Threads and invoke make this untrue.
+
+  // emit [(succs * preds) x i64*], logically [succ x [pred x i64*]].
+  size_t TableSize = Succs.size() * Preds.size();
+  Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
+  ArrayType *EdgeTableTy = ArrayType::get(Int64PtrTy, TableSize);
+
+  OwningArrayPtr<Constant *> EdgeTable(new Constant*[TableSize]);
+  Constant *NullValue = Constant::getNullValue(Int64PtrTy);
+  for (size_t i = 0; i != TableSize; ++i)
+    EdgeTable[i] = NullValue;
+
+  unsigned Edge = 0;
+  for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+    TerminatorInst *TI = BB->getTerminator();
+    int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
+    if (Successors > 1 && !isa<BranchInst>(TI) && !isa<ReturnInst>(TI)) {
+      for (int i = 0; i != Successors; ++i) {
+        BasicBlock *Succ = TI->getSuccessor(i);
+        IRBuilder<> Builder(Succ);
+        Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
+                                                            Edge + i);
+        EdgeTable[((Succs.idFor(Succ)-1) * Preds.size()) +
+                  (Preds.idFor(BB)-1)] = cast<Constant>(Counter);
+      }
+    }
+    Edge += Successors;
+  }
+
+  ArrayRef<Constant*> V(&EdgeTable[0], TableSize);
+  GlobalVariable *EdgeTableGV =
+      new GlobalVariable(
+          *M, EdgeTableTy, true, GlobalValue::InternalLinkage,
+          ConstantArray::get(EdgeTableTy, V),
+          "__llvm_gcda_edge_table");
+  EdgeTableGV->setUnnamedAddr(true);
+  return EdgeTableGV;
+}
+
+Constant *GCOVProfiler::getStartFileFunc() {
+  Type *Args[] = {
+    Type::getInt8PtrTy(*Ctx),  // const char *orig_filename
+    Type::getInt8PtrTy(*Ctx),  // const char version[4]
+  };
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+  return M->getOrInsertFunction("llvm_gcda_start_file", FTy);
+}
+
+Constant *GCOVProfiler::getIncrementIndirectCounterFunc() {
+  Type *Int32Ty = Type::getInt32Ty(*Ctx);
+  Type *Int64Ty = Type::getInt64Ty(*Ctx);
+  Type *Args[] = {
+    Int32Ty->getPointerTo(),                // uint32_t *predecessor
+    Int64Ty->getPointerTo()->getPointerTo() // uint64_t **counters
+  };
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+  return M->getOrInsertFunction("__llvm_gcov_indirect_counter_increment", FTy);
+}
+
+Constant *GCOVProfiler::getEmitFunctionFunc() {
+  Type *Args[3] = {
+    Type::getInt32Ty(*Ctx),    // uint32_t ident
+    Type::getInt8PtrTy(*Ctx),  // const char *function_name
+    Type::getInt8Ty(*Ctx),     // uint8_t use_extra_checksum
+  };
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+  return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
+}
+
+Constant *GCOVProfiler::getEmitArcsFunc() {
+  Type *Args[] = {
+    Type::getInt32Ty(*Ctx),     // uint32_t num_counters
+    Type::getInt64PtrTy(*Ctx),  // uint64_t *counters
+  };
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+  return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
+}
+
+Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy);
+}
+
+Constant *GCOVProfiler::getDeleteFlushFunctionListFunc() {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  return M->getOrInsertFunction("llvm_delete_flush_function_list", FTy);
+}
+
+Constant *GCOVProfiler::getEndFileFunc() {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
+}
+
+GlobalVariable *GCOVProfiler::getEdgeStateValue() {
+  GlobalVariable *GV = M->getGlobalVariable("__llvm_gcov_global_state_pred");
+  if (!GV) {
+    GV = new GlobalVariable(*M, Type::getInt32Ty(*Ctx), false,
+                            GlobalValue::InternalLinkage,
+                            ConstantInt::get(Type::getInt32Ty(*Ctx),
+                                             0xffffffff),
+                            "__llvm_gcov_global_state_pred");
+    GV->setUnnamedAddr(true);
+  }
+  return GV;
+}
+
+Function *GCOVProfiler::insertCounterWriteout(
+    ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
+  FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+  if (!WriteoutF)
+    WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
+                                 "__llvm_gcov_writeout", M);
+  WriteoutF->setUnnamedAddr(true);
+  WriteoutF->addFnAttr(Attribute::NoInline);
+  if (Options.NoRedZone)
+    WriteoutF->addFnAttr(Attribute::NoRedZone);
+
+  BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
+  IRBuilder<> Builder(BB);
+
+  Constant *StartFile = getStartFileFunc();
+  Constant *EmitFunction = getEmitFunctionFunc();
+  Constant *EmitArcs = getEmitArcsFunc();
+  Constant *EndFile = getEndFileFunc();
+
+  NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+  if (CU_Nodes) {
+    for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+      DICompileUnit CU(CU_Nodes->getOperand(i));
+      std::string FilenameGcda = mangleName(CU, "gcda");
+      Builder.CreateCall2(StartFile,
+                          Builder.CreateGlobalStringPtr(FilenameGcda),
+                          Builder.CreateGlobalStringPtr(ReversedVersion));
+      for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) {
+        DISubprogram SP(CountersBySP[j].second);
+        Builder.CreateCall3(
+            EmitFunction, Builder.getInt32(j),
+            Options.FunctionNamesInData ?
+              Builder.CreateGlobalStringPtr(getFunctionName(SP)) :
+              Constant::getNullValue(Builder.getInt8PtrTy()),
+            Builder.getInt8(Options.UseCfgChecksum));
+
+        GlobalVariable *GV = CountersBySP[j].first;
+        unsigned Arcs =
+          cast<ArrayType>(GV->getType()->getElementType())->getNumElements();
+        Builder.CreateCall2(EmitArcs,
+                            Builder.getInt32(Arcs),
+                            Builder.CreateConstGEP2_64(GV, 0, 0));
+      }
+      Builder.CreateCall(EndFile);
+    }
+  }
+
+  Builder.CreateRetVoid();
+  return WriteoutF;
+}
+
+void GCOVProfiler::insertIndirectCounterIncrement() {
+  Function *Fn =
+    cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc());
+  Fn->setUnnamedAddr(true);
+  Fn->setLinkage(GlobalValue::InternalLinkage);
+  Fn->addFnAttr(Attribute::NoInline);
+  if (Options.NoRedZone)
+    Fn->addFnAttr(Attribute::NoRedZone);
+
+  // Create basic blocks for function.
+  BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", Fn);
+  IRBuilder<> Builder(BB);
+
+  BasicBlock *PredNotNegOne = BasicBlock::Create(*Ctx, "", Fn);
+  BasicBlock *CounterEnd = BasicBlock::Create(*Ctx, "", Fn);
+  BasicBlock *Exit = BasicBlock::Create(*Ctx, "exit", Fn);
+
+  // uint32_t pred = *predecessor;
+  // if (pred == 0xffffffff) return;
+  Argument *Arg = Fn->arg_begin();
+  Arg->setName("predecessor");
+  Value *Pred = Builder.CreateLoad(Arg, "pred");
+  Value *Cond = Builder.CreateICmpEQ(Pred, Builder.getInt32(0xffffffff));
+  BranchInst::Create(Exit, PredNotNegOne, Cond, BB);
+
+  Builder.SetInsertPoint(PredNotNegOne);
+
+  // uint64_t *counter = counters[pred];
+  // if (!counter) return;
+  Value *ZExtPred = Builder.CreateZExt(Pred, Builder.getInt64Ty());
+  Arg = llvm::next(Fn->arg_begin());
+  Arg->setName("counters");
+  Value *GEP = Builder.CreateGEP(Arg, ZExtPred);
+  Value *Counter = Builder.CreateLoad(GEP, "counter");
+  Cond = Builder.CreateICmpEQ(Counter,
+                              Constant::getNullValue(
+                                  Builder.getInt64Ty()->getPointerTo()));
+  Builder.CreateCondBr(Cond, Exit, CounterEnd);
+
+  // ++*counter;
+  Builder.SetInsertPoint(CounterEnd);
+  Value *Add = Builder.CreateAdd(Builder.CreateLoad(Counter),
+                                 Builder.getInt64(1));
+  Builder.CreateStore(Add, Counter);
+  Builder.CreateBr(Exit);
+
+  // Fill in the exit block.
+  Builder.SetInsertPoint(Exit);
+  Builder.CreateRetVoid();
+}
+
+Function *GCOVProfiler::
+insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  Function *FlushF = M->getFunction("__llvm_gcov_flush");
+  if (!FlushF)
+    FlushF = Function::Create(FTy, GlobalValue::InternalLinkage,
+                              "__llvm_gcov_flush", M);
+  else
+    FlushF->setLinkage(GlobalValue::InternalLinkage);
+  FlushF->setUnnamedAddr(true);
+  FlushF->addFnAttr(Attribute::NoInline);
+  if (Options.NoRedZone)
+    FlushF->addFnAttr(Attribute::NoRedZone);
+
+  BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF);
+
+  // Write out the current counters.
+  Constant *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+  assert(WriteoutF && "Need to create the writeout function first!");
+
+  IRBuilder<> Builder(Entry);
+  Builder.CreateCall(WriteoutF);
+
+  // Zero out the counters.
+  for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator
+         I = CountersBySP.begin(), E = CountersBySP.end();
+       I != E; ++I) {
+    GlobalVariable *GV = I->first;
+    Constant *Null = Constant::getNullValue(GV->getType()->getElementType());
+    Builder.CreateStore(Null, GV);
+  }
+
+  Type *RetTy = FlushF->getReturnType();
+  if (RetTy == Type::getVoidTy(*Ctx))
+    Builder.CreateRetVoid();
+  else if (RetTy->isIntegerTy())
+    // Used if __llvm_gcov_flush was implicitly declared.
+    Builder.CreateRet(ConstantInt::get(RetTy, 0));
+  else
+    report_fatal_error("invalid return type for __llvm_gcov_flush");
+
+  return FlushF;
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
new file mode 100644
index 000000000000..8ba102559bb6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -0,0 +1,38 @@
+//===-- Instrumentation.cpp - TransformUtils Infrastructure ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// Instrumentation library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm-c/Initialization.h"
+
+using namespace llvm;
+
+/// initializeInstrumentation - Initialize all passes in the TransformUtils
+/// library.
+void llvm::initializeInstrumentation(PassRegistry &Registry) {
+  initializeAddressSanitizerPass(Registry);
+  initializeAddressSanitizerModulePass(Registry);
+  initializeBoundsCheckingPass(Registry);
+  initializeEdgeProfilerPass(Registry);
+  initializeGCOVProfilerPass(Registry);
+  initializeOptimalEdgeProfilerPass(Registry);
+  initializePathProfilerPass(Registry);
+  initializeMemorySanitizerPass(Registry);
+  initializeThreadSanitizerPass(Registry);
+}
+
+/// LLVMInitializeInstrumentation - C binding for
+/// initializeInstrumentation.
+void LLVMInitializeInstrumentation(LLVMPassRegistryRef R) {
+  initializeInstrumentation(*unwrap(R));
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
new file mode 100644
index 000000000000..363539b2886f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
@@ -0,0 +1,111 @@
+//===- llvm/Analysis/MaximumSpanningTree.h - Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This module provides means for calculating a maximum spanning tree for a
+// given set of weighted edges. The type parameter T is the type of a node.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
+#define LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
+
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/IR/BasicBlock.h"
+#include <algorithm>
+#include <vector>
+
+namespace llvm {
+
+  /// MaximumSpanningTree - A MST implementation.
+  /// The type parameter T determines the type of the nodes of the graph.
+  template <typename T>
+  class MaximumSpanningTree {
+  public:
+    typedef std::pair<const T*, const T*> Edge;
+    typedef std::pair<Edge, double> EdgeWeight;
+    typedef std::vector<EdgeWeight> EdgeWeights;
+  protected:
+    typedef std::vector<Edge> MaxSpanTree;
+
+    MaxSpanTree MST;
+
+  private:
+    // A comparing class for comparing weighted edges.
+    struct EdgeWeightCompare {
+      static bool getBlockSize(const T *X) {
+        const BasicBlock *BB = dyn_cast_or_null<BasicBlock>(X);
+        return BB ? BB->size() : 0;
+      }
+
+      bool operator()(EdgeWeight X, EdgeWeight Y) const {
+        if (X.second > Y.second) return true;
+        if (X.second < Y.second) return false;
+
+        // Equal edge weights: break ties by comparing block sizes.
+        size_t XSizeA = getBlockSize(X.first.first);
+        size_t YSizeA = getBlockSize(Y.first.first);
+        if (XSizeA > YSizeA) return true;
+        if (XSizeA < YSizeA) return false;
+
+        size_t XSizeB = getBlockSize(X.first.second);
+        size_t YSizeB = getBlockSize(Y.first.second);
+        if (XSizeB > YSizeB) return true;
+        if (XSizeB < YSizeB) return false;
+
+        return false;
+      }
+    };
+
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+
+    /// MaximumSpanningTree() - Takes a vector of weighted edges and returns a
+    /// spanning tree.
+    MaximumSpanningTree(EdgeWeights &EdgeVector) {
+
+      std::stable_sort(EdgeVector.begin(), EdgeVector.end(), EdgeWeightCompare());
+
+      // Create spanning tree, Forest contains a special data structure
+      // that makes checking if two nodes are already in a common (sub-)tree
+      // fast and cheap.
+      EquivalenceClasses<const T*> Forest;
+      for (typename EdgeWeights::iterator EWi = EdgeVector.begin(),
+           EWe = EdgeVector.end(); EWi != EWe; ++EWi) {
+        Edge e = (*EWi).first;
+
+        Forest.insert(e.first);
+        Forest.insert(e.second);
+      }
+
+      // Iterate over the sorted edges, biggest first.
+      for (typename EdgeWeights::iterator EWi = EdgeVector.begin(),
+           EWe = EdgeVector.end(); EWi != EWe; ++EWi) {
+        Edge e = (*EWi).first;
+
+        if (Forest.findLeader(e.first) != Forest.findLeader(e.second)) {
+          Forest.unionSets(e.first, e.second);
+          // So we know now that the edge is not already in a subtree, so we push
+          // the edge to the MST.
+          MST.push_back(e);
+        }
+      }
+    }
+
+    typename MaxSpanTree::iterator begin() {
+      return MST.begin();
+    }
+
+    typename MaxSpanTree::iterator end() {
+      return MST.end();
+    }
+  };
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
new file mode 100644
index 000000000000..4e75904ded4f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -0,0 +1,1985 @@
+//===-- MemorySanitizer.cpp - detector of uninitialized reads -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file is a part of MemorySanitizer, a detector of uninitialized
+/// reads.
+///
+/// Status: early prototype.
+///
+/// The algorithm of the tool is similar to Memcheck
+/// (http://goo.gl/QKbem). We associate a few shadow bits with every
+/// byte of the application memory, poison the shadow of the malloc-ed
+/// or alloca-ed memory, load the shadow bits on every memory read,
+/// propagate the shadow bits through some of the arithmetic
+/// instruction (including MOV), store the shadow bits on every memory
+/// write, report a bug on some other instructions (e.g. JMP) if the
+/// associated shadow is poisoned.
+///
+/// But there are differences too. The first and the major one:
+/// compiler instrumentation instead of binary instrumentation. This
+/// gives us much better register allocation, possible compiler
+/// optimizations and a fast start-up. But this brings the major issue
+/// as well: msan needs to see all program events, including system
+/// calls and reads/writes in system libraries, so we either need to
+/// compile *everything* with msan or use a binary translation
+/// component (e.g. DynamoRIO) to instrument pre-built libraries.
+/// Another difference from Memcheck is that we use 8 shadow bits per
+/// byte of application memory and use a direct shadow mapping. This
+/// greatly simplifies the instrumentation code and avoids races on
+/// shadow updates (Memcheck is single-threaded so races are not a
+/// concern there. Memcheck uses 2 shadow bits per byte with a slow
+/// path storage that uses 8 bits per byte).
+///
+/// The default value of shadow is 0, which means "clean" (not poisoned).
+///
+/// Every module initializer should call __msan_init to ensure that the
+/// shadow memory is ready. On error, __msan_warning is called. Since
+/// parameters and return values may be passed via registers, we have a
+/// specialized thread-local shadow for return values
+/// (__msan_retval_tls) and parameters (__msan_param_tls).
+///
+///                           Origin tracking.
+///
+/// MemorySanitizer can track origins (allocation points) of all uninitialized
+/// values. This behavior is controlled with a flag (msan-track-origins) and is
+/// disabled by default.
+///
+/// Origins are 4-byte values created and interpreted by the runtime library.
+/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
+/// of application memory. Propagation of origins is basically a bunch of
+/// "select" instructions that pick the origin of a dirty argument, if an
+/// instruction has one.
+///
+/// Every 4 aligned, consecutive bytes of application memory have one origin
+/// value associated with them. If these bytes contain uninitialized data
+/// coming from 2 different allocations, the last store wins. Because of this,
+/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
+/// practice.
+///
+/// Origins are meaningless for fully initialized values, so MemorySanitizer
+/// avoids storing origin to memory when a fully initialized value is stored.
+/// This way it avoids needless overwritting origin of the 4-byte region on
+/// a short (i.e. 1 byte) clean store, and it is also good for performance.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msan"
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BlackList.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+static const uint64_t kShadowMask32 = 1ULL << 31;
+static const uint64_t kShadowMask64 = 1ULL << 46;
+static const uint64_t kOriginOffset32 = 1ULL << 30;
+static const uint64_t kOriginOffset64 = 1ULL << 45;
+static const unsigned kMinOriginAlignment = 4;
+static const unsigned kShadowTLSAlignment = 8;
+
+/// \brief Track origins of uninitialized values.
+///
+/// Adds a section to MemorySanitizer report that points to the allocation
+/// (stack or heap) the uninitialized bits came from originally.
+static cl::opt<bool> ClTrackOrigins("msan-track-origins",
+       cl::desc("Track origins (allocation sites) of poisoned memory"),
+       cl::Hidden, cl::init(false));
+static cl::opt<bool> ClKeepGoing("msan-keep-going",
+       cl::desc("keep going after reporting a UMR"),
+       cl::Hidden, cl::init(false));
+static cl::opt<bool> ClPoisonStack("msan-poison-stack",
+       cl::desc("poison uninitialized stack variables"),
+       cl::Hidden, cl::init(true));
+static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
+       cl::desc("poison uninitialized stack variables with a call"),
+       cl::Hidden, cl::init(false));
+static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
+       cl::desc("poison uninitialized stack variables with the given patter"),
+       cl::Hidden, cl::init(0xff));
+static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
+       cl::desc("poison undef temps"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
+       cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
+       cl::desc("exact handling of relational integer ICmp"),
+       cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClStoreCleanOrigin("msan-store-clean-origin",
+       cl::desc("store origin for clean (fully initialized) values"),
+       cl::Hidden, cl::init(false));
+
+// This flag controls whether we check the shadow of the address
+// operand of load or store. Such bugs are very rare, since load from
+// a garbage address typically results in SEGV, but still happen
+// (e.g. only lower bits of address are garbage, or the access happens
+// early at program startup where malloc-ed memory is more likely to
+// be zeroed. As of 2012-08-28 this flag adds 20% slowdown.
+static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
+       cl::desc("report accesses through a pointer which has poisoned shadow"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
+       cl::desc("print out instructions with default strict semantics"),
+       cl::Hidden, cl::init(false));
+
+static cl::opt<std::string>  ClBlacklistFile("msan-blacklist",
+       cl::desc("File containing the list of functions where MemorySanitizer "
+                "should not report bugs"), cl::Hidden);
+
+namespace {
+
+/// \brief An instrumentation pass implementing detection of uninitialized
+/// reads.
+///
+/// MemorySanitizer: instrument the code in module to find
+/// uninitialized reads.
+class MemorySanitizer : public FunctionPass {
+ public:
+  MemorySanitizer(bool TrackOrigins = false,
+                  StringRef BlacklistFile = StringRef())
+    : FunctionPass(ID),
+      TrackOrigins(TrackOrigins || ClTrackOrigins),
+      TD(0),
+      WarningFn(0),
+      BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+                                          : BlacklistFile) { }
+  const char *getPassName() const { return "MemorySanitizer"; }
+  bool runOnFunction(Function &F);
+  bool doInitialization(Module &M);
+  static char ID;  // Pass identification, replacement for typeid.
+
+ private:
+  void initializeCallbacks(Module &M);
+
+  /// \brief Track origins (allocation points) of uninitialized values.
+  bool TrackOrigins;
+
+  DataLayout *TD;
+  LLVMContext *C;
+  Type *IntptrTy;
+  Type *OriginTy;
+  /// \brief Thread-local shadow storage for function parameters.
+  GlobalVariable *ParamTLS;
+  /// \brief Thread-local origin storage for function parameters.
+  GlobalVariable *ParamOriginTLS;
+  /// \brief Thread-local shadow storage for function return value.
+  GlobalVariable *RetvalTLS;
+  /// \brief Thread-local origin storage for function return value.
+  GlobalVariable *RetvalOriginTLS;
+  /// \brief Thread-local shadow storage for in-register va_arg function
+  /// parameters (x86_64-specific).
+  GlobalVariable *VAArgTLS;
+  /// \brief Thread-local shadow storage for va_arg overflow area
+  /// (x86_64-specific).
+  GlobalVariable *VAArgOverflowSizeTLS;
+  /// \brief Thread-local space used to pass origin value to the UMR reporting
+  /// function.
+  GlobalVariable *OriginTLS;
+
+  /// \brief The run-time callback to print a warning.
+  Value *WarningFn;
+  /// \brief Run-time helper that copies origin info for a memory range.
+  Value *MsanCopyOriginFn;
+  /// \brief Run-time helper that generates a new origin value for a stack
+  /// allocation.
+  Value *MsanSetAllocaOriginFn;
+  /// \brief Run-time helper that poisons stack on function entry.
+  Value *MsanPoisonStackFn;
+  /// \brief MSan runtime replacements for memmove, memcpy and memset.
+  Value *MemmoveFn, *MemcpyFn, *MemsetFn;
+
+  /// \brief Address mask used in application-to-shadow address calculation.
+  /// ShadowAddr is computed as ApplicationAddr & ~ShadowMask.
+  uint64_t ShadowMask;
+  /// \brief Offset of the origin shadow from the "normal" shadow.
+  /// OriginAddr is computed as (ShadowAddr + OriginOffset) & ~3ULL
+  uint64_t OriginOffset;
+  /// \brief Branch weights for error reporting.
+  MDNode *ColdCallWeights;
+  /// \brief Branch weights for origin store.
+  MDNode *OriginStoreWeights;
+  /// \bried Path to blacklist file.
+  SmallString<64> BlacklistFile;
+  /// \brief The blacklist.
+  OwningPtr<BlackList> BL;
+  /// \brief An empty volatile inline asm that prevents callback merge.
+  InlineAsm *EmptyAsm;
+
+  friend struct MemorySanitizerVisitor;
+  friend struct VarArgAMD64Helper;
+};
+}  // namespace
+
+char MemorySanitizer::ID = 0;
+INITIALIZE_PASS(MemorySanitizer, "msan",
+                "MemorySanitizer: detects uninitialized reads.",
+                false, false)
+
+FunctionPass *llvm::createMemorySanitizerPass(bool TrackOrigins,
+                                              StringRef BlacklistFile) {
+  return new MemorySanitizer(TrackOrigins, BlacklistFile);
+}
+
+/// \brief Create a non-const global initialized with the given string.
+///
+/// Creates a writable global for Str so that we can pass it to the
+/// run-time lib. Runtime uses first 4 bytes of the string to store the
+/// frame ID, so the string needs to be mutable.
+static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
+                                                            StringRef Str) {
+  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+  return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false,
+                            GlobalValue::PrivateLinkage, StrConst, "");
+}
+
+
+/// \brief Insert extern declaration of runtime-provided functions and globals.
+void MemorySanitizer::initializeCallbacks(Module &M) {
+  // Only do this once.
+  if (WarningFn)
+    return;
+
+  IRBuilder<> IRB(*C);
+  // Create the callback.
+  // FIXME: this function should have "Cold" calling conv,
+  // which is not yet implemented.
+  StringRef WarningFnName = ClKeepGoing ? "__msan_warning"
+                                        : "__msan_warning_noreturn";
+  WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), NULL);
+
+  MsanCopyOriginFn = M.getOrInsertFunction(
+    "__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(),
+    IRB.getInt8PtrTy(), IntptrTy, NULL);
+  MsanSetAllocaOriginFn = M.getOrInsertFunction(
+    "__msan_set_alloca_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
+    IRB.getInt8PtrTy(), NULL);
+  MsanPoisonStackFn = M.getOrInsertFunction(
+    "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL);
+  MemmoveFn = M.getOrInsertFunction(
+    "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+    IRB.getInt8PtrTy(), IntptrTy, NULL);
+  MemcpyFn = M.getOrInsertFunction(
+    "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+    IntptrTy, NULL);
+  MemsetFn = M.getOrInsertFunction(
+    "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
+    IntptrTy, NULL);
+
+  // Create globals.
+  RetvalTLS = new GlobalVariable(
+    M, ArrayType::get(IRB.getInt64Ty(), 8), false,
+    GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0,
+    GlobalVariable::GeneralDynamicTLSModel);
+  RetvalOriginTLS = new GlobalVariable(
+    M, OriginTy, false, GlobalVariable::ExternalLinkage, 0,
+    "__msan_retval_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+
+  ParamTLS = new GlobalVariable(
+    M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
+    GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0,
+    GlobalVariable::GeneralDynamicTLSModel);
+  ParamOriginTLS = new GlobalVariable(
+    M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage,
+    0, "__msan_param_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+
+  VAArgTLS = new GlobalVariable(
+    M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
+    GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0,
+    GlobalVariable::GeneralDynamicTLSModel);
+  VAArgOverflowSizeTLS = new GlobalVariable(
+    M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0,
+    "__msan_va_arg_overflow_size_tls", 0,
+    GlobalVariable::GeneralDynamicTLSModel);
+  OriginTLS = new GlobalVariable(
+    M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0,
+    "__msan_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+
+  // We insert an empty inline asm after __msan_report* to avoid callback merge.
+  EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+                            StringRef(""), StringRef(""),
+                            /*hasSideEffects=*/true);
+}
+
+/// \brief Module-level initialization.
+///
+/// inserts a call to __msan_init to the module's constructor list.
+bool MemorySanitizer::doInitialization(Module &M) {
+  TD = getAnalysisIfAvailable<DataLayout>();
+  if (!TD)
+    return false;
+  BL.reset(new BlackList(BlacklistFile));
+  C = &(M.getContext());
+  unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0);
+  switch (PtrSize) {
+    case 64:
+      ShadowMask = kShadowMask64;
+      OriginOffset = kOriginOffset64;
+      break;
+    case 32:
+      ShadowMask = kShadowMask32;
+      OriginOffset = kOriginOffset32;
+      break;
+    default:
+      report_fatal_error("unsupported pointer size");
+      break;
+  }
+
+  IRBuilder<> IRB(*C);
+  IntptrTy = IRB.getIntPtrTy(TD);
+  OriginTy = IRB.getInt32Ty();
+
+  ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+  OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+
+  // Insert a call to __msan_init/__msan_track_origins into the module's CTORs.
+  appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction(
+                      "__msan_init", IRB.getVoidTy(), NULL)), 0);
+
+  new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+                     IRB.getInt32(TrackOrigins), "__msan_track_origins");
+
+  new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+                     IRB.getInt32(ClKeepGoing), "__msan_keep_going");
+
+  return true;
+}
+
+namespace {
+
+/// \brief A helper class that handles instrumentation of VarArg
+/// functions on a particular platform.
+///
+/// Implementations are expected to insert the instrumentation
+/// necessary to propagate argument shadow through VarArg function
+/// calls. Visit* methods are called during an InstVisitor pass over
+/// the function, and should avoid creating new basic blocks. A new
+/// instance of this class is created for each instrumented function.
+struct VarArgHelper {
+  /// \brief Visit a CallSite.
+  virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0;
+
+  /// \brief Visit a va_start call.
+  virtual void visitVAStartInst(VAStartInst &I) = 0;
+
+  /// \brief Visit a va_copy call.
+  virtual void visitVACopyInst(VACopyInst &I) = 0;
+
+  /// \brief Finalize function instrumentation.
+  ///
+  /// This method is called after visiting all interesting (see above)
+  /// instructions in a function.
+  virtual void finalizeInstrumentation() = 0;
+
+  virtual ~VarArgHelper() {}
+};
+
+struct MemorySanitizerVisitor;
+
+VarArgHelper*
+CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+                   MemorySanitizerVisitor &Visitor);
+
+/// This class does all the work for a given function. Store and Load
+/// instructions store and load corresponding shadow and origin
+/// values. Most instructions propagate shadow from arguments to their
+/// return values. Certain instructions (most importantly, BranchInst)
+/// test their argument shadow and print reports (with a runtime call) if it's
+/// non-zero.
+struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
+  Function &F;
+  MemorySanitizer &MS;
+  SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
+  ValueMap<Value*, Value*> ShadowMap, OriginMap;
+  bool InsertChecks;
+  bool LoadShadow;
+  OwningPtr<VarArgHelper> VAHelper;
+
+  struct ShadowOriginAndInsertPoint {
+    Instruction *Shadow;
+    Instruction *Origin;
+    Instruction *OrigIns;
+    ShadowOriginAndInsertPoint(Instruction *S, Instruction *O, Instruction *I)
+      : Shadow(S), Origin(O), OrigIns(I) { }
+    ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { }
+  };
+  SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
+  SmallVector<Instruction*, 16> StoreList;
+
+  MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
+      : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
+    LoadShadow = InsertChecks =
+        !MS.BL->isIn(F) &&
+        F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::SanitizeMemory);
+
+    DEBUG(if (!InsertChecks)
+          dbgs() << "MemorySanitizer is not inserting checks into '"
+                 << F.getName() << "'\n");
+  }
+
+  void materializeStores() {
+    for (size_t i = 0, n = StoreList.size(); i < n; i++) {
+      StoreInst& I = *dyn_cast<StoreInst>(StoreList[i]);
+
+      IRBuilder<> IRB(&I);
+      Value *Val = I.getValueOperand();
+      Value *Addr = I.getPointerOperand();
+      Value *Shadow = getShadow(Val);
+      Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
+
+      StoreInst *NewSI =
+        IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment());
+      DEBUG(dbgs() << "  STORE: " << *NewSI << "\n");
+      (void)NewSI;
+
+      if (ClCheckAccessAddress)
+        insertCheck(Addr, &I);
+
+      if (MS.TrackOrigins) {
+        unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
+        if (ClStoreCleanOrigin || isa<StructType>(Shadow->getType())) {
+          IRB.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRB),
+                                 Alignment);
+        } else {
+          Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
+
+          Constant *Cst = dyn_cast_or_null<Constant>(ConvertedShadow);
+          // TODO(eugenis): handle non-zero constant shadow by inserting an
+          // unconditional check (can not simply fail compilation as this could
+          // be in the dead code).
+          if (Cst)
+            continue;
+
+          Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
+              getCleanShadow(ConvertedShadow), "_mscmp");
+          Instruction *CheckTerm =
+            SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false,
+                                      MS.OriginStoreWeights);
+          IRBuilder<> IRBNew(CheckTerm);
+          IRBNew.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRBNew),
+                                    Alignment);
+        }
+      }
+    }
+  }
+
+  void materializeChecks() {
+    for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) {
+      Instruction *Shadow = InstrumentationList[i].Shadow;
+      Instruction *OrigIns = InstrumentationList[i].OrigIns;
+      IRBuilder<> IRB(OrigIns);
+      DEBUG(dbgs() << "  SHAD0 : " << *Shadow << "\n");
+      Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
+      DEBUG(dbgs() << "  SHAD1 : " << *ConvertedShadow << "\n");
+      Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
+                                    getCleanShadow(ConvertedShadow), "_mscmp");
+      Instruction *CheckTerm =
+        SplitBlockAndInsertIfThen(cast<Instruction>(Cmp),
+                                  /* Unreachable */ !ClKeepGoing,
+                                  MS.ColdCallWeights);
+
+      IRB.SetInsertPoint(CheckTerm);
+      if (MS.TrackOrigins) {
+        Instruction *Origin = InstrumentationList[i].Origin;
+        IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0),
+                        MS.OriginTLS);
+      }
+      CallInst *Call = IRB.CreateCall(MS.WarningFn);
+      Call->setDebugLoc(OrigIns->getDebugLoc());
+      IRB.CreateCall(MS.EmptyAsm);
+      DEBUG(dbgs() << "  CHECK: " << *Cmp << "\n");
+    }
+    DEBUG(dbgs() << "DONE:\n" << F);
+  }
+
+  /// \brief Add MemorySanitizer instrumentation to a function.
+  bool runOnFunction() {
+    MS.initializeCallbacks(*F.getParent());
+    if (!MS.TD) return false;
+
+    // In the presence of unreachable blocks, we may see Phi nodes with
+    // incoming nodes from such blocks. Since InstVisitor skips unreachable
+    // blocks, such nodes will not have any shadow value associated with them.
+    // It's easier to remove unreachable blocks than deal with missing shadow.
+    removeUnreachableBlocks(F);
+
+    // Iterate all BBs in depth-first order and create shadow instructions
+    // for all instructions (where applicable).
+    // For PHI nodes we create dummy shadow PHIs which will be finalized later.
+    for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+         DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
+      BasicBlock *BB = *DI;
+      visit(*BB);
+    }
+
+    // Finalize PHI nodes.
+    for (size_t i = 0, n = ShadowPHINodes.size(); i < n; i++) {
+      PHINode *PN = ShadowPHINodes[i];
+      PHINode *PNS = cast<PHINode>(getShadow(PN));
+      PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : 0;
+      size_t NumValues = PN->getNumIncomingValues();
+      for (size_t v = 0; v < NumValues; v++) {
+        PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v));
+        if (PNO)
+          PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
+      }
+    }
+
+    VAHelper->finalizeInstrumentation();
+
+    // Delayed instrumentation of StoreInst.
+    // This may add new checks to be inserted later.
+    materializeStores();
+
+    // Insert shadow value checks.
+    materializeChecks();
+
+    return true;
+  }
+
+  /// \brief Compute the shadow type that corresponds to a given Value.
+  Type *getShadowTy(Value *V) {
+    return getShadowTy(V->getType());
+  }
+
+  /// \brief Compute the shadow type that corresponds to a given Type.
+  Type *getShadowTy(Type *OrigTy) {
+    if (!OrigTy->isSized()) {
+      return 0;
+    }
+    // For integer type, shadow is the same as the original type.
+    // This may return weird-sized types like i1.
+    if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
+      return IT;
+    if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
+      uint32_t EltSize = MS.TD->getTypeSizeInBits(VT->getElementType());
+      return VectorType::get(IntegerType::get(*MS.C, EltSize),
+                             VT->getNumElements());
+    }
+    if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
+      SmallVector<Type*, 4> Elements;
+      for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+        Elements.push_back(getShadowTy(ST->getElementType(i)));
+      StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
+      DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+      return Res;
+    }
+    uint32_t TypeSize = MS.TD->getTypeSizeInBits(OrigTy);
+    return IntegerType::get(*MS.C, TypeSize);
+  }
+
+  /// \brief Flatten a vector type.
+  Type *getShadowTyNoVec(Type *ty) {
+    if (VectorType *vt = dyn_cast<VectorType>(ty))
+      return IntegerType::get(*MS.C, vt->getBitWidth());
+    return ty;
+  }
+
+  /// \brief Convert a shadow value to it's flattened variant.
+  Value *convertToShadowTyNoVec(Value *V, IRBuilder<> &IRB) {
+    Type *Ty = V->getType();
+    Type *NoVecTy = getShadowTyNoVec(Ty);
+    if (Ty == NoVecTy) return V;
+    return IRB.CreateBitCast(V, NoVecTy);
+  }
+
+  /// \brief Compute the shadow address that corresponds to a given application
+  /// address.
+  ///
+  /// Shadow = Addr & ~ShadowMask.
+  Value *getShadowPtr(Value *Addr, Type *ShadowTy,
+                      IRBuilder<> &IRB) {
+    Value *ShadowLong =
+      IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
+                    ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask));
+    return IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
+  }
+
+  /// \brief Compute the origin address that corresponds to a given application
+  /// address.
+  ///
+  /// OriginAddr = (ShadowAddr + OriginOffset) & ~3ULL
+  Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB) {
+    Value *ShadowLong =
+      IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
+                    ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask));
+    Value *Add =
+      IRB.CreateAdd(ShadowLong,
+                    ConstantInt::get(MS.IntptrTy, MS.OriginOffset));
+    Value *SecondAnd =
+      IRB.CreateAnd(Add, ConstantInt::get(MS.IntptrTy, ~3ULL));
+    return IRB.CreateIntToPtr(SecondAnd, PointerType::get(IRB.getInt32Ty(), 0));
+  }
+
+  /// \brief Compute the shadow address for a given function argument.
+  ///
+  /// Shadow = ParamTLS+ArgOffset.
+  Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
+                                 int ArgOffset) {
+    Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+                              "_msarg");
+  }
+
+  /// \brief Compute the origin address for a given function argument.
+  Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
+                                 int ArgOffset) {
+    if (!MS.TrackOrigins) return 0;
+    Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+                              "_msarg_o");
+  }
+
+  /// \brief Compute the shadow address for a retval.
+  Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
+    Value *Base = IRB.CreatePointerCast(MS.RetvalTLS, MS.IntptrTy);
+    return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+                              "_msret");
+  }
+
+  /// \brief Compute the origin address for a retval.
+  Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
+    // We keep a single origin for the entire retval. Might be too optimistic.
+    return MS.RetvalOriginTLS;
+  }
+
+  /// \brief Set SV to be the shadow value for V.
+  void setShadow(Value *V, Value *SV) {
+    assert(!ShadowMap.count(V) && "Values may only have one shadow");
+    ShadowMap[V] = SV;
+  }
+
+  /// \brief Set Origin to be the origin value for V.
+  void setOrigin(Value *V, Value *Origin) {
+    if (!MS.TrackOrigins) return;
+    assert(!OriginMap.count(V) && "Values may only have one origin");
+    DEBUG(dbgs() << "ORIGIN: " << *V << "  ==> " << *Origin << "\n");
+    OriginMap[V] = Origin;
+  }
+
+  /// \brief Create a clean shadow value for a given value.
+  ///
+  /// Clean shadow (all zeroes) means all bits of the value are defined
+  /// (initialized).
+  Constant *getCleanShadow(Value *V) {
+    Type *ShadowTy = getShadowTy(V);
+    if (!ShadowTy)
+      return 0;
+    return Constant::getNullValue(ShadowTy);
+  }
+
+  /// \brief Create a dirty shadow of a given shadow type.
+  Constant *getPoisonedShadow(Type *ShadowTy) {
+    assert(ShadowTy);
+    if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
+      return Constant::getAllOnesValue(ShadowTy);
+    StructType *ST = cast<StructType>(ShadowTy);
+    SmallVector<Constant *, 4> Vals;
+    for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+      Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
+    return ConstantStruct::get(ST, Vals);
+  }
+
+  /// \brief Create a dirty shadow for a given value.
+  Constant *getPoisonedShadow(Value *V) {
+    Type *ShadowTy = getShadowTy(V);
+    if (!ShadowTy)
+      return 0;
+    return getPoisonedShadow(ShadowTy);
+  }
+
+  /// \brief Create a clean (zero) origin.
+  Value *getCleanOrigin() {
+    return Constant::getNullValue(MS.OriginTy);
+  }
+
+  /// \brief Get the shadow value for a given Value.
+  ///
+  /// This function either returns the value set earlier with setShadow,
+  /// or extracts if from ParamTLS (for function arguments).
+  Value *getShadow(Value *V) {
+    if (Instruction *I = dyn_cast<Instruction>(V)) {
+      // For instructions the shadow is already stored in the map.
+      Value *Shadow = ShadowMap[V];
+      if (!Shadow) {
+        DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
+        (void)I;
+        assert(Shadow && "No shadow for a value");
+      }
+      return Shadow;
+    }
+    if (UndefValue *U = dyn_cast<UndefValue>(V)) {
+      Value *AllOnes = ClPoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
+      DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
+      (void)U;
+      return AllOnes;
+    }
+    if (Argument *A = dyn_cast<Argument>(V)) {
+      // For arguments we compute the shadow on demand and store it in the map.
+      Value **ShadowPtr = &ShadowMap[V];
+      if (*ShadowPtr)
+        return *ShadowPtr;
+      Function *F = A->getParent();
+      IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI());
+      unsigned ArgOffset = 0;
+      for (Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+           AI != AE; ++AI) {
+        if (!AI->getType()->isSized()) {
+          DEBUG(dbgs() << "Arg is not sized\n");
+          continue;
+        }
+        unsigned Size = AI->hasByValAttr()
+          ? MS.TD->getTypeAllocSize(AI->getType()->getPointerElementType())
+          : MS.TD->getTypeAllocSize(AI->getType());
+        if (A == AI) {
+          Value *Base = getShadowPtrForArgument(AI, EntryIRB, ArgOffset);
+          if (AI->hasByValAttr()) {
+            // ByVal pointer itself has clean shadow. We copy the actual
+            // argument shadow to the underlying memory.
+            Value *Cpy = EntryIRB.CreateMemCpy(
+              getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB),
+              Base, Size, AI->getParamAlignment());
+            DEBUG(dbgs() << "  ByValCpy: " << *Cpy << "\n");
+            (void)Cpy;
+            *ShadowPtr = getCleanShadow(V);
+          } else {
+            *ShadowPtr = EntryIRB.CreateLoad(Base);
+          }
+          DEBUG(dbgs() << "  ARG:    "  << *AI << " ==> " <<
+                **ShadowPtr << "\n");
+          if (MS.TrackOrigins) {
+            Value* OriginPtr = getOriginPtrForArgument(AI, EntryIRB, ArgOffset);
+            setOrigin(A, EntryIRB.CreateLoad(OriginPtr));
+          }
+        }
+        ArgOffset += DataLayout::RoundUpAlignment(Size, 8);
+      }
+      assert(*ShadowPtr && "Could not find shadow for an argument");
+      return *ShadowPtr;
+    }
+    // For everything else the shadow is zero.
+    return getCleanShadow(V);
+  }
+
+  /// \brief Get the shadow for i-th argument of the instruction I.
+  Value *getShadow(Instruction *I, int i) {
+    return getShadow(I->getOperand(i));
+  }
+
+  /// \brief Get the origin for a value.
+  Value *getOrigin(Value *V) {
+    if (!MS.TrackOrigins) return 0;
+    if (isa<Instruction>(V) || isa<Argument>(V)) {
+      Value *Origin = OriginMap[V];
+      if (!Origin) {
+        DEBUG(dbgs() << "NO ORIGIN: " << *V << "\n");
+        Origin = getCleanOrigin();
+      }
+      return Origin;
+    }
+    return getCleanOrigin();
+  }
+
+  /// \brief Get the origin for i-th argument of the instruction I.
+  Value *getOrigin(Instruction *I, int i) {
+    return getOrigin(I->getOperand(i));
+  }
+
+  /// \brief Remember the place where a shadow check should be inserted.
+  ///
+  /// This location will be later instrumented with a check that will print a
+  /// UMR warning in runtime if the value is not fully defined.
+  void insertCheck(Value *Val, Instruction *OrigIns) {
+    assert(Val);
+    if (!InsertChecks) return;
+    Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+    if (!Shadow) return;
+#ifndef NDEBUG
+    Type *ShadowTy = Shadow->getType();
+    assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) &&
+           "Can only insert checks for integer and vector shadow types");
+#endif
+    Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+    InstrumentationList.push_back(
+      ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+  }
+
+  // ------------------- Visitors.
+
+  /// \brief Instrument LoadInst
+  ///
+  /// Loads the corresponding shadow and (optionally) origin.
+  /// Optionally, checks that the load address is fully defined.
+  void visitLoadInst(LoadInst &I) {
+    assert(I.getType()->isSized() && "Load type must have size");
+    IRBuilder<> IRB(&I);
+    Type *ShadowTy = getShadowTy(&I);
+    Value *Addr = I.getPointerOperand();
+    if (LoadShadow) {
+      Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
+      setShadow(&I,
+                IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld"));
+    } else {
+      setShadow(&I, getCleanShadow(&I));
+    }
+
+    if (ClCheckAccessAddress)
+      insertCheck(I.getPointerOperand(), &I);
+
+    if (MS.TrackOrigins) {
+      if (LoadShadow) {
+        unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
+        setOrigin(&I,
+                  IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), Alignment));
+      } else {
+        setOrigin(&I, getCleanOrigin());
+      }
+    }
+  }
+
+  /// \brief Instrument StoreInst
+  ///
+  /// Stores the corresponding shadow and (optionally) origin.
+  /// Optionally, checks that the store address is fully defined.
+  void visitStoreInst(StoreInst &I) {
+    StoreList.push_back(&I);
+  }
+
+  // Vector manipulation.
+  void visitExtractElementInst(ExtractElementInst &I) {
+    insertCheck(I.getOperand(1), &I);
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
+              "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitInsertElementInst(InsertElementInst &I) {
+    insertCheck(I.getOperand(2), &I);
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
+              I.getOperand(2), "_msprop"));
+    setOriginForNaryOp(I);
+  }
+
+  void visitShuffleVectorInst(ShuffleVectorInst &I) {
+    insertCheck(I.getOperand(2), &I);
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
+              I.getOperand(2), "_msprop"));
+    setOriginForNaryOp(I);
+  }
+
+  // Casts.
+  void visitSExtInst(SExtInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitZExtInst(ZExtInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitTruncInst(TruncInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitBitCastInst(BitCastInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitPtrToIntInst(PtrToIntInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+             "_msprop_ptrtoint"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitIntToPtrInst(IntToPtrInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+             "_msprop_inttoptr"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitFPToSIInst(CastInst& I) { handleShadowOr(I); }
+  void visitFPToUIInst(CastInst& I) { handleShadowOr(I); }
+  void visitSIToFPInst(CastInst& I) { handleShadowOr(I); }
+  void visitUIToFPInst(CastInst& I) { handleShadowOr(I); }
+  void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
+  void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
+
+  /// \brief Propagate shadow for bitwise AND.
+  ///
+  /// This code is exact, i.e. if, for example, a bit in the left argument
+  /// is defined and 0, then neither the value not definedness of the
+  /// corresponding bit in B don't affect the resulting shadow.
+  void visitAnd(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    //  "And" of 0 and a poisoned value results in unpoisoned value.
+    //  1&1 => 1;     0&1 => 0;     p&1 => p;
+    //  1&0 => 0;     0&0 => 0;     p&0 => 0;
+    //  1&p => p;     0&p => 0;     p&p => p;
+    //  S = (S1 & S2) | (V1 & S2) | (S1 & V2)
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *V1 = I.getOperand(0);
+    Value *V2 = I.getOperand(1);
+    if (V1->getType() != S1->getType()) {
+      V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+      V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+    }
+    Value *S1S2 = IRB.CreateAnd(S1, S2);
+    Value *V1S2 = IRB.CreateAnd(V1, S2);
+    Value *S1V2 = IRB.CreateAnd(S1, V2);
+    setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+    setOriginForNaryOp(I);
+  }
+
+  void visitOr(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    //  "Or" of 1 and a poisoned value results in unpoisoned value.
+    //  1|1 => 1;     0|1 => 1;     p|1 => 1;
+    //  1|0 => 1;     0|0 => 0;     p|0 => p;
+    //  1|p => 1;     0|p => p;     p|p => p;
+    //  S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *V1 = IRB.CreateNot(I.getOperand(0));
+    Value *V2 = IRB.CreateNot(I.getOperand(1));
+    if (V1->getType() != S1->getType()) {
+      V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+      V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+    }
+    Value *S1S2 = IRB.CreateAnd(S1, S2);
+    Value *V1S2 = IRB.CreateAnd(V1, S2);
+    Value *S1V2 = IRB.CreateAnd(S1, V2);
+    setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+    setOriginForNaryOp(I);
+  }
+
+  /// \brief Default propagation of shadow and/or origin.
+  ///
+  /// This class implements the general case of shadow propagation, used in all
+  /// cases where we don't know and/or don't care about what the operation
+  /// actually does. It converts all input shadow values to a common type
+  /// (extending or truncating as necessary), and bitwise OR's them.
+  ///
+  /// This is much cheaper than inserting checks (i.e. requiring inputs to be
+  /// fully initialized), and less prone to false positives.
+  ///
+  /// This class also implements the general case of origin propagation. For a
+  /// Nary operation, result origin is set to the origin of an argument that is
+  /// not entirely initialized. If there is more than one such arguments, the
+  /// rightmost of them is picked. It does not matter which one is picked if all
+  /// arguments are initialized.
+  template <bool CombineShadow>
+  class Combiner {
+    Value *Shadow;
+    Value *Origin;
+    IRBuilder<> &IRB;
+    MemorySanitizerVisitor *MSV;
+
+  public:
+    Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB) :
+      Shadow(0), Origin(0), IRB(IRB), MSV(MSV) {}
+
+    /// \brief Add a pair of shadow and origin values to the mix.
+    Combiner &Add(Value *OpShadow, Value *OpOrigin) {
+      if (CombineShadow) {
+        assert(OpShadow);
+        if (!Shadow)
+          Shadow = OpShadow;
+        else {
+          OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType());
+          Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop");
+        }
+      }
+
+      if (MSV->MS.TrackOrigins) {
+        assert(OpOrigin);
+        if (!Origin) {
+          Origin = OpOrigin;
+        } else {
+          Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB);
+          Value *Cond = IRB.CreateICmpNE(FlatShadow,
+                                         MSV->getCleanShadow(FlatShadow));
+          Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
+        }
+      }
+      return *this;
+    }
+
+    /// \brief Add an application value to the mix.
+    Combiner &Add(Value *V) {
+      Value *OpShadow = MSV->getShadow(V);
+      Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : 0;
+      return Add(OpShadow, OpOrigin);
+    }
+
+    /// \brief Set the current combined values as the given instruction's shadow
+    /// and origin.
+    void Done(Instruction *I) {
+      if (CombineShadow) {
+        assert(Shadow);
+        Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I));
+        MSV->setShadow(I, Shadow);
+      }
+      if (MSV->MS.TrackOrigins) {
+        assert(Origin);
+        MSV->setOrigin(I, Origin);
+      }
+    }
+  };
+
+  typedef Combiner<true> ShadowAndOriginCombiner;
+  typedef Combiner<false> OriginCombiner;
+
+  /// \brief Propagate origin for arbitrary operation.
+  void setOriginForNaryOp(Instruction &I) {
+    if (!MS.TrackOrigins) return;
+    IRBuilder<> IRB(&I);
+    OriginCombiner OC(this, IRB);
+    for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+      OC.Add(OI->get());
+    OC.Done(&I);
+  }
+
+  size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
+    assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
+           "Vector of pointers is not a valid shadow type");
+    return Ty->isVectorTy() ?
+      Ty->getVectorNumElements() * Ty->getScalarSizeInBits() :
+      Ty->getPrimitiveSizeInBits();
+  }
+
+  /// \brief Cast between two shadow types, extending or truncating as
+  /// necessary.
+  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy) {
+    Type *srcTy = V->getType();
+    if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
+      return IRB.CreateIntCast(V, dstTy, false);
+    if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
+        dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
+      return IRB.CreateIntCast(V, dstTy, false);
+    size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+    size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+    Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
+    Value *V2 =
+      IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), false);
+    return IRB.CreateBitCast(V2, dstTy);
+    // TODO: handle struct types.
+  }
+
+  /// \brief Propagate shadow for arbitrary operation.
+  void handleShadowOr(Instruction &I) {
+    IRBuilder<> IRB(&I);
+    ShadowAndOriginCombiner SC(this, IRB);
+    for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+      SC.Add(OI->get());
+    SC.Done(&I);
+  }
+
+  void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
+  void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
+  void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
+  void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
+  void visitSub(BinaryOperator &I) { handleShadowOr(I); }
+  void visitXor(BinaryOperator &I) { handleShadowOr(I); }
+  void visitMul(BinaryOperator &I) { handleShadowOr(I); }
+
+  void handleDiv(Instruction &I) {
+    IRBuilder<> IRB(&I);
+    // Strict on the second argument.
+    insertCheck(I.getOperand(1), &I);
+    setShadow(&I, getShadow(&I, 0));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitUDiv(BinaryOperator &I) { handleDiv(I); }
+  void visitSDiv(BinaryOperator &I) { handleDiv(I); }
+  void visitFDiv(BinaryOperator &I) { handleDiv(I); }
+  void visitURem(BinaryOperator &I) { handleDiv(I); }
+  void visitSRem(BinaryOperator &I) { handleDiv(I); }
+  void visitFRem(BinaryOperator &I) { handleDiv(I); }
+
+  /// \brief Instrument == and != comparisons.
+  ///
+  /// Sometimes the comparison result is known even if some of the bits of the
+  /// arguments are not.
+  void handleEqualityComparison(ICmpInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *A = I.getOperand(0);
+    Value *B = I.getOperand(1);
+    Value *Sa = getShadow(A);
+    Value *Sb = getShadow(B);
+
+    // Get rid of pointers and vectors of pointers.
+    // For ints (and vectors of ints), types of A and Sa match,
+    // and this is a no-op.
+    A = IRB.CreatePointerCast(A, Sa->getType());
+    B = IRB.CreatePointerCast(B, Sb->getType());
+
+    // A == B  <==>  (C = A^B) == 0
+    // A != B  <==>  (C = A^B) != 0
+    // Sc = Sa | Sb
+    Value *C = IRB.CreateXor(A, B);
+    Value *Sc = IRB.CreateOr(Sa, Sb);
+    // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
+    // Result is defined if one of the following is true
+    // * there is a defined 1 bit in C
+    // * C is fully defined
+    // Si = !(C & ~Sc) && Sc
+    Value *Zero = Constant::getNullValue(Sc->getType());
+    Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
+    Value *Si =
+      IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
+                    IRB.CreateICmpEQ(
+                      IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
+    Si->setName("_msprop_icmp");
+    setShadow(&I, Si);
+    setOriginForNaryOp(I);
+  }
+
+  /// \brief Build the lowest possible value of V, taking into account V's
+  ///        uninitialized bits.
+  Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+                                bool isSigned) {
+    if (isSigned) {
+      // Split shadow into sign bit and other bits.
+      Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+      Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+      // Maximise the undefined shadow bit, minimize other undefined bits.
+      return
+        IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit);
+    } else {
+      // Minimize undefined bits.
+      return IRB.CreateAnd(A, IRB.CreateNot(Sa));
+    }
+  }
+
+  /// \brief Build the highest possible value of V, taking into account V's
+  ///        uninitialized bits.
+  Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+                                bool isSigned) {
+    if (isSigned) {
+      // Split shadow into sign bit and other bits.
+      Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+      Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+      // Minimise the undefined shadow bit, maximise other undefined bits.
+      return
+        IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits);
+    } else {
+      // Maximize undefined bits.
+      return IRB.CreateOr(A, Sa);
+    }
+  }
+
+  /// \brief Instrument relational comparisons.
+  ///
+  /// This function does exact shadow propagation for all relational
+  /// comparisons of integers, pointers and vectors of those.
+  /// FIXME: output seems suboptimal when one of the operands is a constant
+  void handleRelationalComparisonExact(ICmpInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *A = I.getOperand(0);
+    Value *B = I.getOperand(1);
+    Value *Sa = getShadow(A);
+    Value *Sb = getShadow(B);
+
+    // Get rid of pointers and vectors of pointers.
+    // For ints (and vectors of ints), types of A and Sa match,
+    // and this is a no-op.
+    A = IRB.CreatePointerCast(A, Sa->getType());
+    B = IRB.CreatePointerCast(B, Sb->getType());
+
+    // Let [a0, a1] be the interval of possible values of A, taking into account
+    // its undefined bits. Let [b0, b1] be the interval of possible values of B.
+    // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
+    bool IsSigned = I.isSigned();
+    Value *S1 = IRB.CreateICmp(I.getPredicate(),
+                               getLowestPossibleValue(IRB, A, Sa, IsSigned),
+                               getHighestPossibleValue(IRB, B, Sb, IsSigned));
+    Value *S2 = IRB.CreateICmp(I.getPredicate(),
+                               getHighestPossibleValue(IRB, A, Sa, IsSigned),
+                               getLowestPossibleValue(IRB, B, Sb, IsSigned));
+    Value *Si = IRB.CreateXor(S1, S2);
+    setShadow(&I, Si);
+    setOriginForNaryOp(I);
+  }
+
+  /// \brief Instrument signed relational comparisons.
+  ///
+  /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by
+  /// propagating the highest bit of the shadow. Everything else is delegated
+  /// to handleShadowOr().
+  void handleSignedRelationalComparison(ICmpInst &I) {
+    Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
+    Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
+    Value* op = NULL;
+    CmpInst::Predicate pre = I.getPredicate();
+    if (constOp0 && constOp0->isNullValue() &&
+        (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) {
+      op = I.getOperand(1);
+    } else if (constOp1 && constOp1->isNullValue() &&
+               (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) {
+      op = I.getOperand(0);
+    }
+    if (op) {
+      IRBuilder<> IRB(&I);
+      Value* Shadow =
+        IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), "_msprop_icmpslt");
+      setShadow(&I, Shadow);
+      setOrigin(&I, getOrigin(op));
+    } else {
+      handleShadowOr(I);
+    }
+  }
+
+  void visitICmpInst(ICmpInst &I) {
+    if (!ClHandleICmp) {
+      handleShadowOr(I);
+      return;
+    }
+    if (I.isEquality()) {
+      handleEqualityComparison(I);
+      return;
+    }
+
+    assert(I.isRelational());
+    if (ClHandleICmpExact) {
+      handleRelationalComparisonExact(I);
+      return;
+    }
+    if (I.isSigned()) {
+      handleSignedRelationalComparison(I);
+      return;
+    }
+
+    assert(I.isUnsigned());
+    if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) {
+      handleRelationalComparisonExact(I);
+      return;
+    }
+
+    handleShadowOr(I);
+  }
+
+  void visitFCmpInst(FCmpInst &I) {
+    handleShadowOr(I);
+  }
+
+  void handleShift(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    // If any of the S2 bits are poisoned, the whole thing is poisoned.
+    // Otherwise perform the same shift on S1.
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
+                                   S2->getType());
+    Value *V2 = I.getOperand(1);
+    Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
+    setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+    setOriginForNaryOp(I);
+  }
+
+  void visitShl(BinaryOperator &I) { handleShift(I); }
+  void visitAShr(BinaryOperator &I) { handleShift(I); }
+  void visitLShr(BinaryOperator &I) { handleShift(I); }
+
+  /// \brief Instrument llvm.memmove
+  ///
+  /// At this point we don't know if llvm.memmove will be inlined or not.
+  /// If we don't instrument it and it gets inlined,
+  /// our interceptor will not kick in and we will lose the memmove.
+  /// If we instrument the call here, but it does not get inlined,
+  /// we will memove the shadow twice: which is bad in case
+  /// of overlapping regions. So, we simply lower the intrinsic to a call.
+  ///
+  /// Similar situation exists for memcpy and memset.
+  void visitMemMoveInst(MemMoveInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall3(
+      MS.MemmoveFn,
+      IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false));
+    I.eraseFromParent();
+  }
+
+  // Similar to memmove: avoid copying shadow twice.
+  // This is somewhat unfortunate as it may slowdown small constant memcpys.
+  // FIXME: consider doing manual inline for small constant sizes and proper
+  // alignment.
+  void visitMemCpyInst(MemCpyInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall3(
+      MS.MemcpyFn,
+      IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false));
+    I.eraseFromParent();
+  }
+
+  // Same as memcpy.
+  void visitMemSetInst(MemSetInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall3(
+      MS.MemsetFn,
+      IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
+      IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false));
+    I.eraseFromParent();
+  }
+
+  void visitVAStartInst(VAStartInst &I) {
+    VAHelper->visitVAStartInst(I);
+  }
+
+  void visitVACopyInst(VACopyInst &I) {
+    VAHelper->visitVACopyInst(I);
+  }
+
+  enum IntrinsicKind {
+    IK_DoesNotAccessMemory,
+    IK_OnlyReadsMemory,
+    IK_WritesMemory
+  };
+
+  static IntrinsicKind getIntrinsicKind(Intrinsic::ID iid) {
+    const int DoesNotAccessMemory = IK_DoesNotAccessMemory;
+    const int OnlyReadsArgumentPointees = IK_OnlyReadsMemory;
+    const int OnlyReadsMemory = IK_OnlyReadsMemory;
+    const int OnlyAccessesArgumentPointees = IK_WritesMemory;
+    const int UnknownModRefBehavior = IK_WritesMemory;
+#define GET_INTRINSIC_MODREF_BEHAVIOR
+#define ModRefBehavior IntrinsicKind
+#include "llvm/IR/Intrinsics.gen"
+#undef ModRefBehavior
+#undef GET_INTRINSIC_MODREF_BEHAVIOR
+  }
+
+  /// \brief Handle vector store-like intrinsics.
+  ///
+  /// Instrument intrinsics that look like a simple SIMD store: writes memory,
+  /// has 1 pointer argument and 1 vector argument, returns void.
+  bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value* Addr = I.getArgOperand(0);
+    Value *Shadow = getShadow(&I, 1);
+    Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
+
+    // We don't know the pointer alignment (could be unaligned SSE store!).
+    // Have to assume to worst case.
+    IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
+
+    if (ClCheckAccessAddress)
+      insertCheck(Addr, &I);
+
+    // FIXME: use ClStoreCleanOrigin
+    // FIXME: factor out common code from materializeStores
+    if (MS.TrackOrigins)
+      IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB));
+    return true;
+  }
+
+  /// \brief Handle vector load-like intrinsics.
+  ///
+  /// Instrument intrinsics that look like a simple SIMD load: reads memory,
+  /// has 1 pointer argument, returns a vector.
+  bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Addr = I.getArgOperand(0);
+
+    Type *ShadowTy = getShadowTy(&I);
+    if (LoadShadow) {
+      Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
+      // We don't know the pointer alignment (could be unaligned SSE load!).
+      // Have to assume to worst case.
+      setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, 1, "_msld"));
+    } else {
+      setShadow(&I, getCleanShadow(&I));
+    }
+
+
+    if (ClCheckAccessAddress)
+      insertCheck(Addr, &I);
+
+    if (MS.TrackOrigins) {
+      if (LoadShadow)
+        setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB)));
+      else
+        setOrigin(&I, getCleanOrigin());
+    }
+    return true;
+  }
+
+  /// \brief Handle (SIMD arithmetic)-like intrinsics.
+  ///
+  /// Instrument intrinsics with any number of arguments of the same type,
+  /// equal to the return type. The type should be simple (no aggregates or
+  /// pointers; vectors are fine).
+  /// Caller guarantees that this intrinsic does not access memory.
+  bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
+    Type *RetTy = I.getType();
+    if (!(RetTy->isIntOrIntVectorTy() ||
+          RetTy->isFPOrFPVectorTy() ||
+          RetTy->isX86_MMXTy()))
+      return false;
+
+    unsigned NumArgOperands = I.getNumArgOperands();
+
+    for (unsigned i = 0; i < NumArgOperands; ++i) {
+      Type *Ty = I.getArgOperand(i)->getType();
+      if (Ty != RetTy)
+        return false;
+    }
+
+    IRBuilder<> IRB(&I);
+    ShadowAndOriginCombiner SC(this, IRB);
+    for (unsigned i = 0; i < NumArgOperands; ++i)
+      SC.Add(I.getArgOperand(i));
+    SC.Done(&I);
+
+    return true;
+  }
+
+  /// \brief Heuristically instrument unknown intrinsics.
+  ///
+  /// The main purpose of this code is to do something reasonable with all
+  /// random intrinsics we might encounter, most importantly - SIMD intrinsics.
+  /// We recognize several classes of intrinsics by their argument types and
+  /// ModRefBehaviour and apply special intrumentation when we are reasonably
+  /// sure that we know what the intrinsic does.
+  ///
+  /// We special-case intrinsics where this approach fails. See llvm.bswap
+  /// handling as an example of that.
+  bool handleUnknownIntrinsic(IntrinsicInst &I) {
+    unsigned NumArgOperands = I.getNumArgOperands();
+    if (NumArgOperands == 0)
+      return false;
+
+    Intrinsic::ID iid = I.getIntrinsicID();
+    IntrinsicKind IK = getIntrinsicKind(iid);
+    bool OnlyReadsMemory = IK == IK_OnlyReadsMemory;
+    bool WritesMemory = IK == IK_WritesMemory;
+    assert(!(OnlyReadsMemory && WritesMemory));
+
+    if (NumArgOperands == 2 &&
+        I.getArgOperand(0)->getType()->isPointerTy() &&
+        I.getArgOperand(1)->getType()->isVectorTy() &&
+        I.getType()->isVoidTy() &&
+        WritesMemory) {
+      // This looks like a vector store.
+      return handleVectorStoreIntrinsic(I);
+    }
+
+    if (NumArgOperands == 1 &&
+        I.getArgOperand(0)->getType()->isPointerTy() &&
+        I.getType()->isVectorTy() &&
+        OnlyReadsMemory) {
+      // This looks like a vector load.
+      return handleVectorLoadIntrinsic(I);
+    }
+
+    if (!OnlyReadsMemory && !WritesMemory)
+      if (maybeHandleSimpleNomemIntrinsic(I))
+        return true;
+
+    // FIXME: detect and handle SSE maskstore/maskload
+    return false;
+  }
+
+  void handleBswap(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Op = I.getArgOperand(0);
+    Type *OpType = Op->getType();
+    Function *BswapFunc = Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::bswap, ArrayRef<Type*>(&OpType, 1));
+    setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
+    setOrigin(&I, getOrigin(Op));
+  }
+
+  void visitIntrinsicInst(IntrinsicInst &I) {
+    switch (I.getIntrinsicID()) {
+    case llvm::Intrinsic::bswap:
+      handleBswap(I);
+      break;
+    default:
+      if (!handleUnknownIntrinsic(I))
+        visitInstruction(I);
+      break;
+    }
+  }
+
+  void visitCallSite(CallSite CS) {
+    Instruction &I = *CS.getInstruction();
+    assert((CS.isCall() || CS.isInvoke()) && "Unknown type of CallSite");
+    if (CS.isCall()) {
+      CallInst *Call = cast<CallInst>(&I);
+
+      // For inline asm, do the usual thing: check argument shadow and mark all
+      // outputs as clean. Note that any side effects of the inline asm that are
+      // not immediately visible in its constraints are not handled.
+      if (Call->isInlineAsm()) {
+        visitInstruction(I);
+        return;
+      }
+
+      // Allow only tail calls with the same types, otherwise
+      // we may have a false positive: shadow for a non-void RetVal
+      // will get propagated to a void RetVal.
+      if (Call->isTailCall() && Call->getType() != Call->getParent()->getType())
+        Call->setTailCall(false);
+
+      assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere");
+
+      // We are going to insert code that relies on the fact that the callee
+      // will become a non-readonly function after it is instrumented by us. To
+      // prevent this code from being optimized out, mark that function
+      // non-readonly in advance.
+      if (Function *Func = Call->getCalledFunction()) {
+        // Clear out readonly/readnone attributes.
+        AttrBuilder B;
+        B.addAttribute(Attribute::ReadOnly)
+          .addAttribute(Attribute::ReadNone);
+        Func->removeAttributes(AttributeSet::FunctionIndex,
+                               AttributeSet::get(Func->getContext(),
+                                                 AttributeSet::FunctionIndex,
+                                                 B));
+      }
+    }
+    IRBuilder<> IRB(&I);
+    unsigned ArgOffset = 0;
+    DEBUG(dbgs() << "  CallSite: " << I << "\n");
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      unsigned i = ArgIt - CS.arg_begin();
+      if (!A->getType()->isSized()) {
+        DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n");
+        continue;
+      }
+      unsigned Size = 0;
+      Value *Store = 0;
+      // Compute the Shadow for arg even if it is ByVal, because
+      // in that case getShadow() will copy the actual arg shadow to
+      // __msan_param_tls.
+      Value *ArgShadow = getShadow(A);
+      Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
+      DEBUG(dbgs() << "  Arg#" << i << ": " << *A <<
+            " Shadow: " << *ArgShadow << "\n");
+      if (CS.paramHasAttr(i + 1, Attribute::ByVal)) {
+        assert(A->getType()->isPointerTy() &&
+               "ByVal argument is not a pointer!");
+        Size = MS.TD->getTypeAllocSize(A->getType()->getPointerElementType());
+        unsigned Alignment = CS.getParamAlignment(i + 1);
+        Store = IRB.CreateMemCpy(ArgShadowBase,
+                                 getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB),
+                                 Size, Alignment);
+      } else {
+        Size = MS.TD->getTypeAllocSize(A->getType());
+        Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
+                                       kShadowTLSAlignment);
+      }
+      if (MS.TrackOrigins)
+        IRB.CreateStore(getOrigin(A),
+                        getOriginPtrForArgument(A, IRB, ArgOffset));
+      (void)Store;
+      assert(Size != 0 && Store != 0);
+      DEBUG(dbgs() << "  Param:" << *Store << "\n");
+      ArgOffset += DataLayout::RoundUpAlignment(Size, 8);
+    }
+    DEBUG(dbgs() << "  done with call args\n");
+
+    FunctionType *FT =
+      cast<FunctionType>(CS.getCalledValue()->getType()-> getContainedType(0));
+    if (FT->isVarArg()) {
+      VAHelper->visitCallSite(CS, IRB);
+    }
+
+    // Now, get the shadow for the RetVal.
+    if (!I.getType()->isSized()) return;
+    IRBuilder<> IRBBefore(&I);
+    // Untill we have full dynamic coverage, make sure the retval shadow is 0.
+    Value *Base = getShadowPtrForRetval(&I, IRBBefore);
+    IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment);
+    Instruction *NextInsn = 0;
+    if (CS.isCall()) {
+      NextInsn = I.getNextNode();
+    } else {
+      BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest();
+      if (!NormalDest->getSinglePredecessor()) {
+        // FIXME: this case is tricky, so we are just conservative here.
+        // Perhaps we need to split the edge between this BB and NormalDest,
+        // but a naive attempt to use SplitEdge leads to a crash.
+        setShadow(&I, getCleanShadow(&I));
+        setOrigin(&I, getCleanOrigin());
+        return;
+      }
+      NextInsn = NormalDest->getFirstInsertionPt();
+      assert(NextInsn &&
+             "Could not find insertion point for retval shadow load");
+    }
+    IRBuilder<> IRBAfter(NextInsn);
+    Value *RetvalShadow =
+      IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter),
+                                 kShadowTLSAlignment, "_msret");
+    setShadow(&I, RetvalShadow);
+    if (MS.TrackOrigins)
+      setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter)));
+  }
+
+  void visitReturnInst(ReturnInst &I) {
+    IRBuilder<> IRB(&I);
+    if (Value *RetVal = I.getReturnValue()) {
+      // Set the shadow for the RetVal.
+      Value *Shadow = getShadow(RetVal);
+      Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+      DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n");
+      IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+      if (MS.TrackOrigins)
+        IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
+    }
+  }
+
+  void visitPHINode(PHINode &I) {
+    IRBuilder<> IRB(&I);
+    ShadowPHINodes.push_back(&I);
+    setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
+                                "_msphi_s"));
+    if (MS.TrackOrigins)
+      setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(),
+                                  "_msphi_o"));
+  }
+
+  void visitAllocaInst(AllocaInst &I) {
+    setShadow(&I, getCleanShadow(&I));
+    if (!ClPoisonStack) return;
+    IRBuilder<> IRB(I.getNextNode());
+    uint64_t Size = MS.TD->getTypeAllocSize(I.getAllocatedType());
+    if (ClPoisonStackWithCall) {
+      IRB.CreateCall2(MS.MsanPoisonStackFn,
+                      IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
+                      ConstantInt::get(MS.IntptrTy, Size));
+    } else {
+      Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB);
+      IRB.CreateMemSet(ShadowBase, IRB.getInt8(ClPoisonStackPattern),
+                       Size, I.getAlignment());
+    }
+
+    if (MS.TrackOrigins) {
+      setOrigin(&I, getCleanOrigin());
+      SmallString<2048> StackDescriptionStorage;
+      raw_svector_ostream StackDescription(StackDescriptionStorage);
+      // We create a string with a description of the stack allocation and
+      // pass it into __msan_set_alloca_origin.
+      // It will be printed by the run-time if stack-originated UMR is found.
+      // The first 4 bytes of the string are set to '----' and will be replaced
+      // by __msan_va_arg_overflow_size_tls at the first call.
+      StackDescription << "----" << I.getName() << "@" << F.getName();
+      Value *Descr =
+          createPrivateNonConstGlobalForString(*F.getParent(),
+                                               StackDescription.str());
+      IRB.CreateCall3(MS.MsanSetAllocaOriginFn,
+                      IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
+                      ConstantInt::get(MS.IntptrTy, Size),
+                      IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()));
+    }
+  }
+
+  void visitSelectInst(SelectInst& I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I,  IRB.CreateSelect(I.getCondition(),
+              getShadow(I.getTrueValue()), getShadow(I.getFalseValue()),
+              "_msprop"));
+    if (MS.TrackOrigins) {
+      // Origins are always i32, so any vector conditions must be flattened.
+      // FIXME: consider tracking vector origins for app vectors?
+      Value *Cond = I.getCondition();
+      if (Cond->getType()->isVectorTy()) {
+        Value *ConvertedShadow = convertToShadowTyNoVec(Cond, IRB);
+        Cond = IRB.CreateICmpNE(ConvertedShadow,
+                                getCleanShadow(ConvertedShadow), "_mso_select");
+      }
+      setOrigin(&I, IRB.CreateSelect(Cond,
+                getOrigin(I.getTrueValue()), getOrigin(I.getFalseValue())));
+    }
+  }
+
+  void visitLandingPadInst(LandingPadInst &I) {
+    // Do nothing.
+    // See http://code.google.com/p/memory-sanitizer/issues/detail?id=1
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void visitGetElementPtrInst(GetElementPtrInst &I) {
+    handleShadowOr(I);
+  }
+
+  void visitExtractValueInst(ExtractValueInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Agg = I.getAggregateOperand();
+    DEBUG(dbgs() << "ExtractValue:  " << I << "\n");
+    Value *AggShadow = getShadow(Agg);
+    DEBUG(dbgs() << "   AggShadow:  " << *AggShadow << "\n");
+    Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
+    DEBUG(dbgs() << "   ResShadow:  " << *ResShadow << "\n");
+    setShadow(&I, ResShadow);
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void visitInsertValueInst(InsertValueInst &I) {
+    IRBuilder<> IRB(&I);
+    DEBUG(dbgs() << "InsertValue:  " << I << "\n");
+    Value *AggShadow = getShadow(I.getAggregateOperand());
+    Value *InsShadow = getShadow(I.getInsertedValueOperand());
+    DEBUG(dbgs() << "   AggShadow:  " << *AggShadow << "\n");
+    DEBUG(dbgs() << "   InsShadow:  " << *InsShadow << "\n");
+    Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
+    DEBUG(dbgs() << "   Res:        " << *Res << "\n");
+    setShadow(&I, Res);
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void dumpInst(Instruction &I) {
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
+    } else {
+      errs() << "ZZZ " << I.getOpcodeName() << "\n";
+    }
+    errs() << "QQQ " << I << "\n";
+  }
+
+  void visitResumeInst(ResumeInst &I) {
+    DEBUG(dbgs() << "Resume: " << I << "\n");
+    // Nothing to do here.
+  }
+
+  void visitInstruction(Instruction &I) {
+    // Everything else: stop propagating and check for poisoned shadow.
+    if (ClDumpStrictInstructions)
+      dumpInst(I);
+    DEBUG(dbgs() << "DEFAULT: " << I << "\n");
+    for (size_t i = 0, n = I.getNumOperands(); i < n; i++)
+      insertCheck(I.getOperand(i), &I);
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+  }
+};
+
+/// \brief AMD64-specific implementation of VarArgHelper.
+struct VarArgAMD64Helper : public VarArgHelper {
+  // An unfortunate workaround for asymmetric lowering of va_arg stuff.
+  // See a comment in visitCallSite for more details.
+  static const unsigned AMD64GpEndOffset = 48;  // AMD64 ABI Draft 0.99.6 p3.5.7
+  static const unsigned AMD64FpEndOffset = 176;
+
+  Function &F;
+  MemorySanitizer &MS;
+  MemorySanitizerVisitor &MSV;
+  Value *VAArgTLSCopy;
+  Value *VAArgOverflowSize;
+
+  SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+  VarArgAMD64Helper(Function &F, MemorySanitizer &MS,
+                    MemorySanitizerVisitor &MSV)
+    : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(0), VAArgOverflowSize(0) { }
+
+  enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+  ArgKind classifyArgument(Value* arg) {
+    // A very rough approximation of X86_64 argument classification rules.
+    Type *T = arg->getType();
+    if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
+      return AK_FloatingPoint;
+    if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+      return AK_GeneralPurpose;
+    if (T->isPointerTy())
+      return AK_GeneralPurpose;
+    return AK_Memory;
+  }
+
+  // For VarArg functions, store the argument shadow in an ABI-specific format
+  // that corresponds to va_list layout.
+  // We do this because Clang lowers va_arg in the frontend, and this pass
+  // only sees the low level code that deals with va_list internals.
+  // A much easier alternative (provided that Clang emits va_arg instructions)
+  // would have been to associate each live instance of va_list with a copy of
+  // MSanParamTLS, and extract shadow on va_arg() call in the argument list
+  // order.
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) {
+    unsigned GpOffset = 0;
+    unsigned FpOffset = AMD64GpEndOffset;
+    unsigned OverflowOffset = AMD64FpEndOffset;
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      ArgKind AK = classifyArgument(A);
+      if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
+        AK = AK_Memory;
+      if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset)
+        AK = AK_Memory;
+      Value *Base;
+      switch (AK) {
+      case AK_GeneralPurpose:
+        Base = getShadowPtrForVAArgument(A, IRB, GpOffset);
+        GpOffset += 8;
+        break;
+      case AK_FloatingPoint:
+        Base = getShadowPtrForVAArgument(A, IRB, FpOffset);
+        FpOffset += 16;
+        break;
+      case AK_Memory:
+        uint64_t ArgSize = MS.TD->getTypeAllocSize(A->getType());
+        Base = getShadowPtrForVAArgument(A, IRB, OverflowOffset);
+        OverflowOffset += DataLayout::RoundUpAlignment(ArgSize, 8);
+      }
+      IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+    }
+    Constant *OverflowSize =
+      ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset);
+    IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+  }
+
+  /// \brief Compute the shadow address for a given va_arg.
+  Value *getShadowPtrForVAArgument(Value *A, IRBuilder<> &IRB,
+                                   int ArgOffset) {
+    Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(A), 0),
+                              "_msarg");
+  }
+
+  void visitVAStartInst(VAStartInst &I) {
+    IRBuilder<> IRB(&I);
+    VAStartInstrumentationList.push_back(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants.
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */24, /* alignment */8, false);
+  }
+
+  void visitVACopyInst(VACopyInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants.
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */24, /* alignment */8, false);
+  }
+
+  void finalizeInstrumentation() {
+    assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+           "finalizeInstrumentation called twice");
+    if (!VAStartInstrumentationList.empty()) {
+      // If there is a va_start in this function, make a backup copy of
+      // va_arg_tls somewhere in the function entry block.
+      IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+      VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+      Value *CopySize =
+        IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
+                      VAArgOverflowSize);
+      VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+      IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+    }
+
+    // Instrument va_start.
+    // Copy va_list shadow from the backup copy of the TLS contents.
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+      Value *VAListTag = OrigInst->getArgOperand(0);
+
+      Value *RegSaveAreaPtrPtr =
+        IRB.CreateIntToPtr(
+          IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                        ConstantInt::get(MS.IntptrTy, 16)),
+          Type::getInt64PtrTy(*MS.C));
+      Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+      Value *RegSaveAreaShadowPtr =
+        MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+      IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy,
+                       AMD64FpEndOffset, 16);
+
+      Value *OverflowArgAreaPtrPtr =
+        IRB.CreateIntToPtr(
+          IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                        ConstantInt::get(MS.IntptrTy, 8)),
+          Type::getInt64PtrTy(*MS.C));
+      Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
+      Value *OverflowArgAreaShadowPtr =
+        MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB);
+      Value *SrcPtr =
+        getShadowPtrForVAArgument(VAArgTLSCopy, IRB, AMD64FpEndOffset);
+      IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16);
+    }
+  }
+};
+
+VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+                                 MemorySanitizerVisitor &Visitor) {
+  return new VarArgAMD64Helper(Func, Msan, Visitor);
+}
+
+}  // namespace
+
+bool MemorySanitizer::runOnFunction(Function &F) {
+  MemorySanitizerVisitor Visitor(F, *this);
+
+  // Clear out readonly/readnone attributes.
+  AttrBuilder B;
+  B.addAttribute(Attribute::ReadOnly)
+    .addAttribute(Attribute::ReadNone);
+  F.removeAttributes(AttributeSet::FunctionIndex,
+                     AttributeSet::get(F.getContext(),
+                                       AttributeSet::FunctionIndex, B));
+
+  return Visitor.runOnFunction();
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
new file mode 100644
index 000000000000..b45aef65bc76
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
@@ -0,0 +1,225 @@
+//===- OptimalEdgeProfiling.cpp - Insert counters for opt. edge profiling -===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass instruments the specified program with counters for edge profiling.
+// Edge profiling can give a reasonable approximation of the hot paths through a
+// program, and is used for a wide variety of program transformations.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "insert-optimal-edge-profiling"
+#include "llvm/Transforms/Instrumentation.h"
+#include "MaximumSpanningTree.h"
+#include "ProfilingUtils.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Analysis/ProfileInfoLoader.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+STATISTIC(NumEdgesInserted, "The # of edges inserted.");
+
+namespace {
+  class OptimalEdgeProfiler : public ModulePass {
+    bool runOnModule(Module &M);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    OptimalEdgeProfiler() : ModulePass(ID) {
+      initializeOptimalEdgeProfilerPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(ProfileEstimatorPassID);
+      AU.addRequired<ProfileInfo>();
+    }
+
+    virtual const char *getPassName() const {
+      return "Optimal Edge Profiler";
+    }
+  };
+}
+
+char OptimalEdgeProfiler::ID = 0;
+INITIALIZE_PASS_BEGIN(OptimalEdgeProfiler, "insert-optimal-edge-profiling",
+                "Insert optimal instrumentation for edge profiling",
+                false, false)
+INITIALIZE_PASS_DEPENDENCY(ProfileEstimatorPass)
+INITIALIZE_AG_DEPENDENCY(ProfileInfo)
+INITIALIZE_PASS_END(OptimalEdgeProfiler, "insert-optimal-edge-profiling",
+                "Insert optimal instrumentation for edge profiling",
+                false, false)
+
+ModulePass *llvm::createOptimalEdgeProfilerPass() {
+  return new OptimalEdgeProfiler();
+}
+
+inline static void printEdgeCounter(ProfileInfo::Edge e,
+                                    BasicBlock* b,
+                                    unsigned i) {
+  DEBUG(dbgs() << "--Edge Counter for " << (e) << " in " \
+               << ((b)?(b)->getName():"0") << " (# " << (i) << ")\n");
+}
+
+bool OptimalEdgeProfiler::runOnModule(Module &M) {
+  Function *Main = M.getFunction("main");
+  if (Main == 0) {
+    errs() << "WARNING: cannot insert edge profiling into a module"
+           << " with no main function!\n";
+    return false;  // No main, no instrumentation!
+  }
+
+  // NumEdges counts all the edges that may be instrumented. Later on its
+  // decided which edges to actually instrument, to achieve optimal profiling.
+  // For the entry block a virtual edge (0,entry) is reserved, for each block
+  // with no successors an edge (BB,0) is reserved. These edges are necessary
+  // to calculate a truly optimal maximum spanning tree and thus an optimal
+  // instrumentation.
+  unsigned NumEdges = 0;
+
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration()) continue;
+    // Reserve space for (0,entry) edge.
+    ++NumEdges;
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      // Keep track of which blocks need to be instrumented.  We don't want to
+      // instrument blocks that are added as the result of breaking critical
+      // edges!
+      if (BB->getTerminator()->getNumSuccessors() == 0) {
+        // Reserve space for (BB,0) edge.
+        ++NumEdges;
+      } else {
+        NumEdges += BB->getTerminator()->getNumSuccessors();
+      }
+    }
+  }
+
+  // In the profiling output a counter for each edge is reserved, but only few
+  // are used. This is done to be able to read back in the profile without
+  // calulating the maximum spanning tree again, instead each edge counter that
+  // is not used is initialised with -1 to signal that this edge counter has to
+  // be calculated from other edge counters on reading the profile info back
+  // in.
+
+  Type *Int32 = Type::getInt32Ty(M.getContext());
+  ArrayType *ATy = ArrayType::get(Int32, NumEdges);
+  GlobalVariable *Counters =
+    new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
+                       Constant::getNullValue(ATy), "OptEdgeProfCounters");
+  NumEdgesInserted = 0;
+
+  std::vector<Constant*> Initializer(NumEdges);
+  Constant *Zero = ConstantInt::get(Int32, 0);
+  Constant *Uncounted = ConstantInt::get(Int32, ProfileInfoLoader::Uncounted);
+
+  // Instrument all of the edges not in MST...
+  unsigned i = 0;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration()) continue;
+    DEBUG(dbgs() << "Working on " << F->getName() << "\n");
+
+    // Calculate a Maximum Spanning Tree with the edge weights determined by
+    // ProfileEstimator. ProfileEstimator also assign weights to the virtual
+    // edges (0,entry) and (BB,0) (for blocks with no successors) and this
+    // edges also participate in the maximum spanning tree calculation.
+    // The third parameter of MaximumSpanningTree() has the effect that not the
+    // actual MST is returned but the edges _not_ in the MST.
+
+    ProfileInfo::EdgeWeights ECs =
+      getAnalysis<ProfileInfo>(*F).getEdgeWeights(F);
+    std::vector<ProfileInfo::EdgeWeight> EdgeVector(ECs.begin(), ECs.end());
+    MaximumSpanningTree<BasicBlock> MST(EdgeVector);
+    std::stable_sort(MST.begin(), MST.end());
+
+    // Check if (0,entry) not in the MST. If not, instrument edge
+    // (IncrementCounterInBlock()) and set the counter initially to zero, if
+    // the edge is in the MST the counter is initialised to -1.
+
+    BasicBlock *entry = &(F->getEntryBlock());
+    ProfileInfo::Edge edge = ProfileInfo::getEdge(0, entry);
+    if (!std::binary_search(MST.begin(), MST.end(), edge)) {
+      printEdgeCounter(edge, entry, i);
+      IncrementCounterInBlock(entry, i, Counters); ++NumEdgesInserted;
+      Initializer[i++] = (Zero);
+    } else{
+      Initializer[i++] = (Uncounted);
+    }
+
+    // InsertedBlocks contains all blocks that were inserted for splitting an
+    // edge, this blocks do not have to be instrumented.
+    DenseSet<BasicBlock*> InsertedBlocks;
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      // Check if block was not inserted and thus does not have to be
+      // instrumented.
+      if (InsertedBlocks.count(BB)) continue;
+
+      // Okay, we have to add a counter of each outgoing edge not in MST. If
+      // the outgoing edge is not critical don't split it, just insert the
+      // counter in the source or destination of the edge. Also, if the block
+      // has no successors, the virtual edge (BB,0) is processed.
+      TerminatorInst *TI = BB->getTerminator();
+      if (TI->getNumSuccessors() == 0) {
+        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB, 0);
+        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
+          printEdgeCounter(edge, BB, i);
+          IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
+          Initializer[i++] = (Zero);
+        } else{
+          Initializer[i++] = (Uncounted);
+        }
+      }
+      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
+        BasicBlock *Succ = TI->getSuccessor(s);
+        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB,Succ);
+        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
+
+          // If the edge is critical, split it.
+          bool wasInserted = SplitCriticalEdge(TI, s, this);
+          Succ = TI->getSuccessor(s);
+          if (wasInserted)
+            InsertedBlocks.insert(Succ);
+
+          // Okay, we are guaranteed that the edge is no longer critical.  If
+          // we only have a single successor, insert the counter in this block,
+          // otherwise insert it in the successor block.
+          if (TI->getNumSuccessors() == 1) {
+            // Insert counter at the start of the block
+            printEdgeCounter(edge, BB, i);
+            IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
+          } else {
+            // Insert counter at the start of the block
+            printEdgeCounter(edge, Succ, i);
+            IncrementCounterInBlock(Succ, i, Counters); ++NumEdgesInserted;
+          }
+          Initializer[i++] = (Zero);
+        } else {
+          Initializer[i++] = (Uncounted);
+        }
+      }
+    }
+  }
+
+  // Check if the number of edges counted at first was the number of edges we
+  // considered for instrumentation.
+  assert(i == NumEdges && "the number of edges in counting array is wrong");
+
+  // Assign the now completely defined initialiser to the array.
+  Constant *init = ConstantArray::get(ATy, Initializer);
+  Counters->setInitializer(init);
+
+  // Add the initialization call to main.
+  InsertProfilingInitCall(Main, "llvm_start_opt_edge_profiling", Counters);
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp
new file mode 100644
index 000000000000..7de73269cf2b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PathProfiling.cpp
@@ -0,0 +1,1424 @@
+//===- PathProfiling.cpp - Inserts counters for path profiling ------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass instruments functions for Ball-Larus path profiling.  Ball-Larus
+// profiling converts the CFG into a DAG by replacing backedges with edges
+// from entry to the start block and from the end block to exit.  The paths
+// along the new DAG are enumrated, i.e. each path is given a path number.
+// Edges are instrumented to increment the path number register, such that the
+// path number register will equal the path number of the path taken at the
+// exit.
+//
+// This file defines classes for building a CFG for use with different stages
+// in the Ball-Larus path profiling instrumentation [Ball96].  The
+// requirements are formatting the llvm CFG into the Ball-Larus DAG, path
+// numbering, finding a spanning tree, moving increments from the spanning
+// tree to chords.
+//
+// Terms:
+// DAG            - Directed Acyclic Graph.
+// Ball-Larus DAG - A CFG with an entry node, an exit node, and backedges
+//                  removed in the following manner.  For every backedge
+//                  v->w, insert edge ENTRY->w and edge v->EXIT.
+// Path Number    - The number corresponding to a specific path through a
+//                  Ball-Larus DAG.
+// Spanning Tree  - A subgraph, S, is a spanning tree if S covers all
+//                  vertices and is a tree.
+// Chord          - An edge not in the spanning tree.
+//
+// [Ball96]
+//  T. Ball and J. R. Larus. "Efficient Path Profiling."
+//  International Symposium on Microarchitecture, pages 46-57, 1996.
+//  http://portal.acm.org/citation.cfm?id=243857
+//
+// [Ball94]
+//  Thomas Ball.  "Efficiently Counting Program Events with Support for
+//  On-line queries."
+//  ACM Transactions on Programmmg Languages and Systems, Vol 16, No 5,
+//  September 1994, Pages 1399-1410.
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "insert-path-profiling"
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "ProfilingUtils.h"
+#include "llvm/Analysis/PathNumbering.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/TypeBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <vector>
+
+#define HASH_THRESHHOLD 100000
+
+using namespace llvm;
+
+namespace {
+class BLInstrumentationNode;
+class BLInstrumentationEdge;
+class BLInstrumentationDag;
+
+// ---------------------------------------------------------------------------
+// BLInstrumentationNode extends BallLarusNode with member used by the
+// instrumentation algortihms.
+// ---------------------------------------------------------------------------
+class BLInstrumentationNode : public BallLarusNode {
+public:
+  // Creates a new BLInstrumentationNode from a BasicBlock.
+  BLInstrumentationNode(BasicBlock* BB);
+
+  // Get/sets the Value corresponding to the pathNumber register,
+  // constant or phinode.  Used by the instrumentation code to remember
+  // path number Values.
+  Value* getStartingPathNumber();
+  void setStartingPathNumber(Value* pathNumber);
+
+  Value* getEndingPathNumber();
+  void setEndingPathNumber(Value* pathNumber);
+
+  // Get/set the PHINode Instruction for this node.
+  PHINode* getPathPHI();
+  void setPathPHI(PHINode* pathPHI);
+
+private:
+
+  Value* _startingPathNumber; // The Value for the current pathNumber.
+  Value* _endingPathNumber; // The Value for the current pathNumber.
+  PHINode* _pathPHI; // The PHINode for current pathNumber.
+};
+
+// --------------------------------------------------------------------------
+// BLInstrumentationEdge extends BallLarusEdge with data about the
+// instrumentation that will end up on each edge.
+// --------------------------------------------------------------------------
+class BLInstrumentationEdge : public BallLarusEdge {
+public:
+  BLInstrumentationEdge(BLInstrumentationNode* source,
+                        BLInstrumentationNode* target);
+
+  // Sets the target node of this edge.  Required to split edges.
+  void setTarget(BallLarusNode* node);
+
+  // Get/set whether edge is in the spanning tree.
+  bool isInSpanningTree() const;
+  void setIsInSpanningTree(bool isInSpanningTree);
+
+  // Get/ set whether this edge will be instrumented with a path number
+  // initialization.
+  bool isInitialization() const;
+  void setIsInitialization(bool isInitialization);
+
+  // Get/set whether this edge will be instrumented with a path counter
+  // increment.  Notice this is incrementing the path counter
+  // corresponding to the path number register.  The path number
+  // increment is determined by getIncrement().
+  bool isCounterIncrement() const;
+  void setIsCounterIncrement(bool isCounterIncrement);
+
+  // Get/set the path number increment that this edge will be instrumented
+  // with.  This is distinct from the path counter increment and the
+  // weight.  The counter increment counts the number of executions of
+  // some path, whereas the path number keeps track of which path number
+  // the program is on.
+  long getIncrement() const;
+  void setIncrement(long increment);
+
+  // Get/set whether the edge has been instrumented.
+  bool hasInstrumentation();
+  void setHasInstrumentation(bool hasInstrumentation);
+
+  // Returns the successor number of this edge in the source.
+  unsigned getSuccessorNumber();
+
+private:
+  // The increment that the code will be instrumented with.
+  long long _increment;
+
+  // Whether this edge is in the spanning tree.
+  bool _isInSpanningTree;
+
+  // Whether this edge is an initialiation of the path number.
+  bool _isInitialization;
+
+  // Whether this edge is a path counter increment.
+  bool _isCounterIncrement;
+
+  // Whether this edge has been instrumented.
+  bool _hasInstrumentation;
+};
+
+// ---------------------------------------------------------------------------
+// BLInstrumentationDag extends BallLarusDag with algorithms that
+// determine where instrumentation should be placed.
+// ---------------------------------------------------------------------------
+class BLInstrumentationDag : public BallLarusDag {
+public:
+  BLInstrumentationDag(Function &F);
+
+  // Returns the Exit->Root edge. This edge is required for creating
+  // directed cycles in the algorithm for moving instrumentation off of
+  // the spanning tree
+  BallLarusEdge* getExitRootEdge();
+
+  // Returns an array of phony edges which mark those nodes
+  // with function calls
+  BLEdgeVector getCallPhonyEdges();
+
+  // Gets/sets the path counter array
+  GlobalVariable* getCounterArray();
+  void setCounterArray(GlobalVariable* c);
+
+  // Calculates the increments for the chords, thereby removing
+  // instrumentation from the spanning tree edges. Implementation is based
+  // on the algorithm in Figure 4 of [Ball94]
+  void calculateChordIncrements();
+
+  // Updates the state when an edge has been split
+  void splitUpdate(BLInstrumentationEdge* formerEdge, BasicBlock* newBlock);
+
+  // Calculates a spanning tree of the DAG ignoring cycles.  Whichever
+  // edges are in the spanning tree will not be instrumented, but this
+  // implementation does not try to minimize the instrumentation overhead
+  // by trying to find hot edges.
+  void calculateSpanningTree();
+
+  // Pushes initialization further down in order to group the first
+  // increment and initialization.
+  void pushInitialization();
+
+  // Pushes the path counter increments up in order to group the last path
+  // number increment.
+  void pushCounters();
+
+  // Removes phony edges from the successor list of the source, and the
+  // predecessor list of the target.
+  void unlinkPhony();
+
+  // Generate dot graph for the function
+  void generateDotGraph();
+
+protected:
+  // BLInstrumentationDag creates BLInstrumentationNode objects in this
+  // method overriding the creation of BallLarusNode objects.
+  //
+  // Allows subclasses to determine which type of Node is created.
+  // Override this method to produce subclasses of BallLarusNode if
+  // necessary.
+  virtual BallLarusNode* createNode(BasicBlock* BB);
+
+  // BLInstrumentationDag create BLInstrumentationEdges.
+  //
+  // Allows subclasses to determine which type of Edge is created.
+  // Override this method to produce subclasses of BallLarusEdge if
+  // necessary.  Parameters source and target will have been created by
+  // createNode and can be cast to the subclass of BallLarusNode*
+  // returned by createNode.
+  virtual BallLarusEdge* createEdge(
+    BallLarusNode* source, BallLarusNode* target, unsigned edgeNumber);
+
+private:
+  BLEdgeVector _treeEdges; // All edges in the spanning tree.
+  BLEdgeVector _chordEdges; // All edges not in the spanning tree.
+  GlobalVariable* _counterArray; // Array to store path counters
+
+  // Removes the edge from the appropriate predecessor and successor lists.
+  void unlinkEdge(BallLarusEdge* edge);
+
+  // Makes an edge part of the spanning tree.
+  void makeEdgeSpanning(BLInstrumentationEdge* edge);
+
+  // Pushes initialization and calls itself recursively.
+  void pushInitializationFromEdge(BLInstrumentationEdge* edge);
+
+  // Pushes path counter increments up recursively.
+  void pushCountersFromEdge(BLInstrumentationEdge* edge);
+
+  // Depth first algorithm for determining the chord increments.f
+  void calculateChordIncrementsDfs(
+    long weight, BallLarusNode* v, BallLarusEdge* e);
+
+  // Determines the relative direction of two edges.
+  int calculateChordIncrementsDir(BallLarusEdge* e, BallLarusEdge* f);
+};
+
+// ---------------------------------------------------------------------------
+// PathProfiler is a module pass which instruments path profiling instructions
+// ---------------------------------------------------------------------------
+class PathProfiler : public ModulePass {
+private:
+  // Current context for multi threading support.
+  LLVMContext* Context;
+
+  // Which function are we currently instrumenting
+  unsigned currentFunctionNumber;
+
+  // The function prototype in the profiling runtime for incrementing a
+  // single path counter in a hash table.
+  Constant* llvmIncrementHashFunction;
+  Constant* llvmDecrementHashFunction;
+
+  // Instruments each function with path profiling.  'main' is instrumented
+  // with code to save the profile to disk.
+  bool runOnModule(Module &M);
+
+  // Analyzes the function for Ball-Larus path profiling, and inserts code.
+  void runOnFunction(std::vector<Constant*> &ftInit, Function &F, Module &M);
+
+  // Creates an increment constant representing incr.
+  ConstantInt* createIncrementConstant(long incr, int bitsize);
+
+  // Creates an increment constant representing the value in
+  // edge->getIncrement().
+  ConstantInt* createIncrementConstant(BLInstrumentationEdge* edge);
+
+  // Finds the insertion point after pathNumber in block.  PathNumber may
+  // be NULL.
+  BasicBlock::iterator getInsertionPoint(
+    BasicBlock* block, Value* pathNumber);
+
+  // Inserts source's pathNumber Value* into target.  Target may or may not
+  // have multiple predecessors, and may or may not have its phiNode
+  // initalized.
+  void pushValueIntoNode(
+    BLInstrumentationNode* source, BLInstrumentationNode* target);
+
+  // Inserts source's pathNumber Value* into the appropriate slot of
+  // target's phiNode.
+  void pushValueIntoPHI(
+    BLInstrumentationNode* target, BLInstrumentationNode* source);
+
+  // The Value* in node, oldVal,  is updated with a Value* correspodning to
+  // oldVal + addition.
+  void insertNumberIncrement(BLInstrumentationNode* node, Value* addition,
+                             bool atBeginning);
+
+  // Creates a counter increment in the given node.  The Value* in node is
+  // taken as the index into a hash table.
+  void insertCounterIncrement(
+    Value* incValue,
+    BasicBlock::iterator insertPoint,
+    BLInstrumentationDag* dag,
+    bool increment = true);
+
+  // A PHINode is created in the node, and its values initialized to -1U.
+  void preparePHI(BLInstrumentationNode* node);
+
+  // Inserts instrumentation for the given edge
+  //
+  // Pre: The edge's source node has pathNumber set if edge is non zero
+  // path number increment.
+  //
+  // Post: Edge's target node has a pathNumber set to the path number Value
+  // corresponding to the value of the path register after edge's
+  // execution.
+  void insertInstrumentationStartingAt(
+    BLInstrumentationEdge* edge,
+    BLInstrumentationDag* dag);
+
+  // If this edge is a critical edge, then inserts a node at this edge.
+  // This edge becomes the first edge, and a new BallLarusEdge is created.
+  bool splitCritical(BLInstrumentationEdge* edge, BLInstrumentationDag* dag);
+
+  // Inserts instrumentation according to the marked edges in dag.  Phony
+  // edges must be unlinked from the DAG, but accessible from the
+  // backedges.  Dag must have initializations, path number increments, and
+  // counter increments present.
+  //
+  // Counter storage is created here.
+  void insertInstrumentation( BLInstrumentationDag& dag, Module &M);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  PathProfiler() : ModulePass(ID) {
+    initializePathProfilerPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual const char *getPassName() const {
+    return "Path Profiler";
+  }
+};
+} // end anonymous namespace
+
+// Should we print the dot-graphs
+static cl::opt<bool> DotPathDag("path-profile-pathdag", cl::Hidden,
+        cl::desc("Output the path profiling DAG for each function."));
+
+// Register the path profiler as a pass
+char PathProfiler::ID = 0;
+INITIALIZE_PASS(PathProfiler, "insert-path-profiling",
+                "Insert instrumentation for Ball-Larus path profiling",
+                false, false)
+
+ModulePass *llvm::createPathProfilerPass() { return new PathProfiler(); }
+
+namespace llvm {
+  class PathProfilingFunctionTable {};
+
+  // Type for global array storing references to hashes or arrays
+  template<bool xcompile> class TypeBuilder<PathProfilingFunctionTable,
+                                            xcompile> {
+  public:
+    static StructType *get(LLVMContext& C) {
+      return( StructType::get(
+                TypeBuilder<types::i<32>, xcompile>::get(C), // type
+                TypeBuilder<types::i<32>, xcompile>::get(C), // array size
+                TypeBuilder<types::i<8>*, xcompile>::get(C), // array/hash ptr
+                NULL));
+    }
+  };
+
+  typedef TypeBuilder<PathProfilingFunctionTable, true>
+  ftEntryTypeBuilder;
+
+  // BallLarusEdge << operator overloading
+  raw_ostream& operator<<(raw_ostream& os,
+                          const BLInstrumentationEdge& edge)
+      LLVM_ATTRIBUTE_USED;
+  raw_ostream& operator<<(raw_ostream& os,
+                          const BLInstrumentationEdge& edge) {
+    os << "[" << edge.getSource()->getName() << " -> "
+       << edge.getTarget()->getName() << "] init: "
+       << (edge.isInitialization() ? "yes" : "no")
+       << " incr:" << edge.getIncrement() << " cinc: "
+       << (edge.isCounterIncrement() ? "yes" : "no");
+    return(os);
+  }
+}
+
+// Creates a new BLInstrumentationNode from a BasicBlock.
+BLInstrumentationNode::BLInstrumentationNode(BasicBlock* BB) :
+  BallLarusNode(BB),
+  _startingPathNumber(NULL), _endingPathNumber(NULL), _pathPHI(NULL) {}
+
+// Constructor for BLInstrumentationEdge.
+BLInstrumentationEdge::BLInstrumentationEdge(BLInstrumentationNode* source,
+                                             BLInstrumentationNode* target)
+  : BallLarusEdge(source, target, 0),
+    _increment(0), _isInSpanningTree(false), _isInitialization(false),
+    _isCounterIncrement(false), _hasInstrumentation(false) {}
+
+// Sets the target node of this edge.  Required to split edges.
+void BLInstrumentationEdge::setTarget(BallLarusNode* node) {
+  _target = node;
+}
+
+// Returns whether this edge is in the spanning tree.
+bool BLInstrumentationEdge::isInSpanningTree() const {
+  return(_isInSpanningTree);
+}
+
+// Sets whether this edge is in the spanning tree.
+void BLInstrumentationEdge::setIsInSpanningTree(bool isInSpanningTree) {
+  _isInSpanningTree = isInSpanningTree;
+}
+
+// Returns whether this edge will be instrumented with a path number
+// initialization.
+bool BLInstrumentationEdge::isInitialization() const {
+  return(_isInitialization);
+}
+
+// Sets whether this edge will be instrumented with a path number
+// initialization.
+void BLInstrumentationEdge::setIsInitialization(bool isInitialization) {
+  _isInitialization = isInitialization;
+}
+
+// Returns whether this edge will be instrumented with a path counter
+// increment.  Notice this is incrementing the path counter
+// corresponding to the path number register.  The path number
+// increment is determined by getIncrement().
+bool BLInstrumentationEdge::isCounterIncrement() const {
+  return(_isCounterIncrement);
+}
+
+// Sets whether this edge will be instrumented with a path counter
+// increment.
+void BLInstrumentationEdge::setIsCounterIncrement(bool isCounterIncrement) {
+  _isCounterIncrement = isCounterIncrement;
+}
+
+// Gets the path number increment that this edge will be instrumented
+// with.  This is distinct from the path counter increment and the
+// weight.  The counter increment is counts the number of executions of
+// some path, whereas the path number keeps track of which path number
+// the program is on.
+long BLInstrumentationEdge::getIncrement() const {
+  return(_increment);
+}
+
+// Set whether this edge will be instrumented with a path number
+// increment.
+void BLInstrumentationEdge::setIncrement(long increment) {
+  _increment = increment;
+}
+
+// True iff the edge has already been instrumented.
+bool BLInstrumentationEdge::hasInstrumentation() {
+  return(_hasInstrumentation);
+}
+
+// Set whether this edge has been instrumented.
+void BLInstrumentationEdge::setHasInstrumentation(bool hasInstrumentation) {
+  _hasInstrumentation = hasInstrumentation;
+}
+
+// Returns the successor number of this edge in the source.
+unsigned BLInstrumentationEdge::getSuccessorNumber() {
+  BallLarusNode* sourceNode = getSource();
+  BallLarusNode* targetNode = getTarget();
+  BasicBlock* source = sourceNode->getBlock();
+  BasicBlock* target = targetNode->getBlock();
+
+  if(source == NULL || target == NULL)
+    return(0);
+
+  TerminatorInst* terminator = source->getTerminator();
+
+        unsigned i;
+  for(i=0; i < terminator->getNumSuccessors(); i++) {
+    if(terminator->getSuccessor(i) == target)
+      break;
+  }
+
+  return(i);
+}
+
+// BLInstrumentationDag constructor initializes a DAG for the given Function.
+BLInstrumentationDag::BLInstrumentationDag(Function &F) : BallLarusDag(F),
+                                                          _counterArray(0) {
+}
+
+// Returns the Exit->Root edge. This edge is required for creating
+// directed cycles in the algorithm for moving instrumentation off of
+// the spanning tree
+BallLarusEdge* BLInstrumentationDag::getExitRootEdge() {
+  BLEdgeIterator erEdge = getExit()->succBegin();
+  return(*erEdge);
+}
+
+BLEdgeVector BLInstrumentationDag::getCallPhonyEdges () {
+  BLEdgeVector callEdges;
+
+  for( BLEdgeIterator edge = _edges.begin(), end = _edges.end();
+       edge != end; edge++ ) {
+    if( (*edge)->getType() == BallLarusEdge::CALLEDGE_PHONY )
+      callEdges.push_back(*edge);
+  }
+
+  return callEdges;
+}
+
+// Gets the path counter array
+GlobalVariable* BLInstrumentationDag::getCounterArray() {
+  return _counterArray;
+}
+
+void BLInstrumentationDag::setCounterArray(GlobalVariable* c) {
+  _counterArray = c;
+}
+
+// Calculates the increment for the chords, thereby removing
+// instrumentation from the spanning tree edges. Implementation is based on
+// the algorithm in Figure 4 of [Ball94]
+void BLInstrumentationDag::calculateChordIncrements() {
+  calculateChordIncrementsDfs(0, getRoot(), NULL);
+
+  BLInstrumentationEdge* chord;
+  for(BLEdgeIterator chordEdge = _chordEdges.begin(),
+      end = _chordEdges.end(); chordEdge != end; chordEdge++) {
+    chord = (BLInstrumentationEdge*) *chordEdge;
+    chord->setIncrement(chord->getIncrement() + chord->getWeight());
+  }
+}
+
+// Updates the state when an edge has been split
+void BLInstrumentationDag::splitUpdate(BLInstrumentationEdge* formerEdge,
+                                       BasicBlock* newBlock) {
+  BallLarusNode* oldTarget = formerEdge->getTarget();
+  BallLarusNode* newNode = addNode(newBlock);
+  formerEdge->setTarget(newNode);
+  newNode->addPredEdge(formerEdge);
+
+  DEBUG(dbgs() << "  Edge split: " << *formerEdge << "\n");
+
+  oldTarget->removePredEdge(formerEdge);
+  BallLarusEdge* newEdge = addEdge(newNode, oldTarget,0);
+
+  if( formerEdge->getType() == BallLarusEdge::BACKEDGE ||
+                        formerEdge->getType() == BallLarusEdge::SPLITEDGE) {
+                newEdge->setType(formerEdge->getType());
+    newEdge->setPhonyRoot(formerEdge->getPhonyRoot());
+    newEdge->setPhonyExit(formerEdge->getPhonyExit());
+    formerEdge->setType(BallLarusEdge::NORMAL);
+                formerEdge->setPhonyRoot(NULL);
+    formerEdge->setPhonyExit(NULL);
+  }
+}
+
+// Calculates a spanning tree of the DAG ignoring cycles.  Whichever
+// edges are in the spanning tree will not be instrumented, but this
+// implementation does not try to minimize the instrumentation overhead
+// by trying to find hot edges.
+void BLInstrumentationDag::calculateSpanningTree() {
+  std::stack<BallLarusNode*> dfsStack;
+
+  for(BLNodeIterator nodeIt = _nodes.begin(), end = _nodes.end();
+      nodeIt != end; nodeIt++) {
+    (*nodeIt)->setColor(BallLarusNode::WHITE);
+  }
+
+  dfsStack.push(getRoot());
+  while(dfsStack.size() > 0) {
+    BallLarusNode* node = dfsStack.top();
+    dfsStack.pop();
+
+    if(node->getColor() == BallLarusNode::WHITE)
+      continue;
+
+    BallLarusNode* nextNode;
+    bool forward = true;
+    BLEdgeIterator succEnd = node->succEnd();
+
+    node->setColor(BallLarusNode::WHITE);
+    // first iterate over successors then predecessors
+    for(BLEdgeIterator edge = node->succBegin(), predEnd = node->predEnd();
+        edge != predEnd; edge++) {
+      if(edge == succEnd) {
+        edge = node->predBegin();
+        forward = false;
+      }
+
+      // Ignore split edges
+      if ((*edge)->getType() == BallLarusEdge::SPLITEDGE)
+        continue;
+
+      nextNode = forward? (*edge)->getTarget(): (*edge)->getSource();
+      if(nextNode->getColor() != BallLarusNode::WHITE) {
+        nextNode->setColor(BallLarusNode::WHITE);
+        makeEdgeSpanning((BLInstrumentationEdge*)(*edge));
+      }
+    }
+  }
+
+  for(BLEdgeIterator edge = _edges.begin(), end = _edges.end();
+      edge != end; edge++) {
+    BLInstrumentationEdge* instEdge = (BLInstrumentationEdge*) (*edge);
+      // safe since createEdge is overriden
+    if(!instEdge->isInSpanningTree() && (*edge)->getType()
+        != BallLarusEdge::SPLITEDGE)
+      _chordEdges.push_back(instEdge);
+  }
+}
+
+// Pushes initialization further down in order to group the first
+// increment and initialization.
+void BLInstrumentationDag::pushInitialization() {
+  BLInstrumentationEdge* exitRootEdge =
+                (BLInstrumentationEdge*) getExitRootEdge();
+  exitRootEdge->setIsInitialization(true);
+  pushInitializationFromEdge(exitRootEdge);
+}
+
+// Pushes the path counter increments up in order to group the last path
+// number increment.
+void BLInstrumentationDag::pushCounters() {
+  BLInstrumentationEdge* exitRootEdge =
+    (BLInstrumentationEdge*) getExitRootEdge();
+  exitRootEdge->setIsCounterIncrement(true);
+  pushCountersFromEdge(exitRootEdge);
+}
+
+// Removes phony edges from the successor list of the source, and the
+// predecessor list of the target.
+void BLInstrumentationDag::unlinkPhony() {
+  BallLarusEdge* edge;
+
+  for(BLEdgeIterator next = _edges.begin(),
+      end = _edges.end(); next != end; next++) {
+    edge = (*next);
+
+    if( edge->getType() == BallLarusEdge::BACKEDGE_PHONY ||
+        edge->getType() == BallLarusEdge::SPLITEDGE_PHONY ||
+        edge->getType() == BallLarusEdge::CALLEDGE_PHONY ) {
+      unlinkEdge(edge);
+    }
+  }
+}
+
+// Generate a .dot graph to represent the DAG and pathNumbers
+void BLInstrumentationDag::generateDotGraph() {
+  std::string errorInfo;
+  std::string functionName = getFunction().getName().str();
+  std::string filename = "pathdag." + functionName + ".dot";
+
+  DEBUG (dbgs() << "Writing '" << filename << "'...\n");
+  raw_fd_ostream dotFile(filename.c_str(), errorInfo);
+
+  if (!errorInfo.empty()) {
+    errs() << "Error opening '" << filename.c_str() <<"' for writing!";
+    errs() << "\n";
+    return;
+  }
+
+  dotFile << "digraph " << functionName << " {\n";
+
+  for( BLEdgeIterator edge = _edges.begin(), end = _edges.end();
+       edge != end; edge++) {
+    std::string sourceName = (*edge)->getSource()->getName();
+    std::string targetName = (*edge)->getTarget()->getName();
+
+    dotFile << "\t\"" << sourceName.c_str() << "\" -> \""
+            << targetName.c_str() << "\" ";
+
+    long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement();
+
+    switch( (*edge)->getType() ) {
+    case BallLarusEdge::NORMAL:
+      dotFile << "[label=" << inc << "] [color=black];\n";
+      break;
+
+    case BallLarusEdge::BACKEDGE:
+      dotFile << "[color=cyan];\n";
+      break;
+
+    case BallLarusEdge::BACKEDGE_PHONY:
+      dotFile << "[label=" << inc
+              << "] [color=blue];\n";
+      break;
+
+    case BallLarusEdge::SPLITEDGE:
+      dotFile << "[color=violet];\n";
+      break;
+
+    case BallLarusEdge::SPLITEDGE_PHONY:
+      dotFile << "[label=" << inc << "] [color=red];\n";
+      break;
+
+    case BallLarusEdge::CALLEDGE_PHONY:
+      dotFile << "[label=" << inc     << "] [color=green];\n";
+      break;
+    }
+  }
+
+  dotFile << "}\n";
+}
+
+// Allows subclasses to determine which type of Node is created.
+// Override this method to produce subclasses of BallLarusNode if
+// necessary. The destructor of BallLarusDag will call free on each pointer
+// created.
+BallLarusNode* BLInstrumentationDag::createNode(BasicBlock* BB) {
+  return( new BLInstrumentationNode(BB) );
+}
+
+// Allows subclasses to determine which type of Edge is created.
+// Override this method to produce subclasses of BallLarusEdge if
+// necessary. The destructor of BallLarusDag will call free on each pointer
+// created.
+BallLarusEdge* BLInstrumentationDag::createEdge(BallLarusNode* source,
+                                                BallLarusNode* target, unsigned edgeNumber) {
+  // One can cast from BallLarusNode to BLInstrumentationNode since createNode
+  // is overriden to produce BLInstrumentationNode.
+  return( new BLInstrumentationEdge((BLInstrumentationNode*)source,
+                                    (BLInstrumentationNode*)target) );
+}
+
+// Sets the Value corresponding to the pathNumber register, constant,
+// or phinode.  Used by the instrumentation code to remember path
+// number Values.
+Value* BLInstrumentationNode::getStartingPathNumber(){
+  return(_startingPathNumber);
+}
+
+// Sets the Value of the pathNumber.  Used by the instrumentation code.
+void BLInstrumentationNode::setStartingPathNumber(Value* pathNumber) {
+  DEBUG(dbgs() << "  SPN-" << getName() << " <-- " << (pathNumber ?
+                                                       pathNumber->getName() :
+                                                       "unused") << "\n");
+  _startingPathNumber = pathNumber;
+}
+
+Value* BLInstrumentationNode::getEndingPathNumber(){
+  return(_endingPathNumber);
+}
+
+void BLInstrumentationNode::setEndingPathNumber(Value* pathNumber) {
+  DEBUG(dbgs() << "  EPN-" << getName() << " <-- "
+               << (pathNumber ? pathNumber->getName() : "unused") << "\n");
+  _endingPathNumber = pathNumber;
+}
+
+// Get the PHINode Instruction for this node.  Used by instrumentation
+// code.
+PHINode* BLInstrumentationNode::getPathPHI() {
+  return(_pathPHI);
+}
+
+// Set the PHINode Instruction for this node.  Used by instrumentation
+// code.
+void BLInstrumentationNode::setPathPHI(PHINode* pathPHI) {
+  _pathPHI = pathPHI;
+}
+
+// Removes the edge from the appropriate predecessor and successor
+// lists.
+void BLInstrumentationDag::unlinkEdge(BallLarusEdge* edge) {
+  if(edge == getExitRootEdge())
+    DEBUG(dbgs() << " Removing exit->root edge\n");
+
+  edge->getSource()->removeSuccEdge(edge);
+  edge->getTarget()->removePredEdge(edge);
+}
+
+// Makes an edge part of the spanning tree.
+void BLInstrumentationDag::makeEdgeSpanning(BLInstrumentationEdge* edge) {
+  edge->setIsInSpanningTree(true);
+  _treeEdges.push_back(edge);
+}
+
+// Pushes initialization and calls itself recursively.
+void BLInstrumentationDag::pushInitializationFromEdge(
+  BLInstrumentationEdge* edge) {
+  BallLarusNode* target;
+
+  target = edge->getTarget();
+  if( target->getNumberPredEdges() > 1 || target == getExit() ) {
+    return;
+  } else {
+    for(BLEdgeIterator next = target->succBegin(),
+          end = target->succEnd(); next != end; next++) {
+      BLInstrumentationEdge* intoEdge = (BLInstrumentationEdge*) *next;
+
+      // Skip split edges
+      if (intoEdge->getType() == BallLarusEdge::SPLITEDGE)
+        continue;
+
+      intoEdge->setIncrement(intoEdge->getIncrement() +
+                             edge->getIncrement());
+      intoEdge->setIsInitialization(true);
+      pushInitializationFromEdge(intoEdge);
+    }
+
+    edge->setIncrement(0);
+    edge->setIsInitialization(false);
+  }
+}
+
+// Pushes path counter increments up recursively.
+void BLInstrumentationDag::pushCountersFromEdge(BLInstrumentationEdge* edge) {
+  BallLarusNode* source;
+
+  source = edge->getSource();
+  if(source->getNumberSuccEdges() > 1 || source == getRoot()
+     || edge->isInitialization()) {
+    return;
+  } else {
+    for(BLEdgeIterator previous = source->predBegin(),
+          end = source->predEnd(); previous != end; previous++) {
+      BLInstrumentationEdge* fromEdge = (BLInstrumentationEdge*) *previous;
+
+      // Skip split edges
+      if (fromEdge->getType() == BallLarusEdge::SPLITEDGE)
+        continue;
+
+      fromEdge->setIncrement(fromEdge->getIncrement() +
+                             edge->getIncrement());
+      fromEdge->setIsCounterIncrement(true);
+      pushCountersFromEdge(fromEdge);
+    }
+
+    edge->setIncrement(0);
+    edge->setIsCounterIncrement(false);
+  }
+}
+
+// Depth first algorithm for determining the chord increments.
+void BLInstrumentationDag::calculateChordIncrementsDfs(long weight,
+                                                       BallLarusNode* v, BallLarusEdge* e) {
+  BLInstrumentationEdge* f;
+
+  for(BLEdgeIterator treeEdge = _treeEdges.begin(),
+        end = _treeEdges.end(); treeEdge != end; treeEdge++) {
+    f = (BLInstrumentationEdge*) *treeEdge;
+    if(e != f && v == f->getTarget()) {
+      calculateChordIncrementsDfs(
+        calculateChordIncrementsDir(e,f)*(weight) +
+        f->getWeight(), f->getSource(), f);
+    }
+    if(e != f && v == f->getSource()) {
+      calculateChordIncrementsDfs(
+        calculateChordIncrementsDir(e,f)*(weight) +
+        f->getWeight(), f->getTarget(), f);
+    }
+  }
+
+  for(BLEdgeIterator chordEdge = _chordEdges.begin(),
+        end = _chordEdges.end(); chordEdge != end; chordEdge++) {
+    f = (BLInstrumentationEdge*) *chordEdge;
+    if(v == f->getSource() || v == f->getTarget()) {
+      f->setIncrement(f->getIncrement() +
+                      calculateChordIncrementsDir(e,f)*weight);
+    }
+  }
+}
+
+// Determines the relative direction of two edges.
+int BLInstrumentationDag::calculateChordIncrementsDir(BallLarusEdge* e,
+                                                      BallLarusEdge* f) {
+  if( e == NULL)
+    return(1);
+  else if(e->getSource() == f->getTarget()
+          || e->getTarget() == f->getSource())
+    return(1);
+
+  return(-1);
+}
+
+// Creates an increment constant representing incr.
+ConstantInt* PathProfiler::createIncrementConstant(long incr,
+                                                   int bitsize) {
+  return(ConstantInt::get(IntegerType::get(*Context, 32), incr));
+}
+
+// Creates an increment constant representing the value in
+// edge->getIncrement().
+ConstantInt* PathProfiler::createIncrementConstant(
+  BLInstrumentationEdge* edge) {
+  return(createIncrementConstant(edge->getIncrement(), 32));
+}
+
+// Finds the insertion point after pathNumber in block.  PathNumber may
+// be NULL.
+BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value*
+                                                     pathNumber) {
+  if(pathNumber == NULL || isa<ConstantInt>(pathNumber)
+     || (((Instruction*)(pathNumber))->getParent()) != block) {
+    return(block->getFirstInsertionPt());
+  } else {
+    Instruction* pathNumberInst = (Instruction*) (pathNumber);
+    BasicBlock::iterator insertPoint;
+    BasicBlock::iterator end = block->end();
+
+    for(insertPoint = block->begin();
+        insertPoint != end; insertPoint++) {
+      Instruction* insertInst = &(*insertPoint);
+
+      if(insertInst == pathNumberInst)
+        return(++insertPoint);
+    }
+
+    return(insertPoint);
+  }
+}
+
+// A PHINode is created in the node, and its values initialized to -1U.
+void PathProfiler::preparePHI(BLInstrumentationNode* node) {
+  BasicBlock* block = node->getBlock();
+  BasicBlock::iterator insertPoint = block->getFirstInsertionPt();
+  pred_iterator PB = pred_begin(node->getBlock()),
+          PE = pred_end(node->getBlock());
+  PHINode* phi = PHINode::Create(Type::getInt32Ty(*Context),
+                                 std::distance(PB, PE), "pathNumber",
+                                 insertPoint );
+  node->setPathPHI(phi);
+  node->setStartingPathNumber(phi);
+  node->setEndingPathNumber(phi);
+
+  for(pred_iterator predIt = PB; predIt != PE; predIt++) {
+    BasicBlock* pred = (*predIt);
+
+    if(pred != NULL)
+      phi->addIncoming(createIncrementConstant((long)-1, 32), pred);
+  }
+}
+
+// Inserts source's pathNumber Value* into target.  Target may or may not
+// have multiple predecessors, and may or may not have its phiNode
+// initalized.
+void PathProfiler::pushValueIntoNode(BLInstrumentationNode* source,
+                                     BLInstrumentationNode* target) {
+  if(target->getBlock() == NULL)
+    return;
+
+
+  if(target->getNumberPredEdges() <= 1) {
+    assert(target->getStartingPathNumber() == NULL &&
+           "Target already has path number");
+    target->setStartingPathNumber(source->getEndingPathNumber());
+    target->setEndingPathNumber(source->getEndingPathNumber());
+    DEBUG(dbgs() << "  Passing path number"
+          << (source->getEndingPathNumber() ? "" : " (null)")
+          << " value through.\n");
+  } else {
+    if(target->getPathPHI() == NULL) {
+      DEBUG(dbgs() << "  Initializing PHI node for block '"
+            << target->getName() << "'\n");
+      preparePHI(target);
+    }
+    pushValueIntoPHI(target, source);
+    DEBUG(dbgs() << "  Passing number value into PHI for block '"
+          << target->getName() << "'\n");
+  }
+}
+
+// Inserts source's pathNumber Value* into the appropriate slot of
+// target's phiNode.
+void PathProfiler::pushValueIntoPHI(BLInstrumentationNode* target,
+                                    BLInstrumentationNode* source) {
+  PHINode* phi = target->getPathPHI();
+  assert(phi != NULL && "  Tried to push value into node with PHI, but node"
+         " actually had no PHI.");
+  phi->removeIncomingValue(source->getBlock(), false);
+  phi->addIncoming(source->getEndingPathNumber(), source->getBlock());
+}
+
+// The Value* in node, oldVal,  is updated with a Value* correspodning to
+// oldVal + addition.
+void PathProfiler::insertNumberIncrement(BLInstrumentationNode* node,
+                                         Value* addition, bool atBeginning) {
+  BasicBlock* block = node->getBlock();
+  assert(node->getStartingPathNumber() != NULL);
+  assert(node->getEndingPathNumber() != NULL);
+
+  BasicBlock::iterator insertPoint;
+
+  if( atBeginning )
+    insertPoint = block->getFirstInsertionPt();
+  else
+    insertPoint = block->getTerminator();
+
+  DEBUG(errs() << "  Creating addition instruction.\n");
+  Value* newpn = BinaryOperator::Create(Instruction::Add,
+                                        node->getStartingPathNumber(),
+                                        addition, "pathNumber", insertPoint);
+
+  node->setEndingPathNumber(newpn);
+
+  if( atBeginning )
+    node->setStartingPathNumber(newpn);
+}
+
+// Creates a counter increment in the given node.  The Value* in node is
+// taken as the index into an array or hash table.  The hash table access
+// is a call to the runtime.
+void PathProfiler::insertCounterIncrement(Value* incValue,
+                                          BasicBlock::iterator insertPoint,
+                                          BLInstrumentationDag* dag,
+                                          bool increment) {
+  // Counter increment for array
+  if( dag->getNumberOfPaths() <= HASH_THRESHHOLD ) {
+    // Get pointer to the array location
+    std::vector<Value*> gepIndices(2);
+    gepIndices[0] = Constant::getNullValue(Type::getInt32Ty(*Context));
+    gepIndices[1] = incValue;
+
+    GetElementPtrInst* pcPointer =
+      GetElementPtrInst::Create(dag->getCounterArray(), gepIndices,
+                                "counterInc", insertPoint);
+
+    // Load from the array - call it oldPC
+    LoadInst* oldPc = new LoadInst(pcPointer, "oldPC", insertPoint);
+
+    // Test to see whether adding 1 will overflow the counter
+    ICmpInst* isMax = new ICmpInst(insertPoint, CmpInst::ICMP_ULT, oldPc,
+                                   createIncrementConstant(0xffffffff, 32),
+                                   "isMax");
+
+    // Select increment for the path counter based on overflow
+    SelectInst* inc =
+      SelectInst::Create( isMax, createIncrementConstant(increment?1:-1,32),
+                          createIncrementConstant(0,32),
+                          "pathInc", insertPoint);
+
+    // newPc = oldPc + inc
+    BinaryOperator* newPc = BinaryOperator::Create(Instruction::Add,
+                                                   oldPc, inc, "newPC",
+                                                   insertPoint);
+
+    // Store back in to the array
+    new StoreInst(newPc, pcPointer, insertPoint);
+  } else { // Counter increment for hash
+    std::vector<Value*> args(2);
+    args[0] = ConstantInt::get(Type::getInt32Ty(*Context),
+                               currentFunctionNumber);
+    args[1] = incValue;
+
+    CallInst::Create(
+      increment ? llvmIncrementHashFunction : llvmDecrementHashFunction,
+      args, "", insertPoint);
+  }
+}
+
+// Inserts instrumentation for the given edge
+//
+// Pre: The edge's source node has pathNumber set if edge is non zero
+// path number increment.
+//
+// Post: Edge's target node has a pathNumber set to the path number Value
+// corresponding to the value of the path register after edge's
+// execution.
+//
+// FIXME: This should be reworked so it's not recursive.
+void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge,
+                                                   BLInstrumentationDag* dag) {
+  // Mark the edge as instrumented
+  edge->setHasInstrumentation(true);
+  DEBUG(dbgs() << "\nInstrumenting edge: " << (*edge) << "\n");
+
+  // create a new node for this edge's instrumentation
+  splitCritical(edge, dag);
+
+  BLInstrumentationNode* sourceNode = (BLInstrumentationNode*)edge->getSource();
+  BLInstrumentationNode* targetNode = (BLInstrumentationNode*)edge->getTarget();
+  BLInstrumentationNode* instrumentNode;
+  BLInstrumentationNode* nextSourceNode;
+
+  bool atBeginning = false;
+
+  // Source node has only 1 successor so any information can be simply
+  // inserted in to it without splitting
+  if( sourceNode->getBlock() && sourceNode->getNumberSuccEdges() <= 1) {
+    DEBUG(dbgs() << "  Potential instructions to be placed in: "
+          << sourceNode->getName() << " (at end)\n");
+    instrumentNode = sourceNode;
+    nextSourceNode = targetNode; // ... since we never made any new nodes
+  }
+
+  // The target node only has one predecessor, so we can safely insert edge
+  // instrumentation into it. If there was splitting, it must have been
+  // successful.
+  else if( targetNode->getNumberPredEdges() == 1 ) {
+    DEBUG(dbgs() << "  Potential instructions to be placed in: "
+          << targetNode->getName() << " (at beginning)\n");
+    pushValueIntoNode(sourceNode, targetNode);
+    instrumentNode = targetNode;
+    nextSourceNode = NULL; // ... otherwise we'll just keep splitting
+    atBeginning = true;
+  }
+
+  // Somehow, splitting must have failed.
+  else {
+    errs() << "Instrumenting could not split a critical edge.\n";
+    DEBUG(dbgs() << "  Couldn't split edge " << (*edge) << ".\n");
+    return;
+  }
+
+  // Insert instrumentation if this is a back or split edge
+  if( edge->getType() == BallLarusEdge::BACKEDGE ||
+      edge->getType() == BallLarusEdge::SPLITEDGE ) {
+    BLInstrumentationEdge* top =
+      (BLInstrumentationEdge*) edge->getPhonyRoot();
+    BLInstrumentationEdge* bottom =
+      (BLInstrumentationEdge*) edge->getPhonyExit();
+
+    assert( top->isInitialization() && " Top phony edge did not"
+            " contain a path number initialization.");
+    assert( bottom->isCounterIncrement() && " Bottom phony edge"
+            " did not contain a path counter increment.");
+
+    // split edge has yet to be initialized
+    if( !instrumentNode->getEndingPathNumber() ) {
+      instrumentNode->setStartingPathNumber(createIncrementConstant(0,32));
+      instrumentNode->setEndingPathNumber(createIncrementConstant(0,32));
+    }
+
+    BasicBlock::iterator insertPoint = atBeginning ?
+      instrumentNode->getBlock()->getFirstInsertionPt() :
+      instrumentNode->getBlock()->getTerminator();
+
+    // add information from the bottom edge, if it exists
+    if( bottom->getIncrement() ) {
+      Value* newpn =
+        BinaryOperator::Create(Instruction::Add,
+                               instrumentNode->getStartingPathNumber(),
+                               createIncrementConstant(bottom),
+                               "pathNumber", insertPoint);
+      instrumentNode->setEndingPathNumber(newpn);
+    }
+
+    insertCounterIncrement(instrumentNode->getEndingPathNumber(),
+                           insertPoint, dag);
+
+    if( atBeginning )
+      instrumentNode->setStartingPathNumber(createIncrementConstant(top));
+
+    instrumentNode->setEndingPathNumber(createIncrementConstant(top));
+
+    // Check for path counter increments
+    if( top->isCounterIncrement() ) {
+      insertCounterIncrement(instrumentNode->getEndingPathNumber(),
+                             instrumentNode->getBlock()->getTerminator(),dag);
+      instrumentNode->setEndingPathNumber(0);
+    }
+  }
+
+  // Insert instrumentation if this is a normal edge
+  else {
+    BasicBlock::iterator insertPoint = atBeginning ?
+      instrumentNode->getBlock()->getFirstInsertionPt() :
+      instrumentNode->getBlock()->getTerminator();
+
+    if( edge->isInitialization() ) { // initialize path number
+      instrumentNode->setEndingPathNumber(createIncrementConstant(edge));
+    } else if( edge->getIncrement() )       {// increment path number
+      Value* newpn =
+        BinaryOperator::Create(Instruction::Add,
+                               instrumentNode->getStartingPathNumber(),
+                               createIncrementConstant(edge),
+                               "pathNumber", insertPoint);
+      instrumentNode->setEndingPathNumber(newpn);
+
+      if( atBeginning )
+        instrumentNode->setStartingPathNumber(newpn);
+    }
+
+    // Check for path counter increments
+    if( edge->isCounterIncrement() ) {
+      insertCounterIncrement(instrumentNode->getEndingPathNumber(),
+                             insertPoint, dag);
+      instrumentNode->setEndingPathNumber(0);
+    }
+  }
+
+  // Push it along
+  if (nextSourceNode && instrumentNode->getEndingPathNumber())
+    pushValueIntoNode(instrumentNode, nextSourceNode);
+
+  // Add all the successors
+  for( BLEdgeIterator next = targetNode->succBegin(),
+         end = targetNode->succEnd(); next != end; next++ ) {
+    // So long as it is un-instrumented, add it to the list
+    if( !((BLInstrumentationEdge*)(*next))->hasInstrumentation() )
+      insertInstrumentationStartingAt((BLInstrumentationEdge*)*next,dag);
+    else
+      DEBUG(dbgs() << "  Edge " << *(BLInstrumentationEdge*)(*next)
+            << " already instrumented.\n");
+  }
+}
+
+// Inserts instrumentation according to the marked edges in dag.  Phony edges
+// must be unlinked from the DAG, but accessible from the backedges.  Dag
+// must have initializations, path number increments, and counter increments
+// present.
+//
+// Counter storage is created here.
+void PathProfiler::insertInstrumentation(
+  BLInstrumentationDag& dag, Module &M) {
+
+  BLInstrumentationEdge* exitRootEdge =
+    (BLInstrumentationEdge*) dag.getExitRootEdge();
+  insertInstrumentationStartingAt(exitRootEdge, &dag);
+
+  // Iterate through each call edge and apply the appropriate hash increment
+  // and decrement functions
+  BLEdgeVector callEdges = dag.getCallPhonyEdges();
+  for( BLEdgeIterator edge = callEdges.begin(),
+         end = callEdges.end(); edge != end; edge++ ) {
+    BLInstrumentationNode* node =
+      (BLInstrumentationNode*)(*edge)->getSource();
+    BasicBlock::iterator insertPoint = node->getBlock()->getFirstInsertionPt();
+
+    // Find the first function call
+    while( ((Instruction&)(*insertPoint)).getOpcode() != Instruction::Call )
+      insertPoint++;
+
+    DEBUG(dbgs() << "\nInstrumenting method call block '"
+                 << node->getBlock()->getName() << "'\n");
+    DEBUG(dbgs() << "   Path number initialized: "
+                 << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n");
+
+    Value* newpn;
+    if( node->getStartingPathNumber() ) {
+      long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement();
+      if ( inc )
+        newpn = BinaryOperator::Create(Instruction::Add,
+                                       node->getStartingPathNumber(),
+                                       createIncrementConstant(inc,32),
+                                       "pathNumber", insertPoint);
+      else
+        newpn = node->getStartingPathNumber();
+    } else {
+      newpn = (Value*)createIncrementConstant(
+        ((BLInstrumentationEdge*)(*edge))->getIncrement(), 32);
+    }
+
+    insertCounterIncrement(newpn, insertPoint, &dag);
+    insertCounterIncrement(newpn, node->getBlock()->getTerminator(),
+                           &dag, false);
+  }
+}
+
+// Entry point of the module
+void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit,
+                                 Function &F, Module &M) {
+  // Build DAG from CFG
+  BLInstrumentationDag dag = BLInstrumentationDag(F);
+  dag.init();
+
+  // give each path a unique integer value
+  dag.calculatePathNumbers();
+
+  // modify path increments to increase the efficiency
+  // of instrumentation
+  dag.calculateSpanningTree();
+  dag.calculateChordIncrements();
+  dag.pushInitialization();
+  dag.pushCounters();
+  dag.unlinkPhony();
+
+  // potentially generate .dot graph for the dag
+  if (DotPathDag)
+    dag.generateDotGraph ();
+
+  // Should we store the information in an array or hash
+  if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) {
+    Type* t = ArrayType::get(Type::getInt32Ty(*Context),
+                                   dag.getNumberOfPaths());
+
+    dag.setCounterArray(new GlobalVariable(M, t, false,
+                                           GlobalValue::InternalLinkage,
+                                           Constant::getNullValue(t), ""));
+  }
+
+  insertInstrumentation(dag, M);
+
+  // Add to global function reference table
+  unsigned type;
+  Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context);
+
+  if( dag.getNumberOfPaths() <= HASH_THRESHHOLD )
+    type = ProfilingArray;
+  else
+    type = ProfilingHash;
+
+  std::vector<Constant*> entryArray(3);
+  entryArray[0] = createIncrementConstant(type,32);
+  entryArray[1] = createIncrementConstant(dag.getNumberOfPaths(),32);
+  entryArray[2] = dag.getCounterArray() ?
+    ConstantExpr::getBitCast(dag.getCounterArray(), voidPtr) :
+    Constant::getNullValue(voidPtr);
+
+  StructType* at = ftEntryTypeBuilder::get(*Context);
+  ConstantStruct* functionEntry =
+    (ConstantStruct*)ConstantStruct::get(at, entryArray);
+  ftInit.push_back(functionEntry);
+}
+
+// Output the bitcode if we want to observe instrumentation changess
+#define PRINT_MODULE dbgs() <<                               \
+  "\n\n============= MODULE BEGIN ===============\n" << M << \
+  "\n============== MODULE END ================\n"
+
+bool PathProfiler::runOnModule(Module &M) {
+  Context = &M.getContext();
+
+  DEBUG(dbgs()
+        << "****************************************\n"
+        << "****************************************\n"
+        << "**                                    **\n"
+        << "**   PATH PROFILING INSTRUMENTATION   **\n"
+        << "**                                    **\n"
+        << "****************************************\n"
+        << "****************************************\n");
+
+  // No main, no instrumentation!
+  Function *Main = M.getFunction("main");
+
+  // Using fortran? ... this kind of works
+  if (!Main)
+    Main = M.getFunction("MAIN__");
+
+  if (!Main) {
+    errs() << "WARNING: cannot insert path profiling into a module"
+           << " with no main function!\n";
+    return false;
+  }
+
+  llvmIncrementHashFunction = M.getOrInsertFunction(
+    "llvm_increment_path_count",
+    Type::getVoidTy(*Context), // return type
+    Type::getInt32Ty(*Context), // function number
+    Type::getInt32Ty(*Context), // path number
+    NULL );
+
+  llvmDecrementHashFunction = M.getOrInsertFunction(
+    "llvm_decrement_path_count",
+    Type::getVoidTy(*Context), // return type
+    Type::getInt32Ty(*Context), // function number
+    Type::getInt32Ty(*Context), // path number
+    NULL );
+
+  std::vector<Constant*> ftInit;
+  unsigned functionNumber = 0;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) {
+    if (F->isDeclaration())
+      continue;
+
+    DEBUG(dbgs() << "Function: " << F->getName() << "\n");
+    functionNumber++;
+
+    // set function number
+    currentFunctionNumber = functionNumber;
+    runOnFunction(ftInit, *F, M);
+  }
+
+  Type *t = ftEntryTypeBuilder::get(*Context);
+  ArrayType* ftArrayType = ArrayType::get(t, ftInit.size());
+  Constant* ftInitConstant = ConstantArray::get(ftArrayType, ftInit);
+
+  DEBUG(dbgs() << " ftArrayType:" << *ftArrayType << "\n");
+
+  GlobalVariable* functionTable =
+    new GlobalVariable(M, ftArrayType, false, GlobalValue::InternalLinkage,
+                       ftInitConstant, "functionPathTable");
+  Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0);
+  InsertProfilingInitCall(Main, "llvm_start_path_profiling", functionTable,
+                          PointerType::getUnqual(eltType));
+
+  DEBUG(PRINT_MODULE);
+
+  return true;
+}
+
+// If this edge is a critical edge, then inserts a node at this edge.
+// This edge becomes the first edge, and a new BallLarusEdge is created.
+// Returns true if the edge was split
+bool PathProfiler::splitCritical(BLInstrumentationEdge* edge,
+                                 BLInstrumentationDag* dag) {
+  unsigned succNum = edge->getSuccessorNumber();
+  BallLarusNode* sourceNode = edge->getSource();
+  BallLarusNode* targetNode = edge->getTarget();
+  BasicBlock* sourceBlock = sourceNode->getBlock();
+  BasicBlock* targetBlock = targetNode->getBlock();
+
+  if(sourceBlock == NULL || targetBlock == NULL
+     || sourceNode->getNumberSuccEdges() <= 1
+     || targetNode->getNumberPredEdges() == 1 ) {
+    return(false);
+  }
+
+  TerminatorInst* terminator = sourceBlock->getTerminator();
+
+  if( SplitCriticalEdge(terminator, succNum, this, false)) {
+    BasicBlock* newBlock = terminator->getSuccessor(succNum);
+    dag->splitUpdate(edge, newBlock);
+    return(true);
+  } else
+    return(false);
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp
new file mode 100644
index 000000000000..4b3de6d7fc38
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.cpp
@@ -0,0 +1,169 @@
+//===- ProfilingUtils.cpp - Helper functions shared by profilers ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a few helper functions which are used by profile
+// instrumentation code to instrument the code.  This allows the profiler pass
+// to worry about *what* to insert, and these functions take care of *how* to do
+// it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProfilingUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+
+void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
+                                   GlobalValue *Array,
+                                   PointerType *arrayType) {
+  LLVMContext &Context = MainFn->getContext();
+  Type *ArgVTy =
+    PointerType::getUnqual(Type::getInt8PtrTy(Context));
+  PointerType *UIntPtr = arrayType ? arrayType :
+    Type::getInt32PtrTy(Context);
+  Module &M = *MainFn->getParent();
+  Constant *InitFn = M.getOrInsertFunction(FnName, Type::getInt32Ty(Context),
+                                           Type::getInt32Ty(Context),
+                                           ArgVTy, UIntPtr,
+                                           Type::getInt32Ty(Context),
+                                           (Type *)0);
+
+  // This could force argc and argv into programs that wouldn't otherwise have
+  // them, but instead we just pass null values in.
+  std::vector<Value*> Args(4);
+  Args[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+  Args[1] = Constant::getNullValue(ArgVTy);
+
+  // Skip over any allocas in the entry block.
+  BasicBlock *Entry = MainFn->begin();
+  BasicBlock::iterator InsertPos = Entry->begin();
+  while (isa<AllocaInst>(InsertPos)) ++InsertPos;
+
+  std::vector<Constant*> GEPIndices(2,
+                             Constant::getNullValue(Type::getInt32Ty(Context)));
+  unsigned NumElements = 0;
+  if (Array) {
+    Args[2] = ConstantExpr::getGetElementPtr(Array, GEPIndices);
+    NumElements =
+      cast<ArrayType>(Array->getType()->getElementType())->getNumElements();
+  } else {
+    // If this profiling instrumentation doesn't have a constant array, just
+    // pass null.
+    Args[2] = ConstantPointerNull::get(UIntPtr);
+  }
+  Args[3] = ConstantInt::get(Type::getInt32Ty(Context), NumElements);
+
+  CallInst *InitCall = CallInst::Create(InitFn, Args, "newargc", InsertPos);
+
+  // If argc or argv are not available in main, just pass null values in.
+  Function::arg_iterator AI;
+  switch (MainFn->arg_size()) {
+  default:
+  case 2:
+    AI = MainFn->arg_begin(); ++AI;
+    if (AI->getType() != ArgVTy) {
+      Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy,
+                                                            false);
+      InitCall->setArgOperand(1,
+          CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall));
+    } else {
+      InitCall->setArgOperand(1, AI);
+    }
+    /* FALL THROUGH */
+
+  case 1:
+    AI = MainFn->arg_begin();
+    // If the program looked at argc, have it look at the return value of the
+    // init call instead.
+    if (!AI->getType()->isIntegerTy(32)) {
+      Instruction::CastOps opcode;
+      if (!AI->use_empty()) {
+        opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true);
+        AI->replaceAllUsesWith(
+          CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos));
+      }
+      opcode = CastInst::getCastOpcode(AI, true,
+                                       Type::getInt32Ty(Context), true);
+      InitCall->setArgOperand(0,
+          CastInst::Create(opcode, AI, Type::getInt32Ty(Context),
+                           "argc.cast", InitCall));
+    } else {
+      AI->replaceAllUsesWith(InitCall);
+      InitCall->setArgOperand(0, AI);
+    }
+
+  case 0: break;
+  }
+}
+
+void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+                                   GlobalValue *CounterArray, bool beginning) {
+  // Insert the increment after any alloca or PHI instructions...
+  BasicBlock::iterator InsertPos = beginning ? BB->getFirstInsertionPt() :
+                                   BB->getTerminator();
+  while (isa<AllocaInst>(InsertPos))
+    ++InsertPos;
+
+  LLVMContext &Context = BB->getContext();
+
+  // Create the getelementptr constant expression
+  std::vector<Constant*> Indices(2);
+  Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+  Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum);
+  Constant *ElementPtr =
+    ConstantExpr::getGetElementPtr(CounterArray, Indices);
+
+  // Load, increment and store the value back.
+  Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
+  Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal,
+                                 ConstantInt::get(Type::getInt32Ty(Context), 1),
+                                         "NewFuncCounter", InsertPos);
+  new StoreInst(NewVal, ElementPtr, InsertPos);
+}
+
+void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) {
+  // llvm.global_dtors is an array of type { i32, void ()* }. Prepare those
+  // types.
+  Type *GlobalDtorElems[2] = {
+    Type::getInt32Ty(Mod->getContext()),
+    FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo()
+  };
+  StructType *GlobalDtorElemTy =
+      StructType::get(Mod->getContext(), GlobalDtorElems, false);
+
+  // Construct the new element we'll be adding.
+  Constant *Elem[2] = {
+    ConstantInt::get(Type::getInt32Ty(Mod->getContext()), 65535),
+    ConstantExpr::getBitCast(Callee, GlobalDtorElems[1])
+  };
+
+  // If llvm.global_dtors exists, make a copy of the things in its list and
+  // delete it, to replace it with one that has a larger array type.
+  std::vector<Constant *> dtors;
+  if (GlobalVariable *GlobalDtors = Mod->getNamedGlobal("llvm.global_dtors")) {
+    if (ConstantArray *InitList =
+        dyn_cast<ConstantArray>(GlobalDtors->getInitializer())) {
+      for (unsigned i = 0, e = InitList->getType()->getNumElements();
+           i != e; ++i)
+        dtors.push_back(cast<Constant>(InitList->getOperand(i)));
+    }
+    GlobalDtors->eraseFromParent();
+  }
+
+  // Build up llvm.global_dtors with our new item in it.
+  GlobalVariable *GlobalDtors = new GlobalVariable(
+      *Mod, ArrayType::get(GlobalDtorElemTy, 1), false,
+      GlobalValue::AppendingLinkage, NULL, "llvm.global_dtors");
+                                    
+  dtors.push_back(ConstantStruct::get(GlobalDtorElemTy, Elem));
+  GlobalDtors->setInitializer(ConstantArray::get(
+      cast<ArrayType>(GlobalDtors->getType()->getElementType()), dtors));
+}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.h b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.h
new file mode 100644
index 000000000000..09b22171ff04
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ProfilingUtils.h
@@ -0,0 +1,36 @@
+//===- ProfilingUtils.h - Helper functions shared by profilers --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a few helper functions which are used by profile
+// instrumentation code to instrument the code.  This allows the profiler pass
+// to worry about *what* to insert, and these functions take care of *how* to do
+// it.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PROFILINGUTILS_H
+#define PROFILINGUTILS_H
+
+namespace llvm {
+  class BasicBlock;
+  class Function;
+  class GlobalValue;
+  class Module;
+  class PointerType;
+
+  void InsertProfilingInitCall(Function *MainFn, const char *FnName,
+                               GlobalValue *Arr = 0,
+                               PointerType *arrayType = 0);
+  void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
+                               GlobalValue *CounterArray,
+                               bool beginning = true);
+  void InsertProfilingShutdownCall(Function *Callee, Module *Mod);
+}
+
+#endif
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
new file mode 100644
index 000000000000..299060a42fe8
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -0,0 +1,585 @@
+//===-- ThreadSanitizer.cpp - race detector -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer, a race detector.
+//
+// The tool is under development, for the details about previous versions see
+// http://code.google.com/p/data-race-test
+//
+// The instrumentation phase is quite simple:
+//   - Insert calls to run-time library before every memory access.
+//      - Optimizations may apply to avoid instrumenting some of the accesses.
+//   - Insert calls at function entry/exit.
+// The rest is handled by the run-time library.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tsan"
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BlackList.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+static cl::opt<std::string>  ClBlacklistFile("tsan-blacklist",
+       cl::desc("Blacklist file"), cl::Hidden);
+static cl::opt<bool>  ClInstrumentMemoryAccesses(
+    "tsan-instrument-memory-accesses", cl::init(true),
+    cl::desc("Instrument memory accesses"), cl::Hidden);
+static cl::opt<bool>  ClInstrumentFuncEntryExit(
+    "tsan-instrument-func-entry-exit", cl::init(true),
+    cl::desc("Instrument function entry and exit"), cl::Hidden);
+static cl::opt<bool>  ClInstrumentAtomics(
+    "tsan-instrument-atomics", cl::init(true),
+    cl::desc("Instrument atomics"), cl::Hidden);
+static cl::opt<bool>  ClInstrumentMemIntrinsics(
+    "tsan-instrument-memintrinsics", cl::init(true),
+    cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOmittedReadsBeforeWrite,
+          "Number of reads ignored due to following writes");
+STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
+STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
+STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
+STATISTIC(NumOmittedReadsFromConstantGlobals,
+          "Number of reads from constant globals");
+STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
+
+namespace {
+
+/// ThreadSanitizer: instrument the code in module to find races.
+struct ThreadSanitizer : public FunctionPass {
+  ThreadSanitizer(StringRef BlacklistFile = StringRef())
+      : FunctionPass(ID),
+        TD(0),
+        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+                                            : BlacklistFile) { }
+  const char *getPassName() const;
+  bool runOnFunction(Function &F);
+  bool doInitialization(Module &M);
+  static char ID;  // Pass identification, replacement for typeid.
+
+ private:
+  void initializeCallbacks(Module &M);
+  bool instrumentLoadOrStore(Instruction *I);
+  bool instrumentAtomic(Instruction *I);
+  bool instrumentMemIntrinsic(Instruction *I);
+  void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local,
+                                      SmallVectorImpl<Instruction*> &All);
+  bool addrPointsToConstantData(Value *Addr);
+  int getMemoryAccessFuncIndex(Value *Addr);
+
+  DataLayout *TD;
+  Type *IntptrTy;
+  SmallString<64> BlacklistFile;
+  OwningPtr<BlackList> BL;
+  IntegerType *OrdTy;
+  // Callbacks to run-time library are computed in doInitialization.
+  Function *TsanFuncEntry;
+  Function *TsanFuncExit;
+  // Accesses sizes are powers of two: 1, 2, 4, 8, 16.
+  static const size_t kNumberOfAccessSizes = 5;
+  Function *TsanRead[kNumberOfAccessSizes];
+  Function *TsanWrite[kNumberOfAccessSizes];
+  Function *TsanAtomicLoad[kNumberOfAccessSizes];
+  Function *TsanAtomicStore[kNumberOfAccessSizes];
+  Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes];
+  Function *TsanAtomicCAS[kNumberOfAccessSizes];
+  Function *TsanAtomicThreadFence;
+  Function *TsanAtomicSignalFence;
+  Function *TsanVptrUpdate;
+  Function *TsanVptrLoad;
+  Function *MemmoveFn, *MemcpyFn, *MemsetFn;
+};
+}  // namespace
+
+char ThreadSanitizer::ID = 0;
+INITIALIZE_PASS(ThreadSanitizer, "tsan",
+    "ThreadSanitizer: detects data races.",
+    false, false)
+
+const char *ThreadSanitizer::getPassName() const {
+  return "ThreadSanitizer";
+}
+
+FunctionPass *llvm::createThreadSanitizerPass(StringRef BlacklistFile) {
+  return new ThreadSanitizer(BlacklistFile);
+}
+
+static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
+  if (Function *F = dyn_cast<Function>(FuncOrBitcast))
+     return F;
+  FuncOrBitcast->dump();
+  report_fatal_error("ThreadSanitizer interface function redefined");
+}
+
+void ThreadSanitizer::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(M.getContext());
+  // Initialize the callbacks.
+  TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction(
+      "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+  TsanFuncExit = checkInterfaceFunction(M.getOrInsertFunction(
+      "__tsan_func_exit", IRB.getVoidTy(), NULL));
+  OrdTy = IRB.getInt32Ty();
+  for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
+    const size_t ByteSize = 1 << i;
+    const size_t BitSize = ByteSize * 8;
+    SmallString<32> ReadName("__tsan_read" + itostr(ByteSize));
+    TsanRead[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+
+    SmallString<32> WriteName("__tsan_write" + itostr(ByteSize));
+    TsanWrite[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+
+    Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
+    Type *PtrTy = Ty->getPointerTo();
+    SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) +
+                                   "_load");
+    TsanAtomicLoad[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        AtomicLoadName, Ty, PtrTy, OrdTy, NULL));
+
+    SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) +
+                                    "_store");
+    TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy,
+        NULL));
+
+    for (int op = AtomicRMWInst::FIRST_BINOP;
+        op <= AtomicRMWInst::LAST_BINOP; ++op) {
+      TsanAtomicRMW[op][i] = NULL;
+      const char *NamePart = NULL;
+      if (op == AtomicRMWInst::Xchg)
+        NamePart = "_exchange";
+      else if (op == AtomicRMWInst::Add)
+        NamePart = "_fetch_add";
+      else if (op == AtomicRMWInst::Sub)
+        NamePart = "_fetch_sub";
+      else if (op == AtomicRMWInst::And)
+        NamePart = "_fetch_and";
+      else if (op == AtomicRMWInst::Or)
+        NamePart = "_fetch_or";
+      else if (op == AtomicRMWInst::Xor)
+        NamePart = "_fetch_xor";
+      else if (op == AtomicRMWInst::Nand)
+        NamePart = "_fetch_nand";
+      else
+        continue;
+      SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
+      TsanAtomicRMW[op][i] = checkInterfaceFunction(M.getOrInsertFunction(
+          RMWName, Ty, PtrTy, Ty, OrdTy, NULL));
+    }
+
+    SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) +
+                                  "_compare_exchange_val");
+    TsanAtomicCAS[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, NULL));
+  }
+  TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction(
+      "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(),
+      IRB.getInt8PtrTy(), NULL));
+  TsanVptrLoad = checkInterfaceFunction(M.getOrInsertFunction(
+      "__tsan_vptr_read", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
+  TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction(
+      "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL));
+  TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction(
+      "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL));
+
+  MemmoveFn = checkInterfaceFunction(M.getOrInsertFunction(
+    "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+    IRB.getInt8PtrTy(), IntptrTy, NULL));
+  MemcpyFn = checkInterfaceFunction(M.getOrInsertFunction(
+    "memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+    IntptrTy, NULL));
+  MemsetFn = checkInterfaceFunction(M.getOrInsertFunction(
+    "memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
+    IntptrTy, NULL));
+}
+
+bool ThreadSanitizer::doInitialization(Module &M) {
+  TD = getAnalysisIfAvailable<DataLayout>();
+  if (!TD)
+    return false;
+  BL.reset(new BlackList(BlacklistFile));
+
+  // Always insert a call to __tsan_init into the module's CTORs.
+  IRBuilder<> IRB(M.getContext());
+  IntptrTy = IRB.getIntPtrTy(TD);
+  Value *TsanInit = M.getOrInsertFunction("__tsan_init",
+                                          IRB.getVoidTy(), NULL);
+  appendToGlobalCtors(M, cast<Function>(TsanInit), 0);
+
+  return true;
+}
+
+static bool isVtableAccess(Instruction *I) {
+  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) {
+    if (Tag->getNumOperands() < 1) return false;
+    if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) {
+      if (Tag1->getString() == "vtable pointer") return true;
+    }
+  }
+  return false;
+}
+
+bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
+  // If this is a GEP, just analyze its pointer operand.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
+    Addr = GEP->getPointerOperand();
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    if (GV->isConstant()) {
+      // Reads from constant globals can not race with any writes.
+      NumOmittedReadsFromConstantGlobals++;
+      return true;
+    }
+  } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
+    if (isVtableAccess(L)) {
+      // Reads from a vtable pointer can not race with any writes.
+      NumOmittedReadsFromVtable++;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Instrumenting some of the accesses may be proven redundant.
+// Currently handled:
+//  - read-before-write (within same BB, no calls between)
+//
+// We do not handle some of the patterns that should not survive
+// after the classic compiler optimizations.
+// E.g. two reads from the same temp should be eliminated by CSE,
+// two writes should be eliminated by DSE, etc.
+//
+// 'Local' is a vector of insns within the same BB (no calls between).
+// 'All' is a vector of insns that will be instrumented.
+void ThreadSanitizer::chooseInstructionsToInstrument(
+    SmallVectorImpl<Instruction*> &Local,
+    SmallVectorImpl<Instruction*> &All) {
+  SmallSet<Value*, 8> WriteTargets;
+  // Iterate from the end.
+  for (SmallVectorImpl<Instruction*>::reverse_iterator It = Local.rbegin(),
+       E = Local.rend(); It != E; ++It) {
+    Instruction *I = *It;
+    if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
+      WriteTargets.insert(Store->getPointerOperand());
+    } else {
+      LoadInst *Load = cast<LoadInst>(I);
+      Value *Addr = Load->getPointerOperand();
+      if (WriteTargets.count(Addr)) {
+        // We will write to this temp, so no reason to analyze the read.
+        NumOmittedReadsBeforeWrite++;
+        continue;
+      }
+      if (addrPointsToConstantData(Addr)) {
+        // Addr points to some constant data -- it can not race with any writes.
+        continue;
+      }
+    }
+    All.push_back(I);
+  }
+  Local.clear();
+}
+
+static bool isAtomic(Instruction *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isAtomic() && LI->getSynchScope() == CrossThread;
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isAtomic() && SI->getSynchScope() == CrossThread;
+  if (isa<AtomicRMWInst>(I))
+    return true;
+  if (isa<AtomicCmpXchgInst>(I))
+    return true;
+  if (isa<FenceInst>(I))
+    return true;
+  return false;
+}
+
+bool ThreadSanitizer::runOnFunction(Function &F) {
+  if (!TD) return false;
+  if (BL->isIn(F)) return false;
+  initializeCallbacks(*F.getParent());
+  SmallVector<Instruction*, 8> RetVec;
+  SmallVector<Instruction*, 8> AllLoadsAndStores;
+  SmallVector<Instruction*, 8> LocalLoadsAndStores;
+  SmallVector<Instruction*, 8> AtomicAccesses;
+  SmallVector<Instruction*, 8> MemIntrinCalls;
+  bool Res = false;
+  bool HasCalls = false;
+
+  // Traverse all instructions, collect loads/stores/returns, check for calls.
+  for (Function::iterator FI = F.begin(), FE = F.end();
+       FI != FE; ++FI) {
+    BasicBlock &BB = *FI;
+    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end();
+         BI != BE; ++BI) {
+      if (isAtomic(BI))
+        AtomicAccesses.push_back(BI);
+      else if (isa<LoadInst>(BI) || isa<StoreInst>(BI))
+        LocalLoadsAndStores.push_back(BI);
+      else if (isa<ReturnInst>(BI))
+        RetVec.push_back(BI);
+      else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) {
+        if (isa<MemIntrinsic>(BI))
+          MemIntrinCalls.push_back(BI);
+        HasCalls = true;
+        chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
+      }
+    }
+    chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores);
+  }
+
+  // We have collected all loads and stores.
+  // FIXME: many of these accesses do not need to be checked for races
+  // (e.g. variables that do not escape, etc).
+
+  // Instrument memory accesses.
+  if (ClInstrumentMemoryAccesses)
+    for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) {
+      Res |= instrumentLoadOrStore(AllLoadsAndStores[i]);
+    }
+
+  // Instrument atomic memory accesses.
+  if (ClInstrumentAtomics)
+    for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) {
+      Res |= instrumentAtomic(AtomicAccesses[i]);
+    }
+
+  if (ClInstrumentMemIntrinsics)
+    for (size_t i = 0, n = MemIntrinCalls.size(); i < n; ++i) {
+      Res |= instrumentMemIntrinsic(MemIntrinCalls[i]);
+    }
+
+  // Instrument function entry/exit points if there were instrumented accesses.
+  if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
+    IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+    Value *ReturnAddress = IRB.CreateCall(
+        Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress),
+        IRB.getInt32(0));
+    IRB.CreateCall(TsanFuncEntry, ReturnAddress);
+    for (size_t i = 0, n = RetVec.size(); i < n; ++i) {
+      IRBuilder<> IRBRet(RetVec[i]);
+      IRBRet.CreateCall(TsanFuncExit);
+    }
+    Res = true;
+  }
+  return Res;
+}
+
+bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) {
+  IRBuilder<> IRB(I);
+  bool IsWrite = isa<StoreInst>(*I);
+  Value *Addr = IsWrite
+      ? cast<StoreInst>(I)->getPointerOperand()
+      : cast<LoadInst>(I)->getPointerOperand();
+  int Idx = getMemoryAccessFuncIndex(Addr);
+  if (Idx < 0)
+    return false;
+  if (IsWrite && isVtableAccess(I)) {
+    DEBUG(dbgs() << "  VPTR : " << *I << "\n");
+    Value *StoredValue = cast<StoreInst>(I)->getValueOperand();
+    // StoredValue does not necessary have a pointer type.
+    if (isa<IntegerType>(StoredValue->getType()))
+      StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy());
+    // Call TsanVptrUpdate.
+    IRB.CreateCall2(TsanVptrUpdate,
+                    IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+                    IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy()));
+    NumInstrumentedVtableWrites++;
+    return true;
+  }
+  if (!IsWrite && isVtableAccess(I)) {
+    IRB.CreateCall(TsanVptrLoad,
+                   IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+    NumInstrumentedVtableReads++;
+    return true;
+  }
+  Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
+  IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+  if (IsWrite) NumInstrumentedWrites++;
+  else         NumInstrumentedReads++;
+  return true;
+}
+
+static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
+  uint32_t v = 0;
+  switch (ord) {
+    case NotAtomic:              assert(false);
+    case Unordered:              // Fall-through.
+    case Monotonic:              v = 0; break;
+    // case Consume:                v = 1; break;  // Not specified yet.
+    case Acquire:                v = 2; break;
+    case Release:                v = 3; break;
+    case AcquireRelease:         v = 4; break;
+    case SequentiallyConsistent: v = 5; break;
+  }
+  return IRB->getInt32(v);
+}
+
+static ConstantInt *createFailOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
+  uint32_t v = 0;
+  switch (ord) {
+    case NotAtomic:              assert(false);
+    case Unordered:              // Fall-through.
+    case Monotonic:              v = 0; break;
+    // case Consume:                v = 1; break;  // Not specified yet.
+    case Acquire:                v = 2; break;
+    case Release:                v = 0; break;
+    case AcquireRelease:         v = 2; break;
+    case SequentiallyConsistent: v = 5; break;
+  }
+  return IRB->getInt32(v);
+}
+
+// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
+// So, we either need to ensure the intrinsic is not inlined, or instrument it.
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated),
+// instead we simply replace them with regular function calls, which are then
+// intercepted by the run-time.
+// Since tsan is running after everyone else, the calls should not be
+// replaced back with intrinsics. If that becomes wrong at some point,
+// we will need to call e.g. __tsan_memset to avoid the intrinsics.
+bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
+  IRBuilder<> IRB(I);
+  if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
+    IRB.CreateCall3(MemsetFn,
+      IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
+      IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false));
+    I->eraseFromParent();
+  } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
+    IRB.CreateCall3(isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
+      IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false));
+    I->eraseFromParent();
+  }
+  return false;
+}
+
+// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
+// standards.  For background see C++11 standard.  A slightly older, publically
+// available draft of the standard (not entirely up-to-date, but close enough
+// for casual browsing) is available here:
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf
+// The following page contains more background information:
+// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/
+
+bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
+  IRBuilder<> IRB(I);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    Value *Addr = LI->getPointerOperand();
+    int Idx = getMemoryAccessFuncIndex(Addr);
+    if (Idx < 0)
+      return false;
+    const size_t ByteSize = 1 << Idx;
+    const size_t BitSize = ByteSize * 8;
+    Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+    Type *PtrTy = Ty->getPointerTo();
+    Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+                     createOrdering(&IRB, LI->getOrdering())};
+    CallInst *C = CallInst::Create(TsanAtomicLoad[Idx],
+                                   ArrayRef<Value*>(Args));
+    ReplaceInstWithInst(I, C);
+
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    Value *Addr = SI->getPointerOperand();
+    int Idx = getMemoryAccessFuncIndex(Addr);
+    if (Idx < 0)
+      return false;
+    const size_t ByteSize = 1 << Idx;
+    const size_t BitSize = ByteSize * 8;
+    Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+    Type *PtrTy = Ty->getPointerTo();
+    Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+                     IRB.CreateIntCast(SI->getValueOperand(), Ty, false),
+                     createOrdering(&IRB, SI->getOrdering())};
+    CallInst *C = CallInst::Create(TsanAtomicStore[Idx],
+                                   ArrayRef<Value*>(Args));
+    ReplaceInstWithInst(I, C);
+  } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+    Value *Addr = RMWI->getPointerOperand();
+    int Idx = getMemoryAccessFuncIndex(Addr);
+    if (Idx < 0)
+      return false;
+    Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx];
+    if (F == NULL)
+      return false;
+    const size_t ByteSize = 1 << Idx;
+    const size_t BitSize = ByteSize * 8;
+    Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+    Type *PtrTy = Ty->getPointerTo();
+    Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+                     IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+                     createOrdering(&IRB, RMWI->getOrdering())};
+    CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+    ReplaceInstWithInst(I, C);
+  } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
+    Value *Addr = CASI->getPointerOperand();
+    int Idx = getMemoryAccessFuncIndex(Addr);
+    if (Idx < 0)
+      return false;
+    const size_t ByteSize = 1 << Idx;
+    const size_t BitSize = ByteSize * 8;
+    Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+    Type *PtrTy = Ty->getPointerTo();
+    Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+                     IRB.CreateIntCast(CASI->getCompareOperand(), Ty, false),
+                     IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false),
+                     createOrdering(&IRB, CASI->getOrdering()),
+                     createFailOrdering(&IRB, CASI->getOrdering())};
+    CallInst *C = CallInst::Create(TsanAtomicCAS[Idx], ArrayRef<Value*>(Args));
+    ReplaceInstWithInst(I, C);
+  } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
+    Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
+    Function *F = FI->getSynchScope() == SingleThread ?
+        TsanAtomicSignalFence : TsanAtomicThreadFence;
+    CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+    ReplaceInstWithInst(I, C);
+  }
+  return true;
+}
+
+int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) {
+  Type *OrigPtrTy = Addr->getType();
+  Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+  assert(OrigTy->isSized());
+  uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy);
+  if (TypeSize != 8  && TypeSize != 16 &&
+      TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
+    NumAccessesWithBadSize++;
+    // Ignore all unusual sizes.
+    return -1;
+  }
+  size_t Idx = CountTrailingZeros_32(TypeSize / 8);
+  assert(Idx < kNumberOfAccessSizes);
+  return Idx;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
new file mode 100644
index 000000000000..8f917aeb3725
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -0,0 +1,262 @@
+//===- DependencyAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines special dependency analysis routines used in Objective C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc-dependency"
+#include "ObjCARC.h"
+#include "DependencyAnalysis.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/Support/CFG.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool
+llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+                                ProvenanceAnalysis &PA,
+                                InstructionClass Class) {
+  switch (Class) {
+  case IC_Autorelease:
+  case IC_AutoreleaseRV:
+  case IC_IntrinsicUser:
+  case IC_User:
+    // These operations never directly modify a reference count.
+    return false;
+  default: break;
+  }
+
+  ImmutableCallSite CS = static_cast<const Value *>(Inst);
+  assert(CS && "Only calls can alter reference counts!");
+
+  // See if AliasAnalysis can help us with the call.
+  AliasAnalysis::ModRefBehavior MRB = PA.getAA()->getModRefBehavior(CS);
+  if (AliasAnalysis::onlyReadsMemory(MRB))
+    return false;
+  if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+    for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+         I != E; ++I) {
+      const Value *Op = *I;
+      if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
+        return true;
+    }
+    return false;
+  }
+
+  // Assume the worst.
+  return true;
+}
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool
+llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
+                      ProvenanceAnalysis &PA, InstructionClass Class) {
+  // IC_Call operations (as opposed to IC_CallOrUser) never "use" objc pointers.
+  if (Class == IC_Call)
+    return false;
+
+  // Consider various instructions which may have pointer arguments which are
+  // not "uses".
+  if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
+    // Comparing a pointer with null, or any other constant, isn't really a use,
+    // because we don't care what the pointer points to, or about the values
+    // of any other dynamic reference-counted pointers.
+    if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA()))
+      return false;
+  } else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) {
+    // For calls, just check the arguments (and not the callee operand).
+    for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
+         OE = CS.arg_end(); OI != OE; ++OI) {
+      const Value *Op = *OI;
+      if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
+        return true;
+    }
+    return false;
+  } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    // Special-case stores, because we don't care about the stored value, just
+    // the store address.
+    const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
+    // If we can't tell what the underlying object was, assume there is a
+    // dependence.
+    return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Op, Ptr);
+  }
+
+  // Check each operand for a match.
+  for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
+       OI != OE; ++OI) {
+    const Value *Op = *OI;
+    if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
+      return true;
+  }
+  return false;
+}
+
+/// Test if there can be dependencies on Inst through Arg. This function only
+/// tests dependencies relevant for removing pairs of calls.
+bool
+llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
+                       const Value *Arg, ProvenanceAnalysis &PA) {
+  // If we've reached the definition of Arg, stop.
+  if (Inst == Arg)
+    return true;
+
+  switch (Flavor) {
+  case NeedsPositiveRetainCount: {
+    InstructionClass Class = GetInstructionClass(Inst);
+    switch (Class) {
+    case IC_AutoreleasepoolPop:
+    case IC_AutoreleasepoolPush:
+    case IC_None:
+      return false;
+    default:
+      return CanUse(Inst, Arg, PA, Class);
+    }
+  }
+
+  case AutoreleasePoolBoundary: {
+    InstructionClass Class = GetInstructionClass(Inst);
+    switch (Class) {
+    case IC_AutoreleasepoolPop:
+    case IC_AutoreleasepoolPush:
+      // These mark the end and begin of an autorelease pool scope.
+      return true;
+    default:
+      // Nothing else does this.
+      return false;
+    }
+  }
+
+  case CanChangeRetainCount: {
+    InstructionClass Class = GetInstructionClass(Inst);
+    switch (Class) {
+    case IC_AutoreleasepoolPop:
+      // Conservatively assume this can decrement any count.
+      return true;
+    case IC_AutoreleasepoolPush:
+    case IC_None:
+      return false;
+    default:
+      return CanAlterRefCount(Inst, Arg, PA, Class);
+    }
+  }
+
+  case RetainAutoreleaseDep:
+    switch (GetBasicInstructionClass(Inst)) {
+    case IC_AutoreleasepoolPop:
+    case IC_AutoreleasepoolPush:
+      // Don't merge an objc_autorelease with an objc_retain inside a different
+      // autoreleasepool scope.
+      return true;
+    case IC_Retain:
+    case IC_RetainRV:
+      // Check for a retain of the same pointer for merging.
+      return GetObjCArg(Inst) == Arg;
+    default:
+      // Nothing else matters for objc_retainAutorelease formation.
+      return false;
+    }
+
+  case RetainAutoreleaseRVDep: {
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    switch (Class) {
+    case IC_Retain:
+    case IC_RetainRV:
+      // Check for a retain of the same pointer for merging.
+      return GetObjCArg(Inst) == Arg;
+    default:
+      // Anything that can autorelease interrupts
+      // retainAutoreleaseReturnValue formation.
+      return CanInterruptRV(Class);
+    }
+  }
+
+  case RetainRVDep:
+    return CanInterruptRV(GetBasicInstructionClass(Inst));
+  }
+
+  llvm_unreachable("Invalid dependence flavor");
+}
+
+/// Walk up the CFG from StartPos (which is in StartBB) and find local and
+/// non-local dependencies on Arg.
+///
+/// TODO: Cache results?
+void
+llvm::objcarc::FindDependencies(DependenceKind Flavor,
+                                const Value *Arg,
+                                BasicBlock *StartBB, Instruction *StartInst,
+                                SmallPtrSet<Instruction *, 4> &DependingInsts,
+                                SmallPtrSet<const BasicBlock *, 4> &Visited,
+                                ProvenanceAnalysis &PA) {
+  BasicBlock::iterator StartPos = StartInst;
+
+  SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
+  Worklist.push_back(std::make_pair(StartBB, StartPos));
+  do {
+    std::pair<BasicBlock *, BasicBlock::iterator> Pair =
+      Worklist.pop_back_val();
+    BasicBlock *LocalStartBB = Pair.first;
+    BasicBlock::iterator LocalStartPos = Pair.second;
+    BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
+    for (;;) {
+      if (LocalStartPos == StartBBBegin) {
+        pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
+        if (PI == PE)
+          // If we've reached the function entry, produce a null dependence.
+          DependingInsts.insert(0);
+        else
+          // Add the predecessors to the worklist.
+          do {
+            BasicBlock *PredBB = *PI;
+            if (Visited.insert(PredBB))
+              Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
+          } while (++PI != PE);
+        break;
+      }
+
+      Instruction *Inst = --LocalStartPos;
+      if (Depends(Flavor, Inst, Arg, PA)) {
+        DependingInsts.insert(Inst);
+        break;
+      }
+    }
+  } while (!Worklist.empty());
+
+  // Determine whether the original StartBB post-dominates all of the blocks we
+  // visited. If not, insert a sentinal indicating that most optimizations are
+  // not safe.
+  for (SmallPtrSet<const BasicBlock *, 4>::const_iterator I = Visited.begin(),
+       E = Visited.end(); I != E; ++I) {
+    const BasicBlock *BB = *I;
+    if (BB == StartBB)
+      continue;
+    const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+    for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
+      const BasicBlock *Succ = *SI;
+      if (Succ != StartBB && !Visited.count(Succ)) {
+        DependingInsts.insert(reinterpret_cast<Instruction *>(-1));
+        return;
+      }
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
new file mode 100644
index 000000000000..24d358b30ab1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -0,0 +1,79 @@
+//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares special dependency analysis routines used in Objective C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H
+#define LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+
+namespace llvm {
+  class BasicBlock;
+  class Instruction;
+  class Value;
+}
+
+namespace llvm {
+namespace objcarc {
+
+class ProvenanceAnalysis;
+
+/// \enum DependenceKind
+/// \brief Defines different dependence kinds among various ARC constructs.
+///
+/// There are several kinds of dependence-like concepts in use here.
+///
+enum DependenceKind {
+  NeedsPositiveRetainCount,
+  AutoreleasePoolBoundary,
+  CanChangeRetainCount,
+  RetainAutoreleaseDep,       ///< Blocks objc_retainAutorelease.
+  RetainAutoreleaseRVDep,     ///< Blocks objc_retainAutoreleaseReturnValue.
+  RetainRVDep                 ///< Blocks objc_retainAutoreleasedReturnValue.
+};
+
+void FindDependencies(DependenceKind Flavor,
+                      const Value *Arg,
+                      BasicBlock *StartBB, Instruction *StartInst,
+                      SmallPtrSet<Instruction *, 4> &DependingInstructions,
+                      SmallPtrSet<const BasicBlock *, 4> &Visited,
+                      ProvenanceAnalysis &PA);
+
+bool
+Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
+        ProvenanceAnalysis &PA);
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool
+CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
+       InstructionClass Class);
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool
+CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+                 ProvenanceAnalysis &PA, InstructionClass Class);
+
+} // namespace objcarc
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_OBJCARC_DEPEDENCYANALYSIS_H
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp
new file mode 100644
index 000000000000..53a31b0de178
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp
@@ -0,0 +1,48 @@
+//===-- ObjCARC.cpp -------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMObjCARCOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+
+namespace llvm {
+  class PassRegistry;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+/// \brief A handy option to enable/disable all ARC Optimizations.
+bool llvm::objcarc::EnableARCOpts;
+static cl::opt<bool, true>
+EnableARCOptimizations("enable-objc-arc-opts",
+                       cl::location(EnableARCOpts),
+                       cl::init(true));
+
+/// initializeObjCARCOptsPasses - Initialize all passes linked into the
+/// ObjCARCOpts library.
+void llvm::initializeObjCARCOpts(PassRegistry &Registry) {
+  initializeObjCARCAliasAnalysisPass(Registry);
+  initializeObjCARCAPElimPass(Registry);
+  initializeObjCARCExpandPass(Registry);
+  initializeObjCARCContractPass(Registry);
+  initializeObjCARCOptPass(Registry);
+}
+
+void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) {
+  initializeObjCARCOpts(*unwrap(R));
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
new file mode 100644
index 000000000000..39670f339e9f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
@@ -0,0 +1,395 @@
+//===- ObjCARC.h - ObjC ARC Optimization --------------*- mode: c++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines common definitions/declarations used by the ObjC ARC
+/// Optimizer. ARC stands for Automatic Reference Counting and is a system for
+/// managing reference counts for objects in Objective C.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_OBJCARC_H
+#define LLVM_TRANSFORMS_SCALAR_OBJCARC_H
+
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Transforms/ObjCARC.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+namespace llvm {
+class raw_ostream;
+}
+
+namespace llvm {
+namespace objcarc {
+
+/// \brief A handy option to enable/disable all ARC Optimizations.
+extern bool EnableARCOpts;
+
+/// \brief Test if the given module looks interesting to run ARC optimization
+/// on.
+static inline bool ModuleHasARC(const Module &M) {
+  return
+    M.getNamedValue("objc_retain") ||
+    M.getNamedValue("objc_release") ||
+    M.getNamedValue("objc_autorelease") ||
+    M.getNamedValue("objc_retainAutoreleasedReturnValue") ||
+    M.getNamedValue("objc_retainBlock") ||
+    M.getNamedValue("objc_autoreleaseReturnValue") ||
+    M.getNamedValue("objc_autoreleasePoolPush") ||
+    M.getNamedValue("objc_loadWeakRetained") ||
+    M.getNamedValue("objc_loadWeak") ||
+    M.getNamedValue("objc_destroyWeak") ||
+    M.getNamedValue("objc_storeWeak") ||
+    M.getNamedValue("objc_initWeak") ||
+    M.getNamedValue("objc_moveWeak") ||
+    M.getNamedValue("objc_copyWeak") ||
+    M.getNamedValue("objc_retainedObject") ||
+    M.getNamedValue("objc_unretainedObject") ||
+    M.getNamedValue("objc_unretainedPointer") ||
+    M.getNamedValue("clang.arc.use");
+}
+
+/// \enum InstructionClass
+/// \brief A simple classification for instructions.
+enum InstructionClass {
+  IC_Retain,              ///< objc_retain
+  IC_RetainRV,            ///< objc_retainAutoreleasedReturnValue
+  IC_RetainBlock,         ///< objc_retainBlock
+  IC_Release,             ///< objc_release
+  IC_Autorelease,         ///< objc_autorelease
+  IC_AutoreleaseRV,       ///< objc_autoreleaseReturnValue
+  IC_AutoreleasepoolPush, ///< objc_autoreleasePoolPush
+  IC_AutoreleasepoolPop,  ///< objc_autoreleasePoolPop
+  IC_NoopCast,            ///< objc_retainedObject, etc.
+  IC_FusedRetainAutorelease, ///< objc_retainAutorelease
+  IC_FusedRetainAutoreleaseRV, ///< objc_retainAutoreleaseReturnValue
+  IC_LoadWeakRetained,    ///< objc_loadWeakRetained (primitive)
+  IC_StoreWeak,           ///< objc_storeWeak (primitive)
+  IC_InitWeak,            ///< objc_initWeak (derived)
+  IC_LoadWeak,            ///< objc_loadWeak (derived)
+  IC_MoveWeak,            ///< objc_moveWeak (derived)
+  IC_CopyWeak,            ///< objc_copyWeak (derived)
+  IC_DestroyWeak,         ///< objc_destroyWeak (derived)
+  IC_StoreStrong,         ///< objc_storeStrong (derived)
+  IC_IntrinsicUser,       ///< clang.arc.use
+  IC_CallOrUser,          ///< could call objc_release and/or "use" pointers
+  IC_Call,                ///< could call objc_release
+  IC_User,                ///< could "use" a pointer
+  IC_None                 ///< anything else
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const InstructionClass Class);
+
+/// \brief Test if the given class is a kind of user.
+inline static bool IsUser(InstructionClass Class) {
+  return Class == IC_User ||
+         Class == IC_CallOrUser ||
+         Class == IC_IntrinsicUser;
+}
+
+/// \brief Test if the given class is objc_retain or equivalent.
+static inline bool IsRetain(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV;
+}
+
+/// \brief Test if the given class is objc_autorelease or equivalent.
+static inline bool IsAutorelease(InstructionClass Class) {
+  return Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV;
+}
+
+/// \brief Test if the given class represents instructions which return their
+/// argument verbatim.
+static inline bool IsForwarding(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV ||
+         Class == IC_NoopCast;
+}
+
+/// \brief Test if the given class represents instructions which do nothing if
+/// passed a null pointer.
+static inline bool IsNoopOnNull(InstructionClass Class) {
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_Release ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV ||
+         Class == IC_RetainBlock;
+}
+
+/// \brief Test if the given class represents instructions which are always safe
+/// to mark with the "tail" keyword.
+static inline bool IsAlwaysTail(InstructionClass Class) {
+  // IC_RetainBlock may be given a stack argument.
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_AutoreleaseRV;
+}
+
+/// \brief Test if the given class represents instructions which are never safe
+/// to mark with the "tail" keyword.
+static inline bool IsNeverTail(InstructionClass Class) {
+  /// It is never safe to tail call objc_autorelease since by tail calling
+  /// objc_autorelease, we also tail call -[NSObject autorelease] which supports
+  /// fast autoreleasing causing our object to be potentially reclaimed from the
+  /// autorelease pool which violates the semantics of __autoreleasing types in
+  /// ARC.
+  return Class == IC_Autorelease;
+}
+
+/// \brief Test if the given class represents instructions which are always safe
+/// to mark with the nounwind attribute.
+static inline bool IsNoThrow(InstructionClass Class) {
+  // objc_retainBlock is not nounwind because it calls user copy constructors
+  // which could theoretically throw.
+  return Class == IC_Retain ||
+         Class == IC_RetainRV ||
+         Class == IC_Release ||
+         Class == IC_Autorelease ||
+         Class == IC_AutoreleaseRV ||
+         Class == IC_AutoreleasepoolPush ||
+         Class == IC_AutoreleasepoolPop;
+}
+
+/// Test whether the given instruction can autorelease any pointer or cause an
+/// autoreleasepool pop.
+static inline bool
+CanInterruptRV(InstructionClass Class) {
+  switch (Class) {
+  case IC_AutoreleasepoolPop:
+  case IC_CallOrUser:
+  case IC_Call:
+  case IC_Autorelease:
+  case IC_AutoreleaseRV:
+  case IC_FusedRetainAutorelease:
+  case IC_FusedRetainAutoreleaseRV:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// \brief Determine if F is one of the special known Functions.  If it isn't,
+/// return IC_CallOrUser.
+InstructionClass GetFunctionClass(const Function *F);
+
+/// \brief Determine which objc runtime call instruction class V belongs to.
+///
+/// This is similar to GetInstructionClass except that it only detects objc
+/// runtime calls. This allows it to be faster.
+///
+static inline InstructionClass GetBasicInstructionClass(const Value *V) {
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (const Function *F = CI->getCalledFunction())
+      return GetFunctionClass(F);
+    // Otherwise, be conservative.
+    return IC_CallOrUser;
+  }
+
+  // Otherwise, be conservative.
+  return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User;
+}
+
+/// \brief Determine what kind of construct V is.
+InstructionClass GetInstructionClass(const Value *V);
+
+/// \brief This is a wrapper around getUnderlyingObject which also knows how to
+/// look through objc_retain and objc_autorelease calls, which we know to return
+/// their argument verbatim.
+static inline const Value *GetUnderlyingObjCPtr(const Value *V) {
+  for (;;) {
+    V = GetUnderlyingObject(V);
+    if (!IsForwarding(GetBasicInstructionClass(V)))
+      break;
+    V = cast<CallInst>(V)->getArgOperand(0);
+  }
+
+  return V;
+}
+
+/// \brief This is a wrapper around Value::stripPointerCasts which also knows
+/// how to look through objc_retain and objc_autorelease calls, which we know to
+/// return their argument verbatim.
+static inline const Value *StripPointerCastsAndObjCCalls(const Value *V) {
+  for (;;) {
+    V = V->stripPointerCasts();
+    if (!IsForwarding(GetBasicInstructionClass(V)))
+      break;
+    V = cast<CallInst>(V)->getArgOperand(0);
+  }
+  return V;
+}
+
+/// \brief This is a wrapper around Value::stripPointerCasts which also knows
+/// how to look through objc_retain and objc_autorelease calls, which we know to
+/// return their argument verbatim.
+static inline Value *StripPointerCastsAndObjCCalls(Value *V) {
+  for (;;) {
+    V = V->stripPointerCasts();
+    if (!IsForwarding(GetBasicInstructionClass(V)))
+      break;
+    V = cast<CallInst>(V)->getArgOperand(0);
+  }
+  return V;
+}
+
+/// \brief Assuming the given instruction is one of the special calls such as
+/// objc_retain or objc_release, return the argument value, stripped of no-op
+/// casts and forwarding calls.
+static inline Value *GetObjCArg(Value *Inst) {
+  return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0));
+}
+
+static inline bool IsNullOrUndef(const Value *V) {
+  return isa<ConstantPointerNull>(V) || isa<UndefValue>(V);
+}
+
+static inline bool IsNoopInstruction(const Instruction *I) {
+  return isa<BitCastInst>(I) ||
+    (isa<GetElementPtrInst>(I) &&
+     cast<GetElementPtrInst>(I)->hasAllZeroIndices());
+}
+
+
+/// \brief Erase the given instruction.
+///
+/// Many ObjC calls return their argument verbatim,
+/// so if it's such a call and the return value has users, replace them with the
+/// argument value.
+///
+static inline void EraseInstruction(Instruction *CI) {
+  Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
+
+  bool Unused = CI->use_empty();
+
+  if (!Unused) {
+    // Replace the return value with the argument.
+    assert(IsForwarding(GetBasicInstructionClass(CI)) &&
+           "Can't delete non-forwarding instruction with users!");
+    CI->replaceAllUsesWith(OldArg);
+  }
+
+  CI->eraseFromParent();
+
+  if (Unused)
+    RecursivelyDeleteTriviallyDeadInstructions(OldArg);
+}
+
+/// \brief Test whether the given value is possible a retainable object pointer.
+static inline bool IsPotentialRetainableObjPtr(const Value *Op) {
+  // Pointers to static or stack storage are not valid retainable object
+  // pointers.
+  if (isa<Constant>(Op) || isa<AllocaInst>(Op))
+    return false;
+  // Special arguments can not be a valid retainable object pointer.
+  if (const Argument *Arg = dyn_cast<Argument>(Op))
+    if (Arg->hasByValAttr() ||
+        Arg->hasNestAttr() ||
+        Arg->hasStructRetAttr())
+      return false;
+  // Only consider values with pointer types.
+  //
+  // It seemes intuitive to exclude function pointer types as well, since
+  // functions are never retainable object pointers, however clang occasionally
+  // bitcasts retainable object pointers to function-pointer type temporarily.
+  PointerType *Ty = dyn_cast<PointerType>(Op->getType());
+  if (!Ty)
+    return false;
+  // Conservatively assume anything else is a potential retainable object
+  // pointer.
+  return true;
+}
+
+static inline bool IsPotentialRetainableObjPtr(const Value *Op,
+                                               AliasAnalysis &AA) {
+  // First make the rudimentary check.
+  if (!IsPotentialRetainableObjPtr(Op))
+    return false;
+
+  // Objects in constant memory are not reference-counted.
+  if (AA.pointsToConstantMemory(Op))
+    return false;
+
+  // Pointers in constant memory are not pointing to reference-counted objects.
+  if (const LoadInst *LI = dyn_cast<LoadInst>(Op))
+    if (AA.pointsToConstantMemory(LI->getPointerOperand()))
+      return false;
+
+  // Otherwise assume the worst.
+  return true;
+}
+
+/// \brief Helper for GetInstructionClass. Determines what kind of construct CS
+/// is.
+static inline InstructionClass GetCallSiteClass(ImmutableCallSite CS) {
+  for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
+       I != E; ++I)
+    if (IsPotentialRetainableObjPtr(*I))
+      return CS.onlyReadsMemory() ? IC_User : IC_CallOrUser;
+
+  return CS.onlyReadsMemory() ? IC_None : IC_Call;
+}
+
+/// \brief Return true if this value refers to a distinct and identifiable
+/// object.
+///
+/// This is similar to AliasAnalysis's isIdentifiedObject, except that it uses
+/// special knowledge of ObjC conventions.
+static inline bool IsObjCIdentifiedObject(const Value *V) {
+  // Assume that call results and arguments have their own "provenance".
+  // Constants (including GlobalVariables) and Allocas are never
+  // reference-counted.
+  if (isa<CallInst>(V) || isa<InvokeInst>(V) ||
+      isa<Argument>(V) || isa<Constant>(V) ||
+      isa<AllocaInst>(V))
+    return true;
+
+  if (const LoadInst *LI = dyn_cast<LoadInst>(V)) {
+    const Value *Pointer =
+      StripPointerCastsAndObjCCalls(LI->getPointerOperand());
+    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) {
+      // A constant pointer can't be pointing to an object on the heap. It may
+      // be reference-counted, but it won't be deleted.
+      if (GV->isConstant())
+        return true;
+      StringRef Name = GV->getName();
+      // These special variables are known to hold values which are not
+      // reference-counted pointers.
+      if (Name.startswith("\01L_OBJC_SELECTOR_REFERENCES_") ||
+          Name.startswith("\01L_OBJC_CLASSLIST_REFERENCES_") ||
+          Name.startswith("\01L_OBJC_CLASSLIST_SUP_REFS_$_") ||
+          Name.startswith("\01L_OBJC_METH_VAR_NAME_") ||
+          Name.startswith("\01l_objc_msgSend_fixup_"))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_OBJCARC_H
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
new file mode 100644
index 000000000000..00d9864953dc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -0,0 +1,175 @@
+//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file implements optimizations which remove extraneous
+/// autorelease pools.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc-ap-elim"
+#include "ObjCARC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
+  /// \brief Autorelease pool elimination.
+  class ObjCARCAPElim : public ModulePass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool runOnModule(Module &M);
+
+    static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0);
+    static bool OptimizeBB(BasicBlock *BB);
+
+  public:
+    static char ID;
+    ObjCARCAPElim() : ModulePass(ID) {
+      initializeObjCARCAPElimPass(*PassRegistry::getPassRegistry());
+    }
+  };
+}
+
+char ObjCARCAPElim::ID = 0;
+INITIALIZE_PASS(ObjCARCAPElim,
+                "objc-arc-apelim",
+                "ObjC ARC autorelease pool elimination",
+                false, false)
+
+Pass *llvm::createObjCARCAPElimPass() {
+  return new ObjCARCAPElim();
+}
+
+void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+}
+
+/// Interprocedurally determine if calls made by the given call site can
+/// possibly produce autoreleases.
+bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) {
+  if (const Function *Callee = CS.getCalledFunction()) {
+    if (Callee->isDeclaration() || Callee->mayBeOverridden())
+      return true;
+    for (Function::const_iterator I = Callee->begin(), E = Callee->end();
+         I != E; ++I) {
+      const BasicBlock *BB = I;
+      for (BasicBlock::const_iterator J = BB->begin(), F = BB->end();
+           J != F; ++J)
+        if (ImmutableCallSite JCS = ImmutableCallSite(J))
+          // This recursion depth limit is arbitrary. It's just great
+          // enough to cover known interesting testcases.
+          if (Depth < 3 &&
+              !JCS.onlyReadsMemory() &&
+              MayAutorelease(JCS, Depth + 1))
+            return true;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
+  bool Changed = false;
+
+  Instruction *Push = 0;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+    Instruction *Inst = I++;
+    switch (GetBasicInstructionClass(Inst)) {
+    case IC_AutoreleasepoolPush:
+      Push = Inst;
+      break;
+    case IC_AutoreleasepoolPop:
+      // If this pop matches a push and nothing in between can autorelease,
+      // zap the pair.
+      if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
+        Changed = true;
+        DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
+                        "autorelease pair:\n"
+                        "                           Pop: " << *Inst << "\n"
+                     << "                           Push: " << *Push << "\n");
+        Inst->eraseFromParent();
+        Push->eraseFromParent();
+      }
+      Push = 0;
+      break;
+    case IC_CallOrUser:
+      if (MayAutorelease(ImmutableCallSite(Inst)))
+        Push = 0;
+      break;
+    default:
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+bool ObjCARCAPElim::runOnModule(Module &M) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  if (!ModuleHasARC(M))
+    return false;
+
+  // Find the llvm.global_ctors variable, as the first step in
+  // identifying the global constructors. In theory, unnecessary autorelease
+  // pools could occur anywhere, but in practice it's pretty rare. Global
+  // ctors are a place where autorelease pools get inserted automatically,
+  // so it's pretty common for them to be unnecessary, and it's pretty
+  // profitable to eliminate them.
+  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+  if (!GV)
+    return false;
+
+  assert(GV->hasDefinitiveInitializer() &&
+         "llvm.global_ctors is uncooperative!");
+
+  bool Changed = false;
+
+  // Dig the constructor functions out of GV's initializer.
+  ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
+  for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
+       OI != OE; ++OI) {
+    Value *Op = *OI;
+    // llvm.global_ctors is an array of pairs where the second members
+    // are constructor functions.
+    Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
+    // If the user used a constructor function with the wrong signature and
+    // it got bitcasted or whatever, look the other way.
+    if (!F)
+      continue;
+    // Only look at function definitions.
+    if (F->isDeclaration())
+      continue;
+    // Only look at functions with one basic block.
+    if (llvm::next(F->begin()) != F->end())
+      continue;
+    // Ok, a single-block constructor function definition. Try to optimize it.
+    Changed |= OptimizeBB(F->begin());
+  }
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
new file mode 100644
index 000000000000..46b2de713745
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
@@ -0,0 +1,162 @@
+//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -*- mode: c++ -*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines a simple ARC-aware AliasAnalysis using special knowledge
+/// of Objective C to enhance other optimization passes which rely on the Alias
+/// Analysis infrastructure.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc-aa"
+#include "ObjCARC.h"
+#include "ObjCARCAliasAnalysis.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/PassSupport.h"
+
+namespace llvm {
+  class Function;
+  class Value;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+// Register this pass...
+char ObjCARCAliasAnalysis::ID = 0;
+INITIALIZE_AG_PASS(ObjCARCAliasAnalysis, AliasAnalysis, "objc-arc-aa",
+                   "ObjC-ARC-Based Alias Analysis", false, true, false)
+
+ImmutablePass *llvm::createObjCARCAliasAnalysisPass() {
+  return new ObjCARCAliasAnalysis();
+}
+
+void
+ObjCARCAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AliasAnalysis::getAnalysisUsage(AU);
+}
+
+AliasAnalysis::AliasResult
+ObjCARCAliasAnalysis::alias(const Location &LocA, const Location &LocB) {
+  if (!EnableARCOpts)
+    return AliasAnalysis::alias(LocA, LocB);
+
+  // First, strip off no-ops, including ObjC-specific no-ops, and try making a
+  // precise alias query.
+  const Value *SA = StripPointerCastsAndObjCCalls(LocA.Ptr);
+  const Value *SB = StripPointerCastsAndObjCCalls(LocB.Ptr);
+  AliasResult Result =
+    AliasAnalysis::alias(Location(SA, LocA.Size, LocA.TBAATag),
+                         Location(SB, LocB.Size, LocB.TBAATag));
+  if (Result != MayAlias)
+    return Result;
+
+  // If that failed, climb to the underlying object, including climbing through
+  // ObjC-specific no-ops, and try making an imprecise alias query.
+  const Value *UA = GetUnderlyingObjCPtr(SA);
+  const Value *UB = GetUnderlyingObjCPtr(SB);
+  if (UA != SA || UB != SB) {
+    Result = AliasAnalysis::alias(Location(UA), Location(UB));
+    // We can't use MustAlias or PartialAlias results here because
+    // GetUnderlyingObjCPtr may return an offsetted pointer value.
+    if (Result == NoAlias)
+      return NoAlias;
+  }
+
+  // If that failed, fail. We don't need to chain here, since that's covered
+  // by the earlier precise query.
+  return MayAlias;
+}
+
+bool
+ObjCARCAliasAnalysis::pointsToConstantMemory(const Location &Loc,
+                                             bool OrLocal) {
+  if (!EnableARCOpts)
+    return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
+
+  // First, strip off no-ops, including ObjC-specific no-ops, and try making
+  // a precise alias query.
+  const Value *S = StripPointerCastsAndObjCCalls(Loc.Ptr);
+  if (AliasAnalysis::pointsToConstantMemory(Location(S, Loc.Size, Loc.TBAATag),
+                                            OrLocal))
+    return true;
+
+  // If that failed, climb to the underlying object, including climbing through
+  // ObjC-specific no-ops, and try making an imprecise alias query.
+  const Value *U = GetUnderlyingObjCPtr(S);
+  if (U != S)
+    return AliasAnalysis::pointsToConstantMemory(Location(U), OrLocal);
+
+  // If that failed, fail. We don't need to chain here, since that's covered
+  // by the earlier precise query.
+  return false;
+}
+
+AliasAnalysis::ModRefBehavior
+ObjCARCAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
+  // We have nothing to do. Just chain to the next AliasAnalysis.
+  return AliasAnalysis::getModRefBehavior(CS);
+}
+
+AliasAnalysis::ModRefBehavior
+ObjCARCAliasAnalysis::getModRefBehavior(const Function *F) {
+  if (!EnableARCOpts)
+    return AliasAnalysis::getModRefBehavior(F);
+
+  switch (GetFunctionClass(F)) {
+  case IC_NoopCast:
+    return DoesNotAccessMemory;
+  default:
+    break;
+  }
+
+  return AliasAnalysis::getModRefBehavior(F);
+}
+
+AliasAnalysis::ModRefResult
+ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) {
+  if (!EnableARCOpts)
+    return AliasAnalysis::getModRefInfo(CS, Loc);
+
+  switch (GetBasicInstructionClass(CS.getInstruction())) {
+  case IC_Retain:
+  case IC_RetainRV:
+  case IC_Autorelease:
+  case IC_AutoreleaseRV:
+  case IC_NoopCast:
+  case IC_AutoreleasepoolPush:
+  case IC_FusedRetainAutorelease:
+  case IC_FusedRetainAutoreleaseRV:
+    // These functions don't access any memory visible to the compiler.
+    // Note that this doesn't include objc_retainBlock, because it updates
+    // pointers when it copies block data.
+    return NoModRef;
+  default:
+    break;
+  }
+
+  return AliasAnalysis::getModRefInfo(CS, Loc);
+}
+
+AliasAnalysis::ModRefResult
+ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
+                                    ImmutableCallSite CS2) {
+  // TODO: Theoretically we could check for dependencies between objc_* calls
+  // and OnlyAccessesArgumentPointees calls or other well-behaved calls.
+  return AliasAnalysis::getModRefInfo(CS1, CS2);
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
new file mode 100644
index 000000000000..7abe995a5ce7
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
@@ -0,0 +1,74 @@
+//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- mode: c++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares a simple ARC-aware AliasAnalysis using special knowledge
+/// of Objective C to enhance other optimization passes which rely on the Alias
+/// Analysis infrastructure.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
+#define LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+namespace objcarc {
+
+  /// \brief This is a simple alias analysis implementation that uses knowledge
+  /// of ARC constructs to answer queries.
+  ///
+  /// TODO: This class could be generalized to know about other ObjC-specific
+  /// tricks. Such as knowing that ivars in the non-fragile ABI are non-aliasing
+  /// even though their offsets are dynamic.
+  class ObjCARCAliasAnalysis : public ImmutablePass,
+                               public AliasAnalysis {
+  public:
+    static char ID; // Class identification, replacement for typeinfo
+    ObjCARCAliasAnalysis() : ImmutablePass(ID) {
+      initializeObjCARCAliasAnalysisPass(*PassRegistry::getPassRegistry());
+    }
+
+  private:
+    virtual void initializePass() {
+      InitializeAliasAnalysis(this);
+    }
+
+    /// This method is used when a pass implements an analysis interface through
+    /// multiple inheritance.  If needed, it should override this to adjust the
+    /// this pointer as needed for the specified pass info.
+    virtual void *getAdjustedAnalysisPointer(const void *PI) {
+      if (PI == &AliasAnalysis::ID)
+        return static_cast<AliasAnalysis *>(this);
+      return this;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual AliasResult alias(const Location &LocA, const Location &LocB);
+    virtual bool pointsToConstantMemory(const Location &Loc, bool OrLocal);
+    virtual ModRefBehavior getModRefBehavior(ImmutableCallSite CS);
+    virtual ModRefBehavior getModRefBehavior(const Function *F);
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS,
+                                       const Location &Loc);
+    virtual ModRefResult getModRefInfo(ImmutableCallSite CS1,
+                                       ImmutableCallSite CS2);
+  };
+
+} // namespace objcarc
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_OBJCARC_OBJCARCALIASANALYSIS_H
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
new file mode 100644
index 000000000000..b96c64fe81de
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -0,0 +1,541 @@
+//===- ObjCARCContract.cpp - ObjC ARC Optimization ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines late ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file mainly deals with ``contracting'' multiple lower level
+/// operations into singular higher level operations through pattern matching.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO: ObjCARCContract could insert PHI nodes when uses aren't
+// dominated by single calls.
+
+#define DEBUG_TYPE "objc-arc-contract"
+#include "ObjCARC.h"
+#include "DependencyAnalysis.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+STATISTIC(NumPeeps,       "Number of calls peephole-optimized");
+STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
+
+namespace {
+  /// \brief Late ARC optimizations
+  ///
+  /// These change the IR in a way that makes it difficult to be analyzed by
+  /// ObjCARCOpt, so it's run late.
+  class ObjCARCContract : public FunctionPass {
+    bool Changed;
+    AliasAnalysis *AA;
+    DominatorTree *DT;
+    ProvenanceAnalysis PA;
+
+    /// A flag indicating whether this optimization pass should run.
+    bool Run;
+
+    /// Declarations for ObjC runtime functions, for use in creating calls to
+    /// them. These are initialized lazily to avoid cluttering up the Module
+    /// with unused declarations.
+
+    /// Declaration for objc_storeStrong().
+    Constant *StoreStrongCallee;
+    /// Declaration for objc_retainAutorelease().
+    Constant *RetainAutoreleaseCallee;
+    /// Declaration for objc_retainAutoreleaseReturnValue().
+    Constant *RetainAutoreleaseRVCallee;
+
+    /// The inline asm string to insert between calls and RetainRV calls to make
+    /// the optimization work on targets which need it.
+    const MDString *RetainRVMarker;
+
+    /// The set of inserted objc_storeStrong calls. If at the end of walking the
+    /// function we have found no alloca instructions, these calls can be marked
+    /// "tail".
+    SmallPtrSet<CallInst *, 8> StoreStrongCalls;
+
+    Constant *getStoreStrongCallee(Module *M);
+    Constant *getRetainAutoreleaseCallee(Module *M);
+    Constant *getRetainAutoreleaseRVCallee(Module *M);
+
+    bool ContractAutorelease(Function &F, Instruction *Autorelease,
+                             InstructionClass Class,
+                             SmallPtrSet<Instruction *, 4>
+                               &DependingInstructions,
+                             SmallPtrSet<const BasicBlock *, 4>
+                               &Visited);
+
+    void ContractRelease(Instruction *Release,
+                         inst_iterator &Iter);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool doInitialization(Module &M);
+    virtual bool runOnFunction(Function &F);
+
+  public:
+    static char ID;
+    ObjCARCContract() : FunctionPass(ID) {
+      initializeObjCARCContractPass(*PassRegistry::getPassRegistry());
+    }
+  };
+}
+
+char ObjCARCContract::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCContract,
+                      "objc-arc-contract", "ObjC ARC contraction", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(ObjCARCContract,
+                    "objc-arc-contract", "ObjC ARC contraction", false, false)
+
+Pass *llvm::createObjCARCContractPass() {
+  return new ObjCARCContract();
+}
+
+void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<AliasAnalysis>();
+  AU.addRequired<DominatorTree>();
+  AU.setPreservesCFG();
+}
+
+Constant *ObjCARCContract::getStoreStrongCallee(Module *M) {
+  if (!StoreStrongCallee) {
+    LLVMContext &C = M->getContext();
+    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    Type *I8XX = PointerType::getUnqual(I8X);
+    Type *Params[] = { I8XX, I8X };
+
+    AttributeSet Attr = AttributeSet()
+      .addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                    Attribute::NoUnwind)
+      .addAttribute(M->getContext(), 1, Attribute::NoCapture);
+
+    StoreStrongCallee =
+      M->getOrInsertFunction(
+        "objc_storeStrong",
+        FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
+        Attr);
+  }
+  return StoreStrongCallee;
+}
+
+Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) {
+  if (!RetainAutoreleaseCallee) {
+    LLVMContext &C = M->getContext();
+    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    Type *Params[] = { I8X };
+    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttributeSet Attribute =
+      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    RetainAutoreleaseCallee =
+      M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute);
+  }
+  return RetainAutoreleaseCallee;
+}
+
+Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
+  if (!RetainAutoreleaseRVCallee) {
+    LLVMContext &C = M->getContext();
+    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    Type *Params[] = { I8X };
+    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttributeSet Attribute =
+      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    RetainAutoreleaseRVCallee =
+      M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy,
+                             Attribute);
+  }
+  return RetainAutoreleaseRVCallee;
+}
+
+/// Merge an autorelease with a retain into a fused call.
+bool
+ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
+                                     InstructionClass Class,
+                                     SmallPtrSet<Instruction *, 4>
+                                       &DependingInstructions,
+                                     SmallPtrSet<const BasicBlock *, 4>
+                                       &Visited) {
+  const Value *Arg = GetObjCArg(Autorelease);
+
+  // Check that there are no instructions between the retain and the autorelease
+  // (such as an autorelease_pop) which may change the count.
+  CallInst *Retain = 0;
+  if (Class == IC_AutoreleaseRV)
+    FindDependencies(RetainAutoreleaseRVDep, Arg,
+                     Autorelease->getParent(), Autorelease,
+                     DependingInstructions, Visited, PA);
+  else
+    FindDependencies(RetainAutoreleaseDep, Arg,
+                     Autorelease->getParent(), Autorelease,
+                     DependingInstructions, Visited, PA);
+
+  Visited.clear();
+  if (DependingInstructions.size() != 1) {
+    DependingInstructions.clear();
+    return false;
+  }
+
+  Retain = dyn_cast_or_null<CallInst>(*DependingInstructions.begin());
+  DependingInstructions.clear();
+
+  if (!Retain ||
+      GetBasicInstructionClass(Retain) != IC_Retain ||
+      GetObjCArg(Retain) != Arg)
+    return false;
+
+  Changed = true;
+  ++NumPeeps;
+
+  DEBUG(dbgs() << "ObjCARCContract::ContractAutorelease: Fusing "
+                  "retain/autorelease. Erasing: " << *Autorelease << "\n"
+                  "                                      Old Retain: "
+               << *Retain << "\n");
+
+  if (Class == IC_AutoreleaseRV)
+    Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent()));
+  else
+    Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent()));
+
+  DEBUG(dbgs() << "                                      New Retain: "
+               << *Retain << "\n");
+
+  EraseInstruction(Autorelease);
+  return true;
+}
+
+/// Attempt to merge an objc_release with a store, load, and objc_retain to form
+/// an objc_storeStrong. This can be a little tricky because the instructions
+/// don't always appear in order, and there may be unrelated intervening
+/// instructions.
+void ObjCARCContract::ContractRelease(Instruction *Release,
+                                      inst_iterator &Iter) {
+  LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release));
+  if (!Load || !Load->isSimple()) return;
+
+  // For now, require everything to be in one basic block.
+  BasicBlock *BB = Release->getParent();
+  if (Load->getParent() != BB) return;
+
+  // Walk down to find the store and the release, which may be in either order.
+  BasicBlock::iterator I = Load, End = BB->end();
+  ++I;
+  AliasAnalysis::Location Loc = AA->getLocation(Load);
+  StoreInst *Store = 0;
+  bool SawRelease = false;
+  for (; !Store || !SawRelease; ++I) {
+    if (I == End)
+      return;
+
+    Instruction *Inst = I;
+    if (Inst == Release) {
+      SawRelease = true;
+      continue;
+    }
+
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+
+    // Unrelated retains are harmless.
+    if (IsRetain(Class))
+      continue;
+
+    if (Store) {
+      // The store is the point where we're going to put the objc_storeStrong,
+      // so make sure there are no uses after it.
+      if (CanUse(Inst, Load, PA, Class))
+        return;
+    } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) {
+      // We are moving the load down to the store, so check for anything
+      // else which writes to the memory between the load and the store.
+      Store = dyn_cast<StoreInst>(Inst);
+      if (!Store || !Store->isSimple()) return;
+      if (Store->getPointerOperand() != Loc.Ptr) return;
+    }
+  }
+
+  Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand());
+
+  // Walk up to find the retain.
+  I = Store;
+  BasicBlock::iterator Begin = BB->begin();
+  while (I != Begin && GetBasicInstructionClass(I) != IC_Retain)
+    --I;
+  Instruction *Retain = I;
+  if (GetBasicInstructionClass(Retain) != IC_Retain) return;
+  if (GetObjCArg(Retain) != New) return;
+
+  Changed = true;
+  ++NumStoreStrongs;
+
+  LLVMContext &C = Release->getContext();
+  Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+  Type *I8XX = PointerType::getUnqual(I8X);
+
+  Value *Args[] = { Load->getPointerOperand(), New };
+  if (Args[0]->getType() != I8XX)
+    Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
+  if (Args[1]->getType() != I8X)
+    Args[1] = new BitCastInst(Args[1], I8X, "", Store);
+  CallInst *StoreStrong =
+    CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()),
+                     Args, "", Store);
+  StoreStrong->setDoesNotThrow();
+  StoreStrong->setDebugLoc(Store->getDebugLoc());
+
+  // We can't set the tail flag yet, because we haven't yet determined
+  // whether there are any escaping allocas. Remember this call, so that
+  // we can set the tail flag once we know it's safe.
+  StoreStrongCalls.insert(StoreStrong);
+
+  if (&*Iter == Store) ++Iter;
+  Store->eraseFromParent();
+  Release->eraseFromParent();
+  EraseInstruction(Retain);
+  if (Load->use_empty())
+    Load->eraseFromParent();
+}
+
+bool ObjCARCContract::doInitialization(Module &M) {
+  // If nothing in the Module uses ARC, don't do anything.
+  Run = ModuleHasARC(M);
+  if (!Run)
+    return false;
+
+  // These are initialized lazily.
+  StoreStrongCallee = 0;
+  RetainAutoreleaseCallee = 0;
+  RetainAutoreleaseRVCallee = 0;
+
+  // Initialize RetainRVMarker.
+  RetainRVMarker = 0;
+  if (NamedMDNode *NMD =
+        M.getNamedMetadata("clang.arc.retainAutoreleasedReturnValueMarker"))
+    if (NMD->getNumOperands() == 1) {
+      const MDNode *N = NMD->getOperand(0);
+      if (N->getNumOperands() == 1)
+        if (const MDString *S = dyn_cast<MDString>(N->getOperand(0)))
+          RetainRVMarker = S;
+    }
+
+  return false;
+}
+
+bool ObjCARCContract::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  if (!Run)
+    return false;
+
+  Changed = false;
+  AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+
+  PA.setAA(&getAnalysis<AliasAnalysis>());
+
+  // Track whether it's ok to mark objc_storeStrong calls with the "tail"
+  // keyword. Be conservative if the function has variadic arguments.
+  // It seems that functions which "return twice" are also unsafe for the
+  // "tail" argument, because they are setjmp, which could need to
+  // return to an earlier stack state.
+  bool TailOkForStoreStrongs = !F.isVarArg() &&
+                               !F.callsFunctionThatReturnsTwice();
+
+  // For ObjC library calls which return their argument, replace uses of the
+  // argument with uses of the call return value, if it dominates the use. This
+  // reduces register pressure.
+  SmallPtrSet<Instruction *, 4> DependingInstructions;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+
+    DEBUG(dbgs() << "ObjCARCContract: Visiting: " << *Inst << "\n");
+
+    // Only these library routines return their argument. In particular,
+    // objc_retainBlock does not necessarily return its argument.
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    switch (Class) {
+    case IC_Retain:
+    case IC_FusedRetainAutorelease:
+    case IC_FusedRetainAutoreleaseRV:
+      break;
+    case IC_Autorelease:
+    case IC_AutoreleaseRV:
+      if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited))
+        continue;
+      break;
+    case IC_RetainRV: {
+      // If we're compiling for a target which needs a special inline-asm
+      // marker to do the retainAutoreleasedReturnValue optimization,
+      // insert it now.
+      if (!RetainRVMarker)
+        break;
+      BasicBlock::iterator BBI = Inst;
+      BasicBlock *InstParent = Inst->getParent();
+
+      // Step up to see if the call immediately precedes the RetainRV call.
+      // If it's an invoke, we have to cross a block boundary. And we have
+      // to carefully dodge no-op instructions.
+      do {
+        if (&*BBI == InstParent->begin()) {
+          BasicBlock *Pred = InstParent->getSinglePredecessor();
+          if (!Pred)
+            goto decline_rv_optimization;
+          BBI = Pred->getTerminator();
+          break;
+        }
+        --BBI;
+      } while (IsNoopInstruction(BBI));
+
+      if (&*BBI == GetObjCArg(Inst)) {
+        DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for "
+                        "retainAutoreleasedReturnValue optimization.\n");
+        Changed = true;
+        InlineAsm *IA =
+          InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()),
+                                           /*isVarArg=*/false),
+                         RetainRVMarker->getString(),
+                         /*Constraints=*/"", /*hasSideEffects=*/true);
+        CallInst::Create(IA, "", Inst);
+      }
+    decline_rv_optimization:
+      break;
+    }
+    case IC_InitWeak: {
+      // objc_initWeak(p, null) => *p = null
+      CallInst *CI = cast<CallInst>(Inst);
+      if (IsNullOrUndef(CI->getArgOperand(1))) {
+        Value *Null =
+          ConstantPointerNull::get(cast<PointerType>(CI->getType()));
+        Changed = true;
+        new StoreInst(Null, CI->getArgOperand(0), CI);
+
+        DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
+                     << "                 New = " << *Null << "\n");
+
+        CI->replaceAllUsesWith(Null);
+        CI->eraseFromParent();
+      }
+      continue;
+    }
+    case IC_Release:
+      ContractRelease(Inst, I);
+      continue;
+    case IC_User:
+      // Be conservative if the function has any alloca instructions.
+      // Technically we only care about escaping alloca instructions,
+      // but this is sufficient to handle some interesting cases.
+      if (isa<AllocaInst>(Inst))
+        TailOkForStoreStrongs = false;
+      continue;
+    case IC_IntrinsicUser:
+      // Remove calls to @clang.arc.use(...).
+      Inst->eraseFromParent();
+      continue;
+    default:
+      continue;
+    }
+
+    DEBUG(dbgs() << "ObjCARCContract: Finished List.\n\n");
+
+    // Don't use GetObjCArg because we don't want to look through bitcasts
+    // and such; to do the replacement, the argument must have type i8*.
+    const Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
+    for (;;) {
+      // If we're compiling bugpointed code, don't get in trouble.
+      if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
+        break;
+      // Look through the uses of the pointer.
+      for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+           UI != UE; ) {
+        Use &U = UI.getUse();
+        unsigned OperandNo = UI.getOperandNo();
+        ++UI; // Increment UI now, because we may unlink its element.
+
+        // If the call's return value dominates a use of the call's argument
+        // value, rewrite the use to use the return value. We check for
+        // reachability here because an unreachable call is considered to
+        // trivially dominate itself, which would lead us to rewriting its
+        // argument in terms of its return value, which would lead to
+        // infinite loops in GetObjCArg.
+        if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) {
+          Changed = true;
+          Instruction *Replacement = Inst;
+          Type *UseTy = U.get()->getType();
+          if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
+            // For PHI nodes, insert the bitcast in the predecessor block.
+            unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
+            BasicBlock *BB = PHI->getIncomingBlock(ValNo);
+            if (Replacement->getType() != UseTy)
+              Replacement = new BitCastInst(Replacement, UseTy, "",
+                                            &BB->back());
+            // While we're here, rewrite all edges for this PHI, rather
+            // than just one use at a time, to minimize the number of
+            // bitcasts we emit.
+            for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+              if (PHI->getIncomingBlock(i) == BB) {
+                // Keep the UI iterator valid.
+                if (&PHI->getOperandUse(
+                      PHINode::getOperandNumForIncomingValue(i)) ==
+                    &UI.getUse())
+                  ++UI;
+                PHI->setIncomingValue(i, Replacement);
+              }
+          } else {
+            if (Replacement->getType() != UseTy)
+              Replacement = new BitCastInst(Replacement, UseTy, "",
+                                            cast<Instruction>(U.getUser()));
+            U.set(Replacement);
+          }
+        }
+      }
+
+      // If Arg is a no-op casted pointer, strip one level of casts and iterate.
+      if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
+        Arg = BI->getOperand(0);
+      else if (isa<GEPOperator>(Arg) &&
+               cast<GEPOperator>(Arg)->hasAllZeroIndices())
+        Arg = cast<GEPOperator>(Arg)->getPointerOperand();
+      else if (isa<GlobalAlias>(Arg) &&
+               !cast<GlobalAlias>(Arg)->mayBeOverridden())
+        Arg = cast<GlobalAlias>(Arg)->getAliasee();
+      else
+        break;
+    }
+  }
+
+  // If this function has no escaping allocas or suspicious vararg usage,
+  // objc_storeStrong calls can be marked with the "tail" keyword.
+  if (TailOkForStoreStrongs)
+    for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(),
+         E = StoreStrongCalls.end(); I != E; ++I)
+      (*I)->setTailCall();
+  StoreStrongCalls.clear();
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
new file mode 100644
index 000000000000..39bf8f38735b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -0,0 +1,128 @@
+//===- ObjCARCExpand.cpp - ObjC ARC Optimization --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file deals with early optimizations which perform certain
+/// cleanup operations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc-expand"
+
+#include "ObjCARC.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+  class Module;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
+  /// \brief Early ARC transformations.
+  class ObjCARCExpand : public FunctionPass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool doInitialization(Module &M);
+    virtual bool runOnFunction(Function &F);
+
+    /// A flag indicating whether this optimization pass should run.
+    bool Run;
+
+  public:
+    static char ID;
+    ObjCARCExpand() : FunctionPass(ID) {
+      initializeObjCARCExpandPass(*PassRegistry::getPassRegistry());
+    }
+  };
+}
+
+char ObjCARCExpand::ID = 0;
+INITIALIZE_PASS(ObjCARCExpand,
+                "objc-arc-expand", "ObjC ARC expansion", false, false)
+
+Pass *llvm::createObjCARCExpandPass() {
+  return new ObjCARCExpand();
+}
+
+void ObjCARCExpand::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+}
+
+bool ObjCARCExpand::doInitialization(Module &M) {
+  Run = ModuleHasARC(M);
+  return false;
+}
+
+bool ObjCARCExpand::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  if (!Run)
+    return false;
+
+  bool Changed = false;
+
+  DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName() << "\n");
+
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+    Instruction *Inst = &*I;
+
+    DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
+
+    switch (GetBasicInstructionClass(Inst)) {
+    case IC_Retain:
+    case IC_RetainRV:
+    case IC_Autorelease:
+    case IC_AutoreleaseRV:
+    case IC_FusedRetainAutorelease:
+    case IC_FusedRetainAutoreleaseRV: {
+      // These calls return their argument verbatim, as a low-level
+      // optimization. However, this makes high-level optimizations
+      // harder. Undo any uses of this optimization that the front-end
+      // emitted here. We'll redo them in the contract pass.
+      Changed = true;
+      Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
+      DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst << "\n"
+                      "               New = " << *Value << "\n");
+      Inst->replaceAllUsesWith(Value);
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
+  DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
new file mode 100644
index 000000000000..92d6fc4767c2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -0,0 +1,3026 @@
+//===- ObjCARCOpts.cpp - ObjC ARC Optimization ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// The optimizations performed include elimination of redundant, partially
+/// redundant, and inconsequential reference count operations, elimination of
+/// redundant weak pointer operations, and numerous minor simplifications.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "objc-arc-opts"
+#include "ObjCARC.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARCAliasAnalysis.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+/// \defgroup MiscUtils Miscellaneous utilities that are not ARC specific.
+/// @{
+
+namespace {
+  /// \brief An associative container with fast insertion-order (deterministic)
+  /// iteration over its elements. Plus the special blot operation.
+  template<class KeyT, class ValueT>
+  class MapVector {
+    /// Map keys to indices in Vector.
+    typedef DenseMap<KeyT, size_t> MapTy;
+    MapTy Map;
+
+    typedef std::vector<std::pair<KeyT, ValueT> > VectorTy;
+    /// Keys and values.
+    VectorTy Vector;
+
+  public:
+    typedef typename VectorTy::iterator iterator;
+    typedef typename VectorTy::const_iterator const_iterator;
+    iterator begin() { return Vector.begin(); }
+    iterator end() { return Vector.end(); }
+    const_iterator begin() const { return Vector.begin(); }
+    const_iterator end() const { return Vector.end(); }
+
+#ifdef XDEBUG
+    ~MapVector() {
+      assert(Vector.size() >= Map.size()); // May differ due to blotting.
+      for (typename MapTy::const_iterator I = Map.begin(), E = Map.end();
+           I != E; ++I) {
+        assert(I->second < Vector.size());
+        assert(Vector[I->second].first == I->first);
+      }
+      for (typename VectorTy::const_iterator I = Vector.begin(),
+           E = Vector.end(); I != E; ++I)
+        assert(!I->first ||
+               (Map.count(I->first) &&
+                Map[I->first] == size_t(I - Vector.begin())));
+    }
+#endif
+
+    ValueT &operator[](const KeyT &Arg) {
+      std::pair<typename MapTy::iterator, bool> Pair =
+        Map.insert(std::make_pair(Arg, size_t(0)));
+      if (Pair.second) {
+        size_t Num = Vector.size();
+        Pair.first->second = Num;
+        Vector.push_back(std::make_pair(Arg, ValueT()));
+        return Vector[Num].second;
+      }
+      return Vector[Pair.first->second].second;
+    }
+
+    std::pair<iterator, bool>
+    insert(const std::pair<KeyT, ValueT> &InsertPair) {
+      std::pair<typename MapTy::iterator, bool> Pair =
+        Map.insert(std::make_pair(InsertPair.first, size_t(0)));
+      if (Pair.second) {
+        size_t Num = Vector.size();
+        Pair.first->second = Num;
+        Vector.push_back(InsertPair);
+        return std::make_pair(Vector.begin() + Num, true);
+      }
+      return std::make_pair(Vector.begin() + Pair.first->second, false);
+    }
+
+    const_iterator find(const KeyT &Key) const {
+      typename MapTy::const_iterator It = Map.find(Key);
+      if (It == Map.end()) return Vector.end();
+      return Vector.begin() + It->second;
+    }
+
+    /// This is similar to erase, but instead of removing the element from the
+    /// vector, it just zeros out the key in the vector. This leaves iterators
+    /// intact, but clients must be prepared for zeroed-out keys when iterating.
+    void blot(const KeyT &Key) {
+      typename MapTy::iterator It = Map.find(Key);
+      if (It == Map.end()) return;
+      Vector[It->second].first = KeyT();
+      Map.erase(It);
+    }
+
+    void clear() {
+      Map.clear();
+      Vector.clear();
+    }
+  };
+}
+
+/// @}
+///
+/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
+/// @{
+
+/// \brief This is similar to StripPointerCastsAndObjCCalls but it stops as soon
+/// as it finds a value with multiple uses.
+static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
+  if (Arg->hasOneUse()) {
+    if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
+      return FindSingleUseIdentifiedObject(BC->getOperand(0));
+    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
+      if (GEP->hasAllZeroIndices())
+        return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
+    if (IsForwarding(GetBasicInstructionClass(Arg)))
+      return FindSingleUseIdentifiedObject(
+               cast<CallInst>(Arg)->getArgOperand(0));
+    if (!IsObjCIdentifiedObject(Arg))
+      return 0;
+    return Arg;
+  }
+
+  // If we found an identifiable object but it has multiple uses, but they are
+  // trivial uses, we can still consider this to be a single-use value.
+  if (IsObjCIdentifiedObject(Arg)) {
+    for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+         UI != UE; ++UI) {
+      const User *U = *UI;
+      if (!U->use_empty() || StripPointerCastsAndObjCCalls(U) != Arg)
+         return 0;
+    }
+
+    return Arg;
+  }
+
+  return 0;
+}
+
+/// \brief Test whether the given retainable object pointer escapes.
+///
+/// This differs from regular escape analysis in that a use as an
+/// argument to a call is not considered an escape.
+///
+static bool DoesRetainableObjPtrEscape(const User *Ptr) {
+  DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Target: " << *Ptr << "\n");
+
+  // Walk the def-use chains.
+  SmallVector<const Value *, 4> Worklist;
+  Worklist.push_back(Ptr);
+  // If Ptr has any operands add them as well.
+  for (User::const_op_iterator I = Ptr->op_begin(), E = Ptr->op_end(); I != E;
+       ++I) {
+    Worklist.push_back(*I);
+  }
+
+  // Ensure we do not visit any value twice.
+  SmallPtrSet<const Value *, 8> VisitedSet;
+
+  do {
+    const Value *V = Worklist.pop_back_val();
+
+    DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Visiting: " << *V << "\n");
+
+    for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
+         UI != UE; ++UI) {
+      const User *UUser = *UI;
+
+      DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User: " << *UUser << "\n");
+
+      // Special - Use by a call (callee or argument) is not considered
+      // to be an escape.
+      switch (GetBasicInstructionClass(UUser)) {
+      case IC_StoreWeak:
+      case IC_InitWeak:
+      case IC_StoreStrong:
+      case IC_Autorelease:
+      case IC_AutoreleaseRV: {
+        DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies pointer "
+              "arguments. Pointer Escapes!\n");
+        // These special functions make copies of their pointer arguments.
+        return true;
+      }
+      case IC_IntrinsicUser:
+        // Use by the use intrinsic is not an escape.
+        continue;
+      case IC_User:
+      case IC_None:
+        // Use by an instruction which copies the value is an escape if the
+        // result is an escape.
+        if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) ||
+            isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
+
+          if (VisitedSet.insert(UUser)) {
+            DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies value. "
+                  "Ptr escapes if result escapes. Adding to list.\n");
+            Worklist.push_back(UUser);
+          } else {
+            DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Already visited node."
+                  "\n");
+          }
+          continue;
+        }
+        // Use by a load is not an escape.
+        if (isa<LoadInst>(UUser))
+          continue;
+        // Use by a store is not an escape if the use is the address.
+        if (const StoreInst *SI = dyn_cast<StoreInst>(UUser))
+          if (V != SI->getValueOperand())
+            continue;
+        break;
+      default:
+        // Regular calls and other stuff are not considered escapes.
+        continue;
+      }
+      // Otherwise, conservatively assume an escape.
+      DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Assuming ptr escapes.\n");
+      return true;
+    }
+  } while (!Worklist.empty());
+
+  // No escapes found.
+  DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Ptr does not escape.\n");
+  return false;
+}
+
+/// @}
+///
+/// \defgroup ARCOpt ARC Optimization.
+/// @{
+
+// TODO: On code like this:
+//
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+// stuff_that_cannot_release()
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+//
+// The second retain and autorelease can be deleted.
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimial insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could generalized to be Interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
+
+// TODO: The pointer returned from objc_loadWeakRetained is retained.
+
+// TODO: Delete release+retain pairs (rare).
+
+STATISTIC(NumNoops,       "Number of no-op objc calls eliminated");
+STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
+STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
+STATISTIC(NumRets,        "Number of return value forwarding "
+                          "retain+autoreleaes eliminated");
+STATISTIC(NumRRs,         "Number of retain+release paths eliminated");
+STATISTIC(NumPeeps,       "Number of calls peephole-optimized");
+
+namespace {
+  /// \enum Sequence
+  ///
+  /// \brief A sequence of states that a pointer may go through in which an
+  /// objc_retain and objc_release are actually needed.
+  enum Sequence {
+    S_None,
+    S_Retain,         ///< objc_retain(x).
+    S_CanRelease,     ///< foo(x) -- x could possibly see a ref count decrement.
+    S_Use,            ///< any use of x.
+    S_Stop,           ///< like S_Release, but code motion is stopped.
+    S_Release,        ///< objc_release(x).
+    S_MovableRelease  ///< objc_release(x), !clang.imprecise_release.
+  };
+
+  raw_ostream &operator<<(raw_ostream &OS, const Sequence S)
+    LLVM_ATTRIBUTE_UNUSED;
+  raw_ostream &operator<<(raw_ostream &OS, const Sequence S) {
+    switch (S) {
+    case S_None:
+      return OS << "S_None";
+    case S_Retain:
+      return OS << "S_Retain";
+    case S_CanRelease:
+      return OS << "S_CanRelease";
+    case S_Use:
+      return OS << "S_Use";
+    case S_Release:
+      return OS << "S_Release";
+    case S_MovableRelease:
+      return OS << "S_MovableRelease";
+    case S_Stop:
+      return OS << "S_Stop";
+    }
+    llvm_unreachable("Unknown sequence type.");
+  }
+}
+
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+  // The easy cases.
+  if (A == B)
+    return A;
+  if (A == S_None || B == S_None)
+    return S_None;
+
+  if (A > B) std::swap(A, B);
+  if (TopDown) {
+    // Choose the side which is further along in the sequence.
+    if ((A == S_Retain || A == S_CanRelease) &&
+        (B == S_CanRelease || B == S_Use))
+      return B;
+  } else {
+    // Choose the side which is further along in the sequence.
+    if ((A == S_Use || A == S_CanRelease) &&
+        (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
+      return A;
+    // If both sides are releases, choose the more conservative one.
+    if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+      return A;
+    if (A == S_Release && B == S_MovableRelease)
+      return A;
+  }
+
+  return S_None;
+}
+
+namespace {
+  /// \brief Unidirectional information about either a
+  /// retain-decrement-use-release sequence or release-use-decrement-retain
+  /// reverese sequence.
+  struct RRInfo {
+    /// After an objc_retain, the reference count of the referenced
+    /// object is known to be positive. Similarly, before an objc_release, the
+    /// reference count of the referenced object is known to be positive. If
+    /// there are retain-release pairs in code regions where the retain count
+    /// is known to be positive, they can be eliminated, regardless of any side
+    /// effects between them.
+    ///
+    /// Also, a retain+release pair nested within another retain+release
+    /// pair all on the known same pointer value can be eliminated, regardless
+    /// of any intervening side effects.
+    ///
+    /// KnownSafe is true when either of these conditions is satisfied.
+    bool KnownSafe;
+
+    /// True of the objc_release calls are all marked with the "tail" keyword.
+    bool IsTailCallRelease;
+
+    /// If the Calls are objc_release calls and they all have a
+    /// clang.imprecise_release tag, this is the metadata tag.
+    MDNode *ReleaseMetadata;
+
+    /// For a top-down sequence, the set of objc_retains or
+    /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+    SmallPtrSet<Instruction *, 2> Calls;
+
+    /// The set of optimal insert positions for moving calls in the opposite
+    /// sequence.
+    SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+    RRInfo() :
+      KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0) {}
+
+    void clear();
+  };
+}
+
+void RRInfo::clear() {
+  KnownSafe = false;
+  IsTailCallRelease = false;
+  ReleaseMetadata = 0;
+  Calls.clear();
+  ReverseInsertPts.clear();
+}
+
+namespace {
+  /// \brief This class summarizes several per-pointer runtime properties which
+  /// are propogated through the flow graph.
+  class PtrState {
+    /// True if the reference count is known to be incremented.
+    bool KnownPositiveRefCount;
+
+    /// True of we've seen an opportunity for partial RR elimination, such as
+    /// pushing calls into a CFG triangle or into one side of a CFG diamond.
+    bool Partial;
+
+    /// The current position in the sequence.
+    Sequence Seq : 8;
+
+  public:
+    /// Unidirectional information about the current sequence.
+    ///
+    /// TODO: Encapsulate this better.
+    RRInfo RRI;
+
+    PtrState() : KnownPositiveRefCount(false), Partial(false),
+                 Seq(S_None) {}
+
+    void SetKnownPositiveRefCount() {
+      KnownPositiveRefCount = true;
+    }
+
+    void ClearKnownPositiveRefCount() {
+      KnownPositiveRefCount = false;
+    }
+
+    bool HasKnownPositiveRefCount() const {
+      return KnownPositiveRefCount;
+    }
+
+    void SetSeq(Sequence NewSeq) {
+      Seq = NewSeq;
+    }
+
+    Sequence GetSeq() const {
+      return Seq;
+    }
+
+    void ClearSequenceProgress() {
+      ResetSequenceProgress(S_None);
+    }
+
+    void ResetSequenceProgress(Sequence NewSeq) {
+      Seq = NewSeq;
+      Partial = false;
+      RRI.clear();
+    }
+
+    void Merge(const PtrState &Other, bool TopDown);
+  };
+}
+
+void
+PtrState::Merge(const PtrState &Other, bool TopDown) {
+  Seq = MergeSeqs(Seq, Other.Seq, TopDown);
+  KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
+
+  // If we're not in a sequence (anymore), drop all associated state.
+  if (Seq == S_None) {
+    Partial = false;
+    RRI.clear();
+  } else if (Partial || Other.Partial) {
+    // If we're doing a merge on a path that's previously seen a partial
+    // merge, conservatively drop the sequence, to avoid doing partial
+    // RR elimination. If the branch predicates for the two merge differ,
+    // mixing them is unsafe.
+    ClearSequenceProgress();
+  } else {
+    // Conservatively merge the ReleaseMetadata information.
+    if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
+      RRI.ReleaseMetadata = 0;
+
+    RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
+    RRI.IsTailCallRelease = RRI.IsTailCallRelease &&
+                            Other.RRI.IsTailCallRelease;
+    RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
+
+    // Merge the insert point sets. If there are any differences,
+    // that makes this a partial merge.
+    Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size();
+    for (SmallPtrSet<Instruction *, 2>::const_iterator
+         I = Other.RRI.ReverseInsertPts.begin(),
+         E = Other.RRI.ReverseInsertPts.end(); I != E; ++I)
+      Partial |= RRI.ReverseInsertPts.insert(*I);
+  }
+}
+
+namespace {
+  /// \brief Per-BasicBlock state.
+  class BBState {
+    /// The number of unique control paths from the entry which can reach this
+    /// block.
+    unsigned TopDownPathCount;
+
+    /// The number of unique control paths to exits from this block.
+    unsigned BottomUpPathCount;
+
+    /// A type for PerPtrTopDown and PerPtrBottomUp.
+    typedef MapVector<const Value *, PtrState> MapTy;
+
+    /// The top-down traversal uses this to record information known about a
+    /// pointer at the bottom of each block.
+    MapTy PerPtrTopDown;
+
+    /// The bottom-up traversal uses this to record information known about a
+    /// pointer at the top of each block.
+    MapTy PerPtrBottomUp;
+
+    /// Effective predecessors of the current block ignoring ignorable edges and
+    /// ignored backedges.
+    SmallVector<BasicBlock *, 2> Preds;
+    /// Effective successors of the current block ignoring ignorable edges and
+    /// ignored backedges.
+    SmallVector<BasicBlock *, 2> Succs;
+
+  public:
+    BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
+
+    typedef MapTy::iterator ptr_iterator;
+    typedef MapTy::const_iterator ptr_const_iterator;
+
+    ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
+    ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
+    ptr_const_iterator top_down_ptr_begin() const {
+      return PerPtrTopDown.begin();
+    }
+    ptr_const_iterator top_down_ptr_end() const {
+      return PerPtrTopDown.end();
+    }
+
+    ptr_iterator bottom_up_ptr_begin() { return PerPtrBottomUp.begin(); }
+    ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
+    ptr_const_iterator bottom_up_ptr_begin() const {
+      return PerPtrBottomUp.begin();
+    }
+    ptr_const_iterator bottom_up_ptr_end() const {
+      return PerPtrBottomUp.end();
+    }
+
+    /// Mark this block as being an entry block, which has one path from the
+    /// entry by definition.
+    void SetAsEntry() { TopDownPathCount = 1; }
+
+    /// Mark this block as being an exit block, which has one path to an exit by
+    /// definition.
+    void SetAsExit()  { BottomUpPathCount = 1; }
+
+    PtrState &getPtrTopDownState(const Value *Arg) {
+      return PerPtrTopDown[Arg];
+    }
+
+    PtrState &getPtrBottomUpState(const Value *Arg) {
+      return PerPtrBottomUp[Arg];
+    }
+
+    void clearBottomUpPointers() {
+      PerPtrBottomUp.clear();
+    }
+
+    void clearTopDownPointers() {
+      PerPtrTopDown.clear();
+    }
+
+    void InitFromPred(const BBState &Other);
+    void InitFromSucc(const BBState &Other);
+    void MergePred(const BBState &Other);
+    void MergeSucc(const BBState &Other);
+
+    /// Return the number of possible unique paths from an entry to an exit
+    /// which pass through this block. This is only valid after both the
+    /// top-down and bottom-up traversals are complete.
+    unsigned GetAllPathCount() const {
+      assert(TopDownPathCount != 0);
+      assert(BottomUpPathCount != 0);
+      return TopDownPathCount * BottomUpPathCount;
+    }
+
+    // Specialized CFG utilities.
+    typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
+    edge_iterator pred_begin() { return Preds.begin(); }
+    edge_iterator pred_end() { return Preds.end(); }
+    edge_iterator succ_begin() { return Succs.begin(); }
+    edge_iterator succ_end() { return Succs.end(); }
+
+    void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
+    void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
+
+    bool isExit() const { return Succs.empty(); }
+  };
+}
+
+void BBState::InitFromPred(const BBState &Other) {
+  PerPtrTopDown = Other.PerPtrTopDown;
+  TopDownPathCount = Other.TopDownPathCount;
+}
+
+void BBState::InitFromSucc(const BBState &Other) {
+  PerPtrBottomUp = Other.PerPtrBottomUp;
+  BottomUpPathCount = Other.BottomUpPathCount;
+}
+
+/// The top-down traversal uses this to merge information about predecessors to
+/// form the initial state for a new block.
+void BBState::MergePred(const BBState &Other) {
+  // Other.TopDownPathCount can be 0, in which case it is either dead or a
+  // loop backedge. Loop backedges are special.
+  TopDownPathCount += Other.TopDownPathCount;
+
+  // Check for overflow. If we have overflow, fall back to conservative
+  // behavior.
+  if (TopDownPathCount < Other.TopDownPathCount) {
+    clearTopDownPointers();
+    return;
+  }
+
+  // For each entry in the other set, if our set has an entry with the same key,
+  // merge the entries. Otherwise, copy the entry and merge it with an empty
+  // entry.
+  for (ptr_const_iterator MI = Other.top_down_ptr_begin(),
+       ME = Other.top_down_ptr_end(); MI != ME; ++MI) {
+    std::pair<ptr_iterator, bool> Pair = PerPtrTopDown.insert(*MI);
+    Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+                             /*TopDown=*/true);
+  }
+
+  // For each entry in our set, if the other set doesn't have an entry with the
+  // same key, force it to merge with an empty entry.
+  for (ptr_iterator MI = top_down_ptr_begin(),
+       ME = top_down_ptr_end(); MI != ME; ++MI)
+    if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
+      MI->second.Merge(PtrState(), /*TopDown=*/true);
+}
+
+/// The bottom-up traversal uses this to merge information about successors to
+/// form the initial state for a new block.
+void BBState::MergeSucc(const BBState &Other) {
+  // Other.BottomUpPathCount can be 0, in which case it is either dead or a
+  // loop backedge. Loop backedges are special.
+  BottomUpPathCount += Other.BottomUpPathCount;
+
+  // Check for overflow. If we have overflow, fall back to conservative
+  // behavior.
+  if (BottomUpPathCount < Other.BottomUpPathCount) {
+    clearBottomUpPointers();
+    return;
+  }
+
+  // For each entry in the other set, if our set has an entry with the
+  // same key, merge the entries. Otherwise, copy the entry and merge
+  // it with an empty entry.
+  for (ptr_const_iterator MI = Other.bottom_up_ptr_begin(),
+       ME = Other.bottom_up_ptr_end(); MI != ME; ++MI) {
+    std::pair<ptr_iterator, bool> Pair = PerPtrBottomUp.insert(*MI);
+    Pair.first->second.Merge(Pair.second ? PtrState() : MI->second,
+                             /*TopDown=*/false);
+  }
+
+  // For each entry in our set, if the other set doesn't have an entry
+  // with the same key, force it to merge with an empty entry.
+  for (ptr_iterator MI = bottom_up_ptr_begin(),
+       ME = bottom_up_ptr_end(); MI != ME; ++MI)
+    if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
+      MI->second.Merge(PtrState(), /*TopDown=*/false);
+}
+
+// Only enable ARC Annotations if we are building a debug version of
+// libObjCARCOpts.
+#ifndef NDEBUG
+#define ARC_ANNOTATIONS
+#endif
+
+// Define some macros along the lines of DEBUG and some helper functions to make
+// it cleaner to create annotations in the source code and to no-op when not
+// building in debug mode.
+#ifdef ARC_ANNOTATIONS
+
+#include "llvm/Support/CommandLine.h"
+
+/// Enable/disable ARC sequence annotations.
+static cl::opt<bool>
+EnableARCAnnotations("enable-objc-arc-annotations", cl::init(false));
+
+/// This function appends a unique ARCAnnotationProvenanceSourceMDKind id to an
+/// instruction so that we can track backwards when post processing via the llvm
+/// arc annotation processor tool. If the function is an
+static MDString *AppendMDNodeToSourcePtr(unsigned NodeId,
+                                         Value *Ptr) {
+  MDString *Hash = 0;
+
+  // If pointer is a result of an instruction and it does not have a source
+  // MDNode it, attach a new MDNode onto it. If pointer is a result of
+  // an instruction and does have a source MDNode attached to it, return a
+  // reference to said Node. Otherwise just return 0.
+  if (Instruction *Inst = dyn_cast<Instruction>(Ptr)) {
+    MDNode *Node;
+    if (!(Node = Inst->getMetadata(NodeId))) {
+      // We do not have any node. Generate and attatch the hash MDString to the
+      // instruction.
+
+      // We just use an MDString to ensure that this metadata gets written out
+      // of line at the module level and to provide a very simple format
+      // encoding the information herein. Both of these makes it simpler to
+      // parse the annotations by a simple external program.
+      std::string Str;
+      raw_string_ostream os(Str);
+      os << "(" << Inst->getParent()->getParent()->getName() << ",%"
+         << Inst->getName() << ")";
+
+      Hash = MDString::get(Inst->getContext(), os.str());
+      Inst->setMetadata(NodeId, MDNode::get(Inst->getContext(),Hash));
+    } else {
+      // We have a node. Grab its hash and return it.
+      assert(Node->getNumOperands() == 1 &&
+        "An ARCAnnotationProvenanceSourceMDKind can only have 1 operand.");
+      Hash = cast<MDString>(Node->getOperand(0));
+    }
+  } else if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
+    std::string str;
+    raw_string_ostream os(str);
+    os << "(" << Arg->getParent()->getName() << ",%" << Arg->getName()
+       << ")";
+    Hash = MDString::get(Arg->getContext(), os.str());
+  }
+
+  return Hash;
+}
+
+static std::string SequenceToString(Sequence A) {
+  std::string str;
+  raw_string_ostream os(str);
+  os << A;
+  return os.str();
+}
+
+/// Helper function to change a Sequence into a String object using our overload
+/// for raw_ostream so we only have printing code in one location.
+static MDString *SequenceToMDString(LLVMContext &Context,
+                                    Sequence A) {
+  return MDString::get(Context, SequenceToString(A));
+}
+
+/// A simple function to generate a MDNode which describes the change in state
+/// for Value *Ptr caused by Instruction *Inst.
+static void AppendMDNodeToInstForPtr(unsigned NodeId,
+                                     Instruction *Inst,
+                                     Value *Ptr,
+                                     MDString *PtrSourceMDNodeID,
+                                     Sequence OldSeq,
+                                     Sequence NewSeq) {
+  MDNode *Node = 0;
+  Value *tmp[3] = {PtrSourceMDNodeID,
+                   SequenceToMDString(Inst->getContext(),
+                                      OldSeq),
+                   SequenceToMDString(Inst->getContext(),
+                                      NewSeq)};
+  Node = MDNode::get(Inst->getContext(),
+                     ArrayRef<Value*>(tmp, 3));
+
+  Inst->setMetadata(NodeId, Node);
+}
+
+/// Add to the beginning of the basic block llvm.ptr.annotations which show the
+/// state of a pointer at the entrance to a basic block.
+static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB,
+                                            Value *Ptr, Sequence Seq) {
+  Module *M = BB->getParent()->getParent();
+  LLVMContext &C = M->getContext();
+  Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+  Type *I8XX = PointerType::getUnqual(I8X);
+  Type *Params[] = {I8XX, I8XX};
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(C),
+                                        ArrayRef<Type*>(Params, 2),
+                                        /*isVarArg=*/false);
+  Constant *Callee = M->getOrInsertFunction(Name, FTy);
+
+  IRBuilder<> Builder(BB, BB->getFirstInsertionPt());
+
+  Value *PtrName;
+  StringRef Tmp = Ptr->getName();
+  if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) {
+    Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp,
+                                                         Tmp + "_STR");
+    PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+                                 cast<Constant>(ActualPtrName), Tmp);
+  }
+
+  Value *S;
+  std::string SeqStr = SequenceToString(Seq);
+  if (0 == (S = M->getGlobalVariable(SeqStr, true))) {
+    Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr,
+                                                         SeqStr + "_STR");
+    S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+                           cast<Constant>(ActualPtrName), SeqStr);
+  }
+
+  Builder.CreateCall2(Callee, PtrName, S);
+}
+
+/// Add to the end of the basic block llvm.ptr.annotations which show the state
+/// of the pointer at the bottom of the basic block.
+static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB,
+                                              Value *Ptr, Sequence Seq) {
+  Module *M = BB->getParent()->getParent();
+  LLVMContext &C = M->getContext();
+  Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+  Type *I8XX = PointerType::getUnqual(I8X);
+  Type *Params[] = {I8XX, I8XX};
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(C),
+                                        ArrayRef<Type*>(Params, 2),
+                                        /*isVarArg=*/false);
+  Constant *Callee = M->getOrInsertFunction(Name, FTy);
+
+  IRBuilder<> Builder(BB, llvm::prior(BB->end()));
+
+  Value *PtrName;
+  StringRef Tmp = Ptr->getName();
+  if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) {
+    Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp,
+                                                         Tmp + "_STR");
+    PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+                                 cast<Constant>(ActualPtrName), Tmp);
+  }
+
+  Value *S;
+  std::string SeqStr = SequenceToString(Seq);
+  if (0 == (S = M->getGlobalVariable(SeqStr, true))) {
+    Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr,
+                                                         SeqStr + "_STR");
+    S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage,
+                           cast<Constant>(ActualPtrName), SeqStr);
+  }
+  Builder.CreateCall2(Callee, PtrName, S);
+}
+
+/// Adds a source annotation to pointer and a state change annotation to Inst
+/// referencing the source annotation and the old/new state of pointer.
+static void GenerateARCAnnotation(unsigned InstMDId,
+                                  unsigned PtrMDId,
+                                  Instruction *Inst,
+                                  Value *Ptr,
+                                  Sequence OldSeq,
+                                  Sequence NewSeq) {
+  if (EnableARCAnnotations) {
+    // First generate the source annotation on our pointer. This will return an
+    // MDString* if Ptr actually comes from an instruction implying we can put
+    // in a source annotation. If AppendMDNodeToSourcePtr returns 0 (i.e. NULL),
+    // then we know that our pointer is from an Argument so we put a reference
+    // to the argument number.
+    //
+    // The point of this is to make it easy for the
+    // llvm-arc-annotation-processor tool to cross reference where the source
+    // pointer is in the LLVM IR since the LLVM IR parser does not submit such
+    // information via debug info for backends to use (since why would anyone
+    // need such a thing from LLVM IR besides in non standard cases
+    // [i.e. this]).
+    MDString *SourcePtrMDNode =
+      AppendMDNodeToSourcePtr(PtrMDId, Ptr);
+    AppendMDNodeToInstForPtr(InstMDId, Inst, Ptr, SourcePtrMDNode, OldSeq,
+                             NewSeq);
+  }
+}
+
+// The actual interface for accessing the above functionality is defined via
+// some simple macros which are defined below. We do this so that the user does
+// not need to pass in what metadata id is needed resulting in cleaner code and
+// additionally since it provides an easy way to conditionally no-op all
+// annotation support in a non-debug build.
+
+/// Use this macro to annotate a sequence state change when processing
+/// instructions bottom up,
+#define ANNOTATE_BOTTOMUP(inst, ptr, old, new)                          \
+  GenerateARCAnnotation(ARCAnnotationBottomUpMDKind,                    \
+                        ARCAnnotationProvenanceSourceMDKind, (inst),    \
+                        const_cast<Value*>(ptr), (old), (new))
+/// Use this macro to annotate a sequence state change when processing
+/// instructions top down.
+#define ANNOTATE_TOPDOWN(inst, ptr, old, new)                           \
+  GenerateARCAnnotation(ARCAnnotationTopDownMDKind,                     \
+                        ARCAnnotationProvenanceSourceMDKind, (inst),    \
+                        const_cast<Value*>(ptr), (old), (new))
+
+#define ANNOTATE_BB(_states, _bb, _name, _type, _direction)                   \
+  do {                                                                        \
+  if (EnableARCAnnotations) {                                                 \
+    for(BBState::ptr_const_iterator I = (_states)._direction##_ptr_begin(),   \
+          E = (_states)._direction##_ptr_end(); I != E; ++I) {                \
+      Value *Ptr = const_cast<Value*>(I->first);                              \
+      Sequence Seq = I->second.GetSeq();                                      \
+      GenerateARCBB ## _type ## Annotation(_name, (_bb), Ptr, Seq);           \
+    }                                                                         \
+  }                                                                           \
+} while (0)
+
+#define ANNOTATE_BOTTOMUP_BBSTART(_states, _basicblock) \
+    ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbstart", \
+                Entrance, bottom_up)
+#define ANNOTATE_BOTTOMUP_BBEND(_states, _basicblock) \
+    ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbend", \
+                Terminator, bottom_up)
+#define ANNOTATE_TOPDOWN_BBSTART(_states, _basicblock) \
+    ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbstart", \
+                Entrance, top_down)
+#define ANNOTATE_TOPDOWN_BBEND(_states, _basicblock) \
+    ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbend", \
+                Terminator, top_down)
+
+#else // !ARC_ANNOTATION
+// If annotations are off, noop.
+#define ANNOTATE_BOTTOMUP(inst, ptr, old, new)
+#define ANNOTATE_TOPDOWN(inst, ptr, old, new)
+#define ANNOTATE_BOTTOMUP_BBSTART(states, basicblock)
+#define ANNOTATE_BOTTOMUP_BBEND(states, basicblock)
+#define ANNOTATE_TOPDOWN_BBSTART(states, basicblock)
+#define ANNOTATE_TOPDOWN_BBEND(states, basicblock)
+#endif // !ARC_ANNOTATION
+
+namespace {
+  /// \brief The main ARC optimization pass.
+  class ObjCARCOpt : public FunctionPass {
+    bool Changed;
+    ProvenanceAnalysis PA;
+
+    /// A flag indicating whether this optimization pass should run.
+    bool Run;
+
+    /// Declarations for ObjC runtime functions, for use in creating calls to
+    /// them. These are initialized lazily to avoid cluttering up the Module
+    /// with unused declarations.
+
+    /// Declaration for ObjC runtime function
+    /// objc_retainAutoreleasedReturnValue.
+    Constant *RetainRVCallee;
+    /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
+    Constant *AutoreleaseRVCallee;
+    /// Declaration for ObjC runtime function objc_release.
+    Constant *ReleaseCallee;
+    /// Declaration for ObjC runtime function objc_retain.
+    Constant *RetainCallee;
+    /// Declaration for ObjC runtime function objc_retainBlock.
+    Constant *RetainBlockCallee;
+    /// Declaration for ObjC runtime function objc_autorelease.
+    Constant *AutoreleaseCallee;
+
+    /// Flags which determine whether each of the interesting runtine functions
+    /// is in fact used in the current function.
+    unsigned UsedInThisFunction;
+
+    /// The Metadata Kind for clang.imprecise_release metadata.
+    unsigned ImpreciseReleaseMDKind;
+
+    /// The Metadata Kind for clang.arc.copy_on_escape metadata.
+    unsigned CopyOnEscapeMDKind;
+
+    /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata.
+    unsigned NoObjCARCExceptionsMDKind;
+
+#ifdef ARC_ANNOTATIONS
+    /// The Metadata Kind for llvm.arc.annotation.bottomup metadata.
+    unsigned ARCAnnotationBottomUpMDKind;
+    /// The Metadata Kind for llvm.arc.annotation.topdown metadata.
+    unsigned ARCAnnotationTopDownMDKind;
+    /// The Metadata Kind for llvm.arc.annotation.provenancesource metadata.
+    unsigned ARCAnnotationProvenanceSourceMDKind;
+#endif // ARC_ANNOATIONS
+
+    Constant *getRetainRVCallee(Module *M);
+    Constant *getAutoreleaseRVCallee(Module *M);
+    Constant *getReleaseCallee(Module *M);
+    Constant *getRetainCallee(Module *M);
+    Constant *getRetainBlockCallee(Module *M);
+    Constant *getAutoreleaseCallee(Module *M);
+
+    bool IsRetainBlockOptimizable(const Instruction *Inst);
+
+    void OptimizeRetainCall(Function &F, Instruction *Retain);
+    bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
+    void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
+                                   InstructionClass &Class);
+    bool OptimizeRetainBlockCall(Function &F, Instruction *RetainBlock,
+                                 InstructionClass &Class);
+    void OptimizeIndividualCalls(Function &F);
+
+    void CheckForCFGHazards(const BasicBlock *BB,
+                            DenseMap<const BasicBlock *, BBState> &BBStates,
+                            BBState &MyStates) const;
+    bool VisitInstructionBottomUp(Instruction *Inst,
+                                  BasicBlock *BB,
+                                  MapVector<Value *, RRInfo> &Retains,
+                                  BBState &MyStates);
+    bool VisitBottomUp(BasicBlock *BB,
+                       DenseMap<const BasicBlock *, BBState> &BBStates,
+                       MapVector<Value *, RRInfo> &Retains);
+    bool VisitInstructionTopDown(Instruction *Inst,
+                                 DenseMap<Value *, RRInfo> &Releases,
+                                 BBState &MyStates);
+    bool VisitTopDown(BasicBlock *BB,
+                      DenseMap<const BasicBlock *, BBState> &BBStates,
+                      DenseMap<Value *, RRInfo> &Releases);
+    bool Visit(Function &F,
+               DenseMap<const BasicBlock *, BBState> &BBStates,
+               MapVector<Value *, RRInfo> &Retains,
+               DenseMap<Value *, RRInfo> &Releases);
+
+    void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
+                   MapVector<Value *, RRInfo> &Retains,
+                   DenseMap<Value *, RRInfo> &Releases,
+                   SmallVectorImpl<Instruction *> &DeadInsts,
+                   Module *M);
+
+    bool ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> &BBStates,
+                               MapVector<Value *, RRInfo> &Retains,
+                               DenseMap<Value *, RRInfo> &Releases,
+                               Module *M,
+                               SmallVector<Instruction *, 4> &NewRetains,
+                               SmallVector<Instruction *, 4> &NewReleases,
+                               SmallVector<Instruction *, 8> &DeadInsts,
+                               RRInfo &RetainsToMove,
+                               RRInfo &ReleasesToMove,
+                               Value *Arg,
+                               bool KnownSafe,
+                               bool &AnyPairsCompletelyEliminated);
+
+    bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
+                              MapVector<Value *, RRInfo> &Retains,
+                              DenseMap<Value *, RRInfo> &Releases,
+                              Module *M);
+
+    void OptimizeWeakCalls(Function &F);
+
+    bool OptimizeSequences(Function &F);
+
+    void OptimizeReturns(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool doInitialization(Module &M);
+    virtual bool runOnFunction(Function &F);
+    virtual void releaseMemory();
+
+  public:
+    static char ID;
+    ObjCARCOpt() : FunctionPass(ID) {
+      initializeObjCARCOptPass(*PassRegistry::getPassRegistry());
+    }
+  };
+}
+
+char ObjCARCOpt::ID = 0;
+INITIALIZE_PASS_BEGIN(ObjCARCOpt,
+                      "objc-arc", "ObjC ARC optimization", false, false)
+INITIALIZE_PASS_DEPENDENCY(ObjCARCAliasAnalysis)
+INITIALIZE_PASS_END(ObjCARCOpt,
+                    "objc-arc", "ObjC ARC optimization", false, false)
+
+Pass *llvm::createObjCARCOptPass() {
+  return new ObjCARCOpt();
+}
+
+void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<ObjCARCAliasAnalysis>();
+  AU.addRequired<AliasAnalysis>();
+  // ARC optimization doesn't currently split critical edges.
+  AU.setPreservesCFG();
+}
+
+bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) {
+  // Without the magic metadata tag, we have to assume this might be an
+  // objc_retainBlock call inserted to convert a block pointer to an id,
+  // in which case it really is needed.
+  if (!Inst->getMetadata(CopyOnEscapeMDKind))
+    return false;
+
+  // If the pointer "escapes" (not including being used in a call),
+  // the copy may be needed.
+  if (DoesRetainableObjPtrEscape(Inst))
+    return false;
+
+  // Otherwise, it's not needed.
+  return true;
+}
+
+Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
+  if (!RetainRVCallee) {
+    LLVMContext &C = M->getContext();
+    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    Type *Params[] = { I8X };
+    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttributeSet Attribute =
+      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    RetainRVCallee =
+      M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
+                             Attribute);
+  }
+  return RetainRVCallee;
+}
+
+Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
+  if (!AutoreleaseRVCallee) {
+    LLVMContext &C = M->getContext();
+    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    Type *Params[] = { I8X };
+    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttributeSet Attribute =
+      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    AutoreleaseRVCallee =
+      M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy,
+                             Attribute);
+  }
+  return AutoreleaseRVCallee;
+}
+
+Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
+  if (!ReleaseCallee) {
+    LLVMContext &C = M->getContext();
+    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+    AttributeSet Attribute =
+      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    ReleaseCallee =
+      M->getOrInsertFunction(
+        "objc_release",
+        FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
+        Attribute);
+  }
+  return ReleaseCallee;
+}
+
+Constant *ObjCARCOpt::getRetainCallee(Module *M) {
+  if (!RetainCallee) {
+    LLVMContext &C = M->getContext();
+    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+    AttributeSet Attribute =
+      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    RetainCallee =
+      M->getOrInsertFunction(
+        "objc_retain",
+        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+        Attribute);
+  }
+  return RetainCallee;
+}
+
+Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
+  if (!RetainBlockCallee) {
+    LLVMContext &C = M->getContext();
+    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+    // objc_retainBlock is not nounwind because it calls user copy constructors
+    // which could theoretically throw.
+    RetainBlockCallee =
+      M->getOrInsertFunction(
+        "objc_retainBlock",
+        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+        AttributeSet());
+  }
+  return RetainBlockCallee;
+}
+
+Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
+  if (!AutoreleaseCallee) {
+    LLVMContext &C = M->getContext();
+    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+    AttributeSet Attribute =
+      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    AutoreleaseCallee =
+      M->getOrInsertFunction(
+        "objc_autorelease",
+        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+        Attribute);
+  }
+  return AutoreleaseCallee;
+}
+
+/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
+/// return value.
+void
+ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) {
+  ImmutableCallSite CS(GetObjCArg(Retain));
+  const Instruction *Call = CS.getInstruction();
+  if (!Call) return;
+  if (Call->getParent() != Retain->getParent()) return;
+
+  // Check that the call is next to the retain.
+  BasicBlock::const_iterator I = Call;
+  ++I;
+  while (IsNoopInstruction(I)) ++I;
+  if (&*I != Retain)
+    return;
+
+  // Turn it to an objc_retainAutoreleasedReturnValue..
+  Changed = true;
+  ++NumPeeps;
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming "
+                  "objc_retain => objc_retainAutoreleasedReturnValue"
+                  " since the operand is a return value.\n"
+                  "                                Old: "
+               << *Retain << "\n");
+
+  cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent()));
+
+  DEBUG(dbgs() << "                                New: "
+               << *Retain << "\n");
+}
+
+/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
+/// not a return value.  Or, if it can be paired with an
+/// objc_autoreleaseReturnValue, delete the pair and return true.
+bool
+ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
+  // Check for the argument being from an immediately preceding call or invoke.
+  const Value *Arg = GetObjCArg(RetainRV);
+  ImmutableCallSite CS(Arg);
+  if (const Instruction *Call = CS.getInstruction()) {
+    if (Call->getParent() == RetainRV->getParent()) {
+      BasicBlock::const_iterator I = Call;
+      ++I;
+      while (IsNoopInstruction(I)) ++I;
+      if (&*I == RetainRV)
+        return false;
+    } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+      BasicBlock *RetainRVParent = RetainRV->getParent();
+      if (II->getNormalDest() == RetainRVParent) {
+        BasicBlock::const_iterator I = RetainRVParent->begin();
+        while (IsNoopInstruction(I)) ++I;
+        if (&*I == RetainRV)
+          return false;
+      }
+    }
+  }
+
+  // Check for being preceded by an objc_autoreleaseReturnValue on the same
+  // pointer. In this case, we can delete the pair.
+  BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin();
+  if (I != Begin) {
+    do --I; while (I != Begin && IsNoopInstruction(I));
+    if (GetBasicInstructionClass(I) == IC_AutoreleaseRV &&
+        GetObjCArg(I) == Arg) {
+      Changed = true;
+      ++NumPeeps;
+
+      DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n"
+                   << "                                  Erasing " << *RetainRV
+                   << "\n");
+
+      EraseInstruction(I);
+      EraseInstruction(RetainRV);
+      return true;
+    }
+  }
+
+  // Turn it to a plain objc_retain.
+  Changed = true;
+  ++NumPeeps;
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming "
+                  "objc_retainAutoreleasedReturnValue => "
+                  "objc_retain since the operand is not a return value.\n"
+                  "                                  Old: "
+               << *RetainRV << "\n");
+
+  cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent()));
+
+  DEBUG(dbgs() << "                                  New: "
+               << *RetainRV << "\n");
+
+  return false;
+}
+
+/// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not
+/// used as a return value.
+void
+ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
+                                      InstructionClass &Class) {
+  // Check for a return of the pointer value.
+  const Value *Ptr = GetObjCArg(AutoreleaseRV);
+  SmallVector<const Value *, 2> Users;
+  Users.push_back(Ptr);
+  do {
+    Ptr = Users.pop_back_val();
+    for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end();
+         UI != UE; ++UI) {
+      const User *I = *UI;
+      if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV)
+        return;
+      if (isa<BitCastInst>(I))
+        Users.push_back(I);
+    }
+  } while (!Users.empty());
+
+  Changed = true;
+  ++NumPeeps;
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming "
+                  "objc_autoreleaseReturnValue => "
+                  "objc_autorelease since its operand is not used as a return "
+                  "value.\n"
+                  "                                       Old: "
+               << *AutoreleaseRV << "\n");
+
+  CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
+  AutoreleaseRVCI->
+    setCalledFunction(getAutoreleaseCallee(F.getParent()));
+  AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
+  Class = IC_Autorelease;
+
+  DEBUG(dbgs() << "                                       New: "
+               << *AutoreleaseRV << "\n");
+
+}
+
+// \brief Attempt to strength reduce objc_retainBlock calls to objc_retain
+// calls.
+//
+// Specifically: If an objc_retainBlock call has the copy_on_escape metadata and
+// does not escape (following the rules of block escaping), strength reduce the
+// objc_retainBlock to an objc_retain.
+//
+// TODO: If an objc_retainBlock call is dominated period by a previous
+// objc_retainBlock call, strength reduce the objc_retainBlock to an
+// objc_retain.
+bool
+ObjCARCOpt::OptimizeRetainBlockCall(Function &F, Instruction *Inst,
+                                    InstructionClass &Class) {
+  assert(GetBasicInstructionClass(Inst) == Class);
+  assert(IC_RetainBlock == Class);
+
+  // If we can not optimize Inst, return false.
+  if (!IsRetainBlockOptimizable(Inst))
+    return false;
+
+  CallInst *RetainBlock = cast<CallInst>(Inst);
+  RetainBlock->setCalledFunction(getRetainCallee(F.getParent()));
+  // Remove copy_on_escape metadata.
+  RetainBlock->setMetadata(CopyOnEscapeMDKind, 0);
+  Class = IC_Retain;
+
+  return true;
+}
+
+/// Visit each call, one at a time, and make simplifications without doing any
+/// additional analysis.
+void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
+  // Reset all the flags in preparation for recomputing them.
+  UsedInThisFunction = 0;
+
+  // Visit all objc_* calls in F.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+
+    DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: Class: "
+          << Class << "; " << *Inst << "\n");
+
+    switch (Class) {
+    default: break;
+
+    // Delete no-op casts. These function calls have special semantics, but
+    // the semantics are entirely implemented via lowering in the front-end,
+    // so by the time they reach the optimizer, they are just no-op calls
+    // which return their argument.
+    //
+    // There are gray areas here, as the ability to cast reference-counted
+    // pointers to raw void* and back allows code to break ARC assumptions,
+    // however these are currently considered to be unimportant.
+    case IC_NoopCast:
+      Changed = true;
+      ++NumNoops;
+      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:"
+                   " " << *Inst << "\n");
+      EraseInstruction(Inst);
+      continue;
+
+    // If the pointer-to-weak-pointer is null, it's undefined behavior.
+    case IC_StoreWeak:
+    case IC_LoadWeak:
+    case IC_LoadWeakRetained:
+    case IC_InitWeak:
+    case IC_DestroyWeak: {
+      CallInst *CI = cast<CallInst>(Inst);
+      if (IsNullOrUndef(CI->getArgOperand(0))) {
+        Changed = true;
+        Type *Ty = CI->getArgOperand(0)->getType();
+        new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+                      Constant::getNullValue(Ty),
+                      CI);
+        llvm::Value *NewValue = UndefValue::get(CI->getType());
+        DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null "
+                        "pointer-to-weak-pointer is undefined behavior.\n"
+                        "                                     Old = " << *CI <<
+                        "\n                                     New = " <<
+                        *NewValue << "\n");
+        CI->replaceAllUsesWith(NewValue);
+        CI->eraseFromParent();
+        continue;
+      }
+      break;
+    }
+    case IC_CopyWeak:
+    case IC_MoveWeak: {
+      CallInst *CI = cast<CallInst>(Inst);
+      if (IsNullOrUndef(CI->getArgOperand(0)) ||
+          IsNullOrUndef(CI->getArgOperand(1))) {
+        Changed = true;
+        Type *Ty = CI->getArgOperand(0)->getType();
+        new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+                      Constant::getNullValue(Ty),
+                      CI);
+
+        llvm::Value *NewValue = UndefValue::get(CI->getType());
+        DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null "
+                        "pointer-to-weak-pointer is undefined behavior.\n"
+                        "                                     Old = " << *CI <<
+                        "\n                                     New = " <<
+                        *NewValue << "\n");
+
+        CI->replaceAllUsesWith(NewValue);
+        CI->eraseFromParent();
+        continue;
+      }
+      break;
+    }
+    case IC_RetainBlock:
+      // If we strength reduce an objc_retainBlock to amn objc_retain, continue
+      // onto the objc_retain peephole optimizations. Otherwise break.
+      if (!OptimizeRetainBlockCall(F, Inst, Class))
+        break;
+      // FALLTHROUGH
+    case IC_Retain:
+      OptimizeRetainCall(F, Inst);
+      break;
+    case IC_RetainRV:
+      if (OptimizeRetainRVCall(F, Inst))
+        continue;
+      break;
+    case IC_AutoreleaseRV:
+      OptimizeAutoreleaseRVCall(F, Inst, Class);
+      break;
+    }
+
+    // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
+    if (IsAutorelease(Class) && Inst->use_empty()) {
+      CallInst *Call = cast<CallInst>(Inst);
+      const Value *Arg = Call->getArgOperand(0);
+      Arg = FindSingleUseIdentifiedObject(Arg);
+      if (Arg) {
+        Changed = true;
+        ++NumAutoreleases;
+
+        // Create the declaration lazily.
+        LLVMContext &C = Inst->getContext();
+        CallInst *NewCall =
+          CallInst::Create(getReleaseCallee(F.getParent()),
+                           Call->getArgOperand(0), "", Call);
+        NewCall->setMetadata(ImpreciseReleaseMDKind,
+                             MDNode::get(C, ArrayRef<Value *>()));
+
+        DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing "
+                        "objc_autorelease(x) with objc_release(x) since x is "
+                        "otherwise unused.\n"
+                        "                                     Old: " << *Call <<
+                        "\n                                     New: " <<
+                        *NewCall << "\n");
+
+        EraseInstruction(Call);
+        Inst = NewCall;
+        Class = IC_Release;
+      }
+    }
+
+    // For functions which can never be passed stack arguments, add
+    // a tail keyword.
+    if (IsAlwaysTail(Class)) {
+      Changed = true;
+      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword"
+            " to function since it can never be passed stack args: " << *Inst <<
+            "\n");
+      cast<CallInst>(Inst)->setTailCall();
+    }
+
+    // Ensure that functions that can never have a "tail" keyword due to the
+    // semantics of ARC truly do not do so.
+    if (IsNeverTail(Class)) {
+      Changed = true;
+      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Removing tail "
+            "keyword from function: " << *Inst <<
+            "\n");
+      cast<CallInst>(Inst)->setTailCall(false);
+    }
+
+    // Set nounwind as needed.
+    if (IsNoThrow(Class)) {
+      Changed = true;
+      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw"
+            " class. Setting nounwind on: " << *Inst << "\n");
+      cast<CallInst>(Inst)->setDoesNotThrow();
+    }
+
+    if (!IsNoopOnNull(Class)) {
+      UsedInThisFunction |= 1 << Class;
+      continue;
+    }
+
+    const Value *Arg = GetObjCArg(Inst);
+
+    // ARC calls with null are no-ops. Delete them.
+    if (IsNullOrUndef(Arg)) {
+      Changed = true;
+      ++NumNoops;
+      DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with "
+            " null are no-ops. Erasing: " << *Inst << "\n");
+      EraseInstruction(Inst);
+      continue;
+    }
+
+    // Keep track of which of retain, release, autorelease, and retain_block
+    // are actually present in this function.
+    UsedInThisFunction |= 1 << Class;
+
+    // If Arg is a PHI, and one or more incoming values to the
+    // PHI are null, and the call is control-equivalent to the PHI, and there
+    // are no relevant side effects between the PHI and the call, the call
+    // could be pushed up to just those paths with non-null incoming values.
+    // For now, don't bother splitting critical edges for this.
+    SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist;
+    Worklist.push_back(std::make_pair(Inst, Arg));
+    do {
+      std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val();
+      Inst = Pair.first;
+      Arg = Pair.second;
+
+      const PHINode *PN = dyn_cast<PHINode>(Arg);
+      if (!PN) continue;
+
+      // Determine if the PHI has any null operands, or any incoming
+      // critical edges.
+      bool HasNull = false;
+      bool HasCriticalEdges = false;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        Value *Incoming =
+          StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
+        if (IsNullOrUndef(Incoming))
+          HasNull = true;
+        else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back())
+                   .getNumSuccessors() != 1) {
+          HasCriticalEdges = true;
+          break;
+        }
+      }
+      // If we have null operands and no critical edges, optimize.
+      if (!HasCriticalEdges && HasNull) {
+        SmallPtrSet<Instruction *, 4> DependingInstructions;
+        SmallPtrSet<const BasicBlock *, 4> Visited;
+
+        // Check that there is nothing that cares about the reference
+        // count between the call and the phi.
+        switch (Class) {
+        case IC_Retain:
+        case IC_RetainBlock:
+          // These can always be moved up.
+          break;
+        case IC_Release:
+          // These can't be moved across things that care about the retain
+          // count.
+          FindDependencies(NeedsPositiveRetainCount, Arg,
+                           Inst->getParent(), Inst,
+                           DependingInstructions, Visited, PA);
+          break;
+        case IC_Autorelease:
+          // These can't be moved across autorelease pool scope boundaries.
+          FindDependencies(AutoreleasePoolBoundary, Arg,
+                           Inst->getParent(), Inst,
+                           DependingInstructions, Visited, PA);
+          break;
+        case IC_RetainRV:
+        case IC_AutoreleaseRV:
+          // Don't move these; the RV optimization depends on the autoreleaseRV
+          // being tail called, and the retainRV being immediately after a call
+          // (which might still happen if we get lucky with codegen layout, but
+          // it's not worth taking the chance).
+          continue;
+        default:
+          llvm_unreachable("Invalid dependence flavor");
+        }
+
+        if (DependingInstructions.size() == 1 &&
+            *DependingInstructions.begin() == PN) {
+          Changed = true;
+          ++NumPartialNoops;
+          // Clone the call into each predecessor that has a non-null value.
+          CallInst *CInst = cast<CallInst>(Inst);
+          Type *ParamTy = CInst->getArgOperand(0)->getType();
+          for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+            Value *Incoming =
+              StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
+            if (!IsNullOrUndef(Incoming)) {
+              CallInst *Clone = cast<CallInst>(CInst->clone());
+              Value *Op = PN->getIncomingValue(i);
+              Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
+              if (Op->getType() != ParamTy)
+                Op = new BitCastInst(Op, ParamTy, "", InsertPos);
+              Clone->setArgOperand(0, Op);
+              Clone->insertBefore(InsertPos);
+
+              DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Cloning "
+                           << *CInst << "\n"
+                           "                                     And inserting "
+                           "clone at " << *InsertPos << "\n");
+              Worklist.push_back(std::make_pair(Clone, Incoming));
+            }
+          }
+          // Erase the original call.
+          DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
+          EraseInstruction(CInst);
+          continue;
+        }
+      }
+    } while (!Worklist.empty());
+  }
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished List.\n");
+}
+
+/// Check for critical edges, loop boundaries, irreducible control flow, or
+/// other CFG structures where moving code across the edge would result in it
+/// being executed more.
+void
+ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
+                               DenseMap<const BasicBlock *, BBState> &BBStates,
+                               BBState &MyStates) const {
+  // If any top-down local-use or possible-dec has a succ which is earlier in
+  // the sequence, forget it.
+  for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(),
+       E = MyStates.top_down_ptr_end(); I != E; ++I)
+    switch (I->second.GetSeq()) {
+    default: break;
+    case S_Use: {
+      const Value *Arg = I->first;
+      const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+      bool SomeSuccHasSame = false;
+      bool AllSuccsHaveSame = true;
+      PtrState &S = I->second;
+      succ_const_iterator SI(TI), SE(TI, false);
+
+      for (; SI != SE; ++SI) {
+        Sequence SuccSSeq = S_None;
+        bool SuccSRRIKnownSafe = false;
+        // If VisitBottomUp has pointer information for this successor, take
+        // what we know about it.
+        DenseMap<const BasicBlock *, BBState>::iterator BBI =
+          BBStates.find(*SI);
+        assert(BBI != BBStates.end());
+        const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+        SuccSSeq = SuccS.GetSeq();
+        SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
+        switch (SuccSSeq) {
+        case S_None:
+        case S_CanRelease: {
+          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
+            S.ClearSequenceProgress();
+            break;
+          }
+          continue;
+        }
+        case S_Use:
+          SomeSuccHasSame = true;
+          break;
+        case S_Stop:
+        case S_Release:
+        case S_MovableRelease:
+          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+            AllSuccsHaveSame = false;
+          break;
+        case S_Retain:
+          llvm_unreachable("bottom-up pointer in retain state!");
+        }
+      }
+      // If the state at the other end of any of the successor edges
+      // matches the current state, require all edges to match. This
+      // guards against loops in the middle of a sequence.
+      if (SomeSuccHasSame && !AllSuccsHaveSame)
+        S.ClearSequenceProgress();
+      break;
+    }
+    case S_CanRelease: {
+      const Value *Arg = I->first;
+      const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
+      bool SomeSuccHasSame = false;
+      bool AllSuccsHaveSame = true;
+      PtrState &S = I->second;
+      succ_const_iterator SI(TI), SE(TI, false);
+
+      for (; SI != SE; ++SI) {
+        Sequence SuccSSeq = S_None;
+        bool SuccSRRIKnownSafe = false;
+        // If VisitBottomUp has pointer information for this successor, take
+        // what we know about it.
+        DenseMap<const BasicBlock *, BBState>::iterator BBI =
+          BBStates.find(*SI);
+        assert(BBI != BBStates.end());
+        const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+        SuccSSeq = SuccS.GetSeq();
+        SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
+        switch (SuccSSeq) {
+        case S_None: {
+          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
+            S.ClearSequenceProgress();
+            break;
+          }
+          continue;
+        }
+        case S_CanRelease:
+          SomeSuccHasSame = true;
+          break;
+        case S_Stop:
+        case S_Release:
+        case S_MovableRelease:
+        case S_Use:
+          if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+            AllSuccsHaveSame = false;
+          break;
+        case S_Retain:
+          llvm_unreachable("bottom-up pointer in retain state!");
+        }
+      }
+      // If the state at the other end of any of the successor edges
+      // matches the current state, require all edges to match. This
+      // guards against loops in the middle of a sequence.
+      if (SomeSuccHasSame && !AllSuccsHaveSame)
+        S.ClearSequenceProgress();
+      break;
+    }
+    }
+}
+
+bool
+ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
+                                     BasicBlock *BB,
+                                     MapVector<Value *, RRInfo> &Retains,
+                                     BBState &MyStates) {
+  bool NestingDetected = false;
+  InstructionClass Class = GetInstructionClass(Inst);
+  const Value *Arg = 0;
+
+  switch (Class) {
+  case IC_Release: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrBottomUpState(Arg);
+
+    // If we see two releases in a row on the same pointer. If so, make
+    // a note, and we'll cicle back to revisit it after we've
+    // hopefully eliminated the second release, which may allow us to
+    // eliminate the first release too.
+    // Theoretically we could implement removal of nested retain+release
+    // pairs by making PtrState hold a stack of states, but this is
+    // simple and avoids adding overhead for the non-nested case.
+    if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) {
+      DEBUG(dbgs() << "ObjCARCOpt::VisitInstructionBottomUp: Found nested "
+                      "releases (i.e. a release pair)\n");
+      NestingDetected = true;
+    }
+
+    MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+    Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
+    ANNOTATE_BOTTOMUP(Inst, Arg, S.GetSeq(), NewSeq);
+    S.ResetSequenceProgress(NewSeq);
+    S.RRI.ReleaseMetadata = ReleaseMetadata;
+    S.RRI.KnownSafe = S.HasKnownPositiveRefCount();
+    S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+    S.RRI.Calls.insert(Inst);
+    S.SetKnownPositiveRefCount();
+    break;
+  }
+  case IC_RetainBlock:
+    // In OptimizeIndividualCalls, we have strength reduced all optimizable
+    // objc_retainBlocks to objc_retains. Thus at this point any
+    // objc_retainBlocks that we see are not optimizable.
+    break;
+  case IC_Retain:
+  case IC_RetainRV: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrBottomUpState(Arg);
+    S.SetKnownPositiveRefCount();
+
+    Sequence OldSeq = S.GetSeq();
+    switch (OldSeq) {
+    case S_Stop:
+    case S_Release:
+    case S_MovableRelease:
+    case S_Use:
+      S.RRI.ReverseInsertPts.clear();
+      // FALL THROUGH
+    case S_CanRelease:
+      // Don't do retain+release tracking for IC_RetainRV, because it's
+      // better to let it remain as the first instruction after a call.
+      if (Class != IC_RetainRV)
+        Retains[Inst] = S.RRI;
+      S.ClearSequenceProgress();
+      break;
+    case S_None:
+      break;
+    case S_Retain:
+      llvm_unreachable("bottom-up pointer in retain state!");
+    }
+    ANNOTATE_BOTTOMUP(Inst, Arg, OldSeq, S.GetSeq());
+    return NestingDetected;
+  }
+  case IC_AutoreleasepoolPop:
+    // Conservatively, clear MyStates for all known pointers.
+    MyStates.clearBottomUpPointers();
+    return NestingDetected;
+  case IC_AutoreleasepoolPush:
+  case IC_None:
+    // These are irrelevant.
+    return NestingDetected;
+  default:
+    break;
+  }
+
+  // Consider any other possible effects of this instruction on each
+  // pointer being tracked.
+  for (BBState::ptr_iterator MI = MyStates.bottom_up_ptr_begin(),
+       ME = MyStates.bottom_up_ptr_end(); MI != ME; ++MI) {
+    const Value *Ptr = MI->first;
+    if (Ptr == Arg)
+      continue; // Handled above.
+    PtrState &S = MI->second;
+    Sequence Seq = S.GetSeq();
+
+    // Check for possible releases.
+    if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
+      S.ClearKnownPositiveRefCount();
+      switch (Seq) {
+      case S_Use:
+        S.SetSeq(S_CanRelease);
+        ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S.GetSeq());
+        continue;
+      case S_CanRelease:
+      case S_Release:
+      case S_MovableRelease:
+      case S_Stop:
+      case S_None:
+        break;
+      case S_Retain:
+        llvm_unreachable("bottom-up pointer in retain state!");
+      }
+    }
+
+    // Check for possible direct uses.
+    switch (Seq) {
+    case S_Release:
+    case S_MovableRelease:
+      if (CanUse(Inst, Ptr, PA, Class)) {
+        assert(S.RRI.ReverseInsertPts.empty());
+        // If this is an invoke instruction, we're scanning it as part of
+        // one of its successor blocks, since we can't insert code after it
+        // in its own block, and we don't want to split critical edges.
+        if (isa<InvokeInst>(Inst))
+          S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+        else
+          S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+        S.SetSeq(S_Use);
+        ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use);
+      } else if (Seq == S_Release && IsUser(Class)) {
+        // Non-movable releases depend on any possible objc pointer use.
+        S.SetSeq(S_Stop);
+        ANNOTATE_BOTTOMUP(Inst, Ptr, S_Release, S_Stop);
+        assert(S.RRI.ReverseInsertPts.empty());
+        // As above; handle invoke specially.
+        if (isa<InvokeInst>(Inst))
+          S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+        else
+          S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+      }
+      break;
+    case S_Stop:
+      if (CanUse(Inst, Ptr, PA, Class)) {
+        S.SetSeq(S_Use);
+        ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use);
+      }
+      break;
+    case S_CanRelease:
+    case S_Use:
+    case S_None:
+      break;
+    case S_Retain:
+      llvm_unreachable("bottom-up pointer in retain state!");
+    }
+  }
+
+  return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+                          DenseMap<const BasicBlock *, BBState> &BBStates,
+                          MapVector<Value *, RRInfo> &Retains) {
+  bool NestingDetected = false;
+  BBState &MyStates = BBStates[BB];
+
+  // Merge the states from each successor to compute the initial state
+  // for the current block.
+  BBState::edge_iterator SI(MyStates.succ_begin()),
+                         SE(MyStates.succ_end());
+  if (SI != SE) {
+    const BasicBlock *Succ = *SI;
+    DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+    assert(I != BBStates.end());
+    MyStates.InitFromSucc(I->second);
+    ++SI;
+    for (; SI != SE; ++SI) {
+      Succ = *SI;
+      I = BBStates.find(Succ);
+      assert(I != BBStates.end());
+      MyStates.MergeSucc(I->second);
+    }
+  }
+
+  // If ARC Annotations are enabled, output the current state of pointers at the
+  // bottom of the basic block.
+  ANNOTATE_BOTTOMUP_BBEND(MyStates, BB);
+
+  // Visit all the instructions, bottom-up.
+  for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+    Instruction *Inst = llvm::prior(I);
+
+    // Invoke instructions are visited as part of their successors (below).
+    if (isa<InvokeInst>(Inst))
+      continue;
+
+    DEBUG(dbgs() << "ObjCARCOpt::VisitButtonUp: Visiting " << *Inst << "\n");
+
+    NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
+  }
+
+  // If there's a predecessor with an invoke, visit the invoke as if it were
+  // part of this block, since we can't insert code after an invoke in its own
+  // block, and we don't want to split critical edges.
+  for (BBState::edge_iterator PI(MyStates.pred_begin()),
+       PE(MyStates.pred_end()); PI != PE; ++PI) {
+    BasicBlock *Pred = *PI;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+      NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
+  }
+
+  // If ARC Annotations are enabled, output the current state of pointers at the
+  // top of the basic block.
+  ANNOTATE_BOTTOMUP_BBSTART(MyStates, BB);
+
+  return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
+                                    DenseMap<Value *, RRInfo> &Releases,
+                                    BBState &MyStates) {
+  bool NestingDetected = false;
+  InstructionClass Class = GetInstructionClass(Inst);
+  const Value *Arg = 0;
+
+  switch (Class) {
+  case IC_RetainBlock:
+    // In OptimizeIndividualCalls, we have strength reduced all optimizable
+    // objc_retainBlocks to objc_retains. Thus at this point any
+    // objc_retainBlocks that we see are not optimizable.
+    break;
+  case IC_Retain:
+  case IC_RetainRV: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrTopDownState(Arg);
+
+    // Don't do retain+release tracking for IC_RetainRV, because it's
+    // better to let it remain as the first instruction after a call.
+    if (Class != IC_RetainRV) {
+      // If we see two retains in a row on the same pointer. If so, make
+      // a note, and we'll cicle back to revisit it after we've
+      // hopefully eliminated the second retain, which may allow us to
+      // eliminate the first retain too.
+      // Theoretically we could implement removal of nested retain+release
+      // pairs by making PtrState hold a stack of states, but this is
+      // simple and avoids adding overhead for the non-nested case.
+      if (S.GetSeq() == S_Retain)
+        NestingDetected = true;
+
+      ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_Retain);
+      S.ResetSequenceProgress(S_Retain);
+      S.RRI.KnownSafe = S.HasKnownPositiveRefCount();
+      S.RRI.Calls.insert(Inst);
+    }
+
+    S.SetKnownPositiveRefCount();
+
+    // A retain can be a potential use; procede to the generic checking
+    // code below.
+    break;
+  }
+  case IC_Release: {
+    Arg = GetObjCArg(Inst);
+
+    PtrState &S = MyStates.getPtrTopDownState(Arg);
+    S.ClearKnownPositiveRefCount();
+
+    switch (S.GetSeq()) {
+    case S_Retain:
+    case S_CanRelease:
+      S.RRI.ReverseInsertPts.clear();
+      // FALL THROUGH
+    case S_Use:
+      S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
+      S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
+      Releases[Inst] = S.RRI;
+      ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_None);
+      S.ClearSequenceProgress();
+      break;
+    case S_None:
+      break;
+    case S_Stop:
+    case S_Release:
+    case S_MovableRelease:
+      llvm_unreachable("top-down pointer in release state!");
+    }
+    break;
+  }
+  case IC_AutoreleasepoolPop:
+    // Conservatively, clear MyStates for all known pointers.
+    MyStates.clearTopDownPointers();
+    return NestingDetected;
+  case IC_AutoreleasepoolPush:
+  case IC_None:
+    // These are irrelevant.
+    return NestingDetected;
+  default:
+    break;
+  }
+
+  // Consider any other possible effects of this instruction on each
+  // pointer being tracked.
+  for (BBState::ptr_iterator MI = MyStates.top_down_ptr_begin(),
+       ME = MyStates.top_down_ptr_end(); MI != ME; ++MI) {
+    const Value *Ptr = MI->first;
+    if (Ptr == Arg)
+      continue; // Handled above.
+    PtrState &S = MI->second;
+    Sequence Seq = S.GetSeq();
+
+    // Check for possible releases.
+    if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
+      S.ClearKnownPositiveRefCount();
+      switch (Seq) {
+      case S_Retain:
+        S.SetSeq(S_CanRelease);
+        ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_CanRelease);
+        assert(S.RRI.ReverseInsertPts.empty());
+        S.RRI.ReverseInsertPts.insert(Inst);
+
+        // One call can't cause a transition from S_Retain to S_CanRelease
+        // and S_CanRelease to S_Use. If we've made the first transition,
+        // we're done.
+        continue;
+      case S_Use:
+      case S_CanRelease:
+      case S_None:
+        break;
+      case S_Stop:
+      case S_Release:
+      case S_MovableRelease:
+        llvm_unreachable("top-down pointer in release state!");
+      }
+    }
+
+    // Check for possible direct uses.
+    switch (Seq) {
+    case S_CanRelease:
+      if (CanUse(Inst, Ptr, PA, Class)) {
+        S.SetSeq(S_Use);
+        ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_Use);
+      }
+      break;
+    case S_Retain:
+    case S_Use:
+    case S_None:
+      break;
+    case S_Stop:
+    case S_Release:
+    case S_MovableRelease:
+      llvm_unreachable("top-down pointer in release state!");
+    }
+  }
+
+  return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitTopDown(BasicBlock *BB,
+                         DenseMap<const BasicBlock *, BBState> &BBStates,
+                         DenseMap<Value *, RRInfo> &Releases) {
+  bool NestingDetected = false;
+  BBState &MyStates = BBStates[BB];
+
+  // Merge the states from each predecessor to compute the initial state
+  // for the current block.
+  BBState::edge_iterator PI(MyStates.pred_begin()),
+                         PE(MyStates.pred_end());
+  if (PI != PE) {
+    const BasicBlock *Pred = *PI;
+    DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
+    assert(I != BBStates.end());
+    MyStates.InitFromPred(I->second);
+    ++PI;
+    for (; PI != PE; ++PI) {
+      Pred = *PI;
+      I = BBStates.find(Pred);
+      assert(I != BBStates.end());
+      MyStates.MergePred(I->second);
+    }
+  }
+
+  // If ARC Annotations are enabled, output the current state of pointers at the
+  // top of the basic block.
+  ANNOTATE_TOPDOWN_BBSTART(MyStates, BB);
+
+  // Visit all the instructions, top-down.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    Instruction *Inst = I;
+
+    DEBUG(dbgs() << "ObjCARCOpt::VisitTopDown: Visiting " << *Inst << "\n");
+
+    NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates);
+  }
+
+  // If ARC Annotations are enabled, output the current state of pointers at the
+  // bottom of the basic block.
+  ANNOTATE_TOPDOWN_BBEND(MyStates, BB);
+
+  CheckForCFGHazards(BB, BBStates, MyStates);
+  return NestingDetected;
+}
+
+static void
+ComputePostOrders(Function &F,
+                  SmallVectorImpl<BasicBlock *> &PostOrder,
+                  SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
+                  unsigned NoObjCARCExceptionsMDKind,
+                  DenseMap<const BasicBlock *, BBState> &BBStates) {
+  /// The visited set, for doing DFS walks.
+  SmallPtrSet<BasicBlock *, 16> Visited;
+
+  // Do DFS, computing the PostOrder.
+  SmallPtrSet<BasicBlock *, 16> OnStack;
+  SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
+
+  // Functions always have exactly one entry block, and we don't have
+  // any other block that we treat like an entry block.
+  BasicBlock *EntryBB = &F.getEntryBlock();
+  BBState &MyStates = BBStates[EntryBB];
+  MyStates.SetAsEntry();
+  TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
+  SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
+  Visited.insert(EntryBB);
+  OnStack.insert(EntryBB);
+  do {
+  dfs_next_succ:
+    BasicBlock *CurrBB = SuccStack.back().first;
+    TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
+    succ_iterator SE(TI, false);
+
+    while (SuccStack.back().second != SE) {
+      BasicBlock *SuccBB = *SuccStack.back().second++;
+      if (Visited.insert(SuccBB)) {
+        TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
+        SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
+        BBStates[CurrBB].addSucc(SuccBB);
+        BBState &SuccStates = BBStates[SuccBB];
+        SuccStates.addPred(CurrBB);
+        OnStack.insert(SuccBB);
+        goto dfs_next_succ;
+      }
+
+      if (!OnStack.count(SuccBB)) {
+        BBStates[CurrBB].addSucc(SuccBB);
+        BBStates[SuccBB].addPred(CurrBB);
+      }
+    }
+    OnStack.erase(CurrBB);
+    PostOrder.push_back(CurrBB);
+    SuccStack.pop_back();
+  } while (!SuccStack.empty());
+
+  Visited.clear();
+
+  // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
+  // Functions may have many exits, and there also blocks which we treat
+  // as exits due to ignored edges.
+  SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    BasicBlock *ExitBB = I;
+    BBState &MyStates = BBStates[ExitBB];
+    if (!MyStates.isExit())
+      continue;
+
+    MyStates.SetAsExit();
+
+    PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin()));
+    Visited.insert(ExitBB);
+    while (!PredStack.empty()) {
+    reverse_dfs_next_succ:
+      BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
+      while (PredStack.back().second != PE) {
+        BasicBlock *BB = *PredStack.back().second++;
+        if (Visited.insert(BB)) {
+          PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
+          goto reverse_dfs_next_succ;
+        }
+      }
+      ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
+    }
+  }
+}
+
+// Visit the function both top-down and bottom-up.
+bool
+ObjCARCOpt::Visit(Function &F,
+                  DenseMap<const BasicBlock *, BBState> &BBStates,
+                  MapVector<Value *, RRInfo> &Retains,
+                  DenseMap<Value *, RRInfo> &Releases) {
+
+  // Use reverse-postorder traversals, because we magically know that loops
+  // will be well behaved, i.e. they won't repeatedly call retain on a single
+  // pointer without doing a release. We can't use the ReversePostOrderTraversal
+  // class here because we want the reverse-CFG postorder to consider each
+  // function exit point, and we want to ignore selected cycle edges.
+  SmallVector<BasicBlock *, 16> PostOrder;
+  SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
+  ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
+                    NoObjCARCExceptionsMDKind,
+                    BBStates);
+
+  // Use reverse-postorder on the reverse CFG for bottom-up.
+  bool BottomUpNestingDetected = false;
+  for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
+       ReverseCFGPostOrder.rbegin(), E = ReverseCFGPostOrder.rend();
+       I != E; ++I)
+    BottomUpNestingDetected |= VisitBottomUp(*I, BBStates, Retains);
+
+  // Use reverse-postorder for top-down.
+  bool TopDownNestingDetected = false;
+  for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
+       PostOrder.rbegin(), E = PostOrder.rend();
+       I != E; ++I)
+    TopDownNestingDetected |= VisitTopDown(*I, BBStates, Releases);
+
+  return TopDownNestingDetected && BottomUpNestingDetected;
+}
+
+/// Move the calls in RetainsToMove and ReleasesToMove.
+void ObjCARCOpt::MoveCalls(Value *Arg,
+                           RRInfo &RetainsToMove,
+                           RRInfo &ReleasesToMove,
+                           MapVector<Value *, RRInfo> &Retains,
+                           DenseMap<Value *, RRInfo> &Releases,
+                           SmallVectorImpl<Instruction *> &DeadInsts,
+                           Module *M) {
+  Type *ArgTy = Arg->getType();
+  Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
+
+  // Insert the new retain and release calls.
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       PI = ReleasesToMove.ReverseInsertPts.begin(),
+       PE = ReleasesToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+    Instruction *InsertPt = *PI;
+    Value *MyArg = ArgTy == ParamTy ? Arg :
+                   new BitCastInst(Arg, ParamTy, "", InsertPt);
+    CallInst *Call =
+      CallInst::Create(getRetainCallee(M), MyArg, "", InsertPt);
+    Call->setDoesNotThrow();
+    Call->setTailCall();
+
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Release: " << *Call
+                 << "\n"
+                    "                       At insertion point: " << *InsertPt
+                 << "\n");
+  }
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       PI = RetainsToMove.ReverseInsertPts.begin(),
+       PE = RetainsToMove.ReverseInsertPts.end(); PI != PE; ++PI) {
+    Instruction *InsertPt = *PI;
+    Value *MyArg = ArgTy == ParamTy ? Arg :
+                   new BitCastInst(Arg, ParamTy, "", InsertPt);
+    CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg,
+                                      "", InsertPt);
+    // Attach a clang.imprecise_release metadata tag, if appropriate.
+    if (MDNode *M = ReleasesToMove.ReleaseMetadata)
+      Call->setMetadata(ImpreciseReleaseMDKind, M);
+    Call->setDoesNotThrow();
+    if (ReleasesToMove.IsTailCallRelease)
+      Call->setTailCall();
+
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Retain: " << *Call
+                 << "\n"
+                    "                       At insertion point: " << *InsertPt
+                 << "\n");
+  }
+
+  // Delete the original retain and release calls.
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       AI = RetainsToMove.Calls.begin(),
+       AE = RetainsToMove.Calls.end(); AI != AE; ++AI) {
+    Instruction *OrigRetain = *AI;
+    Retains.blot(OrigRetain);
+    DeadInsts.push_back(OrigRetain);
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting retain: " << *OrigRetain <<
+                    "\n");
+  }
+  for (SmallPtrSet<Instruction *, 2>::const_iterator
+       AI = ReleasesToMove.Calls.begin(),
+       AE = ReleasesToMove.Calls.end(); AI != AE; ++AI) {
+    Instruction *OrigRelease = *AI;
+    Releases.erase(OrigRelease);
+    DeadInsts.push_back(OrigRelease);
+    DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting release: " << *OrigRelease
+                 << "\n");
+  }
+}
+
+bool
+ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
+                                    &BBStates,
+                                  MapVector<Value *, RRInfo> &Retains,
+                                  DenseMap<Value *, RRInfo> &Releases,
+                                  Module *M,
+                                  SmallVector<Instruction *, 4> &NewRetains,
+                                  SmallVector<Instruction *, 4> &NewReleases,
+                                  SmallVector<Instruction *, 8> &DeadInsts,
+                                  RRInfo &RetainsToMove,
+                                  RRInfo &ReleasesToMove,
+                                  Value *Arg,
+                                  bool KnownSafe,
+                                  bool &AnyPairsCompletelyEliminated) {
+  // If a pair happens in a region where it is known that the reference count
+  // is already incremented, we can similarly ignore possible decrements.
+  bool KnownSafeTD = true, KnownSafeBU = true;
+
+  // Connect the dots between the top-down-collected RetainsToMove and
+  // bottom-up-collected ReleasesToMove to form sets of related calls.
+  // This is an iterative process so that we connect multiple releases
+  // to multiple retains if needed.
+  unsigned OldDelta = 0;
+  unsigned NewDelta = 0;
+  unsigned OldCount = 0;
+  unsigned NewCount = 0;
+  bool FirstRelease = true;
+  for (;;) {
+    for (SmallVectorImpl<Instruction *>::const_iterator
+           NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) {
+      Instruction *NewRetain = *NI;
+      MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain);
+      assert(It != Retains.end());
+      const RRInfo &NewRetainRRI = It->second;
+      KnownSafeTD &= NewRetainRRI.KnownSafe;
+      for (SmallPtrSet<Instruction *, 2>::const_iterator
+             LI = NewRetainRRI.Calls.begin(),
+             LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
+        Instruction *NewRetainRelease = *LI;
+        DenseMap<Value *, RRInfo>::const_iterator Jt =
+          Releases.find(NewRetainRelease);
+        if (Jt == Releases.end())
+          return false;
+        const RRInfo &NewRetainReleaseRRI = Jt->second;
+        assert(NewRetainReleaseRRI.Calls.count(NewRetain));
+        if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
+          OldDelta -=
+            BBStates[NewRetainRelease->getParent()].GetAllPathCount();
+
+          // Merge the ReleaseMetadata and IsTailCallRelease values.
+          if (FirstRelease) {
+            ReleasesToMove.ReleaseMetadata =
+              NewRetainReleaseRRI.ReleaseMetadata;
+            ReleasesToMove.IsTailCallRelease =
+              NewRetainReleaseRRI.IsTailCallRelease;
+            FirstRelease = false;
+          } else {
+            if (ReleasesToMove.ReleaseMetadata !=
+                NewRetainReleaseRRI.ReleaseMetadata)
+              ReleasesToMove.ReleaseMetadata = 0;
+            if (ReleasesToMove.IsTailCallRelease !=
+                NewRetainReleaseRRI.IsTailCallRelease)
+              ReleasesToMove.IsTailCallRelease = false;
+          }
+
+          // Collect the optimal insertion points.
+          if (!KnownSafe)
+            for (SmallPtrSet<Instruction *, 2>::const_iterator
+                   RI = NewRetainReleaseRRI.ReverseInsertPts.begin(),
+                   RE = NewRetainReleaseRRI.ReverseInsertPts.end();
+                 RI != RE; ++RI) {
+              Instruction *RIP = *RI;
+              if (ReleasesToMove.ReverseInsertPts.insert(RIP))
+                NewDelta -= BBStates[RIP->getParent()].GetAllPathCount();
+            }
+          NewReleases.push_back(NewRetainRelease);
+        }
+      }
+    }
+    NewRetains.clear();
+    if (NewReleases.empty()) break;
+
+    // Back the other way.
+    for (SmallVectorImpl<Instruction *>::const_iterator
+           NI = NewReleases.begin(), NE = NewReleases.end(); NI != NE; ++NI) {
+      Instruction *NewRelease = *NI;
+      DenseMap<Value *, RRInfo>::const_iterator It =
+        Releases.find(NewRelease);
+      assert(It != Releases.end());
+      const RRInfo &NewReleaseRRI = It->second;
+      KnownSafeBU &= NewReleaseRRI.KnownSafe;
+      for (SmallPtrSet<Instruction *, 2>::const_iterator
+             LI = NewReleaseRRI.Calls.begin(),
+             LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
+        Instruction *NewReleaseRetain = *LI;
+        MapVector<Value *, RRInfo>::const_iterator Jt =
+          Retains.find(NewReleaseRetain);
+        if (Jt == Retains.end())
+          return false;
+        const RRInfo &NewReleaseRetainRRI = Jt->second;
+        assert(NewReleaseRetainRRI.Calls.count(NewRelease));
+        if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
+          unsigned PathCount =
+            BBStates[NewReleaseRetain->getParent()].GetAllPathCount();
+          OldDelta += PathCount;
+          OldCount += PathCount;
+
+          // Collect the optimal insertion points.
+          if (!KnownSafe)
+            for (SmallPtrSet<Instruction *, 2>::const_iterator
+                   RI = NewReleaseRetainRRI.ReverseInsertPts.begin(),
+                   RE = NewReleaseRetainRRI.ReverseInsertPts.end();
+                 RI != RE; ++RI) {
+              Instruction *RIP = *RI;
+              if (RetainsToMove.ReverseInsertPts.insert(RIP)) {
+                PathCount = BBStates[RIP->getParent()].GetAllPathCount();
+                NewDelta += PathCount;
+                NewCount += PathCount;
+              }
+            }
+          NewRetains.push_back(NewReleaseRetain);
+        }
+      }
+    }
+    NewReleases.clear();
+    if (NewRetains.empty()) break;
+  }
+
+  // If the pointer is known incremented or nested, we can safely delete the
+  // pair regardless of what's between them.
+  if (KnownSafeTD || KnownSafeBU) {
+    RetainsToMove.ReverseInsertPts.clear();
+    ReleasesToMove.ReverseInsertPts.clear();
+    NewCount = 0;
+  } else {
+    // Determine whether the new insertion points we computed preserve the
+    // balance of retain and release calls through the program.
+    // TODO: If the fully aggressive solution isn't valid, try to find a
+    // less aggressive solution which is.
+    if (NewDelta != 0)
+      return false;
+  }
+
+  // Determine whether the original call points are balanced in the retain and
+  // release calls through the program. If not, conservatively don't touch
+  // them.
+  // TODO: It's theoretically possible to do code motion in this case, as
+  // long as the existing imbalances are maintained.
+  if (OldDelta != 0)
+    return false;
+
+  Changed = true;
+  assert(OldCount != 0 && "Unreachable code?");
+  NumRRs += OldCount - NewCount;
+  // Set to true if we completely removed any RR pairs.
+  AnyPairsCompletelyEliminated = NewCount == 0;
+
+  // We can move calls!
+  return true;
+}
+
+/// Identify pairings between the retains and releases, and delete and/or move
+/// them.
+bool
+ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
+                                   &BBStates,
+                                 MapVector<Value *, RRInfo> &Retains,
+                                 DenseMap<Value *, RRInfo> &Releases,
+                                 Module *M) {
+  bool AnyPairsCompletelyEliminated = false;
+  RRInfo RetainsToMove;
+  RRInfo ReleasesToMove;
+  SmallVector<Instruction *, 4> NewRetains;
+  SmallVector<Instruction *, 4> NewReleases;
+  SmallVector<Instruction *, 8> DeadInsts;
+
+  // Visit each retain.
+  for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
+       E = Retains.end(); I != E; ++I) {
+    Value *V = I->first;
+    if (!V) continue; // blotted
+
+    Instruction *Retain = cast<Instruction>(V);
+
+    DEBUG(dbgs() << "ObjCARCOpt::PerformCodePlacement: Visiting: " << *Retain
+          << "\n");
+
+    Value *Arg = GetObjCArg(Retain);
+
+    // If the object being released is in static or stack storage, we know it's
+    // not being managed by ObjC reference counting, so we can delete pairs
+    // regardless of what possible decrements or uses lie between them.
+    bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
+
+    // A constant pointer can't be pointing to an object on the heap. It may
+    // be reference-counted, but it won't be deleted.
+    if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
+      if (const GlobalVariable *GV =
+            dyn_cast<GlobalVariable>(
+              StripPointerCastsAndObjCCalls(LI->getPointerOperand())))
+        if (GV->isConstant())
+          KnownSafe = true;
+
+    // Connect the dots between the top-down-collected RetainsToMove and
+    // bottom-up-collected ReleasesToMove to form sets of related calls.
+    NewRetains.push_back(Retain);
+    bool PerformMoveCalls =
+      ConnectTDBUTraversals(BBStates, Retains, Releases, M, NewRetains,
+                            NewReleases, DeadInsts, RetainsToMove,
+                            ReleasesToMove, Arg, KnownSafe,
+                            AnyPairsCompletelyEliminated);
+
+#ifdef ARC_ANNOTATIONS
+    // Do not move calls if ARC annotations are requested. If we were to move
+    // calls in this case, we would not be able
+    PerformMoveCalls = PerformMoveCalls && !EnableARCAnnotations;
+#endif // ARC_ANNOTATIONS
+
+    if (PerformMoveCalls) {
+      // Ok, everything checks out and we're all set. Let's move/delete some
+      // code!
+      MoveCalls(Arg, RetainsToMove, ReleasesToMove,
+                Retains, Releases, DeadInsts, M);
+    }
+
+    // Clean up state for next retain.
+    NewReleases.clear();
+    NewRetains.clear();
+    RetainsToMove.clear();
+    ReleasesToMove.clear();
+  }
+
+  // Now that we're done moving everything, we can delete the newly dead
+  // instructions, as we no longer need them as insert points.
+  while (!DeadInsts.empty())
+    EraseInstruction(DeadInsts.pop_back_val());
+
+  return AnyPairsCompletelyEliminated;
+}
+
+/// Weak pointer optimizations.
+void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
+  // First, do memdep-style RLE and S2L optimizations. We can't use memdep
+  // itself because it uses AliasAnalysis and we need to do provenance
+  // queries instead.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+
+    DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst <<
+          "\n");
+
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained)
+      continue;
+
+    // Delete objc_loadWeak calls with no users.
+    if (Class == IC_LoadWeak && Inst->use_empty()) {
+      Inst->eraseFromParent();
+      continue;
+    }
+
+    // TODO: For now, just look for an earlier available version of this value
+    // within the same block. Theoretically, we could do memdep-style non-local
+    // analysis too, but that would want caching. A better approach would be to
+    // use the technique that EarlyCSE uses.
+    inst_iterator Current = llvm::prior(I);
+    BasicBlock *CurrentBB = Current.getBasicBlockIterator();
+    for (BasicBlock::iterator B = CurrentBB->begin(),
+                              J = Current.getInstructionIterator();
+         J != B; --J) {
+      Instruction *EarlierInst = &*llvm::prior(J);
+      InstructionClass EarlierClass = GetInstructionClass(EarlierInst);
+      switch (EarlierClass) {
+      case IC_LoadWeak:
+      case IC_LoadWeakRetained: {
+        // If this is loading from the same pointer, replace this load's value
+        // with that one.
+        CallInst *Call = cast<CallInst>(Inst);
+        CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+        Value *Arg = Call->getArgOperand(0);
+        Value *EarlierArg = EarlierCall->getArgOperand(0);
+        switch (PA.getAA()->alias(Arg, EarlierArg)) {
+        case AliasAnalysis::MustAlias:
+          Changed = true;
+          // If the load has a builtin retain, insert a plain retain for it.
+          if (Class == IC_LoadWeakRetained) {
+            CallInst *CI =
+              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+                               "", Call);
+            CI->setTailCall();
+          }
+          // Zap the fully redundant load.
+          Call->replaceAllUsesWith(EarlierCall);
+          Call->eraseFromParent();
+          goto clobbered;
+        case AliasAnalysis::MayAlias:
+        case AliasAnalysis::PartialAlias:
+          goto clobbered;
+        case AliasAnalysis::NoAlias:
+          break;
+        }
+        break;
+      }
+      case IC_StoreWeak:
+      case IC_InitWeak: {
+        // If this is storing to the same pointer and has the same size etc.
+        // replace this load's value with the stored value.
+        CallInst *Call = cast<CallInst>(Inst);
+        CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+        Value *Arg = Call->getArgOperand(0);
+        Value *EarlierArg = EarlierCall->getArgOperand(0);
+        switch (PA.getAA()->alias(Arg, EarlierArg)) {
+        case AliasAnalysis::MustAlias:
+          Changed = true;
+          // If the load has a builtin retain, insert a plain retain for it.
+          if (Class == IC_LoadWeakRetained) {
+            CallInst *CI =
+              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
+                               "", Call);
+            CI->setTailCall();
+          }
+          // Zap the fully redundant load.
+          Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
+          Call->eraseFromParent();
+          goto clobbered;
+        case AliasAnalysis::MayAlias:
+        case AliasAnalysis::PartialAlias:
+          goto clobbered;
+        case AliasAnalysis::NoAlias:
+          break;
+        }
+        break;
+      }
+      case IC_MoveWeak:
+      case IC_CopyWeak:
+        // TOOD: Grab the copied value.
+        goto clobbered;
+      case IC_AutoreleasepoolPush:
+      case IC_None:
+      case IC_IntrinsicUser:
+      case IC_User:
+        // Weak pointers are only modified through the weak entry points
+        // (and arbitrary calls, which could call the weak entry points).
+        break;
+      default:
+        // Anything else could modify the weak pointer.
+        goto clobbered;
+      }
+    }
+  clobbered:;
+  }
+
+  // Then, for each destroyWeak with an alloca operand, check to see if
+  // the alloca and all its users can be zapped.
+  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+    Instruction *Inst = &*I++;
+    InstructionClass Class = GetBasicInstructionClass(Inst);
+    if (Class != IC_DestroyWeak)
+      continue;
+
+    CallInst *Call = cast<CallInst>(Inst);
+    Value *Arg = Call->getArgOperand(0);
+    if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ++UI) {
+        const Instruction *UserInst = cast<Instruction>(*UI);
+        switch (GetBasicInstructionClass(UserInst)) {
+        case IC_InitWeak:
+        case IC_StoreWeak:
+        case IC_DestroyWeak:
+          continue;
+        default:
+          goto done;
+        }
+      }
+      Changed = true;
+      for (Value::use_iterator UI = Alloca->use_begin(),
+           UE = Alloca->use_end(); UI != UE; ) {
+        CallInst *UserInst = cast<CallInst>(*UI++);
+        switch (GetBasicInstructionClass(UserInst)) {
+        case IC_InitWeak:
+        case IC_StoreWeak:
+          // These functions return their second argument.
+          UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
+          break;
+        case IC_DestroyWeak:
+          // No return value.
+          break;
+        default:
+          llvm_unreachable("alloca really is used!");
+        }
+        UserInst->eraseFromParent();
+      }
+      Alloca->eraseFromParent();
+    done:;
+    }
+  }
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n");
+
+}
+
+/// Identify program paths which execute sequences of retains and releases which
+/// can be eliminated.
+bool ObjCARCOpt::OptimizeSequences(Function &F) {
+  /// Releases, Retains - These are used to store the results of the main flow
+  /// analysis. These use Value* as the key instead of Instruction* so that the
+  /// map stays valid when we get around to rewriting code and calls get
+  /// replaced by arguments.
+  DenseMap<Value *, RRInfo> Releases;
+  MapVector<Value *, RRInfo> Retains;
+
+  /// This is used during the traversal of the function to track the
+  /// states for each identified object at each block.
+  DenseMap<const BasicBlock *, BBState> BBStates;
+
+  // Analyze the CFG of the function, and all instructions.
+  bool NestingDetected = Visit(F, BBStates, Retains, Releases);
+
+  // Transform.
+  return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) &&
+         NestingDetected;
+}
+
+/// Check if there is a dependent call earlier that does not have anything in
+/// between the Retain and the call that can affect the reference count of their
+/// shared pointer argument. Note that Retain need not be in BB.
+static bool
+HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain,
+                             SmallPtrSet<Instruction *, 4> &DepInsts,
+                             SmallPtrSet<const BasicBlock *, 4> &Visited,
+                             ProvenanceAnalysis &PA) {
+  FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
+                   DepInsts, Visited, PA);
+  if (DepInsts.size() != 1)
+    return false;
+
+  CallInst *Call =
+    dyn_cast_or_null<CallInst>(*DepInsts.begin());
+
+  // Check that the pointer is the return value of the call.
+  if (!Call || Arg != Call)
+    return false;
+
+  // Check that the call is a regular call.
+  InstructionClass Class = GetBasicInstructionClass(Call);
+  if (Class != IC_CallOrUser && Class != IC_Call)
+    return false;
+
+  return true;
+}
+
+/// Find a dependent retain that precedes the given autorelease for which there
+/// is nothing in between the two instructions that can affect the ref count of
+/// Arg.
+static CallInst *
+FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
+                                  Instruction *Autorelease,
+                                  SmallPtrSet<Instruction *, 4> &DepInsts,
+                                  SmallPtrSet<const BasicBlock *, 4> &Visited,
+                                  ProvenanceAnalysis &PA) {
+  FindDependencies(CanChangeRetainCount, Arg,
+                   BB, Autorelease, DepInsts, Visited, PA);
+  if (DepInsts.size() != 1)
+    return 0;
+  
+  CallInst *Retain =
+    dyn_cast_or_null<CallInst>(*DepInsts.begin());
+  
+  // Check that we found a retain with the same argument.
+  if (!Retain ||
+      !IsRetain(GetBasicInstructionClass(Retain)) ||
+      GetObjCArg(Retain) != Arg) {
+    return 0;
+  }
+  
+  return Retain;
+}
+
+/// Look for an ``autorelease'' instruction dependent on Arg such that there are
+/// no instructions dependent on Arg that need a positive ref count in between
+/// the autorelease and the ret.
+static CallInst *
+FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
+                                       ReturnInst *Ret,
+                                       SmallPtrSet<Instruction *, 4> &DepInsts,
+                                       SmallPtrSet<const BasicBlock *, 4> &V,
+                                       ProvenanceAnalysis &PA) {
+  FindDependencies(NeedsPositiveRetainCount, Arg,
+                   BB, Ret, DepInsts, V, PA);
+  if (DepInsts.size() != 1)
+    return 0;
+  
+  CallInst *Autorelease =
+    dyn_cast_or_null<CallInst>(*DepInsts.begin());
+  if (!Autorelease)
+    return 0;
+  InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease);
+  if (!IsAutorelease(AutoreleaseClass))
+    return 0;
+  if (GetObjCArg(Autorelease) != Arg)
+    return 0;
+  
+  return Autorelease;
+}
+
+/// Look for this pattern:
+/// \code
+///    %call = call i8* @something(...)
+///    %2 = call i8* @objc_retain(i8* %call)
+///    %3 = call i8* @objc_autorelease(i8* %2)
+///    ret i8* %3
+/// \endcode
+/// And delete the retain and autorelease.
+void ObjCARCOpt::OptimizeReturns(Function &F) {
+  if (!F.getReturnType()->isPointerTy())
+    return;
+
+  SmallPtrSet<Instruction *, 4> DependingInstructions;
+  SmallPtrSet<const BasicBlock *, 4> Visited;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    BasicBlock *BB = FI;
+    ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back());
+
+    DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n");
+
+    if (!Ret)
+      continue;
+    
+    const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0));
+    
+    // Look for an ``autorelease'' instruction that is a predecssor of Ret and
+    // dependent on Arg such that there are no instructions dependent on Arg
+    // that need a positive ref count in between the autorelease and Ret.
+    CallInst *Autorelease =
+      FindPredecessorAutoreleaseWithSafePath(Arg, BB, Ret,
+                                             DependingInstructions, Visited,
+                                             PA);
+    if (Autorelease) {
+      DependingInstructions.clear();
+      Visited.clear();
+      
+      CallInst *Retain =
+        FindPredecessorRetainWithSafePath(Arg, BB, Autorelease,
+                                          DependingInstructions, Visited, PA);
+      if (Retain) {
+        DependingInstructions.clear();
+        Visited.clear();
+        
+        // Check that there is nothing that can affect the reference count
+        // between the retain and the call.  Note that Retain need not be in BB.
+        if (HasSafePathToPredecessorCall(Arg, Retain, DependingInstructions,
+                                         Visited, PA)) {
+          // If so, we can zap the retain and autorelease.
+          Changed = true;
+          ++NumRets;
+          DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain
+                       << "\n                             Erasing: "
+                       << *Autorelease << "\n");
+          EraseInstruction(Retain);
+          EraseInstruction(Autorelease);
+        }
+      }
+    }
+    
+    DependingInstructions.clear();
+    Visited.clear();
+  }
+
+  DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n");
+
+}
+
+bool ObjCARCOpt::doInitialization(Module &M) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  Run = ModuleHasARC(M);
+  if (!Run)
+    return false;
+
+  // Identify the imprecise release metadata kind.
+  ImpreciseReleaseMDKind =
+    M.getContext().getMDKindID("clang.imprecise_release");
+  CopyOnEscapeMDKind =
+    M.getContext().getMDKindID("clang.arc.copy_on_escape");
+  NoObjCARCExceptionsMDKind =
+    M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions");
+#ifdef ARC_ANNOTATIONS
+  ARCAnnotationBottomUpMDKind =
+    M.getContext().getMDKindID("llvm.arc.annotation.bottomup");
+  ARCAnnotationTopDownMDKind =
+    M.getContext().getMDKindID("llvm.arc.annotation.topdown");
+  ARCAnnotationProvenanceSourceMDKind =
+    M.getContext().getMDKindID("llvm.arc.annotation.provenancesource");
+#endif // ARC_ANNOTATIONS
+
+  // Intuitively, objc_retain and others are nocapture, however in practice
+  // they are not, because they return their argument value. And objc_release
+  // calls finalizers which can have arbitrary side effects.
+
+  // These are initialized lazily.
+  RetainRVCallee = 0;
+  AutoreleaseRVCallee = 0;
+  ReleaseCallee = 0;
+  RetainCallee = 0;
+  RetainBlockCallee = 0;
+  AutoreleaseCallee = 0;
+
+  return false;
+}
+
+bool ObjCARCOpt::runOnFunction(Function &F) {
+  if (!EnableARCOpts)
+    return false;
+
+  // If nothing in the Module uses ARC, don't do anything.
+  if (!Run)
+    return false;
+
+  Changed = false;
+
+  DEBUG(dbgs() << "ObjCARCOpt: Visiting Function: " << F.getName() << "\n");
+
+  PA.setAA(&getAnalysis<AliasAnalysis>());
+
+  // This pass performs several distinct transformations. As a compile-time aid
+  // when compiling code that isn't ObjC, skip these if the relevant ObjC
+  // library functions aren't declared.
+
+  // Preliminary optimizations. This also computs UsedInThisFunction.
+  OptimizeIndividualCalls(F);
+
+  // Optimizations for weak pointers.
+  if (UsedInThisFunction & ((1 << IC_LoadWeak) |
+                            (1 << IC_LoadWeakRetained) |
+                            (1 << IC_StoreWeak) |
+                            (1 << IC_InitWeak) |
+                            (1 << IC_CopyWeak) |
+                            (1 << IC_MoveWeak) |
+                            (1 << IC_DestroyWeak)))
+    OptimizeWeakCalls(F);
+
+  // Optimizations for retain+release pairs.
+  if (UsedInThisFunction & ((1 << IC_Retain) |
+                            (1 << IC_RetainRV) |
+                            (1 << IC_RetainBlock)))
+    if (UsedInThisFunction & (1 << IC_Release))
+      // Run OptimizeSequences until it either stops making changes or
+      // no retain+release pair nesting is detected.
+      while (OptimizeSequences(F)) {}
+
+  // Optimizations if objc_autorelease is used.
+  if (UsedInThisFunction & ((1 << IC_Autorelease) |
+                            (1 << IC_AutoreleaseRV)))
+    OptimizeReturns(F);
+
+  DEBUG(dbgs() << "\n");
+
+  return Changed;
+}
+
+void ObjCARCOpt::releaseMemory() {
+  PA.clear();
+}
+
+/// @}
+///
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
new file mode 100644
index 000000000000..03e12d4fd763
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
@@ -0,0 +1,252 @@
+//===- ObjCARCUtil.cpp - ObjC ARC Optimization --------*- mode: c++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines several utility functions used by various ARC
+/// optimizations which are IMHO too big to be in a header file.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm/IR/Intrinsics.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS,
+                                       const InstructionClass Class) {
+  switch (Class) {
+  case IC_Retain:
+    return OS << "IC_Retain";
+  case IC_RetainRV:
+    return OS << "IC_RetainRV";
+  case IC_RetainBlock:
+    return OS << "IC_RetainBlock";
+  case IC_Release:
+    return OS << "IC_Release";
+  case IC_Autorelease:
+    return OS << "IC_Autorelease";
+  case IC_AutoreleaseRV:
+    return OS << "IC_AutoreleaseRV";
+  case IC_AutoreleasepoolPush:
+    return OS << "IC_AutoreleasepoolPush";
+  case IC_AutoreleasepoolPop:
+    return OS << "IC_AutoreleasepoolPop";
+  case IC_NoopCast:
+    return OS << "IC_NoopCast";
+  case IC_FusedRetainAutorelease:
+    return OS << "IC_FusedRetainAutorelease";
+  case IC_FusedRetainAutoreleaseRV:
+    return OS << "IC_FusedRetainAutoreleaseRV";
+  case IC_LoadWeakRetained:
+    return OS << "IC_LoadWeakRetained";
+  case IC_StoreWeak:
+    return OS << "IC_StoreWeak";
+  case IC_InitWeak:
+    return OS << "IC_InitWeak";
+  case IC_LoadWeak:
+    return OS << "IC_LoadWeak";
+  case IC_MoveWeak:
+    return OS << "IC_MoveWeak";
+  case IC_CopyWeak:
+    return OS << "IC_CopyWeak";
+  case IC_DestroyWeak:
+    return OS << "IC_DestroyWeak";
+  case IC_StoreStrong:
+    return OS << "IC_StoreStrong";
+  case IC_CallOrUser:
+    return OS << "IC_CallOrUser";
+  case IC_Call:
+    return OS << "IC_Call";
+  case IC_User:
+    return OS << "IC_User";
+  case IC_IntrinsicUser:
+    return OS << "IC_IntrinsicUser";
+  case IC_None:
+    return OS << "IC_None";
+  }
+  llvm_unreachable("Unknown instruction class!");
+}
+
+InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) {
+  Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+
+  // No (mandatory) arguments.
+  if (AI == AE)
+    return StringSwitch<InstructionClass>(F->getName())
+      .Case("objc_autoreleasePoolPush",  IC_AutoreleasepoolPush)
+      .Case("clang.arc.use", IC_IntrinsicUser)
+      .Default(IC_CallOrUser);
+
+  // One argument.
+  const Argument *A0 = AI++;
+  if (AI == AE)
+    // Argument is a pointer.
+    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) {
+      Type *ETy = PTy->getElementType();
+      // Argument is i8*.
+      if (ETy->isIntegerTy(8))
+        return StringSwitch<InstructionClass>(F->getName())
+          .Case("objc_retain",                IC_Retain)
+          .Case("objc_retainAutoreleasedReturnValue", IC_RetainRV)
+          .Case("objc_retainBlock",           IC_RetainBlock)
+          .Case("objc_release",               IC_Release)
+          .Case("objc_autorelease",           IC_Autorelease)
+          .Case("objc_autoreleaseReturnValue", IC_AutoreleaseRV)
+          .Case("objc_autoreleasePoolPop",    IC_AutoreleasepoolPop)
+          .Case("objc_retainedObject",        IC_NoopCast)
+          .Case("objc_unretainedObject",      IC_NoopCast)
+          .Case("objc_unretainedPointer",     IC_NoopCast)
+          .Case("objc_retain_autorelease",    IC_FusedRetainAutorelease)
+          .Case("objc_retainAutorelease",     IC_FusedRetainAutorelease)
+          .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV)
+          .Default(IC_CallOrUser);
+
+      // Argument is i8**
+      if (PointerType *Pte = dyn_cast<PointerType>(ETy))
+        if (Pte->getElementType()->isIntegerTy(8))
+          return StringSwitch<InstructionClass>(F->getName())
+            .Case("objc_loadWeakRetained",      IC_LoadWeakRetained)
+            .Case("objc_loadWeak",              IC_LoadWeak)
+            .Case("objc_destroyWeak",           IC_DestroyWeak)
+            .Default(IC_CallOrUser);
+    }
+
+  // Two arguments, first is i8**.
+  const Argument *A1 = AI++;
+  if (AI == AE)
+    if (PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
+      if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
+        if (Pte->getElementType()->isIntegerTy(8))
+          if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
+            Type *ETy1 = PTy1->getElementType();
+            // Second argument is i8*
+            if (ETy1->isIntegerTy(8))
+              return StringSwitch<InstructionClass>(F->getName())
+                .Case("objc_storeWeak",             IC_StoreWeak)
+                .Case("objc_initWeak",              IC_InitWeak)
+                .Case("objc_storeStrong",           IC_StoreStrong)
+                .Default(IC_CallOrUser);
+            // Second argument is i8**.
+            if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
+              if (Pte1->getElementType()->isIntegerTy(8))
+                return StringSwitch<InstructionClass>(F->getName())
+                  .Case("objc_moveWeak",              IC_MoveWeak)
+                  .Case("objc_copyWeak",              IC_CopyWeak)
+                  // Ignore annotation calls. This is important to stop the
+                  // optimizer from treating annotations as uses which would
+                  // make the state of the pointers they are attempting to
+                  // elucidate to be incorrect.
+                  .Case("llvm.arc.annotation.topdown.bbstart", IC_None)
+                  .Case("llvm.arc.annotation.topdown.bbend", IC_None)
+                  .Case("llvm.arc.annotation.bottomup.bbstart", IC_None)
+                  .Case("llvm.arc.annotation.bottomup.bbend", IC_None)
+                  .Default(IC_CallOrUser);
+          }
+
+  // Anything else.
+  return IC_CallOrUser;
+}
+
+/// \brief Determine what kind of construct V is.
+InstructionClass
+llvm::objcarc::GetInstructionClass(const Value *V) {
+  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+    // Any instruction other than bitcast and gep with a pointer operand have a
+    // use of an objc pointer. Bitcasts, GEPs, Selects, PHIs transfer a pointer
+    // to a subsequent use, rather than using it themselves, in this sense.
+    // As a short cut, several other opcodes are known to have no pointer
+    // operands of interest. And ret is never followed by a release, so it's
+    // not interesting to examine.
+    switch (I->getOpcode()) {
+    case Instruction::Call: {
+      const CallInst *CI = cast<CallInst>(I);
+      // Check for calls to special functions.
+      if (const Function *F = CI->getCalledFunction()) {
+        InstructionClass Class = GetFunctionClass(F);
+        if (Class != IC_CallOrUser)
+          return Class;
+
+        // None of the intrinsic functions do objc_release. For intrinsics, the
+        // only question is whether or not they may be users.
+        switch (F->getIntrinsicID()) {
+        case Intrinsic::returnaddress: case Intrinsic::frameaddress:
+        case Intrinsic::stacksave: case Intrinsic::stackrestore:
+        case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend:
+        case Intrinsic::objectsize: case Intrinsic::prefetch:
+        case Intrinsic::stackprotector:
+        case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64:
+        case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa:
+        case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext:
+        case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline:
+        case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
+        case Intrinsic::invariant_start: case Intrinsic::invariant_end:
+        // Don't let dbg info affect our results.
+        case Intrinsic::dbg_declare: case Intrinsic::dbg_value:
+          // Short cut: Some intrinsics obviously don't use ObjC pointers.
+          return IC_None;
+        default:
+          break;
+        }
+      }
+      return GetCallSiteClass(CI);
+    }
+    case Instruction::Invoke:
+      return GetCallSiteClass(cast<InvokeInst>(I));
+    case Instruction::BitCast:
+    case Instruction::GetElementPtr:
+    case Instruction::Select: case Instruction::PHI:
+    case Instruction::Ret: case Instruction::Br:
+    case Instruction::Switch: case Instruction::IndirectBr:
+    case Instruction::Alloca: case Instruction::VAArg:
+    case Instruction::Add: case Instruction::FAdd:
+    case Instruction::Sub: case Instruction::FSub:
+    case Instruction::Mul: case Instruction::FMul:
+    case Instruction::SDiv: case Instruction::UDiv: case Instruction::FDiv:
+    case Instruction::SRem: case Instruction::URem: case Instruction::FRem:
+    case Instruction::Shl: case Instruction::LShr: case Instruction::AShr:
+    case Instruction::And: case Instruction::Or: case Instruction::Xor:
+    case Instruction::SExt: case Instruction::ZExt: case Instruction::Trunc:
+    case Instruction::IntToPtr: case Instruction::FCmp:
+    case Instruction::FPTrunc: case Instruction::FPExt:
+    case Instruction::FPToUI: case Instruction::FPToSI:
+    case Instruction::UIToFP: case Instruction::SIToFP:
+    case Instruction::InsertElement: case Instruction::ExtractElement:
+    case Instruction::ShuffleVector:
+    case Instruction::ExtractValue:
+      break;
+    case Instruction::ICmp:
+      // Comparing a pointer with null, or any other constant, isn't an
+      // interesting use, because we don't care what the pointer points to, or
+      // about the values of any other dynamic reference-counted pointers.
+      if (IsPotentialRetainableObjPtr(I->getOperand(1)))
+        return IC_User;
+      break;
+    default:
+      // For anything else, check all the operands.
+      // Note that this includes both operands of a Store: while the first
+      // operand isn't actually being dereferenced, it is being stored to
+      // memory where we can no longer track who might read it and dereference
+      // it, so we have to consider it potentially used.
+      for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
+           OI != OE; ++OI)
+        if (IsPotentialRetainableObjPtr(*OI))
+          return IC_User;
+    }
+  }
+
+  // Otherwise, it's totally inert for ARC purposes.
+  return IC_None;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
new file mode 100644
index 000000000000..ae3c6282cf83
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
@@ -0,0 +1,177 @@
+//===- ProvenanceAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
+                                       const Value *B) {
+  // If the values are Selects with the same condition, we can do a more precise
+  // check: just check for relations between the values on corresponding arms.
+  if (const SelectInst *SB = dyn_cast<SelectInst>(B))
+    if (A->getCondition() == SB->getCondition())
+      return related(A->getTrueValue(), SB->getTrueValue()) ||
+             related(A->getFalseValue(), SB->getFalseValue());
+
+  // Check both arms of the Select node individually.
+  return related(A->getTrueValue(), B) ||
+         related(A->getFalseValue(), B);
+}
+
+bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
+                                    const Value *B) {
+  // If the values are PHIs in the same block, we can do a more precise as well
+  // as efficient check: just check for relations between the values on
+  // corresponding edges.
+  if (const PHINode *PNB = dyn_cast<PHINode>(B))
+    if (PNB->getParent() == A->getParent()) {
+      for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
+        if (related(A->getIncomingValue(i),
+                    PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
+          return true;
+      return false;
+    }
+
+  // Check each unique source of the PHI node against B.
+  SmallPtrSet<const Value *, 4> UniqueSrc;
+  for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i) {
+    const Value *PV1 = A->getIncomingValue(i);
+    if (UniqueSrc.insert(PV1) && related(PV1, B))
+      return true;
+  }
+
+  // All of the arms checked out.
+  return false;
+}
+
+/// Test if the value of P, or any value covered by its provenance, is ever
+/// stored within the function (not counting callees).
+static bool IsStoredObjCPointer(const Value *P) {
+  SmallPtrSet<const Value *, 8> Visited;
+  SmallVector<const Value *, 8> Worklist;
+  Worklist.push_back(P);
+  Visited.insert(P);
+  do {
+    P = Worklist.pop_back_val();
+    for (Value::const_use_iterator UI = P->use_begin(), UE = P->use_end();
+         UI != UE; ++UI) {
+      const User *Ur = *UI;
+      if (isa<StoreInst>(Ur)) {
+        if (UI.getOperandNo() == 0)
+          // The pointer is stored.
+          return true;
+        // The pointed is stored through.
+        continue;
+      }
+      if (isa<CallInst>(Ur))
+        // The pointer is passed as an argument, ignore this.
+        continue;
+      if (isa<PtrToIntInst>(P))
+        // Assume the worst.
+        return true;
+      if (Visited.insert(Ur))
+        Worklist.push_back(Ur);
+    }
+  } while (!Worklist.empty());
+
+  // Everything checked out.
+  return false;
+}
+
+bool ProvenanceAnalysis::relatedCheck(const Value *A,
+                                      const Value *B) {
+  // Skip past provenance pass-throughs.
+  A = GetUnderlyingObjCPtr(A);
+  B = GetUnderlyingObjCPtr(B);
+
+  // Quick check.
+  if (A == B)
+    return true;
+
+  // Ask regular AliasAnalysis, for a first approximation.
+  switch (AA->alias(A, B)) {
+  case AliasAnalysis::NoAlias:
+    return false;
+  case AliasAnalysis::MustAlias:
+  case AliasAnalysis::PartialAlias:
+    return true;
+  case AliasAnalysis::MayAlias:
+    break;
+  }
+
+  bool AIsIdentified = IsObjCIdentifiedObject(A);
+  bool BIsIdentified = IsObjCIdentifiedObject(B);
+
+  // An ObjC-Identified object can't alias a load if it is never locally stored.
+  if (AIsIdentified) {
+    // Check for an obvious escape.
+    if (isa<LoadInst>(B))
+      return IsStoredObjCPointer(A);
+    if (BIsIdentified) {
+      // Check for an obvious escape.
+      if (isa<LoadInst>(A))
+        return IsStoredObjCPointer(B);
+      // Both pointers are identified and escapes aren't an evident problem.
+      return false;
+    }
+  } else if (BIsIdentified) {
+    // Check for an obvious escape.
+    if (isa<LoadInst>(A))
+      return IsStoredObjCPointer(B);
+  }
+
+   // Special handling for PHI and Select.
+  if (const PHINode *PN = dyn_cast<PHINode>(A))
+    return relatedPHI(PN, B);
+  if (const PHINode *PN = dyn_cast<PHINode>(B))
+    return relatedPHI(PN, A);
+  if (const SelectInst *S = dyn_cast<SelectInst>(A))
+    return relatedSelect(S, B);
+  if (const SelectInst *S = dyn_cast<SelectInst>(B))
+    return relatedSelect(S, A);
+
+  // Conservative.
+  return true;
+}
+
+bool ProvenanceAnalysis::related(const Value *A,
+                                 const Value *B) {
+  // Begin by inserting a conservative value into the map. If the insertion
+  // fails, we have the answer already. If it succeeds, leave it there until we
+  // compute the real answer to guard against recursive queries.
+  if (A > B) std::swap(A, B);
+  std::pair<CachedResultsTy::iterator, bool> Pair =
+    CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
+  if (!Pair.second)
+    return Pair.first->second;
+
+  bool Result = relatedCheck(A, B);
+  CachedResults[ValuePairTy(A, B)] = Result;
+  return Result;
+}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
new file mode 100644
index 000000000000..ec449fd8e747
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -0,0 +1,80 @@
+//===- ProvenanceAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+#define LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+  class Value;
+  class AliasAnalysis;
+  class PHINode;
+  class SelectInst;
+}
+
+namespace llvm {
+namespace objcarc {
+
+/// \brief This is similar to BasicAliasAnalysis, and it uses many of the same
+/// techniques, except it uses special ObjC-specific reasoning about pointer
+/// relationships.
+///
+/// In this context ``Provenance'' is defined as the history of an object's
+/// ownership. Thus ``Provenance Analysis'' is defined by using the notion of
+/// an ``independent provenance source'' of a pointer to determine whether or
+/// not two pointers have the same provenance source and thus could
+/// potentially be related.
+class ProvenanceAnalysis {
+  AliasAnalysis *AA;
+
+  typedef std::pair<const Value *, const Value *> ValuePairTy;
+  typedef DenseMap<ValuePairTy, bool> CachedResultsTy;
+  CachedResultsTy CachedResults;
+
+  bool relatedCheck(const Value *A, const Value *B);
+  bool relatedSelect(const SelectInst *A, const Value *B);
+  bool relatedPHI(const PHINode *A, const Value *B);
+
+  void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
+  ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
+
+public:
+  ProvenanceAnalysis() {}
+
+  void setAA(AliasAnalysis *aa) { AA = aa; }
+
+  AliasAnalysis *getAA() const { return AA; }
+
+  bool related(const Value *A, const Value *B);
+
+  void clear() {
+    CachedResults.clear();
+  }
+};
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
new file mode 100644
index 000000000000..a09730864051
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -0,0 +1,97 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass.  This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not
+// catch, particularly involving loop computations.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "adce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/InstIterator.h"
+using namespace llvm;
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace {
+  struct ADCE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    ADCE() : FunctionPass(ID) {
+      initializeADCEPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function& F);
+
+    virtual void getAnalysisUsage(AnalysisUsage& AU) const {
+      AU.setPreservesCFG();
+    }
+
+  };
+}
+
+char ADCE::ID = 0;
+INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false)
+
+bool ADCE::runOnFunction(Function& F) {
+  SmallPtrSet<Instruction*, 128> alive;
+  SmallVector<Instruction*, 128> worklist;
+
+  // Collect the set of "root" instructions that are known live.
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (isa<TerminatorInst>(I.getInstructionIterator()) ||
+        isa<DbgInfoIntrinsic>(I.getInstructionIterator()) ||
+        isa<LandingPadInst>(I.getInstructionIterator()) ||
+        I->mayHaveSideEffects()) {
+      alive.insert(I.getInstructionIterator());
+      worklist.push_back(I.getInstructionIterator());
+    }
+
+  // Propagate liveness backwards to operands.
+  while (!worklist.empty()) {
+    Instruction* curr = worklist.pop_back_val();
+    for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end();
+         OI != OE; ++OI)
+      if (Instruction* Inst = dyn_cast<Instruction>(OI))
+        if (alive.insert(Inst))
+          worklist.push_back(Inst);
+  }
+
+  // The inverse of the live set is the dead set.  These are those instructions
+  // which have no side effects and do not influence the control flow or return
+  // value of the function, and may therefore be deleted safely.
+  // NOTE: We reuse the worklist vector here for memory efficiency.
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (!alive.count(I.getInstructionIterator())) {
+      worklist.push_back(I.getInstructionIterator());
+      I->dropAllReferences();
+    }
+
+  for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(),
+       E = worklist.end(); I != E; ++I) {
+    ++NumRemoved;
+    (*I)->eraseFromParent();
+  }
+
+  return !worklist.empty();
+}
+
+FunctionPass *llvm::createAggressiveDCEPass() {
+  return new ADCE();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp
new file mode 100644
index 000000000000..e755008808f6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp
@@ -0,0 +1,152 @@
+//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a very simple profile guided basic block placement
+// algorithm.  The idea is to put frequently executed blocks together at the
+// start of the function, and hopefully increase the number of fall-through
+// conditional branches.  If there is no profile information for a particular
+// function, this pass basically orders blocks in depth-first order
+//
+// The algorithm implemented here is basically "Algo1" from "Profile Guided Code
+// Positioning" by Pettis and Hansen, except that it uses basic block counts
+// instead of edge counts.  This should be improved in many ways, but is very
+// simple for now.
+//
+// Basically we "place" the entry block, then loop over all successors in a DFO,
+// placing the most frequently executed successor until we run out of blocks.  I
+// told you this was _extremely_ simplistic. :) This is also much slower than it
+// could be.  When it becomes important, this pass will be rewritten to use a
+// better algorithm, and then we can worry about efficiency.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "block-placement"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumMoved, "Number of basic blocks moved");
+
+namespace {
+  struct BlockPlacement : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    BlockPlacement() : FunctionPass(ID) {
+      initializeBlockPlacementPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<ProfileInfo>();
+      //AU.addPreserved<ProfileInfo>();  // Does this work?
+    }
+  private:
+    /// PI - The profile information that is guiding us.
+    ///
+    ProfileInfo *PI;
+
+    /// NumMovedBlocks - Every time we move a block, increment this counter.
+    ///
+    unsigned NumMovedBlocks;
+
+    /// PlacedBlocks - Every time we place a block, remember it so we don't get
+    /// into infinite loops.
+    std::set<BasicBlock*> PlacedBlocks;
+
+    /// InsertPos - This an iterator to the next place we want to insert a
+    /// block.
+    Function::iterator InsertPos;
+
+    /// PlaceBlocks - Recursively place the specified blocks and any unplaced
+    /// successors.
+    void PlaceBlocks(BasicBlock *BB);
+  };
+}
+
+char BlockPlacement::ID = 0;
+INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement",
+                "Profile Guided Basic Block Placement", false, false)
+INITIALIZE_AG_DEPENDENCY(ProfileInfo)
+INITIALIZE_PASS_END(BlockPlacement, "block-placement",
+                "Profile Guided Basic Block Placement", false, false)
+
+FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); }
+
+bool BlockPlacement::runOnFunction(Function &F) {
+  PI = &getAnalysis<ProfileInfo>();
+
+  NumMovedBlocks = 0;
+  InsertPos = F.begin();
+
+  // Recursively place all blocks.
+  PlaceBlocks(F.begin());
+
+  PlacedBlocks.clear();
+  NumMoved += NumMovedBlocks;
+  return NumMovedBlocks != 0;
+}
+
+
+/// PlaceBlocks - Recursively place the specified blocks and any unplaced
+/// successors.
+void BlockPlacement::PlaceBlocks(BasicBlock *BB) {
+  assert(!PlacedBlocks.count(BB) && "Already placed this block!");
+  PlacedBlocks.insert(BB);
+
+  // Place the specified block.
+  if (&*InsertPos != BB) {
+    // Use splice to move the block into the right place.  This avoids having to
+    // remove the block from the function then readd it, which causes a bunch of
+    // symbol table traffic that is entirely pointless.
+    Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList();
+    Blocks.splice(InsertPos, Blocks, BB);
+
+    ++NumMovedBlocks;
+  } else {
+    // This block is already in the right place, we don't have to do anything.
+    ++InsertPos;
+  }
+
+  // Keep placing successors until we run out of ones to place.  Note that this
+  // loop is very inefficient (N^2) for blocks with many successors, like switch
+  // statements.  FIXME!
+  while (1) {
+    // Okay, now place any unplaced successors.
+    succ_iterator SI = succ_begin(BB), E = succ_end(BB);
+
+    // Scan for the first unplaced successor.
+    for (; SI != E && PlacedBlocks.count(*SI); ++SI)
+      /*empty*/;
+    if (SI == E) return;  // No more successors to place.
+
+    double MaxExecutionCount = PI->getExecutionCount(*SI);
+    BasicBlock *MaxSuccessor = *SI;
+
+    // Scan for more frequently executed successors
+    for (; SI != E; ++SI)
+      if (!PlacedBlocks.count(*SI)) {
+        double Count = PI->getExecutionCount(*SI);
+        if (Count > MaxExecutionCount ||
+            // Prefer to not disturb the code.
+            (Count == MaxExecutionCount && *SI == &*InsertPos)) {
+          MaxExecutionCount = Count;
+          MaxSuccessor = *SI;
+        }
+      }
+
+    // Now that we picked the maximally executed successor, place it.
+    PlaceBlocks(MaxSuccessor);
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
new file mode 100644
index 000000000000..015fd2e6e6fc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -0,0 +1,2012 @@
+//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass munges the code in the input function to better prepare it for
+// SelectionDAG-based code generation. This works around limitations in it's
+// basic-block-at-a-time approach. It should eventually be removed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "codegenprepare"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+STATISTIC(NumBlocksElim, "Number of blocks eliminated");
+STATISTIC(NumPHIsElim,   "Number of trivial PHIs eliminated");
+STATISTIC(NumGEPsElim,   "Number of GEPs converted to casts");
+STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
+                      "sunken Cmps");
+STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
+                       "of sunken Casts");
+STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
+                          "computations were sunk");
+STATISTIC(NumExtsMoved,  "Number of [s|z]ext instructions combined with loads");
+STATISTIC(NumExtUses,    "Number of uses of [s|z]ext instructions optimized");
+STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
+STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
+STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
+
+static cl::opt<bool> DisableBranchOpts(
+  "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
+  cl::desc("Disable branch optimizations in CodeGenPrepare"));
+
+static cl::opt<bool> DisableSelectToBranch(
+  "disable-cgp-select2branch", cl::Hidden, cl::init(false),
+  cl::desc("Disable select to branch conversion."));
+
+namespace {
+  class CodeGenPrepare : public FunctionPass {
+    /// TLI - Keep a pointer of a TargetLowering to consult for determining
+    /// transformation profitability.
+    const TargetLowering *TLI;
+    const TargetLibraryInfo *TLInfo;
+    DominatorTree *DT;
+    ProfileInfo *PFI;
+
+    /// CurInstIterator - As we scan instructions optimizing them, this is the
+    /// next instruction to optimize.  Xforms that can invalidate this should
+    /// update it.
+    BasicBlock::iterator CurInstIterator;
+
+    /// Keeps track of non-local addresses that have been sunk into a block.
+    /// This allows us to avoid inserting duplicate code for blocks with
+    /// multiple load/stores of the same address.
+    DenseMap<Value*, Value*> SunkAddrs;
+
+    /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to
+    /// be updated.
+    bool ModifiedDT;
+
+    /// OptSize - True if optimizing for size.
+    bool OptSize;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit CodeGenPrepare(const TargetLowering *tli = 0)
+      : FunctionPass(ID), TLI(tli) {
+        initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
+      }
+    bool runOnFunction(Function &F);
+
+    const char *getPassName() const { return "CodeGen Prepare"; }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<ProfileInfo>();
+      AU.addRequired<TargetLibraryInfo>();
+    }
+
+  private:
+    bool EliminateFallThrough(Function &F);
+    bool EliminateMostlyEmptyBlocks(Function &F);
+    bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
+    void EliminateMostlyEmptyBlock(BasicBlock *BB);
+    bool OptimizeBlock(BasicBlock &BB);
+    bool OptimizeInst(Instruction *I);
+    bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
+    bool OptimizeInlineAsmInst(CallInst *CS);
+    bool OptimizeCallInst(CallInst *CI);
+    bool MoveExtToFormExtLoad(Instruction *I);
+    bool OptimizeExtUses(Instruction *I);
+    bool OptimizeSelectInst(SelectInst *SI);
+    bool DupRetToEnableTailCallOpts(BasicBlock *BB);
+    bool PlaceDbgValues(Function &F);
+  };
+}
+
+char CodeGenPrepare::ID = 0;
+INITIALIZE_PASS_BEGIN(CodeGenPrepare, "codegenprepare",
+                "Optimize for code generation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare",
+                "Optimize for code generation", false, false)
+
+FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) {
+  return new CodeGenPrepare(TLI);
+}
+
+bool CodeGenPrepare::runOnFunction(Function &F) {
+  bool EverMadeChange = false;
+
+  ModifiedDT = false;
+  TLInfo = &getAnalysis<TargetLibraryInfo>();
+  DT = getAnalysisIfAvailable<DominatorTree>();
+  PFI = getAnalysisIfAvailable<ProfileInfo>();
+  OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                           Attribute::OptimizeForSize);
+
+  /// This optimization identifies DIV instructions that can be
+  /// profitably bypassed and carried out with a shorter, faster divide.
+  if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
+    const DenseMap<unsigned int, unsigned int> &BypassWidths =
+       TLI->getBypassSlowDivWidths();
+    for (Function::iterator I = F.begin(); I != F.end(); I++)
+      EverMadeChange |= bypassSlowDivision(F, I, BypassWidths);
+  }
+
+  // Eliminate blocks that contain only PHI nodes and an
+  // unconditional branch.
+  EverMadeChange |= EliminateMostlyEmptyBlocks(F);
+
+  // llvm.dbg.value is far away from the value then iSel may not be able
+  // handle it properly. iSel will drop llvm.dbg.value if it can not
+  // find a node corresponding to the value.
+  EverMadeChange |= PlaceDbgValues(F);
+
+  bool MadeChange = true;
+  while (MadeChange) {
+    MadeChange = false;
+    for (Function::iterator I = F.begin(); I != F.end(); ) {
+      BasicBlock *BB = I++;
+      MadeChange |= OptimizeBlock(*BB);
+    }
+    EverMadeChange |= MadeChange;
+  }
+
+  SunkAddrs.clear();
+
+  if (!DisableBranchOpts) {
+    MadeChange = false;
+    SmallPtrSet<BasicBlock*, 8> WorkList;
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
+      MadeChange |= ConstantFoldTerminator(BB, true);
+      if (!MadeChange) continue;
+
+      for (SmallVectorImpl<BasicBlock*>::iterator
+             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
+        if (pred_begin(*II) == pred_end(*II))
+          WorkList.insert(*II);
+    }
+
+    // Delete the dead blocks and any of their dead successors.
+    MadeChange |= !WorkList.empty();
+    while (!WorkList.empty()) {
+      BasicBlock *BB = *WorkList.begin();
+      WorkList.erase(BB);
+      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
+
+      DeleteDeadBlock(BB);
+      
+      for (SmallVectorImpl<BasicBlock*>::iterator
+             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
+        if (pred_begin(*II) == pred_end(*II))
+          WorkList.insert(*II);
+    }
+
+    // Merge pairs of basic blocks with unconditional branches, connected by
+    // a single edge.
+    if (EverMadeChange || MadeChange)
+      MadeChange |= EliminateFallThrough(F);
+
+    if (MadeChange)
+      ModifiedDT = true;
+    EverMadeChange |= MadeChange;
+  }
+
+  if (ModifiedDT && DT)
+    DT->DT->recalculate(F);
+
+  return EverMadeChange;
+}
+
+/// EliminateFallThrough - Merge basic blocks which are connected
+/// by a single edge, where one of the basic blocks has a single successor
+/// pointing to the other basic block, which has a single predecessor.
+bool CodeGenPrepare::EliminateFallThrough(Function &F) {
+  bool Changed = false;
+  // Scan all of the blocks in the function, except for the entry block.
+  for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *BB = I++;
+    // If the destination block has a single pred, then this is a trivial
+    // edge, just collapse it.
+    BasicBlock *SinglePred = BB->getSinglePredecessor();
+
+    // Don't merge if BB's address is taken.
+    if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;
+
+    BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
+    if (Term && !Term->isConditional()) {
+      Changed = true;
+      DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n");
+      // Remember if SinglePred was the entry block of the function.
+      // If so, we will need to move BB back to the entry position.
+      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+      MergeBasicBlockIntoOnlyPred(BB, this);
+
+      if (isEntry && BB != &BB->getParent()->getEntryBlock())
+        BB->moveBefore(&BB->getParent()->getEntryBlock());
+
+      // We have erased a block. Update the iterator.
+      I = BB;
+    }
+  }
+  return Changed;
+}
+
+/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes,
+/// debug info directives, and an unconditional branch.  Passes before isel
+/// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for
+/// isel.  Start by eliminating these blocks so we can split them the way we
+/// want them.
+bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) {
+  bool MadeChange = false;
+  // Note that this intentionally skips the entry block.
+  for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *BB = I++;
+
+    // If this block doesn't end with an uncond branch, ignore it.
+    BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+    if (!BI || !BI->isUnconditional())
+      continue;
+
+    // If the instruction before the branch (skipping debug info) isn't a phi
+    // node, then other stuff is happening here.
+    BasicBlock::iterator BBI = BI;
+    if (BBI != BB->begin()) {
+      --BBI;
+      while (isa<DbgInfoIntrinsic>(BBI)) {
+        if (BBI == BB->begin())
+          break;
+        --BBI;
+      }
+      if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
+        continue;
+    }
+
+    // Do not break infinite loops.
+    BasicBlock *DestBB = BI->getSuccessor(0);
+    if (DestBB == BB)
+      continue;
+
+    if (!CanMergeBlocks(BB, DestBB))
+      continue;
+
+    EliminateMostlyEmptyBlock(BB);
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a
+/// single uncond branch between them, and BB contains no other non-phi
+/// instructions.
+bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB,
+                                    const BasicBlock *DestBB) const {
+  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
+  // the successor.  If there are more complex condition (e.g. preheaders),
+  // don't mess around with them.
+  BasicBlock::const_iterator BBI = BB->begin();
+  while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+    for (Value::const_use_iterator UI = PN->use_begin(), E = PN->use_end();
+         UI != E; ++UI) {
+      const Instruction *User = cast<Instruction>(*UI);
+      if (User->getParent() != DestBB || !isa<PHINode>(User))
+        return false;
+      // If User is inside DestBB block and it is a PHINode then check
+      // incoming value. If incoming value is not from BB then this is
+      // a complex condition (e.g. preheaders) we want to avoid here.
+      if (User->getParent() == DestBB) {
+        if (const PHINode *UPN = dyn_cast<PHINode>(User))
+          for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
+            Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
+            if (Insn && Insn->getParent() == BB &&
+                Insn->getParent() != UPN->getIncomingBlock(I))
+              return false;
+          }
+      }
+    }
+  }
+
+  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
+  // and DestBB may have conflicting incoming values for the block.  If so, we
+  // can't merge the block.
+  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
+  if (!DestBBPN) return true;  // no conflict.
+
+  // Collect the preds of BB.
+  SmallPtrSet<const BasicBlock*, 16> BBPreds;
+  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+    // It is faster to get preds from a PHI than with pred_iterator.
+    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+      BBPreds.insert(BBPN->getIncomingBlock(i));
+  } else {
+    BBPreds.insert(pred_begin(BB), pred_end(BB));
+  }
+
+  // Walk the preds of DestBB.
+  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
+    if (BBPreds.count(Pred)) {   // Common predecessor?
+      BBI = DestBB->begin();
+      while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+        const Value *V1 = PN->getIncomingValueForBlock(Pred);
+        const Value *V2 = PN->getIncomingValueForBlock(BB);
+
+        // If V2 is a phi node in BB, look up what the mapped value will be.
+        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
+          if (V2PN->getParent() == BB)
+            V2 = V2PN->getIncomingValueForBlock(Pred);
+
+        // If there is a conflict, bail out.
+        if (V1 != V2) return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+
+/// EliminateMostlyEmptyBlock - Eliminate a basic block that have only phi's and
+/// an unconditional branch in it.
+void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
+  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+  BasicBlock *DestBB = BI->getSuccessor(0);
+
+  DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);
+
+  // If the destination block has a single pred, then this is a trivial edge,
+  // just collapse it.
+  if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
+    if (SinglePred != DestBB) {
+      // Remember if SinglePred was the entry block of the function.  If so, we
+      // will need to move BB back to the entry position.
+      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+      MergeBasicBlockIntoOnlyPred(DestBB, this);
+
+      if (isEntry && BB != &BB->getParent()->getEntryBlock())
+        BB->moveBefore(&BB->getParent()->getEntryBlock());
+
+      DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
+      return;
+    }
+  }
+
+  // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
+  // to handle the new incoming edges it is about to have.
+  PHINode *PN;
+  for (BasicBlock::iterator BBI = DestBB->begin();
+       (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+    // Remove the incoming value for BB, and remember it.
+    Value *InVal = PN->removeIncomingValue(BB, false);
+
+    // Two options: either the InVal is a phi node defined in BB or it is some
+    // value that dominates BB.
+    PHINode *InValPhi = dyn_cast<PHINode>(InVal);
+    if (InValPhi && InValPhi->getParent() == BB) {
+      // Add all of the input values of the input PHI as inputs of this phi.
+      for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
+        PN->addIncoming(InValPhi->getIncomingValue(i),
+                        InValPhi->getIncomingBlock(i));
+    } else {
+      // Otherwise, add one instance of the dominating value for each edge that
+      // we will be adding.
+      if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+        for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+          PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
+      } else {
+        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+          PN->addIncoming(InVal, *PI);
+      }
+    }
+  }
+
+  // The PHIs are now updated, change everything that refers to BB to use
+  // DestBB and remove BB.
+  BB->replaceAllUsesWith(DestBB);
+  if (DT && !ModifiedDT) {
+    BasicBlock *BBIDom  = DT->getNode(BB)->getIDom()->getBlock();
+    BasicBlock *DestBBIDom = DT->getNode(DestBB)->getIDom()->getBlock();
+    BasicBlock *NewIDom = DT->findNearestCommonDominator(BBIDom, DestBBIDom);
+    DT->changeImmediateDominator(DestBB, NewIDom);
+    DT->eraseNode(BB);
+  }
+  if (PFI) {
+    PFI->replaceAllUses(BB, DestBB);
+    PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB));
+  }
+  BB->eraseFromParent();
+  ++NumBlocksElim;
+
+  DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
+}
+
+/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
+/// copy (e.g. it's casting from one pointer type to another, i32->i8 on PPC),
+/// sink it into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced.
+///
+/// Return true if any changes are made.
+///
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
+  // If this is a noop copy,
+  EVT SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(CI->getType());
+
+  // This is an fp<->int conversion?
+  if (SrcVT.isInteger() != DstVT.isInteger())
+    return false;
+
+  // If this is an extension, it will be a zero or sign extension, which
+  // isn't a noop.
+  if (SrcVT.bitsLT(DstVT)) return false;
+
+  // If these values will be promoted, find out what they will be promoted
+  // to.  This helps us consider truncates on PPC as noop copies when they
+  // are.
+  if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
+      TargetLowering::TypePromoteInteger)
+    SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
+  if (TLI.getTypeAction(CI->getContext(), DstVT) ==
+      TargetLowering::TypePromoteInteger)
+    DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
+
+  // If, after promotion, these are the same types, this is a noop copy.
+  if (SrcVT != DstVT)
+    return false;
+
+  BasicBlock *DefBB = CI->getParent();
+
+  /// InsertedCasts - Only insert a cast in each block once.
+  DenseMap<BasicBlock*, CastInst*> InsertedCasts;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this cast is used in.  For PHI's this is the
+    // appropriate predecessor block.
+    BasicBlock *UserBB = User->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(User)) {
+      UserBB = PN->getIncomingBlock(UI);
+    }
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    // If this user is in the same block as the cast, don't change the cast.
+    if (UserBB == DefBB) continue;
+
+    // If we have already inserted a cast into this block, use it.
+    CastInst *&InsertedCast = InsertedCasts[UserBB];
+
+    if (!InsertedCast) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+      InsertedCast =
+        CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "",
+                         InsertPt);
+      MadeChange = true;
+    }
+
+    // Replace a use of the cast with a use of the new cast.
+    TheUse = InsertedCast;
+    ++NumCastUses;
+  }
+
+  // If we removed all uses, nuke the cast.
+  if (CI->use_empty()) {
+    CI->eraseFromParent();
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce
+/// the number of virtual registers that must be created and coalesced.  This is
+/// a clear win except on targets with multiple condition code registers
+///  (PowerPC), where it might lose; some adjustment may be wanted there.
+///
+/// Return true if any changes are made.
+static bool OptimizeCmpExpression(CmpInst *CI) {
+  BasicBlock *DefBB = CI->getParent();
+
+  /// InsertedCmp - Only insert a cmp in each block once.
+  DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    // Don't bother for PHI nodes.
+    if (isa<PHINode>(User))
+      continue;
+
+    // Figure out which BB this cmp is used in.
+    BasicBlock *UserBB = User->getParent();
+
+    // If this user is in the same block as the cmp, don't change the cmp.
+    if (UserBB == DefBB) continue;
+
+    // If we have already inserted a cmp into this block, use it.
+    CmpInst *&InsertedCmp = InsertedCmps[UserBB];
+
+    if (!InsertedCmp) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+      InsertedCmp =
+        CmpInst::Create(CI->getOpcode(),
+                        CI->getPredicate(),  CI->getOperand(0),
+                        CI->getOperand(1), "", InsertPt);
+      MadeChange = true;
+    }
+
+    // Replace a use of the cmp with a use of the new cmp.
+    TheUse = InsertedCmp;
+    ++NumCmpUses;
+  }
+
+  // If we removed all uses, nuke the cmp.
+  if (CI->use_empty())
+    CI->eraseFromParent();
+
+  return MadeChange;
+}
+
+namespace {
+class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
+protected:
+  void replaceCall(Value *With) {
+    CI->replaceAllUsesWith(With);
+    CI->eraseFromParent();
+  }
+  bool isFoldable(unsigned SizeCIOp, unsigned, bool) const {
+      if (ConstantInt *SizeCI =
+                             dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp)))
+        return SizeCI->isAllOnesValue();
+    return false;
+  }
+};
+} // end anonymous namespace
+
+bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
+  BasicBlock *BB = CI->getParent();
+
+  // Lower inline assembly if we can.
+  // If we found an inline asm expession, and if the target knows how to
+  // lower it to normal LLVM code, do so now.
+  if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
+    if (TLI->ExpandInlineAsm(CI)) {
+      // Avoid invalidating the iterator.
+      CurInstIterator = BB->begin();
+      // Avoid processing instructions out of order, which could cause
+      // reuse before a value is defined.
+      SunkAddrs.clear();
+      return true;
+    }
+    // Sink address computing for memory operands into the block.
+    if (OptimizeInlineAsmInst(CI))
+      return true;
+  }
+
+  // Lower all uses of llvm.objectsize.*
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+  if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
+    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
+    Type *ReturnTy = CI->getType();
+    Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
+
+    // Substituting this can cause recursive simplifications, which can
+    // invalidate our iterator.  Use a WeakVH to hold onto it in case this
+    // happens.
+    WeakVH IterHandle(CurInstIterator);
+
+    replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getDataLayout() : 0,
+                                  TLInfo, ModifiedDT ? 0 : DT);
+
+    // If the iterator instruction was recursively deleted, start over at the
+    // start of the block.
+    if (IterHandle != CurInstIterator) {
+      CurInstIterator = BB->begin();
+      SunkAddrs.clear();
+    }
+    return true;
+  }
+
+  if (II && TLI) {
+    SmallVector<Value*, 2> PtrOps;
+    Type *AccessTy;
+    if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
+      while (!PtrOps.empty())
+        if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
+          return true;
+  }
+
+  // From here on out we're working with named functions.
+  if (CI->getCalledFunction() == 0) return false;
+
+  // We'll need DataLayout from here on out.
+  const DataLayout *TD = TLI ? TLI->getDataLayout() : 0;
+  if (!TD) return false;
+
+  // Lower all default uses of _chk calls.  This is very similar
+  // to what InstCombineCalls does, but here we are only lowering calls
+  // that have the default "don't know" as the objectsize.  Anything else
+  // should be left alone.
+  CodeGenPrepareFortifiedLibCalls Simplifier;
+  return Simplifier.fold(CI, TD, TLInfo);
+}
+
+/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return
+/// instructions to the predecessor to enable tail call optimizations. The
+/// case it is currently looking for is:
+/// @code
+/// bb0:
+///   %tmp0 = tail call i32 @f0()
+///   br label %return
+/// bb1:
+///   %tmp1 = tail call i32 @f1()
+///   br label %return
+/// bb2:
+///   %tmp2 = tail call i32 @f2()
+///   br label %return
+/// return:
+///   %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
+///   ret i32 %retval
+/// @endcode
+///
+/// =>
+///
+/// @code
+/// bb0:
+///   %tmp0 = tail call i32 @f0()
+///   ret i32 %tmp0
+/// bb1:
+///   %tmp1 = tail call i32 @f1()
+///   ret i32 %tmp1
+/// bb2:
+///   %tmp2 = tail call i32 @f2()
+///   ret i32 %tmp2
+/// @endcode
+bool CodeGenPrepare::DupRetToEnableTailCallOpts(BasicBlock *BB) {
+  if (!TLI)
+    return false;
+
+  ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
+  if (!RI)
+    return false;
+
+  PHINode *PN = 0;
+  BitCastInst *BCI = 0;
+  Value *V = RI->getReturnValue();
+  if (V) {
+    BCI = dyn_cast<BitCastInst>(V);
+    if (BCI)
+      V = BCI->getOperand(0);
+
+    PN = dyn_cast<PHINode>(V);
+    if (!PN)
+      return false;
+  }
+
+  if (PN && PN->getParent() != BB)
+    return false;
+
+  // It's not safe to eliminate the sign / zero extension of the return value.
+  // See llvm::isInTailCallPosition().
+  const Function *F = BB->getParent();
+  AttributeSet CallerAttrs = F->getAttributes();
+  if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
+      CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+    return false;
+
+  // Make sure there are no instructions between the PHI and return, or that the
+  // return is the first instruction in the block.
+  if (PN) {
+    BasicBlock::iterator BI = BB->begin();
+    do { ++BI; } while (isa<DbgInfoIntrinsic>(BI));
+    if (&*BI == BCI)
+      // Also skip over the bitcast.
+      ++BI;
+    if (&*BI != RI)
+      return false;
+  } else {
+    BasicBlock::iterator BI = BB->begin();
+    while (isa<DbgInfoIntrinsic>(BI)) ++BI;
+    if (&*BI != RI)
+      return false;
+  }
+
+  /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
+  /// call.
+  SmallVector<CallInst*, 4> TailCalls;
+  if (PN) {
+    for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
+      CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I));
+      // Make sure the phi value is indeed produced by the tail call.
+      if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) &&
+          TLI->mayBeEmittedAsTailCall(CI))
+        TailCalls.push_back(CI);
+    }
+  } else {
+    SmallPtrSet<BasicBlock*, 4> VisitedBBs;
+    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+      if (!VisitedBBs.insert(*PI))
+        continue;
+
+      BasicBlock::InstListType &InstList = (*PI)->getInstList();
+      BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin();
+      BasicBlock::InstListType::reverse_iterator RE = InstList.rend();
+      do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI));
+      if (RI == RE)
+        continue;
+
+      CallInst *CI = dyn_cast<CallInst>(&*RI);
+      if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI))
+        TailCalls.push_back(CI);
+    }
+  }
+
+  bool Changed = false;
+  for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) {
+    CallInst *CI = TailCalls[i];
+    CallSite CS(CI);
+
+    // Conservatively require the attributes of the call to match those of the
+    // return. Ignore noalias because it doesn't affect the call sequence.
+    AttributeSet CalleeAttrs = CS.getAttributes();
+    if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
+          removeAttribute(Attribute::NoAlias) !=
+        AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
+          removeAttribute(Attribute::NoAlias))
+      continue;
+
+    // Make sure the call instruction is followed by an unconditional branch to
+    // the return block.
+    BasicBlock *CallBB = CI->getParent();
+    BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
+    if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
+      continue;
+
+    // Duplicate the return into CallBB.
+    (void)FoldReturnIntoUncondBranch(RI, BB, CallBB);
+    ModifiedDT = Changed = true;
+    ++NumRetsDup;
+  }
+
+  // If we eliminated all predecessors of the block, delete the block now.
+  if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+    BB->eraseFromParent();
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Memory Optimization
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode
+/// which holds actual Value*'s for register values.
+struct ExtAddrMode : public TargetLowering::AddrMode {
+  Value *BaseReg;
+  Value *ScaledReg;
+  ExtAddrMode() : BaseReg(0), ScaledReg(0) {}
+  void print(raw_ostream &OS) const;
+  void dump() const;
+  
+  bool operator==(const ExtAddrMode& O) const {
+    return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) &&
+           (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) &&
+           (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale);
+  }
+};
+
+static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
+  AM.print(OS);
+  return OS;
+}
+
+void ExtAddrMode::print(raw_ostream &OS) const {
+  bool NeedPlus = false;
+  OS << "[";
+  if (BaseGV) {
+    OS << (NeedPlus ? " + " : "")
+       << "GV:";
+    WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
+    NeedPlus = true;
+  }
+
+  if (BaseOffs)
+    OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true;
+
+  if (BaseReg) {
+    OS << (NeedPlus ? " + " : "")
+       << "Base:";
+    WriteAsOperand(OS, BaseReg, /*PrintType=*/false);
+    NeedPlus = true;
+  }
+  if (Scale) {
+    OS << (NeedPlus ? " + " : "")
+       << Scale << "*";
+    WriteAsOperand(OS, ScaledReg, /*PrintType=*/false);
+    NeedPlus = true;
+  }
+
+  OS << ']';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ExtAddrMode::dump() const {
+  print(dbgs());
+  dbgs() << '\n';
+}
+#endif
+
+
+/// \brief A helper class for matching addressing modes.
+///
+/// This encapsulates the logic for matching the target-legal addressing modes.
+class AddressingModeMatcher {
+  SmallVectorImpl<Instruction*> &AddrModeInsts;
+  const TargetLowering &TLI;
+
+  /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
+  /// the memory instruction that we're computing this address for.
+  Type *AccessTy;
+  Instruction *MemoryInst;
+  
+  /// AddrMode - This is the addressing mode that we're building up.  This is
+  /// part of the return value of this addressing mode matching stuff.
+  ExtAddrMode &AddrMode;
+  
+  /// IgnoreProfitability - This is set to true when we should not do
+  /// profitability checks.  When true, IsProfitableToFoldIntoAddressingMode
+  /// always returns true.
+  bool IgnoreProfitability;
+  
+  AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI,
+                        const TargetLowering &T, Type *AT,
+                        Instruction *MI, ExtAddrMode &AM)
+    : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM) {
+    IgnoreProfitability = false;
+  }
+public:
+  
+  /// Match - Find the maximal addressing mode that a load/store of V can fold,
+  /// give an access type of AccessTy.  This returns a list of involved
+  /// instructions in AddrModeInsts.
+  static ExtAddrMode Match(Value *V, Type *AccessTy,
+                           Instruction *MemoryInst,
+                           SmallVectorImpl<Instruction*> &AddrModeInsts,
+                           const TargetLowering &TLI) {
+    ExtAddrMode Result;
+
+    bool Success = 
+      AddressingModeMatcher(AddrModeInsts, TLI, AccessTy,
+                            MemoryInst, Result).MatchAddr(V, 0);
+    (void)Success; assert(Success && "Couldn't select *anything*?");
+    return Result;
+  }
+private:
+  bool MatchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
+  bool MatchAddr(Value *V, unsigned Depth);
+  bool MatchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth);
+  bool IsProfitableToFoldIntoAddressingMode(Instruction *I,
+                                            ExtAddrMode &AMBefore,
+                                            ExtAddrMode &AMAfter);
+  bool ValueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
+};
+
+/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
+/// Return true and update AddrMode if this addr mode is legal for the target,
+/// false if not.
+bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
+                                             unsigned Depth) {
+  // If Scale is 1, then this is the same as adding ScaleReg to the addressing
+  // mode.  Just process that directly.
+  if (Scale == 1)
+    return MatchAddr(ScaleReg, Depth);
+  
+  // If the scale is 0, it takes nothing to add this.
+  if (Scale == 0)
+    return true;
+  
+  // If we already have a scale of this value, we can add to it, otherwise, we
+  // need an available scale field.
+  if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
+    return false;
+
+  ExtAddrMode TestAddrMode = AddrMode;
+
+  // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
+  // [A+B + A*7] -> [B+A*8].
+  TestAddrMode.Scale += Scale;
+  TestAddrMode.ScaledReg = ScaleReg;
+
+  // If the new address isn't legal, bail out.
+  if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy))
+    return false;
+
+  // It was legal, so commit it.
+  AddrMode = TestAddrMode;
+  
+  // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
+  // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
+  // X*Scale + C*Scale to addr mode.
+  ConstantInt *CI = 0; Value *AddLHS = 0;
+  if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
+      match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
+    TestAddrMode.ScaledReg = AddLHS;
+    TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
+      
+    // If this addressing mode is legal, commit it and remember that we folded
+    // this instruction.
+    if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) {
+      AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
+      AddrMode = TestAddrMode;
+      return true;
+    }
+  }
+
+  // Otherwise, not (x+c)*scale, just return what we have.
+  return true;
+}
+
+/// MightBeFoldableInst - This is a little filter, which returns true if an
+/// addressing computation involving I might be folded into a load/store
+/// accessing it.  This doesn't need to be perfect, but needs to accept at least
+/// the set of instructions that MatchOperationAddr can.
+static bool MightBeFoldableInst(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::BitCast:
+    // Don't touch identity bitcasts.
+    if (I->getType() == I->getOperand(0)->getType())
+      return false;
+    return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
+  case Instruction::PtrToInt:
+    // PtrToInt is always a noop, as we know that the int type is pointer sized.
+    return true;
+  case Instruction::IntToPtr:
+    // We know the input is intptr_t, so this is foldable.
+    return true;
+  case Instruction::Add:
+    return true;
+  case Instruction::Mul:
+  case Instruction::Shl:
+    // Can only handle X*C and X << C.
+    return isa<ConstantInt>(I->getOperand(1));
+  case Instruction::GetElementPtr:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// MatchOperationAddr - Given an instruction or constant expr, see if we can
+/// fold the operation into the addressing mode.  If so, update the addressing
+/// mode and return true, otherwise return false without modifying AddrMode.
+bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
+                                               unsigned Depth) {
+  // Avoid exponential behavior on extremely deep expression trees.
+  if (Depth >= 5) return false;
+  
+  switch (Opcode) {
+  case Instruction::PtrToInt:
+    // PtrToInt is always a noop, as we know that the int type is pointer sized.
+    return MatchAddr(AddrInst->getOperand(0), Depth);
+  case Instruction::IntToPtr:
+    // This inttoptr is a no-op if the integer type is pointer sized.
+    if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
+        TLI.getPointerTy())
+      return MatchAddr(AddrInst->getOperand(0), Depth);
+    return false;
+  case Instruction::BitCast:
+    // BitCast is always a noop, and we can handle it as long as it is
+    // int->int or pointer->pointer (we don't want int<->fp or something).
+    if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
+         AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
+        // Don't touch identity bitcasts.  These were probably put here by LSR,
+        // and we don't want to mess around with them.  Assume it knows what it
+        // is doing.
+        AddrInst->getOperand(0)->getType() != AddrInst->getType())
+      return MatchAddr(AddrInst->getOperand(0), Depth);
+    return false;
+  case Instruction::Add: {
+    // Check to see if we can merge in the RHS then the LHS.  If so, we win.
+    ExtAddrMode BackupAddrMode = AddrMode;
+    unsigned OldSize = AddrModeInsts.size();
+    if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
+        MatchAddr(AddrInst->getOperand(0), Depth+1))
+      return true;
+    
+    // Restore the old addr mode info.
+    AddrMode = BackupAddrMode;
+    AddrModeInsts.resize(OldSize);
+    
+    // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
+    if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
+        MatchAddr(AddrInst->getOperand(1), Depth+1))
+      return true;
+    
+    // Otherwise we definitely can't merge the ADD in.
+    AddrMode = BackupAddrMode;
+    AddrModeInsts.resize(OldSize);
+    break;
+  }
+  //case Instruction::Or:
+  // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
+  //break;
+  case Instruction::Mul:
+  case Instruction::Shl: {
+    // Can only handle X*C and X << C.
+    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+    if (!RHS) return false;
+    int64_t Scale = RHS->getSExtValue();
+    if (Opcode == Instruction::Shl)
+      Scale = 1LL << Scale;
+    
+    return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
+  }
+  case Instruction::GetElementPtr: {
+    // Scan the GEP.  We check it if it contains constant offsets and at most
+    // one variable offset.
+    int VariableOperand = -1;
+    unsigned VariableScale = 0;
+    
+    int64_t ConstantOffset = 0;
+    const DataLayout *TD = TLI.getDataLayout();
+    gep_type_iterator GTI = gep_type_begin(AddrInst);
+    for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
+      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+        const StructLayout *SL = TD->getStructLayout(STy);
+        unsigned Idx =
+          cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
+        ConstantOffset += SL->getElementOffset(Idx);
+      } else {
+        uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType());
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+          ConstantOffset += CI->getSExtValue()*TypeSize;
+        } else if (TypeSize) {  // Scales of zero don't do anything.
+          // We only allow one variable index at the moment.
+          if (VariableOperand != -1)
+            return false;
+          
+          // Remember the variable index.
+          VariableOperand = i;
+          VariableScale = TypeSize;
+        }
+      }
+    }
+    
+    // A common case is for the GEP to only do a constant offset.  In this case,
+    // just add it to the disp field and check validity.
+    if (VariableOperand == -1) {
+      AddrMode.BaseOffs += ConstantOffset;
+      if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
+        // Check to see if we can fold the base pointer in too.
+        if (MatchAddr(AddrInst->getOperand(0), Depth+1))
+          return true;
+      }
+      AddrMode.BaseOffs -= ConstantOffset;
+      return false;
+    }
+
+    // Save the valid addressing mode in case we can't match.
+    ExtAddrMode BackupAddrMode = AddrMode;
+    unsigned OldSize = AddrModeInsts.size();
+
+    // See if the scale and offset amount is valid for this target.
+    AddrMode.BaseOffs += ConstantOffset;
+
+    // Match the base operand of the GEP.
+    if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
+      // If it couldn't be matched, just stuff the value in a register.
+      if (AddrMode.HasBaseReg) {
+        AddrMode = BackupAddrMode;
+        AddrModeInsts.resize(OldSize);
+        return false;
+      }
+      AddrMode.HasBaseReg = true;
+      AddrMode.BaseReg = AddrInst->getOperand(0);
+    }
+
+    // Match the remaining variable portion of the GEP.
+    if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
+                          Depth)) {
+      // If it couldn't be matched, try stuffing the base into a register
+      // instead of matching it, and retrying the match of the scale.
+      AddrMode = BackupAddrMode;
+      AddrModeInsts.resize(OldSize);
+      if (AddrMode.HasBaseReg)
+        return false;
+      AddrMode.HasBaseReg = true;
+      AddrMode.BaseReg = AddrInst->getOperand(0);
+      AddrMode.BaseOffs += ConstantOffset;
+      if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
+                            VariableScale, Depth)) {
+        // If even that didn't work, bail.
+        AddrMode = BackupAddrMode;
+        AddrModeInsts.resize(OldSize);
+        return false;
+      }
+    }
+
+    return true;
+  }
+  }
+  return false;
+}
+
+/// MatchAddr - If we can, try to add the value of 'Addr' into the current
+/// addressing mode.  If Addr can't be added to AddrMode this returns false and
+/// leaves AddrMode unmodified.  This assumes that Addr is either a pointer type
+/// or intptr_t for the target.
+///
+bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
+    // Fold in immediates if legal for the target.
+    AddrMode.BaseOffs += CI->getSExtValue();
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.BaseOffs -= CI->getSExtValue();
+  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
+    // If this is a global variable, try to fold it into the addressing mode.
+    if (AddrMode.BaseGV == 0) {
+      AddrMode.BaseGV = GV;
+      if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+        return true;
+      AddrMode.BaseGV = 0;
+    }
+  } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
+    ExtAddrMode BackupAddrMode = AddrMode;
+    unsigned OldSize = AddrModeInsts.size();
+
+    // Check to see if it is possible to fold this operation.
+    if (MatchOperationAddr(I, I->getOpcode(), Depth)) {
+      // Okay, it's possible to fold this.  Check to see if it is actually
+      // *profitable* to do so.  We use a simple cost model to avoid increasing
+      // register pressure too much.
+      if (I->hasOneUse() ||
+          IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
+        AddrModeInsts.push_back(I);
+        return true;
+      }
+      
+      // It isn't profitable to do this, roll back.
+      //cerr << "NOT FOLDING: " << *I;
+      AddrMode = BackupAddrMode;
+      AddrModeInsts.resize(OldSize);
+    }
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+    if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
+      return true;
+  } else if (isa<ConstantPointerNull>(Addr)) {
+    // Null pointer gets folded without affecting the addressing mode.
+    return true;
+  }
+
+  // Worse case, the target should support [reg] addressing modes. :)
+  if (!AddrMode.HasBaseReg) {
+    AddrMode.HasBaseReg = true;
+    AddrMode.BaseReg = Addr;
+    // Still check for legality in case the target supports [imm] but not [i+r].
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.HasBaseReg = false;
+    AddrMode.BaseReg = 0;
+  }
+
+  // If the base register is already taken, see if we can do [r+r].
+  if (AddrMode.Scale == 0) {
+    AddrMode.Scale = 1;
+    AddrMode.ScaledReg = Addr;
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.Scale = 0;
+    AddrMode.ScaledReg = 0;
+  }
+  // Couldn't match.
+  return false;
+}
+
+/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
+/// inline asm call are due to memory operands.  If so, return true, otherwise
+/// return false.
+static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
+                                    const TargetLowering &TLI) {
+  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI));
+  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+    
+    // Compute the constraint code and ConstraintType to use.
+    TLI.ComputeConstraintToUse(OpInfo, SDValue());
+
+    // If this asm operand is our Value*, and if it isn't an indirect memory
+    // operand, we can't fold it!
+    if (OpInfo.CallOperandVal == OpVal &&
+        (OpInfo.ConstraintType != TargetLowering::C_Memory ||
+         !OpInfo.isIndirect))
+      return false;
+  }
+
+  return true;
+}
+
+/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
+/// memory use.  If we find an obviously non-foldable instruction, return true.
+/// Add the ultimately found memory instructions to MemoryUses.
+static bool FindAllMemoryUses(Instruction *I,
+                SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
+                              SmallPtrSet<Instruction*, 16> &ConsideredInsts,
+                              const TargetLowering &TLI) {
+  // If we already considered this instruction, we're done.
+  if (!ConsideredInsts.insert(I))
+    return false;
+  
+  // If this is an obviously unfoldable instruction, bail out.
+  if (!MightBeFoldableInst(I))
+    return true;
+
+  // Loop over all the uses, recursively processing them.
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI) {
+    User *U = *UI;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
+      continue;
+    }
+    
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      unsigned opNo = UI.getOperandNo();
+      if (opNo == 0) return true; // Storing addr, not into addr.
+      MemoryUses.push_back(std::make_pair(SI, opNo));
+      continue;
+    }
+    
+    if (CallInst *CI = dyn_cast<CallInst>(U)) {
+      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
+      if (!IA) return true;
+      
+      // If this is a memory operand, we're cool, otherwise bail out.
+      if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
+        return true;
+      continue;
+    }
+    
+    if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts,
+                          TLI))
+      return true;
+  }
+
+  return false;
+}
+
+/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at
+/// the use site that we're folding it into.  If so, there is no cost to
+/// include it in the addressing mode.  KnownLive1 and KnownLive2 are two values
+/// that we know are live at the instruction already.
+bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
+                                                   Value *KnownLive2) {
+  // If Val is either of the known-live values, we know it is live!
+  if (Val == 0 || Val == KnownLive1 || Val == KnownLive2)
+    return true;
+  
+  // All values other than instructions and arguments (e.g. constants) are live.
+  if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
+  
+  // If Val is a constant sized alloca in the entry block, it is live, this is
+  // true because it is just a reference to the stack/frame pointer, which is
+  // live for the whole function.
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
+    if (AI->isStaticAlloca())
+      return true;
+  
+  // Check to see if this value is already used in the memory instruction's
+  // block.  If so, it's already live into the block at the very least, so we
+  // can reasonably fold it.
+  return Val->isUsedInBasicBlock(MemoryInst->getParent());
+}
+
+/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing
+/// mode of the machine to fold the specified instruction into a load or store
+/// that ultimately uses it.  However, the specified instruction has multiple
+/// uses.  Given this, it may actually increase register pressure to fold it
+/// into the load.  For example, consider this code:
+///
+///     X = ...
+///     Y = X+1
+///     use(Y)   -> nonload/store
+///     Z = Y+1
+///     load Z
+///
+/// In this case, Y has multiple uses, and can be folded into the load of Z
+/// (yielding load [X+2]).  However, doing this will cause both "X" and "X+1" to
+/// be live at the use(Y) line.  If we don't fold Y into load Z, we use one
+/// fewer register.  Since Y can't be folded into "use(Y)" we don't increase the
+/// number of computations either.
+///
+/// Note that this (like most of CodeGenPrepare) is just a rough heuristic.  If
+/// X was live across 'load Z' for other reasons, we actually *would* want to
+/// fold the addressing mode in the Z case.  This would make Y die earlier.
+bool AddressingModeMatcher::
+IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
+                                     ExtAddrMode &AMAfter) {
+  if (IgnoreProfitability) return true;
+  
+  // AMBefore is the addressing mode before this instruction was folded into it,
+  // and AMAfter is the addressing mode after the instruction was folded.  Get
+  // the set of registers referenced by AMAfter and subtract out those
+  // referenced by AMBefore: this is the set of values which folding in this
+  // address extends the lifetime of.
+  //
+  // Note that there are only two potential values being referenced here,
+  // BaseReg and ScaleReg (global addresses are always available, as are any
+  // folded immediates).
+  Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
+  
+  // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
+  // lifetime wasn't extended by adding this instruction.
+  if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+    BaseReg = 0;
+  if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+    ScaledReg = 0;
+
+  // If folding this instruction (and it's subexprs) didn't extend any live
+  // ranges, we're ok with it.
+  if (BaseReg == 0 && ScaledReg == 0)
+    return true;
+
+  // If all uses of this instruction are ultimately load/store/inlineasm's,
+  // check to see if their addressing modes will include this instruction.  If
+  // so, we can fold it into all uses, so it doesn't matter if it has multiple
+  // uses.
+  SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
+  SmallPtrSet<Instruction*, 16> ConsideredInsts;
+  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
+    return false;  // Has a non-memory, non-foldable use!
+  
+  // Now that we know that all uses of this instruction are part of a chain of
+  // computation involving only operations that could theoretically be folded
+  // into a memory use, loop over each of these uses and see if they could
+  // *actually* fold the instruction.
+  SmallVector<Instruction*, 32> MatchedAddrModeInsts;
+  for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
+    Instruction *User = MemoryUses[i].first;
+    unsigned OpNo = MemoryUses[i].second;
+    
+    // Get the access type of this use.  If the use isn't a pointer, we don't
+    // know what it accesses.
+    Value *Address = User->getOperand(OpNo);
+    if (!Address->getType()->isPointerTy())
+      return false;
+    Type *AddressAccessTy =
+      cast<PointerType>(Address->getType())->getElementType();
+    
+    // Do a match against the root of this address, ignoring profitability. This
+    // will tell us if the addressing mode for the memory operation will
+    // *actually* cover the shared instruction.
+    ExtAddrMode Result;
+    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
+                                  MemoryInst, Result);
+    Matcher.IgnoreProfitability = true;
+    bool Success = Matcher.MatchAddr(Address, 0);
+    (void)Success; assert(Success && "Couldn't select *anything*?");
+
+    // If the match didn't cover I, then it won't be shared by it.
+    if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
+                  I) == MatchedAddrModeInsts.end())
+      return false;
+    
+    MatchedAddrModeInsts.clear();
+  }
+  
+  return true;
+}
+
+} // end anonymous namespace
+
+/// IsNonLocalValue - Return true if the specified values are defined in a
+/// different basic block than BB.
+static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() != BB;
+  return false;
+}
+
+/// OptimizeMemoryInst - Load and Store Instructions often have
+/// addressing modes that can do significant amounts of computation.  As such,
+/// instruction selection will try to get the load or store to do as much
+/// computation as possible for the program.  The problem is that isel can only
+/// see within a single block.  As such, we sink as much legal addressing mode
+/// stuff into the block as possible.
+///
+/// This method is used to optimize both load/store and inline asms with memory
+/// operands.
+bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+                                        Type *AccessTy) {
+  Value *Repl = Addr;
+
+  // Try to collapse single-value PHI nodes.  This is necessary to undo
+  // unprofitable PRE transformations.
+  SmallVector<Value*, 8> worklist;
+  SmallPtrSet<Value*, 16> Visited;
+  worklist.push_back(Addr);
+
+  // Use a worklist to iteratively look through PHI nodes, and ensure that
+  // the addressing mode obtained from the non-PHI roots of the graph
+  // are equivalent.
+  Value *Consensus = 0;
+  unsigned NumUsesConsensus = 0;
+  bool IsNumUsesConsensusValid = false;
+  SmallVector<Instruction*, 16> AddrModeInsts;
+  ExtAddrMode AddrMode;
+  while (!worklist.empty()) {
+    Value *V = worklist.back();
+    worklist.pop_back();
+
+    // Break use-def graph loops.
+    if (!Visited.insert(V)) {
+      Consensus = 0;
+      break;
+    }
+
+    // For a PHI node, push all of its incoming values.
+    if (PHINode *P = dyn_cast<PHINode>(V)) {
+      for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i)
+        worklist.push_back(P->getIncomingValue(i));
+      continue;
+    }
+
+    // For non-PHIs, determine the addressing mode being computed.
+    SmallVector<Instruction*, 16> NewAddrModeInsts;
+    ExtAddrMode NewAddrMode =
+      AddressingModeMatcher::Match(V, AccessTy, MemoryInst,
+                                   NewAddrModeInsts, *TLI);
+
+    // This check is broken into two cases with very similar code to avoid using
+    // getNumUses() as much as possible. Some values have a lot of uses, so
+    // calling getNumUses() unconditionally caused a significant compile-time
+    // regression.
+    if (!Consensus) {
+      Consensus = V;
+      AddrMode = NewAddrMode;
+      AddrModeInsts = NewAddrModeInsts;
+      continue;
+    } else if (NewAddrMode == AddrMode) {
+      if (!IsNumUsesConsensusValid) {
+        NumUsesConsensus = Consensus->getNumUses();
+        IsNumUsesConsensusValid = true;
+      }
+
+      // Ensure that the obtained addressing mode is equivalent to that obtained
+      // for all other roots of the PHI traversal.  Also, when choosing one
+      // such root as representative, select the one with the most uses in order
+      // to keep the cost modeling heuristics in AddressingModeMatcher
+      // applicable.
+      unsigned NumUses = V->getNumUses();
+      if (NumUses > NumUsesConsensus) {
+        Consensus = V;
+        NumUsesConsensus = NumUses;
+        AddrModeInsts = NewAddrModeInsts;
+      }
+      continue;
+    }
+
+    Consensus = 0;
+    break;
+  }
+
+  // If the addressing mode couldn't be determined, or if multiple different
+  // ones were determined, bail out now.
+  if (!Consensus) return false;
+
+  // Check to see if any of the instructions supersumed by this addr mode are
+  // non-local to I's BB.
+  bool AnyNonLocal = false;
+  for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
+    if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) {
+      AnyNonLocal = true;
+      break;
+    }
+  }
+
+  // If all the instructions matched are already in this BB, don't do anything.
+  if (!AnyNonLocal) {
+    DEBUG(dbgs() << "CGP: Found      local addrmode: " << AddrMode << "\n");
+    return false;
+  }
+
+  // Insert this computation right after this user.  Since our caller is
+  // scanning from the top of the BB to the bottom, reuse of the expr are
+  // guaranteed to happen later.
+  IRBuilder<> Builder(MemoryInst);
+
+  // Now that we determined the addressing expression we want to use and know
+  // that we have to sink it into this block.  Check to see if we have already
+  // done this for some other load/store instr in this block.  If so, reuse the
+  // computation.
+  Value *&SunkAddr = SunkAddrs[Addr];
+  if (SunkAddr) {
+    DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
+                 << *MemoryInst);
+    if (SunkAddr->getType() != Addr->getType())
+      SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
+  } else {
+    DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
+                 << *MemoryInst);
+    Type *IntPtrTy =
+          TLI->getDataLayout()->getIntPtrType(AccessTy->getContext());
+
+    Value *Result = 0;
+
+    // Start with the base register. Do this first so that subsequent address
+    // matching finds it last, which will prevent it from trying to match it
+    // as the scaled value in case it happens to be a mul. That would be
+    // problematic if we've sunk a different mul for the scale, because then
+    // we'd end up sinking both muls.
+    if (AddrMode.BaseReg) {
+      Value *V = AddrMode.BaseReg;
+      if (V->getType()->isPointerTy())
+        V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
+      if (V->getType() != IntPtrTy)
+        V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
+      Result = V;
+    }
+
+    // Add the scale value.
+    if (AddrMode.Scale) {
+      Value *V = AddrMode.ScaledReg;
+      if (V->getType() == IntPtrTy) {
+        // done.
+      } else if (V->getType()->isPointerTy()) {
+        V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
+      } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
+                 cast<IntegerType>(V->getType())->getBitWidth()) {
+        V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
+      } else {
+        V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr");
+      }
+      if (AddrMode.Scale != 1)
+        V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
+                              "sunkaddr");
+      if (Result)
+        Result = Builder.CreateAdd(Result, V, "sunkaddr");
+      else
+        Result = V;
+    }
+
+    // Add in the BaseGV if present.
+    if (AddrMode.BaseGV) {
+      Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
+      if (Result)
+        Result = Builder.CreateAdd(Result, V, "sunkaddr");
+      else
+        Result = V;
+    }
+
+    // Add in the Base Offset if present.
+    if (AddrMode.BaseOffs) {
+      Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+      if (Result)
+        Result = Builder.CreateAdd(Result, V, "sunkaddr");
+      else
+        Result = V;
+    }
+
+    if (Result == 0)
+      SunkAddr = Constant::getNullValue(Addr->getType());
+    else
+      SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
+  }
+
+  MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
+
+  // If we have no uses, recursively delete the value and all dead instructions
+  // using it.
+  if (Repl->use_empty()) {
+    // This can cause recursive deletion, which can invalidate our iterator.
+    // Use a WeakVH to hold onto it in case this happens.
+    WeakVH IterHandle(CurInstIterator);
+    BasicBlock *BB = CurInstIterator->getParent();
+
+    RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
+
+    if (IterHandle != CurInstIterator) {
+      // If the iterator instruction was recursively deleted, start over at the
+      // start of the block.
+      CurInstIterator = BB->begin();
+      SunkAddrs.clear();
+    } else {
+      // This address is now available for reassignment, so erase the table
+      // entry; we don't want to match some completely different instruction.
+      SunkAddrs[Addr] = 0;
+    }
+  }
+  ++NumMemoryInsts;
+  return true;
+}
+
+/// OptimizeInlineAsmInst - If there are any memory operands, use
+/// OptimizeMemoryInst to sink their address computing into the block when
+/// possible / profitable.
+bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
+  bool MadeChange = false;
+
+  TargetLowering::AsmOperandInfoVector
+    TargetConstraints = TLI->ParseConstraints(CS);
+  unsigned ArgNo = 0;
+  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+
+    // Compute the constraint code and ConstraintType to use.
+    TLI->ComputeConstraintToUse(OpInfo, SDValue());
+
+    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+        OpInfo.isIndirect) {
+      Value *OpVal = CS->getArgOperand(ArgNo++);
+      MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType());
+    } else if (OpInfo.Type == InlineAsm::isInput)
+      ArgNo++;
+  }
+
+  return MadeChange;
+}
+
+/// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
+/// basic block as the load, unless conditions are unfavorable. This allows
+/// SelectionDAG to fold the extend into the load.
+///
+bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) {
+  // Look for a load being extended.
+  LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0));
+  if (!LI) return false;
+
+  // If they're already in the same block, there's nothing to do.
+  if (LI->getParent() == I->getParent())
+    return false;
+
+  // If the load has other users and the truncate is not free, this probably
+  // isn't worthwhile.
+  if (!LI->hasOneUse() &&
+      TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) ||
+              !TLI->isTypeLegal(TLI->getValueType(I->getType()))) &&
+      !TLI->isTruncateFree(I->getType(), LI->getType()))
+    return false;
+
+  // Check whether the target supports casts folded into loads.
+  unsigned LType;
+  if (isa<ZExtInst>(I))
+    LType = ISD::ZEXTLOAD;
+  else {
+    assert(isa<SExtInst>(I) && "Unexpected ext type!");
+    LType = ISD::SEXTLOAD;
+  }
+  if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType())))
+    return false;
+
+  // Move the extend into the same block as the load, so that SelectionDAG
+  // can fold it.
+  I->removeFromParent();
+  I->insertAfter(LI);
+  ++NumExtsMoved;
+  return true;
+}
+
+bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
+  BasicBlock *DefBB = I->getParent();
+
+  // If the result of a {s|z}ext and its source are both live out, rewrite all
+  // other uses of the source with result of extension.
+  Value *Src = I->getOperand(0);
+  if (Src->hasOneUse())
+    return false;
+
+  // Only do this xform if truncating is free.
+  if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
+    return false;
+
+  // Only safe to perform the optimization if the source is also defined in
+  // this block.
+  if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
+    return false;
+
+  bool DefIsLiveOut = false;
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this ext is used in.
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+    DefIsLiveOut = true;
+    break;
+  }
+  if (!DefIsLiveOut)
+    return false;
+
+  // Make sure non of the uses are PHI nodes.
+  for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+    // Be conservative. We don't want this xform to end up introducing
+    // reloads just before load / store instructions.
+    if (isa<PHINode>(User) || isa<LoadInst>(User) || isa<StoreInst>(User))
+      return false;
+  }
+
+  // InsertedTruncs - Only insert one trunc in each block once.
+  DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end();
+       UI != E; ++UI) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Figure out which BB this ext is used in.
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB) continue;
+
+    // Both src and def are live in this block. Rewrite the use.
+    Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
+
+    if (!InsertedTrunc) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+      InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt);
+    }
+
+    // Replace a use of the {s|z}ext source with a use of the result.
+    TheUse = InsertedTrunc;
+    ++NumExtUses;
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be
+/// turned into an explicit branch.
+static bool isFormingBranchFromSelectProfitable(SelectInst *SI) {
+  // FIXME: This should use the same heuristics as IfConversion to determine
+  // whether a select is better represented as a branch.  This requires that
+  // branch probability metadata is preserved for the select, which is not the
+  // case currently.
+
+  CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+
+  // If the branch is predicted right, an out of order CPU can avoid blocking on
+  // the compare.  Emit cmovs on compares with a memory operand as branches to
+  // avoid stalls on the load from memory.  If the compare has more than one use
+  // there's probably another cmov or setcc around so it's not worth emitting a
+  // branch.
+  if (!Cmp)
+    return false;
+
+  Value *CmpOp0 = Cmp->getOperand(0);
+  Value *CmpOp1 = Cmp->getOperand(1);
+
+  // We check that the memory operand has one use to avoid uses of the loaded
+  // value directly after the compare, making branches unprofitable.
+  return Cmp->hasOneUse() &&
+         ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) ||
+          (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse()));
+}
+
+
+/// If we have a SelectInst that will likely profit from branch prediction,
+/// turn it into a branch.
+bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
+  bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
+
+  // Can we convert the 'select' to CF ?
+  if (DisableSelectToBranch || OptSize || !TLI || VectorCond)
+    return false;
+
+  TargetLowering::SelectSupportKind SelectKind;
+  if (VectorCond)
+    SelectKind = TargetLowering::VectorMaskSelect;
+  else if (SI->getType()->isVectorTy())
+    SelectKind = TargetLowering::ScalarCondVectorVal;
+  else
+    SelectKind = TargetLowering::ScalarValSelect;
+
+  // Do we have efficient codegen support for this kind of 'selects' ?
+  if (TLI->isSelectSupported(SelectKind)) {
+    // We have efficient codegen support for the select instruction.
+    // Check if it is profitable to keep this 'select'.
+    if (!TLI->isPredictableSelectExpensive() ||
+        !isFormingBranchFromSelectProfitable(SI))
+      return false;
+  }
+
+  ModifiedDT = true;
+
+  // First, we split the block containing the select into 2 blocks.
+  BasicBlock *StartBlock = SI->getParent();
+  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI));
+  BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+
+  // Create a new block serving as the landing pad for the branch.
+  BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid",
+                                             NextBlock->getParent(), NextBlock);
+
+  // Move the unconditional branch from the block with the select in it into our
+  // landing pad block.
+  StartBlock->getTerminator()->eraseFromParent();
+  BranchInst::Create(NextBlock, SmallBlock);
+
+  // Insert the real conditional branch based on the original condition.
+  BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI);
+
+  // The select itself is replaced with a PHI Node.
+  PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin());
+  PN->takeName(SI);
+  PN->addIncoming(SI->getTrueValue(), StartBlock);
+  PN->addIncoming(SI->getFalseValue(), SmallBlock);
+  SI->replaceAllUsesWith(PN);
+  SI->eraseFromParent();
+
+  // Instruct OptimizeBlock to skip to the next block.
+  CurInstIterator = StartBlock->end();
+  ++NumSelectsExpanded;
+  return true;
+}
+
+bool CodeGenPrepare::OptimizeInst(Instruction *I) {
+  if (PHINode *P = dyn_cast<PHINode>(I)) {
+    // It is possible for very late stage optimizations (such as SimplifyCFG)
+    // to introduce PHI nodes too late to be cleaned up.  If we detect such a
+    // trivial PHI, go ahead and zap it here.
+    if (Value *V = SimplifyInstruction(P)) {
+      P->replaceAllUsesWith(V);
+      P->eraseFromParent();
+      ++NumPHIsElim;
+      return true;
+    }
+    return false;
+  }
+
+  if (CastInst *CI = dyn_cast<CastInst>(I)) {
+    // If the source of the cast is a constant, then this should have
+    // already been constant folded.  The only reason NOT to constant fold
+    // it is if something (e.g. LSR) was careful to place the constant
+    // evaluation in a block other than then one that uses it (e.g. to hoist
+    // the address of globals out of a loop).  If this is the case, we don't
+    // want to forward-subst the cast.
+    if (isa<Constant>(CI->getOperand(0)))
+      return false;
+
+    if (TLI && OptimizeNoopCopyExpression(CI, *TLI))
+      return true;
+
+    if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
+      bool MadeChange = MoveExtToFormExtLoad(I);
+      return MadeChange | OptimizeExtUses(I);
+    }
+    return false;
+  }
+
+  if (CmpInst *CI = dyn_cast<CmpInst>(I))
+    return OptimizeCmpExpression(CI);
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (TLI)
+      return OptimizeMemoryInst(I, I->getOperand(0), LI->getType());
+    return false;
+  }
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+    if (TLI)
+      return OptimizeMemoryInst(I, SI->getOperand(1),
+                                SI->getOperand(0)->getType());
+    return false;
+  }
+
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+    if (GEPI->hasAllZeroIndices()) {
+      /// The GEP operand must be a pointer, so must its result -> BitCast
+      Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
+                                        GEPI->getName(), GEPI);
+      GEPI->replaceAllUsesWith(NC);
+      GEPI->eraseFromParent();
+      ++NumGEPsElim;
+      OptimizeInst(NC);
+      return true;
+    }
+    return false;
+  }
+
+  if (CallInst *CI = dyn_cast<CallInst>(I))
+    return OptimizeCallInst(CI);
+
+  if (SelectInst *SI = dyn_cast<SelectInst>(I))
+    return OptimizeSelectInst(SI);
+
+  return false;
+}
+
+// In this pass we look for GEP and cast instructions that are used
+// across basic blocks and rewrite them to improve basic-block-at-a-time
+// selection.
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+  SunkAddrs.clear();
+  bool MadeChange = false;
+
+  CurInstIterator = BB.begin();
+  while (CurInstIterator != BB.end())
+    MadeChange |= OptimizeInst(CurInstIterator++);
+
+  MadeChange |= DupRetToEnableTailCallOpts(&BB);
+
+  return MadeChange;
+}
+
+// llvm.dbg.value is far away from the value then iSel may not be able
+// handle it properly. iSel will drop llvm.dbg.value if it can not
+// find a node corresponding to the value.
+bool CodeGenPrepare::PlaceDbgValues(Function &F) {
+  bool MadeChange = false;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    Instruction *PrevNonDbgInst = NULL;
+    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+      Instruction *Insn = BI; ++BI;
+      DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
+      if (!DVI) {
+        PrevNonDbgInst = Insn;
+        continue;
+      }
+
+      Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
+      if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
+        DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
+        DVI->removeFromParent();
+        if (isa<PHINode>(VI))
+          DVI->insertBefore(VI->getParent()->getFirstInsertionPt());
+        else
+          DVI->insertAfter(VI);
+        MadeChange = true;
+        ++NumDbgValueMoved;
+      }
+    }
+  }
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
new file mode 100644
index 000000000000..d5a96eceb993
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -0,0 +1,98 @@
+//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements constant propagation and merging:
+//
+// Specifically, this:
+//   * Converts instructions like "add int 1, 2" into 3
+//
+// Notice that:
+//   * This pass has a habit of making definitions be dead.  It is a good idea
+//     to run a DIE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "constprop"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInstKilled, "Number of instructions killed");
+
+namespace {
+  struct ConstantPropagation : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    ConstantPropagation() : FunctionPass(ID) {
+      initializeConstantPropagationPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<TargetLibraryInfo>();
+    }
+  };
+}
+
+char ConstantPropagation::ID = 0;
+INITIALIZE_PASS_BEGIN(ConstantPropagation, "constprop",
+                "Simple constant propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(ConstantPropagation, "constprop",
+                "Simple constant propagation", false, false)
+
+FunctionPass *llvm::createConstantPropagationPass() {
+  return new ConstantPropagation();
+}
+
+bool ConstantPropagation::runOnFunction(Function &F) {
+  // Initialize the worklist to all of the instructions ready to process...
+  std::set<Instruction*> WorkList;
+  for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+      WorkList.insert(&*i);
+  }
+  bool Changed = false;
+  DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+
+  while (!WorkList.empty()) {
+    Instruction *I = *WorkList.begin();
+    WorkList.erase(WorkList.begin());    // Get an element from the worklist...
+
+    if (!I->use_empty())                 // Don't muck with dead instructions...
+      if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) {
+        // Add all of the users of this instruction to the worklist, they might
+        // be constant propagatable now...
+        for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+             UI != UE; ++UI)
+          WorkList.insert(cast<Instruction>(*UI));
+
+        // Replace all of the uses of a variable with uses of the constant.
+        I->replaceAllUsesWith(C);
+
+        // Remove the dead instruction.
+        WorkList.erase(I);
+        I->eraseFromParent();
+
+        // We made a change to the function...
+        Changed = true;
+        ++NumInstKilled;
+      }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
new file mode 100644
index 000000000000..995782e1bc6b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -0,0 +1,321 @@
+//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Correlated Value Propagation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "correlated-value-propagation"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumPhis,      "Number of phis propagated");
+STATISTIC(NumSelects,   "Number of selects propagated");
+STATISTIC(NumMemAccess, "Number of memory access targets propagated");
+STATISTIC(NumCmps,      "Number of comparisons propagated");
+STATISTIC(NumDeadCases, "Number of switch cases removed");
+
+namespace {
+  class CorrelatedValuePropagation : public FunctionPass {
+    LazyValueInfo *LVI;
+
+    bool processSelect(SelectInst *SI);
+    bool processPHI(PHINode *P);
+    bool processMemAccess(Instruction *I);
+    bool processCmp(CmpInst *C);
+    bool processSwitch(SwitchInst *SI);
+
+  public:
+    static char ID;
+    CorrelatedValuePropagation(): FunctionPass(ID) {
+     initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LazyValueInfo>();
+    }
+  };
+}
+
+char CorrelatedValuePropagation::ID = 0;
+INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
+                "Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
+INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
+                "Value Propagation", false, false)
+
+// Public interface to the Value Propagation pass
+Pass *llvm::createCorrelatedValuePropagationPass() {
+  return new CorrelatedValuePropagation();
+}
+
+bool CorrelatedValuePropagation::processSelect(SelectInst *S) {
+  if (S->getType()->isVectorTy()) return false;
+  if (isa<Constant>(S->getOperand(0))) return false;
+
+  Constant *C = LVI->getConstant(S->getOperand(0), S->getParent());
+  if (!C) return false;
+
+  ConstantInt *CI = dyn_cast<ConstantInt>(C);
+  if (!CI) return false;
+
+  Value *ReplaceWith = S->getOperand(1);
+  Value *Other = S->getOperand(2);
+  if (!CI->isOne()) std::swap(ReplaceWith, Other);
+  if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
+
+  S->replaceAllUsesWith(ReplaceWith);
+  S->eraseFromParent();
+
+  ++NumSelects;
+
+  return true;
+}
+
+bool CorrelatedValuePropagation::processPHI(PHINode *P) {
+  bool Changed = false;
+
+  BasicBlock *BB = P->getParent();
+  for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+    Value *Incoming = P->getIncomingValue(i);
+    if (isa<Constant>(Incoming)) continue;
+
+    Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB);
+
+    // Look if the incoming value is a select with a constant but LVI tells us
+    // that the incoming value can never be that constant. In that case replace
+    // the incoming value with the other value of the select. This often allows
+    // us to remove the select later.
+    if (!V) {
+      SelectInst *SI = dyn_cast<SelectInst>(Incoming);
+      if (!SI) continue;
+
+      Constant *C = dyn_cast<Constant>(SI->getFalseValue());
+      if (!C) continue;
+
+      if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
+                                  P->getIncomingBlock(i), BB) !=
+          LazyValueInfo::False)
+        continue;
+
+      DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+      V = SI->getTrueValue();
+    }
+
+    P->setIncomingValue(i, V);
+    Changed = true;
+  }
+
+  if (Value *V = SimplifyInstruction(P)) {
+    P->replaceAllUsesWith(V);
+    P->eraseFromParent();
+    Changed = true;
+  }
+
+  if (Changed)
+    ++NumPhis;
+
+  return Changed;
+}
+
+bool CorrelatedValuePropagation::processMemAccess(Instruction *I) {
+  Value *Pointer = 0;
+  if (LoadInst *L = dyn_cast<LoadInst>(I))
+    Pointer = L->getPointerOperand();
+  else
+    Pointer = cast<StoreInst>(I)->getPointerOperand();
+
+  if (isa<Constant>(Pointer)) return false;
+
+  Constant *C = LVI->getConstant(Pointer, I->getParent());
+  if (!C) return false;
+
+  ++NumMemAccess;
+  I->replaceUsesOfWith(Pointer, C);
+  return true;
+}
+
+/// processCmp - If the value of this comparison could be determined locally,
+/// constant propagation would already have figured it out.  Instead, walk
+/// the predecessors and statically evaluate the comparison based on information
+/// available on that edge.  If a given static evaluation is true on ALL
+/// incoming edges, then it's true universally and we can simplify the compare.
+bool CorrelatedValuePropagation::processCmp(CmpInst *C) {
+  Value *Op0 = C->getOperand(0);
+  if (isa<Instruction>(Op0) &&
+      cast<Instruction>(Op0)->getParent() == C->getParent())
+    return false;
+
+  Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+  if (!Op1) return false;
+
+  pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent());
+  if (PI == PE) return false;
+
+  LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(),
+                                    C->getOperand(0), Op1, *PI, C->getParent());
+  if (Result == LazyValueInfo::Unknown) return false;
+
+  ++PI;
+  while (PI != PE) {
+    LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(),
+                                    C->getOperand(0), Op1, *PI, C->getParent());
+    if (Res != Result) return false;
+    ++PI;
+  }
+
+  ++NumCmps;
+
+  if (Result == LazyValueInfo::True)
+    C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext()));
+  else
+    C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext()));
+
+  C->eraseFromParent();
+
+  return true;
+}
+
+/// processSwitch - Simplify a switch instruction by removing cases which can
+/// never fire.  If the uselessness of a case could be determined locally then
+/// constant propagation would already have figured it out.  Instead, walk the
+/// predecessors and statically evaluate cases based on information available
+/// on that edge.  Cases that cannot fire no matter what the incoming edge can
+/// safely be removed.  If a case fires on every incoming edge then the entire
+/// switch can be removed and replaced with a branch to the case destination.
+bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {
+  Value *Cond = SI->getCondition();
+  BasicBlock *BB = SI->getParent();
+
+  // If the condition was defined in same block as the switch then LazyValueInfo
+  // currently won't say anything useful about it, though in theory it could.
+  if (isa<Instruction>(Cond) && cast<Instruction>(Cond)->getParent() == BB)
+    return false;
+
+  // If the switch is unreachable then trying to improve it is a waste of time.
+  pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
+  if (PB == PE) return false;
+
+  // Analyse each switch case in turn.  This is done in reverse order so that
+  // removing a case doesn't cause trouble for the iteration.
+  bool Changed = false;
+  for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
+       ) {
+    ConstantInt *Case = CI.getCaseValue();
+
+    // Check to see if the switch condition is equal to/not equal to the case
+    // value on every incoming edge, equal/not equal being the same each time.
+    LazyValueInfo::Tristate State = LazyValueInfo::Unknown;
+    for (pred_iterator PI = PB; PI != PE; ++PI) {
+      // Is the switch condition equal to the case value?
+      LazyValueInfo::Tristate Value = LVI->getPredicateOnEdge(CmpInst::ICMP_EQ,
+                                                              Cond, Case, *PI, BB);
+      // Give up on this case if nothing is known.
+      if (Value == LazyValueInfo::Unknown) {
+        State = LazyValueInfo::Unknown;
+        break;
+      }
+
+      // If this was the first edge to be visited, record that all other edges
+      // need to give the same result.
+      if (PI == PB) {
+        State = Value;
+        continue;
+      }
+
+      // If this case is known to fire for some edges and known not to fire for
+      // others then there is nothing we can do - give up.
+      if (Value != State) {
+        State = LazyValueInfo::Unknown;
+        break;
+      }
+    }
+
+    if (State == LazyValueInfo::False) {
+      // This case never fires - remove it.
+      CI.getCaseSuccessor()->removePredecessor(BB);
+      SI->removeCase(CI); // Does not invalidate the iterator.
+
+      // The condition can be modified by removePredecessor's PHI simplification
+      // logic.
+      Cond = SI->getCondition();
+
+      ++NumDeadCases;
+      Changed = true;
+    } else if (State == LazyValueInfo::True) {
+      // This case always fires.  Arrange for the switch to be turned into an
+      // unconditional branch by replacing the switch condition with the case
+      // value.
+      SI->setCondition(Case);
+      NumDeadCases += SI->getNumCases();
+      Changed = true;
+      break;
+    }
+  }
+
+  if (Changed)
+    // If the switch has been simplified to the point where it can be replaced
+    // by a branch then do so now.
+    ConstantFoldTerminator(BB);
+
+  return Changed;
+}
+
+bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+  LVI = &getAnalysis<LazyValueInfo>();
+
+  bool FnChanged = false;
+
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
+    bool BBChanged = false;
+    for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) {
+      Instruction *II = BI++;
+      switch (II->getOpcode()) {
+      case Instruction::Select:
+        BBChanged |= processSelect(cast<SelectInst>(II));
+        break;
+      case Instruction::PHI:
+        BBChanged |= processPHI(cast<PHINode>(II));
+        break;
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        BBChanged |= processCmp(cast<CmpInst>(II));
+        break;
+      case Instruction::Load:
+      case Instruction::Store:
+        BBChanged |= processMemAccess(II);
+        break;
+      }
+    }
+
+    Instruction *Term = FI->getTerminator();
+    switch (Term->getOpcode()) {
+    case Instruction::Switch:
+      BBChanged |= processSwitch(cast<SwitchInst>(Term));
+      break;
+    }
+
+    FnChanged |= BBChanged;
+  }
+
+  return FnChanged;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
new file mode 100644
index 000000000000..e8a090af40c3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -0,0 +1,134 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead.  Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
+STATISTIC(DCEEliminated, "Number of insts removed");
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // DeadInstElimination pass implementation
+  //
+  struct DeadInstElimination : public BasicBlockPass {
+    static char ID; // Pass identification, replacement for typeid
+    DeadInstElimination() : BasicBlockPass(ID) {
+      initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
+    }
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+      bool Changed = false;
+      for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
+        Instruction *Inst = DI++;
+        if (isInstructionTriviallyDead(Inst, TLI)) {
+          Inst->eraseFromParent();
+          Changed = true;
+          ++DIEEliminated;
+        }
+      }
+      return Changed;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  };
+}
+
+char DeadInstElimination::ID = 0;
+INITIALIZE_PASS(DeadInstElimination, "die",
+                "Dead Instruction Elimination", false, false)
+
+Pass *llvm::createDeadInstEliminationPass() {
+  return new DeadInstElimination();
+}
+
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // DeadCodeElimination pass implementation
+  //
+  struct DCE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    DCE() : FunctionPass(ID) {
+      initializeDCEPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+ };
+}
+
+char DCE::ID = 0;
+INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false)
+
+bool DCE::runOnFunction(Function &F) {
+  TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+
+  // Start out with all of the instructions in the worklist...
+  std::vector<Instruction*> WorkList;
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i)
+    WorkList.push_back(&*i);
+
+  // Loop over the worklist finding instructions that are dead.  If they are
+  // dead make them drop all of their uses, making other instructions
+  // potentially dead, and work until the worklist is empty.
+  //
+  bool MadeChange = false;
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.back();
+    WorkList.pop_back();
+
+    if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead.
+      // Loop over all of the values that the instruction uses, if there are
+      // instructions being used, add them to the worklist, because they might
+      // go dead after this one is removed.
+      //
+      for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+        if (Instruction *Used = dyn_cast<Instruction>(*OI))
+          WorkList.push_back(Used);
+
+      // Remove the instruction.
+      I->eraseFromParent();
+
+      // Remove the instruction from the worklist if it still exists in it.
+      WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I),
+                     WorkList.end());
+
+      MadeChange = true;
+      ++DCEEliminated;
+    }
+  }
+  return MadeChange;
+}
+
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+  return new DCE();
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
new file mode 100644
index 000000000000..57432c7d71d8
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -0,0 +1,900 @@
+//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal.  Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dse"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther , "Number of other instrs removed");
+
+namespace {
+  struct DSE : public FunctionPass {
+    AliasAnalysis *AA;
+    MemoryDependenceAnalysis *MD;
+    DominatorTree *DT;
+    const TargetLibraryInfo *TLI;
+
+    static char ID; // Pass identification, replacement for typeid
+    DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) {
+      initializeDSEPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F) {
+      AA = &getAnalysis<AliasAnalysis>();
+      MD = &getAnalysis<MemoryDependenceAnalysis>();
+      DT = &getAnalysis<DominatorTree>();
+      TLI = AA->getTargetLibraryInfo();
+
+      bool Changed = false;
+      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+        // Only check non-dead blocks.  Dead blocks may have strange pointer
+        // cycles that will confuse alias analysis.
+        if (DT->isReachableFromEntry(I))
+          Changed |= runOnBasicBlock(*I);
+
+      AA = 0; MD = 0; DT = 0;
+      return Changed;
+    }
+
+    bool runOnBasicBlock(BasicBlock &BB);
+    bool HandleFree(CallInst *F);
+    bool handleEndBlock(BasicBlock &BB);
+    void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
+                               SmallSetVector<Value*, 16> &DeadStackObjects);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<MemoryDependenceAnalysis>();
+    }
+  };
+}
+
+char DSE::ID = 0;
+INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false)
+
+FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+/// DeleteDeadInstruction - Delete this instruction.  Before we do, go through
+/// and zero out all the operands of this instruction.  If any of them become
+/// dead, delete them and the computation tree that feeds them.
+///
+/// If ValueSet is non-null, remove any deleted instructions from it as well.
+///
+static void DeleteDeadInstruction(Instruction *I,
+                                  MemoryDependenceAnalysis &MD,
+                                  const TargetLibraryInfo *TLI,
+                                  SmallSetVector<Value*, 16> *ValueSet = 0) {
+  SmallVector<Instruction*, 32> NowDeadInsts;
+
+  NowDeadInsts.push_back(I);
+  --NumFastOther;
+
+  // Before we touch this instruction, remove it from memdep!
+  do {
+    Instruction *DeadInst = NowDeadInsts.pop_back_val();
+    ++NumFastOther;
+
+    // This instruction is dead, zap it, in stages.  Start by removing it from
+    // MemDep, which needs to know the operands and needs it to be in the
+    // function.
+    MD.removeInstruction(DeadInst);
+
+    for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
+      Value *Op = DeadInst->getOperand(op);
+      DeadInst->setOperand(op, 0);
+
+      // If this operand just became dead, add it to the NowDeadInsts list.
+      if (!Op->use_empty()) continue;
+
+      if (Instruction *OpI = dyn_cast<Instruction>(Op))
+        if (isInstructionTriviallyDead(OpI, TLI))
+          NowDeadInsts.push_back(OpI);
+    }
+
+    DeadInst->eraseFromParent();
+
+    if (ValueSet) ValueSet->remove(DeadInst);
+  } while (!NowDeadInsts.empty());
+}
+
+
+/// hasMemoryWrite - Does this instruction write some memory?  This only returns
+/// true for things that we can analyze with other helpers below.
+static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) {
+  if (isa<StoreInst>(I))
+    return true;
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default:
+      return false;
+    case Intrinsic::memset:
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+    case Intrinsic::init_trampoline:
+    case Intrinsic::lifetime_end:
+      return true;
+    }
+  }
+  if (CallSite CS = I) {
+    if (Function *F = CS.getCalledFunction()) {
+      if (TLI && TLI->has(LibFunc::strcpy) &&
+          F->getName() == TLI->getName(LibFunc::strcpy)) {
+        return true;
+      }
+      if (TLI && TLI->has(LibFunc::strncpy) &&
+          F->getName() == TLI->getName(LibFunc::strncpy)) {
+        return true;
+      }
+      if (TLI && TLI->has(LibFunc::strcat) &&
+          F->getName() == TLI->getName(LibFunc::strcat)) {
+        return true;
+      }
+      if (TLI && TLI->has(LibFunc::strncat) &&
+          F->getName() == TLI->getName(LibFunc::strncat)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// getLocForWrite - Return a Location stored to by the specified instruction.
+/// If isRemovable returns true, this function and getLocForRead completely
+/// describe the memory operations for this instruction.
+static AliasAnalysis::Location
+getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    return AA.getLocation(SI);
+
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
+    // memcpy/memmove/memset.
+    AliasAnalysis::Location Loc = AA.getLocationForDest(MI);
+    // If we don't have target data around, an unknown size in Location means
+    // that we should use the size of the pointee type.  This isn't valid for
+    // memset/memcpy, which writes more than an i8.
+    if (Loc.Size == AliasAnalysis::UnknownSize && AA.getDataLayout() == 0)
+      return AliasAnalysis::Location();
+    return Loc;
+  }
+
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+  if (II == 0) return AliasAnalysis::Location();
+
+  switch (II->getIntrinsicID()) {
+  default: return AliasAnalysis::Location(); // Unhandled intrinsic.
+  case Intrinsic::init_trampoline:
+    // If we don't have target data around, an unknown size in Location means
+    // that we should use the size of the pointee type.  This isn't valid for
+    // init.trampoline, which writes more than an i8.
+    if (AA.getDataLayout() == 0) return AliasAnalysis::Location();
+
+    // FIXME: We don't know the size of the trampoline, so we can't really
+    // handle it here.
+    return AliasAnalysis::Location(II->getArgOperand(0));
+  case Intrinsic::lifetime_end: {
+    uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+    return AliasAnalysis::Location(II->getArgOperand(1), Len);
+  }
+  }
+}
+
+/// getLocForRead - Return the location read by the specified "hasMemoryWrite"
+/// instruction if any.
+static AliasAnalysis::Location
+getLocForRead(Instruction *Inst, AliasAnalysis &AA) {
+  assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) &&
+         "Unknown instruction case");
+
+  // The only instructions that both read and write are the mem transfer
+  // instructions (memcpy/memmove).
+  if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst))
+    return AA.getLocationForSource(MTI);
+  return AliasAnalysis::Location();
+}
+
+
+/// isRemovable - If the value of this instruction and the memory it writes to
+/// is unused, may we delete this instruction?
+static bool isRemovable(Instruction *I) {
+  // Don't remove volatile/atomic stores.
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isUnordered();
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+    case Intrinsic::lifetime_end:
+      // Never remove dead lifetime_end's, e.g. because it is followed by a
+      // free.
+      return false;
+    case Intrinsic::init_trampoline:
+      // Always safe to remove init_trampoline.
+      return true;
+
+    case Intrinsic::memset:
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+      // Don't remove volatile memory intrinsics.
+      return !cast<MemIntrinsic>(II)->isVolatile();
+    }
+  }
+
+  if (CallSite CS = I)
+    return CS.getInstruction()->use_empty();
+
+  return false;
+}
+
+
+/// isShortenable - Returns true if this instruction can be safely shortened in
+/// length.
+static bool isShortenable(Instruction *I) {
+  // Don't shorten stores for now
+  if (isa<StoreInst>(I))
+    return false;
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+      default: return false;
+      case Intrinsic::memset:
+      case Intrinsic::memcpy:
+        // Do shorten memory intrinsics.
+        return true;
+    }
+  }
+
+  // Don't shorten libcalls calls for now.
+
+  return false;
+}
+
+/// getStoredPointerOperand - Return the pointer that is being written to.
+static Value *getStoredPointerOperand(Instruction *I) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+    return MI->getDest();
+
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    switch (II->getIntrinsicID()) {
+    default: llvm_unreachable("Unexpected intrinsic!");
+    case Intrinsic::init_trampoline:
+      return II->getArgOperand(0);
+    }
+  }
+
+  CallSite CS = I;
+  // All the supported functions so far happen to have dest as their first
+  // argument.
+  return CS.getArgument(0);
+}
+
+static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) {
+  uint64_t Size;
+  if (getObjectSize(V, Size, AA.getDataLayout(), AA.getTargetLibraryInfo()))
+    return Size;
+  return AliasAnalysis::UnknownSize;
+}
+
+namespace {
+  enum OverwriteResult
+  {
+    OverwriteComplete,
+    OverwriteEnd,
+    OverwriteUnknown
+  };
+}
+
+/// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location
+/// completely overwrites a store to the 'Earlier' location.
+/// 'OverwriteEnd' if the end of the 'Earlier' location is completely
+/// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined
+static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
+                                   const AliasAnalysis::Location &Earlier,
+                                   AliasAnalysis &AA,
+                                   int64_t &EarlierOff,
+                                   int64_t &LaterOff) {
+  const Value *P1 = Earlier.Ptr->stripPointerCasts();
+  const Value *P2 = Later.Ptr->stripPointerCasts();
+
+  // If the start pointers are the same, we just have to compare sizes to see if
+  // the later store was larger than the earlier store.
+  if (P1 == P2) {
+    // If we don't know the sizes of either access, then we can't do a
+    // comparison.
+    if (Later.Size == AliasAnalysis::UnknownSize ||
+        Earlier.Size == AliasAnalysis::UnknownSize) {
+      // If we have no DataLayout information around, then the size of the store
+      // is inferrable from the pointee type.  If they are the same type, then
+      // we know that the store is safe.
+      if (AA.getDataLayout() == 0 &&
+          Later.Ptr->getType() == Earlier.Ptr->getType())
+        return OverwriteComplete;
+
+      return OverwriteUnknown;
+    }
+
+    // Make sure that the Later size is >= the Earlier size.
+    if (Later.Size >= Earlier.Size)
+      return OverwriteComplete;
+  }
+
+  // Otherwise, we have to have size information, and the later store has to be
+  // larger than the earlier one.
+  if (Later.Size == AliasAnalysis::UnknownSize ||
+      Earlier.Size == AliasAnalysis::UnknownSize ||
+      AA.getDataLayout() == 0)
+    return OverwriteUnknown;
+
+  // Check to see if the later store is to the entire object (either a global,
+  // an alloca, or a byval argument).  If so, then it clearly overwrites any
+  // other store to the same object.
+  const DataLayout *TD = AA.getDataLayout();
+
+  const Value *UO1 = GetUnderlyingObject(P1, TD),
+              *UO2 = GetUnderlyingObject(P2, TD);
+
+  // If we can't resolve the same pointers to the same object, then we can't
+  // analyze them at all.
+  if (UO1 != UO2)
+    return OverwriteUnknown;
+
+  // If the "Later" store is to a recognizable object, get its size.
+  uint64_t ObjectSize = getPointerSize(UO2, AA);
+  if (ObjectSize != AliasAnalysis::UnknownSize)
+    if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
+      return OverwriteComplete;
+
+  // Okay, we have stores to two completely different pointers.  Try to
+  // decompose the pointer into a "base + constant_offset" form.  If the base
+  // pointers are equal, then we can reason about the two stores.
+  EarlierOff = 0;
+  LaterOff = 0;
+  const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD);
+  const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD);
+
+  // If the base pointers still differ, we have two completely different stores.
+  if (BP1 != BP2)
+    return OverwriteUnknown;
+
+  // The later store completely overlaps the earlier store if:
+  //
+  // 1. Both start at the same offset and the later one's size is greater than
+  //    or equal to the earlier one's, or
+  //
+  //      |--earlier--|
+  //      |--   later   --|
+  //
+  // 2. The earlier store has an offset greater than the later offset, but which
+  //    still lies completely within the later store.
+  //
+  //        |--earlier--|
+  //    |-----  later  ------|
+  //
+  // We have to be careful here as *Off is signed while *.Size is unsigned.
+  if (EarlierOff >= LaterOff &&
+      Later.Size >= Earlier.Size &&
+      uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
+    return OverwriteComplete;
+
+  // The other interesting case is if the later store overwrites the end of
+  // the earlier store
+  //
+  //      |--earlier--|
+  //                |--   later   --|
+  //
+  // In this case we may want to trim the size of earlier to avoid generating
+  // writes to addresses which will definitely be overwritten later
+  if (LaterOff > EarlierOff &&
+      LaterOff < int64_t(EarlierOff + Earlier.Size) &&
+      int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size))
+    return OverwriteEnd;
+
+  // Otherwise, they don't completely overlap.
+  return OverwriteUnknown;
+}
+
+/// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a
+/// memory region into an identical pointer) then it doesn't actually make its
+/// input dead in the traditional sense.  Consider this case:
+///
+///   memcpy(A <- B)
+///   memcpy(A <- A)
+///
+/// In this case, the second store to A does not make the first store to A dead.
+/// The usual situation isn't an explicit A<-A store like this (which can be
+/// trivially removed) but a case where two pointers may alias.
+///
+/// This function detects when it is unsafe to remove a dependent instruction
+/// because the DSE inducing instruction may be a self-read.
+static bool isPossibleSelfRead(Instruction *Inst,
+                               const AliasAnalysis::Location &InstStoreLoc,
+                               Instruction *DepWrite, AliasAnalysis &AA) {
+  // Self reads can only happen for instructions that read memory.  Get the
+  // location read.
+  AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA);
+  if (InstReadLoc.Ptr == 0) return false;  // Not a reading instruction.
+
+  // If the read and written loc obviously don't alias, it isn't a read.
+  if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false;
+
+  // Okay, 'Inst' may copy over itself.  However, we can still remove a the
+  // DepWrite instruction if we can prove that it reads from the same location
+  // as Inst.  This handles useful cases like:
+  //   memcpy(A <- B)
+  //   memcpy(A <- B)
+  // Here we don't know if A/B may alias, but we do know that B/B are must
+  // aliases, so removing the first memcpy is safe (assuming it writes <= #
+  // bytes as the second one.
+  AliasAnalysis::Location DepReadLoc = getLocForRead(DepWrite, AA);
+
+  if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+    return false;
+
+  // If DepWrite doesn't read memory or if we can't prove it is a must alias,
+  // then it can't be considered dead.
+  return true;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DSE Pass
+//===----------------------------------------------------------------------===//
+
+bool DSE::runOnBasicBlock(BasicBlock &BB) {
+  bool MadeChange = false;
+
+  // Do a top-down walk on the BB.
+  for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+    Instruction *Inst = BBI++;
+
+    // Handle 'free' calls specially.
+    if (CallInst *F = isFreeCall(Inst, TLI)) {
+      MadeChange |= HandleFree(F);
+      continue;
+    }
+
+    // If we find something that writes memory, get its memory dependence.
+    if (!hasMemoryWrite(Inst, TLI))
+      continue;
+
+    MemDepResult InstDep = MD->getDependency(Inst);
+
+    // Ignore any store where we can't find a local dependence.
+    // FIXME: cross-block DSE would be fun. :)
+    if (!InstDep.isDef() && !InstDep.isClobber())
+      continue;
+
+    // If we're storing the same value back to a pointer that we just
+    // loaded from, then the store can be removed.
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) {
+        if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+            SI->getOperand(0) == DepLoad && isRemovable(SI)) {
+          DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n  "
+                       << "LOAD: " << *DepLoad << "\n  STORE: " << *SI << '\n');
+
+          // DeleteDeadInstruction can delete the current instruction.  Save BBI
+          // in case we need it.
+          WeakVH NextInst(BBI);
+
+          DeleteDeadInstruction(SI, *MD, TLI);
+
+          if (NextInst == 0)  // Next instruction deleted.
+            BBI = BB.begin();
+          else if (BBI != BB.begin())  // Revisit this instruction if possible.
+            --BBI;
+          ++NumFastStores;
+          MadeChange = true;
+          continue;
+        }
+      }
+    }
+
+    // Figure out what location is being stored to.
+    AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA);
+
+    // If we didn't get a useful location, fail.
+    if (Loc.Ptr == 0)
+      continue;
+
+    while (InstDep.isDef() || InstDep.isClobber()) {
+      // Get the memory clobbered by the instruction we depend on.  MemDep will
+      // skip any instructions that 'Loc' clearly doesn't interact with.  If we
+      // end up depending on a may- or must-aliased load, then we can't optimize
+      // away the store and we bail out.  However, if we depend on on something
+      // that overwrites the memory location we *can* potentially optimize it.
+      //
+      // Find out what memory location the dependent instruction stores.
+      Instruction *DepWrite = InstDep.getInst();
+      AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA);
+      // If we didn't get a useful location, or if it isn't a size, bail out.
+      if (DepLoc.Ptr == 0)
+        break;
+
+      // If we find a write that is a) removable (i.e., non-volatile), b) is
+      // completely obliterated by the store to 'Loc', and c) which we know that
+      // 'Inst' doesn't load from, then we can remove it.
+      if (isRemovable(DepWrite) &&
+          !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) {
+        int64_t InstWriteOffset, DepWriteOffset;
+        OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA,
+                                         DepWriteOffset, InstWriteOffset);
+        if (OR == OverwriteComplete) {
+          DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: "
+                << *DepWrite << "\n  KILLER: " << *Inst << '\n');
+
+          // Delete the store and now-dead instructions that feed it.
+          DeleteDeadInstruction(DepWrite, *MD, TLI);
+          ++NumFastStores;
+          MadeChange = true;
+
+          // DeleteDeadInstruction can delete the current instruction in loop
+          // cases, reset BBI.
+          BBI = Inst;
+          if (BBI != BB.begin())
+            --BBI;
+          break;
+        } else if (OR == OverwriteEnd && isShortenable(DepWrite)) {
+          // TODO: base this on the target vector size so that if the earlier
+          // store was too small to get vector writes anyway then its likely
+          // a good idea to shorten it
+          // Power of 2 vector writes are probably always a bad idea to optimize
+          // as any store/memset/memcpy is likely using vector instructions so
+          // shortening it to not vector size is likely to be slower
+          MemIntrinsic* DepIntrinsic = cast<MemIntrinsic>(DepWrite);
+          unsigned DepWriteAlign = DepIntrinsic->getAlignment();
+          if (llvm::isPowerOf2_64(InstWriteOffset) ||
+              ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) {
+
+            DEBUG(dbgs() << "DSE: Remove Dead Store:\n  OW END: "
+                  << *DepWrite << "\n  KILLER (offset "
+                  << InstWriteOffset << ", "
+                  << DepLoc.Size << ")"
+                  << *Inst << '\n');
+
+            Value* DepWriteLength = DepIntrinsic->getLength();
+            Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(),
+                                                    InstWriteOffset -
+                                                    DepWriteOffset);
+            DepIntrinsic->setLength(TrimmedLength);
+            MadeChange = true;
+          }
+        }
+      }
+
+      // If this is a may-aliased store that is clobbering the store value, we
+      // can keep searching past it for another must-aliased pointer that stores
+      // to the same location.  For example, in:
+      //   store -> P
+      //   store -> Q
+      //   store -> P
+      // we can remove the first store to P even though we don't know if P and Q
+      // alias.
+      if (DepWrite == &BB.front()) break;
+
+      // Can't look past this instruction if it might read 'Loc'.
+      if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref)
+        break;
+
+      InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB);
+    }
+  }
+
+  // If this block ends in a return, unwind, or unreachable, all allocas are
+  // dead at its end, which means stores to them are also dead.
+  if (BB.getTerminator()->getNumSuccessors() == 0)
+    MadeChange |= handleEndBlock(BB);
+
+  return MadeChange;
+}
+
+/// Find all blocks that will unconditionally lead to the block BB and append
+/// them to F.
+static void FindUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
+                                   BasicBlock *BB, DominatorTree *DT) {
+  for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+    BasicBlock *Pred = *I;
+    if (Pred == BB) continue;
+    TerminatorInst *PredTI = Pred->getTerminator();
+    if (PredTI->getNumSuccessors() != 1)
+      continue;
+
+    if (DT->isReachableFromEntry(Pred))
+      Blocks.push_back(Pred);
+  }
+}
+
+/// HandleFree - Handle frees of entire structures whose dependency is a store
+/// to a field of that structure.
+bool DSE::HandleFree(CallInst *F) {
+  bool MadeChange = false;
+
+  AliasAnalysis::Location Loc = AliasAnalysis::Location(F->getOperand(0));
+  SmallVector<BasicBlock *, 16> Blocks;
+  Blocks.push_back(F->getParent());
+
+  while (!Blocks.empty()) {
+    BasicBlock *BB = Blocks.pop_back_val();
+    Instruction *InstPt = BB->getTerminator();
+    if (BB == F->getParent()) InstPt = F;
+
+    MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB);
+    while (Dep.isDef() || Dep.isClobber()) {
+      Instruction *Dependency = Dep.getInst();
+      if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency))
+        break;
+
+      Value *DepPointer =
+        GetUnderlyingObject(getStoredPointerOperand(Dependency));
+
+      // Check for aliasing.
+      if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
+        break;
+
+      Instruction *Next = llvm::next(BasicBlock::iterator(Dependency));
+
+      // DCE instructions only used to calculate that store
+      DeleteDeadInstruction(Dependency, *MD, TLI);
+      ++NumFastStores;
+      MadeChange = true;
+
+      // Inst's old Dependency is now deleted. Compute the next dependency,
+      // which may also be dead, as in
+      //    s[0] = 0;
+      //    s[1] = 0; // This has just been deleted.
+      //    free(s);
+      Dep = MD->getPointerDependencyFrom(Loc, false, Next, BB);
+    }
+
+    if (Dep.isNonLocal())
+      FindUnconditionalPreds(Blocks, BB, DT);
+  }
+
+  return MadeChange;
+}
+
+namespace {
+  struct CouldRef {
+    typedef Value *argument_type;
+    const CallSite CS;
+    AliasAnalysis *AA;
+
+    bool operator()(Value *I) {
+      // See if the call site touches the value.
+      AliasAnalysis::ModRefResult A =
+        AA->getModRefInfo(CS, I, getPointerSize(I, *AA));
+
+      return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref;
+    }
+  };
+}
+
+/// handleEndBlock - Remove dead stores to stack-allocated locations in the
+/// function end block.  Ex:
+/// %A = alloca i32
+/// ...
+/// store i32 1, i32* %A
+/// ret void
+bool DSE::handleEndBlock(BasicBlock &BB) {
+  bool MadeChange = false;
+
+  // Keep track of all of the stack objects that are dead at the end of the
+  // function.
+  SmallSetVector<Value*, 16> DeadStackObjects;
+
+  // Find all of the alloca'd pointers in the entry block.
+  BasicBlock *Entry = BB.getParent()->begin();
+  for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) {
+    if (isa<AllocaInst>(I))
+      DeadStackObjects.insert(I);
+
+    // Okay, so these are dead heap objects, but if the pointer never escapes
+    // then it's leaked by this function anyways.
+    else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true))
+      DeadStackObjects.insert(I);
+  }
+
+  // Treat byval arguments the same, stores to them are dead at the end of the
+  // function.
+  for (Function::arg_iterator AI = BB.getParent()->arg_begin(),
+       AE = BB.getParent()->arg_end(); AI != AE; ++AI)
+    if (AI->hasByValAttr())
+      DeadStackObjects.insert(AI);
+
+  // Scan the basic block backwards
+  for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+    --BBI;
+
+    // If we find a store, check to see if it points into a dead stack value.
+    if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) {
+      // See through pointer-to-pointer bitcasts
+      SmallVector<Value *, 4> Pointers;
+      GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers);
+
+      // Stores to stack values are valid candidates for removal.
+      bool AllDead = true;
+      for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+           E = Pointers.end(); I != E; ++I)
+        if (!DeadStackObjects.count(*I)) {
+          AllDead = false;
+          break;
+        }
+
+      if (AllDead) {
+        Instruction *Dead = BBI++;
+
+        DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n  DEAD: "
+                     << *Dead << "\n  Objects: ";
+              for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(),
+                   E = Pointers.end(); I != E; ++I) {
+                dbgs() << **I;
+                if (llvm::next(I) != E)
+                  dbgs() << ", ";
+              }
+              dbgs() << '\n');
+
+        // DCE instructions only used to calculate that store.
+        DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects);
+        ++NumFastStores;
+        MadeChange = true;
+        continue;
+      }
+    }
+
+    // Remove any dead non-memory-mutating instructions.
+    if (isInstructionTriviallyDead(BBI, TLI)) {
+      Instruction *Inst = BBI++;
+      DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects);
+      ++NumFastOther;
+      MadeChange = true;
+      continue;
+    }
+
+    if (isa<AllocaInst>(BBI)) {
+      // Remove allocas from the list of dead stack objects; there can't be
+      // any references before the definition.
+      DeadStackObjects.remove(BBI);
+      continue;
+    }
+
+    if (CallSite CS = cast<Value>(BBI)) {
+      // Remove allocation function calls from the list of dead stack objects; 
+      // there can't be any references before the definition.
+      if (isAllocLikeFn(BBI, TLI))
+        DeadStackObjects.remove(BBI);
+
+      // If this call does not access memory, it can't be loading any of our
+      // pointers.
+      if (AA->doesNotAccessMemory(CS))
+        continue;
+
+      // If the call might load from any of our allocas, then any store above
+      // the call is live.
+      CouldRef Pred = { CS, AA };
+      DeadStackObjects.remove_if(Pred);
+
+      // If all of the allocas were clobbered by the call then we're not going
+      // to find anything else to process.
+      if (DeadStackObjects.empty())
+        break;
+
+      continue;
+    }
+
+    AliasAnalysis::Location LoadedLoc;
+
+    // If we encounter a use of the pointer, it is no longer considered dead
+    if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+      if (!L->isUnordered()) // Be conservative with atomic/volatile load
+        break;
+      LoadedLoc = AA->getLocation(L);
+    } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
+      LoadedLoc = AA->getLocation(V);
+    } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
+      LoadedLoc = AA->getLocationForSource(MTI);
+    } else if (!BBI->mayReadFromMemory()) {
+      // Instruction doesn't read memory.  Note that stores that weren't removed
+      // above will hit this case.
+      continue;
+    } else {
+      // Unknown inst; assume it clobbers everything.
+      break;
+    }
+
+    // Remove any allocas from the DeadPointer set that are loaded, as this
+    // makes any stores above the access live.
+    RemoveAccessedObjects(LoadedLoc, DeadStackObjects);
+
+    // If all of the allocas were clobbered by the access then we're not going
+    // to find anything else to process.
+    if (DeadStackObjects.empty())
+      break;
+  }
+
+  return MadeChange;
+}
+
+namespace {
+  struct CouldAlias {
+    typedef Value *argument_type;
+    const AliasAnalysis::Location &LoadedLoc;
+    AliasAnalysis *AA;
+
+    bool operator()(Value *I) {
+      // See if the loaded location could alias the stack location.
+      AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA));
+      return !AA->isNoAlias(StackLoc, LoadedLoc);
+    }
+  };
+}
+
+/// RemoveAccessedObjects - Check to see if the specified location may alias any
+/// of the stack objects in the DeadStackObjects set.  If so, they become live
+/// because the location is being loaded.
+void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
+                                SmallSetVector<Value*, 16> &DeadStackObjects) {
+  const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr);
+
+  // A constant can't be in the dead pointer set.
+  if (isa<Constant>(UnderlyingPointer))
+    return;
+
+  // If the kill pointer can be easily reduced to an alloca, don't bother doing
+  // extraneous AA queries.
+  if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
+    DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer));
+    return;
+  }
+
+  // Remove objects that could alias LoadedLoc.
+  CouldAlias Pred = { LoadedLoc, AA };
+  DeadStackObjects.remove_if(Pred);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
new file mode 100644
index 000000000000..3c08634bfe22
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -0,0 +1,628 @@
+//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a simple dominator tree walk that eliminates trivially
+// redundant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "early-cse"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <deque>
+using namespace llvm;
+
+STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
+STATISTIC(NumCSE,      "Number of instructions CSE'd");
+STATISTIC(NumCSELoad,  "Number of load instructions CSE'd");
+STATISTIC(NumCSECall,  "Number of call instructions CSE'd");
+STATISTIC(NumDSE,      "Number of trivial dead stores removed");
+
+static unsigned getHash(const void *V) {
+  return DenseMapInfo<const void*>::getHashValue(V);
+}
+
+//===----------------------------------------------------------------------===//
+// SimpleValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// SimpleValue - Instances of this struct represent available values in the
+  /// scoped hash table.
+  struct SimpleValue {
+    Instruction *Inst;
+
+    SimpleValue(Instruction *I) : Inst(I) {
+      assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+    }
+
+    bool isSentinel() const {
+      return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
+             Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
+    }
+
+    static bool canHandle(Instruction *Inst) {
+      // This can only handle non-void readnone functions.
+      if (CallInst *CI = dyn_cast<CallInst>(Inst))
+        return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
+      return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
+             isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
+             isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
+             isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
+             isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
+    }
+  };
+}
+
+namespace llvm {
+// SimpleValue is POD.
+template<> struct isPodLike<SimpleValue> {
+  static const bool value = true;
+};
+
+template<> struct DenseMapInfo<SimpleValue> {
+  static inline SimpleValue getEmptyKey() {
+    return DenseMapInfo<Instruction*>::getEmptyKey();
+  }
+  static inline SimpleValue getTombstoneKey() {
+    return DenseMapInfo<Instruction*>::getTombstoneKey();
+  }
+  static unsigned getHashValue(SimpleValue Val);
+  static bool isEqual(SimpleValue LHS, SimpleValue RHS);
+};
+}
+
+unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
+  Instruction *Inst = Val.Inst;
+  // Hash in all of the operands as pointers.
+  if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) {
+    Value *LHS = BinOp->getOperand(0);
+    Value *RHS = BinOp->getOperand(1);
+    if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
+      std::swap(LHS, RHS);
+
+    if (isa<OverflowingBinaryOperator>(BinOp)) {
+      // Hash the overflow behavior
+      unsigned Overflow =
+        BinOp->hasNoSignedWrap()   * OverflowingBinaryOperator::NoSignedWrap |
+        BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap;
+      return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS);
+    }
+
+    return hash_combine(BinOp->getOpcode(), LHS, RHS);
+  }
+
+  if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
+    Value *LHS = CI->getOperand(0);
+    Value *RHS = CI->getOperand(1);
+    CmpInst::Predicate Pred = CI->getPredicate();
+    if (Inst->getOperand(0) > Inst->getOperand(1)) {
+      std::swap(LHS, RHS);
+      Pred = CI->getSwappedPredicate();
+    }
+    return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
+  }
+
+  if (CastInst *CI = dyn_cast<CastInst>(Inst))
+    return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
+
+  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
+    return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
+                        hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
+
+  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
+    return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
+                        IVI->getOperand(1),
+                        hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
+
+  assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) ||
+          isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
+          isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+          isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction");
+
+  // Mix in the opcode.
+  return hash_combine(Inst->getOpcode(),
+                      hash_combine_range(Inst->value_op_begin(),
+                                         Inst->value_op_end()));
+}
+
+bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
+  Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+
+  if (LHS.isSentinel() || RHS.isSentinel())
+    return LHSI == RHSI;
+
+  if (LHSI->getOpcode() != RHSI->getOpcode()) return false;
+  if (LHSI->isIdenticalTo(RHSI)) return true;
+
+  // If we're not strictly identical, we still might be a commutable instruction
+  if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
+    if (!LHSBinOp->isCommutative())
+      return false;
+
+    assert(isa<BinaryOperator>(RHSI)
+           && "same opcode, but different instruction type?");
+    BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
+
+    // Check overflow attributes
+    if (isa<OverflowingBinaryOperator>(LHSBinOp)) {
+      assert(isa<OverflowingBinaryOperator>(RHSBinOp)
+             && "same opcode, but different operator type?");
+      if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() ||
+          LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap())
+        return false;
+    }
+
+    // Commuted equality
+    return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
+      LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+  }
+  if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
+    assert(isa<CmpInst>(RHSI)
+           && "same opcode, but different instruction type?");
+    CmpInst *RHSCmp = cast<CmpInst>(RHSI);
+    // Commuted equality
+    return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
+      LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+      LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+  }
+
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// CallValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// CallValue - Instances of this struct represent available call values in
+  /// the scoped hash table.
+  struct CallValue {
+    Instruction *Inst;
+
+    CallValue(Instruction *I) : Inst(I) {
+      assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+    }
+
+    bool isSentinel() const {
+      return Inst == DenseMapInfo<Instruction*>::getEmptyKey() ||
+             Inst == DenseMapInfo<Instruction*>::getTombstoneKey();
+    }
+
+    static bool canHandle(Instruction *Inst) {
+      // Don't value number anything that returns void.
+      if (Inst->getType()->isVoidTy())
+        return false;
+
+      CallInst *CI = dyn_cast<CallInst>(Inst);
+      if (CI == 0 || !CI->onlyReadsMemory())
+        return false;
+      return true;
+    }
+  };
+}
+
+namespace llvm {
+  // CallValue is POD.
+  template<> struct isPodLike<CallValue> {
+    static const bool value = true;
+  };
+
+  template<> struct DenseMapInfo<CallValue> {
+    static inline CallValue getEmptyKey() {
+      return DenseMapInfo<Instruction*>::getEmptyKey();
+    }
+    static inline CallValue getTombstoneKey() {
+      return DenseMapInfo<Instruction*>::getTombstoneKey();
+    }
+    static unsigned getHashValue(CallValue Val);
+    static bool isEqual(CallValue LHS, CallValue RHS);
+  };
+}
+unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
+  Instruction *Inst = Val.Inst;
+  // Hash in all of the operands as pointers.
+  unsigned Res = 0;
+  for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) {
+    assert(!Inst->getOperand(i)->getType()->isMetadataTy() &&
+           "Cannot value number calls with metadata operands");
+    Res ^= getHash(Inst->getOperand(i)) << (i & 0xF);
+  }
+
+  // Mix in the opcode.
+  return (Res << 1) ^ Inst->getOpcode();
+}
+
+bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
+  Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+  if (LHS.isSentinel() || RHS.isSentinel())
+    return LHSI == RHSI;
+  return LHSI->isIdenticalTo(RHSI);
+}
+
+
+//===----------------------------------------------------------------------===//
+// EarlyCSE pass.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// EarlyCSE - This pass does a simple depth-first walk over the dominator
+/// tree, eliminating trivially redundant instructions and using instsimplify
+/// to canonicalize things as it goes.  It is intended to be fast and catch
+/// obvious cases so that instcombine and other passes are more effective.  It
+/// is expected that a later pass of GVN will catch the interesting/hard
+/// cases.
+class EarlyCSE : public FunctionPass {
+public:
+  const DataLayout *TD;
+  const TargetLibraryInfo *TLI;
+  DominatorTree *DT;
+  typedef RecyclingAllocator<BumpPtrAllocator,
+                      ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy;
+  typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>,
+                          AllocatorTy> ScopedHTType;
+
+  /// AvailableValues - This scoped hash table contains the current values of
+  /// all of our simple scalar expressions.  As we walk down the domtree, we
+  /// look to see if instructions are in this: if so, we replace them with what
+  /// we find, otherwise we insert them so that dominated values can succeed in
+  /// their lookup.
+  ScopedHTType *AvailableValues;
+
+  /// AvailableLoads - This scoped hash table contains the current values
+  /// of loads.  This allows us to get efficient access to dominating loads when
+  /// we have a fully redundant load.  In addition to the most recent load, we
+  /// keep track of a generation count of the read, which is compared against
+  /// the current generation count.  The current generation count is
+  /// incremented after every possibly writing memory operation, which ensures
+  /// that we only CSE loads with other loads that have no intervening store.
+  typedef RecyclingAllocator<BumpPtrAllocator,
+    ScopedHashTableVal<Value*, std::pair<Value*, unsigned> > > LoadMapAllocator;
+  typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>,
+                          DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType;
+  LoadHTType *AvailableLoads;
+
+  /// AvailableCalls - This scoped hash table contains the current values
+  /// of read-only call values.  It uses the same generation count as loads.
+  typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType;
+  CallHTType *AvailableCalls;
+
+  /// CurrentGeneration - This is the current generation of the memory value.
+  unsigned CurrentGeneration;
+
+  static char ID;
+  explicit EarlyCSE() : FunctionPass(ID) {
+    initializeEarlyCSEPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F);
+
+private:
+
+  // NodeScope - almost a POD, but needs to call the constructors for the
+  // scoped hash tables so that a new scope gets pushed on. These are RAII so
+  // that the scope gets popped when the NodeScope is destroyed.
+  class NodeScope {
+   public:
+    NodeScope(ScopedHTType *availableValues,
+              LoadHTType *availableLoads,
+              CallHTType *availableCalls) :
+        Scope(*availableValues),
+        LoadScope(*availableLoads),
+        CallScope(*availableCalls) {}
+
+   private:
+    NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION;
+    void operator=(const NodeScope&) LLVM_DELETED_FUNCTION;
+
+    ScopedHTType::ScopeTy Scope;
+    LoadHTType::ScopeTy LoadScope;
+    CallHTType::ScopeTy CallScope;
+  };
+
+  // StackNode - contains all the needed information to create a stack for
+  // doing a depth first tranversal of the tree. This includes scopes for
+  // values, loads, and calls as well as the generation. There is a child
+  // iterator so that the children do not need to be store spearately.
+  class StackNode {
+   public:
+    StackNode(ScopedHTType *availableValues,
+              LoadHTType *availableLoads,
+              CallHTType *availableCalls,
+              unsigned cg, DomTreeNode *n,
+              DomTreeNode::iterator child, DomTreeNode::iterator end) :
+        CurrentGeneration(cg), ChildGeneration(cg), Node(n),
+        ChildIter(child), EndIter(end),
+        Scopes(availableValues, availableLoads, availableCalls),
+        Processed(false) {}
+
+    // Accessors.
+    unsigned currentGeneration() { return CurrentGeneration; }
+    unsigned childGeneration() { return ChildGeneration; }
+    void childGeneration(unsigned generation) { ChildGeneration = generation; }
+    DomTreeNode *node() { return Node; }
+    DomTreeNode::iterator childIter() { return ChildIter; }
+    DomTreeNode *nextChild() {
+      DomTreeNode *child = *ChildIter;
+      ++ChildIter;
+      return child;
+    }
+    DomTreeNode::iterator end() { return EndIter; }
+    bool isProcessed() { return Processed; }
+    void process() { Processed = true; }
+
+   private:
+    StackNode(const StackNode&) LLVM_DELETED_FUNCTION;
+    void operator=(const StackNode&) LLVM_DELETED_FUNCTION;
+
+    // Members.
+    unsigned CurrentGeneration;
+    unsigned ChildGeneration;
+    DomTreeNode *Node;
+    DomTreeNode::iterator ChildIter;
+    DomTreeNode::iterator EndIter;
+    NodeScope Scopes;
+    bool Processed;
+  };
+
+  bool processNode(DomTreeNode *Node);
+
+  // This transformation requires dominator postdominator info
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<DominatorTree>();
+    AU.addRequired<TargetLibraryInfo>();
+    AU.setPreservesCFG();
+  }
+};
+}
+
+char EarlyCSE::ID = 0;
+
+// createEarlyCSEPass - The public interface to this file.
+FunctionPass *llvm::createEarlyCSEPass() {
+  return new EarlyCSE();
+}
+
+INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false)
+
+bool EarlyCSE::processNode(DomTreeNode *Node) {
+  BasicBlock *BB = Node->getBlock();
+
+  // If this block has a single predecessor, then the predecessor is the parent
+  // of the domtree node and all of the live out memory values are still current
+  // in this block.  If this block has multiple predecessors, then they could
+  // have invalidated the live-out memory values of our parent value.  For now,
+  // just be conservative and invalidate memory if this block has multiple
+  // predecessors.
+  if (BB->getSinglePredecessor() == 0)
+    ++CurrentGeneration;
+
+  /// LastStore - Keep track of the last non-volatile store that we saw... for
+  /// as long as there in no instruction that reads memory.  If we see a store
+  /// to the same location, we delete the dead store.  This zaps trivial dead
+  /// stores which can occur in bitfield code among other things.
+  StoreInst *LastStore = 0;
+
+  bool Changed = false;
+
+  // See if any instructions in the block can be eliminated.  If so, do it.  If
+  // not, add them to AvailableValues.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+    Instruction *Inst = I++;
+
+    // Dead instructions should just be removed.
+    if (isInstructionTriviallyDead(Inst, TLI)) {
+      DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+      Inst->eraseFromParent();
+      Changed = true;
+      ++NumSimplify;
+      continue;
+    }
+
+    // If the instruction can be simplified (e.g. X+0 = X) then replace it with
+    // its simpler value.
+    if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) {
+      DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << "  to: " << *V << '\n');
+      Inst->replaceAllUsesWith(V);
+      Inst->eraseFromParent();
+      Changed = true;
+      ++NumSimplify;
+      continue;
+    }
+
+    // If this is a simple instruction that we can value number, process it.
+    if (SimpleValue::canHandle(Inst)) {
+      // See if the instruction has an available value.  If so, use it.
+      if (Value *V = AvailableValues->lookup(Inst)) {
+        DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << "  to: " << *V << '\n');
+        Inst->replaceAllUsesWith(V);
+        Inst->eraseFromParent();
+        Changed = true;
+        ++NumCSE;
+        continue;
+      }
+
+      // Otherwise, just remember that this value is available.
+      AvailableValues->insert(Inst, Inst);
+      continue;
+    }
+
+    // If this is a non-volatile load, process it.
+    if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+      // Ignore volatile loads.
+      if (!LI->isSimple()) {
+        LastStore = 0;
+        continue;
+      }
+
+      // If we have an available version of this load, and if it is the right
+      // generation, replace this instruction.
+      std::pair<Value*, unsigned> InVal =
+        AvailableLoads->lookup(Inst->getOperand(0));
+      if (InVal.first != 0 && InVal.second == CurrentGeneration) {
+        DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << "  to: "
+              << *InVal.first << '\n');
+        if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first);
+        Inst->eraseFromParent();
+        Changed = true;
+        ++NumCSELoad;
+        continue;
+      }
+
+      // Otherwise, remember that we have this instruction.
+      AvailableLoads->insert(Inst->getOperand(0),
+                          std::pair<Value*, unsigned>(Inst, CurrentGeneration));
+      LastStore = 0;
+      continue;
+    }
+
+    // If this instruction may read from memory, forget LastStore.
+    if (Inst->mayReadFromMemory())
+      LastStore = 0;
+
+    // If this is a read-only call, process it.
+    if (CallValue::canHandle(Inst)) {
+      // If we have an available version of this call, and if it is the right
+      // generation, replace this instruction.
+      std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst);
+      if (InVal.first != 0 && InVal.second == CurrentGeneration) {
+        DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << "  to: "
+                     << *InVal.first << '\n');
+        if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first);
+        Inst->eraseFromParent();
+        Changed = true;
+        ++NumCSECall;
+        continue;
+      }
+
+      // Otherwise, remember that we have this instruction.
+      AvailableCalls->insert(Inst,
+                         std::pair<Value*, unsigned>(Inst, CurrentGeneration));
+      continue;
+    }
+
+    // Okay, this isn't something we can CSE at all.  Check to see if it is
+    // something that could modify memory.  If so, our available memory values
+    // cannot be used so bump the generation count.
+    if (Inst->mayWriteToMemory()) {
+      ++CurrentGeneration;
+
+      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        // We do a trivial form of DSE if there are two stores to the same
+        // location with no intervening loads.  Delete the earlier store.
+        if (LastStore &&
+            LastStore->getPointerOperand() == SI->getPointerOperand()) {
+          DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << "  due to: "
+                       << *Inst << '\n');
+          LastStore->eraseFromParent();
+          Changed = true;
+          ++NumDSE;
+          LastStore = 0;
+          continue;
+        }
+
+        // Okay, we just invalidated anything we knew about loaded values.  Try
+        // to salvage *something* by remembering that the stored value is a live
+        // version of the pointer.  It is safe to forward from volatile stores
+        // to non-volatile loads, so we don't have to check for volatility of
+        // the store.
+        AvailableLoads->insert(SI->getPointerOperand(),
+         std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration));
+
+        // Remember that this was the last store we saw for DSE.
+        if (SI->isSimple())
+          LastStore = SI;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+
+bool EarlyCSE::runOnFunction(Function &F) {
+  std::deque<StackNode *> nodesToProcess;
+
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+  DT = &getAnalysis<DominatorTree>();
+
+  // Tables that the pass uses when walking the domtree.
+  ScopedHTType AVTable;
+  AvailableValues = &AVTable;
+  LoadHTType LoadTable;
+  AvailableLoads = &LoadTable;
+  CallHTType CallTable;
+  AvailableCalls = &CallTable;
+
+  CurrentGeneration = 0;
+  bool Changed = false;
+
+  // Process the root node.
+  nodesToProcess.push_front(
+      new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
+                    CurrentGeneration, DT->getRootNode(),
+                    DT->getRootNode()->begin(),
+                    DT->getRootNode()->end()));
+
+  // Save the current generation.
+  unsigned LiveOutGeneration = CurrentGeneration;
+
+  // Process the stack.
+  while (!nodesToProcess.empty()) {
+    // Grab the first item off the stack. Set the current generation, remove
+    // the node from the stack, and process it.
+    StackNode *NodeToProcess = nodesToProcess.front();
+
+    // Initialize class members.
+    CurrentGeneration = NodeToProcess->currentGeneration();
+
+    // Check if the node needs to be processed.
+    if (!NodeToProcess->isProcessed()) {
+      // Process the node.
+      Changed |= processNode(NodeToProcess->node());
+      NodeToProcess->childGeneration(CurrentGeneration);
+      NodeToProcess->process();
+    } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+      // Push the next child onto the stack.
+      DomTreeNode *child = NodeToProcess->nextChild();
+      nodesToProcess.push_front(
+          new StackNode(AvailableValues,
+                        AvailableLoads,
+                        AvailableCalls,
+                        NodeToProcess->childGeneration(), child,
+                        child->begin(), child->end()));
+    } else {
+      // It has been processed, and there are no more children to process,
+      // so delete it and pop it off the stack.
+      delete NodeToProcess;
+      nodesToProcess.pop_front();
+    }
+  } // while (!nodes...)
+
+  // Reset the current generation.
+  CurrentGeneration = LiveOutGeneration;
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
new file mode 100644
index 000000000000..129af8d45d6f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -0,0 +1,2602 @@
+//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global value numbering to eliminate fully redundant
+// instructions.  It also performs simple dead load elimination.
+//
+// Note that this pass does the value numbering itself; it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gvn"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+STATISTIC(NumGVNInstr,  "Number of instructions deleted");
+STATISTIC(NumGVNLoad,   "Number of loads deleted");
+STATISTIC(NumGVNPRE,    "Number of instructions PRE'd");
+STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumGVNSimpl,  "Number of instructions simplified");
+STATISTIC(NumGVNEqProp, "Number of equalities propagated");
+STATISTIC(NumPRELoad,   "Number of loads PRE'd");
+
+static cl::opt<bool> EnablePRE("enable-pre",
+                               cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
+
+// Maximum allowed recursion depth.
+static cl::opt<uint32_t>
+MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore,
+                cl::desc("Max recurse depth (default = 1000)"));
+
+//===----------------------------------------------------------------------===//
+//                         ValueTable Class
+//===----------------------------------------------------------------------===//
+
+/// This class holds the mapping between values and value numbers.  It is used
+/// as an efficient mechanism to determine the expression-wise equivalence of
+/// two values.
+namespace {
+  struct Expression {
+    uint32_t opcode;
+    Type *type;
+    SmallVector<uint32_t, 4> varargs;
+
+    Expression(uint32_t o = ~2U) : opcode(o) { }
+
+    bool operator==(const Expression &other) const {
+      if (opcode != other.opcode)
+        return false;
+      if (opcode == ~0U || opcode == ~1U)
+        return true;
+      if (type != other.type)
+        return false;
+      if (varargs != other.varargs)
+        return false;
+      return true;
+    }
+
+    friend hash_code hash_value(const Expression &Value) {
+      return hash_combine(Value.opcode, Value.type,
+                          hash_combine_range(Value.varargs.begin(),
+                                             Value.varargs.end()));
+    }
+  };
+
+  class ValueTable {
+    DenseMap<Value*, uint32_t> valueNumbering;
+    DenseMap<Expression, uint32_t> expressionNumbering;
+    AliasAnalysis *AA;
+    MemoryDependenceAnalysis *MD;
+    DominatorTree *DT;
+
+    uint32_t nextValueNumber;
+
+    Expression create_expression(Instruction* I);
+    Expression create_cmp_expression(unsigned Opcode,
+                                     CmpInst::Predicate Predicate,
+                                     Value *LHS, Value *RHS);
+    Expression create_extractvalue_expression(ExtractValueInst* EI);
+    uint32_t lookup_or_add_call(CallInst* C);
+  public:
+    ValueTable() : nextValueNumber(1) { }
+    uint32_t lookup_or_add(Value *V);
+    uint32_t lookup(Value *V) const;
+    uint32_t lookup_or_add_cmp(unsigned Opcode, CmpInst::Predicate Pred,
+                               Value *LHS, Value *RHS);
+    void add(Value *V, uint32_t num);
+    void clear();
+    void erase(Value *v);
+    void setAliasAnalysis(AliasAnalysis* A) { AA = A; }
+    AliasAnalysis *getAliasAnalysis() const { return AA; }
+    void setMemDep(MemoryDependenceAnalysis* M) { MD = M; }
+    void setDomTree(DominatorTree* D) { DT = D; }
+    uint32_t getNextUnusedValueNumber() { return nextValueNumber; }
+    void verifyRemoved(const Value *) const;
+  };
+}
+
+namespace llvm {
+template <> struct DenseMapInfo<Expression> {
+  static inline Expression getEmptyKey() {
+    return ~0U;
+  }
+
+  static inline Expression getTombstoneKey() {
+    return ~1U;
+  }
+
+  static unsigned getHashValue(const Expression e) {
+    using llvm::hash_value;
+    return static_cast<unsigned>(hash_value(e));
+  }
+  static bool isEqual(const Expression &LHS, const Expression &RHS) {
+    return LHS == RHS;
+  }
+};
+
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+
+Expression ValueTable::create_expression(Instruction *I) {
+  Expression e;
+  e.type = I->getType();
+  e.opcode = I->getOpcode();
+  for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
+       OI != OE; ++OI)
+    e.varargs.push_back(lookup_or_add(*OI));
+  if (I->isCommutative()) {
+    // Ensure that commutative instructions that only differ by a permutation
+    // of their operands get the same value number by sorting the operand value
+    // numbers.  Since all commutative instructions have two operands it is more
+    // efficient to sort by hand rather than using, say, std::sort.
+    assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+    if (e.varargs[0] > e.varargs[1])
+      std::swap(e.varargs[0], e.varargs[1]);
+  }
+
+  if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+    // Sort the operand value numbers so x<y and y>x get the same value number.
+    CmpInst::Predicate Predicate = C->getPredicate();
+    if (e.varargs[0] > e.varargs[1]) {
+      std::swap(e.varargs[0], e.varargs[1]);
+      Predicate = CmpInst::getSwappedPredicate(Predicate);
+    }
+    e.opcode = (C->getOpcode() << 8) | Predicate;
+  } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) {
+    for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end();
+         II != IE; ++II)
+      e.varargs.push_back(*II);
+  }
+
+  return e;
+}
+
+Expression ValueTable::create_cmp_expression(unsigned Opcode,
+                                             CmpInst::Predicate Predicate,
+                                             Value *LHS, Value *RHS) {
+  assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+         "Not a comparison!");
+  Expression e;
+  e.type = CmpInst::makeCmpResultType(LHS->getType());
+  e.varargs.push_back(lookup_or_add(LHS));
+  e.varargs.push_back(lookup_or_add(RHS));
+
+  // Sort the operand value numbers so x<y and y>x get the same value number.
+  if (e.varargs[0] > e.varargs[1]) {
+    std::swap(e.varargs[0], e.varargs[1]);
+    Predicate = CmpInst::getSwappedPredicate(Predicate);
+  }
+  e.opcode = (Opcode << 8) | Predicate;
+  return e;
+}
+
+Expression ValueTable::create_extractvalue_expression(ExtractValueInst *EI) {
+  assert(EI != 0 && "Not an ExtractValueInst?");
+  Expression e;
+  e.type = EI->getType();
+  e.opcode = 0;
+
+  IntrinsicInst *I = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
+  if (I != 0 && EI->getNumIndices() == 1 && *EI->idx_begin() == 0 ) {
+    // EI might be an extract from one of our recognised intrinsics. If it
+    // is we'll synthesize a semantically equivalent expression instead on
+    // an extract value expression.
+    switch (I->getIntrinsicID()) {
+      case Intrinsic::sadd_with_overflow:
+      case Intrinsic::uadd_with_overflow:
+        e.opcode = Instruction::Add;
+        break;
+      case Intrinsic::ssub_with_overflow:
+      case Intrinsic::usub_with_overflow:
+        e.opcode = Instruction::Sub;
+        break;
+      case Intrinsic::smul_with_overflow:
+      case Intrinsic::umul_with_overflow:
+        e.opcode = Instruction::Mul;
+        break;
+      default:
+        break;
+    }
+
+    if (e.opcode != 0) {
+      // Intrinsic recognized. Grab its args to finish building the expression.
+      assert(I->getNumArgOperands() == 2 &&
+             "Expect two args for recognised intrinsics.");
+      e.varargs.push_back(lookup_or_add(I->getArgOperand(0)));
+      e.varargs.push_back(lookup_or_add(I->getArgOperand(1)));
+      return e;
+    }
+  }
+
+  // Not a recognised intrinsic. Fall back to producing an extract value
+  // expression.
+  e.opcode = EI->getOpcode();
+  for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
+       OI != OE; ++OI)
+    e.varargs.push_back(lookup_or_add(*OI));
+
+  for (ExtractValueInst::idx_iterator II = EI->idx_begin(), IE = EI->idx_end();
+         II != IE; ++II)
+    e.varargs.push_back(*II);
+
+  return e;
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+/// add - Insert a value into the table with a specified value number.
+void ValueTable::add(Value *V, uint32_t num) {
+  valueNumbering.insert(std::make_pair(V, num));
+}
+
+uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
+  if (AA->doesNotAccessMemory(C)) {
+    Expression exp = create_expression(C);
+    uint32_t &e = expressionNumbering[exp];
+    if (!e) e = nextValueNumber++;
+    valueNumbering[C] = e;
+    return e;
+  } else if (AA->onlyReadsMemory(C)) {
+    Expression exp = create_expression(C);
+    uint32_t &e = expressionNumbering[exp];
+    if (!e) {
+      e = nextValueNumber++;
+      valueNumbering[C] = e;
+      return e;
+    }
+    if (!MD) {
+      e = nextValueNumber++;
+      valueNumbering[C] = e;
+      return e;
+    }
+
+    MemDepResult local_dep = MD->getDependency(C);
+
+    if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+      valueNumbering[C] =  nextValueNumber;
+      return nextValueNumber++;
+    }
+
+    if (local_dep.isDef()) {
+      CallInst* local_cdep = cast<CallInst>(local_dep.getInst());
+
+      if (local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
+        valueNumbering[C] = nextValueNumber;
+        return nextValueNumber++;
+      }
+
+      for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+        uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
+        uint32_t cd_vn = lookup_or_add(local_cdep->getArgOperand(i));
+        if (c_vn != cd_vn) {
+          valueNumbering[C] = nextValueNumber;
+          return nextValueNumber++;
+        }
+      }
+
+      uint32_t v = lookup_or_add(local_cdep);
+      valueNumbering[C] = v;
+      return v;
+    }
+
+    // Non-local case.
+    const MemoryDependenceAnalysis::NonLocalDepInfo &deps =
+      MD->getNonLocalCallDependency(CallSite(C));
+    // FIXME: Move the checking logic to MemDep!
+    CallInst* cdep = 0;
+
+    // Check to see if we have a single dominating call instruction that is
+    // identical to C.
+    for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+      const NonLocalDepEntry *I = &deps[i];
+      if (I->getResult().isNonLocal())
+        continue;
+
+      // We don't handle non-definitions.  If we already have a call, reject
+      // instruction dependencies.
+      if (!I->getResult().isDef() || cdep != 0) {
+        cdep = 0;
+        break;
+      }
+
+      CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
+      // FIXME: All duplicated with non-local case.
+      if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
+        cdep = NonLocalDepCall;
+        continue;
+      }
+
+      cdep = 0;
+      break;
+    }
+
+    if (!cdep) {
+      valueNumbering[C] = nextValueNumber;
+      return nextValueNumber++;
+    }
+
+    if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
+      valueNumbering[C] = nextValueNumber;
+      return nextValueNumber++;
+    }
+    for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+      uint32_t c_vn = lookup_or_add(C->getArgOperand(i));
+      uint32_t cd_vn = lookup_or_add(cdep->getArgOperand(i));
+      if (c_vn != cd_vn) {
+        valueNumbering[C] = nextValueNumber;
+        return nextValueNumber++;
+      }
+    }
+
+    uint32_t v = lookup_or_add(cdep);
+    valueNumbering[C] = v;
+    return v;
+
+  } else {
+    valueNumbering[C] = nextValueNumber;
+    return nextValueNumber++;
+  }
+}
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t ValueTable::lookup_or_add(Value *V) {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI != valueNumbering.end())
+    return VI->second;
+
+  if (!isa<Instruction>(V)) {
+    valueNumbering[V] = nextValueNumber;
+    return nextValueNumber++;
+  }
+
+  Instruction* I = cast<Instruction>(V);
+  Expression exp;
+  switch (I->getOpcode()) {
+    case Instruction::Call:
+      return lookup_or_add_call(cast<CallInst>(I));
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+    case Instruction::Select:
+    case Instruction::ExtractElement:
+    case Instruction::InsertElement:
+    case Instruction::ShuffleVector:
+    case Instruction::InsertValue:
+    case Instruction::GetElementPtr:
+      exp = create_expression(I);
+      break;
+    case Instruction::ExtractValue:
+      exp = create_extractvalue_expression(cast<ExtractValueInst>(I));
+      break;
+    default:
+      valueNumbering[V] = nextValueNumber;
+      return nextValueNumber++;
+  }
+
+  uint32_t& e = expressionNumbering[exp];
+  if (!e) e = nextValueNumber++;
+  valueNumbering[V] = e;
+  return e;
+}
+
+/// lookup - Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t ValueTable::lookup(Value *V) const {
+  DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
+  assert(VI != valueNumbering.end() && "Value not numbered?");
+  return VI->second;
+}
+
+/// lookup_or_add_cmp - Returns the value number of the given comparison,
+/// assigning it a new number if it did not have one before.  Useful when
+/// we deduced the result of a comparison, but don't immediately have an
+/// instruction realizing that comparison to hand.
+uint32_t ValueTable::lookup_or_add_cmp(unsigned Opcode,
+                                       CmpInst::Predicate Predicate,
+                                       Value *LHS, Value *RHS) {
+  Expression exp = create_cmp_expression(Opcode, Predicate, LHS, RHS);
+  uint32_t& e = expressionNumbering[exp];
+  if (!e) e = nextValueNumber++;
+  return e;
+}
+
+/// clear - Remove all entries from the ValueTable.
+void ValueTable::clear() {
+  valueNumbering.clear();
+  expressionNumbering.clear();
+  nextValueNumber = 1;
+}
+
+/// erase - Remove a value from the value numbering.
+void ValueTable::erase(Value *V) {
+  valueNumbering.erase(V);
+}
+
+/// verifyRemoved - Verify that the value is removed from all internal data
+/// structures.
+void ValueTable::verifyRemoved(const Value *V) const {
+  for (DenseMap<Value*, uint32_t>::const_iterator
+         I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
+    assert(I->first != V && "Inst still occurs in value numbering map!");
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                                GVN Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+  class GVN : public FunctionPass {
+    bool NoLoads;
+    MemoryDependenceAnalysis *MD;
+    DominatorTree *DT;
+    const DataLayout *TD;
+    const TargetLibraryInfo *TLI;
+
+    ValueTable VN;
+
+    /// LeaderTable - A mapping from value numbers to lists of Value*'s that
+    /// have that value number.  Use findLeader to query it.
+    struct LeaderTableEntry {
+      Value *Val;
+      const BasicBlock *BB;
+      LeaderTableEntry *Next;
+    };
+    DenseMap<uint32_t, LeaderTableEntry> LeaderTable;
+    BumpPtrAllocator TableAllocator;
+
+    SmallVector<Instruction*, 8> InstrsToErase;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit GVN(bool noloads = false)
+        : FunctionPass(ID), NoLoads(noloads), MD(0) {
+      initializeGVNPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F);
+
+    /// markInstructionForDeletion - This removes the specified instruction from
+    /// our various maps and marks it for deletion.
+    void markInstructionForDeletion(Instruction *I) {
+      VN.erase(I);
+      InstrsToErase.push_back(I);
+    }
+
+    const DataLayout *getDataLayout() const { return TD; }
+    DominatorTree &getDominatorTree() const { return *DT; }
+    AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
+    MemoryDependenceAnalysis &getMemDep() const { return *MD; }
+  private:
+    /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for
+    /// its value number.
+    void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) {
+      LeaderTableEntry &Curr = LeaderTable[N];
+      if (!Curr.Val) {
+        Curr.Val = V;
+        Curr.BB = BB;
+        return;
+      }
+
+      LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
+      Node->Val = V;
+      Node->BB = BB;
+      Node->Next = Curr.Next;
+      Curr.Next = Node;
+    }
+
+    /// removeFromLeaderTable - Scan the list of values corresponding to a given
+    /// value number, and remove the given instruction if encountered.
+    void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) {
+      LeaderTableEntry* Prev = 0;
+      LeaderTableEntry* Curr = &LeaderTable[N];
+
+      while (Curr->Val != I || Curr->BB != BB) {
+        Prev = Curr;
+        Curr = Curr->Next;
+      }
+
+      if (Prev) {
+        Prev->Next = Curr->Next;
+      } else {
+        if (!Curr->Next) {
+          Curr->Val = 0;
+          Curr->BB = 0;
+        } else {
+          LeaderTableEntry* Next = Curr->Next;
+          Curr->Val = Next->Val;
+          Curr->BB = Next->BB;
+          Curr->Next = Next->Next;
+        }
+      }
+    }
+
+    // List of critical edges to be split between iterations.
+    SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit;
+
+    // This transformation requires dominator postdominator info
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<TargetLibraryInfo>();
+      if (!NoLoads)
+        AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addRequired<AliasAnalysis>();
+
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<AliasAnalysis>();
+    }
+
+
+    // Helper fuctions
+    // FIXME: eliminate or document these better
+    bool processLoad(LoadInst *L);
+    bool processInstruction(Instruction *I);
+    bool processNonLocalLoad(LoadInst *L);
+    bool processBlock(BasicBlock *BB);
+    void dump(DenseMap<uint32_t, Value*> &d);
+    bool iterateOnFunction(Function &F);
+    bool performPRE(Function &F);
+    Value *findLeader(const BasicBlock *BB, uint32_t num);
+    void cleanupGlobalSets();
+    void verifyRemoved(const Instruction *I) const;
+    bool splitCriticalEdges();
+    unsigned replaceAllDominatedUsesWith(Value *From, Value *To,
+                                         const BasicBlockEdge &Root);
+    bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root);
+  };
+
+  char GVN::ID = 0;
+}
+
+// createGVNPass - The public interface to this file...
+FunctionPass *llvm::createGVNPass(bool NoLoads) {
+  return new GVN(NoLoads);
+}
+
+INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+  errs() << "{\n";
+  for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
+       E = d.end(); I != E; ++I) {
+      errs() << I->first << "\n";
+      I->second->dump();
+  }
+  errs() << "}\n";
+}
+#endif
+
+/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block.  As we go, keep
+/// track of which blocks we know are fully alive in FullyAvailableBlocks.  This
+/// map is actually a tri-state map with the following values:
+///   0) we know the block *is not* fully available.
+///   1) we know the block *is* fully available.
+///   2) we do not know whether the block is fully available or not, but we are
+///      currently speculating that it will be.
+///   3) we are speculating for this block and have used that to speculate for
+///      other blocks.
+static bool IsValueFullyAvailableInBlock(BasicBlock *BB,
+                            DenseMap<BasicBlock*, char> &FullyAvailableBlocks,
+                            uint32_t RecurseDepth) {
+  if (RecurseDepth > MaxRecurseDepth)
+    return false;
+
+  // Optimistically assume that the block is fully available and check to see
+  // if we already know about this block in one lookup.
+  std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV =
+    FullyAvailableBlocks.insert(std::make_pair(BB, 2));
+
+  // If the entry already existed for this block, return the precomputed value.
+  if (!IV.second) {
+    // If this is a speculative "available" value, mark it as being used for
+    // speculation of other blocks.
+    if (IV.first->second == 2)
+      IV.first->second = 3;
+    return IV.first->second != 0;
+  }
+
+  // Otherwise, see if it is fully available in all predecessors.
+  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+
+  // If this block has no predecessors, it isn't live-in here.
+  if (PI == PE)
+    goto SpeculationFailure;
+
+  for (; PI != PE; ++PI)
+    // If the value isn't fully available in one of our predecessors, then it
+    // isn't fully available in this block either.  Undo our previous
+    // optimistic assumption and bail out.
+    if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1))
+      goto SpeculationFailure;
+
+  return true;
+
+// SpeculationFailure - If we get here, we found out that this is not, after
+// all, a fully-available block.  We have a problem if we speculated on this and
+// used the speculation to mark other blocks as available.
+SpeculationFailure:
+  char &BBVal = FullyAvailableBlocks[BB];
+
+  // If we didn't speculate on this, just return with it set to false.
+  if (BBVal == 2) {
+    BBVal = 0;
+    return false;
+  }
+
+  // If we did speculate on this value, we could have blocks set to 1 that are
+  // incorrect.  Walk the (transitive) successors of this block and mark them as
+  // 0 if set to one.
+  SmallVector<BasicBlock*, 32> BBWorklist;
+  BBWorklist.push_back(BB);
+
+  do {
+    BasicBlock *Entry = BBWorklist.pop_back_val();
+    // Note that this sets blocks to 0 (unavailable) if they happen to not
+    // already be in FullyAvailableBlocks.  This is safe.
+    char &EntryVal = FullyAvailableBlocks[Entry];
+    if (EntryVal == 0) continue;  // Already unavailable.
+
+    // Mark as unavailable.
+    EntryVal = 0;
+
+    for (succ_iterator I = succ_begin(Entry), E = succ_end(Entry); I != E; ++I)
+      BBWorklist.push_back(*I);
+  } while (!BBWorklist.empty());
+
+  return false;
+}
+
+
+/// CanCoerceMustAliasedValueToLoad - Return true if
+/// CoerceAvailableValueToLoadType will succeed.
+static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
+                                            Type *LoadTy,
+                                            const DataLayout &TD) {
+  // If the loaded or stored value is an first class array or struct, don't try
+  // to transform them.  We need to be able to bitcast to integer.
+  if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
+      StoredVal->getType()->isStructTy() ||
+      StoredVal->getType()->isArrayTy())
+    return false;
+
+  // The store has to be at least as big as the load.
+  if (TD.getTypeSizeInBits(StoredVal->getType()) <
+        TD.getTypeSizeInBits(LoadTy))
+    return false;
+
+  return true;
+}
+
+/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and
+/// then a load from a must-aliased pointer of a different type, try to coerce
+/// the stored value.  LoadedTy is the type of the load we want to replace and
+/// InsertPt is the place to insert new instructions.
+///
+/// If we can't do it, return null.
+static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
+                                             Type *LoadedTy,
+                                             Instruction *InsertPt,
+                                             const DataLayout &TD) {
+  if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD))
+    return 0;
+
+  // If this is already the right type, just return it.
+  Type *StoredValTy = StoredVal->getType();
+
+  uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy);
+  uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy);
+
+  // If the store and reload are the same size, we can always reuse it.
+  if (StoreSize == LoadSize) {
+    // Pointer to Pointer -> use bitcast.
+    if (StoredValTy->getScalarType()->isPointerTy() &&
+        LoadedTy->getScalarType()->isPointerTy())
+      return new BitCastInst(StoredVal, LoadedTy, "", InsertPt);
+
+    // Convert source pointers to integers, which can be bitcast.
+    if (StoredValTy->getScalarType()->isPointerTy()) {
+      StoredValTy = TD.getIntPtrType(StoredValTy);
+      StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
+    }
+
+    Type *TypeToCastTo = LoadedTy;
+    if (TypeToCastTo->getScalarType()->isPointerTy())
+      TypeToCastTo = TD.getIntPtrType(TypeToCastTo);
+
+    if (StoredValTy != TypeToCastTo)
+      StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt);
+
+    // Cast to pointer if the load needs a pointer type.
+    if (LoadedTy->getScalarType()->isPointerTy())
+      StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt);
+
+    return StoredVal;
+  }
+
+  // If the loaded value is smaller than the available value, then we can
+  // extract out a piece from it.  If the available value is too small, then we
+  // can't do anything.
+  assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail");
+
+  // Convert source pointers to integers, which can be manipulated.
+  if (StoredValTy->getScalarType()->isPointerTy()) {
+    StoredValTy = TD.getIntPtrType(StoredValTy);
+    StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
+  }
+
+  // Convert vectors and fp to integer, which can be manipulated.
+  if (!StoredValTy->isIntegerTy()) {
+    StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize);
+    StoredVal = new BitCastInst(StoredVal, StoredValTy, "", InsertPt);
+  }
+
+  // If this is a big-endian system, we need to shift the value down to the low
+  // bits so that a truncate will work.
+  if (TD.isBigEndian()) {
+    Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize);
+    StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt);
+  }
+
+  // Truncate the integer to the right size now.
+  Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
+  StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt);
+
+  if (LoadedTy == NewIntTy)
+    return StoredVal;
+
+  // If the result is a pointer, inttoptr.
+  if (LoadedTy->getScalarType()->isPointerTy())
+    return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt);
+
+  // Otherwise, bitcast.
+  return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt);
+}
+
+/// AnalyzeLoadFromClobberingWrite - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering memory write (store,
+/// memset, memcpy, memmove).  This means that the write *may* provide bits used
+/// by the load but we can't be sure because the pointers don't mustalias.
+///
+/// Check this case to see if there is anything more we can do before we give
+/// up.  This returns -1 if we have to give up, or a byte number in the stored
+/// value of the piece that feeds the load.
+static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
+                                          Value *WritePtr,
+                                          uint64_t WriteSizeInBits,
+                                          const DataLayout &TD) {
+  // If the loaded or stored value is a first class array or struct, don't try
+  // to transform them.  We need to be able to bitcast to integer.
+  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+    return -1;
+
+  int64_t StoreOffset = 0, LoadOffset = 0;
+  Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr,StoreOffset,&TD);
+  Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, &TD);
+  if (StoreBase != LoadBase)
+    return -1;
+
+  // If the load and store are to the exact same address, they should have been
+  // a must alias.  AA must have gotten confused.
+  // FIXME: Study to see if/when this happens.  One case is forwarding a memset
+  // to a load from the base of the memset.
+#if 0
+  if (LoadOffset == StoreOffset) {
+    dbgs() << "STORE/LOAD DEP WITH COMMON POINTER MISSED:\n"
+    << "Base       = " << *StoreBase << "\n"
+    << "Store Ptr  = " << *WritePtr << "\n"
+    << "Store Offs = " << StoreOffset << "\n"
+    << "Load Ptr   = " << *LoadPtr << "\n";
+    abort();
+  }
+#endif
+
+  // If the load and store don't overlap at all, the store doesn't provide
+  // anything to the load.  In this case, they really don't alias at all, AA
+  // must have gotten confused.
+  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy);
+
+  if ((WriteSizeInBits & 7) | (LoadSize & 7))
+    return -1;
+  uint64_t StoreSize = WriteSizeInBits >> 3;  // Convert to bytes.
+  LoadSize >>= 3;
+
+
+  bool isAAFailure = false;
+  if (StoreOffset < LoadOffset)
+    isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
+  else
+    isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
+
+  if (isAAFailure) {
+#if 0
+    dbgs() << "STORE LOAD DEP WITH COMMON BASE:\n"
+    << "Base       = " << *StoreBase << "\n"
+    << "Store Ptr  = " << *WritePtr << "\n"
+    << "Store Offs = " << StoreOffset << "\n"
+    << "Load Ptr   = " << *LoadPtr << "\n";
+    abort();
+#endif
+    return -1;
+  }
+
+  // If the Load isn't completely contained within the stored bits, we don't
+  // have all the bits to feed it.  We could do something crazy in the future
+  // (issue a smaller load then merge the bits in) but this seems unlikely to be
+  // valuable.
+  if (StoreOffset > LoadOffset ||
+      StoreOffset+StoreSize < LoadOffset+LoadSize)
+    return -1;
+
+  // Okay, we can do this transformation.  Return the number of bytes into the
+  // store that the load is.
+  return LoadOffset-StoreOffset;
+}
+
+/// AnalyzeLoadFromClobberingStore - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.
+static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+                                          StoreInst *DepSI,
+                                          const DataLayout &TD) {
+  // Cannot handle reading from store of first-class aggregate yet.
+  if (DepSI->getValueOperand()->getType()->isStructTy() ||
+      DepSI->getValueOperand()->getType()->isArrayTy())
+    return -1;
+
+  Value *StorePtr = DepSI->getPointerOperand();
+  uint64_t StoreSize =TD.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                        StorePtr, StoreSize, TD);
+}
+
+/// AnalyzeLoadFromClobberingLoad - This function is called when we have a
+/// memdep query of a load that ends up being clobbered by another load.  See if
+/// the other load can feed into the second load.
+static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
+                                         LoadInst *DepLI, const DataLayout &TD){
+  // Cannot handle reading from store of first-class aggregate yet.
+  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+    return -1;
+
+  Value *DepPtr = DepLI->getPointerOperand();
+  uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType());
+  int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD);
+  if (R != -1) return R;
+
+  // If we have a load/load clobber an DepLI can be widened to cover this load,
+  // then we should widen it!
+  int64_t LoadOffs = 0;
+  const Value *LoadBase =
+    GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, &TD);
+  unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
+
+  unsigned Size = MemoryDependenceAnalysis::
+    getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD);
+  if (Size == 0) return -1;
+
+  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD);
+}
+
+
+
+static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+                                            MemIntrinsic *MI,
+                                            const DataLayout &TD) {
+  // If the mem operation is a non-constant size, we can't handle it.
+  ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+  if (SizeCst == 0) return -1;
+  uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;
+
+  // If this is memset, we just need to see if the offset is valid in the size
+  // of the memset..
+  if (MI->getIntrinsicID() == Intrinsic::memset)
+    return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                          MemSizeInBits, TD);
+
+  // If we have a memcpy/memmove, the only case we can handle is if this is a
+  // copy from constant memory.  In that case, we can read directly from the
+  // constant memory.
+  MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+  Constant *Src = dyn_cast<Constant>(MTI->getSource());
+  if (Src == 0) return -1;
+
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD));
+  if (GV == 0 || !GV->isConstant()) return -1;
+
+  // See if the access is within the bounds of the transfer.
+  int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
+                                              MI->getDest(), MemSizeInBits, TD);
+  if (Offset == -1)
+    return Offset;
+
+  // Otherwise, see if we can constant fold a load from the constant with the
+  // offset applied as appropriate.
+  Src = ConstantExpr::getBitCast(Src,
+                                 llvm::Type::getInt8PtrTy(Src->getContext()));
+  Constant *OffsetCst =
+    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+  Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
+  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+  if (ConstantFoldLoadFromConstPtr(Src, &TD))
+    return Offset;
+  return -1;
+}
+
+
+/// GetStoreValueForLoad - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.  This means
+/// that the store provides bits used by the load but we the pointers don't
+/// mustalias.  Check this case to see if there is anything more we can do
+/// before we give up.
+static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
+                                   Type *LoadTy,
+                                   Instruction *InsertPt, const DataLayout &TD){
+  LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+  uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
+  uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8;
+
+  IRBuilder<> Builder(InsertPt->getParent(), InsertPt);
+
+  // Compute which bits of the stored value are being used by the load.  Convert
+  // to an integer type to start with.
+  if (SrcVal->getType()->getScalarType()->isPointerTy())
+    SrcVal = Builder.CreatePtrToInt(SrcVal,
+        TD.getIntPtrType(SrcVal->getType()));
+  if (!SrcVal->getType()->isIntegerTy())
+    SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
+
+  // Shift the bits to the least significant depending on endianness.
+  unsigned ShiftAmt;
+  if (TD.isLittleEndian())
+    ShiftAmt = Offset*8;
+  else
+    ShiftAmt = (StoreSize-LoadSize-Offset)*8;
+
+  if (ShiftAmt)
+    SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
+
+  if (LoadSize != StoreSize)
+    SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
+
+  return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD);
+}
+
+/// GetLoadValueForLoad - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering load.  This means
+/// that the load *may* provide bits used by the load but we can't be sure
+/// because the pointers don't mustalias.  Check this case to see if there is
+/// anything more we can do before we give up.
+static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
+                                  Type *LoadTy, Instruction *InsertPt,
+                                  GVN &gvn) {
+  const DataLayout &TD = *gvn.getDataLayout();
+  // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
+  // widen SrcVal out to a larger load.
+  unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType());
+  unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
+  if (Offset+LoadSize > SrcValSize) {
+    assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+    assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+    // If we have a load/load clobber an DepLI can be widened to cover this
+    // load, then we should widen it to the next power of 2 size big enough!
+    unsigned NewLoadSize = Offset+LoadSize;
+    if (!isPowerOf2_32(NewLoadSize))
+      NewLoadSize = NextPowerOf2(NewLoadSize);
+
+    Value *PtrVal = SrcVal->getPointerOperand();
+
+    // Insert the new load after the old load.  This ensures that subsequent
+    // memdep queries will find the new load.  We can't easily remove the old
+    // load completely because it is already in the value numbering table.
+    IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+    Type *DestPTy =
+      IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
+    DestPTy = PointerType::get(DestPTy,
+                       cast<PointerType>(PtrVal->getType())->getAddressSpace());
+    Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+    PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+    LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
+    NewLoad->takeName(SrcVal);
+    NewLoad->setAlignment(SrcVal->getAlignment());
+
+    DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+    DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+    // Replace uses of the original load with the wider load.  On a big endian
+    // system, we need to shift down to get the relevant bits.
+    Value *RV = NewLoad;
+    if (TD.isBigEndian())
+      RV = Builder.CreateLShr(RV,
+                    NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits());
+    RV = Builder.CreateTrunc(RV, SrcVal->getType());
+    SrcVal->replaceAllUsesWith(RV);
+
+    // We would like to use gvn.markInstructionForDeletion here, but we can't
+    // because the load is already memoized into the leader map table that GVN
+    // tracks.  It is potentially possible to remove the load from the table,
+    // but then there all of the operations based on it would need to be
+    // rehashed.  Just leave the dead load around.
+    gvn.getMemDep().removeInstruction(SrcVal);
+    SrcVal = NewLoad;
+  }
+
+  return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD);
+}
+
+
+/// GetMemInstValueForLoad - This function is called when we have a
+/// memdep query of a load that ends up being a clobbering mem intrinsic.
+static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+                                     Type *LoadTy, Instruction *InsertPt,
+                                     const DataLayout &TD){
+  LLVMContext &Ctx = LoadTy->getContext();
+  uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8;
+
+  IRBuilder<> Builder(InsertPt->getParent(), InsertPt);
+
+  // We know that this method is only called when the mem transfer fully
+  // provides the bits for the load.
+  if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+    // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+    // independently of what the offset is.
+    Value *Val = MSI->getValue();
+    if (LoadSize != 1)
+      Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));
+
+    Value *OneElt = Val;
+
+    // Splat the value out to the right number of bits.
+    for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
+      // If we can double the number of bytes set, do it.
+      if (NumBytesSet*2 <= LoadSize) {
+        Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8);
+        Val = Builder.CreateOr(Val, ShVal);
+        NumBytesSet <<= 1;
+        continue;
+      }
+
+      // Otherwise insert one byte at a time.
+      Value *ShVal = Builder.CreateShl(Val, 1*8);
+      Val = Builder.CreateOr(OneElt, ShVal);
+      ++NumBytesSet;
+    }
+
+    return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD);
+  }
+
+  // Otherwise, this is a memcpy/memmove from a constant global.
+  MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+  Constant *Src = cast<Constant>(MTI->getSource());
+
+  // Otherwise, see if we can constant fold a load from the constant with the
+  // offset applied as appropriate.
+  Src = ConstantExpr::getBitCast(Src,
+                                 llvm::Type::getInt8PtrTy(Src->getContext()));
+  Constant *OffsetCst =
+  ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+  Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
+  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+  return ConstantFoldLoadFromConstPtr(Src, &TD);
+}
+
+namespace {
+
+struct AvailableValueInBlock {
+  /// BB - The basic block in question.
+  BasicBlock *BB;
+  enum ValType {
+    SimpleVal,  // A simple offsetted value that is accessed.
+    LoadVal,    // A value produced by a load.
+    MemIntrin   // A memory intrinsic which is loaded from.
+  };
+
+  /// V - The value that is live out of the block.
+  PointerIntPair<Value *, 2, ValType> Val;
+
+  /// Offset - The byte offset in Val that is interesting for the load query.
+  unsigned Offset;
+
+  static AvailableValueInBlock get(BasicBlock *BB, Value *V,
+                                   unsigned Offset = 0) {
+    AvailableValueInBlock Res;
+    Res.BB = BB;
+    Res.Val.setPointer(V);
+    Res.Val.setInt(SimpleVal);
+    Res.Offset = Offset;
+    return Res;
+  }
+
+  static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI,
+                                     unsigned Offset = 0) {
+    AvailableValueInBlock Res;
+    Res.BB = BB;
+    Res.Val.setPointer(MI);
+    Res.Val.setInt(MemIntrin);
+    Res.Offset = Offset;
+    return Res;
+  }
+
+  static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI,
+                                       unsigned Offset = 0) {
+    AvailableValueInBlock Res;
+    Res.BB = BB;
+    Res.Val.setPointer(LI);
+    Res.Val.setInt(LoadVal);
+    Res.Offset = Offset;
+    return Res;
+  }
+
+  bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+  bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
+  bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+
+  Value *getSimpleValue() const {
+    assert(isSimpleValue() && "Wrong accessor");
+    return Val.getPointer();
+  }
+
+  LoadInst *getCoercedLoadValue() const {
+    assert(isCoercedLoadValue() && "Wrong accessor");
+    return cast<LoadInst>(Val.getPointer());
+  }
+
+  MemIntrinsic *getMemIntrinValue() const {
+    assert(isMemIntrinValue() && "Wrong accessor");
+    return cast<MemIntrinsic>(Val.getPointer());
+  }
+
+  /// MaterializeAdjustedValue - Emit code into this block to adjust the value
+  /// defined here to the specified type.  This handles various coercion cases.
+  Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const {
+    Value *Res;
+    if (isSimpleValue()) {
+      Res = getSimpleValue();
+      if (Res->getType() != LoadTy) {
+        const DataLayout *TD = gvn.getDataLayout();
+        assert(TD && "Need target data to handle type mismatch case");
+        Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(),
+                                   *TD);
+
+        DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << "  "
+                     << *getSimpleValue() << '\n'
+                     << *Res << '\n' << "\n\n\n");
+      }
+    } else if (isCoercedLoadValue()) {
+      LoadInst *Load = getCoercedLoadValue();
+      if (Load->getType() == LoadTy && Offset == 0) {
+        Res = Load;
+      } else {
+        Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(),
+                                  gvn);
+
+        DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << "  "
+                     << *getCoercedLoadValue() << '\n'
+                     << *Res << '\n' << "\n\n\n");
+      }
+    } else {
+      const DataLayout *TD = gvn.getDataLayout();
+      assert(TD && "Need target data to handle type mismatch case");
+      Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
+                                   LoadTy, BB->getTerminator(), *TD);
+      DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+                   << "  " << *getMemIntrinValue() << '\n'
+                   << *Res << '\n' << "\n\n\n");
+    }
+    return Res;
+  }
+};
+
+} // end anonymous namespace
+
+/// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock,
+/// construct SSA form, allowing us to eliminate LI.  This returns the value
+/// that should be used at LI's definition site.
+static Value *ConstructSSAForLoadSet(LoadInst *LI,
+                         SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
+                                     GVN &gvn) {
+  // Check for the fully redundant, dominating load case.  In this case, we can
+  // just use the dominating value directly.
+  if (ValuesPerBlock.size() == 1 &&
+      gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
+                                               LI->getParent()))
+    return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn);
+
+  // Otherwise, we have to construct SSA form.
+  SmallVector<PHINode*, 8> NewPHIs;
+  SSAUpdater SSAUpdate(&NewPHIs);
+  SSAUpdate.Initialize(LI->getType(), LI->getName());
+
+  Type *LoadTy = LI->getType();
+
+  for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) {
+    const AvailableValueInBlock &AV = ValuesPerBlock[i];
+    BasicBlock *BB = AV.BB;
+
+    if (SSAUpdate.HasValueForBlock(BB))
+      continue;
+
+    SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn));
+  }
+
+  // Perform PHI construction.
+  Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
+
+  // If new PHI nodes were created, notify alias analysis.
+  if (V->getType()->getScalarType()->isPointerTy()) {
+    AliasAnalysis *AA = gvn.getAliasAnalysis();
+
+    for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i)
+      AA->copyValue(LI, NewPHIs[i]);
+
+    // Now that we've copied information to the new PHIs, scan through
+    // them again and inform alias analysis that we've added potentially
+    // escaping uses to any values that are operands to these PHIs.
+    for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) {
+      PHINode *P = NewPHIs[i];
+      for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) {
+        unsigned jj = PHINode::getOperandNumForIncomingValue(ii);
+        AA->addEscapingUse(P->getOperandUse(jj));
+      }
+    }
+  }
+
+  return V;
+}
+
+static bool isLifetimeStart(const Instruction *Inst) {
+  if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst))
+    return II->getIntrinsicID() == Intrinsic::lifetime_start;
+  return false;
+}
+
+/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are
+/// non-local by performing PHI construction.
+bool GVN::processNonLocalLoad(LoadInst *LI) {
+  // Find the non-local dependencies of the load.
+  SmallVector<NonLocalDepResult, 64> Deps;
+  AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI);
+  MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps);
+  //DEBUG(dbgs() << "INVESTIGATING NONLOCAL LOAD: "
+  //             << Deps.size() << *LI << '\n');
+
+  // If we had to process more than one hundred blocks to find the
+  // dependencies, this load isn't worth worrying about.  Optimizing
+  // it will be too expensive.
+  unsigned NumDeps = Deps.size();
+  if (NumDeps > 100)
+    return false;
+
+  // If we had a phi translation failure, we'll have a single entry which is a
+  // clobber in the current block.  Reject this early.
+  if (NumDeps == 1 &&
+      !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
+    DEBUG(
+      dbgs() << "GVN: non-local load ";
+      WriteAsOperand(dbgs(), LI);
+      dbgs() << " has unknown dependencies\n";
+    );
+    return false;
+  }
+
+  // Filter out useless results (non-locals, etc).  Keep track of the blocks
+  // where we have a value available in repl, also keep track of whether we see
+  // dependencies that produce an unknown value for the load (such as a call
+  // that could potentially clobber the load).
+  SmallVector<AvailableValueInBlock, 64> ValuesPerBlock;
+  SmallVector<BasicBlock*, 64> UnavailableBlocks;
+
+  for (unsigned i = 0, e = NumDeps; i != e; ++i) {
+    BasicBlock *DepBB = Deps[i].getBB();
+    MemDepResult DepInfo = Deps[i].getResult();
+
+    if (!DepInfo.isDef() && !DepInfo.isClobber()) {
+      UnavailableBlocks.push_back(DepBB);
+      continue;
+    }
+
+    if (DepInfo.isClobber()) {
+      // The address being loaded in this non-local block may not be the same as
+      // the pointer operand of the load if PHI translation occurs.  Make sure
+      // to consider the right address.
+      Value *Address = Deps[i].getAddress();
+
+      // If the dependence is to a store that writes to a superset of the bits
+      // read by the load, we can extract the bits we need for the load from the
+      // stored value.
+      if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
+        if (TD && Address) {
+          int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address,
+                                                      DepSI, *TD);
+          if (Offset != -1) {
+            ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+                                                       DepSI->getValueOperand(),
+                                                                Offset));
+            continue;
+          }
+        }
+      }
+
+      // Check to see if we have something like this:
+      //    load i32* P
+      //    load i8* (P+1)
+      // if we have this, replace the later with an extraction from the former.
+      if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
+        // If this is a clobber and L is the first instruction in its block, then
+        // we have the first instruction in the entry block.
+        if (DepLI != LI && Address && TD) {
+          int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(),
+                                                     LI->getPointerOperand(),
+                                                     DepLI, *TD);
+
+          if (Offset != -1) {
+            ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI,
+                                                                    Offset));
+            continue;
+          }
+        }
+      }
+
+      // If the clobbering value is a memset/memcpy/memmove, see if we can
+      // forward a value on from it.
+      if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
+        if (TD && Address) {
+          int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+                                                        DepMI, *TD);
+          if (Offset != -1) {
+            ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI,
+                                                                  Offset));
+            continue;
+          }
+        }
+      }
+
+      UnavailableBlocks.push_back(DepBB);
+      continue;
+    }
+
+    // DepInfo.isDef() here
+
+    Instruction *DepInst = DepInfo.getInst();
+
+    // Loading the allocation -> undef.
+    if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+        // Loading immediately after lifetime begin -> undef.
+        isLifetimeStart(DepInst)) {
+      ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+                                             UndefValue::get(LI->getType())));
+      continue;
+    }
+
+    if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
+      // Reject loads and stores that are to the same address but are of
+      // different types if we have to.
+      if (S->getValueOperand()->getType() != LI->getType()) {
+        // If the stored value is larger or equal to the loaded value, we can
+        // reuse it.
+        if (TD == 0 || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+                                                        LI->getType(), *TD)) {
+          UnavailableBlocks.push_back(DepBB);
+          continue;
+        }
+      }
+
+      ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+                                                         S->getValueOperand()));
+      continue;
+    }
+
+    if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
+      // If the types mismatch and we can't handle it, reject reuse of the load.
+      if (LD->getType() != LI->getType()) {
+        // If the stored value is larger or equal to the loaded value, we can
+        // reuse it.
+        if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){
+          UnavailableBlocks.push_back(DepBB);
+          continue;
+        }
+      }
+      ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD));
+      continue;
+    }
+
+    UnavailableBlocks.push_back(DepBB);
+    continue;
+  }
+
+  // If we have no predecessors that produce a known value for this load, exit
+  // early.
+  if (ValuesPerBlock.empty()) return false;
+
+  // If all of the instructions we depend on produce a known value for this
+  // load, then it is fully redundant and we can use PHI insertion to compute
+  // its value.  Insert PHIs and remove the fully redundant value now.
+  if (UnavailableBlocks.empty()) {
+    DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+
+    // Perform PHI construction.
+    Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+    LI->replaceAllUsesWith(V);
+
+    if (isa<PHINode>(V))
+      V->takeName(LI);
+    if (V->getType()->getScalarType()->isPointerTy())
+      MD->invalidateCachedPointerInfo(V);
+    markInstructionForDeletion(LI);
+    ++NumGVNLoad;
+    return true;
+  }
+
+  if (!EnablePRE || !EnableLoadPRE)
+    return false;
+
+  // Okay, we have *some* definitions of the value.  This means that the value
+  // is available in some of our (transitive) predecessors.  Lets think about
+  // doing PRE of this load.  This will involve inserting a new load into the
+  // predecessor when it's not available.  We could do this in general, but
+  // prefer to not increase code size.  As such, we only do this when we know
+  // that we only have to insert *one* load (which means we're basically moving
+  // the load, not inserting a new one).
+
+  SmallPtrSet<BasicBlock *, 4> Blockers;
+  for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
+    Blockers.insert(UnavailableBlocks[i]);
+
+  // Let's find the first basic block with more than one predecessor.  Walk
+  // backwards through predecessors if needed.
+  BasicBlock *LoadBB = LI->getParent();
+  BasicBlock *TmpBB = LoadBB;
+
+  bool allSingleSucc = true;
+  while (TmpBB->getSinglePredecessor()) {
+    TmpBB = TmpBB->getSinglePredecessor();
+    if (TmpBB == LoadBB) // Infinite (unreachable) loop.
+      return false;
+    if (Blockers.count(TmpBB))
+      return false;
+
+    // If any of these blocks has more than one successor (i.e. if the edge we
+    // just traversed was critical), then there are other paths through this
+    // block along which the load may not be anticipated.  Hoisting the load
+    // above this block would be adding the load to execution paths along
+    // which it was not previously executed.
+    if (TmpBB->getTerminator()->getNumSuccessors() != 1)
+      return false;
+  }
+
+  assert(TmpBB);
+  LoadBB = TmpBB;
+
+  // Check to see how many predecessors have the loaded value fully
+  // available.
+  DenseMap<BasicBlock*, Value*> PredLoads;
+  DenseMap<BasicBlock*, char> FullyAvailableBlocks;
+  for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i)
+    FullyAvailableBlocks[ValuesPerBlock[i].BB] = true;
+  for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
+    FullyAvailableBlocks[UnavailableBlocks[i]] = false;
+
+  SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit;
+  for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
+       PI != E; ++PI) {
+    BasicBlock *Pred = *PI;
+    if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) {
+      continue;
+    }
+    PredLoads[Pred] = 0;
+
+    if (Pred->getTerminator()->getNumSuccessors() != 1) {
+      if (isa<IndirectBrInst>(Pred->getTerminator())) {
+        DEBUG(dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+              << Pred->getName() << "': " << *LI << '\n');
+        return false;
+      }
+
+      if (LoadBB->isLandingPad()) {
+        DEBUG(dbgs()
+              << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '"
+              << Pred->getName() << "': " << *LI << '\n');
+        return false;
+      }
+
+      unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB);
+      NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum));
+    }
+  }
+
+  if (!NeedToSplit.empty()) {
+    toSplit.append(NeedToSplit.begin(), NeedToSplit.end());
+    return false;
+  }
+
+  // Decide whether PRE is profitable for this load.
+  unsigned NumUnavailablePreds = PredLoads.size();
+  assert(NumUnavailablePreds != 0 &&
+         "Fully available value should be eliminated above!");
+
+  // If this load is unavailable in multiple predecessors, reject it.
+  // FIXME: If we could restructure the CFG, we could make a common pred with
+  // all the preds that don't have an available LI and insert a new load into
+  // that one block.
+  if (NumUnavailablePreds != 1)
+      return false;
+
+  // Check if the load can safely be moved to all the unavailable predecessors.
+  bool CanDoPRE = true;
+  SmallVector<Instruction*, 8> NewInsts;
+  for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(),
+         E = PredLoads.end(); I != E; ++I) {
+    BasicBlock *UnavailablePred = I->first;
+
+    // Do PHI translation to get its value in the predecessor if necessary.  The
+    // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
+
+    // If all preds have a single successor, then we know it is safe to insert
+    // the load on the pred (?!?), so we can insert code to materialize the
+    // pointer if it is not available.
+    PHITransAddr Address(LI->getPointerOperand(), TD);
+    Value *LoadPtr = 0;
+    if (allSingleSucc) {
+      LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred,
+                                                  *DT, NewInsts);
+    } else {
+      Address.PHITranslateValue(LoadBB, UnavailablePred, DT);
+      LoadPtr = Address.getAddr();
+    }
+
+    // If we couldn't find or insert a computation of this phi translated value,
+    // we fail PRE.
+    if (LoadPtr == 0) {
+      DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+            << *LI->getPointerOperand() << "\n");
+      CanDoPRE = false;
+      break;
+    }
+
+    // Make sure it is valid to move this load here.  We have to watch out for:
+    //  @1 = getelementptr (i8* p, ...
+    //  test p and branch if == 0
+    //  load @1
+    // It is valid to have the getelementptr before the test, even if p can
+    // be 0, as getelementptr only does address arithmetic.
+    // If we are not pushing the value through any multiple-successor blocks
+    // we do not have this case.  Otherwise, check that the load is safe to
+    // put anywhere; this can be improved, but should be conservatively safe.
+    if (!allSingleSucc &&
+        // FIXME: REEVALUTE THIS.
+        !isSafeToLoadUnconditionally(LoadPtr,
+                                     UnavailablePred->getTerminator(),
+                                     LI->getAlignment(), TD)) {
+      CanDoPRE = false;
+      break;
+    }
+
+    I->second = LoadPtr;
+  }
+
+  if (!CanDoPRE) {
+    while (!NewInsts.empty()) {
+      Instruction *I = NewInsts.pop_back_val();
+      if (MD) MD->removeInstruction(I);
+      I->eraseFromParent();
+    }
+    return false;
+  }
+
+  // Okay, we can eliminate this load by inserting a reload in the predecessor
+  // and using PHI construction to get the value in the other predecessors, do
+  // it.
+  DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+  DEBUG(if (!NewInsts.empty())
+          dbgs() << "INSERTED " << NewInsts.size() << " INSTS: "
+                 << *NewInsts.back() << '\n');
+
+  // Assign value numbers to the new instructions.
+  for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) {
+    // FIXME: We really _ought_ to insert these value numbers into their
+    // parent's availability map.  However, in doing so, we risk getting into
+    // ordering issues.  If a block hasn't been processed yet, we would be
+    // marking a value as AVAIL-IN, which isn't what we intend.
+    VN.lookup_or_add(NewInsts[i]);
+  }
+
+  for (DenseMap<BasicBlock*, Value*>::iterator I = PredLoads.begin(),
+         E = PredLoads.end(); I != E; ++I) {
+    BasicBlock *UnavailablePred = I->first;
+    Value *LoadPtr = I->second;
+
+    Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
+                                        LI->getAlignment(),
+                                        UnavailablePred->getTerminator());
+
+    // Transfer the old load's TBAA tag to the new load.
+    if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa))
+      NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
+
+    // Transfer DebugLoc.
+    NewLoad->setDebugLoc(LI->getDebugLoc());
+
+    // Add the newly created load.
+    ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
+                                                        NewLoad));
+    MD->invalidateCachedPointerInfo(LoadPtr);
+    DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+  }
+
+  // Perform PHI construction.
+  Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+  LI->replaceAllUsesWith(V);
+  if (isa<PHINode>(V))
+    V->takeName(LI);
+  if (V->getType()->getScalarType()->isPointerTy())
+    MD->invalidateCachedPointerInfo(V);
+  markInstructionForDeletion(LI);
+  ++NumPRELoad;
+  return true;
+}
+
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+  // Patch the replacement so that it is not more restrictive than the value
+  // being replaced.
+  BinaryOperator *Op = dyn_cast<BinaryOperator>(I);
+  BinaryOperator *ReplOp = dyn_cast<BinaryOperator>(Repl);
+  if (Op && ReplOp && isa<OverflowingBinaryOperator>(Op) &&
+      isa<OverflowingBinaryOperator>(ReplOp)) {
+    if (ReplOp->hasNoSignedWrap() && !Op->hasNoSignedWrap())
+      ReplOp->setHasNoSignedWrap(false);
+    if (ReplOp->hasNoUnsignedWrap() && !Op->hasNoUnsignedWrap())
+      ReplOp->setHasNoUnsignedWrap(false);
+  }
+  if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) {
+    SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
+    ReplInst->getAllMetadataOtherThanDebugLoc(Metadata);
+    for (int i = 0, n = Metadata.size(); i < n; ++i) {
+      unsigned Kind = Metadata[i].first;
+      MDNode *IMD = I->getMetadata(Kind);
+      MDNode *ReplMD = Metadata[i].second;
+      switch(Kind) {
+      default:
+        ReplInst->setMetadata(Kind, NULL); // Remove unknown metadata
+        break;
+      case LLVMContext::MD_dbg:
+        llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
+      case LLVMContext::MD_tbaa:
+        ReplInst->setMetadata(Kind, MDNode::getMostGenericTBAA(IMD, ReplMD));
+        break;
+      case LLVMContext::MD_range:
+        ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD));
+        break;
+      case LLVMContext::MD_prof:
+        llvm_unreachable("MD_prof in a non terminator instruction");
+        break;
+      case LLVMContext::MD_fpmath:
+        ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD));
+        break;
+      }
+    }
+  }
+}
+
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+  patchReplacementInstruction(I, Repl);
+  I->replaceAllUsesWith(Repl);
+}
+
+/// processLoad - Attempt to eliminate a load, first by eliminating it
+/// locally, and then attempting non-local elimination if that fails.
+bool GVN::processLoad(LoadInst *L) {
+  if (!MD)
+    return false;
+
+  if (!L->isSimple())
+    return false;
+
+  if (L->use_empty()) {
+    markInstructionForDeletion(L);
+    return true;
+  }
+
+  // ... to a pointer that has been loaded from before...
+  MemDepResult Dep = MD->getDependency(L);
+
+  // If we have a clobber and target data is around, see if this is a clobber
+  // that we can fix up through code synthesis.
+  if (Dep.isClobber() && TD) {
+    // Check to see if we have something like this:
+    //   store i32 123, i32* %P
+    //   %A = bitcast i32* %P to i8*
+    //   %B = gep i8* %A, i32 1
+    //   %C = load i8* %B
+    //
+    // We could do that by recognizing if the clobber instructions are obviously
+    // a common base + constant offset, and if the previous store (or memset)
+    // completely covers this load.  This sort of thing can happen in bitfield
+    // access code.
+    Value *AvailVal = 0;
+    if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) {
+      int Offset = AnalyzeLoadFromClobberingStore(L->getType(),
+                                                  L->getPointerOperand(),
+                                                  DepSI, *TD);
+      if (Offset != -1)
+        AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset,
+                                        L->getType(), L, *TD);
+    }
+
+    // Check to see if we have something like this:
+    //    load i32* P
+    //    load i8* (P+1)
+    // if we have this, replace the later with an extraction from the former.
+    if (LoadInst *DepLI = dyn_cast<LoadInst>(Dep.getInst())) {
+      // If this is a clobber and L is the first instruction in its block, then
+      // we have the first instruction in the entry block.
+      if (DepLI == L)
+        return false;
+
+      int Offset = AnalyzeLoadFromClobberingLoad(L->getType(),
+                                                 L->getPointerOperand(),
+                                                 DepLI, *TD);
+      if (Offset != -1)
+        AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this);
+    }
+
+    // If the clobbering value is a memset/memcpy/memmove, see if we can forward
+    // a value on from it.
+    if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) {
+      int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(),
+                                                    L->getPointerOperand(),
+                                                    DepMI, *TD);
+      if (Offset != -1)
+        AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD);
+    }
+
+    if (AvailVal) {
+      DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n'
+            << *AvailVal << '\n' << *L << "\n\n\n");
+
+      // Replace the load!
+      L->replaceAllUsesWith(AvailVal);
+      if (AvailVal->getType()->getScalarType()->isPointerTy())
+        MD->invalidateCachedPointerInfo(AvailVal);
+      markInstructionForDeletion(L);
+      ++NumGVNLoad;
+      return true;
+    }
+  }
+
+  // If the value isn't available, don't do anything!
+  if (Dep.isClobber()) {
+    DEBUG(
+      // fast print dep, using operator<< on instruction is too slow.
+      dbgs() << "GVN: load ";
+      WriteAsOperand(dbgs(), L);
+      Instruction *I = Dep.getInst();
+      dbgs() << " is clobbered by " << *I << '\n';
+    );
+    return false;
+  }
+
+  // If it is defined in another block, try harder.
+  if (Dep.isNonLocal())
+    return processNonLocalLoad(L);
+
+  if (!Dep.isDef()) {
+    DEBUG(
+      // fast print dep, using operator<< on instruction is too slow.
+      dbgs() << "GVN: load ";
+      WriteAsOperand(dbgs(), L);
+      dbgs() << " has unknown dependence\n";
+    );
+    return false;
+  }
+
+  Instruction *DepInst = Dep.getInst();
+  if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
+    Value *StoredVal = DepSI->getValueOperand();
+
+    // The store and load are to a must-aliased pointer, but they may not
+    // actually have the same type.  See if we know how to reuse the stored
+    // value (depending on its type).
+    if (StoredVal->getType() != L->getType()) {
+      if (TD) {
+        StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(),
+                                                   L, *TD);
+        if (StoredVal == 0)
+          return false;
+
+        DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal
+                     << '\n' << *L << "\n\n\n");
+      }
+      else
+        return false;
+    }
+
+    // Remove it!
+    L->replaceAllUsesWith(StoredVal);
+    if (StoredVal->getType()->getScalarType()->isPointerTy())
+      MD->invalidateCachedPointerInfo(StoredVal);
+    markInstructionForDeletion(L);
+    ++NumGVNLoad;
+    return true;
+  }
+
+  if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+    Value *AvailableVal = DepLI;
+
+    // The loads are of a must-aliased pointer, but they may not actually have
+    // the same type.  See if we know how to reuse the previously loaded value
+    // (depending on its type).
+    if (DepLI->getType() != L->getType()) {
+      if (TD) {
+        AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(),
+                                                      L, *TD);
+        if (AvailableVal == 0)
+          return false;
+
+        DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal
+                     << "\n" << *L << "\n\n\n");
+      }
+      else
+        return false;
+    }
+
+    // Remove it!
+    patchAndReplaceAllUsesWith(L, AvailableVal);
+    if (DepLI->getType()->getScalarType()->isPointerTy())
+      MD->invalidateCachedPointerInfo(DepLI);
+    markInstructionForDeletion(L);
+    ++NumGVNLoad;
+    return true;
+  }
+
+  // If this load really doesn't depend on anything, then we must be loading an
+  // undef value.  This can happen when loading for a fresh allocation with no
+  // intervening stores, for example.
+  if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+    L->replaceAllUsesWith(UndefValue::get(L->getType()));
+    markInstructionForDeletion(L);
+    ++NumGVNLoad;
+    return true;
+  }
+
+  // If this load occurs either right after a lifetime begin,
+  // then the loaded value is undefined.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) {
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
+      L->replaceAllUsesWith(UndefValue::get(L->getType()));
+      markInstructionForDeletion(L);
+      ++NumGVNLoad;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// findLeader - In order to find a leader for a given value number at a
+// specific basic block, we first obtain the list of all Values for that number,
+// and then scan the list to find one whose block dominates the block in
+// question.  This is fast because dominator tree queries consist of only
+// a few comparisons of DFS numbers.
+Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
+  LeaderTableEntry Vals = LeaderTable[num];
+  if (!Vals.Val) return 0;
+
+  Value *Val = 0;
+  if (DT->dominates(Vals.BB, BB)) {
+    Val = Vals.Val;
+    if (isa<Constant>(Val)) return Val;
+  }
+
+  LeaderTableEntry* Next = Vals.Next;
+  while (Next) {
+    if (DT->dominates(Next->BB, BB)) {
+      if (isa<Constant>(Next->Val)) return Next->Val;
+      if (!Val) Val = Next->Val;
+    }
+
+    Next = Next->Next;
+  }
+
+  return Val;
+}
+
+/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the
+/// use is dominated by the given basic block.  Returns the number of uses that
+/// were replaced.
+unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To,
+                                          const BasicBlockEdge &Root) {
+  unsigned Count = 0;
+  for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+       UI != UE; ) {
+    Use &U = (UI++).getUse();
+
+    if (DT->dominates(Root, U)) {
+      U.set(To);
+      ++Count;
+    }
+  }
+  return Count;
+}
+
+/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'.  Return
+/// true if every path from the entry block to 'Dst' passes via this edge.  In
+/// particular 'Dst' must not be reachable via another edge from 'Src'.
+static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
+                                       DominatorTree *DT) {
+  // While in theory it is interesting to consider the case in which Dst has
+  // more than one predecessor, because Dst might be part of a loop which is
+  // only reachable from Src, in practice it is pointless since at the time
+  // GVN runs all such loops have preheaders, which means that Dst will have
+  // been changed to have only one predecessor, namely Src.
+  const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
+  const BasicBlock *Src = E.getStart();
+  assert((!Pred || Pred == Src) && "No edge between these basic blocks!");
+  (void)Src;
+  return Pred != 0;
+}
+
+/// propagateEquality - The given values are known to be equal in every block
+/// dominated by 'Root'.  Exploit this, for example by replacing 'LHS' with
+/// 'RHS' everywhere in the scope.  Returns whether a change was made.
+bool GVN::propagateEquality(Value *LHS, Value *RHS,
+                            const BasicBlockEdge &Root) {
+  SmallVector<std::pair<Value*, Value*>, 4> Worklist;
+  Worklist.push_back(std::make_pair(LHS, RHS));
+  bool Changed = false;
+  // For speed, compute a conservative fast approximation to
+  // DT->dominates(Root, Root.getEnd());
+  bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
+
+  while (!Worklist.empty()) {
+    std::pair<Value*, Value*> Item = Worklist.pop_back_val();
+    LHS = Item.first; RHS = Item.second;
+
+    if (LHS == RHS) continue;
+    assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");
+
+    // Don't try to propagate equalities between constants.
+    if (isa<Constant>(LHS) && isa<Constant>(RHS)) continue;
+
+    // Prefer a constant on the right-hand side, or an Argument if no constants.
+    if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS)))
+      std::swap(LHS, RHS);
+    assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
+
+    // If there is no obvious reason to prefer the left-hand side over the right-
+    // hand side, ensure the longest lived term is on the right-hand side, so the
+    // shortest lived term will be replaced by the longest lived.  This tends to
+    // expose more simplifications.
+    uint32_t LVN = VN.lookup_or_add(LHS);
+    if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
+        (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
+      // Move the 'oldest' value to the right-hand side, using the value number as
+      // a proxy for age.
+      uint32_t RVN = VN.lookup_or_add(RHS);
+      if (LVN < RVN) {
+        std::swap(LHS, RHS);
+        LVN = RVN;
+      }
+    }
+
+    // If value numbering later sees that an instruction in the scope is equal
+    // to 'LHS' then ensure it will be turned into 'RHS'.  In order to preserve
+    // the invariant that instructions only occur in the leader table for their
+    // own value number (this is used by removeFromLeaderTable), do not do this
+    // if RHS is an instruction (if an instruction in the scope is morphed into
+    // LHS then it will be turned into RHS by the next GVN iteration anyway, so
+    // using the leader table is about compiling faster, not optimizing better).
+    // The leader table only tracks basic blocks, not edges. Only add to if we
+    // have the simple case where the edge dominates the end.
+    if (RootDominatesEnd && !isa<Instruction>(RHS))
+      addToLeaderTable(LVN, RHS, Root.getEnd());
+
+    // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope.  As
+    // LHS always has at least one use that is not dominated by Root, this will
+    // never do anything if LHS has only one use.
+    if (!LHS->hasOneUse()) {
+      unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root);
+      Changed |= NumReplacements > 0;
+      NumGVNEqProp += NumReplacements;
+    }
+
+    // Now try to deduce additional equalities from this one.  For example, if the
+    // known equality was "(A != B)" == "false" then it follows that A and B are
+    // equal in the scope.  Only boolean equalities with an explicit true or false
+    // RHS are currently supported.
+    if (!RHS->getType()->isIntegerTy(1))
+      // Not a boolean equality - bail out.
+      continue;
+    ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
+    if (!CI)
+      // RHS neither 'true' nor 'false' - bail out.
+      continue;
+    // Whether RHS equals 'true'.  Otherwise it equals 'false'.
+    bool isKnownTrue = CI->isAllOnesValue();
+    bool isKnownFalse = !isKnownTrue;
+
+    // If "A && B" is known true then both A and B are known true.  If "A || B"
+    // is known false then both A and B are known false.
+    Value *A, *B;
+    if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) ||
+        (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) {
+      Worklist.push_back(std::make_pair(A, RHS));
+      Worklist.push_back(std::make_pair(B, RHS));
+      continue;
+    }
+
+    // If we are propagating an equality like "(A == B)" == "true" then also
+    // propagate the equality A == B.  When propagating a comparison such as
+    // "(A >= B)" == "true", replace all instances of "A < B" with "false".
+    if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) {
+      Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
+
+      // If "A == B" is known true, or "A != B" is known false, then replace
+      // A with B everywhere in the scope.
+      if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) ||
+          (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE))
+        Worklist.push_back(std::make_pair(Op0, Op1));
+
+      // If "A >= B" is known true, replace "A < B" with false everywhere.
+      CmpInst::Predicate NotPred = Cmp->getInversePredicate();
+      Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
+      // Since we don't have the instruction "A < B" immediately to hand, work out
+      // the value number that it would have and use that to find an appropriate
+      // instruction (if any).
+      uint32_t NextNum = VN.getNextUnusedValueNumber();
+      uint32_t Num = VN.lookup_or_add_cmp(Cmp->getOpcode(), NotPred, Op0, Op1);
+      // If the number we were assigned was brand new then there is no point in
+      // looking for an instruction realizing it: there cannot be one!
+      if (Num < NextNum) {
+        Value *NotCmp = findLeader(Root.getEnd(), Num);
+        if (NotCmp && isa<Instruction>(NotCmp)) {
+          unsigned NumReplacements =
+            replaceAllDominatedUsesWith(NotCmp, NotVal, Root);
+          Changed |= NumReplacements > 0;
+          NumGVNEqProp += NumReplacements;
+        }
+      }
+      // Ensure that any instruction in scope that gets the "A < B" value number
+      // is replaced with false.
+      // The leader table only tracks basic blocks, not edges. Only add to if we
+      // have the simple case where the edge dominates the end.
+      if (RootDominatesEnd)
+        addToLeaderTable(Num, NotVal, Root.getEnd());
+
+      continue;
+    }
+  }
+
+  return Changed;
+}
+
+/// processInstruction - When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+bool GVN::processInstruction(Instruction *I) {
+  // Ignore dbg info intrinsics.
+  if (isa<DbgInfoIntrinsic>(I))
+    return false;
+
+  // If the instruction can be easily simplified then do so now in preference
+  // to value numbering it.  Value numbering often exposes redundancies, for
+  // example if it determines that %y is equal to %x then the instruction
+  // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
+  if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) {
+    I->replaceAllUsesWith(V);
+    if (MD && V->getType()->getScalarType()->isPointerTy())
+      MD->invalidateCachedPointerInfo(V);
+    markInstructionForDeletion(I);
+    ++NumGVNSimpl;
+    return true;
+  }
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (processLoad(LI))
+      return true;
+
+    unsigned Num = VN.lookup_or_add(LI);
+    addToLeaderTable(Num, LI, LI->getParent());
+    return false;
+  }
+
+  // For conditional branches, we can perform simple conditional propagation on
+  // the condition value itself.
+  if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+    if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+      return false;
+
+    Value *BranchCond = BI->getCondition();
+
+    BasicBlock *TrueSucc = BI->getSuccessor(0);
+    BasicBlock *FalseSucc = BI->getSuccessor(1);
+    // Avoid multiple edges early.
+    if (TrueSucc == FalseSucc)
+      return false;
+
+    BasicBlock *Parent = BI->getParent();
+    bool Changed = false;
+
+    Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
+    BasicBlockEdge TrueE(Parent, TrueSucc);
+    Changed |= propagateEquality(BranchCond, TrueVal, TrueE);
+
+    Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
+    BasicBlockEdge FalseE(Parent, FalseSucc);
+    Changed |= propagateEquality(BranchCond, FalseVal, FalseE);
+
+    return Changed;
+  }
+
+  // For switches, propagate the case values into the case destinations.
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+    Value *SwitchCond = SI->getCondition();
+    BasicBlock *Parent = SI->getParent();
+    bool Changed = false;
+
+    // Remember how many outgoing edges there are to every successor.
+    SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+    for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+      ++SwitchEdges[SI->getSuccessor(i)];
+
+    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+         i != e; ++i) {
+      BasicBlock *Dst = i.getCaseSuccessor();
+      // If there is only a single edge, propagate the case value into it.
+      if (SwitchEdges.lookup(Dst) == 1) {
+        BasicBlockEdge E(Parent, Dst);
+        Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E);
+      }
+    }
+    return Changed;
+  }
+
+  // Instructions with void type don't return a value, so there's
+  // no point in trying to find redundancies in them.
+  if (I->getType()->isVoidTy()) return false;
+
+  uint32_t NextNum = VN.getNextUnusedValueNumber();
+  unsigned Num = VN.lookup_or_add(I);
+
+  // Allocations are always uniquely numbered, so we can save time and memory
+  // by fast failing them.
+  if (isa<AllocaInst>(I) || isa<TerminatorInst>(I) || isa<PHINode>(I)) {
+    addToLeaderTable(Num, I, I->getParent());
+    return false;
+  }
+
+  // If the number we were assigned was a brand new VN, then we don't
+  // need to do a lookup to see if the number already exists
+  // somewhere in the domtree: it can't!
+  if (Num >= NextNum) {
+    addToLeaderTable(Num, I, I->getParent());
+    return false;
+  }
+
+  // Perform fast-path value-number based elimination of values inherited from
+  // dominators.
+  Value *repl = findLeader(I->getParent(), Num);
+  if (repl == 0) {
+    // Failure, just remember this instance for future use.
+    addToLeaderTable(Num, I, I->getParent());
+    return false;
+  }
+
+  // Remove it!
+  patchAndReplaceAllUsesWith(I, repl);
+  if (MD && repl->getType()->getScalarType()->isPointerTy())
+    MD->invalidateCachedPointerInfo(repl);
+  markInstructionForDeletion(I);
+  return true;
+}
+
+/// runOnFunction - This is the main transformation entry point for a function.
+bool GVN::runOnFunction(Function& F) {
+  if (!NoLoads)
+    MD = &getAnalysis<MemoryDependenceAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+  VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
+  VN.setMemDep(MD);
+  VN.setDomTree(DT);
+
+  bool Changed = false;
+  bool ShouldContinue = true;
+
+  // Merge unconditional branches, allowing PRE to catch more
+  // optimization opportunities.
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
+    BasicBlock *BB = FI++;
+
+    bool removedBlock = MergeBlockIntoPredecessor(BB, this);
+    if (removedBlock) ++NumGVNBlocks;
+
+    Changed |= removedBlock;
+  }
+
+  unsigned Iteration = 0;
+  while (ShouldContinue) {
+    DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+    ShouldContinue = iterateOnFunction(F);
+    if (splitCriticalEdges())
+      ShouldContinue = true;
+    Changed |= ShouldContinue;
+    ++Iteration;
+  }
+
+  if (EnablePRE) {
+    bool PREChanged = true;
+    while (PREChanged) {
+      PREChanged = performPRE(F);
+      Changed |= PREChanged;
+    }
+  }
+  // FIXME: Should perform GVN again after PRE does something.  PRE can move
+  // computations into blocks where they become fully redundant.  Note that
+  // we can't do this until PRE's critical edge splitting updates memdep.
+  // Actually, when this happens, we should just fully integrate PRE into GVN.
+
+  cleanupGlobalSets();
+
+  return Changed;
+}
+
+
+bool GVN::processBlock(BasicBlock *BB) {
+  // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
+  // (and incrementing BI before processing an instruction).
+  assert(InstrsToErase.empty() &&
+         "We expect InstrsToErase to be empty across iterations");
+  bool ChangedFunction = false;
+
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+       BI != BE;) {
+    ChangedFunction |= processInstruction(BI);
+    if (InstrsToErase.empty()) {
+      ++BI;
+      continue;
+    }
+
+    // If we need some instructions deleted, do it now.
+    NumGVNInstr += InstrsToErase.size();
+
+    // Avoid iterator invalidation.
+    bool AtStart = BI == BB->begin();
+    if (!AtStart)
+      --BI;
+
+    for (SmallVector<Instruction*, 4>::iterator I = InstrsToErase.begin(),
+         E = InstrsToErase.end(); I != E; ++I) {
+      DEBUG(dbgs() << "GVN removed: " << **I << '\n');
+      if (MD) MD->removeInstruction(*I);
+      DEBUG(verifyRemoved(*I));
+      (*I)->eraseFromParent();
+    }
+    InstrsToErase.clear();
+
+    if (AtStart)
+      BI = BB->begin();
+    else
+      ++BI;
+  }
+
+  return ChangedFunction;
+}
+
+/// performPRE - Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function &F) {
+  bool Changed = false;
+  SmallVector<std::pair<Value*, BasicBlock*>, 8> predMap;
+  for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+       DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
+    BasicBlock *CurrentBlock = *DI;
+
+    // Nothing to PRE in the entry block.
+    if (CurrentBlock == &F.getEntryBlock()) continue;
+
+    // Don't perform PRE on a landing pad.
+    if (CurrentBlock->isLandingPad()) continue;
+
+    for (BasicBlock::iterator BI = CurrentBlock->begin(),
+         BE = CurrentBlock->end(); BI != BE; ) {
+      Instruction *CurInst = BI++;
+
+      if (isa<AllocaInst>(CurInst) ||
+          isa<TerminatorInst>(CurInst) || isa<PHINode>(CurInst) ||
+          CurInst->getType()->isVoidTy() ||
+          CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+          isa<DbgInfoIntrinsic>(CurInst))
+        continue;
+
+      // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+      // sinking the compare again, and it would force the code generator to
+      // move the i1 from processor flags or predicate registers into a general
+      // purpose register.
+      if (isa<CmpInst>(CurInst))
+        continue;
+
+      // We don't currently value number ANY inline asm calls.
+      if (CallInst *CallI = dyn_cast<CallInst>(CurInst))
+        if (CallI->isInlineAsm())
+          continue;
+
+      uint32_t ValNo = VN.lookup(CurInst);
+
+      // Look for the predecessors for PRE opportunities.  We're
+      // only trying to solve the basic diamond case, where
+      // a value is computed in the successor and one predecessor,
+      // but not the other.  We also explicitly disallow cases
+      // where the successor is its own predecessor, because they're
+      // more complicated to get right.
+      unsigned NumWith = 0;
+      unsigned NumWithout = 0;
+      BasicBlock *PREPred = 0;
+      predMap.clear();
+
+      for (pred_iterator PI = pred_begin(CurrentBlock),
+           PE = pred_end(CurrentBlock); PI != PE; ++PI) {
+        BasicBlock *P = *PI;
+        // We're not interested in PRE where the block is its
+        // own predecessor, or in blocks with predecessors
+        // that are not reachable.
+        if (P == CurrentBlock) {
+          NumWithout = 2;
+          break;
+        } else if (!DT->isReachableFromEntry(P))  {
+          NumWithout = 2;
+          break;
+        }
+
+        Value* predV = findLeader(P, ValNo);
+        if (predV == 0) {
+          predMap.push_back(std::make_pair(static_cast<Value *>(0), P));
+          PREPred = P;
+          ++NumWithout;
+        } else if (predV == CurInst) {
+          /* CurInst dominates this predecessor. */
+          NumWithout = 2;
+          break;
+        } else {
+          predMap.push_back(std::make_pair(predV, P));
+          ++NumWith;
+        }
+      }
+
+      // Don't do PRE when it might increase code size, i.e. when
+      // we would need to insert instructions in more than one pred.
+      if (NumWithout != 1 || NumWith == 0)
+        continue;
+
+      // Don't do PRE across indirect branch.
+      if (isa<IndirectBrInst>(PREPred->getTerminator()))
+        continue;
+
+      // We can't do PRE safely on a critical edge, so instead we schedule
+      // the edge to be split and perform the PRE the next time we iterate
+      // on the function.
+      unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+      if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+        toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+        continue;
+      }
+
+      // Instantiate the expression in the predecessor that lacked it.
+      // Because we are going top-down through the block, all value numbers
+      // will be available in the predecessor by the time we need them.  Any
+      // that weren't originally present will have been instantiated earlier
+      // in this loop.
+      Instruction *PREInstr = CurInst->clone();
+      bool success = true;
+      for (unsigned i = 0, e = CurInst->getNumOperands(); i != e; ++i) {
+        Value *Op = PREInstr->getOperand(i);
+        if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+          continue;
+
+        if (Value *V = findLeader(PREPred, VN.lookup(Op))) {
+          PREInstr->setOperand(i, V);
+        } else {
+          success = false;
+          break;
+        }
+      }
+
+      // Fail out if we encounter an operand that is not available in
+      // the PRE predecessor.  This is typically because of loads which
+      // are not value numbered precisely.
+      if (!success) {
+        DEBUG(verifyRemoved(PREInstr));
+        delete PREInstr;
+        continue;
+      }
+
+      PREInstr->insertBefore(PREPred->getTerminator());
+      PREInstr->setName(CurInst->getName() + ".pre");
+      PREInstr->setDebugLoc(CurInst->getDebugLoc());
+      VN.add(PREInstr, ValNo);
+      ++NumGVNPRE;
+
+      // Update the availability map to include the new instruction.
+      addToLeaderTable(ValNo, PREInstr, PREPred);
+
+      // Create a PHI to make the value available in this block.
+      PHINode* Phi = PHINode::Create(CurInst->getType(), predMap.size(),
+                                     CurInst->getName() + ".pre-phi",
+                                     CurrentBlock->begin());
+      for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+        if (Value *V = predMap[i].first)
+          Phi->addIncoming(V, predMap[i].second);
+        else
+          Phi->addIncoming(PREInstr, PREPred);
+      }
+
+      VN.add(Phi, ValNo);
+      addToLeaderTable(ValNo, Phi, CurrentBlock);
+      Phi->setDebugLoc(CurInst->getDebugLoc());
+      CurInst->replaceAllUsesWith(Phi);
+      if (Phi->getType()->getScalarType()->isPointerTy()) {
+        // Because we have added a PHI-use of the pointer value, it has now
+        // "escaped" from alias analysis' perspective.  We need to inform
+        // AA of this.
+        for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee;
+             ++ii) {
+          unsigned jj = PHINode::getOperandNumForIncomingValue(ii);
+          VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj));
+        }
+
+        if (MD)
+          MD->invalidateCachedPointerInfo(Phi);
+      }
+      VN.erase(CurInst);
+      removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+
+      DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+      if (MD) MD->removeInstruction(CurInst);
+      DEBUG(verifyRemoved(CurInst));
+      CurInst->eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  if (splitCriticalEdges())
+    Changed = true;
+
+  return Changed;
+}
+
+/// splitCriticalEdges - Split critical edges found during the previous
+/// iteration that may enable further optimization.
+bool GVN::splitCriticalEdges() {
+  if (toSplit.empty())
+    return false;
+  do {
+    std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
+    SplitCriticalEdge(Edge.first, Edge.second, this);
+  } while (!toSplit.empty());
+  if (MD) MD->invalidateCachedPredecessors();
+  return true;
+}
+
+/// iterateOnFunction - Executes one iteration of GVN
+bool GVN::iterateOnFunction(Function &F) {
+  cleanupGlobalSets();
+
+  // Top-down walk of the dominator tree
+  bool Changed = false;
+#if 0
+  // Needed for value numbering with phi construction to work.
+  ReversePostOrderTraversal<Function*> RPOT(&F);
+  for (ReversePostOrderTraversal<Function*>::rpo_iterator RI = RPOT.begin(),
+       RE = RPOT.end(); RI != RE; ++RI)
+    Changed |= processBlock(*RI);
+#else
+  for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
+       DE = df_end(DT->getRootNode()); DI != DE; ++DI)
+    Changed |= processBlock(DI->getBlock());
+#endif
+
+  return Changed;
+}
+
+void GVN::cleanupGlobalSets() {
+  VN.clear();
+  LeaderTable.clear();
+  TableAllocator.Reset();
+}
+
+/// verifyRemoved - Verify that the specified instruction does not occur in our
+/// internal data structures.
+void GVN::verifyRemoved(const Instruction *Inst) const {
+  VN.verifyRemoved(Inst);
+
+  // Walk through the value number scope to make sure the instruction isn't
+  // ferreted away in it.
+  for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator
+       I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
+    const LeaderTableEntry *Node = &I->second;
+    assert(Node->Val != Inst && "Inst still in value numbering scope!");
+
+    while (Node->Next) {
+      Node = Node->Next;
+      assert(Node->Val != Inst && "Inst still in value numbering scope!");
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp
new file mode 100644
index 000000000000..5d02c68a7a47
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -0,0 +1,311 @@
+//===-- GlobalMerge.cpp - Internal globals merging  -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass merges globals with internal linkage into one. This way all the
+// globals which were merged into a biggest one can be addressed using offsets
+// from the same base pointer (no need for separate base pointer for each of the
+// global). Such a transformation can significantly reduce the register pressure
+// when many globals are involved.
+//
+// For example, consider the code which touches several global variables at
+// once:
+//
+// static int foo[N], bar[N], baz[N];
+//
+// for (i = 0; i < N; ++i) {
+//    foo[i] = bar[i] * baz[i];
+// }
+//
+//  On ARM the addresses of 3 arrays should be kept in the registers, thus
+//  this code has quite large register pressure (loop body):
+//
+//  ldr     r1, [r5], #4
+//  ldr     r2, [r6], #4
+//  mul     r1, r2, r1
+//  str     r1, [r0], #4
+//
+//  Pass converts the code to something like:
+//
+//  static struct {
+//    int foo[N];
+//    int bar[N];
+//    int baz[N];
+//  } merged;
+//
+//  for (i = 0; i < N; ++i) {
+//    merged.foo[i] = merged.bar[i] * merged.baz[i];
+//  }
+//
+//  and in ARM code this becomes:
+//
+//  ldr     r0, [r5, #40]
+//  ldr     r1, [r5, #80]
+//  mul     r0, r1, r0
+//  str     r0, [r5], #4
+//
+//  note that we saved 2 registers here almostly "for free".
+// ===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "global-merge"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+using namespace llvm;
+
+static cl::opt<bool>
+EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
+                  	cl::desc("Enable global merge pass on constants"),
+                  	cl::init(false));
+
+STATISTIC(NumMerged      , "Number of globals merged");
+namespace {
+  class GlobalMerge : public FunctionPass {
+    /// TLI - Keep a pointer of a TargetLowering to consult for determining
+    /// target type sizes.
+    const TargetLowering *TLI;
+
+    bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
+                 Module &M, bool isConst, unsigned AddrSpace) const;
+
+    /// \brief Check if the given variable has been identified as must keep
+    /// \pre setMustKeepGlobalVariables must have been called on the Module that
+    ///      contains GV
+    bool isMustKeepGlobalVariable(const GlobalVariable *GV) const {
+      return MustKeepGlobalVariables.count(GV);
+    }
+
+    /// Collect every variables marked as "used" or used in a landing pad
+    /// instruction for this Module.
+    void setMustKeepGlobalVariables(Module &M);
+
+    /// Collect every variables marked as "used"
+    void collectUsedGlobalVariables(Module &M);
+
+    /// Keep track of the GlobalVariable that must not be merged away
+    SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables;
+
+  public:
+    static char ID;             // Pass identification, replacement for typeid.
+    explicit GlobalMerge(const TargetLowering *tli = 0)
+      : FunctionPass(ID), TLI(tli) {
+      initializeGlobalMergePass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool doInitialization(Module &M);
+    virtual bool runOnFunction(Function &F);
+    virtual bool doFinalization(Module &M);
+
+    const char *getPassName() const {
+      return "Merge internal globals";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      FunctionPass::getAnalysisUsage(AU);
+    }
+
+    struct GlobalCmp {
+      const DataLayout *TD;
+
+      GlobalCmp(const DataLayout *td) : TD(td) { }
+
+      bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) {
+        Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
+        Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
+
+        return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2));
+      }
+    };
+  };
+} // end anonymous namespace
+
+char GlobalMerge::ID = 0;
+INITIALIZE_PASS(GlobalMerge, "global-merge",
+                "Global Merge", false, false)
+
+
+bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
+                          Module &M, bool isConst, unsigned AddrSpace) const {
+  const DataLayout *TD = TLI->getDataLayout();
+
+  // FIXME: Infer the maximum possible offset depending on the actual users
+  // (these max offsets are different for the users inside Thumb or ARM
+  // functions)
+  unsigned MaxOffset = TLI->getMaximalGlobalOffset();
+
+  // FIXME: Find better heuristics
+  std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD));
+
+  Type *Int32Ty = Type::getInt32Ty(M.getContext());
+
+  for (size_t i = 0, e = Globals.size(); i != e; ) {
+    size_t j = 0;
+    uint64_t MergedSize = 0;
+    std::vector<Type*> Tys;
+    std::vector<Constant*> Inits;
+    for (j = i; j != e; ++j) {
+      Type *Ty = Globals[j]->getType()->getElementType();
+      MergedSize += TD->getTypeAllocSize(Ty);
+      if (MergedSize > MaxOffset) {
+        break;
+      }
+      Tys.push_back(Ty);
+      Inits.push_back(Globals[j]->getInitializer());
+    }
+
+    StructType *MergedTy = StructType::get(M.getContext(), Tys);
+    Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
+    GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst,
+                                                  GlobalValue::InternalLinkage,
+                                                  MergedInit, "_MergedGlobals",
+                                                  0, GlobalVariable::NotThreadLocal,
+                                                  AddrSpace);
+    for (size_t k = i; k < j; ++k) {
+      Constant *Idx[2] = {
+        ConstantInt::get(Int32Ty, 0),
+        ConstantInt::get(Int32Ty, k-i)
+      };
+      Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx);
+      Globals[k]->replaceAllUsesWith(GEP);
+      Globals[k]->eraseFromParent();
+      NumMerged++;
+    }
+    i = j;
+  }
+
+  return true;
+}
+
+void GlobalMerge::collectUsedGlobalVariables(Module &M) {
+  // Extract global variables from llvm.used array
+  const GlobalVariable *GV = M.getGlobalVariable("llvm.used");
+  if (!GV || !GV->hasInitializer()) return;
+
+  // Should be an array of 'i8*'.
+  const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (InitList == 0) return;
+ 
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+    if (const GlobalVariable *G =
+        dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts()))
+      MustKeepGlobalVariables.insert(G);
+}
+
+void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
+  collectUsedGlobalVariables(M);
+
+  for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn;
+       ++IFn) {
+    for (Function::iterator IBB = IFn->begin(), IEndBB = IFn->end();
+         IBB != IEndBB; ++IBB) {
+      // Follow the inwoke link to find the landing pad instruction
+      const InvokeInst *II = dyn_cast<InvokeInst>(IBB->getTerminator());
+      if (!II) continue;
+
+      const LandingPadInst *LPInst = II->getUnwindDest()->getLandingPadInst();
+      // Look for globals in the clauses of the landing pad instruction
+      for (unsigned Idx = 0, NumClauses = LPInst->getNumClauses();
+           Idx != NumClauses; ++Idx)
+        if (const GlobalVariable *GV =
+            dyn_cast<GlobalVariable>(LPInst->getClause(Idx)
+                                     ->stripPointerCasts()))
+          MustKeepGlobalVariables.insert(GV);
+    }
+  }
+}
+
+bool GlobalMerge::doInitialization(Module &M) {
+  DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
+                                                        BSSGlobals;
+  const DataLayout *TD = TLI->getDataLayout();
+  unsigned MaxOffset = TLI->getMaximalGlobalOffset();
+  bool Changed = false;
+  setMustKeepGlobalVariables(M);
+
+  // Grab all non-const globals.
+  for (Module::global_iterator I = M.global_begin(),
+         E = M.global_end(); I != E; ++I) {
+    // Merge is safe for "normal" internal globals only
+    if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection())
+      continue;
+
+    PointerType *PT = dyn_cast<PointerType>(I->getType());
+    assert(PT && "Global variable is not a pointer!");
+
+    unsigned AddressSpace = PT->getAddressSpace();
+
+    // Ignore fancy-aligned globals for now.
+    unsigned Alignment = TD->getPreferredAlignment(I);
+    Type *Ty = I->getType()->getElementType();
+    if (Alignment > TD->getABITypeAlignment(Ty))
+      continue;
+
+    // Ignore all 'special' globals.
+    if (I->getName().startswith("llvm.") ||
+        I->getName().startswith(".llvm."))
+      continue;
+
+    // Ignore all "required" globals:
+    if (isMustKeepGlobalVariable(I))
+      continue;
+
+    if (TD->getTypeAllocSize(Ty) < MaxOffset) {
+      if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine())
+          .isBSSLocal())
+        BSSGlobals[AddressSpace].push_back(I);
+      else if (I->isConstant())
+        ConstGlobals[AddressSpace].push_back(I);
+      else
+        Globals[AddressSpace].push_back(I);
+    }
+  }
+
+  for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator
+       I = Globals.begin(), E = Globals.end(); I != E; ++I)
+    if (I->second.size() > 1)
+      Changed |= doMerge(I->second, M, false, I->first);
+
+  for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator
+       I = BSSGlobals.begin(), E = BSSGlobals.end(); I != E; ++I)
+    if (I->second.size() > 1)
+      Changed |= doMerge(I->second, M, false, I->first);
+
+  if (EnableGlobalMergeOnConst)
+    for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator
+         I = ConstGlobals.begin(), E = ConstGlobals.end(); I != E; ++I)
+      if (I->second.size() > 1)
+        Changed |= doMerge(I->second, M, true, I->first);
+
+  return Changed;
+}
+
+bool GlobalMerge::runOnFunction(Function &F) {
+  return false;
+}
+
+bool GlobalMerge::doFinalization(Module &M) {
+  MustKeepGlobalVariables.clear();
+  return false;
+}
+
+Pass *llvm::createGlobalMergePass(const TargetLowering *tli) {
+  return new GlobalMerge(tli);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
new file mode 100644
index 000000000000..8e76c78f5ac3
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -0,0 +1,1845 @@
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+//   1. The exit condition for the loop is canonicalized to compare the
+//      induction value against the exit value.  This turns loops like:
+//        'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+//   2. Any use outside of the loop of an expression derived from the indvar
+//      is changed to compute the derived value outside of the loop, eliminating
+//      the dependence on the exit value of the induction variable.  If the only
+//      purpose of the loop is to compute the exit value of some derived
+//      expression, this transformation will make the loop dead.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "indvars"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+using namespace llvm;
+
+STATISTIC(NumWidened     , "Number of indvars widened");
+STATISTIC(NumReplaced    , "Number of exit values replaced");
+STATISTIC(NumLFTR        , "Number of loop exit tests replaced");
+STATISTIC(NumElimExt     , "Number of IV sign/zero extends eliminated");
+STATISTIC(NumElimIV      , "Number of congruent IVs eliminated");
+
+// Trip count verification can be enabled by default under NDEBUG if we
+// implement a strong expression equivalence checker in SCEV. Until then, we
+// use the verify-indvars flag, which may assert in some cases.
+static cl::opt<bool> VerifyIndvars(
+  "verify-indvars", cl::Hidden,
+  cl::desc("Verify the ScalarEvolution result after running indvars"));
+
+namespace {
+  class IndVarSimplify : public LoopPass {
+    LoopInfo        *LI;
+    ScalarEvolution *SE;
+    DominatorTree   *DT;
+    DataLayout      *TD;
+    TargetLibraryInfo *TLI;
+
+    SmallVector<WeakVH, 16> DeadInsts;
+    bool Changed;
+  public:
+
+    static char ID; // Pass identification, replacement for typeid
+    IndVarSimplify() : LoopPass(ID), LI(0), SE(0), DT(0), TD(0),
+                       Changed(false) {
+      initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreservedID(LCSSAID);
+      AU.setPreservesCFG();
+    }
+
+  private:
+    virtual void releaseMemory() {
+      DeadInsts.clear();
+    }
+
+    bool isValidRewrite(Value *FromVal, Value *ToVal);
+
+    void HandleFloatingPointIV(Loop *L, PHINode *PH);
+    void RewriteNonIntegerIVs(Loop *L);
+
+    void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM);
+
+    void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
+
+    Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
+                                     PHINode *IndVar, SCEVExpander &Rewriter);
+
+    void SinkUnusedInvariants(Loop *L);
+  };
+}
+
+char IndVarSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars",
+                "Induction Variable Simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(IndVarSimplify, "indvars",
+                "Induction Variable Simplification", false, false)
+
+Pass *llvm::createIndVarSimplifyPass() {
+  return new IndVarSimplify();
+}
+
+/// isValidRewrite - Return true if the SCEV expansion generated by the
+/// rewriter can replace the original value. SCEV guarantees that it
+/// produces the same value, but the way it is produced may be illegal IR.
+/// Ideally, this function will only be called for verification.
+bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
+  // If an SCEV expression subsumed multiple pointers, its expansion could
+  // reassociate the GEP changing the base pointer. This is illegal because the
+  // final address produced by a GEP chain must be inbounds relative to its
+  // underlying object. Otherwise basic alias analysis, among other things,
+  // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
+  // producing an expression involving multiple pointers. Until then, we must
+  // bail out here.
+  //
+  // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject
+  // because it understands lcssa phis while SCEV does not.
+  Value *FromPtr = FromVal;
+  Value *ToPtr = ToVal;
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(FromVal)) {
+    FromPtr = GEP->getPointerOperand();
+  }
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(ToVal)) {
+    ToPtr = GEP->getPointerOperand();
+  }
+  if (FromPtr != FromVal || ToPtr != ToVal) {
+    // Quickly check the common case
+    if (FromPtr == ToPtr)
+      return true;
+
+    // SCEV may have rewritten an expression that produces the GEP's pointer
+    // operand. That's ok as long as the pointer operand has the same base
+    // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the
+    // base of a recurrence. This handles the case in which SCEV expansion
+    // converts a pointer type recurrence into a nonrecurrent pointer base
+    // indexed by an integer recurrence.
+
+    // If the GEP base pointer is a vector of pointers, abort.
+    if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
+      return false;
+
+    const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
+    const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
+    if (FromBase == ToBase)
+      return true;
+
+    DEBUG(dbgs() << "INDVARS: GEP rewrite bail out "
+          << *FromBase << " != " << *ToBase << "\n");
+
+    return false;
+  }
+  return true;
+}
+
+/// Determine the insertion point for this user. By default, insert immediately
+/// before the user. SCEVExpander or LICM will hoist loop invariants out of the
+/// loop. For PHI nodes, there may be multiple uses, so compute the nearest
+/// common dominator for the incoming blocks.
+static Instruction *getInsertPointForUses(Instruction *User, Value *Def,
+                                          DominatorTree *DT) {
+  PHINode *PHI = dyn_cast<PHINode>(User);
+  if (!PHI)
+    return User;
+
+  Instruction *InsertPt = 0;
+  for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) {
+    if (PHI->getIncomingValue(i) != Def)
+      continue;
+
+    BasicBlock *InsertBB = PHI->getIncomingBlock(i);
+    if (!InsertPt) {
+      InsertPt = InsertBB->getTerminator();
+      continue;
+    }
+    InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB);
+    InsertPt = InsertBB->getTerminator();
+  }
+  assert(InsertPt && "Missing phi operand");
+  assert((!isa<Instruction>(Def) ||
+          DT->dominates(cast<Instruction>(Def), InsertPt)) &&
+         "def does not dominate all uses");
+  return InsertPt;
+}
+
+//===----------------------------------------------------------------------===//
+// RewriteNonIntegerIVs and helpers. Prefer integer IVs.
+//===----------------------------------------------------------------------===//
+
+/// ConvertToSInt - Convert APF to an integer, if possible.
+static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
+  bool isExact = false;
+  // See if we can convert this to an int64_t
+  uint64_t UIntVal;
+  if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
+                           &isExact) != APFloat::opOK || !isExact)
+    return false;
+  IntVal = UIntVal;
+  return true;
+}
+
+/// HandleFloatingPointIV - If the loop has floating induction variable
+/// then insert corresponding integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+///   bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+///   bar((double)i);
+///
+void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
+  unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
+  unsigned BackEdge     = IncomingEdge^1;
+
+  // Check incoming value.
+  ConstantFP *InitValueVal =
+    dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
+
+  int64_t InitValue;
+  if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
+    return;
+
+  // Check IV increment. Reject this PN if increment operation is not
+  // an add or increment value can not be represented by an integer.
+  BinaryOperator *Incr =
+    dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
+  if (Incr == 0 || Incr->getOpcode() != Instruction::FAdd) return;
+
+  // If this is not an add of the PHI with a constantfp, or if the constant fp
+  // is not an integer, bail out.
+  ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
+  int64_t IncValue;
+  if (IncValueVal == 0 || Incr->getOperand(0) != PN ||
+      !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
+    return;
+
+  // Check Incr uses. One user is PN and the other user is an exit condition
+  // used by the conditional terminator.
+  Value::use_iterator IncrUse = Incr->use_begin();
+  Instruction *U1 = cast<Instruction>(*IncrUse++);
+  if (IncrUse == Incr->use_end()) return;
+  Instruction *U2 = cast<Instruction>(*IncrUse++);
+  if (IncrUse != Incr->use_end()) return;
+
+  // Find exit condition, which is an fcmp.  If it doesn't exist, or if it isn't
+  // only used by a branch, we can't transform it.
+  FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
+  if (!Compare)
+    Compare = dyn_cast<FCmpInst>(U2);
+  if (Compare == 0 || !Compare->hasOneUse() ||
+      !isa<BranchInst>(Compare->use_back()))
+    return;
+
+  BranchInst *TheBr = cast<BranchInst>(Compare->use_back());
+
+  // We need to verify that the branch actually controls the iteration count
+  // of the loop.  If not, the new IV can overflow and no one will notice.
+  // The branch block must be in the loop and one of the successors must be out
+  // of the loop.
+  assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
+  if (!L->contains(TheBr->getParent()) ||
+      (L->contains(TheBr->getSuccessor(0)) &&
+       L->contains(TheBr->getSuccessor(1))))
+    return;
+
+
+  // If it isn't a comparison with an integer-as-fp (the exit value), we can't
+  // transform it.
+  ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
+  int64_t ExitValue;
+  if (ExitValueVal == 0 ||
+      !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
+    return;
+
+  // Find new predicate for integer comparison.
+  CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
+  switch (Compare->getPredicate()) {
+  default: return;  // Unknown comparison.
+  case CmpInst::FCMP_OEQ:
+  case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
+  case CmpInst::FCMP_ONE:
+  case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
+  case CmpInst::FCMP_OGT:
+  case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
+  case CmpInst::FCMP_OGE:
+  case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
+  case CmpInst::FCMP_OLT:
+  case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
+  case CmpInst::FCMP_OLE:
+  case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
+  }
+
+  // We convert the floating point induction variable to a signed i32 value if
+  // we can.  This is only safe if the comparison will not overflow in a way
+  // that won't be trapped by the integer equivalent operations.  Check for this
+  // now.
+  // TODO: We could use i64 if it is native and the range requires it.
+
+  // The start/stride/exit values must all fit in signed i32.
+  if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
+    return;
+
+  // If not actually striding (add x, 0.0), avoid touching the code.
+  if (IncValue == 0)
+    return;
+
+  // Positive and negative strides have different safety conditions.
+  if (IncValue > 0) {
+    // If we have a positive stride, we require the init to be less than the
+    // exit value.
+    if (InitValue >= ExitValue)
+      return;
+
+    uint32_t Range = uint32_t(ExitValue-InitValue);
+    // Check for infinite loop, either:
+    // while (i <= Exit) or until (i > Exit)
+    if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
+      if (++Range == 0) return;  // Range overflows.
+    }
+
+    unsigned Leftover = Range % uint32_t(IncValue);
+
+    // If this is an equality comparison, we require that the strided value
+    // exactly land on the exit value, otherwise the IV condition will wrap
+    // around and do things the fp IV wouldn't.
+    if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+        Leftover != 0)
+      return;
+
+    // If the stride would wrap around the i32 before exiting, we can't
+    // transform the IV.
+    if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
+      return;
+
+  } else {
+    // If we have a negative stride, we require the init to be greater than the
+    // exit value.
+    if (InitValue <= ExitValue)
+      return;
+
+    uint32_t Range = uint32_t(InitValue-ExitValue);
+    // Check for infinite loop, either:
+    // while (i >= Exit) or until (i < Exit)
+    if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
+      if (++Range == 0) return;  // Range overflows.
+    }
+
+    unsigned Leftover = Range % uint32_t(-IncValue);
+
+    // If this is an equality comparison, we require that the strided value
+    // exactly land on the exit value, otherwise the IV condition will wrap
+    // around and do things the fp IV wouldn't.
+    if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+        Leftover != 0)
+      return;
+
+    // If the stride would wrap around the i32 before exiting, we can't
+    // transform the IV.
+    if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
+      return;
+  }
+
+  IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
+
+  // Insert new integer induction variable.
+  PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
+  NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
+                      PN->getIncomingBlock(IncomingEdge));
+
+  Value *NewAdd =
+    BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
+                              Incr->getName()+".int", Incr);
+  NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
+
+  ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
+                                      ConstantInt::get(Int32Ty, ExitValue),
+                                      Compare->getName());
+
+  // In the following deletions, PN may become dead and may be deleted.
+  // Use a WeakVH to observe whether this happens.
+  WeakVH WeakPH = PN;
+
+  // Delete the old floating point exit comparison.  The branch starts using the
+  // new comparison.
+  NewCompare->takeName(Compare);
+  Compare->replaceAllUsesWith(NewCompare);
+  RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI);
+
+  // Delete the old floating point increment.
+  Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
+  RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI);
+
+  // If the FP induction variable still has uses, this is because something else
+  // in the loop uses its value.  In order to canonicalize the induction
+  // variable, we chose to eliminate the IV and rewrite it in terms of an
+  // int->fp cast.
+  //
+  // We give preference to sitofp over uitofp because it is faster on most
+  // platforms.
+  if (WeakPH) {
+    Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
+                                 PN->getParent()->getFirstInsertionPt());
+    PN->replaceAllUsesWith(Conv);
+    RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
+  }
+  Changed = true;
+}
+
+void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {
+  // First step.  Check to see if there are any floating-point recurrences.
+  // If there are, change them into integer recurrences, permitting analysis by
+  // the SCEV routines.
+  //
+  BasicBlock *Header = L->getHeader();
+
+  SmallVector<WeakVH, 8> PHIs;
+  for (BasicBlock::iterator I = Header->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    PHIs.push_back(PN);
+
+  for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+    if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i]))
+      HandleFloatingPointIV(L, PN);
+
+  // If the loop previously had floating-point IV, ScalarEvolution
+  // may not have been able to compute a trip count. Now that we've done some
+  // re-writing, the trip count may be computable.
+  if (Changed)
+    SE->forgetLoop(L);
+}
+
+//===----------------------------------------------------------------------===//
+// RewriteLoopExitValues - Optimize IV users outside the loop.
+// As a side effect, reduces the amount of IV processing within the loop.
+//===----------------------------------------------------------------------===//
+
+/// RewriteLoopExitValues - Check to see if this loop has a computable
+/// loop-invariant execution count.  If so, this means that we can compute the
+/// final value of any expressions that are recurrent in the loop, and
+/// substitute the exit values from the loop into any instructions outside of
+/// the loop that use the final values of the current expressions.
+///
+/// This is mostly redundant with the regular IndVarSimplify activities that
+/// happen later, except that it's more powerful in some cases, because it's
+/// able to brute-force evaluate arbitrary instructions as long as they have
+/// constant operands at the beginning of the loop.
+void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
+  // Verify the input to the pass in already in LCSSA form.
+  assert(L->isLCSSAForm(*DT));
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+
+  // Find all values that are computed inside the loop, but used outside of it.
+  // Because of LCSSA, these values will only occur in LCSSA PHI Nodes.  Scan
+  // the exit blocks of the loop to find them.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitBB = ExitBlocks[i];
+
+    // If there are no PHI nodes in this exit block, then no values defined
+    // inside the loop are used on this path, skip it.
+    PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+    if (!PN) continue;
+
+    unsigned NumPreds = PN->getNumIncomingValues();
+
+    // Iterate over all of the PHI nodes.
+    BasicBlock::iterator BBI = ExitBB->begin();
+    while ((PN = dyn_cast<PHINode>(BBI++))) {
+      if (PN->use_empty())
+        continue; // dead use, don't replace it
+
+      // SCEV only supports integer expressions for now.
+      if (!PN->getType()->isIntegerTy() && !PN->getType()->isPointerTy())
+        continue;
+
+      // It's necessary to tell ScalarEvolution about this explicitly so that
+      // it can walk the def-use list and forget all SCEVs, as it may not be
+      // watching the PHI itself. Once the new exit value is in place, there
+      // may not be a def-use connection between the loop and every instruction
+      // which got a SCEVAddRecExpr for that loop.
+      SE->forgetValue(PN);
+
+      // Iterate over all of the values in all the PHI nodes.
+      for (unsigned i = 0; i != NumPreds; ++i) {
+        // If the value being merged in is not integer or is not defined
+        // in the loop, skip it.
+        Value *InVal = PN->getIncomingValue(i);
+        if (!isa<Instruction>(InVal))
+          continue;
+
+        // If this pred is for a subloop, not L itself, skip it.
+        if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+          continue; // The Block is in a subloop, skip it.
+
+        // Check that InVal is defined in the loop.
+        Instruction *Inst = cast<Instruction>(InVal);
+        if (!L->contains(Inst))
+          continue;
+
+        // Okay, this instruction has a user outside of the current loop
+        // and varies predictably *inside* the loop.  Evaluate the value it
+        // contains when the loop exits, if possible.
+        const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+        if (!SE->isLoopInvariant(ExitValue, L))
+          continue;
+
+        // Computing the value outside of the loop brings no benefit if :
+        //  - it is definitely used inside the loop in a way which can not be
+        //    optimized away.
+        //  - no use outside of the loop can take advantage of hoisting the
+        //    computation out of the loop
+        if (ExitValue->getSCEVType()>=scMulExpr) {
+          unsigned NumHardInternalUses = 0;
+          unsigned NumSoftExternalUses = 0;
+          unsigned NumUses = 0;
+          for (Value::use_iterator IB=Inst->use_begin(), IE=Inst->use_end();
+               IB!=IE && NumUses<=6 ; ++IB) {
+            Instruction *UseInstr = cast<Instruction>(*IB);
+            unsigned Opc = UseInstr->getOpcode();
+            NumUses++;
+            if (L->contains(UseInstr)) {
+              if (Opc == Instruction::Call || Opc == Instruction::Ret)
+                NumHardInternalUses++;
+            } else {
+              if (Opc == Instruction::PHI) {
+                // Do not count the Phi as a use. LCSSA may have inserted
+                // plenty of trivial ones.
+                NumUses--;
+                for (Value::use_iterator PB=UseInstr->use_begin(),
+                                         PE=UseInstr->use_end();
+                     PB!=PE && NumUses<=6 ; ++PB, ++NumUses) {
+                  unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode();
+                  if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret)
+                    NumSoftExternalUses++;
+                }
+                continue;
+              }
+              if (Opc != Instruction::Call && Opc != Instruction::Ret)
+                NumSoftExternalUses++;
+            }
+          }
+          if (NumUses <= 6 && NumHardInternalUses && !NumSoftExternalUses)
+            continue;
+        }
+
+        Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
+
+        DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n'
+                     << "  LoopVal = " << *Inst << "\n");
+
+        if (!isValidRewrite(Inst, ExitVal)) {
+          DeadInsts.push_back(ExitVal);
+          continue;
+        }
+        Changed = true;
+        ++NumReplaced;
+
+        PN->setIncomingValue(i, ExitVal);
+
+        // If this instruction is dead now, delete it. Don't do it now to avoid
+        // invalidating iterators.
+        if (isInstructionTriviallyDead(Inst, TLI))
+          DeadInsts.push_back(Inst);
+
+        if (NumPreds == 1) {
+          // Completely replace a single-pred PHI. This is safe, because the
+          // NewVal won't be variant in the loop, so we don't need an LCSSA phi
+          // node anymore.
+          PN->replaceAllUsesWith(ExitVal);
+          PN->eraseFromParent();
+        }
+      }
+      if (NumPreds != 1) {
+        // Clone the PHI and delete the original one. This lets IVUsers and
+        // any other maps purge the original user from their records.
+        PHINode *NewPN = cast<PHINode>(PN->clone());
+        NewPN->takeName(PN);
+        NewPN->insertBefore(PN);
+        PN->replaceAllUsesWith(NewPN);
+        PN->eraseFromParent();
+      }
+    }
+  }
+
+  // The insertion point instruction may have been deleted; clear it out
+  // so that the rewriter doesn't trip over it later.
+  Rewriter.clearInsertPoint();
+}
+
+//===----------------------------------------------------------------------===//
+//  IV Widening - Extend the width of an IV to cover its widest uses.
+//===----------------------------------------------------------------------===//
+
+namespace {
+  // Collect information about induction variables that are used by sign/zero
+  // extend operations. This information is recorded by CollectExtend and
+  // provides the input to WidenIV.
+  struct WideIVInfo {
+    PHINode *NarrowIV;
+    Type *WidestNativeType; // Widest integer type created [sz]ext
+    bool IsSigned;          // Was an sext user seen before a zext?
+
+    WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {}
+  };
+
+  class WideIVVisitor : public IVVisitor {
+    ScalarEvolution *SE;
+    const DataLayout *TD;
+
+  public:
+    WideIVInfo WI;
+
+    WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV,
+                  const DataLayout *TData) :
+      SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; }
+
+    // Implement the interface used by simplifyUsersOfIV.
+    virtual void visitCast(CastInst *Cast);
+  };
+}
+
+/// visitCast - Update information about the induction variable that is
+/// extended by this sign or zero extend operation. This is used to determine
+/// the final width of the IV before actually widening it.
+void WideIVVisitor::visitCast(CastInst *Cast) {
+  bool IsSigned = Cast->getOpcode() == Instruction::SExt;
+  if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
+    return;
+
+  Type *Ty = Cast->getType();
+  uint64_t Width = SE->getTypeSizeInBits(Ty);
+  if (TD && !TD->isLegalInteger(Width))
+    return;
+
+  if (!WI.WidestNativeType) {
+    WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+    WI.IsSigned = IsSigned;
+    return;
+  }
+
+  // We extend the IV to satisfy the sign of its first user, arbitrarily.
+  if (WI.IsSigned != IsSigned)
+    return;
+
+  if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
+    WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+}
+
+namespace {
+
+/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the
+/// WideIV that computes the same value as the Narrow IV def.  This avoids
+/// caching Use* pointers.
+struct NarrowIVDefUse {
+  Instruction *NarrowDef;
+  Instruction *NarrowUse;
+  Instruction *WideDef;
+
+  NarrowIVDefUse(): NarrowDef(0), NarrowUse(0), WideDef(0) {}
+
+  NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD):
+    NarrowDef(ND), NarrowUse(NU), WideDef(WD) {}
+};
+
+/// WidenIV - The goal of this transform is to remove sign and zero extends
+/// without creating any new induction variables. To do this, it creates a new
+/// phi of the wider type and redirects all users, either removing extends or
+/// inserting truncs whenever we stop propagating the type.
+///
+class WidenIV {
+  // Parameters
+  PHINode *OrigPhi;
+  Type *WideType;
+  bool IsSigned;
+
+  // Context
+  LoopInfo        *LI;
+  Loop            *L;
+  ScalarEvolution *SE;
+  DominatorTree   *DT;
+
+  // Result
+  PHINode *WidePhi;
+  Instruction *WideInc;
+  const SCEV *WideIncExpr;
+  SmallVectorImpl<WeakVH> &DeadInsts;
+
+  SmallPtrSet<Instruction*,16> Widened;
+  SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
+
+public:
+  WidenIV(const WideIVInfo &WI, LoopInfo *LInfo,
+          ScalarEvolution *SEv, DominatorTree *DTree,
+          SmallVectorImpl<WeakVH> &DI) :
+    OrigPhi(WI.NarrowIV),
+    WideType(WI.WidestNativeType),
+    IsSigned(WI.IsSigned),
+    LI(LInfo),
+    L(LI->getLoopFor(OrigPhi->getParent())),
+    SE(SEv),
+    DT(DTree),
+    WidePhi(0),
+    WideInc(0),
+    WideIncExpr(0),
+    DeadInsts(DI) {
+    assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
+  }
+
+  PHINode *CreateWideIV(SCEVExpander &Rewriter);
+
+protected:
+  Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned,
+                   Instruction *Use);
+
+  Instruction *CloneIVUser(NarrowIVDefUse DU);
+
+  const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse);
+
+  const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU);
+
+  Instruction *WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
+
+  void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
+};
+} // anonymous namespace
+
+/// isLoopInvariant - Perform a quick domtree based check for loop invariance
+/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems
+/// gratuitous for this purpose.
+static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) {
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  if (!Inst)
+    return true;
+
+  return DT->properlyDominates(Inst->getParent(), L->getHeader());
+}
+
+Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned,
+                          Instruction *Use) {
+  // Set the debug location and conservative insertion point.
+  IRBuilder<> Builder(Use);
+  // Hoist the insertion point into loop preheaders as far as possible.
+  for (const Loop *L = LI->getLoopFor(Use->getParent());
+       L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT);
+       L = L->getParentLoop())
+    Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+
+  return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
+                    Builder.CreateZExt(NarrowOper, WideType);
+}
+
+/// CloneIVUser - Instantiate a wide operation to replace a narrow
+/// operation. This only needs to handle operations that can evaluation to
+/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone.
+Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) {
+  unsigned Opcode = DU.NarrowUse->getOpcode();
+  switch (Opcode) {
+  default:
+    return 0;
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::Sub:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n");
+
+    // Replace NarrowDef operands with WideDef. Otherwise, we don't know
+    // anything about the narrow operand yet so must insert a [sz]ext. It is
+    // probably loop invariant and will be folded or hoisted. If it actually
+    // comes from a widened IV, it should be removed during a future call to
+    // WidenIVUse.
+    Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef :
+      getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse);
+    Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef :
+      getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse);
+
+    BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse);
+    BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(),
+                                                    LHS, RHS,
+                                                    NarrowBO->getName());
+    IRBuilder<> Builder(DU.NarrowUse);
+    Builder.Insert(WideBO);
+    if (const OverflowingBinaryOperator *OBO =
+        dyn_cast<OverflowingBinaryOperator>(NarrowBO)) {
+      if (OBO->hasNoUnsignedWrap()) WideBO->setHasNoUnsignedWrap();
+      if (OBO->hasNoSignedWrap()) WideBO->setHasNoSignedWrap();
+    }
+    return WideBO;
+  }
+}
+
+/// No-wrap operations can transfer sign extension of their result to their
+/// operands. Generate the SCEV value for the widened operation without
+/// actually modifying the IR yet. If the expression after extending the
+/// operands is an AddRec for this loop, return it.
+const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) {
+  // Handle the common case of add<nsw/nuw>
+  if (DU.NarrowUse->getOpcode() != Instruction::Add)
+    return 0;
+
+  // One operand (NarrowDef) has already been extended to WideDef. Now determine
+  // if extending the other will lead to a recurrence.
+  unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
+  assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
+
+  const SCEV *ExtendOperExpr = 0;
+  const OverflowingBinaryOperator *OBO =
+    cast<OverflowingBinaryOperator>(DU.NarrowUse);
+  if (IsSigned && OBO->hasNoSignedWrap())
+    ExtendOperExpr = SE->getSignExtendExpr(
+      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+  else if(!IsSigned && OBO->hasNoUnsignedWrap())
+    ExtendOperExpr = SE->getZeroExtendExpr(
+      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+  else
+    return 0;
+
+  // When creating this AddExpr, don't apply the current operations NSW or NUW
+  // flags. This instruction may be guarded by control flow that the no-wrap
+  // behavior depends on. Non-control-equivalent instructions can be mapped to
+  // the same SCEV expression, and it would be incorrect to transfer NSW/NUW
+  // semantics to those operations.
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(
+    SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr));
+
+  if (!AddRec || AddRec->getLoop() != L)
+    return 0;
+  return AddRec;
+}
+
+/// GetWideRecurrence - Is this instruction potentially interesting from
+/// IVUsers' perspective after widening it's type? In other words, can the
+/// extend be safely hoisted out of the loop with SCEV reducing the value to a
+/// recurrence on the same loop. If so, return the sign or zero extended
+/// recurrence. Otherwise return NULL.
+const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) {
+  if (!SE->isSCEVable(NarrowUse->getType()))
+    return 0;
+
+  const SCEV *NarrowExpr = SE->getSCEV(NarrowUse);
+  if (SE->getTypeSizeInBits(NarrowExpr->getType())
+      >= SE->getTypeSizeInBits(WideType)) {
+    // NarrowUse implicitly widens its operand. e.g. a gep with a narrow
+    // index. So don't follow this use.
+    return 0;
+  }
+
+  const SCEV *WideExpr = IsSigned ?
+    SE->getSignExtendExpr(NarrowExpr, WideType) :
+    SE->getZeroExtendExpr(NarrowExpr, WideType);
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
+  if (!AddRec || AddRec->getLoop() != L)
+    return 0;
+  return AddRec;
+}
+
+/// WidenIVUse - Determine whether an individual user of the narrow IV can be
+/// widened. If so, return the wide clone of the user.
+Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
+
+  // Stop traversing the def-use chain at inner-loop phis or post-loop phis.
+  if (isa<PHINode>(DU.NarrowUse) &&
+      LI->getLoopFor(DU.NarrowUse->getParent()) != L)
+    return 0;
+
+  // Our raison d'etre! Eliminate sign and zero extension.
+  if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) {
+    Value *NewDef = DU.WideDef;
+    if (DU.NarrowUse->getType() != WideType) {
+      unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
+      unsigned IVWidth = SE->getTypeSizeInBits(WideType);
+      if (CastWidth < IVWidth) {
+        // The cast isn't as wide as the IV, so insert a Trunc.
+        IRBuilder<> Builder(DU.NarrowUse);
+        NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType());
+      }
+      else {
+        // A wider extend was hidden behind a narrower one. This may induce
+        // another round of IV widening in which the intermediate IV becomes
+        // dead. It should be very rare.
+        DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
+              << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+        DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+        NewDef = DU.NarrowUse;
+      }
+    }
+    if (NewDef != DU.NarrowUse) {
+      DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+            << " replaced by " << *DU.WideDef << "\n");
+      ++NumElimExt;
+      DU.NarrowUse->replaceAllUsesWith(NewDef);
+      DeadInsts.push_back(DU.NarrowUse);
+    }
+    // Now that the extend is gone, we want to expose it's uses for potential
+    // further simplification. We don't need to directly inform SimplifyIVUsers
+    // of the new users, because their parent IV will be processed later as a
+    // new loop phi. If we preserved IVUsers analysis, we would also want to
+    // push the uses of WideDef here.
+
+    // No further widening is needed. The deceased [sz]ext had done it for us.
+    return 0;
+  }
+
+  // Does this user itself evaluate to a recurrence after widening?
+  const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse);
+  if (!WideAddRec) {
+      WideAddRec = GetExtendedOperandRecurrence(DU);
+  }
+  if (!WideAddRec) {
+    // This user does not evaluate to a recurence after widening, so don't
+    // follow it. Instead insert a Trunc to kill off the original use,
+    // eventually isolating the original narrow IV so it can be removed.
+    IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT));
+    Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
+    DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
+    return 0;
+  }
+  // Assume block terminators cannot evaluate to a recurrence. We can't to
+  // insert a Trunc after a terminator if there happens to be a critical edge.
+  assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() &&
+         "SCEV is not expected to evaluate a block terminator");
+
+  // Reuse the IV increment that SCEVExpander created as long as it dominates
+  // NarrowUse.
+  Instruction *WideUse = 0;
+  if (WideAddRec == WideIncExpr
+      && Rewriter.hoistIVInc(WideInc, DU.NarrowUse))
+    WideUse = WideInc;
+  else {
+    WideUse = CloneIVUser(DU);
+    if (!WideUse)
+      return 0;
+  }
+  // Evaluation of WideAddRec ensured that the narrow expression could be
+  // extended outside the loop without overflow. This suggests that the wide use
+  // evaluates to the same expression as the extended narrow use, but doesn't
+  // absolutely guarantee it. Hence the following failsafe check. In rare cases
+  // where it fails, we simply throw away the newly created wide use.
+  if (WideAddRec != SE->getSCEV(WideUse)) {
+    DEBUG(dbgs() << "Wide use expression mismatch: " << *WideUse
+          << ": " << *SE->getSCEV(WideUse) << " != " << *WideAddRec << "\n");
+    DeadInsts.push_back(WideUse);
+    return 0;
+  }
+
+  // Returning WideUse pushes it on the worklist.
+  return WideUse;
+}
+
+/// pushNarrowIVUsers - Add eligible users of NarrowDef to NarrowIVUsers.
+///
+void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
+  for (Value::use_iterator UI = NarrowDef->use_begin(),
+         UE = NarrowDef->use_end(); UI != UE; ++UI) {
+    Instruction *NarrowUse = cast<Instruction>(*UI);
+
+    // Handle data flow merges and bizarre phi cycles.
+    if (!Widened.insert(NarrowUse))
+      continue;
+
+    NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUse, WideDef));
+  }
+}
+
+/// CreateWideIV - Process a single induction variable. First use the
+/// SCEVExpander to create a wide induction variable that evaluates to the same
+/// recurrence as the original narrow IV. Then use a worklist to forward
+/// traverse the narrow IV's def-use chain. After WidenIVUse has processed all
+/// interesting IV users, the narrow IV will be isolated for removal by
+/// DeleteDeadPHIs.
+///
+/// It would be simpler to delete uses as they are processed, but we must avoid
+/// invalidating SCEV expressions.
+///
+PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {
+  // Is this phi an induction variable?
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(OrigPhi));
+  if (!AddRec)
+    return NULL;
+
+  // Widen the induction variable expression.
+  const SCEV *WideIVExpr = IsSigned ?
+    SE->getSignExtendExpr(AddRec, WideType) :
+    SE->getZeroExtendExpr(AddRec, WideType);
+
+  assert(SE->getEffectiveSCEVType(WideIVExpr->getType()) == WideType &&
+         "Expect the new IV expression to preserve its type");
+
+  // Can the IV be extended outside the loop without overflow?
+  AddRec = dyn_cast<SCEVAddRecExpr>(WideIVExpr);
+  if (!AddRec || AddRec->getLoop() != L)
+    return NULL;
+
+  // An AddRec must have loop-invariant operands. Since this AddRec is
+  // materialized by a loop header phi, the expression cannot have any post-loop
+  // operands, so they must dominate the loop header.
+  assert(SE->properlyDominates(AddRec->getStart(), L->getHeader()) &&
+         SE->properlyDominates(AddRec->getStepRecurrence(*SE), L->getHeader())
+         && "Loop header phi recurrence inputs do not dominate the loop");
+
+  // The rewriter provides a value for the desired IV expression. This may
+  // either find an existing phi or materialize a new one. Either way, we
+  // expect a well-formed cyclic phi-with-increments. i.e. any operand not part
+  // of the phi-SCC dominates the loop entry.
+  Instruction *InsertPt = L->getHeader()->begin();
+  WidePhi = cast<PHINode>(Rewriter.expandCodeFor(AddRec, WideType, InsertPt));
+
+  // Remembering the WideIV increment generated by SCEVExpander allows
+  // WidenIVUse to reuse it when widening the narrow IV's increment. We don't
+  // employ a general reuse mechanism because the call above is the only call to
+  // SCEVExpander. Henceforth, we produce 1-to-1 narrow to wide uses.
+  if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+    WideInc =
+      cast<Instruction>(WidePhi->getIncomingValueForBlock(LatchBlock));
+    WideIncExpr = SE->getSCEV(WideInc);
+  }
+
+  DEBUG(dbgs() << "Wide IV: " << *WidePhi << "\n");
+  ++NumWidened;
+
+  // Traverse the def-use chain using a worklist starting at the original IV.
+  assert(Widened.empty() && NarrowIVUsers.empty() && "expect initial state" );
+
+  Widened.insert(OrigPhi);
+  pushNarrowIVUsers(OrigPhi, WidePhi);
+
+  while (!NarrowIVUsers.empty()) {
+    NarrowIVDefUse DU = NarrowIVUsers.pop_back_val();
+
+    // Process a def-use edge. This may replace the use, so don't hold a
+    // use_iterator across it.
+    Instruction *WideUse = WidenIVUse(DU, Rewriter);
+
+    // Follow all def-use edges from the previous narrow use.
+    if (WideUse)
+      pushNarrowIVUsers(DU.NarrowUse, WideUse);
+
+    // WidenIVUse may have removed the def-use edge.
+    if (DU.NarrowDef->use_empty())
+      DeadInsts.push_back(DU.NarrowDef);
+  }
+  return WidePhi;
+}
+
+//===----------------------------------------------------------------------===//
+//  Simplification of IV users based on SCEV evaluation.
+//===----------------------------------------------------------------------===//
+
+
+/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV
+/// users. Each successive simplification may push more users which may
+/// themselves be candidates for simplification.
+///
+/// Sign/Zero extend elimination is interleaved with IV simplification.
+///
+void IndVarSimplify::SimplifyAndExtend(Loop *L,
+                                       SCEVExpander &Rewriter,
+                                       LPPassManager &LPM) {
+  SmallVector<WideIVInfo, 8> WideIVs;
+
+  SmallVector<PHINode*, 8> LoopPhis;
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    LoopPhis.push_back(cast<PHINode>(I));
+  }
+  // Each round of simplification iterates through the SimplifyIVUsers worklist
+  // for all current phis, then determines whether any IVs can be
+  // widened. Widening adds new phis to LoopPhis, inducing another round of
+  // simplification on the wide IVs.
+  while (!LoopPhis.empty()) {
+    // Evaluate as many IV expressions as possible before widening any IVs. This
+    // forces SCEV to set no-wrap flags before evaluating sign/zero
+    // extension. The first time SCEV attempts to normalize sign/zero extension,
+    // the result becomes final. So for the most predictable results, we delay
+    // evaluation of sign/zero extend evaluation until needed, and avoid running
+    // other SCEV based analysis prior to SimplifyAndExtend.
+    do {
+      PHINode *CurrIV = LoopPhis.pop_back_val();
+
+      // Information about sign/zero extensions of CurrIV.
+      WideIVVisitor WIV(CurrIV, SE, TD);
+
+      Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV);
+
+      if (WIV.WI.WidestNativeType) {
+        WideIVs.push_back(WIV.WI);
+      }
+    } while(!LoopPhis.empty());
+
+    for (; !WideIVs.empty(); WideIVs.pop_back()) {
+      WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts);
+      if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) {
+        Changed = true;
+        LoopPhis.push_back(WidePhi);
+      }
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  LinearFunctionTestReplace and its kin. Rewrite the loop exit condition.
+//===----------------------------------------------------------------------===//
+
+/// Check for expressions that ScalarEvolution generates to compute
+/// BackedgeTakenInfo. If these expressions have not been reduced, then
+/// expanding them may incur additional cost (albeit in the loop preheader).
+static bool isHighCostExpansion(const SCEV *S, BranchInst *BI,
+                                SmallPtrSet<const SCEV*, 8> &Processed,
+                                ScalarEvolution *SE) {
+  if (!Processed.insert(S))
+    return false;
+
+  // If the backedge-taken count is a UDiv, it's very likely a UDiv that
+  // ScalarEvolution's HowFarToZero or HowManyLessThans produced to compute a
+  // precise expression, rather than a UDiv from the user's code. If we can't
+  // find a UDiv in the code with some simple searching, assume the former and
+  // forego rewriting the loop.
+  if (isa<SCEVUDivExpr>(S)) {
+    ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition());
+    if (!OrigCond) return true;
+    const SCEV *R = SE->getSCEV(OrigCond->getOperand(1));
+    R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1));
+    if (R != S) {
+      const SCEV *L = SE->getSCEV(OrigCond->getOperand(0));
+      L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1));
+      if (L != S)
+        return true;
+    }
+  }
+
+  // Recurse past add expressions, which commonly occur in the
+  // BackedgeTakenCount. They may already exist in program code, and if not,
+  // they are not too expensive rematerialize.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+         I != E; ++I) {
+      if (isHighCostExpansion(*I, BI, Processed, SE))
+        return true;
+    }
+    return false;
+  }
+
+  // HowManyLessThans uses a Max expression whenever the loop is not guarded by
+  // the exit condition.
+  if (isa<SCEVSMaxExpr>(S) || isa<SCEVUMaxExpr>(S))
+    return true;
+
+  // If we haven't recognized an expensive SCEV pattern, assume it's an
+  // expression produced by program code.
+  return false;
+}
+
+/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken
+/// count expression can be safely and cheaply expanded into an instruction
+/// sequence that can be used by LinearFunctionTestReplace.
+///
+/// TODO: This fails for pointer-type loop counters with greater than one byte
+/// strides, consequently preventing LFTR from running. For the purpose of LFTR
+/// we could skip this check in the case that the LFTR loop counter (chosen by
+/// FindLoopCounter) is also pointer type. Instead, we could directly convert
+/// the loop test to an inequality test by checking the target data's alignment
+/// of element types (given that the initial pointer value originates from or is
+/// used by ABI constrained operation, as opposed to inttoptr/ptrtoint).
+/// However, we don't yet have a strong motivation for converting loop tests
+/// into inequality tests.
+static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) {
+  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount) ||
+      BackedgeTakenCount->isZero())
+    return false;
+
+  if (!L->getExitingBlock())
+    return false;
+
+  // Can't rewrite non-branch yet.
+  BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  if (!BI)
+    return false;
+
+  SmallPtrSet<const SCEV*, 8> Processed;
+  if (isHighCostExpansion(BackedgeTakenCount, BI, Processed, SE))
+    return false;
+
+  return true;
+}
+
+/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop
+/// invariant value to the phi.
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
+  Instruction *IncI = dyn_cast<Instruction>(IncV);
+  if (!IncI)
+    return 0;
+
+  switch (IncI->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+    break;
+  case Instruction::GetElementPtr:
+    // An IV counter must preserve its type.
+    if (IncI->getNumOperands() == 2)
+      break;
+  default:
+    return 0;
+  }
+
+  PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
+  if (Phi && Phi->getParent() == L->getHeader()) {
+    if (isLoopInvariant(IncI->getOperand(1), L, DT))
+      return Phi;
+    return 0;
+  }
+  if (IncI->getOpcode() == Instruction::GetElementPtr)
+    return 0;
+
+  // Allow add/sub to be commuted.
+  Phi = dyn_cast<PHINode>(IncI->getOperand(1));
+  if (Phi && Phi->getParent() == L->getHeader()) {
+    if (isLoopInvariant(IncI->getOperand(0), L, DT))
+      return Phi;
+  }
+  return 0;
+}
+
+/// Return the compare guarding the loop latch, or NULL for unrecognized tests.
+static ICmpInst *getLoopTest(Loop *L) {
+  assert(L->getExitingBlock() && "expected loop exit");
+
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  // Don't bother with LFTR if the loop is not properly simplified.
+  if (!LatchBlock)
+    return 0;
+
+  BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  assert(BI && "expected exit branch");
+
+  return dyn_cast<ICmpInst>(BI->getCondition());
+}
+
+/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show
+/// that the current exit test is already sufficiently canonical.
+static bool needsLFTR(Loop *L, DominatorTree *DT) {
+  // Do LFTR to simplify the exit condition to an ICMP.
+  ICmpInst *Cond = getLoopTest(L);
+  if (!Cond)
+    return true;
+
+  // Do LFTR to simplify the exit ICMP to EQ/NE
+  ICmpInst::Predicate Pred = Cond->getPredicate();
+  if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
+    return true;
+
+  // Look for a loop invariant RHS
+  Value *LHS = Cond->getOperand(0);
+  Value *RHS = Cond->getOperand(1);
+  if (!isLoopInvariant(RHS, L, DT)) {
+    if (!isLoopInvariant(LHS, L, DT))
+      return true;
+    std::swap(LHS, RHS);
+  }
+  // Look for a simple IV counter LHS
+  PHINode *Phi = dyn_cast<PHINode>(LHS);
+  if (!Phi)
+    Phi = getLoopPhiForCounter(LHS, L, DT);
+
+  if (!Phi)
+    return true;
+
+  // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
+  int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
+  if (Idx < 0)
+    return true;
+
+  // Do LFTR if the exit condition's IV is *not* a simple counter.
+  Value *IncV = Phi->getIncomingValue(Idx);
+  return Phi != getLoopPhiForCounter(IncV, L, DT);
+}
+
+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited,
+                               unsigned Depth) {
+  if (isa<Constant>(V))
+    return !isa<UndefValue>(V);
+
+  if (Depth >= 6)
+    return false;
+
+  // Conservatively handle non-constant non-instructions. For example, Arguments
+  // may be undef.
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  // Load and return values may be undef.
+  if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+    return false;
+
+  // Optimistically handle other instructions.
+  for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) {
+    if (!Visited.insert(*OI))
+      continue;
+    if (!hasConcreteDefImpl(*OI, Visited, Depth+1))
+      return false;
+  }
+  return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+  SmallPtrSet<Value*, 8> Visited;
+  Visited.insert(V);
+  return hasConcreteDefImpl(V, Visited, 0);
+}
+
+/// AlmostDeadIV - Return true if this IV has any uses other than the (soon to
+/// be rewritten) loop exit test.
+static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
+  int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+  Value *IncV = Phi->getIncomingValue(LatchIdx);
+
+  for (Value::use_iterator UI = Phi->use_begin(), UE = Phi->use_end();
+       UI != UE; ++UI) {
+    if (*UI != Cond && *UI != IncV) return false;
+  }
+
+  for (Value::use_iterator UI = IncV->use_begin(), UE = IncV->use_end();
+       UI != UE; ++UI) {
+    if (*UI != Cond && *UI != Phi) return false;
+  }
+  return true;
+}
+
+/// FindLoopCounter - Find an affine IV in canonical form.
+///
+/// BECount may be an i8* pointer type. The pointer difference is already
+/// valid count without scaling the address stride, so it remains a pointer
+/// expression as far as SCEV is concerned.
+///
+/// Currently only valid for LFTR. See the comments on hasConcreteDef below.
+///
+/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
+///
+/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
+/// This is difficult in general for SCEV because of potential overflow. But we
+/// could at least handle constant BECounts.
+static PHINode *
+FindLoopCounter(Loop *L, const SCEV *BECount,
+                ScalarEvolution *SE, DominatorTree *DT, const DataLayout *TD) {
+  uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
+
+  Value *Cond =
+    cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition();
+
+  // Loop over all of the PHI nodes, looking for a simple counter.
+  PHINode *BestPhi = 0;
+  const SCEV *BestInit = 0;
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  assert(LatchBlock && "needsLFTR should guarantee a loop latch");
+
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    PHINode *Phi = cast<PHINode>(I);
+    if (!SE->isSCEVable(Phi->getType()))
+      continue;
+
+    // Avoid comparing an integer IV against a pointer Limit.
+    if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
+      continue;
+
+    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+    if (!AR || AR->getLoop() != L || !AR->isAffine())
+      continue;
+
+    // AR may be a pointer type, while BECount is an integer type.
+    // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
+    // AR may not be a narrower type, or we may never exit.
+    uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
+    if (PhiWidth < BCWidth || (TD && !TD->isLegalInteger(PhiWidth)))
+      continue;
+
+    const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+    if (!Step || !Step->isOne())
+      continue;
+
+    int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+    Value *IncV = Phi->getIncomingValue(LatchIdx);
+    if (getLoopPhiForCounter(IncV, L, DT) != Phi)
+      continue;
+
+    // Avoid reusing a potentially undef value to compute other values that may
+    // have originally had a concrete definition.
+    if (!hasConcreteDef(Phi)) {
+      // We explicitly allow unknown phis as long as they are already used by
+      // the loop test. In this case we assume that performing LFTR could not
+      // increase the number of undef users.
+      if (ICmpInst *Cond = getLoopTest(L)) {
+        if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT)
+            && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) {
+          continue;
+        }
+      }
+    }
+    const SCEV *Init = AR->getStart();
+
+    if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
+      // Don't force a live loop counter if another IV can be used.
+      if (AlmostDeadIV(Phi, LatchBlock, Cond))
+        continue;
+
+      // Prefer to count-from-zero. This is a more "canonical" counter form. It
+      // also prefers integer to pointer IVs.
+      if (BestInit->isZero() != Init->isZero()) {
+        if (BestInit->isZero())
+          continue;
+      }
+      // If two IVs both count from zero or both count from nonzero then the
+      // narrower is likely a dead phi that has been widened. Use the wider phi
+      // to allow the other to be eliminated.
+      else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
+        continue;
+    }
+    BestPhi = Phi;
+    BestInit = Init;
+  }
+  return BestPhi;
+}
+
+/// genLoopLimit - Help LinearFunctionTestReplace by generating a value that
+/// holds the RHS of the new loop test.
+static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
+                           SCEVExpander &Rewriter, ScalarEvolution *SE) {
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+  assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter");
+  const SCEV *IVInit = AR->getStart();
+
+  // IVInit may be a pointer while IVCount is an integer when FindLoopCounter
+  // finds a valid pointer IV. Sign extend BECount in order to materialize a
+  // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing
+  // the existing GEPs whenever possible.
+  if (IndVar->getType()->isPointerTy()
+      && !IVCount->getType()->isPointerTy()) {
+
+    Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
+    const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy);
+
+    // Expand the code for the iteration count.
+    assert(SE->isLoopInvariant(IVOffset, L) &&
+           "Computed iteration count is not loop invariant!");
+    BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+    Value *GEPOffset = Rewriter.expandCodeFor(IVOffset, OfsTy, BI);
+
+    Value *GEPBase = IndVar->getIncomingValueForBlock(L->getLoopPreheader());
+    assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter");
+    // We could handle pointer IVs other than i8*, but we need to compensate for
+    // gep index scaling. See canExpandBackedgeTakenCount comments.
+    assert(SE->getSizeOfExpr(
+             cast<PointerType>(GEPBase->getType())->getElementType())->isOne()
+           && "unit stride pointer IV must be i8*");
+
+    IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+    return Builder.CreateGEP(GEPBase, GEPOffset, "lftr.limit");
+  }
+  else {
+    // In any other case, convert both IVInit and IVCount to integers before
+    // comparing. This may result in SCEV expension of pointers, but in practice
+    // SCEV will fold the pointer arithmetic away as such:
+    // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
+    //
+    // Valid Cases: (1) both integers is most common; (2) both may be pointers
+    // for simple memset-style loops; (3) IVInit is an integer and IVCount is a
+    // pointer may occur when enable-iv-rewrite generates a canonical IV on top
+    // of case #2.
+
+    const SCEV *IVLimit = 0;
+    // For unit stride, IVCount = Start + BECount with 2's complement overflow.
+    // For non-zero Start, compute IVCount here.
+    if (AR->getStart()->isZero())
+      IVLimit = IVCount;
+    else {
+      assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
+      const SCEV *IVInit = AR->getStart();
+
+      // For integer IVs, truncate the IV before computing IVInit + BECount.
+      if (SE->getTypeSizeInBits(IVInit->getType())
+          > SE->getTypeSizeInBits(IVCount->getType()))
+        IVInit = SE->getTruncateExpr(IVInit, IVCount->getType());
+
+      IVLimit = SE->getAddExpr(IVInit, IVCount);
+    }
+    // Expand the code for the iteration count.
+    BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+    IRBuilder<> Builder(BI);
+    assert(SE->isLoopInvariant(IVLimit, L) &&
+           "Computed iteration count is not loop invariant!");
+    // Ensure that we generate the same type as IndVar, or a smaller integer
+    // type. In the presence of null pointer values, we have an integer type
+    // SCEV expression (IVInit) for a pointer type IV value (IndVar).
+    Type *LimitTy = IVCount->getType()->isPointerTy() ?
+      IndVar->getType() : IVCount->getType();
+    return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
+  }
+}
+
+/// LinearFunctionTestReplace - This method rewrites the exit condition of the
+/// loop to be a canonical != comparison against the incremented loop induction
+/// variable.  This pass is able to rewrite the exit tests of any loop where the
+/// SCEV analysis can determine a loop-invariant trip count of the loop, which
+/// is actually a much broader range than just linear tests.
+Value *IndVarSimplify::
+LinearFunctionTestReplace(Loop *L,
+                          const SCEV *BackedgeTakenCount,
+                          PHINode *IndVar,
+                          SCEVExpander &Rewriter) {
+  assert(canExpandBackedgeTakenCount(L, SE) && "precondition");
+
+  // LFTR can ignore IV overflow and truncate to the width of
+  // BECount. This avoids materializing the add(zext(add)) expression.
+  Type *CntTy = BackedgeTakenCount->getType();
+
+  const SCEV *IVCount = BackedgeTakenCount;
+
+  // If the exiting block is the same as the backedge block, we prefer to
+  // compare against the post-incremented value, otherwise we must compare
+  // against the preincremented value.
+  Value *CmpIndVar;
+  if (L->getExitingBlock() == L->getLoopLatch()) {
+    // Add one to the "backedge-taken" count to get the trip count.
+    // If this addition may overflow, we have to be more pessimistic and
+    // cast the induction variable before doing the add.
+    const SCEV *N =
+      SE->getAddExpr(IVCount, SE->getConstant(IVCount->getType(), 1));
+    if (CntTy == IVCount->getType())
+      IVCount = N;
+    else {
+      const SCEV *Zero = SE->getConstant(IVCount->getType(), 0);
+      if ((isa<SCEVConstant>(N) && !N->isZero()) ||
+          SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) {
+        // No overflow. Cast the sum.
+        IVCount = SE->getTruncateOrZeroExtend(N, CntTy);
+      } else {
+        // Potential overflow. Cast before doing the add.
+        IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy);
+        IVCount = SE->getAddExpr(IVCount, SE->getConstant(CntTy, 1));
+      }
+    }
+    // The BackedgeTaken expression contains the number of times that the
+    // backedge branches to the loop header.  This is one less than the
+    // number of times the loop executes, so use the incremented indvar.
+    CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
+  } else {
+    // We must use the preincremented value...
+    IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy);
+    CmpIndVar = IndVar;
+  }
+
+  Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE);
+  assert(ExitCnt->getType()->isPointerTy() == IndVar->getType()->isPointerTy()
+         && "genLoopLimit missed a cast");
+
+  // Insert a new icmp_ne or icmp_eq instruction before the branch.
+  BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+  ICmpInst::Predicate P;
+  if (L->contains(BI->getSuccessor(0)))
+    P = ICmpInst::ICMP_NE;
+  else
+    P = ICmpInst::ICMP_EQ;
+
+  DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+               << "      LHS:" << *CmpIndVar << '\n'
+               << "       op:\t"
+               << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
+               << "      RHS:\t" << *ExitCnt << "\n"
+               << "  IVCount:\t" << *IVCount << "\n");
+
+  IRBuilder<> Builder(BI);
+  if (SE->getTypeSizeInBits(CmpIndVar->getType())
+      > SE->getTypeSizeInBits(ExitCnt->getType())) {
+    CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
+                                    "lftr.wideiv");
+  }
+
+  Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
+  Value *OrigCond = BI->getCondition();
+  // It's tempting to use replaceAllUsesWith here to fully replace the old
+  // comparison, but that's not immediately safe, since users of the old
+  // comparison may not be dominated by the new comparison. Instead, just
+  // update the branch to use the new comparison; in the common case this
+  // will make old comparison dead.
+  BI->setCondition(Cond);
+  DeadInsts.push_back(OrigCond);
+
+  ++NumLFTR;
+  Changed = true;
+  return Cond;
+}
+
+//===----------------------------------------------------------------------===//
+//  SinkUnusedInvariants. A late subpass to cleanup loop preheaders.
+//===----------------------------------------------------------------------===//
+
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
+void IndVarSimplify::SinkUnusedInvariants(Loop *L) {
+  BasicBlock *ExitBlock = L->getExitBlock();
+  if (!ExitBlock) return;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) return;
+
+  Instruction *InsertPt = ExitBlock->getFirstInsertionPt();
+  BasicBlock::iterator I = Preheader->getTerminator();
+  while (I != Preheader->begin()) {
+    --I;
+    // New instructions were inserted at the end of the preheader.
+    if (isa<PHINode>(I))
+      break;
+
+    // Don't move instructions which might have side effects, since the side
+    // effects need to complete before instructions inside the loop.  Also don't
+    // move instructions which might read memory, since the loop may modify
+    // memory. Note that it's okay if the instruction might have undefined
+    // behavior: LoopSimplify guarantees that the preheader dominates the exit
+    // block.
+    if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+      continue;
+
+    // Skip debug info intrinsics.
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    // Skip landingpad instructions.
+    if (isa<LandingPadInst>(I))
+      continue;
+
+    // Don't sink alloca: we never want to sink static alloca's out of the
+    // entry block, and correctly sinking dynamic alloca's requires
+    // checks for stacksave/stackrestore intrinsics.
+    // FIXME: Refactor this check somehow?
+    if (isa<AllocaInst>(I))
+      continue;
+
+    // Determine if there is a use in or before the loop (direct or
+    // otherwise).
+    bool UsedInLoop = false;
+    for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+         UI != UE; ++UI) {
+      User *U = *UI;
+      BasicBlock *UseBB = cast<Instruction>(U)->getParent();
+      if (PHINode *P = dyn_cast<PHINode>(U)) {
+        unsigned i =
+          PHINode::getIncomingValueNumForOperand(UI.getOperandNo());
+        UseBB = P->getIncomingBlock(i);
+      }
+      if (UseBB == Preheader || L->contains(UseBB)) {
+        UsedInLoop = true;
+        break;
+      }
+    }
+
+    // If there is, the def must remain in the preheader.
+    if (UsedInLoop)
+      continue;
+
+    // Otherwise, sink it to the exit block.
+    Instruction *ToMove = I;
+    bool Done = false;
+
+    if (I != Preheader->begin()) {
+      // Skip debug info intrinsics.
+      do {
+        --I;
+      } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
+
+      if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
+        Done = true;
+    } else {
+      Done = true;
+    }
+
+    ToMove->moveBefore(InsertPt);
+    if (Done) break;
+    InsertPt = ToMove;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  IndVarSimplify driver. Manage several subpasses of IV simplification.
+//===----------------------------------------------------------------------===//
+
+bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  // If LoopSimplify form is not available, stay out of trouble. Some notes:
+  //  - LSR currently only supports LoopSimplify-form loops. Indvars'
+  //    canonicalization can be a pessimization without LSR to "clean up"
+  //    afterwards.
+  //  - We depend on having a preheader; in particular,
+  //    Loop::getCanonicalInductionVariable only supports loops with preheaders,
+  //    and we're in trouble if we can't find the induction variable even when
+  //    we've manually inserted one.
+  if (!L->isLoopSimplifyForm())
+    return false;
+
+  LI = &getAnalysis<LoopInfo>();
+  SE = &getAnalysis<ScalarEvolution>();
+  DT = &getAnalysis<DominatorTree>();
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+
+  DeadInsts.clear();
+  Changed = false;
+
+  // If there are any floating-point recurrences, attempt to
+  // transform them to use integer recurrences.
+  RewriteNonIntegerIVs(L);
+
+  const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+
+  // Create a rewriter object which we'll use to transform the code with.
+  SCEVExpander Rewriter(*SE, "indvars");
+#ifndef NDEBUG
+  Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+
+  // Eliminate redundant IV users.
+  //
+  // Simplification works best when run before other consumers of SCEV. We
+  // attempt to avoid evaluating SCEVs for sign/zero extend operations until
+  // other expressions involving loop IVs have been evaluated. This helps SCEV
+  // set no-wrap flags before normalizing sign/zero extension.
+  Rewriter.disableCanonicalMode();
+  SimplifyAndExtend(L, Rewriter, LPM);
+
+  // Check to see if this loop has a computable loop-invariant execution count.
+  // If so, this means that we can compute the final value of any expressions
+  // that are recurrent in the loop, and substitute the exit values from the
+  // loop into any instructions outside of the loop that use the final values of
+  // the current expressions.
+  //
+  if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    RewriteLoopExitValues(L, Rewriter);
+
+  // Eliminate redundant IV cycles.
+  NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
+
+  // If we have a trip count expression, rewrite the loop's exit condition
+  // using it.  We can currently only handle loops with a single exit.
+  if (canExpandBackedgeTakenCount(L, SE) && needsLFTR(L, DT)) {
+    PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD);
+    if (IndVar) {
+      // Check preconditions for proper SCEVExpander operation. SCEV does not
+      // express SCEVExpander's dependencies, such as LoopSimplify. Instead any
+      // pass that uses the SCEVExpander must do it. This does not work well for
+      // loop passes because SCEVExpander makes assumptions about all loops, while
+      // LoopPassManager only forces the current loop to be simplified.
+      //
+      // FIXME: SCEV expansion has no way to bail out, so the caller must
+      // explicitly check any assumptions made by SCEV. Brittle.
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BackedgeTakenCount);
+      if (!AR || AR->getLoop()->getLoopPreheader())
+        (void)LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+                                        Rewriter);
+    }
+  }
+  // Clear the rewriter cache, because values that are in the rewriter's cache
+  // can be deleted in the loop below, causing the AssertingVH in the cache to
+  // trigger.
+  Rewriter.clear();
+
+  // Now that we're done iterating through lists, clean up any instructions
+  // which are now dead.
+  while (!DeadInsts.empty())
+    if (Instruction *Inst =
+          dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
+      RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
+
+  // The Rewriter may not be used from this point on.
+
+  // Loop-invariant instructions in the preheader that aren't used in the
+  // loop may be sunk below the loop to reduce register pressure.
+  SinkUnusedInvariants(L);
+
+  // Clean up dead instructions.
+  Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
+  // Check a post-condition.
+  assert(L->isLCSSAForm(*DT) &&
+         "Indvars did not leave the loop in lcssa form!");
+
+  // Verify that LFTR, and any other change have not interfered with SCEV's
+  // ability to compute trip count.
+#ifndef NDEBUG
+  if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+    SE->forgetLoop(L);
+    const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
+    if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
+        SE->getTypeSizeInBits(NewBECount->getType()))
+      NewBECount = SE->getTruncateOrNoop(NewBECount,
+                                         BackedgeTakenCount->getType());
+    else
+      BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
+                                                 NewBECount->getType());
+    assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV");
+  }
+#endif
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
new file mode 100644
index 000000000000..b61c5ba56e0c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -0,0 +1,1618 @@
+//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Jump Threading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jump-threading"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+STATISTIC(NumThreads, "Number of jumps threaded");
+STATISTIC(NumFolds,   "Number of terminators folded");
+STATISTIC(NumDupes,   "Number of branch blocks duplicated to eliminate phi");
+
+static cl::opt<unsigned>
+Threshold("jump-threading-threshold",
+          cl::desc("Max block size to duplicate for jump threading"),
+          cl::init(6), cl::Hidden);
+
+namespace {
+  // These are at global scope so static functions can use them too.
+  typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo;
+  typedef SmallVector<std::pair<Constant*, BasicBlock*>, 8> PredValueInfoTy;
+
+  // This is used to keep track of what kind of constant we're currently hoping
+  // to find.
+  enum ConstantPreference {
+    WantInteger,
+    WantBlockAddress
+  };
+
+  /// This pass performs 'jump threading', which looks at blocks that have
+  /// multiple predecessors and multiple successors.  If one or more of the
+  /// predecessors of the block can be proven to always jump to one of the
+  /// successors, we forward the edge from the predecessor to the successor by
+  /// duplicating the contents of this block.
+  ///
+  /// An example of when this can occur is code like this:
+  ///
+  ///   if () { ...
+  ///     X = 4;
+  ///   }
+  ///   if (X < 3) {
+  ///
+  /// In this case, the unconditional branch at the end of the first if can be
+  /// revectored to the false side of the second if.
+  ///
+  class JumpThreading : public FunctionPass {
+    DataLayout *TD;
+    TargetLibraryInfo *TLI;
+    LazyValueInfo *LVI;
+#ifdef NDEBUG
+    SmallPtrSet<BasicBlock*, 16> LoopHeaders;
+#else
+    SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders;
+#endif
+    DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet;
+
+    // RAII helper for updating the recursion stack.
+    struct RecursionSetRemover {
+      DenseSet<std::pair<Value*, BasicBlock*> > &TheSet;
+      std::pair<Value*, BasicBlock*> ThePair;
+
+      RecursionSetRemover(DenseSet<std::pair<Value*, BasicBlock*> > &S,
+                          std::pair<Value*, BasicBlock*> P)
+        : TheSet(S), ThePair(P) { }
+
+      ~RecursionSetRemover() {
+        TheSet.erase(ThePair);
+      }
+    };
+  public:
+    static char ID; // Pass identification
+    JumpThreading() : FunctionPass(ID) {
+      initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LazyValueInfo>();
+      AU.addPreserved<LazyValueInfo>();
+      AU.addRequired<TargetLibraryInfo>();
+    }
+
+    void FindLoopHeaders(Function &F);
+    bool ProcessBlock(BasicBlock *BB);
+    bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs,
+                    BasicBlock *SuccBB);
+    bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
+                                  const SmallVectorImpl<BasicBlock *> &PredBBs);
+
+    bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,
+                                         PredValueInfo &Result,
+                                         ConstantPreference Preference);
+    bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+                                ConstantPreference Preference);
+
+    bool ProcessBranchOnPHI(PHINode *PN);
+    bool ProcessBranchOnXOR(BinaryOperator *BO);
+
+    bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
+  };
+}
+
+char JumpThreading::ID = 0;
+INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
+                "Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(JumpThreading, "jump-threading",
+                "Jump Threading", false, false)
+
+// Public interface to the Jump Threading pass
+FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
+
+/// runOnFunction - Top level algorithm.
+///
+bool JumpThreading::runOnFunction(Function &F) {
+  DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+  LVI = &getAnalysis<LazyValueInfo>();
+
+  FindLoopHeaders(F);
+
+  bool Changed, EverChanged = false;
+  do {
+    Changed = false;
+    for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
+      BasicBlock *BB = I;
+      // Thread all of the branches we can over this block.
+      while (ProcessBlock(BB))
+        Changed = true;
+
+      ++I;
+
+      // If the block is trivially dead, zap it.  This eliminates the successor
+      // edges which simplifies the CFG.
+      if (pred_begin(BB) == pred_end(BB) &&
+          BB != &BB->getParent()->getEntryBlock()) {
+        DEBUG(dbgs() << "  JT: Deleting dead block '" << BB->getName()
+              << "' with terminator: " << *BB->getTerminator() << '\n');
+        LoopHeaders.erase(BB);
+        LVI->eraseBlock(BB);
+        DeleteDeadBlock(BB);
+        Changed = true;
+        continue;
+      }
+
+      BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+
+      // Can't thread an unconditional jump, but if the block is "almost
+      // empty", we can replace uses of it with uses of the successor and make
+      // this dead.
+      if (BI && BI->isUnconditional() &&
+          BB != &BB->getParent()->getEntryBlock() &&
+          // If the terminator is the only non-phi instruction, try to nuke it.
+          BB->getFirstNonPHIOrDbg()->isTerminator()) {
+        // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
+        // block, we have to make sure it isn't in the LoopHeaders set.  We
+        // reinsert afterward if needed.
+        bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
+        BasicBlock *Succ = BI->getSuccessor(0);
+
+        // FIXME: It is always conservatively correct to drop the info
+        // for a block even if it doesn't get erased.  This isn't totally
+        // awesome, but it allows us to use AssertingVH to prevent nasty
+        // dangling pointer issues within LazyValueInfo.
+        LVI->eraseBlock(BB);
+        if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
+          Changed = true;
+          // If we deleted BB and BB was the header of a loop, then the
+          // successor is now the header of the loop.
+          BB = Succ;
+        }
+
+        if (ErasedFromLoopHeaders)
+          LoopHeaders.insert(BB);
+      }
+    }
+    EverChanged |= Changed;
+  } while (Changed);
+
+  LoopHeaders.clear();
+  return EverChanged;
+}
+
+/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
+/// thread across it. Stop scanning the block when passing the threshold.
+static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+                                             unsigned Threshold) {
+  /// Ignore PHI nodes, these will be flattened when duplication happens.
+  BasicBlock::const_iterator I = BB->getFirstNonPHI();
+
+  // FIXME: THREADING will delete values that are just used to compute the
+  // branch, so they shouldn't count against the duplication cost.
+
+  // Sum up the cost of each instruction until we get to the terminator.  Don't
+  // include the terminator because the copy won't include it.
+  unsigned Size = 0;
+  for (; !isa<TerminatorInst>(I); ++I) {
+
+    // Stop scanning the block if we've reached the threshold.
+    if (Size > Threshold)
+      return Size;
+
+    // Debugger intrinsics don't incur code size.
+    if (isa<DbgInfoIntrinsic>(I)) continue;
+
+    // If this is a pointer->pointer bitcast, it is free.
+    if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
+      continue;
+
+    // All other instructions count for at least one unit.
+    ++Size;
+
+    // Calls are more expensive.  If they are non-intrinsic calls, we model them
+    // as having cost of 4.  If they are a non-vector intrinsic, we model them
+    // as having cost of 2 total, and if they are a vector intrinsic, we model
+    // them as having cost 1.
+    if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+      if (CI->hasFnAttr(Attribute::NoDuplicate))
+        // Blocks with NoDuplicate are modelled as having infinite cost, so they
+        // are never duplicated.
+        return ~0U;
+      else if (!isa<IntrinsicInst>(CI))
+        Size += 3;
+      else if (!CI->getType()->isVectorTy())
+        Size += 1;
+    }
+  }
+
+  // Threading through a switch statement is particularly profitable.  If this
+  // block ends in a switch, decrease its cost to make it more likely to happen.
+  if (isa<SwitchInst>(I))
+    Size = Size > 6 ? Size-6 : 0;
+
+  // The same holds for indirect branches, but slightly more so.
+  if (isa<IndirectBrInst>(I))
+    Size = Size > 8 ? Size-8 : 0;
+
+  return Size;
+}
+
+/// FindLoopHeaders - We do not want jump threading to turn proper loop
+/// structures into irreducible loops.  Doing this breaks up the loop nesting
+/// hierarchy and pessimizes later transformations.  To prevent this from
+/// happening, we first have to find the loop headers.  Here we approximate this
+/// by finding targets of backedges in the CFG.
+///
+/// Note that there definitely are cases when we want to allow threading of
+/// edges across a loop header.  For example, threading a jump from outside the
+/// loop (the preheader) to an exit block of the loop is definitely profitable.
+/// It is also almost always profitable to thread backedges from within the loop
+/// to exit blocks, and is often profitable to thread backedges to other blocks
+/// within the loop (forming a nested loop).  This simple analysis is not rich
+/// enough to track all of these properties and keep it up-to-date as the CFG
+/// mutates, so we don't allow any of these transformations.
+///
+void JumpThreading::FindLoopHeaders(Function &F) {
+  SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+  FindFunctionBackedges(F, Edges);
+
+  for (unsigned i = 0, e = Edges.size(); i != e; ++i)
+    LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second));
+}
+
+/// getKnownConstant - Helper method to determine if we can thread over a
+/// terminator with the given value as its condition, and if so what value to
+/// use for that. What kind of value this is depends on whether we want an
+/// integer or a block address, but an undef is always accepted.
+/// Returns null if Val is null or not an appropriate constant.
+static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
+  if (!Val)
+    return 0;
+
+  // Undef is "known" enough.
+  if (UndefValue *U = dyn_cast<UndefValue>(Val))
+    return U;
+
+  if (Preference == WantBlockAddress)
+    return dyn_cast<BlockAddress>(Val->stripPointerCasts());
+
+  return dyn_cast<ConstantInt>(Val);
+}
+
+/// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see
+/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
+/// in any of our predecessors.  If so, return the known list of value and pred
+/// BB in the result vector.
+///
+/// This returns true if there were any known values.
+///
+bool JumpThreading::
+ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result,
+                                ConstantPreference Preference) {
+  // This method walks up use-def chains recursively.  Because of this, we could
+  // get into an infinite loop going around loops in the use-def chain.  To
+  // prevent this, keep track of what (value, block) pairs we've already visited
+  // and terminate the search if we loop back to them
+  if (!RecursionSet.insert(std::make_pair(V, BB)).second)
+    return false;
+
+  // An RAII help to remove this pair from the recursion set once the recursion
+  // stack pops back out again.
+  RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB));
+
+  // If V is a constant, then it is known in all predecessors.
+  if (Constant *KC = getKnownConstant(V, Preference)) {
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+      Result.push_back(std::make_pair(KC, *PI));
+
+    return true;
+  }
+
+  // If V is a non-instruction value, or an instruction in a different block,
+  // then it can't be derived from a PHI.
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0 || I->getParent() != BB) {
+
+    // Okay, if this is a live-in value, see if it has a known value at the end
+    // of any of our predecessors.
+    //
+    // FIXME: This should be an edge property, not a block end property.
+    /// TODO: Per PR2563, we could infer value range information about a
+    /// predecessor based on its terminator.
+    //
+    // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
+    // "I" is a non-local compare-with-a-constant instruction.  This would be
+    // able to handle value inequalities better, for example if the compare is
+    // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
+    // Perhaps getConstantOnEdge should be smart enough to do this?
+
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *P = *PI;
+      // If the value is known by LazyValueInfo to be a constant in a
+      // predecessor, use that information to try to thread this block.
+      Constant *PredCst = LVI->getConstantOnEdge(V, P, BB);
+      if (Constant *KC = getKnownConstant(PredCst, Preference))
+        Result.push_back(std::make_pair(KC, P));
+    }
+
+    return !Result.empty();
+  }
+
+  /// If I is a PHI node, then we know the incoming values for any constants.
+  if (PHINode *PN = dyn_cast<PHINode>(I)) {
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      Value *InVal = PN->getIncomingValue(i);
+      if (Constant *KC = getKnownConstant(InVal, Preference)) {
+        Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+      } else {
+        Constant *CI = LVI->getConstantOnEdge(InVal,
+                                              PN->getIncomingBlock(i), BB);
+        if (Constant *KC = getKnownConstant(CI, Preference))
+          Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+      }
+    }
+
+    return !Result.empty();
+  }
+
+  PredValueInfoTy LHSVals, RHSVals;
+
+  // Handle some boolean conditions.
+  if (I->getType()->getPrimitiveSizeInBits() == 1) {
+    assert(Preference == WantInteger && "One-bit non-integer type?");
+    // X | true -> true
+    // X & false -> false
+    if (I->getOpcode() == Instruction::Or ||
+        I->getOpcode() == Instruction::And) {
+      ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+                                      WantInteger);
+      ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals,
+                                      WantInteger);
+
+      if (LHSVals.empty() && RHSVals.empty())
+        return false;
+
+      ConstantInt *InterestingVal;
+      if (I->getOpcode() == Instruction::Or)
+        InterestingVal = ConstantInt::getTrue(I->getContext());
+      else
+        InterestingVal = ConstantInt::getFalse(I->getContext());
+
+      SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
+
+      // Scan for the sentinel.  If we find an undef, force it to the
+      // interesting value: x|undef -> true and x&undef -> false.
+      for (unsigned i = 0, e = LHSVals.size(); i != e; ++i)
+        if (LHSVals[i].first == InterestingVal ||
+            isa<UndefValue>(LHSVals[i].first)) {
+          Result.push_back(LHSVals[i]);
+          Result.back().first = InterestingVal;
+          LHSKnownBBs.insert(LHSVals[i].second);
+        }
+      for (unsigned i = 0, e = RHSVals.size(); i != e; ++i)
+        if (RHSVals[i].first == InterestingVal ||
+            isa<UndefValue>(RHSVals[i].first)) {
+          // If we already inferred a value for this block on the LHS, don't
+          // re-add it.
+          if (!LHSKnownBBs.count(RHSVals[i].second)) {
+            Result.push_back(RHSVals[i]);
+            Result.back().first = InterestingVal;
+          }
+        }
+
+      return !Result.empty();
+    }
+
+    // Handle the NOT form of XOR.
+    if (I->getOpcode() == Instruction::Xor &&
+        isa<ConstantInt>(I->getOperand(1)) &&
+        cast<ConstantInt>(I->getOperand(1))->isOne()) {
+      ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result,
+                                      WantInteger);
+      if (Result.empty())
+        return false;
+
+      // Invert the known values.
+      for (unsigned i = 0, e = Result.size(); i != e; ++i)
+        Result[i].first = ConstantExpr::getNot(Result[i].first);
+
+      return true;
+    }
+
+  // Try to simplify some other binary operator values.
+  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+    assert(Preference != WantBlockAddress
+            && "A binary operator creating a block address?");
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+      PredValueInfoTy LHSVals;
+      ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals,
+                                      WantInteger);
+
+      // Try to use constant folding to simplify the binary operator.
+      for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) {
+        Constant *V = LHSVals[i].first;
+        Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
+
+        if (Constant *KC = getKnownConstant(Folded, WantInteger))
+          Result.push_back(std::make_pair(KC, LHSVals[i].second));
+      }
+    }
+
+    return !Result.empty();
+  }
+
+  // Handle compare with phi operand, where the PHI is defined in this block.
+  if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+    assert(Preference == WantInteger && "Compares only produce integers");
+    PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
+    if (PN && PN->getParent() == BB) {
+      // We can do this simplification if any comparisons fold to true or false.
+      // See if any do.
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        BasicBlock *PredBB = PN->getIncomingBlock(i);
+        Value *LHS = PN->getIncomingValue(i);
+        Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB);
+
+        Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, TD);
+        if (Res == 0) {
+          if (!isa<Constant>(RHS))
+            continue;
+
+          LazyValueInfo::Tristate
+            ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
+                                           cast<Constant>(RHS), PredBB, BB);
+          if (ResT == LazyValueInfo::Unknown)
+            continue;
+          Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+        }
+
+        if (Constant *KC = getKnownConstant(Res, WantInteger))
+          Result.push_back(std::make_pair(KC, PredBB));
+      }
+
+      return !Result.empty();
+    }
+
+
+    // If comparing a live-in value against a constant, see if we know the
+    // live-in value on any predecessors.
+    if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) {
+      if (!isa<Instruction>(Cmp->getOperand(0)) ||
+          cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) {
+        Constant *RHSCst = cast<Constant>(Cmp->getOperand(1));
+
+        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB);PI != E; ++PI){
+          BasicBlock *P = *PI;
+          // If the value is known by LazyValueInfo to be a constant in a
+          // predecessor, use that information to try to thread this block.
+          LazyValueInfo::Tristate Res =
+            LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
+                                    RHSCst, P, BB);
+          if (Res == LazyValueInfo::Unknown)
+            continue;
+
+          Constant *ResC = ConstantInt::get(Cmp->getType(), Res);
+          Result.push_back(std::make_pair(ResC, P));
+        }
+
+        return !Result.empty();
+      }
+
+      // Try to find a constant value for the LHS of a comparison,
+      // and evaluate it statically if we can.
+      if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) {
+        PredValueInfoTy LHSVals;
+        ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+                                        WantInteger);
+
+        for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) {
+          Constant *V = LHSVals[i].first;
+          Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(),
+                                                      V, CmpConst);
+          if (Constant *KC = getKnownConstant(Folded, WantInteger))
+            Result.push_back(std::make_pair(KC, LHSVals[i].second));
+        }
+
+        return !Result.empty();
+      }
+    }
+  }
+
+  if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+    // Handle select instructions where at least one operand is a known constant
+    // and we can figure out the condition value for any predecessor block.
+    Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
+    Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
+    PredValueInfoTy Conds;
+    if ((TrueVal || FalseVal) &&
+        ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds,
+                                        WantInteger)) {
+      for (unsigned i = 0, e = Conds.size(); i != e; ++i) {
+        Constant *Cond = Conds[i].first;
+
+        // Figure out what value to use for the condition.
+        bool KnownCond;
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
+          // A known boolean.
+          KnownCond = CI->isOne();
+        } else {
+          assert(isa<UndefValue>(Cond) && "Unexpected condition value");
+          // Either operand will do, so be sure to pick the one that's a known
+          // constant.
+          // FIXME: Do this more cleverly if both values are known constants?
+          KnownCond = (TrueVal != 0);
+        }
+
+        // See if the select has a known constant value for this predecessor.
+        if (Constant *Val = KnownCond ? TrueVal : FalseVal)
+          Result.push_back(std::make_pair(Val, Conds[i].second));
+      }
+
+      return !Result.empty();
+    }
+  }
+
+  // If all else fails, see if LVI can figure out a constant value for us.
+  Constant *CI = LVI->getConstant(V, BB);
+  if (Constant *KC = getKnownConstant(CI, Preference)) {
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+      Result.push_back(std::make_pair(KC, *PI));
+  }
+
+  return !Result.empty();
+}
+
+
+
+/// GetBestDestForBranchOnUndef - If we determine that the specified block ends
+/// in an undefined jump, decide which block is best to revector to.
+///
+/// Since we can pick an arbitrary destination, we pick the successor with the
+/// fewest predecessors.  This should reduce the in-degree of the others.
+///
+static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
+  TerminatorInst *BBTerm = BB->getTerminator();
+  unsigned MinSucc = 0;
+  BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
+  // Compute the successor with the minimum number of predecessors.
+  unsigned MinNumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+  for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+    TestBB = BBTerm->getSuccessor(i);
+    unsigned NumPreds = std::distance(pred_begin(TestBB), pred_end(TestBB));
+    if (NumPreds < MinNumPreds) {
+      MinSucc = i;
+      MinNumPreds = NumPreds;
+    }
+  }
+
+  return MinSucc;
+}
+
+static bool hasAddressTakenAndUsed(BasicBlock *BB) {
+  if (!BB->hasAddressTaken()) return false;
+
+  // If the block has its address taken, it may be a tree of dead constants
+  // hanging off of it.  These shouldn't keep the block alive.
+  BlockAddress *BA = BlockAddress::get(BB);
+  BA->removeDeadConstantUsers();
+  return !BA->use_empty();
+}
+
+/// ProcessBlock - If there are any predecessors whose control can be threaded
+/// through to a successor, transform them now.
+bool JumpThreading::ProcessBlock(BasicBlock *BB) {
+  // If the block is trivially dead, just return and let the caller nuke it.
+  // This simplifies other transformations.
+  if (pred_begin(BB) == pred_end(BB) &&
+      BB != &BB->getParent()->getEntryBlock())
+    return false;
+
+  // If this block has a single predecessor, and if that pred has a single
+  // successor, merge the blocks.  This encourages recursive jump threading
+  // because now the condition in this block can be threaded through
+  // predecessors of our predecessor block.
+  if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
+    if (SinglePred->getTerminator()->getNumSuccessors() == 1 &&
+        SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
+      // If SinglePred was a loop header, BB becomes one.
+      if (LoopHeaders.erase(SinglePred))
+        LoopHeaders.insert(BB);
+
+      // Remember if SinglePred was the entry block of the function.  If so, we
+      // will need to move BB back to the entry position.
+      bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+      LVI->eraseBlock(SinglePred);
+      MergeBasicBlockIntoOnlyPred(BB);
+
+      if (isEntry && BB != &BB->getParent()->getEntryBlock())
+        BB->moveBefore(&BB->getParent()->getEntryBlock());
+      return true;
+    }
+  }
+
+  // What kind of constant we're looking for.
+  ConstantPreference Preference = WantInteger;
+
+  // Look to see if the terminator is a conditional branch, switch or indirect
+  // branch, if not we can't thread it.
+  Value *Condition;
+  Instruction *Terminator = BB->getTerminator();
+  if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
+    // Can't thread an unconditional jump.
+    if (BI->isUnconditional()) return false;
+    Condition = BI->getCondition();
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
+    Condition = SI->getCondition();
+  } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
+    // Can't thread indirect branch with no successors.
+    if (IB->getNumSuccessors() == 0) return false;
+    Condition = IB->getAddress()->stripPointerCasts();
+    Preference = WantBlockAddress;
+  } else {
+    return false; // Must be an invoke.
+  }
+
+  // Run constant folding to see if we can reduce the condition to a simple
+  // constant.
+  if (Instruction *I = dyn_cast<Instruction>(Condition)) {
+    Value *SimpleVal = ConstantFoldInstruction(I, TD, TLI);
+    if (SimpleVal) {
+      I->replaceAllUsesWith(SimpleVal);
+      I->eraseFromParent();
+      Condition = SimpleVal;
+    }
+  }
+
+  // If the terminator is branching on an undef, we can pick any of the
+  // successors to branch to.  Let GetBestDestForJumpOnUndef decide.
+  if (isa<UndefValue>(Condition)) {
+    unsigned BestSucc = GetBestDestForJumpOnUndef(BB);
+
+    // Fold the branch/switch.
+    TerminatorInst *BBTerm = BB->getTerminator();
+    for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+      if (i == BestSucc) continue;
+      BBTerm->getSuccessor(i)->removePredecessor(BB, true);
+    }
+
+    DEBUG(dbgs() << "  In block '" << BB->getName()
+          << "' folding undef terminator: " << *BBTerm << '\n');
+    BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
+    BBTerm->eraseFromParent();
+    return true;
+  }
+
+  // If the terminator of this block is branching on a constant, simplify the
+  // terminator to an unconditional branch.  This can occur due to threading in
+  // other blocks.
+  if (getKnownConstant(Condition, Preference)) {
+    DEBUG(dbgs() << "  In block '" << BB->getName()
+          << "' folding terminator: " << *BB->getTerminator() << '\n');
+    ++NumFolds;
+    ConstantFoldTerminator(BB, true);
+    return true;
+  }
+
+  Instruction *CondInst = dyn_cast<Instruction>(Condition);
+
+  // All the rest of our checks depend on the condition being an instruction.
+  if (CondInst == 0) {
+    // FIXME: Unify this with code below.
+    if (ProcessThreadableEdges(Condition, BB, Preference))
+      return true;
+    return false;
+  }
+
+
+  if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
+    // For a comparison where the LHS is outside this block, it's possible
+    // that we've branched on it before.  Used LVI to see if we can simplify
+    // the branch based on that.
+    BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+    Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
+    pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+    if (CondBr && CondConst && CondBr->isConditional() && PI != PE &&
+        (!isa<Instruction>(CondCmp->getOperand(0)) ||
+         cast<Instruction>(CondCmp->getOperand(0))->getParent() != BB)) {
+      // For predecessor edge, determine if the comparison is true or false
+      // on that edge.  If they're all true or all false, we can simplify the
+      // branch.
+      // FIXME: We could handle mixed true/false by duplicating code.
+      LazyValueInfo::Tristate Baseline =
+        LVI->getPredicateOnEdge(CondCmp->getPredicate(), CondCmp->getOperand(0),
+                                CondConst, *PI, BB);
+      if (Baseline != LazyValueInfo::Unknown) {
+        // Check that all remaining incoming values match the first one.
+        while (++PI != PE) {
+          LazyValueInfo::Tristate Ret =
+            LVI->getPredicateOnEdge(CondCmp->getPredicate(),
+                                    CondCmp->getOperand(0), CondConst, *PI, BB);
+          if (Ret != Baseline) break;
+        }
+
+        // If we terminated early, then one of the values didn't match.
+        if (PI == PE) {
+          unsigned ToRemove = Baseline == LazyValueInfo::True ? 1 : 0;
+          unsigned ToKeep = Baseline == LazyValueInfo::True ? 0 : 1;
+          CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true);
+          BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
+          CondBr->eraseFromParent();
+          return true;
+        }
+      }
+    }
+  }
+
+  // Check for some cases that are worth simplifying.  Right now we want to look
+  // for loads that are used by a switch or by the condition for the branch.  If
+  // we see one, check to see if it's partially redundant.  If so, insert a PHI
+  // which can then be used to thread the values.
+  //
+  Value *SimplifyValue = CondInst;
+  if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+    if (isa<Constant>(CondCmp->getOperand(1)))
+      SimplifyValue = CondCmp->getOperand(0);
+
+  // TODO: There are other places where load PRE would be profitable, such as
+  // more complex comparisons.
+  if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue))
+    if (SimplifyPartiallyRedundantLoad(LI))
+      return true;
+
+
+  // Handle a variety of cases where we are branching on something derived from
+  // a PHI node in the current block.  If we can prove that any predecessors
+  // compute a predictable value based on a PHI node, thread those predecessors.
+  //
+  if (ProcessThreadableEdges(CondInst, BB, Preference))
+    return true;
+
+  // If this is an otherwise-unfoldable branch on a phi node in the current
+  // block, see if we can simplify.
+  if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+    if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+      return ProcessBranchOnPHI(PN);
+
+
+  // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
+  if (CondInst->getOpcode() == Instruction::Xor &&
+      CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+    return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
+
+
+  // TODO: If we have: "br (X > 0)"  and we have a predecessor where we know
+  // "(X == 4)", thread through this block.
+
+  return false;
+}
+
+
+/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
+/// load instruction, eliminate it by replacing it with a PHI node.  This is an
+/// important optimization that encourages jump threading, and needs to be run
+/// interlaced with other jump threading tasks.
+bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
+  // Don't hack volatile/atomic loads.
+  if (!LI->isSimple()) return false;
+
+  // If the load is defined in a block with exactly one predecessor, it can't be
+  // partially redundant.
+  BasicBlock *LoadBB = LI->getParent();
+  if (LoadBB->getSinglePredecessor())
+    return false;
+
+  Value *LoadedPtr = LI->getOperand(0);
+
+  // If the loaded operand is defined in the LoadBB, it can't be available.
+  // TODO: Could do simple PHI translation, that would be fun :)
+  if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
+    if (PtrOp->getParent() == LoadBB)
+      return false;
+
+  // Scan a few instructions up from the load, to see if it is obviously live at
+  // the entry to its block.
+  BasicBlock::iterator BBIt = LI;
+
+  if (Value *AvailableVal =
+        FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) {
+    // If the value if the load is locally available within the block, just use
+    // it.  This frequently occurs for reg2mem'd allocas.
+    //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
+
+    // If the returned value is the load itself, replace with an undef. This can
+    // only happen in dead loops.
+    if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType());
+    LI->replaceAllUsesWith(AvailableVal);
+    LI->eraseFromParent();
+    return true;
+  }
+
+  // Otherwise, if we scanned the whole block and got to the top of the block,
+  // we know the block is locally transparent to the load.  If not, something
+  // might clobber its value.
+  if (BBIt != LoadBB->begin())
+    return false;
+
+  // If all of the loads and stores that feed the value have the same TBAA tag,
+  // then we can propagate it onto any newly inserted loads.
+  MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa);
+
+  SmallPtrSet<BasicBlock*, 8> PredsScanned;
+  typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy;
+  AvailablePredsTy AvailablePreds;
+  BasicBlock *OneUnavailablePred = 0;
+
+  // If we got here, the loaded value is transparent through to the start of the
+  // block.  Check to see if it is available in any of the predecessor blocks.
+  for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+       PI != PE; ++PI) {
+    BasicBlock *PredBB = *PI;
+
+    // If we already scanned this predecessor, skip it.
+    if (!PredsScanned.insert(PredBB))
+      continue;
+
+    // Scan the predecessor to see if the value is available in the pred.
+    BBIt = PredBB->end();
+    MDNode *ThisTBAATag = 0;
+    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6,
+                                                    0, &ThisTBAATag);
+    if (!PredAvailable) {
+      OneUnavailablePred = PredBB;
+      continue;
+    }
+
+    // If tbaa tags disagree or are not present, forget about them.
+    if (TBAATag != ThisTBAATag) TBAATag = 0;
+
+    // If so, this load is partially redundant.  Remember this info so that we
+    // can create a PHI node.
+    AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+  }
+
+  // If the loaded value isn't available in any predecessor, it isn't partially
+  // redundant.
+  if (AvailablePreds.empty()) return false;
+
+  // Okay, the loaded value is available in at least one (and maybe all!)
+  // predecessors.  If the value is unavailable in more than one unique
+  // predecessor, we want to insert a merge block for those common predecessors.
+  // This ensures that we only have to insert one reload, thus not increasing
+  // code size.
+  BasicBlock *UnavailablePred = 0;
+
+  // If there is exactly one predecessor where the value is unavailable, the
+  // already computed 'OneUnavailablePred' block is it.  If it ends in an
+  // unconditional branch, we know that it isn't a critical edge.
+  if (PredsScanned.size() == AvailablePreds.size()+1 &&
+      OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+    UnavailablePred = OneUnavailablePred;
+  } else if (PredsScanned.size() != AvailablePreds.size()) {
+    // Otherwise, we had multiple unavailable predecessors or we had a critical
+    // edge from the one.
+    SmallVector<BasicBlock*, 8> PredsToSplit;
+    SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+    for (unsigned i = 0, e = AvailablePreds.size(); i != e; ++i)
+      AvailablePredSet.insert(AvailablePreds[i].first);
+
+    // Add all the unavailable predecessors to the PredsToSplit list.
+    for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB);
+         PI != PE; ++PI) {
+      BasicBlock *P = *PI;
+      // If the predecessor is an indirect goto, we can't split the edge.
+      if (isa<IndirectBrInst>(P->getTerminator()))
+        return false;
+
+      if (!AvailablePredSet.count(P))
+        PredsToSplit.push_back(P);
+    }
+
+    // Split them out to their own block.
+    UnavailablePred =
+      SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split", this);
+  }
+
+  // If the value isn't available in all predecessors, then there will be
+  // exactly one where it isn't available.  Insert a load on that edge and add
+  // it to the AvailablePreds list.
+  if (UnavailablePred) {
+    assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+           "Can't handle critical edge here!");
+    LoadInst *NewVal = new LoadInst(LoadedPtr, LI->getName()+".pr", false,
+                                 LI->getAlignment(),
+                                 UnavailablePred->getTerminator());
+    NewVal->setDebugLoc(LI->getDebugLoc());
+    if (TBAATag)
+      NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+    AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+  }
+
+  // Now we know that each predecessor of this block has a value in
+  // AvailablePreds, sort them for efficient access as we're walking the preds.
+  array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
+
+  // Create a PHI node at the start of the block for the PRE'd load value.
+  pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
+  PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
+                                LoadBB->begin());
+  PN->takeName(LI);
+  PN->setDebugLoc(LI->getDebugLoc());
+
+  // Insert new entries into the PHI for each predecessor.  A single block may
+  // have multiple entries here.
+  for (pred_iterator PI = PB; PI != PE; ++PI) {
+    BasicBlock *P = *PI;
+    AvailablePredsTy::iterator I =
+      std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(),
+                       std::make_pair(P, (Value*)0));
+
+    assert(I != AvailablePreds.end() && I->first == P &&
+           "Didn't find entry for predecessor!");
+
+    PN->addIncoming(I->second, I->first);
+  }
+
+  //cerr << "PRE: " << *LI << *PN << "\n";
+
+  LI->replaceAllUsesWith(PN);
+  LI->eraseFromParent();
+
+  return true;
+}
+
+/// FindMostPopularDest - The specified list contains multiple possible
+/// threadable destinations.  Pick the one that occurs the most frequently in
+/// the list.
+static BasicBlock *
+FindMostPopularDest(BasicBlock *BB,
+                    const SmallVectorImpl<std::pair<BasicBlock*,
+                                  BasicBlock*> > &PredToDestList) {
+  assert(!PredToDestList.empty());
+
+  // Determine popularity.  If there are multiple possible destinations, we
+  // explicitly choose to ignore 'undef' destinations.  We prefer to thread
+  // blocks with known and real destinations to threading undef.  We'll handle
+  // them later if interesting.
+  DenseMap<BasicBlock*, unsigned> DestPopularity;
+  for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i)
+    if (PredToDestList[i].second)
+      DestPopularity[PredToDestList[i].second]++;
+
+  // Find the most popular dest.
+  DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin();
+  BasicBlock *MostPopularDest = DPI->first;
+  unsigned Popularity = DPI->second;
+  SmallVector<BasicBlock*, 4> SamePopularity;
+
+  for (++DPI; DPI != DestPopularity.end(); ++DPI) {
+    // If the popularity of this entry isn't higher than the popularity we've
+    // seen so far, ignore it.
+    if (DPI->second < Popularity)
+      ; // ignore.
+    else if (DPI->second == Popularity) {
+      // If it is the same as what we've seen so far, keep track of it.
+      SamePopularity.push_back(DPI->first);
+    } else {
+      // If it is more popular, remember it.
+      SamePopularity.clear();
+      MostPopularDest = DPI->first;
+      Popularity = DPI->second;
+    }
+  }
+
+  // Okay, now we know the most popular destination.  If there is more than one
+  // destination, we need to determine one.  This is arbitrary, but we need
+  // to make a deterministic decision.  Pick the first one that appears in the
+  // successor list.
+  if (!SamePopularity.empty()) {
+    SamePopularity.push_back(MostPopularDest);
+    TerminatorInst *TI = BB->getTerminator();
+    for (unsigned i = 0; ; ++i) {
+      assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
+
+      if (std::find(SamePopularity.begin(), SamePopularity.end(),
+                    TI->getSuccessor(i)) == SamePopularity.end())
+        continue;
+
+      MostPopularDest = TI->getSuccessor(i);
+      break;
+    }
+  }
+
+  // Okay, we have finally picked the most popular destination.
+  return MostPopularDest;
+}
+
+bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
+                                           ConstantPreference Preference) {
+  // If threading this would thread across a loop header, don't even try to
+  // thread the edge.
+  if (LoopHeaders.count(BB))
+    return false;
+
+  PredValueInfoTy PredValues;
+  if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference))
+    return false;
+
+  assert(!PredValues.empty() &&
+         "ComputeValueKnownInPredecessors returned true with no values");
+
+  DEBUG(dbgs() << "IN BB: " << *BB;
+        for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
+          dbgs() << "  BB '" << BB->getName() << "': FOUND condition = "
+            << *PredValues[i].first
+            << " for pred '" << PredValues[i].second->getName() << "'.\n";
+        });
+
+  // Decide what we want to thread through.  Convert our list of known values to
+  // a list of known destinations for each pred.  This also discards duplicate
+  // predecessors and keeps track of the undefined inputs (which are represented
+  // as a null dest in the PredToDestList).
+  SmallPtrSet<BasicBlock*, 16> SeenPreds;
+  SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
+
+  BasicBlock *OnlyDest = 0;
+  BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i) {
+    BasicBlock *Pred = PredValues[i].second;
+    if (!SeenPreds.insert(Pred))
+      continue;  // Duplicate predecessor entry.
+
+    // If the predecessor ends with an indirect goto, we can't change its
+    // destination.
+    if (isa<IndirectBrInst>(Pred->getTerminator()))
+      continue;
+
+    Constant *Val = PredValues[i].first;
+
+    BasicBlock *DestBB;
+    if (isa<UndefValue>(Val))
+      DestBB = 0;
+    else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+      DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
+    else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+      DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor();
+    } else {
+      assert(isa<IndirectBrInst>(BB->getTerminator())
+              && "Unexpected terminator");
+      DestBB = cast<BlockAddress>(Val)->getBasicBlock();
+    }
+
+    // If we have exactly one destination, remember it for efficiency below.
+    if (PredToDestList.empty())
+      OnlyDest = DestBB;
+    else if (OnlyDest != DestBB)
+      OnlyDest = MultipleDestSentinel;
+
+    PredToDestList.push_back(std::make_pair(Pred, DestBB));
+  }
+
+  // If all edges were unthreadable, we fail.
+  if (PredToDestList.empty())
+    return false;
+
+  // Determine which is the most common successor.  If we have many inputs and
+  // this block is a switch, we want to start by threading the batch that goes
+  // to the most popular destination first.  If we only know about one
+  // threadable destination (the common case) we can avoid this.
+  BasicBlock *MostPopularDest = OnlyDest;
+
+  if (MostPopularDest == MultipleDestSentinel)
+    MostPopularDest = FindMostPopularDest(BB, PredToDestList);
+
+  // Now that we know what the most popular destination is, factor all
+  // predecessors that will jump to it into a single predecessor.
+  SmallVector<BasicBlock*, 16> PredsToFactor;
+  for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i)
+    if (PredToDestList[i].second == MostPopularDest) {
+      BasicBlock *Pred = PredToDestList[i].first;
+
+      // This predecessor may be a switch or something else that has multiple
+      // edges to the block.  Factor each of these edges by listing them
+      // according to # occurrences in PredsToFactor.
+      TerminatorInst *PredTI = Pred->getTerminator();
+      for (unsigned i = 0, e = PredTI->getNumSuccessors(); i != e; ++i)
+        if (PredTI->getSuccessor(i) == BB)
+          PredsToFactor.push_back(Pred);
+    }
+
+  // If the threadable edges are branching on an undefined value, we get to pick
+  // the destination that these predecessors should get to.
+  if (MostPopularDest == 0)
+    MostPopularDest = BB->getTerminator()->
+                            getSuccessor(GetBestDestForJumpOnUndef(BB));
+
+  // Ok, try to thread it!
+  return ThreadEdge(BB, PredsToFactor, MostPopularDest);
+}
+
+/// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
+/// a PHI node in the current block.  See if there are any simplifications we
+/// can do based on inputs to the phi node.
+///
+bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) {
+  BasicBlock *BB = PN->getParent();
+
+  // TODO: We could make use of this to do it once for blocks with common PHI
+  // values.
+  SmallVector<BasicBlock*, 1> PredBBs;
+  PredBBs.resize(1);
+
+  // If any of the predecessor blocks end in an unconditional branch, we can
+  // *duplicate* the conditional branch into that block in order to further
+  // encourage jump threading and to eliminate cases where we have branch on a
+  // phi of an icmp (branch on icmp is much better).
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *PredBB = PN->getIncomingBlock(i);
+    if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
+      if (PredBr->isUnconditional()) {
+        PredBBs[0] = PredBB;
+        // Try to duplicate BB into PredBB.
+        if (DuplicateCondBranchOnPHIIntoPred(BB, PredBBs))
+          return true;
+      }
+  }
+
+  return false;
+}
+
+/// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on
+/// a xor instruction in the current block.  See if there are any
+/// simplifications we can do based on inputs to the xor.
+///
+bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) {
+  BasicBlock *BB = BO->getParent();
+
+  // If either the LHS or RHS of the xor is a constant, don't do this
+  // optimization.
+  if (isa<ConstantInt>(BO->getOperand(0)) ||
+      isa<ConstantInt>(BO->getOperand(1)))
+    return false;
+
+  // If the first instruction in BB isn't a phi, we won't be able to infer
+  // anything special about any particular predecessor.
+  if (!isa<PHINode>(BB->front()))
+    return false;
+
+  // If we have a xor as the branch input to this block, and we know that the
+  // LHS or RHS of the xor in any predecessor is true/false, then we can clone
+  // the condition into the predecessor and fix that value to true, saving some
+  // logical ops on that path and encouraging other paths to simplify.
+  //
+  // This copies something like this:
+  //
+  //  BB:
+  //    %X = phi i1 [1],  [%X']
+  //    %Y = icmp eq i32 %A, %B
+  //    %Z = xor i1 %X, %Y
+  //    br i1 %Z, ...
+  //
+  // Into:
+  //  BB':
+  //    %Y = icmp ne i32 %A, %B
+  //    br i1 %Z, ...
+
+  PredValueInfoTy XorOpValues;
+  bool isLHS = true;
+  if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
+                                       WantInteger)) {
+    assert(XorOpValues.empty());
+    if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
+                                         WantInteger))
+      return false;
+    isLHS = false;
+  }
+
+  assert(!XorOpValues.empty() &&
+         "ComputeValueKnownInPredecessors returned true with no values");
+
+  // Scan the information to see which is most popular: true or false.  The
+  // predecessors can be of the set true, false, or undef.
+  unsigned NumTrue = 0, NumFalse = 0;
+  for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) {
+    if (isa<UndefValue>(XorOpValues[i].first))
+      // Ignore undefs for the count.
+      continue;
+    if (cast<ConstantInt>(XorOpValues[i].first)->isZero())
+      ++NumFalse;
+    else
+      ++NumTrue;
+  }
+
+  // Determine which value to split on, true, false, or undef if neither.
+  ConstantInt *SplitVal = 0;
+  if (NumTrue > NumFalse)
+    SplitVal = ConstantInt::getTrue(BB->getContext());
+  else if (NumTrue != 0 || NumFalse != 0)
+    SplitVal = ConstantInt::getFalse(BB->getContext());
+
+  // Collect all of the blocks that this can be folded into so that we can
+  // factor this once and clone it once.
+  SmallVector<BasicBlock*, 8> BlocksToFoldInto;
+  for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) {
+    if (XorOpValues[i].first != SplitVal &&
+        !isa<UndefValue>(XorOpValues[i].first))
+      continue;
+
+    BlocksToFoldInto.push_back(XorOpValues[i].second);
+  }
+
+  // If we inferred a value for all of the predecessors, then duplication won't
+  // help us.  However, we can just replace the LHS or RHS with the constant.
+  if (BlocksToFoldInto.size() ==
+      cast<PHINode>(BB->front()).getNumIncomingValues()) {
+    if (SplitVal == 0) {
+      // If all preds provide undef, just nuke the xor, because it is undef too.
+      BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
+      BO->eraseFromParent();
+    } else if (SplitVal->isZero()) {
+      // If all preds provide 0, replace the xor with the other input.
+      BO->replaceAllUsesWith(BO->getOperand(isLHS));
+      BO->eraseFromParent();
+    } else {
+      // If all preds provide 1, set the computed value to 1.
+      BO->setOperand(!isLHS, SplitVal);
+    }
+
+    return true;
+  }
+
+  // Try to duplicate BB into PredBB.
+  return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
+}
+
+
+/// AddPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
+/// predecessor to the PHIBB block.  If it has PHI nodes, add entries for
+/// NewPred using the entries from OldPred (suitably mapped).
+static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
+                                            BasicBlock *OldPred,
+                                            BasicBlock *NewPred,
+                                     DenseMap<Instruction*, Value*> &ValueMap) {
+  for (BasicBlock::iterator PNI = PHIBB->begin();
+       PHINode *PN = dyn_cast<PHINode>(PNI); ++PNI) {
+    // Ok, we have a PHI node.  Figure out what the incoming value was for the
+    // DestBlock.
+    Value *IV = PN->getIncomingValueForBlock(OldPred);
+
+    // Remap the value if necessary.
+    if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
+      DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
+      if (I != ValueMap.end())
+        IV = I->second;
+    }
+
+    PN->addIncoming(IV, NewPred);
+  }
+}
+
+/// ThreadEdge - We have decided that it is safe and profitable to factor the
+/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
+/// across BB.  Transform the IR to reflect this change.
+bool JumpThreading::ThreadEdge(BasicBlock *BB,
+                               const SmallVectorImpl<BasicBlock*> &PredBBs,
+                               BasicBlock *SuccBB) {
+  // If threading to the same block as we come from, we would infinite loop.
+  if (SuccBB == BB) {
+    DEBUG(dbgs() << "  Not threading across BB '" << BB->getName()
+          << "' - would thread to self!\n");
+    return false;
+  }
+
+  // If threading this would thread across a loop header, don't thread the edge.
+  // See the comments above FindLoopHeaders for justifications and caveats.
+  if (LoopHeaders.count(BB)) {
+    DEBUG(dbgs() << "  Not threading across loop header BB '" << BB->getName()
+          << "' to dest BB '" << SuccBB->getName()
+          << "' - it might create an irreducible loop!\n");
+    return false;
+  }
+
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, Threshold);
+  if (JumpThreadCost > Threshold) {
+    DEBUG(dbgs() << "  Not threading BB '" << BB->getName()
+          << "' - Cost is too high: " << JumpThreadCost << "\n");
+    return false;
+  }
+
+  // And finally, do it!  Start by factoring the predecessors is needed.
+  BasicBlock *PredBB;
+  if (PredBBs.size() == 1)
+    PredBB = PredBBs[0];
+  else {
+    DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
+          << " common predecessors.\n");
+    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this);
+  }
+
+  // And finally, do it!
+  DEBUG(dbgs() << "  Threading edge from '" << PredBB->getName() << "' to '"
+        << SuccBB->getName() << "' with cost: " << JumpThreadCost
+        << ", across block:\n    "
+        << *BB << "\n");
+
+  LVI->threadEdge(PredBB, BB, SuccBB);
+
+  // We are going to have to map operands from the original BB block to the new
+  // copy of the block 'NewBB'.  If there are PHI nodes in BB, evaluate them to
+  // account for entry from PredBB.
+  DenseMap<Instruction*, Value*> ValueMapping;
+
+  BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
+                                         BB->getName()+".thread",
+                                         BB->getParent(), BB);
+  NewBB->moveAfter(PredBB);
+
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+  // Clone the non-phi instructions of BB into NewBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; !isa<TerminatorInst>(BI); ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getName());
+    NewBB->getInstList().push_back(New);
+    ValueMapping[BI] = New;
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+  }
+
+  // We didn't copy the terminator from BB over to NewBB, because there is now
+  // an unconditional jump to SuccBB.  Insert the unconditional jump.
+  BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB);
+  NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
+
+  // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
+  // PHI nodes for NewBB now.
+  AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
+
+  // If there were values defined in BB that are used outside the block, then we
+  // now have to update all uses of the value to use either the original value,
+  // the cloned value, or some PHI derived value.  This can require arbitrary
+  // PHI insertion, of which we are prepared to do, clean these up now.
+  SSAUpdater SSAUpdate;
+  SmallVector<Use*, 16> UsesToRename;
+  for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+    // Scan all uses of this instruction to see if it is used outside of its
+    // block, and if so, record them in UsesToRename.
+    for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
+         ++UI) {
+      Instruction *User = cast<Instruction>(*UI);
+      if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+        if (UserPN->getIncomingBlock(UI) == BB)
+          continue;
+      } else if (User->getParent() == BB)
+        continue;
+
+      UsesToRename.push_back(&UI.getUse());
+    }
+
+    // If there are no uses outside the block, we're done with this instruction.
+    if (UsesToRename.empty())
+      continue;
+
+    DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n");
+
+    // We found a use of I outside of BB.  Rename all uses of I that are outside
+    // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
+    // with the two values we know.
+    SSAUpdate.Initialize(I->getType(), I->getName());
+    SSAUpdate.AddAvailableValue(BB, I);
+    SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]);
+
+    while (!UsesToRename.empty())
+      SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+    DEBUG(dbgs() << "\n");
+  }
+
+
+  // Ok, NewBB is good to go.  Update the terminator of PredBB to jump to
+  // NewBB instead of BB.  This eliminates predecessors from BB, which requires
+  // us to simplify any PHI nodes in BB.
+  TerminatorInst *PredTerm = PredBB->getTerminator();
+  for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+    if (PredTerm->getSuccessor(i) == BB) {
+      BB->removePredecessor(PredBB, true);
+      PredTerm->setSuccessor(i, NewBB);
+    }
+
+  // At this point, the IR is fully up to date and consistent.  Do a quick scan
+  // over the new instructions and zap any that are constants or dead.  This
+  // frequently happens because of phi translation.
+  SimplifyInstructionsInBlock(NewBB, TD, TLI);
+
+  // Threaded an edge!
+  ++NumThreads;
+  return true;
+}
+
+/// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
+/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
+/// If we can duplicate the contents of BB up into PredBB do so now, this
+/// improves the odds that the branch will be on an analyzable instruction like
+/// a compare.
+bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
+                                 const SmallVectorImpl<BasicBlock *> &PredBBs) {
+  assert(!PredBBs.empty() && "Can't handle an empty set");
+
+  // If BB is a loop header, then duplicating this block outside the loop would
+  // cause us to transform this into an irreducible loop, don't do this.
+  // See the comments above FindLoopHeaders for justifications and caveats.
+  if (LoopHeaders.count(BB)) {
+    DEBUG(dbgs() << "  Not duplicating loop header '" << BB->getName()
+          << "' into predecessor block '" << PredBBs[0]->getName()
+          << "' - it might create an irreducible loop!\n");
+    return false;
+  }
+
+  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, Threshold);
+  if (DuplicationCost > Threshold) {
+    DEBUG(dbgs() << "  Not duplicating BB '" << BB->getName()
+          << "' - Cost is too high: " << DuplicationCost << "\n");
+    return false;
+  }
+
+  // And finally, do it!  Start by factoring the predecessors is needed.
+  BasicBlock *PredBB;
+  if (PredBBs.size() == 1)
+    PredBB = PredBBs[0];
+  else {
+    DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
+          << " common predecessors.\n");
+    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm", this);
+  }
+
+  // Okay, we decided to do this!  Clone all the instructions in BB onto the end
+  // of PredBB.
+  DEBUG(dbgs() << "  Duplicating block '" << BB->getName() << "' into end of '"
+        << PredBB->getName() << "' to eliminate branch on phi.  Cost: "
+        << DuplicationCost << " block is:" << *BB << "\n");
+
+  // Unless PredBB ends with an unconditional branch, split the edge so that we
+  // can just clone the bits from BB into the end of the new PredBB.
+  BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+
+  if (OldPredBranch == 0 || !OldPredBranch->isUnconditional()) {
+    PredBB = SplitEdge(PredBB, BB, this);
+    OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+  }
+
+  // We are going to have to map operands from the original BB block into the
+  // PredBB block.  Evaluate PHI nodes in BB.
+  DenseMap<Instruction*, Value*> ValueMapping;
+
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+  // Clone the non-phi instructions of BB into PredBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; BI != BB->end(); ++BI) {
+    Instruction *New = BI->clone();
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+
+    // If this instruction can be simplified after the operands are updated,
+    // just use the simplified value instead.  This frequently happens due to
+    // phi translation.
+    if (Value *IV = SimplifyInstruction(New, TD)) {
+      delete New;
+      ValueMapping[BI] = IV;
+    } else {
+      // Otherwise, insert the new instruction into the block.
+      New->setName(BI->getName());
+      PredBB->getInstList().insert(OldPredBranch, New);
+      ValueMapping[BI] = New;
+    }
+  }
+
+  // Check to see if the targets of the branch had PHI nodes. If so, we need to
+  // add entries to the PHI nodes for branch from PredBB now.
+  BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
+  AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
+                                  ValueMapping);
+  AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
+                                  ValueMapping);
+
+  // If there were values defined in BB that are used outside the block, then we
+  // now have to update all uses of the value to use either the original value,
+  // the cloned value, or some PHI derived value.  This can require arbitrary
+  // PHI insertion, of which we are prepared to do, clean these up now.
+  SSAUpdater SSAUpdate;
+  SmallVector<Use*, 16> UsesToRename;
+  for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+    // Scan all uses of this instruction to see if it is used outside of its
+    // block, and if so, record them in UsesToRename.
+    for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
+         ++UI) {
+      Instruction *User = cast<Instruction>(*UI);
+      if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+        if (UserPN->getIncomingBlock(UI) == BB)
+          continue;
+      } else if (User->getParent() == BB)
+        continue;
+
+      UsesToRename.push_back(&UI.getUse());
+    }
+
+    // If there are no uses outside the block, we're done with this instruction.
+    if (UsesToRename.empty())
+      continue;
+
+    DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n");
+
+    // We found a use of I outside of BB.  Rename all uses of I that are outside
+    // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
+    // with the two values we know.
+    SSAUpdate.Initialize(I->getType(), I->getName());
+    SSAUpdate.AddAvailableValue(BB, I);
+    SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]);
+
+    while (!UsesToRename.empty())
+      SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+    DEBUG(dbgs() << "\n");
+  }
+
+  // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
+  // that we nuked.
+  BB->removePredecessor(PredBB, true);
+
+  // Remove the unconditional branch at the end of the PredBB block.
+  OldPredBranch->eraseFromParent();
+
+  ++NumDupes;
+  return true;
+}
+
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
new file mode 100644
index 000000000000..f94cd2a073ef
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -0,0 +1,895 @@
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible.  It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe.  This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
+// This pass uses alias analysis for two purposes:
+//
+//  1. Moving loop invariant loads and calls out of loops.  If we can determine
+//     that a load or call inside of a loop never aliases anything stored to,
+//     we can hoist it or sink it like any other instruction.
+//  2. Scalar Promotion of Memory - If there is a store instruction inside of
+//     the loop, we try to move the store to happen AFTER the loop instead of
+//     inside of the loop.  This can only happen if a few conditions are true:
+//       A. The pointer stored through is loop invariant
+//       B. There are no stores or loads in the loop which _may_ alias the
+//          pointer.  There are no calls in the loop which mod/ref the pointer.
+//     If these conditions are true, we can promote the loads and stores in the
+//     loop of the pointer to use a temporary alloca'd variable.  We then use
+//     the SSAUpdater to construct the appropriate SSA form for the value.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "licm"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumSunk      , "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted   , "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted  , "Number of memory locations promoted to registers");
+
+static cl::opt<bool>
+DisablePromotion("disable-licm-promotion", cl::Hidden,
+                 cl::desc("Disable memory promotion in LICM pass"));
+
+namespace {
+  struct LICM : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LICM() : LoopPass(ID) {
+      initializeLICMPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequired<AliasAnalysis>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved("scalar-evolution");
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequired<TargetLibraryInfo>();
+    }
+
+    using llvm::Pass::doFinalization;
+
+    bool doFinalization() {
+      assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets");
+      return false;
+    }
+
+  private:
+    AliasAnalysis *AA;       // Current AliasAnalysis information
+    LoopInfo      *LI;       // Current LoopInfo
+    DominatorTree *DT;       // Dominator Tree for the current Loop.
+
+    DataLayout *TD;          // DataLayout for constant folding.
+    TargetLibraryInfo *TLI;  // TargetLibraryInfo for constant folding.
+
+    // State that is updated as we process loops.
+    bool Changed;            // Set to true when we change anything.
+    BasicBlock *Preheader;   // The preheader block of the current loop...
+    Loop *CurLoop;           // The current loop we are working on...
+    AliasSetTracker *CurAST; // AliasSet information for the current loop...
+    bool MayThrow;           // The current loop contains an instruction which
+                             // may throw, thus preventing code motion of
+                             // instructions with side effects.
+    DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
+
+    /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+    void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L);
+
+    /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+    /// set.
+    void deleteAnalysisValue(Value *V, Loop *L);
+
+    /// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+    /// dominated by the specified block, and that are in the current loop) in
+    /// reverse depth first order w.r.t the DominatorTree.  This allows us to
+    /// visit uses before definitions, allowing us to sink a loop body in one
+    /// pass without iteration.
+    ///
+    void SinkRegion(DomTreeNode *N);
+
+    /// HoistRegion - Walk the specified region of the CFG (defined by all
+    /// blocks dominated by the specified block, and that are in the current
+    /// loop) in depth first order w.r.t the DominatorTree.  This allows us to
+    /// visit definitions before uses, allowing us to hoist a loop body in one
+    /// pass without iteration.
+    ///
+    void HoistRegion(DomTreeNode *N);
+
+    /// inSubLoop - Little predicate that returns true if the specified basic
+    /// block is in a subloop of the current one, not the current one itself.
+    ///
+    bool inSubLoop(BasicBlock *BB) {
+      assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+      return LI->getLoopFor(BB) != CurLoop;
+    }
+
+    /// sink - When an instruction is found to only be used outside of the loop,
+    /// this function moves it to the exit blocks and patches up SSA form as
+    /// needed.
+    ///
+    void sink(Instruction &I);
+
+    /// hoist - When an instruction is found to only use loop invariant operands
+    /// that is safe to hoist, this instruction is called to do the dirty work.
+    ///
+    void hoist(Instruction &I);
+
+    /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it
+    /// is not a trapping instruction or if it is a trapping instruction and is
+    /// guaranteed to execute.
+    ///
+    bool isSafeToExecuteUnconditionally(Instruction &I);
+
+    /// isGuaranteedToExecute - Check that the instruction is guaranteed to
+    /// execute.
+    ///
+    bool isGuaranteedToExecute(Instruction &I);
+
+    /// pointerInvalidatedByLoop - Return true if the body of this loop may
+    /// store into the memory location pointed to by V.
+    ///
+    bool pointerInvalidatedByLoop(Value *V, uint64_t Size,
+                                  const MDNode *TBAAInfo) {
+      // Check to see if any of the basic blocks in CurLoop invalidate *V.
+      return CurAST->getAliasSetForPointer(V, Size, TBAAInfo).isMod();
+    }
+
+    bool canSinkOrHoistInst(Instruction &I);
+    bool isNotUsedInLoop(Instruction &I);
+
+    void PromoteAliasSet(AliasSet &AS,
+                         SmallVectorImpl<BasicBlock*> &ExitBlocks,
+                         SmallVectorImpl<Instruction*> &InsertPts);
+  };
+}
+
+char LICM::ID = 0;
+INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false)
+
+Pass *llvm::createLICMPass() { return new LICM(); }
+
+/// Hoist expressions out of the specified loop. Note, alias info for inner
+/// loop is not preserved so it is not a good idea to run LICM multiple
+/// times on one loop.
+///
+bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+  Changed = false;
+
+  // Get our Loop and Alias Analysis information...
+  LI = &getAnalysis<LoopInfo>();
+  AA = &getAnalysis<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+
+  CurAST = new AliasSetTracker(*AA);
+  // Collect Alias info from subloops.
+  for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end();
+       LoopItr != LoopItrE; ++LoopItr) {
+    Loop *InnerL = *LoopItr;
+    AliasSetTracker *InnerAST = LoopToAliasSetMap[InnerL];
+    assert(InnerAST && "Where is my AST?");
+
+    // What if InnerLoop was modified by other passes ?
+    CurAST->add(*InnerAST);
+
+    // Once we've incorporated the inner loop's AST into ours, we don't need the
+    // subloop's anymore.
+    delete InnerAST;
+    LoopToAliasSetMap.erase(InnerL);
+  }
+
+  CurLoop = L;
+
+  // Get the preheader block to move instructions into...
+  Preheader = L->getLoopPreheader();
+
+  // Loop over the body of this loop, looking for calls, invokes, and stores.
+  // Because subloops have already been incorporated into AST, we skip blocks in
+  // subloops.
+  //
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+    if (LI->getLoopFor(BB) == L)        // Ignore blocks in subloops.
+      CurAST->add(*BB);                 // Incorporate the specified basic block
+  }
+
+  MayThrow = false;
+  // TODO: We've already searched for instructions which may throw in subloops.
+  // We may want to reuse this information.
+  for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end();
+       (BB != BBE) && !MayThrow ; ++BB)
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+         (I != E) && !MayThrow; ++I)
+      MayThrow |= I->mayThrow();
+
+  // We want to visit all of the instructions in this loop... that are not parts
+  // of our subloops (they have already had their invariants hoisted out of
+  // their loop, into this loop, so there is no need to process the BODIES of
+  // the subloops).
+  //
+  // Traverse the body of the loop in depth first order on the dominator tree so
+  // that we are guaranteed to see definitions before we see uses.  This allows
+  // us to sink instructions in one pass, without iteration.  After sinking
+  // instructions, we perform another pass to hoist them out of the loop.
+  //
+  if (L->hasDedicatedExits())
+    SinkRegion(DT->getNode(L->getHeader()));
+  if (Preheader)
+    HoistRegion(DT->getNode(L->getHeader()));
+
+  // Now that all loop invariants have been removed from the loop, promote any
+  // memory references to scalars that we can.
+  if (!DisablePromotion && Preheader && L->hasDedicatedExits()) {
+    SmallVector<BasicBlock *, 8> ExitBlocks;
+    SmallVector<Instruction *, 8> InsertPts;
+
+    // Loop over all of the alias sets in the tracker object.
+    for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
+         I != E; ++I)
+      PromoteAliasSet(*I, ExitBlocks, InsertPts);
+  }
+
+  // Clear out loops state information for the next iteration
+  CurLoop = 0;
+  Preheader = 0;
+
+  // If this loop is nested inside of another one, save the alias information
+  // for when we process the outer loop.
+  if (L->getParentLoop())
+    LoopToAliasSetMap[L] = CurAST;
+  else
+    delete CurAST;
+  return Changed;
+}
+
+/// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in
+/// reverse depth first order w.r.t the DominatorTree.  This allows us to visit
+/// uses before definitions, allowing us to sink a loop body in one pass without
+/// iteration.
+///
+void LICM::SinkRegion(DomTreeNode *N) {
+  assert(N != 0 && "Null dominator tree node?");
+  BasicBlock *BB = N->getBlock();
+
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return;
+
+  // We are processing blocks in reverse dfo, so process children first.
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (unsigned i = 0, e = Children.size(); i != e; ++i)
+    SinkRegion(Children[i]);
+
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (inSubLoop(BB)) return;
+
+  for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) {
+    Instruction &I = *--II;
+
+    // If the instruction is dead, we would try to sink it because it isn't used
+    // in the loop, instead, just delete it.
+    if (isInstructionTriviallyDead(&I, TLI)) {
+      DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+      ++II;
+      CurAST->deleteValue(&I);
+      I.eraseFromParent();
+      Changed = true;
+      continue;
+    }
+
+    // Check to see if we can sink this instruction to the exit blocks
+    // of the loop.  We can do this if the all users of the instruction are
+    // outside of the loop.  In this case, it doesn't even matter if the
+    // operands of the instruction are loop invariant.
+    //
+    if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) {
+      ++II;
+      sink(I);
+    }
+  }
+}
+
+/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in depth
+/// first order w.r.t the DominatorTree.  This allows us to visit definitions
+/// before uses, allowing us to hoist a loop body in one pass without iteration.
+///
+void LICM::HoistRegion(DomTreeNode *N) {
+  assert(N != 0 && "Null dominator tree node?");
+  BasicBlock *BB = N->getBlock();
+
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return;
+
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (!inSubLoop(BB))
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
+      Instruction &I = *II++;
+
+      // Try constant folding this instruction.  If all the operands are
+      // constants, it is technically hoistable, but it would be better to just
+      // fold it.
+      if (Constant *C = ConstantFoldInstruction(&I, TD, TLI)) {
+        DEBUG(dbgs() << "LICM folding inst: " << I << "  --> " << *C << '\n');
+        CurAST->copyValue(&I, C);
+        CurAST->deleteValue(&I);
+        I.replaceAllUsesWith(C);
+        I.eraseFromParent();
+        continue;
+      }
+
+      // Try hoisting the instruction out to the preheader.  We can only do this
+      // if all of the operands of the instruction are loop invariant and if it
+      // is safe to hoist the instruction.
+      //
+      if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I) &&
+          isSafeToExecuteUnconditionally(I))
+        hoist(I);
+    }
+
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (unsigned i = 0, e = Children.size(); i != e; ++i)
+    HoistRegion(Children[i]);
+}
+
+/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
+/// instruction.
+///
+bool LICM::canSinkOrHoistInst(Instruction &I) {
+  // Loads have extra constraints we have to verify before we can hoist them.
+  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+    if (!LI->isUnordered())
+      return false;        // Don't hoist volatile/atomic loads!
+
+    // Loads from constant memory are always safe to move, even if they end up
+    // in the same alias set as something that ends up being modified.
+    if (AA->pointsToConstantMemory(LI->getOperand(0)))
+      return true;
+    if (LI->getMetadata("invariant.load"))
+      return true;
+
+    // Don't hoist loads which have may-aliased stores in loop.
+    uint64_t Size = 0;
+    if (LI->getType()->isSized())
+      Size = AA->getTypeStoreSize(LI->getType());
+    return !pointerInvalidatedByLoop(LI->getOperand(0), Size,
+                                     LI->getMetadata(LLVMContext::MD_tbaa));
+  } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+    // Don't sink or hoist dbg info; it's legal, but not useful.
+    if (isa<DbgInfoIntrinsic>(I))
+      return false;
+
+    // Handle simple cases by querying alias analysis.
+    AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI);
+    if (Behavior == AliasAnalysis::DoesNotAccessMemory)
+      return true;
+    if (AliasAnalysis::onlyReadsMemory(Behavior)) {
+      // If this call only reads from memory and there are no writes to memory
+      // in the loop, we can hoist or sink the call as appropriate.
+      bool FoundMod = false;
+      for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
+           I != E; ++I) {
+        AliasSet &AS = *I;
+        if (!AS.isForwardingAliasSet() && AS.isMod()) {
+          FoundMod = true;
+          break;
+        }
+      }
+      if (!FoundMod) return true;
+    }
+
+    // FIXME: This should use mod/ref information to see if we can hoist or
+    // sink the call.
+
+    return false;
+  }
+
+  // Only these instructions are hoistable/sinkable.
+  if (!isa<BinaryOperator>(I) && !isa<CastInst>(I) && !isa<SelectInst>(I) &&
+      !isa<GetElementPtrInst>(I) && !isa<CmpInst>(I) &&
+      !isa<InsertElementInst>(I) && !isa<ExtractElementInst>(I) &&
+      !isa<ShuffleVectorInst>(I) && !isa<ExtractValueInst>(I) &&
+      !isa<InsertValueInst>(I))
+    return false;
+
+  return isSafeToExecuteUnconditionally(I);
+}
+
+/// isNotUsedInLoop - Return true if the only users of this instruction are
+/// outside of the loop.  If this is true, we can sink the instruction to the
+/// exit blocks of the loop.
+///
+bool LICM::isNotUsedInLoop(Instruction &I) {
+  for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    if (PHINode *PN = dyn_cast<PHINode>(User)) {
+      // PHI node uses occur in predecessor blocks!
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingValue(i) == &I)
+          if (CurLoop->contains(PN->getIncomingBlock(i)))
+            return false;
+    } else if (CurLoop->contains(User)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+
+/// sink - When an instruction is found to only be used outside of the loop,
+/// this function moves it to the exit blocks and patches up SSA form as needed.
+/// This method is guaranteed to remove the original instruction from its
+/// position, and may either delete it or move it to outside of the loop.
+///
+void LICM::sink(Instruction &I) {
+  DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+  if (isa<LoadInst>(I)) ++NumMovedLoads;
+  else if (isa<CallInst>(I)) ++NumMovedCalls;
+  ++NumSunk;
+  Changed = true;
+
+  // The case where there is only a single exit node of this loop is common
+  // enough that we handle it as a special (more efficient) case.  It is more
+  // efficient to handle because there are no PHI nodes that need to be placed.
+  if (ExitBlocks.size() == 1) {
+    if (!DT->dominates(I.getParent(), ExitBlocks[0])) {
+      // Instruction is not used, just delete it.
+      CurAST->deleteValue(&I);
+      // If I has users in unreachable blocks, eliminate.
+      // If I is not void type then replaceAllUsesWith undef.
+      // This allows ValueHandlers and custom metadata to adjust itself.
+      if (!I.use_empty())
+        I.replaceAllUsesWith(UndefValue::get(I.getType()));
+      I.eraseFromParent();
+    } else {
+      // Move the instruction to the start of the exit block, after any PHI
+      // nodes in it.
+      I.moveBefore(ExitBlocks[0]->getFirstInsertionPt());
+
+      // This instruction is no longer in the AST for the current loop, because
+      // we just sunk it out of the loop.  If we just sunk it into an outer
+      // loop, we will rediscover the operation when we process it.
+      CurAST->deleteValue(&I);
+    }
+    return;
+  }
+
+  if (ExitBlocks.empty()) {
+    // The instruction is actually dead if there ARE NO exit blocks.
+    CurAST->deleteValue(&I);
+    // If I has users in unreachable blocks, eliminate.
+    // If I is not void type then replaceAllUsesWith undef.
+    // This allows ValueHandlers and custom metadata to adjust itself.
+    if (!I.use_empty())
+      I.replaceAllUsesWith(UndefValue::get(I.getType()));
+    I.eraseFromParent();
+    return;
+  }
+
+  // Otherwise, if we have multiple exits, use the SSAUpdater to do all of the
+  // hard work of inserting PHI nodes as necessary.
+  SmallVector<PHINode*, 8> NewPHIs;
+  SSAUpdater SSA(&NewPHIs);
+
+  if (!I.use_empty())
+    SSA.Initialize(I.getType(), I.getName());
+
+  // Insert a copy of the instruction in each exit block of the loop that is
+  // dominated by the instruction.  Each exit block is known to only be in the
+  // ExitBlocks list once.
+  BasicBlock *InstOrigBB = I.getParent();
+  unsigned NumInserted = 0;
+
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitBlock = ExitBlocks[i];
+
+    if (!DT->dominates(InstOrigBB, ExitBlock))
+      continue;
+
+    // Insert the code after the last PHI node.
+    BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
+
+    // If this is the first exit block processed, just move the original
+    // instruction, otherwise clone the original instruction and insert
+    // the copy.
+    Instruction *New;
+    if (NumInserted++ == 0) {
+      I.moveBefore(InsertPt);
+      New = &I;
+    } else {
+      New = I.clone();
+      if (!I.getName().empty())
+        New->setName(I.getName()+".le");
+      ExitBlock->getInstList().insert(InsertPt, New);
+    }
+
+    // Now that we have inserted the instruction, inform SSAUpdater.
+    if (!I.use_empty())
+      SSA.AddAvailableValue(ExitBlock, New);
+  }
+
+  // If the instruction doesn't dominate any exit blocks, it must be dead.
+  if (NumInserted == 0) {
+    CurAST->deleteValue(&I);
+    if (!I.use_empty())
+      I.replaceAllUsesWith(UndefValue::get(I.getType()));
+    I.eraseFromParent();
+    return;
+  }
+
+  // Next, rewrite uses of the instruction, inserting PHI nodes as needed.
+  for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE; ) {
+    // Grab the use before incrementing the iterator.
+    Use &U = UI.getUse();
+    // Increment the iterator before removing the use from the list.
+    ++UI;
+    SSA.RewriteUseAfterInsertions(U);
+  }
+
+  // Update CurAST for NewPHIs if I had pointer type.
+  if (I.getType()->isPointerTy())
+    for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i)
+      CurAST->copyValue(&I, NewPHIs[i]);
+
+  // Finally, remove the instruction from CurAST.  It is no longer in the loop.
+  CurAST->deleteValue(&I);
+}
+
+/// hoist - When an instruction is found to only use loop invariant operands
+/// that is safe to hoist, this instruction is called to do the dirty work.
+///
+void LICM::hoist(Instruction &I) {
+  DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": "
+        << I << "\n");
+
+  // Move the new node to the Preheader, before its terminator.
+  I.moveBefore(Preheader->getTerminator());
+
+  if (isa<LoadInst>(I)) ++NumMovedLoads;
+  else if (isa<CallInst>(I)) ++NumMovedCalls;
+  ++NumHoisted;
+  Changed = true;
+}
+
+/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is
+/// not a trapping instruction or if it is a trapping instruction and is
+/// guaranteed to execute.
+///
+bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
+  // If it is not a trapping instruction, it is always safe to hoist.
+  if (isSafeToSpeculativelyExecute(&Inst))
+    return true;
+
+  return isGuaranteedToExecute(Inst);
+}
+
+bool LICM::isGuaranteedToExecute(Instruction &Inst) {
+
+  // Somewhere in this loop there is an instruction which may throw and make us
+  // exit the loop.
+  if (MayThrow)
+    return false;
+
+  // Otherwise we have to check to make sure that the instruction dominates all
+  // of the exit blocks.  If it doesn't, then there is a path out of the loop
+  // which does not execute this instruction, so we can't hoist it.
+
+  // If the instruction is in the header block for the loop (which is very
+  // common), it is always guaranteed to dominate the exit blocks.  Since this
+  // is a common case, and can save some work, check it now.
+  if (Inst.getParent() == CurLoop->getHeader())
+    return true;
+
+  // Get the exit blocks for the current loop.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+
+  // Verify that the block dominates each of the exit blocks of the loop.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (!DT->dominates(Inst.getParent(), ExitBlocks[i]))
+      return false;
+
+  // As a degenerate case, if the loop is statically infinite then we haven't
+  // proven anything since there are no exit blocks.
+  if (ExitBlocks.empty())
+    return false;
+
+  return true;
+}
+
+namespace {
+  class LoopPromoter : public LoadAndStorePromoter {
+    Value *SomePtr;  // Designated pointer to store to.
+    SmallPtrSet<Value*, 4> &PointerMustAliases;
+    SmallVectorImpl<BasicBlock*> &LoopExitBlocks;
+    SmallVectorImpl<Instruction*> &LoopInsertPts;
+    AliasSetTracker &AST;
+    DebugLoc DL;
+    int Alignment;
+    MDNode *TBAATag;
+  public:
+    LoopPromoter(Value *SP,
+                 const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+                 SmallPtrSet<Value*, 4> &PMA,
+                 SmallVectorImpl<BasicBlock*> &LEB,
+                 SmallVectorImpl<Instruction*> &LIP,
+                 AliasSetTracker &ast, DebugLoc dl, int alignment,
+                 MDNode *TBAATag)
+      : LoadAndStorePromoter(Insts, S), SomePtr(SP),
+        PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP),
+        AST(ast), DL(dl), Alignment(alignment), TBAATag(TBAATag) {}
+
+    virtual bool isInstInList(Instruction *I,
+                              const SmallVectorImpl<Instruction*> &) const {
+      Value *Ptr;
+      if (LoadInst *LI = dyn_cast<LoadInst>(I))
+        Ptr = LI->getOperand(0);
+      else
+        Ptr = cast<StoreInst>(I)->getPointerOperand();
+      return PointerMustAliases.count(Ptr);
+    }
+
+    virtual void doExtraRewritesBeforeFinalDeletion() const {
+      // Insert stores after in the loop exit blocks.  Each exit block gets a
+      // store of the live-out values that feed them.  Since we've already told
+      // the SSA updater about the defs in the loop and the preheader
+      // definition, it is all set and we can start using it.
+      for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
+        BasicBlock *ExitBlock = LoopExitBlocks[i];
+        Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+        Instruction *InsertPos = LoopInsertPts[i];
+        StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos);
+        NewSI->setAlignment(Alignment);
+        NewSI->setDebugLoc(DL);
+        if (TBAATag) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+      }
+    }
+
+    virtual void replaceLoadWithValue(LoadInst *LI, Value *V) const {
+      // Update alias analysis.
+      AST.copyValue(LI, V);
+    }
+    virtual void instructionDeleted(Instruction *I) const {
+      AST.deleteValue(I);
+    }
+  };
+} // end anon namespace
+
+/// PromoteAliasSet - Try to promote memory values to scalars by sinking
+/// stores out of the loop and moving loads to before the loop.  We do this by
+/// looping over the stores in the loop, looking for stores to Must pointers
+/// which are loop invariant.
+///
+void LICM::PromoteAliasSet(AliasSet &AS,
+                           SmallVectorImpl<BasicBlock*> &ExitBlocks,
+                           SmallVectorImpl<Instruction*> &InsertPts) {
+  // We can promote this alias set if it has a store, if it is a "Must" alias
+  // set, if the pointer is loop invariant, and if we are not eliminating any
+  // volatile loads or stores.
+  if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+      AS.isVolatile() || !CurLoop->isLoopInvariant(AS.begin()->getValue()))
+    return;
+
+  assert(!AS.empty() &&
+         "Must alias set should have at least one pointer element in it!");
+  Value *SomePtr = AS.begin()->getValue();
+
+  // It isn't safe to promote a load/store from the loop if the load/store is
+  // conditional.  For example, turning:
+  //
+  //    for () { if (c) *P += 1; }
+  //
+  // into:
+  //
+  //    tmp = *P;  for () { if (c) tmp +=1; } *P = tmp;
+  //
+  // is not safe, because *P may only be valid to access if 'c' is true.
+  //
+  // It is safe to promote P if all uses are direct load/stores and if at
+  // least one is guaranteed to be executed.
+  bool GuaranteedToExecute = false;
+
+  SmallVector<Instruction*, 64> LoopUses;
+  SmallPtrSet<Value*, 4> PointerMustAliases;
+
+  // We start with an alignment of one and try to find instructions that allow
+  // us to prove better alignment.
+  unsigned Alignment = 1;
+  MDNode *TBAATag = 0;
+
+  // Check that all of the pointers in the alias set have the same type.  We
+  // cannot (yet) promote a memory location that is loaded and stored in
+  // different sizes.  While we are at it, collect alignment and TBAA info.
+  for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
+    Value *ASIV = ASI->getValue();
+    PointerMustAliases.insert(ASIV);
+
+    // Check that all of the pointers in the alias set have the same type.  We
+    // cannot (yet) promote a memory location that is loaded and stored in
+    // different sizes.
+    if (SomePtr->getType() != ASIV->getType())
+      return;
+
+    for (Value::use_iterator UI = ASIV->use_begin(), UE = ASIV->use_end();
+         UI != UE; ++UI) {
+      // Ignore instructions that are outside the loop.
+      Instruction *Use = dyn_cast<Instruction>(*UI);
+      if (!Use || !CurLoop->contains(Use))
+        continue;
+
+      // If there is an non-load/store instruction in the loop, we can't promote
+      // it.
+      if (LoadInst *load = dyn_cast<LoadInst>(Use)) {
+        assert(!load->isVolatile() && "AST broken");
+        if (!load->isSimple())
+          return;
+      } else if (StoreInst *store = dyn_cast<StoreInst>(Use)) {
+        // Stores *of* the pointer are not interesting, only stores *to* the
+        // pointer.
+        if (Use->getOperand(1) != ASIV)
+          continue;
+        assert(!store->isVolatile() && "AST broken");
+        if (!store->isSimple())
+          return;
+
+        // Note that we only check GuaranteedToExecute inside the store case
+        // so that we do not introduce stores where they did not exist before
+        // (which would break the LLVM concurrency model).
+
+        // If the alignment of this instruction allows us to specify a more
+        // restrictive (and performant) alignment and if we are sure this
+        // instruction will be executed, update the alignment.
+        // Larger is better, with the exception of 0 being the best alignment.
+        unsigned InstAlignment = store->getAlignment();
+        if ((InstAlignment > Alignment || InstAlignment == 0) && Alignment != 0)
+          if (isGuaranteedToExecute(*Use)) {
+            GuaranteedToExecute = true;
+            Alignment = InstAlignment;
+          }
+
+        if (!GuaranteedToExecute)
+          GuaranteedToExecute = isGuaranteedToExecute(*Use);
+
+      } else
+        return; // Not a load or store.
+
+      // Merge the TBAA tags.
+      if (LoopUses.empty()) {
+        // On the first load/store, just take its TBAA tag.
+        TBAATag = Use->getMetadata(LLVMContext::MD_tbaa);
+      } else if (TBAATag) {
+        TBAATag = MDNode::getMostGenericTBAA(TBAATag,
+                                       Use->getMetadata(LLVMContext::MD_tbaa));
+      }
+      
+      LoopUses.push_back(Use);
+    }
+  }
+
+  // If there isn't a guaranteed-to-execute instruction, we can't promote.
+  if (!GuaranteedToExecute)
+    return;
+
+  // Otherwise, this is safe to promote, lets do it!
+  DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " <<*SomePtr<<'\n');
+  Changed = true;
+  ++NumPromoted;
+
+  // Grab a debug location for the inserted loads/stores; given that the
+  // inserted loads/stores have little relation to the original loads/stores,
+  // this code just arbitrarily picks a location from one, since any debug
+  // location is better than none.
+  DebugLoc DL = LoopUses[0]->getDebugLoc();
+
+  // Figure out the loop exits and their insertion points, if this is the
+  // first promotion.
+  if (ExitBlocks.empty()) {
+    CurLoop->getUniqueExitBlocks(ExitBlocks);
+    InsertPts.resize(ExitBlocks.size());
+    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+      InsertPts[i] = ExitBlocks[i]->getFirstInsertionPt();
+  }
+
+  // We use the SSAUpdater interface to insert phi nodes as required.
+  SmallVector<PHINode*, 16> NewPHIs;
+  SSAUpdater SSA(&NewPHIs);
+  LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
+                        InsertPts, *CurAST, DL, Alignment, TBAATag);
+
+  // Set up the preheader to have a definition of the value.  It is the live-out
+  // value from the preheader that uses in the loop will use.
+  LoadInst *PreheaderLoad =
+    new LoadInst(SomePtr, SomePtr->getName()+".promoted",
+                 Preheader->getTerminator());
+  PreheaderLoad->setAlignment(Alignment);
+  PreheaderLoad->setDebugLoc(DL);
+  if (TBAATag) PreheaderLoad->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+  SSA.AddAvailableValue(Preheader, PreheaderLoad);
+
+  // Rewrite all the loads in the loop and remember all the definitions from
+  // stores in the loop.
+  Promoter.run(LoopUses);
+
+  // If the SSAUpdater didn't use the load in the preheader, just zap it now.
+  if (PreheaderLoad->use_empty())
+    PreheaderLoad->eraseFromParent();
+}
+
+
+/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
+void LICM::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To, Loop *L) {
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+  if (!AST)
+    return;
+
+  AST->copyValue(From, To);
+}
+
+/// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
+/// set.
+void LICM::deleteAnalysisValue(Value *V, Loop *L) {
+  AliasSetTracker *AST = LoopToAliasSetMap.lookup(L);
+  if (!AST)
+    return;
+
+  AST->deleteValue(V);
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
new file mode 100644
index 000000000000..0b62050b17a0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -0,0 +1,250 @@
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-delete"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+using namespace llvm;
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+namespace {
+  class LoopDeletion : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopDeletion() : LoopPass(ID) {
+      initializeLoopDeletionPass(*PassRegistry::getPassRegistry());
+    }
+
+    // Possibly eliminate loop L if it is dead.
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<LoopInfo>();
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreservedID(LCSSAID);
+    }
+
+  private:
+    bool isLoopDead(Loop *L, SmallVector<BasicBlock*, 4> &exitingBlocks,
+                    SmallVector<BasicBlock*, 4> &exitBlocks,
+                    bool &Changed, BasicBlock *Preheader);
+
+  };
+}
+
+char LoopDeletion::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion",
+                "Delete dead loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopDeletion, "loop-deletion",
+                "Delete dead loops", false, false)
+
+Pass *llvm::createLoopDeletionPass() {
+  return new LoopDeletion();
+}
+
+/// isLoopDead - Determined if a loop is dead.  This assumes that we've already
+/// checked for unique exit and exiting blocks, and that the code is in LCSSA
+/// form.
+bool LoopDeletion::isLoopDead(Loop *L,
+                              SmallVector<BasicBlock*, 4> &exitingBlocks,
+                              SmallVector<BasicBlock*, 4> &exitBlocks,
+                              bool &Changed, BasicBlock *Preheader) {
+  BasicBlock *exitBlock = exitBlocks[0];
+
+  // Make sure that all PHI entries coming from the loop are loop invariant.
+  // Because the code is in LCSSA form, any values used outside of the loop
+  // must pass through a PHI in the exit block, meaning that this check is
+  // sufficient to guarantee that no loop-variant values are used outside
+  // of the loop.
+  BasicBlock::iterator BI = exitBlock->begin();
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
+    Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
+
+    // Make sure all exiting blocks produce the same incoming value for the exit
+    // block.  If there are different incoming values for different exiting
+    // blocks, then it is impossible to statically determine which value should
+    // be used.
+    for (unsigned i = 1, e = exitingBlocks.size(); i < e; ++i) {
+      if (incoming != P->getIncomingValueForBlock(exitingBlocks[i]))
+        return false;
+    }
+
+    if (Instruction *I = dyn_cast<Instruction>(incoming))
+      if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator()))
+        return false;
+
+    ++BI;
+  }
+
+  // Make sure that no instructions in the block have potential side-effects.
+  // This includes instructions that could write to memory, and loads that are
+  // marked volatile.  This could be made more aggressive by using aliasing
+  // information to identify readonly and readnone calls.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI) {
+    for (BasicBlock::iterator BI = (*LI)->begin(), BE = (*LI)->end();
+         BI != BE; ++BI) {
+      if (BI->mayHaveSideEffects())
+        return false;
+    }
+  }
+
+  return true;
+}
+
+/// runOnLoop - Remove dead loops, by which we mean loops that do not impact the
+/// observable behavior of the program other than finite running time.  Note
+/// we do ensure that this never remove a loop that might be infinite, as doing
+/// so could change the halting/non-halting nature of a program.
+/// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA
+/// in order to make various safety checks work.
+bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {
+  // We can only remove the loop if there is a preheader that we can
+  // branch from after removing it.
+  BasicBlock *preheader = L->getLoopPreheader();
+  if (!preheader)
+    return false;
+
+  // If LoopSimplify form is not available, stay out of trouble.
+  if (!L->hasDedicatedExits())
+    return false;
+
+  // We can't remove loops that contain subloops.  If the subloops were dead,
+  // they would already have been removed in earlier executions of this pass.
+  if (L->begin() != L->end())
+    return false;
+
+  SmallVector<BasicBlock*, 4> exitingBlocks;
+  L->getExitingBlocks(exitingBlocks);
+
+  SmallVector<BasicBlock*, 4> exitBlocks;
+  L->getUniqueExitBlocks(exitBlocks);
+
+  // We require that the loop only have a single exit block.  Otherwise, we'd
+  // be in the situation of needing to be able to solve statically which exit
+  // block will be branched to, or trying to preserve the branching logic in
+  // a loop invariant manner.
+  if (exitBlocks.size() != 1)
+    return false;
+
+  // Finally, we have to check that the loop really is dead.
+  bool Changed = false;
+  if (!isLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader))
+    return Changed;
+
+  // Don't remove loops for which we can't solve the trip count.
+  // They could be infinite, in which case we'd be changing program behavior.
+  ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
+  const SCEV *S = SE.getMaxBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(S))
+    return Changed;
+
+  // Now that we know the removal is safe, remove the loop by changing the
+  // branch from the preheader to go to the single exit block.
+  BasicBlock *exitBlock = exitBlocks[0];
+
+  // Because we're deleting a large chunk of code at once, the sequence in which
+  // we remove things is very important to avoid invalidation issues.  Don't
+  // mess with this unless you have good reason and know what you're doing.
+
+  // Tell ScalarEvolution that the loop is deleted. Do this before
+  // deleting the loop so that ScalarEvolution can look at the loop
+  // to determine what it needs to clean up.
+  SE.forgetLoop(L);
+
+  // Connect the preheader directly to the exit block.
+  TerminatorInst *TI = preheader->getTerminator();
+  TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+
+  // Rewrite phis in the exit block to get their inputs from
+  // the preheader instead of the exiting block.
+  BasicBlock *exitingBlock = exitingBlocks[0];
+  BasicBlock::iterator BI = exitBlock->begin();
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
+    int j = P->getBasicBlockIndex(exitingBlock);
+    assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
+    P->setIncomingBlock(j, preheader);
+    for (unsigned i = 1; i < exitingBlocks.size(); ++i)
+      P->removeIncomingValue(exitingBlocks[i]);
+    ++BI;
+  }
+
+  // Update the dominator tree and remove the instructions and blocks that will
+  // be deleted from the reference counting scheme.
+  DominatorTree &DT = getAnalysis<DominatorTree>();
+  SmallVector<DomTreeNode*, 8> ChildNodes;
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI) {
+    // Move all of the block's children to be children of the preheader, which
+    // allows us to remove the domtree entry for the block.
+    ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
+    for (SmallVector<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(),
+         DE = ChildNodes.end(); DI != DE; ++DI) {
+      DT.changeImmediateDominator(*DI, DT[preheader]);
+    }
+
+    ChildNodes.clear();
+    DT.eraseNode(*LI);
+
+    // Remove the block from the reference counting scheme, so that we can
+    // delete it freely later.
+    (*LI)->dropAllReferences();
+  }
+
+  // Erase the instructions and the blocks without having to worry
+  // about ordering because we already dropped the references.
+  // NOTE: This iteration is safe because erasing the block does not remove its
+  // entry from the loop's block list.  We do that in the next section.
+  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
+       LI != LE; ++LI)
+    (*LI)->eraseFromParent();
+
+  // Finally, the blocks from loopinfo.  This has to happen late because
+  // otherwise our loop iterators won't work.
+  LoopInfo &loopInfo = getAnalysis<LoopInfo>();
+  SmallPtrSet<BasicBlock*, 8> blocks;
+  blocks.insert(L->block_begin(), L->block_end());
+  for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(),
+       E = blocks.end(); I != E; ++I)
+    loopInfo.removeBlock(*I);
+
+  // The last step is to inform the loop pass manager that we've
+  // eliminated this loop.
+  LPM.deleteLoopFromQueue(L);
+  Changed = true;
+
+  ++NumDeleted;
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
new file mode 100644
index 000000000000..8258719a0200
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -0,0 +1,1138 @@
+//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an idiom recognizer that transforms simple loops into a
+// non-loop form.  In cases that this kicks in, it can be a significant
+// performance win.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO List:
+//
+// Future loop memory idioms to recognize:
+//   memcmp, memmove, strlen, etc.
+// Future floating point idioms to recognize in -ffast-math mode:
+//   fpowi
+// Future integer operation idioms to recognize:
+//   ctpop, ctlz, cttz
+//
+// Beware that isel's default lowering for ctpop is highly inefficient for
+// i64 and larger types when i64 is legal and the value has few bits set.  It
+// would be good to enhance isel to emit a loop for ctpop in this case.
+//
+// We should enhance the memset/memcpy recognition to handle multiple stores in
+// the loop.  This would handle things like:
+//   void foo(_Complex float *P)
+//     for (i) { __real__(*P) = 0;  __imag__(*P) = 0; }
+//
+// We should enhance this to handle negative strides through memory.
+// Alternatively (and perhaps better) we could rely on an earlier pass to force
+// forward iteration through memory, which is generally better for cache
+// behavior.  Negative strides *do* happen for memset/memcpy loops.
+//
+// This could recognize common matrix multiplies and dot product idioms and
+// replace them with calls to BLAS (if linked in??).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-idiom"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
+STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+
+namespace {
+
+  class LoopIdiomRecognize;
+
+  /// This class defines some utility functions for loop idiom recognization.
+  class LIRUtil {
+  public:
+    /// Return true iff the block contains nothing but an uncondition branch
+    /// (aka goto instruction).
+    static bool isAlmostEmpty(BasicBlock *);
+
+    static BranchInst *getBranch(BasicBlock *BB) {
+      return dyn_cast<BranchInst>(BB->getTerminator());
+    }
+
+    /// Return the condition of the branch terminating the given basic block.
+    static Value *getBrCondtion(BasicBlock *);
+
+    /// Derive the precondition block (i.e the block that guards the loop 
+    /// preheader) from the given preheader.
+    static BasicBlock *getPrecondBb(BasicBlock *PreHead);
+  };
+
+  /// This class is to recoginize idioms of population-count conducted in
+  /// a noncountable loop. Currently it only recognizes this pattern:
+  /// \code
+  ///   while(x) {cnt++; ...; x &= x - 1; ...}
+  /// \endcode
+  class NclPopcountRecognize {
+    LoopIdiomRecognize &LIR;
+    Loop *CurLoop;
+    BasicBlock *PreCondBB;
+
+    typedef IRBuilder<> IRBuilderTy;
+
+  public:
+    explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR);
+    bool recognize();
+
+  private:
+    /// Take a glimpse of the loop to see if we need to go ahead recoginizing
+    /// the idiom.
+    bool preliminaryScreen();
+
+    /// Check if the given conditional branch is based on the comparison
+    /// beween a variable and zero, and if the variable is non-zero, the
+    /// control yeilds to the loop entry. If the branch matches the behavior,
+    /// the variable involved in the comparion is returned. This function will
+    /// be called to see if the precondition and postcondition of the loop 
+    /// are in desirable form.
+    Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const;
+
+    /// Return true iff the idiom is detected in the loop. and 1) \p CntInst
+    /// is set to the instruction counting the pupulation bit. 2) \p CntPhi
+    /// is set to the corresponding phi node. 3) \p Var is set to the value
+    /// whose population bits are being counted.
+    bool detectIdiom
+      (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const;
+
+    /// Insert ctpop intrinsic function and some obviously dead instructions.
+    void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var);
+
+    /// Create llvm.ctpop.* intrinsic function.
+    CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL);
+  };
+
+  class LoopIdiomRecognize : public LoopPass {
+    Loop *CurLoop;
+    const DataLayout *TD;
+    DominatorTree *DT;
+    ScalarEvolution *SE;
+    TargetLibraryInfo *TLI;
+    const TargetTransformInfo *TTI;
+  public:
+    static char ID;
+    explicit LoopIdiomRecognize() : LoopPass(ID) {
+      initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
+      TD = 0; DT = 0; SE = 0; TLI = 0; TTI = 0;
+    }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+                        SmallVectorImpl<BasicBlock*> &ExitBlocks);
+
+    bool processLoopStore(StoreInst *SI, const SCEV *BECount);
+    bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
+
+    bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
+                                 unsigned StoreAlignment,
+                                 Value *SplatValue, Instruction *TheStore,
+                                 const SCEVAddRecExpr *Ev,
+                                 const SCEV *BECount);
+    bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
+                                    const SCEVAddRecExpr *StoreEv,
+                                    const SCEVAddRecExpr *LoadEv,
+                                    const SCEV *BECount);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG.
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
+      AU.addRequired<AliasAnalysis>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<DominatorTree>();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<TargetLibraryInfo>();
+      AU.addRequired<TargetTransformInfo>();
+    }
+
+    const DataLayout *getDataLayout() {
+      return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>();
+    }
+
+    DominatorTree *getDominatorTree() {
+      return DT ? DT : (DT=&getAnalysis<DominatorTree>());
+    }
+
+    ScalarEvolution *getScalarEvolution() {
+      return SE ? SE : (SE = &getAnalysis<ScalarEvolution>());
+    }
+
+    TargetLibraryInfo *getTargetLibraryInfo() {
+      return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
+    }
+
+    const TargetTransformInfo *getTargetTransformInfo() {
+      return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>());
+    }
+
+    Loop *getLoop() const { return CurLoop; }
+
+  private:
+    bool runOnNoncountableLoop();
+    bool runOnCountableLoop();
+  };
+}
+
+char LoopIdiomRecognize::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
+                    false, false)
+
+Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
+
+/// deleteDeadInstruction - Delete this instruction.  Before we do, go through
+/// and zero out all the operands of this instruction.  If any of them become
+/// dead, delete them and the computation tree that feeds them.
+///
+static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
+                                  const TargetLibraryInfo *TLI) {
+  SmallVector<Instruction*, 32> NowDeadInsts;
+
+  NowDeadInsts.push_back(I);
+
+  // Before we touch this instruction, remove it from SE!
+  do {
+    Instruction *DeadInst = NowDeadInsts.pop_back_val();
+
+    // This instruction is dead, zap it, in stages.  Start by removing it from
+    // SCEV.
+    SE.forgetValue(DeadInst);
+
+    for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
+      Value *Op = DeadInst->getOperand(op);
+      DeadInst->setOperand(op, 0);
+
+      // If this operand just became dead, add it to the NowDeadInsts list.
+      if (!Op->use_empty()) continue;
+
+      if (Instruction *OpI = dyn_cast<Instruction>(Op))
+        if (isInstructionTriviallyDead(OpI, TLI))
+          NowDeadInsts.push_back(OpI);
+    }
+
+    DeadInst->eraseFromParent();
+
+  } while (!NowDeadInsts.empty());
+}
+
+/// deleteIfDeadInstruction - If the specified value is a dead instruction,
+/// delete it and any recursively used instructions.
+static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
+                                    const TargetLibraryInfo *TLI) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (isInstructionTriviallyDead(I, TLI))
+      deleteDeadInstruction(I, SE, TLI);
+}
+
+//===----------------------------------------------------------------------===//
+//
+//          Implementation of LIRUtil
+//
+//===----------------------------------------------------------------------===//
+
+// This fucntion will return true iff the given block contains nothing but goto. 
+// A typical usage of this function is to check if the preheader fucntion is 
+// "almost" empty such that generated intrinsic function can be moved across 
+// preheader and to be placed at the end of the preconditiona block without 
+// concerning of breaking data dependence.
+bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
+  if (BranchInst *Br = getBranch(BB)) {
+    return Br->isUnconditional() && BB->size() == 1;
+  }
+  return false;
+}
+
+Value *LIRUtil::getBrCondtion(BasicBlock *BB) {
+  BranchInst *Br = getBranch(BB);
+  return Br ? Br->getCondition() : 0;
+}
+
+BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) {
+  if (BasicBlock *BB = PreHead->getSinglePredecessor()) {
+    BranchInst *Br = getBranch(BB);
+    return Br && Br->isConditional() ? BB : 0;
+  }
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//          Implementation of NclPopcountRecognize
+//
+//===----------------------------------------------------------------------===//
+
+NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR):
+  LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) {
+}
+
+bool NclPopcountRecognize::preliminaryScreen() {
+  const TargetTransformInfo *TTI = LIR.getTargetTransformInfo();
+  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
+    return false;
+
+  // Counting population are usually conducted by few arithmetic instrutions.
+  // Such instructions can be easilly "absorbed" by vacant slots in a
+  // non-compact loop. Therefore, recognizing popcount idiom only makes sense
+  // in a compact loop.
+
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  BasicBlock *LoopBody = *(CurLoop->block_begin());
+  if (LoopBody->size() >= 20) {
+    // The loop is too big, bail out.
+    return false;
+  }
+
+  // It should have a preheader containing nothing but a goto instruction.
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead))
+    return false;
+
+  // It should have a precondition block where the generated popcount instrinsic
+  // function will be inserted.
+  PreCondBB = LIRUtil::getPrecondBb(PreHead);
+  if (!PreCondBB)
+    return false;
+ 
+  return true;
+}
+
+Value *NclPopcountRecognize::matchCondition (BranchInst *Br,
+                                             BasicBlock *LoopEntry) const {
+  if (!Br || !Br->isConditional())
+    return 0;
+
+  ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition());
+  if (!Cond)
+    return 0;
+
+  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+  if (!CmpZero || !CmpZero->isZero())
+    return 0;
+
+  ICmpInst::Predicate Pred = Cond->getPredicate();
+  if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) ||
+      (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry))
+    return Cond->getOperand(0);
+
+  return 0;
+}
+
+bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst,
+                                       PHINode *&CntPhi,
+                                       Value *&Var) const {
+  // Following code tries to detect this idiom:
+  //
+  //    if (x0 != 0)
+  //      goto loop-exit // the precondition of the loop
+  //    cnt0 = init-val;
+  //    do {
+  //       x1 = phi (x0, x2);
+  //       cnt1 = phi(cnt0, cnt2);
+  //
+  //       cnt2 = cnt1 + 1;
+  //        ...
+  //       x2 = x1 & (x1 - 1);
+  //        ...
+  //    } while(x != 0);
+  //
+  // loop-exit:
+  //
+
+  // step 1: Check to see if the look-back branch match this pattern:
+  //    "if (a!=0) goto loop-entry".
+  BasicBlock *LoopEntry;
+  Instruction *DefX2, *CountInst;
+  Value *VarX1, *VarX0;
+  PHINode *PhiX, *CountPhi;
+
+  DefX2 = CountInst = 0;
+  VarX1 = VarX0 = 0;
+  PhiX = CountPhi = 0;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  {
+    if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry))
+      DefX2 = dyn_cast<Instruction>(T);
+    else
+      return false;
+  }
+
+  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+  {
+    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
+      return false;
+
+    BinaryOperator *SubOneOp;
+
+    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+      VarX1 = DefX2->getOperand(1);
+    else {
+      VarX1 = DefX2->getOperand(0);
+      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+    }
+    if (!SubOneOp)
+      return false;
+
+    Instruction *SubInst = cast<Instruction>(SubOneOp);
+    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+    if (!Dec ||
+        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+          (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) {
+      return false;
+    }
+  }
+
+  // step 3: Check the recurrence of variable X
+  {
+    PhiX = dyn_cast<PHINode>(VarX1);
+    if (!PhiX ||
+        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
+      return false;
+    }
+  }
+
+  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
+  {
+    CountInst = NULL;
+    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(),
+           IterE = LoopEntry->end(); Iter != IterE; Iter++) {
+      Instruction *Inst = Iter;
+      if (Inst->getOpcode() != Instruction::Add)
+        continue;
+
+      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+      if (!Inc || !Inc->isOne())
+        continue;
+
+      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+      if (!Phi || Phi->getParent() != LoopEntry)
+        continue;
+
+      // Check if the result of the instruction is live of the loop.
+      bool LiveOutLoop = false;
+      for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
+             I != E;  I++) {
+        if ((cast<Instruction>(*I))->getParent() != LoopEntry) {
+          LiveOutLoop = true; break;
+        }
+      }
+
+      if (LiveOutLoop) {
+        CountInst = Inst;
+        CountPhi = Phi;
+        break;
+      }
+    }
+
+    if (!CountInst)
+      return false;
+  }
+
+  // step 5: check if the precondition is in this form:
+  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+  {
+    BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+    Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader());
+    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+      return false;
+
+    CntInst = CountInst;
+    CntPhi = CountPhi;
+    Var = T;
+  }
+
+  return true;
+}
+
+void NclPopcountRecognize::transform(Instruction *CntInst,
+                                     PHINode *CntPhi, Value *Var) {
+
+  ScalarEvolution *SE = LIR.getScalarEvolution();
+  TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo();
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+  const DebugLoc DL = CntInst->getDebugLoc();
+
+  // Assuming before transformation, the loop is following:
+  //  if (x) // the precondition
+  //     do { cnt++; x &= x - 1; } while(x);
+ 
+  // Step 1: Insert the ctpop instruction at the end of the precondition block
+  IRBuilderTy Builder(PreCondBr);
+  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+  {
+    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+    NewCount = PopCntZext =
+      Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+    if (NewCount != PopCnt)
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+    // TripCnt is exactly the number of iterations the loop has
+    TripCnt = NewCount;
+
+    // If the popoulation counter's initial value is not zero, insert Add Inst.
+    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+    if (!InitConst || !InitConst->isZero()) {
+      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+    }
+  }
+
+  // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to
+  //   "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic
+  //   function would be partial dead code, and downstream passes will drag
+  //   it back from the precondition block to the preheader.
+  {
+    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+    Value *Opnd0 = PopCntZext;
+    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+    if (PreCond->getOperand(0) != Var)
+      std::swap(Opnd0, Opnd1);
+
+    ICmpInst *NewPreCond =
+      cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+    PreCond->replaceAllUsesWith(NewPreCond);
+
+    deleteDeadInstruction(PreCond, *SE, TLI);
+  }
+
+  // Step 3: Note that the population count is exactly the trip count of the
+  // loop in question, which enble us to to convert the loop from noncountable
+  // loop into a countable one. The benefit is twofold:
+  //
+  //  - If the loop only counts population, the entire loop become dead after
+  //    the transformation. It is lots easier to prove a countable loop dead
+  //    than to prove a noncountable one. (In some C dialects, a infite loop
+  //    isn't dead even if it computes nothing useful. In general, DCE needs
+  //    to prove a noncountable loop finite before safely delete it.)
+  //
+  //  - If the loop also performs something else, it remains alive.
+  //    Since it is transformed to countable form, it can be aggressively
+  //    optimized by some optimizations which are in general not applicable
+  //    to a noncountable loop.
+  //
+  // After this step, this loop (conceptually) would look like following:
+  //   newcnt = __builtin_ctpop(x);
+  //   t = newcnt;
+  //   if (x)
+  //     do { cnt++; x &= x-1; t--) } while (t > 0);
+  BasicBlock *Body = *(CurLoop->block_begin());
+  {
+    BranchInst *LbBr = LIRUtil::getBranch(Body);
+    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+    Type *Ty = TripCnt->getType();
+
+    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin());
+
+    Builder.SetInsertPoint(LbCond);
+    Value *Opnd1 = cast<Value>(TcPhi);
+    Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1));
+    Instruction *TcDec =
+      cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true));
+
+    TcPhi->addIncoming(TripCnt, PreHead);
+    TcPhi->addIncoming(TcDec, Body);
+
+    CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ?
+      CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+    LbCond->setPredicate(Pred);
+    LbCond->setOperand(0, TcDec);
+    LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0)));
+  }
+
+  // Step 4: All the references to the original population counter outside
+  //  the loop are replaced with the NewCount -- the value returned from
+  //  __builtin_ctpop().
+  {
+    SmallVector<Value *, 4> CntUses;
+    for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end();
+         I != E; I++) {
+      if (cast<Instruction>(*I)->getParent() != Body)
+        CntUses.push_back(*I);
+    }
+    for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
+      (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
+    }
+  }
+
+  // step 5: Forget the "non-computable" trip-count SCEV associated with the
+  //   loop. The loop would otherwise not be deleted even if it becomes empty.
+  SE->forgetLoop(CurLoop);
+}
+
+CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, 
+                                                      Value *Val, DebugLoc DL) {
+  Value *Ops[] = { Val };
+  Type *Tys[] = { Val->getType() };
+
+  Module *M = (*(CurLoop->block_begin()))->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
+  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+  CI->setDebugLoc(DL);
+
+  return CI;
+}
+
+/// recognize - detect population count idiom in a non-countable loop. If
+///   detected, transform the relevant code to popcount intrinsic function
+///   call, and return true; otherwise, return false.
+bool NclPopcountRecognize::recognize() {
+
+  if (!LIR.getTargetTransformInfo())
+    return false;
+
+  LIR.getScalarEvolution();
+
+  if (!preliminaryScreen())
+    return false;
+
+  Instruction *CntInst;
+  PHINode *CntPhi;
+  Value *Val;
+  if (!detectIdiom(CntInst, CntPhi, Val))
+    return false;
+
+  transform(CntInst, CntPhi, Val);
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//          Implementation of LoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+bool LoopIdiomRecognize::runOnCountableLoop() {
+  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
+  if (isa<SCEVCouldNotCompute>(BECount)) return false;
+
+  // If this loop executes exactly one time, then it should be peeled, not
+  // optimized by this pass.
+  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+    if (BECst->getValue()->getValue() == 0)
+      return false;
+
+  // We require target data for now.
+  if (!getDataLayout())
+    return false;
+
+  // set DT 
+  (void)getDominatorTree();
+
+  LoopInfo &LI = getAnalysis<LoopInfo>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+
+  // set TLI 
+  (void)getTargetLibraryInfo();
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+  DEBUG(dbgs() << "loop-idiom Scanning: F["
+               << CurLoop->getHeader()->getParent()->getName()
+               << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
+
+  bool MadeChange = false;
+  // Scan all the blocks in the loop that are not in subloops.
+  for (Loop::block_iterator BI = CurLoop->block_begin(),
+         E = CurLoop->block_end(); BI != E; ++BI) {
+    // Ignore blocks in subloops.
+    if (LI.getLoopFor(*BI) != CurLoop)
+      continue;
+
+    MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks);
+  }
+  return MadeChange;
+}
+
+bool LoopIdiomRecognize::runOnNoncountableLoop() {
+  NclPopcountRecognize Popcount(*this);
+  if (Popcount.recognize())
+    return true;
+
+  return false;
+}
+
+bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
+  CurLoop = L;
+
+  // If the loop could not be converted to canonical form, it must have an
+  // indirectbr in it, just give up.
+  if (!L->getLoopPreheader())
+    return false;
+
+  // Disable loop idiom recognition if the function's name is a common idiom.
+  StringRef Name = L->getHeader()->getParent()->getName();
+  if (Name == "memset" || Name == "memcpy")
+    return false;
+
+  SE = &getAnalysis<ScalarEvolution>();
+  if (SE->hasLoopInvariantBackedgeTakenCount(L))
+    return runOnCountableLoop();
+  return runOnNoncountableLoop();
+}
+
+/// runOnLoopBlock - Process the specified block, which lives in a counted loop
+/// with the specified backedge count.  This block is known to be in the current
+/// loop and not in any subloops.
+bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+                                     SmallVectorImpl<BasicBlock*> &ExitBlocks) {
+  // We can only promote stores in this block if they are unconditionally
+  // executed in the loop.  For a block to be unconditionally executed, it has
+  // to dominate all the exit blocks of the loop.  Verify this now.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (!DT->dominates(BB, ExitBlocks[i]))
+      return false;
+
+  bool MadeChange = false;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+    Instruction *Inst = I++;
+    // Look for store instructions, which may be optimized to memset/memcpy.
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))  {
+      WeakVH InstPtr(I);
+      if (!processLoopStore(SI, BECount)) continue;
+      MadeChange = true;
+
+      // If processing the store invalidated our iterator, start over from the
+      // top of the block.
+      if (InstPtr == 0)
+        I = BB->begin();
+      continue;
+    }
+
+    // Look for memset instructions, which may be optimized to a larger memset.
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst))  {
+      WeakVH InstPtr(I);
+      if (!processLoopMemSet(MSI, BECount)) continue;
+      MadeChange = true;
+
+      // If processing the memset invalidated our iterator, start over from the
+      // top of the block.
+      if (InstPtr == 0)
+        I = BB->begin();
+      continue;
+    }
+  }
+
+  return MadeChange;
+}
+
+
+/// processLoopStore - See if this store can be promoted to a memset or memcpy.
+bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
+  if (!SI->isSimple()) return false;
+
+  Value *StoredVal = SI->getValueOperand();
+  Value *StorePtr = SI->getPointerOperand();
+
+  // Reject stores that are so large that they overflow an unsigned.
+  uint64_t SizeInBits = TD->getTypeSizeInBits(StoredVal->getType());
+  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
+    return false;
+
+  // See if the pointer expression is an AddRec like {base,+,1} on the current
+  // loop, which indicates a strided store.  If we have something else, it's a
+  // random store we can't handle.
+  const SCEVAddRecExpr *StoreEv =
+    dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+  if (StoreEv == 0 || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+    return false;
+
+  // Check to see if the stride matches the size of the store.  If so, then we
+  // know that every byte is touched in the loop.
+  unsigned StoreSize = (unsigned)SizeInBits >> 3;
+  const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
+
+  if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) {
+    // TODO: Could also handle negative stride here someday, that will require
+    // the validity check in mayLoopAccessLocation to be updated though.
+    // Enable this to print exact negative strides.
+    if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) {
+      dbgs() << "NEGATIVE STRIDE: " << *SI << "\n";
+      dbgs() << "BB: " << *SI->getParent();
+    }
+
+    return false;
+  }
+
+  // See if we can optimize just this store in isolation.
+  if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
+                              StoredVal, SI, StoreEv, BECount))
+    return true;
+
+  // If the stored value is a strided load in the same loop with the same stride
+  // this this may be transformable into a memcpy.  This kicks in for stuff like
+  //   for (i) A[i] = B[i];
+  if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+    const SCEVAddRecExpr *LoadEv =
+      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0)));
+    if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() &&
+        StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple())
+      if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount))
+        return true;
+  }
+  //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n";
+
+  return false;
+}
+
+/// processLoopMemSet - See if this memset can be promoted to a large memset.
+bool LoopIdiomRecognize::
+processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) {
+  // We can only handle non-volatile memsets with a constant size.
+  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false;
+
+  // If we're not allowed to hack on memset, we fail.
+  if (!TLI->has(LibFunc::memset))
+    return false;
+
+  Value *Pointer = MSI->getDest();
+
+  // See if the pointer expression is an AddRec like {base,+,1} on the current
+  // loop, which indicates a strided store.  If we have something else, it's a
+  // random store we can't handle.
+  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
+  if (Ev == 0 || Ev->getLoop() != CurLoop || !Ev->isAffine())
+    return false;
+
+  // Reject memsets that are so large that they overflow an unsigned.
+  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+  if ((SizeInBytes >> 32) != 0)
+    return false;
+
+  // Check to see if the stride matches the size of the memset.  If so, then we
+  // know that every byte is touched in the loop.
+  const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+
+  // TODO: Could also handle negative stride here someday, that will require the
+  // validity check in mayLoopAccessLocation to be updated though.
+  if (Stride == 0 || MSI->getLength() != Stride->getValue())
+    return false;
+
+  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
+                                 MSI->getAlignment(), MSI->getValue(),
+                                 MSI, Ev, BECount);
+}
+
+
+/// mayLoopAccessLocation - Return true if the specified loop might access the
+/// specified pointer location, which is a loop-strided access.  The 'Access'
+/// argument specifies what the verboten forms of access are (read or write).
+static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access,
+                                  Loop *L, const SCEV *BECount,
+                                  unsigned StoreSize, AliasAnalysis &AA,
+                                  Instruction *IgnoredStore) {
+  // Get the location that may be stored across the loop.  Since the access is
+  // strided positively through memory, we say that the modified location starts
+  // at the pointer and has infinite size.
+  uint64_t AccessSize = AliasAnalysis::UnknownSize;
+
+  // If the loop iterates a fixed number of times, we can refine the access size
+  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
+  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+    AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize;
+
+  // TODO: For this to be really effective, we have to dive into the pointer
+  // operand in the store.  Store to &A[i] of 100 will always return may alias
+  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
+  // which will then no-alias a store to &A[100].
+  AliasAnalysis::Location StoreLoc(Ptr, AccessSize);
+
+  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
+       ++BI)
+    for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I)
+      if (&*I != IgnoredStore &&
+          (AA.getModRefInfo(I, StoreLoc) & Access))
+        return true;
+
+  return false;
+}
+
+/// getMemSetPatternValue - If a strided store of the specified value is safe to
+/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
+/// be passed in.  Otherwise, return null.
+///
+/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
+/// just replicate their input array and then pass on to memset_pattern16.
+static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) {
+  // If the value isn't a constant, we can't promote it to being in a constant
+  // array.  We could theoretically do a store to an alloca or something, but
+  // that doesn't seem worthwhile.
+  Constant *C = dyn_cast<Constant>(V);
+  if (C == 0) return 0;
+
+  // Only handle simple values that are a power of two bytes in size.
+  uint64_t Size = TD.getTypeSizeInBits(V->getType());
+  if (Size == 0 || (Size & 7) || (Size & (Size-1)))
+    return 0;
+
+  // Don't care enough about darwin/ppc to implement this.
+  if (TD.isBigEndian())
+    return 0;
+
+  // Convert to size in bytes.
+  Size /= 8;
+
+  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
+  // if the top and bottom are the same (e.g. for vectors and large integers).
+  if (Size > 16) return 0;
+
+  // If the constant is exactly 16 bytes, just use it.
+  if (Size == 16) return C;
+
+  // Otherwise, we'll use an array of the constants.
+  unsigned ArraySize = 16/Size;
+  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+  return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C));
+}
+
+
+/// processLoopStridedStore - We see a strided store of some value.  If we can
+/// transform this into a memset or memset_pattern in the loop preheader, do so.
+bool LoopIdiomRecognize::
+processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
+                        unsigned StoreAlignment, Value *StoredVal,
+                        Instruction *TheStore, const SCEVAddRecExpr *Ev,
+                        const SCEV *BECount) {
+
+  // If the stored value is a byte-wise value (like i32 -1), then it may be
+  // turned into a memset of i8 -1, assuming that all the consecutive bytes
+  // are stored.  A store of i32 0x01020304 can never be turned into a memset,
+  // but it can be turned into memset_pattern if the target supports it.
+  Value *SplatValue = isBytewiseValue(StoredVal);
+  Constant *PatternValue = 0;
+
+  // If we're allowed to form a memset, and the stored value would be acceptable
+  // for memset, use it.
+  if (SplatValue && TLI->has(LibFunc::memset) &&
+      // Verify that the stored value is loop invariant.  If not, we can't
+      // promote the memset.
+      CurLoop->isLoopInvariant(SplatValue)) {
+    // Keep and use SplatValue.
+    PatternValue = 0;
+  } else if (TLI->has(LibFunc::memset_pattern16) &&
+             (PatternValue = getMemSetPatternValue(StoredVal, *TD))) {
+    // It looks like we can use PatternValue!
+    SplatValue = 0;
+  } else {
+    // Otherwise, this isn't an idiom we can transform.  For example, we can't
+    // do anything with a 3-byte store.
+    return false;
+  }
+
+  // The trip count of the loop and the base pointer of the addrec SCEV is
+  // guaranteed to be loop invariant, which means that it should dominate the
+  // header.  This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE, "loop-idiom");
+
+  // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
+  // this into a memset in the loop preheader now if we want.  However, this
+  // would be unsafe to do if there is anything else in the loop that may read
+  // or write to the aliased location.  Check for any overlap by generating the
+  // base pointer and checking the region.
+  unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace();
+  Value *BasePtr =
+    Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace),
+                           Preheader->getTerminator());
+
+
+  if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
+                            CurLoop, BECount,
+                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(BasePtr, *SE, TLI);
+    return false;
+  }
+
+  // Okay, everything looks good, insert the memset.
+
+  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
+  // pointer size if it isn't already.
+  Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+
+  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
+                                         SCEV::FlagNUW);
+  if (StoreSize != 1)
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+                               SCEV::FlagNUW);
+
+  Value *NumBytes =
+    Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+
+  CallInst *NewCall;
+  if (SplatValue)
+    NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment);
+  else {
+    Module *M = TheStore->getParent()->getParent()->getParent();
+    Value *MSP = M->getOrInsertFunction("memset_pattern16",
+                                        Builder.getVoidTy(),
+                                        Builder.getInt8PtrTy(),
+                                        Builder.getInt8PtrTy(), IntPtr,
+                                        (void*)0);
+
+    // Otherwise we should form a memset_pattern16.  PatternValue is known to be
+    // an constant array of 16-bytes.  Plop the value into a mergable global.
+    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+                                            GlobalValue::InternalLinkage,
+                                            PatternValue, ".memset_pattern");
+    GV->setUnnamedAddr(true); // Ok to merge these.
+    GV->setAlignment(16);
+    Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy());
+    NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes);
+  }
+
+  DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
+               << "    from store to: " << *Ev << " at: " << *TheStore << "\n");
+  NewCall->setDebugLoc(TheStore->getDebugLoc());
+
+  // Okay, the memset has been formed.  Zap the original store and anything that
+  // feeds into it.
+  deleteDeadInstruction(TheStore, *SE, TLI);
+  ++NumMemSet;
+  return true;
+}
+
+/// processLoopStoreOfLoopLoad - We see a strided store whose value is a
+/// same-strided load.
+bool LoopIdiomRecognize::
+processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
+                           const SCEVAddRecExpr *StoreEv,
+                           const SCEVAddRecExpr *LoadEv,
+                           const SCEV *BECount) {
+  // If we're not allowed to form memcpy, we fail.
+  if (!TLI->has(LibFunc::memcpy))
+    return false;
+
+  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+
+  // The trip count of the loop and the base pointer of the addrec SCEV is
+  // guaranteed to be loop invariant, which means that it should dominate the
+  // header.  This allows us to insert code for it in the preheader.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE, "loop-idiom");
+
+  // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
+  // this into a memcpy in the loop preheader now if we want.  However, this
+  // would be unsafe to do if there is anything else in the loop that may read
+  // or write the memory region we're storing to.  This includes the load that
+  // feeds the stores.  Check for an alias by generating the base address and
+  // checking everything.
+  Value *StoreBasePtr =
+    Expander.expandCodeFor(StoreEv->getStart(),
+                           Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
+                           Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef,
+                            CurLoop, BECount, StoreSize,
+                            getAnalysis<AliasAnalysis>(), SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
+    return false;
+  }
+
+  // For a memcpy, we have to make sure that the input array is not being
+  // mutated by the loop.
+  Value *LoadBasePtr =
+    Expander.expandCodeFor(LoadEv->getStart(),
+                           Builder.getInt8PtrTy(LI->getPointerAddressSpace()),
+                           Preheader->getTerminator());
+
+  if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount,
+                            StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
+    Expander.clear();
+    // If we generated new code for the base pointer, clean up.
+    deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
+    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
+    return false;
+  }
+
+  // Okay, everything is safe, we can transform this!
+
+
+  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
+  // pointer size if it isn't already.
+  Type *IntPtr = TD->getIntPtrType(SI->getContext());
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+
+  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
+                                         SCEV::FlagNUW);
+  if (StoreSize != 1)
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+                               SCEV::FlagNUW);
+
+  Value *NumBytes =
+    Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+
+  CallInst *NewCall =
+    Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
+                         std::min(SI->getAlignment(), LI->getAlignment()));
+  NewCall->setDebugLoc(SI->getDebugLoc());
+
+  DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
+               << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+               << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+
+
+  // Okay, the memset has been formed.  Zap the original store and anything that
+  // feeds into it.
+  deleteDeadInstruction(SI, *SE, TLI);
+  ++NumMemCpy;
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
new file mode 100644
index 000000000000..a23860aad80e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -0,0 +1,175 @@
+//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs lightweight instruction simplification on loop bodies.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-instsimplify"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumSimplified, "Number of redundant instructions simplified");
+
+namespace {
+  class LoopInstSimplify : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopInstSimplify() : LoopPass(ID) {
+      initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnLoop(Loop*, LPPassManager&);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<LoopInfo>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreservedID(LCSSAID);
+      AU.addPreserved("scalar-evolution");
+      AU.addRequired<TargetLibraryInfo>();
+    }
+  };
+}
+
+char LoopInstSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify",
+                "Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify",
+                "Simplify instructions in loops", false, false)
+
+Pass *llvm::createLoopInstSimplifyPass() {
+  return new LoopInstSimplify();
+}
+
+bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
+  LoopInfo *LI = &getAnalysis<LoopInfo>();
+  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+  array_pod_sort(ExitBlocks.begin(), ExitBlocks.end());
+
+  SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+
+  // The bit we are stealing from the pointer represents whether this basic
+  // block is the header of a subloop, in which case we only process its phis.
+  typedef PointerIntPair<BasicBlock*, 1> WorklistItem;
+  SmallVector<WorklistItem, 16> VisitStack;
+  SmallPtrSet<BasicBlock*, 32> Visited;
+
+  bool Changed = false;
+  bool LocalChanged;
+  do {
+    LocalChanged = false;
+
+    VisitStack.clear();
+    Visited.clear();
+
+    VisitStack.push_back(WorklistItem(L->getHeader(), false));
+
+    while (!VisitStack.empty()) {
+      WorklistItem Item = VisitStack.pop_back_val();
+      BasicBlock *BB = Item.getPointer();
+      bool IsSubloopHeader = Item.getInt();
+
+      // Simplify instructions in the current basic block.
+      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+        Instruction *I = BI++;
+
+        // The first time through the loop ToSimplify is empty and we try to
+        // simplify all instructions. On later iterations ToSimplify is not
+        // empty and we only bother simplifying instructions that are in it.
+        if (!ToSimplify->empty() && !ToSimplify->count(I))
+          continue;
+
+        // Don't bother simplifying unused instructions.
+        if (!I->use_empty()) {
+          Value *V = SimplifyInstruction(I, TD, TLI, DT);
+          if (V && LI->replacementPreservesLCSSAForm(I, V)) {
+            // Mark all uses for resimplification next time round the loop.
+            for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+                 UI != UE; ++UI)
+              Next->insert(cast<Instruction>(*UI));
+
+            I->replaceAllUsesWith(V);
+            LocalChanged = true;
+            ++NumSimplified;
+          }
+        }
+        LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+
+        if (IsSubloopHeader && !isa<PHINode>(I))
+          break;
+      }
+
+      // Add all successors to the worklist, except for loop exit blocks and the
+      // bodies of subloops. We visit the headers of loops so that we can process
+      // their phis, but we contract the rest of the subloop body and only follow
+      // edges leading back to the original loop.
+      for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
+           ++SI) {
+        BasicBlock *SuccBB = *SI;
+        if (!Visited.insert(SuccBB))
+          continue;
+
+        const Loop *SuccLoop = LI->getLoopFor(SuccBB);
+        if (SuccLoop && SuccLoop->getHeader() == SuccBB
+                     && L->contains(SuccLoop)) {
+          VisitStack.push_back(WorklistItem(SuccBB, true));
+
+          SmallVector<BasicBlock*, 8> SubLoopExitBlocks;
+          SuccLoop->getExitBlocks(SubLoopExitBlocks);
+
+          for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) {
+            BasicBlock *ExitBB = SubLoopExitBlocks[i];
+            if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB))
+              VisitStack.push_back(WorklistItem(ExitBB, false));
+          }
+
+          continue;
+        }
+
+        bool IsExitBlock = std::binary_search(ExitBlocks.begin(),
+                                              ExitBlocks.end(), SuccBB);
+        if (IsExitBlock)
+          continue;
+
+        VisitStack.push_back(WorklistItem(SuccBB, false));
+      }
+    }
+
+    // Place the list of instructions to simplify on the next loop iteration
+    // into ToSimplify.
+    std::swap(ToSimplify, Next);
+    Next->clear();
+
+    Changed |= LocalChanged;
+  } while (LocalChanged);
+
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
new file mode 100644
index 000000000000..e98ae953e532
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -0,0 +1,508 @@
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Rotation Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-rotate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define MAX_HEADER_SIZE 16
+
+STATISTIC(NumRotated, "Number of loops rotated");
+namespace {
+
+  class LoopRotate : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopRotate() : LoopPass(ID) {
+      initializeLoopRotatePass(*PassRegistry::getPassRegistry());
+    }
+
+    // LCSSA form makes instruction renaming easier.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addPreserved<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
+      AU.addPreserved<ScalarEvolution>();
+      AU.addRequired<TargetTransformInfo>();
+    }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    void simplifyLoopLatch(Loop *L);
+    bool rotateLoop(Loop *L);
+
+  private:
+    LoopInfo *LI;
+    const TargetTransformInfo *TTI;
+  };
+}
+
+char LoopRotate::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false)
+
+Pass *llvm::createLoopRotatePass() { return new LoopRotate(); }
+
+/// Rotate Loop L as many times as possible. Return true if
+/// the loop is rotated at least once.
+bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) {
+  LI = &getAnalysis<LoopInfo>();
+  TTI = &getAnalysis<TargetTransformInfo>();
+
+  // Simplify the loop latch before attempting to rotate the header
+  // upward. Rotation may not be needed if the loop tail can be folded into the
+  // loop exit.
+  simplifyLoopLatch(L);
+
+  // One loop can be rotated multiple times.
+  bool MadeChange = false;
+  while (rotateLoop(L))
+    MadeChange = true;
+
+  return MadeChange;
+}
+
+/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
+/// old header into the preheader.  If there were uses of the values produced by
+/// these instruction that were outside of the loop, we have to insert PHI nodes
+/// to merge the two values.  Do this now.
+static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
+                                            BasicBlock *OrigPreheader,
+                                            ValueToValueMapTy &ValueMap) {
+  // Remove PHI node entries that are no longer live.
+  BasicBlock::iterator I, E = OrigHeader->end();
+  for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
+
+  // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
+  // as necessary.
+  SSAUpdater SSA;
+  for (I = OrigHeader->begin(); I != E; ++I) {
+    Value *OrigHeaderVal = I;
+
+    // If there are no uses of the value (e.g. because it returns void), there
+    // is nothing to rewrite.
+    if (OrigHeaderVal->use_empty())
+      continue;
+
+    Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal];
+
+    // The value now exits in two versions: the initial value in the preheader
+    // and the loop "next" value in the original header.
+    SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+    SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+    SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+    // Visit each use of the OrigHeader instruction.
+    for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+         UE = OrigHeaderVal->use_end(); UI != UE; ) {
+      // Grab the use before incrementing the iterator.
+      Use &U = UI.getUse();
+
+      // Increment the iterator before removing the use from the list.
+      ++UI;
+
+      // SSAUpdater can't handle a non-PHI use in the same block as an
+      // earlier def. We can easily handle those cases manually.
+      Instruction *UserInst = cast<Instruction>(U.getUser());
+      if (!isa<PHINode>(UserInst)) {
+        BasicBlock *UserBB = UserInst->getParent();
+
+        // The original users in the OrigHeader are already using the
+        // original definitions.
+        if (UserBB == OrigHeader)
+          continue;
+
+        // Users in the OrigPreHeader need to use the value to which the
+        // original definitions are mapped.
+        if (UserBB == OrigPreheader) {
+          U = OrigPreHeaderVal;
+          continue;
+        }
+      }
+
+      // Anything else can be handled by SSAUpdater.
+      SSA.RewriteUse(U);
+    }
+  }
+}
+
+/// Determine whether the instructions in this range my be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+                                  BasicBlock::iterator End) {
+  bool seenIncrement = false;
+  for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+    if (!isSafeToSpeculativelyExecute(I))
+      return false;
+
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    switch (I->getOpcode()) {
+    default:
+      return false;
+    case Instruction::GetElementPtr:
+      // GEPs are cheap if all indices are constant.
+      if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+        return false;
+      // fall-thru to increment case
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+      if (seenIncrement)
+        return false;
+      seenIncrement = true;
+      break;
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      // ignore type conversions
+      break;
+    }
+  }
+  return true;
+}
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the cast of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
+void LoopRotate::simplifyLoopLatch(Loop *L) {
+  BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch || Latch->hasAddressTaken())
+    return;
+
+  BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+  if (!Jmp || !Jmp->isUnconditional())
+    return;
+
+  BasicBlock *LastExit = Latch->getSinglePredecessor();
+  if (!LastExit || !L->isLoopExiting(LastExit))
+    return;
+
+  BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+  if (!BI)
+    return;
+
+  if (!shouldSpeculateInstrs(Latch->begin(), Jmp))
+    return;
+
+  DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+        << LastExit->getName() << "\n");
+
+  // Hoist the instructions from Latch into LastExit.
+  LastExit->getInstList().splice(BI, Latch->getInstList(), Latch->begin(), Jmp);
+
+  unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
+  BasicBlock *Header = Jmp->getSuccessor(0);
+  assert(Header == L->getHeader() && "expected a backward branch");
+
+  // Remove Latch from the CFG so that LastExit becomes the new Latch.
+  BI->setSuccessor(FallThruPath, Header);
+  Latch->replaceSuccessorsPhiUsesWith(LastExit);
+  Jmp->eraseFromParent();
+
+  // Nuke the Latch block.
+  assert(Latch->empty() && "unable to evacuate Latch");
+  LI->removeBlock(Latch);
+  if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>())
+    DT->eraseNode(Latch);
+  Latch->eraseFromParent();
+}
+
+/// Rotate loop LP. Return true if the loop is rotated.
+bool LoopRotate::rotateLoop(Loop *L) {
+  // If the loop has only one block then there is not much to rotate.
+  if (L->getBlocks().size() == 1)
+    return false;
+
+  BasicBlock *OrigHeader = L->getHeader();
+  BasicBlock *OrigLatch = L->getLoopLatch();
+
+  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+  if (BI == 0 || BI->isUnconditional())
+    return false;
+
+  // If the loop header is not one of the loop exiting blocks then
+  // either this loop is already rotated or it is not
+  // suitable for loop rotation transformations.
+  if (!L->isLoopExiting(OrigHeader))
+    return false;
+
+  // If the loop latch already contains a branch that leaves the loop then the
+  // loop is already rotated.
+  if (OrigLatch == 0 || L->isLoopExiting(OrigLatch))
+    return false;
+
+  // Check size of original header and reject loop if it is very big or we can't
+  // duplicate blocks inside it.
+  {
+    CodeMetrics Metrics;
+    Metrics.analyzeBasicBlock(OrigHeader, *TTI);
+    if (Metrics.notDuplicatable) {
+      DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non duplicatable"
+            << " instructions: "; L->dump());
+      return false;
+    }
+    if (Metrics.NumInsts > MAX_HEADER_SIZE)
+      return false;
+  }
+
+  // Now, this loop is suitable for rotation.
+  BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+  // If the loop could not be converted to canonical form, it must have an
+  // indirectbr in it, just give up.
+  if (OrigPreheader == 0)
+    return false;
+
+  // Anything ScalarEvolution may know about this loop or the PHI nodes
+  // in its header will soon be invalidated.
+  if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>())
+    SE->forgetLoop(L);
+
+  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
+  // Find new Loop header. NewHeader is a Header's one and only successor
+  // that is inside loop.  Header's other successor is outside the
+  // loop.  Otherwise loop is not suitable for rotation.
+  BasicBlock *Exit = BI->getSuccessor(0);
+  BasicBlock *NewHeader = BI->getSuccessor(1);
+  if (L->contains(Exit))
+    std::swap(Exit, NewHeader);
+  assert(NewHeader && "Unable to determine new loop header");
+  assert(L->contains(NewHeader) && !L->contains(Exit) &&
+         "Unable to determine loop header and exit blocks");
+
+  // This code assumes that the new header has exactly one predecessor.
+  // Remove any single-entry PHI nodes in it.
+  assert(NewHeader->getSinglePredecessor() &&
+         "New header doesn't have one pred!");
+  FoldSingleEntryPHINodes(NewHeader);
+
+  // Begin by walking OrigHeader and populating ValueMap with an entry for
+  // each Instruction.
+  BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+  ValueToValueMapTy ValueMap;
+
+  // For PHI nodes, the value available in OldPreHeader is just the
+  // incoming value from OldPreHeader.
+  for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
+
+  // For the rest of the instructions, either hoist to the OrigPreheader if
+  // possible or create a clone in the OldPreHeader if not.
+  TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+  while (I != E) {
+    Instruction *Inst = I++;
+
+    // If the instruction's operands are invariant and it doesn't read or write
+    // memory, then it is safe to hoist.  Doing this doesn't change the order of
+    // execution in the preheader, but does prevent the instruction from
+    // executing in each iteration of the loop.  This means it is safe to hoist
+    // something that might trap, but isn't safe to hoist something that reads
+    // memory (without proving that the loop doesn't write).
+    if (L->hasLoopInvariantOperands(Inst) &&
+        !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() &&
+        !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst) &&
+        !isa<AllocaInst>(Inst)) {
+      Inst->moveBefore(LoopEntryBranch);
+      continue;
+    }
+
+    // Otherwise, create a duplicate of the instruction.
+    Instruction *C = Inst->clone();
+
+    // Eagerly remap the operands of the instruction.
+    RemapInstruction(C, ValueMap,
+                     RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
+
+    // With the operands remapped, see if the instruction constant folds or is
+    // otherwise simplifyable.  This commonly occurs because the entry from PHI
+    // nodes allows icmps and other instructions to fold.
+    Value *V = SimplifyInstruction(C);
+    if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+      // If so, then delete the temporary instruction and stick the folded value
+      // in the map.
+      delete C;
+      ValueMap[Inst] = V;
+    } else {
+      // Otherwise, stick the new instruction into the new block!
+      C->setName(Inst->getName());
+      C->insertBefore(LoopEntryBranch);
+      ValueMap[Inst] = C;
+    }
+  }
+
+  // Along with all the other instructions, we just cloned OrigHeader's
+  // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+  // successors by duplicating their incoming values for OrigHeader.
+  TerminatorInst *TI = OrigHeader->getTerminator();
+  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+    for (BasicBlock::iterator BI = TI->getSuccessor(i)->begin();
+         PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+      PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+  // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+  // OrigPreHeader's old terminator (the original branch into the loop), and
+  // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+  LoopEntryBranch->eraseFromParent();
+
+  // If there were any uses of instructions in the duplicated block outside the
+  // loop, update them, inserting PHI nodes as required
+  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
+
+  // NewHeader is now the header of the loop.
+  L->moveToHeader(NewHeader);
+  assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+
+  // At this point, we've finished our major CFG changes.  As part of cloning
+  // the loop into the preheader we've simplified instructions and the
+  // duplicated conditional branch may now be branching on a constant.  If it is
+  // branching on a constant and if that constant means that we enter the loop,
+  // then we fold away the cond branch to an uncond branch.  This simplifies the
+  // loop in cases important for nested loops, and it also means we don't have
+  // to split as many edges.
+  BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+  assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+  if (!isa<ConstantInt>(PHBI->getCondition()) ||
+      PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero())
+          != NewHeader) {
+    // The conditional branch can't be folded, handle the general case.
+    // Update DominatorTree to reflect the CFG change we just made.  Then split
+    // edges as necessary to preserve LoopSimplify form.
+    if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
+      // Everything that was dominated by the old loop header is now dominated
+      // by the original loop preheader. Conceptually the header was merged
+      // into the preheader, even though we reuse the actual block as a new
+      // loop latch.
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+                                                   OrigHeaderNode->end());
+      DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
+      for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
+        DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+
+      assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+      assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
+
+      // Update OrigHeader to be dominated by the new header block.
+      DT->changeImmediateDominator(OrigHeader, OrigLatch);
+    }
+
+    // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+    // thus is not a preheader anymore.
+    // Split the edge to form a real preheader.
+    BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this);
+    NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+    // Preserve canonical loop form, which means that 'Exit' should have only
+    // one predecessor.
+    BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this);
+    ExitSplit->moveBefore(Exit);
+  } else {
+    // We can fold the conditional branch in the preheader, this makes things
+    // simpler. The first step is to remove the extra edge to the Exit block.
+    Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+    BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+    NewBI->setDebugLoc(PHBI->getDebugLoc());
+    PHBI->eraseFromParent();
+
+    // With our CFG finalized, update DomTree if it is available.
+    if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
+      // Update OrigHeader to be dominated by the new header block.
+      DT->changeImmediateDominator(NewHeader, OrigPreheader);
+      DT->changeImmediateDominator(OrigHeader, OrigLatch);
+
+      // Brute force incremental dominator tree update. Call
+      // findNearestCommonDominator on all CFG predecessors of each child of the
+      // original header.
+      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+                                                   OrigHeaderNode->end());
+      bool Changed;
+      do {
+        Changed = false;
+        for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
+          DomTreeNode *Node = HeaderChildren[I];
+          BasicBlock *BB = Node->getBlock();
+
+          pred_iterator PI = pred_begin(BB);
+          BasicBlock *NearestDom = *PI;
+          for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
+            NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
+
+          // Remember if this changes the DomTree.
+          if (Node->getIDom()->getBlock() != NearestDom) {
+            DT->changeImmediateDominator(BB, NearestDom);
+            Changed = true;
+          }
+        }
+
+      // If the dominator changed, this may have an effect on other
+      // predecessors, continue until we reach a fixpoint.
+      } while (Changed);
+    }
+  }
+
+  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+  assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+  // Now that the CFG and DomTree are in a consistent state again, try to merge
+  // the OrigHeader block into OrigLatch.  This will succeed if they are
+  // connected by an unconditional branch.  This is just a cleanup so the
+  // emitted code isn't too gross in this common case.
+  MergeBlockIntoPredecessor(OrigHeader, this);
+
+  DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+  ++NumRotated;
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
new file mode 100644
index 000000000000..73e44d7edf5e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -0,0 +1,4835 @@
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into forms suitable for efficient execution
+// on the target.
+//
+// This pass performs a strength reduction on array references inside loops that
+// have as one or more of their components the loop induction variable, it
+// rewrites expressions to take advantage of scaled-index addressing modes
+// available on the target, and it performs a variety of other optimizations
+// related to loop induction variables.
+//
+// Terminology note: this code has a lot of handling for "post-increment" or
+// "post-inc" users. This is not talking about post-increment addressing modes;
+// it is instead talking about code like this:
+//
+//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
+//   ...
+//   %i.next = add %i, 1
+//   %c = icmp eq %i.next, %n
+//
+// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
+// it's useful to think about these as the same register, with some uses using
+// the value of the register before the add and some using // it after. In this
+// example, the icmp is a post-increment user, since it uses %i.next, which is
+// the value of the induction variable after the increment. The other common
+// case of post-increment users is users outside the loop.
+//
+// TODO: More sophistication in the way Formulae are generated and filtered.
+//
+// TODO: Handle multiple loops at a time.
+//
+// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
+//       of a GlobalValue?
+//
+// TODO: When truncation is free, truncate ICmp users' operands to make it a
+//       smaller encoding (on x86 at least).
+//
+// TODO: When a negated register is used by an add (such as in a list of
+//       multiple base registers, or as the increment expression in an addrec),
+//       we may not actually need both reg and (-1 * reg) in registers; the
+//       negation can be implemented by using a sub instead of an add. The
+//       lack of support for taking this into consideration when making
+//       register pressure decisions is partly worked around by the "Special"
+//       use kind.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-reduce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for
+/// bail out. This threshold is far beyond the number of users that LSR can
+/// conceivably solve, so it should not affect generated code, but catches the
+/// worst cases before LSR burns too much compile time and stack space.
+static const unsigned MaxIVUsers = 200;
+
+// Temporary flag to cleanup congruent phis after LSR phi expansion.
+// It's currently disabled until we can determine whether it's truly useful or
+// not. The flag should be removed after the v3.0 release.
+// This is now needed for ivchains.
+static cl::opt<bool> EnablePhiElim(
+  "enable-lsr-phielim", cl::Hidden, cl::init(true),
+  cl::desc("Enable LSR phi elimination"));
+
+#ifndef NDEBUG
+// Stress test IV chain generation.
+static cl::opt<bool> StressIVChain(
+  "stress-ivchain", cl::Hidden, cl::init(false),
+  cl::desc("Stress test LSR IV chains"));
+#else
+static bool StressIVChain = false;
+#endif
+
+namespace {
+
+/// RegSortData - This class holds data which is used to order reuse candidates.
+class RegSortData {
+public:
+  /// UsedByIndices - This represents the set of LSRUse indices which reference
+  /// a particular register.
+  SmallBitVector UsedByIndices;
+
+  RegSortData() {}
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+void RegSortData::print(raw_ostream &OS) const {
+  OS << "[NumUses=" << UsedByIndices.count() << ']';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void RegSortData::dump() const {
+  print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+/// RegUseTracker - Map register candidates to information about how they are
+/// used.
+class RegUseTracker {
+  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
+
+  RegUsesTy RegUsesMap;
+  SmallVector<const SCEV *, 16> RegSequence;
+
+public:
+  void CountRegister(const SCEV *Reg, size_t LUIdx);
+  void DropRegister(const SCEV *Reg, size_t LUIdx);
+  void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);
+
+  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
+
+  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
+
+  void clear();
+
+  typedef SmallVectorImpl<const SCEV *>::iterator iterator;
+  typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
+  iterator begin() { return RegSequence.begin(); }
+  iterator end()   { return RegSequence.end(); }
+  const_iterator begin() const { return RegSequence.begin(); }
+  const_iterator end() const   { return RegSequence.end(); }
+};
+
+}
+
+void
+RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
+  std::pair<RegUsesTy::iterator, bool> Pair =
+    RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
+  RegSortData &RSD = Pair.first->second;
+  if (Pair.second)
+    RegSequence.push_back(Reg);
+  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
+  RSD.UsedByIndices.set(LUIdx);
+}
+
+void
+RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
+  RegUsesTy::iterator It = RegUsesMap.find(Reg);
+  assert(It != RegUsesMap.end());
+  RegSortData &RSD = It->second;
+  assert(RSD.UsedByIndices.size() > LUIdx);
+  RSD.UsedByIndices.reset(LUIdx);
+}
+
+void
+RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
+  assert(LUIdx <= LastLUIdx);
+
+  // Update RegUses. The data structure is not optimized for this purpose;
+  // we must iterate through it and update each of the bit vectors.
+  for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
+       I != E; ++I) {
+    SmallBitVector &UsedByIndices = I->second.UsedByIndices;
+    if (LUIdx < UsedByIndices.size())
+      UsedByIndices[LUIdx] =
+        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
+    UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
+  }
+}
+
+bool
+RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
+  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+  if (I == RegUsesMap.end())
+    return false;
+  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
+  int i = UsedByIndices.find_first();
+  if (i == -1) return false;
+  if ((size_t)i != LUIdx) return true;
+  return UsedByIndices.find_next(i) != -1;
+}
+
+const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
+  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+  assert(I != RegUsesMap.end() && "Unknown register!");
+  return I->second.UsedByIndices;
+}
+
+void RegUseTracker::clear() {
+  RegUsesMap.clear();
+  RegSequence.clear();
+}
+
+namespace {
+
+/// Formula - This class holds information that describes a formula for
+/// computing satisfying a use. It may include broken-out immediates and scaled
+/// registers.
+struct Formula {
+  /// Global base address used for complex addressing.
+  GlobalValue *BaseGV;
+
+  /// Base offset for complex addressing.
+  int64_t BaseOffset;
+
+  /// Whether any complex addressing has a base register.
+  bool HasBaseReg;
+
+  /// The scale of any complex addressing.
+  int64_t Scale;
+
+  /// BaseRegs - The list of "base" registers for this use. When this is
+  /// non-empty,
+  SmallVector<const SCEV *, 4> BaseRegs;
+
+  /// ScaledReg - The 'scaled' register for this use. This should be non-null
+  /// when Scale is not zero.
+  const SCEV *ScaledReg;
+
+  /// UnfoldedOffset - An additional constant offset which added near the
+  /// use. This requires a temporary register, but the offset itself can
+  /// live in an add immediate field rather than a register.
+  int64_t UnfoldedOffset;
+
+  Formula()
+      : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0),
+        UnfoldedOffset(0) {}
+
+  void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
+
+  unsigned getNumRegs() const;
+  Type *getType() const;
+
+  void DeleteBaseReg(const SCEV *&S);
+
+  bool referencesReg(const SCEV *S) const;
+  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
+                                  const RegUseTracker &RegUses) const;
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+/// DoInitialMatch - Recursion helper for InitialMatch.
+static void DoInitialMatch(const SCEV *S, Loop *L,
+                           SmallVectorImpl<const SCEV *> &Good,
+                           SmallVectorImpl<const SCEV *> &Bad,
+                           ScalarEvolution &SE) {
+  // Collect expressions which properly dominate the loop header.
+  if (SE.properlyDominates(S, L->getHeader())) {
+    Good.push_back(S);
+    return;
+  }
+
+  // Look at add operands.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+         I != E; ++I)
+      DoInitialMatch(*I, L, Good, Bad, SE);
+    return;
+  }
+
+  // Look at addrec operands.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+    if (!AR->getStart()->isZero()) {
+      DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
+      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
+                                      AR->getStepRecurrence(SE),
+                                      // FIXME: AR->getNoWrapFlags()
+                                      AR->getLoop(), SCEV::FlagAnyWrap),
+                     L, Good, Bad, SE);
+      return;
+    }
+
+  // Handle a multiplication by -1 (negation) if it didn't fold.
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
+    if (Mul->getOperand(0)->isAllOnesValue()) {
+      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
+      const SCEV *NewMul = SE.getMulExpr(Ops);
+
+      SmallVector<const SCEV *, 4> MyGood;
+      SmallVector<const SCEV *, 4> MyBad;
+      DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
+      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
+        SE.getEffectiveSCEVType(NewMul->getType())));
+      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
+           E = MyGood.end(); I != E; ++I)
+        Good.push_back(SE.getMulExpr(NegOne, *I));
+      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
+           E = MyBad.end(); I != E; ++I)
+        Bad.push_back(SE.getMulExpr(NegOne, *I));
+      return;
+    }
+
+  // Ok, we can't do anything interesting. Just stuff the whole thing into a
+  // register and hope for the best.
+  Bad.push_back(S);
+}
+
+/// InitialMatch - Incorporate loop-variant parts of S into this Formula,
+/// attempting to keep all loop-invariant and loop-computable values in a
+/// single base register.
+void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
+  SmallVector<const SCEV *, 4> Good;
+  SmallVector<const SCEV *, 4> Bad;
+  DoInitialMatch(S, L, Good, Bad, SE);
+  if (!Good.empty()) {
+    const SCEV *Sum = SE.getAddExpr(Good);
+    if (!Sum->isZero())
+      BaseRegs.push_back(Sum);
+    HasBaseReg = true;
+  }
+  if (!Bad.empty()) {
+    const SCEV *Sum = SE.getAddExpr(Bad);
+    if (!Sum->isZero())
+      BaseRegs.push_back(Sum);
+    HasBaseReg = true;
+  }
+}
+
+/// getNumRegs - Return the total number of register operands used by this
+/// formula. This does not include register uses implied by non-constant
+/// addrec strides.
+unsigned Formula::getNumRegs() const {
+  return !!ScaledReg + BaseRegs.size();
+}
+
+/// getType - Return the type of this formula, if it has one, or null
+/// otherwise. This type is meaningless except for the bit size.
+Type *Formula::getType() const {
+  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
+         ScaledReg ? ScaledReg->getType() :
+         BaseGV ? BaseGV->getType() :
+         0;
+}
+
+/// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
+void Formula::DeleteBaseReg(const SCEV *&S) {
+  if (&S != &BaseRegs.back())
+    std::swap(S, BaseRegs.back());
+  BaseRegs.pop_back();
+}
+
+/// referencesReg - Test if this formula references the given register.
+bool Formula::referencesReg(const SCEV *S) const {
+  return S == ScaledReg ||
+         std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
+}
+
+/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
+/// which are used by uses other than the use with the given index.
+bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
+                                         const RegUseTracker &RegUses) const {
+  if (ScaledReg)
+    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
+      return true;
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
+       E = BaseRegs.end(); I != E; ++I)
+    if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
+      return true;
+  return false;
+}
+
+void Formula::print(raw_ostream &OS) const {
+  bool First = true;
+  if (BaseGV) {
+    if (!First) OS << " + "; else First = false;
+    WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
+  }
+  if (BaseOffset != 0) {
+    if (!First) OS << " + "; else First = false;
+    OS << BaseOffset;
+  }
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
+       E = BaseRegs.end(); I != E; ++I) {
+    if (!First) OS << " + "; else First = false;
+    OS << "reg(" << **I << ')';
+  }
+  if (HasBaseReg && BaseRegs.empty()) {
+    if (!First) OS << " + "; else First = false;
+    OS << "**error: HasBaseReg**";
+  } else if (!HasBaseReg && !BaseRegs.empty()) {
+    if (!First) OS << " + "; else First = false;
+    OS << "**error: !HasBaseReg**";
+  }
+  if (Scale != 0) {
+    if (!First) OS << " + "; else First = false;
+    OS << Scale << "*reg(";
+    if (ScaledReg)
+      OS << *ScaledReg;
+    else
+      OS << "<unknown>";
+    OS << ')';
+  }
+  if (UnfoldedOffset != 0) {
+    if (!First) OS << " + "; else First = false;
+    OS << "imm(" << UnfoldedOffset << ')';
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Formula::dump() const {
+  print(errs()); errs() << '\n';
+}
+#endif
+
+/// isAddRecSExtable - Return true if the given addrec can be sign-extended
+/// without changing its value.
+static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  Type *WideTy =
+    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
+  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+}
+
+/// isAddSExtable - Return true if the given add can be sign-extended
+/// without changing its value.
+static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
+  Type *WideTy =
+    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
+  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// isMulSExtable - Return true if the given mul can be sign-extended
+/// without changing its value.
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
+  Type *WideTy =
+    IntegerType::get(SE.getContext(),
+                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
+  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
+}
+
+/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
+/// and if the remainder is known to be zero,  or null otherwise. If
+/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
+/// to Y, ignoring that the multiplication may overflow, which is useful when
+/// the result will be used in a context where the most significant bits are
+/// ignored.
+static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
+                                ScalarEvolution &SE,
+                                bool IgnoreSignificantBits = false) {
+  // Handle the trivial case, which works for any SCEV type.
+  if (LHS == RHS)
+    return SE.getConstant(LHS->getType(), 1);
+
+  // Handle a few RHS special cases.
+  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+  if (RC) {
+    const APInt &RA = RC->getValue()->getValue();
+    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+    // some folding.
+    if (RA.isAllOnesValue())
+      return SE.getMulExpr(LHS, RC);
+    // Handle x /s 1 as x.
+    if (RA == 1)
+      return LHS;
+  }
+
+  // Check for a division of a constant by a constant.
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
+    if (!RC)
+      return 0;
+    const APInt &LA = C->getValue()->getValue();
+    const APInt &RA = RC->getValue()->getValue();
+    if (LA.srem(RA) != 0)
+      return 0;
+    return SE.getConstant(LA.sdiv(RA));
+  }
+
+  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
+    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
+      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
+                                      IgnoreSignificantBits);
+      if (!Step) return 0;
+      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+                                       IgnoreSignificantBits);
+      if (!Start) return 0;
+      // FlagNW is independent of the start value, step direction, and is
+      // preserved with smaller magnitude steps.
+      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+      return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
+    }
+    return 0;
+  }
+
+  // Distribute the sdiv over add operands, if the add doesn't overflow.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
+    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
+      SmallVector<const SCEV *, 8> Ops;
+      for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+           I != E; ++I) {
+        const SCEV *Op = getExactSDiv(*I, RHS, SE,
+                                      IgnoreSignificantBits);
+        if (!Op) return 0;
+        Ops.push_back(Op);
+      }
+      return SE.getAddExpr(Ops);
+    }
+    return 0;
+  }
+
+  // Check for a multiply operand that we can pull RHS out of.
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
+    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
+      SmallVector<const SCEV *, 4> Ops;
+      bool Found = false;
+      for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
+           I != E; ++I) {
+        const SCEV *S = *I;
+        if (!Found)
+          if (const SCEV *Q = getExactSDiv(S, RHS, SE,
+                                           IgnoreSignificantBits)) {
+            S = Q;
+            Found = true;
+          }
+        Ops.push_back(S);
+      }
+      return Found ? SE.getMulExpr(Ops) : 0;
+    }
+    return 0;
+  }
+
+  // Otherwise we don't know.
+  return 0;
+}
+
+/// ExtractImmediate - If S involves the addition of a constant integer value,
+/// return that integer value, and mutate S to point to a new SCEV with that
+/// value excluded.
+static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+    if (C->getValue()->getValue().getMinSignedBits() <= 64) {
+      S = SE.getConstant(C->getType(), 0);
+      return C->getValue()->getSExtValue();
+    }
+  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+    int64_t Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result != 0)
+      S = SE.getAddExpr(NewOps);
+    return Result;
+  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+    int64_t Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result != 0)
+      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+                           SCEV::FlagAnyWrap);
+    return Result;
+  }
+  return 0;
+}
+
+/// ExtractSymbol - If S involves the addition of a GlobalValue address,
+/// return that symbol, and mutate S to point to a new SCEV with that
+/// value excluded.
+static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
+  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
+      S = SE.getConstant(GV->getType(), 0);
+      return GV;
+    }
+  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
+    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
+    if (Result)
+      S = SE.getAddExpr(NewOps);
+    return Result;
+  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
+    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
+    if (Result)
+      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+                           SCEV::FlagAnyWrap);
+    return Result;
+  }
+  return 0;
+}
+
+/// isAddressUse - Returns true if the specified instruction is using the
+/// specified value as an address.
+static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+  bool isAddress = isa<LoadInst>(Inst);
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    if (SI->getOperand(1) == OperandVal)
+      isAddress = true;
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    // Addressing modes can also be folded into prefetches and a variety
+    // of intrinsics.
+    switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::prefetch:
+      case Intrinsic::x86_sse_storeu_ps:
+      case Intrinsic::x86_sse2_storeu_pd:
+      case Intrinsic::x86_sse2_storeu_dq:
+      case Intrinsic::x86_sse2_storel_dq:
+        if (II->getArgOperand(0) == OperandVal)
+          isAddress = true;
+        break;
+    }
+  }
+  return isAddress;
+}
+
+/// getAccessType - Return the type of the memory being accessed.
+static Type *getAccessType(const Instruction *Inst) {
+  Type *AccessTy = Inst->getType();
+  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    AccessTy = SI->getOperand(0)->getType();
+  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    // Addressing modes can also be folded into prefetches and a variety
+    // of intrinsics.
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::x86_sse_storeu_ps:
+    case Intrinsic::x86_sse2_storeu_pd:
+    case Intrinsic::x86_sse2_storeu_dq:
+    case Intrinsic::x86_sse2_storel_dq:
+      AccessTy = II->getArgOperand(0)->getType();
+      break;
+    }
+  }
+
+  // All pointers have the same requirements, so canonicalize them to an
+  // arbitrary pointer type to minimize variation.
+  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
+    AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
+                                PTy->getAddressSpace());
+
+  return AccessTy;
+}
+
+/// isExistingPhi - Return true if this AddRec is already a phi in its loop.
+static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    if (SE.isSCEVable(PN->getType()) &&
+        (SE.getEffectiveSCEVType(PN->getType()) ==
+         SE.getEffectiveSCEVType(AR->getType())) &&
+        SE.getSCEV(PN) == AR)
+      return true;
+  }
+  return false;
+}
+
+/// Check if expanding this expression is likely to incur significant cost. This
+/// is tricky because SCEV doesn't track which expressions are actually computed
+/// by the current IR.
+///
+/// We currently allow expansion of IV increments that involve adds,
+/// multiplication by constants, and AddRecs from existing phis.
+///
+/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
+/// obvious multiple of the UDivExpr.
+static bool isHighCostExpansion(const SCEV *S,
+                                SmallPtrSet<const SCEV*, 8> &Processed,
+                                ScalarEvolution &SE) {
+  // Zero/One operand expressions
+  switch (S->getSCEVType()) {
+  case scUnknown:
+  case scConstant:
+    return false;
+  case scTruncate:
+    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
+                               Processed, SE);
+  case scZeroExtend:
+    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
+                               Processed, SE);
+  case scSignExtend:
+    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
+                               Processed, SE);
+  }
+
+  if (!Processed.insert(S))
+    return false;
+
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+         I != E; ++I) {
+      if (isHighCostExpansion(*I, Processed, SE))
+        return true;
+    }
+    return false;
+  }
+
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+    if (Mul->getNumOperands() == 2) {
+      // Multiplication by a constant is ok
+      if (isa<SCEVConstant>(Mul->getOperand(0)))
+        return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
+
+      // If we have the value of one operand, check if an existing
+      // multiplication already generates this expression.
+      if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
+        Value *UVal = U->getValue();
+        for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end();
+             UI != UE; ++UI) {
+          // If U is a constant, it may be used by a ConstantExpr.
+          Instruction *User = dyn_cast<Instruction>(*UI);
+          if (User && User->getOpcode() == Instruction::Mul
+              && SE.isSCEVable(User->getType())) {
+            return SE.getSCEV(User) == Mul;
+          }
+        }
+      }
+    }
+  }
+
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    if (isExistingPhi(AR, SE))
+      return false;
+  }
+
+  // Fow now, consider any other type of expression (div/mul/min/max) high cost.
+  return true;
+}
+
+/// DeleteTriviallyDeadInstructions - If any of the instructions is the
+/// specified set are trivially dead, delete them and see if this makes any of
+/// their operands subsequently dead.
+static bool
+DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
+  bool Changed = false;
+
+  while (!DeadInsts.empty()) {
+    Value *V = DeadInsts.pop_back_val();
+    Instruction *I = dyn_cast_or_null<Instruction>(V);
+
+    if (I == 0 || !isInstructionTriviallyDead(I))
+      continue;
+
+    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+        *OI = 0;
+        if (U->use_empty())
+          DeadInsts.push_back(U);
+      }
+
+    I->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+namespace {
+
+/// Cost - This class is used to measure and compare candidate formulae.
+class Cost {
+  /// TODO: Some of these could be merged. Also, a lexical ordering
+  /// isn't always optimal.
+  unsigned NumRegs;
+  unsigned AddRecCost;
+  unsigned NumIVMuls;
+  unsigned NumBaseAdds;
+  unsigned ImmCost;
+  unsigned SetupCost;
+
+public:
+  Cost()
+    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
+      SetupCost(0) {}
+
+  bool operator<(const Cost &Other) const;
+
+  void Loose();
+
+#ifndef NDEBUG
+  // Once any of the metrics loses, they must all remain losers.
+  bool isValid() {
+    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+             | ImmCost | SetupCost) != ~0u)
+      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+           & ImmCost & SetupCost) == ~0u);
+  }
+#endif
+
+  bool isLoser() {
+    assert(isValid() && "invalid cost");
+    return NumRegs == ~0u;
+  }
+
+  void RateFormula(const Formula &F,
+                   SmallPtrSet<const SCEV *, 16> &Regs,
+                   const DenseSet<const SCEV *> &VisitedRegs,
+                   const Loop *L,
+                   const SmallVectorImpl<int64_t> &Offsets,
+                   ScalarEvolution &SE, DominatorTree &DT,
+                   SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+
+private:
+  void RateRegister(const SCEV *Reg,
+                    SmallPtrSet<const SCEV *, 16> &Regs,
+                    const Loop *L,
+                    ScalarEvolution &SE, DominatorTree &DT);
+  void RatePrimaryRegister(const SCEV *Reg,
+                           SmallPtrSet<const SCEV *, 16> &Regs,
+                           const Loop *L,
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           SmallPtrSet<const SCEV *, 16> *LoserRegs);
+};
+
+}
+
+/// RateRegister - Tally up interesting quantities from the given register.
+void Cost::RateRegister(const SCEV *Reg,
+                        SmallPtrSet<const SCEV *, 16> &Regs,
+                        const Loop *L,
+                        ScalarEvolution &SE, DominatorTree &DT) {
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+    // If this is an addrec for another loop, don't second-guess its addrec phi
+    // nodes. LSR isn't currently smart enough to reason about more than one
+    // loop at a time. LSR has already run on inner loops, will not run on outer
+    // loops, and cannot be expected to change sibling loops.
+    if (AR->getLoop() != L) {
+      // If the AddRec exists, consider it's register free and leave it alone.
+      if (isExistingPhi(AR, SE))
+        return;
+
+      // Otherwise, do not consider this formula at all.
+      Loose();
+      return;
+    }
+    AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+    // Add the step value register, if it needs one.
+    // TODO: The non-affine case isn't precisely modeled here.
+    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
+      if (!Regs.count(AR->getOperand(1))) {
+        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
+        if (isLoser())
+          return;
+      }
+    }
+  }
+  ++NumRegs;
+
+  // Rough heuristic; favor registers which don't require extra setup
+  // instructions in the preheader.
+  if (!isa<SCEVUnknown>(Reg) &&
+      !isa<SCEVConstant>(Reg) &&
+      !(isa<SCEVAddRecExpr>(Reg) &&
+        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
+         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
+    ++SetupCost;
+
+    NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+                 SE.hasComputableLoopEvolution(Reg, L);
+}
+
+/// RatePrimaryRegister - Record this register in the set. If we haven't seen it
+/// before, rate it. Optional LoserRegs provides a way to declare any formula
+/// that refers to one of those regs an instant loser.
+void Cost::RatePrimaryRegister(const SCEV *Reg,
+                               SmallPtrSet<const SCEV *, 16> &Regs,
+                               const Loop *L,
+                               ScalarEvolution &SE, DominatorTree &DT,
+                               SmallPtrSet<const SCEV *, 16> *LoserRegs) {
+  if (LoserRegs && LoserRegs->count(Reg)) {
+    Loose();
+    return;
+  }
+  if (Regs.insert(Reg)) {
+    RateRegister(Reg, Regs, L, SE, DT);
+    if (LoserRegs && isLoser())
+      LoserRegs->insert(Reg);
+  }
+}
+
+void Cost::RateFormula(const Formula &F,
+                       SmallPtrSet<const SCEV *, 16> &Regs,
+                       const DenseSet<const SCEV *> &VisitedRegs,
+                       const Loop *L,
+                       const SmallVectorImpl<int64_t> &Offsets,
+                       ScalarEvolution &SE, DominatorTree &DT,
+                       SmallPtrSet<const SCEV *, 16> *LoserRegs) {
+  // Tally up the registers.
+  if (const SCEV *ScaledReg = F.ScaledReg) {
+    if (VisitedRegs.count(ScaledReg)) {
+      Loose();
+      return;
+    }
+    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+    if (isLoser())
+      return;
+  }
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
+       E = F.BaseRegs.end(); I != E; ++I) {
+    const SCEV *BaseReg = *I;
+    if (VisitedRegs.count(BaseReg)) {
+      Loose();
+      return;
+    }
+    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+    if (isLoser())
+      return;
+  }
+
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
+  if (NumBaseParts > 1)
+    NumBaseAdds += NumBaseParts - 1;
+
+  // Tally up the non-zero immediates.
+  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
+       E = Offsets.end(); I != E; ++I) {
+    int64_t Offset = (uint64_t)*I + F.BaseOffset;
+    if (F.BaseGV)
+      ImmCost += 64; // Handle symbolic values conservatively.
+                     // TODO: This should probably be the pointer size.
+    else if (Offset != 0)
+      ImmCost += APInt(64, Offset, true).getMinSignedBits();
+  }
+  assert(isValid() && "invalid cost");
+}
+
+/// Loose - Set this cost to a losing value.
+void Cost::Loose() {
+  NumRegs = ~0u;
+  AddRecCost = ~0u;
+  NumIVMuls = ~0u;
+  NumBaseAdds = ~0u;
+  ImmCost = ~0u;
+  SetupCost = ~0u;
+}
+
+/// operator< - Choose the lower cost.
+bool Cost::operator<(const Cost &Other) const {
+  if (NumRegs != Other.NumRegs)
+    return NumRegs < Other.NumRegs;
+  if (AddRecCost != Other.AddRecCost)
+    return AddRecCost < Other.AddRecCost;
+  if (NumIVMuls != Other.NumIVMuls)
+    return NumIVMuls < Other.NumIVMuls;
+  if (NumBaseAdds != Other.NumBaseAdds)
+    return NumBaseAdds < Other.NumBaseAdds;
+  if (ImmCost != Other.ImmCost)
+    return ImmCost < Other.ImmCost;
+  if (SetupCost != Other.SetupCost)
+    return SetupCost < Other.SetupCost;
+  return false;
+}
+
+void Cost::print(raw_ostream &OS) const {
+  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
+  if (AddRecCost != 0)
+    OS << ", with addrec cost " << AddRecCost;
+  if (NumIVMuls != 0)
+    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
+  if (NumBaseAdds != 0)
+    OS << ", plus " << NumBaseAdds << " base add"
+       << (NumBaseAdds == 1 ? "" : "s");
+  if (ImmCost != 0)
+    OS << ", plus " << ImmCost << " imm cost";
+  if (SetupCost != 0)
+    OS << ", plus " << SetupCost << " setup cost";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Cost::dump() const {
+  print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+/// LSRFixup - An operand value in an instruction which is to be replaced
+/// with some equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+  /// UserInst - The instruction which will be updated.
+  Instruction *UserInst;
+
+  /// OperandValToReplace - The operand of the instruction which will
+  /// be replaced. The operand may be used more than once; every instance
+  /// will be replaced.
+  Value *OperandValToReplace;
+
+  /// PostIncLoops - If this user is to use the post-incremented value of an
+  /// induction variable, this variable is non-null and holds the loop
+  /// associated with the induction variable.
+  PostIncLoopSet PostIncLoops;
+
+  /// LUIdx - The index of the LSRUse describing the expression which
+  /// this fixup needs, minus an offset (below).
+  size_t LUIdx;
+
+  /// Offset - A constant offset to be added to the LSRUse expression.
+  /// This allows multiple fixups to share the same LSRUse with different
+  /// offsets, for example in an unrolled loop.
+  int64_t Offset;
+
+  bool isUseFullyOutsideLoop(const Loop *L) const;
+
+  LSRFixup();
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+LSRFixup::LSRFixup()
+  : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {}
+
+/// isUseFullyOutsideLoop - Test whether this fixup always uses its
+/// value outside of the given loop.
+bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
+  // PHI nodes use their value in their incoming blocks.
+  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (PN->getIncomingValue(i) == OperandValToReplace &&
+          L->contains(PN->getIncomingBlock(i)))
+        return false;
+    return true;
+  }
+
+  return !L->contains(UserInst);
+}
+
+void LSRFixup::print(raw_ostream &OS) const {
+  OS << "UserInst=";
+  // Store is common and interesting enough to be worth special-casing.
+  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
+    OS << "store ";
+    WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
+  } else if (UserInst->getType()->isVoidTy())
+    OS << UserInst->getOpcodeName();
+  else
+    WriteAsOperand(OS, UserInst, /*PrintType=*/false);
+
+  OS << ", OperandValToReplace=";
+  WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
+
+  for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
+       E = PostIncLoops.end(); I != E; ++I) {
+    OS << ", PostIncLoop=";
+    WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false);
+  }
+
+  if (LUIdx != ~size_t(0))
+    OS << ", LUIdx=" << LUIdx;
+
+  if (Offset != 0)
+    OS << ", Offset=" << Offset;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRFixup::dump() const {
+  print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
+/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
+  static SmallVector<const SCEV *, 4> getEmptyKey() {
+    SmallVector<const SCEV *, 4>  V;
+    V.push_back(reinterpret_cast<const SCEV *>(-1));
+    return V;
+  }
+
+  static SmallVector<const SCEV *, 4> getTombstoneKey() {
+    SmallVector<const SCEV *, 4> V;
+    V.push_back(reinterpret_cast<const SCEV *>(-2));
+    return V;
+  }
+
+  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+    unsigned Result = 0;
+    for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(),
+         E = V.end(); I != E; ++I)
+      Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I);
+    return Result;
+  }
+
+  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+                      const SmallVector<const SCEV *, 4> &RHS) {
+    return LHS == RHS;
+  }
+};
+
+/// LSRUse - This class holds the state that LSR keeps for each use in
+/// IVUsers, as well as uses invented by LSR itself. It includes information
+/// about what kinds of things can be folded into the user, information about
+/// the user itself, and information about how the use may be satisfied.
+/// TODO: Represent multiple users of the same expression in common?
+class LSRUse {
+  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+  /// KindType - An enum for a kind of use, indicating what types of
+  /// scaled and immediate operands it might support.
+  enum KindType {
+    Basic,   ///< A normal use, with no folding.
+    Special, ///< A special case of basic, allowing -1 scales.
+    Address, ///< An address use; folding according to TargetLowering
+    ICmpZero ///< An equality icmp with both operands folded into one.
+    // TODO: Add a generic icmp too?
+  };
+
+  KindType Kind;
+  Type *AccessTy;
+
+  SmallVector<int64_t, 8> Offsets;
+  int64_t MinOffset;
+  int64_t MaxOffset;
+
+  /// AllFixupsOutsideLoop - This records whether all of the fixups using this
+  /// LSRUse are outside of the loop, in which case some special-case heuristics
+  /// may be used.
+  bool AllFixupsOutsideLoop;
+
+  /// WidestFixupType - This records the widest use type for any fixup using
+  /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
+  /// max fixup widths to be equivalent, because the narrower one may be relying
+  /// on the implicit truncation to truncate away bogus bits.
+  Type *WidestFixupType;
+
+  /// Formulae - A list of ways to build a value that can satisfy this user.
+  /// After the list is populated, one of these is selected heuristically and
+  /// used to formulate a replacement for OperandValToReplace in UserInst.
+  SmallVector<Formula, 12> Formulae;
+
+  /// Regs - The set of register candidates used by all formulae in this LSRUse.
+  SmallPtrSet<const SCEV *, 4> Regs;
+
+  LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
+                                      MinOffset(INT64_MAX),
+                                      MaxOffset(INT64_MIN),
+                                      AllFixupsOutsideLoop(true),
+                                      WidestFixupType(0) {}
+
+  bool HasFormulaWithSameRegs(const Formula &F) const;
+  bool InsertFormula(const Formula &F);
+  void DeleteFormula(Formula &F);
+  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+/// HasFormula - Test whether this use as a formula which has the same
+/// registers as the given formula.
+bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
+  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+  if (F.ScaledReg) Key.push_back(F.ScaledReg);
+  // Unstable sort by host order ok, because this is only used for uniquifying.
+  std::sort(Key.begin(), Key.end());
+  return Uniquifier.count(Key);
+}
+
+/// InsertFormula - If the given formula has not yet been inserted, add it to
+/// the list, and return true. Return false otherwise.
+bool LSRUse::InsertFormula(const Formula &F) {
+  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+  if (F.ScaledReg) Key.push_back(F.ScaledReg);
+  // Unstable sort by host order ok, because this is only used for uniquifying.
+  std::sort(Key.begin(), Key.end());
+
+  if (!Uniquifier.insert(Key).second)
+    return false;
+
+  // Using a register to hold the value of 0 is not profitable.
+  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
+         "Zero allocated in a scaled register!");
+#ifndef NDEBUG
+  for (SmallVectorImpl<const SCEV *>::const_iterator I =
+       F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
+    assert(!(*I)->isZero() && "Zero allocated in a base register!");
+#endif
+
+  // Add the formula to the list.
+  Formulae.push_back(F);
+
+  // Record registers now being used by this use.
+  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+
+  return true;
+}
+
+/// DeleteFormula - Remove the given formula from this use's list.
+void LSRUse::DeleteFormula(Formula &F) {
+  if (&F != &Formulae.back())
+    std::swap(F, Formulae.back());
+  Formulae.pop_back();
+}
+
+/// RecomputeRegs - Recompute the Regs field, and update RegUses.
+void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
+  // Now that we've filtered out some formulae, recompute the Regs set.
+  SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
+  Regs.clear();
+  for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
+       E = Formulae.end(); I != E; ++I) {
+    const Formula &F = *I;
+    if (F.ScaledReg) Regs.insert(F.ScaledReg);
+    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+  }
+
+  // Update the RegTracker.
+  for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
+       E = OldRegs.end(); I != E; ++I)
+    if (!Regs.count(*I))
+      RegUses.DropRegister(*I, LUIdx);
+}
+
+void LSRUse::print(raw_ostream &OS) const {
+  OS << "LSR Use: Kind=";
+  switch (Kind) {
+  case Basic:    OS << "Basic"; break;
+  case Special:  OS << "Special"; break;
+  case ICmpZero: OS << "ICmpZero"; break;
+  case Address:
+    OS << "Address of ";
+    if (AccessTy->isPointerTy())
+      OS << "pointer"; // the full pointer type could be really verbose
+    else
+      OS << *AccessTy;
+  }
+
+  OS << ", Offsets={";
+  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
+       E = Offsets.end(); I != E; ++I) {
+    OS << *I;
+    if (llvm::next(I) != E)
+      OS << ',';
+  }
+  OS << '}';
+
+  if (AllFixupsOutsideLoop)
+    OS << ", all-fixups-outside-loop";
+
+  if (WidestFixupType)
+    OS << ", widest fixup type: " << *WidestFixupType;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRUse::dump() const {
+  print(errs()); errs() << '\n';
+}
+#endif
+
+/// isLegalUse - Test whether the use described by AM is "legal", meaning it can
+/// be completely folded into the user instruction at isel time. This includes
+/// address-mode folding and special icmp tricks.
+static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind,
+                       Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset,
+                       bool HasBaseReg, int64_t Scale) {
+  switch (Kind) {
+  case LSRUse::Address:
+    return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
+
+    // Otherwise, just guess that reg+reg addressing is legal.
+    //return ;
+
+  case LSRUse::ICmpZero:
+    // There's not even a target hook for querying whether it would be legal to
+    // fold a GV into an ICmp.
+    if (BaseGV)
+      return false;
+
+    // ICmp only has two operands; don't allow more than two non-trivial parts.
+    if (Scale != 0 && HasBaseReg && BaseOffset != 0)
+      return false;
+
+    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
+    // putting the scaled register in the other operand of the icmp.
+    if (Scale != 0 && Scale != -1)
+      return false;
+
+    // If we have low-level target information, ask the target if it can fold an
+    // integer immediate on an icmp.
+    if (BaseOffset != 0) {
+      // We have one of:
+      // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
+      // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
+      // Offs is the ICmp immediate.
+      if (Scale == 0)
+        // The cast does the right thing with INT64_MIN.
+        BaseOffset = -(uint64_t)BaseOffset;
+      return TTI.isLegalICmpImmediate(BaseOffset);
+    }
+
+    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
+    return true;
+
+  case LSRUse::Basic:
+    // Only handle single-register values.
+    return !BaseGV && Scale == 0 && BaseOffset == 0;
+
+  case LSRUse::Special:
+    // Special case Basic to handle -1 scales.
+    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
+  }
+
+  llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
+                       GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
+                       int64_t Scale) {
+  // Check for overflow.
+  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
+      (MinOffset > 0))
+    return false;
+  MinOffset = (uint64_t)BaseOffset + MinOffset;
+  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
+      (MaxOffset > 0))
+    return false;
+  MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+
+  return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg,
+                    Scale) &&
+         isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale);
+}
+
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
+                       const Formula &F) {
+  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
+                    F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+                             LSRUse::KindType Kind, Type *AccessTy,
+                             GlobalValue *BaseGV, int64_t BaseOffset,
+                             bool HasBaseReg) {
+  // Fast-path: zero is always foldable.
+  if (BaseOffset == 0 && !BaseGV) return true;
+
+  // Conservatively, create an address with an immediate and a
+  // base and a scale.
+  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+  // Canonicalize a scale of 1 to a base register if the formula doesn't
+  // already have a base register.
+  if (!HasBaseReg && Scale == 1) {
+    Scale = 0;
+    HasBaseReg = true;
+  }
+
+  return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+                             ScalarEvolution &SE, int64_t MinOffset,
+                             int64_t MaxOffset, LSRUse::KindType Kind,
+                             Type *AccessTy, const SCEV *S, bool HasBaseReg) {
+  // Fast-path: zero is always foldable.
+  if (S->isZero()) return true;
+
+  // Conservatively, create an address with an immediate and a
+  // base and a scale.
+  int64_t BaseOffset = ExtractImmediate(S, SE);
+  GlobalValue *BaseGV = ExtractSymbol(S, SE);
+
+  // If there's anything else involved, it's not foldable.
+  if (!S->isZero()) return false;
+
+  // Fast-path: zero is always foldable.
+  if (BaseOffset == 0 && !BaseGV) return true;
+
+  // Conservatively, create an address with an immediate and a
+  // base and a scale.
+  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+                    BaseOffset, HasBaseReg, Scale);
+}
+
+namespace {
+
+/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
+/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
+struct UseMapDenseMapInfo {
+  static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
+    return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
+  }
+
+  static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
+    return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
+  }
+
+  static unsigned
+  getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
+    unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
+    Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
+    return Result;
+  }
+
+  static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
+                      const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
+    return LHS == RHS;
+  }
+};
+
+/// IVInc - An individual increment in a Chain of IV increments.
+/// Relate an IV user to an expression that computes the IV it uses from the IV
+/// used by the previous link in the Chain.
+///
+/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
+/// original IVOperand. The head of the chain's IVOperand is only valid during
+/// chain collection, before LSR replaces IV users. During chain generation,
+/// IncExpr can be used to find the new IVOperand that computes the same
+/// expression.
+struct IVInc {
+  Instruction *UserInst;
+  Value* IVOperand;
+  const SCEV *IncExpr;
+
+  IVInc(Instruction *U, Value *O, const SCEV *E):
+    UserInst(U), IVOperand(O), IncExpr(E) {}
+};
+
+// IVChain - The list of IV increments in program order.
+// We typically add the head of a chain without finding subsequent links.
+struct IVChain {
+  SmallVector<IVInc,1> Incs;
+  const SCEV *ExprBase;
+
+  IVChain() : ExprBase(0) {}
+
+  IVChain(const IVInc &Head, const SCEV *Base)
+    : Incs(1, Head), ExprBase(Base) {}
+
+  typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
+
+  // begin - return the first increment in the chain.
+  const_iterator begin() const {
+    assert(!Incs.empty());
+    return llvm::next(Incs.begin());
+  }
+  const_iterator end() const {
+    return Incs.end();
+  }
+
+  // hasIncs - Returns true if this chain contains any increments.
+  bool hasIncs() const { return Incs.size() >= 2; }
+
+  // add - Add an IVInc to the end of this chain.
+  void add(const IVInc &X) { Incs.push_back(X); }
+
+  // tailUserInst - Returns the last UserInst in the chain.
+  Instruction *tailUserInst() const { return Incs.back().UserInst; }
+
+  // isProfitableIncrement - Returns true if IncExpr can be profitably added to
+  // this chain.
+  bool isProfitableIncrement(const SCEV *OperExpr,
+                             const SCEV *IncExpr,
+                             ScalarEvolution&);
+};
+
+/// ChainUsers - Helper for CollectChains to track multiple IV increment uses.
+/// Distinguish between FarUsers that definitely cross IV increments and
+/// NearUsers that may be used between IV increments.
+struct ChainUsers {
+  SmallPtrSet<Instruction*, 4> FarUsers;
+  SmallPtrSet<Instruction*, 4> NearUsers;
+};
+
+/// LSRInstance - This class holds state for the main loop strength reduction
+/// logic.
+class LSRInstance {
+  IVUsers &IU;
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+  LoopInfo &LI;
+  const TargetTransformInfo &TTI;
+  Loop *const L;
+  bool Changed;
+
+  /// IVIncInsertPos - This is the insert position that the current loop's
+  /// induction variable increment should be placed. In simple loops, this is
+  /// the latch block's terminator. But in more complicated cases, this is a
+  /// position which will dominate all the in-loop post-increment users.
+  Instruction *IVIncInsertPos;
+
+  /// Factors - Interesting factors between use strides.
+  SmallSetVector<int64_t, 8> Factors;
+
+  /// Types - Interesting use types, to facilitate truncation reuse.
+  SmallSetVector<Type *, 4> Types;
+
+  /// Fixups - The list of operands which are to be replaced.
+  SmallVector<LSRFixup, 16> Fixups;
+
+  /// Uses - The list of interesting uses.
+  SmallVector<LSRUse, 16> Uses;
+
+  /// RegUses - Track which uses use which register candidates.
+  RegUseTracker RegUses;
+
+  // Limit the number of chains to avoid quadratic behavior. We don't expect to
+  // have more than a few IV increment chains in a loop. Missing a Chain falls
+  // back to normal LSR behavior for those uses.
+  static const unsigned MaxChains = 8;
+
+  /// IVChainVec - IV users can form a chain of IV increments.
+  SmallVector<IVChain, MaxChains> IVChainVec;
+
+  /// IVIncSet - IV users that belong to profitable IVChains.
+  SmallPtrSet<Use*, MaxChains> IVIncSet;
+
+  void OptimizeShadowIV();
+  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
+  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+  void OptimizeLoopTermCond();
+
+  void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+                        SmallVectorImpl<ChainUsers> &ChainUsersVec);
+  void FinalizeChain(IVChain &Chain);
+  void CollectChains();
+  void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+                       SmallVectorImpl<WeakVH> &DeadInsts);
+
+  void CollectInterestingTypesAndFactors();
+  void CollectFixupsAndInitialFormulae();
+
+  LSRFixup &getNewFixup() {
+    Fixups.push_back(LSRFixup());
+    return Fixups.back();
+  }
+
+  // Support for sharing of LSRUses between LSRFixups.
+  typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
+                   size_t,
+                   UseMapDenseMapInfo> UseMapTy;
+  UseMapTy UseMap;
+
+  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+                          LSRUse::KindType Kind, Type *AccessTy);
+
+  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
+                                    LSRUse::KindType Kind,
+                                    Type *AccessTy);
+
+  void DeleteUse(LSRUse &LU, size_t LUIdx);
+
+  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
+
+  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+  void CountRegisters(const Formula &F, size_t LUIdx);
+  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+
+  void CollectLoopInvariantFixupsAndFormulae();
+
+  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
+                              unsigned Depth = 0);
+  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
+  void GenerateCrossUseConstantOffsets();
+  void GenerateAllReuseFormulae();
+
+  void FilterOutUndesirableDedicatedRegisters();
+
+  size_t EstimateSearchSpaceComplexity() const;
+  void NarrowSearchSpaceByDetectingSupersets();
+  void NarrowSearchSpaceByCollapsingUnrolledCode();
+  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  void NarrowSearchSpaceByPickingWinnerRegs();
+  void NarrowSearchSpaceUsingHeuristics();
+
+  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+                    Cost &SolutionCost,
+                    SmallVectorImpl<const Formula *> &Workspace,
+                    const Cost &CurCost,
+                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
+                    DenseSet<const SCEV *> &VisitedRegs) const;
+  void Solve(SmallVectorImpl<const Formula *> &Solution) const;
+
+  BasicBlock::iterator
+    HoistInsertPosition(BasicBlock::iterator IP,
+                        const SmallVectorImpl<Instruction *> &Inputs) const;
+  BasicBlock::iterator
+    AdjustInsertPositionForExpand(BasicBlock::iterator IP,
+                                  const LSRFixup &LF,
+                                  const LSRUse &LU,
+                                  SCEVExpander &Rewriter) const;
+
+  Value *Expand(const LSRFixup &LF,
+                const Formula &F,
+                BasicBlock::iterator IP,
+                SCEVExpander &Rewriter,
+                SmallVectorImpl<WeakVH> &DeadInsts) const;
+  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
+                     const Formula &F,
+                     SCEVExpander &Rewriter,
+                     SmallVectorImpl<WeakVH> &DeadInsts,
+                     Pass *P) const;
+  void Rewrite(const LSRFixup &LF,
+               const Formula &F,
+               SCEVExpander &Rewriter,
+               SmallVectorImpl<WeakVH> &DeadInsts,
+               Pass *P) const;
+  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
+                         Pass *P);
+
+public:
+  LSRInstance(Loop *L, Pass *P);
+
+  bool getChanged() const { return Changed; }
+
+  void print_factors_and_types(raw_ostream &OS) const;
+  void print_fixups(raw_ostream &OS) const;
+  void print_uses(raw_ostream &OS) const;
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+/// OptimizeShadowIV - If IV is used in a int-to-float cast
+/// inside the loop then try to eliminate the cast operation.
+void LSRInstance::OptimizeShadowIV() {
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return;
+
+  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
+       UI != E; /* empty */) {
+    IVUsers::const_iterator CandidateUI = UI;
+    ++UI;
+    Instruction *ShadowUse = CandidateUI->getUser();
+    Type *DestTy = NULL;
+    bool IsSigned = false;
+
+    /* If shadow use is a int->float cast then insert a second IV
+       to eliminate this cast.
+
+         for (unsigned i = 0; i < n; ++i)
+           foo((double)i);
+
+       is transformed into
+
+         double d = 0.0;
+         for (unsigned i = 0; i < n; ++i, ++d)
+           foo(d);
+    */
+    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
+      IsSigned = false;
+      DestTy = UCast->getDestTy();
+    }
+    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
+      IsSigned = true;
+      DestTy = SCast->getDestTy();
+    }
+    if (!DestTy) continue;
+
+    // If target does not support DestTy natively then do not apply
+    // this transformation.
+    if (!TTI.isTypeLegal(DestTy)) continue;
+
+    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
+    if (!PH) continue;
+    if (PH->getNumIncomingValues() != 2) continue;
+
+    Type *SrcTy = PH->getType();
+    int Mantissa = DestTy->getFPMantissaWidth();
+    if (Mantissa == -1) continue;
+    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
+      continue;
+
+    unsigned Entry, Latch;
+    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
+      Entry = 0;
+      Latch = 1;
+    } else {
+      Entry = 1;
+      Latch = 0;
+    }
+
+    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
+    if (!Init) continue;
+    Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
+                                        (double)Init->getSExtValue() :
+                                        (double)Init->getZExtValue());
+
+    BinaryOperator *Incr =
+      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
+    if (!Incr) continue;
+    if (Incr->getOpcode() != Instruction::Add
+        && Incr->getOpcode() != Instruction::Sub)
+      continue;
+
+    /* Initialize new IV, double d = 0.0 in above example. */
+    ConstantInt *C = NULL;
+    if (Incr->getOperand(0) == PH)
+      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
+    else if (Incr->getOperand(1) == PH)
+      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
+    else
+      continue;
+
+    if (!C) continue;
+
+    // Ignore negative constants, as the code below doesn't handle them
+    // correctly. TODO: Remove this restriction.
+    if (!C->getValue().isStrictlyPositive()) continue;
+
+    /* Add new PHINode. */
+    PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
+
+    /* create new increment. '++d' in above example. */
+    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
+    BinaryOperator *NewIncr =
+      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
+                               Instruction::FAdd : Instruction::FSub,
+                             NewPH, CFP, "IV.S.next.", Incr);
+
+    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
+    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
+
+    /* Remove cast operation */
+    ShadowUse->replaceAllUsesWith(NewPH);
+    ShadowUse->eraseFromParent();
+    Changed = true;
+    break;
+  }
+}
+
+/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
+/// set the IV user and stride information and return true, otherwise return
+/// false.
+bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+  for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
+    if (UI->getUser() == Cond) {
+      // NOTE: we could handle setcc instructions with multiple uses here, but
+      // InstCombine does it as well for simple uses, it's not clear that it
+      // occurs enough in real life to handle.
+      CondUse = UI;
+      return true;
+    }
+  return false;
+}
+
+/// OptimizeMax - Rewrite the loop's terminating condition if it uses
+/// a max computation.
+///
+/// This is a narrow solution to a specific, but acute, problem. For loops
+/// like this:
+///
+///   i = 0;
+///   do {
+///     p[i] = 0.0;
+///   } while (++i < n);
+///
+/// the trip count isn't just 'n', because 'n' might not be positive. And
+/// unfortunately this can come up even for loops where the user didn't use
+/// a C do-while loop. For example, seemingly well-behaved top-test loops
+/// will commonly be lowered like this:
+//
+///   if (n > 0) {
+///     i = 0;
+///     do {
+///       p[i] = 0.0;
+///     } while (++i < n);
+///   }
+///
+/// and then it's possible for subsequent optimization to obscure the if
+/// test in such a way that indvars can't find it.
+///
+/// When indvars can't find the if test in loops like this, it creates a
+/// max expression, which allows it to give the loop a canonical
+/// induction variable:
+///
+///   i = 0;
+///   max = n < 1 ? 1 : n;
+///   do {
+///     p[i] = 0.0;
+///   } while (++i != max);
+///
+/// Canonical induction variables are necessary because the loop passes
+/// are designed around them. The most obvious example of this is the
+/// LoopInfo analysis, which doesn't remember trip count values. It
+/// expects to be able to rediscover the trip count each time it is
+/// needed, and it does this using a simple analysis that only succeeds if
+/// the loop has a canonical induction variable.
+///
+/// However, when it comes time to generate code, the maximum operation
+/// can be quite costly, especially if it's inside of an outer loop.
+///
+/// This function solves this problem by detecting this type of loop and
+/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
+/// the instructions for the maximum computation.
+///
+ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+  // Check that the loop matches the pattern we're looking for.
+  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
+      Cond->getPredicate() != CmpInst::ICMP_NE)
+    return Cond;
+
+  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
+  if (!Sel || !Sel->hasOneUse()) return Cond;
+
+  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+    return Cond;
+  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
+
+  // Add one to the backedge-taken count to get the trip count.
+  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
+  if (IterationCount != SE.getSCEV(Sel)) return Cond;
+
+  // Check for a max calculation that matches the pattern. There's no check
+  // for ICMP_ULE here because the comparison would be with zero, which
+  // isn't interesting.
+  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+  const SCEVNAryExpr *Max = 0;
+  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
+    Pred = ICmpInst::ICMP_SLE;
+    Max = S;
+  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
+    Pred = ICmpInst::ICMP_SLT;
+    Max = S;
+  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
+    Pred = ICmpInst::ICMP_ULT;
+    Max = U;
+  } else {
+    // No match; bail.
+    return Cond;
+  }
+
+  // To handle a max with more than two operands, this optimization would
+  // require additional checking and setup.
+  if (Max->getNumOperands() != 2)
+    return Cond;
+
+  const SCEV *MaxLHS = Max->getOperand(0);
+  const SCEV *MaxRHS = Max->getOperand(1);
+
+  // ScalarEvolution canonicalizes constants to the left. For < and >, look
+  // for a comparison with 1. For <= and >=, a comparison with zero.
+  if (!MaxLHS ||
+      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
+    return Cond;
+
+  // Check the relevant induction variable for conformance to
+  // the pattern.
+  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+  if (!AR || !AR->isAffine() ||
+      AR->getStart() != One ||
+      AR->getStepRecurrence(SE) != One)
+    return Cond;
+
+  assert(AR->getLoop() == L &&
+         "Loop condition operand is an addrec in a different loop!");
+
+  // Check the right operand of the select, and remember it, as it will
+  // be used in the new comparison instruction.
+  Value *NewRHS = 0;
+  if (ICmpInst::isTrueWhenEqual(Pred)) {
+    // Look for n+1, and grab n.
+    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
+      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+         if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+           NewRHS = BO->getOperand(0);
+    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
+      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+        if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+          NewRHS = BO->getOperand(0);
+    if (!NewRHS)
+      return Cond;
+  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
+    NewRHS = Sel->getOperand(1);
+  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
+    NewRHS = Sel->getOperand(2);
+  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
+    NewRHS = SU->getValue();
+  else
+    // Max doesn't match expected pattern.
+    return Cond;
+
+  // Determine the new comparison opcode. It may be signed or unsigned,
+  // and the original comparison may be either equality or inequality.
+  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
+    Pred = CmpInst::getInversePredicate(Pred);
+
+  // Ok, everything looks ok to change the condition into an SLT or SGE and
+  // delete the max calculation.
+  ICmpInst *NewCond =
+    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
+
+  // Delete the max calculation instructions.
+  Cond->replaceAllUsesWith(NewCond);
+  CondUse->setUser(NewCond);
+  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
+  Cond->eraseFromParent();
+  Sel->eraseFromParent();
+  if (Cmp->use_empty())
+    Cmp->eraseFromParent();
+  return NewCond;
+}
+
+/// OptimizeLoopTermCond - Change loop terminating condition to use the
+/// postinc iv when possible.
+void
+LSRInstance::OptimizeLoopTermCond() {
+  SmallPtrSet<Instruction *, 4> PostIncs;
+
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  SmallVector<BasicBlock*, 8> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitingBlock = ExitingBlocks[i];
+
+    // Get the terminating condition for the loop if possible.  If we
+    // can, we want to change it to use a post-incremented version of its
+    // induction variable, to allow coalescing the live ranges for the IV into
+    // one register value.
+
+    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+    if (!TermBr)
+      continue;
+    // FIXME: Overly conservative, termination condition could be an 'or' etc..
+    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+      continue;
+
+    // Search IVUsesByStride to find Cond's IVUse if there is one.
+    IVStrideUse *CondUse = 0;
+    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+    if (!FindIVUserForCond(Cond, CondUse))
+      continue;
+
+    // If the trip count is computed in terms of a max (due to ScalarEvolution
+    // being unable to find a sufficient guard, for example), change the loop
+    // comparison to use SLT or ULT instead of NE.
+    // One consequence of doing this now is that it disrupts the count-down
+    // optimization. That's not always a bad thing though, because in such
+    // cases it may still be worthwhile to avoid a max.
+    Cond = OptimizeMax(Cond, CondUse);
+
+    // If this exiting block dominates the latch block, it may also use
+    // the post-inc value if it won't be shared with other uses.
+    // Check for dominance.
+    if (!DT.dominates(ExitingBlock, LatchBlock))
+      continue;
+
+    // Conservatively avoid trying to use the post-inc value in non-latch
+    // exits if there may be pre-inc users in intervening blocks.
+    if (LatchBlock != ExitingBlock)
+      for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
+        // Test if the use is reachable from the exiting block. This dominator
+        // query is a conservative approximation of reachability.
+        if (&*UI != CondUse &&
+            !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
+          // Conservatively assume there may be reuse if the quotient of their
+          // strides could be a legal scale.
+          const SCEV *A = IU.getStride(*CondUse, L);
+          const SCEV *B = IU.getStride(*UI, L);
+          if (!A || !B) continue;
+          if (SE.getTypeSizeInBits(A->getType()) !=
+              SE.getTypeSizeInBits(B->getType())) {
+            if (SE.getTypeSizeInBits(A->getType()) >
+                SE.getTypeSizeInBits(B->getType()))
+              B = SE.getSignExtendExpr(B, A->getType());
+            else
+              A = SE.getSignExtendExpr(A, B->getType());
+          }
+          if (const SCEVConstant *D =
+                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
+            const ConstantInt *C = D->getValue();
+            // Stride of one or negative one can have reuse with non-addresses.
+            if (C->isOne() || C->isAllOnesValue())
+              goto decline_post_inc;
+            // Avoid weird situations.
+            if (C->getValue().getMinSignedBits() >= 64 ||
+                C->getValue().isMinSignedValue())
+              goto decline_post_inc;
+            // Check for possible scaled-address reuse.
+            Type *AccessTy = getAccessType(UI->getUser());
+            int64_t Scale = C->getSExtValue();
+            if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0,
+                                          /*BaseOffset=*/ 0,
+                                          /*HasBaseReg=*/ false, Scale))
+              goto decline_post_inc;
+            Scale = -Scale;
+            if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0,
+                                          /*BaseOffset=*/ 0,
+                                          /*HasBaseReg=*/ false, Scale))
+              goto decline_post_inc;
+          }
+        }
+
+    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
+                 << *Cond << '\n');
+
+    // It's possible for the setcc instruction to be anywhere in the loop, and
+    // possible for it to have multiple users.  If it is not immediately before
+    // the exiting block branch, move it.
+    if (&*++BasicBlock::iterator(Cond) != TermBr) {
+      if (Cond->hasOneUse()) {
+        Cond->moveBefore(TermBr);
+      } else {
+        // Clone the terminating condition and insert into the loopend.
+        ICmpInst *OldCond = Cond;
+        Cond = cast<ICmpInst>(Cond->clone());
+        Cond->setName(L->getHeader()->getName() + ".termcond");
+        ExitingBlock->getInstList().insert(TermBr, Cond);
+
+        // Clone the IVUse, as the old use still exists!
+        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
+        TermBr->replaceUsesOfWith(OldCond, Cond);
+      }
+    }
+
+    // If we get to here, we know that we can transform the setcc instruction to
+    // use the post-incremented version of the IV, allowing us to coalesce the
+    // live ranges for the IV correctly.
+    CondUse->transformToPostInc(L);
+    Changed = true;
+
+    PostIncs.insert(Cond);
+  decline_post_inc:;
+  }
+
+  // Determine an insertion point for the loop induction variable increment. It
+  // must dominate all the post-inc comparisons we just set up, and it must
+  // dominate the loop latch edge.
+  IVIncInsertPos = L->getLoopLatch()->getTerminator();
+  for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
+       E = PostIncs.end(); I != E; ++I) {
+    BasicBlock *BB =
+      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
+                                    (*I)->getParent());
+    if (BB == (*I)->getParent())
+      IVIncInsertPos = *I;
+    else if (BB != IVIncInsertPos->getParent())
+      IVIncInsertPos = BB->getTerminator();
+  }
+}
+
+/// reconcileNewOffset - Determine if the given use can accommodate a fixup
+/// at the given offset and other details. If so, update the use and
+/// return true.
+bool
+LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+                                LSRUse::KindType Kind, Type *AccessTy) {
+  int64_t NewMinOffset = LU.MinOffset;
+  int64_t NewMaxOffset = LU.MaxOffset;
+  Type *NewAccessTy = AccessTy;
+
+  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
+  // something conservative, however this can pessimize in the case that one of
+  // the uses will have all its uses outside the loop, for example.
+  if (LU.Kind != Kind)
+    return false;
+  // Conservatively assume HasBaseReg is true for now.
+  if (NewOffset < LU.MinOffset) {
+    if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0,
+                          LU.MaxOffset - NewOffset, HasBaseReg))
+      return false;
+    NewMinOffset = NewOffset;
+  } else if (NewOffset > LU.MaxOffset) {
+    if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0,
+                          NewOffset - LU.MinOffset, HasBaseReg))
+      return false;
+    NewMaxOffset = NewOffset;
+  }
+  // Check for a mismatched access type, and fall back conservatively as needed.
+  // TODO: Be less conservative when the type is similar and can use the same
+  // addressing modes.
+  if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
+    NewAccessTy = Type::getVoidTy(AccessTy->getContext());
+
+  // Update the use.
+  LU.MinOffset = NewMinOffset;
+  LU.MaxOffset = NewMaxOffset;
+  LU.AccessTy = NewAccessTy;
+  if (NewOffset != LU.Offsets.back())
+    LU.Offsets.push_back(NewOffset);
+  return true;
+}
+
+/// getUse - Return an LSRUse index and an offset value for a fixup which
+/// needs the given expression, with the given kind and optional access type.
+/// Either reuse an existing use or create a new one, as needed.
+std::pair<size_t, int64_t>
+LSRInstance::getUse(const SCEV *&Expr,
+                    LSRUse::KindType Kind, Type *AccessTy) {
+  const SCEV *Copy = Expr;
+  int64_t Offset = ExtractImmediate(Expr, SE);
+
+  // Basic uses can't accept any offset, for example.
+  if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0,
+                        Offset, /*HasBaseReg=*/ true)) {
+    Expr = Copy;
+    Offset = 0;
+  }
+
+  std::pair<UseMapTy::iterator, bool> P =
+    UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
+  if (!P.second) {
+    // A use already existed with this base.
+    size_t LUIdx = P.first->second;
+    LSRUse &LU = Uses[LUIdx];
+    if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
+      // Reuse this use.
+      return std::make_pair(LUIdx, Offset);
+  }
+
+  // Create a new use.
+  size_t LUIdx = Uses.size();
+  P.first->second = LUIdx;
+  Uses.push_back(LSRUse(Kind, AccessTy));
+  LSRUse &LU = Uses[LUIdx];
+
+  // We don't need to track redundant offsets, but we don't need to go out
+  // of our way here to avoid them.
+  if (LU.Offsets.empty() || Offset != LU.Offsets.back())
+    LU.Offsets.push_back(Offset);
+
+  LU.MinOffset = Offset;
+  LU.MaxOffset = Offset;
+  return std::make_pair(LUIdx, Offset);
+}
+
+/// DeleteUse - Delete the given use from the Uses list.
+void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
+  if (&LU != &Uses.back())
+    std::swap(LU, Uses.back());
+  Uses.pop_back();
+
+  // Update RegUses.
+  RegUses.SwapAndDropUse(LUIdx, Uses.size());
+}
+
+/// FindUseWithFormula - Look for a use distinct from OrigLU which is has
+/// a formula that has the same registers as the given formula.
+LSRUse *
+LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
+                                       const LSRUse &OrigLU) {
+  // Search all uses for the formula. This could be more clever.
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    // Check whether this use is close enough to OrigLU, to see whether it's
+    // worthwhile looking through its formulae.
+    // Ignore ICmpZero uses because they may contain formulae generated by
+    // GenerateICmpZeroScales, in which case adding fixup offsets may
+    // be invalid.
+    if (&LU != &OrigLU &&
+        LU.Kind != LSRUse::ICmpZero &&
+        LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
+        LU.WidestFixupType == OrigLU.WidestFixupType &&
+        LU.HasFormulaWithSameRegs(OrigF)) {
+      // Scan through this use's formulae.
+      for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
+           E = LU.Formulae.end(); I != E; ++I) {
+        const Formula &F = *I;
+        // Check to see if this formula has the same registers and symbols
+        // as OrigF.
+        if (F.BaseRegs == OrigF.BaseRegs &&
+            F.ScaledReg == OrigF.ScaledReg &&
+            F.BaseGV == OrigF.BaseGV &&
+            F.Scale == OrigF.Scale &&
+            F.UnfoldedOffset == OrigF.UnfoldedOffset) {
+          if (F.BaseOffset == 0)
+            return &LU;
+          // This is the formula where all the registers and symbols matched;
+          // there aren't going to be any others. Since we declined it, we
+          // can skip the rest of the formulae and proceed to the next LSRUse.
+          break;
+        }
+      }
+    }
+  }
+
+  // Nothing looked good.
+  return 0;
+}
+
+void LSRInstance::CollectInterestingTypesAndFactors() {
+  SmallSetVector<const SCEV *, 4> Strides;
+
+  // Collect interesting types and strides.
+  SmallVector<const SCEV *, 4> Worklist;
+  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
+    const SCEV *Expr = IU.getExpr(*UI);
+
+    // Collect interesting types.
+    Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
+
+    // Add strides for mentioned loops.
+    Worklist.push_back(Expr);
+    do {
+      const SCEV *S = Worklist.pop_back_val();
+      if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+        if (AR->getLoop() == L)
+          Strides.insert(AR->getStepRecurrence(SE));
+        Worklist.push_back(AR->getStart());
+      } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+        Worklist.append(Add->op_begin(), Add->op_end());
+      }
+    } while (!Worklist.empty());
+  }
+
+  // Compute interesting factors from the set of interesting strides.
+  for (SmallSetVector<const SCEV *, 4>::const_iterator
+       I = Strides.begin(), E = Strides.end(); I != E; ++I)
+    for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
+         llvm::next(I); NewStrideIter != E; ++NewStrideIter) {
+      const SCEV *OldStride = *I;
+      const SCEV *NewStride = *NewStrideIter;
+
+      if (SE.getTypeSizeInBits(OldStride->getType()) !=
+          SE.getTypeSizeInBits(NewStride->getType())) {
+        if (SE.getTypeSizeInBits(OldStride->getType()) >
+            SE.getTypeSizeInBits(NewStride->getType()))
+          NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
+        else
+          OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
+      }
+      if (const SCEVConstant *Factor =
+            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
+                                                        SE, true))) {
+        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
+          Factors.insert(Factor->getValue()->getValue().getSExtValue());
+      } else if (const SCEVConstant *Factor =
+                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
+                                                               NewStride,
+                                                               SE, true))) {
+        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
+          Factors.insert(Factor->getValue()->getValue().getSExtValue());
+      }
+    }
+
+  // If all uses use the same type, don't bother looking for truncation-based
+  // reuse.
+  if (Types.size() == 1)
+    Types.clear();
+
+  DEBUG(print_factors_and_types(dbgs()));
+}
+
+/// findIVOperand - Helper for CollectChains that finds an IV operand (computed
+/// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped
+/// Instructions to IVStrideUses, we could partially skip this.
+static User::op_iterator
+findIVOperand(User::op_iterator OI, User::op_iterator OE,
+              Loop *L, ScalarEvolution &SE) {
+  for(; OI != OE; ++OI) {
+    if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
+      if (!SE.isSCEVable(Oper->getType()))
+        continue;
+
+      if (const SCEVAddRecExpr *AR =
+          dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
+        if (AR->getLoop() == L)
+          break;
+      }
+    }
+  }
+  return OI;
+}
+
+/// getWideOperand - IVChain logic must consistenctly peek base TruncInst
+/// operands, so wrap it in a convenient helper.
+static Value *getWideOperand(Value *Oper) {
+  if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
+    return Trunc->getOperand(0);
+  return Oper;
+}
+
+/// isCompatibleIVType - Return true if we allow an IV chain to include both
+/// types.
+static bool isCompatibleIVType(Value *LVal, Value *RVal) {
+  Type *LType = LVal->getType();
+  Type *RType = RVal->getType();
+  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
+}
+
+/// getExprBase - Return an approximation of this SCEV expression's "base", or
+/// NULL for any constant. Returning the expression itself is
+/// conservative. Returning a deeper subexpression is more precise and valid as
+/// long as it isn't less complex than another subexpression. For expressions
+/// involving multiple unscaled values, we need to return the pointer-type
+/// SCEVUnknown. This avoids forming chains across objects, such as:
+/// PrevOper==a[i], IVOper==b[i], IVInc==b-a.
+///
+/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
+/// SCEVUnknown, we simply return the rightmost SCEV operand.
+static const SCEV *getExprBase(const SCEV *S) {
+  switch (S->getSCEVType()) {
+  default: // uncluding scUnknown.
+    return S;
+  case scConstant:
+    return 0;
+  case scTruncate:
+    return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
+  case scZeroExtend:
+    return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
+  case scSignExtend:
+    return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
+  case scAddExpr: {
+    // Skip over scaled operands (scMulExpr) to follow add operands as long as
+    // there's nothing more complex.
+    // FIXME: not sure if we want to recognize negation.
+    const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
+    for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
+           E(Add->op_begin()); I != E; ++I) {
+      const SCEV *SubExpr = *I;
+      if (SubExpr->getSCEVType() == scAddExpr)
+        return getExprBase(SubExpr);
+
+      if (SubExpr->getSCEVType() != scMulExpr)
+        return SubExpr;
+    }
+    return S; // all operands are scaled, be conservative.
+  }
+  case scAddRecExpr:
+    return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
+  }
+}
+
+/// Return true if the chain increment is profitable to expand into a loop
+/// invariant value, which may require its own register. A profitable chain
+/// increment will be an offset relative to the same base. We allow such offsets
+/// to potentially be used as chain increment as long as it's not obviously
+/// expensive to expand using real instructions.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+                                    const SCEV *IncExpr,
+                                    ScalarEvolution &SE) {
+  // Aggressively form chains when -stress-ivchain.
+  if (StressIVChain)
+    return true;
+
+  // Do not replace a constant offset from IV head with a nonconstant IV
+  // increment.
+  if (!isa<SCEVConstant>(IncExpr)) {
+    const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
+    if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
+      return 0;
+  }
+
+  SmallPtrSet<const SCEV*, 8> Processed;
+  return !isHighCostExpansion(IncExpr, Processed, SE);
+}
+
+/// Return true if the number of registers needed for the chain is estimated to
+/// be less than the number required for the individual IV users. First prohibit
+/// any IV users that keep the IV live across increments (the Users set should
+/// be empty). Next count the number and type of increments in the chain.
+///
+/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
+/// effectively use postinc addressing modes. Only consider it profitable it the
+/// increments can be computed in fewer registers when chained.
+///
+/// TODO: Consider IVInc free if it's already used in another chains.
+static bool
+isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users,
+                  ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+  if (StressIVChain)
+    return true;
+
+  if (!Chain.hasIncs())
+    return false;
+
+  if (!Users.empty()) {
+    DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+          for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(),
+                 E = Users.end(); I != E; ++I) {
+            dbgs() << "  " << **I << "\n";
+          });
+    return false;
+  }
+  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+
+  // The chain itself may require a register, so intialize cost to 1.
+  int cost = 1;
+
+  // A complete chain likely eliminates the need for keeping the original IV in
+  // a register. LSR does not currently know how to form a complete chain unless
+  // the header phi already exists.
+  if (isa<PHINode>(Chain.tailUserInst())
+      && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
+    --cost;
+  }
+  const SCEV *LastIncExpr = 0;
+  unsigned NumConstIncrements = 0;
+  unsigned NumVarIncrements = 0;
+  unsigned NumReusedIncrements = 0;
+  for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
+       I != E; ++I) {
+
+    if (I->IncExpr->isZero())
+      continue;
+
+    // Incrementing by zero or some constant is neutral. We assume constants can
+    // be folded into an addressing mode or an add's immediate operand.
+    if (isa<SCEVConstant>(I->IncExpr)) {
+      ++NumConstIncrements;
+      continue;
+    }
+
+    if (I->IncExpr == LastIncExpr)
+      ++NumReusedIncrements;
+    else
+      ++NumVarIncrements;
+
+    LastIncExpr = I->IncExpr;
+  }
+  // An IV chain with a single increment is handled by LSR's postinc
+  // uses. However, a chain with multiple increments requires keeping the IV's
+  // value live longer than it needs to be if chained.
+  if (NumConstIncrements > 1)
+    --cost;
+
+  // Materializing increment expressions in the preheader that didn't exist in
+  // the original code may cost a register. For example, sign-extended array
+  // indices can produce ridiculous increments like this:
+  // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
+  cost += NumVarIncrements;
+
+  // Reusing variable increments likely saves a register to hold the multiple of
+  // the stride.
+  cost -= NumReusedIncrements;
+
+  DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+               << "\n");
+
+  return cost < 0;
+}
+
+/// ChainInstruction - Add this IV user to an existing chain or make it the head
+/// of a new chain.
+void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+                                   SmallVectorImpl<ChainUsers> &ChainUsersVec) {
+  // When IVs are used as types of varying widths, they are generally converted
+  // to a wider type with some uses remaining narrow under a (free) trunc.
+  Value *const NextIV = getWideOperand(IVOper);
+  const SCEV *const OperExpr = SE.getSCEV(NextIV);
+  const SCEV *const OperExprBase = getExprBase(OperExpr);
+
+  // Visit all existing chains. Check if its IVOper can be computed as a
+  // profitable loop invariant increment from the last link in the Chain.
+  unsigned ChainIdx = 0, NChains = IVChainVec.size();
+  const SCEV *LastIncExpr = 0;
+  for (; ChainIdx < NChains; ++ChainIdx) {
+    IVChain &Chain = IVChainVec[ChainIdx];
+
+    // Prune the solution space aggressively by checking that both IV operands
+    // are expressions that operate on the same unscaled SCEVUnknown. This
+    // "base" will be canceled by the subsequent getMinusSCEV call. Checking
+    // first avoids creating extra SCEV expressions.
+    if (!StressIVChain && Chain.ExprBase != OperExprBase)
+      continue;
+
+    Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
+    if (!isCompatibleIVType(PrevIV, NextIV))
+      continue;
+
+    // A phi node terminates a chain.
+    if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
+      continue;
+
+    // The increment must be loop-invariant so it can be kept in a register.
+    const SCEV *PrevExpr = SE.getSCEV(PrevIV);
+    const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
+    if (!SE.isLoopInvariant(IncExpr, L))
+      continue;
+
+    if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
+      LastIncExpr = IncExpr;
+      break;
+    }
+  }
+  // If we haven't found a chain, create a new one, unless we hit the max. Don't
+  // bother for phi nodes, because they must be last in the chain.
+  if (ChainIdx == NChains) {
+    if (isa<PHINode>(UserInst))
+      return;
+    if (NChains >= MaxChains && !StressIVChain) {
+      DEBUG(dbgs() << "IV Chain Limit\n");
+      return;
+    }
+    LastIncExpr = OperExpr;
+    // IVUsers may have skipped over sign/zero extensions. We don't currently
+    // attempt to form chains involving extensions unless they can be hoisted
+    // into this loop's AddRec.
+    if (!isa<SCEVAddRecExpr>(LastIncExpr))
+      return;
+    ++NChains;
+    IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
+                                 OperExprBase));
+    ChainUsersVec.resize(NChains);
+    DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+                 << ") IV=" << *LastIncExpr << "\n");
+  } else {
+    DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
+                 << ") IV+" << *LastIncExpr << "\n");
+    // Add this IV user to the end of the chain.
+    IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
+  }
+  IVChain &Chain = IVChainVec[ChainIdx];
+
+  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
+  // This chain's NearUsers become FarUsers.
+  if (!LastIncExpr->isZero()) {
+    ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
+                                            NearUsers.end());
+    NearUsers.clear();
+  }
+
+  // All other uses of IVOperand become near uses of the chain.
+  // We currently ignore intermediate values within SCEV expressions, assuming
+  // they will eventually be used be the current chain, or can be computed
+  // from one of the chain increments. To be more precise we could
+  // transitively follow its user and only add leaf IV users to the set.
+  for (Value::use_iterator UseIter = IVOper->use_begin(),
+         UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) {
+    Instruction *OtherUse = dyn_cast<Instruction>(*UseIter);
+    if (!OtherUse)
+      continue;
+    // Uses in the chain will no longer be uses if the chain is formed.
+    // Include the head of the chain in this iteration (not Chain.begin()).
+    IVChain::const_iterator IncIter = Chain.Incs.begin();
+    IVChain::const_iterator IncEnd = Chain.Incs.end();
+    for( ; IncIter != IncEnd; ++IncIter) {
+      if (IncIter->UserInst == OtherUse)
+        break;
+    }
+    if (IncIter != IncEnd)
+      continue;
+
+    if (SE.isSCEVable(OtherUse->getType())
+        && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
+        && IU.isIVUserOrOperand(OtherUse)) {
+      continue;
+    }
+    NearUsers.insert(OtherUse);
+  }
+
+  // Since this user is part of the chain, it's no longer considered a use
+  // of the chain.
+  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
+}
+
+/// CollectChains - Populate the vector of Chains.
+///
+/// This decreases ILP at the architecture level. Targets with ample registers,
+/// multiple memory ports, and no register renaming probably don't want
+/// this. However, such targets should probably disable LSR altogether.
+///
+/// The job of LSR is to make a reasonable choice of induction variables across
+/// the loop. Subsequent passes can easily "unchain" computation exposing more
+/// ILP *within the loop* if the target wants it.
+///
+/// Finding the best IV chain is potentially a scheduling problem. Since LSR
+/// will not reorder memory operations, it will recognize this as a chain, but
+/// will generate redundant IV increments. Ideally this would be corrected later
+/// by a smart scheduler:
+///        = A[i]
+///        = A[i+x]
+/// A[i]   =
+/// A[i+x] =
+///
+/// TODO: Walk the entire domtree within this loop, not just the path to the
+/// loop latch. This will discover chains on side paths, but requires
+/// maintaining multiple copies of the Chains state.
+void LSRInstance::CollectChains() {
+  DEBUG(dbgs() << "Collecting IV Chains.\n");
+  SmallVector<ChainUsers, 8> ChainUsersVec;
+
+  SmallVector<BasicBlock *,8> LatchPath;
+  BasicBlock *LoopHeader = L->getHeader();
+  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
+       Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
+    LatchPath.push_back(Rung->getBlock());
+  }
+  LatchPath.push_back(LoopHeader);
+
+  // Walk the instruction stream from the loop header to the loop latch.
+  for (SmallVectorImpl<BasicBlock *>::reverse_iterator
+         BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
+       BBIter != BBEnd; ++BBIter) {
+    for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
+         I != E; ++I) {
+      // Skip instructions that weren't seen by IVUsers analysis.
+      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I))
+        continue;
+
+      // Ignore users that are part of a SCEV expression. This way we only
+      // consider leaf IV Users. This effectively rediscovers a portion of
+      // IVUsers analysis but in program order this time.
+      if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I)))
+        continue;
+
+      // Remove this instruction from any NearUsers set it may be in.
+      for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
+           ChainIdx < NChains; ++ChainIdx) {
+        ChainUsersVec[ChainIdx].NearUsers.erase(I);
+      }
+      // Search for operands that can be chained.
+      SmallPtrSet<Instruction*, 4> UniqueOperands;
+      User::op_iterator IVOpEnd = I->op_end();
+      User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
+      while (IVOpIter != IVOpEnd) {
+        Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
+        if (UniqueOperands.insert(IVOpInst))
+          ChainInstruction(I, IVOpInst, ChainUsersVec);
+        IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE);
+      }
+    } // Continue walking down the instructions.
+  } // Continue walking down the domtree.
+  // Visit phi backedges to determine if the chain can generate the IV postinc.
+  for (BasicBlock::iterator I = L->getHeader()->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    if (!SE.isSCEVable(PN->getType()))
+      continue;
+
+    Instruction *IncV =
+      dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
+    if (IncV)
+      ChainInstruction(PN, IncV, ChainUsersVec);
+  }
+  // Remove any unprofitable chains.
+  unsigned ChainIdx = 0;
+  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
+       UsersIdx < NChains; ++UsersIdx) {
+    if (!isProfitableChain(IVChainVec[UsersIdx],
+                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+      continue;
+    // Preserve the chain at UsesIdx.
+    if (ChainIdx != UsersIdx)
+      IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
+    FinalizeChain(IVChainVec[ChainIdx]);
+    ++ChainIdx;
+  }
+  IVChainVec.resize(ChainIdx);
+}
+
+void LSRInstance::FinalizeChain(IVChain &Chain) {
+  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+  DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+
+  for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
+       I != E; ++I) {
+    DEBUG(dbgs() << "        Inc: " << *I->UserInst << "\n");
+    User::op_iterator UseI =
+      std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand);
+    assert(UseI != I->UserInst->op_end() && "cannot find IV operand");
+    IVIncSet.insert(UseI);
+  }
+}
+
+/// Return true if the IVInc can be folded into an addressing mode.
+static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
+                             Value *Operand, const TargetTransformInfo &TTI) {
+  const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
+  if (!IncConst || !isAddressUse(UserInst, Operand))
+    return false;
+
+  if (IncConst->getValue()->getValue().getMinSignedBits() > 64)
+    return false;
+
+  int64_t IncOffset = IncConst->getValue()->getSExtValue();
+  if (!isAlwaysFoldable(TTI, LSRUse::Address,
+                        getAccessType(UserInst), /*BaseGV=*/ 0,
+                        IncOffset, /*HaseBaseReg=*/ false))
+    return false;
+
+  return true;
+}
+
+/// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to
+/// materialize the IV user's operand from the previous IV user's operand.
+void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+                                  SmallVectorImpl<WeakVH> &DeadInsts) {
+  // Find the new IVOperand for the head of the chain. It may have been replaced
+  // by LSR.
+  const IVInc &Head = Chain.Incs[0];
+  User::op_iterator IVOpEnd = Head.UserInst->op_end();
+  // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
+  User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
+                                             IVOpEnd, L, SE);
+  Value *IVSrc = 0;
+  while (IVOpIter != IVOpEnd) {
+    IVSrc = getWideOperand(*IVOpIter);
+
+    // If this operand computes the expression that the chain needs, we may use
+    // it. (Check this after setting IVSrc which is used below.)
+    //
+    // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
+    // narrow for the chain, so we can no longer use it. We do allow using a
+    // wider phi, assuming the LSR checked for free truncation. In that case we
+    // should already have a truncate on this operand such that
+    // getSCEV(IVSrc) == IncExpr.
+    if (SE.getSCEV(*IVOpIter) == Head.IncExpr
+        || SE.getSCEV(IVSrc) == Head.IncExpr) {
+      break;
+    }
+    IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE);
+  }
+  if (IVOpIter == IVOpEnd) {
+    // Gracefully give up on this chain.
+    DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+    return;
+  }
+
+  DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+  Type *IVTy = IVSrc->getType();
+  Type *IntTy = SE.getEffectiveSCEVType(IVTy);
+  const SCEV *LeftOverExpr = 0;
+  for (IVChain::const_iterator IncI = Chain.begin(),
+         IncE = Chain.end(); IncI != IncE; ++IncI) {
+
+    Instruction *InsertPt = IncI->UserInst;
+    if (isa<PHINode>(InsertPt))
+      InsertPt = L->getLoopLatch()->getTerminator();
+
+    // IVOper will replace the current IV User's operand. IVSrc is the IV
+    // value currently held in a register.
+    Value *IVOper = IVSrc;
+    if (!IncI->IncExpr->isZero()) {
+      // IncExpr was the result of subtraction of two narrow values, so must
+      // be signed.
+      const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy);
+      LeftOverExpr = LeftOverExpr ?
+        SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
+    }
+    if (LeftOverExpr && !LeftOverExpr->isZero()) {
+      // Expand the IV increment.
+      Rewriter.clearPostInc();
+      Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
+      const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
+                                             SE.getUnknown(IncV));
+      IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
+
+      // If an IV increment can't be folded, use it as the next IV value.
+      if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand,
+                            TTI)) {
+        assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
+        IVSrc = IVOper;
+        LeftOverExpr = 0;
+      }
+    }
+    Type *OperTy = IncI->IVOperand->getType();
+    if (IVTy != OperTy) {
+      assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
+             "cannot extend a chained IV");
+      IRBuilder<> Builder(InsertPt);
+      IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
+    }
+    IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper);
+    DeadInsts.push_back(IncI->IVOperand);
+  }
+  // If LSR created a new, wider phi, we may also replace its postinc. We only
+  // do this if we also found a wide value for the head of the chain.
+  if (isa<PHINode>(Chain.tailUserInst())) {
+    for (BasicBlock::iterator I = L->getHeader()->begin();
+         PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
+      if (!isCompatibleIVType(Phi, IVSrc))
+        continue;
+      Instruction *PostIncV = dyn_cast<Instruction>(
+        Phi->getIncomingValueForBlock(L->getLoopLatch()));
+      if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
+        continue;
+      Value *IVOper = IVSrc;
+      Type *PostIncTy = PostIncV->getType();
+      if (IVTy != PostIncTy) {
+        assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
+        IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
+        Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
+        IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
+      }
+      Phi->replaceUsesOfWith(PostIncV, IVOper);
+      DeadInsts.push_back(PostIncV);
+    }
+  }
+}
+
+void LSRInstance::CollectFixupsAndInitialFormulae() {
+  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
+    Instruction *UserInst = UI->getUser();
+    // Skip IV users that are part of profitable IV Chains.
+    User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
+                                       UI->getOperandValToReplace());
+    assert(UseI != UserInst->op_end() && "cannot find IV operand");
+    if (IVIncSet.count(UseI))
+      continue;
+
+    // Record the uses.
+    LSRFixup &LF = getNewFixup();
+    LF.UserInst = UserInst;
+    LF.OperandValToReplace = UI->getOperandValToReplace();
+    LF.PostIncLoops = UI->getPostIncLoops();
+
+    LSRUse::KindType Kind = LSRUse::Basic;
+    Type *AccessTy = 0;
+    if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
+      Kind = LSRUse::Address;
+      AccessTy = getAccessType(LF.UserInst);
+    }
+
+    const SCEV *S = IU.getExpr(*UI);
+
+    // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
+    // (N - i == 0), and this allows (N - i) to be the expression that we work
+    // with rather than just N or i, so we can consider the register
+    // requirements for both N and i at the same time. Limiting this code to
+    // equality icmps is not a problem because all interesting loops use
+    // equality icmps, thanks to IndVarSimplify.
+    if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
+      if (CI->isEquality()) {
+        // Swap the operands if needed to put the OperandValToReplace on the
+        // left, for consistency.
+        Value *NV = CI->getOperand(1);
+        if (NV == LF.OperandValToReplace) {
+          CI->setOperand(1, CI->getOperand(0));
+          CI->setOperand(0, NV);
+          NV = CI->getOperand(1);
+          Changed = true;
+        }
+
+        // x == y  -->  x - y == 0
+        const SCEV *N = SE.getSCEV(NV);
+        if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) {
+          // S is normalized, so normalize N before folding it into S
+          // to keep the result normalized.
+          N = TransformForPostIncUse(Normalize, N, CI, 0,
+                                     LF.PostIncLoops, SE, DT);
+          Kind = LSRUse::ICmpZero;
+          S = SE.getMinusSCEV(N, S);
+        }
+
+        // -1 and the negations of all interesting strides (except the negation
+        // of -1) are now also interesting.
+        for (size_t i = 0, e = Factors.size(); i != e; ++i)
+          if (Factors[i] != -1)
+            Factors.insert(-(uint64_t)Factors[i]);
+        Factors.insert(-1);
+      }
+
+    // Set up the initial formula for this use.
+    std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
+    LF.LUIdx = P.first;
+    LF.Offset = P.second;
+    LSRUse &LU = Uses[LF.LUIdx];
+    LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+    if (!LU.WidestFixupType ||
+        SE.getTypeSizeInBits(LU.WidestFixupType) <
+        SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+      LU.WidestFixupType = LF.OperandValToReplace->getType();
+
+    // If this is the first use of this LSRUse, give it a formula.
+    if (LU.Formulae.empty()) {
+      InsertInitialFormula(S, LU, LF.LUIdx);
+      CountRegisters(LU.Formulae.back(), LF.LUIdx);
+    }
+  }
+
+  DEBUG(print_fixups(dbgs()));
+}
+
+/// InsertInitialFormula - Insert a formula for the given expression into
+/// the given use, separating out loop-variant portions from loop-invariant
+/// and loop-computable portions.
+void
+LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+  Formula F;
+  F.InitialMatch(S, L, SE);
+  bool Inserted = InsertFormula(LU, LUIdx, F);
+  assert(Inserted && "Initial formula already exists!"); (void)Inserted;
+}
+
+/// InsertSupplementalFormula - Insert a simple single-register formula for
+/// the given expression into the given use.
+void
+LSRInstance::InsertSupplementalFormula(const SCEV *S,
+                                       LSRUse &LU, size_t LUIdx) {
+  Formula F;
+  F.BaseRegs.push_back(S);
+  F.HasBaseReg = true;
+  bool Inserted = InsertFormula(LU, LUIdx, F);
+  assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
+}
+
+/// CountRegisters - Note which registers are used by the given formula,
+/// updating RegUses.
+void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
+  if (F.ScaledReg)
+    RegUses.CountRegister(F.ScaledReg, LUIdx);
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
+       E = F.BaseRegs.end(); I != E; ++I)
+    RegUses.CountRegister(*I, LUIdx);
+}
+
+/// InsertFormula - If the given formula has not yet been inserted, add it to
+/// the list, and return true. Return false otherwise.
+bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
+  if (!LU.InsertFormula(F))
+    return false;
+
+  CountRegisters(F, LUIdx);
+  return true;
+}
+
+/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
+/// loop-invariant values which we're tracking. These other uses will pin these
+/// values in registers, making them less profitable for elimination.
+/// TODO: This currently misses non-constant addrec step registers.
+/// TODO: Should this give more weight to users inside the loop?
+void
+LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
+  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
+  SmallPtrSet<const SCEV *, 8> Inserted;
+
+  while (!Worklist.empty()) {
+    const SCEV *S = Worklist.pop_back_val();
+
+    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
+      Worklist.append(N->op_begin(), N->op_end());
+    else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
+      Worklist.push_back(C->getOperand());
+    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+      Worklist.push_back(D->getLHS());
+      Worklist.push_back(D->getRHS());
+    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+      if (!Inserted.insert(U)) continue;
+      const Value *V = U->getValue();
+      if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+        // Look for instructions defined outside the loop.
+        if (L->contains(Inst)) continue;
+      } else if (isa<UndefValue>(V))
+        // Undef doesn't have a live range, so it doesn't matter.
+        continue;
+      for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
+           UI != UE; ++UI) {
+        const Instruction *UserInst = dyn_cast<Instruction>(*UI);
+        // Ignore non-instructions.
+        if (!UserInst)
+          continue;
+        // Ignore instructions in other functions (as can happen with
+        // Constants).
+        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
+          continue;
+        // Ignore instructions not dominated by the loop.
+        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
+          UserInst->getParent() :
+          cast<PHINode>(UserInst)->getIncomingBlock(
+            PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
+        if (!DT.dominates(L->getHeader(), UseBB))
+          continue;
+        // Ignore uses which are part of other SCEV expressions, to avoid
+        // analyzing them multiple times.
+        if (SE.isSCEVable(UserInst->getType())) {
+          const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
+          // If the user is a no-op, look through to its uses.
+          if (!isa<SCEVUnknown>(UserS))
+            continue;
+          if (UserS == U) {
+            Worklist.push_back(
+              SE.getUnknown(const_cast<Instruction *>(UserInst)));
+            continue;
+          }
+        }
+        // Ignore icmp instructions which are already being analyzed.
+        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
+          unsigned OtherIdx = !UI.getOperandNo();
+          Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
+          if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
+            continue;
+        }
+
+        LSRFixup &LF = getNewFixup();
+        LF.UserInst = const_cast<Instruction *>(UserInst);
+        LF.OperandValToReplace = UI.getUse();
+        std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
+        LF.LUIdx = P.first;
+        LF.Offset = P.second;
+        LSRUse &LU = Uses[LF.LUIdx];
+        LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+        if (!LU.WidestFixupType ||
+            SE.getTypeSizeInBits(LU.WidestFixupType) <
+            SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+          LU.WidestFixupType = LF.OperandValToReplace->getType();
+        InsertSupplementalFormula(U, LU, LF.LUIdx);
+        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
+        break;
+      }
+    }
+  }
+}
+
+/// CollectSubexprs - Split S into subexpressions which can be pulled out into
+/// separate registers. If C is non-null, multiply each subexpression by C.
+///
+/// Return remainder expression after factoring the subexpressions captured by
+/// Ops. If Ops is complete, return NULL.
+static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
+                                   SmallVectorImpl<const SCEV *> &Ops,
+                                   const Loop *L,
+                                   ScalarEvolution &SE,
+                                   unsigned Depth = 0) {
+  // Arbitrarily cap recursion to protect compile time.
+  if (Depth >= 3)
+    return S;
+
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    // Break out add operands.
+    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+         I != E; ++I) {
+      const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1);
+      if (Remainder)
+        Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+    }
+    return NULL;
+  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    // Split a non-zero base out of an addrec.
+    if (AR->getStart()->isZero())
+      return S;
+
+    const SCEV *Remainder = CollectSubexprs(AR->getStart(),
+                                            C, Ops, L, SE, Depth+1);
+    // Split the non-zero AddRec unless it is part of a nested recurrence that
+    // does not pertain to this loop.
+    if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
+      Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+      Remainder = NULL;
+    }
+    if (Remainder != AR->getStart()) {
+      if (!Remainder)
+        Remainder = SE.getConstant(AR->getType(), 0);
+      return SE.getAddRecExpr(Remainder,
+                              AR->getStepRecurrence(SE),
+                              AR->getLoop(),
+                              //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+                              SCEV::FlagAnyWrap);
+    }
+  } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+    // Break (C * (a + b + c)) into C*a + C*b + C*c.
+    if (Mul->getNumOperands() != 2)
+      return S;
+    if (const SCEVConstant *Op0 =
+        dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
+      C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
+      const SCEV *Remainder =
+        CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
+      if (Remainder)
+        Ops.push_back(SE.getMulExpr(C, Remainder));
+      return NULL;
+    }
+  }
+  return S;
+}
+
+/// GenerateReassociations - Split out subexpressions from adds and the bases of
+/// addrecs.
+void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
+                                         Formula Base,
+                                         unsigned Depth) {
+  // Arbitrarily cap recursion to protect compile time.
+  if (Depth >= 3) return;
+
+  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+    const SCEV *BaseReg = Base.BaseRegs[i];
+
+    SmallVector<const SCEV *, 8> AddOps;
+    const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE);
+    if (Remainder)
+      AddOps.push_back(Remainder);
+
+    if (AddOps.size() == 1) continue;
+
+    for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
+         JE = AddOps.end(); J != JE; ++J) {
+
+      // Loop-variant "unknown" values are uninteresting; we won't be able to
+      // do anything meaningful with them.
+      if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
+        continue;
+
+      // Don't pull a constant into a register if the constant could be folded
+      // into an immediate field.
+      if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+                           LU.AccessTy, *J, Base.getNumRegs() > 1))
+        continue;
+
+      // Collect all operands except *J.
+      SmallVector<const SCEV *, 8> InnerAddOps
+        (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+      InnerAddOps.append
+        (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end());
+
+      // Don't leave just a constant behind in a register if the constant could
+      // be folded into an immediate field.
+      if (InnerAddOps.size() == 1 &&
+          isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+                           LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
+        continue;
+
+      const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
+      if (InnerSum->isZero())
+        continue;
+      Formula F = Base;
+
+      // Add the remaining pieces of the add back into the new formula.
+      const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+      if (InnerSumSC &&
+          SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+          TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                  InnerSumSC->getValue()->getZExtValue())) {
+        F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+                           InnerSumSC->getValue()->getZExtValue();
+        F.BaseRegs.erase(F.BaseRegs.begin() + i);
+      } else
+        F.BaseRegs[i] = InnerSum;
+
+      // Add J as its own register, or an unfolded immediate.
+      const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+      if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+          TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                  SC->getValue()->getZExtValue()))
+        F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+                           SC->getValue()->getZExtValue();
+      else
+        F.BaseRegs.push_back(*J);
+
+      if (InsertFormula(LU, LUIdx, F))
+        // If that formula hadn't been seen before, recurse to find more like
+        // it.
+        GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
+    }
+  }
+}
+
+/// GenerateCombinations - Generate a formula consisting of all of the
+/// loop-dominating registers added into a single register.
+void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
+                                       Formula Base) {
+  // This method is only interesting on a plurality of registers.
+  if (Base.BaseRegs.size() <= 1) return;
+
+  Formula F = Base;
+  F.BaseRegs.clear();
+  SmallVector<const SCEV *, 4> Ops;
+  for (SmallVectorImpl<const SCEV *>::const_iterator
+       I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
+    const SCEV *BaseReg = *I;
+    if (SE.properlyDominates(BaseReg, L->getHeader()) &&
+        !SE.hasComputableLoopEvolution(BaseReg, L))
+      Ops.push_back(BaseReg);
+    else
+      F.BaseRegs.push_back(BaseReg);
+  }
+  if (Ops.size() > 1) {
+    const SCEV *Sum = SE.getAddExpr(Ops);
+    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
+    // opportunity to fold something. For now, just ignore such cases
+    // rather than proceed with zero in a register.
+    if (!Sum->isZero()) {
+      F.BaseRegs.push_back(Sum);
+      (void)InsertFormula(LU, LUIdx, F);
+    }
+  }
+}
+
+/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
+void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
+                                          Formula Base) {
+  // We can't add a symbolic offset if the address already contains one.
+  if (Base.BaseGV) return;
+
+  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+    const SCEV *G = Base.BaseRegs[i];
+    GlobalValue *GV = ExtractSymbol(G, SE);
+    if (G->isZero() || !GV)
+      continue;
+    Formula F = Base;
+    F.BaseGV = GV;
+    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+      continue;
+    F.BaseRegs[i] = G;
+    (void)InsertFormula(LU, LUIdx, F);
+  }
+}
+
+/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
+void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
+                                          Formula Base) {
+  // TODO: For now, just add the min and max offset, because it usually isn't
+  // worthwhile looking at everything inbetween.
+  SmallVector<int64_t, 2> Worklist;
+  Worklist.push_back(LU.MinOffset);
+  if (LU.MaxOffset != LU.MinOffset)
+    Worklist.push_back(LU.MaxOffset);
+
+  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+    const SCEV *G = Base.BaseRegs[i];
+
+    for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
+         E = Worklist.end(); I != E; ++I) {
+      Formula F = Base;
+      F.BaseOffset = (uint64_t)Base.BaseOffset - *I;
+      if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind,
+                     LU.AccessTy, F)) {
+        // Add the offset to the base register.
+        const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
+        // If it cancelled out, drop the base register, otherwise update it.
+        if (NewG->isZero()) {
+          std::swap(F.BaseRegs[i], F.BaseRegs.back());
+          F.BaseRegs.pop_back();
+        } else
+          F.BaseRegs[i] = NewG;
+
+        (void)InsertFormula(LU, LUIdx, F);
+      }
+    }
+
+    int64_t Imm = ExtractImmediate(G, SE);
+    if (G->isZero() || Imm == 0)
+      continue;
+    Formula F = Base;
+    F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+      continue;
+    F.BaseRegs[i] = G;
+    (void)InsertFormula(LU, LUIdx, F);
+  }
+}
+
+/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
+/// the comparison. For example, x == y -> x*c == y*c.
+void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
+                                         Formula Base) {
+  if (LU.Kind != LSRUse::ICmpZero) return;
+
+  // Determine the integer type for the base formula.
+  Type *IntTy = Base.getType();
+  if (!IntTy) return;
+  if (SE.getTypeSizeInBits(IntTy) > 64) return;
+
+  // Don't do this if there is more than one offset.
+  if (LU.MinOffset != LU.MaxOffset) return;
+
+  assert(!Base.BaseGV && "ICmpZero use is not legal!");
+
+  // Check each interesting stride.
+  for (SmallSetVector<int64_t, 8>::const_iterator
+       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
+    int64_t Factor = *I;
+
+    // Check that the multiplication doesn't overflow.
+    if (Base.BaseOffset == INT64_MIN && Factor == -1)
+      continue;
+    int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
+    if (NewBaseOffset / Factor != Base.BaseOffset)
+      continue;
+
+    // Check that multiplying with the use offset doesn't overflow.
+    int64_t Offset = LU.MinOffset;
+    if (Offset == INT64_MIN && Factor == -1)
+      continue;
+    Offset = (uint64_t)Offset * Factor;
+    if (Offset / Factor != LU.MinOffset)
+      continue;
+
+    Formula F = Base;
+    F.BaseOffset = NewBaseOffset;
+
+    // Check that this scale is legal.
+    if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
+      continue;
+
+    // Compensate for the use having MinOffset built into it.
+    F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
+
+    const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+
+    // Check that multiplying with each base register doesn't overflow.
+    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
+      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
+      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
+        goto next;
+    }
+
+    // Check that multiplying with the scaled register doesn't overflow.
+    if (F.ScaledReg) {
+      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
+      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
+        continue;
+    }
+
+    // Check that multiplying with the unfolded offset doesn't overflow.
+    if (F.UnfoldedOffset != 0) {
+      if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
+        continue;
+      F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+      if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+        continue;
+    }
+
+    // If we make it here and it's legal, add it.
+    (void)InsertFormula(LU, LUIdx, F);
+  next:;
+  }
+}
+
+/// GenerateScales - Generate stride factor reuse formulae by making use of
+/// scaled-offset address modes, for example.
+void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
+  // Determine the integer type for the base formula.
+  Type *IntTy = Base.getType();
+  if (!IntTy) return;
+
+  // If this Formula already has a scaled register, we can't add another one.
+  if (Base.Scale != 0) return;
+
+  // Check each interesting stride.
+  for (SmallSetVector<int64_t, 8>::const_iterator
+       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
+    int64_t Factor = *I;
+
+    Base.Scale = Factor;
+    Base.HasBaseReg = Base.BaseRegs.size() > 1;
+    // Check whether this scale is going to be legal.
+    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                    Base)) {
+      // As a special-case, handle special out-of-loop Basic users specially.
+      // TODO: Reconsider this special case.
+      if (LU.Kind == LSRUse::Basic &&
+          isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
+                     LU.AccessTy, Base) &&
+          LU.AllFixupsOutsideLoop)
+        LU.Kind = LSRUse::Special;
+      else
+        continue;
+    }
+    // For an ICmpZero, negating a solitary base register won't lead to
+    // new solutions.
+    if (LU.Kind == LSRUse::ICmpZero &&
+        !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
+      continue;
+    // For each addrec base reg, apply the scale, if possible.
+    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+      if (const SCEVAddRecExpr *AR =
+            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
+        const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+        if (FactorS->isZero())
+          continue;
+        // Divide out the factor, ignoring high bits, since we'll be
+        // scaling the value back up in the end.
+        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
+          // TODO: This could be optimized to avoid all the copying.
+          Formula F = Base;
+          F.ScaledReg = Quotient;
+          F.DeleteBaseReg(F.BaseRegs[i]);
+          (void)InsertFormula(LU, LUIdx, F);
+        }
+      }
+  }
+}
+
+/// GenerateTruncates - Generate reuse formulae from different IV types.
+void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
+  // Don't bother truncating symbolic values.
+  if (Base.BaseGV) return;
+
+  // Determine the integer type for the base formula.
+  Type *DstTy = Base.getType();
+  if (!DstTy) return;
+  DstTy = SE.getEffectiveSCEVType(DstTy);
+
+  for (SmallSetVector<Type *, 4>::const_iterator
+       I = Types.begin(), E = Types.end(); I != E; ++I) {
+    Type *SrcTy = *I;
+    if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
+      Formula F = Base;
+
+      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
+      for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
+           JE = F.BaseRegs.end(); J != JE; ++J)
+        *J = SE.getAnyExtendExpr(*J, SrcTy);
+
+      // TODO: This assumes we've done basic processing on all uses and
+      // have an idea what the register usage is.
+      if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
+        continue;
+
+      (void)InsertFormula(LU, LUIdx, F);
+    }
+  }
+}
+
+namespace {
+
+/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
+/// defer modifications so that the search phase doesn't have to worry about
+/// the data structures moving underneath it.
+struct WorkItem {
+  size_t LUIdx;
+  int64_t Imm;
+  const SCEV *OrigReg;
+
+  WorkItem(size_t LI, int64_t I, const SCEV *R)
+    : LUIdx(LI), Imm(I), OrigReg(R) {}
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
+
+void WorkItem::print(raw_ostream &OS) const {
+  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
+     << " , add offset " << Imm;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void WorkItem::dump() const {
+  print(errs()); errs() << '\n';
+}
+#endif
+
+/// GenerateCrossUseConstantOffsets - Look for registers which are a constant
+/// distance apart and try to form reuse opportunities between them.
+void LSRInstance::GenerateCrossUseConstantOffsets() {
+  // Group the registers by their value without any added constant offset.
+  typedef std::map<int64_t, const SCEV *> ImmMapTy;
+  typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
+  RegMapTy Map;
+  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
+  SmallVector<const SCEV *, 8> Sequence;
+  for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
+       I != E; ++I) {
+    const SCEV *Reg = *I;
+    int64_t Imm = ExtractImmediate(Reg, SE);
+    std::pair<RegMapTy::iterator, bool> Pair =
+      Map.insert(std::make_pair(Reg, ImmMapTy()));
+    if (Pair.second)
+      Sequence.push_back(Reg);
+    Pair.first->second.insert(std::make_pair(Imm, *I));
+    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
+  }
+
+  // Now examine each set of registers with the same base value. Build up
+  // a list of work to do and do the work in a separate step so that we're
+  // not adding formulae and register counts while we're searching.
+  SmallVector<WorkItem, 32> WorkItems;
+  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
+       E = Sequence.end(); I != E; ++I) {
+    const SCEV *Reg = *I;
+    const ImmMapTy &Imms = Map.find(Reg)->second;
+
+    // It's not worthwhile looking for reuse if there's only one offset.
+    if (Imms.size() == 1)
+      continue;
+
+    DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+          for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
+               J != JE; ++J)
+            dbgs() << ' ' << J->first;
+          dbgs() << '\n');
+
+    // Examine each offset.
+    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
+         J != JE; ++J) {
+      const SCEV *OrigReg = J->second;
+
+      int64_t JImm = J->first;
+      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
+
+      if (!isa<SCEVConstant>(OrigReg) &&
+          UsedByIndicesMap[Reg].count() == 1) {
+        DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
+        continue;
+      }
+
+      // Conservatively examine offsets between this orig reg a few selected
+      // other orig regs.
+      ImmMapTy::const_iterator OtherImms[] = {
+        Imms.begin(), prior(Imms.end()),
+        Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
+      };
+      for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
+        ImmMapTy::const_iterator M = OtherImms[i];
+        if (M == J || M == JE) continue;
+
+        // Compute the difference between the two.
+        int64_t Imm = (uint64_t)JImm - M->first;
+        for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
+             LUIdx = UsedByIndices.find_next(LUIdx))
+          // Make a memo of this use, offset, and register tuple.
+          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
+            WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
+      }
+    }
+  }
+
+  Map.clear();
+  Sequence.clear();
+  UsedByIndicesMap.clear();
+  UniqueItems.clear();
+
+  // Now iterate through the worklist and add new formulae.
+  for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
+       E = WorkItems.end(); I != E; ++I) {
+    const WorkItem &WI = *I;
+    size_t LUIdx = WI.LUIdx;
+    LSRUse &LU = Uses[LUIdx];
+    int64_t Imm = WI.Imm;
+    const SCEV *OrigReg = WI.OrigReg;
+
+    Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
+    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+    unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
+
+    // TODO: Use a more targeted data structure.
+    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
+      const Formula &F = LU.Formulae[L];
+      // Use the immediate in the scaled register.
+      if (F.ScaledReg == OrigReg) {
+        int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
+        // Don't create 50 + reg(-50).
+        if (F.referencesReg(SE.getSCEV(
+                   ConstantInt::get(IntTy, -(uint64_t)Offset))))
+          continue;
+        Formula NewF = F;
+        NewF.BaseOffset = Offset;
+        if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                        NewF))
+          continue;
+        NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
+
+        // If the new scale is a constant in a register, and adding the constant
+        // value to the immediate would produce a value closer to zero than the
+        // immediate itself, then the formula isn't worthwhile.
+        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
+          if (C->getValue()->isNegative() !=
+                (NewF.BaseOffset < 0) &&
+              (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale))
+                .ule(abs64(NewF.BaseOffset)))
+            continue;
+
+        // OK, looks good.
+        (void)InsertFormula(LU, LUIdx, NewF);
+      } else {
+        // Use the immediate in a base register.
+        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+          const SCEV *BaseReg = F.BaseRegs[N];
+          if (BaseReg != OrigReg)
+            continue;
+          Formula NewF = F;
+          NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+                          LU.Kind, LU.AccessTy, NewF)) {
+            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+              continue;
+            NewF = F;
+            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+          }
+          NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
+
+          // If the new formula has a constant in a register, and adding the
+          // constant value to the immediate would produce a value closer to
+          // zero than the immediate itself, then the formula isn't worthwhile.
+          for (SmallVectorImpl<const SCEV *>::const_iterator
+               J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
+               J != JE; ++J)
+            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
+              if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt(
+                   abs64(NewF.BaseOffset)) &&
+                  (C->getValue()->getValue() +
+                   NewF.BaseOffset).countTrailingZeros() >=
+                   CountTrailingZeros_64(NewF.BaseOffset))
+                goto skip_formula;
+
+          // Ok, looks good.
+          (void)InsertFormula(LU, LUIdx, NewF);
+          break;
+        skip_formula:;
+        }
+      }
+    }
+  }
+}
+
+/// GenerateAllReuseFormulae - Generate formulae for each use.
+void
+LSRInstance::GenerateAllReuseFormulae() {
+  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
+  // queries are more precise.
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
+  }
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateScales(LU, LUIdx, LU.Formulae[i]);
+  }
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+      GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
+  }
+
+  GenerateCrossUseConstantOffsets();
+
+  DEBUG(dbgs() << "\n"
+                  "After generating reuse formulae:\n";
+        print_uses(dbgs()));
+}
+
+/// If there are multiple formulae with the same set of registers used
+/// by other uses, pick the best one and delete the others.
+void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
+  DenseSet<const SCEV *> VisitedRegs;
+  SmallPtrSet<const SCEV *, 16> Regs;
+  SmallPtrSet<const SCEV *, 16> LoserRegs;
+#ifndef NDEBUG
+  bool ChangedFormulae = false;
+#endif
+
+  // Collect the best formula for each unique set of shared registers. This
+  // is reset for each use.
+  typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
+    BestFormulaeTy;
+  BestFormulaeTy BestFormulae;
+
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+    bool Any = false;
+    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
+         FIdx != NumForms; ++FIdx) {
+      Formula &F = LU.Formulae[FIdx];
+
+      // Some formulas are instant losers. For example, they may depend on
+      // nonexistent AddRecs from other loops. These need to be filtered
+      // immediately, otherwise heuristics could choose them over others leading
+      // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
+      // avoids the need to recompute this information across formulae using the
+      // same bad AddRec. Passing LoserRegs is also essential unless we remove
+      // the corresponding bad register from the Regs set.
+      Cost CostF;
+      Regs.clear();
+      CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT,
+                        &LoserRegs);
+      if (CostF.isLoser()) {
+        // During initial formula generation, undesirable formulae are generated
+        // by uses within other loops that have some non-trivial address mode or
+        // use the postinc form of the IV. LSR needs to provide these formulae
+        // as the basis of rediscovering the desired formula that uses an AddRec
+        // corresponding to the existing phi. Once all formulae have been
+        // generated, these initial losers may be pruned.
+        DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
+              dbgs() << "\n");
+      }
+      else {
+        SmallVector<const SCEV *, 4> Key;
+        for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
+               JE = F.BaseRegs.end(); J != JE; ++J) {
+          const SCEV *Reg = *J;
+          if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
+            Key.push_back(Reg);
+        }
+        if (F.ScaledReg &&
+            RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
+          Key.push_back(F.ScaledReg);
+        // Unstable sort by host order ok, because this is only used for
+        // uniquifying.
+        std::sort(Key.begin(), Key.end());
+
+        std::pair<BestFormulaeTy::const_iterator, bool> P =
+          BestFormulae.insert(std::make_pair(Key, FIdx));
+        if (P.second)
+          continue;
+
+        Formula &Best = LU.Formulae[P.first->second];
+
+        Cost CostBest;
+        Regs.clear();
+        CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
+        if (CostF < CostBest)
+          std::swap(F, Best);
+        DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
+              dbgs() << "\n"
+                        "    in favor of formula "; Best.print(dbgs());
+              dbgs() << '\n');
+      }
+#ifndef NDEBUG
+      ChangedFormulae = true;
+#endif
+      LU.DeleteFormula(F);
+      --FIdx;
+      --NumForms;
+      Any = true;
+    }
+
+    // Now that we've filtered out some formulae, recompute the Regs set.
+    if (Any)
+      LU.RecomputeRegs(LUIdx, RegUses);
+
+    // Reset this to prepare for the next use.
+    BestFormulae.clear();
+  }
+
+  DEBUG(if (ChangedFormulae) {
+          dbgs() << "\n"
+                    "After filtering out undesirable candidates:\n";
+          print_uses(dbgs());
+        });
+}
+
+// This is a rough guess that seems to work fairly well.
+static const size_t ComplexityLimit = UINT16_MAX;
+
+/// EstimateSearchSpaceComplexity - Estimate the worst-case number of
+/// solutions the solver might have to consider. It almost never considers
+/// this many solutions because it prune the search space, but the pruning
+/// isn't always sufficient.
+size_t LSRInstance::EstimateSearchSpaceComplexity() const {
+  size_t Power = 1;
+  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
+       E = Uses.end(); I != E; ++I) {
+    size_t FSize = I->Formulae.size();
+    if (FSize >= ComplexityLimit) {
+      Power = ComplexityLimit;
+      break;
+    }
+    Power *= FSize;
+    if (Power >= ComplexityLimit)
+      break;
+  }
+  return Power;
+}
+
+/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
+/// of the registers of another formula, it won't help reduce register
+/// pressure (though it may not necessarily hurt register pressure); remove
+/// it to simplify the system.
+void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
+  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+                    "which use a superset of registers used by other "
+                    "formulae.\n");
+
+    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+      LSRUse &LU = Uses[LUIdx];
+      bool Any = false;
+      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+        Formula &F = LU.Formulae[i];
+        // Look for a formula with a constant or GV in a register. If the use
+        // also has a formula with that same value in an immediate field,
+        // delete the one that uses a register.
+        for (SmallVectorImpl<const SCEV *>::const_iterator
+             I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
+          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
+            Formula NewF = F;
+            NewF.BaseOffset += C->getValue()->getSExtValue();
+            NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+                                (I - F.BaseRegs.begin()));
+            if (LU.HasFormulaWithSameRegs(NewF)) {
+              DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
+              LU.DeleteFormula(F);
+              --i;
+              --e;
+              Any = true;
+              break;
+            }
+          } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
+            if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
+              if (!F.BaseGV) {
+                Formula NewF = F;
+                NewF.BaseGV = GV;
+                NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+                                    (I - F.BaseRegs.begin()));
+                if (LU.HasFormulaWithSameRegs(NewF)) {
+                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
+                        dbgs() << '\n');
+                  LU.DeleteFormula(F);
+                  --i;
+                  --e;
+                  Any = true;
+                  break;
+                }
+              }
+          }
+        }
+      }
+      if (Any)
+        LU.RecomputeRegs(LUIdx, RegUses);
+    }
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
+/// for expressions like A, A+1, A+2, etc., allocate a single register for
+/// them.
+void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+
+  DEBUG(dbgs() << "The search space is too complex.\n"
+                  "Narrowing the search space by assuming that uses separated "
+                  "by a constant offset will use the same registers.\n");
+
+  // This is especially useful for unrolled loops.
+
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
+         E = LU.Formulae.end(); I != E; ++I) {
+      const Formula &F = *I;
+      if (F.BaseOffset == 0 || F.Scale != 0)
+        continue;
+
+      LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
+      if (!LUThatHas)
+        continue;
+
+      if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
+                              LU.Kind, LU.AccessTy))
+        continue;
+
+      DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+
+      LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+
+      // Update the relocs to reference the new use.
+      for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
+           E = Fixups.end(); I != E; ++I) {
+        LSRFixup &Fixup = *I;
+        if (Fixup.LUIdx == LUIdx) {
+          Fixup.LUIdx = LUThatHas - &Uses.front();
+          Fixup.Offset += F.BaseOffset;
+          // Add the new offset to LUThatHas' offset list.
+          if (LUThatHas->Offsets.back() != Fixup.Offset) {
+            LUThatHas->Offsets.push_back(Fixup.Offset);
+            if (Fixup.Offset > LUThatHas->MaxOffset)
+              LUThatHas->MaxOffset = Fixup.Offset;
+            if (Fixup.Offset < LUThatHas->MinOffset)
+              LUThatHas->MinOffset = Fixup.Offset;
+          }
+          DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+        }
+        if (Fixup.LUIdx == NumUses-1)
+          Fixup.LUIdx = LUIdx;
+      }
+
+      // Delete formulae from the new use which are no longer legal.
+      bool Any = false;
+      for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
+        Formula &F = LUThatHas->Formulae[i];
+        if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
+                        LUThatHas->Kind, LUThatHas->AccessTy, F)) {
+          DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
+                dbgs() << '\n');
+          LUThatHas->DeleteFormula(F);
+          --i;
+          --e;
+          Any = true;
+        }
+      }
+
+      if (Any)
+        LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+
+      // Delete the old use.
+      DeleteUse(LU, LUIdx);
+      --LUIdx;
+      --NumUses;
+      break;
+    }
+  }
+
+  DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
+/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
+/// we've done more filtering, as it may be able to find more formulae to
+/// eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+                    "undesirable dedicated registers.\n");
+
+    FilterOutUndesirableDedicatedRegisters();
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
+/// to be profitable, and then in any use which has any reference to that
+/// register, delete all formulae which do not reference that register.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
+  // With all other options exhausted, loop until the system is simple
+  // enough to handle.
+  SmallPtrSet<const SCEV *, 4> Taken;
+  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+    // Ok, we have too many of formulae on our hands to conveniently handle.
+    // Use a rough heuristic to thin out the list.
+    DEBUG(dbgs() << "The search space is too complex.\n");
+
+    // Pick the register which is used by the most LSRUses, which is likely
+    // to be a good reuse register candidate.
+    const SCEV *Best = 0;
+    unsigned BestNum = 0;
+    for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
+         I != E; ++I) {
+      const SCEV *Reg = *I;
+      if (Taken.count(Reg))
+        continue;
+      if (!Best)
+        Best = Reg;
+      else {
+        unsigned Count = RegUses.getUsedByIndices(Reg).count();
+        if (Count > BestNum) {
+          Best = Reg;
+          BestNum = Count;
+        }
+      }
+    }
+
+    DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+                 << " will yield profitable reuse.\n");
+    Taken.insert(Best);
+
+    // In any use with formulae which references this register, delete formulae
+    // which don't reference it.
+    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+      LSRUse &LU = Uses[LUIdx];
+      if (!LU.Regs.count(Best)) continue;
+
+      bool Any = false;
+      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+        Formula &F = LU.Formulae[i];
+        if (!F.referencesReg(Best)) {
+          DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
+          LU.DeleteFormula(F);
+          --e;
+          --i;
+          Any = true;
+          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
+          continue;
+        }
+      }
+
+      if (Any)
+        LU.RecomputeRegs(LUIdx, RegUses);
+    }
+
+    DEBUG(dbgs() << "After pre-selection:\n";
+          print_uses(dbgs()));
+  }
+}
+
+/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
+/// formulae to choose from, use some rough heuristics to prune down the number
+/// of formulae. This keeps the main solver from taking an extraordinary amount
+/// of time in some worst-case scenarios.
+void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+  NarrowSearchSpaceByDetectingSupersets();
+  NarrowSearchSpaceByCollapsingUnrolledCode();
+  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  NarrowSearchSpaceByPickingWinnerRegs();
+}
+
+/// SolveRecurse - This is the recursive solver.
+void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+                               Cost &SolutionCost,
+                               SmallVectorImpl<const Formula *> &Workspace,
+                               const Cost &CurCost,
+                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
+                               DenseSet<const SCEV *> &VisitedRegs) const {
+  // Some ideas:
+  //  - prune more:
+  //    - use more aggressive filtering
+  //    - sort the formula so that the most profitable solutions are found first
+  //    - sort the uses too
+  //  - search faster:
+  //    - don't compute a cost, and then compare. compare while computing a cost
+  //      and bail early.
+  //    - track register sets with SmallBitVector
+
+  const LSRUse &LU = Uses[Workspace.size()];
+
+  // If this use references any register that's already a part of the
+  // in-progress solution, consider it a requirement that a formula must
+  // reference that register in order to be considered. This prunes out
+  // unprofitable searching.
+  SmallSetVector<const SCEV *, 4> ReqRegs;
+  for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
+       E = CurRegs.end(); I != E; ++I)
+    if (LU.Regs.count(*I))
+      ReqRegs.insert(*I);
+
+  SmallPtrSet<const SCEV *, 16> NewRegs;
+  Cost NewCost;
+  for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
+       E = LU.Formulae.end(); I != E; ++I) {
+    const Formula &F = *I;
+
+    // Ignore formulae which do not use any of the required registers.
+    bool SatisfiedReqReg = true;
+    for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
+         JE = ReqRegs.end(); J != JE; ++J) {
+      const SCEV *Reg = *J;
+      if ((!F.ScaledReg || F.ScaledReg != Reg) &&
+          std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
+          F.BaseRegs.end()) {
+        SatisfiedReqReg = false;
+        break;
+      }
+    }
+    if (!SatisfiedReqReg) {
+      // If none of the formulae satisfied the required registers, then we could
+      // clear ReqRegs and try again. Currently, we simply give up in this case.
+      continue;
+    }
+
+    // Evaluate the cost of the current formula. If it's already worse than
+    // the current best, prune the search at that point.
+    NewCost = CurCost;
+    NewRegs = CurRegs;
+    NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
+    if (NewCost < SolutionCost) {
+      Workspace.push_back(&F);
+      if (Workspace.size() != Uses.size()) {
+        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
+                     NewRegs, VisitedRegs);
+        if (F.getNumRegs() == 1 && Workspace.size() == 1)
+          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
+      } else {
+        DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+              dbgs() << ".\n Regs:";
+              for (SmallPtrSet<const SCEV *, 16>::const_iterator
+                   I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
+                dbgs() << ' ' << **I;
+              dbgs() << '\n');
+
+        SolutionCost = NewCost;
+        Solution = Workspace;
+      }
+      Workspace.pop_back();
+    }
+  }
+}
+
+/// Solve - Choose one formula from each use. Return the results in the given
+/// Solution vector.
+void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
+  SmallVector<const Formula *, 8> Workspace;
+  Cost SolutionCost;
+  SolutionCost.Loose();
+  Cost CurCost;
+  SmallPtrSet<const SCEV *, 16> CurRegs;
+  DenseSet<const SCEV *> VisitedRegs;
+  Workspace.reserve(Uses.size());
+
+  // SolveRecurse does all the work.
+  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
+               CurRegs, VisitedRegs);
+  if (Solution.empty()) {
+    DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+    return;
+  }
+
+  // Ok, we've now made all our decisions.
+  DEBUG(dbgs() << "\n"
+                  "The chosen solution requires "; SolutionCost.print(dbgs());
+        dbgs() << ":\n";
+        for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+          dbgs() << "  ";
+          Uses[i].print(dbgs());
+          dbgs() << "\n"
+                    "    ";
+          Solution[i]->print(dbgs());
+          dbgs() << '\n';
+        });
+
+  assert(Solution.size() == Uses.size() && "Malformed solution!");
+}
+
+/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
+/// the dominator tree far as we can go while still being dominated by the
+/// input positions. This helps canonicalize the insert position, which
+/// encourages sharing.
+BasicBlock::iterator
+LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
+                                 const SmallVectorImpl<Instruction *> &Inputs)
+                                                                         const {
+  for (;;) {
+    const Loop *IPLoop = LI.getLoopFor(IP->getParent());
+    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
+
+    BasicBlock *IDom;
+    for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
+      if (!Rung) return IP;
+      Rung = Rung->getIDom();
+      if (!Rung) return IP;
+      IDom = Rung->getBlock();
+
+      // Don't climb into a loop though.
+      const Loop *IDomLoop = LI.getLoopFor(IDom);
+      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
+      if (IDomDepth <= IPLoopDepth &&
+          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
+        break;
+    }
+
+    bool AllDominate = true;
+    Instruction *BetterPos = 0;
+    Instruction *Tentative = IDom->getTerminator();
+    for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
+         E = Inputs.end(); I != E; ++I) {
+      Instruction *Inst = *I;
+      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
+        AllDominate = false;
+        break;
+      }
+      // Attempt to find an insert position in the middle of the block,
+      // instead of at the end, so that it can be used for other expansions.
+      if (IDom == Inst->getParent() &&
+          (!BetterPos || !DT.dominates(Inst, BetterPos)))
+        BetterPos = llvm::next(BasicBlock::iterator(Inst));
+    }
+    if (!AllDominate)
+      break;
+    if (BetterPos)
+      IP = BetterPos;
+    else
+      IP = Tentative;
+  }
+
+  return IP;
+}
+
+/// AdjustInsertPositionForExpand - Determine an input position which will be
+/// dominated by the operands and which will dominate the result.
+BasicBlock::iterator
+LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
+                                           const LSRFixup &LF,
+                                           const LSRUse &LU,
+                                           SCEVExpander &Rewriter) const {
+  // Collect some instructions which must be dominated by the
+  // expanding replacement. These must be dominated by any operands that
+  // will be required in the expansion.
+  SmallVector<Instruction *, 4> Inputs;
+  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
+    Inputs.push_back(I);
+  if (LU.Kind == LSRUse::ICmpZero)
+    if (Instruction *I =
+          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
+      Inputs.push_back(I);
+  if (LF.PostIncLoops.count(L)) {
+    if (LF.isUseFullyOutsideLoop(L))
+      Inputs.push_back(L->getLoopLatch()->getTerminator());
+    else
+      Inputs.push_back(IVIncInsertPos);
+  }
+  // The expansion must also be dominated by the increment positions of any
+  // loops it for which it is using post-inc mode.
+  for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
+       E = LF.PostIncLoops.end(); I != E; ++I) {
+    const Loop *PIL = *I;
+    if (PIL == L) continue;
+
+    // Be dominated by the loop exit.
+    SmallVector<BasicBlock *, 4> ExitingBlocks;
+    PIL->getExitingBlocks(ExitingBlocks);
+    if (!ExitingBlocks.empty()) {
+      BasicBlock *BB = ExitingBlocks[0];
+      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
+        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
+      Inputs.push_back(BB->getTerminator());
+    }
+  }
+
+  assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP)
+         && !isa<DbgInfoIntrinsic>(LowestIP) &&
+         "Insertion point must be a normal instruction");
+
+  // Then, climb up the immediate dominator tree as far as we can go while
+  // still being dominated by the input positions.
+  BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
+
+  // Don't insert instructions before PHI nodes.
+  while (isa<PHINode>(IP)) ++IP;
+
+  // Ignore landingpad instructions.
+  while (isa<LandingPadInst>(IP)) ++IP;
+
+  // Ignore debug intrinsics.
+  while (isa<DbgInfoIntrinsic>(IP)) ++IP;
+
+  // Set IP below instructions recently inserted by SCEVExpander. This keeps the
+  // IP consistent across expansions and allows the previously inserted
+  // instructions to be reused by subsequent expansion.
+  while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP;
+
+  return IP;
+}
+
+/// Expand - Emit instructions for the leading candidate expression for this
+/// LSRUse (this is called "expanding").
+Value *LSRInstance::Expand(const LSRFixup &LF,
+                           const Formula &F,
+                           BasicBlock::iterator IP,
+                           SCEVExpander &Rewriter,
+                           SmallVectorImpl<WeakVH> &DeadInsts) const {
+  const LSRUse &LU = Uses[LF.LUIdx];
+
+  // Determine an input position which will be dominated by the operands and
+  // which will dominate the result.
+  IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
+
+  // Inform the Rewriter if we have a post-increment use, so that it can
+  // perform an advantageous expansion.
+  Rewriter.setPostInc(LF.PostIncLoops);
+
+  // This is the type that the user actually needs.
+  Type *OpTy = LF.OperandValToReplace->getType();
+  // This will be the type that we'll initially expand to.
+  Type *Ty = F.getType();
+  if (!Ty)
+    // No type known; just expand directly to the ultimate type.
+    Ty = OpTy;
+  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
+    // Expand directly to the ultimate type if it's the right size.
+    Ty = OpTy;
+  // This is the type to do integer arithmetic in.
+  Type *IntTy = SE.getEffectiveSCEVType(Ty);
+
+  // Build up a list of operands to add together to form the full base.
+  SmallVector<const SCEV *, 8> Ops;
+
+  // Expand the BaseRegs portion.
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
+       E = F.BaseRegs.end(); I != E; ++I) {
+    const SCEV *Reg = *I;
+    assert(!Reg->isZero() && "Zero allocated in a base register!");
+
+    // If we're expanding for a post-inc user, make the post-inc adjustment.
+    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
+    Reg = TransformForPostIncUse(Denormalize, Reg,
+                                 LF.UserInst, LF.OperandValToReplace,
+                                 Loops, SE, DT);
+
+    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
+  }
+
+  // Expand the ScaledReg portion.
+  Value *ICmpScaledV = 0;
+  if (F.Scale != 0) {
+    const SCEV *ScaledS = F.ScaledReg;
+
+    // If we're expanding for a post-inc user, make the post-inc adjustment.
+    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
+    ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
+                                     LF.UserInst, LF.OperandValToReplace,
+                                     Loops, SE, DT);
+
+    if (LU.Kind == LSRUse::ICmpZero) {
+      // An interesting way of "folding" with an icmp is to use a negated
+      // scale, which we'll implement by inserting it into the other operand
+      // of the icmp.
+      assert(F.Scale == -1 &&
+             "The only scale supported by ICmpZero uses is -1!");
+      ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
+    } else {
+      // Otherwise just expand the scaled register and an explicit scale,
+      // which is expected to be matched as part of the address.
+
+      // Flush the operand list to suppress SCEVExpander hoisting address modes.
+      if (!Ops.empty() && LU.Kind == LSRUse::Address) {
+        Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
+        Ops.clear();
+        Ops.push_back(SE.getUnknown(FullV));
+      }
+      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
+      ScaledS = SE.getMulExpr(ScaledS,
+                              SE.getConstant(ScaledS->getType(), F.Scale));
+      Ops.push_back(ScaledS);
+    }
+  }
+
+  // Expand the GV portion.
+  if (F.BaseGV) {
+    // Flush the operand list to suppress SCEVExpander hoisting.
+    if (!Ops.empty()) {
+      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
+      Ops.clear();
+      Ops.push_back(SE.getUnknown(FullV));
+    }
+    Ops.push_back(SE.getUnknown(F.BaseGV));
+  }
+
+  // Flush the operand list to suppress SCEVExpander hoisting of both folded and
+  // unfolded offsets. LSR assumes they both live next to their uses.
+  if (!Ops.empty()) {
+    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
+    Ops.clear();
+    Ops.push_back(SE.getUnknown(FullV));
+  }
+
+  // Expand the immediate portion.
+  int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
+  if (Offset != 0) {
+    if (LU.Kind == LSRUse::ICmpZero) {
+      // The other interesting way of "folding" with an ICmpZero is to use a
+      // negated immediate.
+      if (!ICmpScaledV)
+        ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+      else {
+        Ops.push_back(SE.getUnknown(ICmpScaledV));
+        ICmpScaledV = ConstantInt::get(IntTy, Offset);
+      }
+    } else {
+      // Just add the immediate values. These again are expected to be matched
+      // as part of the address.
+      Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
+    }
+  }
+
+  // Expand the unfolded offset portion.
+  int64_t UnfoldedOffset = F.UnfoldedOffset;
+  if (UnfoldedOffset != 0) {
+    // Just add the immediate values.
+    Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+                                                       UnfoldedOffset)));
+  }
+
+  // Emit instructions summing all the operands.
+  const SCEV *FullS = Ops.empty() ?
+                      SE.getConstant(IntTy, 0) :
+                      SE.getAddExpr(Ops);
+  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
+
+  // We're done expanding now, so reset the rewriter.
+  Rewriter.clearPostInc();
+
+  // An ICmpZero Formula represents an ICmp which we're handling as a
+  // comparison against zero. Now that we've expanded an expression for that
+  // form, update the ICmp's other operand.
+  if (LU.Kind == LSRUse::ICmpZero) {
+    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
+    DeadInsts.push_back(CI->getOperand(1));
+    assert(!F.BaseGV && "ICmp does not support folding a global value and "
+                           "a scale at the same time!");
+    if (F.Scale == -1) {
+      if (ICmpScaledV->getType() != OpTy) {
+        Instruction *Cast =
+          CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
+                                                   OpTy, false),
+                           ICmpScaledV, OpTy, "tmp", CI);
+        ICmpScaledV = Cast;
+      }
+      CI->setOperand(1, ICmpScaledV);
+    } else {
+      assert(F.Scale == 0 &&
+             "ICmp does not support folding a global value and "
+             "a scale at the same time!");
+      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
+                                           -(uint64_t)Offset);
+      if (C->getType() != OpTy)
+        C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+                                                          OpTy, false),
+                                  C, OpTy);
+
+      CI->setOperand(1, C);
+    }
+  }
+
+  return FullV;
+}
+
+/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
+/// of their operands effectively happens in their predecessor blocks, so the
+/// expression may need to be expanded in multiple places.
+void LSRInstance::RewriteForPHI(PHINode *PN,
+                                const LSRFixup &LF,
+                                const Formula &F,
+                                SCEVExpander &Rewriter,
+                                SmallVectorImpl<WeakVH> &DeadInsts,
+                                Pass *P) const {
+  DenseMap<BasicBlock *, Value *> Inserted;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+      BasicBlock *BB = PN->getIncomingBlock(i);
+
+      // If this is a critical edge, split the edge so that we do not insert
+      // the code on all predecessor/successor paths.  We do this unless this
+      // is the canonical backedge for this loop, which complicates post-inc
+      // users.
+      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
+          !isa<IndirectBrInst>(BB->getTerminator())) {
+        BasicBlock *Parent = PN->getParent();
+        Loop *PNLoop = LI.getLoopFor(Parent);
+        if (!PNLoop || Parent != PNLoop->getHeader()) {
+          // Split the critical edge.
+          BasicBlock *NewBB = 0;
+          if (!Parent->isLandingPad()) {
+            NewBB = SplitCriticalEdge(BB, Parent, P,
+                                      /*MergeIdenticalEdges=*/true,
+                                      /*DontDeleteUselessPhis=*/true);
+          } else {
+            SmallVector<BasicBlock*, 2> NewBBs;
+            SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
+            NewBB = NewBBs[0];
+          }
+          // If NewBB==NULL, then SplitCriticalEdge refused to split because all
+          // phi predecessors are identical. The simple thing to do is skip
+          // splitting in this case rather than complicate the API.
+          if (NewBB) {
+            // If PN is outside of the loop and BB is in the loop, we want to
+            // move the block to be immediately before the PHI block, not
+            // immediately after BB.
+            if (L->contains(BB) && !L->contains(PN))
+              NewBB->moveBefore(PN->getParent());
+
+            // Splitting the edge can reduce the number of PHI entries we have.
+            e = PN->getNumIncomingValues();
+            BB = NewBB;
+            i = PN->getBasicBlockIndex(BB);
+          }
+        }
+      }
+
+      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
+        Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
+      if (!Pair.second)
+        PN->setIncomingValue(i, Pair.first->second);
+      else {
+        Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
+
+        // If this is reuse-by-noop-cast, insert the noop cast.
+        Type *OpTy = LF.OperandValToReplace->getType();
+        if (FullV->getType() != OpTy)
+          FullV =
+            CastInst::Create(CastInst::getCastOpcode(FullV, false,
+                                                     OpTy, false),
+                             FullV, LF.OperandValToReplace->getType(),
+                             "tmp", BB->getTerminator());
+
+        PN->setIncomingValue(i, FullV);
+        Pair.first->second = FullV;
+      }
+    }
+}
+
+/// Rewrite - Emit instructions for the leading candidate expression for this
+/// LSRUse (this is called "expanding"), and update the UserInst to reference
+/// the newly expanded value.
+void LSRInstance::Rewrite(const LSRFixup &LF,
+                          const Formula &F,
+                          SCEVExpander &Rewriter,
+                          SmallVectorImpl<WeakVH> &DeadInsts,
+                          Pass *P) const {
+  // First, find an insertion point that dominates UserInst. For PHI nodes,
+  // find the nearest block which dominates all the relevant uses.
+  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
+    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
+  } else {
+    Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
+
+    // If this is reuse-by-noop-cast, insert the noop cast.
+    Type *OpTy = LF.OperandValToReplace->getType();
+    if (FullV->getType() != OpTy) {
+      Instruction *Cast =
+        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
+                         FullV, OpTy, "tmp", LF.UserInst);
+      FullV = Cast;
+    }
+
+    // Update the user. ICmpZero is handled specially here (for now) because
+    // Expand may have updated one of the operands of the icmp already, and
+    // its new value may happen to be equal to LF.OperandValToReplace, in
+    // which case doing replaceUsesOfWith leads to replacing both operands
+    // with the same value. TODO: Reorganize this.
+    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
+      LF.UserInst->setOperand(0, FullV);
+    else
+      LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
+  }
+
+  DeadInsts.push_back(LF.OperandValToReplace);
+}
+
+/// ImplementSolution - Rewrite all the fixup locations with new values,
+/// following the chosen solution.
+void
+LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
+                               Pass *P) {
+  // Keep track of instructions we may have made dead, so that
+  // we can remove them after we are done working.
+  SmallVector<WeakVH, 16> DeadInsts;
+
+  SCEVExpander Rewriter(SE, "lsr");
+#ifndef NDEBUG
+  Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+  Rewriter.disableCanonicalMode();
+  Rewriter.enableLSRMode();
+  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
+
+  // Mark phi nodes that terminate chains so the expander tries to reuse them.
+  for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
+         ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
+    if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst()))
+      Rewriter.setChainedPhi(PN);
+  }
+
+  // Expand the new value definitions and update the users.
+  for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
+       E = Fixups.end(); I != E; ++I) {
+    const LSRFixup &Fixup = *I;
+
+    Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);
+
+    Changed = true;
+  }
+
+  for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
+         ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
+    GenerateIVChain(*ChainI, Rewriter, DeadInsts);
+    Changed = true;
+  }
+  // Clean up after ourselves. This must be done before deleting any
+  // instructions.
+  Rewriter.clear();
+
+  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
+}
+
+LSRInstance::LSRInstance(Loop *L, Pass *P)
+    : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()),
+      DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()),
+      TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false),
+      IVIncInsertPos(0) {
+  // If LoopSimplify form is not available, stay out of trouble.
+  if (!L->isLoopSimplifyForm())
+    return;
+
+  // If there's no interesting work to be done, bail early.
+  if (IU.empty()) return;
+
+  // If there's too much analysis to be done, bail early. We won't be able to
+  // model the problem anyway.
+  unsigned NumUsers = 0;
+  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
+    if (++NumUsers > MaxIVUsers) {
+      DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L
+            << "\n");
+      return;
+    }
+  }
+
+#ifndef NDEBUG
+  // All dominating loops must have preheaders, or SCEVExpander may not be able
+  // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
+  //
+  // IVUsers analysis should only create users that are dominated by simple loop
+  // headers. Since this loop should dominate all of its users, its user list
+  // should be empty if this loop itself is not within a simple loop nest.
+  for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
+       Rung; Rung = Rung->getIDom()) {
+    BasicBlock *BB = Rung->getBlock();
+    const Loop *DomLoop = LI.getLoopFor(BB);
+    if (DomLoop && DomLoop->getHeader() == BB) {
+      assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
+    }
+  }
+#endif // DEBUG
+
+  DEBUG(dbgs() << "\nLSR on loop ";
+        WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
+        dbgs() << ":\n");
+
+  // First, perform some low-level loop optimizations.
+  OptimizeShadowIV();
+  OptimizeLoopTermCond();
+
+  // If loop preparation eliminates all interesting IV users, bail.
+  if (IU.empty()) return;
+
+  // Skip nested loops until we can model them better with formulae.
+  if (!L->empty()) {
+    DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+    return;
+  }
+
+  // Start collecting data and preparing for the solver.
+  CollectChains();
+  CollectInterestingTypesAndFactors();
+  CollectFixupsAndInitialFormulae();
+  CollectLoopInvariantFixupsAndFormulae();
+
+  assert(!Uses.empty() && "IVUsers reported at least one use");
+  DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+        print_uses(dbgs()));
+
+  // Now use the reuse data to generate a bunch of interesting ways
+  // to formulate the values needed for the uses.
+  GenerateAllReuseFormulae();
+
+  FilterOutUndesirableDedicatedRegisters();
+  NarrowSearchSpaceUsingHeuristics();
+
+  SmallVector<const Formula *, 8> Solution;
+  Solve(Solution);
+
+  // Release memory that is no longer needed.
+  Factors.clear();
+  Types.clear();
+  RegUses.clear();
+
+  if (Solution.empty())
+    return;
+
+#ifndef NDEBUG
+  // Formulae should be legal.
+  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end();
+       I != E; ++I) {
+    const LSRUse &LU = *I;
+    for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
+                                                  JE = LU.Formulae.end();
+         J != JE; ++J)
+      assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                        *J) && "Illegal formula generated!");
+  };
+#endif
+
+  // Now that we've decided what we want, make it so.
+  ImplementSolution(Solution, P);
+}
+
+void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
+  if (Factors.empty() && Types.empty()) return;
+
+  OS << "LSR has identified the following interesting factors and types: ";
+  bool First = true;
+
+  for (SmallSetVector<int64_t, 8>::const_iterator
+       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
+    if (!First) OS << ", ";
+    First = false;
+    OS << '*' << *I;
+  }
+
+  for (SmallSetVector<Type *, 4>::const_iterator
+       I = Types.begin(), E = Types.end(); I != E; ++I) {
+    if (!First) OS << ", ";
+    First = false;
+    OS << '(' << **I << ')';
+  }
+  OS << '\n';
+}
+
+void LSRInstance::print_fixups(raw_ostream &OS) const {
+  OS << "LSR is examining the following fixup sites:\n";
+  for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
+       E = Fixups.end(); I != E; ++I) {
+    dbgs() << "  ";
+    I->print(OS);
+    OS << '\n';
+  }
+}
+
+void LSRInstance::print_uses(raw_ostream &OS) const {
+  OS << "LSR is examining the following uses:\n";
+  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
+       E = Uses.end(); I != E; ++I) {
+    const LSRUse &LU = *I;
+    dbgs() << "  ";
+    LU.print(OS);
+    OS << '\n';
+    for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
+         JE = LU.Formulae.end(); J != JE; ++J) {
+      OS << "    ";
+      J->print(OS);
+      OS << '\n';
+    }
+  }
+}
+
+void LSRInstance::print(raw_ostream &OS) const {
+  print_factors_and_types(OS);
+  print_fixups(OS);
+  print_uses(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRInstance::dump() const {
+  print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+class LoopStrengthReduce : public LoopPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  LoopStrengthReduce();
+
+private:
+  bool runOnLoop(Loop *L, LPPassManager &LPM);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+};
+
+}
+
+char LoopStrengthReduce::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
+                "Loop Strength Reduction", false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(IVUsers)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
+                "Loop Strength Reduction", false, false)
+
+
+Pass *llvm::createLoopStrengthReducePass() {
+  return new LoopStrengthReduce();
+}
+
+LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
+  initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
+}
+
+void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
+  // We split critical edges, so we change the CFG.  However, we do update
+  // many analyses if they are around.
+  AU.addPreservedID(LoopSimplifyID);
+
+  AU.addRequired<LoopInfo>();
+  AU.addPreserved<LoopInfo>();
+  AU.addRequiredID(LoopSimplifyID);
+  AU.addRequired<DominatorTree>();
+  AU.addPreserved<DominatorTree>();
+  AU.addRequired<ScalarEvolution>();
+  AU.addPreserved<ScalarEvolution>();
+  // Requiring LoopSimplify a second time here prevents IVUsers from running
+  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
+  AU.addRequiredID(LoopSimplifyID);
+  AU.addRequired<IVUsers>();
+  AU.addPreserved<IVUsers>();
+  AU.addRequired<TargetTransformInfo>();
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
+  bool Changed = false;
+
+  // Run the main LSR transformation.
+  Changed |= LSRInstance(L, this).getChanged();
+
+  // Remove any extra phis created by processing inner loops.
+  Changed |= DeleteDeadPHIs(L->getHeader());
+  if (EnablePhiElim && L->isLoopSimplifyForm()) {
+    SmallVector<WeakVH, 16> DeadInsts;
+    SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr");
+#ifndef NDEBUG
+    Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+    unsigned numFolded =
+        Rewriter.replaceCongruentIVs(L, &getAnalysis<DominatorTree>(),
+                                     DeadInsts,
+                                     &getAnalysis<TargetTransformInfo>());
+    if (numFolded) {
+      Changed = true;
+      DeleteTriviallyDeadInstructions(DeadInsts);
+      DeleteDeadPHIs(L->getHeader());
+    }
+  }
+  return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
new file mode 100644
index 000000000000..80d060b926ea
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -0,0 +1,238 @@
+//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller.  It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unroll"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <climits>
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden,
+  cl::desc("The cut-off point for automatic loop unrolling"));
+
+static cl::opt<unsigned>
+UnrollCount("unroll-count", cl::init(0), cl::Hidden,
+  cl::desc("Use this unroll count for all loops, for testing purposes"));
+
+static cl::opt<bool>
+UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden,
+  cl::desc("Allows loops to be partially unrolled until "
+           "-unroll-threshold loop size is reached."));
+
+static cl::opt<bool>
+UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::init(false), cl::Hidden,
+  cl::desc("Unroll loops with run-time trip counts"));
+
+namespace {
+  class LoopUnroll : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopUnroll(int T = -1, int C = -1,  int P = -1) : LoopPass(ID) {
+      CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T);
+      CurrentCount = (C == -1) ? UnrollCount : unsigned(C);
+      CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
+
+      UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0);
+
+      initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
+    }
+
+    /// A magic value for use with the Threshold parameter to indicate
+    /// that the loop unroll should be performed regardless of how much
+    /// code expansion would result.
+    static const unsigned NoThreshold = UINT_MAX;
+
+    // Threshold to use when optsize is specified (and there is no
+    // explicit -unroll-threshold).
+    static const unsigned OptSizeUnrollThreshold = 50;
+
+    // Default unroll count for loops with run-time trip count if
+    // -unroll-count is not set
+    static const unsigned UnrollRuntimeCount = 8;
+
+    unsigned CurrentCount;
+    unsigned CurrentThreshold;
+    bool     CurrentAllowPartial;
+    bool     UserThreshold;        // CurrentThreshold is user-specified.
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
+      AU.addRequired<ScalarEvolution>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.addRequired<TargetTransformInfo>();
+      // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
+      // If loop unroll does not preserve dom info then LCSSA pass on next
+      // loop will receive invalid dom info.
+      // For now, recreate dom info, if loop is unrolled.
+      AU.addPreserved<DominatorTree>();
+    }
+  };
+}
+
+char LoopUnroll::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+
+Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) {
+  return new LoopUnroll(Threshold, Count, AllowPartial);
+}
+
+/// ApproximateLoopSize - Approximate the size of the loop.
+static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
+                                    bool &NotDuplicatable,
+                                    const TargetTransformInfo &TTI) {
+  CodeMetrics Metrics;
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I)
+    Metrics.analyzeBasicBlock(*I, TTI);
+  NumCalls = Metrics.NumInlineCandidates;
+  NotDuplicatable = Metrics.notDuplicatable;
+
+  unsigned LoopSize = Metrics.NumInsts;
+
+  // Don't allow an estimate of size zero.  This would allows unrolling of loops
+  // with huge iteration counts, which is a compile time problem even if it's
+  // not a problem for code quality.
+  if (LoopSize == 0) LoopSize = 1;
+
+  return LoopSize;
+}
+
+bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+  LoopInfo *LI = &getAnalysis<LoopInfo>();
+  ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
+  const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+
+  BasicBlock *Header = L->getHeader();
+  DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
+        << "] Loop %" << Header->getName() << "\n");
+  (void)Header;
+
+  // Determine the current unrolling threshold.  While this is normally set
+  // from UnrollThreshold, it is overridden to a smaller value if the current
+  // function is marked as optimize-for-size, and the unroll threshold was
+  // not user specified.
+  unsigned Threshold = CurrentThreshold;
+  if (!UserThreshold &&
+      Header->getParent()->getAttributes().
+        hasAttribute(AttributeSet::FunctionIndex,
+                     Attribute::OptimizeForSize))
+    Threshold = OptSizeUnrollThreshold;
+
+  // Find trip count and trip multiple if count is not available
+  unsigned TripCount = 0;
+  unsigned TripMultiple = 1;
+  // Find "latch trip count". UnrollLoop assumes that control cannot exit
+  // via the loop latch on any iteration prior to TripCount. The loop may exit
+  // early via an earlier branch.
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  if (LatchBlock) {
+    TripCount = SE->getSmallConstantTripCount(L, LatchBlock);
+    TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock);
+  }
+  // Use a default unroll-count if the user doesn't specify a value
+  // and the trip count is a run-time value.  The default is different
+  // for run-time or compile-time trip count loops.
+  unsigned Count = CurrentCount;
+  if (UnrollRuntime && CurrentCount == 0 && TripCount == 0)
+    Count = UnrollRuntimeCount;
+
+  if (Count == 0) {
+    // Conservative heuristic: if we know the trip count, see if we can
+    // completely unroll (subject to the threshold, checked below); otherwise
+    // try to find greatest modulo of the trip count which is still under
+    // threshold value.
+    if (TripCount == 0)
+      return false;
+    Count = TripCount;
+  }
+
+  // Enforce the threshold.
+  if (Threshold != NoThreshold) {
+    unsigned NumInlineCandidates;
+    bool notDuplicatable;
+    unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates,
+                                            notDuplicatable, TTI);
+    DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
+    if (notDuplicatable) {
+      DEBUG(dbgs() << "  Not unrolling loop which contains non duplicatable"
+            << " instructions.\n");
+      return false;
+    }
+    if (NumInlineCandidates != 0) {
+      DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
+      return false;
+    }
+    uint64_t Size = (uint64_t)LoopSize*Count;
+    if (TripCount != 1 && Size > Threshold) {
+      DEBUG(dbgs() << "  Too large to fully unroll with count: " << Count
+            << " because size: " << Size << ">" << Threshold << "\n");
+      if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) {
+        DEBUG(dbgs() << "  will not try to unroll partially because "
+              << "-unroll-allow-partial not given\n");
+        return false;
+      }
+      if (TripCount) {
+        // Reduce unroll count to be modulo of TripCount for partial unrolling
+        Count = Threshold / LoopSize;
+        while (Count != 0 && TripCount%Count != 0)
+          Count--;
+      }
+      else if (UnrollRuntime) {
+        // Reduce unroll count to be a lower power-of-two value
+        while (Count != 0 && Size > Threshold) {
+          Count >>= 1;
+          Size = LoopSize*Count;
+        }
+      }
+      if (Count < 2) {
+        DEBUG(dbgs() << "  could not unroll partially\n");
+        return false;
+      }
+      DEBUG(dbgs() << "  partially unrolling with count: " << Count << "\n");
+    }
+  }
+
+  // Unroll the loop.
+  if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM))
+    return false;
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
new file mode 100644
index 000000000000..0e8199f2fd5c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -0,0 +1,1289 @@
+//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to have multiple loops.  For example, it turns the left into the right code:
+//
+//  for (...)                  if (lic)
+//    A                          for (...)
+//    if (lic)                     A; B; C
+//      B                      else
+//    C                          for (...)
+//                                 A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unswitch"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <map>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumSelects , "Number of selects unswitched");
+STATISTIC(NumTrivial , "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+STATISTIC(TotalInsts,  "Total number of instructions analyzed");
+
+// The specific value of 100 here was chosen based only on intuition and a
+// few specific examples.
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+          cl::init(100), cl::Hidden);
+
+namespace {
+
+  class LUAnalysisCache {
+
+    typedef DenseMap<const SwitchInst*, SmallPtrSet<const Value *, 8> >
+      UnswitchedValsMap;
+
+    typedef UnswitchedValsMap::iterator UnswitchedValsIt;
+
+    struct LoopProperties {
+      unsigned CanBeUnswitchedCount;
+      unsigned SizeEstimation;
+      UnswitchedValsMap UnswitchedVals;
+    };
+
+    // Here we use std::map instead of DenseMap, since we need to keep valid
+    // LoopProperties pointer for current loop for better performance.
+    typedef std::map<const Loop*, LoopProperties> LoopPropsMap;
+    typedef LoopPropsMap::iterator LoopPropsMapIt;
+
+    LoopPropsMap LoopsProperties;
+    UnswitchedValsMap* CurLoopInstructions;
+    LoopProperties* CurrentLoopProperties;
+
+    // Max size of code we can produce on remained iterations.
+    unsigned MaxSize;
+
+    public:
+
+      LUAnalysisCache() :
+        CurLoopInstructions(NULL), CurrentLoopProperties(NULL),
+        MaxSize(Threshold)
+      {}
+
+      // Analyze loop. Check its size, calculate is it possible to unswitch
+      // it. Returns true if we can unswitch this loop.
+      bool countLoop(const Loop* L, const TargetTransformInfo &TTI);
+
+      // Clean all data related to given loop.
+      void forgetLoop(const Loop* L);
+
+      // Mark case value as unswitched.
+      // Since SI instruction can be partly unswitched, in order to avoid
+      // extra unswitching in cloned loops keep track all unswitched values.
+      void setUnswitched(const SwitchInst* SI, const Value* V);
+
+      // Check was this case value unswitched before or not.
+      bool isUnswitched(const SwitchInst* SI, const Value* V);
+
+      // Clone all loop-unswitch related loop properties.
+      // Redistribute unswitching quotas.
+      // Note, that new loop data is stored inside the VMap.
+      void cloneData(const Loop* NewLoop, const Loop* OldLoop,
+                     const ValueToValueMapTy& VMap);
+  };
+
+  class LoopUnswitch : public LoopPass {
+    LoopInfo *LI;  // Loop information
+    LPPassManager *LPM;
+
+    // LoopProcessWorklist - Used to check if second loop needs processing
+    // after RewriteLoopBodyWithConditionConstant rewrites first loop.
+    std::vector<Loop*> LoopProcessWorklist;
+
+    LUAnalysisCache BranchesInfo;
+
+    bool OptimizeForSize;
+    bool redoLoop;
+
+    Loop *currentLoop;
+    DominatorTree *DT;
+    BasicBlock *loopHeader;
+    BasicBlock *loopPreheader;
+
+    // LoopBlocks contains all of the basic blocks of the loop, including the
+    // preheader of the loop, the body of the loop, and the exit blocks of the
+    // loop, in that order.
+    std::vector<BasicBlock*> LoopBlocks;
+    // NewBlocks contained cloned copy of basic blocks from LoopBlocks.
+    std::vector<BasicBlock*> NewBlocks;
+
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    explicit LoopUnswitch(bool Os = false) :
+      LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
+      currentLoop(NULL), DT(NULL), loopHeader(NULL),
+      loopPreheader(NULL) {
+        initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
+      }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool processCurrentLoop();
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG.
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequiredID(LCSSAID);
+      AU.addPreservedID(LCSSAID);
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.addRequired<TargetTransformInfo>();
+    }
+
+  private:
+
+    virtual void releaseMemory() {
+      BranchesInfo.forgetLoop(currentLoop);
+    }
+
+    /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist,
+    /// remove it.
+    void RemoveLoopFromWorklist(Loop *L) {
+      std::vector<Loop*>::iterator I = std::find(LoopProcessWorklist.begin(),
+                                                 LoopProcessWorklist.end(), L);
+      if (I != LoopProcessWorklist.end())
+        LoopProcessWorklist.erase(I);
+    }
+
+    void initLoopData() {
+      loopHeader = currentLoop->getHeader();
+      loopPreheader = currentLoop->getLoopPreheader();
+    }
+
+    /// Split all of the edges from inside the loop to their exit blocks.
+    /// Update the appropriate Phi nodes as we do so.
+    void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks);
+
+    bool UnswitchIfProfitable(Value *LoopCond, Constant *Val);
+    void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+                                  BasicBlock *ExitBlock);
+    void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L);
+
+    void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+                                              Constant *Val, bool isEqual);
+
+    void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+                                        BasicBlock *TrueDest,
+                                        BasicBlock *FalseDest,
+                                        Instruction *InsertPt);
+
+    void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+    void RemoveBlockIfDead(BasicBlock *BB,
+                           std::vector<Instruction*> &Worklist, Loop *l);
+    void RemoveLoopFromHierarchy(Loop *L);
+    bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0,
+                                    BasicBlock **LoopExit = 0);
+
+  };
+}
+
+// Analyze loop. Check its size, calculate is it possible to unswitch
+// it. Returns true if we can unswitch this loop.
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
+
+  std::pair<LoopPropsMapIt, bool> InsertRes =
+      LoopsProperties.insert(std::make_pair(L, LoopProperties()));
+
+  LoopProperties& Props = InsertRes.first->second;
+
+  if (InsertRes.second) {
+    // New loop.
+
+    // Limit the number of instructions to avoid causing significant code
+    // expansion, and the number of basic blocks, to avoid loops with
+    // large numbers of branches which cause loop unswitching to go crazy.
+    // This is a very ad-hoc heuristic.
+
+    // FIXME: This is overly conservative because it does not take into
+    // consideration code simplification opportunities and code that can
+    // be shared by the resultant unswitched loops.
+    CodeMetrics Metrics;
+    for (Loop::block_iterator I = L->block_begin(),
+           E = L->block_end();
+         I != E; ++I)
+      Metrics.analyzeBasicBlock(*I, TTI);
+
+    Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5);
+    Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
+    MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
+
+    if (Metrics.notDuplicatable) {
+      DEBUG(dbgs() << "NOT unswitching loop %"
+            << L->getHeader()->getName() << ", contents cannot be "
+            << "duplicated!\n");
+      return false;
+    }
+  }
+
+  if (!Props.CanBeUnswitchedCount) {
+    DEBUG(dbgs() << "NOT unswitching loop %"
+          << L->getHeader()->getName() << ", cost too high: "
+          << L->getBlocks().size() << "\n");
+
+    return false;
+  }
+
+  // Be careful. This links are good only before new loop addition.
+  CurrentLoopProperties = &Props;
+  CurLoopInstructions = &Props.UnswitchedVals;
+
+  return true;
+}
+
+// Clean all data related to given loop.
+void LUAnalysisCache::forgetLoop(const Loop* L) {
+
+  LoopPropsMapIt LIt = LoopsProperties.find(L);
+
+  if (LIt != LoopsProperties.end()) {
+    LoopProperties& Props = LIt->second;
+    MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation;
+    LoopsProperties.erase(LIt);
+  }
+
+  CurrentLoopProperties = NULL;
+  CurLoopInstructions = NULL;
+}
+
+// Mark case value as unswitched.
+// Since SI instruction can be partly unswitched, in order to avoid
+// extra unswitching in cloned loops keep track all unswitched values.
+void LUAnalysisCache::setUnswitched(const SwitchInst* SI, const Value* V) {
+  (*CurLoopInstructions)[SI].insert(V);
+}
+
+// Check was this case value unswitched before or not.
+bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) {
+  return (*CurLoopInstructions)[SI].count(V);
+}
+
+// Clone all loop-unswitch related loop properties.
+// Redistribute unswitching quotas.
+// Note, that new loop data is stored inside the VMap.
+void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop,
+                     const ValueToValueMapTy& VMap) {
+
+  LoopProperties& NewLoopProps = LoopsProperties[NewLoop];
+  LoopProperties& OldLoopProps = *CurrentLoopProperties;
+  UnswitchedValsMap& Insts = OldLoopProps.UnswitchedVals;
+
+  // Reallocate "can-be-unswitched quota"
+
+  --OldLoopProps.CanBeUnswitchedCount;
+  unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
+  NewLoopProps.CanBeUnswitchedCount = Quota / 2;
+  OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
+
+  NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation;
+
+  // Clone unswitched values info:
+  // for new loop switches we clone info about values that was
+  // already unswitched and has redundant successors.
+  for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
+    const SwitchInst* OldInst = I->first;
+    Value* NewI = VMap.lookup(OldInst);
+    const SwitchInst* NewInst = cast_or_null<SwitchInst>(NewI);
+    assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
+
+    NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
+  }
+}
+
+char LoopUnswitch::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+                      false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+                      false, false)
+
+Pass *llvm::createLoopUnswitchPass(bool Os) {
+  return new LoopUnswitch(Os);
+}
+
+/// FindLIVLoopCondition - Cond is a condition that occurs in L.  If it is
+/// invariant in the loop, or has an invariant piece, return the invariant.
+/// Otherwise, return null.
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+
+  // We started analyze new instruction, increment scanned instructions counter.
+  ++TotalInsts;
+
+  // We can never unswitch on vector conditions.
+  if (Cond->getType()->isVectorTy())
+    return 0;
+
+  // Constants should be folded, not unswitched on!
+  if (isa<Constant>(Cond)) return 0;
+
+  // TODO: Handle: br (VARIANT|INVARIANT).
+
+  // Hoist simple values out.
+  if (L->makeLoopInvariant(Cond, Changed))
+    return Cond;
+
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
+    if (BO->getOpcode() == Instruction::And ||
+        BO->getOpcode() == Instruction::Or) {
+      // If either the left or right side is invariant, we can unswitch on this,
+      // which will cause the branch to go away in one loop and the condition to
+      // simplify in the other one.
+      if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed))
+        return LHS;
+      if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed))
+        return RHS;
+    }
+
+  return 0;
+}
+
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+  LI = &getAnalysis<LoopInfo>();
+  LPM = &LPM_Ref;
+  DT = getAnalysisIfAvailable<DominatorTree>();
+  currentLoop = L;
+  Function *F = currentLoop->getHeader()->getParent();
+  bool Changed = false;
+  do {
+    assert(currentLoop->isLCSSAForm(*DT));
+    redoLoop = false;
+    Changed |= processCurrentLoop();
+  } while(redoLoop);
+
+  if (Changed) {
+    // FIXME: Reconstruct dom info, because it is not preserved properly.
+    if (DT)
+      DT->runOnFunction(*F);
+  }
+  return Changed;
+}
+
+/// processCurrentLoop - Do actual work and unswitch loop if possible
+/// and profitable.
+bool LoopUnswitch::processCurrentLoop() {
+  bool Changed = false;
+
+  initLoopData();
+
+  // If LoopSimplify was unable to form a preheader, don't do any unswitching.
+  if (!loopPreheader)
+    return false;
+
+  // Loops with indirectbr cannot be cloned.
+  if (!currentLoop->isSafeToClone())
+    return false;
+
+  // Without dedicated exits, splitting the exit edge may fail.
+  if (!currentLoop->hasDedicatedExits())
+    return false;
+
+  LLVMContext &Context = loopHeader->getContext();
+
+  // Probably we reach the quota of branches for this loop. If so
+  // stop unswitching.
+  if (!BranchesInfo.countLoop(currentLoop, getAnalysis<TargetTransformInfo>()))
+    return false;
+
+  // Loop over all of the basic blocks in the loop.  If we find an interior
+  // block that is branching on a loop-invariant condition, we can unswitch this
+  // loop.
+  for (Loop::block_iterator I = currentLoop->block_begin(),
+         E = currentLoop->block_end(); I != E; ++I) {
+    TerminatorInst *TI = (*I)->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      // If this isn't branching on an invariant condition, we can't unswitch
+      // it.
+      if (BI->isConditional()) {
+        // See if this, or some part of it, is loop invariant.  If so, we can
+        // unswitch on it if we desire.
+        Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
+                                               currentLoop, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond,
+                                             ConstantInt::getTrue(Context))) {
+          ++NumBranches;
+          return true;
+        }
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+                                             currentLoop, Changed);
+      unsigned NumCases = SI->getNumCases();
+      if (LoopCond && NumCases) {
+        // Find a value to unswitch on:
+        // FIXME: this should chose the most expensive case!
+        // FIXME: scan for a case with a non-critical edge?
+        Constant *UnswitchVal = NULL;
+
+        // Do not process same value again and again.
+        // At this point we have some cases already unswitched and
+        // some not yet unswitched. Let's find the first not yet unswitched one.
+        for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+             i != e; ++i) {
+          Constant* UnswitchValCandidate = i.getCaseValue();
+          if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+            UnswitchVal = UnswitchValCandidate;
+            break;
+          }
+        }
+
+        if (!UnswitchVal)
+          continue;
+
+        if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
+          ++NumSwitches;
+          return true;
+        }
+      }
+    }
+
+    // Scan the instructions to check for unswitchable values.
+    for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
+         BBI != E; ++BBI)
+      if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+        Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
+                                               currentLoop, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond,
+                                             ConstantInt::getTrue(Context))) {
+          ++NumSelects;
+          return true;
+        }
+      }
+  }
+  return Changed;
+}
+
+/// isTrivialLoopExitBlock - Check to see if all paths from BB exit the
+/// loop with no side effects (including infinite loops).
+///
+/// If true, we return true and set ExitBB to the block we
+/// exit through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+                                         BasicBlock *&ExitBB,
+                                         std::set<BasicBlock*> &Visited) {
+  if (!Visited.insert(BB).second) {
+    // Already visited. Without more analysis, this could indicate an infinite
+    // loop.
+    return false;
+  } else if (!L->contains(BB)) {
+    // Otherwise, this is a loop exit, this is fine so long as this is the
+    // first exit.
+    if (ExitBB != 0) return false;
+    ExitBB = BB;
+    return true;
+  }
+
+  // Otherwise, this is an unvisited intra-loop node.  Check all successors.
+  for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
+    // Check to see if the successor is a trivial loop exit.
+    if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
+      return false;
+  }
+
+  // Okay, everything after this looks good, check to make sure that this block
+  // doesn't include any side effects.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+    if (I->mayHaveSideEffects())
+      return false;
+
+  return true;
+}
+
+/// isTrivialLoopExitBlock - Return true if the specified block unconditionally
+/// leads to an exit from the specified loop, and has no side-effects in the
+/// process.  If so, return the block that is exited to, otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+  std::set<BasicBlock*> Visited;
+  Visited.insert(L->getHeader());  // Branches to header make infinite loops.
+  BasicBlock *ExitBB = 0;
+  if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
+    return ExitBB;
+  return 0;
+}
+
+/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is
+/// trivial: that is, that the condition controls whether or not the loop does
+/// anything at all.  If this is a trivial condition, unswitching produces no
+/// code duplications (equivalently, it produces a simpler loop and a new empty
+/// loop, which gets deleted).
+///
+/// If this is a trivial condition, return true, otherwise return false.  When
+/// returning true, this sets Cond and Val to the condition that controls the
+/// trivial condition: when Cond dynamically equals Val, the loop is known to
+/// exit.  Finally, this sets LoopExit to the BB that the loop exits to when
+/// Cond == Val.
+///
+bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val,
+                                       BasicBlock **LoopExit) {
+  BasicBlock *Header = currentLoop->getHeader();
+  TerminatorInst *HeaderTerm = Header->getTerminator();
+  LLVMContext &Context = Header->getContext();
+
+  BasicBlock *LoopExitBB = 0;
+  if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) {
+    // If the header block doesn't end with a conditional branch on Cond, we
+    // can't handle it.
+    if (!BI->isConditional() || BI->getCondition() != Cond)
+      return false;
+
+    // Check to see if a successor of the branch is guaranteed to
+    // exit through a unique exit block without having any
+    // side-effects.  If so, determine the value of Cond that causes it to do
+    // this.
+    if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+                                             BI->getSuccessor(0)))) {
+      if (Val) *Val = ConstantInt::getTrue(Context);
+    } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
+                                                    BI->getSuccessor(1)))) {
+      if (Val) *Val = ConstantInt::getFalse(Context);
+    }
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) {
+    // If this isn't a switch on Cond, we can't handle it.
+    if (SI->getCondition() != Cond) return false;
+
+    // Check to see if a successor of the switch is guaranteed to go to the
+    // latch block or exit through a one exit block without having any
+    // side-effects.  If so, determine the value of Cond that causes it to do
+    // this.
+    // Note that we can't trivially unswitch on the default case or
+    // on already unswitched cases.
+    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+         i != e; ++i) {
+      BasicBlock* LoopExitCandidate;
+      if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
+                                               i.getCaseSuccessor()))) {
+        // Okay, we found a trivial case, remember the value that is trivial.
+        ConstantInt* CaseVal = i.getCaseValue();
+
+        // Check that it was not unswitched before, since already unswitched
+        // trivial vals are looks trivial too.
+        if (BranchesInfo.isUnswitched(SI, CaseVal))
+          continue;
+        LoopExitBB = LoopExitCandidate;
+        if (Val) *Val = CaseVal;
+        break;
+      }
+    }
+  }
+
+  // If we didn't find a single unique LoopExit block, or if the loop exit block
+  // contains phi nodes, this isn't trivial.
+  if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+    return false;   // Can't handle this.
+
+  if (LoopExit) *LoopExit = LoopExitBB;
+
+  // We already know that nothing uses any scalar values defined inside of this
+  // loop.  As such, we just have to check to see if this loop will execute any
+  // side-effecting instructions (e.g. stores, calls, volatile loads) in the
+  // part of the loop that the code *would* execute.  We already checked the
+  // tail, check the header now.
+  for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I)
+    if (I->mayHaveSideEffects())
+      return false;
+  return true;
+}
+
+/// UnswitchIfProfitable - We have found that we can unswitch currentLoop when
+/// LoopCond == Val to simplify the loop.  If we decide that this is profitable,
+/// unswitch the loop, reprocess the pieces, then return true.
+bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
+  Function *F = loopHeader->getParent();
+  Constant *CondVal = 0;
+  BasicBlock *ExitBlock = 0;
+
+  if (IsTrivialUnswitchCondition(LoopCond, &CondVal, &ExitBlock)) {
+    // If the condition is trivial, always unswitch. There is no code growth
+    // for this case.
+    UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, ExitBlock);
+    return true;
+  }
+
+  // Check to see if it would be profitable to unswitch current loop.
+
+  // Do not do non-trivial unswitch while optimizing for size.
+  if (OptimizeForSize ||
+      F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                      Attribute::OptimizeForSize))
+    return false;
+
+  UnswitchNontrivialCondition(LoopCond, Val, currentLoop);
+  return true;
+}
+
+/// CloneLoop - Recursively clone the specified loop and all of its children,
+/// mapping the blocks with the specified map.
+static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
+                       LoopInfo *LI, LPPassManager *LPM) {
+  Loop *New = new Loop();
+  LPM->insertLoop(New, PL);
+
+  // Add all of the blocks in L to the new loop.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I)
+    if (LI->getLoopFor(*I) == L)
+      New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), LI->getBase());
+
+  // Add all of the subloops to the new loop.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    CloneLoop(*I, New, VM, LI, LPM);
+
+  return New;
+}
+
+/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values
+/// if LIC == Val, branch to TrueDst, otherwise branch to FalseDest.  Insert the
+/// code immediately before InsertPt.
+void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+                                                  BasicBlock *TrueDest,
+                                                  BasicBlock *FalseDest,
+                                                  Instruction *InsertPt) {
+  // Insert a conditional branch on LIC to the two preheaders.  The original
+  // code is the true version and the new code is the false version.
+  Value *BranchVal = LIC;
+  if (!isa<ConstantInt>(Val) ||
+      Val->getType() != Type::getInt1Ty(LIC->getContext()))
+    BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val);
+  else if (Val != ConstantInt::getTrue(Val->getContext()))
+    // We want to enter the new loop when the condition is true.
+    std::swap(TrueDest, FalseDest);
+
+  // Insert the new branch.
+  BranchInst *BI = BranchInst::Create(TrueDest, FalseDest, BranchVal, InsertPt);
+
+  // If either edge is critical, split it. This helps preserve LoopSimplify
+  // form for enclosing loops.
+  SplitCriticalEdge(BI, 0, this, false, false, true);
+  SplitCriticalEdge(BI, 1, this, false, false, true);
+}
+
+/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable
+/// condition in it (a cond branch from its header block to its latch block,
+/// where the path through the loop that doesn't execute its body has no
+/// side-effects), unswitch it.  This doesn't involve any code duplication, just
+/// moving the conditional branch outside of the loop and updating loop info.
+void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond,
+                                            Constant *Val,
+                                            BasicBlock *ExitBlock) {
+  DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+        << loopHeader->getName() << " [" << L->getBlocks().size()
+        << " blocks] in Function " << L->getHeader()->getParent()->getName()
+        << " on cond: " << *Val << " == " << *Cond << "\n");
+
+  // First step, split the preheader, so that we know that there is a safe place
+  // to insert the conditional branch.  We will change loopPreheader to have a
+  // conditional branch on Cond.
+  BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, this);
+
+  // Now that we have a place to insert the conditional branch, create a place
+  // to branch to: this is the exit block out of the loop that we should
+  // short-circuit to.
+
+  // Split this block now, so that the loop maintains its exit block, and so
+  // that the jump from the preheader can execute the contents of the exit block
+  // without actually branching to it (the exit block should be dominated by the
+  // loop header, not the preheader).
+  assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+  BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this);
+
+  // Okay, now we have a position to branch from and a position to branch to,
+  // insert the new conditional branch.
+  EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH,
+                                 loopPreheader->getTerminator());
+  LPM->deleteSimpleAnalysisValue(loopPreheader->getTerminator(), L);
+  loopPreheader->getTerminator()->eraseFromParent();
+
+  // We need to reprocess this loop, it could be unswitched again.
+  redoLoop = true;
+
+  // Now that we know that the loop is never entered when this condition is a
+  // particular value, rewrite the loop with this info.  We know that this will
+  // at least eliminate the old branch.
+  RewriteLoopBodyWithConditionConstant(L, Cond, Val, false);
+  ++NumTrivial;
+}
+
+/// SplitExitEdges - Split all of the edges from inside the loop to their exit
+/// blocks.  Update the appropriate Phi nodes as we do so.
+void LoopUnswitch::SplitExitEdges(Loop *L,
+                                const SmallVector<BasicBlock *, 8> &ExitBlocks){
+
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitBlock = ExitBlocks[i];
+    SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
+                                       pred_end(ExitBlock));
+
+    // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+    // general, if we call it on all predecessors of all exits then it does.
+    if (!ExitBlock->isLandingPad()) {
+      SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", this);
+    } else {
+      SmallVector<BasicBlock*, 2> NewBBs;
+      SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa",
+                                  this, NewBBs);
+    }
+  }
+}
+
+/// UnswitchNontrivialCondition - We determined that the loop is profitable
+/// to unswitch when LIC equal Val.  Split it into loop versions and test the
+/// condition outside of either loop.  Return the loops created as Out1/Out2.
+void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
+                                               Loop *L) {
+  Function *F = loopHeader->getParent();
+  DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+        << loopHeader->getName() << " [" << L->getBlocks().size()
+        << " blocks] in Function " << F->getName()
+        << " when '" << *Val << "' == " << *LIC << "\n");
+
+  if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>())
+    SE->forgetLoop(L);
+
+  LoopBlocks.clear();
+  NewBlocks.clear();
+
+  // First step, split the preheader and exit blocks, and add these blocks to
+  // the LoopBlocks list.
+  BasicBlock *NewPreheader = SplitEdge(loopPreheader, loopHeader, this);
+  LoopBlocks.push_back(NewPreheader);
+
+  // We want the loop to come after the preheader, but before the exit blocks.
+  LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+
+  // Split all of the edges from inside the loop to their exit blocks.  Update
+  // the appropriate Phi nodes as we do so.
+  SplitExitEdges(L, ExitBlocks);
+
+  // The exit blocks may have been changed due to edge splitting, recompute.
+  ExitBlocks.clear();
+  L->getUniqueExitBlocks(ExitBlocks);
+
+  // Add exit blocks to the loop blocks.
+  LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());
+
+  // Next step, clone all of the basic blocks that make up the loop (including
+  // the loop preheader and exit blocks), keeping track of the mapping between
+  // the instructions and blocks.
+  NewBlocks.reserve(LoopBlocks.size());
+  ValueToValueMapTy VMap;
+  for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
+    BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F);
+
+    NewBlocks.push_back(NewBB);
+    VMap[LoopBlocks[i]] = NewBB;  // Keep the BB mapping.
+    LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L);
+  }
+
+  // Splice the newly inserted blocks into the function right before the
+  // original preheader.
+  F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(),
+                                NewBlocks[0], F->end());
+
+  // Now we create the new Loop object for the versioned loop.
+  Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
+
+  // Recalculate unswitching quota, inherit simplified switches info for NewBB,
+  // Probably clone more loop-unswitch related loop properties.
+  BranchesInfo.cloneData(NewLoop, L, VMap);
+
+  Loop *ParentLoop = L->getParentLoop();
+  if (ParentLoop) {
+    // Make sure to add the cloned preheader and exit blocks to the parent loop
+    // as well.
+    ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase());
+  }
+
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
+    // The new exit block should be in the same loop as the old one.
+    if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
+      ExitBBLoop->addBasicBlockToLoop(NewExit, LI->getBase());
+
+    assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+           "Exit block should have been split to have one successor!");
+    BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+    // If the successor of the exit block had PHI nodes, add an entry for
+    // NewExit.
+    PHINode *PN;
+    for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) {
+      PN = cast<PHINode>(I);
+      Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
+      ValueToValueMapTy::iterator It = VMap.find(V);
+      if (It != VMap.end()) V = It->second;
+      PN->addIncoming(V, NewExit);
+    }
+
+    if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
+      PN = PHINode::Create(LPad->getType(), 0, "",
+                           ExitSucc->getFirstInsertionPt());
+
+      for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
+           I != E; ++I) {
+        BasicBlock *BB = *I;
+        LandingPadInst *LPI = BB->getLandingPadInst();
+        LPI->replaceAllUsesWith(PN);
+        PN->addIncoming(LPI, BB);
+      }
+    }
+  }
+
+  // Rewrite the code to refer to itself.
+  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+    for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+           E = NewBlocks[i]->end(); I != E; ++I)
+      RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
+
+  // Rewrite the original preheader to select between versions of the loop.
+  BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
+  assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+         "Preheader splitting did not work correctly!");
+
+  // Emit the new branch that selects between the two versions of this loop.
+  EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR);
+  LPM->deleteSimpleAnalysisValue(OldBR, L);
+  OldBR->eraseFromParent();
+
+  LoopProcessWorklist.push_back(NewLoop);
+  redoLoop = true;
+
+  // Keep a WeakVH holding onto LIC.  If the first call to RewriteLoopBody
+  // deletes the instruction (for example by simplifying a PHI that feeds into
+  // the condition that we're unswitching on), we don't rewrite the second
+  // iteration.
+  WeakVH LICHandle(LIC);
+
+  // Now we rewrite the original code to know that the condition is true and the
+  // new code to know that the condition is false.
+  RewriteLoopBodyWithConditionConstant(L, LIC, Val, false);
+
+  // It's possible that simplifying one loop could cause the other to be
+  // changed to another value or a constant.  If its a constant, don't simplify
+  // it.
+  if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop &&
+      LICHandle && !isa<Constant>(LICHandle))
+    RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true);
+}
+
+/// RemoveFromWorklist - Remove all instances of I from the worklist vector
+/// specified.
+static void RemoveFromWorklist(Instruction *I,
+                               std::vector<Instruction*> &Worklist) {
+
+  Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I),
+                 Worklist.end());
+}
+
+/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the
+/// program, replacing all uses with V and update the worklist.
+static void ReplaceUsesOfWith(Instruction *I, Value *V,
+                              std::vector<Instruction*> &Worklist,
+                              Loop *L, LPPassManager *LPM) {
+  DEBUG(dbgs() << "Replace with '" << *V << "': " << *I);
+
+  // Add uses to the worklist, which may be dead now.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+      Worklist.push_back(Use);
+
+  // Add users to the worklist which may be simplified now.
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI)
+    Worklist.push_back(cast<Instruction>(*UI));
+  LPM->deleteSimpleAnalysisValue(I, L);
+  RemoveFromWorklist(I, Worklist);
+  I->replaceAllUsesWith(V);
+  I->eraseFromParent();
+  ++NumSimplify;
+}
+
+/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop
+/// information, and remove any dead successors it has.
+///
+void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB,
+                                     std::vector<Instruction*> &Worklist,
+                                     Loop *L) {
+  if (pred_begin(BB) != pred_end(BB)) {
+    // This block isn't dead, since an edge to BB was just removed, see if there
+    // are any easy simplifications we can do now.
+    if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+      // If it has one pred, fold phi nodes in BB.
+      while (isa<PHINode>(BB->begin()))
+        ReplaceUsesOfWith(BB->begin(),
+                          cast<PHINode>(BB->begin())->getIncomingValue(0),
+                          Worklist, L, LPM);
+
+      // If this is the header of a loop and the only pred is the latch, we now
+      // have an unreachable loop.
+      if (Loop *L = LI->getLoopFor(BB))
+        if (loopHeader == BB && L->contains(Pred)) {
+          // Remove the branch from the latch to the header block, this makes
+          // the header dead, which will make the latch dead (because the header
+          // dominates the latch).
+          LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L);
+          Pred->getTerminator()->eraseFromParent();
+          new UnreachableInst(BB->getContext(), Pred);
+
+          // The loop is now broken, remove it from LI.
+          RemoveLoopFromHierarchy(L);
+
+          // Reprocess the header, which now IS dead.
+          RemoveBlockIfDead(BB, Worklist, L);
+          return;
+        }
+
+      // If pred ends in a uncond branch, add uncond branch to worklist so that
+      // the two blocks will get merged.
+      if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
+        if (BI->isUnconditional())
+          Worklist.push_back(BI);
+    }
+    return;
+  }
+
+  DEBUG(dbgs() << "Nuking dead block: " << *BB);
+
+  // Remove the instructions in the basic block from the worklist.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    RemoveFromWorklist(I, Worklist);
+
+    // Anything that uses the instructions in this basic block should have their
+    // uses replaced with undefs.
+    // If I is not void type then replaceAllUsesWith undef.
+    // This allows ValueHandlers and custom metadata to adjust itself.
+    if (!I->getType()->isVoidTy())
+      I->replaceAllUsesWith(UndefValue::get(I->getType()));
+  }
+
+  // If this is the edge to the header block for a loop, remove the loop and
+  // promote all subloops.
+  if (Loop *BBLoop = LI->getLoopFor(BB)) {
+    if (BBLoop->getLoopLatch() == BB) {
+      RemoveLoopFromHierarchy(BBLoop);
+      if (currentLoop == BBLoop) {
+        currentLoop = 0;
+        redoLoop = false;
+      }
+    }
+  }
+
+  // Remove the block from the loop info, which removes it from any loops it
+  // was in.
+  LI->removeBlock(BB);
+
+
+  // Remove phi node entries in successors for this block.
+  TerminatorInst *TI = BB->getTerminator();
+  SmallVector<BasicBlock*, 4> Succs;
+  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+    Succs.push_back(TI->getSuccessor(i));
+    TI->getSuccessor(i)->removePredecessor(BB);
+  }
+
+  // Unique the successors, remove anything with multiple uses.
+  array_pod_sort(Succs.begin(), Succs.end());
+  Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end());
+
+  // Remove the basic block, including all of the instructions contained in it.
+  LPM->deleteSimpleAnalysisValue(BB, L);
+  BB->eraseFromParent();
+  // Remove successor blocks here that are not dead, so that we know we only
+  // have dead blocks in this list.  Nondead blocks have a way of becoming dead,
+  // then getting removed before we revisit them, which is badness.
+  //
+  for (unsigned i = 0; i != Succs.size(); ++i)
+    if (pred_begin(Succs[i]) != pred_end(Succs[i])) {
+      // One exception is loop headers.  If this block was the preheader for a
+      // loop, then we DO want to visit the loop so the loop gets deleted.
+      // We know that if the successor is a loop header, that this loop had to
+      // be the preheader: the case where this was the latch block was handled
+      // above and headers can only have two predecessors.
+      if (!LI->isLoopHeader(Succs[i])) {
+        Succs.erase(Succs.begin()+i);
+        --i;
+      }
+    }
+
+  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+    RemoveBlockIfDead(Succs[i], Worklist, L);
+}
+
+/// RemoveLoopFromHierarchy - We have discovered that the specified loop has
+/// become unwrapped, either because the backedge was deleted, or because the
+/// edge into the header was removed.  If the edge into the header from the
+/// latch block was removed, the loop is unwrapped but subloops are still alive,
+/// so they just reparent loops.  If the loops are actually dead, they will be
+/// removed later.
+void LoopUnswitch::RemoveLoopFromHierarchy(Loop *L) {
+  LPM->deleteLoopFromQueue(L);
+  RemoveLoopFromWorklist(L);
+}
+
+// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has
+// the value specified by Val in the specified loop, or we know it does NOT have
+// that value.  Rewrite any uses of LIC or of properties correlated to it.
+void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+                                                        Constant *Val,
+                                                        bool IsEqual) {
+  assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+  // FIXME: Support correlated properties, like:
+  //  for (...)
+  //    if (li1 < li2)
+  //      ...
+  //    if (li1 > li2)
+  //      ...
+
+  // FOLD boolean conditions (X|LIC), (X&LIC).  Fold conditional branches,
+  // selects, switches.
+  std::vector<Instruction*> Worklist;
+  LLVMContext &Context = Val->getContext();
+
+
+  // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
+  // in the loop with the appropriate one directly.
+  if (IsEqual || (isa<ConstantInt>(Val) &&
+      Val->getType()->isIntegerTy(1))) {
+    Value *Replacement;
+    if (IsEqual)
+      Replacement = Val;
+    else
+      Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
+                                     !cast<ConstantInt>(Val)->getZExtValue());
+
+    for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end();
+         UI != E; ++UI) {
+      Instruction *U = dyn_cast<Instruction>(*UI);
+      if (!U || !L->contains(U))
+        continue;
+      Worklist.push_back(U);
+    }
+
+    for (std::vector<Instruction*>::iterator UI = Worklist.begin();
+         UI != Worklist.end(); ++UI)
+      (*UI)->replaceUsesOfWith(LIC, Replacement);
+
+    SimplifyCode(Worklist, L);
+    return;
+  }
+
+  // Otherwise, we don't know the precise value of LIC, but we do know that it
+  // is certainly NOT "Val".  As such, simplify any uses in the loop that we
+  // can.  This case occurs when we unswitch switch statements.
+  for (Value::use_iterator UI = LIC->use_begin(), E = LIC->use_end();
+       UI != E; ++UI) {
+    Instruction *U = dyn_cast<Instruction>(*UI);
+    if (!U || !L->contains(U))
+      continue;
+
+    Worklist.push_back(U);
+
+    // TODO: We could do other simplifications, for example, turning
+    // 'icmp eq LIC, Val' -> false.
+
+    // If we know that LIC is not Val, use this info to simplify code.
+    SwitchInst *SI = dyn_cast<SwitchInst>(U);
+    if (SI == 0 || !isa<ConstantInt>(Val)) continue;
+
+    SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val));
+    // Default case is live for multiple values.
+    if (DeadCase == SI->case_default()) continue;
+
+    // Found a dead case value.  Don't remove PHI nodes in the
+    // successor if they become single-entry, those PHI nodes may
+    // be in the Users list.
+
+    BasicBlock *Switch = SI->getParent();
+    BasicBlock *SISucc = DeadCase.getCaseSuccessor();
+    BasicBlock *Latch = L->getLoopLatch();
+
+    BranchesInfo.setUnswitched(SI, Val);
+
+    if (!SI->findCaseDest(SISucc)) continue;  // Edge is critical.
+    // If the DeadCase successor dominates the loop latch, then the
+    // transformation isn't safe since it will delete the sole predecessor edge
+    // to the latch.
+    if (Latch && DT->dominates(SISucc, Latch))
+      continue;
+
+    // FIXME: This is a hack.  We need to keep the successor around
+    // and hooked up so as to preserve the loop structure, because
+    // trying to update it is complicated.  So instead we preserve the
+    // loop structure and put the block on a dead code path.
+    SplitEdge(Switch, SISucc, this);
+    // Compute the successors instead of relying on the return value
+    // of SplitEdge, since it may have split the switch successor
+    // after PHI nodes.
+    BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
+    BasicBlock *OldSISucc = *succ_begin(NewSISucc);
+    // Create an "unreachable" destination.
+    BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
+                                           Switch->getParent(),
+                                           OldSISucc);
+    new UnreachableInst(Context, Abort);
+    // Force the new case destination to branch to the "unreachable"
+    // block while maintaining a (dead) CFG edge to the old block.
+    NewSISucc->getTerminator()->eraseFromParent();
+    BranchInst::Create(Abort, OldSISucc,
+                       ConstantInt::getTrue(Context), NewSISucc);
+    // Release the PHI operands for this edge.
+    for (BasicBlock::iterator II = NewSISucc->begin();
+         PHINode *PN = dyn_cast<PHINode>(II); ++II)
+      PN->setIncomingValue(PN->getBasicBlockIndex(Switch),
+                           UndefValue::get(PN->getType()));
+    // Tell the domtree about the new block. We don't fully update the
+    // domtree here -- instead we force it to do a full recomputation
+    // after the pass is complete -- but we do need to inform it of
+    // new blocks.
+    if (DT)
+      DT->addNewBlock(Abort, NewSISucc);
+  }
+
+  SimplifyCode(Worklist, L);
+}
+
+/// SimplifyCode - Okay, now that we have simplified some instructions in the
+/// loop, walk over it and constant prop, dce, and fold control flow where
+/// possible.  Note that this is effectively a very simple loop-structure-aware
+/// optimizer.  During processing of this loop, L could very well be deleted, so
+/// it must not be used.
+///
+/// FIXME: When the loop optimizer is more mature, separate this out to a new
+/// pass.
+///
+void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+    Worklist.pop_back();
+
+    // Simple DCE.
+    if (isInstructionTriviallyDead(I)) {
+      DEBUG(dbgs() << "Remove dead instruction '" << *I);
+
+      // Add uses to the worklist, which may be dead now.
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+        if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+          Worklist.push_back(Use);
+      LPM->deleteSimpleAnalysisValue(I, L);
+      RemoveFromWorklist(I, Worklist);
+      I->eraseFromParent();
+      ++NumSimplify;
+      continue;
+    }
+
+    // See if instruction simplification can hack this up.  This is common for
+    // things like "select false, X, Y" after unswitching made the condition be
+    // 'false'.  TODO: update the domtree properly so we can pass it here.
+    if (Value *V = SimplifyInstruction(I))
+      if (LI->replacementPreservesLCSSAForm(I, V)) {
+        ReplaceUsesOfWith(I, V, Worklist, L, LPM);
+        continue;
+      }
+
+    // Special case hacks that appear commonly in unswitched code.
+    if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+      if (BI->isUnconditional()) {
+        // If BI's parent is the only pred of the successor, fold the two blocks
+        // together.
+        BasicBlock *Pred = BI->getParent();
+        BasicBlock *Succ = BI->getSuccessor(0);
+        BasicBlock *SinglePred = Succ->getSinglePredecessor();
+        if (!SinglePred) continue;  // Nothing to do.
+        assert(SinglePred == Pred && "CFG broken");
+
+        DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- "
+              << Succ->getName() << "\n");
+
+        // Resolve any single entry PHI nodes in Succ.
+        while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
+          ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM);
+
+        // If Succ has any successors with PHI nodes, update them to have
+        // entries coming from Pred instead of Succ.
+        Succ->replaceAllUsesWith(Pred);
+
+        // Move all of the successor contents from Succ to Pred.
+        Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(),
+                                   Succ->end());
+        LPM->deleteSimpleAnalysisValue(BI, L);
+        BI->eraseFromParent();
+        RemoveFromWorklist(BI, Worklist);
+
+        // Remove Succ from the loop tree.
+        LI->removeBlock(Succ);
+        LPM->deleteSimpleAnalysisValue(Succ, L);
+        Succ->eraseFromParent();
+        ++NumSimplify;
+        continue;
+      }
+
+      if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){
+        // Conditional branch.  Turn it into an unconditional branch, then
+        // remove dead blocks.
+        continue;  // FIXME: Enable.
+
+        DEBUG(dbgs() << "Folded branch: " << *BI);
+        BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue());
+        BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue());
+        DeadSucc->removePredecessor(BI->getParent(), true);
+        Worklist.push_back(BranchInst::Create(LiveSucc, BI));
+        LPM->deleteSimpleAnalysisValue(BI, L);
+        BI->eraseFromParent();
+        RemoveFromWorklist(BI, Worklist);
+        ++NumSimplify;
+
+        RemoveBlockIfDead(DeadSucc, Worklist, L);
+      }
+      continue;
+    }
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
new file mode 100644
index 000000000000..8ced4946c832
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -0,0 +1,142 @@
+//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers atomic intrinsics to non-atomic form for use in a known
+// non-preemptible environment.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loweratomic"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
+  IRBuilder<> Builder(CXI->getParent(), CXI);
+  Value *Ptr = CXI->getPointerOperand();
+  Value *Cmp = CXI->getCompareOperand();
+  Value *Val = CXI->getNewValOperand();
+
+  LoadInst *Orig = Builder.CreateLoad(Ptr);
+  Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+  Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+  Builder.CreateStore(Res, Ptr);
+
+  CXI->replaceAllUsesWith(Orig);
+  CXI->eraseFromParent();
+  return true;
+}
+
+static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
+  IRBuilder<> Builder(RMWI->getParent(), RMWI);
+  Value *Ptr = RMWI->getPointerOperand();
+  Value *Val = RMWI->getValOperand();
+
+  LoadInst *Orig = Builder.CreateLoad(Ptr);
+  Value *Res = NULL;
+
+  switch (RMWI->getOperation()) {
+  default: llvm_unreachable("Unexpected RMW operation");
+  case AtomicRMWInst::Xchg:
+    Res = Val;
+    break;
+  case AtomicRMWInst::Add:
+    Res = Builder.CreateAdd(Orig, Val);
+    break;
+  case AtomicRMWInst::Sub:
+    Res = Builder.CreateSub(Orig, Val);
+    break;
+  case AtomicRMWInst::And:
+    Res = Builder.CreateAnd(Orig, Val);
+    break;
+  case AtomicRMWInst::Nand:
+    Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
+    break;
+  case AtomicRMWInst::Or:
+    Res = Builder.CreateOr(Orig, Val);
+    break;
+  case AtomicRMWInst::Xor:
+    Res = Builder.CreateXor(Orig, Val);
+    break;
+  case AtomicRMWInst::Max:
+    Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+                               Val, Orig);
+    break;
+  case AtomicRMWInst::Min:
+    Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+                               Orig, Val);
+    break;
+  case AtomicRMWInst::UMax:
+    Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+                               Val, Orig);
+    break;
+  case AtomicRMWInst::UMin:
+    Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+                               Orig, Val);
+    break;
+  }
+  Builder.CreateStore(Res, Ptr);
+  RMWI->replaceAllUsesWith(Orig);
+  RMWI->eraseFromParent();
+  return true;
+}
+
+static bool LowerFenceInst(FenceInst *FI) {
+  FI->eraseFromParent();
+  return true;
+}
+
+static bool LowerLoadInst(LoadInst *LI) {
+  LI->setAtomic(NotAtomic);
+  return true;
+}
+
+static bool LowerStoreInst(StoreInst *SI) {
+  SI->setAtomic(NotAtomic);
+  return true;
+}
+
+namespace {
+  struct LowerAtomic : public BasicBlockPass {
+    static char ID;
+    LowerAtomic() : BasicBlockPass(ID) {
+      initializeLowerAtomicPass(*PassRegistry::getPassRegistry());
+    }
+    bool runOnBasicBlock(BasicBlock &BB) {
+      bool Changed = false;
+      for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) {
+        Instruction *Inst = DI++;
+        if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+          Changed |= LowerFenceInst(FI);
+        else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst))
+          Changed |= LowerAtomicCmpXchgInst(CXI);
+        else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst))
+          Changed |= LowerAtomicRMWInst(RMWI);
+        else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+          if (LI->isAtomic())
+            LowerLoadInst(LI);
+        } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+          if (SI->isAtomic())
+            LowerStoreInst(SI);
+        }
+      }
+      return Changed;
+    }
+  };
+}
+
+char LowerAtomic::ID = 0;
+INITIALIZE_PASS(LowerAtomic, "loweratomic",
+                "Lower atomic intrinsics to non-atomic form",
+                false, false)
+
+Pass *llvm::createLowerAtomicPass() { return new LowerAtomic(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
new file mode 100644
index 000000000000..be0f0e8a25f6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -0,0 +1,1020 @@
+//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs various transformations related to eliminating memcpy
+// calls, or transforming sets of stores into memset's.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "memcpyopt"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+using namespace llvm;
+
+STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemSetInfer, "Number of memsets inferred");
+STATISTIC(NumMoveToCpy,   "Number of memmoves converted to memcpy");
+STATISTIC(NumCpyToSet,    "Number of memcpys converted to memset");
+
+static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
+                                  bool &VariableIdxFound, const DataLayout &TD){
+  // Skip over the first indices.
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned i = 1; i != Idx; ++i, ++GTI)
+    /*skip along*/;
+
+  // Compute the offset implied by the rest of the indices.
+  int64_t Offset = 0;
+  for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
+    ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (OpC == 0)
+      return VariableIdxFound = true;
+    if (OpC->isZero()) continue;  // No offset.
+
+    // Handle struct indices, which add their field offset to the pointer.
+    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+      Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
+      continue;
+    }
+
+    // Otherwise, we have a sequential type like an array or vector.  Multiply
+    // the index by the ElementSize.
+    uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
+    Offset += Size*OpC->getSExtValue();
+  }
+
+  return Offset;
+}
+
+/// IsPointerOffset - Return true if Ptr1 is provably equal to Ptr2 plus a
+/// constant offset, and return that constant offset.  For example, Ptr1 might
+/// be &A[42], and Ptr2 might be &A[40].  In this case offset would be -8.
+static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
+                            const DataLayout &TD) {
+  Ptr1 = Ptr1->stripPointerCasts();
+  Ptr2 = Ptr2->stripPointerCasts();
+  GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
+  GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
+
+  bool VariableIdxFound = false;
+
+  // If one pointer is a GEP and the other isn't, then see if the GEP is a
+  // constant offset from the base, as in "P" and "gep P, 1".
+  if (GEP1 && GEP2 == 0 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) {
+    Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD);
+    return !VariableIdxFound;
+  }
+
+  if (GEP2 && GEP1 == 0 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) {
+    Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD);
+    return !VariableIdxFound;
+  }
+
+  // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical
+  // base.  After that base, they may have some number of common (and
+  // potentially variable) indices.  After that they handle some constant
+  // offset, which determines their offset from each other.  At this point, we
+  // handle no other case.
+  if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0))
+    return false;
+
+  // Skip any common indices and track the GEP types.
+  unsigned Idx = 1;
+  for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx)
+    if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx))
+      break;
+
+  int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD);
+  int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD);
+  if (VariableIdxFound) return false;
+
+  Offset = Offset2-Offset1;
+  return true;
+}
+
+
+/// MemsetRange - Represents a range of memset'd bytes with the ByteVal value.
+/// This allows us to analyze stores like:
+///   store 0 -> P+1
+///   store 0 -> P+0
+///   store 0 -> P+3
+///   store 0 -> P+2
+/// which sometimes happens with stores to arrays of structs etc.  When we see
+/// the first store, we make a range [1, 2).  The second store extends the range
+/// to [0, 2).  The third makes a new range [2, 3).  The fourth store joins the
+/// two ranges into [0, 3) which is memset'able.
+namespace {
+struct MemsetRange {
+  // Start/End - A semi range that describes the span that this range covers.
+  // The range is closed at the start and open at the end: [Start, End).
+  int64_t Start, End;
+
+  /// StartPtr - The getelementptr instruction that points to the start of the
+  /// range.
+  Value *StartPtr;
+
+  /// Alignment - The known alignment of the first store.
+  unsigned Alignment;
+
+  /// TheStores - The actual stores that make up this range.
+  SmallVector<Instruction*, 16> TheStores;
+
+  bool isProfitableToUseMemset(const DataLayout &TD) const;
+
+};
+} // end anon namespace
+
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const {
+  // If we found more than 4 stores to merge or 16 bytes, use memset.
+  if (TheStores.size() >= 4 || End-Start >= 16) return true;
+
+  // If there is nothing to merge, don't do anything.
+  if (TheStores.size() < 2) return false;
+
+  // If any of the stores are a memset, then it is always good to extend the
+  // memset.
+  for (unsigned i = 0, e = TheStores.size(); i != e; ++i)
+    if (!isa<StoreInst>(TheStores[i]))
+      return true;
+
+  // Assume that the code generator is capable of merging pairs of stores
+  // together if it wants to.
+  if (TheStores.size() == 2) return false;
+
+  // If we have fewer than 8 stores, it can still be worthwhile to do this.
+  // For example, merging 4 i8 stores into an i32 store is useful almost always.
+  // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
+  // memset will be split into 2 32-bit stores anyway) and doing so can
+  // pessimize the llvm optimizer.
+  //
+  // Since we don't have perfect knowledge here, make some assumptions: assume
+  // the maximum GPR width is the same size as the pointer size and assume that
+  // this width can be stored.  If so, check to see whether we will end up
+  // actually reducing the number of stores used.
+  unsigned Bytes = unsigned(End-Start);
+  unsigned NumPointerStores = Bytes/TD.getPointerSize();
+
+  // Assume the remaining bytes if any are done a byte at a time.
+  unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
+
+  // If we will reduce the # stores (according to this heuristic), do the
+  // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+  // etc.
+  return TheStores.size() > NumPointerStores+NumByteStores;
+}
+
+
+namespace {
+class MemsetRanges {
+  /// Ranges - A sorted list of the memset ranges.  We use std::list here
+  /// because each element is relatively large and expensive to copy.
+  std::list<MemsetRange> Ranges;
+  typedef std::list<MemsetRange>::iterator range_iterator;
+  const DataLayout &TD;
+public:
+  MemsetRanges(const DataLayout &td) : TD(td) {}
+
+  typedef std::list<MemsetRange>::const_iterator const_iterator;
+  const_iterator begin() const { return Ranges.begin(); }
+  const_iterator end() const { return Ranges.end(); }
+  bool empty() const { return Ranges.empty(); }
+
+  void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+      addStore(OffsetFromFirst, SI);
+    else
+      addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
+  }
+
+  void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
+    int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType());
+
+    addRange(OffsetFromFirst, StoreSize,
+             SI->getPointerOperand(), SI->getAlignment(), SI);
+  }
+
+  void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
+    int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+    addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+  }
+
+  void addRange(int64_t Start, int64_t Size, Value *Ptr,
+                unsigned Alignment, Instruction *Inst);
+
+};
+
+} // end anon namespace
+
+
+/// addRange - Add a new store to the MemsetRanges data structure.  This adds a
+/// new range for the specified store at the specified offset, merging into
+/// existing ranges as appropriate.
+///
+/// Do a linear search of the ranges to see if this can be joined and/or to
+/// find the insertion point in the list.  We keep the ranges sorted for
+/// simplicity here.  This is a linear search of a linked list, which is ugly,
+/// however the number of ranges is limited, so this won't get crazy slow.
+void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
+                            unsigned Alignment, Instruction *Inst) {
+  int64_t End = Start+Size;
+  range_iterator I = Ranges.begin(), E = Ranges.end();
+
+  while (I != E && Start > I->End)
+    ++I;
+
+  // We now know that I == E, in which case we didn't find anything to merge
+  // with, or that Start <= I->End.  If End < I->Start or I == E, then we need
+  // to insert a new range.  Handle this now.
+  if (I == E || End < I->Start) {
+    MemsetRange &R = *Ranges.insert(I, MemsetRange());
+    R.Start        = Start;
+    R.End          = End;
+    R.StartPtr     = Ptr;
+    R.Alignment    = Alignment;
+    R.TheStores.push_back(Inst);
+    return;
+  }
+
+  // This store overlaps with I, add it.
+  I->TheStores.push_back(Inst);
+
+  // At this point, we may have an interval that completely contains our store.
+  // If so, just add it to the interval and return.
+  if (I->Start <= Start && I->End >= End)
+    return;
+
+  // Now we know that Start <= I->End and End >= I->Start so the range overlaps
+  // but is not entirely contained within the range.
+
+  // See if the range extends the start of the range.  In this case, it couldn't
+  // possibly cause it to join the prior range, because otherwise we would have
+  // stopped on *it*.
+  if (Start < I->Start) {
+    I->Start = Start;
+    I->StartPtr = Ptr;
+    I->Alignment = Alignment;
+  }
+
+  // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
+  // is in or right at the end of I), and that End >= I->Start.  Extend I out to
+  // End.
+  if (End > I->End) {
+    I->End = End;
+    range_iterator NextI = I;
+    while (++NextI != E && End >= NextI->Start) {
+      // Merge the range in.
+      I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
+      if (NextI->End > I->End)
+        I->End = NextI->End;
+      Ranges.erase(NextI);
+      NextI = I;
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                         MemCpyOpt Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+  class MemCpyOpt : public FunctionPass {
+    MemoryDependenceAnalysis *MD;
+    TargetLibraryInfo *TLI;
+    const DataLayout *TD;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    MemCpyOpt() : FunctionPass(ID) {
+      initializeMemCpyOptPass(*PassRegistry::getPassRegistry());
+      MD = 0;
+      TLI = 0;
+      TD = 0;
+    }
+
+    bool runOnFunction(Function &F);
+
+  private:
+    // This transformation requires dominator postdominator info
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<TargetLibraryInfo>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved<MemoryDependenceAnalysis>();
+    }
+
+    // Helper fuctions
+    bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
+    bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
+    bool processMemCpy(MemCpyInst *M);
+    bool processMemMove(MemMoveInst *M);
+    bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
+                              uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
+    bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
+                                       uint64_t MSize);
+    bool processByValArgument(CallSite CS, unsigned ArgNo);
+    Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
+                                      Value *ByteVal);
+
+    bool iterateOnFunction(Function &F);
+  };
+
+  char MemCpyOpt::ID = 0;
+}
+
+// createMemCpyOptPass - The public interface to this file...
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); }
+
+INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
+                    false, false)
+
+/// tryMergingIntoMemset - When scanning forward over instructions, we look for
+/// some other patterns to fold away.  In particular, this looks for stores to
+/// neighboring locations of memory.  If it sees enough consecutive ones, it
+/// attempts to merge them together into a memcpy/memset.
+Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
+                                             Value *StartPtr, Value *ByteVal) {
+  if (TD == 0) return 0;
+
+  // Okay, so we now have a single store that can be splatable.  Scan to find
+  // all subsequent stores of the same value to offset from the same pointer.
+  // Join these together into ranges, so we can decide whether contiguous blocks
+  // are stored.
+  MemsetRanges Ranges(*TD);
+
+  BasicBlock::iterator BI = StartInst;
+  for (++BI; !isa<TerminatorInst>(BI); ++BI) {
+    if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+      // If the instruction is readnone, ignore it, otherwise bail out.  We
+      // don't even allow readonly here because we don't want something like:
+      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+        break;
+      continue;
+    }
+
+    if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+      // If this is a store, see if we can merge it in.
+      if (!NextStore->isSimple()) break;
+
+      // Check to see if this stored value is of the same byte-splattable value.
+      if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
+        break;
+
+      // Check to see if this store is to a constant offset from the start ptr.
+      int64_t Offset;
+      if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(),
+                           Offset, *TD))
+        break;
+
+      Ranges.addStore(Offset, NextStore);
+    } else {
+      MemSetInst *MSI = cast<MemSetInst>(BI);
+
+      if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
+          !isa<ConstantInt>(MSI->getLength()))
+        break;
+
+      // Check to see if this store is to a constant offset from the start ptr.
+      int64_t Offset;
+      if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD))
+        break;
+
+      Ranges.addMemSet(Offset, MSI);
+    }
+  }
+
+  // If we have no ranges, then we just had a single store with nothing that
+  // could be merged in.  This is a very common case of course.
+  if (Ranges.empty())
+    return 0;
+
+  // If we had at least one store that could be merged in, add the starting
+  // store as well.  We try to avoid this unless there is at least something
+  // interesting as a small compile-time optimization.
+  Ranges.addInst(0, StartInst);
+
+  // If we create any memsets, we put it right before the first instruction that
+  // isn't part of the memset block.  This ensure that the memset is dominated
+  // by any addressing instruction needed by the start of the block.
+  IRBuilder<> Builder(BI);
+
+  // Now that we have full information about ranges, loop over the ranges and
+  // emit memset's for anything big enough to be worthwhile.
+  Instruction *AMemSet = 0;
+  for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
+       I != E; ++I) {
+    const MemsetRange &Range = *I;
+
+    if (Range.TheStores.size() == 1) continue;
+
+    // If it is profitable to lower this range to memset, do so now.
+    if (!Range.isProfitableToUseMemset(*TD))
+      continue;
+
+    // Otherwise, we do want to transform this!  Create a new memset.
+    // Get the starting pointer of the block.
+    StartPtr = Range.StartPtr;
+
+    // Determine alignment
+    unsigned Alignment = Range.Alignment;
+    if (Alignment == 0) {
+      Type *EltType =
+        cast<PointerType>(StartPtr->getType())->getElementType();
+      Alignment = TD->getABITypeAlignment(EltType);
+    }
+
+    AMemSet =
+      Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+
+    DEBUG(dbgs() << "Replace stores:\n";
+          for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i)
+            dbgs() << *Range.TheStores[i] << '\n';
+          dbgs() << "With: " << *AMemSet << '\n');
+
+    if (!Range.TheStores.empty())
+      AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
+    // Zap all the stores.
+    for (SmallVector<Instruction*, 16>::const_iterator
+         SI = Range.TheStores.begin(),
+         SE = Range.TheStores.end(); SI != SE; ++SI) {
+      MD->removeInstruction(*SI);
+      (*SI)->eraseFromParent();
+    }
+    ++NumMemSetInfer;
+  }
+
+  return AMemSet;
+}
+
+
+bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
+  if (!SI->isSimple()) return false;
+
+  if (TD == 0) return false;
+
+  // Detect cases where we're performing call slot forwarding, but
+  // happen to be using a load-store pair to implement it, rather than
+  // a memcpy.
+  if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
+    if (LI->isSimple() && LI->hasOneUse() &&
+        LI->getParent() == SI->getParent()) {
+      MemDepResult ldep = MD->getDependency(LI);
+      CallInst *C = 0;
+      if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
+        C = dyn_cast<CallInst>(ldep.getInst());
+
+      if (C) {
+        // Check that nothing touches the dest of the "copy" between
+        // the call and the store.
+        AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+        AliasAnalysis::Location StoreLoc = AA.getLocation(SI);
+        for (BasicBlock::iterator I = --BasicBlock::iterator(SI),
+                                  E = C; I != E; --I) {
+          if (AA.getModRefInfo(&*I, StoreLoc) != AliasAnalysis::NoModRef) {
+            C = 0;
+            break;
+          }
+        }
+      }
+
+      if (C) {
+        unsigned storeAlign = SI->getAlignment();
+        if (!storeAlign)
+          storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType());
+        unsigned loadAlign = LI->getAlignment();
+        if (!loadAlign)
+          loadAlign = TD->getABITypeAlignment(LI->getType());
+
+        bool changed = performCallSlotOptzn(LI,
+                        SI->getPointerOperand()->stripPointerCasts(),
+                        LI->getPointerOperand()->stripPointerCasts(),
+                        TD->getTypeStoreSize(SI->getOperand(0)->getType()),
+                        std::min(storeAlign, loadAlign), C);
+        if (changed) {
+          MD->removeInstruction(SI);
+          SI->eraseFromParent();
+          MD->removeInstruction(LI);
+          LI->eraseFromParent();
+          ++NumMemCpyInstr;
+          return true;
+        }
+      }
+    }
+  }
+
+  // There are two cases that are interesting for this code to handle: memcpy
+  // and memset.  Right now we only handle memset.
+
+  // Ensure that the value being stored is something that can be memset'able a
+  // byte at a time like "0" or "-1" or any width, as well as things like
+  // 0xA0A0A0A0 and 0.0.
+  if (Value *ByteVal = isBytewiseValue(SI->getOperand(0)))
+    if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
+                                              ByteVal)) {
+      BBI = I;  // Don't invalidate iterator.
+      return true;
+    }
+
+  return false;
+}
+
+bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
+  // See if there is another memset or store neighboring this memset which
+  // allows us to widen out the memset to do a single larger store.
+  if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
+    if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
+                                              MSI->getValue())) {
+      BBI = I;  // Don't invalidate iterator.
+      return true;
+    }
+  return false;
+}
+
+
+/// performCallSlotOptzn - takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a call slot optimization by having
+/// the call write its result directly into the destination of the memcpy.
+bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
+                                     Value *cpyDest, Value *cpySrc,
+                                     uint64_t cpyLen, unsigned cpyAlign,
+                                     CallInst *C) {
+  // The general transformation to keep in mind is
+  //
+  //   call @func(..., src, ...)
+  //   memcpy(dest, src, ...)
+  //
+  // ->
+  //
+  //   memcpy(dest, src, ...)
+  //   call @func(..., dest, ...)
+  //
+  // Since moving the memcpy is technically awkward, we additionally check that
+  // src only holds uninitialized values at the moment of the call, meaning that
+  // the memcpy can be discarded rather than moved.
+
+  // Deliberately get the source and destination with bitcasts stripped away,
+  // because we'll need to do type comparisons based on the underlying type.
+  CallSite CS(C);
+
+  // Require that src be an alloca.  This simplifies the reasoning considerably.
+  AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+  if (!srcAlloca)
+    return false;
+
+  // Check that all of src is copied to dest.
+  if (TD == 0) return false;
+
+  ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+  if (!srcArraySize)
+    return false;
+
+  uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) *
+    srcArraySize->getZExtValue();
+
+  if (cpyLen < srcSize)
+    return false;
+
+  // Check that accessing the first srcSize bytes of dest will not cause a
+  // trap.  Otherwise the transform is invalid since it might cause a trap
+  // to occur earlier than it otherwise would.
+  if (AllocaInst *A = dyn_cast<AllocaInst>(cpyDest)) {
+    // The destination is an alloca.  Check it is larger than srcSize.
+    ConstantInt *destArraySize = dyn_cast<ConstantInt>(A->getArraySize());
+    if (!destArraySize)
+      return false;
+
+    uint64_t destSize = TD->getTypeAllocSize(A->getAllocatedType()) *
+      destArraySize->getZExtValue();
+
+    if (destSize < srcSize)
+      return false;
+  } else if (Argument *A = dyn_cast<Argument>(cpyDest)) {
+    // If the destination is an sret parameter then only accesses that are
+    // outside of the returned struct type can trap.
+    if (!A->hasStructRetAttr())
+      return false;
+
+    Type *StructTy = cast<PointerType>(A->getType())->getElementType();
+    uint64_t destSize = TD->getTypeAllocSize(StructTy);
+
+    if (destSize < srcSize)
+      return false;
+  } else {
+    return false;
+  }
+
+  // Check that dest points to memory that is at least as aligned as src.
+  unsigned srcAlign = srcAlloca->getAlignment();
+  if (!srcAlign)
+    srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType());
+  bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+  // If dest is not aligned enough and we can't increase its alignment then
+  // bail out.
+  if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+    return false;
+
+  // Check that src is not accessed except via the call and the memcpy.  This
+  // guarantees that it holds only undefined values when passed in (so the final
+  // memcpy can be dropped), that it is not read or written between the call and
+  // the memcpy, and that writing beyond the end of it is undefined.
+  SmallVector<User*, 8> srcUseList(srcAlloca->use_begin(),
+                                   srcAlloca->use_end());
+  while (!srcUseList.empty()) {
+    User *UI = srcUseList.pop_back_val();
+
+    if (isa<BitCastInst>(UI)) {
+      for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
+           I != E; ++I)
+        srcUseList.push_back(*I);
+    } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(UI)) {
+      if (G->hasAllZeroIndices())
+        for (User::use_iterator I = UI->use_begin(), E = UI->use_end();
+             I != E; ++I)
+          srcUseList.push_back(*I);
+      else
+        return false;
+    } else if (UI != C && UI != cpy) {
+      return false;
+    }
+  }
+
+  // Since we're changing the parameter to the callsite, we need to make sure
+  // that what would be the new parameter dominates the callsite.
+  DominatorTree &DT = getAnalysis<DominatorTree>();
+  if (Instruction *cpyDestInst = dyn_cast<Instruction>(cpyDest))
+    if (!DT.dominates(cpyDestInst, C))
+      return false;
+
+  // In addition to knowing that the call does not access src in some
+  // unexpected manner, for example via a global, which we deduce from
+  // the use analysis, we also need to know that it does not sneakily
+  // access dest.  We rely on AA to figure this out for us.
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize);
+  // If necessary, perform additional analysis.
+  if (MR != AliasAnalysis::NoModRef)
+    MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT);
+  if (MR != AliasAnalysis::NoModRef)
+    return false;
+
+  // All the checks have passed, so do the transformation.
+  bool changedArgument = false;
+  for (unsigned i = 0; i < CS.arg_size(); ++i)
+    if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
+      Value *Dest = cpySrc->getType() == cpyDest->getType() ?  cpyDest
+        : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+                                      cpyDest->getName(), C);
+      changedArgument = true;
+      if (CS.getArgument(i)->getType() == Dest->getType())
+        CS.setArgument(i, Dest);
+      else
+        CS.setArgument(i, CastInst::CreatePointerCast(Dest,
+                          CS.getArgument(i)->getType(), Dest->getName(), C));
+    }
+
+  if (!changedArgument)
+    return false;
+
+  // If the destination wasn't sufficiently aligned then increase its alignment.
+  if (!isDestSufficientlyAligned) {
+    assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+    cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+  }
+
+  // Drop any cached information about the call, because we may have changed
+  // its dependence information by changing its parameter.
+  MD->removeInstruction(C);
+
+  // Remove the memcpy.
+  MD->removeInstruction(cpy);
+  ++NumMemCpyInstr;
+
+  return true;
+}
+
+/// processMemCpyMemCpyDependence - We've found that the (upward scanning)
+/// memory dependence of memcpy 'M' is the memcpy 'MDep'.  Try to simplify M to
+/// copy from MDep's input if we can.  MSize is the size of M's copy.
+///
+bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
+                                              uint64_t MSize) {
+  // We can only transforms memcpy's where the dest of one is the source of the
+  // other.
+  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+    return false;
+
+  // If dep instruction is reading from our current input, then it is a noop
+  // transfer and substituting the input won't change this instruction.  Just
+  // ignore the input and let someone else zap MDep.  This handles cases like:
+  //    memcpy(a <- a)
+  //    memcpy(b <- a)
+  if (M->getSource() == MDep->getSource())
+    return false;
+
+  // Second, the length of the memcpy's must be the same, or the preceding one
+  // must be larger than the following one.
+  ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+  ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+  if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
+    return false;
+
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  // Verify that the copied-from memory doesn't change in between the two
+  // transfers.  For example, in:
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    memcpy(c <- a)
+  // It would be invalid to transform the second memcpy into memcpy(c <- b).
+  //
+  // TODO: If the code between M and MDep is transparent to the destination "c",
+  // then we could still perform the xform by moving M up to the first memcpy.
+  //
+  // NOTE: This is conservative, it will stop on any read from the source loc,
+  // not just the defining memcpy.
+  MemDepResult SourceDep =
+    MD->getPointerDependencyFrom(AA.getLocationForSource(MDep),
+                                 false, M, M->getParent());
+  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+    return false;
+
+  // If the dest of the second might alias the source of the first, then the
+  // source and dest might overlap.  We still want to eliminate the intermediate
+  // value, but we have to generate a memmove instead of memcpy.
+  bool UseMemMove = false;
+  if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep)))
+    UseMemMove = true;
+
+  // If all checks passed, then we can transform M.
+
+  // Make sure to use the lesser of the alignment of the source and the dest
+  // since we're changing where we're reading from, but don't want to increase
+  // the alignment past what can be read from or written to.
+  // TODO: Is this worth it if we're creating a less aligned memcpy? For
+  // example we could be moving from movaps -> movq on x86.
+  unsigned Align = std::min(MDep->getAlignment(), M->getAlignment());
+
+  IRBuilder<> Builder(M);
+  if (UseMemMove)
+    Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+                          Align, M->isVolatile());
+  else
+    Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(),
+                         Align, M->isVolatile());
+
+  // Remove the instruction we're replacing.
+  MD->removeInstruction(M);
+  M->eraseFromParent();
+  ++NumMemCpyInstr;
+  return true;
+}
+
+
+/// processMemCpy - perform simplification of memcpy's.  If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances). This allows later passes to remove the first memcpy
+/// altogether.
+bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
+  // We can only optimize statically-sized memcpy's that are non-volatile.
+  ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
+  if (CopySize == 0 || M->isVolatile()) return false;
+
+  // If the source and destination of the memcpy are the same, then zap it.
+  if (M->getSource() == M->getDest()) {
+    MD->removeInstruction(M);
+    M->eraseFromParent();
+    return false;
+  }
+
+  // If copying from a constant, try to turn the memcpy into a memset.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+    if (GV->isConstant() && GV->hasDefinitiveInitializer())
+      if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
+        IRBuilder<> Builder(M);
+        Builder.CreateMemSet(M->getRawDest(), ByteVal, CopySize,
+                             M->getAlignment(), false);
+        MD->removeInstruction(M);
+        M->eraseFromParent();
+        ++NumCpyToSet;
+        return true;
+      }
+
+  // The are two possible optimizations we can do for memcpy:
+  //   a) memcpy-memcpy xform which exposes redundance for DSE.
+  //   b) call-memcpy xform for return slot optimization.
+  MemDepResult DepInfo = MD->getDependency(M);
+  if (DepInfo.isClobber()) {
+    if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
+      if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
+                               CopySize->getZExtValue(), M->getAlignment(),
+                               C)) {
+        MD->removeInstruction(M);
+        M->eraseFromParent();
+        return true;
+      }
+    }
+  }
+
+  AliasAnalysis::Location SrcLoc = AliasAnalysis::getLocationForSource(M);
+  MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(SrcLoc, true,
+                                                         M, M->getParent());
+  if (SrcDepInfo.isClobber()) {
+    if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+      return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue());
+  }
+
+  return false;
+}
+
+/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst
+/// are guaranteed not to alias.
+bool MemCpyOpt::processMemMove(MemMoveInst *M) {
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  if (!TLI->has(LibFunc::memmove))
+    return false;
+
+  // See if the pointers alias.
+  if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M)))
+    return false;
+
+  DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
+
+  // If not, then we know we can transform this.
+  Module *Mod = M->getParent()->getParent()->getParent();
+  Type *ArgTys[3] = { M->getRawDest()->getType(),
+                      M->getRawSource()->getType(),
+                      M->getLength()->getType() };
+  M->setCalledFunction(Intrinsic::getDeclaration(Mod, Intrinsic::memcpy,
+                                                 ArgTys));
+
+  // MemDep may have over conservative information about this instruction, just
+  // conservatively flush it from the cache.
+  MD->removeInstruction(M);
+
+  ++NumMoveToCpy;
+  return true;
+}
+
+/// processByValArgument - This is called on every byval argument in call sites.
+bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
+  if (TD == 0) return false;
+
+  // Find out what feeds this byval argument.
+  Value *ByValArg = CS.getArgument(ArgNo);
+  Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
+  uint64_t ByValSize = TD->getTypeAllocSize(ByValTy);
+  MemDepResult DepInfo =
+    MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize),
+                                 true, CS.getInstruction(),
+                                 CS.getInstruction()->getParent());
+  if (!DepInfo.isClobber())
+    return false;
+
+  // If the byval argument isn't fed by a memcpy, ignore it.  If it is fed by
+  // a memcpy, see if we can byval from the source of the memcpy instead of the
+  // result.
+  MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
+  if (MDep == 0 || MDep->isVolatile() ||
+      ByValArg->stripPointerCasts() != MDep->getDest())
+    return false;
+
+  // The length of the memcpy must be larger or equal to the size of the byval.
+  ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+  if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize)
+    return false;
+
+  // Get the alignment of the byval.  If the call doesn't specify the alignment,
+  // then it is some target specific value that we can't know.
+  unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
+  if (ByValAlign == 0) return false;
+
+  // If it is greater than the memcpy, then we check to see if we can force the
+  // source of the memcpy to the alignment we need.  If we fail, we bail out.
+  if (MDep->getAlignment() < ByValAlign &&
+      getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign)
+    return false;
+
+  // Verify that the copied-from memory doesn't change in between the memcpy and
+  // the byval call.
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    foo(*a)
+  // It would be invalid to transform the second memcpy into foo(*b).
+  //
+  // NOTE: This is conservative, it will stop on any read from the source loc,
+  // not just the defining memcpy.
+  MemDepResult SourceDep =
+    MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep),
+                                 false, CS.getInstruction(), MDep->getParent());
+  if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
+    return false;
+
+  Value *TmpCast = MDep->getSource();
+  if (MDep->getSource()->getType() != ByValArg->getType())
+    TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
+                              "tmpcast", CS.getInstruction());
+
+  DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n"
+               << "  " << *MDep << "\n"
+               << "  " << *CS.getInstruction() << "\n");
+
+  // Otherwise we're good!  Update the byval argument.
+  CS.setArgument(ArgNo, TmpCast);
+  ++NumMemCpyInstr;
+  return true;
+}
+
+/// iterateOnFunction - Executes one iteration of MemCpyOpt.
+bool MemCpyOpt::iterateOnFunction(Function &F) {
+  bool MadeChange = false;
+
+  // Walk all instruction in the function.
+  for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+      // Avoid invalidating the iterator.
+      Instruction *I = BI++;
+
+      bool RepeatInstruction = false;
+
+      if (StoreInst *SI = dyn_cast<StoreInst>(I))
+        MadeChange |= processStore(SI, BI);
+      else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
+        RepeatInstruction = processMemSet(M, BI);
+      else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
+        RepeatInstruction = processMemCpy(M);
+      else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
+        RepeatInstruction = processMemMove(M);
+      else if (CallSite CS = (Value*)I) {
+        for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
+          if (CS.isByValArgument(i))
+            MadeChange |= processByValArgument(CS, i);
+      }
+
+      // Reprocess the instruction if desired.
+      if (RepeatInstruction) {
+        if (BI != BB->begin()) --BI;
+        MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+// MemCpyOpt::runOnFunction - This is the main transformation entry point for a
+// function.
+//
+bool MemCpyOpt::runOnFunction(Function &F) {
+  bool MadeChange = false;
+  MD = &getAnalysis<MemoryDependenceAnalysis>();
+  TD = getAnalysisIfAvailable<DataLayout>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+
+  // If we don't have at least memset and memcpy, there is little point of doing
+  // anything here.  These are required by a freestanding implementation, so if
+  // even they are disabled, there is no point in trying hard.
+  if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
+    return false;
+
+  while (1) {
+    if (!iterateOnFunction(F))
+      break;
+    MadeChange = true;
+  }
+
+  MD = 0;
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
new file mode 100644
index 000000000000..7ee40273347b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -0,0 +1,2007 @@
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE, etc.
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are rank = 1, and other values are assigned ranks
+// corresponding to the reverse post order traversal of current function
+// (starting at 2), which effectively gives values in deep loops higher rank
+// than values not in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reassociate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumChanged, "Number of insts reassociated");
+STATISTIC(NumAnnihil, "Number of expr tree annihilated");
+STATISTIC(NumFactor , "Number of multiplies factored");
+
+namespace {
+  struct ValueEntry {
+    unsigned Rank;
+    Value *Op;
+    ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {}
+  };
+  inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) {
+    return LHS.Rank > RHS.Rank;   // Sort so that highest rank goes to start.
+  }
+}
+
+#ifndef NDEBUG
+/// PrintOps - Print out the expression identified in the Ops list.
+///
+static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
+  Module *M = I->getParent()->getParent()->getParent();
+  dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
+       << *Ops[0].Op->getType() << '\t';
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    dbgs() << "[ ";
+    WriteAsOperand(dbgs(), Ops[i].Op, false, M);
+    dbgs() << ", #" << Ops[i].Rank << "] ";
+  }
+}
+#endif
+
+namespace {
+  /// \brief Utility class representing a base and exponent pair which form one
+  /// factor of some product.
+  struct Factor {
+    Value *Base;
+    unsigned Power;
+
+    Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {}
+
+    /// \brief Sort factors by their Base.
+    struct BaseSorter {
+      bool operator()(const Factor &LHS, const Factor &RHS) {
+        return LHS.Base < RHS.Base;
+      }
+    };
+
+    /// \brief Compare factors for equal bases.
+    struct BaseEqual {
+      bool operator()(const Factor &LHS, const Factor &RHS) {
+        return LHS.Base == RHS.Base;
+      }
+    };
+
+    /// \brief Sort factors in descending order by their power.
+    struct PowerDescendingSorter {
+      bool operator()(const Factor &LHS, const Factor &RHS) {
+        return LHS.Power > RHS.Power;
+      }
+    };
+
+    /// \brief Compare factors for equal powers.
+    struct PowerEqual {
+      bool operator()(const Factor &LHS, const Factor &RHS) {
+        return LHS.Power == RHS.Power;
+      }
+    };
+  };
+  
+  /// Utility class representing a non-constant Xor-operand. We classify
+  /// non-constant Xor-Operands into two categories:
+  ///  C1) The operand is in the form "X & C", where C is a constant and C != ~0
+  ///  C2)
+  ///    C2.1) The operand is in the form of "X | C", where C is a non-zero
+  ///          constant.
+  ///    C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this
+  ///          operand as "E | 0"
+  class XorOpnd {
+  public:
+    XorOpnd(Value *V);
+    const XorOpnd &operator=(const XorOpnd &That);
+
+    bool isInvalid() const { return SymbolicPart == 0; }
+    bool isOrExpr() const { return isOr; }
+    Value *getValue() const { return OrigVal; }
+    Value *getSymbolicPart() const { return SymbolicPart; }
+    unsigned getSymbolicRank() const { return SymbolicRank; }
+    const APInt &getConstPart() const { return ConstPart; }
+
+    void Invalidate() { SymbolicPart = OrigVal = 0; }
+    void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+    // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank.
+    // The purpose is twofold:
+    // 1) Cluster together the operands sharing the same symbolic-value.
+    // 2) Operand having smaller symbolic-value-rank is permuted earlier, which 
+    //   could potentially shorten crital path, and expose more loop-invariants.
+    //   Note that values' rank are basically defined in RPO order (FIXME). 
+    //   So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier 
+    //   than Y which is defined earlier than Z. Permute "x | 1", "Y & 2",
+    //   "z" in the order of X-Y-Z is better than any other orders.
+    class PtrSortFunctor {
+      ArrayRef<XorOpnd> A;
+
+    public:
+      PtrSortFunctor(ArrayRef<XorOpnd> Array) : A(Array) {}
+      bool operator()(unsigned LHSIndex, unsigned RHSIndex) {
+        return A[LHSIndex].getSymbolicRank() < A[RHSIndex].getSymbolicRank();
+      }
+    };
+  private:
+    Value *OrigVal;
+    Value *SymbolicPart;
+    APInt ConstPart;
+    unsigned SymbolicRank;
+    bool isOr;
+  };
+}
+
+namespace {
+  class Reassociate : public FunctionPass {
+    DenseMap<BasicBlock*, unsigned> RankMap;
+    DenseMap<AssertingVH<Value>, unsigned> ValueRankMap;
+    SetVector<AssertingVH<Instruction> > RedoInsts;
+    bool MadeChange;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    Reassociate() : FunctionPass(ID) {
+      initializeReassociatePass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  private:
+    void BuildRankMap(Function &F);
+    unsigned getRank(Value *V);
+    void ReassociateExpression(BinaryOperator *I);
+    void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
+    Value *OptimizeExpression(BinaryOperator *I,
+                              SmallVectorImpl<ValueEntry> &Ops);
+    Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+    Value *OptimizeXor(Instruction *I, SmallVectorImpl<ValueEntry> &Ops);
+    bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd,
+                        Value *&Res);
+    bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+                        APInt &ConstOpnd, Value *&Res);
+    bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+                                SmallVectorImpl<Factor> &Factors);
+    Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+                                   SmallVectorImpl<Factor> &Factors);
+    Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops);
+    Value *RemoveFactorFromExpression(Value *V, Value *Factor);
+    void EraseInst(Instruction *I);
+    void OptimizeInst(Instruction *I);
+  };
+}
+
+XorOpnd::XorOpnd(Value *V) {
+  assert(!isa<ConstantInt>(V) && "No ConstantInt");
+  OrigVal = V;
+  Instruction *I = dyn_cast<Instruction>(V);
+  SymbolicRank = 0;
+
+  if (I && (I->getOpcode() == Instruction::Or ||
+            I->getOpcode() == Instruction::And)) {
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+    if (isa<ConstantInt>(V0))
+      std::swap(V0, V1);
+
+    if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) {
+      ConstPart = C->getValue();
+      SymbolicPart = V0;
+      isOr = (I->getOpcode() == Instruction::Or);
+      return;
+    }
+  }
+
+  // view the operand as "V | 0"
+  SymbolicPart = V;
+  ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
+  isOr = true;
+}
+
+const XorOpnd &XorOpnd::operator=(const XorOpnd &That) {
+  OrigVal = That.OrigVal;
+  SymbolicPart = That.SymbolicPart;
+  ConstPart = That.ConstPart;
+  SymbolicRank = That.SymbolicRank;
+  isOr = That.isOr;
+  return *this;
+}
+
+char Reassociate::ID = 0;
+INITIALIZE_PASS(Reassociate, "reassociate",
+                "Reassociate expressions", false, false)
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
+
+/// isReassociableOp - Return true if V is an instruction of the specified
+/// opcode and if it only has one use.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+  if (V->hasOneUse() && isa<Instruction>(V) &&
+      cast<Instruction>(V)->getOpcode() == Opcode)
+    return cast<BinaryOperator>(V);
+  return 0;
+}
+
+static bool isUnmovableInstruction(Instruction *I) {
+  if (I->getOpcode() == Instruction::PHI ||
+      I->getOpcode() == Instruction::LandingPad ||
+      I->getOpcode() == Instruction::Alloca ||
+      I->getOpcode() == Instruction::Load ||
+      I->getOpcode() == Instruction::Invoke ||
+      (I->getOpcode() == Instruction::Call &&
+       !isa<DbgInfoIntrinsic>(I)) ||
+      I->getOpcode() == Instruction::UDiv ||
+      I->getOpcode() == Instruction::SDiv ||
+      I->getOpcode() == Instruction::FDiv ||
+      I->getOpcode() == Instruction::URem ||
+      I->getOpcode() == Instruction::SRem ||
+      I->getOpcode() == Instruction::FRem)
+    return true;
+  return false;
+}
+
+void Reassociate::BuildRankMap(Function &F) {
+  unsigned i = 2;
+
+  // Assign distinct ranks to function arguments
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+    ValueRankMap[&*I] = ++i;
+
+  ReversePostOrderTraversal<Function*> RPOT(&F);
+  for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
+         E = RPOT.end(); I != E; ++I) {
+    BasicBlock *BB = *I;
+    unsigned BBRank = RankMap[BB] = ++i << 16;
+
+    // Walk the basic block, adding precomputed ranks for any instructions that
+    // we cannot move.  This ensures that the ranks for these instructions are
+    // all different in the block.
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+      if (isUnmovableInstruction(I))
+        ValueRankMap[&*I] = ++BBRank;
+  }
+}
+
+unsigned Reassociate::getRank(Value *V) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) {
+    if (isa<Argument>(V)) return ValueRankMap[V];   // Function argument.
+    return 0;  // Otherwise it's a global or constant, rank 0.
+  }
+
+  if (unsigned Rank = ValueRankMap[I])
+    return Rank;    // Rank already known?
+
+  // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
+  // we can reassociate expressions for code motion!  Since we do not recurse
+  // for PHI nodes, we cannot have infinite recursion here, because there
+  // cannot be loops in the value graph that do not go through PHI nodes.
+  unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
+  for (unsigned i = 0, e = I->getNumOperands();
+       i != e && Rank != MaxRank; ++i)
+    Rank = std::max(Rank, getRank(I->getOperand(i)));
+
+  // If this is a not or neg instruction, do not count it for rank.  This
+  // assures us that X and ~X will have the same rank.
+  if (!I->getType()->isIntegerTy() ||
+      (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I)))
+    ++Rank;
+
+  //DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = "
+  //     << Rank << "\n");
+
+  return ValueRankMap[I] = Rank;
+}
+
+/// LowerNegateToMultiply - Replace 0-X with X*-1.
+///
+static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
+  Constant *Cst = Constant::getAllOnesValue(Neg->getType());
+
+  BinaryOperator *Res =
+    BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg);
+  Neg->setOperand(1, Constant::getNullValue(Neg->getType())); // Drop use of op.
+  Res->takeName(Neg);
+  Neg->replaceAllUsesWith(Res);
+  Res->setDebugLoc(Neg->getDebugLoc());
+  return Res;
+}
+
+/// CarmichaelShift - Returns k such that lambda(2^Bitwidth) = 2^k, where lambda
+/// is the Carmichael function. This means that x^(2^k) === 1 mod 2^Bitwidth for
+/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
+/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
+/// even x in Bitwidth-bit arithmetic.
+static unsigned CarmichaelShift(unsigned Bitwidth) {
+  if (Bitwidth < 3)
+    return Bitwidth - 1;
+  return Bitwidth - 2;
+}
+
+/// IncorporateWeight - Add the extra weight 'RHS' to the existing weight 'LHS',
+/// reducing the combined weight using any special properties of the operation.
+/// The existing weight LHS represents the computation X op X op ... op X where
+/// X occurs LHS times.  The combined weight represents  X op X op ... op X with
+/// X occurring LHS + RHS times.  If op is "Xor" for example then the combined
+/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
+/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
+static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
+  // If we were working with infinite precision arithmetic then the combined
+  // weight would be LHS + RHS.  But we are using finite precision arithmetic,
+  // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
+  // for nilpotent operations and addition, but not for idempotent operations
+  // and multiplication), so it is important to correctly reduce the combined
+  // weight back into range if wrapping would be wrong.
+
+  // If RHS is zero then the weight didn't change.
+  if (RHS.isMinValue())
+    return;
+  // If LHS is zero then the combined weight is RHS.
+  if (LHS.isMinValue()) {
+    LHS = RHS;
+    return;
+  }
+  // From this point on we know that neither LHS nor RHS is zero.
+
+  if (Instruction::isIdempotent(Opcode)) {
+    // Idempotent means X op X === X, so any non-zero weight is equivalent to a
+    // weight of 1.  Keeping weights at zero or one also means that wrapping is
+    // not a problem.
+    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+    return; // Return a weight of 1.
+  }
+  if (Instruction::isNilpotent(Opcode)) {
+    // Nilpotent means X op X === 0, so reduce weights modulo 2.
+    assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+    LHS = 0; // 1 + 1 === 0 modulo 2.
+    return;
+  }
+  if (Opcode == Instruction::Add) {
+    // TODO: Reduce the weight by exploiting nsw/nuw?
+    LHS += RHS;
+    return;
+  }
+
+  assert(Opcode == Instruction::Mul && "Unknown associative operation!");
+  unsigned Bitwidth = LHS.getBitWidth();
+  // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
+  // can be replaced with W-CM.  That's because x^W=x^(W-CM) for every Bitwidth
+  // bit number x, since either x is odd in which case x^CM = 1, or x is even in
+  // which case both x^W and x^(W - CM) are zero.  By subtracting off multiples
+  // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
+  // which by a happy accident means that they can always be represented using
+  // Bitwidth bits.
+  // TODO: Reduce the weight by exploiting nsw/nuw?  (Could do much better than
+  // the Carmichael number).
+  if (Bitwidth > 3) {
+    /// CM - The value of Carmichael's lambda function.
+    APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
+    // Any weight W >= Threshold can be replaced with W - CM.
+    APInt Threshold = CM + Bitwidth;
+    assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
+    // For Bitwidth 4 or more the following sum does not overflow.
+    LHS += RHS;
+    while (LHS.uge(Threshold))
+      LHS -= CM;
+  } else {
+    // To avoid problems with overflow do everything the same as above but using
+    // a larger type.
+    unsigned CM = 1U << CarmichaelShift(Bitwidth);
+    unsigned Threshold = CM + Bitwidth;
+    assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
+           "Weights not reduced!");
+    unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
+    while (Total >= Threshold)
+      Total -= CM;
+    LHS = Total;
+  }
+}
+
+typedef std::pair<Value*, APInt> RepeatedValue;
+
+/// LinearizeExprTree - Given an associative binary expression, return the leaf
+/// nodes in Ops along with their weights (how many times the leaf occurs).  The
+/// original expression is the same as
+///   (Ops[0].first op Ops[0].first op ... Ops[0].first)  <- Ops[0].second times
+/// op
+///   (Ops[1].first op Ops[1].first op ... Ops[1].first)  <- Ops[1].second times
+/// op
+///   ...
+/// op
+///   (Ops[N].first op Ops[N].first op ... Ops[N].first)  <- Ops[N].second times
+///
+/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
+///
+/// This routine may modify the function, in which case it returns 'true'.  The
+/// changes it makes may well be destructive, changing the value computed by 'I'
+/// to something completely different.  Thus if the routine returns 'true' then
+/// you MUST either replace I with a new expression computed from the Ops array,
+/// or use RewriteExprTree to put the values back in.
+///
+/// A leaf node is either not a binary operation of the same kind as the root
+/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
+/// opcode), or is the same kind of binary operator but has a use which either
+/// does not belong to the expression, or does belong to the expression but is
+/// a leaf node.  Every leaf node has at least one use that is a non-leaf node
+/// of the expression, while for non-leaf nodes (except for the root 'I') every
+/// use is a non-leaf node of the expression.
+///
+/// For example:
+///           expression graph        node names
+///
+///                     +        |        I
+///                    / \       |
+///                   +   +      |      A,  B
+///                  / \ / \     |
+///                 *   +   *    |    C,  D,  E
+///                / \ / \ / \   |
+///                   +   *      |      F,  G
+///
+/// The leaf nodes are C, E, F and G.  The Ops array will contain (maybe not in
+/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
+///
+/// The expression is maximal: if some instruction is a binary operator of the
+/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
+/// then the instruction also belongs to the expression, is not a leaf node of
+/// it, and its operands also belong to the expression (but may be leaf nodes).
+///
+/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
+/// order to ensure that every non-root node in the expression has *exactly one*
+/// use by a non-leaf node of the expression.  This destruction means that the
+/// caller MUST either replace 'I' with a new expression or use something like
+/// RewriteExprTree to put the values back in if the routine indicates that it
+/// made a change by returning 'true'.
+///
+/// In the above example either the right operand of A or the left operand of B
+/// will be replaced by undef.  If it is B's operand then this gives:
+///
+///                     +        |        I
+///                    / \       |
+///                   +   +      |      A,  B - operand of B replaced with undef
+///                  / \   \     |
+///                 *   +   *    |    C,  D,  E
+///                / \ / \ / \   |
+///                   +   *      |      F,  G
+///
+/// Note that such undef operands can only be reached by passing through 'I'.
+/// For example, if you visit operands recursively starting from a leaf node
+/// then you will never see such an undef operand unless you get back to 'I',
+/// which requires passing through a phi node.
+///
+/// Note that this routine may also mutate binary operators of the wrong type
+/// that have all uses inside the expression (i.e. only used by non-leaf nodes
+/// of the expression) if it can turn them into binary operators of the right
+/// type and thus make the expression bigger.
+
+static bool LinearizeExprTree(BinaryOperator *I,
+                              SmallVectorImpl<RepeatedValue> &Ops) {
+  DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+  unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
+  unsigned Opcode = I->getOpcode();
+  assert(Instruction::isAssociative(Opcode) &&
+         Instruction::isCommutative(Opcode) &&
+         "Expected an associative and commutative operation!");
+
+  // Visit all operands of the expression, keeping track of their weight (the
+  // number of paths from the expression root to the operand, or if you like
+  // the number of times that operand occurs in the linearized expression).
+  // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
+  // while A has weight two.
+
+  // Worklist of non-leaf nodes (their operands are in the expression too) along
+  // with their weights, representing a certain number of paths to the operator.
+  // If an operator occurs in the worklist multiple times then we found multiple
+  // ways to get to it.
+  SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight)
+  Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
+  bool MadeChange = false;
+
+  // Leaves of the expression are values that either aren't the right kind of
+  // operation (eg: a constant, or a multiply in an add tree), or are, but have
+  // some uses that are not inside the expression.  For example, in I = X + X,
+  // X = A + B, the value X has two uses (by I) that are in the expression.  If
+  // X has any other uses, for example in a return instruction, then we consider
+  // X to be a leaf, and won't analyze it further.  When we first visit a value,
+  // if it has more than one use then at first we conservatively consider it to
+  // be a leaf.  Later, as the expression is explored, we may discover some more
+  // uses of the value from inside the expression.  If all uses turn out to be
+  // from within the expression (and the value is a binary operator of the right
+  // kind) then the value is no longer considered to be a leaf, and its operands
+  // are explored.
+
+  // Leaves - Keeps track of the set of putative leaves as well as the number of
+  // paths to each leaf seen so far.
+  typedef DenseMap<Value*, APInt> LeafMap;
+  LeafMap Leaves; // Leaf -> Total weight so far.
+  SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order.
+
+#ifndef NDEBUG
+  SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme.
+#endif
+  while (!Worklist.empty()) {
+    std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val();
+    I = P.first; // We examine the operands of this binary operator.
+
+    for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands.
+      Value *Op = I->getOperand(OpIdx);
+      APInt Weight = P.second; // Number of paths to this operand.
+      DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+      assert(!Op->use_empty() && "No uses, so how did we get to it?!");
+
+      // If this is a binary operation of the right kind with only one use then
+      // add its operands to the expression.
+      if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+        assert(Visited.insert(Op) && "Not first visit!");
+        DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+        Worklist.push_back(std::make_pair(BO, Weight));
+        continue;
+      }
+
+      // Appears to be a leaf.  Is the operand already in the set of leaves?
+      LeafMap::iterator It = Leaves.find(Op);
+      if (It == Leaves.end()) {
+        // Not in the leaf map.  Must be the first time we saw this operand.
+        assert(Visited.insert(Op) && "Not first visit!");
+        if (!Op->hasOneUse()) {
+          // This value has uses not accounted for by the expression, so it is
+          // not safe to modify.  Mark it as being a leaf.
+          DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+          LeafOrder.push_back(Op);
+          Leaves[Op] = Weight;
+          continue;
+        }
+        // No uses outside the expression, try morphing it.
+      } else if (It != Leaves.end()) {
+        // Already in the leaf map.
+        assert(Visited.count(Op) && "In leaf map but not visited!");
+
+        // Update the number of paths to the leaf.
+        IncorporateWeight(It->second, Weight, Opcode);
+
+#if 0   // TODO: Re-enable once PR13021 is fixed.
+        // The leaf already has one use from inside the expression.  As we want
+        // exactly one such use, drop this new use of the leaf.
+        assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
+        I->setOperand(OpIdx, UndefValue::get(I->getType()));
+        MadeChange = true;
+
+        // If the leaf is a binary operation of the right kind and we now see
+        // that its multiple original uses were in fact all by nodes belonging
+        // to the expression, then no longer consider it to be a leaf and add
+        // its operands to the expression.
+        if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+          DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+          Worklist.push_back(std::make_pair(BO, It->second));
+          Leaves.erase(It);
+          continue;
+        }
+#endif
+
+        // If we still have uses that are not accounted for by the expression
+        // then it is not safe to modify the value.
+        if (!Op->hasOneUse())
+          continue;
+
+        // No uses outside the expression, try morphing it.
+        Weight = It->second;
+        Leaves.erase(It); // Since the value may be morphed below.
+      }
+
+      // At this point we have a value which, first of all, is not a binary
+      // expression of the right kind, and secondly, is only used inside the
+      // expression.  This means that it can safely be modified.  See if we
+      // can usefully morph it into an expression of the right kind.
+      assert((!isa<Instruction>(Op) ||
+              cast<Instruction>(Op)->getOpcode() != Opcode) &&
+             "Should have been handled above!");
+      assert(Op->hasOneUse() && "Has uses outside the expression tree!");
+
+      // If this is a multiply expression, turn any internal negations into
+      // multiplies by -1 so they can be reassociated.
+      BinaryOperator *BO = dyn_cast<BinaryOperator>(Op);
+      if (Opcode == Instruction::Mul && BO && BinaryOperator::isNeg(BO)) {
+        DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+        BO = LowerNegateToMultiply(BO);
+        DEBUG(dbgs() << *BO << 'n');
+        Worklist.push_back(std::make_pair(BO, Weight));
+        MadeChange = true;
+        continue;
+      }
+
+      // Failed to morph into an expression of the right type.  This really is
+      // a leaf.
+      DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+      assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
+      LeafOrder.push_back(Op);
+      Leaves[Op] = Weight;
+    }
+  }
+
+  // The leaves, repeated according to their weights, represent the linearized
+  // form of the expression.
+  for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
+    Value *V = LeafOrder[i];
+    LeafMap::iterator It = Leaves.find(V);
+    if (It == Leaves.end())
+      // Node initially thought to be a leaf wasn't.
+      continue;
+    assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
+    APInt Weight = It->second;
+    if (Weight.isMinValue())
+      // Leaf already output or weight reduction eliminated it.
+      continue;
+    // Ensure the leaf is only output once.
+    It->second = 0;
+    Ops.push_back(std::make_pair(V, Weight));
+  }
+
+  // For nilpotent operations or addition there may be no operands, for example
+  // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
+  // in both cases the weight reduces to 0 causing the value to be skipped.
+  if (Ops.empty()) {
+    Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
+    assert(Identity && "Associative operation without identity!");
+    Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1)));
+  }
+
+  return MadeChange;
+}
+
+// RewriteExprTree - Now that the operands for this expression tree are
+// linearized and optimized, emit them in-order.
+void Reassociate::RewriteExprTree(BinaryOperator *I,
+                                  SmallVectorImpl<ValueEntry> &Ops) {
+  assert(Ops.size() > 1 && "Single values should be used directly!");
+
+  // Since our optimizations should never increase the number of operations, the
+  // new expression can usually be written reusing the existing binary operators
+  // from the original expression tree, without creating any new instructions,
+  // though the rewritten expression may have a completely different topology.
+  // We take care to not change anything if the new expression will be the same
+  // as the original.  If more than trivial changes (like commuting operands)
+  // were made then we are obliged to clear out any optional subclass data like
+  // nsw flags.
+
+  /// NodesToRewrite - Nodes from the original expression available for writing
+  /// the new expression into.
+  SmallVector<BinaryOperator*, 8> NodesToRewrite;
+  unsigned Opcode = I->getOpcode();
+  BinaryOperator *Op = I;
+
+  /// NotRewritable - The operands being written will be the leaves of the new
+  /// expression and must not be used as inner nodes (via NodesToRewrite) by
+  /// mistake.  Inner nodes are always reassociable, and usually leaves are not
+  /// (if they were they would have been incorporated into the expression and so
+  /// would not be leaves), so most of the time there is no danger of this.  But
+  /// in rare cases a leaf may become reassociable if an optimization kills uses
+  /// of it, or it may momentarily become reassociable during rewriting (below)
+  /// due it being removed as an operand of one of its uses.  Ensure that misuse
+  /// of leaf nodes as inner nodes cannot occur by remembering all of the future
+  /// leaves and refusing to reuse any of them as inner nodes.
+  SmallPtrSet<Value*, 8> NotRewritable;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    NotRewritable.insert(Ops[i].Op);
+
+  // ExpressionChanged - Non-null if the rewritten expression differs from the
+  // original in some non-trivial way, requiring the clearing of optional flags.
+  // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
+  BinaryOperator *ExpressionChanged = 0;
+  for (unsigned i = 0; ; ++i) {
+    // The last operation (which comes earliest in the IR) is special as both
+    // operands will come from Ops, rather than just one with the other being
+    // a subexpression.
+    if (i+2 == Ops.size()) {
+      Value *NewLHS = Ops[i].Op;
+      Value *NewRHS = Ops[i+1].Op;
+      Value *OldLHS = Op->getOperand(0);
+      Value *OldRHS = Op->getOperand(1);
+
+      if (NewLHS == OldLHS && NewRHS == OldRHS)
+        // Nothing changed, leave it alone.
+        break;
+
+      if (NewLHS == OldRHS && NewRHS == OldLHS) {
+        // The order of the operands was reversed.  Swap them.
+        DEBUG(dbgs() << "RA: " << *Op << '\n');
+        Op->swapOperands();
+        DEBUG(dbgs() << "TO: " << *Op << '\n');
+        MadeChange = true;
+        ++NumChanged;
+        break;
+      }
+
+      // The new operation differs non-trivially from the original. Overwrite
+      // the old operands with the new ones.
+      DEBUG(dbgs() << "RA: " << *Op << '\n');
+      if (NewLHS != OldLHS) {
+        BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
+        if (BO && !NotRewritable.count(BO))
+          NodesToRewrite.push_back(BO);
+        Op->setOperand(0, NewLHS);
+      }
+      if (NewRHS != OldRHS) {
+        BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
+        if (BO && !NotRewritable.count(BO))
+          NodesToRewrite.push_back(BO);
+        Op->setOperand(1, NewRHS);
+      }
+      DEBUG(dbgs() << "TO: " << *Op << '\n');
+
+      ExpressionChanged = Op;
+      MadeChange = true;
+      ++NumChanged;
+
+      break;
+    }
+
+    // Not the last operation.  The left-hand side will be a sub-expression
+    // while the right-hand side will be the current element of Ops.
+    Value *NewRHS = Ops[i].Op;
+    if (NewRHS != Op->getOperand(1)) {
+      DEBUG(dbgs() << "RA: " << *Op << '\n');
+      if (NewRHS == Op->getOperand(0)) {
+        // The new right-hand side was already present as the left operand.  If
+        // we are lucky then swapping the operands will sort out both of them.
+        Op->swapOperands();
+      } else {
+        // Overwrite with the new right-hand side.
+        BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
+        if (BO && !NotRewritable.count(BO))
+          NodesToRewrite.push_back(BO);
+        Op->setOperand(1, NewRHS);
+        ExpressionChanged = Op;
+      }
+      DEBUG(dbgs() << "TO: " << *Op << '\n');
+      MadeChange = true;
+      ++NumChanged;
+    }
+
+    // Now deal with the left-hand side.  If this is already an operation node
+    // from the original expression then just rewrite the rest of the expression
+    // into it.
+    BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
+    if (BO && !NotRewritable.count(BO)) {
+      Op = BO;
+      continue;
+    }
+
+    // Otherwise, grab a spare node from the original expression and use that as
+    // the left-hand side.  If there are no nodes left then the optimizers made
+    // an expression with more nodes than the original!  This usually means that
+    // they did something stupid but it might mean that the problem was just too
+    // hard (finding the mimimal number of multiplications needed to realize a
+    // multiplication expression is NP-complete).  Whatever the reason, smart or
+    // stupid, create a new node if there are none left.
+    BinaryOperator *NewOp;
+    if (NodesToRewrite.empty()) {
+      Constant *Undef = UndefValue::get(I->getType());
+      NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
+                                     Undef, Undef, "", I);
+    } else {
+      NewOp = NodesToRewrite.pop_back_val();
+    }
+
+    DEBUG(dbgs() << "RA: " << *Op << '\n');
+    Op->setOperand(0, NewOp);
+    DEBUG(dbgs() << "TO: " << *Op << '\n');
+    ExpressionChanged = Op;
+    MadeChange = true;
+    ++NumChanged;
+    Op = NewOp;
+  }
+
+  // If the expression changed non-trivially then clear out all subclass data
+  // starting from the operator specified in ExpressionChanged, and compactify
+  // the operators to just before the expression root to guarantee that the
+  // expression tree is dominated by all of Ops.
+  if (ExpressionChanged)
+    do {
+      ExpressionChanged->clearSubclassOptionalData();
+      if (ExpressionChanged == I)
+        break;
+      ExpressionChanged->moveBefore(I);
+      ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->use_begin());
+    } while (1);
+
+  // Throw away any left over nodes from the original expression.
+  for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
+    RedoInsts.insert(NodesToRewrite[i]);
+}
+
+/// NegateValue - Insert instructions before the instruction pointed to by BI,
+/// that computes the negative version of the value specified.  The negative
+/// version of the value is returned, and BI is left pointing at the instruction
+/// that should be processed next by the reassociation pass.
+static Value *NegateValue(Value *V, Instruction *BI) {
+  if (Constant *C = dyn_cast<Constant>(V))
+    return ConstantExpr::getNeg(C);
+
+  // We are trying to expose opportunity for reassociation.  One of the things
+  // that we want to do to achieve this is to push a negation as deep into an
+  // expression chain as possible, to expose the add instructions.  In practice,
+  // this means that we turn this:
+  //   X = -(A+12+C+D)   into    X = -A + -12 + -C + -D = -12 + -A + -C + -D
+  // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate
+  // the constants.  We assume that instcombine will clean up the mess later if
+  // we introduce tons of unnecessary negation instructions.
+  //
+  if (BinaryOperator *I = isReassociableOp(V, Instruction::Add)) {
+    // Push the negates through the add.
+    I->setOperand(0, NegateValue(I->getOperand(0), BI));
+    I->setOperand(1, NegateValue(I->getOperand(1), BI));
+
+    // We must move the add instruction here, because the neg instructions do
+    // not dominate the old add instruction in general.  By moving it, we are
+    // assured that the neg instructions we just inserted dominate the
+    // instruction we are about to insert after them.
+    //
+    I->moveBefore(BI);
+    I->setName(I->getName()+".neg");
+    return I;
+  }
+
+  // Okay, we need to materialize a negated version of V with an instruction.
+  // Scan the use lists of V to see if we have one already.
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){
+    User *U = *UI;
+    if (!BinaryOperator::isNeg(U)) continue;
+
+    // We found one!  Now we have to make sure that the definition dominates
+    // this use.  We do this by moving it to the entry block (if it is a
+    // non-instruction value) or right after the definition.  These negates will
+    // be zapped by reassociate later, so we don't need much finesse here.
+    BinaryOperator *TheNeg = cast<BinaryOperator>(U);
+
+    // Verify that the negate is in this function, V might be a constant expr.
+    if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
+      continue;
+
+    BasicBlock::iterator InsertPt;
+    if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
+      if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
+        InsertPt = II->getNormalDest()->begin();
+      } else {
+        InsertPt = InstInput;
+        ++InsertPt;
+      }
+      while (isa<PHINode>(InsertPt)) ++InsertPt;
+    } else {
+      InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
+    }
+    TheNeg->moveBefore(InsertPt);
+    return TheNeg;
+  }
+
+  // Insert a 'neg' instruction that subtracts the value from zero to get the
+  // negation.
+  return BinaryOperator::CreateNeg(V, V->getName() + ".neg", BI);
+}
+
+/// ShouldBreakUpSubtract - Return true if we should break up this subtract of
+/// X-Y into (X + -Y).
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+  // If this is a negation, we can't split it up!
+  if (BinaryOperator::isNeg(Sub))
+    return false;
+
+  // Don't bother to break this up unless either the LHS is an associable add or
+  // subtract or if this is only used by one.
+  if (isReassociableOp(Sub->getOperand(0), Instruction::Add) ||
+      isReassociableOp(Sub->getOperand(0), Instruction::Sub))
+    return true;
+  if (isReassociableOp(Sub->getOperand(1), Instruction::Add) ||
+      isReassociableOp(Sub->getOperand(1), Instruction::Sub))
+    return true;
+  if (Sub->hasOneUse() &&
+      (isReassociableOp(Sub->use_back(), Instruction::Add) ||
+       isReassociableOp(Sub->use_back(), Instruction::Sub)))
+    return true;
+
+  return false;
+}
+
+/// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is
+/// only used by an add, transform this into (X+(0-Y)) to promote better
+/// reassociation.
+static BinaryOperator *BreakUpSubtract(Instruction *Sub) {
+  // Convert a subtract into an add and a neg instruction. This allows sub
+  // instructions to be commuted with other add instructions.
+  //
+  // Calculate the negative value of Operand 1 of the sub instruction,
+  // and set it as the RHS of the add instruction we just made.
+  //
+  Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
+  BinaryOperator *New =
+    BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub);
+  Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
+  Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
+  New->takeName(Sub);
+
+  // Everyone now refers to the add instruction.
+  Sub->replaceAllUsesWith(New);
+  New->setDebugLoc(Sub->getDebugLoc());
+
+  DEBUG(dbgs() << "Negated: " << *New << '\n');
+  return New;
+}
+
+/// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used
+/// by one, change this into a multiply by a constant to assist with further
+/// reassociation.
+static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
+  Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+  MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
+
+  BinaryOperator *Mul =
+    BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
+  Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
+  Mul->takeName(Shl);
+  Shl->replaceAllUsesWith(Mul);
+  Mul->setDebugLoc(Shl->getDebugLoc());
+  return Mul;
+}
+
+/// FindInOperandList - Scan backwards and forwards among values with the same
+/// rank as element i to see if X exists.  If X does not exist, return i.  This
+/// is useful when scanning for 'x' when we see '-x' because they both get the
+/// same rank.
+static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
+                                  Value *X) {
+  unsigned XRank = Ops[i].Rank;
+  unsigned e = Ops.size();
+  for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j)
+    if (Ops[j].Op == X)
+      return j;
+  // Scan backwards.
+  for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j)
+    if (Ops[j].Op == X)
+      return j;
+  return i;
+}
+
+/// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together
+/// and returning the result.  Insert the tree before I.
+static Value *EmitAddTreeOfValues(Instruction *I,
+                                  SmallVectorImpl<WeakVH> &Ops){
+  if (Ops.size() == 1) return Ops.back();
+
+  Value *V1 = Ops.back();
+  Ops.pop_back();
+  Value *V2 = EmitAddTreeOfValues(I, Ops);
+  return BinaryOperator::CreateAdd(V2, V1, "tmp", I);
+}
+
+/// RemoveFactorFromExpression - If V is an expression tree that is a
+/// multiplication sequence, and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.
+Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
+  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
+  if (!BO) return 0;
+
+  SmallVector<RepeatedValue, 8> Tree;
+  MadeChange |= LinearizeExprTree(BO, Tree);
+  SmallVector<ValueEntry, 8> Factors;
+  Factors.reserve(Tree.size());
+  for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+    RepeatedValue E = Tree[i];
+    Factors.append(E.second.getZExtValue(),
+                   ValueEntry(getRank(E.first), E.first));
+  }
+
+  bool FoundFactor = false;
+  bool NeedsNegate = false;
+  for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+    if (Factors[i].Op == Factor) {
+      FoundFactor = true;
+      Factors.erase(Factors.begin()+i);
+      break;
+    }
+
+    // If this is a negative version of this factor, remove it.
+    if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor))
+      if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
+        if (FC1->getValue() == -FC2->getValue()) {
+          FoundFactor = NeedsNegate = true;
+          Factors.erase(Factors.begin()+i);
+          break;
+        }
+  }
+
+  if (!FoundFactor) {
+    // Make sure to restore the operands to the expression tree.
+    RewriteExprTree(BO, Factors);
+    return 0;
+  }
+
+  BasicBlock::iterator InsertPt = BO; ++InsertPt;
+
+  // If this was just a single multiply, remove the multiply and return the only
+  // remaining operand.
+  if (Factors.size() == 1) {
+    RedoInsts.insert(BO);
+    V = Factors[0].Op;
+  } else {
+    RewriteExprTree(BO, Factors);
+    V = BO;
+  }
+
+  if (NeedsNegate)
+    V = BinaryOperator::CreateNeg(V, "neg", InsertPt);
+
+  return V;
+}
+
+/// FindSingleUseMultiplyFactors - If V is a single-use multiply, recursively
+/// add its operands as factors, otherwise add V to the list of factors.
+///
+/// Ops is the top-level list of add operands we're trying to factor.
+static void FindSingleUseMultiplyFactors(Value *V,
+                                         SmallVectorImpl<Value*> &Factors,
+                                       const SmallVectorImpl<ValueEntry> &Ops) {
+  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
+  if (!BO) {
+    Factors.push_back(V);
+    return;
+  }
+
+  // Otherwise, add the LHS and RHS to the list of factors.
+  FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
+  FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
+}
+
+/// OptimizeAndOrXor - Optimize a series of operands to an 'and', 'or', or 'xor'
+/// instruction.  This optimizes based on identities.  If it can be reduced to
+/// a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+static Value *OptimizeAndOrXor(unsigned Opcode,
+                               SmallVectorImpl<ValueEntry> &Ops) {
+  // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+  // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    // First, check for X and ~X in the operand list.
+    assert(i < Ops.size());
+    if (BinaryOperator::isNot(Ops[i].Op)) {    // Cannot occur for ^.
+      Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+      unsigned FoundX = FindInOperandList(Ops, i, X);
+      if (FoundX != i) {
+        if (Opcode == Instruction::And)   // ...&X&~X = 0
+          return Constant::getNullValue(X->getType());
+
+        if (Opcode == Instruction::Or)    // ...|X|~X = -1
+          return Constant::getAllOnesValue(X->getType());
+      }
+    }
+
+    // Next, check for duplicate pairs of values, which we assume are next to
+    // each other, due to our sorting criteria.
+    assert(i < Ops.size());
+    if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+      if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+        // Drop duplicate values for And and Or.
+        Ops.erase(Ops.begin()+i);
+        --i; --e;
+        ++NumAnnihil;
+        continue;
+      }
+
+      // Drop pairs of values for Xor.
+      assert(Opcode == Instruction::Xor);
+      if (e == 2)
+        return Constant::getNullValue(Ops[0].Op->getType());
+
+      // Y ^ X^X -> Y
+      Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+      i -= 1; e -= 2;
+      ++NumAnnihil;
+    }
+  }
+  return 0;
+}
+
+/// Helper funciton of CombineXorOpnd(). It creates a bitwise-and
+/// instruction with the given two operands, and return the resulting
+/// instruction. There are two special cases: 1) if the constant operand is 0,
+/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
+/// be returned.
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, 
+                             const APInt &ConstOpnd) {
+  if (ConstOpnd != 0) {
+    if (!ConstOpnd.isAllOnesValue()) {
+      LLVMContext &Ctx = Opnd->getType()->getContext();
+      Instruction *I;
+      I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd),
+                                    "and.ra", InsertBefore);
+      I->setDebugLoc(InsertBefore->getDebugLoc());
+      return I;
+    }
+    return Opnd;
+  }
+  return 0;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
+// into "R ^ C", where C would be 0, and R is a symbolic value.
+//
+// If it was successful, true is returned, and the "R" and "C" is returned
+// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
+// and both "Res" and "ConstOpnd" remain unchanged.
+//  
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+                                 APInt &ConstOpnd, Value *&Res) {
+  // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2 
+  //                       = ((x | c1) ^ c1) ^ (c1 ^ c2)
+  //                       = (x & ~c1) ^ (c1 ^ c2)
+  // It is useful only when c1 == c2.
+  if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) {
+    if (!Opnd1->getValue()->hasOneUse())
+      return false;
+
+    const APInt &C1 = Opnd1->getConstPart();
+    if (C1 != ConstOpnd)
+      return false;
+
+    Value *X = Opnd1->getSymbolicPart();
+    Res = createAndInstr(I, X, ~C1);
+    // ConstOpnd was C2, now C1 ^ C2.
+    ConstOpnd ^= C1;
+
+    if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+      RedoInsts.insert(T);
+    return true;
+  }
+  return false;
+}
+
+                           
+// Helper function of OptimizeXor(). It tries to simplify
+// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
+// symbolic value. 
+// 
+// If it was successful, true is returned, and the "R" and "C" is returned 
+// via "Res" and "ConstOpnd", respectively (If the entire expression is
+// evaluated to a constant, the Res is set to NULL); otherwise, false is
+// returned, and both "Res" and "ConstOpnd" remain unchanged.
+bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2,
+                                 APInt &ConstOpnd, Value *&Res) {
+  Value *X = Opnd1->getSymbolicPart();
+  if (X != Opnd2->getSymbolicPart())
+    return false;
+
+  const APInt &C1 = Opnd1->getConstPart();
+  const APInt &C2 = Opnd2->getConstPart();
+
+  // This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.)
+  int DeadInstNum = 1;
+  if (Opnd1->getValue()->hasOneUse())
+    DeadInstNum++;
+  if (Opnd2->getValue()->hasOneUse())
+    DeadInstNum++;
+
+  // Xor-Rule 2:
+  //  (x | c1) ^ (x & c2)
+  //   = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
+  //   = (x & ~c1) ^ (x & c2) ^ c1               // Xor-Rule 1
+  //   = (x & c3) ^ c1, where c3 = ~c1 ^ c2      // Xor-rule 3
+  //
+  if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
+    if (Opnd2->isOrExpr())
+      std::swap(Opnd1, Opnd2);
+
+    APInt C3((~C1) ^ C2);
+
+    // Do not increase code size!
+    if (C3 != 0 && !C3.isAllOnesValue()) {
+      int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+      if (NewInstNum > DeadInstNum)
+        return false;
+    }
+
+    Res = createAndInstr(I, X, C3);
+    ConstOpnd ^= C1;
+
+  } else if (Opnd1->isOrExpr()) {
+    // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
+    //
+    APInt C3 = C1 ^ C2;
+    
+    // Do not increase code size
+    if (C3 != 0 && !C3.isAllOnesValue()) {
+      int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+      if (NewInstNum > DeadInstNum)
+        return false;
+    }
+
+    Res = createAndInstr(I, X, C3);
+    ConstOpnd ^= C3;
+  } else {
+    // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
+    //
+    APInt C3 = C1 ^ C2;
+    Res = createAndInstr(I, X, C3);
+  }
+
+  // Put the original operands in the Redo list; hope they will be deleted
+  // as dead code.
+  if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+    RedoInsts.insert(T);
+  if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
+    RedoInsts.insert(T);
+
+  return true;
+}
+
+/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
+/// to a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+Value *Reassociate::OptimizeXor(Instruction *I,
+                                SmallVectorImpl<ValueEntry> &Ops) {
+  if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
+    return V;
+      
+  if (Ops.size() == 1)
+    return 0;
+
+  SmallVector<XorOpnd, 8> Opnds;
+  SmallVector<unsigned, 8> OpndIndices;
+  Type *Ty = Ops[0].Op->getType();
+  APInt ConstOpnd(Ty->getIntegerBitWidth(), 0);
+
+  // Step 1: Convert ValueEntry to XorOpnd
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    Value *V = Ops[i].Op;
+    if (!isa<ConstantInt>(V)) {
+      XorOpnd O(V);
+      O.setSymbolicRank(getRank(O.getSymbolicPart()));
+      Opnds.push_back(O);
+      OpndIndices.push_back(Opnds.size() - 1);
+    } else
+      ConstOpnd ^= cast<ConstantInt>(V)->getValue();
+  }
+
+  // Step 2: Sort the Xor-Operands in a way such that the operands containing
+  //  the same symbolic value cluster together. For instance, the input operand
+  //  sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
+  //  ("x | 123", "x & 789", "y & 456").
+  std::sort(OpndIndices.begin(), OpndIndices.end(),
+            XorOpnd::PtrSortFunctor(Opnds));
+
+  // Step 3: Combine adjacent operands
+  XorOpnd *PrevOpnd = 0;
+  bool Changed = false;
+  for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
+    XorOpnd *CurrOpnd = &Opnds[OpndIndices[i]];
+    // The combined value
+    Value *CV;
+
+    // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
+    if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+      Changed = true;
+      if (CV)
+        *CurrOpnd = XorOpnd(CV);
+      else {
+        CurrOpnd->Invalidate();
+        continue;
+      }
+    }
+
+    if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
+      PrevOpnd = CurrOpnd;
+      continue;
+    }
+
+    // step 3.2: When previous and current operands share the same symbolic
+    //  value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd" 
+    //    
+    if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
+      // Remove previous operand
+      PrevOpnd->Invalidate();
+      if (CV) {
+        *CurrOpnd = XorOpnd(CV);
+        PrevOpnd = CurrOpnd;
+      } else {
+        CurrOpnd->Invalidate();
+        PrevOpnd = 0;
+      }
+      Changed = true;
+    }
+  }
+
+  // Step 4: Reassemble the Ops
+  if (Changed) {
+    Ops.clear();
+    for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
+      XorOpnd &O = Opnds[i];
+      if (O.isInvalid())
+        continue;
+      ValueEntry VE(getRank(O.getValue()), O.getValue());
+      Ops.push_back(VE);
+    }
+    if (ConstOpnd != 0) {
+      Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
+      ValueEntry VE(getRank(C), C);
+      Ops.push_back(VE);
+    }
+    int Sz = Ops.size();
+    if (Sz == 1)
+      return Ops.back().Op;
+    else if (Sz == 0) {
+      assert(ConstOpnd == 0);
+      return ConstantInt::get(Ty->getContext(), ConstOpnd);
+    }
+  }
+
+  return 0;
+}
+
+/// OptimizeAdd - Optimize a series of operands to an 'add' instruction.  This
+/// optimizes based on identities.  If it can be reduced to a single Value, it
+/// is returned, otherwise the Ops list is mutated as necessary.
+Value *Reassociate::OptimizeAdd(Instruction *I,
+                                SmallVectorImpl<ValueEntry> &Ops) {
+  // Scan the operand lists looking for X and -X pairs.  If we find any, we
+  // can simplify the expression. X+-X == 0.  While we're at it, scan for any
+  // duplicates.  We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
+  //
+  // TODO: We could handle "X + ~X" -> "-1" if we wanted, since "-X = ~X+1".
+  //
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    Value *TheOp = Ops[i].Op;
+    // Check to see if we've seen this operand before.  If so, we factor all
+    // instances of the operand together.  Due to our sorting criteria, we know
+    // that these need to be next to each other in the vector.
+    if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
+      // Rescan the list, remove all instances of this operand from the expr.
+      unsigned NumFound = 0;
+      do {
+        Ops.erase(Ops.begin()+i);
+        ++NumFound;
+      } while (i != Ops.size() && Ops[i].Op == TheOp);
+
+      DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n');
+      ++NumFactor;
+
+      // Insert a new multiply.
+      Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound);
+      Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I);
+
+      // Now that we have inserted a multiply, optimize it. This allows us to
+      // handle cases that require multiple factoring steps, such as this:
+      // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
+      RedoInsts.insert(cast<Instruction>(Mul));
+
+      // If every add operand was a duplicate, return the multiply.
+      if (Ops.empty())
+        return Mul;
+
+      // Otherwise, we had some input that didn't have the dupe, such as
+      // "A + A + B" -> "A*2 + B".  Add the new multiply to the list of
+      // things being added by this operation.
+      Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
+
+      --i;
+      e = Ops.size();
+      continue;
+    }
+
+    // Check for X and -X in the operand list.
+    if (!BinaryOperator::isNeg(TheOp))
+      continue;
+
+    Value *X = BinaryOperator::getNegArgument(TheOp);
+    unsigned FoundX = FindInOperandList(Ops, i, X);
+    if (FoundX == i)
+      continue;
+
+    // Remove X and -X from the operand list.
+    if (Ops.size() == 2)
+      return Constant::getNullValue(X->getType());
+
+    Ops.erase(Ops.begin()+i);
+    if (i < FoundX)
+      --FoundX;
+    else
+      --i;   // Need to back up an extra one.
+    Ops.erase(Ops.begin()+FoundX);
+    ++NumAnnihil;
+    --i;     // Revisit element.
+    e -= 2;  // Removed two elements.
+  }
+
+  // Scan the operand list, checking to see if there are any common factors
+  // between operands.  Consider something like A*A+A*B*C+D.  We would like to
+  // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+  // To efficiently find this, we count the number of times a factor occurs
+  // for any ADD operands that are MULs.
+  DenseMap<Value*, unsigned> FactorOccurrences;
+
+  // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
+  // where they are actually the same multiply.
+  unsigned MaxOcc = 0;
+  Value *MaxOccVal = 0;
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+    BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul);
+    if (!BOp)
+      continue;
+
+    // Compute all of the factors of this added value.
+    SmallVector<Value*, 8> Factors;
+    FindSingleUseMultiplyFactors(BOp, Factors, Ops);
+    assert(Factors.size() > 1 && "Bad linearize!");
+
+    // Add one to FactorOccurrences for each unique factor in this op.
+    SmallPtrSet<Value*, 8> Duplicates;
+    for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+      Value *Factor = Factors[i];
+      if (!Duplicates.insert(Factor)) continue;
+
+      unsigned Occ = ++FactorOccurrences[Factor];
+      if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
+
+      // If Factor is a negative constant, add the negated value as a factor
+      // because we can percolate the negate out.  Watch for minint, which
+      // cannot be positivified.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor))
+        if (CI->isNegative() && !CI->isMinValue(true)) {
+          Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
+          assert(!Duplicates.count(Factor) &&
+                 "Shouldn't have two constant factors, missed a canonicalize");
+
+          unsigned Occ = ++FactorOccurrences[Factor];
+          if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; }
+        }
+    }
+  }
+
+  // If any factor occurred more than one time, we can pull it out.
+  if (MaxOcc > 1) {
+    DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n');
+    ++NumFactor;
+
+    // Create a new instruction that uses the MaxOccVal twice.  If we don't do
+    // this, we could otherwise run into situations where removing a factor
+    // from an expression will drop a use of maxocc, and this can cause
+    // RemoveFactorFromExpression on successive values to behave differently.
+    Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal);
+    SmallVector<WeakVH, 4> NewMulOps;
+    for (unsigned i = 0; i != Ops.size(); ++i) {
+      // Only try to remove factors from expressions we're allowed to.
+      BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul);
+      if (!BOp)
+        continue;
+
+      if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+        // The factorized operand may occur several times.  Convert them all in
+        // one fell swoop.
+        for (unsigned j = Ops.size(); j != i;) {
+          --j;
+          if (Ops[j].Op == Ops[i].Op) {
+            NewMulOps.push_back(V);
+            Ops.erase(Ops.begin()+j);
+          }
+        }
+        --i;
+      }
+    }
+
+    // No need for extra uses anymore.
+    delete DummyInst;
+
+    unsigned NumAddedValues = NewMulOps.size();
+    Value *V = EmitAddTreeOfValues(I, NewMulOps);
+
+    // Now that we have inserted the add tree, optimize it. This allows us to
+    // handle cases that require multiple factoring steps, such as this:
+    // A*A*B + A*A*C   -->   A*(A*B+A*C)   -->   A*(A*(B+C))
+    assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
+    (void)NumAddedValues;
+    if (Instruction *VI = dyn_cast<Instruction>(V))
+      RedoInsts.insert(VI);
+
+    // Create the multiply.
+    Instruction *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I);
+
+    // Rerun associate on the multiply in case the inner expression turned into
+    // a multiply.  We want to make sure that we keep things in canonical form.
+    RedoInsts.insert(V2);
+
+    // If every add operand included the factor (e.g. "A*B + A*C"), then the
+    // entire result expression is just the multiply "A*(B+C)".
+    if (Ops.empty())
+      return V2;
+
+    // Otherwise, we had some input that didn't have the factor, such as
+    // "A*B + A*C + D" -> "A*(B+C) + D".  Add the new multiply to the list of
+    // things being added by this operation.
+    Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+  }
+
+  return 0;
+}
+
+namespace {
+  /// \brief Predicate tests whether a ValueEntry's op is in a map.
+  struct IsValueInMap {
+    const DenseMap<Value *, unsigned> &Map;
+
+    IsValueInMap(const DenseMap<Value *, unsigned> &Map) : Map(Map) {}
+
+    bool operator()(const ValueEntry &Entry) {
+      return Map.find(Entry.Op) != Map.end();
+    }
+  };
+}
+
+/// \brief Build up a vector of value/power pairs factoring a product.
+///
+/// Given a series of multiplication operands, build a vector of factors and
+/// the powers each is raised to when forming the final product. Sort them in
+/// the order of descending power.
+///
+///      (x*x)          -> [(x, 2)]
+///     ((x*x)*x)       -> [(x, 3)]
+///   ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
+///
+/// \returns Whether any factors have a power greater than one.
+bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+                                         SmallVectorImpl<Factor> &Factors) {
+  // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
+  // Compute the sum of powers of simplifiable factors.
+  unsigned FactorPowerSum = 0;
+  for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
+    Value *Op = Ops[Idx-1].Op;
+
+    // Count the number of occurrences of this value.
+    unsigned Count = 1;
+    for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
+      ++Count;
+    // Track for simplification all factors which occur 2 or more times.
+    if (Count > 1)
+      FactorPowerSum += Count;
+  }
+
+  // We can only simplify factors if the sum of the powers of our simplifiable
+  // factors is 4 or higher. When that is the case, we will *always* have
+  // a simplification. This is an important invariant to prevent cyclicly
+  // trying to simplify already minimal formations.
+  if (FactorPowerSum < 4)
+    return false;
+
+  // Now gather the simplifiable factors, removing them from Ops.
+  FactorPowerSum = 0;
+  for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
+    Value *Op = Ops[Idx-1].Op;
+
+    // Count the number of occurrences of this value.
+    unsigned Count = 1;
+    for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
+      ++Count;
+    if (Count == 1)
+      continue;
+    // Move an even number of occurrences to Factors.
+    Count &= ~1U;
+    Idx -= Count;
+    FactorPowerSum += Count;
+    Factors.push_back(Factor(Op, Count));
+    Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
+  }
+
+  // None of the adjustments above should have reduced the sum of factor powers
+  // below our mininum of '4'.
+  assert(FactorPowerSum >= 4);
+
+  std::sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter());
+  return true;
+}
+
+/// \brief Build a tree of multiplies, computing the product of Ops.
+static Value *buildMultiplyTree(IRBuilder<> &Builder,
+                                SmallVectorImpl<Value*> &Ops) {
+  if (Ops.size() == 1)
+    return Ops.back();
+
+  Value *LHS = Ops.pop_back_val();
+  do {
+    LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+  } while (!Ops.empty());
+
+  return LHS;
+}
+
+/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+///
+/// Given a vector of values raised to various powers, where no two values are
+/// equal and the powers are sorted in decreasing order, compute the minimal
+/// DAG of multiplies to compute the final product, and return that product
+/// value.
+Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+                                            SmallVectorImpl<Factor> &Factors) {
+  assert(Factors[0].Power);
+  SmallVector<Value *, 4> OuterProduct;
+  for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
+       Idx < Size && Factors[Idx].Power > 0; ++Idx) {
+    if (Factors[Idx].Power != Factors[LastIdx].Power) {
+      LastIdx = Idx;
+      continue;
+    }
+
+    // We want to multiply across all the factors with the same power so that
+    // we can raise them to that power as a single entity. Build a mini tree
+    // for that.
+    SmallVector<Value *, 4> InnerProduct;
+    InnerProduct.push_back(Factors[LastIdx].Base);
+    do {
+      InnerProduct.push_back(Factors[Idx].Base);
+      ++Idx;
+    } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
+
+    // Reset the base value of the first factor to the new expression tree.
+    // We'll remove all the factors with the same power in a second pass.
+    Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
+    if (Instruction *MI = dyn_cast<Instruction>(M))
+      RedoInsts.insert(MI);
+
+    LastIdx = Idx;
+  }
+  // Unique factors with equal powers -- we've folded them into the first one's
+  // base.
+  Factors.erase(std::unique(Factors.begin(), Factors.end(),
+                            Factor::PowerEqual()),
+                Factors.end());
+
+  // Iteratively collect the base of each factor with an add power into the
+  // outer product, and halve each power in preparation for squaring the
+  // expression.
+  for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
+    if (Factors[Idx].Power & 1)
+      OuterProduct.push_back(Factors[Idx].Base);
+    Factors[Idx].Power >>= 1;
+  }
+  if (Factors[0].Power) {
+    Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
+    OuterProduct.push_back(SquareRoot);
+    OuterProduct.push_back(SquareRoot);
+  }
+  if (OuterProduct.size() == 1)
+    return OuterProduct.front();
+
+  Value *V = buildMultiplyTree(Builder, OuterProduct);
+  return V;
+}
+
+Value *Reassociate::OptimizeMul(BinaryOperator *I,
+                                SmallVectorImpl<ValueEntry> &Ops) {
+  // We can only optimize the multiplies when there is a chain of more than
+  // three, such that a balanced tree might require fewer total multiplies.
+  if (Ops.size() < 4)
+    return 0;
+
+  // Try to turn linear trees of multiplies without other uses of the
+  // intermediate stages into minimal multiply DAGs with perfect sub-expression
+  // re-use.
+  SmallVector<Factor, 4> Factors;
+  if (!collectMultiplyFactors(Ops, Factors))
+    return 0; // All distinct factors, so nothing left for us to do.
+
+  IRBuilder<> Builder(I);
+  Value *V = buildMinimalMultiplyDAG(Builder, Factors);
+  if (Ops.empty())
+    return V;
+
+  ValueEntry NewEntry = ValueEntry(getRank(V), V);
+  Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry);
+  return 0;
+}
+
+Value *Reassociate::OptimizeExpression(BinaryOperator *I,
+                                       SmallVectorImpl<ValueEntry> &Ops) {
+  // Now that we have the linearized expression tree, try to optimize it.
+  // Start by folding any constants that we found.
+  Constant *Cst = 0;
+  unsigned Opcode = I->getOpcode();
+  while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
+    Constant *C = cast<Constant>(Ops.pop_back_val().Op);
+    Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
+  }
+  // If there was nothing but constants then we are done.
+  if (Ops.empty())
+    return Cst;
+
+  // Put the combined constant back at the end of the operand list, except if
+  // there is no point.  For example, an add of 0 gets dropped here, while a
+  // multiplication by zero turns the whole expression into zero.
+  if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
+    if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
+      return Cst;
+    Ops.push_back(ValueEntry(0, Cst));
+  }
+
+  if (Ops.size() == 1) return Ops[0].Op;
+
+  // Handle destructive annihilation due to identities between elements in the
+  // argument list here.
+  unsigned NumOps = Ops.size();
+  switch (Opcode) {
+  default: break;
+  case Instruction::And:
+  case Instruction::Or:
+    if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
+      return Result;
+    break;
+
+  case Instruction::Xor:
+    if (Value *Result = OptimizeXor(I, Ops))
+      return Result;
+    break;
+
+  case Instruction::Add:
+    if (Value *Result = OptimizeAdd(I, Ops))
+      return Result;
+    break;
+
+  case Instruction::Mul:
+    if (Value *Result = OptimizeMul(I, Ops))
+      return Result;
+    break;
+  }
+
+  if (Ops.size() != NumOps)
+    return OptimizeExpression(I, Ops);
+  return 0;
+}
+
+/// EraseInst - Zap the given instruction, adding interesting operands to the
+/// work list.
+void Reassociate::EraseInst(Instruction *I) {
+  assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+  SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
+  // Erase the dead instruction.
+  ValueRankMap.erase(I);
+  RedoInsts.remove(I);
+  I->eraseFromParent();
+  // Optimize its operands.
+  SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
+      // If this is a node in an expression tree, climb to the expression root
+      // and add that since that's where optimization actually happens.
+      unsigned Opcode = Op->getOpcode();
+      while (Op->hasOneUse() && Op->use_back()->getOpcode() == Opcode &&
+             Visited.insert(Op))
+        Op = Op->use_back();
+      RedoInsts.insert(Op);
+    }
+}
+
+/// OptimizeInst - Inspect and optimize the given instruction. Note that erasing
+/// instructions is not allowed.
+void Reassociate::OptimizeInst(Instruction *I) {
+  // Only consider operations that we understand.
+  if (!isa<BinaryOperator>(I))
+    return;
+
+  if (I->getOpcode() == Instruction::Shl &&
+      isa<ConstantInt>(I->getOperand(1)))
+    // If an operand of this shift is a reassociable multiply, or if the shift
+    // is used by a reassociable multiply or add, turn into a multiply.
+    if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
+        (I->hasOneUse() &&
+         (isReassociableOp(I->use_back(), Instruction::Mul) ||
+          isReassociableOp(I->use_back(), Instruction::Add)))) {
+      Instruction *NI = ConvertShiftToMul(I);
+      RedoInsts.insert(I);
+      MadeChange = true;
+      I = NI;
+    }
+
+  // Floating point binary operators are not associative, but we can still
+  // commute (some) of them, to canonicalize the order of their operands.
+  // This can potentially expose more CSE opportunities, and makes writing
+  // other transformations simpler.
+  if ((I->getType()->isFloatingPointTy() || I->getType()->isVectorTy())) {
+    // FAdd and FMul can be commuted.
+    if (I->getOpcode() != Instruction::FMul &&
+        I->getOpcode() != Instruction::FAdd)
+      return;
+
+    Value *LHS = I->getOperand(0);
+    Value *RHS = I->getOperand(1);
+    unsigned LHSRank = getRank(LHS);
+    unsigned RHSRank = getRank(RHS);
+
+    // Sort the operands by rank.
+    if (RHSRank < LHSRank) {
+      I->setOperand(0, RHS);
+      I->setOperand(1, LHS);
+    }
+
+    return;
+  }
+
+  // Do not reassociate boolean (i1) expressions.  We want to preserve the
+  // original order of evaluation for short-circuited comparisons that
+  // SimplifyCFG has folded to AND/OR expressions.  If the expression
+  // is not further optimized, it is likely to be transformed back to a
+  // short-circuited form for code gen, and the source order may have been
+  // optimized for the most likely conditions.
+  if (I->getType()->isIntegerTy(1))
+    return;
+
+  // If this is a subtract instruction which is not already in negate form,
+  // see if we can convert it to X+-Y.
+  if (I->getOpcode() == Instruction::Sub) {
+    if (ShouldBreakUpSubtract(I)) {
+      Instruction *NI = BreakUpSubtract(I);
+      RedoInsts.insert(I);
+      MadeChange = true;
+      I = NI;
+    } else if (BinaryOperator::isNeg(I)) {
+      // Otherwise, this is a negation.  See if the operand is a multiply tree
+      // and if this is not an inner node of a multiply tree.
+      if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
+          (!I->hasOneUse() ||
+           !isReassociableOp(I->use_back(), Instruction::Mul))) {
+        Instruction *NI = LowerNegateToMultiply(I);
+        RedoInsts.insert(I);
+        MadeChange = true;
+        I = NI;
+      }
+    }
+  }
+
+  // If this instruction is an associative binary operator, process it.
+  if (!I->isAssociative()) return;
+  BinaryOperator *BO = cast<BinaryOperator>(I);
+
+  // If this is an interior node of a reassociable tree, ignore it until we
+  // get to the root of the tree, to avoid N^2 analysis.
+  unsigned Opcode = BO->getOpcode();
+  if (BO->hasOneUse() && BO->use_back()->getOpcode() == Opcode)
+    return;
+
+  // If this is an add tree that is used by a sub instruction, ignore it
+  // until we process the subtract.
+  if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
+      cast<Instruction>(BO->use_back())->getOpcode() == Instruction::Sub)
+    return;
+
+  ReassociateExpression(BO);
+}
+
+void Reassociate::ReassociateExpression(BinaryOperator *I) {
+
+  // First, walk the expression tree, linearizing the tree, collecting the
+  // operand information.
+  SmallVector<RepeatedValue, 8> Tree;
+  MadeChange |= LinearizeExprTree(I, Tree);
+  SmallVector<ValueEntry, 8> Ops;
+  Ops.reserve(Tree.size());
+  for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+    RepeatedValue E = Tree[i];
+    Ops.append(E.second.getZExtValue(),
+               ValueEntry(getRank(E.first), E.first));
+  }
+
+  DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+  // Now that we have linearized the tree to a list and have gathered all of
+  // the operands and their ranks, sort the operands by their rank.  Use a
+  // stable_sort so that values with equal ranks will have their relative
+  // positions maintained (and so the compiler is deterministic).  Note that
+  // this sorts so that the highest ranking values end up at the beginning of
+  // the vector.
+  std::stable_sort(Ops.begin(), Ops.end());
+
+  // OptimizeExpression - Now that we have the expression tree in a convenient
+  // sorted form, optimize it globally if possible.
+  if (Value *V = OptimizeExpression(I, Ops)) {
+    if (V == I)
+      // Self-referential expression in unreachable code.
+      return;
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+    I->replaceAllUsesWith(V);
+    if (Instruction *VI = dyn_cast<Instruction>(V))
+      VI->setDebugLoc(I->getDebugLoc());
+    RedoInsts.insert(I);
+    ++NumAnnihil;
+    return;
+  }
+
+  // We want to sink immediates as deeply as possible except in the case where
+  // this is a multiply tree used only by an add, and the immediate is a -1.
+  // In this case we reassociate to put the negation on the outside so that we
+  // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+  if (I->getOpcode() == Instruction::Mul && I->hasOneUse() &&
+      cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Ops.back().Op) &&
+      cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+    ValueEntry Tmp = Ops.pop_back_val();
+    Ops.insert(Ops.begin(), Tmp);
+  }
+
+  DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+  if (Ops.size() == 1) {
+    if (Ops[0].Op == I)
+      // Self-referential expression in unreachable code.
+      return;
+
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    I->replaceAllUsesWith(Ops[0].Op);
+    if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
+      OI->setDebugLoc(I->getDebugLoc());
+    RedoInsts.insert(I);
+    return;
+  }
+
+  // Now that we ordered and optimized the expressions, splat them back into
+  // the expression tree, removing any unneeded nodes.
+  RewriteExprTree(I, Ops);
+}
+
+bool Reassociate::runOnFunction(Function &F) {
+  // Calculate the rank map for F
+  BuildRankMap(F);
+
+  MadeChange = false;
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
+    // Optimize every instruction in the basic block.
+    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; )
+      if (isInstructionTriviallyDead(II)) {
+        EraseInst(II++);
+      } else {
+        OptimizeInst(II);
+        assert(II->getParent() == BI && "Moved to a different block!");
+        ++II;
+      }
+
+    // If this produced extra instructions to optimize, handle them now.
+    while (!RedoInsts.empty()) {
+      Instruction *I = RedoInsts.pop_back_val();
+      if (isInstructionTriviallyDead(I))
+        EraseInst(I);
+      else
+        OptimizeInst(I);
+    }
+  }
+
+  // We are done with the rank map.
+  RankMap.clear();
+  ValueRankMap.clear();
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
new file mode 100644
index 000000000000..07f540a30127
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -0,0 +1,133 @@
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references.  It is intended to be
+// the inverse of PromoteMemoryToRegister.  By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reg2mem"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+using namespace llvm;
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
+namespace {
+  struct RegToMem : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    RegToMem() : FunctionPass(ID) {
+      initializeRegToMemPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addPreservedID(BreakCriticalEdgesID);
+    }
+
+   bool valueEscapes(const Instruction *Inst) const {
+     const BasicBlock *BB = Inst->getParent();
+      for (Value::const_use_iterator UI = Inst->use_begin(),E = Inst->use_end();
+           UI != E; ++UI) {
+        const Instruction *I = cast<Instruction>(*UI);
+        if (I->getParent() != BB || isa<PHINode>(I))
+          return true;
+      }
+      return false;
+    }
+
+    virtual bool runOnFunction(Function &F);
+  };
+}
+
+char RegToMem::ID = 0;
+INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots",
+                false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots",
+                false, false)
+
+bool RegToMem::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  // Insert all new allocas into entry block.
+  BasicBlock *BBEntry = &F.getEntryBlock();
+  assert(pred_begin(BBEntry) == pred_end(BBEntry) &&
+         "Entry block to function must not have predecessors!");
+
+  // Find first non-alloca instruction and create insertion point. This is
+  // safe if block is well-formed: it always have terminator, otherwise
+  // we'll get and assertion.
+  BasicBlock::iterator I = BBEntry->begin();
+  while (isa<AllocaInst>(I)) ++I;
+
+  CastInst *AllocaInsertionPoint =
+    new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+                    Type::getInt32Ty(F.getContext()),
+                    "reg2mem alloca point", I);
+
+  // Find the escaped instructions. But don't create stack slots for
+  // allocas in entry block.
+  std::list<Instruction*> WorkList;
+  for (Function::iterator ibb = F.begin(), ibe = F.end();
+       ibb != ibe; ++ibb)
+    for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+         iib != iie; ++iib) {
+      if (!(isa<AllocaInst>(iib) && iib->getParent() == BBEntry) &&
+          valueEscapes(iib)) {
+        WorkList.push_front(&*iib);
+      }
+    }
+
+  // Demote escaped instructions
+  NumRegsDemoted += WorkList.size();
+  for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
+       ile = WorkList.end(); ilb != ile; ++ilb)
+    DemoteRegToStack(**ilb, false, AllocaInsertionPoint);
+
+  WorkList.clear();
+
+  // Find all phi's
+  for (Function::iterator ibb = F.begin(), ibe = F.end();
+       ibb != ibe; ++ibb)
+    for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+         iib != iie; ++iib)
+      if (isa<PHINode>(iib))
+        WorkList.push_front(&*iib);
+
+  // Demote phi nodes
+  NumPhisDemoted += WorkList.size();
+  for (std::list<Instruction*>::iterator ilb = WorkList.begin(),
+       ile = WorkList.end(); ilb != ile; ++ilb)
+    DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint);
+
+  return true;
+}
+
+
+// createDemoteRegisterToMemory - Provide an entry point to create this pass.
+char &llvm::DemoteRegisterToMemoryID = RegToMem::ID;
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+  return new RegToMem();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
new file mode 100644
index 000000000000..e30a2746b01e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -0,0 +1,1936 @@
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+//   * Assumes values are constant unless proven otherwise
+//   * Assumes BasicBlocks are dead unless proven otherwise
+//   * Proves values to be constant, and replaces them with constants
+//   * Proves conditional branches to be unconditional
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sccp"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+
+namespace {
+/// LatticeVal class - This class represents the different lattice values that
+/// an LLVM value may occupy.  It is a simple class with value semantics.
+///
+class LatticeVal {
+  enum LatticeValueTy {
+    /// undefined - This LLVM Value has no known value yet.
+    undefined,
+
+    /// constant - This LLVM Value has a specific constant value.
+    constant,
+
+    /// forcedconstant - This LLVM Value was thought to be undef until
+    /// ResolvedUndefsIn.  This is treated just like 'constant', but if merged
+    /// with another (different) constant, it goes to overdefined, instead of
+    /// asserting.
+    forcedconstant,
+
+    /// overdefined - This instruction is not known to be constant, and we know
+    /// it has a value.
+    overdefined
+  };
+
+  /// Val: This stores the current lattice value along with the Constant* for
+  /// the constant if this is a 'constant' or 'forcedconstant' value.
+  PointerIntPair<Constant *, 2, LatticeValueTy> Val;
+
+  LatticeValueTy getLatticeValue() const {
+    return Val.getInt();
+  }
+
+public:
+  LatticeVal() : Val(0, undefined) {}
+
+  bool isUndefined() const { return getLatticeValue() == undefined; }
+  bool isConstant() const {
+    return getLatticeValue() == constant || getLatticeValue() == forcedconstant;
+  }
+  bool isOverdefined() const { return getLatticeValue() == overdefined; }
+
+  Constant *getConstant() const {
+    assert(isConstant() && "Cannot get the constant of a non-constant!");
+    return Val.getPointer();
+  }
+
+  /// markOverdefined - Return true if this is a change in status.
+  bool markOverdefined() {
+    if (isOverdefined())
+      return false;
+
+    Val.setInt(overdefined);
+    return true;
+  }
+
+  /// markConstant - Return true if this is a change in status.
+  bool markConstant(Constant *V) {
+    if (getLatticeValue() == constant) { // Constant but not forcedconstant.
+      assert(getConstant() == V && "Marking constant with different value");
+      return false;
+    }
+
+    if (isUndefined()) {
+      Val.setInt(constant);
+      assert(V && "Marking constant with NULL");
+      Val.setPointer(V);
+    } else {
+      assert(getLatticeValue() == forcedconstant &&
+             "Cannot move from overdefined to constant!");
+      // Stay at forcedconstant if the constant is the same.
+      if (V == getConstant()) return false;
+
+      // Otherwise, we go to overdefined.  Assumptions made based on the
+      // forced value are possibly wrong.  Assuming this is another constant
+      // could expose a contradiction.
+      Val.setInt(overdefined);
+    }
+    return true;
+  }
+
+  /// getConstantInt - If this is a constant with a ConstantInt value, return it
+  /// otherwise return null.
+  ConstantInt *getConstantInt() const {
+    if (isConstant())
+      return dyn_cast<ConstantInt>(getConstant());
+    return 0;
+  }
+
+  void markForcedConstant(Constant *V) {
+    assert(isUndefined() && "Can't force a defined value!");
+    Val.setInt(forcedconstant);
+    Val.setPointer(V);
+  }
+};
+} // end anonymous namespace.
+
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+  const DataLayout *TD;
+  const TargetLibraryInfo *TLI;
+  SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
+  DenseMap<Value*, LatticeVal> ValueState;  // The state each value is in.
+
+  /// StructValueState - This maintains ValueState for values that have
+  /// StructType, for example for formal arguments, calls, insertelement, etc.
+  ///
+  DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState;
+
+  /// GlobalValue - If we are tracking any values for the contents of a global
+  /// variable, we keep a mapping from the constant accessor to the element of
+  /// the global, to the currently known value.  If the value becomes
+  /// overdefined, it's entry is simply removed from this map.
+  DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals;
+
+  /// TrackedRetVals - If we are tracking arguments into and the return
+  /// value out of a function, it will have an entry in this map, indicating
+  /// what the known return value for the function is.
+  DenseMap<Function*, LatticeVal> TrackedRetVals;
+
+  /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
+  /// that return multiple values.
+  DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals;
+
+  /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
+  /// represented here for efficient lookup.
+  SmallPtrSet<Function*, 16> MRVFunctionsTracked;
+
+  /// TrackingIncomingArguments - This is the set of functions for whose
+  /// arguments we make optimistic assumptions about and try to prove as
+  /// constants.
+  SmallPtrSet<Function*, 16> TrackingIncomingArguments;
+
+  /// The reason for two worklists is that overdefined is the lowest state
+  /// on the lattice, and moving things to overdefined as fast as possible
+  /// makes SCCP converge much faster.
+  ///
+  /// By having a separate worklist, we accomplish this because everything
+  /// possibly overdefined will become overdefined at the soonest possible
+  /// point.
+  SmallVector<Value*, 64> OverdefinedInstWorkList;
+  SmallVector<Value*, 64> InstWorkList;
+
+
+  SmallVector<BasicBlock*, 64>  BBWorkList;  // The BasicBlock work list
+
+  /// KnownFeasibleEdges - Entries in this set are edges which have already had
+  /// PHI nodes retriggered.
+  typedef std::pair<BasicBlock*, BasicBlock*> Edge;
+  DenseSet<Edge> KnownFeasibleEdges;
+public:
+  SCCPSolver(const DataLayout *td, const TargetLibraryInfo *tli)
+    : TD(td), TLI(tli) {}
+
+  /// MarkBlockExecutable - This method can be used by clients to mark all of
+  /// the blocks that are known to be intrinsically live in the processed unit.
+  ///
+  /// This returns true if the block was not considered live before.
+  bool MarkBlockExecutable(BasicBlock *BB) {
+    if (!BBExecutable.insert(BB)) return false;
+    DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n");
+    BBWorkList.push_back(BB);  // Add the block to the work list!
+    return true;
+  }
+
+  /// TrackValueOfGlobalVariable - Clients can use this method to
+  /// inform the SCCPSolver that it should track loads and stores to the
+  /// specified global variable if it can.  This is only legal to call if
+  /// performing Interprocedural SCCP.
+  void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+    // We only track the contents of scalar globals.
+    if (GV->getType()->getElementType()->isSingleValueType()) {
+      LatticeVal &IV = TrackedGlobals[GV];
+      if (!isa<UndefValue>(GV->getInitializer()))
+        IV.markConstant(GV->getInitializer());
+    }
+  }
+
+  /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
+  /// and out of the specified function (which cannot have its address taken),
+  /// this method must be called.
+  void AddTrackedFunction(Function *F) {
+    // Add an entry, F -> undef.
+    if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+      MRVFunctionsTracked.insert(F);
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
+                                                     LatticeVal()));
+    } else
+      TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
+  }
+
+  void AddArgumentTrackedFunction(Function *F) {
+    TrackingIncomingArguments.insert(F);
+  }
+
+  /// Solve - Solve for constants and executable blocks.
+  ///
+  void Solve();
+
+  /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+  /// that branches on undef values cannot reach any of their successors.
+  /// However, this is not a safe assumption.  After we solve dataflow, this
+  /// method should be use to handle this.  If this returns true, the solver
+  /// should be rerun.
+  bool ResolvedUndefsIn(Function &F);
+
+  bool isBlockExecutable(BasicBlock *BB) const {
+    return BBExecutable.count(BB);
+  }
+
+  LatticeVal getLatticeValueFor(Value *V) const {
+    DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
+    assert(I != ValueState.end() && "V is not in valuemap!");
+    return I->second;
+  }
+
+  /// getTrackedRetVals - Get the inferred return value map.
+  ///
+  const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
+    return TrackedRetVals;
+  }
+
+  /// getTrackedGlobals - Get and return the set of inferred initializers for
+  /// global variables.
+  const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() {
+    return TrackedGlobals;
+  }
+
+  void markOverdefined(Value *V) {
+    assert(!V->getType()->isStructTy() && "Should use other method");
+    markOverdefined(ValueState[V], V);
+  }
+
+  /// markAnythingOverdefined - Mark the specified value overdefined.  This
+  /// works with both scalars and structs.
+  void markAnythingOverdefined(Value *V) {
+    if (StructType *STy = dyn_cast<StructType>(V->getType()))
+      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+        markOverdefined(getStructValueState(V, i), V);
+    else
+      markOverdefined(V);
+  }
+
+private:
+  // markConstant - Make a value be marked as "constant".  If the value
+  // is not already a constant, add it to the instruction work list so that
+  // the users of the instruction are updated later.
+  //
+  void markConstant(LatticeVal &IV, Value *V, Constant *C) {
+    if (!IV.markConstant(C)) return;
+    DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+    if (IV.isOverdefined())
+      OverdefinedInstWorkList.push_back(V);
+    else
+      InstWorkList.push_back(V);
+  }
+
+  void markConstant(Value *V, Constant *C) {
+    assert(!V->getType()->isStructTy() && "Should use other method");
+    markConstant(ValueState[V], V, C);
+  }
+
+  void markForcedConstant(Value *V, Constant *C) {
+    assert(!V->getType()->isStructTy() && "Should use other method");
+    LatticeVal &IV = ValueState[V];
+    IV.markForcedConstant(C);
+    DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
+    if (IV.isOverdefined())
+      OverdefinedInstWorkList.push_back(V);
+    else
+      InstWorkList.push_back(V);
+  }
+
+
+  // markOverdefined - Make a value be marked as "overdefined". If the
+  // value is not already overdefined, add it to the overdefined instruction
+  // work list so that the users of the instruction are updated later.
+  void markOverdefined(LatticeVal &IV, Value *V) {
+    if (!IV.markOverdefined()) return;
+
+    DEBUG(dbgs() << "markOverdefined: ";
+          if (Function *F = dyn_cast<Function>(V))
+            dbgs() << "Function '" << F->getName() << "'\n";
+          else
+            dbgs() << *V << '\n');
+    // Only instructions go on the work list
+    OverdefinedInstWorkList.push_back(V);
+  }
+
+  void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
+    if (IV.isOverdefined() || MergeWithV.isUndefined())
+      return;  // Noop.
+    if (MergeWithV.isOverdefined())
+      markOverdefined(IV, V);
+    else if (IV.isUndefined())
+      markConstant(IV, V, MergeWithV.getConstant());
+    else if (IV.getConstant() != MergeWithV.getConstant())
+      markOverdefined(IV, V);
+  }
+
+  void mergeInValue(Value *V, LatticeVal MergeWithV) {
+    assert(!V->getType()->isStructTy() && "Should use other method");
+    mergeInValue(ValueState[V], V, MergeWithV);
+  }
+
+
+  /// getValueState - Return the LatticeVal object that corresponds to the
+  /// value.  This function handles the case when the value hasn't been seen yet
+  /// by properly seeding constants etc.
+  LatticeVal &getValueState(Value *V) {
+    assert(!V->getType()->isStructTy() && "Should use getStructValueState");
+
+    std::pair<DenseMap<Value*, LatticeVal>::iterator, bool> I =
+      ValueState.insert(std::make_pair(V, LatticeVal()));
+    LatticeVal &LV = I.first->second;
+
+    if (!I.second)
+      return LV;  // Common case, already in the map.
+
+    if (Constant *C = dyn_cast<Constant>(V)) {
+      // Undef values remain undefined.
+      if (!isa<UndefValue>(V))
+        LV.markConstant(C);          // Constants are constant
+    }
+
+    // All others are underdefined by default.
+    return LV;
+  }
+
+  /// getStructValueState - Return the LatticeVal object that corresponds to the
+  /// value/field pair.  This function handles the case when the value hasn't
+  /// been seen yet by properly seeding constants etc.
+  LatticeVal &getStructValueState(Value *V, unsigned i) {
+    assert(V->getType()->isStructTy() && "Should use getValueState");
+    assert(i < cast<StructType>(V->getType())->getNumElements() &&
+           "Invalid element #");
+
+    std::pair<DenseMap<std::pair<Value*, unsigned>, LatticeVal>::iterator,
+              bool> I = StructValueState.insert(
+                        std::make_pair(std::make_pair(V, i), LatticeVal()));
+    LatticeVal &LV = I.first->second;
+
+    if (!I.second)
+      return LV;  // Common case, already in the map.
+
+    if (Constant *C = dyn_cast<Constant>(V)) {
+      Constant *Elt = C->getAggregateElement(i);
+
+      if (Elt == 0)
+        LV.markOverdefined();      // Unknown sort of constant.
+      else if (isa<UndefValue>(Elt))
+        ; // Undef values remain undefined.
+      else
+        LV.markConstant(Elt);      // Constants are constant.
+    }
+
+    // All others are underdefined by default.
+    return LV;
+  }
+
+
+  /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
+  /// work list if it is not already executable.
+  void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+    if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+      return;  // This edge is already known to be executable!
+
+    if (!MarkBlockExecutable(Dest)) {
+      // If the destination is already executable, we just made an *edge*
+      // feasible that wasn't before.  Revisit the PHI nodes in the block
+      // because they have potentially new operands.
+      DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+            << " -> " << Dest->getName() << "\n");
+
+      PHINode *PN;
+      for (BasicBlock::iterator I = Dest->begin();
+           (PN = dyn_cast<PHINode>(I)); ++I)
+        visitPHINode(*PN);
+    }
+  }
+
+  // getFeasibleSuccessors - Return a vector of booleans to indicate which
+  // successors are reachable from a given terminator instruction.
+  //
+  void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs);
+
+  // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+  // block to the 'To' basic block is currently feasible.
+  //
+  bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
+  // OperandChangedState - This method is invoked on all of the users of an
+  // instruction that was just changed state somehow.  Based on this
+  // information, we need to update the specified user of this instruction.
+  //
+  void OperandChangedState(Instruction *I) {
+    if (BBExecutable.count(I->getParent()))   // Inst is executable?
+      visit(*I);
+  }
+
+private:
+  friend class InstVisitor<SCCPSolver>;
+
+  // visit implementations - Something changed in this instruction.  Either an
+  // operand made a transition, or the instruction is newly executable.  Change
+  // the value type of I to reflect these changes if appropriate.
+  void visitPHINode(PHINode &I);
+
+  // Terminators
+  void visitReturnInst(ReturnInst &I);
+  void visitTerminatorInst(TerminatorInst &TI);
+
+  void visitCastInst(CastInst &I);
+  void visitSelectInst(SelectInst &I);
+  void visitBinaryOperator(Instruction &I);
+  void visitCmpInst(CmpInst &I);
+  void visitExtractElementInst(ExtractElementInst &I);
+  void visitInsertElementInst(InsertElementInst &I);
+  void visitShuffleVectorInst(ShuffleVectorInst &I);
+  void visitExtractValueInst(ExtractValueInst &EVI);
+  void visitInsertValueInst(InsertValueInst &IVI);
+  void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
+
+  // Instructions that cannot be folded away.
+  void visitStoreInst     (StoreInst &I);
+  void visitLoadInst      (LoadInst &I);
+  void visitGetElementPtrInst(GetElementPtrInst &I);
+  void visitCallInst      (CallInst &I) {
+    visitCallSite(&I);
+  }
+  void visitInvokeInst    (InvokeInst &II) {
+    visitCallSite(&II);
+    visitTerminatorInst(II);
+  }
+  void visitCallSite      (CallSite CS);
+  void visitResumeInst    (TerminatorInst &I) { /*returns void*/ }
+  void visitUnwindInst    (TerminatorInst &I) { /*returns void*/ }
+  void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+  void visitFenceInst     (FenceInst &I) { /*returns void*/ }
+  void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); }
+  void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
+  void visitAllocaInst    (Instruction &I) { markOverdefined(&I); }
+  void visitVAArgInst     (Instruction &I) { markAnythingOverdefined(&I); }
+
+  void visitInstruction(Instruction &I) {
+    // If a new instruction is added to LLVM that we don't handle.
+    dbgs() << "SCCP: Don't know how to handle: " << I;
+    markAnythingOverdefined(&I);   // Just in case
+  }
+};
+
+} // end anonymous namespace
+
+
+// getFeasibleSuccessors - Return a vector of booleans to indicate which
+// successors are reachable from a given terminator instruction.
+//
+void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+                                       SmallVector<bool, 16> &Succs) {
+  Succs.resize(TI.getNumSuccessors());
+  if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) {
+    if (BI->isUnconditional()) {
+      Succs[0] = true;
+      return;
+    }
+
+    LatticeVal BCValue = getValueState(BI->getCondition());
+    ConstantInt *CI = BCValue.getConstantInt();
+    if (CI == 0) {
+      // Overdefined condition variables, and branches on unfoldable constant
+      // conditions, mean the branch could go either way.
+      if (!BCValue.isUndefined())
+        Succs[0] = Succs[1] = true;
+      return;
+    }
+
+    // Constant condition variables mean the branch can only go a single way.
+    Succs[CI->isZero()] = true;
+    return;
+  }
+
+  if (isa<InvokeInst>(TI)) {
+    // Invoke instructions successors are always executable.
+    Succs[0] = Succs[1] = true;
+    return;
+  }
+
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) {
+    if (!SI->getNumCases()) {
+      Succs[0] = true;
+      return;
+    }
+    LatticeVal SCValue = getValueState(SI->getCondition());
+    ConstantInt *CI = SCValue.getConstantInt();
+
+    if (CI == 0) {   // Overdefined or undefined condition?
+      // All destinations are executable!
+      if (!SCValue.isUndefined())
+        Succs.assign(TI.getNumSuccessors(), true);
+      return;
+    }
+
+    Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true;
+    return;
+  }
+
+  // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
+  if (isa<IndirectBrInst>(&TI)) {
+    // Just mark all destinations executable!
+    Succs.assign(TI.getNumSuccessors(), true);
+    return;
+  }
+
+#ifndef NDEBUG
+  dbgs() << "Unknown terminator instruction: " << TI << '\n';
+#endif
+  llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
+
+
+// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+// block to the 'To' basic block is currently feasible.
+//
+bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
+  assert(BBExecutable.count(To) && "Dest should always be alive!");
+
+  // Make sure the source basic block is executable!!
+  if (!BBExecutable.count(From)) return false;
+
+  // Check to make sure this edge itself is actually feasible now.
+  TerminatorInst *TI = From->getTerminator();
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isUnconditional())
+      return true;
+
+    LatticeVal BCValue = getValueState(BI->getCondition());
+
+    // Overdefined condition variables mean the branch could go either way,
+    // undef conditions mean that neither edge is feasible yet.
+    ConstantInt *CI = BCValue.getConstantInt();
+    if (CI == 0)
+      return !BCValue.isUndefined();
+
+    // Constant condition variables mean the branch can only go a single way.
+    return BI->getSuccessor(CI->isZero()) == To;
+  }
+
+  // Invoke instructions successors are always executable.
+  if (isa<InvokeInst>(TI))
+    return true;
+
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    if (SI->getNumCases() < 1)
+      return true;
+
+    LatticeVal SCValue = getValueState(SI->getCondition());
+    ConstantInt *CI = SCValue.getConstantInt();
+
+    if (CI == 0)
+      return !SCValue.isUndefined();
+
+    return SI->findCaseValue(CI).getCaseSuccessor() == To;
+  }
+
+  // Just mark all destinations executable!
+  // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
+  if (isa<IndirectBrInst>(TI))
+    return true;
+
+#ifndef NDEBUG
+  dbgs() << "Unknown terminator instruction: " << *TI << '\n';
+#endif
+  llvm_unreachable(0);
+}
+
+// visit Implementations - Something changed in this instruction, either an
+// operand made a transition, or the instruction is newly executable.  Change
+// the value type of I to reflect these changes if appropriate.  This method
+// makes sure to do the following actions:
+//
+// 1. If a phi node merges two constants in, and has conflicting value coming
+//    from different branches, or if the PHI node merges in an overdefined
+//    value, then the PHI node becomes overdefined.
+// 2. If a phi node merges only constants in, and they all agree on value, the
+//    PHI node becomes a constant value equal to that.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+//    destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+//    successors executable.
+//
+void SCCPSolver::visitPHINode(PHINode &PN) {
+  // If this PN returns a struct, just mark the result overdefined.
+  // TODO: We could do a lot better than this if code actually uses this.
+  if (PN.getType()->isStructTy())
+    return markAnythingOverdefined(&PN);
+
+  if (getValueState(&PN).isOverdefined())
+    return;  // Quick exit
+
+  // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
+  // and slow us down a lot.  Just mark them overdefined.
+  if (PN.getNumIncomingValues() > 64)
+    return markOverdefined(&PN);
+
+  // Look at all of the executable operands of the PHI node.  If any of them
+  // are overdefined, the PHI becomes overdefined as well.  If they are all
+  // constant, and they agree with each other, the PHI becomes the identical
+  // constant.  If they are constant and don't agree, the PHI is overdefined.
+  // If there are no executable operands, the PHI remains undefined.
+  //
+  Constant *OperandVal = 0;
+  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+    LatticeVal IV = getValueState(PN.getIncomingValue(i));
+    if (IV.isUndefined()) continue;  // Doesn't influence PHI node.
+
+    if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+      continue;
+
+    if (IV.isOverdefined())    // PHI node becomes overdefined!
+      return markOverdefined(&PN);
+
+    if (OperandVal == 0) {   // Grab the first value.
+      OperandVal = IV.getConstant();
+      continue;
+    }
+
+    // There is already a reachable operand.  If we conflict with it,
+    // then the PHI node becomes overdefined.  If we agree with it, we
+    // can continue on.
+
+    // Check to see if there are two different constants merging, if so, the PHI
+    // node is overdefined.
+    if (IV.getConstant() != OperandVal)
+      return markOverdefined(&PN);
+  }
+
+  // If we exited the loop, this means that the PHI node only has constant
+  // arguments that agree with each other(and OperandVal is the constant) or
+  // OperandVal is null because there are no defined incoming arguments.  If
+  // this is the case, the PHI remains undefined.
+  //
+  if (OperandVal)
+    markConstant(&PN, OperandVal);      // Acquire operand value
+}
+
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+  if (I.getNumOperands() == 0) return;  // ret void
+
+  Function *F = I.getParent()->getParent();
+  Value *ResultOp = I.getOperand(0);
+
+  // If we are tracking the return value of this function, merge it in.
+  if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) {
+    DenseMap<Function*, LatticeVal>::iterator TFRVI =
+      TrackedRetVals.find(F);
+    if (TFRVI != TrackedRetVals.end()) {
+      mergeInValue(TFRVI->second, F, getValueState(ResultOp));
+      return;
+    }
+  }
+
+  // Handle functions that return multiple values.
+  if (!TrackedMultipleRetVals.empty()) {
+    if (StructType *STy = dyn_cast<StructType>(ResultOp->getType()))
+      if (MRVFunctionsTracked.count(F))
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+          mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
+                       getStructValueState(ResultOp, i));
+
+  }
+}
+
+void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+  SmallVector<bool, 16> SuccFeasible;
+  getFeasibleSuccessors(TI, SuccFeasible);
+
+  BasicBlock *BB = TI.getParent();
+
+  // Mark all feasible successors executable.
+  for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+    if (SuccFeasible[i])
+      markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SCCPSolver::visitCastInst(CastInst &I) {
+  LatticeVal OpSt = getValueState(I.getOperand(0));
+  if (OpSt.isOverdefined())          // Inherit overdefinedness of operand
+    markOverdefined(&I);
+  else if (OpSt.isConstant())        // Propagate constant value
+    markConstant(&I, ConstantExpr::getCast(I.getOpcode(),
+                                           OpSt.getConstant(), I.getType()));
+}
+
+
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+  // If this returns a struct, mark all elements over defined, we don't track
+  // structs in structs.
+  if (EVI.getType()->isStructTy())
+    return markAnythingOverdefined(&EVI);
+
+  // If this is extracting from more than one level of struct, we don't know.
+  if (EVI.getNumIndices() != 1)
+    return markOverdefined(&EVI);
+
+  Value *AggVal = EVI.getAggregateOperand();
+  if (AggVal->getType()->isStructTy()) {
+    unsigned i = *EVI.idx_begin();
+    LatticeVal EltVal = getStructValueState(AggVal, i);
+    mergeInValue(getValueState(&EVI), &EVI, EltVal);
+  } else {
+    // Otherwise, must be extracting from an array.
+    return markOverdefined(&EVI);
+  }
+}
+
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+  StructType *STy = dyn_cast<StructType>(IVI.getType());
+  if (STy == 0)
+    return markOverdefined(&IVI);
+
+  // If this has more than one index, we can't handle it, drive all results to
+  // undef.
+  if (IVI.getNumIndices() != 1)
+    return markAnythingOverdefined(&IVI);
+
+  Value *Aggr = IVI.getAggregateOperand();
+  unsigned Idx = *IVI.idx_begin();
+
+  // Compute the result based on what we're inserting.
+  for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+    // This passes through all values that aren't the inserted element.
+    if (i != Idx) {
+      LatticeVal EltVal = getStructValueState(Aggr, i);
+      mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal);
+      continue;
+    }
+
+    Value *Val = IVI.getInsertedValueOperand();
+    if (Val->getType()->isStructTy())
+      // We don't track structs in structs.
+      markOverdefined(getStructValueState(&IVI, i), &IVI);
+    else {
+      LatticeVal InVal = getValueState(Val);
+      mergeInValue(getStructValueState(&IVI, i), &IVI, InVal);
+    }
+  }
+}
+
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+  // If this select returns a struct, just mark the result overdefined.
+  // TODO: We could do a lot better than this if code actually uses this.
+  if (I.getType()->isStructTy())
+    return markAnythingOverdefined(&I);
+
+  LatticeVal CondValue = getValueState(I.getCondition());
+  if (CondValue.isUndefined())
+    return;
+
+  if (ConstantInt *CondCB = CondValue.getConstantInt()) {
+    Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
+    mergeInValue(&I, getValueState(OpVal));
+    return;
+  }
+
+  // Otherwise, the condition is overdefined or a constant we can't evaluate.
+  // See if we can produce something better than overdefined based on the T/F
+  // value.
+  LatticeVal TVal = getValueState(I.getTrueValue());
+  LatticeVal FVal = getValueState(I.getFalseValue());
+
+  // select ?, C, C -> C.
+  if (TVal.isConstant() && FVal.isConstant() &&
+      TVal.getConstant() == FVal.getConstant())
+    return markConstant(&I, FVal.getConstant());
+
+  if (TVal.isUndefined())   // select ?, undef, X -> X.
+    return mergeInValue(&I, FVal);
+  if (FVal.isUndefined())   // select ?, X, undef -> X.
+    return mergeInValue(&I, TVal);
+  markOverdefined(&I);
+}
+
+// Handle Binary Operators.
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+  LatticeVal V1State = getValueState(I.getOperand(0));
+  LatticeVal V2State = getValueState(I.getOperand(1));
+
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  if (V1State.isConstant() && V2State.isConstant())
+    return markConstant(IV, &I,
+                        ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
+                                          V2State.getConstant()));
+
+  // If something is undef, wait for it to resolve.
+  if (!V1State.isOverdefined() && !V2State.isOverdefined())
+    return;
+
+  // Otherwise, one of our operands is overdefined.  Try to produce something
+  // better than overdefined with some tricks.
+
+  // If this is an AND or OR with 0 or -1, it doesn't matter that the other
+  // operand is overdefined.
+  if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) {
+    LatticeVal *NonOverdefVal = 0;
+    if (!V1State.isOverdefined())
+      NonOverdefVal = &V1State;
+    else if (!V2State.isOverdefined())
+      NonOverdefVal = &V2State;
+
+    if (NonOverdefVal) {
+      if (NonOverdefVal->isUndefined()) {
+        // Could annihilate value.
+        if (I.getOpcode() == Instruction::And)
+          markConstant(IV, &I, Constant::getNullValue(I.getType()));
+        else if (VectorType *PT = dyn_cast<VectorType>(I.getType()))
+          markConstant(IV, &I, Constant::getAllOnesValue(PT));
+        else
+          markConstant(IV, &I,
+                       Constant::getAllOnesValue(I.getType()));
+        return;
+      }
+
+      if (I.getOpcode() == Instruction::And) {
+        // X and 0 = 0
+        if (NonOverdefVal->getConstant()->isNullValue())
+          return markConstant(IV, &I, NonOverdefVal->getConstant());
+      } else {
+        if (ConstantInt *CI = NonOverdefVal->getConstantInt())
+          if (CI->isAllOnesValue())     // X or -1 = -1
+            return markConstant(IV, &I, NonOverdefVal->getConstant());
+      }
+    }
+  }
+
+
+  markOverdefined(&I);
+}
+
+// Handle ICmpInst instruction.
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+  LatticeVal V1State = getValueState(I.getOperand(0));
+  LatticeVal V2State = getValueState(I.getOperand(1));
+
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  if (V1State.isConstant() && V2State.isConstant())
+    return markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(),
+                                                         V1State.getConstant(),
+                                                        V2State.getConstant()));
+
+  // If operands are still undefined, wait for it to resolve.
+  if (!V1State.isOverdefined() && !V2State.isOverdefined())
+    return;
+
+  markOverdefined(&I);
+}
+
+void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
+  // TODO : SCCP does not handle vectors properly.
+  return markOverdefined(&I);
+
+#if 0
+  LatticeVal &ValState = getValueState(I.getOperand(0));
+  LatticeVal &IdxState = getValueState(I.getOperand(1));
+
+  if (ValState.isOverdefined() || IdxState.isOverdefined())
+    markOverdefined(&I);
+  else if(ValState.isConstant() && IdxState.isConstant())
+    markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(),
+                                                     IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
+  // TODO : SCCP does not handle vectors properly.
+  return markOverdefined(&I);
+#if 0
+  LatticeVal &ValState = getValueState(I.getOperand(0));
+  LatticeVal &EltState = getValueState(I.getOperand(1));
+  LatticeVal &IdxState = getValueState(I.getOperand(2));
+
+  if (ValState.isOverdefined() || EltState.isOverdefined() ||
+      IdxState.isOverdefined())
+    markOverdefined(&I);
+  else if(ValState.isConstant() && EltState.isConstant() &&
+          IdxState.isConstant())
+    markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(),
+                                                    EltState.getConstant(),
+                                                    IdxState.getConstant()));
+  else if (ValState.isUndefined() && EltState.isConstant() &&
+           IdxState.isConstant())
+    markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()),
+                                                   EltState.getConstant(),
+                                                   IdxState.getConstant()));
+#endif
+}
+
+void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
+  // TODO : SCCP does not handle vectors properly.
+  return markOverdefined(&I);
+#if 0
+  LatticeVal &V1State   = getValueState(I.getOperand(0));
+  LatticeVal &V2State   = getValueState(I.getOperand(1));
+  LatticeVal &MaskState = getValueState(I.getOperand(2));
+
+  if (MaskState.isUndefined() ||
+      (V1State.isUndefined() && V2State.isUndefined()))
+    return;  // Undefined output if mask or both inputs undefined.
+
+  if (V1State.isOverdefined() || V2State.isOverdefined() ||
+      MaskState.isOverdefined()) {
+    markOverdefined(&I);
+  } else {
+    // A mix of constant/undef inputs.
+    Constant *V1 = V1State.isConstant() ?
+        V1State.getConstant() : UndefValue::get(I.getType());
+    Constant *V2 = V2State.isConstant() ?
+        V2State.getConstant() : UndefValue::get(I.getType());
+    Constant *Mask = MaskState.isConstant() ?
+      MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType());
+    markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask));
+  }
+#endif
+}
+
+// Handle getelementptr instructions.  If all operands are constants then we
+// can turn this into a getelementptr ConstantExpr.
+//
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+  if (ValueState[&I].isOverdefined()) return;
+
+  SmallVector<Constant*, 8> Operands;
+  Operands.reserve(I.getNumOperands());
+
+  for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+    LatticeVal State = getValueState(I.getOperand(i));
+    if (State.isUndefined())
+      return;  // Operands are not resolved yet.
+
+    if (State.isOverdefined())
+      return markOverdefined(&I);
+
+    assert(State.isConstant() && "Unknown state!");
+    Operands.push_back(State.getConstant());
+  }
+
+  Constant *Ptr = Operands[0];
+  ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end());
+  markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices));
+}
+
+void SCCPSolver::visitStoreInst(StoreInst &SI) {
+  // If this store is of a struct, ignore it.
+  if (SI.getOperand(0)->getType()->isStructTy())
+    return;
+
+  if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
+    return;
+
+  GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
+  DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV);
+  if (I == TrackedGlobals.end() || I->second.isOverdefined()) return;
+
+  // Get the value we are storing into the global, then merge it.
+  mergeInValue(I->second, GV, getValueState(SI.getOperand(0)));
+  if (I->second.isOverdefined())
+    TrackedGlobals.erase(I);      // No need to keep tracking this!
+}
+
+
+// Handle load instructions.  If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+  // If this load is of a struct, just mark the result overdefined.
+  if (I.getType()->isStructTy())
+    return markAnythingOverdefined(&I);
+
+  LatticeVal PtrVal = getValueState(I.getOperand(0));
+  if (PtrVal.isUndefined()) return;   // The pointer is not resolved yet!
+
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  if (!PtrVal.isConstant() || I.isVolatile())
+    return markOverdefined(IV, &I);
+
+  Constant *Ptr = PtrVal.getConstant();
+
+  // load null -> null
+  if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
+    return markConstant(IV, &I, Constant::getNullValue(I.getType()));
+
+  // Transform load (constant global) into the value loaded.
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+    if (!TrackedGlobals.empty()) {
+      // If we are tracking this global, merge in the known value for it.
+      DenseMap<GlobalVariable*, LatticeVal>::iterator It =
+        TrackedGlobals.find(GV);
+      if (It != TrackedGlobals.end()) {
+        mergeInValue(IV, &I, It->second);
+        return;
+      }
+    }
+  }
+
+  // Transform load from a constant into a constant if possible.
+  if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, TD))
+    return markConstant(IV, &I, C);
+
+  // Otherwise we cannot say for certain what value this load will produce.
+  // Bail out.
+  markOverdefined(IV, &I);
+}
+
+void SCCPSolver::visitCallSite(CallSite CS) {
+  Function *F = CS.getCalledFunction();
+  Instruction *I = CS.getInstruction();
+
+  // The common case is that we aren't tracking the callee, either because we
+  // are not doing interprocedural analysis or the callee is indirect, or is
+  // external.  Handle these cases first.
+  if (F == 0 || F->isDeclaration()) {
+CallOverdefined:
+    // Void return and not tracking callee, just bail.
+    if (I->getType()->isVoidTy()) return;
+
+    // Otherwise, if we have a single return value case, and if the function is
+    // a declaration, maybe we can constant fold it.
+    if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
+        canConstantFoldCallTo(F)) {
+
+      SmallVector<Constant*, 8> Operands;
+      for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
+           AI != E; ++AI) {
+        LatticeVal State = getValueState(*AI);
+
+        if (State.isUndefined())
+          return;  // Operands are not resolved yet.
+        if (State.isOverdefined())
+          return markOverdefined(I);
+        assert(State.isConstant() && "Unknown state!");
+        Operands.push_back(State.getConstant());
+      }
+
+      // If we can constant fold this, mark the result of the call as a
+      // constant.
+      if (Constant *C = ConstantFoldCall(F, Operands, TLI))
+        return markConstant(I, C);
+    }
+
+    // Otherwise, we don't know anything about this call, mark it overdefined.
+    return markAnythingOverdefined(I);
+  }
+
+  // If this is a local function that doesn't have its address taken, mark its
+  // entry block executable and merge in the actual arguments to the call into
+  // the formal arguments of the function.
+  if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){
+    MarkBlockExecutable(F->begin());
+
+    // Propagate information from this call site into the callee.
+    CallSite::arg_iterator CAI = CS.arg_begin();
+    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+         AI != E; ++AI, ++CAI) {
+      // If this argument is byval, and if the function is not readonly, there
+      // will be an implicit copy formed of the input aggregate.
+      if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
+        markOverdefined(AI);
+        continue;
+      }
+
+      if (StructType *STy = dyn_cast<StructType>(AI->getType())) {
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          LatticeVal CallArg = getStructValueState(*CAI, i);
+          mergeInValue(getStructValueState(AI, i), AI, CallArg);
+        }
+      } else {
+        mergeInValue(AI, getValueState(*CAI));
+      }
+    }
+  }
+
+  // If this is a single/zero retval case, see if we're tracking the function.
+  if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+    if (!MRVFunctionsTracked.count(F))
+      goto CallOverdefined;  // Not tracking this callee.
+
+    // If we are tracking this callee, propagate the result of the function
+    // into this call site.
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+      mergeInValue(getStructValueState(I, i), I,
+                   TrackedMultipleRetVals[std::make_pair(F, i)]);
+  } else {
+    DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
+    if (TFRVI == TrackedRetVals.end())
+      goto CallOverdefined;  // Not tracking this callee.
+
+    // If so, propagate the return value of the callee into this call result.
+    mergeInValue(I, TFRVI->second);
+  }
+}
+
+void SCCPSolver::Solve() {
+  // Process the work lists until they are empty!
+  while (!BBWorkList.empty() || !InstWorkList.empty() ||
+         !OverdefinedInstWorkList.empty()) {
+    // Process the overdefined instruction's work list first, which drives other
+    // things to overdefined more quickly.
+    while (!OverdefinedInstWorkList.empty()) {
+      Value *I = OverdefinedInstWorkList.pop_back_val();
+
+      DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
+
+      // "I" got into the work list because it either made the transition from
+      // bottom to constant, or to overdefined.
+      //
+      // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined
+      // Update all of the users of this instruction's value.
+      //
+      for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+           UI != E; ++UI)
+        if (Instruction *I = dyn_cast<Instruction>(*UI))
+          OperandChangedState(I);
+    }
+
+    // Process the instruction work list.
+    while (!InstWorkList.empty()) {
+      Value *I = InstWorkList.pop_back_val();
+
+      DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
+
+      // "I" got into the work list because it made the transition from undef to
+      // constant.
+      //
+      // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined.
+      // Update all of the users of this instruction's value.
+      //
+      if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
+        for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+             UI != E; ++UI)
+          if (Instruction *I = dyn_cast<Instruction>(*UI))
+            OperandChangedState(I);
+    }
+
+    // Process the basic block work list.
+    while (!BBWorkList.empty()) {
+      BasicBlock *BB = BBWorkList.back();
+      BBWorkList.pop_back();
+
+      DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+
+      // Notify all instructions in this basic block that they are newly
+      // executable.
+      visit(BB);
+    }
+  }
+}
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption.  After we solve dataflow, this
+/// method should be use to handle this.  If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking it one
+/// of the edges from the block as being feasible, even though the condition
+/// doesn't say it would otherwise be.  This allows SCCP to find the rest of the
+/// CFG and only slightly pessimizes the analysis results (by marking one,
+/// potentially infeasible, edge feasible).  This cannot usefully modify the
+/// constraints on the condition of the branch, as that would impact other users
+/// of the value.
+///
+/// This scan also checks for values that use undefs, whose results are actually
+/// defined.  For example, 'zext i8 undef to i32' should produce all zeros
+/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
+/// even if X isn't defined.
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (!BBExecutable.count(BB))
+      continue;
+
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // Look for instructions which produce undef values.
+      if (I->getType()->isVoidTy()) continue;
+
+      if (StructType *STy = dyn_cast<StructType>(I->getType())) {
+        // Only a few things that can be structs matter for undef.
+
+        // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
+        if (CallSite CS = CallSite(I))
+          if (Function *F = CS.getCalledFunction())
+            if (MRVFunctionsTracked.count(F))
+              continue;
+
+        // extractvalue and insertvalue don't need to be marked; they are
+        // tracked as precisely as their operands.
+        if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
+          continue;
+
+        // Send the results of everything else to overdefined.  We could be
+        // more precise than this but it isn't worth bothering.
+        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+          LatticeVal &LV = getStructValueState(I, i);
+          if (LV.isUndefined())
+            markOverdefined(LV, I);
+        }
+        continue;
+      }
+
+      LatticeVal &LV = getValueState(I);
+      if (!LV.isUndefined()) continue;
+
+      // extractvalue is safe; check here because the argument is a struct.
+      if (isa<ExtractValueInst>(I))
+        continue;
+
+      // Compute the operand LatticeVals, for convenience below.
+      // Anything taking a struct is conservatively assumed to require
+      // overdefined markings.
+      if (I->getOperand(0)->getType()->isStructTy()) {
+        markOverdefined(I);
+        return true;
+      }
+      LatticeVal Op0LV = getValueState(I->getOperand(0));
+      LatticeVal Op1LV;
+      if (I->getNumOperands() == 2) {
+        if (I->getOperand(1)->getType()->isStructTy()) {
+          markOverdefined(I);
+          return true;
+        }
+
+        Op1LV = getValueState(I->getOperand(1));
+      }
+      // If this is an instructions whose result is defined even if the input is
+      // not fully defined, propagate the information.
+      Type *ITy = I->getType();
+      switch (I->getOpcode()) {
+      case Instruction::Add:
+      case Instruction::Sub:
+      case Instruction::Trunc:
+      case Instruction::FPTrunc:
+      case Instruction::BitCast:
+        break; // Any undef -> undef
+      case Instruction::FSub:
+      case Instruction::FAdd:
+      case Instruction::FMul:
+      case Instruction::FDiv:
+      case Instruction::FRem:
+        // Floating-point binary operation: be conservative.
+        if (Op0LV.isUndefined() && Op1LV.isUndefined())
+          markForcedConstant(I, Constant::getNullValue(ITy));
+        else
+          markOverdefined(I);
+        return true;
+      case Instruction::ZExt:
+      case Instruction::SExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::FPExt:
+      case Instruction::PtrToInt:
+      case Instruction::IntToPtr:
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+        // undef -> 0; some outputs are impossible
+        markForcedConstant(I, Constant::getNullValue(ITy));
+        return true;
+      case Instruction::Mul:
+      case Instruction::And:
+        // Both operands undef -> undef
+        if (Op0LV.isUndefined() && Op1LV.isUndefined())
+          break;
+        // undef * X -> 0.   X could be zero.
+        // undef & X -> 0.   X could be zero.
+        markForcedConstant(I, Constant::getNullValue(ITy));
+        return true;
+
+      case Instruction::Or:
+        // Both operands undef -> undef
+        if (Op0LV.isUndefined() && Op1LV.isUndefined())
+          break;
+        // undef | X -> -1.   X could be -1.
+        markForcedConstant(I, Constant::getAllOnesValue(ITy));
+        return true;
+
+      case Instruction::Xor:
+        // undef ^ undef -> 0; strictly speaking, this is not strictly
+        // necessary, but we try to be nice to people who expect this
+        // behavior in simple cases
+        if (Op0LV.isUndefined() && Op1LV.isUndefined()) {
+          markForcedConstant(I, Constant::getNullValue(ITy));
+          return true;
+        }
+        // undef ^ X -> undef
+        break;
+
+      case Instruction::SDiv:
+      case Instruction::UDiv:
+      case Instruction::SRem:
+      case Instruction::URem:
+        // X / undef -> undef.  No change.
+        // X % undef -> undef.  No change.
+        if (Op1LV.isUndefined()) break;
+
+        // undef / X -> 0.   X could be maxint.
+        // undef % X -> 0.   X could be 1.
+        markForcedConstant(I, Constant::getNullValue(ITy));
+        return true;
+
+      case Instruction::AShr:
+        // X >>a undef -> undef.
+        if (Op1LV.isUndefined()) break;
+
+        // undef >>a X -> all ones
+        markForcedConstant(I, Constant::getAllOnesValue(ITy));
+        return true;
+      case Instruction::LShr:
+      case Instruction::Shl:
+        // X << undef -> undef.
+        // X >> undef -> undef.
+        if (Op1LV.isUndefined()) break;
+
+        // undef << X -> 0
+        // undef >> X -> 0
+        markForcedConstant(I, Constant::getNullValue(ITy));
+        return true;
+      case Instruction::Select:
+        Op1LV = getValueState(I->getOperand(1));
+        // undef ? X : Y  -> X or Y.  There could be commonality between X/Y.
+        if (Op0LV.isUndefined()) {
+          if (!Op1LV.isConstant())  // Pick the constant one if there is any.
+            Op1LV = getValueState(I->getOperand(2));
+        } else if (Op1LV.isUndefined()) {
+          // c ? undef : undef -> undef.  No change.
+          Op1LV = getValueState(I->getOperand(2));
+          if (Op1LV.isUndefined())
+            break;
+          // Otherwise, c ? undef : x -> x.
+        } else {
+          // Leave Op1LV as Operand(1)'s LatticeValue.
+        }
+
+        if (Op1LV.isConstant())
+          markForcedConstant(I, Op1LV.getConstant());
+        else
+          markOverdefined(I);
+        return true;
+      case Instruction::Load:
+        // A load here means one of two things: a load of undef from a global,
+        // a load from an unknown pointer.  Either way, having it return undef
+        // is okay.
+        break;
+      case Instruction::ICmp:
+        // X == undef -> undef.  Other comparisons get more complicated.
+        if (cast<ICmpInst>(I)->isEquality())
+          break;
+        markOverdefined(I);
+        return true;
+      case Instruction::Call:
+      case Instruction::Invoke: {
+        // There are two reasons a call can have an undef result
+        // 1. It could be tracked.
+        // 2. It could be constant-foldable.
+        // Because of the way we solve return values, tracked calls must
+        // never be marked overdefined in ResolvedUndefsIn.
+        if (Function *F = CallSite(I).getCalledFunction())
+          if (TrackedRetVals.count(F))
+            break;
+
+        // If the call is constant-foldable, we mark it overdefined because
+        // we do not know what return values are valid.
+        markOverdefined(I);
+        return true;
+      }
+      default:
+        // If we don't know what should happen here, conservatively mark it
+        // overdefined.
+        markOverdefined(I);
+        return true;
+      }
+    }
+
+    // Check to see if we have a branch or switch on an undefined value.  If so
+    // we force the branch to go one way or the other to make the successor
+    // values live.  It doesn't really matter which way we force it.
+    TerminatorInst *TI = BB->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional()) continue;
+      if (!getValueState(BI->getCondition()).isUndefined())
+        continue;
+
+      // If the input to SCCP is actually branch on undef, fix the undef to
+      // false.
+      if (isa<UndefValue>(BI->getCondition())) {
+        BI->setCondition(ConstantInt::getFalse(BI->getContext()));
+        markEdgeExecutable(BB, TI->getSuccessor(1));
+        return true;
+      }
+
+      // Otherwise, it is a branch on a symbolic value which is currently
+      // considered to be undef.  Handle this by forcing the input value to the
+      // branch to false.
+      markForcedConstant(BI->getCondition(),
+                         ConstantInt::getFalse(TI->getContext()));
+      return true;
+    }
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      if (!SI->getNumCases())
+        continue;
+      if (!getValueState(SI->getCondition()).isUndefined())
+        continue;
+
+      // If the input to SCCP is actually switch on undef, fix the undef to
+      // the first constant.
+      if (isa<UndefValue>(SI->getCondition())) {
+        SI->setCondition(SI->case_begin().getCaseValue());
+        markEdgeExecutable(BB, SI->case_begin().getCaseSuccessor());
+        return true;
+      }
+
+      markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue());
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// SCCP Class - This class uses the SCCPSolver to implement a per-function
+  /// Sparse Conditional Constant Propagator.
+  ///
+  struct SCCP : public FunctionPass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetLibraryInfo>();
+    }
+    static char ID; // Pass identification, replacement for typeid
+    SCCP() : FunctionPass(ID) {
+      initializeSCCPPass(*PassRegistry::getPassRegistry());
+    }
+
+    // runOnFunction - Run the Sparse Conditional Constant Propagation
+    // algorithm, and return true if the function was modified.
+    //
+    bool runOnFunction(Function &F);
+  };
+} // end anonymous namespace
+
+char SCCP::ID = 0;
+INITIALIZE_PASS(SCCP, "sccp",
+                "Sparse Conditional Constant Propagation", false, false)
+
+// createSCCPPass - This is the public interface to this file.
+FunctionPass *llvm::createSCCPPass() {
+  return new SCCP();
+}
+
+static void DeleteInstructionInBlock(BasicBlock *BB) {
+  DEBUG(dbgs() << "  BasicBlock Dead:" << *BB);
+  ++NumDeadBlocks;
+
+  // Check to see if there are non-terminating instructions to delete.
+  if (isa<TerminatorInst>(BB->begin()))
+    return;
+
+  // Delete the instructions backwards, as it has a reduced likelihood of having
+  // to update as many def-use and use-def chains.
+  Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+  while (EndInst != BB->begin()) {
+    // Delete the next to last instruction.
+    BasicBlock::iterator I = EndInst;
+    Instruction *Inst = --I;
+    if (!Inst->use_empty())
+      Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+    if (isa<LandingPadInst>(Inst)) {
+      EndInst = Inst;
+      continue;
+    }
+    BB->getInstList().erase(Inst);
+    ++NumInstRemoved;
+  }
+}
+
+// runOnFunction() - Run the Sparse Conditional Constant Propagation algorithm,
+// and return true if the function was modified.
+//
+bool SCCP::runOnFunction(Function &F) {
+  DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  SCCPSolver Solver(TD, TLI);
+
+  // Mark the first block of the function as being executable.
+  Solver.MarkBlockExecutable(F.begin());
+
+  // Mark all arguments to the function as being overdefined.
+  for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI)
+    Solver.markAnythingOverdefined(AI);
+
+  // Solve for constants.
+  bool ResolvedUndefs = true;
+  while (ResolvedUndefs) {
+    Solver.Solve();
+    DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+    ResolvedUndefs = Solver.ResolvedUndefsIn(F);
+  }
+
+  bool MadeChanges = false;
+
+  // If we decided that there are basic blocks that are dead in this function,
+  // delete their contents now.  Note that we cannot actually delete the blocks,
+  // as we cannot modify the CFG of the function.
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (!Solver.isBlockExecutable(BB)) {
+      DeleteInstructionInBlock(BB);
+      MadeChanges = true;
+      continue;
+    }
+
+    // Iterate over all of the instructions in a function, replacing them with
+    // constants if we have found them to be of constant values.
+    //
+    for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+      Instruction *Inst = BI++;
+      if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst))
+        continue;
+
+      // TODO: Reconstruct structs from their elements.
+      if (Inst->getType()->isStructTy())
+        continue;
+
+      LatticeVal IV = Solver.getLatticeValueFor(Inst);
+      if (IV.isOverdefined())
+        continue;
+
+      Constant *Const = IV.isConstant()
+        ? IV.getConstant() : UndefValue::get(Inst->getType());
+      DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst);
+
+      // Replaces all of the uses of a variable with uses of the constant.
+      Inst->replaceAllUsesWith(Const);
+
+      // Delete the instruction.
+      Inst->eraseFromParent();
+
+      // Hey, we just changed something!
+      MadeChanges = true;
+      ++NumInstRemoved;
+    }
+  }
+
+  return MadeChanges;
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// IPSCCP Class - This class implements interprocedural Sparse Conditional
+  /// Constant Propagation.
+  ///
+  struct IPSCCP : public ModulePass {
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetLibraryInfo>();
+    }
+    static char ID;
+    IPSCCP() : ModulePass(ID) {
+      initializeIPSCCPPass(*PassRegistry::getPassRegistry());
+    }
+    bool runOnModule(Module &M);
+  };
+} // end anonymous namespace
+
+char IPSCCP::ID = 0;
+INITIALIZE_PASS_BEGIN(IPSCCP, "ipsccp",
+                "Interprocedural Sparse Conditional Constant Propagation",
+                false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(IPSCCP, "ipsccp",
+                "Interprocedural Sparse Conditional Constant Propagation",
+                false, false)
+
+// createIPSCCPPass - This is the public interface to this file.
+ModulePass *llvm::createIPSCCPPass() {
+  return new IPSCCP();
+}
+
+
+static bool AddressIsTaken(const GlobalValue *GV) {
+  // Delete any dead constantexpr klingons.
+  GV->removeDeadConstantUsers();
+
+  for (Value::const_use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E; ++UI) {
+    const User *U = *UI;
+    if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      if (SI->getOperand(0) == GV || SI->isVolatile())
+        return true;  // Storing addr of GV.
+    } else if (isa<InvokeInst>(U) || isa<CallInst>(U)) {
+      // Make sure we are calling the function, not passing the address.
+      ImmutableCallSite CS(cast<Instruction>(U));
+      if (!CS.isCallee(UI))
+        return true;
+    } else if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (LI->isVolatile())
+        return true;
+    } else if (isa<BlockAddress>(U)) {
+      // blockaddress doesn't take the address of the function, it takes addr
+      // of label.
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool IPSCCP::runOnModule(Module &M) {
+  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  SCCPSolver Solver(TD, TLI);
+
+  // AddressTakenFunctions - This set keeps track of the address-taken functions
+  // that are in the input.  As IPSCCP runs through and simplifies code,
+  // functions that were address taken can end up losing their
+  // address-taken-ness.  Because of this, we keep track of their addresses from
+  // the first pass so we can use them for the later simplification pass.
+  SmallPtrSet<Function*, 32> AddressTakenFunctions;
+
+  // Loop over all functions, marking arguments to those with their addresses
+  // taken or that are external as overdefined.
+  //
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->isDeclaration())
+      continue;
+
+    // If this is a strong or ODR definition of this function, then we can
+    // propagate information about its result into callsites of it.
+    if (!F->mayBeOverridden())
+      Solver.AddTrackedFunction(F);
+
+    // If this function only has direct calls that we can see, we can track its
+    // arguments and return value aggressively, and can assume it is not called
+    // unless we see evidence to the contrary.
+    if (F->hasLocalLinkage()) {
+      if (AddressIsTaken(F))
+        AddressTakenFunctions.insert(F);
+      else {
+        Solver.AddArgumentTrackedFunction(F);
+        continue;
+      }
+    }
+
+    // Assume the function is called.
+    Solver.MarkBlockExecutable(F->begin());
+
+    // Assume nothing about the incoming arguments.
+    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+         AI != E; ++AI)
+      Solver.markAnythingOverdefined(AI);
+  }
+
+  // Loop over global variables.  We inform the solver about any internal global
+  // variables that do not have their 'addresses taken'.  If they don't have
+  // their addresses taken, we can propagate constants through them.
+  for (Module::global_iterator G = M.global_begin(), E = M.global_end();
+       G != E; ++G)
+    if (!G->isConstant() && G->hasLocalLinkage() && !AddressIsTaken(G))
+      Solver.TrackValueOfGlobalVariable(G);
+
+  // Solve for constants.
+  bool ResolvedUndefs = true;
+  while (ResolvedUndefs) {
+    Solver.Solve();
+
+    DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+    ResolvedUndefs = false;
+    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+      ResolvedUndefs |= Solver.ResolvedUndefsIn(*F);
+  }
+
+  bool MadeChanges = false;
+
+  // Iterate over all of the instructions in the module, replacing them with
+  // constants if we have found them to be of constant values.
+  //
+  SmallVector<BasicBlock*, 512> BlocksToErase;
+
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (Solver.isBlockExecutable(F->begin())) {
+      for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+           AI != E; ++AI) {
+        if (AI->use_empty() || AI->getType()->isStructTy()) continue;
+
+        // TODO: Could use getStructLatticeValueFor to find out if the entire
+        // result is a constant and replace it entirely if so.
+
+        LatticeVal IV = Solver.getLatticeValueFor(AI);
+        if (IV.isOverdefined()) continue;
+
+        Constant *CST = IV.isConstant() ?
+        IV.getConstant() : UndefValue::get(AI->getType());
+        DEBUG(dbgs() << "***  Arg " << *AI << " = " << *CST <<"\n");
+
+        // Replaces all of the uses of a variable with uses of the
+        // constant.
+        AI->replaceAllUsesWith(CST);
+        ++IPNumArgsElimed;
+      }
+    }
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+      if (!Solver.isBlockExecutable(BB)) {
+        DeleteInstructionInBlock(BB);
+        MadeChanges = true;
+
+        TerminatorInst *TI = BB->getTerminator();
+        for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+          BasicBlock *Succ = TI->getSuccessor(i);
+          if (!Succ->empty() && isa<PHINode>(Succ->begin()))
+            TI->getSuccessor(i)->removePredecessor(BB);
+        }
+        if (!TI->use_empty())
+          TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+        TI->eraseFromParent();
+
+        if (&*BB != &F->front())
+          BlocksToErase.push_back(BB);
+        else
+          new UnreachableInst(M.getContext(), BB);
+        continue;
+      }
+
+      for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+        Instruction *Inst = BI++;
+        if (Inst->getType()->isVoidTy() || Inst->getType()->isStructTy())
+          continue;
+
+        // TODO: Could use getStructLatticeValueFor to find out if the entire
+        // result is a constant and replace it entirely if so.
+
+        LatticeVal IV = Solver.getLatticeValueFor(Inst);
+        if (IV.isOverdefined())
+          continue;
+
+        Constant *Const = IV.isConstant()
+          ? IV.getConstant() : UndefValue::get(Inst->getType());
+        DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst);
+
+        // Replaces all of the uses of a variable with uses of the
+        // constant.
+        Inst->replaceAllUsesWith(Const);
+
+        // Delete the instruction.
+        if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
+          Inst->eraseFromParent();
+
+        // Hey, we just changed something!
+        MadeChanges = true;
+        ++IPNumInstRemoved;
+      }
+    }
+
+    // Now that all instructions in the function are constant folded, erase dead
+    // blocks, because we can now use ConstantFoldTerminator to get rid of
+    // in-edges.
+    for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
+      // If there are any PHI nodes in this successor, drop entries for BB now.
+      BasicBlock *DeadBB = BlocksToErase[i];
+      for (Value::use_iterator UI = DeadBB->use_begin(), UE = DeadBB->use_end();
+           UI != UE; ) {
+        // Grab the user and then increment the iterator early, as the user
+        // will be deleted. Step past all adjacent uses from the same user.
+        Instruction *I = dyn_cast<Instruction>(*UI);
+        do { ++UI; } while (UI != UE && *UI == I);
+
+        // Ignore blockaddress users; BasicBlock's dtor will handle them.
+        if (!I) continue;
+
+        bool Folded = ConstantFoldTerminator(I->getParent());
+        if (!Folded) {
+          // The constant folder may not have been able to fold the terminator
+          // if this is a branch or switch on undef.  Fold it manually as a
+          // branch to the first successor.
+#ifndef NDEBUG
+          if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+            assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
+                   "Branch should be foldable!");
+          } else if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+            assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+          } else {
+            llvm_unreachable("Didn't fold away reference to block!");
+          }
+#endif
+
+          // Make this an uncond branch to the first successor.
+          TerminatorInst *TI = I->getParent()->getTerminator();
+          BranchInst::Create(TI->getSuccessor(0), TI);
+
+          // Remove entries in successor phi nodes to remove edges.
+          for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
+            TI->getSuccessor(i)->removePredecessor(TI->getParent());
+
+          // Remove the old terminator.
+          TI->eraseFromParent();
+        }
+      }
+
+      // Finally, delete the basic block.
+      F->getBasicBlockList().erase(DeadBB);
+    }
+    BlocksToErase.clear();
+  }
+
+  // If we inferred constant or undef return values for a function, we replaced
+  // all call uses with the inferred value.  This means we don't need to bother
+  // actually returning anything from the function.  Replace all return
+  // instructions with return undef.
+  //
+  // Do this in two stages: first identify the functions we should process, then
+  // actually zap their returns.  This is important because we can only do this
+  // if the address of the function isn't taken.  In cases where a return is the
+  // last use of a function, the order of processing functions would affect
+  // whether other functions are optimizable.
+  SmallVector<ReturnInst*, 8> ReturnsToZap;
+
+  // TODO: Process multiple value ret instructions also.
+  const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
+  for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(),
+       E = RV.end(); I != E; ++I) {
+    Function *F = I->first;
+    if (I->second.isOverdefined() || F->getReturnType()->isVoidTy())
+      continue;
+
+    // We can only do this if we know that nothing else can call the function.
+    if (!F->hasLocalLinkage() || AddressTakenFunctions.count(F))
+      continue;
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+        if (!isa<UndefValue>(RI->getOperand(0)))
+          ReturnsToZap.push_back(RI);
+  }
+
+  // Zap all returns which we've identified as zap to change.
+  for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
+    Function *F = ReturnsToZap[i]->getParent()->getParent();
+    ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
+  }
+
+  // If we inferred constant or undef values for globals variables, we can
+  // delete the global and any stores that remain to it.
+  const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
+  for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
+         E = TG.end(); I != E; ++I) {
+    GlobalVariable *GV = I->first;
+    assert(!I->second.isOverdefined() &&
+           "Overdefined values should have been taken out of the map!");
+    DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
+    while (!GV->use_empty()) {
+      StoreInst *SI = cast<StoreInst>(GV->use_back());
+      SI->eraseFromParent();
+    }
+    M.getGlobalList().erase(GV);
+    ++IPNumGlobalConst;
+  }
+
+  return MadeChanges;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
new file mode 100644
index 000000000000..f6bb365216ff
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -0,0 +1,3765 @@
+//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation implements the well known scalar replacement of
+/// aggregates transformation. It tries to identify promotable elements of an
+/// aggregate alloca, and promote them to registers. It will also try to
+/// convert uses of an element (or set of elements) of an alloca into a vector
+/// or bitfield-style integer scalar if appropriate.
+///
+/// It works to do this with minimal slicing of the alloca so that regions
+/// which are merely transferred in and out of external memory remain unchanged
+/// and are not decomposed to scalar code.
+///
+/// Because this also performs alloca promotion, it can be thought of as also
+/// serving the purpose of SSA formation. The algorithm iterates on the
+/// function until all opportunities for promotion have been realized.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sroa"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found");
+STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
+
+/// Hidden option to force the pass to not use DomTree and mem2reg, instead
+/// forming SSA values through the SSAUpdater infrastructure.
+static cl::opt<bool>
+ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
+
+namespace {
+/// \brief A custom IRBuilder inserter which prefixes all names if they are
+/// preserved.
+template <bool preserveNames = true>
+class IRBuilderPrefixedInserter :
+    public IRBuilderDefaultInserter<preserveNames> {
+  std::string Prefix;
+
+public:
+  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
+
+protected:
+  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+                    BasicBlock::iterator InsertPt) const {
+    IRBuilderDefaultInserter<preserveNames>::InsertHelper(
+        I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt);
+  }
+};
+
+// Specialization for not preserving the name is trivial.
+template <>
+class IRBuilderPrefixedInserter<false> :
+    public IRBuilderDefaultInserter<false> {
+public:
+  void SetNamePrefix(const Twine &P) {}
+};
+
+/// \brief Provide a typedef for IRBuilder that drops names in release builds.
+#ifndef NDEBUG
+typedef llvm::IRBuilder<true, ConstantFolder,
+                        IRBuilderPrefixedInserter<true> > IRBuilderTy;
+#else
+typedef llvm::IRBuilder<false, ConstantFolder,
+                        IRBuilderPrefixedInserter<false> > IRBuilderTy;
+#endif
+}
+
+namespace {
+/// \brief A common base class for representing a half-open byte range.
+struct ByteRange {
+  /// \brief The beginning offset of the range.
+  uint64_t BeginOffset;
+
+  /// \brief The ending offset, not included in the range.
+  uint64_t EndOffset;
+
+  ByteRange() : BeginOffset(), EndOffset() {}
+  ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
+      : BeginOffset(BeginOffset), EndOffset(EndOffset) {}
+
+  /// \brief Support for ordering ranges.
+  ///
+  /// This provides an ordering over ranges such that start offsets are
+  /// always increasing, and within equal start offsets, the end offsets are
+  /// decreasing. Thus the spanning range comes first in a cluster with the
+  /// same start position.
+  bool operator<(const ByteRange &RHS) const {
+    if (BeginOffset < RHS.BeginOffset) return true;
+    if (BeginOffset > RHS.BeginOffset) return false;
+    if (EndOffset > RHS.EndOffset) return true;
+    return false;
+  }
+
+  /// \brief Support comparison with a single offset to allow binary searches.
+  friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) {
+    return LHS.BeginOffset < RHSOffset;
+  }
+
+  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+                                              const ByteRange &RHS) {
+    return LHSOffset < RHS.BeginOffset;
+  }
+
+  bool operator==(const ByteRange &RHS) const {
+    return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
+  }
+  bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
+};
+
+/// \brief A partition of an alloca.
+///
+/// This structure represents a contiguous partition of the alloca. These are
+/// formed by examining the uses of the alloca. During formation, they may
+/// overlap but once an AllocaPartitioning is built, the Partitions within it
+/// are all disjoint.
+struct Partition : public ByteRange {
+  /// \brief Whether this partition is splittable into smaller partitions.
+  ///
+  /// We flag partitions as splittable when they are formed entirely due to
+  /// accesses by trivially splittable operations such as memset and memcpy.
+  bool IsSplittable;
+
+  /// \brief Test whether a partition has been marked as dead.
+  bool isDead() const {
+    if (BeginOffset == UINT64_MAX) {
+      assert(EndOffset == UINT64_MAX);
+      return true;
+    }
+    return false;
+  }
+
+  /// \brief Kill a partition.
+  /// This is accomplished by setting both its beginning and end offset to
+  /// the maximum possible value.
+  void kill() {
+    assert(!isDead() && "He's Dead, Jim!");
+    BeginOffset = EndOffset = UINT64_MAX;
+  }
+
+  Partition() : ByteRange(), IsSplittable() {}
+  Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
+      : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
+};
+
+/// \brief A particular use of a partition of the alloca.
+///
+/// This structure is used to associate uses of a partition with it. They
+/// mark the range of bytes which are referenced by a particular instruction,
+/// and includes a handle to the user itself and the pointer value in use.
+/// The bounds of these uses are determined by intersecting the bounds of the
+/// memory use itself with a particular partition. As a consequence there is
+/// intentionally overlap between various uses of the same partition.
+class PartitionUse : public ByteRange {
+  /// \brief Combined storage for both the Use* and split state.
+  PointerIntPair<Use*, 1, bool> UsePtrAndIsSplit;
+
+public:
+  PartitionUse() : ByteRange(), UsePtrAndIsSplit() {}
+  PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U,
+               bool IsSplit)
+      : ByteRange(BeginOffset, EndOffset), UsePtrAndIsSplit(U, IsSplit) {}
+
+  /// \brief The use in question. Provides access to both user and used value.
+  ///
+  /// Note that this may be null if the partition use is *dead*, that is, it
+  /// should be ignored.
+  Use *getUse() const { return UsePtrAndIsSplit.getPointer(); }
+
+  /// \brief Set the use for this partition use range.
+  void setUse(Use *U) { UsePtrAndIsSplit.setPointer(U); }
+
+  /// \brief Whether this use is split across multiple partitions.
+  bool isSplit() const { return UsePtrAndIsSplit.getInt(); }
+};
+}
+
+namespace llvm {
+template <> struct isPodLike<Partition> : llvm::true_type {};
+template <> struct isPodLike<PartitionUse> : llvm::true_type {};
+}
+
+namespace {
+/// \brief Alloca partitioning representation.
+///
+/// This class represents a partitioning of an alloca into slices, and
+/// information about the nature of uses of each slice of the alloca. The goal
+/// is that this information is sufficient to decide if and how to split the
+/// alloca apart and replace slices with scalars. It is also intended that this
+/// structure can capture the relevant information needed both to decide about
+/// and to enact these transformations.
+class AllocaPartitioning {
+public:
+  /// \brief Construct a partitioning of a particular alloca.
+  ///
+  /// Construction does most of the work for partitioning the alloca. This
+  /// performs the necessary walks of users and builds a partitioning from it.
+  AllocaPartitioning(const DataLayout &TD, AllocaInst &AI);
+
+  /// \brief Test whether a pointer to the allocation escapes our analysis.
+  ///
+  /// If this is true, the partitioning is never fully built and should be
+  /// ignored.
+  bool isEscaped() const { return PointerEscapingInstr; }
+
+  /// \brief Support for iterating over the partitions.
+  /// @{
+  typedef SmallVectorImpl<Partition>::iterator iterator;
+  iterator begin() { return Partitions.begin(); }
+  iterator end() { return Partitions.end(); }
+
+  typedef SmallVectorImpl<Partition>::const_iterator const_iterator;
+  const_iterator begin() const { return Partitions.begin(); }
+  const_iterator end() const { return Partitions.end(); }
+  /// @}
+
+  /// \brief Support for iterating over and manipulating a particular
+  /// partition's uses.
+  ///
+  /// The iteration support provided for uses is more limited, but also
+  /// includes some manipulation routines to support rewriting the uses of
+  /// partitions during SROA.
+  /// @{
+  typedef SmallVectorImpl<PartitionUse>::iterator use_iterator;
+  use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); }
+  use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); }
+  use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); }
+  use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); }
+
+  typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator;
+  const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); }
+  const_use_iterator use_begin(const_iterator I) const {
+    return Uses[I - begin()].begin();
+  }
+  const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); }
+  const_use_iterator use_end(const_iterator I) const {
+    return Uses[I - begin()].end();
+  }
+
+  unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); }
+  unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); }
+  const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const {
+    return Uses[PIdx][UIdx];
+  }
+  const PartitionUse &getUse(const_iterator I, unsigned UIdx) const {
+    return Uses[I - begin()][UIdx];
+  }
+
+  void use_push_back(unsigned Idx, const PartitionUse &PU) {
+    Uses[Idx].push_back(PU);
+  }
+  void use_push_back(const_iterator I, const PartitionUse &PU) {
+    Uses[I - begin()].push_back(PU);
+  }
+  /// @}
+
+  /// \brief Allow iterating the dead users for this alloca.
+  ///
+  /// These are instructions which will never actually use the alloca as they
+  /// are outside the allocated range. They are safe to replace with undef and
+  /// delete.
+  /// @{
+  typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
+  dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
+  dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
+  /// @}
+
+  /// \brief Allow iterating the dead expressions referring to this alloca.
+  ///
+  /// These are operands which have cannot actually be used to refer to the
+  /// alloca as they are outside its range and the user doesn't correct for
+  /// that. These mostly consist of PHI node inputs and the like which we just
+  /// need to replace with undef.
+  /// @{
+  typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
+  dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
+  dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
+  /// @}
+
+  /// \brief MemTransferInst auxiliary data.
+  /// This struct provides some auxiliary data about memory transfer
+  /// intrinsics such as memcpy and memmove. These intrinsics can use two
+  /// different ranges within the same alloca, and provide other challenges to
+  /// correctly represent. We stash extra data to help us untangle this
+  /// after the partitioning is complete.
+  struct MemTransferOffsets {
+    /// The destination begin and end offsets when the destination is within
+    /// this alloca. If the end offset is zero the destination is not within
+    /// this alloca.
+    uint64_t DestBegin, DestEnd;
+
+    /// The source begin and end offsets when the source is within this alloca.
+    /// If the end offset is zero, the source is not within this alloca.
+    uint64_t SourceBegin, SourceEnd;
+
+    /// Flag for whether an alloca is splittable.
+    bool IsSplittable;
+  };
+  MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
+    return MemTransferInstData.lookup(&II);
+  }
+
+  /// \brief Map from a PHI or select operand back to a partition.
+  ///
+  /// When manipulating PHI nodes or selects, they can use more than one
+  /// partition of an alloca. We store a special mapping to allow finding the
+  /// partition referenced by each of these operands, if any.
+  iterator findPartitionForPHIOrSelectOperand(Use *U) {
+    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
+      = PHIOrSelectOpMap.find(U);
+    if (MapIt == PHIOrSelectOpMap.end())
+      return end();
+
+    return begin() + MapIt->second.first;
+  }
+
+  /// \brief Map from a PHI or select operand back to the specific use of
+  /// a partition.
+  ///
+  /// Similar to mapping these operands back to the partitions, this maps
+  /// directly to the use structure of that partition.
+  use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) {
+    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
+      = PHIOrSelectOpMap.find(U);
+    assert(MapIt != PHIOrSelectOpMap.end());
+    return Uses[MapIt->second.first].begin() + MapIt->second.second;
+  }
+
+  /// \brief Compute a common type among the uses of a particular partition.
+  ///
+  /// This routines walks all of the uses of a particular partition and tries
+  /// to find a common type between them. Untyped operations such as memset and
+  /// memcpy are ignored.
+  Type *getCommonType(iterator I) const;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
+  void printUsers(raw_ostream &OS, const_iterator I,
+                  StringRef Indent = "  ") const;
+  void print(raw_ostream &OS) const;
+  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
+  void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
+#endif
+
+private:
+  template <typename DerivedT, typename RetT = void> class BuilderBase;
+  class PartitionBuilder;
+  friend class AllocaPartitioning::PartitionBuilder;
+  class UseBuilder;
+  friend class AllocaPartitioning::UseBuilder;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// \brief Handle to alloca instruction to simplify method interfaces.
+  AllocaInst &AI;
+#endif
+
+  /// \brief The instruction responsible for this alloca having no partitioning.
+  ///
+  /// When an instruction (potentially) escapes the pointer to the alloca, we
+  /// store a pointer to that here and abort trying to partition the alloca.
+  /// This will be null if the alloca is partitioned successfully.
+  Instruction *PointerEscapingInstr;
+
+  /// \brief The partitions of the alloca.
+  ///
+  /// We store a vector of the partitions over the alloca here. This vector is
+  /// sorted by increasing begin offset, and then by decreasing end offset. See
+  /// the Partition inner class for more details. Initially (during
+  /// construction) there are overlaps, but we form a disjoint sequence of
+  /// partitions while finishing construction and a fully constructed object is
+  /// expected to always have this as a disjoint space.
+  SmallVector<Partition, 8> Partitions;
+
+  /// \brief The uses of the partitions.
+  ///
+  /// This is essentially a mapping from each partition to a list of uses of
+  /// that partition. The mapping is done with a Uses vector that has the exact
+  /// same number of entries as the partition vector. Each entry is itself
+  /// a vector of the uses.
+  SmallVector<SmallVector<PartitionUse, 2>, 8> Uses;
+
+  /// \brief Instructions which will become dead if we rewrite the alloca.
+  ///
+  /// Note that these are not separated by partition. This is because we expect
+  /// a partitioned alloca to be completely rewritten or not rewritten at all.
+  /// If rewritten, all these instructions can simply be removed and replaced
+  /// with undef as they come from outside of the allocated space.
+  SmallVector<Instruction *, 8> DeadUsers;
+
+  /// \brief Operands which will become dead if we rewrite the alloca.
+  ///
+  /// These are operands that in their particular use can be replaced with
+  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+  /// to PHI nodes and the like. They aren't entirely dead (there might be
+  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
+  /// want to swap this particular input for undef to simplify the use lists of
+  /// the alloca.
+  SmallVector<Use *, 8> DeadOperands;
+
+  /// \brief The underlying storage for auxiliary memcpy and memset info.
+  SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData;
+
+  /// \brief A side datastructure used when building up the partitions and uses.
+  ///
+  /// This mapping is only really used during the initial building of the
+  /// partitioning so that we can retain information about PHI and select nodes
+  /// processed.
+  SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes;
+
+  /// \brief Auxiliary information for particular PHI or select operands.
+  SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap;
+
+  /// \brief A utility routine called from the constructor.
+  ///
+  /// This does what it says on the tin. It is the key of the alloca partition
+  /// splitting and merging. After it is called we have the desired disjoint
+  /// collection of partitions.
+  void splitAndMergePartitions();
+};
+}
+
+static Value *foldSelectInst(SelectInst &SI) {
+  // If the condition being selected on is a constant or the same value is
+  // being selected between, fold the select. Yes this does (rarely) happen
+  // early on.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+    return SI.getOperand(1+CI->isZero());
+  if (SI.getOperand(1) == SI.getOperand(2))
+    return SI.getOperand(1);
+
+  return 0;
+}
+
+/// \brief Builder for the alloca partitioning.
+///
+/// This class builds an alloca partitioning by recursively visiting the uses
+/// of an alloca and splitting the partitions for each load and store at each
+/// offset.
+class AllocaPartitioning::PartitionBuilder
+    : public PtrUseVisitor<PartitionBuilder> {
+  friend class PtrUseVisitor<PartitionBuilder>;
+  friend class InstVisitor<PartitionBuilder>;
+  typedef PtrUseVisitor<PartitionBuilder> Base;
+
+  const uint64_t AllocSize;
+  AllocaPartitioning &P;
+
+  SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;
+
+public:
+  PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P)
+      : PtrUseVisitor<PartitionBuilder>(DL),
+        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())),
+        P(P) {}
+
+private:
+  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
+                 bool IsSplittable = false) {
+    // Completely skip uses which have a zero size or start either before or
+    // past the end of the allocation.
+    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) {
+      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
+                   << " which has zero size or starts outside of the "
+                   << AllocSize << " byte alloca:\n"
+                   << "    alloca: " << P.AI << "\n"
+                   << "       use: " << I << "\n");
+      return;
+    }
+
+    uint64_t BeginOffset = Offset.getZExtValue();
+    uint64_t EndOffset = BeginOffset + Size;
+
+    // Clamp the end offset to the end of the allocation. Note that this is
+    // formulated to handle even the case where "BeginOffset + Size" overflows.
+    // This may appear superficially to be something we could ignore entirely,
+    // but that is not so! There may be widened loads or PHI-node uses where
+    // some instructions are dead but not others. We can't completely ignore
+    // them, and so have to record at least the information here.
+    assert(AllocSize >= BeginOffset); // Established above.
+    if (Size > AllocSize - BeginOffset) {
+      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
+                   << " to remain within the " << AllocSize << " byte alloca:\n"
+                   << "    alloca: " << P.AI << "\n"
+                   << "       use: " << I << "\n");
+      EndOffset = AllocSize;
+    }
+
+    Partition New(BeginOffset, EndOffset, IsSplittable);
+    P.Partitions.push_back(New);
+  }
+
+  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
+                         uint64_t Size, bool IsVolatile) {
+    // We allow splitting of loads and stores where the type is an integer type
+    // and cover the entire alloca. This prevents us from splitting over
+    // eagerly.
+    // FIXME: In the great blue eventually, we should eagerly split all integer
+    // loads and stores, and then have a separate step that merges adjacent
+    // alloca partitions into a single partition suitable for integer widening.
+    // Or we should skip the merge step and rely on GVN and other passes to
+    // merge adjacent loads and stores that survive mem2reg.
+    bool IsSplittable =
+        Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize;
+
+    insertUse(I, Offset, Size, IsSplittable);
+  }
+
+  void visitLoadInst(LoadInst &LI) {
+    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+           "All simple FCA loads should have been pre-split");
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&LI);
+
+    uint64_t Size = DL.getTypeStoreSize(LI.getType());
+    return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
+  }
+
+  void visitStoreInst(StoreInst &SI) {
+    Value *ValOp = SI.getValueOperand();
+    if (ValOp == *U)
+      return PI.setEscapedAndAborted(&SI);
+    if (!IsOffsetKnown)
+      return PI.setAborted(&SI);
+
+    uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
+
+    // If this memory access can be shown to *statically* extend outside the
+    // bounds of of the allocation, it's behavior is undefined, so simply
+    // ignore it. Note that this is more strict than the generic clamping
+    // behavior of insertUse. We also try to handle cases which might run the
+    // risk of overflow.
+    // FIXME: We should instead consider the pointer to have escaped if this
+    // function is being instrumented for addressing bugs or race conditions.
+    if (Offset.isNegative() || Size > AllocSize ||
+        Offset.ugt(AllocSize - Size)) {
+      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
+                   << " which extends past the end of the " << AllocSize
+                   << " byte alloca:\n"
+                   << "    alloca: " << P.AI << "\n"
+                   << "       use: " << SI << "\n");
+      return;
+    }
+
+    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+           "All simple FCA stores should have been pre-split");
+    handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
+  }
+
+
+  void visitMemSetInst(MemSetInst &II) {
+    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
+    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+      // Zero-length mem transfer intrinsics can be ignored entirely.
+      return;
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&II);
+
+    insertUse(II, Offset,
+              Length ? Length->getLimitedValue()
+                     : AllocSize - Offset.getLimitedValue(),
+              (bool)Length);
+  }
+
+  void visitMemTransferInst(MemTransferInst &II) {
+    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+      // Zero-length mem transfer intrinsics can be ignored entirely.
+      return;
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&II);
+
+    uint64_t RawOffset = Offset.getLimitedValue();
+    uint64_t Size = Length ? Length->getLimitedValue()
+                           : AllocSize - RawOffset;
+
+    MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
+
+    // Only intrinsics with a constant length can be split.
+    Offsets.IsSplittable = Length;
+
+    if (*U == II.getRawDest()) {
+      Offsets.DestBegin = RawOffset;
+      Offsets.DestEnd = RawOffset + Size;
+    }
+    if (*U == II.getRawSource()) {
+      Offsets.SourceBegin = RawOffset;
+      Offsets.SourceEnd = RawOffset + Size;
+    }
+
+    // If we have set up end offsets for both the source and the destination,
+    // we have found both sides of this transfer pointing at the same alloca.
+    bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd;
+    if (SeenBothEnds && II.getRawDest() != II.getRawSource()) {
+      unsigned PrevIdx = MemTransferPartitionMap[&II];
+
+      // Check if the begin offsets match and this is a non-volatile transfer.
+      // In that case, we can completely elide the transfer.
+      if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) {
+        P.Partitions[PrevIdx].kill();
+        return;
+      }
+
+      // Otherwise we have an offset transfer within the same alloca. We can't
+      // split those.
+      P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false;
+    } else if (SeenBothEnds) {
+      // Handle the case where this exact use provides both ends of the
+      // operation.
+      assert(II.getRawDest() == II.getRawSource());
+
+      // For non-volatile transfers this is a no-op.
+      if (!II.isVolatile())
+        return;
+
+      // Otherwise just suppress splitting.
+      Offsets.IsSplittable = false;
+    }
+
+
+    // Insert the use now that we've fixed up the splittable nature.
+    insertUse(II, Offset, Size, Offsets.IsSplittable);
+
+    // Setup the mapping from intrinsic to partition of we've not seen both
+    // ends of this transfer.
+    if (!SeenBothEnds) {
+      unsigned NewIdx = P.Partitions.size() - 1;
+      bool Inserted
+        = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second;
+      assert(Inserted &&
+             "Already have intrinsic in map but haven't seen both ends");
+      (void)Inserted;
+    }
+  }
+
+  // Disable SRoA for any intrinsics except for lifetime invariants.
+  // FIXME: What about debug intrinsics? This matches old behavior, but
+  // doesn't make sense.
+  void visitIntrinsicInst(IntrinsicInst &II) {
+    if (!IsOffsetKnown)
+      return PI.setAborted(&II);
+
+    if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
+        II.getIntrinsicID() == Intrinsic::lifetime_end) {
+      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+      uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
+                               Length->getLimitedValue());
+      insertUse(II, Offset, Size, true);
+      return;
+    }
+
+    Base::visitIntrinsicInst(II);
+  }
+
+  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
+    // We consider any PHI or select that results in a direct load or store of
+    // the same offset to be a viable use for partitioning purposes. These uses
+    // are considered unsplittable and the size is the maximum loaded or stored
+    // size.
+    SmallPtrSet<Instruction *, 4> Visited;
+    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
+    Visited.insert(Root);
+    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+    // If there are no loads or stores, the access is dead. We mark that as
+    // a size zero access.
+    Size = 0;
+    do {
+      Instruction *I, *UsedI;
+      llvm::tie(UsedI, I) = Uses.pop_back_val();
+
+      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
+        continue;
+      }
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        Value *Op = SI->getOperand(0);
+        if (Op == UsedI)
+          return SI;
+        Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
+        continue;
+      }
+
+      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+        if (!GEP->hasAllZeroIndices())
+          return GEP;
+      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
+                 !isa<SelectInst>(I)) {
+        return I;
+      }
+
+      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
+           ++UI)
+        if (Visited.insert(cast<Instruction>(*UI)))
+          Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
+    } while (!Uses.empty());
+
+    return 0;
+  }
+
+  void visitPHINode(PHINode &PN) {
+    if (PN.use_empty())
+      return;
+    if (!IsOffsetKnown)
+      return PI.setAborted(&PN);
+
+    // See if we already have computed info on this node.
+    std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
+    if (PHIInfo.first) {
+      PHIInfo.second = true;
+      insertUse(PN, Offset, PHIInfo.first);
+      return;
+    }
+
+    // Check for an unsafe use of the PHI node.
+    if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
+      return PI.setAborted(UnsafeI);
+
+    insertUse(PN, Offset, PHIInfo.first);
+  }
+
+  void visitSelectInst(SelectInst &SI) {
+    if (SI.use_empty())
+      return;
+    if (Value *Result = foldSelectInst(SI)) {
+      if (Result == *U)
+        // If the result of the constant fold will be the pointer, recurse
+        // through the select as if we had RAUW'ed it.
+        enqueueUsers(SI);
+
+      return;
+    }
+    if (!IsOffsetKnown)
+      return PI.setAborted(&SI);
+
+    // See if we already have computed info on this node.
+    std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
+    if (SelectInfo.first) {
+      SelectInfo.second = true;
+      insertUse(SI, Offset, SelectInfo.first);
+      return;
+    }
+
+    // Check for an unsafe use of the PHI node.
+    if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
+      return PI.setAborted(UnsafeI);
+
+    insertUse(SI, Offset, SelectInfo.first);
+  }
+
+  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+  void visitInstruction(Instruction &I) {
+    PI.setAborted(&I);
+  }
+};
+
+/// \brief Use adder for the alloca partitioning.
+///
+/// This class adds the uses of an alloca to all of the partitions which they
+/// use. For splittable partitions, this can end up doing essentially a linear
+/// walk of the partitions, but the number of steps remains bounded by the
+/// total result instruction size:
+/// - The number of partitions is a result of the number unsplittable
+///   instructions using the alloca.
+/// - The number of users of each partition is at worst the total number of
+///   splittable instructions using the alloca.
+/// Thus we will produce N * M instructions in the end, where N are the number
+/// of unsplittable uses and M are the number of splittable. This visitor does
+/// the exact same number of updates to the partitioning.
+///
+/// In the more common case, this visitor will leverage the fact that the
+/// partition space is pre-sorted, and do a logarithmic search for the
+/// partition needed, making the total visit a classical ((N + M) * log(N))
+/// complexity operation.
+class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> {
+  friend class PtrUseVisitor<UseBuilder>;
+  friend class InstVisitor<UseBuilder>;
+  typedef PtrUseVisitor<UseBuilder> Base;
+
+  const uint64_t AllocSize;
+  AllocaPartitioning &P;
+
+  /// \brief Set to de-duplicate dead instructions found in the use walk.
+  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
+
+public:
+  UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
+      : PtrUseVisitor<UseBuilder>(TD),
+        AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
+        P(P) {}
+
+private:
+  void markAsDead(Instruction &I) {
+    if (VisitedDeadInsts.insert(&I))
+      P.DeadUsers.push_back(&I);
+  }
+
+  void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) {
+    // If the use has a zero size or extends outside of the allocation, record
+    // it as a dead use for elimination later.
+    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize))
+      return markAsDead(User);
+
+    uint64_t BeginOffset = Offset.getZExtValue();
+    uint64_t EndOffset = BeginOffset + Size;
+
+    // Clamp the end offset to the end of the allocation. Note that this is
+    // formulated to handle even the case where "BeginOffset + Size" overflows.
+    assert(AllocSize >= BeginOffset); // Established above.
+    if (Size > AllocSize - BeginOffset)
+      EndOffset = AllocSize;
+
+    // NB: This only works if we have zero overlapping partitions.
+    iterator I = std::lower_bound(P.begin(), P.end(), BeginOffset);
+    if (I != P.begin() && llvm::prior(I)->EndOffset > BeginOffset)
+      I = llvm::prior(I);
+    iterator E = P.end();
+    bool IsSplit = llvm::next(I) != E && llvm::next(I)->BeginOffset < EndOffset;
+    for (; I != E && I->BeginOffset < EndOffset; ++I) {
+      PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset),
+                         std::min(I->EndOffset, EndOffset), U, IsSplit);
+      P.use_push_back(I, NewPU);
+      if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
+        P.PHIOrSelectOpMap[U]
+          = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
+    }
+  }
+
+  void visitBitCastInst(BitCastInst &BC) {
+    if (BC.use_empty())
+      return markAsDead(BC);
+
+    return Base::visitBitCastInst(BC);
+  }
+
+  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    if (GEPI.use_empty())
+      return markAsDead(GEPI);
+
+    return Base::visitGetElementPtrInst(GEPI);
+  }
+
+  void visitLoadInst(LoadInst &LI) {
+    assert(IsOffsetKnown);
+    uint64_t Size = DL.getTypeStoreSize(LI.getType());
+    insertUse(LI, Offset, Size);
+  }
+
+  void visitStoreInst(StoreInst &SI) {
+    assert(IsOffsetKnown);
+    uint64_t Size = DL.getTypeStoreSize(SI.getOperand(0)->getType());
+
+    // If this memory access can be shown to *statically* extend outside the
+    // bounds of of the allocation, it's behavior is undefined, so simply
+    // ignore it. Note that this is more strict than the generic clamping
+    // behavior of insertUse.
+    if (Offset.isNegative() || Size > AllocSize ||
+        Offset.ugt(AllocSize - Size))
+      return markAsDead(SI);
+
+    insertUse(SI, Offset, Size);
+  }
+
+  void visitMemSetInst(MemSetInst &II) {
+    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+      return markAsDead(II);
+
+    assert(IsOffsetKnown);
+    insertUse(II, Offset, Length ? Length->getLimitedValue()
+                                 : AllocSize - Offset.getLimitedValue());
+  }
+
+  void visitMemTransferInst(MemTransferInst &II) {
+    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+      return markAsDead(II);
+
+    assert(IsOffsetKnown);
+    uint64_t Size = Length ? Length->getLimitedValue()
+                           : AllocSize - Offset.getLimitedValue();
+
+    const MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
+    if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd &&
+        Offsets.DestBegin == Offsets.SourceBegin)
+      return markAsDead(II); // Skip identity transfers without side-effects.
+
+    insertUse(II, Offset, Size);
+  }
+
+  void visitIntrinsicInst(IntrinsicInst &II) {
+    assert(IsOffsetKnown);
+    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+           II.getIntrinsicID() == Intrinsic::lifetime_end);
+
+    ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+    insertUse(II, Offset, std::min(Length->getLimitedValue(),
+                                   AllocSize - Offset.getLimitedValue()));
+  }
+
+  void insertPHIOrSelect(Instruction &User, const APInt &Offset) {
+    uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;
+
+    // For PHI and select operands outside the alloca, we can't nuke the entire
+    // phi or select -- the other side might still be relevant, so we special
+    // case them here and use a separate structure to track the operands
+    // themselves which should be replaced with undef.
+    if ((Offset.isNegative() && Offset.uge(Size)) ||
+        (!Offset.isNegative() && Offset.uge(AllocSize))) {
+      P.DeadOperands.push_back(U);
+      return;
+    }
+
+    insertUse(User, Offset, Size);
+  }
+
+  void visitPHINode(PHINode &PN) {
+    if (PN.use_empty())
+      return markAsDead(PN);
+
+    assert(IsOffsetKnown);
+    insertPHIOrSelect(PN, Offset);
+  }
+
+  void visitSelectInst(SelectInst &SI) {
+    if (SI.use_empty())
+      return markAsDead(SI);
+
+    if (Value *Result = foldSelectInst(SI)) {
+      if (Result == *U)
+        // If the result of the constant fold will be the pointer, recurse
+        // through the select as if we had RAUW'ed it.
+        enqueueUsers(SI);
+      else
+        // Otherwise the operand to the select is dead, and we can replace it
+        // with undef.
+        P.DeadOperands.push_back(U);
+
+      return;
+    }
+
+    assert(IsOffsetKnown);
+    insertPHIOrSelect(SI, Offset);
+  }
+
+  /// \brief Unreachable, we've already visited the alloca once.
+  void visitInstruction(Instruction &I) {
+    llvm_unreachable("Unhandled instruction in use builder.");
+  }
+};
+
+void AllocaPartitioning::splitAndMergePartitions() {
+  size_t NumDeadPartitions = 0;
+
+  // Track the range of splittable partitions that we pass when accumulating
+  // overlapping unsplittable partitions.
+  uint64_t SplitEndOffset = 0ull;
+
+  Partition New(0ull, 0ull, false);
+
+  for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) {
+    ++j;
+
+    if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) {
+      assert(New.BeginOffset == New.EndOffset);
+      New = Partitions[i];
+    } else {
+      assert(New.IsSplittable);
+      New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset);
+    }
+    assert(New.BeginOffset != New.EndOffset);
+
+    // Scan the overlapping partitions.
+    while (j != e && New.EndOffset > Partitions[j].BeginOffset) {
+      // If the new partition we are forming is splittable, stop at the first
+      // unsplittable partition.
+      if (New.IsSplittable && !Partitions[j].IsSplittable)
+        break;
+
+      // Grow the new partition to include any equally splittable range. 'j' is
+      // always equally splittable when New is splittable, but when New is not
+      // splittable, we may subsume some (or part of some) splitable partition
+      // without growing the new one.
+      if (New.IsSplittable == Partitions[j].IsSplittable) {
+        New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset);
+      } else {
+        assert(!New.IsSplittable);
+        assert(Partitions[j].IsSplittable);
+        SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset);
+      }
+
+      Partitions[j].kill();
+      ++NumDeadPartitions;
+      ++j;
+    }
+
+    // If the new partition is splittable, chop off the end as soon as the
+    // unsplittable subsequent partition starts and ensure we eventually cover
+    // the splittable area.
+    if (j != e && New.IsSplittable) {
+      SplitEndOffset = std::max(SplitEndOffset, New.EndOffset);
+      New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
+    }
+
+    // Add the new partition if it differs from the original one and is
+    // non-empty. We can end up with an empty partition here if it was
+    // splittable but there is an unsplittable one that starts at the same
+    // offset.
+    if (New != Partitions[i]) {
+      if (New.BeginOffset != New.EndOffset)
+        Partitions.push_back(New);
+      // Mark the old one for removal.
+      Partitions[i].kill();
+      ++NumDeadPartitions;
+    }
+
+    New.BeginOffset = New.EndOffset;
+    if (!New.IsSplittable) {
+      New.EndOffset = std::max(New.EndOffset, SplitEndOffset);
+      if (j != e && !Partitions[j].IsSplittable)
+        New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
+      New.IsSplittable = true;
+      // If there is a trailing splittable partition which won't be fused into
+      // the next splittable partition go ahead and add it onto the partitions
+      // list.
+      if (New.BeginOffset < New.EndOffset &&
+          (j == e || !Partitions[j].IsSplittable ||
+           New.EndOffset < Partitions[j].BeginOffset)) {
+        Partitions.push_back(New);
+        New.BeginOffset = New.EndOffset = 0ull;
+      }
+    }
+  }
+
+  // Re-sort the partitions now that they have been split and merged into
+  // disjoint set of partitions. Also remove any of the dead partitions we've
+  // replaced in the process.
+  std::sort(Partitions.begin(), Partitions.end());
+  if (NumDeadPartitions) {
+    assert(Partitions.back().isDead());
+    assert((ptrdiff_t)NumDeadPartitions ==
+           std::count(Partitions.begin(), Partitions.end(), Partitions.back()));
+  }
+  Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end());
+}
+
+AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
+    :
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+      AI(AI),
+#endif
+      PointerEscapingInstr(0) {
+  PartitionBuilder PB(TD, AI, *this);
+  PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI);
+  if (PtrI.isEscaped() || PtrI.isAborted()) {
+    // FIXME: We should sink the escape vs. abort info into the caller nicely,
+    // possibly by just storing the PtrInfo in the AllocaPartitioning.
+    PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
+                                                  : PtrI.getAbortingInst();
+    assert(PointerEscapingInstr && "Did not track a bad instruction");
+    return;
+  }
+
+  // Sort the uses. This arranges for the offsets to be in ascending order,
+  // and the sizes to be in descending order.
+  std::sort(Partitions.begin(), Partitions.end());
+
+  // Remove any partitions from the back which are marked as dead.
+  while (!Partitions.empty() && Partitions.back().isDead())
+    Partitions.pop_back();
+
+  if (Partitions.size() > 1) {
+    // Intersect splittability for all partitions with equal offsets and sizes.
+    // Then remove all but the first so that we have a sequence of non-equal but
+    // potentially overlapping partitions.
+    for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E;
+         I = J) {
+      ++J;
+      while (J != E && *I == *J) {
+        I->IsSplittable &= J->IsSplittable;
+        ++J;
+      }
+    }
+    Partitions.erase(std::unique(Partitions.begin(), Partitions.end()),
+                     Partitions.end());
+
+    // Split splittable and merge unsplittable partitions into a disjoint set
+    // of partitions over the used space of the allocation.
+    splitAndMergePartitions();
+  }
+
+  // Record how many partitions we end up with.
+  NumAllocaPartitions += Partitions.size();
+  MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(), MaxPartitionsPerAlloca);
+
+  // Now build up the user lists for each of these disjoint partitions by
+  // re-walking the recursive users of the alloca.
+  Uses.resize(Partitions.size());
+  UseBuilder UB(TD, AI, *this);
+  PtrI = UB.visitPtr(AI);
+  assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!");
+  assert(!PtrI.isAborted() && "Early aborted the visit of the pointer.");
+
+  unsigned NumUses = 0;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
+  for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx)
+    NumUses += Uses[Idx].size();
+#endif
+  NumAllocaPartitionUses += NumUses;
+  MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses, MaxPartitionUsesPerAlloca);
+}
+
+Type *AllocaPartitioning::getCommonType(iterator I) const {
+  Type *Ty = 0;
+  for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
+    Use *U = UI->getUse();
+    if (!U)
+      continue; // Skip dead uses.
+    if (isa<IntrinsicInst>(*U->getUser()))
+      continue;
+    if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
+      continue;
+
+    Type *UserTy = 0;
+    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser()))
+      UserTy = LI->getType();
+    else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser()))
+      UserTy = SI->getValueOperand()->getType();
+    else
+      return 0; // Bail if we have weird uses.
+
+    if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
+      // If the type is larger than the partition, skip it. We only encounter
+      // this for split integer operations where we want to use the type of the
+      // entity causing the split.
+      if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8)
+        continue;
+
+      // If we have found an integer type use covering the alloca, use that
+      // regardless of the other types, as integers are often used for a "bucket
+      // of bits" type.
+      return ITy;
+    }
+
+    if (Ty && Ty != UserTy)
+      return 0;
+
+    Ty = UserTy;
+  }
+  return Ty;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void AllocaPartitioning::print(raw_ostream &OS, const_iterator I,
+                               StringRef Indent) const {
+  OS << Indent << "partition #" << (I - begin())
+     << " [" << I->BeginOffset << "," << I->EndOffset << ")"
+     << (I->IsSplittable ? " (splittable)" : "")
+     << (Uses[I - begin()].empty() ? " (zero uses)" : "")
+     << "\n";
+}
+
+void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I,
+                                    StringRef Indent) const {
+  for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
+    if (!UI->getUse())
+      continue; // Skip dead uses.
+    OS << Indent << "  [" << UI->BeginOffset << "," << UI->EndOffset << ") "
+       << "used by: " << *UI->getUse()->getUser() << "\n";
+    if (MemTransferInst *II =
+            dyn_cast<MemTransferInst>(UI->getUse()->getUser())) {
+      const MemTransferOffsets &MTO = MemTransferInstData.lookup(II);
+      bool IsDest;
+      if (!MTO.IsSplittable)
+        IsDest = UI->BeginOffset == MTO.DestBegin;
+      else
+        IsDest = MTO.DestBegin != 0u;
+      OS << Indent << "    (original " << (IsDest ? "dest" : "source") << ": "
+         << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin)
+         << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n";
+    }
+  }
+}
+
+void AllocaPartitioning::print(raw_ostream &OS) const {
+  if (PointerEscapingInstr) {
+    OS << "No partitioning for alloca: " << AI << "\n"
+       << "  A pointer to this alloca escaped by:\n"
+       << "  " << *PointerEscapingInstr << "\n";
+    return;
+  }
+
+  OS << "Partitioning of alloca: " << AI << "\n";
+  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+    print(OS, I);
+    printUsers(OS, I);
+  }
+}
+
+void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); }
+void AllocaPartitioning::dump() const { print(dbgs()); }
+
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+
+namespace {
+/// \brief Implementation of LoadAndStorePromoter for promoting allocas.
+///
+/// This subclass of LoadAndStorePromoter adds overrides to handle promoting
+/// the loads and stores of an alloca instruction, as well as updating its
+/// debug information. This is used when a domtree is unavailable and thus
+/// mem2reg in its full form can't be used to handle promotion of allocas to
+/// scalar values.
+class AllocaPromoter : public LoadAndStorePromoter {
+  AllocaInst &AI;
+  DIBuilder &DIB;
+
+  SmallVector<DbgDeclareInst *, 4> DDIs;
+  SmallVector<DbgValueInst *, 4> DVIs;
+
+public:
+  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+                 AllocaInst &AI, DIBuilder &DIB)
+    : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
+
+  void run(const SmallVectorImpl<Instruction*> &Insts) {
+    // Remember which alloca we're promoting (for isInstInList).
+    if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
+      for (Value::use_iterator UI = DebugNode->use_begin(),
+                               UE = DebugNode->use_end();
+           UI != UE; ++UI)
+        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+          DDIs.push_back(DDI);
+        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI))
+          DVIs.push_back(DVI);
+    }
+
+    LoadAndStorePromoter::run(Insts);
+    AI.eraseFromParent();
+    while (!DDIs.empty())
+      DDIs.pop_back_val()->eraseFromParent();
+    while (!DVIs.empty())
+      DVIs.pop_back_val()->eraseFromParent();
+  }
+
+  virtual bool isInstInList(Instruction *I,
+                            const SmallVectorImpl<Instruction*> &Insts) const {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I))
+      return LI->getOperand(0) == &AI;
+    return cast<StoreInst>(I)->getPointerOperand() == &AI;
+  }
+
+  virtual void updateDebugInfo(Instruction *Inst) const {
+    for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
+           E = DDIs.end(); I != E; ++I) {
+      DbgDeclareInst *DDI = *I;
+      if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+        ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+      else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+        ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+    }
+    for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
+           E = DVIs.end(); I != E; ++I) {
+      DbgValueInst *DVI = *I;
+      Value *Arg = 0;
+      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        // If an argument is zero extended then use argument directly. The ZExt
+        // may be zapped by an optimization pass in future.
+        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+          Arg = dyn_cast<Argument>(ZExt->getOperand(0));
+        else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+          Arg = dyn_cast<Argument>(SExt->getOperand(0));
+        if (!Arg)
+          Arg = SI->getValueOperand();
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        Arg = LI->getPointerOperand();
+      } else {
+        continue;
+      }
+      Instruction *DbgVal =
+        DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
+                                     Inst);
+      DbgVal->setDebugLoc(DVI->getDebugLoc());
+    }
+  }
+};
+} // end anon namespace
+
+
+namespace {
+/// \brief An optimization pass providing Scalar Replacement of Aggregates.
+///
+/// This pass takes allocations which can be completely analyzed (that is, they
+/// don't escape) and tries to turn them into scalar SSA values. There are
+/// a few steps to this process.
+///
+/// 1) It takes allocations of aggregates and analyzes the ways in which they
+///    are used to try to split them into smaller allocations, ideally of
+///    a single scalar data type. It will split up memcpy and memset accesses
+///    as necessary and try to isolate individual scalar accesses.
+/// 2) It will transform accesses into forms which are suitable for SSA value
+///    promotion. This can be replacing a memset with a scalar store of an
+///    integer value, or it can involve speculating operations on a PHI or
+///    select to be a PHI or select of the results.
+/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
+///    onto insert and extract operations on a vector value, and convert them to
+///    this form. By doing so, it will enable promotion of vector aggregates to
+///    SSA vector values.
+class SROA : public FunctionPass {
+  const bool RequiresDomTree;
+
+  LLVMContext *C;
+  const DataLayout *TD;
+  DominatorTree *DT;
+
+  /// \brief Worklist of alloca instructions to simplify.
+  ///
+  /// Each alloca in the function is added to this. Each new alloca formed gets
+  /// added to it as well to recursively simplify unless that alloca can be
+  /// directly promoted. Finally, each time we rewrite a use of an alloca other
+  /// the one being actively rewritten, we add it back onto the list if not
+  /// already present to ensure it is re-visited.
+  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;
+
+  /// \brief A collection of instructions to delete.
+  /// We try to batch deletions to simplify code and make things a bit more
+  /// efficient.
+  SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts;
+
+  /// \brief Post-promotion worklist.
+  ///
+  /// Sometimes we discover an alloca which has a high probability of becoming
+  /// viable for SROA after a round of promotion takes place. In those cases,
+  /// the alloca is enqueued here for re-processing.
+  ///
+  /// Note that we have to be very careful to clear allocas out of this list in
+  /// the event they are deleted.
+  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist;
+
+  /// \brief A collection of alloca instructions we can directly promote.
+  std::vector<AllocaInst *> PromotableAllocas;
+
+public:
+  SROA(bool RequiresDomTree = true)
+      : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
+        C(0), TD(0), DT(0) {
+    initializeSROAPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  const char *getPassName() const { return "SROA"; }
+  static char ID;
+
+private:
+  friend class PHIOrSelectSpeculator;
+  friend class AllocaPartitionRewriter;
+  friend class AllocaPartitionVectorRewriter;
+
+  bool rewriteAllocaPartition(AllocaInst &AI,
+                              AllocaPartitioning &P,
+                              AllocaPartitioning::iterator PI);
+  bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P);
+  bool runOnAlloca(AllocaInst &AI);
+  void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
+  bool promoteAllocas(Function &F);
+};
+}
+
+char SROA::ID = 0;
+
+FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
+  return new SROA(RequiresDomTree);
+}
+
+INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
+                    false, false)
+
+namespace {
+/// \brief Visitor to speculate PHIs and Selects where possible.
+class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<PHIOrSelectSpeculator>;
+
+  const DataLayout &TD;
+  AllocaPartitioning &P;
+  SROA &Pass;
+
+public:
+  PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass)
+    : TD(TD), P(P), Pass(Pass) {}
+
+  /// \brief Visit the users of an alloca partition and rewrite them.
+  void visitUsers(AllocaPartitioning::const_iterator PI) {
+    // Note that we need to use an index here as the underlying vector of uses
+    // may be grown during speculation. However, we never need to re-visit the
+    // new uses, and so we can use the initial size bound.
+    for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) {
+      const PartitionUse &PU = P.getUse(PI, Idx);
+      if (!PU.getUse())
+        continue; // Skip dead use.
+
+      visit(cast<Instruction>(PU.getUse()->getUser()));
+    }
+  }
+
+private:
+  // By default, skip this instruction.
+  void visitInstruction(Instruction &I) {}
+
+  /// PHI instructions that use an alloca and are subsequently loaded can be
+  /// rewritten to load both input pointers in the pred blocks and then PHI the
+  /// results, allowing the load of the alloca to be promoted.
+  /// From this:
+  ///   %P2 = phi [i32* %Alloca, i32* %Other]
+  ///   %V = load i32* %P2
+  /// to:
+  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+  ///   ...
+  ///   %V2 = load i32* %Other
+  ///   ...
+  ///   %V = phi [i32 %V1, i32 %V2]
+  ///
+  /// We can do this to a select if its only uses are loads and if the operands
+  /// to the select can be loaded unconditionally.
+  ///
+  /// FIXME: This should be hoisted into a generic utility, likely in
+  /// Transforms/Util/Local.h
+  bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
+    // For now, we can only do this promotion if the load is in the same block
+    // as the PHI, and if there are no stores between the phi and load.
+    // TODO: Allow recursive phi users.
+    // TODO: Allow stores.
+    BasicBlock *BB = PN.getParent();
+    unsigned MaxAlign = 0;
+    for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
+         UI != UE; ++UI) {
+      LoadInst *LI = dyn_cast<LoadInst>(*UI);
+      if (LI == 0 || !LI->isSimple()) return false;
+
+      // For now we only allow loads in the same block as the PHI.  This is
+      // a common case that happens when instcombine merges two loads through
+      // a PHI.
+      if (LI->getParent() != BB) return false;
+
+      // Ensure that there are no instructions between the PHI and the load that
+      // could store.
+      for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
+        if (BBI->mayWriteToMemory())
+          return false;
+
+      MaxAlign = std::max(MaxAlign, LI->getAlignment());
+      Loads.push_back(LI);
+    }
+
+    // We can only transform this if it is safe to push the loads into the
+    // predecessor blocks. The only thing to watch out for is that we can't put
+    // a possibly trapping load in the predecessor if it is a critical edge.
+    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+      TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+      Value *InVal = PN.getIncomingValue(Idx);
+
+      // If the value is produced by the terminator of the predecessor (an
+      // invoke) or it has side-effects, there is no valid place to put a load
+      // in the predecessor.
+      if (TI == InVal || TI->mayHaveSideEffects())
+        return false;
+
+      // If the predecessor has a single successor, then the edge isn't
+      // critical.
+      if (TI->getNumSuccessors() == 1)
+        continue;
+
+      // If this pointer is always safe to load, or if we can prove that there
+      // is already a load in the block, then we can move the load to the pred
+      // block.
+      if (InVal->isDereferenceablePointer() ||
+          isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
+        continue;
+
+      return false;
+    }
+
+    return true;
+  }
+
+  void visitPHINode(PHINode &PN) {
+    DEBUG(dbgs() << "    original: " << PN << "\n");
+
+    SmallVector<LoadInst *, 4> Loads;
+    if (!isSafePHIToSpeculate(PN, Loads))
+      return;
+
+    assert(!Loads.empty());
+
+    Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
+    IRBuilderTy PHIBuilder(&PN);
+    PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+                                          PN.getName() + ".sroa.speculated");
+
+    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
+    // matter which one we get and if any differ.
+    LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
+    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+    unsigned Align = SomeLoad->getAlignment();
+
+    // Rewrite all loads of the PN to use the new PHI.
+    do {
+      LoadInst *LI = Loads.pop_back_val();
+      LI->replaceAllUsesWith(NewPN);
+      Pass.DeadInsts.insert(LI);
+    } while (!Loads.empty());
+
+    // Inject loads into all of the pred blocks.
+    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+      BasicBlock *Pred = PN.getIncomingBlock(Idx);
+      TerminatorInst *TI = Pred->getTerminator();
+      Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx));
+      Value *InVal = PN.getIncomingValue(Idx);
+      IRBuilderTy PredBuilder(TI);
+
+      LoadInst *Load
+        = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." +
+                                         Pred->getName()));
+      ++NumLoadsSpeculated;
+      Load->setAlignment(Align);
+      if (TBAATag)
+        Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+      NewPN->addIncoming(Load, Pred);
+
+      Instruction *Ptr = dyn_cast<Instruction>(InVal);
+      if (!Ptr)
+        // No uses to rewrite.
+        continue;
+
+      // Try to lookup and rewrite any partition uses corresponding to this phi
+      // input.
+      AllocaPartitioning::iterator PI
+        = P.findPartitionForPHIOrSelectOperand(InUse);
+      if (PI == P.end())
+        continue;
+
+      // Replace the Use in the PartitionUse for this operand with the Use
+      // inside the load.
+      AllocaPartitioning::use_iterator UI
+        = P.findPartitionUseForPHIOrSelectOperand(InUse);
+      assert(isa<PHINode>(*UI->getUse()->getUser()));
+      UI->setUse(&Load->getOperandUse(Load->getPointerOperandIndex()));
+    }
+    DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
+  }
+
+  /// Select instructions that use an alloca and are subsequently loaded can be
+  /// rewritten to load both input pointers and then select between the result,
+  /// allowing the load of the alloca to be promoted.
+  /// From this:
+  ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+  ///   %V = load i32* %P2
+  /// to:
+  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+  ///   %V2 = load i32* %Other
+  ///   %V = select i1 %cond, i32 %V1, i32 %V2
+  ///
+  /// We can do this to a select if its only uses are loads and if the operand
+  /// to the select can be loaded unconditionally.
+  bool isSafeSelectToSpeculate(SelectInst &SI,
+                               SmallVectorImpl<LoadInst *> &Loads) {
+    Value *TValue = SI.getTrueValue();
+    Value *FValue = SI.getFalseValue();
+    bool TDerefable = TValue->isDereferenceablePointer();
+    bool FDerefable = FValue->isDereferenceablePointer();
+
+    for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
+         UI != UE; ++UI) {
+      LoadInst *LI = dyn_cast<LoadInst>(*UI);
+      if (LI == 0 || !LI->isSimple()) return false;
+
+      // Both operands to the select need to be dereferencable, either
+      // absolutely (e.g. allocas) or at this point because we can see other
+      // accesses to it.
+      if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
+                                                      LI->getAlignment(), &TD))
+        return false;
+      if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
+                                                      LI->getAlignment(), &TD))
+        return false;
+      Loads.push_back(LI);
+    }
+
+    return true;
+  }
+
+  void visitSelectInst(SelectInst &SI) {
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+
+    // If the select isn't safe to speculate, just use simple logic to emit it.
+    SmallVector<LoadInst *, 4> Loads;
+    if (!isSafeSelectToSpeculate(SI, Loads))
+      return;
+
+    IRBuilderTy IRB(&SI);
+    Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) };
+    AllocaPartitioning::iterator PIs[2];
+    PartitionUse PUs[2];
+    for (unsigned i = 0, e = 2; i != e; ++i) {
+      PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]);
+      if (PIs[i] != P.end()) {
+        // If the pointer is within the partitioning, remove the select from
+        // its uses. We'll add in the new loads below.
+        AllocaPartitioning::use_iterator UI
+          = P.findPartitionUseForPHIOrSelectOperand(Ops[i]);
+        PUs[i] = *UI;
+        // Clear out the use here so that the offsets into the use list remain
+        // stable but this use is ignored when rewriting.
+        UI->setUse(0);
+      }
+    }
+
+    Value *TV = SI.getTrueValue();
+    Value *FV = SI.getFalseValue();
+    // Replace the loads of the select with a select of two loads.
+    while (!Loads.empty()) {
+      LoadInst *LI = Loads.pop_back_val();
+
+      IRB.SetInsertPoint(LI);
+      LoadInst *TL =
+        IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
+      LoadInst *FL =
+        IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
+      NumLoadsSpeculated += 2;
+
+      // Transfer alignment and TBAA info if present.
+      TL->setAlignment(LI->getAlignment());
+      FL->setAlignment(LI->getAlignment());
+      if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
+        TL->setMetadata(LLVMContext::MD_tbaa, Tag);
+        FL->setMetadata(LLVMContext::MD_tbaa, Tag);
+      }
+
+      Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
+                                  LI->getName() + ".sroa.speculated");
+
+      LoadInst *Loads[2] = { TL, FL };
+      for (unsigned i = 0, e = 2; i != e; ++i) {
+        if (PIs[i] != P.end()) {
+          Use *LoadUse = &Loads[i]->getOperandUse(0);
+          assert(PUs[i].getUse()->get() == LoadUse->get());
+          PUs[i].setUse(LoadUse);
+          P.use_push_back(PIs[i], PUs[i]);
+        }
+      }
+
+      DEBUG(dbgs() << "          speculated to: " << *V << "\n");
+      LI->replaceAllUsesWith(V);
+      Pass.DeadInsts.insert(LI);
+    }
+  }
+};
+}
+
+/// \brief Build a GEP out of a base pointer and indices.
+///
+/// This will return the BasePtr if that is valid, or build a new GEP
+/// instruction using the IRBuilder if GEP-ing is needed.
+static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
+                       SmallVectorImpl<Value *> &Indices) {
+  if (Indices.empty())
+    return BasePtr;
+
+  // A single zero index is a no-op, so check for this and avoid building a GEP
+  // in that case.
+  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
+    return BasePtr;
+
+  return IRB.CreateInBoundsGEP(BasePtr, Indices, "idx");
+}
+
+/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// TargetTy without changing the offset of the pointer.
+///
+/// This routine assumes we've already established a properly offset GEP with
+/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
+/// zero-indices down through type layers until we find one the same as
+/// TargetTy. If we can't find one with the same type, we at least try to use
+/// one with the same size. If none of that works, we just produce the GEP as
+/// indicated by Indices to have the correct offset.
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD,
+                                    Value *BasePtr, Type *Ty, Type *TargetTy,
+                                    SmallVectorImpl<Value *> &Indices) {
+  if (Ty == TargetTy)
+    return buildGEP(IRB, BasePtr, Indices);
+
+  // See if we can descend into a struct and locate a field with the correct
+  // type.
+  unsigned NumLayers = 0;
+  Type *ElementTy = Ty;
+  do {
+    if (ElementTy->isPointerTy())
+      break;
+    if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
+      ElementTy = SeqTy->getElementType();
+      // Note that we use the default address space as this index is over an
+      // array or a vector, not a pointer.
+      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0)));
+    } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
+      if (STy->element_begin() == STy->element_end())
+        break; // Nothing left to descend into.
+      ElementTy = *STy->element_begin();
+      Indices.push_back(IRB.getInt32(0));
+    } else {
+      break;
+    }
+    ++NumLayers;
+  } while (ElementTy != TargetTy);
+  if (ElementTy != TargetTy)
+    Indices.erase(Indices.end() - NumLayers, Indices.end());
+
+  return buildGEP(IRB, BasePtr, Indices);
+}
+
+/// \brief Recursively compute indices for a natural GEP.
+///
+/// This is the recursive step for getNaturalGEPWithOffset that walks down the
+/// element types adding appropriate indices for the GEP.
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD,
+                                       Value *Ptr, Type *Ty, APInt &Offset,
+                                       Type *TargetTy,
+                                       SmallVectorImpl<Value *> &Indices) {
+  if (Offset == 0)
+    return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices);
+
+  // We can't recurse through pointer types.
+  if (Ty->isPointerTy())
+    return 0;
+
+  // We try to analyze GEPs over vectors here, but note that these GEPs are
+  // extremely poorly defined currently. The long-term goal is to remove GEPing
+  // over a vector from the IR completely.
+  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+    unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType());
+    if (ElementSizeInBits % 8)
+      return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
+    APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
+    APInt NumSkippedElements = Offset.sdiv(ElementSize);
+    if (NumSkippedElements.ugt(VecTy->getNumElements()))
+      return 0;
+    Offset -= NumSkippedElements * ElementSize;
+    Indices.push_back(IRB.getInt(NumSkippedElements));
+    return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
+                                    Offset, TargetTy, Indices);
+  }
+
+  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+    Type *ElementTy = ArrTy->getElementType();
+    APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
+    APInt NumSkippedElements = Offset.sdiv(ElementSize);
+    if (NumSkippedElements.ugt(ArrTy->getNumElements()))
+      return 0;
+
+    Offset -= NumSkippedElements * ElementSize;
+    Indices.push_back(IRB.getInt(NumSkippedElements));
+    return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+                                    Indices);
+  }
+
+  StructType *STy = dyn_cast<StructType>(Ty);
+  if (!STy)
+    return 0;
+
+  const StructLayout *SL = TD.getStructLayout(STy);
+  uint64_t StructOffset = Offset.getZExtValue();
+  if (StructOffset >= SL->getSizeInBytes())
+    return 0;
+  unsigned Index = SL->getElementContainingOffset(StructOffset);
+  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
+  Type *ElementTy = STy->getElementType(Index);
+  if (Offset.uge(TD.getTypeAllocSize(ElementTy)))
+    return 0; // The offset points into alignment padding.
+
+  Indices.push_back(IRB.getInt32(Index));
+  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+                                  Indices);
+}
+
+/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// resulting in a particular type.
+///
+/// The goal is to produce a "natural" looking GEP that works with the existing
+/// composite types to arrive at the appropriate offset and element type for
+/// a pointer. TargetTy is the element type the returned GEP should point-to if
+/// possible. We recurse by decreasing Offset, adding the appropriate index to
+/// Indices, and setting Ty to the result subtype.
+///
+/// If no natural GEP can be constructed, this function returns null.
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD,
+                                      Value *Ptr, APInt Offset, Type *TargetTy,
+                                      SmallVectorImpl<Value *> &Indices) {
+  PointerType *Ty = cast<PointerType>(Ptr->getType());
+
+  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
+  // an i8.
+  if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8))
+    return 0;
+
+  Type *ElementTy = Ty->getElementType();
+  if (!ElementTy->isSized())
+    return 0; // We can't GEP through an unsized element.
+  APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
+  if (ElementSize == 0)
+    return 0; // Zero-length arrays can't help us build a natural GEP.
+  APInt NumSkippedElements = Offset.sdiv(ElementSize);
+
+  Offset -= NumSkippedElements * ElementSize;
+  Indices.push_back(IRB.getInt(NumSkippedElements));
+  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+                                  Indices);
+}
+
+/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// resulting pointer has PointerTy.
+///
+/// This tries very hard to compute a "natural" GEP which arrives at the offset
+/// and produces the pointer type desired. Where it cannot, it will try to use
+/// the natural GEP to arrive at the offset and bitcast to the type. Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD,
+                             Value *Ptr, APInt Offset, Type *PointerTy) {
+  // Even though we don't look through PHI nodes, we could be called on an
+  // instruction in an unreachable block, which may be on a cycle.
+  SmallPtrSet<Value *, 4> Visited;
+  Visited.insert(Ptr);
+  SmallVector<Value *, 4> Indices;
+
+  // We may end up computing an offset pointer that has the wrong type. If we
+  // never are able to compute one directly that has the correct type, we'll
+  // fall back to it, so keep it around here.
+  Value *OffsetPtr = 0;
+
+  // Remember any i8 pointer we come across to re-use if we need to do a raw
+  // byte offset.
+  Value *Int8Ptr = 0;
+  APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+  Type *TargetTy = PointerTy->getPointerElementType();
+
+  do {
+    // First fold any existing GEPs into the offset.
+    while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+      APInt GEPOffset(Offset.getBitWidth(), 0);
+      if (!GEP->accumulateConstantOffset(TD, GEPOffset))
+        break;
+      Offset += GEPOffset;
+      Ptr = GEP->getPointerOperand();
+      if (!Visited.insert(Ptr))
+        break;
+    }
+
+    // See if we can perform a natural GEP here.
+    Indices.clear();
+    if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
+                                           Indices)) {
+      if (P->getType() == PointerTy) {
+        // Zap any offset pointer that we ended up computing in previous rounds.
+        if (OffsetPtr && OffsetPtr->use_empty())
+          if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
+            I->eraseFromParent();
+        return P;
+      }
+      if (!OffsetPtr) {
+        OffsetPtr = P;
+      }
+    }
+
+    // Stash this pointer if we've found an i8*.
+    if (Ptr->getType()->isIntegerTy(8)) {
+      Int8Ptr = Ptr;
+      Int8PtrOffset = Offset;
+    }
+
+    // Peel off a layer of the pointer and update the offset appropriately.
+    if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+      Ptr = cast<Operator>(Ptr)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+      if (GA->mayBeOverridden())
+        break;
+      Ptr = GA->getAliasee();
+    } else {
+      break;
+    }
+    assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+  } while (Visited.insert(Ptr));
+
+  if (!OffsetPtr) {
+    if (!Int8Ptr) {
+      Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
+                                  "raw_cast");
+      Int8PtrOffset = Offset;
+    }
+
+    OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
+      IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+                            "raw_idx");
+  }
+  Ptr = OffsetPtr;
+
+  // On the off chance we were targeting i8*, guard the bitcast here.
+  if (Ptr->getType() != PointerTy)
+    Ptr = IRB.CreateBitCast(Ptr, PointerTy, "cast");
+
+  return Ptr;
+}
+
+/// \brief Test whether we can convert a value from the old to the new type.
+///
+/// This predicate should be used to guard calls to convertValue in order to
+/// ensure that we only try to convert viable values. The strategy is that we
+/// will peel off single element struct and array wrappings to get to an
+/// underlying value, and convert that value.
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+  if (OldTy == NewTy)
+    return true;
+  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
+    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
+      if (NewITy->getBitWidth() >= OldITy->getBitWidth())
+        return true;
+  if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
+    return false;
+  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
+    return false;
+
+  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
+    if (NewTy->isPointerTy() && OldTy->isPointerTy())
+      return true;
+    if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
+      return true;
+    return false;
+  }
+
+  return true;
+}
+
+/// \brief Generic routine to convert an SSA value to a value of a different
+/// type.
+///
+/// This will try various different casting techniques, such as bitcasts,
+/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
+/// two types for viability with this routine.
+static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+                           Type *Ty) {
+  assert(canConvertValue(DL, V->getType(), Ty) &&
+         "Value not convertable to type");
+  if (V->getType() == Ty)
+    return V;
+  if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType()))
+    if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty))
+      if (NewITy->getBitWidth() > OldITy->getBitWidth())
+        return IRB.CreateZExt(V, NewITy);
+  if (V->getType()->isIntegerTy() && Ty->isPointerTy())
+    return IRB.CreateIntToPtr(V, Ty);
+  if (V->getType()->isPointerTy() && Ty->isIntegerTy())
+    return IRB.CreatePtrToInt(V, Ty);
+
+  return IRB.CreateBitCast(V, Ty);
+}
+
+/// \brief Test whether the given alloca partition can be promoted to a vector.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into a vector alloca with only
+/// whole-vector loads and stores such that it could be promoted to a vector
+/// SSA value. We only can ensure this for a limited set of operations, and we
+/// don't want to do the rewrites unless we are confident that the result will
+/// be promotable, so we have an early test here.
+static bool isVectorPromotionViable(const DataLayout &TD,
+                                    Type *AllocaTy,
+                                    AllocaPartitioning &P,
+                                    uint64_t PartitionBeginOffset,
+                                    uint64_t PartitionEndOffset,
+                                    AllocaPartitioning::const_use_iterator I,
+                                    AllocaPartitioning::const_use_iterator E) {
+  VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
+  if (!Ty)
+    return false;
+
+  uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType());
+
+  // While the definition of LLVM vectors is bitpacked, we don't support sizes
+  // that aren't byte sized.
+  if (ElementSize % 8)
+    return false;
+  assert((TD.getTypeSizeInBits(Ty) % 8) == 0 &&
+         "vector size not a multiple of element size?");
+  ElementSize /= 8;
+
+  for (; I != E; ++I) {
+    Use *U = I->getUse();
+    if (!U)
+      continue; // Skip dead use.
+
+    uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
+    uint64_t BeginIndex = BeginOffset / ElementSize;
+    if (BeginIndex * ElementSize != BeginOffset ||
+        BeginIndex >= Ty->getNumElements())
+      return false;
+    uint64_t EndOffset = I->EndOffset - PartitionBeginOffset;
+    uint64_t EndIndex = EndOffset / ElementSize;
+    if (EndIndex * ElementSize != EndOffset ||
+        EndIndex > Ty->getNumElements())
+      return false;
+
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    uint64_t NumElements = EndIndex - BeginIndex;
+    Type *PartitionTy
+      = (NumElements == 1) ? Ty->getElementType()
+                           : VectorType::get(Ty->getElementType(), NumElements);
+
+    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+      if (MI->isVolatile())
+        return false;
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) {
+        const AllocaPartitioning::MemTransferOffsets &MTO
+          = P.getMemTransferOffsets(*MTI);
+        if (!MTO.IsSplittable)
+          return false;
+      }
+    } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
+      // Disable vector promotion when there are loads or stores of an FCA.
+      return false;
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+      if (LI->isVolatile())
+        return false;
+      if (!canConvertValue(TD, PartitionTy, LI->getType()))
+        return false;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+      if (SI->isVolatile())
+        return false;
+      if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy))
+        return false;
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+/// \brief Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(const DataLayout &TD,
+                                    Type *AllocaTy,
+                                    uint64_t AllocBeginOffset,
+                                    AllocaPartitioning &P,
+                                    AllocaPartitioning::const_use_iterator I,
+                                    AllocaPartitioning::const_use_iterator E) {
+  uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
+  // Don't create integer types larger than the maximum bitwidth.
+  if (SizeInBits > IntegerType::MAX_INT_BITS)
+    return false;
+
+  // Don't try to handle allocas with bit-padding.
+  if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
+    return false;
+
+  // We need to ensure that an integer type with the appropriate bitwidth can
+  // be converted to the alloca type, whatever that is. We don't want to force
+  // the alloca itself to have an integer type if there is a more suitable one.
+  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+  if (!canConvertValue(TD, AllocaTy, IntTy) ||
+      !canConvertValue(TD, IntTy, AllocaTy))
+    return false;
+
+  uint64_t Size = TD.getTypeStoreSize(AllocaTy);
+
+  // Check the uses to ensure the uses are (likely) promotable integer uses.
+  // Also ensure that the alloca has a covering load or store. We don't want
+  // to widen the integer operations only to fail to promote due to some other
+  // unsplittable entry (which we may make splittable later).
+  bool WholeAllocaOp = false;
+  for (; I != E; ++I) {
+    Use *U = I->getUse();
+    if (!U)
+      continue; // Skip dead use.
+
+    uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
+    uint64_t RelEnd = I->EndOffset - AllocBeginOffset;
+
+    // We can't reasonably handle cases where the load or store extends past
+    // the end of the aloca's type and into its padding.
+    if (RelEnd > Size)
+      return false;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+      if (LI->isVolatile())
+        return false;
+      if (RelBegin == 0 && RelEnd == Size)
+        WholeAllocaOp = true;
+      if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
+          return false;
+        continue;
+      }
+      // Non-integer loads need to be convertible from the alloca type so that
+      // they are promotable.
+      if (RelBegin != 0 || RelEnd != Size ||
+          !canConvertValue(TD, AllocaTy, LI->getType()))
+        return false;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+      Type *ValueTy = SI->getValueOperand()->getType();
+      if (SI->isVolatile())
+        return false;
+      if (RelBegin == 0 && RelEnd == Size)
+        WholeAllocaOp = true;
+      if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
+          return false;
+        continue;
+      }
+      // Non-integer stores need to be convertible to the alloca type so that
+      // they are promotable.
+      if (RelBegin != 0 || RelEnd != Size ||
+          !canConvertValue(TD, ValueTy, AllocaTy))
+        return false;
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+      if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
+        return false;
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) {
+        const AllocaPartitioning::MemTransferOffsets &MTO
+          = P.getMemTransferOffsets(*MTI);
+        if (!MTO.IsSplittable)
+          return false;
+      }
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+          II->getIntrinsicID() != Intrinsic::lifetime_end)
+        return false;
+    } else {
+      return false;
+    }
+  }
+  return WholeAllocaOp;
+}
+
+static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+                             IntegerType *Ty, uint64_t Offset,
+                             const Twine &Name) {
+  DEBUG(dbgs() << "       start: " << *V << "\n");
+  IntegerType *IntTy = cast<IntegerType>(V->getType());
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element extends past full value");
+  uint64_t ShAmt = 8*Offset;
+  if (DL.isBigEndian())
+    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+  if (ShAmt) {
+    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+    DEBUG(dbgs() << "     shifted: " << *V << "\n");
+  }
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot extract to a larger integer!");
+  if (Ty != IntTy) {
+    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+    DEBUG(dbgs() << "     trunced: " << *V << "\n");
+  }
+  return V;
+}
+
+static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
+                            Value *V, uint64_t Offset, const Twine &Name) {
+  IntegerType *IntTy = cast<IntegerType>(Old->getType());
+  IntegerType *Ty = cast<IntegerType>(V->getType());
+  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+         "Cannot insert a larger integer!");
+  DEBUG(dbgs() << "       start: " << *V << "\n");
+  if (Ty != IntTy) {
+    V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+    DEBUG(dbgs() << "    extended: " << *V << "\n");
+  }
+  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+         "Element store outside of alloca store");
+  uint64_t ShAmt = 8*Offset;
+  if (DL.isBigEndian())
+    ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+  if (ShAmt) {
+    V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+    DEBUG(dbgs() << "     shifted: " << *V << "\n");
+  }
+
+  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
+    APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+    Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+    DEBUG(dbgs() << "      masked: " << *Old << "\n");
+    V = IRB.CreateOr(Old, V, Name + ".insert");
+    DEBUG(dbgs() << "    inserted: " << *V << "\n");
+  }
+  return V;
+}
+
+static Value *extractVector(IRBuilderTy &IRB, Value *V,
+                            unsigned BeginIndex, unsigned EndIndex,
+                            const Twine &Name) {
+  VectorType *VecTy = cast<VectorType>(V->getType());
+  unsigned NumElements = EndIndex - BeginIndex;
+  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+  if (NumElements == VecTy->getNumElements())
+    return V;
+
+  if (NumElements == 1) {
+    V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+                                 Name + ".extract");
+    DEBUG(dbgs() << "     extract: " << *V << "\n");
+    return V;
+  }
+
+  SmallVector<Constant*, 8> Mask;
+  Mask.reserve(NumElements);
+  for (unsigned i = BeginIndex; i != EndIndex; ++i)
+    Mask.push_back(IRB.getInt32(i));
+  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                              ConstantVector::get(Mask),
+                              Name + ".extract");
+  DEBUG(dbgs() << "     shuffle: " << *V << "\n");
+  return V;
+}
+
+static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
+                           unsigned BeginIndex, const Twine &Name) {
+  VectorType *VecTy = cast<VectorType>(Old->getType());
+  assert(VecTy && "Can only insert a vector into a vector");
+
+  VectorType *Ty = dyn_cast<VectorType>(V->getType());
+  if (!Ty) {
+    // Single element to insert.
+    V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
+                                Name + ".insert");
+    DEBUG(dbgs() <<  "     insert: " << *V << "\n");
+    return V;
+  }
+
+  assert(Ty->getNumElements() <= VecTy->getNumElements() &&
+         "Too many elements!");
+  if (Ty->getNumElements() == VecTy->getNumElements()) {
+    assert(V->getType() == VecTy && "Vector type mismatch");
+    return V;
+  }
+  unsigned EndIndex = BeginIndex + Ty->getNumElements();
+
+  // When inserting a smaller vector into the larger to store, we first
+  // use a shuffle vector to widen it with undef elements, and then
+  // a second shuffle vector to select between the loaded vector and the
+  // incoming vector.
+  SmallVector<Constant*, 8> Mask;
+  Mask.reserve(VecTy->getNumElements());
+  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+    if (i >= BeginIndex && i < EndIndex)
+      Mask.push_back(IRB.getInt32(i - BeginIndex));
+    else
+      Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
+  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
+                              ConstantVector::get(Mask),
+                              Name + ".expand");
+  DEBUG(dbgs() << "    shuffle1: " << *V << "\n");
+
+  Mask.clear();
+  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+    if (i >= BeginIndex && i < EndIndex)
+      Mask.push_back(IRB.getInt32(i));
+    else
+      Mask.push_back(IRB.getInt32(i + VecTy->getNumElements()));
+  V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask),
+                              Name + "insert");
+  DEBUG(dbgs() << "    shuffle2: " << *V << "\n");
+  return V;
+}
+
+namespace {
+/// \brief Visitor to rewrite instructions using a partition of an alloca to
+/// use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
+                                                   bool> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>;
+
+  const DataLayout &TD;
+  AllocaPartitioning &P;
+  SROA &Pass;
+  AllocaInst &OldAI, &NewAI;
+  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+  Type *NewAllocaTy;
+
+  // If we are rewriting an alloca partition which can be written as pure
+  // vector operations, we stash extra information here. When VecTy is
+  // non-null, we have some strict guarantees about the rewritten alloca:
+  //   - The new alloca is exactly the size of the vector type here.
+  //   - The accesses all either map to the entire vector or to a single
+  //     element.
+  //   - The set of accessing instructions is only one of those handled above
+  //     in isVectorPromotionViable. Generally these are the same access kinds
+  //     which are promotable via mem2reg.
+  VectorType *VecTy;
+  Type *ElementTy;
+  uint64_t ElementSize;
+
+  // This is a convenience and flag variable that will be null unless the new
+  // alloca's integer operations should be widened to this integer type due to
+  // passing isIntegerWideningViable above. If it is non-null, the desired
+  // integer type will be stored here for easy access during rewriting.
+  IntegerType *IntTy;
+
+  // The offset of the partition user currently being rewritten.
+  uint64_t BeginOffset, EndOffset;
+  bool IsSplit;
+  Use *OldUse;
+  Instruction *OldPtr;
+
+  // Utility IR builder, whose name prefix is setup for each visited use, and
+  // the insertion point is set to point to the user.
+  IRBuilderTy IRB;
+
+public:
+  AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P,
+                          AllocaPartitioning::iterator PI,
+                          SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI,
+                          uint64_t NewBeginOffset, uint64_t NewEndOffset)
+    : TD(TD), P(P), Pass(Pass),
+      OldAI(OldAI), NewAI(NewAI),
+      NewAllocaBeginOffset(NewBeginOffset),
+      NewAllocaEndOffset(NewEndOffset),
+      NewAllocaTy(NewAI.getAllocatedType()),
+      VecTy(), ElementTy(), ElementSize(), IntTy(),
+      BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr(),
+      IRB(NewAI.getContext(), ConstantFolder()) {
+  }
+
+  /// \brief Visit the users of the alloca partition and rewrite them.
+  bool visitUsers(AllocaPartitioning::const_use_iterator I,
+                  AllocaPartitioning::const_use_iterator E) {
+    if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P,
+                                NewAllocaBeginOffset, NewAllocaEndOffset,
+                                I, E)) {
+      ++NumVectorized;
+      VecTy = cast<VectorType>(NewAI.getAllocatedType());
+      ElementTy = VecTy->getElementType();
+      assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 &&
+             "Only multiple-of-8 sized vector elements are viable");
+      ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8;
+    } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(),
+                                       NewAllocaBeginOffset, P, I, E)) {
+      IntTy = Type::getIntNTy(NewAI.getContext(),
+                              TD.getTypeSizeInBits(NewAI.getAllocatedType()));
+    }
+    bool CanSROA = true;
+    for (; I != E; ++I) {
+      if (!I->getUse())
+        continue; // Skip dead uses.
+      BeginOffset = I->BeginOffset;
+      EndOffset = I->EndOffset;
+      IsSplit = I->isSplit();
+      OldUse = I->getUse();
+      OldPtr = cast<Instruction>(OldUse->get());
+
+      Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
+      IRB.SetInsertPoint(OldUserI);
+      IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
+      IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
+                        ".");
+
+      CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
+    }
+    if (VecTy) {
+      assert(CanSROA);
+      VecTy = 0;
+      ElementTy = 0;
+      ElementSize = 0;
+    }
+    if (IntTy) {
+      assert(CanSROA);
+      IntTy = 0;
+    }
+    return CanSROA;
+  }
+
+private:
+  // Every instruction which can end up as a user must have a rewrite rule.
+  bool visitInstruction(Instruction &I) {
+    DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
+    llvm_unreachable("No rewrite rule for this instruction!");
+  }
+
+  Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) {
+    assert(BeginOffset >= NewAllocaBeginOffset);
+    APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
+    return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy);
+  }
+
+  /// \brief Compute suitable alignment to access an offset into the new alloca.
+  unsigned getOffsetAlign(uint64_t Offset) {
+    unsigned NewAIAlign = NewAI.getAlignment();
+    if (!NewAIAlign)
+      NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType());
+    return MinAlign(NewAIAlign, Offset);
+  }
+
+  /// \brief Compute suitable alignment to access this partition of the new
+  /// alloca.
+  unsigned getPartitionAlign() {
+    return getOffsetAlign(BeginOffset - NewAllocaBeginOffset);
+  }
+
+  /// \brief Compute suitable alignment to access a type at an offset of the
+  /// new alloca.
+  ///
+  /// \returns zero if the type's ABI alignment is a suitable alignment,
+  /// otherwise returns the maximal suitable alignment.
+  unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) {
+    unsigned Align = getOffsetAlign(Offset);
+    return Align == TD.getABITypeAlignment(Ty) ? 0 : Align;
+  }
+
+  /// \brief Compute suitable alignment to access a type at the beginning of
+  /// this partition of the new alloca.
+  ///
+  /// See \c getOffsetTypeAlign for details; this routine delegates to it.
+  unsigned getPartitionTypeAlign(Type *Ty) {
+    return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset);
+  }
+
+  unsigned getIndex(uint64_t Offset) {
+    assert(VecTy && "Can only call getIndex when rewriting a vector");
+    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+    assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
+    uint32_t Index = RelOffset / ElementSize;
+    assert(Index * ElementSize == RelOffset);
+    return Index;
+  }
+
+  void deleteIfTriviallyDead(Value *V) {
+    Instruction *I = cast<Instruction>(V);
+    if (isInstructionTriviallyDead(I))
+      Pass.DeadInsts.insert(I);
+  }
+
+  Value *rewriteVectorizedLoadInst() {
+    unsigned BeginIndex = getIndex(BeginOffset);
+    unsigned EndIndex = getIndex(EndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                     "load");
+    return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
+  }
+
+  Value *rewriteIntegerLoad(LoadInst &LI) {
+    assert(IntTy && "We cannot insert an integer to the alloca");
+    assert(!LI.isVolatile());
+    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                     "load");
+    V = convertValue(TD, IRB, V, IntTy);
+    assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+    uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+    if (Offset > 0 || EndOffset < NewAllocaEndOffset)
+      V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset,
+                         "extract");
+    return V;
+  }
+
+  bool visitLoadInst(LoadInst &LI) {
+    DEBUG(dbgs() << "    original: " << LI << "\n");
+    Value *OldOp = LI.getOperand(0);
+    assert(OldOp == OldPtr);
+
+    uint64_t Size = EndOffset - BeginOffset;
+
+    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8)
+                             : LI.getType();
+    bool IsPtrAdjusted = false;
+    Value *V;
+    if (VecTy) {
+      V = rewriteVectorizedLoadInst();
+    } else if (IntTy && LI.getType()->isIntegerTy()) {
+      V = rewriteIntegerLoad(LI);
+    } else if (BeginOffset == NewAllocaBeginOffset &&
+               canConvertValue(TD, NewAllocaTy, LI.getType())) {
+      V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                LI.isVolatile(), "load");
+    } else {
+      Type *LTy = TargetTy->getPointerTo();
+      V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy),
+                                getPartitionTypeAlign(TargetTy),
+                                LI.isVolatile(), "load");
+      IsPtrAdjusted = true;
+    }
+    V = convertValue(TD, IRB, V, TargetTy);
+
+    if (IsSplit) {
+      assert(!LI.isVolatile());
+      assert(LI.getType()->isIntegerTy() &&
+             "Only integer type loads and stores are split");
+      assert(Size < TD.getTypeStoreSize(LI.getType()) &&
+             "Split load isn't smaller than original load");
+      assert(LI.getType()->getIntegerBitWidth() ==
+             TD.getTypeStoreSizeInBits(LI.getType()) &&
+             "Non-byte-multiple bit width");
+      // Move the insertion point just past the load so that we can refer to it.
+      IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI)));
+      // Create a placeholder value with the same type as LI to use as the
+      // basis for the new value. This allows us to replace the uses of LI with
+      // the computed value, and then replace the placeholder with LI, leaving
+      // LI only used for this computation.
+      Value *Placeholder
+        = new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+      V = insertInteger(TD, IRB, Placeholder, V, BeginOffset,
+                        "insert");
+      LI.replaceAllUsesWith(V);
+      Placeholder->replaceAllUsesWith(&LI);
+      delete Placeholder;
+    } else {
+      LI.replaceAllUsesWith(V);
+    }
+
+    Pass.DeadInsts.insert(&LI);
+    deleteIfTriviallyDead(OldOp);
+    DEBUG(dbgs() << "          to: " << *V << "\n");
+    return !LI.isVolatile() && !IsPtrAdjusted;
+  }
+
+  bool rewriteVectorizedStoreInst(Value *V,
+                                  StoreInst &SI, Value *OldOp) {
+    unsigned BeginIndex = getIndex(BeginOffset);
+    unsigned EndIndex = getIndex(EndOffset);
+    assert(EndIndex > BeginIndex && "Empty vector!");
+    unsigned NumElements = EndIndex - BeginIndex;
+    assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+    Type *PartitionTy
+      = (NumElements == 1) ? ElementTy
+                           : VectorType::get(ElementTy, NumElements);
+    if (V->getType() != PartitionTy)
+      V = convertValue(TD, IRB, V, PartitionTy);
+
+    // Mix in the existing elements.
+    Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                       "load");
+    V = insertVector(IRB, Old, V, BeginIndex, "vec");
+
+    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+    Pass.DeadInsts.insert(&SI);
+
+    (void)Store;
+    DEBUG(dbgs() << "          to: " << *Store << "\n");
+    return true;
+  }
+
+  bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+    assert(IntTy && "We cannot extract an integer from the alloca");
+    assert(!SI.isVolatile());
+    if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         "oldload");
+      Old = convertValue(TD, IRB, Old, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset,
+                        "insert");
+    }
+    V = convertValue(TD, IRB, V, NewAllocaTy);
+    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+    Pass.DeadInsts.insert(&SI);
+    (void)Store;
+    DEBUG(dbgs() << "          to: " << *Store << "\n");
+    return true;
+  }
+
+  bool visitStoreInst(StoreInst &SI) {
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+    Value *OldOp = SI.getOperand(1);
+    assert(OldOp == OldPtr);
+
+    Value *V = SI.getValueOperand();
+
+    // Strip all inbounds GEPs and pointer casts to try to dig out any root
+    // alloca that should be re-examined after promoting this alloca.
+    if (V->getType()->isPointerTy())
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
+        Pass.PostPromotionWorklist.insert(AI);
+
+    uint64_t Size = EndOffset - BeginOffset;
+    if (Size < TD.getTypeStoreSize(V->getType())) {
+      assert(!SI.isVolatile());
+      assert(IsSplit && "A seemingly split store isn't splittable");
+      assert(V->getType()->isIntegerTy() &&
+             "Only integer type loads and stores are split");
+      assert(V->getType()->getIntegerBitWidth() ==
+             TD.getTypeStoreSizeInBits(V->getType()) &&
+             "Non-byte-multiple bit width");
+      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
+      V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset,
+                         "extract");
+    }
+
+    if (VecTy)
+      return rewriteVectorizedStoreInst(V, SI, OldOp);
+    if (IntTy && V->getType()->isIntegerTy())
+      return rewriteIntegerStore(V, SI);
+
+    StoreInst *NewSI;
+    if (BeginOffset == NewAllocaBeginOffset &&
+        canConvertValue(TD, V->getType(), NewAllocaTy)) {
+      V = convertValue(TD, IRB, V, NewAllocaTy);
+      NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+                                     SI.isVolatile());
+    } else {
+      Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo());
+      NewSI = IRB.CreateAlignedStore(V, NewPtr,
+                                     getPartitionTypeAlign(V->getType()),
+                                     SI.isVolatile());
+    }
+    (void)NewSI;
+    Pass.DeadInsts.insert(&SI);
+    deleteIfTriviallyDead(OldOp);
+
+    DEBUG(dbgs() << "          to: " << *NewSI << "\n");
+    return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+  }
+
+  /// \brief Compute an integer value from splatting an i8 across the given
+  /// number of bytes.
+  ///
+  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
+  /// call this routine.
+  /// FIXME: Heed the advice above.
+  ///
+  /// \param V The i8 value to splat.
+  /// \param Size The number of bytes in the output (assuming i8 is one byte)
+  Value *getIntegerSplat(Value *V, unsigned Size) {
+    assert(Size > 0 && "Expected a positive number of bytes.");
+    IntegerType *VTy = cast<IntegerType>(V->getType());
+    assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
+    if (Size == 1)
+      return V;
+
+    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
+    V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"),
+                      ConstantExpr::getUDiv(
+                        Constant::getAllOnesValue(SplatIntTy),
+                        ConstantExpr::getZExt(
+                          Constant::getAllOnesValue(V->getType()),
+                          SplatIntTy)),
+                      "isplat");
+    return V;
+  }
+
+  /// \brief Compute a vector splat for a given element value.
+  Value *getVectorSplat(Value *V, unsigned NumElements) {
+    V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
+    DEBUG(dbgs() << "       splat: " << *V << "\n");
+    return V;
+  }
+
+  bool visitMemSetInst(MemSetInst &II) {
+    DEBUG(dbgs() << "    original: " << II << "\n");
+    assert(II.getRawDest() == OldPtr);
+
+    // If the memset has a variable size, it cannot be split, just adjust the
+    // pointer to the new alloca.
+    if (!isa<Constant>(II.getLength())) {
+      II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
+      Type *CstTy = II.getAlignmentCst()->getType();
+      II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign()));
+
+      deleteIfTriviallyDead(OldPtr);
+      return false;
+    }
+
+    // Record this instruction for deletion.
+    Pass.DeadInsts.insert(&II);
+
+    Type *AllocaTy = NewAI.getAllocatedType();
+    Type *ScalarTy = AllocaTy->getScalarType();
+
+    // If this doesn't map cleanly onto the alloca type, and that type isn't
+    // a single value type, just emit a memset.
+    if (!VecTy && !IntTy &&
+        (BeginOffset != NewAllocaBeginOffset ||
+         EndOffset != NewAllocaEndOffset ||
+         !AllocaTy->isSingleValueType() ||
+         !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) ||
+         TD.getTypeSizeInBits(ScalarTy)%8 != 0)) {
+      Type *SizeTy = II.getLength()->getType();
+      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
+      CallInst *New
+        = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB,
+                                                II.getRawDest()->getType()),
+                           II.getValue(), Size, getPartitionAlign(),
+                           II.isVolatile());
+      (void)New;
+      DEBUG(dbgs() << "          to: " << *New << "\n");
+      return false;
+    }
+
+    // If we can represent this as a simple value, we have to build the actual
+    // value to store, which requires expanding the byte present in memset to
+    // a sensible representation for the alloca type. This is essentially
+    // splatting the byte to a sufficiently wide integer, splatting it across
+    // any desired vector width, and bitcasting to the final type.
+    Value *V;
+
+    if (VecTy) {
+      // If this is a memset of a vectorized alloca, insert it.
+      assert(ElementTy == ScalarTy);
+
+      unsigned BeginIndex = getIndex(BeginOffset);
+      unsigned EndIndex = getIndex(EndOffset);
+      assert(EndIndex > BeginIndex && "Empty vector!");
+      unsigned NumElements = EndIndex - BeginIndex;
+      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+      Value *Splat =
+          getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ElementTy) / 8);
+      Splat = convertValue(TD, IRB, Splat, ElementTy);
+      if (NumElements > 1)
+        Splat = getVectorSplat(Splat, NumElements);
+
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         "oldload");
+      V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
+    } else if (IntTy) {
+      // If this is a memset on an alloca where we can widen stores, insert the
+      // set integer.
+      assert(!II.isVolatile());
+
+      uint64_t Size = EndOffset - BeginOffset;
+      V = getIntegerSplat(II.getValue(), Size);
+
+      if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+                    EndOffset != NewAllocaBeginOffset)) {
+        Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                           "oldload");
+        Old = convertValue(TD, IRB, Old, IntTy);
+        assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+        uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+        V = insertInteger(TD, IRB, Old, V, Offset, "insert");
+      } else {
+        assert(V->getType() == IntTy &&
+               "Wrong type for an alloca wide integer!");
+      }
+      V = convertValue(TD, IRB, V, AllocaTy);
+    } else {
+      // Established these invariants above.
+      assert(BeginOffset == NewAllocaBeginOffset);
+      assert(EndOffset == NewAllocaEndOffset);
+
+      V = getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ScalarTy) / 8);
+      if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+        V = getVectorSplat(V, AllocaVecTy->getNumElements());
+
+      V = convertValue(TD, IRB, V, AllocaTy);
+    }
+
+    Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+                                        II.isVolatile());
+    (void)New;
+    DEBUG(dbgs() << "          to: " << *New << "\n");
+    return !II.isVolatile();
+  }
+
+  bool visitMemTransferInst(MemTransferInst &II) {
+    // Rewriting of memory transfer instructions can be a bit tricky. We break
+    // them into two categories: split intrinsics and unsplit intrinsics.
+
+    DEBUG(dbgs() << "    original: " << II << "\n");
+
+    assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
+    bool IsDest = II.getRawDest() == OldPtr;
+
+    const AllocaPartitioning::MemTransferOffsets &MTO
+      = P.getMemTransferOffsets(II);
+
+    // Compute the relative offset within the transfer.
+    unsigned IntPtrWidth = TD.getPointerSizeInBits();
+    APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
+                                                       : MTO.SourceBegin));
+
+    unsigned Align = II.getAlignment();
+    if (Align > 1)
+      Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
+                       MinAlign(II.getAlignment(), getPartitionAlign()));
+
+    // For unsplit intrinsics, we simply modify the source and destination
+    // pointers in place. This isn't just an optimization, it is a matter of
+    // correctness. With unsplit intrinsics we may be dealing with transfers
+    // within a single alloca before SROA ran, or with transfers that have
+    // a variable length. We may also be dealing with memmove instead of
+    // memcpy, and so simply updating the pointers is the necessary for us to
+    // update both source and dest of a single call.
+    if (!MTO.IsSplittable) {
+      Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
+      if (IsDest)
+        II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
+      else
+        II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));
+
+      Type *CstTy = II.getAlignmentCst()->getType();
+      II.setAlignment(ConstantInt::get(CstTy, Align));
+
+      DEBUG(dbgs() << "          to: " << II << "\n");
+      deleteIfTriviallyDead(OldOp);
+      return false;
+    }
+    // For split transfer intrinsics we have an incredibly useful assurance:
+    // the source and destination do not reside within the same alloca, and at
+    // least one of them does not escape. This means that we can replace
+    // memmove with memcpy, and we don't need to worry about all manner of
+    // downsides to splitting and transforming the operations.
+
+    // If this doesn't map cleanly onto the alloca type, and that type isn't
+    // a single value type, just emit a memcpy.
+    bool EmitMemCpy
+      = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset ||
+                             EndOffset != NewAllocaEndOffset ||
+                             !NewAI.getAllocatedType()->isSingleValueType());
+
+    // If we're just going to emit a memcpy, the alloca hasn't changed, and the
+    // size hasn't been shrunk based on analysis of the viable range, this is
+    // a no-op.
+    if (EmitMemCpy && &OldAI == &NewAI) {
+      uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin;
+      uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd;
+      // Ensure the start lines up.
+      assert(BeginOffset == OrigBegin);
+      (void)OrigBegin;
+
+      // Rewrite the size as needed.
+      if (EndOffset != OrigEnd)
+        II.setLength(ConstantInt::get(II.getLength()->getType(),
+                                      EndOffset - BeginOffset));
+      return false;
+    }
+    // Record this instruction for deletion.
+    Pass.DeadInsts.insert(&II);
+
+    // Strip all inbounds GEPs and pointer casts to try to dig out any root
+    // alloca that should be re-examined after rewriting this instruction.
+    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
+    if (AllocaInst *AI
+          = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
+      Pass.Worklist.insert(AI);
+
+    if (EmitMemCpy) {
+      Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
+                                : II.getRawDest()->getType();
+
+      // Compute the other pointer, folding as much as possible to produce
+      // a single, simple GEP in most cases.
+      OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
+
+      Value *OurPtr
+        = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
+                                           : II.getRawSource()->getType());
+      Type *SizeTy = II.getLength()->getType();
+      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
+
+      CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
+                                       IsDest ? OtherPtr : OurPtr,
+                                       Size, Align, II.isVolatile());
+      (void)New;
+      DEBUG(dbgs() << "          to: " << *New << "\n");
+      return false;
+    }
+
+    // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy
+    // is equivalent to 1, but that isn't true if we end up rewriting this as
+    // a load or store.
+    if (!Align)
+      Align = 1;
+
+    bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset &&
+                         EndOffset == NewAllocaEndOffset;
+    uint64_t Size = EndOffset - BeginOffset;
+    unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0;
+    unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0;
+    unsigned NumElements = EndIndex - BeginIndex;
+    IntegerType *SubIntTy
+      = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0;
+
+    Type *OtherPtrTy = NewAI.getType();
+    if (VecTy && !IsWholeAlloca) {
+      if (NumElements == 1)
+        OtherPtrTy = VecTy->getElementType();
+      else
+        OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);
+
+      OtherPtrTy = OtherPtrTy->getPointerTo();
+    } else if (IntTy && !IsWholeAlloca) {
+      OtherPtrTy = SubIntTy->getPointerTo();
+    }
+
+    Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
+    Value *DstPtr = &NewAI;
+    if (!IsDest)
+      std::swap(SrcPtr, DstPtr);
+
+    Value *Src;
+    if (VecTy && !IsWholeAlloca && !IsDest) {
+      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                  "load");
+      Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
+    } else if (IntTy && !IsWholeAlloca && !IsDest) {
+      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                  "load");
+      Src = convertValue(TD, IRB, Src, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, "extract");
+    } else {
+      Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
+                                  "copyload");
+    }
+
+    if (VecTy && !IsWholeAlloca && IsDest) {
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         "oldload");
+      Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
+    } else if (IntTy && !IsWholeAlloca && IsDest) {
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         "oldload");
+      Old = convertValue(TD, IRB, Old, IntTy);
+      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+      Src = insertInteger(TD, IRB, Old, Src, Offset, "insert");
+      Src = convertValue(TD, IRB, Src, NewAllocaTy);
+    }
+
+    StoreInst *Store = cast<StoreInst>(
+      IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile()));
+    (void)Store;
+    DEBUG(dbgs() << "          to: " << *Store << "\n");
+    return !II.isVolatile();
+  }
+
+  bool visitIntrinsicInst(IntrinsicInst &II) {
+    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+           II.getIntrinsicID() == Intrinsic::lifetime_end);
+    DEBUG(dbgs() << "    original: " << II << "\n");
+    assert(II.getArgOperand(1) == OldPtr);
+
+    // Record this instruction for deletion.
+    Pass.DeadInsts.insert(&II);
+
+    ConstantInt *Size
+      = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+                         EndOffset - BeginOffset);
+    Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
+    Value *New;
+    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
+      New = IRB.CreateLifetimeStart(Ptr, Size);
+    else
+      New = IRB.CreateLifetimeEnd(Ptr, Size);
+
+    (void)New;
+    DEBUG(dbgs() << "          to: " << *New << "\n");
+    return true;
+  }
+
+  bool visitPHINode(PHINode &PN) {
+    DEBUG(dbgs() << "    original: " << PN << "\n");
+
+    // We would like to compute a new pointer in only one place, but have it be
+    // as local as possible to the PHI. To do that, we re-use the location of
+    // the old pointer, which necessarily must be in the right position to
+    // dominate the PHI.
+    IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr));
+    PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
+                             ".");
+
+    Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
+    // Replace the operands which were using the old pointer.
+    std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
+
+    DEBUG(dbgs() << "          to: " << PN << "\n");
+    deleteIfTriviallyDead(OldPtr);
+    return false;
+  }
+
+  bool visitSelectInst(SelectInst &SI) {
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+
+    // Find the operand we need to rewrite here.
+    bool IsTrueVal = SI.getTrueValue() == OldPtr;
+    if (IsTrueVal)
+      assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!");
+    else
+      assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!");
+
+    Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType());
+    SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);
+    DEBUG(dbgs() << "          to: " << SI << "\n");
+    deleteIfTriviallyDead(OldPtr);
+    return false;
+  }
+
+};
+}
+
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+  // Befriend the base class so it can delegate to private visit methods.
+  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+  const DataLayout &TD;
+
+  /// Queue of pointer uses to analyze and potentially rewrite.
+  SmallVector<Use *, 8> Queue;
+
+  /// Set to prevent us from cycling with phi nodes and loops.
+  SmallPtrSet<User *, 8> Visited;
+
+  /// The current pointer use being rewritten. This is used to dig up the used
+  /// value (as opposed to the user).
+  Use *U;
+
+public:
+  AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {}
+
+  /// Rewrite loads and stores through a pointer and all pointers derived from
+  /// it.
+  bool rewrite(Instruction &I) {
+    DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
+    enqueueUsers(I);
+    bool Changed = false;
+    while (!Queue.empty()) {
+      U = Queue.pop_back_val();
+      Changed |= visit(cast<Instruction>(U->getUser()));
+    }
+    return Changed;
+  }
+
+private:
+  /// Enqueue all the users of the given instruction for further processing.
+  /// This uses a set to de-duplicate users.
+  void enqueueUsers(Instruction &I) {
+    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+         ++UI)
+      if (Visited.insert(*UI))
+        Queue.push_back(&UI.getUse());
+  }
+
+  // Conservative default is to not rewrite anything.
+  bool visitInstruction(Instruction &I) { return false; }
+
+  /// \brief Generic recursive split emission class.
+  template <typename Derived>
+  class OpSplitter {
+  protected:
+    /// The builder used to form new instructions.
+    IRBuilderTy IRB;
+    /// The indices which to be used with insert- or extractvalue to select the
+    /// appropriate value within the aggregate.
+    SmallVector<unsigned, 4> Indices;
+    /// The indices to a GEP instruction which will move Ptr to the correct slot
+    /// within the aggregate.
+    SmallVector<Value *, 4> GEPIndices;
+    /// The base pointer of the original op, used as a base for GEPing the
+    /// split operations.
+    Value *Ptr;
+
+    /// Initialize the splitter with an insertion point, Ptr and start with a
+    /// single zero GEP index.
+    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
+      : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+
+  public:
+    /// \brief Generic recursive split emission routine.
+    ///
+    /// This method recursively splits an aggregate op (load or store) into
+    /// scalar or vector ops. It splits recursively until it hits a single value
+    /// and emits that single value operation via the template argument.
+    ///
+    /// The logic of this routine relies on GEPs and insertvalue and
+    /// extractvalue all operating with the same fundamental index list, merely
+    /// formatted differently (GEPs need actual values).
+    ///
+    /// \param Ty  The type being split recursively into smaller ops.
+    /// \param Agg The aggregate value being built up or stored, depending on
+    /// whether this is splitting a load or a store respectively.
+    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+      if (Ty->isSingleValueType())
+        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
+
+      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+        unsigned OldSize = Indices.size();
+        (void)OldSize;
+        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+             ++Idx) {
+          assert(Indices.size() == OldSize && "Did not return to the old size");
+          Indices.push_back(Idx);
+          GEPIndices.push_back(IRB.getInt32(Idx));
+          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+          GEPIndices.pop_back();
+          Indices.pop_back();
+        }
+        return;
+      }
+
+      if (StructType *STy = dyn_cast<StructType>(Ty)) {
+        unsigned OldSize = Indices.size();
+        (void)OldSize;
+        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+             ++Idx) {
+          assert(Indices.size() == OldSize && "Did not return to the old size");
+          Indices.push_back(Idx);
+          GEPIndices.push_back(IRB.getInt32(Idx));
+          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
+          GEPIndices.pop_back();
+          Indices.pop_back();
+        }
+        return;
+      }
+
+      llvm_unreachable("Only arrays and structs are aggregate loadable types");
+    }
+  };
+
+  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+      : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+
+    /// Emit a leaf load of a single value. This is called at the leaves of the
+    /// recursive emission to actually load values.
+    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+      assert(Ty->isSingleValueType());
+      // Load the single value and insert it using the indices.
+      Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep");
+      Value *Load = IRB.CreateLoad(GEP, Name + ".load");
+      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+      DEBUG(dbgs() << "          to: " << *Load << "\n");
+    }
+  };
+
+  bool visitLoadInst(LoadInst &LI) {
+    assert(LI.getPointerOperand() == *U);
+    if (!LI.isSimple() || LI.getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being loaded, split it apart.
+    DEBUG(dbgs() << "    original: " << LI << "\n");
+    LoadOpSplitter Splitter(&LI, *U);
+    Value *V = UndefValue::get(LI.getType());
+    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+    LI.replaceAllUsesWith(V);
+    LI.eraseFromParent();
+    return true;
+  }
+
+  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+      : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+
+    /// Emit a leaf store of a single value. This is called at the leaves of the
+    /// recursive emission to actually produce stores.
+    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+      assert(Ty->isSingleValueType());
+      // Extract the single value and store it using the indices.
+      Value *Store = IRB.CreateStore(
+        IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+        IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+      (void)Store;
+      DEBUG(dbgs() << "          to: " << *Store << "\n");
+    }
+  };
+
+  bool visitStoreInst(StoreInst &SI) {
+    if (!SI.isSimple() || SI.getPointerOperand() != *U)
+      return false;
+    Value *V = SI.getValueOperand();
+    if (V->getType()->isSingleValueType())
+      return false;
+
+    // We have an aggregate being stored, split it apart.
+    DEBUG(dbgs() << "    original: " << SI << "\n");
+    StoreOpSplitter Splitter(&SI, *U);
+    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+    SI.eraseFromParent();
+    return true;
+  }
+
+  bool visitBitCastInst(BitCastInst &BC) {
+    enqueueUsers(BC);
+    return false;
+  }
+
+  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    enqueueUsers(GEPI);
+    return false;
+  }
+
+  bool visitPHINode(PHINode &PN) {
+    enqueueUsers(PN);
+    return false;
+  }
+
+  bool visitSelectInst(SelectInst &SI) {
+    enqueueUsers(SI);
+    return false;
+  }
+};
+}
+
+/// \brief Strip aggregate type wrapping.
+///
+/// This removes no-op aggregate types wrapping an underlying type. It will
+/// strip as many layers of types as it can without changing either the type
+/// size or the allocated size.
+static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
+  if (Ty->isSingleValueType())
+    return Ty;
+
+  uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+  uint64_t TypeSize = DL.getTypeSizeInBits(Ty);
+
+  Type *InnerTy;
+  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+    InnerTy = ArrTy->getElementType();
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    const StructLayout *SL = DL.getStructLayout(STy);
+    unsigned Index = SL->getElementContainingOffset(0);
+    InnerTy = STy->getElementType(Index);
+  } else {
+    return Ty;
+  }
+
+  if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
+      TypeSize > DL.getTypeSizeInBits(InnerTy))
+    return Ty;
+
+  return stripAggregateTypeWrapping(DL, InnerTy);
+}
+
+/// \brief Try to find a partition of the aggregate type passed in for a given
+/// offset and size.
+///
+/// This recurses through the aggregate type and tries to compute a subtype
+/// based on the offset and size. When the offset and size span a sub-section
+/// of an array, it will even compute a new array type for that sub-section,
+/// and the same for structs.
+///
+/// Note that this routine is very strict and tries to find a partition of the
+/// type which produces the *exact* right offset and size. It is not forgiving
+/// when the size or offset cause either end of type-based partition to be off.
+/// Also, this is a best-effort routine. It is reasonable to give up and not
+/// return a type if necessary.
+static Type *getTypePartition(const DataLayout &TD, Type *Ty,
+                              uint64_t Offset, uint64_t Size) {
+  if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
+    return stripAggregateTypeWrapping(TD, Ty);
+  if (Offset > TD.getTypeAllocSize(Ty) ||
+      (TD.getTypeAllocSize(Ty) - Offset) < Size)
+    return 0;
+
+  if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
+    // We can't partition pointers...
+    if (SeqTy->isPointerTy())
+      return 0;
+
+    Type *ElementTy = SeqTy->getElementType();
+    uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
+    uint64_t NumSkippedElements = Offset / ElementSize;
+    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
+      if (NumSkippedElements >= ArrTy->getNumElements())
+        return 0;
+    } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
+      if (NumSkippedElements >= VecTy->getNumElements())
+        return 0;
+    }
+    Offset -= NumSkippedElements * ElementSize;
+
+    // First check if we need to recurse.
+    if (Offset > 0 || Size < ElementSize) {
+      // Bail if the partition ends in a different array element.
+      if ((Offset + Size) > ElementSize)
+        return 0;
+      // Recurse through the element type trying to peel off offset bytes.
+      return getTypePartition(TD, ElementTy, Offset, Size);
+    }
+    assert(Offset == 0);
+
+    if (Size == ElementSize)
+      return stripAggregateTypeWrapping(TD, ElementTy);
+    assert(Size > ElementSize);
+    uint64_t NumElements = Size / ElementSize;
+    if (NumElements * ElementSize != Size)
+      return 0;
+    return ArrayType::get(ElementTy, NumElements);
+  }
+
+  StructType *STy = dyn_cast<StructType>(Ty);
+  if (!STy)
+    return 0;
+
+  const StructLayout *SL = TD.getStructLayout(STy);
+  if (Offset >= SL->getSizeInBytes())
+    return 0;
+  uint64_t EndOffset = Offset + Size;
+  if (EndOffset > SL->getSizeInBytes())
+    return 0;
+
+  unsigned Index = SL->getElementContainingOffset(Offset);
+  Offset -= SL->getElementOffset(Index);
+
+  Type *ElementTy = STy->getElementType(Index);
+  uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
+  if (Offset >= ElementSize)
+    return 0; // The offset points into alignment padding.
+
+  // See if any partition must be contained by the element.
+  if (Offset > 0 || Size < ElementSize) {
+    if ((Offset + Size) > ElementSize)
+      return 0;
+    return getTypePartition(TD, ElementTy, Offset, Size);
+  }
+  assert(Offset == 0);
+
+  if (Size == ElementSize)
+    return stripAggregateTypeWrapping(TD, ElementTy);
+
+  StructType::element_iterator EI = STy->element_begin() + Index,
+                               EE = STy->element_end();
+  if (EndOffset < SL->getSizeInBytes()) {
+    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
+    if (Index == EndIndex)
+      return 0; // Within a single element and its padding.
+
+    // Don't try to form "natural" types if the elements don't line up with the
+    // expected size.
+    // FIXME: We could potentially recurse down through the last element in the
+    // sub-struct to find a natural end point.
+    if (SL->getElementOffset(EndIndex) != EndOffset)
+      return 0;
+
+    assert(Index < EndIndex);
+    EE = STy->element_begin() + EndIndex;
+  }
+
+  // Try to build up a sub-structure.
+  StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
+                                      STy->isPacked());
+  const StructLayout *SubSL = TD.getStructLayout(SubTy);
+  if (Size != SubSL->getSizeInBytes())
+    return 0; // The sub-struct doesn't have quite the size needed.
+
+  return SubTy;
+}
+
+/// \brief Rewrite an alloca partition's users.
+///
+/// This routine drives both of the rewriting goals of the SROA pass. It tries
+/// to rewrite uses of an alloca partition to be conducive for SSA value
+/// promotion. If the partition needs a new, more refined alloca, this will
+/// build that new alloca, preserving as much type information as possible, and
+/// rewrite the uses of the old alloca to point at the new one and have the
+/// appropriate new offsets. It also evaluates how successful the rewrite was
+/// at enabling promotion and if it was successful queues the alloca to be
+/// promoted.
+bool SROA::rewriteAllocaPartition(AllocaInst &AI,
+                                  AllocaPartitioning &P,
+                                  AllocaPartitioning::iterator PI) {
+  uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
+  bool IsLive = false;
+  for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
+                                        UE = P.use_end(PI);
+       UI != UE && !IsLive; ++UI)
+    if (UI->getUse())
+      IsLive = true;
+  if (!IsLive)
+    return false; // No live uses left of this partition.
+
+  DEBUG(dbgs() << "Speculating PHIs and selects in partition "
+               << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");
+
+  PHIOrSelectSpeculator Speculator(*TD, P, *this);
+  DEBUG(dbgs() << "  speculating ");
+  DEBUG(P.print(dbgs(), PI, ""));
+  Speculator.visitUsers(PI);
+
+  // Try to compute a friendly type for this partition of the alloca. This
+  // won't always succeed, in which case we fall back to a legal integer type
+  // or an i8 array of an appropriate size.
+  Type *AllocaTy = 0;
+  if (Type *PartitionTy = P.getCommonType(PI))
+    if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
+      AllocaTy = PartitionTy;
+  if (!AllocaTy)
+    if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
+                                             PI->BeginOffset, AllocaSize))
+      AllocaTy = PartitionTy;
+  if ((!AllocaTy ||
+       (AllocaTy->isArrayTy() &&
+        AllocaTy->getArrayElementType()->isIntegerTy())) &&
+      TD->isLegalInteger(AllocaSize * 8))
+    AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
+  if (!AllocaTy)
+    AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
+  assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);
+
+  // Check for the case where we're going to rewrite to a new alloca of the
+  // exact same type as the original, and with the same access offsets. In that
+  // case, re-use the existing alloca, but still run through the rewriter to
+  // perform phi and select speculation.
+  AllocaInst *NewAI;
+  if (AllocaTy == AI.getAllocatedType()) {
+    assert(PI->BeginOffset == 0 &&
+           "Non-zero begin offset but same alloca type");
+    assert(PI == P.begin() && "Begin offset is zero on later partition");
+    NewAI = &AI;
+  } else {
+    unsigned Alignment = AI.getAlignment();
+    if (!Alignment) {
+      // The minimum alignment which users can rely on when the explicit
+      // alignment is omitted or zero is that required by the ABI for this
+      // type.
+      Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
+    }
+    Alignment = MinAlign(Alignment, PI->BeginOffset);
+    // If we will get at least this much alignment from the type alone, leave
+    // the alloca's alignment unconstrained.
+    if (Alignment <= TD->getABITypeAlignment(AllocaTy))
+      Alignment = 0;
+    NewAI = new AllocaInst(AllocaTy, 0, Alignment,
+                           AI.getName() + ".sroa." + Twine(PI - P.begin()),
+                           &AI);
+    ++NumNewAllocas;
+  }
+
+  DEBUG(dbgs() << "Rewriting alloca partition "
+               << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
+               << *NewAI << "\n");
+
+  // Track the high watermark of the post-promotion worklist. We will reset it
+  // to this point if the alloca is not in fact scheduled for promotion.
+  unsigned PPWOldSize = PostPromotionWorklist.size();
+
+  AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
+                                   PI->BeginOffset, PI->EndOffset);
+  DEBUG(dbgs() << "  rewriting ");
+  DEBUG(P.print(dbgs(), PI, ""));
+  bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
+  if (Promotable) {
+    DEBUG(dbgs() << "  and queuing for promotion\n");
+    PromotableAllocas.push_back(NewAI);
+  } else if (NewAI != &AI) {
+    // If we can't promote the alloca, iterate on it to check for new
+    // refinements exposed by splitting the current alloca. Don't iterate on an
+    // alloca which didn't actually change and didn't get promoted.
+    Worklist.insert(NewAI);
+  }
+
+  // Drop any post-promotion work items if promotion didn't happen.
+  if (!Promotable)
+    while (PostPromotionWorklist.size() > PPWOldSize)
+      PostPromotionWorklist.pop_back();
+
+  return true;
+}
+
+/// \brief Walks the partitioning of an alloca rewriting uses of each partition.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
+  bool Changed = false;
+  for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
+       ++PI)
+    Changed |= rewriteAllocaPartition(AI, P, PI);
+
+  return Changed;
+}
+
+/// \brief Analyze an alloca for SROA.
+///
+/// This analyzes the alloca to ensure we can reason about it, builds
+/// a partitioning of the alloca, and then hands it off to be split and
+/// rewritten as needed.
+bool SROA::runOnAlloca(AllocaInst &AI) {
+  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+  ++NumAllocasAnalyzed;
+
+  // Special case dead allocas, as they're trivial.
+  if (AI.use_empty()) {
+    AI.eraseFromParent();
+    return true;
+  }
+
+  // Skip alloca forms that this analysis can't handle.
+  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
+      TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
+    return false;
+
+  bool Changed = false;
+
+  // First, split any FCA loads and stores touching this alloca to promote
+  // better splitting and promotion opportunities.
+  AggLoadStoreRewriter AggRewriter(*TD);
+  Changed |= AggRewriter.rewrite(AI);
+
+  // Build the partition set using a recursive instruction-visiting builder.
+  AllocaPartitioning P(*TD, AI);
+  DEBUG(P.print(dbgs()));
+  if (P.isEscaped())
+    return Changed;
+
+  // Delete all the dead users of this alloca before splitting and rewriting it.
+  for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
+                                              DE = P.dead_user_end();
+       DI != DE; ++DI) {
+    Changed = true;
+    (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
+    DeadInsts.insert(*DI);
+  }
+  for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
+                                            DE = P.dead_op_end();
+       DO != DE; ++DO) {
+    Value *OldV = **DO;
+    // Clobber the use with an undef value.
+    **DO = UndefValue::get(OldV->getType());
+    if (Instruction *OldI = dyn_cast<Instruction>(OldV))
+      if (isInstructionTriviallyDead(OldI)) {
+        Changed = true;
+        DeadInsts.insert(OldI);
+      }
+  }
+
+  // No partitions to split. Leave the dead alloca for a later pass to clean up.
+  if (P.begin() == P.end())
+    return Changed;
+
+  return splitAlloca(AI, P) || Changed;
+}
+
+/// \brief Delete the dead instructions accumulated in this run.
+///
+/// Recursively deletes the dead instructions we've accumulated. This is done
+/// at the very end to maximize locality of the recursive delete and to
+/// minimize the problems of invalidated instruction pointers as such pointers
+/// are used heavily in the intermediate stages of the algorithm.
+///
+/// We also record the alloca instructions deleted here so that they aren't
+/// subsequently handed to mem2reg to promote.
+void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
+  while (!DeadInsts.empty()) {
+    Instruction *I = DeadInsts.pop_back_val();
+    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+
+    I->replaceAllUsesWith(UndefValue::get(I->getType()));
+
+    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+        // Zero out the operand and see if it becomes trivially dead.
+        *OI = 0;
+        if (isInstructionTriviallyDead(U))
+          DeadInsts.insert(U);
+      }
+
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+      DeletedAllocas.insert(AI);
+
+    ++NumDeleted;
+    I->eraseFromParent();
+  }
+}
+
+/// \brief Promote the allocas, using the best available technique.
+///
+/// This attempts to promote whatever allocas have been identified as viable in
+/// the PromotableAllocas list. If that list is empty, there is nothing to do.
+/// If there is a domtree available, we attempt to promote using the full power
+/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
+/// based on the SSAUpdater utilities. This function returns whether any
+/// promotion occurred.
+bool SROA::promoteAllocas(Function &F) {
+  if (PromotableAllocas.empty())
+    return false;
+
+  NumPromoted += PromotableAllocas.size();
+
+  if (DT && !ForceSSAUpdater) {
+    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+    PromoteMemToReg(PromotableAllocas, *DT);
+    PromotableAllocas.clear();
+    return true;
+  }
+
+  DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
+  SSAUpdater SSA;
+  DIBuilder DIB(*F.getParent());
+  SmallVector<Instruction*, 64> Insts;
+
+  for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
+    AllocaInst *AI = PromotableAllocas[Idx];
+    for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+         UI != UE;) {
+      Instruction *I = cast<Instruction>(*UI++);
+      // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
+      // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
+      // leading to them) here. Eventually it should use them to optimize the
+      // scalar values produced.
+      if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
+        assert(onlyUsedByLifetimeMarkers(I) &&
+               "Found a bitcast used outside of a lifetime marker.");
+        while (!I->use_empty())
+          cast<Instruction>(*I->use_begin())->eraseFromParent();
+        I->eraseFromParent();
+        continue;
+      }
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+        assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
+               II->getIntrinsicID() == Intrinsic::lifetime_end);
+        II->eraseFromParent();
+        continue;
+      }
+
+      Insts.push_back(I);
+    }
+    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
+    Insts.clear();
+  }
+
+  PromotableAllocas.clear();
+  return true;
+}
+
+namespace {
+  /// \brief A predicate to test whether an alloca belongs to a set.
+  class IsAllocaInSet {
+    typedef SmallPtrSet<AllocaInst *, 4> SetType;
+    const SetType &Set;
+
+  public:
+    typedef AllocaInst *argument_type;
+
+    IsAllocaInSet(const SetType &Set) : Set(Set) {}
+    bool operator()(AllocaInst *AI) const { return Set.count(AI); }
+  };
+}
+
+bool SROA::runOnFunction(Function &F) {
+  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+  C = &F.getContext();
+  TD = getAnalysisIfAvailable<DataLayout>();
+  if (!TD) {
+    DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
+    return false;
+  }
+  DT = getAnalysisIfAvailable<DominatorTree>();
+
+  BasicBlock &EntryBB = F.getEntryBlock();
+  for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
+       I != E; ++I)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+      Worklist.insert(AI);
+
+  bool Changed = false;
+  // A set of deleted alloca instruction pointers which should be removed from
+  // the list of promotable allocas.
+  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
+
+  do {
+    while (!Worklist.empty()) {
+      Changed |= runOnAlloca(*Worklist.pop_back_val());
+      deleteDeadInstructions(DeletedAllocas);
+
+      // Remove the deleted allocas from various lists so that we don't try to
+      // continue processing them.
+      if (!DeletedAllocas.empty()) {
+        Worklist.remove_if(IsAllocaInSet(DeletedAllocas));
+        PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas));
+        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
+                                               PromotableAllocas.end(),
+                                               IsAllocaInSet(DeletedAllocas)),
+                                PromotableAllocas.end());
+        DeletedAllocas.clear();
+      }
+    }
+
+    Changed |= promoteAllocas(F);
+
+    Worklist = PostPromotionWorklist;
+    PostPromotionWorklist.clear();
+  } while (!Worklist.empty());
+
+  return Changed;
+}
+
+void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
+  if (RequiresDomTree)
+    AU.addRequired<DominatorTree>();
+  AU.setPreservesCFG();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
new file mode 100644
index 000000000000..8a9c7da113c1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -0,0 +1,189 @@
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMScalarOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
+
+using namespace llvm;
+
+/// initializeScalarOptsPasses - Initialize all passes linked into the
+/// ScalarOpts library.
+void llvm::initializeScalarOpts(PassRegistry &Registry) {
+  initializeADCEPass(Registry);
+  initializeBlockPlacementPass(Registry);
+  initializeCodeGenPreparePass(Registry);
+  initializeConstantPropagationPass(Registry);
+  initializeCorrelatedValuePropagationPass(Registry);
+  initializeDCEPass(Registry);
+  initializeDeadInstEliminationPass(Registry);
+  initializeDSEPass(Registry);
+  initializeGVNPass(Registry);
+  initializeEarlyCSEPass(Registry);
+  initializeIndVarSimplifyPass(Registry);
+  initializeJumpThreadingPass(Registry);
+  initializeLICMPass(Registry);
+  initializeLoopDeletionPass(Registry);
+  initializeLoopInstSimplifyPass(Registry);
+  initializeLoopRotatePass(Registry);
+  initializeLoopStrengthReducePass(Registry);
+  initializeLoopUnrollPass(Registry);
+  initializeLoopUnswitchPass(Registry);
+  initializeLoopIdiomRecognizePass(Registry);
+  initializeLowerAtomicPass(Registry);
+  initializeLowerExpectIntrinsicPass(Registry);
+  initializeMemCpyOptPass(Registry);
+  initializeReassociatePass(Registry);
+  initializeRegToMemPass(Registry);
+  initializeSCCPPass(Registry);
+  initializeIPSCCPPass(Registry);
+  initializeSROAPass(Registry);
+  initializeSROA_DTPass(Registry);
+  initializeSROA_SSAUpPass(Registry);
+  initializeCFGSimplifyPassPass(Registry);
+  initializeSimplifyLibCallsPass(Registry);
+  initializeSinkingPass(Registry);
+  initializeTailCallElimPass(Registry);
+}
+
+void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
+  initializeScalarOpts(*unwrap(R));
+}
+
+void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createAggressiveDCEPass());
+}
+
+void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createCFGSimplificationPass());
+}
+
+void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDeadStoreEliminationPass());
+}
+
+void LLVMAddGVNPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createGVNPass());
+}
+
+void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createIndVarSimplifyPass());
+}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createInstructionCombiningPass());
+}
+
+void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createJumpThreadingPass());
+}
+
+void LLVMAddLICMPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLICMPass());
+}
+
+void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopDeletionPass());
+}
+
+void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopIdiomPass());
+}
+
+void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopRotatePass());
+}
+
+void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopUnrollPass());
+}
+
+void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopUnswitchPass());
+}
+
+void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createMemCpyOptPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
+void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createReassociatePass());
+}
+
+void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createSCCPPass());
+}
+
+void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createScalarReplAggregatesPass());
+}
+
+void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createScalarReplAggregatesPass(-1, false));
+}
+
+void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
+                                                  int Threshold) {
+  unwrap(PM)->add(createScalarReplAggregatesPass(Threshold));
+}
+
+void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createSimplifyLibCallsPass());
+}
+
+void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createTailCallEliminationPass());
+}
+
+void LLVMAddConstantPropagationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createConstantPropagationPass());
+}
+
+void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createDemoteRegisterToMemoryPass());
+}
+
+void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createVerifierPass());
+}
+
+void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createCorrelatedValuePropagationPass());
+}
+
+void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createEarlyCSEPass());
+}
+
+void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createTypeBasedAliasAnalysisPass());
+}
+
+void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBasicAliasAnalysisPass());
+}
+
+void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLowerExpectIntrinsicPass());
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
new file mode 100644
index 000000000000..e590a374eac2
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -0,0 +1,2611 @@
+//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation implements the well known scalar replacement of
+// aggregates transformation.  This xform breaks up alloca instructions of
+// aggregate type (structure or array) into individual alloca instructions for
+// each member (if possible).  Then, if possible, it transforms the individual
+// alloca instructions into nice clean scalar SSA form.
+//
+// This combines a simple SRoA algorithm with the Mem2Reg algorithm because they
+// often interact, especially for C++ programs.  As such, iterating between
+// SRoA, then Mem2Reg until we run out of things to promote works well.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scalarrepl"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+STATISTIC(NumReplaced,  "Number of allocas broken up");
+STATISTIC(NumPromoted,  "Number of allocas promoted");
+STATISTIC(NumAdjusted,  "Number of scalar allocas adjusted to allow promotion");
+STATISTIC(NumConverted, "Number of aggregates converted to scalar");
+
+namespace {
+  struct SROA : public FunctionPass {
+    SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT)
+      : FunctionPass(ID), HasDomTree(hasDT) {
+      if (T == -1)
+        SRThreshold = 128;
+      else
+        SRThreshold = T;
+      if (ST == -1)
+        StructMemberThreshold = 32;
+      else
+        StructMemberThreshold = ST;
+      if (AT == -1)
+        ArrayElementThreshold = 8;
+      else
+        ArrayElementThreshold = AT;
+      if (SLT == -1)
+        // Do not limit the scalar integer load size if no threshold is given.
+        ScalarLoadThreshold = -1;
+      else
+        ScalarLoadThreshold = SLT;
+    }
+
+    bool runOnFunction(Function &F);
+
+    bool performScalarRepl(Function &F);
+    bool performPromotion(Function &F);
+
+  private:
+    bool HasDomTree;
+    DataLayout *TD;
+
+    /// DeadInsts - Keep track of instructions we have made dead, so that
+    /// we can remove them after we are done working.
+    SmallVector<Value*, 32> DeadInsts;
+
+    /// AllocaInfo - When analyzing uses of an alloca instruction, this captures
+    /// information about the uses.  All these fields are initialized to false
+    /// and set to true when something is learned.
+    struct AllocaInfo {
+      /// The alloca to promote.
+      AllocaInst *AI;
+
+      /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite
+      /// looping and avoid redundant work.
+      SmallPtrSet<PHINode*, 8> CheckedPHIs;
+
+      /// isUnsafe - This is set to true if the alloca cannot be SROA'd.
+      bool isUnsafe : 1;
+
+      /// isMemCpySrc - This is true if this aggregate is memcpy'd from.
+      bool isMemCpySrc : 1;
+
+      /// isMemCpyDst - This is true if this aggregate is memcpy'd into.
+      bool isMemCpyDst : 1;
+
+      /// hasSubelementAccess - This is true if a subelement of the alloca is
+      /// ever accessed, or false if the alloca is only accessed with mem
+      /// intrinsics or load/store that only access the entire alloca at once.
+      bool hasSubelementAccess : 1;
+
+      /// hasALoadOrStore - This is true if there are any loads or stores to it.
+      /// The alloca may just be accessed with memcpy, for example, which would
+      /// not set this.
+      bool hasALoadOrStore : 1;
+
+      explicit AllocaInfo(AllocaInst *ai)
+        : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false),
+          hasSubelementAccess(false), hasALoadOrStore(false) {}
+    };
+
+    /// SRThreshold - The maximum alloca size to considered for SROA.
+    unsigned SRThreshold;
+
+    /// StructMemberThreshold - The maximum number of members a struct can
+    /// contain to be considered for SROA.
+    unsigned StructMemberThreshold;
+
+    /// ArrayElementThreshold - The maximum number of elements an array can
+    /// have to be considered for SROA.
+    unsigned ArrayElementThreshold;
+
+    /// ScalarLoadThreshold - The maximum size in bits of scalars to load when
+    /// converting to scalar
+    unsigned ScalarLoadThreshold;
+
+    void MarkUnsafe(AllocaInfo &I, Instruction *User) {
+      I.isUnsafe = true;
+      DEBUG(dbgs() << "  Transformation preventing inst: " << *User << '\n');
+    }
+
+    bool isSafeAllocaToScalarRepl(AllocaInst *AI);
+
+    void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info);
+    void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset,
+                                         AllocaInfo &Info);
+    void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info);
+    void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
+                         Type *MemOpType, bool isStore, AllocaInfo &Info,
+                         Instruction *TheAccess, bool AllowWholeAccess);
+    bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size);
+    uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset,
+                                  Type *&IdxTy);
+
+    void DoScalarReplacement(AllocaInst *AI,
+                             std::vector<AllocaInst*> &WorkList);
+    void DeleteDeadInstructions();
+
+    void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
+                              SmallVector<AllocaInst*, 32> &NewElts);
+    void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
+                        SmallVector<AllocaInst*, 32> &NewElts);
+    void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
+                    SmallVector<AllocaInst*, 32> &NewElts);
+    void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+                                  uint64_t Offset,
+                                  SmallVector<AllocaInst*, 32> &NewElts);
+    void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
+                                      AllocaInst *AI,
+                                      SmallVector<AllocaInst*, 32> &NewElts);
+    void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
+                                       SmallVector<AllocaInst*, 32> &NewElts);
+    void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
+                                      SmallVector<AllocaInst*, 32> &NewElts);
+    bool ShouldAttemptScalarRepl(AllocaInst *AI);
+  };
+
+  // SROA_DT - SROA that uses DominatorTree.
+  struct SROA_DT : public SROA {
+    static char ID;
+  public:
+    SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
+        SROA(T, true, ID, ST, AT, SLT) {
+      initializeSROA_DTPass(*PassRegistry::getPassRegistry());
+    }
+
+    // getAnalysisUsage - This pass does not require any passes, but we know it
+    // will not alter the CFG, so say so.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.setPreservesCFG();
+    }
+  };
+
+  // SROA_SSAUp - SROA that uses SSAUpdater.
+  struct SROA_SSAUp : public SROA {
+    static char ID;
+  public:
+    SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) :
+        SROA(T, false, ID, ST, AT, SLT) {
+      initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry());
+    }
+
+    // getAnalysisUsage - This pass does not require any passes, but we know it
+    // will not alter the CFG, so say so.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  };
+
+}
+
+char SROA_DT::ID = 0;
+char SROA_SSAUp::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl",
+                "Scalar Replacement of Aggregates (DT)", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(SROA_DT, "scalarrepl",
+                "Scalar Replacement of Aggregates (DT)", false, false)
+
+INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa",
+                      "Scalar Replacement of Aggregates (SSAUp)", false, false)
+INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa",
+                    "Scalar Replacement of Aggregates (SSAUp)", false, false)
+
+// Public interface to the ScalarReplAggregates pass
+FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold,
+                                                   bool UseDomTree,
+                                                   int StructMemberThreshold,
+                                                   int ArrayElementThreshold,
+                                                   int ScalarLoadThreshold) {
+  if (UseDomTree)
+    return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold,
+                       ScalarLoadThreshold);
+  return new SROA_SSAUp(Threshold, StructMemberThreshold,
+                        ArrayElementThreshold, ScalarLoadThreshold);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Convert To Scalar Optimization.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// ConvertToScalarInfo - This class implements the "Convert To Scalar"
+/// optimization, which scans the uses of an alloca and determines if it can
+/// rewrite it in terms of a single new alloca that can be mem2reg'd.
+class ConvertToScalarInfo {
+  /// AllocaSize - The size of the alloca being considered in bytes.
+  unsigned AllocaSize;
+  const DataLayout &TD;
+  unsigned ScalarLoadThreshold;
+
+  /// IsNotTrivial - This is set to true if there is some access to the object
+  /// which means that mem2reg can't promote it.
+  bool IsNotTrivial;
+
+  /// ScalarKind - Tracks the kind of alloca being considered for promotion,
+  /// computed based on the uses of the alloca rather than the LLVM type system.
+  enum {
+    Unknown,
+
+    // Accesses via GEPs that are consistent with element access of a vector
+    // type. This will not be converted into a vector unless there is a later
+    // access using an actual vector type.
+    ImplicitVector,
+
+    // Accesses via vector operations and GEPs that are consistent with the
+    // layout of a vector type.
+    Vector,
+
+    // An integer bag-of-bits with bitwise operations for insertion and
+    // extraction. Any combination of types can be converted into this kind
+    // of scalar.
+    Integer
+  } ScalarKind;
+
+  /// VectorTy - This tracks the type that we should promote the vector to if
+  /// it is possible to turn it into a vector.  This starts out null, and if it
+  /// isn't possible to turn into a vector type, it gets set to VoidTy.
+  VectorType *VectorTy;
+
+  /// HadNonMemTransferAccess - True if there is at least one access to the
+  /// alloca that is not a MemTransferInst.  We don't want to turn structs into
+  /// large integers unless there is some potential for optimization.
+  bool HadNonMemTransferAccess;
+
+  /// HadDynamicAccess - True if some element of this alloca was dynamic.
+  /// We don't yet have support for turning a dynamic access into a large
+  /// integer.
+  bool HadDynamicAccess;
+
+public:
+  explicit ConvertToScalarInfo(unsigned Size, const DataLayout &td,
+                               unsigned SLT)
+    : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false),
+    ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false),
+    HadDynamicAccess(false) { }
+
+  AllocaInst *TryConvert(AllocaInst *AI);
+
+private:
+  bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx);
+  void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset);
+  bool MergeInVectorType(VectorType *VInTy, uint64_t Offset);
+  void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset,
+                           Value *NonConstantIdx);
+
+  Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
+                                    uint64_t Offset, Value* NonConstantIdx,
+                                    IRBuilder<> &Builder);
+  Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
+                                   uint64_t Offset, Value* NonConstantIdx,
+                                   IRBuilder<> &Builder);
+};
+} // end anonymous namespace.
+
+
+/// TryConvert - Analyze the specified alloca, and if it is safe to do so,
+/// rewrite it to be a new alloca which is mem2reg'able.  This returns the new
+/// alloca if possible or null if not.
+AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
+  // If we can't convert this scalar, or if mem2reg can trivially do it, bail
+  // out.
+  if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial)
+    return 0;
+
+  // If an alloca has only memset / memcpy uses, it may still have an Unknown
+  // ScalarKind. Treat it as an Integer below.
+  if (ScalarKind == Unknown)
+    ScalarKind = Integer;
+
+  if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8)
+    ScalarKind = Integer;
+
+  // If we were able to find a vector type that can handle this with
+  // insert/extract elements, and if there was at least one use that had
+  // a vector type, promote this to a vector.  We don't want to promote
+  // random stuff that doesn't use vectors (e.g. <9 x double>) because then
+  // we just get a lot of insert/extracts.  If at least one vector is
+  // involved, then we probably really do have a union of vector/array.
+  Type *NewTy;
+  if (ScalarKind == Vector) {
+    assert(VectorTy && "Missing type for vector scalar.");
+    DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n  TYPE = "
+          << *VectorTy << '\n');
+    NewTy = VectorTy;  // Use the vector type.
+  } else {
+    unsigned BitWidth = AllocaSize * 8;
+
+    // Do not convert to scalar integer if the alloca size exceeds the
+    // scalar load threshold.
+    if (BitWidth > ScalarLoadThreshold)
+      return 0;
+
+    if ((ScalarKind == ImplicitVector || ScalarKind == Integer) &&
+        !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth))
+      return 0;
+    // Dynamic accesses on integers aren't yet supported.  They need us to shift
+    // by a dynamic amount which could be difficult to work out as we might not
+    // know whether to use a left or right shift.
+    if (ScalarKind == Integer && HadDynamicAccess)
+      return 0;
+
+    DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n");
+    // Create and insert the integer alloca.
+    NewTy = IntegerType::get(AI->getContext(), BitWidth);
+  }
+  AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin());
+  ConvertUsesToScalar(AI, NewAI, 0, 0);
+  return NewAI;
+}
+
+/// MergeInTypeForLoadOrStore - Add the 'In' type to the accumulated vector type
+/// (VectorTy) so far at the offset specified by Offset (which is specified in
+/// bytes).
+///
+/// There are two cases we handle here:
+///   1) A union of vector types of the same size and potentially its elements.
+///      Here we turn element accesses into insert/extract element operations.
+///      This promotes a <4 x float> with a store of float to the third element
+///      into a <4 x float> that uses insert element.
+///   2) A fully general blob of memory, which we turn into some (potentially
+///      large) integer type with extract and insert operations where the loads
+///      and stores would mutate the memory.  We mark this by setting VectorTy
+///      to VoidTy.
+void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In,
+                                                    uint64_t Offset) {
+  // If we already decided to turn this into a blob of integer memory, there is
+  // nothing to be done.
+  if (ScalarKind == Integer)
+    return;
+
+  // If this could be contributing to a vector, analyze it.
+
+  // If the In type is a vector that is the same size as the alloca, see if it
+  // matches the existing VecTy.
+  if (VectorType *VInTy = dyn_cast<VectorType>(In)) {
+    if (MergeInVectorType(VInTy, Offset))
+      return;
+  } else if (In->isFloatTy() || In->isDoubleTy() ||
+             (In->isIntegerTy() && In->getPrimitiveSizeInBits() >= 8 &&
+              isPowerOf2_32(In->getPrimitiveSizeInBits()))) {
+    // Full width accesses can be ignored, because they can always be turned
+    // into bitcasts.
+    unsigned EltSize = In->getPrimitiveSizeInBits()/8;
+    if (EltSize == AllocaSize)
+      return;
+
+    // If we're accessing something that could be an element of a vector, see
+    // if the implied vector agrees with what we already have and if Offset is
+    // compatible with it.
+    if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
+        (!VectorTy || EltSize == VectorTy->getElementType()
+                                         ->getPrimitiveSizeInBits()/8)) {
+      if (!VectorTy) {
+        ScalarKind = ImplicitVector;
+        VectorTy = VectorType::get(In, AllocaSize/EltSize);
+      }
+      return;
+    }
+  }
+
+  // Otherwise, we have a case that we can't handle with an optimized vector
+  // form.  We can still turn this into a large integer.
+  ScalarKind = Integer;
+}
+
+/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore,
+/// returning true if the type was successfully merged and false otherwise.
+bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
+                                            uint64_t Offset) {
+  if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
+    // If we're storing/loading a vector of the right size, allow it as a
+    // vector.  If this the first vector we see, remember the type so that
+    // we know the element size. If this is a subsequent access, ignore it
+    // even if it is a differing type but the same size. Worst case we can
+    // bitcast the resultant vectors.
+    if (!VectorTy)
+      VectorTy = VInTy;
+    ScalarKind = Vector;
+    return true;
+  }
+
+  return false;
+}
+
+/// CanConvertToScalar - V is a pointer.  If we can convert the pointee and all
+/// its accesses to a single vector type, return true and set VecTy to
+/// the new type.  If we could convert the alloca into a single promotable
+/// integer, return true but set VecTy to VoidTy.  Further, if the use is not a
+/// completely trivial use that mem2reg could promote, set IsNotTrivial.  Offset
+/// is the current offset from the base of the alloca being analyzed.
+///
+/// If we see at least one access to the value that is as a vector type, set the
+/// SawVec flag.
+bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset,
+                                             Value* NonConstantIdx) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      // Don't break volatile loads.
+      if (!LI->isSimple())
+        return false;
+      // Don't touch MMX operations.
+      if (LI->getType()->isX86_MMXTy())
+        return false;
+      HadNonMemTransferAccess = true;
+      MergeInTypeForLoadOrStore(LI->getType(), Offset);
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      // Storing the pointer, not into the value?
+      if (SI->getOperand(0) == V || !SI->isSimple()) return false;
+      // Don't touch MMX operations.
+      if (SI->getOperand(0)->getType()->isX86_MMXTy())
+        return false;
+      HadNonMemTransferAccess = true;
+      MergeInTypeForLoadOrStore(SI->getOperand(0)->getType(), Offset);
+      continue;
+    }
+
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+      if (!onlyUsedByLifetimeMarkers(BCI))
+        IsNotTrivial = true;  // Can't be mem2reg'd.
+      if (!CanConvertToScalar(BCI, Offset, NonConstantIdx))
+        return false;
+      continue;
+    }
+
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
+      // If this is a GEP with a variable indices, we can't handle it.
+      PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType());
+      if (!PtrTy)
+        return false;
+
+      // Compute the offset that this GEP adds to the pointer.
+      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+      Value *GEPNonConstantIdx = 0;
+      if (!GEP->hasAllConstantIndices()) {
+        if (!isa<VectorType>(PtrTy->getElementType()))
+          return false;
+        if (NonConstantIdx)
+          return false;
+        GEPNonConstantIdx = Indices.pop_back_val();
+        if (!GEPNonConstantIdx->getType()->isIntegerTy(32))
+          return false;
+        HadDynamicAccess = true;
+      } else
+        GEPNonConstantIdx = NonConstantIdx;
+      uint64_t GEPOffset = TD.getIndexedOffset(PtrTy,
+                                               Indices);
+      // See if all uses can be converted.
+      if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx))
+        return false;
+      IsNotTrivial = true;  // Can't be mem2reg'd.
+      HadNonMemTransferAccess = true;
+      continue;
+    }
+
+    // If this is a constant sized memset of a constant value (e.g. 0) we can
+    // handle it.
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+      // Store to dynamic index.
+      if (NonConstantIdx)
+        return false;
+      // Store of constant value.
+      if (!isa<ConstantInt>(MSI->getValue()))
+        return false;
+
+      // Store of constant size.
+      ConstantInt *Len = dyn_cast<ConstantInt>(MSI->getLength());
+      if (!Len)
+        return false;
+
+      // If the size differs from the alloca, we can only convert the alloca to
+      // an integer bag-of-bits.
+      // FIXME: This should handle all of the cases that are currently accepted
+      // as vector element insertions.
+      if (Len->getZExtValue() != AllocaSize || Offset != 0)
+        ScalarKind = Integer;
+
+      IsNotTrivial = true;  // Can't be mem2reg'd.
+      HadNonMemTransferAccess = true;
+      continue;
+    }
+
+    // If this is a memcpy or memmove into or out of the whole allocation, we
+    // can handle it like a load or store of the scalar type.
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+      // Store to dynamic index.
+      if (NonConstantIdx)
+        return false;
+      ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength());
+      if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0)
+        return false;
+
+      IsNotTrivial = true;  // Can't be mem2reg'd.
+      continue;
+    }
+
+    // If this is a lifetime intrinsic, we can handle it.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        continue;
+      }
+    }
+
+    // Otherwise, we cannot handle this!
+    return false;
+  }
+
+  return true;
+}
+
+/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
+/// directly.  This happens when we are converting an "integer union" to a
+/// single integer scalar, or when we are converting a "vector union" to a
+/// vector with insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.  By the end of this, there should be no uses of Ptr.
+void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
+                                              uint64_t Offset,
+                                              Value* NonConstantIdx) {
+  while (!Ptr->use_empty()) {
+    Instruction *User = cast<Instruction>(Ptr->use_back());
+
+    if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
+      ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx);
+      CI->eraseFromParent();
+      continue;
+    }
+
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
+      // Compute the offset that this GEP adds to the pointer.
+      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+      Value* GEPNonConstantIdx = 0;
+      if (!GEP->hasAllConstantIndices()) {
+        assert(!NonConstantIdx &&
+               "Dynamic GEP reading from dynamic GEP unsupported");
+        GEPNonConstantIdx = Indices.pop_back_val();
+      } else
+        GEPNonConstantIdx = NonConstantIdx;
+      uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
+                                               Indices);
+      ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx);
+      GEP->eraseFromParent();
+      continue;
+    }
+
+    IRBuilder<> Builder(User);
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      // The load is a bit extract from NewAI shifted right by Offset bits.
+      Value *LoadedVal = Builder.CreateLoad(NewAI);
+      Value *NewLoadVal
+        = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset,
+                                     NonConstantIdx, Builder);
+      LI->replaceAllUsesWith(NewLoadVal);
+      LI->eraseFromParent();
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      assert(SI->getOperand(0) != Ptr && "Consistency error!");
+      Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
+      Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset,
+                                             NonConstantIdx, Builder);
+      Builder.CreateStore(New, NewAI);
+      SI->eraseFromParent();
+
+      // If the load we just inserted is now dead, then the inserted store
+      // overwrote the entire thing.
+      if (Old->use_empty())
+        Old->eraseFromParent();
+      continue;
+    }
+
+    // If this is a constant sized memset of a constant value (e.g. 0) we can
+    // transform it into a store of the expanded constant value.
+    if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) {
+      assert(MSI->getRawDest() == Ptr && "Consistency error!");
+      assert(!NonConstantIdx && "Cannot replace dynamic memset with insert");
+      int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue();
+      if (SNumBytes > 0 && (SNumBytes >> 32) == 0) {
+        unsigned NumBytes = static_cast<unsigned>(SNumBytes);
+        unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue();
+
+        // Compute the value replicated the right number of times.
+        APInt APVal(NumBytes*8, Val);
+
+        // Splat the value if non-zero.
+        if (Val)
+          for (unsigned i = 1; i != NumBytes; ++i)
+            APVal |= APVal << 8;
+
+        Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in");
+        Value *New = ConvertScalar_InsertValue(
+                                    ConstantInt::get(User->getContext(), APVal),
+                                               Old, Offset, 0, Builder);
+        Builder.CreateStore(New, NewAI);
+
+        // If the load we just inserted is now dead, then the memset overwrote
+        // the entire thing.
+        if (Old->use_empty())
+          Old->eraseFromParent();
+      }
+      MSI->eraseFromParent();
+      continue;
+    }
+
+    // If this is a memcpy or memmove into or out of the whole allocation, we
+    // can handle it like a load or store of the scalar type.
+    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) {
+      assert(Offset == 0 && "must be store to start of alloca");
+      assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert");
+
+      // If the source and destination are both to the same alloca, then this is
+      // a noop copy-to-self, just delete it.  Otherwise, emit a load and store
+      // as appropriate.
+      AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &TD, 0));
+
+      if (GetUnderlyingObject(MTI->getSource(), &TD, 0) != OrigAI) {
+        // Dest must be OrigAI, change this to be a load from the original
+        // pointer (bitcasted), then a store to our new alloca.
+        assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
+        Value *SrcPtr = MTI->getSource();
+        PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
+        PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+        if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
+          AIPTy = PointerType::get(AIPTy->getElementType(),
+                                   SPTy->getAddressSpace());
+        }
+        SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy);
+
+        LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval");
+        SrcVal->setAlignment(MTI->getAlignment());
+        Builder.CreateStore(SrcVal, NewAI);
+      } else if (GetUnderlyingObject(MTI->getDest(), &TD, 0) != OrigAI) {
+        // Src must be OrigAI, change this to be a load from NewAI then a store
+        // through the original dest pointer (bitcasted).
+        assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
+        LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");
+
+        PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
+        PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+        if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
+          AIPTy = PointerType::get(AIPTy->getElementType(),
+                                   DPTy->getAddressSpace());
+        }
+        Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy);
+
+        StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr);
+        NewStore->setAlignment(MTI->getAlignment());
+      } else {
+        // Noop transfer. Src == Dst
+      }
+
+      MTI->eraseFromParent();
+      continue;
+    }
+
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        // There's no need to preserve these, as the resulting alloca will be
+        // converted to a register anyways.
+        II->eraseFromParent();
+        continue;
+      }
+    }
+
+    llvm_unreachable("Unsupported operation!");
+  }
+}
+
+/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
+/// or vector value FromVal, extracting the bits from the offset specified by
+/// Offset.  This returns the value, which is of type ToType.
+///
+/// This happens when we are converting an "integer union" to a single
+/// integer scalar, or when we are converting a "vector union" to a vector with
+/// insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.
+Value *ConvertToScalarInfo::
+ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
+                           uint64_t Offset, Value* NonConstantIdx,
+                           IRBuilder<> &Builder) {
+  // If the load is of the whole new alloca, no conversion is needed.
+  Type *FromType = FromVal->getType();
+  if (FromType == ToType && Offset == 0)
+    return FromVal;
+
+  // If the result alloca is a vector type, this is either an element
+  // access or a bitcast to another vector type of the same size.
+  if (VectorType *VTy = dyn_cast<VectorType>(FromType)) {
+    unsigned FromTypeSize = TD.getTypeAllocSize(FromType);
+    unsigned ToTypeSize = TD.getTypeAllocSize(ToType);
+    if (FromTypeSize == ToTypeSize)
+        return Builder.CreateBitCast(FromVal, ToType);
+
+    // Otherwise it must be an element access.
+    unsigned Elt = 0;
+    if (Offset) {
+      unsigned EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
+      Elt = Offset/EltSize;
+      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
+    }
+    // Return the element extracted out of it.
+    Value *Idx;
+    if (NonConstantIdx) {
+      if (Elt)
+        Idx = Builder.CreateAdd(NonConstantIdx,
+                                Builder.getInt32(Elt),
+                                "dyn.offset");
+      else
+        Idx = NonConstantIdx;
+    } else
+      Idx = Builder.getInt32(Elt);
+    Value *V = Builder.CreateExtractElement(FromVal, Idx);
+    if (V->getType() != ToType)
+      V = Builder.CreateBitCast(V, ToType);
+    return V;
+  }
+
+  // If ToType is a first class aggregate, extract out each of the pieces and
+  // use insertvalue's to form the FCA.
+  if (StructType *ST = dyn_cast<StructType>(ToType)) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into struct types not supported");
+    const StructLayout &Layout = *TD.getStructLayout(ST);
+    Value *Res = UndefValue::get(ST);
+    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+      Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
+                                        Offset+Layout.getElementOffsetInBits(i),
+                                              0, Builder);
+      Res = Builder.CreateInsertValue(Res, Elt, i);
+    }
+    return Res;
+  }
+
+  if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into array types not supported");
+    uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
+    Value *Res = UndefValue::get(AT);
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
+                                              Offset+i*EltSize, 0, Builder);
+      Res = Builder.CreateInsertValue(Res, Elt, i);
+    }
+    return Res;
+  }
+
+  // Otherwise, this must be a union that was converted to an integer value.
+  IntegerType *NTy = cast<IntegerType>(FromVal->getType());
+
+  // If this is a big-endian system and the load is narrower than the
+  // full alloca type, we need to do a shift to get the right bits.
+  int ShAmt = 0;
+  if (TD.isBigEndian()) {
+    // On big-endian machines, the lowest bit is stored at the bit offset
+    // from the pointer given by getTypeStoreSizeInBits.  This matters for
+    // integers with a bitwidth that is not a multiple of 8.
+    ShAmt = TD.getTypeStoreSizeInBits(NTy) -
+            TD.getTypeStoreSizeInBits(ToType) - Offset;
+  } else {
+    ShAmt = Offset;
+  }
+
+  // Note: we support negative bitwidths (with shl) which are not defined.
+  // We do this to support (f.e.) loads off the end of a structure where
+  // only some bits are used.
+  if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
+    FromVal = Builder.CreateLShr(FromVal,
+                                 ConstantInt::get(FromVal->getType(), ShAmt));
+  else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
+    FromVal = Builder.CreateShl(FromVal,
+                                ConstantInt::get(FromVal->getType(), -ShAmt));
+
+  // Finally, unconditionally truncate the integer to the right width.
+  unsigned LIBitWidth = TD.getTypeSizeInBits(ToType);
+  if (LIBitWidth < NTy->getBitWidth())
+    FromVal =
+      Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
+                                                    LIBitWidth));
+  else if (LIBitWidth > NTy->getBitWidth())
+    FromVal =
+       Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(),
+                                                    LIBitWidth));
+
+  // If the result is an integer, this is a trunc or bitcast.
+  if (ToType->isIntegerTy()) {
+    // Should be done.
+  } else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) {
+    // Just do a bitcast, we know the sizes match up.
+    FromVal = Builder.CreateBitCast(FromVal, ToType);
+  } else {
+    // Otherwise must be a pointer.
+    FromVal = Builder.CreateIntToPtr(FromVal, ToType);
+  }
+  assert(FromVal->getType() == ToType && "Didn't convert right?");
+  return FromVal;
+}
+
+/// ConvertScalar_InsertValue - Insert the value "SV" into the existing integer
+/// or vector value "Old" at the offset specified by Offset.
+///
+/// This happens when we are converting an "integer union" to a
+/// single integer scalar, or when we are converting a "vector union" to a
+/// vector with insert/extractelement instructions.
+///
+/// Offset is an offset from the original alloca, in bits that need to be
+/// shifted to the right.
+///
+/// NonConstantIdx is an index value if there was a GEP with a non-constant
+/// index value.  If this is 0 then all GEPs used to find this insert address
+/// are constant.
+Value *ConvertToScalarInfo::
+ConvertScalar_InsertValue(Value *SV, Value *Old,
+                          uint64_t Offset, Value* NonConstantIdx,
+                          IRBuilder<> &Builder) {
+  // Convert the stored type to the actual type, shift it left to insert
+  // then 'or' into place.
+  Type *AllocaType = Old->getType();
+  LLVMContext &Context = Old->getContext();
+
+  if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
+    uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy);
+    uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType());
+
+    // Changing the whole vector with memset or with an access of a different
+    // vector type?
+    if (ValSize == VecSize)
+        return Builder.CreateBitCast(SV, AllocaType);
+
+    // Must be an element insertion.
+    Type *EltTy = VTy->getElementType();
+    if (SV->getType() != EltTy)
+      SV = Builder.CreateBitCast(SV, EltTy);
+    uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy);
+    unsigned Elt = Offset/EltSize;
+    Value *Idx;
+    if (NonConstantIdx) {
+      if (Elt)
+        Idx = Builder.CreateAdd(NonConstantIdx,
+                                Builder.getInt32(Elt),
+                                "dyn.offset");
+      else
+        Idx = NonConstantIdx;
+    } else
+      Idx = Builder.getInt32(Elt);
+    return Builder.CreateInsertElement(Old, SV, Idx);
+  }
+
+  // If SV is a first-class aggregate value, insert each value recursively.
+  if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into struct types not supported");
+    const StructLayout &Layout = *TD.getStructLayout(ST);
+    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+      Value *Elt = Builder.CreateExtractValue(SV, i);
+      Old = ConvertScalar_InsertValue(Elt, Old,
+                                      Offset+Layout.getElementOffsetInBits(i),
+                                      0, Builder);
+    }
+    return Old;
+  }
+
+  if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
+    assert(!NonConstantIdx &&
+           "Dynamic indexing into array types not supported");
+    uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      Value *Elt = Builder.CreateExtractValue(SV, i);
+      Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder);
+    }
+    return Old;
+  }
+
+  // If SV is a float, convert it to the appropriate integer type.
+  // If it is a pointer, do the same.
+  unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType());
+  unsigned DestWidth = TD.getTypeSizeInBits(AllocaType);
+  unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType());
+  unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType);
+  if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
+    SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
+  else if (SV->getType()->isPointerTy())
+    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()));
+
+  // Zero extend or truncate the value if needed.
+  if (SV->getType() != AllocaType) {
+    if (SV->getType()->getPrimitiveSizeInBits() <
+             AllocaType->getPrimitiveSizeInBits())
+      SV = Builder.CreateZExt(SV, AllocaType);
+    else {
+      // Truncation may be needed if storing more than the alloca can hold
+      // (undefined behavior).
+      SV = Builder.CreateTrunc(SV, AllocaType);
+      SrcWidth = DestWidth;
+      SrcStoreWidth = DestStoreWidth;
+    }
+  }
+
+  // If this is a big-endian system and the store is narrower than the
+  // full alloca type, we need to do a shift to get the right bits.
+  int ShAmt = 0;
+  if (TD.isBigEndian()) {
+    // On big-endian machines, the lowest bit is stored at the bit offset
+    // from the pointer given by getTypeStoreSizeInBits.  This matters for
+    // integers with a bitwidth that is not a multiple of 8.
+    ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
+  } else {
+    ShAmt = Offset;
+  }
+
+  // Note: we support negative bitwidths (with shr) which are not defined.
+  // We do this to support (f.e.) stores off the end of a structure where
+  // only some bits in the structure are set.
+  APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
+  if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
+    SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt));
+    Mask <<= ShAmt;
+  } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
+    SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt));
+    Mask = Mask.lshr(-ShAmt);
+  }
+
+  // Mask out the bits we are about to insert from the old value, and or
+  // in the new bits.
+  if (SrcWidth != DestWidth) {
+    assert(DestWidth > SrcWidth);
+    Old = Builder.CreateAnd(Old, ConstantInt::get(Context, ~Mask), "mask");
+    SV = Builder.CreateOr(Old, SV, "ins");
+  }
+  return SV;
+}
+
+
+//===----------------------------------------------------------------------===//
+// SRoA Driver
+//===----------------------------------------------------------------------===//
+
+
+bool SROA::runOnFunction(Function &F) {
+  TD = getAnalysisIfAvailable<DataLayout>();
+
+  bool Changed = performPromotion(F);
+
+  // FIXME: ScalarRepl currently depends on DataLayout more than it
+  // theoretically needs to. It should be refactored in order to support
+  // target-independent IR. Until this is done, just skip the actual
+  // scalar-replacement portion of this pass.
+  if (!TD) return Changed;
+
+  while (1) {
+    bool LocalChange = performScalarRepl(F);
+    if (!LocalChange) break;   // No need to repromote if no scalarrepl
+    Changed = true;
+    LocalChange = performPromotion(F);
+    if (!LocalChange) break;   // No need to re-scalarrepl if no promotion
+  }
+
+  return Changed;
+}
+
+namespace {
+class AllocaPromoter : public LoadAndStorePromoter {
+  AllocaInst *AI;
+  DIBuilder *DIB;
+  SmallVector<DbgDeclareInst *, 4> DDIs;
+  SmallVector<DbgValueInst *, 4> DVIs;
+public:
+  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+                 DIBuilder *DB)
+    : LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {}
+
+  void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) {
+    // Remember which alloca we're promoting (for isInstInList).
+    this->AI = AI;
+    if (MDNode *DebugNode = MDNode::getIfExists(AI->getContext(), AI)) {
+      for (Value::use_iterator UI = DebugNode->use_begin(),
+             E = DebugNode->use_end(); UI != E; ++UI)
+        if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+          DDIs.push_back(DDI);
+        else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI))
+          DVIs.push_back(DVI);
+    }
+
+    LoadAndStorePromoter::run(Insts);
+    AI->eraseFromParent();
+    for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(),
+           E = DDIs.end(); I != E; ++I) {
+      DbgDeclareInst *DDI = *I;
+      DDI->eraseFromParent();
+    }
+    for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(),
+           E = DVIs.end(); I != E; ++I) {
+      DbgValueInst *DVI = *I;
+      DVI->eraseFromParent();
+    }
+  }
+
+  virtual bool isInstInList(Instruction *I,
+                            const SmallVectorImpl<Instruction*> &Insts) const {
+    if (LoadInst *LI = dyn_cast<LoadInst>(I))
+      return LI->getOperand(0) == AI;
+    return cast<StoreInst>(I)->getPointerOperand() == AI;
+  }
+
+  virtual void updateDebugInfo(Instruction *Inst) const {
+    for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
+           E = DDIs.end(); I != E; ++I) {
+      DbgDeclareInst *DDI = *I;
+      if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+        ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
+      else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+        ConvertDebugDeclareToDebugValue(DDI, LI, *DIB);
+    }
+    for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
+           E = DVIs.end(); I != E; ++I) {
+      DbgValueInst *DVI = *I;
+      Value *Arg = NULL;
+      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        // If an argument is zero extended then use argument directly. The ZExt
+        // may be zapped by an optimization pass in future.
+        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+          Arg = dyn_cast<Argument>(ZExt->getOperand(0));
+        if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+          Arg = dyn_cast<Argument>(SExt->getOperand(0));
+        if (!Arg)
+          Arg = SI->getOperand(0);
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+        Arg = LI->getOperand(0);
+      } else {
+        continue;
+      }
+      Instruction *DbgVal =
+        DIB->insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
+                                     Inst);
+      DbgVal->setDebugLoc(DVI->getDebugLoc());
+    }
+  }
+};
+} // end anon namespace
+
+/// isSafeSelectToSpeculate - Select instructions that use an alloca and are
+/// subsequently loaded can be rewritten to load both input pointers and then
+/// select between the result, allowing the load of the alloca to be promoted.
+/// From this:
+///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   %V2 = load i32* %Other
+///   %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand to
+/// the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) {
+  bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
+  bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
+
+  for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
+       UI != UE; ++UI) {
+    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+    if (LI == 0 || !LI->isSimple()) return false;
+
+    // Both operands to the select need to be dereferencable, either absolutely
+    // (e.g. allocas) or at this point because we can see other accesses to it.
+    if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI,
+                                                    LI->getAlignment(), TD))
+      return false;
+    if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI,
+                                                    LI->getAlignment(), TD))
+      return false;
+  }
+
+  return true;
+}
+
+/// isSafePHIToSpeculate - PHI instructions that use an alloca and are
+/// subsequently loaded can be rewritten to load both input pointers in the pred
+/// blocks and then PHI the results, allowing the load of the alloca to be
+/// promoted.
+/// From this:
+///   %P2 = phi [i32* %Alloca, i32* %Other]
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   ...
+///   %V2 = load i32* %Other
+///   ...
+///   %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a select if its only uses are loads and if the operand to
+/// the select can be loaded unconditionally.
+static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) {
+  // For now, we can only do this promotion if the load is in the same block as
+  // the PHI, and if there are no stores between the phi and load.
+  // TODO: Allow recursive phi users.
+  // TODO: Allow stores.
+  BasicBlock *BB = PN->getParent();
+  unsigned MaxAlign = 0;
+  for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
+       UI != UE; ++UI) {
+    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+    if (LI == 0 || !LI->isSimple()) return false;
+
+    // For now we only allow loads in the same block as the PHI.  This is a
+    // common case that happens when instcombine merges two loads through a PHI.
+    if (LI->getParent() != BB) return false;
+
+    // Ensure that there are no instructions between the PHI and the load that
+    // could store.
+    for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI)
+      if (BBI->mayWriteToMemory())
+        return false;
+
+    MaxAlign = std::max(MaxAlign, LI->getAlignment());
+  }
+
+  // Okay, we know that we have one or more loads in the same block as the PHI.
+  // We can transform this if it is safe to push the loads into the predecessor
+  // blocks.  The only thing to watch out for is that we can't put a possibly
+  // trapping load in the predecessor if it is a critical edge.
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *Pred = PN->getIncomingBlock(i);
+    Value *InVal = PN->getIncomingValue(i);
+
+    // If the terminator of the predecessor has side-effects (an invoke),
+    // there is no safe place to put a load in the predecessor.
+    if (Pred->getTerminator()->mayHaveSideEffects())
+      return false;
+
+    // If the value is produced by the terminator of the predecessor
+    // (an invoke), there is no valid place to put a load in the predecessor.
+    if (Pred->getTerminator() == InVal)
+      return false;
+
+    // If the predecessor has a single successor, then the edge isn't critical.
+    if (Pred->getTerminator()->getNumSuccessors() == 1)
+      continue;
+
+    // If this pointer is always safe to load, or if we can prove that there is
+    // already a load in the block, then we can move the load to the pred block.
+    if (InVal->isDereferenceablePointer() ||
+        isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD))
+      continue;
+
+    return false;
+  }
+
+  return true;
+}
+
+
+/// tryToMakeAllocaBePromotable - This returns true if the alloca only has
+/// direct (non-volatile) loads and stores to it.  If the alloca is close but
+/// not quite there, this will transform the code to allow promotion.  As such,
+/// it is a non-pure predicate.
+static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) {
+  SetVector<Instruction*, SmallVector<Instruction*, 4>,
+            SmallPtrSet<Instruction*, 4> > InstsToRewrite;
+
+  for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+       UI != UE; ++UI) {
+    User *U = *UI;
+    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (!LI->isSimple())
+        return false;
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      if (SI->getOperand(0) == AI || !SI->isSimple())
+        return false;   // Don't allow a store OF the AI, only INTO the AI.
+      continue;
+    }
+
+    if (SelectInst *SI = dyn_cast<SelectInst>(U)) {
+      // If the condition being selected on is a constant, fold the select, yes
+      // this does (rarely) happen early on.
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition())) {
+        Value *Result = SI->getOperand(1+CI->isZero());
+        SI->replaceAllUsesWith(Result);
+        SI->eraseFromParent();
+
+        // This is very rare and we just scrambled the use list of AI, start
+        // over completely.
+        return tryToMakeAllocaBePromotable(AI, TD);
+      }
+
+      // If it is safe to turn "load (select c, AI, ptr)" into a select of two
+      // loads, then we can transform this by rewriting the select.
+      if (!isSafeSelectToSpeculate(SI, TD))
+        return false;
+
+      InstsToRewrite.insert(SI);
+      continue;
+    }
+
+    if (PHINode *PN = dyn_cast<PHINode>(U)) {
+      if (PN->use_empty()) {  // Dead PHIs can be stripped.
+        InstsToRewrite.insert(PN);
+        continue;
+      }
+
+      // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads
+      // in the pred blocks, then we can transform this by rewriting the PHI.
+      if (!isSafePHIToSpeculate(PN, TD))
+        return false;
+
+      InstsToRewrite.insert(PN);
+      continue;
+    }
+
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      if (onlyUsedByLifetimeMarkers(BCI)) {
+        InstsToRewrite.insert(BCI);
+        continue;
+      }
+    }
+
+    return false;
+  }
+
+  // If there are no instructions to rewrite, then all uses are load/stores and
+  // we're done!
+  if (InstsToRewrite.empty())
+    return true;
+
+  // If we have instructions that need to be rewritten for this to be promotable
+  // take care of it now.
+  for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) {
+      // This could only be a bitcast used by nothing but lifetime intrinsics.
+      for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end();
+           I != E;) {
+        Use &U = I.getUse();
+        ++I;
+        cast<Instruction>(U.getUser())->eraseFromParent();
+      }
+      BCI->eraseFromParent();
+      continue;
+    }
+
+    if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) {
+      // Selects in InstsToRewrite only have load uses.  Rewrite each as two
+      // loads with a new select.
+      while (!SI->use_empty()) {
+        LoadInst *LI = cast<LoadInst>(SI->use_back());
+
+        IRBuilder<> Builder(LI);
+        LoadInst *TrueLoad =
+          Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t");
+        LoadInst *FalseLoad =
+          Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f");
+
+        // Transfer alignment and TBAA info if present.
+        TrueLoad->setAlignment(LI->getAlignment());
+        FalseLoad->setAlignment(LI->getAlignment());
+        if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
+          TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
+          FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
+        }
+
+        Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad);
+        V->takeName(LI);
+        LI->replaceAllUsesWith(V);
+        LI->eraseFromParent();
+      }
+
+      // Now that all the loads are gone, the select is gone too.
+      SI->eraseFromParent();
+      continue;
+    }
+
+    // Otherwise, we have a PHI node which allows us to push the loads into the
+    // predecessors.
+    PHINode *PN = cast<PHINode>(InstsToRewrite[i]);
+    if (PN->use_empty()) {
+      PN->eraseFromParent();
+      continue;
+    }
+
+    Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
+    PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
+                                     PN->getName()+".ld", PN);
+
+    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
+    // matter which one we get and if any differ, it doesn't matter.
+    LoadInst *SomeLoad = cast<LoadInst>(PN->use_back());
+    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+    unsigned Align = SomeLoad->getAlignment();
+
+    // Rewrite all loads of the PN to use the new PHI.
+    while (!PN->use_empty()) {
+      LoadInst *LI = cast<LoadInst>(PN->use_back());
+      LI->replaceAllUsesWith(NewPN);
+      LI->eraseFromParent();
+    }
+
+    // Inject loads into all of the pred blocks.  Keep track of which blocks we
+    // insert them into in case we have multiple edges from the same block.
+    DenseMap<BasicBlock*, LoadInst*> InsertedLoads;
+
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      BasicBlock *Pred = PN->getIncomingBlock(i);
+      LoadInst *&Load = InsertedLoads[Pred];
+      if (Load == 0) {
+        Load = new LoadInst(PN->getIncomingValue(i),
+                            PN->getName() + "." + Pred->getName(),
+                            Pred->getTerminator());
+        Load->setAlignment(Align);
+        if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+      }
+
+      NewPN->addIncoming(Load, Pred);
+    }
+
+    PN->eraseFromParent();
+  }
+
+  ++NumAdjusted;
+  return true;
+}
+
+bool SROA::performPromotion(Function &F) {
+  std::vector<AllocaInst*> Allocas;
+  DominatorTree *DT = 0;
+  if (HasDomTree)
+    DT = &getAnalysis<DominatorTree>();
+
+  BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
+  DIBuilder DIB(*F.getParent());
+  bool Changed = false;
+  SmallVector<Instruction*, 64> Insts;
+  while (1) {
+    Allocas.clear();
+
+    // Find allocas that are safe to promote, by looking at all instructions in
+    // the entry node
+    for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))       // Is it an alloca?
+        if (tryToMakeAllocaBePromotable(AI, TD))
+          Allocas.push_back(AI);
+
+    if (Allocas.empty()) break;
+
+    if (HasDomTree)
+      PromoteMemToReg(Allocas, *DT);
+    else {
+      SSAUpdater SSA;
+      for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
+        AllocaInst *AI = Allocas[i];
+
+        // Build list of instructions to promote.
+        for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+             UI != E; ++UI)
+          Insts.push_back(cast<Instruction>(*UI));
+        AllocaPromoter(Insts, SSA, &DIB).run(AI, Insts);
+        Insts.clear();
+      }
+    }
+    NumPromoted += Allocas.size();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+
+/// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for
+/// SROA.  It must be a struct or array type with a small number of elements.
+bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) {
+  Type *T = AI->getAllocatedType();
+  // Do not promote any struct that has too many members.
+  if (StructType *ST = dyn_cast<StructType>(T))
+    return ST->getNumElements() <= StructMemberThreshold;
+  // Do not promote any array that has too many elements.
+  if (ArrayType *AT = dyn_cast<ArrayType>(T))
+    return AT->getNumElements() <= ArrayElementThreshold;
+  return false;
+}
+
+// performScalarRepl - This algorithm is a simple worklist driven algorithm,
+// which runs on all of the alloca instructions in the function, removing them
+// if they are only used by getelementptr instructions.
+//
+bool SROA::performScalarRepl(Function &F) {
+  std::vector<AllocaInst*> WorkList;
+
+  // Scan the entry basic block, adding allocas to the worklist.
+  BasicBlock &BB = F.getEntryBlock();
+  for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
+    if (AllocaInst *A = dyn_cast<AllocaInst>(I))
+      WorkList.push_back(A);
+
+  // Process the worklist
+  bool Changed = false;
+  while (!WorkList.empty()) {
+    AllocaInst *AI = WorkList.back();
+    WorkList.pop_back();
+
+    // Handle dead allocas trivially.  These can be formed by SROA'ing arrays
+    // with unused elements.
+    if (AI->use_empty()) {
+      AI->eraseFromParent();
+      Changed = true;
+      continue;
+    }
+
+    // If this alloca is impossible for us to promote, reject it early.
+    if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized())
+      continue;
+
+    // Check to see if we can perform the core SROA transformation.  We cannot
+    // transform the allocation instruction if it is an array allocation
+    // (allocations OF arrays are ok though), and an allocation of a scalar
+    // value cannot be decomposed at all.
+    uint64_t AllocaSize = TD->getTypeAllocSize(AI->getAllocatedType());
+
+    // Do not promote [0 x %struct].
+    if (AllocaSize == 0) continue;
+
+    // Do not promote any struct whose size is too big.
+    if (AllocaSize > SRThreshold) continue;
+
+    // If the alloca looks like a good candidate for scalar replacement, and if
+    // all its users can be transformed, then split up the aggregate into its
+    // separate elements.
+    if (ShouldAttemptScalarRepl(AI) && isSafeAllocaToScalarRepl(AI)) {
+      DoScalarReplacement(AI, WorkList);
+      Changed = true;
+      continue;
+    }
+
+    // If we can turn this aggregate value (potentially with casts) into a
+    // simple scalar value that can be mem2reg'd into a register value.
+    // IsNotTrivial tracks whether this is something that mem2reg could have
+    // promoted itself.  If so, we don't want to transform it needlessly.  Note
+    // that we can't just check based on the type: the alloca may be of an i32
+    // but that has pointer arithmetic to set byte 3 of it or something.
+    if (AllocaInst *NewAI = ConvertToScalarInfo(
+              (unsigned)AllocaSize, *TD, ScalarLoadThreshold).TryConvert(AI)) {
+      NewAI->takeName(AI);
+      AI->eraseFromParent();
+      ++NumConverted;
+      Changed = true;
+      continue;
+    }
+
+    // Otherwise, couldn't process this alloca.
+  }
+
+  return Changed;
+}
+
+/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl
+/// predicate, do SROA now.
+void SROA::DoScalarReplacement(AllocaInst *AI,
+                               std::vector<AllocaInst*> &WorkList) {
+  DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n');
+  SmallVector<AllocaInst*, 32> ElementAllocas;
+  if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+    ElementAllocas.reserve(ST->getNumContainedTypes());
+    for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0,
+                                      AI->getAlignment(),
+                                      AI->getName() + "." + Twine(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  } else {
+    ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
+    ElementAllocas.reserve(AT->getNumElements());
+    Type *ElTy = AT->getElementType();
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(),
+                                      AI->getName() + "." + Twine(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  }
+
+  // Now that we have created the new alloca instructions, rewrite all the
+  // uses of the old alloca.
+  RewriteForScalarRepl(AI, AI, 0, ElementAllocas);
+
+  // Now erase any instructions that were made dead while rewriting the alloca.
+  DeleteDeadInstructions();
+  AI->eraseFromParent();
+
+  ++NumReplaced;
+}
+
+/// DeleteDeadInstructions - Erase instructions on the DeadInstrs list,
+/// recursively including all their operands that become trivially dead.
+void SROA::DeleteDeadInstructions() {
+  while (!DeadInsts.empty()) {
+    Instruction *I = cast<Instruction>(DeadInsts.pop_back_val());
+
+    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+        // Zero out the operand and see if it becomes trivially dead.
+        // (But, don't add allocas to the dead instruction list -- they are
+        // already on the worklist and will be deleted separately.)
+        *OI = 0;
+        if (isInstructionTriviallyDead(U) && !isa<AllocaInst>(U))
+          DeadInsts.push_back(U);
+      }
+
+    I->eraseFromParent();
+  }
+}
+
+/// isSafeForScalarRepl - Check if instruction I is a safe use with regard to
+/// performing scalar replacement of alloca AI.  The results are flagged in
+/// the Info parameter.  Offset indicates the position within AI that is
+/// referenced by this instruction.
+void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
+                               AllocaInfo &Info) {
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+      isSafeForScalarRepl(BC, Offset, Info);
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      uint64_t GEPOffset = Offset;
+      isSafeGEP(GEPI, GEPOffset, Info);
+      if (!Info.isUnsafe)
+        isSafeForScalarRepl(GEPI, GEPOffset, Info);
+    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+      ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
+      if (Length == 0)
+        return MarkUnsafe(Info, User);
+      if (Length->isNegative())
+        return MarkUnsafe(Info, User);
+
+      isSafeMemAccess(Offset, Length->getZExtValue(), 0,
+                      UI.getOperandNo() == 0, Info, MI,
+                      true /*AllowWholeAccess*/);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      if (!LI->isSimple())
+        return MarkUnsafe(Info, User);
+      Type *LIType = LI->getType();
+      isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
+                      LIType, false, Info, LI, true /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      // Store is ok if storing INTO the pointer, not storing the pointer
+      if (!SI->isSimple() || SI->getOperand(0) == I)
+        return MarkUnsafe(Info, User);
+
+      Type *SIType = SI->getOperand(0)->getType();
+      isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
+                      SIType, true, Info, SI, true /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+          II->getIntrinsicID() != Intrinsic::lifetime_end)
+        return MarkUnsafe(Info, User);
+    } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
+      isSafePHISelectUseForScalarRepl(User, Offset, Info);
+    } else {
+      return MarkUnsafe(Info, User);
+    }
+    if (Info.isUnsafe) return;
+  }
+}
+
+
+/// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer
+/// derived from the alloca, we can often still split the alloca into elements.
+/// This is useful if we have a large alloca where one element is phi'd
+/// together somewhere: we can SRoA and promote all the other elements even if
+/// we end up not being able to promote this one.
+///
+/// All we require is that the uses of the PHI do not index into other parts of
+/// the alloca.  The most important use case for this is single load and stores
+/// that are PHI'd together, which can happen due to code sinking.
+void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
+                                           AllocaInfo &Info) {
+  // If we've already checked this PHI, don't do it again.
+  if (PHINode *PN = dyn_cast<PHINode>(I))
+    if (!Info.CheckedPHIs.insert(PN))
+      return;
+
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+      isSafePHISelectUseForScalarRepl(BC, Offset, Info);
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      // Only allow "bitcast" GEPs for simplicity.  We could generalize this,
+      // but would have to prove that we're staying inside of an element being
+      // promoted.
+      if (!GEPI->hasAllZeroIndices())
+        return MarkUnsafe(Info, User);
+      isSafePHISelectUseForScalarRepl(GEPI, Offset, Info);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      if (!LI->isSimple())
+        return MarkUnsafe(Info, User);
+      Type *LIType = LI->getType();
+      isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
+                      LIType, false, Info, LI, false /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      // Store is ok if storing INTO the pointer, not storing the pointer
+      if (!SI->isSimple() || SI->getOperand(0) == I)
+        return MarkUnsafe(Info, User);
+
+      Type *SIType = SI->getOperand(0)->getType();
+      isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
+                      SIType, true, Info, SI, false /*AllowWholeAccess*/);
+      Info.hasALoadOrStore = true;
+    } else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
+      isSafePHISelectUseForScalarRepl(User, Offset, Info);
+    } else {
+      return MarkUnsafe(Info, User);
+    }
+    if (Info.isUnsafe) return;
+  }
+}
+
+/// isSafeGEP - Check if a GEP instruction can be handled for scalar
+/// replacement.  It is safe when all the indices are constant, in-bounds
+/// references, and when the resulting offset corresponds to an element within
+/// the alloca type.  The results are flagged in the Info parameter.  Upon
+/// return, Offset is adjusted as specified by the GEP indices.
+void SROA::isSafeGEP(GetElementPtrInst *GEPI,
+                     uint64_t &Offset, AllocaInfo &Info) {
+  gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI);
+  if (GEPIt == E)
+    return;
+  bool NonConstant = false;
+  unsigned NonConstantIdxSize = 0;
+
+  // Walk through the GEP type indices, checking the types that this indexes
+  // into.
+  for (; GEPIt != E; ++GEPIt) {
+    // Ignore struct elements, no extra checking needed for these.
+    if ((*GEPIt)->isStructTy())
+      continue;
+
+    ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand());
+    if (!IdxVal) {
+      // Non constant GEPs are only a problem on arrays, structs, and pointers
+      // Vectors can be dynamically indexed.
+      // FIXME: Add support for dynamic indexing on arrays.  This should be
+      // ok on any subarrays of the alloca array, eg, a[0][i] is ok, but a[i][0]
+      // isn't.
+      if (!(*GEPIt)->isVectorTy())
+        return MarkUnsafe(Info, GEPI);
+      NonConstant = true;
+      NonConstantIdxSize = TD->getTypeAllocSize(*GEPIt);
+    }
+  }
+
+  // Compute the offset due to this GEP and check if the alloca has a
+  // component element at that offset.
+  SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+  // If this GEP is non constant then the last operand must have been a
+  // dynamic index into a vector.  Pop this now as it has no impact on the
+  // constant part of the offset.
+  if (NonConstant)
+    Indices.pop_back();
+  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+  if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset,
+                        NonConstantIdxSize))
+    MarkUnsafe(Info, GEPI);
+}
+
+/// isHomogeneousAggregate - Check if type T is a struct or array containing
+/// elements of the same type (which is always true for arrays).  If so,
+/// return true with NumElts and EltTy set to the number of elements and the
+/// element type, respectively.
+static bool isHomogeneousAggregate(Type *T, unsigned &NumElts,
+                                   Type *&EltTy) {
+  if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
+    NumElts = AT->getNumElements();
+    EltTy = (NumElts == 0 ? 0 : AT->getElementType());
+    return true;
+  }
+  if (StructType *ST = dyn_cast<StructType>(T)) {
+    NumElts = ST->getNumContainedTypes();
+    EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0));
+    for (unsigned n = 1; n < NumElts; ++n) {
+      if (ST->getContainedType(n) != EltTy)
+        return false;
+    }
+    return true;
+  }
+  return false;
+}
+
+/// isCompatibleAggregate - Check if T1 and T2 are either the same type or are
+/// "homogeneous" aggregates with the same element type and number of elements.
+static bool isCompatibleAggregate(Type *T1, Type *T2) {
+  if (T1 == T2)
+    return true;
+
+  unsigned NumElts1, NumElts2;
+  Type *EltTy1, *EltTy2;
+  if (isHomogeneousAggregate(T1, NumElts1, EltTy1) &&
+      isHomogeneousAggregate(T2, NumElts2, EltTy2) &&
+      NumElts1 == NumElts2 &&
+      EltTy1 == EltTy2)
+    return true;
+
+  return false;
+}
+
+/// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI
+/// alloca or has an offset and size that corresponds to a component element
+/// within it.  The offset checked here may have been formed from a GEP with a
+/// pointer bitcasted to a different type.
+///
+/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a
+/// unit.  If false, it only allows accesses known to be in a single element.
+void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
+                           Type *MemOpType, bool isStore,
+                           AllocaInfo &Info, Instruction *TheAccess,
+                           bool AllowWholeAccess) {
+  // Check if this is a load/store of the entire alloca.
+  if (Offset == 0 && AllowWholeAccess &&
+      MemSize == TD->getTypeAllocSize(Info.AI->getAllocatedType())) {
+    // This can be safe for MemIntrinsics (where MemOpType is 0) and integer
+    // loads/stores (which are essentially the same as the MemIntrinsics with
+    // regard to copying padding between elements).  But, if an alloca is
+    // flagged as both a source and destination of such operations, we'll need
+    // to check later for padding between elements.
+    if (!MemOpType || MemOpType->isIntegerTy()) {
+      if (isStore)
+        Info.isMemCpyDst = true;
+      else
+        Info.isMemCpySrc = true;
+      return;
+    }
+    // This is also safe for references using a type that is compatible with
+    // the type of the alloca, so that loads/stores can be rewritten using
+    // insertvalue/extractvalue.
+    if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) {
+      Info.hasSubelementAccess = true;
+      return;
+    }
+  }
+  // Check if the offset/size correspond to a component within the alloca type.
+  Type *T = Info.AI->getAllocatedType();
+  if (TypeHasComponent(T, Offset, MemSize)) {
+    Info.hasSubelementAccess = true;
+    return;
+  }
+
+  return MarkUnsafe(Info, TheAccess);
+}
+
+/// TypeHasComponent - Return true if T has a component type with the
+/// specified offset and size.  If Size is zero, do not check the size.
+bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
+  Type *EltTy;
+  uint64_t EltSize;
+  if (StructType *ST = dyn_cast<StructType>(T)) {
+    const StructLayout *Layout = TD->getStructLayout(ST);
+    unsigned EltIdx = Layout->getElementContainingOffset(Offset);
+    EltTy = ST->getContainedType(EltIdx);
+    EltSize = TD->getTypeAllocSize(EltTy);
+    Offset -= Layout->getElementOffset(EltIdx);
+  } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
+    EltTy = AT->getElementType();
+    EltSize = TD->getTypeAllocSize(EltTy);
+    if (Offset >= AT->getNumElements() * EltSize)
+      return false;
+    Offset %= EltSize;
+  } else if (VectorType *VT = dyn_cast<VectorType>(T)) {
+    EltTy = VT->getElementType();
+    EltSize = TD->getTypeAllocSize(EltTy);
+    if (Offset >= VT->getNumElements() * EltSize)
+      return false;
+    Offset %= EltSize;
+  } else {
+    return false;
+  }
+  if (Offset == 0 && (Size == 0 || EltSize == Size))
+    return true;
+  // Check if the component spans multiple elements.
+  if (Offset + Size > EltSize)
+    return false;
+  return TypeHasComponent(EltTy, Offset, Size);
+}
+
+/// RewriteForScalarRepl - Alloca AI is being split into NewElts, so rewrite
+/// the instruction I, which references it, to use the separate elements.
+/// Offset indicates the position within AI that is referenced by this
+/// instruction.
+void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
+                                SmallVector<AllocaInst*, 32> &NewElts) {
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI++);
+
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) {
+      RewriteBitCast(BC, AI, Offset, NewElts);
+      continue;
+    }
+
+    if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) {
+      RewriteGEP(GEPI, AI, Offset, NewElts);
+      continue;
+    }
+
+    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+      ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength());
+      uint64_t MemSize = Length->getZExtValue();
+      if (Offset == 0 &&
+          MemSize == TD->getTypeAllocSize(AI->getAllocatedType()))
+        RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts);
+      // Otherwise the intrinsic can only touch a single element and the
+      // address operand will be updated, so nothing else needs to be done.
+      continue;
+    }
+
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+      if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+          II->getIntrinsicID() == Intrinsic::lifetime_end) {
+        RewriteLifetimeIntrinsic(II, AI, Offset, NewElts);
+      }
+      continue;
+    }
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      Type *LIType = LI->getType();
+
+      if (isCompatibleAggregate(LIType, AI->getAllocatedType())) {
+        // Replace:
+        //   %res = load { i32, i32 }* %alloc
+        // with:
+        //   %load.0 = load i32* %alloc.0
+        //   %insert.0 insertvalue { i32, i32 } zeroinitializer, i32 %load.0, 0
+        //   %load.1 = load i32* %alloc.1
+        //   %insert = insertvalue { i32, i32 } %insert.0, i32 %load.1, 1
+        // (Also works for arrays instead of structs)
+        Value *Insert = UndefValue::get(LIType);
+        IRBuilder<> Builder(LI);
+        for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+          Value *Load = Builder.CreateLoad(NewElts[i], "load");
+          Insert = Builder.CreateInsertValue(Insert, Load, i, "insert");
+        }
+        LI->replaceAllUsesWith(Insert);
+        DeadInsts.push_back(LI);
+      } else if (LIType->isIntegerTy() &&
+                 TD->getTypeAllocSize(LIType) ==
+                 TD->getTypeAllocSize(AI->getAllocatedType())) {
+        // If this is a load of the entire alloca to an integer, rewrite it.
+        RewriteLoadUserOfWholeAlloca(LI, AI, NewElts);
+      }
+      continue;
+    }
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+      Value *Val = SI->getOperand(0);
+      Type *SIType = Val->getType();
+      if (isCompatibleAggregate(SIType, AI->getAllocatedType())) {
+        // Replace:
+        //   store { i32, i32 } %val, { i32, i32 }* %alloc
+        // with:
+        //   %val.0 = extractvalue { i32, i32 } %val, 0
+        //   store i32 %val.0, i32* %alloc.0
+        //   %val.1 = extractvalue { i32, i32 } %val, 1
+        //   store i32 %val.1, i32* %alloc.1
+        // (Also works for arrays instead of structs)
+        IRBuilder<> Builder(SI);
+        for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+          Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName());
+          Builder.CreateStore(Extract, NewElts[i]);
+        }
+        DeadInsts.push_back(SI);
+      } else if (SIType->isIntegerTy() &&
+                 TD->getTypeAllocSize(SIType) ==
+                 TD->getTypeAllocSize(AI->getAllocatedType())) {
+        // If this is a store of the entire alloca from an integer, rewrite it.
+        RewriteStoreUserOfWholeAlloca(SI, AI, NewElts);
+      }
+      continue;
+    }
+
+    if (isa<SelectInst>(User) || isa<PHINode>(User)) {
+      // If we have a PHI user of the alloca itself (as opposed to a GEP or
+      // bitcast) we have to rewrite it.  GEP and bitcast uses will be RAUW'd to
+      // the new pointer.
+      if (!isa<AllocaInst>(I)) continue;
+
+      assert(Offset == 0 && NewElts[0] &&
+             "Direct alloca use should have a zero offset");
+
+      // If we have a use of the alloca, we know the derived uses will be
+      // utilizing just the first element of the scalarized result.  Insert a
+      // bitcast of the first alloca before the user as required.
+      AllocaInst *NewAI = NewElts[0];
+      BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI);
+      NewAI->moveBefore(BCI);
+      TheUse = BCI;
+      continue;
+    }
+  }
+}
+
+/// RewriteBitCast - Update a bitcast reference to the alloca being replaced
+/// and recursively continue updating all of its uses.
+void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
+                          SmallVector<AllocaInst*, 32> &NewElts) {
+  RewriteForScalarRepl(BC, AI, Offset, NewElts);
+  if (BC->getOperand(0) != AI)
+    return;
+
+  // The bitcast references the original alloca.  Replace its uses with
+  // references to the alloca containing offset zero (which is normally at
+  // index zero, but might not be in cases involving structs with elements
+  // of size zero).
+  Type *T = AI->getAllocatedType();
+  uint64_t EltOffset = 0;
+  Type *IdxTy;
+  uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy);
+  Instruction *Val = NewElts[Idx];
+  if (Val->getType() != BC->getDestTy()) {
+    Val = new BitCastInst(Val, BC->getDestTy(), "", BC);
+    Val->takeName(BC);
+  }
+  BC->replaceAllUsesWith(Val);
+  DeadInsts.push_back(BC);
+}
+
+/// FindElementAndOffset - Return the index of the element containing Offset
+/// within the specified type, which must be either a struct or an array.
+/// Sets T to the type of the element and Offset to the offset within that
+/// element.  IdxTy is set to the type of the index result to be used in a
+/// GEP instruction.
+uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
+                                    Type *&IdxTy) {
+  uint64_t Idx = 0;
+  if (StructType *ST = dyn_cast<StructType>(T)) {
+    const StructLayout *Layout = TD->getStructLayout(ST);
+    Idx = Layout->getElementContainingOffset(Offset);
+    T = ST->getContainedType(Idx);
+    Offset -= Layout->getElementOffset(Idx);
+    IdxTy = Type::getInt32Ty(T->getContext());
+    return Idx;
+  } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
+    T = AT->getElementType();
+    uint64_t EltSize = TD->getTypeAllocSize(T);
+    Idx = Offset / EltSize;
+    Offset -= Idx * EltSize;
+    IdxTy = Type::getInt64Ty(T->getContext());
+    return Idx;
+  }
+  VectorType *VT = cast<VectorType>(T);
+  T = VT->getElementType();
+  uint64_t EltSize = TD->getTypeAllocSize(T);
+  Idx = Offset / EltSize;
+  Offset -= Idx * EltSize;
+  IdxTy = Type::getInt64Ty(T->getContext());
+  return Idx;
+}
+
+/// RewriteGEP - Check if this GEP instruction moves the pointer across
+/// elements of the alloca that are being split apart, and if so, rewrite
+/// the GEP to be relative to the new element.
+void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
+                      SmallVector<AllocaInst*, 32> &NewElts) {
+  uint64_t OldOffset = Offset;
+  SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
+  // If the GEP was dynamic then it must have been a dynamic vector lookup.
+  // In this case, it must be the last GEP operand which is dynamic so keep that
+  // aside until we've found the constant GEP offset then add it back in at the
+  // end.
+  Value* NonConstantIdx = 0;
+  if (!GEPI->hasAllConstantIndices())
+    NonConstantIdx = Indices.pop_back_val();
+  Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
+
+  RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
+
+  Type *T = AI->getAllocatedType();
+  Type *IdxTy;
+  uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy);
+  if (GEPI->getOperand(0) == AI)
+    OldIdx = ~0ULL; // Force the GEP to be rewritten.
+
+  T = AI->getAllocatedType();
+  uint64_t EltOffset = Offset;
+  uint64_t Idx = FindElementAndOffset(T, EltOffset, IdxTy);
+
+  // If this GEP does not move the pointer across elements of the alloca
+  // being split, then it does not needs to be rewritten.
+  if (Idx == OldIdx)
+    return;
+
+  Type *i32Ty = Type::getInt32Ty(AI->getContext());
+  SmallVector<Value*, 8> NewArgs;
+  NewArgs.push_back(Constant::getNullValue(i32Ty));
+  while (EltOffset != 0) {
+    uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy);
+    NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx));
+  }
+  if (NonConstantIdx) {
+    Type* GepTy = T;
+    // This GEP has a dynamic index.  We need to add "i32 0" to index through
+    // any structs or arrays in the original type until we get to the vector
+    // to index.
+    while (!isa<VectorType>(GepTy)) {
+      NewArgs.push_back(Constant::getNullValue(i32Ty));
+      GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U);
+    }
+    NewArgs.push_back(NonConstantIdx);
+  }
+  Instruction *Val = NewElts[Idx];
+  if (NewArgs.size() > 1) {
+    Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI);
+    Val->takeName(GEPI);
+  }
+  if (Val->getType() != GEPI->getType())
+    Val = new BitCastInst(Val, GEPI->getType(), Val->getName(), GEPI);
+  GEPI->replaceAllUsesWith(Val);
+  DeadInsts.push_back(GEPI);
+}
+
+/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it
+/// to mark the lifetime of the scalarized memory.
+void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+                                    uint64_t Offset,
+                                    SmallVector<AllocaInst*, 32> &NewElts) {
+  ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0));
+  // Put matching lifetime markers on everything from Offset up to
+  // Offset+OldSize.
+  Type *AIType = AI->getAllocatedType();
+  uint64_t NewOffset = Offset;
+  Type *IdxTy;
+  uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy);
+
+  IRBuilder<> Builder(II);
+  uint64_t Size = OldSize->getLimitedValue();
+
+  if (NewOffset) {
+    // Splice the first element and index 'NewOffset' bytes in.  SROA will
+    // split the alloca again later.
+    Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy());
+    V = Builder.CreateGEP(V, Builder.getInt64(NewOffset));
+
+    IdxTy = NewElts[Idx]->getAllocatedType();
+    uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset;
+    if (EltSize > Size) {
+      EltSize = Size;
+      Size = 0;
+    } else {
+      Size -= EltSize;
+    }
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize));
+    else
+      Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize));
+    ++Idx;
+  }
+
+  for (; Idx != NewElts.size() && Size; ++Idx) {
+    IdxTy = NewElts[Idx]->getAllocatedType();
+    uint64_t EltSize = TD->getTypeAllocSize(IdxTy);
+    if (EltSize > Size) {
+      EltSize = Size;
+      Size = 0;
+    } else {
+      Size -= EltSize;
+    }
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      Builder.CreateLifetimeStart(NewElts[Idx],
+                                  Builder.getInt64(EltSize));
+    else
+      Builder.CreateLifetimeEnd(NewElts[Idx],
+                                Builder.getInt64(EltSize));
+  }
+  DeadInsts.push_back(II);
+}
+
+/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
+/// Rewrite it to copy or set the elements of the scalarized memory.
+void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
+                                        AllocaInst *AI,
+                                        SmallVector<AllocaInst*, 32> &NewElts) {
+  // If this is a memcpy/memmove, construct the other pointer as the
+  // appropriate type.  The "Other" pointer is the pointer that goes to memory
+  // that doesn't have anything to do with the alloca that we are promoting. For
+  // memset, this Value* stays null.
+  Value *OtherPtr = 0;
+  unsigned MemAlignment = MI->getAlignment();
+  if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { // memmove/memcopy
+    if (Inst == MTI->getRawDest())
+      OtherPtr = MTI->getRawSource();
+    else {
+      assert(Inst == MTI->getRawSource());
+      OtherPtr = MTI->getRawDest();
+    }
+  }
+
+  // If there is an other pointer, we want to convert it to the same pointer
+  // type as AI has, so we can GEP through it safely.
+  if (OtherPtr) {
+    unsigned AddrSpace =
+      cast<PointerType>(OtherPtr->getType())->getAddressSpace();
+
+    // Remove bitcasts and all-zero GEPs from OtherPtr.  This is an
+    // optimization, but it's also required to detect the corner case where
+    // both pointer operands are referencing the same memory, and where
+    // OtherPtr may be a bitcast or GEP that currently being rewritten.  (This
+    // function is only called for mem intrinsics that access the whole
+    // aggregate, so non-zero GEPs are not an issue here.)
+    OtherPtr = OtherPtr->stripPointerCasts();
+
+    // Copying the alloca to itself is a no-op: just delete it.
+    if (OtherPtr == AI || OtherPtr == NewElts[0]) {
+      // This code will run twice for a no-op memcpy -- once for each operand.
+      // Put only one reference to MI on the DeadInsts list.
+      for (SmallVector<Value*, 32>::const_iterator I = DeadInsts.begin(),
+             E = DeadInsts.end(); I != E; ++I)
+        if (*I == MI) return;
+      DeadInsts.push_back(MI);
+      return;
+    }
+
+    // If the pointer is not the right type, insert a bitcast to the right
+    // type.
+    Type *NewTy =
+      PointerType::get(AI->getType()->getElementType(), AddrSpace);
+
+    if (OtherPtr->getType() != NewTy)
+      OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI);
+  }
+
+  // Process each element of the aggregate.
+  bool SROADest = MI->getRawDest() == Inst;
+
+  Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext()));
+
+  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+    // If this is a memcpy/memmove, emit a GEP of the other element address.
+    Value *OtherElt = 0;
+    unsigned OtherEltAlign = MemAlignment;
+
+    if (OtherPtr) {
+      Value *Idx[2] = { Zero,
+                      ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) };
+      OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx,
+                                              OtherPtr->getName()+"."+Twine(i),
+                                                   MI);
+      uint64_t EltOffset;
+      PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
+      Type *OtherTy = OtherPtrTy->getElementType();
+      if (StructType *ST = dyn_cast<StructType>(OtherTy)) {
+        EltOffset = TD->getStructLayout(ST)->getElementOffset(i);
+      } else {
+        Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
+        EltOffset = TD->getTypeAllocSize(EltTy)*i;
+      }
+
+      // The alignment of the other pointer is the guaranteed alignment of the
+      // element, which is affected by both the known alignment of the whole
+      // mem intrinsic and the alignment of the element.  If the alignment of
+      // the memcpy (f.e.) is 32 but the element is at a 4-byte offset, then the
+      // known alignment is just 4 bytes.
+      OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset);
+    }
+
+    Value *EltPtr = NewElts[i];
+    Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
+
+    // If we got down to a scalar, insert a load or store as appropriate.
+    if (EltTy->isSingleValueType()) {
+      if (isa<MemTransferInst>(MI)) {
+        if (SROADest) {
+          // From Other to Alloca.
+          Value *Elt = new LoadInst(OtherElt, "tmp", false, OtherEltAlign, MI);
+          new StoreInst(Elt, EltPtr, MI);
+        } else {
+          // From Alloca to Other.
+          Value *Elt = new LoadInst(EltPtr, "tmp", MI);
+          new StoreInst(Elt, OtherElt, false, OtherEltAlign, MI);
+        }
+        continue;
+      }
+      assert(isa<MemSetInst>(MI));
+
+      // If the stored element is zero (common case), just store a null
+      // constant.
+      Constant *StoreVal;
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getArgOperand(1))) {
+        if (CI->isZero()) {
+          StoreVal = Constant::getNullValue(EltTy);  // 0.0, null, 0, <0,0>
+        } else {
+          // If EltTy is a vector type, get the element type.
+          Type *ValTy = EltTy->getScalarType();
+
+          // Construct an integer with the right value.
+          unsigned EltSize = TD->getTypeSizeInBits(ValTy);
+          APInt OneVal(EltSize, CI->getZExtValue());
+          APInt TotalVal(OneVal);
+          // Set each byte.
+          for (unsigned i = 0; 8*i < EltSize; ++i) {
+            TotalVal = TotalVal.shl(8);
+            TotalVal |= OneVal;
+          }
+
+          // Convert the integer value to the appropriate type.
+          StoreVal = ConstantInt::get(CI->getContext(), TotalVal);
+          if (ValTy->isPointerTy())
+            StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy);
+          else if (ValTy->isFloatingPointTy())
+            StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy);
+          assert(StoreVal->getType() == ValTy && "Type mismatch!");
+
+          // If the requested value was a vector constant, create it.
+          if (EltTy->isVectorTy()) {
+            unsigned NumElts = cast<VectorType>(EltTy)->getNumElements();
+            StoreVal = ConstantVector::getSplat(NumElts, StoreVal);
+          }
+        }
+        new StoreInst(StoreVal, EltPtr, MI);
+        continue;
+      }
+      // Otherwise, if we're storing a byte variable, use a memset call for
+      // this element.
+    }
+
+    unsigned EltSize = TD->getTypeAllocSize(EltTy);
+    if (!EltSize)
+      continue;
+
+    IRBuilder<> Builder(MI);
+
+    // Finally, insert the meminst for this element.
+    if (isa<MemSetInst>(MI)) {
+      Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize,
+                           MI->isVolatile());
+    } else {
+      assert(isa<MemTransferInst>(MI));
+      Value *Dst = SROADest ? EltPtr : OtherElt;  // Dest ptr
+      Value *Src = SROADest ? OtherElt : EltPtr;  // Src ptr
+
+      if (isa<MemCpyInst>(MI))
+        Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile());
+      else
+        Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile());
+    }
+  }
+  DeadInsts.push_back(MI);
+}
+
+/// RewriteStoreUserOfWholeAlloca - We found a store of an integer that
+/// overwrites the entire allocation.  Extract out the pieces of the stored
+/// integer and store them individually.
+void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
+                                         SmallVector<AllocaInst*, 32> &NewElts){
+  // Extract each element out of the integer according to its structure offset
+  // and store the element value to the individual alloca.
+  Value *SrcVal = SI->getOperand(0);
+  Type *AllocaEltTy = AI->getAllocatedType();
+  uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
+
+  IRBuilder<> Builder(SI);
+
+  // Handle tail padding by extending the operand
+  if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits)
+    SrcVal = Builder.CreateZExt(SrcVal,
+                            IntegerType::get(SI->getContext(), AllocaSizeBits));
+
+  DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI
+               << '\n');
+
+  // There are two forms here: AI could be an array or struct.  Both cases
+  // have different ways to compute the element offset.
+  if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+    const StructLayout *Layout = TD->getStructLayout(EltSTy);
+
+    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+      // Get the number of bits to shift SrcVal to get the value.
+      Type *FieldTy = EltSTy->getElementType(i);
+      uint64_t Shift = Layout->getElementOffsetInBits(i);
+
+      if (TD->isBigEndian())
+        Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy);
+
+      Value *EltVal = SrcVal;
+      if (Shift) {
+        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+        EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
+      }
+
+      // Truncate down to an integer of the right size.
+      uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+
+      // Ignore zero sized fields like {}, they obviously contain no data.
+      if (FieldSizeBits == 0) continue;
+
+      if (FieldSizeBits != AllocaSizeBits)
+        EltVal = Builder.CreateTrunc(EltVal,
+                             IntegerType::get(SI->getContext(), FieldSizeBits));
+      Value *DestField = NewElts[i];
+      if (EltVal->getType() == FieldTy) {
+        // Storing to an integer field of this size, just do it.
+      } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) {
+        // Bitcast to the right element type (for fp/vector values).
+        EltVal = Builder.CreateBitCast(EltVal, FieldTy);
+      } else {
+        // Otherwise, bitcast the dest pointer (for aggregates).
+        DestField = Builder.CreateBitCast(DestField,
+                                     PointerType::getUnqual(EltVal->getType()));
+      }
+      new StoreInst(EltVal, DestField, SI);
+    }
+
+  } else {
+    ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
+    Type *ArrayEltTy = ATy->getElementType();
+    uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
+    uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy);
+
+    uint64_t Shift;
+
+    if (TD->isBigEndian())
+      Shift = AllocaSizeBits-ElementOffset;
+    else
+      Shift = 0;
+
+    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+      // Ignore zero sized fields like {}, they obviously contain no data.
+      if (ElementSizeBits == 0) continue;
+
+      Value *EltVal = SrcVal;
+      if (Shift) {
+        Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift);
+        EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt");
+      }
+
+      // Truncate down to an integer of the right size.
+      if (ElementSizeBits != AllocaSizeBits)
+        EltVal = Builder.CreateTrunc(EltVal,
+                                     IntegerType::get(SI->getContext(),
+                                                      ElementSizeBits));
+      Value *DestField = NewElts[i];
+      if (EltVal->getType() == ArrayEltTy) {
+        // Storing to an integer field of this size, just do it.
+      } else if (ArrayEltTy->isFloatingPointTy() ||
+                 ArrayEltTy->isVectorTy()) {
+        // Bitcast to the right element type (for fp/vector values).
+        EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy);
+      } else {
+        // Otherwise, bitcast the dest pointer (for aggregates).
+        DestField = Builder.CreateBitCast(DestField,
+                                     PointerType::getUnqual(EltVal->getType()));
+      }
+      new StoreInst(EltVal, DestField, SI);
+
+      if (TD->isBigEndian())
+        Shift -= ElementOffset;
+      else
+        Shift += ElementOffset;
+    }
+  }
+
+  DeadInsts.push_back(SI);
+}
+
+/// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to
+/// an integer.  Load the individual pieces to form the aggregate value.
+void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
+                                        SmallVector<AllocaInst*, 32> &NewElts) {
+  // Extract each element out of the NewElts according to its structure offset
+  // and form the result value.
+  Type *AllocaEltTy = AI->getAllocatedType();
+  uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
+
+  DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
+               << '\n');
+
+  // There are two forms here: AI could be an array or struct.  Both cases
+  // have different ways to compute the element offset.
+  const StructLayout *Layout = 0;
+  uint64_t ArrayEltBitOffset = 0;
+  if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+    Layout = TD->getStructLayout(EltSTy);
+  } else {
+    Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
+    ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
+  }
+
+  Value *ResultVal =
+    Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits));
+
+  for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+    // Load the value from the alloca.  If the NewElt is an aggregate, cast
+    // the pointer to an integer of the same size before doing the load.
+    Value *SrcField = NewElts[i];
+    Type *FieldTy =
+      cast<PointerType>(SrcField->getType())->getElementType();
+    uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
+
+    // Ignore zero sized fields like {}, they obviously contain no data.
+    if (FieldSizeBits == 0) continue;
+
+    IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
+                                                     FieldSizeBits);
+    if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() &&
+        !FieldTy->isVectorTy())
+      SrcField = new BitCastInst(SrcField,
+                                 PointerType::getUnqual(FieldIntTy),
+                                 "", LI);
+    SrcField = new LoadInst(SrcField, "sroa.load.elt", LI);
+
+    // If SrcField is a fp or vector of the right size but that isn't an
+    // integer type, bitcast to an integer so we can shift it.
+    if (SrcField->getType() != FieldIntTy)
+      SrcField = new BitCastInst(SrcField, FieldIntTy, "", LI);
+
+    // Zero extend the field to be the same size as the final alloca so that
+    // we can shift and insert it.
+    if (SrcField->getType() != ResultVal->getType())
+      SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI);
+
+    // Determine the number of bits to shift SrcField.
+    uint64_t Shift;
+    if (Layout) // Struct case.
+      Shift = Layout->getElementOffsetInBits(i);
+    else  // Array case.
+      Shift = i*ArrayEltBitOffset;
+
+    if (TD->isBigEndian())
+      Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth();
+
+    if (Shift) {
+      Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift);
+      SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI);
+    }
+
+    // Don't create an 'or x, 0' on the first iteration.
+    if (!isa<Constant>(ResultVal) ||
+        !cast<Constant>(ResultVal)->isNullValue())
+      ResultVal = BinaryOperator::CreateOr(SrcField, ResultVal, "", LI);
+    else
+      ResultVal = SrcField;
+  }
+
+  // Handle tail padding by truncating the result
+  if (TD->getTypeSizeInBits(LI->getType()) != AllocaSizeBits)
+    ResultVal = new TruncInst(ResultVal, LI->getType(), "", LI);
+
+  LI->replaceAllUsesWith(ResultVal);
+  DeadInsts.push_back(LI);
+}
+
+/// HasPadding - Return true if the specified type has any structure or
+/// alignment padding in between the elements that would be split apart
+/// by SROA; return false otherwise.
+static bool HasPadding(Type *Ty, const DataLayout &TD) {
+  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    Ty = ATy->getElementType();
+    return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
+  }
+
+  // SROA currently handles only Arrays and Structs.
+  StructType *STy = cast<StructType>(Ty);
+  const StructLayout *SL = TD.getStructLayout(STy);
+  unsigned PrevFieldBitOffset = 0;
+  for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+    unsigned FieldBitOffset = SL->getElementOffsetInBits(i);
+
+    // Check to see if there is any padding between this element and the
+    // previous one.
+    if (i) {
+      unsigned PrevFieldEnd =
+        PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1));
+      if (PrevFieldEnd < FieldBitOffset)
+        return true;
+    }
+    PrevFieldBitOffset = FieldBitOffset;
+  }
+  // Check for tail padding.
+  if (unsigned EltCount = STy->getNumElements()) {
+    unsigned PrevFieldEnd = PrevFieldBitOffset +
+      TD.getTypeSizeInBits(STy->getElementType(EltCount-1));
+    if (PrevFieldEnd < SL->getSizeInBits())
+      return true;
+  }
+  return false;
+}
+
+/// isSafeStructAllocaToScalarRepl - Check to see if the specified allocation of
+/// an aggregate can be broken down into elements.  Return 0 if not, 3 if safe,
+/// or 1 if safe after canonicalization has been performed.
+bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
+  // Loop over the use list of the alloca.  We can only transform it if all of
+  // the users are safe to transform.
+  AllocaInfo Info(AI);
+
+  isSafeForScalarRepl(AI, 0, Info);
+  if (Info.isUnsafe) {
+    DEBUG(dbgs() << "Cannot transform: " << *AI << '\n');
+    return false;
+  }
+
+  // Okay, we know all the users are promotable.  If the aggregate is a memcpy
+  // source and destination, we have to be careful.  In particular, the memcpy
+  // could be moving around elements that live in structure padding of the LLVM
+  // types, but may actually be used.  In these cases, we refuse to promote the
+  // struct.
+  if (Info.isMemCpySrc && Info.isMemCpyDst &&
+      HasPadding(AI->getAllocatedType(), *TD))
+    return false;
+
+  // If the alloca never has an access to just *part* of it, but is accessed
+  // via loads and stores, then we should use ConvertToScalarInfo to promote
+  // the alloca instead of promoting each piece at a time and inserting fission
+  // and fusion code.
+  if (!Info.hasSubelementAccess && Info.hasALoadOrStore) {
+    // If the struct/array just has one element, use basic SRoA.
+    if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+      if (ST->getNumElements() > 1) return false;
+    } else {
+      if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1)
+        return false;
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
new file mode 100644
index 000000000000..c243d34fd7db
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -0,0 +1,350 @@
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations.  For example:
+//
+//   * Removes basic blocks with no predecessors.
+//   * Merges a basic block into its predecessor if there is only one and the
+//     predecessor only has one successor.
+//   * Eliminates PHI nodes for basic blocks with a single predecessor.
+//   * Eliminates a basic block that only contains an unconditional branch.
+//   * Changes invoke instructions to nounwind functions to be calls.
+//   * Change things like "if (x) if (y)" into "if (x&y)".
+//   * etc..
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplifycfg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+namespace {
+  struct CFGSimplifyPass : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    CFGSimplifyPass() : FunctionPass(ID) {
+      initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetTransformInfo>();
+    }
+  };
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG",
+                      false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG",
+                    false, false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *llvm::createCFGSimplificationPass() {
+  return new CFGSimplifyPass();
+}
+
+/// changeToUnreachable - Insert an unreachable instruction before the specified
+/// instruction, making it and the rest of the code in the block dead.
+static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+  BasicBlock *BB = I->getParent();
+  // Loop over all of the successors, removing BB's entry from any PHI
+  // nodes.
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    (*SI)->removePredecessor(BB);
+
+  // Insert a call to llvm.trap right before this.  This turns the undefined
+  // behavior into a hard fail instead of falling through into random code.
+  if (UseLLVMTrap) {
+    Function *TrapFn =
+      Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
+    CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
+    CallTrap->setDebugLoc(I->getDebugLoc());
+  }
+  new UnreachableInst(I->getContext(), I);
+
+  // All instructions after this are dead.
+  BasicBlock::iterator BBI = I, BBE = BB->end();
+  while (BBI != BBE) {
+    if (!BBI->use_empty())
+      BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+    BB->getInstList().erase(BBI++);
+  }
+}
+
+/// changeToCall - Convert the specified invoke into a normal call.
+static void changeToCall(InvokeInst *II) {
+  SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
+  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
+  NewCall->takeName(II);
+  NewCall->setCallingConv(II->getCallingConv());
+  NewCall->setAttributes(II->getAttributes());
+  NewCall->setDebugLoc(II->getDebugLoc());
+  II->replaceAllUsesWith(NewCall);
+
+  // Follow the call by a branch to the normal destination.
+  BranchInst::Create(II->getNormalDest(), II);
+
+  // Update PHI nodes in the unwind destination
+  II->getUnwindDest()->removePredecessor(II->getParent());
+  II->eraseFromParent();
+}
+
+static bool markAliveBlocks(BasicBlock *BB,
+                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
+
+  SmallVector<BasicBlock*, 128> Worklist;
+  Worklist.push_back(BB);
+  Reachable.insert(BB);
+  bool Changed = false;
+  do {
+    BB = Worklist.pop_back_val();
+
+    // Do a quick scan of the basic block, turning any obviously unreachable
+    // instructions into LLVM unreachable insts.  The instruction combining pass
+    // canonicalizes unreachable insts into stores to null or undef.
+    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
+      if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+        if (CI->doesNotReturn()) {
+          // If we found a call to a no-return function, insert an unreachable
+          // instruction after it.  Make sure there isn't *already* one there
+          // though.
+          ++BBI;
+          if (!isa<UnreachableInst>(BBI)) {
+            // Don't insert a call to llvm.trap right before the unreachable.
+            changeToUnreachable(BBI, false);
+            Changed = true;
+          }
+          break;
+        }
+      }
+
+      // Store to undef and store to null are undefined and used to signal that
+      // they should be changed to unreachable by passes that can't modify the
+      // CFG.
+      if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+        // Don't touch volatile stores.
+        if (SI->isVolatile()) continue;
+
+        Value *Ptr = SI->getOperand(1);
+
+        if (isa<UndefValue>(Ptr) ||
+            (isa<ConstantPointerNull>(Ptr) &&
+             SI->getPointerAddressSpace() == 0)) {
+          changeToUnreachable(SI, true);
+          Changed = true;
+          break;
+        }
+      }
+    }
+
+    // Turn invokes that call 'nounwind' functions into ordinary calls.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      Value *Callee = II->getCalledValue();
+      if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+        changeToUnreachable(II, true);
+        Changed = true;
+      } else if (II->doesNotThrow()) {
+        if (II->use_empty() && II->onlyReadsMemory()) {
+          // jump to the normal destination branch.
+          BranchInst::Create(II->getNormalDest(), II);
+          II->getUnwindDest()->removePredecessor(II->getParent());
+          II->eraseFromParent();
+        } else
+          changeToCall(II);
+        Changed = true;
+      }
+    }
+
+    Changed |= ConstantFoldTerminator(BB, true);
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+      if (Reachable.insert(*SI))
+        Worklist.push_back(*SI);
+  } while (!Worklist.empty());
+  return Changed;
+}
+
+/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// if they are in a dead cycle.  Return true if a change was made, false
+/// otherwise.
+static bool removeUnreachableBlocksFromFn(Function &F) {
+  SmallPtrSet<BasicBlock*, 128> Reachable;
+  bool Changed = markAliveBlocks(F.begin(), Reachable);
+
+  // If there are unreachable blocks in the CFG...
+  if (Reachable.size() == F.size())
+    return Changed;
+
+  assert(Reachable.size() < F.size());
+  NumSimpl += F.size()-Reachable.size();
+
+  // Loop over all of the basic blocks that are not reachable, dropping all of
+  // their internal references...
+  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+    if (Reachable.count(BB))
+      continue;
+
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+      if (Reachable.count(*SI))
+        (*SI)->removePredecessor(BB);
+    BB->dropAllReferences();
+  }
+
+  for (Function::iterator I = ++F.begin(); I != F.end();)
+    if (!Reachable.count(I))
+      I = F.getBasicBlockList().erase(I);
+    else
+      ++I;
+
+  return true;
+}
+
+/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
+/// node) return blocks, merge them together to promote recursive block merging.
+static bool mergeEmptyReturnBlocks(Function &F) {
+  bool Changed = false;
+
+  BasicBlock *RetBlock = 0;
+
+  // Scan all the blocks in the function, looking for empty return blocks.
+  for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
+    BasicBlock &BB = *BBI++;
+
+    // Only look at return blocks.
+    ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
+    if (Ret == 0) continue;
+
+    // Only look at the block if it is empty or the only other thing in it is a
+    // single PHI node that is the operand to the return.
+    if (Ret != &BB.front()) {
+      // Check for something else in the block.
+      BasicBlock::iterator I = Ret;
+      --I;
+      // Skip over debug info.
+      while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
+        --I;
+      if (!isa<DbgInfoIntrinsic>(I) &&
+          (!isa<PHINode>(I) || I != BB.begin() ||
+           Ret->getNumOperands() == 0 ||
+           Ret->getOperand(0) != I))
+        continue;
+    }
+
+    // If this is the first returning block, remember it and keep going.
+    if (RetBlock == 0) {
+      RetBlock = &BB;
+      continue;
+    }
+
+    // Otherwise, we found a duplicate return block.  Merge the two.
+    Changed = true;
+
+    // Case when there is no input to the return or when the returned values
+    // agree is trivial.  Note that they can't agree if there are phis in the
+    // blocks.
+    if (Ret->getNumOperands() == 0 ||
+        Ret->getOperand(0) ==
+          cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+      BB.replaceAllUsesWith(RetBlock);
+      BB.eraseFromParent();
+      continue;
+    }
+
+    // If the canonical return block has no PHI node, create one now.
+    PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
+    if (RetBlockPHI == 0) {
+      Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
+      pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
+      RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
+                                    std::distance(PB, PE), "merge",
+                                    &RetBlock->front());
+
+      for (pred_iterator PI = PB; PI != PE; ++PI)
+        RetBlockPHI->addIncoming(InVal, *PI);
+      RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
+    }
+
+    // Turn BB into a block that just unconditionally branches to the return
+    // block.  This handles the case when the two return blocks have a common
+    // predecessor but that return different things.
+    RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
+    BB.getTerminator()->eraseFromParent();
+    BranchInst::Create(RetBlock, &BB);
+  }
+
+  return Changed;
+}
+
+/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
+                                   const DataLayout *TD) {
+  bool Changed = false;
+  bool LocalChange = true;
+  while (LocalChange) {
+    LocalChange = false;
+
+    // Loop over all of the basic blocks and remove them if they are unneeded...
+    //
+    for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
+      if (SimplifyCFG(BBIt++, TTI, TD)) {
+        LocalChange = true;
+        ++NumSimpl;
+      }
+    }
+    Changed |= LocalChange;
+  }
+  return Changed;
+}
+
+// It is possible that we may require multiple passes over the code to fully
+// simplify the CFG.
+//
+bool CFGSimplifyPass::runOnFunction(Function &F) {
+  const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
+  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+  bool EverChanged = removeUnreachableBlocksFromFn(F);
+  EverChanged |= mergeEmptyReturnBlocks(F);
+  EverChanged |= iterativelySimplifyCFG(F, TTI, TD);
+
+  // If neither pass changed anything, we're done.
+  if (!EverChanged) return false;
+
+  // iterativelySimplifyCFG can (rarely) make some loops dead.  If this happens,
+  // removeUnreachableBlocksFromFn is needed to nuke them, which means we should
+  // iterate between the two optimizations.  We structure the code like this to
+  // avoid reruning iterativelySimplifyCFG if the second pass of
+  // removeUnreachableBlocksFromFn doesn't do anything.
+  if (!removeUnreachableBlocksFromFn(F))
+    return true;
+
+  do {
+    EverChanged = iterativelySimplifyCFG(F, TTI, TD);
+    EverChanged |= removeUnreachableBlocksFromFn(F);
+  } while (EverChanged);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp
new file mode 100644
index 000000000000..3514e6c2aadc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -0,0 +1,247 @@
+//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple pass that applies a variety of small
+// optimizations for calls to specific well-known function calls (e.g. runtime
+// library functions).   Any optimization that takes the very simple form
+// "replace call to library function with simpler code that provides the same
+// result" belongs in this file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplify-libcalls"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/config.h"            // FIXME: Shouldn't depend on host!
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+// Optimizer Base Class
+//===----------------------------------------------------------------------===//
+
+/// This class is the abstract base class for the set of optimizations that
+/// corresponds to one library call.
+namespace {
+class LibCallOptimization {
+protected:
+  Function *Caller;
+  const DataLayout *TD;
+  const TargetLibraryInfo *TLI;
+  LLVMContext* Context;
+public:
+  LibCallOptimization() { }
+  virtual ~LibCallOptimization() {}
+
+  /// CallOptimizer - This pure virtual method is implemented by base classes to
+  /// do various optimizations.  If this returns null then no transformation was
+  /// performed.  If it returns CI, then it transformed the call and CI is to be
+  /// deleted.  If it returns something else, replace CI with the new value and
+  /// delete CI.
+  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B)
+    =0;
+
+  Value *OptimizeCall(CallInst *CI, const DataLayout *TD,
+                      const TargetLibraryInfo *TLI, IRBuilder<> &B) {
+    Caller = CI->getParent()->getParent();
+    this->TD = TD;
+    this->TLI = TLI;
+    if (CI->getCalledFunction())
+      Context = &CI->getCalledFunction()->getContext();
+
+    // We never change the calling convention.
+    if (CI->getCallingConv() != llvm::CallingConv::C)
+      return NULL;
+
+    return CallOptimizer(CI->getCalledFunction(), CI, B);
+  }
+};
+} // End anonymous namespace.
+
+
+//===----------------------------------------------------------------------===//
+// SimplifyLibCalls Pass Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// This pass optimizes well known library functions from libc and libm.
+  ///
+  class SimplifyLibCalls : public FunctionPass {
+    TargetLibraryInfo *TLI;
+
+    StringMap<LibCallOptimization*> Optimizations;
+  public:
+    static char ID; // Pass identification
+    SimplifyLibCalls() : FunctionPass(ID) {
+      initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
+    }
+    void AddOpt(LibFunc::Func F, LibCallOptimization* Opt);
+    void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt);
+
+    void InitOptimizations();
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetLibraryInfo>();
+    }
+  };
+} // end anonymous namespace.
+
+char SimplifyLibCalls::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SimplifyLibCalls, "simplify-libcalls",
+                      "Simplify well-known library calls", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(SimplifyLibCalls, "simplify-libcalls",
+                    "Simplify well-known library calls", false, false)
+
+// Public interface to the Simplify LibCalls pass.
+FunctionPass *llvm::createSimplifyLibCallsPass() {
+  return new SimplifyLibCalls();
+}
+
+void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) {
+  if (TLI->has(F))
+    Optimizations[TLI->getName(F)] = Opt;
+}
+
+void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
+                              LibCallOptimization* Opt) {
+  if (TLI->has(F1) && TLI->has(F2))
+    Optimizations[TLI->getName(F1)] = Opt;
+}
+
+/// Optimizations - Populate the Optimizations map with all the optimizations
+/// we know.
+void SimplifyLibCalls::InitOptimizations() {
+}
+
+
+/// runOnFunction - Top level algorithm.
+///
+bool SimplifyLibCalls::runOnFunction(Function &F) {
+  TLI = &getAnalysis<TargetLibraryInfo>();
+
+  if (Optimizations.empty())
+    InitOptimizations();
+
+  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+
+  IRBuilder<> Builder(F.getContext());
+
+  bool Changed = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+      // Ignore non-calls.
+      CallInst *CI = dyn_cast<CallInst>(I++);
+      if (!CI || CI->hasFnAttr(Attribute::NoBuiltin)) continue;
+
+      // Ignore indirect calls and calls to non-external functions.
+      Function *Callee = CI->getCalledFunction();
+      if (Callee == 0 || !Callee->isDeclaration() ||
+          !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage()))
+        continue;
+
+      // Ignore unknown calls.
+      LibCallOptimization *LCO = Optimizations.lookup(Callee->getName());
+      if (!LCO) continue;
+
+      // Set the builder to the instruction after the call.
+      Builder.SetInsertPoint(BB, I);
+
+      // Use debug location of CI for all new instructions.
+      Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+      // Try to optimize this call.
+      Value *Result = LCO->OptimizeCall(CI, TD, TLI, Builder);
+      if (Result == 0) continue;
+
+      DEBUG(dbgs() << "SimplifyLibCalls simplified: " << *CI;
+            dbgs() << "  into: " << *Result << "\n");
+
+      // Something changed!
+      Changed = true;
+
+      // Inspect the instruction after the call (which was potentially just
+      // added) next.
+      I = CI; ++I;
+
+      if (CI != Result && !CI->use_empty()) {
+        CI->replaceAllUsesWith(Result);
+        if (!Result->hasName())
+          Result->takeName(CI);
+      }
+      CI->eraseFromParent();
+    }
+  }
+  return Changed;
+}
+
+// TODO:
+//   Additional cases that we need to add to this file:
+//
+// cbrt:
+//   * cbrt(expN(X))  -> expN(x/3)
+//   * cbrt(sqrt(x))  -> pow(x,1/6)
+//   * cbrt(sqrt(x))  -> pow(x,1/9)
+//
+// exp, expf, expl:
+//   * exp(log(x))  -> x
+//
+// log, logf, logl:
+//   * log(exp(x))   -> x
+//   * log(x**y)     -> y*log(x)
+//   * log(exp(y))   -> y*log(e)
+//   * log(exp2(y))  -> y*log(2)
+//   * log(exp10(y)) -> y*log(10)
+//   * log(sqrt(x))  -> 0.5*log(x)
+//   * log(pow(x,y)) -> y*log(x)
+//
+// lround, lroundf, lroundl:
+//   * lround(cnst) -> cnst'
+//
+// pow, powf, powl:
+//   * pow(exp(x),y)  -> exp(x*y)
+//   * pow(sqrt(x),y) -> pow(x,y*0.5)
+//   * pow(pow(x,y),z)-> pow(x,y*z)
+//
+// round, roundf, roundl:
+//   * round(cnst) -> cnst'
+//
+// signbit:
+//   * signbit(cnst) -> cnst'
+//   * signbit(nncst) -> 0 (if pstv is a non-negative constant)
+//
+// sqrt, sqrtf, sqrtl:
+//   * sqrt(expN(x))  -> expN(x*0.5)
+//   * sqrt(Nroot(x)) -> pow(x,1/(2*N))
+//   * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
+//
+// strchr:
+//   * strchr(p, 0) -> strlen(p)
+// tan, tanf, tanl:
+//   * tan(atan(x)) -> x
+//
+// trunc, truncf, truncl:
+//   * trunc(cnst) -> cnst'
+//
+//
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
new file mode 100644
index 000000000000..d4595bb373e6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -0,0 +1,270 @@
+//===-- Sink.cpp - Code Sinking -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks, when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sink"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumSunk, "Number of instructions sunk");
+STATISTIC(NumSinkIter, "Number of sinking iterations");
+
+namespace {
+  class Sinking : public FunctionPass {
+    DominatorTree *DT;
+    LoopInfo *LI;
+    AliasAnalysis *AA;
+
+  public:
+    static char ID; // Pass identification
+    Sinking() : FunctionPass(ID) {
+      initializeSinkingPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      FunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<LoopInfo>();
+    }
+  private:
+    bool ProcessBlock(BasicBlock &BB);
+    bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores);
+    bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const;
+    bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const;
+  };
+} // end anonymous namespace
+
+char Sinking::ID = 0;
+INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false)
+
+FunctionPass *llvm::createSinkingPass() { return new Sinking(); }
+
+/// AllUsesDominatedByBlock - Return true if all uses of the specified value
+/// occur in blocks dominated by the specified block.
+bool Sinking::AllUsesDominatedByBlock(Instruction *Inst,
+                                      BasicBlock *BB) const {
+  // Ignoring debug uses is necessary so debug info doesn't affect the code.
+  // This may leave a referencing dbg_value in the original block, before
+  // the definition of the vreg.  Dwarf generator handles this although the
+  // user might not get the right info at runtime.
+  for (Value::use_iterator I = Inst->use_begin(),
+       E = Inst->use_end(); I != E; ++I) {
+    // Determine the block of the use.
+    Instruction *UseInst = cast<Instruction>(*I);
+    BasicBlock *UseBlock = UseInst->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(UseInst)) {
+      // PHI nodes use the operand in the predecessor block, not the block with
+      // the PHI.
+      unsigned Num = PHINode::getIncomingValueNumForOperand(I.getOperandNo());
+      UseBlock = PN->getIncomingBlock(Num);
+    }
+    // Check that it dominates.
+    if (!DT->dominates(BB, UseBlock))
+      return false;
+  }
+  return true;
+}
+
+bool Sinking::runOnFunction(Function &F) {
+  DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+  AA = &getAnalysis<AliasAnalysis>();
+
+  bool MadeChange, EverMadeChange = false;
+
+  do {
+    MadeChange = false;
+    DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+    // Process all basic blocks.
+    for (Function::iterator I = F.begin(), E = F.end();
+         I != E; ++I)
+      MadeChange |= ProcessBlock(*I);
+    EverMadeChange |= MadeChange;
+    NumSinkIter++;
+  } while (MadeChange);
+
+  return EverMadeChange;
+}
+
+bool Sinking::ProcessBlock(BasicBlock &BB) {
+  // Can't sink anything out of a block that has less than two successors.
+  if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false;
+
+  // Don't bother sinking code out of unreachable blocks. In addition to being
+  // unprofitable, it can also lead to infinite looping, because in an
+  // unreachable loop there may be nowhere to stop.
+  if (!DT->isReachableFromEntry(&BB)) return false;
+
+  bool MadeChange = false;
+
+  // Walk the basic block bottom-up.  Remember if we saw a store.
+  BasicBlock::iterator I = BB.end();
+  --I;
+  bool ProcessedBegin = false;
+  SmallPtrSet<Instruction *, 8> Stores;
+  do {
+    Instruction *Inst = I;  // The instruction to sink.
+
+    // Predecrement I (if it's not begin) so that it isn't invalidated by
+    // sinking.
+    ProcessedBegin = I == BB.begin();
+    if (!ProcessedBegin)
+      --I;
+
+    if (isa<DbgInfoIntrinsic>(Inst))
+      continue;
+
+    if (SinkInstruction(Inst, Stores))
+      ++NumSunk, MadeChange = true;
+
+    // If we just processed the first instruction in the block, we're done.
+  } while (!ProcessedBegin);
+
+  return MadeChange;
+}
+
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
+                         SmallPtrSet<Instruction *, 8> &Stores) {
+
+  if (Inst->mayWriteToMemory()) {
+    Stores.insert(Inst);
+    return false;
+  }
+
+  if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
+    AliasAnalysis::Location Loc = AA->getLocation(L);
+    for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(),
+         E = Stores.end(); I != E; ++I)
+      if (AA->getModRefInfo(*I, Loc) & AliasAnalysis::Mod)
+        return false;
+  }
+
+  if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst))
+    return false;
+
+  return true;
+}
+
+/// IsAcceptableTarget - Return true if it is possible to sink the instruction
+/// in the specified basic block.
+bool Sinking::IsAcceptableTarget(Instruction *Inst,
+                                 BasicBlock *SuccToSinkTo) const {
+  assert(Inst && "Instruction to be sunk is null");
+  assert(SuccToSinkTo && "Candidate sink target is null");
+
+  // It is not possible to sink an instruction into its own block.  This can
+  // happen with loops.
+  if (Inst->getParent() == SuccToSinkTo)
+    return false;
+
+  // If the block has multiple predecessors, this would introduce computation
+  // on different code paths.  We could split the critical edge, but for now we
+  // just punt.
+  // FIXME: Split critical edges if not backedges.
+  if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
+    // We cannot sink a load across a critical edge - there may be stores in
+    // other code paths.
+    if (!isSafeToSpeculativelyExecute(Inst))
+      return false;
+
+    // We don't want to sink across a critical edge if we don't dominate the
+    // successor. We could be introducing calculations to new code paths.
+    if (!DT->dominates(Inst->getParent(), SuccToSinkTo))
+      return false;
+
+    // Don't sink instructions into a loop.
+    Loop *succ = LI->getLoopFor(SuccToSinkTo);
+    Loop *cur = LI->getLoopFor(Inst->getParent());
+    if (succ != 0 && succ != cur)
+      return false;
+  }
+
+  // Finally, check that all the uses of the instruction are actually
+  // dominated by the candidate
+  return AllUsesDominatedByBlock(Inst, SuccToSinkTo);
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified machine
+/// instruction out of its current block into a successor.
+bool Sinking::SinkInstruction(Instruction *Inst,
+                              SmallPtrSet<Instruction *, 8> &Stores) {
+  // Check if it's safe to move the instruction.
+  if (!isSafeToMove(Inst, AA, Stores))
+    return false;
+
+  // FIXME: This should include support for sinking instructions within the
+  // block they are currently in to shorten the live ranges.  We often get
+  // instructions sunk into the top of a large block, but it would be better to
+  // also sink them down before their first use in the block.  This xform has to
+  // be careful not to *increase* register pressure though, e.g. sinking
+  // "x = y + z" down if it kills y and z would increase the live ranges of y
+  // and z and only shrink the live range of x.
+
+  // SuccToSinkTo - This is the successor to sink this instruction to, once we
+  // decide.
+  BasicBlock *SuccToSinkTo = 0;
+
+  // Instructions can only be sunk if all their uses are in blocks
+  // dominated by one of the successors.
+  // Look at all the postdominators and see if we can sink it in one.
+  DomTreeNode *DTN = DT->getNode(Inst->getParent());
+  for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
+      I != E && SuccToSinkTo == 0; ++I) {
+    BasicBlock *Candidate = (*I)->getBlock();
+    if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
+        IsAcceptableTarget(Inst, Candidate))
+      SuccToSinkTo = Candidate;
+  }
+
+  // If no suitable postdominator was found, look at all the successors and
+  // decide which one we should sink to, if any.
+  for (succ_iterator I = succ_begin(Inst->getParent()),
+      E = succ_end(Inst->getParent()); I != E && SuccToSinkTo == 0; ++I) {
+    if (IsAcceptableTarget(Inst, *I))
+      SuccToSinkTo = *I;
+  }
+
+  // If we couldn't find a block to sink to, ignore this instruction.
+  if (SuccToSinkTo == 0)
+    return false;
+
+  DEBUG(dbgs() << "Sink" << *Inst << " (";
+        WriteAsOperand(dbgs(), Inst->getParent(), false);
+        dbgs() << " -> ";
+        WriteAsOperand(dbgs(), SuccToSinkTo, false);
+        dbgs() << ")\n");
+
+  // Move the instruction.
+  Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt());
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
new file mode 100644
index 000000000000..2002e680d195
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -0,0 +1,648 @@
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop.  This pass also implements the following extensions to the basic
+// algorithm:
+//
+//  1. Trivial instructions between the call and return do not prevent the
+//     transformation from taking place, though currently the analysis cannot
+//     support moving any really useful instructions (only dead ones).
+//  2. This pass transforms functions that are prevented from being tail
+//     recursive by an associative and commutative expression to use an
+//     accumulator variable, thus compiling the typical naive factorial or
+//     'fib' implementation into efficient code.
+//  3. TRE is performed if the function returns void, if the return
+//     returns the result returned by the call, or if the function returns a
+//     run-time constant on all exits from the function.  It is possible, though
+//     unlikely, that the return returns something else (like constant 0), and
+//     can still be TRE'd.  It can be TRE'd if ALL OTHER return instructions in
+//     the function return the exact same value.
+//  4. If it can prove that callees do not access their caller stack frame,
+//     they are marked as eligible for tail call elimination (by the code
+//     generator).
+//
+// There are several improvements that could be made:
+//
+//  1. If the function has any alloca instructions, these instructions will be
+//     moved out of the entry block of the function, causing them to be
+//     evaluated each time through the tail recursion.  Safely keeping allocas
+//     in the entry block requires analysis to proves that the tail-called
+//     function does not read or write the stack object.
+//  2. Tail recursion is only performed if the call immediately precedes the
+//     return instruction.  It's possible that there could be a jump between
+//     the call and the return.
+//  3. There can be intervening operations between the call and the return that
+//     prevent the TRE from occurring.  For example, there could be GEP's and
+//     stores to memory that will not be read or written by the call.  This
+//     requires some substantial analysis (such as with DSA) to prove safe to
+//     move ahead of the call, but doing so could allow many more TREs to be
+//     performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+//  4. The algorithm we use to detect if callees access their caller stack
+//     frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tailcallelim"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumRetDuped,   "Number of return duplicated");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+namespace {
+  struct TailCallElim : public FunctionPass {
+    const TargetTransformInfo *TTI;
+
+    static char ID; // Pass identification, replacement for typeid
+    TailCallElim() : FunctionPass(ID) {
+      initializeTailCallElimPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+    virtual bool runOnFunction(Function &F);
+
+  private:
+    CallInst *FindTRECandidate(Instruction *I,
+                               bool CannotTailCallElimCallsMarkedTail);
+    bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
+                                    BasicBlock *&OldEntry,
+                                    bool &TailCallsAreMarkedTail,
+                                    SmallVector<PHINode*, 8> &ArgumentPHIs,
+                                    bool CannotTailCallElimCallsMarkedTail);
+    bool FoldReturnAndProcessPred(BasicBlock *BB,
+                                  ReturnInst *Ret, BasicBlock *&OldEntry,
+                                  bool &TailCallsAreMarkedTail,
+                                  SmallVector<PHINode*, 8> &ArgumentPHIs,
+                                  bool CannotTailCallElimCallsMarkedTail);
+    bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry,
+                               bool &TailCallsAreMarkedTail,
+                               SmallVector<PHINode*, 8> &ArgumentPHIs,
+                               bool CannotTailCallElimCallsMarkedTail);
+    bool CanMoveAboveCall(Instruction *I, CallInst *CI);
+    Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI);
+  };
+}
+
+char TailCallElim::ID = 0;
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim",
+                      "Tail Call Elimination", false, false)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim",
+                    "Tail Call Elimination", false, false)
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+  return new TailCallElim();
+}
+
+void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetTransformInfo>();
+}
+
+/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by
+/// callees of this function.  We only do very simple analysis right now, this
+/// could be expanded in the future to use mod/ref information for particular
+/// call sites if desired.
+static bool AllocaMightEscapeToCalls(AllocaInst *AI) {
+  // FIXME: do simple 'address taken' analysis.
+  return true;
+}
+
+/// CheckForEscapingAllocas - Scan the specified basic block for alloca
+/// instructions.  If it contains any that might be accessed by calls, return
+/// true.
+static bool CheckForEscapingAllocas(BasicBlock *BB,
+                                    bool &CannotTCETailMarkedCall) {
+  bool RetVal = false;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+      RetVal |= AllocaMightEscapeToCalls(AI);
+
+      // If this alloca is in the body of the function, or if it is a variable
+      // sized allocation, we cannot tail call eliminate calls marked 'tail'
+      // with this mechanism.
+      if (BB != &BB->getParent()->getEntryBlock() ||
+          !isa<ConstantInt>(AI->getArraySize()))
+        CannotTCETailMarkedCall = true;
+    }
+  return RetVal;
+}
+
+bool TailCallElim::runOnFunction(Function &F) {
+  // If this function is a varargs function, we won't be able to PHI the args
+  // right, so don't even try to convert it...
+  if (F.getFunctionType()->isVarArg()) return false;
+
+  TTI = &getAnalysis<TargetTransformInfo>();
+  BasicBlock *OldEntry = 0;
+  bool TailCallsAreMarkedTail = false;
+  SmallVector<PHINode*, 8> ArgumentPHIs;
+  bool MadeChange = false;
+  bool FunctionContainsEscapingAllocas = false;
+
+  // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls
+  // marked with the 'tail' attribute, because doing so would cause the stack
+  // size to increase (real TCE would deallocate variable sized allocas, TCE
+  // doesn't).
+  bool CannotTCETailMarkedCall = false;
+
+  // Loop over the function, looking for any returning blocks, and keeping track
+  // of whether this function has any non-trivially used allocas.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall)
+      break;
+
+    FunctionContainsEscapingAllocas |=
+      CheckForEscapingAllocas(BB, CannotTCETailMarkedCall);
+  }
+
+  /// FIXME: The code generator produces really bad code when an 'escaping
+  /// alloca' is changed from being a static alloca to being a dynamic alloca.
+  /// Until this is resolved, disable this transformation if that would ever
+  /// happen.  This bug is PR962.
+  if (FunctionContainsEscapingAllocas)
+    return false;
+
+  // Second pass, change any tail calls to loops.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+                                          ArgumentPHIs,CannotTCETailMarkedCall);
+      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+        Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
+                                          TailCallsAreMarkedTail, ArgumentPHIs,
+                                          CannotTCETailMarkedCall);
+      MadeChange |= Change;
+    }
+  }
+
+  // If we eliminated any tail recursions, it's possible that we inserted some
+  // silly PHI nodes which just merge an initial value (the incoming operand)
+  // with themselves.  Check to see if we did and clean up our mess if so.  This
+  // occurs when a function passes an argument straight through to its tail
+  // call.
+  if (!ArgumentPHIs.empty()) {
+    for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
+      PHINode *PN = ArgumentPHIs[i];
+
+      // If the PHI Node is a dynamic constant, replace it with the value it is.
+      if (Value *PNV = SimplifyInstruction(PN)) {
+        PN->replaceAllUsesWith(PNV);
+        PN->eraseFromParent();
+      }
+    }
+  }
+
+  // Finally, if this function contains no non-escaping allocas, or calls
+  // setjmp, mark all calls in the function as eligible for tail calls
+  //(there is no stack memory for them to access).
+  if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice())
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        if (CallInst *CI = dyn_cast<CallInst>(I)) {
+          CI->setTailCall();
+          MadeChange = true;
+        }
+
+  return MadeChange;
+}
+
+
+/// CanMoveAboveCall - Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
+  // FIXME: We can move load/store/call/free instructions above the call if the
+  // call does not mod/ref the memory location being processed.
+  if (I->mayHaveSideEffects())  // This also handles volatile loads.
+    return false;
+
+  if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+    // Loads may always be moved above calls without side effects.
+    if (CI->mayHaveSideEffects()) {
+      // Non-volatile loads may be moved above a call with side effects if it
+      // does not write to memory and the load provably won't trap.
+      // FIXME: Writes to memory only matter if they may alias the pointer
+      // being loaded from.
+      if (CI->mayWriteToMemory() ||
+          !isSafeToLoadUnconditionally(L->getPointerOperand(), L,
+                                       L->getAlignment()))
+        return false;
+    }
+  }
+
+  // Otherwise, if this is a side-effect free instruction, check to make sure
+  // that it does not use the return value of the call.  If it doesn't use the
+  // return value of the call, it must only use things that are defined before
+  // the call, or movable instructions between the call and the instruction
+  // itself.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (I->getOperand(i) == CI)
+      return false;
+  return true;
+}
+
+// isDynamicConstant - Return true if the specified value is the same when the
+// return would exit as it was when the initial iteration of the recursive
+// function was executed.
+//
+// We currently handle static constants and arguments that are not modified as
+// part of the recursion.
+//
+static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
+  if (isa<Constant>(V)) return true; // Static constants are always dyn consts
+
+  // Check to see if this is an immutable argument, if so, the value
+  // will be available to initialize the accumulator.
+  if (Argument *Arg = dyn_cast<Argument>(V)) {
+    // Figure out which argument number this is...
+    unsigned ArgNo = 0;
+    Function *F = CI->getParent()->getParent();
+    for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI)
+      ++ArgNo;
+
+    // If we are passing this argument into call as the corresponding
+    // argument operand, then the argument is dynamically constant.
+    // Otherwise, we cannot transform this function safely.
+    if (CI->getArgOperand(ArgNo) == Arg)
+      return true;
+  }
+
+  // Switch cases are always constant integers. If the value is being switched
+  // on and the return is only reachable from one of its cases, it's
+  // effectively constant.
+  if (BasicBlock *UniquePred = RI->getParent()->getUniquePredecessor())
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(UniquePred->getTerminator()))
+      if (SI->getCondition() == V)
+        return SI->getDefaultDest() != RI->getParent();
+
+  // Not a constant or immutable argument, we can't safely transform.
+  return false;
+}
+
+// getCommonReturnValue - Check to see if the function containing the specified
+// tail call consistently returns the same runtime-constant value at all exit
+// points except for IgnoreRI.  If so, return the returned value.
+//
+static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
+  Function *F = CI->getParent()->getParent();
+  Value *ReturnedValue = 0;
+
+  for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI) {
+    ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator());
+    if (RI == 0 || RI == IgnoreRI) continue;
+
+    // We can only perform this transformation if the value returned is
+    // evaluatable at the start of the initial invocation of the function,
+    // instead of at the end of the evaluation.
+    //
+    Value *RetOp = RI->getOperand(0);
+    if (!isDynamicConstant(RetOp, CI, RI))
+      return 0;
+
+    if (ReturnedValue && RetOp != ReturnedValue)
+      return 0;     // Cannot transform if differing values are returned.
+    ReturnedValue = RetOp;
+  }
+  return ReturnedValue;
+}
+
+/// CanTransformAccumulatorRecursion - If the specified instruction can be
+/// transformed using accumulator recursion elimination, return the constant
+/// which is the start of the accumulator value.  Otherwise return null.
+///
+Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
+                                                      CallInst *CI) {
+  if (!I->isAssociative() || !I->isCommutative()) return 0;
+  assert(I->getNumOperands() == 2 &&
+         "Associative/commutative operations should have 2 args!");
+
+  // Exactly one operand should be the result of the call instruction.
+  if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+      (I->getOperand(0) != CI && I->getOperand(1) != CI))
+    return 0;
+
+  // The only user of this instruction we allow is a single return instruction.
+  if (!I->hasOneUse() || !isa<ReturnInst>(I->use_back()))
+    return 0;
+
+  // Ok, now we have to check all of the other return instructions in this
+  // function.  If they return non-constants or differing values, then we cannot
+  // transform the function safely.
+  return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI);
+}
+
+static Instruction *FirstNonDbg(BasicBlock::iterator I) {
+  while (isa<DbgInfoIntrinsic>(I))
+    ++I;
+  return &*I;
+}
+
+CallInst*
+TailCallElim::FindTRECandidate(Instruction *TI,
+                               bool CannotTailCallElimCallsMarkedTail) {
+  BasicBlock *BB = TI->getParent();
+  Function *F = BB->getParent();
+
+  if (&BB->front() == TI) // Make sure there is something before the terminator.
+    return 0;
+
+  // Scan backwards from the return, checking to see if there is a tail call in
+  // this block.  If so, set CI to it.
+  CallInst *CI = 0;
+  BasicBlock::iterator BBI = TI;
+  while (true) {
+    CI = dyn_cast<CallInst>(BBI);
+    if (CI && CI->getCalledFunction() == F)
+      break;
+
+    if (BBI == BB->begin())
+      return 0;          // Didn't find a potential tail call.
+    --BBI;
+  }
+
+  // If this call is marked as a tail call, and if there are dynamic allocas in
+  // the function, we cannot perform this optimization.
+  if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+    return 0;
+
+  // As a special case, detect code like this:
+  //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+  // and disable this xform in this case, because the code generator will
+  // lower the call to fabs into inline code.
+  if (BB == &F->getEntryBlock() &&
+      FirstNonDbg(BB->front()) == CI &&
+      FirstNonDbg(llvm::next(BB->begin())) == TI &&
+      CI->getCalledFunction() &&
+      !TTI->isLoweredToCall(CI->getCalledFunction())) {
+    // A single-block function with just a call and a return. Check that
+    // the arguments match.
+    CallSite::arg_iterator I = CallSite(CI).arg_begin(),
+                           E = CallSite(CI).arg_end();
+    Function::arg_iterator FI = F->arg_begin(),
+                           FE = F->arg_end();
+    for (; I != E && FI != FE; ++I, ++FI)
+      if (*I != &*FI) break;
+    if (I == E && FI == FE)
+      return 0;
+  }
+
+  return CI;
+}
+
+bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
+                                       BasicBlock *&OldEntry,
+                                       bool &TailCallsAreMarkedTail,
+                                       SmallVector<PHINode*, 8> &ArgumentPHIs,
+                                       bool CannotTailCallElimCallsMarkedTail) {
+  // If we are introducing accumulator recursion to eliminate operations after
+  // the call instruction that are both associative and commutative, the initial
+  // value for the accumulator is placed in this variable.  If this value is set
+  // then we actually perform accumulator recursion elimination instead of
+  // simple tail recursion elimination.  If the operation is an LLVM instruction
+  // (eg: "add") then it is recorded in AccumulatorRecursionInstr.  If not, then
+  // we are handling the case when the return instruction returns a constant C
+  // which is different to the constant returned by other return instructions
+  // (which is recorded in AccumulatorRecursionEliminationInitVal).  This is a
+  // special case of accumulator recursion, the operation being "return C".
+  Value *AccumulatorRecursionEliminationInitVal = 0;
+  Instruction *AccumulatorRecursionInstr = 0;
+
+  // Ok, we found a potential tail call.  We can currently only transform the
+  // tail call if all of the instructions between the call and the return are
+  // movable to above the call itself, leaving the call next to the return.
+  // Check that this is the case now.
+  BasicBlock::iterator BBI = CI;
+  for (++BBI; &*BBI != Ret; ++BBI) {
+    if (CanMoveAboveCall(BBI, CI)) continue;
+
+    // If we can't move the instruction above the call, it might be because it
+    // is an associative and commutative operation that could be transformed
+    // using accumulator recursion elimination.  Check to see if this is the
+    // case, and if so, remember the initial accumulator value for later.
+    if ((AccumulatorRecursionEliminationInitVal =
+                           CanTransformAccumulatorRecursion(BBI, CI))) {
+      // Yes, this is accumulator recursion.  Remember which instruction
+      // accumulates.
+      AccumulatorRecursionInstr = BBI;
+    } else {
+      return false;   // Otherwise, we cannot eliminate the tail recursion!
+    }
+  }
+
+  // We can only transform call/return pairs that either ignore the return value
+  // of the call and return void, ignore the value of the call and return a
+  // constant, return the value returned by the tail call, or that are being
+  // accumulator recursion variable eliminated.
+  if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
+      !isa<UndefValue>(Ret->getReturnValue()) &&
+      AccumulatorRecursionEliminationInitVal == 0 &&
+      !getCommonReturnValue(0, CI)) {
+    // One case remains that we are able to handle: the current return
+    // instruction returns a constant, and all other return instructions
+    // return a different constant.
+    if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret))
+      return false; // Current return instruction does not return a constant.
+    // Check that all other return instructions return a common constant.  If
+    // so, record it in AccumulatorRecursionEliminationInitVal.
+    AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI);
+    if (!AccumulatorRecursionEliminationInitVal)
+      return false;
+  }
+
+  BasicBlock *BB = Ret->getParent();
+  Function *F = BB->getParent();
+
+  // OK! We can transform this tail call.  If this is the first one found,
+  // create the new entry block, allowing us to branch back to the old entry.
+  if (OldEntry == 0) {
+    OldEntry = &F->getEntryBlock();
+    BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry);
+    NewEntry->takeName(OldEntry);
+    OldEntry->setName("tailrecurse");
+    BranchInst::Create(OldEntry, NewEntry);
+
+    // If this tail call is marked 'tail' and if there are any allocas in the
+    // entry block, move them up to the new entry block.
+    TailCallsAreMarkedTail = CI->isTailCall();
+    if (TailCallsAreMarkedTail)
+      // Move all fixed sized allocas from OldEntry to NewEntry.
+      for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(),
+             NEBI = NewEntry->begin(); OEBI != E; )
+        if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+          if (isa<ConstantInt>(AI->getArraySize()))
+            AI->moveBefore(NEBI);
+
+    // Now that we have created a new block, which jumps to the entry
+    // block, insert a PHI node for each argument of the function.
+    // For now, we initialize each PHI to only have the real arguments
+    // which are passed in.
+    Instruction *InsertPos = OldEntry->begin();
+    for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+         I != E; ++I) {
+      PHINode *PN = PHINode::Create(I->getType(), 2,
+                                    I->getName() + ".tr", InsertPos);
+      I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+      PN->addIncoming(I, NewEntry);
+      ArgumentPHIs.push_back(PN);
+    }
+  }
+
+  // If this function has self recursive calls in the tail position where some
+  // are marked tail and some are not, only transform one flavor or another.  We
+  // have to choose whether we move allocas in the entry block to the new entry
+  // block or not, so we can't make a good choice for both.  NOTE: We could do
+  // slightly better here in the case that the function has no entry block
+  // allocas.
+  if (TailCallsAreMarkedTail && !CI->isTailCall())
+    return false;
+
+  // Ok, now that we know we have a pseudo-entry block WITH all of the
+  // required PHI nodes, add entries into the PHI node for the actual
+  // parameters passed into the tail-recursive call.
+  for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+    ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+
+  // If we are introducing an accumulator variable to eliminate the recursion,
+  // do so now.  Note that we _know_ that no subsequent tail recursion
+  // eliminations will happen on this function because of the way the
+  // accumulator recursion predicate is set up.
+  //
+  if (AccumulatorRecursionEliminationInitVal) {
+    Instruction *AccRecInstr = AccumulatorRecursionInstr;
+    // Start by inserting a new PHI node for the accumulator.
+    pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry);
+    PHINode *AccPN =
+      PHINode::Create(AccumulatorRecursionEliminationInitVal->getType(),
+                      std::distance(PB, PE) + 1,
+                      "accumulator.tr", OldEntry->begin());
+
+    // Loop over all of the predecessors of the tail recursion block.  For the
+    // real entry into the function we seed the PHI with the initial value,
+    // computed earlier.  For any other existing branches to this block (due to
+    // other tail recursions eliminated) the accumulator is not modified.
+    // Because we haven't added the branch in the current block to OldEntry yet,
+    // it will not show up as a predecessor.
+    for (pred_iterator PI = PB; PI != PE; ++PI) {
+      BasicBlock *P = *PI;
+      if (P == &F->getEntryBlock())
+        AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P);
+      else
+        AccPN->addIncoming(AccPN, P);
+    }
+
+    if (AccRecInstr) {
+      // Add an incoming argument for the current block, which is computed by
+      // our associative and commutative accumulator instruction.
+      AccPN->addIncoming(AccRecInstr, BB);
+
+      // Next, rewrite the accumulator recursion instruction so that it does not
+      // use the result of the call anymore, instead, use the PHI node we just
+      // inserted.
+      AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+    } else {
+      // Add an incoming argument for the current block, which is just the
+      // constant returned by the current return instruction.
+      AccPN->addIncoming(Ret->getReturnValue(), BB);
+    }
+
+    // Finally, rewrite any return instructions in the program to return the PHI
+    // node instead of the "initval" that they do currently.  This loop will
+    // actually rewrite the return value we are destroying, but that's ok.
+    for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+        RI->setOperand(0, AccPN);
+    ++NumAccumAdded;
+  }
+
+  // Now that all of the PHI nodes are in place, remove the call and
+  // ret instructions, replacing them with an unconditional branch.
+  BranchInst *NewBI = BranchInst::Create(OldEntry, Ret);
+  NewBI->setDebugLoc(CI->getDebugLoc());
+
+  BB->getInstList().erase(Ret);  // Remove return.
+  BB->getInstList().erase(CI);   // Remove call.
+  ++NumEliminated;
+  return true;
+}
+
+bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
+                                       ReturnInst *Ret, BasicBlock *&OldEntry,
+                                       bool &TailCallsAreMarkedTail,
+                                       SmallVector<PHINode*, 8> &ArgumentPHIs,
+                                       bool CannotTailCallElimCallsMarkedTail) {
+  bool Change = false;
+
+  // If the return block contains nothing but the return and PHI's,
+  // there might be an opportunity to duplicate the return in its
+  // predecessors and perform TRC there. Look for predecessors that end
+  // in unconditional branch and recursive call(s).
+  SmallVector<BranchInst*, 8> UncondBranchPreds;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *Pred = *PI;
+    TerminatorInst *PTI = Pred->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(PTI))
+      if (BI->isUnconditional())
+        UncondBranchPreds.push_back(BI);
+  }
+
+  while (!UncondBranchPreds.empty()) {
+    BranchInst *BI = UncondBranchPreds.pop_back_val();
+    BasicBlock *Pred = BI->getParent();
+    if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){
+      DEBUG(dbgs() << "FOLDING: " << *BB
+            << "INTO UNCOND BRANCH PRED: " << *Pred);
+      EliminateRecursiveTailCall(CI, FoldReturnIntoUncondBranch(Ret, BB, Pred),
+                                 OldEntry, TailCallsAreMarkedTail, ArgumentPHIs,
+                                 CannotTailCallElimCallsMarkedTail);
+      ++NumRetDuped;
+      Change = true;
+    }
+  }
+
+  return Change;
+}
+
+bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+                                         bool &TailCallsAreMarkedTail,
+                                         SmallVector<PHINode*, 8> &ArgumentPHIs,
+                                       bool CannotTailCallElimCallsMarkedTail) {
+  CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail);
+  if (!CI)
+    return false;
+
+  return EliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
+                                    ArgumentPHIs,
+                                    CannotTailCallElimCallsMarkedTail);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
new file mode 100644
index 000000000000..ba99d2e662e4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -0,0 +1,728 @@
+//===-- BasicBlockUtils.cpp - BasicBlock Utilities -------------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform manipulations on basic blocks, and
+// instructions contained within basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+/// DeleteDeadBlock - Delete the specified block, which must have no
+/// predecessors.
+void llvm::DeleteDeadBlock(BasicBlock *BB) {
+  assert((pred_begin(BB) == pred_end(BB) ||
+         // Can delete self loop.
+         BB->getSinglePredecessor() == BB) && "Block is not dead!");
+  TerminatorInst *BBTerm = BB->getTerminator();
+
+  // Loop through all of our successors and make sure they know that one
+  // of their predecessors is going away.
+  for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i)
+    BBTerm->getSuccessor(i)->removePredecessor(BB);
+
+  // Zap all the instructions in the block.
+  while (!BB->empty()) {
+    Instruction &I = BB->back();
+    // If this instruction is used, replace uses with an arbitrary value.
+    // Because control flow can't get here, we don't care what we replace the
+    // value with.  Note that since this block is unreachable, and all values
+    // contained within it must dominate their uses, that all uses will
+    // eventually be removed (they are themselves dead).
+    if (!I.use_empty())
+      I.replaceAllUsesWith(UndefValue::get(I.getType()));
+    BB->getInstList().pop_back();
+  }
+
+  // Zap the block!
+  BB->eraseFromParent();
+}
+
+/// FoldSingleEntryPHINodes - We know that BB has one predecessor.  If there are
+/// any single-entry PHI nodes in it, fold them away.  This handles the case
+/// when all entries to the PHI nodes in a block are guaranteed equal, such as
+/// when the block has exactly one predecessor.
+void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) {
+  if (!isa<PHINode>(BB->begin())) return;
+
+  AliasAnalysis *AA = 0;
+  MemoryDependenceAnalysis *MemDep = 0;
+  if (P) {
+    AA = P->getAnalysisIfAvailable<AliasAnalysis>();
+    MemDep = P->getAnalysisIfAvailable<MemoryDependenceAnalysis>();
+  }
+
+  while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+    if (PN->getIncomingValue(0) != PN)
+      PN->replaceAllUsesWith(PN->getIncomingValue(0));
+    else
+      PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+
+    if (MemDep)
+      MemDep->removeInstruction(PN);  // Memdep updates AA itself.
+    else if (AA && isa<PointerType>(PN->getType()))
+      AA->deleteValue(PN);
+
+    PN->eraseFromParent();
+  }
+}
+
+
+/// DeleteDeadPHIs - Examine each PHI in the given block and delete it if it
+/// is dead. Also recursively delete any operands that become dead as
+/// a result. This includes tracing the def-use list from the PHI to see if
+/// it is ultimately unused or if it reaches an unused cycle.
+bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
+  // Recursively deleting a PHI may cause multiple PHIs to be deleted
+  // or RAUW'd undef, so use an array of WeakVH for the PHIs to delete.
+  SmallVector<WeakVH, 8> PHIs;
+  for (BasicBlock::iterator I = BB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I)
+    PHIs.push_back(PN);
+
+  bool Changed = false;
+  for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+    if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
+      Changed |= RecursivelyDeleteDeadPHINode(PN, TLI);
+
+  return Changed;
+}
+
+/// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
+/// if possible.  The return value indicates success or failure.
+bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
+  // Don't merge away blocks who have their address taken.
+  if (BB->hasAddressTaken()) return false;
+
+  // Can't merge if there are multiple predecessors, or no predecessors.
+  BasicBlock *PredBB = BB->getUniquePredecessor();
+  if (!PredBB) return false;
+
+  // Don't break self-loops.
+  if (PredBB == BB) return false;
+  // Don't break invokes.
+  if (isa<InvokeInst>(PredBB->getTerminator())) return false;
+
+  succ_iterator SI(succ_begin(PredBB)), SE(succ_end(PredBB));
+  BasicBlock *OnlySucc = BB;
+  for (; SI != SE; ++SI)
+    if (*SI != OnlySucc) {
+      OnlySucc = 0;     // There are multiple distinct successors!
+      break;
+    }
+
+  // Can't merge if there are multiple successors.
+  if (!OnlySucc) return false;
+
+  // Can't merge if there is PHI loop.
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+    if (PHINode *PN = dyn_cast<PHINode>(BI)) {
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingValue(i) == PN)
+          return false;
+    } else
+      break;
+  }
+
+  // Begin by getting rid of unneeded PHIs.
+  if (isa<PHINode>(BB->front()))
+    FoldSingleEntryPHINodes(BB, P);
+
+  // Delete the unconditional branch from the predecessor...
+  PredBB->getInstList().pop_back();
+
+  // Make all PHI nodes that referred to BB now refer to Pred as their
+  // source...
+  BB->replaceAllUsesWith(PredBB);
+
+  // Move all definitions in the successor to the predecessor...
+  PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
+
+  // Inherit predecessors name if it exists.
+  if (!PredBB->hasName())
+    PredBB->takeName(BB);
+
+  // Finally, erase the old block and update dominator info.
+  if (P) {
+    if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) {
+      if (DomTreeNode *DTN = DT->getNode(BB)) {
+        DomTreeNode *PredDTN = DT->getNode(PredBB);
+        SmallVector<DomTreeNode*, 8> Children(DTN->begin(), DTN->end());
+        for (SmallVector<DomTreeNode*, 8>::iterator DI = Children.begin(),
+             DE = Children.end(); DI != DE; ++DI)
+          DT->changeImmediateDominator(*DI, PredDTN);
+
+        DT->eraseNode(BB);
+      }
+
+      if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>())
+        LI->removeBlock(BB);
+
+      if (MemoryDependenceAnalysis *MD =
+            P->getAnalysisIfAvailable<MemoryDependenceAnalysis>())
+        MD->invalidateCachedPredecessors();
+    }
+  }
+
+  BB->eraseFromParent();
+  return true;
+}
+
+/// ReplaceInstWithValue - Replace all uses of an instruction (specified by BI)
+/// with a value, then remove and delete the original instruction.
+///
+void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
+                                BasicBlock::iterator &BI, Value *V) {
+  Instruction &I = *BI;
+  // Replaces all of the uses of the instruction with uses of the value
+  I.replaceAllUsesWith(V);
+
+  // Make sure to propagate a name if there is one already.
+  if (I.hasName() && !V->hasName())
+    V->takeName(&I);
+
+  // Delete the unnecessary instruction now...
+  BI = BIL.erase(BI);
+}
+
+
+/// ReplaceInstWithInst - Replace the instruction specified by BI with the
+/// instruction specified by I.  The original instruction is deleted and BI is
+/// updated to point to the new instruction.
+///
+void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
+                               BasicBlock::iterator &BI, Instruction *I) {
+  assert(I->getParent() == 0 &&
+         "ReplaceInstWithInst: Instruction already inserted into basic block!");
+
+  // Insert the new instruction into the basic block...
+  BasicBlock::iterator New = BIL.insert(BI, I);
+
+  // Replace all uses of the old instruction, and delete it.
+  ReplaceInstWithValue(BIL, BI, I);
+
+  // Move BI back to point to the newly inserted instruction
+  BI = New;
+}
+
+/// ReplaceInstWithInst - Replace the instruction specified by From with the
+/// instruction specified by To.
+///
+void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
+  BasicBlock::iterator BI(From);
+  ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
+}
+
+/// GetSuccessorNumber - Search for the specified successor of basic block BB
+/// and return its position in the terminator instruction's list of
+/// successors.  It is an error to call this with a block that is not a
+/// successor.
+unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) {
+  TerminatorInst *Term = BB->getTerminator();
+#ifndef NDEBUG
+  unsigned e = Term->getNumSuccessors();
+#endif
+  for (unsigned i = 0; ; ++i) {
+    assert(i != e && "Didn't find edge?");
+    if (Term->getSuccessor(i) == Succ)
+      return i;
+  }
+}
+
+/// SplitEdge -  Split the edge connecting specified block. Pass P must
+/// not be NULL.
+BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
+  unsigned SuccNum = GetSuccessorNumber(BB, Succ);
+
+  // If this is a critical edge, let SplitCriticalEdge do it.
+  TerminatorInst *LatchTerm = BB->getTerminator();
+  if (SplitCriticalEdge(LatchTerm, SuccNum, P))
+    return LatchTerm->getSuccessor(SuccNum);
+
+  // If the edge isn't critical, then BB has a single successor or Succ has a
+  // single pred.  Split the block.
+  BasicBlock::iterator SplitPoint;
+  if (BasicBlock *SP = Succ->getSinglePredecessor()) {
+    // If the successor only has a single pred, split the top of the successor
+    // block.
+    assert(SP == BB && "CFG broken");
+    SP = NULL;
+    return SplitBlock(Succ, Succ->begin(), P);
+  }
+
+  // Otherwise, if BB has a single successor, split it at the bottom of the
+  // block.
+  assert(BB->getTerminator()->getNumSuccessors() == 1 &&
+         "Should have a single succ!");
+  return SplitBlock(BB, BB->getTerminator(), P);
+}
+
+/// SplitBlock - Split the specified block at the specified instruction - every
+/// thing before SplitPt stays in Old and everything starting with SplitPt moves
+/// to a new block.  The two blocks are joined by an unconditional branch and
+/// the loop info is updated.
+///
+BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
+  BasicBlock::iterator SplitIt = SplitPt;
+  while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt))
+    ++SplitIt;
+  BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split");
+
+  // The new block lives in whichever loop the old one did. This preserves
+  // LCSSA as well, because we force the split point to be after any PHI nodes.
+  if (LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>())
+    if (Loop *L = LI->getLoopFor(Old))
+      L->addBasicBlockToLoop(New, LI->getBase());
+
+  if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) {
+    // Old dominates New. New node dominates all other nodes dominated by Old.
+    if (DomTreeNode *OldNode = DT->getNode(Old)) {
+      std::vector<DomTreeNode *> Children;
+      for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
+           I != E; ++I)
+        Children.push_back(*I);
+
+      DomTreeNode *NewNode = DT->addNewBlock(New,Old);
+      for (std::vector<DomTreeNode *>::iterator I = Children.begin(),
+             E = Children.end(); I != E; ++I)
+        DT->changeImmediateDominator(*I, NewNode);
+    }
+  }
+
+  return New;
+}
+
+/// UpdateAnalysisInformation - Update DominatorTree, LoopInfo, and LCCSA
+/// analysis information.
+static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
+                                      ArrayRef<BasicBlock *> Preds,
+                                      Pass *P, bool &HasLoopExit) {
+  if (!P) return;
+
+  LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
+  Loop *L = LI ? LI->getLoopFor(OldBB) : 0;
+
+  // If we need to preserve loop analyses, collect some information about how
+  // this split will affect loops.
+  bool IsLoopEntry = !!L;
+  bool SplitMakesNewLoopHeader = false;
+  if (LI) {
+    bool PreserveLCSSA = P->mustPreserveAnalysisID(LCSSAID);
+    for (ArrayRef<BasicBlock*>::iterator
+           i = Preds.begin(), e = Preds.end(); i != e; ++i) {
+      BasicBlock *Pred = *i;
+
+      // If we need to preserve LCSSA, determine if any of the preds is a loop
+      // exit.
+      if (PreserveLCSSA)
+        if (Loop *PL = LI->getLoopFor(Pred))
+          if (!PL->contains(OldBB))
+            HasLoopExit = true;
+
+      // If we need to preserve LoopInfo, note whether any of the preds crosses
+      // an interesting loop boundary.
+      if (!L) continue;
+      if (L->contains(Pred))
+        IsLoopEntry = false;
+      else
+        SplitMakesNewLoopHeader = true;
+    }
+  }
+
+  // Update dominator tree if available.
+  DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
+  if (DT)
+    DT->splitBlock(NewBB);
+
+  if (!L) return;
+
+  if (IsLoopEntry) {
+    // Add the new block to the nearest enclosing loop (and not an adjacent
+    // loop). To find this, examine each of the predecessors and determine which
+    // loops enclose them, and select the most-nested loop which contains the
+    // loop containing the block being split.
+    Loop *InnermostPredLoop = 0;
+    for (ArrayRef<BasicBlock*>::iterator
+           i = Preds.begin(), e = Preds.end(); i != e; ++i) {
+      BasicBlock *Pred = *i;
+      if (Loop *PredLoop = LI->getLoopFor(Pred)) {
+        // Seek a loop which actually contains the block being split (to avoid
+        // adjacent loops).
+        while (PredLoop && !PredLoop->contains(OldBB))
+          PredLoop = PredLoop->getParentLoop();
+
+        // Select the most-nested of these loops which contains the block.
+        if (PredLoop && PredLoop->contains(OldBB) &&
+            (!InnermostPredLoop ||
+             InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth()))
+          InnermostPredLoop = PredLoop;
+      }
+    }
+
+    if (InnermostPredLoop)
+      InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+  } else {
+    L->addBasicBlockToLoop(NewBB, LI->getBase());
+    if (SplitMakesNewLoopHeader)
+      L->moveToHeader(NewBB);
+  }
+}
+
+/// UpdatePHINodes - Update the PHI nodes in OrigBB to include the values coming
+/// from NewBB. This also updates AliasAnalysis, if available.
+static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
+                           ArrayRef<BasicBlock*> Preds, BranchInst *BI,
+                           Pass *P, bool HasLoopExit) {
+  // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
+  AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : 0;
+  for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
+    PHINode *PN = cast<PHINode>(I++);
+
+    // Check to see if all of the values coming in are the same.  If so, we
+    // don't need to create a new PHI node, unless it's needed for LCSSA.
+    Value *InVal = 0;
+    if (!HasLoopExit) {
+      InVal = PN->getIncomingValueForBlock(Preds[0]);
+      for (unsigned i = 1, e = Preds.size(); i != e; ++i)
+        if (InVal != PN->getIncomingValueForBlock(Preds[i])) {
+          InVal = 0;
+          break;
+        }
+    }
+
+    if (InVal) {
+      // If all incoming values for the new PHI would be the same, just don't
+      // make a new PHI.  Instead, just remove the incoming values from the old
+      // PHI.
+      for (unsigned i = 0, e = Preds.size(); i != e; ++i)
+        PN->removeIncomingValue(Preds[i], false);
+    } else {
+      // If the values coming into the block are not the same, we need a PHI.
+      // Create the new PHI node, insert it into NewBB at the end of the block
+      PHINode *NewPHI =
+        PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
+      if (AA) AA->copyValue(PN, NewPHI);
+
+      // Move all of the PHI values for 'Preds' to the new PHI.
+      for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+        Value *V = PN->removeIncomingValue(Preds[i], false);
+        NewPHI->addIncoming(V, Preds[i]);
+      }
+
+      InVal = NewPHI;
+    }
+
+    // Add an incoming value to the PHI node in the loop for the preheader
+    // edge.
+    PN->addIncoming(InVal, NewBB);
+  }
+}
+
+/// SplitBlockPredecessors - This method transforms BB by introducing a new
+/// basic block into the function, and moving some of the predecessors of BB to
+/// be predecessors of the new block.  The new predecessors are indicated by the
+/// Preds array, which has NumPreds elements in it.  The new block is given a
+/// suffix of 'Suffix'.
+///
+/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
+/// LoopInfo, and LCCSA but no other analyses. In particular, it does not
+/// preserve LoopSimplify (because it's complicated to handle the case where one
+/// of the edges being split is an exit of a loop with other exits).
+///
+BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
+                                         ArrayRef<BasicBlock*> Preds,
+                                         const char *Suffix, Pass *P) {
+  // Create new basic block, insert right before the original block.
+  BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix,
+                                         BB->getParent(), BB);
+
+  // The new block unconditionally branches to the old block.
+  BranchInst *BI = BranchInst::Create(BB, NewBB);
+
+  // Move the edges from Preds to point to NewBB instead of BB.
+  for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+    // This is slightly more strict than necessary; the minimum requirement
+    // is that there be no more than one indirectbr branching to BB. And
+    // all BlockAddress uses would need to be updated.
+    assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+           "Cannot split an edge from an IndirectBrInst");
+    Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
+  }
+
+  // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
+  // node becomes an incoming value for BB's phi node.  However, if the Preds
+  // list is empty, we need to insert dummy entries into the PHI nodes in BB to
+  // account for the newly created predecessor.
+  if (Preds.size() == 0) {
+    // Insert dummy values as the incoming value.
+    for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
+      cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
+    return NewBB;
+  }
+
+  // Update DominatorTree, LoopInfo, and LCCSA analysis information.
+  bool HasLoopExit = false;
+  UpdateAnalysisInformation(BB, NewBB, Preds, P, HasLoopExit);
+
+  // Update the PHI nodes in BB with the values coming from NewBB.
+  UpdatePHINodes(BB, NewBB, Preds, BI, P, HasLoopExit);
+  return NewBB;
+}
+
+/// SplitLandingPadPredecessors - This method transforms the landing pad,
+/// OrigBB, by introducing two new basic blocks into the function. One of those
+/// new basic blocks gets the predecessors listed in Preds. The other basic
+/// block gets the remaining predecessors of OrigBB. The landingpad instruction
+/// OrigBB is clone into both of the new basic blocks. The new blocks are given
+/// the suffixes 'Suffix1' and 'Suffix2', and are returned in the NewBBs vector.
+///
+/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
+/// DominanceFrontier, LoopInfo, and LCCSA but no other analyses. In particular,
+/// it does not preserve LoopSimplify (because it's complicated to handle the
+/// case where one of the edges being split is an exit of a loop with other
+/// exits).
+///
+void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
+                                       ArrayRef<BasicBlock*> Preds,
+                                       const char *Suffix1, const char *Suffix2,
+                                       Pass *P,
+                                       SmallVectorImpl<BasicBlock*> &NewBBs) {
+  assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!");
+
+  // Create a new basic block for OrigBB's predecessors listed in Preds. Insert
+  // it right before the original block.
+  BasicBlock *NewBB1 = BasicBlock::Create(OrigBB->getContext(),
+                                          OrigBB->getName() + Suffix1,
+                                          OrigBB->getParent(), OrigBB);
+  NewBBs.push_back(NewBB1);
+
+  // The new block unconditionally branches to the old block.
+  BranchInst *BI1 = BranchInst::Create(OrigBB, NewBB1);
+
+  // Move the edges from Preds to point to NewBB1 instead of OrigBB.
+  for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+    // This is slightly more strict than necessary; the minimum requirement
+    // is that there be no more than one indirectbr branching to BB. And
+    // all BlockAddress uses would need to be updated.
+    assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+           "Cannot split an edge from an IndirectBrInst");
+    Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1);
+  }
+
+  // Update DominatorTree, LoopInfo, and LCCSA analysis information.
+  bool HasLoopExit = false;
+  UpdateAnalysisInformation(OrigBB, NewBB1, Preds, P, HasLoopExit);
+
+  // Update the PHI nodes in OrigBB with the values coming from NewBB1.
+  UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, P, HasLoopExit);
+
+  // Move the remaining edges from OrigBB to point to NewBB2.
+  SmallVector<BasicBlock*, 8> NewBB2Preds;
+  for (pred_iterator i = pred_begin(OrigBB), e = pred_end(OrigBB);
+       i != e; ) {
+    BasicBlock *Pred = *i++;
+    if (Pred == NewBB1) continue;
+    assert(!isa<IndirectBrInst>(Pred->getTerminator()) &&
+           "Cannot split an edge from an IndirectBrInst");
+    NewBB2Preds.push_back(Pred);
+    e = pred_end(OrigBB);
+  }
+
+  BasicBlock *NewBB2 = 0;
+  if (!NewBB2Preds.empty()) {
+    // Create another basic block for the rest of OrigBB's predecessors.
+    NewBB2 = BasicBlock::Create(OrigBB->getContext(),
+                                OrigBB->getName() + Suffix2,
+                                OrigBB->getParent(), OrigBB);
+    NewBBs.push_back(NewBB2);
+
+    // The new block unconditionally branches to the old block.
+    BranchInst *BI2 = BranchInst::Create(OrigBB, NewBB2);
+
+    // Move the remaining edges from OrigBB to point to NewBB2.
+    for (SmallVectorImpl<BasicBlock*>::iterator
+           i = NewBB2Preds.begin(), e = NewBB2Preds.end(); i != e; ++i)
+      (*i)->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
+
+    // Update DominatorTree, LoopInfo, and LCCSA analysis information.
+    HasLoopExit = false;
+    UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, P, HasLoopExit);
+
+    // Update the PHI nodes in OrigBB with the values coming from NewBB2.
+    UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, P, HasLoopExit);
+  }
+
+  LandingPadInst *LPad = OrigBB->getLandingPadInst();
+  Instruction *Clone1 = LPad->clone();
+  Clone1->setName(Twine("lpad") + Suffix1);
+  NewBB1->getInstList().insert(NewBB1->getFirstInsertionPt(), Clone1);
+
+  if (NewBB2) {
+    Instruction *Clone2 = LPad->clone();
+    Clone2->setName(Twine("lpad") + Suffix2);
+    NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2);
+
+    // Create a PHI node for the two cloned landingpad instructions.
+    PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad);
+    PN->addIncoming(Clone1, NewBB1);
+    PN->addIncoming(Clone2, NewBB2);
+    LPad->replaceAllUsesWith(PN);
+    LPad->eraseFromParent();
+  } else {
+    // There is no second clone. Just replace the landing pad with the first
+    // clone.
+    LPad->replaceAllUsesWith(Clone1);
+    LPad->eraseFromParent();
+  }
+}
+
+/// FindFunctionBackedges - Analyze the specified function to find all of the
+/// loop backedges in the function and return them.  This is a relatively cheap
+/// (compared to computing dominators and loop info) analysis.
+///
+/// The output is added to Result, as pairs of <from,to> edge info.
+void llvm::FindFunctionBackedges(const Function &F,
+     SmallVectorImpl<std::pair<const BasicBlock*,const BasicBlock*> > &Result) {
+  const BasicBlock *BB = &F.getEntryBlock();
+  if (succ_begin(BB) == succ_end(BB))
+    return;
+
+  SmallPtrSet<const BasicBlock*, 8> Visited;
+  SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack;
+  SmallPtrSet<const BasicBlock*, 8> InStack;
+
+  Visited.insert(BB);
+  VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
+  InStack.insert(BB);
+  do {
+    std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back();
+    const BasicBlock *ParentBB = Top.first;
+    succ_const_iterator &I = Top.second;
+
+    bool FoundNew = false;
+    while (I != succ_end(ParentBB)) {
+      BB = *I++;
+      if (Visited.insert(BB)) {
+        FoundNew = true;
+        break;
+      }
+      // Successor is in VisitStack, it's a back edge.
+      if (InStack.count(BB))
+        Result.push_back(std::make_pair(ParentBB, BB));
+    }
+
+    if (FoundNew) {
+      // Go down one level if there is a unvisited successor.
+      InStack.insert(BB);
+      VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
+    } else {
+      // Go up one level.
+      InStack.erase(VisitStack.pop_back_val().first);
+    }
+  } while (!VisitStack.empty());
+}
+
+/// FoldReturnIntoUncondBranch - This method duplicates the specified return
+/// instruction into a predecessor which ends in an unconditional branch. If
+/// the return instruction returns a value defined by a PHI, propagate the
+/// right value into the return. It returns the new return instruction in the
+/// predecessor.
+ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
+                                             BasicBlock *Pred) {
+  Instruction *UncondBranch = Pred->getTerminator();
+  // Clone the return and add it to the end of the predecessor.
+  Instruction *NewRet = RI->clone();
+  Pred->getInstList().push_back(NewRet);
+
+  // If the return instruction returns a value, and if the value was a
+  // PHI node in "BB", propagate the right value into the return.
+  for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
+       i != e; ++i) {
+    Value *V = *i;
+    Instruction *NewBC = 0;
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
+      // Return value might be bitcasted. Clone and insert it before the
+      // return instruction.
+      V = BCI->getOperand(0);
+      NewBC = BCI->clone();
+      Pred->getInstList().insert(NewRet, NewBC);
+      *i = NewBC;
+    }
+    if (PHINode *PN = dyn_cast<PHINode>(V)) {
+      if (PN->getParent() == BB) {
+        if (NewBC)
+          NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred));
+        else
+          *i = PN->getIncomingValueForBlock(Pred);
+      }
+    }
+  }
+
+  // Update any PHI nodes in the returning block to realize that we no
+  // longer branch to them.
+  BB->removePredecessor(Pred);
+  UncondBranch->eraseFromParent();
+  return cast<ReturnInst>(NewRet);
+}
+
+/// SplitBlockAndInsertIfThen - Split the containing block at the
+/// specified instruction - everything before and including Cmp stays
+/// in the old basic block, and everything after Cmp is moved to a
+/// new block. The two blocks are connected by a conditional branch
+/// (with value of Cmp being the condition).
+/// Before:
+///   Head
+///   Cmp
+///   Tail
+/// After:
+///   Head
+///   Cmp
+///   if (Cmp)
+///     ThenBlock
+///   Tail
+///
+/// If Unreachable is true, then ThenBlock ends with
+/// UnreachableInst, otherwise it branches to Tail.
+/// Returns the NewBasicBlock's terminator.
+
+TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp,
+    bool Unreachable, MDNode *BranchWeights) {
+  Instruction *SplitBefore = Cmp->getNextNode();
+  BasicBlock *Head = SplitBefore->getParent();
+  BasicBlock *Tail = Head->splitBasicBlock(SplitBefore);
+  TerminatorInst *HeadOldTerm = Head->getTerminator();
+  LLVMContext &C = Head->getContext();
+  BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+  TerminatorInst *CheckTerm;
+  if (Unreachable)
+    CheckTerm = new UnreachableInst(C, ThenBlock);
+  else
+    CheckTerm = BranchInst::Create(Tail, ThenBlock);
+  BranchInst *HeadNewTerm =
+    BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp);
+  HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
+  ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+  return CheckTerm;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
new file mode 100644
index 000000000000..8513772da2e8
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -0,0 +1,409 @@
+//===- BreakCriticalEdges.cpp - Critical Edge Elimination Pass ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// BreakCriticalEdges pass - Break all of the critical edges in the CFG by
+// inserting a dummy basic block.  This pass may be "required" by passes that
+// cannot deal with critical edges.  For this usage, the structure type is
+// forward declared.  This pass obviously invalidates the CFG, but can update
+// dominator trees.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "break-crit-edges"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+STATISTIC(NumBroken, "Number of blocks inserted");
+
+namespace {
+  struct BreakCriticalEdges : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    BreakCriticalEdges() : FunctionPass(ID) {
+      initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<LoopInfo>();
+      AU.addPreserved<ProfileInfo>();
+
+      // No loop canonicalization guarantees are broken by this pass.
+      AU.addPreservedID(LoopSimplifyID);
+    }
+  };
+}
+
+char BreakCriticalEdges::ID = 0;
+INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
+                "Break critical edges in CFG", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
+FunctionPass *llvm::createBreakCriticalEdgesPass() {
+  return new BreakCriticalEdges();
+}
+
+// runOnFunction - Loop over all of the edges in the CFG, breaking critical
+// edges as they are found.
+//
+bool BreakCriticalEdges::runOnFunction(Function &F) {
+  bool Changed = false;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    TerminatorInst *TI = I->getTerminator();
+    if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
+      for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+        if (SplitCriticalEdge(TI, i, this)) {
+          ++NumBroken;
+          Changed = true;
+        }
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+//    Implementation of the external critical edge manipulation functions
+//===----------------------------------------------------------------------===//
+
+// isCriticalEdge - Return true if the specified edge is a critical edge.
+// Critical edges are edges from a block with multiple successors to a block
+// with multiple predecessors.
+//
+bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
+                          bool AllowIdenticalEdges) {
+  assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!");
+  if (TI->getNumSuccessors() == 1) return false;
+
+  const BasicBlock *Dest = TI->getSuccessor(SuccNum);
+  const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest);
+
+  // If there is more than one predecessor, this is a critical edge...
+  assert(I != E && "No preds, but we have an edge to the block?");
+  const BasicBlock *FirstPred = *I;
+  ++I;        // Skip one edge due to the incoming arc from TI.
+  if (!AllowIdenticalEdges)
+    return I != E;
+
+  // If AllowIdenticalEdges is true, then we allow this edge to be considered
+  // non-critical iff all preds come from TI's block.
+  while (I != E) {
+    const BasicBlock *P = *I;
+    if (P != FirstPred)
+      return true;
+    // Note: leave this as is until no one ever compiles with either gcc 4.0.1
+    // or Xcode 2. This seems to work around the pred_iterator assert in PR 2207
+    E = pred_end(P);
+    ++I;
+  }
+  return false;
+}
+
+/// createPHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form
+/// may require new PHIs in the new exit block. This function inserts the
+/// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB
+/// is the new loop exit block, and DestBB is the old loop exit, now the
+/// successor of SplitBB.
+static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
+                                       BasicBlock *SplitBB,
+                                       BasicBlock *DestBB) {
+  // SplitBB shouldn't have anything non-trivial in it yet.
+  assert((SplitBB->getFirstNonPHI() == SplitBB->getTerminator() ||
+          SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!");
+
+  // For each PHI in the destination block.
+  for (BasicBlock::iterator I = DestBB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    unsigned Idx = PN->getBasicBlockIndex(SplitBB);
+    Value *V = PN->getIncomingValue(Idx);
+
+    // If the input is a PHI which already satisfies LCSSA, don't create
+    // a new one.
+    if (const PHINode *VP = dyn_cast<PHINode>(V))
+      if (VP->getParent() == SplitBB)
+        continue;
+
+    // Otherwise a new PHI is needed. Create one and populate it.
+    PHINode *NewPN =
+      PHINode::Create(PN->getType(), Preds.size(), "split",
+                      SplitBB->isLandingPad() ?
+                      SplitBB->begin() : SplitBB->getTerminator());
+    for (unsigned i = 0, e = Preds.size(); i != e; ++i)
+      NewPN->addIncoming(V, Preds[i]);
+
+    // Update the original PHI.
+    PN->setIncomingValue(Idx, NewPN);
+  }
+}
+
+/// SplitCriticalEdge - If this edge is a critical edge, insert a new node to
+/// split the critical edge.  This will update DominatorTree information if it
+/// is available, thus calling this pass will not invalidate either of them.
+/// This returns the new block if the edge was split, null otherwise.
+///
+/// If MergeIdenticalEdges is true (not the default), *all* edges from TI to the
+/// specified successor will be merged into the same critical edge block.
+/// This is most commonly interesting with switch instructions, which may
+/// have many edges to any one destination.  This ensures that all edges to that
+/// dest go to one block instead of each going to a different block, but isn't
+/// the standard definition of a "critical edge".
+///
+/// It is invalid to call this function on a critical edge that starts at an
+/// IndirectBrInst.  Splitting these edges will almost always create an invalid
+/// program because the address of the new block won't be the one that is jumped
+/// to.
+///
+BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
+                                    Pass *P, bool MergeIdenticalEdges,
+                                    bool DontDeleteUselessPhis,
+                                    bool SplitLandingPads) {
+  if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return 0;
+
+  assert(!isa<IndirectBrInst>(TI) &&
+         "Cannot split critical edge from IndirectBrInst");
+
+  BasicBlock *TIBB = TI->getParent();
+  BasicBlock *DestBB = TI->getSuccessor(SuccNum);
+
+  // Splitting the critical edge to a landing pad block is non-trivial. Don't do
+  // it in this generic function.
+  if (DestBB->isLandingPad()) return 0;
+
+  // Create a new basic block, linking it into the CFG.
+  BasicBlock *NewBB = BasicBlock::Create(TI->getContext(),
+                      TIBB->getName() + "." + DestBB->getName() + "_crit_edge");
+  // Create our unconditional branch.
+  BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
+  NewBI->setDebugLoc(TI->getDebugLoc());
+
+  // Branch to the new block, breaking the edge.
+  TI->setSuccessor(SuccNum, NewBB);
+
+  // Insert the block into the function... right after the block TI lives in.
+  Function &F = *TIBB->getParent();
+  Function::iterator FBBI = TIBB;
+  F.getBasicBlockList().insert(++FBBI, NewBB);
+
+  // If there are any PHI nodes in DestBB, we need to update them so that they
+  // merge incoming values from NewBB instead of from TIBB.
+  {
+    unsigned BBIdx = 0;
+    for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+      // We no longer enter through TIBB, now we come in through NewBB.
+      // Revector exactly one entry in the PHI node that used to come from
+      // TIBB to come from NewBB.
+      PHINode *PN = cast<PHINode>(I);
+
+      // Reuse the previous value of BBIdx if it lines up.  In cases where we
+      // have multiple phi nodes with *lots* of predecessors, this is a speed
+      // win because we don't have to scan the PHI looking for TIBB.  This
+      // happens because the BB list of PHI nodes are usually in the same
+      // order.
+      if (PN->getIncomingBlock(BBIdx) != TIBB)
+        BBIdx = PN->getBasicBlockIndex(TIBB);
+      PN->setIncomingBlock(BBIdx, NewBB);
+    }
+  }
+
+  // If there are any other edges from TIBB to DestBB, update those to go
+  // through the split block, making those edges non-critical as well (and
+  // reducing the number of phi entries in the DestBB if relevant).
+  if (MergeIdenticalEdges) {
+    for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
+      if (TI->getSuccessor(i) != DestBB) continue;
+
+      // Remove an entry for TIBB from DestBB phi nodes.
+      DestBB->removePredecessor(TIBB, DontDeleteUselessPhis);
+
+      // We found another edge to DestBB, go to NewBB instead.
+      TI->setSuccessor(i, NewBB);
+    }
+  }
+
+
+
+  // If we don't have a pass object, we can't update anything...
+  if (P == 0) return NewBB;
+
+  DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
+  LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
+  ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
+
+  // If we have nothing to update, just return.
+  if (DT == 0 && LI == 0 && PI == 0)
+    return NewBB;
+
+  // Now update analysis information.  Since the only predecessor of NewBB is
+  // the TIBB, TIBB clearly dominates NewBB.  TIBB usually doesn't dominate
+  // anything, as there are other successors of DestBB.  However, if all other
+  // predecessors of DestBB are already dominated by DestBB (e.g. DestBB is a
+  // loop header) then NewBB dominates DestBB.
+  SmallVector<BasicBlock*, 8> OtherPreds;
+
+  // If there is a PHI in the block, loop over predecessors with it, which is
+  // faster than iterating pred_begin/end.
+  if (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (PN->getIncomingBlock(i) != NewBB)
+        OtherPreds.push_back(PN->getIncomingBlock(i));
+  } else {
+    for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB);
+         I != E; ++I) {
+      BasicBlock *P = *I;
+      if (P != NewBB)
+        OtherPreds.push_back(P);
+    }
+  }
+
+  bool NewBBDominatesDestBB = true;
+
+  // Should we update DominatorTree information?
+  if (DT) {
+    DomTreeNode *TINode = DT->getNode(TIBB);
+
+    // The new block is not the immediate dominator for any other nodes, but
+    // TINode is the immediate dominator for the new node.
+    //
+    if (TINode) {       // Don't break unreachable code!
+      DomTreeNode *NewBBNode = DT->addNewBlock(NewBB, TIBB);
+      DomTreeNode *DestBBNode = 0;
+
+      // If NewBBDominatesDestBB hasn't been computed yet, do so with DT.
+      if (!OtherPreds.empty()) {
+        DestBBNode = DT->getNode(DestBB);
+        while (!OtherPreds.empty() && NewBBDominatesDestBB) {
+          if (DomTreeNode *OPNode = DT->getNode(OtherPreds.back()))
+            NewBBDominatesDestBB = DT->dominates(DestBBNode, OPNode);
+          OtherPreds.pop_back();
+        }
+        OtherPreds.clear();
+      }
+
+      // If NewBBDominatesDestBB, then NewBB dominates DestBB, otherwise it
+      // doesn't dominate anything.
+      if (NewBBDominatesDestBB) {
+        if (!DestBBNode) DestBBNode = DT->getNode(DestBB);
+        DT->changeImmediateDominator(DestBBNode, NewBBNode);
+      }
+    }
+  }
+
+  // Update LoopInfo if it is around.
+  if (LI) {
+    if (Loop *TIL = LI->getLoopFor(TIBB)) {
+      // If one or the other blocks were not in a loop, the new block is not
+      // either, and thus LI doesn't need to be updated.
+      if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
+        if (TIL == DestLoop) {
+          // Both in the same loop, the NewBB joins loop.
+          DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+        } else if (TIL->contains(DestLoop)) {
+          // Edge from an outer loop to an inner loop.  Add to the outer loop.
+          TIL->addBasicBlockToLoop(NewBB, LI->getBase());
+        } else if (DestLoop->contains(TIL)) {
+          // Edge from an inner loop to an outer loop.  Add to the outer loop.
+          DestLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+        } else {
+          // Edge from two loops with no containment relation.  Because these
+          // are natural loops, we know that the destination block must be the
+          // header of its loop (adding a branch into a loop elsewhere would
+          // create an irreducible loop).
+          assert(DestLoop->getHeader() == DestBB &&
+                 "Should not create irreducible loops!");
+          if (Loop *P = DestLoop->getParentLoop())
+            P->addBasicBlockToLoop(NewBB, LI->getBase());
+        }
+      }
+      // If TIBB is in a loop and DestBB is outside of that loop, split the
+      // other exit blocks of the loop that also have predecessors outside
+      // the loop, to maintain a LoopSimplify guarantee.
+      if (!TIL->contains(DestBB) &&
+          P->mustPreserveAnalysisID(LoopSimplifyID)) {
+        assert(!TIL->contains(NewBB) &&
+               "Split point for loop exit is contained in loop!");
+
+        // Update LCSSA form in the newly created exit block.
+        if (P->mustPreserveAnalysisID(LCSSAID))
+          createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);
+
+        // For each unique exit block...
+        // FIXME: This code is functionally equivalent to the corresponding
+        // loop in LoopSimplify.
+        SmallVector<BasicBlock *, 4> ExitBlocks;
+        TIL->getExitBlocks(ExitBlocks);
+        for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+          // Collect all the preds that are inside the loop, and note
+          // whether there are any preds outside the loop.
+          SmallVector<BasicBlock *, 4> Preds;
+          bool HasPredOutsideOfLoop = false;
+          BasicBlock *Exit = ExitBlocks[i];
+          for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit);
+               I != E; ++I) {
+            BasicBlock *P = *I;
+            if (TIL->contains(P)) {
+              if (isa<IndirectBrInst>(P->getTerminator())) {
+                Preds.clear();
+                break;
+              }
+              Preds.push_back(P);
+            } else {
+              HasPredOutsideOfLoop = true;
+            }
+          }
+          // If there are any preds not in the loop, we'll need to split
+          // the edges. The Preds.empty() check is needed because a block
+          // may appear multiple times in the list. We can't use
+          // getUniqueExitBlocks above because that depends on LoopSimplify
+          // form, which we're in the process of restoring!
+          if (!Preds.empty() && HasPredOutsideOfLoop) {
+            if (!Exit->isLandingPad()) {
+              BasicBlock *NewExitBB =
+                SplitBlockPredecessors(Exit, Preds, "split", P);
+              if (P->mustPreserveAnalysisID(LCSSAID))
+                createPHIsForSplitLoopExit(Preds, NewExitBB, Exit);
+            } else if (SplitLandingPads) {
+              SmallVector<BasicBlock*, 8> NewBBs;
+              SplitLandingPadPredecessors(Exit, Preds,
+                                          ".split1", ".split2",
+                                          P, NewBBs);
+              if (P->mustPreserveAnalysisID(LCSSAID))
+                createPHIsForSplitLoopExit(Preds, NewBBs[0], Exit);
+            }
+          }
+        }
+      }
+      // LCSSA form was updated above for the case where LoopSimplify is
+      // available, which means that all predecessors of loop exit blocks
+      // are within the loop. Without LoopSimplify form, it would be
+      // necessary to insert a new phi.
+      assert((!P->mustPreserveAnalysisID(LCSSAID) ||
+              P->mustPreserveAnalysisID(LoopSimplifyID)) &&
+             "SplitCriticalEdge doesn't know how to update LCCSA form "
+             "without LoopSimplify!");
+    }
+  }
+
+  // Update ProfileInfo if it is around.
+  if (PI)
+    PI->splitEdge(TIBB, DestBB, NewBB, MergeIdenticalEdges);
+
+  return NewBB;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
new file mode 100644
index 000000000000..6d13217df55d
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -0,0 +1,597 @@
+//===- BuildLibCalls.cpp - Utility builder for libcalls -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some functions that will create standard C libcalls.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+
+using namespace llvm;
+
+/// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
+Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
+  return B.CreateBitCast(V, B.getInt8PtrTy(), "cstr");
+}
+
+/// EmitStrLen - Emit a call to the strlen function to the builder, for the
+/// specified pointer.  This always returns an integer value of size intptr_t.
+Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
+                        const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::strlen))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[2];
+  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            ArrayRef<Attribute::AttrKind>(AVs, 2));
+
+  LLVMContext &Context = B.GetInsertBlock()->getContext();
+  Constant *StrLen = M->getOrInsertFunction("strlen",
+                                            AttributeSet::get(M->getContext(),
+                                                              AS),
+                                            TD->getIntPtrType(Context),
+                                            B.getInt8PtrTy(),
+                                            NULL);
+  CallInst *CI = B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen");
+  if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+
+  return CI;
+}
+
+/// EmitStrNLen - Emit a call to the strnlen function to the builder, for the
+/// specified pointer.  Ptr is required to be some pointer type, MaxLen must
+/// be of size_t type, and the return value has 'intptr_t' type.
+Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
+                         const DataLayout *TD, const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::strnlen))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[2];
+  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            ArrayRef<Attribute::AttrKind>(AVs, 2));
+
+  LLVMContext &Context = B.GetInsertBlock()->getContext();
+  Constant *StrNLen = M->getOrInsertFunction("strnlen",
+                                             AttributeSet::get(M->getContext(),
+                                                              AS),
+                                             TD->getIntPtrType(Context),
+                                             B.getInt8PtrTy(),
+                                             TD->getIntPtrType(Context),
+                                             NULL);
+  CallInst *CI = B.CreateCall2(StrNLen, CastToCStr(Ptr, B), MaxLen, "strnlen");
+  if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+
+  return CI;
+}
+
+/// EmitStrChr - Emit a call to the strchr function to the builder, for the
+/// specified pointer and character.  Ptr is required to be some pointer type,
+/// and the return value has 'i8*' type.
+Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
+                        const DataLayout *TD, const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::strchr))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AttributeSet AS =
+    AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                      ArrayRef<Attribute::AttrKind>(AVs, 2));
+
+  Type *I8Ptr = B.getInt8PtrTy();
+  Type *I32Ty = B.getInt32Ty();
+  Constant *StrChr = M->getOrInsertFunction("strchr",
+                                            AttributeSet::get(M->getContext(),
+                                                             AS),
+                                            I8Ptr, I8Ptr, I32Ty, NULL);
+  CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B),
+                               ConstantInt::get(I32Ty, C), "strchr");
+  if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+  return CI;
+}
+
+/// EmitStrNCmp - Emit a call to the strncmp function to the builder.
+Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
+                         IRBuilder<> &B, const DataLayout *TD,
+                         const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::strncmp))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[3];
+  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+  AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            ArrayRef<Attribute::AttrKind>(AVs, 2));
+
+  LLVMContext &Context = B.GetInsertBlock()->getContext();
+  Value *StrNCmp = M->getOrInsertFunction("strncmp",
+                                          AttributeSet::get(M->getContext(),
+                                                           AS),
+                                          B.getInt32Ty(),
+                                          B.getInt8PtrTy(),
+                                          B.getInt8PtrTy(),
+                                          TD->getIntPtrType(Context), NULL);
+  CallInst *CI = B.CreateCall3(StrNCmp, CastToCStr(Ptr1, B),
+                               CastToCStr(Ptr2, B), Len, "strncmp");
+
+  if (const Function *F = dyn_cast<Function>(StrNCmp->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+
+  return CI;
+}
+
+/// EmitStrCpy - Emit a call to the strcpy function to the builder, for the
+/// specified pointer arguments.
+Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
+                        const DataLayout *TD, const TargetLibraryInfo *TLI,
+                        StringRef Name) {
+  if (!TLI->has(LibFunc::strcpy))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[2];
+  AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            Attribute::NoUnwind);
+  Type *I8Ptr = B.getInt8PtrTy();
+  Value *StrCpy = M->getOrInsertFunction(Name,
+                                         AttributeSet::get(M->getContext(), AS),
+                                         I8Ptr, I8Ptr, I8Ptr, NULL);
+  CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
+                               Name);
+  if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+  return CI;
+}
+
+/// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the
+/// specified pointer arguments.
+Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
+                         IRBuilder<> &B, const DataLayout *TD,
+                         const TargetLibraryInfo *TLI, StringRef Name) {
+  if (!TLI->has(LibFunc::strncpy))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[2];
+  AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            Attribute::NoUnwind);
+  Type *I8Ptr = B.getInt8PtrTy();
+  Value *StrNCpy = M->getOrInsertFunction(Name,
+                                          AttributeSet::get(M->getContext(),
+                                                            AS),
+                                          I8Ptr, I8Ptr, I8Ptr,
+                                          Len->getType(), NULL);
+  CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
+                               Len, "strncpy");
+  if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+  return CI;
+}
+
+/// EmitMemCpyChk - Emit a call to the __memcpy_chk function to the builder.
+/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src
+/// are pointers.
+Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
+                           IRBuilder<> &B, const DataLayout *TD,
+                           const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::memcpy_chk))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS;
+  AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                         Attribute::NoUnwind);
+  LLVMContext &Context = B.GetInsertBlock()->getContext();
+  Value *MemCpy = M->getOrInsertFunction("__memcpy_chk",
+                                         AttributeSet::get(M->getContext(), AS),
+                                         B.getInt8PtrTy(),
+                                         B.getInt8PtrTy(),
+                                         B.getInt8PtrTy(),
+                                         TD->getIntPtrType(Context),
+                                         TD->getIntPtrType(Context), NULL);
+  Dst = CastToCStr(Dst, B);
+  Src = CastToCStr(Src, B);
+  CallInst *CI = B.CreateCall4(MemCpy, Dst, Src, Len, ObjSize);
+  if (const Function *F = dyn_cast<Function>(MemCpy->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+  return CI;
+}
+
+/// EmitMemChr - Emit a call to the memchr function.  This assumes that Ptr is
+/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
+Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
+                        Value *Len, IRBuilder<> &B, const DataLayout *TD,
+                        const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::memchr))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS;
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                         ArrayRef<Attribute::AttrKind>(AVs, 2));
+  LLVMContext &Context = B.GetInsertBlock()->getContext();
+  Value *MemChr = M->getOrInsertFunction("memchr",
+                                         AttributeSet::get(M->getContext(), AS),
+                                         B.getInt8PtrTy(),
+                                         B.getInt8PtrTy(),
+                                         B.getInt32Ty(),
+                                         TD->getIntPtrType(Context),
+                                         NULL);
+  CallInst *CI = B.CreateCall3(MemChr, CastToCStr(Ptr, B), Val, Len, "memchr");
+
+  if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+
+  return CI;
+}
+
+/// EmitMemCmp - Emit a call to the memcmp function.
+Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
+                        Value *Len, IRBuilder<> &B, const DataLayout *TD,
+                        const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::memcmp))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[3];
+  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+  AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            ArrayRef<Attribute::AttrKind>(AVs, 2));
+
+  LLVMContext &Context = B.GetInsertBlock()->getContext();
+  Value *MemCmp = M->getOrInsertFunction("memcmp",
+                                         AttributeSet::get(M->getContext(), AS),
+                                         B.getInt32Ty(),
+                                         B.getInt8PtrTy(),
+                                         B.getInt8PtrTy(),
+                                         TD->getIntPtrType(Context), NULL);
+  CallInst *CI = B.CreateCall3(MemCmp, CastToCStr(Ptr1, B), CastToCStr(Ptr2, B),
+                               Len, "memcmp");
+
+  if (const Function *F = dyn_cast<Function>(MemCmp->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+
+  return CI;
+}
+
+/// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name' (e.g.
+/// 'floor').  This function is known to take a single of type matching 'Op' and
+/// returns one value with the same type.  If 'Op' is a long double, 'l' is
+/// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
+Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
+                                  const AttributeSet &Attrs) {
+  SmallString<20> NameBuffer;
+  if (!Op->getType()->isDoubleTy()) {
+    // If we need to add a suffix, copy into NameBuffer.
+    NameBuffer += Name;
+    if (Op->getType()->isFloatTy())
+      NameBuffer += 'f'; // floorf
+    else
+      NameBuffer += 'l'; // floorl
+    Name = NameBuffer;
+  }
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
+                                         Op->getType(), NULL);
+  CallInst *CI = B.CreateCall(Callee, Op, Name);
+  CI->setAttributes(Attrs);
+  if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+
+  return CI;
+}
+
+/// EmitPutChar - Emit a call to the putchar function.  This assumes that Char
+/// is an integer.
+Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
+                         const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::putchar))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(),
+                                          B.getInt32Ty(), NULL);
+  CallInst *CI = B.CreateCall(PutChar,
+                              B.CreateIntCast(Char,
+                              B.getInt32Ty(),
+                              /*isSigned*/true,
+                              "chari"),
+                              "putchar");
+
+  if (const Function *F = dyn_cast<Function>(PutChar->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+  return CI;
+}
+
+/// EmitPutS - Emit a call to the puts function.  This assumes that Str is
+/// some pointer.
+Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
+                      const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::puts))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[2];
+  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            Attribute::NoUnwind);
+
+  Value *PutS = M->getOrInsertFunction("puts",
+                                       AttributeSet::get(M->getContext(), AS),
+                                       B.getInt32Ty(),
+                                       B.getInt8PtrTy(),
+                                       NULL);
+  CallInst *CI = B.CreateCall(PutS, CastToCStr(Str, B), "puts");
+  if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
+    CI->setCallingConv(F->getCallingConv());
+  return CI;
+}
+
+/// EmitFPutC - Emit a call to the fputc function.  This assumes that Char is
+/// an integer and File is a pointer to FILE.
+Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
+                       const DataLayout *TD, const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::fputc))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[2];
+  AS[0] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+  AS[1] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            Attribute::NoUnwind);
+  Constant *F;
+  if (File->getType()->isPointerTy())
+    F = M->getOrInsertFunction("fputc",
+                               AttributeSet::get(M->getContext(), AS),
+                               B.getInt32Ty(),
+                               B.getInt32Ty(), File->getType(),
+                               NULL);
+  else
+    F = M->getOrInsertFunction("fputc",
+                               B.getInt32Ty(),
+                               B.getInt32Ty(),
+                               File->getType(), NULL);
+  Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
+                         "chari");
+  CallInst *CI = B.CreateCall2(F, Char, File, "fputc");
+
+  if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+    CI->setCallingConv(Fn->getCallingConv());
+  return CI;
+}
+
+/// EmitFPutS - Emit a call to the puts function.  Str is required to be a
+/// pointer and File is a pointer to FILE.
+Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
+                       const DataLayout *TD, const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::fputs))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[3];
+  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+  AS[1] = AttributeSet::get(M->getContext(), 2, Attribute::NoCapture);
+  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            Attribute::NoUnwind);
+  StringRef FPutsName = TLI->getName(LibFunc::fputs);
+  Constant *F;
+  if (File->getType()->isPointerTy())
+    F = M->getOrInsertFunction(FPutsName,
+                               AttributeSet::get(M->getContext(), AS),
+                               B.getInt32Ty(),
+                               B.getInt8PtrTy(),
+                               File->getType(), NULL);
+  else
+    F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(),
+                               B.getInt8PtrTy(),
+                               File->getType(), NULL);
+  CallInst *CI = B.CreateCall2(F, CastToCStr(Str, B), File, "fputs");
+
+  if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+    CI->setCallingConv(Fn->getCallingConv());
+  return CI;
+}
+
+/// EmitFWrite - Emit a call to the fwrite function.  This assumes that Ptr is
+/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
+Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
+                        IRBuilder<> &B, const DataLayout *TD,
+                        const TargetLibraryInfo *TLI) {
+  if (!TLI->has(LibFunc::fwrite))
+    return 0;
+
+  Module *M = B.GetInsertBlock()->getParent()->getParent();
+  AttributeSet AS[3];
+  AS[0] = AttributeSet::get(M->getContext(), 1, Attribute::NoCapture);
+  AS[1] = AttributeSet::get(M->getContext(), 4, Attribute::NoCapture);
+  AS[2] = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
+                            Attribute::NoUnwind);
+  LLVMContext &Context = B.GetInsertBlock()->getContext();
+  StringRef FWriteName = TLI->getName(LibFunc::fwrite);
+  Constant *F;
+  if (File->getType()->isPointerTy())
+    F = M->getOrInsertFunction(FWriteName,
+                               AttributeSet::get(M->getContext(), AS),
+                               TD->getIntPtrType(Context),
+                               B.getInt8PtrTy(),
+                               TD->getIntPtrType(Context),
+                               TD->getIntPtrType(Context),
+                               File->getType(), NULL);
+  else
+    F = M->getOrInsertFunction(FWriteName, TD->getIntPtrType(Context),
+                               B.getInt8PtrTy(),
+                               TD->getIntPtrType(Context),
+                               TD->getIntPtrType(Context),
+                               File->getType(), NULL);
+  CallInst *CI = B.CreateCall4(F, CastToCStr(Ptr, B), Size,
+                        ConstantInt::get(TD->getIntPtrType(Context), 1), File);
+
+  if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
+    CI->setCallingConv(Fn->getCallingConv());
+  return CI;
+}
+
+SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { }
+
+bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD,
+                                     const TargetLibraryInfo *TLI) {
+  // We really need DataLayout for later.
+  if (!TD) return false;
+  
+  this->CI = CI;
+  Function *Callee = CI->getCalledFunction();
+  StringRef Name = Callee->getName();
+  FunctionType *FT = Callee->getFunctionType();
+  LLVMContext &Context = CI->getParent()->getContext();
+  IRBuilder<> B(CI);
+
+  if (Name == "__memcpy_chk") {
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(Context) ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return false;
+
+    if (isFoldable(3, 2, false)) {
+      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                     CI->getArgOperand(2), 1);
+      replaceCall(CI->getArgOperand(0));
+      return true;
+    }
+    return false;
+  }
+
+  // Should be similar to memcpy.
+  if (Name == "__mempcpy_chk") {
+    return false;
+  }
+
+  if (Name == "__memmove_chk") {
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(Context) ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return false;
+
+    if (isFoldable(3, 2, false)) {
+      B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+                      CI->getArgOperand(2), 1);
+      replaceCall(CI->getArgOperand(0));
+      return true;
+    }
+    return false;
+  }
+
+  if (Name == "__memset_chk") {
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isIntegerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(Context) ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return false;
+
+    if (isFoldable(3, 2, false)) {
+      Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(),
+                                   false);
+      B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+      replaceCall(CI->getArgOperand(0));
+      return true;
+    }
+    return false;
+  }
+
+  if (Name == "__strcpy_chk" || Name == "__stpcpy_chk") {
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 3 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+        FT->getParamType(2) != TD->getIntPtrType(Context))
+      return 0;
+    
+    
+    // If a) we don't have any length information, or b) we know this will
+    // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our
+    // st[rp]cpy_chk call which may fail at runtime if the size is too long.
+    // TODO: It might be nice to get a maximum length out of the possible
+    // string lengths for varying.
+    if (isFoldable(2, 1, true)) {
+      Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD,
+                              TLI, Name.substr(2, 6));
+      if (!Ret)
+        return false;
+      replaceCall(Ret);
+      return true;
+    }
+    return false;
+  }
+
+  if (Name == "__strncpy_chk" || Name == "__stpncpy_chk") {
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+        !FT->getParamType(2)->isIntegerTy() ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return false;
+
+    if (isFoldable(3, 2, false)) {
+      Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                               CI->getArgOperand(2), B, TD, TLI,
+                               Name.substr(2, 7));
+      if (!Ret)
+        return false;
+      replaceCall(Ret);
+      return true;
+    }
+    return false;
+  }
+
+  if (Name == "__strcat_chk") {
+    return false;
+  }
+
+  if (Name == "__strncat_chk") {
+    return false;
+  }
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
new file mode 100644
index 000000000000..1f517d038d19
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -0,0 +1,262 @@
+//===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom 32-bit divides are slow enough that during
+// runtime it is profitable to check the value of the operands, and if they are
+// positive and less than 256 use an unsigned 8-bit divide.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "bypass-slow-division"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+namespace {
+  struct DivOpInfo {
+    bool SignedOp;
+    Value *Dividend;
+    Value *Divisor;
+
+    DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
+      : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
+  };
+
+  struct DivPhiNodes {
+    PHINode *Quotient;
+    PHINode *Remainder;
+
+    DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder)
+      : Quotient(InQuotient), Remainder(InRemainder) {}
+  };
+}
+
+namespace llvm {
+  template<>
+  struct DenseMapInfo<DivOpInfo> {
+    static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
+      return Val1.SignedOp == Val2.SignedOp &&
+             Val1.Dividend == Val2.Dividend &&
+             Val1.Divisor == Val2.Divisor;
+    }
+
+    static DivOpInfo getEmptyKey() {
+      return DivOpInfo(false, 0, 0);
+    }
+
+    static DivOpInfo getTombstoneKey() {
+      return DivOpInfo(true, 0, 0);
+    }
+
+    static unsigned getHashValue(const DivOpInfo &Val) {
+      return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
+                        reinterpret_cast<uintptr_t>(Val.Divisor)) ^
+                        (unsigned)Val.SignedOp;
+    }
+  };
+
+  typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy;
+}
+
+// insertFastDiv - Substitutes the div/rem instruction with code that checks the
+// value of the operands and uses a shorter-faster div/rem instruction when
+// possible and the longer-slower div/rem instruction otherwise.
+static bool insertFastDiv(Function &F,
+                          Function::iterator &I,
+                          BasicBlock::iterator &J,
+                          IntegerType *BypassType,
+                          bool UseDivOp,
+                          bool UseSignedOp,
+                          DivCacheTy &PerBBDivCache) {
+  // Get instruction operands
+  Instruction *Instr = J;
+  Value *Dividend = Instr->getOperand(0);
+  Value *Divisor = Instr->getOperand(1);
+
+  if (isa<ConstantInt>(Divisor) ||
+      (isa<ConstantInt>(Dividend) && isa<ConstantInt>(Divisor))) {
+    // Operations with immediate values should have
+    // been solved and replaced during compile time.
+    return false;
+  }
+
+  // Basic Block is split before divide
+  BasicBlock *MainBB = I;
+  BasicBlock *SuccessorBB = I->splitBasicBlock(J);
+  ++I; //advance iterator I to successorBB
+
+  // Add new basic block for slow divide operation
+  BasicBlock *SlowBB = BasicBlock::Create(F.getContext(), "",
+                                          MainBB->getParent(), SuccessorBB);
+  SlowBB->moveBefore(SuccessorBB);
+  IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
+  Value *SlowQuotientV;
+  Value *SlowRemainderV;
+  if (UseSignedOp) {
+    SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor);
+    SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor);
+  } else {
+    SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor);
+    SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor);
+  }
+  SlowBuilder.CreateBr(SuccessorBB);
+
+  // Add new basic block for fast divide operation
+  BasicBlock *FastBB = BasicBlock::Create(F.getContext(), "",
+                                          MainBB->getParent(), SuccessorBB);
+  FastBB->moveBefore(SlowBB);
+  IRBuilder<> FastBuilder(FastBB, FastBB->begin());
+  Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor,
+                                                BypassType);
+  Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend,
+                                                 BypassType);
+
+  // udiv/urem because optimization only handles positive numbers
+  Value *ShortQuotientV = FastBuilder.CreateExactUDiv(ShortDividendV,
+                                                      ShortDivisorV);
+  Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
+                                                  ShortDivisorV);
+  Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
+                                                ShortQuotientV,
+                                                Dividend->getType());
+  Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt,
+                                                 ShortRemainderV,
+                                                 Dividend->getType());
+  FastBuilder.CreateBr(SuccessorBB);
+
+  // Phi nodes for result of div and rem
+  IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
+  PHINode *QuoPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+  QuoPhi->addIncoming(SlowQuotientV, SlowBB);
+  QuoPhi->addIncoming(FastQuotientV, FastBB);
+  PHINode *RemPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+  RemPhi->addIncoming(SlowRemainderV, SlowBB);
+  RemPhi->addIncoming(FastRemainderV, FastBB);
+
+  // Replace Instr with appropriate phi node
+  if (UseDivOp)
+    Instr->replaceAllUsesWith(QuoPhi);
+  else
+    Instr->replaceAllUsesWith(RemPhi);
+  Instr->eraseFromParent();
+
+  // Combine operands into a single value with OR for value testing below
+  MainBB->getInstList().back().eraseFromParent();
+  IRBuilder<> MainBuilder(MainBB, MainBB->end());
+  Value *OrV = MainBuilder.CreateOr(Dividend, Divisor);
+
+  // BitMask is inverted to check if the operands are
+  // larger than the bypass type
+  uint64_t BitMask = ~BypassType->getBitMask();
+  Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);
+
+  // Compare operand values and branch
+  Value *ZeroV = ConstantInt::getSigned(Dividend->getType(), 0);
+  Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
+  MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
+
+  // point iterator J at first instruction of successorBB
+  J = I->begin();
+
+  // Cache phi nodes to be used later in place of other instances
+  // of div or rem with the same sign, dividend, and divisor
+  DivOpInfo Key(UseSignedOp, Dividend, Divisor);
+  DivPhiNodes Value(QuoPhi, RemPhi);
+  PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value));
+  return true;
+}
+
+// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder if
+// operands and operation are identical. Otherwise call insertFastDiv to perform
+// the optimization and cache the resulting dividend and remainder.
+static bool reuseOrInsertFastDiv(Function &F,
+                                 Function::iterator &I,
+                                 BasicBlock::iterator &J,
+                                 IntegerType *BypassType,
+                                 bool UseDivOp,
+                                 bool UseSignedOp,
+                                 DivCacheTy &PerBBDivCache) {
+  // Get instruction operands
+  Instruction *Instr = J;
+  DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1));
+  DivCacheTy::iterator CacheI = PerBBDivCache.find(Key);
+
+  if (CacheI == PerBBDivCache.end()) {
+    // If previous instance does not exist, insert fast div
+    return insertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp,
+                         PerBBDivCache);
+  }
+
+  // Replace operation value with previously generated phi node
+  DivPhiNodes &Value = CacheI->second;
+  if (UseDivOp) {
+    // Replace all uses of div instruction with quotient phi node
+    J->replaceAllUsesWith(Value.Quotient);
+  } else {
+    // Replace all uses of rem instruction with remainder phi node
+    J->replaceAllUsesWith(Value.Remainder);
+  }
+
+  // Advance to next operation
+  ++J;
+
+  // Remove redundant operation
+  Instr->eraseFromParent();
+  return true;
+}
+
+// bypassSlowDivision - This optimization identifies DIV instructions that can
+// be profitably bypassed and carried out with a shorter, faster divide.
+bool llvm::bypassSlowDivision(Function &F,
+                              Function::iterator &I,
+                              const DenseMap<unsigned int, unsigned int> &BypassWidths) {
+  DivCacheTy DivCache;
+
+  bool MadeChange = false;
+  for (BasicBlock::iterator J = I->begin(); J != I->end(); J++) {
+
+    // Get instruction details
+    unsigned Opcode = J->getOpcode();
+    bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
+    bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
+    bool UseSignedOp = Opcode == Instruction::SDiv ||
+                       Opcode == Instruction::SRem;
+
+    // Only optimize div or rem ops
+    if (!UseDivOp && !UseRemOp)
+      continue;
+
+    // Skip division on vector types, only optimize integer instructions
+    if (!J->getType()->isIntegerTy())
+      continue;
+
+    // Get bitwidth of div/rem instruction
+    IntegerType *T = cast<IntegerType>(J->getType());
+    unsigned int bitwidth = T->getBitWidth();
+
+    // Continue if bitwidth is not bypassed
+    DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth);
+    if (BI == BypassWidths.end())
+      continue;
+
+    // Get type for div/rem instruction with bypass bitwidth
+    IntegerType *BT = IntegerType::get(J->getContext(), BI->second);
+
+    MadeChange |= reuseOrInsertFastDiv(F, I, J, BT, UseDivOp,
+                                       UseSignedOp, DivCache);
+  }
+
+  return MadeChange;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
new file mode 100644
index 000000000000..63d7a1d52aa5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -0,0 +1,576 @@
+//===- CloneFunction.cpp - Clone a function into another function ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneFunctionInto interface, which is used as the
+// low-level function cloner.  This is used by the CloneFunction and function
+// inliner to do the dirty work of copying the body of a function around.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <map>
+using namespace llvm;
+
+// CloneBasicBlock - See comments in Cloning.h
+BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
+                                  ValueToValueMapTy &VMap,
+                                  const Twine &NameSuffix, Function *F,
+                                  ClonedCodeInfo *CodeInfo) {
+  BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
+  if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+  bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+  
+  // Loop over all instructions, and copy them over.
+  for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
+       II != IE; ++II) {
+    Instruction *NewInst = II->clone();
+    if (II->hasName())
+      NewInst->setName(II->getName()+NameSuffix);
+    NewBB->getInstList().push_back(NewInst);
+    VMap[II] = NewInst;                // Add instruction map to value.
+    
+    hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+      if (isa<ConstantInt>(AI->getArraySize()))
+        hasStaticAllocas = true;
+      else
+        hasDynamicAllocas = true;
+    }
+  }
+  
+  if (CodeInfo) {
+    CodeInfo->ContainsCalls          |= hasCalls;
+    CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+    CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && 
+                                        BB != &BB->getParent()->getEntryBlock();
+  }
+  return NewBB;
+}
+
+// Clone OldFunc into NewFunc, transforming the old arguments into references to
+// VMap values.
+//
+void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
+                             ValueToValueMapTy &VMap,
+                             bool ModuleLevelChanges,
+                             SmallVectorImpl<ReturnInst*> &Returns,
+                             const char *NameSuffix, ClonedCodeInfo *CodeInfo,
+                             ValueMapTypeRemapper *TypeMapper) {
+  assert(NameSuffix && "NameSuffix cannot be null!");
+
+#ifndef NDEBUG
+  for (Function::const_arg_iterator I = OldFunc->arg_begin(), 
+       E = OldFunc->arg_end(); I != E; ++I)
+    assert(VMap.count(I) && "No mapping from source argument specified!");
+#endif
+
+  // Clone any attributes.
+  if (NewFunc->arg_size() == OldFunc->arg_size())
+    NewFunc->copyAttributesFrom(OldFunc);
+  else {
+    //Some arguments were deleted with the VMap. Copy arguments one by one
+    for (Function::const_arg_iterator I = OldFunc->arg_begin(), 
+           E = OldFunc->arg_end(); I != E; ++I)
+      if (Argument* Anew = dyn_cast<Argument>(VMap[I])) {
+        AttributeSet attrs = OldFunc->getAttributes()
+          .getParamAttributes(I->getArgNo() + 1);
+        if (attrs.getNumSlots() > 0)
+          Anew->addAttr(attrs);
+      }
+    NewFunc->setAttributes(NewFunc->getAttributes()
+                           .addAttributes(NewFunc->getContext(),
+                                          AttributeSet::ReturnIndex,
+                                          OldFunc->getAttributes()));
+    NewFunc->setAttributes(NewFunc->getAttributes()
+                           .addAttributes(NewFunc->getContext(),
+                                          AttributeSet::FunctionIndex,
+                                          OldFunc->getAttributes()));
+
+  }
+
+  // Loop over all of the basic blocks in the function, cloning them as
+  // appropriate.  Note that we save BE this way in order to handle cloning of
+  // recursive functions into themselves.
+  //
+  for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
+       BI != BE; ++BI) {
+    const BasicBlock &BB = *BI;
+
+    // Create a new basic block and copy instructions into it!
+    BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo);
+
+    // Add basic block mapping.
+    VMap[&BB] = CBB;
+
+    // It is only legal to clone a function if a block address within that
+    // function is never referenced outside of the function.  Given that, we
+    // want to map block addresses from the old function to block addresses in
+    // the clone. (This is different from the generic ValueMapper
+    // implementation, which generates an invalid blockaddress when
+    // cloning a function.)
+    if (BB.hasAddressTaken()) {
+      Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
+                                              const_cast<BasicBlock*>(&BB));
+      VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);                                         
+    }
+
+    // Note return instructions for the caller.
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
+      Returns.push_back(RI);
+  }
+
+  // Loop over all of the instructions in the function, fixing up operand
+  // references as we go.  This uses VMap to do all the hard work.
+  for (Function::iterator BB = cast<BasicBlock>(VMap[OldFunc->begin()]),
+         BE = NewFunc->end(); BB != BE; ++BB)
+    // Loop over all instructions, fixing each one as we find it...
+    for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II)
+      RemapInstruction(II, VMap,
+                       ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+                       TypeMapper);
+}
+
+/// CloneFunction - Return a copy of the specified function, but without
+/// embedding the function into another module.  Also, any references specified
+/// in the VMap are changed to refer to their mapped value instead of the
+/// original one.  If any of the arguments to the function are in the VMap,
+/// the arguments are deleted from the resultant function.  The VMap is
+/// updated to include mappings from all of the instructions and basicblocks in
+/// the function from their old to new values.
+///
+Function *llvm::CloneFunction(const Function *F, ValueToValueMapTy &VMap,
+                              bool ModuleLevelChanges,
+                              ClonedCodeInfo *CodeInfo) {
+  std::vector<Type*> ArgTypes;
+
+  // The user might be deleting arguments to the function by specifying them in
+  // the VMap.  If so, we need to not add the arguments to the arg ty vector
+  //
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I)
+    if (VMap.count(I) == 0)  // Haven't mapped the argument to anything yet?
+      ArgTypes.push_back(I->getType());
+
+  // Create a new function type...
+  FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(),
+                                    ArgTypes, F->getFunctionType()->isVarArg());
+
+  // Create the new function...
+  Function *NewF = Function::Create(FTy, F->getLinkage(), F->getName());
+
+  // Loop over the arguments, copying the names of the mapped arguments over...
+  Function::arg_iterator DestI = NewF->arg_begin();
+  for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I)
+    if (VMap.count(I) == 0) {   // Is this argument preserved?
+      DestI->setName(I->getName()); // Copy the name over...
+      VMap[I] = DestI++;        // Add mapping to VMap
+    }
+
+  SmallVector<ReturnInst*, 8> Returns;  // Ignore returns cloned.
+  CloneFunctionInto(NewF, F, VMap, ModuleLevelChanges, Returns, "", CodeInfo);
+  return NewF;
+}
+
+
+
+namespace {
+  /// PruningFunctionCloner - This class is a private class used to implement
+  /// the CloneAndPruneFunctionInto method.
+  struct PruningFunctionCloner {
+    Function *NewFunc;
+    const Function *OldFunc;
+    ValueToValueMapTy &VMap;
+    bool ModuleLevelChanges;
+    const char *NameSuffix;
+    ClonedCodeInfo *CodeInfo;
+    const DataLayout *TD;
+  public:
+    PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
+                          ValueToValueMapTy &valueMap,
+                          bool moduleLevelChanges,
+                          const char *nameSuffix, 
+                          ClonedCodeInfo *codeInfo,
+                          const DataLayout *td)
+    : NewFunc(newFunc), OldFunc(oldFunc),
+      VMap(valueMap), ModuleLevelChanges(moduleLevelChanges),
+      NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) {
+    }
+
+    /// CloneBlock - The specified block is found to be reachable, clone it and
+    /// anything that it can reach.
+    void CloneBlock(const BasicBlock *BB,
+                    std::vector<const BasicBlock*> &ToClone);
+  };
+}
+
+/// CloneBlock - The specified block is found to be reachable, clone it and
+/// anything that it can reach.
+void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
+                                       std::vector<const BasicBlock*> &ToClone){
+  WeakVH &BBEntry = VMap[BB];
+
+  // Have we already cloned this block?
+  if (BBEntry) return;
+  
+  // Nope, clone it now.
+  BasicBlock *NewBB;
+  BBEntry = NewBB = BasicBlock::Create(BB->getContext());
+  if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+  // It is only legal to clone a function if a block address within that
+  // function is never referenced outside of the function.  Given that, we
+  // want to map block addresses from the old function to block addresses in
+  // the clone. (This is different from the generic ValueMapper
+  // implementation, which generates an invalid blockaddress when
+  // cloning a function.)
+  //
+  // Note that we don't need to fix the mapping for unreachable blocks;
+  // the default mapping there is safe.
+  if (BB->hasAddressTaken()) {
+    Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
+                                            const_cast<BasicBlock*>(BB));
+    VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB);
+  }
+    
+
+  bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+  
+  // Loop over all instructions, and copy them over, DCE'ing as we go.  This
+  // loop doesn't include the terminator.
+  for (BasicBlock::const_iterator II = BB->begin(), IE = --BB->end();
+       II != IE; ++II) {
+    Instruction *NewInst = II->clone();
+
+    // Eagerly remap operands to the newly cloned instruction, except for PHI
+    // nodes for which we defer processing until we update the CFG.
+    if (!isa<PHINode>(NewInst)) {
+      RemapInstruction(NewInst, VMap,
+                       ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+
+      // If we can simplify this instruction to some other value, simply add
+      // a mapping to that value rather than inserting a new instruction into
+      // the basic block.
+      if (Value *V = SimplifyInstruction(NewInst, TD)) {
+        // On the off-chance that this simplifies to an instruction in the old
+        // function, map it back into the new function.
+        if (Value *MappedV = VMap.lookup(V))
+          V = MappedV;
+
+        VMap[II] = V;
+        delete NewInst;
+        continue;
+      }
+    }
+
+    if (II->hasName())
+      NewInst->setName(II->getName()+NameSuffix);
+    VMap[II] = NewInst;                // Add instruction map to value.
+    NewBB->getInstList().push_back(NewInst);
+    hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+      if (isa<ConstantInt>(AI->getArraySize()))
+        hasStaticAllocas = true;
+      else
+        hasDynamicAllocas = true;
+    }
+  }
+  
+  // Finally, clone over the terminator.
+  const TerminatorInst *OldTI = BB->getTerminator();
+  bool TerminatorDone = false;
+  if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
+    if (BI->isConditional()) {
+      // If the condition was a known constant in the callee...
+      ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+      // Or is a known constant in the caller...
+      if (Cond == 0) {
+        Value *V = VMap[BI->getCondition()];
+        Cond = dyn_cast_or_null<ConstantInt>(V);
+      }
+
+      // Constant fold to uncond branch!
+      if (Cond) {
+        BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
+        VMap[OldTI] = BranchInst::Create(Dest, NewBB);
+        ToClone.push_back(Dest);
+        TerminatorDone = true;
+      }
+    }
+  } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
+    // If switching on a value known constant in the caller.
+    ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+    if (Cond == 0) { // Or known constant after constant prop in the callee...
+      Value *V = VMap[SI->getCondition()];
+      Cond = dyn_cast_or_null<ConstantInt>(V);
+    }
+    if (Cond) {     // Constant fold to uncond branch!
+      SwitchInst::ConstCaseIt Case = SI->findCaseValue(Cond);
+      BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor());
+      VMap[OldTI] = BranchInst::Create(Dest, NewBB);
+      ToClone.push_back(Dest);
+      TerminatorDone = true;
+    }
+  }
+  
+  if (!TerminatorDone) {
+    Instruction *NewInst = OldTI->clone();
+    if (OldTI->hasName())
+      NewInst->setName(OldTI->getName()+NameSuffix);
+    NewBB->getInstList().push_back(NewInst);
+    VMap[OldTI] = NewInst;             // Add instruction map to value.
+    
+    // Recursively clone any reachable successor blocks.
+    const TerminatorInst *TI = BB->getTerminator();
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      ToClone.push_back(TI->getSuccessor(i));
+  }
+  
+  if (CodeInfo) {
+    CodeInfo->ContainsCalls          |= hasCalls;
+    CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+    CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas && 
+      BB != &BB->getParent()->front();
+  }
+}
+
+/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
+/// except that it does some simple constant prop and DCE on the fly.  The
+/// effect of this is to copy significantly less code in cases where (for
+/// example) a function call with constant arguments is inlined, and those
+/// constant arguments cause a significant amount of code in the callee to be
+/// dead.  Since this doesn't produce an exact copy of the input, it can't be
+/// used for things like CloneFunction or CloneModule.
+void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
+                                     ValueToValueMapTy &VMap,
+                                     bool ModuleLevelChanges,
+                                     SmallVectorImpl<ReturnInst*> &Returns,
+                                     const char *NameSuffix, 
+                                     ClonedCodeInfo *CodeInfo,
+                                     const DataLayout *TD,
+                                     Instruction *TheCall) {
+  assert(NameSuffix && "NameSuffix cannot be null!");
+  
+#ifndef NDEBUG
+  for (Function::const_arg_iterator II = OldFunc->arg_begin(), 
+       E = OldFunc->arg_end(); II != E; ++II)
+    assert(VMap.count(II) && "No mapping from source argument specified!");
+#endif
+
+  PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
+                            NameSuffix, CodeInfo, TD);
+
+  // Clone the entry block, and anything recursively reachable from it.
+  std::vector<const BasicBlock*> CloneWorklist;
+  CloneWorklist.push_back(&OldFunc->getEntryBlock());
+  while (!CloneWorklist.empty()) {
+    const BasicBlock *BB = CloneWorklist.back();
+    CloneWorklist.pop_back();
+    PFC.CloneBlock(BB, CloneWorklist);
+  }
+  
+  // Loop over all of the basic blocks in the old function.  If the block was
+  // reachable, we have cloned it and the old block is now in the value map:
+  // insert it into the new function in the right order.  If not, ignore it.
+  //
+  // Defer PHI resolution until rest of function is resolved.
+  SmallVector<const PHINode*, 16> PHIToResolve;
+  for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
+       BI != BE; ++BI) {
+    Value *V = VMap[BI];
+    BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
+    if (NewBB == 0) continue;  // Dead block.
+
+    // Add the new block to the new function.
+    NewFunc->getBasicBlockList().push_back(NewBB);
+
+    // Handle PHI nodes specially, as we have to remove references to dead
+    // blocks.
+    for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); I != E; ++I)
+      if (const PHINode *PN = dyn_cast<PHINode>(I))
+        PHIToResolve.push_back(PN);
+      else
+        break;
+
+    // Finally, remap the terminator instructions, as those can't be remapped
+    // until all BBs are mapped.
+    RemapInstruction(NewBB->getTerminator(), VMap,
+                     ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+  }
+  
+  // Defer PHI resolution until rest of function is resolved, PHI resolution
+  // requires the CFG to be up-to-date.
+  for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) {
+    const PHINode *OPN = PHIToResolve[phino];
+    unsigned NumPreds = OPN->getNumIncomingValues();
+    const BasicBlock *OldBB = OPN->getParent();
+    BasicBlock *NewBB = cast<BasicBlock>(VMap[OldBB]);
+
+    // Map operands for blocks that are live and remove operands for blocks
+    // that are dead.
+    for (; phino != PHIToResolve.size() &&
+         PHIToResolve[phino]->getParent() == OldBB; ++phino) {
+      OPN = PHIToResolve[phino];
+      PHINode *PN = cast<PHINode>(VMap[OPN]);
+      for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
+        Value *V = VMap[PN->getIncomingBlock(pred)];
+        if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) {
+          Value *InVal = MapValue(PN->getIncomingValue(pred),
+                                  VMap, 
+                        ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+          assert(InVal && "Unknown input value?");
+          PN->setIncomingValue(pred, InVal);
+          PN->setIncomingBlock(pred, MappedBlock);
+        } else {
+          PN->removeIncomingValue(pred, false);
+          --pred, --e;  // Revisit the next entry.
+        }
+      } 
+    }
+    
+    // The loop above has removed PHI entries for those blocks that are dead
+    // and has updated others.  However, if a block is live (i.e. copied over)
+    // but its terminator has been changed to not go to this block, then our
+    // phi nodes will have invalid entries.  Update the PHI nodes in this
+    // case.
+    PHINode *PN = cast<PHINode>(NewBB->begin());
+    NumPreds = std::distance(pred_begin(NewBB), pred_end(NewBB));
+    if (NumPreds != PN->getNumIncomingValues()) {
+      assert(NumPreds < PN->getNumIncomingValues());
+      // Count how many times each predecessor comes to this block.
+      std::map<BasicBlock*, unsigned> PredCount;
+      for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB);
+           PI != E; ++PI)
+        --PredCount[*PI];
+      
+      // Figure out how many entries to remove from each PHI.
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        ++PredCount[PN->getIncomingBlock(i)];
+      
+      // At this point, the excess predecessor entries are positive in the
+      // map.  Loop over all of the PHIs and remove excess predecessor
+      // entries.
+      BasicBlock::iterator I = NewBB->begin();
+      for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+        for (std::map<BasicBlock*, unsigned>::iterator PCI =PredCount.begin(),
+             E = PredCount.end(); PCI != E; ++PCI) {
+          BasicBlock *Pred     = PCI->first;
+          for (unsigned NumToRemove = PCI->second; NumToRemove; --NumToRemove)
+            PN->removeIncomingValue(Pred, false);
+        }
+      }
+    }
+    
+    // If the loops above have made these phi nodes have 0 or 1 operand,
+    // replace them with undef or the input value.  We must do this for
+    // correctness, because 0-operand phis are not valid.
+    PN = cast<PHINode>(NewBB->begin());
+    if (PN->getNumIncomingValues() == 0) {
+      BasicBlock::iterator I = NewBB->begin();
+      BasicBlock::const_iterator OldI = OldBB->begin();
+      while ((PN = dyn_cast<PHINode>(I++))) {
+        Value *NV = UndefValue::get(PN->getType());
+        PN->replaceAllUsesWith(NV);
+        assert(VMap[OldI] == PN && "VMap mismatch");
+        VMap[OldI] = NV;
+        PN->eraseFromParent();
+        ++OldI;
+      }
+    }
+  }
+
+  // Make a second pass over the PHINodes now that all of them have been
+  // remapped into the new function, simplifying the PHINode and performing any
+  // recursive simplifications exposed. This will transparently update the
+  // WeakVH in the VMap. Notably, we rely on that so that if we coalesce
+  // two PHINodes, the iteration over the old PHIs remains valid, and the
+  // mapping will just map us to the new node (which may not even be a PHI
+  // node).
+  for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
+    if (PHINode *PN = dyn_cast<PHINode>(VMap[PHIToResolve[Idx]]))
+      recursivelySimplifyInstruction(PN, TD);
+
+  // Now that the inlined function body has been fully constructed, go through
+  // and zap unconditional fall-through branches.  This happen all the time when
+  // specializing code: code specialization turns conditional branches into
+  // uncond branches, and this code folds them.
+  Function::iterator Begin = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]);
+  Function::iterator I = Begin;
+  while (I != NewFunc->end()) {
+    // Check if this block has become dead during inlining or other
+    // simplifications. Note that the first block will appear dead, as it has
+    // not yet been wired up properly.
+    if (I != Begin && (pred_begin(I) == pred_end(I) ||
+                       I->getSinglePredecessor() == I)) {
+      BasicBlock *DeadBB = I++;
+      DeleteDeadBlock(DeadBB);
+      continue;
+    }
+
+    // We need to simplify conditional branches and switches with a constant
+    // operand. We try to prune these out when cloning, but if the
+    // simplification required looking through PHI nodes, those are only
+    // available after forming the full basic block. That may leave some here,
+    // and we still want to prune the dead code as early as possible.
+    ConstantFoldTerminator(I);
+
+    BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
+    if (!BI || BI->isConditional()) { ++I; continue; }
+    
+    BasicBlock *Dest = BI->getSuccessor(0);
+    if (!Dest->getSinglePredecessor()) {
+      ++I; continue;
+    }
+
+    // We shouldn't be able to get single-entry PHI nodes here, as instsimplify
+    // above should have zapped all of them..
+    assert(!isa<PHINode>(Dest->begin()));
+
+    // We know all single-entry PHI nodes in the inlined function have been
+    // removed, so we just need to splice the blocks.
+    BI->eraseFromParent();
+    
+    // Make all PHI nodes that referred to Dest now refer to I as their source.
+    Dest->replaceAllUsesWith(I);
+
+    // Move all the instructions in the succ to the pred.
+    I->getInstList().splice(I->end(), Dest->getInstList());
+    
+    // Remove the dest block.
+    Dest->eraseFromParent();
+    
+    // Do not increment I, iteratively merge all things this block branches to.
+  }
+
+  // Make a final pass over the basic blocks from theh old function to gather
+  // any return instructions which survived folding. We have to do this here
+  // because we can iteratively remove and merge returns above.
+  for (Function::iterator I = cast<BasicBlock>(VMap[&OldFunc->getEntryBlock()]),
+                          E = NewFunc->end();
+       I != E; ++I)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator()))
+      Returns.push_back(RI);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
new file mode 100644
index 000000000000..64df089e1b81
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -0,0 +1,122 @@
+//===- CloneModule.cpp - Clone an entire module ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneModule interface which makes a copy of an
+// entire module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+/// CloneModule - Return an exact copy of the specified module.  This is not as
+/// easy as it might seem because we have to worry about making copies of global
+/// variables and functions, and making their (initializers and references,
+/// respectively) refer to the right globals.
+///
+Module *llvm::CloneModule(const Module *M) {
+  // Create the value map that maps things from the old module over to the new
+  // module.
+  ValueToValueMapTy VMap;
+  return CloneModule(M, VMap);
+}
+
+Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
+  // First off, we need to create the new module.
+  Module *New = new Module(M->getModuleIdentifier(), M->getContext());
+  New->setDataLayout(M->getDataLayout());
+  New->setTargetTriple(M->getTargetTriple());
+  New->setModuleInlineAsm(M->getModuleInlineAsm());
+   
+  // Loop over all of the global variables, making corresponding globals in the
+  // new module.  Here we add them to the VMap and to the new Module.  We
+  // don't worry about attributes or initializers, they will come later.
+  //
+  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+       I != E; ++I) {
+    GlobalVariable *GV = new GlobalVariable(*New, 
+                                            I->getType()->getElementType(),
+                                            I->isConstant(), I->getLinkage(),
+                                            (Constant*) 0, I->getName(),
+                                            (GlobalVariable*) 0,
+                                            I->getThreadLocalMode(),
+                                            I->getType()->getAddressSpace());
+    GV->copyAttributesFrom(I);
+    VMap[I] = GV;
+  }
+
+  // Loop over the functions in the module, making external functions as before
+  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+    Function *NF =
+      Function::Create(cast<FunctionType>(I->getType()->getElementType()),
+                       I->getLinkage(), I->getName(), New);
+    NF->copyAttributesFrom(I);
+    VMap[I] = NF;
+  }
+
+  // Loop over the aliases in the module
+  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+       I != E; ++I) {
+    GlobalAlias *GA = new GlobalAlias(I->getType(), I->getLinkage(),
+                                      I->getName(), NULL, New);
+    GA->copyAttributesFrom(I);
+    VMap[I] = GA;
+  }
+  
+  // Now that all of the things that global variable initializer can refer to
+  // have been created, loop through and copy the global variable referrers
+  // over...  We also set the attributes on the global now.
+  //
+  for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
+       I != E; ++I) {
+    GlobalVariable *GV = cast<GlobalVariable>(VMap[I]);
+    if (I->hasInitializer())
+      GV->setInitializer(MapValue(I->getInitializer(), VMap));
+  }
+
+  // Similarly, copy over function bodies now...
+  //
+  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
+    Function *F = cast<Function>(VMap[I]);
+    if (!I->isDeclaration()) {
+      Function::arg_iterator DestI = F->arg_begin();
+      for (Function::const_arg_iterator J = I->arg_begin(); J != I->arg_end();
+           ++J) {
+        DestI->setName(J->getName());
+        VMap[J] = DestI++;
+      }
+
+      SmallVector<ReturnInst*, 8> Returns;  // Ignore returns cloned.
+      CloneFunctionInto(F, I, VMap, /*ModuleLevelChanges=*/true, Returns);
+    }
+  }
+
+  // And aliases
+  for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
+       I != E; ++I) {
+    GlobalAlias *GA = cast<GlobalAlias>(VMap[I]);
+    if (const Constant *C = I->getAliasee())
+      GA->setAliasee(MapValue(C, VMap));
+  }
+
+  // And named metadata....
+  for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
+         E = M->named_metadata_end(); I != E; ++I) {
+    const NamedMDNode &NMD = *I;
+    NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
+    for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
+      NewNMD->addOperand(MapValue(NMD.getOperand(i), VMap));
+  }
+
+  return New;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
new file mode 100644
index 000000000000..8fa412a18b99
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
@@ -0,0 +1,96 @@
+//===- CmpInstAnalysis.cpp - Utils to help fold compares ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file holds routines to help analyse compare instructions
+// and fold them into constants or other compare instructions
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+/// getICmpCode - Encode a icmp predicate into a three bit mask.  These bits
+/// are carefully arranged to allow folding of expressions such as:
+///
+///      (A < B) | (A > B) --> (A != B)
+///
+/// Note that this is only valid if the first and second predicates have the
+/// same sign. Is illegal to do: (A u< B) | (A s> B)
+///
+/// Three bits are used to represent the condition, as follows:
+///   0  A > B
+///   1  A == B
+///   2  A < B
+///
+/// <=>  Value  Definition
+/// 000     0   Always false
+/// 001     1   A >  B
+/// 010     2   A == B
+/// 011     3   A >= B
+/// 100     4   A <  B
+/// 101     5   A != B
+/// 110     6   A <= B
+/// 111     7   Always true
+///
+unsigned llvm::getICmpCode(const ICmpInst *ICI, bool InvertPred) {
+  ICmpInst::Predicate Pred = InvertPred ? ICI->getInversePredicate()
+                                        : ICI->getPredicate();
+  switch (Pred) {
+      // False -> 0
+    case ICmpInst::ICMP_UGT: return 1;  // 001
+    case ICmpInst::ICMP_SGT: return 1;  // 001
+    case ICmpInst::ICMP_EQ:  return 2;  // 010
+    case ICmpInst::ICMP_UGE: return 3;  // 011
+    case ICmpInst::ICMP_SGE: return 3;  // 011
+    case ICmpInst::ICMP_ULT: return 4;  // 100
+    case ICmpInst::ICMP_SLT: return 4;  // 100
+    case ICmpInst::ICMP_NE:  return 5;  // 101
+    case ICmpInst::ICMP_ULE: return 6;  // 110
+    case ICmpInst::ICMP_SLE: return 6;  // 110
+      // True -> 7
+    default:
+      llvm_unreachable("Invalid ICmp predicate!");
+  }
+}
+
+/// getICmpValue - This is the complement of getICmpCode, which turns an
+/// opcode and two operands into either a constant true or false, or the
+/// predicate for a new ICmp instruction. The sign is passed in to determine
+/// which kind of predicate to use in the new icmp instruction.
+/// Non-NULL return value will be a true or false constant.
+/// NULL return means a new ICmp is needed.  The predicate for which is
+/// output in NewICmpPred.
+Value *llvm::getICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
+                          CmpInst::Predicate &NewICmpPred) {
+  switch (Code) {
+    default: llvm_unreachable("Illegal ICmp code!");
+    case 0: // False.
+      return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+    case 1: NewICmpPred = Sign ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
+    case 2: NewICmpPred = ICmpInst::ICMP_EQ; break;
+    case 3: NewICmpPred = Sign ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
+    case 4: NewICmpPred = Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
+    case 5: NewICmpPred = ICmpInst::ICMP_NE; break;
+    case 6: NewICmpPred = Sign ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
+    case 7: // True.
+      return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
+  }
+  return NULL;
+}
+
+/// PredicatesFoldable - Return true if both predicates match sign or if at
+/// least one of them is an equality comparison (which is signless).
+bool llvm::PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) {
+  return (CmpInst::isSigned(p1) == CmpInst::isSigned(p2)) ||
+         (CmpInst::isSigned(p1) && ICmpInst::isEquality(p2)) ||
+         (CmpInst::isSigned(p2) && ICmpInst::isEquality(p1));
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
new file mode 100644
index 000000000000..f7c659f2193b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -0,0 +1,780 @@
+//===- CodeExtractor.cpp - Pull code region into a new function -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interface to tear out a code region, such as an
+// individual loop or a parallel section, into a new function, replacing it with
+// a call to the new function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+// Provide a command-line option to aggregate function arguments into a struct
+// for functions produced by the code extractor. This is useful when converting
+// extracted functions to pthread-based code, as only one argument (void*) can
+// be passed in to pthread_create().
+static cl::opt<bool>
+AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
+                 cl::desc("Aggregate arguments to code-extracted functions"));
+
+/// \brief Test whether a block is valid for extraction.
+static bool isBlockValidForExtraction(const BasicBlock &BB) {
+  // Landing pads must be in the function where they were inserted for cleanup.
+  if (BB.isLandingPad())
+    return false;
+
+  // Don't hoist code containing allocas, invokes, or vastarts.
+  for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+    if (isa<AllocaInst>(I) || isa<InvokeInst>(I))
+      return false;
+    if (const CallInst *CI = dyn_cast<CallInst>(I))
+      if (const Function *F = CI->getCalledFunction())
+        if (F->getIntrinsicID() == Intrinsic::vastart)
+          return false;
+  }
+
+  return true;
+}
+
+/// \brief Build a set of blocks to extract if the input blocks are viable.
+template <typename IteratorT>
+static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin,
+                                                       IteratorT BBEnd) {
+  SetVector<BasicBlock *> Result;
+
+  assert(BBBegin != BBEnd);
+
+  // Loop over the blocks, adding them to our set-vector, and aborting with an
+  // empty set if we encounter invalid blocks.
+  for (IteratorT I = BBBegin, E = BBEnd; I != E; ++I) {
+    if (!Result.insert(*I))
+      llvm_unreachable("Repeated basic blocks in extraction input");
+
+    if (!isBlockValidForExtraction(**I)) {
+      Result.clear();
+      return Result;
+    }
+  }
+
+#ifndef NDEBUG
+  for (SetVector<BasicBlock *>::iterator I = llvm::next(Result.begin()),
+                                         E = Result.end();
+       I != E; ++I)
+    for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I);
+         PI != PE; ++PI)
+      assert(Result.count(*PI) &&
+             "No blocks in this region may have entries from outside the region"
+             " except for the first block!");
+#endif
+
+  return Result;
+}
+
+/// \brief Helper to call buildExtractionBlockSet with an ArrayRef.
+static SetVector<BasicBlock *>
+buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs) {
+  return buildExtractionBlockSet(BBs.begin(), BBs.end());
+}
+
+/// \brief Helper to call buildExtractionBlockSet with a RegionNode.
+static SetVector<BasicBlock *>
+buildExtractionBlockSet(const RegionNode &RN) {
+  if (!RN.isSubRegion())
+    // Just a single BasicBlock.
+    return buildExtractionBlockSet(RN.getNodeAs<BasicBlock>());
+
+  const Region &R = *RN.getNodeAs<Region>();
+
+  return buildExtractionBlockSet(R.block_begin(), R.block_end());
+}
+
+CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs)
+  : DT(0), AggregateArgs(AggregateArgs||AggregateArgsOpt),
+    Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {}
+
+CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
+                             bool AggregateArgs)
+  : DT(DT), AggregateArgs(AggregateArgs||AggregateArgsOpt),
+    Blocks(buildExtractionBlockSet(BBs)), NumExitBlocks(~0U) {}
+
+CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs)
+  : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt),
+    Blocks(buildExtractionBlockSet(L.getBlocks())), NumExitBlocks(~0U) {}
+
+CodeExtractor::CodeExtractor(DominatorTree &DT, const RegionNode &RN,
+                             bool AggregateArgs)
+  : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt),
+    Blocks(buildExtractionBlockSet(RN)), NumExitBlocks(~0U) {}
+
+/// definedInRegion - Return true if the specified value is defined in the
+/// extracted region.
+static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (Blocks.count(I->getParent()))
+      return true;
+  return false;
+}
+
+/// definedInCaller - Return true if the specified value is defined in the
+/// function being code extracted, but not in the region being extracted.
+/// These values must be passed in as live-ins to the function.
+static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
+  if (isa<Argument>(V)) return true;
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (!Blocks.count(I->getParent()))
+      return true;
+  return false;
+}
+
+void CodeExtractor::findInputsOutputs(ValueSet &Inputs,
+                                      ValueSet &Outputs) const {
+  for (SetVector<BasicBlock *>::const_iterator I = Blocks.begin(),
+                                               E = Blocks.end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+
+    // If a used value is defined outside the region, it's an input.  If an
+    // instruction is used outside the region, it's an output.
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
+         II != IE; ++II) {
+      for (User::op_iterator OI = II->op_begin(), OE = II->op_end();
+           OI != OE; ++OI)
+        if (definedInCaller(Blocks, *OI))
+          Inputs.insert(*OI);
+
+      for (Value::use_iterator UI = II->use_begin(), UE = II->use_end();
+           UI != UE; ++UI)
+        if (!definedInRegion(Blocks, *UI)) {
+          Outputs.insert(II);
+          break;
+        }
+    }
+  }
+}
+
+/// severSplitPHINodes - If a PHI node has multiple inputs from outside of the
+/// region, we need to split the entry block of the region so that the PHI node
+/// is easier to deal with.
+void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
+  unsigned NumPredsFromRegion = 0;
+  unsigned NumPredsOutsideRegion = 0;
+
+  if (Header != &Header->getParent()->getEntryBlock()) {
+    PHINode *PN = dyn_cast<PHINode>(Header->begin());
+    if (!PN) return;  // No PHI nodes.
+
+    // If the header node contains any PHI nodes, check to see if there is more
+    // than one entry from outside the region.  If so, we need to sever the
+    // header block into two.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (Blocks.count(PN->getIncomingBlock(i)))
+        ++NumPredsFromRegion;
+      else
+        ++NumPredsOutsideRegion;
+
+    // If there is one (or fewer) predecessor from outside the region, we don't
+    // need to do anything special.
+    if (NumPredsOutsideRegion <= 1) return;
+  }
+
+  // Otherwise, we need to split the header block into two pieces: one
+  // containing PHI nodes merging values from outside of the region, and a
+  // second that contains all of the code for the block and merges back any
+  // incoming values from inside of the region.
+  BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI();
+  BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs,
+                                              Header->getName()+".ce");
+
+  // We only want to code extract the second block now, and it becomes the new
+  // header of the region.
+  BasicBlock *OldPred = Header;
+  Blocks.remove(OldPred);
+  Blocks.insert(NewBB);
+  Header = NewBB;
+
+  // Okay, update dominator sets. The blocks that dominate the new one are the
+  // blocks that dominate TIBB plus the new block itself.
+  if (DT)
+    DT->splitBlock(NewBB);
+
+  // Okay, now we need to adjust the PHI nodes and any branches from within the
+  // region to go to the new header block instead of the old header block.
+  if (NumPredsFromRegion) {
+    PHINode *PN = cast<PHINode>(OldPred->begin());
+    // Loop over all of the predecessors of OldPred that are in the region,
+    // changing them to branch to NewBB instead.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (Blocks.count(PN->getIncomingBlock(i))) {
+        TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator();
+        TI->replaceUsesOfWith(OldPred, NewBB);
+      }
+
+    // Okay, everything within the region is now branching to the right block, we
+    // just have to update the PHI nodes now, inserting PHI nodes into NewBB.
+    for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
+      PHINode *PN = cast<PHINode>(AfterPHIs);
+      // Create a new PHI node in the new region, which has an incoming value
+      // from OldPred of PN.
+      PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion,
+                                       PN->getName()+".ce", NewBB->begin());
+      NewPN->addIncoming(PN, OldPred);
+
+      // Loop over all of the incoming value in PN, moving them to NewPN if they
+      // are from the extracted region.
+      for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+        if (Blocks.count(PN->getIncomingBlock(i))) {
+          NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
+          PN->removeIncomingValue(i);
+          --i;
+        }
+      }
+    }
+  }
+}
+
+void CodeExtractor::splitReturnBlocks() {
+  for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end();
+       I != E; ++I)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) {
+      BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret");
+      if (DT) {
+        // Old dominates New. New node dominates all other nodes dominated
+        // by Old.
+        DomTreeNode *OldNode = DT->getNode(*I);
+        SmallVector<DomTreeNode*, 8> Children;
+        for (DomTreeNode::iterator DI = OldNode->begin(), DE = OldNode->end();
+             DI != DE; ++DI) 
+          Children.push_back(*DI);
+
+        DomTreeNode *NewNode = DT->addNewBlock(New, *I);
+
+        for (SmallVector<DomTreeNode*, 8>::iterator I = Children.begin(),
+               E = Children.end(); I != E; ++I) 
+          DT->changeImmediateDominator(*I, NewNode);
+      }
+    }
+}
+
+/// constructFunction - make a function based on inputs and outputs, as follows:
+/// f(in0, ..., inN, out0, ..., outN)
+///
+Function *CodeExtractor::constructFunction(const ValueSet &inputs,
+                                           const ValueSet &outputs,
+                                           BasicBlock *header,
+                                           BasicBlock *newRootNode,
+                                           BasicBlock *newHeader,
+                                           Function *oldFunction,
+                                           Module *M) {
+  DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
+  DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
+
+  // This function returns unsigned, outputs will go back by reference.
+  switch (NumExitBlocks) {
+  case 0:
+  case 1: RetTy = Type::getVoidTy(header->getContext()); break;
+  case 2: RetTy = Type::getInt1Ty(header->getContext()); break;
+  default: RetTy = Type::getInt16Ty(header->getContext()); break;
+  }
+
+  std::vector<Type*> paramTy;
+
+  // Add the types of the input values to the function's argument list
+  for (ValueSet::const_iterator i = inputs.begin(), e = inputs.end();
+       i != e; ++i) {
+    const Value *value = *i;
+    DEBUG(dbgs() << "value used in func: " << *value << "\n");
+    paramTy.push_back(value->getType());
+  }
+
+  // Add the types of the output values to the function's argument list.
+  for (ValueSet::const_iterator I = outputs.begin(), E = outputs.end();
+       I != E; ++I) {
+    DEBUG(dbgs() << "instr used in func: " << **I << "\n");
+    if (AggregateArgs)
+      paramTy.push_back((*I)->getType());
+    else
+      paramTy.push_back(PointerType::getUnqual((*I)->getType()));
+  }
+
+  DEBUG(dbgs() << "Function type: " << *RetTy << " f(");
+  for (std::vector<Type*>::iterator i = paramTy.begin(),
+         e = paramTy.end(); i != e; ++i)
+    DEBUG(dbgs() << **i << ", ");
+  DEBUG(dbgs() << ")\n");
+
+  if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+    PointerType *StructPtr =
+           PointerType::getUnqual(StructType::get(M->getContext(), paramTy));
+    paramTy.clear();
+    paramTy.push_back(StructPtr);
+  }
+  FunctionType *funcType =
+                  FunctionType::get(RetTy, paramTy, false);
+
+  // Create the new function
+  Function *newFunction = Function::Create(funcType,
+                                           GlobalValue::InternalLinkage,
+                                           oldFunction->getName() + "_" +
+                                           header->getName(), M);
+  // If the old function is no-throw, so is the new one.
+  if (oldFunction->doesNotThrow())
+    newFunction->setDoesNotThrow();
+  
+  newFunction->getBasicBlockList().push_back(newRootNode);
+
+  // Create an iterator to name all of the arguments we inserted.
+  Function::arg_iterator AI = newFunction->arg_begin();
+
+  // Rewrite all users of the inputs in the extracted region to use the
+  // arguments (or appropriate addressing into struct) instead.
+  for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+    Value *RewriteVal;
+    if (AggregateArgs) {
+      Value *Idx[2];
+      Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
+      Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
+      TerminatorInst *TI = newFunction->begin()->getTerminator();
+      GetElementPtrInst *GEP = 
+        GetElementPtrInst::Create(AI, Idx, "gep_" + inputs[i]->getName(), TI);
+      RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI);
+    } else
+      RewriteVal = AI++;
+
+    std::vector<User*> Users(inputs[i]->use_begin(), inputs[i]->use_end());
+    for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end();
+         use != useE; ++use)
+      if (Instruction* inst = dyn_cast<Instruction>(*use))
+        if (Blocks.count(inst->getParent()))
+          inst->replaceUsesOfWith(inputs[i], RewriteVal);
+  }
+
+  // Set names for input and output arguments.
+  if (!AggregateArgs) {
+    AI = newFunction->arg_begin();
+    for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
+      AI->setName(inputs[i]->getName());
+    for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
+      AI->setName(outputs[i]->getName()+".out");
+  }
+
+  // Rewrite branches to basic blocks outside of the loop to new dummy blocks
+  // within the new function. This must be done before we lose track of which
+  // blocks were originally in the code region.
+  std::vector<User*> Users(header->use_begin(), header->use_end());
+  for (unsigned i = 0, e = Users.size(); i != e; ++i)
+    // The BasicBlock which contains the branch is not in the region
+    // modify the branch target to a new block
+    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i]))
+      if (!Blocks.count(TI->getParent()) &&
+          TI->getParent()->getParent() == oldFunction)
+        TI->replaceUsesOfWith(header, newHeader);
+
+  return newFunction;
+}
+
+/// FindPhiPredForUseInBlock - Given a value and a basic block, find a PHI
+/// that uses the value within the basic block, and return the predecessor
+/// block associated with that use, or return 0 if none is found.
+static BasicBlock* FindPhiPredForUseInBlock(Value* Used, BasicBlock* BB) {
+  for (Value::use_iterator UI = Used->use_begin(),
+       UE = Used->use_end(); UI != UE; ++UI) {
+     PHINode *P = dyn_cast<PHINode>(*UI);
+     if (P && P->getParent() == BB)
+       return P->getIncomingBlock(UI);
+  }
+  
+  return 0;
+}
+
+/// emitCallAndSwitchStatement - This method sets up the caller side by adding
+/// the call instruction, splitting any PHI nodes in the header block as
+/// necessary.
+void CodeExtractor::
+emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
+                           ValueSet &inputs, ValueSet &outputs) {
+  // Emit a call to the new function, passing in: *pointer to struct (if
+  // aggregating parameters), or plan inputs and allocated memory for outputs
+  std::vector<Value*> params, StructValues, ReloadOutputs, Reloads;
+  
+  LLVMContext &Context = newFunction->getContext();
+
+  // Add inputs as params, or to be filled into the struct
+  for (ValueSet::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i)
+    if (AggregateArgs)
+      StructValues.push_back(*i);
+    else
+      params.push_back(*i);
+
+  // Create allocas for the outputs
+  for (ValueSet::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) {
+    if (AggregateArgs) {
+      StructValues.push_back(*i);
+    } else {
+      AllocaInst *alloca =
+        new AllocaInst((*i)->getType(), 0, (*i)->getName()+".loc",
+                       codeReplacer->getParent()->begin()->begin());
+      ReloadOutputs.push_back(alloca);
+      params.push_back(alloca);
+    }
+  }
+
+  AllocaInst *Struct = 0;
+  if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+    std::vector<Type*> ArgTypes;
+    for (ValueSet::iterator v = StructValues.begin(),
+           ve = StructValues.end(); v != ve; ++v)
+      ArgTypes.push_back((*v)->getType());
+
+    // Allocate a struct at the beginning of this function
+    Type *StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
+    Struct =
+      new AllocaInst(StructArgTy, 0, "structArg",
+                     codeReplacer->getParent()->begin()->begin());
+    params.push_back(Struct);
+
+    for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+      Value *Idx[2];
+      Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+      Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
+      GetElementPtrInst *GEP =
+        GetElementPtrInst::Create(Struct, Idx,
+                                  "gep_" + StructValues[i]->getName());
+      codeReplacer->getInstList().push_back(GEP);
+      StoreInst *SI = new StoreInst(StructValues[i], GEP);
+      codeReplacer->getInstList().push_back(SI);
+    }
+  }
+
+  // Emit the call to the function
+  CallInst *call = CallInst::Create(newFunction, params,
+                                    NumExitBlocks > 1 ? "targetBlock" : "");
+  codeReplacer->getInstList().push_back(call);
+
+  Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
+  unsigned FirstOut = inputs.size();
+  if (!AggregateArgs)
+    std::advance(OutputArgBegin, inputs.size());
+
+  // Reload the outputs passed in by reference
+  for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+    Value *Output = 0;
+    if (AggregateArgs) {
+      Value *Idx[2];
+      Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+      Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+      GetElementPtrInst *GEP
+        = GetElementPtrInst::Create(Struct, Idx,
+                                    "gep_reload_" + outputs[i]->getName());
+      codeReplacer->getInstList().push_back(GEP);
+      Output = GEP;
+    } else {
+      Output = ReloadOutputs[i];
+    }
+    LoadInst *load = new LoadInst(Output, outputs[i]->getName()+".reload");
+    Reloads.push_back(load);
+    codeReplacer->getInstList().push_back(load);
+    std::vector<User*> Users(outputs[i]->use_begin(), outputs[i]->use_end());
+    for (unsigned u = 0, e = Users.size(); u != e; ++u) {
+      Instruction *inst = cast<Instruction>(Users[u]);
+      if (!Blocks.count(inst->getParent()))
+        inst->replaceUsesOfWith(outputs[i], load);
+    }
+  }
+
+  // Now we can emit a switch statement using the call as a value.
+  SwitchInst *TheSwitch =
+      SwitchInst::Create(Constant::getNullValue(Type::getInt16Ty(Context)),
+                         codeReplacer, 0, codeReplacer);
+
+  // Since there may be multiple exits from the original region, make the new
+  // function return an unsigned, switch on that number.  This loop iterates
+  // over all of the blocks in the extracted region, updating any terminator
+  // instructions in the to-be-extracted region that branch to blocks that are
+  // not in the region to be extracted.
+  std::map<BasicBlock*, BasicBlock*> ExitBlockMap;
+
+  unsigned switchVal = 0;
+  for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(),
+         e = Blocks.end(); i != e; ++i) {
+    TerminatorInst *TI = (*i)->getTerminator();
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      if (!Blocks.count(TI->getSuccessor(i))) {
+        BasicBlock *OldTarget = TI->getSuccessor(i);
+        // add a new basic block which returns the appropriate value
+        BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
+        if (!NewTarget) {
+          // If we don't already have an exit stub for this non-extracted
+          // destination, create one now!
+          NewTarget = BasicBlock::Create(Context,
+                                         OldTarget->getName() + ".exitStub",
+                                         newFunction);
+          unsigned SuccNum = switchVal++;
+
+          Value *brVal = 0;
+          switch (NumExitBlocks) {
+          case 0:
+          case 1: break;  // No value needed.
+          case 2:         // Conditional branch, return a bool
+            brVal = ConstantInt::get(Type::getInt1Ty(Context), !SuccNum);
+            break;
+          default:
+            brVal = ConstantInt::get(Type::getInt16Ty(Context), SuccNum);
+            break;
+          }
+
+          ReturnInst *NTRet = ReturnInst::Create(Context, brVal, NewTarget);
+
+          // Update the switch instruction.
+          TheSwitch->addCase(ConstantInt::get(Type::getInt16Ty(Context),
+                                              SuccNum),
+                             OldTarget);
+
+          // Restore values just before we exit
+          Function::arg_iterator OAI = OutputArgBegin;
+          for (unsigned out = 0, e = outputs.size(); out != e; ++out) {
+            // For an invoke, the normal destination is the only one that is
+            // dominated by the result of the invocation
+            BasicBlock *DefBlock = cast<Instruction>(outputs[out])->getParent();
+
+            bool DominatesDef = true;
+
+            if (InvokeInst *Invoke = dyn_cast<InvokeInst>(outputs[out])) {
+              DefBlock = Invoke->getNormalDest();
+
+              // Make sure we are looking at the original successor block, not
+              // at a newly inserted exit block, which won't be in the dominator
+              // info.
+              for (std::map<BasicBlock*, BasicBlock*>::iterator I =
+                     ExitBlockMap.begin(), E = ExitBlockMap.end(); I != E; ++I)
+                if (DefBlock == I->second) {
+                  DefBlock = I->first;
+                  break;
+                }
+
+              // In the extract block case, if the block we are extracting ends
+              // with an invoke instruction, make sure that we don't emit a
+              // store of the invoke value for the unwind block.
+              if (!DT && DefBlock != OldTarget)
+                DominatesDef = false;
+            }
+
+            if (DT) {
+              DominatesDef = DT->dominates(DefBlock, OldTarget);
+              
+              // If the output value is used by a phi in the target block,
+              // then we need to test for dominance of the phi's predecessor
+              // instead.  Unfortunately, this a little complicated since we
+              // have already rewritten uses of the value to uses of the reload.
+              BasicBlock* pred = FindPhiPredForUseInBlock(Reloads[out], 
+                                                          OldTarget);
+              if (pred && DT && DT->dominates(DefBlock, pred))
+                DominatesDef = true;
+            }
+
+            if (DominatesDef) {
+              if (AggregateArgs) {
+                Value *Idx[2];
+                Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+                Idx[1] = ConstantInt::get(Type::getInt32Ty(Context),
+                                          FirstOut+out);
+                GetElementPtrInst *GEP =
+                  GetElementPtrInst::Create(OAI, Idx,
+                                            "gep_" + outputs[out]->getName(),
+                                            NTRet);
+                new StoreInst(outputs[out], GEP, NTRet);
+              } else {
+                new StoreInst(outputs[out], OAI, NTRet);
+              }
+            }
+            // Advance output iterator even if we don't emit a store
+            if (!AggregateArgs) ++OAI;
+          }
+        }
+
+        // rewrite the original branch instruction with this new target
+        TI->setSuccessor(i, NewTarget);
+      }
+  }
+
+  // Now that we've done the deed, simplify the switch instruction.
+  Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
+  switch (NumExitBlocks) {
+  case 0:
+    // There are no successors (the block containing the switch itself), which
+    // means that previously this was the last part of the function, and hence
+    // this should be rewritten as a `ret'
+
+    // Check if the function should return a value
+    if (OldFnRetTy->isVoidTy()) {
+      ReturnInst::Create(Context, 0, TheSwitch);  // Return void
+    } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
+      // return what we have
+      ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch);
+    } else {
+      // Otherwise we must have code extracted an unwind or something, just
+      // return whatever we want.
+      ReturnInst::Create(Context, 
+                         Constant::getNullValue(OldFnRetTy), TheSwitch);
+    }
+
+    TheSwitch->eraseFromParent();
+    break;
+  case 1:
+    // Only a single destination, change the switch into an unconditional
+    // branch.
+    BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
+    TheSwitch->eraseFromParent();
+    break;
+  case 2:
+    BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
+                       call, TheSwitch);
+    TheSwitch->eraseFromParent();
+    break;
+  default:
+    // Otherwise, make the default destination of the switch instruction be one
+    // of the other successors.
+    TheSwitch->setCondition(call);
+    TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
+    // Remove redundant case
+    SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1);
+    TheSwitch->removeCase(ToBeRemoved);
+    break;
+  }
+}
+
+void CodeExtractor::moveCodeToFunction(Function *newFunction) {
+  Function *oldFunc = (*Blocks.begin())->getParent();
+  Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
+  Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
+
+  for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(),
+         e = Blocks.end(); i != e; ++i) {
+    // Delete the basic block from the old function, and the list of blocks
+    oldBlocks.remove(*i);
+
+    // Insert this basic block into the new function
+    newBlocks.push_back(*i);
+  }
+}
+
+Function *CodeExtractor::extractCodeRegion() {
+  if (!isEligible())
+    return 0;
+
+  ValueSet inputs, outputs;
+
+  // Assumption: this is a single-entry code region, and the header is the first
+  // block in the region.
+  BasicBlock *header = *Blocks.begin();
+
+  // If we have to split PHI nodes or the entry block, do so now.
+  severSplitPHINodes(header);
+
+  // If we have any return instructions in the region, split those blocks so
+  // that the return is not in the region.
+  splitReturnBlocks();
+
+  Function *oldFunction = header->getParent();
+
+  // This takes place of the original loop
+  BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(), 
+                                                "codeRepl", oldFunction,
+                                                header);
+
+  // The new function needs a root node because other nodes can branch to the
+  // head of the region, but the entry node of a function cannot have preds.
+  BasicBlock *newFuncRoot = BasicBlock::Create(header->getContext(), 
+                                               "newFuncRoot");
+  newFuncRoot->getInstList().push_back(BranchInst::Create(header));
+
+  // Find inputs to, outputs from the code region.
+  findInputsOutputs(inputs, outputs);
+
+  SmallPtrSet<BasicBlock *, 1> ExitBlocks;
+  for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end();
+       I != E; ++I)
+    for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI)
+      if (!Blocks.count(*SI))
+        ExitBlocks.insert(*SI);
+  NumExitBlocks = ExitBlocks.size();
+
+  // Construct new function based on inputs/outputs & add allocas for all defs.
+  Function *newFunction = constructFunction(inputs, outputs, header,
+                                            newFuncRoot,
+                                            codeReplacer, oldFunction,
+                                            oldFunction->getParent());
+
+  emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs);
+
+  moveCodeToFunction(newFunction);
+
+  // Loop over all of the PHI nodes in the header block, and change any
+  // references to the old incoming edge to be the new incoming edge.
+  for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (!Blocks.count(PN->getIncomingBlock(i)))
+        PN->setIncomingBlock(i, newFuncRoot);
+  }
+
+  // Look at all successors of the codeReplacer block.  If any of these blocks
+  // had PHI nodes in them, we need to update the "from" block to be the code
+  // replacer, not the original block in the extracted region.
+  std::vector<BasicBlock*> Succs(succ_begin(codeReplacer),
+                                 succ_end(codeReplacer));
+  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+    for (BasicBlock::iterator I = Succs[i]->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PN = cast<PHINode>(I);
+      std::set<BasicBlock*> ProcessedPreds;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (Blocks.count(PN->getIncomingBlock(i))) {
+          if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second)
+            PN->setIncomingBlock(i, codeReplacer);
+          else {
+            // There were multiple entries in the PHI for this block, now there
+            // is only one, so remove the duplicated entries.
+            PN->removeIncomingValue(i, false);
+            --i; --e;
+          }
+        }
+    }
+
+  //cerr << "NEW FUNCTION: " << *newFunction;
+  //  verifyFunction(*newFunction);
+
+  //  cerr << "OLD FUNCTION: " << *oldFunction;
+  //  verifyFunction(*oldFunction);
+
+  DEBUG(if (verifyFunction(*newFunction)) 
+        report_fatal_error("verifyFunction failed!"));
+  return newFunction;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
new file mode 100644
index 000000000000..db525cdc24d8
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -0,0 +1,148 @@
+//===- DemoteRegToStack.cpp - Move a virtual register to the stack --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+using namespace llvm;
+
+/// DemoteRegToStack - This function takes a virtual register computed by an
+/// Instruction and replaces it with a slot in the stack frame, allocated via
+/// alloca.  This allows the CFG to be changed around without fear of
+/// invalidating the SSA information for the value.  It returns the pointer to
+/// the alloca inserted to create a stack slot for I.
+AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
+                                   Instruction *AllocaPoint) {
+  if (I.use_empty()) {
+    I.eraseFromParent();
+    return 0;
+  }
+
+  // Create a stack slot to hold the value.
+  AllocaInst *Slot;
+  if (AllocaPoint) {
+    Slot = new AllocaInst(I.getType(), 0,
+                          I.getName()+".reg2mem", AllocaPoint);
+  } else {
+    Function *F = I.getParent()->getParent();
+    Slot = new AllocaInst(I.getType(), 0, I.getName()+".reg2mem",
+                          F->getEntryBlock().begin());
+  }
+
+  // Change all of the users of the instruction to read from the stack slot.
+  while (!I.use_empty()) {
+    Instruction *U = cast<Instruction>(I.use_back());
+    if (PHINode *PN = dyn_cast<PHINode>(U)) {
+      // If this is a PHI node, we can't insert a load of the value before the
+      // use.  Instead insert the load in the predecessor block corresponding
+      // to the incoming value.
+      //
+      // Note that if there are multiple edges from a basic block to this PHI
+      // node that we cannot have multiple loads. The problem is that the
+      // resulting PHI node will have multiple values (from each load) coming in
+      // from the same block, which is illegal SSA form. For this reason, we
+      // keep track of and reuse loads we insert.
+      DenseMap<BasicBlock*, Value*> Loads;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingValue(i) == &I) {
+          Value *&V = Loads[PN->getIncomingBlock(i)];
+          if (V == 0) {
+            // Insert the load into the predecessor block
+            V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads,
+                             PN->getIncomingBlock(i)->getTerminator());
+          }
+          PN->setIncomingValue(i, V);
+        }
+
+    } else {
+      // If this is a normal instruction, just insert a load.
+      Value *V = new LoadInst(Slot, I.getName()+".reload", VolatileLoads, U);
+      U->replaceUsesOfWith(&I, V);
+    }
+  }
+
+
+  // Insert stores of the computed value into the stack slot. We have to be
+  // careful if I is an invoke instruction, because we can't insert the store
+  // AFTER the terminator instruction.
+  BasicBlock::iterator InsertPt;
+  if (!isa<TerminatorInst>(I)) {
+    InsertPt = &I;
+    ++InsertPt;
+  } else {
+    InvokeInst &II = cast<InvokeInst>(I);
+    if (II.getNormalDest()->getSinglePredecessor())
+      InsertPt = II.getNormalDest()->getFirstInsertionPt();
+    else {
+      // We cannot demote invoke instructions to the stack if their normal edge
+      // is critical.  Therefore, split the critical edge and insert the store
+      // in the newly created basic block.
+      unsigned SuccNum = GetSuccessorNumber(I.getParent(), II.getNormalDest());
+      TerminatorInst *TI = &cast<TerminatorInst>(I);
+      assert (isCriticalEdge(TI, SuccNum) &&
+              "Expected a critical edge!");
+      BasicBlock *BB = SplitCriticalEdge(TI, SuccNum);
+      assert (BB && "Unable to split critical edge.");
+      InsertPt = BB->getFirstInsertionPt();
+    }
+  }
+
+  for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt)
+    /* empty */;   // Don't insert before PHI nodes or landingpad instrs.
+
+  new StoreInst(&I, Slot, InsertPt);
+  return Slot;
+}
+
+/// DemotePHIToStack - This function takes a virtual register computed by a PHI
+/// node and replaces it with a slot in the stack frame allocated via alloca.
+/// The PHI node is deleted. It returns the pointer to the alloca inserted.
+AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
+  if (P->use_empty()) {
+    P->eraseFromParent();
+    return 0;
+  }
+
+  // Create a stack slot to hold the value.
+  AllocaInst *Slot;
+  if (AllocaPoint) {
+    Slot = new AllocaInst(P->getType(), 0,
+                          P->getName()+".reg2mem", AllocaPoint);
+  } else {
+    Function *F = P->getParent()->getParent();
+    Slot = new AllocaInst(P->getType(), 0, P->getName()+".reg2mem",
+                          F->getEntryBlock().begin());
+  }
+
+  // Iterate over each operand inserting a store in each predecessor.
+  for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+    if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) {
+      assert(II->getParent() != P->getIncomingBlock(i) &&
+             "Invoke edge not supported yet"); (void)II;
+    }
+    new StoreInst(P->getIncomingValue(i), Slot,
+                  P->getIncomingBlock(i)->getTerminator());
+  }
+
+  // Insert a load in place of the PHI and replace all uses.
+  BasicBlock::iterator InsertPt = P;
+
+  for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt)
+    /* empty */;   // Don't insert before PHI nodes or landingpad instrs.
+
+  Value *V = new LoadInst(Slot, P->getName()+".reload", InsertPt);
+  P->replaceAllUsesWith(V);
+
+  // Delete PHI.
+  P->eraseFromParent();
+  return Slot;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
new file mode 100644
index 000000000000..e9828d60cd55
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -0,0 +1,916 @@
+//===- InlineFunction.cpp - Code to perform function inlining -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements inlining of a function into a call site, resolving
+// parameters and the return value as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI,
+                          bool InsertLifetime) {
+  return InlineFunction(CallSite(CI), IFI, InsertLifetime);
+}
+bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI,
+                          bool InsertLifetime) {
+  return InlineFunction(CallSite(II), IFI, InsertLifetime);
+}
+
+namespace {
+  /// A class for recording information about inlining through an invoke.
+  class InvokeInliningInfo {
+    BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind.
+    BasicBlock *InnerResumeDest; ///< Destination for the callee's resume.
+    LandingPadInst *CallerLPad;  ///< LandingPadInst associated with the invoke.
+    PHINode *InnerEHValuesPHI;   ///< PHI for EH values from landingpad insts.
+    SmallVector<Value*, 8> UnwindDestPHIValues;
+
+  public:
+    InvokeInliningInfo(InvokeInst *II)
+      : OuterResumeDest(II->getUnwindDest()), InnerResumeDest(0),
+        CallerLPad(0), InnerEHValuesPHI(0) {
+      // If there are PHI nodes in the unwind destination block, we need to keep
+      // track of which values came into them from the invoke before removing
+      // the edge from this block.
+      llvm::BasicBlock *InvokeBB = II->getParent();
+      BasicBlock::iterator I = OuterResumeDest->begin();
+      for (; isa<PHINode>(I); ++I) {
+        // Save the value to use for this edge.
+        PHINode *PHI = cast<PHINode>(I);
+        UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
+      }
+
+      CallerLPad = cast<LandingPadInst>(I);
+    }
+
+    /// getOuterResumeDest - The outer unwind destination is the target of
+    /// unwind edges introduced for calls within the inlined function.
+    BasicBlock *getOuterResumeDest() const {
+      return OuterResumeDest;
+    }
+
+    BasicBlock *getInnerResumeDest();
+
+    LandingPadInst *getLandingPadInst() const { return CallerLPad; }
+
+    /// forwardResume - Forward the 'resume' instruction to the caller's landing
+    /// pad block. When the landing pad block has only one predecessor, this is
+    /// a simple branch. When there is more than one predecessor, we need to
+    /// split the landing pad block after the landingpad instruction and jump
+    /// to there.
+    void forwardResume(ResumeInst *RI,
+                       SmallPtrSet<LandingPadInst*, 16> &InlinedLPads);
+
+    /// addIncomingPHIValuesFor - Add incoming-PHI values to the unwind
+    /// destination block for the given basic block, using the values for the
+    /// original invoke's source block.
+    void addIncomingPHIValuesFor(BasicBlock *BB) const {
+      addIncomingPHIValuesForInto(BB, OuterResumeDest);
+    }
+
+    void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const {
+      BasicBlock::iterator I = dest->begin();
+      for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+        PHINode *phi = cast<PHINode>(I);
+        phi->addIncoming(UnwindDestPHIValues[i], src);
+      }
+    }
+  };
+}
+
+/// getInnerResumeDest - Get or create a target for the branch from ResumeInsts.
+BasicBlock *InvokeInliningInfo::getInnerResumeDest() {
+  if (InnerResumeDest) return InnerResumeDest;
+
+  // Split the landing pad.
+  BasicBlock::iterator SplitPoint = CallerLPad; ++SplitPoint;
+  InnerResumeDest =
+    OuterResumeDest->splitBasicBlock(SplitPoint,
+                                     OuterResumeDest->getName() + ".body");
+
+  // The number of incoming edges we expect to the inner landing pad.
+  const unsigned PHICapacity = 2;
+
+  // Create corresponding new PHIs for all the PHIs in the outer landing pad.
+  BasicBlock::iterator InsertPoint = InnerResumeDest->begin();
+  BasicBlock::iterator I = OuterResumeDest->begin();
+  for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+    PHINode *OuterPHI = cast<PHINode>(I);
+    PHINode *InnerPHI = PHINode::Create(OuterPHI->getType(), PHICapacity,
+                                        OuterPHI->getName() + ".lpad-body",
+                                        InsertPoint);
+    OuterPHI->replaceAllUsesWith(InnerPHI);
+    InnerPHI->addIncoming(OuterPHI, OuterResumeDest);
+  }
+
+  // Create a PHI for the exception values.
+  InnerEHValuesPHI = PHINode::Create(CallerLPad->getType(), PHICapacity,
+                                     "eh.lpad-body", InsertPoint);
+  CallerLPad->replaceAllUsesWith(InnerEHValuesPHI);
+  InnerEHValuesPHI->addIncoming(CallerLPad, OuterResumeDest);
+
+  // All done.
+  return InnerResumeDest;
+}
+
+/// forwardResume - Forward the 'resume' instruction to the caller's landing pad
+/// block. When the landing pad block has only one predecessor, this is a simple
+/// branch. When there is more than one predecessor, we need to split the
+/// landing pad block after the landingpad instruction and jump to there.
+void InvokeInliningInfo::forwardResume(ResumeInst *RI,
+                               SmallPtrSet<LandingPadInst*, 16> &InlinedLPads) {
+  BasicBlock *Dest = getInnerResumeDest();
+  LandingPadInst *OuterLPad = getLandingPadInst();
+  BasicBlock *Src = RI->getParent();
+
+  BranchInst::Create(Dest, Src);
+
+  // Update the PHIs in the destination. They were inserted in an order which
+  // makes this work.
+  addIncomingPHIValuesForInto(Src, Dest);
+
+  InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src);
+  RI->eraseFromParent();
+
+  // Append the clauses from the outer landing pad instruction into the inlined
+  // landing pad instructions.
+  for (SmallPtrSet<LandingPadInst*, 16>::iterator I = InlinedLPads.begin(),
+         E = InlinedLPads.end(); I != E; ++I) {
+    LandingPadInst *InlinedLPad = *I;
+    for (unsigned OuterIdx = 0, OuterNum = OuterLPad->getNumClauses();
+         OuterIdx != OuterNum; ++OuterIdx)
+      InlinedLPad->addClause(OuterLPad->getClause(OuterIdx));
+  }
+}
+
+/// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into
+/// an invoke, we have to turn all of the calls that can throw into
+/// invokes.  This function analyze BB to see if there are any calls, and if so,
+/// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI
+/// nodes in that block with the values specified in InvokeDestPHIValues.
+///
+/// Returns true to indicate that the next block should be skipped.
+static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
+                                                   InvokeInliningInfo &Invoke) {
+  LandingPadInst *LPI = Invoke.getLandingPadInst();
+
+  for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+    Instruction *I = BBI++;
+
+    if (LandingPadInst *L = dyn_cast<LandingPadInst>(I)) {
+      unsigned NumClauses = LPI->getNumClauses();
+      L->reserveClauses(NumClauses);
+      for (unsigned i = 0; i != NumClauses; ++i)
+        L->addClause(LPI->getClause(i));
+    }
+
+    // We only need to check for function calls: inlined invoke
+    // instructions require no special handling.
+    CallInst *CI = dyn_cast<CallInst>(I);
+
+    // If this call cannot unwind, don't convert it to an invoke.
+    if (!CI || CI->doesNotThrow())
+      continue;
+
+    // Convert this function call into an invoke instruction.  First, split the
+    // basic block.
+    BasicBlock *Split = BB->splitBasicBlock(CI, CI->getName()+".noexc");
+
+    // Delete the unconditional branch inserted by splitBasicBlock
+    BB->getInstList().pop_back();
+
+    // Create the new invoke instruction.
+    ImmutableCallSite CS(CI);
+    SmallVector<Value*, 8> InvokeArgs(CS.arg_begin(), CS.arg_end());
+    InvokeInst *II = InvokeInst::Create(CI->getCalledValue(), Split,
+                                        Invoke.getOuterResumeDest(),
+                                        InvokeArgs, CI->getName(), BB);
+    II->setCallingConv(CI->getCallingConv());
+    II->setAttributes(CI->getAttributes());
+    
+    // Make sure that anything using the call now uses the invoke!  This also
+    // updates the CallGraph if present, because it uses a WeakVH.
+    CI->replaceAllUsesWith(II);
+
+    // Delete the original call
+    Split->getInstList().pop_front();
+
+    // Update any PHI nodes in the exceptional block to indicate that there is
+    // now a new entry in them.
+    Invoke.addIncomingPHIValuesFor(BB);
+    return false;
+  }
+
+  return false;
+}
+
+/// HandleInlinedInvoke - If we inlined an invoke site, we need to convert calls
+/// in the body of the inlined function into invokes.
+///
+/// II is the invoke instruction being inlined.  FirstNewBlock is the first
+/// block of the inlined code (the last block is the end of the function),
+/// and InlineCodeInfo is information about the code that got inlined.
+static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
+                                ClonedCodeInfo &InlinedCodeInfo) {
+  BasicBlock *InvokeDest = II->getUnwindDest();
+
+  Function *Caller = FirstNewBlock->getParent();
+
+  // The inlined code is currently at the end of the function, scan from the
+  // start of the inlined code to its end, checking for stuff we need to
+  // rewrite.
+  InvokeInliningInfo Invoke(II);
+
+  // Get all of the inlined landing pad instructions.
+  SmallPtrSet<LandingPadInst*, 16> InlinedLPads;
+  for (Function::iterator I = FirstNewBlock, E = Caller->end(); I != E; ++I)
+    if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator()))
+      InlinedLPads.insert(II->getLandingPadInst());
+
+  for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){
+    if (InlinedCodeInfo.ContainsCalls)
+      if (HandleCallsInBlockInlinedThroughInvoke(BB, Invoke)) {
+        // Honor a request to skip the next block.
+        ++BB;
+        continue;
+      }
+
+    // Forward any resumes that are remaining here.
+    if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
+      Invoke.forwardResume(RI, InlinedLPads);
+  }
+
+  // Now that everything is happy, we have one final detail.  The PHI nodes in
+  // the exception destination block still have entries due to the original
+  // invoke instruction. Eliminate these entries (which might even delete the
+  // PHI node) now.
+  InvokeDest->removePredecessor(II->getParent());
+}
+
+/// UpdateCallGraphAfterInlining - Once we have cloned code over from a callee
+/// into the caller, update the specified callgraph to reflect the changes we
+/// made.  Note that it's possible that not all code was copied over, so only
+/// some edges of the callgraph may remain.
+static void UpdateCallGraphAfterInlining(CallSite CS,
+                                         Function::iterator FirstNewBlock,
+                                         ValueToValueMapTy &VMap,
+                                         InlineFunctionInfo &IFI) {
+  CallGraph &CG = *IFI.CG;
+  const Function *Caller = CS.getInstruction()->getParent()->getParent();
+  const Function *Callee = CS.getCalledFunction();
+  CallGraphNode *CalleeNode = CG[Callee];
+  CallGraphNode *CallerNode = CG[Caller];
+
+  // Since we inlined some uninlined call sites in the callee into the caller,
+  // add edges from the caller to all of the callees of the callee.
+  CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end();
+
+  // Consider the case where CalleeNode == CallerNode.
+  CallGraphNode::CalledFunctionsVector CallCache;
+  if (CalleeNode == CallerNode) {
+    CallCache.assign(I, E);
+    I = CallCache.begin();
+    E = CallCache.end();
+  }
+
+  for (; I != E; ++I) {
+    const Value *OrigCall = I->first;
+
+    ValueToValueMapTy::iterator VMI = VMap.find(OrigCall);
+    // Only copy the edge if the call was inlined!
+    if (VMI == VMap.end() || VMI->second == 0)
+      continue;
+    
+    // If the call was inlined, but then constant folded, there is no edge to
+    // add.  Check for this case.
+    Instruction *NewCall = dyn_cast<Instruction>(VMI->second);
+    if (NewCall == 0) continue;
+
+    // Remember that this call site got inlined for the client of
+    // InlineFunction.
+    IFI.InlinedCalls.push_back(NewCall);
+
+    // It's possible that inlining the callsite will cause it to go from an
+    // indirect to a direct call by resolving a function pointer.  If this
+    // happens, set the callee of the new call site to a more precise
+    // destination.  This can also happen if the call graph node of the caller
+    // was just unnecessarily imprecise.
+    if (I->second->getFunction() == 0)
+      if (Function *F = CallSite(NewCall).getCalledFunction()) {
+        // Indirect call site resolved to direct call.
+        CallerNode->addCalledFunction(CallSite(NewCall), CG[F]);
+
+        continue;
+      }
+
+    CallerNode->addCalledFunction(CallSite(NewCall), I->second);
+  }
+  
+  // Update the call graph by deleting the edge from Callee to Caller.  We must
+  // do this after the loop above in case Caller and Callee are the same.
+  CallerNode->removeCallEdgeFor(CS);
+}
+
+/// HandleByValArgument - When inlining a call site that has a byval argument,
+/// we have to make the implicit memcpy explicit by adding it.
+static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
+                                  const Function *CalledFunc,
+                                  InlineFunctionInfo &IFI,
+                                  unsigned ByValAlignment) {
+  Type *AggTy = cast<PointerType>(Arg->getType())->getElementType();
+
+  // If the called function is readonly, then it could not mutate the caller's
+  // copy of the byval'd memory.  In this case, it is safe to elide the copy and
+  // temporary.
+  if (CalledFunc->onlyReadsMemory()) {
+    // If the byval argument has a specified alignment that is greater than the
+    // passed in pointer, then we either have to round up the input pointer or
+    // give up on this transformation.
+    if (ByValAlignment <= 1)  // 0 = unspecified, 1 = no particular alignment.
+      return Arg;
+
+    // If the pointer is already known to be sufficiently aligned, or if we can
+    // round it up to a larger alignment, then we don't need a temporary.
+    if (getOrEnforceKnownAlignment(Arg, ByValAlignment,
+                                   IFI.TD) >= ByValAlignment)
+      return Arg;
+    
+    // Otherwise, we have to make a memcpy to get a safe alignment.  This is bad
+    // for code quality, but rarely happens and is required for correctness.
+  }
+  
+  LLVMContext &Context = Arg->getContext();
+
+  Type *VoidPtrTy = Type::getInt8PtrTy(Context);
+  
+  // Create the alloca.  If we have DataLayout, use nice alignment.
+  unsigned Align = 1;
+  if (IFI.TD)
+    Align = IFI.TD->getPrefTypeAlignment(AggTy);
+  
+  // If the byval had an alignment specified, we *must* use at least that
+  // alignment, as it is required by the byval argument (and uses of the
+  // pointer inside the callee).
+  Align = std::max(Align, ByValAlignment);
+  
+  Function *Caller = TheCall->getParent()->getParent(); 
+  
+  Value *NewAlloca = new AllocaInst(AggTy, 0, Align, Arg->getName(), 
+                                    &*Caller->begin()->begin());
+  // Emit a memcpy.
+  Type *Tys[3] = {VoidPtrTy, VoidPtrTy, Type::getInt64Ty(Context)};
+  Function *MemCpyFn = Intrinsic::getDeclaration(Caller->getParent(),
+                                                 Intrinsic::memcpy, 
+                                                 Tys);
+  Value *DestCast = new BitCastInst(NewAlloca, VoidPtrTy, "tmp", TheCall);
+  Value *SrcCast = new BitCastInst(Arg, VoidPtrTy, "tmp", TheCall);
+  
+  Value *Size;
+  if (IFI.TD == 0)
+    Size = ConstantExpr::getSizeOf(AggTy);
+  else
+    Size = ConstantInt::get(Type::getInt64Ty(Context),
+                            IFI.TD->getTypeStoreSize(AggTy));
+  
+  // Always generate a memcpy of alignment 1 here because we don't know
+  // the alignment of the src pointer.  Other optimizations can infer
+  // better alignment.
+  Value *CallArgs[] = {
+    DestCast, SrcCast, Size,
+    ConstantInt::get(Type::getInt32Ty(Context), 1),
+    ConstantInt::getFalse(Context) // isVolatile
+  };
+  IRBuilder<>(TheCall).CreateCall(MemCpyFn, CallArgs);
+  
+  // Uses of the argument in the function should use our new alloca
+  // instead.
+  return NewAlloca;
+}
+
+// isUsedByLifetimeMarker - Check whether this Value is used by a lifetime
+// intrinsic.
+static bool isUsedByLifetimeMarker(Value *V) {
+  for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE;
+       ++UI) {
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(*UI)) {
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::lifetime_start:
+      case Intrinsic::lifetime_end:
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// hasLifetimeMarkers - Check whether the given alloca already has
+// lifetime.start or lifetime.end intrinsics.
+static bool hasLifetimeMarkers(AllocaInst *AI) {
+  Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext());
+  if (AI->getType() == Int8PtrTy)
+    return isUsedByLifetimeMarker(AI);
+
+  // Do a scan to find all the casts to i8*.
+  for (Value::use_iterator I = AI->use_begin(), E = AI->use_end(); I != E;
+       ++I) {
+    if (I->getType() != Int8PtrTy) continue;
+    if (I->stripPointerCasts() != AI) continue;
+    if (isUsedByLifetimeMarker(*I))
+      return true;
+  }
+  return false;
+}
+
+/// updateInlinedAtInfo - Helper function used by fixupLineNumbers to
+/// recursively update InlinedAtEntry of a DebugLoc.
+static DebugLoc updateInlinedAtInfo(const DebugLoc &DL, 
+                                    const DebugLoc &InlinedAtDL,
+                                    LLVMContext &Ctx) {
+  if (MDNode *IA = DL.getInlinedAt(Ctx)) {
+    DebugLoc NewInlinedAtDL 
+      = updateInlinedAtInfo(DebugLoc::getFromDILocation(IA), InlinedAtDL, Ctx);
+    return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx),
+                         NewInlinedAtDL.getAsMDNode(Ctx));
+  }
+
+  return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(Ctx),
+                       InlinedAtDL.getAsMDNode(Ctx));
+}
+
+/// fixupLineNumbers - Update inlined instructions' line numbers to 
+/// to encode location where these instructions are inlined.
+static void fixupLineNumbers(Function *Fn, Function::iterator FI,
+                             Instruction *TheCall) {
+  DebugLoc TheCallDL = TheCall->getDebugLoc();
+  if (TheCallDL.isUnknown())
+    return;
+
+  for (; FI != Fn->end(); ++FI) {
+    for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
+         BI != BE; ++BI) {
+      DebugLoc DL = BI->getDebugLoc();
+      if (!DL.isUnknown()) {
+        BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext()));
+        if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) {
+          LLVMContext &Ctx = BI->getContext();
+          MDNode *InlinedAt = BI->getDebugLoc().getInlinedAt(Ctx);
+          DVI->setOperand(2, createInlinedVariable(DVI->getVariable(), 
+                                                   InlinedAt, Ctx));
+        }
+      }
+    }
+  }
+}
+
+/// InlineFunction - This function inlines the called function into the basic
+/// block of the caller.  This returns false if it is not possible to inline
+/// this call.  The program is still in a well defined state if this occurs
+/// though.
+///
+/// Note that this only does one level of inlining.  For example, if the
+/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
+/// exists in the instruction stream.  Similarly this will inline a recursive
+/// function by one level.
+bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
+                          bool InsertLifetime) {
+  Instruction *TheCall = CS.getInstruction();
+  assert(TheCall->getParent() && TheCall->getParent()->getParent() &&
+         "Instruction not in function!");
+
+  // If IFI has any state in it, zap it before we fill it in.
+  IFI.reset();
+  
+  const Function *CalledFunc = CS.getCalledFunction();
+  if (CalledFunc == 0 ||          // Can't inline external function or indirect
+      CalledFunc->isDeclaration() || // call, or call to a vararg function!
+      CalledFunc->getFunctionType()->isVarArg()) return false;
+
+  // If the call to the callee is not a tail call, we must clear the 'tail'
+  // flags on any calls that we inline.
+  bool MustClearTailCallFlags =
+    !(isa<CallInst>(TheCall) && cast<CallInst>(TheCall)->isTailCall());
+
+  // If the call to the callee cannot throw, set the 'nounwind' flag on any
+  // calls that we inline.
+  bool MarkNoUnwind = CS.doesNotThrow();
+
+  BasicBlock *OrigBB = TheCall->getParent();
+  Function *Caller = OrigBB->getParent();
+
+  // GC poses two hazards to inlining, which only occur when the callee has GC:
+  //  1. If the caller has no GC, then the callee's GC must be propagated to the
+  //     caller.
+  //  2. If the caller has a differing GC, it is invalid to inline.
+  if (CalledFunc->hasGC()) {
+    if (!Caller->hasGC())
+      Caller->setGC(CalledFunc->getGC());
+    else if (CalledFunc->getGC() != Caller->getGC())
+      return false;
+  }
+
+  // Get the personality function from the callee if it contains a landing pad.
+  Value *CalleePersonality = 0;
+  for (Function::const_iterator I = CalledFunc->begin(), E = CalledFunc->end();
+       I != E; ++I)
+    if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) {
+      const BasicBlock *BB = II->getUnwindDest();
+      const LandingPadInst *LP = BB->getLandingPadInst();
+      CalleePersonality = LP->getPersonalityFn();
+      break;
+    }
+
+  // Find the personality function used by the landing pads of the caller. If it
+  // exists, then check to see that it matches the personality function used in
+  // the callee.
+  if (CalleePersonality) {
+    for (Function::const_iterator I = Caller->begin(), E = Caller->end();
+         I != E; ++I)
+      if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) {
+        const BasicBlock *BB = II->getUnwindDest();
+        const LandingPadInst *LP = BB->getLandingPadInst();
+
+        // If the personality functions match, then we can perform the
+        // inlining. Otherwise, we can't inline.
+        // TODO: This isn't 100% true. Some personality functions are proper
+        //       supersets of others and can be used in place of the other.
+        if (LP->getPersonalityFn() != CalleePersonality)
+          return false;
+
+        break;
+      }
+  }
+
+  // Get an iterator to the last basic block in the function, which will have
+  // the new function inlined after it.
+  Function::iterator LastBlock = &Caller->back();
+
+  // Make sure to capture all of the return instructions from the cloned
+  // function.
+  SmallVector<ReturnInst*, 8> Returns;
+  ClonedCodeInfo InlinedFunctionInfo;
+  Function::iterator FirstNewBlock;
+
+  { // Scope to destroy VMap after cloning.
+    ValueToValueMapTy VMap;
+
+    assert(CalledFunc->arg_size() == CS.arg_size() &&
+           "No varargs calls can be inlined!");
+
+    // Calculate the vector of arguments to pass into the function cloner, which
+    // matches up the formal to the actual argument values.
+    CallSite::arg_iterator AI = CS.arg_begin();
+    unsigned ArgNo = 0;
+    for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
+         E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
+      Value *ActualArg = *AI;
+
+      // When byval arguments actually inlined, we need to make the copy implied
+      // by them explicit.  However, we don't do this if the callee is readonly
+      // or readnone, because the copy would be unneeded: the callee doesn't
+      // modify the struct.
+      if (CS.isByValArgument(ArgNo)) {
+        ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI,
+                                        CalledFunc->getParamAlignment(ArgNo+1));
+ 
+        // Calls that we inline may use the new alloca, so we need to clear
+        // their 'tail' flags if HandleByValArgument introduced a new alloca and
+        // the callee has calls.
+        MustClearTailCallFlags |= ActualArg != *AI;
+      }
+
+      VMap[I] = ActualArg;
+    }
+
+    // We want the inliner to prune the code as it copies.  We would LOVE to
+    // have no dead or constant instructions leftover after inlining occurs
+    // (which can happen, e.g., because an argument was constant), but we'll be
+    // happy with whatever the cloner can do.
+    CloneAndPruneFunctionInto(Caller, CalledFunc, VMap, 
+                              /*ModuleLevelChanges=*/false, Returns, ".i",
+                              &InlinedFunctionInfo, IFI.TD, TheCall);
+
+    // Remember the first block that is newly cloned over.
+    FirstNewBlock = LastBlock; ++FirstNewBlock;
+
+    // Update the callgraph if requested.
+    if (IFI.CG)
+      UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI);
+
+    // Update inlined instructions' line number information.
+    fixupLineNumbers(Caller, FirstNewBlock, TheCall);
+  }
+
+  // If there are any alloca instructions in the block that used to be the entry
+  // block for the callee, move them to the entry block of the caller.  First
+  // calculate which instruction they should be inserted before.  We insert the
+  // instructions at the end of the current alloca list.
+  {
+    BasicBlock::iterator InsertPoint = Caller->begin()->begin();
+    for (BasicBlock::iterator I = FirstNewBlock->begin(),
+         E = FirstNewBlock->end(); I != E; ) {
+      AllocaInst *AI = dyn_cast<AllocaInst>(I++);
+      if (AI == 0) continue;
+      
+      // If the alloca is now dead, remove it.  This often occurs due to code
+      // specialization.
+      if (AI->use_empty()) {
+        AI->eraseFromParent();
+        continue;
+      }
+
+      if (!isa<Constant>(AI->getArraySize()))
+        continue;
+      
+      // Keep track of the static allocas that we inline into the caller.
+      IFI.StaticAllocas.push_back(AI);
+      
+      // Scan for the block of allocas that we can move over, and move them
+      // all at once.
+      while (isa<AllocaInst>(I) &&
+             isa<Constant>(cast<AllocaInst>(I)->getArraySize())) {
+        IFI.StaticAllocas.push_back(cast<AllocaInst>(I));
+        ++I;
+      }
+
+      // Transfer all of the allocas over in a block.  Using splice means
+      // that the instructions aren't removed from the symbol table, then
+      // reinserted.
+      Caller->getEntryBlock().getInstList().splice(InsertPoint,
+                                                   FirstNewBlock->getInstList(),
+                                                   AI, I);
+    }
+  }
+
+  // Leave lifetime markers for the static alloca's, scoping them to the
+  // function we just inlined.
+  if (InsertLifetime && !IFI.StaticAllocas.empty()) {
+    IRBuilder<> builder(FirstNewBlock->begin());
+    for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {
+      AllocaInst *AI = IFI.StaticAllocas[ai];
+
+      // If the alloca is already scoped to something smaller than the whole
+      // function then there's no need to add redundant, less accurate markers.
+      if (hasLifetimeMarkers(AI))
+        continue;
+
+      // Try to determine the size of the allocation.
+      ConstantInt *AllocaSize = 0;
+      if (ConstantInt *AIArraySize =
+          dyn_cast<ConstantInt>(AI->getArraySize())) {
+        if (IFI.TD) {
+          Type *AllocaType = AI->getAllocatedType();
+          uint64_t AllocaTypeSize = IFI.TD->getTypeAllocSize(AllocaType);
+          uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
+          assert(AllocaArraySize > 0 && "array size of AllocaInst is zero");
+          // Check that array size doesn't saturate uint64_t and doesn't
+          // overflow when it's multiplied by type size.
+          if (AllocaArraySize != ~0ULL &&
+              UINT64_MAX / AllocaArraySize >= AllocaTypeSize) {
+            AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
+                                          AllocaArraySize * AllocaTypeSize);
+          }
+        }
+      }
+
+      builder.CreateLifetimeStart(AI, AllocaSize);
+      for (unsigned ri = 0, re = Returns.size(); ri != re; ++ri) {
+        IRBuilder<> builder(Returns[ri]);
+        builder.CreateLifetimeEnd(AI, AllocaSize);
+      }
+    }
+  }
+
+  // If the inlined code contained dynamic alloca instructions, wrap the inlined
+  // code with llvm.stacksave/llvm.stackrestore intrinsics.
+  if (InlinedFunctionInfo.ContainsDynamicAllocas) {
+    Module *M = Caller->getParent();
+    // Get the two intrinsics we care about.
+    Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
+    Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);
+
+    // Insert the llvm.stacksave.
+    CallInst *SavedPtr = IRBuilder<>(FirstNewBlock, FirstNewBlock->begin())
+      .CreateCall(StackSave, "savedstack");
+
+    // Insert a call to llvm.stackrestore before any return instructions in the
+    // inlined function.
+    for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+      IRBuilder<>(Returns[i]).CreateCall(StackRestore, SavedPtr);
+    }
+  }
+
+  // If we are inlining tail call instruction through a call site that isn't
+  // marked 'tail', we must remove the tail marker for any calls in the inlined
+  // code.  Also, calls inlined through a 'nounwind' call site should be marked
+  // 'nounwind'.
+  if (InlinedFunctionInfo.ContainsCalls &&
+      (MustClearTailCallFlags || MarkNoUnwind)) {
+    for (Function::iterator BB = FirstNewBlock, E = Caller->end();
+         BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        if (CallInst *CI = dyn_cast<CallInst>(I)) {
+          if (MustClearTailCallFlags)
+            CI->setTailCall(false);
+          if (MarkNoUnwind)
+            CI->setDoesNotThrow();
+        }
+  }
+
+  // If we are inlining for an invoke instruction, we must make sure to rewrite
+  // any call instructions into invoke instructions.
+  if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall))
+    HandleInlinedInvoke(II, FirstNewBlock, InlinedFunctionInfo);
+
+  // If we cloned in _exactly one_ basic block, and if that block ends in a
+  // return instruction, we splice the body of the inlined callee directly into
+  // the calling basic block.
+  if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
+    // Move all of the instructions right before the call.
+    OrigBB->getInstList().splice(TheCall, FirstNewBlock->getInstList(),
+                                 FirstNewBlock->begin(), FirstNewBlock->end());
+    // Remove the cloned basic block.
+    Caller->getBasicBlockList().pop_back();
+
+    // If the call site was an invoke instruction, add a branch to the normal
+    // destination.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall))
+      BranchInst::Create(II->getNormalDest(), TheCall);
+
+    // If the return instruction returned a value, replace uses of the call with
+    // uses of the returned value.
+    if (!TheCall->use_empty()) {
+      ReturnInst *R = Returns[0];
+      if (TheCall == R->getReturnValue())
+        TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+      else
+        TheCall->replaceAllUsesWith(R->getReturnValue());
+    }
+    // Since we are now done with the Call/Invoke, we can delete it.
+    TheCall->eraseFromParent();
+
+    // Since we are now done with the return instruction, delete it also.
+    Returns[0]->eraseFromParent();
+
+    // We are now done with the inlining.
+    return true;
+  }
+
+  // Otherwise, we have the normal case, of more than one block to inline or
+  // multiple return sites.
+
+  // We want to clone the entire callee function into the hole between the
+  // "starter" and "ender" blocks.  How we accomplish this depends on whether
+  // this is an invoke instruction or a call instruction.
+  BasicBlock *AfterCallBB;
+  if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+
+    // Add an unconditional branch to make this look like the CallInst case...
+    BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall);
+
+    // Split the basic block.  This guarantees that no PHI nodes will have to be
+    // updated due to new incoming edges, and make the invoke case more
+    // symmetric to the call case.
+    AfterCallBB = OrigBB->splitBasicBlock(NewBr,
+                                          CalledFunc->getName()+".exit");
+
+  } else {  // It's a call
+    // If this is a call instruction, we need to split the basic block that
+    // the call lives in.
+    //
+    AfterCallBB = OrigBB->splitBasicBlock(TheCall,
+                                          CalledFunc->getName()+".exit");
+  }
+
+  // Change the branch that used to go to AfterCallBB to branch to the first
+  // basic block of the inlined function.
+  //
+  TerminatorInst *Br = OrigBB->getTerminator();
+  assert(Br && Br->getOpcode() == Instruction::Br &&
+         "splitBasicBlock broken!");
+  Br->setOperand(0, FirstNewBlock);
+
+
+  // Now that the function is correct, make it a little bit nicer.  In
+  // particular, move the basic blocks inserted from the end of the function
+  // into the space made by splitting the source basic block.
+  Caller->getBasicBlockList().splice(AfterCallBB, Caller->getBasicBlockList(),
+                                     FirstNewBlock, Caller->end());
+
+  // Handle all of the return instructions that we just cloned in, and eliminate
+  // any users of the original call/invoke instruction.
+  Type *RTy = CalledFunc->getReturnType();
+
+  PHINode *PHI = 0;
+  if (Returns.size() > 1) {
+    // The PHI node should go at the front of the new basic block to merge all
+    // possible incoming values.
+    if (!TheCall->use_empty()) {
+      PHI = PHINode::Create(RTy, Returns.size(), TheCall->getName(),
+                            AfterCallBB->begin());
+      // Anything that used the result of the function call should now use the
+      // PHI node as their operand.
+      TheCall->replaceAllUsesWith(PHI);
+    }
+
+    // Loop over all of the return instructions adding entries to the PHI node
+    // as appropriate.
+    if (PHI) {
+      for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+        ReturnInst *RI = Returns[i];
+        assert(RI->getReturnValue()->getType() == PHI->getType() &&
+               "Ret value not consistent in function!");
+        PHI->addIncoming(RI->getReturnValue(), RI->getParent());
+      }
+    }
+
+
+    // Add a branch to the merge points and remove return instructions.
+    for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+      ReturnInst *RI = Returns[i];
+      BranchInst::Create(AfterCallBB, RI);
+      RI->eraseFromParent();
+    }
+  } else if (!Returns.empty()) {
+    // Otherwise, if there is exactly one return value, just replace anything
+    // using the return value of the call with the computed value.
+    if (!TheCall->use_empty()) {
+      if (TheCall == Returns[0]->getReturnValue())
+        TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+      else
+        TheCall->replaceAllUsesWith(Returns[0]->getReturnValue());
+    }
+
+    // Update PHI nodes that use the ReturnBB to use the AfterCallBB.
+    BasicBlock *ReturnBB = Returns[0]->getParent();
+    ReturnBB->replaceAllUsesWith(AfterCallBB);
+
+    // Splice the code from the return block into the block that it will return
+    // to, which contains the code that was after the call.
+    AfterCallBB->getInstList().splice(AfterCallBB->begin(),
+                                      ReturnBB->getInstList());
+
+    // Delete the return instruction now and empty ReturnBB now.
+    Returns[0]->eraseFromParent();
+    ReturnBB->eraseFromParent();
+  } else if (!TheCall->use_empty()) {
+    // No returns, but something is using the return value of the call.  Just
+    // nuke the result.
+    TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+  }
+
+  // Since we are now done with the Call/Invoke, we can delete it.
+  TheCall->eraseFromParent();
+
+  // We should always be able to fold the entry block of the function into the
+  // single predecessor of the block...
+  assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
+  BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
+
+  // Splice the code entry block into calling block, right before the
+  // unconditional branch.
+  CalleeEntry->replaceAllUsesWith(OrigBB);  // Update PHI nodes
+  OrigBB->getInstList().splice(Br, CalleeEntry->getInstList());
+
+  // Remove the unconditional branch.
+  OrigBB->getInstList().erase(Br);
+
+  // Now we can remove the CalleeEntry block, which is now empty.
+  Caller->getBasicBlockList().erase(CalleeEntry);
+
+  // If we inserted a phi node, check to see if it has a single value (e.g. all
+  // the entries are the same or undef).  If so, remove the PHI so it doesn't
+  // block other optimizations.
+  if (PHI) {
+    if (Value *V = SimplifyInstruction(PHI, IFI.TD)) {
+      PHI->replaceAllUsesWith(V);
+      PHI->eraseFromParent();
+    }
+  }
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
new file mode 100644
index 000000000000..a020bc7398f5
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
@@ -0,0 +1,64 @@
+//===- InstructionNamer.cpp - Give anonymous instructions names -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that gives instructions names, this is mostly
+// useful when diffing the effect of an optimization because deleting an
+// unnamed instruction can change all other instruction numbering, making the
+// diff very noisy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+namespace {
+  struct InstNamer : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    InstNamer() : FunctionPass(ID) {
+      initializeInstNamerPass(*PassRegistry::getPassRegistry());
+    }
+    
+    void getAnalysisUsage(AnalysisUsage &Info) const {
+      Info.setPreservesAll();
+    }
+
+    bool runOnFunction(Function &F) {
+      for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
+           AI != AE; ++AI)
+        if (!AI->hasName() && !AI->getType()->isVoidTy())
+          AI->setName("arg");
+
+      for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+        if (!BB->hasName())
+          BB->setName("bb");
+        
+        for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+          if (!I->hasName() && !I->getType()->isVoidTy())
+            I->setName("tmp");
+      }
+      return true;
+    }
+  };
+  
+  char InstNamer::ID = 0;
+}
+
+INITIALIZE_PASS(InstNamer, "instnamer", 
+                "Assign names to anonymous instructions", false, false)
+char &llvm::InstructionNamerID = InstNamer::ID;
+//===----------------------------------------------------------------------===//
+//
+// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
+//
+FunctionPass *llvm::createInstructionNamerPass() {
+  return new InstNamer();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
new file mode 100644
index 000000000000..3cb8ded8506a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/IntegerDivision.cpp
@@ -0,0 +1,524 @@
+//===-- IntegerDivision.cpp - Expand integer division ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of 32bit scalar integer division for
+// targets that don't have native support. It's largely derived from
+// compiler-rt's implementation of __udivsi3, but hand-tuned to reduce the
+// amount of control flow
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "integer-division"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+
+using namespace llvm;
+
+/// Generate code to compute the remainder of two signed integers. Returns the
+/// remainder, which will have the sign of the dividend. Builder's insert point
+/// should be pointing where the caller wants code generated, e.g. at the srem
+/// instruction. This will generate a urem in the process, and Builder's insert
+/// point will be pointing at the uren (if present, i.e. not folded), ready to
+/// be expanded if the user wishes
+static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor,
+                                          IRBuilder<> &Builder) {
+  ConstantInt *ThirtyOne = Builder.getInt32(31);
+
+  // ;   %dividend_sgn = ashr i32 %dividend, 31
+  // ;   %divisor_sgn  = ashr i32 %divisor, 31
+  // ;   %dvd_xor      = xor i32 %dividend, %dividend_sgn
+  // ;   %dvs_xor      = xor i32 %divisor, %divisor_sgn
+  // ;   %u_dividend   = sub i32 %dvd_xor, %dividend_sgn
+  // ;   %u_divisor    = sub i32 %dvs_xor, %divisor_sgn
+  // ;   %urem         = urem i32 %dividend, %divisor
+  // ;   %xored        = xor i32 %urem, %dividend_sgn
+  // ;   %srem         = sub i32 %xored, %dividend_sgn
+  Value *DividendSign = Builder.CreateAShr(Dividend, ThirtyOne);
+  Value *DivisorSign  = Builder.CreateAShr(Divisor, ThirtyOne);
+  Value *DvdXor       = Builder.CreateXor(Dividend, DividendSign);
+  Value *DvsXor       = Builder.CreateXor(Divisor, DivisorSign);
+  Value *UDividend    = Builder.CreateSub(DvdXor, DividendSign);
+  Value *UDivisor     = Builder.CreateSub(DvsXor, DivisorSign);
+  Value *URem         = Builder.CreateURem(UDividend, UDivisor);
+  Value *Xored        = Builder.CreateXor(URem, DividendSign);
+  Value *SRem         = Builder.CreateSub(Xored, DividendSign);
+
+  if (Instruction *URemInst = dyn_cast<Instruction>(URem))
+    Builder.SetInsertPoint(URemInst);
+
+  return SRem;
+}
+
+
+/// Generate code to compute the remainder of two unsigned integers. Returns the
+/// remainder. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the urem instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes
+static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor,
+                                             IRBuilder<> &Builder) {
+  // Remainder = Dividend - Quotient*Divisor
+
+  // ;   %quotient  = udiv i32 %dividend, %divisor
+  // ;   %product   = mul i32 %divisor, %quotient
+  // ;   %remainder = sub i32 %dividend, %product
+  Value *Quotient  = Builder.CreateUDiv(Dividend, Divisor);
+  Value *Product   = Builder.CreateMul(Divisor, Quotient);
+  Value *Remainder = Builder.CreateSub(Dividend, Product);
+
+  if (Instruction *UDiv = dyn_cast<Instruction>(Quotient))
+    Builder.SetInsertPoint(UDiv);
+
+  return Remainder;
+}
+
+/// Generate code to divide two signed integers. Returns the quotient, rounded
+/// towards 0. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the sdiv instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes.
+static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor,
+                                         IRBuilder<> &Builder) {
+  // Implementation taken from compiler-rt's __divsi3
+
+  ConstantInt *ThirtyOne = Builder.getInt32(31);
+
+  // ;   %tmp    = ashr i32 %dividend, 31
+  // ;   %tmp1   = ashr i32 %divisor, 31
+  // ;   %tmp2   = xor i32 %tmp, %dividend
+  // ;   %u_dvnd = sub nsw i32 %tmp2, %tmp
+  // ;   %tmp3   = xor i32 %tmp1, %divisor
+  // ;   %u_dvsr = sub nsw i32 %tmp3, %tmp1
+  // ;   %q_sgn  = xor i32 %tmp1, %tmp
+  // ;   %q_mag  = udiv i32 %u_dvnd, %u_dvsr
+  // ;   %tmp4   = xor i32 %q_mag, %q_sgn
+  // ;   %q      = sub i32 %tmp4, %q_sgn
+  Value *Tmp    = Builder.CreateAShr(Dividend, ThirtyOne);
+  Value *Tmp1   = Builder.CreateAShr(Divisor, ThirtyOne);
+  Value *Tmp2   = Builder.CreateXor(Tmp, Dividend);
+  Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp);
+  Value *Tmp3   = Builder.CreateXor(Tmp1, Divisor);
+  Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1);
+  Value *Q_Sgn  = Builder.CreateXor(Tmp1, Tmp);
+  Value *Q_Mag  = Builder.CreateUDiv(U_Dvnd, U_Dvsr);
+  Value *Tmp4   = Builder.CreateXor(Q_Mag, Q_Sgn);
+  Value *Q      = Builder.CreateSub(Tmp4, Q_Sgn);
+
+  if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag))
+    Builder.SetInsertPoint(UDiv);
+
+  return Q;
+}
+
+/// Generates code to divide two unsigned scalar 32-bit integers. Returns the
+/// quotient, rounded towards 0. Builder's insert point should be pointing where
+/// the caller wants code generated, e.g. at the udiv instruction.
+static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
+                                           IRBuilder<> &Builder) {
+  // The basic algorithm can be found in the compiler-rt project's
+  // implementation of __udivsi3.c. Here, we do a lower-level IR based approach
+  // that's been hand-tuned to lessen the amount of control flow involved.
+
+  // Some helper values
+  IntegerType *I32Ty = Builder.getInt32Ty();
+
+  ConstantInt *Zero      = Builder.getInt32(0);
+  ConstantInt *One       = Builder.getInt32(1);
+  ConstantInt *ThirtyOne = Builder.getInt32(31);
+  ConstantInt *NegOne    = ConstantInt::getSigned(I32Ty, -1);
+  ConstantInt *True      = Builder.getTrue();
+
+  BasicBlock *IBB = Builder.GetInsertBlock();
+  Function *F = IBB->getParent();
+  Function *CTLZi32 = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+                                                I32Ty);
+
+  // Our CFG is going to look like:
+  // +---------------------+
+  // | special-cases       |
+  // |   ...               |
+  // +---------------------+
+  //  |       |
+  //  |   +----------+
+  //  |   |  bb1     |
+  //  |   |  ...     |
+  //  |   +----------+
+  //  |    |      |
+  //  |    |  +------------+
+  //  |    |  |  preheader |
+  //  |    |  |  ...       |
+  //  |    |  +------------+
+  //  |    |      |
+  //  |    |      |      +---+
+  //  |    |      |      |   |
+  //  |    |  +------------+ |
+  //  |    |  |  do-while  | |
+  //  |    |  |  ...       | |
+  //  |    |  +------------+ |
+  //  |    |      |      |   |
+  //  |   +-----------+  +---+
+  //  |   | loop-exit |
+  //  |   |  ...      |
+  //  |   +-----------+
+  //  |     |
+  // +-------+
+  // | ...   |
+  // | end   |
+  // +-------+
+  BasicBlock *SpecialCases = Builder.GetInsertBlock();
+  SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases"));
+  BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(),
+                                                  "udiv-end");
+  BasicBlock *LoopExit  = BasicBlock::Create(Builder.getContext(),
+                                             "udiv-loop-exit", F, End);
+  BasicBlock *DoWhile   = BasicBlock::Create(Builder.getContext(),
+                                             "udiv-do-while", F, End);
+  BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(),
+                                             "udiv-preheader", F, End);
+  BasicBlock *BB1       = BasicBlock::Create(Builder.getContext(),
+                                             "udiv-bb1", F, End);
+
+  // We'll be overwriting the terminator to insert our extra blocks
+  SpecialCases->getTerminator()->eraseFromParent();
+
+  // First off, check for special cases: dividend or divisor is zero, divisor
+  // is greater than dividend, and divisor is 1.
+  // ; special-cases:
+  // ;   %ret0_1      = icmp eq i32 %divisor, 0
+  // ;   %ret0_2      = icmp eq i32 %dividend, 0
+  // ;   %ret0_3      = or i1 %ret0_1, %ret0_2
+  // ;   %tmp0        = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true)
+  // ;   %tmp1        = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true)
+  // ;   %sr          = sub nsw i32 %tmp0, %tmp1
+  // ;   %ret0_4      = icmp ugt i32 %sr, 31
+  // ;   %ret0        = or i1 %ret0_3, %ret0_4
+  // ;   %retDividend = icmp eq i32 %sr, 31
+  // ;   %retVal      = select i1 %ret0, i32 0, i32 %dividend
+  // ;   %earlyRet    = or i1 %ret0, %retDividend
+  // ;   br i1 %earlyRet, label %end, label %bb1
+  Builder.SetInsertPoint(SpecialCases);
+  Value *Ret0_1      = Builder.CreateICmpEQ(Divisor, Zero);
+  Value *Ret0_2      = Builder.CreateICmpEQ(Dividend, Zero);
+  Value *Ret0_3      = Builder.CreateOr(Ret0_1, Ret0_2);
+  Value *Tmp0        = Builder.CreateCall2(CTLZi32, Divisor, True);
+  Value *Tmp1        = Builder.CreateCall2(CTLZi32, Dividend, True);
+  Value *SR          = Builder.CreateSub(Tmp0, Tmp1);
+  Value *Ret0_4      = Builder.CreateICmpUGT(SR, ThirtyOne);
+  Value *Ret0        = Builder.CreateOr(Ret0_3, Ret0_4);
+  Value *RetDividend = Builder.CreateICmpEQ(SR, ThirtyOne);
+  Value *RetVal      = Builder.CreateSelect(Ret0, Zero, Dividend);
+  Value *EarlyRet    = Builder.CreateOr(Ret0, RetDividend);
+  Builder.CreateCondBr(EarlyRet, End, BB1);
+
+  // ; bb1:                                             ; preds = %special-cases
+  // ;   %sr_1     = add i32 %sr, 1
+  // ;   %tmp2     = sub i32 31, %sr
+  // ;   %q        = shl i32 %dividend, %tmp2
+  // ;   %skipLoop = icmp eq i32 %sr_1, 0
+  // ;   br i1 %skipLoop, label %loop-exit, label %preheader
+  Builder.SetInsertPoint(BB1);
+  Value *SR_1     = Builder.CreateAdd(SR, One);
+  Value *Tmp2     = Builder.CreateSub(ThirtyOne, SR);
+  Value *Q        = Builder.CreateShl(Dividend, Tmp2);
+  Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
+  Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
+
+  // ; preheader:                                           ; preds = %bb1
+  // ;   %tmp3 = lshr i32 %dividend, %sr_1
+  // ;   %tmp4 = add i32 %divisor, -1
+  // ;   br label %do-while
+  Builder.SetInsertPoint(Preheader);
+  Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1);
+  Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne);
+  Builder.CreateBr(DoWhile);
+
+  // ; do-while:                                 ; preds = %do-while, %preheader
+  // ;   %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+  // ;   %sr_3    = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+  // ;   %r_1     = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+  // ;   %q_2     = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+  // ;   %tmp5  = shl i32 %r_1, 1
+  // ;   %tmp6  = lshr i32 %q_2, 31
+  // ;   %tmp7  = or i32 %tmp5, %tmp6
+  // ;   %tmp8  = shl i32 %q_2, 1
+  // ;   %q_1   = or i32 %carry_1, %tmp8
+  // ;   %tmp9  = sub i32 %tmp4, %tmp7
+  // ;   %tmp10 = ashr i32 %tmp9, 31
+  // ;   %carry = and i32 %tmp10, 1
+  // ;   %tmp11 = and i32 %tmp10, %divisor
+  // ;   %r     = sub i32 %tmp7, %tmp11
+  // ;   %sr_2  = add i32 %sr_3, -1
+  // ;   %tmp12 = icmp eq i32 %sr_2, 0
+  // ;   br i1 %tmp12, label %loop-exit, label %do-while
+  Builder.SetInsertPoint(DoWhile);
+  PHINode *Carry_1 = Builder.CreatePHI(I32Ty, 2);
+  PHINode *SR_3    = Builder.CreatePHI(I32Ty, 2);
+  PHINode *R_1     = Builder.CreatePHI(I32Ty, 2);
+  PHINode *Q_2     = Builder.CreatePHI(I32Ty, 2);
+  Value *Tmp5  = Builder.CreateShl(R_1, One);
+  Value *Tmp6  = Builder.CreateLShr(Q_2, ThirtyOne);
+  Value *Tmp7  = Builder.CreateOr(Tmp5, Tmp6);
+  Value *Tmp8  = Builder.CreateShl(Q_2, One);
+  Value *Q_1   = Builder.CreateOr(Carry_1, Tmp8);
+  Value *Tmp9  = Builder.CreateSub(Tmp4, Tmp7);
+  Value *Tmp10 = Builder.CreateAShr(Tmp9, 31);
+  Value *Carry = Builder.CreateAnd(Tmp10, One);
+  Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor);
+  Value *R     = Builder.CreateSub(Tmp7, Tmp11);
+  Value *SR_2  = Builder.CreateAdd(SR_3, NegOne);
+  Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
+  Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
+
+  // ; loop-exit:                                      ; preds = %do-while, %bb1
+  // ;   %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+  // ;   %q_3     = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+  // ;   %tmp13 = shl i32 %q_3, 1
+  // ;   %q_4   = or i32 %carry_2, %tmp13
+  // ;   br label %end
+  Builder.SetInsertPoint(LoopExit);
+  PHINode *Carry_2 = Builder.CreatePHI(I32Ty, 2);
+  PHINode *Q_3     = Builder.CreatePHI(I32Ty, 2);
+  Value *Tmp13 = Builder.CreateShl(Q_3, One);
+  Value *Q_4   = Builder.CreateOr(Carry_2, Tmp13);
+  Builder.CreateBr(End);
+
+  // ; end:                                 ; preds = %loop-exit, %special-cases
+  // ;   %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+  // ;   ret i32 %q_5
+  Builder.SetInsertPoint(End, End->begin());
+  PHINode *Q_5 = Builder.CreatePHI(I32Ty, 2);
+
+  // Populate the Phis, since all values have now been created. Our Phis were:
+  // ;   %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+  Carry_1->addIncoming(Zero, Preheader);
+  Carry_1->addIncoming(Carry, DoWhile);
+  // ;   %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+  SR_3->addIncoming(SR_1, Preheader);
+  SR_3->addIncoming(SR_2, DoWhile);
+  // ;   %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+  R_1->addIncoming(Tmp3, Preheader);
+  R_1->addIncoming(R, DoWhile);
+  // ;   %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+  Q_2->addIncoming(Q, Preheader);
+  Q_2->addIncoming(Q_1, DoWhile);
+  // ;   %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+  Carry_2->addIncoming(Zero, BB1);
+  Carry_2->addIncoming(Carry, DoWhile);
+  // ;   %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+  Q_3->addIncoming(Q, BB1);
+  Q_3->addIncoming(Q_1, DoWhile);
+  // ;   %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+  Q_5->addIncoming(Q_4, LoopExit);
+  Q_5->addIncoming(RetVal, SpecialCases);
+
+  return Q_5;
+}
+
+/// Generate code to calculate the remainder of two integers, replacing Rem with
+/// the generated code. This currently generates code using the udiv expansion,
+/// but future work includes generating more specialized code, e.g. when more
+/// information about the operands are known. Currently only implements 32bit
+/// scalar division (due to udiv's limitation), but future work is removing this
+/// limitation.
+///
+/// @brief Replace Rem with generated code.
+bool llvm::expandRemainder(BinaryOperator *Rem) {
+  assert((Rem->getOpcode() == Instruction::SRem ||
+          Rem->getOpcode() == Instruction::URem) &&
+         "Trying to expand remainder from a non-remainder function");
+
+  IRBuilder<> Builder(Rem);
+
+  // First prepare the sign if it's a signed remainder
+  if (Rem->getOpcode() == Instruction::SRem) {
+    Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0),
+                                                   Rem->getOperand(1), Builder);
+
+    Rem->replaceAllUsesWith(Remainder);
+    Rem->dropAllReferences();
+    Rem->eraseFromParent();
+
+    // If we didn't actually generate a udiv instruction, we're done
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+    if (!BO || BO->getOpcode() != Instruction::URem)
+      return true;
+
+    Rem = BO;
+  }
+
+  Value *Remainder = generatedUnsignedRemainderCode(Rem->getOperand(0),
+                                                    Rem->getOperand(1),
+                                                    Builder);
+
+  Rem->replaceAllUsesWith(Remainder);
+  Rem->dropAllReferences();
+  Rem->eraseFromParent();
+
+  // Expand the udiv
+  if (BinaryOperator *UDiv = dyn_cast<BinaryOperator>(Builder.GetInsertPoint())) {
+    assert(UDiv->getOpcode() == Instruction::UDiv && "Non-udiv in expansion?");
+    expandDivision(UDiv);
+  }
+
+  return true;
+}
+
+
+/// Generate code to divide two integers, replacing Div with the generated
+/// code. This currently generates code similarly to compiler-rt's
+/// implementations, but future work includes generating more specialized code
+/// when more information about the operands are known. Currently only
+/// implements 32bit scalar division, but future work is removing this
+/// limitation.
+///
+/// @brief Replace Div with generated code.
+bool llvm::expandDivision(BinaryOperator *Div) {
+  assert((Div->getOpcode() == Instruction::SDiv ||
+          Div->getOpcode() == Instruction::UDiv) &&
+         "Trying to expand division from a non-division function");
+
+  IRBuilder<> Builder(Div);
+
+  if (Div->getType()->isVectorTy())
+    llvm_unreachable("Div over vectors not supported");
+
+  // First prepare the sign if it's a signed division
+  if (Div->getOpcode() == Instruction::SDiv) {
+    // Lower the code to unsigned division, and reset Div to point to the udiv.
+    Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
+                                                 Div->getOperand(1), Builder);
+    Div->replaceAllUsesWith(Quotient);
+    Div->dropAllReferences();
+    Div->eraseFromParent();
+
+    // If we didn't actually generate a udiv instruction, we're done
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+    if (!BO || BO->getOpcode() != Instruction::UDiv)
+      return true;
+
+    Div = BO;
+  }
+
+  // Insert the unsigned division code
+  Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0),
+                                                 Div->getOperand(1),
+                                                 Builder);
+  Div->replaceAllUsesWith(Quotient);
+  Div->dropAllReferences();
+  Div->eraseFromParent();
+
+  return true;
+}
+
+/// Generate code to compute the remainder of two integers of bitwidth up to 
+/// 32 bits. Uses the above routines and extends the inputs/truncates the
+/// outputs to operate in 32 bits; that is, these routines are good for targets
+/// that have no or very little suppport for smaller than 32 bit integer 
+/// arithmetic.
+///
+/// @brief Replace Rem with emulation code.
+bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
+  assert((Rem->getOpcode() == Instruction::SRem ||
+          Rem->getOpcode() == Instruction::URem) &&
+          "Trying to expand remainder from a non-remainder function");
+
+  Type *RemTy = Rem->getType();
+  if (RemTy->isVectorTy())
+    llvm_unreachable("Div over vectors not supported");
+
+  unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
+
+  if (RemTyBitWidth > 32) 
+    llvm_unreachable("Div of bitwidth greater than 32 not supported");
+
+  if (RemTyBitWidth == 32) 
+    return expandRemainder(Rem);
+
+  // If bitwidth smaller than 32 extend inputs, truncate output and proceed
+  // with 32 bit division.
+  IRBuilder<> Builder(Rem);
+
+  Value *ExtDividend;
+  Value *ExtDivisor;
+  Value *ExtRem;
+  Value *Trunc;
+  Type *Int32Ty = Builder.getInt32Ty();
+
+  if (Rem->getOpcode() == Instruction::SRem) {
+    ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int32Ty);
+    ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int32Ty);
+    ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
+  } else {
+    ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int32Ty);
+    ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int32Ty);
+    ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
+  }
+  Trunc = Builder.CreateTrunc(ExtRem, RemTy);
+
+  Rem->replaceAllUsesWith(Trunc);
+  Rem->dropAllReferences();
+  Rem->eraseFromParent();
+
+  return expandRemainder(cast<BinaryOperator>(ExtRem));
+}
+
+
+/// Generate code to divide two integers of bitwidth up to 32 bits. Uses the
+/// above routines and extends the inputs/truncates the outputs to operate
+/// in 32 bits; that is, these routines are good for targets that have no
+/// or very little support for smaller than 32 bit integer arithmetic.
+///
+/// @brief Replace Div with emulation code.
+bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
+  assert((Div->getOpcode() == Instruction::SDiv ||
+          Div->getOpcode() == Instruction::UDiv) &&
+          "Trying to expand division from a non-division function");
+
+  Type *DivTy = Div->getType();
+  if (DivTy->isVectorTy())
+    llvm_unreachable("Div over vectors not supported");
+
+  unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
+
+  if (DivTyBitWidth > 32)
+    llvm_unreachable("Div of bitwidth greater than 32 not supported");
+
+  if (DivTyBitWidth == 32)
+    return expandDivision(Div);
+
+  // If bitwidth smaller than 32 extend inputs, truncate output and proceed
+  // with 32 bit division.
+  IRBuilder<> Builder(Div);
+
+  Value *ExtDividend;
+  Value *ExtDivisor;
+  Value *ExtDiv;
+  Value *Trunc;
+  Type *Int32Ty = Builder.getInt32Ty();
+
+  if (Div->getOpcode() == Instruction::SDiv) {
+    ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int32Ty);
+    ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int32Ty);
+    ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
+  } else {
+    ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int32Ty);
+    ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int32Ty);
+    ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);  
+  }
+  Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
+
+  Div->replaceAllUsesWith(Trunc);
+  Div->dropAllReferences();
+  Div->eraseFromParent();
+
+  return expandDivision(cast<BinaryOperator>(ExtDiv));
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
new file mode 100644
index 000000000000..2d1b166c2101
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -0,0 +1,292 @@
+//===-- LCSSA.cpp - Convert loops into loop-closed SSA form ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops by placing phi nodes at the end of the loops for
+// all values that are live across the loop boundary.  For example, it turns
+// the left into the right code:
+// 
+// for (...)                for (...)
+//   if (c)                   if (c)
+//     X1 = ...                 X1 = ...
+//   else                     else
+//     X2 = ...                 X2 = ...
+//   X3 = phi(X1, X2)         X3 = phi(X1, X2)
+// ... = X3 + 4             X4 = phi(X3)
+//                          ... = X4 + 4
+//
+// This is still valid LLVM; the extra phi nodes are purely redundant, and will
+// be trivially eliminated by InstCombine.  The major benefit of this 
+// transformation is that it makes many other loop optimizations, such as 
+// LoopUnswitching, simpler.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lcssa"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/PredIteratorCache.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+STATISTIC(NumLCSSA, "Number of live out of a loop variables");
+
+namespace {
+  struct LCSSA : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LCSSA() : LoopPass(ID) {
+      initializeLCSSAPass(*PassRegistry::getPassRegistry());
+    }
+
+    // Cached analysis information for the current function.
+    DominatorTree *DT;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    std::vector<BasicBlock*> LoopBlocks;
+    PredIteratorCache PredCache;
+    Loop *L;
+    
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG.  It maintains both of these,
+    /// as well as the CFG.  It also requires dominator information.
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<LoopInfo>();
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addPreserved<ScalarEvolution>();
+    }
+  private:
+    bool ProcessInstruction(Instruction *Inst,
+                            const SmallVectorImpl<BasicBlock*> &ExitBlocks);
+    
+    /// verifyAnalysis() - Verify loop nest.
+    virtual void verifyAnalysis() const {
+      // Check the special guarantees that LCSSA makes.
+      assert(L->isLCSSAForm(*DT) && "LCSSA form not preserved!");
+    }
+
+    /// inLoop - returns true if the given block is within the current loop
+    bool inLoop(BasicBlock *B) const {
+      return std::binary_search(LoopBlocks.begin(), LoopBlocks.end(), B);
+    }
+  };
+}
+  
+char LCSSA::ID = 0;
+INITIALIZE_PASS_BEGIN(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(LCSSA, "lcssa", "Loop-Closed SSA Form Pass", false, false)
+
+Pass *llvm::createLCSSAPass() { return new LCSSA(); }
+char &llvm::LCSSAID = LCSSA::ID;
+
+
+/// BlockDominatesAnExit - Return true if the specified block dominates at least
+/// one of the blocks in the specified list.
+static bool BlockDominatesAnExit(BasicBlock *BB,
+                                 const SmallVectorImpl<BasicBlock*> &ExitBlocks,
+                                 DominatorTree *DT) {
+  DomTreeNode *DomNode = DT->getNode(BB);
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (DT->dominates(DomNode, DT->getNode(ExitBlocks[i])))
+      return true;
+
+  return false;
+}
+
+
+/// runOnFunction - Process all loops in the function, inner-most out.
+bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
+  L = TheLoop;
+  
+  DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+  SE = getAnalysisIfAvailable<ScalarEvolution>();
+
+  // Get the set of exiting blocks.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  
+  if (ExitBlocks.empty())
+    return false;
+  
+  // Speed up queries by creating a sorted vector of blocks.
+  LoopBlocks.clear();
+  LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
+  array_pod_sort(LoopBlocks.begin(), LoopBlocks.end());
+  
+  // Look at all the instructions in the loop, checking to see if they have uses
+  // outside the loop.  If so, rewrite those uses.
+  bool MadeChange = false;
+  
+  for (Loop::block_iterator BBI = L->block_begin(), E = L->block_end();
+       BBI != E; ++BBI) {
+    BasicBlock *BB = *BBI;
+    
+    // For large loops, avoid use-scanning by using dominance information:  In
+    // particular, if a block does not dominate any of the loop exits, then none
+    // of the values defined in the block could be used outside the loop.
+    if (!BlockDominatesAnExit(BB, ExitBlocks, DT))
+      continue;
+    
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      // Reject two common cases fast: instructions with no uses (like stores)
+      // and instructions with one use that is in the same block as this.
+      if (I->use_empty() ||
+          (I->hasOneUse() && I->use_back()->getParent() == BB &&
+           !isa<PHINode>(I->use_back())))
+        continue;
+      
+      MadeChange |= ProcessInstruction(I, ExitBlocks);
+    }
+  }
+
+  // If we modified the code, remove any caches about the loop from SCEV to
+  // avoid dangling entries.
+  // FIXME: This is a big hammer, can we clear the cache more selectively?
+  if (SE && MadeChange)
+    SE->forgetLoop(L);
+  
+  assert(L->isLCSSAForm(*DT));
+  PredCache.clear();
+
+  return MadeChange;
+}
+
+/// isExitBlock - Return true if the specified block is in the list.
+static bool isExitBlock(BasicBlock *BB,
+                        const SmallVectorImpl<BasicBlock*> &ExitBlocks) {
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (ExitBlocks[i] == BB)
+      return true;
+  return false;
+}
+
+/// ProcessInstruction - Given an instruction in the loop, check to see if it
+/// has any uses that are outside the current loop.  If so, insert LCSSA PHI
+/// nodes and rewrite the uses.
+bool LCSSA::ProcessInstruction(Instruction *Inst,
+                               const SmallVectorImpl<BasicBlock*> &ExitBlocks) {
+  SmallVector<Use*, 16> UsesToRewrite;
+  
+  BasicBlock *InstBB = Inst->getParent();
+  
+  for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end();
+       UI != E; ++UI) {
+    User *U = *UI;
+    BasicBlock *UserBB = cast<Instruction>(U)->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(U))
+      UserBB = PN->getIncomingBlock(UI);
+    
+    if (InstBB != UserBB && !inLoop(UserBB))
+      UsesToRewrite.push_back(&UI.getUse());
+  }
+
+  // If there are no uses outside the loop, exit with no change.
+  if (UsesToRewrite.empty()) return false;
+  
+  ++NumLCSSA; // We are applying the transformation
+
+  // Invoke instructions are special in that their result value is not available
+  // along their unwind edge. The code below tests to see whether DomBB dominates
+  // the value, so adjust DomBB to the normal destination block, which is
+  // effectively where the value is first usable.
+  BasicBlock *DomBB = Inst->getParent();
+  if (InvokeInst *Inv = dyn_cast<InvokeInst>(Inst))
+    DomBB = Inv->getNormalDest();
+
+  DomTreeNode *DomNode = DT->getNode(DomBB);
+
+  SmallVector<PHINode*, 16> AddedPHIs;
+
+  SSAUpdater SSAUpdate;
+  SSAUpdate.Initialize(Inst->getType(), Inst->getName());
+  
+  // Insert the LCSSA phi's into all of the exit blocks dominated by the
+  // value, and add them to the Phi's map.
+  for (SmallVectorImpl<BasicBlock*>::const_iterator BBI = ExitBlocks.begin(),
+      BBE = ExitBlocks.end(); BBI != BBE; ++BBI) {
+    BasicBlock *ExitBB = *BBI;
+    if (!DT->dominates(DomNode, DT->getNode(ExitBB))) continue;
+    
+    // If we already inserted something for this BB, don't reprocess it.
+    if (SSAUpdate.HasValueForBlock(ExitBB)) continue;
+    
+    PHINode *PN = PHINode::Create(Inst->getType(),
+                                  PredCache.GetNumPreds(ExitBB),
+                                  Inst->getName()+".lcssa",
+                                  ExitBB->begin());
+
+    // Add inputs from inside the loop for this PHI.
+    for (BasicBlock **PI = PredCache.GetPreds(ExitBB); *PI; ++PI) {
+      PN->addIncoming(Inst, *PI);
+
+      // If the exit block has a predecessor not within the loop, arrange for
+      // the incoming value use corresponding to that predecessor to be
+      // rewritten in terms of a different LCSSA PHI.
+      if (!inLoop(*PI))
+        UsesToRewrite.push_back(
+          &PN->getOperandUse(
+            PN->getOperandNumForIncomingValue(PN->getNumIncomingValues()-1)));
+    }
+
+    AddedPHIs.push_back(PN);
+    
+    // Remember that this phi makes the value alive in this block.
+    SSAUpdate.AddAvailableValue(ExitBB, PN);
+  }
+
+  // Rewrite all uses outside the loop in terms of the new PHIs we just
+  // inserted.
+  for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) {
+    // If this use is in an exit block, rewrite to use the newly inserted PHI.
+    // This is required for correctness because SSAUpdate doesn't handle uses in
+    // the same block.  It assumes the PHI we inserted is at the end of the
+    // block.
+    Instruction *User = cast<Instruction>(UsesToRewrite[i]->getUser());
+    BasicBlock *UserBB = User->getParent();
+    if (PHINode *PN = dyn_cast<PHINode>(User))
+      UserBB = PN->getIncomingBlock(*UsesToRewrite[i]);
+
+    if (isa<PHINode>(UserBB->begin()) &&
+        isExitBlock(UserBB, ExitBlocks)) {
+      // Tell the VHs that the uses changed. This updates SCEV's caches.
+      if (UsesToRewrite[i]->get()->hasValueHandle())
+        ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin());
+      UsesToRewrite[i]->set(UserBB->begin());
+      continue;
+    }
+    
+    // Otherwise, do full PHI insertion.
+    SSAUpdate.RewriteUse(*UsesToRewrite[i]);
+  }
+
+  // Remove PHI nodes that did not have any uses rewritten.
+  for (unsigned i = 0, e = AddedPHIs.size(); i != e; ++i) {
+    if (AddedPHIs[i]->use_empty())
+      AddedPHIs[i]->eraseFromParent();
+  }
+  
+  return true;
+}
+
diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp
new file mode 100644
index 000000000000..be80d34d960f
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp
@@ -0,0 +1,1001 @@
+//===-- Local.cpp - Functions to perform local transformations ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform various local transformations to the
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  Local constant propagation.
+//
+
+/// ConstantFoldTerminator - If a terminator instruction is predicated on a
+/// constant value, convert it into an unconditional branch to the constant
+/// destination.  This is a nontrivial operation because the successors of this
+/// basic block must have their PHI nodes updated.
+/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
+/// conditions and indirectbr addresses this might make dead if
+/// DeleteDeadConditions is true.
+bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
+                                  const TargetLibraryInfo *TLI) {
+  TerminatorInst *T = BB->getTerminator();
+  IRBuilder<> Builder(T);
+
+  // Branch - See if we are conditional jumping on constant
+  if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+    if (BI->isUnconditional()) return false;  // Can't optimize uncond branch
+    BasicBlock *Dest1 = BI->getSuccessor(0);
+    BasicBlock *Dest2 = BI->getSuccessor(1);
+
+    if (ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {
+      // Are we branching on constant?
+      // YES.  Change to unconditional branch...
+      BasicBlock *Destination = Cond->getZExtValue() ? Dest1 : Dest2;
+      BasicBlock *OldDest     = Cond->getZExtValue() ? Dest2 : Dest1;
+
+      //cerr << "Function: " << T->getParent()->getParent()
+      //     << "\nRemoving branch from " << T->getParent()
+      //     << "\n\nTo: " << OldDest << endl;
+
+      // Let the basic block know that we are letting go of it.  Based on this,
+      // it will adjust it's PHI nodes.
+      OldDest->removePredecessor(BB);
+
+      // Replace the conditional branch with an unconditional one.
+      Builder.CreateBr(Destination);
+      BI->eraseFromParent();
+      return true;
+    }
+    
+    if (Dest2 == Dest1) {       // Conditional branch to same location?
+      // This branch matches something like this:
+      //     br bool %cond, label %Dest, label %Dest
+      // and changes it into:  br label %Dest
+
+      // Let the basic block know that we are letting go of one copy of it.
+      assert(BI->getParent() && "Terminator not inserted in block!");
+      Dest1->removePredecessor(BI->getParent());
+
+      // Replace the conditional branch with an unconditional one.
+      Builder.CreateBr(Dest1);
+      Value *Cond = BI->getCondition();
+      BI->eraseFromParent();
+      if (DeleteDeadConditions)
+        RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+      return true;
+    }
+    return false;
+  }
+  
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+    // If we are switching on a constant, we can convert the switch into a
+    // single branch instruction!
+    ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition());
+    BasicBlock *TheOnlyDest = SI->getDefaultDest();
+    BasicBlock *DefaultDest = TheOnlyDest;
+
+    // Figure out which case it goes to.
+    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+         i != e; ++i) {
+      // Found case matching a constant operand?
+      if (i.getCaseValue() == CI) {
+        TheOnlyDest = i.getCaseSuccessor();
+        break;
+      }
+
+      // Check to see if this branch is going to the same place as the default
+      // dest.  If so, eliminate it as an explicit compare.
+      if (i.getCaseSuccessor() == DefaultDest) {
+        MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+        // MD should have 2 + NumCases operands.
+        if (MD && MD->getNumOperands() == 2 + SI->getNumCases()) {
+          // Collect branch weights into a vector.
+          SmallVector<uint32_t, 8> Weights;
+          for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+               ++MD_i) {
+            ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
+            assert(CI);
+            Weights.push_back(CI->getValue().getZExtValue());
+          }
+          // Merge weight of this case to the default weight.
+          unsigned idx = i.getCaseIndex();
+          Weights[0] += Weights[idx+1];
+          // Remove weight for this case.
+          std::swap(Weights[idx+1], Weights.back());
+          Weights.pop_back();
+          SI->setMetadata(LLVMContext::MD_prof,
+                          MDBuilder(BB->getContext()).
+                          createBranchWeights(Weights));
+        }
+        // Remove this entry.
+        DefaultDest->removePredecessor(SI->getParent());
+        SI->removeCase(i);
+        --i; --e;
+        continue;
+      }
+
+      // Otherwise, check to see if the switch only branches to one destination.
+      // We do this by reseting "TheOnlyDest" to null when we find two non-equal
+      // destinations.
+      if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = 0;
+    }
+
+    if (CI && !TheOnlyDest) {
+      // Branching on a constant, but not any of the cases, go to the default
+      // successor.
+      TheOnlyDest = SI->getDefaultDest();
+    }
+
+    // If we found a single destination that we can fold the switch into, do so
+    // now.
+    if (TheOnlyDest) {
+      // Insert the new branch.
+      Builder.CreateBr(TheOnlyDest);
+      BasicBlock *BB = SI->getParent();
+
+      // Remove entries from PHI nodes which we no longer branch to...
+      for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+        // Found case matching a constant operand?
+        BasicBlock *Succ = SI->getSuccessor(i);
+        if (Succ == TheOnlyDest)
+          TheOnlyDest = 0;  // Don't modify the first branch to TheOnlyDest
+        else
+          Succ->removePredecessor(BB);
+      }
+
+      // Delete the old switch.
+      Value *Cond = SI->getCondition();
+      SI->eraseFromParent();
+      if (DeleteDeadConditions)
+        RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+      return true;
+    }
+    
+    if (SI->getNumCases() == 1) {
+      // Otherwise, we can fold this switch into a conditional branch
+      // instruction if it has only one non-default destination.
+      SwitchInst::CaseIt FirstCase = SI->case_begin();
+      IntegersSubset& Case = FirstCase.getCaseValueEx();
+      if (Case.isSingleNumber()) {
+        // FIXME: Currently work with ConstantInt based numbers.
+        Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+             Case.getSingleNumber(0).toConstantInt(),
+            "cond");
+
+        // Insert the new branch.
+        BranchInst *NewBr = Builder.CreateCondBr(Cond,
+                                FirstCase.getCaseSuccessor(),
+                                SI->getDefaultDest());
+        MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+        if (MD && MD->getNumOperands() == 3) {
+          ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
+          ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
+          assert(SICase && SIDef);
+          // The TrueWeight should be the weight for the single case of SI.
+          NewBr->setMetadata(LLVMContext::MD_prof,
+                 MDBuilder(BB->getContext()).
+                 createBranchWeights(SICase->getValue().getZExtValue(),
+                                     SIDef->getValue().getZExtValue()));
+        }
+
+        // Delete the old switch.
+        SI->eraseFromParent();
+        return true;
+      }
+    }
+    return false;
+  }
+
+  if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(T)) {
+    // indirectbr blockaddress(@F, @BB) -> br label @BB
+    if (BlockAddress *BA =
+          dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
+      BasicBlock *TheOnlyDest = BA->getBasicBlock();
+      // Insert the new branch.
+      Builder.CreateBr(TheOnlyDest);
+      
+      for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+        if (IBI->getDestination(i) == TheOnlyDest)
+          TheOnlyDest = 0;
+        else
+          IBI->getDestination(i)->removePredecessor(IBI->getParent());
+      }
+      Value *Address = IBI->getAddress();
+      IBI->eraseFromParent();
+      if (DeleteDeadConditions)
+        RecursivelyDeleteTriviallyDeadInstructions(Address, TLI);
+      
+      // If we didn't find our destination in the IBI successor list, then we
+      // have undefined behavior.  Replace the unconditional branch with an
+      // 'unreachable' instruction.
+      if (TheOnlyDest) {
+        BB->getTerminator()->eraseFromParent();
+        new UnreachableInst(BB->getContext(), BB);
+      }
+      
+      return true;
+    }
+  }
+  
+  return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Local dead code elimination.
+//
+
+/// isInstructionTriviallyDead - Return true if the result produced by the
+/// instruction is not used, and the instruction has no side effects.
+///
+bool llvm::isInstructionTriviallyDead(Instruction *I,
+                                      const TargetLibraryInfo *TLI) {
+  if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
+
+  // We don't want the landingpad instruction removed by anything this general.
+  if (isa<LandingPadInst>(I))
+    return false;
+
+  // We don't want debug info removed by anything this general, unless
+  // debug info is empty.
+  if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) {
+    if (DDI->getAddress())
+      return false;
+    return true;
+  }
+  if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) {
+    if (DVI->getValue())
+      return false;
+    return true;
+  }
+
+  if (!I->mayHaveSideEffects()) return true;
+
+  // Special case intrinsics that "may have side effects" but can be deleted
+  // when dead.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+    // Safe to delete llvm.stacksave if dead.
+    if (II->getIntrinsicID() == Intrinsic::stacksave)
+      return true;
+
+    // Lifetime intrinsics are dead when their right-hand is undef.
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+        II->getIntrinsicID() == Intrinsic::lifetime_end)
+      return isa<UndefValue>(II->getArgOperand(1));
+  }
+
+  if (isAllocLikeFn(I, TLI)) return true;
+
+  if (CallInst *CI = isFreeCall(I, TLI))
+    if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
+      return C->isNullValue() || isa<UndefValue>(C);
+
+  return false;
+}
+
+/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
+/// trivially dead instruction, delete it.  If that makes any of its operands
+/// trivially dead, delete them too, recursively.  Return true if any
+/// instructions were deleted.
+bool
+llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
+                                                 const TargetLibraryInfo *TLI) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI))
+    return false;
+  
+  SmallVector<Instruction*, 16> DeadInsts;
+  DeadInsts.push_back(I);
+  
+  do {
+    I = DeadInsts.pop_back_val();
+
+    // Null out all of the instruction's operands to see if any operand becomes
+    // dead as we go.
+    for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+      Value *OpV = I->getOperand(i);
+      I->setOperand(i, 0);
+      
+      if (!OpV->use_empty()) continue;
+    
+      // If the operand is an instruction that became dead as we nulled out the
+      // operand, and if it is 'trivially' dead, delete it in a future loop
+      // iteration.
+      if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+        if (isInstructionTriviallyDead(OpI, TLI))
+          DeadInsts.push_back(OpI);
+    }
+    
+    I->eraseFromParent();
+  } while (!DeadInsts.empty());
+
+  return true;
+}
+
+/// areAllUsesEqual - Check whether the uses of a value are all the same.
+/// This is similar to Instruction::hasOneUse() except this will also return
+/// true when there are no uses or multiple uses that all refer to the same
+/// value.
+static bool areAllUsesEqual(Instruction *I) {
+  Value::use_iterator UI = I->use_begin();
+  Value::use_iterator UE = I->use_end();
+  if (UI == UE)
+    return true;
+
+  User *TheUse = *UI;
+  for (++UI; UI != UE; ++UI) {
+    if (*UI != TheUse)
+      return false;
+  }
+  return true;
+}
+
+/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
+/// dead PHI node, due to being a def-use chain of single-use nodes that
+/// either forms a cycle or is terminated by a trivially dead instruction,
+/// delete it.  If that makes any of its operands trivially dead, delete them
+/// too, recursively.  Return true if a change was made.
+bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
+                                        const TargetLibraryInfo *TLI) {
+  SmallPtrSet<Instruction*, 4> Visited;
+  for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects();
+       I = cast<Instruction>(*I->use_begin())) {
+    if (I->use_empty())
+      return RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+
+    // If we find an instruction more than once, we're on a cycle that
+    // won't prove fruitful.
+    if (!Visited.insert(I)) {
+      // Break the cycle and delete the instruction and its operands.
+      I->replaceAllUsesWith(UndefValue::get(I->getType()));
+      (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// SimplifyInstructionsInBlock - Scan the specified basic block and try to
+/// simplify any instructions in it and recursively delete dead instructions.
+///
+/// This returns true if it changed the code, note that it can delete
+/// instructions in other blocks as well in this block.
+bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD,
+                                       const TargetLibraryInfo *TLI) {
+  bool MadeChange = false;
+
+#ifndef NDEBUG
+  // In debug builds, ensure that the terminator of the block is never replaced
+  // or deleted by these simplifications. The idea of simplification is that it
+  // cannot introduce new instructions, and there is no way to replace the
+  // terminator of a block without introducing a new instruction.
+  AssertingVH<Instruction> TerminatorVH(--BB->end());
+#endif
+
+  for (BasicBlock::iterator BI = BB->begin(), E = --BB->end(); BI != E; ) {
+    assert(!BI->isTerminator());
+    Instruction *Inst = BI++;
+
+    WeakVH BIHandle(BI);
+    if (recursivelySimplifyInstruction(Inst, TD)) {
+      MadeChange = true;
+      if (BIHandle != BI)
+        BI = BB->begin();
+      continue;
+    }
+
+    MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
+    if (BIHandle != BI)
+      BI = BB->begin();
+  }
+  return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Graph Restructuring.
+//
+
+
+/// RemovePredecessorAndSimplify - Like BasicBlock::removePredecessor, this
+/// method is called when we're about to delete Pred as a predecessor of BB.  If
+/// BB contains any PHI nodes, this drops the entries in the PHI nodes for Pred.
+///
+/// Unlike the removePredecessor method, this attempts to simplify uses of PHI
+/// nodes that collapse into identity values.  For example, if we have:
+///   x = phi(1, 0, 0, 0)
+///   y = and x, z
+///
+/// .. and delete the predecessor corresponding to the '1', this will attempt to
+/// recursively fold the and to 0.
+void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
+                                        DataLayout *TD) {
+  // This only adjusts blocks with PHI nodes.
+  if (!isa<PHINode>(BB->begin()))
+    return;
+  
+  // Remove the entries for Pred from the PHI nodes in BB, but do not simplify
+  // them down.  This will leave us with single entry phi nodes and other phis
+  // that can be removed.
+  BB->removePredecessor(Pred, true);
+  
+  WeakVH PhiIt = &BB->front();
+  while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) {
+    PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt));
+    Value *OldPhiIt = PhiIt;
+
+    if (!recursivelySimplifyInstruction(PN, TD))
+      continue;
+
+    // If recursive simplification ended up deleting the next PHI node we would
+    // iterate to, then our iterator is invalid, restart scanning from the top
+    // of the block.
+    if (PhiIt != OldPhiIt) PhiIt = &BB->front();
+  }
+}
+
+
+/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its
+/// predecessor is known to have one successor (DestBB!).  Eliminate the edge
+/// between them, moving the instructions in the predecessor into DestBB and
+/// deleting the predecessor block.
+///
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
+  // If BB has single-entry PHI nodes, fold them.
+  while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+    Value *NewVal = PN->getIncomingValue(0);
+    // Replace self referencing PHI with undef, it must be dead.
+    if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+    PN->replaceAllUsesWith(NewVal);
+    PN->eraseFromParent();
+  }
+  
+  BasicBlock *PredBB = DestBB->getSinglePredecessor();
+  assert(PredBB && "Block doesn't have a single predecessor!");
+  
+  // Zap anything that took the address of DestBB.  Not doing this will give the
+  // address an invalid value.
+  if (DestBB->hasAddressTaken()) {
+    BlockAddress *BA = BlockAddress::get(DestBB);
+    Constant *Replacement =
+      ConstantInt::get(llvm::Type::getInt32Ty(BA->getContext()), 1);
+    BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(Replacement,
+                                                     BA->getType()));
+    BA->destroyConstant();
+  }
+  
+  // Anything that branched to PredBB now branches to DestBB.
+  PredBB->replaceAllUsesWith(DestBB);
+  
+  // Splice all the instructions from PredBB to DestBB.
+  PredBB->getTerminator()->eraseFromParent();
+  DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
+
+  if (P) {
+    DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
+    if (DT) {
+      BasicBlock *PredBBIDom = DT->getNode(PredBB)->getIDom()->getBlock();
+      DT->changeImmediateDominator(DestBB, PredBBIDom);
+      DT->eraseNode(PredBB);
+    }
+    ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
+    if (PI) {
+      PI->replaceAllUses(PredBB, DestBB);
+      PI->removeEdge(ProfileInfo::getEdge(PredBB, DestBB));
+    }
+  }
+  // Nuke BB.
+  PredBB->eraseFromParent();
+}
+
+/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
+/// almost-empty BB ending in an unconditional branch to Succ, into succ.
+///
+/// Assumption: Succ is the single successor for BB.
+///
+static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
+  assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
+
+  DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into " 
+        << Succ->getName() << "\n");
+  // Shortcut, if there is only a single predecessor it must be BB and merging
+  // is always safe
+  if (Succ->getSinglePredecessor()) return true;
+
+  // Make a list of the predecessors of BB
+  SmallPtrSet<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
+
+  // Look at all the phi nodes in Succ, to see if they present a conflict when
+  // merging these blocks
+  for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+
+    // If the incoming value from BB is again a PHINode in
+    // BB which has the same incoming value for *PI as PN does, we can
+    // merge the phi nodes and then the blocks can still be merged
+    PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
+    if (BBPN && BBPN->getParent() == BB) {
+      for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
+        BasicBlock *IBB = PN->getIncomingBlock(PI);
+        if (BBPreds.count(IBB) &&
+            BBPN->getIncomingValueForBlock(IBB) != PN->getIncomingValue(PI)) {
+          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " 
+                << Succ->getName() << " is conflicting with " 
+                << BBPN->getName() << " with regard to common predecessor "
+                << IBB->getName() << "\n");
+          return false;
+        }
+      }
+    } else {
+      Value* Val = PN->getIncomingValueForBlock(BB);
+      for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
+        // See if the incoming value for the common predecessor is equal to the
+        // one for BB, in which case this phi node will not prevent the merging
+        // of the block.
+        BasicBlock *IBB = PN->getIncomingBlock(PI);
+        if (BBPreds.count(IBB) && Val != PN->getIncomingValue(PI)) {
+          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " 
+                << Succ->getName() << " is conflicting with regard to common "
+                << "predecessor " << IBB->getName() << "\n");
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+/// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an
+/// unconditional branch, and contains no instructions other than PHI nodes,
+/// potential side-effect free intrinsics and the branch.  If possible,
+/// eliminate BB by rewriting all the predecessors to branch to the successor
+/// block and return true.  If we can't transform, return false.
+bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
+  assert(BB != &BB->getParent()->getEntryBlock() &&
+         "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
+
+  // We can't eliminate infinite loops.
+  BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
+  if (BB == Succ) return false;
+  
+  // Check to see if merging these blocks would cause conflicts for any of the
+  // phi nodes in BB or Succ. If not, we can safely merge.
+  if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
+
+  // Check for cases where Succ has multiple predecessors and a PHI node in BB
+  // has uses which will not disappear when the PHI nodes are merged.  It is
+  // possible to handle such cases, but difficult: it requires checking whether
+  // BB dominates Succ, which is non-trivial to calculate in the case where
+  // Succ has multiple predecessors.  Also, it requires checking whether
+  // constructing the necessary self-referential PHI node doesn't introduce any
+  // conflicts; this isn't too difficult, but the previous code for doing this
+  // was incorrect.
+  //
+  // Note that if this check finds a live use, BB dominates Succ, so BB is
+  // something like a loop pre-header (or rarely, a part of an irreducible CFG);
+  // folding the branch isn't profitable in that case anyway.
+  if (!Succ->getSinglePredecessor()) {
+    BasicBlock::iterator BBI = BB->begin();
+    while (isa<PHINode>(*BBI)) {
+      for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
+           UI != E; ++UI) {
+        if (PHINode* PN = dyn_cast<PHINode>(*UI)) {
+          if (PN->getIncomingBlock(UI) != BB)
+            return false;
+        } else {
+          return false;
+        }
+      }
+      ++BBI;
+    }
+  }
+
+  DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
+  
+  if (isa<PHINode>(Succ->begin())) {
+    // If there is more than one pred of succ, and there are PHI nodes in
+    // the successor, then we need to add incoming edges for the PHI nodes
+    //
+    const SmallVector<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
+    
+    // Loop over all of the PHI nodes in the successor of BB.
+    for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+      PHINode *PN = cast<PHINode>(I);
+      Value *OldVal = PN->removeIncomingValue(BB, false);
+      assert(OldVal && "No entry in PHI for Pred BB!");
+      
+      // If this incoming value is one of the PHI nodes in BB, the new entries
+      // in the PHI node are the entries from the old PHI.
+      if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
+        PHINode *OldValPN = cast<PHINode>(OldVal);
+        for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i)
+          // Note that, since we are merging phi nodes and BB and Succ might
+          // have common predecessors, we could end up with a phi node with
+          // identical incoming branches. This will be cleaned up later (and
+          // will trigger asserts if we try to clean it up now, without also
+          // simplifying the corresponding conditional branch).
+          PN->addIncoming(OldValPN->getIncomingValue(i),
+                          OldValPN->getIncomingBlock(i));
+      } else {
+        // Add an incoming value for each of the new incoming values.
+        for (unsigned i = 0, e = BBPreds.size(); i != e; ++i)
+          PN->addIncoming(OldVal, BBPreds[i]);
+      }
+    }
+  }
+  
+  if (Succ->getSinglePredecessor()) {
+    // BB is the only predecessor of Succ, so Succ will end up with exactly
+    // the same predecessors BB had.
+
+    // Copy over any phi, debug or lifetime instruction.
+    BB->getTerminator()->eraseFromParent();
+    Succ->getInstList().splice(Succ->getFirstNonPHI(), BB->getInstList());
+  } else {
+    while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+      // We explicitly check for such uses in CanPropagatePredecessorsForPHIs.
+      assert(PN->use_empty() && "There shouldn't be any uses here!");
+      PN->eraseFromParent();
+    }
+  }
+    
+  // Everything that jumped to BB now goes to Succ.
+  BB->replaceAllUsesWith(Succ);
+  if (!Succ->hasName()) Succ->takeName(BB);
+  BB->eraseFromParent();              // Delete the old basic block.
+  return true;
+}
+
+/// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI
+/// nodes in this block. This doesn't try to be clever about PHI nodes
+/// which differ only in the order of the incoming values, but instcombine
+/// orders them so it usually won't matter.
+///
+bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
+  bool Changed = false;
+
+  // This implementation doesn't currently consider undef operands
+  // specially. Theoretically, two phis which are identical except for
+  // one having an undef where the other doesn't could be collapsed.
+
+  // Map from PHI hash values to PHI nodes. If multiple PHIs have
+  // the same hash value, the element is the first PHI in the
+  // linked list in CollisionMap.
+  DenseMap<uintptr_t, PHINode *> HashMap;
+
+  // Maintain linked lists of PHI nodes with common hash values.
+  DenseMap<PHINode *, PHINode *> CollisionMap;
+
+  // Examine each PHI.
+  for (BasicBlock::iterator I = BB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I++); ) {
+    // Compute a hash value on the operands. Instcombine will likely have sorted
+    // them, which helps expose duplicates, but we have to check all the
+    // operands to be safe in case instcombine hasn't run.
+    uintptr_t Hash = 0;
+    // This hash algorithm is quite weak as hash functions go, but it seems
+    // to do a good enough job for this particular purpose, and is very quick.
+    for (User::op_iterator I = PN->op_begin(), E = PN->op_end(); I != E; ++I) {
+      Hash ^= reinterpret_cast<uintptr_t>(static_cast<Value *>(*I));
+      Hash = (Hash << 7) | (Hash >> (sizeof(uintptr_t) * CHAR_BIT - 7));
+    }
+    for (PHINode::block_iterator I = PN->block_begin(), E = PN->block_end();
+         I != E; ++I) {
+      Hash ^= reinterpret_cast<uintptr_t>(static_cast<BasicBlock *>(*I));
+      Hash = (Hash << 7) | (Hash >> (sizeof(uintptr_t) * CHAR_BIT - 7));
+    }
+    // Avoid colliding with the DenseMap sentinels ~0 and ~0-1.
+    Hash >>= 1;
+    // If we've never seen this hash value before, it's a unique PHI.
+    std::pair<DenseMap<uintptr_t, PHINode *>::iterator, bool> Pair =
+      HashMap.insert(std::make_pair(Hash, PN));
+    if (Pair.second) continue;
+    // Otherwise it's either a duplicate or a hash collision.
+    for (PHINode *OtherPN = Pair.first->second; ; ) {
+      if (OtherPN->isIdenticalTo(PN)) {
+        // A duplicate. Replace this PHI with its duplicate.
+        PN->replaceAllUsesWith(OtherPN);
+        PN->eraseFromParent();
+        Changed = true;
+        break;
+      }
+      // A non-duplicate hash collision.
+      DenseMap<PHINode *, PHINode *>::iterator I = CollisionMap.find(OtherPN);
+      if (I == CollisionMap.end()) {
+        // Set this PHI to be the head of the linked list of colliding PHIs.
+        PHINode *Old = Pair.first->second;
+        Pair.first->second = PN;
+        CollisionMap[PN] = Old;
+        break;
+      }
+      // Proceed to the next PHI in the list.
+      OtherPN = I->second;
+    }
+  }
+
+  return Changed;
+}
+
+/// enforceKnownAlignment - If the specified pointer points to an object that
+/// we control, modify the object's alignment to PrefAlign. This isn't
+/// often possible though. If alignment is important, a more reliable approach
+/// is to simply align all global variables and allocation instructions to
+/// their preferred alignment from the beginning.
+///
+static unsigned enforceKnownAlignment(Value *V, unsigned Align,
+                                      unsigned PrefAlign, const DataLayout *TD) {
+  V = V->stripPointerCasts();
+
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+    // If the preferred alignment is greater than the natural stack alignment
+    // then don't round up. This avoids dynamic stack realignment.
+    if (TD && TD->exceedsNaturalStackAlignment(PrefAlign))
+      return Align;
+    // If there is a requested alignment and if this is an alloca, round up.
+    if (AI->getAlignment() >= PrefAlign)
+      return AI->getAlignment();
+    AI->setAlignment(PrefAlign);
+    return PrefAlign;
+  }
+
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // If there is a large requested alignment and we can, bump up the alignment
+    // of the global.
+    if (GV->isDeclaration()) return Align;
+    // If the memory we set aside for the global may not be the memory used by
+    // the final program then it is impossible for us to reliably enforce the
+    // preferred alignment.
+    if (GV->isWeakForLinker()) return Align;
+    
+    if (GV->getAlignment() >= PrefAlign)
+      return GV->getAlignment();
+    // We can only increase the alignment of the global if it has no alignment
+    // specified or if it is not assigned a section.  If it is assigned a
+    // section, the global could be densely packed with other objects in the
+    // section, increasing the alignment could cause padding issues.
+    if (!GV->hasSection() || GV->getAlignment() == 0)
+      GV->setAlignment(PrefAlign);
+    return GV->getAlignment();
+  }
+
+  return Align;
+}
+
+/// getOrEnforceKnownAlignment - If the specified pointer has an alignment that
+/// we can determine, return it, otherwise return 0.  If PrefAlign is specified,
+/// and it is more than the alignment of the ultimate object, see if we can
+/// increase the alignment of the ultimate object, making this check succeed.
+unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
+                                          const DataLayout *TD) {
+  assert(V->getType()->isPointerTy() &&
+         "getOrEnforceKnownAlignment expects a pointer!");
+  unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64;
+  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+  ComputeMaskedBits(V, KnownZero, KnownOne, TD);
+  unsigned TrailZ = KnownZero.countTrailingOnes();
+  
+  // Avoid trouble with rediculously large TrailZ values, such as
+  // those computed from a null pointer.
+  TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1));
+  
+  unsigned Align = 1u << std::min(BitWidth - 1, TrailZ);
+  
+  // LLVM doesn't support alignments larger than this currently.
+  Align = std::min(Align, +Value::MaximumAlignment);
+  
+  if (PrefAlign > Align)
+    Align = enforceKnownAlignment(V, Align, PrefAlign, TD);
+    
+  // We don't need to make any adjustment.
+  return Align;
+}
+
+///===---------------------------------------------------------------------===//
+///  Dbg Intrinsic utilities
+///
+
+/// Inserts a llvm.dbg.value instrinsic before the stores to an alloca'd value
+/// that has an associated llvm.dbg.decl intrinsic.
+bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
+                                           StoreInst *SI, DIBuilder &Builder) {
+  DIVariable DIVar(DDI->getVariable());
+  if (!DIVar.Verify())
+    return false;
+
+  Instruction *DbgVal = NULL;
+  // If an argument is zero extended then use argument directly. The ZExt
+  // may be zapped by an optimization pass in future.
+  Argument *ExtendedArg = NULL;
+  if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+    ExtendedArg = dyn_cast<Argument>(ZExt->getOperand(0));
+  if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+    ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
+  if (ExtendedArg)
+    DbgVal = Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, SI);
+  else
+    DbgVal = Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, SI);
+
+  // Propagate any debug metadata from the store onto the dbg.value.
+  DebugLoc SIDL = SI->getDebugLoc();
+  if (!SIDL.isUnknown())
+    DbgVal->setDebugLoc(SIDL);
+  // Otherwise propagate debug metadata from dbg.declare.
+  else
+    DbgVal->setDebugLoc(DDI->getDebugLoc());
+  return true;
+}
+
+/// Inserts a llvm.dbg.value instrinsic before the stores to an alloca'd value
+/// that has an associated llvm.dbg.decl intrinsic.
+bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
+                                           LoadInst *LI, DIBuilder &Builder) {
+  DIVariable DIVar(DDI->getVariable());
+  if (!DIVar.Verify())
+    return false;
+
+  Instruction *DbgVal = 
+    Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0,
+                                    DIVar, LI);
+  
+  // Propagate any debug metadata from the store onto the dbg.value.
+  DebugLoc LIDL = LI->getDebugLoc();
+  if (!LIDL.isUnknown())
+    DbgVal->setDebugLoc(LIDL);
+  // Otherwise propagate debug metadata from dbg.declare.
+  else
+    DbgVal->setDebugLoc(DDI->getDebugLoc());
+  return true;
+}
+
+/// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set
+/// of llvm.dbg.value intrinsics.
+bool llvm::LowerDbgDeclare(Function &F) {
+  DIBuilder DIB(*F.getParent());
+  SmallVector<DbgDeclareInst *, 4> Dbgs;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+    for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) {
+      if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(BI))
+        Dbgs.push_back(DDI);
+    }
+  if (Dbgs.empty())
+    return false;
+
+  for (SmallVector<DbgDeclareInst *, 4>::iterator I = Dbgs.begin(),
+         E = Dbgs.end(); I != E; ++I) {
+    DbgDeclareInst *DDI = *I;
+    if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) {
+      bool RemoveDDI = true;
+      for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+           UI != E; ++UI)
+        if (StoreInst *SI = dyn_cast<StoreInst>(*UI))
+          ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+        else if (LoadInst *LI = dyn_cast<LoadInst>(*UI))
+          ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+        else
+          RemoveDDI = false;
+      if (RemoveDDI)
+        DDI->eraseFromParent();
+    }
+  }
+  return true;
+}
+
+/// FindAllocaDbgDeclare - Finds the llvm.dbg.declare intrinsic describing the
+/// alloca 'V', if any.
+DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
+  if (MDNode *DebugNode = MDNode::getIfExists(V->getContext(), V))
+    for (Value::use_iterator UI = DebugNode->use_begin(),
+         E = DebugNode->use_end(); UI != E; ++UI)
+      if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+        return DDI;
+
+  return 0;
+}
+
+bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
+                                      DIBuilder &Builder) {
+  DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI);
+  if (!DDI)
+    return false;
+  DIVariable DIVar(DDI->getVariable());
+  if (!DIVar.Verify())
+    return false;
+
+  // Create a copy of the original DIDescriptor for user variable, appending
+  // "deref" operation to a list of address elements, as new llvm.dbg.declare
+  // will take a value storing address of the memory for variable, not
+  // alloca itself.
+  Type *Int64Ty = Type::getInt64Ty(AI->getContext());
+  SmallVector<Value*, 4> NewDIVarAddress;
+  if (DIVar.hasComplexAddress()) {
+    for (unsigned i = 0, n = DIVar.getNumAddrElements(); i < n; ++i) {
+      NewDIVarAddress.push_back(
+          ConstantInt::get(Int64Ty, DIVar.getAddrElement(i)));
+    }
+  }
+  NewDIVarAddress.push_back(ConstantInt::get(Int64Ty, DIBuilder::OpDeref));
+  DIVariable NewDIVar = Builder.createComplexVariable(
+      DIVar.getTag(), DIVar.getContext(), DIVar.getName(),
+      DIVar.getFile(), DIVar.getLineNumber(), DIVar.getType(),
+      NewDIVarAddress, DIVar.getArgNumber());
+
+  // Insert llvm.dbg.declare in the same basic block as the original alloca,
+  // and remove old llvm.dbg.declare.
+  BasicBlock *BB = AI->getParent();
+  Builder.insertDeclare(NewAllocaAddress, NewDIVar, BB);
+  DDI->eraseFromParent();
+  return true;
+}
+
+bool llvm::removeUnreachableBlocks(Function &F) {
+  SmallPtrSet<BasicBlock*, 16> Reachable;
+  SmallVector<BasicBlock*, 128> Worklist;
+  Worklist.push_back(&F.getEntryBlock());
+  Reachable.insert(&F.getEntryBlock());
+  do {
+    BasicBlock *BB = Worklist.pop_back_val();
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+      if (Reachable.insert(*SI))
+        Worklist.push_back(*SI);
+  } while (!Worklist.empty());
+
+  if (Reachable.size() == F.size())
+    return false;
+
+  assert(Reachable.size() < F.size());
+  for (Function::iterator I = llvm::next(F.begin()), E = F.end(); I != E; ++I) {
+    if (Reachable.count(I))
+      continue;
+
+    for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI)
+      if (Reachable.count(*SI))
+        (*SI)->removePredecessor(I);
+    I->dropAllReferences();
+  }
+
+  for (Function::iterator I = llvm::next(F.begin()), E=F.end(); I != E;)
+    if (!Reachable.count(I))
+      I = F.getBasicBlockList().erase(I);
+    else
+      ++I;
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
new file mode 100644
index 000000000000..37819cc9c917
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -0,0 +1,800 @@
+//===- LoopSimplify.cpp - Loop Canonicalization Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs several transformations to transform natural loops into a
+// simpler form, which makes subsequent analyses and transformations simpler and
+// more effective.
+//
+// Loop pre-header insertion guarantees that there is a single, non-critical
+// entry edge from outside of the loop to the loop header.  This simplifies a
+// number of analyses and transformations, such as LICM.
+//
+// Loop exit-block insertion guarantees that all exit blocks from the loop
+// (blocks which are outside of the loop that have predecessors inside of the
+// loop) only have predecessors from inside of the loop (and are thus dominated
+// by the loop header).  This simplifies transformations such as store-sinking
+// that are built into LICM.
+//
+// This pass also guarantees that loops will have exactly one backedge.
+//
+// Indirectbr instructions introduce several complications. If the loop
+// contains or is entered by an indirectbr instruction, it may not be possible
+// to transform the loop and make these guarantees. Client code should check
+// that these conditions are true before relying on them.
+//
+// Note that the simplifycfg pass will clean up blocks which are split out but
+// end up being unnecessary, so usage of this pass should not pessimize
+// generated code.
+//
+// This pass obviously modifies the CFG, but updates loop information and
+// dominator information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-simplify"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");
+STATISTIC(NumNested  , "Number of nested loops split out");
+
+namespace {
+  struct LoopSimplify : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LoopSimplify() : LoopPass(ID) {
+      initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
+    }
+
+    // AA - If we have an alias analysis object to update, this is it, otherwise
+    // this is null.
+    AliasAnalysis *AA;
+    LoopInfo *LI;
+    DominatorTree *DT;
+    ScalarEvolution *SE;
+    Loop *L;
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // We need loop information to identify the loops...
+      AU.addRequired<DominatorTree>();
+      AU.addPreserved<DominatorTree>();
+
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<DependenceAnalysis>();
+      AU.addPreservedID(BreakCriticalEdgesID);  // No critical edges added.
+    }
+
+    /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
+    void verifyAnalysis() const;
+
+  private:
+    bool ProcessLoop(Loop *L, LPPassManager &LPM);
+    BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit);
+    BasicBlock *InsertPreheaderForLoop(Loop *L);
+    Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM,
+                             BasicBlock *Preheader);
+    BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader);
+    void PlaceSplitBlockCarefully(BasicBlock *NewBB,
+                                  SmallVectorImpl<BasicBlock*> &SplitPreds,
+                                  Loop *L);
+  };
+}
+
+char LoopSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
+                "Canonicalize natural loops", true, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
+                "Canonicalize natural loops", true, false)
+
+// Publicly exposed interface to pass...
+char &llvm::LoopSimplifyID = LoopSimplify::ID;
+Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
+
+/// runOnLoop - Run down all loops in the CFG (recursively, but we could do
+/// it in any convenient order) inserting preheaders...
+///
+bool LoopSimplify::runOnLoop(Loop *l, LPPassManager &LPM) {
+  L = l;
+  bool Changed = false;
+  LI = &getAnalysis<LoopInfo>();
+  AA = getAnalysisIfAvailable<AliasAnalysis>();
+  DT = &getAnalysis<DominatorTree>();
+  SE = getAnalysisIfAvailable<ScalarEvolution>();
+
+  Changed |= ProcessLoop(L, LPM);
+
+  return Changed;
+}
+
+/// ProcessLoop - Walk the loop structure in depth first order, ensuring that
+/// all loops have preheaders.
+///
+bool LoopSimplify::ProcessLoop(Loop *L, LPPassManager &LPM) {
+  bool Changed = false;
+ReprocessLoop:
+
+  // Check to see that no blocks (other than the header) in this loop have
+  // predecessors that are not in the loop.  This is not valid for natural
+  // loops, but can occur if the blocks are unreachable.  Since they are
+  // unreachable we can just shamelessly delete those CFG edges!
+  for (Loop::block_iterator BB = L->block_begin(), E = L->block_end();
+       BB != E; ++BB) {
+    if (*BB == L->getHeader()) continue;
+
+    SmallPtrSet<BasicBlock*, 4> BadPreds;
+    for (pred_iterator PI = pred_begin(*BB),
+         PE = pred_end(*BB); PI != PE; ++PI) {
+      BasicBlock *P = *PI;
+      if (!L->contains(P))
+        BadPreds.insert(P);
+    }
+
+    // Delete each unique out-of-loop (and thus dead) predecessor.
+    for (SmallPtrSet<BasicBlock*, 4>::iterator I = BadPreds.begin(),
+         E = BadPreds.end(); I != E; ++I) {
+
+      DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
+                   << (*I)->getName() << "\n");
+
+      // Inform each successor of each dead pred.
+      for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI)
+        (*SI)->removePredecessor(*I);
+      // Zap the dead pred's terminator and replace it with unreachable.
+      TerminatorInst *TI = (*I)->getTerminator();
+       TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+      (*I)->getTerminator()->eraseFromParent();
+      new UnreachableInst((*I)->getContext(), *I);
+      Changed = true;
+    }
+  }
+
+  // If there are exiting blocks with branches on undef, resolve the undef in
+  // the direction which will exit the loop. This will help simplify loop
+  // trip count computations.
+  SmallVector<BasicBlock*, 8> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
+       E = ExitingBlocks.end(); I != E; ++I)
+    if (BranchInst *BI = dyn_cast<BranchInst>((*I)->getTerminator()))
+      if (BI->isConditional()) {
+        if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
+
+          DEBUG(dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
+                       << (*I)->getName() << "\n");
+
+          BI->setCondition(ConstantInt::get(Cond->getType(),
+                                            !L->contains(BI->getSuccessor(0))));
+
+          // This may make the loop analyzable, force SCEV recomputation.
+          if (SE)
+            SE->forgetLoop(L);
+
+          Changed = true;
+        }
+      }
+
+  // Does the loop already have a preheader?  If so, don't insert one.
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) {
+    Preheader = InsertPreheaderForLoop(L);
+    if (Preheader) {
+      ++NumInserted;
+      Changed = true;
+    }
+  }
+
+  // Next, check to make sure that all exit nodes of the loop only have
+  // predecessors that are inside of the loop.  This check guarantees that the
+  // loop preheader/header will dominate the exit blocks.  If the exit block has
+  // predecessors from outside of the loop, split the edge now.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+
+  SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(),
+                                               ExitBlocks.end());
+  for (SmallSetVector<BasicBlock *, 8>::iterator I = ExitBlockSet.begin(),
+         E = ExitBlockSet.end(); I != E; ++I) {
+    BasicBlock *ExitBlock = *I;
+    for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock);
+         PI != PE; ++PI)
+      // Must be exactly this loop: no subloops, parent loops, or non-loop preds
+      // allowed.
+      if (!L->contains(*PI)) {
+        if (RewriteLoopExitBlock(L, ExitBlock)) {
+          ++NumInserted;
+          Changed = true;
+        }
+        break;
+      }
+  }
+
+  // If the header has more than two predecessors at this point (from the
+  // preheader and from multiple backedges), we must adjust the loop.
+  BasicBlock *LoopLatch = L->getLoopLatch();
+  if (!LoopLatch) {
+    // If this is really a nested loop, rip it out into a child loop.  Don't do
+    // this for loops with a giant number of backedges, just factor them into a
+    // common backedge instead.
+    if (L->getNumBackEdges() < 8) {
+      if (SeparateNestedLoop(L, LPM, Preheader)) {
+        ++NumNested;
+        // This is a big restructuring change, reprocess the whole loop.
+        Changed = true;
+        // GCC doesn't tail recursion eliminate this.
+        goto ReprocessLoop;
+      }
+    }
+
+    // If we either couldn't, or didn't want to, identify nesting of the loops,
+    // insert a new block that all backedges target, then make it jump to the
+    // loop header.
+    LoopLatch = InsertUniqueBackedgeBlock(L, Preheader);
+    if (LoopLatch) {
+      ++NumInserted;
+      Changed = true;
+    }
+  }
+
+  // Scan over the PHI nodes in the loop header.  Since they now have only two
+  // incoming values (the loop is canonicalized), we may have simplified the PHI
+  // down to 'X = phi [X, Y]', which should be replaced with 'Y'.
+  PHINode *PN;
+  for (BasicBlock::iterator I = L->getHeader()->begin();
+       (PN = dyn_cast<PHINode>(I++)); )
+    if (Value *V = SimplifyInstruction(PN, 0, 0, DT)) {
+      if (AA) AA->deleteValue(PN);
+      if (SE) SE->forgetValue(PN);
+      PN->replaceAllUsesWith(V);
+      PN->eraseFromParent();
+    }
+
+  // If this loop has multiple exits and the exits all go to the same
+  // block, attempt to merge the exits. This helps several passes, such
+  // as LoopRotation, which do not support loops with multiple exits.
+  // SimplifyCFG also does this (and this code uses the same utility
+  // function), however this code is loop-aware, where SimplifyCFG is
+  // not. That gives it the advantage of being able to hoist
+  // loop-invariant instructions out of the way to open up more
+  // opportunities, and the disadvantage of having the responsibility
+  // to preserve dominator information.
+  bool UniqueExit = true;
+  if (!ExitBlocks.empty())
+    for (unsigned i = 1, e = ExitBlocks.size(); i != e; ++i)
+      if (ExitBlocks[i] != ExitBlocks[0]) {
+        UniqueExit = false;
+        break;
+      }
+  if (UniqueExit) {
+    for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+      BasicBlock *ExitingBlock = ExitingBlocks[i];
+      if (!ExitingBlock->getSinglePredecessor()) continue;
+      BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+      if (!BI || !BI->isConditional()) continue;
+      CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
+      if (!CI || CI->getParent() != ExitingBlock) continue;
+
+      // Attempt to hoist out all instructions except for the
+      // comparison and the branch.
+      bool AllInvariant = true;
+      for (BasicBlock::iterator I = ExitingBlock->begin(); &*I != BI; ) {
+        Instruction *Inst = I++;
+        // Skip debug info intrinsics.
+        if (isa<DbgInfoIntrinsic>(Inst))
+          continue;
+        if (Inst == CI)
+          continue;
+        if (!L->makeLoopInvariant(Inst, Changed,
+                                  Preheader ? Preheader->getTerminator() : 0)) {
+          AllInvariant = false;
+          break;
+        }
+      }
+      if (!AllInvariant) continue;
+
+      // The block has now been cleared of all instructions except for
+      // a comparison and a conditional branch. SimplifyCFG may be able
+      // to fold it now.
+      if (!FoldBranchToCommonDest(BI)) continue;
+
+      // Success. The block is now dead, so remove it from the loop,
+      // update the dominator tree and delete it.
+      DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
+                   << ExitingBlock->getName() << "\n");
+
+      // If any reachable control flow within this loop has changed, notify
+      // ScalarEvolution. Currently assume the parent loop doesn't change
+      // (spliting edges doesn't count). If blocks, CFG edges, or other values
+      // in the parent loop change, then we need call to forgetLoop() for the
+      // parent instead.
+      if (SE)
+        SE->forgetLoop(L);
+
+      assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock));
+      Changed = true;
+      LI->removeBlock(ExitingBlock);
+
+      DomTreeNode *Node = DT->getNode(ExitingBlock);
+      const std::vector<DomTreeNodeBase<BasicBlock> *> &Children =
+        Node->getChildren();
+      while (!Children.empty()) {
+        DomTreeNode *Child = Children.front();
+        DT->changeImmediateDominator(Child, Node->getIDom());
+      }
+      DT->eraseNode(ExitingBlock);
+
+      BI->getSuccessor(0)->removePredecessor(ExitingBlock);
+      BI->getSuccessor(1)->removePredecessor(ExitingBlock);
+      ExitingBlock->eraseFromParent();
+    }
+  }
+
+  return Changed;
+}
+
+/// InsertPreheaderForLoop - Once we discover that a loop doesn't have a
+/// preheader, this method is called to insert one.  This method has two phases:
+/// preheader insertion and analysis updating.
+///
+BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) {
+  BasicBlock *Header = L->getHeader();
+
+  // Compute the set of predecessors of the loop that are not in the loop.
+  SmallVector<BasicBlock*, 8> OutsideBlocks;
+  for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+       PI != PE; ++PI) {
+    BasicBlock *P = *PI;
+    if (!L->contains(P)) {         // Coming in from outside the loop?
+      // If the loop is branched to from an indirect branch, we won't
+      // be able to fully transform the loop, because it prohibits
+      // edge splitting.
+      if (isa<IndirectBrInst>(P->getTerminator())) return 0;
+
+      // Keep track of it.
+      OutsideBlocks.push_back(P);
+    }
+  }
+
+  // Split out the loop pre-header.
+  BasicBlock *PreheaderBB;
+  if (!Header->isLandingPad()) {
+    PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader",
+                                         this);
+  } else {
+    SmallVector<BasicBlock*, 2> NewBBs;
+    SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader",
+                                ".split-lp", this, NewBBs);
+    PreheaderBB = NewBBs[0];
+  }
+
+  PreheaderBB->getTerminator()->setDebugLoc(
+                                      Header->getFirstNonPHI()->getDebugLoc());
+  DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
+               << PreheaderBB->getName() << "\n");
+
+  // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+  // code layout too horribly.
+  PlaceSplitBlockCarefully(PreheaderBB, OutsideBlocks, L);
+
+  return PreheaderBB;
+}
+
+/// RewriteLoopExitBlock - Ensure that the loop preheader dominates all exit
+/// blocks.  This method is used to split exit blocks that have predecessors
+/// outside of the loop.
+BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) {
+  SmallVector<BasicBlock*, 8> LoopBlocks;
+  for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) {
+    BasicBlock *P = *I;
+    if (L->contains(P)) {
+      // Don't do this if the loop is exited via an indirect branch.
+      if (isa<IndirectBrInst>(P->getTerminator())) return 0;
+
+      LoopBlocks.push_back(P);
+    }
+  }
+
+  assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?");
+  BasicBlock *NewExitBB = 0;
+
+  if (Exit->isLandingPad()) {
+    SmallVector<BasicBlock*, 2> NewBBs;
+    SplitLandingPadPredecessors(Exit, ArrayRef<BasicBlock*>(&LoopBlocks[0],
+                                                            LoopBlocks.size()),
+                                ".loopexit", ".nonloopexit",
+                                this, NewBBs);
+    NewExitBB = NewBBs[0];
+  } else {
+    NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", this);
+  }
+
+  DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
+               << NewExitBB->getName() << "\n");
+  return NewExitBB;
+}
+
+/// AddBlockAndPredsToSet - Add the specified block, and all of its
+/// predecessors, to the specified set, if it's not already in there.  Stop
+/// predecessor traversal when we reach StopBlock.
+static void AddBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
+                                  std::set<BasicBlock*> &Blocks) {
+  std::vector<BasicBlock *> WorkList;
+  WorkList.push_back(InputBB);
+  do {
+    BasicBlock *BB = WorkList.back(); WorkList.pop_back();
+    if (Blocks.insert(BB).second && BB != StopBlock)
+      // If BB is not already processed and it is not a stop block then
+      // insert its predecessor in the work list
+      for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+        BasicBlock *WBB = *I;
+        WorkList.push_back(WBB);
+      }
+  } while(!WorkList.empty());
+}
+
+/// FindPHIToPartitionLoops - The first part of loop-nestification is to find a
+/// PHI node that tells us how to partition the loops.
+static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT,
+                                        AliasAnalysis *AA, LoopInfo *LI) {
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
+    PHINode *PN = cast<PHINode>(I);
+    ++I;
+    if (Value *V = SimplifyInstruction(PN, 0, 0, DT)) {
+      // This is a degenerate PHI already, don't modify it!
+      PN->replaceAllUsesWith(V);
+      if (AA) AA->deleteValue(PN);
+      PN->eraseFromParent();
+      continue;
+    }
+
+    // Scan this PHI node looking for a use of the PHI node by itself.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (PN->getIncomingValue(i) == PN &&
+          L->contains(PN->getIncomingBlock(i)))
+        // We found something tasty to remove.
+        return PN;
+  }
+  return 0;
+}
+
+// PlaceSplitBlockCarefully - If the block isn't already, move the new block to
+// right after some 'outside block' block.  This prevents the preheader from
+// being placed inside the loop body, e.g. when the loop hasn't been rotated.
+void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB,
+                                       SmallVectorImpl<BasicBlock*> &SplitPreds,
+                                            Loop *L) {
+  // Check to see if NewBB is already well placed.
+  Function::iterator BBI = NewBB; --BBI;
+  for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+    if (&*BBI == SplitPreds[i])
+      return;
+  }
+
+  // If it isn't already after an outside block, move it after one.  This is
+  // always good as it makes the uncond branch from the outside block into a
+  // fall-through.
+
+  // Figure out *which* outside block to put this after.  Prefer an outside
+  // block that neighbors a BB actually in the loop.
+  BasicBlock *FoundBB = 0;
+  for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+    Function::iterator BBI = SplitPreds[i];
+    if (++BBI != NewBB->getParent()->end() &&
+        L->contains(BBI)) {
+      FoundBB = SplitPreds[i];
+      break;
+    }
+  }
+
+  // If our heuristic for a *good* bb to place this after doesn't find
+  // anything, just pick something.  It's likely better than leaving it within
+  // the loop.
+  if (!FoundBB)
+    FoundBB = SplitPreds[0];
+  NewBB->moveAfter(FoundBB);
+}
+
+
+/// SeparateNestedLoop - If this loop has multiple backedges, try to pull one of
+/// them out into a nested loop.  This is important for code that looks like
+/// this:
+///
+///  Loop:
+///     ...
+///     br cond, Loop, Next
+///     ...
+///     br cond2, Loop, Out
+///
+/// To identify this common case, we look at the PHI nodes in the header of the
+/// loop.  PHI nodes with unchanging values on one backedge correspond to values
+/// that change in the "outer" loop, but not in the "inner" loop.
+///
+/// If we are able to separate out a loop, return the new outer loop that was
+/// created.
+///
+Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM,
+                                       BasicBlock *Preheader) {
+  // Don't try to separate loops without a preheader.
+  if (!Preheader)
+    return 0;
+
+  // The header is not a landing pad; preheader insertion should ensure this.
+  assert(!L->getHeader()->isLandingPad() &&
+         "Can't insert backedge to landing pad");
+
+  PHINode *PN = FindPHIToPartitionLoops(L, DT, AA, LI);
+  if (PN == 0) return 0;  // No known way to partition.
+
+  // Pull out all predecessors that have varying values in the loop.  This
+  // handles the case when a PHI node has multiple instances of itself as
+  // arguments.
+  SmallVector<BasicBlock*, 8> OuterLoopPreds;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    if (PN->getIncomingValue(i) != PN ||
+        !L->contains(PN->getIncomingBlock(i))) {
+      // We can't split indirectbr edges.
+      if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator()))
+        return 0;
+      OuterLoopPreds.push_back(PN->getIncomingBlock(i));
+    }
+  }
+  DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
+
+  // If ScalarEvolution is around and knows anything about values in
+  // this loop, tell it to forget them, because we're about to
+  // substantially change it.
+  if (SE)
+    SE->forgetLoop(L);
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *NewBB =
+    SplitBlockPredecessors(Header, OuterLoopPreds,  ".outer", this);
+
+  // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+  // code layout too horribly.
+  PlaceSplitBlockCarefully(NewBB, OuterLoopPreds, L);
+
+  // Create the new outer loop.
+  Loop *NewOuter = new Loop();
+
+  // Change the parent loop to use the outer loop as its child now.
+  if (Loop *Parent = L->getParentLoop())
+    Parent->replaceChildLoopWith(L, NewOuter);
+  else
+    LI->changeTopLevelLoop(L, NewOuter);
+
+  // L is now a subloop of our outer loop.
+  NewOuter->addChildLoop(L);
+
+  // Add the new loop to the pass manager queue.
+  LPM.insertLoopIntoQueue(NewOuter);
+
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I)
+    NewOuter->addBlockEntry(*I);
+
+  // Now reset the header in L, which had been moved by
+  // SplitBlockPredecessors for the outer loop.
+  L->moveToHeader(Header);
+
+  // Determine which blocks should stay in L and which should be moved out to
+  // the Outer loop now.
+  std::set<BasicBlock*> BlocksInL;
+  for (pred_iterator PI=pred_begin(Header), E = pred_end(Header); PI!=E; ++PI) {
+    BasicBlock *P = *PI;
+    if (DT->dominates(Header, P))
+      AddBlockAndPredsToSet(P, Header, BlocksInL);
+  }
+
+  // Scan all of the loop children of L, moving them to OuterLoop if they are
+  // not part of the inner loop.
+  const std::vector<Loop*> &SubLoops = L->getSubLoops();
+  for (size_t I = 0; I != SubLoops.size(); )
+    if (BlocksInL.count(SubLoops[I]->getHeader()))
+      ++I;   // Loop remains in L
+    else
+      NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I));
+
+  // Now that we know which blocks are in L and which need to be moved to
+  // OuterLoop, move any blocks that need it.
+  for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
+    BasicBlock *BB = L->getBlocks()[i];
+    if (!BlocksInL.count(BB)) {
+      // Move this block to the parent, updating the exit blocks sets
+      L->removeBlockFromLoop(BB);
+      if ((*LI)[BB] == L)
+        LI->changeLoopFor(BB, NewOuter);
+      --i;
+    }
+  }
+
+  return NewOuter;
+}
+
+
+
+/// InsertUniqueBackedgeBlock - This method is called when the specified loop
+/// has more than one backedge in it.  If this occurs, revector all of these
+/// backedges to target a new basic block and have that block branch to the loop
+/// header.  This ensures that loops have exactly one backedge.
+///
+BasicBlock *
+LoopSimplify::InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader) {
+  assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
+
+  // Get information about the loop
+  BasicBlock *Header = L->getHeader();
+  Function *F = Header->getParent();
+
+  // Unique backedge insertion currently depends on having a preheader.
+  if (!Preheader)
+    return 0;
+
+  // The header is not a landing pad; preheader insertion should ensure this.
+  assert(!Header->isLandingPad() && "Can't insert backedge to landing pad");
+
+  // Figure out which basic blocks contain back-edges to the loop header.
+  std::vector<BasicBlock*> BackedgeBlocks;
+  for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){
+    BasicBlock *P = *I;
+
+    // Indirectbr edges cannot be split, so we must fail if we find one.
+    if (isa<IndirectBrInst>(P->getTerminator()))
+      return 0;
+
+    if (P != Preheader) BackedgeBlocks.push_back(P);
+  }
+
+  // Create and insert the new backedge block...
+  BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
+                                           Header->getName()+".backedge", F);
+  BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
+
+  DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
+               << BEBlock->getName() << "\n");
+
+  // Move the new backedge block to right after the last backedge block.
+  Function::iterator InsertPos = BackedgeBlocks.back(); ++InsertPos;
+  F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);
+
+  // Now that the block has been inserted into the function, create PHI nodes in
+  // the backedge block which correspond to any PHI nodes in the header block.
+  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
+                                     PN->getName()+".be", BETerminator);
+    if (AA) AA->copyValue(PN, NewPN);
+
+    // Loop over the PHI node, moving all entries except the one for the
+    // preheader over to the new PHI node.
+    unsigned PreheaderIdx = ~0U;
+    bool HasUniqueIncomingValue = true;
+    Value *UniqueValue = 0;
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      BasicBlock *IBB = PN->getIncomingBlock(i);
+      Value *IV = PN->getIncomingValue(i);
+      if (IBB == Preheader) {
+        PreheaderIdx = i;
+      } else {
+        NewPN->addIncoming(IV, IBB);
+        if (HasUniqueIncomingValue) {
+          if (UniqueValue == 0)
+            UniqueValue = IV;
+          else if (UniqueValue != IV)
+            HasUniqueIncomingValue = false;
+        }
+      }
+    }
+
+    // Delete all of the incoming values from the old PN except the preheader's
+    assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
+    if (PreheaderIdx != 0) {
+      PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
+      PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
+    }
+    // Nuke all entries except the zero'th.
+    for (unsigned i = 0, e = PN->getNumIncomingValues()-1; i != e; ++i)
+      PN->removeIncomingValue(e-i, false);
+
+    // Finally, add the newly constructed PHI node as the entry for the BEBlock.
+    PN->addIncoming(NewPN, BEBlock);
+
+    // As an optimization, if all incoming values in the new PhiNode (which is a
+    // subset of the incoming values of the old PHI node) have the same value,
+    // eliminate the PHI Node.
+    if (HasUniqueIncomingValue) {
+      NewPN->replaceAllUsesWith(UniqueValue);
+      if (AA) AA->deleteValue(NewPN);
+      BEBlock->getInstList().erase(NewPN);
+    }
+  }
+
+  // Now that all of the PHI nodes have been inserted and adjusted, modify the
+  // backedge blocks to just to the BEBlock instead of the header.
+  for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
+    TerminatorInst *TI = BackedgeBlocks[i]->getTerminator();
+    for (unsigned Op = 0, e = TI->getNumSuccessors(); Op != e; ++Op)
+      if (TI->getSuccessor(Op) == Header)
+        TI->setSuccessor(Op, BEBlock);
+  }
+
+  //===--- Update all analyses which we must preserve now -----------------===//
+
+  // Update Loop Information - we know that this block is now in the current
+  // loop and all parent loops.
+  L->addBasicBlockToLoop(BEBlock, LI->getBase());
+
+  // Update dominator information
+  DT->splitBlock(BEBlock);
+
+  return BEBlock;
+}
+
+void LoopSimplify::verifyAnalysis() const {
+  // It used to be possible to just assert L->isLoopSimplifyForm(), however
+  // with the introduction of indirectbr, there are now cases where it's
+  // not possible to transform a loop as necessary. We can at least check
+  // that there is an indirectbr near any time there's trouble.
+
+  // Indirectbr can interfere with preheader and unique backedge insertion.
+  if (!L->getLoopPreheader() || !L->getLoopLatch()) {
+    bool HasIndBrPred = false;
+    for (pred_iterator PI = pred_begin(L->getHeader()),
+         PE = pred_end(L->getHeader()); PI != PE; ++PI)
+      if (isa<IndirectBrInst>((*PI)->getTerminator())) {
+        HasIndBrPred = true;
+        break;
+      }
+    assert(HasIndBrPred &&
+           "LoopSimplify has no excuse for missing loop header info!");
+    (void)HasIndBrPred;
+  }
+
+  // Indirectbr can interfere with exit block canonicalization.
+  if (!L->hasDedicatedExits()) {
+    bool HasIndBrExiting = false;
+    SmallVector<BasicBlock*, 8> ExitingBlocks;
+    L->getExitingBlocks(ExitingBlocks);
+    for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+      if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) {
+        HasIndBrExiting = true;
+        break;
+      }
+    }
+
+    assert(HasIndBrExiting &&
+           "LoopSimplify has no excuse for missing exit block info!");
+    (void)HasIndBrExiting;
+  }
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
new file mode 100644
index 000000000000..cb581b3d13b9
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -0,0 +1,457 @@
+//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities. It does not define any
+// actual pass or policy, but provides a single function to perform loop
+// unrolling.
+//
+// The process of unrolling can produce extraneous basic blocks linked with
+// unconditional branches.  This will be corrected in the future.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unroll"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+using namespace llvm;
+
+// TODO: Should these be here or in LoopUnroll?
+STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
+STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
+
+/// RemapInstruction - Convert the instruction operands from referencing the
+/// current values into those specified by VMap.
+static inline void RemapInstruction(Instruction *I,
+                                    ValueToValueMapTy &VMap) {
+  for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
+    Value *Op = I->getOperand(op);
+    ValueToValueMapTy::iterator It = VMap.find(Op);
+    if (It != VMap.end())
+      I->setOperand(op, It->second);
+  }
+
+  if (PHINode *PN = dyn_cast<PHINode>(I)) {
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i));
+      if (It != VMap.end())
+        PN->setIncomingBlock(i, cast<BasicBlock>(It->second));
+    }
+  }
+}
+
+/// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it
+/// only has one predecessor, and that predecessor only has one successor.
+/// The LoopInfo Analysis that is passed will be kept consistent.
+/// Returns the new combined block.
+static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
+                                            LPPassManager *LPM) {
+  // Merge basic blocks into their predecessor if there is only one distinct
+  // pred, and if there is only one distinct successor of the predecessor, and
+  // if there are no PHI nodes.
+  BasicBlock *OnlyPred = BB->getSinglePredecessor();
+  if (!OnlyPred) return 0;
+
+  if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
+    return 0;
+
+  DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
+
+  // Resolve any PHI nodes at the start of the block.  They are all
+  // guaranteed to have exactly one entry if they exist, unless there are
+  // multiple duplicate (but guaranteed to be equal) entries for the
+  // incoming edges.  This occurs when there are multiple edges from
+  // OnlyPred to OnlySucc.
+  FoldSingleEntryPHINodes(BB);
+
+  // Delete the unconditional branch from the predecessor...
+  OnlyPred->getInstList().pop_back();
+
+  // Make all PHI nodes that referred to BB now refer to Pred as their
+  // source...
+  BB->replaceAllUsesWith(OnlyPred);
+
+  // Move all definitions in the successor to the predecessor...
+  OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
+
+  std::string OldName = BB->getName();
+
+  // Erase basic block from the function...
+
+  // ScalarEvolution holds references to loop exit blocks.
+  if (LPM) {
+    if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) {
+      if (Loop *L = LI->getLoopFor(BB))
+        SE->forgetLoop(L);
+    }
+  }
+  LI->removeBlock(BB);
+  BB->eraseFromParent();
+
+  // Inherit predecessor's name if it exists...
+  if (!OldName.empty() && !OnlyPred->hasName())
+    OnlyPred->setName(OldName);
+
+  return OnlyPred;
+}
+
+/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
+/// if unrolling was successful, or false if the loop was unmodified. Unrolling
+/// can only fail when the loop's latch block is not terminated by a conditional
+/// branch instruction. However, if the trip count (and multiple) are not known,
+/// loop unrolling will mostly produce more code that is no faster.
+///
+/// TripCount is generally defined as the number of times the loop header
+/// executes. UnrollLoop relaxes the definition to permit early exits: here
+/// TripCount is the iteration on which control exits LatchBlock if no early
+/// exits were taken. Note that UnrollLoop assumes that the loop counter test
+/// terminates LatchBlock in order to remove unnecesssary instances of the
+/// test. In other words, control may exit the loop prior to TripCount
+/// iterations via an early branch, but control may not exit the loop from the
+/// LatchBlock's terminator prior to TripCount iterations.
+///
+/// Similarly, TripMultiple divides the number of times that the LatchBlock may
+/// execute without exiting the loop.
+///
+/// The LoopInfo Analysis that is passed will be kept consistent.
+///
+/// If a LoopPassManager is passed in, and the loop is fully removed, it will be
+/// removed from the LoopPassManager as well. LPM can also be NULL.
+///
+/// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are
+/// available it must also preserve those analyses.
+bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
+                      bool AllowRuntime, unsigned TripMultiple,
+                      LoopInfo *LI, LPPassManager *LPM) {
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) {
+    DEBUG(dbgs() << "  Can't unroll; loop preheader-insertion failed.\n");
+    return false;
+  }
+
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  if (!LatchBlock) {
+    DEBUG(dbgs() << "  Can't unroll; loop exit-block-insertion failed.\n");
+    return false;
+  }
+
+  // Loops with indirectbr cannot be cloned.
+  if (!L->isSafeToClone()) {
+    DEBUG(dbgs() << "  Can't unroll; Loop body cannot be cloned.\n");
+    return false;
+  }
+
+  BasicBlock *Header = L->getHeader();
+  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+
+  if (!BI || BI->isUnconditional()) {
+    // The loop-rotate pass can be helpful to avoid this in many cases.
+    DEBUG(dbgs() <<
+             "  Can't unroll; loop not terminated by a conditional branch.\n");
+    return false;
+  }
+
+  if (Header->hasAddressTaken()) {
+    // The loop-rotate pass can be helpful to avoid this in many cases.
+    DEBUG(dbgs() <<
+          "  Won't unroll loop: address of header block is taken.\n");
+    return false;
+  }
+
+  if (TripCount != 0)
+    DEBUG(dbgs() << "  Trip Count = " << TripCount << "\n");
+  if (TripMultiple != 1)
+    DEBUG(dbgs() << "  Trip Multiple = " << TripMultiple << "\n");
+
+  // Effectively "DCE" unrolled iterations that are beyond the tripcount
+  // and will never be executed.
+  if (TripCount != 0 && Count > TripCount)
+    Count = TripCount;
+
+  // Don't enter the unroll code if there is nothing to do. This way we don't
+  // need to support "partial unrolling by 1".
+  if (TripCount == 0 && Count < 2)
+    return false;
+
+  assert(Count > 0);
+  assert(TripMultiple > 0);
+  assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+  // Are we eliminating the loop control altogether?
+  bool CompletelyUnroll = Count == TripCount;
+
+  // We assume a run-time trip count if the compiler cannot
+  // figure out the loop trip count and the unroll-runtime
+  // flag is specified.
+  bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);
+
+  if (RuntimeTripCount && !UnrollRuntimeLoopProlog(L, Count, LI, LPM))
+    return false;
+
+  // Notify ScalarEvolution that the loop will be substantially changed,
+  // if not outright eliminated.
+  if (LPM) {
+    ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
+    if (SE)
+      SE->forgetLoop(L);
+  }
+
+  // If we know the trip count, we know the multiple...
+  unsigned BreakoutTrip = 0;
+  if (TripCount != 0) {
+    BreakoutTrip = TripCount % Count;
+    TripMultiple = 0;
+  } else {
+    // Figure out what multiple to use.
+    BreakoutTrip = TripMultiple =
+      (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
+  }
+
+  if (CompletelyUnroll) {
+    DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
+          << " with trip count " << TripCount << "!\n");
+  } else {
+    DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
+          << " by " << Count);
+    if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
+      DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
+    } else if (TripMultiple != 1) {
+      DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+    } else if (RuntimeTripCount) {
+      DEBUG(dbgs() << " with run-time trip count");
+    }
+    DEBUG(dbgs() << "!\n");
+  }
+
+  std::vector<BasicBlock*> LoopBlocks = L->getBlocks();
+
+  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+
+  // For the first iteration of the loop, we should use the precloned values for
+  // PHI nodes.  Insert associations now.
+  ValueToValueMapTy LastValueMap;
+  std::vector<PHINode*> OrigPHINode;
+  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+    OrigPHINode.push_back(cast<PHINode>(I));
+  }
+
+  std::vector<BasicBlock*> Headers;
+  std::vector<BasicBlock*> Latches;
+  Headers.push_back(Header);
+  Latches.push_back(LatchBlock);
+
+  // The current on-the-fly SSA update requires blocks to be processed in
+  // reverse postorder so that LastValueMap contains the correct value at each
+  // exit.
+  LoopBlocksDFS DFS(L);
+  DFS.perform(LI);
+
+  // Stash the DFS iterators before adding blocks to the loop.
+  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
+  for (unsigned It = 1; It != Count; ++It) {
+    std::vector<BasicBlock*> NewBlocks;
+
+    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+      ValueToValueMapTy VMap;
+      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
+      Header->getParent()->getBasicBlockList().push_back(New);
+
+      // Loop over all of the PHI nodes in the block, changing them to use the
+      // incoming values from the previous block.
+      if (*BB == Header)
+        for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
+          PHINode *NewPHI = cast<PHINode>(VMap[OrigPHINode[i]]);
+          Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+          if (Instruction *InValI = dyn_cast<Instruction>(InVal))
+            if (It > 1 && L->contains(InValI))
+              InVal = LastValueMap[InValI];
+          VMap[OrigPHINode[i]] = InVal;
+          New->getInstList().erase(NewPHI);
+        }
+
+      // Update our running map of newest clones
+      LastValueMap[*BB] = New;
+      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+           VI != VE; ++VI)
+        LastValueMap[VI->first] = VI->second;
+
+      L->addBasicBlockToLoop(New, LI->getBase());
+
+      // Add phi entries for newly created values to all exit blocks.
+      for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB);
+           SI != SE; ++SI) {
+        if (L->contains(*SI))
+          continue;
+        for (BasicBlock::iterator BBI = (*SI)->begin();
+             PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
+          Value *Incoming = phi->getIncomingValueForBlock(*BB);
+          ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
+          if (It != LastValueMap.end())
+            Incoming = It->second;
+          phi->addIncoming(Incoming, New);
+        }
+      }
+      // Keep track of new headers and latches as we create them, so that
+      // we can insert the proper branches later.
+      if (*BB == Header)
+        Headers.push_back(New);
+      if (*BB == LatchBlock)
+        Latches.push_back(New);
+
+      NewBlocks.push_back(New);
+    }
+
+    // Remap all instructions in the most recent iteration
+    for (unsigned i = 0; i < NewBlocks.size(); ++i)
+      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+           E = NewBlocks[i]->end(); I != E; ++I)
+        ::RemapInstruction(I, LastValueMap);
+  }
+
+  // Loop over the PHI nodes in the original block, setting incoming values.
+  for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
+    PHINode *PN = OrigPHINode[i];
+    if (CompletelyUnroll) {
+      PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
+      Header->getInstList().erase(PN);
+    }
+    else if (Count > 1) {
+      Value *InVal = PN->removeIncomingValue(LatchBlock, false);
+      // If this value was defined in the loop, take the value defined by the
+      // last iteration of the loop.
+      if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
+        if (L->contains(InValI))
+          InVal = LastValueMap[InVal];
+      }
+      assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
+      PN->addIncoming(InVal, Latches.back());
+    }
+  }
+
+  // Now that all the basic blocks for the unrolled iterations are in place,
+  // set up the branches to connect them.
+  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+    // The original branch was replicated in each unrolled iteration.
+    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
+
+    // The branch destination.
+    unsigned j = (i + 1) % e;
+    BasicBlock *Dest = Headers[j];
+    bool NeedConditional = true;
+
+    if (RuntimeTripCount && j != 0) {
+      NeedConditional = false;
+    }
+
+    // For a complete unroll, make the last iteration end with a branch
+    // to the exit block.
+    if (CompletelyUnroll && j == 0) {
+      Dest = LoopExit;
+      NeedConditional = false;
+    }
+
+    // If we know the trip count or a multiple of it, we can safely use an
+    // unconditional branch for some iterations.
+    if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
+      NeedConditional = false;
+    }
+
+    if (NeedConditional) {
+      // Update the conditional branch's successor for the following
+      // iteration.
+      Term->setSuccessor(!ContinueOnTrue, Dest);
+    } else {
+      // Remove phi operands at this loop exit
+      if (Dest != LoopExit) {
+        BasicBlock *BB = Latches[i];
+        for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+             SI != SE; ++SI) {
+          if (*SI == Headers[i])
+            continue;
+          for (BasicBlock::iterator BBI = (*SI)->begin();
+               PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) {
+            Phi->removeIncomingValue(BB, false);
+          }
+        }
+      }
+      // Replace the conditional branch with an unconditional one.
+      BranchInst::Create(Dest, Term);
+      Term->eraseFromParent();
+    }
+  }
+
+  // Merge adjacent basic blocks, if possible.
+  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
+    if (Term->isUnconditional()) {
+      BasicBlock *Dest = Term->getSuccessor(0);
+      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM))
+        std::replace(Latches.begin(), Latches.end(), Dest, Fold);
+    }
+  }
+
+  if (LPM) {
+    // FIXME: Reconstruct dom info, because it is not preserved properly.
+    // Incrementally updating domtree after loop unrolling would be easy.
+    if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>())
+      DT->runOnFunction(*L->getHeader()->getParent());
+
+    // Simplify any new induction variables in the partially unrolled loop.
+    ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
+    if (SE && !CompletelyUnroll) {
+      SmallVector<WeakVH, 16> DeadInsts;
+      simplifyLoopIVs(L, SE, LPM, DeadInsts);
+
+      // Aggressively clean up dead instructions that simplifyLoopIVs already
+      // identified. Any remaining should be cleaned up below.
+      while (!DeadInsts.empty())
+        if (Instruction *Inst =
+            dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
+          RecursivelyDeleteTriviallyDeadInstructions(Inst);
+    }
+  }
+  // At this point, the code is well formed.  We now do a quick sweep over the
+  // inserted code, doing constant propagation and dead code elimination as we
+  // go.
+  const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
+  for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
+       BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) {
+      Instruction *Inst = I++;
+
+      if (isInstructionTriviallyDead(Inst))
+        (*BB)->getInstList().erase(Inst);
+      else if (Value *V = SimplifyInstruction(Inst))
+        if (LI->replacementPreservesLCSSAForm(Inst, V)) {
+          Inst->replaceAllUsesWith(V);
+          (*BB)->getInstList().erase(Inst);
+        }
+    }
+
+  NumCompletelyUnrolled += CompletelyUnroll;
+  ++NumUnrolled;
+  // Remove the loop from the LoopPassManager if it's completely removed.
+  if (CompletelyUnroll && LPM != NULL)
+    LPM->deleteLoopFromQueue(L);
+
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
new file mode 100644
index 000000000000..d801d5f2c2a4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -0,0 +1,374 @@
+//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities for loops with run-time
+// trip counts.  See LoopUnroll.cpp for unrolling loops with compile-time
+// trip counts.
+//
+// The functions in this file are used to generate extra code when the
+// run-time trip count modulo the unroll factor is not 0.  When this is the
+// case, we need to generate code to execute these 'left over' iterations.
+//
+// The current strategy generates an if-then-else sequence prior to the
+// unrolled loop to execute the 'left over' iterations.  Other strategies
+// include generate a loop before or after the unrolled loop.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unroll"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <algorithm>
+
+using namespace llvm;
+
+STATISTIC(NumRuntimeUnrolled,
+          "Number of loops unrolled with run-time trip counts");
+
+/// Connect the unrolling prolog code to the original loop.
+/// The unrolling prolog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Create PHI nodes at prolog end block to combine values
+///   that exit the prolog code and jump around the prolog.
+/// - Add a PHI operand to a PHI node at the loop exit block
+///   for values that exit the prolog and go around the loop.
+/// - Branch around the original loop if the trip count is less
+///   than the unroll factor.
+///
+static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count,
+                          BasicBlock *LastPrologBB, BasicBlock *PrologEnd,
+                          BasicBlock *OrigPH, BasicBlock *NewPH,
+                          ValueToValueMapTy &LVMap, Pass *P) {
+  BasicBlock *Latch = L->getLoopLatch();
+  assert(Latch != 0 && "Loop must have a latch");
+
+  // Create a PHI node for each outgoing value from the original loop
+  // (which means it is an outgoing value from the prolog code too).
+  // The new PHI node is inserted in the prolog end basic block.
+  // The new PHI name is added as an operand of a PHI node in either
+  // the loop header or the loop exit block.
+  for (succ_iterator SBI = succ_begin(Latch), SBE = succ_end(Latch);
+       SBI != SBE; ++SBI) {
+    for (BasicBlock::iterator BBI = (*SBI)->begin();
+         PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
+
+      // Add a new PHI node to the prolog end block and add the
+      // appropriate incoming values.
+      PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName()+".unr",
+                                       PrologEnd->getTerminator());
+      // Adding a value to the new PHI node from the original loop preheader.
+      // This is the value that skips all the prolog code.
+      if (L->contains(PN)) {
+        NewPN->addIncoming(PN->getIncomingValueForBlock(NewPH), OrigPH);
+      } else {
+        NewPN->addIncoming(Constant::getNullValue(PN->getType()), OrigPH);
+      }
+
+      Value *V = PN->getIncomingValueForBlock(Latch);
+      if (Instruction *I = dyn_cast<Instruction>(V)) {
+        if (L->contains(I)) {
+          V = LVMap[I];
+        }
+      }
+      // Adding a value to the new PHI node from the last prolog block
+      // that was created.
+      NewPN->addIncoming(V, LastPrologBB);
+
+      // Update the existing PHI node operand with the value from the
+      // new PHI node.  How this is done depends on if the existing
+      // PHI node is in the original loop block, or the exit block.
+      if (L->contains(PN)) {
+        PN->setIncomingValue(PN->getBasicBlockIndex(NewPH), NewPN);
+      } else {
+        PN->addIncoming(NewPN, PrologEnd);
+      }
+    }
+  }
+
+  // Create a branch around the orignal loop, which is taken if the
+  // trip count is less than the unroll factor.
+  Instruction *InsertPt = PrologEnd->getTerminator();
+  Instruction *BrLoopExit =
+    new ICmpInst(InsertPt, ICmpInst::ICMP_ULT, TripCount,
+                 ConstantInt::get(TripCount->getType(), Count));
+  BasicBlock *Exit = L->getUniqueExitBlock();
+  assert(Exit != 0 && "Loop must have a single exit block only");
+  // Split the exit to maintain loop canonicalization guarantees
+  SmallVector<BasicBlock*, 4> Preds(pred_begin(Exit), pred_end(Exit));
+  if (!Exit->isLandingPad()) {
+    SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", P);
+  } else {
+    SmallVector<BasicBlock*, 2> NewBBs;
+    SplitLandingPadPredecessors(Exit, Preds, ".unr1-lcssa", ".unr2-lcssa",
+                                P, NewBBs);
+  }
+  // Add the branch to the exit block (around the unrolled loop)
+  BranchInst::Create(Exit, NewPH, BrLoopExit, InsertPt);
+  InsertPt->eraseFromParent();
+}
+
+/// Create a clone of the blocks in a loop and connect them together.
+/// This function doesn't create a clone of the loop structure.
+///
+/// There are two value maps that are defined and used.  VMap is
+/// for the values in the current loop instance.  LVMap contains
+/// the values from the last loop instance.  We need the LVMap values
+/// to update the initial values for the current loop instance.
+///
+static void CloneLoopBlocks(Loop *L,
+                            bool FirstCopy,
+                            BasicBlock *InsertTop,
+                            BasicBlock *InsertBot,
+                            std::vector<BasicBlock *> &NewBlocks,
+                            LoopBlocksDFS &LoopBlocks,
+                            ValueToValueMapTy &VMap,
+                            ValueToValueMapTy &LVMap,
+                            LoopInfo *LI) {
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  Function *F = Header->getParent();
+  LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
+  LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
+  // For each block in the original loop, create a new copy,
+  // and update the value map with the newly created values.
+  for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+    BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".unr", F);
+    NewBlocks.push_back(NewBB);
+
+    if (Loop *ParentLoop = L->getParentLoop())
+      ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+
+    VMap[*BB] = NewBB;
+    if (Header == *BB) {
+      // For the first block, add a CFG connection to this newly
+      // created block
+      InsertTop->getTerminator()->setSuccessor(0, NewBB);
+
+      // Change the incoming values to the ones defined in the
+      // previously cloned loop.
+      for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+        PHINode *NewPHI = cast<PHINode>(VMap[I]);
+        if (FirstCopy) {
+          // We replace the first phi node with the value from the preheader
+          VMap[I] = NewPHI->getIncomingValueForBlock(Preheader);
+          NewBB->getInstList().erase(NewPHI);
+        } else {
+          // Update VMap with values from the previous block
+          unsigned idx = NewPHI->getBasicBlockIndex(Latch);
+          Value *InVal = NewPHI->getIncomingValue(idx);
+          if (Instruction *I = dyn_cast<Instruction>(InVal))
+            if (L->contains(I))
+              InVal = LVMap[InVal];
+          NewPHI->setIncomingValue(idx, InVal);
+          NewPHI->setIncomingBlock(idx, InsertTop);
+        }
+      }
+    }
+
+    if (Latch == *BB) {
+      VMap.erase((*BB)->getTerminator());
+      NewBB->getTerminator()->eraseFromParent();
+      BranchInst::Create(InsertBot, NewBB);
+    }
+  }
+  // LastValueMap is updated with the values for the current loop
+  // which are used the next time this function is called.
+  for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+       VI != VE; ++VI) {
+    LVMap[VI->first] = VI->second;
+  }
+}
+
+/// Insert code in the prolog code when unrolling a loop with a
+/// run-time trip-count.
+///
+/// This method assumes that the loop unroll factor is total number
+/// of loop bodes in the loop after unrolling. (Some folks refer
+/// to the unroll factor as the number of *extra* copies added).
+/// We assume also that the loop unroll factor is a power-of-two. So, after
+/// unrolling the loop, the number of loop bodies executed is 2,
+/// 4, 8, etc.  Note - LLVM converts the if-then-sequence to a switch
+/// instruction in SimplifyCFG.cpp.  Then, the backend decides how code for
+/// the switch instruction is generated.
+///
+///    extraiters = tripcount % loopfactor
+///    if (extraiters == 0) jump Loop:
+///    if (extraiters == loopfactor) jump L1
+///    if (extraiters == loopfactor-1) jump L2
+///    ...
+///    L1:  LoopBody;
+///    L2:  LoopBody;
+///    ...
+///    if tripcount < loopfactor jump End
+///    Loop:
+///    ...
+///    End:
+///
+bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI,
+                                   LPPassManager *LPM) {
+  // for now, only unroll loops that contain a single exit
+  if (!L->getExitingBlock())
+    return false;
+
+  // Make sure the loop is in canonical form, and there is a single
+  // exit block only.
+  if (!L->isLoopSimplifyForm() || L->getUniqueExitBlock() == 0)
+    return false;
+
+  // Use Scalar Evolution to compute the trip count.  This allows more
+  // loops to be unrolled than relying on induction var simplification
+  if (!LPM)
+    return false;
+  ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
+  if (SE == 0)
+    return false;
+
+  // Only unroll loops with a computable trip count and the trip count needs
+  // to be an int value (allowing a pointer type is a TODO item)
+  const SCEV *BECount = SE->getBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(BECount) || !BECount->getType()->isIntegerTy())
+    return false;
+
+  // Add 1 since the backedge count doesn't include the first loop iteration
+  const SCEV *TripCountSC =
+    SE->getAddExpr(BECount, SE->getConstant(BECount->getType(), 1));
+  if (isa<SCEVCouldNotCompute>(TripCountSC))
+    return false;
+
+  // We only handle cases when the unroll factor is a power of 2.
+  // Count is the loop unroll factor, the number of extra copies added + 1.
+  if ((Count & (Count-1)) != 0)
+    return false;
+
+  // If this loop is nested, then the loop unroller changes the code in
+  // parent loop, so the Scalar Evolution pass needs to be run again
+  if (Loop *ParentLoop = L->getParentLoop())
+    SE->forgetLoop(ParentLoop);
+
+  BasicBlock *PH = L->getLoopPreheader();
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  // It helps to splits the original preheader twice, one for the end of the
+  // prolog code and one for a new loop preheader
+  BasicBlock *PEnd = SplitEdge(PH, Header, LPM->getAsPass());
+  BasicBlock *NewPH = SplitBlock(PEnd, PEnd->getTerminator(), LPM->getAsPass());
+  BranchInst *PreHeaderBR = cast<BranchInst>(PH->getTerminator());
+
+  // Compute the number of extra iterations required, which is:
+  //  extra iterations = run-time trip count % (loop unroll factor + 1)
+  SCEVExpander Expander(*SE, "loop-unroll");
+  Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
+                                            PreHeaderBR);
+  Type *CountTy = TripCount->getType();
+  BinaryOperator *ModVal =
+    BinaryOperator::CreateURem(TripCount,
+                               ConstantInt::get(CountTy, Count),
+                               "xtraiter");
+  ModVal->insertBefore(PreHeaderBR);
+
+  // Check if for no extra iterations, then jump to unrolled loop
+  Value *BranchVal = new ICmpInst(PreHeaderBR,
+                                  ICmpInst::ICMP_NE, ModVal,
+                                  ConstantInt::get(CountTy, 0), "lcmp");
+  // Branch to either the extra iterations or the unrolled loop
+  // We will fix up the true branch label when adding loop body copies
+  BranchInst::Create(PEnd, PEnd, BranchVal, PreHeaderBR);
+  assert(PreHeaderBR->isUnconditional() &&
+         PreHeaderBR->getSuccessor(0) == PEnd &&
+         "CFG edges in Preheader are not correct");
+  PreHeaderBR->eraseFromParent();
+
+  ValueToValueMapTy LVMap;
+  Function *F = Header->getParent();
+  // These variables are used to update the CFG links in each iteration
+  BasicBlock *CompareBB = 0;
+  BasicBlock *LastLoopBB = PH;
+  // Get an ordered list of blocks in the loop to help with the ordering of the
+  // cloned blocks in the prolog code
+  LoopBlocksDFS LoopBlocks(L);
+  LoopBlocks.perform(LI);
+
+  //
+  // For each extra loop iteration, create a copy of the loop's basic blocks
+  // and generate a condition that branches to the copy depending on the
+  // number of 'left over' iterations.
+  //
+  for (unsigned leftOverIters = Count-1; leftOverIters > 0; --leftOverIters) {
+    std::vector<BasicBlock*> NewBlocks;
+    ValueToValueMapTy VMap;
+
+    // Clone all the basic blocks in the loop, but we don't clone the loop
+    // This function adds the appropriate CFG connections.
+    CloneLoopBlocks(L, (leftOverIters == Count-1), LastLoopBB, PEnd, NewBlocks,
+                    LoopBlocks, VMap, LVMap, LI);
+    LastLoopBB = cast<BasicBlock>(VMap[Latch]);
+
+    // Insert the cloned blocks into function just before the original loop
+    F->getBasicBlockList().splice(PEnd, F->getBasicBlockList(),
+                                  NewBlocks[0], F->end());
+
+    // Generate the code for the comparison which determines if the loop
+    // prolog code needs to be executed.
+    if (leftOverIters == Count-1) {
+      // There is no compare block for the fall-thru case when for the last
+      // left over iteration
+      CompareBB = NewBlocks[0];
+    } else {
+      // Create a new block for the comparison
+      BasicBlock *NewBB = BasicBlock::Create(CompareBB->getContext(), "unr.cmp",
+                                             F, CompareBB);
+      if (Loop *ParentLoop = L->getParentLoop()) {
+        // Add the new block to the parent loop, if needed
+        ParentLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+      }
+
+      // The comparison w/ the extra iteration value and branch
+      Value *BranchVal = new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, ModVal,
+                                      ConstantInt::get(CountTy, leftOverIters),
+                                      "un.tmp");
+      // Branch to either the extra iterations or the unrolled loop
+      BranchInst::Create(NewBlocks[0], CompareBB,
+                         BranchVal, NewBB);
+      CompareBB = NewBB;
+      PH->getTerminator()->setSuccessor(0, NewBB);
+      VMap[NewPH] = CompareBB;
+    }
+
+    // Rewrite the cloned instruction operands to use the values
+    // created when the clone is created.
+    for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
+      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+             E = NewBlocks[i]->end(); I != E; ++I) {
+        RemapInstruction(I, VMap,
+                         RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
+      }
+    }
+  }
+
+  // Connect the prolog code to the original loop and update the
+  // PHI functions.
+  ConnectProlog(L, TripCount, Count, LastLoopBB, PEnd, PH, NewPH, LVMap,
+                LPM->getAsPass());
+  NumRuntimeUnrolled++;
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
new file mode 100644
index 000000000000..4aee8ff51a4e
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
@@ -0,0 +1,174 @@
+//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the 'expect' intrinsic to LLVM metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lower-expect-intrinsic"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include <vector>
+
+using namespace llvm;
+
+STATISTIC(IfHandled, "Number of 'expect' intrinsic intructions handled");
+
+static cl::opt<uint32_t>
+LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
+                   cl::desc("Weight of the branch likely to be taken (default = 64)"));
+static cl::opt<uint32_t>
+UnlikelyBranchWeight("unlikely-branch-weight", cl::Hidden, cl::init(4),
+                   cl::desc("Weight of the branch unlikely to be taken (default = 4)"));
+
+namespace {
+
+  class LowerExpectIntrinsic : public FunctionPass {
+
+    bool HandleSwitchExpect(SwitchInst *SI);
+
+    bool HandleIfExpect(BranchInst *BI);
+
+  public:
+    static char ID;
+    LowerExpectIntrinsic() : FunctionPass(ID) {
+      initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnFunction(Function &F);
+  };
+}
+
+
+bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) {
+  CallInst *CI = dyn_cast<CallInst>(SI->getCondition());
+  if (!CI)
+    return false;
+
+  Function *Fn = CI->getCalledFunction();
+  if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+    return false;
+
+  Value *ArgValue = CI->getArgOperand(0);
+  ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+  if (!ExpectedValue)
+    return false;
+
+  SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue);
+  unsigned n = SI->getNumCases(); // +1 for default case.
+  std::vector<uint32_t> Weights(n + 1);
+
+  Weights[0] = Case == SI->case_default() ? LikelyBranchWeight
+                                          : UnlikelyBranchWeight;
+  for (unsigned i = 0; i != n; ++i)
+    Weights[i + 1] = i == Case.getCaseIndex() ? LikelyBranchWeight
+                                              : UnlikelyBranchWeight;
+
+  SI->setMetadata(LLVMContext::MD_prof,
+                  MDBuilder(CI->getContext()).createBranchWeights(Weights));
+
+  SI->setCondition(ArgValue);
+  return true;
+}
+
+
+bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) {
+  if (BI->isUnconditional())
+    return false;
+
+  // Handle non-optimized IR code like:
+  //   %expval = call i64 @llvm.expect.i64.i64(i64 %conv1, i64 1)
+  //   %tobool = icmp ne i64 %expval, 0
+  //   br i1 %tobool, label %if.then, label %if.end
+
+  ICmpInst *CmpI = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!CmpI || CmpI->getPredicate() != CmpInst::ICMP_NE)
+    return false;
+
+  CallInst *CI = dyn_cast<CallInst>(CmpI->getOperand(0));
+  if (!CI)
+    return false;
+
+  Function *Fn = CI->getCalledFunction();
+  if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+    return false;
+
+  Value *ArgValue = CI->getArgOperand(0);
+  ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+  if (!ExpectedValue)
+    return false;
+
+  MDBuilder MDB(CI->getContext());
+  MDNode *Node;
+
+  // If expect value is equal to 1 it means that we are more likely to take
+  // branch 0, in other case more likely is branch 1.
+  if (ExpectedValue->isOne())
+    Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
+  else
+    Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
+
+  BI->setMetadata(LLVMContext::MD_prof, Node);
+
+  CmpI->setOperand(0, ArgValue);
+  return true;
+}
+
+
+bool LowerExpectIntrinsic::runOnFunction(Function &F) {
+  for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
+    BasicBlock *BB = I++;
+
+    // Create "block_weights" metadata.
+    if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+      if (HandleIfExpect(BI))
+        IfHandled++;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+      if (HandleSwitchExpect(SI))
+        IfHandled++;
+    }
+
+    // remove llvm.expect intrinsics.
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+         BI != BE; ) {
+      CallInst *CI = dyn_cast<CallInst>(BI++);
+      if (!CI)
+        continue;
+
+      Function *Fn = CI->getCalledFunction();
+      if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+        Value *Exp = CI->getArgOperand(0);
+        CI->replaceAllUsesWith(Exp);
+        CI->eraseFromParent();
+      }
+    }
+  }
+
+  return false;
+}
+
+
+char LowerExpectIntrinsic::ID = 0;
+INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect", "Lower 'expect' "
+                "Intrinsics", false, false)
+
+FunctionPass *llvm::createLowerExpectIntrinsicPass() {
+  return new LowerExpectIntrinsic();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
new file mode 100644
index 000000000000..9ec84d730e46
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -0,0 +1,582 @@
+//===- LowerInvoke.cpp - Eliminate Invoke & Unwind instructions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which do not yet
+// support stack unwinding.  This pass supports two models of exception handling
+// lowering, the 'cheap' support and the 'expensive' support.
+//
+// 'Cheap' exception handling support gives the program the ability to execute
+// any program which does not "throw an exception", by turning 'invoke'
+// instructions into calls and by turning 'unwind' instructions into calls to
+// abort().  If the program does dynamically use the unwind instruction, the
+// program will print a message then abort.
+//
+// 'Expensive' exception handling support gives the full exception handling
+// support to the program at the cost of making the 'invoke' instruction
+// really expensive.  It basically inserts setjmp/longjmp calls to emulate the
+// exception handling as necessary.
+//
+// Because the 'expensive' support slows down programs a lot, and EH is only
+// used for a subset of the programs, it must be specifically enabled by an
+// option.
+//
+// Note that after this pass runs the CFG is not entirely accurate (exceptional
+// control flow edges are not correct anymore) so only very simple things should
+// be done after the lowerinvoke pass has run (like generation of native code).
+// This should not be used as a general purpose "my LLVM-to-LLVM pass doesn't
+// support the invoke instruction yet" lowering pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowerinvoke"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <csetjmp>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+STATISTIC(NumSpilled, "Number of registers live across unwind edges");
+
+static cl::opt<bool> ExpensiveEHSupport("enable-correct-eh-support",
+ cl::desc("Make the -lowerinvoke pass insert expensive, but correct, EH code"));
+
+namespace {
+  class LowerInvoke : public FunctionPass {
+    // Used for both models.
+    Constant *AbortFn;
+
+    // Used for expensive EH support.
+    StructType *JBLinkTy;
+    GlobalVariable *JBListHead;
+    Constant *SetJmpFn, *LongJmpFn, *StackSaveFn, *StackRestoreFn;
+    bool useExpensiveEHSupport;
+
+    // We peek in TLI to grab the target's jmp_buf size and alignment
+    const TargetLowering *TLI;
+
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    explicit LowerInvoke(const TargetLowering *tli = NULL,
+                         bool useExpensiveEHSupport = ExpensiveEHSupport)
+      : FunctionPass(ID), useExpensiveEHSupport(useExpensiveEHSupport),
+        TLI(tli) {
+      initializeLowerInvokePass(*PassRegistry::getPassRegistry());
+    }
+    bool doInitialization(Module &M);
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // This is a cluster of orthogonal Transforms
+      AU.addPreserved("mem2reg");
+      AU.addPreservedID(LowerSwitchID);
+    }
+
+  private:
+    bool insertCheapEHSupport(Function &F);
+    void splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*>&Invokes);
+    void rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo,
+                                AllocaInst *InvokeNum, AllocaInst *StackPtr,
+                                SwitchInst *CatchSwitch);
+    bool insertExpensiveEHSupport(Function &F);
+  };
+}
+
+char LowerInvoke::ID = 0;
+INITIALIZE_PASS(LowerInvoke, "lowerinvoke",
+                "Lower invoke and unwind, for unwindless code generators",
+                false, false)
+
+char &llvm::LowerInvokePassID = LowerInvoke::ID;
+
+// Public Interface To the LowerInvoke pass.
+FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) {
+  return new LowerInvoke(TLI, ExpensiveEHSupport);
+}
+FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI,
+                                          bool useExpensiveEHSupport) {
+  return new LowerInvoke(TLI, useExpensiveEHSupport);
+}
+
+// doInitialization - Make sure that there is a prototype for abort in the
+// current module.
+bool LowerInvoke::doInitialization(Module &M) {
+  Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
+  if (useExpensiveEHSupport) {
+    // Insert a type for the linked list of jump buffers.
+    unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0;
+    JBSize = JBSize ? JBSize : 200;
+    Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize);
+
+    JBLinkTy = StructType::create(M.getContext(), "llvm.sjljeh.jmpbufty");
+    Type *Elts[] = { JmpBufTy, PointerType::getUnqual(JBLinkTy) };
+    JBLinkTy->setBody(Elts);
+
+    Type *PtrJBList = PointerType::getUnqual(JBLinkTy);
+
+    // Now that we've done that, insert the jmpbuf list head global, unless it
+    // already exists.
+    if (!(JBListHead = M.getGlobalVariable("llvm.sjljeh.jblist", PtrJBList))) {
+      JBListHead = new GlobalVariable(M, PtrJBList, false,
+                                      GlobalValue::LinkOnceAnyLinkage,
+                                      Constant::getNullValue(PtrJBList),
+                                      "llvm.sjljeh.jblist");
+    }
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                         !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+    SetJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::setjmp);
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+   // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+    LongJmpFn = Intrinsic::getDeclaration(&M, Intrinsic::longjmp);
+    StackSaveFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
+    StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
+  }
+
+  // We need the 'write' and 'abort' functions for both models.
+  AbortFn = M.getOrInsertFunction("abort", Type::getVoidTy(M.getContext()),
+                                  (Type *)0);
+  return true;
+}
+
+bool LowerInvoke::insertCheapEHSupport(Function &F) {
+  bool Changed = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3);
+      // Insert a normal call instruction...
+      CallInst *NewCall = CallInst::Create(II->getCalledValue(),
+                                           CallArgs, "", II);
+      NewCall->takeName(II);
+      NewCall->setCallingConv(II->getCallingConv());
+      NewCall->setAttributes(II->getAttributes());
+      NewCall->setDebugLoc(II->getDebugLoc());
+      II->replaceAllUsesWith(NewCall);
+
+      // Insert an unconditional branch to the normal destination.
+      BranchInst::Create(II->getNormalDest(), II);
+
+      // Remove any PHI node entries from the exception destination.
+      II->getUnwindDest()->removePredecessor(BB);
+
+      // Remove the invoke instruction now.
+      BB->getInstList().erase(II);
+
+      ++NumInvokes; Changed = true;
+    }
+  return Changed;
+}
+
+/// rewriteExpensiveInvoke - Insert code and hack the function to replace the
+/// specified invoke instruction with a call.
+void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo,
+                                         AllocaInst *InvokeNum,
+                                         AllocaInst *StackPtr,
+                                         SwitchInst *CatchSwitch) {
+  ConstantInt *InvokeNoC = ConstantInt::get(Type::getInt32Ty(II->getContext()),
+                                            InvokeNo);
+
+  // If the unwind edge has phi nodes, split the edge.
+  if (isa<PHINode>(II->getUnwindDest()->begin())) {
+    SplitCriticalEdge(II, 1, this);
+
+    // If there are any phi nodes left, they must have a single predecessor.
+    while (PHINode *PN = dyn_cast<PHINode>(II->getUnwindDest()->begin())) {
+      PN->replaceAllUsesWith(PN->getIncomingValue(0));
+      PN->eraseFromParent();
+    }
+  }
+
+  // Insert a store of the invoke num before the invoke and store zero into the
+  // location afterward.
+  new StoreInst(InvokeNoC, InvokeNum, true, II);  // volatile
+  
+  // Insert a store of the stack ptr before the invoke, so we can restore it
+  // later in the exception case.
+  CallInst* StackSaveRet = CallInst::Create(StackSaveFn, "ssret", II);
+  new StoreInst(StackSaveRet, StackPtr, true, II); // volatile
+
+  BasicBlock::iterator NI = II->getNormalDest()->getFirstInsertionPt();
+  // nonvolatile.
+  new StoreInst(Constant::getNullValue(Type::getInt32Ty(II->getContext())), 
+                InvokeNum, false, NI);
+
+  Instruction* StackPtrLoad =
+    new LoadInst(StackPtr, "stackptr.restore", true,
+                 II->getUnwindDest()->getFirstInsertionPt());
+  CallInst::Create(StackRestoreFn, StackPtrLoad, "")->insertAfter(StackPtrLoad);
+    
+  // Add a switch case to our unwind block.
+  CatchSwitch->addCase(InvokeNoC, II->getUnwindDest());
+
+  // Insert a normal call instruction.
+  SmallVector<Value*,16> CallArgs(II->op_begin(), II->op_end() - 3);
+  CallInst *NewCall = CallInst::Create(II->getCalledValue(),
+                                       CallArgs, "", II);
+  NewCall->takeName(II);
+  NewCall->setCallingConv(II->getCallingConv());
+  NewCall->setAttributes(II->getAttributes());
+  NewCall->setDebugLoc(II->getDebugLoc());
+  II->replaceAllUsesWith(NewCall);
+
+  // Replace the invoke with an uncond branch.
+  BranchInst::Create(II->getNormalDest(), NewCall->getParent());
+  II->eraseFromParent();
+}
+
+/// MarkBlocksLiveIn - Insert BB and all of its predescessors into LiveBBs until
+/// we reach blocks we've already seen.
+static void MarkBlocksLiveIn(BasicBlock *BB, std::set<BasicBlock*> &LiveBBs) {
+  if (!LiveBBs.insert(BB).second) return; // already been here.
+
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+    MarkBlocksLiveIn(*PI, LiveBBs);
+}
+
+// First thing we need to do is scan the whole function for values that are
+// live across unwind edges.  Each value that is live across an unwind edge
+// we spill into a stack location, guaranteeing that there is nothing live
+// across the unwind edge.  This process also splits all critical edges
+// coming out of invoke's.
+void LowerInvoke::
+splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) {
+  // First step, split all critical edges from invoke instructions.
+  for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+    InvokeInst *II = Invokes[i];
+    SplitCriticalEdge(II, 0, this);
+    SplitCriticalEdge(II, 1, this);
+    assert(!isa<PHINode>(II->getNormalDest()) &&
+           !isa<PHINode>(II->getUnwindDest()) &&
+           "critical edge splitting left single entry phi nodes?");
+  }
+
+  Function *F = Invokes.back()->getParent()->getParent();
+
+  // To avoid having to handle incoming arguments specially, we lower each arg
+  // to a copy instruction in the entry block.  This ensures that the argument
+  // value itself cannot be live across the entry block.
+  BasicBlock::iterator AfterAllocaInsertPt = F->begin()->begin();
+  while (isa<AllocaInst>(AfterAllocaInsertPt) &&
+        isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsertPt)->getArraySize()))
+    ++AfterAllocaInsertPt;
+  for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+       AI != E; ++AI) {
+    Type *Ty = AI->getType();
+    // Aggregate types can't be cast, but are legal argument types, so we have
+    // to handle them differently. We use an extract/insert pair as a
+    // lightweight method to achieve the same goal.
+    if (isa<StructType>(Ty) || isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
+      Instruction *EI = ExtractValueInst::Create(AI, 0, "",AfterAllocaInsertPt);
+      Instruction *NI = InsertValueInst::Create(AI, EI, 0);
+      NI->insertAfter(EI);
+      AI->replaceAllUsesWith(NI);
+      // Set the operand of the instructions back to the AllocaInst.
+      EI->setOperand(0, AI);
+      NI->setOperand(0, AI);
+    } else {
+      // This is always a no-op cast because we're casting AI to AI->getType()
+      // so src and destination types are identical. BitCast is the only
+      // possibility.
+      CastInst *NC = new BitCastInst(
+        AI, AI->getType(), AI->getName()+".tmp", AfterAllocaInsertPt);
+      AI->replaceAllUsesWith(NC);
+      // Set the operand of the cast instruction back to the AllocaInst.
+      // Normally it's forbidden to replace a CastInst's operand because it
+      // could cause the opcode to reflect an illegal conversion. However,
+      // we're replacing it here with the same value it was constructed with.
+      // We do this because the above replaceAllUsesWith() clobbered the
+      // operand, but we want this one to remain.
+      NC->setOperand(0, AI);
+    }
+  }
+
+  // Finally, scan the code looking for instructions with bad live ranges.
+  for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+      // Ignore obvious cases we don't have to handle.  In particular, most
+      // instructions either have no uses or only have a single use inside the
+      // current block.  Ignore them quickly.
+      Instruction *Inst = II;
+      if (Inst->use_empty()) continue;
+      if (Inst->hasOneUse() &&
+          cast<Instruction>(Inst->use_back())->getParent() == BB &&
+          !isa<PHINode>(Inst->use_back())) continue;
+
+      // If this is an alloca in the entry block, it's not a real register
+      // value.
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+        if (isa<ConstantInt>(AI->getArraySize()) && BB == F->begin())
+          continue;
+
+      // Avoid iterator invalidation by copying users to a temporary vector.
+      SmallVector<Instruction*,16> Users;
+      for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end();
+           UI != E; ++UI) {
+        Instruction *User = cast<Instruction>(*UI);
+        if (User->getParent() != BB || isa<PHINode>(User))
+          Users.push_back(User);
+      }
+
+      // Scan all of the uses and see if the live range is live across an unwind
+      // edge.  If we find a use live across an invoke edge, create an alloca
+      // and spill the value.
+      std::set<InvokeInst*> InvokesWithStoreInserted;
+
+      // Find all of the blocks that this value is live in.
+      std::set<BasicBlock*> LiveBBs;
+      LiveBBs.insert(Inst->getParent());
+      while (!Users.empty()) {
+        Instruction *U = Users.back();
+        Users.pop_back();
+
+        if (!isa<PHINode>(U)) {
+          MarkBlocksLiveIn(U->getParent(), LiveBBs);
+        } else {
+          // Uses for a PHI node occur in their predecessor block.
+          PHINode *PN = cast<PHINode>(U);
+          for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+            if (PN->getIncomingValue(i) == Inst)
+              MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs);
+        }
+      }
+
+      // Now that we know all of the blocks that this thing is live in, see if
+      // it includes any of the unwind locations.
+      bool NeedsSpill = false;
+      for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+        BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
+        if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
+          NeedsSpill = true;
+        }
+      }
+
+      // If we decided we need a spill, do it.
+      if (NeedsSpill) {
+        ++NumSpilled;
+        DemoteRegToStack(*Inst, true);
+      }
+    }
+}
+
+bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
+  SmallVector<ReturnInst*,16> Returns;
+  SmallVector<InvokeInst*,16> Invokes;
+  UnreachableInst* UnreachablePlaceholder = 0;
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+      // Remember all return instructions in case we insert an invoke into this
+      // function.
+      Returns.push_back(RI);
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      Invokes.push_back(II);
+    }
+
+  if (Invokes.empty()) return false;
+
+  NumInvokes += Invokes.size();
+
+  // TODO: This is not an optimal way to do this.  In particular, this always
+  // inserts setjmp calls into the entries of functions with invoke instructions
+  // even though there are possibly paths through the function that do not
+  // execute any invokes.  In particular, for functions with early exits, e.g.
+  // the 'addMove' method in hexxagon, it would be nice to not have to do the
+  // setjmp stuff on the early exit path.  This requires a bit of dataflow, but
+  // would not be too hard to do.
+
+  // If we have an invoke instruction, insert a setjmp that dominates all
+  // invokes.  After the setjmp, use a cond branch that goes to the original
+  // code path on zero, and to a designated 'catch' block of nonzero.
+  Value *OldJmpBufPtr = 0;
+  if (!Invokes.empty()) {
+    // First thing we need to do is scan the whole function for values that are
+    // live across unwind edges.  Each value that is live across an unwind edge
+    // we spill into a stack location, guaranteeing that there is nothing live
+    // across the unwind edge.  This process also splits all critical edges
+    // coming out of invoke's.
+    splitLiveRangesLiveAcrossInvokes(Invokes);
+
+    BasicBlock *EntryBB = F.begin();
+
+    // Create an alloca for the incoming jump buffer ptr and the new jump buffer
+    // that needs to be restored on all exits from the function.  This is an
+    // alloca because the value needs to be live across invokes.
+    unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0;
+    AllocaInst *JmpBuf =
+      new AllocaInst(JBLinkTy, 0, Align,
+                     "jblink", F.begin()->begin());
+
+    Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+                     ConstantInt::get(Type::getInt32Ty(F.getContext()), 1) };
+    OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx, "OldBuf",
+                                             EntryBB->getTerminator());
+
+    // Copy the JBListHead to the alloca.
+    Value *OldBuf = new LoadInst(JBListHead, "oldjmpbufptr", true,
+                                 EntryBB->getTerminator());
+    new StoreInst(OldBuf, OldJmpBufPtr, true, EntryBB->getTerminator());
+
+    // Add the new jumpbuf to the list.
+    new StoreInst(JmpBuf, JBListHead, true, EntryBB->getTerminator());
+
+    // Create the catch block.  The catch block is basically a big switch
+    // statement that goes to all of the invoke catch blocks.
+    BasicBlock *CatchBB =
+            BasicBlock::Create(F.getContext(), "setjmp.catch", &F);
+
+    // Create an alloca which keeps track of the stack pointer before every
+    // invoke, this allows us to properly restore the stack pointer after
+    // long jumping.
+    AllocaInst *StackPtr = new AllocaInst(Type::getInt8PtrTy(F.getContext()), 0,
+                                          "stackptr", EntryBB->begin());
+
+    // Create an alloca which keeps track of which invoke is currently
+    // executing.  For normal calls it contains zero.
+    AllocaInst *InvokeNum = new AllocaInst(Type::getInt32Ty(F.getContext()), 0,
+                                           "invokenum",EntryBB->begin());
+    new StoreInst(ConstantInt::get(Type::getInt32Ty(F.getContext()), 0), 
+                  InvokeNum, true, EntryBB->getTerminator());
+
+    // Insert a load in the Catch block, and a switch on its value.  By default,
+    // we go to a block that just does an unwind (which is the correct action
+    // for a standard call). We insert an unreachable instruction here and
+    // modify the block to jump to the correct unwinding pad later.
+    BasicBlock *UnwindBB = BasicBlock::Create(F.getContext(), "unwindbb", &F);
+    UnreachablePlaceholder = new UnreachableInst(F.getContext(), UnwindBB);
+
+    Value *CatchLoad = new LoadInst(InvokeNum, "invoke.num", true, CatchBB);
+    SwitchInst *CatchSwitch =
+      SwitchInst::Create(CatchLoad, UnwindBB, Invokes.size(), CatchBB);
+
+    // Now that things are set up, insert the setjmp call itself.
+
+    // Split the entry block to insert the conditional branch for the setjmp.
+    BasicBlock *ContBlock = EntryBB->splitBasicBlock(EntryBB->getTerminator(),
+                                                     "setjmp.cont");
+
+    Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 0);
+    Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx, "TheJmpBuf",
+                                                 EntryBB->getTerminator());
+    JmpBufPtr = new BitCastInst(JmpBufPtr,
+                        Type::getInt8PtrTy(F.getContext()),
+                                "tmp", EntryBB->getTerminator());
+    Value *SJRet = CallInst::Create(SetJmpFn, JmpBufPtr, "sjret",
+                                    EntryBB->getTerminator());
+
+    // Compare the return value to zero.
+    Value *IsNormal = new ICmpInst(EntryBB->getTerminator(),
+                                   ICmpInst::ICMP_EQ, SJRet,
+                                   Constant::getNullValue(SJRet->getType()),
+                                   "notunwind");
+    // Nuke the uncond branch.
+    EntryBB->getTerminator()->eraseFromParent();
+
+    // Put in a new condbranch in its place.
+    BranchInst::Create(ContBlock, CatchBB, IsNormal, EntryBB);
+
+    // At this point, we are all set up, rewrite each invoke instruction.
+    for (unsigned i = 0, e = Invokes.size(); i != e; ++i)
+      rewriteExpensiveInvoke(Invokes[i], i+1, InvokeNum, StackPtr, CatchSwitch);
+  }
+
+  // We know that there is at least one unwind.
+
+  // Create three new blocks, the block to load the jmpbuf ptr and compare
+  // against null, the block to do the longjmp, and the error block for if it
+  // is null.  Add them at the end of the function because they are not hot.
+  BasicBlock *UnwindHandler = BasicBlock::Create(F.getContext(),
+                                                "dounwind", &F);
+  BasicBlock *UnwindBlock = BasicBlock::Create(F.getContext(), "unwind", &F);
+  BasicBlock *TermBlock = BasicBlock::Create(F.getContext(), "unwinderror", &F);
+
+  // If this function contains an invoke, restore the old jumpbuf ptr.
+  Value *BufPtr;
+  if (OldJmpBufPtr) {
+    // Before the return, insert a copy from the saved value to the new value.
+    BufPtr = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", UnwindHandler);
+    new StoreInst(BufPtr, JBListHead, UnwindHandler);
+  } else {
+    BufPtr = new LoadInst(JBListHead, "ehlist", UnwindHandler);
+  }
+
+  // Load the JBList, if it's null, then there was no catch!
+  Value *NotNull = new ICmpInst(*UnwindHandler, ICmpInst::ICMP_NE, BufPtr,
+                                Constant::getNullValue(BufPtr->getType()),
+                                "notnull");
+  BranchInst::Create(UnwindBlock, TermBlock, NotNull, UnwindHandler);
+
+  // Create the block to do the longjmp.
+  // Get a pointer to the jmpbuf and longjmp.
+  Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+                   ConstantInt::get(Type::getInt32Ty(F.getContext()), 0) };
+  Idx[0] = GetElementPtrInst::Create(BufPtr, Idx, "JmpBuf", UnwindBlock);
+  Idx[0] = new BitCastInst(Idx[0],
+             Type::getInt8PtrTy(F.getContext()),
+                           "tmp", UnwindBlock);
+  Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 1);
+  CallInst::Create(LongJmpFn, Idx, "", UnwindBlock);
+  new UnreachableInst(F.getContext(), UnwindBlock);
+
+  // Set up the term block ("throw without a catch").
+  new UnreachableInst(F.getContext(), TermBlock);
+
+  // Insert a call to abort()
+  CallInst::Create(AbortFn, "",
+                   TermBlock->getTerminator())->setTailCall();
+
+  // Replace the inserted unreachable with a branch to the unwind handler.
+  if (UnreachablePlaceholder) {
+    BranchInst::Create(UnwindHandler, UnreachablePlaceholder);
+    UnreachablePlaceholder->eraseFromParent();
+  }
+
+  // Finally, for any returns from this function, if this function contains an
+  // invoke, restore the old jmpbuf pointer to its input value.
+  if (OldJmpBufPtr) {
+    for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+      ReturnInst *R = Returns[i];
+
+      // Before the return, insert a copy from the saved value to the new value.
+      Value *OldBuf = new LoadInst(OldJmpBufPtr, "oldjmpbufptr", true, R);
+      new StoreInst(OldBuf, JBListHead, true, R);
+    }
+  }
+
+  return true;
+}
+
+bool LowerInvoke::runOnFunction(Function &F) {
+  if (useExpensiveEHSupport)
+    return insertExpensiveEHSupport(F);
+  else
+    return insertCheapEHSupport(F);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
new file mode 100644
index 000000000000..955b853533b0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -0,0 +1,305 @@
+//===- LowerSwitch.cpp - Eliminate Switch instructions --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerSwitch transformation rewrites switch instructions with a sequence
+// of branches, which allows targets to get away with not implementing the
+// switch instruction until it is convenient.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+  /// LowerSwitch Pass - Replace all SwitchInst instructions with chained branch
+  /// instructions.
+  class LowerSwitch : public FunctionPass {
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    LowerSwitch() : FunctionPass(ID) {
+      initializeLowerSwitchPass(*PassRegistry::getPassRegistry());
+    } 
+
+    virtual bool runOnFunction(Function &F);
+    
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // This is a cluster of orthogonal Transforms
+      AU.addPreserved<UnifyFunctionExitNodes>();
+      AU.addPreserved("mem2reg");
+      AU.addPreservedID(LowerInvokePassID);
+    }
+
+    struct CaseRange {
+      Constant* Low;
+      Constant* High;
+      BasicBlock* BB;
+
+      CaseRange(Constant *low = 0, Constant *high = 0, BasicBlock *bb = 0) :
+        Low(low), High(high), BB(bb) { }
+    };
+
+    typedef std::vector<CaseRange>           CaseVector;
+    typedef std::vector<CaseRange>::iterator CaseItr;
+  private:
+    void processSwitchInst(SwitchInst *SI);
+
+    BasicBlock* switchConvert(CaseItr Begin, CaseItr End, Value* Val,
+                              BasicBlock* OrigBlock, BasicBlock* Default);
+    BasicBlock* newLeafBlock(CaseRange& Leaf, Value* Val,
+                             BasicBlock* OrigBlock, BasicBlock* Default);
+    unsigned Clusterify(CaseVector& Cases, SwitchInst *SI);
+  };
+}
+
+char LowerSwitch::ID = 0;
+INITIALIZE_PASS(LowerSwitch, "lowerswitch",
+                "Lower SwitchInst's to branches", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::LowerSwitchID = LowerSwitch::ID;
+// createLowerSwitchPass - Interface to this file...
+FunctionPass *llvm::createLowerSwitchPass() {
+  return new LowerSwitch();
+}
+
+bool LowerSwitch::runOnFunction(Function &F) {
+  bool Changed = false;
+
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+    BasicBlock *Cur = I++; // Advance over block so we don't traverse new blocks
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(Cur->getTerminator())) {
+      Changed = true;
+      processSwitchInst(SI);
+    }
+  }
+
+  return Changed;
+}
+
+// operator<< - Used for debugging purposes.
+//
+static raw_ostream& operator<<(raw_ostream &O,
+                               const LowerSwitch::CaseVector &C)
+    LLVM_ATTRIBUTE_USED;
+static raw_ostream& operator<<(raw_ostream &O,
+                               const LowerSwitch::CaseVector &C) {
+  O << "[";
+
+  for (LowerSwitch::CaseVector::const_iterator B = C.begin(),
+         E = C.end(); B != E; ) {
+    O << *B->Low << " -" << *B->High;
+    if (++B != E) O << ", ";
+  }
+
+  return O << "]";
+}
+
+// switchConvert - Convert the switch statement into a binary lookup of
+// the case values. The function recursively builds this tree.
+//
+BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
+                                       Value* Val, BasicBlock* OrigBlock,
+                                       BasicBlock* Default)
+{
+  unsigned Size = End - Begin;
+
+  if (Size == 1)
+    return newLeafBlock(*Begin, Val, OrigBlock, Default);
+
+  unsigned Mid = Size / 2;
+  std::vector<CaseRange> LHS(Begin, Begin + Mid);
+  DEBUG(dbgs() << "LHS: " << LHS << "\n");
+  std::vector<CaseRange> RHS(Begin + Mid, End);
+  DEBUG(dbgs() << "RHS: " << RHS << "\n");
+
+  CaseRange& Pivot = *(Begin + Mid);
+  DEBUG(dbgs() << "Pivot ==> " 
+               << cast<ConstantInt>(Pivot.Low)->getValue() << " -"
+               << cast<ConstantInt>(Pivot.High)->getValue() << "\n");
+
+  BasicBlock* LBranch = switchConvert(LHS.begin(), LHS.end(), Val,
+                                      OrigBlock, Default);
+  BasicBlock* RBranch = switchConvert(RHS.begin(), RHS.end(), Val,
+                                      OrigBlock, Default);
+
+  // Create a new node that checks if the value is < pivot. Go to the
+  // left branch if it is and right branch if not.
+  Function* F = OrigBlock->getParent();
+  BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock");
+  Function::iterator FI = OrigBlock;
+  F->getBasicBlockList().insert(++FI, NewNode);
+
+  ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT,
+                                Val, Pivot.Low, "Pivot");
+  NewNode->getInstList().push_back(Comp);
+  BranchInst::Create(LBranch, RBranch, Comp, NewNode);
+  return NewNode;
+}
+
+// newLeafBlock - Create a new leaf block for the binary lookup tree. It
+// checks if the switch's value == the case's value. If not, then it
+// jumps to the default branch. At this point in the tree, the value
+// can't be another valid case value, so the jump to the "default" branch
+// is warranted.
+//
+BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
+                                      BasicBlock* OrigBlock,
+                                      BasicBlock* Default)
+{
+  Function* F = OrigBlock->getParent();
+  BasicBlock* NewLeaf = BasicBlock::Create(Val->getContext(), "LeafBlock");
+  Function::iterator FI = OrigBlock;
+  F->getBasicBlockList().insert(++FI, NewLeaf);
+
+  // Emit comparison
+  ICmpInst* Comp = NULL;
+  if (Leaf.Low == Leaf.High) {
+    // Make the seteq instruction...
+    Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_EQ, Val,
+                        Leaf.Low, "SwitchLeaf");
+  } else {
+    // Make range comparison
+    if (cast<ConstantInt>(Leaf.Low)->isMinValue(true /*isSigned*/)) {
+      // Val >= Min && Val <= Hi --> Val <= Hi
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_SLE, Val, Leaf.High,
+                          "SwitchLeaf");
+    } else if (cast<ConstantInt>(Leaf.Low)->isZero()) {
+      // Val >= 0 && Val <= Hi --> Val <=u Hi
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Val, Leaf.High,
+                          "SwitchLeaf");      
+    } else {
+      // Emit V-Lo <=u Hi-Lo
+      Constant* NegLo = ConstantExpr::getNeg(Leaf.Low);
+      Instruction* Add = BinaryOperator::CreateAdd(Val, NegLo,
+                                                   Val->getName()+".off",
+                                                   NewLeaf);
+      Constant *UpperBound = ConstantExpr::getAdd(NegLo, Leaf.High);
+      Comp = new ICmpInst(*NewLeaf, ICmpInst::ICMP_ULE, Add, UpperBound,
+                          "SwitchLeaf");
+    }
+  }
+
+  // Make the conditional branch...
+  BasicBlock* Succ = Leaf.BB;
+  BranchInst::Create(Succ, Default, Comp, NewLeaf);
+
+  // If there were any PHI nodes in this successor, rewrite one entry
+  // from OrigBlock to come from NewLeaf.
+  for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+    PHINode* PN = cast<PHINode>(I);
+    // Remove all but one incoming entries from the cluster
+    uint64_t Range = cast<ConstantInt>(Leaf.High)->getSExtValue() -
+                     cast<ConstantInt>(Leaf.Low)->getSExtValue();    
+    for (uint64_t j = 0; j < Range; ++j) {
+      PN->removeIncomingValue(OrigBlock);
+    }
+    
+    int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
+    assert(BlockIdx != -1 && "Switch didn't go to this successor??");
+    PN->setIncomingBlock((unsigned)BlockIdx, NewLeaf);
+  }
+
+  return NewLeaf;
+}
+
+// Clusterify - Transform simple list of Cases into list of CaseRange's
+unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
+
+  IntegersSubsetToBB TheClusterifier;
+
+  // Start with "simple" cases
+  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+       i != e; ++i) {
+    BasicBlock *SuccBB = i.getCaseSuccessor();
+    IntegersSubset CaseRanges = i.getCaseValueEx();
+    TheClusterifier.add(CaseRanges, SuccBB);
+  }
+  
+  TheClusterifier.optimize();
+  
+  size_t numCmps = 0;
+  for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(),
+       e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
+    IntegersSubsetToBB::Cluster &C = *i;
+    
+    // FIXME: Currently work with ConstantInt based numbers.
+    // Changing it to APInt based is a pretty heavy for this commit.
+    Cases.push_back(CaseRange(C.first.getLow().toConstantInt(),
+                              C.first.getHigh().toConstantInt(), C.second));
+    if (C.first.isSingleNumber())
+      // A range counts double, since it requires two compares.
+      ++numCmps;
+  }
+
+  return numCmps;  
+}
+
+// processSwitchInst - Replace the specified switch instruction with a sequence
+// of chained if-then insts in a balanced binary search.
+//
+void LowerSwitch::processSwitchInst(SwitchInst *SI) {
+  BasicBlock *CurBlock = SI->getParent();
+  BasicBlock *OrigBlock = CurBlock;
+  Function *F = CurBlock->getParent();
+  Value *Val = SI->getCondition();  // The value we are switching on...
+  BasicBlock* Default = SI->getDefaultDest();
+
+  // If there is only the default destination, don't bother with the code below.
+  if (!SI->getNumCases()) {
+    BranchInst::Create(SI->getDefaultDest(), CurBlock);
+    CurBlock->getInstList().erase(SI);
+    return;
+  }
+
+  // Create a new, empty default block so that the new hierarchy of
+  // if-then statements go to this and the PHI nodes are happy.
+  BasicBlock* NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
+  F->getBasicBlockList().insert(Default, NewDefault);
+
+  BranchInst::Create(Default, NewDefault);
+
+  // If there is an entry in any PHI nodes for the default edge, make sure
+  // to update them as well.
+  for (BasicBlock::iterator I = Default->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    int BlockIdx = PN->getBasicBlockIndex(OrigBlock);
+    assert(BlockIdx != -1 && "Switch didn't go to this successor??");
+    PN->setIncomingBlock((unsigned)BlockIdx, NewDefault);
+  }
+
+  // Prepare cases vector.
+  CaseVector Cases;
+  unsigned numCmps = Clusterify(Cases, SI);
+
+  DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+               << ". Total compares: " << numCmps << "\n");
+  DEBUG(dbgs() << "Cases: " << Cases << "\n");
+  (void)numCmps;
+  
+  BasicBlock* SwitchBlock = switchConvert(Cases.begin(), Cases.end(), Val,
+                                          OrigBlock, NewDefault);
+
+  // Branch to our shiny new if-then stuff...
+  BranchInst::Create(SwitchBlock, OrigBlock);
+
+  // We are now done with the switch instruction, delete it.
+  CurBlock->getInstList().erase(SI);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
new file mode 100644
index 000000000000..61b3965d8f11
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
@@ -0,0 +1,90 @@
+//===- Mem2Reg.cpp - The -mem2reg pass, a wrapper around the Utils lib ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is a simple pass wrapper around the PromoteMemToReg function call
+// exposed by the Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mem2reg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+using namespace llvm;
+
+STATISTIC(NumPromoted, "Number of alloca's promoted");
+
+namespace {
+  struct PromotePass : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    PromotePass() : FunctionPass(ID) {
+      initializePromotePassPass(*PassRegistry::getPassRegistry());
+    }
+
+    // runOnFunction - To run this pass, first we calculate the alloca
+    // instructions that are safe for promotion, then we promote each one.
+    //
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.setPreservesCFG();
+      // This is a cluster of orthogonal Transforms
+      AU.addPreserved<UnifyFunctionExitNodes>();
+      AU.addPreservedID(LowerSwitchID);
+      AU.addPreservedID(LowerInvokePassID);
+    }
+  };
+}  // end of anonymous namespace
+
+char PromotePass::ID = 0;
+INITIALIZE_PASS_BEGIN(PromotePass, "mem2reg", "Promote Memory to Register",
+                false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(PromotePass, "mem2reg", "Promote Memory to Register",
+                false, false)
+
+bool PromotePass::runOnFunction(Function &F) {
+  std::vector<AllocaInst*> Allocas;
+
+  BasicBlock &BB = F.getEntryBlock();  // Get the entry node for the function
+
+  bool Changed  = false;
+
+  DominatorTree &DT = getAnalysis<DominatorTree>();
+
+  while (1) {
+    Allocas.clear();
+
+    // Find allocas that are safe to promote, by looking at all instructions in
+    // the entry node
+    for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))       // Is it an alloca?
+        if (isAllocaPromotable(AI))
+          Allocas.push_back(AI);
+
+    if (Allocas.empty()) break;
+
+    PromoteMemToReg(Allocas, DT);
+    NumPromoted += Allocas.size();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+// createPromoteMemoryToRegister - Provide an entry point to create this pass.
+//
+FunctionPass *llvm::createPromoteMemoryToRegisterPass() {
+  return new PromotePass();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
new file mode 100644
index 000000000000..3716f586ff06
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
@@ -0,0 +1,145 @@
+//===- MetaRenamer.cpp - Rename everything with metasyntatic names --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass renames everything with metasyntatic names. The intent is to use
+// this pass after bugpoint reduction to conceal the nature of the original
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+namespace {
+
+  // This PRNG is from the ISO C spec. It is intentionally simple and
+  // unsuitable for cryptographic use. We're just looking for enough
+  // variety to surprise and delight users.
+  struct PRNG {
+    unsigned long next;
+
+    void srand(unsigned int seed) {
+      next = seed;
+    }
+
+    int rand() {
+      next = next * 1103515245 + 12345;
+      return (unsigned int)(next / 65536) % 32768;
+    }
+  };
+
+  struct MetaRenamer : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    MetaRenamer() : ModulePass(ID) {
+      initializeMetaRenamerPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+
+    bool runOnModule(Module &M) {
+      static const char *metaNames[] = {
+        // See http://en.wikipedia.org/wiki/Metasyntactic_variable
+        "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
+        "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"
+      };
+
+      // Seed our PRNG with simple additive sum of ModuleID. We're looking to
+      // simply avoid always having the same function names, and we need to
+      // remain deterministic.
+      unsigned int randSeed = 0;
+      for (std::string::const_iterator I = M.getModuleIdentifier().begin(),
+           E = M.getModuleIdentifier().end(); I != E; ++I)
+        randSeed += *I;
+
+      PRNG prng;
+      prng.srand(randSeed);
+
+      // Rename all aliases
+      for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end();
+           AI != AE; ++AI) {
+        StringRef Name = AI->getName();
+        if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+          continue;
+
+        AI->setName("alias");
+      }
+      
+      // Rename all global variables
+      for (Module::global_iterator GI = M.global_begin(), GE = M.global_end();
+           GI != GE; ++GI) {
+        StringRef Name = GI->getName();
+        if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+          continue;
+
+        GI->setName("global");
+      }
+
+      // Rename all struct types
+      TypeFinder StructTypes;
+      StructTypes.run(M, true);
+      for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
+        StructType *STy = StructTypes[i];
+        if (STy->isLiteral() || STy->getName().empty()) continue;
+
+        SmallString<128> NameStorage;
+        STy->setName((Twine("struct.") + metaNames[prng.rand() %
+                     array_lengthof(metaNames)]).toStringRef(NameStorage));
+      }
+
+      // Rename all functions
+      for (Module::iterator FI = M.begin(), FE = M.end();
+           FI != FE; ++FI) {
+        StringRef Name = FI->getName();
+        if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+          continue;
+
+        FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]);
+        runOnFunction(*FI);
+      }
+      return true;
+    }
+
+    bool runOnFunction(Function &F) {
+      for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
+           AI != AE; ++AI)
+        if (!AI->getType()->isVoidTy())
+          AI->setName("arg");
+
+      for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+        BB->setName("bb");
+
+        for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+          if (!I->getType()->isVoidTy())
+            I->setName("tmp");
+      }
+      return true;
+    }
+  };
+}
+
+char MetaRenamer::ID = 0;
+INITIALIZE_PASS(MetaRenamer, "metarenamer", 
+                "Assign new names to everything", false, false)
+//===----------------------------------------------------------------------===//
+//
+// MetaRenamer - Rename everything with metasyntactic names.
+//
+ModulePass *llvm::createMetaRenamerPass() {
+  return new MetaRenamer();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
new file mode 100644
index 000000000000..d090b487213b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -0,0 +1,64 @@
+//===-- ModuleUtils.cpp - Functions to manipulate Modules -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform manipulations on Modules.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+static void appendToGlobalArray(const char *Array, 
+                                Module &M, Function *F, int Priority) {
+  IRBuilder<> IRB(M.getContext());
+  FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false);
+  StructType *Ty = StructType::get(
+      IRB.getInt32Ty(), PointerType::getUnqual(FnTy), NULL);
+
+  Constant *RuntimeCtorInit = ConstantStruct::get(
+      Ty, IRB.getInt32(Priority), F, NULL);
+
+  // Get the current set of static global constructors and add the new ctor
+  // to the list.
+  SmallVector<Constant *, 16> CurrentCtors;
+  if (GlobalVariable * GVCtor = M.getNamedGlobal(Array)) {
+    if (Constant *Init = GVCtor->getInitializer()) {
+      unsigned n = Init->getNumOperands();
+      CurrentCtors.reserve(n + 1);
+      for (unsigned i = 0; i != n; ++i)
+        CurrentCtors.push_back(cast<Constant>(Init->getOperand(i)));
+    }
+    GVCtor->eraseFromParent();
+  }
+
+  CurrentCtors.push_back(RuntimeCtorInit);
+
+  // Create a new initializer.
+  ArrayType *AT = ArrayType::get(RuntimeCtorInit->getType(),
+                                 CurrentCtors.size());
+  Constant *NewInit = ConstantArray::get(AT, CurrentCtors);
+
+  // Create the new global variable and replace all uses of
+  // the old global variable with the new one.
+  (void)new GlobalVariable(M, NewInit->getType(), false,
+                           GlobalValue::AppendingLinkage, NewInit, Array);
+}
+
+void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority) {
+  appendToGlobalArray("llvm.global_ctors", M, F, Priority);
+}
+
+void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority) {
+  appendToGlobalArray("llvm.global_dtors", M, F, Priority);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
new file mode 100644
index 000000000000..de335ec1a05c
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -0,0 +1,1149 @@
+//===- PromoteMemoryToRegister.cpp - Convert allocas to registers ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file promotes memory references to be register references.  It promotes
+// alloca instructions which only have loads and stores as uses.  An alloca is
+// transformed by using iterated dominator frontiers to place PHI nodes, then
+// traversing the function in depth-first order to rewrite loads and stores as
+// appropriate.
+//
+// The algorithm used here is based on:
+//
+//   Sreedhar and Gao. A linear time algorithm for placing phi-nodes.
+//   In Proceedings of the 22nd ACM SIGPLAN-SIGACT Symposium on Principles of
+//   Programming Languages
+//   POPL '95. ACM, New York, NY, 62-73.
+//
+// It has been modified to not explicitly use the DJ graph data structure and to
+// directly compute pruned SSA using per-variable liveness information.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mem2reg"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <queue>
+using namespace llvm;
+
+STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block");
+STATISTIC(NumSingleStore,   "Number of alloca's promoted with a single store");
+STATISTIC(NumDeadAlloca,    "Number of dead alloca's removed");
+STATISTIC(NumPHIInsert,     "Number of PHI nodes inserted");
+
+namespace llvm {
+template<>
+struct DenseMapInfo<std::pair<BasicBlock*, unsigned> > {
+  typedef std::pair<BasicBlock*, unsigned> EltTy;
+  static inline EltTy getEmptyKey() {
+    return EltTy(reinterpret_cast<BasicBlock*>(-1), ~0U);
+  }
+  static inline EltTy getTombstoneKey() {
+    return EltTy(reinterpret_cast<BasicBlock*>(-2), 0U);
+  }
+  static unsigned getHashValue(const std::pair<BasicBlock*, unsigned> &Val) {
+    using llvm::hash_value;
+    return static_cast<unsigned>(hash_value(Val));
+  }
+  static bool isEqual(const EltTy &LHS, const EltTy &RHS) {
+    return LHS == RHS;
+  }
+};
+}
+
+/// isAllocaPromotable - Return true if this alloca is legal for promotion.
+/// This is true if there are only loads and stores to the alloca.
+///
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+  // FIXME: If the memory unit is of pointer or integer type, we can permit
+  // assignments to subsections of the memory unit.
+
+  // Only allow direct and non-volatile loads and stores...
+  for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
+       UI != UE; ++UI) {   // Loop over all of the uses of the alloca
+    const User *U = *UI;
+    if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      // Note that atomic loads can be transformed; atomic semantics do
+      // not have any meaning for a local alloca.
+      if (LI->isVolatile())
+        return false;
+    } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      if (SI->getOperand(0) == AI)
+        return false;   // Don't allow a store OF the AI, only INTO the AI.
+      // Note that atomic stores can be transformed; atomic semantics do
+      // not have any meaning for a local alloca.
+      if (SI->isVolatile())
+        return false;
+    } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+          II->getIntrinsicID() != Intrinsic::lifetime_end)
+        return false;
+    } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+      if (BCI->getType() != Type::getInt8PtrTy(U->getContext()))
+        return false;
+      if (!onlyUsedByLifetimeMarkers(BCI))
+        return false;
+    } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+      if (GEPI->getType() != Type::getInt8PtrTy(U->getContext()))
+        return false;
+      if (!GEPI->hasAllZeroIndices())
+        return false;
+      if (!onlyUsedByLifetimeMarkers(GEPI))
+        return false;
+    } else {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+namespace {
+  struct AllocaInfo;
+
+  // Data package used by RenamePass()
+  class RenamePassData {
+  public:
+    typedef std::vector<Value *> ValVector;
+    
+    RenamePassData() : BB(NULL), Pred(NULL), Values() {}
+    RenamePassData(BasicBlock *B, BasicBlock *P,
+                   const ValVector &V) : BB(B), Pred(P), Values(V) {}
+    BasicBlock *BB;
+    BasicBlock *Pred;
+    ValVector Values;
+    
+    void swap(RenamePassData &RHS) {
+      std::swap(BB, RHS.BB);
+      std::swap(Pred, RHS.Pred);
+      Values.swap(RHS.Values);
+    }
+  };
+  
+  /// LargeBlockInfo - This assigns and keeps a per-bb relative ordering of
+  /// load/store instructions in the block that directly load or store an alloca.
+  ///
+  /// This functionality is important because it avoids scanning large basic
+  /// blocks multiple times when promoting many allocas in the same block.
+  class LargeBlockInfo {
+    /// InstNumbers - For each instruction that we track, keep the index of the
+    /// instruction.  The index starts out as the number of the instruction from
+    /// the start of the block.
+    DenseMap<const Instruction *, unsigned> InstNumbers;
+  public:
+    
+    /// isInterestingInstruction - This code only looks at accesses to allocas.
+    static bool isInterestingInstruction(const Instruction *I) {
+      return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+             (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
+    }
+    
+    /// getInstructionIndex - Get or calculate the index of the specified
+    /// instruction.
+    unsigned getInstructionIndex(const Instruction *I) {
+      assert(isInterestingInstruction(I) &&
+             "Not a load/store to/from an alloca?");
+      
+      // If we already have this instruction number, return it.
+      DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
+      if (It != InstNumbers.end()) return It->second;
+      
+      // Scan the whole block to get the instruction.  This accumulates
+      // information for every interesting instruction in the block, in order to
+      // avoid gratuitus rescans.
+      const BasicBlock *BB = I->getParent();
+      unsigned InstNo = 0;
+      for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end();
+           BBI != E; ++BBI)
+        if (isInterestingInstruction(BBI))
+          InstNumbers[BBI] = InstNo++;
+      It = InstNumbers.find(I);
+      
+      assert(It != InstNumbers.end() && "Didn't insert instruction?");
+      return It->second;
+    }
+    
+    void deleteValue(const Instruction *I) {
+      InstNumbers.erase(I);
+    }
+    
+    void clear() {
+      InstNumbers.clear();
+    }
+  };
+
+  struct PromoteMem2Reg {
+    /// Allocas - The alloca instructions being promoted.
+    ///
+    std::vector<AllocaInst*> Allocas;
+    DominatorTree &DT;
+    DIBuilder *DIB;
+
+    /// AST - An AliasSetTracker object to update.  If null, don't update it.
+    ///
+    AliasSetTracker *AST;
+    
+    /// AllocaLookup - Reverse mapping of Allocas.
+    ///
+    DenseMap<AllocaInst*, unsigned>  AllocaLookup;
+
+    /// NewPhiNodes - The PhiNodes we're adding.  That map is used to simplify
+    /// some Phi nodes as we iterate over it, so it should have deterministic
+    /// iterators.  We could use a MapVector, but since we already maintain a
+    /// map from BasicBlock* to a stable numbering (BBNumbers), the DenseMap is
+    /// more efficient (also supports removal).
+    ///
+    DenseMap<std::pair<unsigned, unsigned>, PHINode*> NewPhiNodes;
+    
+    /// PhiToAllocaMap - For each PHI node, keep track of which entry in Allocas
+    /// it corresponds to.
+    DenseMap<PHINode*, unsigned> PhiToAllocaMap;
+    
+    /// PointerAllocaValues - If we are updating an AliasSetTracker, then for
+    /// each alloca that is of pointer type, we keep track of what to copyValue
+    /// to the inserted PHI nodes here.
+    ///
+    std::vector<Value*> PointerAllocaValues;
+
+    /// AllocaDbgDeclares - For each alloca, we keep track of the dbg.declare
+    /// intrinsic that describes it, if any, so that we can convert it to a
+    /// dbg.value intrinsic if the alloca gets promoted.
+    SmallVector<DbgDeclareInst*, 8> AllocaDbgDeclares;
+
+    /// Visited - The set of basic blocks the renamer has already visited.
+    ///
+    SmallPtrSet<BasicBlock*, 16> Visited;
+
+    /// BBNumbers - Contains a stable numbering of basic blocks to avoid
+    /// non-determinstic behavior.
+    DenseMap<BasicBlock*, unsigned> BBNumbers;
+
+    /// DomLevels - Maps DomTreeNodes to their level in the dominator tree.
+    DenseMap<DomTreeNode*, unsigned> DomLevels;
+
+    /// BBNumPreds - Lazily compute the number of predecessors a block has.
+    DenseMap<const BasicBlock*, unsigned> BBNumPreds;
+  public:
+    PromoteMem2Reg(const std::vector<AllocaInst*> &A, DominatorTree &dt,
+                   AliasSetTracker *ast)
+      : Allocas(A), DT(dt), DIB(0), AST(ast) {}
+    ~PromoteMem2Reg() {
+      delete DIB;
+    }
+
+    void run();
+
+    /// dominates - Return true if BB1 dominates BB2 using the DominatorTree.
+    ///
+    bool dominates(BasicBlock *BB1, BasicBlock *BB2) const {
+      return DT.dominates(BB1, BB2);
+    }
+
+  private:
+    void RemoveFromAllocasList(unsigned &AllocaIdx) {
+      Allocas[AllocaIdx] = Allocas.back();
+      Allocas.pop_back();
+      --AllocaIdx;
+    }
+
+    unsigned getNumPreds(const BasicBlock *BB) {
+      unsigned &NP = BBNumPreds[BB];
+      if (NP == 0)
+        NP = std::distance(pred_begin(BB), pred_end(BB))+1;
+      return NP-1;
+    }
+
+    void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
+                                 AllocaInfo &Info);
+    void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, 
+                             const SmallPtrSet<BasicBlock*, 32> &DefBlocks,
+                             SmallPtrSet<BasicBlock*, 32> &LiveInBlocks);
+    
+    void RewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
+                                  LargeBlockInfo &LBI);
+    void PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
+                                  LargeBlockInfo &LBI);
+    
+    void RenamePass(BasicBlock *BB, BasicBlock *Pred,
+                    RenamePassData::ValVector &IncVals,
+                    std::vector<RenamePassData> &Worklist);
+    bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
+  };
+  
+  struct AllocaInfo {
+    SmallVector<BasicBlock*, 32> DefiningBlocks;
+    SmallVector<BasicBlock*, 32> UsingBlocks;
+    
+    StoreInst  *OnlyStore;
+    BasicBlock *OnlyBlock;
+    bool OnlyUsedInOneBlock;
+    
+    Value *AllocaPointerVal;
+    DbgDeclareInst *DbgDeclare;
+    
+    void clear() {
+      DefiningBlocks.clear();
+      UsingBlocks.clear();
+      OnlyStore = 0;
+      OnlyBlock = 0;
+      OnlyUsedInOneBlock = true;
+      AllocaPointerVal = 0;
+      DbgDeclare = 0;
+    }
+    
+    /// AnalyzeAlloca - Scan the uses of the specified alloca, filling in our
+    /// ivars.
+    void AnalyzeAlloca(AllocaInst *AI) {
+      clear();
+
+      // As we scan the uses of the alloca instruction, keep track of stores,
+      // and decide whether all of the loads and stores to the alloca are within
+      // the same basic block.
+      for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+           UI != E;)  {
+        Instruction *User = cast<Instruction>(*UI++);
+
+        if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+          // Remember the basic blocks which define new values for the alloca
+          DefiningBlocks.push_back(SI->getParent());
+          AllocaPointerVal = SI->getOperand(0);
+          OnlyStore = SI;
+        } else {
+          LoadInst *LI = cast<LoadInst>(User);
+          // Otherwise it must be a load instruction, keep track of variable
+          // reads.
+          UsingBlocks.push_back(LI->getParent());
+          AllocaPointerVal = LI;
+        }
+        
+        if (OnlyUsedInOneBlock) {
+          if (OnlyBlock == 0)
+            OnlyBlock = User->getParent();
+          else if (OnlyBlock != User->getParent())
+            OnlyUsedInOneBlock = false;
+        }
+      }
+      
+      DbgDeclare = FindAllocaDbgDeclare(AI);
+    }
+  };
+
+  typedef std::pair<DomTreeNode*, unsigned> DomTreeNodePair;
+
+  struct DomTreeNodeCompare {
+    bool operator()(const DomTreeNodePair &LHS, const DomTreeNodePair &RHS) {
+      return LHS.second < RHS.second;
+    }
+  };
+}  // end of anonymous namespace
+
+static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
+  // Knowing that this alloca is promotable, we know that it's safe to kill all
+  // instructions except for load and store.
+
+  for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+       UI != UE;) {
+    Instruction *I = cast<Instruction>(*UI);
+    ++UI;
+    if (isa<LoadInst>(I) || isa<StoreInst>(I))
+      continue;
+
+    if (!I->getType()->isVoidTy()) {
+      // The only users of this bitcast/GEP instruction are lifetime intrinsics.
+      // Follow the use/def chain to erase them now instead of leaving it for
+      // dead code elimination later.
+      for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+           UI != UE;) {
+        Instruction *Inst = cast<Instruction>(*UI);
+        ++UI;
+        Inst->eraseFromParent();
+      }
+    }
+    I->eraseFromParent();
+  }
+}
+
+void PromoteMem2Reg::run() {
+  Function &F = *DT.getRoot()->getParent();
+
+  if (AST) PointerAllocaValues.resize(Allocas.size());
+  AllocaDbgDeclares.resize(Allocas.size());
+
+  AllocaInfo Info;
+  LargeBlockInfo LBI;
+
+  for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
+    AllocaInst *AI = Allocas[AllocaNum];
+
+    assert(isAllocaPromotable(AI) &&
+           "Cannot promote non-promotable alloca!");
+    assert(AI->getParent()->getParent() == &F &&
+           "All allocas should be in the same function, which is same as DF!");
+
+    removeLifetimeIntrinsicUsers(AI);
+
+    if (AI->use_empty()) {
+      // If there are no uses of the alloca, just delete it now.
+      if (AST) AST->deleteValue(AI);
+      AI->eraseFromParent();
+
+      // Remove the alloca from the Allocas list, since it has been processed
+      RemoveFromAllocasList(AllocaNum);
+      ++NumDeadAlloca;
+      continue;
+    }
+    
+    // Calculate the set of read and write-locations for each alloca.  This is
+    // analogous to finding the 'uses' and 'definitions' of each variable.
+    Info.AnalyzeAlloca(AI);
+
+    // If there is only a single store to this value, replace any loads of
+    // it that are directly dominated by the definition with the value stored.
+    if (Info.DefiningBlocks.size() == 1) {
+      RewriteSingleStoreAlloca(AI, Info, LBI);
+
+      // Finally, after the scan, check to see if the store is all that is left.
+      if (Info.UsingBlocks.empty()) {
+        // Record debuginfo for the store and remove the declaration's 
+        // debuginfo.
+        if (DbgDeclareInst *DDI = Info.DbgDeclare) {
+          if (!DIB)
+            DIB = new DIBuilder(*DDI->getParent()->getParent()->getParent());
+          ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, *DIB);
+          DDI->eraseFromParent();
+        }
+        // Remove the (now dead) store and alloca.
+        Info.OnlyStore->eraseFromParent();
+        LBI.deleteValue(Info.OnlyStore);
+
+        if (AST) AST->deleteValue(AI);
+        AI->eraseFromParent();
+        LBI.deleteValue(AI);
+        
+        // The alloca has been processed, move on.
+        RemoveFromAllocasList(AllocaNum);
+        
+        ++NumSingleStore;
+        continue;
+      }
+    }
+    
+    // If the alloca is only read and written in one basic block, just perform a
+    // linear sweep over the block to eliminate it.
+    if (Info.OnlyUsedInOneBlock) {
+      PromoteSingleBlockAlloca(AI, Info, LBI);
+      
+      // Finally, after the scan, check to see if the stores are all that is
+      // left.
+      if (Info.UsingBlocks.empty()) {
+        
+        // Remove the (now dead) stores and alloca.
+        while (!AI->use_empty()) {
+          StoreInst *SI = cast<StoreInst>(AI->use_back());
+          // Record debuginfo for the store before removing it.
+          if (DbgDeclareInst *DDI = Info.DbgDeclare) {
+            if (!DIB)
+              DIB = new DIBuilder(*SI->getParent()->getParent()->getParent());
+            ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
+          }
+          SI->eraseFromParent();
+          LBI.deleteValue(SI);
+        }
+        
+        if (AST) AST->deleteValue(AI);
+        AI->eraseFromParent();
+        LBI.deleteValue(AI);
+        
+        // The alloca has been processed, move on.
+        RemoveFromAllocasList(AllocaNum);
+        
+        // The alloca's debuginfo can be removed as well.
+        if (DbgDeclareInst *DDI = Info.DbgDeclare)
+          DDI->eraseFromParent();
+
+        ++NumLocalPromoted;
+        continue;
+      }
+    }
+
+    // If we haven't computed dominator tree levels, do so now.
+    if (DomLevels.empty()) {
+      SmallVector<DomTreeNode*, 32> Worklist;
+
+      DomTreeNode *Root = DT.getRootNode();
+      DomLevels[Root] = 0;
+      Worklist.push_back(Root);
+
+      while (!Worklist.empty()) {
+        DomTreeNode *Node = Worklist.pop_back_val();
+        unsigned ChildLevel = DomLevels[Node] + 1;
+        for (DomTreeNode::iterator CI = Node->begin(), CE = Node->end();
+             CI != CE; ++CI) {
+          DomLevels[*CI] = ChildLevel;
+          Worklist.push_back(*CI);
+        }
+      }
+    }
+
+    // If we haven't computed a numbering for the BB's in the function, do so
+    // now.
+    if (BBNumbers.empty()) {
+      unsigned ID = 0;
+      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+        BBNumbers[I] = ID++;
+    }
+
+    // If we have an AST to keep updated, remember some pointer value that is
+    // stored into the alloca.
+    if (AST)
+      PointerAllocaValues[AllocaNum] = Info.AllocaPointerVal;
+      
+    // Remember the dbg.declare intrinsic describing this alloca, if any.
+    if (Info.DbgDeclare) AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare;
+    
+    // Keep the reverse mapping of the 'Allocas' array for the rename pass.
+    AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
+
+    // At this point, we're committed to promoting the alloca using IDF's, and
+    // the standard SSA construction algorithm.  Determine which blocks need PHI
+    // nodes and see if we can optimize out some work by avoiding insertion of
+    // dead phi nodes.
+    DetermineInsertionPoint(AI, AllocaNum, Info);
+  }
+
+  if (Allocas.empty())
+    return; // All of the allocas must have been trivial!
+
+  LBI.clear();
+  
+  
+  // Set the incoming values for the basic block to be null values for all of
+  // the alloca's.  We do this in case there is a load of a value that has not
+  // been stored yet.  In this case, it will get this null value.
+  //
+  RenamePassData::ValVector Values(Allocas.size());
+  for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+    Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+
+  // Walks all basic blocks in the function performing the SSA rename algorithm
+  // and inserting the phi nodes we marked as necessary
+  //
+  std::vector<RenamePassData> RenamePassWorkList;
+  RenamePassWorkList.push_back(RenamePassData(F.begin(), 0, Values));
+  do {
+    RenamePassData RPD;
+    RPD.swap(RenamePassWorkList.back());
+    RenamePassWorkList.pop_back();
+    // RenamePass may add new worklist entries.
+    RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList);
+  } while (!RenamePassWorkList.empty());
+  
+  // The renamer uses the Visited set to avoid infinite loops.  Clear it now.
+  Visited.clear();
+
+  // Remove the allocas themselves from the function.
+  for (unsigned i = 0, e = Allocas.size(); i != e; ++i) {
+    Instruction *A = Allocas[i];
+
+    // If there are any uses of the alloca instructions left, they must be in
+    // unreachable basic blocks that were not processed by walking the dominator
+    // tree. Just delete the users now.
+    if (!A->use_empty())
+      A->replaceAllUsesWith(UndefValue::get(A->getType()));
+    if (AST) AST->deleteValue(A);
+    A->eraseFromParent();
+  }
+
+  // Remove alloca's dbg.declare instrinsics from the function.
+  for (unsigned i = 0, e = AllocaDbgDeclares.size(); i != e; ++i)
+    if (DbgDeclareInst *DDI = AllocaDbgDeclares[i])
+      DDI->eraseFromParent();
+
+  // Loop over all of the PHI nodes and see if there are any that we can get
+  // rid of because they merge all of the same incoming values.  This can
+  // happen due to undef values coming into the PHI nodes.  This process is
+  // iterative, because eliminating one PHI node can cause others to be removed.
+  bool EliminatedAPHI = true;
+  while (EliminatedAPHI) {
+    EliminatedAPHI = false;
+    
+    // Iterating over NewPhiNodes is deterministic, so it is safe to try to
+    // simplify and RAUW them as we go.  If it was not, we could add uses to
+    // the values we replace with in a non deterministic order, thus creating
+    // non deterministic def->use chains.
+    for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I =
+           NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E;) {
+      PHINode *PN = I->second;
+
+      // If this PHI node merges one value and/or undefs, get the value.
+      if (Value *V = SimplifyInstruction(PN, 0, 0, &DT)) {
+        if (AST && PN->getType()->isPointerTy())
+          AST->deleteValue(PN);
+        PN->replaceAllUsesWith(V);
+        PN->eraseFromParent();
+        NewPhiNodes.erase(I++);
+        EliminatedAPHI = true;
+        continue;
+      }
+      ++I;
+    }
+  }
+  
+  // At this point, the renamer has added entries to PHI nodes for all reachable
+  // code.  Unfortunately, there may be unreachable blocks which the renamer
+  // hasn't traversed.  If this is the case, the PHI nodes may not
+  // have incoming values for all predecessors.  Loop over all PHI nodes we have
+  // created, inserting undef values if they are missing any incoming values.
+  //
+  for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I =
+         NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E; ++I) {
+    // We want to do this once per basic block.  As such, only process a block
+    // when we find the PHI that is the first entry in the block.
+    PHINode *SomePHI = I->second;
+    BasicBlock *BB = SomePHI->getParent();
+    if (&BB->front() != SomePHI)
+      continue;
+
+    // Only do work here if there the PHI nodes are missing incoming values.  We
+    // know that all PHI nodes that were inserted in a block will have the same
+    // number of incoming values, so we can just check any of them.
+    if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
+      continue;
+
+    // Get the preds for BB.
+    SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB));
+    
+    // Ok, now we know that all of the PHI nodes are missing entries for some
+    // basic blocks.  Start by sorting the incoming predecessors for efficient
+    // access.
+    std::sort(Preds.begin(), Preds.end());
+    
+    // Now we loop through all BB's which have entries in SomePHI and remove
+    // them from the Preds list.
+    for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
+      // Do a log(n) search of the Preds list for the entry we want.
+      SmallVector<BasicBlock*, 16>::iterator EntIt =
+        std::lower_bound(Preds.begin(), Preds.end(),
+                         SomePHI->getIncomingBlock(i));
+      assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i)&&
+             "PHI node has entry for a block which is not a predecessor!");
+
+      // Remove the entry
+      Preds.erase(EntIt);
+    }
+
+    // At this point, the blocks left in the preds list must have dummy
+    // entries inserted into every PHI nodes for the block.  Update all the phi
+    // nodes in this block that we are inserting (there could be phis before
+    // mem2reg runs).
+    unsigned NumBadPreds = SomePHI->getNumIncomingValues();
+    BasicBlock::iterator BBI = BB->begin();
+    while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
+           SomePHI->getNumIncomingValues() == NumBadPreds) {
+      Value *UndefVal = UndefValue::get(SomePHI->getType());
+      for (unsigned pred = 0, e = Preds.size(); pred != e; ++pred)
+        SomePHI->addIncoming(UndefVal, Preds[pred]);
+    }
+  }
+        
+  NewPhiNodes.clear();
+}
+
+
+/// ComputeLiveInBlocks - Determine which blocks the value is live in.  These
+/// are blocks which lead to uses.  Knowing this allows us to avoid inserting
+/// PHI nodes into blocks which don't lead to uses (thus, the inserted phi nodes
+/// would be dead).
+void PromoteMem2Reg::
+ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, 
+                    const SmallPtrSet<BasicBlock*, 32> &DefBlocks,
+                    SmallPtrSet<BasicBlock*, 32> &LiveInBlocks) {
+  
+  // To determine liveness, we must iterate through the predecessors of blocks
+  // where the def is live.  Blocks are added to the worklist if we need to
+  // check their predecessors.  Start with all the using blocks.
+  SmallVector<BasicBlock*, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
+                                                   Info.UsingBlocks.end());
+  
+  // If any of the using blocks is also a definition block, check to see if the
+  // definition occurs before or after the use.  If it happens before the use,
+  // the value isn't really live-in.
+  for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
+    BasicBlock *BB = LiveInBlockWorklist[i];
+    if (!DefBlocks.count(BB)) continue;
+    
+    // Okay, this is a block that both uses and defines the value.  If the first
+    // reference to the alloca is a def (store), then we know it isn't live-in.
+    for (BasicBlock::iterator I = BB->begin(); ; ++I) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        if (SI->getOperand(1) != AI) continue;
+        
+        // We found a store to the alloca before a load.  The alloca is not
+        // actually live-in here.
+        LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
+        LiveInBlockWorklist.pop_back();
+        --i, --e;
+        break;
+      }
+      
+      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        if (LI->getOperand(0) != AI) continue;
+        
+        // Okay, we found a load before a store to the alloca.  It is actually
+        // live into this block.
+        break;
+      }
+    }
+  }
+  
+  // Now that we have a set of blocks where the phi is live-in, recursively add
+  // their predecessors until we find the full region the value is live.
+  while (!LiveInBlockWorklist.empty()) {
+    BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+    
+    // The block really is live in here, insert it into the set.  If already in
+    // the set, then it has already been processed.
+    if (!LiveInBlocks.insert(BB))
+      continue;
+    
+    // Since the value is live into BB, it is either defined in a predecessor or
+    // live into it to.  Add the preds to the worklist unless they are a
+    // defining block.
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *P = *PI;
+      
+      // The value is not live into a predecessor if it defines the value.
+      if (DefBlocks.count(P))
+        continue;
+      
+      // Otherwise it is, add to the worklist.
+      LiveInBlockWorklist.push_back(P);
+    }
+  }
+}
+
+/// DetermineInsertionPoint - At this point, we're committed to promoting the
+/// alloca using IDF's, and the standard SSA construction algorithm.  Determine
+/// which blocks need phi nodes and see if we can optimize out some work by
+/// avoiding insertion of dead phi nodes.
+void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
+                                             AllocaInfo &Info) {
+  // Unique the set of defining blocks for efficient lookup.
+  SmallPtrSet<BasicBlock*, 32> DefBlocks;
+  DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end());
+
+  // Determine which blocks the value is live in.  These are blocks which lead
+  // to uses.
+  SmallPtrSet<BasicBlock*, 32> LiveInBlocks;
+  ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
+
+  // Use a priority queue keyed on dominator tree level so that inserted nodes
+  // are handled from the bottom of the dominator tree upwards.
+  typedef std::priority_queue<DomTreeNodePair, SmallVector<DomTreeNodePair, 32>,
+                              DomTreeNodeCompare> IDFPriorityQueue;
+  IDFPriorityQueue PQ;
+
+  for (SmallPtrSet<BasicBlock*, 32>::const_iterator I = DefBlocks.begin(),
+       E = DefBlocks.end(); I != E; ++I) {
+    if (DomTreeNode *Node = DT.getNode(*I))
+      PQ.push(std::make_pair(Node, DomLevels[Node]));
+  }
+
+  SmallVector<std::pair<unsigned, BasicBlock*>, 32> DFBlocks;
+  SmallPtrSet<DomTreeNode*, 32> Visited;
+  SmallVector<DomTreeNode*, 32> Worklist;
+  while (!PQ.empty()) {
+    DomTreeNodePair RootPair = PQ.top();
+    PQ.pop();
+    DomTreeNode *Root = RootPair.first;
+    unsigned RootLevel = RootPair.second;
+
+    // Walk all dominator tree children of Root, inspecting their CFG edges with
+    // targets elsewhere on the dominator tree. Only targets whose level is at
+    // most Root's level are added to the iterated dominance frontier of the
+    // definition set.
+
+    Worklist.clear();
+    Worklist.push_back(Root);
+
+    while (!Worklist.empty()) {
+      DomTreeNode *Node = Worklist.pop_back_val();
+      BasicBlock *BB = Node->getBlock();
+
+      for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
+           ++SI) {
+        DomTreeNode *SuccNode = DT.getNode(*SI);
+
+        // Quickly skip all CFG edges that are also dominator tree edges instead
+        // of catching them below.
+        if (SuccNode->getIDom() == Node)
+          continue;
+
+        unsigned SuccLevel = DomLevels[SuccNode];
+        if (SuccLevel > RootLevel)
+          continue;
+
+        if (!Visited.insert(SuccNode))
+          continue;
+
+        BasicBlock *SuccBB = SuccNode->getBlock();
+        if (!LiveInBlocks.count(SuccBB))
+          continue;
+
+        DFBlocks.push_back(std::make_pair(BBNumbers[SuccBB], SuccBB));
+        if (!DefBlocks.count(SuccBB))
+          PQ.push(std::make_pair(SuccNode, SuccLevel));
+      }
+
+      for (DomTreeNode::iterator CI = Node->begin(), CE = Node->end(); CI != CE;
+           ++CI) {
+        if (!Visited.count(*CI))
+          Worklist.push_back(*CI);
+      }
+    }
+  }
+
+  if (DFBlocks.size() > 1)
+    std::sort(DFBlocks.begin(), DFBlocks.end());
+
+  unsigned CurrentVersion = 0;
+  for (unsigned i = 0, e = DFBlocks.size(); i != e; ++i)
+    QueuePhiNode(DFBlocks[i].second, AllocaNum, CurrentVersion);
+}
+
+/// RewriteSingleStoreAlloca - If there is only a single store to this value,
+/// replace any loads of it that are directly dominated by the definition with
+/// the value stored.
+void PromoteMem2Reg::RewriteSingleStoreAlloca(AllocaInst *AI,
+                                              AllocaInfo &Info,
+                                              LargeBlockInfo &LBI) {
+  StoreInst *OnlyStore = Info.OnlyStore;
+  bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+  BasicBlock *StoreBB = OnlyStore->getParent();
+  int StoreIndex = -1;
+
+  // Clear out UsingBlocks.  We will reconstruct it here if needed.
+  Info.UsingBlocks.clear();
+  
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ) {
+    Instruction *UserInst = cast<Instruction>(*UI++);
+    if (!isa<LoadInst>(UserInst)) {
+      assert(UserInst == OnlyStore && "Should only have load/stores");
+      continue;
+    }
+    LoadInst *LI = cast<LoadInst>(UserInst);
+    
+    // Okay, if we have a load from the alloca, we want to replace it with the
+    // only value stored to the alloca.  We can do this if the value is
+    // dominated by the store.  If not, we use the rest of the mem2reg machinery
+    // to insert the phi nodes as needed.
+    if (!StoringGlobalVal) {  // Non-instructions are always dominated.
+      if (LI->getParent() == StoreBB) {
+        // If we have a use that is in the same block as the store, compare the
+        // indices of the two instructions to see which one came first.  If the
+        // load came before the store, we can't handle it.
+        if (StoreIndex == -1)
+          StoreIndex = LBI.getInstructionIndex(OnlyStore);
+
+        if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
+          // Can't handle this load, bail out.
+          Info.UsingBlocks.push_back(StoreBB);
+          continue;
+        }
+        
+      } else if (LI->getParent() != StoreBB &&
+                 !dominates(StoreBB, LI->getParent())) {
+        // If the load and store are in different blocks, use BB dominance to
+        // check their relationships.  If the store doesn't dom the use, bail
+        // out.
+        Info.UsingBlocks.push_back(LI->getParent());
+        continue;
+      }
+    }
+    
+    // Otherwise, we *can* safely rewrite this load.
+    Value *ReplVal = OnlyStore->getOperand(0);
+    // If the replacement value is the load, this must occur in unreachable
+    // code.
+    if (ReplVal == LI)
+      ReplVal = UndefValue::get(LI->getType());
+    LI->replaceAllUsesWith(ReplVal);
+    if (AST && LI->getType()->isPointerTy())
+      AST->deleteValue(LI);
+    LI->eraseFromParent();
+    LBI.deleteValue(LI);
+  }
+}
+
+namespace {
+
+/// StoreIndexSearchPredicate - This is a helper predicate used to search by the
+/// first element of a pair.
+struct StoreIndexSearchPredicate {
+  bool operator()(const std::pair<unsigned, StoreInst*> &LHS,
+                  const std::pair<unsigned, StoreInst*> &RHS) {
+    return LHS.first < RHS.first;
+  }
+};
+
+}
+
+/// PromoteSingleBlockAlloca - Many allocas are only used within a single basic
+/// block.  If this is the case, avoid traversing the CFG and inserting a lot of
+/// potentially useless PHI nodes by just performing a single linear pass over
+/// the basic block using the Alloca.
+///
+/// If we cannot promote this alloca (because it is read before it is written),
+/// return true.  This is necessary in cases where, due to control flow, the
+/// alloca is potentially undefined on some control flow paths.  e.g. code like
+/// this is potentially correct:
+///
+///   for (...) { if (c) { A = undef; undef = B; } }
+///
+/// ... so long as A is not used before undef is set.
+///
+void PromoteMem2Reg::PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
+                                              LargeBlockInfo &LBI) {
+  // The trickiest case to handle is when we have large blocks. Because of this,
+  // this code is optimized assuming that large blocks happen.  This does not
+  // significantly pessimize the small block case.  This uses LargeBlockInfo to
+  // make it efficient to get the index of various operations in the block.
+  
+  // Clear out UsingBlocks.  We will reconstruct it here if needed.
+  Info.UsingBlocks.clear();
+  
+  // Walk the use-def list of the alloca, getting the locations of all stores.
+  typedef SmallVector<std::pair<unsigned, StoreInst*>, 64> StoresByIndexTy;
+  StoresByIndexTy StoresByIndex;
+  
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+       UI != E; ++UI) 
+    if (StoreInst *SI = dyn_cast<StoreInst>(*UI))
+      StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
+
+  // If there are no stores to the alloca, just replace any loads with undef.
+  if (StoresByIndex.empty()) {
+    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) 
+      if (LoadInst *LI = dyn_cast<LoadInst>(*UI++)) {
+        LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
+        if (AST && LI->getType()->isPointerTy())
+          AST->deleteValue(LI);
+        LBI.deleteValue(LI);
+        LI->eraseFromParent();
+      }
+    return;
+  }
+  
+  // Sort the stores by their index, making it efficient to do a lookup with a
+  // binary search.
+  std::sort(StoresByIndex.begin(), StoresByIndex.end());
+  
+  // Walk all of the loads from this alloca, replacing them with the nearest
+  // store above them, if any.
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) {
+    LoadInst *LI = dyn_cast<LoadInst>(*UI++);
+    if (!LI) continue;
+    
+    unsigned LoadIdx = LBI.getInstructionIndex(LI);
+    
+    // Find the nearest store that has a lower than this load. 
+    StoresByIndexTy::iterator I = 
+      std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(),
+                       std::pair<unsigned, StoreInst*>(LoadIdx, static_cast<StoreInst*>(0)),
+                       StoreIndexSearchPredicate());
+    
+    // If there is no store before this load, then we can't promote this load.
+    if (I == StoresByIndex.begin()) {
+      // Can't handle this load, bail out.
+      Info.UsingBlocks.push_back(LI->getParent());
+      continue;
+    }
+      
+    // Otherwise, there was a store before this load, the load takes its value.
+    --I;
+    LI->replaceAllUsesWith(I->second->getOperand(0));
+    if (AST && LI->getType()->isPointerTy())
+      AST->deleteValue(LI);
+    LI->eraseFromParent();
+    LBI.deleteValue(LI);
+  }
+}
+
+// QueuePhiNode - queues a phi-node to be added to a basic-block for a specific
+// Alloca returns true if there wasn't already a phi-node for that variable
+//
+bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
+                                  unsigned &Version) {
+  // Look up the basic-block in question.
+  PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
+
+  // If the BB already has a phi node added for the i'th alloca then we're done!
+  if (PN) return false;
+
+  // Create a PhiNode using the dereferenced type... and add the phi-node to the
+  // BasicBlock.
+  PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB),
+                       Allocas[AllocaNo]->getName() + "." + Twine(Version++), 
+                       BB->begin());
+  ++NumPHIInsert;
+  PhiToAllocaMap[PN] = AllocaNo;
+
+  if (AST && PN->getType()->isPointerTy())
+    AST->copyValue(PointerAllocaValues[AllocaNo], PN);
+
+  return true;
+}
+
+// RenamePass - Recursively traverse the CFG of the function, renaming loads and
+// stores to the allocas which we are promoting.  IncomingVals indicates what
+// value each Alloca contains on exit from the predecessor block Pred.
+//
+void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
+                                RenamePassData::ValVector &IncomingVals,
+                                std::vector<RenamePassData> &Worklist) {
+NextIteration:
+  // If we are inserting any phi nodes into this BB, they will already be in the
+  // block.
+  if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
+    // If we have PHI nodes to update, compute the number of edges from Pred to
+    // BB.
+    if (PhiToAllocaMap.count(APN)) {
+      // We want to be able to distinguish between PHI nodes being inserted by
+      // this invocation of mem2reg from those phi nodes that already existed in
+      // the IR before mem2reg was run.  We determine that APN is being inserted
+      // because it is missing incoming edges.  All other PHI nodes being
+      // inserted by this pass of mem2reg will have the same number of incoming
+      // operands so far.  Remember this count.
+      unsigned NewPHINumOperands = APN->getNumOperands();
+      
+      unsigned NumEdges = 0;
+      for (succ_iterator I = succ_begin(Pred), E = succ_end(Pred); I != E; ++I)
+        if (*I == BB)
+          ++NumEdges;
+      assert(NumEdges && "Must be at least one edge from Pred to BB!");
+      
+      // Add entries for all the phis.
+      BasicBlock::iterator PNI = BB->begin();
+      do {
+        unsigned AllocaNo = PhiToAllocaMap[APN];
+        
+        // Add N incoming values to the PHI node.
+        for (unsigned i = 0; i != NumEdges; ++i)
+          APN->addIncoming(IncomingVals[AllocaNo], Pred);
+        
+        // The currently active variable for this block is now the PHI.
+        IncomingVals[AllocaNo] = APN;
+        
+        // Get the next phi node.
+        ++PNI;
+        APN = dyn_cast<PHINode>(PNI);
+        if (APN == 0) break;
+        
+        // Verify that it is missing entries.  If not, it is not being inserted
+        // by this mem2reg invocation so we want to ignore it.
+      } while (APN->getNumOperands() == NewPHINumOperands);
+    }
+  }
+  
+  // Don't revisit blocks.
+  if (!Visited.insert(BB)) return;
+
+  for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II); ) {
+    Instruction *I = II++; // get the instruction, increment iterator
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
+      if (!Src) continue;
+  
+      DenseMap<AllocaInst*, unsigned>::iterator AI = AllocaLookup.find(Src);
+      if (AI == AllocaLookup.end()) continue;
+
+      Value *V = IncomingVals[AI->second];
+
+      // Anything using the load now uses the current value.
+      LI->replaceAllUsesWith(V);
+      if (AST && LI->getType()->isPointerTy())
+        AST->deleteValue(LI);
+      BB->getInstList().erase(LI);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      // Delete this instruction and mark the name as the current holder of the
+      // value
+      AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
+      if (!Dest) continue;
+      
+      DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
+      if (ai == AllocaLookup.end())
+        continue;
+      
+      // what value were we writing?
+      IncomingVals[ai->second] = SI->getOperand(0);
+      // Record debuginfo for the store before removing it.
+      if (DbgDeclareInst *DDI = AllocaDbgDeclares[ai->second]) {
+        if (!DIB)
+          DIB = new DIBuilder(*SI->getParent()->getParent()->getParent());
+        ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
+      }
+      BB->getInstList().erase(SI);
+    }
+  }
+
+  // 'Recurse' to our successors.
+  succ_iterator I = succ_begin(BB), E = succ_end(BB);
+  if (I == E) return;
+
+  // Keep track of the successors so we don't visit the same successor twice
+  SmallPtrSet<BasicBlock*, 8> VisitedSuccs;
+
+  // Handle the first successor without using the worklist.
+  VisitedSuccs.insert(*I);
+  Pred = BB;
+  BB = *I;
+  ++I;
+
+  for (; I != E; ++I)
+    if (VisitedSuccs.insert(*I))
+      Worklist.push_back(RenamePassData(*I, Pred, IncomingVals));
+
+  goto NextIteration;
+}
+
+/// PromoteMemToReg - Promote the specified list of alloca instructions into
+/// scalar registers, inserting PHI nodes as appropriate.  This function does
+/// not modify the CFG of the function at all.  All allocas must be from the
+/// same function.
+///
+/// If AST is specified, the specified tracker is updated to reflect changes
+/// made to the IR.
+///
+void llvm::PromoteMemToReg(const std::vector<AllocaInst*> &Allocas,
+                           DominatorTree &DT, AliasSetTracker *AST) {
+  // If there is nothing to do, bail out...
+  if (Allocas.empty()) return;
+
+  PromoteMem2Reg(Allocas, DT, AST).run();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
new file mode 100644
index 000000000000..9d90fbe5654a
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -0,0 +1,530 @@
+//===- SSAUpdater.cpp - Unstructured SSA Update Tool ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SSAUpdater class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ssaupdater"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+
+using namespace llvm;
+
+typedef DenseMap<BasicBlock*, Value*> AvailableValsTy;
+static AvailableValsTy &getAvailableVals(void *AV) {
+  return *static_cast<AvailableValsTy*>(AV);
+}
+
+SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI)
+  : AV(0), ProtoType(0), ProtoName(), InsertedPHIs(NewPHI) {}
+
+SSAUpdater::~SSAUpdater() {
+  delete static_cast<AvailableValsTy*>(AV);
+}
+
+/// Initialize - Reset this object to get ready for a new set of SSA
+/// updates with type 'Ty'.  PHI nodes get a name based on 'Name'.
+void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
+  if (AV == 0)
+    AV = new AvailableValsTy();
+  else
+    getAvailableVals(AV).clear();
+  ProtoType = Ty;
+  ProtoName = Name;
+}
+
+/// HasValueForBlock - Return true if the SSAUpdater already has a value for
+/// the specified block.
+bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
+  return getAvailableVals(AV).count(BB);
+}
+
+/// AddAvailableValue - Indicate that a rewritten value is available in the
+/// specified block with the specified value.
+void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
+  assert(ProtoType != 0 && "Need to initialize SSAUpdater");
+  assert(ProtoType == V->getType() &&
+         "All rewritten values must have the same type");
+  getAvailableVals(AV)[BB] = V;
+}
+
+/// IsEquivalentPHI - Check if PHI has the same incoming value as specified
+/// in ValueMapping for each predecessor block.
+static bool IsEquivalentPHI(PHINode *PHI,
+                            DenseMap<BasicBlock*, Value*> &ValueMapping) {
+  unsigned PHINumValues = PHI->getNumIncomingValues();
+  if (PHINumValues != ValueMapping.size())
+    return false;
+
+  // Scan the phi to see if it matches.
+  for (unsigned i = 0, e = PHINumValues; i != e; ++i)
+    if (ValueMapping[PHI->getIncomingBlock(i)] !=
+        PHI->getIncomingValue(i)) {
+      return false;
+    }
+
+  return true;
+}
+
+/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
+/// live at the end of the specified block.
+Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) {
+  Value *Res = GetValueAtEndOfBlockInternal(BB);
+  return Res;
+}
+
+/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that
+/// is live in the middle of the specified block.
+///
+/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one
+/// important case: if there is a definition of the rewritten value after the
+/// 'use' in BB.  Consider code like this:
+///
+///      X1 = ...
+///   SomeBB:
+///      use(X)
+///      X2 = ...
+///      br Cond, SomeBB, OutBB
+///
+/// In this case, there are two values (X1 and X2) added to the AvailableVals
+/// set by the client of the rewriter, and those values are both live out of
+/// their respective blocks.  However, the use of X happens in the *middle* of
+/// a block.  Because of this, we need to insert a new PHI node in SomeBB to
+/// merge the appropriate values, and this value isn't live out of the block.
+///
+Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
+  // If there is no definition of the renamed variable in this block, just use
+  // GetValueAtEndOfBlock to do our work.
+  if (!HasValueForBlock(BB))
+    return GetValueAtEndOfBlock(BB);
+
+  // Otherwise, we have the hard case.  Get the live-in values for each
+  // predecessor.
+  SmallVector<std::pair<BasicBlock*, Value*>, 8> PredValues;
+  Value *SingularValue = 0;
+
+  // We can get our predecessor info by walking the pred_iterator list, but it
+  // is relatively slow.  If we already have PHI nodes in this block, walk one
+  // of them to get the predecessor list instead.
+  if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
+    for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) {
+      BasicBlock *PredBB = SomePhi->getIncomingBlock(i);
+      Value *PredVal = GetValueAtEndOfBlock(PredBB);
+      PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+      // Compute SingularValue.
+      if (i == 0)
+        SingularValue = PredVal;
+      else if (PredVal != SingularValue)
+        SingularValue = 0;
+    }
+  } else {
+    bool isFirstPred = true;
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *PredBB = *PI;
+      Value *PredVal = GetValueAtEndOfBlock(PredBB);
+      PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+      // Compute SingularValue.
+      if (isFirstPred) {
+        SingularValue = PredVal;
+        isFirstPred = false;
+      } else if (PredVal != SingularValue)
+        SingularValue = 0;
+    }
+  }
+
+  // If there are no predecessors, just return undef.
+  if (PredValues.empty())
+    return UndefValue::get(ProtoType);
+
+  // Otherwise, if all the merged values are the same, just use it.
+  if (SingularValue != 0)
+    return SingularValue;
+
+  // Otherwise, we do need a PHI: check to see if we already have one available
+  // in this block that produces the right value.
+  if (isa<PHINode>(BB->begin())) {
+    DenseMap<BasicBlock*, Value*> ValueMapping(PredValues.begin(),
+                                               PredValues.end());
+    PHINode *SomePHI;
+    for (BasicBlock::iterator It = BB->begin();
+         (SomePHI = dyn_cast<PHINode>(It)); ++It) {
+      if (IsEquivalentPHI(SomePHI, ValueMapping))
+        return SomePHI;
+    }
+  }
+
+  // Ok, we have no way out, insert a new one now.
+  PHINode *InsertedPHI = PHINode::Create(ProtoType, PredValues.size(),
+                                         ProtoName, &BB->front());
+
+  // Fill in all the predecessors of the PHI.
+  for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+    InsertedPHI->addIncoming(PredValues[i].second, PredValues[i].first);
+
+  // See if the PHI node can be merged to a single value.  This can happen in
+  // loop cases when we get a PHI of itself and one other value.
+  if (Value *V = SimplifyInstruction(InsertedPHI)) {
+    InsertedPHI->eraseFromParent();
+    return V;
+  }
+
+  // Set the DebugLoc of the inserted PHI, if available.
+  DebugLoc DL;
+  if (const Instruction *I = BB->getFirstNonPHI())
+      DL = I->getDebugLoc();
+  InsertedPHI->setDebugLoc(DL);
+
+  // If the client wants to know about all new instructions, tell it.
+  if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+  DEBUG(dbgs() << "  Inserted PHI: " << *InsertedPHI << "\n");
+  return InsertedPHI;
+}
+
+/// RewriteUse - Rewrite a use of the symbolic value.  This handles PHI nodes,
+/// which use their value in the corresponding predecessor.
+void SSAUpdater::RewriteUse(Use &U) {
+  Instruction *User = cast<Instruction>(U.getUser());
+
+  Value *V;
+  if (PHINode *UserPN = dyn_cast<PHINode>(User))
+    V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+  else
+    V = GetValueInMiddleOfBlock(User->getParent());
+
+  // Notify that users of the existing value that it is being replaced.
+  Value *OldVal = U.get();
+  if (OldVal != V && OldVal->hasValueHandle())
+    ValueHandleBase::ValueIsRAUWd(OldVal, V);
+
+  U.set(V);
+}
+
+/// RewriteUseAfterInsertions - Rewrite a use, just like RewriteUse.  However,
+/// this version of the method can rewrite uses in the same block as a
+/// definition, because it assumes that all uses of a value are below any
+/// inserted values.
+void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
+  Instruction *User = cast<Instruction>(U.getUser());
+  
+  Value *V;
+  if (PHINode *UserPN = dyn_cast<PHINode>(User))
+    V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+  else
+    V = GetValueAtEndOfBlock(User->getParent());
+  
+  U.set(V);
+}
+
+/// SSAUpdaterTraits<SSAUpdater> - Traits for the SSAUpdaterImpl template,
+/// specialized for SSAUpdater.
+namespace llvm {
+template<>
+class SSAUpdaterTraits<SSAUpdater> {
+public:
+  typedef BasicBlock BlkT;
+  typedef Value *ValT;
+  typedef PHINode PhiT;
+
+  typedef succ_iterator BlkSucc_iterator;
+  static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); }
+  static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); }
+
+  class PHI_iterator {
+  private:
+    PHINode *PHI;
+    unsigned idx;
+
+  public:
+    explicit PHI_iterator(PHINode *P) // begin iterator
+      : PHI(P), idx(0) {}
+    PHI_iterator(PHINode *P, bool) // end iterator
+      : PHI(P), idx(PHI->getNumIncomingValues()) {}
+
+    PHI_iterator &operator++() { ++idx; return *this; } 
+    bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
+    bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+    Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
+    BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
+  };
+
+  static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+  static PHI_iterator PHI_end(PhiT *PHI) {
+    return PHI_iterator(PHI, true);
+  }
+
+  /// FindPredecessorBlocks - Put the predecessors of Info->BB into the Preds
+  /// vector, set Info->NumPreds, and allocate space in Info->Preds.
+  static void FindPredecessorBlocks(BasicBlock *BB,
+                                    SmallVectorImpl<BasicBlock*> *Preds) {
+    // We can get our predecessor info by walking the pred_iterator list,
+    // but it is relatively slow.  If we already have PHI nodes in this
+    // block, walk one of them to get the predecessor list instead.
+    if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
+      for (unsigned PI = 0, E = SomePhi->getNumIncomingValues(); PI != E; ++PI)
+        Preds->push_back(SomePhi->getIncomingBlock(PI));
+    } else {
+      for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+        Preds->push_back(*PI);
+    }
+  }
+
+  /// GetUndefVal - Get an undefined value of the same type as the value
+  /// being handled.
+  static Value *GetUndefVal(BasicBlock *BB, SSAUpdater *Updater) {
+    return UndefValue::get(Updater->ProtoType);
+  }
+
+  /// CreateEmptyPHI - Create a new PHI instruction in the specified block.
+  /// Reserve space for the operands but do not fill them in yet.
+  static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds,
+                               SSAUpdater *Updater) {
+    PHINode *PHI = PHINode::Create(Updater->ProtoType, NumPreds,
+                                   Updater->ProtoName, &BB->front());
+    return PHI;
+  }
+
+  /// AddPHIOperand - Add the specified value as an operand of the PHI for
+  /// the specified predecessor block.
+  static void AddPHIOperand(PHINode *PHI, Value *Val, BasicBlock *Pred) {
+    PHI->addIncoming(Val, Pred);
+  }
+
+  /// InstrIsPHI - Check if an instruction is a PHI.
+  ///
+  static PHINode *InstrIsPHI(Instruction *I) {
+    return dyn_cast<PHINode>(I);
+  }
+
+  /// ValueIsPHI - Check if a value is a PHI.
+  ///
+  static PHINode *ValueIsPHI(Value *Val, SSAUpdater *Updater) {
+    return dyn_cast<PHINode>(Val);
+  }
+
+  /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+  /// operands, i.e., it was just added.
+  static PHINode *ValueIsNewPHI(Value *Val, SSAUpdater *Updater) {
+    PHINode *PHI = ValueIsPHI(Val, Updater);
+    if (PHI && PHI->getNumIncomingValues() == 0)
+      return PHI;
+    return 0;
+  }
+
+  /// GetPHIValue - For the specified PHI instruction, return the value
+  /// that it defines.
+  static Value *GetPHIValue(PHINode *PHI) {
+    return PHI;
+  }
+};
+
+} // End llvm namespace
+
+/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry
+/// for the specified BB and if so, return it.  If not, construct SSA form by
+/// first calculating the required placement of PHIs and then inserting new
+/// PHIs where needed.
+Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
+  AvailableValsTy &AvailableVals = getAvailableVals(AV);
+  if (Value *V = AvailableVals[BB])
+    return V;
+
+  SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+  return Impl.GetValue(BB);
+}
+
+//===----------------------------------------------------------------------===//
+// LoadAndStorePromoter Implementation
+//===----------------------------------------------------------------------===//
+
+LoadAndStorePromoter::
+LoadAndStorePromoter(const SmallVectorImpl<Instruction*> &Insts,
+                     SSAUpdater &S, StringRef BaseName) : SSA(S) {
+  if (Insts.empty()) return;
+  
+  Value *SomeVal;
+  if (LoadInst *LI = dyn_cast<LoadInst>(Insts[0]))
+    SomeVal = LI;
+  else
+    SomeVal = cast<StoreInst>(Insts[0])->getOperand(0);
+
+  if (BaseName.empty())
+    BaseName = SomeVal->getName();
+  SSA.Initialize(SomeVal->getType(), BaseName);
+}
+
+
+void LoadAndStorePromoter::
+run(const SmallVectorImpl<Instruction*> &Insts) const {
+  
+  // First step: bucket up uses of the alloca by the block they occur in.
+  // This is important because we have to handle multiple defs/uses in a block
+  // ourselves: SSAUpdater is purely for cross-block references.
+  DenseMap<BasicBlock*, TinyPtrVector<Instruction*> > UsesByBlock;
+  
+  for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
+    Instruction *User = Insts[i];
+    UsesByBlock[User->getParent()].push_back(User);
+  }
+  
+  // Okay, now we can iterate over all the blocks in the function with uses,
+  // processing them.  Keep track of which loads are loading a live-in value.
+  // Walk the uses in the use-list order to be determinstic.
+  SmallVector<LoadInst*, 32> LiveInLoads;
+  DenseMap<Value*, Value*> ReplacedLoads;
+  
+  for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
+    Instruction *User = Insts[i];
+    BasicBlock *BB = User->getParent();
+    TinyPtrVector<Instruction*> &BlockUses = UsesByBlock[BB];
+    
+    // If this block has already been processed, ignore this repeat use.
+    if (BlockUses.empty()) continue;
+    
+    // Okay, this is the first use in the block.  If this block just has a
+    // single user in it, we can rewrite it trivially.
+    if (BlockUses.size() == 1) {
+      // If it is a store, it is a trivial def of the value in the block.
+      if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+        updateDebugInfo(SI);
+        SSA.AddAvailableValue(BB, SI->getOperand(0));
+      } else 
+        // Otherwise it is a load, queue it to rewrite as a live-in load.
+        LiveInLoads.push_back(cast<LoadInst>(User));
+      BlockUses.clear();
+      continue;
+    }
+    
+    // Otherwise, check to see if this block is all loads.
+    bool HasStore = false;
+    for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) {
+      if (isa<StoreInst>(BlockUses[i])) {
+        HasStore = true;
+        break;
+      }
+    }
+    
+    // If so, we can queue them all as live in loads.  We don't have an
+    // efficient way to tell which on is first in the block and don't want to
+    // scan large blocks, so just add all loads as live ins.
+    if (!HasStore) {
+      for (unsigned i = 0, e = BlockUses.size(); i != e; ++i)
+        LiveInLoads.push_back(cast<LoadInst>(BlockUses[i]));
+      BlockUses.clear();
+      continue;
+    }
+    
+    // Otherwise, we have mixed loads and stores (or just a bunch of stores).
+    // Since SSAUpdater is purely for cross-block values, we need to determine
+    // the order of these instructions in the block.  If the first use in the
+    // block is a load, then it uses the live in value.  The last store defines
+    // the live out value.  We handle this by doing a linear scan of the block.
+    Value *StoredValue = 0;
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) {
+      if (LoadInst *L = dyn_cast<LoadInst>(II)) {
+        // If this is a load from an unrelated pointer, ignore it.
+        if (!isInstInList(L, Insts)) continue;
+        
+        // If we haven't seen a store yet, this is a live in use, otherwise
+        // use the stored value.
+        if (StoredValue) {
+          replaceLoadWithValue(L, StoredValue);
+          L->replaceAllUsesWith(StoredValue);
+          ReplacedLoads[L] = StoredValue;
+        } else {
+          LiveInLoads.push_back(L);
+        }
+        continue;
+      }
+      
+      if (StoreInst *SI = dyn_cast<StoreInst>(II)) {
+        // If this is a store to an unrelated pointer, ignore it.
+        if (!isInstInList(SI, Insts)) continue;
+        updateDebugInfo(SI);
+
+        // Remember that this is the active value in the block.
+        StoredValue = SI->getOperand(0);
+      }
+    }
+    
+    // The last stored value that happened is the live-out for the block.
+    assert(StoredValue && "Already checked that there is a store in block");
+    SSA.AddAvailableValue(BB, StoredValue);
+    BlockUses.clear();
+  }
+  
+  // Okay, now we rewrite all loads that use live-in values in the loop,
+  // inserting PHI nodes as necessary.
+  for (unsigned i = 0, e = LiveInLoads.size(); i != e; ++i) {
+    LoadInst *ALoad = LiveInLoads[i];
+    Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent());
+    replaceLoadWithValue(ALoad, NewVal);
+
+    // Avoid assertions in unreachable code.
+    if (NewVal == ALoad) NewVal = UndefValue::get(NewVal->getType());
+    ALoad->replaceAllUsesWith(NewVal);
+    ReplacedLoads[ALoad] = NewVal;
+  }
+  
+  // Allow the client to do stuff before we start nuking things.
+  doExtraRewritesBeforeFinalDeletion();
+  
+  // Now that everything is rewritten, delete the old instructions from the
+  // function.  They should all be dead now.
+  for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
+    Instruction *User = Insts[i];
+    
+    // If this is a load that still has uses, then the load must have been added
+    // as a live value in the SSAUpdate data structure for a block (e.g. because
+    // the loaded value was stored later).  In this case, we need to recursively
+    // propagate the updates until we get to the real value.
+    if (!User->use_empty()) {
+      Value *NewVal = ReplacedLoads[User];
+      assert(NewVal && "not a replaced load?");
+      
+      // Propagate down to the ultimate replacee.  The intermediately loads
+      // could theoretically already have been deleted, so we don't want to
+      // dereference the Value*'s.
+      DenseMap<Value*, Value*>::iterator RLI = ReplacedLoads.find(NewVal);
+      while (RLI != ReplacedLoads.end()) {
+        NewVal = RLI->second;
+        RLI = ReplacedLoads.find(NewVal);
+      }
+      
+      replaceLoadWithValue(cast<LoadInst>(User), NewVal);
+      User->replaceAllUsesWith(NewVal);
+    }
+    
+    instructionDeleted(User);
+    User->eraseFromParent();
+  }
+}
+
+bool
+LoadAndStorePromoter::isInstInList(Instruction *I,
+                                   const SmallVectorImpl<Instruction*> &Insts)
+                                   const {
+  return std::find(Insts.begin(), Insts.end(), I) != Insts.end();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
new file mode 100644
index 000000000000..681bf9c2b7a4
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -0,0 +1,4072 @@
+//===- SimplifyCFG.cpp - Code to perform CFG simplification ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Peephole optimize the CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplifycfg"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/NoFolder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <map>
+#include <set>
+using namespace llvm;
+
+static cl::opt<unsigned>
+PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1),
+   cl::desc("Control the amount of phi node folding to perform (default = 1)"));
+
+static cl::opt<bool>
+DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false),
+       cl::desc("Duplicate return instructions into unconditional branches"));
+
+static cl::opt<bool>
+SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
+       cl::desc("Sink common instructions down to the end block"));
+
+STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
+STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
+STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block");
+STATISTIC(NumSpeculations, "Number of speculative executed instructions");
+
+namespace {
+  /// ValueEqualityComparisonCase - Represents a case of a switch.
+  struct ValueEqualityComparisonCase {
+    ConstantInt *Value;
+    BasicBlock *Dest;
+
+    ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest)
+      : Value(Value), Dest(Dest) {}
+
+    bool operator<(ValueEqualityComparisonCase RHS) const {
+      // Comparing pointers is ok as we only rely on the order for uniquing.
+      return Value < RHS.Value;
+    }
+
+    bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
+  };
+
+class SimplifyCFGOpt {
+  const TargetTransformInfo &TTI;
+  const DataLayout *const TD;
+
+  Value *isValueEqualityComparison(TerminatorInst *TI);
+  BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
+                               std::vector<ValueEqualityComparisonCase> &Cases);
+  bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
+                                                     BasicBlock *Pred,
+                                                     IRBuilder<> &Builder);
+  bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+                                           IRBuilder<> &Builder);
+
+  bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
+  bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder);
+  bool SimplifyUnreachable(UnreachableInst *UI);
+  bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
+  bool SimplifyIndirectBr(IndirectBrInst *IBI);
+  bool SimplifyUncondBranch(BranchInst *BI, IRBuilder <> &Builder);
+  bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder);
+
+public:
+  SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout *TD)
+      : TTI(TTI), TD(TD) {}
+  bool run(BasicBlock *BB);
+};
+}
+
+/// SafeToMergeTerminators - Return true if it is safe to merge these two
+/// terminator instructions together.
+///
+static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
+  if (SI1 == SI2) return false;  // Can't merge with self!
+
+  // It is not safe to merge these two switch instructions if they have a common
+  // successor, and if that successor has a PHI node, and if *that* PHI node has
+  // conflicting incoming values from the two switch blocks.
+  BasicBlock *SI1BB = SI1->getParent();
+  BasicBlock *SI2BB = SI2->getParent();
+  SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+
+  for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
+    if (SI1Succs.count(*I))
+      for (BasicBlock::iterator BBI = (*I)->begin();
+           isa<PHINode>(BBI); ++BBI) {
+        PHINode *PN = cast<PHINode>(BBI);
+        if (PN->getIncomingValueForBlock(SI1BB) !=
+            PN->getIncomingValueForBlock(SI2BB))
+          return false;
+      }
+
+  return true;
+}
+
+/// isProfitableToFoldUnconditional - Return true if it is safe and profitable
+/// to merge these two terminator instructions together, where SI1 is an
+/// unconditional branch. PhiNodes will store all PHI nodes in common
+/// successors.
+///
+static bool isProfitableToFoldUnconditional(BranchInst *SI1,
+                                          BranchInst *SI2,
+                                          Instruction *Cond,
+                                          SmallVectorImpl<PHINode*> &PhiNodes) {
+  if (SI1 == SI2) return false;  // Can't merge with self!
+  assert(SI1->isUnconditional() && SI2->isConditional());
+
+  // We fold the unconditional branch if we can easily update all PHI nodes in
+  // common successors:
+  // 1> We have a constant incoming value for the conditional branch;
+  // 2> We have "Cond" as the incoming value for the unconditional branch;
+  // 3> SI2->getCondition() and Cond have same operands.
+  CmpInst *Ci2 = dyn_cast<CmpInst>(SI2->getCondition());
+  if (!Ci2) return false;
+  if (!(Cond->getOperand(0) == Ci2->getOperand(0) &&
+        Cond->getOperand(1) == Ci2->getOperand(1)) &&
+      !(Cond->getOperand(0) == Ci2->getOperand(1) &&
+        Cond->getOperand(1) == Ci2->getOperand(0)))
+    return false;
+
+  BasicBlock *SI1BB = SI1->getParent();
+  BasicBlock *SI2BB = SI2->getParent();
+  SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+  for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
+    if (SI1Succs.count(*I))
+      for (BasicBlock::iterator BBI = (*I)->begin();
+           isa<PHINode>(BBI); ++BBI) {
+        PHINode *PN = cast<PHINode>(BBI);
+        if (PN->getIncomingValueForBlock(SI1BB) != Cond ||
+            !isa<ConstantInt>(PN->getIncomingValueForBlock(SI2BB)))
+          return false;
+        PhiNodes.push_back(PN);
+      }
+  return true;
+}
+
+/// AddPredecessorToBlock - Update PHI nodes in Succ to indicate that there will
+/// now be entries in it from the 'NewPred' block.  The values that will be
+/// flowing into the PHI nodes will be the same as those coming in from
+/// ExistPred, an existing predecessor of Succ.
+static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
+                                  BasicBlock *ExistPred) {
+  if (!isa<PHINode>(Succ->begin())) return; // Quick exit if nothing to do
+
+  PHINode *PN;
+  for (BasicBlock::iterator I = Succ->begin();
+       (PN = dyn_cast<PHINode>(I)); ++I)
+    PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred);
+}
+
+
+/// GetIfCondition - Given a basic block (BB) with two predecessors (and at
+/// least one PHI node in it), check to see if the merge at this block is due
+/// to an "if condition".  If so, return the boolean condition that determines
+/// which entry into BB will be taken.  Also, return by references the block
+/// that will be entered from if the condition is true, and the block that will
+/// be entered if the condition is false.
+///
+/// This does no checking to see if the true/false blocks have large or unsavory
+/// instructions in them.
+static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
+                             BasicBlock *&IfFalse) {
+  PHINode *SomePHI = cast<PHINode>(BB->begin());
+  assert(SomePHI->getNumIncomingValues() == 2 &&
+         "Function can only handle blocks with 2 predecessors!");
+  BasicBlock *Pred1 = SomePHI->getIncomingBlock(0);
+  BasicBlock *Pred2 = SomePHI->getIncomingBlock(1);
+
+  // We can only handle branches.  Other control flow will be lowered to
+  // branches if possible anyway.
+  BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
+  BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
+  if (Pred1Br == 0 || Pred2Br == 0)
+    return 0;
+
+  // Eliminate code duplication by ensuring that Pred1Br is conditional if
+  // either are.
+  if (Pred2Br->isConditional()) {
+    // If both branches are conditional, we don't have an "if statement".  In
+    // reality, we could transform this case, but since the condition will be
+    // required anyway, we stand no chance of eliminating it, so the xform is
+    // probably not profitable.
+    if (Pred1Br->isConditional())
+      return 0;
+
+    std::swap(Pred1, Pred2);
+    std::swap(Pred1Br, Pred2Br);
+  }
+
+  if (Pred1Br->isConditional()) {
+    // The only thing we have to watch out for here is to make sure that Pred2
+    // doesn't have incoming edges from other blocks.  If it does, the condition
+    // doesn't dominate BB.
+    if (Pred2->getSinglePredecessor() == 0)
+      return 0;
+
+    // If we found a conditional branch predecessor, make sure that it branches
+    // to BB and Pred2Br.  If it doesn't, this isn't an "if statement".
+    if (Pred1Br->getSuccessor(0) == BB &&
+        Pred1Br->getSuccessor(1) == Pred2) {
+      IfTrue = Pred1;
+      IfFalse = Pred2;
+    } else if (Pred1Br->getSuccessor(0) == Pred2 &&
+               Pred1Br->getSuccessor(1) == BB) {
+      IfTrue = Pred2;
+      IfFalse = Pred1;
+    } else {
+      // We know that one arm of the conditional goes to BB, so the other must
+      // go somewhere unrelated, and this must not be an "if statement".
+      return 0;
+    }
+
+    return Pred1Br->getCondition();
+  }
+
+  // Ok, if we got here, both predecessors end with an unconditional branch to
+  // BB.  Don't panic!  If both blocks only have a single (identical)
+  // predecessor, and THAT is a conditional branch, then we're all ok!
+  BasicBlock *CommonPred = Pred1->getSinglePredecessor();
+  if (CommonPred == 0 || CommonPred != Pred2->getSinglePredecessor())
+    return 0;
+
+  // Otherwise, if this is a conditional branch, then we can use it!
+  BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
+  if (BI == 0) return 0;
+
+  assert(BI->isConditional() && "Two successors but not conditional?");
+  if (BI->getSuccessor(0) == Pred1) {
+    IfTrue = Pred1;
+    IfFalse = Pred2;
+  } else {
+    IfTrue = Pred2;
+    IfFalse = Pred1;
+  }
+  return BI->getCondition();
+}
+
+/// ComputeSpeculuationCost - Compute an abstract "cost" of speculating the
+/// given instruction, which is assumed to be safe to speculate. 1 means
+/// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive.
+static unsigned ComputeSpeculationCost(const User *I) {
+  assert(isSafeToSpeculativelyExecute(I) &&
+         "Instruction is not safe to speculatively execute!");
+  switch (Operator::getOpcode(I)) {
+  default:
+    // In doubt, be conservative.
+    return UINT_MAX;
+  case Instruction::GetElementPtr:
+    // GEPs are cheap if all indices are constant.
+    if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+      return UINT_MAX;
+    return 1;
+  case Instruction::Load:
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::ICmp:
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    return 1; // These are all cheap.
+
+  case Instruction::Call:
+  case Instruction::Select:
+    return 2;
+  }
+}
+
+/// DominatesMergePoint - If we have a merge point of an "if condition" as
+/// accepted above, return true if the specified value dominates the block.  We
+/// don't handle the true generality of domination here, just a special case
+/// which works well enough for us.
+///
+/// If AggressiveInsts is non-null, and if V does not dominate BB, we check to
+/// see if V (which must be an instruction) and its recursive operands
+/// that do not dominate BB have a combined cost lower than CostRemaining and
+/// are non-trapping.  If both are true, the instruction is inserted into the
+/// set and true is returned.
+///
+/// The cost for most non-trapping instructions is defined as 1 except for
+/// Select whose cost is 2.
+///
+/// After this function returns, CostRemaining is decreased by the cost of
+/// V plus its non-dominating operands.  If that cost is greater than
+/// CostRemaining, false is returned and CostRemaining is undefined.
+static bool DominatesMergePoint(Value *V, BasicBlock *BB,
+                                SmallPtrSet<Instruction*, 4> *AggressiveInsts,
+                                unsigned &CostRemaining) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) {
+    // Non-instructions all dominate instructions, but not all constantexprs
+    // can be executed unconditionally.
+    if (ConstantExpr *C = dyn_cast<ConstantExpr>(V))
+      if (C->canTrap())
+        return false;
+    return true;
+  }
+  BasicBlock *PBB = I->getParent();
+
+  // We don't want to allow weird loops that might have the "if condition" in
+  // the bottom of this block.
+  if (PBB == BB) return false;
+
+  // If this instruction is defined in a block that contains an unconditional
+  // branch to BB, then it must be in the 'conditional' part of the "if
+  // statement".  If not, it definitely dominates the region.
+  BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator());
+  if (BI == 0 || BI->isConditional() || BI->getSuccessor(0) != BB)
+    return true;
+
+  // If we aren't allowing aggressive promotion anymore, then don't consider
+  // instructions in the 'if region'.
+  if (AggressiveInsts == 0) return false;
+
+  // If we have seen this instruction before, don't count it again.
+  if (AggressiveInsts->count(I)) return true;
+
+  // Okay, it looks like the instruction IS in the "condition".  Check to
+  // see if it's a cheap instruction to unconditionally compute, and if it
+  // only uses stuff defined outside of the condition.  If so, hoist it out.
+  if (!isSafeToSpeculativelyExecute(I))
+    return false;
+
+  unsigned Cost = ComputeSpeculationCost(I);
+
+  if (Cost > CostRemaining)
+    return false;
+
+  CostRemaining -= Cost;
+
+  // Okay, we can only really hoist these out if their operands do
+  // not take us over the cost threshold.
+  for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+    if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining))
+      return false;
+  // Okay, it's safe to do this!  Remember this instruction.
+  AggressiveInsts->insert(I);
+  return true;
+}
+
+/// GetConstantInt - Extract ConstantInt from value, looking through IntToPtr
+/// and PointerNullValue. Return NULL if value is not a constant int.
+static ConstantInt *GetConstantInt(Value *V, const DataLayout *TD) {
+  // Normal constant int.
+  ConstantInt *CI = dyn_cast<ConstantInt>(V);
+  if (CI || !TD || !isa<Constant>(V) || !V->getType()->isPointerTy())
+    return CI;
+
+  // This is some kind of pointer constant. Turn it into a pointer-sized
+  // ConstantInt if possible.
+  IntegerType *PtrTy = cast<IntegerType>(TD->getIntPtrType(V->getType()));
+
+  // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
+  if (isa<ConstantPointerNull>(V))
+    return ConstantInt::get(PtrTy, 0);
+
+  // IntToPtr const int.
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::IntToPtr)
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(0))) {
+        // The constant is very likely to have the right type already.
+        if (CI->getType() == PtrTy)
+          return CI;
+        else
+          return cast<ConstantInt>
+            (ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
+      }
+  return 0;
+}
+
+/// GatherConstantCompares - Given a potentially 'or'd or 'and'd together
+/// collection of icmp eq/ne instructions that compare a value against a
+/// constant, return the value being compared, and stick the constant into the
+/// Values vector.
+static Value *
+GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
+                       const DataLayout *TD, bool isEQ, unsigned &UsedICmps) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return 0;
+
+  // If this is an icmp against a constant, handle this as one of the cases.
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
+    if (ConstantInt *C = GetConstantInt(I->getOperand(1), TD)) {
+      if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ:ICmpInst::ICMP_NE)) {
+        UsedICmps++;
+        Vals.push_back(C);
+        return I->getOperand(0);
+      }
+
+      // If we have "x ult 3" comparison, for example, then we can add 0,1,2 to
+      // the set.
+      ConstantRange Span =
+        ConstantRange::makeICmpRegion(ICI->getPredicate(), C->getValue());
+
+      // If this is an and/!= check then we want to optimize "x ugt 2" into
+      // x != 0 && x != 1.
+      if (!isEQ)
+        Span = Span.inverse();
+
+      // If there are a ton of values, we don't want to make a ginormous switch.
+      if (Span.getSetSize().ugt(8) || Span.isEmptySet())
+        return 0;
+
+      for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+        Vals.push_back(ConstantInt::get(V->getContext(), Tmp));
+      UsedICmps++;
+      return I->getOperand(0);
+    }
+    return 0;
+  }
+
+  // Otherwise, we can only handle an | or &, depending on isEQ.
+  if (I->getOpcode() != (isEQ ? Instruction::Or : Instruction::And))
+    return 0;
+
+  unsigned NumValsBeforeLHS = Vals.size();
+  unsigned UsedICmpsBeforeLHS = UsedICmps;
+  if (Value *LHS = GatherConstantCompares(I->getOperand(0), Vals, Extra, TD,
+                                          isEQ, UsedICmps)) {
+    unsigned NumVals = Vals.size();
+    unsigned UsedICmpsBeforeRHS = UsedICmps;
+    if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, TD,
+                                            isEQ, UsedICmps)) {
+      if (LHS == RHS)
+        return LHS;
+      Vals.resize(NumVals);
+      UsedICmps = UsedICmpsBeforeRHS;
+    }
+
+    // The RHS of the or/and can't be folded in and we haven't used "Extra" yet,
+    // set it and return success.
+    if (Extra == 0 || Extra == I->getOperand(1)) {
+      Extra = I->getOperand(1);
+      return LHS;
+    }
+
+    Vals.resize(NumValsBeforeLHS);
+    UsedICmps = UsedICmpsBeforeLHS;
+    return 0;
+  }
+
+  // If the LHS can't be folded in, but Extra is available and RHS can, try to
+  // use LHS as Extra.
+  if (Extra == 0 || Extra == I->getOperand(0)) {
+    Value *OldExtra = Extra;
+    Extra = I->getOperand(0);
+    if (Value *RHS = GatherConstantCompares(I->getOperand(1), Vals, Extra, TD,
+                                            isEQ, UsedICmps))
+      return RHS;
+    assert(Vals.size() == NumValsBeforeLHS);
+    Extra = OldExtra;
+  }
+
+  return 0;
+}
+
+static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
+  Instruction *Cond = 0;
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    Cond = dyn_cast<Instruction>(SI->getCondition());
+  } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isConditional())
+      Cond = dyn_cast<Instruction>(BI->getCondition());
+  } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(TI)) {
+    Cond = dyn_cast<Instruction>(IBI->getAddress());
+  }
+
+  TI->eraseFromParent();
+  if (Cond) RecursivelyDeleteTriviallyDeadInstructions(Cond);
+}
+
+/// isValueEqualityComparison - Return true if the specified terminator checks
+/// to see if a value is equal to constant integer value.
+Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
+  Value *CV = 0;
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    // Do not permit merging of large switch instructions into their
+    // predecessors unless there is only one predecessor.
+    if (SI->getNumSuccessors()*std::distance(pred_begin(SI->getParent()),
+                                             pred_end(SI->getParent())) <= 128)
+      CV = SI->getCondition();
+  } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
+    if (BI->isConditional() && BI->getCondition()->hasOneUse())
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition()))
+        if ((ICI->getPredicate() == ICmpInst::ICMP_EQ ||
+             ICI->getPredicate() == ICmpInst::ICMP_NE) &&
+            GetConstantInt(ICI->getOperand(1), TD))
+          CV = ICI->getOperand(0);
+
+  // Unwrap any lossless ptrtoint cast.
+  if (TD && CV && CV->getType() == TD->getIntPtrType(CV->getContext()))
+    if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV))
+      CV = PTII->getOperand(0);
+  return CV;
+}
+
+/// GetValueEqualityComparisonCases - Given a value comparison instruction,
+/// decode all of the 'cases' that it represents and return the 'default' block.
+BasicBlock *SimplifyCFGOpt::
+GetValueEqualityComparisonCases(TerminatorInst *TI,
+                                std::vector<ValueEqualityComparisonCase>
+                                                                       &Cases) {
+  if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    Cases.reserve(SI->getNumCases());
+    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
+      Cases.push_back(ValueEqualityComparisonCase(i.getCaseValue(),
+                                                  i.getCaseSuccessor()));
+    return SI->getDefaultDest();
+  }
+
+  BranchInst *BI = cast<BranchInst>(TI);
+  ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+  BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE);
+  Cases.push_back(ValueEqualityComparisonCase(GetConstantInt(ICI->getOperand(1),
+                                                             TD),
+                                              Succ));
+  return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
+}
+
+
+/// EliminateBlockCases - Given a vector of bb/value pairs, remove any entries
+/// in the list that match the specified block.
+static void EliminateBlockCases(BasicBlock *BB,
+                              std::vector<ValueEqualityComparisonCase> &Cases) {
+  Cases.erase(std::remove(Cases.begin(), Cases.end(), BB), Cases.end());
+}
+
+/// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as
+/// well.
+static bool
+ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
+              std::vector<ValueEqualityComparisonCase > &C2) {
+  std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2;
+
+  // Make V1 be smaller than V2.
+  if (V1->size() > V2->size())
+    std::swap(V1, V2);
+
+  if (V1->size() == 0) return false;
+  if (V1->size() == 1) {
+    // Just scan V2.
+    ConstantInt *TheVal = (*V1)[0].Value;
+    for (unsigned i = 0, e = V2->size(); i != e; ++i)
+      if (TheVal == (*V2)[i].Value)
+        return true;
+  }
+
+  // Otherwise, just sort both lists and compare element by element.
+  array_pod_sort(V1->begin(), V1->end());
+  array_pod_sort(V2->begin(), V2->end());
+  unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
+  while (i1 != e1 && i2 != e2) {
+    if ((*V1)[i1].Value == (*V2)[i2].Value)
+      return true;
+    if ((*V1)[i1].Value < (*V2)[i2].Value)
+      ++i1;
+    else
+      ++i2;
+  }
+  return false;
+}
+
+/// SimplifyEqualityComparisonWithOnlyPredecessor - If TI is known to be a
+/// terminator instruction and its block is known to only have a single
+/// predecessor block, check to see if that predecessor is also a value
+/// comparison with the same value, and if that comparison determines the
+/// outcome of this comparison.  If so, simplify TI.  This does a very limited
+/// form of jump threading.
+bool SimplifyCFGOpt::
+SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
+                                              BasicBlock *Pred,
+                                              IRBuilder<> &Builder) {
+  Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
+  if (!PredVal) return false;  // Not a value comparison in predecessor.
+
+  Value *ThisVal = isValueEqualityComparison(TI);
+  assert(ThisVal && "This isn't a value comparison!!");
+  if (ThisVal != PredVal) return false;  // Different predicates.
+
+  // TODO: Preserve branch weight metadata, similarly to how
+  // FoldValueComparisonIntoPredecessors preserves it.
+
+  // Find out information about when control will move from Pred to TI's block.
+  std::vector<ValueEqualityComparisonCase> PredCases;
+  BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(),
+                                                        PredCases);
+  EliminateBlockCases(PredDef, PredCases);  // Remove default from cases.
+
+  // Find information about how control leaves this block.
+  std::vector<ValueEqualityComparisonCase> ThisCases;
+  BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
+  EliminateBlockCases(ThisDef, ThisCases);  // Remove default from cases.
+
+  // If TI's block is the default block from Pred's comparison, potentially
+  // simplify TI based on this knowledge.
+  if (PredDef == TI->getParent()) {
+    // If we are here, we know that the value is none of those cases listed in
+    // PredCases.  If there are any cases in ThisCases that are in PredCases, we
+    // can simplify TI.
+    if (!ValuesOverlap(PredCases, ThisCases))
+      return false;
+
+    if (isa<BranchInst>(TI)) {
+      // Okay, one of the successors of this condbr is dead.  Convert it to a
+      // uncond br.
+      assert(ThisCases.size() == 1 && "Branch can only have one case!");
+      // Insert the new branch.
+      Instruction *NI = Builder.CreateBr(ThisDef);
+      (void) NI;
+
+      // Remove PHI node entries for the dead edge.
+      ThisCases[0].Dest->removePredecessor(TI->getParent());
+
+      DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+           << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n");
+
+      EraseTerminatorInstAndDCECond(TI);
+      return true;
+    }
+
+    SwitchInst *SI = cast<SwitchInst>(TI);
+    // Okay, TI has cases that are statically dead, prune them away.
+    SmallPtrSet<Constant*, 16> DeadCases;
+    for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+      DeadCases.insert(PredCases[i].Value);
+
+    DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+                 << "Through successor TI: " << *TI);
+
+    // Collect branch weights into a vector.
+    SmallVector<uint32_t, 8> Weights;
+    MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+    bool HasWeight = MD && (MD->getNumOperands() == 2 + SI->getNumCases());
+    if (HasWeight)
+      for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+           ++MD_i) {
+        ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
+        assert(CI);
+        Weights.push_back(CI->getValue().getZExtValue());
+      }
+    for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
+      --i;
+      if (DeadCases.count(i.getCaseValue())) {
+        if (HasWeight) {
+          std::swap(Weights[i.getCaseIndex()+1], Weights.back());
+          Weights.pop_back();
+        }
+        i.getCaseSuccessor()->removePredecessor(TI->getParent());
+        SI->removeCase(i);
+      }
+    }
+    if (HasWeight && Weights.size() >= 2)
+      SI->setMetadata(LLVMContext::MD_prof,
+                      MDBuilder(SI->getParent()->getContext()).
+                      createBranchWeights(Weights));
+
+    DEBUG(dbgs() << "Leaving: " << *TI << "\n");
+    return true;
+  }
+
+  // Otherwise, TI's block must correspond to some matched value.  Find out
+  // which value (or set of values) this is.
+  ConstantInt *TIV = 0;
+  BasicBlock *TIBB = TI->getParent();
+  for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+    if (PredCases[i].Dest == TIBB) {
+      if (TIV != 0)
+        return false;  // Cannot handle multiple values coming to this block.
+      TIV = PredCases[i].Value;
+    }
+  assert(TIV && "No edge from pred to succ?");
+
+  // Okay, we found the one constant that our value can be if we get into TI's
+  // BB.  Find out which successor will unconditionally be branched to.
+  BasicBlock *TheRealDest = 0;
+  for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
+    if (ThisCases[i].Value == TIV) {
+      TheRealDest = ThisCases[i].Dest;
+      break;
+    }
+
+  // If not handled by any explicit cases, it is handled by the default case.
+  if (TheRealDest == 0) TheRealDest = ThisDef;
+
+  // Remove PHI node entries for dead edges.
+  BasicBlock *CheckEdge = TheRealDest;
+  for (succ_iterator SI = succ_begin(TIBB), e = succ_end(TIBB); SI != e; ++SI)
+    if (*SI != CheckEdge)
+      (*SI)->removePredecessor(TIBB);
+    else
+      CheckEdge = 0;
+
+  // Insert the new branch.
+  Instruction *NI = Builder.CreateBr(TheRealDest);
+  (void) NI;
+
+  DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+            << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n");
+
+  EraseTerminatorInstAndDCECond(TI);
+  return true;
+}
+
+namespace {
+  /// ConstantIntOrdering - This class implements a stable ordering of constant
+  /// integers that does not depend on their address.  This is important for
+  /// applications that sort ConstantInt's to ensure uniqueness.
+  struct ConstantIntOrdering {
+    bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
+      return LHS->getValue().ult(RHS->getValue());
+    }
+  };
+}
+
+static int ConstantIntSortPredicate(const void *P1, const void *P2) {
+  const ConstantInt *LHS = *(const ConstantInt*const*)P1;
+  const ConstantInt *RHS = *(const ConstantInt*const*)P2;
+  if (LHS->getValue().ult(RHS->getValue()))
+    return 1;
+  if (LHS->getValue() == RHS->getValue())
+    return 0;
+  return -1;
+}
+
+static inline bool HasBranchWeights(const Instruction* I) {
+  MDNode* ProfMD = I->getMetadata(LLVMContext::MD_prof);
+  if (ProfMD && ProfMD->getOperand(0))
+    if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
+      return MDS->getString().equals("branch_weights");
+
+  return false;
+}
+
+/// Get Weights of a given TerminatorInst, the default weight is at the front
+/// of the vector. If TI is a conditional eq, we need to swap the branch-weight
+/// metadata.
+static void GetBranchWeights(TerminatorInst *TI,
+                             SmallVectorImpl<uint64_t> &Weights) {
+  MDNode* MD = TI->getMetadata(LLVMContext::MD_prof);
+  assert(MD);
+  for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
+    ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(i));
+    assert(CI);
+    Weights.push_back(CI->getValue().getZExtValue());
+  }
+
+  // If TI is a conditional eq, the default case is the false case,
+  // and the corresponding branch-weight data is at index 2. We swap the
+  // default weight to be the first entry.
+  if (BranchInst* BI = dyn_cast<BranchInst>(TI)) {
+    assert(Weights.size() == 2);
+    ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+    if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+      std::swap(Weights.front(), Weights.back());
+  }
+}
+
+/// Sees if any of the weights are too big for a uint32_t, and halves all the
+/// weights if any are.
+static void FitWeights(MutableArrayRef<uint64_t> Weights) {
+  bool Halve = false;
+  for (unsigned i = 0; i < Weights.size(); ++i)
+    if (Weights[i] > UINT_MAX) {
+      Halve = true;
+      break;
+    }
+
+  if (! Halve)
+    return;
+
+  for (unsigned i = 0; i < Weights.size(); ++i)
+    Weights[i] /= 2;
+}
+
+/// FoldValueComparisonIntoPredecessors - The specified terminator is a value
+/// equality comparison instruction (either a switch or a branch on "X == c").
+/// See if any of the predecessors of the terminator block are value comparisons
+/// on the same value.  If so, and if safe to do so, fold them together.
+bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+                                                         IRBuilder<> &Builder) {
+  BasicBlock *BB = TI->getParent();
+  Value *CV = isValueEqualityComparison(TI);  // CondVal
+  assert(CV && "Not a comparison?");
+  bool Changed = false;
+
+  SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB));
+  while (!Preds.empty()) {
+    BasicBlock *Pred = Preds.pop_back_val();
+
+    // See if the predecessor is a comparison with the same value.
+    TerminatorInst *PTI = Pred->getTerminator();
+    Value *PCV = isValueEqualityComparison(PTI);  // PredCondVal
+
+    if (PCV == CV && SafeToMergeTerminators(TI, PTI)) {
+      // Figure out which 'cases' to copy from SI to PSI.
+      std::vector<ValueEqualityComparisonCase> BBCases;
+      BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases);
+
+      std::vector<ValueEqualityComparisonCase> PredCases;
+      BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases);
+
+      // Based on whether the default edge from PTI goes to BB or not, fill in
+      // PredCases and PredDefault with the new switch cases we would like to
+      // build.
+      SmallVector<BasicBlock*, 8> NewSuccessors;
+
+      // Update the branch weight metadata along the way
+      SmallVector<uint64_t, 8> Weights;
+      bool PredHasWeights = HasBranchWeights(PTI);
+      bool SuccHasWeights = HasBranchWeights(TI);
+
+      if (PredHasWeights) {
+        GetBranchWeights(PTI, Weights);
+        // branch-weight metadata is inconsistent here.
+        if (Weights.size() != 1 + PredCases.size())
+          PredHasWeights = SuccHasWeights = false;
+      } else if (SuccHasWeights)
+        // If there are no predecessor weights but there are successor weights,
+        // populate Weights with 1, which will later be scaled to the sum of
+        // successor's weights
+        Weights.assign(1 + PredCases.size(), 1);
+
+      SmallVector<uint64_t, 8> SuccWeights;
+      if (SuccHasWeights) {
+        GetBranchWeights(TI, SuccWeights);
+        // branch-weight metadata is inconsistent here.
+        if (SuccWeights.size() != 1 + BBCases.size())
+          PredHasWeights = SuccHasWeights = false;
+      } else if (PredHasWeights)
+        SuccWeights.assign(1 + BBCases.size(), 1);
+
+      if (PredDefault == BB) {
+        // If this is the default destination from PTI, only the edges in TI
+        // that don't occur in PTI, or that branch to BB will be activated.
+        std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+        for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+          if (PredCases[i].Dest != BB)
+            PTIHandled.insert(PredCases[i].Value);
+          else {
+            // The default destination is BB, we don't need explicit targets.
+            std::swap(PredCases[i], PredCases.back());
+
+            if (PredHasWeights || SuccHasWeights) {
+              // Increase weight for the default case.
+              Weights[0] += Weights[i+1];
+              std::swap(Weights[i+1], Weights.back());
+              Weights.pop_back();
+            }
+
+            PredCases.pop_back();
+            --i; --e;
+          }
+
+        // Reconstruct the new switch statement we will be building.
+        if (PredDefault != BBDefault) {
+          PredDefault->removePredecessor(Pred);
+          PredDefault = BBDefault;
+          NewSuccessors.push_back(BBDefault);
+        }
+
+        unsigned CasesFromPred = Weights.size();
+        uint64_t ValidTotalSuccWeight = 0;
+        for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+          if (!PTIHandled.count(BBCases[i].Value) &&
+              BBCases[i].Dest != BBDefault) {
+            PredCases.push_back(BBCases[i]);
+            NewSuccessors.push_back(BBCases[i].Dest);
+            if (SuccHasWeights || PredHasWeights) {
+              // The default weight is at index 0, so weight for the ith case
+              // should be at index i+1. Scale the cases from successor by
+              // PredDefaultWeight (Weights[0]).
+              Weights.push_back(Weights[0] * SuccWeights[i+1]);
+              ValidTotalSuccWeight += SuccWeights[i+1];
+            }
+          }
+
+        if (SuccHasWeights || PredHasWeights) {
+          ValidTotalSuccWeight += SuccWeights[0];
+          // Scale the cases from predecessor by ValidTotalSuccWeight.
+          for (unsigned i = 1; i < CasesFromPred; ++i)
+            Weights[i] *= ValidTotalSuccWeight;
+          // Scale the default weight by SuccDefaultWeight (SuccWeights[0]).
+          Weights[0] *= SuccWeights[0];
+        }
+      } else {
+        // If this is not the default destination from PSI, only the edges
+        // in SI that occur in PSI with a destination of BB will be
+        // activated.
+        std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+        std::map<ConstantInt*, uint64_t> WeightsForHandled;
+        for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+          if (PredCases[i].Dest == BB) {
+            PTIHandled.insert(PredCases[i].Value);
+
+            if (PredHasWeights || SuccHasWeights) {
+              WeightsForHandled[PredCases[i].Value] = Weights[i+1];
+              std::swap(Weights[i+1], Weights.back());
+              Weights.pop_back();
+            }
+
+            std::swap(PredCases[i], PredCases.back());
+            PredCases.pop_back();
+            --i; --e;
+          }
+
+        // Okay, now we know which constants were sent to BB from the
+        // predecessor.  Figure out where they will all go now.
+        for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
+          if (PTIHandled.count(BBCases[i].Value)) {
+            // If this is one we are capable of getting...
+            if (PredHasWeights || SuccHasWeights)
+              Weights.push_back(WeightsForHandled[BBCases[i].Value]);
+            PredCases.push_back(BBCases[i]);
+            NewSuccessors.push_back(BBCases[i].Dest);
+            PTIHandled.erase(BBCases[i].Value);// This constant is taken care of
+          }
+
+        // If there are any constants vectored to BB that TI doesn't handle,
+        // they must go to the default destination of TI.
+        for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
+                                    PTIHandled.begin(),
+               E = PTIHandled.end(); I != E; ++I) {
+          if (PredHasWeights || SuccHasWeights)
+            Weights.push_back(WeightsForHandled[*I]);
+          PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault));
+          NewSuccessors.push_back(BBDefault);
+        }
+      }
+
+      // Okay, at this point, we know which new successor Pred will get.  Make
+      // sure we update the number of entries in the PHI nodes for these
+      // successors.
+      for (unsigned i = 0, e = NewSuccessors.size(); i != e; ++i)
+        AddPredecessorToBlock(NewSuccessors[i], Pred, BB);
+
+      Builder.SetInsertPoint(PTI);
+      // Convert pointer to int before we switch.
+      if (CV->getType()->isPointerTy()) {
+        assert(TD && "Cannot switch on pointer without DataLayout");
+        CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()),
+                                    "magicptr");
+      }
+
+      // Now that the successors are updated, create the new Switch instruction.
+      SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault,
+                                               PredCases.size());
+      NewSI->setDebugLoc(PTI->getDebugLoc());
+      for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+        NewSI->addCase(PredCases[i].Value, PredCases[i].Dest);
+
+      if (PredHasWeights || SuccHasWeights) {
+        // Halve the weights if any of them cannot fit in an uint32_t
+        FitWeights(Weights);
+
+        SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+
+        NewSI->setMetadata(LLVMContext::MD_prof,
+                           MDBuilder(BB->getContext()).
+                           createBranchWeights(MDWeights));
+      }
+
+      EraseTerminatorInstAndDCECond(PTI);
+
+      // Okay, last check.  If BB is still a successor of PSI, then we must
+      // have an infinite loop case.  If so, add an infinitely looping block
+      // to handle the case to preserve the behavior of the code.
+      BasicBlock *InfLoopBlock = 0;
+      for (unsigned i = 0, e = NewSI->getNumSuccessors(); i != e; ++i)
+        if (NewSI->getSuccessor(i) == BB) {
+          if (InfLoopBlock == 0) {
+            // Insert it at the end of the function, because it's either code,
+            // or it won't matter if it's hot. :)
+            InfLoopBlock = BasicBlock::Create(BB->getContext(),
+                                              "infloop", BB->getParent());
+            BranchInst::Create(InfLoopBlock, InfLoopBlock);
+          }
+          NewSI->setSuccessor(i, InfLoopBlock);
+        }
+
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+// isSafeToHoistInvoke - If we would need to insert a select that uses the
+// value of this invoke (comments in HoistThenElseCodeToIf explain why we
+// would need to do this), we can't hoist the invoke, as there is nowhere
+// to put the select in this case.
+static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
+                                Instruction *I1, Instruction *I2) {
+  for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+    PHINode *PN;
+    for (BasicBlock::iterator BBI = SI->begin();
+         (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+      Value *BB1V = PN->getIncomingValueForBlock(BB1);
+      Value *BB2V = PN->getIncomingValueForBlock(BB2);
+      if (BB1V != BB2V && (BB1V==I1 || BB2V==I2)) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// HoistThenElseCodeToIf - Given a conditional branch that goes to BB1 and
+/// BB2, hoist any common code in the two blocks up into the branch block.  The
+/// caller of this function guarantees that BI's block dominates BB1 and BB2.
+static bool HoistThenElseCodeToIf(BranchInst *BI) {
+  // This does very trivial matching, with limited scanning, to find identical
+  // instructions in the two blocks.  In particular, we don't want to get into
+  // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
+  // such, we currently just scan for obviously identical instructions in an
+  // identical order.
+  BasicBlock *BB1 = BI->getSuccessor(0);  // The true destination.
+  BasicBlock *BB2 = BI->getSuccessor(1);  // The false destination
+
+  BasicBlock::iterator BB1_Itr = BB1->begin();
+  BasicBlock::iterator BB2_Itr = BB2->begin();
+
+  Instruction *I1 = BB1_Itr++, *I2 = BB2_Itr++;
+  // Skip debug info if it is not identical.
+  DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
+  DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
+  if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
+    while (isa<DbgInfoIntrinsic>(I1))
+      I1 = BB1_Itr++;
+    while (isa<DbgInfoIntrinsic>(I2))
+      I2 = BB2_Itr++;
+  }
+  if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) ||
+      (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)))
+    return false;
+
+  // If we get here, we can hoist at least one instruction.
+  BasicBlock *BIParent = BI->getParent();
+
+  do {
+    // If we are hoisting the terminator instruction, don't move one (making a
+    // broken BB), instead clone it, and remove BI.
+    if (isa<TerminatorInst>(I1))
+      goto HoistTerminator;
+
+    // For a normal instruction, we just move one to right before the branch,
+    // then replace all uses of the other with the first.  Finally, we remove
+    // the now redundant second instruction.
+    BIParent->getInstList().splice(BI, BB1->getInstList(), I1);
+    if (!I2->use_empty())
+      I2->replaceAllUsesWith(I1);
+    I1->intersectOptionalDataWith(I2);
+    I2->eraseFromParent();
+
+    I1 = BB1_Itr++;
+    I2 = BB2_Itr++;
+    // Skip debug info if it is not identical.
+    DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
+    DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
+    if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
+      while (isa<DbgInfoIntrinsic>(I1))
+        I1 = BB1_Itr++;
+      while (isa<DbgInfoIntrinsic>(I2))
+        I2 = BB2_Itr++;
+    }
+  } while (I1->isIdenticalToWhenDefined(I2));
+
+  return true;
+
+HoistTerminator:
+  // It may not be possible to hoist an invoke.
+  if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
+    return true;
+
+  // Okay, it is safe to hoist the terminator.
+  Instruction *NT = I1->clone();
+  BIParent->getInstList().insert(BI, NT);
+  if (!NT->getType()->isVoidTy()) {
+    I1->replaceAllUsesWith(NT);
+    I2->replaceAllUsesWith(NT);
+    NT->takeName(I1);
+  }
+
+  IRBuilder<true, NoFolder> Builder(NT);
+  // Hoisting one of the terminators from our successor is a great thing.
+  // Unfortunately, the successors of the if/else blocks may have PHI nodes in
+  // them.  If they do, all PHI entries for BB1/BB2 must agree for all PHI
+  // nodes, so we insert select instruction to compute the final result.
+  std::map<std::pair<Value*,Value*>, SelectInst*> InsertedSelects;
+  for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+    PHINode *PN;
+    for (BasicBlock::iterator BBI = SI->begin();
+         (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+      Value *BB1V = PN->getIncomingValueForBlock(BB1);
+      Value *BB2V = PN->getIncomingValueForBlock(BB2);
+      if (BB1V == BB2V) continue;
+
+      // These values do not agree.  Insert a select instruction before NT
+      // that determines the right value.
+      SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
+      if (SI == 0)
+        SI = cast<SelectInst>
+          (Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
+                                BB1V->getName()+"."+BB2V->getName()));
+
+      // Make the PHI node use the select for all incoming values for BB1/BB2
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        if (PN->getIncomingBlock(i) == BB1 || PN->getIncomingBlock(i) == BB2)
+          PN->setIncomingValue(i, SI);
+    }
+  }
+
+  // Update any PHI nodes in our new successors.
+  for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI)
+    AddPredecessorToBlock(*SI, BIParent, BB1);
+
+  EraseTerminatorInstAndDCECond(BI);
+  return true;
+}
+
+/// SinkThenElseCodeToEnd - Given an unconditional branch that goes to BBEnd,
+/// check whether BBEnd has only two predecessors and the other predecessor
+/// ends with an unconditional branch. If it is true, sink any common code
+/// in the two predecessors to BBEnd.
+static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
+  assert(BI1->isUnconditional());
+  BasicBlock *BB1 = BI1->getParent();
+  BasicBlock *BBEnd = BI1->getSuccessor(0);
+
+  // Check that BBEnd has two predecessors and the other predecessor ends with
+  // an unconditional branch.
+  pred_iterator PI = pred_begin(BBEnd), PE = pred_end(BBEnd);
+  BasicBlock *Pred0 = *PI++;
+  if (PI == PE) // Only one predecessor.
+    return false;
+  BasicBlock *Pred1 = *PI++;
+  if (PI != PE) // More than two predecessors.
+    return false;
+  BasicBlock *BB2 = (Pred0 == BB1) ? Pred1 : Pred0;
+  BranchInst *BI2 = dyn_cast<BranchInst>(BB2->getTerminator());
+  if (!BI2 || !BI2->isUnconditional())
+    return false;
+
+  // Gather the PHI nodes in BBEnd.
+  std::map<Value*, std::pair<Value*, PHINode*> > MapValueFromBB1ToBB2;
+  Instruction *FirstNonPhiInBBEnd = 0;
+  for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end();
+       I != E; ++I) {
+    if (PHINode *PN = dyn_cast<PHINode>(I)) {
+      Value *BB1V = PN->getIncomingValueForBlock(BB1);
+      Value *BB2V = PN->getIncomingValueForBlock(BB2);
+      MapValueFromBB1ToBB2[BB1V] = std::make_pair(BB2V, PN);
+    } else {
+      FirstNonPhiInBBEnd = &*I;
+      break;
+    }
+  }
+  if (!FirstNonPhiInBBEnd)
+    return false;
+
+
+  // This does very trivial matching, with limited scanning, to find identical
+  // instructions in the two blocks.  We scan backward for obviously identical
+  // instructions in an identical order.
+  BasicBlock::InstListType::reverse_iterator RI1 = BB1->getInstList().rbegin(),
+      RE1 = BB1->getInstList().rend(), RI2 = BB2->getInstList().rbegin(),
+      RE2 = BB2->getInstList().rend();
+  // Skip debug info.
+  while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+  if (RI1 == RE1)
+    return false;
+  while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+  if (RI2 == RE2)
+    return false;
+  // Skip the unconditional branches.
+  ++RI1;
+  ++RI2;
+
+  bool Changed = false;
+  while (RI1 != RE1 && RI2 != RE2) {
+    // Skip debug info.
+    while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+    if (RI1 == RE1)
+      return Changed;
+    while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+    if (RI2 == RE2)
+      return Changed;
+
+    Instruction *I1 = &*RI1, *I2 = &*RI2;
+    // I1 and I2 should have a single use in the same PHI node, and they
+    // perform the same operation.
+    // Cannot move control-flow-involving, volatile loads, vaarg, etc.
+    if (isa<PHINode>(I1) || isa<PHINode>(I2) ||
+        isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) ||
+        isa<LandingPadInst>(I1) || isa<LandingPadInst>(I2) ||
+        isa<AllocaInst>(I1) || isa<AllocaInst>(I2) ||
+        I1->mayHaveSideEffects() || I2->mayHaveSideEffects() ||
+        I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() ||
+        !I1->hasOneUse() || !I2->hasOneUse() ||
+        MapValueFromBB1ToBB2.find(I1) == MapValueFromBB1ToBB2.end() ||
+        MapValueFromBB1ToBB2[I1].first != I2)
+      return Changed;
+
+    // Check whether we should swap the operands of ICmpInst.
+    ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2);
+    bool SwapOpnds = false;
+    if (ICmp1 && ICmp2 &&
+        ICmp1->getOperand(0) != ICmp2->getOperand(0) &&
+        ICmp1->getOperand(1) != ICmp2->getOperand(1) &&
+        (ICmp1->getOperand(0) == ICmp2->getOperand(1) ||
+         ICmp1->getOperand(1) == ICmp2->getOperand(0))) {
+      ICmp2->swapOperands();
+      SwapOpnds = true;
+    }
+    if (!I1->isSameOperationAs(I2)) {
+      if (SwapOpnds)
+        ICmp2->swapOperands();
+      return Changed;
+    }
+
+    // The operands should be either the same or they need to be generated
+    // with a PHI node after sinking. We only handle the case where there is
+    // a single pair of different operands.
+    Value *DifferentOp1 = 0, *DifferentOp2 = 0;
+    unsigned Op1Idx = 0;
+    for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) {
+      if (I1->getOperand(I) == I2->getOperand(I))
+        continue;
+      // Early exit if we have more-than one pair of different operands or
+      // the different operand is already in MapValueFromBB1ToBB2.
+      // Early exit if we need a PHI node to replace a constant.
+      if (DifferentOp1 ||
+          MapValueFromBB1ToBB2.find(I1->getOperand(I)) !=
+          MapValueFromBB1ToBB2.end() ||
+          isa<Constant>(I1->getOperand(I)) ||
+          isa<Constant>(I2->getOperand(I))) {
+        // If we can't sink the instructions, undo the swapping.
+        if (SwapOpnds)
+          ICmp2->swapOperands();
+        return Changed;
+      }
+      DifferentOp1 = I1->getOperand(I);
+      Op1Idx = I;
+      DifferentOp2 = I2->getOperand(I);
+    }
+
+    // We insert the pair of different operands to MapValueFromBB1ToBB2 and
+    // remove (I1, I2) from MapValueFromBB1ToBB2.
+    if (DifferentOp1) {
+      PHINode *NewPN = PHINode::Create(DifferentOp1->getType(), 2,
+                                       DifferentOp1->getName() + ".sink",
+                                       BBEnd->begin());
+      MapValueFromBB1ToBB2[DifferentOp1] = std::make_pair(DifferentOp2, NewPN);
+      // I1 should use NewPN instead of DifferentOp1.
+      I1->setOperand(Op1Idx, NewPN);
+      NewPN->addIncoming(DifferentOp1, BB1);
+      NewPN->addIncoming(DifferentOp2, BB2);
+      DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";);
+    }
+    PHINode *OldPN = MapValueFromBB1ToBB2[I1].second;
+    MapValueFromBB1ToBB2.erase(I1);
+
+    DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n";);
+    DEBUG(dbgs() << "                         " << *I2 << "\n";);
+    // We need to update RE1 and RE2 if we are going to sink the first
+    // instruction in the basic block down.
+    bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin());
+    // Sink the instruction.
+    BBEnd->getInstList().splice(FirstNonPhiInBBEnd, BB1->getInstList(), I1);
+    if (!OldPN->use_empty())
+      OldPN->replaceAllUsesWith(I1);
+    OldPN->eraseFromParent();
+
+    if (!I2->use_empty())
+      I2->replaceAllUsesWith(I1);
+    I1->intersectOptionalDataWith(I2);
+    I2->eraseFromParent();
+
+    if (UpdateRE1)
+      RE1 = BB1->getInstList().rend();
+    if (UpdateRE2)
+      RE2 = BB2->getInstList().rend();
+    FirstNonPhiInBBEnd = I1;
+    NumSinkCommons++;
+    Changed = true;
+  }
+  return Changed;
+}
+
+/// \brief Speculate a conditional basic block flattening the CFG.
+///
+/// Note that this is a very risky transform currently. Speculating
+/// instructions like this is most often not desirable. Instead, there is an MI
+/// pass which can do it with full awareness of the resource constraints.
+/// However, some cases are "obvious" and we should do directly. An example of
+/// this is speculating a single, reasonably cheap instruction.
+///
+/// There is only one distinct advantage to flattening the CFG at the IR level:
+/// it makes very common but simplistic optimizations such as are common in
+/// instcombine and the DAG combiner more powerful by removing CFG edges and
+/// modeling their effects with easier to reason about SSA value graphs.
+///
+///
+/// An illustration of this transform is turning this IR:
+/// \code
+///   BB:
+///     %cmp = icmp ult %x, %y
+///     br i1 %cmp, label %EndBB, label %ThenBB
+///   ThenBB:
+///     %sub = sub %x, %y
+///     br label BB2
+///   EndBB:
+///     %phi = phi [ %sub, %ThenBB ], [ 0, %EndBB ]
+///     ...
+/// \endcode
+///
+/// Into this IR:
+/// \code
+///   BB:
+///     %cmp = icmp ult %x, %y
+///     %sub = sub %x, %y
+///     %cond = select i1 %cmp, 0, %sub
+///     ...
+/// \endcode
+///
+/// \returns true if the conditional block is removed.
+static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
+  // Be conservative for now. FP select instruction can often be expensive.
+  Value *BrCond = BI->getCondition();
+  if (isa<FCmpInst>(BrCond))
+    return false;
+
+  BasicBlock *BB = BI->getParent();
+  BasicBlock *EndBB = ThenBB->getTerminator()->getSuccessor(0);
+
+  // If ThenBB is actually on the false edge of the conditional branch, remember
+  // to swap the select operands later.
+  bool Invert = false;
+  if (ThenBB != BI->getSuccessor(0)) {
+    assert(ThenBB == BI->getSuccessor(1) && "No edge from 'if' block?");
+    Invert = true;
+  }
+  assert(EndBB == BI->getSuccessor(!Invert) && "No edge from to end block");
+
+  // Keep a count of how many times instructions are used within CondBB when
+  // they are candidates for sinking into CondBB. Specifically:
+  // - They are defined in BB, and
+  // - They have no side effects, and
+  // - All of their uses are in CondBB.
+  SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
+
+  unsigned SpeculationCost = 0;
+  for (BasicBlock::iterator BBI = ThenBB->begin(),
+                            BBE = llvm::prior(ThenBB->end());
+       BBI != BBE; ++BBI) {
+    Instruction *I = BBI;
+    // Skip debug info.
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    // Only speculatively execution a single instruction (not counting the
+    // terminator) for now.
+    ++SpeculationCost;
+    if (SpeculationCost > 1)
+      return false;
+
+    // Don't hoist the instruction if it's unsafe or expensive.
+    if (!isSafeToSpeculativelyExecute(I))
+      return false;
+    if (ComputeSpeculationCost(I) > PHINodeFoldingThreshold)
+      return false;
+
+    // Do not hoist the instruction if any of its operands are defined but not
+    // used in this BB. The transformation will prevent the operand from
+    // being sunk into the use block.
+    for (User::op_iterator i = I->op_begin(), e = I->op_end();
+         i != e; ++i) {
+      Instruction *OpI = dyn_cast<Instruction>(*i);
+      if (!OpI || OpI->getParent() != BB ||
+          OpI->mayHaveSideEffects())
+        continue; // Not a candidate for sinking.
+
+      ++SinkCandidateUseCounts[OpI];
+    }
+  }
+
+  // Consider any sink candidates which are only used in CondBB as costs for
+  // speculation. Note, while we iterate over a DenseMap here, we are summing
+  // and so iteration order isn't significant.
+  for (SmallDenseMap<Instruction *, unsigned, 4>::iterator I =
+           SinkCandidateUseCounts.begin(), E = SinkCandidateUseCounts.end();
+       I != E; ++I)
+    if (I->first->getNumUses() == I->second) {
+      ++SpeculationCost;
+      if (SpeculationCost > 1)
+        return false;
+    }
+
+  // Check that the PHI nodes can be converted to selects.
+  bool HaveRewritablePHIs = false;
+  for (BasicBlock::iterator I = EndBB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    Value *OrigV = PN->getIncomingValueForBlock(BB);
+    Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
+
+    // Skip PHIs which are trivial.
+    if (ThenV == OrigV)
+      continue;
+
+    HaveRewritablePHIs = true;
+    ConstantExpr *CE = dyn_cast<ConstantExpr>(ThenV);
+    if (!CE)
+      continue; // Known safe and cheap.
+
+    if (!isSafeToSpeculativelyExecute(CE))
+      return false;
+    if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold)
+      return false;
+
+    // Account for the cost of an unfolded ConstantExpr which could end up
+    // getting expanded into Instructions.
+    // FIXME: This doesn't account for how many operations are combined in the
+    // constant expression.
+    ++SpeculationCost;
+    if (SpeculationCost > 1)
+      return false;
+  }
+
+  // If there are no PHIs to process, bail early. This helps ensure idempotence
+  // as well.
+  if (!HaveRewritablePHIs)
+    return false;
+
+  // If we get here, we can hoist the instruction and if-convert.
+  DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
+
+  // Hoist the instructions.
+  BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(),
+                           llvm::prior(ThenBB->end()));
+
+  // Insert selects and rewrite the PHI operands.
+  IRBuilder<true, NoFolder> Builder(BI);
+  for (BasicBlock::iterator I = EndBB->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    unsigned OrigI = PN->getBasicBlockIndex(BB);
+    unsigned ThenI = PN->getBasicBlockIndex(ThenBB);
+    Value *OrigV = PN->getIncomingValue(OrigI);
+    Value *ThenV = PN->getIncomingValue(ThenI);
+
+    // Skip PHIs which are trivial.
+    if (OrigV == ThenV)
+      continue;
+
+    // Create a select whose true value is the speculatively executed value and
+    // false value is the preexisting value. Swap them if the branch
+    // destinations were inverted.
+    Value *TrueV = ThenV, *FalseV = OrigV;
+    if (Invert)
+      std::swap(TrueV, FalseV);
+    Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV,
+                                    TrueV->getName() + "." + FalseV->getName());
+    PN->setIncomingValue(OrigI, V);
+    PN->setIncomingValue(ThenI, V);
+  }
+
+  ++NumSpeculations;
+  return true;
+}
+
+/// BlockIsSimpleEnoughToThreadThrough - Return true if we can thread a branch
+/// across this block.
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+  unsigned Size = 0;
+
+  for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+    if (isa<DbgInfoIntrinsic>(BBI))
+      continue;
+    if (Size > 10) return false;  // Don't clone large BB's.
+    ++Size;
+
+    // We can only support instructions that do not define values that are
+    // live outside of the current basic block.
+    for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
+         UI != E; ++UI) {
+      Instruction *U = cast<Instruction>(*UI);
+      if (U->getParent() != BB || isa<PHINode>(U)) return false;
+    }
+
+    // Looks ok, continue checking.
+  }
+
+  return true;
+}
+
+/// FoldCondBranchOnPHI - If we have a conditional branch on a PHI node value
+/// that is defined in the same block as the branch and if any PHI entries are
+/// constants, thread edges corresponding to that entry to be branches to their
+/// ultimate destination.
+static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) {
+  BasicBlock *BB = BI->getParent();
+  PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
+  // NOTE: we currently cannot transform this case if the PHI node is used
+  // outside of the block.
+  if (!PN || PN->getParent() != BB || !PN->hasOneUse())
+    return false;
+
+  // Degenerate case of a single entry PHI.
+  if (PN->getNumIncomingValues() == 1) {
+    FoldSingleEntryPHINodes(PN->getParent());
+    return true;
+  }
+
+  // Now we know that this block has multiple preds and two succs.
+  if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
+
+  // Okay, this is a simple enough basic block.  See if any phi values are
+  // constants.
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
+    if (CB == 0 || !CB->getType()->isIntegerTy(1)) continue;
+
+    // Okay, we now know that all edges from PredBB should be revectored to
+    // branch to RealDest.
+    BasicBlock *PredBB = PN->getIncomingBlock(i);
+    BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
+
+    if (RealDest == BB) continue;  // Skip self loops.
+    // Skip if the predecessor's terminator is an indirect branch.
+    if (isa<IndirectBrInst>(PredBB->getTerminator())) continue;
+
+    // The dest block might have PHI nodes, other predecessors and other
+    // difficult cases.  Instead of being smart about this, just insert a new
+    // block that jumps to the destination block, effectively splitting
+    // the edge we are about to create.
+    BasicBlock *EdgeBB = BasicBlock::Create(BB->getContext(),
+                                            RealDest->getName()+".critedge",
+                                            RealDest->getParent(), RealDest);
+    BranchInst::Create(RealDest, EdgeBB);
+
+    // Update PHI nodes.
+    AddPredecessorToBlock(RealDest, EdgeBB, BB);
+
+    // BB may have instructions that are being threaded over.  Clone these
+    // instructions into EdgeBB.  We know that there will be no uses of the
+    // cloned instructions outside of EdgeBB.
+    BasicBlock::iterator InsertPt = EdgeBB->begin();
+    DenseMap<Value*, Value*> TranslateMap;  // Track translated values.
+    for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+      if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
+        TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
+        continue;
+      }
+      // Clone the instruction.
+      Instruction *N = BBI->clone();
+      if (BBI->hasName()) N->setName(BBI->getName()+".c");
+
+      // Update operands due to translation.
+      for (User::op_iterator i = N->op_begin(), e = N->op_end();
+           i != e; ++i) {
+        DenseMap<Value*, Value*>::iterator PI = TranslateMap.find(*i);
+        if (PI != TranslateMap.end())
+          *i = PI->second;
+      }
+
+      // Check for trivial simplification.
+      if (Value *V = SimplifyInstruction(N, TD)) {
+        TranslateMap[BBI] = V;
+        delete N;   // Instruction folded away, don't need actual inst
+      } else {
+        // Insert the new instruction into its new home.
+        EdgeBB->getInstList().insert(InsertPt, N);
+        if (!BBI->use_empty())
+          TranslateMap[BBI] = N;
+      }
+    }
+
+    // Loop over all of the edges from PredBB to BB, changing them to branch
+    // to EdgeBB instead.
+    TerminatorInst *PredBBTI = PredBB->getTerminator();
+    for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
+      if (PredBBTI->getSuccessor(i) == BB) {
+        BB->removePredecessor(PredBB);
+        PredBBTI->setSuccessor(i, EdgeBB);
+      }
+
+    // Recurse, simplifying any other constants.
+    return FoldCondBranchOnPHI(BI, TD) | true;
+  }
+
+  return false;
+}
+
+/// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry
+/// PHI node, see if we can eliminate it.
+static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *TD) {
+  // Ok, this is a two entry PHI node.  Check to see if this is a simple "if
+  // statement", which has a very simple dominance structure.  Basically, we
+  // are trying to find the condition that is being branched on, which
+  // subsequently causes this merge to happen.  We really want control
+  // dependence information for this check, but simplifycfg can't keep it up
+  // to date, and this catches most of the cases we care about anyway.
+  BasicBlock *BB = PN->getParent();
+  BasicBlock *IfTrue, *IfFalse;
+  Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
+  if (!IfCond ||
+      // Don't bother if the branch will be constant folded trivially.
+      isa<ConstantInt>(IfCond))
+    return false;
+
+  // Okay, we found that we can merge this two-entry phi node into a select.
+  // Doing so would require us to fold *all* two entry phi nodes in this block.
+  // At some point this becomes non-profitable (particularly if the target
+  // doesn't support cmov's).  Only do this transformation if there are two or
+  // fewer PHI nodes in this block.
+  unsigned NumPhis = 0;
+  for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
+    if (NumPhis > 2)
+      return false;
+
+  // Loop over the PHI's seeing if we can promote them all to select
+  // instructions.  While we are at it, keep track of the instructions
+  // that need to be moved to the dominating block.
+  SmallPtrSet<Instruction*, 4> AggressiveInsts;
+  unsigned MaxCostVal0 = PHINodeFoldingThreshold,
+           MaxCostVal1 = PHINodeFoldingThreshold;
+
+  for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
+    PHINode *PN = cast<PHINode>(II++);
+    if (Value *V = SimplifyInstruction(PN, TD)) {
+      PN->replaceAllUsesWith(V);
+      PN->eraseFromParent();
+      continue;
+    }
+
+    if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts,
+                             MaxCostVal0) ||
+        !DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts,
+                             MaxCostVal1))
+      return false;
+  }
+
+  // If we folded the first phi, PN dangles at this point.  Refresh it.  If
+  // we ran out of PHIs then we simplified them all.
+  PN = dyn_cast<PHINode>(BB->begin());
+  if (PN == 0) return true;
+
+  // Don't fold i1 branches on PHIs which contain binary operators.  These can
+  // often be turned into switches and other things.
+  if (PN->getType()->isIntegerTy(1) &&
+      (isa<BinaryOperator>(PN->getIncomingValue(0)) ||
+       isa<BinaryOperator>(PN->getIncomingValue(1)) ||
+       isa<BinaryOperator>(IfCond)))
+    return false;
+
+  // If we all PHI nodes are promotable, check to make sure that all
+  // instructions in the predecessor blocks can be promoted as well.  If
+  // not, we won't be able to get rid of the control flow, so it's not
+  // worth promoting to select instructions.
+  BasicBlock *DomBlock = 0;
+  BasicBlock *IfBlock1 = PN->getIncomingBlock(0);
+  BasicBlock *IfBlock2 = PN->getIncomingBlock(1);
+  if (cast<BranchInst>(IfBlock1->getTerminator())->isConditional()) {
+    IfBlock1 = 0;
+  } else {
+    DomBlock = *pred_begin(IfBlock1);
+    for (BasicBlock::iterator I = IfBlock1->begin();!isa<TerminatorInst>(I);++I)
+      if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) {
+        // This is not an aggressive instruction that we can promote.
+        // Because of this, we won't be able to get rid of the control
+        // flow, so the xform is not worth it.
+        return false;
+      }
+  }
+
+  if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) {
+    IfBlock2 = 0;
+  } else {
+    DomBlock = *pred_begin(IfBlock2);
+    for (BasicBlock::iterator I = IfBlock2->begin();!isa<TerminatorInst>(I);++I)
+      if (!AggressiveInsts.count(I) && !isa<DbgInfoIntrinsic>(I)) {
+        // This is not an aggressive instruction that we can promote.
+        // Because of this, we won't be able to get rid of the control
+        // flow, so the xform is not worth it.
+        return false;
+      }
+  }
+
+  DEBUG(dbgs() << "FOUND IF CONDITION!  " << *IfCond << "  T: "
+               << IfTrue->getName() << "  F: " << IfFalse->getName() << "\n");
+
+  // If we can still promote the PHI nodes after this gauntlet of tests,
+  // do all of the PHI's now.
+  Instruction *InsertPt = DomBlock->getTerminator();
+  IRBuilder<true, NoFolder> Builder(InsertPt);
+
+  // Move all 'aggressive' instructions, which are defined in the
+  // conditional parts of the if's up to the dominating block.
+  if (IfBlock1)
+    DomBlock->getInstList().splice(InsertPt,
+                                   IfBlock1->getInstList(), IfBlock1->begin(),
+                                   IfBlock1->getTerminator());
+  if (IfBlock2)
+    DomBlock->getInstList().splice(InsertPt,
+                                   IfBlock2->getInstList(), IfBlock2->begin(),
+                                   IfBlock2->getTerminator());
+
+  while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+    // Change the PHI node into a select instruction.
+    Value *TrueVal  = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
+    Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
+
+    SelectInst *NV =
+      cast<SelectInst>(Builder.CreateSelect(IfCond, TrueVal, FalseVal, ""));
+    PN->replaceAllUsesWith(NV);
+    NV->takeName(PN);
+    PN->eraseFromParent();
+  }
+
+  // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
+  // has been flattened.  Change DomBlock to jump directly to our new block to
+  // avoid other simplifycfg's kicking in on the diamond.
+  TerminatorInst *OldTI = DomBlock->getTerminator();
+  Builder.SetInsertPoint(OldTI);
+  Builder.CreateBr(BB);
+  OldTI->eraseFromParent();
+  return true;
+}
+
+/// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes
+/// to two returning blocks, try to merge them together into one return,
+/// introducing a select if the return values disagree.
+static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
+                                           IRBuilder<> &Builder) {
+  assert(BI->isConditional() && "Must be a conditional branch");
+  BasicBlock *TrueSucc = BI->getSuccessor(0);
+  BasicBlock *FalseSucc = BI->getSuccessor(1);
+  ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
+  ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
+
+  // Check to ensure both blocks are empty (just a return) or optionally empty
+  // with PHI nodes.  If there are other instructions, merging would cause extra
+  // computation on one path or the other.
+  if (!TrueSucc->getFirstNonPHIOrDbg()->isTerminator())
+    return false;
+  if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator())
+    return false;
+
+  Builder.SetInsertPoint(BI);
+  // Okay, we found a branch that is going to two return nodes.  If
+  // there is no return value for this function, just change the
+  // branch into a return.
+  if (FalseRet->getNumOperands() == 0) {
+    TrueSucc->removePredecessor(BI->getParent());
+    FalseSucc->removePredecessor(BI->getParent());
+    Builder.CreateRetVoid();
+    EraseTerminatorInstAndDCECond(BI);
+    return true;
+  }
+
+  // Otherwise, figure out what the true and false return values are
+  // so we can insert a new select instruction.
+  Value *TrueValue = TrueRet->getReturnValue();
+  Value *FalseValue = FalseRet->getReturnValue();
+
+  // Unwrap any PHI nodes in the return blocks.
+  if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
+    if (TVPN->getParent() == TrueSucc)
+      TrueValue = TVPN->getIncomingValueForBlock(BI->getParent());
+  if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
+    if (FVPN->getParent() == FalseSucc)
+      FalseValue = FVPN->getIncomingValueForBlock(BI->getParent());
+
+  // In order for this transformation to be safe, we must be able to
+  // unconditionally execute both operands to the return.  This is
+  // normally the case, but we could have a potentially-trapping
+  // constant expression that prevents this transformation from being
+  // safe.
+  if (ConstantExpr *TCV = dyn_cast_or_null<ConstantExpr>(TrueValue))
+    if (TCV->canTrap())
+      return false;
+  if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
+    if (FCV->canTrap())
+      return false;
+
+  // Okay, we collected all the mapped values and checked them for sanity, and
+  // defined to really do this transformation.  First, update the CFG.
+  TrueSucc->removePredecessor(BI->getParent());
+  FalseSucc->removePredecessor(BI->getParent());
+
+  // Insert select instructions where needed.
+  Value *BrCond = BI->getCondition();
+  if (TrueValue) {
+    // Insert a select if the results differ.
+    if (TrueValue == FalseValue || isa<UndefValue>(FalseValue)) {
+    } else if (isa<UndefValue>(TrueValue)) {
+      TrueValue = FalseValue;
+    } else {
+      TrueValue = Builder.CreateSelect(BrCond, TrueValue,
+                                       FalseValue, "retval");
+    }
+  }
+
+  Value *RI = !TrueValue ?
+    Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
+
+  (void) RI;
+
+  DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
+               << "\n  " << *BI << "NewRet = " << *RI
+               << "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: "<< *FalseSucc);
+
+  EraseTerminatorInstAndDCECond(BI);
+
+  return true;
+}
+
+/// ExtractBranchMetadata - Given a conditional BranchInstruction, retrieve the
+/// probabilities of the branch taking each edge. Fills in the two APInt
+/// parameters and return true, or returns false if no or invalid metadata was
+/// found.
+static bool ExtractBranchMetadata(BranchInst *BI,
+                                  uint64_t &ProbTrue, uint64_t &ProbFalse) {
+  assert(BI->isConditional() &&
+         "Looking for probabilities on unconditional branch?");
+  MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
+  if (!ProfileData || ProfileData->getNumOperands() != 3) return false;
+  ConstantInt *CITrue = dyn_cast<ConstantInt>(ProfileData->getOperand(1));
+  ConstantInt *CIFalse = dyn_cast<ConstantInt>(ProfileData->getOperand(2));
+  if (!CITrue || !CIFalse) return false;
+  ProbTrue = CITrue->getValue().getZExtValue();
+  ProbFalse = CIFalse->getValue().getZExtValue();
+  return true;
+}
+
+/// checkCSEInPredecessor - Return true if the given instruction is available
+/// in its predecessor block. If yes, the instruction will be removed.
+///
+static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) {
+  if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst))
+    return false;
+  for (BasicBlock::iterator I = PB->begin(), E = PB->end(); I != E; I++) {
+    Instruction *PBI = &*I;
+    // Check whether Inst and PBI generate the same value.
+    if (Inst->isIdenticalTo(PBI)) {
+      Inst->replaceAllUsesWith(PBI);
+      Inst->eraseFromParent();
+      return true;
+    }
+  }
+  return false;
+}
+
+/// FoldBranchToCommonDest - If this basic block is simple enough, and if a
+/// predecessor branches to us and one of our successors, fold the block into
+/// the predecessor and use logical operations to pick the right destination.
+bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
+  BasicBlock *BB = BI->getParent();
+
+  Instruction *Cond = 0;
+  if (BI->isConditional())
+    Cond = dyn_cast<Instruction>(BI->getCondition());
+  else {
+    // For unconditional branch, check for a simple CFG pattern, where
+    // BB has a single predecessor and BB's successor is also its predecessor's
+    // successor. If such pattern exisits, check for CSE between BB and its
+    // predecessor.
+    if (BasicBlock *PB = BB->getSinglePredecessor())
+      if (BranchInst *PBI = dyn_cast<BranchInst>(PB->getTerminator()))
+        if (PBI->isConditional() &&
+            (BI->getSuccessor(0) == PBI->getSuccessor(0) ||
+             BI->getSuccessor(0) == PBI->getSuccessor(1))) {
+          for (BasicBlock::iterator I = BB->begin(), E = BB->end();
+               I != E; ) {
+            Instruction *Curr = I++;
+            if (isa<CmpInst>(Curr)) {
+              Cond = Curr;
+              break;
+            }
+            // Quit if we can't remove this instruction.
+            if (!checkCSEInPredecessor(Curr, PB))
+              return false;
+          }
+        }
+
+    if (Cond == 0)
+      return false;
+  }
+
+  if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
+    Cond->getParent() != BB || !Cond->hasOneUse())
+  return false;
+
+  // Only allow this if the condition is a simple instruction that can be
+  // executed unconditionally.  It must be in the same block as the branch, and
+  // must be at the front of the block.
+  BasicBlock::iterator FrontIt = BB->front();
+
+  // Ignore dbg intrinsics.
+  while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
+
+  // Allow a single instruction to be hoisted in addition to the compare
+  // that feeds the branch.  We later ensure that any values that _it_ uses
+  // were also live in the predecessor, so that we don't unnecessarily create
+  // register pressure or inhibit out-of-order execution.
+  Instruction *BonusInst = 0;
+  if (&*FrontIt != Cond &&
+      FrontIt->hasOneUse() && *FrontIt->use_begin() == Cond &&
+      isSafeToSpeculativelyExecute(FrontIt)) {
+    BonusInst = &*FrontIt;
+    ++FrontIt;
+
+    // Ignore dbg intrinsics.
+    while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
+  }
+
+  // Only a single bonus inst is allowed.
+  if (&*FrontIt != Cond)
+    return false;
+
+  // Make sure the instruction after the condition is the cond branch.
+  BasicBlock::iterator CondIt = Cond; ++CondIt;
+
+  // Ingore dbg intrinsics.
+  while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt;
+
+  if (&*CondIt != BI)
+    return false;
+
+  // Cond is known to be a compare or binary operator.  Check to make sure that
+  // neither operand is a potentially-trapping constant expression.
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
+    if (CE->canTrap())
+      return false;
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
+    if (CE->canTrap())
+      return false;
+
+  // Finally, don't infinitely unroll conditional loops.
+  BasicBlock *TrueDest  = BI->getSuccessor(0);
+  BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0;
+  if (TrueDest == BB || FalseDest == BB)
+    return false;
+
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *PredBlock = *PI;
+    BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
+
+    // Check that we have two conditional branches.  If there is a PHI node in
+    // the common successor, verify that the same value flows in from both
+    // blocks.
+    SmallVector<PHINode*, 4> PHIs;
+    if (PBI == 0 || PBI->isUnconditional() ||
+        (BI->isConditional() &&
+         !SafeToMergeTerminators(BI, PBI)) ||
+        (!BI->isConditional() &&
+         !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs)))
+      continue;
+
+    // Determine if the two branches share a common destination.
+    Instruction::BinaryOps Opc = Instruction::BinaryOpsEnd;
+    bool InvertPredCond = false;
+
+    if (BI->isConditional()) {
+      if (PBI->getSuccessor(0) == TrueDest)
+        Opc = Instruction::Or;
+      else if (PBI->getSuccessor(1) == FalseDest)
+        Opc = Instruction::And;
+      else if (PBI->getSuccessor(0) == FalseDest)
+        Opc = Instruction::And, InvertPredCond = true;
+      else if (PBI->getSuccessor(1) == TrueDest)
+        Opc = Instruction::Or, InvertPredCond = true;
+      else
+        continue;
+    } else {
+      if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest)
+        continue;
+    }
+
+    // Ensure that any values used in the bonus instruction are also used
+    // by the terminator of the predecessor.  This means that those values
+    // must already have been resolved, so we won't be inhibiting the
+    // out-of-order core by speculating them earlier.
+    if (BonusInst) {
+      // Collect the values used by the bonus inst
+      SmallPtrSet<Value*, 4> UsedValues;
+      for (Instruction::op_iterator OI = BonusInst->op_begin(),
+           OE = BonusInst->op_end(); OI != OE; ++OI) {
+        Value *V = *OI;
+        if (!isa<Constant>(V))
+          UsedValues.insert(V);
+      }
+
+      SmallVector<std::pair<Value*, unsigned>, 4> Worklist;
+      Worklist.push_back(std::make_pair(PBI->getOperand(0), 0));
+
+      // Walk up to four levels back up the use-def chain of the predecessor's
+      // terminator to see if all those values were used.  The choice of four
+      // levels is arbitrary, to provide a compile-time-cost bound.
+      while (!Worklist.empty()) {
+        std::pair<Value*, unsigned> Pair = Worklist.back();
+        Worklist.pop_back();
+
+        if (Pair.second >= 4) continue;
+        UsedValues.erase(Pair.first);
+        if (UsedValues.empty()) break;
+
+        if (Instruction *I = dyn_cast<Instruction>(Pair.first)) {
+          for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
+               OI != OE; ++OI)
+            Worklist.push_back(std::make_pair(OI->get(), Pair.second+1));
+        }
+      }
+
+      if (!UsedValues.empty()) return false;
+    }
+
+    DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
+    IRBuilder<> Builder(PBI);
+
+    // If we need to invert the condition in the pred block to match, do so now.
+    if (InvertPredCond) {
+      Value *NewCond = PBI->getCondition();
+
+      if (NewCond->hasOneUse() && isa<CmpInst>(NewCond)) {
+        CmpInst *CI = cast<CmpInst>(NewCond);
+        CI->setPredicate(CI->getInversePredicate());
+      } else {
+        NewCond = Builder.CreateNot(NewCond,
+                                    PBI->getCondition()->getName()+".not");
+      }
+
+      PBI->setCondition(NewCond);
+      PBI->swapSuccessors();
+    }
+
+    // If we have a bonus inst, clone it into the predecessor block.
+    Instruction *NewBonus = 0;
+    if (BonusInst) {
+      NewBonus = BonusInst->clone();
+      PredBlock->getInstList().insert(PBI, NewBonus);
+      NewBonus->takeName(BonusInst);
+      BonusInst->setName(BonusInst->getName()+".old");
+    }
+
+    // Clone Cond into the predecessor basic block, and or/and the
+    // two conditions together.
+    Instruction *New = Cond->clone();
+    if (BonusInst) New->replaceUsesOfWith(BonusInst, NewBonus);
+    PredBlock->getInstList().insert(PBI, New);
+    New->takeName(Cond);
+    Cond->setName(New->getName()+".old");
+
+    if (BI->isConditional()) {
+      Instruction *NewCond =
+        cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(),
+                                            New, "or.cond"));
+      PBI->setCondition(NewCond);
+
+      uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+      bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
+                                                  PredFalseWeight);
+      bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
+                                                  SuccFalseWeight);
+      SmallVector<uint64_t, 8> NewWeights;
+
+      if (PBI->getSuccessor(0) == BB) {
+        if (PredHasWeights && SuccHasWeights) {
+          // PBI: br i1 %x, BB, FalseDest
+          // BI:  br i1 %y, TrueDest, FalseDest
+          //TrueWeight is TrueWeight for PBI * TrueWeight for BI.
+          NewWeights.push_back(PredTrueWeight * SuccTrueWeight);
+          //FalseWeight is FalseWeight for PBI * TotalWeight for BI +
+          //               TrueWeight for PBI * FalseWeight for BI.
+          // We assume that total weights of a BranchInst can fit into 32 bits.
+          // Therefore, we will not have overflow using 64-bit arithmetic.
+          NewWeights.push_back(PredFalseWeight * (SuccFalseWeight +
+               SuccTrueWeight) + PredTrueWeight * SuccFalseWeight);
+        }
+        AddPredecessorToBlock(TrueDest, PredBlock, BB);
+        PBI->setSuccessor(0, TrueDest);
+      }
+      if (PBI->getSuccessor(1) == BB) {
+        if (PredHasWeights && SuccHasWeights) {
+          // PBI: br i1 %x, TrueDest, BB
+          // BI:  br i1 %y, TrueDest, FalseDest
+          //TrueWeight is TrueWeight for PBI * TotalWeight for BI +
+          //              FalseWeight for PBI * TrueWeight for BI.
+          NewWeights.push_back(PredTrueWeight * (SuccFalseWeight +
+              SuccTrueWeight) + PredFalseWeight * SuccTrueWeight);
+          //FalseWeight is FalseWeight for PBI * FalseWeight for BI.
+          NewWeights.push_back(PredFalseWeight * SuccFalseWeight);
+        }
+        AddPredecessorToBlock(FalseDest, PredBlock, BB);
+        PBI->setSuccessor(1, FalseDest);
+      }
+      if (NewWeights.size() == 2) {
+        // Halve the weights if any of them cannot fit in an uint32_t
+        FitWeights(NewWeights);
+
+        SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(),NewWeights.end());
+        PBI->setMetadata(LLVMContext::MD_prof,
+                         MDBuilder(BI->getContext()).
+                         createBranchWeights(MDWeights));
+      } else
+        PBI->setMetadata(LLVMContext::MD_prof, NULL);
+    } else {
+      // Update PHI nodes in the common successors.
+      for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+        ConstantInt *PBI_C = cast<ConstantInt>(
+          PHIs[i]->getIncomingValueForBlock(PBI->getParent()));
+        assert(PBI_C->getType()->isIntegerTy(1));
+        Instruction *MergedCond = 0;
+        if (PBI->getSuccessor(0) == TrueDest) {
+          // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value)
+          // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value)
+          //       is false: !PBI_Cond and BI_Value
+          Instruction *NotCond =
+            cast<Instruction>(Builder.CreateNot(PBI->getCondition(),
+                                "not.cond"));
+          MergedCond =
+            cast<Instruction>(Builder.CreateBinOp(Instruction::And,
+                                NotCond, New,
+                                "and.cond"));
+          if (PBI_C->isOne())
+            MergedCond =
+              cast<Instruction>(Builder.CreateBinOp(Instruction::Or,
+                                  PBI->getCondition(), MergedCond,
+                                  "or.cond"));
+        } else {
+          // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C)
+          // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond)
+          //       is false: PBI_Cond and BI_Value
+          MergedCond =
+            cast<Instruction>(Builder.CreateBinOp(Instruction::And,
+                                PBI->getCondition(), New,
+                                "and.cond"));
+          if (PBI_C->isOne()) {
+            Instruction *NotCond =
+              cast<Instruction>(Builder.CreateNot(PBI->getCondition(),
+                                  "not.cond"));
+            MergedCond =
+              cast<Instruction>(Builder.CreateBinOp(Instruction::Or,
+                                  NotCond, MergedCond,
+                                  "or.cond"));
+          }
+        }
+        // Update PHI Node.
+        PHIs[i]->setIncomingValue(PHIs[i]->getBasicBlockIndex(PBI->getParent()),
+                                  MergedCond);
+      }
+      // Change PBI from Conditional to Unconditional.
+      BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI);
+      EraseTerminatorInstAndDCECond(PBI);
+      PBI = New_PBI;
+    }
+
+    // TODO: If BB is reachable from all paths through PredBlock, then we
+    // could replace PBI's branch probabilities with BI's.
+
+    // Copy any debug value intrinsics into the end of PredBlock.
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+      if (isa<DbgInfoIntrinsic>(*I))
+        I->clone()->insertBefore(PBI);
+
+    return true;
+  }
+  return false;
+}
+
+/// SimplifyCondBranchToCondBranch - If we have a conditional branch as a
+/// predecessor of another block, this function tries to simplify it.  We know
+/// that PBI and BI are both conditional branches, and BI is in one of the
+/// successor blocks of PBI - PBI branches to BI.
+static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
+  assert(PBI->isConditional() && BI->isConditional());
+  BasicBlock *BB = BI->getParent();
+
+  // If this block ends with a branch instruction, and if there is a
+  // predecessor that ends on a branch of the same condition, make
+  // this conditional branch redundant.
+  if (PBI->getCondition() == BI->getCondition() &&
+      PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+    // Okay, the outcome of this conditional branch is statically
+    // knowable.  If this block had a single pred, handle specially.
+    if (BB->getSinglePredecessor()) {
+      // Turn this into a branch on constant.
+      bool CondIsTrue = PBI->getSuccessor(0) == BB;
+      BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
+                                        CondIsTrue));
+      return true;  // Nuke the branch on constant.
+    }
+
+    // Otherwise, if there are multiple predecessors, insert a PHI that merges
+    // in the constant and simplify the block result.  Subsequent passes of
+    // simplifycfg will thread the block.
+    if (BlockIsSimpleEnoughToThreadThrough(BB)) {
+      pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
+      PHINode *NewPN = PHINode::Create(Type::getInt1Ty(BB->getContext()),
+                                       std::distance(PB, PE),
+                                       BI->getCondition()->getName() + ".pr",
+                                       BB->begin());
+      // Okay, we're going to insert the PHI node.  Since PBI is not the only
+      // predecessor, compute the PHI'd conditional value for all of the preds.
+      // Any predecessor where the condition is not computable we keep symbolic.
+      for (pred_iterator PI = PB; PI != PE; ++PI) {
+        BasicBlock *P = *PI;
+        if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) &&
+            PBI != BI && PBI->isConditional() &&
+            PBI->getCondition() == BI->getCondition() &&
+            PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+          bool CondIsTrue = PBI->getSuccessor(0) == BB;
+          NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
+                                              CondIsTrue), P);
+        } else {
+          NewPN->addIncoming(BI->getCondition(), P);
+        }
+      }
+
+      BI->setCondition(NewPN);
+      return true;
+    }
+  }
+
+  // If this is a conditional branch in an empty block, and if any
+  // predecessors is a conditional branch to one of our destinations,
+  // fold the conditions into logical ops and one cond br.
+  BasicBlock::iterator BBI = BB->begin();
+  // Ignore dbg intrinsics.
+  while (isa<DbgInfoIntrinsic>(BBI))
+    ++BBI;
+  if (&*BBI != BI)
+    return false;
+
+
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
+    if (CE->canTrap())
+      return false;
+
+  int PBIOp, BIOp;
+  if (PBI->getSuccessor(0) == BI->getSuccessor(0))
+    PBIOp = BIOp = 0;
+  else if (PBI->getSuccessor(0) == BI->getSuccessor(1))
+    PBIOp = 0, BIOp = 1;
+  else if (PBI->getSuccessor(1) == BI->getSuccessor(0))
+    PBIOp = 1, BIOp = 0;
+  else if (PBI->getSuccessor(1) == BI->getSuccessor(1))
+    PBIOp = BIOp = 1;
+  else
+    return false;
+
+  // Check to make sure that the other destination of this branch
+  // isn't BB itself.  If so, this is an infinite loop that will
+  // keep getting unwound.
+  if (PBI->getSuccessor(PBIOp) == BB)
+    return false;
+
+  // Do not perform this transformation if it would require
+  // insertion of a large number of select instructions. For targets
+  // without predication/cmovs, this is a big pessimization.
+  BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
+
+  unsigned NumPhis = 0;
+  for (BasicBlock::iterator II = CommonDest->begin();
+       isa<PHINode>(II); ++II, ++NumPhis)
+    if (NumPhis > 2) // Disable this xform.
+      return false;
+
+  // Finally, if everything is ok, fold the branches to logical ops.
+  BasicBlock *OtherDest  = BI->getSuccessor(BIOp ^ 1);
+
+  DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
+               << "AND: " << *BI->getParent());
+
+
+  // If OtherDest *is* BB, then BB is a basic block with a single conditional
+  // branch in it, where one edge (OtherDest) goes back to itself but the other
+  // exits.  We don't *know* that the program avoids the infinite loop
+  // (even though that seems likely).  If we do this xform naively, we'll end up
+  // recursively unpeeling the loop.  Since we know that (after the xform is
+  // done) that the block *is* infinite if reached, we just make it an obviously
+  // infinite loop with no cond branch.
+  if (OtherDest == BB) {
+    // Insert it at the end of the function, because it's either code,
+    // or it won't matter if it's hot. :)
+    BasicBlock *InfLoopBlock = BasicBlock::Create(BB->getContext(),
+                                                  "infloop", BB->getParent());
+    BranchInst::Create(InfLoopBlock, InfLoopBlock);
+    OtherDest = InfLoopBlock;
+  }
+
+  DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+  // BI may have other predecessors.  Because of this, we leave
+  // it alone, but modify PBI.
+
+  // Make sure we get to CommonDest on True&True directions.
+  Value *PBICond = PBI->getCondition();
+  IRBuilder<true, NoFolder> Builder(PBI);
+  if (PBIOp)
+    PBICond = Builder.CreateNot(PBICond, PBICond->getName()+".not");
+
+  Value *BICond = BI->getCondition();
+  if (BIOp)
+    BICond = Builder.CreateNot(BICond, BICond->getName()+".not");
+
+  // Merge the conditions.
+  Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
+
+  // Modify PBI to branch on the new condition to the new dests.
+  PBI->setCondition(Cond);
+  PBI->setSuccessor(0, CommonDest);
+  PBI->setSuccessor(1, OtherDest);
+
+  // Update branch weight for PBI.
+  uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+  bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
+                                              PredFalseWeight);
+  bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
+                                              SuccFalseWeight);
+  if (PredHasWeights && SuccHasWeights) {
+    uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+    uint64_t PredOther = PBIOp ?PredTrueWeight : PredFalseWeight;
+    uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+    uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+    // The weight to CommonDest should be PredCommon * SuccTotal +
+    //                                    PredOther * SuccCommon.
+    // The weight to OtherDest should be PredOther * SuccOther.
+    SmallVector<uint64_t, 2> NewWeights;
+    NewWeights.push_back(PredCommon * (SuccCommon + SuccOther) +
+                         PredOther * SuccCommon);
+    NewWeights.push_back(PredOther * SuccOther);
+    // Halve the weights if any of them cannot fit in an uint32_t
+    FitWeights(NewWeights);
+
+    SmallVector<uint32_t, 2> MDWeights(NewWeights.begin(),NewWeights.end());
+    PBI->setMetadata(LLVMContext::MD_prof,
+                     MDBuilder(BI->getContext()).
+                     createBranchWeights(MDWeights));
+  }
+
+  // OtherDest may have phi nodes.  If so, add an entry from PBI's
+  // block that are identical to the entries for BI's block.
+  AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
+
+  // We know that the CommonDest already had an edge from PBI to
+  // it.  If it has PHIs though, the PHIs may have different
+  // entries for BB and PBI's BB.  If so, insert a select to make
+  // them agree.
+  PHINode *PN;
+  for (BasicBlock::iterator II = CommonDest->begin();
+       (PN = dyn_cast<PHINode>(II)); ++II) {
+    Value *BIV = PN->getIncomingValueForBlock(BB);
+    unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+    Value *PBIV = PN->getIncomingValue(PBBIdx);
+    if (BIV != PBIV) {
+      // Insert a select in PBI to pick the right value.
+      Value *NV = cast<SelectInst>
+        (Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName()+".mux"));
+      PN->setIncomingValue(PBBIdx, NV);
+    }
+  }
+
+  DEBUG(dbgs() << "INTO: " << *PBI->getParent());
+  DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+  // This basic block is probably dead.  We know it has at least
+  // one fewer predecessor.
+  return true;
+}
+
+// SimplifyTerminatorOnSelect - Simplifies a terminator by replacing it with a
+// branch to TrueBB if Cond is true or to FalseBB if Cond is false.
+// Takes care of updating the successors and removing the old terminator.
+// Also makes sure not to introduce new successors by assuming that edges to
+// non-successor TrueBBs and FalseBBs aren't reachable.
+static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
+                                       BasicBlock *TrueBB, BasicBlock *FalseBB,
+                                       uint32_t TrueWeight,
+                                       uint32_t FalseWeight){
+  // Remove any superfluous successor edges from the CFG.
+  // First, figure out which successors to preserve.
+  // If TrueBB and FalseBB are equal, only try to preserve one copy of that
+  // successor.
+  BasicBlock *KeepEdge1 = TrueBB;
+  BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : 0;
+
+  // Then remove the rest.
+  for (unsigned I = 0, E = OldTerm->getNumSuccessors(); I != E; ++I) {
+    BasicBlock *Succ = OldTerm->getSuccessor(I);
+    // Make sure only to keep exactly one copy of each edge.
+    if (Succ == KeepEdge1)
+      KeepEdge1 = 0;
+    else if (Succ == KeepEdge2)
+      KeepEdge2 = 0;
+    else
+      Succ->removePredecessor(OldTerm->getParent());
+  }
+
+  IRBuilder<> Builder(OldTerm);
+  Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc());
+
+  // Insert an appropriate new terminator.
+  if ((KeepEdge1 == 0) && (KeepEdge2 == 0)) {
+    if (TrueBB == FalseBB)
+      // We were only looking for one successor, and it was present.
+      // Create an unconditional branch to it.
+      Builder.CreateBr(TrueBB);
+    else {
+      // We found both of the successors we were looking for.
+      // Create a conditional branch sharing the condition of the select.
+      BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+      if (TrueWeight != FalseWeight)
+        NewBI->setMetadata(LLVMContext::MD_prof,
+                           MDBuilder(OldTerm->getContext()).
+                           createBranchWeights(TrueWeight, FalseWeight));
+    }
+  } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
+    // Neither of the selected blocks were successors, so this
+    // terminator must be unreachable.
+    new UnreachableInst(OldTerm->getContext(), OldTerm);
+  } else {
+    // One of the selected values was a successor, but the other wasn't.
+    // Insert an unconditional branch to the one that was found;
+    // the edge to the one that wasn't must be unreachable.
+    if (KeepEdge1 == 0)
+      // Only TrueBB was found.
+      Builder.CreateBr(TrueBB);
+    else
+      // Only FalseBB was found.
+      Builder.CreateBr(FalseBB);
+  }
+
+  EraseTerminatorInstAndDCECond(OldTerm);
+  return true;
+}
+
+// SimplifySwitchOnSelect - Replaces
+//   (switch (select cond, X, Y)) on constant X, Y
+// with a branch - conditional if X and Y lead to distinct BBs,
+// unconditional otherwise.
+static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
+  // Check for constant integer values in the select.
+  ConstantInt *TrueVal = dyn_cast<ConstantInt>(Select->getTrueValue());
+  ConstantInt *FalseVal = dyn_cast<ConstantInt>(Select->getFalseValue());
+  if (!TrueVal || !FalseVal)
+    return false;
+
+  // Find the relevant condition and destinations.
+  Value *Condition = Select->getCondition();
+  BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor();
+  BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor();
+
+  // Get weight for TrueBB and FalseBB.
+  uint32_t TrueWeight = 0, FalseWeight = 0;
+  SmallVector<uint64_t, 8> Weights;
+  bool HasWeights = HasBranchWeights(SI);
+  if (HasWeights) {
+    GetBranchWeights(SI, Weights);
+    if (Weights.size() == 1 + SI->getNumCases()) {
+      TrueWeight = (uint32_t)Weights[SI->findCaseValue(TrueVal).
+                                     getSuccessorIndex()];
+      FalseWeight = (uint32_t)Weights[SI->findCaseValue(FalseVal).
+                                      getSuccessorIndex()];
+    }
+  }
+
+  // Perform the actual simplification.
+  return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB,
+                                    TrueWeight, FalseWeight);
+}
+
+// SimplifyIndirectBrOnSelect - Replaces
+//   (indirectbr (select cond, blockaddress(@fn, BlockA),
+//                             blockaddress(@fn, BlockB)))
+// with
+//   (br cond, BlockA, BlockB).
+static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
+  // Check that both operands of the select are block addresses.
+  BlockAddress *TBA = dyn_cast<BlockAddress>(SI->getTrueValue());
+  BlockAddress *FBA = dyn_cast<BlockAddress>(SI->getFalseValue());
+  if (!TBA || !FBA)
+    return false;
+
+  // Extract the actual blocks.
+  BasicBlock *TrueBB = TBA->getBasicBlock();
+  BasicBlock *FalseBB = FBA->getBasicBlock();
+
+  // Perform the actual simplification.
+  return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB,
+                                    0, 0);
+}
+
+/// TryToSimplifyUncondBranchWithICmpInIt - This is called when we find an icmp
+/// instruction (a seteq/setne with a constant) as the only instruction in a
+/// block that ends with an uncond branch.  We are looking for a very specific
+/// pattern that occurs when "A == 1 || A == 2 || A == 3" gets simplified.  In
+/// this case, we merge the first two "or's of icmp" into a switch, but then the
+/// default value goes to an uncond block with a seteq in it, we get something
+/// like:
+///
+///   switch i8 %A, label %DEFAULT [ i8 1, label %end    i8 2, label %end ]
+/// DEFAULT:
+///   %tmp = icmp eq i8 %A, 92
+///   br label %end
+/// end:
+///   ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ]
+///
+/// We prefer to split the edge to 'end' so that there is a true/false entry to
+/// the PHI, merging the third icmp into the switch.
+static bool TryToSimplifyUncondBranchWithICmpInIt(
+    ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI,
+    const DataLayout *TD) {
+  BasicBlock *BB = ICI->getParent();
+
+  // If the block has any PHIs in it or the icmp has multiple uses, it is too
+  // complex.
+  if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse()) return false;
+
+  Value *V = ICI->getOperand(0);
+  ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1));
+
+  // The pattern we're looking for is where our only predecessor is a switch on
+  // 'V' and this block is the default case for the switch.  In this case we can
+  // fold the compared value into the switch to simplify things.
+  BasicBlock *Pred = BB->getSinglePredecessor();
+  if (Pred == 0 || !isa<SwitchInst>(Pred->getTerminator())) return false;
+
+  SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator());
+  if (SI->getCondition() != V)
+    return false;
+
+  // If BB is reachable on a non-default case, then we simply know the value of
+  // V in this block.  Substitute it and constant fold the icmp instruction
+  // away.
+  if (SI->getDefaultDest() != BB) {
+    ConstantInt *VVal = SI->findCaseDest(BB);
+    assert(VVal && "Should have a unique destination value");
+    ICI->setOperand(0, VVal);
+
+    if (Value *V = SimplifyInstruction(ICI, TD)) {
+      ICI->replaceAllUsesWith(V);
+      ICI->eraseFromParent();
+    }
+    // BB is now empty, so it is likely to simplify away.
+    return SimplifyCFG(BB, TTI, TD) | true;
+  }
+
+  // Ok, the block is reachable from the default dest.  If the constant we're
+  // comparing exists in one of the other edges, then we can constant fold ICI
+  // and zap it.
+  if (SI->findCaseValue(Cst) != SI->case_default()) {
+    Value *V;
+    if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+      V = ConstantInt::getFalse(BB->getContext());
+    else
+      V = ConstantInt::getTrue(BB->getContext());
+
+    ICI->replaceAllUsesWith(V);
+    ICI->eraseFromParent();
+    // BB is now empty, so it is likely to simplify away.
+    return SimplifyCFG(BB, TTI, TD) | true;
+  }
+
+  // The use of the icmp has to be in the 'end' block, by the only PHI node in
+  // the block.
+  BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0);
+  PHINode *PHIUse = dyn_cast<PHINode>(ICI->use_back());
+  if (PHIUse == 0 || PHIUse != &SuccBlock->front() ||
+      isa<PHINode>(++BasicBlock::iterator(PHIUse)))
+    return false;
+
+  // If the icmp is a SETEQ, then the default dest gets false, the new edge gets
+  // true in the PHI.
+  Constant *DefaultCst = ConstantInt::getTrue(BB->getContext());
+  Constant *NewCst     = ConstantInt::getFalse(BB->getContext());
+
+  if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+    std::swap(DefaultCst, NewCst);
+
+  // Replace ICI (which is used by the PHI for the default value) with true or
+  // false depending on if it is EQ or NE.
+  ICI->replaceAllUsesWith(DefaultCst);
+  ICI->eraseFromParent();
+
+  // Okay, the switch goes to this block on a default value.  Add an edge from
+  // the switch to the merge point on the compared value.
+  BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge",
+                                         BB->getParent(), BB);
+  SmallVector<uint64_t, 8> Weights;
+  bool HasWeights = HasBranchWeights(SI);
+  if (HasWeights) {
+    GetBranchWeights(SI, Weights);
+    if (Weights.size() == 1 + SI->getNumCases()) {
+      // Split weight for default case to case for "Cst".
+      Weights[0] = (Weights[0]+1) >> 1;
+      Weights.push_back(Weights[0]);
+
+      SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+      SI->setMetadata(LLVMContext::MD_prof,
+                      MDBuilder(SI->getContext()).
+                      createBranchWeights(MDWeights));
+    }
+  }
+  SI->addCase(Cst, NewBB);
+
+  // NewBB branches to the phi block, add the uncond branch and the phi entry.
+  Builder.SetInsertPoint(NewBB);
+  Builder.SetCurrentDebugLocation(SI->getDebugLoc());
+  Builder.CreateBr(SuccBlock);
+  PHIUse->addIncoming(NewCst, NewBB);
+  return true;
+}
+
+/// SimplifyBranchOnICmpChain - The specified branch is a conditional branch.
+/// Check to see if it is branching on an or/and chain of icmp instructions, and
+/// fold it into a switch instruction if so.
+static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD,
+                                      IRBuilder<> &Builder) {
+  Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
+  if (Cond == 0) return false;
+
+
+  // Change br (X == 0 | X == 1), T, F into a switch instruction.
+  // If this is a bunch of seteq's or'd together, or if it's a bunch of
+  // 'setne's and'ed together, collect them.
+  Value *CompVal = 0;
+  std::vector<ConstantInt*> Values;
+  bool TrueWhenEqual = true;
+  Value *ExtraCase = 0;
+  unsigned UsedICmps = 0;
+
+  if (Cond->getOpcode() == Instruction::Or) {
+    CompVal = GatherConstantCompares(Cond, Values, ExtraCase, TD, true,
+                                     UsedICmps);
+  } else if (Cond->getOpcode() == Instruction::And) {
+    CompVal = GatherConstantCompares(Cond, Values, ExtraCase, TD, false,
+                                     UsedICmps);
+    TrueWhenEqual = false;
+  }
+
+  // If we didn't have a multiply compared value, fail.
+  if (CompVal == 0) return false;
+
+  // Avoid turning single icmps into a switch.
+  if (UsedICmps <= 1)
+    return false;
+
+  // There might be duplicate constants in the list, which the switch
+  // instruction can't handle, remove them now.
+  array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate);
+  Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
+
+  // If Extra was used, we require at least two switch values to do the
+  // transformation.  A switch with one value is just an cond branch.
+  if (ExtraCase && Values.size() < 2) return false;
+
+  // TODO: Preserve branch weight metadata, similarly to how
+  // FoldValueComparisonIntoPredecessors preserves it.
+
+  // Figure out which block is which destination.
+  BasicBlock *DefaultBB = BI->getSuccessor(1);
+  BasicBlock *EdgeBB    = BI->getSuccessor(0);
+  if (!TrueWhenEqual) std::swap(DefaultBB, EdgeBB);
+
+  BasicBlock *BB = BI->getParent();
+
+  DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
+               << " cases into SWITCH.  BB is:\n" << *BB);
+
+  // If there are any extra values that couldn't be folded into the switch
+  // then we evaluate them with an explicit branch first.  Split the block
+  // right before the condbr to handle it.
+  if (ExtraCase) {
+    BasicBlock *NewBB = BB->splitBasicBlock(BI, "switch.early.test");
+    // Remove the uncond branch added to the old block.
+    TerminatorInst *OldTI = BB->getTerminator();
+    Builder.SetInsertPoint(OldTI);
+
+    if (TrueWhenEqual)
+      Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
+    else
+      Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
+
+    OldTI->eraseFromParent();
+
+    // If there are PHI nodes in EdgeBB, then we need to add a new entry to them
+    // for the edge we just added.
+    AddPredecessorToBlock(EdgeBB, BB, NewBB);
+
+    DEBUG(dbgs() << "  ** 'icmp' chain unhandled condition: " << *ExtraCase
+          << "\nEXTRABB = " << *BB);
+    BB = NewBB;
+  }
+
+  Builder.SetInsertPoint(BI);
+  // Convert pointer to int before we switch.
+  if (CompVal->getType()->isPointerTy()) {
+    assert(TD && "Cannot switch on pointer without DataLayout");
+    CompVal = Builder.CreatePtrToInt(CompVal,
+                                     TD->getIntPtrType(CompVal->getContext()),
+                                     "magicptr");
+  }
+
+  // Create the new switch instruction now.
+  SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+
+  // Add all of the 'cases' to the switch instruction.
+  for (unsigned i = 0, e = Values.size(); i != e; ++i)
+    New->addCase(Values[i], EdgeBB);
+
+  // We added edges from PI to the EdgeBB.  As such, if there were any
+  // PHI nodes in EdgeBB, they need entries to be added corresponding to
+  // the number of edges added.
+  for (BasicBlock::iterator BBI = EdgeBB->begin();
+       isa<PHINode>(BBI); ++BBI) {
+    PHINode *PN = cast<PHINode>(BBI);
+    Value *InVal = PN->getIncomingValueForBlock(BB);
+    for (unsigned i = 0, e = Values.size()-1; i != e; ++i)
+      PN->addIncoming(InVal, BB);
+  }
+
+  // Erase the old branch instruction.
+  EraseTerminatorInstAndDCECond(BI);
+
+  DEBUG(dbgs() << "  ** 'icmp' chain result is:\n" << *BB << '\n');
+  return true;
+}
+
+bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
+  // If this is a trivial landing pad that just continues unwinding the caught
+  // exception then zap the landing pad, turning its invokes into calls.
+  BasicBlock *BB = RI->getParent();
+  LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI());
+  if (RI->getValue() != LPInst)
+    // Not a landing pad, or the resume is not unwinding the exception that
+    // caused control to branch here.
+    return false;
+
+  // Check that there are no other instructions except for debug intrinsics.
+  BasicBlock::iterator I = LPInst, E = RI;
+  while (++I != E)
+    if (!isa<DbgInfoIntrinsic>(I))
+      return false;
+
+  // Turn all invokes that unwind here into calls and delete the basic block.
+  bool InvokeRequiresTableEntry = false;
+  bool Changed = false;
+  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
+    InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator());
+
+    if (II->hasFnAttr(Attribute::UWTable)) {
+      // Don't remove an `invoke' instruction if the ABI requires an entry into
+      // the table.
+      InvokeRequiresTableEntry = true;
+      continue;
+    }
+
+    SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
+
+    // Insert a call instruction before the invoke.
+    CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II);
+    Call->takeName(II);
+    Call->setCallingConv(II->getCallingConv());
+    Call->setAttributes(II->getAttributes());
+    Call->setDebugLoc(II->getDebugLoc());
+
+    // Anything that used the value produced by the invoke instruction now uses
+    // the value produced by the call instruction.  Note that we do this even
+    // for void functions and calls with no uses so that the callgraph edge is
+    // updated.
+    II->replaceAllUsesWith(Call);
+    BB->removePredecessor(II->getParent());
+
+    // Insert a branch to the normal destination right before the invoke.
+    BranchInst::Create(II->getNormalDest(), II);
+
+    // Finally, delete the invoke instruction!
+    II->eraseFromParent();
+    Changed = true;
+  }
+
+  if (!InvokeRequiresTableEntry)
+    // The landingpad is now unreachable.  Zap it.
+    BB->eraseFromParent();
+
+  return Changed;
+}
+
+bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
+  BasicBlock *BB = RI->getParent();
+  if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false;
+
+  // Find predecessors that end with branches.
+  SmallVector<BasicBlock*, 8> UncondBranchPreds;
+  SmallVector<BranchInst*, 8> CondBranchPreds;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+    BasicBlock *P = *PI;
+    TerminatorInst *PTI = P->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
+      if (BI->isUnconditional())
+        UncondBranchPreds.push_back(P);
+      else
+        CondBranchPreds.push_back(BI);
+    }
+  }
+
+  // If we found some, do the transformation!
+  if (!UncondBranchPreds.empty() && DupRet) {
+    while (!UncondBranchPreds.empty()) {
+      BasicBlock *Pred = UncondBranchPreds.pop_back_val();
+      DEBUG(dbgs() << "FOLDING: " << *BB
+            << "INTO UNCOND BRANCH PRED: " << *Pred);
+      (void)FoldReturnIntoUncondBranch(RI, BB, Pred);
+    }
+
+    // If we eliminated all predecessors of the block, delete the block now.
+    if (pred_begin(BB) == pred_end(BB))
+      // We know there are no successors, so just nuke the block.
+      BB->eraseFromParent();
+
+    return true;
+  }
+
+  // Check out all of the conditional branches going to this return
+  // instruction.  If any of them just select between returns, change the
+  // branch itself into a select/return pair.
+  while (!CondBranchPreds.empty()) {
+    BranchInst *BI = CondBranchPreds.pop_back_val();
+
+    // Check to see if the non-BB successor is also a return block.
+    if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
+        isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
+        SimplifyCondBranchToTwoReturns(BI, Builder))
+      return true;
+  }
+  return false;
+}
+
+bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
+  BasicBlock *BB = UI->getParent();
+
+  bool Changed = false;
+
+  // If there are any instructions immediately before the unreachable that can
+  // be removed, do so.
+  while (UI != BB->begin()) {
+    BasicBlock::iterator BBI = UI;
+    --BBI;
+    // Do not delete instructions that can have side effects which might cause
+    // the unreachable to not be reachable; specifically, calls and volatile
+    // operations may have this effect.
+    if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break;
+
+    if (BBI->mayHaveSideEffects()) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+        if (SI->isVolatile())
+          break;
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+        if (LI->isVolatile())
+          break;
+      } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(BBI)) {
+        if (RMWI->isVolatile())
+          break;
+      } else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) {
+        if (CXI->isVolatile())
+          break;
+      } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) &&
+                 !isa<LandingPadInst>(BBI)) {
+        break;
+      }
+      // Note that deleting LandingPad's here is in fact okay, although it
+      // involves a bit of subtle reasoning. If this inst is a LandingPad,
+      // all the predecessors of this block will be the unwind edges of Invokes,
+      // and we can therefore guarantee this block will be erased.
+    }
+
+    // Delete this instruction (any uses are guaranteed to be dead)
+    if (!BBI->use_empty())
+      BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+    BBI->eraseFromParent();
+    Changed = true;
+  }
+
+  // If the unreachable instruction is the first in the block, take a gander
+  // at all of the predecessors of this instruction, and simplify them.
+  if (&BB->front() != UI) return Changed;
+
+  SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
+  for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+    TerminatorInst *TI = Preds[i]->getTerminator();
+    IRBuilder<> Builder(TI);
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (BI->isUnconditional()) {
+        if (BI->getSuccessor(0) == BB) {
+          new UnreachableInst(TI->getContext(), TI);
+          TI->eraseFromParent();
+          Changed = true;
+        }
+      } else {
+        if (BI->getSuccessor(0) == BB) {
+          Builder.CreateBr(BI->getSuccessor(1));
+          EraseTerminatorInstAndDCECond(BI);
+        } else if (BI->getSuccessor(1) == BB) {
+          Builder.CreateBr(BI->getSuccessor(0));
+          EraseTerminatorInstAndDCECond(BI);
+          Changed = true;
+        }
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+           i != e; ++i)
+        if (i.getCaseSuccessor() == BB) {
+          BB->removePredecessor(SI->getParent());
+          SI->removeCase(i);
+          --i; --e;
+          Changed = true;
+        }
+      // If the default value is unreachable, figure out the most popular
+      // destination and make it the default.
+      if (SI->getDefaultDest() == BB) {
+        std::map<BasicBlock*, std::pair<unsigned, unsigned> > Popularity;
+        for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+             i != e; ++i) {
+          std::pair<unsigned, unsigned> &entry =
+              Popularity[i.getCaseSuccessor()];
+          if (entry.first == 0) {
+            entry.first = 1;
+            entry.second = i.getCaseIndex();
+          } else {
+            entry.first++;
+          }
+        }
+
+        // Find the most popular block.
+        unsigned MaxPop = 0;
+        unsigned MaxIndex = 0;
+        BasicBlock *MaxBlock = 0;
+        for (std::map<BasicBlock*, std::pair<unsigned, unsigned> >::iterator
+             I = Popularity.begin(), E = Popularity.end(); I != E; ++I) {
+          if (I->second.first > MaxPop ||
+              (I->second.first == MaxPop && MaxIndex > I->second.second)) {
+            MaxPop = I->second.first;
+            MaxIndex = I->second.second;
+            MaxBlock = I->first;
+          }
+        }
+        if (MaxBlock) {
+          // Make this the new default, allowing us to delete any explicit
+          // edges to it.
+          SI->setDefaultDest(MaxBlock);
+          Changed = true;
+
+          // If MaxBlock has phinodes in it, remove MaxPop-1 entries from
+          // it.
+          if (isa<PHINode>(MaxBlock->begin()))
+            for (unsigned i = 0; i != MaxPop-1; ++i)
+              MaxBlock->removePredecessor(SI->getParent());
+
+          for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+               i != e; ++i)
+            if (i.getCaseSuccessor() == MaxBlock) {
+              SI->removeCase(i);
+              --i; --e;
+            }
+        }
+      }
+    } else if (InvokeInst *II = dyn_cast<InvokeInst>(TI)) {
+      if (II->getUnwindDest() == BB) {
+        // Convert the invoke to a call instruction.  This would be a good
+        // place to note that the call does not throw though.
+        BranchInst *BI = Builder.CreateBr(II->getNormalDest());
+        II->removeFromParent();   // Take out of symbol table
+
+        // Insert the call now...
+        SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3);
+        Builder.SetInsertPoint(BI);
+        CallInst *CI = Builder.CreateCall(II->getCalledValue(),
+                                          Args, II->getName());
+        CI->setCallingConv(II->getCallingConv());
+        CI->setAttributes(II->getAttributes());
+        // If the invoke produced a value, the call does now instead.
+        II->replaceAllUsesWith(CI);
+        delete II;
+        Changed = true;
+      }
+    }
+  }
+
+  // If this block is now dead, remove it.
+  if (pred_begin(BB) == pred_end(BB) &&
+      BB != &BB->getParent()->getEntryBlock()) {
+    // We know there are no successors, so just nuke the block.
+    BB->eraseFromParent();
+    return true;
+  }
+
+  return Changed;
+}
+
+/// TurnSwitchRangeIntoICmp - Turns a switch with that contains only a
+/// integer range comparison into a sub, an icmp and a branch.
+static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
+  assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+  // Make sure all cases point to the same destination and gather the values.
+  SmallVector<ConstantInt *, 16> Cases;
+  SwitchInst::CaseIt I = SI->case_begin();
+  Cases.push_back(I.getCaseValue());
+  SwitchInst::CaseIt PrevI = I++;
+  for (SwitchInst::CaseIt E = SI->case_end(); I != E; PrevI = I++) {
+    if (PrevI.getCaseSuccessor() != I.getCaseSuccessor())
+      return false;
+    Cases.push_back(I.getCaseValue());
+  }
+  assert(Cases.size() == SI->getNumCases() && "Not all cases gathered");
+
+  // Sort the case values, then check if they form a range we can transform.
+  array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate);
+  for (unsigned I = 1, E = Cases.size(); I != E; ++I) {
+    if (Cases[I-1]->getValue() != Cases[I]->getValue()+1)
+      return false;
+  }
+
+  Constant *Offset = ConstantExpr::getNeg(Cases.back());
+  Constant *NumCases = ConstantInt::get(Offset->getType(), SI->getNumCases());
+
+  Value *Sub = SI->getCondition();
+  if (!Offset->isNullValue())
+    Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off");
+  Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
+  BranchInst *NewBI = Builder.CreateCondBr(
+      Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest());
+
+  // Update weight for the newly-created conditional branch.
+  SmallVector<uint64_t, 8> Weights;
+  bool HasWeights = HasBranchWeights(SI);
+  if (HasWeights) {
+    GetBranchWeights(SI, Weights);
+    if (Weights.size() == 1 + SI->getNumCases()) {
+      // Combine all weights for the cases to be the true weight of NewBI.
+      // We assume that the sum of all weights for a Terminator can fit into 32
+      // bits.
+      uint32_t NewTrueWeight = 0;
+      for (unsigned I = 1, E = Weights.size(); I != E; ++I)
+        NewTrueWeight += (uint32_t)Weights[I];
+      NewBI->setMetadata(LLVMContext::MD_prof,
+                         MDBuilder(SI->getContext()).
+                         createBranchWeights(NewTrueWeight,
+                                             (uint32_t)Weights[0]));
+    }
+  }
+
+  // Prune obsolete incoming values off the successor's PHI nodes.
+  for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin();
+       isa<PHINode>(BBI); ++BBI) {
+    for (unsigned I = 0, E = SI->getNumCases()-1; I != E; ++I)
+      cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+  }
+  SI->eraseFromParent();
+
+  return true;
+}
+
+/// EliminateDeadSwitchCases - Compute masked bits for the condition of a switch
+/// and use it to remove dead cases.
+static bool EliminateDeadSwitchCases(SwitchInst *SI) {
+  Value *Cond = SI->getCondition();
+  unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth();
+  APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
+  ComputeMaskedBits(Cond, KnownZero, KnownOne);
+
+  // Gather dead cases.
+  SmallVector<ConstantInt*, 8> DeadCases;
+  for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) {
+    if ((I.getCaseValue()->getValue() & KnownZero) != 0 ||
+        (I.getCaseValue()->getValue() & KnownOne) != KnownOne) {
+      DeadCases.push_back(I.getCaseValue());
+      DEBUG(dbgs() << "SimplifyCFG: switch case '"
+                   << I.getCaseValue() << "' is dead.\n");
+    }
+  }
+
+  SmallVector<uint64_t, 8> Weights;
+  bool HasWeight = HasBranchWeights(SI);
+  if (HasWeight) {
+    GetBranchWeights(SI, Weights);
+    HasWeight = (Weights.size() == 1 + SI->getNumCases());
+  }
+
+  // Remove dead cases from the switch.
+  for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) {
+    SwitchInst::CaseIt Case = SI->findCaseValue(DeadCases[I]);
+    assert(Case != SI->case_default() &&
+           "Case was not found. Probably mistake in DeadCases forming.");
+    if (HasWeight) {
+      std::swap(Weights[Case.getCaseIndex()+1], Weights.back());
+      Weights.pop_back();
+    }
+
+    // Prune unused values from PHI nodes.
+    Case.getCaseSuccessor()->removePredecessor(SI->getParent());
+    SI->removeCase(Case);
+  }
+  if (HasWeight) {
+    SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+    SI->setMetadata(LLVMContext::MD_prof,
+                    MDBuilder(SI->getParent()->getContext()).
+                    createBranchWeights(MDWeights));
+  }
+
+  return !DeadCases.empty();
+}
+
+/// FindPHIForConditionForwarding - If BB would be eligible for simplification
+/// by TryToSimplifyUncondBranchFromEmptyBlock (i.e. it is empty and terminated
+/// by an unconditional branch), look at the phi node for BB in the successor
+/// block and see if the incoming value is equal to CaseValue. If so, return
+/// the phi node, and set PhiIndex to BB's index in the phi node.
+static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
+                                              BasicBlock *BB,
+                                              int *PhiIndex) {
+  if (BB->getFirstNonPHIOrDbg() != BB->getTerminator())
+    return NULL; // BB must be empty to be a candidate for simplification.
+  if (!BB->getSinglePredecessor())
+    return NULL; // BB must be dominated by the switch.
+
+  BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!Branch || !Branch->isUnconditional())
+    return NULL; // Terminator must be unconditional branch.
+
+  BasicBlock *Succ = Branch->getSuccessor(0);
+
+  BasicBlock::iterator I = Succ->begin();
+  while (PHINode *PHI = dyn_cast<PHINode>(I++)) {
+    int Idx = PHI->getBasicBlockIndex(BB);
+    assert(Idx >= 0 && "PHI has no entry for predecessor?");
+
+    Value *InValue = PHI->getIncomingValue(Idx);
+    if (InValue != CaseValue) continue;
+
+    *PhiIndex = Idx;
+    return PHI;
+  }
+
+  return NULL;
+}
+
+/// ForwardSwitchConditionToPHI - Try to forward the condition of a switch
+/// instruction to a phi node dominated by the switch, if that would mean that
+/// some of the destination blocks of the switch can be folded away.
+/// Returns true if a change is made.
+static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
+  typedef DenseMap<PHINode*, SmallVector<int,4> > ForwardingNodesMap;
+  ForwardingNodesMap ForwardingNodes;
+
+  for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E; ++I) {
+    ConstantInt *CaseValue = I.getCaseValue();
+    BasicBlock *CaseDest = I.getCaseSuccessor();
+
+    int PhiIndex;
+    PHINode *PHI = FindPHIForConditionForwarding(CaseValue, CaseDest,
+                                                 &PhiIndex);
+    if (!PHI) continue;
+
+    ForwardingNodes[PHI].push_back(PhiIndex);
+  }
+
+  bool Changed = false;
+
+  for (ForwardingNodesMap::iterator I = ForwardingNodes.begin(),
+       E = ForwardingNodes.end(); I != E; ++I) {
+    PHINode *Phi = I->first;
+    SmallVector<int,4> &Indexes = I->second;
+
+    if (Indexes.size() < 2) continue;
+
+    for (size_t I = 0, E = Indexes.size(); I != E; ++I)
+      Phi->setIncomingValue(Indexes[I], SI->getCondition());
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+/// ValidLookupTableConstant - Return true if the backend will be able to handle
+/// initializing an array of constants like C.
+static bool ValidLookupTableConstant(Constant *C) {
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+    return CE->isGEPWithNoNotionalOverIndexing();
+
+  return isa<ConstantFP>(C) ||
+      isa<ConstantInt>(C) ||
+      isa<ConstantPointerNull>(C) ||
+      isa<GlobalValue>(C) ||
+      isa<UndefValue>(C);
+}
+
+/// LookupConstant - If V is a Constant, return it. Otherwise, try to look up
+/// its constant value in ConstantPool, returning 0 if it's not there.
+static Constant *LookupConstant(Value *V,
+                         const SmallDenseMap<Value*, Constant*>& ConstantPool) {
+  if (Constant *C = dyn_cast<Constant>(V))
+    return C;
+  return ConstantPool.lookup(V);
+}
+
+/// ConstantFold - Try to fold instruction I into a constant. This works for
+/// simple instructions such as binary operations where both operands are
+/// constant or can be replaced by constants from the ConstantPool. Returns the
+/// resulting constant on success, 0 otherwise.
+static Constant *ConstantFold(Instruction *I,
+                         const SmallDenseMap<Value*, Constant*>& ConstantPool) {
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+    Constant *A = LookupConstant(BO->getOperand(0), ConstantPool);
+    if (!A)
+      return 0;
+    Constant *B = LookupConstant(BO->getOperand(1), ConstantPool);
+    if (!B)
+      return 0;
+    return ConstantExpr::get(BO->getOpcode(), A, B);
+  }
+
+  if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+    Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
+    if (!A)
+      return 0;
+    Constant *B = LookupConstant(I->getOperand(1), ConstantPool);
+    if (!B)
+      return 0;
+    return ConstantExpr::getCompare(Cmp->getPredicate(), A, B);
+  }
+
+  if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
+    Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
+    if (!A)
+      return 0;
+    if (A->isAllOnesValue())
+      return LookupConstant(Select->getTrueValue(), ConstantPool);
+    if (A->isNullValue())
+      return LookupConstant(Select->getFalseValue(), ConstantPool);
+    return 0;
+  }
+
+  if (CastInst *Cast = dyn_cast<CastInst>(I)) {
+    Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
+    if (!A)
+      return 0;
+    return ConstantExpr::getCast(Cast->getOpcode(), A, Cast->getDestTy());
+  }
+
+  return 0;
+}
+
+/// GetCaseResults - Try to determine the resulting constant values in phi nodes
+/// at the common destination basic block, *CommonDest, for one of the case
+/// destionations CaseDest corresponding to value CaseVal (0 for the default
+/// case), of a switch instruction SI.
+static bool GetCaseResults(SwitchInst *SI,
+                           ConstantInt *CaseVal,
+                           BasicBlock *CaseDest,
+                           BasicBlock **CommonDest,
+                           SmallVector<std::pair<PHINode*,Constant*>, 4> &Res) {
+  // The block from which we enter the common destination.
+  BasicBlock *Pred = SI->getParent();
+
+  // If CaseDest is empty except for some side-effect free instructions through
+  // which we can constant-propagate the CaseVal, continue to its successor.
+  SmallDenseMap<Value*, Constant*> ConstantPool;
+  ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
+  for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E;
+       ++I) {
+    if (TerminatorInst *T = dyn_cast<TerminatorInst>(I)) {
+      // If the terminator is a simple branch, continue to the next block.
+      if (T->getNumSuccessors() != 1)
+        return false;
+      Pred = CaseDest;
+      CaseDest = T->getSuccessor(0);
+    } else if (isa<DbgInfoIntrinsic>(I)) {
+      // Skip debug intrinsic.
+      continue;
+    } else if (Constant *C = ConstantFold(I, ConstantPool)) {
+      // Instruction is side-effect free and constant.
+      ConstantPool.insert(std::make_pair(I, C));
+    } else {
+      break;
+    }
+  }
+
+  // If we did not have a CommonDest before, use the current one.
+  if (!*CommonDest)
+    *CommonDest = CaseDest;
+  // If the destination isn't the common one, abort.
+  if (CaseDest != *CommonDest)
+    return false;
+
+  // Get the values for this case from phi nodes in the destination block.
+  BasicBlock::iterator I = (*CommonDest)->begin();
+  while (PHINode *PHI = dyn_cast<PHINode>(I++)) {
+    int Idx = PHI->getBasicBlockIndex(Pred);
+    if (Idx == -1)
+      continue;
+
+    Constant *ConstVal = LookupConstant(PHI->getIncomingValue(Idx),
+                                        ConstantPool);
+    if (!ConstVal)
+      return false;
+
+    // Note: If the constant comes from constant-propagating the case value
+    // through the CaseDest basic block, it will be safe to remove the
+    // instructions in that block. They cannot be used (except in the phi nodes
+    // we visit) outside CaseDest, because that block does not dominate its
+    // successor. If it did, we would not be in this phi node.
+
+    // Be conservative about which kinds of constants we support.
+    if (!ValidLookupTableConstant(ConstVal))
+      return false;
+
+    Res.push_back(std::make_pair(PHI, ConstVal));
+  }
+
+  return true;
+}
+
+namespace {
+  /// SwitchLookupTable - This class represents a lookup table that can be used
+  /// to replace a switch.
+  class SwitchLookupTable {
+  public:
+    /// SwitchLookupTable - Create a lookup table to use as a switch replacement
+    /// with the contents of Values, using DefaultValue to fill any holes in the
+    /// table.
+    SwitchLookupTable(Module &M,
+                      uint64_t TableSize,
+                      ConstantInt *Offset,
+               const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values,
+                      Constant *DefaultValue,
+                      const DataLayout *TD);
+
+    /// BuildLookup - Build instructions with Builder to retrieve the value at
+    /// the position given by Index in the lookup table.
+    Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
+
+    /// WouldFitInRegister - Return true if a table with TableSize elements of
+    /// type ElementType would fit in a target-legal register.
+    static bool WouldFitInRegister(const DataLayout *TD,
+                                   uint64_t TableSize,
+                                   const Type *ElementType);
+
+  private:
+    // Depending on the contents of the table, it can be represented in
+    // different ways.
+    enum {
+      // For tables where each element contains the same value, we just have to
+      // store that single value and return it for each lookup.
+      SingleValueKind,
+
+      // For small tables with integer elements, we can pack them into a bitmap
+      // that fits into a target-legal register. Values are retrieved by
+      // shift and mask operations.
+      BitMapKind,
+
+      // The table is stored as an array of values. Values are retrieved by load
+      // instructions from the table.
+      ArrayKind
+    } Kind;
+
+    // For SingleValueKind, this is the single value.
+    Constant *SingleValue;
+
+    // For BitMapKind, this is the bitmap.
+    ConstantInt *BitMap;
+    IntegerType *BitMapElementTy;
+
+    // For ArrayKind, this is the array.
+    GlobalVariable *Array;
+  };
+}
+
+SwitchLookupTable::SwitchLookupTable(Module &M,
+                                     uint64_t TableSize,
+                                     ConstantInt *Offset,
+               const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values,
+                                     Constant *DefaultValue,
+                                     const DataLayout *TD)
+    : SingleValue(0), BitMap(0), BitMapElementTy(0), Array(0) {
+  assert(Values.size() && "Can't build lookup table without values!");
+  assert(TableSize >= Values.size() && "Can't fit values in table!");
+
+  // If all values in the table are equal, this is that value.
+  SingleValue = Values.begin()->second;
+
+  // Build up the table contents.
+  SmallVector<Constant*, 64> TableContents(TableSize);
+  for (size_t I = 0, E = Values.size(); I != E; ++I) {
+    ConstantInt *CaseVal = Values[I].first;
+    Constant *CaseRes = Values[I].second;
+    assert(CaseRes->getType() == DefaultValue->getType());
+
+    uint64_t Idx = (CaseVal->getValue() - Offset->getValue())
+                   .getLimitedValue();
+    TableContents[Idx] = CaseRes;
+
+    if (CaseRes != SingleValue)
+      SingleValue = 0;
+  }
+
+  // Fill in any holes in the table with the default result.
+  if (Values.size() < TableSize) {
+    for (uint64_t I = 0; I < TableSize; ++I) {
+      if (!TableContents[I])
+        TableContents[I] = DefaultValue;
+    }
+
+    if (DefaultValue != SingleValue)
+      SingleValue = 0;
+  }
+
+  // If each element in the table contains the same value, we only need to store
+  // that single value.
+  if (SingleValue) {
+    Kind = SingleValueKind;
+    return;
+  }
+
+  // If the type is integer and the table fits in a register, build a bitmap.
+  if (WouldFitInRegister(TD, TableSize, DefaultValue->getType())) {
+    IntegerType *IT = cast<IntegerType>(DefaultValue->getType());
+    APInt TableInt(TableSize * IT->getBitWidth(), 0);
+    for (uint64_t I = TableSize; I > 0; --I) {
+      TableInt <<= IT->getBitWidth();
+      // Insert values into the bitmap. Undef values are set to zero.
+      if (!isa<UndefValue>(TableContents[I - 1])) {
+        ConstantInt *Val = cast<ConstantInt>(TableContents[I - 1]);
+        TableInt |= Val->getValue().zext(TableInt.getBitWidth());
+      }
+    }
+    BitMap = ConstantInt::get(M.getContext(), TableInt);
+    BitMapElementTy = IT;
+    Kind = BitMapKind;
+    ++NumBitMaps;
+    return;
+  }
+
+  // Store the table in an array.
+  ArrayType *ArrayTy = ArrayType::get(DefaultValue->getType(), TableSize);
+  Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
+
+  Array = new GlobalVariable(M, ArrayTy, /*constant=*/ true,
+                             GlobalVariable::PrivateLinkage,
+                             Initializer,
+                             "switch.table");
+  Array->setUnnamedAddr(true);
+  Kind = ArrayKind;
+}
+
+Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
+  switch (Kind) {
+    case SingleValueKind:
+      return SingleValue;
+    case BitMapKind: {
+      // Type of the bitmap (e.g. i59).
+      IntegerType *MapTy = BitMap->getType();
+
+      // Cast Index to the same type as the bitmap.
+      // Note: The Index is <= the number of elements in the table, so
+      // truncating it to the width of the bitmask is safe.
+      Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
+
+      // Multiply the shift amount by the element width.
+      ShiftAmt = Builder.CreateMul(ShiftAmt,
+                      ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
+                                   "switch.shiftamt");
+
+      // Shift down.
+      Value *DownShifted = Builder.CreateLShr(BitMap, ShiftAmt,
+                                              "switch.downshift");
+      // Mask off.
+      return Builder.CreateTrunc(DownShifted, BitMapElementTy,
+                                 "switch.masked");
+    }
+    case ArrayKind: {
+      Value *GEPIndices[] = { Builder.getInt32(0), Index };
+      Value *GEP = Builder.CreateInBoundsGEP(Array, GEPIndices,
+                                             "switch.gep");
+      return Builder.CreateLoad(GEP, "switch.load");
+    }
+  }
+  llvm_unreachable("Unknown lookup table kind!");
+}
+
+bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD,
+                                           uint64_t TableSize,
+                                           const Type *ElementType) {
+  if (!TD)
+    return false;
+  const IntegerType *IT = dyn_cast<IntegerType>(ElementType);
+  if (!IT)
+    return false;
+  // FIXME: If the type is wider than it needs to be, e.g. i8 but all values
+  // are <= 15, we could try to narrow the type.
+
+  // Avoid overflow, fitsInLegalInteger uses unsigned int for the width.
+  if (TableSize >= UINT_MAX/IT->getBitWidth())
+    return false;
+  return TD->fitsInLegalInteger(TableSize * IT->getBitWidth());
+}
+
+/// ShouldBuildLookupTable - Determine whether a lookup table should be built
+/// for this switch, based on the number of caes, size of the table and the
+/// types of the results.
+static bool ShouldBuildLookupTable(SwitchInst *SI,
+                                   uint64_t TableSize,
+                                   const TargetTransformInfo &TTI,
+                                   const DataLayout *TD,
+                            const SmallDenseMap<PHINode*, Type*>& ResultTypes) {
+  if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
+    return false; // TableSize overflowed, or mul below might overflow.
+
+  bool AllTablesFitInRegister = true;
+  bool HasIllegalType = false;
+  for (SmallDenseMap<PHINode*, Type*>::const_iterator I = ResultTypes.begin(),
+       E = ResultTypes.end(); I != E; ++I) {
+    Type *Ty = I->second;
+
+    // Saturate this flag to true.
+    HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
+
+    // Saturate this flag to false.
+    AllTablesFitInRegister = AllTablesFitInRegister &&
+      SwitchLookupTable::WouldFitInRegister(TD, TableSize, Ty);
+
+    // If both flags saturate, we're done. NOTE: This *only* works with
+    // saturating flags, and all flags have to saturate first due to the
+    // non-deterministic behavior of iterating over a dense map.
+    if (HasIllegalType && !AllTablesFitInRegister)
+      break;
+  }
+
+  // If each table would fit in a register, we should build it anyway.
+  if (AllTablesFitInRegister)
+    return true;
+
+  // Don't build a table that doesn't fit in-register if it has illegal types.
+  if (HasIllegalType)
+    return false;
+
+  // The table density should be at least 40%. This is the same criterion as for
+  // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
+  // FIXME: Find the best cut-off.
+  return SI->getNumCases() * 10 >= TableSize * 4;
+}
+
+/// SwitchToLookupTable - If the switch is only used to initialize one or more
+/// phi nodes in a common successor block with different constant values,
+/// replace the switch with lookup tables.
+static bool SwitchToLookupTable(SwitchInst *SI,
+                                IRBuilder<> &Builder,
+                                const TargetTransformInfo &TTI,
+                                const DataLayout* TD) {
+  assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+  // Only build lookup table when we have a target that supports it.
+  if (!TTI.shouldBuildLookupTables())
+    return false;
+
+  // FIXME: If the switch is too sparse for a lookup table, perhaps we could
+  // split off a dense part and build a lookup table for that.
+
+  // FIXME: This creates arrays of GEPs to constant strings, which means each
+  // GEP needs a runtime relocation in PIC code. We should just build one big
+  // string and lookup indices into that.
+
+  // Ignore the switch if the number of cases is too small.
+  // This is similar to the check when building jump tables in
+  // SelectionDAGBuilder::handleJTSwitchCase.
+  // FIXME: Determine the best cut-off.
+  if (SI->getNumCases() < 4)
+    return false;
+
+  // Figure out the corresponding result for each case value and phi node in the
+  // common destination, as well as the the min and max case values.
+  assert(SI->case_begin() != SI->case_end());
+  SwitchInst::CaseIt CI = SI->case_begin();
+  ConstantInt *MinCaseVal = CI.getCaseValue();
+  ConstantInt *MaxCaseVal = CI.getCaseValue();
+
+  BasicBlock *CommonDest = 0;
+  typedef SmallVector<std::pair<ConstantInt*, Constant*>, 4> ResultListTy;
+  SmallDenseMap<PHINode*, ResultListTy> ResultLists;
+  SmallDenseMap<PHINode*, Constant*> DefaultResults;
+  SmallDenseMap<PHINode*, Type*> ResultTypes;
+  SmallVector<PHINode*, 4> PHIs;
+
+  for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
+    ConstantInt *CaseVal = CI.getCaseValue();
+    if (CaseVal->getValue().slt(MinCaseVal->getValue()))
+      MinCaseVal = CaseVal;
+    if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
+      MaxCaseVal = CaseVal;
+
+    // Resulting value at phi nodes for this case value.
+    typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy;
+    ResultsTy Results;
+    if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
+                        Results))
+      return false;
+
+    // Append the result from this case to the list for each phi.
+    for (ResultsTy::iterator I = Results.begin(), E = Results.end(); I!=E; ++I) {
+      if (!ResultLists.count(I->first))
+        PHIs.push_back(I->first);
+      ResultLists[I->first].push_back(std::make_pair(CaseVal, I->second));
+    }
+  }
+
+  // Get the resulting values for the default case.
+  SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
+  if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest,
+                      DefaultResultsList))
+    return false;
+  for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) {
+    PHINode *PHI = DefaultResultsList[I].first;
+    Constant *Result = DefaultResultsList[I].second;
+    DefaultResults[PHI] = Result;
+    ResultTypes[PHI] = Result->getType();
+  }
+
+  APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
+  uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
+  if (!ShouldBuildLookupTable(SI, TableSize, TTI, TD, ResultTypes))
+    return false;
+
+  // Create the BB that does the lookups.
+  Module &Mod = *CommonDest->getParent()->getParent();
+  BasicBlock *LookupBB = BasicBlock::Create(Mod.getContext(),
+                                            "switch.lookup",
+                                            CommonDest->getParent(),
+                                            CommonDest);
+
+  // Check whether the condition value is within the case range, and branch to
+  // the new BB.
+  Builder.SetInsertPoint(SI);
+  Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
+                                        "switch.tableidx");
+  Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
+      MinCaseVal->getType(), TableSize));
+  Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+
+  // Populate the BB that does the lookups.
+  Builder.SetInsertPoint(LookupBB);
+  bool ReturnedEarly = false;
+  for (size_t I = 0, E = PHIs.size(); I != E; ++I) {
+    PHINode *PHI = PHIs[I];
+
+    SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultLists[PHI],
+                            DefaultResults[PHI], TD);
+
+    Value *Result = Table.BuildLookup(TableIndex, Builder);
+
+    // If the result is used to return immediately from the function, we want to
+    // do that right here.
+    if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin()) &&
+        *PHI->use_begin() == CommonDest->getFirstNonPHIOrDbg()) {
+      Builder.CreateRet(Result);
+      ReturnedEarly = true;
+      break;
+    }
+
+    PHI->addIncoming(Result, LookupBB);
+  }
+
+  if (!ReturnedEarly)
+    Builder.CreateBr(CommonDest);
+
+  // Remove the switch.
+  for (unsigned i = 0; i < SI->getNumSuccessors(); ++i) {
+    BasicBlock *Succ = SI->getSuccessor(i);
+    if (Succ == SI->getDefaultDest()) continue;
+    Succ->removePredecessor(SI->getParent());
+  }
+  SI->eraseFromParent();
+
+  ++NumLookupTables;
+  return true;
+}
+
+bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
+  BasicBlock *BB = SI->getParent();
+
+  if (isValueEqualityComparison(SI)) {
+    // If we only have one predecessor, and if it is a branch on this value,
+    // see if that predecessor totally determines the outcome of this switch.
+    if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+      if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
+        return SimplifyCFG(BB, TTI, TD) | true;
+
+    Value *Cond = SI->getCondition();
+    if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
+      if (SimplifySwitchOnSelect(SI, Select))
+        return SimplifyCFG(BB, TTI, TD) | true;
+
+    // If the block only contains the switch, see if we can fold the block
+    // away into any preds.
+    BasicBlock::iterator BBI = BB->begin();
+    // Ignore dbg intrinsics.
+    while (isa<DbgInfoIntrinsic>(BBI))
+      ++BBI;
+    if (SI == &*BBI)
+      if (FoldValueComparisonIntoPredecessors(SI, Builder))
+        return SimplifyCFG(BB, TTI, TD) | true;
+  }
+
+  // Try to transform the switch into an icmp and a branch.
+  if (TurnSwitchRangeIntoICmp(SI, Builder))
+    return SimplifyCFG(BB, TTI, TD) | true;
+
+  // Remove unreachable cases.
+  if (EliminateDeadSwitchCases(SI))
+    return SimplifyCFG(BB, TTI, TD) | true;
+
+  if (ForwardSwitchConditionToPHI(SI))
+    return SimplifyCFG(BB, TTI, TD) | true;
+
+  if (SwitchToLookupTable(SI, Builder, TTI, TD))
+    return SimplifyCFG(BB, TTI, TD) | true;
+
+  return false;
+}
+
+bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
+  BasicBlock *BB = IBI->getParent();
+  bool Changed = false;
+
+  // Eliminate redundant destinations.
+  SmallPtrSet<Value *, 8> Succs;
+  for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+    BasicBlock *Dest = IBI->getDestination(i);
+    if (!Dest->hasAddressTaken() || !Succs.insert(Dest)) {
+      Dest->removePredecessor(BB);
+      IBI->removeDestination(i);
+      --i; --e;
+      Changed = true;
+    }
+  }
+
+  if (IBI->getNumDestinations() == 0) {
+    // If the indirectbr has no successors, change it to unreachable.
+    new UnreachableInst(IBI->getContext(), IBI);
+    EraseTerminatorInstAndDCECond(IBI);
+    return true;
+  }
+
+  if (IBI->getNumDestinations() == 1) {
+    // If the indirectbr has one successor, change it to a direct branch.
+    BranchInst::Create(IBI->getDestination(0), IBI);
+    EraseTerminatorInstAndDCECond(IBI);
+    return true;
+  }
+
+  if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
+    if (SimplifyIndirectBrOnSelect(IBI, SI))
+      return SimplifyCFG(BB, TTI, TD) | true;
+  }
+  return Changed;
+}
+
+bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
+  BasicBlock *BB = BI->getParent();
+
+  if (SinkCommon && SinkThenElseCodeToEnd(BI))
+    return true;
+
+  // If the Terminator is the only non-phi instruction, simplify the block.
+  BasicBlock::iterator I = BB->getFirstNonPHIOrDbgOrLifetime();
+  if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
+      TryToSimplifyUncondBranchFromEmptyBlock(BB))
+    return true;
+
+  // If the only instruction in the block is a seteq/setne comparison
+  // against a constant, try to simplify the block.
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
+    if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
+      for (++I; isa<DbgInfoIntrinsic>(I); ++I)
+        ;
+      if (I->isTerminator() &&
+          TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI, TD))
+        return true;
+    }
+
+  // If this basic block is ONLY a compare and a branch, and if a predecessor
+  // branches to us and our successor, fold the comparison into the
+  // predecessor and use logical operations to update the incoming value
+  // for PHI nodes in common successor.
+  if (FoldBranchToCommonDest(BI))
+    return SimplifyCFG(BB, TTI, TD) | true;
+  return false;
+}
+
+
+bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
+  BasicBlock *BB = BI->getParent();
+
+  // Conditional branch
+  if (isValueEqualityComparison(BI)) {
+    // If we only have one predecessor, and if it is a branch on this value,
+    // see if that predecessor totally determines the outcome of this
+    // switch.
+    if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+      if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
+        return SimplifyCFG(BB, TTI, TD) | true;
+
+    // This block must be empty, except for the setcond inst, if it exists.
+    // Ignore dbg intrinsics.
+    BasicBlock::iterator I = BB->begin();
+    // Ignore dbg intrinsics.
+    while (isa<DbgInfoIntrinsic>(I))
+      ++I;
+    if (&*I == BI) {
+      if (FoldValueComparisonIntoPredecessors(BI, Builder))
+        return SimplifyCFG(BB, TTI, TD) | true;
+    } else if (&*I == cast<Instruction>(BI->getCondition())){
+      ++I;
+      // Ignore dbg intrinsics.
+      while (isa<DbgInfoIntrinsic>(I))
+        ++I;
+      if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
+        return SimplifyCFG(BB, TTI, TD) | true;
+    }
+  }
+
+  // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
+  if (SimplifyBranchOnICmpChain(BI, TD, Builder))
+    return true;
+
+  // If this basic block is ONLY a compare and a branch, and if a predecessor
+  // branches to us and one of our successors, fold the comparison into the
+  // predecessor and use logical operations to pick the right destination.
+  if (FoldBranchToCommonDest(BI))
+    return SimplifyCFG(BB, TTI, TD) | true;
+
+  // We have a conditional branch to two blocks that are only reachable
+  // from BI.  We know that the condbr dominates the two blocks, so see if
+  // there is any identical code in the "then" and "else" blocks.  If so, we
+  // can hoist it up to the branching block.
+  if (BI->getSuccessor(0)->getSinglePredecessor() != 0) {
+    if (BI->getSuccessor(1)->getSinglePredecessor() != 0) {
+      if (HoistThenElseCodeToIf(BI))
+        return SimplifyCFG(BB, TTI, TD) | true;
+    } else {
+      // If Successor #1 has multiple preds, we may be able to conditionally
+      // execute Successor #0 if it branches to successor #1.
+      TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator();
+      if (Succ0TI->getNumSuccessors() == 1 &&
+          Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+        if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
+          return SimplifyCFG(BB, TTI, TD) | true;
+    }
+  } else if (BI->getSuccessor(1)->getSinglePredecessor() != 0) {
+    // If Successor #0 has multiple preds, we may be able to conditionally
+    // execute Successor #1 if it branches to successor #0.
+    TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator();
+    if (Succ1TI->getNumSuccessors() == 1 &&
+        Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+      if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
+        return SimplifyCFG(BB, TTI, TD) | true;
+  }
+
+  // If this is a branch on a phi node in the current block, thread control
+  // through this block if any PHI node entries are constants.
+  if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
+    if (PN->getParent() == BI->getParent())
+      if (FoldCondBranchOnPHI(BI, TD))
+        return SimplifyCFG(BB, TTI, TD) | true;
+
+  // Scan predecessor blocks for conditional branches.
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+    if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+      if (PBI != BI && PBI->isConditional())
+        if (SimplifyCondBranchToCondBranch(PBI, BI))
+          return SimplifyCFG(BB, TTI, TD) | true;
+
+  return false;
+}
+
+/// Check if passing a value to an instruction will cause undefined behavior.
+static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
+  Constant *C = dyn_cast<Constant>(V);
+  if (!C)
+    return false;
+
+  if (I->use_empty())
+    return false;
+
+  if (C->isNullValue()) {
+    // Only look at the first use, avoid hurting compile time with long uselists
+    User *Use = *I->use_begin();
+
+    // Now make sure that there are no instructions in between that can alter
+    // control flow (eg. calls)
+    for (BasicBlock::iterator i = ++BasicBlock::iterator(I); &*i != Use; ++i)
+      if (i == I->getParent()->end() || i->mayHaveSideEffects())
+        return false;
+
+    // Look through GEPs. A load from a GEP derived from NULL is still undefined
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use))
+      if (GEP->getPointerOperand() == I)
+        return passingValueIsAlwaysUndefined(V, GEP);
+
+    // Look through bitcasts.
+    if (BitCastInst *BC = dyn_cast<BitCastInst>(Use))
+      return passingValueIsAlwaysUndefined(V, BC);
+
+    // Load from null is undefined.
+    if (LoadInst *LI = dyn_cast<LoadInst>(Use))
+      if (!LI->isVolatile())
+        return LI->getPointerAddressSpace() == 0;
+
+    // Store to null is undefined.
+    if (StoreInst *SI = dyn_cast<StoreInst>(Use))
+      if (!SI->isVolatile())
+        return SI->getPointerAddressSpace() == 0 && SI->getPointerOperand() == I;
+  }
+  return false;
+}
+
+/// If BB has an incoming value that will always trigger undefined behavior
+/// (eg. null pointer dereference), remove the branch leading here.
+static bool removeUndefIntroducingPredecessor(BasicBlock *BB) {
+  for (BasicBlock::iterator i = BB->begin();
+       PHINode *PHI = dyn_cast<PHINode>(i); ++i)
+    for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+      if (passingValueIsAlwaysUndefined(PHI->getIncomingValue(i), PHI)) {
+        TerminatorInst *T = PHI->getIncomingBlock(i)->getTerminator();
+        IRBuilder<> Builder(T);
+        if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+          BB->removePredecessor(PHI->getIncomingBlock(i));
+          // Turn uncoditional branches into unreachables and remove the dead
+          // destination from conditional branches.
+          if (BI->isUnconditional())
+            Builder.CreateUnreachable();
+          else
+            Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1) :
+                                                         BI->getSuccessor(0));
+          BI->eraseFromParent();
+          return true;
+        }
+        // TODO: SwitchInst.
+      }
+
+  return false;
+}
+
+bool SimplifyCFGOpt::run(BasicBlock *BB) {
+  bool Changed = false;
+
+  assert(BB && BB->getParent() && "Block not embedded in function!");
+  assert(BB->getTerminator() && "Degenerate basic block encountered!");
+
+  // Remove basic blocks that have no predecessors (except the entry block)...
+  // or that just have themself as a predecessor.  These are unreachable.
+  if ((pred_begin(BB) == pred_end(BB) &&
+       BB != &BB->getParent()->getEntryBlock()) ||
+      BB->getSinglePredecessor() == BB) {
+    DEBUG(dbgs() << "Removing BB: \n" << *BB);
+    DeleteDeadBlock(BB);
+    return true;
+  }
+
+  // Check to see if we can constant propagate this terminator instruction
+  // away...
+  Changed |= ConstantFoldTerminator(BB, true);
+
+  // Check for and eliminate duplicate PHI nodes in this block.
+  Changed |= EliminateDuplicatePHINodes(BB);
+
+  // Check for and remove branches that will always cause undefined behavior.
+  Changed |= removeUndefIntroducingPredecessor(BB);
+
+  // Merge basic blocks into their predecessor if there is only one distinct
+  // pred, and if there is only one distinct successor of the predecessor, and
+  // if there are no PHI nodes.
+  //
+  if (MergeBlockIntoPredecessor(BB))
+    return true;
+
+  IRBuilder<> Builder(BB);
+
+  // If there is a trivial two-entry PHI node in this basic block, and we can
+  // eliminate it, do so now.
+  if (PHINode *PN = dyn_cast<PHINode>(BB->begin()))
+    if (PN->getNumIncomingValues() == 2)
+      Changed |= FoldTwoEntryPHINode(PN, TD);
+
+  Builder.SetInsertPoint(BB->getTerminator());
+  if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+    if (BI->isUnconditional()) {
+      if (SimplifyUncondBranch(BI, Builder)) return true;
+    } else {
+      if (SimplifyCondBranch(BI, Builder)) return true;
+    }
+  } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+    if (SimplifyReturn(RI, Builder)) return true;
+  } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) {
+    if (SimplifyResume(RI, Builder)) return true;
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+    if (SimplifySwitch(SI, Builder)) return true;
+  } else if (UnreachableInst *UI =
+               dyn_cast<UnreachableInst>(BB->getTerminator())) {
+    if (SimplifyUnreachable(UI)) return true;
+  } else if (IndirectBrInst *IBI =
+               dyn_cast<IndirectBrInst>(BB->getTerminator())) {
+    if (SimplifyIndirectBr(IBI)) return true;
+  }
+
+  return Changed;
+}
+
+/// SimplifyCFG - This function is used to do simplification of a CFG.  For
+/// example, it adjusts branches to branches to eliminate the extra hop, it
+/// eliminates unreachable basic blocks, and does other "peephole" optimization
+/// of the CFG.  It returns true if a modification was made.
+///
+bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
+                       const DataLayout *TD) {
+  return SimplifyCFGOpt(TTI, TD).run(BB);
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
new file mode 100644
index 000000000000..41c207c3d5cb
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -0,0 +1,393 @@
+//===-- SimplifyIndVar.cpp - Induction variable simplification ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements induction variable simplification. It does
+// not define any actual pass or policy, but provides a single function to
+// simplify a loop's induction variables based on ScalarEvolution.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "indvars"
+
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
+STATISTIC(NumElimOperand,  "Number of IV operands folded into a use");
+STATISTIC(NumElimRem     , "Number of IV remainder operations eliminated");
+STATISTIC(NumElimCmp     , "Number of IV comparisons eliminated");
+
+namespace {
+  /// SimplifyIndvar - This is a utility for simplifying induction variables
+  /// based on ScalarEvolution. It is the primary instrument of the
+  /// IndvarSimplify pass, but it may also be directly invoked to cleanup after
+  /// other loop passes that preserve SCEV.
+  class SimplifyIndvar {
+    Loop             *L;
+    LoopInfo         *LI;
+    ScalarEvolution  *SE;
+    const DataLayout *TD; // May be NULL
+
+    SmallVectorImpl<WeakVH> &DeadInsts;
+
+    bool Changed;
+
+  public:
+    SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LPPassManager *LPM,
+                   SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = NULL) :
+      L(Loop),
+      LI(LPM->getAnalysisIfAvailable<LoopInfo>()),
+      SE(SE),
+      TD(LPM->getAnalysisIfAvailable<DataLayout>()),
+      DeadInsts(Dead),
+      Changed(false) {
+      assert(LI && "IV simplification requires LoopInfo");
+    }
+
+    bool hasChanged() const { return Changed; }
+
+    /// Iteratively perform simplification on a worklist of users of the
+    /// specified induction variable. This is the top-level driver that applies
+    /// all simplicitions to users of an IV.
+    void simplifyUsers(PHINode *CurrIV, IVVisitor *V = NULL);
+
+    Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand);
+
+    bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
+    void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
+    void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand,
+                              bool IsSigned);
+  };
+}
+
+/// foldIVUser - Fold an IV operand into its use.  This removes increments of an
+/// aligned IV when used by a instruction that ignores the low bits.
+///
+/// IVOperand is guaranteed SCEVable, but UseInst may not be.
+///
+/// Return the operand of IVOperand for this induction variable if IVOperand can
+/// be folded (in case more folding opportunities have been exposed).
+/// Otherwise return null.
+Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) {
+  Value *IVSrc = 0;
+  unsigned OperIdx = 0;
+  const SCEV *FoldedExpr = 0;
+  switch (UseInst->getOpcode()) {
+  default:
+    return 0;
+  case Instruction::UDiv:
+  case Instruction::LShr:
+    // We're only interested in the case where we know something about
+    // the numerator and have a constant denominator.
+    if (IVOperand != UseInst->getOperand(OperIdx) ||
+        !isa<ConstantInt>(UseInst->getOperand(1)))
+      return 0;
+
+    // Attempt to fold a binary operator with constant operand.
+    // e.g. ((I + 1) >> 2) => I >> 2
+    if (!isa<BinaryOperator>(IVOperand)
+        || !isa<ConstantInt>(IVOperand->getOperand(1)))
+      return 0;
+
+    IVSrc = IVOperand->getOperand(0);
+    // IVSrc must be the (SCEVable) IV, since the other operand is const.
+    assert(SE->isSCEVable(IVSrc->getType()) && "Expect SCEVable IV operand");
+
+    ConstantInt *D = cast<ConstantInt>(UseInst->getOperand(1));
+    if (UseInst->getOpcode() == Instruction::LShr) {
+      // Get a constant for the divisor. See createSCEV.
+      uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth();
+      if (D->getValue().uge(BitWidth))
+        return 0;
+
+      D = ConstantInt::get(UseInst->getContext(),
+                           APInt(BitWidth, 1).shl(D->getZExtValue()));
+    }
+    FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
+  }
+  // We have something that might fold it's operand. Compare SCEVs.
+  if (!SE->isSCEVable(UseInst->getType()))
+    return 0;
+
+  // Bypass the operand if SCEV can prove it has no effect.
+  if (SE->getSCEV(UseInst) != FoldedExpr)
+    return 0;
+
+  DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
+        << " -> " << *UseInst << '\n');
+
+  UseInst->setOperand(OperIdx, IVSrc);
+  assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
+
+  ++NumElimOperand;
+  Changed = true;
+  if (IVOperand->use_empty())
+    DeadInsts.push_back(IVOperand);
+  return IVSrc;
+}
+
+/// eliminateIVComparison - SimplifyIVUsers helper for eliminating useless
+/// comparisons against an induction variable.
+void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
+  unsigned IVOperIdx = 0;
+  ICmpInst::Predicate Pred = ICmp->getPredicate();
+  if (IVOperand != ICmp->getOperand(0)) {
+    // Swapped
+    assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
+    IVOperIdx = 1;
+    Pred = ICmpInst::getSwappedPredicate(Pred);
+  }
+
+  // Get the SCEVs for the ICmp operands.
+  const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx));
+  const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx));
+
+  // Simplify unnecessary loops away.
+  const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
+  S = SE->getSCEVAtScope(S, ICmpLoop);
+  X = SE->getSCEVAtScope(X, ICmpLoop);
+
+  // If the condition is always true or always false, replace it with
+  // a constant value.
+  if (SE->isKnownPredicate(Pred, S, X))
+    ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
+  else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X))
+    ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
+  else
+    return;
+
+  DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+  ++NumElimCmp;
+  Changed = true;
+  DeadInsts.push_back(ICmp);
+}
+
+/// eliminateIVRemainder - SimplifyIVUsers helper for eliminating useless
+/// remainder operations operating on an induction variable.
+void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
+                                      Value *IVOperand,
+                                      bool IsSigned) {
+  // We're only interested in the case where we know something about
+  // the numerator.
+  if (IVOperand != Rem->getOperand(0))
+    return;
+
+  // Get the SCEVs for the ICmp operands.
+  const SCEV *S = SE->getSCEV(Rem->getOperand(0));
+  const SCEV *X = SE->getSCEV(Rem->getOperand(1));
+
+  // Simplify unnecessary loops away.
+  const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
+  S = SE->getSCEVAtScope(S, ICmpLoop);
+  X = SE->getSCEVAtScope(X, ICmpLoop);
+
+  // i % n  -->  i  if i is in [0,n).
+  if ((!IsSigned || SE->isKnownNonNegative(S)) &&
+      SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                           S, X))
+    Rem->replaceAllUsesWith(Rem->getOperand(0));
+  else {
+    // (i+1) % n  -->  (i+1)==n?0:(i+1)  if i is in [0,n).
+    const SCEV *LessOne =
+      SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1));
+    if (IsSigned && !SE->isKnownNonNegative(LessOne))
+      return;
+
+    if (!SE->isKnownPredicate(IsSigned ?
+                              ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+                              LessOne, X))
+      return;
+
+    ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ,
+                                  Rem->getOperand(0), Rem->getOperand(1));
+    SelectInst *Sel =
+      SelectInst::Create(ICmp,
+                         ConstantInt::get(Rem->getType(), 0),
+                         Rem->getOperand(0), "tmp", Rem);
+    Rem->replaceAllUsesWith(Sel);
+  }
+
+  DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+  ++NumElimRem;
+  Changed = true;
+  DeadInsts.push_back(Rem);
+}
+
+/// eliminateIVUser - Eliminate an operation that consumes a simple IV and has
+/// no observable side-effect given the range of IV values.
+/// IVOperand is guaranteed SCEVable, but UseInst may not be.
+bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
+                                     Instruction *IVOperand) {
+  if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+    eliminateIVComparison(ICmp, IVOperand);
+    return true;
+  }
+  if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
+    bool IsSigned = Rem->getOpcode() == Instruction::SRem;
+    if (IsSigned || Rem->getOpcode() == Instruction::URem) {
+      eliminateIVRemainder(Rem, IVOperand, IsSigned);
+      return true;
+    }
+  }
+
+  // Eliminate any operation that SCEV can prove is an identity function.
+  if (!SE->isSCEVable(UseInst->getType()) ||
+      (UseInst->getType() != IVOperand->getType()) ||
+      (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand)))
+    return false;
+
+  DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
+
+  UseInst->replaceAllUsesWith(IVOperand);
+  ++NumElimIdentity;
+  Changed = true;
+  DeadInsts.push_back(UseInst);
+  return true;
+}
+
+/// pushIVUsers - Add all uses of Def to the current IV's worklist.
+///
+static void pushIVUsers(
+  Instruction *Def,
+  SmallPtrSet<Instruction*,16> &Simplified,
+  SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) {
+
+  for (Value::use_iterator UI = Def->use_begin(), E = Def->use_end();
+       UI != E; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Avoid infinite or exponential worklist processing.
+    // Also ensure unique worklist users.
+    // If Def is a LoopPhi, it may not be in the Simplified set, so check for
+    // self edges first.
+    if (User != Def && Simplified.insert(User))
+      SimpleIVUsers.push_back(std::make_pair(User, Def));
+  }
+}
+
+/// isSimpleIVUser - Return true if this instruction generates a simple SCEV
+/// expression in terms of that IV.
+///
+/// This is similar to IVUsers' isInteresting() but processes each instruction
+/// non-recursively when the operand is already known to be a simpleIVUser.
+///
+static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) {
+  if (!SE->isSCEVable(I->getType()))
+    return false;
+
+  // Get the symbolic expression for this instruction.
+  const SCEV *S = SE->getSCEV(I);
+
+  // Only consider affine recurrences.
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+  if (AR && AR->getLoop() == L)
+    return true;
+
+  return false;
+}
+
+/// simplifyUsers - Iteratively perform simplification on a worklist of users
+/// of the specified induction variable. Each successive simplification may push
+/// more users which may themselves be candidates for simplification.
+///
+/// This algorithm does not require IVUsers analysis. Instead, it simplifies
+/// instructions in-place during analysis. Rather than rewriting induction
+/// variables bottom-up from their users, it transforms a chain of IVUsers
+/// top-down, updating the IR only when it encouters a clear optimization
+/// opportunitiy.
+///
+/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers.
+///
+void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
+  if (!SE->isSCEVable(CurrIV->getType()))
+    return;
+
+  // Instructions processed by SimplifyIndvar for CurrIV.
+  SmallPtrSet<Instruction*,16> Simplified;
+
+  // Use-def pairs if IV users waiting to be processed for CurrIV.
+  SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers;
+
+  // Push users of the current LoopPhi. In rare cases, pushIVUsers may be
+  // called multiple times for the same LoopPhi. This is the proper thing to
+  // do for loop header phis that use each other.
+  pushIVUsers(CurrIV, Simplified, SimpleIVUsers);
+
+  while (!SimpleIVUsers.empty()) {
+    std::pair<Instruction*, Instruction*> UseOper =
+      SimpleIVUsers.pop_back_val();
+    // Bypass back edges to avoid extra work.
+    if (UseOper.first == CurrIV) continue;
+
+    Instruction *IVOperand = UseOper.second;
+    for (unsigned N = 0; IVOperand; ++N) {
+      assert(N <= Simplified.size() && "runaway iteration");
+
+      Value *NewOper = foldIVUser(UseOper.first, IVOperand);
+      if (!NewOper)
+        break; // done folding
+      IVOperand = dyn_cast<Instruction>(NewOper);
+    }
+    if (!IVOperand)
+      continue;
+
+    if (eliminateIVUser(UseOper.first, IVOperand)) {
+      pushIVUsers(IVOperand, Simplified, SimpleIVUsers);
+      continue;
+    }
+    CastInst *Cast = dyn_cast<CastInst>(UseOper.first);
+    if (V && Cast) {
+      V->visitCast(Cast);
+      continue;
+    }
+    if (isSimpleIVUser(UseOper.first, L, SE)) {
+      pushIVUsers(UseOper.first, Simplified, SimpleIVUsers);
+    }
+  }
+}
+
+namespace llvm {
+
+void IVVisitor::anchor() { }
+
+/// simplifyUsersOfIV - Simplify instructions that use this induction variable
+/// by using ScalarEvolution to analyze the IV's recurrence.
+bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM,
+                       SmallVectorImpl<WeakVH> &Dead, IVVisitor *V)
+{
+  LoopInfo *LI = &LPM->getAnalysis<LoopInfo>();
+  SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LPM, Dead);
+  SIV.simplifyUsers(CurrIV, V);
+  return SIV.hasChanged();
+}
+
+/// simplifyLoopIVs - Simplify users of induction variables within this
+/// loop. This does not actually change or add IVs.
+bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM,
+                     SmallVectorImpl<WeakVH> &Dead) {
+  bool Changed = false;
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+    Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, LPM, Dead);
+  }
+  return Changed;
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
new file mode 100644
index 000000000000..f9687e4d5890
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -0,0 +1,100 @@
+//===------ SimplifyInstructions.cpp - Remove redundant instructions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a utility pass used for testing the InstructionSimplify analysis.
+// The analysis is applied to every instruction, and if it simplifies then the
+// instruction is replaced by the simplification.  If you are looking for a pass
+// that performs serious instruction folding, use the instcombine pass instead.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instsimplify"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+STATISTIC(NumSimplified, "Number of redundant instructions removed");
+
+namespace {
+  struct InstSimplifier : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    InstSimplifier() : FunctionPass(ID) {
+      initializeInstSimplifierPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<TargetLibraryInfo>();
+    }
+
+    /// runOnFunction - Remove instructions that simplify.
+    bool runOnFunction(Function &F) {
+      const DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
+      const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+      const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+      SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+      bool Changed = false;
+
+      do {
+        for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+             DE = df_end(&F.getEntryBlock()); DI != DE; ++DI)
+          for (BasicBlock::iterator BI = DI->begin(), BE = DI->end(); BI != BE;) {
+            Instruction *I = BI++;
+            // The first time through the loop ToSimplify is empty and we try to
+            // simplify all instructions.  On later iterations ToSimplify is not
+            // empty and we only bother simplifying instructions that are in it.
+            if (!ToSimplify->empty() && !ToSimplify->count(I))
+              continue;
+            // Don't waste time simplifying unused instructions.
+            if (!I->use_empty())
+              if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) {
+                // Mark all uses for resimplification next time round the loop.
+                for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+                     UI != UE; ++UI)
+                  Next->insert(cast<Instruction>(*UI));
+                I->replaceAllUsesWith(V);
+                ++NumSimplified;
+                Changed = true;
+              }
+            Changed |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+          }
+
+        // Place the list of instructions to simplify on the next loop iteration
+        // into ToSimplify.
+        std::swap(ToSimplify, Next);
+        Next->clear();
+      } while (!ToSimplify->empty());
+
+      return Changed;
+    }
+  };
+}
+
+char InstSimplifier::ID = 0;
+INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
+                      "Remove redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
+                    "Remove redundant instructions", false, false)
+char &llvm::InstructionSimplifierID = InstSimplifier::ID;
+
+// Public interface to the simplify instructions pass.
+FunctionPass *llvm::createInstructionSimplifierPass() {
+  return new InstSimplifier();
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
new file mode 100644
index 000000000000..c231704414fc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -0,0 +1,1949 @@
+//===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a utility pass used for testing the InstructionSimplify analysis.
+// The analysis is applied to every instruction, and if it simplifies then the
+// instruction is replaced by the simplification.  If you are looking for a pass
+// that performs serious instruction folding, use the instcombine pass instead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+
+using namespace llvm;
+
+/// This class is the abstract base class for the set of optimizations that
+/// corresponds to one library call.
+namespace {
+class LibCallOptimization {
+protected:
+  Function *Caller;
+  const DataLayout *TD;
+  const TargetLibraryInfo *TLI;
+  const LibCallSimplifier *LCS;
+  LLVMContext* Context;
+public:
+  LibCallOptimization() { }
+  virtual ~LibCallOptimization() {}
+
+  /// callOptimizer - This pure virtual method is implemented by base classes to
+  /// do various optimizations.  If this returns null then no transformation was
+  /// performed.  If it returns CI, then it transformed the call and CI is to be
+  /// deleted.  If it returns something else, replace CI with the new value and
+  /// delete CI.
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B)
+    =0;
+
+  /// ignoreCallingConv - Returns false if this transformation could possibly
+  /// change the calling convention.
+  virtual bool ignoreCallingConv() { return false; }
+
+  Value *optimizeCall(CallInst *CI, const DataLayout *TD,
+                      const TargetLibraryInfo *TLI,
+                      const LibCallSimplifier *LCS, IRBuilder<> &B) {
+    Caller = CI->getParent()->getParent();
+    this->TD = TD;
+    this->TLI = TLI;
+    this->LCS = LCS;
+    if (CI->getCalledFunction())
+      Context = &CI->getCalledFunction()->getContext();
+
+    // We never change the calling convention.
+    if (!ignoreCallingConv() && CI->getCallingConv() != llvm::CallingConv::C)
+      return NULL;
+
+    return callOptimizer(CI->getCalledFunction(), CI, B);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
+/// value is equal or not-equal to zero.
+static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+      if (IC->isEquality())
+        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+          if (C->isNullValue())
+            continue;
+    // Unknown instruction.
+    return false;
+  }
+  return true;
+}
+
+/// isOnlyUsedInEqualityComparison - Return true if it is only used in equality
+/// comparisons with With.
+static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+      if (IC->isEquality() && IC->getOperand(1) == With)
+        continue;
+    // Unknown instruction.
+    return false;
+  }
+  return true;
+}
+
+static bool callHasFloatingPointArgument(const CallInst *CI) {
+  for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end();
+       it != e; ++it) {
+    if ((*it)->getType()->isFloatingPointTy())
+      return true;
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Fortified Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct FortifiedLibCallOptimization : public LibCallOptimization {
+protected:
+  virtual bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp,
+			  bool isString) const = 0;
+};
+
+struct InstFortifiedLibCallOptimization : public FortifiedLibCallOptimization {
+  CallInst *CI;
+
+  bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const {
+    if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp))
+      return true;
+    if (ConstantInt *SizeCI =
+                           dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) {
+      if (SizeCI->isAllOnesValue())
+        return true;
+      if (isString) {
+        uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp));
+        // If the length is 0 we don't know how long it is and so we can't
+        // remove the check.
+        if (Len == 0) return false;
+        return SizeCI->getZExtValue() >= Len;
+      }
+      if (ConstantInt *Arg = dyn_cast<ConstantInt>(
+                                                  CI->getArgOperand(SizeArgOp)))
+        return SizeCI->getZExtValue() >= Arg->getZExtValue();
+    }
+    return false;
+  }
+};
+
+struct MemCpyChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(Context) ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return 0;
+
+    if (isFoldable(3, 2, false)) {
+      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                     CI->getArgOperand(2), 1);
+      return CI->getArgOperand(0);
+    }
+    return 0;
+  }
+};
+
+struct MemMoveChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(Context) ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return 0;
+
+    if (isFoldable(3, 2, false)) {
+      B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+                      CI->getArgOperand(2), 1);
+      return CI->getArgOperand(0);
+    }
+    return 0;
+  }
+};
+
+struct MemSetChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isIntegerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(Context) ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return 0;
+
+    if (isFoldable(3, 2, false)) {
+      Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(),
+                                   false);
+      B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+      return CI->getArgOperand(0);
+    }
+    return 0;
+  }
+};
+
+struct StrCpyChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    StringRef Name = Callee->getName();
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 3 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+        FT->getParamType(2) != TD->getIntPtrType(Context))
+      return 0;
+
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src)      // __strcpy_chk(x,x)  -> x
+      return Src;
+
+    // If a) we don't have any length information, or b) we know this will
+    // fit then just lower to a plain strcpy. Otherwise we'll keep our
+    // strcpy_chk call which may fail at runtime if the size is too long.
+    // TODO: It might be nice to get a maximum length out of the possible
+    // string lengths for varying.
+    if (isFoldable(2, 1, true)) {
+      Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6));
+      return Ret;
+    } else {
+      // Maybe we can stil fold __strcpy_chk to __memcpy_chk.
+      uint64_t Len = GetStringLength(Src);
+      if (Len == 0) return 0;
+
+      // This optimization require DataLayout.
+      if (!TD) return 0;
+
+      Value *Ret =
+	EmitMemCpyChk(Dst, Src,
+                      ConstantInt::get(TD->getIntPtrType(Context), Len),
+                      CI->getArgOperand(2), B, TD, TLI);
+      return Ret;
+    }
+    return 0;
+  }
+};
+
+struct StpCpyChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    StringRef Name = Callee->getName();
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 3 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+        FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0)))
+      return 0;
+
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src) {  // stpcpy(x,x)  -> x+strlen(x)
+      Value *StrLen = EmitStrLen(Src, B, TD, TLI);
+      return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0;
+    }
+
+    // If a) we don't have any length information, or b) we know this will
+    // fit then just lower to a plain stpcpy. Otherwise we'll keep our
+    // stpcpy_chk call which may fail at runtime if the size is too long.
+    // TODO: It might be nice to get a maximum length out of the possible
+    // string lengths for varying.
+    if (isFoldable(2, 1, true)) {
+      Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6));
+      return Ret;
+    } else {
+      // Maybe we can stil fold __stpcpy_chk to __memcpy_chk.
+      uint64_t Len = GetStringLength(Src);
+      if (Len == 0) return 0;
+
+      // This optimization require DataLayout.
+      if (!TD) return 0;
+
+      Type *PT = FT->getParamType(0);
+      Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len);
+      Value *DstEnd = B.CreateGEP(Dst,
+                                  ConstantInt::get(TD->getIntPtrType(PT),
+                                                   Len - 1));
+      if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, TD, TLI))
+        return 0;
+      return DstEnd;
+    }
+    return 0;
+  }
+};
+
+struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    StringRef Name = Callee->getName();
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+        !FT->getParamType(2)->isIntegerTy() ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return 0;
+
+    if (isFoldable(3, 2, false)) {
+      Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                               CI->getArgOperand(2), B, TD, TLI,
+                               Name.substr(2, 7));
+      return Ret;
+    }
+    return 0;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// String and Memory Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct StrCatOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcat" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != B.getInt8PtrTy() ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        FT->getParamType(1) != FT->getReturnType())
+      return 0;
+
+    // Extract some information from the instruction
+    Value *Dst = CI->getArgOperand(0);
+    Value *Src = CI->getArgOperand(1);
+
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+    --Len;  // Unbias length.
+
+    // Handle the simple, do-nothing case: strcat(x, "") -> x
+    if (Len == 0)
+      return Dst;
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    return emitStrLenMemCpy(Src, Dst, Len, B);
+  }
+
+  Value *emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
+                          IRBuilder<> &B) {
+    // We need to find the end of the destination string.  That's where the
+    // memory is to be moved to. We just generate a call to strlen.
+    Value *DstLen = EmitStrLen(Dst, B, TD, TLI);
+    if (!DstLen)
+      return 0;
+
+    // Now that we have the destination's length, we must index into the
+    // destination's pointer to get the actual memcpy destination (end of
+    // the string .. we're concatenating).
+    Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr");
+
+    // We have enough information to now generate the memcpy call to do the
+    // concatenation for us.  Make a memcpy to copy the nul byte with align = 1.
+    B.CreateMemCpy(CpyDst, Src,
+                   ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1);
+    return Dst;
+  }
+};
+
+struct StrNCatOpt : public StrCatOpt {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strncat" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 ||
+        FT->getReturnType() != B.getInt8PtrTy() ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        FT->getParamType(1) != FT->getReturnType() ||
+        !FT->getParamType(2)->isIntegerTy())
+      return 0;
+
+    // Extract some information from the instruction
+    Value *Dst = CI->getArgOperand(0);
+    Value *Src = CI->getArgOperand(1);
+    uint64_t Len;
+
+    // We don't do anything if length is not constant
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+      Len = LengthArg->getZExtValue();
+    else
+      return 0;
+
+    // See if we can get the length of the input string.
+    uint64_t SrcLen = GetStringLength(Src);
+    if (SrcLen == 0) return 0;
+    --SrcLen;  // Unbias length.
+
+    // Handle the simple, do-nothing cases:
+    // strncat(x, "", c) -> x
+    // strncat(x,  c, 0) -> x
+    if (SrcLen == 0 || Len == 0) return Dst;
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // We don't optimize this case
+    if (Len < SrcLen) return 0;
+
+    // strncat(x, s, c) -> strcat(x, s)
+    // s is constant so the strcat can be optimized further
+    return emitStrLenMemCpy(Src, Dst, SrcLen, B);
+  }
+};
+
+struct StrChrOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strchr" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != B.getInt8PtrTy() ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        !FT->getParamType(1)->isIntegerTy(32))
+      return 0;
+
+    Value *SrcStr = CI->getArgOperand(0);
+
+    // If the second operand is non-constant, see if we can compute the length
+    // of the input string and turn this into memchr.
+    ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+    if (CharC == 0) {
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      uint64_t Len = GetStringLength(SrcStr);
+      if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32.
+        return 0;
+
+      return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
+                        ConstantInt::get(TD->getIntPtrType(*Context), Len),
+                        B, TD, TLI);
+    }
+
+    // Otherwise, the character is a constant, see if the first argument is
+    // a string literal.  If so, we can constant fold.
+    StringRef Str;
+    if (!getConstantStringInfo(SrcStr, Str))
+      return 0;
+
+    // Compute the offset, make sure to handle the case when we're searching for
+    // zero (a weird way to spell strlen).
+    size_t I = CharC->getSExtValue() == 0 ?
+        Str.size() : Str.find(CharC->getSExtValue());
+    if (I == StringRef::npos) // Didn't find the char.  strchr returns null.
+      return Constant::getNullValue(CI->getType());
+
+    // strchr(s+n,c)  -> gep(s+n+i,c)
+    return B.CreateGEP(SrcStr, B.getInt64(I), "strchr");
+  }
+};
+
+struct StrRChrOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strrchr" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != B.getInt8PtrTy() ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        !FT->getParamType(1)->isIntegerTy(32))
+      return 0;
+
+    Value *SrcStr = CI->getArgOperand(0);
+    ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+
+    // Cannot fold anything if we're not looking for a constant.
+    if (!CharC)
+      return 0;
+
+    StringRef Str;
+    if (!getConstantStringInfo(SrcStr, Str)) {
+      // strrchr(s, 0) -> strchr(s, 0)
+      if (TD && CharC->isZero())
+        return EmitStrChr(SrcStr, '\0', B, TD, TLI);
+      return 0;
+    }
+
+    // Compute the offset.
+    size_t I = CharC->getSExtValue() == 0 ?
+        Str.size() : Str.rfind(CharC->getSExtValue());
+    if (I == StringRef::npos) // Didn't find the char. Return null.
+      return Constant::getNullValue(CI->getType());
+
+    // strrchr(s+n,c) -> gep(s+n+i,c)
+    return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr");
+  }
+};
+
+struct StrCmpOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcmp" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        !FT->getReturnType()->isIntegerTy(32) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy())
+      return 0;
+
+    Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+    if (Str1P == Str2P)      // strcmp(x,x)  -> 0
+      return ConstantInt::get(CI->getType(), 0);
+
+    StringRef Str1, Str2;
+    bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+    bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+    // strcmp(x, y)  -> cnst  (if both x and y are constant strings)
+    if (HasStr1 && HasStr2)
+      return ConstantInt::get(CI->getType(), Str1.compare(Str2));
+
+    if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
+      return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
+                                      CI->getType()));
+
+    if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+    // strcmp(P, "x") -> memcmp(P, "x", 2)
+    uint64_t Len1 = GetStringLength(Str1P);
+    uint64_t Len2 = GetStringLength(Str2P);
+    if (Len1 && Len2) {
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      return EmitMemCmp(Str1P, Str2P,
+                        ConstantInt::get(TD->getIntPtrType(*Context),
+                        std::min(Len1, Len2)), B, TD, TLI);
+    }
+
+    return 0;
+  }
+};
+
+struct StrNCmpOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strncmp" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 ||
+        !FT->getReturnType()->isIntegerTy(32) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        !FT->getParamType(2)->isIntegerTy())
+      return 0;
+
+    Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+    if (Str1P == Str2P)      // strncmp(x,x,n)  -> 0
+      return ConstantInt::get(CI->getType(), 0);
+
+    // Get the length argument if it is constant.
+    uint64_t Length;
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+      Length = LengthArg->getZExtValue();
+    else
+      return 0;
+
+    if (Length == 0) // strncmp(x,y,0)   -> 0
+      return ConstantInt::get(CI->getType(), 0);
+
+    if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
+      return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI);
+
+    StringRef Str1, Str2;
+    bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+    bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+    // strncmp(x, y)  -> cnst  (if both x and y are constant strings)
+    if (HasStr1 && HasStr2) {
+      StringRef SubStr1 = Str1.substr(0, Length);
+      StringRef SubStr2 = Str2.substr(0, Length);
+      return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
+    }
+
+    if (HasStr1 && Str1.empty())  // strncmp("", x, n) -> -*x
+      return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
+                                      CI->getType()));
+
+    if (HasStr2 && Str2.empty())  // strncmp(x, "", n) -> *x
+      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+    return 0;
+  }
+};
+
+struct StrCpyOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcpy" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy())
+      return 0;
+
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src)      // strcpy(x,x)  -> x
+      return Src;
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+
+    // We have enough information to now generate the memcpy call to do the
+    // copy for us.  Make a memcpy to copy the nul byte with align = 1.
+    B.CreateMemCpy(Dst, Src,
+		   ConstantInt::get(TD->getIntPtrType(*Context), Len), 1);
+    return Dst;
+  }
+};
+
+struct StpCpyOpt: public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "stpcpy" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy())
+      return 0;
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src) {  // stpcpy(x,x)  -> x+strlen(x)
+      Value *StrLen = EmitStrLen(Src, B, TD, TLI);
+      return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0;
+    }
+
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+
+    Type *PT = FT->getParamType(0);
+    Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len);
+    Value *DstEnd = B.CreateGEP(Dst,
+                                ConstantInt::get(TD->getIntPtrType(PT),
+                                                 Len - 1));
+
+    // We have enough information to now generate the memcpy call to do the
+    // copy for us.  Make a memcpy to copy the nul byte with align = 1.
+    B.CreateMemCpy(Dst, Src, LenV, 1);
+    return DstEnd;
+  }
+};
+
+struct StrNCpyOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        !FT->getParamType(2)->isIntegerTy())
+      return 0;
+
+    Value *Dst = CI->getArgOperand(0);
+    Value *Src = CI->getArgOperand(1);
+    Value *LenOp = CI->getArgOperand(2);
+
+    // See if we can get the length of the input string.
+    uint64_t SrcLen = GetStringLength(Src);
+    if (SrcLen == 0) return 0;
+    --SrcLen;
+
+    if (SrcLen == 0) {
+      // strncpy(x, "", y) -> memset(x, '\0', y, 1)
+      B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);
+      return Dst;
+    }
+
+    uint64_t Len;
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp))
+      Len = LengthArg->getZExtValue();
+    else
+      return 0;
+
+    if (Len == 0) return Dst; // strncpy(x, y, 0) -> x
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // Let strncpy handle the zero padding
+    if (Len > SrcLen+1) return 0;
+
+    Type *PT = FT->getParamType(0);
+    // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
+    B.CreateMemCpy(Dst, Src,
+                   ConstantInt::get(TD->getIntPtrType(PT), Len), 1);
+
+    return Dst;
+  }
+};
+
+struct StrLenOpt : public LibCallOptimization {
+  virtual bool ignoreCallingConv() { return true; }
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 1 ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    Value *Src = CI->getArgOperand(0);
+
+    // Constant folding: strlen("xyz") -> 3
+    if (uint64_t Len = GetStringLength(Src))
+      return ConstantInt::get(CI->getType(), Len-1);
+
+    // strlen(x) != 0 --> *x != 0
+    // strlen(x) == 0 --> *x == 0
+    if (isOnlyUsedInZeroEqualityComparison(CI))
+      return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
+    return 0;
+  }
+};
+
+struct StrPBrkOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        FT->getParamType(1) != FT->getParamType(0) ||
+        FT->getReturnType() != FT->getParamType(0))
+      return 0;
+
+    StringRef S1, S2;
+    bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+    bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+    // strpbrk(s, "") -> NULL
+    // strpbrk("", s) -> NULL
+    if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+      return Constant::getNullValue(CI->getType());
+
+    // Constant folding.
+    if (HasS1 && HasS2) {
+      size_t I = S1.find_first_of(S2);
+      if (I == std::string::npos) // No match.
+        return Constant::getNullValue(CI->getType());
+
+      return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk");
+    }
+
+    // strpbrk(s, "a") -> strchr(s, 'a')
+    if (TD && HasS2 && S2.size() == 1)
+      return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI);
+
+    return 0;
+  }
+};
+
+struct StrToOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy())
+      return 0;
+
+    Value *EndPtr = CI->getArgOperand(1);
+    if (isa<ConstantPointerNull>(EndPtr)) {
+      // With a null EndPtr, this function won't capture the main argument.
+      // It would be readonly too, except that it still may write to errno.
+      CI->addAttribute(1, Attribute::NoCapture);
+    }
+
+    return 0;
+  }
+};
+
+struct StrSpnOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        FT->getParamType(1) != FT->getParamType(0) ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    StringRef S1, S2;
+    bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+    bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+    // strspn(s, "") -> 0
+    // strspn("", s) -> 0
+    if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+      return Constant::getNullValue(CI->getType());
+
+    // Constant folding.
+    if (HasS1 && HasS2) {
+      size_t Pos = S1.find_first_not_of(S2);
+      if (Pos == StringRef::npos) Pos = S1.size();
+      return ConstantInt::get(CI->getType(), Pos);
+    }
+
+    return 0;
+  }
+};
+
+struct StrCSpnOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        FT->getParamType(1) != FT->getParamType(0) ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    StringRef S1, S2;
+    bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+    bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+    // strcspn("", s) -> 0
+    if (HasS1 && S1.empty())
+      return Constant::getNullValue(CI->getType());
+
+    // Constant folding.
+    if (HasS1 && HasS2) {
+      size_t Pos = S1.find_first_of(S2);
+      if (Pos == StringRef::npos) Pos = S1.size();
+      return ConstantInt::get(CI->getType(), Pos);
+    }
+
+    // strcspn(s, "") -> strlen(s)
+    if (TD && HasS2 && S2.empty())
+      return EmitStrLen(CI->getArgOperand(0), B, TD, TLI);
+
+    return 0;
+  }
+};
+
+struct StrStrOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isPointerTy())
+      return 0;
+
+    // fold strstr(x, x) -> x.
+    if (CI->getArgOperand(0) == CI->getArgOperand(1))
+      return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+    // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
+    if (TD && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+      Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI);
+      if (!StrLen)
+        return 0;
+      Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+                                   StrLen, B, TD, TLI);
+      if (!StrNCmp)
+        return 0;
+      for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end();
+           UI != UE; ) {
+        ICmpInst *Old = cast<ICmpInst>(*UI++);
+        Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp,
+                                  ConstantInt::getNullValue(StrNCmp->getType()),
+                                  "cmp");
+        LCS->replaceAllUsesWith(Old, Cmp);
+      }
+      return CI;
+    }
+
+    // See if either input string is a constant string.
+    StringRef SearchStr, ToFindStr;
+    bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
+    bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
+
+    // fold strstr(x, "") -> x.
+    if (HasStr2 && ToFindStr.empty())
+      return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+    // If both strings are known, constant fold it.
+    if (HasStr1 && HasStr2) {
+      std::string::size_type Offset = SearchStr.find(ToFindStr);
+
+      if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
+        return Constant::getNullValue(CI->getType());
+
+      // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
+      Value *Result = CastToCStr(CI->getArgOperand(0), B);
+      Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
+      return B.CreateBitCast(Result, CI->getType());
+    }
+
+    // fold strstr(x, "y") -> strchr(x, 'y').
+    if (HasStr2 && ToFindStr.size() == 1) {
+      Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI);
+      return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0;
+    }
+    return 0;
+  }
+};
+
+struct MemCmpOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy(32))
+      return 0;
+
+    Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
+
+    if (LHS == RHS)  // memcmp(s,s,x) -> 0
+      return Constant::getNullValue(CI->getType());
+
+    // Make sure we have a constant length.
+    ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+    if (!LenC) return 0;
+    uint64_t Len = LenC->getZExtValue();
+
+    if (Len == 0) // memcmp(s1,s2,0) -> 0
+      return Constant::getNullValue(CI->getType());
+
+    // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
+    if (Len == 1) {
+      Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
+                                 CI->getType(), "lhsv");
+      Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
+                                 CI->getType(), "rhsv");
+      return B.CreateSub(LHSV, RHSV, "chardiff");
+    }
+
+    // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
+    StringRef LHSStr, RHSStr;
+    if (getConstantStringInfo(LHS, LHSStr) &&
+        getConstantStringInfo(RHS, RHSStr)) {
+      // Make sure we're not reading out-of-bounds memory.
+      if (Len > LHSStr.size() || Len > RHSStr.size())
+        return 0;
+      // Fold the memcmp and normalize the result.  This way we get consistent
+      // results across multiple platforms.
+      uint64_t Ret = 0;
+      int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
+      if (Cmp < 0)
+        Ret = -1;
+      else if (Cmp > 0)
+        Ret = 1;
+      return ConstantInt::get(CI->getType(), Ret);
+    }
+
+    return 0;
+  }
+};
+
+struct MemCpyOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(*Context))
+      return 0;
+
+    // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                   CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+};
+
+struct MemMoveOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(*Context))
+      return 0;
+
+    // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
+    B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+                    CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+};
+
+struct MemSetOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isIntegerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(*Context))
+      return 0;
+
+    // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+    Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+    B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Math Library Optimizations
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+
+struct UnaryDoubleFPOpt : public LibCallOptimization {
+  bool CheckRetType;
+  UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {}
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
+        !FT->getParamType(0)->isDoubleTy())
+      return 0;
+
+    if (CheckRetType) {
+      // Check if all the uses for function like 'sin' are converted to float.
+      for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end();
+          ++UseI) {
+        FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI);
+        if (Cast == 0 || !Cast->getType()->isFloatTy())
+          return 0;
+      }
+    }
+
+    // If this is something like 'floor((double)floatval)', convert to floorf.
+    FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+    if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
+      return 0;
+
+    // floor((double)floatval) -> (double)floorf(floatval)
+    Value *V = Cast->getOperand(0);
+    V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+    return B.CreateFPExt(V, B.getDoubleTy());
+  }
+};
+
+struct UnsafeFPLibCallOptimization : public LibCallOptimization {
+  bool UnsafeFPShrink;
+  UnsafeFPLibCallOptimization(bool UnsafeFPShrink) {
+    this->UnsafeFPShrink = UnsafeFPShrink;
+  }
+};
+
+struct CosOpt : public UnsafeFPLibCallOptimization {
+  CosOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "cos" &&
+        TLI->has(LibFunc::cosf)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
+    }
+
+    FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 1 argument of FP type, which matches the
+    // result type.
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isFloatingPointTy())
+      return Ret;
+
+    // cos(-x) -> cos(x)
+    Value *Op1 = CI->getArgOperand(0);
+    if (BinaryOperator::isFNeg(Op1)) {
+      BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
+      return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
+    }
+    return Ret;
+  }
+};
+
+struct PowOpt : public UnsafeFPLibCallOptimization {
+  PowOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "pow" &&
+        TLI->has(LibFunc::powf)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
+    }
+
+    FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 2 arguments of the same FP type, which match the
+    // result type.
+    if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        !FT->getParamType(0)->isFloatingPointTy())
+      return Ret;
+
+    Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
+    if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+      if (Op1C->isExactlyValue(1.0))  // pow(1.0, x) -> 1.0
+        return Op1C;
+      if (Op1C->isExactlyValue(2.0))  // pow(2.0, x) -> exp2(x)
+        return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes());
+    }
+
+    ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
+    if (Op2C == 0) return Ret;
+
+    if (Op2C->getValueAPF().isZero())  // pow(x, 0.0) -> 1.0
+      return ConstantFP::get(CI->getType(), 1.0);
+
+    if (Op2C->isExactlyValue(0.5)) {
+      // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
+      // This is faster than calling pow, and still handles negative zero
+      // and negative infinity correctly.
+      // TODO: In fast-math mode, this could be just sqrt(x).
+      // TODO: In finite-only mode, this could be just fabs(sqrt(x)).
+      Value *Inf = ConstantFP::getInfinity(CI->getType());
+      Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
+      Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B,
+                                         Callee->getAttributes());
+      Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B,
+                                         Callee->getAttributes());
+      Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
+      Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
+      return Sel;
+    }
+
+    if (Op2C->isExactlyValue(1.0))  // pow(x, 1.0) -> x
+      return Op1;
+    if (Op2C->isExactlyValue(2.0))  // pow(x, 2.0) -> x*x
+      return B.CreateFMul(Op1, Op1, "pow2");
+    if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
+      return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0),
+                          Op1, "powrecip");
+    return 0;
+  }
+};
+
+struct Exp2Opt : public UnsafeFPLibCallOptimization {
+  Exp2Opt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "exp2" &&
+        TLI->has(LibFunc::exp2)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
+    }
+
+    FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 1 argument of FP type, which matches the
+    // result type.
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isFloatingPointTy())
+      return Ret;
+
+    Value *Op = CI->getArgOperand(0);
+    // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x))  if sizeof(x) <= 32
+    // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x))  if sizeof(x) < 32
+    Value *LdExpArg = 0;
+    if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
+        LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty());
+    } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
+        LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty());
+    }
+
+    if (LdExpArg) {
+      const char *Name;
+      if (Op->getType()->isFloatTy())
+        Name = "ldexpf";
+      else if (Op->getType()->isDoubleTy())
+        Name = "ldexp";
+      else
+        Name = "ldexpl";
+
+      Constant *One = ConstantFP::get(*Context, APFloat(1.0f));
+      if (!Op->getType()->isFloatTy())
+        One = ConstantExpr::getFPExtend(One, Op->getType());
+
+      Module *M = Caller->getParent();
+      Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
+                                             Op->getType(),
+                                             B.getInt32Ty(), NULL);
+      CallInst *CI = B.CreateCall2(Callee, One, LdExpArg);
+      if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
+        CI->setCallingConv(F->getCallingConv());
+
+      return CI;
+    }
+    return Ret;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Integer Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct FFSOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 2 arguments of the same FP type, which match the
+    // result type.
+    if (FT->getNumParams() != 1 ||
+        !FT->getReturnType()->isIntegerTy(32) ||
+        !FT->getParamType(0)->isIntegerTy())
+      return 0;
+
+    Value *Op = CI->getArgOperand(0);
+
+    // Constant fold.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+      if (CI->isZero()) // ffs(0) -> 0.
+        return B.getInt32(0);
+      // ffs(c) -> cttz(c)+1
+      return B.getInt32(CI->getValue().countTrailingZeros() + 1);
+    }
+
+    // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+    Type *ArgType = Op->getType();
+    Value *F = Intrinsic::getDeclaration(Callee->getParent(),
+                                         Intrinsic::cttz, ArgType);
+    Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz");
+    V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
+    V = B.CreateIntCast(V, B.getInt32Ty(), false);
+
+    Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
+    return B.CreateSelect(Cond, V, B.getInt32(0));
+  }
+};
+
+struct AbsOpt : public LibCallOptimization {
+  virtual bool ignoreCallingConv() { return true; }
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // We require integer(integer) where the types agree.
+    if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+        FT->getParamType(0) != FT->getReturnType())
+      return 0;
+
+    // abs(x) -> x >s -1 ? x : -x
+    Value *Op = CI->getArgOperand(0);
+    Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()),
+                                 "ispos");
+    Value *Neg = B.CreateNeg(Op, "neg");
+    return B.CreateSelect(Pos, Op, Neg);
+  }
+};
+
+struct IsDigitOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // We require integer(i32)
+    if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+        !FT->getParamType(0)->isIntegerTy(32))
+      return 0;
+
+    // isdigit(c) -> (c-'0') <u 10
+    Value *Op = CI->getArgOperand(0);
+    Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
+    Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
+    return B.CreateZExt(Op, CI->getType());
+  }
+};
+
+struct IsAsciiOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // We require integer(i32)
+    if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+        !FT->getParamType(0)->isIntegerTy(32))
+      return 0;
+
+    // isascii(c) -> c <u 128
+    Value *Op = CI->getArgOperand(0);
+    Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
+    return B.CreateZExt(Op, CI->getType());
+  }
+};
+
+struct ToAsciiOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // We require i32(i32)
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isIntegerTy(32))
+      return 0;
+
+    // toascii(c) -> c & 0x7f
+    return B.CreateAnd(CI->getArgOperand(0),
+                       ConstantInt::get(CI->getType(),0x7F));
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Formatting and IO Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct PrintFOpt : public LibCallOptimization {
+  Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
+                                   IRBuilder<> &B) {
+    // Check for a fixed format string.
+    StringRef FormatStr;
+    if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
+      return 0;
+
+    // Empty format string -> noop.
+    if (FormatStr.empty())  // Tolerate printf's declared void.
+      return CI->use_empty() ? (Value*)CI :
+                               ConstantInt::get(CI->getType(), 0);
+
+    // Do not do any of the following transformations if the printf return value
+    // is used, in general the printf return value is not compatible with either
+    // putchar() or puts().
+    if (!CI->use_empty())
+      return 0;
+
+    // printf("x") -> putchar('x'), even for '%'.
+    if (FormatStr.size() == 1) {
+      Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI);
+      if (CI->use_empty() || !Res) return Res;
+      return B.CreateIntCast(Res, CI->getType(), true);
+    }
+
+    // printf("foo\n") --> puts("foo")
+    if (FormatStr[FormatStr.size()-1] == '\n' &&
+        FormatStr.find('%') == std::string::npos) {  // no format characters.
+      // Create a string literal with no \n on it.  We expect the constant merge
+      // pass to be run after this pass, to merge duplicate strings.
+      FormatStr = FormatStr.drop_back();
+      Value *GV = B.CreateGlobalString(FormatStr, "str");
+      Value *NewCI = EmitPutS(GV, B, TD, TLI);
+      return (CI->use_empty() || !NewCI) ?
+              NewCI :
+              ConstantInt::get(CI->getType(), FormatStr.size()+1);
+    }
+
+    // Optimize specific format strings.
+    // printf("%c", chr) --> putchar(chr)
+    if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
+        CI->getArgOperand(1)->getType()->isIntegerTy()) {
+      Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI);
+
+      if (CI->use_empty() || !Res) return Res;
+      return B.CreateIntCast(Res, CI->getType(), true);
+    }
+
+    // printf("%s\n", str) --> puts(str)
+    if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
+        CI->getArgOperand(1)->getType()->isPointerTy()) {
+      return EmitPutS(CI->getArgOperand(1), B, TD, TLI);
+    }
+    return 0;
+  }
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require one fixed pointer argument and an integer/void result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
+        !(FT->getReturnType()->isIntegerTy() ||
+          FT->getReturnType()->isVoidTy()))
+      return 0;
+
+    if (Value *V = optimizeFixedFormatString(Callee, CI, B)) {
+      return V;
+    }
+
+    // printf(format, ...) -> iprintf(format, ...) if no floating point
+    // arguments.
+    if (TLI->has(LibFunc::iprintf) && !callHasFloatingPointArgument(CI)) {
+      Module *M = B.GetInsertBlock()->getParent()->getParent();
+      Constant *IPrintFFn =
+        M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
+      CallInst *New = cast<CallInst>(CI->clone());
+      New->setCalledFunction(IPrintFFn);
+      B.Insert(New);
+      return New;
+    }
+    return 0;
+  }
+};
+
+struct SPrintFOpt : public LibCallOptimization {
+  Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
+                                   IRBuilder<> &B) {
+    // Check for a fixed format string.
+    StringRef FormatStr;
+    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+      return 0;
+
+    // If we just have a format string (nothing else crazy) transform it.
+    if (CI->getNumArgOperands() == 2) {
+      // Make sure there's no % in the constant array.  We could try to handle
+      // %% -> % in the future if we cared.
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')
+          return 0; // we found a format specifier, bail out.
+
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
+      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                     ConstantInt::get(TD->getIntPtrType(*Context), // Copy the
+                                      FormatStr.size() + 1), 1);   // nul byte.
+      return ConstantInt::get(CI->getType(), FormatStr.size());
+    }
+
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+        CI->getNumArgOperands() < 3)
+      return 0;
+
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
+      Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
+      Value *Ptr = CastToCStr(CI->getArgOperand(0), B);
+      B.CreateStore(V, Ptr);
+      Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul");
+      B.CreateStore(B.getInt8(0), Ptr);
+
+      return ConstantInt::get(CI->getType(), 1);
+    }
+
+    if (FormatStr[1] == 's') {
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
+      if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0;
+
+      Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI);
+      if (!Len)
+        return 0;
+      Value *IncLen = B.CreateAdd(Len,
+                                  ConstantInt::get(Len->getType(), 1),
+                                  "leninc");
+      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
+
+      // The sprintf result is the unincremented number of bytes in the string.
+      return B.CreateIntCast(Len, CI->getType(), false);
+    }
+    return 0;
+  }
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed pointer arguments and an integer result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
+      return V;
+    }
+
+    // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
+    // point arguments.
+    if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) {
+      Module *M = B.GetInsertBlock()->getParent()->getParent();
+      Constant *SIPrintFFn =
+        M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
+      CallInst *New = cast<CallInst>(CI->clone());
+      New->setCalledFunction(SIPrintFFn);
+      B.Insert(New);
+      return New;
+    }
+    return 0;
+  }
+};
+
+struct FPrintFOpt : public LibCallOptimization {
+  Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
+                                   IRBuilder<> &B) {
+    // All the optimizations depend on the format string.
+    StringRef FormatStr;
+    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+      return 0;
+
+    // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+    if (CI->getNumArgOperands() == 2) {
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')  // Could handle %% -> % if we cared.
+          return 0; // We found a format specifier.
+
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      Value *NewCI = EmitFWrite(CI->getArgOperand(1),
+                                ConstantInt::get(TD->getIntPtrType(*Context),
+                                                 FormatStr.size()),
+                                CI->getArgOperand(0), B, TD, TLI);
+      return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0;
+    }
+
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+        CI->getNumArgOperands() < 3)
+      return 0;
+
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // fprintf(F, "%c", chr) --> fputc(chr, F)
+      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
+      Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B,
+                               TD, TLI);
+      return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
+    }
+
+    if (FormatStr[1] == 's') {
+      // fprintf(F, "%s", str) --> fputs(str, F)
+      if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty())
+        return 0;
+      return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
+    }
+    return 0;
+  }
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed paramters as pointers and integer result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    if (Value *V = optimizeFixedFormatString(Callee, CI, B)) {
+      return V;
+    }
+
+    // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
+    // floating point arguments.
+    if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) {
+      Module *M = B.GetInsertBlock()->getParent()->getParent();
+      Constant *FIPrintFFn =
+        M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
+      CallInst *New = cast<CallInst>(CI->clone());
+      New->setCalledFunction(FIPrintFFn);
+      B.Insert(New);
+      return New;
+    }
+    return 0;
+  }
+};
+
+struct FWriteOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require a pointer, an integer, an integer, a pointer, returning integer.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isIntegerTy() ||
+        !FT->getParamType(2)->isIntegerTy() ||
+        !FT->getParamType(3)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    // Get the element size and count.
+    ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+    ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+    if (!SizeC || !CountC) return 0;
+    uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
+
+    // If this is writing zero records, remove the call (it's a noop).
+    if (Bytes == 0)
+      return ConstantInt::get(CI->getType(), 0);
+
+    // If this is writing one byte, turn it into fputc.
+    // This optimisation is only valid, if the return value is unused.
+    if (Bytes == 1 && CI->use_empty()) {  // fwrite(S,1,1,F) -> fputc(S[0],F)
+      Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
+      Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI);
+      return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
+    }
+
+    return 0;
+  }
+};
+
+struct FPutsOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // Require two pointers.  Also, we can't optimize if return value is used.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !CI->use_empty())
+      return 0;
+
+    // fputs(s,F) --> fwrite(s,1,strlen(s),F)
+    uint64_t Len = GetStringLength(CI->getArgOperand(0));
+    if (!Len) return 0;
+    // Known to have no uses (see above).
+    return EmitFWrite(CI->getArgOperand(0),
+                      ConstantInt::get(TD->getIntPtrType(*Context), Len-1),
+                      CI->getArgOperand(1), B, TD, TLI);
+  }
+};
+
+struct PutsOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require one fixed pointer argument and an integer/void result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
+        !(FT->getReturnType()->isIntegerTy() ||
+          FT->getReturnType()->isVoidTy()))
+      return 0;
+
+    // Check for a constant string.
+    StringRef Str;
+    if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+      return 0;
+
+    if (Str.empty() && CI->use_empty()) {
+      // puts("") -> putchar('\n')
+      Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI);
+      if (CI->use_empty() || !Res) return Res;
+      return B.CreateIntCast(Res, CI->getType(), true);
+    }
+
+    return 0;
+  }
+};
+
+} // End anonymous namespace.
+
+namespace llvm {
+
+class LibCallSimplifierImpl {
+  const DataLayout *TD;
+  const TargetLibraryInfo *TLI;
+  const LibCallSimplifier *LCS;
+  bool UnsafeFPShrink;
+
+  // Math library call optimizations.
+  CosOpt Cos;
+  PowOpt Pow;
+  Exp2Opt Exp2;
+public:
+  LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI,
+                        const LibCallSimplifier *LCS,
+                        bool UnsafeFPShrink = false)
+    : Cos(UnsafeFPShrink), Pow(UnsafeFPShrink), Exp2(UnsafeFPShrink) {
+    this->TD = TD;
+    this->TLI = TLI;
+    this->LCS = LCS;
+    this->UnsafeFPShrink = UnsafeFPShrink;
+  }
+
+  Value *optimizeCall(CallInst *CI);
+  LibCallOptimization *lookupOptimization(CallInst *CI);
+  bool hasFloatVersion(StringRef FuncName);
+};
+
+bool LibCallSimplifierImpl::hasFloatVersion(StringRef FuncName) {
+  LibFunc::Func Func;
+  SmallString<20> FloatFuncName = FuncName;
+  FloatFuncName += 'f';
+  if (TLI->getLibFunc(FloatFuncName, Func))
+    return TLI->has(Func);
+  return false;
+}
+
+// Fortified library call optimizations.
+static MemCpyChkOpt MemCpyChk;
+static MemMoveChkOpt MemMoveChk;
+static MemSetChkOpt MemSetChk;
+static StrCpyChkOpt StrCpyChk;
+static StpCpyChkOpt StpCpyChk;
+static StrNCpyChkOpt StrNCpyChk;
+
+// String library call optimizations.
+static StrCatOpt StrCat;
+static StrNCatOpt StrNCat;
+static StrChrOpt StrChr;
+static StrRChrOpt StrRChr;
+static StrCmpOpt StrCmp;
+static StrNCmpOpt StrNCmp;
+static StrCpyOpt StrCpy;
+static StpCpyOpt StpCpy;
+static StrNCpyOpt StrNCpy;
+static StrLenOpt StrLen;
+static StrPBrkOpt StrPBrk;
+static StrToOpt StrTo;
+static StrSpnOpt StrSpn;
+static StrCSpnOpt StrCSpn;
+static StrStrOpt StrStr;
+
+// Memory library call optimizations.
+static MemCmpOpt MemCmp;
+static MemCpyOpt MemCpy;
+static MemMoveOpt MemMove;
+static MemSetOpt MemSet;
+
+// Math library call optimizations.
+static UnaryDoubleFPOpt UnaryDoubleFP(false);
+static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+
+  // Integer library call optimizations.
+static FFSOpt FFS;
+static AbsOpt Abs;
+static IsDigitOpt IsDigit;
+static IsAsciiOpt IsAscii;
+static ToAsciiOpt ToAscii;
+
+// Formatting and IO library call optimizations.
+static PrintFOpt PrintF;
+static SPrintFOpt SPrintF;
+static FPrintFOpt FPrintF;
+static FWriteOpt FWrite;
+static FPutsOpt FPuts;
+static PutsOpt Puts;
+
+LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) {
+  LibFunc::Func Func;
+  Function *Callee = CI->getCalledFunction();
+  StringRef FuncName = Callee->getName();
+
+  // Next check for intrinsics.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::pow:
+       return &Pow;
+    case Intrinsic::exp2:
+       return &Exp2;
+    default:
+       return 0;
+    }
+  }
+
+  // Then check for known library functions.
+  if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) {
+    switch (Func) {
+      case LibFunc::strcat:
+        return &StrCat;
+      case LibFunc::strncat:
+        return &StrNCat;
+      case LibFunc::strchr:
+        return &StrChr;
+      case LibFunc::strrchr:
+        return &StrRChr;
+      case LibFunc::strcmp:
+        return &StrCmp;
+      case LibFunc::strncmp:
+        return &StrNCmp;
+      case LibFunc::strcpy:
+        return &StrCpy;
+      case LibFunc::stpcpy:
+        return &StpCpy;
+      case LibFunc::strncpy:
+        return &StrNCpy;
+      case LibFunc::strlen:
+        return &StrLen;
+      case LibFunc::strpbrk:
+        return &StrPBrk;
+      case LibFunc::strtol:
+      case LibFunc::strtod:
+      case LibFunc::strtof:
+      case LibFunc::strtoul:
+      case LibFunc::strtoll:
+      case LibFunc::strtold:
+      case LibFunc::strtoull:
+        return &StrTo;
+      case LibFunc::strspn:
+        return &StrSpn;
+      case LibFunc::strcspn:
+        return &StrCSpn;
+      case LibFunc::strstr:
+        return &StrStr;
+      case LibFunc::memcmp:
+        return &MemCmp;
+      case LibFunc::memcpy:
+        return &MemCpy;
+      case LibFunc::memmove:
+        return &MemMove;
+      case LibFunc::memset:
+        return &MemSet;
+      case LibFunc::cosf:
+      case LibFunc::cos:
+      case LibFunc::cosl:
+        return &Cos;
+      case LibFunc::powf:
+      case LibFunc::pow:
+      case LibFunc::powl:
+        return &Pow;
+      case LibFunc::exp2l:
+      case LibFunc::exp2:
+      case LibFunc::exp2f:
+        return &Exp2;
+      case LibFunc::ffs:
+      case LibFunc::ffsl:
+      case LibFunc::ffsll:
+        return &FFS;
+      case LibFunc::abs:
+      case LibFunc::labs:
+      case LibFunc::llabs:
+        return &Abs;
+      case LibFunc::isdigit:
+        return &IsDigit;
+      case LibFunc::isascii:
+        return &IsAscii;
+      case LibFunc::toascii:
+        return &ToAscii;
+      case LibFunc::printf:
+        return &PrintF;
+      case LibFunc::sprintf:
+        return &SPrintF;
+      case LibFunc::fprintf:
+        return &FPrintF;
+      case LibFunc::fwrite:
+        return &FWrite;
+      case LibFunc::fputs:
+        return &FPuts;
+      case LibFunc::puts:
+        return &Puts;
+      case LibFunc::ceil:
+      case LibFunc::fabs:
+      case LibFunc::floor:
+      case LibFunc::rint:
+      case LibFunc::round:
+      case LibFunc::nearbyint:
+      case LibFunc::trunc:
+        if (hasFloatVersion(FuncName))
+          return &UnaryDoubleFP;
+        return 0;
+      case LibFunc::acos:
+      case LibFunc::acosh:
+      case LibFunc::asin:
+      case LibFunc::asinh:
+      case LibFunc::atan:
+      case LibFunc::atanh:
+      case LibFunc::cbrt:
+      case LibFunc::cosh:
+      case LibFunc::exp:
+      case LibFunc::exp10:
+      case LibFunc::expm1:
+      case LibFunc::log:
+      case LibFunc::log10:
+      case LibFunc::log1p:
+      case LibFunc::log2:
+      case LibFunc::logb:
+      case LibFunc::sin:
+      case LibFunc::sinh:
+      case LibFunc::sqrt:
+      case LibFunc::tan:
+      case LibFunc::tanh:
+        if (UnsafeFPShrink && hasFloatVersion(FuncName))
+         return &UnsafeUnaryDoubleFP;
+        return 0;
+      case LibFunc::memcpy_chk:
+        return &MemCpyChk;
+      default:
+        return 0;
+      }
+  }
+
+  // Finally check for fortified library calls.
+  if (FuncName.endswith("_chk")) {
+    if (FuncName == "__memmove_chk")
+      return &MemMoveChk;
+    else if (FuncName == "__memset_chk")
+      return &MemSetChk;
+    else if (FuncName == "__strcpy_chk")
+      return &StrCpyChk;
+    else if (FuncName == "__stpcpy_chk")
+      return &StpCpyChk;
+    else if (FuncName == "__strncpy_chk")
+      return &StrNCpyChk;
+    else if (FuncName == "__stpncpy_chk")
+      return &StrNCpyChk;
+  }
+
+  return 0;
+
+}
+
+Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
+  LibCallOptimization *LCO = lookupOptimization(CI);
+  if (LCO) {
+    IRBuilder<> Builder(CI);
+    return LCO->optimizeCall(CI, TD, TLI, LCS, Builder);
+  }
+  return 0;
+}
+
+LibCallSimplifier::LibCallSimplifier(const DataLayout *TD,
+                                     const TargetLibraryInfo *TLI,
+                                     bool UnsafeFPShrink) {
+  Impl = new LibCallSimplifierImpl(TD, TLI, this, UnsafeFPShrink);
+}
+
+LibCallSimplifier::~LibCallSimplifier() {
+  delete Impl;
+}
+
+Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
+  if (CI->hasFnAttr(Attribute::NoBuiltin)) return 0;
+  return Impl->optimizeCall(CI);
+}
+
+void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const {
+  I->replaceAllUsesWith(With);
+  I->eraseFromParent();
+}
+
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
new file mode 100644
index 000000000000..560f58160753
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -0,0 +1,122 @@
+//===- UnifyFunctionExitNodes.cpp - Make all functions have a single exit -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is used to ensure that functions have at most one return
+// instruction in them.  Additionally, it keeps track of which node is the new
+// exit node of the CFG.  If there are no exit nodes in the CFG, the getExitNode
+// method will return a null pointer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+char UnifyFunctionExitNodes::ID = 0;
+INITIALIZE_PASS(UnifyFunctionExitNodes, "mergereturn",
+                "Unify function exit nodes", false, false)
+
+Pass *llvm::createUnifyFunctionExitNodesPass() {
+  return new UnifyFunctionExitNodes();
+}
+
+void UnifyFunctionExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+  // We preserve the non-critical-edgeness property
+  AU.addPreservedID(BreakCriticalEdgesID);
+  // This is a cluster of orthogonal Transforms
+  AU.addPreserved("mem2reg");
+  AU.addPreservedID(LowerSwitchID);
+}
+
+// UnifyAllExitNodes - Unify all exit nodes of the CFG by creating a new
+// BasicBlock, and converting all returns to unconditional branches to this
+// new basic block.  The singular exit node is returned.
+//
+// If there are no return stmts in the Function, a null pointer is returned.
+//
+bool UnifyFunctionExitNodes::runOnFunction(Function &F) {
+  // Loop over all of the blocks in a function, tracking all of the blocks that
+  // return.
+  //
+  std::vector<BasicBlock*> ReturningBlocks;
+  std::vector<BasicBlock*> UnreachableBlocks;
+  for(Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+    if (isa<ReturnInst>(I->getTerminator()))
+      ReturningBlocks.push_back(I);
+    else if (isa<UnreachableInst>(I->getTerminator()))
+      UnreachableBlocks.push_back(I);
+
+  // Then unreachable blocks.
+  if (UnreachableBlocks.empty()) {
+    UnreachableBlock = 0;
+  } else if (UnreachableBlocks.size() == 1) {
+    UnreachableBlock = UnreachableBlocks.front();
+  } else {
+    UnreachableBlock = BasicBlock::Create(F.getContext(), 
+                                          "UnifiedUnreachableBlock", &F);
+    new UnreachableInst(F.getContext(), UnreachableBlock);
+
+    for (std::vector<BasicBlock*>::iterator I = UnreachableBlocks.begin(),
+           E = UnreachableBlocks.end(); I != E; ++I) {
+      BasicBlock *BB = *I;
+      BB->getInstList().pop_back();  // Remove the unreachable inst.
+      BranchInst::Create(UnreachableBlock, BB);
+    }
+  }
+
+  // Now handle return blocks.
+  if (ReturningBlocks.empty()) {
+    ReturnBlock = 0;
+    return false;                          // No blocks return
+  } else if (ReturningBlocks.size() == 1) {
+    ReturnBlock = ReturningBlocks.front(); // Already has a single return block
+    return false;
+  }
+
+  // Otherwise, we need to insert a new basic block into the function, add a PHI
+  // nodes (if the function returns values), and convert all of the return
+  // instructions into unconditional branches.
+  //
+  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(),
+                                               "UnifiedReturnBlock", &F);
+
+  PHINode *PN = 0;
+  if (F.getReturnType()->isVoidTy()) {
+    ReturnInst::Create(F.getContext(), NULL, NewRetBlock);
+  } else {
+    // If the function doesn't return void... add a PHI node to the block...
+    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+                         "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+  }
+
+  // Loop over all of the blocks, replacing the return instruction with an
+  // unconditional branch.
+  //
+  for (std::vector<BasicBlock*>::iterator I = ReturningBlocks.begin(),
+         E = ReturningBlocks.end(); I != E; ++I) {
+    BasicBlock *BB = *I;
+
+    // Add an incoming element to the PHI node for every return instruction that
+    // is merging into this new block...
+    if (PN)
+      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+    BB->getInstList().pop_back();  // Remove the return insn
+    BranchInst::Create(NewRetBlock, BB);
+  }
+  ReturnBlock = NewRetBlock;
+  return true;
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
new file mode 100644
index 000000000000..5812d4607dfc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
@@ -0,0 +1,38 @@
+//===-- Utils.cpp - TransformUtils Infrastructure -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// TransformUtils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm-c/Initialization.h"
+
+using namespace llvm;
+
+/// initializeTransformUtils - Initialize all passes in the TransformUtils
+/// library.
+void llvm::initializeTransformUtils(PassRegistry &Registry) {
+  initializeBreakCriticalEdgesPass(Registry);
+  initializeInstNamerPass(Registry);
+  initializeLCSSAPass(Registry);
+  initializeLoopSimplifyPass(Registry);
+  initializeLowerInvokePass(Registry);
+  initializeLowerSwitchPass(Registry);
+  initializePromotePassPass(Registry);
+  initializeUnifyFunctionExitNodesPass(Registry);
+  initializeInstSimplifierPass(Registry);
+  initializeMetaRenamerPass(Registry);
+}
+
+/// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses.
+void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) {
+  initializeTransformUtils(*unwrap(R));
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
new file mode 100644
index 000000000000..b5941bdf2411
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -0,0 +1,215 @@
+//===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MapValue function, which is shared by various parts of
+// the lib/Transforms/Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+using namespace llvm;
+
+// Out of line method to get vtable etc for class.
+void ValueMapTypeRemapper::anchor() {}
+
+Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
+                      ValueMapTypeRemapper *TypeMapper) {
+  ValueToValueMapTy::iterator I = VM.find(V);
+  
+  // If the value already exists in the map, use it.
+  if (I != VM.end() && I->second) return I->second;
+  
+  // Global values do not need to be seeded into the VM if they
+  // are using the identity mapping.
+  if (isa<GlobalValue>(V) || isa<MDString>(V))
+    return VM[V] = const_cast<Value*>(V);
+  
+  if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+    // Inline asm may need *type* remapping.
+    FunctionType *NewTy = IA->getFunctionType();
+    if (TypeMapper) {
+      NewTy = cast<FunctionType>(TypeMapper->remapType(NewTy));
+
+      if (NewTy != IA->getFunctionType())
+        V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(),
+                           IA->hasSideEffects(), IA->isAlignStack());
+    }
+    
+    return VM[V] = const_cast<Value*>(V);
+  }
+  
+
+  if (const MDNode *MD = dyn_cast<MDNode>(V)) {
+    // If this is a module-level metadata and we know that nothing at the module
+    // level is changing, then use an identity mapping.
+    if (!MD->isFunctionLocal() && (Flags & RF_NoModuleLevelChanges))
+      return VM[V] = const_cast<Value*>(V);
+    
+    // Create a dummy node in case we have a metadata cycle.
+    MDNode *Dummy = MDNode::getTemporary(V->getContext(), ArrayRef<Value*>());
+    VM[V] = Dummy;
+    
+    // Check all operands to see if any need to be remapped.
+    for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) {
+      Value *OP = MD->getOperand(i);
+      if (OP == 0) continue;
+      Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper);
+      // Use identity map if Mapped_Op is null and we can ignore missing
+      // entries.
+      if (Mapped_OP == OP ||
+          (Mapped_OP == 0 && (Flags & RF_IgnoreMissingEntries)))
+        continue;
+
+      // Ok, at least one operand needs remapping.  
+      SmallVector<Value*, 4> Elts;
+      Elts.reserve(MD->getNumOperands());
+      for (i = 0; i != e; ++i) {
+        Value *Op = MD->getOperand(i);
+        if (Op == 0)
+          Elts.push_back(0);
+        else {
+          Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper);
+          // Use identity map if Mapped_Op is null and we can ignore missing
+          // entries.
+          if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries))
+            Mapped_Op = Op;
+          Elts.push_back(Mapped_Op);
+        }
+      }
+      MDNode *NewMD = MDNode::get(V->getContext(), Elts);
+      Dummy->replaceAllUsesWith(NewMD);
+      VM[V] = NewMD;
+      MDNode::deleteTemporary(Dummy);
+      return NewMD;
+    }
+
+    VM[V] = const_cast<Value*>(V);
+    MDNode::deleteTemporary(Dummy);
+
+    // No operands needed remapping.  Use an identity mapping.
+    return const_cast<Value*>(V);
+  }
+
+  // Okay, this either must be a constant (which may or may not be mappable) or
+  // is something that is not in the mapping table.
+  Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V));
+  if (C == 0)
+    return 0;
+  
+  if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) {
+    Function *F = 
+      cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper));
+    BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(), VM,
+                                                       Flags, TypeMapper));
+    return VM[V] = BlockAddress::get(F, BB ? BB : BA->getBasicBlock());
+  }
+  
+  // Otherwise, we have some other constant to remap.  Start by checking to see
+  // if all operands have an identity remapping.
+  unsigned OpNo = 0, NumOperands = C->getNumOperands();
+  Value *Mapped = 0;
+  for (; OpNo != NumOperands; ++OpNo) {
+    Value *Op = C->getOperand(OpNo);
+    Mapped = MapValue(Op, VM, Flags, TypeMapper);
+    if (Mapped != C) break;
+  }
+  
+  // See if the type mapper wants to remap the type as well.
+  Type *NewTy = C->getType();
+  if (TypeMapper)
+    NewTy = TypeMapper->remapType(NewTy);
+
+  // If the result type and all operands match up, then just insert an identity
+  // mapping.
+  if (OpNo == NumOperands && NewTy == C->getType())
+    return VM[V] = C;
+  
+  // Okay, we need to create a new constant.  We've already processed some or
+  // all of the operands, set them all up now.
+  SmallVector<Constant*, 8> Ops;
+  Ops.reserve(NumOperands);
+  for (unsigned j = 0; j != OpNo; ++j)
+    Ops.push_back(cast<Constant>(C->getOperand(j)));
+  
+  // If one of the operands mismatch, push it and the other mapped operands.
+  if (OpNo != NumOperands) {
+    Ops.push_back(cast<Constant>(Mapped));
+  
+    // Map the rest of the operands that aren't processed yet.
+    for (++OpNo; OpNo != NumOperands; ++OpNo)
+      Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM,
+                             Flags, TypeMapper));
+  }
+  
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+    return VM[V] = CE->getWithOperands(Ops, NewTy);
+  if (isa<ConstantArray>(C))
+    return VM[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops);
+  if (isa<ConstantStruct>(C))
+    return VM[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops);
+  if (isa<ConstantVector>(C))
+    return VM[V] = ConstantVector::get(Ops);
+  // If this is a no-operand constant, it must be because the type was remapped.
+  if (isa<UndefValue>(C))
+    return VM[V] = UndefValue::get(NewTy);
+  if (isa<ConstantAggregateZero>(C))
+    return VM[V] = ConstantAggregateZero::get(NewTy);
+  assert(isa<ConstantPointerNull>(C));
+  return VM[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
+}
+
+/// RemapInstruction - Convert the instruction operands from referencing the
+/// current values into those specified by VMap.
+///
+void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
+                            RemapFlags Flags, ValueMapTypeRemapper *TypeMapper){
+  // Remap operands.
+  for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) {
+    Value *V = MapValue(*op, VMap, Flags, TypeMapper);
+    // If we aren't ignoring missing entries, assert that something happened.
+    if (V != 0)
+      *op = V;
+    else
+      assert((Flags & RF_IgnoreMissingEntries) &&
+             "Referenced value not in value map!");
+  }
+
+  // Remap phi nodes' incoming blocks.
+  if (PHINode *PN = dyn_cast<PHINode>(I)) {
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      Value *V = MapValue(PN->getIncomingBlock(i), VMap, Flags);
+      // If we aren't ignoring missing entries, assert that something happened.
+      if (V != 0)
+        PN->setIncomingBlock(i, cast<BasicBlock>(V));
+      else
+        assert((Flags & RF_IgnoreMissingEntries) &&
+               "Referenced block not in value map!");
+    }
+  }
+
+  // Remap attached metadata.
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  I->getAllMetadata(MDs);
+  for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator
+       MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) {
+    MDNode *Old = MI->second;
+    MDNode *New = MapValue(Old, VMap, Flags, TypeMapper);
+    if (New != Old)
+      I->setMetadata(MI->first, New);
+  }
+  
+  // If the instruction's type is being remapped, do so now.
+  if (TypeMapper)
+    I->mutateType(TypeMapper->remapType(I->getType()));
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
new file mode 100644
index 000000000000..17900dabbefe
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -0,0 +1,3183 @@
+//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a basic-block vectorization pass. The algorithm was
+// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
+// et al. It works by looking for chains of pairable operations and then
+// pairing them.
+//
+//===----------------------------------------------------------------------===//
+
+#define BBV_NAME "bb-vectorize"
+#define DEBUG_TYPE BBV_NAME
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+static cl::opt<bool>
+IgnoreTargetInfo("bb-vectorize-ignore-target-info",  cl::init(false),
+  cl::Hidden, cl::desc("Ignore target information"));
+
+static cl::opt<unsigned>
+ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
+  cl::desc("The required chain depth for vectorization"));
+
+static cl::opt<bool>
+UseChainDepthWithTI("bb-vectorize-use-chain-depth",  cl::init(false),
+  cl::Hidden, cl::desc("Use the chain depth requirement with"
+                       " target information"));
+
+static cl::opt<unsigned>
+SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
+  cl::desc("The maximum search distance for instruction pairs"));
+
+static cl::opt<bool>
+SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
+  cl::desc("Replicating one element to a pair breaks the chain"));
+
+static cl::opt<unsigned>
+VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
+  cl::desc("The size of the native vector registers"));
+
+static cl::opt<unsigned>
+MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
+  cl::desc("The maximum number of pairing iterations"));
+
+static cl::opt<bool>
+Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to form non-2^n-length vectors"));
+
+static cl::opt<unsigned>
+MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
+  cl::desc("The maximum number of pairable instructions per group"));
+
+static cl::opt<unsigned>
+MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
+  cl::desc("The maximum number of candidate instruction pairs per group"));
+
+static cl::opt<unsigned>
+MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
+  cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
+                       " a full cycle check"));
+
+static cl::opt<bool>
+NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize boolean (i1) values"));
+
+static cl::opt<bool>
+NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize integer values"));
+
+static cl::opt<bool>
+NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize floating-point values"));
+
+// FIXME: This should default to false once pointer vector support works.
+static cl::opt<bool>
+NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
+  cl::desc("Don't try to vectorize pointer values"));
+
+static cl::opt<bool>
+NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize casting (conversion) operations"));
+
+static cl::opt<bool>
+NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize floating-point math intrinsics"));
+
+static cl::opt<bool>
+NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
+
+static cl::opt<bool>
+NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize select instructions"));
+
+static cl::opt<bool>
+NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize comparison instructions"));
+
+static cl::opt<bool>
+NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize getelementptr instructions"));
+
+static cl::opt<bool>
+NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize loads and stores"));
+
+static cl::opt<bool>
+AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
+  cl::desc("Only generate aligned loads and stores"));
+
+static cl::opt<bool>
+NoMemOpBoost("bb-vectorize-no-mem-op-boost",
+  cl::init(false), cl::Hidden,
+  cl::desc("Don't boost the chain-depth contribution of loads and stores"));
+
+static cl::opt<bool>
+FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
+  cl::desc("Use a fast instruction dependency analysis"));
+
+#ifndef NDEBUG
+static cl::opt<bool>
+DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, output information on the"
+           " instruction-examination process"));
+static cl::opt<bool>
+DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, output information on the"
+           " candidate-selection process"));
+static cl::opt<bool>
+DebugPairSelection("bb-vectorize-debug-pair-selection",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, output information on the"
+           " pair-selection process"));
+static cl::opt<bool>
+DebugCycleCheck("bb-vectorize-debug-cycle-check",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, output information on the"
+           " cycle-checking process"));
+
+static cl::opt<bool>
+PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, dump the basic block after"
+           " every pair is fused"));
+#endif
+
+STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
+
+namespace {
+  struct BBVectorize : public BasicBlockPass {
+    static char ID; // Pass identification, replacement for typeid
+
+    const VectorizeConfig Config;
+
+    BBVectorize(const VectorizeConfig &C = VectorizeConfig())
+      : BasicBlockPass(ID), Config(C) {
+      initializeBBVectorizePass(*PassRegistry::getPassRegistry());
+    }
+
+    BBVectorize(Pass *P, const VectorizeConfig &C)
+      : BasicBlockPass(ID), Config(C) {
+      AA = &P->getAnalysis<AliasAnalysis>();
+      DT = &P->getAnalysis<DominatorTree>();
+      SE = &P->getAnalysis<ScalarEvolution>();
+      TD = P->getAnalysisIfAvailable<DataLayout>();
+      TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>();
+    }
+
+    typedef std::pair<Value *, Value *> ValuePair;
+    typedef std::pair<ValuePair, int> ValuePairWithCost;
+    typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
+    typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+    typedef std::pair<VPPair, unsigned> VPPairWithType;
+
+    AliasAnalysis *AA;
+    DominatorTree *DT;
+    ScalarEvolution *SE;
+    DataLayout *TD;
+    const TargetTransformInfo *TTI;
+
+    // FIXME: const correct?
+
+    bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
+
+    bool getCandidatePairs(BasicBlock &BB,
+                       BasicBlock::iterator &Start,
+                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+                       DenseSet<ValuePair> &FixedOrderPairs,
+                       DenseMap<ValuePair, int> &CandidatePairCostSavings,
+                       std::vector<Value *> &PairableInsts, bool NonPow2Len);
+
+    // FIXME: The current implementation does not account for pairs that
+    // are connected in multiple ways. For example:
+    //   C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
+    enum PairConnectionType {
+      PairConnectionDirect,
+      PairConnectionSwap,
+      PairConnectionSplat
+    };
+
+    void computeConnectedPairs(
+             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+             DenseSet<ValuePair> &CandidatePairsSet,
+             std::vector<Value *> &PairableInsts,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+             DenseMap<VPPair, unsigned> &PairConnectionTypes);
+
+    void buildDepMap(BasicBlock &BB,
+             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+             std::vector<Value *> &PairableInsts,
+             DenseSet<ValuePair> &PairableInstUsers);
+
+    void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+             DenseSet<ValuePair> &CandidatePairsSet,
+             DenseMap<ValuePair, int> &CandidatePairCostSavings,
+             std::vector<Value *> &PairableInsts,
+             DenseSet<ValuePair> &FixedOrderPairs,
+             DenseMap<VPPair, unsigned> &PairConnectionTypes,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+             DenseSet<ValuePair> &PairableInstUsers,
+             DenseMap<Value *, Value *>& ChosenPairs);
+
+    void fuseChosenPairs(BasicBlock &BB,
+             std::vector<Value *> &PairableInsts,
+             DenseMap<Value *, Value *>& ChosenPairs,
+             DenseSet<ValuePair> &FixedOrderPairs,
+             DenseMap<VPPair, unsigned> &PairConnectionTypes,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
+
+
+    bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
+
+    bool areInstsCompatible(Instruction *I, Instruction *J,
+                       bool IsSimpleLoadStore, bool NonPow2Len,
+                       int &CostSavings, int &FixedOrder);
+
+    bool trackUsesOfI(DenseSet<Value *> &Users,
+                      AliasSetTracker &WriteSet, Instruction *I,
+                      Instruction *J, bool UpdateUsers = true,
+                      DenseSet<ValuePair> *LoadMoveSetPairs = 0);
+
+  void computePairsConnectedTo(
+             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+             DenseSet<ValuePair> &CandidatePairsSet,
+             std::vector<Value *> &PairableInsts,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+             DenseMap<VPPair, unsigned> &PairConnectionTypes,
+             ValuePair P);
+
+    bool pairsConflict(ValuePair P, ValuePair Q,
+             DenseSet<ValuePair> &PairableInstUsers,
+             DenseMap<ValuePair, std::vector<ValuePair> >
+               *PairableInstUserMap = 0,
+             DenseSet<VPPair> *PairableInstUserPairSet = 0);
+
+    bool pairWillFormCycle(ValuePair P,
+             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
+             DenseSet<ValuePair> &CurrentPairs);
+
+    void pruneDAGFor(
+             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+             std::vector<Value *> &PairableInsts,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+             DenseSet<ValuePair> &PairableInstUsers,
+             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+             DenseSet<VPPair> &PairableInstUserPairSet,
+             DenseMap<Value *, Value *> &ChosenPairs,
+             DenseMap<ValuePair, size_t> &DAG,
+             DenseSet<ValuePair> &PrunedDAG, ValuePair J,
+             bool UseCycleCheck);
+
+    void buildInitialDAGFor(
+             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+             DenseSet<ValuePair> &CandidatePairsSet,
+             std::vector<Value *> &PairableInsts,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+             DenseSet<ValuePair> &PairableInstUsers,
+             DenseMap<Value *, Value *> &ChosenPairs,
+             DenseMap<ValuePair, size_t> &DAG, ValuePair J);
+
+    void findBestDAGFor(
+             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+             DenseSet<ValuePair> &CandidatePairsSet,
+             DenseMap<ValuePair, int> &CandidatePairCostSavings,
+             std::vector<Value *> &PairableInsts,
+             DenseSet<ValuePair> &FixedOrderPairs,
+             DenseMap<VPPair, unsigned> &PairConnectionTypes,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+             DenseSet<ValuePair> &PairableInstUsers,
+             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+             DenseSet<VPPair> &PairableInstUserPairSet,
+             DenseMap<Value *, Value *> &ChosenPairs,
+             DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
+             int &BestEffSize, Value *II, std::vector<Value *>&JJ,
+             bool UseCycleCheck);
+
+    Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
+                     Instruction *J, unsigned o);
+
+    void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+                     unsigned MaskOffset, unsigned NumInElem,
+                     unsigned NumInElem1, unsigned IdxOffset,
+                     std::vector<Constant*> &Mask);
+
+    Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
+                     Instruction *J);
+
+    bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
+                       unsigned o, Value *&LOp, unsigned numElemL,
+                       Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
+                       unsigned IdxOff = 0);
+
+    Value *getReplacementInput(LLVMContext& Context, Instruction *I,
+                     Instruction *J, unsigned o, bool IBeforeJ);
+
+    void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
+                     Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
+                     bool IBeforeJ);
+
+    void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+                     Instruction *J, Instruction *K,
+                     Instruction *&InsertionPt, Instruction *&K1,
+                     Instruction *&K2);
+
+    void collectPairLoadMoveSet(BasicBlock &BB,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+                     DenseSet<ValuePair> &LoadMoveSetPairs,
+                     Instruction *I);
+
+    void collectLoadMoveSet(BasicBlock &BB,
+                     std::vector<Value *> &PairableInsts,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+                     DenseSet<ValuePair> &LoadMoveSetPairs);
+
+    bool canMoveUsesOfIAfterJ(BasicBlock &BB,
+                     DenseSet<ValuePair> &LoadMoveSetPairs,
+                     Instruction *I, Instruction *J);
+
+    void moveUsesOfIAfterJ(BasicBlock &BB,
+                     DenseSet<ValuePair> &LoadMoveSetPairs,
+                     Instruction *&InsertionPt,
+                     Instruction *I, Instruction *J);
+
+    void combineMetadata(Instruction *K, const Instruction *J);
+
+    bool vectorizeBB(BasicBlock &BB) {
+      if (!DT->isReachableFromEntry(&BB)) {
+        DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
+              " in " << BB.getParent()->getName() << "\n");
+        return false;
+      }
+
+      DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
+
+      bool changed = false;
+      // Iterate a sufficient number of times to merge types of size 1 bit,
+      // then 2 bits, then 4, etc. up to half of the target vector width of the
+      // target vector register.
+      unsigned n = 1;
+      for (unsigned v = 2;
+           (TTI || v <= Config.VectorBits) &&
+           (!Config.MaxIter || n <= Config.MaxIter);
+           v *= 2, ++n) {
+        DEBUG(dbgs() << "BBV: fusing loop #" << n <<
+              " for " << BB.getName() << " in " <<
+              BB.getParent()->getName() << "...\n");
+        if (vectorizePairs(BB))
+          changed = true;
+        else
+          break;
+      }
+
+      if (changed && !Pow2LenOnly) {
+        ++n;
+        for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
+          DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
+                n << " for " << BB.getName() << " in " <<
+                BB.getParent()->getName() << "...\n");
+          if (!vectorizePairs(BB, true)) break;
+        }
+      }
+
+      DEBUG(dbgs() << "BBV: done!\n");
+      return changed;
+    }
+
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      AA = &getAnalysis<AliasAnalysis>();
+      DT = &getAnalysis<DominatorTree>();
+      SE = &getAnalysis<ScalarEvolution>();
+      TD = getAnalysisIfAvailable<DataLayout>();
+      TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>();
+
+      return vectorizeBB(BB);
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      BasicBlockPass::getAnalysisUsage(AU);
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addRequired<TargetTransformInfo>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved<DominatorTree>();
+      AU.addPreserved<ScalarEvolution>();
+      AU.setPreservesCFG();
+    }
+
+    static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
+      assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
+             "Cannot form vector from incompatible scalar types");
+      Type *STy = ElemTy->getScalarType();
+
+      unsigned numElem;
+      if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
+        numElem = VTy->getNumElements();
+      } else {
+        numElem = 1;
+      }
+
+      if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
+        numElem += VTy->getNumElements();
+      } else {
+        numElem += 1;
+      }
+
+      return VectorType::get(STy, numElem);
+    }
+
+    static inline void getInstructionTypes(Instruction *I,
+                                           Type *&T1, Type *&T2) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // For stores, it is the value type, not the pointer type that matters
+        // because the value is what will come from a vector register.
+  
+        Value *IVal = SI->getValueOperand();
+        T1 = IVal->getType();
+      } else {
+        T1 = I->getType();
+      }
+  
+      if (CastInst *CI = dyn_cast<CastInst>(I))
+        T2 = CI->getSrcTy();
+      else
+        T2 = T1;
+
+      if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+        T2 = SI->getCondition()->getType();
+      } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
+        T2 = SI->getOperand(0)->getType();
+      } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
+        T2 = CI->getOperand(0)->getType();
+      }
+    }
+
+    // Returns the weight associated with the provided value. A chain of
+    // candidate pairs has a length given by the sum of the weights of its
+    // members (one weight per pair; the weight of each member of the pair
+    // is assumed to be the same). This length is then compared to the
+    // chain-length threshold to determine if a given chain is significant
+    // enough to be vectorized. The length is also used in comparing
+    // candidate chains where longer chains are considered to be better.
+    // Note: when this function returns 0, the resulting instructions are
+    // not actually fused.
+    inline size_t getDepthFactor(Value *V) {
+      // InsertElement and ExtractElement have a depth factor of zero. This is
+      // for two reasons: First, they cannot be usefully fused. Second, because
+      // the pass generates a lot of these, they can confuse the simple metric
+      // used to compare the dags in the next iteration. Thus, giving them a
+      // weight of zero allows the pass to essentially ignore them in
+      // subsequent iterations when looking for vectorization opportunities
+      // while still tracking dependency chains that flow through those
+      // instructions.
+      if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
+        return 0;
+
+      // Give a load or store half of the required depth so that load/store
+      // pairs will vectorize.
+      if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
+        return Config.ReqChainDepth/2;
+
+      return 1;
+    }
+
+    // Returns the cost of the provided instruction using TTI.
+    // This does not handle loads and stores.
+    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
+      switch (Opcode) {
+      default: break;
+      case Instruction::GetElementPtr:
+        // We mark this instruction as zero-cost because scalar GEPs are usually
+        // lowered to the intruction addressing mode. At the moment we don't
+        // generate vector GEPs.
+        return 0;
+      case Instruction::Br:
+        return TTI->getCFInstrCost(Opcode);
+      case Instruction::PHI:
+        return 0;
+      case Instruction::Add:
+      case Instruction::FAdd:
+      case Instruction::Sub:
+      case Instruction::FSub:
+      case Instruction::Mul:
+      case Instruction::FMul:
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::FDiv:
+      case Instruction::URem:
+      case Instruction::SRem:
+      case Instruction::FRem:
+      case Instruction::Shl:
+      case Instruction::LShr:
+      case Instruction::AShr:
+      case Instruction::And:
+      case Instruction::Or:
+      case Instruction::Xor:
+        return TTI->getArithmeticInstrCost(Opcode, T1);
+      case Instruction::Select:
+      case Instruction::ICmp:
+      case Instruction::FCmp:
+        return TTI->getCmpSelInstrCost(Opcode, T1, T2);
+      case Instruction::ZExt:
+      case Instruction::SExt:
+      case Instruction::FPToUI:
+      case Instruction::FPToSI:
+      case Instruction::FPExt:
+      case Instruction::PtrToInt:
+      case Instruction::IntToPtr:
+      case Instruction::SIToFP:
+      case Instruction::UIToFP:
+      case Instruction::Trunc:
+      case Instruction::FPTrunc:
+      case Instruction::BitCast:
+      case Instruction::ShuffleVector:
+        return TTI->getCastInstrCost(Opcode, T1, T2);
+      }
+
+      return 1;
+    }
+
+    // This determines the relative offset of two loads or stores, returning
+    // true if the offset could be determined to be some constant value.
+    // For example, if OffsetInElmts == 1, then J accesses the memory directly
+    // after I; if OffsetInElmts == -1 then I accesses the memory
+    // directly after J.
+    bool getPairPtrInfo(Instruction *I, Instruction *J,
+        Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
+        unsigned &IAddressSpace, unsigned &JAddressSpace,
+        int64_t &OffsetInElmts, bool ComputeOffset = true) {
+      OffsetInElmts = 0;
+      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        LoadInst *LJ = cast<LoadInst>(J);
+        IPtr = LI->getPointerOperand();
+        JPtr = LJ->getPointerOperand();
+        IAlignment = LI->getAlignment();
+        JAlignment = LJ->getAlignment();
+        IAddressSpace = LI->getPointerAddressSpace();
+        JAddressSpace = LJ->getPointerAddressSpace();
+      } else {
+        StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
+        IPtr = SI->getPointerOperand();
+        JPtr = SJ->getPointerOperand();
+        IAlignment = SI->getAlignment();
+        JAlignment = SJ->getAlignment();
+        IAddressSpace = SI->getPointerAddressSpace();
+        JAddressSpace = SJ->getPointerAddressSpace();
+      }
+
+      if (!ComputeOffset)
+        return true;
+
+      const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
+      const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
+
+      // If this is a trivial offset, then we'll get something like
+      // 1*sizeof(type). With target data, which we need anyway, this will get
+      // constant folded into a number.
+      const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
+      if (const SCEVConstant *ConstOffSCEV =
+            dyn_cast<SCEVConstant>(OffsetSCEV)) {
+        ConstantInt *IntOff = ConstOffSCEV->getValue();
+        int64_t Offset = IntOff->getSExtValue();
+
+        Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
+        int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
+
+        Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
+        if (VTy != VTy2 && Offset < 0) {
+          int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
+          OffsetInElmts = Offset/VTy2TSS;
+          return (abs64(Offset) % VTy2TSS) == 0;
+        }
+
+        OffsetInElmts = Offset/VTyTSS;
+        return (abs64(Offset) % VTyTSS) == 0;
+      }
+
+      return false;
+    }
+
+    // Returns true if the provided CallInst represents an intrinsic that can
+    // be vectorized.
+    bool isVectorizableIntrinsic(CallInst* I) {
+      Function *F = I->getCalledFunction();
+      if (!F) return false;
+
+      Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
+      if (!IID) return false;
+
+      switch(IID) {
+      default:
+        return false;
+      case Intrinsic::sqrt:
+      case Intrinsic::powi:
+      case Intrinsic::sin:
+      case Intrinsic::cos:
+      case Intrinsic::log:
+      case Intrinsic::log2:
+      case Intrinsic::log10:
+      case Intrinsic::exp:
+      case Intrinsic::exp2:
+      case Intrinsic::pow:
+        return Config.VectorizeMath;
+      case Intrinsic::fma:
+      case Intrinsic::fmuladd:
+        return Config.VectorizeFMA;
+      }
+    }
+
+    bool isPureIEChain(InsertElementInst *IE) {
+      InsertElementInst *IENext = IE;
+      do {
+        if (!isa<UndefValue>(IENext->getOperand(0)) &&
+            !isa<InsertElementInst>(IENext->getOperand(0))) {
+          return false;
+        }
+      } while ((IENext =
+                 dyn_cast<InsertElementInst>(IENext->getOperand(0))));
+
+      return true;
+    }
+  };
+
+  // This function implements one vectorization iteration on the provided
+  // basic block. It returns true if the block is changed.
+  bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
+    bool ShouldContinue;
+    BasicBlock::iterator Start = BB.getFirstInsertionPt();
+
+    std::vector<Value *> AllPairableInsts;
+    DenseMap<Value *, Value *> AllChosenPairs;
+    DenseSet<ValuePair> AllFixedOrderPairs;
+    DenseMap<VPPair, unsigned> AllPairConnectionTypes;
+    DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
+                                                 AllConnectedPairDeps;
+
+    do {
+      std::vector<Value *> PairableInsts;
+      DenseMap<Value *, std::vector<Value *> > CandidatePairs;
+      DenseSet<ValuePair> FixedOrderPairs;
+      DenseMap<ValuePair, int> CandidatePairCostSavings;
+      ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
+                                         FixedOrderPairs,
+                                         CandidatePairCostSavings,
+                                         PairableInsts, NonPow2Len);
+      if (PairableInsts.empty()) continue;
+
+      // Build the candidate pair set for faster lookups.
+      DenseSet<ValuePair> CandidatePairsSet;
+      for (DenseMap<Value *, std::vector<Value *> >::iterator I =
+           CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
+        for (std::vector<Value *>::iterator J = I->second.begin(),
+             JE = I->second.end(); J != JE; ++J)
+          CandidatePairsSet.insert(ValuePair(I->first, *J));
+
+      // Now we have a map of all of the pairable instructions and we need to
+      // select the best possible pairing. A good pairing is one such that the
+      // users of the pair are also paired. This defines a (directed) forest
+      // over the pairs such that two pairs are connected iff the second pair
+      // uses the first.
+
+      // Note that it only matters that both members of the second pair use some
+      // element of the first pair (to allow for splatting).
+
+      DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs,
+                                                   ConnectedPairDeps;
+      DenseMap<VPPair, unsigned> PairConnectionTypes;
+      computeConnectedPairs(CandidatePairs, CandidatePairsSet,
+                            PairableInsts, ConnectedPairs, PairConnectionTypes);
+      if (ConnectedPairs.empty()) continue;
+
+      for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
+           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+           I != IE; ++I)
+        for (std::vector<ValuePair>::iterator J = I->second.begin(),
+             JE = I->second.end(); J != JE; ++J)
+          ConnectedPairDeps[*J].push_back(I->first);
+
+      // Build the pairable-instruction dependency map
+      DenseSet<ValuePair> PairableInstUsers;
+      buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
+
+      // There is now a graph of the connected pairs. For each variable, pick
+      // the pairing with the largest dag meeting the depth requirement on at
+      // least one branch. Then select all pairings that are part of that dag
+      // and remove them from the list of available pairings and pairable
+      // variables.
+
+      DenseMap<Value *, Value *> ChosenPairs;
+      choosePairs(CandidatePairs, CandidatePairsSet,
+        CandidatePairCostSavings,
+        PairableInsts, FixedOrderPairs, PairConnectionTypes,
+        ConnectedPairs, ConnectedPairDeps,
+        PairableInstUsers, ChosenPairs);
+
+      if (ChosenPairs.empty()) continue;
+      AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
+                              PairableInsts.end());
+      AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
+
+      // Only for the chosen pairs, propagate information on fixed-order pairs,
+      // pair connections, and their types to the data structures used by the
+      // pair fusion procedures.
+      for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
+           IE = ChosenPairs.end(); I != IE; ++I) {
+        if (FixedOrderPairs.count(*I))
+          AllFixedOrderPairs.insert(*I);
+        else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
+          AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
+
+        for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
+             J != IE; ++J) {
+          DenseMap<VPPair, unsigned>::iterator K =
+            PairConnectionTypes.find(VPPair(*I, *J));
+          if (K != PairConnectionTypes.end()) {
+            AllPairConnectionTypes.insert(*K);
+          } else {
+            K = PairConnectionTypes.find(VPPair(*J, *I));
+            if (K != PairConnectionTypes.end())
+              AllPairConnectionTypes.insert(*K);
+          }
+        }
+      }
+
+      for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
+           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+           I != IE; ++I)
+        for (std::vector<ValuePair>::iterator J = I->second.begin(),
+          JE = I->second.end(); J != JE; ++J)
+          if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
+            AllConnectedPairs[I->first].push_back(*J);
+            AllConnectedPairDeps[*J].push_back(I->first);
+          }
+    } while (ShouldContinue);
+
+    if (AllChosenPairs.empty()) return false;
+    NumFusedOps += AllChosenPairs.size();
+
+    // A set of pairs has now been selected. It is now necessary to replace the
+    // paired instructions with vector instructions. For this procedure each
+    // operand must be replaced with a vector operand. This vector is formed
+    // by using build_vector on the old operands. The replaced values are then
+    // replaced with a vector_extract on the result.  Subsequent optimization
+    // passes should coalesce the build/extract combinations.
+
+    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
+                    AllPairConnectionTypes,
+                    AllConnectedPairs, AllConnectedPairDeps);
+
+    // It is important to cleanup here so that future iterations of this
+    // function have less work to do.
+    (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo());
+    return true;
+  }
+
+  // This function returns true if the provided instruction is capable of being
+  // fused into a vector instruction. This determination is based only on the
+  // type and other attributes of the instruction.
+  bool BBVectorize::isInstVectorizable(Instruction *I,
+                                         bool &IsSimpleLoadStore) {
+    IsSimpleLoadStore = false;
+
+    if (CallInst *C = dyn_cast<CallInst>(I)) {
+      if (!isVectorizableIntrinsic(C))
+        return false;
+    } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+      // Vectorize simple loads if possbile:
+      IsSimpleLoadStore = L->isSimple();
+      if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
+        return false;
+    } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
+      // Vectorize simple stores if possbile:
+      IsSimpleLoadStore = S->isSimple();
+      if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
+        return false;
+    } else if (CastInst *C = dyn_cast<CastInst>(I)) {
+      // We can vectorize casts, but not casts of pointer types, etc.
+      if (!Config.VectorizeCasts)
+        return false;
+
+      Type *SrcTy = C->getSrcTy();
+      if (!SrcTy->isSingleValueType())
+        return false;
+
+      Type *DestTy = C->getDestTy();
+      if (!DestTy->isSingleValueType())
+        return false;
+    } else if (isa<SelectInst>(I)) {
+      if (!Config.VectorizeSelect)
+        return false;
+    } else if (isa<CmpInst>(I)) {
+      if (!Config.VectorizeCmp)
+        return false;
+    } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
+      if (!Config.VectorizeGEP)
+        return false;
+
+      // Currently, vector GEPs exist only with one index.
+      if (G->getNumIndices() != 1)
+        return false;
+    } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
+        isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
+      return false;
+    }
+
+    // We can't vectorize memory operations without target data
+    if (TD == 0 && IsSimpleLoadStore)
+      return false;
+
+    Type *T1, *T2;
+    getInstructionTypes(I, T1, T2);
+
+    // Not every type can be vectorized...
+    if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
+        !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
+      return false;
+
+    if (T1->getScalarSizeInBits() == 1) {
+      if (!Config.VectorizeBools)
+        return false;
+    } else {
+      if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
+        return false;
+    }
+
+    if (T2->getScalarSizeInBits() == 1) {
+      if (!Config.VectorizeBools)
+        return false;
+    } else {
+      if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
+        return false;
+    }
+
+    if (!Config.VectorizeFloats
+        && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
+      return false;
+
+    // Don't vectorize target-specific types.
+    if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
+      return false;
+    if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
+      return false;
+
+    if ((!Config.VectorizePointers || TD == 0) &&
+        (T1->getScalarType()->isPointerTy() ||
+         T2->getScalarType()->isPointerTy()))
+      return false;
+
+    if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
+                 T2->getPrimitiveSizeInBits() >= Config.VectorBits))
+      return false;
+
+    return true;
+  }
+
+  // This function returns true if the two provided instructions are compatible
+  // (meaning that they can be fused into a vector instruction). This assumes
+  // that I has already been determined to be vectorizable and that J is not
+  // in the use dag of I.
+  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
+                       bool IsSimpleLoadStore, bool NonPow2Len,
+                       int &CostSavings, int &FixedOrder) {
+    DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
+                     " <-> " << *J << "\n");
+
+    CostSavings = 0;
+    FixedOrder = 0;
+
+    // Loads and stores can be merged if they have different alignments,
+    // but are otherwise the same.
+    if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
+                      (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
+      return false;
+
+    Type *IT1, *IT2, *JT1, *JT2;
+    getInstructionTypes(I, IT1, IT2);
+    getInstructionTypes(J, JT1, JT2);
+    unsigned MaxTypeBits = std::max(
+      IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
+      IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
+    if (!TTI && MaxTypeBits > Config.VectorBits)
+      return false;
+
+    // FIXME: handle addsub-type operations!
+
+    if (IsSimpleLoadStore) {
+      Value *IPtr, *JPtr;
+      unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
+      int64_t OffsetInElmts = 0;
+      if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+            IAddressSpace, JAddressSpace,
+            OffsetInElmts) && abs64(OffsetInElmts) == 1) {
+        FixedOrder = (int) OffsetInElmts;
+        unsigned BottomAlignment = IAlignment;
+        if (OffsetInElmts < 0) BottomAlignment = JAlignment;
+
+        Type *aTypeI = isa<StoreInst>(I) ?
+          cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+        Type *aTypeJ = isa<StoreInst>(J) ?
+          cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+        Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
+
+        if (Config.AlignedOnly) {
+          // An aligned load or store is possible only if the instruction
+          // with the lower offset has an alignment suitable for the
+          // vector type.
+
+          unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
+          if (BottomAlignment < VecAlignment)
+            return false;
+        }
+
+        if (TTI) {
+          unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
+                                                IAlignment, IAddressSpace);
+          unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
+                                                JAlignment, JAddressSpace);
+          unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
+                                                BottomAlignment,
+                                                IAddressSpace);
+
+          ICost += TTI->getAddressComputationCost(aTypeI);
+          JCost += TTI->getAddressComputationCost(aTypeJ);
+          VCost += TTI->getAddressComputationCost(VType);
+
+          if (VCost > ICost + JCost)
+            return false;
+
+          // We don't want to fuse to a type that will be split, even
+          // if the two input types will also be split and there is no other
+          // associated cost.
+          unsigned VParts = TTI->getNumberOfParts(VType);
+          if (VParts > 1)
+            return false;
+          else if (!VParts && VCost == ICost + JCost)
+            return false;
+
+          CostSavings = ICost + JCost - VCost;
+        }
+      } else {
+        return false;
+      }
+    } else if (TTI) {
+      unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
+      unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
+      Type *VT1 = getVecTypeForPair(IT1, JT1),
+           *VT2 = getVecTypeForPair(IT2, JT2);
+
+      // Note that this procedure is incorrect for insert and extract element
+      // instructions (because combining these often results in a shuffle),
+      // but this cost is ignored (because insert and extract element
+      // instructions are assigned a zero depth factor and are not really
+      // fused in general).
+      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
+
+      if (VCost > ICost + JCost)
+        return false;
+
+      // We don't want to fuse to a type that will be split, even
+      // if the two input types will also be split and there is no other
+      // associated cost.
+      unsigned VParts1 = TTI->getNumberOfParts(VT1),
+               VParts2 = TTI->getNumberOfParts(VT2);
+      if (VParts1 > 1 || VParts2 > 1)
+        return false;
+      else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
+        return false;
+
+      CostSavings = ICost + JCost - VCost;
+    }
+
+    // The powi intrinsic is special because only the first argument is
+    // vectorized, the second arguments must be equal.
+    CallInst *CI = dyn_cast<CallInst>(I);
+    Function *FI;
+    if (CI && (FI = CI->getCalledFunction())) {
+      Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID();
+      if (IID == Intrinsic::powi) {
+        Value *A1I = CI->getArgOperand(1),
+              *A1J = cast<CallInst>(J)->getArgOperand(1);
+        const SCEV *A1ISCEV = SE->getSCEV(A1I),
+                   *A1JSCEV = SE->getSCEV(A1J);
+        return (A1ISCEV == A1JSCEV);
+      }
+
+      if (IID && TTI) {
+        SmallVector<Type*, 4> Tys;
+        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+          Tys.push_back(CI->getArgOperand(i)->getType());
+        unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys);
+
+        Tys.clear();
+        CallInst *CJ = cast<CallInst>(J);
+        for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
+          Tys.push_back(CJ->getArgOperand(i)->getType());
+        unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys);
+
+        Tys.clear();
+        assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
+               "Intrinsic argument counts differ");
+        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+          if (IID == Intrinsic::powi && i == 1)
+            Tys.push_back(CI->getArgOperand(i)->getType());
+          else
+            Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
+                                            CJ->getArgOperand(i)->getType()));
+        }
+
+        Type *RetTy = getVecTypeForPair(IT1, JT1);
+        unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys);
+
+        if (VCost > ICost + JCost)
+          return false;
+
+        // We don't want to fuse to a type that will be split, even
+        // if the two input types will also be split and there is no other
+        // associated cost.
+        unsigned RetParts = TTI->getNumberOfParts(RetTy);
+        if (RetParts > 1)
+          return false;
+        else if (!RetParts && VCost == ICost + JCost)
+          return false;
+
+        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+          if (!Tys[i]->isVectorTy())
+            continue;
+
+          unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
+          if (NumParts > 1)
+            return false;
+          else if (!NumParts && VCost == ICost + JCost)
+            return false;
+        }
+
+        CostSavings = ICost + JCost - VCost;
+      }
+    }
+
+    return true;
+  }
+
+  // Figure out whether or not J uses I and update the users and write-set
+  // structures associated with I. Specifically, Users represents the set of
+  // instructions that depend on I. WriteSet represents the set
+  // of memory locations that are dependent on I. If UpdateUsers is true,
+  // and J uses I, then Users is updated to contain J and WriteSet is updated
+  // to contain any memory locations to which J writes. The function returns
+  // true if J uses I. By default, alias analysis is used to determine
+  // whether J reads from memory that overlaps with a location in WriteSet.
+  // If LoadMoveSet is not null, then it is a previously-computed map
+  // where the key is the memory-based user instruction and the value is
+  // the instruction to be compared with I. So, if LoadMoveSet is provided,
+  // then the alias analysis is not used. This is necessary because this
+  // function is called during the process of moving instructions during
+  // vectorization and the results of the alias analysis are not stable during
+  // that process.
+  bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
+                       AliasSetTracker &WriteSet, Instruction *I,
+                       Instruction *J, bool UpdateUsers,
+                       DenseSet<ValuePair> *LoadMoveSetPairs) {
+    bool UsesI = false;
+
+    // This instruction may already be marked as a user due, for example, to
+    // being a member of a selected pair.
+    if (Users.count(J))
+      UsesI = true;
+
+    if (!UsesI)
+      for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
+           JU != JE; ++JU) {
+        Value *V = *JU;
+        if (I == V || Users.count(V)) {
+          UsesI = true;
+          break;
+        }
+      }
+    if (!UsesI && J->mayReadFromMemory()) {
+      if (LoadMoveSetPairs) {
+        UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
+      } else {
+        for (AliasSetTracker::iterator W = WriteSet.begin(),
+             WE = WriteSet.end(); W != WE; ++W) {
+          if (W->aliasesUnknownInst(J, *AA)) {
+            UsesI = true;
+            break;
+          }
+        }
+      }
+    }
+
+    if (UsesI && UpdateUsers) {
+      if (J->mayWriteToMemory()) WriteSet.add(J);
+      Users.insert(J);
+    }
+
+    return UsesI;
+  }
+
+  // This function iterates over all instruction pairs in the provided
+  // basic block and collects all candidate pairs for vectorization.
+  bool BBVectorize::getCandidatePairs(BasicBlock &BB,
+                       BasicBlock::iterator &Start,
+                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+                       DenseSet<ValuePair> &FixedOrderPairs,
+                       DenseMap<ValuePair, int> &CandidatePairCostSavings,
+                       std::vector<Value *> &PairableInsts, bool NonPow2Len) {
+    size_t TotalPairs = 0;
+    BasicBlock::iterator E = BB.end();
+    if (Start == E) return false;
+
+    bool ShouldContinue = false, IAfterStart = false;
+    for (BasicBlock::iterator I = Start++; I != E; ++I) {
+      if (I == Start) IAfterStart = true;
+
+      bool IsSimpleLoadStore;
+      if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
+
+      // Look for an instruction with which to pair instruction *I...
+      DenseSet<Value *> Users;
+      AliasSetTracker WriteSet(*AA);
+      bool JAfterStart = IAfterStart;
+      BasicBlock::iterator J = llvm::next(I);
+      for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
+        if (J == Start) JAfterStart = true;
+
+        // Determine if J uses I, if so, exit the loop.
+        bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep);
+        if (Config.FastDep) {
+          // Note: For this heuristic to be effective, independent operations
+          // must tend to be intermixed. This is likely to be true from some
+          // kinds of grouped loop unrolling (but not the generic LLVM pass),
+          // but otherwise may require some kind of reordering pass.
+
+          // When using fast dependency analysis,
+          // stop searching after first use:
+          if (UsesI) break;
+        } else {
+          if (UsesI) continue;
+        }
+
+        // J does not use I, and comes before the first use of I, so it can be
+        // merged with I if the instructions are compatible.
+        int CostSavings, FixedOrder;
+        if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
+            CostSavings, FixedOrder)) continue;
+
+        // J is a candidate for merging with I.
+        if (!PairableInsts.size() ||
+             PairableInsts[PairableInsts.size()-1] != I) {
+          PairableInsts.push_back(I);
+        }
+
+        CandidatePairs[I].push_back(J);
+        ++TotalPairs;
+        if (TTI)
+          CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
+                                                            CostSavings));
+
+        if (FixedOrder == 1)
+          FixedOrderPairs.insert(ValuePair(I, J));
+        else if (FixedOrder == -1)
+          FixedOrderPairs.insert(ValuePair(J, I));
+
+        // The next call to this function must start after the last instruction
+        // selected during this invocation.
+        if (JAfterStart) {
+          Start = llvm::next(J);
+          IAfterStart = JAfterStart = false;
+        }
+
+        DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
+                     << *I << " <-> " << *J << " (cost savings: " <<
+                     CostSavings << ")\n");
+
+        // If we have already found too many pairs, break here and this function
+        // will be called again starting after the last instruction selected
+        // during this invocation.
+        if (PairableInsts.size() >= Config.MaxInsts ||
+            TotalPairs >= Config.MaxPairs) {
+          ShouldContinue = true;
+          break;
+        }
+      }
+
+      if (ShouldContinue)
+        break;
+    }
+
+    DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
+           << " instructions with candidate pairs\n");
+
+    return ShouldContinue;
+  }
+
+  // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
+  // it looks for pairs such that both members have an input which is an
+  // output of PI or PJ.
+  void BBVectorize::computePairsConnectedTo(
+                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+                  DenseSet<ValuePair> &CandidatePairsSet,
+                  std::vector<Value *> &PairableInsts,
+                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+                  DenseMap<VPPair, unsigned> &PairConnectionTypes,
+                  ValuePair P) {
+    StoreInst *SI, *SJ;
+
+    // For each possible pairing for this variable, look at the uses of
+    // the first value...
+    for (Value::use_iterator I = P.first->use_begin(),
+         E = P.first->use_end(); I != E; ++I) {
+      if (isa<LoadInst>(*I)) {
+        // A pair cannot be connected to a load because the load only takes one
+        // operand (the address) and it is a scalar even after vectorization.
+        continue;
+      } else if ((SI = dyn_cast<StoreInst>(*I)) &&
+                 P.first == SI->getPointerOperand()) {
+        // Similarly, a pair cannot be connected to a store through its
+        // pointer operand.
+        continue;
+      }
+
+      // For each use of the first variable, look for uses of the second
+      // variable...
+      for (Value::use_iterator J = P.second->use_begin(),
+           E2 = P.second->use_end(); J != E2; ++J) {
+        if ((SJ = dyn_cast<StoreInst>(*J)) &&
+            P.second == SJ->getPointerOperand())
+          continue;
+
+        // Look for <I, J>:
+        if (CandidatePairsSet.count(ValuePair(*I, *J))) {
+          VPPair VP(P, ValuePair(*I, *J));
+          ConnectedPairs[VP.first].push_back(VP.second);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
+        }
+
+        // Look for <J, I>:
+        if (CandidatePairsSet.count(ValuePair(*J, *I))) {
+          VPPair VP(P, ValuePair(*J, *I));
+          ConnectedPairs[VP.first].push_back(VP.second);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
+        }
+      }
+
+      if (Config.SplatBreaksChain) continue;
+      // Look for cases where just the first value in the pair is used by
+      // both members of another pair (splatting).
+      for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
+        if ((SJ = dyn_cast<StoreInst>(*J)) &&
+            P.first == SJ->getPointerOperand())
+          continue;
+
+        if (CandidatePairsSet.count(ValuePair(*I, *J))) {
+          VPPair VP(P, ValuePair(*I, *J));
+          ConnectedPairs[VP.first].push_back(VP.second);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+        }
+      }
+    }
+
+    if (Config.SplatBreaksChain) return;
+    // Look for cases where just the second value in the pair is used by
+    // both members of another pair (splatting).
+    for (Value::use_iterator I = P.second->use_begin(),
+         E = P.second->use_end(); I != E; ++I) {
+      if (isa<LoadInst>(*I))
+        continue;
+      else if ((SI = dyn_cast<StoreInst>(*I)) &&
+               P.second == SI->getPointerOperand())
+        continue;
+
+      for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
+        if ((SJ = dyn_cast<StoreInst>(*J)) &&
+            P.second == SJ->getPointerOperand())
+          continue;
+
+        if (CandidatePairsSet.count(ValuePair(*I, *J))) {
+          VPPair VP(P, ValuePair(*I, *J));
+          ConnectedPairs[VP.first].push_back(VP.second);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+        }
+      }
+    }
+  }
+
+  // This function figures out which pairs are connected.  Two pairs are
+  // connected if some output of the first pair forms an input to both members
+  // of the second pair.
+  void BBVectorize::computeConnectedPairs(
+                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+                  DenseSet<ValuePair> &CandidatePairsSet,
+                  std::vector<Value *> &PairableInsts,
+                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+                  DenseMap<VPPair, unsigned> &PairConnectionTypes) {
+    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+         PE = PairableInsts.end(); PI != PE; ++PI) {
+      DenseMap<Value *, std::vector<Value *> >::iterator PP =
+        CandidatePairs.find(*PI);
+      if (PP == CandidatePairs.end())
+        continue;
+
+      for (std::vector<Value *>::iterator P = PP->second.begin(),
+           E = PP->second.end(); P != E; ++P)
+        computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
+                                PairableInsts, ConnectedPairs,
+                                PairConnectionTypes, ValuePair(*PI, *P));
+    }
+
+    DEBUG(size_t TotalPairs = 0;
+          for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
+               ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
+            TotalPairs += I->second.size();
+          dbgs() << "BBV: found " << TotalPairs
+                 << " pair connections.\n");
+  }
+
+  // This function builds a set of use tuples such that <A, B> is in the set
+  // if B is in the use dag of A. If B is in the use dag of A, then B
+  // depends on the output of A.
+  void BBVectorize::buildDepMap(
+                      BasicBlock &BB,
+                      DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      DenseSet<ValuePair> &PairableInstUsers) {
+    DenseSet<Value *> IsInPair;
+    for (DenseMap<Value *, std::vector<Value *> >::iterator C =
+         CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
+      IsInPair.insert(C->first);
+      IsInPair.insert(C->second.begin(), C->second.end());
+    }
+
+    // Iterate through the basic block, recording all users of each
+    // pairable instruction.
+
+    BasicBlock::iterator E = BB.end(), EL =
+      BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
+    for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
+      if (IsInPair.find(I) == IsInPair.end()) continue;
+
+      DenseSet<Value *> Users;
+      AliasSetTracker WriteSet(*AA);
+      for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) {
+        (void) trackUsesOfI(Users, WriteSet, I, J);
+
+        if (J == EL)
+          break;
+      }
+
+      for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
+           U != E; ++U) {
+        if (IsInPair.find(*U) == IsInPair.end()) continue;
+        PairableInstUsers.insert(ValuePair(I, *U));
+      }
+
+      if (I == EL)
+        break;
+    }
+  }
+
+  // Returns true if an input to pair P is an output of pair Q and also an
+  // input of pair Q is an output of pair P. If this is the case, then these
+  // two pairs cannot be simultaneously fused.
+  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
+             DenseSet<ValuePair> &PairableInstUsers,
+             DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
+             DenseSet<VPPair> *PairableInstUserPairSet) {
+    // Two pairs are in conflict if they are mutual Users of eachother.
+    bool QUsesP = PairableInstUsers.count(ValuePair(P.first,  Q.first))  ||
+                  PairableInstUsers.count(ValuePair(P.first,  Q.second)) ||
+                  PairableInstUsers.count(ValuePair(P.second, Q.first))  ||
+                  PairableInstUsers.count(ValuePair(P.second, Q.second));
+    bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first,  P.first))  ||
+                  PairableInstUsers.count(ValuePair(Q.first,  P.second)) ||
+                  PairableInstUsers.count(ValuePair(Q.second, P.first))  ||
+                  PairableInstUsers.count(ValuePair(Q.second, P.second));
+    if (PairableInstUserMap) {
+      // FIXME: The expensive part of the cycle check is not so much the cycle
+      // check itself but this edge insertion procedure. This needs some
+      // profiling and probably a different data structure.
+      if (PUsesQ) {
+        if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
+          (*PairableInstUserMap)[Q].push_back(P);
+      }
+      if (QUsesP) {
+        if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
+          (*PairableInstUserMap)[P].push_back(Q);
+      }
+    }
+
+    return (QUsesP && PUsesQ);
+  }
+
+  // This function walks the use graph of current pairs to see if, starting
+  // from P, the walk returns to P.
+  bool BBVectorize::pairWillFormCycle(ValuePair P,
+             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+             DenseSet<ValuePair> &CurrentPairs) {
+    DEBUG(if (DebugCycleCheck)
+            dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
+                   << *P.second << "\n");
+    // A lookup table of visisted pairs is kept because the PairableInstUserMap
+    // contains non-direct associations.
+    DenseSet<ValuePair> Visited;
+    SmallVector<ValuePair, 32> Q;
+    // General depth-first post-order traversal:
+    Q.push_back(P);
+    do {
+      ValuePair QTop = Q.pop_back_val();
+      Visited.insert(QTop);
+
+      DEBUG(if (DebugCycleCheck)
+              dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
+                     << *QTop.second << "\n");
+      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
+        PairableInstUserMap.find(QTop);
+      if (QQ == PairableInstUserMap.end())
+        continue;
+
+      for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
+           CE = QQ->second.end(); C != CE; ++C) {
+        if (*C == P) {
+          DEBUG(dbgs()
+                 << "BBV: rejected to prevent non-trivial cycle formation: "
+                 << QTop.first << " <-> " << C->second << "\n");
+          return true;
+        }
+
+        if (CurrentPairs.count(*C) && !Visited.count(*C))
+          Q.push_back(*C);
+      }
+    } while (!Q.empty());
+
+    return false;
+  }
+
+  // This function builds the initial dag of connected pairs with the
+  // pair J at the root.
+  void BBVectorize::buildInitialDAGFor(
+                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+                  DenseSet<ValuePair> &CandidatePairsSet,
+                  std::vector<Value *> &PairableInsts,
+                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+                  DenseSet<ValuePair> &PairableInstUsers,
+                  DenseMap<Value *, Value *> &ChosenPairs,
+                  DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
+    // Each of these pairs is viewed as the root node of a DAG. The DAG
+    // is then walked (depth-first). As this happens, we keep track of
+    // the pairs that compose the DAG and the maximum depth of the DAG.
+    SmallVector<ValuePairWithDepth, 32> Q;
+    // General depth-first post-order traversal:
+    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+    do {
+      ValuePairWithDepth QTop = Q.back();
+
+      // Push each child onto the queue:
+      bool MoreChildren = false;
+      size_t MaxChildDepth = QTop.second;
+      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
+        ConnectedPairs.find(QTop.first);
+      if (QQ != ConnectedPairs.end())
+        for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
+             ke = QQ->second.end(); k != ke; ++k) {
+          // Make sure that this child pair is still a candidate:
+          if (CandidatePairsSet.count(*k)) {
+            DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k);
+            if (C == DAG.end()) {
+              size_t d = getDepthFactor(k->first);
+              Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
+              MoreChildren = true;
+            } else {
+              MaxChildDepth = std::max(MaxChildDepth, C->second);
+            }
+          }
+        }
+
+      if (!MoreChildren) {
+        // Record the current pair as part of the DAG:
+        DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
+        Q.pop_back();
+      }
+    } while (!Q.empty());
+  }
+
+  // Given some initial dag, prune it by removing conflicting pairs (pairs
+  // that cannot be simultaneously chosen for vectorization).
+  void BBVectorize::pruneDAGFor(
+              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+              std::vector<Value *> &PairableInsts,
+              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+              DenseSet<ValuePair> &PairableInstUsers,
+              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+              DenseSet<VPPair> &PairableInstUserPairSet,
+              DenseMap<Value *, Value *> &ChosenPairs,
+              DenseMap<ValuePair, size_t> &DAG,
+              DenseSet<ValuePair> &PrunedDAG, ValuePair J,
+              bool UseCycleCheck) {
+    SmallVector<ValuePairWithDepth, 32> Q;
+    // General depth-first post-order traversal:
+    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+    do {
+      ValuePairWithDepth QTop = Q.pop_back_val();
+      PrunedDAG.insert(QTop.first);
+
+      // Visit each child, pruning as necessary...
+      SmallVector<ValuePairWithDepth, 8> BestChildren;
+      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
+        ConnectedPairs.find(QTop.first);
+      if (QQ == ConnectedPairs.end())
+        continue;
+
+      for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
+           KE = QQ->second.end(); K != KE; ++K) {
+        DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K);
+        if (C == DAG.end()) continue;
+
+        // This child is in the DAG, now we need to make sure it is the
+        // best of any conflicting children. There could be multiple
+        // conflicting children, so first, determine if we're keeping
+        // this child, then delete conflicting children as necessary.
+
+        // It is also necessary to guard against pairing-induced
+        // dependencies. Consider instructions a .. x .. y .. b
+        // such that (a,b) are to be fused and (x,y) are to be fused
+        // but a is an input to x and b is an output from y. This
+        // means that y cannot be moved after b but x must be moved
+        // after b for (a,b) to be fused. In other words, after
+        // fusing (a,b) we have y .. a/b .. x where y is an input
+        // to a/b and x is an output to a/b: x and y can no longer
+        // be legally fused. To prevent this condition, we must
+        // make sure that a child pair added to the DAG is not
+        // both an input and output of an already-selected pair.
+
+        // Pairing-induced dependencies can also form from more complicated
+        // cycles. The pair vs. pair conflicts are easy to check, and so
+        // that is done explicitly for "fast rejection", and because for
+        // child vs. child conflicts, we may prefer to keep the current
+        // pair in preference to the already-selected child.
+        DenseSet<ValuePair> CurrentPairs;
+
+        bool CanAdd = true;
+        for (SmallVector<ValuePairWithDepth, 8>::iterator C2
+              = BestChildren.begin(), E2 = BestChildren.end();
+             C2 != E2; ++C2) {
+          if (C2->first.first == C->first.first ||
+              C2->first.first == C->first.second ||
+              C2->first.second == C->first.first ||
+              C2->first.second == C->first.second ||
+              pairsConflict(C2->first, C->first, PairableInstUsers,
+                            UseCycleCheck ? &PairableInstUserMap : 0,
+                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+            if (C2->second >= C->second) {
+              CanAdd = false;
+              break;
+            }
+
+            CurrentPairs.insert(C2->first);
+          }
+        }
+        if (!CanAdd) continue;
+
+        // Even worse, this child could conflict with another node already
+        // selected for the DAG. If that is the case, ignore this child.
+        for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
+             E2 = PrunedDAG.end(); T != E2; ++T) {
+          if (T->first == C->first.first ||
+              T->first == C->first.second ||
+              T->second == C->first.first ||
+              T->second == C->first.second ||
+              pairsConflict(*T, C->first, PairableInstUsers,
+                            UseCycleCheck ? &PairableInstUserMap : 0,
+                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+            CanAdd = false;
+            break;
+          }
+
+          CurrentPairs.insert(*T);
+        }
+        if (!CanAdd) continue;
+
+        // And check the queue too...
+        for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(),
+             E2 = Q.end(); C2 != E2; ++C2) {
+          if (C2->first.first == C->first.first ||
+              C2->first.first == C->first.second ||
+              C2->first.second == C->first.first ||
+              C2->first.second == C->first.second ||
+              pairsConflict(C2->first, C->first, PairableInstUsers,
+                            UseCycleCheck ? &PairableInstUserMap : 0,
+                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+            CanAdd = false;
+            break;
+          }
+
+          CurrentPairs.insert(C2->first);
+        }
+        if (!CanAdd) continue;
+
+        // Last but not least, check for a conflict with any of the
+        // already-chosen pairs.
+        for (DenseMap<Value *, Value *>::iterator C2 =
+              ChosenPairs.begin(), E2 = ChosenPairs.end();
+             C2 != E2; ++C2) {
+          if (pairsConflict(*C2, C->first, PairableInstUsers,
+                            UseCycleCheck ? &PairableInstUserMap : 0,
+                            UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+            CanAdd = false;
+            break;
+          }
+
+          CurrentPairs.insert(*C2);
+        }
+        if (!CanAdd) continue;
+
+        // To check for non-trivial cycles formed by the addition of the
+        // current pair we've formed a list of all relevant pairs, now use a
+        // graph walk to check for a cycle. We start from the current pair and
+        // walk the use dag to see if we again reach the current pair. If we
+        // do, then the current pair is rejected.
+
+        // FIXME: It may be more efficient to use a topological-ordering
+        // algorithm to improve the cycle check. This should be investigated.
+        if (UseCycleCheck &&
+            pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
+          continue;
+
+        // This child can be added, but we may have chosen it in preference
+        // to an already-selected child. Check for this here, and if a
+        // conflict is found, then remove the previously-selected child
+        // before adding this one in its place.
+        for (SmallVector<ValuePairWithDepth, 8>::iterator C2
+              = BestChildren.begin(); C2 != BestChildren.end();) {
+          if (C2->first.first == C->first.first ||
+              C2->first.first == C->first.second ||
+              C2->first.second == C->first.first ||
+              C2->first.second == C->first.second ||
+              pairsConflict(C2->first, C->first, PairableInstUsers))
+            C2 = BestChildren.erase(C2);
+          else
+            ++C2;
+        }
+
+        BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
+      }
+
+      for (SmallVector<ValuePairWithDepth, 8>::iterator C
+            = BestChildren.begin(), E2 = BestChildren.end();
+           C != E2; ++C) {
+        size_t DepthF = getDepthFactor(C->first.first);
+        Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
+      }
+    } while (!Q.empty());
+  }
+
+  // This function finds the best dag of mututally-compatible connected
+  // pairs, given the choice of root pairs as an iterator range.
+  void BBVectorize::findBestDAGFor(
+              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+              DenseSet<ValuePair> &CandidatePairsSet,
+              DenseMap<ValuePair, int> &CandidatePairCostSavings,
+              std::vector<Value *> &PairableInsts,
+              DenseSet<ValuePair> &FixedOrderPairs,
+              DenseMap<VPPair, unsigned> &PairConnectionTypes,
+              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+              DenseSet<ValuePair> &PairableInstUsers,
+              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
+              DenseSet<VPPair> &PairableInstUserPairSet,
+              DenseMap<Value *, Value *> &ChosenPairs,
+              DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
+              int &BestEffSize, Value *II, std::vector<Value *>&JJ,
+              bool UseCycleCheck) {
+    for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
+         J != JE; ++J) {
+      ValuePair IJ(II, *J);
+      if (!CandidatePairsSet.count(IJ))
+        continue;
+
+      // Before going any further, make sure that this pair does not
+      // conflict with any already-selected pairs (see comment below
+      // near the DAG pruning for more details).
+      DenseSet<ValuePair> ChosenPairSet;
+      bool DoesConflict = false;
+      for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
+           E = ChosenPairs.end(); C != E; ++C) {
+        if (pairsConflict(*C, IJ, PairableInstUsers,
+                          UseCycleCheck ? &PairableInstUserMap : 0,
+                          UseCycleCheck ? &PairableInstUserPairSet : 0)) {
+          DoesConflict = true;
+          break;
+        }
+
+        ChosenPairSet.insert(*C);
+      }
+      if (DoesConflict) continue;
+
+      if (UseCycleCheck &&
+          pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
+        continue;
+
+      DenseMap<ValuePair, size_t> DAG;
+      buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
+                          PairableInsts, ConnectedPairs,
+                          PairableInstUsers, ChosenPairs, DAG, IJ);
+
+      // Because we'll keep the child with the largest depth, the largest
+      // depth is still the same in the unpruned DAG.
+      size_t MaxDepth = DAG.lookup(IJ);
+
+      DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
+                   << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
+                   MaxDepth << " and size " << DAG.size() << "\n");
+
+      // At this point the DAG has been constructed, but, may contain
+      // contradictory children (meaning that different children of
+      // some dag node may be attempting to fuse the same instruction).
+      // So now we walk the dag again, in the case of a conflict,
+      // keep only the child with the largest depth. To break a tie,
+      // favor the first child.
+
+      DenseSet<ValuePair> PrunedDAG;
+      pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
+                   PairableInstUsers, PairableInstUserMap,
+                   PairableInstUserPairSet,
+                   ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
+
+      int EffSize = 0;
+      if (TTI) {
+        DenseSet<Value *> PrunedDAGInstrs;
+        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
+             E = PrunedDAG.end(); S != E; ++S) {
+          PrunedDAGInstrs.insert(S->first);
+          PrunedDAGInstrs.insert(S->second);
+        }
+
+        // The set of pairs that have already contributed to the total cost.
+        DenseSet<ValuePair> IncomingPairs;
+
+        // If the cost model were perfect, this might not be necessary; but we
+        // need to make sure that we don't get stuck vectorizing our own
+        // shuffle chains.
+        bool HasNontrivialInsts = false;
+
+        // The node weights represent the cost savings associated with
+        // fusing the pair of instructions.
+        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
+             E = PrunedDAG.end(); S != E; ++S) {
+          if (!isa<ShuffleVectorInst>(S->first) &&
+              !isa<InsertElementInst>(S->first) &&
+              !isa<ExtractElementInst>(S->first))
+            HasNontrivialInsts = true;
+
+          bool FlipOrder = false;
+
+          if (getDepthFactor(S->first)) {
+            int ESContrib = CandidatePairCostSavings.find(*S)->second;
+            DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
+                   << *S->first << " <-> " << *S->second << "} = " <<
+                   ESContrib << "\n");
+            EffSize += ESContrib;
+          }
+
+          // The edge weights contribute in a negative sense: they represent
+          // the cost of shuffles.
+          DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS =
+            ConnectedPairDeps.find(*S);
+          if (SS != ConnectedPairDeps.end()) {
+            unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+            for (std::vector<ValuePair>::iterator T = SS->second.begin(),
+                 TE = SS->second.end(); T != TE; ++T) {
+              VPPair Q(*S, *T);
+              if (!PrunedDAG.count(Q.second))
+                continue;
+              DenseMap<VPPair, unsigned>::iterator R =
+                PairConnectionTypes.find(VPPair(Q.second, Q.first));
+              assert(R != PairConnectionTypes.end() &&
+                     "Cannot find pair connection type");
+              if (R->second == PairConnectionDirect)
+                ++NumDepsDirect;
+              else if (R->second == PairConnectionSwap)
+                ++NumDepsSwap;
+            }
+
+            // If there are more swaps than direct connections, then
+            // the pair order will be flipped during fusion. So the real
+            // number of swaps is the minimum number.
+            FlipOrder = !FixedOrderPairs.count(*S) &&
+              ((NumDepsSwap > NumDepsDirect) ||
+                FixedOrderPairs.count(ValuePair(S->second, S->first)));
+
+            for (std::vector<ValuePair>::iterator T = SS->second.begin(),
+                 TE = SS->second.end(); T != TE; ++T) {
+              VPPair Q(*S, *T);
+              if (!PrunedDAG.count(Q.second))
+                continue;
+              DenseMap<VPPair, unsigned>::iterator R =
+                PairConnectionTypes.find(VPPair(Q.second, Q.first));
+              assert(R != PairConnectionTypes.end() &&
+                     "Cannot find pair connection type");
+              Type *Ty1 = Q.second.first->getType(),
+                   *Ty2 = Q.second.second->getType();
+              Type *VTy = getVecTypeForPair(Ty1, Ty2);
+              if ((R->second == PairConnectionDirect && FlipOrder) ||
+                  (R->second == PairConnectionSwap && !FlipOrder)  ||
+                  R->second == PairConnectionSplat) {
+                int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                                   VTy, VTy);
+
+                if (VTy->getVectorNumElements() == 2) {
+                  if (R->second == PairConnectionSplat)
+                    ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+                      TargetTransformInfo::SK_Broadcast, VTy));
+                  else
+                    ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+                      TargetTransformInfo::SK_Reverse, VTy));
+                }
+
+                DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+                  *Q.second.first << " <-> " << *Q.second.second <<
+                    "} -> {" <<
+                  *S->first << " <-> " << *S->second << "} = " <<
+                   ESContrib << "\n");
+                EffSize -= ESContrib;
+              }
+            }
+          }
+
+          // Compute the cost of outgoing edges. We assume that edges outgoing
+          // to shuffles, inserts or extracts can be merged, and so contribute
+          // no additional cost.
+          if (!S->first->getType()->isVoidTy()) {
+            Type *Ty1 = S->first->getType(),
+                 *Ty2 = S->second->getType();
+            Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+            bool NeedsExtraction = false;
+            for (Value::use_iterator I = S->first->use_begin(),
+                 IE = S->first->use_end(); I != IE; ++I) {
+              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+                // Shuffle can be folded if it has no other input
+                if (isa<UndefValue>(SI->getOperand(1)))
+                  continue;
+              }
+              if (isa<ExtractElementInst>(*I))
+                continue;
+              if (PrunedDAGInstrs.count(*I))
+                continue;
+              NeedsExtraction = true;
+              break;
+            }
+
+            if (NeedsExtraction) {
+              int ESContrib;
+              if (Ty1->isVectorTy()) {
+                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                               Ty1, VTy);
+                ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+                  TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
+              } else
+                ESContrib = (int) TTI->getVectorInstrCost(
+                                    Instruction::ExtractElement, VTy, 0);
+
+              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+                *S->first << "} = " << ESContrib << "\n");
+              EffSize -= ESContrib;
+            }
+
+            NeedsExtraction = false;
+            for (Value::use_iterator I = S->second->use_begin(),
+                 IE = S->second->use_end(); I != IE; ++I) {
+              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+                // Shuffle can be folded if it has no other input
+                if (isa<UndefValue>(SI->getOperand(1)))
+                  continue;
+              }
+              if (isa<ExtractElementInst>(*I))
+                continue;
+              if (PrunedDAGInstrs.count(*I))
+                continue;
+              NeedsExtraction = true;
+              break;
+            }
+
+            if (NeedsExtraction) {
+              int ESContrib;
+              if (Ty2->isVectorTy()) {
+                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                               Ty2, VTy);
+                ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+                  TargetTransformInfo::SK_ExtractSubvector, VTy,
+                  Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
+              } else
+                ESContrib = (int) TTI->getVectorInstrCost(
+                                    Instruction::ExtractElement, VTy, 1);
+              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+                *S->second << "} = " << ESContrib << "\n");
+              EffSize -= ESContrib;
+            }
+          }
+
+          // Compute the cost of incoming edges.
+          if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
+            Instruction *S1 = cast<Instruction>(S->first),
+                        *S2 = cast<Instruction>(S->second);
+            for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
+              Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
+
+              // Combining constants into vector constants (or small vector
+              // constants into larger ones are assumed free).
+              if (isa<Constant>(O1) && isa<Constant>(O2))
+                continue;
+
+              if (FlipOrder)
+                std::swap(O1, O2);
+
+              ValuePair VP  = ValuePair(O1, O2);
+              ValuePair VPR = ValuePair(O2, O1);
+
+              // Internal edges are not handled here.
+              if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
+                continue;
+
+              Type *Ty1 = O1->getType(),
+                   *Ty2 = O2->getType();
+              Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+              // Combining vector operations of the same type is also assumed
+              // folded with other operations.
+              if (Ty1 == Ty2) {
+                // If both are insert elements, then both can be widened.
+                InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
+                                  *IEO2 = dyn_cast<InsertElementInst>(O2);
+                if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
+                  continue;
+                // If both are extract elements, and both have the same input
+                // type, then they can be replaced with a shuffle
+                ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
+                                   *EIO2 = dyn_cast<ExtractElementInst>(O2);
+                if (EIO1 && EIO2 &&
+                    EIO1->getOperand(0)->getType() ==
+                      EIO2->getOperand(0)->getType())
+                  continue;
+                // If both are a shuffle with equal operand types and only two
+                // unqiue operands, then they can be replaced with a single
+                // shuffle
+                ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
+                                  *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
+                if (SIO1 && SIO2 &&
+                    SIO1->getOperand(0)->getType() ==
+                      SIO2->getOperand(0)->getType()) {
+                  SmallSet<Value *, 4> SIOps;
+                  SIOps.insert(SIO1->getOperand(0));
+                  SIOps.insert(SIO1->getOperand(1));
+                  SIOps.insert(SIO2->getOperand(0));
+                  SIOps.insert(SIO2->getOperand(1));
+                  if (SIOps.size() <= 2)
+                    continue;
+                }
+              }
+
+              int ESContrib;
+              // This pair has already been formed.
+              if (IncomingPairs.count(VP)) {
+                continue;
+              } else if (IncomingPairs.count(VPR)) {
+                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                               VTy, VTy);
+
+                if (VTy->getVectorNumElements() == 2)
+                  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
+                    TargetTransformInfo::SK_Reverse, VTy));
+              } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
+                ESContrib = (int) TTI->getVectorInstrCost(
+                                    Instruction::InsertElement, VTy, 0);
+                ESContrib += (int) TTI->getVectorInstrCost(
+                                     Instruction::InsertElement, VTy, 1);
+              } else if (!Ty1->isVectorTy()) {
+                // O1 needs to be inserted into a vector of size O2, and then
+                // both need to be shuffled together.
+                ESContrib = (int) TTI->getVectorInstrCost(
+                                    Instruction::InsertElement, Ty2, 0);
+                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+                                                VTy, Ty2);
+              } else if (!Ty2->isVectorTy()) {
+                // O2 needs to be inserted into a vector of size O1, and then
+                // both need to be shuffled together.
+                ESContrib = (int) TTI->getVectorInstrCost(
+                                    Instruction::InsertElement, Ty1, 0);
+                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+                                                VTy, Ty1);
+              } else {
+                Type *TyBig = Ty1, *TySmall = Ty2;
+                if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
+                  std::swap(TyBig, TySmall);
+
+                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+                                               VTy, TyBig);
+                if (TyBig != TySmall)
+                  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+                                                  TyBig, TySmall);
+              }
+
+              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
+                     << *O1 << " <-> " << *O2 << "} = " <<
+                     ESContrib << "\n");
+              EffSize -= ESContrib;
+              IncomingPairs.insert(VP);
+            }
+          }
+        }
+
+        if (!HasNontrivialInsts) {
+          DEBUG(if (DebugPairSelection) dbgs() <<
+                "\tNo non-trivial instructions in DAG;"
+                " override to zero effective size\n");
+          EffSize = 0;
+        }
+      } else {
+        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
+             E = PrunedDAG.end(); S != E; ++S)
+          EffSize += (int) getDepthFactor(S->first);
+      }
+
+      DEBUG(if (DebugPairSelection)
+             dbgs() << "BBV: found pruned DAG for pair {"
+             << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
+             MaxDepth << " and size " << PrunedDAG.size() <<
+            " (effective size: " << EffSize << ")\n");
+      if (((TTI && !UseChainDepthWithTI) ||
+            MaxDepth >= Config.ReqChainDepth) &&
+          EffSize > 0 && EffSize > BestEffSize) {
+        BestMaxDepth = MaxDepth;
+        BestEffSize = EffSize;
+        BestDAG = PrunedDAG;
+      }
+    }
+  }
+
+  // Given the list of candidate pairs, this function selects those
+  // that will be fused into vector instructions.
+  void BBVectorize::choosePairs(
+                DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
+                DenseSet<ValuePair> &CandidatePairsSet,
+                DenseMap<ValuePair, int> &CandidatePairCostSavings,
+                std::vector<Value *> &PairableInsts,
+                DenseSet<ValuePair> &FixedOrderPairs,
+                DenseMap<VPPair, unsigned> &PairConnectionTypes,
+                DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+                DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
+                DenseSet<ValuePair> &PairableInstUsers,
+                DenseMap<Value *, Value *>& ChosenPairs) {
+    bool UseCycleCheck =
+     CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
+
+    DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
+    for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
+         E = CandidatePairsSet.end(); I != E; ++I) {
+      std::vector<Value *> &JJ = CandidatePairs2[I->second];
+      if (JJ.empty()) JJ.reserve(32);
+      JJ.push_back(I->first);
+    }
+
+    DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
+    DenseSet<VPPair> PairableInstUserPairSet;
+    for (std::vector<Value *>::iterator I = PairableInsts.begin(),
+         E = PairableInsts.end(); I != E; ++I) {
+      // The number of possible pairings for this variable:
+      size_t NumChoices = CandidatePairs.lookup(*I).size();
+      if (!NumChoices) continue;
+
+      std::vector<Value *> &JJ = CandidatePairs[*I];
+
+      // The best pair to choose and its dag:
+      size_t BestMaxDepth = 0;
+      int BestEffSize = 0;
+      DenseSet<ValuePair> BestDAG;
+      findBestDAGFor(CandidatePairs, CandidatePairsSet,
+                      CandidatePairCostSavings,
+                      PairableInsts, FixedOrderPairs, PairConnectionTypes,
+                      ConnectedPairs, ConnectedPairDeps,
+                      PairableInstUsers, PairableInstUserMap,
+                      PairableInstUserPairSet, ChosenPairs,
+                      BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
+                      UseCycleCheck);
+
+      if (BestDAG.empty())
+        continue;
+
+      // A dag has been chosen (or not) at this point. If no dag was
+      // chosen, then this instruction, I, cannot be paired (and is no longer
+      // considered).
+
+      DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
+                   << *cast<Instruction>(*I) << "\n");
+
+      for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
+           SE2 = BestDAG.end(); S != SE2; ++S) {
+        // Insert the members of this dag into the list of chosen pairs.
+        ChosenPairs.insert(ValuePair(S->first, S->second));
+        DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
+               *S->second << "\n");
+
+        // Remove all candidate pairs that have values in the chosen dag.
+        std::vector<Value *> &KK = CandidatePairs[S->first];
+        for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
+             K != KE; ++K) {
+          if (*K == S->second)
+            continue;
+
+          CandidatePairsSet.erase(ValuePair(S->first, *K));
+        }
+
+        std::vector<Value *> &LL = CandidatePairs2[S->second];
+        for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
+             L != LE; ++L) {
+          if (*L == S->first)
+            continue;
+
+          CandidatePairsSet.erase(ValuePair(*L, S->second));
+        }
+
+        std::vector<Value *> &MM = CandidatePairs[S->second];
+        for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
+             M != ME; ++M) {
+          assert(*M != S->first && "Flipped pair in candidate list?");
+          CandidatePairsSet.erase(ValuePair(S->second, *M));
+        }
+
+        std::vector<Value *> &NN = CandidatePairs2[S->first];
+        for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
+             N != NE; ++N) {
+          assert(*N != S->second && "Flipped pair in candidate list?");
+          CandidatePairsSet.erase(ValuePair(*N, S->first));
+        }
+      }
+    }
+
+    DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
+  }
+
+  std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
+                     unsigned n = 0) {
+    if (!I->hasName())
+      return "";
+
+    return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
+             (n > 0 ? "." + utostr(n) : "")).str();
+  }
+
+  // Returns the value that is to be used as the pointer input to the vector
+  // instruction that fuses I with J.
+  Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
+                     Instruction *I, Instruction *J, unsigned o) {
+    Value *IPtr, *JPtr;
+    unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
+    int64_t OffsetInElmts;
+
+    // Note: the analysis might fail here, that is why the pair order has
+    // been precomputed (OffsetInElmts must be unused here).
+    (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+                          IAddressSpace, JAddressSpace,
+                          OffsetInElmts, false);
+
+    // The pointer value is taken to be the one with the lowest offset.
+    Value *VPtr = IPtr;
+
+    Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
+    Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
+    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+    Type *VArgPtrType = PointerType::get(VArgType,
+      cast<PointerType>(IPtr->getType())->getAddressSpace());
+    return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
+                        /* insert before */ I);
+  }
+
+  void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+                     unsigned MaskOffset, unsigned NumInElem,
+                     unsigned NumInElem1, unsigned IdxOffset,
+                     std::vector<Constant*> &Mask) {
+    unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements();
+    for (unsigned v = 0; v < NumElem1; ++v) {
+      int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
+      if (m < 0) {
+        Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
+      } else {
+        unsigned mm = m + (int) IdxOffset;
+        if (m >= (int) NumInElem1)
+          mm += (int) NumInElem;
+
+        Mask[v+MaskOffset] =
+          ConstantInt::get(Type::getInt32Ty(Context), mm);
+      }
+    }
+  }
+
+  // Returns the value that is to be used as the vector-shuffle mask to the
+  // vector instruction that fuses I with J.
+  Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
+                     Instruction *I, Instruction *J) {
+    // This is the shuffle mask. We need to append the second
+    // mask to the first, and the numbers need to be adjusted.
+
+    Type *ArgTypeI = I->getType();
+    Type *ArgTypeJ = J->getType();
+    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+
+    unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements();
+
+    // Get the total number of elements in the fused vector type.
+    // By definition, this must equal the number of elements in
+    // the final mask.
+    unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
+    std::vector<Constant*> Mask(NumElem);
+
+    Type *OpTypeI = I->getOperand(0)->getType();
+    unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements();
+    Type *OpTypeJ = J->getOperand(0)->getType();
+    unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements();
+
+    // The fused vector will be:
+    // -----------------------------------------------------
+    // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
+    // -----------------------------------------------------
+    // from which we'll extract NumElem total elements (where the first NumElemI
+    // of them come from the mask in I and the remainder come from the mask
+    // in J.
+
+    // For the mask from the first pair...
+    fillNewShuffleMask(Context, I, 0,        NumInElemJ, NumInElemI,
+                       0,          Mask);
+
+    // For the mask from the second pair...
+    fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
+                       NumInElemI, Mask);
+
+    return ConstantVector::get(Mask);
+  }
+
+  bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
+                                  Instruction *J, unsigned o, Value *&LOp,
+                                  unsigned numElemL,
+                                  Type *ArgTypeL, Type *ArgTypeH,
+                                  bool IBeforeJ, unsigned IdxOff) {
+    bool ExpandedIEChain = false;
+    if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
+      // If we have a pure insertelement chain, then this can be rewritten
+      // into a chain that directly builds the larger type.
+      if (isPureIEChain(LIE)) {
+        SmallVector<Value *, 8> VectElemts(numElemL,
+          UndefValue::get(ArgTypeL->getScalarType()));
+        InsertElementInst *LIENext = LIE;
+        do {
+          unsigned Idx =
+            cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
+          VectElemts[Idx] = LIENext->getOperand(1);
+        } while ((LIENext =
+                   dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
+
+        LIENext = 0;
+        Value *LIEPrev = UndefValue::get(ArgTypeH);
+        for (unsigned i = 0; i < numElemL; ++i) {
+          if (isa<UndefValue>(VectElemts[i])) continue;
+          LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
+                             ConstantInt::get(Type::getInt32Ty(Context),
+                                              i + IdxOff),
+                             getReplacementName(IBeforeJ ? I : J,
+                                                true, o, i+1));
+          LIENext->insertBefore(IBeforeJ ? J : I);
+          LIEPrev = LIENext;
+        }
+
+        LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
+        ExpandedIEChain = true;
+      }
+    }
+
+    return ExpandedIEChain;
+  }
+
+  // Returns the value to be used as the specified operand of the vector
+  // instruction that fuses I with J.
+  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
+                     Instruction *J, unsigned o, bool IBeforeJ) {
+    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
+
+    // Compute the fused vector type for this operand
+    Type *ArgTypeI = I->getOperand(o)->getType();
+    Type *ArgTypeJ = J->getOperand(o)->getType();
+    VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+
+    Instruction *L = I, *H = J;
+    Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
+
+    unsigned numElemL;
+    if (ArgTypeL->isVectorTy())
+      numElemL = cast<VectorType>(ArgTypeL)->getNumElements();
+    else
+      numElemL = 1;
+
+    unsigned numElemH;
+    if (ArgTypeH->isVectorTy())
+      numElemH = cast<VectorType>(ArgTypeH)->getNumElements();
+    else
+      numElemH = 1;
+
+    Value *LOp = L->getOperand(o);
+    Value *HOp = H->getOperand(o);
+    unsigned numElem = VArgType->getNumElements();
+
+    // First, we check if we can reuse the "original" vector outputs (if these
+    // exist). We might need a shuffle.
+    ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
+    ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
+    ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
+    ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
+
+    // FIXME: If we're fusing shuffle instructions, then we can't apply this
+    // optimization. The input vectors to the shuffle might be a different
+    // length from the shuffle outputs. Unfortunately, the replacement
+    // shuffle mask has already been formed, and the mask entries are sensitive
+    // to the sizes of the inputs.
+    bool IsSizeChangeShuffle =
+      isa<ShuffleVectorInst>(L) &&
+        (LOp->getType() != L->getType() || HOp->getType() != H->getType());
+
+    if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
+      // We can have at most two unique vector inputs.
+      bool CanUseInputs = true;
+      Value *I1, *I2 = 0;
+      if (LEE) {
+        I1 = LEE->getOperand(0);
+      } else {
+        I1 = LSV->getOperand(0);
+        I2 = LSV->getOperand(1);
+        if (I2 == I1 || isa<UndefValue>(I2))
+          I2 = 0;
+      }
+  
+      if (HEE) {
+        Value *I3 = HEE->getOperand(0);
+        if (!I2 && I3 != I1)
+          I2 = I3;
+        else if (I3 != I1 && I3 != I2)
+          CanUseInputs = false;
+      } else {
+        Value *I3 = HSV->getOperand(0);
+        if (!I2 && I3 != I1)
+          I2 = I3;
+        else if (I3 != I1 && I3 != I2)
+          CanUseInputs = false;
+
+        if (CanUseInputs) {
+          Value *I4 = HSV->getOperand(1);
+          if (!isa<UndefValue>(I4)) {
+            if (!I2 && I4 != I1)
+              I2 = I4;
+            else if (I4 != I1 && I4 != I2)
+              CanUseInputs = false;
+          }
+        }
+      }
+
+      if (CanUseInputs) {
+        unsigned LOpElem =
+          cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType())
+            ->getNumElements();
+        unsigned HOpElem =
+          cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType())
+            ->getNumElements();
+
+        // We have one or two input vectors. We need to map each index of the
+        // operands to the index of the original vector.
+        SmallVector<std::pair<int, int>, 8>  II(numElem);
+        for (unsigned i = 0; i < numElemL; ++i) {
+          int Idx, INum;
+          if (LEE) {
+            Idx =
+              cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
+            INum = LEE->getOperand(0) == I1 ? 0 : 1;
+          } else {
+            Idx = LSV->getMaskValue(i);
+            if (Idx < (int) LOpElem) {
+              INum = LSV->getOperand(0) == I1 ? 0 : 1;
+            } else {
+              Idx -= LOpElem;
+              INum = LSV->getOperand(1) == I1 ? 0 : 1;
+            }
+          }
+
+          II[i] = std::pair<int, int>(Idx, INum);
+        }
+        for (unsigned i = 0; i < numElemH; ++i) {
+          int Idx, INum;
+          if (HEE) {
+            Idx =
+              cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
+            INum = HEE->getOperand(0) == I1 ? 0 : 1;
+          } else {
+            Idx = HSV->getMaskValue(i);
+            if (Idx < (int) HOpElem) {
+              INum = HSV->getOperand(0) == I1 ? 0 : 1;
+            } else {
+              Idx -= HOpElem;
+              INum = HSV->getOperand(1) == I1 ? 0 : 1;
+            }
+          }
+
+          II[i + numElemL] = std::pair<int, int>(Idx, INum);
+        }
+
+        // We now have an array which tells us from which index of which
+        // input vector each element of the operand comes.
+        VectorType *I1T = cast<VectorType>(I1->getType());
+        unsigned I1Elem = I1T->getNumElements();
+
+        if (!I2) {
+          // In this case there is only one underlying vector input. Check for
+          // the trivial case where we can use the input directly.
+          if (I1Elem == numElem) {
+            bool ElemInOrder = true;
+            for (unsigned i = 0; i < numElem; ++i) {
+              if (II[i].first != (int) i && II[i].first != -1) {
+                ElemInOrder = false;
+                break;
+              }
+            }
+
+            if (ElemInOrder)
+              return I1;
+          }
+
+          // A shuffle is needed.
+          std::vector<Constant *> Mask(numElem);
+          for (unsigned i = 0; i < numElem; ++i) {
+            int Idx = II[i].first;
+            if (Idx == -1)
+              Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
+            else
+              Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+          }
+
+          Instruction *S =
+            new ShuffleVectorInst(I1, UndefValue::get(I1T),
+                                  ConstantVector::get(Mask),
+                                  getReplacementName(IBeforeJ ? I : J,
+                                                     true, o));
+          S->insertBefore(IBeforeJ ? J : I);
+          return S;
+        }
+
+        VectorType *I2T = cast<VectorType>(I2->getType());
+        unsigned I2Elem = I2T->getNumElements();
+
+        // This input comes from two distinct vectors. The first step is to
+        // make sure that both vectors are the same length. If not, the
+        // smaller one will need to grow before they can be shuffled together.
+        if (I1Elem < I2Elem) {
+          std::vector<Constant *> Mask(I2Elem);
+          unsigned v = 0;
+          for (; v < I1Elem; ++v)
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          for (; v < I2Elem; ++v)
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+          Instruction *NewI1 =
+            new ShuffleVectorInst(I1, UndefValue::get(I1T),
+                                  ConstantVector::get(Mask),
+                                  getReplacementName(IBeforeJ ? I : J,
+                                                     true, o, 1));
+          NewI1->insertBefore(IBeforeJ ? J : I);
+          I1 = NewI1;
+          I1T = I2T;
+          I1Elem = I2Elem;
+        } else if (I1Elem > I2Elem) {
+          std::vector<Constant *> Mask(I1Elem);
+          unsigned v = 0;
+          for (; v < I2Elem; ++v)
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          for (; v < I1Elem; ++v)
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+
+          Instruction *NewI2 =
+            new ShuffleVectorInst(I2, UndefValue::get(I2T),
+                                  ConstantVector::get(Mask),
+                                  getReplacementName(IBeforeJ ? I : J,
+                                                     true, o, 1));
+          NewI2->insertBefore(IBeforeJ ? J : I);
+          I2 = NewI2;
+          I2T = I1T;
+          I2Elem = I1Elem;
+        }
+
+        // Now that both I1 and I2 are the same length we can shuffle them
+        // together (and use the result).
+        std::vector<Constant *> Mask(numElem);
+        for (unsigned v = 0; v < numElem; ++v) {
+          if (II[v].first == -1) {
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+          } else {
+            int Idx = II[v].first + II[v].second * I1Elem;
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+          }
+        }
+
+        Instruction *NewOp =
+          new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
+                                getReplacementName(IBeforeJ ? I : J, true, o));
+        NewOp->insertBefore(IBeforeJ ? J : I);
+        return NewOp;
+      }
+    }
+
+    Type *ArgType = ArgTypeL;
+    if (numElemL < numElemH) {
+      if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
+                                         ArgTypeL, VArgType, IBeforeJ, 1)) {
+        // This is another short-circuit case: we're combining a scalar into
+        // a vector that is formed by an IE chain. We've just expanded the IE
+        // chain, now insert the scalar and we're done.
+
+        Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
+                           getReplacementName(IBeforeJ ? I : J, true, o));
+        S->insertBefore(IBeforeJ ? J : I);
+        return S;
+      } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
+                                ArgTypeH, IBeforeJ)) {
+        // The two vector inputs to the shuffle must be the same length,
+        // so extend the smaller vector to be the same length as the larger one.
+        Instruction *NLOp;
+        if (numElemL > 1) {
+  
+          std::vector<Constant *> Mask(numElemH);
+          unsigned v = 0;
+          for (; v < numElemL; ++v)
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          for (; v < numElemH; ++v)
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+    
+          NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
+                                       ConstantVector::get(Mask),
+                                       getReplacementName(IBeforeJ ? I : J,
+                                                          true, o, 1));
+        } else {
+          NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
+                                           getReplacementName(IBeforeJ ? I : J,
+                                                              true, o, 1));
+        }
+  
+        NLOp->insertBefore(IBeforeJ ? J : I);
+        LOp = NLOp;
+      }
+
+      ArgType = ArgTypeH;
+    } else if (numElemL > numElemH) {
+      if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
+                                         ArgTypeH, VArgType, IBeforeJ)) {
+        Instruction *S =
+          InsertElementInst::Create(LOp, HOp, 
+                                    ConstantInt::get(Type::getInt32Ty(Context),
+                                                     numElemL),
+                                    getReplacementName(IBeforeJ ? I : J,
+                                                       true, o));
+        S->insertBefore(IBeforeJ ? J : I);
+        return S;
+      } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
+                                ArgTypeL, IBeforeJ)) {
+        Instruction *NHOp;
+        if (numElemH > 1) {
+          std::vector<Constant *> Mask(numElemL);
+          unsigned v = 0;
+          for (; v < numElemH; ++v)
+            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          for (; v < numElemL; ++v)
+            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
+    
+          NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
+                                       ConstantVector::get(Mask),
+                                       getReplacementName(IBeforeJ ? I : J,
+                                                          true, o, 1));
+        } else {
+          NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
+                                           getReplacementName(IBeforeJ ? I : J,
+                                                              true, o, 1));
+        }
+  
+        NHOp->insertBefore(IBeforeJ ? J : I);
+        HOp = NHOp;
+      }
+    }
+
+    if (ArgType->isVectorTy()) {
+      unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+      std::vector<Constant*> Mask(numElem);
+      for (unsigned v = 0; v < numElem; ++v) {
+        unsigned Idx = v;
+        // If the low vector was expanded, we need to skip the extra
+        // undefined entries.
+        if (v >= numElemL && numElemH > numElemL)
+          Idx += (numElemH - numElemL);
+        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
+      }
+
+      Instruction *BV = new ShuffleVectorInst(LOp, HOp,
+                          ConstantVector::get(Mask),
+                          getReplacementName(IBeforeJ ? I : J, true, o));
+      BV->insertBefore(IBeforeJ ? J : I);
+      return BV;
+    }
+
+    Instruction *BV1 = InsertElementInst::Create(
+                                          UndefValue::get(VArgType), LOp, CV0,
+                                          getReplacementName(IBeforeJ ? I : J,
+                                                             true, o, 1));
+    BV1->insertBefore(IBeforeJ ? J : I);
+    Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
+                                          getReplacementName(IBeforeJ ? I : J,
+                                                             true, o, 2));
+    BV2->insertBefore(IBeforeJ ? J : I);
+    return BV2;
+  }
+
+  // This function creates an array of values that will be used as the inputs
+  // to the vector instruction that fuses I with J.
+  void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
+                     Instruction *I, Instruction *J,
+                     SmallVector<Value *, 3> &ReplacedOperands,
+                     bool IBeforeJ) {
+    unsigned NumOperands = I->getNumOperands();
+
+    for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
+      // Iterate backward so that we look at the store pointer
+      // first and know whether or not we need to flip the inputs.
+
+      if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
+        // This is the pointer for a load/store instruction.
+        ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
+        continue;
+      } else if (isa<CallInst>(I)) {
+        Function *F = cast<CallInst>(I)->getCalledFunction();
+        Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID();
+        if (o == NumOperands-1) {
+          BasicBlock &BB = *I->getParent();
+
+          Module *M = BB.getParent()->getParent();
+          Type *ArgTypeI = I->getType();
+          Type *ArgTypeJ = J->getType();
+          Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
+
+          ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
+          continue;
+        } else if (IID == Intrinsic::powi && o == 1) {
+          // The second argument of powi is a single integer and we've already
+          // checked that both arguments are equal. As a result, we just keep
+          // I's second argument.
+          ReplacedOperands[o] = I->getOperand(o);
+          continue;
+        }
+      } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
+        ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
+        continue;
+      }
+
+      ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
+    }
+  }
+
+  // This function creates two values that represent the outputs of the
+  // original I and J instructions. These are generally vector shuffles
+  // or extracts. In many cases, these will end up being unused and, thus,
+  // eliminated by later passes.
+  void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+                     Instruction *J, Instruction *K,
+                     Instruction *&InsertionPt,
+                     Instruction *&K1, Instruction *&K2) {
+    if (isa<StoreInst>(I)) {
+      AA->replaceWithNewValue(I, K);
+      AA->replaceWithNewValue(J, K);
+    } else {
+      Type *IType = I->getType();
+      Type *JType = J->getType();
+
+      VectorType *VType = getVecTypeForPair(IType, JType);
+      unsigned numElem = VType->getNumElements();
+
+      unsigned numElemI, numElemJ;
+      if (IType->isVectorTy())
+        numElemI = cast<VectorType>(IType)->getNumElements();
+      else
+        numElemI = 1;
+
+      if (JType->isVectorTy())
+        numElemJ = cast<VectorType>(JType)->getNumElements();
+      else
+        numElemJ = 1;
+
+      if (IType->isVectorTy()) {
+        std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
+        for (unsigned v = 0; v < numElemI; ++v) {
+          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v);
+        }
+
+        K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                   ConstantVector::get( Mask1),
+                                   getReplacementName(K, false, 1));
+      } else {
+        Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+        K1 = ExtractElementInst::Create(K, CV0,
+                                          getReplacementName(K, false, 1));
+      }
+
+      if (JType->isVectorTy()) {
+        std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ);
+        for (unsigned v = 0; v < numElemJ; ++v) {
+          Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+          Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v);
+        }
+
+        K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                   ConstantVector::get( Mask2),
+                                   getReplacementName(K, false, 2));
+      } else {
+        Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
+        K2 = ExtractElementInst::Create(K, CV1,
+                                          getReplacementName(K, false, 2));
+      }
+
+      K1->insertAfter(K);
+      K2->insertAfter(K1);
+      InsertionPt = K2;
+    }
+  }
+
+  // Move all uses of the function I (including pairing-induced uses) after J.
+  bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
+                     DenseSet<ValuePair> &LoadMoveSetPairs,
+                     Instruction *I, Instruction *J) {
+    // Skip to the first instruction past I.
+    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+
+    DenseSet<Value *> Users;
+    AliasSetTracker WriteSet(*AA);
+    for (; cast<Instruction>(L) != J; ++L)
+      (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
+
+    assert(cast<Instruction>(L) == J &&
+      "Tracking has not proceeded far enough to check for dependencies");
+    // If J is now in the use set of I, then trackUsesOfI will return true
+    // and we have a dependency cycle (and the fusing operation must abort).
+    return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
+  }
+
+  // Move all uses of the function I (including pairing-induced uses) after J.
+  void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
+                     DenseSet<ValuePair> &LoadMoveSetPairs,
+                     Instruction *&InsertionPt,
+                     Instruction *I, Instruction *J) {
+    // Skip to the first instruction past I.
+    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+
+    DenseSet<Value *> Users;
+    AliasSetTracker WriteSet(*AA);
+    for (; cast<Instruction>(L) != J;) {
+      if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
+        // Move this instruction
+        Instruction *InstToMove = L; ++L;
+
+        DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
+                        " to after " << *InsertionPt << "\n");
+        InstToMove->removeFromParent();
+        InstToMove->insertAfter(InsertionPt);
+        InsertionPt = InstToMove;
+      } else {
+        ++L;
+      }
+    }
+  }
+
+  // Collect all load instruction that are in the move set of a given first
+  // pair member.  These loads depend on the first instruction, I, and so need
+  // to be moved after J (the second instruction) when the pair is fused.
+  void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+                     DenseSet<ValuePair> &LoadMoveSetPairs,
+                     Instruction *I) {
+    // Skip to the first instruction past I.
+    BasicBlock::iterator L = llvm::next(BasicBlock::iterator(I));
+
+    DenseSet<Value *> Users;
+    AliasSetTracker WriteSet(*AA);
+
+    // Note: We cannot end the loop when we reach J because J could be moved
+    // farther down the use chain by another instruction pairing. Also, J
+    // could be before I if this is an inverted input.
+    for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
+      if (trackUsesOfI(Users, WriteSet, I, L)) {
+        if (L->mayReadFromMemory()) {
+          LoadMoveSet[L].push_back(I);
+          LoadMoveSetPairs.insert(ValuePair(L, I));
+        }
+      }
+    }
+  }
+
+  // In cases where both load/stores and the computation of their pointers
+  // are chosen for vectorization, we can end up in a situation where the
+  // aliasing analysis starts returning different query results as the
+  // process of fusing instruction pairs continues. Because the algorithm
+  // relies on finding the same use dags here as were found earlier, we'll
+  // need to precompute the necessary aliasing information here and then
+  // manually update it during the fusion process.
+  void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
+                     std::vector<Value *> &PairableInsts,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
+                     DenseSet<ValuePair> &LoadMoveSetPairs) {
+    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+         PIE = PairableInsts.end(); PI != PIE; ++PI) {
+      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
+      if (P == ChosenPairs.end()) continue;
+
+      Instruction *I = cast<Instruction>(P->first);
+      collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
+                             LoadMoveSetPairs, I);
+    }
+  }
+
+  // When the first instruction in each pair is cloned, it will inherit its
+  // parent's metadata. This metadata must be combined with that of the other
+  // instruction in a safe way.
+  void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) {
+    SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata;
+    K->getAllMetadataOtherThanDebugLoc(Metadata);
+    for (unsigned i = 0, n = Metadata.size(); i < n; ++i) {
+      unsigned Kind = Metadata[i].first;
+      MDNode *JMD = J->getMetadata(Kind);
+      MDNode *KMD = Metadata[i].second;
+
+      switch (Kind) {
+      default:
+        K->setMetadata(Kind, 0); // Remove unknown metadata
+        break;
+      case LLVMContext::MD_tbaa:
+        K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
+        break;
+      case LLVMContext::MD_fpmath:
+        K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
+        break;
+      }
+    }
+  }
+
+  // This function fuses the chosen instruction pairs into vector instructions,
+  // taking care preserve any needed scalar outputs and, then, it reorders the
+  // remaining instructions as needed (users of the first member of the pair
+  // need to be moved to after the location of the second member of the pair
+  // because the vector instruction is inserted in the location of the pair's
+  // second member).
+  void BBVectorize::fuseChosenPairs(BasicBlock &BB,
+             std::vector<Value *> &PairableInsts,
+             DenseMap<Value *, Value *> &ChosenPairs,
+             DenseSet<ValuePair> &FixedOrderPairs,
+             DenseMap<VPPair, unsigned> &PairConnectionTypes,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
+             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
+    LLVMContext& Context = BB.getContext();
+
+    // During the vectorization process, the order of the pairs to be fused
+    // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
+    // list. After a pair is fused, the flipped pair is removed from the list.
+    DenseSet<ValuePair> FlippedPairs;
+    for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
+         E = ChosenPairs.end(); P != E; ++P)
+      FlippedPairs.insert(ValuePair(P->second, P->first));
+    for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
+         E = FlippedPairs.end(); P != E; ++P)
+      ChosenPairs.insert(*P);
+
+    DenseMap<Value *, std::vector<Value *> > LoadMoveSet;
+    DenseSet<ValuePair> LoadMoveSetPairs;
+    collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
+                       LoadMoveSet, LoadMoveSetPairs);
+
+    DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
+
+    for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
+      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
+      if (P == ChosenPairs.end()) {
+        ++PI;
+        continue;
+      }
+
+      if (getDepthFactor(P->first) == 0) {
+        // These instructions are not really fused, but are tracked as though
+        // they are. Any case in which it would be interesting to fuse them
+        // will be taken care of by InstCombine.
+        --NumFusedOps;
+        ++PI;
+        continue;
+      }
+
+      Instruction *I = cast<Instruction>(P->first),
+        *J = cast<Instruction>(P->second);
+
+      DEBUG(dbgs() << "BBV: fusing: " << *I <<
+             " <-> " << *J << "\n");
+
+      // Remove the pair and flipped pair from the list.
+      DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
+      assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
+      ChosenPairs.erase(FP);
+      ChosenPairs.erase(P);
+
+      if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
+        DEBUG(dbgs() << "BBV: fusion of: " << *I <<
+               " <-> " << *J <<
+               " aborted because of non-trivial dependency cycle\n");
+        --NumFusedOps;
+        ++PI;
+        continue;
+      }
+
+      // If the pair must have the other order, then flip it.
+      bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
+      if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
+        // This pair does not have a fixed order, and so we might want to
+        // flip it if that will yield fewer shuffles. We count the number
+        // of dependencies connected via swaps, and those directly connected,
+        // and flip the order if the number of swaps is greater.
+        bool OrigOrder = true;
+        DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ =
+          ConnectedPairDeps.find(ValuePair(I, J));
+        if (IJ == ConnectedPairDeps.end()) {
+          IJ = ConnectedPairDeps.find(ValuePair(J, I));
+          OrigOrder = false;
+        }
+
+        if (IJ != ConnectedPairDeps.end()) {
+          unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+          for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
+               TE = IJ->second.end(); T != TE; ++T) {
+            VPPair Q(IJ->first, *T);
+            DenseMap<VPPair, unsigned>::iterator R =
+              PairConnectionTypes.find(VPPair(Q.second, Q.first));
+            assert(R != PairConnectionTypes.end() &&
+                   "Cannot find pair connection type");
+            if (R->second == PairConnectionDirect)
+              ++NumDepsDirect;
+            else if (R->second == PairConnectionSwap)
+              ++NumDepsSwap;
+          }
+
+          if (!OrigOrder)
+            std::swap(NumDepsDirect, NumDepsSwap);
+
+          if (NumDepsSwap > NumDepsDirect) {
+            FlipPairOrder = true;
+            DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
+                            " <-> " << *J << "\n");
+          }
+        }
+      }
+
+      Instruction *L = I, *H = J;
+      if (FlipPairOrder)
+        std::swap(H, L);
+
+      // If the pair being fused uses the opposite order from that in the pair
+      // connection map, then we need to flip the types.
+      DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL =
+        ConnectedPairs.find(ValuePair(H, L));
+      if (HL != ConnectedPairs.end())
+        for (std::vector<ValuePair>::iterator T = HL->second.begin(),
+             TE = HL->second.end(); T != TE; ++T) {
+          VPPair Q(HL->first, *T);
+          DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
+          assert(R != PairConnectionTypes.end() &&
+                 "Cannot find pair connection type");
+          if (R->second == PairConnectionDirect)
+            R->second = PairConnectionSwap;
+          else if (R->second == PairConnectionSwap)
+            R->second = PairConnectionDirect;
+        }
+
+      bool LBeforeH = !FlipPairOrder;
+      unsigned NumOperands = I->getNumOperands();
+      SmallVector<Value *, 3> ReplacedOperands(NumOperands);
+      getReplacementInputsForPair(Context, L, H, ReplacedOperands,
+                                  LBeforeH);
+
+      // Make a copy of the original operation, change its type to the vector
+      // type and replace its operands with the vector operands.
+      Instruction *K = L->clone();
+      if (L->hasName())
+        K->takeName(L);
+      else if (H->hasName())
+        K->takeName(H);
+
+      if (!isa<StoreInst>(K))
+        K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
+
+      combineMetadata(K, H);
+      K->intersectOptionalDataWith(H);
+
+      for (unsigned o = 0; o < NumOperands; ++o)
+        K->setOperand(o, ReplacedOperands[o]);
+
+      K->insertAfter(J);
+
+      // Instruction insertion point:
+      Instruction *InsertionPt = K;
+      Instruction *K1 = 0, *K2 = 0;
+      replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
+
+      // The use dag of the first original instruction must be moved to after
+      // the location of the second instruction. The entire use dag of the
+      // first instruction is disjoint from the input dag of the second
+      // (by definition), and so commutes with it.
+
+      moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
+
+      if (!isa<StoreInst>(I)) {
+        L->replaceAllUsesWith(K1);
+        H->replaceAllUsesWith(K2);
+        AA->replaceWithNewValue(L, K1);
+        AA->replaceWithNewValue(H, K2);
+      }
+
+      // Instructions that may read from memory may be in the load move set.
+      // Once an instruction is fused, we no longer need its move set, and so
+      // the values of the map never need to be updated. However, when a load
+      // is fused, we need to merge the entries from both instructions in the
+      // pair in case those instructions were in the move set of some other
+      // yet-to-be-fused pair. The loads in question are the keys of the map.
+      if (I->mayReadFromMemory()) {
+        std::vector<ValuePair> NewSetMembers;
+        DenseMap<Value *, std::vector<Value *> >::iterator II =
+          LoadMoveSet.find(I);
+        if (II != LoadMoveSet.end())
+          for (std::vector<Value *>::iterator N = II->second.begin(),
+               NE = II->second.end(); N != NE; ++N)
+            NewSetMembers.push_back(ValuePair(K, *N));
+        DenseMap<Value *, std::vector<Value *> >::iterator JJ =
+          LoadMoveSet.find(J);
+        if (JJ != LoadMoveSet.end())
+          for (std::vector<Value *>::iterator N = JJ->second.begin(),
+               NE = JJ->second.end(); N != NE; ++N)
+            NewSetMembers.push_back(ValuePair(K, *N));
+        for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
+             AE = NewSetMembers.end(); A != AE; ++A) {
+          LoadMoveSet[A->first].push_back(A->second);
+          LoadMoveSetPairs.insert(*A);
+        }
+      }
+
+      // Before removing I, set the iterator to the next instruction.
+      PI = llvm::next(BasicBlock::iterator(I));
+      if (cast<Instruction>(PI) == J)
+        ++PI;
+
+      SE->forgetValue(I);
+      SE->forgetValue(J);
+      I->eraseFromParent();
+      J->eraseFromParent();
+
+      DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
+                                               BB << "\n");
+    }
+
+    DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
+  }
+}
+
+char BBVectorize::ID = 0;
+static const char bb_vectorize_name[] = "Basic-Block Vectorization";
+INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
+
+BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
+  return new BBVectorize(C);
+}
+
+bool
+llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
+  BBVectorize BBVectorizer(P, C);
+  return BBVectorizer.vectorizeBB(BB);
+}
+
+//===----------------------------------------------------------------------===//
+VectorizeConfig::VectorizeConfig() {
+  VectorBits = ::VectorBits;
+  VectorizeBools = !::NoBools;
+  VectorizeInts = !::NoInts;
+  VectorizeFloats = !::NoFloats;
+  VectorizePointers = !::NoPointers;
+  VectorizeCasts = !::NoCasts;
+  VectorizeMath = !::NoMath;
+  VectorizeFMA = !::NoFMA;
+  VectorizeSelect = !::NoSelect;
+  VectorizeCmp = !::NoCmp;
+  VectorizeGEP = !::NoGEP;
+  VectorizeMemOps = !::NoMemOps;
+  AlignedOnly = ::AlignedOnly;
+  ReqChainDepth= ::ReqChainDepth;
+  SearchLimit = ::SearchLimit;
+  MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
+  SplatBreaksChain = ::SplatBreaksChain;
+  MaxInsts = ::MaxInsts;
+  MaxPairs = ::MaxPairs;
+  MaxIter = ::MaxIter;
+  Pow2LenOnly = ::Pow2LenOnly;
+  NoMemOpBoost = ::NoMemOpBoost;
+  FastDep = ::FastDep;
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
new file mode 100644
index 000000000000..acf2b819b813
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -0,0 +1,3504 @@
+//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR. Legalization of the IR is done
+// in the codegen. However, the vectorizer uses (will use) the codegen
+// interfaces to generate IR that is likely to result in an optimal binary.
+//
+// The loop vectorizer combines consecutive loop iterations into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
+// This pass has three parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+//    of the vectorization.
+// 3. InnerLoopVectorizer - A unit that performs the actual
+//    widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+//    of vectorization. It decides on the optimal vector width, which
+//    can be one, if vectorization is not profitable.
+//
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// Other ideas/concepts are from:
+//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
+//  Vectorizing Compilers.
+//
+//===----------------------------------------------------------------------===//
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <map>
+
+using namespace llvm;
+
+static cl::opt<unsigned>
+VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
+                    cl::desc("Sets the SIMD width. Zero is autoselect."));
+
+static cl::opt<unsigned>
+VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
+                    cl::desc("Sets the vectorization unroll count. "
+                             "Zero is autoselect."));
+
+static cl::opt<bool>
+EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+                   cl::desc("Enable if-conversion during vectorization."));
+
+/// We don't vectorize loops with a known constant trip count below this number.
+static cl::opt<unsigned>
+TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
+                             cl::Hidden,
+                             cl::desc("Don't vectorize loops with a constant "
+                                      "trip count that is smaller than this "
+                                      "value."));
+
+/// We don't unroll loops with a known constant trip count below this number.
+static const unsigned TinyTripCountUnrollThreshold = 128;
+
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+static const unsigned RuntimeMemoryCheckThreshold = 4;
+
+/// We use a metadata with this name  to indicate that a scalar loop was
+/// vectorized and that we don't need to re-vectorize it if we run into it
+/// again.
+static const char*
+AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";
+
+namespace {
+
+// Forward declarations.
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+///   counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+///   instructions.
+/// InnerLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The InnerLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found to a given vectorization factor.
+class InnerLoopVectorizer {
+public:
+  InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
+                      DominatorTree *DT, DataLayout *DL,
+                      const TargetLibraryInfo *TLI, unsigned VecWidth,
+                      unsigned UnrollFactor)
+      : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
+        VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
+        OldInduction(0), WidenMap(UnrollFactor) {}
+
+  // Perform the actual loop widening (vectorization).
+  void vectorize(LoopVectorizationLegality *Legal) {
+    // Create a new empty loop. Unlink the old loop and connect the new one.
+    createEmptyLoop(Legal);
+    // Widen each instruction in the old loop to a new one in the new loop.
+    // Use the Legality module to find the induction and reduction variables.
+    vectorizeLoop(Legal);
+    // Register the new loop and update the analysis passes.
+    updateAnalysis();
+  }
+
+private:
+  /// A small list of PHINodes.
+  typedef SmallVector<PHINode*, 4> PhiVector;
+  /// When we unroll loops we have multiple vector values for each scalar.
+  /// This data structure holds the unrolled and vectorized values that
+  /// originated from one scalar instruction.
+  typedef SmallVector<Value*, 2> VectorParts;
+
+  /// Add code that checks at runtime if the accessed arrays overlap.
+  /// Returns the comparator value or NULL if no check is needed.
+  Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
+                               Instruction *Loc);
+  /// Create an empty loop, based on the loop ranges of the old loop.
+  void createEmptyLoop(LoopVectorizationLegality *Legal);
+  /// Copy and widen the instructions from the old loop.
+  void vectorizeLoop(LoopVectorizationLegality *Legal);
+
+  /// A helper function that computes the predicate of the block BB, assuming
+  /// that the header block of the loop is set to True. It returns the *entry*
+  /// mask for the block BB.
+  VectorParts createBlockInMask(BasicBlock *BB);
+  /// A helper function that computes the predicate of the edge between SRC
+  /// and DST.
+  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
+  /// A helper function to vectorize a single BB within the innermost loop.
+  void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
+                            PhiVector *PV);
+
+  /// Insert the new loop to the loop hierarchy and pass manager
+  /// and update the analysis passes.
+  void updateAnalysis();
+
+  /// This instruction is un-vectorizable. Implement it as a sequence
+  /// of scalars.
+  void scalarizeInstruction(Instruction *Instr);
+
+  /// Vectorize Load and Store instructions,
+  void vectorizeMemoryInstruction(Instruction *Instr,
+                                  LoopVectorizationLegality *Legal);
+
+  /// Create a broadcast instruction. This method generates a broadcast
+  /// instruction (shuffle) for loop invariant values and for the induction
+  /// value. If this is the induction variable then we extend it to N, N+1, ...
+  /// this is needed because each iteration in the loop corresponds to a SIMD
+  /// element.
+  Value *getBroadcastInstrs(Value *V);
+
+  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
+  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
+  /// The sequence starts at StartIndex.
+  Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
+
+  /// When we go over instructions in the basic block we rely on previous
+  /// values within the current basic block or on loop invariant values.
+  /// When we widen (vectorize) values we place them in the map. If the values
+  /// are not within the map, they have to be loop invariant, so we simply
+  /// broadcast them into a vector.
+  VectorParts &getVectorValue(Value *V);
+
+  /// Generate a shuffle sequence that will reverse the vector Vec.
+  Value *reverseVector(Value *Vec);
+
+  /// This is a helper class that holds the vectorizer state. It maps scalar
+  /// instructions to vector instructions. When the code is 'unrolled' then
+  /// then a single scalar value is mapped to multiple vector parts. The parts
+  /// are stored in the VectorPart type.
+  struct ValueMap {
+    /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
+    /// are mapped.
+    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
+
+    /// \return True if 'Key' is saved in the Value Map.
+    bool has(Value *Key) const { return MapStorage.count(Key); }
+
+    /// Initializes a new entry in the map. Sets all of the vector parts to the
+    /// save value in 'Val'.
+    /// \return A reference to a vector with splat values.
+    VectorParts &splat(Value *Key, Value *Val) {
+      VectorParts &Entry = MapStorage[Key];
+      Entry.assign(UF, Val);
+      return Entry;
+    }
+
+    ///\return A reference to the value that is stored at 'Key'.
+    VectorParts &get(Value *Key) {
+      VectorParts &Entry = MapStorage[Key];
+      if (Entry.empty())
+        Entry.resize(UF);
+      assert(Entry.size() == UF);
+      return Entry;
+    }
+
+  private:
+    /// The unroll factor. Each entry in the map stores this number of vector
+    /// elements.
+    unsigned UF;
+
+    /// Map storage. We use std::map and not DenseMap because insertions to a
+    /// dense map invalidates its iterators.
+    std::map<Value *, VectorParts> MapStorage;
+  };
+
+  /// The original loop.
+  Loop *OrigLoop;
+  /// Scev analysis to use.
+  ScalarEvolution *SE;
+  /// Loop Info.
+  LoopInfo *LI;
+  /// Dominator Tree.
+  DominatorTree *DT;
+  /// Data Layout.
+  DataLayout *DL;
+  /// Target Library Info.
+  const TargetLibraryInfo *TLI;
+
+  /// The vectorization SIMD factor to use. Each vector will have this many
+  /// vector elements.
+  unsigned VF;
+  /// The vectorization unroll factor to use. Each scalar is vectorized to this
+  /// many different vector instructions.
+  unsigned UF;
+
+  /// The builder that we use
+  IRBuilder<> Builder;
+
+  // --- Vectorization state ---
+
+  /// The vector-loop preheader.
+  BasicBlock *LoopVectorPreHeader;
+  /// The scalar-loop preheader.
+  BasicBlock *LoopScalarPreHeader;
+  /// Middle Block between the vector and the scalar.
+  BasicBlock *LoopMiddleBlock;
+  ///The ExitBlock of the scalar loop.
+  BasicBlock *LoopExitBlock;
+  ///The vector loop body.
+  BasicBlock *LoopVectorBody;
+  ///The scalar loop body.
+  BasicBlock *LoopScalarBody;
+  /// A list of all bypass blocks. The first block is the entry of the loop.
+  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
+
+  /// The new Induction variable which was added to the new block.
+  PHINode *Induction;
+  /// The induction variable of the old basic block.
+  PHINode *OldInduction;
+  /// Maps scalars to widened vectors.
+  ValueMap WidenMap;
+};
+
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+///   will change the order of memory accesses in a way that will change the
+///   correctness of the program.
+/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
+/// checks for a number of different conditions, such as the availability of a
+/// single induction variable, that all types are supported and vectorize-able,
+/// etc. This code reflects the capabilities of InnerLoopVectorizer.
+/// This class is also used by InnerLoopVectorizer for identifying
+/// induction variable and the different reduction variables.
+class LoopVectorizationLegality {
+public:
+  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
+                            DominatorTree *DT, TargetTransformInfo* TTI,
+                            AliasAnalysis *AA, TargetLibraryInfo *TLI)
+      : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
+        Induction(0) {}
+
+  /// This enum represents the kinds of reductions that we support.
+  enum ReductionKind {
+    RK_NoReduction, ///< Not a reduction.
+    RK_IntegerAdd,  ///< Sum of integers.
+    RK_IntegerMult, ///< Product of integers.
+    RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
+    RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
+    RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
+    RK_FloatAdd,    ///< Sum of floats.
+    RK_FloatMult    ///< Product of floats.
+  };
+
+  /// This enum represents the kinds of inductions that we support.
+  enum InductionKind {
+    IK_NoInduction,         ///< Not an induction variable.
+    IK_IntInduction,        ///< Integer induction variable. Step = 1.
+    IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
+    IK_PtrInduction,        ///< Pointer induction var. Step = sizeof(elem).
+    IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
+  };
+
+  /// This POD struct holds information about reduction variables.
+  struct ReductionDescriptor {
+    ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
+      Kind(RK_NoReduction) {}
+
+    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
+        : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+
+    // The starting value of the reduction.
+    // It does not have to be zero!
+    Value *StartValue;
+    // The instruction who's value is used outside the loop.
+    Instruction *LoopExitInstr;
+    // The kind of the reduction.
+    ReductionKind Kind;
+  };
+
+  // This POD struct holds information about the memory runtime legality
+  // check that a group of pointers do not overlap.
+  struct RuntimePointerCheck {
+    RuntimePointerCheck() : Need(false) {}
+
+    /// Reset the state of the pointer runtime information.
+    void reset() {
+      Need = false;
+      Pointers.clear();
+      Starts.clear();
+      Ends.clear();
+    }
+
+    /// Insert a pointer and calculate the start and end SCEVs.
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
+
+    /// This flag indicates if we need to add the runtime check.
+    bool Need;
+    /// Holds the pointers that we need to check.
+    SmallVector<Value*, 2> Pointers;
+    /// Holds the pointer value at the beginning of the loop.
+    SmallVector<const SCEV*, 2> Starts;
+    /// Holds the pointer value at the end of the loop.
+    SmallVector<const SCEV*, 2> Ends;
+  };
+
+  /// A POD for saving information about induction variables.
+  struct InductionInfo {
+    InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
+    InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
+    /// Start value.
+    Value *StartValue;
+    /// Induction kind.
+    InductionKind IK;
+  };
+
+  /// ReductionList contains the reduction descriptors for all
+  /// of the reductions that were found in the loop.
+  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
+
+  /// InductionList saves induction variables and maps them to the
+  /// induction descriptor.
+  typedef MapVector<PHINode*, InductionInfo> InductionList;
+
+  /// Alias(Multi)Map stores the values (GEPs or underlying objects and their
+  /// respective Store/Load instruction(s) to calculate aliasing.
+  typedef MapVector<Value*, Instruction* > AliasMap;
+  typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
+
+  /// Returns true if it is legal to vectorize this loop.
+  /// This does not mean that it is profitable to vectorize this
+  /// loop, only that it is legal to do so.
+  bool canVectorize();
+
+  /// Returns the Induction variable.
+  PHINode *getInduction() { return Induction; }
+
+  /// Returns the reduction variables found in the loop.
+  ReductionList *getReductionVars() { return &Reductions; }
+
+  /// Returns the induction variables found in the loop.
+  InductionList *getInductionVars() { return &Inductions; }
+
+  /// Returns True if V is an induction variable in this loop.
+  bool isInductionVariable(const Value *V);
+
+  /// Return true if the block BB needs to be predicated in order for the loop
+  /// to be vectorized.
+  bool blockNeedsPredication(BasicBlock *BB);
+
+  /// Check if this  pointer is consecutive when vectorizing. This happens
+  /// when the last index of the GEP is the induction variable, or that the
+  /// pointer itself is an induction variable.
+  /// This check allows us to vectorize A[idx] into a wide load/store.
+  /// Returns:
+  /// 0 - Stride is unknown or non consecutive.
+  /// 1 - Address is consecutive.
+  /// -1 - Address is consecutive, and decreasing.
+  int isConsecutivePtr(Value *Ptr);
+
+  /// Returns true if the value V is uniform within the loop.
+  bool isUniform(Value *V);
+
+  /// Returns true if this instruction will remain scalar after vectorization.
+  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
+
+  /// Returns the information that we collected about runtime memory check.
+  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
+private:
+  /// Check if a single basic block loop is vectorizable.
+  /// At this point we know that this is a loop with a constant trip count
+  /// and we only need to check individual instructions.
+  bool canVectorizeInstrs();
+
+  /// When we vectorize loops we may change the order in which
+  /// we read and write from memory. This method checks if it is
+  /// legal to vectorize the code, considering only memory constrains.
+  /// Returns true if the loop is vectorizable
+  bool canVectorizeMemory();
+
+  /// Return true if we can vectorize this loop using the IF-conversion
+  /// transformation.
+  bool canVectorizeWithIfConvert();
+
+  /// Collect the variables that need to stay uniform after vectorization.
+  void collectLoopUniforms();
+
+  /// Return true if all of the instructions in the block can be speculatively
+  /// executed.
+  bool blockCanBePredicated(BasicBlock *BB);
+
+  /// Returns True, if 'Phi' is the kind of reduction variable for type
+  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
+  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
+  /// Returns true if the instruction I can be a reduction variable of type
+  /// 'Kind'.
+  bool isReductionInstr(Instruction *I, ReductionKind Kind);
+  /// Returns the induction kind of Phi. This function may return NoInduction
+  /// if the PHI is not an induction variable.
+  InductionKind isInductionVariable(PHINode *Phi);
+  /// Return true if can compute the address bounds of Ptr within the loop.
+  bool hasComputableBounds(Value *Ptr);
+  /// Return true if there is the chance of write reorder.
+  bool hasPossibleGlobalWriteReorder(Value *Object,
+                                     Instruction *Inst,
+                                     AliasMultiMap &WriteObjects,
+                                     unsigned MaxByteWidth);
+  /// Return the AA location for a load or a store.
+  AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
+
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// Scev analysis.
+  ScalarEvolution *SE;
+  /// DataLayout analysis.
+  DataLayout *DL;
+  /// Dominators.
+  DominatorTree *DT;
+  /// Target Info.
+  TargetTransformInfo *TTI;
+  /// Alias Analysis.
+  AliasAnalysis *AA;
+  /// Target Library Info.
+  TargetLibraryInfo *TLI;
+
+  //  ---  vectorization state --- //
+
+  /// Holds the integer induction variable. This is the counter of the
+  /// loop.
+  PHINode *Induction;
+  /// Holds the reduction variables.
+  ReductionList Reductions;
+  /// Holds all of the induction variables that we found in the loop.
+  /// Notice that inductions don't need to start at zero and that induction
+  /// variables can be pointers.
+  InductionList Inductions;
+
+  /// Allowed outside users. This holds the reduction
+  /// vars which can be accessed from outside the loop.
+  SmallPtrSet<Value*, 4> AllowedExit;
+  /// This set holds the variables which are known to be uniform after
+  /// vectorization.
+  SmallPtrSet<Instruction*, 4> Uniforms;
+  /// We need to check that all of the pointers in this list are disjoint
+  /// at runtime.
+  RuntimePointerCheck PtrRtCheck;
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because of
+/// a number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
+class LoopVectorizationCostModel {
+public:
+  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
+                             LoopVectorizationLegality *Legal,
+                             const TargetTransformInfo &TTI,
+                             DataLayout *DL, const TargetLibraryInfo *TLI)
+      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
+
+  /// Information about vectorization costs
+  struct VectorizationFactor {
+    unsigned Width; // Vector width with best cost
+    unsigned Cost; // Cost of the loop with that width
+  };
+  /// \return The most profitable vectorization factor and the cost of that VF.
+  /// This method checks every power of two up to VF. If UserVF is not ZERO
+  /// then this vectorization factor will be selected if vectorization is
+  /// possible.
+  VectorizationFactor selectVectorizationFactor(bool OptForSize,
+                                                unsigned UserVF);
+
+  /// \return The size (in bits) of the widest type in the code that
+  /// needs to be vectorized. We ignore values that remain scalar such as
+  /// 64 bit loop indices.
+  unsigned getWidestType();
+
+  /// \return The most profitable unroll factor.
+  /// If UserUF is non-zero then this method finds the best unroll-factor
+  /// based on register pressure and other parameters.
+  /// VF and LoopCost are the selected vectorization factor and the cost of the
+  /// selected VF.
+  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
+                              unsigned LoopCost);
+
+  /// \brief A struct that represents some properties of the register usage
+  /// of a loop.
+  struct RegisterUsage {
+    /// Holds the number of loop invariant values that are used in the loop.
+    unsigned LoopInvariantRegs;
+    /// Holds the maximum number of concurrent live intervals in the loop.
+    unsigned MaxLocalUsers;
+    /// Holds the number of instructions in the loop.
+    unsigned NumInstructions;
+  };
+
+  /// \return  information about the register usage of the loop.
+  RegisterUsage calculateRegisterUsage();
+
+private:
+  /// Returns the expected execution cost. The unit of the cost does
+  /// not matter because we use the 'cost' units to compare different
+  /// vector widths. The cost that is returned is *not* normalized by
+  /// the factor width.
+  unsigned expectedCost(unsigned VF);
+
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+  /// A helper function for converting Scalar types to vector types.
+  /// If the incoming type is void, we return void. If the VF is 1, we return
+  /// the scalar type.
+  static Type* ToVectorTy(Type *Scalar, unsigned VF);
+
+  /// Returns whether the instruction is a load or store and will be a emitted
+  /// as a vector operation.
+  bool isConsecutiveLoadOrStore(Instruction *I);
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// Scev analysis.
+  ScalarEvolution *SE;
+  /// Loop Info analysis.
+  LoopInfo *LI;
+  /// Vectorization legality.
+  LoopVectorizationLegality *Legal;
+  /// Vector target information.
+  const TargetTransformInfo &TTI;
+  /// Target data layout information.
+  DataLayout *DL;
+  /// Target Library Info.
+  const TargetLibraryInfo *TLI;
+};
+
+/// The LoopVectorize Pass.
+struct LoopVectorize : public LoopPass {
+  /// Pass identification, replacement for typeid
+  static char ID;
+
+  explicit LoopVectorize() : LoopPass(ID) {
+    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+  }
+
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  LoopInfo *LI;
+  TargetTransformInfo *TTI;
+  DominatorTree *DT;
+  AliasAnalysis *AA;
+  TargetLibraryInfo *TLI;
+
+  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+    // We only vectorize innermost loops.
+    if (!L->empty())
+      return false;
+
+    SE = &getAnalysis<ScalarEvolution>();
+    DL = getAnalysisIfAvailable<DataLayout>();
+    LI = &getAnalysis<LoopInfo>();
+    TTI = &getAnalysis<TargetTransformInfo>();
+    DT = &getAnalysis<DominatorTree>();
+    AA = getAnalysisIfAvailable<AliasAnalysis>();
+    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+
+    DEBUG(dbgs() << "LV: Checking a loop in \"" <<
+          L->getHeader()->getParent()->getName() << "\"\n");
+
+    // Check if it is legal to vectorize the loop.
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
+    if (!LVL.canVectorize()) {
+      DEBUG(dbgs() << "LV: Not vectorizing.\n");
+      return false;
+    }
+
+    // Use the cost model.
+    LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
+
+    // Check the function attributes to find out if this function should be
+    // optimized for size.
+    Function *F = L->getHeader()->getParent();
+    Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
+    Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
+    unsigned FnIndex = AttributeSet::FunctionIndex;
+    bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
+    bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
+
+    if (NoFloat) {
+      DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
+            "attribute is used.\n");
+      return false;
+    }
+
+    // Select the optimal vectorization factor.
+    LoopVectorizationCostModel::VectorizationFactor VF;
+    VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+    // Select the unroll factor.
+    unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
+                                        VF.Width, VF.Cost);
+
+    if (VF.Width == 1) {
+      DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+      return false;
+    }
+
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
+          F->getParent()->getModuleIdentifier()<<"\n");
+    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
+
+    // If we decided that it is *legal* to vectorize the loop then do it.
+    InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
+    LB.vectorize(&LVL);
+
+    DEBUG(verifyFunction(*L->getHeader()->getParent()));
+    return true;
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    LoopPass::getAnalysisUsage(AU);
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addRequired<DominatorTree>();
+    AU.addRequired<LoopInfo>();
+    AU.addRequired<ScalarEvolution>();
+    AU.addRequired<TargetTransformInfo>();
+    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<DominatorTree>();
+  }
+
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel.
+//===----------------------------------------------------------------------===//
+
+void
+LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
+                                                       Loop *Lp, Value *Ptr) {
+  const SCEV *Sc = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+  assert(AR && "Invalid addrec expression");
+  const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
+  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+  Pointers.push_back(Ptr);
+  Starts.push_back(AR->getStart());
+  Ends.push_back(ScEnd);
+}
+
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
+  // Save the current insertion location.
+  Instruction *Loc = Builder.GetInsertPoint();
+
+  // We need to place the broadcast of invariant variables outside the loop.
+  Instruction *Instr = dyn_cast<Instruction>(V);
+  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
+
+  // Place the code for broadcasting invariant variables in the new preheader.
+  if (Invariant)
+    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
+  // Broadcast the scalar into all locations in the vector.
+  Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+  // Restore the builder insertion point.
+  if (Invariant)
+    Builder.SetInsertPoint(Loc);
+
+  return Shuf;
+}
+
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
+                                                 bool Negate) {
+  assert(Val->getType()->isVectorTy() && "Must be a vector");
+  assert(Val->getType()->getScalarType()->isIntegerTy() &&
+         "Elem must be an integer");
+  // Create the types.
+  Type *ITy = Val->getType()->getScalarType();
+  VectorType *Ty = cast<VectorType>(Val->getType());
+  int VLen = Ty->getNumElements();
+  SmallVector<Constant*, 8> Indices;
+
+  // Create a vector of consecutive numbers from zero to VF.
+  for (int i = 0; i < VLen; ++i) {
+    int Idx = Negate ? (-i): i;
+    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx));
+  }
+
+  // Add the consecutive indices to the vector value.
+  Constant *Cv = ConstantVector::get(Indices);
+  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+  return Builder.CreateAdd(Val, Cv, "induction");
+}
+
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+  // Make sure that the pointer does not point to structs.
+  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
+    return 0;
+
+  // If this value is a pointer induction variable we know it is consecutive.
+  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
+  if (Phi && Inductions.count(Phi)) {
+    InductionInfo II = Inductions[Phi];
+    if (IK_PtrInduction == II.IK)
+      return 1;
+    else if (IK_ReversePtrInduction == II.IK)
+      return -1;
+  }
+
+  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
+  if (!Gep)
+    return 0;
+
+  unsigned NumOperands = Gep->getNumOperands();
+  Value *LastIndex = Gep->getOperand(NumOperands - 1);
+
+  Value *GpPtr = Gep->getPointerOperand();
+  // If this GEP value is a consecutive pointer induction variable and all of
+  // the indices are constant then we know it is consecutive. We can
+  Phi = dyn_cast<PHINode>(GpPtr);
+  if (Phi && Inductions.count(Phi)) {
+
+    // Make sure that the pointer does not point to structs.
+    PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
+    if (GepPtrType->getElementType()->isAggregateType())
+      return 0;
+
+    // Make sure that all of the index operands are loop invariant.
+    for (unsigned i = 1; i < NumOperands; ++i)
+      if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+        return 0;
+
+    InductionInfo II = Inductions[Phi];
+    if (IK_PtrInduction == II.IK)
+      return 1;
+    else if (IK_ReversePtrInduction == II.IK)
+      return -1;
+  }
+
+  // Check that all of the gep indices are uniform except for the last.
+  for (unsigned i = 0; i < NumOperands - 1; ++i)
+    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+      return 0;
+
+  // We can emit wide load/stores only if the last index is the induction
+  // variable.
+  const SCEV *Last = SE->getSCEV(LastIndex);
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
+    const SCEV *Step = AR->getStepRecurrence(*SE);
+
+    // The memory is consecutive because the last index is consecutive
+    // and all other indices are loop invariant.
+    if (Step->isOne())
+      return 1;
+    if (Step->isAllOnesValue())
+      return -1;
+  }
+
+  return 0;
+}
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
+InnerLoopVectorizer::VectorParts&
+InnerLoopVectorizer::getVectorValue(Value *V) {
+  assert(V != Induction && "The new induction variable should not be used.");
+  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
+
+  // If we have this scalar in the map, return it.
+  if (WidenMap.has(V))
+    return WidenMap.get(V);
+
+  // If this scalar is unknown, assume that it is a constant or that it is
+  // loop invariant. Broadcast V and save the value for future uses.
+  Value *B = getBroadcastInstrs(V);
+  return WidenMap.splat(V, B);
+}
+
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+  assert(Vec->getType()->isVectorTy() && "Invalid type");
+  SmallVector<Constant*, 8> ShuffleMask;
+  for (unsigned i = 0; i < VF; ++i)
+    ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
+
+  return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
+                                     ConstantVector::get(ShuffleMask),
+                                     "reverse");
+}
+
+
+void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
+                                             LoopVectorizationLegality *Legal) {
+  // Attempt to issue a wide load.
+  LoadInst *LI = dyn_cast<LoadInst>(Instr);
+  StoreInst *SI = dyn_cast<StoreInst>(Instr);
+
+  assert((LI || SI) && "Invalid Load/Store instruction");
+
+  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+  Type *DataTy = VectorType::get(ScalarDataTy, VF);
+  Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+
+  // If the pointer is loop invariant or if it is non consecutive,
+  // scalarize the load.
+  int Stride = Legal->isConsecutivePtr(Ptr);
+  bool Reverse = Stride < 0;
+  bool UniformLoad = LI && Legal->isUniform(Ptr);
+  if (Stride == 0 || UniformLoad)
+    return scalarizeInstruction(Instr);
+
+  Constant *Zero = Builder.getInt32(0);
+  VectorParts &Entry = WidenMap.get(Instr);
+
+  // Handle consecutive loads/stores.
+  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+  if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
+    Value *PtrOperand = Gep->getPointerOperand();
+    Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
+    FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
+
+    // Create the new GEP with the new induction variable.
+    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+    Gep2->setOperand(0, FirstBasePtr);
+    Gep2->setName("gep.indvar.base");
+    Ptr = Builder.Insert(Gep2);
+  } else if (Gep) {
+    assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
+                               OrigLoop) && "Base ptr must be invariant");
+
+    // The last index does not have to be the induction. It can be
+    // consecutive and be a function of the index. For example A[I+1];
+    unsigned NumOperands = Gep->getNumOperands();
+
+    Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
+    VectorParts &GEPParts = getVectorValue(LastGepOperand);
+    Value *LastIndex = GEPParts[0];
+    LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+    // Create the new GEP with the new induction variable.
+    GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+    Gep2->setOperand(NumOperands - 1, LastIndex);
+    Gep2->setName("gep.indvar.idx");
+    Ptr = Builder.Insert(Gep2);
+  } else {
+    // Use the induction element ptr.
+    assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+    VectorParts &PtrVal = getVectorValue(Ptr);
+    Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
+  }
+
+  // Handle Stores:
+  if (SI) {
+    assert(!Legal->isUniform(SI->getPointerOperand()) &&
+           "We do not allow storing to uniform addresses");
+
+    VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      // Calculate the pointer for the specific unroll-part.
+      Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+
+      if (Reverse) {
+        // If we store to reverse consecutive memory locations then we need
+        // to reverse the order of elements in the stored value.
+        StoredVal[Part] = reverseVector(StoredVal[Part]);
+        // If the address is consecutive but reversed, then the
+        // wide store needs to start at the last vector element.
+        PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+        PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+      }
+
+      Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+      Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
+    }
+  }
+
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    // Calculate the pointer for the specific unroll-part.
+    Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+
+    if (Reverse) {
+      // If the address is consecutive but reversed, then the
+      // wide store needs to start at the last vector element.
+      PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+      PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+    }
+
+    Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+    Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
+    cast<LoadInst>(LI)->setAlignment(Alignment);
+    Entry[Part] = Reverse ? reverseVector(LI) :  LI;
+  }
+}
+
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  // Holds vector parameters or scalars, in case of uniform vals.
+  SmallVector<VectorParts, 4> Params;
+
+  // Find all of the vectorized parameters.
+  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+    Value *SrcOp = Instr->getOperand(op);
+
+    // If we are accessing the old induction variable, use the new one.
+    if (SrcOp == OldInduction) {
+      Params.push_back(getVectorValue(SrcOp));
+      continue;
+    }
+
+    // Try using previously calculated values.
+    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+    // If the src is an instruction that appeared earlier in the basic block
+    // then it should already be vectorized.
+    if (SrcInst && OrigLoop->contains(SrcInst)) {
+      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
+      // The parameter is a vector value from earlier.
+      Params.push_back(WidenMap.get(SrcInst));
+    } else {
+      // The parameter is a scalar from outside the loop. Maybe even a constant.
+      VectorParts Scalars;
+      Scalars.append(UF, SrcOp);
+      Params.push_back(Scalars);
+    }
+  }
+
+  assert(Params.size() == Instr->getNumOperands() &&
+         "Invalid number of operands");
+
+  // Does this instruction return a value ?
+  bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+  Value *UndefVec = IsVoidRetTy ? 0 :
+    UndefValue::get(VectorType::get(Instr->getType(), VF));
+  // Create a new entry in the WidenMap and initialize it to Undef or Null.
+  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+
+  // For each scalar that we create:
+  for (unsigned Width = 0; Width < VF; ++Width) {
+    // For each vector unroll 'part':
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Instruction *Cloned = Instr->clone();
+      if (!IsVoidRetTy)
+        Cloned->setName(Instr->getName() + ".cloned");
+      // Replace the operands of the cloned instrucions with extracted scalars.
+      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+        Value *Op = Params[op][Part];
+        // Param is a vector. Need to extract the right lane.
+        if (Op->getType()->isVectorTy())
+          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
+        Cloned->setOperand(op, Op);
+      }
+
+      // Place the cloned scalar in the new loop.
+      Builder.Insert(Cloned);
+
+      // If the original scalar returns a value we need to place it in a vector
+      // so that future users will be able to use it.
+      if (!IsVoidRetTy)
+        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
+                                                       Builder.getInt32(Width));
+    }
+  }
+}
+
+Instruction *
+InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+                                     Instruction *Loc) {
+  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+  Legal->getRuntimePointerCheck();
+
+  if (!PtrRtCheck->Need)
+    return NULL;
+
+  Instruction *MemoryRuntimeCheck = 0;
+  unsigned NumPointers = PtrRtCheck->Pointers.size();
+  SmallVector<Value* , 2> Starts;
+  SmallVector<Value* , 2> Ends;
+
+  SCEVExpander Exp(*SE, "induction");
+
+  // Use this type for pointer arithmetic.
+  Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    Value *Ptr = PtrRtCheck->Pointers[i];
+    const SCEV *Sc = SE->getSCEV(Ptr);
+
+    if (SE->isLoopInvariant(Sc, OrigLoop)) {
+      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
+            *Ptr <<"\n");
+      Starts.push_back(Ptr);
+      Ends.push_back(Ptr);
+    } else {
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+
+      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
+      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
+      Starts.push_back(Start);
+      Ends.push_back(End);
+    }
+  }
+
+  IRBuilder<> ChkBuilder(Loc);
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i+1; j < NumPointers; ++j) {
+      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
+      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
+      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
+      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy, "bc");
+
+      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
+      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
+      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+      if (MemoryRuntimeCheck)
+        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
+                                         "conflict.rdx");
+
+      MemoryRuntimeCheck = cast<Instruction>(IsConflict);
+    }
+  }
+
+  return MemoryRuntimeCheck;
+}
+
+void
+InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+  /*
+   In this function we generate a new loop. The new loop will contain
+   the vectorized instructions while the old loop will continue to run the
+   scalar remainder.
+
+       [ ] <-- vector loop bypass (may consist of multiple blocks).
+     /  |
+    /   v
+   |   [ ]     <-- vector pre header.
+   |    |
+   |    v
+   |   [  ] \
+   |   [  ]_|   <-- vector loop.
+   |    |
+    \   v
+      >[ ]   <--- middle-block.
+     /  |
+    /   v
+   |   [ ]     <--- new preheader.
+   |    |
+   |    v
+   |   [ ] \
+   |   [ ]_|   <-- old scalar loop to handle remainder.
+    \   |
+     \  v
+      >[ ]     <-- exit block.
+   ...
+   */
+
+  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+  assert(ExitBlock && "Must have an exit block");
+
+  // Mark the old scalar loop with metadata that tells us not to vectorize this
+  // loop again if we run into it.
+  MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>());
+  OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);
+
+  // Some loops have a single integer induction variable, while other loops
+  // don't. One example is c++ iterators that often have multiple pointer
+  // induction variables. In the code below we also support a case where we
+  // don't have a single induction variable.
+  OldInduction = Legal->getInduction();
+  Type *IdxTy = OldInduction ? OldInduction->getType() :
+  DL->getIntPtrType(SE->getContext());
+
+  // Find the loop boundaries.
+  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
+  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+
+  // Get the total trip count from the count by adding 1.
+  ExitCount = SE->getAddExpr(ExitCount,
+                             SE->getConstant(ExitCount->getType(), 1));
+
+  // Expand the trip count and place the new instructions in the preheader.
+  // Notice that the pre-header does not change, only the loop body.
+  SCEVExpander Exp(*SE, "induction");
+
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+                                   BypassBlock->getTerminator());
+
+  // The loop index does not have to start at Zero. Find the original start
+  // value from the induction PHI node. If we don't have an induction variable
+  // then we know that it starts at zero.
+  Value *StartIdx = OldInduction ?
+  OldInduction->getIncomingValueForBlock(BypassBlock):
+  ConstantInt::get(IdxTy, 0);
+
+  assert(BypassBlock && "Invalid loop structure");
+  LoopBypassBlocks.push_back(BypassBlock);
+
+  // Split the single block loop into the two loop structure described above.
+  BasicBlock *VectorPH =
+  BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+  BasicBlock *VecBody =
+  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+  BasicBlock *MiddleBlock =
+  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+  BasicBlock *ScalarPH =
+  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
+  // inside the loop.
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+
+  // Generate the induction variable.
+  Induction = Builder.CreatePHI(IdxTy, 2, "index");
+  // The loop step is equal to the vectorization factor (num of SIMD elements)
+  // times the unroll factor (num of SIMD instructions).
+  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+
+  // This is the IR builder that we use to add all of the logic for bypassing
+  // the new vector loop.
+  IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
+
+  // We may need to extend the index in case there is a type mismatch.
+  // We know that the count starts at zero and does not overflow.
+  if (Count->getType() != IdxTy) {
+    // The exit count can be of pointer type. Convert it to the correct
+    // integer type.
+    if (ExitCount->getType()->isPointerTy())
+      Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
+    else
+      Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
+  }
+
+  // Add the start index to the loop count to get the new end index.
+  Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
+
+  // Now we need to generate the expression for N - (N % VF), which is
+  // the part that the vectorized body will execute.
+  Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
+  Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
+  Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
+                                                     "end.idx.rnd.down");
+
+  // Now, compare the new count to zero. If it is zero skip the vector loop and
+  // jump to the scalar loop.
+  Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
+                                          "cmp.zero");
+
+  BasicBlock *LastBypassBlock = BypassBlock;
+
+  // Generate the code that checks in runtime if arrays overlap. We put the
+  // checks into a separate block to make the more common case of few elements
+  // faster.
+  Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
+                                                 BypassBlock->getTerminator());
+  if (MemRuntimeCheck) {
+    // Create a new block containing the memory check.
+    BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
+                                                          "vector.memcheck");
+    LoopBypassBlocks.push_back(CheckBlock);
+
+    // Replace the branch into the memory check block with a conditional branch
+    // for the "few elements case".
+    Instruction *OldTerm = BypassBlock->getTerminator();
+    BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
+    OldTerm->eraseFromParent();
+
+    Cmp = MemRuntimeCheck;
+    LastBypassBlock = CheckBlock;
+  }
+
+  LastBypassBlock->getTerminator()->eraseFromParent();
+  BranchInst::Create(MiddleBlock, VectorPH, Cmp,
+                     LastBypassBlock);
+
+  // We are going to resume the execution of the scalar loop.
+  // Go over all of the induction variables that we found and fix the
+  // PHIs that are left in the scalar version of the loop.
+  // The starting values of PHI nodes depend on the counter of the last
+  // iteration in the vectorized loop.
+  // If we come from a bypass edge then we need to start from the original
+  // start value.
+
+  // This variable saves the new starting index for the scalar loop.
+  PHINode *ResumeIndex = 0;
+  LoopVectorizationLegality::InductionList::iterator I, E;
+  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+  for (I = List->begin(), E = List->end(); I != E; ++I) {
+    PHINode *OrigPhi = I->first;
+    LoopVectorizationLegality::InductionInfo II = I->second;
+    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+                                         MiddleBlock->getTerminator());
+    Value *EndValue = 0;
+    switch (II.IK) {
+    case LoopVectorizationLegality::IK_NoInduction:
+      llvm_unreachable("Unknown induction");
+    case LoopVectorizationLegality::IK_IntInduction: {
+      // Handle the integer induction counter:
+      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
+      assert(OrigPhi == OldInduction && "Unknown integer PHI");
+      // We know what the end value is.
+      EndValue = IdxEndRoundDown;
+      // We also know which PHI node holds it.
+      ResumeIndex = ResumeVal;
+      break;
+    }
+    case LoopVectorizationLegality::IK_ReverseIntInduction: {
+      // Convert the CountRoundDown variable to the PHI size.
+      unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
+      unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
+      Value *CRD = CountRoundDown;
+      if (CRDSize > IISize)
+        CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
+                               II.StartValue->getType(), "tr.crd",
+                               LoopBypassBlocks.back()->getTerminator());
+      else if (CRDSize < IISize)
+        CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
+                               II.StartValue->getType(),
+                               "sext.crd",
+                               LoopBypassBlocks.back()->getTerminator());
+      // Handle reverse integer induction counter:
+      EndValue =
+        BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
+                                  LoopBypassBlocks.back()->getTerminator());
+      break;
+    }
+    case LoopVectorizationLegality::IK_PtrInduction: {
+      // For pointer induction variables, calculate the offset using
+      // the end index.
+      EndValue =
+        GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end",
+                                  LoopBypassBlocks.back()->getTerminator());
+      break;
+    }
+    case LoopVectorizationLegality::IK_ReversePtrInduction: {
+      // The value at the end of the loop for the reverse pointer is calculated
+      // by creating a GEP with a negative index starting from the start value.
+      Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
+      Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown,
+                                  "rev.ind.end",
+                                  LoopBypassBlocks.back()->getTerminator());
+      EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx,
+                                  "rev.ptr.ind.end",
+                                  LoopBypassBlocks.back()->getTerminator());
+      break;
+    }
+    }// end of case
+
+    // The new PHI merges the original incoming value, in case of a bypass,
+    // or the value at the end of the vectorized loop.
+    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
+    ResumeVal->addIncoming(EndValue, VecBody);
+
+    // Fix the scalar body counter (PHI node).
+    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+  }
+
+  // If we are generating a new induction variable then we also need to
+  // generate the code that calculates the exit value. This value is not
+  // simply the end of the counter because we may skip the vectorized body
+  // in case of a runtime check.
+  if (!OldInduction){
+    assert(!ResumeIndex && "Unexpected resume value found");
+    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
+                                  MiddleBlock->getTerminator());
+    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
+    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+  }
+
+  // Make sure that we found the index where scalar loop needs to continue.
+  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
+         "Invalid resume Index");
+
+  // Add a check in the middle block to see if we have completed
+  // all of the iterations in the first vector loop.
+  // If (N - N%VF) == N, then we *don't* need to run the remainder.
+  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
+                                ResumeIndex, "cmp.n",
+                                MiddleBlock->getTerminator());
+
+  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
+  // Remove the old terminator.
+  MiddleBlock->getTerminator()->eraseFromParent();
+
+  // Create i+1 and fill the PHINode.
+  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
+  Induction->addIncoming(StartIdx, VectorPH);
+  Induction->addIncoming(NextIdx, VecBody);
+  // Create the compare.
+  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
+  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
+
+  // Now we have two terminators. Remove the old one from the block.
+  VecBody->getTerminator()->eraseFromParent();
+
+  // Get ready to start creating new instructions into the vectorized body.
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+
+  // Create and register the new vector loop.
+  Loop* Lp = new Loop();
+  Loop *ParentLoop = OrigLoop->getParentLoop();
+
+  // Insert the new loop into the loop nest and register the new basic blocks.
+  if (ParentLoop) {
+    ParentLoop->addChildLoop(Lp);
+    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
+      ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
+    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+  } else {
+    LI->addTopLevelLoop(Lp);
+  }
+
+  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
+  // Save the state.
+  LoopVectorPreHeader = VectorPH;
+  LoopScalarPreHeader = ScalarPH;
+  LoopMiddleBlock = MiddleBlock;
+  LoopExitBlock = ExitBlock;
+  LoopVectorBody = VecBody;
+  LoopScalarBody = OldBasicBlock;
+}
+
+/// This function returns the identity element (or neutral element) for
+/// the operation K.
+static Constant*
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
+  switch (K) {
+  case LoopVectorizationLegality:: RK_IntegerXor:
+  case LoopVectorizationLegality:: RK_IntegerAdd:
+  case LoopVectorizationLegality:: RK_IntegerOr:
+    // Adding, Xoring, Oring zero to a number does not change it.
+    return ConstantInt::get(Tp, 0);
+  case LoopVectorizationLegality:: RK_IntegerMult:
+    // Multiplying a number by 1 does not change it.
+    return ConstantInt::get(Tp, 1);
+  case LoopVectorizationLegality:: RK_IntegerAnd:
+    // AND-ing a number with an all-1 value does not change it.
+    return ConstantInt::get(Tp, -1, true);
+  case LoopVectorizationLegality:: RK_FloatMult:
+    // Multiplying a number by 1 does not change it.
+    return ConstantFP::get(Tp, 1.0L);
+  case LoopVectorizationLegality:: RK_FloatAdd:
+    // Adding zero to a number does not change it.
+    return ConstantFP::get(Tp, 0.0L);
+  default:
+    llvm_unreachable("Unknown reduction kind");
+  }
+}
+
+static Intrinsic::ID
+getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
+  // If we have an intrinsic call, check if it is trivially vectorizable.
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::sqrt:
+    case Intrinsic::sin:
+    case Intrinsic::cos:
+    case Intrinsic::exp:
+    case Intrinsic::exp2:
+    case Intrinsic::log:
+    case Intrinsic::log10:
+    case Intrinsic::log2:
+    case Intrinsic::fabs:
+    case Intrinsic::floor:
+    case Intrinsic::ceil:
+    case Intrinsic::trunc:
+    case Intrinsic::rint:
+    case Intrinsic::nearbyint:
+    case Intrinsic::pow:
+    case Intrinsic::fma:
+    case Intrinsic::fmuladd:
+      return II->getIntrinsicID();
+    default:
+      return Intrinsic::not_intrinsic;
+    }
+  }
+
+  if (!TLI)
+    return Intrinsic::not_intrinsic;
+
+  LibFunc::Func Func;
+  Function *F = CI->getCalledFunction();
+  // We're going to make assumptions on the semantics of the functions, check
+  // that the target knows that it's available in this environment.
+  if (!F || !TLI->getLibFunc(F->getName(), Func))
+    return Intrinsic::not_intrinsic;
+
+  // Otherwise check if we have a call to a function that can be turned into a
+  // vector intrinsic.
+  switch (Func) {
+  default:
+    break;
+  case LibFunc::sin:
+  case LibFunc::sinf:
+  case LibFunc::sinl:
+    return Intrinsic::sin;
+  case LibFunc::cos:
+  case LibFunc::cosf:
+  case LibFunc::cosl:
+    return Intrinsic::cos;
+  case LibFunc::exp:
+  case LibFunc::expf:
+  case LibFunc::expl:
+    return Intrinsic::exp;
+  case LibFunc::exp2:
+  case LibFunc::exp2f:
+  case LibFunc::exp2l:
+    return Intrinsic::exp2;
+  case LibFunc::log:
+  case LibFunc::logf:
+  case LibFunc::logl:
+    return Intrinsic::log;
+  case LibFunc::log10:
+  case LibFunc::log10f:
+  case LibFunc::log10l:
+    return Intrinsic::log10;
+  case LibFunc::log2:
+  case LibFunc::log2f:
+  case LibFunc::log2l:
+    return Intrinsic::log2;
+  case LibFunc::fabs:
+  case LibFunc::fabsf:
+  case LibFunc::fabsl:
+    return Intrinsic::fabs;
+  case LibFunc::floor:
+  case LibFunc::floorf:
+  case LibFunc::floorl:
+    return Intrinsic::floor;
+  case LibFunc::ceil:
+  case LibFunc::ceilf:
+  case LibFunc::ceill:
+    return Intrinsic::ceil;
+  case LibFunc::trunc:
+  case LibFunc::truncf:
+  case LibFunc::truncl:
+    return Intrinsic::trunc;
+  case LibFunc::rint:
+  case LibFunc::rintf:
+  case LibFunc::rintl:
+    return Intrinsic::rint;
+  case LibFunc::nearbyint:
+  case LibFunc::nearbyintf:
+  case LibFunc::nearbyintl:
+    return Intrinsic::nearbyint;
+  case LibFunc::pow:
+  case LibFunc::powf:
+  case LibFunc::powl:
+    return Intrinsic::pow;
+  }
+
+  return Intrinsic::not_intrinsic;
+}
+
+/// This function translates the reduction kind to an LLVM binary operator.
+static Instruction::BinaryOps
+getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
+  switch (Kind) {
+    case LoopVectorizationLegality::RK_IntegerAdd:
+      return Instruction::Add;
+    case LoopVectorizationLegality::RK_IntegerMult:
+      return Instruction::Mul;
+    case LoopVectorizationLegality::RK_IntegerOr:
+      return Instruction::Or;
+    case LoopVectorizationLegality::RK_IntegerAnd:
+      return Instruction::And;
+    case LoopVectorizationLegality::RK_IntegerXor:
+      return Instruction::Xor;
+    case LoopVectorizationLegality::RK_FloatMult:
+      return Instruction::FMul;
+    case LoopVectorizationLegality::RK_FloatAdd:
+      return Instruction::FAdd;
+    default:
+      llvm_unreachable("Unknown reduction operation");
+  }
+}
+
+void
+InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+  //===------------------------------------------------===//
+  //
+  // Notice: any optimization or new instruction that go
+  // into the code below should be also be implemented in
+  // the cost-model.
+  //
+  //===------------------------------------------------===//
+  Constant *Zero = Builder.getInt32(0);
+
+  // In order to support reduction variables we need to be able to vectorize
+  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
+  // stages. First, we create a new vector PHI node with no incoming edges.
+  // We use this value when we vectorize all of the instructions that use the
+  // PHI. Next, after all of the instructions in the block are complete we
+  // add the new incoming edges to the PHI. At this point all of the
+  // instructions in the basic block are vectorized, so we can use them to
+  // construct the PHI.
+  PhiVector RdxPHIsToFix;
+
+  // Scan the loop in a topological order to ensure that defs are vectorized
+  // before users.
+  LoopBlocksDFS DFS(OrigLoop);
+  DFS.perform(LI);
+
+  // Vectorize all of the blocks in the original loop.
+  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+       be = DFS.endRPO(); bb != be; ++bb)
+    vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
+
+  // At this point every instruction in the original loop is widened to
+  // a vector form. We are almost done. Now, we need to fix the PHI nodes
+  // that we vectorized. The PHI nodes are currently empty because we did
+  // not want to introduce cycles. Notice that the remaining PHI nodes
+  // that we need to fix are reduction variables.
+
+  // Create the 'reduced' values for each of the induction vars.
+  // The reduced values are the vector values that we scalarize and combine
+  // after the loop is finished.
+  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
+       it != e; ++it) {
+    PHINode *RdxPhi = *it;
+    assert(RdxPhi && "Unable to recover vectorized PHI");
+
+    // Find the reduction variable descriptor.
+    assert(Legal->getReductionVars()->count(RdxPhi) &&
+           "Unable to find the reduction variable");
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
+    (*Legal->getReductionVars())[RdxPhi];
+
+    // We need to generate a reduction vector from the incoming scalar.
+    // To do so, we need to generate the 'identity' vector and overide
+    // one of the elements with the incoming scalar reduction. We need
+    // to do it in the vector-loop preheader.
+    Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
+
+    // This is the vector-clone of the value that leaves the loop.
+    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
+    Type *VecTy = VectorExit[0]->getType();
+
+    // Find the reduction identity variable. Zero for addition, or, xor,
+    // one for multiplication, -1 for And.
+    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
+    Constant *Identity = ConstantVector::getSplat(VF, Iden);
+
+    // This vector is the Identity vector where the first element is the
+    // incoming scalar reduction.
+    Value *VectorStart = Builder.CreateInsertElement(Identity,
+                                                     RdxDesc.StartValue, Zero);
+
+    // Fix the vector-loop phi.
+    // We created the induction variable so we know that the
+    // preheader is the first entry.
+    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
+
+    // Reductions do not have to start at zero. They can start with
+    // any loop invariant values.
+    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
+    BasicBlock *Latch = OrigLoop->getLoopLatch();
+    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
+    VectorParts &Val = getVectorValue(LoopVal);
+    for (unsigned part = 0; part < UF; ++part) {
+      // Make sure to add the reduction stat value only to the 
+      // first unroll part.
+      Value *StartVal = (part == 0) ? VectorStart : Identity;
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
+      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
+    }
+
+    // Before each round, move the insertion point right between
+    // the PHIs and the values we are going to write.
+    // This allows us to write both PHINodes and the extractelement
+    // instructions.
+    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
+
+    VectorParts RdxParts;
+    for (unsigned part = 0; part < UF; ++part) {
+      // This PHINode contains the vectorized reduction variable, or
+      // the initial value vector, if we bypass the vector loop.
+      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
+      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+      Value *StartVal = (part == 0) ? VectorStart : Identity;
+      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
+      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
+      RdxParts.push_back(NewPhi);
+    }
+
+    // Reduce all of the unrolled parts into a single vector.
+    Value *ReducedPartRdx = RdxParts[0];
+    for (unsigned part = 1; part < UF; ++part) {
+      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
+      ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
+                                           "bin.rdx");
+    }
+
+    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+    // and vector ops, reducing the set of values being computed by half each
+    // round.
+    assert(isPowerOf2_32(VF) &&
+           "Reduction emission only supported for pow2 vectors!");
+    Value *TmpVec = ReducedPartRdx;
+    SmallVector<Constant*, 32> ShuffleMask(VF, 0);
+    for (unsigned i = VF; i != 1; i >>= 1) {
+      // Move the upper half of the vector to the lower half.
+      for (unsigned j = 0; j != i/2; ++j)
+        ShuffleMask[j] = Builder.getInt32(i/2 + j);
+
+      // Fill the rest of the mask with undef.
+      std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
+                UndefValue::get(Builder.getInt32Ty()));
+
+      Value *Shuf =
+        Builder.CreateShuffleVector(TmpVec,
+                                    UndefValue::get(TmpVec->getType()),
+                                    ConstantVector::get(ShuffleMask),
+                                    "rdx.shuf");
+
+      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
+      TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx");
+    }
+
+    // The result is in the first element of the vector.
+    Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
+    // Now, we need to fix the users of the reduction variable
+    // inside and outside of the scalar remainder loop.
+    // We know that the loop is in LCSSA form. We need to update the
+    // PHI nodes in the exit blocks.
+    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+      if (!LCSSAPhi) continue;
+
+      // All PHINodes need to have a single entry edge, or two if
+      // we already fixed them.
+      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+      // We found our reduction value exit-PHI. Update it with the
+      // incoming bypass edge.
+      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
+        // Add an edge coming from the bypass.
+        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
+        break;
+      }
+    }// end of the LCSSA phi scan.
+
+    // Fix the scalar loop reduction variable with the incoming reduction sum
+    // from the vector body and from the backedge value.
+    int IncomingEdgeBlockIdx =
+    (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
+    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+    // Pick the other block.
+    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
+    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
+  }// end of for each redux variable.
+
+  // The Loop exit block may have single value PHI nodes where the incoming
+  // value is 'undef'. While vectorizing we only handled real values that
+  // were defined inside the loop. Here we handle the 'undef case'.
+  // See PR14725.
+  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+       LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+    PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+    if (!LCSSAPhi) continue;
+    if (LCSSAPhi->getNumIncomingValues() == 1)
+      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
+                            LoopMiddleBlock);
+  }
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
+         "Invalid edge");
+
+  VectorParts SrcMask = createBlockInMask(Src);
+
+  // The terminator has to be a branch inst!
+  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+  assert(BI && "Unexpected terminator found");
+
+  if (BI->isConditional()) {
+    VectorParts EdgeMask = getVectorValue(BI->getCondition());
+
+    if (BI->getSuccessor(0) != Dst)
+      for (unsigned part = 0; part < UF; ++part)
+        EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
+
+    for (unsigned part = 0; part < UF; ++part)
+      EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
+    return EdgeMask;
+  }
+
+  return SrcMask;
+}
+
+InnerLoopVectorizer::VectorParts
+InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+  // Loop incoming mask is all-one.
+  if (OrigLoop->getHeader() == BB) {
+    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
+    return getVectorValue(C);
+  }
+
+  // This is the block mask. We OR all incoming edges, and with zero.
+  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
+  VectorParts BlockMask = getVectorValue(Zero);
+
+  // For each pred:
+  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
+    VectorParts EM = createEdgeMask(*it, BB);
+    for (unsigned part = 0; part < UF; ++part)
+      BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
+  }
+
+  return BlockMask;
+}
+
+void
+InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
+                                          BasicBlock *BB, PhiVector *PV) {
+  // For each instruction in the old loop.
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    VectorParts &Entry = WidenMap.get(it);
+    switch (it->getOpcode()) {
+    case Instruction::Br:
+      // Nothing to do for PHIs and BR, since we already took care of the
+      // loop control flow instructions.
+      continue;
+    case Instruction::PHI:{
+      PHINode* P = cast<PHINode>(it);
+      // Handle reduction variables:
+      if (Legal->getReductionVars()->count(P)) {
+        for (unsigned part = 0; part < UF; ++part) {
+          // This is phase one of vectorizing PHIs.
+          Type *VecTy = VectorType::get(it->getType(), VF);
+          Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
+                                        LoopVectorBody-> getFirstInsertionPt());
+        }
+        PV->push_back(P);
+        continue;
+      }
+
+      // Check for PHI nodes that are lowered to vector selects.
+      if (P->getParent() != OrigLoop->getHeader()) {
+        // We know that all PHIs in non header blocks are converted into
+        // selects, so we don't have to worry about the insertion order and we
+        // can just use the builder.
+
+        // At this point we generate the predication tree. There may be
+        // duplications since this is a simple recursive scan, but future
+        // optimizations will clean it up.
+        VectorParts Cond = createEdgeMask(P->getIncomingBlock(0),
+                                               P->getParent());
+
+        for (unsigned part = 0; part < UF; ++part) {
+        VectorParts &In0 = getVectorValue(P->getIncomingValue(0));
+        VectorParts &In1 = getVectorValue(P->getIncomingValue(1));
+          Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part],
+                                             "predphi");
+        }
+        continue;
+      }
+
+      // This PHINode must be an induction variable.
+      // Make sure that we know about it.
+      assert(Legal->getInductionVars()->count(P) &&
+             "Not an induction variable");
+
+      LoopVectorizationLegality::InductionInfo II =
+        Legal->getInductionVars()->lookup(P);
+
+      switch (II.IK) {
+      case LoopVectorizationLegality::IK_NoInduction:
+        llvm_unreachable("Unknown induction");
+      case LoopVectorizationLegality::IK_IntInduction: {
+        assert(P == OldInduction && "Unexpected PHI");
+        Value *Broadcasted = getBroadcastInstrs(Induction);
+        // After broadcasting the induction variable we need to make the
+        // vector consecutive by adding 0, 1, 2 ...
+        for (unsigned part = 0; part < UF; ++part)
+          Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
+        continue;
+      }
+      case LoopVectorizationLegality::IK_ReverseIntInduction:
+      case LoopVectorizationLegality::IK_PtrInduction:
+      case LoopVectorizationLegality::IK_ReversePtrInduction:
+        // Handle reverse integer and pointer inductions.
+        Value *StartIdx = 0;
+        // If we have a single integer induction variable then use it.
+        // Otherwise, start counting at zero.
+        if (OldInduction) {
+          LoopVectorizationLegality::InductionInfo OldII =
+            Legal->getInductionVars()->lookup(OldInduction);
+          StartIdx = OldII.StartValue;
+        } else {
+          StartIdx = ConstantInt::get(Induction->getType(), 0);
+        }
+        // This is the normalized GEP that starts counting at zero.
+        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                 "normalized.idx");
+
+        // Handle the reverse integer induction variable case.
+        if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
+          IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
+          Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
+                                                 "resize.norm.idx");
+          Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
+                                                 "reverse.idx");
+
+          // This is a new value so do not hoist it out.
+          Value *Broadcasted = getBroadcastInstrs(ReverseInd);
+          // After broadcasting the induction variable we need to make the
+          // vector consecutive by adding  ... -3, -2, -1, 0.
+          for (unsigned part = 0; part < UF; ++part)
+            Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true);
+          continue;
+        }
+
+        // Handle the pointer induction variable case.
+        assert(P->getType()->isPointerTy() && "Unexpected type.");
+
+        // Is this a reverse induction ptr or a consecutive induction ptr.
+        bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
+                        II.IK);
+
+        // This is the vector of results. Notice that we don't generate
+        // vector geps because scalar geps result in better code.
+        for (unsigned part = 0; part < UF; ++part) {
+          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+          for (unsigned int i = 0; i < VF; ++i) {
+            int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
+            Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
+            Value *GlobalIdx;
+            if (!Reverse)
+              GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+            else
+              GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
+
+            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+                                               "next.gep");
+            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                                 Builder.getInt32(i),
+                                                 "insert.gep");
+          }
+          Entry[part] = VecVal;
+        }
+        continue;
+      }
+
+    }// End of PHI.
+
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      // Just widen binops.
+      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
+      VectorParts &A = getVectorValue(it->getOperand(0));
+      VectorParts &B = getVectorValue(it->getOperand(1));
+
+      // Use this vector value for all users of the original instruction.
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+
+        // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
+        BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
+        if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
+          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+        }
+        if (VecOp && isa<PossiblyExactOperator>(VecOp))
+          VecOp->setIsExact(BinOp->isExact());
+
+        Entry[Part] = V;
+      }
+      break;
+    }
+    case Instruction::Select: {
+      // Widen selects.
+      // If the selector is loop invariant we can create a select
+      // instruction with a scalar condition. Otherwise, use vector-select.
+      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
+                                               OrigLoop);
+
+      // The condition can be loop invariant  but still defined inside the
+      // loop. This means that we can't just use the original 'cond' value.
+      // We have to take the 'vectorized' value and pick the first lane.
+      // Instcombine will make this a no-op.
+      VectorParts &Cond = getVectorValue(it->getOperand(0));
+      VectorParts &Op0  = getVectorValue(it->getOperand(1));
+      VectorParts &Op1  = getVectorValue(it->getOperand(2));
+      Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
+                                                       Builder.getInt32(0));
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        Entry[Part] = Builder.CreateSelect(
+          InvariantCond ? ScalarCond : Cond[Part],
+          Op0[Part],
+          Op1[Part]);
+      }
+      break;
+    }
+
+    case Instruction::ICmp:
+    case Instruction::FCmp: {
+      // Widen compares. Generate vector compares.
+      bool FCmp = (it->getOpcode() == Instruction::FCmp);
+      CmpInst *Cmp = dyn_cast<CmpInst>(it);
+      VectorParts &A = getVectorValue(it->getOperand(0));
+      VectorParts &B = getVectorValue(it->getOperand(1));
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        Value *C = 0;
+        if (FCmp)
+          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
+        else
+          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
+        Entry[Part] = C;
+      }
+      break;
+    }
+
+    case Instruction::Store:
+    case Instruction::Load:
+        vectorizeMemoryInstruction(it, Legal);
+        break;
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      CastInst *CI = dyn_cast<CastInst>(it);
+      /// Optimize the special case where the source is the induction
+      /// variable. Notice that we can only optimize the 'trunc' case
+      /// because: a. FP conversions lose precision, b. sext/zext may wrap,
+      /// c. other casts depend on pointer size.
+      if (CI->getOperand(0) == OldInduction &&
+          it->getOpcode() == Instruction::Trunc) {
+        Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
+                                               CI->getType());
+        Value *Broadcasted = getBroadcastInstrs(ScalarCast);
+        for (unsigned Part = 0; Part < UF; ++Part)
+          Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
+        break;
+      }
+      /// Vectorize casts.
+      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+
+      VectorParts &A = getVectorValue(it->getOperand(0));
+      for (unsigned Part = 0; Part < UF; ++Part)
+        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
+      break;
+    }
+
+    case Instruction::Call: {
+      // Ignore dbg intrinsics.
+      if (isa<DbgInfoIntrinsic>(it))
+        break;
+
+      Module *M = BB->getParent()->getParent();
+      CallInst *CI = cast<CallInst>(it);
+      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+      assert(ID && "Not an intrinsic call!");
+      for (unsigned Part = 0; Part < UF; ++Part) {
+        SmallVector<Value*, 4> Args;
+        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+          VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
+          Args.push_back(Arg[Part]);
+        }
+        Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
+        Function *F = Intrinsic::getDeclaration(M, ID, Tys);
+        Entry[Part] = Builder.CreateCall(F, Args);
+      }
+      break;
+    }
+
+    default:
+      // All other instructions are unsupported. Scalarize them.
+      scalarizeInstruction(it);
+      break;
+    }// end of switch.
+  }// end of for_each instr.
+}
+
+void InnerLoopVectorizer::updateAnalysis() {
+  // Forget the original basic block.
+  SE->forgetLoop(OrigLoop);
+
+  // Update the dominator tree information.
+  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
+         "Entry does not dominate exit.");
+
+  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
+    DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
+  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
+  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
+  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+
+  DEBUG(DT->verifyAnalysis());
+}
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+  if (!EnableIfConversion)
+    return false;
+
+  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
+
+  // Collect the blocks that need predication.
+  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
+    BasicBlock *BB = LoopBlocks[i];
+
+    // We don't support switch statements inside loops.
+    if (!isa<BranchInst>(BB->getTerminator()))
+      return false;
+
+    // We must have at most two predecessors because we need to convert
+    // all PHIs to selects.
+    unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
+    if (Preds > 2)
+      return false;
+
+    // We must be able to predicate all blocks that need to be predicated.
+    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
+      return false;
+  }
+
+  // We can if-convert this loop.
+  return true;
+}
+
+bool LoopVectorizationLegality::canVectorize() {
+  assert(TheLoop->getLoopPreheader() && "No preheader!!");
+
+  // We can only vectorize innermost loops.
+  if (TheLoop->getSubLoopsVector().size())
+    return false;
+
+  // We must have a single backedge.
+  if (TheLoop->getNumBackEdges() != 1)
+    return false;
+
+  // We must have a single exiting block.
+  if (!TheLoop->getExitingBlock())
+    return false;
+
+  unsigned NumBlocks = TheLoop->getNumBlocks();
+
+  // Check if we can if-convert non single-bb loops.
+  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
+    return false;
+  }
+
+  // We need to have a loop header.
+  BasicBlock *Latch = TheLoop->getLoopLatch();
+  DEBUG(dbgs() << "LV: Found a loop: " <<
+        TheLoop->getHeader()->getName() << "\n");
+
+  // ScalarEvolution needs to be able to find the exit count.
+  const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
+  if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+    return false;
+  }
+
+  // Do not loop-vectorize loops with a tiny trip count.
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
+  if (TC > 0u && TC < TinyTripCountVectorThreshold) {
+    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
+          "This loop is not worth vectorizing.\n");
+    return false;
+  }
+
+  // Check if we can vectorize the instructions and CFG in this loop.
+  if (!canVectorizeInstrs()) {
+    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+    return false;
+  }
+
+  // Go over each instruction and look at memory deps.
+  if (!canVectorizeMemory()) {
+    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+    return false;
+  }
+
+  // Collect all of the variables that remain uniform after vectorization.
+  collectLoopUniforms();
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+        <<"!\n");
+
+  // Okay! We can vectorize. At this point we don't have any other mem analysis
+  // which may limit our maximum vectorization factor, so just return true with
+  // no restrictions.
+  return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+  BasicBlock *Header = TheLoop->getHeader();
+
+  // If we marked the scalar loop as "already vectorized" then no need
+  // to vectorize it again.
+  if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) {
+    DEBUG(dbgs() << "LV: This loop was vectorized before\n");
+    return false;
+  }
+
+  // For each block in the loop.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+
+    // Scan the instructions in the block and look for hazards.
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
+
+      if (PHINode *Phi = dyn_cast<PHINode>(it)) {
+        // This should not happen because the loop should be normalized.
+        if (Phi->getNumIncomingValues() != 2) {
+          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+          return false;
+        }
+
+        // Check that this PHI type is allowed.
+        if (!Phi->getType()->isIntegerTy() &&
+            !Phi->getType()->isFloatingPointTy() &&
+            !Phi->getType()->isPointerTy()) {
+          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
+          return false;
+        }
+
+        // If this PHINode is not in the header block, then we know that we
+        // can convert it to select during if-conversion. No need to check if
+        // the PHIs in this block are induction or reduction variables.
+        if (*bb != Header)
+          continue;
+
+        // This is the value coming from the preheader.
+        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+        // Check if this is an induction variable.
+        InductionKind IK = isInductionVariable(Phi);
+
+        if (IK_NoInduction != IK) {
+          // Int inductions are special because we only allow one IV.
+          if (IK == IK_IntInduction) {
+            if (Induction) {
+              DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+              return false;
+            }
+            Induction = Phi;
+          }
+
+          DEBUG(dbgs() << "LV: Found an induction variable.\n");
+          Inductions[Phi] = InductionInfo(StartValue, IK);
+          continue;
+        }
+
+        if (AddReductionVar(Phi, RK_IntegerAdd)) {
+          DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, RK_IntegerMult)) {
+          DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, RK_IntegerOr)) {
+          DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, RK_IntegerAnd)) {
+          DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, RK_IntegerXor)) {
+          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, RK_FloatMult)) {
+          DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, RK_FloatAdd)) {
+          DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+
+        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+        return false;
+      }// end of PHI handling
+
+      // We still don't handle functions. However, we can ignore dbg intrinsic
+      // calls and we do handle certain intrinsic and libm functions.
+      CallInst *CI = dyn_cast<CallInst>(it);
+      if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
+        DEBUG(dbgs() << "LV: Found a call site.\n");
+        return false;
+      }
+
+      // Check that the instruction return type is vectorizable.
+      if (!VectorType::isValidElementType(it->getType()) &&
+          !it->getType()->isVoidTy()) {
+        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+        return false;
+      }
+
+      // Check that the stored type is vectorizable.
+      if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
+        Type *T = ST->getValueOperand()->getType();
+        if (!VectorType::isValidElementType(T))
+          return false;
+      }
+
+      // Reduction instructions are allowed to have exit users.
+      // All other instructions must not have external users.
+      if (!AllowedExit.count(it))
+        //Check that all of the users of the loop are inside the BB.
+        for (Value::use_iterator I = it->use_begin(), E = it->use_end();
+             I != E; ++I) {
+          Instruction *U = cast<Instruction>(*I);
+          // This user may be a reduction exit value.
+          if (!TheLoop->contains(U)) {
+            DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+            return false;
+          }
+        }
+    } // next instr.
+
+  }
+
+  if (!Induction) {
+    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+    assert(getInductionVars()->size() && "No induction variables");
+  }
+
+  return true;
+}
+
+void LoopVectorizationLegality::collectLoopUniforms() {
+  // We now know that the loop is vectorizable!
+  // Collect variables that will remain uniform after vectorization.
+  std::vector<Value*> Worklist;
+  BasicBlock *Latch = TheLoop->getLoopLatch();
+
+  // Start with the conditional branch and walk up the block.
+  Worklist.push_back(Latch->getTerminator()->getOperand(0));
+
+  while (Worklist.size()) {
+    Instruction *I = dyn_cast<Instruction>(Worklist.back());
+    Worklist.pop_back();
+
+    // Look at instructions inside this loop.
+    // Stop when reaching PHI nodes.
+    // TODO: we need to follow values all over the loop, not only in this block.
+    if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
+      continue;
+
+    // This is a known uniform.
+    Uniforms.insert(I);
+
+    // Insert all operands.
+    for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
+      Worklist.push_back(I->getOperand(i));
+    }
+  }
+}
+
+AliasAnalysis::Location
+LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
+  if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+    return AA->getLocation(Store);
+  else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+    return AA->getLocation(Load);
+
+  llvm_unreachable("Should be either load or store instruction");
+}
+
+bool
+LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
+                                                Value *Object,
+                                                Instruction *Inst,
+                                                AliasMultiMap& WriteObjects,
+                                                unsigned MaxByteWidth) {
+
+  AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
+
+  std::vector<Instruction*>::iterator
+              it = WriteObjects[Object].begin(),
+              end = WriteObjects[Object].end();
+
+  for (; it != end; ++it) {
+    Instruction* I = *it;
+    if (I == Inst)
+      continue;
+
+    AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
+    if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
+                  ThatLoc.getWithNewSize(MaxByteWidth)))
+      return true;
+  }
+  return false;
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+
+  if (TheLoop->isAnnotatedParallel()) {
+    DEBUG(dbgs()
+          << "LV: A loop annotated parallel, ignore memory dependency "
+          << "checks.\n");
+    return true;
+  }
+
+  typedef SmallVector<Value*, 16> ValueVector;
+  typedef SmallPtrSet<Value*, 16> ValueSet;
+  // Holds the Load and Store *instructions*.
+  ValueVector Loads;
+  ValueVector Stores;
+  PtrRtCheck.Pointers.clear();
+  PtrRtCheck.Need = false;
+
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+
+    // Scan the BB and collect legal loads and stores.
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
+
+      // If this is a load, save it. If this instruction can read from memory
+      // but is not a load, then we quit. Notice that we don't handle function
+      // calls that read or write.
+      if (it->mayReadFromMemory()) {
+        LoadInst *Ld = dyn_cast<LoadInst>(it);
+        if (!Ld) return false;
+        if (!Ld->isSimple()) {
+          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+          return false;
+        }
+        Loads.push_back(Ld);
+        continue;
+      }
+
+      // Save 'store' instructions. Abort if other instructions write to memory.
+      if (it->mayWriteToMemory()) {
+        StoreInst *St = dyn_cast<StoreInst>(it);
+        if (!St) return false;
+        if (!St->isSimple()) {
+          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+          return false;
+        }
+        Stores.push_back(St);
+      }
+    } // next instr.
+  } // next block.
+
+  // Now we have two lists that hold the loads and the stores.
+  // Next, we find the pointers that they use.
+
+  // Check if we see any stores. If there are no stores, then we don't
+  // care if the pointers are *restrict*.
+  if (!Stores.size()) {
+    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+    return true;
+  }
+
+  // Holds the read and read-write *pointers* that we find. These maps hold
+  // unique values for pointers (so no need for multi-map).
+  AliasMap Reads;
+  AliasMap ReadWrites;
+
+  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
+  // multiple times on the same object. If the ptr is accessed twice, once
+  // for read and once for write, it will only appear once (on the write
+  // list). This is okay, since we are going to check for conflicts between
+  // writes and between reads and writes, but not between reads and reads.
+  ValueSet Seen;
+
+  ValueVector::iterator I, IE;
+  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
+    StoreInst *ST = cast<StoreInst>(*I);
+    Value* Ptr = ST->getPointerOperand();
+
+    if (isUniform(Ptr)) {
+      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+      return false;
+    }
+
+    // If we did *not* see this pointer before, insert it to
+    // the read-write list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr))
+      ReadWrites.insert(std::make_pair(Ptr, ST));
+  }
+
+  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
+    LoadInst *LD = cast<LoadInst>(*I);
+    Value* Ptr = LD->getPointerOperand();
+    // If we did *not* see this pointer before, insert it to the
+    // read list. If we *did* see it before, then it is already in
+    // the read-write list. This allows us to vectorize expressions
+    // such as A[i] += x;  Because the address of A[i] is a read-write
+    // pointer. This only works if the index of A[i] is consecutive.
+    // If the address of i is unknown (for example A[B[i]]) then we may
+    // read a few words, modify, and write a few words, and some of the
+    // words may be written to the same address.
+    if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
+      Reads.insert(std::make_pair(Ptr, LD));
+  }
+
+  // If we write (or read-write) to a single destination and there are no
+  // other reads in this loop then is it safe to vectorize.
+  if (ReadWrites.size() == 1 && Reads.size() == 0) {
+    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
+    return true;
+  }
+
+  // Find pointers with computable bounds. We are going to use this information
+  // to place a runtime bound check.
+  bool CanDoRT = true;
+  AliasMap::iterator MI, ME;
+  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+    Value *V = (*MI).first;
+    if (hasComputableBounds(V)) {
+      PtrRtCheck.insert(SE, TheLoop, V);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
+    } else {
+      CanDoRT = false;
+      break;
+    }
+  }
+  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+    Value *V = (*MI).first;
+    if (hasComputableBounds(V)) {
+      PtrRtCheck.insert(SE, TheLoop, V);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
+    } else {
+      CanDoRT = false;
+      break;
+    }
+  }
+
+  // Check that we did not collect too many pointers or found a
+  // unsizeable pointer.
+  if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+    PtrRtCheck.reset();
+    CanDoRT = false;
+  }
+
+  if (CanDoRT) {
+    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+  }
+
+  bool NeedRTCheck = false;
+
+  // Biggest vectorized access possible, vector width * unroll factor.
+  // TODO: We're being very pessimistic here, find a way to know the
+  // real access width before getting here.
+  unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
+                           TTI->getMaximumUnrollFactor();
+  // Now that the pointers are in two lists (Reads and ReadWrites), we
+  // can check that there are no conflicts between each of the writes and
+  // between the writes to the reads.
+  // Note that WriteObjects duplicates the stores (indexed now by underlying
+  // objects) to avoid pointing to elements inside ReadWrites.
+  // TODO: Maybe create a new type where they can interact without duplication.
+  AliasMultiMap WriteObjects;
+  ValueVector TempObjects;
+
+  // Check that the read-writes do not conflict with other read-write
+  // pointers.
+  bool AllWritesIdentified = true;
+  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+    Value *Val = (*MI).first;
+    Instruction *Inst = (*MI).second;
+
+    GetUnderlyingObjects(Val, TempObjects, DL);
+    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+         UI != UE; ++UI) {
+      if (!isIdentifiedObject(*UI)) {
+        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
+        NeedRTCheck = true;
+        AllWritesIdentified = false;
+      }
+
+      // Never seen it before, can't alias.
+      if (WriteObjects[*UI].empty()) {
+        DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
+        WriteObjects[*UI].push_back(Inst);
+        continue;
+      }
+      // Direct alias found.
+      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+              << **UI <<"\n");
+        return false;
+      }
+      DEBUG(dbgs() << "LV: Found a conflicting global value:"
+            << **UI <<"\n");
+      DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
+      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+      // If global alias, make sure they do alias.
+      if (hasPossibleGlobalWriteReorder(*UI,
+                                        Inst,
+                                        WriteObjects,
+                                        MaxByteWidth)) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+              << *UI <<"\n");
+        return false;
+      }
+
+      // Didn't alias, insert into map for further reference.
+      WriteObjects[*UI].push_back(Inst);
+    }
+    TempObjects.clear();
+  }
+
+  /// Check that the reads don't conflict with the read-writes.
+  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+    Value *Val = (*MI).first;
+    GetUnderlyingObjects(Val, TempObjects, DL);
+    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+         UI != UE; ++UI) {
+      // If all of the writes are identified then we don't care if the read
+      // pointer is identified or not.
+      if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
+        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
+        NeedRTCheck = true;
+      }
+
+      // Never seen it before, can't alias.
+      if (WriteObjects[*UI].empty())
+        continue;
+      // Direct alias found.
+      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+              << **UI <<"\n");
+        return false;
+      }
+      DEBUG(dbgs() << "LV: Found a global value:  "
+            << **UI <<"\n");
+      Instruction *Inst = (*MI).second;
+      DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
+      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+      // If global alias, make sure they do alias.
+      if (hasPossibleGlobalWriteReorder(*UI,
+                                        Inst,
+                                        WriteObjects,
+                                        MaxByteWidth)) {
+        DEBUG(dbgs() << "LV: Found a possible read-write reorder:"
+              << *UI <<"\n");
+        return false;
+      }
+    }
+    TempObjects.clear();
+  }
+
+  PtrRtCheck.Need = NeedRTCheck;
+  if (NeedRTCheck && !CanDoRT) {
+    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
+          "the array bounds.\n");
+    PtrRtCheck.reset();
+    return false;
+  }
+
+  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
+        " need a runtime memory check.\n");
+  return true;
+}
+
+bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
+                                                ReductionKind Kind) {
+  if (Phi->getNumIncomingValues() != 2)
+    return false;
+
+  // Reduction variables are only found in the loop header block.
+  if (Phi->getParent() != TheLoop->getHeader())
+    return false;
+
+  // Obtain the reduction start value from the value that comes from the loop
+  // preheader.
+  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
+
+  // ExitInstruction is the single value which is used outside the loop.
+  // We only allow for a single reduction value to be used outside the loop.
+  // This includes users of the reduction, variables (which form a cycle
+  // which ends in the phi node).
+  Instruction *ExitInstruction = 0;
+  // Indicates that we found a binary operation in our scan.
+  bool FoundBinOp = false;
+
+  // Iter is our iterator. We start with the PHI node and scan for all of the
+  // users of this instruction. All users must be instructions that can be
+  // used as reduction variables (such as ADD). We may have a single
+  // out-of-block user. The cycle must end with the original PHI.
+  Instruction *Iter = Phi;
+  while (true) {
+    // If the instruction has no users then this is a broken
+    // chain and can't be a reduction variable.
+    if (Iter->use_empty())
+      return false;
+
+    // Did we find a user inside this loop already ?
+    bool FoundInBlockUser = false;
+    // Did we reach the initial PHI node already ?
+    bool FoundStartPHI = false;
+
+    // Is this a bin op ?
+    FoundBinOp |= !isa<PHINode>(Iter);
+
+    // Remember the current instruction.
+    Instruction *OldIter = Iter;
+
+    // For each of the *users* of iter.
+    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
+         it != e; ++it) {
+      Instruction *U = cast<Instruction>(*it);
+      // We already know that the PHI is a user.
+      if (U == Phi) {
+        FoundStartPHI = true;
+        continue;
+      }
+
+      // Check if we found the exit user.
+      BasicBlock *Parent = U->getParent();
+      if (!TheLoop->contains(Parent)) {
+        // Exit if you find multiple outside users.
+        if (ExitInstruction != 0)
+          return false;
+        ExitInstruction = Iter;
+      }
+
+      // We allow in-loop PHINodes which are not the original reduction PHI
+      // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
+      // structure) then don't skip this PHI.
+      if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
+          U->getParent() != TheLoop->getHeader() &&
+          TheLoop->contains(U) &&
+          Iter->hasNUsesOrMore(2))
+        continue;
+
+      // We can't have multiple inside users.
+      if (FoundInBlockUser)
+        return false;
+      FoundInBlockUser = true;
+
+      // Any reduction instr must be of one of the allowed kinds.
+      if (!isReductionInstr(U, Kind))
+        return false;
+
+      // Reductions of instructions such as Div, and Sub is only
+      // possible if the LHS is the reduction variable.
+      if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter)
+        return false;
+
+      Iter = U;
+    }
+
+    // If all uses were skipped this can't be a reduction variable.
+    if (Iter == OldIter)
+      return false;
+
+    // We found a reduction var if we have reached the original
+    // phi node and we only have a single instruction with out-of-loop
+    // users.
+    if (FoundStartPHI) {
+      // This instruction is allowed to have out-of-loop users.
+      AllowedExit.insert(ExitInstruction);
+
+      // Save the description of this reduction variable.
+      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+      Reductions[Phi] = RD;
+      // We've ended the cycle. This is a reduction variable if we have an
+      // outside user and it has a binary op.
+      return FoundBinOp && ExitInstruction;
+    }
+  }
+}
+
+bool
+LoopVectorizationLegality::isReductionInstr(Instruction *I,
+                                            ReductionKind Kind) {
+  bool FP = I->getType()->isFloatingPointTy();
+  bool FastMath = (FP && I->isCommutative() && I->isAssociative());
+
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case Instruction::PHI:
+      if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
+        return false;
+    // possibly.
+    return true;
+  case Instruction::Sub:
+  case Instruction::Add:
+    return Kind == RK_IntegerAdd;
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+  case Instruction::Mul:
+    return Kind == RK_IntegerMult;
+  case Instruction::And:
+    return Kind == RK_IntegerAnd;
+  case Instruction::Or:
+    return Kind == RK_IntegerOr;
+  case Instruction::Xor:
+    return Kind == RK_IntegerXor;
+  case Instruction::FMul:
+    return Kind == RK_FloatMult && FastMath;
+  case Instruction::FAdd:
+    return Kind == RK_FloatAdd && FastMath;
+   }
+}
+
+LoopVectorizationLegality::InductionKind
+LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+  Type *PhiTy = Phi->getType();
+  // We only handle integer and pointer inductions variables.
+  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+    return IK_NoInduction;
+
+  // Check that the PHI is consecutive.
+  const SCEV *PhiScev = SE->getSCEV(Phi);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+  if (!AR) {
+    DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+    return IK_NoInduction;
+  }
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+
+  // Integer inductions need to have a stride of one.
+  if (PhiTy->isIntegerTy()) {
+    if (Step->isOne())
+      return IK_IntInduction;
+    if (Step->isAllOnesValue())
+      return IK_ReverseIntInduction;
+    return IK_NoInduction;
+  }
+
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C)
+    return IK_NoInduction;
+
+  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+  if (C->getValue()->equalsInt(Size))
+    return IK_PtrInduction;
+  else if (C->getValue()->equalsInt(0 - Size))
+    return IK_ReversePtrInduction;
+
+  return IK_NoInduction;
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+  Value *In0 = const_cast<Value*>(V);
+  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+  if (!PN)
+    return false;
+
+  return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
+  assert(TheLoop->contains(BB) && "Unknown block used");
+
+  // Blocks that do not dominate the latch need predication.
+  BasicBlock* Latch = TheLoop->getLoopLatch();
+  return !DT->dominates(BB, Latch);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    // We don't predicate loads/stores at the moment.
+    if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
+      return false;
+
+    // The instructions below can trap.
+    switch (it->getOpcode()) {
+    default: continue;
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+             return false;
+    }
+  }
+
+  return true;
+}
+
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+  const SCEV *PhiScev = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+  if (!AR)
+    return false;
+
+  return AR->isAffine();
+}
+
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
+                                                      unsigned UserVF) {
+  // Width 1 means no vectorize
+  VectorizationFactor Factor = { 1U, 0U };
+  if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
+    DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
+    return Factor;
+  }
+
+  // Find the trip count.
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
+  DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+
+  unsigned WidestType = getWidestType();
+  unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+  unsigned MaxVectorSize = WidestRegister / WidestType;
+  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
+  DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
+
+  if (MaxVectorSize == 0) {
+    DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+    MaxVectorSize = 1;
+  }
+
+  assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
+         " into one vector!");
+
+  unsigned VF = MaxVectorSize;
+
+  // If we optimize the program for size, avoid creating the tail loop.
+  if (OptForSize) {
+    // If we are unable to calculate the trip count then don't try to vectorize.
+    if (TC < 2) {
+      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
+      return Factor;
+    }
+
+    // Find the maximum SIMD width that can fit within the trip count.
+    VF = TC % MaxVectorSize;
+
+    if (VF == 0)
+      VF = MaxVectorSize;
+
+    // If the trip count that we found modulo the vectorization factor is not
+    // zero then we require a tail.
+    if (VF < 2) {
+      DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
+      return Factor;
+    }
+  }
+
+  if (UserVF != 0) {
+    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+    DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
+
+    Factor.Width = UserVF;
+    return Factor;
+  }
+
+  float Cost = expectedCost(1);
+  unsigned Width = 1;
+  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+  for (unsigned i=2; i <= VF; i*=2) {
+    // Notice that the vector loop needs to be executed less times, so
+    // we need to divide the cost of the vector loops by the width of
+    // the vector elements.
+    float VectorCost = expectedCost(i) / (float)i;
+    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+          (int)VectorCost << ".\n");
+    if (VectorCost < Cost) {
+      Cost = VectorCost;
+      Width = i;
+    }
+  }
+
+  DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
+  Factor.Width = Width;
+  Factor.Cost = Width * Cost;
+  return Factor;
+}
+
+unsigned LoopVectorizationCostModel::getWidestType() {
+  unsigned MaxWidth = 8;
+
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+    BasicBlock *BB = *bb;
+
+    // For each instruction in the loop.
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      Type *T = it->getType();
+
+      // Only examine Loads, Stores and PHINodes.
+      if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
+        continue;
+
+      // Examine PHI nodes that are reduction variables.
+      if (PHINode *PN = dyn_cast<PHINode>(it))
+        if (!Legal->getReductionVars()->count(PN))
+          continue;
+
+      // Examine the stored values.
+      if (StoreInst *ST = dyn_cast<StoreInst>(it))
+        T = ST->getValueOperand()->getType();
+
+      // Ignore loaded pointer types and stored pointer types that are not
+      // consecutive. However, we do want to take consecutive stores/loads of
+      // pointer vectors into account.
+      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
+        continue;
+
+      MaxWidth = std::max(MaxWidth,
+                          (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
+    }
+  }
+
+  return MaxWidth;
+}
+
+unsigned
+LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
+                                               unsigned UserUF,
+                                               unsigned VF,
+                                               unsigned LoopCost) {
+
+  // -- The unroll heuristics --
+  // We unroll the loop in order to expose ILP and reduce the loop overhead.
+  // There are many micro-architectural considerations that we can't predict
+  // at this level. For example frontend pressure (on decode or fetch) due to
+  // code size, or the number and capabilities of the execution ports.
+  //
+  // We use the following heuristics to select the unroll factor:
+  // 1. If the code has reductions the we unroll in order to break the cross
+  // iteration dependency.
+  // 2. If the loop is really small then we unroll in order to reduce the loop
+  // overhead.
+  // 3. We don't unroll if we think that we will spill registers to memory due
+  // to the increased register pressure.
+
+  // Use the user preference, unless 'auto' is selected.
+  if (UserUF != 0)
+    return UserUF;
+
+  // When we optimize for size we don't unroll.
+  if (OptForSize)
+    return 1;
+
+  // Do not unroll loops with a relatively small trip count.
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
+                                              TheLoop->getLoopLatch());
+  if (TC > 1 && TC < TinyTripCountUnrollThreshold)
+    return 1;
+
+  unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
+  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
+        " vector registers\n");
+
+  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
+  // We divide by these constants so assume that we have at least one
+  // instruction that uses at least one register.
+  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
+  R.NumInstructions = std::max(R.NumInstructions, 1U);
+
+  // We calculate the unroll factor using the following formula.
+  // Subtract the number of loop invariants from the number of available
+  // registers. These registers are used by all of the unrolled instances.
+  // Next, divide the remaining registers by the number of registers that is
+  // required by the loop, in order to estimate how many parallel instances
+  // fit without causing spills.
+  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;
+
+  // Clamp the unroll factor ranges to reasonable factors.
+  unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();
+
+  // If we did not calculate the cost for VF (because the user selected the VF)
+  // then we calculate the cost of VF here.
+  if (LoopCost == 0)
+    LoopCost = expectedCost(VF);
+
+  // Clamp the calculated UF to be between the 1 and the max unroll factor
+  // that the target allows.
+  if (UF > MaxUnrollSize)
+    UF = MaxUnrollSize;
+  else if (UF < 1)
+    UF = 1;
+
+  if (Legal->getReductionVars()->size()) {
+    DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
+    return UF;
+  }
+
+  // We want to unroll tiny loops in order to reduce the loop overhead.
+  // We assume that the cost overhead is 1 and we use the cost model
+  // to estimate the cost of the loop and unroll until the cost of the
+  // loop overhead is about 5% of the cost of the loop.
+  DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
+  if (LoopCost < 20) {
+    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
+    unsigned NewUF = 20/LoopCost + 1;
+    return std::min(NewUF, UF);
+  }
+
+  DEBUG(dbgs() << "LV: Not Unrolling. \n");
+  return 1;
+}
+
+LoopVectorizationCostModel::RegisterUsage
+LoopVectorizationCostModel::calculateRegisterUsage() {
+  // This function calculates the register usage by measuring the highest number
+  // of values that are alive at a single location. Obviously, this is a very
+  // rough estimation. We scan the loop in a topological order in order and
+  // assign a number to each instruction. We use RPO to ensure that defs are
+  // met before their users. We assume that each instruction that has in-loop
+  // users starts an interval. We record every time that an in-loop value is
+  // used, so we have a list of the first and last occurrences of each
+  // instruction. Next, we transpose this data structure into a multi map that
+  // holds the list of intervals that *end* at a specific location. This multi
+  // map allows us to perform a linear search. We scan the instructions linearly
+  // and record each time that a new interval starts, by placing it in a set.
+  // If we find this value in the multi-map then we remove it from the set.
+  // The max register usage is the maximum size of the set.
+  // We also search for instructions that are defined outside the loop, but are
+  // used inside the loop. We need this number separately from the max-interval
+  // usage number because when we unroll, loop-invariant values do not take
+  // more register.
+  LoopBlocksDFS DFS(TheLoop);
+  DFS.perform(LI);
+
+  RegisterUsage R;
+  R.NumInstructions = 0;
+
+  // Each 'key' in the map opens a new interval. The values
+  // of the map are the index of the 'last seen' usage of the
+  // instruction that is the key.
+  typedef DenseMap<Instruction*, unsigned> IntervalMap;
+  // Maps instruction to its index.
+  DenseMap<unsigned, Instruction*> IdxToInstr;
+  // Marks the end of each interval.
+  IntervalMap EndPoint;
+  // Saves the list of instruction indices that are used in the loop.
+  SmallSet<Instruction*, 8> Ends;
+  // Saves the list of values that are used in the loop but are
+  // defined outside the loop, such as arguments and constants.
+  SmallPtrSet<Value*, 8> LoopInvariants;
+
+  unsigned Index = 0;
+  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+       be = DFS.endRPO(); bb != be; ++bb) {
+    R.NumInstructions += (*bb)->size();
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
+      Instruction *I = it;
+      IdxToInstr[Index++] = I;
+
+      // Save the end location of each USE.
+      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+        Value *U = I->getOperand(i);
+        Instruction *Instr = dyn_cast<Instruction>(U);
+
+        // Ignore non-instruction values such as arguments, constants, etc.
+        if (!Instr) continue;
+
+        // If this instruction is outside the loop then record it and continue.
+        if (!TheLoop->contains(Instr)) {
+          LoopInvariants.insert(Instr);
+          continue;
+        }
+
+        // Overwrite previous end points.
+        EndPoint[Instr] = Index;
+        Ends.insert(Instr);
+      }
+    }
+  }
+
+  // Saves the list of intervals that end with the index in 'key'.
+  typedef SmallVector<Instruction*, 2> InstrList;
+  DenseMap<unsigned, InstrList> TransposeEnds;
+
+  // Transpose the EndPoints to a list of values that end at each index.
+  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
+       it != e; ++it)
+    TransposeEnds[it->second].push_back(it->first);
+
+  SmallSet<Instruction*, 8> OpenIntervals;
+  unsigned MaxUsage = 0;
+
+
+  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+  for (unsigned int i = 0; i < Index; ++i) {
+    Instruction *I = IdxToInstr[i];
+    // Ignore instructions that are never used within the loop.
+    if (!Ends.count(I)) continue;
+
+    // Remove all of the instructions that end at this location.
+    InstrList &List = TransposeEnds[i];
+    for (unsigned int j=0, e = List.size(); j < e; ++j)
+      OpenIntervals.erase(List[j]);
+
+    // Count the number of live interals.
+    MaxUsage = std::max(MaxUsage, OpenIntervals.size());
+
+    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
+          OpenIntervals.size() <<"\n");
+
+    // Add the current instruction to the list of open intervals.
+    OpenIntervals.insert(I);
+  }
+
+  unsigned Invariant = LoopInvariants.size();
+  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
+  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
+  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
+
+  R.LoopInvariantRegs = Invariant;
+  R.MaxLocalUsers = MaxUsage;
+  return R;
+}
+
+unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
+  unsigned Cost = 0;
+
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+    unsigned BlockCost = 0;
+    BasicBlock *BB = *bb;
+
+    // For each instruction in the old loop.
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      // Skip dbg intrinsics.
+      if (isa<DbgInfoIntrinsic>(it))
+        continue;
+
+      unsigned C = getInstructionCost(it, VF);
+      Cost += C;
+      DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
+            VF << " For instruction: "<< *it << "\n");
+    }
+
+    // We assume that if-converted blocks have a 50% chance of being executed.
+    // When the code is scalar then some of the blocks are avoided due to CF.
+    // When the code is vectorized we execute all code paths.
+    if (Legal->blockNeedsPredication(*bb) && VF == 1)
+      BlockCost /= 2;
+
+    Cost += BlockCost;
+  }
+
+  return Cost;
+}
+
+unsigned
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (Legal->isUniformAfterVectorization(I))
+    VF = 1;
+
+  Type *RetTy = I->getType();
+  Type *VectorTy = ToVectorTy(RetTy, VF);
+
+  // TODO: We need to estimate the cost of intrinsic calls.
+  switch (I->getOpcode()) {
+  case Instruction::GetElementPtr:
+    // We mark this instruction as zero-cost because the cost of GEPs in
+    // vectorized code depends on whether the corresponding memory instruction
+    // is scalarized or not. Therefore, we handle GEPs with the memory
+    // instruction cost.
+    return 0;
+  case Instruction::Br: {
+    return TTI.getCFInstrCost(I->getOpcode());
+  }
+  case Instruction::PHI:
+    //TODO: IF-converted IFs become selects.
+    return 0;
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this are shifts on x86.
+    TargetTransformInfo::OperandValueKind Op1VK =
+      TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+      TargetTransformInfo::OK_AnyValue;
+
+    if (isa<ConstantInt>(I->getOperand(1)))
+      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+
+    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
+  }
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+    Type *CondTy = SI->getCondition()->getType();
+    if (!ScalarCond)
+      CondTy = VectorType::get(CondTy, VF);
+
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *ValTy = I->getOperand(0)->getType();
+    VectorTy = ToVectorTy(ValTy, VF);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
+  }
+  case Instruction::Store:
+  case Instruction::Load: {
+    StoreInst *SI = dyn_cast<StoreInst>(I);
+    LoadInst *LI = dyn_cast<LoadInst>(I);
+    Type *ValTy = (SI ? SI->getValueOperand()->getType() :
+                   LI->getType());
+    VectorTy = ToVectorTy(ValTy, VF);
+
+    unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
+    unsigned AS = SI ? SI->getPointerAddressSpace() :
+      LI->getPointerAddressSpace();
+    Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand();
+    // We add the cost of address computation here instead of with the gep
+    // instruction because only here we know whether the operation is
+    // scalarized.
+    if (VF == 1)
+      return TTI.getAddressComputationCost(VectorTy) +
+        TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+
+    // Scalarized loads/stores.
+    int Stride = Legal->isConsecutivePtr(Ptr);
+    bool Reverse = Stride < 0;
+    if (0 == Stride) {
+      unsigned Cost = 0;
+      // The cost of extracting from the value vector and pointer vector.
+      Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+      for (unsigned i = 0; i < VF; ++i) {
+        //  The cost of extracting the pointer operand.
+        Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
+        // In case of STORE, the cost of ExtractElement from the vector.
+        // In case of LOAD, the cost of InsertElement into the returned
+        // vector.
+        Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement :
+                                            Instruction::InsertElement,
+                                            VectorTy, i);
+      }
+
+      // The cost of the scalar loads/stores.
+      Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
+      Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+                                       Alignment, AS);
+      return Cost;
+    }
+
+    // Wide load/stores.
+    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
+    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+
+    if (Reverse)
+      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+                                  VectorTy, 0);
+    return Cost;
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    // We optimize the truncation of induction variable.
+    // The cost of these is the same as the scalar operation.
+    if (I->getOpcode() == Instruction::Trunc &&
+        Legal->isInductionVariable(I->getOperand(0)))
+      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
+                                  I->getOperand(0)->getType());
+
+    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(I);
+    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+    assert(ID && "Not an intrinsic call!");
+    Type *RetTy = ToVectorTy(CI->getType(), VF);
+    SmallVector<Type*, 4> Tys;
+    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
+      Tys.push_back(ToVectorTy(CI->getArgOperand(i)->getType(), VF));
+    return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+  }
+  default: {
+    // We are scalarizing the instruction. Return the cost of the scalar
+    // instruction, plus the cost of insert and extract into vector
+    // elements, times the vector width.
+    unsigned Cost = 0;
+
+    if (!RetTy->isVoidTy() && VF != 1) {
+      unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
+                                                VectorTy);
+      unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                                VectorTy);
+
+      // The cost of inserting the results plus extracting each one of the
+      // operands.
+      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+    }
+
+    // The cost of executing VF copies of the scalar instruction. This opcode
+    // is unknown. Assume that it is the same as 'mul'.
+    Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
+    return Cost;
+  }
+  }// end of switch.
+}
+
+Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
+  if (Scalar->isVoidTy() || VF == 1)
+    return Scalar;
+  return VectorType::get(Scalar, VF);
+}
+
+char LoopVectorize::ID = 0;
+static const char lv_name[] = "Loop Vectorization";
+INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+  Pass *createLoopVectorizePass() {
+    return new LoopVectorize();
+  }
+}
+
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+  // Check for a store.
+  if (StoreInst *ST = dyn_cast<StoreInst>(Inst))
+    return Legal->isConsecutivePtr(ST->getPointerOperand()) != 0;
+
+  // Check for a load.
+  if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+    return Legal->isConsecutivePtr(LI->getPointerOperand()) != 0;
+
+  return false;
+}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
new file mode 100644
index 000000000000..19eefd2f87e0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -0,0 +1,43 @@
+   //===-- Vectorize.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
+// implements several vectorization transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
+
+using namespace llvm;
+
+/// initializeVectorizationPasses - Initialize all passes linked into the
+/// Vectorization library.
+void llvm::initializeVectorization(PassRegistry &Registry) {
+  initializeBBVectorizePass(Registry);
+  initializeLoopVectorizePass(Registry);
+}
+
+void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
+  initializeVectorization(*unwrap(R));
+}
+
+void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBBVectorizePass());
+}
+
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopVectorizePass());
+}